diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..e7cb295259d4307056333da73955133f503d9442 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-13032/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-16290/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-19548/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-22806/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-26064/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-29322/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3258/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-32580/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-6516/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-9774/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoint-13032/config.json b/checkpoint-13032/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-13032/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": 
null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-13032/generation_config.json b/checkpoint-13032/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-13032/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-13032/latest b/checkpoint-13032/latest new file mode 100644 index 0000000000000000000000000000000000000000..1f1c0ef35d2aa47ce3652ed68b9449e9ebd35ea1 --- /dev/null +++ b/checkpoint-13032/latest @@ -0,0 +1 @@ +global_step13032 \ No newline at end of file diff --git a/checkpoint-13032/model-00001-of-00003.safetensors b/checkpoint-13032/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..77225b27583af42be3ef5d55d49a111967402139 --- /dev/null +++ 
b/checkpoint-13032/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf6a51200f4a9ff056e63ebbdca1de325020eaca963e1e4f88722061dbd18129 +size 4955415870 diff --git a/checkpoint-13032/model-00002-of-00003.safetensors b/checkpoint-13032/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-13032/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d +size 4971563008 diff --git a/checkpoint-13032/model-00003-of-00003.safetensors b/checkpoint-13032/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..20be42a1921d16a76e34e96965857c635a7a5b4f --- /dev/null +++ b/checkpoint-13032/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bac9ccfacb7ba8ba232013e84043e195da9b628a898d3cedd3ee9a669437fa5 +size 4180840856 diff --git a/checkpoint-13032/model.safetensors.index.json b/checkpoint-13032/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-13032/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + 
"model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": 
"model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-13032/rng_state_0.pth b/checkpoint-13032/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8de7c559acd8dd23bf603b1aa08e69446e488f07 --- /dev/null +++ b/checkpoint-13032/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1250708e750f5a743ee5aae273b4bbecae64c11dc6a0e4eee353ca778922e0d +size 15984 diff --git a/checkpoint-13032/rng_state_1.pth b/checkpoint-13032/rng_state_1.pth new file mode 100644 index 
0000000000000000000000000000000000000000..a4969e15073bada70cf1a560aedf8b3402dace7a --- /dev/null +++ b/checkpoint-13032/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6771e7e039ca20bca1903339ca060f58177008918466a4360c88ca3c265cad4a +size 15984 diff --git a/checkpoint-13032/rng_state_10.pth b/checkpoint-13032/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..05a37351ea7fa3e5c05158d20819c9d0356c0dcc --- /dev/null +++ b/checkpoint-13032/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45c63ab24380aabd7961f4bab021d0ae003adbbc95ee5cb602e0ad559b102d9e +size 15997 diff --git a/checkpoint-13032/rng_state_11.pth b/checkpoint-13032/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a82264753122685679a70776c9c26e4fca7b097 --- /dev/null +++ b/checkpoint-13032/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd332a894753e0e280562363cc156bf96269a3fa53c10cff8a23d8dbe3ad9179 +size 15997 diff --git a/checkpoint-13032/rng_state_12.pth b/checkpoint-13032/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..308dc910aa51b597bd01fa045e7cb9b8ad9bd160 --- /dev/null +++ b/checkpoint-13032/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:badfca90d87489a40b4a17f2c02023150a3a6000a5029fea0ea7ce364191863f +size 15997 diff --git a/checkpoint-13032/rng_state_13.pth b/checkpoint-13032/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..bdbc9f4193ffb747f50ab227a4800107b9d5de8b --- /dev/null +++ b/checkpoint-13032/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca144cb79015fbd174790533ab41773834c0fdddad1fb2c04ff73e7ce0ed3d2 +size 15997 diff --git a/checkpoint-13032/rng_state_14.pth b/checkpoint-13032/rng_state_14.pth new file mode 100644 index 
0000000000000000000000000000000000000000..4f0950f3c964925e3fce120e31182d22836d2a0a --- /dev/null +++ b/checkpoint-13032/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e85eebed5813b2cd761fb1c41968bb5f47cec528aa83fbb1dfe65607ce3e995a +size 15997 diff --git a/checkpoint-13032/rng_state_15.pth b/checkpoint-13032/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..068d54c7bd53558aaf65dfc774d6fe2908f5c1f7 --- /dev/null +++ b/checkpoint-13032/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:604bf1defec2a1c11d542d061991d2e97d70537aaed6a797748d0b881346f171 +size 15997 diff --git a/checkpoint-13032/rng_state_16.pth b/checkpoint-13032/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..19ca7018dbaf36e83f04dc09bc9f138da2b1c3bd --- /dev/null +++ b/checkpoint-13032/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:972dc9086c932c76f1d06d01f314b7177865d9e498df412897b077e11f520623 +size 15997 diff --git a/checkpoint-13032/rng_state_17.pth b/checkpoint-13032/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..41b041f4d7c8cc6fe16b94ffbd0daf614cfff433 --- /dev/null +++ b/checkpoint-13032/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27327fa06c61a0dfa3593e2ec2f52b33c55d184e2f0f79f89837b1c01606558b +size 15997 diff --git a/checkpoint-13032/rng_state_18.pth b/checkpoint-13032/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..5859cc5ab69070518284c9fa2c0a0e7cf3b8cc6b --- /dev/null +++ b/checkpoint-13032/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f20f3f49e5916e64bbec46beab7c3097919150abc36a85ebbc1e841676f3e0f7 +size 15997 diff --git a/checkpoint-13032/rng_state_19.pth b/checkpoint-13032/rng_state_19.pth new file mode 100644 index 
0000000000000000000000000000000000000000..6a8fa6a5a55288cd385f67b502bd2cb7c69f098e --- /dev/null +++ b/checkpoint-13032/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7197b7b6db6817c597c9c3ad319a1dfe47e9b2e88639d36b0ac75676f31a956 +size 15997 diff --git a/checkpoint-13032/rng_state_2.pth b/checkpoint-13032/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..db7600c6ff89c7deb8fe5cd39a0872f189838ae8 --- /dev/null +++ b/checkpoint-13032/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89c9b8c22c1e24b14f7788d8520cea4b8c08ecca78f13ca57386f8243a6909e5 +size 15984 diff --git a/checkpoint-13032/rng_state_20.pth b/checkpoint-13032/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..8667659aba887b643efae6769753149bb1c001c0 --- /dev/null +++ b/checkpoint-13032/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f4d4368710f1bd11e2d14bdeafbfa4c2b1a3255687cb4802f5f8eafe1ddda6c +size 15997 diff --git a/checkpoint-13032/rng_state_21.pth b/checkpoint-13032/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..cdefb3f27e9f6d9814f21762b170251ce045ce7d --- /dev/null +++ b/checkpoint-13032/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:741fb8a2b28efad07ed2eac404a06fcb43a30806485d139e0af372645d969a6c +size 15997 diff --git a/checkpoint-13032/rng_state_22.pth b/checkpoint-13032/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c10bd087acfb0c40322a1132c21c61378e02720 --- /dev/null +++ b/checkpoint-13032/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6bf8e67661c6ff1ecb8edb86fb0770feed71f7b244e88d5ce61f05ec83d3eb7 +size 15997 diff --git a/checkpoint-13032/rng_state_23.pth b/checkpoint-13032/rng_state_23.pth new file mode 100644 index 
0000000000000000000000000000000000000000..142cc42aa45cbcae6dc7cc400a430e9c577395f1 --- /dev/null +++ b/checkpoint-13032/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea2dcc82e2a321585eebc7cadce2f0c85921eb2699af1e4af2eb97eb2cde94c +size 15997 diff --git a/checkpoint-13032/rng_state_24.pth b/checkpoint-13032/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..83b7e73636d0c9599ed9ebd7832e08a9ad04e4e3 --- /dev/null +++ b/checkpoint-13032/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c5203f34f502393aff31f4f36bd843117113fc94a4c2018cec70f84698f799 +size 15997 diff --git a/checkpoint-13032/rng_state_25.pth b/checkpoint-13032/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9b29fd61ea113c811d0bcb789faaf80f22acb8d --- /dev/null +++ b/checkpoint-13032/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a2b0ab827e69b540ec019bd019fc27bddd7dfd9bd583926947ac197d33f6fcf +size 15997 diff --git a/checkpoint-13032/rng_state_26.pth b/checkpoint-13032/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..d404a99ad13e9d3fed4b4c78122d1392263095de --- /dev/null +++ b/checkpoint-13032/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02b89bad48a72b9238da5e2155566d38a60a4900dcb55f0b6f5009d366d9df34 +size 15997 diff --git a/checkpoint-13032/rng_state_27.pth b/checkpoint-13032/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..7859acba17b911b53908701aa58d2b72c95d4cb0 --- /dev/null +++ b/checkpoint-13032/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d51f3607e746d1a262e4ee8be1f205650db797c63981c14d8d438cbb63b13e +size 15997 diff --git a/checkpoint-13032/rng_state_28.pth b/checkpoint-13032/rng_state_28.pth new file mode 100644 index 
0000000000000000000000000000000000000000..3e5baaa14e7bc4b1012af42cc4347376cac7bf4f --- /dev/null +++ b/checkpoint-13032/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7b9ec01d8779daa5c52f8b97a9ee223857e30e695620b6d6f315cb448608d30 +size 15997 diff --git a/checkpoint-13032/rng_state_29.pth b/checkpoint-13032/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c6abecfc4d29c8dc53330f80edd4a14907a9338 --- /dev/null +++ b/checkpoint-13032/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:461997afee47dabd6ee84f6697b4f75177ea1464dbb21da582842c7412f63085 +size 15997 diff --git a/checkpoint-13032/rng_state_3.pth b/checkpoint-13032/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..49244ef82b3271a5db67eaa240f8bdc02e338438 --- /dev/null +++ b/checkpoint-13032/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e16c9b60f6b2a8b0dc74724b09ce07b56dae17e497aa02a27eb879e710d13c7 +size 15984 diff --git a/checkpoint-13032/rng_state_30.pth b/checkpoint-13032/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..047c82f44c5529b821e116b220a9b760a2e63376 --- /dev/null +++ b/checkpoint-13032/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7fecd37ed32f22b38e17b668370adcd5f7438c6137af863ef79978fc440ca7 +size 15997 diff --git a/checkpoint-13032/rng_state_31.pth b/checkpoint-13032/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..0500bfa0bcc73b585de2b24d7431aa6ab5d7ffda --- /dev/null +++ b/checkpoint-13032/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:310f0eb8237ba444f6d0397504f58fe5e231b4b474bcfeba5a25ca9ab5bbd06e +size 15997 diff --git a/checkpoint-13032/rng_state_32.pth b/checkpoint-13032/rng_state_32.pth new file mode 100644 index 
0000000000000000000000000000000000000000..afeb6f6df8a7d463308a1818b0ff2cde4cc88773 --- /dev/null +++ b/checkpoint-13032/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0caea246e8e15a0ed40a8f809365095f537e947d127481530dba0d078f88a75c +size 15997 diff --git a/checkpoint-13032/rng_state_33.pth b/checkpoint-13032/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..c3607f48b11f1139c587f3889a92977ca09f2368 --- /dev/null +++ b/checkpoint-13032/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfbc9e8d6b4bf1d5137acb8ac84717c5a9a03c2cae6cbbfd336b77775cb3b50c +size 15997 diff --git a/checkpoint-13032/rng_state_34.pth b/checkpoint-13032/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f8ffa990ad2ae40b2af4cbf995eedbc2363830b --- /dev/null +++ b/checkpoint-13032/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b796128eb946b95fc0f2ec71451ba678d13315f621c184f13655d8f6786e64fe +size 15997 diff --git a/checkpoint-13032/rng_state_35.pth b/checkpoint-13032/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf01f094f555d9a3c5813d7c5b3d62cef1278c0c --- /dev/null +++ b/checkpoint-13032/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34eadfcd0fd7718d951b06b1dab5dbee5d740569245bceca75d6dba0a65951da +size 15997 diff --git a/checkpoint-13032/rng_state_36.pth b/checkpoint-13032/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..5281b2ad6f8a5d655dd0f556678f1676fbf492e4 --- /dev/null +++ b/checkpoint-13032/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a3899131e6e5fb383a7adbdc824fc8bcf6e59405fee3aaa5caf57e48de4790 +size 15997 diff --git a/checkpoint-13032/rng_state_37.pth b/checkpoint-13032/rng_state_37.pth new file mode 100644 index 
0000000000000000000000000000000000000000..fc1bb0a948fbfb2dd5f3852a00ca92745aa25afd --- /dev/null +++ b/checkpoint-13032/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8c4bbb372f532f9436b772090271be4af4fcad936ebef2a670d789057cc5cd +size 15997 diff --git a/checkpoint-13032/rng_state_38.pth b/checkpoint-13032/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..21124b71b6bec89374b274248a3224db2d1a7797 --- /dev/null +++ b/checkpoint-13032/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4cc98bfe8fab43753b0e7f818cff0041d632b457f8038f9f9e25dd91a014be9 +size 15997 diff --git a/checkpoint-13032/rng_state_39.pth b/checkpoint-13032/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7afcf2fe380779c154a9ed81982a48908a2adce --- /dev/null +++ b/checkpoint-13032/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c585807d8870b76d66703ff0b18fb9a486db00813bb4d72da4bc950ecbfc6174 +size 15997 diff --git a/checkpoint-13032/rng_state_4.pth b/checkpoint-13032/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8addcd16a05b980e4055a4cd184839a8aea29861 --- /dev/null +++ b/checkpoint-13032/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3243d263cf92dbf4b97f4dca4fdf529dd2762e18dbac8d200452881d1f4de0e +size 15984 diff --git a/checkpoint-13032/rng_state_40.pth b/checkpoint-13032/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c0a37566904b0679591949b87d2458a57ac100a --- /dev/null +++ b/checkpoint-13032/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a8ad576755846e458bedd5536c0338128c2b9595a21bb4bd27f942158a6148c +size 15997 diff --git a/checkpoint-13032/rng_state_41.pth b/checkpoint-13032/rng_state_41.pth new file mode 100644 index 
0000000000000000000000000000000000000000..5745856bcb89d357923d781ed74cae0c19d53c9e --- /dev/null +++ b/checkpoint-13032/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f5dbd4cb551a19c3dd1f9fdc8e678a3eea12166eb4e0582d1084ac42cecb6f +size 15997 diff --git a/checkpoint-13032/rng_state_42.pth b/checkpoint-13032/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac379a15868bf90fad78e22499a15414a52e8c1d --- /dev/null +++ b/checkpoint-13032/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d414a036dd246a610733170c857d8b468cc5f3942df6a709968eba1000a4bb22 +size 15997 diff --git a/checkpoint-13032/rng_state_43.pth b/checkpoint-13032/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..c41de7ad94f8da27400f78b2b8fa7dd7da622a50 --- /dev/null +++ b/checkpoint-13032/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25e7da42d3864ad87b96e279111c6a3c746071e4332a1815b13265dea858a136 +size 15997 diff --git a/checkpoint-13032/rng_state_44.pth b/checkpoint-13032/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1f5864ad9883d34331906325eacc7b89bd3dca0 --- /dev/null +++ b/checkpoint-13032/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2f25f88f5ccc42dfe27f6953b1b23e63f46b5c588b73624bb3c9e0bc78bfea6 +size 15997 diff --git a/checkpoint-13032/rng_state_45.pth b/checkpoint-13032/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..95765949b4920ef4105556d53ba02bccc9c1694c --- /dev/null +++ b/checkpoint-13032/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4b3d73b30c830ac07cd48af3617806e9d2a0ef16ac7da7256fd6518ddf39a9 +size 15997 diff --git a/checkpoint-13032/rng_state_46.pth b/checkpoint-13032/rng_state_46.pth new file mode 100644 index 
0000000000000000000000000000000000000000..1b3cc4882566bb2ffbe1b3b917d675235011f7b1 --- /dev/null +++ b/checkpoint-13032/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3fa82458d50fed2ad1f127ec5b11e3288bc964b25bc778e18052f6f359a445d +size 15997 diff --git a/checkpoint-13032/rng_state_47.pth b/checkpoint-13032/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..704b4a23e35d17e79ae75f51e867d3856476ed01 --- /dev/null +++ b/checkpoint-13032/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f7216c5d1bd6e8a26a896bca4c38fb55f97f106682a75019f39eba3f0920e2b +size 15997 diff --git a/checkpoint-13032/rng_state_48.pth b/checkpoint-13032/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a46514904129b7e83536966e1c10d9777b7807b --- /dev/null +++ b/checkpoint-13032/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a0e181c3472e483957d969f4218953438c457e0bde63b913baa97e5a861d70 +size 15997 diff --git a/checkpoint-13032/rng_state_49.pth b/checkpoint-13032/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..22974c5c1483536e8752555187b44dba819e7ca4 --- /dev/null +++ b/checkpoint-13032/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea03b3bce92d565a41283068d902abcf3f2d2073722661f99ffd8f12af005f80 +size 15997 diff --git a/checkpoint-13032/rng_state_5.pth b/checkpoint-13032/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf1b29db673f827804a5d6b6d62c73fd10ba24fc --- /dev/null +++ b/checkpoint-13032/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da3d7f2687b95c748718ed42cf8a777cfff37bc0a1c6f5a60e400bc7c976b00d +size 15984 diff --git a/checkpoint-13032/rng_state_50.pth b/checkpoint-13032/rng_state_50.pth new file mode 100644 index 
0000000000000000000000000000000000000000..1176a1d1c89e4be2acd539a82caaeb47b35db403 --- /dev/null +++ b/checkpoint-13032/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad8b23f9d1e761fbf5f6572f28633c3b5c2e718ec5d9c11e7380804d57e0622 +size 15997 diff --git a/checkpoint-13032/rng_state_51.pth b/checkpoint-13032/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..e28bbf005836b2d2f96b48a5fb312fa1db01a518 --- /dev/null +++ b/checkpoint-13032/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1243ae07d6d7b4d194f50e735b0dd4a5a22b338186b0e38f422d91d6e892d945 +size 15997 diff --git a/checkpoint-13032/rng_state_52.pth b/checkpoint-13032/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d6ab84e0f0bdbc14b8bfffd310f8671dbd362c9 --- /dev/null +++ b/checkpoint-13032/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c34330d02dfa78f12bc97aa724bf4d1b8d5604552800d0dc776fdb7ede33dba7 +size 15997 diff --git a/checkpoint-13032/rng_state_53.pth b/checkpoint-13032/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..25ea41db46d4268220c96a2ecf7b42413020ee86 --- /dev/null +++ b/checkpoint-13032/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cad1957ca2f2e720cf1822f92c78dbd17c833854c6bd4ffbce10877d13a85e1 +size 15997 diff --git a/checkpoint-13032/rng_state_54.pth b/checkpoint-13032/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..76b3a250e184472bcb4cba196785a70cbde696ac --- /dev/null +++ b/checkpoint-13032/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdcd877e19fe19ed3b0f48b366aaba3c9d941936aa0b39da2d6e893f0a230291 +size 15997 diff --git a/checkpoint-13032/rng_state_55.pth b/checkpoint-13032/rng_state_55.pth new file mode 100644 index 
0000000000000000000000000000000000000000..2577e3081fe54595635c7ac4e5dc47f60d7a8805 --- /dev/null +++ b/checkpoint-13032/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02fd307604836b0c6a35b255b85af3c776bcc9c3666a8bcc1eb97102f187bf7c +size 15997 diff --git a/checkpoint-13032/rng_state_56.pth b/checkpoint-13032/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..98a2c9e9308c55c9424a1a503483036f44482231 --- /dev/null +++ b/checkpoint-13032/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcb7e204acc10da4c65ce4f07dfd8499bd96cef84248a46d3dee8677da6cb34 +size 15997 diff --git a/checkpoint-13032/rng_state_57.pth b/checkpoint-13032/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..f1927bf6b6dafd06a0268b16013efdbf94264f5d --- /dev/null +++ b/checkpoint-13032/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95267e16cbe625436699cd9de9fae377451a46ce6ad75eddd32eed30fbbfb17f +size 15997 diff --git a/checkpoint-13032/rng_state_58.pth b/checkpoint-13032/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ea240157cf728a51747e7a84bb9b6a12014dddd --- /dev/null +++ b/checkpoint-13032/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be670dd19eaa1510778ad30adb67f42cda4c265d39aab91ca39f67f1f97c7e90 +size 15997 diff --git a/checkpoint-13032/rng_state_59.pth b/checkpoint-13032/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..650e28285c2ea1ef256376cd7f609cfef6af8a3a --- /dev/null +++ b/checkpoint-13032/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f179bb20d434f591d1a9f741a791478b6161e15efd084d31aac685e3a1063e5b +size 15997 diff --git a/checkpoint-13032/rng_state_6.pth b/checkpoint-13032/rng_state_6.pth new file mode 100644 index 
0000000000000000000000000000000000000000..afd1f25b70b0aa423f090ec594c0958ce5aef9cb --- /dev/null +++ b/checkpoint-13032/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c27ebd3e332d3d489ff32600af8709bfe59854e791adb08950f93f5623680b +size 15984 diff --git a/checkpoint-13032/rng_state_60.pth b/checkpoint-13032/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..96965bab6221d60375385ce3a47c66354d7955b9 --- /dev/null +++ b/checkpoint-13032/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:524fd5e6c89474603317ef948b94268e6b96479e2d0668859c869f92888c9ffd +size 15997 diff --git a/checkpoint-13032/rng_state_61.pth b/checkpoint-13032/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ddbe60c83c4716a6d38c618cf015a26ed6dc535 --- /dev/null +++ b/checkpoint-13032/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e1eba1db0b9fcab66a8b8c7e77400bb2b0eb834ebf6958708d9a8b273490bbc +size 15997 diff --git a/checkpoint-13032/rng_state_62.pth b/checkpoint-13032/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..f33d63e5593d5aaf941c6b67e524c69f1db90855 --- /dev/null +++ b/checkpoint-13032/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c3b8afd724418e5ee9085fcb006cc63f3663de01abd770da53aa49907bfe5b4 +size 15997 diff --git a/checkpoint-13032/rng_state_63.pth b/checkpoint-13032/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa840c40678f40495508ecebe7099a6ee0aca430 --- /dev/null +++ b/checkpoint-13032/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ba54877d9760c18c20061c3349e0ccbd5dbc384ffb444942893be4ea0f01553 +size 15997 diff --git a/checkpoint-13032/rng_state_7.pth b/checkpoint-13032/rng_state_7.pth new file mode 100644 index 
0000000000000000000000000000000000000000..6a0f806fe91698c6678f1b1b013b62bfed36b26e --- /dev/null +++ b/checkpoint-13032/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c035774be40758897466fe6fb1036cfc30ebbb263ca60bf3583b5710e837d396 +size 15984 diff --git a/checkpoint-13032/rng_state_8.pth b/checkpoint-13032/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..ebb2957783c365bb2bfb9ab3b3771e2c20933102 --- /dev/null +++ b/checkpoint-13032/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f0670a12eafbe37116c5b1996eb84869df77af9c6367deada14af54368e726b +size 15984 diff --git a/checkpoint-13032/rng_state_9.pth b/checkpoint-13032/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..f4138fc2bc6dc6ce29a501588f3783237b0b7639 --- /dev/null +++ b/checkpoint-13032/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48bcb94c7550d616169d4aff3d2b3905816d81d9ffe24c46aa66d3261711da1b +size 15984 diff --git a/checkpoint-13032/scheduler.pt b/checkpoint-13032/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e419355702ae0d42d5b44c61c6b11dc2e1c8ba2 --- /dev/null +++ b/checkpoint-13032/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1228648be1aed957182380aeb24d860204f6ca2baf9e8148e4ebf30ceaf7c238 +size 1064 diff --git a/checkpoint-13032/special_tokens_map.json b/checkpoint-13032/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-13032/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-13032/tokenizer.json b/checkpoint-13032/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-13032/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-13032/tokenizer_config.json b/checkpoint-13032/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-13032/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + 
"content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": 
"<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": 
"<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": 
"<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-13032/trainer_state.json b/checkpoint-13032/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c7fd01590caaf881e37d0d0f3e0cd7b8a125ba39 --- /dev/null +++ b/checkpoint-13032/trainer_state.json @@ -0,0 +1,91258 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": 
null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 13032, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 
0.003990178023327195, + "grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 
1.2865383625030518, + "learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 
3.885480572597138e-06, + "loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + 
}, + { + "epoch": 0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 
0.4801484942436218, + "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 
7.975460122699386e-06, + "loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 
92 + }, + { + "epoch": 0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + 
"grad_norm": 0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 
0.5069209933280945, + "learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 
1.3394683026584867e-05, + "loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + 
"step": 145 + }, + { + "epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 
0.04880294659300184, + "grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 
1.3880311250686646, + "learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + }, + { + "epoch": 3.0003069367710253, + "grad_norm": 0.43365660309791565, + "learning_rate": 8.20720621201063e-05, + "loss": 1.8074, + "step": 9775 + }, + { + "epoch": 3.0006138735420502, + "grad_norm": 0.461374968290329, + "learning_rate": 8.206824868646064e-05, + "loss": 1.9089, + "step": 9776 + }, + { + "epoch": 3.0009208103130756, + "grad_norm": 0.3747929632663727, + "learning_rate": 8.206443493589776e-05, + "loss": 1.8358, + "step": 9777 + }, + { + "epoch": 3.001227747084101, + "grad_norm": 0.28436774015426636, + "learning_rate": 8.206062086845532e-05, + "loss": 1.8527, + "step": 9778 + }, + { + "epoch": 3.001534683855126, + "grad_norm": 0.33642131090164185, + "learning_rate": 
8.205680648417106e-05, + "loss": 1.8142, + "step": 9779 + }, + { + "epoch": 3.001841620626151, + "grad_norm": 0.4283481240272522, + "learning_rate": 8.205299178308263e-05, + "loss": 1.9006, + "step": 9780 + }, + { + "epoch": 3.002148557397176, + "grad_norm": 0.34405630826950073, + "learning_rate": 8.204917676522777e-05, + "loss": 1.7988, + "step": 9781 + }, + { + "epoch": 3.0024554941682013, + "grad_norm": 0.3161070942878723, + "learning_rate": 8.204536143064414e-05, + "loss": 1.8271, + "step": 9782 + }, + { + "epoch": 3.0027624309392267, + "grad_norm": 0.42518749833106995, + "learning_rate": 8.204154577936946e-05, + "loss": 1.864, + "step": 9783 + }, + { + "epoch": 3.0030693677102516, + "grad_norm": 0.3760852813720703, + "learning_rate": 8.203772981144146e-05, + "loss": 1.8543, + "step": 9784 + }, + { + "epoch": 3.003376304481277, + "grad_norm": 0.32794755697250366, + "learning_rate": 8.203391352689784e-05, + "loss": 1.8776, + "step": 9785 + }, + { + "epoch": 3.0036832412523022, + "grad_norm": 0.3053889274597168, + "learning_rate": 8.20300969257763e-05, + "loss": 1.8064, + "step": 9786 + }, + { + "epoch": 3.003990178023327, + "grad_norm": 0.40283143520355225, + "learning_rate": 8.202628000811456e-05, + "loss": 1.8083, + "step": 9787 + }, + { + "epoch": 3.0042971147943525, + "grad_norm": 0.49270665645599365, + "learning_rate": 8.202246277395038e-05, + "loss": 1.802, + "step": 9788 + }, + { + "epoch": 3.0046040515653774, + "grad_norm": 0.4373023211956024, + "learning_rate": 8.201864522332143e-05, + "loss": 1.8429, + "step": 9789 + }, + { + "epoch": 3.0049109883364027, + "grad_norm": 0.3136310875415802, + "learning_rate": 8.201482735626547e-05, + "loss": 1.8224, + "step": 9790 + }, + { + "epoch": 3.005217925107428, + "grad_norm": 0.3306807279586792, + "learning_rate": 8.201100917282023e-05, + "loss": 1.8463, + "step": 9791 + }, + { + "epoch": 3.005524861878453, + "grad_norm": 0.45082196593284607, + "learning_rate": 8.200719067302342e-05, + "loss": 1.7587, + "step": 
9792 + }, + { + "epoch": 3.0058317986494782, + "grad_norm": 0.49246448278427124, + "learning_rate": 8.20033718569128e-05, + "loss": 1.8245, + "step": 9793 + }, + { + "epoch": 3.0061387354205036, + "grad_norm": 0.3040246367454529, + "learning_rate": 8.199955272452609e-05, + "loss": 1.8309, + "step": 9794 + }, + { + "epoch": 3.0064456721915285, + "grad_norm": 0.3909318149089813, + "learning_rate": 8.199573327590105e-05, + "loss": 1.8187, + "step": 9795 + }, + { + "epoch": 3.006752608962554, + "grad_norm": 0.5753183960914612, + "learning_rate": 8.199191351107543e-05, + "loss": 1.826, + "step": 9796 + }, + { + "epoch": 3.0070595457335787, + "grad_norm": 0.48908689618110657, + "learning_rate": 8.198809343008695e-05, + "loss": 1.8475, + "step": 9797 + }, + { + "epoch": 3.007366482504604, + "grad_norm": 0.31570208072662354, + "learning_rate": 8.198427303297341e-05, + "loss": 1.8046, + "step": 9798 + }, + { + "epoch": 3.0076734192756294, + "grad_norm": 0.39205440878868103, + "learning_rate": 8.198045231977251e-05, + "loss": 1.8413, + "step": 9799 + }, + { + "epoch": 3.0079803560466543, + "grad_norm": 0.5117597579956055, + "learning_rate": 8.197663129052204e-05, + "loss": 1.8184, + "step": 9800 + }, + { + "epoch": 3.0082872928176796, + "grad_norm": 0.3623514175415039, + "learning_rate": 8.197280994525978e-05, + "loss": 1.8292, + "step": 9801 + }, + { + "epoch": 3.008594229588705, + "grad_norm": 0.2826726734638214, + "learning_rate": 8.196898828402344e-05, + "loss": 1.8216, + "step": 9802 + }, + { + "epoch": 3.00890116635973, + "grad_norm": 0.38658398389816284, + "learning_rate": 8.196516630685085e-05, + "loss": 1.867, + "step": 9803 + }, + { + "epoch": 3.009208103130755, + "grad_norm": 0.3371698260307312, + "learning_rate": 8.196134401377973e-05, + "loss": 1.8077, + "step": 9804 + }, + { + "epoch": 3.00951503990178, + "grad_norm": 0.24108785390853882, + "learning_rate": 8.195752140484789e-05, + "loss": 1.7858, + "step": 9805 + }, + { + "epoch": 3.0098219766728054, + 
"grad_norm": 0.34410104155540466, + "learning_rate": 8.195369848009309e-05, + "loss": 1.801, + "step": 9806 + }, + { + "epoch": 3.0101289134438307, + "grad_norm": 0.3412116467952728, + "learning_rate": 8.194987523955311e-05, + "loss": 1.7905, + "step": 9807 + }, + { + "epoch": 3.0104358502148556, + "grad_norm": 0.2473030537366867, + "learning_rate": 8.194605168326573e-05, + "loss": 1.7765, + "step": 9808 + }, + { + "epoch": 3.010742786985881, + "grad_norm": 0.28590065240859985, + "learning_rate": 8.194222781126875e-05, + "loss": 1.7897, + "step": 9809 + }, + { + "epoch": 3.0110497237569063, + "grad_norm": 0.2994272708892822, + "learning_rate": 8.193840362359994e-05, + "loss": 1.7976, + "step": 9810 + }, + { + "epoch": 3.011356660527931, + "grad_norm": 0.2971307635307312, + "learning_rate": 8.193457912029713e-05, + "loss": 1.829, + "step": 9811 + }, + { + "epoch": 3.0116635972989565, + "grad_norm": 0.25149810314178467, + "learning_rate": 8.193075430139809e-05, + "loss": 1.7709, + "step": 9812 + }, + { + "epoch": 3.0119705340699814, + "grad_norm": 0.2561332583427429, + "learning_rate": 8.19269291669406e-05, + "loss": 1.7689, + "step": 9813 + }, + { + "epoch": 3.0122774708410067, + "grad_norm": 0.2658882141113281, + "learning_rate": 8.192310371696249e-05, + "loss": 1.8497, + "step": 9814 + }, + { + "epoch": 3.012584407612032, + "grad_norm": 0.2873780429363251, + "learning_rate": 8.191927795150156e-05, + "loss": 1.8217, + "step": 9815 + }, + { + "epoch": 3.012891344383057, + "grad_norm": 0.2181183248758316, + "learning_rate": 8.191545187059562e-05, + "loss": 1.7261, + "step": 9816 + }, + { + "epoch": 3.0131982811540823, + "grad_norm": 0.2414858490228653, + "learning_rate": 8.191162547428248e-05, + "loss": 1.8035, + "step": 9817 + }, + { + "epoch": 3.0135052179251076, + "grad_norm": 0.2799840271472931, + "learning_rate": 8.190779876259995e-05, + "loss": 1.8279, + "step": 9818 + }, + { + "epoch": 3.0138121546961325, + "grad_norm": 0.2669760584831238, + "learning_rate": 
8.190397173558584e-05, + "loss": 1.8155, + "step": 9819 + }, + { + "epoch": 3.014119091467158, + "grad_norm": 0.28857991099357605, + "learning_rate": 8.1900144393278e-05, + "loss": 1.8479, + "step": 9820 + }, + { + "epoch": 3.0144260282381827, + "grad_norm": 0.30534693598747253, + "learning_rate": 8.189631673571422e-05, + "loss": 1.8609, + "step": 9821 + }, + { + "epoch": 3.014732965009208, + "grad_norm": 0.3238218128681183, + "learning_rate": 8.189248876293236e-05, + "loss": 1.9292, + "step": 9822 + }, + { + "epoch": 3.0150399017802334, + "grad_norm": 0.3000536561012268, + "learning_rate": 8.188866047497022e-05, + "loss": 1.8214, + "step": 9823 + }, + { + "epoch": 3.0153468385512583, + "grad_norm": 0.2960065007209778, + "learning_rate": 8.188483187186565e-05, + "loss": 1.8316, + "step": 9824 + }, + { + "epoch": 3.0156537753222836, + "grad_norm": 0.28609779477119446, + "learning_rate": 8.188100295365648e-05, + "loss": 1.8002, + "step": 9825 + }, + { + "epoch": 3.015960712093309, + "grad_norm": 0.31390634179115295, + "learning_rate": 8.187717372038057e-05, + "loss": 1.8134, + "step": 9826 + }, + { + "epoch": 3.016267648864334, + "grad_norm": 0.28550946712493896, + "learning_rate": 8.187334417207573e-05, + "loss": 1.8359, + "step": 9827 + }, + { + "epoch": 3.016574585635359, + "grad_norm": 0.3085210621356964, + "learning_rate": 8.186951430877982e-05, + "loss": 1.813, + "step": 9828 + }, + { + "epoch": 3.016881522406384, + "grad_norm": 0.3043847978115082, + "learning_rate": 8.18656841305307e-05, + "loss": 1.8222, + "step": 9829 + }, + { + "epoch": 3.0171884591774094, + "grad_norm": 0.32524731755256653, + "learning_rate": 8.18618536373662e-05, + "loss": 1.8258, + "step": 9830 + }, + { + "epoch": 3.0174953959484347, + "grad_norm": 0.2690991461277008, + "learning_rate": 8.18580228293242e-05, + "loss": 1.8492, + "step": 9831 + }, + { + "epoch": 3.0178023327194596, + "grad_norm": 0.34936225414276123, + "learning_rate": 8.185419170644253e-05, + "loss": 1.8363, + "step": 
9832 + }, + { + "epoch": 3.018109269490485, + "grad_norm": 0.3274296820163727, + "learning_rate": 8.185036026875908e-05, + "loss": 1.7789, + "step": 9833 + }, + { + "epoch": 3.0184162062615103, + "grad_norm": 0.2729836106300354, + "learning_rate": 8.184652851631169e-05, + "loss": 1.8264, + "step": 9834 + }, + { + "epoch": 3.018723143032535, + "grad_norm": 0.28682780265808105, + "learning_rate": 8.184269644913826e-05, + "loss": 1.8399, + "step": 9835 + }, + { + "epoch": 3.0190300798035605, + "grad_norm": 0.3224826455116272, + "learning_rate": 8.183886406727662e-05, + "loss": 1.8338, + "step": 9836 + }, + { + "epoch": 3.0193370165745854, + "grad_norm": 0.30945318937301636, + "learning_rate": 8.183503137076467e-05, + "loss": 1.8248, + "step": 9837 + }, + { + "epoch": 3.0196439533456108, + "grad_norm": 0.27580398321151733, + "learning_rate": 8.183119835964029e-05, + "loss": 1.8096, + "step": 9838 + }, + { + "epoch": 3.019950890116636, + "grad_norm": 0.28927183151245117, + "learning_rate": 8.182736503394132e-05, + "loss": 1.825, + "step": 9839 + }, + { + "epoch": 3.020257826887661, + "grad_norm": 0.253000408411026, + "learning_rate": 8.182353139370571e-05, + "loss": 1.7678, + "step": 9840 + }, + { + "epoch": 3.0205647636586863, + "grad_norm": 0.2882022559642792, + "learning_rate": 8.18196974389713e-05, + "loss": 1.8895, + "step": 9841 + }, + { + "epoch": 3.0208717004297116, + "grad_norm": 0.26864609122276306, + "learning_rate": 8.1815863169776e-05, + "loss": 1.7674, + "step": 9842 + }, + { + "epoch": 3.0211786372007365, + "grad_norm": 0.27344849705696106, + "learning_rate": 8.181202858615769e-05, + "loss": 1.8146, + "step": 9843 + }, + { + "epoch": 3.021485573971762, + "grad_norm": 0.31659772992134094, + "learning_rate": 8.180819368815425e-05, + "loss": 1.8485, + "step": 9844 + }, + { + "epoch": 3.021792510742787, + "grad_norm": 0.3163176476955414, + "learning_rate": 8.18043584758036e-05, + "loss": 1.8994, + "step": 9845 + }, + { + "epoch": 3.022099447513812, + 
"grad_norm": 0.2583829462528229, + "learning_rate": 8.180052294914365e-05, + "loss": 1.764, + "step": 9846 + }, + { + "epoch": 3.0224063842848374, + "grad_norm": 0.3006649315357208, + "learning_rate": 8.179668710821227e-05, + "loss": 1.9232, + "step": 9847 + }, + { + "epoch": 3.0227133210558623, + "grad_norm": 0.35702988505363464, + "learning_rate": 8.179285095304741e-05, + "loss": 1.8403, + "step": 9848 + }, + { + "epoch": 3.0230202578268877, + "grad_norm": 0.29699379205703735, + "learning_rate": 8.178901448368697e-05, + "loss": 1.8412, + "step": 9849 + }, + { + "epoch": 3.023327194597913, + "grad_norm": 0.3022700548171997, + "learning_rate": 8.178517770016885e-05, + "loss": 1.8197, + "step": 9850 + }, + { + "epoch": 3.023634131368938, + "grad_norm": 0.2943836748600006, + "learning_rate": 8.178134060253097e-05, + "loss": 1.8127, + "step": 9851 + }, + { + "epoch": 3.023941068139963, + "grad_norm": 0.31290489435195923, + "learning_rate": 8.177750319081126e-05, + "loss": 1.821, + "step": 9852 + }, + { + "epoch": 3.0242480049109886, + "grad_norm": 0.30308374762535095, + "learning_rate": 8.177366546504763e-05, + "loss": 1.8522, + "step": 9853 + }, + { + "epoch": 3.0245549416820134, + "grad_norm": 0.301559716463089, + "learning_rate": 8.176982742527802e-05, + "loss": 1.8758, + "step": 9854 + }, + { + "epoch": 3.0248618784530388, + "grad_norm": 0.33314836025238037, + "learning_rate": 8.176598907154034e-05, + "loss": 1.8178, + "step": 9855 + }, + { + "epoch": 3.0251688152240637, + "grad_norm": 0.3567935526371002, + "learning_rate": 8.176215040387255e-05, + "loss": 1.7847, + "step": 9856 + }, + { + "epoch": 3.025475751995089, + "grad_norm": 0.27716195583343506, + "learning_rate": 8.175831142231258e-05, + "loss": 1.772, + "step": 9857 + }, + { + "epoch": 3.0257826887661143, + "grad_norm": 0.24568212032318115, + "learning_rate": 8.175447212689836e-05, + "loss": 1.8171, + "step": 9858 + }, + { + "epoch": 3.0260896255371392, + "grad_norm": 0.25368261337280273, + 
"learning_rate": 8.175063251766784e-05, + "loss": 1.852, + "step": 9859 + }, + { + "epoch": 3.0263965623081646, + "grad_norm": 0.2509497404098511, + "learning_rate": 8.174679259465894e-05, + "loss": 1.7737, + "step": 9860 + }, + { + "epoch": 3.02670349907919, + "grad_norm": 0.3539343774318695, + "learning_rate": 8.174295235790963e-05, + "loss": 1.8663, + "step": 9861 + }, + { + "epoch": 3.027010435850215, + "grad_norm": 0.36450034379959106, + "learning_rate": 8.173911180745788e-05, + "loss": 1.8179, + "step": 9862 + }, + { + "epoch": 3.02731737262124, + "grad_norm": 0.3550017178058624, + "learning_rate": 8.173527094334162e-05, + "loss": 1.8256, + "step": 9863 + }, + { + "epoch": 3.027624309392265, + "grad_norm": 0.33518701791763306, + "learning_rate": 8.17314297655988e-05, + "loss": 1.7842, + "step": 9864 + }, + { + "epoch": 3.0279312461632903, + "grad_norm": 0.2522886097431183, + "learning_rate": 8.172758827426739e-05, + "loss": 1.7688, + "step": 9865 + }, + { + "epoch": 3.0282381829343157, + "grad_norm": 0.26222914457321167, + "learning_rate": 8.172374646938536e-05, + "loss": 1.8517, + "step": 9866 + }, + { + "epoch": 3.0285451197053406, + "grad_norm": 0.3355788588523865, + "learning_rate": 8.171990435099068e-05, + "loss": 1.9002, + "step": 9867 + }, + { + "epoch": 3.028852056476366, + "grad_norm": 0.32907500863075256, + "learning_rate": 8.171606191912131e-05, + "loss": 1.7801, + "step": 9868 + }, + { + "epoch": 3.0291589932473912, + "grad_norm": 0.29234179854393005, + "learning_rate": 8.171221917381523e-05, + "loss": 1.8055, + "step": 9869 + }, + { + "epoch": 3.029465930018416, + "grad_norm": 0.26374876499176025, + "learning_rate": 8.170837611511041e-05, + "loss": 1.781, + "step": 9870 + }, + { + "epoch": 3.0297728667894415, + "grad_norm": 0.311282217502594, + "learning_rate": 8.170453274304483e-05, + "loss": 1.839, + "step": 9871 + }, + { + "epoch": 3.0300798035604664, + "grad_norm": 0.24225831031799316, + "learning_rate": 8.170068905765648e-05, + "loss": 
1.804, + "step": 9872 + }, + { + "epoch": 3.0303867403314917, + "grad_norm": 0.29383334517478943, + "learning_rate": 8.169684505898335e-05, + "loss": 1.7817, + "step": 9873 + }, + { + "epoch": 3.030693677102517, + "grad_norm": 0.2607928514480591, + "learning_rate": 8.169300074706339e-05, + "loss": 1.8379, + "step": 9874 + }, + { + "epoch": 3.031000613873542, + "grad_norm": 0.283028244972229, + "learning_rate": 8.168915612193464e-05, + "loss": 1.7797, + "step": 9875 + }, + { + "epoch": 3.0313075506445673, + "grad_norm": 0.27675309777259827, + "learning_rate": 8.168531118363508e-05, + "loss": 1.8355, + "step": 9876 + }, + { + "epoch": 3.0316144874155926, + "grad_norm": 0.2598227262496948, + "learning_rate": 8.16814659322027e-05, + "loss": 1.7898, + "step": 9877 + }, + { + "epoch": 3.0319214241866175, + "grad_norm": 0.24715003371238708, + "learning_rate": 8.16776203676755e-05, + "loss": 1.7791, + "step": 9878 + }, + { + "epoch": 3.032228360957643, + "grad_norm": 0.2749374210834503, + "learning_rate": 8.167377449009149e-05, + "loss": 1.8303, + "step": 9879 + }, + { + "epoch": 3.0325352977286677, + "grad_norm": 0.26150834560394287, + "learning_rate": 8.166992829948868e-05, + "loss": 1.8462, + "step": 9880 + }, + { + "epoch": 3.032842234499693, + "grad_norm": 0.3044755160808563, + "learning_rate": 8.166608179590506e-05, + "loss": 1.806, + "step": 9881 + }, + { + "epoch": 3.0331491712707184, + "grad_norm": 0.2949555516242981, + "learning_rate": 8.166223497937868e-05, + "loss": 1.8785, + "step": 9882 + }, + { + "epoch": 3.0334561080417433, + "grad_norm": 0.33206698298454285, + "learning_rate": 8.165838784994752e-05, + "loss": 1.8476, + "step": 9883 + }, + { + "epoch": 3.0337630448127686, + "grad_norm": 0.2720400094985962, + "learning_rate": 8.165454040764962e-05, + "loss": 1.843, + "step": 9884 + }, + { + "epoch": 3.034069981583794, + "grad_norm": 0.29340869188308716, + "learning_rate": 8.1650692652523e-05, + "loss": 1.7761, + "step": 9885 + }, + { + "epoch": 
3.034376918354819, + "grad_norm": 0.35155293345451355, + "learning_rate": 8.16468445846057e-05, + "loss": 1.8887, + "step": 9886 + }, + { + "epoch": 3.034683855125844, + "grad_norm": 0.2688990831375122, + "learning_rate": 8.164299620393571e-05, + "loss": 1.8001, + "step": 9887 + }, + { + "epoch": 3.034990791896869, + "grad_norm": 0.2921253442764282, + "learning_rate": 8.16391475105511e-05, + "loss": 1.7951, + "step": 9888 + }, + { + "epoch": 3.0352977286678944, + "grad_norm": 0.28100699186325073, + "learning_rate": 8.163529850448988e-05, + "loss": 1.8041, + "step": 9889 + }, + { + "epoch": 3.0356046654389197, + "grad_norm": 0.3155081868171692, + "learning_rate": 8.16314491857901e-05, + "loss": 1.8026, + "step": 9890 + }, + { + "epoch": 3.0359116022099446, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.16275995544898e-05, + "loss": 1.8502, + "step": 9891 + }, + { + "epoch": 3.03621853898097, + "grad_norm": 0.2732076644897461, + "learning_rate": 8.162374961062704e-05, + "loss": 1.8424, + "step": 9892 + }, + { + "epoch": 3.0365254757519953, + "grad_norm": 0.2943679690361023, + "learning_rate": 8.161989935423984e-05, + "loss": 1.7635, + "step": 9893 + }, + { + "epoch": 3.03683241252302, + "grad_norm": 0.28894683718681335, + "learning_rate": 8.161604878536626e-05, + "loss": 1.78, + "step": 9894 + }, + { + "epoch": 3.0371393492940455, + "grad_norm": 0.2718082666397095, + "learning_rate": 8.161219790404435e-05, + "loss": 1.7664, + "step": 9895 + }, + { + "epoch": 3.0374462860650704, + "grad_norm": 0.29092124104499817, + "learning_rate": 8.160834671031216e-05, + "loss": 1.8621, + "step": 9896 + }, + { + "epoch": 3.0377532228360957, + "grad_norm": 0.284665584564209, + "learning_rate": 8.160449520420779e-05, + "loss": 1.8607, + "step": 9897 + }, + { + "epoch": 3.038060159607121, + "grad_norm": 0.23676982522010803, + "learning_rate": 8.160064338576925e-05, + "loss": 1.7137, + "step": 9898 + }, + { + "epoch": 3.038367096378146, + "grad_norm": 0.2666932940483093, + 
"learning_rate": 8.159679125503466e-05, + "loss": 1.8038, + "step": 9899 + }, + { + "epoch": 3.0386740331491713, + "grad_norm": 0.36214375495910645, + "learning_rate": 8.159293881204204e-05, + "loss": 1.8902, + "step": 9900 + }, + { + "epoch": 3.0389809699201966, + "grad_norm": 0.30301332473754883, + "learning_rate": 8.158908605682948e-05, + "loss": 1.8456, + "step": 9901 + }, + { + "epoch": 3.0392879066912215, + "grad_norm": 0.32190418243408203, + "learning_rate": 8.158523298943506e-05, + "loss": 1.8246, + "step": 9902 + }, + { + "epoch": 3.039594843462247, + "grad_norm": 0.2938043475151062, + "learning_rate": 8.158137960989685e-05, + "loss": 1.8324, + "step": 9903 + }, + { + "epoch": 3.0399017802332717, + "grad_norm": 0.29493969678878784, + "learning_rate": 8.157752591825294e-05, + "loss": 1.8458, + "step": 9904 + }, + { + "epoch": 3.040208717004297, + "grad_norm": 0.2681889832019806, + "learning_rate": 8.157367191454141e-05, + "loss": 1.889, + "step": 9905 + }, + { + "epoch": 3.0405156537753224, + "grad_norm": 0.3111969232559204, + "learning_rate": 8.156981759880035e-05, + "loss": 1.8966, + "step": 9906 + }, + { + "epoch": 3.0408225905463473, + "grad_norm": 0.345262736082077, + "learning_rate": 8.156596297106784e-05, + "loss": 1.8174, + "step": 9907 + }, + { + "epoch": 3.0411295273173726, + "grad_norm": 0.30156534910202026, + "learning_rate": 8.156210803138199e-05, + "loss": 1.766, + "step": 9908 + }, + { + "epoch": 3.041436464088398, + "grad_norm": 0.28691565990448, + "learning_rate": 8.15582527797809e-05, + "loss": 1.8436, + "step": 9909 + }, + { + "epoch": 3.041743400859423, + "grad_norm": 0.33418282866477966, + "learning_rate": 8.155439721630264e-05, + "loss": 1.8939, + "step": 9910 + }, + { + "epoch": 3.042050337630448, + "grad_norm": 0.25496938824653625, + "learning_rate": 8.155054134098535e-05, + "loss": 1.8368, + "step": 9911 + }, + { + "epoch": 3.042357274401473, + "grad_norm": 0.3806788921356201, + "learning_rate": 8.154668515386711e-05, + "loss": 
1.8635, + "step": 9912 + }, + { + "epoch": 3.0426642111724984, + "grad_norm": 0.42668119072914124, + "learning_rate": 8.154282865498603e-05, + "loss": 1.76, + "step": 9913 + }, + { + "epoch": 3.0429711479435237, + "grad_norm": 0.35945314168930054, + "learning_rate": 8.153897184438024e-05, + "loss": 1.8275, + "step": 9914 + }, + { + "epoch": 3.0432780847145486, + "grad_norm": 0.3225449323654175, + "learning_rate": 8.153511472208784e-05, + "loss": 1.7901, + "step": 9915 + }, + { + "epoch": 3.043585021485574, + "grad_norm": 0.2905425727367401, + "learning_rate": 8.153125728814694e-05, + "loss": 1.8021, + "step": 9916 + }, + { + "epoch": 3.0438919582565993, + "grad_norm": 0.3315529525279999, + "learning_rate": 8.15273995425957e-05, + "loss": 1.8003, + "step": 9917 + }, + { + "epoch": 3.044198895027624, + "grad_norm": 0.30256444215774536, + "learning_rate": 8.152354148547221e-05, + "loss": 1.8243, + "step": 9918 + }, + { + "epoch": 3.0445058317986495, + "grad_norm": 0.2563035190105438, + "learning_rate": 8.15196831168146e-05, + "loss": 1.7877, + "step": 9919 + }, + { + "epoch": 3.044812768569675, + "grad_norm": 0.25705814361572266, + "learning_rate": 8.151582443666101e-05, + "loss": 1.813, + "step": 9920 + }, + { + "epoch": 3.0451197053406998, + "grad_norm": 0.3649071455001831, + "learning_rate": 8.151196544504957e-05, + "loss": 1.8114, + "step": 9921 + }, + { + "epoch": 3.045426642111725, + "grad_norm": 0.4076193571090698, + "learning_rate": 8.150810614201841e-05, + "loss": 1.7869, + "step": 9922 + }, + { + "epoch": 3.04573357888275, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.150424652760569e-05, + "loss": 1.7878, + "step": 9923 + }, + { + "epoch": 3.0460405156537753, + "grad_norm": 0.2243243157863617, + "learning_rate": 8.150038660184955e-05, + "loss": 1.8224, + "step": 9924 + }, + { + "epoch": 3.0463474524248007, + "grad_norm": 0.3295031487941742, + "learning_rate": 8.149652636478811e-05, + "loss": 1.8685, + "step": 9925 + }, + { + "epoch": 
3.0466543891958255, + "grad_norm": 0.2973531186580658, + "learning_rate": 8.149266581645954e-05, + "loss": 1.8082, + "step": 9926 + }, + { + "epoch": 3.046961325966851, + "grad_norm": 0.25648918747901917, + "learning_rate": 8.148880495690199e-05, + "loss": 1.8089, + "step": 9927 + }, + { + "epoch": 3.047268262737876, + "grad_norm": 0.2845752537250519, + "learning_rate": 8.148494378615361e-05, + "loss": 1.8726, + "step": 9928 + }, + { + "epoch": 3.047575199508901, + "grad_norm": 0.2917105555534363, + "learning_rate": 8.148108230425255e-05, + "loss": 1.8035, + "step": 9929 + }, + { + "epoch": 3.0478821362799264, + "grad_norm": 0.2775834798812866, + "learning_rate": 8.1477220511237e-05, + "loss": 1.8545, + "step": 9930 + }, + { + "epoch": 3.0481890730509513, + "grad_norm": 0.3522767424583435, + "learning_rate": 8.14733584071451e-05, + "loss": 1.8261, + "step": 9931 + }, + { + "epoch": 3.0484960098219767, + "grad_norm": 0.3759000599384308, + "learning_rate": 8.146949599201503e-05, + "loss": 1.8405, + "step": 9932 + }, + { + "epoch": 3.048802946593002, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.146563326588496e-05, + "loss": 1.7762, + "step": 9933 + }, + { + "epoch": 3.049109883364027, + "grad_norm": 0.263810932636261, + "learning_rate": 8.146177022879304e-05, + "loss": 1.7546, + "step": 9934 + }, + { + "epoch": 3.049416820135052, + "grad_norm": 0.24064256250858307, + "learning_rate": 8.14579068807775e-05, + "loss": 1.7903, + "step": 9935 + }, + { + "epoch": 3.0497237569060776, + "grad_norm": 0.3144194781780243, + "learning_rate": 8.145404322187645e-05, + "loss": 1.8011, + "step": 9936 + }, + { + "epoch": 3.0500306936771024, + "grad_norm": 0.3362879455089569, + "learning_rate": 8.145017925212812e-05, + "loss": 1.8224, + "step": 9937 + }, + { + "epoch": 3.050337630448128, + "grad_norm": 0.33979395031929016, + "learning_rate": 8.144631497157071e-05, + "loss": 1.8415, + "step": 9938 + }, + { + "epoch": 3.0506445672191527, + "grad_norm": 0.33391237258911133, + 
"learning_rate": 8.144245038024235e-05, + "loss": 1.7983, + "step": 9939 + }, + { + "epoch": 3.050951503990178, + "grad_norm": 0.34034964442253113, + "learning_rate": 8.143858547818128e-05, + "loss": 1.8635, + "step": 9940 + }, + { + "epoch": 3.0512584407612033, + "grad_norm": 0.3472529947757721, + "learning_rate": 8.143472026542569e-05, + "loss": 1.8067, + "step": 9941 + }, + { + "epoch": 3.0515653775322282, + "grad_norm": 0.3369109630584717, + "learning_rate": 8.143085474201376e-05, + "loss": 1.7933, + "step": 9942 + }, + { + "epoch": 3.0518723143032536, + "grad_norm": 0.3055182993412018, + "learning_rate": 8.14269889079837e-05, + "loss": 1.7358, + "step": 9943 + }, + { + "epoch": 3.052179251074279, + "grad_norm": 0.26729708909988403, + "learning_rate": 8.142312276337372e-05, + "loss": 1.8315, + "step": 9944 + }, + { + "epoch": 3.052486187845304, + "grad_norm": 0.3626720607280731, + "learning_rate": 8.141925630822203e-05, + "loss": 1.7593, + "step": 9945 + }, + { + "epoch": 3.052793124616329, + "grad_norm": 0.3673512637615204, + "learning_rate": 8.141538954256683e-05, + "loss": 1.8414, + "step": 9946 + }, + { + "epoch": 3.053100061387354, + "grad_norm": 0.30554768443107605, + "learning_rate": 8.141152246644632e-05, + "loss": 1.7504, + "step": 9947 + }, + { + "epoch": 3.0534069981583793, + "grad_norm": 0.41163405776023865, + "learning_rate": 8.140765507989875e-05, + "loss": 1.8794, + "step": 9948 + }, + { + "epoch": 3.0537139349294047, + "grad_norm": 0.592751145362854, + "learning_rate": 8.140378738296233e-05, + "loss": 1.8538, + "step": 9949 + }, + { + "epoch": 3.0540208717004296, + "grad_norm": 0.483828604221344, + "learning_rate": 8.139991937567527e-05, + "loss": 1.7952, + "step": 9950 + }, + { + "epoch": 3.054327808471455, + "grad_norm": 0.26665306091308594, + "learning_rate": 8.13960510580758e-05, + "loss": 1.8268, + "step": 9951 + }, + { + "epoch": 3.0546347452424802, + "grad_norm": 0.42917072772979736, + "learning_rate": 8.139218243020215e-05, + "loss": 
1.843, + "step": 9952 + }, + { + "epoch": 3.054941682013505, + "grad_norm": 0.47911396622657776, + "learning_rate": 8.138831349209256e-05, + "loss": 1.8223, + "step": 9953 + }, + { + "epoch": 3.0552486187845305, + "grad_norm": 0.4540431797504425, + "learning_rate": 8.138444424378524e-05, + "loss": 1.9198, + "step": 9954 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.29719051718711853, + "learning_rate": 8.138057468531845e-05, + "loss": 1.7873, + "step": 9955 + }, + { + "epoch": 3.0558624923265807, + "grad_norm": 0.35133618116378784, + "learning_rate": 8.137670481673045e-05, + "loss": 1.8459, + "step": 9956 + }, + { + "epoch": 3.056169429097606, + "grad_norm": 0.42896488308906555, + "learning_rate": 8.137283463805945e-05, + "loss": 1.7814, + "step": 9957 + }, + { + "epoch": 3.056476365868631, + "grad_norm": 0.38993972539901733, + "learning_rate": 8.136896414934372e-05, + "loss": 1.7636, + "step": 9958 + }, + { + "epoch": 3.0567833026396563, + "grad_norm": 0.31362372636795044, + "learning_rate": 8.13650933506215e-05, + "loss": 1.8021, + "step": 9959 + }, + { + "epoch": 3.0570902394106816, + "grad_norm": 0.27980196475982666, + "learning_rate": 8.136122224193103e-05, + "loss": 1.8445, + "step": 9960 + }, + { + "epoch": 3.0573971761817065, + "grad_norm": 0.2721461057662964, + "learning_rate": 8.135735082331059e-05, + "loss": 1.7614, + "step": 9961 + }, + { + "epoch": 3.057704112952732, + "grad_norm": 0.25157424807548523, + "learning_rate": 8.135347909479843e-05, + "loss": 1.7598, + "step": 9962 + }, + { + "epoch": 3.0580110497237567, + "grad_norm": 0.25798025727272034, + "learning_rate": 8.13496070564328e-05, + "loss": 1.7823, + "step": 9963 + }, + { + "epoch": 3.058317986494782, + "grad_norm": 0.30775198340415955, + "learning_rate": 8.134573470825199e-05, + "loss": 1.7755, + "step": 9964 + }, + { + "epoch": 3.0586249232658074, + "grad_norm": 0.28916797041893005, + "learning_rate": 8.134186205029426e-05, + "loss": 1.8189, + "step": 9965 + }, + { + "epoch": 
3.0589318600368323, + "grad_norm": 0.2829149067401886, + "learning_rate": 8.133798908259787e-05, + "loss": 1.8546, + "step": 9966 + }, + { + "epoch": 3.0592387968078576, + "grad_norm": 0.2884117662906647, + "learning_rate": 8.13341158052011e-05, + "loss": 1.7705, + "step": 9967 + }, + { + "epoch": 3.059545733578883, + "grad_norm": 0.28311973810195923, + "learning_rate": 8.133024221814225e-05, + "loss": 1.8147, + "step": 9968 + }, + { + "epoch": 3.059852670349908, + "grad_norm": 0.25405213236808777, + "learning_rate": 8.132636832145957e-05, + "loss": 1.7813, + "step": 9969 + }, + { + "epoch": 3.060159607120933, + "grad_norm": 0.3082229793071747, + "learning_rate": 8.132249411519137e-05, + "loss": 1.8536, + "step": 9970 + }, + { + "epoch": 3.060466543891958, + "grad_norm": 0.29918181896209717, + "learning_rate": 8.13186195993759e-05, + "loss": 1.8181, + "step": 9971 + }, + { + "epoch": 3.0607734806629834, + "grad_norm": 0.3025238811969757, + "learning_rate": 8.13147447740515e-05, + "loss": 1.7785, + "step": 9972 + }, + { + "epoch": 3.0610804174340087, + "grad_norm": 0.2798222303390503, + "learning_rate": 8.131086963925643e-05, + "loss": 1.7873, + "step": 9973 + }, + { + "epoch": 3.0613873542050336, + "grad_norm": 0.32636210322380066, + "learning_rate": 8.130699419502898e-05, + "loss": 1.882, + "step": 9974 + }, + { + "epoch": 3.061694290976059, + "grad_norm": 0.27722054719924927, + "learning_rate": 8.130311844140748e-05, + "loss": 1.7788, + "step": 9975 + }, + { + "epoch": 3.0620012277470843, + "grad_norm": 0.289156436920166, + "learning_rate": 8.129924237843023e-05, + "loss": 1.8591, + "step": 9976 + }, + { + "epoch": 3.062308164518109, + "grad_norm": 0.2839665412902832, + "learning_rate": 8.12953660061355e-05, + "loss": 1.8255, + "step": 9977 + }, + { + "epoch": 3.0626151012891345, + "grad_norm": 0.2650148272514343, + "learning_rate": 8.129148932456161e-05, + "loss": 1.8353, + "step": 9978 + }, + { + "epoch": 3.06292203806016, + "grad_norm": 0.2884560227394104, + 
"learning_rate": 8.128761233374691e-05, + "loss": 1.8099, + "step": 9979 + }, + { + "epoch": 3.0632289748311847, + "grad_norm": 0.2610029876232147, + "learning_rate": 8.128373503372967e-05, + "loss": 1.8173, + "step": 9980 + }, + { + "epoch": 3.06353591160221, + "grad_norm": 0.32512393593788147, + "learning_rate": 8.127985742454822e-05, + "loss": 1.8619, + "step": 9981 + }, + { + "epoch": 3.063842848373235, + "grad_norm": 0.3382968604564667, + "learning_rate": 8.127597950624091e-05, + "loss": 1.831, + "step": 9982 + }, + { + "epoch": 3.0641497851442603, + "grad_norm": 0.33773133158683777, + "learning_rate": 8.127210127884602e-05, + "loss": 1.8194, + "step": 9983 + }, + { + "epoch": 3.0644567219152856, + "grad_norm": 0.31642746925354004, + "learning_rate": 8.126822274240188e-05, + "loss": 1.8782, + "step": 9984 + }, + { + "epoch": 3.0647636586863105, + "grad_norm": 0.2476506233215332, + "learning_rate": 8.126434389694686e-05, + "loss": 1.7866, + "step": 9985 + }, + { + "epoch": 3.065070595457336, + "grad_norm": 0.27296319603919983, + "learning_rate": 8.126046474251927e-05, + "loss": 1.8276, + "step": 9986 + }, + { + "epoch": 3.0653775322283607, + "grad_norm": 0.353865385055542, + "learning_rate": 8.125658527915744e-05, + "loss": 1.9525, + "step": 9987 + }, + { + "epoch": 3.065684468999386, + "grad_norm": 0.370256632566452, + "learning_rate": 8.12527055068997e-05, + "loss": 1.8514, + "step": 9988 + }, + { + "epoch": 3.0659914057704114, + "grad_norm": 0.30738842487335205, + "learning_rate": 8.124882542578442e-05, + "loss": 1.8125, + "step": 9989 + }, + { + "epoch": 3.0662983425414363, + "grad_norm": 0.3151233494281769, + "learning_rate": 8.124494503584995e-05, + "loss": 1.8165, + "step": 9990 + }, + { + "epoch": 3.0666052793124616, + "grad_norm": 0.29071590304374695, + "learning_rate": 8.124106433713458e-05, + "loss": 1.7617, + "step": 9991 + }, + { + "epoch": 3.066912216083487, + "grad_norm": 0.2898697853088379, + "learning_rate": 8.123718332967672e-05, + "loss": 
1.7779, + "step": 9992 + }, + { + "epoch": 3.067219152854512, + "grad_norm": 0.26601701974868774, + "learning_rate": 8.123330201351471e-05, + "loss": 1.8307, + "step": 9993 + }, + { + "epoch": 3.067526089625537, + "grad_norm": 0.2622119188308716, + "learning_rate": 8.12294203886869e-05, + "loss": 1.7958, + "step": 9994 + }, + { + "epoch": 3.0678330263965625, + "grad_norm": 0.29709386825561523, + "learning_rate": 8.122553845523166e-05, + "loss": 1.7799, + "step": 9995 + }, + { + "epoch": 3.0681399631675874, + "grad_norm": 0.31267789006233215, + "learning_rate": 8.122165621318733e-05, + "loss": 1.8149, + "step": 9996 + }, + { + "epoch": 3.0684468999386127, + "grad_norm": 0.3076523244380951, + "learning_rate": 8.121777366259232e-05, + "loss": 1.7701, + "step": 9997 + }, + { + "epoch": 3.0687538367096376, + "grad_norm": 0.30096009373664856, + "learning_rate": 8.121389080348496e-05, + "loss": 1.8323, + "step": 9998 + }, + { + "epoch": 3.069060773480663, + "grad_norm": 0.25739142298698425, + "learning_rate": 8.121000763590363e-05, + "loss": 1.8105, + "step": 9999 + }, + { + "epoch": 3.0693677102516883, + "grad_norm": 0.2780844271183014, + "learning_rate": 8.120612415988671e-05, + "loss": 1.8502, + "step": 10000 + }, + { + "epoch": 3.069674647022713, + "grad_norm": 0.3316378593444824, + "learning_rate": 8.120224037547259e-05, + "loss": 1.8244, + "step": 10001 + }, + { + "epoch": 3.0699815837937385, + "grad_norm": 0.261129766702652, + "learning_rate": 8.119835628269964e-05, + "loss": 1.7769, + "step": 10002 + }, + { + "epoch": 3.070288520564764, + "grad_norm": 0.29213985800743103, + "learning_rate": 8.119447188160625e-05, + "loss": 1.7717, + "step": 10003 + }, + { + "epoch": 3.0705954573357888, + "grad_norm": 0.38545623421669006, + "learning_rate": 8.11905871722308e-05, + "loss": 1.8433, + "step": 10004 + }, + { + "epoch": 3.070902394106814, + "grad_norm": 0.3617223799228668, + "learning_rate": 8.118670215461168e-05, + "loss": 1.8172, + "step": 10005 + }, + { + "epoch": 
3.071209330877839, + "grad_norm": 0.3241543769836426, + "learning_rate": 8.11828168287873e-05, + "loss": 1.8325, + "step": 10006 + }, + { + "epoch": 3.0715162676488643, + "grad_norm": 0.3538578152656555, + "learning_rate": 8.117893119479605e-05, + "loss": 1.8188, + "step": 10007 + }, + { + "epoch": 3.0718232044198897, + "grad_norm": 0.3861970603466034, + "learning_rate": 8.117504525267632e-05, + "loss": 1.8518, + "step": 10008 + }, + { + "epoch": 3.0721301411909145, + "grad_norm": 0.35433146357536316, + "learning_rate": 8.117115900246652e-05, + "loss": 1.8601, + "step": 10009 + }, + { + "epoch": 3.07243707796194, + "grad_norm": 0.29796987771987915, + "learning_rate": 8.116727244420507e-05, + "loss": 1.7934, + "step": 10010 + }, + { + "epoch": 3.072744014732965, + "grad_norm": 0.3091779947280884, + "learning_rate": 8.116338557793035e-05, + "loss": 1.8111, + "step": 10011 + }, + { + "epoch": 3.07305095150399, + "grad_norm": 0.2741319537162781, + "learning_rate": 8.11594984036808e-05, + "loss": 1.8079, + "step": 10012 + }, + { + "epoch": 3.0733578882750154, + "grad_norm": 0.28905320167541504, + "learning_rate": 8.115561092149482e-05, + "loss": 1.8475, + "step": 10013 + }, + { + "epoch": 3.0736648250460403, + "grad_norm": 0.2897081673145294, + "learning_rate": 8.115172313141081e-05, + "loss": 1.838, + "step": 10014 + }, + { + "epoch": 3.0739717618170657, + "grad_norm": 0.2620783746242523, + "learning_rate": 8.114783503346725e-05, + "loss": 1.8024, + "step": 10015 + }, + { + "epoch": 3.074278698588091, + "grad_norm": 0.26478636264801025, + "learning_rate": 8.11439466277025e-05, + "loss": 1.8137, + "step": 10016 + }, + { + "epoch": 3.074585635359116, + "grad_norm": 0.2796174883842468, + "learning_rate": 8.114005791415502e-05, + "loss": 1.7976, + "step": 10017 + }, + { + "epoch": 3.074892572130141, + "grad_norm": 0.26813286542892456, + "learning_rate": 8.113616889286325e-05, + "loss": 1.7945, + "step": 10018 + }, + { + "epoch": 3.0751995089011666, + "grad_norm": 
0.2443828582763672, + "learning_rate": 8.11322795638656e-05, + "loss": 1.7829, + "step": 10019 + }, + { + "epoch": 3.0755064456721914, + "grad_norm": 0.2981395423412323, + "learning_rate": 8.112838992720053e-05, + "loss": 1.7928, + "step": 10020 + }, + { + "epoch": 3.075813382443217, + "grad_norm": 0.25605037808418274, + "learning_rate": 8.112449998290644e-05, + "loss": 1.8129, + "step": 10021 + }, + { + "epoch": 3.0761203192142417, + "grad_norm": 0.31180307269096375, + "learning_rate": 8.112060973102181e-05, + "loss": 1.7393, + "step": 10022 + }, + { + "epoch": 3.076427255985267, + "grad_norm": 0.3230421543121338, + "learning_rate": 8.111671917158508e-05, + "loss": 1.818, + "step": 10023 + }, + { + "epoch": 3.0767341927562923, + "grad_norm": 0.3158549964427948, + "learning_rate": 8.111282830463468e-05, + "loss": 1.7582, + "step": 10024 + }, + { + "epoch": 3.0770411295273172, + "grad_norm": 0.24524325132369995, + "learning_rate": 8.110893713020908e-05, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 3.0773480662983426, + "grad_norm": 0.2793932259082794, + "learning_rate": 8.110504564834675e-05, + "loss": 1.8551, + "step": 10026 + }, + { + "epoch": 3.077655003069368, + "grad_norm": 0.29629403352737427, + "learning_rate": 8.110115385908612e-05, + "loss": 1.8019, + "step": 10027 + }, + { + "epoch": 3.077961939840393, + "grad_norm": 0.3138490915298462, + "learning_rate": 8.109726176246564e-05, + "loss": 1.8436, + "step": 10028 + }, + { + "epoch": 3.078268876611418, + "grad_norm": 0.29802024364471436, + "learning_rate": 8.10933693585238e-05, + "loss": 1.8158, + "step": 10029 + }, + { + "epoch": 3.078575813382443, + "grad_norm": 0.30785220861434937, + "learning_rate": 8.108947664729907e-05, + "loss": 1.8674, + "step": 10030 + }, + { + "epoch": 3.0788827501534684, + "grad_norm": 0.277662992477417, + "learning_rate": 8.10855836288299e-05, + "loss": 1.8253, + "step": 10031 + }, + { + "epoch": 3.0791896869244937, + "grad_norm": 0.27399590611457825, + "learning_rate": 
8.108169030315477e-05, + "loss": 1.8587, + "step": 10032 + }, + { + "epoch": 3.0794966236955186, + "grad_norm": 0.28398239612579346, + "learning_rate": 8.107779667031217e-05, + "loss": 1.8326, + "step": 10033 + }, + { + "epoch": 3.079803560466544, + "grad_norm": 0.2882741093635559, + "learning_rate": 8.107390273034057e-05, + "loss": 1.785, + "step": 10034 + }, + { + "epoch": 3.0801104972375692, + "grad_norm": 0.271043598651886, + "learning_rate": 8.107000848327843e-05, + "loss": 1.765, + "step": 10035 + }, + { + "epoch": 3.080417434008594, + "grad_norm": 0.2589638829231262, + "learning_rate": 8.106611392916427e-05, + "loss": 1.8136, + "step": 10036 + }, + { + "epoch": 3.0807243707796195, + "grad_norm": 0.3068227469921112, + "learning_rate": 8.106221906803656e-05, + "loss": 1.8034, + "step": 10037 + }, + { + "epoch": 3.0810313075506444, + "grad_norm": 0.2714168131351471, + "learning_rate": 8.105832389993379e-05, + "loss": 1.8007, + "step": 10038 + }, + { + "epoch": 3.0813382443216697, + "grad_norm": 0.2747504711151123, + "learning_rate": 8.105442842489447e-05, + "loss": 1.8135, + "step": 10039 + }, + { + "epoch": 3.081645181092695, + "grad_norm": 0.2719285488128662, + "learning_rate": 8.105053264295708e-05, + "loss": 1.7629, + "step": 10040 + }, + { + "epoch": 3.08195211786372, + "grad_norm": 0.3119582235813141, + "learning_rate": 8.104663655416014e-05, + "loss": 1.7887, + "step": 10041 + }, + { + "epoch": 3.0822590546347453, + "grad_norm": 0.35965192317962646, + "learning_rate": 8.104274015854212e-05, + "loss": 1.8484, + "step": 10042 + }, + { + "epoch": 3.0825659914057706, + "grad_norm": 0.3045980632305145, + "learning_rate": 8.103884345614157e-05, + "loss": 1.8625, + "step": 10043 + }, + { + "epoch": 3.0828729281767955, + "grad_norm": 0.2925138473510742, + "learning_rate": 8.103494644699696e-05, + "loss": 1.9306, + "step": 10044 + }, + { + "epoch": 3.083179864947821, + "grad_norm": 0.2894277274608612, + "learning_rate": 8.103104913114681e-05, + "loss": 1.7796, + 
"step": 10045 + }, + { + "epoch": 3.0834868017188457, + "grad_norm": 0.2776826322078705, + "learning_rate": 8.102715150862967e-05, + "loss": 1.8169, + "step": 10046 + }, + { + "epoch": 3.083793738489871, + "grad_norm": 0.3315230906009674, + "learning_rate": 8.102325357948402e-05, + "loss": 1.8139, + "step": 10047 + }, + { + "epoch": 3.0841006752608964, + "grad_norm": 0.2906761169433594, + "learning_rate": 8.10193553437484e-05, + "loss": 1.8162, + "step": 10048 + }, + { + "epoch": 3.0844076120319213, + "grad_norm": 0.32681339979171753, + "learning_rate": 8.101545680146132e-05, + "loss": 1.8245, + "step": 10049 + }, + { + "epoch": 3.0847145488029466, + "grad_norm": 0.32525795698165894, + "learning_rate": 8.101155795266131e-05, + "loss": 1.8605, + "step": 10050 + }, + { + "epoch": 3.085021485573972, + "grad_norm": 0.31705379486083984, + "learning_rate": 8.100765879738692e-05, + "loss": 1.8214, + "step": 10051 + }, + { + "epoch": 3.085328422344997, + "grad_norm": 0.27772918343544006, + "learning_rate": 8.100375933567668e-05, + "loss": 1.7822, + "step": 10052 + }, + { + "epoch": 3.085635359116022, + "grad_norm": 0.2877809405326843, + "learning_rate": 8.09998595675691e-05, + "loss": 1.7935, + "step": 10053 + }, + { + "epoch": 3.0859422958870475, + "grad_norm": 0.29759806394577026, + "learning_rate": 8.099595949310276e-05, + "loss": 1.8041, + "step": 10054 + }, + { + "epoch": 3.0862492326580724, + "grad_norm": 0.2715320289134979, + "learning_rate": 8.099205911231617e-05, + "loss": 1.7923, + "step": 10055 + }, + { + "epoch": 3.0865561694290977, + "grad_norm": 0.33566340804100037, + "learning_rate": 8.098815842524789e-05, + "loss": 1.7953, + "step": 10056 + }, + { + "epoch": 3.0868631062001226, + "grad_norm": 0.3360871970653534, + "learning_rate": 8.098425743193645e-05, + "loss": 1.8275, + "step": 10057 + }, + { + "epoch": 3.087170042971148, + "grad_norm": 0.2797739803791046, + "learning_rate": 8.098035613242043e-05, + "loss": 1.7597, + "step": 10058 + }, + { + "epoch": 
3.0874769797421733, + "grad_norm": 0.25500187277793884, + "learning_rate": 8.097645452673837e-05, + "loss": 1.8059, + "step": 10059 + }, + { + "epoch": 3.087783916513198, + "grad_norm": 0.28042587637901306, + "learning_rate": 8.097255261492884e-05, + "loss": 1.7954, + "step": 10060 + }, + { + "epoch": 3.0880908532842235, + "grad_norm": 0.3616262376308441, + "learning_rate": 8.096865039703038e-05, + "loss": 1.8605, + "step": 10061 + }, + { + "epoch": 3.0883977900552484, + "grad_norm": 0.3453714847564697, + "learning_rate": 8.096474787308157e-05, + "loss": 1.7643, + "step": 10062 + }, + { + "epoch": 3.0887047268262737, + "grad_norm": 0.3192278742790222, + "learning_rate": 8.096084504312098e-05, + "loss": 1.8415, + "step": 10063 + }, + { + "epoch": 3.089011663597299, + "grad_norm": 0.2714482545852661, + "learning_rate": 8.095694190718715e-05, + "loss": 1.8204, + "step": 10064 + }, + { + "epoch": 3.089318600368324, + "grad_norm": 0.26562005281448364, + "learning_rate": 8.09530384653187e-05, + "loss": 1.7322, + "step": 10065 + }, + { + "epoch": 3.0896255371393493, + "grad_norm": 0.33727800846099854, + "learning_rate": 8.094913471755417e-05, + "loss": 1.8221, + "step": 10066 + }, + { + "epoch": 3.0899324739103746, + "grad_norm": 0.3561044931411743, + "learning_rate": 8.094523066393215e-05, + "loss": 1.8879, + "step": 10067 + }, + { + "epoch": 3.0902394106813995, + "grad_norm": 0.2568742334842682, + "learning_rate": 8.094132630449122e-05, + "loss": 1.8178, + "step": 10068 + }, + { + "epoch": 3.090546347452425, + "grad_norm": 0.4025525450706482, + "learning_rate": 8.093742163926998e-05, + "loss": 1.8186, + "step": 10069 + }, + { + "epoch": 3.09085328422345, + "grad_norm": 0.43863433599472046, + "learning_rate": 8.0933516668307e-05, + "loss": 1.8371, + "step": 10070 + }, + { + "epoch": 3.091160220994475, + "grad_norm": 0.34873950481414795, + "learning_rate": 8.092961139164087e-05, + "loss": 1.8083, + "step": 10071 + }, + { + "epoch": 3.0914671577655004, + "grad_norm": 
0.31433534622192383, + "learning_rate": 8.092570580931021e-05, + "loss": 1.8154, + "step": 10072 + }, + { + "epoch": 3.0917740945365253, + "grad_norm": 0.25523966550827026, + "learning_rate": 8.092179992135358e-05, + "loss": 1.8158, + "step": 10073 + }, + { + "epoch": 3.0920810313075506, + "grad_norm": 0.348469078540802, + "learning_rate": 8.09178937278096e-05, + "loss": 1.8358, + "step": 10074 + }, + { + "epoch": 3.092387968078576, + "grad_norm": 0.33455297350883484, + "learning_rate": 8.091398722871688e-05, + "loss": 1.7779, + "step": 10075 + }, + { + "epoch": 3.092694904849601, + "grad_norm": 0.36544880270957947, + "learning_rate": 8.091008042411403e-05, + "loss": 1.9186, + "step": 10076 + }, + { + "epoch": 3.093001841620626, + "grad_norm": 0.29165831208229065, + "learning_rate": 8.090617331403965e-05, + "loss": 1.8964, + "step": 10077 + }, + { + "epoch": 3.0933087783916515, + "grad_norm": 0.31011059880256653, + "learning_rate": 8.090226589853234e-05, + "loss": 1.8453, + "step": 10078 + }, + { + "epoch": 3.0936157151626764, + "grad_norm": 0.2835703492164612, + "learning_rate": 8.089835817763071e-05, + "loss": 1.7718, + "step": 10079 + }, + { + "epoch": 3.0939226519337018, + "grad_norm": 0.2910583019256592, + "learning_rate": 8.08944501513734e-05, + "loss": 1.7881, + "step": 10080 + }, + { + "epoch": 3.0942295887047266, + "grad_norm": 0.391303688287735, + "learning_rate": 8.089054181979905e-05, + "loss": 1.7915, + "step": 10081 + }, + { + "epoch": 3.094536525475752, + "grad_norm": 0.4119330048561096, + "learning_rate": 8.088663318294623e-05, + "loss": 1.7975, + "step": 10082 + }, + { + "epoch": 3.0948434622467773, + "grad_norm": 0.2980102002620697, + "learning_rate": 8.088272424085361e-05, + "loss": 1.805, + "step": 10083 + }, + { + "epoch": 3.095150399017802, + "grad_norm": 0.3089980483055115, + "learning_rate": 8.087881499355983e-05, + "loss": 1.8265, + "step": 10084 + }, + { + "epoch": 3.0954573357888275, + "grad_norm": 0.3851003348827362, + "learning_rate": 
8.087490544110348e-05, + "loss": 1.8174, + "step": 10085 + }, + { + "epoch": 3.095764272559853, + "grad_norm": 0.42357420921325684, + "learning_rate": 8.08709955835232e-05, + "loss": 1.8083, + "step": 10086 + }, + { + "epoch": 3.0960712093308778, + "grad_norm": 0.291777640581131, + "learning_rate": 8.086708542085768e-05, + "loss": 1.7713, + "step": 10087 + }, + { + "epoch": 3.096378146101903, + "grad_norm": 0.2563805878162384, + "learning_rate": 8.086317495314552e-05, + "loss": 1.7691, + "step": 10088 + }, + { + "epoch": 3.096685082872928, + "grad_norm": 0.3418877422809601, + "learning_rate": 8.085926418042536e-05, + "loss": 1.8547, + "step": 10089 + }, + { + "epoch": 3.0969920196439533, + "grad_norm": 0.3859385550022125, + "learning_rate": 8.085535310273589e-05, + "loss": 1.8226, + "step": 10090 + }, + { + "epoch": 3.0972989564149787, + "grad_norm": 0.3427267372608185, + "learning_rate": 8.085144172011571e-05, + "loss": 1.837, + "step": 10091 + }, + { + "epoch": 3.0976058931860035, + "grad_norm": 0.29290953278541565, + "learning_rate": 8.084753003260352e-05, + "loss": 1.8392, + "step": 10092 + }, + { + "epoch": 3.097912829957029, + "grad_norm": 0.33282020688056946, + "learning_rate": 8.084361804023795e-05, + "loss": 1.8351, + "step": 10093 + }, + { + "epoch": 3.098219766728054, + "grad_norm": 0.3802134394645691, + "learning_rate": 8.083970574305768e-05, + "loss": 1.7467, + "step": 10094 + }, + { + "epoch": 3.098526703499079, + "grad_norm": 0.3142111897468567, + "learning_rate": 8.083579314110135e-05, + "loss": 1.7966, + "step": 10095 + }, + { + "epoch": 3.0988336402701044, + "grad_norm": 0.2956278324127197, + "learning_rate": 8.083188023440765e-05, + "loss": 1.8724, + "step": 10096 + }, + { + "epoch": 3.0991405770411293, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.082796702301522e-05, + "loss": 1.8448, + "step": 10097 + }, + { + "epoch": 3.0994475138121547, + "grad_norm": 0.29358017444610596, + "learning_rate": 8.082405350696276e-05, + "loss": 1.8679, 
+ "step": 10098 + }, + { + "epoch": 3.09975445058318, + "grad_norm": 0.36439722776412964, + "learning_rate": 8.082013968628893e-05, + "loss": 1.8801, + "step": 10099 + }, + { + "epoch": 3.100061387354205, + "grad_norm": 0.3565322458744049, + "learning_rate": 8.081622556103244e-05, + "loss": 1.794, + "step": 10100 + }, + { + "epoch": 3.1003683241252302, + "grad_norm": 0.2841760814189911, + "learning_rate": 8.081231113123191e-05, + "loss": 1.7593, + "step": 10101 + }, + { + "epoch": 3.1006752608962556, + "grad_norm": 0.28589630126953125, + "learning_rate": 8.080839639692608e-05, + "loss": 1.864, + "step": 10102 + }, + { + "epoch": 3.1009821976672804, + "grad_norm": 0.3595057427883148, + "learning_rate": 8.080448135815362e-05, + "loss": 1.8067, + "step": 10103 + }, + { + "epoch": 3.101289134438306, + "grad_norm": 0.3909708261489868, + "learning_rate": 8.080056601495322e-05, + "loss": 1.8601, + "step": 10104 + }, + { + "epoch": 3.1015960712093307, + "grad_norm": 0.35180148482322693, + "learning_rate": 8.079665036736358e-05, + "loss": 1.8328, + "step": 10105 + }, + { + "epoch": 3.101903007980356, + "grad_norm": 0.3065175712108612, + "learning_rate": 8.079273441542338e-05, + "loss": 1.8449, + "step": 10106 + }, + { + "epoch": 3.1022099447513813, + "grad_norm": 0.31358617544174194, + "learning_rate": 8.078881815917134e-05, + "loss": 1.8325, + "step": 10107 + }, + { + "epoch": 3.1025168815224062, + "grad_norm": 0.4737118184566498, + "learning_rate": 8.078490159864614e-05, + "loss": 1.8232, + "step": 10108 + }, + { + "epoch": 3.1028238182934316, + "grad_norm": 0.435148686170578, + "learning_rate": 8.078098473388651e-05, + "loss": 1.8227, + "step": 10109 + }, + { + "epoch": 3.103130755064457, + "grad_norm": 0.3080987334251404, + "learning_rate": 8.077706756493115e-05, + "loss": 1.8072, + "step": 10110 + }, + { + "epoch": 3.103437691835482, + "grad_norm": 0.3225170075893402, + "learning_rate": 8.077315009181876e-05, + "loss": 1.7716, + "step": 10111 + }, + { + "epoch": 
3.103744628606507, + "grad_norm": 0.46642443537712097, + "learning_rate": 8.076923231458808e-05, + "loss": 1.8295, + "step": 10112 + }, + { + "epoch": 3.104051565377532, + "grad_norm": 0.42561766505241394, + "learning_rate": 8.07653142332778e-05, + "loss": 1.8553, + "step": 10113 + }, + { + "epoch": 3.1043585021485574, + "grad_norm": 0.27187541127204895, + "learning_rate": 8.076139584792664e-05, + "loss": 1.7937, + "step": 10114 + }, + { + "epoch": 3.1046654389195827, + "grad_norm": 0.27822238206863403, + "learning_rate": 8.075747715857335e-05, + "loss": 1.8151, + "step": 10115 + }, + { + "epoch": 3.1049723756906076, + "grad_norm": 0.40106478333473206, + "learning_rate": 8.075355816525665e-05, + "loss": 1.8637, + "step": 10116 + }, + { + "epoch": 3.105279312461633, + "grad_norm": 0.33455124497413635, + "learning_rate": 8.074963886801525e-05, + "loss": 1.8543, + "step": 10117 + }, + { + "epoch": 3.1055862492326582, + "grad_norm": 0.32246437668800354, + "learning_rate": 8.07457192668879e-05, + "loss": 1.7907, + "step": 10118 + }, + { + "epoch": 3.105893186003683, + "grad_norm": 0.45360109210014343, + "learning_rate": 8.074179936191332e-05, + "loss": 1.7404, + "step": 10119 + }, + { + "epoch": 3.1062001227747085, + "grad_norm": 0.445916086435318, + "learning_rate": 8.07378791531303e-05, + "loss": 1.778, + "step": 10120 + }, + { + "epoch": 3.1065070595457334, + "grad_norm": 0.28561538457870483, + "learning_rate": 8.073395864057751e-05, + "loss": 1.8723, + "step": 10121 + }, + { + "epoch": 3.1068139963167587, + "grad_norm": 0.3258218467235565, + "learning_rate": 8.073003782429373e-05, + "loss": 1.8106, + "step": 10122 + }, + { + "epoch": 3.107120933087784, + "grad_norm": 0.5459560751914978, + "learning_rate": 8.07261167043177e-05, + "loss": 1.8022, + "step": 10123 + }, + { + "epoch": 3.107427869858809, + "grad_norm": 0.4828549921512604, + "learning_rate": 8.072219528068819e-05, + "loss": 1.7556, + "step": 10124 + }, + { + "epoch": 3.1077348066298343, + "grad_norm": 
0.24075324833393097, + "learning_rate": 8.071827355344393e-05, + "loss": 1.7901, + "step": 10125 + }, + { + "epoch": 3.1080417434008596, + "grad_norm": 0.44677188992500305, + "learning_rate": 8.071435152262367e-05, + "loss": 1.7858, + "step": 10126 + }, + { + "epoch": 3.1083486801718845, + "grad_norm": 0.49862590432167053, + "learning_rate": 8.071042918826622e-05, + "loss": 1.805, + "step": 10127 + }, + { + "epoch": 3.10865561694291, + "grad_norm": 0.30883491039276123, + "learning_rate": 8.07065065504103e-05, + "loss": 1.7693, + "step": 10128 + }, + { + "epoch": 3.108962553713935, + "grad_norm": 0.29583030939102173, + "learning_rate": 8.070258360909467e-05, + "loss": 1.8141, + "step": 10129 + }, + { + "epoch": 3.10926949048496, + "grad_norm": 0.3595346510410309, + "learning_rate": 8.069866036435812e-05, + "loss": 1.8286, + "step": 10130 + }, + { + "epoch": 3.1095764272559854, + "grad_norm": 0.3215504288673401, + "learning_rate": 8.069473681623942e-05, + "loss": 1.8557, + "step": 10131 + }, + { + "epoch": 3.1098833640270103, + "grad_norm": 0.29734939336776733, + "learning_rate": 8.069081296477734e-05, + "loss": 1.7996, + "step": 10132 + }, + { + "epoch": 3.1101903007980356, + "grad_norm": 0.33546003699302673, + "learning_rate": 8.068688881001065e-05, + "loss": 1.8307, + "step": 10133 + }, + { + "epoch": 3.110497237569061, + "grad_norm": 0.3886832296848297, + "learning_rate": 8.068296435197814e-05, + "loss": 1.751, + "step": 10134 + }, + { + "epoch": 3.110804174340086, + "grad_norm": 0.34505394101142883, + "learning_rate": 8.06790395907186e-05, + "loss": 1.7543, + "step": 10135 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.27018141746520996, + "learning_rate": 8.06751145262708e-05, + "loss": 1.8109, + "step": 10136 + }, + { + "epoch": 3.1114180478821365, + "grad_norm": 0.3367149531841278, + "learning_rate": 8.067118915867355e-05, + "loss": 1.8025, + "step": 10137 + }, + { + "epoch": 3.1117249846531614, + "grad_norm": 0.40811091661453247, + "learning_rate": 
8.066726348796562e-05, + "loss": 1.7327, + "step": 10138 + }, + { + "epoch": 3.1120319214241867, + "grad_norm": 0.3511471152305603, + "learning_rate": 8.066333751418583e-05, + "loss": 1.8711, + "step": 10139 + }, + { + "epoch": 3.1123388581952116, + "grad_norm": 0.3112446367740631, + "learning_rate": 8.065941123737295e-05, + "loss": 1.8621, + "step": 10140 + }, + { + "epoch": 3.112645794966237, + "grad_norm": 0.3424238860607147, + "learning_rate": 8.065548465756581e-05, + "loss": 1.8383, + "step": 10141 + }, + { + "epoch": 3.1129527317372623, + "grad_norm": 0.380013108253479, + "learning_rate": 8.06515577748032e-05, + "loss": 1.8121, + "step": 10142 + }, + { + "epoch": 3.113259668508287, + "grad_norm": 0.2650558650493622, + "learning_rate": 8.064763058912393e-05, + "loss": 1.866, + "step": 10143 + }, + { + "epoch": 3.1135666052793125, + "grad_norm": 0.30580762028694153, + "learning_rate": 8.06437031005668e-05, + "loss": 1.7769, + "step": 10144 + }, + { + "epoch": 3.113873542050338, + "grad_norm": 0.29927194118499756, + "learning_rate": 8.063977530917066e-05, + "loss": 1.7897, + "step": 10145 + }, + { + "epoch": 3.1141804788213627, + "grad_norm": 0.24322012066841125, + "learning_rate": 8.063584721497429e-05, + "loss": 1.7968, + "step": 10146 + }, + { + "epoch": 3.114487415592388, + "grad_norm": 0.3082945644855499, + "learning_rate": 8.063191881801651e-05, + "loss": 1.8456, + "step": 10147 + }, + { + "epoch": 3.114794352363413, + "grad_norm": 0.3247329890727997, + "learning_rate": 8.062799011833617e-05, + "loss": 1.7436, + "step": 10148 + }, + { + "epoch": 3.1151012891344383, + "grad_norm": 0.27591946721076965, + "learning_rate": 8.062406111597207e-05, + "loss": 1.7976, + "step": 10149 + }, + { + "epoch": 3.1154082259054636, + "grad_norm": 0.2752058804035187, + "learning_rate": 8.062013181096306e-05, + "loss": 1.7814, + "step": 10150 + }, + { + "epoch": 3.1157151626764885, + "grad_norm": 0.3207196891307831, + "learning_rate": 8.061620220334795e-05, + "loss": 1.7767, 
+ "step": 10151 + }, + { + "epoch": 3.116022099447514, + "grad_norm": 0.2895309627056122, + "learning_rate": 8.061227229316559e-05, + "loss": 1.8588, + "step": 10152 + }, + { + "epoch": 3.116329036218539, + "grad_norm": 0.333843469619751, + "learning_rate": 8.060834208045481e-05, + "loss": 1.7871, + "step": 10153 + }, + { + "epoch": 3.116635972989564, + "grad_norm": 0.43877774477005005, + "learning_rate": 8.060441156525445e-05, + "loss": 1.8165, + "step": 10154 + }, + { + "epoch": 3.1169429097605894, + "grad_norm": 0.35700589418411255, + "learning_rate": 8.060048074760337e-05, + "loss": 1.777, + "step": 10155 + }, + { + "epoch": 3.1172498465316143, + "grad_norm": 0.26124534010887146, + "learning_rate": 8.059654962754039e-05, + "loss": 1.8343, + "step": 10156 + }, + { + "epoch": 3.1175567833026396, + "grad_norm": 0.331444650888443, + "learning_rate": 8.059261820510438e-05, + "loss": 1.9437, + "step": 10157 + }, + { + "epoch": 3.117863720073665, + "grad_norm": 0.31657731533050537, + "learning_rate": 8.058868648033419e-05, + "loss": 1.7621, + "step": 10158 + }, + { + "epoch": 3.11817065684469, + "grad_norm": 0.2785957455635071, + "learning_rate": 8.058475445326867e-05, + "loss": 1.9049, + "step": 10159 + }, + { + "epoch": 3.118477593615715, + "grad_norm": 0.2605743408203125, + "learning_rate": 8.058082212394667e-05, + "loss": 1.7895, + "step": 10160 + }, + { + "epoch": 3.1187845303867405, + "grad_norm": 0.2981378138065338, + "learning_rate": 8.057688949240707e-05, + "loss": 1.8373, + "step": 10161 + }, + { + "epoch": 3.1190914671577654, + "grad_norm": 0.2944273054599762, + "learning_rate": 8.057295655868873e-05, + "loss": 1.8373, + "step": 10162 + }, + { + "epoch": 3.1193984039287908, + "grad_norm": 0.2696721851825714, + "learning_rate": 8.056902332283052e-05, + "loss": 1.8023, + "step": 10163 + }, + { + "epoch": 3.1197053406998156, + "grad_norm": 0.27659857273101807, + "learning_rate": 8.056508978487128e-05, + "loss": 1.8453, + "step": 10164 + }, + { + "epoch": 
3.120012277470841, + "grad_norm": 0.2982441186904907, + "learning_rate": 8.056115594484992e-05, + "loss": 1.9072, + "step": 10165 + }, + { + "epoch": 3.1203192142418663, + "grad_norm": 0.3136404752731323, + "learning_rate": 8.055722180280531e-05, + "loss": 1.8585, + "step": 10166 + }, + { + "epoch": 3.120626151012891, + "grad_norm": 0.2979940176010132, + "learning_rate": 8.055328735877631e-05, + "loss": 1.8699, + "step": 10167 + }, + { + "epoch": 3.1209330877839165, + "grad_norm": 0.2585618793964386, + "learning_rate": 8.054935261280184e-05, + "loss": 1.8323, + "step": 10168 + }, + { + "epoch": 3.121240024554942, + "grad_norm": 0.28734859824180603, + "learning_rate": 8.054541756492075e-05, + "loss": 1.8694, + "step": 10169 + }, + { + "epoch": 3.1215469613259668, + "grad_norm": 0.30582788586616516, + "learning_rate": 8.054148221517193e-05, + "loss": 1.856, + "step": 10170 + }, + { + "epoch": 3.121853898096992, + "grad_norm": 0.3128255009651184, + "learning_rate": 8.053754656359429e-05, + "loss": 1.8329, + "step": 10171 + }, + { + "epoch": 3.122160834868017, + "grad_norm": 0.2845318615436554, + "learning_rate": 8.053361061022671e-05, + "loss": 1.8111, + "step": 10172 + }, + { + "epoch": 3.1224677716390423, + "grad_norm": 0.2994609773159027, + "learning_rate": 8.05296743551081e-05, + "loss": 1.8157, + "step": 10173 + }, + { + "epoch": 3.1227747084100677, + "grad_norm": 0.26397961378097534, + "learning_rate": 8.052573779827737e-05, + "loss": 1.8572, + "step": 10174 + }, + { + "epoch": 3.1230816451810925, + "grad_norm": 0.2911500334739685, + "learning_rate": 8.052180093977339e-05, + "loss": 1.8312, + "step": 10175 + }, + { + "epoch": 3.123388581952118, + "grad_norm": 0.33455008268356323, + "learning_rate": 8.051786377963509e-05, + "loss": 1.8748, + "step": 10176 + }, + { + "epoch": 3.123695518723143, + "grad_norm": 0.3127586841583252, + "learning_rate": 8.051392631790135e-05, + "loss": 1.8224, + "step": 10177 + }, + { + "epoch": 3.124002455494168, + "grad_norm": 
0.2910686433315277, + "learning_rate": 8.050998855461113e-05, + "loss": 1.8557, + "step": 10178 + }, + { + "epoch": 3.1243093922651934, + "grad_norm": 0.2849208414554596, + "learning_rate": 8.050605048980333e-05, + "loss": 1.82, + "step": 10179 + }, + { + "epoch": 3.1246163290362183, + "grad_norm": 0.35189691185951233, + "learning_rate": 8.050211212351683e-05, + "loss": 1.7884, + "step": 10180 + }, + { + "epoch": 3.1249232658072437, + "grad_norm": 0.3641110360622406, + "learning_rate": 8.04981734557906e-05, + "loss": 1.7984, + "step": 10181 + }, + { + "epoch": 3.125230202578269, + "grad_norm": 0.3111717700958252, + "learning_rate": 8.049423448666353e-05, + "loss": 1.8134, + "step": 10182 + }, + { + "epoch": 3.125537139349294, + "grad_norm": 0.2608453631401062, + "learning_rate": 8.049029521617457e-05, + "loss": 1.765, + "step": 10183 + }, + { + "epoch": 3.1258440761203192, + "grad_norm": 0.28779423236846924, + "learning_rate": 8.048635564436265e-05, + "loss": 1.8355, + "step": 10184 + }, + { + "epoch": 3.1261510128913446, + "grad_norm": 0.38227665424346924, + "learning_rate": 8.048241577126668e-05, + "loss": 1.8487, + "step": 10185 + }, + { + "epoch": 3.1264579496623695, + "grad_norm": 0.3603171706199646, + "learning_rate": 8.047847559692562e-05, + "loss": 1.8035, + "step": 10186 + }, + { + "epoch": 3.126764886433395, + "grad_norm": 0.21950066089630127, + "learning_rate": 8.04745351213784e-05, + "loss": 1.7399, + "step": 10187 + }, + { + "epoch": 3.12707182320442, + "grad_norm": 0.2796075642108917, + "learning_rate": 8.047059434466395e-05, + "loss": 1.8229, + "step": 10188 + }, + { + "epoch": 3.127378759975445, + "grad_norm": 0.3382907807826996, + "learning_rate": 8.046665326682125e-05, + "loss": 1.7713, + "step": 10189 + }, + { + "epoch": 3.1276856967464703, + "grad_norm": 0.36472463607788086, + "learning_rate": 8.04627118878892e-05, + "loss": 1.8129, + "step": 10190 + }, + { + "epoch": 3.1279926335174952, + "grad_norm": 0.2971884310245514, + "learning_rate": 
8.045877020790679e-05, + "loss": 1.7894, + "step": 10191 + }, + { + "epoch": 3.1282995702885206, + "grad_norm": 0.2292303442955017, + "learning_rate": 8.045482822691297e-05, + "loss": 1.7637, + "step": 10192 + }, + { + "epoch": 3.128606507059546, + "grad_norm": 0.300750732421875, + "learning_rate": 8.045088594494668e-05, + "loss": 1.7678, + "step": 10193 + }, + { + "epoch": 3.128913443830571, + "grad_norm": 0.3121531009674072, + "learning_rate": 8.044694336204688e-05, + "loss": 1.8651, + "step": 10194 + }, + { + "epoch": 3.129220380601596, + "grad_norm": 0.2456093430519104, + "learning_rate": 8.044300047825254e-05, + "loss": 1.7769, + "step": 10195 + }, + { + "epoch": 3.129527317372621, + "grad_norm": 0.25085800886154175, + "learning_rate": 8.043905729360264e-05, + "loss": 1.7723, + "step": 10196 + }, + { + "epoch": 3.1298342541436464, + "grad_norm": 0.2505287826061249, + "learning_rate": 8.043511380813612e-05, + "loss": 1.7943, + "step": 10197 + }, + { + "epoch": 3.1301411909146717, + "grad_norm": 0.27144530415534973, + "learning_rate": 8.043117002189198e-05, + "loss": 1.8119, + "step": 10198 + }, + { + "epoch": 3.1304481276856966, + "grad_norm": 0.2702989876270294, + "learning_rate": 8.042722593490916e-05, + "loss": 1.8517, + "step": 10199 + }, + { + "epoch": 3.130755064456722, + "grad_norm": 0.2585136890411377, + "learning_rate": 8.042328154722667e-05, + "loss": 1.8382, + "step": 10200 + }, + { + "epoch": 3.1310620012277472, + "grad_norm": 0.26306065917015076, + "learning_rate": 8.041933685888348e-05, + "loss": 1.8211, + "step": 10201 + }, + { + "epoch": 3.131368937998772, + "grad_norm": 0.2208927720785141, + "learning_rate": 8.041539186991858e-05, + "loss": 1.7765, + "step": 10202 + }, + { + "epoch": 3.1316758747697975, + "grad_norm": 0.2756440043449402, + "learning_rate": 8.041144658037095e-05, + "loss": 1.898, + "step": 10203 + }, + { + "epoch": 3.131982811540823, + "grad_norm": 0.29718101024627686, + "learning_rate": 8.040750099027958e-05, + "loss": 1.8226, 
+ "step": 10204 + }, + { + "epoch": 3.1322897483118477, + "grad_norm": 0.3166738748550415, + "learning_rate": 8.040355509968345e-05, + "loss": 1.8129, + "step": 10205 + }, + { + "epoch": 3.132596685082873, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.039960890862158e-05, + "loss": 1.8915, + "step": 10206 + }, + { + "epoch": 3.132903621853898, + "grad_norm": 0.3015006184577942, + "learning_rate": 8.039566241713297e-05, + "loss": 1.8389, + "step": 10207 + }, + { + "epoch": 3.1332105586249233, + "grad_norm": 0.35226619243621826, + "learning_rate": 8.039171562525659e-05, + "loss": 1.7287, + "step": 10208 + }, + { + "epoch": 3.1335174953959486, + "grad_norm": 0.4290136694908142, + "learning_rate": 8.038776853303146e-05, + "loss": 1.8768, + "step": 10209 + }, + { + "epoch": 3.1338244321669735, + "grad_norm": 0.2828960418701172, + "learning_rate": 8.03838211404966e-05, + "loss": 1.7552, + "step": 10210 + }, + { + "epoch": 3.134131368937999, + "grad_norm": 0.3781953752040863, + "learning_rate": 8.0379873447691e-05, + "loss": 1.7812, + "step": 10211 + }, + { + "epoch": 3.1344383057090237, + "grad_norm": 0.4282926023006439, + "learning_rate": 8.037592545465371e-05, + "loss": 1.84, + "step": 10212 + }, + { + "epoch": 3.134745242480049, + "grad_norm": 0.2622411251068115, + "learning_rate": 8.03719771614237e-05, + "loss": 1.8114, + "step": 10213 + }, + { + "epoch": 3.1350521792510744, + "grad_norm": 0.34881457686424255, + "learning_rate": 8.036802856804001e-05, + "loss": 1.7694, + "step": 10214 + }, + { + "epoch": 3.1353591160220993, + "grad_norm": 0.40797632932662964, + "learning_rate": 8.036407967454167e-05, + "loss": 1.7595, + "step": 10215 + }, + { + "epoch": 3.1356660527931246, + "grad_norm": 0.24902814626693726, + "learning_rate": 8.036013048096769e-05, + "loss": 1.8068, + "step": 10216 + }, + { + "epoch": 3.13597298956415, + "grad_norm": 0.3682909607887268, + "learning_rate": 8.035618098735711e-05, + "loss": 1.8519, + "step": 10217 + }, + { + "epoch": 
3.136279926335175, + "grad_norm": 0.6111233234405518, + "learning_rate": 8.035223119374895e-05, + "loss": 1.9254, + "step": 10218 + }, + { + "epoch": 3.1365868631062, + "grad_norm": 0.4793062210083008, + "learning_rate": 8.034828110018227e-05, + "loss": 1.786, + "step": 10219 + }, + { + "epoch": 3.1368937998772255, + "grad_norm": 0.3074932396411896, + "learning_rate": 8.034433070669607e-05, + "loss": 1.8495, + "step": 10220 + }, + { + "epoch": 3.1372007366482504, + "grad_norm": 0.4366479218006134, + "learning_rate": 8.034038001332942e-05, + "loss": 1.8501, + "step": 10221 + }, + { + "epoch": 3.1375076734192757, + "grad_norm": 0.4660070538520813, + "learning_rate": 8.033642902012135e-05, + "loss": 1.8317, + "step": 10222 + }, + { + "epoch": 3.1378146101903006, + "grad_norm": 0.3452899158000946, + "learning_rate": 8.03324777271109e-05, + "loss": 1.8702, + "step": 10223 + }, + { + "epoch": 3.138121546961326, + "grad_norm": 0.3658824563026428, + "learning_rate": 8.032852613433713e-05, + "loss": 1.8754, + "step": 10224 + }, + { + "epoch": 3.1384284837323513, + "grad_norm": 0.3777768909931183, + "learning_rate": 8.03245742418391e-05, + "loss": 1.8613, + "step": 10225 + }, + { + "epoch": 3.138735420503376, + "grad_norm": 0.3873192071914673, + "learning_rate": 8.032062204965582e-05, + "loss": 1.8438, + "step": 10226 + }, + { + "epoch": 3.1390423572744015, + "grad_norm": 0.30686715245246887, + "learning_rate": 8.031666955782641e-05, + "loss": 1.811, + "step": 10227 + }, + { + "epoch": 3.139349294045427, + "grad_norm": 0.2738516330718994, + "learning_rate": 8.03127167663899e-05, + "loss": 1.757, + "step": 10228 + }, + { + "epoch": 3.1396562308164517, + "grad_norm": 0.3093133270740509, + "learning_rate": 8.030876367538536e-05, + "loss": 1.8181, + "step": 10229 + }, + { + "epoch": 3.139963167587477, + "grad_norm": 0.3247159719467163, + "learning_rate": 8.030481028485185e-05, + "loss": 1.7798, + "step": 10230 + }, + { + "epoch": 3.140270104358502, + "grad_norm": 
0.2855088412761688, + "learning_rate": 8.030085659482845e-05, + "loss": 1.825, + "step": 10231 + }, + { + "epoch": 3.1405770411295273, + "grad_norm": 0.2818242907524109, + "learning_rate": 8.02969026053542e-05, + "loss": 1.7737, + "step": 10232 + }, + { + "epoch": 3.1408839779005526, + "grad_norm": 0.27074751257896423, + "learning_rate": 8.029294831646822e-05, + "loss": 1.8306, + "step": 10233 + }, + { + "epoch": 3.1411909146715775, + "grad_norm": 0.29740920662879944, + "learning_rate": 8.028899372820954e-05, + "loss": 1.8157, + "step": 10234 + }, + { + "epoch": 3.141497851442603, + "grad_norm": 0.30743202567100525, + "learning_rate": 8.028503884061731e-05, + "loss": 1.7626, + "step": 10235 + }, + { + "epoch": 3.141804788213628, + "grad_norm": 0.27812567353248596, + "learning_rate": 8.028108365373058e-05, + "loss": 1.7604, + "step": 10236 + }, + { + "epoch": 3.142111724984653, + "grad_norm": 0.26212629675865173, + "learning_rate": 8.027712816758839e-05, + "loss": 1.8161, + "step": 10237 + }, + { + "epoch": 3.1424186617556784, + "grad_norm": 0.3611658811569214, + "learning_rate": 8.02731723822299e-05, + "loss": 1.8283, + "step": 10238 + }, + { + "epoch": 3.1427255985267033, + "grad_norm": 0.31705498695373535, + "learning_rate": 8.026921629769418e-05, + "loss": 1.7986, + "step": 10239 + }, + { + "epoch": 3.1430325352977286, + "grad_norm": 0.25905972719192505, + "learning_rate": 8.026525991402032e-05, + "loss": 1.7926, + "step": 10240 + }, + { + "epoch": 3.143339472068754, + "grad_norm": 0.42376595735549927, + "learning_rate": 8.026130323124741e-05, + "loss": 1.8275, + "step": 10241 + }, + { + "epoch": 3.143646408839779, + "grad_norm": 0.415556401014328, + "learning_rate": 8.025734624941458e-05, + "loss": 1.7938, + "step": 10242 + }, + { + "epoch": 3.143953345610804, + "grad_norm": 0.3558904528617859, + "learning_rate": 8.025338896856091e-05, + "loss": 1.836, + "step": 10243 + }, + { + "epoch": 3.1442602823818295, + "grad_norm": 0.3091062307357788, + "learning_rate": 
8.024943138872553e-05, + "loss": 1.8285, + "step": 10244 + }, + { + "epoch": 3.1445672191528544, + "grad_norm": 0.2620905041694641, + "learning_rate": 8.024547350994753e-05, + "loss": 1.7115, + "step": 10245 + }, + { + "epoch": 3.1448741559238798, + "grad_norm": 0.25716835260391235, + "learning_rate": 8.024151533226604e-05, + "loss": 1.7702, + "step": 10246 + }, + { + "epoch": 3.1451810926949046, + "grad_norm": 0.250844269990921, + "learning_rate": 8.023755685572017e-05, + "loss": 1.7617, + "step": 10247 + }, + { + "epoch": 3.14548802946593, + "grad_norm": 0.23898956179618835, + "learning_rate": 8.023359808034903e-05, + "loss": 1.7872, + "step": 10248 + }, + { + "epoch": 3.1457949662369553, + "grad_norm": 0.2335387021303177, + "learning_rate": 8.022963900619176e-05, + "loss": 1.7656, + "step": 10249 + }, + { + "epoch": 3.14610190300798, + "grad_norm": 0.21822704374790192, + "learning_rate": 8.022567963328749e-05, + "loss": 1.7706, + "step": 10250 + }, + { + "epoch": 3.1464088397790055, + "grad_norm": 0.2627898156642914, + "learning_rate": 8.022171996167531e-05, + "loss": 1.8559, + "step": 10251 + }, + { + "epoch": 3.146715776550031, + "grad_norm": 0.2530064582824707, + "learning_rate": 8.021775999139441e-05, + "loss": 1.788, + "step": 10252 + }, + { + "epoch": 3.1470227133210558, + "grad_norm": 0.2293635457754135, + "learning_rate": 8.021379972248387e-05, + "loss": 1.8129, + "step": 10253 + }, + { + "epoch": 3.147329650092081, + "grad_norm": 0.27753588557243347, + "learning_rate": 8.020983915498286e-05, + "loss": 1.7957, + "step": 10254 + }, + { + "epoch": 3.147636586863106, + "grad_norm": 0.24507668614387512, + "learning_rate": 8.020587828893051e-05, + "loss": 1.7969, + "step": 10255 + }, + { + "epoch": 3.1479435236341313, + "grad_norm": 0.24818891286849976, + "learning_rate": 8.020191712436598e-05, + "loss": 1.8412, + "step": 10256 + }, + { + "epoch": 3.1482504604051567, + "grad_norm": 0.2463149130344391, + "learning_rate": 8.01979556613284e-05, + "loss": 1.8097, 
+ "step": 10257 + }, + { + "epoch": 3.1485573971761815, + "grad_norm": 0.26742151379585266, + "learning_rate": 8.019399389985692e-05, + "loss": 1.8487, + "step": 10258 + }, + { + "epoch": 3.148864333947207, + "grad_norm": 0.3078254461288452, + "learning_rate": 8.01900318399907e-05, + "loss": 1.8189, + "step": 10259 + }, + { + "epoch": 3.149171270718232, + "grad_norm": 0.3819321393966675, + "learning_rate": 8.018606948176887e-05, + "loss": 1.8019, + "step": 10260 + }, + { + "epoch": 3.149478207489257, + "grad_norm": 0.3932126462459564, + "learning_rate": 8.018210682523061e-05, + "loss": 1.787, + "step": 10261 + }, + { + "epoch": 3.1497851442602824, + "grad_norm": 0.2696186900138855, + "learning_rate": 8.017814387041511e-05, + "loss": 1.8345, + "step": 10262 + }, + { + "epoch": 3.150092081031308, + "grad_norm": 0.32631832361221313, + "learning_rate": 8.017418061736149e-05, + "loss": 1.7724, + "step": 10263 + }, + { + "epoch": 3.1503990178023327, + "grad_norm": 0.36187833547592163, + "learning_rate": 8.017021706610893e-05, + "loss": 1.7829, + "step": 10264 + }, + { + "epoch": 3.150705954573358, + "grad_norm": 0.29678142070770264, + "learning_rate": 8.01662532166966e-05, + "loss": 1.7896, + "step": 10265 + }, + { + "epoch": 3.151012891344383, + "grad_norm": 0.2997078001499176, + "learning_rate": 8.016228906916368e-05, + "loss": 1.8401, + "step": 10266 + }, + { + "epoch": 3.1513198281154082, + "grad_norm": 0.4688792824745178, + "learning_rate": 8.015832462354933e-05, + "loss": 1.8263, + "step": 10267 + }, + { + "epoch": 3.1516267648864336, + "grad_norm": 0.42710503935813904, + "learning_rate": 8.015435987989275e-05, + "loss": 1.8233, + "step": 10268 + }, + { + "epoch": 3.1519337016574585, + "grad_norm": 0.2490987628698349, + "learning_rate": 8.01503948382331e-05, + "loss": 1.7792, + "step": 10269 + }, + { + "epoch": 3.152240638428484, + "grad_norm": 0.400836706161499, + "learning_rate": 8.014642949860957e-05, + "loss": 1.8113, + "step": 10270 + }, + { + "epoch": 
3.1525475751995087, + "grad_norm": 0.47995972633361816, + "learning_rate": 8.014246386106138e-05, + "loss": 1.8754, + "step": 10271 + }, + { + "epoch": 3.152854511970534, + "grad_norm": 0.39069879055023193, + "learning_rate": 8.013849792562769e-05, + "loss": 1.8541, + "step": 10272 + }, + { + "epoch": 3.1531614487415593, + "grad_norm": 0.27174463868141174, + "learning_rate": 8.013453169234768e-05, + "loss": 1.8018, + "step": 10273 + }, + { + "epoch": 3.1534683855125842, + "grad_norm": 0.37808045744895935, + "learning_rate": 8.013056516126058e-05, + "loss": 1.8346, + "step": 10274 + }, + { + "epoch": 3.1537753222836096, + "grad_norm": 0.43864908814430237, + "learning_rate": 8.012659833240557e-05, + "loss": 1.7626, + "step": 10275 + }, + { + "epoch": 3.154082259054635, + "grad_norm": 0.3592168688774109, + "learning_rate": 8.012263120582187e-05, + "loss": 1.8261, + "step": 10276 + }, + { + "epoch": 3.15438919582566, + "grad_norm": 0.3056562542915344, + "learning_rate": 8.011866378154866e-05, + "loss": 1.903, + "step": 10277 + }, + { + "epoch": 3.154696132596685, + "grad_norm": 0.2898549735546112, + "learning_rate": 8.011469605962517e-05, + "loss": 1.7781, + "step": 10278 + }, + { + "epoch": 3.1550030693677105, + "grad_norm": 0.3498871624469757, + "learning_rate": 8.011072804009059e-05, + "loss": 1.7571, + "step": 10279 + }, + { + "epoch": 3.1553100061387354, + "grad_norm": 0.3330932557582855, + "learning_rate": 8.010675972298416e-05, + "loss": 1.8298, + "step": 10280 + }, + { + "epoch": 3.1556169429097607, + "grad_norm": 0.2540839910507202, + "learning_rate": 8.010279110834507e-05, + "loss": 1.8327, + "step": 10281 + }, + { + "epoch": 3.1559238796807856, + "grad_norm": 0.3557111322879791, + "learning_rate": 8.009882219621257e-05, + "loss": 1.7611, + "step": 10282 + }, + { + "epoch": 3.156230816451811, + "grad_norm": 0.28293952345848083, + "learning_rate": 8.009485298662584e-05, + "loss": 1.7761, + "step": 10283 + }, + { + "epoch": 3.1565377532228363, + "grad_norm": 
0.27089303731918335, + "learning_rate": 8.009088347962416e-05, + "loss": 1.8081, + "step": 10284 + }, + { + "epoch": 3.156844689993861, + "grad_norm": 0.2689332664012909, + "learning_rate": 8.008691367524673e-05, + "loss": 1.7458, + "step": 10285 + }, + { + "epoch": 3.1571516267648865, + "grad_norm": 0.2495841234922409, + "learning_rate": 8.008294357353278e-05, + "loss": 1.8307, + "step": 10286 + }, + { + "epoch": 3.1574585635359114, + "grad_norm": 0.29242852330207825, + "learning_rate": 8.007897317452156e-05, + "loss": 1.9216, + "step": 10287 + }, + { + "epoch": 3.1577655003069367, + "grad_norm": 0.26574134826660156, + "learning_rate": 8.007500247825229e-05, + "loss": 1.8392, + "step": 10288 + }, + { + "epoch": 3.158072437077962, + "grad_norm": 0.2503872811794281, + "learning_rate": 8.00710314847642e-05, + "loss": 1.7742, + "step": 10289 + }, + { + "epoch": 3.158379373848987, + "grad_norm": 0.25614771246910095, + "learning_rate": 8.006706019409658e-05, + "loss": 1.828, + "step": 10290 + }, + { + "epoch": 3.1586863106200123, + "grad_norm": 0.259369820356369, + "learning_rate": 8.006308860628863e-05, + "loss": 1.8328, + "step": 10291 + }, + { + "epoch": 3.1589932473910376, + "grad_norm": 0.28183647990226746, + "learning_rate": 8.005911672137962e-05, + "loss": 1.8269, + "step": 10292 + }, + { + "epoch": 3.1593001841620625, + "grad_norm": 0.2926514446735382, + "learning_rate": 8.005514453940881e-05, + "loss": 1.8334, + "step": 10293 + }, + { + "epoch": 3.159607120933088, + "grad_norm": 0.34313449263572693, + "learning_rate": 8.005117206041543e-05, + "loss": 1.7866, + "step": 10294 + }, + { + "epoch": 3.159914057704113, + "grad_norm": 0.30971628427505493, + "learning_rate": 8.004719928443875e-05, + "loss": 1.7827, + "step": 10295 + }, + { + "epoch": 3.160220994475138, + "grad_norm": 0.23955371975898743, + "learning_rate": 8.004322621151807e-05, + "loss": 1.7619, + "step": 10296 + }, + { + "epoch": 3.1605279312461634, + "grad_norm": 0.31311795115470886, + 
"learning_rate": 8.003925284169261e-05, + "loss": 1.8247, + "step": 10297 + }, + { + "epoch": 3.1608348680171883, + "grad_norm": 0.3408358097076416, + "learning_rate": 8.003527917500163e-05, + "loss": 1.8146, + "step": 10298 + }, + { + "epoch": 3.1611418047882136, + "grad_norm": 0.3030858337879181, + "learning_rate": 8.003130521148442e-05, + "loss": 1.857, + "step": 10299 + }, + { + "epoch": 3.161448741559239, + "grad_norm": 0.25168511271476746, + "learning_rate": 8.002733095118025e-05, + "loss": 1.8404, + "step": 10300 + }, + { + "epoch": 3.161755678330264, + "grad_norm": 0.2956216335296631, + "learning_rate": 8.002335639412839e-05, + "loss": 1.7352, + "step": 10301 + }, + { + "epoch": 3.162062615101289, + "grad_norm": 0.27791857719421387, + "learning_rate": 8.001938154036814e-05, + "loss": 1.7797, + "step": 10302 + }, + { + "epoch": 3.1623695518723145, + "grad_norm": 0.3106420040130615, + "learning_rate": 8.001540638993876e-05, + "loss": 1.8434, + "step": 10303 + }, + { + "epoch": 3.1626764886433394, + "grad_norm": 0.2940445840358734, + "learning_rate": 8.001143094287954e-05, + "loss": 1.8459, + "step": 10304 + }, + { + "epoch": 3.1629834254143647, + "grad_norm": 0.3857429325580597, + "learning_rate": 8.000745519922977e-05, + "loss": 1.7853, + "step": 10305 + }, + { + "epoch": 3.1632903621853896, + "grad_norm": 0.3585071861743927, + "learning_rate": 8.000347915902874e-05, + "loss": 1.8905, + "step": 10306 + }, + { + "epoch": 3.163597298956415, + "grad_norm": 0.320003867149353, + "learning_rate": 7.999950282231574e-05, + "loss": 1.8397, + "step": 10307 + }, + { + "epoch": 3.1639042357274403, + "grad_norm": 0.24986252188682556, + "learning_rate": 7.999552618913009e-05, + "loss": 1.7916, + "step": 10308 + }, + { + "epoch": 3.164211172498465, + "grad_norm": 0.33077237010002136, + "learning_rate": 7.999154925951104e-05, + "loss": 1.8334, + "step": 10309 + }, + { + "epoch": 3.1645181092694905, + "grad_norm": 0.35700327157974243, + "learning_rate": 
7.998757203349794e-05, + "loss": 1.7773, + "step": 10310 + }, + { + "epoch": 3.164825046040516, + "grad_norm": 0.3095493018627167, + "learning_rate": 7.998359451113007e-05, + "loss": 1.8156, + "step": 10311 + }, + { + "epoch": 3.1651319828115407, + "grad_norm": 0.3004748225212097, + "learning_rate": 7.997961669244673e-05, + "loss": 1.7862, + "step": 10312 + }, + { + "epoch": 3.165438919582566, + "grad_norm": 0.39382806420326233, + "learning_rate": 7.99756385774873e-05, + "loss": 1.764, + "step": 10313 + }, + { + "epoch": 3.165745856353591, + "grad_norm": 0.3109463155269623, + "learning_rate": 7.997166016629099e-05, + "loss": 1.8006, + "step": 10314 + }, + { + "epoch": 3.1660527931246163, + "grad_norm": 0.2896469235420227, + "learning_rate": 7.996768145889717e-05, + "loss": 1.8373, + "step": 10315 + }, + { + "epoch": 3.1663597298956416, + "grad_norm": 0.35024940967559814, + "learning_rate": 7.996370245534517e-05, + "loss": 1.797, + "step": 10316 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.3228827714920044, + "learning_rate": 7.995972315567431e-05, + "loss": 1.7757, + "step": 10317 + }, + { + "epoch": 3.166973603437692, + "grad_norm": 0.27102410793304443, + "learning_rate": 7.995574355992388e-05, + "loss": 1.7786, + "step": 10318 + }, + { + "epoch": 3.167280540208717, + "grad_norm": 0.2556116580963135, + "learning_rate": 7.995176366813325e-05, + "loss": 1.7621, + "step": 10319 + }, + { + "epoch": 3.167587476979742, + "grad_norm": 0.28279444575309753, + "learning_rate": 7.994778348034173e-05, + "loss": 1.7954, + "step": 10320 + }, + { + "epoch": 3.1678944137507674, + "grad_norm": 0.31778639554977417, + "learning_rate": 7.994380299658867e-05, + "loss": 1.7657, + "step": 10321 + }, + { + "epoch": 3.1682013505217923, + "grad_norm": 0.27935469150543213, + "learning_rate": 7.993982221691339e-05, + "loss": 1.7502, + "step": 10322 + }, + { + "epoch": 3.1685082872928176, + "grad_norm": 0.29012617468833923, + "learning_rate": 7.993584114135524e-05, + "loss": 
1.8497, + "step": 10323 + }, + { + "epoch": 3.168815224063843, + "grad_norm": 0.2674056887626648, + "learning_rate": 7.993185976995356e-05, + "loss": 1.7875, + "step": 10324 + }, + { + "epoch": 3.169122160834868, + "grad_norm": 0.2667328417301178, + "learning_rate": 7.992787810274771e-05, + "loss": 1.771, + "step": 10325 + }, + { + "epoch": 3.169429097605893, + "grad_norm": 0.25807151198387146, + "learning_rate": 7.992389613977702e-05, + "loss": 1.7638, + "step": 10326 + }, + { + "epoch": 3.1697360343769185, + "grad_norm": 0.2572930157184601, + "learning_rate": 7.991991388108084e-05, + "loss": 1.8218, + "step": 10327 + }, + { + "epoch": 3.1700429711479434, + "grad_norm": 0.3955067992210388, + "learning_rate": 7.991593132669855e-05, + "loss": 1.8458, + "step": 10328 + }, + { + "epoch": 3.1703499079189688, + "grad_norm": 0.2813466489315033, + "learning_rate": 7.991194847666948e-05, + "loss": 1.8042, + "step": 10329 + }, + { + "epoch": 3.1706568446899936, + "grad_norm": 0.2645012140274048, + "learning_rate": 7.990796533103302e-05, + "loss": 1.8241, + "step": 10330 + }, + { + "epoch": 3.170963781461019, + "grad_norm": 0.28462091088294983, + "learning_rate": 7.99039818898285e-05, + "loss": 1.8853, + "step": 10331 + }, + { + "epoch": 3.1712707182320443, + "grad_norm": 0.2727372944355011, + "learning_rate": 7.98999981530953e-05, + "loss": 1.7564, + "step": 10332 + }, + { + "epoch": 3.171577655003069, + "grad_norm": 0.2658170759677887, + "learning_rate": 7.989601412087281e-05, + "loss": 1.8344, + "step": 10333 + }, + { + "epoch": 3.1718845917740945, + "grad_norm": 0.29713502526283264, + "learning_rate": 7.989202979320039e-05, + "loss": 1.8721, + "step": 10334 + }, + { + "epoch": 3.17219152854512, + "grad_norm": 0.26609495282173157, + "learning_rate": 7.98880451701174e-05, + "loss": 1.7991, + "step": 10335 + }, + { + "epoch": 3.1724984653161448, + "grad_norm": 0.29779741168022156, + "learning_rate": 7.988406025166322e-05, + "loss": 1.8182, + "step": 10336 + }, + { + 
"epoch": 3.17280540208717, + "grad_norm": 0.2771340012550354, + "learning_rate": 7.988007503787724e-05, + "loss": 1.8034, + "step": 10337 + }, + { + "epoch": 3.1731123388581954, + "grad_norm": 0.30510422587394714, + "learning_rate": 7.987608952879886e-05, + "loss": 1.8477, + "step": 10338 + }, + { + "epoch": 3.1734192756292203, + "grad_norm": 0.3097476363182068, + "learning_rate": 7.987210372446745e-05, + "loss": 1.7572, + "step": 10339 + }, + { + "epoch": 3.1737262124002457, + "grad_norm": 0.2553942799568176, + "learning_rate": 7.986811762492239e-05, + "loss": 1.7837, + "step": 10340 + }, + { + "epoch": 3.1740331491712706, + "grad_norm": 0.26546719670295715, + "learning_rate": 7.986413123020312e-05, + "loss": 1.7893, + "step": 10341 + }, + { + "epoch": 3.174340085942296, + "grad_norm": 0.37721553444862366, + "learning_rate": 7.986014454034895e-05, + "loss": 1.8475, + "step": 10342 + }, + { + "epoch": 3.174647022713321, + "grad_norm": 0.3215494453907013, + "learning_rate": 7.985615755539937e-05, + "loss": 1.7806, + "step": 10343 + }, + { + "epoch": 3.174953959484346, + "grad_norm": 0.2662442922592163, + "learning_rate": 7.985217027539373e-05, + "loss": 1.8116, + "step": 10344 + }, + { + "epoch": 3.1752608962553714, + "grad_norm": 0.23334236443042755, + "learning_rate": 7.984818270037145e-05, + "loss": 1.7929, + "step": 10345 + }, + { + "epoch": 3.1755678330263963, + "grad_norm": 0.2873367667198181, + "learning_rate": 7.98441948303719e-05, + "loss": 1.7808, + "step": 10346 + }, + { + "epoch": 3.1758747697974217, + "grad_norm": 0.3623826801776886, + "learning_rate": 7.984020666543458e-05, + "loss": 1.8817, + "step": 10347 + }, + { + "epoch": 3.176181706568447, + "grad_norm": 0.3060589134693146, + "learning_rate": 7.983621820559881e-05, + "loss": 1.796, + "step": 10348 + }, + { + "epoch": 3.176488643339472, + "grad_norm": 0.2396882325410843, + "learning_rate": 7.983222945090407e-05, + "loss": 1.7455, + "step": 10349 + }, + { + "epoch": 3.1767955801104972, + 
"grad_norm": 0.24811476469039917, + "learning_rate": 7.982824040138974e-05, + "loss": 1.7907, + "step": 10350 + }, + { + "epoch": 3.1771025168815226, + "grad_norm": 0.32749706506729126, + "learning_rate": 7.982425105709524e-05, + "loss": 1.8553, + "step": 10351 + }, + { + "epoch": 3.1774094536525475, + "grad_norm": 0.3648095726966858, + "learning_rate": 7.982026141806003e-05, + "loss": 1.8387, + "step": 10352 + }, + { + "epoch": 3.177716390423573, + "grad_norm": 0.2749348282814026, + "learning_rate": 7.981627148432352e-05, + "loss": 1.7676, + "step": 10353 + }, + { + "epoch": 3.178023327194598, + "grad_norm": 0.2735142409801483, + "learning_rate": 7.981228125592513e-05, + "loss": 1.822, + "step": 10354 + }, + { + "epoch": 3.178330263965623, + "grad_norm": 0.28759655356407166, + "learning_rate": 7.98082907329043e-05, + "loss": 1.8113, + "step": 10355 + }, + { + "epoch": 3.1786372007366483, + "grad_norm": 0.33661654591560364, + "learning_rate": 7.980429991530048e-05, + "loss": 1.8036, + "step": 10356 + }, + { + "epoch": 3.1789441375076732, + "grad_norm": 0.2634892761707306, + "learning_rate": 7.98003088031531e-05, + "loss": 1.8323, + "step": 10357 + }, + { + "epoch": 3.1792510742786986, + "grad_norm": 0.25864094495773315, + "learning_rate": 7.979631739650158e-05, + "loss": 1.8199, + "step": 10358 + }, + { + "epoch": 3.179558011049724, + "grad_norm": 0.27368444204330444, + "learning_rate": 7.979232569538541e-05, + "loss": 1.7673, + "step": 10359 + }, + { + "epoch": 3.179864947820749, + "grad_norm": 0.2506616413593292, + "learning_rate": 7.9788333699844e-05, + "loss": 1.7912, + "step": 10360 + }, + { + "epoch": 3.180171884591774, + "grad_norm": 0.2539178133010864, + "learning_rate": 7.978434140991684e-05, + "loss": 1.7934, + "step": 10361 + }, + { + "epoch": 3.1804788213627995, + "grad_norm": 0.2605626881122589, + "learning_rate": 7.978034882564334e-05, + "loss": 1.8031, + "step": 10362 + }, + { + "epoch": 3.1807857581338244, + "grad_norm": 0.2610207796096802, + 
"learning_rate": 7.977635594706299e-05, + "loss": 1.8664, + "step": 10363 + }, + { + "epoch": 3.1810926949048497, + "grad_norm": 0.26164132356643677, + "learning_rate": 7.977236277421523e-05, + "loss": 1.7758, + "step": 10364 + }, + { + "epoch": 3.1813996316758746, + "grad_norm": 0.3122340142726898, + "learning_rate": 7.976836930713953e-05, + "loss": 1.9033, + "step": 10365 + }, + { + "epoch": 3.1817065684469, + "grad_norm": 0.3317202031612396, + "learning_rate": 7.976437554587537e-05, + "loss": 1.7899, + "step": 10366 + }, + { + "epoch": 3.1820135052179253, + "grad_norm": 0.28612568974494934, + "learning_rate": 7.97603814904622e-05, + "loss": 1.8145, + "step": 10367 + }, + { + "epoch": 3.18232044198895, + "grad_norm": 0.349917471408844, + "learning_rate": 7.975638714093949e-05, + "loss": 1.877, + "step": 10368 + }, + { + "epoch": 3.1826273787599755, + "grad_norm": 0.3737771809101105, + "learning_rate": 7.975239249734672e-05, + "loss": 1.8204, + "step": 10369 + }, + { + "epoch": 3.182934315531001, + "grad_norm": 0.3688446879386902, + "learning_rate": 7.974839755972339e-05, + "loss": 1.8487, + "step": 10370 + }, + { + "epoch": 3.1832412523020257, + "grad_norm": 0.2934897541999817, + "learning_rate": 7.974440232810894e-05, + "loss": 1.8243, + "step": 10371 + }, + { + "epoch": 3.183548189073051, + "grad_norm": 0.2596173882484436, + "learning_rate": 7.974040680254287e-05, + "loss": 1.7887, + "step": 10372 + }, + { + "epoch": 3.183855125844076, + "grad_norm": 0.35686594247817993, + "learning_rate": 7.973641098306468e-05, + "loss": 1.8653, + "step": 10373 + }, + { + "epoch": 3.1841620626151013, + "grad_norm": 0.3187713921070099, + "learning_rate": 7.973241486971383e-05, + "loss": 1.8767, + "step": 10374 + }, + { + "epoch": 3.1844689993861266, + "grad_norm": 0.2596273124217987, + "learning_rate": 7.972841846252985e-05, + "loss": 1.8028, + "step": 10375 + }, + { + "epoch": 3.1847759361571515, + "grad_norm": 0.2637474834918976, + "learning_rate": 7.972442176155221e-05, + 
"loss": 1.802, + "step": 10376 + }, + { + "epoch": 3.185082872928177, + "grad_norm": 0.2641126215457916, + "learning_rate": 7.97204247668204e-05, + "loss": 1.7931, + "step": 10377 + }, + { + "epoch": 3.185389809699202, + "grad_norm": 0.25594159960746765, + "learning_rate": 7.971642747837393e-05, + "loss": 1.818, + "step": 10378 + }, + { + "epoch": 3.185696746470227, + "grad_norm": 0.26567938923835754, + "learning_rate": 7.971242989625233e-05, + "loss": 1.8174, + "step": 10379 + }, + { + "epoch": 3.1860036832412524, + "grad_norm": 0.29580214619636536, + "learning_rate": 7.970843202049508e-05, + "loss": 1.869, + "step": 10380 + }, + { + "epoch": 3.1863106200122773, + "grad_norm": 0.2657530605792999, + "learning_rate": 7.970443385114168e-05, + "loss": 1.8352, + "step": 10381 + }, + { + "epoch": 3.1866175567833026, + "grad_norm": 0.2468358278274536, + "learning_rate": 7.970043538823165e-05, + "loss": 1.7851, + "step": 10382 + }, + { + "epoch": 3.186924493554328, + "grad_norm": 0.26464715600013733, + "learning_rate": 7.969643663180451e-05, + "loss": 1.8208, + "step": 10383 + }, + { + "epoch": 3.187231430325353, + "grad_norm": 0.26035723090171814, + "learning_rate": 7.969243758189979e-05, + "loss": 1.8089, + "step": 10384 + }, + { + "epoch": 3.187538367096378, + "grad_norm": 0.2644619941711426, + "learning_rate": 7.968843823855699e-05, + "loss": 1.8379, + "step": 10385 + }, + { + "epoch": 3.1878453038674035, + "grad_norm": 0.25576624274253845, + "learning_rate": 7.968443860181565e-05, + "loss": 1.7932, + "step": 10386 + }, + { + "epoch": 3.1881522406384284, + "grad_norm": 0.24276074767112732, + "learning_rate": 7.968043867171528e-05, + "loss": 1.8037, + "step": 10387 + }, + { + "epoch": 3.1884591774094537, + "grad_norm": 0.27156540751457214, + "learning_rate": 7.967643844829543e-05, + "loss": 1.7998, + "step": 10388 + }, + { + "epoch": 3.1887661141804786, + "grad_norm": 0.2555428743362427, + "learning_rate": 7.96724379315956e-05, + "loss": 1.7612, + "step": 10389 + }, + 
{ + "epoch": 3.189073050951504, + "grad_norm": 0.3358438014984131, + "learning_rate": 7.966843712165537e-05, + "loss": 1.8543, + "step": 10390 + }, + { + "epoch": 3.1893799877225293, + "grad_norm": 0.2799586355686188, + "learning_rate": 7.966443601851424e-05, + "loss": 1.819, + "step": 10391 + }, + { + "epoch": 3.189686924493554, + "grad_norm": 0.2364189177751541, + "learning_rate": 7.966043462221178e-05, + "loss": 1.8537, + "step": 10392 + }, + { + "epoch": 3.1899938612645795, + "grad_norm": 0.23849403858184814, + "learning_rate": 7.96564329327875e-05, + "loss": 1.8125, + "step": 10393 + }, + { + "epoch": 3.190300798035605, + "grad_norm": 0.2371583878993988, + "learning_rate": 7.965243095028098e-05, + "loss": 1.7352, + "step": 10394 + }, + { + "epoch": 3.1906077348066297, + "grad_norm": 0.2584737539291382, + "learning_rate": 7.964842867473176e-05, + "loss": 1.8801, + "step": 10395 + }, + { + "epoch": 3.190914671577655, + "grad_norm": 0.27768051624298096, + "learning_rate": 7.964442610617939e-05, + "loss": 1.8221, + "step": 10396 + }, + { + "epoch": 3.1912216083486804, + "grad_norm": 0.2680891752243042, + "learning_rate": 7.964042324466341e-05, + "loss": 1.8371, + "step": 10397 + }, + { + "epoch": 3.1915285451197053, + "grad_norm": 0.25301921367645264, + "learning_rate": 7.963642009022343e-05, + "loss": 1.7972, + "step": 10398 + }, + { + "epoch": 3.1918354818907306, + "grad_norm": 0.2589731216430664, + "learning_rate": 7.963241664289896e-05, + "loss": 1.8145, + "step": 10399 + }, + { + "epoch": 3.1921424186617555, + "grad_norm": 0.2611297369003296, + "learning_rate": 7.962841290272956e-05, + "loss": 1.8736, + "step": 10400 + }, + { + "epoch": 3.192449355432781, + "grad_norm": 0.2812272906303406, + "learning_rate": 7.962440886975483e-05, + "loss": 1.8116, + "step": 10401 + }, + { + "epoch": 3.192756292203806, + "grad_norm": 0.3261657655239105, + "learning_rate": 7.962040454401434e-05, + "loss": 1.7935, + "step": 10402 + }, + { + "epoch": 3.193063228974831, + 
"grad_norm": 0.3355373442173004, + "learning_rate": 7.961639992554764e-05, + "loss": 1.7957, + "step": 10403 + }, + { + "epoch": 3.1933701657458564, + "grad_norm": 0.2811843156814575, + "learning_rate": 7.961239501439432e-05, + "loss": 1.797, + "step": 10404 + }, + { + "epoch": 3.1936771025168813, + "grad_norm": 0.24933238327503204, + "learning_rate": 7.960838981059395e-05, + "loss": 1.7594, + "step": 10405 + }, + { + "epoch": 3.1939840392879066, + "grad_norm": 0.29110121726989746, + "learning_rate": 7.960438431418613e-05, + "loss": 1.8268, + "step": 10406 + }, + { + "epoch": 3.194290976058932, + "grad_norm": 0.3702283799648285, + "learning_rate": 7.960037852521043e-05, + "loss": 1.7629, + "step": 10407 + }, + { + "epoch": 3.194597912829957, + "grad_norm": 0.33275437355041504, + "learning_rate": 7.959637244370644e-05, + "loss": 1.8507, + "step": 10408 + }, + { + "epoch": 3.194904849600982, + "grad_norm": 0.2691981792449951, + "learning_rate": 7.959236606971375e-05, + "loss": 1.8084, + "step": 10409 + }, + { + "epoch": 3.1952117863720075, + "grad_norm": 0.30108413100242615, + "learning_rate": 7.958835940327194e-05, + "loss": 1.8525, + "step": 10410 + }, + { + "epoch": 3.1955187231430324, + "grad_norm": 0.32112306356430054, + "learning_rate": 7.958435244442064e-05, + "loss": 1.7431, + "step": 10411 + }, + { + "epoch": 3.1958256599140578, + "grad_norm": 0.2795291543006897, + "learning_rate": 7.958034519319942e-05, + "loss": 1.7985, + "step": 10412 + }, + { + "epoch": 3.196132596685083, + "grad_norm": 0.2485792338848114, + "learning_rate": 7.957633764964788e-05, + "loss": 1.7363, + "step": 10413 + }, + { + "epoch": 3.196439533456108, + "grad_norm": 0.3552432358264923, + "learning_rate": 7.957232981380565e-05, + "loss": 1.8174, + "step": 10414 + }, + { + "epoch": 3.1967464702271333, + "grad_norm": 0.3829655051231384, + "learning_rate": 7.956832168571234e-05, + "loss": 1.9249, + "step": 10415 + }, + { + "epoch": 3.197053406998158, + "grad_norm": 0.2498074769973755, + 
"learning_rate": 7.956431326540752e-05, + "loss": 1.8104, + "step": 10416 + }, + { + "epoch": 3.1973603437691835, + "grad_norm": 0.24596504867076874, + "learning_rate": 7.956030455293082e-05, + "loss": 1.8007, + "step": 10417 + }, + { + "epoch": 3.197667280540209, + "grad_norm": 0.2795363664627075, + "learning_rate": 7.95562955483219e-05, + "loss": 1.775, + "step": 10418 + }, + { + "epoch": 3.1979742173112338, + "grad_norm": 0.3581138253211975, + "learning_rate": 7.95522862516203e-05, + "loss": 1.8567, + "step": 10419 + }, + { + "epoch": 3.198281154082259, + "grad_norm": 0.36102500557899475, + "learning_rate": 7.95482766628657e-05, + "loss": 1.8509, + "step": 10420 + }, + { + "epoch": 3.198588090853284, + "grad_norm": 0.4717029929161072, + "learning_rate": 7.954426678209774e-05, + "loss": 1.8218, + "step": 10421 + }, + { + "epoch": 3.1988950276243093, + "grad_norm": 0.3211984932422638, + "learning_rate": 7.9540256609356e-05, + "loss": 1.8696, + "step": 10422 + }, + { + "epoch": 3.1992019643953347, + "grad_norm": 0.30094626545906067, + "learning_rate": 7.953624614468011e-05, + "loss": 1.8714, + "step": 10423 + }, + { + "epoch": 3.1995089011663596, + "grad_norm": 0.267578125, + "learning_rate": 7.953223538810976e-05, + "loss": 1.7903, + "step": 10424 + }, + { + "epoch": 3.199815837937385, + "grad_norm": 0.35577845573425293, + "learning_rate": 7.952822433968453e-05, + "loss": 1.7808, + "step": 10425 + }, + { + "epoch": 3.2001227747084102, + "grad_norm": 0.4117741882801056, + "learning_rate": 7.952421299944408e-05, + "loss": 1.7856, + "step": 10426 + }, + { + "epoch": 3.200429711479435, + "grad_norm": 0.35202035307884216, + "learning_rate": 7.952020136742806e-05, + "loss": 1.8112, + "step": 10427 + }, + { + "epoch": 3.2007366482504604, + "grad_norm": 0.26514917612075806, + "learning_rate": 7.951618944367611e-05, + "loss": 1.828, + "step": 10428 + }, + { + "epoch": 3.201043585021486, + "grad_norm": 0.29219159483909607, + "learning_rate": 7.951217722822786e-05, + "loss": 
1.9366, + "step": 10429 + }, + { + "epoch": 3.2013505217925107, + "grad_norm": 0.2929961383342743, + "learning_rate": 7.950816472112298e-05, + "loss": 1.8006, + "step": 10430 + }, + { + "epoch": 3.201657458563536, + "grad_norm": 0.28339722752571106, + "learning_rate": 7.950415192240114e-05, + "loss": 1.7411, + "step": 10431 + }, + { + "epoch": 3.201964395334561, + "grad_norm": 0.258884996175766, + "learning_rate": 7.950013883210196e-05, + "loss": 1.8153, + "step": 10432 + }, + { + "epoch": 3.2022713321055862, + "grad_norm": 0.3065929114818573, + "learning_rate": 7.949612545026512e-05, + "loss": 1.7918, + "step": 10433 + }, + { + "epoch": 3.2025782688766116, + "grad_norm": 0.289874404668808, + "learning_rate": 7.949211177693029e-05, + "loss": 1.7975, + "step": 10434 + }, + { + "epoch": 3.2028852056476365, + "grad_norm": 0.27025631070137024, + "learning_rate": 7.948809781213711e-05, + "loss": 1.8129, + "step": 10435 + }, + { + "epoch": 3.203192142418662, + "grad_norm": 0.2501074969768524, + "learning_rate": 7.948408355592528e-05, + "loss": 1.7653, + "step": 10436 + }, + { + "epoch": 3.203499079189687, + "grad_norm": 0.30402958393096924, + "learning_rate": 7.948006900833445e-05, + "loss": 1.8311, + "step": 10437 + }, + { + "epoch": 3.203806015960712, + "grad_norm": 0.28783223032951355, + "learning_rate": 7.94760541694043e-05, + "loss": 1.82, + "step": 10438 + }, + { + "epoch": 3.2041129527317374, + "grad_norm": 0.30428317189216614, + "learning_rate": 7.947203903917451e-05, + "loss": 1.8673, + "step": 10439 + }, + { + "epoch": 3.2044198895027622, + "grad_norm": 0.2860367000102997, + "learning_rate": 7.946802361768473e-05, + "loss": 1.824, + "step": 10440 + }, + { + "epoch": 3.2047268262737876, + "grad_norm": 0.2995273172855377, + "learning_rate": 7.946400790497469e-05, + "loss": 1.7342, + "step": 10441 + }, + { + "epoch": 3.205033763044813, + "grad_norm": 0.4374088943004608, + "learning_rate": 7.945999190108407e-05, + "loss": 1.8522, + "step": 10442 + }, + { + "epoch": 
3.205340699815838, + "grad_norm": 0.37659478187561035, + "learning_rate": 7.945597560605252e-05, + "loss": 1.7518, + "step": 10443 + }, + { + "epoch": 3.205647636586863, + "grad_norm": 0.24257932603359222, + "learning_rate": 7.945195901991975e-05, + "loss": 1.7892, + "step": 10444 + }, + { + "epoch": 3.2059545733578885, + "grad_norm": 0.3682694435119629, + "learning_rate": 7.944794214272546e-05, + "loss": 1.7757, + "step": 10445 + }, + { + "epoch": 3.2062615101289134, + "grad_norm": 0.434692919254303, + "learning_rate": 7.944392497450936e-05, + "loss": 1.8207, + "step": 10446 + }, + { + "epoch": 3.2065684468999387, + "grad_norm": 0.3982211947441101, + "learning_rate": 7.943990751531113e-05, + "loss": 1.8303, + "step": 10447 + }, + { + "epoch": 3.2068753836709636, + "grad_norm": 0.2877334654331207, + "learning_rate": 7.943588976517049e-05, + "loss": 1.8495, + "step": 10448 + }, + { + "epoch": 3.207182320441989, + "grad_norm": 0.34589654207229614, + "learning_rate": 7.943187172412712e-05, + "loss": 1.7773, + "step": 10449 + }, + { + "epoch": 3.2074892572130143, + "grad_norm": 0.4727517366409302, + "learning_rate": 7.942785339222074e-05, + "loss": 1.8702, + "step": 10450 + }, + { + "epoch": 3.207796193984039, + "grad_norm": 0.4019354581832886, + "learning_rate": 7.942383476949107e-05, + "loss": 1.8095, + "step": 10451 + }, + { + "epoch": 3.2081031307550645, + "grad_norm": 0.2726243734359741, + "learning_rate": 7.941981585597782e-05, + "loss": 1.7273, + "step": 10452 + }, + { + "epoch": 3.20841006752609, + "grad_norm": 0.2944760024547577, + "learning_rate": 7.941579665172072e-05, + "loss": 1.7507, + "step": 10453 + }, + { + "epoch": 3.2087170042971147, + "grad_norm": 0.3530777096748352, + "learning_rate": 7.941177715675945e-05, + "loss": 1.8434, + "step": 10454 + }, + { + "epoch": 3.20902394106814, + "grad_norm": 0.28612539172172546, + "learning_rate": 7.940775737113378e-05, + "loss": 1.8094, + "step": 10455 + }, + { + "epoch": 3.209330877839165, + "grad_norm": 
0.27006468176841736, + "learning_rate": 7.94037372948834e-05, + "loss": 1.7854, + "step": 10456 + }, + { + "epoch": 3.2096378146101903, + "grad_norm": 0.3027147054672241, + "learning_rate": 7.939971692804806e-05, + "loss": 1.7596, + "step": 10457 + }, + { + "epoch": 3.2099447513812156, + "grad_norm": 0.31999528408050537, + "learning_rate": 7.939569627066749e-05, + "loss": 1.8836, + "step": 10458 + }, + { + "epoch": 3.2102516881522405, + "grad_norm": 0.267600417137146, + "learning_rate": 7.939167532278142e-05, + "loss": 1.8508, + "step": 10459 + }, + { + "epoch": 3.210558624923266, + "grad_norm": 0.3171706795692444, + "learning_rate": 7.938765408442958e-05, + "loss": 1.7507, + "step": 10460 + }, + { + "epoch": 3.210865561694291, + "grad_norm": 0.2955280840396881, + "learning_rate": 7.938363255565171e-05, + "loss": 1.733, + "step": 10461 + }, + { + "epoch": 3.211172498465316, + "grad_norm": 0.3427969217300415, + "learning_rate": 7.937961073648759e-05, + "loss": 1.9208, + "step": 10462 + }, + { + "epoch": 3.2114794352363414, + "grad_norm": 0.28788647055625916, + "learning_rate": 7.937558862697692e-05, + "loss": 1.7723, + "step": 10463 + }, + { + "epoch": 3.2117863720073663, + "grad_norm": 0.26093682646751404, + "learning_rate": 7.937156622715945e-05, + "loss": 1.803, + "step": 10464 + }, + { + "epoch": 3.2120933087783916, + "grad_norm": 0.2791301906108856, + "learning_rate": 7.936754353707497e-05, + "loss": 1.7601, + "step": 10465 + }, + { + "epoch": 3.212400245549417, + "grad_norm": 0.3039831519126892, + "learning_rate": 7.93635205567632e-05, + "loss": 1.7864, + "step": 10466 + }, + { + "epoch": 3.212707182320442, + "grad_norm": 0.28498128056526184, + "learning_rate": 7.935949728626392e-05, + "loss": 1.7745, + "step": 10467 + }, + { + "epoch": 3.213014119091467, + "grad_norm": 0.2908780872821808, + "learning_rate": 7.935547372561687e-05, + "loss": 1.8281, + "step": 10468 + }, + { + "epoch": 3.2133210558624925, + "grad_norm": 0.26148509979248047, + "learning_rate": 
7.935144987486183e-05, + "loss": 1.8545, + "step": 10469 + }, + { + "epoch": 3.2136279926335174, + "grad_norm": 0.2853962481021881, + "learning_rate": 7.934742573403856e-05, + "loss": 1.7765, + "step": 10470 + }, + { + "epoch": 3.2139349294045427, + "grad_norm": 0.26497501134872437, + "learning_rate": 7.934340130318681e-05, + "loss": 1.7472, + "step": 10471 + }, + { + "epoch": 3.214241866175568, + "grad_norm": 0.2806912660598755, + "learning_rate": 7.933937658234638e-05, + "loss": 1.7879, + "step": 10472 + }, + { + "epoch": 3.214548802946593, + "grad_norm": 0.2699974477291107, + "learning_rate": 7.933535157155705e-05, + "loss": 1.7539, + "step": 10473 + }, + { + "epoch": 3.2148557397176183, + "grad_norm": 0.22714731097221375, + "learning_rate": 7.933132627085856e-05, + "loss": 1.7861, + "step": 10474 + }, + { + "epoch": 3.215162676488643, + "grad_norm": 0.291340708732605, + "learning_rate": 7.932730068029072e-05, + "loss": 1.8381, + "step": 10475 + }, + { + "epoch": 3.2154696132596685, + "grad_norm": 0.3257324695587158, + "learning_rate": 7.93232747998933e-05, + "loss": 1.8293, + "step": 10476 + }, + { + "epoch": 3.215776550030694, + "grad_norm": 0.3518911600112915, + "learning_rate": 7.93192486297061e-05, + "loss": 1.853, + "step": 10477 + }, + { + "epoch": 3.2160834868017187, + "grad_norm": 0.27663540840148926, + "learning_rate": 7.93152221697689e-05, + "loss": 1.7831, + "step": 10478 + }, + { + "epoch": 3.216390423572744, + "grad_norm": 0.3153248429298401, + "learning_rate": 7.931119542012149e-05, + "loss": 1.7443, + "step": 10479 + }, + { + "epoch": 3.216697360343769, + "grad_norm": 0.2919597029685974, + "learning_rate": 7.930716838080368e-05, + "loss": 1.8108, + "step": 10480 + }, + { + "epoch": 3.2170042971147943, + "grad_norm": 0.26892516016960144, + "learning_rate": 7.930314105185524e-05, + "loss": 1.7791, + "step": 10481 + }, + { + "epoch": 3.2173112338858196, + "grad_norm": 0.2486005276441574, + "learning_rate": 7.929911343331599e-05, + "loss": 1.8184, + 
"step": 10482 + }, + { + "epoch": 3.2176181706568445, + "grad_norm": 0.260728120803833, + "learning_rate": 7.929508552522571e-05, + "loss": 1.7933, + "step": 10483 + }, + { + "epoch": 3.21792510742787, + "grad_norm": 0.3081948757171631, + "learning_rate": 7.929105732762425e-05, + "loss": 1.7732, + "step": 10484 + }, + { + "epoch": 3.218232044198895, + "grad_norm": 0.3807671368122101, + "learning_rate": 7.928702884055138e-05, + "loss": 1.7652, + "step": 10485 + }, + { + "epoch": 3.21853898096992, + "grad_norm": 0.31637755036354065, + "learning_rate": 7.928300006404692e-05, + "loss": 1.7605, + "step": 10486 + }, + { + "epoch": 3.2188459177409454, + "grad_norm": 0.2812853455543518, + "learning_rate": 7.927897099815071e-05, + "loss": 1.7925, + "step": 10487 + }, + { + "epoch": 3.2191528545119708, + "grad_norm": 0.3472350239753723, + "learning_rate": 7.927494164290253e-05, + "loss": 1.8252, + "step": 10488 + }, + { + "epoch": 3.2194597912829956, + "grad_norm": 0.4202714264392853, + "learning_rate": 7.927091199834222e-05, + "loss": 1.7993, + "step": 10489 + }, + { + "epoch": 3.219766728054021, + "grad_norm": 0.44552353024482727, + "learning_rate": 7.92668820645096e-05, + "loss": 1.8609, + "step": 10490 + }, + { + "epoch": 3.220073664825046, + "grad_norm": 0.38964664936065674, + "learning_rate": 7.926285184144451e-05, + "loss": 1.864, + "step": 10491 + }, + { + "epoch": 3.220380601596071, + "grad_norm": 0.2978462278842926, + "learning_rate": 7.925882132918676e-05, + "loss": 1.7892, + "step": 10492 + }, + { + "epoch": 3.2206875383670965, + "grad_norm": 0.2520316243171692, + "learning_rate": 7.925479052777619e-05, + "loss": 1.7702, + "step": 10493 + }, + { + "epoch": 3.2209944751381214, + "grad_norm": 0.28151068091392517, + "learning_rate": 7.925075943725263e-05, + "loss": 1.7613, + "step": 10494 + }, + { + "epoch": 3.2213014119091468, + "grad_norm": 0.3346099555492401, + "learning_rate": 7.924672805765592e-05, + "loss": 1.894, + "step": 10495 + }, + { + "epoch": 
3.2216083486801717, + "grad_norm": 0.2981362044811249, + "learning_rate": 7.924269638902591e-05, + "loss": 1.8157, + "step": 10496 + }, + { + "epoch": 3.221915285451197, + "grad_norm": 0.2561499774456024, + "learning_rate": 7.923866443140242e-05, + "loss": 1.8259, + "step": 10497 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.26480481028556824, + "learning_rate": 7.923463218482532e-05, + "loss": 1.7856, + "step": 10498 + }, + { + "epoch": 3.222529158993247, + "grad_norm": 0.24103692173957825, + "learning_rate": 7.923059964933446e-05, + "loss": 1.7765, + "step": 10499 + }, + { + "epoch": 3.2228360957642725, + "grad_norm": 0.2399173080921173, + "learning_rate": 7.922656682496967e-05, + "loss": 1.8216, + "step": 10500 + }, + { + "epoch": 3.223143032535298, + "grad_norm": 0.24530018866062164, + "learning_rate": 7.922253371177082e-05, + "loss": 1.8155, + "step": 10501 + }, + { + "epoch": 3.2234499693063228, + "grad_norm": 0.23298653960227966, + "learning_rate": 7.921850030977775e-05, + "loss": 1.7843, + "step": 10502 + }, + { + "epoch": 3.223756906077348, + "grad_norm": 0.3053973317146301, + "learning_rate": 7.921446661903035e-05, + "loss": 1.8113, + "step": 10503 + }, + { + "epoch": 3.2240638428483734, + "grad_norm": 0.261336088180542, + "learning_rate": 7.921043263956847e-05, + "loss": 1.8073, + "step": 10504 + }, + { + "epoch": 3.2243707796193983, + "grad_norm": 0.24877268075942993, + "learning_rate": 7.920639837143195e-05, + "loss": 1.8344, + "step": 10505 + }, + { + "epoch": 3.2246777163904237, + "grad_norm": 0.26784422993659973, + "learning_rate": 7.920236381466071e-05, + "loss": 1.7757, + "step": 10506 + }, + { + "epoch": 3.2249846531614486, + "grad_norm": 0.2672121226787567, + "learning_rate": 7.919832896929458e-05, + "loss": 1.8384, + "step": 10507 + }, + { + "epoch": 3.225291589932474, + "grad_norm": 0.27254921197891235, + "learning_rate": 7.919429383537346e-05, + "loss": 1.8056, + "step": 10508 + }, + { + "epoch": 3.2255985267034992, + "grad_norm": 
0.24467822909355164, + "learning_rate": 7.91902584129372e-05, + "loss": 1.8109, + "step": 10509 + }, + { + "epoch": 3.225905463474524, + "grad_norm": 0.25966358184814453, + "learning_rate": 7.918622270202571e-05, + "loss": 1.82, + "step": 10510 + }, + { + "epoch": 3.2262124002455494, + "grad_norm": 0.28601330518722534, + "learning_rate": 7.918218670267886e-05, + "loss": 1.7266, + "step": 10511 + }, + { + "epoch": 3.226519337016575, + "grad_norm": 0.4017516076564789, + "learning_rate": 7.917815041493653e-05, + "loss": 1.8408, + "step": 10512 + }, + { + "epoch": 3.2268262737875997, + "grad_norm": 0.3995787501335144, + "learning_rate": 7.917411383883862e-05, + "loss": 1.8441, + "step": 10513 + }, + { + "epoch": 3.227133210558625, + "grad_norm": 0.26997458934783936, + "learning_rate": 7.917007697442502e-05, + "loss": 1.8078, + "step": 10514 + }, + { + "epoch": 3.22744014732965, + "grad_norm": 0.34353014826774597, + "learning_rate": 7.916603982173562e-05, + "loss": 1.7523, + "step": 10515 + }, + { + "epoch": 3.2277470841006752, + "grad_norm": 0.39522337913513184, + "learning_rate": 7.916200238081032e-05, + "loss": 1.7532, + "step": 10516 + }, + { + "epoch": 3.2280540208717006, + "grad_norm": 0.4176923334598541, + "learning_rate": 7.915796465168903e-05, + "loss": 1.8895, + "step": 10517 + }, + { + "epoch": 3.2283609576427255, + "grad_norm": 0.30232906341552734, + "learning_rate": 7.915392663441164e-05, + "loss": 1.8223, + "step": 10518 + }, + { + "epoch": 3.228667894413751, + "grad_norm": 0.230951726436615, + "learning_rate": 7.914988832901805e-05, + "loss": 1.7265, + "step": 10519 + }, + { + "epoch": 3.228974831184776, + "grad_norm": 0.26381877064704895, + "learning_rate": 7.914584973554819e-05, + "loss": 1.7858, + "step": 10520 + }, + { + "epoch": 3.229281767955801, + "grad_norm": 0.2500905394554138, + "learning_rate": 7.914181085404194e-05, + "loss": 1.7606, + "step": 10521 + }, + { + "epoch": 3.2295887047268264, + "grad_norm": 0.2585415840148926, + "learning_rate": 
7.913777168453925e-05, + "loss": 1.787, + "step": 10522 + }, + { + "epoch": 3.2298956414978512, + "grad_norm": 0.24236604571342468, + "learning_rate": 7.913373222708001e-05, + "loss": 1.7623, + "step": 10523 + }, + { + "epoch": 3.2302025782688766, + "grad_norm": 0.3113093078136444, + "learning_rate": 7.912969248170416e-05, + "loss": 1.7736, + "step": 10524 + }, + { + "epoch": 3.230509515039902, + "grad_norm": 0.3341342806816101, + "learning_rate": 7.912565244845163e-05, + "loss": 1.8583, + "step": 10525 + }, + { + "epoch": 3.230816451810927, + "grad_norm": 0.2644478678703308, + "learning_rate": 7.912161212736231e-05, + "loss": 1.7891, + "step": 10526 + }, + { + "epoch": 3.231123388581952, + "grad_norm": 0.22916561365127563, + "learning_rate": 7.911757151847616e-05, + "loss": 1.7642, + "step": 10527 + }, + { + "epoch": 3.2314303253529775, + "grad_norm": 0.24204877018928528, + "learning_rate": 7.911353062183309e-05, + "loss": 1.8522, + "step": 10528 + }, + { + "epoch": 3.2317372621240024, + "grad_norm": 0.25339365005493164, + "learning_rate": 7.910948943747307e-05, + "loss": 1.8391, + "step": 10529 + }, + { + "epoch": 3.2320441988950277, + "grad_norm": 0.2652709186077118, + "learning_rate": 7.9105447965436e-05, + "loss": 1.7735, + "step": 10530 + }, + { + "epoch": 3.2323511356660526, + "grad_norm": 0.2711019217967987, + "learning_rate": 7.910140620576183e-05, + "loss": 1.8491, + "step": 10531 + }, + { + "epoch": 3.232658072437078, + "grad_norm": 0.2598389685153961, + "learning_rate": 7.909736415849052e-05, + "loss": 1.8417, + "step": 10532 + }, + { + "epoch": 3.2329650092081033, + "grad_norm": 0.278037428855896, + "learning_rate": 7.9093321823662e-05, + "loss": 1.8774, + "step": 10533 + }, + { + "epoch": 3.233271945979128, + "grad_norm": 0.32015568017959595, + "learning_rate": 7.90892792013162e-05, + "loss": 1.8873, + "step": 10534 + }, + { + "epoch": 3.2335788827501535, + "grad_norm": 0.3098098635673523, + "learning_rate": 7.908523629149312e-05, + "loss": 1.8141, + 
"step": 10535 + }, + { + "epoch": 3.233885819521179, + "grad_norm": 0.3127266764640808, + "learning_rate": 7.908119309423267e-05, + "loss": 1.8587, + "step": 10536 + }, + { + "epoch": 3.2341927562922037, + "grad_norm": 0.3085545301437378, + "learning_rate": 7.907714960957483e-05, + "loss": 1.8544, + "step": 10537 + }, + { + "epoch": 3.234499693063229, + "grad_norm": 0.3051004409790039, + "learning_rate": 7.907310583755956e-05, + "loss": 1.8144, + "step": 10538 + }, + { + "epoch": 3.234806629834254, + "grad_norm": 0.3458186686038971, + "learning_rate": 7.906906177822682e-05, + "loss": 1.8388, + "step": 10539 + }, + { + "epoch": 3.2351135666052793, + "grad_norm": 0.37064439058303833, + "learning_rate": 7.906501743161656e-05, + "loss": 1.7574, + "step": 10540 + }, + { + "epoch": 3.2354205033763046, + "grad_norm": 0.3382316827774048, + "learning_rate": 7.906097279776876e-05, + "loss": 1.8785, + "step": 10541 + }, + { + "epoch": 3.2357274401473295, + "grad_norm": 0.254802942276001, + "learning_rate": 7.905692787672341e-05, + "loss": 1.8276, + "step": 10542 + }, + { + "epoch": 3.236034376918355, + "grad_norm": 0.3362341523170471, + "learning_rate": 7.905288266852047e-05, + "loss": 1.8057, + "step": 10543 + }, + { + "epoch": 3.23634131368938, + "grad_norm": 0.38821661472320557, + "learning_rate": 7.904883717319988e-05, + "loss": 1.7841, + "step": 10544 + }, + { + "epoch": 3.236648250460405, + "grad_norm": 0.33889076113700867, + "learning_rate": 7.90447913908017e-05, + "loss": 1.7892, + "step": 10545 + }, + { + "epoch": 3.2369551872314304, + "grad_norm": 0.2741014361381531, + "learning_rate": 7.904074532136585e-05, + "loss": 1.7611, + "step": 10546 + }, + { + "epoch": 3.2372621240024557, + "grad_norm": 0.28950995206832886, + "learning_rate": 7.903669896493233e-05, + "loss": 1.7963, + "step": 10547 + }, + { + "epoch": 3.2375690607734806, + "grad_norm": 0.30647143721580505, + "learning_rate": 7.903265232154113e-05, + "loss": 1.7522, + "step": 10548 + }, + { + "epoch": 
3.237875997544506, + "grad_norm": 0.30428263545036316, + "learning_rate": 7.902860539123225e-05, + "loss": 1.7383, + "step": 10549 + }, + { + "epoch": 3.238182934315531, + "grad_norm": 0.2357146292924881, + "learning_rate": 7.902455817404569e-05, + "loss": 1.7243, + "step": 10550 + }, + { + "epoch": 3.238489871086556, + "grad_norm": 0.3125104606151581, + "learning_rate": 7.90205106700214e-05, + "loss": 1.8542, + "step": 10551 + }, + { + "epoch": 3.2387968078575815, + "grad_norm": 0.25797244906425476, + "learning_rate": 7.901646287919944e-05, + "loss": 1.8374, + "step": 10552 + }, + { + "epoch": 3.2391037446286064, + "grad_norm": 0.3127591907978058, + "learning_rate": 7.901241480161978e-05, + "loss": 1.9457, + "step": 10553 + }, + { + "epoch": 3.2394106813996317, + "grad_norm": 0.2971835434436798, + "learning_rate": 7.900836643732243e-05, + "loss": 1.7933, + "step": 10554 + }, + { + "epoch": 3.2397176181706566, + "grad_norm": 0.28931814432144165, + "learning_rate": 7.90043177863474e-05, + "loss": 1.8201, + "step": 10555 + }, + { + "epoch": 3.240024554941682, + "grad_norm": 0.3348724842071533, + "learning_rate": 7.90002688487347e-05, + "loss": 1.8718, + "step": 10556 + }, + { + "epoch": 3.2403314917127073, + "grad_norm": 0.28566426038742065, + "learning_rate": 7.899621962452436e-05, + "loss": 1.805, + "step": 10557 + }, + { + "epoch": 3.240638428483732, + "grad_norm": 0.27074119448661804, + "learning_rate": 7.899217011375637e-05, + "loss": 1.842, + "step": 10558 + }, + { + "epoch": 3.2409453652547575, + "grad_norm": 0.27014291286468506, + "learning_rate": 7.898812031647076e-05, + "loss": 1.8156, + "step": 10559 + }, + { + "epoch": 3.241252302025783, + "grad_norm": 0.28087863326072693, + "learning_rate": 7.898407023270756e-05, + "loss": 1.8399, + "step": 10560 + }, + { + "epoch": 3.2415592387968077, + "grad_norm": 0.2641037404537201, + "learning_rate": 7.898001986250679e-05, + "loss": 1.7977, + "step": 10561 + }, + { + "epoch": 3.241866175567833, + "grad_norm": 
0.2843858301639557, + "learning_rate": 7.897596920590848e-05, + "loss": 1.834, + "step": 10562 + }, + { + "epoch": 3.2421731123388584, + "grad_norm": 0.2724611163139343, + "learning_rate": 7.897191826295266e-05, + "loss": 1.7547, + "step": 10563 + }, + { + "epoch": 3.2424800491098833, + "grad_norm": 0.2583858370780945, + "learning_rate": 7.896786703367935e-05, + "loss": 1.7658, + "step": 10564 + }, + { + "epoch": 3.2427869858809086, + "grad_norm": 0.2666650712490082, + "learning_rate": 7.896381551812861e-05, + "loss": 1.8017, + "step": 10565 + }, + { + "epoch": 3.2430939226519335, + "grad_norm": 0.23269347846508026, + "learning_rate": 7.895976371634047e-05, + "loss": 1.8267, + "step": 10566 + }, + { + "epoch": 3.243400859422959, + "grad_norm": 0.27865225076675415, + "learning_rate": 7.895571162835496e-05, + "loss": 1.8093, + "step": 10567 + }, + { + "epoch": 3.243707796193984, + "grad_norm": 0.29445022344589233, + "learning_rate": 7.895165925421216e-05, + "loss": 1.7999, + "step": 10568 + }, + { + "epoch": 3.244014732965009, + "grad_norm": 0.32135528326034546, + "learning_rate": 7.894760659395206e-05, + "loss": 1.8405, + "step": 10569 + }, + { + "epoch": 3.2443216697360344, + "grad_norm": 0.3409091532230377, + "learning_rate": 7.894355364761477e-05, + "loss": 1.7861, + "step": 10570 + }, + { + "epoch": 3.2446286065070598, + "grad_norm": 0.3379025459289551, + "learning_rate": 7.893950041524032e-05, + "loss": 1.8495, + "step": 10571 + }, + { + "epoch": 3.2449355432780846, + "grad_norm": 0.2843063473701477, + "learning_rate": 7.893544689686874e-05, + "loss": 1.7888, + "step": 10572 + }, + { + "epoch": 3.24524248004911, + "grad_norm": 0.2914074957370758, + "learning_rate": 7.893139309254013e-05, + "loss": 1.7866, + "step": 10573 + }, + { + "epoch": 3.245549416820135, + "grad_norm": 0.39855021238327026, + "learning_rate": 7.892733900229454e-05, + "loss": 1.7865, + "step": 10574 + }, + { + "epoch": 3.24585635359116, + "grad_norm": 0.4232102632522583, + "learning_rate": 
7.892328462617203e-05, + "loss": 1.8443, + "step": 10575 + }, + { + "epoch": 3.2461632903621855, + "grad_norm": 0.390794962644577, + "learning_rate": 7.891922996421267e-05, + "loss": 1.8735, + "step": 10576 + }, + { + "epoch": 3.2464702271332104, + "grad_norm": 0.3051595687866211, + "learning_rate": 7.891517501645653e-05, + "loss": 1.8654, + "step": 10577 + }, + { + "epoch": 3.2467771639042358, + "grad_norm": 0.25363096594810486, + "learning_rate": 7.891111978294367e-05, + "loss": 1.7602, + "step": 10578 + }, + { + "epoch": 3.247084100675261, + "grad_norm": 0.29785794019699097, + "learning_rate": 7.890706426371419e-05, + "loss": 1.8242, + "step": 10579 + }, + { + "epoch": 3.247391037446286, + "grad_norm": 0.346162885427475, + "learning_rate": 7.890300845880816e-05, + "loss": 1.8551, + "step": 10580 + }, + { + "epoch": 3.2476979742173113, + "grad_norm": 0.33906155824661255, + "learning_rate": 7.889895236826566e-05, + "loss": 1.765, + "step": 10581 + }, + { + "epoch": 3.248004910988336, + "grad_norm": 0.26083165407180786, + "learning_rate": 7.889489599212676e-05, + "loss": 1.8246, + "step": 10582 + }, + { + "epoch": 3.2483118477593615, + "grad_norm": 0.3042019009590149, + "learning_rate": 7.889083933043157e-05, + "loss": 1.9017, + "step": 10583 + }, + { + "epoch": 3.248618784530387, + "grad_norm": 0.34833577275276184, + "learning_rate": 7.888678238322018e-05, + "loss": 1.7863, + "step": 10584 + }, + { + "epoch": 3.2489257213014118, + "grad_norm": 0.34436655044555664, + "learning_rate": 7.888272515053267e-05, + "loss": 1.7937, + "step": 10585 + }, + { + "epoch": 3.249232658072437, + "grad_norm": 0.2550172507762909, + "learning_rate": 7.887866763240914e-05, + "loss": 1.7615, + "step": 10586 + }, + { + "epoch": 3.2495395948434624, + "grad_norm": 0.3334405720233917, + "learning_rate": 7.88746098288897e-05, + "loss": 1.7465, + "step": 10587 + }, + { + "epoch": 3.2498465316144873, + "grad_norm": 0.4668157696723938, + "learning_rate": 7.887055174001443e-05, + "loss": 
1.7836, + "step": 10588 + }, + { + "epoch": 3.2501534683855127, + "grad_norm": 0.524680495262146, + "learning_rate": 7.886649336582344e-05, + "loss": 1.844, + "step": 10589 + }, + { + "epoch": 3.250460405156538, + "grad_norm": 0.36859074234962463, + "learning_rate": 7.886243470635685e-05, + "loss": 1.8072, + "step": 10590 + }, + { + "epoch": 3.250767341927563, + "grad_norm": 0.32370296120643616, + "learning_rate": 7.885837576165478e-05, + "loss": 1.802, + "step": 10591 + }, + { + "epoch": 3.2510742786985882, + "grad_norm": 0.3506374955177307, + "learning_rate": 7.88543165317573e-05, + "loss": 1.7965, + "step": 10592 + }, + { + "epoch": 3.251381215469613, + "grad_norm": 0.39058688282966614, + "learning_rate": 7.885025701670457e-05, + "loss": 1.7987, + "step": 10593 + }, + { + "epoch": 3.2516881522406385, + "grad_norm": 0.3042154014110565, + "learning_rate": 7.884619721653669e-05, + "loss": 1.8345, + "step": 10594 + }, + { + "epoch": 3.251995089011664, + "grad_norm": 0.2249498963356018, + "learning_rate": 7.884213713129378e-05, + "loss": 1.7796, + "step": 10595 + }, + { + "epoch": 3.2523020257826887, + "grad_norm": 0.2701997458934784, + "learning_rate": 7.883807676101595e-05, + "loss": 1.8027, + "step": 10596 + }, + { + "epoch": 3.252608962553714, + "grad_norm": 0.2574785053730011, + "learning_rate": 7.883401610574336e-05, + "loss": 1.7878, + "step": 10597 + }, + { + "epoch": 3.252915899324739, + "grad_norm": 0.24964739382266998, + "learning_rate": 7.882995516551613e-05, + "loss": 1.7612, + "step": 10598 + }, + { + "epoch": 3.2532228360957642, + "grad_norm": 0.2519865930080414, + "learning_rate": 7.882589394037437e-05, + "loss": 1.7583, + "step": 10599 + }, + { + "epoch": 3.2535297728667896, + "grad_norm": 0.23174463212490082, + "learning_rate": 7.882183243035823e-05, + "loss": 1.7607, + "step": 10600 + }, + { + "epoch": 3.2538367096378145, + "grad_norm": 0.28103554248809814, + "learning_rate": 7.881777063550786e-05, + "loss": 1.904, + "step": 10601 + }, + { + 
"epoch": 3.25414364640884, + "grad_norm": 0.265677809715271, + "learning_rate": 7.881370855586339e-05, + "loss": 1.8169, + "step": 10602 + }, + { + "epoch": 3.254450583179865, + "grad_norm": 0.2539603114128113, + "learning_rate": 7.880964619146493e-05, + "loss": 1.8439, + "step": 10603 + }, + { + "epoch": 3.25475751995089, + "grad_norm": 0.2741886377334595, + "learning_rate": 7.88055835423527e-05, + "loss": 1.8737, + "step": 10604 + }, + { + "epoch": 3.2550644567219154, + "grad_norm": 0.27548348903656006, + "learning_rate": 7.88015206085668e-05, + "loss": 1.8385, + "step": 10605 + }, + { + "epoch": 3.2553713934929407, + "grad_norm": 0.2958502769470215, + "learning_rate": 7.879745739014739e-05, + "loss": 1.8603, + "step": 10606 + }, + { + "epoch": 3.2556783302639656, + "grad_norm": 0.2728644907474518, + "learning_rate": 7.879339388713462e-05, + "loss": 1.8, + "step": 10607 + }, + { + "epoch": 3.255985267034991, + "grad_norm": 0.28718289732933044, + "learning_rate": 7.878933009956866e-05, + "loss": 1.7803, + "step": 10608 + }, + { + "epoch": 3.256292203806016, + "grad_norm": 0.2989691197872162, + "learning_rate": 7.878526602748967e-05, + "loss": 1.8155, + "step": 10609 + }, + { + "epoch": 3.256599140577041, + "grad_norm": 0.24515527486801147, + "learning_rate": 7.87812016709378e-05, + "loss": 1.7623, + "step": 10610 + }, + { + "epoch": 3.2569060773480665, + "grad_norm": 0.29946041107177734, + "learning_rate": 7.877713702995324e-05, + "loss": 1.8097, + "step": 10611 + }, + { + "epoch": 3.2572130141190914, + "grad_norm": 0.2854483723640442, + "learning_rate": 7.877307210457613e-05, + "loss": 1.8088, + "step": 10612 + }, + { + "epoch": 3.2575199508901167, + "grad_norm": 0.27812930941581726, + "learning_rate": 7.876900689484668e-05, + "loss": 1.8151, + "step": 10613 + }, + { + "epoch": 3.2578268876611416, + "grad_norm": 0.2658015787601471, + "learning_rate": 7.876494140080503e-05, + "loss": 1.8314, + "step": 10614 + }, + { + "epoch": 3.258133824432167, + "grad_norm": 
0.28935661911964417, + "learning_rate": 7.876087562249137e-05, + "loss": 1.7948, + "step": 10615 + }, + { + "epoch": 3.2584407612031923, + "grad_norm": 0.27497121691703796, + "learning_rate": 7.875680955994587e-05, + "loss": 1.7964, + "step": 10616 + }, + { + "epoch": 3.258747697974217, + "grad_norm": 0.3313405513763428, + "learning_rate": 7.875274321320873e-05, + "loss": 1.8143, + "step": 10617 + }, + { + "epoch": 3.2590546347452425, + "grad_norm": 0.3217218816280365, + "learning_rate": 7.874867658232013e-05, + "loss": 1.7749, + "step": 10618 + }, + { + "epoch": 3.259361571516268, + "grad_norm": 0.25105544924736023, + "learning_rate": 7.874460966732025e-05, + "loss": 1.7834, + "step": 10619 + }, + { + "epoch": 3.2596685082872927, + "grad_norm": 0.2931382358074188, + "learning_rate": 7.874054246824931e-05, + "loss": 1.8252, + "step": 10620 + }, + { + "epoch": 3.259975445058318, + "grad_norm": 0.2803363502025604, + "learning_rate": 7.873647498514747e-05, + "loss": 1.7527, + "step": 10621 + }, + { + "epoch": 3.2602823818293434, + "grad_norm": 0.29857927560806274, + "learning_rate": 7.873240721805492e-05, + "loss": 1.8085, + "step": 10622 + }, + { + "epoch": 3.2605893186003683, + "grad_norm": 0.24864110350608826, + "learning_rate": 7.872833916701192e-05, + "loss": 1.7509, + "step": 10623 + }, + { + "epoch": 3.2608962553713936, + "grad_norm": 0.24105949699878693, + "learning_rate": 7.872427083205862e-05, + "loss": 1.7871, + "step": 10624 + }, + { + "epoch": 3.2612031921424185, + "grad_norm": 0.2429245114326477, + "learning_rate": 7.872020221323523e-05, + "loss": 1.777, + "step": 10625 + }, + { + "epoch": 3.261510128913444, + "grad_norm": 0.234287828207016, + "learning_rate": 7.871613331058197e-05, + "loss": 1.8001, + "step": 10626 + }, + { + "epoch": 3.261817065684469, + "grad_norm": 0.3463406264781952, + "learning_rate": 7.871206412413905e-05, + "loss": 1.8925, + "step": 10627 + }, + { + "epoch": 3.262124002455494, + "grad_norm": 0.26798921823501587, + 
"learning_rate": 7.87079946539467e-05, + "loss": 1.7963, + "step": 10628 + }, + { + "epoch": 3.2624309392265194, + "grad_norm": 0.28603312373161316, + "learning_rate": 7.87039249000451e-05, + "loss": 1.8308, + "step": 10629 + }, + { + "epoch": 3.2627378759975443, + "grad_norm": 0.2717527747154236, + "learning_rate": 7.86998548624745e-05, + "loss": 1.8246, + "step": 10630 + }, + { + "epoch": 3.2630448127685696, + "grad_norm": 0.32215580344200134, + "learning_rate": 7.86957845412751e-05, + "loss": 1.7278, + "step": 10631 + }, + { + "epoch": 3.263351749539595, + "grad_norm": 0.3578735589981079, + "learning_rate": 7.869171393648717e-05, + "loss": 1.7288, + "step": 10632 + }, + { + "epoch": 3.26365868631062, + "grad_norm": 0.3120707869529724, + "learning_rate": 7.868764304815089e-05, + "loss": 1.7971, + "step": 10633 + }, + { + "epoch": 3.263965623081645, + "grad_norm": 0.27419236302375793, + "learning_rate": 7.86835718763065e-05, + "loss": 1.8529, + "step": 10634 + }, + { + "epoch": 3.2642725598526705, + "grad_norm": 0.3200531601905823, + "learning_rate": 7.867950042099423e-05, + "loss": 1.7892, + "step": 10635 + }, + { + "epoch": 3.2645794966236954, + "grad_norm": 0.325706422328949, + "learning_rate": 7.867542868225435e-05, + "loss": 1.8236, + "step": 10636 + }, + { + "epoch": 3.2648864333947207, + "grad_norm": 0.2950136065483093, + "learning_rate": 7.867135666012707e-05, + "loss": 1.8163, + "step": 10637 + }, + { + "epoch": 3.265193370165746, + "grad_norm": 0.2772117257118225, + "learning_rate": 7.866728435465263e-05, + "loss": 1.8373, + "step": 10638 + }, + { + "epoch": 3.265500306936771, + "grad_norm": 0.2887401580810547, + "learning_rate": 7.866321176587129e-05, + "loss": 1.7756, + "step": 10639 + }, + { + "epoch": 3.2658072437077963, + "grad_norm": 0.3474489152431488, + "learning_rate": 7.865913889382329e-05, + "loss": 1.7539, + "step": 10640 + }, + { + "epoch": 3.266114180478821, + "grad_norm": 0.3433493971824646, + "learning_rate": 7.865506573854888e-05, + 
"loss": 1.7987, + "step": 10641 + }, + { + "epoch": 3.2664211172498465, + "grad_norm": 0.3075394630432129, + "learning_rate": 7.865099230008832e-05, + "loss": 1.7907, + "step": 10642 + }, + { + "epoch": 3.266728054020872, + "grad_norm": 0.24817697703838348, + "learning_rate": 7.864691857848187e-05, + "loss": 1.7941, + "step": 10643 + }, + { + "epoch": 3.2670349907918967, + "grad_norm": 0.290147602558136, + "learning_rate": 7.864284457376976e-05, + "loss": 1.9125, + "step": 10644 + }, + { + "epoch": 3.267341927562922, + "grad_norm": 0.253684937953949, + "learning_rate": 7.863877028599229e-05, + "loss": 1.8084, + "step": 10645 + }, + { + "epoch": 3.267648864333947, + "grad_norm": 0.26349252462387085, + "learning_rate": 7.863469571518969e-05, + "loss": 1.7548, + "step": 10646 + }, + { + "epoch": 3.2679558011049723, + "grad_norm": 0.30568864941596985, + "learning_rate": 7.863062086140224e-05, + "loss": 1.8551, + "step": 10647 + }, + { + "epoch": 3.2682627378759976, + "grad_norm": 0.2866690456867218, + "learning_rate": 7.862654572467024e-05, + "loss": 1.8145, + "step": 10648 + }, + { + "epoch": 3.2685696746470225, + "grad_norm": 0.32022854685783386, + "learning_rate": 7.862247030503391e-05, + "loss": 1.896, + "step": 10649 + }, + { + "epoch": 3.268876611418048, + "grad_norm": 0.25260284543037415, + "learning_rate": 7.861839460253356e-05, + "loss": 1.814, + "step": 10650 + }, + { + "epoch": 3.269183548189073, + "grad_norm": 0.26776066422462463, + "learning_rate": 7.861431861720947e-05, + "loss": 1.7755, + "step": 10651 + }, + { + "epoch": 3.269490484960098, + "grad_norm": 0.26514193415641785, + "learning_rate": 7.861024234910191e-05, + "loss": 1.7606, + "step": 10652 + }, + { + "epoch": 3.2697974217311234, + "grad_norm": 0.27213940024375916, + "learning_rate": 7.860616579825116e-05, + "loss": 1.8074, + "step": 10653 + }, + { + "epoch": 3.2701043585021488, + "grad_norm": 0.29192888736724854, + "learning_rate": 7.860208896469752e-05, + "loss": 1.8436, + "step": 10654 + }, 
+ { + "epoch": 3.2704112952731736, + "grad_norm": 0.3772370219230652, + "learning_rate": 7.859801184848127e-05, + "loss": 1.8096, + "step": 10655 + }, + { + "epoch": 3.270718232044199, + "grad_norm": 0.4574970006942749, + "learning_rate": 7.859393444964269e-05, + "loss": 1.7612, + "step": 10656 + }, + { + "epoch": 3.271025168815224, + "grad_norm": 0.4614393413066864, + "learning_rate": 7.858985676822211e-05, + "loss": 1.8529, + "step": 10657 + }, + { + "epoch": 3.271332105586249, + "grad_norm": 0.33567267656326294, + "learning_rate": 7.85857788042598e-05, + "loss": 1.8391, + "step": 10658 + }, + { + "epoch": 3.2716390423572745, + "grad_norm": 0.2564064860343933, + "learning_rate": 7.858170055779609e-05, + "loss": 1.7621, + "step": 10659 + }, + { + "epoch": 3.2719459791282994, + "grad_norm": 0.26769882440567017, + "learning_rate": 7.857762202887122e-05, + "loss": 1.8145, + "step": 10660 + }, + { + "epoch": 3.2722529158993248, + "grad_norm": 0.262008935213089, + "learning_rate": 7.857354321752558e-05, + "loss": 1.7513, + "step": 10661 + }, + { + "epoch": 3.27255985267035, + "grad_norm": 0.26494377851486206, + "learning_rate": 7.856946412379942e-05, + "loss": 1.8071, + "step": 10662 + }, + { + "epoch": 3.272866789441375, + "grad_norm": 0.25613999366760254, + "learning_rate": 7.856538474773307e-05, + "loss": 1.8775, + "step": 10663 + }, + { + "epoch": 3.2731737262124003, + "grad_norm": 0.24789929389953613, + "learning_rate": 7.856130508936684e-05, + "loss": 1.8055, + "step": 10664 + }, + { + "epoch": 3.2734806629834257, + "grad_norm": 0.29111939668655396, + "learning_rate": 7.855722514874107e-05, + "loss": 1.8114, + "step": 10665 + }, + { + "epoch": 3.2737875997544506, + "grad_norm": 0.30511030554771423, + "learning_rate": 7.855314492589605e-05, + "loss": 1.8131, + "step": 10666 + }, + { + "epoch": 3.274094536525476, + "grad_norm": 0.2545989453792572, + "learning_rate": 7.854906442087212e-05, + "loss": 1.7933, + "step": 10667 + }, + { + "epoch": 3.2744014732965008, + 
"grad_norm": 0.26684823632240295, + "learning_rate": 7.85449836337096e-05, + "loss": 1.7604, + "step": 10668 + }, + { + "epoch": 3.274708410067526, + "grad_norm": 0.5097808837890625, + "learning_rate": 7.854090256444881e-05, + "loss": 1.777, + "step": 10669 + }, + { + "epoch": 3.2750153468385514, + "grad_norm": 0.27828142046928406, + "learning_rate": 7.853682121313011e-05, + "loss": 1.7885, + "step": 10670 + }, + { + "epoch": 3.2753222836095763, + "grad_norm": 0.2925552725791931, + "learning_rate": 7.853273957979381e-05, + "loss": 1.7962, + "step": 10671 + }, + { + "epoch": 3.2756292203806017, + "grad_norm": 0.284574955701828, + "learning_rate": 7.852865766448025e-05, + "loss": 1.8645, + "step": 10672 + }, + { + "epoch": 3.2759361571516266, + "grad_norm": 0.23407664895057678, + "learning_rate": 7.85245754672298e-05, + "loss": 1.7106, + "step": 10673 + }, + { + "epoch": 3.276243093922652, + "grad_norm": 0.2555919885635376, + "learning_rate": 7.852049298808274e-05, + "loss": 1.8237, + "step": 10674 + }, + { + "epoch": 3.2765500306936772, + "grad_norm": 0.26703694462776184, + "learning_rate": 7.851641022707947e-05, + "loss": 1.7844, + "step": 10675 + }, + { + "epoch": 3.276856967464702, + "grad_norm": 0.24889135360717773, + "learning_rate": 7.851232718426033e-05, + "loss": 1.7783, + "step": 10676 + }, + { + "epoch": 3.2771639042357275, + "grad_norm": 0.25770726799964905, + "learning_rate": 7.850824385966564e-05, + "loss": 1.8007, + "step": 10677 + }, + { + "epoch": 3.277470841006753, + "grad_norm": 0.31806984543800354, + "learning_rate": 7.850416025333578e-05, + "loss": 1.8623, + "step": 10678 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.2906930148601532, + "learning_rate": 7.850007636531111e-05, + "loss": 1.8315, + "step": 10679 + }, + { + "epoch": 3.278084714548803, + "grad_norm": 0.2802525460720062, + "learning_rate": 7.849599219563197e-05, + "loss": 1.8488, + "step": 10680 + }, + { + "epoch": 3.2783916513198283, + "grad_norm": 0.26150405406951904, + 
"learning_rate": 7.849190774433874e-05, + "loss": 1.7967, + "step": 10681 + }, + { + "epoch": 3.2786985880908532, + "grad_norm": 0.25863370299339294, + "learning_rate": 7.848782301147178e-05, + "loss": 1.864, + "step": 10682 + }, + { + "epoch": 3.2790055248618786, + "grad_norm": 0.25381043553352356, + "learning_rate": 7.848373799707145e-05, + "loss": 1.8239, + "step": 10683 + }, + { + "epoch": 3.2793124616329035, + "grad_norm": 0.2583387792110443, + "learning_rate": 7.847965270117814e-05, + "loss": 1.8449, + "step": 10684 + }, + { + "epoch": 3.279619398403929, + "grad_norm": 0.30759841203689575, + "learning_rate": 7.84755671238322e-05, + "loss": 1.7992, + "step": 10685 + }, + { + "epoch": 3.279926335174954, + "grad_norm": 0.4316023588180542, + "learning_rate": 7.847148126507402e-05, + "loss": 1.7912, + "step": 10686 + }, + { + "epoch": 3.280233271945979, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.846739512494396e-05, + "loss": 1.8831, + "step": 10687 + }, + { + "epoch": 3.2805402087170044, + "grad_norm": 0.318934828042984, + "learning_rate": 7.846330870348244e-05, + "loss": 1.8411, + "step": 10688 + }, + { + "epoch": 3.2808471454880292, + "grad_norm": 0.27755632996559143, + "learning_rate": 7.84592220007298e-05, + "loss": 1.8763, + "step": 10689 + }, + { + "epoch": 3.2811540822590546, + "grad_norm": 0.33544883131980896, + "learning_rate": 7.845513501672646e-05, + "loss": 1.731, + "step": 10690 + }, + { + "epoch": 3.28146101903008, + "grad_norm": 0.28299057483673096, + "learning_rate": 7.845104775151278e-05, + "loss": 1.813, + "step": 10691 + }, + { + "epoch": 3.281767955801105, + "grad_norm": 0.2761382460594177, + "learning_rate": 7.844696020512918e-05, + "loss": 1.8018, + "step": 10692 + }, + { + "epoch": 3.28207489257213, + "grad_norm": 0.2919033169746399, + "learning_rate": 7.844287237761605e-05, + "loss": 1.793, + "step": 10693 + }, + { + "epoch": 3.2823818293431555, + "grad_norm": 0.32922014594078064, + "learning_rate": 7.843878426901378e-05, + 
"loss": 1.8186, + "step": 10694 + }, + { + "epoch": 3.2826887661141804, + "grad_norm": 0.2818562090396881, + "learning_rate": 7.843469587936279e-05, + "loss": 1.7794, + "step": 10695 + }, + { + "epoch": 3.2829957028852057, + "grad_norm": 0.26414254307746887, + "learning_rate": 7.843060720870345e-05, + "loss": 1.7854, + "step": 10696 + }, + { + "epoch": 3.283302639656231, + "grad_norm": 0.28345760703086853, + "learning_rate": 7.842651825707618e-05, + "loss": 1.7659, + "step": 10697 + }, + { + "epoch": 3.283609576427256, + "grad_norm": 0.3522340655326843, + "learning_rate": 7.842242902452141e-05, + "loss": 1.8427, + "step": 10698 + }, + { + "epoch": 3.2839165131982813, + "grad_norm": 0.2861590087413788, + "learning_rate": 7.841833951107954e-05, + "loss": 1.7539, + "step": 10699 + }, + { + "epoch": 3.284223449969306, + "grad_norm": 0.2596624493598938, + "learning_rate": 7.841424971679099e-05, + "loss": 1.8407, + "step": 10700 + }, + { + "epoch": 3.2845303867403315, + "grad_norm": 0.2847718298435211, + "learning_rate": 7.841015964169616e-05, + "loss": 1.8085, + "step": 10701 + }, + { + "epoch": 3.284837323511357, + "grad_norm": 0.29566115140914917, + "learning_rate": 7.840606928583547e-05, + "loss": 1.7873, + "step": 10702 + }, + { + "epoch": 3.2851442602823817, + "grad_norm": 0.2752111256122589, + "learning_rate": 7.840197864924936e-05, + "loss": 1.8186, + "step": 10703 + }, + { + "epoch": 3.285451197053407, + "grad_norm": 0.2907958924770355, + "learning_rate": 7.839788773197826e-05, + "loss": 1.8081, + "step": 10704 + }, + { + "epoch": 3.285758133824432, + "grad_norm": 0.25808724761009216, + "learning_rate": 7.839379653406258e-05, + "loss": 1.7635, + "step": 10705 + }, + { + "epoch": 3.2860650705954573, + "grad_norm": 0.2732730507850647, + "learning_rate": 7.838970505554277e-05, + "loss": 1.8061, + "step": 10706 + }, + { + "epoch": 3.2863720073664826, + "grad_norm": 0.23820067942142487, + "learning_rate": 7.838561329645923e-05, + "loss": 1.8091, + "step": 10707 + }, 
+ { + "epoch": 3.2866789441375075, + "grad_norm": 0.24179396033287048, + "learning_rate": 7.838152125685245e-05, + "loss": 1.7513, + "step": 10708 + }, + { + "epoch": 3.286985880908533, + "grad_norm": 0.2627546787261963, + "learning_rate": 7.837742893676283e-05, + "loss": 1.8741, + "step": 10709 + }, + { + "epoch": 3.287292817679558, + "grad_norm": 0.2827817499637604, + "learning_rate": 7.837333633623083e-05, + "loss": 1.8387, + "step": 10710 + }, + { + "epoch": 3.287599754450583, + "grad_norm": 0.2666749060153961, + "learning_rate": 7.836924345529688e-05, + "loss": 1.8319, + "step": 10711 + }, + { + "epoch": 3.2879066912216084, + "grad_norm": 0.3403390944004059, + "learning_rate": 7.836515029400145e-05, + "loss": 1.7827, + "step": 10712 + }, + { + "epoch": 3.2882136279926337, + "grad_norm": 0.30646705627441406, + "learning_rate": 7.836105685238497e-05, + "loss": 1.8612, + "step": 10713 + }, + { + "epoch": 3.2885205647636586, + "grad_norm": 0.2580253481864929, + "learning_rate": 7.83569631304879e-05, + "loss": 1.7332, + "step": 10714 + }, + { + "epoch": 3.288827501534684, + "grad_norm": 0.23734542727470398, + "learning_rate": 7.835286912835071e-05, + "loss": 1.7899, + "step": 10715 + }, + { + "epoch": 3.289134438305709, + "grad_norm": 0.2457810491323471, + "learning_rate": 7.834877484601384e-05, + "loss": 1.8059, + "step": 10716 + }, + { + "epoch": 3.289441375076734, + "grad_norm": 0.2558443248271942, + "learning_rate": 7.834468028351778e-05, + "loss": 1.8689, + "step": 10717 + }, + { + "epoch": 3.2897483118477595, + "grad_norm": 0.26596710085868835, + "learning_rate": 7.834058544090298e-05, + "loss": 1.816, + "step": 10718 + }, + { + "epoch": 3.2900552486187844, + "grad_norm": 0.25424903631210327, + "learning_rate": 7.833649031820987e-05, + "loss": 1.7907, + "step": 10719 + }, + { + "epoch": 3.2903621853898097, + "grad_norm": 0.23873139917850494, + "learning_rate": 7.833239491547896e-05, + "loss": 1.7666, + "step": 10720 + }, + { + "epoch": 3.2906691221608346, + 
"grad_norm": 0.23292972147464752, + "learning_rate": 7.832829923275073e-05, + "loss": 1.7674, + "step": 10721 + }, + { + "epoch": 3.29097605893186, + "grad_norm": 0.30133312940597534, + "learning_rate": 7.832420327006566e-05, + "loss": 1.8229, + "step": 10722 + }, + { + "epoch": 3.2912829957028853, + "grad_norm": 0.2882522642612457, + "learning_rate": 7.83201070274642e-05, + "loss": 1.7855, + "step": 10723 + }, + { + "epoch": 3.29158993247391, + "grad_norm": 0.2578088045120239, + "learning_rate": 7.831601050498683e-05, + "loss": 1.7276, + "step": 10724 + }, + { + "epoch": 3.2918968692449355, + "grad_norm": 0.29511600732803345, + "learning_rate": 7.831191370267406e-05, + "loss": 1.8085, + "step": 10725 + }, + { + "epoch": 3.292203806015961, + "grad_norm": 0.29557499289512634, + "learning_rate": 7.830781662056634e-05, + "loss": 1.815, + "step": 10726 + }, + { + "epoch": 3.2925107427869857, + "grad_norm": 0.32722121477127075, + "learning_rate": 7.830371925870422e-05, + "loss": 1.7889, + "step": 10727 + }, + { + "epoch": 3.292817679558011, + "grad_norm": 0.3124488592147827, + "learning_rate": 7.829962161712814e-05, + "loss": 1.8063, + "step": 10728 + }, + { + "epoch": 3.2931246163290364, + "grad_norm": 0.311334490776062, + "learning_rate": 7.829552369587861e-05, + "loss": 1.8852, + "step": 10729 + }, + { + "epoch": 3.2934315531000613, + "grad_norm": 0.28010860085487366, + "learning_rate": 7.829142549499613e-05, + "loss": 1.8274, + "step": 10730 + }, + { + "epoch": 3.2937384898710866, + "grad_norm": 0.3453529477119446, + "learning_rate": 7.828732701452119e-05, + "loss": 1.8618, + "step": 10731 + }, + { + "epoch": 3.2940454266421115, + "grad_norm": 0.2946802079677582, + "learning_rate": 7.828322825449432e-05, + "loss": 1.7123, + "step": 10732 + }, + { + "epoch": 3.294352363413137, + "grad_norm": 0.2467648684978485, + "learning_rate": 7.827912921495601e-05, + "loss": 1.7786, + "step": 10733 + }, + { + "epoch": 3.294659300184162, + "grad_norm": 0.2957034707069397, + 
"learning_rate": 7.827502989594677e-05, + "loss": 1.7817, + "step": 10734 + }, + { + "epoch": 3.294966236955187, + "grad_norm": 0.300905704498291, + "learning_rate": 7.827093029750713e-05, + "loss": 1.7582, + "step": 10735 + }, + { + "epoch": 3.2952731737262124, + "grad_norm": 0.28935131430625916, + "learning_rate": 7.826683041967757e-05, + "loss": 1.7766, + "step": 10736 + }, + { + "epoch": 3.2955801104972378, + "grad_norm": 0.26046010851860046, + "learning_rate": 7.826273026249861e-05, + "loss": 1.8152, + "step": 10737 + }, + { + "epoch": 3.2958870472682626, + "grad_norm": 0.24247924983501434, + "learning_rate": 7.82586298260108e-05, + "loss": 1.8679, + "step": 10738 + }, + { + "epoch": 3.296193984039288, + "grad_norm": 0.25977620482444763, + "learning_rate": 7.825452911025466e-05, + "loss": 1.8108, + "step": 10739 + }, + { + "epoch": 3.2965009208103133, + "grad_norm": 0.2732592821121216, + "learning_rate": 7.825042811527068e-05, + "loss": 1.7355, + "step": 10740 + }, + { + "epoch": 3.296807857581338, + "grad_norm": 0.38407859206199646, + "learning_rate": 7.824632684109941e-05, + "loss": 1.8418, + "step": 10741 + }, + { + "epoch": 3.2971147943523635, + "grad_norm": 0.4239252805709839, + "learning_rate": 7.82422252877814e-05, + "loss": 1.7655, + "step": 10742 + }, + { + "epoch": 3.2974217311233884, + "grad_norm": 0.3810526132583618, + "learning_rate": 7.823812345535716e-05, + "loss": 1.8804, + "step": 10743 + }, + { + "epoch": 3.2977286678944138, + "grad_norm": 0.29939520359039307, + "learning_rate": 7.823402134386722e-05, + "loss": 1.8207, + "step": 10744 + }, + { + "epoch": 3.298035604665439, + "grad_norm": 0.4053972065448761, + "learning_rate": 7.822991895335215e-05, + "loss": 1.7901, + "step": 10745 + }, + { + "epoch": 3.298342541436464, + "grad_norm": 0.4975005090236664, + "learning_rate": 7.822581628385247e-05, + "loss": 1.8344, + "step": 10746 + }, + { + "epoch": 3.2986494782074893, + "grad_norm": 0.4100436270236969, + "learning_rate": 
7.822171333540874e-05, + "loss": 1.7891, + "step": 10747 + }, + { + "epoch": 3.298956414978514, + "grad_norm": 0.2817644476890564, + "learning_rate": 7.821761010806147e-05, + "loss": 1.7895, + "step": 10748 + }, + { + "epoch": 3.2992633517495396, + "grad_norm": 0.332660973072052, + "learning_rate": 7.821350660185125e-05, + "loss": 1.7281, + "step": 10749 + }, + { + "epoch": 3.299570288520565, + "grad_norm": 0.42652732133865356, + "learning_rate": 7.820940281681863e-05, + "loss": 1.7855, + "step": 10750 + }, + { + "epoch": 3.2998772252915898, + "grad_norm": 0.35700714588165283, + "learning_rate": 7.820529875300415e-05, + "loss": 1.8722, + "step": 10751 + }, + { + "epoch": 3.300184162062615, + "grad_norm": 0.25305211544036865, + "learning_rate": 7.820119441044838e-05, + "loss": 1.7696, + "step": 10752 + }, + { + "epoch": 3.3004910988336404, + "grad_norm": 0.280205637216568, + "learning_rate": 7.819708978919188e-05, + "loss": 1.756, + "step": 10753 + }, + { + "epoch": 3.3007980356046653, + "grad_norm": 0.4176226854324341, + "learning_rate": 7.819298488927521e-05, + "loss": 1.7731, + "step": 10754 + }, + { + "epoch": 3.3011049723756907, + "grad_norm": 0.4264865517616272, + "learning_rate": 7.818887971073894e-05, + "loss": 1.7851, + "step": 10755 + }, + { + "epoch": 3.301411909146716, + "grad_norm": 0.2901221215724945, + "learning_rate": 7.818477425362363e-05, + "loss": 1.7356, + "step": 10756 + }, + { + "epoch": 3.301718845917741, + "grad_norm": 0.29583361744880676, + "learning_rate": 7.818066851796986e-05, + "loss": 1.8269, + "step": 10757 + }, + { + "epoch": 3.3020257826887662, + "grad_norm": 0.38592997193336487, + "learning_rate": 7.817656250381821e-05, + "loss": 1.7515, + "step": 10758 + }, + { + "epoch": 3.302332719459791, + "grad_norm": 0.29301533102989197, + "learning_rate": 7.817245621120927e-05, + "loss": 1.7955, + "step": 10759 + }, + { + "epoch": 3.3026396562308165, + "grad_norm": 0.2770880162715912, + "learning_rate": 7.816834964018359e-05, + "loss": 
1.7899, + "step": 10760 + }, + { + "epoch": 3.302946593001842, + "grad_norm": 0.32566413283348083, + "learning_rate": 7.816424279078176e-05, + "loss": 1.74, + "step": 10761 + }, + { + "epoch": 3.3032535297728667, + "grad_norm": 0.3077750504016876, + "learning_rate": 7.81601356630444e-05, + "loss": 1.8123, + "step": 10762 + }, + { + "epoch": 3.303560466543892, + "grad_norm": 0.2826370298862457, + "learning_rate": 7.815602825701206e-05, + "loss": 1.865, + "step": 10763 + }, + { + "epoch": 3.303867403314917, + "grad_norm": 0.31700822710990906, + "learning_rate": 7.815192057272534e-05, + "loss": 1.8021, + "step": 10764 + }, + { + "epoch": 3.3041743400859422, + "grad_norm": 0.33182790875434875, + "learning_rate": 7.814781261022486e-05, + "loss": 1.818, + "step": 10765 + }, + { + "epoch": 3.3044812768569676, + "grad_norm": 0.2720039486885071, + "learning_rate": 7.814370436955118e-05, + "loss": 1.8369, + "step": 10766 + }, + { + "epoch": 3.3047882136279925, + "grad_norm": 0.28134068846702576, + "learning_rate": 7.813959585074493e-05, + "loss": 1.8391, + "step": 10767 + }, + { + "epoch": 3.305095150399018, + "grad_norm": 0.25748828053474426, + "learning_rate": 7.813548705384667e-05, + "loss": 1.7987, + "step": 10768 + }, + { + "epoch": 3.305402087170043, + "grad_norm": 0.26187625527381897, + "learning_rate": 7.813137797889708e-05, + "loss": 1.7645, + "step": 10769 + }, + { + "epoch": 3.305709023941068, + "grad_norm": 0.297262579202652, + "learning_rate": 7.812726862593671e-05, + "loss": 1.771, + "step": 10770 + }, + { + "epoch": 3.3060159607120934, + "grad_norm": 0.2987872064113617, + "learning_rate": 7.812315899500618e-05, + "loss": 1.8115, + "step": 10771 + }, + { + "epoch": 3.3063228974831187, + "grad_norm": 0.31963878870010376, + "learning_rate": 7.81190490861461e-05, + "loss": 1.7685, + "step": 10772 + }, + { + "epoch": 3.3066298342541436, + "grad_norm": 0.27007177472114563, + "learning_rate": 7.81149388993971e-05, + "loss": 1.8272, + "step": 10773 + }, + { + "epoch": 
3.306936771025169, + "grad_norm": 0.26818498969078064, + "learning_rate": 7.811082843479981e-05, + "loss": 1.7894, + "step": 10774 + }, + { + "epoch": 3.307243707796194, + "grad_norm": 0.28857091069221497, + "learning_rate": 7.810671769239483e-05, + "loss": 1.8769, + "step": 10775 + }, + { + "epoch": 3.307550644567219, + "grad_norm": 0.26983144879341125, + "learning_rate": 7.810260667222277e-05, + "loss": 1.796, + "step": 10776 + }, + { + "epoch": 3.3078575813382445, + "grad_norm": 0.2566467225551605, + "learning_rate": 7.809849537432432e-05, + "loss": 1.848, + "step": 10777 + }, + { + "epoch": 3.3081645181092694, + "grad_norm": 0.25607848167419434, + "learning_rate": 7.809438379874005e-05, + "loss": 1.8072, + "step": 10778 + }, + { + "epoch": 3.3084714548802947, + "grad_norm": 0.29158470034599304, + "learning_rate": 7.809027194551059e-05, + "loss": 1.7772, + "step": 10779 + }, + { + "epoch": 3.3087783916513196, + "grad_norm": 0.360897421836853, + "learning_rate": 7.808615981467664e-05, + "loss": 1.8404, + "step": 10780 + }, + { + "epoch": 3.309085328422345, + "grad_norm": 0.31121253967285156, + "learning_rate": 7.808204740627877e-05, + "loss": 1.8137, + "step": 10781 + }, + { + "epoch": 3.3093922651933703, + "grad_norm": 0.2846451699733734, + "learning_rate": 7.807793472035765e-05, + "loss": 1.8367, + "step": 10782 + }, + { + "epoch": 3.309699201964395, + "grad_norm": 0.2711004316806793, + "learning_rate": 7.807382175695393e-05, + "loss": 1.7728, + "step": 10783 + }, + { + "epoch": 3.3100061387354205, + "grad_norm": 0.2693859338760376, + "learning_rate": 7.806970851610824e-05, + "loss": 1.7026, + "step": 10784 + }, + { + "epoch": 3.310313075506446, + "grad_norm": 0.3050517439842224, + "learning_rate": 7.806559499786125e-05, + "loss": 1.8041, + "step": 10785 + }, + { + "epoch": 3.3106200122774707, + "grad_norm": 0.27304747700691223, + "learning_rate": 7.80614812022536e-05, + "loss": 1.8182, + "step": 10786 + }, + { + "epoch": 3.310926949048496, + "grad_norm": 
0.28378555178642273, + "learning_rate": 7.805736712932594e-05, + "loss": 1.8519, + "step": 10787 + }, + { + "epoch": 3.3112338858195214, + "grad_norm": 0.30620133876800537, + "learning_rate": 7.805325277911892e-05, + "loss": 1.8594, + "step": 10788 + }, + { + "epoch": 3.3115408225905463, + "grad_norm": 0.2580169141292572, + "learning_rate": 7.804913815167325e-05, + "loss": 1.7897, + "step": 10789 + }, + { + "epoch": 3.3118477593615716, + "grad_norm": 0.28937023878097534, + "learning_rate": 7.804502324702951e-05, + "loss": 1.8362, + "step": 10790 + }, + { + "epoch": 3.3121546961325965, + "grad_norm": 0.28032705187797546, + "learning_rate": 7.804090806522844e-05, + "loss": 1.8168, + "step": 10791 + }, + { + "epoch": 3.312461632903622, + "grad_norm": 0.33712559938430786, + "learning_rate": 7.803679260631069e-05, + "loss": 1.7489, + "step": 10792 + }, + { + "epoch": 3.312768569674647, + "grad_norm": 0.40536820888519287, + "learning_rate": 7.80326768703169e-05, + "loss": 1.8413, + "step": 10793 + }, + { + "epoch": 3.313075506445672, + "grad_norm": 0.34967559576034546, + "learning_rate": 7.802856085728778e-05, + "loss": 1.8076, + "step": 10794 + }, + { + "epoch": 3.3133824432166974, + "grad_norm": 0.2429870367050171, + "learning_rate": 7.8024444567264e-05, + "loss": 1.8002, + "step": 10795 + }, + { + "epoch": 3.3136893799877223, + "grad_norm": 0.40956684947013855, + "learning_rate": 7.802032800028621e-05, + "loss": 1.8151, + "step": 10796 + }, + { + "epoch": 3.3139963167587476, + "grad_norm": 0.4908781945705414, + "learning_rate": 7.801621115639512e-05, + "loss": 1.8124, + "step": 10797 + }, + { + "epoch": 3.314303253529773, + "grad_norm": 0.3922197222709656, + "learning_rate": 7.801209403563143e-05, + "loss": 1.7911, + "step": 10798 + }, + { + "epoch": 3.314610190300798, + "grad_norm": 0.29467105865478516, + "learning_rate": 7.800797663803578e-05, + "loss": 1.8472, + "step": 10799 + }, + { + "epoch": 3.314917127071823, + "grad_norm": 0.384974867105484, + 
"learning_rate": 7.800385896364891e-05, + "loss": 1.8139, + "step": 10800 + }, + { + "epoch": 3.3152240638428485, + "grad_norm": 0.4605129063129425, + "learning_rate": 7.79997410125115e-05, + "loss": 1.7982, + "step": 10801 + }, + { + "epoch": 3.3155310006138734, + "grad_norm": 0.2982464134693146, + "learning_rate": 7.799562278466423e-05, + "loss": 1.8496, + "step": 10802 + }, + { + "epoch": 3.3158379373848987, + "grad_norm": 0.3101392984390259, + "learning_rate": 7.79915042801478e-05, + "loss": 1.8172, + "step": 10803 + }, + { + "epoch": 3.316144874155924, + "grad_norm": 0.3651282489299774, + "learning_rate": 7.798738549900292e-05, + "loss": 1.7497, + "step": 10804 + }, + { + "epoch": 3.316451810926949, + "grad_norm": 0.28504419326782227, + "learning_rate": 7.79832664412703e-05, + "loss": 1.8027, + "step": 10805 + }, + { + "epoch": 3.3167587476979743, + "grad_norm": 0.28333309292793274, + "learning_rate": 7.797914710699063e-05, + "loss": 1.8121, + "step": 10806 + }, + { + "epoch": 3.317065684468999, + "grad_norm": 0.37549784779548645, + "learning_rate": 7.797502749620462e-05, + "loss": 1.817, + "step": 10807 + }, + { + "epoch": 3.3173726212400245, + "grad_norm": 0.3864210844039917, + "learning_rate": 7.797090760895301e-05, + "loss": 1.852, + "step": 10808 + }, + { + "epoch": 3.31767955801105, + "grad_norm": 0.2422102987766266, + "learning_rate": 7.79667874452765e-05, + "loss": 1.7523, + "step": 10809 + }, + { + "epoch": 3.3179864947820747, + "grad_norm": 0.307892382144928, + "learning_rate": 7.79626670052158e-05, + "loss": 1.7436, + "step": 10810 + }, + { + "epoch": 3.3182934315531, + "grad_norm": 0.29607462882995605, + "learning_rate": 7.795854628881162e-05, + "loss": 1.768, + "step": 10811 + }, + { + "epoch": 3.3186003683241254, + "grad_norm": 0.23334427177906036, + "learning_rate": 7.795442529610471e-05, + "loss": 1.7687, + "step": 10812 + }, + { + "epoch": 3.3189073050951503, + "grad_norm": 0.26257455348968506, + "learning_rate": 7.795030402713578e-05, + 
"loss": 1.8266, + "step": 10813 + }, + { + "epoch": 3.3192142418661756, + "grad_norm": 0.3252788782119751, + "learning_rate": 7.794618248194556e-05, + "loss": 1.8645, + "step": 10814 + }, + { + "epoch": 3.319521178637201, + "grad_norm": 0.3807232975959778, + "learning_rate": 7.79420606605748e-05, + "loss": 1.8154, + "step": 10815 + }, + { + "epoch": 3.319828115408226, + "grad_norm": 0.3395625948905945, + "learning_rate": 7.793793856306422e-05, + "loss": 1.8002, + "step": 10816 + }, + { + "epoch": 3.320135052179251, + "grad_norm": 0.2896415889263153, + "learning_rate": 7.793381618945455e-05, + "loss": 1.8077, + "step": 10817 + }, + { + "epoch": 3.320441988950276, + "grad_norm": 0.27733489871025085, + "learning_rate": 7.792969353978652e-05, + "loss": 1.7976, + "step": 10818 + }, + { + "epoch": 3.3207489257213014, + "grad_norm": 0.36985141038894653, + "learning_rate": 7.79255706141009e-05, + "loss": 1.8724, + "step": 10819 + }, + { + "epoch": 3.3210558624923268, + "grad_norm": 0.37886983156204224, + "learning_rate": 7.792144741243843e-05, + "loss": 1.8249, + "step": 10820 + }, + { + "epoch": 3.3213627992633517, + "grad_norm": 0.3030721843242645, + "learning_rate": 7.791732393483986e-05, + "loss": 1.7975, + "step": 10821 + }, + { + "epoch": 3.321669736034377, + "grad_norm": 0.2637709081172943, + "learning_rate": 7.791320018134592e-05, + "loss": 1.7205, + "step": 10822 + }, + { + "epoch": 3.321976672805402, + "grad_norm": 0.35307520627975464, + "learning_rate": 7.790907615199736e-05, + "loss": 1.8786, + "step": 10823 + }, + { + "epoch": 3.322283609576427, + "grad_norm": 0.3333272635936737, + "learning_rate": 7.790495184683497e-05, + "loss": 1.7715, + "step": 10824 + }, + { + "epoch": 3.3225905463474525, + "grad_norm": 0.2597469091415405, + "learning_rate": 7.790082726589948e-05, + "loss": 1.8379, + "step": 10825 + }, + { + "epoch": 3.3228974831184774, + "grad_norm": 0.34176257252693176, + "learning_rate": 7.789670240923168e-05, + "loss": 1.8305, + "step": 10826 + }, + { 
+ "epoch": 3.3232044198895028, + "grad_norm": 0.37954533100128174, + "learning_rate": 7.789257727687229e-05, + "loss": 1.7728, + "step": 10827 + }, + { + "epoch": 3.323511356660528, + "grad_norm": 0.2840248644351959, + "learning_rate": 7.788845186886212e-05, + "loss": 1.8059, + "step": 10828 + }, + { + "epoch": 3.323818293431553, + "grad_norm": 0.3650275766849518, + "learning_rate": 7.788432618524193e-05, + "loss": 1.8127, + "step": 10829 + }, + { + "epoch": 3.3241252302025783, + "grad_norm": 0.4869692623615265, + "learning_rate": 7.788020022605247e-05, + "loss": 1.833, + "step": 10830 + }, + { + "epoch": 3.3244321669736037, + "grad_norm": 0.3419482707977295, + "learning_rate": 7.787607399133453e-05, + "loss": 1.7812, + "step": 10831 + }, + { + "epoch": 3.3247391037446286, + "grad_norm": 0.27625617384910583, + "learning_rate": 7.787194748112889e-05, + "loss": 1.8513, + "step": 10832 + }, + { + "epoch": 3.325046040515654, + "grad_norm": 0.4287806749343872, + "learning_rate": 7.786782069547633e-05, + "loss": 1.836, + "step": 10833 + }, + { + "epoch": 3.325352977286679, + "grad_norm": 0.4345545172691345, + "learning_rate": 7.786369363441763e-05, + "loss": 1.8027, + "step": 10834 + }, + { + "epoch": 3.325659914057704, + "grad_norm": 0.32976534962654114, + "learning_rate": 7.78595662979936e-05, + "loss": 1.7987, + "step": 10835 + }, + { + "epoch": 3.3259668508287294, + "grad_norm": 0.2677469849586487, + "learning_rate": 7.785543868624498e-05, + "loss": 1.8312, + "step": 10836 + }, + { + "epoch": 3.3262737875997543, + "grad_norm": 0.2547740638256073, + "learning_rate": 7.785131079921259e-05, + "loss": 1.7844, + "step": 10837 + }, + { + "epoch": 3.3265807243707797, + "grad_norm": 0.26755592226982117, + "learning_rate": 7.784718263693725e-05, + "loss": 1.8263, + "step": 10838 + }, + { + "epoch": 3.3268876611418046, + "grad_norm": 0.23884403705596924, + "learning_rate": 7.784305419945969e-05, + "loss": 1.7862, + "step": 10839 + }, + { + "epoch": 3.32719459791283, + 
"grad_norm": 0.2896903157234192, + "learning_rate": 7.783892548682077e-05, + "loss": 1.9138, + "step": 10840 + }, + { + "epoch": 3.3275015346838552, + "grad_norm": 0.3201359510421753, + "learning_rate": 7.783479649906127e-05, + "loss": 1.8382, + "step": 10841 + }, + { + "epoch": 3.32780847145488, + "grad_norm": 0.39285311102867126, + "learning_rate": 7.7830667236222e-05, + "loss": 1.7763, + "step": 10842 + }, + { + "epoch": 3.3281154082259055, + "grad_norm": 0.435007244348526, + "learning_rate": 7.782653769834376e-05, + "loss": 1.8415, + "step": 10843 + }, + { + "epoch": 3.328422344996931, + "grad_norm": 0.34605318307876587, + "learning_rate": 7.782240788546736e-05, + "loss": 1.757, + "step": 10844 + }, + { + "epoch": 3.3287292817679557, + "grad_norm": 0.26830604672431946, + "learning_rate": 7.781827779763362e-05, + "loss": 1.7779, + "step": 10845 + }, + { + "epoch": 3.329036218538981, + "grad_norm": 0.41851529479026794, + "learning_rate": 7.781414743488336e-05, + "loss": 1.8609, + "step": 10846 + }, + { + "epoch": 3.3293431553100064, + "grad_norm": 0.5058079361915588, + "learning_rate": 7.78100167972574e-05, + "loss": 1.8146, + "step": 10847 + }, + { + "epoch": 3.3296500920810312, + "grad_norm": 0.34394967555999756, + "learning_rate": 7.780588588479654e-05, + "loss": 1.8079, + "step": 10848 + }, + { + "epoch": 3.3299570288520566, + "grad_norm": 0.3033885061740875, + "learning_rate": 7.780175469754161e-05, + "loss": 1.8223, + "step": 10849 + }, + { + "epoch": 3.3302639656230815, + "grad_norm": 0.4431045651435852, + "learning_rate": 7.779762323553347e-05, + "loss": 1.8841, + "step": 10850 + }, + { + "epoch": 3.330570902394107, + "grad_norm": 0.3451448976993561, + "learning_rate": 7.77934914988129e-05, + "loss": 1.8092, + "step": 10851 + }, + { + "epoch": 3.330877839165132, + "grad_norm": 0.26580891013145447, + "learning_rate": 7.778935948742077e-05, + "loss": 1.8244, + "step": 10852 + }, + { + "epoch": 3.331184775936157, + "grad_norm": 0.32079070806503296, + 
"learning_rate": 7.778522720139792e-05, + "loss": 1.7816, + "step": 10853 + }, + { + "epoch": 3.3314917127071824, + "grad_norm": 0.35789042711257935, + "learning_rate": 7.778109464078514e-05, + "loss": 1.8211, + "step": 10854 + }, + { + "epoch": 3.3317986494782073, + "grad_norm": 0.2808612585067749, + "learning_rate": 7.77769618056233e-05, + "loss": 1.8387, + "step": 10855 + }, + { + "epoch": 3.3321055862492326, + "grad_norm": 0.24760548770427704, + "learning_rate": 7.777282869595326e-05, + "loss": 1.7795, + "step": 10856 + }, + { + "epoch": 3.332412523020258, + "grad_norm": 0.2840912640094757, + "learning_rate": 7.776869531181583e-05, + "loss": 1.7492, + "step": 10857 + }, + { + "epoch": 3.332719459791283, + "grad_norm": 0.2881413698196411, + "learning_rate": 7.77645616532519e-05, + "loss": 1.8157, + "step": 10858 + }, + { + "epoch": 3.333026396562308, + "grad_norm": 0.2508779764175415, + "learning_rate": 7.776042772030228e-05, + "loss": 1.8196, + "step": 10859 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3307822048664093, + "learning_rate": 7.775629351300785e-05, + "loss": 1.8195, + "step": 10860 + }, + { + "epoch": 3.3336402701043584, + "grad_norm": 0.34392043948173523, + "learning_rate": 7.775215903140946e-05, + "loss": 1.7775, + "step": 10861 + }, + { + "epoch": 3.3339472068753837, + "grad_norm": 0.2594252824783325, + "learning_rate": 7.774802427554796e-05, + "loss": 1.7687, + "step": 10862 + }, + { + "epoch": 3.334254143646409, + "grad_norm": 0.3109053075313568, + "learning_rate": 7.774388924546423e-05, + "loss": 1.7908, + "step": 10863 + }, + { + "epoch": 3.334561080417434, + "grad_norm": 0.4801923930644989, + "learning_rate": 7.773975394119913e-05, + "loss": 1.8316, + "step": 10864 + }, + { + "epoch": 3.3348680171884593, + "grad_norm": 0.4754973351955414, + "learning_rate": 7.77356183627935e-05, + "loss": 1.8015, + "step": 10865 + }, + { + "epoch": 3.335174953959484, + "grad_norm": 0.29624658823013306, + "learning_rate": 7.773148251028825e-05, + 
"loss": 1.8179, + "step": 10866 + }, + { + "epoch": 3.3354818907305095, + "grad_norm": 0.32207581400871277, + "learning_rate": 7.772734638372423e-05, + "loss": 1.799, + "step": 10867 + }, + { + "epoch": 3.335788827501535, + "grad_norm": 0.5227517485618591, + "learning_rate": 7.772320998314233e-05, + "loss": 1.8452, + "step": 10868 + }, + { + "epoch": 3.3360957642725597, + "grad_norm": 0.4081100523471832, + "learning_rate": 7.771907330858341e-05, + "loss": 1.8182, + "step": 10869 + }, + { + "epoch": 3.336402701043585, + "grad_norm": 0.23786653578281403, + "learning_rate": 7.771493636008838e-05, + "loss": 1.7392, + "step": 10870 + }, + { + "epoch": 3.33670963781461, + "grad_norm": 0.37913820147514343, + "learning_rate": 7.771079913769807e-05, + "loss": 1.7559, + "step": 10871 + }, + { + "epoch": 3.3370165745856353, + "grad_norm": 0.4939163625240326, + "learning_rate": 7.770666164145344e-05, + "loss": 1.8076, + "step": 10872 + }, + { + "epoch": 3.3373235113566606, + "grad_norm": 0.3322528302669525, + "learning_rate": 7.770252387139532e-05, + "loss": 1.8045, + "step": 10873 + }, + { + "epoch": 3.337630448127686, + "grad_norm": 0.3685782849788666, + "learning_rate": 7.769838582756461e-05, + "loss": 1.7703, + "step": 10874 + }, + { + "epoch": 3.337937384898711, + "grad_norm": 0.5564271807670593, + "learning_rate": 7.769424751000224e-05, + "loss": 1.7697, + "step": 10875 + }, + { + "epoch": 3.338244321669736, + "grad_norm": 0.38610726594924927, + "learning_rate": 7.769010891874906e-05, + "loss": 1.7944, + "step": 10876 + }, + { + "epoch": 3.338551258440761, + "grad_norm": 0.23838558793067932, + "learning_rate": 7.768597005384602e-05, + "loss": 1.765, + "step": 10877 + }, + { + "epoch": 3.3388581952117864, + "grad_norm": 0.4334571063518524, + "learning_rate": 7.768183091533399e-05, + "loss": 1.7854, + "step": 10878 + }, + { + "epoch": 3.3391651319828117, + "grad_norm": 0.44844719767570496, + "learning_rate": 7.767769150325386e-05, + "loss": 1.7955, + "step": 10879 + }, + { 
+ "epoch": 3.3394720687538366, + "grad_norm": 0.26543378829956055, + "learning_rate": 7.767355181764659e-05, + "loss": 1.8311, + "step": 10880 + }, + { + "epoch": 3.339779005524862, + "grad_norm": 0.39401358366012573, + "learning_rate": 7.766941185855304e-05, + "loss": 1.8264, + "step": 10881 + }, + { + "epoch": 3.340085942295887, + "grad_norm": 0.5476824045181274, + "learning_rate": 7.766527162601416e-05, + "loss": 1.8051, + "step": 10882 + }, + { + "epoch": 3.340392879066912, + "grad_norm": 0.4021138548851013, + "learning_rate": 7.766113112007084e-05, + "loss": 1.7941, + "step": 10883 + }, + { + "epoch": 3.3406998158379375, + "grad_norm": 0.3262040317058563, + "learning_rate": 7.765699034076402e-05, + "loss": 1.8317, + "step": 10884 + }, + { + "epoch": 3.3410067526089624, + "grad_norm": 0.5461146831512451, + "learning_rate": 7.765284928813459e-05, + "loss": 1.833, + "step": 10885 + }, + { + "epoch": 3.3413136893799877, + "grad_norm": 0.5067405700683594, + "learning_rate": 7.764870796222351e-05, + "loss": 1.7862, + "step": 10886 + }, + { + "epoch": 3.341620626151013, + "grad_norm": 0.2731069028377533, + "learning_rate": 7.76445663630717e-05, + "loss": 1.8173, + "step": 10887 + }, + { + "epoch": 3.341927562922038, + "grad_norm": 0.48928195238113403, + "learning_rate": 7.764042449072008e-05, + "loss": 1.7992, + "step": 10888 + }, + { + "epoch": 3.3422344996930633, + "grad_norm": 0.5338504910469055, + "learning_rate": 7.763628234520958e-05, + "loss": 1.7891, + "step": 10889 + }, + { + "epoch": 3.3425414364640886, + "grad_norm": 0.3136523365974426, + "learning_rate": 7.763213992658114e-05, + "loss": 1.8623, + "step": 10890 + }, + { + "epoch": 3.3428483732351135, + "grad_norm": 0.36551395058631897, + "learning_rate": 7.762799723487568e-05, + "loss": 1.8474, + "step": 10891 + }, + { + "epoch": 3.343155310006139, + "grad_norm": 0.35772353410720825, + "learning_rate": 7.762385427013419e-05, + "loss": 1.8625, + "step": 10892 + }, + { + "epoch": 3.3434622467771637, + 
"grad_norm": 0.29944708943367004, + "learning_rate": 7.761971103239755e-05, + "loss": 1.8181, + "step": 10893 + }, + { + "epoch": 3.343769183548189, + "grad_norm": 0.3395330309867859, + "learning_rate": 7.761556752170676e-05, + "loss": 1.7943, + "step": 10894 + }, + { + "epoch": 3.3440761203192144, + "grad_norm": 0.3624265193939209, + "learning_rate": 7.761142373810274e-05, + "loss": 1.8234, + "step": 10895 + }, + { + "epoch": 3.3443830570902393, + "grad_norm": 0.25409621000289917, + "learning_rate": 7.760727968162644e-05, + "loss": 1.7532, + "step": 10896 + }, + { + "epoch": 3.3446899938612646, + "grad_norm": 0.321437805891037, + "learning_rate": 7.760313535231883e-05, + "loss": 1.8808, + "step": 10897 + }, + { + "epoch": 3.3449969306322895, + "grad_norm": 0.2919142544269562, + "learning_rate": 7.759899075022086e-05, + "loss": 1.7677, + "step": 10898 + }, + { + "epoch": 3.345303867403315, + "grad_norm": 0.26515716314315796, + "learning_rate": 7.759484587537346e-05, + "loss": 1.8118, + "step": 10899 + }, + { + "epoch": 3.34561080417434, + "grad_norm": 0.2963240146636963, + "learning_rate": 7.759070072781764e-05, + "loss": 1.8329, + "step": 10900 + }, + { + "epoch": 3.345917740945365, + "grad_norm": 0.3186480700969696, + "learning_rate": 7.758655530759435e-05, + "loss": 1.8013, + "step": 10901 + }, + { + "epoch": 3.3462246777163904, + "grad_norm": 0.256145715713501, + "learning_rate": 7.758240961474454e-05, + "loss": 1.7865, + "step": 10902 + }, + { + "epoch": 3.3465316144874158, + "grad_norm": 0.28951629996299744, + "learning_rate": 7.757826364930921e-05, + "loss": 1.8091, + "step": 10903 + }, + { + "epoch": 3.3468385512584407, + "grad_norm": 0.2692483365535736, + "learning_rate": 7.75741174113293e-05, + "loss": 1.8308, + "step": 10904 + }, + { + "epoch": 3.347145488029466, + "grad_norm": 0.27615389227867126, + "learning_rate": 7.75699709008458e-05, + "loss": 1.7888, + "step": 10905 + }, + { + "epoch": 3.3474524248004913, + "grad_norm": 0.2819034457206726, + 
"learning_rate": 7.75658241178997e-05, + "loss": 1.7624, + "step": 10906 + }, + { + "epoch": 3.347759361571516, + "grad_norm": 0.2627592086791992, + "learning_rate": 7.756167706253196e-05, + "loss": 1.7696, + "step": 10907 + }, + { + "epoch": 3.3480662983425415, + "grad_norm": 0.3528621196746826, + "learning_rate": 7.755752973478356e-05, + "loss": 1.7725, + "step": 10908 + }, + { + "epoch": 3.3483732351135664, + "grad_norm": 0.35949698090553284, + "learning_rate": 7.755338213469552e-05, + "loss": 1.8163, + "step": 10909 + }, + { + "epoch": 3.3486801718845918, + "grad_norm": 0.25142577290534973, + "learning_rate": 7.75492342623088e-05, + "loss": 1.7879, + "step": 10910 + }, + { + "epoch": 3.348987108655617, + "grad_norm": 0.25766023993492126, + "learning_rate": 7.75450861176644e-05, + "loss": 1.8143, + "step": 10911 + }, + { + "epoch": 3.349294045426642, + "grad_norm": 0.2736956477165222, + "learning_rate": 7.754093770080331e-05, + "loss": 1.8907, + "step": 10912 + }, + { + "epoch": 3.3496009821976673, + "grad_norm": 0.23700755834579468, + "learning_rate": 7.753678901176654e-05, + "loss": 1.813, + "step": 10913 + }, + { + "epoch": 3.349907918968692, + "grad_norm": 0.245509073138237, + "learning_rate": 7.753264005059507e-05, + "loss": 1.8019, + "step": 10914 + }, + { + "epoch": 3.3502148557397176, + "grad_norm": 0.232910618185997, + "learning_rate": 7.752849081732993e-05, + "loss": 1.784, + "step": 10915 + }, + { + "epoch": 3.350521792510743, + "grad_norm": 0.22989360988140106, + "learning_rate": 7.75243413120121e-05, + "loss": 1.7597, + "step": 10916 + }, + { + "epoch": 3.350828729281768, + "grad_norm": 0.2093925178050995, + "learning_rate": 7.752019153468258e-05, + "loss": 1.7698, + "step": 10917 + }, + { + "epoch": 3.351135666052793, + "grad_norm": 0.25539630651474, + "learning_rate": 7.751604148538241e-05, + "loss": 1.8287, + "step": 10918 + }, + { + "epoch": 3.3514426028238185, + "grad_norm": 0.2731820046901703, + "learning_rate": 7.75118911641526e-05, + "loss": 
1.8862, + "step": 10919 + }, + { + "epoch": 3.3517495395948433, + "grad_norm": 0.2464541345834732, + "learning_rate": 7.750774057103416e-05, + "loss": 1.8165, + "step": 10920 + }, + { + "epoch": 3.3520564763658687, + "grad_norm": 0.26380276679992676, + "learning_rate": 7.75035897060681e-05, + "loss": 1.78, + "step": 10921 + }, + { + "epoch": 3.352363413136894, + "grad_norm": 0.3080748915672302, + "learning_rate": 7.749943856929542e-05, + "loss": 1.7925, + "step": 10922 + }, + { + "epoch": 3.352670349907919, + "grad_norm": 0.317754864692688, + "learning_rate": 7.74952871607572e-05, + "loss": 1.8248, + "step": 10923 + }, + { + "epoch": 3.3529772866789442, + "grad_norm": 0.2525196373462677, + "learning_rate": 7.749113548049442e-05, + "loss": 1.762, + "step": 10924 + }, + { + "epoch": 3.353284223449969, + "grad_norm": 0.3149549961090088, + "learning_rate": 7.748698352854814e-05, + "loss": 1.8289, + "step": 10925 + }, + { + "epoch": 3.3535911602209945, + "grad_norm": 0.35744383931159973, + "learning_rate": 7.748283130495937e-05, + "loss": 1.8132, + "step": 10926 + }, + { + "epoch": 3.35389809699202, + "grad_norm": 0.28599128127098083, + "learning_rate": 7.747867880976916e-05, + "loss": 1.7351, + "step": 10927 + }, + { + "epoch": 3.3542050337630447, + "grad_norm": 0.24428869783878326, + "learning_rate": 7.747452604301852e-05, + "loss": 1.794, + "step": 10928 + }, + { + "epoch": 3.35451197053407, + "grad_norm": 0.29067808389663696, + "learning_rate": 7.747037300474854e-05, + "loss": 1.8181, + "step": 10929 + }, + { + "epoch": 3.354818907305095, + "grad_norm": 0.32417505979537964, + "learning_rate": 7.746621969500021e-05, + "loss": 1.8338, + "step": 10930 + }, + { + "epoch": 3.3551258440761202, + "grad_norm": 0.29536551237106323, + "learning_rate": 7.746206611381462e-05, + "loss": 1.8732, + "step": 10931 + }, + { + "epoch": 3.3554327808471456, + "grad_norm": 0.3169345259666443, + "learning_rate": 7.745791226123278e-05, + "loss": 1.876, + "step": 10932 + }, + { + "epoch": 
3.3557397176181705, + "grad_norm": 0.2680271565914154, + "learning_rate": 7.745375813729576e-05, + "loss": 1.7347, + "step": 10933 + }, + { + "epoch": 3.356046654389196, + "grad_norm": 0.28339266777038574, + "learning_rate": 7.74496037420446e-05, + "loss": 1.8507, + "step": 10934 + }, + { + "epoch": 3.356353591160221, + "grad_norm": 0.2567409574985504, + "learning_rate": 7.744544907552038e-05, + "loss": 1.8244, + "step": 10935 + }, + { + "epoch": 3.356660527931246, + "grad_norm": 0.266063928604126, + "learning_rate": 7.744129413776416e-05, + "loss": 1.7864, + "step": 10936 + }, + { + "epoch": 3.3569674647022714, + "grad_norm": 0.2490999698638916, + "learning_rate": 7.743713892881696e-05, + "loss": 1.7637, + "step": 10937 + }, + { + "epoch": 3.3572744014732967, + "grad_norm": 0.25857025384902954, + "learning_rate": 7.743298344871988e-05, + "loss": 1.8101, + "step": 10938 + }, + { + "epoch": 3.3575813382443216, + "grad_norm": 0.2549006938934326, + "learning_rate": 7.742882769751398e-05, + "loss": 1.7782, + "step": 10939 + }, + { + "epoch": 3.357888275015347, + "grad_norm": 0.23915350437164307, + "learning_rate": 7.742467167524035e-05, + "loss": 1.7822, + "step": 10940 + }, + { + "epoch": 3.358195211786372, + "grad_norm": 0.25501590967178345, + "learning_rate": 7.742051538194e-05, + "loss": 1.798, + "step": 10941 + }, + { + "epoch": 3.358502148557397, + "grad_norm": 0.29332005977630615, + "learning_rate": 7.741635881765408e-05, + "loss": 1.8334, + "step": 10942 + }, + { + "epoch": 3.3588090853284225, + "grad_norm": 0.28878241777420044, + "learning_rate": 7.741220198242362e-05, + "loss": 1.8266, + "step": 10943 + }, + { + "epoch": 3.3591160220994474, + "grad_norm": 0.3068650960922241, + "learning_rate": 7.740804487628971e-05, + "loss": 1.8562, + "step": 10944 + }, + { + "epoch": 3.3594229588704727, + "grad_norm": 0.2522405683994293, + "learning_rate": 7.740388749929343e-05, + "loss": 1.8001, + "step": 10945 + }, + { + "epoch": 3.359729895641498, + "grad_norm": 
0.3073521554470062, + "learning_rate": 7.739972985147588e-05, + "loss": 1.7454, + "step": 10946 + }, + { + "epoch": 3.360036832412523, + "grad_norm": 0.3018052577972412, + "learning_rate": 7.739557193287815e-05, + "loss": 1.7888, + "step": 10947 + }, + { + "epoch": 3.3603437691835483, + "grad_norm": 0.2738604247570038, + "learning_rate": 7.73914137435413e-05, + "loss": 1.7208, + "step": 10948 + }, + { + "epoch": 3.3606507059545736, + "grad_norm": 0.37699586153030396, + "learning_rate": 7.738725528350646e-05, + "loss": 1.8175, + "step": 10949 + }, + { + "epoch": 3.3609576427255985, + "grad_norm": 0.3479778468608856, + "learning_rate": 7.738309655281471e-05, + "loss": 1.818, + "step": 10950 + }, + { + "epoch": 3.361264579496624, + "grad_norm": 0.24871166050434113, + "learning_rate": 7.737893755150715e-05, + "loss": 1.7046, + "step": 10951 + }, + { + "epoch": 3.3615715162676487, + "grad_norm": 0.45015642046928406, + "learning_rate": 7.737477827962488e-05, + "loss": 1.8517, + "step": 10952 + }, + { + "epoch": 3.361878453038674, + "grad_norm": 0.4149077534675598, + "learning_rate": 7.7370618737209e-05, + "loss": 1.7403, + "step": 10953 + }, + { + "epoch": 3.3621853898096994, + "grad_norm": 0.2556059658527374, + "learning_rate": 7.736645892430064e-05, + "loss": 1.8167, + "step": 10954 + }, + { + "epoch": 3.3624923265807243, + "grad_norm": 0.3153657615184784, + "learning_rate": 7.736229884094088e-05, + "loss": 1.8471, + "step": 10955 + }, + { + "epoch": 3.3627992633517496, + "grad_norm": 0.27943772077560425, + "learning_rate": 7.735813848717084e-05, + "loss": 1.7742, + "step": 10956 + }, + { + "epoch": 3.3631062001227745, + "grad_norm": 0.28270283341407776, + "learning_rate": 7.735397786303164e-05, + "loss": 1.8418, + "step": 10957 + }, + { + "epoch": 3.3634131368938, + "grad_norm": 0.3596261441707611, + "learning_rate": 7.734981696856442e-05, + "loss": 1.8213, + "step": 10958 + }, + { + "epoch": 3.363720073664825, + "grad_norm": 0.3678492307662964, + "learning_rate": 
7.734565580381026e-05, + "loss": 1.806, + "step": 10959 + }, + { + "epoch": 3.36402701043585, + "grad_norm": 0.27758681774139404, + "learning_rate": 7.734149436881031e-05, + "loss": 1.7832, + "step": 10960 + }, + { + "epoch": 3.3643339472068754, + "grad_norm": 0.2821379005908966, + "learning_rate": 7.733733266360568e-05, + "loss": 1.8888, + "step": 10961 + }, + { + "epoch": 3.3646408839779007, + "grad_norm": 0.33676958084106445, + "learning_rate": 7.733317068823751e-05, + "loss": 1.902, + "step": 10962 + }, + { + "epoch": 3.3649478207489256, + "grad_norm": 0.3116114139556885, + "learning_rate": 7.732900844274691e-05, + "loss": 1.8228, + "step": 10963 + }, + { + "epoch": 3.365254757519951, + "grad_norm": 0.3286324143409729, + "learning_rate": 7.732484592717506e-05, + "loss": 1.8707, + "step": 10964 + }, + { + "epoch": 3.3655616942909763, + "grad_norm": 0.2732192873954773, + "learning_rate": 7.732068314156304e-05, + "loss": 1.773, + "step": 10965 + }, + { + "epoch": 3.365868631062001, + "grad_norm": 0.26663896441459656, + "learning_rate": 7.731652008595204e-05, + "loss": 1.7837, + "step": 10966 + }, + { + "epoch": 3.3661755678330265, + "grad_norm": 0.27447745203971863, + "learning_rate": 7.731235676038317e-05, + "loss": 1.9103, + "step": 10967 + }, + { + "epoch": 3.3664825046040514, + "grad_norm": 0.30832916498184204, + "learning_rate": 7.730819316489757e-05, + "loss": 1.7552, + "step": 10968 + }, + { + "epoch": 3.3667894413750767, + "grad_norm": 0.29657161235809326, + "learning_rate": 7.73040292995364e-05, + "loss": 1.7654, + "step": 10969 + }, + { + "epoch": 3.367096378146102, + "grad_norm": 0.30434274673461914, + "learning_rate": 7.729986516434082e-05, + "loss": 1.8646, + "step": 10970 + }, + { + "epoch": 3.367403314917127, + "grad_norm": 0.25926661491394043, + "learning_rate": 7.729570075935198e-05, + "loss": 1.7555, + "step": 10971 + }, + { + "epoch": 3.3677102516881523, + "grad_norm": 0.2775980532169342, + "learning_rate": 7.729153608461102e-05, + "loss": 
1.8427, + "step": 10972 + }, + { + "epoch": 3.368017188459177, + "grad_norm": 0.23915666341781616, + "learning_rate": 7.72873711401591e-05, + "loss": 1.7902, + "step": 10973 + }, + { + "epoch": 3.3683241252302025, + "grad_norm": 0.2603691518306732, + "learning_rate": 7.728320592603737e-05, + "loss": 1.8587, + "step": 10974 + }, + { + "epoch": 3.368631062001228, + "grad_norm": 0.2579508125782013, + "learning_rate": 7.727904044228703e-05, + "loss": 1.7617, + "step": 10975 + }, + { + "epoch": 3.3689379987722528, + "grad_norm": 0.3384297788143158, + "learning_rate": 7.72748746889492e-05, + "loss": 1.8499, + "step": 10976 + }, + { + "epoch": 3.369244935543278, + "grad_norm": 0.36756646633148193, + "learning_rate": 7.727070866606509e-05, + "loss": 1.808, + "step": 10977 + }, + { + "epoch": 3.3695518723143034, + "grad_norm": 0.3212372958660126, + "learning_rate": 7.726654237367587e-05, + "loss": 1.8245, + "step": 10978 + }, + { + "epoch": 3.3698588090853283, + "grad_norm": 0.23782415688037872, + "learning_rate": 7.726237581182267e-05, + "loss": 1.7629, + "step": 10979 + }, + { + "epoch": 3.3701657458563536, + "grad_norm": 0.2782919108867645, + "learning_rate": 7.725820898054669e-05, + "loss": 1.8, + "step": 10980 + }, + { + "epoch": 3.370472682627379, + "grad_norm": 0.2973455488681793, + "learning_rate": 7.725404187988914e-05, + "loss": 1.7949, + "step": 10981 + }, + { + "epoch": 3.370779619398404, + "grad_norm": 0.2875392735004425, + "learning_rate": 7.724987450989114e-05, + "loss": 1.8019, + "step": 10982 + }, + { + "epoch": 3.371086556169429, + "grad_norm": 0.26133236289024353, + "learning_rate": 7.724570687059394e-05, + "loss": 1.7984, + "step": 10983 + }, + { + "epoch": 3.371393492940454, + "grad_norm": 0.2760173976421356, + "learning_rate": 7.724153896203867e-05, + "loss": 1.8082, + "step": 10984 + }, + { + "epoch": 3.3717004297114794, + "grad_norm": 0.26373061537742615, + "learning_rate": 7.723737078426656e-05, + "loss": 1.8408, + "step": 10985 + }, + { + "epoch": 
3.3720073664825048, + "grad_norm": 0.29425618052482605, + "learning_rate": 7.723320233731879e-05, + "loss": 1.7992, + "step": 10986 + }, + { + "epoch": 3.3723143032535297, + "grad_norm": 0.29822099208831787, + "learning_rate": 7.722903362123655e-05, + "loss": 1.8204, + "step": 10987 + }, + { + "epoch": 3.372621240024555, + "grad_norm": 0.25945618748664856, + "learning_rate": 7.722486463606104e-05, + "loss": 1.7376, + "step": 10988 + }, + { + "epoch": 3.37292817679558, + "grad_norm": 0.26367196440696716, + "learning_rate": 7.722069538183345e-05, + "loss": 1.814, + "step": 10989 + }, + { + "epoch": 3.373235113566605, + "grad_norm": 0.25015249848365784, + "learning_rate": 7.7216525858595e-05, + "loss": 1.8199, + "step": 10990 + }, + { + "epoch": 3.3735420503376305, + "grad_norm": 0.3035781681537628, + "learning_rate": 7.72123560663869e-05, + "loss": 1.739, + "step": 10991 + }, + { + "epoch": 3.3738489871086554, + "grad_norm": 0.2847912013530731, + "learning_rate": 7.720818600525033e-05, + "loss": 1.8754, + "step": 10992 + }, + { + "epoch": 3.3741559238796808, + "grad_norm": 0.2533976435661316, + "learning_rate": 7.720401567522653e-05, + "loss": 1.7616, + "step": 10993 + }, + { + "epoch": 3.374462860650706, + "grad_norm": 0.250828355550766, + "learning_rate": 7.719984507635669e-05, + "loss": 1.7973, + "step": 10994 + }, + { + "epoch": 3.374769797421731, + "grad_norm": 0.3019898235797882, + "learning_rate": 7.719567420868206e-05, + "loss": 1.7563, + "step": 10995 + }, + { + "epoch": 3.3750767341927563, + "grad_norm": 0.2703310549259186, + "learning_rate": 7.719150307224382e-05, + "loss": 1.8183, + "step": 10996 + }, + { + "epoch": 3.3753836709637817, + "grad_norm": 0.2434745579957962, + "learning_rate": 7.718733166708321e-05, + "loss": 1.7913, + "step": 10997 + }, + { + "epoch": 3.3756906077348066, + "grad_norm": 0.28036773204803467, + "learning_rate": 7.718315999324146e-05, + "loss": 1.7884, + "step": 10998 + }, + { + "epoch": 3.375997544505832, + "grad_norm": 
0.25123077630996704, + "learning_rate": 7.717898805075978e-05, + "loss": 1.7394, + "step": 10999 + }, + { + "epoch": 3.376304481276857, + "grad_norm": 0.2313947230577469, + "learning_rate": 7.717481583967943e-05, + "loss": 1.7537, + "step": 11000 + }, + { + "epoch": 3.376611418047882, + "grad_norm": 0.27152860164642334, + "learning_rate": 7.71706433600416e-05, + "loss": 1.8596, + "step": 11001 + }, + { + "epoch": 3.3769183548189075, + "grad_norm": 0.32866382598876953, + "learning_rate": 7.716647061188757e-05, + "loss": 1.9007, + "step": 11002 + }, + { + "epoch": 3.3772252915899323, + "grad_norm": 0.2842368185520172, + "learning_rate": 7.716229759525854e-05, + "loss": 1.7781, + "step": 11003 + }, + { + "epoch": 3.3775322283609577, + "grad_norm": 0.30411216616630554, + "learning_rate": 7.715812431019576e-05, + "loss": 1.7403, + "step": 11004 + }, + { + "epoch": 3.3778391651319826, + "grad_norm": 0.31848132610321045, + "learning_rate": 7.71539507567405e-05, + "loss": 1.817, + "step": 11005 + }, + { + "epoch": 3.378146101903008, + "grad_norm": 0.24206148087978363, + "learning_rate": 7.714977693493397e-05, + "loss": 1.7796, + "step": 11006 + }, + { + "epoch": 3.3784530386740332, + "grad_norm": 0.2982998490333557, + "learning_rate": 7.714560284481742e-05, + "loss": 1.7883, + "step": 11007 + }, + { + "epoch": 3.378759975445058, + "grad_norm": 0.24857483804225922, + "learning_rate": 7.714142848643213e-05, + "loss": 1.7447, + "step": 11008 + }, + { + "epoch": 3.3790669122160835, + "grad_norm": 0.2509039044380188, + "learning_rate": 7.713725385981932e-05, + "loss": 1.8362, + "step": 11009 + }, + { + "epoch": 3.379373848987109, + "grad_norm": 0.2759779095649719, + "learning_rate": 7.713307896502027e-05, + "loss": 1.8655, + "step": 11010 + }, + { + "epoch": 3.3796807857581337, + "grad_norm": 0.264776349067688, + "learning_rate": 7.712890380207623e-05, + "loss": 1.8221, + "step": 11011 + }, + { + "epoch": 3.379987722529159, + "grad_norm": 0.2771971821784973, + "learning_rate": 
7.712472837102846e-05, + "loss": 1.6992, + "step": 11012 + }, + { + "epoch": 3.3802946593001844, + "grad_norm": 0.2749316096305847, + "learning_rate": 7.712055267191822e-05, + "loss": 1.8128, + "step": 11013 + }, + { + "epoch": 3.3806015960712092, + "grad_norm": 0.256656289100647, + "learning_rate": 7.71163767047868e-05, + "loss": 1.8382, + "step": 11014 + }, + { + "epoch": 3.3809085328422346, + "grad_norm": 0.27646976709365845, + "learning_rate": 7.711220046967545e-05, + "loss": 1.8321, + "step": 11015 + }, + { + "epoch": 3.3812154696132595, + "grad_norm": 0.3083149194717407, + "learning_rate": 7.710802396662542e-05, + "loss": 1.904, + "step": 11016 + }, + { + "epoch": 3.381522406384285, + "grad_norm": 0.2750856280326843, + "learning_rate": 7.710384719567803e-05, + "loss": 1.7596, + "step": 11017 + }, + { + "epoch": 3.38182934315531, + "grad_norm": 0.3029455244541168, + "learning_rate": 7.709967015687452e-05, + "loss": 1.8542, + "step": 11018 + }, + { + "epoch": 3.382136279926335, + "grad_norm": 0.3144093453884125, + "learning_rate": 7.709549285025622e-05, + "loss": 1.7489, + "step": 11019 + }, + { + "epoch": 3.3824432166973604, + "grad_norm": 0.2675442099571228, + "learning_rate": 7.709131527586433e-05, + "loss": 1.7324, + "step": 11020 + }, + { + "epoch": 3.3827501534683857, + "grad_norm": 0.2906095087528229, + "learning_rate": 7.708713743374021e-05, + "loss": 1.7848, + "step": 11021 + }, + { + "epoch": 3.3830570902394106, + "grad_norm": 0.25141623616218567, + "learning_rate": 7.708295932392513e-05, + "loss": 1.7423, + "step": 11022 + }, + { + "epoch": 3.383364027010436, + "grad_norm": 0.25832003355026245, + "learning_rate": 7.707878094646037e-05, + "loss": 1.7792, + "step": 11023 + }, + { + "epoch": 3.3836709637814613, + "grad_norm": 0.23710070550441742, + "learning_rate": 7.70746023013872e-05, + "loss": 1.7916, + "step": 11024 + }, + { + "epoch": 3.383977900552486, + "grad_norm": 0.286735862493515, + "learning_rate": 7.707042338874697e-05, + "loss": 1.8272, + 
"step": 11025 + }, + { + "epoch": 3.3842848373235115, + "grad_norm": 0.2536577582359314, + "learning_rate": 7.706624420858094e-05, + "loss": 1.7839, + "step": 11026 + }, + { + "epoch": 3.3845917740945364, + "grad_norm": 0.5564702749252319, + "learning_rate": 7.706206476093043e-05, + "loss": 1.7832, + "step": 11027 + }, + { + "epoch": 3.3848987108655617, + "grad_norm": 0.34694772958755493, + "learning_rate": 7.705788504583671e-05, + "loss": 1.8668, + "step": 11028 + }, + { + "epoch": 3.385205647636587, + "grad_norm": 0.30388176441192627, + "learning_rate": 7.705370506334113e-05, + "loss": 1.8244, + "step": 11029 + }, + { + "epoch": 3.385512584407612, + "grad_norm": 0.2998919188976288, + "learning_rate": 7.704952481348497e-05, + "loss": 1.7927, + "step": 11030 + }, + { + "epoch": 3.3858195211786373, + "grad_norm": 0.2714936435222626, + "learning_rate": 7.704534429630955e-05, + "loss": 1.8757, + "step": 11031 + }, + { + "epoch": 3.386126457949662, + "grad_norm": 0.26670241355895996, + "learning_rate": 7.704116351185619e-05, + "loss": 1.8146, + "step": 11032 + }, + { + "epoch": 3.3864333947206875, + "grad_norm": 0.2500552833080292, + "learning_rate": 7.703698246016621e-05, + "loss": 1.7984, + "step": 11033 + }, + { + "epoch": 3.386740331491713, + "grad_norm": 0.2494918406009674, + "learning_rate": 7.703280114128091e-05, + "loss": 1.7433, + "step": 11034 + }, + { + "epoch": 3.3870472682627377, + "grad_norm": 0.25658491253852844, + "learning_rate": 7.702861955524163e-05, + "loss": 1.8487, + "step": 11035 + }, + { + "epoch": 3.387354205033763, + "grad_norm": 0.2871410548686981, + "learning_rate": 7.702443770208969e-05, + "loss": 1.7919, + "step": 11036 + }, + { + "epoch": 3.3876611418047884, + "grad_norm": 0.3347938060760498, + "learning_rate": 7.702025558186643e-05, + "loss": 1.8091, + "step": 11037 + }, + { + "epoch": 3.3879680785758133, + "grad_norm": 0.39016643166542053, + "learning_rate": 7.701607319461315e-05, + "loss": 1.7816, + "step": 11038 + }, + { + "epoch": 
3.3882750153468386, + "grad_norm": 0.3423028290271759, + "learning_rate": 7.701189054037121e-05, + "loss": 1.8454, + "step": 11039 + }, + { + "epoch": 3.388581952117864, + "grad_norm": 0.27592089772224426, + "learning_rate": 7.700770761918192e-05, + "loss": 1.8431, + "step": 11040 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.46047264337539673, + "learning_rate": 7.700352443108665e-05, + "loss": 1.8412, + "step": 11041 + }, + { + "epoch": 3.389195825659914, + "grad_norm": 0.49226754903793335, + "learning_rate": 7.699934097612673e-05, + "loss": 1.8212, + "step": 11042 + }, + { + "epoch": 3.389502762430939, + "grad_norm": 0.3958778381347656, + "learning_rate": 7.699515725434348e-05, + "loss": 1.747, + "step": 11043 + }, + { + "epoch": 3.3898096992019644, + "grad_norm": 0.26097169518470764, + "learning_rate": 7.699097326577827e-05, + "loss": 1.7631, + "step": 11044 + }, + { + "epoch": 3.3901166359729897, + "grad_norm": 0.2922612130641937, + "learning_rate": 7.698678901047245e-05, + "loss": 1.7891, + "step": 11045 + }, + { + "epoch": 3.3904235727440146, + "grad_norm": 0.4195055365562439, + "learning_rate": 7.698260448846734e-05, + "loss": 1.7765, + "step": 11046 + }, + { + "epoch": 3.39073050951504, + "grad_norm": 0.4572988450527191, + "learning_rate": 7.697841969980434e-05, + "loss": 1.8085, + "step": 11047 + }, + { + "epoch": 3.391037446286065, + "grad_norm": 0.38819587230682373, + "learning_rate": 7.697423464452478e-05, + "loss": 1.8854, + "step": 11048 + }, + { + "epoch": 3.39134438305709, + "grad_norm": 0.27421653270721436, + "learning_rate": 7.697004932267003e-05, + "loss": 1.8327, + "step": 11049 + }, + { + "epoch": 3.3916513198281155, + "grad_norm": 0.33559146523475647, + "learning_rate": 7.696586373428142e-05, + "loss": 1.8109, + "step": 11050 + }, + { + "epoch": 3.3919582565991404, + "grad_norm": 0.39438655972480774, + "learning_rate": 7.696167787940037e-05, + "loss": 1.7909, + "step": 11051 + }, + { + "epoch": 3.3922651933701657, + "grad_norm": 
0.3425842523574829, + "learning_rate": 7.695749175806819e-05, + "loss": 1.8571, + "step": 11052 + }, + { + "epoch": 3.392572130141191, + "grad_norm": 0.2860080301761627, + "learning_rate": 7.695330537032628e-05, + "loss": 1.8546, + "step": 11053 + }, + { + "epoch": 3.392879066912216, + "grad_norm": 0.35894665122032166, + "learning_rate": 7.694911871621601e-05, + "loss": 1.7895, + "step": 11054 + }, + { + "epoch": 3.3931860036832413, + "grad_norm": 0.351193904876709, + "learning_rate": 7.694493179577879e-05, + "loss": 1.7453, + "step": 11055 + }, + { + "epoch": 3.3934929404542666, + "grad_norm": 0.24812865257263184, + "learning_rate": 7.694074460905592e-05, + "loss": 1.8131, + "step": 11056 + }, + { + "epoch": 3.3937998772252915, + "grad_norm": 0.38620972633361816, + "learning_rate": 7.693655715608883e-05, + "loss": 1.8346, + "step": 11057 + }, + { + "epoch": 3.394106813996317, + "grad_norm": 0.5005692839622498, + "learning_rate": 7.69323694369189e-05, + "loss": 1.9031, + "step": 11058 + }, + { + "epoch": 3.3944137507673418, + "grad_norm": 0.4321887791156769, + "learning_rate": 7.692818145158751e-05, + "loss": 1.8783, + "step": 11059 + }, + { + "epoch": 3.394720687538367, + "grad_norm": 0.269307017326355, + "learning_rate": 7.692399320013603e-05, + "loss": 1.8075, + "step": 11060 + }, + { + "epoch": 3.3950276243093924, + "grad_norm": 0.2945556342601776, + "learning_rate": 7.69198046826059e-05, + "loss": 1.8366, + "step": 11061 + }, + { + "epoch": 3.3953345610804173, + "grad_norm": 0.30531853437423706, + "learning_rate": 7.691561589903847e-05, + "loss": 1.7665, + "step": 11062 + }, + { + "epoch": 3.3956414978514426, + "grad_norm": 0.25105199217796326, + "learning_rate": 7.691142684947513e-05, + "loss": 1.782, + "step": 11063 + }, + { + "epoch": 3.3959484346224675, + "grad_norm": 0.3373202085494995, + "learning_rate": 7.69072375339573e-05, + "loss": 1.8148, + "step": 11064 + }, + { + "epoch": 3.396255371393493, + "grad_norm": 0.34207093715667725, + "learning_rate": 
7.690304795252638e-05, + "loss": 1.8287, + "step": 11065 + }, + { + "epoch": 3.396562308164518, + "grad_norm": 0.26281681656837463, + "learning_rate": 7.68988581052238e-05, + "loss": 1.8551, + "step": 11066 + }, + { + "epoch": 3.396869244935543, + "grad_norm": 0.3091152608394623, + "learning_rate": 7.689466799209091e-05, + "loss": 1.7689, + "step": 11067 + }, + { + "epoch": 3.3971761817065684, + "grad_norm": 0.37421298027038574, + "learning_rate": 7.689047761316914e-05, + "loss": 1.7908, + "step": 11068 + }, + { + "epoch": 3.3974831184775938, + "grad_norm": 0.3745511770248413, + "learning_rate": 7.688628696849993e-05, + "loss": 1.8408, + "step": 11069 + }, + { + "epoch": 3.3977900552486187, + "grad_norm": 0.3003663122653961, + "learning_rate": 7.688209605812467e-05, + "loss": 1.9109, + "step": 11070 + }, + { + "epoch": 3.398096992019644, + "grad_norm": 0.3437681496143341, + "learning_rate": 7.687790488208478e-05, + "loss": 1.811, + "step": 11071 + }, + { + "epoch": 3.3984039287906693, + "grad_norm": 0.3480641841888428, + "learning_rate": 7.687371344042168e-05, + "loss": 1.8114, + "step": 11072 + }, + { + "epoch": 3.398710865561694, + "grad_norm": 0.24670913815498352, + "learning_rate": 7.686952173317679e-05, + "loss": 1.7959, + "step": 11073 + }, + { + "epoch": 3.3990178023327196, + "grad_norm": 0.2939499020576477, + "learning_rate": 7.686532976039154e-05, + "loss": 1.7518, + "step": 11074 + }, + { + "epoch": 3.3993247391037444, + "grad_norm": 0.3332279622554779, + "learning_rate": 7.686113752210736e-05, + "loss": 1.843, + "step": 11075 + }, + { + "epoch": 3.3996316758747698, + "grad_norm": 0.22967280447483063, + "learning_rate": 7.685694501836566e-05, + "loss": 1.7408, + "step": 11076 + }, + { + "epoch": 3.399938612645795, + "grad_norm": 0.3443470001220703, + "learning_rate": 7.685275224920789e-05, + "loss": 1.8004, + "step": 11077 + }, + { + "epoch": 3.40024554941682, + "grad_norm": 0.3725457489490509, + "learning_rate": 7.684855921467548e-05, + "loss": 1.833, + 
"step": 11078 + }, + { + "epoch": 3.4005524861878453, + "grad_norm": 0.3178638219833374, + "learning_rate": 7.68443659148099e-05, + "loss": 1.8055, + "step": 11079 + }, + { + "epoch": 3.4008594229588702, + "grad_norm": 0.2609167695045471, + "learning_rate": 7.684017234965254e-05, + "loss": 1.7881, + "step": 11080 + }, + { + "epoch": 3.4011663597298956, + "grad_norm": 0.26975762844085693, + "learning_rate": 7.683597851924486e-05, + "loss": 1.8424, + "step": 11081 + }, + { + "epoch": 3.401473296500921, + "grad_norm": 0.266661673784256, + "learning_rate": 7.683178442362832e-05, + "loss": 1.7785, + "step": 11082 + }, + { + "epoch": 3.401780233271946, + "grad_norm": 0.27915671467781067, + "learning_rate": 7.682759006284436e-05, + "loss": 1.8241, + "step": 11083 + }, + { + "epoch": 3.402087170042971, + "grad_norm": 0.25167274475097656, + "learning_rate": 7.682339543693444e-05, + "loss": 1.7637, + "step": 11084 + }, + { + "epoch": 3.4023941068139965, + "grad_norm": 0.2439529299736023, + "learning_rate": 7.681920054593999e-05, + "loss": 1.7796, + "step": 11085 + }, + { + "epoch": 3.4027010435850213, + "grad_norm": 0.26224252581596375, + "learning_rate": 7.681500538990249e-05, + "loss": 1.8018, + "step": 11086 + }, + { + "epoch": 3.4030079803560467, + "grad_norm": 0.25093868374824524, + "learning_rate": 7.681080996886336e-05, + "loss": 1.7664, + "step": 11087 + }, + { + "epoch": 3.403314917127072, + "grad_norm": 0.26393210887908936, + "learning_rate": 7.680661428286413e-05, + "loss": 1.8389, + "step": 11088 + }, + { + "epoch": 3.403621853898097, + "grad_norm": 0.24750283360481262, + "learning_rate": 7.680241833194622e-05, + "loss": 1.8358, + "step": 11089 + }, + { + "epoch": 3.4039287906691222, + "grad_norm": 0.21568982303142548, + "learning_rate": 7.67982221161511e-05, + "loss": 1.7874, + "step": 11090 + }, + { + "epoch": 3.404235727440147, + "grad_norm": 0.24407126009464264, + "learning_rate": 7.679402563552023e-05, + "loss": 1.7753, + "step": 11091 + }, + { + "epoch": 
3.4045426642111725, + "grad_norm": 0.23288260400295258, + "learning_rate": 7.67898288900951e-05, + "loss": 1.8046, + "step": 11092 + }, + { + "epoch": 3.404849600982198, + "grad_norm": 0.2548544108867645, + "learning_rate": 7.678563187991718e-05, + "loss": 1.8778, + "step": 11093 + }, + { + "epoch": 3.4051565377532227, + "grad_norm": 0.24008090794086456, + "learning_rate": 7.678143460502796e-05, + "loss": 1.7912, + "step": 11094 + }, + { + "epoch": 3.405463474524248, + "grad_norm": 0.26085031032562256, + "learning_rate": 7.677723706546889e-05, + "loss": 1.849, + "step": 11095 + }, + { + "epoch": 3.4057704112952734, + "grad_norm": 0.2830932140350342, + "learning_rate": 7.677303926128147e-05, + "loss": 1.8265, + "step": 11096 + }, + { + "epoch": 3.4060773480662982, + "grad_norm": 0.27593597769737244, + "learning_rate": 7.676884119250718e-05, + "loss": 1.8555, + "step": 11097 + }, + { + "epoch": 3.4063842848373236, + "grad_norm": 0.2403372824192047, + "learning_rate": 7.676464285918751e-05, + "loss": 1.7243, + "step": 11098 + }, + { + "epoch": 3.406691221608349, + "grad_norm": 0.28830090165138245, + "learning_rate": 7.676044426136397e-05, + "loss": 1.8108, + "step": 11099 + }, + { + "epoch": 3.406998158379374, + "grad_norm": 0.2918153405189514, + "learning_rate": 7.675624539907802e-05, + "loss": 1.7875, + "step": 11100 + }, + { + "epoch": 3.407305095150399, + "grad_norm": 0.2609013020992279, + "learning_rate": 7.675204627237117e-05, + "loss": 1.778, + "step": 11101 + }, + { + "epoch": 3.407612031921424, + "grad_norm": 0.2714763283729553, + "learning_rate": 7.674784688128494e-05, + "loss": 1.8472, + "step": 11102 + }, + { + "epoch": 3.4079189686924494, + "grad_norm": 0.25857117772102356, + "learning_rate": 7.674364722586078e-05, + "loss": 1.7495, + "step": 11103 + }, + { + "epoch": 3.4082259054634747, + "grad_norm": 0.25485143065452576, + "learning_rate": 7.673944730614023e-05, + "loss": 1.7817, + "step": 11104 + }, + { + "epoch": 3.4085328422344996, + "grad_norm": 
0.2735857665538788, + "learning_rate": 7.67352471221648e-05, + "loss": 1.7522, + "step": 11105 + }, + { + "epoch": 3.408839779005525, + "grad_norm": 0.25079572200775146, + "learning_rate": 7.6731046673976e-05, + "loss": 1.765, + "step": 11106 + }, + { + "epoch": 3.40914671577655, + "grad_norm": 0.3080148696899414, + "learning_rate": 7.672684596161532e-05, + "loss": 1.8305, + "step": 11107 + }, + { + "epoch": 3.409453652547575, + "grad_norm": 0.23771968483924866, + "learning_rate": 7.672264498512427e-05, + "loss": 1.7837, + "step": 11108 + }, + { + "epoch": 3.4097605893186005, + "grad_norm": 0.29941999912261963, + "learning_rate": 7.671844374454437e-05, + "loss": 1.8013, + "step": 11109 + }, + { + "epoch": 3.4100675260896254, + "grad_norm": 0.27871644496917725, + "learning_rate": 7.671424223991717e-05, + "loss": 1.8598, + "step": 11110 + }, + { + "epoch": 3.4103744628606507, + "grad_norm": 0.2751443684101105, + "learning_rate": 7.671004047128416e-05, + "loss": 1.8341, + "step": 11111 + }, + { + "epoch": 3.410681399631676, + "grad_norm": 0.27227312326431274, + "learning_rate": 7.670583843868688e-05, + "loss": 1.81, + "step": 11112 + }, + { + "epoch": 3.410988336402701, + "grad_norm": 0.29617756605148315, + "learning_rate": 7.670163614216685e-05, + "loss": 1.8795, + "step": 11113 + }, + { + "epoch": 3.4112952731737263, + "grad_norm": 0.268920361995697, + "learning_rate": 7.669743358176563e-05, + "loss": 1.7659, + "step": 11114 + }, + { + "epoch": 3.4116022099447516, + "grad_norm": 0.2875109314918518, + "learning_rate": 7.669323075752467e-05, + "loss": 1.8263, + "step": 11115 + }, + { + "epoch": 3.4119091467157765, + "grad_norm": 0.34703585505485535, + "learning_rate": 7.668902766948558e-05, + "loss": 1.7622, + "step": 11116 + }, + { + "epoch": 3.412216083486802, + "grad_norm": 0.3090265393257141, + "learning_rate": 7.668482431768989e-05, + "loss": 1.7381, + "step": 11117 + }, + { + "epoch": 3.4125230202578267, + "grad_norm": 0.2619737684726715, + "learning_rate": 
7.668062070217911e-05, + "loss": 1.8004, + "step": 11118 + }, + { + "epoch": 3.412829957028852, + "grad_norm": 0.289815217256546, + "learning_rate": 7.667641682299482e-05, + "loss": 1.7946, + "step": 11119 + }, + { + "epoch": 3.4131368937998774, + "grad_norm": 0.28732073307037354, + "learning_rate": 7.667221268017852e-05, + "loss": 1.8746, + "step": 11120 + }, + { + "epoch": 3.4134438305709023, + "grad_norm": 0.23232576251029968, + "learning_rate": 7.666800827377178e-05, + "loss": 1.7403, + "step": 11121 + }, + { + "epoch": 3.4137507673419276, + "grad_norm": 0.22903507947921753, + "learning_rate": 7.666380360381616e-05, + "loss": 1.7785, + "step": 11122 + }, + { + "epoch": 3.4140577041129525, + "grad_norm": 0.25023025274276733, + "learning_rate": 7.665959867035321e-05, + "loss": 1.7881, + "step": 11123 + }, + { + "epoch": 3.414364640883978, + "grad_norm": 0.2199166864156723, + "learning_rate": 7.665539347342449e-05, + "loss": 1.7522, + "step": 11124 + }, + { + "epoch": 3.414671577655003, + "grad_norm": 0.2539862394332886, + "learning_rate": 7.665118801307152e-05, + "loss": 1.7964, + "step": 11125 + }, + { + "epoch": 3.414978514426028, + "grad_norm": 0.22670161724090576, + "learning_rate": 7.664698228933591e-05, + "loss": 1.7071, + "step": 11126 + }, + { + "epoch": 3.4152854511970534, + "grad_norm": 0.24827396869659424, + "learning_rate": 7.664277630225919e-05, + "loss": 1.7897, + "step": 11127 + }, + { + "epoch": 3.4155923879680787, + "grad_norm": 0.29391366243362427, + "learning_rate": 7.663857005188296e-05, + "loss": 1.7967, + "step": 11128 + }, + { + "epoch": 3.4158993247391036, + "grad_norm": 0.3201812505722046, + "learning_rate": 7.663436353824874e-05, + "loss": 1.7681, + "step": 11129 + }, + { + "epoch": 3.416206261510129, + "grad_norm": 0.2274552583694458, + "learning_rate": 7.663015676139814e-05, + "loss": 1.7535, + "step": 11130 + }, + { + "epoch": 3.4165131982811543, + "grad_norm": 0.3955044150352478, + "learning_rate": 7.662594972137273e-05, + "loss": 
1.8175, + "step": 11131 + }, + { + "epoch": 3.416820135052179, + "grad_norm": 0.46493569016456604, + "learning_rate": 7.662174241821406e-05, + "loss": 1.7806, + "step": 11132 + }, + { + "epoch": 3.4171270718232045, + "grad_norm": 0.37731611728668213, + "learning_rate": 7.661753485196375e-05, + "loss": 1.7555, + "step": 11133 + }, + { + "epoch": 3.4174340085942294, + "grad_norm": 0.23983556032180786, + "learning_rate": 7.661332702266334e-05, + "loss": 1.7662, + "step": 11134 + }, + { + "epoch": 3.4177409453652547, + "grad_norm": 0.34964314103126526, + "learning_rate": 7.660911893035445e-05, + "loss": 1.7786, + "step": 11135 + }, + { + "epoch": 3.41804788213628, + "grad_norm": 0.44820764660835266, + "learning_rate": 7.660491057507864e-05, + "loss": 1.778, + "step": 11136 + }, + { + "epoch": 3.418354818907305, + "grad_norm": 0.32936233282089233, + "learning_rate": 7.660070195687752e-05, + "loss": 1.8181, + "step": 11137 + }, + { + "epoch": 3.4186617556783303, + "grad_norm": 0.2874850332736969, + "learning_rate": 7.659649307579266e-05, + "loss": 1.8733, + "step": 11138 + }, + { + "epoch": 3.418968692449355, + "grad_norm": 0.46269866824150085, + "learning_rate": 7.659228393186566e-05, + "loss": 1.8566, + "step": 11139 + }, + { + "epoch": 3.4192756292203805, + "grad_norm": 0.5873839855194092, + "learning_rate": 7.658807452513816e-05, + "loss": 1.8317, + "step": 11140 + }, + { + "epoch": 3.419582565991406, + "grad_norm": 0.43150341510772705, + "learning_rate": 7.65838648556517e-05, + "loss": 1.7702, + "step": 11141 + }, + { + "epoch": 3.4198895027624308, + "grad_norm": 0.2803891599178314, + "learning_rate": 7.65796549234479e-05, + "loss": 1.8043, + "step": 11142 + }, + { + "epoch": 3.420196439533456, + "grad_norm": 0.37295013666152954, + "learning_rate": 7.657544472856838e-05, + "loss": 1.7923, + "step": 11143 + }, + { + "epoch": 3.4205033763044814, + "grad_norm": 0.3922573924064636, + "learning_rate": 7.657123427105473e-05, + "loss": 1.8231, + "step": 11144 + }, + { + 
"epoch": 3.4208103130755063, + "grad_norm": 0.27254152297973633, + "learning_rate": 7.656702355094859e-05, + "loss": 1.8168, + "step": 11145 + }, + { + "epoch": 3.4211172498465316, + "grad_norm": 0.28005337715148926, + "learning_rate": 7.656281256829152e-05, + "loss": 1.8047, + "step": 11146 + }, + { + "epoch": 3.421424186617557, + "grad_norm": 0.4369073808193207, + "learning_rate": 7.655860132312519e-05, + "loss": 1.7243, + "step": 11147 + }, + { + "epoch": 3.421731123388582, + "grad_norm": 0.4127553701400757, + "learning_rate": 7.655438981549119e-05, + "loss": 1.8148, + "step": 11148 + }, + { + "epoch": 3.422038060159607, + "grad_norm": 0.3131798207759857, + "learning_rate": 7.655017804543114e-05, + "loss": 1.789, + "step": 11149 + }, + { + "epoch": 3.422344996930632, + "grad_norm": 0.2947194576263428, + "learning_rate": 7.654596601298666e-05, + "loss": 1.8221, + "step": 11150 + }, + { + "epoch": 3.4226519337016574, + "grad_norm": 0.3072497546672821, + "learning_rate": 7.654175371819941e-05, + "loss": 1.7747, + "step": 11151 + }, + { + "epoch": 3.4229588704726828, + "grad_norm": 0.29408320784568787, + "learning_rate": 7.653754116111099e-05, + "loss": 1.9009, + "step": 11152 + }, + { + "epoch": 3.4232658072437077, + "grad_norm": 0.2629215717315674, + "learning_rate": 7.653332834176303e-05, + "loss": 1.7354, + "step": 11153 + }, + { + "epoch": 3.423572744014733, + "grad_norm": 0.2850257456302643, + "learning_rate": 7.652911526019716e-05, + "loss": 1.8422, + "step": 11154 + }, + { + "epoch": 3.423879680785758, + "grad_norm": 0.29787111282348633, + "learning_rate": 7.652490191645503e-05, + "loss": 1.8122, + "step": 11155 + }, + { + "epoch": 3.424186617556783, + "grad_norm": 0.2670947015285492, + "learning_rate": 7.652068831057826e-05, + "loss": 1.7734, + "step": 11156 + }, + { + "epoch": 3.4244935543278086, + "grad_norm": 0.26415133476257324, + "learning_rate": 7.651647444260853e-05, + "loss": 1.7661, + "step": 11157 + }, + { + "epoch": 3.424800491098834, + 
"grad_norm": 0.2614886164665222, + "learning_rate": 7.651226031258745e-05, + "loss": 1.6918, + "step": 11158 + }, + { + "epoch": 3.425107427869859, + "grad_norm": 0.28485649824142456, + "learning_rate": 7.650804592055667e-05, + "loss": 1.7771, + "step": 11159 + }, + { + "epoch": 3.425414364640884, + "grad_norm": 0.26080289483070374, + "learning_rate": 7.650383126655784e-05, + "loss": 1.7637, + "step": 11160 + }, + { + "epoch": 3.425721301411909, + "grad_norm": 0.2503695487976074, + "learning_rate": 7.649961635063261e-05, + "loss": 1.7864, + "step": 11161 + }, + { + "epoch": 3.4260282381829343, + "grad_norm": 0.3165570795536041, + "learning_rate": 7.649540117282263e-05, + "loss": 1.8107, + "step": 11162 + }, + { + "epoch": 3.4263351749539597, + "grad_norm": 0.28411731123924255, + "learning_rate": 7.649118573316959e-05, + "loss": 1.7557, + "step": 11163 + }, + { + "epoch": 3.4266421117249846, + "grad_norm": 0.24469570815563202, + "learning_rate": 7.648697003171512e-05, + "loss": 1.7597, + "step": 11164 + }, + { + "epoch": 3.42694904849601, + "grad_norm": 0.31968292593955994, + "learning_rate": 7.648275406850087e-05, + "loss": 1.7796, + "step": 11165 + }, + { + "epoch": 3.427255985267035, + "grad_norm": 0.24520765244960785, + "learning_rate": 7.647853784356856e-05, + "loss": 1.7931, + "step": 11166 + }, + { + "epoch": 3.42756292203806, + "grad_norm": 0.23946821689605713, + "learning_rate": 7.647432135695977e-05, + "loss": 1.7143, + "step": 11167 + }, + { + "epoch": 3.4278698588090855, + "grad_norm": 0.321455180644989, + "learning_rate": 7.647010460871624e-05, + "loss": 1.8682, + "step": 11168 + }, + { + "epoch": 3.4281767955801103, + "grad_norm": 0.2803197503089905, + "learning_rate": 7.646588759887964e-05, + "loss": 1.8, + "step": 11169 + }, + { + "epoch": 3.4284837323511357, + "grad_norm": 0.2597559988498688, + "learning_rate": 7.64616703274916e-05, + "loss": 1.8027, + "step": 11170 + }, + { + "epoch": 3.428790669122161, + "grad_norm": 0.25055503845214844, + 
"learning_rate": 7.645745279459384e-05, + "loss": 1.7659, + "step": 11171 + }, + { + "epoch": 3.429097605893186, + "grad_norm": 0.34582629799842834, + "learning_rate": 7.645323500022803e-05, + "loss": 1.7868, + "step": 11172 + }, + { + "epoch": 3.4294045426642112, + "grad_norm": 0.32845041155815125, + "learning_rate": 7.644901694443584e-05, + "loss": 1.8247, + "step": 11173 + }, + { + "epoch": 3.4297114794352366, + "grad_norm": 0.2570398449897766, + "learning_rate": 7.644479862725896e-05, + "loss": 1.7802, + "step": 11174 + }, + { + "epoch": 3.4300184162062615, + "grad_norm": 0.23117294907569885, + "learning_rate": 7.644058004873908e-05, + "loss": 1.7575, + "step": 11175 + }, + { + "epoch": 3.430325352977287, + "grad_norm": 0.2417830377817154, + "learning_rate": 7.64363612089179e-05, + "loss": 1.7954, + "step": 11176 + }, + { + "epoch": 3.4306322897483117, + "grad_norm": 0.249378964304924, + "learning_rate": 7.643214210783708e-05, + "loss": 1.8161, + "step": 11177 + }, + { + "epoch": 3.430939226519337, + "grad_norm": 0.24494746327400208, + "learning_rate": 7.642792274553836e-05, + "loss": 1.825, + "step": 11178 + }, + { + "epoch": 3.4312461632903624, + "grad_norm": 0.2663760185241699, + "learning_rate": 7.642370312206342e-05, + "loss": 1.7589, + "step": 11179 + }, + { + "epoch": 3.4315531000613873, + "grad_norm": 0.2819322645664215, + "learning_rate": 7.641948323745395e-05, + "loss": 1.8097, + "step": 11180 + }, + { + "epoch": 3.4318600368324126, + "grad_norm": 0.26917630434036255, + "learning_rate": 7.641526309175166e-05, + "loss": 1.7934, + "step": 11181 + }, + { + "epoch": 3.4321669736034375, + "grad_norm": 0.31618112325668335, + "learning_rate": 7.641104268499826e-05, + "loss": 1.8522, + "step": 11182 + }, + { + "epoch": 3.432473910374463, + "grad_norm": 0.29209139943122864, + "learning_rate": 7.640682201723546e-05, + "loss": 1.7499, + "step": 11183 + }, + { + "epoch": 3.432780847145488, + "grad_norm": 0.24831914901733398, + "learning_rate": 
7.640260108850496e-05, + "loss": 1.7897, + "step": 11184 + }, + { + "epoch": 3.433087783916513, + "grad_norm": 0.2459818720817566, + "learning_rate": 7.639837989884849e-05, + "loss": 1.7604, + "step": 11185 + }, + { + "epoch": 3.4333947206875384, + "grad_norm": 0.27157485485076904, + "learning_rate": 7.639415844830774e-05, + "loss": 1.7776, + "step": 11186 + }, + { + "epoch": 3.4337016574585637, + "grad_norm": 0.3021515905857086, + "learning_rate": 7.638993673692445e-05, + "loss": 1.7771, + "step": 11187 + }, + { + "epoch": 3.4340085942295886, + "grad_norm": 0.2591722309589386, + "learning_rate": 7.638571476474036e-05, + "loss": 1.8333, + "step": 11188 + }, + { + "epoch": 3.434315531000614, + "grad_norm": 0.2255258709192276, + "learning_rate": 7.638149253179717e-05, + "loss": 1.7647, + "step": 11189 + }, + { + "epoch": 3.4346224677716393, + "grad_norm": 0.2585793733596802, + "learning_rate": 7.637727003813658e-05, + "loss": 1.786, + "step": 11190 + }, + { + "epoch": 3.434929404542664, + "grad_norm": 0.23649543523788452, + "learning_rate": 7.637304728380036e-05, + "loss": 1.822, + "step": 11191 + }, + { + "epoch": 3.4352363413136895, + "grad_norm": 0.2610832452774048, + "learning_rate": 7.636882426883023e-05, + "loss": 1.7925, + "step": 11192 + }, + { + "epoch": 3.4355432780847144, + "grad_norm": 0.26230642199516296, + "learning_rate": 7.636460099326793e-05, + "loss": 1.8169, + "step": 11193 + }, + { + "epoch": 3.4358502148557397, + "grad_norm": 0.2800561189651489, + "learning_rate": 7.636037745715518e-05, + "loss": 1.845, + "step": 11194 + }, + { + "epoch": 3.436157151626765, + "grad_norm": 0.27790409326553345, + "learning_rate": 7.635615366053372e-05, + "loss": 1.8141, + "step": 11195 + }, + { + "epoch": 3.43646408839779, + "grad_norm": 0.2894865870475769, + "learning_rate": 7.635192960344533e-05, + "loss": 1.7916, + "step": 11196 + }, + { + "epoch": 3.4367710251688153, + "grad_norm": 0.22310738265514374, + "learning_rate": 7.634770528593171e-05, + "loss": 1.79, + 
"step": 11197 + }, + { + "epoch": 3.43707796193984, + "grad_norm": 0.2837755084037781, + "learning_rate": 7.634348070803463e-05, + "loss": 1.8763, + "step": 11198 + }, + { + "epoch": 3.4373848987108655, + "grad_norm": 0.32488104701042175, + "learning_rate": 7.633925586979583e-05, + "loss": 1.8331, + "step": 11199 + }, + { + "epoch": 3.437691835481891, + "grad_norm": 0.2708779573440552, + "learning_rate": 7.633503077125706e-05, + "loss": 1.761, + "step": 11200 + }, + { + "epoch": 3.4379987722529157, + "grad_norm": 0.23929642140865326, + "learning_rate": 7.633080541246008e-05, + "loss": 1.8217, + "step": 11201 + }, + { + "epoch": 3.438305709023941, + "grad_norm": 0.3213331997394562, + "learning_rate": 7.632657979344667e-05, + "loss": 1.8375, + "step": 11202 + }, + { + "epoch": 3.4386126457949664, + "grad_norm": 0.38420629501342773, + "learning_rate": 7.632235391425854e-05, + "loss": 1.765, + "step": 11203 + }, + { + "epoch": 3.4389195825659913, + "grad_norm": 0.40466073155403137, + "learning_rate": 7.631812777493749e-05, + "loss": 1.8262, + "step": 11204 + }, + { + "epoch": 3.4392265193370166, + "grad_norm": 0.35904639959335327, + "learning_rate": 7.631390137552527e-05, + "loss": 1.894, + "step": 11205 + }, + { + "epoch": 3.439533456108042, + "grad_norm": 0.28880515694618225, + "learning_rate": 7.630967471606368e-05, + "loss": 1.87, + "step": 11206 + }, + { + "epoch": 3.439840392879067, + "grad_norm": 0.2878882884979248, + "learning_rate": 7.630544779659444e-05, + "loss": 1.7841, + "step": 11207 + }, + { + "epoch": 3.440147329650092, + "grad_norm": 0.36002418398857117, + "learning_rate": 7.630122061715935e-05, + "loss": 1.7318, + "step": 11208 + }, + { + "epoch": 3.440454266421117, + "grad_norm": 0.3304644227027893, + "learning_rate": 7.629699317780019e-05, + "loss": 1.8581, + "step": 11209 + }, + { + "epoch": 3.4407612031921424, + "grad_norm": 0.23396331071853638, + "learning_rate": 7.629276547855872e-05, + "loss": 1.7897, + "step": 11210 + }, + { + "epoch": 
3.4410681399631677, + "grad_norm": 0.34914183616638184, + "learning_rate": 7.628853751947674e-05, + "loss": 1.8531, + "step": 11211 + }, + { + "epoch": 3.4413750767341926, + "grad_norm": 0.3700502812862396, + "learning_rate": 7.6284309300596e-05, + "loss": 1.7884, + "step": 11212 + }, + { + "epoch": 3.441682013505218, + "grad_norm": 0.24606801569461823, + "learning_rate": 7.628008082195835e-05, + "loss": 1.7292, + "step": 11213 + }, + { + "epoch": 3.441988950276243, + "grad_norm": 0.26344993710517883, + "learning_rate": 7.627585208360551e-05, + "loss": 1.7832, + "step": 11214 + }, + { + "epoch": 3.442295887047268, + "grad_norm": 0.4034743010997772, + "learning_rate": 7.62716230855793e-05, + "loss": 1.8164, + "step": 11215 + }, + { + "epoch": 3.4426028238182935, + "grad_norm": 0.4508039355278015, + "learning_rate": 7.626739382792152e-05, + "loss": 1.7855, + "step": 11216 + }, + { + "epoch": 3.4429097605893184, + "grad_norm": 0.2963111400604248, + "learning_rate": 7.626316431067395e-05, + "loss": 1.7995, + "step": 11217 + }, + { + "epoch": 3.4432166973603437, + "grad_norm": 0.35248515009880066, + "learning_rate": 7.625893453387841e-05, + "loss": 1.8761, + "step": 11218 + }, + { + "epoch": 3.443523634131369, + "grad_norm": 0.4032224416732788, + "learning_rate": 7.625470449757668e-05, + "loss": 1.7746, + "step": 11219 + }, + { + "epoch": 3.443830570902394, + "grad_norm": 0.3505195081233978, + "learning_rate": 7.625047420181057e-05, + "loss": 1.851, + "step": 11220 + }, + { + "epoch": 3.4441375076734193, + "grad_norm": 0.288968563079834, + "learning_rate": 7.62462436466219e-05, + "loss": 1.8055, + "step": 11221 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.43141910433769226, + "learning_rate": 7.624201283205246e-05, + "loss": 1.816, + "step": 11222 + }, + { + "epoch": 3.4447513812154695, + "grad_norm": 0.46902137994766235, + "learning_rate": 7.623778175814407e-05, + "loss": 1.8478, + "step": 11223 + }, + { + "epoch": 3.445058317986495, + "grad_norm": 
0.3333328366279602, + "learning_rate": 7.623355042493854e-05, + "loss": 1.7949, + "step": 11224 + }, + { + "epoch": 3.4453652547575198, + "grad_norm": 0.2625340521335602, + "learning_rate": 7.622931883247768e-05, + "loss": 1.745, + "step": 11225 + }, + { + "epoch": 3.445672191528545, + "grad_norm": 0.4565848410129547, + "learning_rate": 7.622508698080333e-05, + "loss": 1.796, + "step": 11226 + }, + { + "epoch": 3.4459791282995704, + "grad_norm": 0.4676518738269806, + "learning_rate": 7.622085486995729e-05, + "loss": 1.8115, + "step": 11227 + }, + { + "epoch": 3.4462860650705953, + "grad_norm": 0.3828938603401184, + "learning_rate": 7.62166224999814e-05, + "loss": 1.8758, + "step": 11228 + }, + { + "epoch": 3.4465930018416207, + "grad_norm": 0.2786383628845215, + "learning_rate": 7.621238987091747e-05, + "loss": 1.7616, + "step": 11229 + }, + { + "epoch": 3.446899938612646, + "grad_norm": 0.4442835748195648, + "learning_rate": 7.620815698280734e-05, + "loss": 1.8342, + "step": 11230 + }, + { + "epoch": 3.447206875383671, + "grad_norm": 0.45760586857795715, + "learning_rate": 7.620392383569286e-05, + "loss": 1.8159, + "step": 11231 + }, + { + "epoch": 3.447513812154696, + "grad_norm": 0.2567009925842285, + "learning_rate": 7.619969042961583e-05, + "loss": 1.774, + "step": 11232 + }, + { + "epoch": 3.4478207489257215, + "grad_norm": 0.3720102310180664, + "learning_rate": 7.619545676461812e-05, + "loss": 1.8366, + "step": 11233 + }, + { + "epoch": 3.4481276856967464, + "grad_norm": 0.36436137557029724, + "learning_rate": 7.619122284074154e-05, + "loss": 1.832, + "step": 11234 + }, + { + "epoch": 3.4484346224677718, + "grad_norm": 0.310310959815979, + "learning_rate": 7.618698865802795e-05, + "loss": 1.9023, + "step": 11235 + }, + { + "epoch": 3.4487415592387967, + "grad_norm": 0.2693026661872864, + "learning_rate": 7.618275421651916e-05, + "loss": 1.7696, + "step": 11236 + }, + { + "epoch": 3.449048496009822, + "grad_norm": 0.2942425608634949, + "learning_rate": 
7.61785195162571e-05, + "loss": 1.822, + "step": 11237 + }, + { + "epoch": 3.4493554327808473, + "grad_norm": 0.22454749047756195, + "learning_rate": 7.617428455728353e-05, + "loss": 1.7011, + "step": 11238 + }, + { + "epoch": 3.449662369551872, + "grad_norm": 0.23345038294792175, + "learning_rate": 7.617004933964035e-05, + "loss": 1.7563, + "step": 11239 + }, + { + "epoch": 3.4499693063228976, + "grad_norm": 0.24990662932395935, + "learning_rate": 7.616581386336941e-05, + "loss": 1.8031, + "step": 11240 + }, + { + "epoch": 3.4502762430939224, + "grad_norm": 0.2919348478317261, + "learning_rate": 7.616157812851254e-05, + "loss": 1.7355, + "step": 11241 + }, + { + "epoch": 3.450583179864948, + "grad_norm": 0.2926909327507019, + "learning_rate": 7.615734213511165e-05, + "loss": 1.8341, + "step": 11242 + }, + { + "epoch": 3.450890116635973, + "grad_norm": 0.24316683411598206, + "learning_rate": 7.615310588320855e-05, + "loss": 1.8154, + "step": 11243 + }, + { + "epoch": 3.451197053406998, + "grad_norm": 0.23154498636722565, + "learning_rate": 7.614886937284513e-05, + "loss": 1.7904, + "step": 11244 + }, + { + "epoch": 3.4515039901780233, + "grad_norm": 0.25973939895629883, + "learning_rate": 7.614463260406327e-05, + "loss": 1.7598, + "step": 11245 + }, + { + "epoch": 3.4518109269490487, + "grad_norm": 0.22110119462013245, + "learning_rate": 7.614039557690482e-05, + "loss": 1.7903, + "step": 11246 + }, + { + "epoch": 3.4521178637200736, + "grad_norm": 0.26184993982315063, + "learning_rate": 7.613615829141165e-05, + "loss": 1.748, + "step": 11247 + }, + { + "epoch": 3.452424800491099, + "grad_norm": 0.26128727197647095, + "learning_rate": 7.613192074762565e-05, + "loss": 1.7786, + "step": 11248 + }, + { + "epoch": 3.4527317372621242, + "grad_norm": 0.23230813443660736, + "learning_rate": 7.612768294558871e-05, + "loss": 1.8114, + "step": 11249 + }, + { + "epoch": 3.453038674033149, + "grad_norm": 0.2686540186405182, + "learning_rate": 7.612344488534268e-05, + "loss": 
1.7311, + "step": 11250 + }, + { + "epoch": 3.4533456108041745, + "grad_norm": 0.25553348660469055, + "learning_rate": 7.611920656692946e-05, + "loss": 1.8468, + "step": 11251 + }, + { + "epoch": 3.4536525475751993, + "grad_norm": 0.2639308273792267, + "learning_rate": 7.611496799039092e-05, + "loss": 1.8292, + "step": 11252 + }, + { + "epoch": 3.4539594843462247, + "grad_norm": 0.2468358874320984, + "learning_rate": 7.611072915576895e-05, + "loss": 1.8173, + "step": 11253 + }, + { + "epoch": 3.45426642111725, + "grad_norm": 0.27236035466194153, + "learning_rate": 7.610649006310549e-05, + "loss": 1.8082, + "step": 11254 + }, + { + "epoch": 3.454573357888275, + "grad_norm": 0.2277914434671402, + "learning_rate": 7.610225071244237e-05, + "loss": 1.7483, + "step": 11255 + }, + { + "epoch": 3.4548802946593002, + "grad_norm": 0.2292868196964264, + "learning_rate": 7.60980111038215e-05, + "loss": 1.7716, + "step": 11256 + }, + { + "epoch": 3.455187231430325, + "grad_norm": 0.22116152942180634, + "learning_rate": 7.60937712372848e-05, + "loss": 1.773, + "step": 11257 + }, + { + "epoch": 3.4554941682013505, + "grad_norm": 0.23238304257392883, + "learning_rate": 7.608953111287416e-05, + "loss": 1.7602, + "step": 11258 + }, + { + "epoch": 3.455801104972376, + "grad_norm": 0.2810615003108978, + "learning_rate": 7.608529073063149e-05, + "loss": 1.8781, + "step": 11259 + }, + { + "epoch": 3.4561080417434007, + "grad_norm": 0.2516821324825287, + "learning_rate": 7.608105009059867e-05, + "loss": 1.835, + "step": 11260 + }, + { + "epoch": 3.456414978514426, + "grad_norm": 0.25698330998420715, + "learning_rate": 7.607680919281763e-05, + "loss": 1.7859, + "step": 11261 + }, + { + "epoch": 3.4567219152854514, + "grad_norm": 0.2597602903842926, + "learning_rate": 7.60725680373303e-05, + "loss": 1.8287, + "step": 11262 + }, + { + "epoch": 3.4570288520564763, + "grad_norm": 0.2564091980457306, + "learning_rate": 7.606832662417855e-05, + "loss": 1.8003, + "step": 11263 + }, + { + 
"epoch": 3.4573357888275016, + "grad_norm": 0.2872684597969055, + "learning_rate": 7.606408495340432e-05, + "loss": 1.8242, + "step": 11264 + }, + { + "epoch": 3.457642725598527, + "grad_norm": 0.27513590455055237, + "learning_rate": 7.605984302504952e-05, + "loss": 1.8605, + "step": 11265 + }, + { + "epoch": 3.457949662369552, + "grad_norm": 0.27768459916114807, + "learning_rate": 7.605560083915609e-05, + "loss": 1.7948, + "step": 11266 + }, + { + "epoch": 3.458256599140577, + "grad_norm": 0.23911382257938385, + "learning_rate": 7.605135839576593e-05, + "loss": 1.7575, + "step": 11267 + }, + { + "epoch": 3.458563535911602, + "grad_norm": 0.26773568987846375, + "learning_rate": 7.604711569492098e-05, + "loss": 1.752, + "step": 11268 + }, + { + "epoch": 3.4588704726826274, + "grad_norm": 0.30079394578933716, + "learning_rate": 7.604287273666316e-05, + "loss": 1.8022, + "step": 11269 + }, + { + "epoch": 3.4591774094536527, + "grad_norm": 0.27393853664398193, + "learning_rate": 7.603862952103441e-05, + "loss": 1.8054, + "step": 11270 + }, + { + "epoch": 3.4594843462246776, + "grad_norm": 0.2794870436191559, + "learning_rate": 7.603438604807667e-05, + "loss": 1.808, + "step": 11271 + }, + { + "epoch": 3.459791282995703, + "grad_norm": 0.26482146978378296, + "learning_rate": 7.603014231783185e-05, + "loss": 1.8696, + "step": 11272 + }, + { + "epoch": 3.460098219766728, + "grad_norm": 0.2755354344844818, + "learning_rate": 7.602589833034192e-05, + "loss": 1.8412, + "step": 11273 + }, + { + "epoch": 3.460405156537753, + "grad_norm": 0.2666642367839813, + "learning_rate": 7.602165408564883e-05, + "loss": 1.8333, + "step": 11274 + }, + { + "epoch": 3.4607120933087785, + "grad_norm": 0.26958519220352173, + "learning_rate": 7.601740958379448e-05, + "loss": 1.7943, + "step": 11275 + }, + { + "epoch": 3.4610190300798034, + "grad_norm": 0.2915789783000946, + "learning_rate": 7.601316482482084e-05, + "loss": 1.7519, + "step": 11276 + }, + { + "epoch": 3.4613259668508287, + 
"grad_norm": 0.2456950694322586, + "learning_rate": 7.600891980876985e-05, + "loss": 1.8064, + "step": 11277 + }, + { + "epoch": 3.461632903621854, + "grad_norm": 0.2517867088317871, + "learning_rate": 7.600467453568348e-05, + "loss": 1.7766, + "step": 11278 + }, + { + "epoch": 3.461939840392879, + "grad_norm": 0.24567969143390656, + "learning_rate": 7.600042900560368e-05, + "loss": 1.7331, + "step": 11279 + }, + { + "epoch": 3.4622467771639043, + "grad_norm": 0.23986820876598358, + "learning_rate": 7.599618321857239e-05, + "loss": 1.7477, + "step": 11280 + }, + { + "epoch": 3.4625537139349296, + "grad_norm": 0.2555375397205353, + "learning_rate": 7.599193717463158e-05, + "loss": 1.8154, + "step": 11281 + }, + { + "epoch": 3.4628606507059545, + "grad_norm": 0.2522781193256378, + "learning_rate": 7.598769087382323e-05, + "loss": 1.7821, + "step": 11282 + }, + { + "epoch": 3.46316758747698, + "grad_norm": 0.25631004571914673, + "learning_rate": 7.598344431618926e-05, + "loss": 1.8043, + "step": 11283 + }, + { + "epoch": 3.4634745242480047, + "grad_norm": 0.2611328661441803, + "learning_rate": 7.597919750177168e-05, + "loss": 1.8036, + "step": 11284 + }, + { + "epoch": 3.46378146101903, + "grad_norm": 0.255670428276062, + "learning_rate": 7.597495043061244e-05, + "loss": 1.7375, + "step": 11285 + }, + { + "epoch": 3.4640883977900554, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.597070310275353e-05, + "loss": 1.7496, + "step": 11286 + }, + { + "epoch": 3.4643953345610803, + "grad_norm": 0.2643752992153168, + "learning_rate": 7.596645551823688e-05, + "loss": 1.8444, + "step": 11287 + }, + { + "epoch": 3.4647022713321056, + "grad_norm": 0.2564511299133301, + "learning_rate": 7.596220767710452e-05, + "loss": 1.7557, + "step": 11288 + }, + { + "epoch": 3.4650092081031305, + "grad_norm": 0.2510208487510681, + "learning_rate": 7.59579595793984e-05, + "loss": 1.7234, + "step": 11289 + }, + { + "epoch": 3.465316144874156, + "grad_norm": 0.2765158712863922, + 
"learning_rate": 7.595371122516051e-05, + "loss": 1.8215, + "step": 11290 + }, + { + "epoch": 3.465623081645181, + "grad_norm": 0.28233039379119873, + "learning_rate": 7.594946261443286e-05, + "loss": 1.7752, + "step": 11291 + }, + { + "epoch": 3.465930018416206, + "grad_norm": 0.26971468329429626, + "learning_rate": 7.594521374725735e-05, + "loss": 1.7924, + "step": 11292 + }, + { + "epoch": 3.4662369551872314, + "grad_norm": 0.29425930976867676, + "learning_rate": 7.594096462367608e-05, + "loss": 1.8144, + "step": 11293 + }, + { + "epoch": 3.4665438919582567, + "grad_norm": 0.233150452375412, + "learning_rate": 7.593671524373098e-05, + "loss": 1.7741, + "step": 11294 + }, + { + "epoch": 3.4668508287292816, + "grad_norm": 0.2947762608528137, + "learning_rate": 7.593246560746406e-05, + "loss": 1.8031, + "step": 11295 + }, + { + "epoch": 3.467157765500307, + "grad_norm": 0.250552773475647, + "learning_rate": 7.59282157149173e-05, + "loss": 1.7501, + "step": 11296 + }, + { + "epoch": 3.4674647022713323, + "grad_norm": 0.26091331243515015, + "learning_rate": 7.592396556613274e-05, + "loss": 1.836, + "step": 11297 + }, + { + "epoch": 3.467771639042357, + "grad_norm": 0.28625619411468506, + "learning_rate": 7.591971516115233e-05, + "loss": 1.7555, + "step": 11298 + }, + { + "epoch": 3.4680785758133825, + "grad_norm": 0.2723398804664612, + "learning_rate": 7.591546450001811e-05, + "loss": 1.825, + "step": 11299 + }, + { + "epoch": 3.4683855125844074, + "grad_norm": 0.24289946258068085, + "learning_rate": 7.591121358277211e-05, + "loss": 1.7441, + "step": 11300 + }, + { + "epoch": 3.4686924493554327, + "grad_norm": 0.2706952691078186, + "learning_rate": 7.590696240945629e-05, + "loss": 1.8651, + "step": 11301 + }, + { + "epoch": 3.468999386126458, + "grad_norm": 0.24632862210273743, + "learning_rate": 7.590271098011268e-05, + "loss": 1.8229, + "step": 11302 + }, + { + "epoch": 3.469306322897483, + "grad_norm": 0.29275211691856384, + "learning_rate": 7.58984592947833e-05, 
+ "loss": 1.7591, + "step": 11303 + }, + { + "epoch": 3.4696132596685083, + "grad_norm": 0.29228144884109497, + "learning_rate": 7.589420735351016e-05, + "loss": 1.8395, + "step": 11304 + }, + { + "epoch": 3.4699201964395336, + "grad_norm": 0.28339114785194397, + "learning_rate": 7.588995515633528e-05, + "loss": 1.8543, + "step": 11305 + }, + { + "epoch": 3.4702271332105585, + "grad_norm": 0.2834693193435669, + "learning_rate": 7.588570270330071e-05, + "loss": 1.826, + "step": 11306 + }, + { + "epoch": 3.470534069981584, + "grad_norm": 0.26130759716033936, + "learning_rate": 7.588144999444844e-05, + "loss": 1.7887, + "step": 11307 + }, + { + "epoch": 3.470841006752609, + "grad_norm": 0.29554685950279236, + "learning_rate": 7.587719702982052e-05, + "loss": 1.819, + "step": 11308 + }, + { + "epoch": 3.471147943523634, + "grad_norm": 0.2687968611717224, + "learning_rate": 7.587294380945898e-05, + "loss": 1.7354, + "step": 11309 + }, + { + "epoch": 3.4714548802946594, + "grad_norm": 0.28795287013053894, + "learning_rate": 7.586869033340582e-05, + "loss": 1.8267, + "step": 11310 + }, + { + "epoch": 3.4717618170656843, + "grad_norm": 0.33244553208351135, + "learning_rate": 7.58644366017031e-05, + "loss": 1.86, + "step": 11311 + }, + { + "epoch": 3.4720687538367097, + "grad_norm": 0.2878025472164154, + "learning_rate": 7.586018261439288e-05, + "loss": 1.7587, + "step": 11312 + }, + { + "epoch": 3.472375690607735, + "grad_norm": 0.26856711506843567, + "learning_rate": 7.585592837151716e-05, + "loss": 1.7351, + "step": 11313 + }, + { + "epoch": 3.47268262737876, + "grad_norm": 0.2554367780685425, + "learning_rate": 7.585167387311802e-05, + "loss": 1.7664, + "step": 11314 + }, + { + "epoch": 3.472989564149785, + "grad_norm": 0.3193204700946808, + "learning_rate": 7.584741911923748e-05, + "loss": 1.7487, + "step": 11315 + }, + { + "epoch": 3.47329650092081, + "grad_norm": 0.3227958679199219, + "learning_rate": 7.584316410991759e-05, + "loss": 1.8107, + "step": 11316 + }, + { 
+ "epoch": 3.4736034376918354, + "grad_norm": 0.33891916275024414, + "learning_rate": 7.58389088452004e-05, + "loss": 1.8466, + "step": 11317 + }, + { + "epoch": 3.4739103744628608, + "grad_norm": 0.27050724625587463, + "learning_rate": 7.583465332512797e-05, + "loss": 1.7877, + "step": 11318 + }, + { + "epoch": 3.4742173112338857, + "grad_norm": 0.2935837209224701, + "learning_rate": 7.583039754974235e-05, + "loss": 1.7932, + "step": 11319 + }, + { + "epoch": 3.474524248004911, + "grad_norm": 0.27780550718307495, + "learning_rate": 7.582614151908561e-05, + "loss": 1.8374, + "step": 11320 + }, + { + "epoch": 3.4748311847759363, + "grad_norm": 0.2579033076763153, + "learning_rate": 7.58218852331998e-05, + "loss": 1.7305, + "step": 11321 + }, + { + "epoch": 3.4751381215469612, + "grad_norm": 0.2531716227531433, + "learning_rate": 7.581762869212699e-05, + "loss": 1.8136, + "step": 11322 + }, + { + "epoch": 3.4754450583179866, + "grad_norm": 0.25504544377326965, + "learning_rate": 7.581337189590924e-05, + "loss": 1.787, + "step": 11323 + }, + { + "epoch": 3.475751995089012, + "grad_norm": 0.23659855127334595, + "learning_rate": 7.580911484458861e-05, + "loss": 1.77, + "step": 11324 + }, + { + "epoch": 3.476058931860037, + "grad_norm": 0.22556856274604797, + "learning_rate": 7.580485753820721e-05, + "loss": 1.7808, + "step": 11325 + }, + { + "epoch": 3.476365868631062, + "grad_norm": 0.2860291600227356, + "learning_rate": 7.580059997680705e-05, + "loss": 1.8224, + "step": 11326 + }, + { + "epoch": 3.476672805402087, + "grad_norm": 0.3134596645832062, + "learning_rate": 7.579634216043023e-05, + "loss": 1.8278, + "step": 11327 + }, + { + "epoch": 3.4769797421731123, + "grad_norm": 0.2883087992668152, + "learning_rate": 7.579208408911887e-05, + "loss": 1.7917, + "step": 11328 + }, + { + "epoch": 3.4772866789441377, + "grad_norm": 0.2743333578109741, + "learning_rate": 7.578782576291501e-05, + "loss": 1.8228, + "step": 11329 + }, + { + "epoch": 3.4775936157151626, + 
"grad_norm": 0.25026053190231323, + "learning_rate": 7.578356718186073e-05, + "loss": 1.7717, + "step": 11330 + }, + { + "epoch": 3.477900552486188, + "grad_norm": 0.246905118227005, + "learning_rate": 7.577930834599813e-05, + "loss": 1.7979, + "step": 11331 + }, + { + "epoch": 3.478207489257213, + "grad_norm": 0.24709418416023254, + "learning_rate": 7.577504925536929e-05, + "loss": 1.8111, + "step": 11332 + }, + { + "epoch": 3.478514426028238, + "grad_norm": 0.25685814023017883, + "learning_rate": 7.577078991001632e-05, + "loss": 1.8255, + "step": 11333 + }, + { + "epoch": 3.4788213627992635, + "grad_norm": 0.23937836289405823, + "learning_rate": 7.576653030998129e-05, + "loss": 1.7254, + "step": 11334 + }, + { + "epoch": 3.4791282995702884, + "grad_norm": 0.22638650238513947, + "learning_rate": 7.57622704553063e-05, + "loss": 1.7847, + "step": 11335 + }, + { + "epoch": 3.4794352363413137, + "grad_norm": 0.26083993911743164, + "learning_rate": 7.575801034603347e-05, + "loss": 1.7947, + "step": 11336 + }, + { + "epoch": 3.479742173112339, + "grad_norm": 0.2715466022491455, + "learning_rate": 7.575374998220488e-05, + "loss": 1.848, + "step": 11337 + }, + { + "epoch": 3.480049109883364, + "grad_norm": 0.25554224848747253, + "learning_rate": 7.574948936386262e-05, + "loss": 1.7811, + "step": 11338 + }, + { + "epoch": 3.4803560466543892, + "grad_norm": 0.2689397931098938, + "learning_rate": 7.574522849104882e-05, + "loss": 1.82, + "step": 11339 + }, + { + "epoch": 3.4806629834254146, + "grad_norm": 0.25027474761009216, + "learning_rate": 7.57409673638056e-05, + "loss": 1.775, + "step": 11340 + }, + { + "epoch": 3.4809699201964395, + "grad_norm": 0.2545457184314728, + "learning_rate": 7.573670598217504e-05, + "loss": 1.8056, + "step": 11341 + }, + { + "epoch": 3.481276856967465, + "grad_norm": 0.28404027223587036, + "learning_rate": 7.573244434619928e-05, + "loss": 1.8372, + "step": 11342 + }, + { + "epoch": 3.4815837937384897, + "grad_norm": 0.28046950697898865, + 
"learning_rate": 7.572818245592041e-05, + "loss": 1.7851, + "step": 11343 + }, + { + "epoch": 3.481890730509515, + "grad_norm": 0.23005759716033936, + "learning_rate": 7.572392031138056e-05, + "loss": 1.7059, + "step": 11344 + }, + { + "epoch": 3.4821976672805404, + "grad_norm": 0.2931719124317169, + "learning_rate": 7.571965791262185e-05, + "loss": 1.84, + "step": 11345 + }, + { + "epoch": 3.4825046040515653, + "grad_norm": 0.4399266242980957, + "learning_rate": 7.571539525968642e-05, + "loss": 1.7465, + "step": 11346 + }, + { + "epoch": 3.4828115408225906, + "grad_norm": 0.48957565426826477, + "learning_rate": 7.571113235261638e-05, + "loss": 1.8494, + "step": 11347 + }, + { + "epoch": 3.4831184775936155, + "grad_norm": 0.37828895449638367, + "learning_rate": 7.570686919145385e-05, + "loss": 1.7598, + "step": 11348 + }, + { + "epoch": 3.483425414364641, + "grad_norm": 0.22943973541259766, + "learning_rate": 7.570260577624098e-05, + "loss": 1.7443, + "step": 11349 + }, + { + "epoch": 3.483732351135666, + "grad_norm": 0.3245384991168976, + "learning_rate": 7.569834210701987e-05, + "loss": 1.7232, + "step": 11350 + }, + { + "epoch": 3.484039287906691, + "grad_norm": 0.4419693648815155, + "learning_rate": 7.569407818383271e-05, + "loss": 1.841, + "step": 11351 + }, + { + "epoch": 3.4843462246777164, + "grad_norm": 0.4061864912509918, + "learning_rate": 7.568981400672159e-05, + "loss": 1.8274, + "step": 11352 + }, + { + "epoch": 3.4846531614487417, + "grad_norm": 0.2609417736530304, + "learning_rate": 7.56855495757287e-05, + "loss": 1.8631, + "step": 11353 + }, + { + "epoch": 3.4849600982197666, + "grad_norm": 0.28758567571640015, + "learning_rate": 7.568128489089612e-05, + "loss": 1.8169, + "step": 11354 + }, + { + "epoch": 3.485267034990792, + "grad_norm": 0.40643060207366943, + "learning_rate": 7.567701995226606e-05, + "loss": 1.809, + "step": 11355 + }, + { + "epoch": 3.4855739717618173, + "grad_norm": 0.37649446725845337, + "learning_rate": 7.56727547598806e-05, 
+ "loss": 1.7661, + "step": 11356 + }, + { + "epoch": 3.485880908532842, + "grad_norm": 0.22863779962062836, + "learning_rate": 7.566848931378197e-05, + "loss": 1.808, + "step": 11357 + }, + { + "epoch": 3.4861878453038675, + "grad_norm": 0.4487019181251526, + "learning_rate": 7.566422361401226e-05, + "loss": 1.7627, + "step": 11358 + }, + { + "epoch": 3.4864947820748924, + "grad_norm": 0.4583640694618225, + "learning_rate": 7.565995766061367e-05, + "loss": 1.8186, + "step": 11359 + }, + { + "epoch": 3.4868017188459177, + "grad_norm": 0.27231526374816895, + "learning_rate": 7.565569145362833e-05, + "loss": 1.8465, + "step": 11360 + }, + { + "epoch": 3.487108655616943, + "grad_norm": 0.3877887725830078, + "learning_rate": 7.565142499309841e-05, + "loss": 1.7668, + "step": 11361 + }, + { + "epoch": 3.487415592387968, + "grad_norm": 0.5511242747306824, + "learning_rate": 7.564715827906606e-05, + "loss": 1.8417, + "step": 11362 + }, + { + "epoch": 3.4877225291589933, + "grad_norm": 0.5112231373786926, + "learning_rate": 7.564289131157348e-05, + "loss": 1.8038, + "step": 11363 + }, + { + "epoch": 3.488029465930018, + "grad_norm": 0.279502809047699, + "learning_rate": 7.56386240906628e-05, + "loss": 1.7545, + "step": 11364 + }, + { + "epoch": 3.4883364027010435, + "grad_norm": 0.30080464482307434, + "learning_rate": 7.563435661637623e-05, + "loss": 1.8136, + "step": 11365 + }, + { + "epoch": 3.488643339472069, + "grad_norm": 0.4424717128276825, + "learning_rate": 7.563008888875591e-05, + "loss": 1.7542, + "step": 11366 + }, + { + "epoch": 3.4889502762430937, + "grad_norm": 0.42144715785980225, + "learning_rate": 7.562582090784403e-05, + "loss": 1.8245, + "step": 11367 + }, + { + "epoch": 3.489257213014119, + "grad_norm": 0.2533668875694275, + "learning_rate": 7.562155267368277e-05, + "loss": 1.8654, + "step": 11368 + }, + { + "epoch": 3.4895641497851444, + "grad_norm": 0.3327534794807434, + "learning_rate": 7.56172841863143e-05, + "loss": 1.7882, + "step": 11369 + }, + { 
+ "epoch": 3.4898710865561693, + "grad_norm": 0.44001486897468567, + "learning_rate": 7.561301544578081e-05, + "loss": 1.8397, + "step": 11370 + }, + { + "epoch": 3.4901780233271946, + "grad_norm": 0.2779090106487274, + "learning_rate": 7.56087464521245e-05, + "loss": 1.7398, + "step": 11371 + }, + { + "epoch": 3.49048496009822, + "grad_norm": 0.3018067479133606, + "learning_rate": 7.560447720538755e-05, + "loss": 1.8076, + "step": 11372 + }, + { + "epoch": 3.490791896869245, + "grad_norm": 0.4370935261249542, + "learning_rate": 7.560020770561216e-05, + "loss": 1.8057, + "step": 11373 + }, + { + "epoch": 3.49109883364027, + "grad_norm": 0.2936978042125702, + "learning_rate": 7.559593795284047e-05, + "loss": 1.7726, + "step": 11374 + }, + { + "epoch": 3.491405770411295, + "grad_norm": 0.28825095295906067, + "learning_rate": 7.559166794711476e-05, + "loss": 1.8039, + "step": 11375 + }, + { + "epoch": 3.4917127071823204, + "grad_norm": 0.39334073662757874, + "learning_rate": 7.55873976884772e-05, + "loss": 1.8388, + "step": 11376 + }, + { + "epoch": 3.4920196439533457, + "grad_norm": 0.33880460262298584, + "learning_rate": 7.558312717696995e-05, + "loss": 1.7791, + "step": 11377 + }, + { + "epoch": 3.4923265807243706, + "grad_norm": 0.4433762729167938, + "learning_rate": 7.557885641263524e-05, + "loss": 1.7786, + "step": 11378 + }, + { + "epoch": 3.492633517495396, + "grad_norm": 0.4710264205932617, + "learning_rate": 7.557458539551527e-05, + "loss": 1.7193, + "step": 11379 + }, + { + "epoch": 3.4929404542664213, + "grad_norm": 0.27514326572418213, + "learning_rate": 7.557031412565228e-05, + "loss": 1.823, + "step": 11380 + }, + { + "epoch": 3.493247391037446, + "grad_norm": 0.4681413471698761, + "learning_rate": 7.556604260308846e-05, + "loss": 1.7598, + "step": 11381 + }, + { + "epoch": 3.4935543278084715, + "grad_norm": 0.5032503604888916, + "learning_rate": 7.556177082786602e-05, + "loss": 1.741, + "step": 11382 + }, + { + "epoch": 3.493861264579497, + 
"grad_norm": 0.2677086889743805, + "learning_rate": 7.555749880002716e-05, + "loss": 1.8528, + "step": 11383 + }, + { + "epoch": 3.4941682013505218, + "grad_norm": 0.43870940804481506, + "learning_rate": 7.555322651961414e-05, + "loss": 1.7632, + "step": 11384 + }, + { + "epoch": 3.494475138121547, + "grad_norm": 0.5403209924697876, + "learning_rate": 7.554895398666914e-05, + "loss": 1.8181, + "step": 11385 + }, + { + "epoch": 3.494782074892572, + "grad_norm": 0.2714318335056305, + "learning_rate": 7.554468120123441e-05, + "loss": 1.8151, + "step": 11386 + }, + { + "epoch": 3.4950890116635973, + "grad_norm": 0.49661698937416077, + "learning_rate": 7.554040816335217e-05, + "loss": 1.8116, + "step": 11387 + }, + { + "epoch": 3.4953959484346226, + "grad_norm": 0.49954715371131897, + "learning_rate": 7.553613487306465e-05, + "loss": 1.8841, + "step": 11388 + }, + { + "epoch": 3.4957028852056475, + "grad_norm": 0.28189441561698914, + "learning_rate": 7.553186133041406e-05, + "loss": 1.7834, + "step": 11389 + }, + { + "epoch": 3.496009821976673, + "grad_norm": 0.36029115319252014, + "learning_rate": 7.552758753544267e-05, + "loss": 1.7796, + "step": 11390 + }, + { + "epoch": 3.4963167587476978, + "grad_norm": 0.45023465156555176, + "learning_rate": 7.552331348819268e-05, + "loss": 1.8773, + "step": 11391 + }, + { + "epoch": 3.496623695518723, + "grad_norm": 0.3235788643360138, + "learning_rate": 7.551903918870636e-05, + "loss": 1.7984, + "step": 11392 + }, + { + "epoch": 3.4969306322897484, + "grad_norm": 0.25656190514564514, + "learning_rate": 7.551476463702596e-05, + "loss": 1.8403, + "step": 11393 + }, + { + "epoch": 3.4972375690607733, + "grad_norm": 0.2866458594799042, + "learning_rate": 7.551048983319366e-05, + "loss": 1.7428, + "step": 11394 + }, + { + "epoch": 3.4975445058317987, + "grad_norm": 0.2713877856731415, + "learning_rate": 7.550621477725177e-05, + "loss": 1.8508, + "step": 11395 + }, + { + "epoch": 3.497851442602824, + "grad_norm": 0.27978867292404175, 
+ "learning_rate": 7.55019394692425e-05, + "loss": 1.8049, + "step": 11396 + }, + { + "epoch": 3.498158379373849, + "grad_norm": 0.3275020122528076, + "learning_rate": 7.549766390920814e-05, + "loss": 1.8553, + "step": 11397 + }, + { + "epoch": 3.498465316144874, + "grad_norm": 0.29947492480278015, + "learning_rate": 7.54933880971909e-05, + "loss": 1.7614, + "step": 11398 + }, + { + "epoch": 3.4987722529158995, + "grad_norm": 0.25790849328041077, + "learning_rate": 7.548911203323308e-05, + "loss": 1.8223, + "step": 11399 + }, + { + "epoch": 3.4990791896869244, + "grad_norm": 0.3145451545715332, + "learning_rate": 7.54848357173769e-05, + "loss": 1.7642, + "step": 11400 + }, + { + "epoch": 3.4993861264579498, + "grad_norm": 0.29052913188934326, + "learning_rate": 7.548055914966463e-05, + "loss": 1.7728, + "step": 11401 + }, + { + "epoch": 3.4996930632289747, + "grad_norm": 0.2741037905216217, + "learning_rate": 7.547628233013854e-05, + "loss": 1.7382, + "step": 11402 + }, + { + "epoch": 3.5, + "grad_norm": 0.2562723755836487, + "learning_rate": 7.54720052588409e-05, + "loss": 1.7455, + "step": 11403 + }, + { + "epoch": 3.5003069367710253, + "grad_norm": 0.27649983763694763, + "learning_rate": 7.546772793581398e-05, + "loss": 1.7194, + "step": 11404 + }, + { + "epoch": 3.5006138735420502, + "grad_norm": 0.27290579676628113, + "learning_rate": 7.546345036110004e-05, + "loss": 1.87, + "step": 11405 + }, + { + "epoch": 3.5009208103130756, + "grad_norm": 0.33585605025291443, + "learning_rate": 7.545917253474136e-05, + "loss": 1.7703, + "step": 11406 + }, + { + "epoch": 3.5012277470841005, + "grad_norm": 0.2592691481113434, + "learning_rate": 7.545489445678022e-05, + "loss": 1.7657, + "step": 11407 + }, + { + "epoch": 3.501534683855126, + "grad_norm": 0.3081367015838623, + "learning_rate": 7.545061612725888e-05, + "loss": 1.8067, + "step": 11408 + }, + { + "epoch": 3.501841620626151, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.544633754621965e-05, + "loss": 
1.8009, + "step": 11409 + }, + { + "epoch": 3.5021485573971765, + "grad_norm": 0.28232479095458984, + "learning_rate": 7.54420587137048e-05, + "loss": 1.8124, + "step": 11410 + }, + { + "epoch": 3.5024554941682013, + "grad_norm": 0.24079222977161407, + "learning_rate": 7.54377796297566e-05, + "loss": 1.789, + "step": 11411 + }, + { + "epoch": 3.5027624309392267, + "grad_norm": 0.27347204089164734, + "learning_rate": 7.543350029441737e-05, + "loss": 1.7704, + "step": 11412 + }, + { + "epoch": 3.5030693677102516, + "grad_norm": 0.25545811653137207, + "learning_rate": 7.542922070772935e-05, + "loss": 1.7871, + "step": 11413 + }, + { + "epoch": 3.503376304481277, + "grad_norm": 0.2507263123989105, + "learning_rate": 7.54249408697349e-05, + "loss": 1.8424, + "step": 11414 + }, + { + "epoch": 3.5036832412523022, + "grad_norm": 0.2776084244251251, + "learning_rate": 7.542066078047627e-05, + "loss": 1.8246, + "step": 11415 + }, + { + "epoch": 3.503990178023327, + "grad_norm": 0.32833749055862427, + "learning_rate": 7.541638043999577e-05, + "loss": 1.7785, + "step": 11416 + }, + { + "epoch": 3.5042971147943525, + "grad_norm": 0.258486270904541, + "learning_rate": 7.541209984833571e-05, + "loss": 1.7543, + "step": 11417 + }, + { + "epoch": 3.5046040515653774, + "grad_norm": 0.25825178623199463, + "learning_rate": 7.540781900553837e-05, + "loss": 1.7939, + "step": 11418 + }, + { + "epoch": 3.5049109883364027, + "grad_norm": 0.26980888843536377, + "learning_rate": 7.540353791164606e-05, + "loss": 1.7777, + "step": 11419 + }, + { + "epoch": 3.505217925107428, + "grad_norm": 0.24103333055973053, + "learning_rate": 7.539925656670111e-05, + "loss": 1.7565, + "step": 11420 + }, + { + "epoch": 3.505524861878453, + "grad_norm": 0.25192007422447205, + "learning_rate": 7.539497497074584e-05, + "loss": 1.7696, + "step": 11421 + }, + { + "epoch": 3.5058317986494782, + "grad_norm": 0.218489870429039, + "learning_rate": 7.539069312382252e-05, + "loss": 1.761, + "step": 11422 + }, + { + 
"epoch": 3.506138735420503, + "grad_norm": 0.27533552050590515, + "learning_rate": 7.53864110259735e-05, + "loss": 1.7374, + "step": 11423 + }, + { + "epoch": 3.5064456721915285, + "grad_norm": 0.2603490650653839, + "learning_rate": 7.538212867724108e-05, + "loss": 1.8342, + "step": 11424 + }, + { + "epoch": 3.506752608962554, + "grad_norm": 0.27340635657310486, + "learning_rate": 7.537784607766758e-05, + "loss": 1.8099, + "step": 11425 + }, + { + "epoch": 3.507059545733579, + "grad_norm": 0.25342679023742676, + "learning_rate": 7.537356322729537e-05, + "loss": 1.7949, + "step": 11426 + }, + { + "epoch": 3.507366482504604, + "grad_norm": 0.292819082736969, + "learning_rate": 7.536928012616669e-05, + "loss": 1.9049, + "step": 11427 + }, + { + "epoch": 3.5076734192756294, + "grad_norm": 0.28256532549858093, + "learning_rate": 7.536499677432393e-05, + "loss": 1.8464, + "step": 11428 + }, + { + "epoch": 3.5079803560466543, + "grad_norm": 0.2672989070415497, + "learning_rate": 7.536071317180942e-05, + "loss": 1.8301, + "step": 11429 + }, + { + "epoch": 3.5082872928176796, + "grad_norm": 0.2525518238544464, + "learning_rate": 7.535642931866546e-05, + "loss": 1.8054, + "step": 11430 + }, + { + "epoch": 3.508594229588705, + "grad_norm": 0.2622447609901428, + "learning_rate": 7.535214521493442e-05, + "loss": 1.8293, + "step": 11431 + }, + { + "epoch": 3.50890116635973, + "grad_norm": 0.27057385444641113, + "learning_rate": 7.534786086065859e-05, + "loss": 1.7426, + "step": 11432 + }, + { + "epoch": 3.509208103130755, + "grad_norm": 0.27363866567611694, + "learning_rate": 7.534357625588038e-05, + "loss": 1.7138, + "step": 11433 + }, + { + "epoch": 3.50951503990178, + "grad_norm": 0.3029060363769531, + "learning_rate": 7.533929140064207e-05, + "loss": 1.864, + "step": 11434 + }, + { + "epoch": 3.5098219766728054, + "grad_norm": 0.3144821524620056, + "learning_rate": 7.533500629498604e-05, + "loss": 1.7846, + "step": 11435 + }, + { + "epoch": 3.5101289134438307, + "grad_norm": 
0.44535213708877563, + "learning_rate": 7.533072093895461e-05, + "loss": 1.799, + "step": 11436 + }, + { + "epoch": 3.5104358502148556, + "grad_norm": 0.25344160199165344, + "learning_rate": 7.532643533259017e-05, + "loss": 1.7391, + "step": 11437 + }, + { + "epoch": 3.510742786985881, + "grad_norm": 0.286026269197464, + "learning_rate": 7.532214947593506e-05, + "loss": 1.8436, + "step": 11438 + }, + { + "epoch": 3.511049723756906, + "grad_norm": 0.3317352533340454, + "learning_rate": 7.53178633690316e-05, + "loss": 1.8507, + "step": 11439 + }, + { + "epoch": 3.511356660527931, + "grad_norm": 0.2547265589237213, + "learning_rate": 7.53135770119222e-05, + "loss": 1.7483, + "step": 11440 + }, + { + "epoch": 3.5116635972989565, + "grad_norm": 0.24281835556030273, + "learning_rate": 7.530929040464917e-05, + "loss": 1.759, + "step": 11441 + }, + { + "epoch": 3.511970534069982, + "grad_norm": 0.2935381829738617, + "learning_rate": 7.530500354725491e-05, + "loss": 1.8235, + "step": 11442 + }, + { + "epoch": 3.5122774708410067, + "grad_norm": 0.26642969250679016, + "learning_rate": 7.53007164397818e-05, + "loss": 1.8324, + "step": 11443 + }, + { + "epoch": 3.512584407612032, + "grad_norm": 0.24830882251262665, + "learning_rate": 7.529642908227215e-05, + "loss": 1.8132, + "step": 11444 + }, + { + "epoch": 3.512891344383057, + "grad_norm": 0.3100191056728363, + "learning_rate": 7.529214147476838e-05, + "loss": 1.8453, + "step": 11445 + }, + { + "epoch": 3.5131982811540823, + "grad_norm": 0.27948811650276184, + "learning_rate": 7.528785361731282e-05, + "loss": 1.7792, + "step": 11446 + }, + { + "epoch": 3.5135052179251076, + "grad_norm": 0.26978832483291626, + "learning_rate": 7.528356550994787e-05, + "loss": 1.7857, + "step": 11447 + }, + { + "epoch": 3.5138121546961325, + "grad_norm": 0.30527836084365845, + "learning_rate": 7.527927715271592e-05, + "loss": 1.807, + "step": 11448 + }, + { + "epoch": 3.514119091467158, + "grad_norm": 0.2915664315223694, + "learning_rate": 
7.527498854565934e-05, + "loss": 1.8414, + "step": 11449 + }, + { + "epoch": 3.5144260282381827, + "grad_norm": 0.2854034900665283, + "learning_rate": 7.52706996888205e-05, + "loss": 1.793, + "step": 11450 + }, + { + "epoch": 3.514732965009208, + "grad_norm": 0.30281978845596313, + "learning_rate": 7.52664105822418e-05, + "loss": 1.7896, + "step": 11451 + }, + { + "epoch": 3.5150399017802334, + "grad_norm": 0.3317166566848755, + "learning_rate": 7.526212122596561e-05, + "loss": 1.7776, + "step": 11452 + }, + { + "epoch": 3.5153468385512583, + "grad_norm": 0.3400021195411682, + "learning_rate": 7.525783162003434e-05, + "loss": 1.8411, + "step": 11453 + }, + { + "epoch": 3.5156537753222836, + "grad_norm": 0.25169485807418823, + "learning_rate": 7.525354176449037e-05, + "loss": 1.7871, + "step": 11454 + }, + { + "epoch": 3.5159607120933085, + "grad_norm": 0.3442455530166626, + "learning_rate": 7.52492516593761e-05, + "loss": 1.7644, + "step": 11455 + }, + { + "epoch": 3.516267648864334, + "grad_norm": 0.35644033551216125, + "learning_rate": 7.524496130473394e-05, + "loss": 1.801, + "step": 11456 + }, + { + "epoch": 3.516574585635359, + "grad_norm": 0.3180185854434967, + "learning_rate": 7.524067070060625e-05, + "loss": 1.7897, + "step": 11457 + }, + { + "epoch": 3.5168815224063845, + "grad_norm": 0.2417978048324585, + "learning_rate": 7.523637984703548e-05, + "loss": 1.8527, + "step": 11458 + }, + { + "epoch": 3.5171884591774094, + "grad_norm": 0.29661375284194946, + "learning_rate": 7.5232088744064e-05, + "loss": 1.8276, + "step": 11459 + }, + { + "epoch": 3.5174953959484347, + "grad_norm": 0.2467545121908188, + "learning_rate": 7.522779739173424e-05, + "loss": 1.7819, + "step": 11460 + }, + { + "epoch": 3.5178023327194596, + "grad_norm": 0.26177898049354553, + "learning_rate": 7.522350579008859e-05, + "loss": 1.8017, + "step": 11461 + }, + { + "epoch": 3.518109269490485, + "grad_norm": 0.28740498423576355, + "learning_rate": 7.521921393916948e-05, + "loss": 1.7863, 
+ "step": 11462 + }, + { + "epoch": 3.5184162062615103, + "grad_norm": 0.28685200214385986, + "learning_rate": 7.521492183901932e-05, + "loss": 1.8069, + "step": 11463 + }, + { + "epoch": 3.518723143032535, + "grad_norm": 0.24174338579177856, + "learning_rate": 7.521062948968051e-05, + "loss": 1.7523, + "step": 11464 + }, + { + "epoch": 3.5190300798035605, + "grad_norm": 0.23273243010044098, + "learning_rate": 7.520633689119548e-05, + "loss": 1.7827, + "step": 11465 + }, + { + "epoch": 3.5193370165745854, + "grad_norm": 0.22708217799663544, + "learning_rate": 7.520204404360667e-05, + "loss": 1.7377, + "step": 11466 + }, + { + "epoch": 3.5196439533456108, + "grad_norm": 0.24725353717803955, + "learning_rate": 7.519775094695649e-05, + "loss": 1.7828, + "step": 11467 + }, + { + "epoch": 3.519950890116636, + "grad_norm": 0.23046265542507172, + "learning_rate": 7.519345760128736e-05, + "loss": 1.7427, + "step": 11468 + }, + { + "epoch": 3.520257826887661, + "grad_norm": 0.2618728280067444, + "learning_rate": 7.518916400664171e-05, + "loss": 1.8133, + "step": 11469 + }, + { + "epoch": 3.5205647636586863, + "grad_norm": 0.23232363164424896, + "learning_rate": 7.5184870163062e-05, + "loss": 1.7468, + "step": 11470 + }, + { + "epoch": 3.520871700429711, + "grad_norm": 0.21993626654148102, + "learning_rate": 7.51805760705906e-05, + "loss": 1.7565, + "step": 11471 + }, + { + "epoch": 3.5211786372007365, + "grad_norm": 0.23563124239444733, + "learning_rate": 7.517628172927001e-05, + "loss": 1.7795, + "step": 11472 + }, + { + "epoch": 3.521485573971762, + "grad_norm": 0.24502862989902496, + "learning_rate": 7.517198713914266e-05, + "loss": 1.813, + "step": 11473 + }, + { + "epoch": 3.521792510742787, + "grad_norm": 0.24745969474315643, + "learning_rate": 7.516769230025097e-05, + "loss": 1.7601, + "step": 11474 + }, + { + "epoch": 3.522099447513812, + "grad_norm": 0.27686986327171326, + "learning_rate": 7.516339721263739e-05, + "loss": 1.8121, + "step": 11475 + }, + { + "epoch": 
3.5224063842848374, + "grad_norm": 0.3110332787036896, + "learning_rate": 7.515910187634439e-05, + "loss": 1.7978, + "step": 11476 + }, + { + "epoch": 3.5227133210558623, + "grad_norm": 0.3394792377948761, + "learning_rate": 7.515480629141436e-05, + "loss": 1.8427, + "step": 11477 + }, + { + "epoch": 3.5230202578268877, + "grad_norm": 0.2802537679672241, + "learning_rate": 7.515051045788984e-05, + "loss": 1.7343, + "step": 11478 + }, + { + "epoch": 3.523327194597913, + "grad_norm": 0.23687711358070374, + "learning_rate": 7.514621437581319e-05, + "loss": 1.7786, + "step": 11479 + }, + { + "epoch": 3.523634131368938, + "grad_norm": 0.31114310026168823, + "learning_rate": 7.514191804522693e-05, + "loss": 1.8137, + "step": 11480 + }, + { + "epoch": 3.523941068139963, + "grad_norm": 0.3257891833782196, + "learning_rate": 7.513762146617351e-05, + "loss": 1.8015, + "step": 11481 + }, + { + "epoch": 3.524248004910988, + "grad_norm": 0.24353443086147308, + "learning_rate": 7.513332463869536e-05, + "loss": 1.7485, + "step": 11482 + }, + { + "epoch": 3.5245549416820134, + "grad_norm": 0.29861485958099365, + "learning_rate": 7.512902756283498e-05, + "loss": 1.7993, + "step": 11483 + }, + { + "epoch": 3.5248618784530388, + "grad_norm": 0.40380924940109253, + "learning_rate": 7.51247302386348e-05, + "loss": 1.7664, + "step": 11484 + }, + { + "epoch": 3.525168815224064, + "grad_norm": 0.3365862965583801, + "learning_rate": 7.512043266613733e-05, + "loss": 1.7512, + "step": 11485 + }, + { + "epoch": 3.525475751995089, + "grad_norm": 0.2502824068069458, + "learning_rate": 7.511613484538502e-05, + "loss": 1.8414, + "step": 11486 + }, + { + "epoch": 3.5257826887661143, + "grad_norm": 0.2598603069782257, + "learning_rate": 7.511183677642034e-05, + "loss": 1.7358, + "step": 11487 + }, + { + "epoch": 3.5260896255371392, + "grad_norm": 0.30246880650520325, + "learning_rate": 7.510753845928576e-05, + "loss": 1.791, + "step": 11488 + }, + { + "epoch": 3.5263965623081646, + "grad_norm": 
0.25170832872390747, + "learning_rate": 7.510323989402378e-05, + "loss": 1.7498, + "step": 11489 + }, + { + "epoch": 3.52670349907919, + "grad_norm": 0.2925282418727875, + "learning_rate": 7.509894108067688e-05, + "loss": 1.8413, + "step": 11490 + }, + { + "epoch": 3.527010435850215, + "grad_norm": 0.2643601596355438, + "learning_rate": 7.509464201928752e-05, + "loss": 1.8052, + "step": 11491 + }, + { + "epoch": 3.52731737262124, + "grad_norm": 0.2938917279243469, + "learning_rate": 7.50903427098982e-05, + "loss": 1.7308, + "step": 11492 + }, + { + "epoch": 3.527624309392265, + "grad_norm": 0.2978343367576599, + "learning_rate": 7.508604315255142e-05, + "loss": 1.8147, + "step": 11493 + }, + { + "epoch": 3.5279312461632903, + "grad_norm": 0.2507816255092621, + "learning_rate": 7.508174334728963e-05, + "loss": 1.774, + "step": 11494 + }, + { + "epoch": 3.5282381829343157, + "grad_norm": 0.32971861958503723, + "learning_rate": 7.507744329415538e-05, + "loss": 1.7634, + "step": 11495 + }, + { + "epoch": 3.5285451197053406, + "grad_norm": 0.3149639964103699, + "learning_rate": 7.507314299319113e-05, + "loss": 1.8032, + "step": 11496 + }, + { + "epoch": 3.528852056476366, + "grad_norm": 0.2721364498138428, + "learning_rate": 7.506884244443937e-05, + "loss": 1.7702, + "step": 11497 + }, + { + "epoch": 3.529158993247391, + "grad_norm": 0.29375985264778137, + "learning_rate": 7.506454164794263e-05, + "loss": 1.8673, + "step": 11498 + }, + { + "epoch": 3.529465930018416, + "grad_norm": 0.379944384098053, + "learning_rate": 7.50602406037434e-05, + "loss": 1.883, + "step": 11499 + }, + { + "epoch": 3.5297728667894415, + "grad_norm": 0.4041840136051178, + "learning_rate": 7.505593931188417e-05, + "loss": 1.7998, + "step": 11500 + }, + { + "epoch": 3.530079803560467, + "grad_norm": 0.30013784766197205, + "learning_rate": 7.505163777240747e-05, + "loss": 1.775, + "step": 11501 + }, + { + "epoch": 3.5303867403314917, + "grad_norm": 0.25161153078079224, + "learning_rate": 
7.50473359853558e-05, + "loss": 1.8609, + "step": 11502 + }, + { + "epoch": 3.530693677102517, + "grad_norm": 0.2803831100463867, + "learning_rate": 7.504303395077168e-05, + "loss": 1.8397, + "step": 11503 + }, + { + "epoch": 3.531000613873542, + "grad_norm": 0.26678118109703064, + "learning_rate": 7.503873166869762e-05, + "loss": 1.7877, + "step": 11504 + }, + { + "epoch": 3.5313075506445673, + "grad_norm": 0.24280449748039246, + "learning_rate": 7.503442913917613e-05, + "loss": 1.7891, + "step": 11505 + }, + { + "epoch": 3.5316144874155926, + "grad_norm": 0.26461485028266907, + "learning_rate": 7.503012636224976e-05, + "loss": 1.7993, + "step": 11506 + }, + { + "epoch": 3.5319214241866175, + "grad_norm": 0.27001824975013733, + "learning_rate": 7.502582333796098e-05, + "loss": 1.7719, + "step": 11507 + }, + { + "epoch": 3.532228360957643, + "grad_norm": 0.27585846185684204, + "learning_rate": 7.502152006635237e-05, + "loss": 1.7412, + "step": 11508 + }, + { + "epoch": 3.5325352977286677, + "grad_norm": 0.24896648526191711, + "learning_rate": 7.501721654746643e-05, + "loss": 1.7459, + "step": 11509 + }, + { + "epoch": 3.532842234499693, + "grad_norm": 0.2308502197265625, + "learning_rate": 7.501291278134569e-05, + "loss": 1.7717, + "step": 11510 + }, + { + "epoch": 3.5331491712707184, + "grad_norm": 0.3026069104671478, + "learning_rate": 7.500860876803267e-05, + "loss": 1.8578, + "step": 11511 + }, + { + "epoch": 3.5334561080417433, + "grad_norm": 0.30242082476615906, + "learning_rate": 7.500430450756995e-05, + "loss": 1.7793, + "step": 11512 + }, + { + "epoch": 3.5337630448127686, + "grad_norm": 0.2583339214324951, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8388, + "step": 11513 + }, + { + "epoch": 3.5340699815837935, + "grad_norm": 0.29673871397972107, + "learning_rate": 7.499569524536542e-05, + "loss": 1.7749, + "step": 11514 + }, + { + "epoch": 3.534376918354819, + "grad_norm": 0.35199788212776184, + "learning_rate": 7.499139024370874e-05, + "loss": 
1.7863, + "step": 11515 + }, + { + "epoch": 3.534683855125844, + "grad_norm": 0.25776436924934387, + "learning_rate": 7.498708499507247e-05, + "loss": 1.7568, + "step": 11516 + }, + { + "epoch": 3.5349907918968695, + "grad_norm": 0.26081520318984985, + "learning_rate": 7.498277949949919e-05, + "loss": 1.807, + "step": 11517 + }, + { + "epoch": 3.5352977286678944, + "grad_norm": 0.29247912764549255, + "learning_rate": 7.497847375703145e-05, + "loss": 1.7568, + "step": 11518 + }, + { + "epoch": 3.5356046654389197, + "grad_norm": 0.20964498817920685, + "learning_rate": 7.497416776771178e-05, + "loss": 1.7601, + "step": 11519 + }, + { + "epoch": 3.5359116022099446, + "grad_norm": 0.28739818930625916, + "learning_rate": 7.496986153158273e-05, + "loss": 1.7915, + "step": 11520 + }, + { + "epoch": 3.53621853898097, + "grad_norm": 0.3109932839870453, + "learning_rate": 7.496555504868691e-05, + "loss": 1.8046, + "step": 11521 + }, + { + "epoch": 3.5365254757519953, + "grad_norm": 0.259284108877182, + "learning_rate": 7.496124831906681e-05, + "loss": 1.7595, + "step": 11522 + }, + { + "epoch": 3.53683241252302, + "grad_norm": 0.265909343957901, + "learning_rate": 7.495694134276504e-05, + "loss": 1.8249, + "step": 11523 + }, + { + "epoch": 3.5371393492940455, + "grad_norm": 0.2478799819946289, + "learning_rate": 7.495263411982415e-05, + "loss": 1.8531, + "step": 11524 + }, + { + "epoch": 3.5374462860650704, + "grad_norm": 0.2636432945728302, + "learning_rate": 7.494832665028671e-05, + "loss": 1.8114, + "step": 11525 + }, + { + "epoch": 3.5377532228360957, + "grad_norm": 0.25323864817619324, + "learning_rate": 7.494401893419527e-05, + "loss": 1.8271, + "step": 11526 + }, + { + "epoch": 3.538060159607121, + "grad_norm": 0.2352467179298401, + "learning_rate": 7.493971097159241e-05, + "loss": 1.7524, + "step": 11527 + }, + { + "epoch": 3.538367096378146, + "grad_norm": 0.2788623869419098, + "learning_rate": 7.493540276252072e-05, + "loss": 1.8238, + "step": 11528 + }, + { + 
"epoch": 3.5386740331491713, + "grad_norm": 0.3506326377391815, + "learning_rate": 7.493109430702277e-05, + "loss": 1.8525, + "step": 11529 + }, + { + "epoch": 3.538980969920196, + "grad_norm": 0.3685263395309448, + "learning_rate": 7.492678560514113e-05, + "loss": 1.8497, + "step": 11530 + }, + { + "epoch": 3.5392879066912215, + "grad_norm": 0.32200056314468384, + "learning_rate": 7.492247665691837e-05, + "loss": 1.7587, + "step": 11531 + }, + { + "epoch": 3.539594843462247, + "grad_norm": 0.2800062894821167, + "learning_rate": 7.49181674623971e-05, + "loss": 1.8188, + "step": 11532 + }, + { + "epoch": 3.539901780233272, + "grad_norm": 0.24137580394744873, + "learning_rate": 7.491385802161989e-05, + "loss": 1.7947, + "step": 11533 + }, + { + "epoch": 3.540208717004297, + "grad_norm": 0.21900027990341187, + "learning_rate": 7.490954833462933e-05, + "loss": 1.7722, + "step": 11534 + }, + { + "epoch": 3.5405156537753224, + "grad_norm": 0.25009945034980774, + "learning_rate": 7.490523840146803e-05, + "loss": 1.8173, + "step": 11535 + }, + { + "epoch": 3.5408225905463473, + "grad_norm": 0.2778431475162506, + "learning_rate": 7.490092822217855e-05, + "loss": 1.8368, + "step": 11536 + }, + { + "epoch": 3.5411295273173726, + "grad_norm": 0.2845982611179352, + "learning_rate": 7.48966177968035e-05, + "loss": 1.7539, + "step": 11537 + }, + { + "epoch": 3.541436464088398, + "grad_norm": 0.27480921149253845, + "learning_rate": 7.48923071253855e-05, + "loss": 1.8494, + "step": 11538 + }, + { + "epoch": 3.541743400859423, + "grad_norm": 0.2722087502479553, + "learning_rate": 7.488799620796711e-05, + "loss": 1.8422, + "step": 11539 + }, + { + "epoch": 3.542050337630448, + "grad_norm": 0.2984340190887451, + "learning_rate": 7.488368504459097e-05, + "loss": 1.8042, + "step": 11540 + }, + { + "epoch": 3.542357274401473, + "grad_norm": 0.2405850738286972, + "learning_rate": 7.487937363529966e-05, + "loss": 1.749, + "step": 11541 + }, + { + "epoch": 3.5426642111724984, + "grad_norm": 
0.24816973507404327, + "learning_rate": 7.487506198013579e-05, + "loss": 1.8671, + "step": 11542 + }, + { + "epoch": 3.5429711479435237, + "grad_norm": 0.2796473503112793, + "learning_rate": 7.487075007914199e-05, + "loss": 1.8023, + "step": 11543 + }, + { + "epoch": 3.5432780847145486, + "grad_norm": 0.2600162625312805, + "learning_rate": 7.486643793236086e-05, + "loss": 1.7997, + "step": 11544 + }, + { + "epoch": 3.543585021485574, + "grad_norm": 0.2746226489543915, + "learning_rate": 7.486212553983503e-05, + "loss": 1.7773, + "step": 11545 + }, + { + "epoch": 3.5438919582565993, + "grad_norm": 0.24142079055309296, + "learning_rate": 7.485781290160708e-05, + "loss": 1.791, + "step": 11546 + }, + { + "epoch": 3.544198895027624, + "grad_norm": 0.2472934126853943, + "learning_rate": 7.485350001771966e-05, + "loss": 1.8183, + "step": 11547 + }, + { + "epoch": 3.5445058317986495, + "grad_norm": 0.26891404390335083, + "learning_rate": 7.48491868882154e-05, + "loss": 1.7421, + "step": 11548 + }, + { + "epoch": 3.544812768569675, + "grad_norm": 0.24820464849472046, + "learning_rate": 7.48448735131369e-05, + "loss": 1.7372, + "step": 11549 + }, + { + "epoch": 3.5451197053406998, + "grad_norm": 0.2456594705581665, + "learning_rate": 7.484055989252679e-05, + "loss": 1.7883, + "step": 11550 + }, + { + "epoch": 3.545426642111725, + "grad_norm": 0.32420551776885986, + "learning_rate": 7.48362460264277e-05, + "loss": 1.8363, + "step": 11551 + }, + { + "epoch": 3.54573357888275, + "grad_norm": 0.3187662661075592, + "learning_rate": 7.483193191488229e-05, + "loss": 1.7957, + "step": 11552 + }, + { + "epoch": 3.5460405156537753, + "grad_norm": 0.2845410108566284, + "learning_rate": 7.482761755793316e-05, + "loss": 1.8288, + "step": 11553 + }, + { + "epoch": 3.5463474524248007, + "grad_norm": 0.2816021740436554, + "learning_rate": 7.482330295562298e-05, + "loss": 1.7562, + "step": 11554 + }, + { + "epoch": 3.5466543891958255, + "grad_norm": 0.28938058018684387, + "learning_rate": 
7.481898810799435e-05, + "loss": 1.8139, + "step": 11555 + }, + { + "epoch": 3.546961325966851, + "grad_norm": 0.3305707573890686, + "learning_rate": 7.481467301508995e-05, + "loss": 1.8956, + "step": 11556 + }, + { + "epoch": 3.5472682627378758, + "grad_norm": 0.3890376091003418, + "learning_rate": 7.48103576769524e-05, + "loss": 1.8552, + "step": 11557 + }, + { + "epoch": 3.547575199508901, + "grad_norm": 0.3900652825832367, + "learning_rate": 7.480604209362434e-05, + "loss": 1.7748, + "step": 11558 + }, + { + "epoch": 3.5478821362799264, + "grad_norm": 0.3297326862812042, + "learning_rate": 7.480172626514845e-05, + "loss": 1.8201, + "step": 11559 + }, + { + "epoch": 3.5481890730509518, + "grad_norm": 0.28797218203544617, + "learning_rate": 7.479741019156737e-05, + "loss": 1.7652, + "step": 11560 + }, + { + "epoch": 3.5484960098219767, + "grad_norm": 0.2764691114425659, + "learning_rate": 7.479309387292373e-05, + "loss": 1.7534, + "step": 11561 + }, + { + "epoch": 3.548802946593002, + "grad_norm": 0.25067585706710815, + "learning_rate": 7.47887773092602e-05, + "loss": 1.7849, + "step": 11562 + }, + { + "epoch": 3.549109883364027, + "grad_norm": 0.29966798424720764, + "learning_rate": 7.478446050061947e-05, + "loss": 1.8299, + "step": 11563 + }, + { + "epoch": 3.549416820135052, + "grad_norm": 0.24068406224250793, + "learning_rate": 7.478014344704416e-05, + "loss": 1.8366, + "step": 11564 + }, + { + "epoch": 3.5497237569060776, + "grad_norm": 0.2559303641319275, + "learning_rate": 7.477582614857695e-05, + "loss": 1.7665, + "step": 11565 + }, + { + "epoch": 3.5500306936771024, + "grad_norm": 0.24617858231067657, + "learning_rate": 7.47715086052605e-05, + "loss": 1.8334, + "step": 11566 + }, + { + "epoch": 3.550337630448128, + "grad_norm": 0.2433501034975052, + "learning_rate": 7.476719081713749e-05, + "loss": 1.7963, + "step": 11567 + }, + { + "epoch": 3.5506445672191527, + "grad_norm": 0.2583518326282501, + "learning_rate": 7.476287278425057e-05, + "loss": 1.8311, 
+ "step": 11568 + }, + { + "epoch": 3.550951503990178, + "grad_norm": 0.3232485055923462, + "learning_rate": 7.475855450664244e-05, + "loss": 1.9162, + "step": 11569 + }, + { + "epoch": 3.5512584407612033, + "grad_norm": 0.28247153759002686, + "learning_rate": 7.475423598435576e-05, + "loss": 1.8027, + "step": 11570 + }, + { + "epoch": 3.5515653775322282, + "grad_norm": 0.27201834321022034, + "learning_rate": 7.47499172174332e-05, + "loss": 1.7822, + "step": 11571 + }, + { + "epoch": 3.5518723143032536, + "grad_norm": 0.2408471554517746, + "learning_rate": 7.474559820591748e-05, + "loss": 1.7735, + "step": 11572 + }, + { + "epoch": 3.5521792510742785, + "grad_norm": 0.24187393486499786, + "learning_rate": 7.474127894985124e-05, + "loss": 1.7931, + "step": 11573 + }, + { + "epoch": 3.552486187845304, + "grad_norm": 0.2759699523448944, + "learning_rate": 7.473695944927717e-05, + "loss": 1.8407, + "step": 11574 + }, + { + "epoch": 3.552793124616329, + "grad_norm": 0.2503111958503723, + "learning_rate": 7.473263970423797e-05, + "loss": 1.7613, + "step": 11575 + }, + { + "epoch": 3.5531000613873545, + "grad_norm": 0.24795177578926086, + "learning_rate": 7.472831971477633e-05, + "loss": 1.8221, + "step": 11576 + }, + { + "epoch": 3.5534069981583793, + "grad_norm": 0.23190177977085114, + "learning_rate": 7.472399948093494e-05, + "loss": 1.7541, + "step": 11577 + }, + { + "epoch": 3.5537139349294047, + "grad_norm": 0.24650825560092926, + "learning_rate": 7.471967900275653e-05, + "loss": 1.8002, + "step": 11578 + }, + { + "epoch": 3.5540208717004296, + "grad_norm": 0.256598562002182, + "learning_rate": 7.471535828028372e-05, + "loss": 1.7052, + "step": 11579 + }, + { + "epoch": 3.554327808471455, + "grad_norm": 0.2715381681919098, + "learning_rate": 7.471103731355926e-05, + "loss": 1.7701, + "step": 11580 + }, + { + "epoch": 3.5546347452424802, + "grad_norm": 0.29806044697761536, + "learning_rate": 7.470671610262586e-05, + "loss": 1.7614, + "step": 11581 + }, + { + "epoch": 
3.554941682013505, + "grad_norm": 0.26364314556121826, + "learning_rate": 7.470239464752621e-05, + "loss": 1.7957, + "step": 11582 + }, + { + "epoch": 3.5552486187845305, + "grad_norm": 0.29270800948143005, + "learning_rate": 7.4698072948303e-05, + "loss": 1.8263, + "step": 11583 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.25941839814186096, + "learning_rate": 7.469375100499898e-05, + "loss": 1.8517, + "step": 11584 + }, + { + "epoch": 3.5558624923265807, + "grad_norm": 0.29509237408638, + "learning_rate": 7.468942881765681e-05, + "loss": 1.8643, + "step": 11585 + }, + { + "epoch": 3.556169429097606, + "grad_norm": 0.23090367019176483, + "learning_rate": 7.468510638631926e-05, + "loss": 1.7239, + "step": 11586 + }, + { + "epoch": 3.556476365868631, + "grad_norm": 0.2696724236011505, + "learning_rate": 7.468078371102901e-05, + "loss": 1.848, + "step": 11587 + }, + { + "epoch": 3.5567833026396563, + "grad_norm": 0.2691192626953125, + "learning_rate": 7.46764607918288e-05, + "loss": 1.8194, + "step": 11588 + }, + { + "epoch": 3.557090239410681, + "grad_norm": 0.26616501808166504, + "learning_rate": 7.467213762876131e-05, + "loss": 1.8382, + "step": 11589 + }, + { + "epoch": 3.5573971761817065, + "grad_norm": 0.30629831552505493, + "learning_rate": 7.466781422186933e-05, + "loss": 1.8417, + "step": 11590 + }, + { + "epoch": 3.557704112952732, + "grad_norm": 0.27212417125701904, + "learning_rate": 7.466349057119552e-05, + "loss": 1.7612, + "step": 11591 + }, + { + "epoch": 3.558011049723757, + "grad_norm": 0.2872084379196167, + "learning_rate": 7.465916667678266e-05, + "loss": 1.7998, + "step": 11592 + }, + { + "epoch": 3.558317986494782, + "grad_norm": 0.3017117977142334, + "learning_rate": 7.465484253867348e-05, + "loss": 1.7996, + "step": 11593 + }, + { + "epoch": 3.5586249232658074, + "grad_norm": 0.2707957327365875, + "learning_rate": 7.465051815691066e-05, + "loss": 1.7678, + "step": 11594 + }, + { + "epoch": 3.5589318600368323, + "grad_norm": 
0.28932711482048035, + "learning_rate": 7.464619353153702e-05, + "loss": 1.8576, + "step": 11595 + }, + { + "epoch": 3.5592387968078576, + "grad_norm": 0.2585125267505646, + "learning_rate": 7.464186866259519e-05, + "loss": 1.8678, + "step": 11596 + }, + { + "epoch": 3.559545733578883, + "grad_norm": 0.24386851489543915, + "learning_rate": 7.4637543550128e-05, + "loss": 1.7778, + "step": 11597 + }, + { + "epoch": 3.559852670349908, + "grad_norm": 0.2375860959291458, + "learning_rate": 7.463321819417817e-05, + "loss": 1.8096, + "step": 11598 + }, + { + "epoch": 3.560159607120933, + "grad_norm": 0.2341299206018448, + "learning_rate": 7.462889259478842e-05, + "loss": 1.7191, + "step": 11599 + }, + { + "epoch": 3.560466543891958, + "grad_norm": 0.2510595917701721, + "learning_rate": 7.462456675200154e-05, + "loss": 1.7763, + "step": 11600 + }, + { + "epoch": 3.5607734806629834, + "grad_norm": 0.2554674744606018, + "learning_rate": 7.462024066586025e-05, + "loss": 1.7578, + "step": 11601 + }, + { + "epoch": 3.5610804174340087, + "grad_norm": 0.25040730834007263, + "learning_rate": 7.46159143364073e-05, + "loss": 1.8194, + "step": 11602 + }, + { + "epoch": 3.5613873542050336, + "grad_norm": 0.24294932186603546, + "learning_rate": 7.461158776368547e-05, + "loss": 1.8063, + "step": 11603 + }, + { + "epoch": 3.561694290976059, + "grad_norm": 0.2388325333595276, + "learning_rate": 7.46072609477375e-05, + "loss": 1.7942, + "step": 11604 + }, + { + "epoch": 3.562001227747084, + "grad_norm": 0.2569502890110016, + "learning_rate": 7.460293388860615e-05, + "loss": 1.7824, + "step": 11605 + }, + { + "epoch": 3.562308164518109, + "grad_norm": 0.24004346132278442, + "learning_rate": 7.45986065863342e-05, + "loss": 1.8676, + "step": 11606 + }, + { + "epoch": 3.5626151012891345, + "grad_norm": 0.25446319580078125, + "learning_rate": 7.45942790409644e-05, + "loss": 1.7726, + "step": 11607 + }, + { + "epoch": 3.56292203806016, + "grad_norm": 0.26257482171058655, + "learning_rate": 
7.458995125253951e-05, + "loss": 1.779, + "step": 11608 + }, + { + "epoch": 3.5632289748311847, + "grad_norm": 0.27703070640563965, + "learning_rate": 7.458562322110231e-05, + "loss": 1.8247, + "step": 11609 + }, + { + "epoch": 3.56353591160221, + "grad_norm": 0.25478535890579224, + "learning_rate": 7.458129494669556e-05, + "loss": 1.7794, + "step": 11610 + }, + { + "epoch": 3.563842848373235, + "grad_norm": 0.26173365116119385, + "learning_rate": 7.457696642936207e-05, + "loss": 1.758, + "step": 11611 + }, + { + "epoch": 3.5641497851442603, + "grad_norm": 0.25077274441719055, + "learning_rate": 7.45726376691446e-05, + "loss": 1.8234, + "step": 11612 + }, + { + "epoch": 3.5644567219152856, + "grad_norm": 0.2591109275817871, + "learning_rate": 7.456830866608589e-05, + "loss": 1.7723, + "step": 11613 + }, + { + "epoch": 3.5647636586863105, + "grad_norm": 0.2653447091579437, + "learning_rate": 7.456397942022877e-05, + "loss": 1.7839, + "step": 11614 + }, + { + "epoch": 3.565070595457336, + "grad_norm": 0.3203454911708832, + "learning_rate": 7.455964993161601e-05, + "loss": 1.8548, + "step": 11615 + }, + { + "epoch": 3.5653775322283607, + "grad_norm": 0.3041793704032898, + "learning_rate": 7.455532020029039e-05, + "loss": 1.7925, + "step": 11616 + }, + { + "epoch": 3.565684468999386, + "grad_norm": 0.26066139340400696, + "learning_rate": 7.45509902262947e-05, + "loss": 1.7905, + "step": 11617 + }, + { + "epoch": 3.5659914057704114, + "grad_norm": 0.2483314871788025, + "learning_rate": 7.454666000967174e-05, + "loss": 1.7658, + "step": 11618 + }, + { + "epoch": 3.5662983425414367, + "grad_norm": 0.24285900592803955, + "learning_rate": 7.45423295504643e-05, + "loss": 1.7575, + "step": 11619 + }, + { + "epoch": 3.5666052793124616, + "grad_norm": 0.27231669425964355, + "learning_rate": 7.453799884871517e-05, + "loss": 1.8389, + "step": 11620 + }, + { + "epoch": 3.566912216083487, + "grad_norm": 0.24324406683444977, + "learning_rate": 7.453366790446717e-05, + "loss": 
1.7775, + "step": 11621 + }, + { + "epoch": 3.567219152854512, + "grad_norm": 0.2724440097808838, + "learning_rate": 7.452933671776305e-05, + "loss": 1.8135, + "step": 11622 + }, + { + "epoch": 3.567526089625537, + "grad_norm": 0.22207655012607574, + "learning_rate": 7.452500528864568e-05, + "loss": 1.722, + "step": 11623 + }, + { + "epoch": 3.5678330263965625, + "grad_norm": 0.25650298595428467, + "learning_rate": 7.452067361715782e-05, + "loss": 1.7813, + "step": 11624 + }, + { + "epoch": 3.5681399631675874, + "grad_norm": 0.2582200765609741, + "learning_rate": 7.45163417033423e-05, + "loss": 1.8253, + "step": 11625 + }, + { + "epoch": 3.5684468999386127, + "grad_norm": 0.29545384645462036, + "learning_rate": 7.451200954724188e-05, + "loss": 1.8108, + "step": 11626 + }, + { + "epoch": 3.5687538367096376, + "grad_norm": 0.30457428097724915, + "learning_rate": 7.450767714889946e-05, + "loss": 1.8257, + "step": 11627 + }, + { + "epoch": 3.569060773480663, + "grad_norm": 0.2955166697502136, + "learning_rate": 7.450334450835781e-05, + "loss": 1.8172, + "step": 11628 + }, + { + "epoch": 3.5693677102516883, + "grad_norm": 0.2793857753276825, + "learning_rate": 7.449901162565974e-05, + "loss": 1.8493, + "step": 11629 + }, + { + "epoch": 3.569674647022713, + "grad_norm": 0.27154335379600525, + "learning_rate": 7.449467850084808e-05, + "loss": 1.8306, + "step": 11630 + }, + { + "epoch": 3.5699815837937385, + "grad_norm": 0.22336189448833466, + "learning_rate": 7.449034513396564e-05, + "loss": 1.7435, + "step": 11631 + }, + { + "epoch": 3.5702885205647634, + "grad_norm": 0.22799183428287506, + "learning_rate": 7.448601152505526e-05, + "loss": 1.7818, + "step": 11632 + }, + { + "epoch": 3.5705954573357888, + "grad_norm": 0.26670658588409424, + "learning_rate": 7.448167767415976e-05, + "loss": 1.7777, + "step": 11633 + }, + { + "epoch": 3.570902394106814, + "grad_norm": 0.2848666310310364, + "learning_rate": 7.447734358132196e-05, + "loss": 1.7572, + "step": 11634 + }, + { + 
"epoch": 3.5712093308778394, + "grad_norm": 0.26843544840812683, + "learning_rate": 7.447300924658473e-05, + "loss": 1.7642, + "step": 11635 + }, + { + "epoch": 3.5715162676488643, + "grad_norm": 0.24666404724121094, + "learning_rate": 7.446867466999087e-05, + "loss": 1.7533, + "step": 11636 + }, + { + "epoch": 3.5718232044198897, + "grad_norm": 0.31111210584640503, + "learning_rate": 7.44643398515832e-05, + "loss": 1.7875, + "step": 11637 + }, + { + "epoch": 3.5721301411909145, + "grad_norm": 0.3157108724117279, + "learning_rate": 7.446000479140462e-05, + "loss": 1.7879, + "step": 11638 + }, + { + "epoch": 3.57243707796194, + "grad_norm": 0.2935558259487152, + "learning_rate": 7.445566948949792e-05, + "loss": 1.7819, + "step": 11639 + }, + { + "epoch": 3.572744014732965, + "grad_norm": 0.2265472710132599, + "learning_rate": 7.445133394590597e-05, + "loss": 1.7518, + "step": 11640 + }, + { + "epoch": 3.57305095150399, + "grad_norm": 0.2564176023006439, + "learning_rate": 7.444699816067159e-05, + "loss": 1.7281, + "step": 11641 + }, + { + "epoch": 3.5733578882750154, + "grad_norm": 0.27933555841445923, + "learning_rate": 7.444266213383766e-05, + "loss": 1.7852, + "step": 11642 + }, + { + "epoch": 3.5736648250460403, + "grad_norm": 0.29105356335639954, + "learning_rate": 7.4438325865447e-05, + "loss": 1.8056, + "step": 11643 + }, + { + "epoch": 3.5739717618170657, + "grad_norm": 0.27665549516677856, + "learning_rate": 7.443398935554249e-05, + "loss": 1.7249, + "step": 11644 + }, + { + "epoch": 3.574278698588091, + "grad_norm": 0.21899232268333435, + "learning_rate": 7.442965260416698e-05, + "loss": 1.7689, + "step": 11645 + }, + { + "epoch": 3.574585635359116, + "grad_norm": 0.3250672221183777, + "learning_rate": 7.442531561136333e-05, + "loss": 1.8058, + "step": 11646 + }, + { + "epoch": 3.574892572130141, + "grad_norm": 0.42442524433135986, + "learning_rate": 7.442097837717438e-05, + "loss": 1.7887, + "step": 11647 + }, + { + "epoch": 3.575199508901166, + 
"grad_norm": 0.33108964562416077, + "learning_rate": 7.441664090164302e-05, + "loss": 1.7628, + "step": 11648 + }, + { + "epoch": 3.5755064456721914, + "grad_norm": 0.23050357401371002, + "learning_rate": 7.44123031848121e-05, + "loss": 1.8121, + "step": 11649 + }, + { + "epoch": 3.575813382443217, + "grad_norm": 0.29251593351364136, + "learning_rate": 7.440796522672448e-05, + "loss": 1.8051, + "step": 11650 + }, + { + "epoch": 3.576120319214242, + "grad_norm": 0.3764750063419342, + "learning_rate": 7.440362702742305e-05, + "loss": 1.9002, + "step": 11651 + }, + { + "epoch": 3.576427255985267, + "grad_norm": 0.3751949071884155, + "learning_rate": 7.439928858695069e-05, + "loss": 1.821, + "step": 11652 + }, + { + "epoch": 3.5767341927562923, + "grad_norm": 0.268476665019989, + "learning_rate": 7.439494990535024e-05, + "loss": 1.8241, + "step": 11653 + }, + { + "epoch": 3.5770411295273172, + "grad_norm": 0.3072795271873474, + "learning_rate": 7.439061098266459e-05, + "loss": 1.8169, + "step": 11654 + }, + { + "epoch": 3.5773480662983426, + "grad_norm": 0.4948901832103729, + "learning_rate": 7.438627181893664e-05, + "loss": 1.7706, + "step": 11655 + }, + { + "epoch": 3.577655003069368, + "grad_norm": 0.5892601013183594, + "learning_rate": 7.438193241420926e-05, + "loss": 1.7631, + "step": 11656 + }, + { + "epoch": 3.577961939840393, + "grad_norm": 0.4599401652812958, + "learning_rate": 7.437759276852533e-05, + "loss": 1.7471, + "step": 11657 + }, + { + "epoch": 3.578268876611418, + "grad_norm": 0.2545170783996582, + "learning_rate": 7.437325288192773e-05, + "loss": 1.7945, + "step": 11658 + }, + { + "epoch": 3.578575813382443, + "grad_norm": 0.3136496841907501, + "learning_rate": 7.436891275445938e-05, + "loss": 1.828, + "step": 11659 + }, + { + "epoch": 3.5788827501534684, + "grad_norm": 0.3631688058376312, + "learning_rate": 7.436457238616313e-05, + "loss": 1.8302, + "step": 11660 + }, + { + "epoch": 3.5791896869244937, + "grad_norm": 0.3097386658191681, + 
"learning_rate": 7.436023177708192e-05, + "loss": 1.8397, + "step": 11661 + }, + { + "epoch": 3.5794966236955186, + "grad_norm": 0.20948798954486847, + "learning_rate": 7.43558909272586e-05, + "loss": 1.7844, + "step": 11662 + }, + { + "epoch": 3.579803560466544, + "grad_norm": 0.24327392876148224, + "learning_rate": 7.43515498367361e-05, + "loss": 1.7827, + "step": 11663 + }, + { + "epoch": 3.580110497237569, + "grad_norm": 0.25268325209617615, + "learning_rate": 7.434720850555731e-05, + "loss": 1.8224, + "step": 11664 + }, + { + "epoch": 3.580417434008594, + "grad_norm": 0.24883607029914856, + "learning_rate": 7.434286693376513e-05, + "loss": 1.8189, + "step": 11665 + }, + { + "epoch": 3.5807243707796195, + "grad_norm": 0.2942518889904022, + "learning_rate": 7.433852512140248e-05, + "loss": 1.8325, + "step": 11666 + }, + { + "epoch": 3.581031307550645, + "grad_norm": 0.3556186556816101, + "learning_rate": 7.433418306851225e-05, + "loss": 1.7511, + "step": 11667 + }, + { + "epoch": 3.5813382443216697, + "grad_norm": 0.421220600605011, + "learning_rate": 7.432984077513738e-05, + "loss": 1.8081, + "step": 11668 + }, + { + "epoch": 3.581645181092695, + "grad_norm": 0.3338243067264557, + "learning_rate": 7.432549824132074e-05, + "loss": 1.8274, + "step": 11669 + }, + { + "epoch": 3.58195211786372, + "grad_norm": 0.25091543793678284, + "learning_rate": 7.432115546710528e-05, + "loss": 1.7637, + "step": 11670 + }, + { + "epoch": 3.5822590546347453, + "grad_norm": 0.29870370030403137, + "learning_rate": 7.431681245253389e-05, + "loss": 1.8036, + "step": 11671 + }, + { + "epoch": 3.5825659914057706, + "grad_norm": 0.2682137191295624, + "learning_rate": 7.431246919764953e-05, + "loss": 1.8252, + "step": 11672 + }, + { + "epoch": 3.5828729281767955, + "grad_norm": 0.28790801763534546, + "learning_rate": 7.430812570249508e-05, + "loss": 1.7713, + "step": 11673 + }, + { + "epoch": 3.583179864947821, + "grad_norm": 0.26357609033584595, + "learning_rate": 7.43037819671135e-05, 
+ "loss": 1.8388, + "step": 11674 + }, + { + "epoch": 3.5834868017188457, + "grad_norm": 0.2505483031272888, + "learning_rate": 7.42994379915477e-05, + "loss": 1.7722, + "step": 11675 + }, + { + "epoch": 3.583793738489871, + "grad_norm": 0.2535844147205353, + "learning_rate": 7.42950937758406e-05, + "loss": 1.756, + "step": 11676 + }, + { + "epoch": 3.5841006752608964, + "grad_norm": 0.23045027256011963, + "learning_rate": 7.429074932003515e-05, + "loss": 1.791, + "step": 11677 + }, + { + "epoch": 3.5844076120319213, + "grad_norm": 0.22525762021541595, + "learning_rate": 7.428640462417428e-05, + "loss": 1.7234, + "step": 11678 + }, + { + "epoch": 3.5847145488029466, + "grad_norm": 0.2402270883321762, + "learning_rate": 7.428205968830094e-05, + "loss": 1.845, + "step": 11679 + }, + { + "epoch": 3.5850214855739715, + "grad_norm": 0.24909646809101105, + "learning_rate": 7.427771451245802e-05, + "loss": 1.8537, + "step": 11680 + }, + { + "epoch": 3.585328422344997, + "grad_norm": 0.25813063979148865, + "learning_rate": 7.427336909668853e-05, + "loss": 1.7353, + "step": 11681 + }, + { + "epoch": 3.585635359116022, + "grad_norm": 0.26073768734931946, + "learning_rate": 7.426902344103534e-05, + "loss": 1.8142, + "step": 11682 + }, + { + "epoch": 3.5859422958870475, + "grad_norm": 0.2498280256986618, + "learning_rate": 7.426467754554147e-05, + "loss": 1.7996, + "step": 11683 + }, + { + "epoch": 3.5862492326580724, + "grad_norm": 0.3131188154220581, + "learning_rate": 7.426033141024981e-05, + "loss": 1.7793, + "step": 11684 + }, + { + "epoch": 3.5865561694290977, + "grad_norm": 0.24118199944496155, + "learning_rate": 7.425598503520337e-05, + "loss": 1.8249, + "step": 11685 + }, + { + "epoch": 3.5868631062001226, + "grad_norm": 0.2791197597980499, + "learning_rate": 7.425163842044504e-05, + "loss": 1.7966, + "step": 11686 + }, + { + "epoch": 3.587170042971148, + "grad_norm": 0.2298576384782791, + "learning_rate": 7.424729156601781e-05, + "loss": 1.7224, + "step": 11687 + }, 
+ { + "epoch": 3.5874769797421733, + "grad_norm": 0.23113438487052917, + "learning_rate": 7.424294447196462e-05, + "loss": 1.7641, + "step": 11688 + }, + { + "epoch": 3.587783916513198, + "grad_norm": 0.3064495027065277, + "learning_rate": 7.423859713832847e-05, + "loss": 1.8688, + "step": 11689 + }, + { + "epoch": 3.5880908532842235, + "grad_norm": 0.22847676277160645, + "learning_rate": 7.423424956515228e-05, + "loss": 1.7513, + "step": 11690 + }, + { + "epoch": 3.5883977900552484, + "grad_norm": 0.2797350585460663, + "learning_rate": 7.422990175247905e-05, + "loss": 1.8268, + "step": 11691 + }, + { + "epoch": 3.5887047268262737, + "grad_norm": 0.2753821313381195, + "learning_rate": 7.422555370035171e-05, + "loss": 1.7313, + "step": 11692 + }, + { + "epoch": 3.589011663597299, + "grad_norm": 0.2981179654598236, + "learning_rate": 7.422120540881326e-05, + "loss": 1.8455, + "step": 11693 + }, + { + "epoch": 3.5893186003683244, + "grad_norm": 0.33028867840766907, + "learning_rate": 7.421685687790667e-05, + "loss": 1.8397, + "step": 11694 + }, + { + "epoch": 3.5896255371393493, + "grad_norm": 0.409173846244812, + "learning_rate": 7.421250810767487e-05, + "loss": 1.8088, + "step": 11695 + }, + { + "epoch": 3.5899324739103746, + "grad_norm": 0.4118194878101349, + "learning_rate": 7.42081590981609e-05, + "loss": 1.7719, + "step": 11696 + }, + { + "epoch": 3.5902394106813995, + "grad_norm": 0.34716179966926575, + "learning_rate": 7.420380984940773e-05, + "loss": 1.8063, + "step": 11697 + }, + { + "epoch": 3.590546347452425, + "grad_norm": 0.27763083577156067, + "learning_rate": 7.419946036145829e-05, + "loss": 1.7777, + "step": 11698 + }, + { + "epoch": 3.59085328422345, + "grad_norm": 0.3175280690193176, + "learning_rate": 7.419511063435562e-05, + "loss": 1.697, + "step": 11699 + }, + { + "epoch": 3.591160220994475, + "grad_norm": 0.3151503801345825, + "learning_rate": 7.419076066814268e-05, + "loss": 1.8067, + "step": 11700 + }, + { + "epoch": 3.5914671577655004, + 
"grad_norm": 0.26914867758750916, + "learning_rate": 7.418641046286245e-05, + "loss": 1.7797, + "step": 11701 + }, + { + "epoch": 3.5917740945365253, + "grad_norm": 0.27231964468955994, + "learning_rate": 7.418206001855797e-05, + "loss": 1.7931, + "step": 11702 + }, + { + "epoch": 3.5920810313075506, + "grad_norm": 0.3352177143096924, + "learning_rate": 7.417770933527217e-05, + "loss": 1.9187, + "step": 11703 + }, + { + "epoch": 3.592387968078576, + "grad_norm": 0.3510081470012665, + "learning_rate": 7.417335841304808e-05, + "loss": 1.7889, + "step": 11704 + }, + { + "epoch": 3.592694904849601, + "grad_norm": 0.24949313700199127, + "learning_rate": 7.41690072519287e-05, + "loss": 1.7683, + "step": 11705 + }, + { + "epoch": 3.593001841620626, + "grad_norm": 0.28442221879959106, + "learning_rate": 7.416465585195702e-05, + "loss": 1.7889, + "step": 11706 + }, + { + "epoch": 3.593308778391651, + "grad_norm": 0.3355824649333954, + "learning_rate": 7.416030421317605e-05, + "loss": 1.7637, + "step": 11707 + }, + { + "epoch": 3.5936157151626764, + "grad_norm": 0.33569446206092834, + "learning_rate": 7.415595233562878e-05, + "loss": 1.919, + "step": 11708 + }, + { + "epoch": 3.5939226519337018, + "grad_norm": 0.2488354742527008, + "learning_rate": 7.415160021935825e-05, + "loss": 1.8424, + "step": 11709 + }, + { + "epoch": 3.594229588704727, + "grad_norm": 0.2701130509376526, + "learning_rate": 7.414724786440746e-05, + "loss": 1.7586, + "step": 11710 + }, + { + "epoch": 3.594536525475752, + "grad_norm": 0.26289790868759155, + "learning_rate": 7.414289527081939e-05, + "loss": 1.7975, + "step": 11711 + }, + { + "epoch": 3.5948434622467773, + "grad_norm": 0.25382301211357117, + "learning_rate": 7.413854243863707e-05, + "loss": 1.7393, + "step": 11712 + }, + { + "epoch": 3.595150399017802, + "grad_norm": 0.28282979130744934, + "learning_rate": 7.413418936790357e-05, + "loss": 1.8048, + "step": 11713 + }, + { + "epoch": 3.5954573357888275, + "grad_norm": 0.28001347184181213, + 
"learning_rate": 7.412983605866183e-05, + "loss": 1.7864, + "step": 11714 + }, + { + "epoch": 3.595764272559853, + "grad_norm": 0.26107707619667053, + "learning_rate": 7.412548251095491e-05, + "loss": 1.8016, + "step": 11715 + }, + { + "epoch": 3.5960712093308778, + "grad_norm": 0.2518761456012726, + "learning_rate": 7.412112872482583e-05, + "loss": 1.7565, + "step": 11716 + }, + { + "epoch": 3.596378146101903, + "grad_norm": 0.25911152362823486, + "learning_rate": 7.411677470031762e-05, + "loss": 1.8333, + "step": 11717 + }, + { + "epoch": 3.596685082872928, + "grad_norm": 0.3411506414413452, + "learning_rate": 7.41124204374733e-05, + "loss": 1.8027, + "step": 11718 + }, + { + "epoch": 3.5969920196439533, + "grad_norm": 0.28535547852516174, + "learning_rate": 7.410806593633593e-05, + "loss": 1.7596, + "step": 11719 + }, + { + "epoch": 3.5972989564149787, + "grad_norm": 0.24665530025959015, + "learning_rate": 7.410371119694852e-05, + "loss": 1.7777, + "step": 11720 + }, + { + "epoch": 3.5976058931860035, + "grad_norm": 0.29162275791168213, + "learning_rate": 7.40993562193541e-05, + "loss": 1.795, + "step": 11721 + }, + { + "epoch": 3.597912829957029, + "grad_norm": 0.2712220549583435, + "learning_rate": 7.409500100359573e-05, + "loss": 1.824, + "step": 11722 + }, + { + "epoch": 3.5982197667280538, + "grad_norm": 0.239755779504776, + "learning_rate": 7.40906455497164e-05, + "loss": 1.7534, + "step": 11723 + }, + { + "epoch": 3.598526703499079, + "grad_norm": 0.26056957244873047, + "learning_rate": 7.408628985775922e-05, + "loss": 1.757, + "step": 11724 + }, + { + "epoch": 3.5988336402701044, + "grad_norm": 0.3230258822441101, + "learning_rate": 7.40819339277672e-05, + "loss": 1.8684, + "step": 11725 + }, + { + "epoch": 3.5991405770411298, + "grad_norm": 0.26070696115493774, + "learning_rate": 7.407757775978339e-05, + "loss": 1.7868, + "step": 11726 + }, + { + "epoch": 3.5994475138121547, + "grad_norm": 0.24940893054008484, + "learning_rate": 7.407322135385085e-05, + 
"loss": 1.8391, + "step": 11727 + }, + { + "epoch": 3.59975445058318, + "grad_norm": 0.2717723250389099, + "learning_rate": 7.406886471001263e-05, + "loss": 1.7567, + "step": 11728 + }, + { + "epoch": 3.600061387354205, + "grad_norm": 0.2328445315361023, + "learning_rate": 7.406450782831177e-05, + "loss": 1.7761, + "step": 11729 + }, + { + "epoch": 3.6003683241252302, + "grad_norm": 0.2740287184715271, + "learning_rate": 7.406015070879136e-05, + "loss": 1.8599, + "step": 11730 + }, + { + "epoch": 3.6006752608962556, + "grad_norm": 0.2930558919906616, + "learning_rate": 7.405579335149441e-05, + "loss": 1.852, + "step": 11731 + }, + { + "epoch": 3.6009821976672804, + "grad_norm": 0.30175161361694336, + "learning_rate": 7.405143575646403e-05, + "loss": 1.8861, + "step": 11732 + }, + { + "epoch": 3.601289134438306, + "grad_norm": 0.2617531418800354, + "learning_rate": 7.404707792374328e-05, + "loss": 1.7598, + "step": 11733 + }, + { + "epoch": 3.6015960712093307, + "grad_norm": 0.25384122133255005, + "learning_rate": 7.404271985337517e-05, + "loss": 1.7634, + "step": 11734 + }, + { + "epoch": 3.601903007980356, + "grad_norm": 0.31706711649894714, + "learning_rate": 7.403836154540284e-05, + "loss": 1.8125, + "step": 11735 + }, + { + "epoch": 3.6022099447513813, + "grad_norm": 0.299662709236145, + "learning_rate": 7.403400299986932e-05, + "loss": 1.748, + "step": 11736 + }, + { + "epoch": 3.6025168815224062, + "grad_norm": 0.23828944563865662, + "learning_rate": 7.40296442168177e-05, + "loss": 1.7473, + "step": 11737 + }, + { + "epoch": 3.6028238182934316, + "grad_norm": 0.22611604630947113, + "learning_rate": 7.402528519629106e-05, + "loss": 1.7519, + "step": 11738 + }, + { + "epoch": 3.6031307550644565, + "grad_norm": 0.28498536348342896, + "learning_rate": 7.402092593833246e-05, + "loss": 1.7792, + "step": 11739 + }, + { + "epoch": 3.603437691835482, + "grad_norm": 0.2404283881187439, + "learning_rate": 7.4016566442985e-05, + "loss": 1.7434, + "step": 11740 + }, + { + 
"epoch": 3.603744628606507, + "grad_norm": 0.2291589230298996, + "learning_rate": 7.401220671029173e-05, + "loss": 1.7623, + "step": 11741 + }, + { + "epoch": 3.6040515653775325, + "grad_norm": 0.23962698876857758, + "learning_rate": 7.400784674029578e-05, + "loss": 1.7232, + "step": 11742 + }, + { + "epoch": 3.6043585021485574, + "grad_norm": 0.3015185594558716, + "learning_rate": 7.400348653304022e-05, + "loss": 1.7808, + "step": 11743 + }, + { + "epoch": 3.6046654389195827, + "grad_norm": 0.30623099207878113, + "learning_rate": 7.399912608856813e-05, + "loss": 1.8518, + "step": 11744 + }, + { + "epoch": 3.6049723756906076, + "grad_norm": 0.2698235511779785, + "learning_rate": 7.39947654069226e-05, + "loss": 1.7829, + "step": 11745 + }, + { + "epoch": 3.605279312461633, + "grad_norm": 0.2195274829864502, + "learning_rate": 7.399040448814674e-05, + "loss": 1.7709, + "step": 11746 + }, + { + "epoch": 3.6055862492326582, + "grad_norm": 0.22962357103824615, + "learning_rate": 7.398604333228366e-05, + "loss": 1.7482, + "step": 11747 + }, + { + "epoch": 3.605893186003683, + "grad_norm": 0.2403932511806488, + "learning_rate": 7.398168193937642e-05, + "loss": 1.8063, + "step": 11748 + }, + { + "epoch": 3.6062001227747085, + "grad_norm": 0.23542718589305878, + "learning_rate": 7.397732030946816e-05, + "loss": 1.7599, + "step": 11749 + }, + { + "epoch": 3.6065070595457334, + "grad_norm": 0.2462490350008011, + "learning_rate": 7.397295844260195e-05, + "loss": 1.8183, + "step": 11750 + }, + { + "epoch": 3.6068139963167587, + "grad_norm": 0.21428349614143372, + "learning_rate": 7.396859633882091e-05, + "loss": 1.6944, + "step": 11751 + }, + { + "epoch": 3.607120933087784, + "grad_norm": 0.21240907907485962, + "learning_rate": 7.396423399816817e-05, + "loss": 1.7795, + "step": 11752 + }, + { + "epoch": 3.607427869858809, + "grad_norm": 0.23413677513599396, + "learning_rate": 7.395987142068682e-05, + "loss": 1.8015, + "step": 11753 + }, + { + "epoch": 3.6077348066298343, + 
"grad_norm": 0.26724907755851746, + "learning_rate": 7.395550860641998e-05, + "loss": 1.8174, + "step": 11754 + }, + { + "epoch": 3.608041743400859, + "grad_norm": 0.22077679634094238, + "learning_rate": 7.395114555541077e-05, + "loss": 1.7929, + "step": 11755 + }, + { + "epoch": 3.6083486801718845, + "grad_norm": 0.2475263774394989, + "learning_rate": 7.394678226770228e-05, + "loss": 1.7744, + "step": 11756 + }, + { + "epoch": 3.60865561694291, + "grad_norm": 0.22579342126846313, + "learning_rate": 7.394241874333764e-05, + "loss": 1.79, + "step": 11757 + }, + { + "epoch": 3.608962553713935, + "grad_norm": 0.26798152923583984, + "learning_rate": 7.393805498236001e-05, + "loss": 1.8087, + "step": 11758 + }, + { + "epoch": 3.60926949048496, + "grad_norm": 0.2755621373653412, + "learning_rate": 7.393369098481248e-05, + "loss": 1.7834, + "step": 11759 + }, + { + "epoch": 3.6095764272559854, + "grad_norm": 0.2741812467575073, + "learning_rate": 7.39293267507382e-05, + "loss": 1.7948, + "step": 11760 + }, + { + "epoch": 3.6098833640270103, + "grad_norm": 0.2378924936056137, + "learning_rate": 7.392496228018028e-05, + "loss": 1.8317, + "step": 11761 + }, + { + "epoch": 3.6101903007980356, + "grad_norm": 0.2628132700920105, + "learning_rate": 7.392059757318187e-05, + "loss": 1.8123, + "step": 11762 + }, + { + "epoch": 3.610497237569061, + "grad_norm": 0.2613002359867096, + "learning_rate": 7.391623262978607e-05, + "loss": 1.795, + "step": 11763 + }, + { + "epoch": 3.610804174340086, + "grad_norm": 0.27272161841392517, + "learning_rate": 7.391186745003608e-05, + "loss": 1.7808, + "step": 11764 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.21366162598133087, + "learning_rate": 7.390750203397497e-05, + "loss": 1.77, + "step": 11765 + }, + { + "epoch": 3.611418047882136, + "grad_norm": 0.25559261441230774, + "learning_rate": 7.390313638164593e-05, + "loss": 1.8442, + "step": 11766 + }, + { + "epoch": 3.6117249846531614, + "grad_norm": 0.23794838786125183, + 
"learning_rate": 7.389877049309207e-05, + "loss": 1.8237, + "step": 11767 + }, + { + "epoch": 3.6120319214241867, + "grad_norm": 0.2690154016017914, + "learning_rate": 7.389440436835656e-05, + "loss": 1.8194, + "step": 11768 + }, + { + "epoch": 3.612338858195212, + "grad_norm": 0.26148009300231934, + "learning_rate": 7.389003800748254e-05, + "loss": 1.7862, + "step": 11769 + }, + { + "epoch": 3.612645794966237, + "grad_norm": 0.26414936780929565, + "learning_rate": 7.388567141051315e-05, + "loss": 1.7815, + "step": 11770 + }, + { + "epoch": 3.6129527317372623, + "grad_norm": 0.24473857879638672, + "learning_rate": 7.388130457749157e-05, + "loss": 1.801, + "step": 11771 + }, + { + "epoch": 3.613259668508287, + "grad_norm": 0.24356001615524292, + "learning_rate": 7.387693750846094e-05, + "loss": 1.8031, + "step": 11772 + }, + { + "epoch": 3.6135666052793125, + "grad_norm": 0.26716411113739014, + "learning_rate": 7.387257020346441e-05, + "loss": 1.7999, + "step": 11773 + }, + { + "epoch": 3.613873542050338, + "grad_norm": 0.2730760872364044, + "learning_rate": 7.386820266254516e-05, + "loss": 1.8079, + "step": 11774 + }, + { + "epoch": 3.6141804788213627, + "grad_norm": 0.2570728361606598, + "learning_rate": 7.386383488574635e-05, + "loss": 1.7374, + "step": 11775 + }, + { + "epoch": 3.614487415592388, + "grad_norm": 0.24992883205413818, + "learning_rate": 7.385946687311112e-05, + "loss": 1.8432, + "step": 11776 + }, + { + "epoch": 3.614794352363413, + "grad_norm": 0.28632259368896484, + "learning_rate": 7.385509862468266e-05, + "loss": 1.8014, + "step": 11777 + }, + { + "epoch": 3.6151012891344383, + "grad_norm": 0.257303923368454, + "learning_rate": 7.385073014050412e-05, + "loss": 1.8166, + "step": 11778 + }, + { + "epoch": 3.6154082259054636, + "grad_norm": 0.2791872024536133, + "learning_rate": 7.38463614206187e-05, + "loss": 1.7865, + "step": 11779 + }, + { + "epoch": 3.6157151626764885, + "grad_norm": 0.25708603858947754, + "learning_rate": 
7.384199246506956e-05, + "loss": 1.807, + "step": 11780 + }, + { + "epoch": 3.616022099447514, + "grad_norm": 0.28693172335624695, + "learning_rate": 7.383762327389988e-05, + "loss": 1.8049, + "step": 11781 + }, + { + "epoch": 3.6163290362185387, + "grad_norm": 0.2731167674064636, + "learning_rate": 7.383325384715283e-05, + "loss": 1.8937, + "step": 11782 + }, + { + "epoch": 3.616635972989564, + "grad_norm": 0.26151663064956665, + "learning_rate": 7.38288841848716e-05, + "loss": 1.8288, + "step": 11783 + }, + { + "epoch": 3.6169429097605894, + "grad_norm": 0.2732257843017578, + "learning_rate": 7.382451428709936e-05, + "loss": 1.7668, + "step": 11784 + }, + { + "epoch": 3.6172498465316147, + "grad_norm": 0.2747575640678406, + "learning_rate": 7.38201441538793e-05, + "loss": 1.7991, + "step": 11785 + }, + { + "epoch": 3.6175567833026396, + "grad_norm": 0.2884783446788788, + "learning_rate": 7.381577378525462e-05, + "loss": 1.7798, + "step": 11786 + }, + { + "epoch": 3.617863720073665, + "grad_norm": 0.2716344892978668, + "learning_rate": 7.381140318126851e-05, + "loss": 1.7923, + "step": 11787 + }, + { + "epoch": 3.61817065684469, + "grad_norm": 0.3007747232913971, + "learning_rate": 7.380703234196416e-05, + "loss": 1.8397, + "step": 11788 + }, + { + "epoch": 3.618477593615715, + "grad_norm": 0.39218056201934814, + "learning_rate": 7.380266126738476e-05, + "loss": 1.8517, + "step": 11789 + }, + { + "epoch": 3.6187845303867405, + "grad_norm": 0.43425866961479187, + "learning_rate": 7.379828995757351e-05, + "loss": 1.7518, + "step": 11790 + }, + { + "epoch": 3.6190914671577654, + "grad_norm": 0.34399518370628357, + "learning_rate": 7.37939184125736e-05, + "loss": 1.7607, + "step": 11791 + }, + { + "epoch": 3.6193984039287908, + "grad_norm": 0.23124302923679352, + "learning_rate": 7.378954663242825e-05, + "loss": 1.7898, + "step": 11792 + }, + { + "epoch": 3.6197053406998156, + "grad_norm": 0.32839757204055786, + "learning_rate": 7.378517461718066e-05, + "loss": 
1.7472, + "step": 11793 + }, + { + "epoch": 3.620012277470841, + "grad_norm": 0.38583460450172424, + "learning_rate": 7.378080236687403e-05, + "loss": 1.7947, + "step": 11794 + }, + { + "epoch": 3.6203192142418663, + "grad_norm": 0.4622896909713745, + "learning_rate": 7.377642988155157e-05, + "loss": 1.9023, + "step": 11795 + }, + { + "epoch": 3.620626151012891, + "grad_norm": 0.3783189058303833, + "learning_rate": 7.37720571612565e-05, + "loss": 1.7813, + "step": 11796 + }, + { + "epoch": 3.6209330877839165, + "grad_norm": 0.3468814790248871, + "learning_rate": 7.376768420603204e-05, + "loss": 1.7509, + "step": 11797 + }, + { + "epoch": 3.6212400245549414, + "grad_norm": 0.2602507174015045, + "learning_rate": 7.376331101592138e-05, + "loss": 1.8158, + "step": 11798 + }, + { + "epoch": 3.6215469613259668, + "grad_norm": 0.28337883949279785, + "learning_rate": 7.375893759096775e-05, + "loss": 1.7755, + "step": 11799 + }, + { + "epoch": 3.621853898096992, + "grad_norm": 0.3644609749317169, + "learning_rate": 7.375456393121437e-05, + "loss": 1.8193, + "step": 11800 + }, + { + "epoch": 3.6221608348680174, + "grad_norm": 0.338211327791214, + "learning_rate": 7.375019003670448e-05, + "loss": 1.821, + "step": 11801 + }, + { + "epoch": 3.6224677716390423, + "grad_norm": 0.23850654065608978, + "learning_rate": 7.374581590748129e-05, + "loss": 1.7317, + "step": 11802 + }, + { + "epoch": 3.6227747084100677, + "grad_norm": 0.3496716618537903, + "learning_rate": 7.374144154358801e-05, + "loss": 1.8361, + "step": 11803 + }, + { + "epoch": 3.6230816451810925, + "grad_norm": 0.5585216283798218, + "learning_rate": 7.37370669450679e-05, + "loss": 1.7667, + "step": 11804 + }, + { + "epoch": 3.623388581952118, + "grad_norm": 0.4578089714050293, + "learning_rate": 7.373269211196418e-05, + "loss": 1.8051, + "step": 11805 + }, + { + "epoch": 3.623695518723143, + "grad_norm": 0.28195759654045105, + "learning_rate": 7.37283170443201e-05, + "loss": 1.7823, + "step": 11806 + }, + { + 
"epoch": 3.624002455494168, + "grad_norm": 0.4066108465194702, + "learning_rate": 7.372394174217887e-05, + "loss": 1.7819, + "step": 11807 + }, + { + "epoch": 3.6243093922651934, + "grad_norm": 0.5368703007698059, + "learning_rate": 7.371956620558375e-05, + "loss": 1.8121, + "step": 11808 + }, + { + "epoch": 3.6246163290362183, + "grad_norm": 0.36627063155174255, + "learning_rate": 7.371519043457795e-05, + "loss": 1.7944, + "step": 11809 + }, + { + "epoch": 3.6249232658072437, + "grad_norm": 0.3100780248641968, + "learning_rate": 7.371081442920476e-05, + "loss": 1.783, + "step": 11810 + }, + { + "epoch": 3.625230202578269, + "grad_norm": 0.3277178704738617, + "learning_rate": 7.370643818950741e-05, + "loss": 1.8105, + "step": 11811 + }, + { + "epoch": 3.625537139349294, + "grad_norm": 0.3887772560119629, + "learning_rate": 7.370206171552914e-05, + "loss": 1.8136, + "step": 11812 + }, + { + "epoch": 3.6258440761203192, + "grad_norm": 0.2770824134349823, + "learning_rate": 7.36976850073132e-05, + "loss": 1.7852, + "step": 11813 + }, + { + "epoch": 3.626151012891344, + "grad_norm": 0.26357728242874146, + "learning_rate": 7.369330806490284e-05, + "loss": 1.7621, + "step": 11814 + }, + { + "epoch": 3.6264579496623695, + "grad_norm": 0.3387344181537628, + "learning_rate": 7.368893088834135e-05, + "loss": 1.7785, + "step": 11815 + }, + { + "epoch": 3.626764886433395, + "grad_norm": 0.35155174136161804, + "learning_rate": 7.368455347767193e-05, + "loss": 1.8081, + "step": 11816 + }, + { + "epoch": 3.62707182320442, + "grad_norm": 0.2855289876461029, + "learning_rate": 7.368017583293788e-05, + "loss": 1.8245, + "step": 11817 + }, + { + "epoch": 3.627378759975445, + "grad_norm": 0.28462162613868713, + "learning_rate": 7.367579795418245e-05, + "loss": 1.8066, + "step": 11818 + }, + { + "epoch": 3.6276856967464703, + "grad_norm": 0.40696555376052856, + "learning_rate": 7.367141984144891e-05, + "loss": 1.8897, + "step": 11819 + }, + { + "epoch": 3.6279926335174952, + 
"grad_norm": 0.472782701253891, + "learning_rate": 7.366704149478054e-05, + "loss": 1.8071, + "step": 11820 + }, + { + "epoch": 3.6282995702885206, + "grad_norm": 0.27022916078567505, + "learning_rate": 7.366266291422057e-05, + "loss": 1.8574, + "step": 11821 + }, + { + "epoch": 3.628606507059546, + "grad_norm": 0.4207148253917694, + "learning_rate": 7.365828409981231e-05, + "loss": 1.7759, + "step": 11822 + }, + { + "epoch": 3.628913443830571, + "grad_norm": 0.42866072058677673, + "learning_rate": 7.365390505159902e-05, + "loss": 1.7366, + "step": 11823 + }, + { + "epoch": 3.629220380601596, + "grad_norm": 0.28288859128952026, + "learning_rate": 7.364952576962398e-05, + "loss": 1.8591, + "step": 11824 + }, + { + "epoch": 3.629527317372621, + "grad_norm": 0.30544906854629517, + "learning_rate": 7.364514625393045e-05, + "loss": 1.7965, + "step": 11825 + }, + { + "epoch": 3.6298342541436464, + "grad_norm": 0.3251616954803467, + "learning_rate": 7.364076650456173e-05, + "loss": 1.8197, + "step": 11826 + }, + { + "epoch": 3.6301411909146717, + "grad_norm": 0.3133888840675354, + "learning_rate": 7.363638652156109e-05, + "loss": 1.7978, + "step": 11827 + }, + { + "epoch": 3.630448127685697, + "grad_norm": 0.29004594683647156, + "learning_rate": 7.363200630497185e-05, + "loss": 1.8035, + "step": 11828 + }, + { + "epoch": 3.630755064456722, + "grad_norm": 0.2781279683113098, + "learning_rate": 7.362762585483725e-05, + "loss": 1.8462, + "step": 11829 + }, + { + "epoch": 3.6310620012277472, + "grad_norm": 0.29003822803497314, + "learning_rate": 7.362324517120063e-05, + "loss": 1.7952, + "step": 11830 + }, + { + "epoch": 3.631368937998772, + "grad_norm": 0.2510940134525299, + "learning_rate": 7.361886425410524e-05, + "loss": 1.7645, + "step": 11831 + }, + { + "epoch": 3.6316758747697975, + "grad_norm": 0.23798540234565735, + "learning_rate": 7.361448310359438e-05, + "loss": 1.7329, + "step": 11832 + }, + { + "epoch": 3.631982811540823, + "grad_norm": 0.2711278796195984, + 
"learning_rate": 7.361010171971137e-05, + "loss": 1.8245, + "step": 11833 + }, + { + "epoch": 3.6322897483118477, + "grad_norm": 0.2895669639110565, + "learning_rate": 7.360572010249949e-05, + "loss": 1.7668, + "step": 11834 + }, + { + "epoch": 3.632596685082873, + "grad_norm": 0.2216273844242096, + "learning_rate": 7.360133825200205e-05, + "loss": 1.8164, + "step": 11835 + }, + { + "epoch": 3.632903621853898, + "grad_norm": 0.3075082302093506, + "learning_rate": 7.359695616826236e-05, + "loss": 1.8159, + "step": 11836 + }, + { + "epoch": 3.6332105586249233, + "grad_norm": 0.3208801746368408, + "learning_rate": 7.35925738513237e-05, + "loss": 1.8385, + "step": 11837 + }, + { + "epoch": 3.6335174953959486, + "grad_norm": 0.272517591714859, + "learning_rate": 7.35881913012294e-05, + "loss": 1.7653, + "step": 11838 + }, + { + "epoch": 3.6338244321669735, + "grad_norm": 0.23105360567569733, + "learning_rate": 7.358380851802277e-05, + "loss": 1.7697, + "step": 11839 + }, + { + "epoch": 3.634131368937999, + "grad_norm": 0.2643153667449951, + "learning_rate": 7.357942550174714e-05, + "loss": 1.7885, + "step": 11840 + }, + { + "epoch": 3.6344383057090237, + "grad_norm": 0.22643202543258667, + "learning_rate": 7.357504225244579e-05, + "loss": 1.746, + "step": 11841 + }, + { + "epoch": 3.634745242480049, + "grad_norm": 0.27782970666885376, + "learning_rate": 7.357065877016207e-05, + "loss": 1.794, + "step": 11842 + }, + { + "epoch": 3.6350521792510744, + "grad_norm": 0.3035561740398407, + "learning_rate": 7.356627505493925e-05, + "loss": 1.7892, + "step": 11843 + }, + { + "epoch": 3.6353591160220997, + "grad_norm": 0.31859731674194336, + "learning_rate": 7.356189110682072e-05, + "loss": 1.7636, + "step": 11844 + }, + { + "epoch": 3.6356660527931246, + "grad_norm": 0.2960890233516693, + "learning_rate": 7.355750692584977e-05, + "loss": 1.8294, + "step": 11845 + }, + { + "epoch": 3.63597298956415, + "grad_norm": 0.2544194459915161, + "learning_rate": 7.355312251206972e-05, + 
"loss": 1.7603, + "step": 11846 + }, + { + "epoch": 3.636279926335175, + "grad_norm": 0.27864789962768555, + "learning_rate": 7.354873786552391e-05, + "loss": 1.7917, + "step": 11847 + }, + { + "epoch": 3.6365868631062, + "grad_norm": 0.32552552223205566, + "learning_rate": 7.354435298625568e-05, + "loss": 1.7769, + "step": 11848 + }, + { + "epoch": 3.6368937998772255, + "grad_norm": 0.25094640254974365, + "learning_rate": 7.353996787430833e-05, + "loss": 1.8371, + "step": 11849 + }, + { + "epoch": 3.6372007366482504, + "grad_norm": 0.26656433939933777, + "learning_rate": 7.353558252972524e-05, + "loss": 1.7686, + "step": 11850 + }, + { + "epoch": 3.6375076734192757, + "grad_norm": 0.3023635745048523, + "learning_rate": 7.353119695254973e-05, + "loss": 1.7892, + "step": 11851 + }, + { + "epoch": 3.6378146101903006, + "grad_norm": 0.2822463810443878, + "learning_rate": 7.352681114282514e-05, + "loss": 1.8221, + "step": 11852 + }, + { + "epoch": 3.638121546961326, + "grad_norm": 0.31159496307373047, + "learning_rate": 7.35224251005948e-05, + "loss": 1.803, + "step": 11853 + }, + { + "epoch": 3.6384284837323513, + "grad_norm": 0.3133087158203125, + "learning_rate": 7.351803882590207e-05, + "loss": 1.744, + "step": 11854 + }, + { + "epoch": 3.638735420503376, + "grad_norm": 0.3050002455711365, + "learning_rate": 7.351365231879029e-05, + "loss": 1.7522, + "step": 11855 + }, + { + "epoch": 3.6390423572744015, + "grad_norm": 0.2729037404060364, + "learning_rate": 7.350926557930283e-05, + "loss": 1.7629, + "step": 11856 + }, + { + "epoch": 3.6393492940454264, + "grad_norm": 0.3181995153427124, + "learning_rate": 7.350487860748303e-05, + "loss": 1.7603, + "step": 11857 + }, + { + "epoch": 3.6396562308164517, + "grad_norm": 0.352651447057724, + "learning_rate": 7.350049140337423e-05, + "loss": 1.8177, + "step": 11858 + }, + { + "epoch": 3.639963167587477, + "grad_norm": 0.22935177385807037, + "learning_rate": 7.349610396701981e-05, + "loss": 1.7421, + "step": 11859 + }, + { 
+ "epoch": 3.6402701043585024, + "grad_norm": 0.26442599296569824, + "learning_rate": 7.349171629846312e-05, + "loss": 1.8026, + "step": 11860 + }, + { + "epoch": 3.6405770411295273, + "grad_norm": 0.25357648730278015, + "learning_rate": 7.348732839774751e-05, + "loss": 1.788, + "step": 11861 + }, + { + "epoch": 3.6408839779005526, + "grad_norm": 0.26959577202796936, + "learning_rate": 7.348294026491635e-05, + "loss": 1.884, + "step": 11862 + }, + { + "epoch": 3.6411909146715775, + "grad_norm": 0.2243001013994217, + "learning_rate": 7.347855190001304e-05, + "loss": 1.7765, + "step": 11863 + }, + { + "epoch": 3.641497851442603, + "grad_norm": 0.2480708807706833, + "learning_rate": 7.34741633030809e-05, + "loss": 1.7597, + "step": 11864 + }, + { + "epoch": 3.641804788213628, + "grad_norm": 0.22512994706630707, + "learning_rate": 7.346977447416332e-05, + "loss": 1.7647, + "step": 11865 + }, + { + "epoch": 3.642111724984653, + "grad_norm": 0.24961981177330017, + "learning_rate": 7.346538541330368e-05, + "loss": 1.8178, + "step": 11866 + }, + { + "epoch": 3.6424186617556784, + "grad_norm": 0.320896714925766, + "learning_rate": 7.346099612054533e-05, + "loss": 1.85, + "step": 11867 + }, + { + "epoch": 3.6427255985267033, + "grad_norm": 0.3420880436897278, + "learning_rate": 7.345660659593167e-05, + "loss": 1.8661, + "step": 11868 + }, + { + "epoch": 3.6430325352977286, + "grad_norm": 0.2675844132900238, + "learning_rate": 7.34522168395061e-05, + "loss": 1.8177, + "step": 11869 + }, + { + "epoch": 3.643339472068754, + "grad_norm": 0.23993943631649017, + "learning_rate": 7.344782685131195e-05, + "loss": 1.7365, + "step": 11870 + }, + { + "epoch": 3.643646408839779, + "grad_norm": 0.21805813908576965, + "learning_rate": 7.344343663139264e-05, + "loss": 1.7813, + "step": 11871 + }, + { + "epoch": 3.643953345610804, + "grad_norm": 0.24334421753883362, + "learning_rate": 7.343904617979154e-05, + "loss": 1.7763, + "step": 11872 + }, + { + "epoch": 3.644260282381829, + 
"grad_norm": 0.22768431901931763, + "learning_rate": 7.343465549655206e-05, + "loss": 1.7817, + "step": 11873 + }, + { + "epoch": 3.6445672191528544, + "grad_norm": 0.23828962445259094, + "learning_rate": 7.343026458171757e-05, + "loss": 1.8391, + "step": 11874 + }, + { + "epoch": 3.6448741559238798, + "grad_norm": 0.24838197231292725, + "learning_rate": 7.342587343533149e-05, + "loss": 1.759, + "step": 11875 + }, + { + "epoch": 3.645181092694905, + "grad_norm": 0.22732019424438477, + "learning_rate": 7.342148205743718e-05, + "loss": 1.7348, + "step": 11876 + }, + { + "epoch": 3.64548802946593, + "grad_norm": 0.25106775760650635, + "learning_rate": 7.341709044807807e-05, + "loss": 1.8121, + "step": 11877 + }, + { + "epoch": 3.6457949662369553, + "grad_norm": 0.28532838821411133, + "learning_rate": 7.341269860729753e-05, + "loss": 1.7147, + "step": 11878 + }, + { + "epoch": 3.64610190300798, + "grad_norm": 0.3041890859603882, + "learning_rate": 7.340830653513899e-05, + "loss": 1.7666, + "step": 11879 + }, + { + "epoch": 3.6464088397790055, + "grad_norm": 0.3142147958278656, + "learning_rate": 7.340391423164585e-05, + "loss": 1.8707, + "step": 11880 + }, + { + "epoch": 3.646715776550031, + "grad_norm": 0.28531381487846375, + "learning_rate": 7.339952169686151e-05, + "loss": 1.7961, + "step": 11881 + }, + { + "epoch": 3.6470227133210558, + "grad_norm": 0.33779671788215637, + "learning_rate": 7.339512893082938e-05, + "loss": 1.7428, + "step": 11882 + }, + { + "epoch": 3.647329650092081, + "grad_norm": 0.29611849784851074, + "learning_rate": 7.339073593359287e-05, + "loss": 1.8803, + "step": 11883 + }, + { + "epoch": 3.647636586863106, + "grad_norm": 0.31248557567596436, + "learning_rate": 7.33863427051954e-05, + "loss": 1.7868, + "step": 11884 + }, + { + "epoch": 3.6479435236341313, + "grad_norm": 0.42829564213752747, + "learning_rate": 7.338194924568039e-05, + "loss": 1.8558, + "step": 11885 + }, + { + "epoch": 3.6482504604051567, + "grad_norm": 0.431023508310318, + 
"learning_rate": 7.337755555509126e-05, + "loss": 1.7565, + "step": 11886 + }, + { + "epoch": 3.6485573971761815, + "grad_norm": 0.2917975187301636, + "learning_rate": 7.33731616334714e-05, + "loss": 1.8067, + "step": 11887 + }, + { + "epoch": 3.648864333947207, + "grad_norm": 0.3072175085544586, + "learning_rate": 7.336876748086427e-05, + "loss": 1.782, + "step": 11888 + }, + { + "epoch": 3.6491712707182318, + "grad_norm": 0.33658862113952637, + "learning_rate": 7.336437309731327e-05, + "loss": 1.8007, + "step": 11889 + }, + { + "epoch": 3.649478207489257, + "grad_norm": 0.23774033784866333, + "learning_rate": 7.335997848286185e-05, + "loss": 1.7606, + "step": 11890 + }, + { + "epoch": 3.6497851442602824, + "grad_norm": 0.3373236358165741, + "learning_rate": 7.335558363755344e-05, + "loss": 1.7335, + "step": 11891 + }, + { + "epoch": 3.650092081031308, + "grad_norm": 0.3906517028808594, + "learning_rate": 7.335118856143145e-05, + "loss": 1.7974, + "step": 11892 + }, + { + "epoch": 3.6503990178023327, + "grad_norm": 0.37715303897857666, + "learning_rate": 7.334679325453934e-05, + "loss": 1.8875, + "step": 11893 + }, + { + "epoch": 3.650705954573358, + "grad_norm": 0.278540700674057, + "learning_rate": 7.334239771692053e-05, + "loss": 1.8165, + "step": 11894 + }, + { + "epoch": 3.651012891344383, + "grad_norm": 0.24434895813465118, + "learning_rate": 7.333800194861845e-05, + "loss": 1.7756, + "step": 11895 + }, + { + "epoch": 3.6513198281154082, + "grad_norm": 0.25057271122932434, + "learning_rate": 7.333360594967658e-05, + "loss": 1.7932, + "step": 11896 + }, + { + "epoch": 3.6516267648864336, + "grad_norm": 0.3277342617511749, + "learning_rate": 7.332920972013833e-05, + "loss": 1.7781, + "step": 11897 + }, + { + "epoch": 3.6519337016574585, + "grad_norm": 0.2754829525947571, + "learning_rate": 7.332481326004715e-05, + "loss": 1.7916, + "step": 11898 + }, + { + "epoch": 3.652240638428484, + "grad_norm": 0.24490588903427124, + "learning_rate": 7.332041656944651e-05, 
+ "loss": 1.7904, + "step": 11899 + }, + { + "epoch": 3.6525475751995087, + "grad_norm": 0.3176959455013275, + "learning_rate": 7.331601964837982e-05, + "loss": 1.7379, + "step": 11900 + }, + { + "epoch": 3.652854511970534, + "grad_norm": 0.3435784876346588, + "learning_rate": 7.331162249689057e-05, + "loss": 1.7635, + "step": 11901 + }, + { + "epoch": 3.6531614487415593, + "grad_norm": 0.335697740316391, + "learning_rate": 7.330722511502221e-05, + "loss": 1.7903, + "step": 11902 + }, + { + "epoch": 3.6534683855125847, + "grad_norm": 0.2748894691467285, + "learning_rate": 7.330282750281819e-05, + "loss": 1.8259, + "step": 11903 + }, + { + "epoch": 3.6537753222836096, + "grad_norm": 0.36754751205444336, + "learning_rate": 7.329842966032197e-05, + "loss": 1.7728, + "step": 11904 + }, + { + "epoch": 3.654082259054635, + "grad_norm": 0.4355713129043579, + "learning_rate": 7.3294031587577e-05, + "loss": 1.7447, + "step": 11905 + }, + { + "epoch": 3.65438919582566, + "grad_norm": 0.3967476487159729, + "learning_rate": 7.328963328462677e-05, + "loss": 1.8299, + "step": 11906 + }, + { + "epoch": 3.654696132596685, + "grad_norm": 0.23805755376815796, + "learning_rate": 7.328523475151472e-05, + "loss": 1.7631, + "step": 11907 + }, + { + "epoch": 3.6550030693677105, + "grad_norm": 0.40350377559661865, + "learning_rate": 7.328083598828435e-05, + "loss": 1.8693, + "step": 11908 + }, + { + "epoch": 3.6553100061387354, + "grad_norm": 0.4743673801422119, + "learning_rate": 7.32764369949791e-05, + "loss": 1.7887, + "step": 11909 + }, + { + "epoch": 3.6556169429097607, + "grad_norm": 0.33830127120018005, + "learning_rate": 7.327203777164246e-05, + "loss": 1.7527, + "step": 11910 + }, + { + "epoch": 3.6559238796807856, + "grad_norm": 0.2465003877878189, + "learning_rate": 7.326763831831791e-05, + "loss": 1.7898, + "step": 11911 + }, + { + "epoch": 3.656230816451811, + "grad_norm": 0.31647852063179016, + "learning_rate": 7.326323863504892e-05, + "loss": 1.8056, + "step": 11912 + }, + 
{ + "epoch": 3.6565377532228363, + "grad_norm": 0.31436124444007874, + "learning_rate": 7.325883872187896e-05, + "loss": 1.7972, + "step": 11913 + }, + { + "epoch": 3.656844689993861, + "grad_norm": 0.260405957698822, + "learning_rate": 7.325443857885153e-05, + "loss": 1.8109, + "step": 11914 + }, + { + "epoch": 3.6571516267648865, + "grad_norm": 0.29312583804130554, + "learning_rate": 7.325003820601011e-05, + "loss": 1.8947, + "step": 11915 + }, + { + "epoch": 3.6574585635359114, + "grad_norm": 0.2641582190990448, + "learning_rate": 7.324563760339819e-05, + "loss": 1.7737, + "step": 11916 + }, + { + "epoch": 3.6577655003069367, + "grad_norm": 0.2338121086359024, + "learning_rate": 7.324123677105923e-05, + "loss": 1.7462, + "step": 11917 + }, + { + "epoch": 3.658072437077962, + "grad_norm": 0.27877378463745117, + "learning_rate": 7.323683570903676e-05, + "loss": 1.8371, + "step": 11918 + }, + { + "epoch": 3.6583793738489874, + "grad_norm": 0.24238766729831696, + "learning_rate": 7.323243441737427e-05, + "loss": 1.7304, + "step": 11919 + }, + { + "epoch": 3.6586863106200123, + "grad_norm": 0.2349759042263031, + "learning_rate": 7.322803289611525e-05, + "loss": 1.7422, + "step": 11920 + }, + { + "epoch": 3.6589932473910376, + "grad_norm": 0.2254217565059662, + "learning_rate": 7.322363114530318e-05, + "loss": 1.7296, + "step": 11921 + }, + { + "epoch": 3.6593001841620625, + "grad_norm": 0.24533270299434662, + "learning_rate": 7.321922916498158e-05, + "loss": 1.7834, + "step": 11922 + }, + { + "epoch": 3.659607120933088, + "grad_norm": 0.24993161857128143, + "learning_rate": 7.321482695519393e-05, + "loss": 1.8502, + "step": 11923 + }, + { + "epoch": 3.659914057704113, + "grad_norm": 0.2540178894996643, + "learning_rate": 7.321042451598378e-05, + "loss": 1.8372, + "step": 11924 + }, + { + "epoch": 3.660220994475138, + "grad_norm": 0.2241390198469162, + "learning_rate": 7.32060218473946e-05, + "loss": 1.7619, + "step": 11925 + }, + { + "epoch": 3.6605279312461634, + 
"grad_norm": 0.2137840837240219, + "learning_rate": 7.32016189494699e-05, + "loss": 1.751, + "step": 11926 + }, + { + "epoch": 3.6608348680171883, + "grad_norm": 0.2596585154533386, + "learning_rate": 7.319721582225323e-05, + "loss": 1.7773, + "step": 11927 + }, + { + "epoch": 3.6611418047882136, + "grad_norm": 0.24898354709148407, + "learning_rate": 7.319281246578806e-05, + "loss": 1.7347, + "step": 11928 + }, + { + "epoch": 3.661448741559239, + "grad_norm": 0.26553863286972046, + "learning_rate": 7.31884088801179e-05, + "loss": 1.7812, + "step": 11929 + }, + { + "epoch": 3.661755678330264, + "grad_norm": 0.2494438737630844, + "learning_rate": 7.318400506528633e-05, + "loss": 1.7554, + "step": 11930 + }, + { + "epoch": 3.662062615101289, + "grad_norm": 0.2794995903968811, + "learning_rate": 7.317960102133682e-05, + "loss": 1.7495, + "step": 11931 + }, + { + "epoch": 3.662369551872314, + "grad_norm": 0.2843860983848572, + "learning_rate": 7.317519674831293e-05, + "loss": 1.7734, + "step": 11932 + }, + { + "epoch": 3.6626764886433394, + "grad_norm": 0.28261128067970276, + "learning_rate": 7.317079224625813e-05, + "loss": 1.7794, + "step": 11933 + }, + { + "epoch": 3.6629834254143647, + "grad_norm": 0.2552426755428314, + "learning_rate": 7.316638751521599e-05, + "loss": 1.8397, + "step": 11934 + }, + { + "epoch": 3.66329036218539, + "grad_norm": 0.4140608608722687, + "learning_rate": 7.316198255523002e-05, + "loss": 1.848, + "step": 11935 + }, + { + "epoch": 3.663597298956415, + "grad_norm": 0.3709854483604431, + "learning_rate": 7.315757736634377e-05, + "loss": 1.8489, + "step": 11936 + }, + { + "epoch": 3.6639042357274403, + "grad_norm": 0.23637300729751587, + "learning_rate": 7.315317194860078e-05, + "loss": 1.7549, + "step": 11937 + }, + { + "epoch": 3.664211172498465, + "grad_norm": 0.32884421944618225, + "learning_rate": 7.314876630204456e-05, + "loss": 1.8061, + "step": 11938 + }, + { + "epoch": 3.6645181092694905, + "grad_norm": 0.33354130387306213, + 
"learning_rate": 7.314436042671867e-05, + "loss": 1.8346, + "step": 11939 + }, + { + "epoch": 3.664825046040516, + "grad_norm": 0.25776317715644836, + "learning_rate": 7.313995432266663e-05, + "loss": 1.8598, + "step": 11940 + }, + { + "epoch": 3.6651319828115407, + "grad_norm": 0.2910402715206146, + "learning_rate": 7.313554798993202e-05, + "loss": 1.7613, + "step": 11941 + }, + { + "epoch": 3.665438919582566, + "grad_norm": 0.3487538695335388, + "learning_rate": 7.313114142855836e-05, + "loss": 1.8105, + "step": 11942 + }, + { + "epoch": 3.665745856353591, + "grad_norm": 0.27271291613578796, + "learning_rate": 7.312673463858918e-05, + "loss": 1.8107, + "step": 11943 + }, + { + "epoch": 3.6660527931246163, + "grad_norm": 0.2613036632537842, + "learning_rate": 7.312232762006809e-05, + "loss": 1.7871, + "step": 11944 + }, + { + "epoch": 3.6663597298956416, + "grad_norm": 0.30594903230667114, + "learning_rate": 7.311792037303859e-05, + "loss": 1.8043, + "step": 11945 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.3960847854614258, + "learning_rate": 7.311351289754425e-05, + "loss": 1.8434, + "step": 11946 + }, + { + "epoch": 3.666973603437692, + "grad_norm": 0.33369311690330505, + "learning_rate": 7.310910519362861e-05, + "loss": 1.7496, + "step": 11947 + }, + { + "epoch": 3.6672805402087167, + "grad_norm": 0.29852384328842163, + "learning_rate": 7.310469726133528e-05, + "loss": 1.858, + "step": 11948 + }, + { + "epoch": 3.667587476979742, + "grad_norm": 0.2610527276992798, + "learning_rate": 7.310028910070777e-05, + "loss": 1.7642, + "step": 11949 + }, + { + "epoch": 3.6678944137507674, + "grad_norm": 0.3606704771518707, + "learning_rate": 7.309588071178967e-05, + "loss": 1.845, + "step": 11950 + }, + { + "epoch": 3.6682013505217927, + "grad_norm": 0.3157273828983307, + "learning_rate": 7.309147209462454e-05, + "loss": 1.7864, + "step": 11951 + }, + { + "epoch": 3.6685082872928176, + "grad_norm": 0.23907925188541412, + "learning_rate": 
7.308706324925594e-05, + "loss": 1.8363, + "step": 11952 + }, + { + "epoch": 3.668815224063843, + "grad_norm": 0.3365088999271393, + "learning_rate": 7.308265417572747e-05, + "loss": 1.8755, + "step": 11953 + }, + { + "epoch": 3.669122160834868, + "grad_norm": 0.29404979944229126, + "learning_rate": 7.307824487408266e-05, + "loss": 1.8128, + "step": 11954 + }, + { + "epoch": 3.669429097605893, + "grad_norm": 0.2689574658870697, + "learning_rate": 7.307383534436511e-05, + "loss": 1.8072, + "step": 11955 + }, + { + "epoch": 3.6697360343769185, + "grad_norm": 0.28394198417663574, + "learning_rate": 7.306942558661841e-05, + "loss": 1.7919, + "step": 11956 + }, + { + "epoch": 3.6700429711479434, + "grad_norm": 0.2594783902168274, + "learning_rate": 7.306501560088612e-05, + "loss": 1.7467, + "step": 11957 + }, + { + "epoch": 3.6703499079189688, + "grad_norm": 0.24765191972255707, + "learning_rate": 7.30606053872118e-05, + "loss": 1.7876, + "step": 11958 + }, + { + "epoch": 3.6706568446899936, + "grad_norm": 0.22157172858715057, + "learning_rate": 7.305619494563909e-05, + "loss": 1.7802, + "step": 11959 + }, + { + "epoch": 3.670963781461019, + "grad_norm": 0.270151287317276, + "learning_rate": 7.305178427621155e-05, + "loss": 1.7723, + "step": 11960 + }, + { + "epoch": 3.6712707182320443, + "grad_norm": 0.3163939118385315, + "learning_rate": 7.304737337897277e-05, + "loss": 1.8488, + "step": 11961 + }, + { + "epoch": 3.671577655003069, + "grad_norm": 0.2605706453323364, + "learning_rate": 7.304296225396632e-05, + "loss": 1.7442, + "step": 11962 + }, + { + "epoch": 3.6718845917740945, + "grad_norm": 0.31179291009902954, + "learning_rate": 7.303855090123582e-05, + "loss": 1.831, + "step": 11963 + }, + { + "epoch": 3.6721915285451194, + "grad_norm": 0.33365359902381897, + "learning_rate": 7.303413932082483e-05, + "loss": 1.8376, + "step": 11964 + }, + { + "epoch": 3.6724984653161448, + "grad_norm": 0.2952130138874054, + "learning_rate": 7.302972751277701e-05, + "loss": 
1.7733, + "step": 11965 + }, + { + "epoch": 3.67280540208717, + "grad_norm": 0.24270877242088318, + "learning_rate": 7.302531547713592e-05, + "loss": 1.8367, + "step": 11966 + }, + { + "epoch": 3.6731123388581954, + "grad_norm": 0.34315919876098633, + "learning_rate": 7.302090321394517e-05, + "loss": 1.7901, + "step": 11967 + }, + { + "epoch": 3.6734192756292203, + "grad_norm": 0.33511418104171753, + "learning_rate": 7.301649072324834e-05, + "loss": 1.7929, + "step": 11968 + }, + { + "epoch": 3.6737262124002457, + "grad_norm": 0.22397933900356293, + "learning_rate": 7.301207800508907e-05, + "loss": 1.7533, + "step": 11969 + }, + { + "epoch": 3.6740331491712706, + "grad_norm": 0.2882738411426544, + "learning_rate": 7.300766505951095e-05, + "loss": 1.8071, + "step": 11970 + }, + { + "epoch": 3.674340085942296, + "grad_norm": 0.242112398147583, + "learning_rate": 7.300325188655761e-05, + "loss": 1.7739, + "step": 11971 + }, + { + "epoch": 3.674647022713321, + "grad_norm": 0.27754491567611694, + "learning_rate": 7.299883848627265e-05, + "loss": 1.8295, + "step": 11972 + }, + { + "epoch": 3.674953959484346, + "grad_norm": 0.2787899076938629, + "learning_rate": 7.29944248586997e-05, + "loss": 1.7682, + "step": 11973 + }, + { + "epoch": 3.6752608962553714, + "grad_norm": 0.24448934197425842, + "learning_rate": 7.299001100388234e-05, + "loss": 1.7826, + "step": 11974 + }, + { + "epoch": 3.6755678330263963, + "grad_norm": 0.37869495153427124, + "learning_rate": 7.298559692186421e-05, + "loss": 1.8582, + "step": 11975 + }, + { + "epoch": 3.6758747697974217, + "grad_norm": 0.3299996256828308, + "learning_rate": 7.298118261268897e-05, + "loss": 1.7716, + "step": 11976 + }, + { + "epoch": 3.676181706568447, + "grad_norm": 0.278891384601593, + "learning_rate": 7.29767680764002e-05, + "loss": 1.879, + "step": 11977 + }, + { + "epoch": 3.6764886433394723, + "grad_norm": 0.29326459765434265, + "learning_rate": 7.297235331304155e-05, + "loss": 1.804, + "step": 11978 + }, + { + 
"epoch": 3.6767955801104972, + "grad_norm": 0.2697092592716217, + "learning_rate": 7.296793832265663e-05, + "loss": 1.7842, + "step": 11979 + }, + { + "epoch": 3.6771025168815226, + "grad_norm": 0.3045118749141693, + "learning_rate": 7.296352310528909e-05, + "loss": 1.7959, + "step": 11980 + }, + { + "epoch": 3.6774094536525475, + "grad_norm": 0.278647780418396, + "learning_rate": 7.295910766098252e-05, + "loss": 1.7907, + "step": 11981 + }, + { + "epoch": 3.677716390423573, + "grad_norm": 0.2370275855064392, + "learning_rate": 7.295469198978063e-05, + "loss": 1.757, + "step": 11982 + }, + { + "epoch": 3.678023327194598, + "grad_norm": 0.3061021566390991, + "learning_rate": 7.295027609172702e-05, + "loss": 1.7927, + "step": 11983 + }, + { + "epoch": 3.678330263965623, + "grad_norm": 0.2844544053077698, + "learning_rate": 7.294585996686532e-05, + "loss": 1.7705, + "step": 11984 + }, + { + "epoch": 3.6786372007366483, + "grad_norm": 0.31121113896369934, + "learning_rate": 7.29414436152392e-05, + "loss": 1.783, + "step": 11985 + }, + { + "epoch": 3.6789441375076732, + "grad_norm": 0.2566785514354706, + "learning_rate": 7.293702703689225e-05, + "loss": 1.7781, + "step": 11986 + }, + { + "epoch": 3.6792510742786986, + "grad_norm": 0.22176961600780487, + "learning_rate": 7.293261023186818e-05, + "loss": 1.7302, + "step": 11987 + }, + { + "epoch": 3.679558011049724, + "grad_norm": 0.21547441184520721, + "learning_rate": 7.292819320021062e-05, + "loss": 1.7666, + "step": 11988 + }, + { + "epoch": 3.679864947820749, + "grad_norm": 0.26309674978256226, + "learning_rate": 7.29237759419632e-05, + "loss": 1.7817, + "step": 11989 + }, + { + "epoch": 3.680171884591774, + "grad_norm": 0.2558063864707947, + "learning_rate": 7.29193584571696e-05, + "loss": 1.8257, + "step": 11990 + }, + { + "epoch": 3.680478821362799, + "grad_norm": 0.24516844749450684, + "learning_rate": 7.291494074587347e-05, + "loss": 1.7803, + "step": 11991 + }, + { + "epoch": 3.6807857581338244, + "grad_norm": 
0.22891047596931458, + "learning_rate": 7.291052280811843e-05, + "loss": 1.7977, + "step": 11992 + }, + { + "epoch": 3.6810926949048497, + "grad_norm": 0.2776026129722595, + "learning_rate": 7.290610464394822e-05, + "loss": 1.8486, + "step": 11993 + }, + { + "epoch": 3.681399631675875, + "grad_norm": 0.31472426652908325, + "learning_rate": 7.290168625340644e-05, + "loss": 1.7841, + "step": 11994 + }, + { + "epoch": 3.6817065684469, + "grad_norm": 0.3459274470806122, + "learning_rate": 7.289726763653677e-05, + "loss": 1.7458, + "step": 11995 + }, + { + "epoch": 3.6820135052179253, + "grad_norm": 0.23645849525928497, + "learning_rate": 7.289284879338289e-05, + "loss": 1.781, + "step": 11996 + }, + { + "epoch": 3.68232044198895, + "grad_norm": 0.3257114291191101, + "learning_rate": 7.288842972398845e-05, + "loss": 1.8269, + "step": 11997 + }, + { + "epoch": 3.6826273787599755, + "grad_norm": 0.5450126528739929, + "learning_rate": 7.288401042839713e-05, + "loss": 1.8342, + "step": 11998 + }, + { + "epoch": 3.682934315531001, + "grad_norm": 0.5080512762069702, + "learning_rate": 7.287959090665262e-05, + "loss": 1.8097, + "step": 11999 + }, + { + "epoch": 3.6832412523020257, + "grad_norm": 0.3005252480506897, + "learning_rate": 7.287517115879858e-05, + "loss": 1.8271, + "step": 12000 + }, + { + "epoch": 3.683548189073051, + "grad_norm": 0.2760924994945526, + "learning_rate": 7.287075118487869e-05, + "loss": 1.8267, + "step": 12001 + }, + { + "epoch": 3.683855125844076, + "grad_norm": 0.3475865423679352, + "learning_rate": 7.286633098493663e-05, + "loss": 1.785, + "step": 12002 + }, + { + "epoch": 3.6841620626151013, + "grad_norm": 0.2905690670013428, + "learning_rate": 7.286191055901608e-05, + "loss": 1.8283, + "step": 12003 + }, + { + "epoch": 3.6844689993861266, + "grad_norm": 0.23666246235370636, + "learning_rate": 7.285748990716072e-05, + "loss": 1.7665, + "step": 12004 + }, + { + "epoch": 3.6847759361571515, + "grad_norm": 0.32329514622688293, + "learning_rate": 
7.285306902941427e-05, + "loss": 1.7267, + "step": 12005 + }, + { + "epoch": 3.685082872928177, + "grad_norm": 0.32345879077911377, + "learning_rate": 7.28486479258204e-05, + "loss": 1.7529, + "step": 12006 + }, + { + "epoch": 3.6853898096992017, + "grad_norm": 0.2727855443954468, + "learning_rate": 7.284422659642279e-05, + "loss": 1.8279, + "step": 12007 + }, + { + "epoch": 3.685696746470227, + "grad_norm": 0.37847277522087097, + "learning_rate": 7.283980504126513e-05, + "loss": 1.7809, + "step": 12008 + }, + { + "epoch": 3.6860036832412524, + "grad_norm": 0.44694215059280396, + "learning_rate": 7.283538326039113e-05, + "loss": 1.8184, + "step": 12009 + }, + { + "epoch": 3.6863106200122777, + "grad_norm": 0.2868261933326721, + "learning_rate": 7.28309612538445e-05, + "loss": 1.7461, + "step": 12010 + }, + { + "epoch": 3.6866175567833026, + "grad_norm": 0.2601351737976074, + "learning_rate": 7.282653902166894e-05, + "loss": 1.8011, + "step": 12011 + }, + { + "epoch": 3.686924493554328, + "grad_norm": 0.328185498714447, + "learning_rate": 7.282211656390813e-05, + "loss": 1.7934, + "step": 12012 + }, + { + "epoch": 3.687231430325353, + "grad_norm": 0.2712559103965759, + "learning_rate": 7.281769388060578e-05, + "loss": 1.7566, + "step": 12013 + }, + { + "epoch": 3.687538367096378, + "grad_norm": 0.2725805938243866, + "learning_rate": 7.281327097180562e-05, + "loss": 1.8024, + "step": 12014 + }, + { + "epoch": 3.6878453038674035, + "grad_norm": 0.37282630801200867, + "learning_rate": 7.280884783755133e-05, + "loss": 1.7624, + "step": 12015 + }, + { + "epoch": 3.6881522406384284, + "grad_norm": 0.36519256234169006, + "learning_rate": 7.280442447788664e-05, + "loss": 1.8691, + "step": 12016 + }, + { + "epoch": 3.6884591774094537, + "grad_norm": 0.21699345111846924, + "learning_rate": 7.280000089285528e-05, + "loss": 1.7308, + "step": 12017 + }, + { + "epoch": 3.6887661141804786, + "grad_norm": 0.3159945011138916, + "learning_rate": 7.279557708250094e-05, + "loss": 
1.8144, + "step": 12018 + }, + { + "epoch": 3.689073050951504, + "grad_norm": 0.2927449643611908, + "learning_rate": 7.279115304686735e-05, + "loss": 1.7746, + "step": 12019 + }, + { + "epoch": 3.6893799877225293, + "grad_norm": 0.279208242893219, + "learning_rate": 7.278672878599819e-05, + "loss": 1.7678, + "step": 12020 + }, + { + "epoch": 3.689686924493554, + "grad_norm": 0.40005648136138916, + "learning_rate": 7.278230429993725e-05, + "loss": 1.7876, + "step": 12021 + }, + { + "epoch": 3.6899938612645795, + "grad_norm": 0.3444392681121826, + "learning_rate": 7.277787958872824e-05, + "loss": 1.7591, + "step": 12022 + }, + { + "epoch": 3.6903007980356044, + "grad_norm": 0.21841467916965485, + "learning_rate": 7.277345465241485e-05, + "loss": 1.785, + "step": 12023 + }, + { + "epoch": 3.6906077348066297, + "grad_norm": 0.32463181018829346, + "learning_rate": 7.276902949104084e-05, + "loss": 1.8164, + "step": 12024 + }, + { + "epoch": 3.690914671577655, + "grad_norm": 0.36221247911453247, + "learning_rate": 7.276460410464994e-05, + "loss": 1.7529, + "step": 12025 + }, + { + "epoch": 3.6912216083486804, + "grad_norm": 0.24451927840709686, + "learning_rate": 7.276017849328588e-05, + "loss": 1.8031, + "step": 12026 + }, + { + "epoch": 3.6915285451197053, + "grad_norm": 0.3055694103240967, + "learning_rate": 7.275575265699239e-05, + "loss": 1.8158, + "step": 12027 + }, + { + "epoch": 3.6918354818907306, + "grad_norm": 0.4315083622932434, + "learning_rate": 7.27513265958132e-05, + "loss": 1.8322, + "step": 12028 + }, + { + "epoch": 3.6921424186617555, + "grad_norm": 0.3391095697879791, + "learning_rate": 7.274690030979209e-05, + "loss": 1.8214, + "step": 12029 + }, + { + "epoch": 3.692449355432781, + "grad_norm": 0.22714883089065552, + "learning_rate": 7.274247379897277e-05, + "loss": 1.7312, + "step": 12030 + }, + { + "epoch": 3.692756292203806, + "grad_norm": 0.24982765316963196, + "learning_rate": 7.273804706339899e-05, + "loss": 1.738, + "step": 12031 + }, + { + 
"epoch": 3.693063228974831, + "grad_norm": 0.32509860396385193, + "learning_rate": 7.273362010311451e-05, + "loss": 1.7773, + "step": 12032 + }, + { + "epoch": 3.6933701657458564, + "grad_norm": 0.2643086612224579, + "learning_rate": 7.272919291816307e-05, + "loss": 1.7545, + "step": 12033 + }, + { + "epoch": 3.6936771025168813, + "grad_norm": 0.2568800747394562, + "learning_rate": 7.272476550858842e-05, + "loss": 1.8055, + "step": 12034 + }, + { + "epoch": 3.6939840392879066, + "grad_norm": 0.27418240904808044, + "learning_rate": 7.272033787443433e-05, + "loss": 1.7769, + "step": 12035 + }, + { + "epoch": 3.694290976058932, + "grad_norm": 0.2459677755832672, + "learning_rate": 7.271591001574453e-05, + "loss": 1.7971, + "step": 12036 + }, + { + "epoch": 3.694597912829957, + "grad_norm": 0.22349393367767334, + "learning_rate": 7.27114819325628e-05, + "loss": 1.7791, + "step": 12037 + }, + { + "epoch": 3.694904849600982, + "grad_norm": 0.25321197509765625, + "learning_rate": 7.270705362493288e-05, + "loss": 1.7475, + "step": 12038 + }, + { + "epoch": 3.695211786372007, + "grad_norm": 0.2585916519165039, + "learning_rate": 7.270262509289855e-05, + "loss": 1.7801, + "step": 12039 + }, + { + "epoch": 3.6955187231430324, + "grad_norm": 0.2673574686050415, + "learning_rate": 7.269819633650359e-05, + "loss": 1.7578, + "step": 12040 + }, + { + "epoch": 3.6958256599140578, + "grad_norm": 0.2509469985961914, + "learning_rate": 7.269376735579175e-05, + "loss": 1.7994, + "step": 12041 + }, + { + "epoch": 3.696132596685083, + "grad_norm": 0.28527703881263733, + "learning_rate": 7.268933815080679e-05, + "loss": 1.7752, + "step": 12042 + }, + { + "epoch": 3.696439533456108, + "grad_norm": 0.22716578841209412, + "learning_rate": 7.268490872159248e-05, + "loss": 1.7186, + "step": 12043 + }, + { + "epoch": 3.6967464702271333, + "grad_norm": 0.24888403713703156, + "learning_rate": 7.268047906819262e-05, + "loss": 1.7882, + "step": 12044 + }, + { + "epoch": 3.697053406998158, + 
"grad_norm": 0.28976112604141235, + "learning_rate": 7.267604919065096e-05, + "loss": 1.7655, + "step": 12045 + }, + { + "epoch": 3.6973603437691835, + "grad_norm": 0.24668502807617188, + "learning_rate": 7.267161908901131e-05, + "loss": 1.8051, + "step": 12046 + }, + { + "epoch": 3.697667280540209, + "grad_norm": 0.2464776188135147, + "learning_rate": 7.266718876331742e-05, + "loss": 1.809, + "step": 12047 + }, + { + "epoch": 3.6979742173112338, + "grad_norm": 0.27648577094078064, + "learning_rate": 7.266275821361309e-05, + "loss": 1.7869, + "step": 12048 + }, + { + "epoch": 3.698281154082259, + "grad_norm": 0.26427242159843445, + "learning_rate": 7.26583274399421e-05, + "loss": 1.7681, + "step": 12049 + }, + { + "epoch": 3.698588090853284, + "grad_norm": 0.24595285952091217, + "learning_rate": 7.265389644234823e-05, + "loss": 1.7209, + "step": 12050 + }, + { + "epoch": 3.6988950276243093, + "grad_norm": 0.32514405250549316, + "learning_rate": 7.26494652208753e-05, + "loss": 1.8702, + "step": 12051 + }, + { + "epoch": 3.6992019643953347, + "grad_norm": 0.24512936174869537, + "learning_rate": 7.264503377556705e-05, + "loss": 1.784, + "step": 12052 + }, + { + "epoch": 3.69950890116636, + "grad_norm": 0.28698310256004333, + "learning_rate": 7.264060210646733e-05, + "loss": 1.905, + "step": 12053 + }, + { + "epoch": 3.699815837937385, + "grad_norm": 0.2995007336139679, + "learning_rate": 7.263617021361989e-05, + "loss": 1.7822, + "step": 12054 + }, + { + "epoch": 3.7001227747084102, + "grad_norm": 0.25869423151016235, + "learning_rate": 7.263173809706855e-05, + "loss": 1.7988, + "step": 12055 + }, + { + "epoch": 3.700429711479435, + "grad_norm": 0.350918710231781, + "learning_rate": 7.262730575685711e-05, + "loss": 1.9504, + "step": 12056 + }, + { + "epoch": 3.7007366482504604, + "grad_norm": 0.3407665491104126, + "learning_rate": 7.262287319302937e-05, + "loss": 1.8506, + "step": 12057 + }, + { + "epoch": 3.701043585021486, + "grad_norm": 0.3039441704750061, + 
"learning_rate": 7.261844040562915e-05, + "loss": 1.7841, + "step": 12058 + }, + { + "epoch": 3.7013505217925107, + "grad_norm": 0.23483428359031677, + "learning_rate": 7.261400739470023e-05, + "loss": 1.7899, + "step": 12059 + }, + { + "epoch": 3.701657458563536, + "grad_norm": 0.30779507756233215, + "learning_rate": 7.260957416028645e-05, + "loss": 1.8131, + "step": 12060 + }, + { + "epoch": 3.701964395334561, + "grad_norm": 0.29901376366615295, + "learning_rate": 7.26051407024316e-05, + "loss": 1.7861, + "step": 12061 + }, + { + "epoch": 3.7022713321055862, + "grad_norm": 0.30058762431144714, + "learning_rate": 7.260070702117949e-05, + "loss": 1.7485, + "step": 12062 + }, + { + "epoch": 3.7025782688766116, + "grad_norm": 0.24523651599884033, + "learning_rate": 7.259627311657396e-05, + "loss": 1.772, + "step": 12063 + }, + { + "epoch": 3.7028852056476365, + "grad_norm": 0.24375474452972412, + "learning_rate": 7.259183898865882e-05, + "loss": 1.7848, + "step": 12064 + }, + { + "epoch": 3.703192142418662, + "grad_norm": 0.2562403380870819, + "learning_rate": 7.258740463747788e-05, + "loss": 1.7447, + "step": 12065 + }, + { + "epoch": 3.7034990791896867, + "grad_norm": 0.265229195356369, + "learning_rate": 7.258297006307496e-05, + "loss": 1.8111, + "step": 12066 + }, + { + "epoch": 3.703806015960712, + "grad_norm": 0.2836552858352661, + "learning_rate": 7.25785352654939e-05, + "loss": 1.7952, + "step": 12067 + }, + { + "epoch": 3.7041129527317374, + "grad_norm": 0.3269572854042053, + "learning_rate": 7.257410024477852e-05, + "loss": 1.8604, + "step": 12068 + }, + { + "epoch": 3.7044198895027627, + "grad_norm": 0.2391490638256073, + "learning_rate": 7.256966500097264e-05, + "loss": 1.7417, + "step": 12069 + }, + { + "epoch": 3.7047268262737876, + "grad_norm": 0.2610675096511841, + "learning_rate": 7.256522953412011e-05, + "loss": 1.7712, + "step": 12070 + }, + { + "epoch": 3.705033763044813, + "grad_norm": 0.24954774975776672, + "learning_rate": 
7.256079384426477e-05, + "loss": 1.7506, + "step": 12071 + }, + { + "epoch": 3.705340699815838, + "grad_norm": 0.2603892385959625, + "learning_rate": 7.255635793145042e-05, + "loss": 1.8105, + "step": 12072 + }, + { + "epoch": 3.705647636586863, + "grad_norm": 0.32728591561317444, + "learning_rate": 7.255192179572092e-05, + "loss": 1.8448, + "step": 12073 + }, + { + "epoch": 3.7059545733578885, + "grad_norm": 0.4559340178966522, + "learning_rate": 7.254748543712013e-05, + "loss": 1.7232, + "step": 12074 + }, + { + "epoch": 3.7062615101289134, + "grad_norm": 0.36526206135749817, + "learning_rate": 7.254304885569186e-05, + "loss": 1.7874, + "step": 12075 + }, + { + "epoch": 3.7065684468999387, + "grad_norm": 0.21606837213039398, + "learning_rate": 7.253861205147998e-05, + "loss": 1.7266, + "step": 12076 + }, + { + "epoch": 3.7068753836709636, + "grad_norm": 0.3629585802555084, + "learning_rate": 7.253417502452831e-05, + "loss": 1.7722, + "step": 12077 + }, + { + "epoch": 3.707182320441989, + "grad_norm": 0.4224923551082611, + "learning_rate": 7.252973777488072e-05, + "loss": 1.7369, + "step": 12078 + }, + { + "epoch": 3.7074892572130143, + "grad_norm": 0.32245784997940063, + "learning_rate": 7.252530030258106e-05, + "loss": 1.7836, + "step": 12079 + }, + { + "epoch": 3.707796193984039, + "grad_norm": 0.29909494519233704, + "learning_rate": 7.252086260767317e-05, + "loss": 1.8718, + "step": 12080 + }, + { + "epoch": 3.7081031307550645, + "grad_norm": 0.21995799243450165, + "learning_rate": 7.251642469020093e-05, + "loss": 1.7103, + "step": 12081 + }, + { + "epoch": 3.7084100675260894, + "grad_norm": 0.2737572193145752, + "learning_rate": 7.251198655020818e-05, + "loss": 1.7787, + "step": 12082 + }, + { + "epoch": 3.7087170042971147, + "grad_norm": 0.22417058050632477, + "learning_rate": 7.250754818773879e-05, + "loss": 1.7782, + "step": 12083 + }, + { + "epoch": 3.70902394106814, + "grad_norm": 0.3350662887096405, + "learning_rate": 7.25031096028366e-05, + "loss": 
1.8193, + "step": 12084 + }, + { + "epoch": 3.7093308778391654, + "grad_norm": 0.3199101686477661, + "learning_rate": 7.24986707955455e-05, + "loss": 1.831, + "step": 12085 + }, + { + "epoch": 3.7096378146101903, + "grad_norm": 0.2513977289199829, + "learning_rate": 7.249423176590936e-05, + "loss": 1.8288, + "step": 12086 + }, + { + "epoch": 3.7099447513812156, + "grad_norm": 0.30411866307258606, + "learning_rate": 7.248979251397203e-05, + "loss": 1.7837, + "step": 12087 + }, + { + "epoch": 3.7102516881522405, + "grad_norm": 0.30755332112312317, + "learning_rate": 7.248535303977738e-05, + "loss": 1.8016, + "step": 12088 + }, + { + "epoch": 3.710558624923266, + "grad_norm": 0.25746986269950867, + "learning_rate": 7.248091334336929e-05, + "loss": 1.8014, + "step": 12089 + }, + { + "epoch": 3.710865561694291, + "grad_norm": 0.3327447772026062, + "learning_rate": 7.247647342479164e-05, + "loss": 1.752, + "step": 12090 + }, + { + "epoch": 3.711172498465316, + "grad_norm": 0.3101816475391388, + "learning_rate": 7.247203328408832e-05, + "loss": 1.7867, + "step": 12091 + }, + { + "epoch": 3.7114794352363414, + "grad_norm": 0.2168906182050705, + "learning_rate": 7.246759292130318e-05, + "loss": 1.7452, + "step": 12092 + }, + { + "epoch": 3.7117863720073663, + "grad_norm": 0.34260258078575134, + "learning_rate": 7.246315233648013e-05, + "loss": 1.8156, + "step": 12093 + }, + { + "epoch": 3.7120933087783916, + "grad_norm": 0.2730714976787567, + "learning_rate": 7.245871152966303e-05, + "loss": 1.7429, + "step": 12094 + }, + { + "epoch": 3.712400245549417, + "grad_norm": 0.2560936212539673, + "learning_rate": 7.245427050089578e-05, + "loss": 1.7969, + "step": 12095 + }, + { + "epoch": 3.712707182320442, + "grad_norm": 0.27510303258895874, + "learning_rate": 7.244982925022228e-05, + "loss": 1.7981, + "step": 12096 + }, + { + "epoch": 3.713014119091467, + "grad_norm": 0.29171642661094666, + "learning_rate": 7.24453877776864e-05, + "loss": 1.7913, + "step": 12097 + }, + { + 
"epoch": 3.713321055862492, + "grad_norm": 0.26431843638420105, + "learning_rate": 7.244094608333206e-05, + "loss": 1.8262, + "step": 12098 + }, + { + "epoch": 3.7136279926335174, + "grad_norm": 0.30747905373573303, + "learning_rate": 7.243650416720311e-05, + "loss": 1.7951, + "step": 12099 + }, + { + "epoch": 3.7139349294045427, + "grad_norm": 0.346443772315979, + "learning_rate": 7.24320620293435e-05, + "loss": 1.7677, + "step": 12100 + }, + { + "epoch": 3.714241866175568, + "grad_norm": 0.2910652458667755, + "learning_rate": 7.242761966979709e-05, + "loss": 1.7887, + "step": 12101 + }, + { + "epoch": 3.714548802946593, + "grad_norm": 0.22342006862163544, + "learning_rate": 7.24231770886078e-05, + "loss": 1.7678, + "step": 12102 + }, + { + "epoch": 3.7148557397176183, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.241873428581954e-05, + "loss": 1.7436, + "step": 12103 + }, + { + "epoch": 3.715162676488643, + "grad_norm": 0.23542635142803192, + "learning_rate": 7.24142912614762e-05, + "loss": 1.7942, + "step": 12104 + }, + { + "epoch": 3.7154696132596685, + "grad_norm": 0.22476384043693542, + "learning_rate": 7.240984801562169e-05, + "loss": 1.8235, + "step": 12105 + }, + { + "epoch": 3.715776550030694, + "grad_norm": 0.25123465061187744, + "learning_rate": 7.240540454829992e-05, + "loss": 1.8112, + "step": 12106 + }, + { + "epoch": 3.7160834868017187, + "grad_norm": 0.27230000495910645, + "learning_rate": 7.240096085955483e-05, + "loss": 1.8312, + "step": 12107 + }, + { + "epoch": 3.716390423572744, + "grad_norm": 0.2722976803779602, + "learning_rate": 7.239651694943031e-05, + "loss": 1.8368, + "step": 12108 + }, + { + "epoch": 3.716697360343769, + "grad_norm": 0.264138400554657, + "learning_rate": 7.239207281797028e-05, + "loss": 1.8206, + "step": 12109 + }, + { + "epoch": 3.7170042971147943, + "grad_norm": 0.28813931345939636, + "learning_rate": 7.238762846521866e-05, + "loss": 1.7391, + "step": 12110 + }, + { + "epoch": 3.7173112338858196, + 
"grad_norm": 0.2319631576538086, + "learning_rate": 7.238318389121939e-05, + "loss": 1.7574, + "step": 12111 + }, + { + "epoch": 3.717618170656845, + "grad_norm": 0.2507809102535248, + "learning_rate": 7.237873909601635e-05, + "loss": 1.7359, + "step": 12112 + }, + { + "epoch": 3.71792510742787, + "grad_norm": 0.2717304825782776, + "learning_rate": 7.237429407965351e-05, + "loss": 1.774, + "step": 12113 + }, + { + "epoch": 3.718232044198895, + "grad_norm": 0.2619280517101288, + "learning_rate": 7.236984884217478e-05, + "loss": 1.8083, + "step": 12114 + }, + { + "epoch": 3.71853898096992, + "grad_norm": 0.22268806397914886, + "learning_rate": 7.23654033836241e-05, + "loss": 1.7436, + "step": 12115 + }, + { + "epoch": 3.7188459177409454, + "grad_norm": 0.2341407984495163, + "learning_rate": 7.236095770404539e-05, + "loss": 1.7807, + "step": 12116 + }, + { + "epoch": 3.7191528545119708, + "grad_norm": 0.23519712686538696, + "learning_rate": 7.235651180348258e-05, + "loss": 1.8051, + "step": 12117 + }, + { + "epoch": 3.7194597912829956, + "grad_norm": 0.2391074150800705, + "learning_rate": 7.235206568197963e-05, + "loss": 1.8377, + "step": 12118 + }, + { + "epoch": 3.719766728054021, + "grad_norm": 0.26821592450141907, + "learning_rate": 7.234761933958045e-05, + "loss": 1.8586, + "step": 12119 + }, + { + "epoch": 3.720073664825046, + "grad_norm": 0.24971134960651398, + "learning_rate": 7.234317277632902e-05, + "loss": 1.8404, + "step": 12120 + }, + { + "epoch": 3.720380601596071, + "grad_norm": 0.20817919075489044, + "learning_rate": 7.233872599226926e-05, + "loss": 1.7204, + "step": 12121 + }, + { + "epoch": 3.7206875383670965, + "grad_norm": 0.29301291704177856, + "learning_rate": 7.233427898744509e-05, + "loss": 1.8528, + "step": 12122 + }, + { + "epoch": 3.7209944751381214, + "grad_norm": 0.22214651107788086, + "learning_rate": 7.23298317619005e-05, + "loss": 1.748, + "step": 12123 + }, + { + "epoch": 3.7213014119091468, + "grad_norm": 0.2511044442653656, + 
"learning_rate": 7.232538431567941e-05, + "loss": 1.8146, + "step": 12124 + }, + { + "epoch": 3.7216083486801717, + "grad_norm": 0.26976367831230164, + "learning_rate": 7.232093664882581e-05, + "loss": 1.8483, + "step": 12125 + }, + { + "epoch": 3.721915285451197, + "grad_norm": 0.2538089156150818, + "learning_rate": 7.231648876138361e-05, + "loss": 1.8097, + "step": 12126 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.2353016883134842, + "learning_rate": 7.231204065339682e-05, + "loss": 1.737, + "step": 12127 + }, + { + "epoch": 3.7225291589932477, + "grad_norm": 0.3205147981643677, + "learning_rate": 7.230759232490935e-05, + "loss": 1.8116, + "step": 12128 + }, + { + "epoch": 3.7228360957642725, + "grad_norm": 0.39056599140167236, + "learning_rate": 7.230314377596516e-05, + "loss": 1.7785, + "step": 12129 + }, + { + "epoch": 3.723143032535298, + "grad_norm": 0.3846863806247711, + "learning_rate": 7.229869500660825e-05, + "loss": 1.738, + "step": 12130 + }, + { + "epoch": 3.7234499693063228, + "grad_norm": 0.24412120878696442, + "learning_rate": 7.229424601688256e-05, + "loss": 1.7351, + "step": 12131 + }, + { + "epoch": 3.723756906077348, + "grad_norm": 0.2978009581565857, + "learning_rate": 7.228979680683206e-05, + "loss": 1.8037, + "step": 12132 + }, + { + "epoch": 3.7240638428483734, + "grad_norm": 0.33787262439727783, + "learning_rate": 7.228534737650074e-05, + "loss": 1.8421, + "step": 12133 + }, + { + "epoch": 3.7243707796193983, + "grad_norm": 0.2536921203136444, + "learning_rate": 7.228089772593254e-05, + "loss": 1.7472, + "step": 12134 + }, + { + "epoch": 3.7246777163904237, + "grad_norm": 0.24103601276874542, + "learning_rate": 7.227644785517144e-05, + "loss": 1.8011, + "step": 12135 + }, + { + "epoch": 3.7249846531614486, + "grad_norm": 0.3653033375740051, + "learning_rate": 7.227199776426146e-05, + "loss": 1.8018, + "step": 12136 + }, + { + "epoch": 3.725291589932474, + "grad_norm": 0.35728752613067627, + "learning_rate": 
7.226754745324652e-05, + "loss": 1.7684, + "step": 12137 + }, + { + "epoch": 3.7255985267034992, + "grad_norm": 0.262018620967865, + "learning_rate": 7.226309692217063e-05, + "loss": 1.8124, + "step": 12138 + }, + { + "epoch": 3.725905463474524, + "grad_norm": 0.3467118442058563, + "learning_rate": 7.225864617107776e-05, + "loss": 1.8761, + "step": 12139 + }, + { + "epoch": 3.7262124002455494, + "grad_norm": 0.4365626871585846, + "learning_rate": 7.22541952000119e-05, + "loss": 1.7159, + "step": 12140 + }, + { + "epoch": 3.7265193370165743, + "grad_norm": 0.2819811999797821, + "learning_rate": 7.224974400901705e-05, + "loss": 1.8051, + "step": 12141 + }, + { + "epoch": 3.7268262737875997, + "grad_norm": 0.39062437415122986, + "learning_rate": 7.224529259813719e-05, + "loss": 1.8517, + "step": 12142 + }, + { + "epoch": 3.727133210558625, + "grad_norm": 0.4383927285671234, + "learning_rate": 7.22408409674163e-05, + "loss": 1.8295, + "step": 12143 + }, + { + "epoch": 3.7274401473296503, + "grad_norm": 0.3043094575405121, + "learning_rate": 7.223638911689839e-05, + "loss": 1.7653, + "step": 12144 + }, + { + "epoch": 3.7277470841006752, + "grad_norm": 0.25198984146118164, + "learning_rate": 7.223193704662746e-05, + "loss": 1.7561, + "step": 12145 + }, + { + "epoch": 3.7280540208717006, + "grad_norm": 0.353565514087677, + "learning_rate": 7.222748475664749e-05, + "loss": 1.8077, + "step": 12146 + }, + { + "epoch": 3.7283609576427255, + "grad_norm": 0.39757224917411804, + "learning_rate": 7.222303224700248e-05, + "loss": 1.7622, + "step": 12147 + }, + { + "epoch": 3.728667894413751, + "grad_norm": 0.35595703125, + "learning_rate": 7.221857951773644e-05, + "loss": 1.8436, + "step": 12148 + }, + { + "epoch": 3.728974831184776, + "grad_norm": 0.2469715029001236, + "learning_rate": 7.221412656889338e-05, + "loss": 1.8531, + "step": 12149 + }, + { + "epoch": 3.729281767955801, + "grad_norm": 0.35324424505233765, + "learning_rate": 7.22096734005173e-05, + "loss": 1.7361, + 
"step": 12150 + }, + { + "epoch": 3.7295887047268264, + "grad_norm": 0.3783365488052368, + "learning_rate": 7.220522001265223e-05, + "loss": 1.7459, + "step": 12151 + }, + { + "epoch": 3.7298956414978512, + "grad_norm": 0.27526360750198364, + "learning_rate": 7.220076640534212e-05, + "loss": 1.8867, + "step": 12152 + }, + { + "epoch": 3.7302025782688766, + "grad_norm": 0.30863118171691895, + "learning_rate": 7.219631257863105e-05, + "loss": 1.7363, + "step": 12153 + }, + { + "epoch": 3.730509515039902, + "grad_norm": 0.38505107164382935, + "learning_rate": 7.219185853256301e-05, + "loss": 1.764, + "step": 12154 + }, + { + "epoch": 3.730816451810927, + "grad_norm": 0.2925978899002075, + "learning_rate": 7.218740426718202e-05, + "loss": 1.7693, + "step": 12155 + }, + { + "epoch": 3.731123388581952, + "grad_norm": 0.24510078132152557, + "learning_rate": 7.218294978253209e-05, + "loss": 1.8089, + "step": 12156 + }, + { + "epoch": 3.731430325352977, + "grad_norm": 0.33029109239578247, + "learning_rate": 7.217849507865724e-05, + "loss": 1.6885, + "step": 12157 + }, + { + "epoch": 3.7317372621240024, + "grad_norm": 0.333970308303833, + "learning_rate": 7.217404015560149e-05, + "loss": 1.8132, + "step": 12158 + }, + { + "epoch": 3.7320441988950277, + "grad_norm": 0.2467660754919052, + "learning_rate": 7.216958501340891e-05, + "loss": 1.8021, + "step": 12159 + }, + { + "epoch": 3.732351135666053, + "grad_norm": 0.2701449990272522, + "learning_rate": 7.216512965212348e-05, + "loss": 1.7006, + "step": 12160 + }, + { + "epoch": 3.732658072437078, + "grad_norm": 0.2784138023853302, + "learning_rate": 7.216067407178926e-05, + "loss": 1.7616, + "step": 12161 + }, + { + "epoch": 3.7329650092081033, + "grad_norm": 0.2082870900630951, + "learning_rate": 7.215621827245026e-05, + "loss": 1.7391, + "step": 12162 + }, + { + "epoch": 3.733271945979128, + "grad_norm": 0.2477869987487793, + "learning_rate": 7.215176225415053e-05, + "loss": 1.7761, + "step": 12163 + }, + { + "epoch": 
3.7335788827501535, + "grad_norm": 0.28395572304725647, + "learning_rate": 7.21473060169341e-05, + "loss": 1.8181, + "step": 12164 + }, + { + "epoch": 3.733885819521179, + "grad_norm": 0.20430058240890503, + "learning_rate": 7.2142849560845e-05, + "loss": 1.7035, + "step": 12165 + }, + { + "epoch": 3.7341927562922037, + "grad_norm": 0.30061420798301697, + "learning_rate": 7.21383928859273e-05, + "loss": 1.7703, + "step": 12166 + }, + { + "epoch": 3.734499693063229, + "grad_norm": 0.33865803480148315, + "learning_rate": 7.2133935992225e-05, + "loss": 1.8204, + "step": 12167 + }, + { + "epoch": 3.734806629834254, + "grad_norm": 0.29172980785369873, + "learning_rate": 7.212947887978221e-05, + "loss": 1.739, + "step": 12168 + }, + { + "epoch": 3.7351135666052793, + "grad_norm": 0.2799396812915802, + "learning_rate": 7.212502154864291e-05, + "loss": 1.8503, + "step": 12169 + }, + { + "epoch": 3.7354205033763046, + "grad_norm": 0.2945539355278015, + "learning_rate": 7.212056399885118e-05, + "loss": 1.7523, + "step": 12170 + }, + { + "epoch": 3.7357274401473295, + "grad_norm": 0.2395290732383728, + "learning_rate": 7.211610623045108e-05, + "loss": 1.7728, + "step": 12171 + }, + { + "epoch": 3.736034376918355, + "grad_norm": 0.24369286000728607, + "learning_rate": 7.211164824348667e-05, + "loss": 1.7725, + "step": 12172 + }, + { + "epoch": 3.7363413136893797, + "grad_norm": 0.3272435963153839, + "learning_rate": 7.210719003800197e-05, + "loss": 1.8531, + "step": 12173 + }, + { + "epoch": 3.736648250460405, + "grad_norm": 0.23954182863235474, + "learning_rate": 7.210273161404107e-05, + "loss": 1.7807, + "step": 12174 + }, + { + "epoch": 3.7369551872314304, + "grad_norm": 0.24547603726387024, + "learning_rate": 7.209827297164801e-05, + "loss": 1.8481, + "step": 12175 + }, + { + "epoch": 3.7372621240024557, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.209381411086687e-05, + "loss": 1.7496, + "step": 12176 + }, + { + "epoch": 3.7375690607734806, + "grad_norm": 
0.22948235273361206, + "learning_rate": 7.208935503174172e-05, + "loss": 1.7681, + "step": 12177 + }, + { + "epoch": 3.737875997544506, + "grad_norm": 0.2697654664516449, + "learning_rate": 7.20848957343166e-05, + "loss": 1.789, + "step": 12178 + }, + { + "epoch": 3.738182934315531, + "grad_norm": 0.235344797372818, + "learning_rate": 7.208043621863562e-05, + "loss": 1.8309, + "step": 12179 + }, + { + "epoch": 3.738489871086556, + "grad_norm": 0.2688879072666168, + "learning_rate": 7.20759764847428e-05, + "loss": 1.7898, + "step": 12180 + }, + { + "epoch": 3.7387968078575815, + "grad_norm": 0.26818978786468506, + "learning_rate": 7.207151653268226e-05, + "loss": 1.7882, + "step": 12181 + }, + { + "epoch": 3.7391037446286064, + "grad_norm": 0.2612875998020172, + "learning_rate": 7.206705636249804e-05, + "loss": 1.7352, + "step": 12182 + }, + { + "epoch": 3.7394106813996317, + "grad_norm": 0.22547565400600433, + "learning_rate": 7.206259597423425e-05, + "loss": 1.733, + "step": 12183 + }, + { + "epoch": 3.7397176181706566, + "grad_norm": 0.24645474553108215, + "learning_rate": 7.205813536793495e-05, + "loss": 1.8064, + "step": 12184 + }, + { + "epoch": 3.740024554941682, + "grad_norm": 0.25879329442977905, + "learning_rate": 7.205367454364424e-05, + "loss": 1.8134, + "step": 12185 + }, + { + "epoch": 3.7403314917127073, + "grad_norm": 0.22420097887516022, + "learning_rate": 7.204921350140617e-05, + "loss": 1.7819, + "step": 12186 + }, + { + "epoch": 3.7406384284837326, + "grad_norm": 0.2569858431816101, + "learning_rate": 7.204475224126487e-05, + "loss": 1.784, + "step": 12187 + }, + { + "epoch": 3.7409453652547575, + "grad_norm": 0.23769912123680115, + "learning_rate": 7.20402907632644e-05, + "loss": 1.7853, + "step": 12188 + }, + { + "epoch": 3.741252302025783, + "grad_norm": 0.26935988664627075, + "learning_rate": 7.203582906744885e-05, + "loss": 1.806, + "step": 12189 + }, + { + "epoch": 3.7415592387968077, + "grad_norm": 0.2544274628162384, + "learning_rate": 
7.203136715386233e-05, + "loss": 1.7988, + "step": 12190 + }, + { + "epoch": 3.741866175567833, + "grad_norm": 0.22665882110595703, + "learning_rate": 7.202690502254892e-05, + "loss": 1.7798, + "step": 12191 + }, + { + "epoch": 3.7421731123388584, + "grad_norm": 0.24512888491153717, + "learning_rate": 7.202244267355273e-05, + "loss": 1.816, + "step": 12192 + }, + { + "epoch": 3.7424800491098833, + "grad_norm": 0.2408553808927536, + "learning_rate": 7.201798010691785e-05, + "loss": 1.7417, + "step": 12193 + }, + { + "epoch": 3.7427869858809086, + "grad_norm": 0.23142600059509277, + "learning_rate": 7.201351732268838e-05, + "loss": 1.7771, + "step": 12194 + }, + { + "epoch": 3.7430939226519335, + "grad_norm": 0.245071142911911, + "learning_rate": 7.200905432090844e-05, + "loss": 1.7556, + "step": 12195 + }, + { + "epoch": 3.743400859422959, + "grad_norm": 0.2623934745788574, + "learning_rate": 7.200459110162211e-05, + "loss": 1.8042, + "step": 12196 + }, + { + "epoch": 3.743707796193984, + "grad_norm": 0.2531217038631439, + "learning_rate": 7.200012766487353e-05, + "loss": 1.7709, + "step": 12197 + }, + { + "epoch": 3.744014732965009, + "grad_norm": 0.23839864134788513, + "learning_rate": 7.19956640107068e-05, + "loss": 1.8202, + "step": 12198 + }, + { + "epoch": 3.7443216697360344, + "grad_norm": 0.2342260777950287, + "learning_rate": 7.1991200139166e-05, + "loss": 1.827, + "step": 12199 + }, + { + "epoch": 3.7446286065070593, + "grad_norm": 0.25511276721954346, + "learning_rate": 7.198673605029528e-05, + "loss": 1.7766, + "step": 12200 + }, + { + "epoch": 3.7449355432780846, + "grad_norm": 0.27601274847984314, + "learning_rate": 7.198227174413876e-05, + "loss": 1.7716, + "step": 12201 + }, + { + "epoch": 3.74524248004911, + "grad_norm": 0.3027385175228119, + "learning_rate": 7.197780722074056e-05, + "loss": 1.8007, + "step": 12202 + }, + { + "epoch": 3.7455494168201353, + "grad_norm": 0.31242382526397705, + "learning_rate": 7.197334248014477e-05, + "loss": 1.8089, 
+ "step": 12203 + }, + { + "epoch": 3.74585635359116, + "grad_norm": 0.3673859238624573, + "learning_rate": 7.196887752239551e-05, + "loss": 1.8017, + "step": 12204 + }, + { + "epoch": 3.7461632903621855, + "grad_norm": 0.3152726888656616, + "learning_rate": 7.196441234753695e-05, + "loss": 1.7108, + "step": 12205 + }, + { + "epoch": 3.7464702271332104, + "grad_norm": 0.2606927156448364, + "learning_rate": 7.195994695561319e-05, + "loss": 1.8066, + "step": 12206 + }, + { + "epoch": 3.7467771639042358, + "grad_norm": 0.37624871730804443, + "learning_rate": 7.195548134666836e-05, + "loss": 1.725, + "step": 12207 + }, + { + "epoch": 3.747084100675261, + "grad_norm": 0.4138187766075134, + "learning_rate": 7.195101552074658e-05, + "loss": 1.7838, + "step": 12208 + }, + { + "epoch": 3.747391037446286, + "grad_norm": 0.3668459951877594, + "learning_rate": 7.194654947789204e-05, + "loss": 1.7575, + "step": 12209 + }, + { + "epoch": 3.7476979742173113, + "grad_norm": 0.27947792410850525, + "learning_rate": 7.19420832181488e-05, + "loss": 1.792, + "step": 12210 + }, + { + "epoch": 3.748004910988336, + "grad_norm": 0.2507692873477936, + "learning_rate": 7.193761674156103e-05, + "loss": 1.7752, + "step": 12211 + }, + { + "epoch": 3.7483118477593615, + "grad_norm": 0.3209949731826782, + "learning_rate": 7.193315004817289e-05, + "loss": 1.8491, + "step": 12212 + }, + { + "epoch": 3.748618784530387, + "grad_norm": 0.32883042097091675, + "learning_rate": 7.192868313802849e-05, + "loss": 1.8135, + "step": 12213 + }, + { + "epoch": 3.7489257213014118, + "grad_norm": 0.2450616955757141, + "learning_rate": 7.192421601117201e-05, + "loss": 1.7722, + "step": 12214 + }, + { + "epoch": 3.749232658072437, + "grad_norm": 0.2545110285282135, + "learning_rate": 7.191974866764757e-05, + "loss": 1.7866, + "step": 12215 + }, + { + "epoch": 3.749539594843462, + "grad_norm": 0.264017790555954, + "learning_rate": 7.191528110749932e-05, + "loss": 1.778, + "step": 12216 + }, + { + "epoch": 
3.7498465316144873, + "grad_norm": 0.3156309425830841, + "learning_rate": 7.191081333077142e-05, + "loss": 1.7917, + "step": 12217 + }, + { + "epoch": 3.7501534683855127, + "grad_norm": 0.3578774631023407, + "learning_rate": 7.190634533750802e-05, + "loss": 1.8468, + "step": 12218 + }, + { + "epoch": 3.750460405156538, + "grad_norm": 0.30735981464385986, + "learning_rate": 7.19018771277533e-05, + "loss": 1.7502, + "step": 12219 + }, + { + "epoch": 3.750767341927563, + "grad_norm": 0.22870220243930817, + "learning_rate": 7.189740870155135e-05, + "loss": 1.7686, + "step": 12220 + }, + { + "epoch": 3.7510742786985882, + "grad_norm": 0.30297720432281494, + "learning_rate": 7.18929400589464e-05, + "loss": 1.826, + "step": 12221 + }, + { + "epoch": 3.751381215469613, + "grad_norm": 0.2735389173030853, + "learning_rate": 7.188847119998257e-05, + "loss": 1.8142, + "step": 12222 + }, + { + "epoch": 3.7516881522406385, + "grad_norm": 0.2823885679244995, + "learning_rate": 7.188400212470405e-05, + "loss": 1.8028, + "step": 12223 + }, + { + "epoch": 3.751995089011664, + "grad_norm": 0.4184139370918274, + "learning_rate": 7.187953283315499e-05, + "loss": 1.8467, + "step": 12224 + }, + { + "epoch": 3.7523020257826887, + "grad_norm": 0.3559226095676422, + "learning_rate": 7.187506332537957e-05, + "loss": 1.7416, + "step": 12225 + }, + { + "epoch": 3.752608962553714, + "grad_norm": 0.26055800914764404, + "learning_rate": 7.187059360142194e-05, + "loss": 1.8309, + "step": 12226 + }, + { + "epoch": 3.752915899324739, + "grad_norm": 0.28032660484313965, + "learning_rate": 7.186612366132629e-05, + "loss": 1.7926, + "step": 12227 + }, + { + "epoch": 3.7532228360957642, + "grad_norm": 0.26229965686798096, + "learning_rate": 7.18616535051368e-05, + "loss": 1.7368, + "step": 12228 + }, + { + "epoch": 3.7535297728667896, + "grad_norm": 0.2779417634010315, + "learning_rate": 7.185718313289763e-05, + "loss": 1.8418, + "step": 12229 + }, + { + "epoch": 3.7538367096378145, + "grad_norm": 
0.26164770126342773, + "learning_rate": 7.185271254465295e-05, + "loss": 1.7511, + "step": 12230 + }, + { + "epoch": 3.75414364640884, + "grad_norm": 0.30725157260894775, + "learning_rate": 7.184824174044698e-05, + "loss": 1.7661, + "step": 12231 + }, + { + "epoch": 3.7544505831798647, + "grad_norm": 0.33111417293548584, + "learning_rate": 7.184377072032386e-05, + "loss": 1.7341, + "step": 12232 + }, + { + "epoch": 3.75475751995089, + "grad_norm": 0.23978343605995178, + "learning_rate": 7.183929948432779e-05, + "loss": 1.7151, + "step": 12233 + }, + { + "epoch": 3.7550644567219154, + "grad_norm": 0.3057664632797241, + "learning_rate": 7.183482803250299e-05, + "loss": 1.8446, + "step": 12234 + }, + { + "epoch": 3.7553713934929407, + "grad_norm": 0.2629055678844452, + "learning_rate": 7.18303563648936e-05, + "loss": 1.7415, + "step": 12235 + }, + { + "epoch": 3.7556783302639656, + "grad_norm": 0.22703498601913452, + "learning_rate": 7.182588448154386e-05, + "loss": 1.8188, + "step": 12236 + }, + { + "epoch": 3.755985267034991, + "grad_norm": 0.3014034032821655, + "learning_rate": 7.182141238249792e-05, + "loss": 1.8634, + "step": 12237 + }, + { + "epoch": 3.756292203806016, + "grad_norm": 0.28859084844589233, + "learning_rate": 7.181694006779998e-05, + "loss": 1.7509, + "step": 12238 + }, + { + "epoch": 3.756599140577041, + "grad_norm": 0.293720543384552, + "learning_rate": 7.181246753749426e-05, + "loss": 1.777, + "step": 12239 + }, + { + "epoch": 3.7569060773480665, + "grad_norm": 0.2374580055475235, + "learning_rate": 7.180799479162496e-05, + "loss": 1.7492, + "step": 12240 + }, + { + "epoch": 3.7572130141190914, + "grad_norm": 0.30106452107429504, + "learning_rate": 7.180352183023627e-05, + "loss": 1.7538, + "step": 12241 + }, + { + "epoch": 3.7575199508901167, + "grad_norm": 0.3504682183265686, + "learning_rate": 7.179904865337238e-05, + "loss": 1.7477, + "step": 12242 + }, + { + "epoch": 3.7578268876611416, + "grad_norm": 0.2901679575443268, + "learning_rate": 
7.179457526107754e-05, + "loss": 1.9412, + "step": 12243 + }, + { + "epoch": 3.758133824432167, + "grad_norm": 0.37690606713294983, + "learning_rate": 7.179010165339591e-05, + "loss": 1.8222, + "step": 12244 + }, + { + "epoch": 3.7584407612031923, + "grad_norm": 0.45126965641975403, + "learning_rate": 7.178562783037172e-05, + "loss": 1.8563, + "step": 12245 + }, + { + "epoch": 3.758747697974217, + "grad_norm": 0.2747548818588257, + "learning_rate": 7.178115379204921e-05, + "loss": 1.7179, + "step": 12246 + }, + { + "epoch": 3.7590546347452425, + "grad_norm": 0.43243977427482605, + "learning_rate": 7.177667953847257e-05, + "loss": 1.8157, + "step": 12247 + }, + { + "epoch": 3.7593615715162674, + "grad_norm": 0.529448390007019, + "learning_rate": 7.177220506968602e-05, + "loss": 1.8113, + "step": 12248 + }, + { + "epoch": 3.7596685082872927, + "grad_norm": 0.3099314868450165, + "learning_rate": 7.176773038573377e-05, + "loss": 1.7833, + "step": 12249 + }, + { + "epoch": 3.759975445058318, + "grad_norm": 0.3111872375011444, + "learning_rate": 7.176325548666004e-05, + "loss": 1.7965, + "step": 12250 + }, + { + "epoch": 3.7602823818293434, + "grad_norm": 0.38437551259994507, + "learning_rate": 7.175878037250907e-05, + "loss": 1.7822, + "step": 12251 + }, + { + "epoch": 3.7605893186003683, + "grad_norm": 0.33643704652786255, + "learning_rate": 7.175430504332509e-05, + "loss": 1.7839, + "step": 12252 + }, + { + "epoch": 3.7608962553713936, + "grad_norm": 0.24705304205417633, + "learning_rate": 7.174982949915232e-05, + "loss": 1.8302, + "step": 12253 + }, + { + "epoch": 3.7612031921424185, + "grad_norm": 0.3615458309650421, + "learning_rate": 7.174535374003497e-05, + "loss": 1.7963, + "step": 12254 + }, + { + "epoch": 3.761510128913444, + "grad_norm": 0.36486589908599854, + "learning_rate": 7.17408777660173e-05, + "loss": 1.7933, + "step": 12255 + }, + { + "epoch": 3.761817065684469, + "grad_norm": 0.2566867172718048, + "learning_rate": 7.173640157714352e-05, + "loss": 
1.7254, + "step": 12256 + }, + { + "epoch": 3.762124002455494, + "grad_norm": 0.2602523863315582, + "learning_rate": 7.17319251734579e-05, + "loss": 1.7357, + "step": 12257 + }, + { + "epoch": 3.7624309392265194, + "grad_norm": 0.3626105785369873, + "learning_rate": 7.172744855500464e-05, + "loss": 1.7971, + "step": 12258 + }, + { + "epoch": 3.7627378759975443, + "grad_norm": 0.36327603459358215, + "learning_rate": 7.172297172182802e-05, + "loss": 1.7819, + "step": 12259 + }, + { + "epoch": 3.7630448127685696, + "grad_norm": 0.25935736298561096, + "learning_rate": 7.171849467397224e-05, + "loss": 1.8112, + "step": 12260 + }, + { + "epoch": 3.763351749539595, + "grad_norm": 0.2779700756072998, + "learning_rate": 7.171401741148156e-05, + "loss": 1.786, + "step": 12261 + }, + { + "epoch": 3.7636586863106203, + "grad_norm": 0.3089013695716858, + "learning_rate": 7.170953993440025e-05, + "loss": 1.7808, + "step": 12262 + }, + { + "epoch": 3.763965623081645, + "grad_norm": 0.2562308609485626, + "learning_rate": 7.170506224277253e-05, + "loss": 1.8207, + "step": 12263 + }, + { + "epoch": 3.7642725598526705, + "grad_norm": 0.2907634973526001, + "learning_rate": 7.170058433664268e-05, + "loss": 1.7638, + "step": 12264 + }, + { + "epoch": 3.7645794966236954, + "grad_norm": 0.30341312289237976, + "learning_rate": 7.169610621605493e-05, + "loss": 1.7827, + "step": 12265 + }, + { + "epoch": 3.7648864333947207, + "grad_norm": 0.27091866731643677, + "learning_rate": 7.169162788105353e-05, + "loss": 1.786, + "step": 12266 + }, + { + "epoch": 3.765193370165746, + "grad_norm": 0.234042689204216, + "learning_rate": 7.168714933168277e-05, + "loss": 1.7638, + "step": 12267 + }, + { + "epoch": 3.765500306936771, + "grad_norm": 0.2477465271949768, + "learning_rate": 7.168267056798686e-05, + "loss": 1.7275, + "step": 12268 + }, + { + "epoch": 3.7658072437077963, + "grad_norm": 0.25578543543815613, + "learning_rate": 7.167819159001012e-05, + "loss": 1.7831, + "step": 12269 + }, + { + 
"epoch": 3.766114180478821, + "grad_norm": 0.26629674434661865, + "learning_rate": 7.167371239779678e-05, + "loss": 1.7866, + "step": 12270 + }, + { + "epoch": 3.7664211172498465, + "grad_norm": 0.31350967288017273, + "learning_rate": 7.16692329913911e-05, + "loss": 1.7755, + "step": 12271 + }, + { + "epoch": 3.766728054020872, + "grad_norm": 0.2670116126537323, + "learning_rate": 7.166475337083735e-05, + "loss": 1.7524, + "step": 12272 + }, + { + "epoch": 3.7670349907918967, + "grad_norm": 0.26503682136535645, + "learning_rate": 7.166027353617983e-05, + "loss": 1.7867, + "step": 12273 + }, + { + "epoch": 3.767341927562922, + "grad_norm": 0.3674192428588867, + "learning_rate": 7.165579348746278e-05, + "loss": 1.7604, + "step": 12274 + }, + { + "epoch": 3.767648864333947, + "grad_norm": 0.4120824337005615, + "learning_rate": 7.16513132247305e-05, + "loss": 1.7905, + "step": 12275 + }, + { + "epoch": 3.7679558011049723, + "grad_norm": 0.29074826836586, + "learning_rate": 7.164683274802723e-05, + "loss": 1.7539, + "step": 12276 + }, + { + "epoch": 3.7682627378759976, + "grad_norm": 0.22223204374313354, + "learning_rate": 7.164235205739729e-05, + "loss": 1.755, + "step": 12277 + }, + { + "epoch": 3.768569674647023, + "grad_norm": 0.23997461795806885, + "learning_rate": 7.163787115288494e-05, + "loss": 1.8024, + "step": 12278 + }, + { + "epoch": 3.768876611418048, + "grad_norm": 0.2556418776512146, + "learning_rate": 7.163339003453445e-05, + "loss": 1.7717, + "step": 12279 + }, + { + "epoch": 3.769183548189073, + "grad_norm": 0.3107141852378845, + "learning_rate": 7.162890870239013e-05, + "loss": 1.8257, + "step": 12280 + }, + { + "epoch": 3.769490484960098, + "grad_norm": 0.35293644666671753, + "learning_rate": 7.162442715649627e-05, + "loss": 1.7855, + "step": 12281 + }, + { + "epoch": 3.7697974217311234, + "grad_norm": 0.25989311933517456, + "learning_rate": 7.161994539689713e-05, + "loss": 1.7816, + "step": 12282 + }, + { + "epoch": 3.7701043585021488, + 
"grad_norm": 0.25615137815475464, + "learning_rate": 7.161546342363701e-05, + "loss": 1.7738, + "step": 12283 + }, + { + "epoch": 3.7704112952731736, + "grad_norm": 0.29345229268074036, + "learning_rate": 7.161098123676023e-05, + "loss": 1.8496, + "step": 12284 + }, + { + "epoch": 3.770718232044199, + "grad_norm": 0.2975969612598419, + "learning_rate": 7.160649883631105e-05, + "loss": 1.7342, + "step": 12285 + }, + { + "epoch": 3.771025168815224, + "grad_norm": 0.28458064794540405, + "learning_rate": 7.16020162223338e-05, + "loss": 1.8253, + "step": 12286 + }, + { + "epoch": 3.771332105586249, + "grad_norm": 0.2798703908920288, + "learning_rate": 7.159753339487276e-05, + "loss": 1.746, + "step": 12287 + }, + { + "epoch": 3.7716390423572745, + "grad_norm": 0.380044549703598, + "learning_rate": 7.159305035397223e-05, + "loss": 1.769, + "step": 12288 + }, + { + "epoch": 3.7719459791282994, + "grad_norm": 0.28760263323783875, + "learning_rate": 7.158856709967654e-05, + "loss": 1.7466, + "step": 12289 + }, + { + "epoch": 3.7722529158993248, + "grad_norm": 0.23314130306243896, + "learning_rate": 7.158408363202996e-05, + "loss": 1.7545, + "step": 12290 + }, + { + "epoch": 3.7725598526703497, + "grad_norm": 0.2864209711551666, + "learning_rate": 7.15795999510768e-05, + "loss": 1.7549, + "step": 12291 + }, + { + "epoch": 3.772866789441375, + "grad_norm": 0.2605510354042053, + "learning_rate": 7.15751160568614e-05, + "loss": 1.7684, + "step": 12292 + }, + { + "epoch": 3.7731737262124003, + "grad_norm": 0.2475409358739853, + "learning_rate": 7.157063194942806e-05, + "loss": 1.7841, + "step": 12293 + }, + { + "epoch": 3.7734806629834257, + "grad_norm": 0.22479289770126343, + "learning_rate": 7.15661476288211e-05, + "loss": 1.7592, + "step": 12294 + }, + { + "epoch": 3.7737875997544506, + "grad_norm": 0.22076937556266785, + "learning_rate": 7.156166309508482e-05, + "loss": 1.7853, + "step": 12295 + }, + { + "epoch": 3.774094536525476, + "grad_norm": 0.26082465052604675, + 
"learning_rate": 7.155717834826353e-05, + "loss": 1.7828, + "step": 12296 + }, + { + "epoch": 3.7744014732965008, + "grad_norm": 0.24771755933761597, + "learning_rate": 7.15526933884016e-05, + "loss": 1.758, + "step": 12297 + }, + { + "epoch": 3.774708410067526, + "grad_norm": 0.23806311190128326, + "learning_rate": 7.15482082155433e-05, + "loss": 1.7237, + "step": 12298 + }, + { + "epoch": 3.7750153468385514, + "grad_norm": 0.24822844564914703, + "learning_rate": 7.154372282973299e-05, + "loss": 1.7828, + "step": 12299 + }, + { + "epoch": 3.7753222836095763, + "grad_norm": 0.24423740804195404, + "learning_rate": 7.153923723101496e-05, + "loss": 1.8014, + "step": 12300 + }, + { + "epoch": 3.7756292203806017, + "grad_norm": 0.24966634809970856, + "learning_rate": 7.15347514194336e-05, + "loss": 1.8005, + "step": 12301 + }, + { + "epoch": 3.7759361571516266, + "grad_norm": 0.2549348473548889, + "learning_rate": 7.153026539503317e-05, + "loss": 1.8473, + "step": 12302 + }, + { + "epoch": 3.776243093922652, + "grad_norm": 0.23709465563297272, + "learning_rate": 7.152577915785807e-05, + "loss": 1.8031, + "step": 12303 + }, + { + "epoch": 3.7765500306936772, + "grad_norm": 0.28554168343544006, + "learning_rate": 7.152129270795258e-05, + "loss": 1.7836, + "step": 12304 + }, + { + "epoch": 3.776856967464702, + "grad_norm": 0.2568756639957428, + "learning_rate": 7.151680604536107e-05, + "loss": 1.7345, + "step": 12305 + }, + { + "epoch": 3.7771639042357275, + "grad_norm": 0.23883797228336334, + "learning_rate": 7.151231917012787e-05, + "loss": 1.7342, + "step": 12306 + }, + { + "epoch": 3.7774708410067523, + "grad_norm": 0.24026677012443542, + "learning_rate": 7.150783208229732e-05, + "loss": 1.8156, + "step": 12307 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.25756222009658813, + "learning_rate": 7.150334478191376e-05, + "loss": 1.8204, + "step": 12308 + }, + { + "epoch": 3.778084714548803, + "grad_norm": 0.24917428195476532, + "learning_rate": 
7.149885726902156e-05, + "loss": 1.7867, + "step": 12309 + }, + { + "epoch": 3.7783916513198283, + "grad_norm": 0.26269277930259705, + "learning_rate": 7.149436954366504e-05, + "loss": 1.8233, + "step": 12310 + }, + { + "epoch": 3.7786985880908532, + "grad_norm": 0.2502293586730957, + "learning_rate": 7.148988160588857e-05, + "loss": 1.8329, + "step": 12311 + }, + { + "epoch": 3.7790055248618786, + "grad_norm": 0.24845796823501587, + "learning_rate": 7.14853934557365e-05, + "loss": 1.7936, + "step": 12312 + }, + { + "epoch": 3.7793124616329035, + "grad_norm": 0.2453537881374359, + "learning_rate": 7.148090509325315e-05, + "loss": 1.8149, + "step": 12313 + }, + { + "epoch": 3.779619398403929, + "grad_norm": 0.2336922138929367, + "learning_rate": 7.147641651848293e-05, + "loss": 1.7826, + "step": 12314 + }, + { + "epoch": 3.779926335174954, + "grad_norm": 0.25542667508125305, + "learning_rate": 7.147192773147017e-05, + "loss": 1.801, + "step": 12315 + }, + { + "epoch": 3.780233271945979, + "grad_norm": 0.2301866114139557, + "learning_rate": 7.146743873225923e-05, + "loss": 1.7302, + "step": 12316 + }, + { + "epoch": 3.7805402087170044, + "grad_norm": 0.25821468234062195, + "learning_rate": 7.14629495208945e-05, + "loss": 1.7704, + "step": 12317 + }, + { + "epoch": 3.7808471454880292, + "grad_norm": 0.22537970542907715, + "learning_rate": 7.145846009742029e-05, + "loss": 1.7281, + "step": 12318 + }, + { + "epoch": 3.7811540822590546, + "grad_norm": 0.2565869688987732, + "learning_rate": 7.145397046188102e-05, + "loss": 1.8077, + "step": 12319 + }, + { + "epoch": 3.78146101903008, + "grad_norm": 0.2588396966457367, + "learning_rate": 7.144948061432105e-05, + "loss": 1.7438, + "step": 12320 + }, + { + "epoch": 3.781767955801105, + "grad_norm": 0.2538135349750519, + "learning_rate": 7.144499055478472e-05, + "loss": 1.8253, + "step": 12321 + }, + { + "epoch": 3.78207489257213, + "grad_norm": 0.2272680401802063, + "learning_rate": 7.144050028331644e-05, + "loss": 1.7408, + 
"step": 12322 + }, + { + "epoch": 3.782381829343155, + "grad_norm": 0.25010406970977783, + "learning_rate": 7.143600979996055e-05, + "loss": 1.8219, + "step": 12323 + }, + { + "epoch": 3.7826887661141804, + "grad_norm": 0.2560291290283203, + "learning_rate": 7.143151910476144e-05, + "loss": 1.7734, + "step": 12324 + }, + { + "epoch": 3.7829957028852057, + "grad_norm": 0.24927431344985962, + "learning_rate": 7.142702819776352e-05, + "loss": 1.7682, + "step": 12325 + }, + { + "epoch": 3.783302639656231, + "grad_norm": 0.2501368224620819, + "learning_rate": 7.142253707901114e-05, + "loss": 1.818, + "step": 12326 + }, + { + "epoch": 3.783609576427256, + "grad_norm": 0.3132917284965515, + "learning_rate": 7.141804574854871e-05, + "loss": 1.7793, + "step": 12327 + }, + { + "epoch": 3.7839165131982813, + "grad_norm": 0.24229925870895386, + "learning_rate": 7.141355420642057e-05, + "loss": 1.7585, + "step": 12328 + }, + { + "epoch": 3.784223449969306, + "grad_norm": 0.22612906992435455, + "learning_rate": 7.140906245267116e-05, + "loss": 1.7374, + "step": 12329 + }, + { + "epoch": 3.7845303867403315, + "grad_norm": 0.26354333758354187, + "learning_rate": 7.140457048734482e-05, + "loss": 1.7751, + "step": 12330 + }, + { + "epoch": 3.784837323511357, + "grad_norm": 0.21500451862812042, + "learning_rate": 7.140007831048599e-05, + "loss": 1.7827, + "step": 12331 + }, + { + "epoch": 3.7851442602823817, + "grad_norm": 0.2826332151889801, + "learning_rate": 7.139558592213904e-05, + "loss": 1.7522, + "step": 12332 + }, + { + "epoch": 3.785451197053407, + "grad_norm": 0.3217725455760956, + "learning_rate": 7.139109332234837e-05, + "loss": 1.8758, + "step": 12333 + }, + { + "epoch": 3.785758133824432, + "grad_norm": 0.26934614777565, + "learning_rate": 7.138660051115837e-05, + "loss": 1.8322, + "step": 12334 + }, + { + "epoch": 3.7860650705954573, + "grad_norm": 0.2653827667236328, + "learning_rate": 7.138210748861346e-05, + "loss": 1.7651, + "step": 12335 + }, + { + "epoch": 
3.7863720073664826, + "grad_norm": 0.30470311641693115, + "learning_rate": 7.137761425475802e-05, + "loss": 1.855, + "step": 12336 + }, + { + "epoch": 3.786678944137508, + "grad_norm": 0.2558726370334625, + "learning_rate": 7.137312080963647e-05, + "loss": 1.7174, + "step": 12337 + }, + { + "epoch": 3.786985880908533, + "grad_norm": 0.24025602638721466, + "learning_rate": 7.136862715329322e-05, + "loss": 1.7565, + "step": 12338 + }, + { + "epoch": 3.787292817679558, + "grad_norm": 0.34205392003059387, + "learning_rate": 7.136413328577267e-05, + "loss": 1.8116, + "step": 12339 + }, + { + "epoch": 3.787599754450583, + "grad_norm": 0.4069152772426605, + "learning_rate": 7.135963920711923e-05, + "loss": 1.7662, + "step": 12340 + }, + { + "epoch": 3.7879066912216084, + "grad_norm": 0.3915627598762512, + "learning_rate": 7.13551449173773e-05, + "loss": 1.81, + "step": 12341 + }, + { + "epoch": 3.7882136279926337, + "grad_norm": 0.27136507630348206, + "learning_rate": 7.135065041659134e-05, + "loss": 1.7845, + "step": 12342 + }, + { + "epoch": 3.7885205647636586, + "grad_norm": 0.2924078106880188, + "learning_rate": 7.134615570480572e-05, + "loss": 1.8606, + "step": 12343 + }, + { + "epoch": 3.788827501534684, + "grad_norm": 0.35581526160240173, + "learning_rate": 7.134166078206488e-05, + "loss": 1.7785, + "step": 12344 + }, + { + "epoch": 3.789134438305709, + "grad_norm": 0.3003756105899811, + "learning_rate": 7.133716564841324e-05, + "loss": 1.7321, + "step": 12345 + }, + { + "epoch": 3.789441375076734, + "grad_norm": 0.2586000859737396, + "learning_rate": 7.133267030389524e-05, + "loss": 1.7889, + "step": 12346 + }, + { + "epoch": 3.7897483118477595, + "grad_norm": 0.28053075075149536, + "learning_rate": 7.132817474855527e-05, + "loss": 1.8216, + "step": 12347 + }, + { + "epoch": 3.7900552486187844, + "grad_norm": 0.3064870834350586, + "learning_rate": 7.132367898243777e-05, + "loss": 1.7528, + "step": 12348 + }, + { + "epoch": 3.7903621853898097, + "grad_norm": 
0.3045158386230469, + "learning_rate": 7.131918300558719e-05, + "loss": 1.8251, + "step": 12349 + }, + { + "epoch": 3.7906691221608346, + "grad_norm": 0.2438485324382782, + "learning_rate": 7.131468681804794e-05, + "loss": 1.7505, + "step": 12350 + }, + { + "epoch": 3.79097605893186, + "grad_norm": 0.24239958822727203, + "learning_rate": 7.131019041986447e-05, + "loss": 1.7544, + "step": 12351 + }, + { + "epoch": 3.7912829957028853, + "grad_norm": 0.24632441997528076, + "learning_rate": 7.130569381108121e-05, + "loss": 1.7485, + "step": 12352 + }, + { + "epoch": 3.7915899324739106, + "grad_norm": 0.22553624212741852, + "learning_rate": 7.13011969917426e-05, + "loss": 1.803, + "step": 12353 + }, + { + "epoch": 3.7918968692449355, + "grad_norm": 0.2164420485496521, + "learning_rate": 7.129669996189306e-05, + "loss": 1.7307, + "step": 12354 + }, + { + "epoch": 3.792203806015961, + "grad_norm": 0.25104281306266785, + "learning_rate": 7.129220272157705e-05, + "loss": 1.8154, + "step": 12355 + }, + { + "epoch": 3.7925107427869857, + "grad_norm": 0.25533202290534973, + "learning_rate": 7.128770527083903e-05, + "loss": 1.8046, + "step": 12356 + }, + { + "epoch": 3.792817679558011, + "grad_norm": 0.24428130686283112, + "learning_rate": 7.128320760972341e-05, + "loss": 1.7984, + "step": 12357 + }, + { + "epoch": 3.7931246163290364, + "grad_norm": 0.2366408109664917, + "learning_rate": 7.127870973827467e-05, + "loss": 1.7781, + "step": 12358 + }, + { + "epoch": 3.7934315531000613, + "grad_norm": 0.2558888792991638, + "learning_rate": 7.127421165653722e-05, + "loss": 1.7858, + "step": 12359 + }, + { + "epoch": 3.7937384898710866, + "grad_norm": 0.25825443863868713, + "learning_rate": 7.126971336455558e-05, + "loss": 1.8292, + "step": 12360 + }, + { + "epoch": 3.7940454266421115, + "grad_norm": 0.2554624080657959, + "learning_rate": 7.126521486237415e-05, + "loss": 1.822, + "step": 12361 + }, + { + "epoch": 3.794352363413137, + "grad_norm": 0.3030763268470764, + 
"learning_rate": 7.126071615003742e-05, + "loss": 1.8261, + "step": 12362 + }, + { + "epoch": 3.794659300184162, + "grad_norm": 0.3047907054424286, + "learning_rate": 7.125621722758981e-05, + "loss": 1.8419, + "step": 12363 + }, + { + "epoch": 3.794966236955187, + "grad_norm": 0.27782654762268066, + "learning_rate": 7.12517180950758e-05, + "loss": 1.7959, + "step": 12364 + }, + { + "epoch": 3.7952731737262124, + "grad_norm": 0.24526572227478027, + "learning_rate": 7.124721875253986e-05, + "loss": 1.7313, + "step": 12365 + }, + { + "epoch": 3.7955801104972373, + "grad_norm": 0.23718179762363434, + "learning_rate": 7.124271920002646e-05, + "loss": 1.7479, + "step": 12366 + }, + { + "epoch": 3.7958870472682626, + "grad_norm": 0.2880019247531891, + "learning_rate": 7.123821943758004e-05, + "loss": 1.7792, + "step": 12367 + }, + { + "epoch": 3.796193984039288, + "grad_norm": 0.28923723101615906, + "learning_rate": 7.123371946524511e-05, + "loss": 1.7474, + "step": 12368 + }, + { + "epoch": 3.7965009208103133, + "grad_norm": 0.2281525880098343, + "learning_rate": 7.122921928306612e-05, + "loss": 1.8106, + "step": 12369 + }, + { + "epoch": 3.796807857581338, + "grad_norm": 0.34825438261032104, + "learning_rate": 7.122471889108752e-05, + "loss": 1.8076, + "step": 12370 + }, + { + "epoch": 3.7971147943523635, + "grad_norm": 0.41145995259284973, + "learning_rate": 7.122021828935382e-05, + "loss": 1.7692, + "step": 12371 + }, + { + "epoch": 3.7974217311233884, + "grad_norm": 0.31711262464523315, + "learning_rate": 7.12157174779095e-05, + "loss": 1.8101, + "step": 12372 + }, + { + "epoch": 3.7977286678944138, + "grad_norm": 0.3044308125972748, + "learning_rate": 7.1211216456799e-05, + "loss": 1.8238, + "step": 12373 + }, + { + "epoch": 3.798035604665439, + "grad_norm": 0.3750055134296417, + "learning_rate": 7.120671522606683e-05, + "loss": 1.7323, + "step": 12374 + }, + { + "epoch": 3.798342541436464, + "grad_norm": 0.38852599263191223, + "learning_rate": 
7.120221378575749e-05, + "loss": 1.8402, + "step": 12375 + }, + { + "epoch": 3.7986494782074893, + "grad_norm": 0.3430371582508087, + "learning_rate": 7.119771213591541e-05, + "loss": 1.8369, + "step": 12376 + }, + { + "epoch": 3.798956414978514, + "grad_norm": 0.4787428677082062, + "learning_rate": 7.119321027658515e-05, + "loss": 1.7977, + "step": 12377 + }, + { + "epoch": 3.7992633517495396, + "grad_norm": 0.4263977110385895, + "learning_rate": 7.118870820781114e-05, + "loss": 1.8208, + "step": 12378 + }, + { + "epoch": 3.799570288520565, + "grad_norm": 0.28649669885635376, + "learning_rate": 7.118420592963793e-05, + "loss": 1.773, + "step": 12379 + }, + { + "epoch": 3.7998772252915898, + "grad_norm": 0.26070261001586914, + "learning_rate": 7.117970344210996e-05, + "loss": 1.6866, + "step": 12380 + }, + { + "epoch": 3.800184162062615, + "grad_norm": 0.30127593874931335, + "learning_rate": 7.117520074527173e-05, + "loss": 1.7208, + "step": 12381 + }, + { + "epoch": 3.80049109883364, + "grad_norm": 0.23639258742332458, + "learning_rate": 7.117069783916777e-05, + "loss": 1.7504, + "step": 12382 + }, + { + "epoch": 3.8007980356046653, + "grad_norm": 0.2852858901023865, + "learning_rate": 7.116619472384256e-05, + "loss": 1.7954, + "step": 12383 + }, + { + "epoch": 3.8011049723756907, + "grad_norm": 0.2673225998878479, + "learning_rate": 7.116169139934063e-05, + "loss": 1.7562, + "step": 12384 + }, + { + "epoch": 3.801411909146716, + "grad_norm": 0.21615394949913025, + "learning_rate": 7.115718786570644e-05, + "loss": 1.7126, + "step": 12385 + }, + { + "epoch": 3.801718845917741, + "grad_norm": 0.2165435254573822, + "learning_rate": 7.115268412298453e-05, + "loss": 1.7171, + "step": 12386 + }, + { + "epoch": 3.8020257826887662, + "grad_norm": 0.280564546585083, + "learning_rate": 7.114818017121939e-05, + "loss": 1.7711, + "step": 12387 + }, + { + "epoch": 3.802332719459791, + "grad_norm": 0.3023521304130554, + "learning_rate": 7.114367601045555e-05, + "loss": 1.7538, 
+ "step": 12388 + }, + { + "epoch": 3.8026396562308165, + "grad_norm": 0.27252480387687683, + "learning_rate": 7.11391716407375e-05, + "loss": 1.7604, + "step": 12389 + }, + { + "epoch": 3.802946593001842, + "grad_norm": 0.2122909128665924, + "learning_rate": 7.113466706210976e-05, + "loss": 1.716, + "step": 12390 + }, + { + "epoch": 3.8032535297728667, + "grad_norm": 0.30141574144363403, + "learning_rate": 7.113016227461686e-05, + "loss": 1.7636, + "step": 12391 + }, + { + "epoch": 3.803560466543892, + "grad_norm": 0.33359697461128235, + "learning_rate": 7.112565727830331e-05, + "loss": 1.7805, + "step": 12392 + }, + { + "epoch": 3.803867403314917, + "grad_norm": 0.3161376714706421, + "learning_rate": 7.112115207321364e-05, + "loss": 1.7974, + "step": 12393 + }, + { + "epoch": 3.8041743400859422, + "grad_norm": 0.29028698801994324, + "learning_rate": 7.111664665939235e-05, + "loss": 1.83, + "step": 12394 + }, + { + "epoch": 3.8044812768569676, + "grad_norm": 0.38829556107521057, + "learning_rate": 7.1112141036884e-05, + "loss": 1.8684, + "step": 12395 + }, + { + "epoch": 3.804788213627993, + "grad_norm": 0.4118283987045288, + "learning_rate": 7.110763520573309e-05, + "loss": 1.7812, + "step": 12396 + }, + { + "epoch": 3.805095150399018, + "grad_norm": 0.3907717168331146, + "learning_rate": 7.110312916598416e-05, + "loss": 1.7789, + "step": 12397 + }, + { + "epoch": 3.805402087170043, + "grad_norm": 0.2768644690513611, + "learning_rate": 7.109862291768173e-05, + "loss": 1.8575, + "step": 12398 + }, + { + "epoch": 3.805709023941068, + "grad_norm": 0.3234006464481354, + "learning_rate": 7.109411646087035e-05, + "loss": 1.7485, + "step": 12399 + }, + { + "epoch": 3.8060159607120934, + "grad_norm": 0.415475994348526, + "learning_rate": 7.108960979559454e-05, + "loss": 1.7363, + "step": 12400 + }, + { + "epoch": 3.8063228974831187, + "grad_norm": 0.38654613494873047, + "learning_rate": 7.108510292189884e-05, + "loss": 1.7907, + "step": 12401 + }, + { + "epoch": 
3.8066298342541436, + "grad_norm": 0.2541481852531433, + "learning_rate": 7.10805958398278e-05, + "loss": 1.8458, + "step": 12402 + }, + { + "epoch": 3.806936771025169, + "grad_norm": 0.32562851905822754, + "learning_rate": 7.107608854942597e-05, + "loss": 1.7989, + "step": 12403 + }, + { + "epoch": 3.807243707796194, + "grad_norm": 0.3628395199775696, + "learning_rate": 7.107158105073786e-05, + "loss": 1.8044, + "step": 12404 + }, + { + "epoch": 3.807550644567219, + "grad_norm": 0.3363969027996063, + "learning_rate": 7.106707334380805e-05, + "loss": 1.8078, + "step": 12405 + }, + { + "epoch": 3.8078575813382445, + "grad_norm": 0.2853989601135254, + "learning_rate": 7.106256542868108e-05, + "loss": 1.7913, + "step": 12406 + }, + { + "epoch": 3.8081645181092694, + "grad_norm": 0.33455806970596313, + "learning_rate": 7.105805730540148e-05, + "loss": 1.7252, + "step": 12407 + }, + { + "epoch": 3.8084714548802947, + "grad_norm": 0.28103405237197876, + "learning_rate": 7.105354897401382e-05, + "loss": 1.6942, + "step": 12408 + }, + { + "epoch": 3.8087783916513196, + "grad_norm": 0.23230718076229095, + "learning_rate": 7.104904043456264e-05, + "loss": 1.7723, + "step": 12409 + }, + { + "epoch": 3.809085328422345, + "grad_norm": 0.2883053421974182, + "learning_rate": 7.104453168709251e-05, + "loss": 1.8015, + "step": 12410 + }, + { + "epoch": 3.8093922651933703, + "grad_norm": 0.28462252020835876, + "learning_rate": 7.104002273164798e-05, + "loss": 1.791, + "step": 12411 + }, + { + "epoch": 3.8096992019643956, + "grad_norm": 0.3004699647426605, + "learning_rate": 7.103551356827363e-05, + "loss": 1.8401, + "step": 12412 + }, + { + "epoch": 3.8100061387354205, + "grad_norm": 0.2546156048774719, + "learning_rate": 7.1031004197014e-05, + "loss": 1.7645, + "step": 12413 + }, + { + "epoch": 3.810313075506446, + "grad_norm": 0.24532915651798248, + "learning_rate": 7.102649461791364e-05, + "loss": 1.8, + "step": 12414 + }, + { + "epoch": 3.8106200122774707, + "grad_norm": 
0.2432405799627304, + "learning_rate": 7.102198483101716e-05, + "loss": 1.7957, + "step": 12415 + }, + { + "epoch": 3.810926949048496, + "grad_norm": 0.24405215680599213, + "learning_rate": 7.101747483636908e-05, + "loss": 1.79, + "step": 12416 + }, + { + "epoch": 3.8112338858195214, + "grad_norm": 0.29519838094711304, + "learning_rate": 7.101296463401401e-05, + "loss": 1.8087, + "step": 12417 + }, + { + "epoch": 3.8115408225905463, + "grad_norm": 0.28205612301826477, + "learning_rate": 7.100845422399652e-05, + "loss": 1.7897, + "step": 12418 + }, + { + "epoch": 3.8118477593615716, + "grad_norm": 0.25014567375183105, + "learning_rate": 7.100394360636115e-05, + "loss": 1.7574, + "step": 12419 + }, + { + "epoch": 3.8121546961325965, + "grad_norm": 0.3133499026298523, + "learning_rate": 7.099943278115251e-05, + "loss": 1.7957, + "step": 12420 + }, + { + "epoch": 3.812461632903622, + "grad_norm": 0.3706473708152771, + "learning_rate": 7.099492174841516e-05, + "loss": 1.8519, + "step": 12421 + }, + { + "epoch": 3.812768569674647, + "grad_norm": 0.30085715651512146, + "learning_rate": 7.09904105081937e-05, + "loss": 1.778, + "step": 12422 + }, + { + "epoch": 3.813075506445672, + "grad_norm": 0.23897981643676758, + "learning_rate": 7.09858990605327e-05, + "loss": 1.7289, + "step": 12423 + }, + { + "epoch": 3.8133824432166974, + "grad_norm": 0.30046290159225464, + "learning_rate": 7.098138740547673e-05, + "loss": 1.8838, + "step": 12424 + }, + { + "epoch": 3.8136893799877223, + "grad_norm": 0.32126328349113464, + "learning_rate": 7.097687554307041e-05, + "loss": 1.7916, + "step": 12425 + }, + { + "epoch": 3.8139963167587476, + "grad_norm": 0.2922256886959076, + "learning_rate": 7.097236347335829e-05, + "loss": 1.8305, + "step": 12426 + }, + { + "epoch": 3.814303253529773, + "grad_norm": 0.2772706151008606, + "learning_rate": 7.0967851196385e-05, + "loss": 1.7694, + "step": 12427 + }, + { + "epoch": 3.8146101903007983, + "grad_norm": 0.25763455033302307, + "learning_rate": 
7.096333871219511e-05, + "loss": 1.8716, + "step": 12428 + }, + { + "epoch": 3.814917127071823, + "grad_norm": 0.2631739377975464, + "learning_rate": 7.095882602083322e-05, + "loss": 1.7771, + "step": 12429 + }, + { + "epoch": 3.8152240638428485, + "grad_norm": 0.29229632019996643, + "learning_rate": 7.095431312234392e-05, + "loss": 1.7865, + "step": 12430 + }, + { + "epoch": 3.8155310006138734, + "grad_norm": 0.2672729790210724, + "learning_rate": 7.094980001677181e-05, + "loss": 1.7848, + "step": 12431 + }, + { + "epoch": 3.8158379373848987, + "grad_norm": 0.2388373166322708, + "learning_rate": 7.094528670416152e-05, + "loss": 1.75, + "step": 12432 + }, + { + "epoch": 3.816144874155924, + "grad_norm": 0.2385305017232895, + "learning_rate": 7.094077318455762e-05, + "loss": 1.748, + "step": 12433 + }, + { + "epoch": 3.816451810926949, + "grad_norm": 0.25421401858329773, + "learning_rate": 7.093625945800471e-05, + "loss": 1.779, + "step": 12434 + }, + { + "epoch": 3.8167587476979743, + "grad_norm": 0.2785158157348633, + "learning_rate": 7.093174552454743e-05, + "loss": 1.8295, + "step": 12435 + }, + { + "epoch": 3.817065684468999, + "grad_norm": 0.2907472252845764, + "learning_rate": 7.092723138423036e-05, + "loss": 1.8216, + "step": 12436 + }, + { + "epoch": 3.8173726212400245, + "grad_norm": 0.253955215215683, + "learning_rate": 7.092271703709814e-05, + "loss": 1.8394, + "step": 12437 + }, + { + "epoch": 3.81767955801105, + "grad_norm": 0.32139912247657776, + "learning_rate": 7.091820248319537e-05, + "loss": 1.8634, + "step": 12438 + }, + { + "epoch": 3.8179864947820747, + "grad_norm": 0.25890466570854187, + "learning_rate": 7.091368772256664e-05, + "loss": 1.7336, + "step": 12439 + }, + { + "epoch": 3.8182934315531, + "grad_norm": 0.2823775112628937, + "learning_rate": 7.090917275525661e-05, + "loss": 1.7927, + "step": 12440 + }, + { + "epoch": 3.818600368324125, + "grad_norm": 0.28739333152770996, + "learning_rate": 7.090465758130988e-05, + "loss": 1.7807, + 
"step": 12441 + }, + { + "epoch": 3.8189073050951503, + "grad_norm": 0.36823949217796326, + "learning_rate": 7.090014220077106e-05, + "loss": 1.7288, + "step": 12442 + }, + { + "epoch": 3.8192142418661756, + "grad_norm": 0.3061312735080719, + "learning_rate": 7.089562661368479e-05, + "loss": 1.8039, + "step": 12443 + }, + { + "epoch": 3.819521178637201, + "grad_norm": 0.25867924094200134, + "learning_rate": 7.089111082009569e-05, + "loss": 1.7678, + "step": 12444 + }, + { + "epoch": 3.819828115408226, + "grad_norm": 0.26834985613822937, + "learning_rate": 7.088659482004837e-05, + "loss": 1.7592, + "step": 12445 + }, + { + "epoch": 3.820135052179251, + "grad_norm": 0.25608211755752563, + "learning_rate": 7.08820786135875e-05, + "loss": 1.7622, + "step": 12446 + }, + { + "epoch": 3.820441988950276, + "grad_norm": 0.2512456774711609, + "learning_rate": 7.087756220075769e-05, + "loss": 1.7648, + "step": 12447 + }, + { + "epoch": 3.8207489257213014, + "grad_norm": 0.2434878647327423, + "learning_rate": 7.087304558160355e-05, + "loss": 1.7435, + "step": 12448 + }, + { + "epoch": 3.8210558624923268, + "grad_norm": 0.26456570625305176, + "learning_rate": 7.086852875616978e-05, + "loss": 1.7342, + "step": 12449 + }, + { + "epoch": 3.8213627992633517, + "grad_norm": 0.2958984971046448, + "learning_rate": 7.086401172450095e-05, + "loss": 1.8532, + "step": 12450 + }, + { + "epoch": 3.821669736034377, + "grad_norm": 0.25939157605171204, + "learning_rate": 7.085949448664172e-05, + "loss": 1.7746, + "step": 12451 + }, + { + "epoch": 3.821976672805402, + "grad_norm": 0.2210223525762558, + "learning_rate": 7.085497704263675e-05, + "loss": 1.7745, + "step": 12452 + }, + { + "epoch": 3.822283609576427, + "grad_norm": 0.2409319430589676, + "learning_rate": 7.085045939253068e-05, + "loss": 1.7981, + "step": 12453 + }, + { + "epoch": 3.8225905463474525, + "grad_norm": 0.26331812143325806, + "learning_rate": 7.084594153636815e-05, + "loss": 1.8163, + "step": 12454 + }, + { + "epoch": 
3.8228974831184774, + "grad_norm": 0.2613828480243683, + "learning_rate": 7.08414234741938e-05, + "loss": 1.8362, + "step": 12455 + }, + { + "epoch": 3.8232044198895028, + "grad_norm": 0.3139529228210449, + "learning_rate": 7.083690520605228e-05, + "loss": 1.8247, + "step": 12456 + }, + { + "epoch": 3.8235113566605277, + "grad_norm": 0.2958570718765259, + "learning_rate": 7.083238673198826e-05, + "loss": 1.8011, + "step": 12457 + }, + { + "epoch": 3.823818293431553, + "grad_norm": 0.2517626881599426, + "learning_rate": 7.082786805204639e-05, + "loss": 1.7353, + "step": 12458 + }, + { + "epoch": 3.8241252302025783, + "grad_norm": 0.2443888783454895, + "learning_rate": 7.082334916627132e-05, + "loss": 1.7916, + "step": 12459 + }, + { + "epoch": 3.8244321669736037, + "grad_norm": 0.283514142036438, + "learning_rate": 7.08188300747077e-05, + "loss": 1.8048, + "step": 12460 + }, + { + "epoch": 3.8247391037446286, + "grad_norm": 0.24775351583957672, + "learning_rate": 7.08143107774002e-05, + "loss": 1.8145, + "step": 12461 + }, + { + "epoch": 3.825046040515654, + "grad_norm": 0.27904003858566284, + "learning_rate": 7.080979127439347e-05, + "loss": 1.8003, + "step": 12462 + }, + { + "epoch": 3.825352977286679, + "grad_norm": 0.24997512996196747, + "learning_rate": 7.08052715657322e-05, + "loss": 1.7962, + "step": 12463 + }, + { + "epoch": 3.825659914057704, + "grad_norm": 0.25874343514442444, + "learning_rate": 7.080075165146104e-05, + "loss": 1.7861, + "step": 12464 + }, + { + "epoch": 3.8259668508287294, + "grad_norm": 0.2964434027671814, + "learning_rate": 7.079623153162467e-05, + "loss": 1.7618, + "step": 12465 + }, + { + "epoch": 3.8262737875997543, + "grad_norm": 0.26403337717056274, + "learning_rate": 7.079171120626774e-05, + "loss": 1.8016, + "step": 12466 + }, + { + "epoch": 3.8265807243707797, + "grad_norm": 0.28369295597076416, + "learning_rate": 7.078719067543494e-05, + "loss": 1.7517, + "step": 12467 + }, + { + "epoch": 3.8268876611418046, + "grad_norm": 
0.254312127828598, + "learning_rate": 7.078266993917093e-05, + "loss": 1.8085, + "step": 12468 + }, + { + "epoch": 3.82719459791283, + "grad_norm": 0.24992622435092926, + "learning_rate": 7.077814899752038e-05, + "loss": 1.7657, + "step": 12469 + }, + { + "epoch": 3.8275015346838552, + "grad_norm": 0.26485762000083923, + "learning_rate": 7.077362785052802e-05, + "loss": 1.7303, + "step": 12470 + }, + { + "epoch": 3.8278084714548806, + "grad_norm": 0.29864901304244995, + "learning_rate": 7.076910649823846e-05, + "loss": 1.7734, + "step": 12471 + }, + { + "epoch": 3.8281154082259055, + "grad_norm": 0.2973599433898926, + "learning_rate": 7.076458494069644e-05, + "loss": 1.8055, + "step": 12472 + }, + { + "epoch": 3.828422344996931, + "grad_norm": 0.2150362730026245, + "learning_rate": 7.07600631779466e-05, + "loss": 1.7377, + "step": 12473 + }, + { + "epoch": 3.8287292817679557, + "grad_norm": 0.26443010568618774, + "learning_rate": 7.075554121003367e-05, + "loss": 1.837, + "step": 12474 + }, + { + "epoch": 3.829036218538981, + "grad_norm": 0.27365007996559143, + "learning_rate": 7.075101903700231e-05, + "loss": 1.7784, + "step": 12475 + }, + { + "epoch": 3.8293431553100064, + "grad_norm": 0.22037263214588165, + "learning_rate": 7.074649665889721e-05, + "loss": 1.8182, + "step": 12476 + }, + { + "epoch": 3.8296500920810312, + "grad_norm": 0.29614946246147156, + "learning_rate": 7.074197407576308e-05, + "loss": 1.7993, + "step": 12477 + }, + { + "epoch": 3.8299570288520566, + "grad_norm": 0.25135520100593567, + "learning_rate": 7.07374512876446e-05, + "loss": 1.8211, + "step": 12478 + }, + { + "epoch": 3.8302639656230815, + "grad_norm": 0.2711503207683563, + "learning_rate": 7.073292829458645e-05, + "loss": 1.8274, + "step": 12479 + }, + { + "epoch": 3.830570902394107, + "grad_norm": 0.38659265637397766, + "learning_rate": 7.072840509663338e-05, + "loss": 1.796, + "step": 12480 + }, + { + "epoch": 3.830877839165132, + "grad_norm": 0.39382728934288025, + 
"learning_rate": 7.072388169383005e-05, + "loss": 1.8439, + "step": 12481 + }, + { + "epoch": 3.831184775936157, + "grad_norm": 0.27570033073425293, + "learning_rate": 7.071935808622118e-05, + "loss": 1.8155, + "step": 12482 + }, + { + "epoch": 3.8314917127071824, + "grad_norm": 0.29054465889930725, + "learning_rate": 7.071483427385147e-05, + "loss": 1.754, + "step": 12483 + }, + { + "epoch": 3.8317986494782073, + "grad_norm": 0.4138031303882599, + "learning_rate": 7.071031025676562e-05, + "loss": 1.7686, + "step": 12484 + }, + { + "epoch": 3.8321055862492326, + "grad_norm": 0.3447251617908478, + "learning_rate": 7.070578603500833e-05, + "loss": 1.8135, + "step": 12485 + }, + { + "epoch": 3.832412523020258, + "grad_norm": 0.265115886926651, + "learning_rate": 7.070126160862436e-05, + "loss": 1.803, + "step": 12486 + }, + { + "epoch": 3.8327194597912833, + "grad_norm": 0.4288817346096039, + "learning_rate": 7.069673697765837e-05, + "loss": 1.7814, + "step": 12487 + }, + { + "epoch": 3.833026396562308, + "grad_norm": 0.4890103340148926, + "learning_rate": 7.06922121421551e-05, + "loss": 1.8318, + "step": 12488 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.3676142990589142, + "learning_rate": 7.068768710215928e-05, + "loss": 1.7792, + "step": 12489 + }, + { + "epoch": 3.8336402701043584, + "grad_norm": 0.23254090547561646, + "learning_rate": 7.068316185771557e-05, + "loss": 1.7154, + "step": 12490 + }, + { + "epoch": 3.8339472068753837, + "grad_norm": 0.35014036297798157, + "learning_rate": 7.067863640886876e-05, + "loss": 1.7031, + "step": 12491 + }, + { + "epoch": 3.834254143646409, + "grad_norm": 0.32155317068099976, + "learning_rate": 7.067411075566353e-05, + "loss": 1.7692, + "step": 12492 + }, + { + "epoch": 3.834561080417434, + "grad_norm": 0.260772705078125, + "learning_rate": 7.066958489814463e-05, + "loss": 1.7488, + "step": 12493 + }, + { + "epoch": 3.8348680171884593, + "grad_norm": 0.2624910771846771, + "learning_rate": 7.066505883635678e-05, + 
"loss": 1.7436, + "step": 12494 + }, + { + "epoch": 3.835174953959484, + "grad_norm": 0.2782299220561981, + "learning_rate": 7.066053257034471e-05, + "loss": 1.8219, + "step": 12495 + }, + { + "epoch": 3.8354818907305095, + "grad_norm": 0.2749497890472412, + "learning_rate": 7.065600610015312e-05, + "loss": 1.8068, + "step": 12496 + }, + { + "epoch": 3.835788827501535, + "grad_norm": 0.2730359733104706, + "learning_rate": 7.06514794258268e-05, + "loss": 1.7588, + "step": 12497 + }, + { + "epoch": 3.8360957642725597, + "grad_norm": 0.3606291711330414, + "learning_rate": 7.064695254741044e-05, + "loss": 1.8509, + "step": 12498 + }, + { + "epoch": 3.836402701043585, + "grad_norm": 0.23282989859580994, + "learning_rate": 7.064242546494879e-05, + "loss": 1.7444, + "step": 12499 + }, + { + "epoch": 3.83670963781461, + "grad_norm": 0.2554507255554199, + "learning_rate": 7.06378981784866e-05, + "loss": 1.7486, + "step": 12500 + }, + { + "epoch": 3.8370165745856353, + "grad_norm": 0.2916143834590912, + "learning_rate": 7.06333706880686e-05, + "loss": 1.8035, + "step": 12501 + }, + { + "epoch": 3.8373235113566606, + "grad_norm": 0.23719090223312378, + "learning_rate": 7.062884299373955e-05, + "loss": 1.7896, + "step": 12502 + }, + { + "epoch": 3.837630448127686, + "grad_norm": 0.2596152126789093, + "learning_rate": 7.062431509554417e-05, + "loss": 1.7944, + "step": 12503 + }, + { + "epoch": 3.837937384898711, + "grad_norm": 0.29140764474868774, + "learning_rate": 7.061978699352723e-05, + "loss": 1.7988, + "step": 12504 + }, + { + "epoch": 3.838244321669736, + "grad_norm": 0.3421068489551544, + "learning_rate": 7.061525868773347e-05, + "loss": 1.751, + "step": 12505 + }, + { + "epoch": 3.838551258440761, + "grad_norm": 0.2705349624156952, + "learning_rate": 7.061073017820764e-05, + "loss": 1.7578, + "step": 12506 + }, + { + "epoch": 3.8388581952117864, + "grad_norm": 0.2403286248445511, + "learning_rate": 7.060620146499448e-05, + "loss": 1.8422, + "step": 12507 + }, + { + 
"epoch": 3.8391651319828117, + "grad_norm": 0.3860442042350769, + "learning_rate": 7.060167254813876e-05, + "loss": 1.8168, + "step": 12508 + }, + { + "epoch": 3.8394720687538366, + "grad_norm": 0.4729512631893158, + "learning_rate": 7.059714342768526e-05, + "loss": 1.7786, + "step": 12509 + }, + { + "epoch": 3.839779005524862, + "grad_norm": 0.3522968888282776, + "learning_rate": 7.059261410367871e-05, + "loss": 1.8749, + "step": 12510 + }, + { + "epoch": 3.840085942295887, + "grad_norm": 0.28071436285972595, + "learning_rate": 7.058808457616386e-05, + "loss": 1.7959, + "step": 12511 + }, + { + "epoch": 3.840392879066912, + "grad_norm": 0.4356439411640167, + "learning_rate": 7.05835548451855e-05, + "loss": 1.8045, + "step": 12512 + }, + { + "epoch": 3.8406998158379375, + "grad_norm": 0.4051562249660492, + "learning_rate": 7.057902491078839e-05, + "loss": 1.7909, + "step": 12513 + }, + { + "epoch": 3.8410067526089624, + "grad_norm": 0.2817205488681793, + "learning_rate": 7.057449477301728e-05, + "loss": 1.8736, + "step": 12514 + }, + { + "epoch": 3.8413136893799877, + "grad_norm": 0.33369559049606323, + "learning_rate": 7.056996443191697e-05, + "loss": 1.7799, + "step": 12515 + }, + { + "epoch": 3.8416206261510126, + "grad_norm": 0.369954913854599, + "learning_rate": 7.056543388753221e-05, + "loss": 1.795, + "step": 12516 + }, + { + "epoch": 3.841927562922038, + "grad_norm": 0.289474755525589, + "learning_rate": 7.056090313990778e-05, + "loss": 1.786, + "step": 12517 + }, + { + "epoch": 3.8422344996930633, + "grad_norm": 0.2431849092245102, + "learning_rate": 7.055637218908845e-05, + "loss": 1.7363, + "step": 12518 + }, + { + "epoch": 3.8425414364640886, + "grad_norm": 0.3736060857772827, + "learning_rate": 7.0551841035119e-05, + "loss": 1.8234, + "step": 12519 + }, + { + "epoch": 3.8428483732351135, + "grad_norm": 0.34008854627609253, + "learning_rate": 7.054730967804422e-05, + "loss": 1.8001, + "step": 12520 + }, + { + "epoch": 3.843155310006139, + "grad_norm": 
0.24852876365184784, + "learning_rate": 7.054277811790887e-05, + "loss": 1.8298, + "step": 12521 + }, + { + "epoch": 3.8434622467771637, + "grad_norm": 0.3491046726703644, + "learning_rate": 7.053824635475777e-05, + "loss": 1.7336, + "step": 12522 + }, + { + "epoch": 3.843769183548189, + "grad_norm": 0.38757824897766113, + "learning_rate": 7.053371438863566e-05, + "loss": 1.8241, + "step": 12523 + }, + { + "epoch": 3.8440761203192144, + "grad_norm": 0.2607647180557251, + "learning_rate": 7.052918221958735e-05, + "loss": 1.7813, + "step": 12524 + }, + { + "epoch": 3.8443830570902393, + "grad_norm": 0.25634410977363586, + "learning_rate": 7.052464984765764e-05, + "loss": 1.7836, + "step": 12525 + }, + { + "epoch": 3.8446899938612646, + "grad_norm": 0.3113503158092499, + "learning_rate": 7.052011727289129e-05, + "loss": 1.8477, + "step": 12526 + }, + { + "epoch": 3.8449969306322895, + "grad_norm": 0.2852596044540405, + "learning_rate": 7.051558449533313e-05, + "loss": 1.7607, + "step": 12527 + }, + { + "epoch": 3.845303867403315, + "grad_norm": 0.24841541051864624, + "learning_rate": 7.051105151502795e-05, + "loss": 1.8109, + "step": 12528 + }, + { + "epoch": 3.84561080417434, + "grad_norm": 0.2231549620628357, + "learning_rate": 7.050651833202053e-05, + "loss": 1.7245, + "step": 12529 + }, + { + "epoch": 3.845917740945365, + "grad_norm": 0.21975892782211304, + "learning_rate": 7.050198494635566e-05, + "loss": 1.7512, + "step": 12530 + }, + { + "epoch": 3.8462246777163904, + "grad_norm": 0.2546280324459076, + "learning_rate": 7.049745135807816e-05, + "loss": 1.8003, + "step": 12531 + }, + { + "epoch": 3.8465316144874153, + "grad_norm": 0.21507929265499115, + "learning_rate": 7.049291756723284e-05, + "loss": 1.7616, + "step": 12532 + }, + { + "epoch": 3.8468385512584407, + "grad_norm": 0.24927987158298492, + "learning_rate": 7.04883835738645e-05, + "loss": 1.7519, + "step": 12533 + }, + { + "epoch": 3.847145488029466, + "grad_norm": 0.24988602101802826, + 
"learning_rate": 7.048384937801793e-05, + "loss": 1.7966, + "step": 12534 + }, + { + "epoch": 3.8474524248004913, + "grad_norm": 0.24039845168590546, + "learning_rate": 7.047931497973798e-05, + "loss": 1.7834, + "step": 12535 + }, + { + "epoch": 3.847759361571516, + "grad_norm": 0.22826696932315826, + "learning_rate": 7.047478037906943e-05, + "loss": 1.7334, + "step": 12536 + }, + { + "epoch": 3.8480662983425415, + "grad_norm": 0.22260744869709015, + "learning_rate": 7.047024557605708e-05, + "loss": 1.787, + "step": 12537 + }, + { + "epoch": 3.8483732351135664, + "grad_norm": 0.2457917332649231, + "learning_rate": 7.046571057074578e-05, + "loss": 1.7865, + "step": 12538 + }, + { + "epoch": 3.8486801718845918, + "grad_norm": 0.23952928185462952, + "learning_rate": 7.046117536318035e-05, + "loss": 1.7764, + "step": 12539 + }, + { + "epoch": 3.848987108655617, + "grad_norm": 0.22186748683452606, + "learning_rate": 7.045663995340557e-05, + "loss": 1.7917, + "step": 12540 + }, + { + "epoch": 3.849294045426642, + "grad_norm": 0.24234962463378906, + "learning_rate": 7.045210434146629e-05, + "loss": 1.7697, + "step": 12541 + }, + { + "epoch": 3.8496009821976673, + "grad_norm": 0.2510770857334137, + "learning_rate": 7.044756852740732e-05, + "loss": 1.8012, + "step": 12542 + }, + { + "epoch": 3.849907918968692, + "grad_norm": 0.24910703301429749, + "learning_rate": 7.044303251127349e-05, + "loss": 1.831, + "step": 12543 + }, + { + "epoch": 3.8502148557397176, + "grad_norm": 0.3159966468811035, + "learning_rate": 7.043849629310964e-05, + "loss": 1.8029, + "step": 12544 + }, + { + "epoch": 3.850521792510743, + "grad_norm": 0.3155403733253479, + "learning_rate": 7.04339598729606e-05, + "loss": 1.7429, + "step": 12545 + }, + { + "epoch": 3.8508287292817682, + "grad_norm": 0.3037515878677368, + "learning_rate": 7.042942325087117e-05, + "loss": 1.8186, + "step": 12546 + }, + { + "epoch": 3.851135666052793, + "grad_norm": 0.2319766730070114, + "learning_rate": 
7.042488642688621e-05, + "loss": 1.7853, + "step": 12547 + }, + { + "epoch": 3.8514426028238185, + "grad_norm": 0.23911969363689423, + "learning_rate": 7.042034940105055e-05, + "loss": 1.8314, + "step": 12548 + }, + { + "epoch": 3.8517495395948433, + "grad_norm": 0.2541846036911011, + "learning_rate": 7.041581217340905e-05, + "loss": 1.8289, + "step": 12549 + }, + { + "epoch": 3.8520564763658687, + "grad_norm": 0.22234943509101868, + "learning_rate": 7.04112747440065e-05, + "loss": 1.7847, + "step": 12550 + }, + { + "epoch": 3.852363413136894, + "grad_norm": 0.2747870981693268, + "learning_rate": 7.04067371128878e-05, + "loss": 1.7875, + "step": 12551 + }, + { + "epoch": 3.852670349907919, + "grad_norm": 0.28589147329330444, + "learning_rate": 7.040219928009775e-05, + "loss": 1.7289, + "step": 12552 + }, + { + "epoch": 3.8529772866789442, + "grad_norm": 0.21180351078510284, + "learning_rate": 7.039766124568119e-05, + "loss": 1.7611, + "step": 12553 + }, + { + "epoch": 3.853284223449969, + "grad_norm": 0.27751782536506653, + "learning_rate": 7.0393123009683e-05, + "loss": 1.7481, + "step": 12554 + }, + { + "epoch": 3.8535911602209945, + "grad_norm": 0.32883307337760925, + "learning_rate": 7.038858457214802e-05, + "loss": 1.7271, + "step": 12555 + }, + { + "epoch": 3.85389809699202, + "grad_norm": 0.30965641140937805, + "learning_rate": 7.03840459331211e-05, + "loss": 1.81, + "step": 12556 + }, + { + "epoch": 3.8542050337630447, + "grad_norm": 0.25184348225593567, + "learning_rate": 7.037950709264709e-05, + "loss": 1.7642, + "step": 12557 + }, + { + "epoch": 3.85451197053407, + "grad_norm": 0.2376822829246521, + "learning_rate": 7.037496805077084e-05, + "loss": 1.7774, + "step": 12558 + }, + { + "epoch": 3.854818907305095, + "grad_norm": 0.2395993024110794, + "learning_rate": 7.03704288075372e-05, + "loss": 1.8397, + "step": 12559 + }, + { + "epoch": 3.8551258440761202, + "grad_norm": 0.26460394263267517, + "learning_rate": 7.036588936299107e-05, + "loss": 1.7472, + 
"step": 12560 + }, + { + "epoch": 3.8554327808471456, + "grad_norm": 0.34742459654808044, + "learning_rate": 7.036134971717725e-05, + "loss": 1.8003, + "step": 12561 + }, + { + "epoch": 3.855739717618171, + "grad_norm": 0.2829316556453705, + "learning_rate": 7.035680987014068e-05, + "loss": 1.7765, + "step": 12562 + }, + { + "epoch": 3.856046654389196, + "grad_norm": 0.3087223172187805, + "learning_rate": 7.035226982192615e-05, + "loss": 1.8462, + "step": 12563 + }, + { + "epoch": 3.856353591160221, + "grad_norm": 0.2806380093097687, + "learning_rate": 7.034772957257858e-05, + "loss": 1.7704, + "step": 12564 + }, + { + "epoch": 3.856660527931246, + "grad_norm": 0.25598087906837463, + "learning_rate": 7.03431891221428e-05, + "loss": 1.7843, + "step": 12565 + }, + { + "epoch": 3.8569674647022714, + "grad_norm": 0.30833700299263, + "learning_rate": 7.033864847066373e-05, + "loss": 1.8404, + "step": 12566 + }, + { + "epoch": 3.8572744014732967, + "grad_norm": 0.29562532901763916, + "learning_rate": 7.03341076181862e-05, + "loss": 1.8044, + "step": 12567 + }, + { + "epoch": 3.8575813382443216, + "grad_norm": 0.2901719808578491, + "learning_rate": 7.03295665647551e-05, + "loss": 1.7789, + "step": 12568 + }, + { + "epoch": 3.857888275015347, + "grad_norm": 0.25453686714172363, + "learning_rate": 7.03250253104153e-05, + "loss": 1.6792, + "step": 12569 + }, + { + "epoch": 3.858195211786372, + "grad_norm": 0.26009416580200195, + "learning_rate": 7.03204838552117e-05, + "loss": 1.7835, + "step": 12570 + }, + { + "epoch": 3.858502148557397, + "grad_norm": 0.28074127435684204, + "learning_rate": 7.031594219918916e-05, + "loss": 1.7932, + "step": 12571 + }, + { + "epoch": 3.8588090853284225, + "grad_norm": 0.3341725170612335, + "learning_rate": 7.031140034239258e-05, + "loss": 1.7439, + "step": 12572 + }, + { + "epoch": 3.8591160220994474, + "grad_norm": 0.28142449259757996, + "learning_rate": 7.030685828486684e-05, + "loss": 1.8263, + "step": 12573 + }, + { + "epoch": 
3.8594229588704727, + "grad_norm": 0.2571438252925873, + "learning_rate": 7.030231602665681e-05, + "loss": 1.7628, + "step": 12574 + }, + { + "epoch": 3.8597298956414976, + "grad_norm": 0.3079041838645935, + "learning_rate": 7.029777356780741e-05, + "loss": 1.7879, + "step": 12575 + }, + { + "epoch": 3.860036832412523, + "grad_norm": 0.2605433464050293, + "learning_rate": 7.029323090836349e-05, + "loss": 1.7841, + "step": 12576 + }, + { + "epoch": 3.8603437691835483, + "grad_norm": 0.24069640040397644, + "learning_rate": 7.028868804836999e-05, + "loss": 1.7939, + "step": 12577 + }, + { + "epoch": 3.8606507059545736, + "grad_norm": 0.26801639795303345, + "learning_rate": 7.028414498787177e-05, + "loss": 1.8082, + "step": 12578 + }, + { + "epoch": 3.8609576427255985, + "grad_norm": 0.28828585147857666, + "learning_rate": 7.027960172691375e-05, + "loss": 1.8094, + "step": 12579 + }, + { + "epoch": 3.861264579496624, + "grad_norm": 0.22927051782608032, + "learning_rate": 7.027505826554082e-05, + "loss": 1.7758, + "step": 12580 + }, + { + "epoch": 3.8615715162676487, + "grad_norm": 0.25755998492240906, + "learning_rate": 7.027051460379788e-05, + "loss": 1.8429, + "step": 12581 + }, + { + "epoch": 3.861878453038674, + "grad_norm": 0.23636581003665924, + "learning_rate": 7.026597074172982e-05, + "loss": 1.7662, + "step": 12582 + }, + { + "epoch": 3.8621853898096994, + "grad_norm": 0.22599349915981293, + "learning_rate": 7.026142667938156e-05, + "loss": 1.7199, + "step": 12583 + }, + { + "epoch": 3.8624923265807243, + "grad_norm": 0.2504875659942627, + "learning_rate": 7.025688241679802e-05, + "loss": 1.8473, + "step": 12584 + }, + { + "epoch": 3.8627992633517496, + "grad_norm": 0.3012976348400116, + "learning_rate": 7.025233795402408e-05, + "loss": 1.8715, + "step": 12585 + }, + { + "epoch": 3.8631062001227745, + "grad_norm": 0.31703677773475647, + "learning_rate": 7.024779329110469e-05, + "loss": 1.8143, + "step": 12586 + }, + { + "epoch": 3.8634131368938, + "grad_norm": 
0.27287593483924866, + "learning_rate": 7.024324842808472e-05, + "loss": 1.7227, + "step": 12587 + }, + { + "epoch": 3.863720073664825, + "grad_norm": 0.24663801491260529, + "learning_rate": 7.02387033650091e-05, + "loss": 1.7529, + "step": 12588 + }, + { + "epoch": 3.86402701043585, + "grad_norm": 0.26127147674560547, + "learning_rate": 7.023415810192277e-05, + "loss": 1.7629, + "step": 12589 + }, + { + "epoch": 3.8643339472068754, + "grad_norm": 0.3457142114639282, + "learning_rate": 7.022961263887062e-05, + "loss": 1.8212, + "step": 12590 + }, + { + "epoch": 3.8646408839779003, + "grad_norm": 0.3296070694923401, + "learning_rate": 7.022506697589759e-05, + "loss": 1.7907, + "step": 12591 + }, + { + "epoch": 3.8649478207489256, + "grad_norm": 0.29474303126335144, + "learning_rate": 7.022052111304858e-05, + "loss": 1.7866, + "step": 12592 + }, + { + "epoch": 3.865254757519951, + "grad_norm": 0.2535403072834015, + "learning_rate": 7.021597505036852e-05, + "loss": 1.7607, + "step": 12593 + }, + { + "epoch": 3.8655616942909763, + "grad_norm": 0.26691222190856934, + "learning_rate": 7.021142878790237e-05, + "loss": 1.8063, + "step": 12594 + }, + { + "epoch": 3.865868631062001, + "grad_norm": 0.2784755229949951, + "learning_rate": 7.020688232569502e-05, + "loss": 1.8065, + "step": 12595 + }, + { + "epoch": 3.8661755678330265, + "grad_norm": 0.23714317381381989, + "learning_rate": 7.020233566379142e-05, + "loss": 1.8317, + "step": 12596 + }, + { + "epoch": 3.8664825046040514, + "grad_norm": 0.25010553002357483, + "learning_rate": 7.019778880223649e-05, + "loss": 1.8493, + "step": 12597 + }, + { + "epoch": 3.8667894413750767, + "grad_norm": 0.2798489034175873, + "learning_rate": 7.01932417410752e-05, + "loss": 1.8134, + "step": 12598 + }, + { + "epoch": 3.867096378146102, + "grad_norm": 0.26199260354042053, + "learning_rate": 7.018869448035243e-05, + "loss": 1.6931, + "step": 12599 + }, + { + "epoch": 3.867403314917127, + "grad_norm": 0.24582891166210175, + 
"learning_rate": 7.018414702011314e-05, + "loss": 1.8076, + "step": 12600 + }, + { + "epoch": 3.8677102516881523, + "grad_norm": 0.25493237376213074, + "learning_rate": 7.01795993604023e-05, + "loss": 1.7851, + "step": 12601 + }, + { + "epoch": 3.868017188459177, + "grad_norm": 0.2607674300670624, + "learning_rate": 7.017505150126483e-05, + "loss": 1.7285, + "step": 12602 + }, + { + "epoch": 3.8683241252302025, + "grad_norm": 0.23629581928253174, + "learning_rate": 7.017050344274568e-05, + "loss": 1.8254, + "step": 12603 + }, + { + "epoch": 3.868631062001228, + "grad_norm": 0.3129318058490753, + "learning_rate": 7.016595518488979e-05, + "loss": 1.7914, + "step": 12604 + }, + { + "epoch": 3.8689379987722528, + "grad_norm": 0.3178271949291229, + "learning_rate": 7.01614067277421e-05, + "loss": 1.8139, + "step": 12605 + }, + { + "epoch": 3.869244935543278, + "grad_norm": 0.3230711817741394, + "learning_rate": 7.015685807134757e-05, + "loss": 1.8203, + "step": 12606 + }, + { + "epoch": 3.869551872314303, + "grad_norm": 0.26339825987815857, + "learning_rate": 7.015230921575118e-05, + "loss": 1.8022, + "step": 12607 + }, + { + "epoch": 3.8698588090853283, + "grad_norm": 0.25337356328964233, + "learning_rate": 7.014776016099785e-05, + "loss": 1.7779, + "step": 12608 + }, + { + "epoch": 3.8701657458563536, + "grad_norm": 0.2506195306777954, + "learning_rate": 7.014321090713253e-05, + "loss": 1.7858, + "step": 12609 + }, + { + "epoch": 3.870472682627379, + "grad_norm": 0.26249951124191284, + "learning_rate": 7.013866145420021e-05, + "loss": 1.8051, + "step": 12610 + }, + { + "epoch": 3.870779619398404, + "grad_norm": 0.25666534900665283, + "learning_rate": 7.013411180224581e-05, + "loss": 1.7945, + "step": 12611 + }, + { + "epoch": 3.871086556169429, + "grad_norm": 0.23901648819446564, + "learning_rate": 7.012956195131433e-05, + "loss": 1.7844, + "step": 12612 + }, + { + "epoch": 3.871393492940454, + "grad_norm": 0.26814451813697815, + "learning_rate": 
7.012501190145071e-05, + "loss": 1.7713, + "step": 12613 + }, + { + "epoch": 3.8717004297114794, + "grad_norm": 0.28377315402030945, + "learning_rate": 7.012046165269995e-05, + "loss": 1.7866, + "step": 12614 + }, + { + "epoch": 3.8720073664825048, + "grad_norm": 0.2751680612564087, + "learning_rate": 7.011591120510699e-05, + "loss": 1.7215, + "step": 12615 + }, + { + "epoch": 3.8723143032535297, + "grad_norm": 0.21988113224506378, + "learning_rate": 7.011136055871679e-05, + "loss": 1.8009, + "step": 12616 + }, + { + "epoch": 3.872621240024555, + "grad_norm": 0.26462143659591675, + "learning_rate": 7.010680971357434e-05, + "loss": 1.7618, + "step": 12617 + }, + { + "epoch": 3.87292817679558, + "grad_norm": 0.29054632782936096, + "learning_rate": 7.010225866972462e-05, + "loss": 1.7549, + "step": 12618 + }, + { + "epoch": 3.873235113566605, + "grad_norm": 0.31341224908828735, + "learning_rate": 7.00977074272126e-05, + "loss": 1.8827, + "step": 12619 + }, + { + "epoch": 3.8735420503376305, + "grad_norm": 0.24252115190029144, + "learning_rate": 7.009315598608324e-05, + "loss": 1.7544, + "step": 12620 + }, + { + "epoch": 3.873848987108656, + "grad_norm": 0.30036893486976624, + "learning_rate": 7.008860434638154e-05, + "loss": 1.7465, + "step": 12621 + }, + { + "epoch": 3.8741559238796808, + "grad_norm": 0.3217438757419586, + "learning_rate": 7.00840525081525e-05, + "loss": 1.72, + "step": 12622 + }, + { + "epoch": 3.874462860650706, + "grad_norm": 0.22507290542125702, + "learning_rate": 7.007950047144105e-05, + "loss": 1.7177, + "step": 12623 + }, + { + "epoch": 3.874769797421731, + "grad_norm": 0.3014441728591919, + "learning_rate": 7.007494823629224e-05, + "loss": 1.7502, + "step": 12624 + }, + { + "epoch": 3.8750767341927563, + "grad_norm": 0.3836904466152191, + "learning_rate": 7.0070395802751e-05, + "loss": 1.7971, + "step": 12625 + }, + { + "epoch": 3.8753836709637817, + "grad_norm": 0.33565691113471985, + "learning_rate": 7.006584317086235e-05, + "loss": 1.7439, 
+ "step": 12626 + }, + { + "epoch": 3.8756906077348066, + "grad_norm": 0.2292134314775467, + "learning_rate": 7.006129034067128e-05, + "loss": 1.7998, + "step": 12627 + }, + { + "epoch": 3.875997544505832, + "grad_norm": 0.26385873556137085, + "learning_rate": 7.005673731222277e-05, + "loss": 1.7914, + "step": 12628 + }, + { + "epoch": 3.876304481276857, + "grad_norm": 0.2854950428009033, + "learning_rate": 7.005218408556184e-05, + "loss": 1.7761, + "step": 12629 + }, + { + "epoch": 3.876611418047882, + "grad_norm": 0.34260645508766174, + "learning_rate": 7.004763066073348e-05, + "loss": 1.8015, + "step": 12630 + }, + { + "epoch": 3.8769183548189075, + "grad_norm": 0.3223683834075928, + "learning_rate": 7.004307703778267e-05, + "loss": 1.7453, + "step": 12631 + }, + { + "epoch": 3.8772252915899323, + "grad_norm": 0.24715089797973633, + "learning_rate": 7.003852321675442e-05, + "loss": 1.7813, + "step": 12632 + }, + { + "epoch": 3.8775322283609577, + "grad_norm": 0.22822390496730804, + "learning_rate": 7.003396919769377e-05, + "loss": 1.7982, + "step": 12633 + }, + { + "epoch": 3.8778391651319826, + "grad_norm": 0.24125081300735474, + "learning_rate": 7.002941498064565e-05, + "loss": 1.8606, + "step": 12634 + }, + { + "epoch": 3.878146101903008, + "grad_norm": 0.23512506484985352, + "learning_rate": 7.002486056565513e-05, + "loss": 1.7469, + "step": 12635 + }, + { + "epoch": 3.8784530386740332, + "grad_norm": 0.2908322215080261, + "learning_rate": 7.00203059527672e-05, + "loss": 1.796, + "step": 12636 + }, + { + "epoch": 3.8787599754450586, + "grad_norm": 0.22931252419948578, + "learning_rate": 7.001575114202689e-05, + "loss": 1.7482, + "step": 12637 + }, + { + "epoch": 3.8790669122160835, + "grad_norm": 0.22574284672737122, + "learning_rate": 7.001119613347917e-05, + "loss": 1.7698, + "step": 12638 + }, + { + "epoch": 3.879373848987109, + "grad_norm": 0.23129726946353912, + "learning_rate": 7.000664092716909e-05, + "loss": 1.776, + "step": 12639 + }, + { + "epoch": 
3.8796807857581337, + "grad_norm": 0.2763366401195526, + "learning_rate": 7.000208552314165e-05, + "loss": 1.7814, + "step": 12640 + }, + { + "epoch": 3.879987722529159, + "grad_norm": 0.29870158433914185, + "learning_rate": 6.99975299214419e-05, + "loss": 1.7467, + "step": 12641 + }, + { + "epoch": 3.8802946593001844, + "grad_norm": 0.33574381470680237, + "learning_rate": 6.999297412211484e-05, + "loss": 1.8159, + "step": 12642 + }, + { + "epoch": 3.8806015960712092, + "grad_norm": 0.30309897661209106, + "learning_rate": 6.998841812520547e-05, + "loss": 1.8454, + "step": 12643 + }, + { + "epoch": 3.8809085328422346, + "grad_norm": 0.27399247884750366, + "learning_rate": 6.998386193075886e-05, + "loss": 1.7956, + "step": 12644 + }, + { + "epoch": 3.8812154696132595, + "grad_norm": 0.28649580478668213, + "learning_rate": 6.997930553881998e-05, + "loss": 1.8308, + "step": 12645 + }, + { + "epoch": 3.881522406384285, + "grad_norm": 0.2716052532196045, + "learning_rate": 6.997474894943392e-05, + "loss": 1.7698, + "step": 12646 + }, + { + "epoch": 3.88182934315531, + "grad_norm": 0.21380536258220673, + "learning_rate": 6.997019216264567e-05, + "loss": 1.7028, + "step": 12647 + }, + { + "epoch": 3.882136279926335, + "grad_norm": 0.25262731313705444, + "learning_rate": 6.996563517850028e-05, + "loss": 1.8236, + "step": 12648 + }, + { + "epoch": 3.8824432166973604, + "grad_norm": 0.21150052547454834, + "learning_rate": 6.996107799704277e-05, + "loss": 1.7437, + "step": 12649 + }, + { + "epoch": 3.8827501534683853, + "grad_norm": 0.2614554464817047, + "learning_rate": 6.995652061831821e-05, + "loss": 1.7575, + "step": 12650 + }, + { + "epoch": 3.8830570902394106, + "grad_norm": 0.214684396982193, + "learning_rate": 6.995196304237159e-05, + "loss": 1.8195, + "step": 12651 + }, + { + "epoch": 3.883364027010436, + "grad_norm": 0.2226872444152832, + "learning_rate": 6.994740526924798e-05, + "loss": 1.7556, + "step": 12652 + }, + { + "epoch": 3.8836709637814613, + "grad_norm": 
0.22270764410495758, + "learning_rate": 6.994284729899246e-05, + "loss": 1.7536, + "step": 12653 + }, + { + "epoch": 3.883977900552486, + "grad_norm": 0.20683564245700836, + "learning_rate": 6.993828913165e-05, + "loss": 1.7728, + "step": 12654 + }, + { + "epoch": 3.8842848373235115, + "grad_norm": 0.23667018115520477, + "learning_rate": 6.993373076726568e-05, + "loss": 1.7819, + "step": 12655 + }, + { + "epoch": 3.8845917740945364, + "grad_norm": 0.2265234887599945, + "learning_rate": 6.992917220588455e-05, + "loss": 1.7502, + "step": 12656 + }, + { + "epoch": 3.8848987108655617, + "grad_norm": 0.24490754306316376, + "learning_rate": 6.992461344755168e-05, + "loss": 1.7513, + "step": 12657 + }, + { + "epoch": 3.885205647636587, + "grad_norm": 0.23001348972320557, + "learning_rate": 6.992005449231208e-05, + "loss": 1.733, + "step": 12658 + }, + { + "epoch": 3.885512584407612, + "grad_norm": 0.25424695014953613, + "learning_rate": 6.991549534021084e-05, + "loss": 1.7621, + "step": 12659 + }, + { + "epoch": 3.8858195211786373, + "grad_norm": 0.25552862882614136, + "learning_rate": 6.991093599129299e-05, + "loss": 1.7974, + "step": 12660 + }, + { + "epoch": 3.886126457949662, + "grad_norm": 0.26876959204673767, + "learning_rate": 6.99063764456036e-05, + "loss": 1.7924, + "step": 12661 + }, + { + "epoch": 3.8864333947206875, + "grad_norm": 0.2754429578781128, + "learning_rate": 6.990181670318772e-05, + "loss": 1.7981, + "step": 12662 + }, + { + "epoch": 3.886740331491713, + "grad_norm": 0.281818687915802, + "learning_rate": 6.989725676409044e-05, + "loss": 1.7328, + "step": 12663 + }, + { + "epoch": 3.8870472682627377, + "grad_norm": 0.21676552295684814, + "learning_rate": 6.989269662835681e-05, + "loss": 1.7376, + "step": 12664 + }, + { + "epoch": 3.887354205033763, + "grad_norm": 0.276115745306015, + "learning_rate": 6.98881362960319e-05, + "loss": 1.7784, + "step": 12665 + }, + { + "epoch": 3.887661141804788, + "grad_norm": 0.2806364893913269, + "learning_rate": 
6.988357576716075e-05, + "loss": 1.8078, + "step": 12666 + }, + { + "epoch": 3.8879680785758133, + "grad_norm": 0.27620184421539307, + "learning_rate": 6.987901504178845e-05, + "loss": 1.8115, + "step": 12667 + }, + { + "epoch": 3.8882750153468386, + "grad_norm": 0.23845402896404266, + "learning_rate": 6.987445411996009e-05, + "loss": 1.7485, + "step": 12668 + }, + { + "epoch": 3.888581952117864, + "grad_norm": 0.25063586235046387, + "learning_rate": 6.986989300172071e-05, + "loss": 1.7663, + "step": 12669 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2417975515127182, + "learning_rate": 6.98653316871154e-05, + "loss": 1.7562, + "step": 12670 + }, + { + "epoch": 3.889195825659914, + "grad_norm": 0.24952733516693115, + "learning_rate": 6.986077017618923e-05, + "loss": 1.8063, + "step": 12671 + }, + { + "epoch": 3.889502762430939, + "grad_norm": 0.25847554206848145, + "learning_rate": 6.985620846898732e-05, + "loss": 1.7722, + "step": 12672 + }, + { + "epoch": 3.8898096992019644, + "grad_norm": 0.23762650787830353, + "learning_rate": 6.985164656555471e-05, + "loss": 1.8368, + "step": 12673 + }, + { + "epoch": 3.8901166359729897, + "grad_norm": 0.25346314907073975, + "learning_rate": 6.984708446593648e-05, + "loss": 1.7957, + "step": 12674 + }, + { + "epoch": 3.8904235727440146, + "grad_norm": 0.2466745674610138, + "learning_rate": 6.984252217017774e-05, + "loss": 1.8286, + "step": 12675 + }, + { + "epoch": 3.89073050951504, + "grad_norm": 0.25413215160369873, + "learning_rate": 6.983795967832356e-05, + "loss": 1.7711, + "step": 12676 + }, + { + "epoch": 3.891037446286065, + "grad_norm": 0.2315925806760788, + "learning_rate": 6.983339699041903e-05, + "loss": 1.7546, + "step": 12677 + }, + { + "epoch": 3.89134438305709, + "grad_norm": 0.26473405957221985, + "learning_rate": 6.982883410650925e-05, + "loss": 1.7563, + "step": 12678 + }, + { + "epoch": 3.8916513198281155, + "grad_norm": 0.24176491796970367, + "learning_rate": 6.982427102663932e-05, + "loss": 
1.7734, + "step": 12679 + }, + { + "epoch": 3.891958256599141, + "grad_norm": 0.25444844365119934, + "learning_rate": 6.98197077508543e-05, + "loss": 1.803, + "step": 12680 + }, + { + "epoch": 3.8922651933701657, + "grad_norm": 0.25234144926071167, + "learning_rate": 6.981514427919933e-05, + "loss": 1.8099, + "step": 12681 + }, + { + "epoch": 3.892572130141191, + "grad_norm": 0.2571142315864563, + "learning_rate": 6.98105806117195e-05, + "loss": 1.8618, + "step": 12682 + }, + { + "epoch": 3.892879066912216, + "grad_norm": 0.21235275268554688, + "learning_rate": 6.980601674845988e-05, + "loss": 1.7121, + "step": 12683 + }, + { + "epoch": 3.8931860036832413, + "grad_norm": 0.27078527212142944, + "learning_rate": 6.98014526894656e-05, + "loss": 1.8103, + "step": 12684 + }, + { + "epoch": 3.8934929404542666, + "grad_norm": 0.3198096454143524, + "learning_rate": 6.979688843478176e-05, + "loss": 1.7529, + "step": 12685 + }, + { + "epoch": 3.8937998772252915, + "grad_norm": 0.3170493245124817, + "learning_rate": 6.979232398445345e-05, + "loss": 1.7629, + "step": 12686 + }, + { + "epoch": 3.894106813996317, + "grad_norm": 0.2495265007019043, + "learning_rate": 6.978775933852582e-05, + "loss": 1.7407, + "step": 12687 + }, + { + "epoch": 3.8944137507673418, + "grad_norm": 0.24570141732692719, + "learning_rate": 6.978319449704395e-05, + "loss": 1.7688, + "step": 12688 + }, + { + "epoch": 3.894720687538367, + "grad_norm": 0.23956388235092163, + "learning_rate": 6.977862946005295e-05, + "loss": 1.7115, + "step": 12689 + }, + { + "epoch": 3.8950276243093924, + "grad_norm": 0.21548940241336823, + "learning_rate": 6.977406422759793e-05, + "loss": 1.7611, + "step": 12690 + }, + { + "epoch": 3.8953345610804173, + "grad_norm": 0.25797295570373535, + "learning_rate": 6.976949879972403e-05, + "loss": 1.7688, + "step": 12691 + }, + { + "epoch": 3.8956414978514426, + "grad_norm": 0.28257784247398376, + "learning_rate": 6.976493317647636e-05, + "loss": 1.7517, + "step": 12692 + }, + { + 
"epoch": 3.8959484346224675, + "grad_norm": 0.23828580975532532, + "learning_rate": 6.976036735790004e-05, + "loss": 1.7877, + "step": 12693 + }, + { + "epoch": 3.896255371393493, + "grad_norm": 0.22915001213550568, + "learning_rate": 6.975580134404017e-05, + "loss": 1.7741, + "step": 12694 + }, + { + "epoch": 3.896562308164518, + "grad_norm": 0.22975030541419983, + "learning_rate": 6.97512351349419e-05, + "loss": 1.772, + "step": 12695 + }, + { + "epoch": 3.8968692449355435, + "grad_norm": 0.29515185952186584, + "learning_rate": 6.974666873065034e-05, + "loss": 1.8001, + "step": 12696 + }, + { + "epoch": 3.8971761817065684, + "grad_norm": 0.26904794573783875, + "learning_rate": 6.974210213121064e-05, + "loss": 1.7069, + "step": 12697 + }, + { + "epoch": 3.8974831184775938, + "grad_norm": 0.2549479603767395, + "learning_rate": 6.97375353366679e-05, + "loss": 1.7419, + "step": 12698 + }, + { + "epoch": 3.8977900552486187, + "grad_norm": 0.23750101029872894, + "learning_rate": 6.973296834706729e-05, + "loss": 1.7815, + "step": 12699 + }, + { + "epoch": 3.898096992019644, + "grad_norm": 0.23529762029647827, + "learning_rate": 6.972840116245389e-05, + "loss": 1.8139, + "step": 12700 + }, + { + "epoch": 3.8984039287906693, + "grad_norm": 0.3212098777294159, + "learning_rate": 6.97238337828729e-05, + "loss": 1.7507, + "step": 12701 + }, + { + "epoch": 3.898710865561694, + "grad_norm": 0.3167687952518463, + "learning_rate": 6.971926620836941e-05, + "loss": 1.8062, + "step": 12702 + }, + { + "epoch": 3.8990178023327196, + "grad_norm": 0.31298309564590454, + "learning_rate": 6.971469843898855e-05, + "loss": 1.8127, + "step": 12703 + }, + { + "epoch": 3.8993247391037444, + "grad_norm": 0.2537378668785095, + "learning_rate": 6.971013047477551e-05, + "loss": 1.7675, + "step": 12704 + }, + { + "epoch": 3.8996316758747698, + "grad_norm": 0.24292805790901184, + "learning_rate": 6.97055623157754e-05, + "loss": 1.8004, + "step": 12705 + }, + { + "epoch": 3.899938612645795, + 
"grad_norm": 0.2929537296295166, + "learning_rate": 6.970099396203338e-05, + "loss": 1.7963, + "step": 12706 + }, + { + "epoch": 3.90024554941682, + "grad_norm": 0.30531612038612366, + "learning_rate": 6.969642541359459e-05, + "loss": 1.7347, + "step": 12707 + }, + { + "epoch": 3.9005524861878453, + "grad_norm": 0.3138202726840973, + "learning_rate": 6.969185667050417e-05, + "loss": 1.7987, + "step": 12708 + }, + { + "epoch": 3.9008594229588702, + "grad_norm": 0.2366247922182083, + "learning_rate": 6.96872877328073e-05, + "loss": 1.7671, + "step": 12709 + }, + { + "epoch": 3.9011663597298956, + "grad_norm": 0.26251721382141113, + "learning_rate": 6.96827186005491e-05, + "loss": 1.7657, + "step": 12710 + }, + { + "epoch": 3.901473296500921, + "grad_norm": 0.32497119903564453, + "learning_rate": 6.967814927377474e-05, + "loss": 1.7873, + "step": 12711 + }, + { + "epoch": 3.9017802332719462, + "grad_norm": 0.3290228843688965, + "learning_rate": 6.967357975252939e-05, + "loss": 1.8076, + "step": 12712 + }, + { + "epoch": 3.902087170042971, + "grad_norm": 0.2737300992012024, + "learning_rate": 6.966901003685817e-05, + "loss": 1.7405, + "step": 12713 + }, + { + "epoch": 3.9023941068139965, + "grad_norm": 0.25465309619903564, + "learning_rate": 6.966444012680626e-05, + "loss": 1.8063, + "step": 12714 + }, + { + "epoch": 3.9027010435850213, + "grad_norm": 0.2397255003452301, + "learning_rate": 6.965987002241885e-05, + "loss": 1.8079, + "step": 12715 + }, + { + "epoch": 3.9030079803560467, + "grad_norm": 0.23115718364715576, + "learning_rate": 6.965529972374108e-05, + "loss": 1.8032, + "step": 12716 + }, + { + "epoch": 3.903314917127072, + "grad_norm": 0.2536461055278778, + "learning_rate": 6.96507292308181e-05, + "loss": 1.7477, + "step": 12717 + }, + { + "epoch": 3.903621853898097, + "grad_norm": 0.27151185274124146, + "learning_rate": 6.96461585436951e-05, + "loss": 1.75, + "step": 12718 + }, + { + "epoch": 3.9039287906691222, + "grad_norm": 0.26894113421440125, + 
"learning_rate": 6.964158766241726e-05, + "loss": 1.7816, + "step": 12719 + }, + { + "epoch": 3.904235727440147, + "grad_norm": 0.23541375994682312, + "learning_rate": 6.963701658702972e-05, + "loss": 1.7991, + "step": 12720 + }, + { + "epoch": 3.9045426642111725, + "grad_norm": 0.22142915427684784, + "learning_rate": 6.96324453175777e-05, + "loss": 1.7245, + "step": 12721 + }, + { + "epoch": 3.904849600982198, + "grad_norm": 0.32864269614219666, + "learning_rate": 6.962787385410632e-05, + "loss": 1.7631, + "step": 12722 + }, + { + "epoch": 3.9051565377532227, + "grad_norm": 0.23657776415348053, + "learning_rate": 6.96233021966608e-05, + "loss": 1.8081, + "step": 12723 + }, + { + "epoch": 3.905463474524248, + "grad_norm": 0.24790632724761963, + "learning_rate": 6.961873034528629e-05, + "loss": 1.7193, + "step": 12724 + }, + { + "epoch": 3.905770411295273, + "grad_norm": 0.2517886459827423, + "learning_rate": 6.961415830002801e-05, + "loss": 1.7785, + "step": 12725 + }, + { + "epoch": 3.9060773480662982, + "grad_norm": 0.2340923547744751, + "learning_rate": 6.960958606093113e-05, + "loss": 1.7632, + "step": 12726 + }, + { + "epoch": 3.9063842848373236, + "grad_norm": 0.23260441422462463, + "learning_rate": 6.960501362804079e-05, + "loss": 1.7865, + "step": 12727 + }, + { + "epoch": 3.906691221608349, + "grad_norm": 0.22616329789161682, + "learning_rate": 6.960044100140224e-05, + "loss": 1.7851, + "step": 12728 + }, + { + "epoch": 3.906998158379374, + "grad_norm": 0.2849951982498169, + "learning_rate": 6.959586818106064e-05, + "loss": 1.8618, + "step": 12729 + }, + { + "epoch": 3.907305095150399, + "grad_norm": 0.3279374837875366, + "learning_rate": 6.95912951670612e-05, + "loss": 1.8563, + "step": 12730 + }, + { + "epoch": 3.907612031921424, + "grad_norm": 0.24359555542469025, + "learning_rate": 6.958672195944906e-05, + "loss": 1.7604, + "step": 12731 + }, + { + "epoch": 3.9079189686924494, + "grad_norm": 0.30881935358047485, + "learning_rate": 
6.958214855826947e-05, + "loss": 1.8463, + "step": 12732 + }, + { + "epoch": 3.9082259054634747, + "grad_norm": 0.25361543893814087, + "learning_rate": 6.957757496356763e-05, + "loss": 1.7831, + "step": 12733 + }, + { + "epoch": 3.9085328422344996, + "grad_norm": 0.26763513684272766, + "learning_rate": 6.957300117538869e-05, + "loss": 1.8383, + "step": 12734 + }, + { + "epoch": 3.908839779005525, + "grad_norm": 0.2238057255744934, + "learning_rate": 6.95684271937779e-05, + "loss": 1.7702, + "step": 12735 + }, + { + "epoch": 3.90914671577655, + "grad_norm": 0.22110232710838318, + "learning_rate": 6.956385301878045e-05, + "loss": 1.7931, + "step": 12736 + }, + { + "epoch": 3.909453652547575, + "grad_norm": 0.23765070736408234, + "learning_rate": 6.955927865044152e-05, + "loss": 1.7212, + "step": 12737 + }, + { + "epoch": 3.9097605893186005, + "grad_norm": 0.22324508428573608, + "learning_rate": 6.955470408880633e-05, + "loss": 1.7161, + "step": 12738 + }, + { + "epoch": 3.9100675260896254, + "grad_norm": 0.22485347092151642, + "learning_rate": 6.955012933392012e-05, + "loss": 1.7374, + "step": 12739 + }, + { + "epoch": 3.9103744628606507, + "grad_norm": 0.28046715259552, + "learning_rate": 6.954555438582806e-05, + "loss": 1.9264, + "step": 12740 + }, + { + "epoch": 3.9106813996316756, + "grad_norm": 0.26391276717185974, + "learning_rate": 6.954097924457536e-05, + "loss": 1.7343, + "step": 12741 + }, + { + "epoch": 3.910988336402701, + "grad_norm": 0.29596614837646484, + "learning_rate": 6.953640391020726e-05, + "loss": 1.8111, + "step": 12742 + }, + { + "epoch": 3.9112952731737263, + "grad_norm": 0.2709808051586151, + "learning_rate": 6.953182838276896e-05, + "loss": 1.7776, + "step": 12743 + }, + { + "epoch": 3.9116022099447516, + "grad_norm": 0.2585100531578064, + "learning_rate": 6.952725266230571e-05, + "loss": 1.7774, + "step": 12744 + }, + { + "epoch": 3.9119091467157765, + "grad_norm": 0.26490530371665955, + "learning_rate": 6.952267674886268e-05, + "loss": 
1.78, + "step": 12745 + }, + { + "epoch": 3.912216083486802, + "grad_norm": 0.23654767870903015, + "learning_rate": 6.951810064248512e-05, + "loss": 1.8263, + "step": 12746 + }, + { + "epoch": 3.9125230202578267, + "grad_norm": 0.2495296597480774, + "learning_rate": 6.951352434321826e-05, + "loss": 1.787, + "step": 12747 + }, + { + "epoch": 3.912829957028852, + "grad_norm": 0.24038313329219818, + "learning_rate": 6.950894785110728e-05, + "loss": 1.774, + "step": 12748 + }, + { + "epoch": 3.9131368937998774, + "grad_norm": 0.23738732933998108, + "learning_rate": 6.950437116619749e-05, + "loss": 1.7401, + "step": 12749 + }, + { + "epoch": 3.9134438305709023, + "grad_norm": 0.28192025423049927, + "learning_rate": 6.949979428853405e-05, + "loss": 1.8416, + "step": 12750 + }, + { + "epoch": 3.9137507673419276, + "grad_norm": 0.30579057335853577, + "learning_rate": 6.949521721816221e-05, + "loss": 1.7404, + "step": 12751 + }, + { + "epoch": 3.9140577041129525, + "grad_norm": 0.23972894251346588, + "learning_rate": 6.949063995512721e-05, + "loss": 1.7543, + "step": 12752 + }, + { + "epoch": 3.914364640883978, + "grad_norm": 0.2837793231010437, + "learning_rate": 6.94860624994743e-05, + "loss": 1.7779, + "step": 12753 + }, + { + "epoch": 3.914671577655003, + "grad_norm": 0.3344916105270386, + "learning_rate": 6.948148485124868e-05, + "loss": 1.7803, + "step": 12754 + }, + { + "epoch": 3.9149785144260285, + "grad_norm": 0.24271291494369507, + "learning_rate": 6.94769070104956e-05, + "loss": 1.7362, + "step": 12755 + }, + { + "epoch": 3.9152854511970534, + "grad_norm": 0.25299304723739624, + "learning_rate": 6.947232897726031e-05, + "loss": 1.7685, + "step": 12756 + }, + { + "epoch": 3.9155923879680787, + "grad_norm": 0.24766205251216888, + "learning_rate": 6.946775075158807e-05, + "loss": 1.829, + "step": 12757 + }, + { + "epoch": 3.9158993247391036, + "grad_norm": 0.2508428692817688, + "learning_rate": 6.94631723335241e-05, + "loss": 1.809, + "step": 12758 + }, + { + 
"epoch": 3.916206261510129, + "grad_norm": 0.2172096222639084, + "learning_rate": 6.945859372311365e-05, + "loss": 1.7376, + "step": 12759 + }, + { + "epoch": 3.9165131982811543, + "grad_norm": 0.28976425528526306, + "learning_rate": 6.945401492040198e-05, + "loss": 1.8229, + "step": 12760 + }, + { + "epoch": 3.916820135052179, + "grad_norm": 0.3528063893318176, + "learning_rate": 6.944943592543432e-05, + "loss": 1.7559, + "step": 12761 + }, + { + "epoch": 3.9171270718232045, + "grad_norm": 0.46312370896339417, + "learning_rate": 6.944485673825595e-05, + "loss": 1.7664, + "step": 12762 + }, + { + "epoch": 3.9174340085942294, + "grad_norm": 0.4466164708137512, + "learning_rate": 6.94402773589121e-05, + "loss": 1.7833, + "step": 12763 + }, + { + "epoch": 3.9177409453652547, + "grad_norm": 0.2637740969657898, + "learning_rate": 6.943569778744804e-05, + "loss": 1.818, + "step": 12764 + }, + { + "epoch": 3.91804788213628, + "grad_norm": 0.37515267729759216, + "learning_rate": 6.943111802390901e-05, + "loss": 1.7898, + "step": 12765 + }, + { + "epoch": 3.918354818907305, + "grad_norm": 0.45146289467811584, + "learning_rate": 6.942653806834029e-05, + "loss": 1.7797, + "step": 12766 + }, + { + "epoch": 3.9186617556783303, + "grad_norm": 0.2809859812259674, + "learning_rate": 6.942195792078712e-05, + "loss": 1.7836, + "step": 12767 + }, + { + "epoch": 3.918968692449355, + "grad_norm": 0.3606306314468384, + "learning_rate": 6.94173775812948e-05, + "loss": 1.7657, + "step": 12768 + }, + { + "epoch": 3.9192756292203805, + "grad_norm": 0.49528738856315613, + "learning_rate": 6.941279704990857e-05, + "loss": 1.7628, + "step": 12769 + }, + { + "epoch": 3.919582565991406, + "grad_norm": 0.3484322428703308, + "learning_rate": 6.940821632667371e-05, + "loss": 1.7939, + "step": 12770 + }, + { + "epoch": 3.919889502762431, + "grad_norm": 0.2479606419801712, + "learning_rate": 6.940363541163546e-05, + "loss": 1.813, + "step": 12771 + }, + { + "epoch": 3.920196439533456, + "grad_norm": 
0.3491765558719635, + "learning_rate": 6.939905430483911e-05, + "loss": 1.7338, + "step": 12772 + }, + { + "epoch": 3.9205033763044814, + "grad_norm": 0.291810005903244, + "learning_rate": 6.939447300632995e-05, + "loss": 1.7445, + "step": 12773 + }, + { + "epoch": 3.9208103130755063, + "grad_norm": 0.2467527985572815, + "learning_rate": 6.938989151615324e-05, + "loss": 1.8462, + "step": 12774 + }, + { + "epoch": 3.9211172498465316, + "grad_norm": 0.35656824707984924, + "learning_rate": 6.938530983435426e-05, + "loss": 1.7751, + "step": 12775 + }, + { + "epoch": 3.921424186617557, + "grad_norm": 0.31269776821136475, + "learning_rate": 6.938072796097828e-05, + "loss": 1.7714, + "step": 12776 + }, + { + "epoch": 3.921731123388582, + "grad_norm": 0.2082831859588623, + "learning_rate": 6.937614589607058e-05, + "loss": 1.7263, + "step": 12777 + }, + { + "epoch": 3.922038060159607, + "grad_norm": 0.27583765983581543, + "learning_rate": 6.937156363967646e-05, + "loss": 1.6822, + "step": 12778 + }, + { + "epoch": 3.922344996930632, + "grad_norm": 0.32773876190185547, + "learning_rate": 6.93669811918412e-05, + "loss": 1.7792, + "step": 12779 + }, + { + "epoch": 3.9226519337016574, + "grad_norm": 0.2583121657371521, + "learning_rate": 6.936239855261007e-05, + "loss": 1.7812, + "step": 12780 + }, + { + "epoch": 3.9229588704726828, + "grad_norm": 0.245570570230484, + "learning_rate": 6.935781572202836e-05, + "loss": 1.7252, + "step": 12781 + }, + { + "epoch": 3.9232658072437077, + "grad_norm": 0.2379419505596161, + "learning_rate": 6.935323270014138e-05, + "loss": 1.7485, + "step": 12782 + }, + { + "epoch": 3.923572744014733, + "grad_norm": 0.2239784598350525, + "learning_rate": 6.934864948699439e-05, + "loss": 1.7444, + "step": 12783 + }, + { + "epoch": 3.923879680785758, + "grad_norm": 0.2366618812084198, + "learning_rate": 6.934406608263274e-05, + "loss": 1.777, + "step": 12784 + }, + { + "epoch": 3.924186617556783, + "grad_norm": 0.22583791613578796, + "learning_rate": 
6.933948248710169e-05, + "loss": 1.7291, + "step": 12785 + }, + { + "epoch": 3.9244935543278086, + "grad_norm": 0.24141047894954681, + "learning_rate": 6.933489870044651e-05, + "loss": 1.7748, + "step": 12786 + }, + { + "epoch": 3.924800491098834, + "grad_norm": 0.2389962524175644, + "learning_rate": 6.933031472271255e-05, + "loss": 1.7957, + "step": 12787 + }, + { + "epoch": 3.925107427869859, + "grad_norm": 0.25230300426483154, + "learning_rate": 6.932573055394509e-05, + "loss": 1.7621, + "step": 12788 + }, + { + "epoch": 3.925414364640884, + "grad_norm": 0.23894043266773224, + "learning_rate": 6.932114619418941e-05, + "loss": 1.7285, + "step": 12789 + }, + { + "epoch": 3.925721301411909, + "grad_norm": 0.2650291919708252, + "learning_rate": 6.931656164349086e-05, + "loss": 1.7613, + "step": 12790 + }, + { + "epoch": 3.9260282381829343, + "grad_norm": 0.20616789162158966, + "learning_rate": 6.931197690189472e-05, + "loss": 1.7505, + "step": 12791 + }, + { + "epoch": 3.9263351749539597, + "grad_norm": 0.23915675282478333, + "learning_rate": 6.930739196944633e-05, + "loss": 1.7477, + "step": 12792 + }, + { + "epoch": 3.9266421117249846, + "grad_norm": 0.2522687613964081, + "learning_rate": 6.930280684619094e-05, + "loss": 1.8, + "step": 12793 + }, + { + "epoch": 3.92694904849601, + "grad_norm": 0.264167845249176, + "learning_rate": 6.929822153217391e-05, + "loss": 1.7516, + "step": 12794 + }, + { + "epoch": 3.927255985267035, + "grad_norm": 0.21358054876327515, + "learning_rate": 6.929363602744054e-05, + "loss": 1.7207, + "step": 12795 + }, + { + "epoch": 3.92756292203806, + "grad_norm": 0.25632721185684204, + "learning_rate": 6.928905033203617e-05, + "loss": 1.7446, + "step": 12796 + }, + { + "epoch": 3.9278698588090855, + "grad_norm": 0.2717185318470001, + "learning_rate": 6.928446444600608e-05, + "loss": 1.8555, + "step": 12797 + }, + { + "epoch": 3.9281767955801103, + "grad_norm": 0.2871767282485962, + "learning_rate": 6.927987836939561e-05, + "loss": 1.7861, + 
"step": 12798 + }, + { + "epoch": 3.9284837323511357, + "grad_norm": 0.282507061958313, + "learning_rate": 6.927529210225009e-05, + "loss": 1.7683, + "step": 12799 + }, + { + "epoch": 3.9287906691221606, + "grad_norm": 0.24870644509792328, + "learning_rate": 6.927070564461482e-05, + "loss": 1.7355, + "step": 12800 + }, + { + "epoch": 3.929097605893186, + "grad_norm": 0.2093631625175476, + "learning_rate": 6.926611899653516e-05, + "loss": 1.7691, + "step": 12801 + }, + { + "epoch": 3.9294045426642112, + "grad_norm": 0.34258076548576355, + "learning_rate": 6.926153215805642e-05, + "loss": 1.8398, + "step": 12802 + }, + { + "epoch": 3.9297114794352366, + "grad_norm": 0.39179500937461853, + "learning_rate": 6.925694512922391e-05, + "loss": 1.8229, + "step": 12803 + }, + { + "epoch": 3.9300184162062615, + "grad_norm": 0.36814743280410767, + "learning_rate": 6.9252357910083e-05, + "loss": 1.7759, + "step": 12804 + }, + { + "epoch": 3.930325352977287, + "grad_norm": 0.2659403085708618, + "learning_rate": 6.924777050067902e-05, + "loss": 1.7553, + "step": 12805 + }, + { + "epoch": 3.9306322897483117, + "grad_norm": 0.20617491006851196, + "learning_rate": 6.924318290105724e-05, + "loss": 1.7398, + "step": 12806 + }, + { + "epoch": 3.930939226519337, + "grad_norm": 0.23730522394180298, + "learning_rate": 6.923859511126309e-05, + "loss": 1.699, + "step": 12807 + }, + { + "epoch": 3.9312461632903624, + "grad_norm": 0.24865423142910004, + "learning_rate": 6.923400713134184e-05, + "loss": 1.7801, + "step": 12808 + }, + { + "epoch": 3.9315531000613873, + "grad_norm": 0.2495356798171997, + "learning_rate": 6.92294189613389e-05, + "loss": 1.803, + "step": 12809 + }, + { + "epoch": 3.9318600368324126, + "grad_norm": 0.24223244190216064, + "learning_rate": 6.922483060129955e-05, + "loss": 1.751, + "step": 12810 + }, + { + "epoch": 3.9321669736034375, + "grad_norm": 0.2541450262069702, + "learning_rate": 6.922024205126913e-05, + "loss": 1.7721, + "step": 12811 + }, + { + "epoch": 
3.932473910374463, + "grad_norm": 0.24528831243515015, + "learning_rate": 6.921565331129304e-05, + "loss": 1.792, + "step": 12812 + }, + { + "epoch": 3.932780847145488, + "grad_norm": 0.22789500653743744, + "learning_rate": 6.921106438141659e-05, + "loss": 1.8455, + "step": 12813 + }, + { + "epoch": 3.933087783916513, + "grad_norm": 0.26267170906066895, + "learning_rate": 6.920647526168515e-05, + "loss": 1.7254, + "step": 12814 + }, + { + "epoch": 3.9333947206875384, + "grad_norm": 0.23044808208942413, + "learning_rate": 6.920188595214406e-05, + "loss": 1.7217, + "step": 12815 + }, + { + "epoch": 3.9337016574585633, + "grad_norm": 0.2304011732339859, + "learning_rate": 6.919729645283867e-05, + "loss": 1.8121, + "step": 12816 + }, + { + "epoch": 3.9340085942295886, + "grad_norm": 0.21516792476177216, + "learning_rate": 6.919270676381435e-05, + "loss": 1.7305, + "step": 12817 + }, + { + "epoch": 3.934315531000614, + "grad_norm": 0.24698840081691742, + "learning_rate": 6.918811688511646e-05, + "loss": 1.7967, + "step": 12818 + }, + { + "epoch": 3.9346224677716393, + "grad_norm": 0.23132537305355072, + "learning_rate": 6.918352681679035e-05, + "loss": 1.7439, + "step": 12819 + }, + { + "epoch": 3.934929404542664, + "grad_norm": 0.2597793936729431, + "learning_rate": 6.917893655888139e-05, + "loss": 1.7882, + "step": 12820 + }, + { + "epoch": 3.9352363413136895, + "grad_norm": 0.23946607112884521, + "learning_rate": 6.917434611143493e-05, + "loss": 1.7991, + "step": 12821 + }, + { + "epoch": 3.9355432780847144, + "grad_norm": 0.25808244943618774, + "learning_rate": 6.916975547449634e-05, + "loss": 1.845, + "step": 12822 + }, + { + "epoch": 3.9358502148557397, + "grad_norm": 0.26082557439804077, + "learning_rate": 6.9165164648111e-05, + "loss": 1.7562, + "step": 12823 + }, + { + "epoch": 3.936157151626765, + "grad_norm": 0.24810053408145905, + "learning_rate": 6.916057363232425e-05, + "loss": 1.778, + "step": 12824 + }, + { + "epoch": 3.93646408839779, + "grad_norm": 
0.24168157577514648, + "learning_rate": 6.91559824271815e-05, + "loss": 1.7628, + "step": 12825 + }, + { + "epoch": 3.9367710251688153, + "grad_norm": 0.23800434172153473, + "learning_rate": 6.91513910327281e-05, + "loss": 1.8063, + "step": 12826 + }, + { + "epoch": 3.93707796193984, + "grad_norm": 0.23055073618888855, + "learning_rate": 6.914679944900944e-05, + "loss": 1.749, + "step": 12827 + }, + { + "epoch": 3.9373848987108655, + "grad_norm": 0.22455987334251404, + "learning_rate": 6.914220767607088e-05, + "loss": 1.7471, + "step": 12828 + }, + { + "epoch": 3.937691835481891, + "grad_norm": 0.21808378398418427, + "learning_rate": 6.913761571395778e-05, + "loss": 1.7503, + "step": 12829 + }, + { + "epoch": 3.937998772252916, + "grad_norm": 0.23136213421821594, + "learning_rate": 6.913302356271556e-05, + "loss": 1.752, + "step": 12830 + }, + { + "epoch": 3.938305709023941, + "grad_norm": 0.29579970240592957, + "learning_rate": 6.912843122238959e-05, + "loss": 1.8028, + "step": 12831 + }, + { + "epoch": 3.9386126457949664, + "grad_norm": 0.28578072786331177, + "learning_rate": 6.912383869302526e-05, + "loss": 1.8183, + "step": 12832 + }, + { + "epoch": 3.9389195825659913, + "grad_norm": 0.2616737186908722, + "learning_rate": 6.911924597466793e-05, + "loss": 1.8366, + "step": 12833 + }, + { + "epoch": 3.9392265193370166, + "grad_norm": 0.29275768995285034, + "learning_rate": 6.911465306736302e-05, + "loss": 1.731, + "step": 12834 + }, + { + "epoch": 3.939533456108042, + "grad_norm": 0.3300873041152954, + "learning_rate": 6.91100599711559e-05, + "loss": 1.8713, + "step": 12835 + }, + { + "epoch": 3.939840392879067, + "grad_norm": 0.2744643986225128, + "learning_rate": 6.910546668609195e-05, + "loss": 1.8479, + "step": 12836 + }, + { + "epoch": 3.940147329650092, + "grad_norm": 0.25248417258262634, + "learning_rate": 6.91008732122166e-05, + "loss": 1.7962, + "step": 12837 + }, + { + "epoch": 3.940454266421117, + "grad_norm": 0.3068546652793884, + "learning_rate": 
6.909627954957521e-05, + "loss": 1.759, + "step": 12838 + }, + { + "epoch": 3.9407612031921424, + "grad_norm": 0.3273559808731079, + "learning_rate": 6.909168569821321e-05, + "loss": 1.814, + "step": 12839 + }, + { + "epoch": 3.9410681399631677, + "grad_norm": 0.31192758679389954, + "learning_rate": 6.908709165817597e-05, + "loss": 1.7906, + "step": 12840 + }, + { + "epoch": 3.9413750767341926, + "grad_norm": 0.24487090110778809, + "learning_rate": 6.90824974295089e-05, + "loss": 1.8238, + "step": 12841 + }, + { + "epoch": 3.941682013505218, + "grad_norm": 0.24863721430301666, + "learning_rate": 6.907790301225743e-05, + "loss": 1.7651, + "step": 12842 + }, + { + "epoch": 3.941988950276243, + "grad_norm": 0.26555630564689636, + "learning_rate": 6.907330840646693e-05, + "loss": 1.8268, + "step": 12843 + }, + { + "epoch": 3.942295887047268, + "grad_norm": 0.2439817190170288, + "learning_rate": 6.906871361218281e-05, + "loss": 1.7291, + "step": 12844 + }, + { + "epoch": 3.9426028238182935, + "grad_norm": 0.2410304993391037, + "learning_rate": 6.906411862945048e-05, + "loss": 1.712, + "step": 12845 + }, + { + "epoch": 3.942909760589319, + "grad_norm": 0.28575149178504944, + "learning_rate": 6.905952345831537e-05, + "loss": 1.7269, + "step": 12846 + }, + { + "epoch": 3.9432166973603437, + "grad_norm": 0.3055815100669861, + "learning_rate": 6.905492809882286e-05, + "loss": 1.7234, + "step": 12847 + }, + { + "epoch": 3.943523634131369, + "grad_norm": 0.2762533724308014, + "learning_rate": 6.905033255101839e-05, + "loss": 1.7768, + "step": 12848 + }, + { + "epoch": 3.943830570902394, + "grad_norm": 0.22819125652313232, + "learning_rate": 6.904573681494738e-05, + "loss": 1.7416, + "step": 12849 + }, + { + "epoch": 3.9441375076734193, + "grad_norm": 0.21664194762706757, + "learning_rate": 6.904114089065523e-05, + "loss": 1.7506, + "step": 12850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.21935151517391205, + "learning_rate": 6.903654477818735e-05, + "loss": 
1.7522, + "step": 12851 + }, + { + "epoch": 3.9447513812154695, + "grad_norm": 0.2204175442457199, + "learning_rate": 6.903194847758918e-05, + "loss": 1.7753, + "step": 12852 + }, + { + "epoch": 3.945058317986495, + "grad_norm": 0.23130151629447937, + "learning_rate": 6.902735198890615e-05, + "loss": 1.7743, + "step": 12853 + }, + { + "epoch": 3.9453652547575198, + "grad_norm": 0.2548399567604065, + "learning_rate": 6.902275531218368e-05, + "loss": 1.8373, + "step": 12854 + }, + { + "epoch": 3.945672191528545, + "grad_norm": 0.2905479371547699, + "learning_rate": 6.901815844746718e-05, + "loss": 1.8336, + "step": 12855 + }, + { + "epoch": 3.9459791282995704, + "grad_norm": 0.2698945105075836, + "learning_rate": 6.90135613948021e-05, + "loss": 1.7498, + "step": 12856 + }, + { + "epoch": 3.9462860650705953, + "grad_norm": 0.24966828525066376, + "learning_rate": 6.900896415423387e-05, + "loss": 1.7664, + "step": 12857 + }, + { + "epoch": 3.9465930018416207, + "grad_norm": 0.23272784054279327, + "learning_rate": 6.90043667258079e-05, + "loss": 1.7742, + "step": 12858 + }, + { + "epoch": 3.9468999386126455, + "grad_norm": 0.2277698516845703, + "learning_rate": 6.899976910956965e-05, + "loss": 1.7465, + "step": 12859 + }, + { + "epoch": 3.947206875383671, + "grad_norm": 0.2376442402601242, + "learning_rate": 6.899517130556454e-05, + "loss": 1.7995, + "step": 12860 + }, + { + "epoch": 3.947513812154696, + "grad_norm": 0.25591593980789185, + "learning_rate": 6.899057331383802e-05, + "loss": 1.8017, + "step": 12861 + }, + { + "epoch": 3.9478207489257215, + "grad_norm": 0.2715262472629547, + "learning_rate": 6.898597513443551e-05, + "loss": 1.7967, + "step": 12862 + }, + { + "epoch": 3.9481276856967464, + "grad_norm": 0.20916256308555603, + "learning_rate": 6.898137676740246e-05, + "loss": 1.7711, + "step": 12863 + }, + { + "epoch": 3.9484346224677718, + "grad_norm": 0.2570229768753052, + "learning_rate": 6.897677821278435e-05, + "loss": 1.833, + "step": 12864 + }, + { + 
"epoch": 3.9487415592387967, + "grad_norm": 0.26343438029289246, + "learning_rate": 6.897217947062657e-05, + "loss": 1.7625, + "step": 12865 + }, + { + "epoch": 3.949048496009822, + "grad_norm": 0.23407024145126343, + "learning_rate": 6.896758054097459e-05, + "loss": 1.7211, + "step": 12866 + }, + { + "epoch": 3.9493554327808473, + "grad_norm": 0.2554715573787689, + "learning_rate": 6.896298142387387e-05, + "loss": 1.8548, + "step": 12867 + }, + { + "epoch": 3.949662369551872, + "grad_norm": 0.24143370985984802, + "learning_rate": 6.895838211936986e-05, + "loss": 1.7635, + "step": 12868 + }, + { + "epoch": 3.9499693063228976, + "grad_norm": 0.24634715914726257, + "learning_rate": 6.8953782627508e-05, + "loss": 1.8012, + "step": 12869 + }, + { + "epoch": 3.9502762430939224, + "grad_norm": 0.22740426659584045, + "learning_rate": 6.894918294833375e-05, + "loss": 1.7294, + "step": 12870 + }, + { + "epoch": 3.950583179864948, + "grad_norm": 0.2651631832122803, + "learning_rate": 6.894458308189257e-05, + "loss": 1.8289, + "step": 12871 + }, + { + "epoch": 3.950890116635973, + "grad_norm": 0.28693267703056335, + "learning_rate": 6.893998302822991e-05, + "loss": 1.8462, + "step": 12872 + }, + { + "epoch": 3.951197053406998, + "grad_norm": 0.26584213972091675, + "learning_rate": 6.893538278739125e-05, + "loss": 1.7621, + "step": 12873 + }, + { + "epoch": 3.9515039901780233, + "grad_norm": 0.29970669746398926, + "learning_rate": 6.893078235942203e-05, + "loss": 1.7659, + "step": 12874 + }, + { + "epoch": 3.9518109269490482, + "grad_norm": 0.2271152138710022, + "learning_rate": 6.892618174436771e-05, + "loss": 1.7151, + "step": 12875 + }, + { + "epoch": 3.9521178637200736, + "grad_norm": 0.24783682823181152, + "learning_rate": 6.892158094227379e-05, + "loss": 1.761, + "step": 12876 + }, + { + "epoch": 3.952424800491099, + "grad_norm": 0.2371140718460083, + "learning_rate": 6.891697995318573e-05, + "loss": 1.7557, + "step": 12877 + }, + { + "epoch": 3.9527317372621242, + 
"grad_norm": 0.29708394408226013, + "learning_rate": 6.891237877714896e-05, + "loss": 1.8629, + "step": 12878 + }, + { + "epoch": 3.953038674033149, + "grad_norm": 0.2724219262599945, + "learning_rate": 6.890777741420899e-05, + "loss": 1.7378, + "step": 12879 + }, + { + "epoch": 3.9533456108041745, + "grad_norm": 0.2227276861667633, + "learning_rate": 6.890317586441126e-05, + "loss": 1.6989, + "step": 12880 + }, + { + "epoch": 3.9536525475751993, + "grad_norm": 0.2546161413192749, + "learning_rate": 6.889857412780128e-05, + "loss": 1.8688, + "step": 12881 + }, + { + "epoch": 3.9539594843462247, + "grad_norm": 0.24882884323596954, + "learning_rate": 6.889397220442452e-05, + "loss": 1.8137, + "step": 12882 + }, + { + "epoch": 3.95426642111725, + "grad_norm": 0.2549113929271698, + "learning_rate": 6.888937009432644e-05, + "loss": 1.8366, + "step": 12883 + }, + { + "epoch": 3.954573357888275, + "grad_norm": 0.30032673478126526, + "learning_rate": 6.888476779755255e-05, + "loss": 1.8267, + "step": 12884 + }, + { + "epoch": 3.9548802946593002, + "grad_norm": 0.2887294292449951, + "learning_rate": 6.888016531414832e-05, + "loss": 1.8295, + "step": 12885 + }, + { + "epoch": 3.955187231430325, + "grad_norm": 0.2947406470775604, + "learning_rate": 6.88755626441592e-05, + "loss": 1.7713, + "step": 12886 + }, + { + "epoch": 3.9554941682013505, + "grad_norm": 0.2967108190059662, + "learning_rate": 6.887095978763072e-05, + "loss": 1.7636, + "step": 12887 + }, + { + "epoch": 3.955801104972376, + "grad_norm": 0.2495311200618744, + "learning_rate": 6.886635674460836e-05, + "loss": 1.8148, + "step": 12888 + }, + { + "epoch": 3.9561080417434007, + "grad_norm": 0.23367099463939667, + "learning_rate": 6.88617535151376e-05, + "loss": 1.7353, + "step": 12889 + }, + { + "epoch": 3.956414978514426, + "grad_norm": 0.36790570616722107, + "learning_rate": 6.885715009926395e-05, + "loss": 1.7853, + "step": 12890 + }, + { + "epoch": 3.9567219152854514, + "grad_norm": 0.5013020038604736, + 
"learning_rate": 6.885254649703287e-05, + "loss": 1.7923, + "step": 12891 + }, + { + "epoch": 3.9570288520564763, + "grad_norm": 0.4446276128292084, + "learning_rate": 6.884794270848988e-05, + "loss": 1.7504, + "step": 12892 + }, + { + "epoch": 3.9573357888275016, + "grad_norm": 0.2478526383638382, + "learning_rate": 6.88433387336805e-05, + "loss": 1.7629, + "step": 12893 + }, + { + "epoch": 3.957642725598527, + "grad_norm": 0.30111798644065857, + "learning_rate": 6.883873457265019e-05, + "loss": 1.8291, + "step": 12894 + }, + { + "epoch": 3.957949662369552, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.883413022544445e-05, + "loss": 1.7919, + "step": 12895 + }, + { + "epoch": 3.958256599140577, + "grad_norm": 0.2895318269729614, + "learning_rate": 6.882952569210881e-05, + "loss": 1.7467, + "step": 12896 + }, + { + "epoch": 3.958563535911602, + "grad_norm": 0.30391454696655273, + "learning_rate": 6.882492097268873e-05, + "loss": 1.8145, + "step": 12897 + }, + { + "epoch": 3.9588704726826274, + "grad_norm": 0.5033623576164246, + "learning_rate": 6.882031606722977e-05, + "loss": 1.8231, + "step": 12898 + }, + { + "epoch": 3.9591774094536527, + "grad_norm": 0.5351777672767639, + "learning_rate": 6.881571097577742e-05, + "loss": 1.807, + "step": 12899 + }, + { + "epoch": 3.9594843462246776, + "grad_norm": 0.35540491342544556, + "learning_rate": 6.881110569837719e-05, + "loss": 1.7626, + "step": 12900 + }, + { + "epoch": 3.959791282995703, + "grad_norm": 0.22447600960731506, + "learning_rate": 6.880650023507457e-05, + "loss": 1.7392, + "step": 12901 + }, + { + "epoch": 3.960098219766728, + "grad_norm": 0.44619202613830566, + "learning_rate": 6.88018945859151e-05, + "loss": 1.8138, + "step": 12902 + }, + { + "epoch": 3.960405156537753, + "grad_norm": 0.41381633281707764, + "learning_rate": 6.879728875094428e-05, + "loss": 1.7676, + "step": 12903 + }, + { + "epoch": 3.9607120933087785, + "grad_norm": 0.2601528465747833, + "learning_rate": 6.879268273020764e-05, 
+ "loss": 1.8406, + "step": 12904 + }, + { + "epoch": 3.961019030079804, + "grad_norm": 0.3309035003185272, + "learning_rate": 6.878807652375071e-05, + "loss": 1.7673, + "step": 12905 + }, + { + "epoch": 3.9613259668508287, + "grad_norm": 0.5281669497489929, + "learning_rate": 6.878347013161899e-05, + "loss": 1.7686, + "step": 12906 + }, + { + "epoch": 3.961632903621854, + "grad_norm": 0.5397645831108093, + "learning_rate": 6.8778863553858e-05, + "loss": 1.8575, + "step": 12907 + }, + { + "epoch": 3.961939840392879, + "grad_norm": 0.329485684633255, + "learning_rate": 6.877425679051327e-05, + "loss": 1.8185, + "step": 12908 + }, + { + "epoch": 3.9622467771639043, + "grad_norm": 0.3012789487838745, + "learning_rate": 6.876964984163034e-05, + "loss": 1.7962, + "step": 12909 + }, + { + "epoch": 3.9625537139349296, + "grad_norm": 0.5596817135810852, + "learning_rate": 6.876504270725472e-05, + "loss": 1.7972, + "step": 12910 + }, + { + "epoch": 3.9628606507059545, + "grad_norm": 0.5374729633331299, + "learning_rate": 6.876043538743197e-05, + "loss": 1.7863, + "step": 12911 + }, + { + "epoch": 3.96316758747698, + "grad_norm": 0.24617290496826172, + "learning_rate": 6.875582788220757e-05, + "loss": 1.7555, + "step": 12912 + }, + { + "epoch": 3.9634745242480047, + "grad_norm": 0.3493972420692444, + "learning_rate": 6.875122019162712e-05, + "loss": 1.8595, + "step": 12913 + }, + { + "epoch": 3.96378146101903, + "grad_norm": 0.4293089807033539, + "learning_rate": 6.874661231573609e-05, + "loss": 1.7647, + "step": 12914 + }, + { + "epoch": 3.9640883977900554, + "grad_norm": 0.30602574348449707, + "learning_rate": 6.874200425458006e-05, + "loss": 1.7122, + "step": 12915 + }, + { + "epoch": 3.9643953345610803, + "grad_norm": 0.22776013612747192, + "learning_rate": 6.873739600820457e-05, + "loss": 1.7136, + "step": 12916 + }, + { + "epoch": 3.9647022713321056, + "grad_norm": 0.3727327585220337, + "learning_rate": 6.873278757665513e-05, + "loss": 1.8314, + "step": 12917 + }, + { 
+ "epoch": 3.9650092081031305, + "grad_norm": 0.35110536217689514, + "learning_rate": 6.872817895997733e-05, + "loss": 1.7506, + "step": 12918 + }, + { + "epoch": 3.965316144874156, + "grad_norm": 0.275560587644577, + "learning_rate": 6.872357015821666e-05, + "loss": 1.7865, + "step": 12919 + }, + { + "epoch": 3.965623081645181, + "grad_norm": 0.2686980366706848, + "learning_rate": 6.871896117141873e-05, + "loss": 1.8431, + "step": 12920 + }, + { + "epoch": 3.9659300184162065, + "grad_norm": 0.3299664556980133, + "learning_rate": 6.871435199962901e-05, + "loss": 1.7988, + "step": 12921 + }, + { + "epoch": 3.9662369551872314, + "grad_norm": 0.2833637297153473, + "learning_rate": 6.870974264289313e-05, + "loss": 1.6993, + "step": 12922 + }, + { + "epoch": 3.9665438919582567, + "grad_norm": 0.25062620639801025, + "learning_rate": 6.870513310125659e-05, + "loss": 1.7814, + "step": 12923 + }, + { + "epoch": 3.9668508287292816, + "grad_norm": 0.26609909534454346, + "learning_rate": 6.870052337476498e-05, + "loss": 1.7871, + "step": 12924 + }, + { + "epoch": 3.967157765500307, + "grad_norm": 0.22760890424251556, + "learning_rate": 6.869591346346382e-05, + "loss": 1.7941, + "step": 12925 + }, + { + "epoch": 3.9674647022713323, + "grad_norm": 0.2845582067966461, + "learning_rate": 6.869130336739869e-05, + "loss": 1.8215, + "step": 12926 + }, + { + "epoch": 3.967771639042357, + "grad_norm": 0.254948228597641, + "learning_rate": 6.868669308661514e-05, + "loss": 1.7515, + "step": 12927 + }, + { + "epoch": 3.9680785758133825, + "grad_norm": 0.2372167855501175, + "learning_rate": 6.868208262115875e-05, + "loss": 1.7524, + "step": 12928 + }, + { + "epoch": 3.9683855125844074, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.867747197107506e-05, + "loss": 1.8139, + "step": 12929 + }, + { + "epoch": 3.9686924493554327, + "grad_norm": 0.2617839276790619, + "learning_rate": 6.867286113640965e-05, + "loss": 1.7388, + "step": 12930 + }, + { + "epoch": 3.968999386126458, + 
"grad_norm": 0.22749558091163635, + "learning_rate": 6.866825011720807e-05, + "loss": 1.7421, + "step": 12931 + }, + { + "epoch": 3.969306322897483, + "grad_norm": 0.27737462520599365, + "learning_rate": 6.86636389135159e-05, + "loss": 1.7977, + "step": 12932 + }, + { + "epoch": 3.9696132596685083, + "grad_norm": 0.3331063985824585, + "learning_rate": 6.865902752537871e-05, + "loss": 1.7925, + "step": 12933 + }, + { + "epoch": 3.969920196439533, + "grad_norm": 0.24229519069194794, + "learning_rate": 6.86544159528421e-05, + "loss": 1.7782, + "step": 12934 + }, + { + "epoch": 3.9702271332105585, + "grad_norm": 0.29494860768318176, + "learning_rate": 6.86498041959516e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 3.970534069981584, + "grad_norm": 0.26064008474349976, + "learning_rate": 6.86451922547528e-05, + "loss": 1.7161, + "step": 12936 + }, + { + "epoch": 3.970841006752609, + "grad_norm": 0.2656785547733307, + "learning_rate": 6.864058012929129e-05, + "loss": 1.8154, + "step": 12937 + }, + { + "epoch": 3.971147943523634, + "grad_norm": 0.21170997619628906, + "learning_rate": 6.863596781961263e-05, + "loss": 1.7614, + "step": 12938 + }, + { + "epoch": 3.9714548802946594, + "grad_norm": 0.21709072589874268, + "learning_rate": 6.863135532576241e-05, + "loss": 1.7896, + "step": 12939 + }, + { + "epoch": 3.9717618170656843, + "grad_norm": 0.2361367791891098, + "learning_rate": 6.862674264778623e-05, + "loss": 1.7775, + "step": 12940 + }, + { + "epoch": 3.9720687538367097, + "grad_norm": 0.22042550146579742, + "learning_rate": 6.862212978572967e-05, + "loss": 1.7781, + "step": 12941 + }, + { + "epoch": 3.972375690607735, + "grad_norm": 0.2535422146320343, + "learning_rate": 6.86175167396383e-05, + "loss": 1.7665, + "step": 12942 + }, + { + "epoch": 3.97268262737876, + "grad_norm": 0.23741906881332397, + "learning_rate": 6.861290350955771e-05, + "loss": 1.7829, + "step": 12943 + }, + { + "epoch": 3.972989564149785, + "grad_norm": 0.23789910972118378, + 
"learning_rate": 6.860829009553351e-05, + "loss": 1.7745, + "step": 12944 + }, + { + "epoch": 3.97329650092081, + "grad_norm": 0.26867765188217163, + "learning_rate": 6.860367649761127e-05, + "loss": 1.7239, + "step": 12945 + }, + { + "epoch": 3.9736034376918354, + "grad_norm": 0.3211663067340851, + "learning_rate": 6.85990627158366e-05, + "loss": 1.7976, + "step": 12946 + }, + { + "epoch": 3.9739103744628608, + "grad_norm": 0.26177310943603516, + "learning_rate": 6.85944487502551e-05, + "loss": 1.7446, + "step": 12947 + }, + { + "epoch": 3.9742173112338857, + "grad_norm": 0.23622745275497437, + "learning_rate": 6.858983460091234e-05, + "loss": 1.7824, + "step": 12948 + }, + { + "epoch": 3.974524248004911, + "grad_norm": 0.24372988939285278, + "learning_rate": 6.858522026785395e-05, + "loss": 1.8014, + "step": 12949 + }, + { + "epoch": 3.974831184775936, + "grad_norm": 0.2566998600959778, + "learning_rate": 6.85806057511255e-05, + "loss": 1.742, + "step": 12950 + }, + { + "epoch": 3.9751381215469612, + "grad_norm": 0.24418365955352783, + "learning_rate": 6.857599105077264e-05, + "loss": 1.7331, + "step": 12951 + }, + { + "epoch": 3.9754450583179866, + "grad_norm": 0.2260327935218811, + "learning_rate": 6.857137616684094e-05, + "loss": 1.7173, + "step": 12952 + }, + { + "epoch": 3.975751995089012, + "grad_norm": 0.277044415473938, + "learning_rate": 6.856676109937602e-05, + "loss": 1.7255, + "step": 12953 + }, + { + "epoch": 3.976058931860037, + "grad_norm": 0.228300079703331, + "learning_rate": 6.856214584842348e-05, + "loss": 1.7796, + "step": 12954 + }, + { + "epoch": 3.976365868631062, + "grad_norm": 0.2246638983488083, + "learning_rate": 6.855753041402893e-05, + "loss": 1.7458, + "step": 12955 + }, + { + "epoch": 3.976672805402087, + "grad_norm": 0.22235621511936188, + "learning_rate": 6.855291479623799e-05, + "loss": 1.7585, + "step": 12956 + }, + { + "epoch": 3.9769797421731123, + "grad_norm": 0.23710694909095764, + "learning_rate": 6.854829899509627e-05, + 
"loss": 1.767, + "step": 12957 + }, + { + "epoch": 3.9772866789441377, + "grad_norm": 0.2527346611022949, + "learning_rate": 6.854368301064939e-05, + "loss": 1.828, + "step": 12958 + }, + { + "epoch": 3.9775936157151626, + "grad_norm": 0.25032514333724976, + "learning_rate": 6.853906684294298e-05, + "loss": 1.8533, + "step": 12959 + }, + { + "epoch": 3.977900552486188, + "grad_norm": 0.2346320003271103, + "learning_rate": 6.853445049202262e-05, + "loss": 1.8046, + "step": 12960 + }, + { + "epoch": 3.978207489257213, + "grad_norm": 0.22576460242271423, + "learning_rate": 6.852983395793398e-05, + "loss": 1.7502, + "step": 12961 + }, + { + "epoch": 3.978514426028238, + "grad_norm": 0.2230147123336792, + "learning_rate": 6.852521724072266e-05, + "loss": 1.7362, + "step": 12962 + }, + { + "epoch": 3.9788213627992635, + "grad_norm": 0.2339705526828766, + "learning_rate": 6.852060034043425e-05, + "loss": 1.763, + "step": 12963 + }, + { + "epoch": 3.979128299570289, + "grad_norm": 0.24511271715164185, + "learning_rate": 6.851598325711446e-05, + "loss": 1.7988, + "step": 12964 + }, + { + "epoch": 3.9794352363413137, + "grad_norm": 0.2927285134792328, + "learning_rate": 6.851136599080885e-05, + "loss": 1.8346, + "step": 12965 + }, + { + "epoch": 3.979742173112339, + "grad_norm": 0.2593212425708771, + "learning_rate": 6.850674854156305e-05, + "loss": 1.7368, + "step": 12966 + }, + { + "epoch": 3.980049109883364, + "grad_norm": 0.3013291656970978, + "learning_rate": 6.850213090942275e-05, + "loss": 1.7911, + "step": 12967 + }, + { + "epoch": 3.9803560466543892, + "grad_norm": 0.3420047163963318, + "learning_rate": 6.849751309443352e-05, + "loss": 1.7899, + "step": 12968 + }, + { + "epoch": 3.9806629834254146, + "grad_norm": 0.2901746928691864, + "learning_rate": 6.849289509664105e-05, + "loss": 1.8244, + "step": 12969 + }, + { + "epoch": 3.9809699201964395, + "grad_norm": 0.2389298677444458, + "learning_rate": 6.848827691609093e-05, + "loss": 1.7116, + "step": 12970 + }, + { + 
"epoch": 3.981276856967465, + "grad_norm": 0.3153960704803467, + "learning_rate": 6.848365855282882e-05, + "loss": 1.7665, + "step": 12971 + }, + { + "epoch": 3.9815837937384897, + "grad_norm": 0.3162175118923187, + "learning_rate": 6.847904000690036e-05, + "loss": 1.7722, + "step": 12972 + }, + { + "epoch": 3.981890730509515, + "grad_norm": 0.27458643913269043, + "learning_rate": 6.847442127835122e-05, + "loss": 1.8095, + "step": 12973 + }, + { + "epoch": 3.9821976672805404, + "grad_norm": 0.22330710291862488, + "learning_rate": 6.846980236722699e-05, + "loss": 1.7179, + "step": 12974 + }, + { + "epoch": 3.9825046040515653, + "grad_norm": 0.2940923869609833, + "learning_rate": 6.846518327357339e-05, + "loss": 1.7363, + "step": 12975 + }, + { + "epoch": 3.9828115408225906, + "grad_norm": 0.26479849219322205, + "learning_rate": 6.846056399743599e-05, + "loss": 1.7788, + "step": 12976 + }, + { + "epoch": 3.9831184775936155, + "grad_norm": 0.24145057797431946, + "learning_rate": 6.845594453886048e-05, + "loss": 1.7825, + "step": 12977 + }, + { + "epoch": 3.983425414364641, + "grad_norm": 0.2795869708061218, + "learning_rate": 6.845132489789252e-05, + "loss": 1.7705, + "step": 12978 + }, + { + "epoch": 3.983732351135666, + "grad_norm": 0.3117202818393707, + "learning_rate": 6.844670507457776e-05, + "loss": 1.8183, + "step": 12979 + }, + { + "epoch": 3.9840392879066915, + "grad_norm": 0.2666899263858795, + "learning_rate": 6.844208506896184e-05, + "loss": 1.7434, + "step": 12980 + }, + { + "epoch": 3.9843462246777164, + "grad_norm": 0.24682332575321198, + "learning_rate": 6.843746488109042e-05, + "loss": 1.751, + "step": 12981 + }, + { + "epoch": 3.9846531614487417, + "grad_norm": 0.2558208703994751, + "learning_rate": 6.843284451100916e-05, + "loss": 1.7983, + "step": 12982 + }, + { + "epoch": 3.9849600982197666, + "grad_norm": 0.4236481189727783, + "learning_rate": 6.842822395876374e-05, + "loss": 1.8584, + "step": 12983 + }, + { + "epoch": 3.985267034990792, + 
"grad_norm": 0.4931485950946808, + "learning_rate": 6.84236032243998e-05, + "loss": 1.7617, + "step": 12984 + }, + { + "epoch": 3.9855739717618173, + "grad_norm": 0.37793654203414917, + "learning_rate": 6.841898230796302e-05, + "loss": 1.7411, + "step": 12985 + }, + { + "epoch": 3.985880908532842, + "grad_norm": 0.2093842774629593, + "learning_rate": 6.841436120949906e-05, + "loss": 1.772, + "step": 12986 + }, + { + "epoch": 3.9861878453038675, + "grad_norm": 0.4065552055835724, + "learning_rate": 6.840973992905359e-05, + "loss": 1.7675, + "step": 12987 + }, + { + "epoch": 3.9864947820748924, + "grad_norm": 0.5334183573722839, + "learning_rate": 6.840511846667228e-05, + "loss": 1.7872, + "step": 12988 + }, + { + "epoch": 3.9868017188459177, + "grad_norm": 0.378974974155426, + "learning_rate": 6.84004968224008e-05, + "loss": 1.8288, + "step": 12989 + }, + { + "epoch": 3.987108655616943, + "grad_norm": 0.22518309950828552, + "learning_rate": 6.839587499628483e-05, + "loss": 1.7715, + "step": 12990 + }, + { + "epoch": 3.987415592387968, + "grad_norm": 0.4270850718021393, + "learning_rate": 6.839125298837003e-05, + "loss": 1.7797, + "step": 12991 + }, + { + "epoch": 3.9877225291589933, + "grad_norm": 0.4629896879196167, + "learning_rate": 6.838663079870211e-05, + "loss": 1.7936, + "step": 12992 + }, + { + "epoch": 3.988029465930018, + "grad_norm": 0.29273948073387146, + "learning_rate": 6.838200842732672e-05, + "loss": 1.8264, + "step": 12993 + }, + { + "epoch": 3.9883364027010435, + "grad_norm": 0.31575852632522583, + "learning_rate": 6.837738587428954e-05, + "loss": 1.8043, + "step": 12994 + }, + { + "epoch": 3.988643339472069, + "grad_norm": 0.40602433681488037, + "learning_rate": 6.837276313963627e-05, + "loss": 1.7409, + "step": 12995 + }, + { + "epoch": 3.988950276243094, + "grad_norm": 0.23413142561912537, + "learning_rate": 6.836814022341259e-05, + "loss": 1.8585, + "step": 12996 + }, + { + "epoch": 3.989257213014119, + "grad_norm": 0.3518814444541931, + 
"learning_rate": 6.836351712566416e-05, + "loss": 1.7768, + "step": 12997 + }, + { + "epoch": 3.9895641497851444, + "grad_norm": 0.3811505436897278, + "learning_rate": 6.83588938464367e-05, + "loss": 1.7738, + "step": 12998 + }, + { + "epoch": 3.9898710865561693, + "grad_norm": 0.2516780197620392, + "learning_rate": 6.835427038577589e-05, + "loss": 1.7351, + "step": 12999 + }, + { + "epoch": 3.9901780233271946, + "grad_norm": 0.23704510927200317, + "learning_rate": 6.834964674372744e-05, + "loss": 1.7907, + "step": 13000 + }, + { + "epoch": 3.99048496009822, + "grad_norm": 0.2890201807022095, + "learning_rate": 6.8345022920337e-05, + "loss": 1.9546, + "step": 13001 + }, + { + "epoch": 3.990791896869245, + "grad_norm": 0.2678101360797882, + "learning_rate": 6.834039891565031e-05, + "loss": 1.7338, + "step": 13002 + }, + { + "epoch": 3.99109883364027, + "grad_norm": 0.31726256012916565, + "learning_rate": 6.833577472971304e-05, + "loss": 1.8464, + "step": 13003 + }, + { + "epoch": 3.991405770411295, + "grad_norm": 0.28112682700157166, + "learning_rate": 6.83311503625709e-05, + "loss": 1.7427, + "step": 13004 + }, + { + "epoch": 3.9917127071823204, + "grad_norm": 0.2651563584804535, + "learning_rate": 6.832652581426958e-05, + "loss": 1.8117, + "step": 13005 + }, + { + "epoch": 3.9920196439533457, + "grad_norm": 0.3095388114452362, + "learning_rate": 6.83219010848548e-05, + "loss": 1.8286, + "step": 13006 + }, + { + "epoch": 3.9923265807243706, + "grad_norm": 0.24704942107200623, + "learning_rate": 6.831727617437225e-05, + "loss": 1.77, + "step": 13007 + }, + { + "epoch": 3.992633517495396, + "grad_norm": 0.24868519604206085, + "learning_rate": 6.831265108286764e-05, + "loss": 1.8129, + "step": 13008 + }, + { + "epoch": 3.992940454266421, + "grad_norm": 0.26511049270629883, + "learning_rate": 6.830802581038669e-05, + "loss": 1.7539, + "step": 13009 + }, + { + "epoch": 3.993247391037446, + "grad_norm": 0.2823421061038971, + "learning_rate": 6.830340035697508e-05, + 
"loss": 1.8068, + "step": 13010 + }, + { + "epoch": 3.9935543278084715, + "grad_norm": 0.28526121377944946, + "learning_rate": 6.829877472267856e-05, + "loss": 1.764, + "step": 13011 + }, + { + "epoch": 3.993861264579497, + "grad_norm": 0.2576456069946289, + "learning_rate": 6.829414890754281e-05, + "loss": 1.728, + "step": 13012 + }, + { + "epoch": 3.9941682013505218, + "grad_norm": 0.27154842019081116, + "learning_rate": 6.828952291161356e-05, + "loss": 1.797, + "step": 13013 + }, + { + "epoch": 3.994475138121547, + "grad_norm": 0.3129710555076599, + "learning_rate": 6.828489673493652e-05, + "loss": 1.769, + "step": 13014 + }, + { + "epoch": 3.994782074892572, + "grad_norm": 0.40118902921676636, + "learning_rate": 6.828027037755742e-05, + "loss": 1.8029, + "step": 13015 + }, + { + "epoch": 3.9950890116635973, + "grad_norm": 0.33228442072868347, + "learning_rate": 6.827564383952197e-05, + "loss": 1.7295, + "step": 13016 + }, + { + "epoch": 3.9953959484346226, + "grad_norm": 0.218771830201149, + "learning_rate": 6.827101712087591e-05, + "loss": 1.7693, + "step": 13017 + }, + { + "epoch": 3.9957028852056475, + "grad_norm": 0.31354373693466187, + "learning_rate": 6.826639022166492e-05, + "loss": 1.743, + "step": 13018 + }, + { + "epoch": 3.996009821976673, + "grad_norm": 0.3584701418876648, + "learning_rate": 6.826176314193478e-05, + "loss": 1.7597, + "step": 13019 + }, + { + "epoch": 3.9963167587476978, + "grad_norm": 0.2692064344882965, + "learning_rate": 6.82571358817312e-05, + "loss": 1.7871, + "step": 13020 + }, + { + "epoch": 3.996623695518723, + "grad_norm": 0.3064020276069641, + "learning_rate": 6.825250844109987e-05, + "loss": 1.7858, + "step": 13021 + }, + { + "epoch": 3.9969306322897484, + "grad_norm": 0.29913413524627686, + "learning_rate": 6.824788082008657e-05, + "loss": 1.7773, + "step": 13022 + }, + { + "epoch": 3.9972375690607733, + "grad_norm": 0.2682165801525116, + "learning_rate": 6.824325301873703e-05, + "loss": 1.8321, + "step": 13023 + }, + { + 
"epoch": 3.9975445058317987, + "grad_norm": 0.3274376690387726, + "learning_rate": 6.823862503709694e-05, + "loss": 1.8514, + "step": 13024 + }, + { + "epoch": 3.9978514426028235, + "grad_norm": 0.29828041791915894, + "learning_rate": 6.823399687521211e-05, + "loss": 1.7923, + "step": 13025 + }, + { + "epoch": 3.998158379373849, + "grad_norm": 0.22339288890361786, + "learning_rate": 6.82293685331282e-05, + "loss": 1.756, + "step": 13026 + }, + { + "epoch": 3.998465316144874, + "grad_norm": 0.2254658192396164, + "learning_rate": 6.8224740010891e-05, + "loss": 1.7392, + "step": 13027 + }, + { + "epoch": 3.9987722529158995, + "grad_norm": 0.24932752549648285, + "learning_rate": 6.822011130854624e-05, + "loss": 1.7538, + "step": 13028 + }, + { + "epoch": 3.9990791896869244, + "grad_norm": 0.21429690718650818, + "learning_rate": 6.821548242613966e-05, + "loss": 1.7746, + "step": 13029 + }, + { + "epoch": 3.9993861264579498, + "grad_norm": 0.25503116846084595, + "learning_rate": 6.8210853363717e-05, + "loss": 1.814, + "step": 13030 + }, + { + "epoch": 3.9996930632289747, + "grad_norm": 0.23168155550956726, + "learning_rate": 6.820622412132402e-05, + "loss": 1.769, + "step": 13031 + }, + { + "epoch": 4.0, + "grad_norm": 0.2252223789691925, + "learning_rate": 6.820159469900645e-05, + "loss": 1.7782, + "step": 13032 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.853013139326176e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-13032/training_args.bin b/checkpoint-13032/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db7ad91da5423a229826113feb3e9db3ef40c31 
--- /dev/null +++ b/checkpoint-13032/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682b697e933b6e2693e5f9af9a0654effab1ca392c8500bf8af0eb089116a263 +size 7288 diff --git a/checkpoint-13032/zero_to_fp32.py b/checkpoint-13032/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-13032/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-16290/config.json b/checkpoint-16290/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-16290/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-16290/generation_config.json b/checkpoint-16290/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-16290/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-16290/latest b/checkpoint-16290/latest new file mode 100644 index 0000000000000000000000000000000000000000..9d0bd5d9e8d51ed417f7b07f50ba26286ba77659 --- /dev/null +++ b/checkpoint-16290/latest @@ -0,0 +1 @@ +global_step16290 \ No newline at end of file diff --git a/checkpoint-16290/model-00001-of-00003.safetensors b/checkpoint-16290/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f958ce0c31d37cd5be599e95e3943e969e0257af --- /dev/null +++ b/checkpoint-16290/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807823f070a089d40144822144c7561df2cf32d773c60b32992f5babd0f48a94 +size 4955415870 diff --git a/checkpoint-16290/model-00002-of-00003.safetensors b/checkpoint-16290/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-16290/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d 
+size 4971563008 diff --git a/checkpoint-16290/model-00003-of-00003.safetensors b/checkpoint-16290/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b52114259a4ca663ee3d75610c3dd5b312d3d7f0 --- /dev/null +++ b/checkpoint-16290/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfdc9b8dbb7e7d221b992590a6e4d5d0b6f01ceb1bd62592d31f5ba8ece34b68 +size 4180840856 diff --git a/checkpoint-16290/model.safetensors.index.json b/checkpoint-16290/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-16290/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-16290/rng_state_0.pth b/checkpoint-16290/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7f7af0016bda8b5a8711e43ee01a323120cbca5b --- /dev/null +++ b/checkpoint-16290/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d78fa6ab55cf3d6cda1f1df45f8e11f39d2b6010c70220e4f99396dd2e1817bf +size 15984 diff --git a/checkpoint-16290/rng_state_1.pth b/checkpoint-16290/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1eb9fc5ad4c195e367ed052c2cf1f084c71f5770 --- /dev/null +++ b/checkpoint-16290/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4526806a5346cb4bf0b553ab31fcd559bf61a126a14eb1d7ace2e46946de898a +size 15984 diff --git a/checkpoint-16290/rng_state_10.pth b/checkpoint-16290/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..0328614da3cef2f94f318866fbc8e407f3be8d0e --- /dev/null +++ b/checkpoint-16290/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bfc926304e9204c2dc2193d6eaf29d619d08dd2ea94bca677e987c9a26358a5b +size 15997 diff --git a/checkpoint-16290/rng_state_11.pth b/checkpoint-16290/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..bde763435735df7ef3db762192def007108972ef --- /dev/null +++ b/checkpoint-16290/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b0370d16b175478c3d1016b540f8ee754df8fa56c2723ee7532e80a71f18d5 +size 15997 diff --git a/checkpoint-16290/rng_state_12.pth b/checkpoint-16290/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca1b960719e8de601398b62254ea8c97c7c51b49 --- /dev/null +++ b/checkpoint-16290/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc4d1e1934f406fb39f430ca3b5ea437d06bf665f3f331a670035ef5861ce0f +size 15997 diff --git a/checkpoint-16290/rng_state_13.pth b/checkpoint-16290/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..b34b8d4aeee54107d60dcc7e418816b3f8ab0a8c --- /dev/null +++ b/checkpoint-16290/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5a18a41cbe23fde0fd8e6946431ae3e46013f62c3ea627154dfef47a4555d1 +size 15997 diff --git a/checkpoint-16290/rng_state_14.pth b/checkpoint-16290/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c52e94e352afb8352d197fc1936b560af50ee71 --- /dev/null +++ b/checkpoint-16290/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ddba6bfe5dd298b7153baf08df0986b7e2478ce8ae05d7db41154ef8d49abab +size 15997 diff --git a/checkpoint-16290/rng_state_15.pth b/checkpoint-16290/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..050633f268d67fbc67539bce2c0b11b2b3896ce2 --- /dev/null +++ b/checkpoint-16290/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:006f5709d313fd41a11a1fc0bc36c05d71dcb7112df3c4e5abe430506da2a035 +size 15997 diff --git a/checkpoint-16290/rng_state_16.pth b/checkpoint-16290/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..98d65069f6d947074eac16ee04f17925d30d0322 --- /dev/null +++ b/checkpoint-16290/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f15cb0ec3d93fd0cec50e24af02c1167e5ed9b82976c699e10928bc756e448a3 +size 15997 diff --git a/checkpoint-16290/rng_state_17.pth b/checkpoint-16290/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..b0e361e0470cdc4c968cf37c4d28804fcff9a442 --- /dev/null +++ b/checkpoint-16290/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da61ff395f9c39e5ed646137434053820a7a759802387e23aaf19fd6ded6cb4e +size 15997 diff --git a/checkpoint-16290/rng_state_18.pth b/checkpoint-16290/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..3d02ba2caefcdcb6eae8b4e321001d8c0a71d4cb --- /dev/null +++ b/checkpoint-16290/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e8390fb6937c79641fa5310cd744b3a5c8f202f26cf49e693efddc1cc9de265 +size 15997 diff --git a/checkpoint-16290/rng_state_19.pth b/checkpoint-16290/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..e88398c30a53d7cda28ed0f8fd3da297e2de1f43 --- /dev/null +++ b/checkpoint-16290/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b716c69e5eed75f7385c2adde32f8ecf3d30acf37259eda51a13ad6e8fb0de1 +size 15997 diff --git a/checkpoint-16290/rng_state_2.pth b/checkpoint-16290/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d314300fe7a4e3788d336785bdf2ca281072f87 --- /dev/null +++ b/checkpoint-16290/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:49c9449ac81de688159136e72e209b135126e6d5ec8a94f23dbf96aed2dd1782 +size 15984 diff --git a/checkpoint-16290/rng_state_20.pth b/checkpoint-16290/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca34f1099d0a44f337b6ff4f9f566e574718be23 --- /dev/null +++ b/checkpoint-16290/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75bc218e9b7153a6d4cdd2f0b39102ccea31ade253160ebd830822fc0cf1160d +size 15997 diff --git a/checkpoint-16290/rng_state_21.pth b/checkpoint-16290/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd80422ff36738c1af6c13c5b0fd3eac44ad44af --- /dev/null +++ b/checkpoint-16290/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aafe38154381fb57c3ea3b2d9825d9b9f0596bd8e460d7915df217a094c5ae4d +size 15997 diff --git a/checkpoint-16290/rng_state_22.pth b/checkpoint-16290/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..de5978138a135f1b3605138e1ff2711e6a0b4b57 --- /dev/null +++ b/checkpoint-16290/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c9aaae908b81d720c460508313443cfb7eadc3ba0fd8d9c63f43858e81acb29 +size 15997 diff --git a/checkpoint-16290/rng_state_23.pth b/checkpoint-16290/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..280e16aa218d1fe931a1943ccd6483b04f2ed44c --- /dev/null +++ b/checkpoint-16290/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fbdfd7d0bf9211369c34f23a213156ebd58cc1b6d647fe803803ce3523b8ca3 +size 15997 diff --git a/checkpoint-16290/rng_state_24.pth b/checkpoint-16290/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..c42d293dbb9116c93bdabaa0a2cf3fc1125b8ef0 --- /dev/null +++ b/checkpoint-16290/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:14ac8a2a73363dbc1811e57b239c925709a37130a05abf9ed32ac4b6a8c40d2f +size 15997 diff --git a/checkpoint-16290/rng_state_25.pth b/checkpoint-16290/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6f292dea7cc61427cb24a41aae55b24665fa5d0 --- /dev/null +++ b/checkpoint-16290/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd089056928f7a9262490de8d664eb4c31cafa427d2bd2a4aa321c82ef6b8cf2 +size 15997 diff --git a/checkpoint-16290/rng_state_26.pth b/checkpoint-16290/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8efb0e809aedd5b329b8a12c785354dacc4c1e0 --- /dev/null +++ b/checkpoint-16290/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c348744f37b3476231bd1cdc5ca555c0c12e64ed6885ec43c56e32155bb7c8 +size 15997 diff --git a/checkpoint-16290/rng_state_27.pth b/checkpoint-16290/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd964f17560bfc85ce2f9e49f3cb6cd02b9a27e9 --- /dev/null +++ b/checkpoint-16290/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b3cb7a0fb9ce68ee06f03d8c3b8aeb15feb78d920c564824f2ad3a2a1188f4c +size 15997 diff --git a/checkpoint-16290/rng_state_28.pth b/checkpoint-16290/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..94ce9bf37ae93d29bc4ae7b1bbbec91aa9196de7 --- /dev/null +++ b/checkpoint-16290/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc966a6ceae2936434a7e8e937c7db5381187d7a38ad8af61485cf0737efecc1 +size 15997 diff --git a/checkpoint-16290/rng_state_29.pth b/checkpoint-16290/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..69fdd7162eac84d48a47f63254b6a228c1547393 --- /dev/null +++ b/checkpoint-16290/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8c42c3a40f06c3c19dc6e2d9b7efb0f1e7ce779f141d01af120efaa7650b5f02 +size 15997 diff --git a/checkpoint-16290/rng_state_3.pth b/checkpoint-16290/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..81117cea1df0870b50bdd79ad832d0ecbd645920 --- /dev/null +++ b/checkpoint-16290/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff022e8050c330004c682301f8342d11b3e2ec63a641c16e2dd44c4fb882e972 +size 15984 diff --git a/checkpoint-16290/rng_state_30.pth b/checkpoint-16290/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae93dfced36649248cf4f7df6096061e8d23c4c0 --- /dev/null +++ b/checkpoint-16290/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb3a17926445f426cc8a28b59cf3cc7802ff1b537640bea4c335db735511557 +size 15997 diff --git a/checkpoint-16290/rng_state_31.pth b/checkpoint-16290/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..677107b7aef018e8852f45876aac4b93bec6c0e0 --- /dev/null +++ b/checkpoint-16290/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:537cfb949b35f9de7502e2f99668493f4c1d4075a73433f8f2cc585cca014664 +size 15997 diff --git a/checkpoint-16290/rng_state_32.pth b/checkpoint-16290/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0b764859662bc685a06c847ecd58e9fc6235482 --- /dev/null +++ b/checkpoint-16290/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c5bcd0329fad9a02f703ed5eca464cd7fab08c34d72b8ceb6d2616387b95481 +size 15997 diff --git a/checkpoint-16290/rng_state_33.pth b/checkpoint-16290/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..f0dd1b0d11b4be16cf86f0292af1b449451e162f --- /dev/null +++ b/checkpoint-16290/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bfe40234470ace961ef725ca43ad0a8d6303f1733009634fdfd09d15a58f4805 +size 15997 diff --git a/checkpoint-16290/rng_state_34.pth b/checkpoint-16290/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..7304f34e936049341dc5f991a2b058e809c4a56a --- /dev/null +++ b/checkpoint-16290/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4615274dcb1138ed76cf7add6e910a1f9dbb5fe896dc00008889a6d7f393c472 +size 15997 diff --git a/checkpoint-16290/rng_state_35.pth b/checkpoint-16290/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..55f5fd6008ac92523bc4dd0285cd687509cffcd9 --- /dev/null +++ b/checkpoint-16290/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f70fd9b67c5a95f2832104ca04b46df023285ccbf89ad4075c7ae72783d010f +size 15997 diff --git a/checkpoint-16290/rng_state_36.pth b/checkpoint-16290/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..3f4a1c46583a029f4df57034cef2a49a10a394b2 --- /dev/null +++ b/checkpoint-16290/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c67de5a2d45951b4203bbcede9fab5d00ca00444ea78da4dae8f095ff35c1d1 +size 15997 diff --git a/checkpoint-16290/rng_state_37.pth b/checkpoint-16290/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..c07858eb1fcada5da560b8108451473b07502336 --- /dev/null +++ b/checkpoint-16290/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97e77baa7ac6061ddbdd437b011ef70fc0635feb03049fb2912ae8e462316fd +size 15997 diff --git a/checkpoint-16290/rng_state_38.pth b/checkpoint-16290/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..e707f3f28a7b26938596a8d9b6a2f30637f3f020 --- /dev/null +++ b/checkpoint-16290/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f4d42ad6d4f9ba92710ff320d041de7314e8cf43a1fc4f639e14c52f36891c8f +size 15997 diff --git a/checkpoint-16290/rng_state_39.pth b/checkpoint-16290/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..845e6611504235dcc32e54217b8b335c3ce8a819 --- /dev/null +++ b/checkpoint-16290/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0358326ffb85c4c5958cc7e0a69d95b191e6f24bc91760e172827657ef98c85c +size 15997 diff --git a/checkpoint-16290/rng_state_4.pth b/checkpoint-16290/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8558ab316c5e0a8d58beda9f7d6bd0a1e00e2e50 --- /dev/null +++ b/checkpoint-16290/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13fdeeeb9b307f900ce53429899ba0c097122ba8ace5e3b55940434fc8c2c72 +size 15984 diff --git a/checkpoint-16290/rng_state_40.pth b/checkpoint-16290/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..1545d1e9f2f00f0ce4124ff0e7265f4732bcbdde --- /dev/null +++ b/checkpoint-16290/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30cbf0f87dde7b950d0aa0b0466033b2a170f8eab1247efb12b644d0c74d0b21 +size 15997 diff --git a/checkpoint-16290/rng_state_41.pth b/checkpoint-16290/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..d07267d8762cacf8a0b5772a39e10da0162c9a9a --- /dev/null +++ b/checkpoint-16290/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e3d82b9da0617cd2601cb7bc4b0d9375a8d71fbfb714a31a654b37e57618540 +size 15997 diff --git a/checkpoint-16290/rng_state_42.pth b/checkpoint-16290/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c4154da079a6290f9e41c7a22adfd534cea8eac --- /dev/null +++ b/checkpoint-16290/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:639df123adb12ba8c9b88ababf065f80ad2fd3820ed1258385b74131f1b9e206 +size 15997 diff --git a/checkpoint-16290/rng_state_43.pth b/checkpoint-16290/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..0213d924a47e9c30ab73cfa4a031261b473d678c --- /dev/null +++ b/checkpoint-16290/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe9613803a240e91251ef827a1834055a79a2fc1bdd6dfddc3c84716400f749 +size 15997 diff --git a/checkpoint-16290/rng_state_44.pth b/checkpoint-16290/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..62ddfcfd6b877dca0b5ad6530f886b3029cce6dc --- /dev/null +++ b/checkpoint-16290/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b93e4796edf73c12bdfa2e5b8f77c618085809bf5bb070b1a7529156efa919 +size 15997 diff --git a/checkpoint-16290/rng_state_45.pth b/checkpoint-16290/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..dc980544fc28db6554a30a0bed400537bba279d4 --- /dev/null +++ b/checkpoint-16290/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e73535683d0441ec492d2ccb86bd64a36389d556a5708844f663da382fce759b +size 15997 diff --git a/checkpoint-16290/rng_state_46.pth b/checkpoint-16290/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..1228c57d3a0cf6b05ad929cca866b31e7c78f308 --- /dev/null +++ b/checkpoint-16290/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1711bb3cb117133a3980c7808caf470707a5eb28c6ce839d372af17e7b4787a0 +size 15997 diff --git a/checkpoint-16290/rng_state_47.pth b/checkpoint-16290/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..171bf079014a4553587d8e0cc408d6f791e3c2c6 --- /dev/null +++ b/checkpoint-16290/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a2b4f5498708acfa691ff436a519bbf190edb931feddd6fca31cdc1128645501 +size 15997 diff --git a/checkpoint-16290/rng_state_48.pth b/checkpoint-16290/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..d946f0a7a7d4377cbb84a4f946c531e2fb7243ff --- /dev/null +++ b/checkpoint-16290/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78d72833850aecafda31b124ae8f3a1ca3167fbd8ad9f760c70d65da0b14ea7 +size 15997 diff --git a/checkpoint-16290/rng_state_49.pth b/checkpoint-16290/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcfc4d36de65f065e1a8b1d1a8a789cf12f201a9 --- /dev/null +++ b/checkpoint-16290/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6880b935cf0324acd7f7e03a81e63f50ca19f30caa203d82063a4b31d31c306a +size 15997 diff --git a/checkpoint-16290/rng_state_5.pth b/checkpoint-16290/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8716ab86a046827f99f81df0bf62ae526d479c20 --- /dev/null +++ b/checkpoint-16290/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77b068ac15314a9298098806edbf711d4413307396c805bafcde134342b7d3bd +size 15984 diff --git a/checkpoint-16290/rng_state_50.pth b/checkpoint-16290/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..06fe879f005253843f9b6fbb09fa7e4ad5be341a --- /dev/null +++ b/checkpoint-16290/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08a807e1f77875ee968fe02c79258b062e9fc94a74762af568b9c6b87e040d8d +size 15997 diff --git a/checkpoint-16290/rng_state_51.pth b/checkpoint-16290/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..4d03e7d14a7904e6db49454f1bc0187107351092 --- /dev/null +++ b/checkpoint-16290/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0f9bb358f5c7d931b357ae779f136b9b4e77cd8401ae4354fec4db128266f550 +size 15997 diff --git a/checkpoint-16290/rng_state_52.pth b/checkpoint-16290/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..697bb7082582dc4ec2a91bbf2f8638e228e26e37 --- /dev/null +++ b/checkpoint-16290/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fde22a9907411ca28ad1f31f9b1cb4e118a96bbc02c418141000363e8d6a831 +size 15997 diff --git a/checkpoint-16290/rng_state_53.pth b/checkpoint-16290/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..2cfed622b57cf238fd4dd73fe814ba8f76a08edc --- /dev/null +++ b/checkpoint-16290/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6aff0bc5bb9a7e65ac22c89d9c980fc3e598c40fb67ca34913662c7835beef3 +size 15997 diff --git a/checkpoint-16290/rng_state_54.pth b/checkpoint-16290/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa132cdba83bdeb094ed54c6494f3cf946746730 --- /dev/null +++ b/checkpoint-16290/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f46eb06220825ae93cc5ff6f402c4c7622e290a71717b939400a6eb25af964dd +size 15997 diff --git a/checkpoint-16290/rng_state_55.pth b/checkpoint-16290/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..291fbcf3b176b21ad7e1282de0ecf3d4a9118984 --- /dev/null +++ b/checkpoint-16290/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5813c78b722cc084b05a2d7ed2edf127e1690d4d97dcb85b43c3a197c3017046 +size 15997 diff --git a/checkpoint-16290/rng_state_56.pth b/checkpoint-16290/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..06a898cb5e9ba587c685903f69160096374c460d --- /dev/null +++ b/checkpoint-16290/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:eeb0ade516472ea53ef8479e448b8056b6008d44423b4f6ebdf885af1e6d8b7e +size 15997 diff --git a/checkpoint-16290/rng_state_57.pth b/checkpoint-16290/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..643fbfbc1e1ce1ebfaa8f86144958cd4b903cbbc --- /dev/null +++ b/checkpoint-16290/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4562b15393a8a0ed8e365ee9f892a06d4b134932d98f5a807c1ce49bdcc009a +size 15997 diff --git a/checkpoint-16290/rng_state_58.pth b/checkpoint-16290/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..74b770177971b4a0b815bed31946233653c700b8 --- /dev/null +++ b/checkpoint-16290/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1a3fa6286722da3c7e11b53bf4a9632b41c60b03bbed1c11da095f6439eb3d5 +size 15997 diff --git a/checkpoint-16290/rng_state_59.pth b/checkpoint-16290/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e224ddf174cf6507808b99b7af2fc333da50a79 --- /dev/null +++ b/checkpoint-16290/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c466ac7d76e091625c90f8a3c9c396ef74dde1195608ead72c440b81ebd7c2e5 +size 15997 diff --git a/checkpoint-16290/rng_state_6.pth b/checkpoint-16290/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..c84b0020cc0d167273ed3db14f206e92e6287ea6 --- /dev/null +++ b/checkpoint-16290/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935ad8d63f0562e93c61d7a37dca96023c498f9c98c582870ea859ef46d736c5 +size 15984 diff --git a/checkpoint-16290/rng_state_60.pth b/checkpoint-16290/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..cb4de9fd349f7be6a5d23f3fdffa3e686c67bbf4 --- /dev/null +++ b/checkpoint-16290/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2de91221adc58a519d7f01d34a7ea22fe8c9ad8fd7cfacbc288257622c3c43cc +size 15997 diff --git a/checkpoint-16290/rng_state_61.pth b/checkpoint-16290/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..940a60a43a4c1476b55b9ba1644c34fc46ef3289 --- /dev/null +++ b/checkpoint-16290/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b3da0ba7c70744c1cf599d446331550a892b4465f104984a2d72b824911d6eb +size 15997 diff --git a/checkpoint-16290/rng_state_62.pth b/checkpoint-16290/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..3f052d0913a058100d113d61a1b5308950ebd5f4 --- /dev/null +++ b/checkpoint-16290/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccbf08bd46f420ba7d6c1ac276334b3ec3336e35f3f3d8a55674ea2dde1af541 +size 15997 diff --git a/checkpoint-16290/rng_state_63.pth b/checkpoint-16290/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..6097ef9c36d3d557a4f3f70bf2a36e53645297b9 --- /dev/null +++ b/checkpoint-16290/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f71388c288405e5e2a038a2d5caee1e63f204c06c952945e479c4cf00ae42410 +size 15997 diff --git a/checkpoint-16290/rng_state_7.pth b/checkpoint-16290/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b91399afa6e2795338cff5b105b6cc2c6ae0db54 --- /dev/null +++ b/checkpoint-16290/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:affc7be6c20aff9893dcd7d51a345c482606a3e975d64cd89af0da01d815869c +size 15984 diff --git a/checkpoint-16290/rng_state_8.pth b/checkpoint-16290/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..5be933b6f80c9564ed91ed6a910a3a22fbb7732a --- /dev/null +++ b/checkpoint-16290/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2989d4a902982122296d3668fdb39fc952e5dee51eeba56de60e32205d4e24e9 +size 15984 diff --git a/checkpoint-16290/rng_state_9.pth b/checkpoint-16290/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..6815b241af1ac8a92258dce37c21b839606474ca --- /dev/null +++ b/checkpoint-16290/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f572f246ea842ed39481836a18a58d9f63f2aa188b405e47da5a818f265ea68e +size 15984 diff --git a/checkpoint-16290/scheduler.pt b/checkpoint-16290/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..af480a6f558e2544c1385a2593aa21f935f969fa --- /dev/null +++ b/checkpoint-16290/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e08b79ccf2b98be2c2dbc54b7ecf22a2731936f9ab32bc9265301730ae38b26b +size 1064 diff --git a/checkpoint-16290/special_tokens_map.json b/checkpoint-16290/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-16290/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-16290/tokenizer.json 
b/checkpoint-16290/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-16290/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-16290/tokenizer_config.json b/checkpoint-16290/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-16290/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": 
"<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": 
"<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": 
"<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128257": { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-16290/trainer_state.json b/checkpoint-16290/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..43c5ae68d39adf84f7e02dbb5766fda131e66ae2 --- /dev/null +++ b/checkpoint-16290/trainer_state.json @@ -0,0 +1,114064 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": 
null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 16290, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 
0.003990178023327195, + "grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 
1.2865383625030518, + "learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 
3.885480572597138e-06, + "loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + 
}, + { + "epoch": 0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 
0.4801484942436218, + "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 
7.975460122699386e-06, + "loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 
92 + }, + { + "epoch": 0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + 
"grad_norm": 0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 
0.5069209933280945, + "learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 
1.3394683026584867e-05, + "loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + 
"step": 145 + }, + { + "epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 
0.04880294659300184, + "grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 
1.3880311250686646, + "learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + }, + { + "epoch": 3.0003069367710253, + "grad_norm": 0.43365660309791565, + "learning_rate": 8.20720621201063e-05, + "loss": 1.8074, + "step": 9775 + }, + { + "epoch": 3.0006138735420502, + "grad_norm": 0.461374968290329, + "learning_rate": 8.206824868646064e-05, + "loss": 1.9089, + "step": 9776 + }, + { + "epoch": 3.0009208103130756, + "grad_norm": 0.3747929632663727, + "learning_rate": 8.206443493589776e-05, + "loss": 1.8358, + "step": 9777 + }, + { + "epoch": 3.001227747084101, + "grad_norm": 0.28436774015426636, + "learning_rate": 8.206062086845532e-05, + "loss": 1.8527, + "step": 9778 + }, + { + "epoch": 3.001534683855126, + "grad_norm": 0.33642131090164185, + "learning_rate": 
8.205680648417106e-05, + "loss": 1.8142, + "step": 9779 + }, + { + "epoch": 3.001841620626151, + "grad_norm": 0.4283481240272522, + "learning_rate": 8.205299178308263e-05, + "loss": 1.9006, + "step": 9780 + }, + { + "epoch": 3.002148557397176, + "grad_norm": 0.34405630826950073, + "learning_rate": 8.204917676522777e-05, + "loss": 1.7988, + "step": 9781 + }, + { + "epoch": 3.0024554941682013, + "grad_norm": 0.3161070942878723, + "learning_rate": 8.204536143064414e-05, + "loss": 1.8271, + "step": 9782 + }, + { + "epoch": 3.0027624309392267, + "grad_norm": 0.42518749833106995, + "learning_rate": 8.204154577936946e-05, + "loss": 1.864, + "step": 9783 + }, + { + "epoch": 3.0030693677102516, + "grad_norm": 0.3760852813720703, + "learning_rate": 8.203772981144146e-05, + "loss": 1.8543, + "step": 9784 + }, + { + "epoch": 3.003376304481277, + "grad_norm": 0.32794755697250366, + "learning_rate": 8.203391352689784e-05, + "loss": 1.8776, + "step": 9785 + }, + { + "epoch": 3.0036832412523022, + "grad_norm": 0.3053889274597168, + "learning_rate": 8.20300969257763e-05, + "loss": 1.8064, + "step": 9786 + }, + { + "epoch": 3.003990178023327, + "grad_norm": 0.40283143520355225, + "learning_rate": 8.202628000811456e-05, + "loss": 1.8083, + "step": 9787 + }, + { + "epoch": 3.0042971147943525, + "grad_norm": 0.49270665645599365, + "learning_rate": 8.202246277395038e-05, + "loss": 1.802, + "step": 9788 + }, + { + "epoch": 3.0046040515653774, + "grad_norm": 0.4373023211956024, + "learning_rate": 8.201864522332143e-05, + "loss": 1.8429, + "step": 9789 + }, + { + "epoch": 3.0049109883364027, + "grad_norm": 0.3136310875415802, + "learning_rate": 8.201482735626547e-05, + "loss": 1.8224, + "step": 9790 + }, + { + "epoch": 3.005217925107428, + "grad_norm": 0.3306807279586792, + "learning_rate": 8.201100917282023e-05, + "loss": 1.8463, + "step": 9791 + }, + { + "epoch": 3.005524861878453, + "grad_norm": 0.45082196593284607, + "learning_rate": 8.200719067302342e-05, + "loss": 1.7587, + "step": 
9792 + }, + { + "epoch": 3.0058317986494782, + "grad_norm": 0.49246448278427124, + "learning_rate": 8.20033718569128e-05, + "loss": 1.8245, + "step": 9793 + }, + { + "epoch": 3.0061387354205036, + "grad_norm": 0.3040246367454529, + "learning_rate": 8.199955272452609e-05, + "loss": 1.8309, + "step": 9794 + }, + { + "epoch": 3.0064456721915285, + "grad_norm": 0.3909318149089813, + "learning_rate": 8.199573327590105e-05, + "loss": 1.8187, + "step": 9795 + }, + { + "epoch": 3.006752608962554, + "grad_norm": 0.5753183960914612, + "learning_rate": 8.199191351107543e-05, + "loss": 1.826, + "step": 9796 + }, + { + "epoch": 3.0070595457335787, + "grad_norm": 0.48908689618110657, + "learning_rate": 8.198809343008695e-05, + "loss": 1.8475, + "step": 9797 + }, + { + "epoch": 3.007366482504604, + "grad_norm": 0.31570208072662354, + "learning_rate": 8.198427303297341e-05, + "loss": 1.8046, + "step": 9798 + }, + { + "epoch": 3.0076734192756294, + "grad_norm": 0.39205440878868103, + "learning_rate": 8.198045231977251e-05, + "loss": 1.8413, + "step": 9799 + }, + { + "epoch": 3.0079803560466543, + "grad_norm": 0.5117597579956055, + "learning_rate": 8.197663129052204e-05, + "loss": 1.8184, + "step": 9800 + }, + { + "epoch": 3.0082872928176796, + "grad_norm": 0.3623514175415039, + "learning_rate": 8.197280994525978e-05, + "loss": 1.8292, + "step": 9801 + }, + { + "epoch": 3.008594229588705, + "grad_norm": 0.2826726734638214, + "learning_rate": 8.196898828402344e-05, + "loss": 1.8216, + "step": 9802 + }, + { + "epoch": 3.00890116635973, + "grad_norm": 0.38658398389816284, + "learning_rate": 8.196516630685085e-05, + "loss": 1.867, + "step": 9803 + }, + { + "epoch": 3.009208103130755, + "grad_norm": 0.3371698260307312, + "learning_rate": 8.196134401377973e-05, + "loss": 1.8077, + "step": 9804 + }, + { + "epoch": 3.00951503990178, + "grad_norm": 0.24108785390853882, + "learning_rate": 8.195752140484789e-05, + "loss": 1.7858, + "step": 9805 + }, + { + "epoch": 3.0098219766728054, + 
"grad_norm": 0.34410104155540466, + "learning_rate": 8.195369848009309e-05, + "loss": 1.801, + "step": 9806 + }, + { + "epoch": 3.0101289134438307, + "grad_norm": 0.3412116467952728, + "learning_rate": 8.194987523955311e-05, + "loss": 1.7905, + "step": 9807 + }, + { + "epoch": 3.0104358502148556, + "grad_norm": 0.2473030537366867, + "learning_rate": 8.194605168326573e-05, + "loss": 1.7765, + "step": 9808 + }, + { + "epoch": 3.010742786985881, + "grad_norm": 0.28590065240859985, + "learning_rate": 8.194222781126875e-05, + "loss": 1.7897, + "step": 9809 + }, + { + "epoch": 3.0110497237569063, + "grad_norm": 0.2994272708892822, + "learning_rate": 8.193840362359994e-05, + "loss": 1.7976, + "step": 9810 + }, + { + "epoch": 3.011356660527931, + "grad_norm": 0.2971307635307312, + "learning_rate": 8.193457912029713e-05, + "loss": 1.829, + "step": 9811 + }, + { + "epoch": 3.0116635972989565, + "grad_norm": 0.25149810314178467, + "learning_rate": 8.193075430139809e-05, + "loss": 1.7709, + "step": 9812 + }, + { + "epoch": 3.0119705340699814, + "grad_norm": 0.2561332583427429, + "learning_rate": 8.19269291669406e-05, + "loss": 1.7689, + "step": 9813 + }, + { + "epoch": 3.0122774708410067, + "grad_norm": 0.2658882141113281, + "learning_rate": 8.192310371696249e-05, + "loss": 1.8497, + "step": 9814 + }, + { + "epoch": 3.012584407612032, + "grad_norm": 0.2873780429363251, + "learning_rate": 8.191927795150156e-05, + "loss": 1.8217, + "step": 9815 + }, + { + "epoch": 3.012891344383057, + "grad_norm": 0.2181183248758316, + "learning_rate": 8.191545187059562e-05, + "loss": 1.7261, + "step": 9816 + }, + { + "epoch": 3.0131982811540823, + "grad_norm": 0.2414858490228653, + "learning_rate": 8.191162547428248e-05, + "loss": 1.8035, + "step": 9817 + }, + { + "epoch": 3.0135052179251076, + "grad_norm": 0.2799840271472931, + "learning_rate": 8.190779876259995e-05, + "loss": 1.8279, + "step": 9818 + }, + { + "epoch": 3.0138121546961325, + "grad_norm": 0.2669760584831238, + "learning_rate": 
8.190397173558584e-05, + "loss": 1.8155, + "step": 9819 + }, + { + "epoch": 3.014119091467158, + "grad_norm": 0.28857991099357605, + "learning_rate": 8.1900144393278e-05, + "loss": 1.8479, + "step": 9820 + }, + { + "epoch": 3.0144260282381827, + "grad_norm": 0.30534693598747253, + "learning_rate": 8.189631673571422e-05, + "loss": 1.8609, + "step": 9821 + }, + { + "epoch": 3.014732965009208, + "grad_norm": 0.3238218128681183, + "learning_rate": 8.189248876293236e-05, + "loss": 1.9292, + "step": 9822 + }, + { + "epoch": 3.0150399017802334, + "grad_norm": 0.3000536561012268, + "learning_rate": 8.188866047497022e-05, + "loss": 1.8214, + "step": 9823 + }, + { + "epoch": 3.0153468385512583, + "grad_norm": 0.2960065007209778, + "learning_rate": 8.188483187186565e-05, + "loss": 1.8316, + "step": 9824 + }, + { + "epoch": 3.0156537753222836, + "grad_norm": 0.28609779477119446, + "learning_rate": 8.188100295365648e-05, + "loss": 1.8002, + "step": 9825 + }, + { + "epoch": 3.015960712093309, + "grad_norm": 0.31390634179115295, + "learning_rate": 8.187717372038057e-05, + "loss": 1.8134, + "step": 9826 + }, + { + "epoch": 3.016267648864334, + "grad_norm": 0.28550946712493896, + "learning_rate": 8.187334417207573e-05, + "loss": 1.8359, + "step": 9827 + }, + { + "epoch": 3.016574585635359, + "grad_norm": 0.3085210621356964, + "learning_rate": 8.186951430877982e-05, + "loss": 1.813, + "step": 9828 + }, + { + "epoch": 3.016881522406384, + "grad_norm": 0.3043847978115082, + "learning_rate": 8.18656841305307e-05, + "loss": 1.8222, + "step": 9829 + }, + { + "epoch": 3.0171884591774094, + "grad_norm": 0.32524731755256653, + "learning_rate": 8.18618536373662e-05, + "loss": 1.8258, + "step": 9830 + }, + { + "epoch": 3.0174953959484347, + "grad_norm": 0.2690991461277008, + "learning_rate": 8.18580228293242e-05, + "loss": 1.8492, + "step": 9831 + }, + { + "epoch": 3.0178023327194596, + "grad_norm": 0.34936225414276123, + "learning_rate": 8.185419170644253e-05, + "loss": 1.8363, + "step": 
9832 + }, + { + "epoch": 3.018109269490485, + "grad_norm": 0.3274296820163727, + "learning_rate": 8.185036026875908e-05, + "loss": 1.7789, + "step": 9833 + }, + { + "epoch": 3.0184162062615103, + "grad_norm": 0.2729836106300354, + "learning_rate": 8.184652851631169e-05, + "loss": 1.8264, + "step": 9834 + }, + { + "epoch": 3.018723143032535, + "grad_norm": 0.28682780265808105, + "learning_rate": 8.184269644913826e-05, + "loss": 1.8399, + "step": 9835 + }, + { + "epoch": 3.0190300798035605, + "grad_norm": 0.3224826455116272, + "learning_rate": 8.183886406727662e-05, + "loss": 1.8338, + "step": 9836 + }, + { + "epoch": 3.0193370165745854, + "grad_norm": 0.30945318937301636, + "learning_rate": 8.183503137076467e-05, + "loss": 1.8248, + "step": 9837 + }, + { + "epoch": 3.0196439533456108, + "grad_norm": 0.27580398321151733, + "learning_rate": 8.183119835964029e-05, + "loss": 1.8096, + "step": 9838 + }, + { + "epoch": 3.019950890116636, + "grad_norm": 0.28927183151245117, + "learning_rate": 8.182736503394132e-05, + "loss": 1.825, + "step": 9839 + }, + { + "epoch": 3.020257826887661, + "grad_norm": 0.253000408411026, + "learning_rate": 8.182353139370571e-05, + "loss": 1.7678, + "step": 9840 + }, + { + "epoch": 3.0205647636586863, + "grad_norm": 0.2882022559642792, + "learning_rate": 8.18196974389713e-05, + "loss": 1.8895, + "step": 9841 + }, + { + "epoch": 3.0208717004297116, + "grad_norm": 0.26864609122276306, + "learning_rate": 8.1815863169776e-05, + "loss": 1.7674, + "step": 9842 + }, + { + "epoch": 3.0211786372007365, + "grad_norm": 0.27344849705696106, + "learning_rate": 8.181202858615769e-05, + "loss": 1.8146, + "step": 9843 + }, + { + "epoch": 3.021485573971762, + "grad_norm": 0.31659772992134094, + "learning_rate": 8.180819368815425e-05, + "loss": 1.8485, + "step": 9844 + }, + { + "epoch": 3.021792510742787, + "grad_norm": 0.3163176476955414, + "learning_rate": 8.18043584758036e-05, + "loss": 1.8994, + "step": 9845 + }, + { + "epoch": 3.022099447513812, + 
"grad_norm": 0.2583829462528229, + "learning_rate": 8.180052294914365e-05, + "loss": 1.764, + "step": 9846 + }, + { + "epoch": 3.0224063842848374, + "grad_norm": 0.3006649315357208, + "learning_rate": 8.179668710821227e-05, + "loss": 1.9232, + "step": 9847 + }, + { + "epoch": 3.0227133210558623, + "grad_norm": 0.35702988505363464, + "learning_rate": 8.179285095304741e-05, + "loss": 1.8403, + "step": 9848 + }, + { + "epoch": 3.0230202578268877, + "grad_norm": 0.29699379205703735, + "learning_rate": 8.178901448368697e-05, + "loss": 1.8412, + "step": 9849 + }, + { + "epoch": 3.023327194597913, + "grad_norm": 0.3022700548171997, + "learning_rate": 8.178517770016885e-05, + "loss": 1.8197, + "step": 9850 + }, + { + "epoch": 3.023634131368938, + "grad_norm": 0.2943836748600006, + "learning_rate": 8.178134060253097e-05, + "loss": 1.8127, + "step": 9851 + }, + { + "epoch": 3.023941068139963, + "grad_norm": 0.31290489435195923, + "learning_rate": 8.177750319081126e-05, + "loss": 1.821, + "step": 9852 + }, + { + "epoch": 3.0242480049109886, + "grad_norm": 0.30308374762535095, + "learning_rate": 8.177366546504763e-05, + "loss": 1.8522, + "step": 9853 + }, + { + "epoch": 3.0245549416820134, + "grad_norm": 0.301559716463089, + "learning_rate": 8.176982742527802e-05, + "loss": 1.8758, + "step": 9854 + }, + { + "epoch": 3.0248618784530388, + "grad_norm": 0.33314836025238037, + "learning_rate": 8.176598907154034e-05, + "loss": 1.8178, + "step": 9855 + }, + { + "epoch": 3.0251688152240637, + "grad_norm": 0.3567935526371002, + "learning_rate": 8.176215040387255e-05, + "loss": 1.7847, + "step": 9856 + }, + { + "epoch": 3.025475751995089, + "grad_norm": 0.27716195583343506, + "learning_rate": 8.175831142231258e-05, + "loss": 1.772, + "step": 9857 + }, + { + "epoch": 3.0257826887661143, + "grad_norm": 0.24568212032318115, + "learning_rate": 8.175447212689836e-05, + "loss": 1.8171, + "step": 9858 + }, + { + "epoch": 3.0260896255371392, + "grad_norm": 0.25368261337280273, + 
"learning_rate": 8.175063251766784e-05, + "loss": 1.852, + "step": 9859 + }, + { + "epoch": 3.0263965623081646, + "grad_norm": 0.2509497404098511, + "learning_rate": 8.174679259465894e-05, + "loss": 1.7737, + "step": 9860 + }, + { + "epoch": 3.02670349907919, + "grad_norm": 0.3539343774318695, + "learning_rate": 8.174295235790963e-05, + "loss": 1.8663, + "step": 9861 + }, + { + "epoch": 3.027010435850215, + "grad_norm": 0.36450034379959106, + "learning_rate": 8.173911180745788e-05, + "loss": 1.8179, + "step": 9862 + }, + { + "epoch": 3.02731737262124, + "grad_norm": 0.3550017178058624, + "learning_rate": 8.173527094334162e-05, + "loss": 1.8256, + "step": 9863 + }, + { + "epoch": 3.027624309392265, + "grad_norm": 0.33518701791763306, + "learning_rate": 8.17314297655988e-05, + "loss": 1.7842, + "step": 9864 + }, + { + "epoch": 3.0279312461632903, + "grad_norm": 0.2522886097431183, + "learning_rate": 8.172758827426739e-05, + "loss": 1.7688, + "step": 9865 + }, + { + "epoch": 3.0282381829343157, + "grad_norm": 0.26222914457321167, + "learning_rate": 8.172374646938536e-05, + "loss": 1.8517, + "step": 9866 + }, + { + "epoch": 3.0285451197053406, + "grad_norm": 0.3355788588523865, + "learning_rate": 8.171990435099068e-05, + "loss": 1.9002, + "step": 9867 + }, + { + "epoch": 3.028852056476366, + "grad_norm": 0.32907500863075256, + "learning_rate": 8.171606191912131e-05, + "loss": 1.7801, + "step": 9868 + }, + { + "epoch": 3.0291589932473912, + "grad_norm": 0.29234179854393005, + "learning_rate": 8.171221917381523e-05, + "loss": 1.8055, + "step": 9869 + }, + { + "epoch": 3.029465930018416, + "grad_norm": 0.26374876499176025, + "learning_rate": 8.170837611511041e-05, + "loss": 1.781, + "step": 9870 + }, + { + "epoch": 3.0297728667894415, + "grad_norm": 0.311282217502594, + "learning_rate": 8.170453274304483e-05, + "loss": 1.839, + "step": 9871 + }, + { + "epoch": 3.0300798035604664, + "grad_norm": 0.24225831031799316, + "learning_rate": 8.170068905765648e-05, + "loss": 
1.804, + "step": 9872 + }, + { + "epoch": 3.0303867403314917, + "grad_norm": 0.29383334517478943, + "learning_rate": 8.169684505898335e-05, + "loss": 1.7817, + "step": 9873 + }, + { + "epoch": 3.030693677102517, + "grad_norm": 0.2607928514480591, + "learning_rate": 8.169300074706339e-05, + "loss": 1.8379, + "step": 9874 + }, + { + "epoch": 3.031000613873542, + "grad_norm": 0.283028244972229, + "learning_rate": 8.168915612193464e-05, + "loss": 1.7797, + "step": 9875 + }, + { + "epoch": 3.0313075506445673, + "grad_norm": 0.27675309777259827, + "learning_rate": 8.168531118363508e-05, + "loss": 1.8355, + "step": 9876 + }, + { + "epoch": 3.0316144874155926, + "grad_norm": 0.2598227262496948, + "learning_rate": 8.16814659322027e-05, + "loss": 1.7898, + "step": 9877 + }, + { + "epoch": 3.0319214241866175, + "grad_norm": 0.24715003371238708, + "learning_rate": 8.16776203676755e-05, + "loss": 1.7791, + "step": 9878 + }, + { + "epoch": 3.032228360957643, + "grad_norm": 0.2749374210834503, + "learning_rate": 8.167377449009149e-05, + "loss": 1.8303, + "step": 9879 + }, + { + "epoch": 3.0325352977286677, + "grad_norm": 0.26150834560394287, + "learning_rate": 8.166992829948868e-05, + "loss": 1.8462, + "step": 9880 + }, + { + "epoch": 3.032842234499693, + "grad_norm": 0.3044755160808563, + "learning_rate": 8.166608179590506e-05, + "loss": 1.806, + "step": 9881 + }, + { + "epoch": 3.0331491712707184, + "grad_norm": 0.2949555516242981, + "learning_rate": 8.166223497937868e-05, + "loss": 1.8785, + "step": 9882 + }, + { + "epoch": 3.0334561080417433, + "grad_norm": 0.33206698298454285, + "learning_rate": 8.165838784994752e-05, + "loss": 1.8476, + "step": 9883 + }, + { + "epoch": 3.0337630448127686, + "grad_norm": 0.2720400094985962, + "learning_rate": 8.165454040764962e-05, + "loss": 1.843, + "step": 9884 + }, + { + "epoch": 3.034069981583794, + "grad_norm": 0.29340869188308716, + "learning_rate": 8.1650692652523e-05, + "loss": 1.7761, + "step": 9885 + }, + { + "epoch": 
3.034376918354819, + "grad_norm": 0.35155293345451355, + "learning_rate": 8.16468445846057e-05, + "loss": 1.8887, + "step": 9886 + }, + { + "epoch": 3.034683855125844, + "grad_norm": 0.2688990831375122, + "learning_rate": 8.164299620393571e-05, + "loss": 1.8001, + "step": 9887 + }, + { + "epoch": 3.034990791896869, + "grad_norm": 0.2921253442764282, + "learning_rate": 8.16391475105511e-05, + "loss": 1.7951, + "step": 9888 + }, + { + "epoch": 3.0352977286678944, + "grad_norm": 0.28100699186325073, + "learning_rate": 8.163529850448988e-05, + "loss": 1.8041, + "step": 9889 + }, + { + "epoch": 3.0356046654389197, + "grad_norm": 0.3155081868171692, + "learning_rate": 8.16314491857901e-05, + "loss": 1.8026, + "step": 9890 + }, + { + "epoch": 3.0359116022099446, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.16275995544898e-05, + "loss": 1.8502, + "step": 9891 + }, + { + "epoch": 3.03621853898097, + "grad_norm": 0.2732076644897461, + "learning_rate": 8.162374961062704e-05, + "loss": 1.8424, + "step": 9892 + }, + { + "epoch": 3.0365254757519953, + "grad_norm": 0.2943679690361023, + "learning_rate": 8.161989935423984e-05, + "loss": 1.7635, + "step": 9893 + }, + { + "epoch": 3.03683241252302, + "grad_norm": 0.28894683718681335, + "learning_rate": 8.161604878536626e-05, + "loss": 1.78, + "step": 9894 + }, + { + "epoch": 3.0371393492940455, + "grad_norm": 0.2718082666397095, + "learning_rate": 8.161219790404435e-05, + "loss": 1.7664, + "step": 9895 + }, + { + "epoch": 3.0374462860650704, + "grad_norm": 0.29092124104499817, + "learning_rate": 8.160834671031216e-05, + "loss": 1.8621, + "step": 9896 + }, + { + "epoch": 3.0377532228360957, + "grad_norm": 0.284665584564209, + "learning_rate": 8.160449520420779e-05, + "loss": 1.8607, + "step": 9897 + }, + { + "epoch": 3.038060159607121, + "grad_norm": 0.23676982522010803, + "learning_rate": 8.160064338576925e-05, + "loss": 1.7137, + "step": 9898 + }, + { + "epoch": 3.038367096378146, + "grad_norm": 0.2666932940483093, + 
"learning_rate": 8.159679125503466e-05, + "loss": 1.8038, + "step": 9899 + }, + { + "epoch": 3.0386740331491713, + "grad_norm": 0.36214375495910645, + "learning_rate": 8.159293881204204e-05, + "loss": 1.8902, + "step": 9900 + }, + { + "epoch": 3.0389809699201966, + "grad_norm": 0.30301332473754883, + "learning_rate": 8.158908605682948e-05, + "loss": 1.8456, + "step": 9901 + }, + { + "epoch": 3.0392879066912215, + "grad_norm": 0.32190418243408203, + "learning_rate": 8.158523298943506e-05, + "loss": 1.8246, + "step": 9902 + }, + { + "epoch": 3.039594843462247, + "grad_norm": 0.2938043475151062, + "learning_rate": 8.158137960989685e-05, + "loss": 1.8324, + "step": 9903 + }, + { + "epoch": 3.0399017802332717, + "grad_norm": 0.29493969678878784, + "learning_rate": 8.157752591825294e-05, + "loss": 1.8458, + "step": 9904 + }, + { + "epoch": 3.040208717004297, + "grad_norm": 0.2681889832019806, + "learning_rate": 8.157367191454141e-05, + "loss": 1.889, + "step": 9905 + }, + { + "epoch": 3.0405156537753224, + "grad_norm": 0.3111969232559204, + "learning_rate": 8.156981759880035e-05, + "loss": 1.8966, + "step": 9906 + }, + { + "epoch": 3.0408225905463473, + "grad_norm": 0.345262736082077, + "learning_rate": 8.156596297106784e-05, + "loss": 1.8174, + "step": 9907 + }, + { + "epoch": 3.0411295273173726, + "grad_norm": 0.30156534910202026, + "learning_rate": 8.156210803138199e-05, + "loss": 1.766, + "step": 9908 + }, + { + "epoch": 3.041436464088398, + "grad_norm": 0.28691565990448, + "learning_rate": 8.15582527797809e-05, + "loss": 1.8436, + "step": 9909 + }, + { + "epoch": 3.041743400859423, + "grad_norm": 0.33418282866477966, + "learning_rate": 8.155439721630264e-05, + "loss": 1.8939, + "step": 9910 + }, + { + "epoch": 3.042050337630448, + "grad_norm": 0.25496938824653625, + "learning_rate": 8.155054134098535e-05, + "loss": 1.8368, + "step": 9911 + }, + { + "epoch": 3.042357274401473, + "grad_norm": 0.3806788921356201, + "learning_rate": 8.154668515386711e-05, + "loss": 
1.8635, + "step": 9912 + }, + { + "epoch": 3.0426642111724984, + "grad_norm": 0.42668119072914124, + "learning_rate": 8.154282865498603e-05, + "loss": 1.76, + "step": 9913 + }, + { + "epoch": 3.0429711479435237, + "grad_norm": 0.35945314168930054, + "learning_rate": 8.153897184438024e-05, + "loss": 1.8275, + "step": 9914 + }, + { + "epoch": 3.0432780847145486, + "grad_norm": 0.3225449323654175, + "learning_rate": 8.153511472208784e-05, + "loss": 1.7901, + "step": 9915 + }, + { + "epoch": 3.043585021485574, + "grad_norm": 0.2905425727367401, + "learning_rate": 8.153125728814694e-05, + "loss": 1.8021, + "step": 9916 + }, + { + "epoch": 3.0438919582565993, + "grad_norm": 0.3315529525279999, + "learning_rate": 8.15273995425957e-05, + "loss": 1.8003, + "step": 9917 + }, + { + "epoch": 3.044198895027624, + "grad_norm": 0.30256444215774536, + "learning_rate": 8.152354148547221e-05, + "loss": 1.8243, + "step": 9918 + }, + { + "epoch": 3.0445058317986495, + "grad_norm": 0.2563035190105438, + "learning_rate": 8.15196831168146e-05, + "loss": 1.7877, + "step": 9919 + }, + { + "epoch": 3.044812768569675, + "grad_norm": 0.25705814361572266, + "learning_rate": 8.151582443666101e-05, + "loss": 1.813, + "step": 9920 + }, + { + "epoch": 3.0451197053406998, + "grad_norm": 0.3649071455001831, + "learning_rate": 8.151196544504957e-05, + "loss": 1.8114, + "step": 9921 + }, + { + "epoch": 3.045426642111725, + "grad_norm": 0.4076193571090698, + "learning_rate": 8.150810614201841e-05, + "loss": 1.7869, + "step": 9922 + }, + { + "epoch": 3.04573357888275, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.150424652760569e-05, + "loss": 1.7878, + "step": 9923 + }, + { + "epoch": 3.0460405156537753, + "grad_norm": 0.2243243157863617, + "learning_rate": 8.150038660184955e-05, + "loss": 1.8224, + "step": 9924 + }, + { + "epoch": 3.0463474524248007, + "grad_norm": 0.3295031487941742, + "learning_rate": 8.149652636478811e-05, + "loss": 1.8685, + "step": 9925 + }, + { + "epoch": 
3.0466543891958255, + "grad_norm": 0.2973531186580658, + "learning_rate": 8.149266581645954e-05, + "loss": 1.8082, + "step": 9926 + }, + { + "epoch": 3.046961325966851, + "grad_norm": 0.25648918747901917, + "learning_rate": 8.148880495690199e-05, + "loss": 1.8089, + "step": 9927 + }, + { + "epoch": 3.047268262737876, + "grad_norm": 0.2845752537250519, + "learning_rate": 8.148494378615361e-05, + "loss": 1.8726, + "step": 9928 + }, + { + "epoch": 3.047575199508901, + "grad_norm": 0.2917105555534363, + "learning_rate": 8.148108230425255e-05, + "loss": 1.8035, + "step": 9929 + }, + { + "epoch": 3.0478821362799264, + "grad_norm": 0.2775834798812866, + "learning_rate": 8.1477220511237e-05, + "loss": 1.8545, + "step": 9930 + }, + { + "epoch": 3.0481890730509513, + "grad_norm": 0.3522767424583435, + "learning_rate": 8.14733584071451e-05, + "loss": 1.8261, + "step": 9931 + }, + { + "epoch": 3.0484960098219767, + "grad_norm": 0.3759000599384308, + "learning_rate": 8.146949599201503e-05, + "loss": 1.8405, + "step": 9932 + }, + { + "epoch": 3.048802946593002, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.146563326588496e-05, + "loss": 1.7762, + "step": 9933 + }, + { + "epoch": 3.049109883364027, + "grad_norm": 0.263810932636261, + "learning_rate": 8.146177022879304e-05, + "loss": 1.7546, + "step": 9934 + }, + { + "epoch": 3.049416820135052, + "grad_norm": 0.24064256250858307, + "learning_rate": 8.14579068807775e-05, + "loss": 1.7903, + "step": 9935 + }, + { + "epoch": 3.0497237569060776, + "grad_norm": 0.3144194781780243, + "learning_rate": 8.145404322187645e-05, + "loss": 1.8011, + "step": 9936 + }, + { + "epoch": 3.0500306936771024, + "grad_norm": 0.3362879455089569, + "learning_rate": 8.145017925212812e-05, + "loss": 1.8224, + "step": 9937 + }, + { + "epoch": 3.050337630448128, + "grad_norm": 0.33979395031929016, + "learning_rate": 8.144631497157071e-05, + "loss": 1.8415, + "step": 9938 + }, + { + "epoch": 3.0506445672191527, + "grad_norm": 0.33391237258911133, + 
"learning_rate": 8.144245038024235e-05, + "loss": 1.7983, + "step": 9939 + }, + { + "epoch": 3.050951503990178, + "grad_norm": 0.34034964442253113, + "learning_rate": 8.143858547818128e-05, + "loss": 1.8635, + "step": 9940 + }, + { + "epoch": 3.0512584407612033, + "grad_norm": 0.3472529947757721, + "learning_rate": 8.143472026542569e-05, + "loss": 1.8067, + "step": 9941 + }, + { + "epoch": 3.0515653775322282, + "grad_norm": 0.3369109630584717, + "learning_rate": 8.143085474201376e-05, + "loss": 1.7933, + "step": 9942 + }, + { + "epoch": 3.0518723143032536, + "grad_norm": 0.3055182993412018, + "learning_rate": 8.14269889079837e-05, + "loss": 1.7358, + "step": 9943 + }, + { + "epoch": 3.052179251074279, + "grad_norm": 0.26729708909988403, + "learning_rate": 8.142312276337372e-05, + "loss": 1.8315, + "step": 9944 + }, + { + "epoch": 3.052486187845304, + "grad_norm": 0.3626720607280731, + "learning_rate": 8.141925630822203e-05, + "loss": 1.7593, + "step": 9945 + }, + { + "epoch": 3.052793124616329, + "grad_norm": 0.3673512637615204, + "learning_rate": 8.141538954256683e-05, + "loss": 1.8414, + "step": 9946 + }, + { + "epoch": 3.053100061387354, + "grad_norm": 0.30554768443107605, + "learning_rate": 8.141152246644632e-05, + "loss": 1.7504, + "step": 9947 + }, + { + "epoch": 3.0534069981583793, + "grad_norm": 0.41163405776023865, + "learning_rate": 8.140765507989875e-05, + "loss": 1.8794, + "step": 9948 + }, + { + "epoch": 3.0537139349294047, + "grad_norm": 0.592751145362854, + "learning_rate": 8.140378738296233e-05, + "loss": 1.8538, + "step": 9949 + }, + { + "epoch": 3.0540208717004296, + "grad_norm": 0.483828604221344, + "learning_rate": 8.139991937567527e-05, + "loss": 1.7952, + "step": 9950 + }, + { + "epoch": 3.054327808471455, + "grad_norm": 0.26665306091308594, + "learning_rate": 8.13960510580758e-05, + "loss": 1.8268, + "step": 9951 + }, + { + "epoch": 3.0546347452424802, + "grad_norm": 0.42917072772979736, + "learning_rate": 8.139218243020215e-05, + "loss": 
1.843, + "step": 9952 + }, + { + "epoch": 3.054941682013505, + "grad_norm": 0.47911396622657776, + "learning_rate": 8.138831349209256e-05, + "loss": 1.8223, + "step": 9953 + }, + { + "epoch": 3.0552486187845305, + "grad_norm": 0.4540431797504425, + "learning_rate": 8.138444424378524e-05, + "loss": 1.9198, + "step": 9954 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.29719051718711853, + "learning_rate": 8.138057468531845e-05, + "loss": 1.7873, + "step": 9955 + }, + { + "epoch": 3.0558624923265807, + "grad_norm": 0.35133618116378784, + "learning_rate": 8.137670481673045e-05, + "loss": 1.8459, + "step": 9956 + }, + { + "epoch": 3.056169429097606, + "grad_norm": 0.42896488308906555, + "learning_rate": 8.137283463805945e-05, + "loss": 1.7814, + "step": 9957 + }, + { + "epoch": 3.056476365868631, + "grad_norm": 0.38993972539901733, + "learning_rate": 8.136896414934372e-05, + "loss": 1.7636, + "step": 9958 + }, + { + "epoch": 3.0567833026396563, + "grad_norm": 0.31362372636795044, + "learning_rate": 8.13650933506215e-05, + "loss": 1.8021, + "step": 9959 + }, + { + "epoch": 3.0570902394106816, + "grad_norm": 0.27980196475982666, + "learning_rate": 8.136122224193103e-05, + "loss": 1.8445, + "step": 9960 + }, + { + "epoch": 3.0573971761817065, + "grad_norm": 0.2721461057662964, + "learning_rate": 8.135735082331059e-05, + "loss": 1.7614, + "step": 9961 + }, + { + "epoch": 3.057704112952732, + "grad_norm": 0.25157424807548523, + "learning_rate": 8.135347909479843e-05, + "loss": 1.7598, + "step": 9962 + }, + { + "epoch": 3.0580110497237567, + "grad_norm": 0.25798025727272034, + "learning_rate": 8.13496070564328e-05, + "loss": 1.7823, + "step": 9963 + }, + { + "epoch": 3.058317986494782, + "grad_norm": 0.30775198340415955, + "learning_rate": 8.134573470825199e-05, + "loss": 1.7755, + "step": 9964 + }, + { + "epoch": 3.0586249232658074, + "grad_norm": 0.28916797041893005, + "learning_rate": 8.134186205029426e-05, + "loss": 1.8189, + "step": 9965 + }, + { + "epoch": 
3.0589318600368323, + "grad_norm": 0.2829149067401886, + "learning_rate": 8.133798908259787e-05, + "loss": 1.8546, + "step": 9966 + }, + { + "epoch": 3.0592387968078576, + "grad_norm": 0.2884117662906647, + "learning_rate": 8.13341158052011e-05, + "loss": 1.7705, + "step": 9967 + }, + { + "epoch": 3.059545733578883, + "grad_norm": 0.28311973810195923, + "learning_rate": 8.133024221814225e-05, + "loss": 1.8147, + "step": 9968 + }, + { + "epoch": 3.059852670349908, + "grad_norm": 0.25405213236808777, + "learning_rate": 8.132636832145957e-05, + "loss": 1.7813, + "step": 9969 + }, + { + "epoch": 3.060159607120933, + "grad_norm": 0.3082229793071747, + "learning_rate": 8.132249411519137e-05, + "loss": 1.8536, + "step": 9970 + }, + { + "epoch": 3.060466543891958, + "grad_norm": 0.29918181896209717, + "learning_rate": 8.13186195993759e-05, + "loss": 1.8181, + "step": 9971 + }, + { + "epoch": 3.0607734806629834, + "grad_norm": 0.3025238811969757, + "learning_rate": 8.13147447740515e-05, + "loss": 1.7785, + "step": 9972 + }, + { + "epoch": 3.0610804174340087, + "grad_norm": 0.2798222303390503, + "learning_rate": 8.131086963925643e-05, + "loss": 1.7873, + "step": 9973 + }, + { + "epoch": 3.0613873542050336, + "grad_norm": 0.32636210322380066, + "learning_rate": 8.130699419502898e-05, + "loss": 1.882, + "step": 9974 + }, + { + "epoch": 3.061694290976059, + "grad_norm": 0.27722054719924927, + "learning_rate": 8.130311844140748e-05, + "loss": 1.7788, + "step": 9975 + }, + { + "epoch": 3.0620012277470843, + "grad_norm": 0.289156436920166, + "learning_rate": 8.129924237843023e-05, + "loss": 1.8591, + "step": 9976 + }, + { + "epoch": 3.062308164518109, + "grad_norm": 0.2839665412902832, + "learning_rate": 8.12953660061355e-05, + "loss": 1.8255, + "step": 9977 + }, + { + "epoch": 3.0626151012891345, + "grad_norm": 0.2650148272514343, + "learning_rate": 8.129148932456161e-05, + "loss": 1.8353, + "step": 9978 + }, + { + "epoch": 3.06292203806016, + "grad_norm": 0.2884560227394104, + 
"learning_rate": 8.128761233374691e-05, + "loss": 1.8099, + "step": 9979 + }, + { + "epoch": 3.0632289748311847, + "grad_norm": 0.2610029876232147, + "learning_rate": 8.128373503372967e-05, + "loss": 1.8173, + "step": 9980 + }, + { + "epoch": 3.06353591160221, + "grad_norm": 0.32512393593788147, + "learning_rate": 8.127985742454822e-05, + "loss": 1.8619, + "step": 9981 + }, + { + "epoch": 3.063842848373235, + "grad_norm": 0.3382968604564667, + "learning_rate": 8.127597950624091e-05, + "loss": 1.831, + "step": 9982 + }, + { + "epoch": 3.0641497851442603, + "grad_norm": 0.33773133158683777, + "learning_rate": 8.127210127884602e-05, + "loss": 1.8194, + "step": 9983 + }, + { + "epoch": 3.0644567219152856, + "grad_norm": 0.31642746925354004, + "learning_rate": 8.126822274240188e-05, + "loss": 1.8782, + "step": 9984 + }, + { + "epoch": 3.0647636586863105, + "grad_norm": 0.2476506233215332, + "learning_rate": 8.126434389694686e-05, + "loss": 1.7866, + "step": 9985 + }, + { + "epoch": 3.065070595457336, + "grad_norm": 0.27296319603919983, + "learning_rate": 8.126046474251927e-05, + "loss": 1.8276, + "step": 9986 + }, + { + "epoch": 3.0653775322283607, + "grad_norm": 0.353865385055542, + "learning_rate": 8.125658527915744e-05, + "loss": 1.9525, + "step": 9987 + }, + { + "epoch": 3.065684468999386, + "grad_norm": 0.370256632566452, + "learning_rate": 8.12527055068997e-05, + "loss": 1.8514, + "step": 9988 + }, + { + "epoch": 3.0659914057704114, + "grad_norm": 0.30738842487335205, + "learning_rate": 8.124882542578442e-05, + "loss": 1.8125, + "step": 9989 + }, + { + "epoch": 3.0662983425414363, + "grad_norm": 0.3151233494281769, + "learning_rate": 8.124494503584995e-05, + "loss": 1.8165, + "step": 9990 + }, + { + "epoch": 3.0666052793124616, + "grad_norm": 0.29071590304374695, + "learning_rate": 8.124106433713458e-05, + "loss": 1.7617, + "step": 9991 + }, + { + "epoch": 3.066912216083487, + "grad_norm": 0.2898697853088379, + "learning_rate": 8.123718332967672e-05, + "loss": 
1.7779, + "step": 9992 + }, + { + "epoch": 3.067219152854512, + "grad_norm": 0.26601701974868774, + "learning_rate": 8.123330201351471e-05, + "loss": 1.8307, + "step": 9993 + }, + { + "epoch": 3.067526089625537, + "grad_norm": 0.2622119188308716, + "learning_rate": 8.12294203886869e-05, + "loss": 1.7958, + "step": 9994 + }, + { + "epoch": 3.0678330263965625, + "grad_norm": 0.29709386825561523, + "learning_rate": 8.122553845523166e-05, + "loss": 1.7799, + "step": 9995 + }, + { + "epoch": 3.0681399631675874, + "grad_norm": 0.31267789006233215, + "learning_rate": 8.122165621318733e-05, + "loss": 1.8149, + "step": 9996 + }, + { + "epoch": 3.0684468999386127, + "grad_norm": 0.3076523244380951, + "learning_rate": 8.121777366259232e-05, + "loss": 1.7701, + "step": 9997 + }, + { + "epoch": 3.0687538367096376, + "grad_norm": 0.30096009373664856, + "learning_rate": 8.121389080348496e-05, + "loss": 1.8323, + "step": 9998 + }, + { + "epoch": 3.069060773480663, + "grad_norm": 0.25739142298698425, + "learning_rate": 8.121000763590363e-05, + "loss": 1.8105, + "step": 9999 + }, + { + "epoch": 3.0693677102516883, + "grad_norm": 0.2780844271183014, + "learning_rate": 8.120612415988671e-05, + "loss": 1.8502, + "step": 10000 + }, + { + "epoch": 3.069674647022713, + "grad_norm": 0.3316378593444824, + "learning_rate": 8.120224037547259e-05, + "loss": 1.8244, + "step": 10001 + }, + { + "epoch": 3.0699815837937385, + "grad_norm": 0.261129766702652, + "learning_rate": 8.119835628269964e-05, + "loss": 1.7769, + "step": 10002 + }, + { + "epoch": 3.070288520564764, + "grad_norm": 0.29213985800743103, + "learning_rate": 8.119447188160625e-05, + "loss": 1.7717, + "step": 10003 + }, + { + "epoch": 3.0705954573357888, + "grad_norm": 0.38545623421669006, + "learning_rate": 8.11905871722308e-05, + "loss": 1.8433, + "step": 10004 + }, + { + "epoch": 3.070902394106814, + "grad_norm": 0.3617223799228668, + "learning_rate": 8.118670215461168e-05, + "loss": 1.8172, + "step": 10005 + }, + { + "epoch": 
3.071209330877839, + "grad_norm": 0.3241543769836426, + "learning_rate": 8.11828168287873e-05, + "loss": 1.8325, + "step": 10006 + }, + { + "epoch": 3.0715162676488643, + "grad_norm": 0.3538578152656555, + "learning_rate": 8.117893119479605e-05, + "loss": 1.8188, + "step": 10007 + }, + { + "epoch": 3.0718232044198897, + "grad_norm": 0.3861970603466034, + "learning_rate": 8.117504525267632e-05, + "loss": 1.8518, + "step": 10008 + }, + { + "epoch": 3.0721301411909145, + "grad_norm": 0.35433146357536316, + "learning_rate": 8.117115900246652e-05, + "loss": 1.8601, + "step": 10009 + }, + { + "epoch": 3.07243707796194, + "grad_norm": 0.29796987771987915, + "learning_rate": 8.116727244420507e-05, + "loss": 1.7934, + "step": 10010 + }, + { + "epoch": 3.072744014732965, + "grad_norm": 0.3091779947280884, + "learning_rate": 8.116338557793035e-05, + "loss": 1.8111, + "step": 10011 + }, + { + "epoch": 3.07305095150399, + "grad_norm": 0.2741319537162781, + "learning_rate": 8.11594984036808e-05, + "loss": 1.8079, + "step": 10012 + }, + { + "epoch": 3.0733578882750154, + "grad_norm": 0.28905320167541504, + "learning_rate": 8.115561092149482e-05, + "loss": 1.8475, + "step": 10013 + }, + { + "epoch": 3.0736648250460403, + "grad_norm": 0.2897081673145294, + "learning_rate": 8.115172313141081e-05, + "loss": 1.838, + "step": 10014 + }, + { + "epoch": 3.0739717618170657, + "grad_norm": 0.2620783746242523, + "learning_rate": 8.114783503346725e-05, + "loss": 1.8024, + "step": 10015 + }, + { + "epoch": 3.074278698588091, + "grad_norm": 0.26478636264801025, + "learning_rate": 8.11439466277025e-05, + "loss": 1.8137, + "step": 10016 + }, + { + "epoch": 3.074585635359116, + "grad_norm": 0.2796174883842468, + "learning_rate": 8.114005791415502e-05, + "loss": 1.7976, + "step": 10017 + }, + { + "epoch": 3.074892572130141, + "grad_norm": 0.26813286542892456, + "learning_rate": 8.113616889286325e-05, + "loss": 1.7945, + "step": 10018 + }, + { + "epoch": 3.0751995089011666, + "grad_norm": 
0.2443828582763672, + "learning_rate": 8.11322795638656e-05, + "loss": 1.7829, + "step": 10019 + }, + { + "epoch": 3.0755064456721914, + "grad_norm": 0.2981395423412323, + "learning_rate": 8.112838992720053e-05, + "loss": 1.7928, + "step": 10020 + }, + { + "epoch": 3.075813382443217, + "grad_norm": 0.25605037808418274, + "learning_rate": 8.112449998290644e-05, + "loss": 1.8129, + "step": 10021 + }, + { + "epoch": 3.0761203192142417, + "grad_norm": 0.31180307269096375, + "learning_rate": 8.112060973102181e-05, + "loss": 1.7393, + "step": 10022 + }, + { + "epoch": 3.076427255985267, + "grad_norm": 0.3230421543121338, + "learning_rate": 8.111671917158508e-05, + "loss": 1.818, + "step": 10023 + }, + { + "epoch": 3.0767341927562923, + "grad_norm": 0.3158549964427948, + "learning_rate": 8.111282830463468e-05, + "loss": 1.7582, + "step": 10024 + }, + { + "epoch": 3.0770411295273172, + "grad_norm": 0.24524325132369995, + "learning_rate": 8.110893713020908e-05, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 3.0773480662983426, + "grad_norm": 0.2793932259082794, + "learning_rate": 8.110504564834675e-05, + "loss": 1.8551, + "step": 10026 + }, + { + "epoch": 3.077655003069368, + "grad_norm": 0.29629403352737427, + "learning_rate": 8.110115385908612e-05, + "loss": 1.8019, + "step": 10027 + }, + { + "epoch": 3.077961939840393, + "grad_norm": 0.3138490915298462, + "learning_rate": 8.109726176246564e-05, + "loss": 1.8436, + "step": 10028 + }, + { + "epoch": 3.078268876611418, + "grad_norm": 0.29802024364471436, + "learning_rate": 8.10933693585238e-05, + "loss": 1.8158, + "step": 10029 + }, + { + "epoch": 3.078575813382443, + "grad_norm": 0.30785220861434937, + "learning_rate": 8.108947664729907e-05, + "loss": 1.8674, + "step": 10030 + }, + { + "epoch": 3.0788827501534684, + "grad_norm": 0.277662992477417, + "learning_rate": 8.10855836288299e-05, + "loss": 1.8253, + "step": 10031 + }, + { + "epoch": 3.0791896869244937, + "grad_norm": 0.27399590611457825, + "learning_rate": 
8.108169030315477e-05, + "loss": 1.8587, + "step": 10032 + }, + { + "epoch": 3.0794966236955186, + "grad_norm": 0.28398239612579346, + "learning_rate": 8.107779667031217e-05, + "loss": 1.8326, + "step": 10033 + }, + { + "epoch": 3.079803560466544, + "grad_norm": 0.2882741093635559, + "learning_rate": 8.107390273034057e-05, + "loss": 1.785, + "step": 10034 + }, + { + "epoch": 3.0801104972375692, + "grad_norm": 0.271043598651886, + "learning_rate": 8.107000848327843e-05, + "loss": 1.765, + "step": 10035 + }, + { + "epoch": 3.080417434008594, + "grad_norm": 0.2589638829231262, + "learning_rate": 8.106611392916427e-05, + "loss": 1.8136, + "step": 10036 + }, + { + "epoch": 3.0807243707796195, + "grad_norm": 0.3068227469921112, + "learning_rate": 8.106221906803656e-05, + "loss": 1.8034, + "step": 10037 + }, + { + "epoch": 3.0810313075506444, + "grad_norm": 0.2714168131351471, + "learning_rate": 8.105832389993379e-05, + "loss": 1.8007, + "step": 10038 + }, + { + "epoch": 3.0813382443216697, + "grad_norm": 0.2747504711151123, + "learning_rate": 8.105442842489447e-05, + "loss": 1.8135, + "step": 10039 + }, + { + "epoch": 3.081645181092695, + "grad_norm": 0.2719285488128662, + "learning_rate": 8.105053264295708e-05, + "loss": 1.7629, + "step": 10040 + }, + { + "epoch": 3.08195211786372, + "grad_norm": 0.3119582235813141, + "learning_rate": 8.104663655416014e-05, + "loss": 1.7887, + "step": 10041 + }, + { + "epoch": 3.0822590546347453, + "grad_norm": 0.35965192317962646, + "learning_rate": 8.104274015854212e-05, + "loss": 1.8484, + "step": 10042 + }, + { + "epoch": 3.0825659914057706, + "grad_norm": 0.3045980632305145, + "learning_rate": 8.103884345614157e-05, + "loss": 1.8625, + "step": 10043 + }, + { + "epoch": 3.0828729281767955, + "grad_norm": 0.2925138473510742, + "learning_rate": 8.103494644699696e-05, + "loss": 1.9306, + "step": 10044 + }, + { + "epoch": 3.083179864947821, + "grad_norm": 0.2894277274608612, + "learning_rate": 8.103104913114681e-05, + "loss": 1.7796, + 
"step": 10045 + }, + { + "epoch": 3.0834868017188457, + "grad_norm": 0.2776826322078705, + "learning_rate": 8.102715150862967e-05, + "loss": 1.8169, + "step": 10046 + }, + { + "epoch": 3.083793738489871, + "grad_norm": 0.3315230906009674, + "learning_rate": 8.102325357948402e-05, + "loss": 1.8139, + "step": 10047 + }, + { + "epoch": 3.0841006752608964, + "grad_norm": 0.2906761169433594, + "learning_rate": 8.10193553437484e-05, + "loss": 1.8162, + "step": 10048 + }, + { + "epoch": 3.0844076120319213, + "grad_norm": 0.32681339979171753, + "learning_rate": 8.101545680146132e-05, + "loss": 1.8245, + "step": 10049 + }, + { + "epoch": 3.0847145488029466, + "grad_norm": 0.32525795698165894, + "learning_rate": 8.101155795266131e-05, + "loss": 1.8605, + "step": 10050 + }, + { + "epoch": 3.085021485573972, + "grad_norm": 0.31705379486083984, + "learning_rate": 8.100765879738692e-05, + "loss": 1.8214, + "step": 10051 + }, + { + "epoch": 3.085328422344997, + "grad_norm": 0.27772918343544006, + "learning_rate": 8.100375933567668e-05, + "loss": 1.7822, + "step": 10052 + }, + { + "epoch": 3.085635359116022, + "grad_norm": 0.2877809405326843, + "learning_rate": 8.09998595675691e-05, + "loss": 1.7935, + "step": 10053 + }, + { + "epoch": 3.0859422958870475, + "grad_norm": 0.29759806394577026, + "learning_rate": 8.099595949310276e-05, + "loss": 1.8041, + "step": 10054 + }, + { + "epoch": 3.0862492326580724, + "grad_norm": 0.2715320289134979, + "learning_rate": 8.099205911231617e-05, + "loss": 1.7923, + "step": 10055 + }, + { + "epoch": 3.0865561694290977, + "grad_norm": 0.33566340804100037, + "learning_rate": 8.098815842524789e-05, + "loss": 1.7953, + "step": 10056 + }, + { + "epoch": 3.0868631062001226, + "grad_norm": 0.3360871970653534, + "learning_rate": 8.098425743193645e-05, + "loss": 1.8275, + "step": 10057 + }, + { + "epoch": 3.087170042971148, + "grad_norm": 0.2797739803791046, + "learning_rate": 8.098035613242043e-05, + "loss": 1.7597, + "step": 10058 + }, + { + "epoch": 
3.0874769797421733, + "grad_norm": 0.25500187277793884, + "learning_rate": 8.097645452673837e-05, + "loss": 1.8059, + "step": 10059 + }, + { + "epoch": 3.087783916513198, + "grad_norm": 0.28042587637901306, + "learning_rate": 8.097255261492884e-05, + "loss": 1.7954, + "step": 10060 + }, + { + "epoch": 3.0880908532842235, + "grad_norm": 0.3616262376308441, + "learning_rate": 8.096865039703038e-05, + "loss": 1.8605, + "step": 10061 + }, + { + "epoch": 3.0883977900552484, + "grad_norm": 0.3453714847564697, + "learning_rate": 8.096474787308157e-05, + "loss": 1.7643, + "step": 10062 + }, + { + "epoch": 3.0887047268262737, + "grad_norm": 0.3192278742790222, + "learning_rate": 8.096084504312098e-05, + "loss": 1.8415, + "step": 10063 + }, + { + "epoch": 3.089011663597299, + "grad_norm": 0.2714482545852661, + "learning_rate": 8.095694190718715e-05, + "loss": 1.8204, + "step": 10064 + }, + { + "epoch": 3.089318600368324, + "grad_norm": 0.26562005281448364, + "learning_rate": 8.09530384653187e-05, + "loss": 1.7322, + "step": 10065 + }, + { + "epoch": 3.0896255371393493, + "grad_norm": 0.33727800846099854, + "learning_rate": 8.094913471755417e-05, + "loss": 1.8221, + "step": 10066 + }, + { + "epoch": 3.0899324739103746, + "grad_norm": 0.3561044931411743, + "learning_rate": 8.094523066393215e-05, + "loss": 1.8879, + "step": 10067 + }, + { + "epoch": 3.0902394106813995, + "grad_norm": 0.2568742334842682, + "learning_rate": 8.094132630449122e-05, + "loss": 1.8178, + "step": 10068 + }, + { + "epoch": 3.090546347452425, + "grad_norm": 0.4025525450706482, + "learning_rate": 8.093742163926998e-05, + "loss": 1.8186, + "step": 10069 + }, + { + "epoch": 3.09085328422345, + "grad_norm": 0.43863433599472046, + "learning_rate": 8.0933516668307e-05, + "loss": 1.8371, + "step": 10070 + }, + { + "epoch": 3.091160220994475, + "grad_norm": 0.34873950481414795, + "learning_rate": 8.092961139164087e-05, + "loss": 1.8083, + "step": 10071 + }, + { + "epoch": 3.0914671577655004, + "grad_norm": 
0.31433534622192383, + "learning_rate": 8.092570580931021e-05, + "loss": 1.8154, + "step": 10072 + }, + { + "epoch": 3.0917740945365253, + "grad_norm": 0.25523966550827026, + "learning_rate": 8.092179992135358e-05, + "loss": 1.8158, + "step": 10073 + }, + { + "epoch": 3.0920810313075506, + "grad_norm": 0.348469078540802, + "learning_rate": 8.09178937278096e-05, + "loss": 1.8358, + "step": 10074 + }, + { + "epoch": 3.092387968078576, + "grad_norm": 0.33455297350883484, + "learning_rate": 8.091398722871688e-05, + "loss": 1.7779, + "step": 10075 + }, + { + "epoch": 3.092694904849601, + "grad_norm": 0.36544880270957947, + "learning_rate": 8.091008042411403e-05, + "loss": 1.9186, + "step": 10076 + }, + { + "epoch": 3.093001841620626, + "grad_norm": 0.29165831208229065, + "learning_rate": 8.090617331403965e-05, + "loss": 1.8964, + "step": 10077 + }, + { + "epoch": 3.0933087783916515, + "grad_norm": 0.31011059880256653, + "learning_rate": 8.090226589853234e-05, + "loss": 1.8453, + "step": 10078 + }, + { + "epoch": 3.0936157151626764, + "grad_norm": 0.2835703492164612, + "learning_rate": 8.089835817763071e-05, + "loss": 1.7718, + "step": 10079 + }, + { + "epoch": 3.0939226519337018, + "grad_norm": 0.2910583019256592, + "learning_rate": 8.08944501513734e-05, + "loss": 1.7881, + "step": 10080 + }, + { + "epoch": 3.0942295887047266, + "grad_norm": 0.391303688287735, + "learning_rate": 8.089054181979905e-05, + "loss": 1.7915, + "step": 10081 + }, + { + "epoch": 3.094536525475752, + "grad_norm": 0.4119330048561096, + "learning_rate": 8.088663318294623e-05, + "loss": 1.7975, + "step": 10082 + }, + { + "epoch": 3.0948434622467773, + "grad_norm": 0.2980102002620697, + "learning_rate": 8.088272424085361e-05, + "loss": 1.805, + "step": 10083 + }, + { + "epoch": 3.095150399017802, + "grad_norm": 0.3089980483055115, + "learning_rate": 8.087881499355983e-05, + "loss": 1.8265, + "step": 10084 + }, + { + "epoch": 3.0954573357888275, + "grad_norm": 0.3851003348827362, + "learning_rate": 
8.087490544110348e-05, + "loss": 1.8174, + "step": 10085 + }, + { + "epoch": 3.095764272559853, + "grad_norm": 0.42357420921325684, + "learning_rate": 8.08709955835232e-05, + "loss": 1.8083, + "step": 10086 + }, + { + "epoch": 3.0960712093308778, + "grad_norm": 0.291777640581131, + "learning_rate": 8.086708542085768e-05, + "loss": 1.7713, + "step": 10087 + }, + { + "epoch": 3.096378146101903, + "grad_norm": 0.2563805878162384, + "learning_rate": 8.086317495314552e-05, + "loss": 1.7691, + "step": 10088 + }, + { + "epoch": 3.096685082872928, + "grad_norm": 0.3418877422809601, + "learning_rate": 8.085926418042536e-05, + "loss": 1.8547, + "step": 10089 + }, + { + "epoch": 3.0969920196439533, + "grad_norm": 0.3859385550022125, + "learning_rate": 8.085535310273589e-05, + "loss": 1.8226, + "step": 10090 + }, + { + "epoch": 3.0972989564149787, + "grad_norm": 0.3427267372608185, + "learning_rate": 8.085144172011571e-05, + "loss": 1.837, + "step": 10091 + }, + { + "epoch": 3.0976058931860035, + "grad_norm": 0.29290953278541565, + "learning_rate": 8.084753003260352e-05, + "loss": 1.8392, + "step": 10092 + }, + { + "epoch": 3.097912829957029, + "grad_norm": 0.33282020688056946, + "learning_rate": 8.084361804023795e-05, + "loss": 1.8351, + "step": 10093 + }, + { + "epoch": 3.098219766728054, + "grad_norm": 0.3802134394645691, + "learning_rate": 8.083970574305768e-05, + "loss": 1.7467, + "step": 10094 + }, + { + "epoch": 3.098526703499079, + "grad_norm": 0.3142111897468567, + "learning_rate": 8.083579314110135e-05, + "loss": 1.7966, + "step": 10095 + }, + { + "epoch": 3.0988336402701044, + "grad_norm": 0.2956278324127197, + "learning_rate": 8.083188023440765e-05, + "loss": 1.8724, + "step": 10096 + }, + { + "epoch": 3.0991405770411293, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.082796702301522e-05, + "loss": 1.8448, + "step": 10097 + }, + { + "epoch": 3.0994475138121547, + "grad_norm": 0.29358017444610596, + "learning_rate": 8.082405350696276e-05, + "loss": 1.8679, 
+ "step": 10098 + }, + { + "epoch": 3.09975445058318, + "grad_norm": 0.36439722776412964, + "learning_rate": 8.082013968628893e-05, + "loss": 1.8801, + "step": 10099 + }, + { + "epoch": 3.100061387354205, + "grad_norm": 0.3565322458744049, + "learning_rate": 8.081622556103244e-05, + "loss": 1.794, + "step": 10100 + }, + { + "epoch": 3.1003683241252302, + "grad_norm": 0.2841760814189911, + "learning_rate": 8.081231113123191e-05, + "loss": 1.7593, + "step": 10101 + }, + { + "epoch": 3.1006752608962556, + "grad_norm": 0.28589630126953125, + "learning_rate": 8.080839639692608e-05, + "loss": 1.864, + "step": 10102 + }, + { + "epoch": 3.1009821976672804, + "grad_norm": 0.3595057427883148, + "learning_rate": 8.080448135815362e-05, + "loss": 1.8067, + "step": 10103 + }, + { + "epoch": 3.101289134438306, + "grad_norm": 0.3909708261489868, + "learning_rate": 8.080056601495322e-05, + "loss": 1.8601, + "step": 10104 + }, + { + "epoch": 3.1015960712093307, + "grad_norm": 0.35180148482322693, + "learning_rate": 8.079665036736358e-05, + "loss": 1.8328, + "step": 10105 + }, + { + "epoch": 3.101903007980356, + "grad_norm": 0.3065175712108612, + "learning_rate": 8.079273441542338e-05, + "loss": 1.8449, + "step": 10106 + }, + { + "epoch": 3.1022099447513813, + "grad_norm": 0.31358617544174194, + "learning_rate": 8.078881815917134e-05, + "loss": 1.8325, + "step": 10107 + }, + { + "epoch": 3.1025168815224062, + "grad_norm": 0.4737118184566498, + "learning_rate": 8.078490159864614e-05, + "loss": 1.8232, + "step": 10108 + }, + { + "epoch": 3.1028238182934316, + "grad_norm": 0.435148686170578, + "learning_rate": 8.078098473388651e-05, + "loss": 1.8227, + "step": 10109 + }, + { + "epoch": 3.103130755064457, + "grad_norm": 0.3080987334251404, + "learning_rate": 8.077706756493115e-05, + "loss": 1.8072, + "step": 10110 + }, + { + "epoch": 3.103437691835482, + "grad_norm": 0.3225170075893402, + "learning_rate": 8.077315009181876e-05, + "loss": 1.7716, + "step": 10111 + }, + { + "epoch": 
3.103744628606507, + "grad_norm": 0.46642443537712097, + "learning_rate": 8.076923231458808e-05, + "loss": 1.8295, + "step": 10112 + }, + { + "epoch": 3.104051565377532, + "grad_norm": 0.42561766505241394, + "learning_rate": 8.07653142332778e-05, + "loss": 1.8553, + "step": 10113 + }, + { + "epoch": 3.1043585021485574, + "grad_norm": 0.27187541127204895, + "learning_rate": 8.076139584792664e-05, + "loss": 1.7937, + "step": 10114 + }, + { + "epoch": 3.1046654389195827, + "grad_norm": 0.27822238206863403, + "learning_rate": 8.075747715857335e-05, + "loss": 1.8151, + "step": 10115 + }, + { + "epoch": 3.1049723756906076, + "grad_norm": 0.40106478333473206, + "learning_rate": 8.075355816525665e-05, + "loss": 1.8637, + "step": 10116 + }, + { + "epoch": 3.105279312461633, + "grad_norm": 0.33455124497413635, + "learning_rate": 8.074963886801525e-05, + "loss": 1.8543, + "step": 10117 + }, + { + "epoch": 3.1055862492326582, + "grad_norm": 0.32246437668800354, + "learning_rate": 8.07457192668879e-05, + "loss": 1.7907, + "step": 10118 + }, + { + "epoch": 3.105893186003683, + "grad_norm": 0.45360109210014343, + "learning_rate": 8.074179936191332e-05, + "loss": 1.7404, + "step": 10119 + }, + { + "epoch": 3.1062001227747085, + "grad_norm": 0.445916086435318, + "learning_rate": 8.07378791531303e-05, + "loss": 1.778, + "step": 10120 + }, + { + "epoch": 3.1065070595457334, + "grad_norm": 0.28561538457870483, + "learning_rate": 8.073395864057751e-05, + "loss": 1.8723, + "step": 10121 + }, + { + "epoch": 3.1068139963167587, + "grad_norm": 0.3258218467235565, + "learning_rate": 8.073003782429373e-05, + "loss": 1.8106, + "step": 10122 + }, + { + "epoch": 3.107120933087784, + "grad_norm": 0.5459560751914978, + "learning_rate": 8.07261167043177e-05, + "loss": 1.8022, + "step": 10123 + }, + { + "epoch": 3.107427869858809, + "grad_norm": 0.4828549921512604, + "learning_rate": 8.072219528068819e-05, + "loss": 1.7556, + "step": 10124 + }, + { + "epoch": 3.1077348066298343, + "grad_norm": 
0.24075324833393097, + "learning_rate": 8.071827355344393e-05, + "loss": 1.7901, + "step": 10125 + }, + { + "epoch": 3.1080417434008596, + "grad_norm": 0.44677188992500305, + "learning_rate": 8.071435152262367e-05, + "loss": 1.7858, + "step": 10126 + }, + { + "epoch": 3.1083486801718845, + "grad_norm": 0.49862590432167053, + "learning_rate": 8.071042918826622e-05, + "loss": 1.805, + "step": 10127 + }, + { + "epoch": 3.10865561694291, + "grad_norm": 0.30883491039276123, + "learning_rate": 8.07065065504103e-05, + "loss": 1.7693, + "step": 10128 + }, + { + "epoch": 3.108962553713935, + "grad_norm": 0.29583030939102173, + "learning_rate": 8.070258360909467e-05, + "loss": 1.8141, + "step": 10129 + }, + { + "epoch": 3.10926949048496, + "grad_norm": 0.3595346510410309, + "learning_rate": 8.069866036435812e-05, + "loss": 1.8286, + "step": 10130 + }, + { + "epoch": 3.1095764272559854, + "grad_norm": 0.3215504288673401, + "learning_rate": 8.069473681623942e-05, + "loss": 1.8557, + "step": 10131 + }, + { + "epoch": 3.1098833640270103, + "grad_norm": 0.29734939336776733, + "learning_rate": 8.069081296477734e-05, + "loss": 1.7996, + "step": 10132 + }, + { + "epoch": 3.1101903007980356, + "grad_norm": 0.33546003699302673, + "learning_rate": 8.068688881001065e-05, + "loss": 1.8307, + "step": 10133 + }, + { + "epoch": 3.110497237569061, + "grad_norm": 0.3886832296848297, + "learning_rate": 8.068296435197814e-05, + "loss": 1.751, + "step": 10134 + }, + { + "epoch": 3.110804174340086, + "grad_norm": 0.34505394101142883, + "learning_rate": 8.06790395907186e-05, + "loss": 1.7543, + "step": 10135 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.27018141746520996, + "learning_rate": 8.06751145262708e-05, + "loss": 1.8109, + "step": 10136 + }, + { + "epoch": 3.1114180478821365, + "grad_norm": 0.3367149531841278, + "learning_rate": 8.067118915867355e-05, + "loss": 1.8025, + "step": 10137 + }, + { + "epoch": 3.1117249846531614, + "grad_norm": 0.40811091661453247, + "learning_rate": 
8.066726348796562e-05, + "loss": 1.7327, + "step": 10138 + }, + { + "epoch": 3.1120319214241867, + "grad_norm": 0.3511471152305603, + "learning_rate": 8.066333751418583e-05, + "loss": 1.8711, + "step": 10139 + }, + { + "epoch": 3.1123388581952116, + "grad_norm": 0.3112446367740631, + "learning_rate": 8.065941123737295e-05, + "loss": 1.8621, + "step": 10140 + }, + { + "epoch": 3.112645794966237, + "grad_norm": 0.3424238860607147, + "learning_rate": 8.065548465756581e-05, + "loss": 1.8383, + "step": 10141 + }, + { + "epoch": 3.1129527317372623, + "grad_norm": 0.380013108253479, + "learning_rate": 8.06515577748032e-05, + "loss": 1.8121, + "step": 10142 + }, + { + "epoch": 3.113259668508287, + "grad_norm": 0.2650558650493622, + "learning_rate": 8.064763058912393e-05, + "loss": 1.866, + "step": 10143 + }, + { + "epoch": 3.1135666052793125, + "grad_norm": 0.30580762028694153, + "learning_rate": 8.06437031005668e-05, + "loss": 1.7769, + "step": 10144 + }, + { + "epoch": 3.113873542050338, + "grad_norm": 0.29927194118499756, + "learning_rate": 8.063977530917066e-05, + "loss": 1.7897, + "step": 10145 + }, + { + "epoch": 3.1141804788213627, + "grad_norm": 0.24322012066841125, + "learning_rate": 8.063584721497429e-05, + "loss": 1.7968, + "step": 10146 + }, + { + "epoch": 3.114487415592388, + "grad_norm": 0.3082945644855499, + "learning_rate": 8.063191881801651e-05, + "loss": 1.8456, + "step": 10147 + }, + { + "epoch": 3.114794352363413, + "grad_norm": 0.3247329890727997, + "learning_rate": 8.062799011833617e-05, + "loss": 1.7436, + "step": 10148 + }, + { + "epoch": 3.1151012891344383, + "grad_norm": 0.27591946721076965, + "learning_rate": 8.062406111597207e-05, + "loss": 1.7976, + "step": 10149 + }, + { + "epoch": 3.1154082259054636, + "grad_norm": 0.2752058804035187, + "learning_rate": 8.062013181096306e-05, + "loss": 1.7814, + "step": 10150 + }, + { + "epoch": 3.1157151626764885, + "grad_norm": 0.3207196891307831, + "learning_rate": 8.061620220334795e-05, + "loss": 1.7767, 
+ "step": 10151 + }, + { + "epoch": 3.116022099447514, + "grad_norm": 0.2895309627056122, + "learning_rate": 8.061227229316559e-05, + "loss": 1.8588, + "step": 10152 + }, + { + "epoch": 3.116329036218539, + "grad_norm": 0.333843469619751, + "learning_rate": 8.060834208045481e-05, + "loss": 1.7871, + "step": 10153 + }, + { + "epoch": 3.116635972989564, + "grad_norm": 0.43877774477005005, + "learning_rate": 8.060441156525445e-05, + "loss": 1.8165, + "step": 10154 + }, + { + "epoch": 3.1169429097605894, + "grad_norm": 0.35700589418411255, + "learning_rate": 8.060048074760337e-05, + "loss": 1.777, + "step": 10155 + }, + { + "epoch": 3.1172498465316143, + "grad_norm": 0.26124534010887146, + "learning_rate": 8.059654962754039e-05, + "loss": 1.8343, + "step": 10156 + }, + { + "epoch": 3.1175567833026396, + "grad_norm": 0.331444650888443, + "learning_rate": 8.059261820510438e-05, + "loss": 1.9437, + "step": 10157 + }, + { + "epoch": 3.117863720073665, + "grad_norm": 0.31657731533050537, + "learning_rate": 8.058868648033419e-05, + "loss": 1.7621, + "step": 10158 + }, + { + "epoch": 3.11817065684469, + "grad_norm": 0.2785957455635071, + "learning_rate": 8.058475445326867e-05, + "loss": 1.9049, + "step": 10159 + }, + { + "epoch": 3.118477593615715, + "grad_norm": 0.2605743408203125, + "learning_rate": 8.058082212394667e-05, + "loss": 1.7895, + "step": 10160 + }, + { + "epoch": 3.1187845303867405, + "grad_norm": 0.2981378138065338, + "learning_rate": 8.057688949240707e-05, + "loss": 1.8373, + "step": 10161 + }, + { + "epoch": 3.1190914671577654, + "grad_norm": 0.2944273054599762, + "learning_rate": 8.057295655868873e-05, + "loss": 1.8373, + "step": 10162 + }, + { + "epoch": 3.1193984039287908, + "grad_norm": 0.2696721851825714, + "learning_rate": 8.056902332283052e-05, + "loss": 1.8023, + "step": 10163 + }, + { + "epoch": 3.1197053406998156, + "grad_norm": 0.27659857273101807, + "learning_rate": 8.056508978487128e-05, + "loss": 1.8453, + "step": 10164 + }, + { + "epoch": 
3.120012277470841, + "grad_norm": 0.2982441186904907, + "learning_rate": 8.056115594484992e-05, + "loss": 1.9072, + "step": 10165 + }, + { + "epoch": 3.1203192142418663, + "grad_norm": 0.3136404752731323, + "learning_rate": 8.055722180280531e-05, + "loss": 1.8585, + "step": 10166 + }, + { + "epoch": 3.120626151012891, + "grad_norm": 0.2979940176010132, + "learning_rate": 8.055328735877631e-05, + "loss": 1.8699, + "step": 10167 + }, + { + "epoch": 3.1209330877839165, + "grad_norm": 0.2585618793964386, + "learning_rate": 8.054935261280184e-05, + "loss": 1.8323, + "step": 10168 + }, + { + "epoch": 3.121240024554942, + "grad_norm": 0.28734859824180603, + "learning_rate": 8.054541756492075e-05, + "loss": 1.8694, + "step": 10169 + }, + { + "epoch": 3.1215469613259668, + "grad_norm": 0.30582788586616516, + "learning_rate": 8.054148221517193e-05, + "loss": 1.856, + "step": 10170 + }, + { + "epoch": 3.121853898096992, + "grad_norm": 0.3128255009651184, + "learning_rate": 8.053754656359429e-05, + "loss": 1.8329, + "step": 10171 + }, + { + "epoch": 3.122160834868017, + "grad_norm": 0.2845318615436554, + "learning_rate": 8.053361061022671e-05, + "loss": 1.8111, + "step": 10172 + }, + { + "epoch": 3.1224677716390423, + "grad_norm": 0.2994609773159027, + "learning_rate": 8.05296743551081e-05, + "loss": 1.8157, + "step": 10173 + }, + { + "epoch": 3.1227747084100677, + "grad_norm": 0.26397961378097534, + "learning_rate": 8.052573779827737e-05, + "loss": 1.8572, + "step": 10174 + }, + { + "epoch": 3.1230816451810925, + "grad_norm": 0.2911500334739685, + "learning_rate": 8.052180093977339e-05, + "loss": 1.8312, + "step": 10175 + }, + { + "epoch": 3.123388581952118, + "grad_norm": 0.33455008268356323, + "learning_rate": 8.051786377963509e-05, + "loss": 1.8748, + "step": 10176 + }, + { + "epoch": 3.123695518723143, + "grad_norm": 0.3127586841583252, + "learning_rate": 8.051392631790135e-05, + "loss": 1.8224, + "step": 10177 + }, + { + "epoch": 3.124002455494168, + "grad_norm": 
0.2910686433315277, + "learning_rate": 8.050998855461113e-05, + "loss": 1.8557, + "step": 10178 + }, + { + "epoch": 3.1243093922651934, + "grad_norm": 0.2849208414554596, + "learning_rate": 8.050605048980333e-05, + "loss": 1.82, + "step": 10179 + }, + { + "epoch": 3.1246163290362183, + "grad_norm": 0.35189691185951233, + "learning_rate": 8.050211212351683e-05, + "loss": 1.7884, + "step": 10180 + }, + { + "epoch": 3.1249232658072437, + "grad_norm": 0.3641110360622406, + "learning_rate": 8.04981734557906e-05, + "loss": 1.7984, + "step": 10181 + }, + { + "epoch": 3.125230202578269, + "grad_norm": 0.3111717700958252, + "learning_rate": 8.049423448666353e-05, + "loss": 1.8134, + "step": 10182 + }, + { + "epoch": 3.125537139349294, + "grad_norm": 0.2608453631401062, + "learning_rate": 8.049029521617457e-05, + "loss": 1.765, + "step": 10183 + }, + { + "epoch": 3.1258440761203192, + "grad_norm": 0.28779423236846924, + "learning_rate": 8.048635564436265e-05, + "loss": 1.8355, + "step": 10184 + }, + { + "epoch": 3.1261510128913446, + "grad_norm": 0.38227665424346924, + "learning_rate": 8.048241577126668e-05, + "loss": 1.8487, + "step": 10185 + }, + { + "epoch": 3.1264579496623695, + "grad_norm": 0.3603171706199646, + "learning_rate": 8.047847559692562e-05, + "loss": 1.8035, + "step": 10186 + }, + { + "epoch": 3.126764886433395, + "grad_norm": 0.21950066089630127, + "learning_rate": 8.04745351213784e-05, + "loss": 1.7399, + "step": 10187 + }, + { + "epoch": 3.12707182320442, + "grad_norm": 0.2796075642108917, + "learning_rate": 8.047059434466395e-05, + "loss": 1.8229, + "step": 10188 + }, + { + "epoch": 3.127378759975445, + "grad_norm": 0.3382907807826996, + "learning_rate": 8.046665326682125e-05, + "loss": 1.7713, + "step": 10189 + }, + { + "epoch": 3.1276856967464703, + "grad_norm": 0.36472463607788086, + "learning_rate": 8.04627118878892e-05, + "loss": 1.8129, + "step": 10190 + }, + { + "epoch": 3.1279926335174952, + "grad_norm": 0.2971884310245514, + "learning_rate": 
8.045877020790679e-05, + "loss": 1.7894, + "step": 10191 + }, + { + "epoch": 3.1282995702885206, + "grad_norm": 0.2292303442955017, + "learning_rate": 8.045482822691297e-05, + "loss": 1.7637, + "step": 10192 + }, + { + "epoch": 3.128606507059546, + "grad_norm": 0.300750732421875, + "learning_rate": 8.045088594494668e-05, + "loss": 1.7678, + "step": 10193 + }, + { + "epoch": 3.128913443830571, + "grad_norm": 0.3121531009674072, + "learning_rate": 8.044694336204688e-05, + "loss": 1.8651, + "step": 10194 + }, + { + "epoch": 3.129220380601596, + "grad_norm": 0.2456093430519104, + "learning_rate": 8.044300047825254e-05, + "loss": 1.7769, + "step": 10195 + }, + { + "epoch": 3.129527317372621, + "grad_norm": 0.25085800886154175, + "learning_rate": 8.043905729360264e-05, + "loss": 1.7723, + "step": 10196 + }, + { + "epoch": 3.1298342541436464, + "grad_norm": 0.2505287826061249, + "learning_rate": 8.043511380813612e-05, + "loss": 1.7943, + "step": 10197 + }, + { + "epoch": 3.1301411909146717, + "grad_norm": 0.27144530415534973, + "learning_rate": 8.043117002189198e-05, + "loss": 1.8119, + "step": 10198 + }, + { + "epoch": 3.1304481276856966, + "grad_norm": 0.2702989876270294, + "learning_rate": 8.042722593490916e-05, + "loss": 1.8517, + "step": 10199 + }, + { + "epoch": 3.130755064456722, + "grad_norm": 0.2585136890411377, + "learning_rate": 8.042328154722667e-05, + "loss": 1.8382, + "step": 10200 + }, + { + "epoch": 3.1310620012277472, + "grad_norm": 0.26306065917015076, + "learning_rate": 8.041933685888348e-05, + "loss": 1.8211, + "step": 10201 + }, + { + "epoch": 3.131368937998772, + "grad_norm": 0.2208927720785141, + "learning_rate": 8.041539186991858e-05, + "loss": 1.7765, + "step": 10202 + }, + { + "epoch": 3.1316758747697975, + "grad_norm": 0.2756440043449402, + "learning_rate": 8.041144658037095e-05, + "loss": 1.898, + "step": 10203 + }, + { + "epoch": 3.131982811540823, + "grad_norm": 0.29718101024627686, + "learning_rate": 8.040750099027958e-05, + "loss": 1.8226, 
+ "step": 10204 + }, + { + "epoch": 3.1322897483118477, + "grad_norm": 0.3166738748550415, + "learning_rate": 8.040355509968345e-05, + "loss": 1.8129, + "step": 10205 + }, + { + "epoch": 3.132596685082873, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.039960890862158e-05, + "loss": 1.8915, + "step": 10206 + }, + { + "epoch": 3.132903621853898, + "grad_norm": 0.3015006184577942, + "learning_rate": 8.039566241713297e-05, + "loss": 1.8389, + "step": 10207 + }, + { + "epoch": 3.1332105586249233, + "grad_norm": 0.35226619243621826, + "learning_rate": 8.039171562525659e-05, + "loss": 1.7287, + "step": 10208 + }, + { + "epoch": 3.1335174953959486, + "grad_norm": 0.4290136694908142, + "learning_rate": 8.038776853303146e-05, + "loss": 1.8768, + "step": 10209 + }, + { + "epoch": 3.1338244321669735, + "grad_norm": 0.2828960418701172, + "learning_rate": 8.03838211404966e-05, + "loss": 1.7552, + "step": 10210 + }, + { + "epoch": 3.134131368937999, + "grad_norm": 0.3781953752040863, + "learning_rate": 8.0379873447691e-05, + "loss": 1.7812, + "step": 10211 + }, + { + "epoch": 3.1344383057090237, + "grad_norm": 0.4282926023006439, + "learning_rate": 8.037592545465371e-05, + "loss": 1.84, + "step": 10212 + }, + { + "epoch": 3.134745242480049, + "grad_norm": 0.2622411251068115, + "learning_rate": 8.03719771614237e-05, + "loss": 1.8114, + "step": 10213 + }, + { + "epoch": 3.1350521792510744, + "grad_norm": 0.34881457686424255, + "learning_rate": 8.036802856804001e-05, + "loss": 1.7694, + "step": 10214 + }, + { + "epoch": 3.1353591160220993, + "grad_norm": 0.40797632932662964, + "learning_rate": 8.036407967454167e-05, + "loss": 1.7595, + "step": 10215 + }, + { + "epoch": 3.1356660527931246, + "grad_norm": 0.24902814626693726, + "learning_rate": 8.036013048096769e-05, + "loss": 1.8068, + "step": 10216 + }, + { + "epoch": 3.13597298956415, + "grad_norm": 0.3682909607887268, + "learning_rate": 8.035618098735711e-05, + "loss": 1.8519, + "step": 10217 + }, + { + "epoch": 
3.136279926335175, + "grad_norm": 0.6111233234405518, + "learning_rate": 8.035223119374895e-05, + "loss": 1.9254, + "step": 10218 + }, + { + "epoch": 3.1365868631062, + "grad_norm": 0.4793062210083008, + "learning_rate": 8.034828110018227e-05, + "loss": 1.786, + "step": 10219 + }, + { + "epoch": 3.1368937998772255, + "grad_norm": 0.3074932396411896, + "learning_rate": 8.034433070669607e-05, + "loss": 1.8495, + "step": 10220 + }, + { + "epoch": 3.1372007366482504, + "grad_norm": 0.4366479218006134, + "learning_rate": 8.034038001332942e-05, + "loss": 1.8501, + "step": 10221 + }, + { + "epoch": 3.1375076734192757, + "grad_norm": 0.4660070538520813, + "learning_rate": 8.033642902012135e-05, + "loss": 1.8317, + "step": 10222 + }, + { + "epoch": 3.1378146101903006, + "grad_norm": 0.3452899158000946, + "learning_rate": 8.03324777271109e-05, + "loss": 1.8702, + "step": 10223 + }, + { + "epoch": 3.138121546961326, + "grad_norm": 0.3658824563026428, + "learning_rate": 8.032852613433713e-05, + "loss": 1.8754, + "step": 10224 + }, + { + "epoch": 3.1384284837323513, + "grad_norm": 0.3777768909931183, + "learning_rate": 8.03245742418391e-05, + "loss": 1.8613, + "step": 10225 + }, + { + "epoch": 3.138735420503376, + "grad_norm": 0.3873192071914673, + "learning_rate": 8.032062204965582e-05, + "loss": 1.8438, + "step": 10226 + }, + { + "epoch": 3.1390423572744015, + "grad_norm": 0.30686715245246887, + "learning_rate": 8.031666955782641e-05, + "loss": 1.811, + "step": 10227 + }, + { + "epoch": 3.139349294045427, + "grad_norm": 0.2738516330718994, + "learning_rate": 8.03127167663899e-05, + "loss": 1.757, + "step": 10228 + }, + { + "epoch": 3.1396562308164517, + "grad_norm": 0.3093133270740509, + "learning_rate": 8.030876367538536e-05, + "loss": 1.8181, + "step": 10229 + }, + { + "epoch": 3.139963167587477, + "grad_norm": 0.3247159719467163, + "learning_rate": 8.030481028485185e-05, + "loss": 1.7798, + "step": 10230 + }, + { + "epoch": 3.140270104358502, + "grad_norm": 
0.2855088412761688, + "learning_rate": 8.030085659482845e-05, + "loss": 1.825, + "step": 10231 + }, + { + "epoch": 3.1405770411295273, + "grad_norm": 0.2818242907524109, + "learning_rate": 8.02969026053542e-05, + "loss": 1.7737, + "step": 10232 + }, + { + "epoch": 3.1408839779005526, + "grad_norm": 0.27074751257896423, + "learning_rate": 8.029294831646822e-05, + "loss": 1.8306, + "step": 10233 + }, + { + "epoch": 3.1411909146715775, + "grad_norm": 0.29740920662879944, + "learning_rate": 8.028899372820954e-05, + "loss": 1.8157, + "step": 10234 + }, + { + "epoch": 3.141497851442603, + "grad_norm": 0.30743202567100525, + "learning_rate": 8.028503884061731e-05, + "loss": 1.7626, + "step": 10235 + }, + { + "epoch": 3.141804788213628, + "grad_norm": 0.27812567353248596, + "learning_rate": 8.028108365373058e-05, + "loss": 1.7604, + "step": 10236 + }, + { + "epoch": 3.142111724984653, + "grad_norm": 0.26212629675865173, + "learning_rate": 8.027712816758839e-05, + "loss": 1.8161, + "step": 10237 + }, + { + "epoch": 3.1424186617556784, + "grad_norm": 0.3611658811569214, + "learning_rate": 8.02731723822299e-05, + "loss": 1.8283, + "step": 10238 + }, + { + "epoch": 3.1427255985267033, + "grad_norm": 0.31705498695373535, + "learning_rate": 8.026921629769418e-05, + "loss": 1.7986, + "step": 10239 + }, + { + "epoch": 3.1430325352977286, + "grad_norm": 0.25905972719192505, + "learning_rate": 8.026525991402032e-05, + "loss": 1.7926, + "step": 10240 + }, + { + "epoch": 3.143339472068754, + "grad_norm": 0.42376595735549927, + "learning_rate": 8.026130323124741e-05, + "loss": 1.8275, + "step": 10241 + }, + { + "epoch": 3.143646408839779, + "grad_norm": 0.415556401014328, + "learning_rate": 8.025734624941458e-05, + "loss": 1.7938, + "step": 10242 + }, + { + "epoch": 3.143953345610804, + "grad_norm": 0.3558904528617859, + "learning_rate": 8.025338896856091e-05, + "loss": 1.836, + "step": 10243 + }, + { + "epoch": 3.1442602823818295, + "grad_norm": 0.3091062307357788, + "learning_rate": 
8.024943138872553e-05, + "loss": 1.8285, + "step": 10244 + }, + { + "epoch": 3.1445672191528544, + "grad_norm": 0.2620905041694641, + "learning_rate": 8.024547350994753e-05, + "loss": 1.7115, + "step": 10245 + }, + { + "epoch": 3.1448741559238798, + "grad_norm": 0.25716835260391235, + "learning_rate": 8.024151533226604e-05, + "loss": 1.7702, + "step": 10246 + }, + { + "epoch": 3.1451810926949046, + "grad_norm": 0.250844269990921, + "learning_rate": 8.023755685572017e-05, + "loss": 1.7617, + "step": 10247 + }, + { + "epoch": 3.14548802946593, + "grad_norm": 0.23898956179618835, + "learning_rate": 8.023359808034903e-05, + "loss": 1.7872, + "step": 10248 + }, + { + "epoch": 3.1457949662369553, + "grad_norm": 0.2335387021303177, + "learning_rate": 8.022963900619176e-05, + "loss": 1.7656, + "step": 10249 + }, + { + "epoch": 3.14610190300798, + "grad_norm": 0.21822704374790192, + "learning_rate": 8.022567963328749e-05, + "loss": 1.7706, + "step": 10250 + }, + { + "epoch": 3.1464088397790055, + "grad_norm": 0.2627898156642914, + "learning_rate": 8.022171996167531e-05, + "loss": 1.8559, + "step": 10251 + }, + { + "epoch": 3.146715776550031, + "grad_norm": 0.2530064582824707, + "learning_rate": 8.021775999139441e-05, + "loss": 1.788, + "step": 10252 + }, + { + "epoch": 3.1470227133210558, + "grad_norm": 0.2293635457754135, + "learning_rate": 8.021379972248387e-05, + "loss": 1.8129, + "step": 10253 + }, + { + "epoch": 3.147329650092081, + "grad_norm": 0.27753588557243347, + "learning_rate": 8.020983915498286e-05, + "loss": 1.7957, + "step": 10254 + }, + { + "epoch": 3.147636586863106, + "grad_norm": 0.24507668614387512, + "learning_rate": 8.020587828893051e-05, + "loss": 1.7969, + "step": 10255 + }, + { + "epoch": 3.1479435236341313, + "grad_norm": 0.24818891286849976, + "learning_rate": 8.020191712436598e-05, + "loss": 1.8412, + "step": 10256 + }, + { + "epoch": 3.1482504604051567, + "grad_norm": 0.2463149130344391, + "learning_rate": 8.01979556613284e-05, + "loss": 1.8097, 
+ "step": 10257 + }, + { + "epoch": 3.1485573971761815, + "grad_norm": 0.26742151379585266, + "learning_rate": 8.019399389985692e-05, + "loss": 1.8487, + "step": 10258 + }, + { + "epoch": 3.148864333947207, + "grad_norm": 0.3078254461288452, + "learning_rate": 8.01900318399907e-05, + "loss": 1.8189, + "step": 10259 + }, + { + "epoch": 3.149171270718232, + "grad_norm": 0.3819321393966675, + "learning_rate": 8.018606948176887e-05, + "loss": 1.8019, + "step": 10260 + }, + { + "epoch": 3.149478207489257, + "grad_norm": 0.3932126462459564, + "learning_rate": 8.018210682523061e-05, + "loss": 1.787, + "step": 10261 + }, + { + "epoch": 3.1497851442602824, + "grad_norm": 0.2696186900138855, + "learning_rate": 8.017814387041511e-05, + "loss": 1.8345, + "step": 10262 + }, + { + "epoch": 3.150092081031308, + "grad_norm": 0.32631832361221313, + "learning_rate": 8.017418061736149e-05, + "loss": 1.7724, + "step": 10263 + }, + { + "epoch": 3.1503990178023327, + "grad_norm": 0.36187833547592163, + "learning_rate": 8.017021706610893e-05, + "loss": 1.7829, + "step": 10264 + }, + { + "epoch": 3.150705954573358, + "grad_norm": 0.29678142070770264, + "learning_rate": 8.01662532166966e-05, + "loss": 1.7896, + "step": 10265 + }, + { + "epoch": 3.151012891344383, + "grad_norm": 0.2997078001499176, + "learning_rate": 8.016228906916368e-05, + "loss": 1.8401, + "step": 10266 + }, + { + "epoch": 3.1513198281154082, + "grad_norm": 0.4688792824745178, + "learning_rate": 8.015832462354933e-05, + "loss": 1.8263, + "step": 10267 + }, + { + "epoch": 3.1516267648864336, + "grad_norm": 0.42710503935813904, + "learning_rate": 8.015435987989275e-05, + "loss": 1.8233, + "step": 10268 + }, + { + "epoch": 3.1519337016574585, + "grad_norm": 0.2490987628698349, + "learning_rate": 8.01503948382331e-05, + "loss": 1.7792, + "step": 10269 + }, + { + "epoch": 3.152240638428484, + "grad_norm": 0.400836706161499, + "learning_rate": 8.014642949860957e-05, + "loss": 1.8113, + "step": 10270 + }, + { + "epoch": 
3.1525475751995087, + "grad_norm": 0.47995972633361816, + "learning_rate": 8.014246386106138e-05, + "loss": 1.8754, + "step": 10271 + }, + { + "epoch": 3.152854511970534, + "grad_norm": 0.39069879055023193, + "learning_rate": 8.013849792562769e-05, + "loss": 1.8541, + "step": 10272 + }, + { + "epoch": 3.1531614487415593, + "grad_norm": 0.27174463868141174, + "learning_rate": 8.013453169234768e-05, + "loss": 1.8018, + "step": 10273 + }, + { + "epoch": 3.1534683855125842, + "grad_norm": 0.37808045744895935, + "learning_rate": 8.013056516126058e-05, + "loss": 1.8346, + "step": 10274 + }, + { + "epoch": 3.1537753222836096, + "grad_norm": 0.43864908814430237, + "learning_rate": 8.012659833240557e-05, + "loss": 1.7626, + "step": 10275 + }, + { + "epoch": 3.154082259054635, + "grad_norm": 0.3592168688774109, + "learning_rate": 8.012263120582187e-05, + "loss": 1.8261, + "step": 10276 + }, + { + "epoch": 3.15438919582566, + "grad_norm": 0.3056562542915344, + "learning_rate": 8.011866378154866e-05, + "loss": 1.903, + "step": 10277 + }, + { + "epoch": 3.154696132596685, + "grad_norm": 0.2898549735546112, + "learning_rate": 8.011469605962517e-05, + "loss": 1.7781, + "step": 10278 + }, + { + "epoch": 3.1550030693677105, + "grad_norm": 0.3498871624469757, + "learning_rate": 8.011072804009059e-05, + "loss": 1.7571, + "step": 10279 + }, + { + "epoch": 3.1553100061387354, + "grad_norm": 0.3330932557582855, + "learning_rate": 8.010675972298416e-05, + "loss": 1.8298, + "step": 10280 + }, + { + "epoch": 3.1556169429097607, + "grad_norm": 0.2540839910507202, + "learning_rate": 8.010279110834507e-05, + "loss": 1.8327, + "step": 10281 + }, + { + "epoch": 3.1559238796807856, + "grad_norm": 0.3557111322879791, + "learning_rate": 8.009882219621257e-05, + "loss": 1.7611, + "step": 10282 + }, + { + "epoch": 3.156230816451811, + "grad_norm": 0.28293952345848083, + "learning_rate": 8.009485298662584e-05, + "loss": 1.7761, + "step": 10283 + }, + { + "epoch": 3.1565377532228363, + "grad_norm": 
0.27089303731918335, + "learning_rate": 8.009088347962416e-05, + "loss": 1.8081, + "step": 10284 + }, + { + "epoch": 3.156844689993861, + "grad_norm": 0.2689332664012909, + "learning_rate": 8.008691367524673e-05, + "loss": 1.7458, + "step": 10285 + }, + { + "epoch": 3.1571516267648865, + "grad_norm": 0.2495841234922409, + "learning_rate": 8.008294357353278e-05, + "loss": 1.8307, + "step": 10286 + }, + { + "epoch": 3.1574585635359114, + "grad_norm": 0.29242852330207825, + "learning_rate": 8.007897317452156e-05, + "loss": 1.9216, + "step": 10287 + }, + { + "epoch": 3.1577655003069367, + "grad_norm": 0.26574134826660156, + "learning_rate": 8.007500247825229e-05, + "loss": 1.8392, + "step": 10288 + }, + { + "epoch": 3.158072437077962, + "grad_norm": 0.2503872811794281, + "learning_rate": 8.00710314847642e-05, + "loss": 1.7742, + "step": 10289 + }, + { + "epoch": 3.158379373848987, + "grad_norm": 0.25614771246910095, + "learning_rate": 8.006706019409658e-05, + "loss": 1.828, + "step": 10290 + }, + { + "epoch": 3.1586863106200123, + "grad_norm": 0.259369820356369, + "learning_rate": 8.006308860628863e-05, + "loss": 1.8328, + "step": 10291 + }, + { + "epoch": 3.1589932473910376, + "grad_norm": 0.28183647990226746, + "learning_rate": 8.005911672137962e-05, + "loss": 1.8269, + "step": 10292 + }, + { + "epoch": 3.1593001841620625, + "grad_norm": 0.2926514446735382, + "learning_rate": 8.005514453940881e-05, + "loss": 1.8334, + "step": 10293 + }, + { + "epoch": 3.159607120933088, + "grad_norm": 0.34313449263572693, + "learning_rate": 8.005117206041543e-05, + "loss": 1.7866, + "step": 10294 + }, + { + "epoch": 3.159914057704113, + "grad_norm": 0.30971628427505493, + "learning_rate": 8.004719928443875e-05, + "loss": 1.7827, + "step": 10295 + }, + { + "epoch": 3.160220994475138, + "grad_norm": 0.23955371975898743, + "learning_rate": 8.004322621151807e-05, + "loss": 1.7619, + "step": 10296 + }, + { + "epoch": 3.1605279312461634, + "grad_norm": 0.31311795115470886, + 
"learning_rate": 8.003925284169261e-05, + "loss": 1.8247, + "step": 10297 + }, + { + "epoch": 3.1608348680171883, + "grad_norm": 0.3408358097076416, + "learning_rate": 8.003527917500163e-05, + "loss": 1.8146, + "step": 10298 + }, + { + "epoch": 3.1611418047882136, + "grad_norm": 0.3030858337879181, + "learning_rate": 8.003130521148442e-05, + "loss": 1.857, + "step": 10299 + }, + { + "epoch": 3.161448741559239, + "grad_norm": 0.25168511271476746, + "learning_rate": 8.002733095118025e-05, + "loss": 1.8404, + "step": 10300 + }, + { + "epoch": 3.161755678330264, + "grad_norm": 0.2956216335296631, + "learning_rate": 8.002335639412839e-05, + "loss": 1.7352, + "step": 10301 + }, + { + "epoch": 3.162062615101289, + "grad_norm": 0.27791857719421387, + "learning_rate": 8.001938154036814e-05, + "loss": 1.7797, + "step": 10302 + }, + { + "epoch": 3.1623695518723145, + "grad_norm": 0.3106420040130615, + "learning_rate": 8.001540638993876e-05, + "loss": 1.8434, + "step": 10303 + }, + { + "epoch": 3.1626764886433394, + "grad_norm": 0.2940445840358734, + "learning_rate": 8.001143094287954e-05, + "loss": 1.8459, + "step": 10304 + }, + { + "epoch": 3.1629834254143647, + "grad_norm": 0.3857429325580597, + "learning_rate": 8.000745519922977e-05, + "loss": 1.7853, + "step": 10305 + }, + { + "epoch": 3.1632903621853896, + "grad_norm": 0.3585071861743927, + "learning_rate": 8.000347915902874e-05, + "loss": 1.8905, + "step": 10306 + }, + { + "epoch": 3.163597298956415, + "grad_norm": 0.320003867149353, + "learning_rate": 7.999950282231574e-05, + "loss": 1.8397, + "step": 10307 + }, + { + "epoch": 3.1639042357274403, + "grad_norm": 0.24986252188682556, + "learning_rate": 7.999552618913009e-05, + "loss": 1.7916, + "step": 10308 + }, + { + "epoch": 3.164211172498465, + "grad_norm": 0.33077237010002136, + "learning_rate": 7.999154925951104e-05, + "loss": 1.8334, + "step": 10309 + }, + { + "epoch": 3.1645181092694905, + "grad_norm": 0.35700327157974243, + "learning_rate": 
7.998757203349794e-05, + "loss": 1.7773, + "step": 10310 + }, + { + "epoch": 3.164825046040516, + "grad_norm": 0.3095493018627167, + "learning_rate": 7.998359451113007e-05, + "loss": 1.8156, + "step": 10311 + }, + { + "epoch": 3.1651319828115407, + "grad_norm": 0.3004748225212097, + "learning_rate": 7.997961669244673e-05, + "loss": 1.7862, + "step": 10312 + }, + { + "epoch": 3.165438919582566, + "grad_norm": 0.39382806420326233, + "learning_rate": 7.99756385774873e-05, + "loss": 1.764, + "step": 10313 + }, + { + "epoch": 3.165745856353591, + "grad_norm": 0.3109463155269623, + "learning_rate": 7.997166016629099e-05, + "loss": 1.8006, + "step": 10314 + }, + { + "epoch": 3.1660527931246163, + "grad_norm": 0.2896469235420227, + "learning_rate": 7.996768145889717e-05, + "loss": 1.8373, + "step": 10315 + }, + { + "epoch": 3.1663597298956416, + "grad_norm": 0.35024940967559814, + "learning_rate": 7.996370245534517e-05, + "loss": 1.797, + "step": 10316 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.3228827714920044, + "learning_rate": 7.995972315567431e-05, + "loss": 1.7757, + "step": 10317 + }, + { + "epoch": 3.166973603437692, + "grad_norm": 0.27102410793304443, + "learning_rate": 7.995574355992388e-05, + "loss": 1.7786, + "step": 10318 + }, + { + "epoch": 3.167280540208717, + "grad_norm": 0.2556116580963135, + "learning_rate": 7.995176366813325e-05, + "loss": 1.7621, + "step": 10319 + }, + { + "epoch": 3.167587476979742, + "grad_norm": 0.28279444575309753, + "learning_rate": 7.994778348034173e-05, + "loss": 1.7954, + "step": 10320 + }, + { + "epoch": 3.1678944137507674, + "grad_norm": 0.31778639554977417, + "learning_rate": 7.994380299658867e-05, + "loss": 1.7657, + "step": 10321 + }, + { + "epoch": 3.1682013505217923, + "grad_norm": 0.27935469150543213, + "learning_rate": 7.993982221691339e-05, + "loss": 1.7502, + "step": 10322 + }, + { + "epoch": 3.1685082872928176, + "grad_norm": 0.29012617468833923, + "learning_rate": 7.993584114135524e-05, + "loss": 
1.8497, + "step": 10323 + }, + { + "epoch": 3.168815224063843, + "grad_norm": 0.2674056887626648, + "learning_rate": 7.993185976995356e-05, + "loss": 1.7875, + "step": 10324 + }, + { + "epoch": 3.169122160834868, + "grad_norm": 0.2667328417301178, + "learning_rate": 7.992787810274771e-05, + "loss": 1.771, + "step": 10325 + }, + { + "epoch": 3.169429097605893, + "grad_norm": 0.25807151198387146, + "learning_rate": 7.992389613977702e-05, + "loss": 1.7638, + "step": 10326 + }, + { + "epoch": 3.1697360343769185, + "grad_norm": 0.2572930157184601, + "learning_rate": 7.991991388108084e-05, + "loss": 1.8218, + "step": 10327 + }, + { + "epoch": 3.1700429711479434, + "grad_norm": 0.3955067992210388, + "learning_rate": 7.991593132669855e-05, + "loss": 1.8458, + "step": 10328 + }, + { + "epoch": 3.1703499079189688, + "grad_norm": 0.2813466489315033, + "learning_rate": 7.991194847666948e-05, + "loss": 1.8042, + "step": 10329 + }, + { + "epoch": 3.1706568446899936, + "grad_norm": 0.2645012140274048, + "learning_rate": 7.990796533103302e-05, + "loss": 1.8241, + "step": 10330 + }, + { + "epoch": 3.170963781461019, + "grad_norm": 0.28462091088294983, + "learning_rate": 7.99039818898285e-05, + "loss": 1.8853, + "step": 10331 + }, + { + "epoch": 3.1712707182320443, + "grad_norm": 0.2727372944355011, + "learning_rate": 7.98999981530953e-05, + "loss": 1.7564, + "step": 10332 + }, + { + "epoch": 3.171577655003069, + "grad_norm": 0.2658170759677887, + "learning_rate": 7.989601412087281e-05, + "loss": 1.8344, + "step": 10333 + }, + { + "epoch": 3.1718845917740945, + "grad_norm": 0.29713502526283264, + "learning_rate": 7.989202979320039e-05, + "loss": 1.8721, + "step": 10334 + }, + { + "epoch": 3.17219152854512, + "grad_norm": 0.26609495282173157, + "learning_rate": 7.98880451701174e-05, + "loss": 1.7991, + "step": 10335 + }, + { + "epoch": 3.1724984653161448, + "grad_norm": 0.29779741168022156, + "learning_rate": 7.988406025166322e-05, + "loss": 1.8182, + "step": 10336 + }, + { + 
"epoch": 3.17280540208717, + "grad_norm": 0.2771340012550354, + "learning_rate": 7.988007503787724e-05, + "loss": 1.8034, + "step": 10337 + }, + { + "epoch": 3.1731123388581954, + "grad_norm": 0.30510422587394714, + "learning_rate": 7.987608952879886e-05, + "loss": 1.8477, + "step": 10338 + }, + { + "epoch": 3.1734192756292203, + "grad_norm": 0.3097476363182068, + "learning_rate": 7.987210372446745e-05, + "loss": 1.7572, + "step": 10339 + }, + { + "epoch": 3.1737262124002457, + "grad_norm": 0.2553942799568176, + "learning_rate": 7.986811762492239e-05, + "loss": 1.7837, + "step": 10340 + }, + { + "epoch": 3.1740331491712706, + "grad_norm": 0.26546719670295715, + "learning_rate": 7.986413123020312e-05, + "loss": 1.7893, + "step": 10341 + }, + { + "epoch": 3.174340085942296, + "grad_norm": 0.37721553444862366, + "learning_rate": 7.986014454034895e-05, + "loss": 1.8475, + "step": 10342 + }, + { + "epoch": 3.174647022713321, + "grad_norm": 0.3215494453907013, + "learning_rate": 7.985615755539937e-05, + "loss": 1.7806, + "step": 10343 + }, + { + "epoch": 3.174953959484346, + "grad_norm": 0.2662442922592163, + "learning_rate": 7.985217027539373e-05, + "loss": 1.8116, + "step": 10344 + }, + { + "epoch": 3.1752608962553714, + "grad_norm": 0.23334236443042755, + "learning_rate": 7.984818270037145e-05, + "loss": 1.7929, + "step": 10345 + }, + { + "epoch": 3.1755678330263963, + "grad_norm": 0.2873367667198181, + "learning_rate": 7.98441948303719e-05, + "loss": 1.7808, + "step": 10346 + }, + { + "epoch": 3.1758747697974217, + "grad_norm": 0.3623826801776886, + "learning_rate": 7.984020666543458e-05, + "loss": 1.8817, + "step": 10347 + }, + { + "epoch": 3.176181706568447, + "grad_norm": 0.3060589134693146, + "learning_rate": 7.983621820559881e-05, + "loss": 1.796, + "step": 10348 + }, + { + "epoch": 3.176488643339472, + "grad_norm": 0.2396882325410843, + "learning_rate": 7.983222945090407e-05, + "loss": 1.7455, + "step": 10349 + }, + { + "epoch": 3.1767955801104972, + 
"grad_norm": 0.24811476469039917, + "learning_rate": 7.982824040138974e-05, + "loss": 1.7907, + "step": 10350 + }, + { + "epoch": 3.1771025168815226, + "grad_norm": 0.32749706506729126, + "learning_rate": 7.982425105709524e-05, + "loss": 1.8553, + "step": 10351 + }, + { + "epoch": 3.1774094536525475, + "grad_norm": 0.3648095726966858, + "learning_rate": 7.982026141806003e-05, + "loss": 1.8387, + "step": 10352 + }, + { + "epoch": 3.177716390423573, + "grad_norm": 0.2749348282814026, + "learning_rate": 7.981627148432352e-05, + "loss": 1.7676, + "step": 10353 + }, + { + "epoch": 3.178023327194598, + "grad_norm": 0.2735142409801483, + "learning_rate": 7.981228125592513e-05, + "loss": 1.822, + "step": 10354 + }, + { + "epoch": 3.178330263965623, + "grad_norm": 0.28759655356407166, + "learning_rate": 7.98082907329043e-05, + "loss": 1.8113, + "step": 10355 + }, + { + "epoch": 3.1786372007366483, + "grad_norm": 0.33661654591560364, + "learning_rate": 7.980429991530048e-05, + "loss": 1.8036, + "step": 10356 + }, + { + "epoch": 3.1789441375076732, + "grad_norm": 0.2634892761707306, + "learning_rate": 7.98003088031531e-05, + "loss": 1.8323, + "step": 10357 + }, + { + "epoch": 3.1792510742786986, + "grad_norm": 0.25864094495773315, + "learning_rate": 7.979631739650158e-05, + "loss": 1.8199, + "step": 10358 + }, + { + "epoch": 3.179558011049724, + "grad_norm": 0.27368444204330444, + "learning_rate": 7.979232569538541e-05, + "loss": 1.7673, + "step": 10359 + }, + { + "epoch": 3.179864947820749, + "grad_norm": 0.2506616413593292, + "learning_rate": 7.9788333699844e-05, + "loss": 1.7912, + "step": 10360 + }, + { + "epoch": 3.180171884591774, + "grad_norm": 0.2539178133010864, + "learning_rate": 7.978434140991684e-05, + "loss": 1.7934, + "step": 10361 + }, + { + "epoch": 3.1804788213627995, + "grad_norm": 0.2605626881122589, + "learning_rate": 7.978034882564334e-05, + "loss": 1.8031, + "step": 10362 + }, + { + "epoch": 3.1807857581338244, + "grad_norm": 0.2610207796096802, + 
"learning_rate": 7.977635594706299e-05, + "loss": 1.8664, + "step": 10363 + }, + { + "epoch": 3.1810926949048497, + "grad_norm": 0.26164132356643677, + "learning_rate": 7.977236277421523e-05, + "loss": 1.7758, + "step": 10364 + }, + { + "epoch": 3.1813996316758746, + "grad_norm": 0.3122340142726898, + "learning_rate": 7.976836930713953e-05, + "loss": 1.9033, + "step": 10365 + }, + { + "epoch": 3.1817065684469, + "grad_norm": 0.3317202031612396, + "learning_rate": 7.976437554587537e-05, + "loss": 1.7899, + "step": 10366 + }, + { + "epoch": 3.1820135052179253, + "grad_norm": 0.28612568974494934, + "learning_rate": 7.97603814904622e-05, + "loss": 1.8145, + "step": 10367 + }, + { + "epoch": 3.18232044198895, + "grad_norm": 0.349917471408844, + "learning_rate": 7.975638714093949e-05, + "loss": 1.877, + "step": 10368 + }, + { + "epoch": 3.1826273787599755, + "grad_norm": 0.3737771809101105, + "learning_rate": 7.975239249734672e-05, + "loss": 1.8204, + "step": 10369 + }, + { + "epoch": 3.182934315531001, + "grad_norm": 0.3688446879386902, + "learning_rate": 7.974839755972339e-05, + "loss": 1.8487, + "step": 10370 + }, + { + "epoch": 3.1832412523020257, + "grad_norm": 0.2934897541999817, + "learning_rate": 7.974440232810894e-05, + "loss": 1.8243, + "step": 10371 + }, + { + "epoch": 3.183548189073051, + "grad_norm": 0.2596173882484436, + "learning_rate": 7.974040680254287e-05, + "loss": 1.7887, + "step": 10372 + }, + { + "epoch": 3.183855125844076, + "grad_norm": 0.35686594247817993, + "learning_rate": 7.973641098306468e-05, + "loss": 1.8653, + "step": 10373 + }, + { + "epoch": 3.1841620626151013, + "grad_norm": 0.3187713921070099, + "learning_rate": 7.973241486971383e-05, + "loss": 1.8767, + "step": 10374 + }, + { + "epoch": 3.1844689993861266, + "grad_norm": 0.2596273124217987, + "learning_rate": 7.972841846252985e-05, + "loss": 1.8028, + "step": 10375 + }, + { + "epoch": 3.1847759361571515, + "grad_norm": 0.2637474834918976, + "learning_rate": 7.972442176155221e-05, + 
"loss": 1.802, + "step": 10376 + }, + { + "epoch": 3.185082872928177, + "grad_norm": 0.2641126215457916, + "learning_rate": 7.97204247668204e-05, + "loss": 1.7931, + "step": 10377 + }, + { + "epoch": 3.185389809699202, + "grad_norm": 0.25594159960746765, + "learning_rate": 7.971642747837393e-05, + "loss": 1.818, + "step": 10378 + }, + { + "epoch": 3.185696746470227, + "grad_norm": 0.26567938923835754, + "learning_rate": 7.971242989625233e-05, + "loss": 1.8174, + "step": 10379 + }, + { + "epoch": 3.1860036832412524, + "grad_norm": 0.29580214619636536, + "learning_rate": 7.970843202049508e-05, + "loss": 1.869, + "step": 10380 + }, + { + "epoch": 3.1863106200122773, + "grad_norm": 0.2657530605792999, + "learning_rate": 7.970443385114168e-05, + "loss": 1.8352, + "step": 10381 + }, + { + "epoch": 3.1866175567833026, + "grad_norm": 0.2468358278274536, + "learning_rate": 7.970043538823165e-05, + "loss": 1.7851, + "step": 10382 + }, + { + "epoch": 3.186924493554328, + "grad_norm": 0.26464715600013733, + "learning_rate": 7.969643663180451e-05, + "loss": 1.8208, + "step": 10383 + }, + { + "epoch": 3.187231430325353, + "grad_norm": 0.26035723090171814, + "learning_rate": 7.969243758189979e-05, + "loss": 1.8089, + "step": 10384 + }, + { + "epoch": 3.187538367096378, + "grad_norm": 0.2644619941711426, + "learning_rate": 7.968843823855699e-05, + "loss": 1.8379, + "step": 10385 + }, + { + "epoch": 3.1878453038674035, + "grad_norm": 0.25576624274253845, + "learning_rate": 7.968443860181565e-05, + "loss": 1.7932, + "step": 10386 + }, + { + "epoch": 3.1881522406384284, + "grad_norm": 0.24276074767112732, + "learning_rate": 7.968043867171528e-05, + "loss": 1.8037, + "step": 10387 + }, + { + "epoch": 3.1884591774094537, + "grad_norm": 0.27156540751457214, + "learning_rate": 7.967643844829543e-05, + "loss": 1.7998, + "step": 10388 + }, + { + "epoch": 3.1887661141804786, + "grad_norm": 0.2555428743362427, + "learning_rate": 7.96724379315956e-05, + "loss": 1.7612, + "step": 10389 + }, + 
{ + "epoch": 3.189073050951504, + "grad_norm": 0.3358438014984131, + "learning_rate": 7.966843712165537e-05, + "loss": 1.8543, + "step": 10390 + }, + { + "epoch": 3.1893799877225293, + "grad_norm": 0.2799586355686188, + "learning_rate": 7.966443601851424e-05, + "loss": 1.819, + "step": 10391 + }, + { + "epoch": 3.189686924493554, + "grad_norm": 0.2364189177751541, + "learning_rate": 7.966043462221178e-05, + "loss": 1.8537, + "step": 10392 + }, + { + "epoch": 3.1899938612645795, + "grad_norm": 0.23849403858184814, + "learning_rate": 7.96564329327875e-05, + "loss": 1.8125, + "step": 10393 + }, + { + "epoch": 3.190300798035605, + "grad_norm": 0.2371583878993988, + "learning_rate": 7.965243095028098e-05, + "loss": 1.7352, + "step": 10394 + }, + { + "epoch": 3.1906077348066297, + "grad_norm": 0.2584737539291382, + "learning_rate": 7.964842867473176e-05, + "loss": 1.8801, + "step": 10395 + }, + { + "epoch": 3.190914671577655, + "grad_norm": 0.27768051624298096, + "learning_rate": 7.964442610617939e-05, + "loss": 1.8221, + "step": 10396 + }, + { + "epoch": 3.1912216083486804, + "grad_norm": 0.2680891752243042, + "learning_rate": 7.964042324466341e-05, + "loss": 1.8371, + "step": 10397 + }, + { + "epoch": 3.1915285451197053, + "grad_norm": 0.25301921367645264, + "learning_rate": 7.963642009022343e-05, + "loss": 1.7972, + "step": 10398 + }, + { + "epoch": 3.1918354818907306, + "grad_norm": 0.2589731216430664, + "learning_rate": 7.963241664289896e-05, + "loss": 1.8145, + "step": 10399 + }, + { + "epoch": 3.1921424186617555, + "grad_norm": 0.2611297369003296, + "learning_rate": 7.962841290272956e-05, + "loss": 1.8736, + "step": 10400 + }, + { + "epoch": 3.192449355432781, + "grad_norm": 0.2812272906303406, + "learning_rate": 7.962440886975483e-05, + "loss": 1.8116, + "step": 10401 + }, + { + "epoch": 3.192756292203806, + "grad_norm": 0.3261657655239105, + "learning_rate": 7.962040454401434e-05, + "loss": 1.7935, + "step": 10402 + }, + { + "epoch": 3.193063228974831, + 
"grad_norm": 0.3355373442173004, + "learning_rate": 7.961639992554764e-05, + "loss": 1.7957, + "step": 10403 + }, + { + "epoch": 3.1933701657458564, + "grad_norm": 0.2811843156814575, + "learning_rate": 7.961239501439432e-05, + "loss": 1.797, + "step": 10404 + }, + { + "epoch": 3.1936771025168813, + "grad_norm": 0.24933238327503204, + "learning_rate": 7.960838981059395e-05, + "loss": 1.7594, + "step": 10405 + }, + { + "epoch": 3.1939840392879066, + "grad_norm": 0.29110121726989746, + "learning_rate": 7.960438431418613e-05, + "loss": 1.8268, + "step": 10406 + }, + { + "epoch": 3.194290976058932, + "grad_norm": 0.3702283799648285, + "learning_rate": 7.960037852521043e-05, + "loss": 1.7629, + "step": 10407 + }, + { + "epoch": 3.194597912829957, + "grad_norm": 0.33275437355041504, + "learning_rate": 7.959637244370644e-05, + "loss": 1.8507, + "step": 10408 + }, + { + "epoch": 3.194904849600982, + "grad_norm": 0.2691981792449951, + "learning_rate": 7.959236606971375e-05, + "loss": 1.8084, + "step": 10409 + }, + { + "epoch": 3.1952117863720075, + "grad_norm": 0.30108413100242615, + "learning_rate": 7.958835940327194e-05, + "loss": 1.8525, + "step": 10410 + }, + { + "epoch": 3.1955187231430324, + "grad_norm": 0.32112306356430054, + "learning_rate": 7.958435244442064e-05, + "loss": 1.7431, + "step": 10411 + }, + { + "epoch": 3.1958256599140578, + "grad_norm": 0.2795291543006897, + "learning_rate": 7.958034519319942e-05, + "loss": 1.7985, + "step": 10412 + }, + { + "epoch": 3.196132596685083, + "grad_norm": 0.2485792338848114, + "learning_rate": 7.957633764964788e-05, + "loss": 1.7363, + "step": 10413 + }, + { + "epoch": 3.196439533456108, + "grad_norm": 0.3552432358264923, + "learning_rate": 7.957232981380565e-05, + "loss": 1.8174, + "step": 10414 + }, + { + "epoch": 3.1967464702271333, + "grad_norm": 0.3829655051231384, + "learning_rate": 7.956832168571234e-05, + "loss": 1.9249, + "step": 10415 + }, + { + "epoch": 3.197053406998158, + "grad_norm": 0.2498074769973755, + 
"learning_rate": 7.956431326540752e-05, + "loss": 1.8104, + "step": 10416 + }, + { + "epoch": 3.1973603437691835, + "grad_norm": 0.24596504867076874, + "learning_rate": 7.956030455293082e-05, + "loss": 1.8007, + "step": 10417 + }, + { + "epoch": 3.197667280540209, + "grad_norm": 0.2795363664627075, + "learning_rate": 7.95562955483219e-05, + "loss": 1.775, + "step": 10418 + }, + { + "epoch": 3.1979742173112338, + "grad_norm": 0.3581138253211975, + "learning_rate": 7.95522862516203e-05, + "loss": 1.8567, + "step": 10419 + }, + { + "epoch": 3.198281154082259, + "grad_norm": 0.36102500557899475, + "learning_rate": 7.95482766628657e-05, + "loss": 1.8509, + "step": 10420 + }, + { + "epoch": 3.198588090853284, + "grad_norm": 0.4717029929161072, + "learning_rate": 7.954426678209774e-05, + "loss": 1.8218, + "step": 10421 + }, + { + "epoch": 3.1988950276243093, + "grad_norm": 0.3211984932422638, + "learning_rate": 7.9540256609356e-05, + "loss": 1.8696, + "step": 10422 + }, + { + "epoch": 3.1992019643953347, + "grad_norm": 0.30094626545906067, + "learning_rate": 7.953624614468011e-05, + "loss": 1.8714, + "step": 10423 + }, + { + "epoch": 3.1995089011663596, + "grad_norm": 0.267578125, + "learning_rate": 7.953223538810976e-05, + "loss": 1.7903, + "step": 10424 + }, + { + "epoch": 3.199815837937385, + "grad_norm": 0.35577845573425293, + "learning_rate": 7.952822433968453e-05, + "loss": 1.7808, + "step": 10425 + }, + { + "epoch": 3.2001227747084102, + "grad_norm": 0.4117741882801056, + "learning_rate": 7.952421299944408e-05, + "loss": 1.7856, + "step": 10426 + }, + { + "epoch": 3.200429711479435, + "grad_norm": 0.35202035307884216, + "learning_rate": 7.952020136742806e-05, + "loss": 1.8112, + "step": 10427 + }, + { + "epoch": 3.2007366482504604, + "grad_norm": 0.26514917612075806, + "learning_rate": 7.951618944367611e-05, + "loss": 1.828, + "step": 10428 + }, + { + "epoch": 3.201043585021486, + "grad_norm": 0.29219159483909607, + "learning_rate": 7.951217722822786e-05, + "loss": 
1.9366, + "step": 10429 + }, + { + "epoch": 3.2013505217925107, + "grad_norm": 0.2929961383342743, + "learning_rate": 7.950816472112298e-05, + "loss": 1.8006, + "step": 10430 + }, + { + "epoch": 3.201657458563536, + "grad_norm": 0.28339722752571106, + "learning_rate": 7.950415192240114e-05, + "loss": 1.7411, + "step": 10431 + }, + { + "epoch": 3.201964395334561, + "grad_norm": 0.258884996175766, + "learning_rate": 7.950013883210196e-05, + "loss": 1.8153, + "step": 10432 + }, + { + "epoch": 3.2022713321055862, + "grad_norm": 0.3065929114818573, + "learning_rate": 7.949612545026512e-05, + "loss": 1.7918, + "step": 10433 + }, + { + "epoch": 3.2025782688766116, + "grad_norm": 0.289874404668808, + "learning_rate": 7.949211177693029e-05, + "loss": 1.7975, + "step": 10434 + }, + { + "epoch": 3.2028852056476365, + "grad_norm": 0.27025631070137024, + "learning_rate": 7.948809781213711e-05, + "loss": 1.8129, + "step": 10435 + }, + { + "epoch": 3.203192142418662, + "grad_norm": 0.2501074969768524, + "learning_rate": 7.948408355592528e-05, + "loss": 1.7653, + "step": 10436 + }, + { + "epoch": 3.203499079189687, + "grad_norm": 0.30402958393096924, + "learning_rate": 7.948006900833445e-05, + "loss": 1.8311, + "step": 10437 + }, + { + "epoch": 3.203806015960712, + "grad_norm": 0.28783223032951355, + "learning_rate": 7.94760541694043e-05, + "loss": 1.82, + "step": 10438 + }, + { + "epoch": 3.2041129527317374, + "grad_norm": 0.30428317189216614, + "learning_rate": 7.947203903917451e-05, + "loss": 1.8673, + "step": 10439 + }, + { + "epoch": 3.2044198895027622, + "grad_norm": 0.2860367000102997, + "learning_rate": 7.946802361768473e-05, + "loss": 1.824, + "step": 10440 + }, + { + "epoch": 3.2047268262737876, + "grad_norm": 0.2995273172855377, + "learning_rate": 7.946400790497469e-05, + "loss": 1.7342, + "step": 10441 + }, + { + "epoch": 3.205033763044813, + "grad_norm": 0.4374088943004608, + "learning_rate": 7.945999190108407e-05, + "loss": 1.8522, + "step": 10442 + }, + { + "epoch": 
3.205340699815838, + "grad_norm": 0.37659478187561035, + "learning_rate": 7.945597560605252e-05, + "loss": 1.7518, + "step": 10443 + }, + { + "epoch": 3.205647636586863, + "grad_norm": 0.24257932603359222, + "learning_rate": 7.945195901991975e-05, + "loss": 1.7892, + "step": 10444 + }, + { + "epoch": 3.2059545733578885, + "grad_norm": 0.3682694435119629, + "learning_rate": 7.944794214272546e-05, + "loss": 1.7757, + "step": 10445 + }, + { + "epoch": 3.2062615101289134, + "grad_norm": 0.434692919254303, + "learning_rate": 7.944392497450936e-05, + "loss": 1.8207, + "step": 10446 + }, + { + "epoch": 3.2065684468999387, + "grad_norm": 0.3982211947441101, + "learning_rate": 7.943990751531113e-05, + "loss": 1.8303, + "step": 10447 + }, + { + "epoch": 3.2068753836709636, + "grad_norm": 0.2877334654331207, + "learning_rate": 7.943588976517049e-05, + "loss": 1.8495, + "step": 10448 + }, + { + "epoch": 3.207182320441989, + "grad_norm": 0.34589654207229614, + "learning_rate": 7.943187172412712e-05, + "loss": 1.7773, + "step": 10449 + }, + { + "epoch": 3.2074892572130143, + "grad_norm": 0.4727517366409302, + "learning_rate": 7.942785339222074e-05, + "loss": 1.8702, + "step": 10450 + }, + { + "epoch": 3.207796193984039, + "grad_norm": 0.4019354581832886, + "learning_rate": 7.942383476949107e-05, + "loss": 1.8095, + "step": 10451 + }, + { + "epoch": 3.2081031307550645, + "grad_norm": 0.2726243734359741, + "learning_rate": 7.941981585597782e-05, + "loss": 1.7273, + "step": 10452 + }, + { + "epoch": 3.20841006752609, + "grad_norm": 0.2944760024547577, + "learning_rate": 7.941579665172072e-05, + "loss": 1.7507, + "step": 10453 + }, + { + "epoch": 3.2087170042971147, + "grad_norm": 0.3530777096748352, + "learning_rate": 7.941177715675945e-05, + "loss": 1.8434, + "step": 10454 + }, + { + "epoch": 3.20902394106814, + "grad_norm": 0.28612539172172546, + "learning_rate": 7.940775737113378e-05, + "loss": 1.8094, + "step": 10455 + }, + { + "epoch": 3.209330877839165, + "grad_norm": 
0.27006468176841736, + "learning_rate": 7.94037372948834e-05, + "loss": 1.7854, + "step": 10456 + }, + { + "epoch": 3.2096378146101903, + "grad_norm": 0.3027147054672241, + "learning_rate": 7.939971692804806e-05, + "loss": 1.7596, + "step": 10457 + }, + { + "epoch": 3.2099447513812156, + "grad_norm": 0.31999528408050537, + "learning_rate": 7.939569627066749e-05, + "loss": 1.8836, + "step": 10458 + }, + { + "epoch": 3.2102516881522405, + "grad_norm": 0.267600417137146, + "learning_rate": 7.939167532278142e-05, + "loss": 1.8508, + "step": 10459 + }, + { + "epoch": 3.210558624923266, + "grad_norm": 0.3171706795692444, + "learning_rate": 7.938765408442958e-05, + "loss": 1.7507, + "step": 10460 + }, + { + "epoch": 3.210865561694291, + "grad_norm": 0.2955280840396881, + "learning_rate": 7.938363255565171e-05, + "loss": 1.733, + "step": 10461 + }, + { + "epoch": 3.211172498465316, + "grad_norm": 0.3427969217300415, + "learning_rate": 7.937961073648759e-05, + "loss": 1.9208, + "step": 10462 + }, + { + "epoch": 3.2114794352363414, + "grad_norm": 0.28788647055625916, + "learning_rate": 7.937558862697692e-05, + "loss": 1.7723, + "step": 10463 + }, + { + "epoch": 3.2117863720073663, + "grad_norm": 0.26093682646751404, + "learning_rate": 7.937156622715945e-05, + "loss": 1.803, + "step": 10464 + }, + { + "epoch": 3.2120933087783916, + "grad_norm": 0.2791301906108856, + "learning_rate": 7.936754353707497e-05, + "loss": 1.7601, + "step": 10465 + }, + { + "epoch": 3.212400245549417, + "grad_norm": 0.3039831519126892, + "learning_rate": 7.93635205567632e-05, + "loss": 1.7864, + "step": 10466 + }, + { + "epoch": 3.212707182320442, + "grad_norm": 0.28498128056526184, + "learning_rate": 7.935949728626392e-05, + "loss": 1.7745, + "step": 10467 + }, + { + "epoch": 3.213014119091467, + "grad_norm": 0.2908780872821808, + "learning_rate": 7.935547372561687e-05, + "loss": 1.8281, + "step": 10468 + }, + { + "epoch": 3.2133210558624925, + "grad_norm": 0.26148509979248047, + "learning_rate": 
7.935144987486183e-05, + "loss": 1.8545, + "step": 10469 + }, + { + "epoch": 3.2136279926335174, + "grad_norm": 0.2853962481021881, + "learning_rate": 7.934742573403856e-05, + "loss": 1.7765, + "step": 10470 + }, + { + "epoch": 3.2139349294045427, + "grad_norm": 0.26497501134872437, + "learning_rate": 7.934340130318681e-05, + "loss": 1.7472, + "step": 10471 + }, + { + "epoch": 3.214241866175568, + "grad_norm": 0.2806912660598755, + "learning_rate": 7.933937658234638e-05, + "loss": 1.7879, + "step": 10472 + }, + { + "epoch": 3.214548802946593, + "grad_norm": 0.2699974477291107, + "learning_rate": 7.933535157155705e-05, + "loss": 1.7539, + "step": 10473 + }, + { + "epoch": 3.2148557397176183, + "grad_norm": 0.22714731097221375, + "learning_rate": 7.933132627085856e-05, + "loss": 1.7861, + "step": 10474 + }, + { + "epoch": 3.215162676488643, + "grad_norm": 0.291340708732605, + "learning_rate": 7.932730068029072e-05, + "loss": 1.8381, + "step": 10475 + }, + { + "epoch": 3.2154696132596685, + "grad_norm": 0.3257324695587158, + "learning_rate": 7.93232747998933e-05, + "loss": 1.8293, + "step": 10476 + }, + { + "epoch": 3.215776550030694, + "grad_norm": 0.3518911600112915, + "learning_rate": 7.93192486297061e-05, + "loss": 1.853, + "step": 10477 + }, + { + "epoch": 3.2160834868017187, + "grad_norm": 0.27663540840148926, + "learning_rate": 7.93152221697689e-05, + "loss": 1.7831, + "step": 10478 + }, + { + "epoch": 3.216390423572744, + "grad_norm": 0.3153248429298401, + "learning_rate": 7.931119542012149e-05, + "loss": 1.7443, + "step": 10479 + }, + { + "epoch": 3.216697360343769, + "grad_norm": 0.2919597029685974, + "learning_rate": 7.930716838080368e-05, + "loss": 1.8108, + "step": 10480 + }, + { + "epoch": 3.2170042971147943, + "grad_norm": 0.26892516016960144, + "learning_rate": 7.930314105185524e-05, + "loss": 1.7791, + "step": 10481 + }, + { + "epoch": 3.2173112338858196, + "grad_norm": 0.2486005276441574, + "learning_rate": 7.929911343331599e-05, + "loss": 1.8184, + 
"step": 10482 + }, + { + "epoch": 3.2176181706568445, + "grad_norm": 0.260728120803833, + "learning_rate": 7.929508552522571e-05, + "loss": 1.7933, + "step": 10483 + }, + { + "epoch": 3.21792510742787, + "grad_norm": 0.3081948757171631, + "learning_rate": 7.929105732762425e-05, + "loss": 1.7732, + "step": 10484 + }, + { + "epoch": 3.218232044198895, + "grad_norm": 0.3807671368122101, + "learning_rate": 7.928702884055138e-05, + "loss": 1.7652, + "step": 10485 + }, + { + "epoch": 3.21853898096992, + "grad_norm": 0.31637755036354065, + "learning_rate": 7.928300006404692e-05, + "loss": 1.7605, + "step": 10486 + }, + { + "epoch": 3.2188459177409454, + "grad_norm": 0.2812853455543518, + "learning_rate": 7.927897099815071e-05, + "loss": 1.7925, + "step": 10487 + }, + { + "epoch": 3.2191528545119708, + "grad_norm": 0.3472350239753723, + "learning_rate": 7.927494164290253e-05, + "loss": 1.8252, + "step": 10488 + }, + { + "epoch": 3.2194597912829956, + "grad_norm": 0.4202714264392853, + "learning_rate": 7.927091199834222e-05, + "loss": 1.7993, + "step": 10489 + }, + { + "epoch": 3.219766728054021, + "grad_norm": 0.44552353024482727, + "learning_rate": 7.92668820645096e-05, + "loss": 1.8609, + "step": 10490 + }, + { + "epoch": 3.220073664825046, + "grad_norm": 0.38964664936065674, + "learning_rate": 7.926285184144451e-05, + "loss": 1.864, + "step": 10491 + }, + { + "epoch": 3.220380601596071, + "grad_norm": 0.2978462278842926, + "learning_rate": 7.925882132918676e-05, + "loss": 1.7892, + "step": 10492 + }, + { + "epoch": 3.2206875383670965, + "grad_norm": 0.2520316243171692, + "learning_rate": 7.925479052777619e-05, + "loss": 1.7702, + "step": 10493 + }, + { + "epoch": 3.2209944751381214, + "grad_norm": 0.28151068091392517, + "learning_rate": 7.925075943725263e-05, + "loss": 1.7613, + "step": 10494 + }, + { + "epoch": 3.2213014119091468, + "grad_norm": 0.3346099555492401, + "learning_rate": 7.924672805765592e-05, + "loss": 1.894, + "step": 10495 + }, + { + "epoch": 
3.2216083486801717, + "grad_norm": 0.2981362044811249, + "learning_rate": 7.924269638902591e-05, + "loss": 1.8157, + "step": 10496 + }, + { + "epoch": 3.221915285451197, + "grad_norm": 0.2561499774456024, + "learning_rate": 7.923866443140242e-05, + "loss": 1.8259, + "step": 10497 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.26480481028556824, + "learning_rate": 7.923463218482532e-05, + "loss": 1.7856, + "step": 10498 + }, + { + "epoch": 3.222529158993247, + "grad_norm": 0.24103692173957825, + "learning_rate": 7.923059964933446e-05, + "loss": 1.7765, + "step": 10499 + }, + { + "epoch": 3.2228360957642725, + "grad_norm": 0.2399173080921173, + "learning_rate": 7.922656682496967e-05, + "loss": 1.8216, + "step": 10500 + }, + { + "epoch": 3.223143032535298, + "grad_norm": 0.24530018866062164, + "learning_rate": 7.922253371177082e-05, + "loss": 1.8155, + "step": 10501 + }, + { + "epoch": 3.2234499693063228, + "grad_norm": 0.23298653960227966, + "learning_rate": 7.921850030977775e-05, + "loss": 1.7843, + "step": 10502 + }, + { + "epoch": 3.223756906077348, + "grad_norm": 0.3053973317146301, + "learning_rate": 7.921446661903035e-05, + "loss": 1.8113, + "step": 10503 + }, + { + "epoch": 3.2240638428483734, + "grad_norm": 0.261336088180542, + "learning_rate": 7.921043263956847e-05, + "loss": 1.8073, + "step": 10504 + }, + { + "epoch": 3.2243707796193983, + "grad_norm": 0.24877268075942993, + "learning_rate": 7.920639837143195e-05, + "loss": 1.8344, + "step": 10505 + }, + { + "epoch": 3.2246777163904237, + "grad_norm": 0.26784422993659973, + "learning_rate": 7.920236381466071e-05, + "loss": 1.7757, + "step": 10506 + }, + { + "epoch": 3.2249846531614486, + "grad_norm": 0.2672121226787567, + "learning_rate": 7.919832896929458e-05, + "loss": 1.8384, + "step": 10507 + }, + { + "epoch": 3.225291589932474, + "grad_norm": 0.27254921197891235, + "learning_rate": 7.919429383537346e-05, + "loss": 1.8056, + "step": 10508 + }, + { + "epoch": 3.2255985267034992, + "grad_norm": 
0.24467822909355164, + "learning_rate": 7.91902584129372e-05, + "loss": 1.8109, + "step": 10509 + }, + { + "epoch": 3.225905463474524, + "grad_norm": 0.25966358184814453, + "learning_rate": 7.918622270202571e-05, + "loss": 1.82, + "step": 10510 + }, + { + "epoch": 3.2262124002455494, + "grad_norm": 0.28601330518722534, + "learning_rate": 7.918218670267886e-05, + "loss": 1.7266, + "step": 10511 + }, + { + "epoch": 3.226519337016575, + "grad_norm": 0.4017516076564789, + "learning_rate": 7.917815041493653e-05, + "loss": 1.8408, + "step": 10512 + }, + { + "epoch": 3.2268262737875997, + "grad_norm": 0.3995787501335144, + "learning_rate": 7.917411383883862e-05, + "loss": 1.8441, + "step": 10513 + }, + { + "epoch": 3.227133210558625, + "grad_norm": 0.26997458934783936, + "learning_rate": 7.917007697442502e-05, + "loss": 1.8078, + "step": 10514 + }, + { + "epoch": 3.22744014732965, + "grad_norm": 0.34353014826774597, + "learning_rate": 7.916603982173562e-05, + "loss": 1.7523, + "step": 10515 + }, + { + "epoch": 3.2277470841006752, + "grad_norm": 0.39522337913513184, + "learning_rate": 7.916200238081032e-05, + "loss": 1.7532, + "step": 10516 + }, + { + "epoch": 3.2280540208717006, + "grad_norm": 0.4176923334598541, + "learning_rate": 7.915796465168903e-05, + "loss": 1.8895, + "step": 10517 + }, + { + "epoch": 3.2283609576427255, + "grad_norm": 0.30232906341552734, + "learning_rate": 7.915392663441164e-05, + "loss": 1.8223, + "step": 10518 + }, + { + "epoch": 3.228667894413751, + "grad_norm": 0.230951726436615, + "learning_rate": 7.914988832901805e-05, + "loss": 1.7265, + "step": 10519 + }, + { + "epoch": 3.228974831184776, + "grad_norm": 0.26381877064704895, + "learning_rate": 7.914584973554819e-05, + "loss": 1.7858, + "step": 10520 + }, + { + "epoch": 3.229281767955801, + "grad_norm": 0.2500905394554138, + "learning_rate": 7.914181085404194e-05, + "loss": 1.7606, + "step": 10521 + }, + { + "epoch": 3.2295887047268264, + "grad_norm": 0.2585415840148926, + "learning_rate": 
7.913777168453925e-05, + "loss": 1.787, + "step": 10522 + }, + { + "epoch": 3.2298956414978512, + "grad_norm": 0.24236604571342468, + "learning_rate": 7.913373222708001e-05, + "loss": 1.7623, + "step": 10523 + }, + { + "epoch": 3.2302025782688766, + "grad_norm": 0.3113093078136444, + "learning_rate": 7.912969248170416e-05, + "loss": 1.7736, + "step": 10524 + }, + { + "epoch": 3.230509515039902, + "grad_norm": 0.3341342806816101, + "learning_rate": 7.912565244845163e-05, + "loss": 1.8583, + "step": 10525 + }, + { + "epoch": 3.230816451810927, + "grad_norm": 0.2644478678703308, + "learning_rate": 7.912161212736231e-05, + "loss": 1.7891, + "step": 10526 + }, + { + "epoch": 3.231123388581952, + "grad_norm": 0.22916561365127563, + "learning_rate": 7.911757151847616e-05, + "loss": 1.7642, + "step": 10527 + }, + { + "epoch": 3.2314303253529775, + "grad_norm": 0.24204877018928528, + "learning_rate": 7.911353062183309e-05, + "loss": 1.8522, + "step": 10528 + }, + { + "epoch": 3.2317372621240024, + "grad_norm": 0.25339365005493164, + "learning_rate": 7.910948943747307e-05, + "loss": 1.8391, + "step": 10529 + }, + { + "epoch": 3.2320441988950277, + "grad_norm": 0.2652709186077118, + "learning_rate": 7.9105447965436e-05, + "loss": 1.7735, + "step": 10530 + }, + { + "epoch": 3.2323511356660526, + "grad_norm": 0.2711019217967987, + "learning_rate": 7.910140620576183e-05, + "loss": 1.8491, + "step": 10531 + }, + { + "epoch": 3.232658072437078, + "grad_norm": 0.2598389685153961, + "learning_rate": 7.909736415849052e-05, + "loss": 1.8417, + "step": 10532 + }, + { + "epoch": 3.2329650092081033, + "grad_norm": 0.278037428855896, + "learning_rate": 7.9093321823662e-05, + "loss": 1.8774, + "step": 10533 + }, + { + "epoch": 3.233271945979128, + "grad_norm": 0.32015568017959595, + "learning_rate": 7.90892792013162e-05, + "loss": 1.8873, + "step": 10534 + }, + { + "epoch": 3.2335788827501535, + "grad_norm": 0.3098098635673523, + "learning_rate": 7.908523629149312e-05, + "loss": 1.8141, + 
"step": 10535 + }, + { + "epoch": 3.233885819521179, + "grad_norm": 0.3127266764640808, + "learning_rate": 7.908119309423267e-05, + "loss": 1.8587, + "step": 10536 + }, + { + "epoch": 3.2341927562922037, + "grad_norm": 0.3085545301437378, + "learning_rate": 7.907714960957483e-05, + "loss": 1.8544, + "step": 10537 + }, + { + "epoch": 3.234499693063229, + "grad_norm": 0.3051004409790039, + "learning_rate": 7.907310583755956e-05, + "loss": 1.8144, + "step": 10538 + }, + { + "epoch": 3.234806629834254, + "grad_norm": 0.3458186686038971, + "learning_rate": 7.906906177822682e-05, + "loss": 1.8388, + "step": 10539 + }, + { + "epoch": 3.2351135666052793, + "grad_norm": 0.37064439058303833, + "learning_rate": 7.906501743161656e-05, + "loss": 1.7574, + "step": 10540 + }, + { + "epoch": 3.2354205033763046, + "grad_norm": 0.3382316827774048, + "learning_rate": 7.906097279776876e-05, + "loss": 1.8785, + "step": 10541 + }, + { + "epoch": 3.2357274401473295, + "grad_norm": 0.254802942276001, + "learning_rate": 7.905692787672341e-05, + "loss": 1.8276, + "step": 10542 + }, + { + "epoch": 3.236034376918355, + "grad_norm": 0.3362341523170471, + "learning_rate": 7.905288266852047e-05, + "loss": 1.8057, + "step": 10543 + }, + { + "epoch": 3.23634131368938, + "grad_norm": 0.38821661472320557, + "learning_rate": 7.904883717319988e-05, + "loss": 1.7841, + "step": 10544 + }, + { + "epoch": 3.236648250460405, + "grad_norm": 0.33889076113700867, + "learning_rate": 7.90447913908017e-05, + "loss": 1.7892, + "step": 10545 + }, + { + "epoch": 3.2369551872314304, + "grad_norm": 0.2741014361381531, + "learning_rate": 7.904074532136585e-05, + "loss": 1.7611, + "step": 10546 + }, + { + "epoch": 3.2372621240024557, + "grad_norm": 0.28950995206832886, + "learning_rate": 7.903669896493233e-05, + "loss": 1.7963, + "step": 10547 + }, + { + "epoch": 3.2375690607734806, + "grad_norm": 0.30647143721580505, + "learning_rate": 7.903265232154113e-05, + "loss": 1.7522, + "step": 10548 + }, + { + "epoch": 
3.237875997544506, + "grad_norm": 0.30428263545036316, + "learning_rate": 7.902860539123225e-05, + "loss": 1.7383, + "step": 10549 + }, + { + "epoch": 3.238182934315531, + "grad_norm": 0.2357146292924881, + "learning_rate": 7.902455817404569e-05, + "loss": 1.7243, + "step": 10550 + }, + { + "epoch": 3.238489871086556, + "grad_norm": 0.3125104606151581, + "learning_rate": 7.90205106700214e-05, + "loss": 1.8542, + "step": 10551 + }, + { + "epoch": 3.2387968078575815, + "grad_norm": 0.25797244906425476, + "learning_rate": 7.901646287919944e-05, + "loss": 1.8374, + "step": 10552 + }, + { + "epoch": 3.2391037446286064, + "grad_norm": 0.3127591907978058, + "learning_rate": 7.901241480161978e-05, + "loss": 1.9457, + "step": 10553 + }, + { + "epoch": 3.2394106813996317, + "grad_norm": 0.2971835434436798, + "learning_rate": 7.900836643732243e-05, + "loss": 1.7933, + "step": 10554 + }, + { + "epoch": 3.2397176181706566, + "grad_norm": 0.28931814432144165, + "learning_rate": 7.90043177863474e-05, + "loss": 1.8201, + "step": 10555 + }, + { + "epoch": 3.240024554941682, + "grad_norm": 0.3348724842071533, + "learning_rate": 7.90002688487347e-05, + "loss": 1.8718, + "step": 10556 + }, + { + "epoch": 3.2403314917127073, + "grad_norm": 0.28566426038742065, + "learning_rate": 7.899621962452436e-05, + "loss": 1.805, + "step": 10557 + }, + { + "epoch": 3.240638428483732, + "grad_norm": 0.27074119448661804, + "learning_rate": 7.899217011375637e-05, + "loss": 1.842, + "step": 10558 + }, + { + "epoch": 3.2409453652547575, + "grad_norm": 0.27014291286468506, + "learning_rate": 7.898812031647076e-05, + "loss": 1.8156, + "step": 10559 + }, + { + "epoch": 3.241252302025783, + "grad_norm": 0.28087863326072693, + "learning_rate": 7.898407023270756e-05, + "loss": 1.8399, + "step": 10560 + }, + { + "epoch": 3.2415592387968077, + "grad_norm": 0.2641037404537201, + "learning_rate": 7.898001986250679e-05, + "loss": 1.7977, + "step": 10561 + }, + { + "epoch": 3.241866175567833, + "grad_norm": 
0.2843858301639557, + "learning_rate": 7.897596920590848e-05, + "loss": 1.834, + "step": 10562 + }, + { + "epoch": 3.2421731123388584, + "grad_norm": 0.2724611163139343, + "learning_rate": 7.897191826295266e-05, + "loss": 1.7547, + "step": 10563 + }, + { + "epoch": 3.2424800491098833, + "grad_norm": 0.2583858370780945, + "learning_rate": 7.896786703367935e-05, + "loss": 1.7658, + "step": 10564 + }, + { + "epoch": 3.2427869858809086, + "grad_norm": 0.2666650712490082, + "learning_rate": 7.896381551812861e-05, + "loss": 1.8017, + "step": 10565 + }, + { + "epoch": 3.2430939226519335, + "grad_norm": 0.23269347846508026, + "learning_rate": 7.895976371634047e-05, + "loss": 1.8267, + "step": 10566 + }, + { + "epoch": 3.243400859422959, + "grad_norm": 0.27865225076675415, + "learning_rate": 7.895571162835496e-05, + "loss": 1.8093, + "step": 10567 + }, + { + "epoch": 3.243707796193984, + "grad_norm": 0.29445022344589233, + "learning_rate": 7.895165925421216e-05, + "loss": 1.7999, + "step": 10568 + }, + { + "epoch": 3.244014732965009, + "grad_norm": 0.32135528326034546, + "learning_rate": 7.894760659395206e-05, + "loss": 1.8405, + "step": 10569 + }, + { + "epoch": 3.2443216697360344, + "grad_norm": 0.3409091532230377, + "learning_rate": 7.894355364761477e-05, + "loss": 1.7861, + "step": 10570 + }, + { + "epoch": 3.2446286065070598, + "grad_norm": 0.3379025459289551, + "learning_rate": 7.893950041524032e-05, + "loss": 1.8495, + "step": 10571 + }, + { + "epoch": 3.2449355432780846, + "grad_norm": 0.2843063473701477, + "learning_rate": 7.893544689686874e-05, + "loss": 1.7888, + "step": 10572 + }, + { + "epoch": 3.24524248004911, + "grad_norm": 0.2914074957370758, + "learning_rate": 7.893139309254013e-05, + "loss": 1.7866, + "step": 10573 + }, + { + "epoch": 3.245549416820135, + "grad_norm": 0.39855021238327026, + "learning_rate": 7.892733900229454e-05, + "loss": 1.7865, + "step": 10574 + }, + { + "epoch": 3.24585635359116, + "grad_norm": 0.4232102632522583, + "learning_rate": 
7.892328462617203e-05, + "loss": 1.8443, + "step": 10575 + }, + { + "epoch": 3.2461632903621855, + "grad_norm": 0.390794962644577, + "learning_rate": 7.891922996421267e-05, + "loss": 1.8735, + "step": 10576 + }, + { + "epoch": 3.2464702271332104, + "grad_norm": 0.3051595687866211, + "learning_rate": 7.891517501645653e-05, + "loss": 1.8654, + "step": 10577 + }, + { + "epoch": 3.2467771639042358, + "grad_norm": 0.25363096594810486, + "learning_rate": 7.891111978294367e-05, + "loss": 1.7602, + "step": 10578 + }, + { + "epoch": 3.247084100675261, + "grad_norm": 0.29785794019699097, + "learning_rate": 7.890706426371419e-05, + "loss": 1.8242, + "step": 10579 + }, + { + "epoch": 3.247391037446286, + "grad_norm": 0.346162885427475, + "learning_rate": 7.890300845880816e-05, + "loss": 1.8551, + "step": 10580 + }, + { + "epoch": 3.2476979742173113, + "grad_norm": 0.33906155824661255, + "learning_rate": 7.889895236826566e-05, + "loss": 1.765, + "step": 10581 + }, + { + "epoch": 3.248004910988336, + "grad_norm": 0.26083165407180786, + "learning_rate": 7.889489599212676e-05, + "loss": 1.8246, + "step": 10582 + }, + { + "epoch": 3.2483118477593615, + "grad_norm": 0.3042019009590149, + "learning_rate": 7.889083933043157e-05, + "loss": 1.9017, + "step": 10583 + }, + { + "epoch": 3.248618784530387, + "grad_norm": 0.34833577275276184, + "learning_rate": 7.888678238322018e-05, + "loss": 1.7863, + "step": 10584 + }, + { + "epoch": 3.2489257213014118, + "grad_norm": 0.34436655044555664, + "learning_rate": 7.888272515053267e-05, + "loss": 1.7937, + "step": 10585 + }, + { + "epoch": 3.249232658072437, + "grad_norm": 0.2550172507762909, + "learning_rate": 7.887866763240914e-05, + "loss": 1.7615, + "step": 10586 + }, + { + "epoch": 3.2495395948434624, + "grad_norm": 0.3334405720233917, + "learning_rate": 7.88746098288897e-05, + "loss": 1.7465, + "step": 10587 + }, + { + "epoch": 3.2498465316144873, + "grad_norm": 0.4668157696723938, + "learning_rate": 7.887055174001443e-05, + "loss": 
1.7836, + "step": 10588 + }, + { + "epoch": 3.2501534683855127, + "grad_norm": 0.524680495262146, + "learning_rate": 7.886649336582344e-05, + "loss": 1.844, + "step": 10589 + }, + { + "epoch": 3.250460405156538, + "grad_norm": 0.36859074234962463, + "learning_rate": 7.886243470635685e-05, + "loss": 1.8072, + "step": 10590 + }, + { + "epoch": 3.250767341927563, + "grad_norm": 0.32370296120643616, + "learning_rate": 7.885837576165478e-05, + "loss": 1.802, + "step": 10591 + }, + { + "epoch": 3.2510742786985882, + "grad_norm": 0.3506374955177307, + "learning_rate": 7.88543165317573e-05, + "loss": 1.7965, + "step": 10592 + }, + { + "epoch": 3.251381215469613, + "grad_norm": 0.39058688282966614, + "learning_rate": 7.885025701670457e-05, + "loss": 1.7987, + "step": 10593 + }, + { + "epoch": 3.2516881522406385, + "grad_norm": 0.3042154014110565, + "learning_rate": 7.884619721653669e-05, + "loss": 1.8345, + "step": 10594 + }, + { + "epoch": 3.251995089011664, + "grad_norm": 0.2249498963356018, + "learning_rate": 7.884213713129378e-05, + "loss": 1.7796, + "step": 10595 + }, + { + "epoch": 3.2523020257826887, + "grad_norm": 0.2701997458934784, + "learning_rate": 7.883807676101595e-05, + "loss": 1.8027, + "step": 10596 + }, + { + "epoch": 3.252608962553714, + "grad_norm": 0.2574785053730011, + "learning_rate": 7.883401610574336e-05, + "loss": 1.7878, + "step": 10597 + }, + { + "epoch": 3.252915899324739, + "grad_norm": 0.24964739382266998, + "learning_rate": 7.882995516551613e-05, + "loss": 1.7612, + "step": 10598 + }, + { + "epoch": 3.2532228360957642, + "grad_norm": 0.2519865930080414, + "learning_rate": 7.882589394037437e-05, + "loss": 1.7583, + "step": 10599 + }, + { + "epoch": 3.2535297728667896, + "grad_norm": 0.23174463212490082, + "learning_rate": 7.882183243035823e-05, + "loss": 1.7607, + "step": 10600 + }, + { + "epoch": 3.2538367096378145, + "grad_norm": 0.28103554248809814, + "learning_rate": 7.881777063550786e-05, + "loss": 1.904, + "step": 10601 + }, + { + 
"epoch": 3.25414364640884, + "grad_norm": 0.265677809715271, + "learning_rate": 7.881370855586339e-05, + "loss": 1.8169, + "step": 10602 + }, + { + "epoch": 3.254450583179865, + "grad_norm": 0.2539603114128113, + "learning_rate": 7.880964619146493e-05, + "loss": 1.8439, + "step": 10603 + }, + { + "epoch": 3.25475751995089, + "grad_norm": 0.2741886377334595, + "learning_rate": 7.88055835423527e-05, + "loss": 1.8737, + "step": 10604 + }, + { + "epoch": 3.2550644567219154, + "grad_norm": 0.27548348903656006, + "learning_rate": 7.88015206085668e-05, + "loss": 1.8385, + "step": 10605 + }, + { + "epoch": 3.2553713934929407, + "grad_norm": 0.2958502769470215, + "learning_rate": 7.879745739014739e-05, + "loss": 1.8603, + "step": 10606 + }, + { + "epoch": 3.2556783302639656, + "grad_norm": 0.2728644907474518, + "learning_rate": 7.879339388713462e-05, + "loss": 1.8, + "step": 10607 + }, + { + "epoch": 3.255985267034991, + "grad_norm": 0.28718289732933044, + "learning_rate": 7.878933009956866e-05, + "loss": 1.7803, + "step": 10608 + }, + { + "epoch": 3.256292203806016, + "grad_norm": 0.2989691197872162, + "learning_rate": 7.878526602748967e-05, + "loss": 1.8155, + "step": 10609 + }, + { + "epoch": 3.256599140577041, + "grad_norm": 0.24515527486801147, + "learning_rate": 7.87812016709378e-05, + "loss": 1.7623, + "step": 10610 + }, + { + "epoch": 3.2569060773480665, + "grad_norm": 0.29946041107177734, + "learning_rate": 7.877713702995324e-05, + "loss": 1.8097, + "step": 10611 + }, + { + "epoch": 3.2572130141190914, + "grad_norm": 0.2854483723640442, + "learning_rate": 7.877307210457613e-05, + "loss": 1.8088, + "step": 10612 + }, + { + "epoch": 3.2575199508901167, + "grad_norm": 0.27812930941581726, + "learning_rate": 7.876900689484668e-05, + "loss": 1.8151, + "step": 10613 + }, + { + "epoch": 3.2578268876611416, + "grad_norm": 0.2658015787601471, + "learning_rate": 7.876494140080503e-05, + "loss": 1.8314, + "step": 10614 + }, + { + "epoch": 3.258133824432167, + "grad_norm": 
0.28935661911964417, + "learning_rate": 7.876087562249137e-05, + "loss": 1.7948, + "step": 10615 + }, + { + "epoch": 3.2584407612031923, + "grad_norm": 0.27497121691703796, + "learning_rate": 7.875680955994587e-05, + "loss": 1.7964, + "step": 10616 + }, + { + "epoch": 3.258747697974217, + "grad_norm": 0.3313405513763428, + "learning_rate": 7.875274321320873e-05, + "loss": 1.8143, + "step": 10617 + }, + { + "epoch": 3.2590546347452425, + "grad_norm": 0.3217218816280365, + "learning_rate": 7.874867658232013e-05, + "loss": 1.7749, + "step": 10618 + }, + { + "epoch": 3.259361571516268, + "grad_norm": 0.25105544924736023, + "learning_rate": 7.874460966732025e-05, + "loss": 1.7834, + "step": 10619 + }, + { + "epoch": 3.2596685082872927, + "grad_norm": 0.2931382358074188, + "learning_rate": 7.874054246824931e-05, + "loss": 1.8252, + "step": 10620 + }, + { + "epoch": 3.259975445058318, + "grad_norm": 0.2803363502025604, + "learning_rate": 7.873647498514747e-05, + "loss": 1.7527, + "step": 10621 + }, + { + "epoch": 3.2602823818293434, + "grad_norm": 0.29857927560806274, + "learning_rate": 7.873240721805492e-05, + "loss": 1.8085, + "step": 10622 + }, + { + "epoch": 3.2605893186003683, + "grad_norm": 0.24864110350608826, + "learning_rate": 7.872833916701192e-05, + "loss": 1.7509, + "step": 10623 + }, + { + "epoch": 3.2608962553713936, + "grad_norm": 0.24105949699878693, + "learning_rate": 7.872427083205862e-05, + "loss": 1.7871, + "step": 10624 + }, + { + "epoch": 3.2612031921424185, + "grad_norm": 0.2429245114326477, + "learning_rate": 7.872020221323523e-05, + "loss": 1.777, + "step": 10625 + }, + { + "epoch": 3.261510128913444, + "grad_norm": 0.234287828207016, + "learning_rate": 7.871613331058197e-05, + "loss": 1.8001, + "step": 10626 + }, + { + "epoch": 3.261817065684469, + "grad_norm": 0.3463406264781952, + "learning_rate": 7.871206412413905e-05, + "loss": 1.8925, + "step": 10627 + }, + { + "epoch": 3.262124002455494, + "grad_norm": 0.26798921823501587, + 
"learning_rate": 7.87079946539467e-05, + "loss": 1.7963, + "step": 10628 + }, + { + "epoch": 3.2624309392265194, + "grad_norm": 0.28603312373161316, + "learning_rate": 7.87039249000451e-05, + "loss": 1.8308, + "step": 10629 + }, + { + "epoch": 3.2627378759975443, + "grad_norm": 0.2717527747154236, + "learning_rate": 7.86998548624745e-05, + "loss": 1.8246, + "step": 10630 + }, + { + "epoch": 3.2630448127685696, + "grad_norm": 0.32215580344200134, + "learning_rate": 7.86957845412751e-05, + "loss": 1.7278, + "step": 10631 + }, + { + "epoch": 3.263351749539595, + "grad_norm": 0.3578735589981079, + "learning_rate": 7.869171393648717e-05, + "loss": 1.7288, + "step": 10632 + }, + { + "epoch": 3.26365868631062, + "grad_norm": 0.3120707869529724, + "learning_rate": 7.868764304815089e-05, + "loss": 1.7971, + "step": 10633 + }, + { + "epoch": 3.263965623081645, + "grad_norm": 0.27419236302375793, + "learning_rate": 7.86835718763065e-05, + "loss": 1.8529, + "step": 10634 + }, + { + "epoch": 3.2642725598526705, + "grad_norm": 0.3200531601905823, + "learning_rate": 7.867950042099423e-05, + "loss": 1.7892, + "step": 10635 + }, + { + "epoch": 3.2645794966236954, + "grad_norm": 0.325706422328949, + "learning_rate": 7.867542868225435e-05, + "loss": 1.8236, + "step": 10636 + }, + { + "epoch": 3.2648864333947207, + "grad_norm": 0.2950136065483093, + "learning_rate": 7.867135666012707e-05, + "loss": 1.8163, + "step": 10637 + }, + { + "epoch": 3.265193370165746, + "grad_norm": 0.2772117257118225, + "learning_rate": 7.866728435465263e-05, + "loss": 1.8373, + "step": 10638 + }, + { + "epoch": 3.265500306936771, + "grad_norm": 0.2887401580810547, + "learning_rate": 7.866321176587129e-05, + "loss": 1.7756, + "step": 10639 + }, + { + "epoch": 3.2658072437077963, + "grad_norm": 0.3474489152431488, + "learning_rate": 7.865913889382329e-05, + "loss": 1.7539, + "step": 10640 + }, + { + "epoch": 3.266114180478821, + "grad_norm": 0.3433493971824646, + "learning_rate": 7.865506573854888e-05, + 
"loss": 1.7987, + "step": 10641 + }, + { + "epoch": 3.2664211172498465, + "grad_norm": 0.3075394630432129, + "learning_rate": 7.865099230008832e-05, + "loss": 1.7907, + "step": 10642 + }, + { + "epoch": 3.266728054020872, + "grad_norm": 0.24817697703838348, + "learning_rate": 7.864691857848187e-05, + "loss": 1.7941, + "step": 10643 + }, + { + "epoch": 3.2670349907918967, + "grad_norm": 0.290147602558136, + "learning_rate": 7.864284457376976e-05, + "loss": 1.9125, + "step": 10644 + }, + { + "epoch": 3.267341927562922, + "grad_norm": 0.253684937953949, + "learning_rate": 7.863877028599229e-05, + "loss": 1.8084, + "step": 10645 + }, + { + "epoch": 3.267648864333947, + "grad_norm": 0.26349252462387085, + "learning_rate": 7.863469571518969e-05, + "loss": 1.7548, + "step": 10646 + }, + { + "epoch": 3.2679558011049723, + "grad_norm": 0.30568864941596985, + "learning_rate": 7.863062086140224e-05, + "loss": 1.8551, + "step": 10647 + }, + { + "epoch": 3.2682627378759976, + "grad_norm": 0.2866690456867218, + "learning_rate": 7.862654572467024e-05, + "loss": 1.8145, + "step": 10648 + }, + { + "epoch": 3.2685696746470225, + "grad_norm": 0.32022854685783386, + "learning_rate": 7.862247030503391e-05, + "loss": 1.896, + "step": 10649 + }, + { + "epoch": 3.268876611418048, + "grad_norm": 0.25260284543037415, + "learning_rate": 7.861839460253356e-05, + "loss": 1.814, + "step": 10650 + }, + { + "epoch": 3.269183548189073, + "grad_norm": 0.26776066422462463, + "learning_rate": 7.861431861720947e-05, + "loss": 1.7755, + "step": 10651 + }, + { + "epoch": 3.269490484960098, + "grad_norm": 0.26514193415641785, + "learning_rate": 7.861024234910191e-05, + "loss": 1.7606, + "step": 10652 + }, + { + "epoch": 3.2697974217311234, + "grad_norm": 0.27213940024375916, + "learning_rate": 7.860616579825116e-05, + "loss": 1.8074, + "step": 10653 + }, + { + "epoch": 3.2701043585021488, + "grad_norm": 0.29192888736724854, + "learning_rate": 7.860208896469752e-05, + "loss": 1.8436, + "step": 10654 + }, 
+ { + "epoch": 3.2704112952731736, + "grad_norm": 0.3772370219230652, + "learning_rate": 7.859801184848127e-05, + "loss": 1.8096, + "step": 10655 + }, + { + "epoch": 3.270718232044199, + "grad_norm": 0.4574970006942749, + "learning_rate": 7.859393444964269e-05, + "loss": 1.7612, + "step": 10656 + }, + { + "epoch": 3.271025168815224, + "grad_norm": 0.4614393413066864, + "learning_rate": 7.858985676822211e-05, + "loss": 1.8529, + "step": 10657 + }, + { + "epoch": 3.271332105586249, + "grad_norm": 0.33567267656326294, + "learning_rate": 7.85857788042598e-05, + "loss": 1.8391, + "step": 10658 + }, + { + "epoch": 3.2716390423572745, + "grad_norm": 0.2564064860343933, + "learning_rate": 7.858170055779609e-05, + "loss": 1.7621, + "step": 10659 + }, + { + "epoch": 3.2719459791282994, + "grad_norm": 0.26769882440567017, + "learning_rate": 7.857762202887122e-05, + "loss": 1.8145, + "step": 10660 + }, + { + "epoch": 3.2722529158993248, + "grad_norm": 0.262008935213089, + "learning_rate": 7.857354321752558e-05, + "loss": 1.7513, + "step": 10661 + }, + { + "epoch": 3.27255985267035, + "grad_norm": 0.26494377851486206, + "learning_rate": 7.856946412379942e-05, + "loss": 1.8071, + "step": 10662 + }, + { + "epoch": 3.272866789441375, + "grad_norm": 0.25613999366760254, + "learning_rate": 7.856538474773307e-05, + "loss": 1.8775, + "step": 10663 + }, + { + "epoch": 3.2731737262124003, + "grad_norm": 0.24789929389953613, + "learning_rate": 7.856130508936684e-05, + "loss": 1.8055, + "step": 10664 + }, + { + "epoch": 3.2734806629834257, + "grad_norm": 0.29111939668655396, + "learning_rate": 7.855722514874107e-05, + "loss": 1.8114, + "step": 10665 + }, + { + "epoch": 3.2737875997544506, + "grad_norm": 0.30511030554771423, + "learning_rate": 7.855314492589605e-05, + "loss": 1.8131, + "step": 10666 + }, + { + "epoch": 3.274094536525476, + "grad_norm": 0.2545989453792572, + "learning_rate": 7.854906442087212e-05, + "loss": 1.7933, + "step": 10667 + }, + { + "epoch": 3.2744014732965008, + 
"grad_norm": 0.26684823632240295, + "learning_rate": 7.85449836337096e-05, + "loss": 1.7604, + "step": 10668 + }, + { + "epoch": 3.274708410067526, + "grad_norm": 0.5097808837890625, + "learning_rate": 7.854090256444881e-05, + "loss": 1.777, + "step": 10669 + }, + { + "epoch": 3.2750153468385514, + "grad_norm": 0.27828142046928406, + "learning_rate": 7.853682121313011e-05, + "loss": 1.7885, + "step": 10670 + }, + { + "epoch": 3.2753222836095763, + "grad_norm": 0.2925552725791931, + "learning_rate": 7.853273957979381e-05, + "loss": 1.7962, + "step": 10671 + }, + { + "epoch": 3.2756292203806017, + "grad_norm": 0.284574955701828, + "learning_rate": 7.852865766448025e-05, + "loss": 1.8645, + "step": 10672 + }, + { + "epoch": 3.2759361571516266, + "grad_norm": 0.23407664895057678, + "learning_rate": 7.85245754672298e-05, + "loss": 1.7106, + "step": 10673 + }, + { + "epoch": 3.276243093922652, + "grad_norm": 0.2555919885635376, + "learning_rate": 7.852049298808274e-05, + "loss": 1.8237, + "step": 10674 + }, + { + "epoch": 3.2765500306936772, + "grad_norm": 0.26703694462776184, + "learning_rate": 7.851641022707947e-05, + "loss": 1.7844, + "step": 10675 + }, + { + "epoch": 3.276856967464702, + "grad_norm": 0.24889135360717773, + "learning_rate": 7.851232718426033e-05, + "loss": 1.7783, + "step": 10676 + }, + { + "epoch": 3.2771639042357275, + "grad_norm": 0.25770726799964905, + "learning_rate": 7.850824385966564e-05, + "loss": 1.8007, + "step": 10677 + }, + { + "epoch": 3.277470841006753, + "grad_norm": 0.31806984543800354, + "learning_rate": 7.850416025333578e-05, + "loss": 1.8623, + "step": 10678 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.2906930148601532, + "learning_rate": 7.850007636531111e-05, + "loss": 1.8315, + "step": 10679 + }, + { + "epoch": 3.278084714548803, + "grad_norm": 0.2802525460720062, + "learning_rate": 7.849599219563197e-05, + "loss": 1.8488, + "step": 10680 + }, + { + "epoch": 3.2783916513198283, + "grad_norm": 0.26150405406951904, + 
"learning_rate": 7.849190774433874e-05, + "loss": 1.7967, + "step": 10681 + }, + { + "epoch": 3.2786985880908532, + "grad_norm": 0.25863370299339294, + "learning_rate": 7.848782301147178e-05, + "loss": 1.864, + "step": 10682 + }, + { + "epoch": 3.2790055248618786, + "grad_norm": 0.25381043553352356, + "learning_rate": 7.848373799707145e-05, + "loss": 1.8239, + "step": 10683 + }, + { + "epoch": 3.2793124616329035, + "grad_norm": 0.2583387792110443, + "learning_rate": 7.847965270117814e-05, + "loss": 1.8449, + "step": 10684 + }, + { + "epoch": 3.279619398403929, + "grad_norm": 0.30759841203689575, + "learning_rate": 7.84755671238322e-05, + "loss": 1.7992, + "step": 10685 + }, + { + "epoch": 3.279926335174954, + "grad_norm": 0.4316023588180542, + "learning_rate": 7.847148126507402e-05, + "loss": 1.7912, + "step": 10686 + }, + { + "epoch": 3.280233271945979, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.846739512494396e-05, + "loss": 1.8831, + "step": 10687 + }, + { + "epoch": 3.2805402087170044, + "grad_norm": 0.318934828042984, + "learning_rate": 7.846330870348244e-05, + "loss": 1.8411, + "step": 10688 + }, + { + "epoch": 3.2808471454880292, + "grad_norm": 0.27755632996559143, + "learning_rate": 7.84592220007298e-05, + "loss": 1.8763, + "step": 10689 + }, + { + "epoch": 3.2811540822590546, + "grad_norm": 0.33544883131980896, + "learning_rate": 7.845513501672646e-05, + "loss": 1.731, + "step": 10690 + }, + { + "epoch": 3.28146101903008, + "grad_norm": 0.28299057483673096, + "learning_rate": 7.845104775151278e-05, + "loss": 1.813, + "step": 10691 + }, + { + "epoch": 3.281767955801105, + "grad_norm": 0.2761382460594177, + "learning_rate": 7.844696020512918e-05, + "loss": 1.8018, + "step": 10692 + }, + { + "epoch": 3.28207489257213, + "grad_norm": 0.2919033169746399, + "learning_rate": 7.844287237761605e-05, + "loss": 1.793, + "step": 10693 + }, + { + "epoch": 3.2823818293431555, + "grad_norm": 0.32922014594078064, + "learning_rate": 7.843878426901378e-05, + 
"loss": 1.8186, + "step": 10694 + }, + { + "epoch": 3.2826887661141804, + "grad_norm": 0.2818562090396881, + "learning_rate": 7.843469587936279e-05, + "loss": 1.7794, + "step": 10695 + }, + { + "epoch": 3.2829957028852057, + "grad_norm": 0.26414254307746887, + "learning_rate": 7.843060720870345e-05, + "loss": 1.7854, + "step": 10696 + }, + { + "epoch": 3.283302639656231, + "grad_norm": 0.28345760703086853, + "learning_rate": 7.842651825707618e-05, + "loss": 1.7659, + "step": 10697 + }, + { + "epoch": 3.283609576427256, + "grad_norm": 0.3522340655326843, + "learning_rate": 7.842242902452141e-05, + "loss": 1.8427, + "step": 10698 + }, + { + "epoch": 3.2839165131982813, + "grad_norm": 0.2861590087413788, + "learning_rate": 7.841833951107954e-05, + "loss": 1.7539, + "step": 10699 + }, + { + "epoch": 3.284223449969306, + "grad_norm": 0.2596624493598938, + "learning_rate": 7.841424971679099e-05, + "loss": 1.8407, + "step": 10700 + }, + { + "epoch": 3.2845303867403315, + "grad_norm": 0.2847718298435211, + "learning_rate": 7.841015964169616e-05, + "loss": 1.8085, + "step": 10701 + }, + { + "epoch": 3.284837323511357, + "grad_norm": 0.29566115140914917, + "learning_rate": 7.840606928583547e-05, + "loss": 1.7873, + "step": 10702 + }, + { + "epoch": 3.2851442602823817, + "grad_norm": 0.2752111256122589, + "learning_rate": 7.840197864924936e-05, + "loss": 1.8186, + "step": 10703 + }, + { + "epoch": 3.285451197053407, + "grad_norm": 0.2907958924770355, + "learning_rate": 7.839788773197826e-05, + "loss": 1.8081, + "step": 10704 + }, + { + "epoch": 3.285758133824432, + "grad_norm": 0.25808724761009216, + "learning_rate": 7.839379653406258e-05, + "loss": 1.7635, + "step": 10705 + }, + { + "epoch": 3.2860650705954573, + "grad_norm": 0.2732730507850647, + "learning_rate": 7.838970505554277e-05, + "loss": 1.8061, + "step": 10706 + }, + { + "epoch": 3.2863720073664826, + "grad_norm": 0.23820067942142487, + "learning_rate": 7.838561329645923e-05, + "loss": 1.8091, + "step": 10707 + }, 
+ { + "epoch": 3.2866789441375075, + "grad_norm": 0.24179396033287048, + "learning_rate": 7.838152125685245e-05, + "loss": 1.7513, + "step": 10708 + }, + { + "epoch": 3.286985880908533, + "grad_norm": 0.2627546787261963, + "learning_rate": 7.837742893676283e-05, + "loss": 1.8741, + "step": 10709 + }, + { + "epoch": 3.287292817679558, + "grad_norm": 0.2827817499637604, + "learning_rate": 7.837333633623083e-05, + "loss": 1.8387, + "step": 10710 + }, + { + "epoch": 3.287599754450583, + "grad_norm": 0.2666749060153961, + "learning_rate": 7.836924345529688e-05, + "loss": 1.8319, + "step": 10711 + }, + { + "epoch": 3.2879066912216084, + "grad_norm": 0.3403390944004059, + "learning_rate": 7.836515029400145e-05, + "loss": 1.7827, + "step": 10712 + }, + { + "epoch": 3.2882136279926337, + "grad_norm": 0.30646705627441406, + "learning_rate": 7.836105685238497e-05, + "loss": 1.8612, + "step": 10713 + }, + { + "epoch": 3.2885205647636586, + "grad_norm": 0.2580253481864929, + "learning_rate": 7.83569631304879e-05, + "loss": 1.7332, + "step": 10714 + }, + { + "epoch": 3.288827501534684, + "grad_norm": 0.23734542727470398, + "learning_rate": 7.835286912835071e-05, + "loss": 1.7899, + "step": 10715 + }, + { + "epoch": 3.289134438305709, + "grad_norm": 0.2457810491323471, + "learning_rate": 7.834877484601384e-05, + "loss": 1.8059, + "step": 10716 + }, + { + "epoch": 3.289441375076734, + "grad_norm": 0.2558443248271942, + "learning_rate": 7.834468028351778e-05, + "loss": 1.8689, + "step": 10717 + }, + { + "epoch": 3.2897483118477595, + "grad_norm": 0.26596710085868835, + "learning_rate": 7.834058544090298e-05, + "loss": 1.816, + "step": 10718 + }, + { + "epoch": 3.2900552486187844, + "grad_norm": 0.25424903631210327, + "learning_rate": 7.833649031820987e-05, + "loss": 1.7907, + "step": 10719 + }, + { + "epoch": 3.2903621853898097, + "grad_norm": 0.23873139917850494, + "learning_rate": 7.833239491547896e-05, + "loss": 1.7666, + "step": 10720 + }, + { + "epoch": 3.2906691221608346, + 
"grad_norm": 0.23292972147464752, + "learning_rate": 7.832829923275073e-05, + "loss": 1.7674, + "step": 10721 + }, + { + "epoch": 3.29097605893186, + "grad_norm": 0.30133312940597534, + "learning_rate": 7.832420327006566e-05, + "loss": 1.8229, + "step": 10722 + }, + { + "epoch": 3.2912829957028853, + "grad_norm": 0.2882522642612457, + "learning_rate": 7.83201070274642e-05, + "loss": 1.7855, + "step": 10723 + }, + { + "epoch": 3.29158993247391, + "grad_norm": 0.2578088045120239, + "learning_rate": 7.831601050498683e-05, + "loss": 1.7276, + "step": 10724 + }, + { + "epoch": 3.2918968692449355, + "grad_norm": 0.29511600732803345, + "learning_rate": 7.831191370267406e-05, + "loss": 1.8085, + "step": 10725 + }, + { + "epoch": 3.292203806015961, + "grad_norm": 0.29557499289512634, + "learning_rate": 7.830781662056634e-05, + "loss": 1.815, + "step": 10726 + }, + { + "epoch": 3.2925107427869857, + "grad_norm": 0.32722121477127075, + "learning_rate": 7.830371925870422e-05, + "loss": 1.7889, + "step": 10727 + }, + { + "epoch": 3.292817679558011, + "grad_norm": 0.3124488592147827, + "learning_rate": 7.829962161712814e-05, + "loss": 1.8063, + "step": 10728 + }, + { + "epoch": 3.2931246163290364, + "grad_norm": 0.311334490776062, + "learning_rate": 7.829552369587861e-05, + "loss": 1.8852, + "step": 10729 + }, + { + "epoch": 3.2934315531000613, + "grad_norm": 0.28010860085487366, + "learning_rate": 7.829142549499613e-05, + "loss": 1.8274, + "step": 10730 + }, + { + "epoch": 3.2937384898710866, + "grad_norm": 0.3453529477119446, + "learning_rate": 7.828732701452119e-05, + "loss": 1.8618, + "step": 10731 + }, + { + "epoch": 3.2940454266421115, + "grad_norm": 0.2946802079677582, + "learning_rate": 7.828322825449432e-05, + "loss": 1.7123, + "step": 10732 + }, + { + "epoch": 3.294352363413137, + "grad_norm": 0.2467648684978485, + "learning_rate": 7.827912921495601e-05, + "loss": 1.7786, + "step": 10733 + }, + { + "epoch": 3.294659300184162, + "grad_norm": 0.2957034707069397, + 
"learning_rate": 7.827502989594677e-05, + "loss": 1.7817, + "step": 10734 + }, + { + "epoch": 3.294966236955187, + "grad_norm": 0.300905704498291, + "learning_rate": 7.827093029750713e-05, + "loss": 1.7582, + "step": 10735 + }, + { + "epoch": 3.2952731737262124, + "grad_norm": 0.28935131430625916, + "learning_rate": 7.826683041967757e-05, + "loss": 1.7766, + "step": 10736 + }, + { + "epoch": 3.2955801104972378, + "grad_norm": 0.26046010851860046, + "learning_rate": 7.826273026249861e-05, + "loss": 1.8152, + "step": 10737 + }, + { + "epoch": 3.2958870472682626, + "grad_norm": 0.24247924983501434, + "learning_rate": 7.82586298260108e-05, + "loss": 1.8679, + "step": 10738 + }, + { + "epoch": 3.296193984039288, + "grad_norm": 0.25977620482444763, + "learning_rate": 7.825452911025466e-05, + "loss": 1.8108, + "step": 10739 + }, + { + "epoch": 3.2965009208103133, + "grad_norm": 0.2732592821121216, + "learning_rate": 7.825042811527068e-05, + "loss": 1.7355, + "step": 10740 + }, + { + "epoch": 3.296807857581338, + "grad_norm": 0.38407859206199646, + "learning_rate": 7.824632684109941e-05, + "loss": 1.8418, + "step": 10741 + }, + { + "epoch": 3.2971147943523635, + "grad_norm": 0.4239252805709839, + "learning_rate": 7.82422252877814e-05, + "loss": 1.7655, + "step": 10742 + }, + { + "epoch": 3.2974217311233884, + "grad_norm": 0.3810526132583618, + "learning_rate": 7.823812345535716e-05, + "loss": 1.8804, + "step": 10743 + }, + { + "epoch": 3.2977286678944138, + "grad_norm": 0.29939520359039307, + "learning_rate": 7.823402134386722e-05, + "loss": 1.8207, + "step": 10744 + }, + { + "epoch": 3.298035604665439, + "grad_norm": 0.4053972065448761, + "learning_rate": 7.822991895335215e-05, + "loss": 1.7901, + "step": 10745 + }, + { + "epoch": 3.298342541436464, + "grad_norm": 0.4975005090236664, + "learning_rate": 7.822581628385247e-05, + "loss": 1.8344, + "step": 10746 + }, + { + "epoch": 3.2986494782074893, + "grad_norm": 0.4100436270236969, + "learning_rate": 
7.822171333540874e-05, + "loss": 1.7891, + "step": 10747 + }, + { + "epoch": 3.298956414978514, + "grad_norm": 0.2817644476890564, + "learning_rate": 7.821761010806147e-05, + "loss": 1.7895, + "step": 10748 + }, + { + "epoch": 3.2992633517495396, + "grad_norm": 0.332660973072052, + "learning_rate": 7.821350660185125e-05, + "loss": 1.7281, + "step": 10749 + }, + { + "epoch": 3.299570288520565, + "grad_norm": 0.42652732133865356, + "learning_rate": 7.820940281681863e-05, + "loss": 1.7855, + "step": 10750 + }, + { + "epoch": 3.2998772252915898, + "grad_norm": 0.35700714588165283, + "learning_rate": 7.820529875300415e-05, + "loss": 1.8722, + "step": 10751 + }, + { + "epoch": 3.300184162062615, + "grad_norm": 0.25305211544036865, + "learning_rate": 7.820119441044838e-05, + "loss": 1.7696, + "step": 10752 + }, + { + "epoch": 3.3004910988336404, + "grad_norm": 0.280205637216568, + "learning_rate": 7.819708978919188e-05, + "loss": 1.756, + "step": 10753 + }, + { + "epoch": 3.3007980356046653, + "grad_norm": 0.4176226854324341, + "learning_rate": 7.819298488927521e-05, + "loss": 1.7731, + "step": 10754 + }, + { + "epoch": 3.3011049723756907, + "grad_norm": 0.4264865517616272, + "learning_rate": 7.818887971073894e-05, + "loss": 1.7851, + "step": 10755 + }, + { + "epoch": 3.301411909146716, + "grad_norm": 0.2901221215724945, + "learning_rate": 7.818477425362363e-05, + "loss": 1.7356, + "step": 10756 + }, + { + "epoch": 3.301718845917741, + "grad_norm": 0.29583361744880676, + "learning_rate": 7.818066851796986e-05, + "loss": 1.8269, + "step": 10757 + }, + { + "epoch": 3.3020257826887662, + "grad_norm": 0.38592997193336487, + "learning_rate": 7.817656250381821e-05, + "loss": 1.7515, + "step": 10758 + }, + { + "epoch": 3.302332719459791, + "grad_norm": 0.29301533102989197, + "learning_rate": 7.817245621120927e-05, + "loss": 1.7955, + "step": 10759 + }, + { + "epoch": 3.3026396562308165, + "grad_norm": 0.2770880162715912, + "learning_rate": 7.816834964018359e-05, + "loss": 
1.7899, + "step": 10760 + }, + { + "epoch": 3.302946593001842, + "grad_norm": 0.32566413283348083, + "learning_rate": 7.816424279078176e-05, + "loss": 1.74, + "step": 10761 + }, + { + "epoch": 3.3032535297728667, + "grad_norm": 0.3077750504016876, + "learning_rate": 7.81601356630444e-05, + "loss": 1.8123, + "step": 10762 + }, + { + "epoch": 3.303560466543892, + "grad_norm": 0.2826370298862457, + "learning_rate": 7.815602825701206e-05, + "loss": 1.865, + "step": 10763 + }, + { + "epoch": 3.303867403314917, + "grad_norm": 0.31700822710990906, + "learning_rate": 7.815192057272534e-05, + "loss": 1.8021, + "step": 10764 + }, + { + "epoch": 3.3041743400859422, + "grad_norm": 0.33182790875434875, + "learning_rate": 7.814781261022486e-05, + "loss": 1.818, + "step": 10765 + }, + { + "epoch": 3.3044812768569676, + "grad_norm": 0.2720039486885071, + "learning_rate": 7.814370436955118e-05, + "loss": 1.8369, + "step": 10766 + }, + { + "epoch": 3.3047882136279925, + "grad_norm": 0.28134068846702576, + "learning_rate": 7.813959585074493e-05, + "loss": 1.8391, + "step": 10767 + }, + { + "epoch": 3.305095150399018, + "grad_norm": 0.25748828053474426, + "learning_rate": 7.813548705384667e-05, + "loss": 1.7987, + "step": 10768 + }, + { + "epoch": 3.305402087170043, + "grad_norm": 0.26187625527381897, + "learning_rate": 7.813137797889708e-05, + "loss": 1.7645, + "step": 10769 + }, + { + "epoch": 3.305709023941068, + "grad_norm": 0.297262579202652, + "learning_rate": 7.812726862593671e-05, + "loss": 1.771, + "step": 10770 + }, + { + "epoch": 3.3060159607120934, + "grad_norm": 0.2987872064113617, + "learning_rate": 7.812315899500618e-05, + "loss": 1.8115, + "step": 10771 + }, + { + "epoch": 3.3063228974831187, + "grad_norm": 0.31963878870010376, + "learning_rate": 7.81190490861461e-05, + "loss": 1.7685, + "step": 10772 + }, + { + "epoch": 3.3066298342541436, + "grad_norm": 0.27007177472114563, + "learning_rate": 7.81149388993971e-05, + "loss": 1.8272, + "step": 10773 + }, + { + "epoch": 
3.306936771025169, + "grad_norm": 0.26818498969078064, + "learning_rate": 7.811082843479981e-05, + "loss": 1.7894, + "step": 10774 + }, + { + "epoch": 3.307243707796194, + "grad_norm": 0.28857091069221497, + "learning_rate": 7.810671769239483e-05, + "loss": 1.8769, + "step": 10775 + }, + { + "epoch": 3.307550644567219, + "grad_norm": 0.26983144879341125, + "learning_rate": 7.810260667222277e-05, + "loss": 1.796, + "step": 10776 + }, + { + "epoch": 3.3078575813382445, + "grad_norm": 0.2566467225551605, + "learning_rate": 7.809849537432432e-05, + "loss": 1.848, + "step": 10777 + }, + { + "epoch": 3.3081645181092694, + "grad_norm": 0.25607848167419434, + "learning_rate": 7.809438379874005e-05, + "loss": 1.8072, + "step": 10778 + }, + { + "epoch": 3.3084714548802947, + "grad_norm": 0.29158470034599304, + "learning_rate": 7.809027194551059e-05, + "loss": 1.7772, + "step": 10779 + }, + { + "epoch": 3.3087783916513196, + "grad_norm": 0.360897421836853, + "learning_rate": 7.808615981467664e-05, + "loss": 1.8404, + "step": 10780 + }, + { + "epoch": 3.309085328422345, + "grad_norm": 0.31121253967285156, + "learning_rate": 7.808204740627877e-05, + "loss": 1.8137, + "step": 10781 + }, + { + "epoch": 3.3093922651933703, + "grad_norm": 0.2846451699733734, + "learning_rate": 7.807793472035765e-05, + "loss": 1.8367, + "step": 10782 + }, + { + "epoch": 3.309699201964395, + "grad_norm": 0.2711004316806793, + "learning_rate": 7.807382175695393e-05, + "loss": 1.7728, + "step": 10783 + }, + { + "epoch": 3.3100061387354205, + "grad_norm": 0.2693859338760376, + "learning_rate": 7.806970851610824e-05, + "loss": 1.7026, + "step": 10784 + }, + { + "epoch": 3.310313075506446, + "grad_norm": 0.3050517439842224, + "learning_rate": 7.806559499786125e-05, + "loss": 1.8041, + "step": 10785 + }, + { + "epoch": 3.3106200122774707, + "grad_norm": 0.27304747700691223, + "learning_rate": 7.80614812022536e-05, + "loss": 1.8182, + "step": 10786 + }, + { + "epoch": 3.310926949048496, + "grad_norm": 
0.28378555178642273, + "learning_rate": 7.805736712932594e-05, + "loss": 1.8519, + "step": 10787 + }, + { + "epoch": 3.3112338858195214, + "grad_norm": 0.30620133876800537, + "learning_rate": 7.805325277911892e-05, + "loss": 1.8594, + "step": 10788 + }, + { + "epoch": 3.3115408225905463, + "grad_norm": 0.2580169141292572, + "learning_rate": 7.804913815167325e-05, + "loss": 1.7897, + "step": 10789 + }, + { + "epoch": 3.3118477593615716, + "grad_norm": 0.28937023878097534, + "learning_rate": 7.804502324702951e-05, + "loss": 1.8362, + "step": 10790 + }, + { + "epoch": 3.3121546961325965, + "grad_norm": 0.28032705187797546, + "learning_rate": 7.804090806522844e-05, + "loss": 1.8168, + "step": 10791 + }, + { + "epoch": 3.312461632903622, + "grad_norm": 0.33712559938430786, + "learning_rate": 7.803679260631069e-05, + "loss": 1.7489, + "step": 10792 + }, + { + "epoch": 3.312768569674647, + "grad_norm": 0.40536820888519287, + "learning_rate": 7.80326768703169e-05, + "loss": 1.8413, + "step": 10793 + }, + { + "epoch": 3.313075506445672, + "grad_norm": 0.34967559576034546, + "learning_rate": 7.802856085728778e-05, + "loss": 1.8076, + "step": 10794 + }, + { + "epoch": 3.3133824432166974, + "grad_norm": 0.2429870367050171, + "learning_rate": 7.8024444567264e-05, + "loss": 1.8002, + "step": 10795 + }, + { + "epoch": 3.3136893799877223, + "grad_norm": 0.40956684947013855, + "learning_rate": 7.802032800028621e-05, + "loss": 1.8151, + "step": 10796 + }, + { + "epoch": 3.3139963167587476, + "grad_norm": 0.4908781945705414, + "learning_rate": 7.801621115639512e-05, + "loss": 1.8124, + "step": 10797 + }, + { + "epoch": 3.314303253529773, + "grad_norm": 0.3922197222709656, + "learning_rate": 7.801209403563143e-05, + "loss": 1.7911, + "step": 10798 + }, + { + "epoch": 3.314610190300798, + "grad_norm": 0.29467105865478516, + "learning_rate": 7.800797663803578e-05, + "loss": 1.8472, + "step": 10799 + }, + { + "epoch": 3.314917127071823, + "grad_norm": 0.384974867105484, + 
"learning_rate": 7.800385896364891e-05, + "loss": 1.8139, + "step": 10800 + }, + { + "epoch": 3.3152240638428485, + "grad_norm": 0.4605129063129425, + "learning_rate": 7.79997410125115e-05, + "loss": 1.7982, + "step": 10801 + }, + { + "epoch": 3.3155310006138734, + "grad_norm": 0.2982464134693146, + "learning_rate": 7.799562278466423e-05, + "loss": 1.8496, + "step": 10802 + }, + { + "epoch": 3.3158379373848987, + "grad_norm": 0.3101392984390259, + "learning_rate": 7.79915042801478e-05, + "loss": 1.8172, + "step": 10803 + }, + { + "epoch": 3.316144874155924, + "grad_norm": 0.3651282489299774, + "learning_rate": 7.798738549900292e-05, + "loss": 1.7497, + "step": 10804 + }, + { + "epoch": 3.316451810926949, + "grad_norm": 0.28504419326782227, + "learning_rate": 7.79832664412703e-05, + "loss": 1.8027, + "step": 10805 + }, + { + "epoch": 3.3167587476979743, + "grad_norm": 0.28333309292793274, + "learning_rate": 7.797914710699063e-05, + "loss": 1.8121, + "step": 10806 + }, + { + "epoch": 3.317065684468999, + "grad_norm": 0.37549784779548645, + "learning_rate": 7.797502749620462e-05, + "loss": 1.817, + "step": 10807 + }, + { + "epoch": 3.3173726212400245, + "grad_norm": 0.3864210844039917, + "learning_rate": 7.797090760895301e-05, + "loss": 1.852, + "step": 10808 + }, + { + "epoch": 3.31767955801105, + "grad_norm": 0.2422102987766266, + "learning_rate": 7.79667874452765e-05, + "loss": 1.7523, + "step": 10809 + }, + { + "epoch": 3.3179864947820747, + "grad_norm": 0.307892382144928, + "learning_rate": 7.79626670052158e-05, + "loss": 1.7436, + "step": 10810 + }, + { + "epoch": 3.3182934315531, + "grad_norm": 0.29607462882995605, + "learning_rate": 7.795854628881162e-05, + "loss": 1.768, + "step": 10811 + }, + { + "epoch": 3.3186003683241254, + "grad_norm": 0.23334427177906036, + "learning_rate": 7.795442529610471e-05, + "loss": 1.7687, + "step": 10812 + }, + { + "epoch": 3.3189073050951503, + "grad_norm": 0.26257455348968506, + "learning_rate": 7.795030402713578e-05, + 
"loss": 1.8266, + "step": 10813 + }, + { + "epoch": 3.3192142418661756, + "grad_norm": 0.3252788782119751, + "learning_rate": 7.794618248194556e-05, + "loss": 1.8645, + "step": 10814 + }, + { + "epoch": 3.319521178637201, + "grad_norm": 0.3807232975959778, + "learning_rate": 7.79420606605748e-05, + "loss": 1.8154, + "step": 10815 + }, + { + "epoch": 3.319828115408226, + "grad_norm": 0.3395625948905945, + "learning_rate": 7.793793856306422e-05, + "loss": 1.8002, + "step": 10816 + }, + { + "epoch": 3.320135052179251, + "grad_norm": 0.2896415889263153, + "learning_rate": 7.793381618945455e-05, + "loss": 1.8077, + "step": 10817 + }, + { + "epoch": 3.320441988950276, + "grad_norm": 0.27733489871025085, + "learning_rate": 7.792969353978652e-05, + "loss": 1.7976, + "step": 10818 + }, + { + "epoch": 3.3207489257213014, + "grad_norm": 0.36985141038894653, + "learning_rate": 7.79255706141009e-05, + "loss": 1.8724, + "step": 10819 + }, + { + "epoch": 3.3210558624923268, + "grad_norm": 0.37886983156204224, + "learning_rate": 7.792144741243843e-05, + "loss": 1.8249, + "step": 10820 + }, + { + "epoch": 3.3213627992633517, + "grad_norm": 0.3030721843242645, + "learning_rate": 7.791732393483986e-05, + "loss": 1.7975, + "step": 10821 + }, + { + "epoch": 3.321669736034377, + "grad_norm": 0.2637709081172943, + "learning_rate": 7.791320018134592e-05, + "loss": 1.7205, + "step": 10822 + }, + { + "epoch": 3.321976672805402, + "grad_norm": 0.35307520627975464, + "learning_rate": 7.790907615199736e-05, + "loss": 1.8786, + "step": 10823 + }, + { + "epoch": 3.322283609576427, + "grad_norm": 0.3333272635936737, + "learning_rate": 7.790495184683497e-05, + "loss": 1.7715, + "step": 10824 + }, + { + "epoch": 3.3225905463474525, + "grad_norm": 0.2597469091415405, + "learning_rate": 7.790082726589948e-05, + "loss": 1.8379, + "step": 10825 + }, + { + "epoch": 3.3228974831184774, + "grad_norm": 0.34176257252693176, + "learning_rate": 7.789670240923168e-05, + "loss": 1.8305, + "step": 10826 + }, + { 
+ "epoch": 3.3232044198895028, + "grad_norm": 0.37954533100128174, + "learning_rate": 7.789257727687229e-05, + "loss": 1.7728, + "step": 10827 + }, + { + "epoch": 3.323511356660528, + "grad_norm": 0.2840248644351959, + "learning_rate": 7.788845186886212e-05, + "loss": 1.8059, + "step": 10828 + }, + { + "epoch": 3.323818293431553, + "grad_norm": 0.3650275766849518, + "learning_rate": 7.788432618524193e-05, + "loss": 1.8127, + "step": 10829 + }, + { + "epoch": 3.3241252302025783, + "grad_norm": 0.4869692623615265, + "learning_rate": 7.788020022605247e-05, + "loss": 1.833, + "step": 10830 + }, + { + "epoch": 3.3244321669736037, + "grad_norm": 0.3419482707977295, + "learning_rate": 7.787607399133453e-05, + "loss": 1.7812, + "step": 10831 + }, + { + "epoch": 3.3247391037446286, + "grad_norm": 0.27625617384910583, + "learning_rate": 7.787194748112889e-05, + "loss": 1.8513, + "step": 10832 + }, + { + "epoch": 3.325046040515654, + "grad_norm": 0.4287806749343872, + "learning_rate": 7.786782069547633e-05, + "loss": 1.836, + "step": 10833 + }, + { + "epoch": 3.325352977286679, + "grad_norm": 0.4345545172691345, + "learning_rate": 7.786369363441763e-05, + "loss": 1.8027, + "step": 10834 + }, + { + "epoch": 3.325659914057704, + "grad_norm": 0.32976534962654114, + "learning_rate": 7.78595662979936e-05, + "loss": 1.7987, + "step": 10835 + }, + { + "epoch": 3.3259668508287294, + "grad_norm": 0.2677469849586487, + "learning_rate": 7.785543868624498e-05, + "loss": 1.8312, + "step": 10836 + }, + { + "epoch": 3.3262737875997543, + "grad_norm": 0.2547740638256073, + "learning_rate": 7.785131079921259e-05, + "loss": 1.7844, + "step": 10837 + }, + { + "epoch": 3.3265807243707797, + "grad_norm": 0.26755592226982117, + "learning_rate": 7.784718263693725e-05, + "loss": 1.8263, + "step": 10838 + }, + { + "epoch": 3.3268876611418046, + "grad_norm": 0.23884403705596924, + "learning_rate": 7.784305419945969e-05, + "loss": 1.7862, + "step": 10839 + }, + { + "epoch": 3.32719459791283, + 
"grad_norm": 0.2896903157234192, + "learning_rate": 7.783892548682077e-05, + "loss": 1.9138, + "step": 10840 + }, + { + "epoch": 3.3275015346838552, + "grad_norm": 0.3201359510421753, + "learning_rate": 7.783479649906127e-05, + "loss": 1.8382, + "step": 10841 + }, + { + "epoch": 3.32780847145488, + "grad_norm": 0.39285311102867126, + "learning_rate": 7.7830667236222e-05, + "loss": 1.7763, + "step": 10842 + }, + { + "epoch": 3.3281154082259055, + "grad_norm": 0.435007244348526, + "learning_rate": 7.782653769834376e-05, + "loss": 1.8415, + "step": 10843 + }, + { + "epoch": 3.328422344996931, + "grad_norm": 0.34605318307876587, + "learning_rate": 7.782240788546736e-05, + "loss": 1.757, + "step": 10844 + }, + { + "epoch": 3.3287292817679557, + "grad_norm": 0.26830604672431946, + "learning_rate": 7.781827779763362e-05, + "loss": 1.7779, + "step": 10845 + }, + { + "epoch": 3.329036218538981, + "grad_norm": 0.41851529479026794, + "learning_rate": 7.781414743488336e-05, + "loss": 1.8609, + "step": 10846 + }, + { + "epoch": 3.3293431553100064, + "grad_norm": 0.5058079361915588, + "learning_rate": 7.78100167972574e-05, + "loss": 1.8146, + "step": 10847 + }, + { + "epoch": 3.3296500920810312, + "grad_norm": 0.34394967555999756, + "learning_rate": 7.780588588479654e-05, + "loss": 1.8079, + "step": 10848 + }, + { + "epoch": 3.3299570288520566, + "grad_norm": 0.3033885061740875, + "learning_rate": 7.780175469754161e-05, + "loss": 1.8223, + "step": 10849 + }, + { + "epoch": 3.3302639656230815, + "grad_norm": 0.4431045651435852, + "learning_rate": 7.779762323553347e-05, + "loss": 1.8841, + "step": 10850 + }, + { + "epoch": 3.330570902394107, + "grad_norm": 0.3451448976993561, + "learning_rate": 7.77934914988129e-05, + "loss": 1.8092, + "step": 10851 + }, + { + "epoch": 3.330877839165132, + "grad_norm": 0.26580891013145447, + "learning_rate": 7.778935948742077e-05, + "loss": 1.8244, + "step": 10852 + }, + { + "epoch": 3.331184775936157, + "grad_norm": 0.32079070806503296, + 
"learning_rate": 7.778522720139792e-05, + "loss": 1.7816, + "step": 10853 + }, + { + "epoch": 3.3314917127071824, + "grad_norm": 0.35789042711257935, + "learning_rate": 7.778109464078514e-05, + "loss": 1.8211, + "step": 10854 + }, + { + "epoch": 3.3317986494782073, + "grad_norm": 0.2808612585067749, + "learning_rate": 7.77769618056233e-05, + "loss": 1.8387, + "step": 10855 + }, + { + "epoch": 3.3321055862492326, + "grad_norm": 0.24760548770427704, + "learning_rate": 7.777282869595326e-05, + "loss": 1.7795, + "step": 10856 + }, + { + "epoch": 3.332412523020258, + "grad_norm": 0.2840912640094757, + "learning_rate": 7.776869531181583e-05, + "loss": 1.7492, + "step": 10857 + }, + { + "epoch": 3.332719459791283, + "grad_norm": 0.2881413698196411, + "learning_rate": 7.77645616532519e-05, + "loss": 1.8157, + "step": 10858 + }, + { + "epoch": 3.333026396562308, + "grad_norm": 0.2508779764175415, + "learning_rate": 7.776042772030228e-05, + "loss": 1.8196, + "step": 10859 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3307822048664093, + "learning_rate": 7.775629351300785e-05, + "loss": 1.8195, + "step": 10860 + }, + { + "epoch": 3.3336402701043584, + "grad_norm": 0.34392043948173523, + "learning_rate": 7.775215903140946e-05, + "loss": 1.7775, + "step": 10861 + }, + { + "epoch": 3.3339472068753837, + "grad_norm": 0.2594252824783325, + "learning_rate": 7.774802427554796e-05, + "loss": 1.7687, + "step": 10862 + }, + { + "epoch": 3.334254143646409, + "grad_norm": 0.3109053075313568, + "learning_rate": 7.774388924546423e-05, + "loss": 1.7908, + "step": 10863 + }, + { + "epoch": 3.334561080417434, + "grad_norm": 0.4801923930644989, + "learning_rate": 7.773975394119913e-05, + "loss": 1.8316, + "step": 10864 + }, + { + "epoch": 3.3348680171884593, + "grad_norm": 0.4754973351955414, + "learning_rate": 7.77356183627935e-05, + "loss": 1.8015, + "step": 10865 + }, + { + "epoch": 3.335174953959484, + "grad_norm": 0.29624658823013306, + "learning_rate": 7.773148251028825e-05, + 
"loss": 1.8179, + "step": 10866 + }, + { + "epoch": 3.3354818907305095, + "grad_norm": 0.32207581400871277, + "learning_rate": 7.772734638372423e-05, + "loss": 1.799, + "step": 10867 + }, + { + "epoch": 3.335788827501535, + "grad_norm": 0.5227517485618591, + "learning_rate": 7.772320998314233e-05, + "loss": 1.8452, + "step": 10868 + }, + { + "epoch": 3.3360957642725597, + "grad_norm": 0.4081100523471832, + "learning_rate": 7.771907330858341e-05, + "loss": 1.8182, + "step": 10869 + }, + { + "epoch": 3.336402701043585, + "grad_norm": 0.23786653578281403, + "learning_rate": 7.771493636008838e-05, + "loss": 1.7392, + "step": 10870 + }, + { + "epoch": 3.33670963781461, + "grad_norm": 0.37913820147514343, + "learning_rate": 7.771079913769807e-05, + "loss": 1.7559, + "step": 10871 + }, + { + "epoch": 3.3370165745856353, + "grad_norm": 0.4939163625240326, + "learning_rate": 7.770666164145344e-05, + "loss": 1.8076, + "step": 10872 + }, + { + "epoch": 3.3373235113566606, + "grad_norm": 0.3322528302669525, + "learning_rate": 7.770252387139532e-05, + "loss": 1.8045, + "step": 10873 + }, + { + "epoch": 3.337630448127686, + "grad_norm": 0.3685782849788666, + "learning_rate": 7.769838582756461e-05, + "loss": 1.7703, + "step": 10874 + }, + { + "epoch": 3.337937384898711, + "grad_norm": 0.5564271807670593, + "learning_rate": 7.769424751000224e-05, + "loss": 1.7697, + "step": 10875 + }, + { + "epoch": 3.338244321669736, + "grad_norm": 0.38610726594924927, + "learning_rate": 7.769010891874906e-05, + "loss": 1.7944, + "step": 10876 + }, + { + "epoch": 3.338551258440761, + "grad_norm": 0.23838558793067932, + "learning_rate": 7.768597005384602e-05, + "loss": 1.765, + "step": 10877 + }, + { + "epoch": 3.3388581952117864, + "grad_norm": 0.4334571063518524, + "learning_rate": 7.768183091533399e-05, + "loss": 1.7854, + "step": 10878 + }, + { + "epoch": 3.3391651319828117, + "grad_norm": 0.44844719767570496, + "learning_rate": 7.767769150325386e-05, + "loss": 1.7955, + "step": 10879 + }, + { 
+ "epoch": 3.3394720687538366, + "grad_norm": 0.26543378829956055, + "learning_rate": 7.767355181764659e-05, + "loss": 1.8311, + "step": 10880 + }, + { + "epoch": 3.339779005524862, + "grad_norm": 0.39401358366012573, + "learning_rate": 7.766941185855304e-05, + "loss": 1.8264, + "step": 10881 + }, + { + "epoch": 3.340085942295887, + "grad_norm": 0.5476824045181274, + "learning_rate": 7.766527162601416e-05, + "loss": 1.8051, + "step": 10882 + }, + { + "epoch": 3.340392879066912, + "grad_norm": 0.4021138548851013, + "learning_rate": 7.766113112007084e-05, + "loss": 1.7941, + "step": 10883 + }, + { + "epoch": 3.3406998158379375, + "grad_norm": 0.3262040317058563, + "learning_rate": 7.765699034076402e-05, + "loss": 1.8317, + "step": 10884 + }, + { + "epoch": 3.3410067526089624, + "grad_norm": 0.5461146831512451, + "learning_rate": 7.765284928813459e-05, + "loss": 1.833, + "step": 10885 + }, + { + "epoch": 3.3413136893799877, + "grad_norm": 0.5067405700683594, + "learning_rate": 7.764870796222351e-05, + "loss": 1.7862, + "step": 10886 + }, + { + "epoch": 3.341620626151013, + "grad_norm": 0.2731069028377533, + "learning_rate": 7.76445663630717e-05, + "loss": 1.8173, + "step": 10887 + }, + { + "epoch": 3.341927562922038, + "grad_norm": 0.48928195238113403, + "learning_rate": 7.764042449072008e-05, + "loss": 1.7992, + "step": 10888 + }, + { + "epoch": 3.3422344996930633, + "grad_norm": 0.5338504910469055, + "learning_rate": 7.763628234520958e-05, + "loss": 1.7891, + "step": 10889 + }, + { + "epoch": 3.3425414364640886, + "grad_norm": 0.3136523365974426, + "learning_rate": 7.763213992658114e-05, + "loss": 1.8623, + "step": 10890 + }, + { + "epoch": 3.3428483732351135, + "grad_norm": 0.36551395058631897, + "learning_rate": 7.762799723487568e-05, + "loss": 1.8474, + "step": 10891 + }, + { + "epoch": 3.343155310006139, + "grad_norm": 0.35772353410720825, + "learning_rate": 7.762385427013419e-05, + "loss": 1.8625, + "step": 10892 + }, + { + "epoch": 3.3434622467771637, + 
"grad_norm": 0.29944708943367004, + "learning_rate": 7.761971103239755e-05, + "loss": 1.8181, + "step": 10893 + }, + { + "epoch": 3.343769183548189, + "grad_norm": 0.3395330309867859, + "learning_rate": 7.761556752170676e-05, + "loss": 1.7943, + "step": 10894 + }, + { + "epoch": 3.3440761203192144, + "grad_norm": 0.3624265193939209, + "learning_rate": 7.761142373810274e-05, + "loss": 1.8234, + "step": 10895 + }, + { + "epoch": 3.3443830570902393, + "grad_norm": 0.25409621000289917, + "learning_rate": 7.760727968162644e-05, + "loss": 1.7532, + "step": 10896 + }, + { + "epoch": 3.3446899938612646, + "grad_norm": 0.321437805891037, + "learning_rate": 7.760313535231883e-05, + "loss": 1.8808, + "step": 10897 + }, + { + "epoch": 3.3449969306322895, + "grad_norm": 0.2919142544269562, + "learning_rate": 7.759899075022086e-05, + "loss": 1.7677, + "step": 10898 + }, + { + "epoch": 3.345303867403315, + "grad_norm": 0.26515716314315796, + "learning_rate": 7.759484587537346e-05, + "loss": 1.8118, + "step": 10899 + }, + { + "epoch": 3.34561080417434, + "grad_norm": 0.2963240146636963, + "learning_rate": 7.759070072781764e-05, + "loss": 1.8329, + "step": 10900 + }, + { + "epoch": 3.345917740945365, + "grad_norm": 0.3186480700969696, + "learning_rate": 7.758655530759435e-05, + "loss": 1.8013, + "step": 10901 + }, + { + "epoch": 3.3462246777163904, + "grad_norm": 0.256145715713501, + "learning_rate": 7.758240961474454e-05, + "loss": 1.7865, + "step": 10902 + }, + { + "epoch": 3.3465316144874158, + "grad_norm": 0.28951629996299744, + "learning_rate": 7.757826364930921e-05, + "loss": 1.8091, + "step": 10903 + }, + { + "epoch": 3.3468385512584407, + "grad_norm": 0.2692483365535736, + "learning_rate": 7.75741174113293e-05, + "loss": 1.8308, + "step": 10904 + }, + { + "epoch": 3.347145488029466, + "grad_norm": 0.27615389227867126, + "learning_rate": 7.75699709008458e-05, + "loss": 1.7888, + "step": 10905 + }, + { + "epoch": 3.3474524248004913, + "grad_norm": 0.2819034457206726, + 
"learning_rate": 7.75658241178997e-05, + "loss": 1.7624, + "step": 10906 + }, + { + "epoch": 3.347759361571516, + "grad_norm": 0.2627592086791992, + "learning_rate": 7.756167706253196e-05, + "loss": 1.7696, + "step": 10907 + }, + { + "epoch": 3.3480662983425415, + "grad_norm": 0.3528621196746826, + "learning_rate": 7.755752973478356e-05, + "loss": 1.7725, + "step": 10908 + }, + { + "epoch": 3.3483732351135664, + "grad_norm": 0.35949698090553284, + "learning_rate": 7.755338213469552e-05, + "loss": 1.8163, + "step": 10909 + }, + { + "epoch": 3.3486801718845918, + "grad_norm": 0.25142577290534973, + "learning_rate": 7.75492342623088e-05, + "loss": 1.7879, + "step": 10910 + }, + { + "epoch": 3.348987108655617, + "grad_norm": 0.25766023993492126, + "learning_rate": 7.75450861176644e-05, + "loss": 1.8143, + "step": 10911 + }, + { + "epoch": 3.349294045426642, + "grad_norm": 0.2736956477165222, + "learning_rate": 7.754093770080331e-05, + "loss": 1.8907, + "step": 10912 + }, + { + "epoch": 3.3496009821976673, + "grad_norm": 0.23700755834579468, + "learning_rate": 7.753678901176654e-05, + "loss": 1.813, + "step": 10913 + }, + { + "epoch": 3.349907918968692, + "grad_norm": 0.245509073138237, + "learning_rate": 7.753264005059507e-05, + "loss": 1.8019, + "step": 10914 + }, + { + "epoch": 3.3502148557397176, + "grad_norm": 0.232910618185997, + "learning_rate": 7.752849081732993e-05, + "loss": 1.784, + "step": 10915 + }, + { + "epoch": 3.350521792510743, + "grad_norm": 0.22989360988140106, + "learning_rate": 7.75243413120121e-05, + "loss": 1.7597, + "step": 10916 + }, + { + "epoch": 3.350828729281768, + "grad_norm": 0.2093925178050995, + "learning_rate": 7.752019153468258e-05, + "loss": 1.7698, + "step": 10917 + }, + { + "epoch": 3.351135666052793, + "grad_norm": 0.25539630651474, + "learning_rate": 7.751604148538241e-05, + "loss": 1.8287, + "step": 10918 + }, + { + "epoch": 3.3514426028238185, + "grad_norm": 0.2731820046901703, + "learning_rate": 7.75118911641526e-05, + "loss": 
1.8862, + "step": 10919 + }, + { + "epoch": 3.3517495395948433, + "grad_norm": 0.2464541345834732, + "learning_rate": 7.750774057103416e-05, + "loss": 1.8165, + "step": 10920 + }, + { + "epoch": 3.3520564763658687, + "grad_norm": 0.26380276679992676, + "learning_rate": 7.75035897060681e-05, + "loss": 1.78, + "step": 10921 + }, + { + "epoch": 3.352363413136894, + "grad_norm": 0.3080748915672302, + "learning_rate": 7.749943856929542e-05, + "loss": 1.7925, + "step": 10922 + }, + { + "epoch": 3.352670349907919, + "grad_norm": 0.317754864692688, + "learning_rate": 7.74952871607572e-05, + "loss": 1.8248, + "step": 10923 + }, + { + "epoch": 3.3529772866789442, + "grad_norm": 0.2525196373462677, + "learning_rate": 7.749113548049442e-05, + "loss": 1.762, + "step": 10924 + }, + { + "epoch": 3.353284223449969, + "grad_norm": 0.3149549961090088, + "learning_rate": 7.748698352854814e-05, + "loss": 1.8289, + "step": 10925 + }, + { + "epoch": 3.3535911602209945, + "grad_norm": 0.35744383931159973, + "learning_rate": 7.748283130495937e-05, + "loss": 1.8132, + "step": 10926 + }, + { + "epoch": 3.35389809699202, + "grad_norm": 0.28599128127098083, + "learning_rate": 7.747867880976916e-05, + "loss": 1.7351, + "step": 10927 + }, + { + "epoch": 3.3542050337630447, + "grad_norm": 0.24428869783878326, + "learning_rate": 7.747452604301852e-05, + "loss": 1.794, + "step": 10928 + }, + { + "epoch": 3.35451197053407, + "grad_norm": 0.29067808389663696, + "learning_rate": 7.747037300474854e-05, + "loss": 1.8181, + "step": 10929 + }, + { + "epoch": 3.354818907305095, + "grad_norm": 0.32417505979537964, + "learning_rate": 7.746621969500021e-05, + "loss": 1.8338, + "step": 10930 + }, + { + "epoch": 3.3551258440761202, + "grad_norm": 0.29536551237106323, + "learning_rate": 7.746206611381462e-05, + "loss": 1.8732, + "step": 10931 + }, + { + "epoch": 3.3554327808471456, + "grad_norm": 0.3169345259666443, + "learning_rate": 7.745791226123278e-05, + "loss": 1.876, + "step": 10932 + }, + { + "epoch": 
3.3557397176181705, + "grad_norm": 0.2680271565914154, + "learning_rate": 7.745375813729576e-05, + "loss": 1.7347, + "step": 10933 + }, + { + "epoch": 3.356046654389196, + "grad_norm": 0.28339266777038574, + "learning_rate": 7.74496037420446e-05, + "loss": 1.8507, + "step": 10934 + }, + { + "epoch": 3.356353591160221, + "grad_norm": 0.2567409574985504, + "learning_rate": 7.744544907552038e-05, + "loss": 1.8244, + "step": 10935 + }, + { + "epoch": 3.356660527931246, + "grad_norm": 0.266063928604126, + "learning_rate": 7.744129413776416e-05, + "loss": 1.7864, + "step": 10936 + }, + { + "epoch": 3.3569674647022714, + "grad_norm": 0.2490999698638916, + "learning_rate": 7.743713892881696e-05, + "loss": 1.7637, + "step": 10937 + }, + { + "epoch": 3.3572744014732967, + "grad_norm": 0.25857025384902954, + "learning_rate": 7.743298344871988e-05, + "loss": 1.8101, + "step": 10938 + }, + { + "epoch": 3.3575813382443216, + "grad_norm": 0.2549006938934326, + "learning_rate": 7.742882769751398e-05, + "loss": 1.7782, + "step": 10939 + }, + { + "epoch": 3.357888275015347, + "grad_norm": 0.23915350437164307, + "learning_rate": 7.742467167524035e-05, + "loss": 1.7822, + "step": 10940 + }, + { + "epoch": 3.358195211786372, + "grad_norm": 0.25501590967178345, + "learning_rate": 7.742051538194e-05, + "loss": 1.798, + "step": 10941 + }, + { + "epoch": 3.358502148557397, + "grad_norm": 0.29332005977630615, + "learning_rate": 7.741635881765408e-05, + "loss": 1.8334, + "step": 10942 + }, + { + "epoch": 3.3588090853284225, + "grad_norm": 0.28878241777420044, + "learning_rate": 7.741220198242362e-05, + "loss": 1.8266, + "step": 10943 + }, + { + "epoch": 3.3591160220994474, + "grad_norm": 0.3068650960922241, + "learning_rate": 7.740804487628971e-05, + "loss": 1.8562, + "step": 10944 + }, + { + "epoch": 3.3594229588704727, + "grad_norm": 0.2522405683994293, + "learning_rate": 7.740388749929343e-05, + "loss": 1.8001, + "step": 10945 + }, + { + "epoch": 3.359729895641498, + "grad_norm": 
0.3073521554470062, + "learning_rate": 7.739972985147588e-05, + "loss": 1.7454, + "step": 10946 + }, + { + "epoch": 3.360036832412523, + "grad_norm": 0.3018052577972412, + "learning_rate": 7.739557193287815e-05, + "loss": 1.7888, + "step": 10947 + }, + { + "epoch": 3.3603437691835483, + "grad_norm": 0.2738604247570038, + "learning_rate": 7.73914137435413e-05, + "loss": 1.7208, + "step": 10948 + }, + { + "epoch": 3.3606507059545736, + "grad_norm": 0.37699586153030396, + "learning_rate": 7.738725528350646e-05, + "loss": 1.8175, + "step": 10949 + }, + { + "epoch": 3.3609576427255985, + "grad_norm": 0.3479778468608856, + "learning_rate": 7.738309655281471e-05, + "loss": 1.818, + "step": 10950 + }, + { + "epoch": 3.361264579496624, + "grad_norm": 0.24871166050434113, + "learning_rate": 7.737893755150715e-05, + "loss": 1.7046, + "step": 10951 + }, + { + "epoch": 3.3615715162676487, + "grad_norm": 0.45015642046928406, + "learning_rate": 7.737477827962488e-05, + "loss": 1.8517, + "step": 10952 + }, + { + "epoch": 3.361878453038674, + "grad_norm": 0.4149077534675598, + "learning_rate": 7.7370618737209e-05, + "loss": 1.7403, + "step": 10953 + }, + { + "epoch": 3.3621853898096994, + "grad_norm": 0.2556059658527374, + "learning_rate": 7.736645892430064e-05, + "loss": 1.8167, + "step": 10954 + }, + { + "epoch": 3.3624923265807243, + "grad_norm": 0.3153657615184784, + "learning_rate": 7.736229884094088e-05, + "loss": 1.8471, + "step": 10955 + }, + { + "epoch": 3.3627992633517496, + "grad_norm": 0.27943772077560425, + "learning_rate": 7.735813848717084e-05, + "loss": 1.7742, + "step": 10956 + }, + { + "epoch": 3.3631062001227745, + "grad_norm": 0.28270283341407776, + "learning_rate": 7.735397786303164e-05, + "loss": 1.8418, + "step": 10957 + }, + { + "epoch": 3.3634131368938, + "grad_norm": 0.3596261441707611, + "learning_rate": 7.734981696856442e-05, + "loss": 1.8213, + "step": 10958 + }, + { + "epoch": 3.363720073664825, + "grad_norm": 0.3678492307662964, + "learning_rate": 
7.734565580381026e-05, + "loss": 1.806, + "step": 10959 + }, + { + "epoch": 3.36402701043585, + "grad_norm": 0.27758681774139404, + "learning_rate": 7.734149436881031e-05, + "loss": 1.7832, + "step": 10960 + }, + { + "epoch": 3.3643339472068754, + "grad_norm": 0.2821379005908966, + "learning_rate": 7.733733266360568e-05, + "loss": 1.8888, + "step": 10961 + }, + { + "epoch": 3.3646408839779007, + "grad_norm": 0.33676958084106445, + "learning_rate": 7.733317068823751e-05, + "loss": 1.902, + "step": 10962 + }, + { + "epoch": 3.3649478207489256, + "grad_norm": 0.3116114139556885, + "learning_rate": 7.732900844274691e-05, + "loss": 1.8228, + "step": 10963 + }, + { + "epoch": 3.365254757519951, + "grad_norm": 0.3286324143409729, + "learning_rate": 7.732484592717506e-05, + "loss": 1.8707, + "step": 10964 + }, + { + "epoch": 3.3655616942909763, + "grad_norm": 0.2732192873954773, + "learning_rate": 7.732068314156304e-05, + "loss": 1.773, + "step": 10965 + }, + { + "epoch": 3.365868631062001, + "grad_norm": 0.26663896441459656, + "learning_rate": 7.731652008595204e-05, + "loss": 1.7837, + "step": 10966 + }, + { + "epoch": 3.3661755678330265, + "grad_norm": 0.27447745203971863, + "learning_rate": 7.731235676038317e-05, + "loss": 1.9103, + "step": 10967 + }, + { + "epoch": 3.3664825046040514, + "grad_norm": 0.30832916498184204, + "learning_rate": 7.730819316489757e-05, + "loss": 1.7552, + "step": 10968 + }, + { + "epoch": 3.3667894413750767, + "grad_norm": 0.29657161235809326, + "learning_rate": 7.73040292995364e-05, + "loss": 1.7654, + "step": 10969 + }, + { + "epoch": 3.367096378146102, + "grad_norm": 0.30434274673461914, + "learning_rate": 7.729986516434082e-05, + "loss": 1.8646, + "step": 10970 + }, + { + "epoch": 3.367403314917127, + "grad_norm": 0.25926661491394043, + "learning_rate": 7.729570075935198e-05, + "loss": 1.7555, + "step": 10971 + }, + { + "epoch": 3.3677102516881523, + "grad_norm": 0.2775980532169342, + "learning_rate": 7.729153608461102e-05, + "loss": 
1.8427, + "step": 10972 + }, + { + "epoch": 3.368017188459177, + "grad_norm": 0.23915666341781616, + "learning_rate": 7.72873711401591e-05, + "loss": 1.7902, + "step": 10973 + }, + { + "epoch": 3.3683241252302025, + "grad_norm": 0.2603691518306732, + "learning_rate": 7.728320592603737e-05, + "loss": 1.8587, + "step": 10974 + }, + { + "epoch": 3.368631062001228, + "grad_norm": 0.2579508125782013, + "learning_rate": 7.727904044228703e-05, + "loss": 1.7617, + "step": 10975 + }, + { + "epoch": 3.3689379987722528, + "grad_norm": 0.3384297788143158, + "learning_rate": 7.72748746889492e-05, + "loss": 1.8499, + "step": 10976 + }, + { + "epoch": 3.369244935543278, + "grad_norm": 0.36756646633148193, + "learning_rate": 7.727070866606509e-05, + "loss": 1.808, + "step": 10977 + }, + { + "epoch": 3.3695518723143034, + "grad_norm": 0.3212372958660126, + "learning_rate": 7.726654237367587e-05, + "loss": 1.8245, + "step": 10978 + }, + { + "epoch": 3.3698588090853283, + "grad_norm": 0.23782415688037872, + "learning_rate": 7.726237581182267e-05, + "loss": 1.7629, + "step": 10979 + }, + { + "epoch": 3.3701657458563536, + "grad_norm": 0.2782919108867645, + "learning_rate": 7.725820898054669e-05, + "loss": 1.8, + "step": 10980 + }, + { + "epoch": 3.370472682627379, + "grad_norm": 0.2973455488681793, + "learning_rate": 7.725404187988914e-05, + "loss": 1.7949, + "step": 10981 + }, + { + "epoch": 3.370779619398404, + "grad_norm": 0.2875392735004425, + "learning_rate": 7.724987450989114e-05, + "loss": 1.8019, + "step": 10982 + }, + { + "epoch": 3.371086556169429, + "grad_norm": 0.26133236289024353, + "learning_rate": 7.724570687059394e-05, + "loss": 1.7984, + "step": 10983 + }, + { + "epoch": 3.371393492940454, + "grad_norm": 0.2760173976421356, + "learning_rate": 7.724153896203867e-05, + "loss": 1.8082, + "step": 10984 + }, + { + "epoch": 3.3717004297114794, + "grad_norm": 0.26373061537742615, + "learning_rate": 7.723737078426656e-05, + "loss": 1.8408, + "step": 10985 + }, + { + "epoch": 
3.3720073664825048, + "grad_norm": 0.29425618052482605, + "learning_rate": 7.723320233731879e-05, + "loss": 1.7992, + "step": 10986 + }, + { + "epoch": 3.3723143032535297, + "grad_norm": 0.29822099208831787, + "learning_rate": 7.722903362123655e-05, + "loss": 1.8204, + "step": 10987 + }, + { + "epoch": 3.372621240024555, + "grad_norm": 0.25945618748664856, + "learning_rate": 7.722486463606104e-05, + "loss": 1.7376, + "step": 10988 + }, + { + "epoch": 3.37292817679558, + "grad_norm": 0.26367196440696716, + "learning_rate": 7.722069538183345e-05, + "loss": 1.814, + "step": 10989 + }, + { + "epoch": 3.373235113566605, + "grad_norm": 0.25015249848365784, + "learning_rate": 7.7216525858595e-05, + "loss": 1.8199, + "step": 10990 + }, + { + "epoch": 3.3735420503376305, + "grad_norm": 0.3035781681537628, + "learning_rate": 7.72123560663869e-05, + "loss": 1.739, + "step": 10991 + }, + { + "epoch": 3.3738489871086554, + "grad_norm": 0.2847912013530731, + "learning_rate": 7.720818600525033e-05, + "loss": 1.8754, + "step": 10992 + }, + { + "epoch": 3.3741559238796808, + "grad_norm": 0.2533976435661316, + "learning_rate": 7.720401567522653e-05, + "loss": 1.7616, + "step": 10993 + }, + { + "epoch": 3.374462860650706, + "grad_norm": 0.250828355550766, + "learning_rate": 7.719984507635669e-05, + "loss": 1.7973, + "step": 10994 + }, + { + "epoch": 3.374769797421731, + "grad_norm": 0.3019898235797882, + "learning_rate": 7.719567420868206e-05, + "loss": 1.7563, + "step": 10995 + }, + { + "epoch": 3.3750767341927563, + "grad_norm": 0.2703310549259186, + "learning_rate": 7.719150307224382e-05, + "loss": 1.8183, + "step": 10996 + }, + { + "epoch": 3.3753836709637817, + "grad_norm": 0.2434745579957962, + "learning_rate": 7.718733166708321e-05, + "loss": 1.7913, + "step": 10997 + }, + { + "epoch": 3.3756906077348066, + "grad_norm": 0.28036773204803467, + "learning_rate": 7.718315999324146e-05, + "loss": 1.7884, + "step": 10998 + }, + { + "epoch": 3.375997544505832, + "grad_norm": 
0.25123077630996704, + "learning_rate": 7.717898805075978e-05, + "loss": 1.7394, + "step": 10999 + }, + { + "epoch": 3.376304481276857, + "grad_norm": 0.2313947230577469, + "learning_rate": 7.717481583967943e-05, + "loss": 1.7537, + "step": 11000 + }, + { + "epoch": 3.376611418047882, + "grad_norm": 0.27152860164642334, + "learning_rate": 7.71706433600416e-05, + "loss": 1.8596, + "step": 11001 + }, + { + "epoch": 3.3769183548189075, + "grad_norm": 0.32866382598876953, + "learning_rate": 7.716647061188757e-05, + "loss": 1.9007, + "step": 11002 + }, + { + "epoch": 3.3772252915899323, + "grad_norm": 0.2842368185520172, + "learning_rate": 7.716229759525854e-05, + "loss": 1.7781, + "step": 11003 + }, + { + "epoch": 3.3775322283609577, + "grad_norm": 0.30411216616630554, + "learning_rate": 7.715812431019576e-05, + "loss": 1.7403, + "step": 11004 + }, + { + "epoch": 3.3778391651319826, + "grad_norm": 0.31848132610321045, + "learning_rate": 7.71539507567405e-05, + "loss": 1.817, + "step": 11005 + }, + { + "epoch": 3.378146101903008, + "grad_norm": 0.24206148087978363, + "learning_rate": 7.714977693493397e-05, + "loss": 1.7796, + "step": 11006 + }, + { + "epoch": 3.3784530386740332, + "grad_norm": 0.2982998490333557, + "learning_rate": 7.714560284481742e-05, + "loss": 1.7883, + "step": 11007 + }, + { + "epoch": 3.378759975445058, + "grad_norm": 0.24857483804225922, + "learning_rate": 7.714142848643213e-05, + "loss": 1.7447, + "step": 11008 + }, + { + "epoch": 3.3790669122160835, + "grad_norm": 0.2509039044380188, + "learning_rate": 7.713725385981932e-05, + "loss": 1.8362, + "step": 11009 + }, + { + "epoch": 3.379373848987109, + "grad_norm": 0.2759779095649719, + "learning_rate": 7.713307896502027e-05, + "loss": 1.8655, + "step": 11010 + }, + { + "epoch": 3.3796807857581337, + "grad_norm": 0.264776349067688, + "learning_rate": 7.712890380207623e-05, + "loss": 1.8221, + "step": 11011 + }, + { + "epoch": 3.379987722529159, + "grad_norm": 0.2771971821784973, + "learning_rate": 
7.712472837102846e-05, + "loss": 1.6992, + "step": 11012 + }, + { + "epoch": 3.3802946593001844, + "grad_norm": 0.2749316096305847, + "learning_rate": 7.712055267191822e-05, + "loss": 1.8128, + "step": 11013 + }, + { + "epoch": 3.3806015960712092, + "grad_norm": 0.256656289100647, + "learning_rate": 7.71163767047868e-05, + "loss": 1.8382, + "step": 11014 + }, + { + "epoch": 3.3809085328422346, + "grad_norm": 0.27646976709365845, + "learning_rate": 7.711220046967545e-05, + "loss": 1.8321, + "step": 11015 + }, + { + "epoch": 3.3812154696132595, + "grad_norm": 0.3083149194717407, + "learning_rate": 7.710802396662542e-05, + "loss": 1.904, + "step": 11016 + }, + { + "epoch": 3.381522406384285, + "grad_norm": 0.2750856280326843, + "learning_rate": 7.710384719567803e-05, + "loss": 1.7596, + "step": 11017 + }, + { + "epoch": 3.38182934315531, + "grad_norm": 0.3029455244541168, + "learning_rate": 7.709967015687452e-05, + "loss": 1.8542, + "step": 11018 + }, + { + "epoch": 3.382136279926335, + "grad_norm": 0.3144093453884125, + "learning_rate": 7.709549285025622e-05, + "loss": 1.7489, + "step": 11019 + }, + { + "epoch": 3.3824432166973604, + "grad_norm": 0.2675442099571228, + "learning_rate": 7.709131527586433e-05, + "loss": 1.7324, + "step": 11020 + }, + { + "epoch": 3.3827501534683857, + "grad_norm": 0.2906095087528229, + "learning_rate": 7.708713743374021e-05, + "loss": 1.7848, + "step": 11021 + }, + { + "epoch": 3.3830570902394106, + "grad_norm": 0.25141623616218567, + "learning_rate": 7.708295932392513e-05, + "loss": 1.7423, + "step": 11022 + }, + { + "epoch": 3.383364027010436, + "grad_norm": 0.25832003355026245, + "learning_rate": 7.707878094646037e-05, + "loss": 1.7792, + "step": 11023 + }, + { + "epoch": 3.3836709637814613, + "grad_norm": 0.23710070550441742, + "learning_rate": 7.70746023013872e-05, + "loss": 1.7916, + "step": 11024 + }, + { + "epoch": 3.383977900552486, + "grad_norm": 0.286735862493515, + "learning_rate": 7.707042338874697e-05, + "loss": 1.8272, + 
"step": 11025 + }, + { + "epoch": 3.3842848373235115, + "grad_norm": 0.2536577582359314, + "learning_rate": 7.706624420858094e-05, + "loss": 1.7839, + "step": 11026 + }, + { + "epoch": 3.3845917740945364, + "grad_norm": 0.5564702749252319, + "learning_rate": 7.706206476093043e-05, + "loss": 1.7832, + "step": 11027 + }, + { + "epoch": 3.3848987108655617, + "grad_norm": 0.34694772958755493, + "learning_rate": 7.705788504583671e-05, + "loss": 1.8668, + "step": 11028 + }, + { + "epoch": 3.385205647636587, + "grad_norm": 0.30388176441192627, + "learning_rate": 7.705370506334113e-05, + "loss": 1.8244, + "step": 11029 + }, + { + "epoch": 3.385512584407612, + "grad_norm": 0.2998919188976288, + "learning_rate": 7.704952481348497e-05, + "loss": 1.7927, + "step": 11030 + }, + { + "epoch": 3.3858195211786373, + "grad_norm": 0.2714936435222626, + "learning_rate": 7.704534429630955e-05, + "loss": 1.8757, + "step": 11031 + }, + { + "epoch": 3.386126457949662, + "grad_norm": 0.26670241355895996, + "learning_rate": 7.704116351185619e-05, + "loss": 1.8146, + "step": 11032 + }, + { + "epoch": 3.3864333947206875, + "grad_norm": 0.2500552833080292, + "learning_rate": 7.703698246016621e-05, + "loss": 1.7984, + "step": 11033 + }, + { + "epoch": 3.386740331491713, + "grad_norm": 0.2494918406009674, + "learning_rate": 7.703280114128091e-05, + "loss": 1.7433, + "step": 11034 + }, + { + "epoch": 3.3870472682627377, + "grad_norm": 0.25658491253852844, + "learning_rate": 7.702861955524163e-05, + "loss": 1.8487, + "step": 11035 + }, + { + "epoch": 3.387354205033763, + "grad_norm": 0.2871410548686981, + "learning_rate": 7.702443770208969e-05, + "loss": 1.7919, + "step": 11036 + }, + { + "epoch": 3.3876611418047884, + "grad_norm": 0.3347938060760498, + "learning_rate": 7.702025558186643e-05, + "loss": 1.8091, + "step": 11037 + }, + { + "epoch": 3.3879680785758133, + "grad_norm": 0.39016643166542053, + "learning_rate": 7.701607319461315e-05, + "loss": 1.7816, + "step": 11038 + }, + { + "epoch": 
3.3882750153468386, + "grad_norm": 0.3423028290271759, + "learning_rate": 7.701189054037121e-05, + "loss": 1.8454, + "step": 11039 + }, + { + "epoch": 3.388581952117864, + "grad_norm": 0.27592089772224426, + "learning_rate": 7.700770761918192e-05, + "loss": 1.8431, + "step": 11040 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.46047264337539673, + "learning_rate": 7.700352443108665e-05, + "loss": 1.8412, + "step": 11041 + }, + { + "epoch": 3.389195825659914, + "grad_norm": 0.49226754903793335, + "learning_rate": 7.699934097612673e-05, + "loss": 1.8212, + "step": 11042 + }, + { + "epoch": 3.389502762430939, + "grad_norm": 0.3958778381347656, + "learning_rate": 7.699515725434348e-05, + "loss": 1.747, + "step": 11043 + }, + { + "epoch": 3.3898096992019644, + "grad_norm": 0.26097169518470764, + "learning_rate": 7.699097326577827e-05, + "loss": 1.7631, + "step": 11044 + }, + { + "epoch": 3.3901166359729897, + "grad_norm": 0.2922612130641937, + "learning_rate": 7.698678901047245e-05, + "loss": 1.7891, + "step": 11045 + }, + { + "epoch": 3.3904235727440146, + "grad_norm": 0.4195055365562439, + "learning_rate": 7.698260448846734e-05, + "loss": 1.7765, + "step": 11046 + }, + { + "epoch": 3.39073050951504, + "grad_norm": 0.4572988450527191, + "learning_rate": 7.697841969980434e-05, + "loss": 1.8085, + "step": 11047 + }, + { + "epoch": 3.391037446286065, + "grad_norm": 0.38819587230682373, + "learning_rate": 7.697423464452478e-05, + "loss": 1.8854, + "step": 11048 + }, + { + "epoch": 3.39134438305709, + "grad_norm": 0.27421653270721436, + "learning_rate": 7.697004932267003e-05, + "loss": 1.8327, + "step": 11049 + }, + { + "epoch": 3.3916513198281155, + "grad_norm": 0.33559146523475647, + "learning_rate": 7.696586373428142e-05, + "loss": 1.8109, + "step": 11050 + }, + { + "epoch": 3.3919582565991404, + "grad_norm": 0.39438655972480774, + "learning_rate": 7.696167787940037e-05, + "loss": 1.7909, + "step": 11051 + }, + { + "epoch": 3.3922651933701657, + "grad_norm": 
0.3425842523574829, + "learning_rate": 7.695749175806819e-05, + "loss": 1.8571, + "step": 11052 + }, + { + "epoch": 3.392572130141191, + "grad_norm": 0.2860080301761627, + "learning_rate": 7.695330537032628e-05, + "loss": 1.8546, + "step": 11053 + }, + { + "epoch": 3.392879066912216, + "grad_norm": 0.35894665122032166, + "learning_rate": 7.694911871621601e-05, + "loss": 1.7895, + "step": 11054 + }, + { + "epoch": 3.3931860036832413, + "grad_norm": 0.351193904876709, + "learning_rate": 7.694493179577879e-05, + "loss": 1.7453, + "step": 11055 + }, + { + "epoch": 3.3934929404542666, + "grad_norm": 0.24812865257263184, + "learning_rate": 7.694074460905592e-05, + "loss": 1.8131, + "step": 11056 + }, + { + "epoch": 3.3937998772252915, + "grad_norm": 0.38620972633361816, + "learning_rate": 7.693655715608883e-05, + "loss": 1.8346, + "step": 11057 + }, + { + "epoch": 3.394106813996317, + "grad_norm": 0.5005692839622498, + "learning_rate": 7.69323694369189e-05, + "loss": 1.9031, + "step": 11058 + }, + { + "epoch": 3.3944137507673418, + "grad_norm": 0.4321887791156769, + "learning_rate": 7.692818145158751e-05, + "loss": 1.8783, + "step": 11059 + }, + { + "epoch": 3.394720687538367, + "grad_norm": 0.269307017326355, + "learning_rate": 7.692399320013603e-05, + "loss": 1.8075, + "step": 11060 + }, + { + "epoch": 3.3950276243093924, + "grad_norm": 0.2945556342601776, + "learning_rate": 7.69198046826059e-05, + "loss": 1.8366, + "step": 11061 + }, + { + "epoch": 3.3953345610804173, + "grad_norm": 0.30531853437423706, + "learning_rate": 7.691561589903847e-05, + "loss": 1.7665, + "step": 11062 + }, + { + "epoch": 3.3956414978514426, + "grad_norm": 0.25105199217796326, + "learning_rate": 7.691142684947513e-05, + "loss": 1.782, + "step": 11063 + }, + { + "epoch": 3.3959484346224675, + "grad_norm": 0.3373202085494995, + "learning_rate": 7.69072375339573e-05, + "loss": 1.8148, + "step": 11064 + }, + { + "epoch": 3.396255371393493, + "grad_norm": 0.34207093715667725, + "learning_rate": 
7.690304795252638e-05, + "loss": 1.8287, + "step": 11065 + }, + { + "epoch": 3.396562308164518, + "grad_norm": 0.26281681656837463, + "learning_rate": 7.68988581052238e-05, + "loss": 1.8551, + "step": 11066 + }, + { + "epoch": 3.396869244935543, + "grad_norm": 0.3091152608394623, + "learning_rate": 7.689466799209091e-05, + "loss": 1.7689, + "step": 11067 + }, + { + "epoch": 3.3971761817065684, + "grad_norm": 0.37421298027038574, + "learning_rate": 7.689047761316914e-05, + "loss": 1.7908, + "step": 11068 + }, + { + "epoch": 3.3974831184775938, + "grad_norm": 0.3745511770248413, + "learning_rate": 7.688628696849993e-05, + "loss": 1.8408, + "step": 11069 + }, + { + "epoch": 3.3977900552486187, + "grad_norm": 0.3003663122653961, + "learning_rate": 7.688209605812467e-05, + "loss": 1.9109, + "step": 11070 + }, + { + "epoch": 3.398096992019644, + "grad_norm": 0.3437681496143341, + "learning_rate": 7.687790488208478e-05, + "loss": 1.811, + "step": 11071 + }, + { + "epoch": 3.3984039287906693, + "grad_norm": 0.3480641841888428, + "learning_rate": 7.687371344042168e-05, + "loss": 1.8114, + "step": 11072 + }, + { + "epoch": 3.398710865561694, + "grad_norm": 0.24670913815498352, + "learning_rate": 7.686952173317679e-05, + "loss": 1.7959, + "step": 11073 + }, + { + "epoch": 3.3990178023327196, + "grad_norm": 0.2939499020576477, + "learning_rate": 7.686532976039154e-05, + "loss": 1.7518, + "step": 11074 + }, + { + "epoch": 3.3993247391037444, + "grad_norm": 0.3332279622554779, + "learning_rate": 7.686113752210736e-05, + "loss": 1.843, + "step": 11075 + }, + { + "epoch": 3.3996316758747698, + "grad_norm": 0.22967280447483063, + "learning_rate": 7.685694501836566e-05, + "loss": 1.7408, + "step": 11076 + }, + { + "epoch": 3.399938612645795, + "grad_norm": 0.3443470001220703, + "learning_rate": 7.685275224920789e-05, + "loss": 1.8004, + "step": 11077 + }, + { + "epoch": 3.40024554941682, + "grad_norm": 0.3725457489490509, + "learning_rate": 7.684855921467548e-05, + "loss": 1.833, + 
"step": 11078 + }, + { + "epoch": 3.4005524861878453, + "grad_norm": 0.3178638219833374, + "learning_rate": 7.68443659148099e-05, + "loss": 1.8055, + "step": 11079 + }, + { + "epoch": 3.4008594229588702, + "grad_norm": 0.2609167695045471, + "learning_rate": 7.684017234965254e-05, + "loss": 1.7881, + "step": 11080 + }, + { + "epoch": 3.4011663597298956, + "grad_norm": 0.26975762844085693, + "learning_rate": 7.683597851924486e-05, + "loss": 1.8424, + "step": 11081 + }, + { + "epoch": 3.401473296500921, + "grad_norm": 0.266661673784256, + "learning_rate": 7.683178442362832e-05, + "loss": 1.7785, + "step": 11082 + }, + { + "epoch": 3.401780233271946, + "grad_norm": 0.27915671467781067, + "learning_rate": 7.682759006284436e-05, + "loss": 1.8241, + "step": 11083 + }, + { + "epoch": 3.402087170042971, + "grad_norm": 0.25167274475097656, + "learning_rate": 7.682339543693444e-05, + "loss": 1.7637, + "step": 11084 + }, + { + "epoch": 3.4023941068139965, + "grad_norm": 0.2439529299736023, + "learning_rate": 7.681920054593999e-05, + "loss": 1.7796, + "step": 11085 + }, + { + "epoch": 3.4027010435850213, + "grad_norm": 0.26224252581596375, + "learning_rate": 7.681500538990249e-05, + "loss": 1.8018, + "step": 11086 + }, + { + "epoch": 3.4030079803560467, + "grad_norm": 0.25093868374824524, + "learning_rate": 7.681080996886336e-05, + "loss": 1.7664, + "step": 11087 + }, + { + "epoch": 3.403314917127072, + "grad_norm": 0.26393210887908936, + "learning_rate": 7.680661428286413e-05, + "loss": 1.8389, + "step": 11088 + }, + { + "epoch": 3.403621853898097, + "grad_norm": 0.24750283360481262, + "learning_rate": 7.680241833194622e-05, + "loss": 1.8358, + "step": 11089 + }, + { + "epoch": 3.4039287906691222, + "grad_norm": 0.21568982303142548, + "learning_rate": 7.67982221161511e-05, + "loss": 1.7874, + "step": 11090 + }, + { + "epoch": 3.404235727440147, + "grad_norm": 0.24407126009464264, + "learning_rate": 7.679402563552023e-05, + "loss": 1.7753, + "step": 11091 + }, + { + "epoch": 
3.4045426642111725, + "grad_norm": 0.23288260400295258, + "learning_rate": 7.67898288900951e-05, + "loss": 1.8046, + "step": 11092 + }, + { + "epoch": 3.404849600982198, + "grad_norm": 0.2548544108867645, + "learning_rate": 7.678563187991718e-05, + "loss": 1.8778, + "step": 11093 + }, + { + "epoch": 3.4051565377532227, + "grad_norm": 0.24008090794086456, + "learning_rate": 7.678143460502796e-05, + "loss": 1.7912, + "step": 11094 + }, + { + "epoch": 3.405463474524248, + "grad_norm": 0.26085031032562256, + "learning_rate": 7.677723706546889e-05, + "loss": 1.849, + "step": 11095 + }, + { + "epoch": 3.4057704112952734, + "grad_norm": 0.2830932140350342, + "learning_rate": 7.677303926128147e-05, + "loss": 1.8265, + "step": 11096 + }, + { + "epoch": 3.4060773480662982, + "grad_norm": 0.27593597769737244, + "learning_rate": 7.676884119250718e-05, + "loss": 1.8555, + "step": 11097 + }, + { + "epoch": 3.4063842848373236, + "grad_norm": 0.2403372824192047, + "learning_rate": 7.676464285918751e-05, + "loss": 1.7243, + "step": 11098 + }, + { + "epoch": 3.406691221608349, + "grad_norm": 0.28830090165138245, + "learning_rate": 7.676044426136397e-05, + "loss": 1.8108, + "step": 11099 + }, + { + "epoch": 3.406998158379374, + "grad_norm": 0.2918153405189514, + "learning_rate": 7.675624539907802e-05, + "loss": 1.7875, + "step": 11100 + }, + { + "epoch": 3.407305095150399, + "grad_norm": 0.2609013020992279, + "learning_rate": 7.675204627237117e-05, + "loss": 1.778, + "step": 11101 + }, + { + "epoch": 3.407612031921424, + "grad_norm": 0.2714763283729553, + "learning_rate": 7.674784688128494e-05, + "loss": 1.8472, + "step": 11102 + }, + { + "epoch": 3.4079189686924494, + "grad_norm": 0.25857117772102356, + "learning_rate": 7.674364722586078e-05, + "loss": 1.7495, + "step": 11103 + }, + { + "epoch": 3.4082259054634747, + "grad_norm": 0.25485143065452576, + "learning_rate": 7.673944730614023e-05, + "loss": 1.7817, + "step": 11104 + }, + { + "epoch": 3.4085328422344996, + "grad_norm": 
0.2735857665538788, + "learning_rate": 7.67352471221648e-05, + "loss": 1.7522, + "step": 11105 + }, + { + "epoch": 3.408839779005525, + "grad_norm": 0.25079572200775146, + "learning_rate": 7.6731046673976e-05, + "loss": 1.765, + "step": 11106 + }, + { + "epoch": 3.40914671577655, + "grad_norm": 0.3080148696899414, + "learning_rate": 7.672684596161532e-05, + "loss": 1.8305, + "step": 11107 + }, + { + "epoch": 3.409453652547575, + "grad_norm": 0.23771968483924866, + "learning_rate": 7.672264498512427e-05, + "loss": 1.7837, + "step": 11108 + }, + { + "epoch": 3.4097605893186005, + "grad_norm": 0.29941999912261963, + "learning_rate": 7.671844374454437e-05, + "loss": 1.8013, + "step": 11109 + }, + { + "epoch": 3.4100675260896254, + "grad_norm": 0.27871644496917725, + "learning_rate": 7.671424223991717e-05, + "loss": 1.8598, + "step": 11110 + }, + { + "epoch": 3.4103744628606507, + "grad_norm": 0.2751443684101105, + "learning_rate": 7.671004047128416e-05, + "loss": 1.8341, + "step": 11111 + }, + { + "epoch": 3.410681399631676, + "grad_norm": 0.27227312326431274, + "learning_rate": 7.670583843868688e-05, + "loss": 1.81, + "step": 11112 + }, + { + "epoch": 3.410988336402701, + "grad_norm": 0.29617756605148315, + "learning_rate": 7.670163614216685e-05, + "loss": 1.8795, + "step": 11113 + }, + { + "epoch": 3.4112952731737263, + "grad_norm": 0.268920361995697, + "learning_rate": 7.669743358176563e-05, + "loss": 1.7659, + "step": 11114 + }, + { + "epoch": 3.4116022099447516, + "grad_norm": 0.2875109314918518, + "learning_rate": 7.669323075752467e-05, + "loss": 1.8263, + "step": 11115 + }, + { + "epoch": 3.4119091467157765, + "grad_norm": 0.34703585505485535, + "learning_rate": 7.668902766948558e-05, + "loss": 1.7622, + "step": 11116 + }, + { + "epoch": 3.412216083486802, + "grad_norm": 0.3090265393257141, + "learning_rate": 7.668482431768989e-05, + "loss": 1.7381, + "step": 11117 + }, + { + "epoch": 3.4125230202578267, + "grad_norm": 0.2619737684726715, + "learning_rate": 
7.668062070217911e-05, + "loss": 1.8004, + "step": 11118 + }, + { + "epoch": 3.412829957028852, + "grad_norm": 0.289815217256546, + "learning_rate": 7.667641682299482e-05, + "loss": 1.7946, + "step": 11119 + }, + { + "epoch": 3.4131368937998774, + "grad_norm": 0.28732073307037354, + "learning_rate": 7.667221268017852e-05, + "loss": 1.8746, + "step": 11120 + }, + { + "epoch": 3.4134438305709023, + "grad_norm": 0.23232576251029968, + "learning_rate": 7.666800827377178e-05, + "loss": 1.7403, + "step": 11121 + }, + { + "epoch": 3.4137507673419276, + "grad_norm": 0.22903507947921753, + "learning_rate": 7.666380360381616e-05, + "loss": 1.7785, + "step": 11122 + }, + { + "epoch": 3.4140577041129525, + "grad_norm": 0.25023025274276733, + "learning_rate": 7.665959867035321e-05, + "loss": 1.7881, + "step": 11123 + }, + { + "epoch": 3.414364640883978, + "grad_norm": 0.2199166864156723, + "learning_rate": 7.665539347342449e-05, + "loss": 1.7522, + "step": 11124 + }, + { + "epoch": 3.414671577655003, + "grad_norm": 0.2539862394332886, + "learning_rate": 7.665118801307152e-05, + "loss": 1.7964, + "step": 11125 + }, + { + "epoch": 3.414978514426028, + "grad_norm": 0.22670161724090576, + "learning_rate": 7.664698228933591e-05, + "loss": 1.7071, + "step": 11126 + }, + { + "epoch": 3.4152854511970534, + "grad_norm": 0.24827396869659424, + "learning_rate": 7.664277630225919e-05, + "loss": 1.7897, + "step": 11127 + }, + { + "epoch": 3.4155923879680787, + "grad_norm": 0.29391366243362427, + "learning_rate": 7.663857005188296e-05, + "loss": 1.7967, + "step": 11128 + }, + { + "epoch": 3.4158993247391036, + "grad_norm": 0.3201812505722046, + "learning_rate": 7.663436353824874e-05, + "loss": 1.7681, + "step": 11129 + }, + { + "epoch": 3.416206261510129, + "grad_norm": 0.2274552583694458, + "learning_rate": 7.663015676139814e-05, + "loss": 1.7535, + "step": 11130 + }, + { + "epoch": 3.4165131982811543, + "grad_norm": 0.3955044150352478, + "learning_rate": 7.662594972137273e-05, + "loss": 
1.8175, + "step": 11131 + }, + { + "epoch": 3.416820135052179, + "grad_norm": 0.46493569016456604, + "learning_rate": 7.662174241821406e-05, + "loss": 1.7806, + "step": 11132 + }, + { + "epoch": 3.4171270718232045, + "grad_norm": 0.37731611728668213, + "learning_rate": 7.661753485196375e-05, + "loss": 1.7555, + "step": 11133 + }, + { + "epoch": 3.4174340085942294, + "grad_norm": 0.23983556032180786, + "learning_rate": 7.661332702266334e-05, + "loss": 1.7662, + "step": 11134 + }, + { + "epoch": 3.4177409453652547, + "grad_norm": 0.34964314103126526, + "learning_rate": 7.660911893035445e-05, + "loss": 1.7786, + "step": 11135 + }, + { + "epoch": 3.41804788213628, + "grad_norm": 0.44820764660835266, + "learning_rate": 7.660491057507864e-05, + "loss": 1.778, + "step": 11136 + }, + { + "epoch": 3.418354818907305, + "grad_norm": 0.32936233282089233, + "learning_rate": 7.660070195687752e-05, + "loss": 1.8181, + "step": 11137 + }, + { + "epoch": 3.4186617556783303, + "grad_norm": 0.2874850332736969, + "learning_rate": 7.659649307579266e-05, + "loss": 1.8733, + "step": 11138 + }, + { + "epoch": 3.418968692449355, + "grad_norm": 0.46269866824150085, + "learning_rate": 7.659228393186566e-05, + "loss": 1.8566, + "step": 11139 + }, + { + "epoch": 3.4192756292203805, + "grad_norm": 0.5873839855194092, + "learning_rate": 7.658807452513816e-05, + "loss": 1.8317, + "step": 11140 + }, + { + "epoch": 3.419582565991406, + "grad_norm": 0.43150341510772705, + "learning_rate": 7.65838648556517e-05, + "loss": 1.7702, + "step": 11141 + }, + { + "epoch": 3.4198895027624308, + "grad_norm": 0.2803891599178314, + "learning_rate": 7.65796549234479e-05, + "loss": 1.8043, + "step": 11142 + }, + { + "epoch": 3.420196439533456, + "grad_norm": 0.37295013666152954, + "learning_rate": 7.657544472856838e-05, + "loss": 1.7923, + "step": 11143 + }, + { + "epoch": 3.4205033763044814, + "grad_norm": 0.3922573924064636, + "learning_rate": 7.657123427105473e-05, + "loss": 1.8231, + "step": 11144 + }, + { + 
"epoch": 3.4208103130755063, + "grad_norm": 0.27254152297973633, + "learning_rate": 7.656702355094859e-05, + "loss": 1.8168, + "step": 11145 + }, + { + "epoch": 3.4211172498465316, + "grad_norm": 0.28005337715148926, + "learning_rate": 7.656281256829152e-05, + "loss": 1.8047, + "step": 11146 + }, + { + "epoch": 3.421424186617557, + "grad_norm": 0.4369073808193207, + "learning_rate": 7.655860132312519e-05, + "loss": 1.7243, + "step": 11147 + }, + { + "epoch": 3.421731123388582, + "grad_norm": 0.4127553701400757, + "learning_rate": 7.655438981549119e-05, + "loss": 1.8148, + "step": 11148 + }, + { + "epoch": 3.422038060159607, + "grad_norm": 0.3131798207759857, + "learning_rate": 7.655017804543114e-05, + "loss": 1.789, + "step": 11149 + }, + { + "epoch": 3.422344996930632, + "grad_norm": 0.2947194576263428, + "learning_rate": 7.654596601298666e-05, + "loss": 1.8221, + "step": 11150 + }, + { + "epoch": 3.4226519337016574, + "grad_norm": 0.3072497546672821, + "learning_rate": 7.654175371819941e-05, + "loss": 1.7747, + "step": 11151 + }, + { + "epoch": 3.4229588704726828, + "grad_norm": 0.29408320784568787, + "learning_rate": 7.653754116111099e-05, + "loss": 1.9009, + "step": 11152 + }, + { + "epoch": 3.4232658072437077, + "grad_norm": 0.2629215717315674, + "learning_rate": 7.653332834176303e-05, + "loss": 1.7354, + "step": 11153 + }, + { + "epoch": 3.423572744014733, + "grad_norm": 0.2850257456302643, + "learning_rate": 7.652911526019716e-05, + "loss": 1.8422, + "step": 11154 + }, + { + "epoch": 3.423879680785758, + "grad_norm": 0.29787111282348633, + "learning_rate": 7.652490191645503e-05, + "loss": 1.8122, + "step": 11155 + }, + { + "epoch": 3.424186617556783, + "grad_norm": 0.2670947015285492, + "learning_rate": 7.652068831057826e-05, + "loss": 1.7734, + "step": 11156 + }, + { + "epoch": 3.4244935543278086, + "grad_norm": 0.26415133476257324, + "learning_rate": 7.651647444260853e-05, + "loss": 1.7661, + "step": 11157 + }, + { + "epoch": 3.424800491098834, + 
"grad_norm": 0.2614886164665222, + "learning_rate": 7.651226031258745e-05, + "loss": 1.6918, + "step": 11158 + }, + { + "epoch": 3.425107427869859, + "grad_norm": 0.28485649824142456, + "learning_rate": 7.650804592055667e-05, + "loss": 1.7771, + "step": 11159 + }, + { + "epoch": 3.425414364640884, + "grad_norm": 0.26080289483070374, + "learning_rate": 7.650383126655784e-05, + "loss": 1.7637, + "step": 11160 + }, + { + "epoch": 3.425721301411909, + "grad_norm": 0.2503695487976074, + "learning_rate": 7.649961635063261e-05, + "loss": 1.7864, + "step": 11161 + }, + { + "epoch": 3.4260282381829343, + "grad_norm": 0.3165570795536041, + "learning_rate": 7.649540117282263e-05, + "loss": 1.8107, + "step": 11162 + }, + { + "epoch": 3.4263351749539597, + "grad_norm": 0.28411731123924255, + "learning_rate": 7.649118573316959e-05, + "loss": 1.7557, + "step": 11163 + }, + { + "epoch": 3.4266421117249846, + "grad_norm": 0.24469570815563202, + "learning_rate": 7.648697003171512e-05, + "loss": 1.7597, + "step": 11164 + }, + { + "epoch": 3.42694904849601, + "grad_norm": 0.31968292593955994, + "learning_rate": 7.648275406850087e-05, + "loss": 1.7796, + "step": 11165 + }, + { + "epoch": 3.427255985267035, + "grad_norm": 0.24520765244960785, + "learning_rate": 7.647853784356856e-05, + "loss": 1.7931, + "step": 11166 + }, + { + "epoch": 3.42756292203806, + "grad_norm": 0.23946821689605713, + "learning_rate": 7.647432135695977e-05, + "loss": 1.7143, + "step": 11167 + }, + { + "epoch": 3.4278698588090855, + "grad_norm": 0.321455180644989, + "learning_rate": 7.647010460871624e-05, + "loss": 1.8682, + "step": 11168 + }, + { + "epoch": 3.4281767955801103, + "grad_norm": 0.2803197503089905, + "learning_rate": 7.646588759887964e-05, + "loss": 1.8, + "step": 11169 + }, + { + "epoch": 3.4284837323511357, + "grad_norm": 0.2597559988498688, + "learning_rate": 7.64616703274916e-05, + "loss": 1.8027, + "step": 11170 + }, + { + "epoch": 3.428790669122161, + "grad_norm": 0.25055503845214844, + 
"learning_rate": 7.645745279459384e-05, + "loss": 1.7659, + "step": 11171 + }, + { + "epoch": 3.429097605893186, + "grad_norm": 0.34582629799842834, + "learning_rate": 7.645323500022803e-05, + "loss": 1.7868, + "step": 11172 + }, + { + "epoch": 3.4294045426642112, + "grad_norm": 0.32845041155815125, + "learning_rate": 7.644901694443584e-05, + "loss": 1.8247, + "step": 11173 + }, + { + "epoch": 3.4297114794352366, + "grad_norm": 0.2570398449897766, + "learning_rate": 7.644479862725896e-05, + "loss": 1.7802, + "step": 11174 + }, + { + "epoch": 3.4300184162062615, + "grad_norm": 0.23117294907569885, + "learning_rate": 7.644058004873908e-05, + "loss": 1.7575, + "step": 11175 + }, + { + "epoch": 3.430325352977287, + "grad_norm": 0.2417830377817154, + "learning_rate": 7.64363612089179e-05, + "loss": 1.7954, + "step": 11176 + }, + { + "epoch": 3.4306322897483117, + "grad_norm": 0.249378964304924, + "learning_rate": 7.643214210783708e-05, + "loss": 1.8161, + "step": 11177 + }, + { + "epoch": 3.430939226519337, + "grad_norm": 0.24494746327400208, + "learning_rate": 7.642792274553836e-05, + "loss": 1.825, + "step": 11178 + }, + { + "epoch": 3.4312461632903624, + "grad_norm": 0.2663760185241699, + "learning_rate": 7.642370312206342e-05, + "loss": 1.7589, + "step": 11179 + }, + { + "epoch": 3.4315531000613873, + "grad_norm": 0.2819322645664215, + "learning_rate": 7.641948323745395e-05, + "loss": 1.8097, + "step": 11180 + }, + { + "epoch": 3.4318600368324126, + "grad_norm": 0.26917630434036255, + "learning_rate": 7.641526309175166e-05, + "loss": 1.7934, + "step": 11181 + }, + { + "epoch": 3.4321669736034375, + "grad_norm": 0.31618112325668335, + "learning_rate": 7.641104268499826e-05, + "loss": 1.8522, + "step": 11182 + }, + { + "epoch": 3.432473910374463, + "grad_norm": 0.29209139943122864, + "learning_rate": 7.640682201723546e-05, + "loss": 1.7499, + "step": 11183 + }, + { + "epoch": 3.432780847145488, + "grad_norm": 0.24831914901733398, + "learning_rate": 
7.640260108850496e-05, + "loss": 1.7897, + "step": 11184 + }, + { + "epoch": 3.433087783916513, + "grad_norm": 0.2459818720817566, + "learning_rate": 7.639837989884849e-05, + "loss": 1.7604, + "step": 11185 + }, + { + "epoch": 3.4333947206875384, + "grad_norm": 0.27157485485076904, + "learning_rate": 7.639415844830774e-05, + "loss": 1.7776, + "step": 11186 + }, + { + "epoch": 3.4337016574585637, + "grad_norm": 0.3021515905857086, + "learning_rate": 7.638993673692445e-05, + "loss": 1.7771, + "step": 11187 + }, + { + "epoch": 3.4340085942295886, + "grad_norm": 0.2591722309589386, + "learning_rate": 7.638571476474036e-05, + "loss": 1.8333, + "step": 11188 + }, + { + "epoch": 3.434315531000614, + "grad_norm": 0.2255258709192276, + "learning_rate": 7.638149253179717e-05, + "loss": 1.7647, + "step": 11189 + }, + { + "epoch": 3.4346224677716393, + "grad_norm": 0.2585793733596802, + "learning_rate": 7.637727003813658e-05, + "loss": 1.786, + "step": 11190 + }, + { + "epoch": 3.434929404542664, + "grad_norm": 0.23649543523788452, + "learning_rate": 7.637304728380036e-05, + "loss": 1.822, + "step": 11191 + }, + { + "epoch": 3.4352363413136895, + "grad_norm": 0.2610832452774048, + "learning_rate": 7.636882426883023e-05, + "loss": 1.7925, + "step": 11192 + }, + { + "epoch": 3.4355432780847144, + "grad_norm": 0.26230642199516296, + "learning_rate": 7.636460099326793e-05, + "loss": 1.8169, + "step": 11193 + }, + { + "epoch": 3.4358502148557397, + "grad_norm": 0.2800561189651489, + "learning_rate": 7.636037745715518e-05, + "loss": 1.845, + "step": 11194 + }, + { + "epoch": 3.436157151626765, + "grad_norm": 0.27790409326553345, + "learning_rate": 7.635615366053372e-05, + "loss": 1.8141, + "step": 11195 + }, + { + "epoch": 3.43646408839779, + "grad_norm": 0.2894865870475769, + "learning_rate": 7.635192960344533e-05, + "loss": 1.7916, + "step": 11196 + }, + { + "epoch": 3.4367710251688153, + "grad_norm": 0.22310738265514374, + "learning_rate": 7.634770528593171e-05, + "loss": 1.79, + 
"step": 11197 + }, + { + "epoch": 3.43707796193984, + "grad_norm": 0.2837755084037781, + "learning_rate": 7.634348070803463e-05, + "loss": 1.8763, + "step": 11198 + }, + { + "epoch": 3.4373848987108655, + "grad_norm": 0.32488104701042175, + "learning_rate": 7.633925586979583e-05, + "loss": 1.8331, + "step": 11199 + }, + { + "epoch": 3.437691835481891, + "grad_norm": 0.2708779573440552, + "learning_rate": 7.633503077125706e-05, + "loss": 1.761, + "step": 11200 + }, + { + "epoch": 3.4379987722529157, + "grad_norm": 0.23929642140865326, + "learning_rate": 7.633080541246008e-05, + "loss": 1.8217, + "step": 11201 + }, + { + "epoch": 3.438305709023941, + "grad_norm": 0.3213331997394562, + "learning_rate": 7.632657979344667e-05, + "loss": 1.8375, + "step": 11202 + }, + { + "epoch": 3.4386126457949664, + "grad_norm": 0.38420629501342773, + "learning_rate": 7.632235391425854e-05, + "loss": 1.765, + "step": 11203 + }, + { + "epoch": 3.4389195825659913, + "grad_norm": 0.40466073155403137, + "learning_rate": 7.631812777493749e-05, + "loss": 1.8262, + "step": 11204 + }, + { + "epoch": 3.4392265193370166, + "grad_norm": 0.35904639959335327, + "learning_rate": 7.631390137552527e-05, + "loss": 1.894, + "step": 11205 + }, + { + "epoch": 3.439533456108042, + "grad_norm": 0.28880515694618225, + "learning_rate": 7.630967471606368e-05, + "loss": 1.87, + "step": 11206 + }, + { + "epoch": 3.439840392879067, + "grad_norm": 0.2878882884979248, + "learning_rate": 7.630544779659444e-05, + "loss": 1.7841, + "step": 11207 + }, + { + "epoch": 3.440147329650092, + "grad_norm": 0.36002418398857117, + "learning_rate": 7.630122061715935e-05, + "loss": 1.7318, + "step": 11208 + }, + { + "epoch": 3.440454266421117, + "grad_norm": 0.3304644227027893, + "learning_rate": 7.629699317780019e-05, + "loss": 1.8581, + "step": 11209 + }, + { + "epoch": 3.4407612031921424, + "grad_norm": 0.23396331071853638, + "learning_rate": 7.629276547855872e-05, + "loss": 1.7897, + "step": 11210 + }, + { + "epoch": 
3.4410681399631677, + "grad_norm": 0.34914183616638184, + "learning_rate": 7.628853751947674e-05, + "loss": 1.8531, + "step": 11211 + }, + { + "epoch": 3.4413750767341926, + "grad_norm": 0.3700502812862396, + "learning_rate": 7.6284309300596e-05, + "loss": 1.7884, + "step": 11212 + }, + { + "epoch": 3.441682013505218, + "grad_norm": 0.24606801569461823, + "learning_rate": 7.628008082195835e-05, + "loss": 1.7292, + "step": 11213 + }, + { + "epoch": 3.441988950276243, + "grad_norm": 0.26344993710517883, + "learning_rate": 7.627585208360551e-05, + "loss": 1.7832, + "step": 11214 + }, + { + "epoch": 3.442295887047268, + "grad_norm": 0.4034743010997772, + "learning_rate": 7.62716230855793e-05, + "loss": 1.8164, + "step": 11215 + }, + { + "epoch": 3.4426028238182935, + "grad_norm": 0.4508039355278015, + "learning_rate": 7.626739382792152e-05, + "loss": 1.7855, + "step": 11216 + }, + { + "epoch": 3.4429097605893184, + "grad_norm": 0.2963111400604248, + "learning_rate": 7.626316431067395e-05, + "loss": 1.7995, + "step": 11217 + }, + { + "epoch": 3.4432166973603437, + "grad_norm": 0.35248515009880066, + "learning_rate": 7.625893453387841e-05, + "loss": 1.8761, + "step": 11218 + }, + { + "epoch": 3.443523634131369, + "grad_norm": 0.4032224416732788, + "learning_rate": 7.625470449757668e-05, + "loss": 1.7746, + "step": 11219 + }, + { + "epoch": 3.443830570902394, + "grad_norm": 0.3505195081233978, + "learning_rate": 7.625047420181057e-05, + "loss": 1.851, + "step": 11220 + }, + { + "epoch": 3.4441375076734193, + "grad_norm": 0.288968563079834, + "learning_rate": 7.62462436466219e-05, + "loss": 1.8055, + "step": 11221 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.43141910433769226, + "learning_rate": 7.624201283205246e-05, + "loss": 1.816, + "step": 11222 + }, + { + "epoch": 3.4447513812154695, + "grad_norm": 0.46902137994766235, + "learning_rate": 7.623778175814407e-05, + "loss": 1.8478, + "step": 11223 + }, + { + "epoch": 3.445058317986495, + "grad_norm": 
0.3333328366279602, + "learning_rate": 7.623355042493854e-05, + "loss": 1.7949, + "step": 11224 + }, + { + "epoch": 3.4453652547575198, + "grad_norm": 0.2625340521335602, + "learning_rate": 7.622931883247768e-05, + "loss": 1.745, + "step": 11225 + }, + { + "epoch": 3.445672191528545, + "grad_norm": 0.4565848410129547, + "learning_rate": 7.622508698080333e-05, + "loss": 1.796, + "step": 11226 + }, + { + "epoch": 3.4459791282995704, + "grad_norm": 0.4676518738269806, + "learning_rate": 7.622085486995729e-05, + "loss": 1.8115, + "step": 11227 + }, + { + "epoch": 3.4462860650705953, + "grad_norm": 0.3828938603401184, + "learning_rate": 7.62166224999814e-05, + "loss": 1.8758, + "step": 11228 + }, + { + "epoch": 3.4465930018416207, + "grad_norm": 0.2786383628845215, + "learning_rate": 7.621238987091747e-05, + "loss": 1.7616, + "step": 11229 + }, + { + "epoch": 3.446899938612646, + "grad_norm": 0.4442835748195648, + "learning_rate": 7.620815698280734e-05, + "loss": 1.8342, + "step": 11230 + }, + { + "epoch": 3.447206875383671, + "grad_norm": 0.45760586857795715, + "learning_rate": 7.620392383569286e-05, + "loss": 1.8159, + "step": 11231 + }, + { + "epoch": 3.447513812154696, + "grad_norm": 0.2567009925842285, + "learning_rate": 7.619969042961583e-05, + "loss": 1.774, + "step": 11232 + }, + { + "epoch": 3.4478207489257215, + "grad_norm": 0.3720102310180664, + "learning_rate": 7.619545676461812e-05, + "loss": 1.8366, + "step": 11233 + }, + { + "epoch": 3.4481276856967464, + "grad_norm": 0.36436137557029724, + "learning_rate": 7.619122284074154e-05, + "loss": 1.832, + "step": 11234 + }, + { + "epoch": 3.4484346224677718, + "grad_norm": 0.310310959815979, + "learning_rate": 7.618698865802795e-05, + "loss": 1.9023, + "step": 11235 + }, + { + "epoch": 3.4487415592387967, + "grad_norm": 0.2693026661872864, + "learning_rate": 7.618275421651916e-05, + "loss": 1.7696, + "step": 11236 + }, + { + "epoch": 3.449048496009822, + "grad_norm": 0.2942425608634949, + "learning_rate": 
7.61785195162571e-05, + "loss": 1.822, + "step": 11237 + }, + { + "epoch": 3.4493554327808473, + "grad_norm": 0.22454749047756195, + "learning_rate": 7.617428455728353e-05, + "loss": 1.7011, + "step": 11238 + }, + { + "epoch": 3.449662369551872, + "grad_norm": 0.23345038294792175, + "learning_rate": 7.617004933964035e-05, + "loss": 1.7563, + "step": 11239 + }, + { + "epoch": 3.4499693063228976, + "grad_norm": 0.24990662932395935, + "learning_rate": 7.616581386336941e-05, + "loss": 1.8031, + "step": 11240 + }, + { + "epoch": 3.4502762430939224, + "grad_norm": 0.2919348478317261, + "learning_rate": 7.616157812851254e-05, + "loss": 1.7355, + "step": 11241 + }, + { + "epoch": 3.450583179864948, + "grad_norm": 0.2926909327507019, + "learning_rate": 7.615734213511165e-05, + "loss": 1.8341, + "step": 11242 + }, + { + "epoch": 3.450890116635973, + "grad_norm": 0.24316683411598206, + "learning_rate": 7.615310588320855e-05, + "loss": 1.8154, + "step": 11243 + }, + { + "epoch": 3.451197053406998, + "grad_norm": 0.23154498636722565, + "learning_rate": 7.614886937284513e-05, + "loss": 1.7904, + "step": 11244 + }, + { + "epoch": 3.4515039901780233, + "grad_norm": 0.25973939895629883, + "learning_rate": 7.614463260406327e-05, + "loss": 1.7598, + "step": 11245 + }, + { + "epoch": 3.4518109269490487, + "grad_norm": 0.22110119462013245, + "learning_rate": 7.614039557690482e-05, + "loss": 1.7903, + "step": 11246 + }, + { + "epoch": 3.4521178637200736, + "grad_norm": 0.26184993982315063, + "learning_rate": 7.613615829141165e-05, + "loss": 1.748, + "step": 11247 + }, + { + "epoch": 3.452424800491099, + "grad_norm": 0.26128727197647095, + "learning_rate": 7.613192074762565e-05, + "loss": 1.7786, + "step": 11248 + }, + { + "epoch": 3.4527317372621242, + "grad_norm": 0.23230813443660736, + "learning_rate": 7.612768294558871e-05, + "loss": 1.8114, + "step": 11249 + }, + { + "epoch": 3.453038674033149, + "grad_norm": 0.2686540186405182, + "learning_rate": 7.612344488534268e-05, + "loss": 
1.7311, + "step": 11250 + }, + { + "epoch": 3.4533456108041745, + "grad_norm": 0.25553348660469055, + "learning_rate": 7.611920656692946e-05, + "loss": 1.8468, + "step": 11251 + }, + { + "epoch": 3.4536525475751993, + "grad_norm": 0.2639308273792267, + "learning_rate": 7.611496799039092e-05, + "loss": 1.8292, + "step": 11252 + }, + { + "epoch": 3.4539594843462247, + "grad_norm": 0.2468358874320984, + "learning_rate": 7.611072915576895e-05, + "loss": 1.8173, + "step": 11253 + }, + { + "epoch": 3.45426642111725, + "grad_norm": 0.27236035466194153, + "learning_rate": 7.610649006310549e-05, + "loss": 1.8082, + "step": 11254 + }, + { + "epoch": 3.454573357888275, + "grad_norm": 0.2277914434671402, + "learning_rate": 7.610225071244237e-05, + "loss": 1.7483, + "step": 11255 + }, + { + "epoch": 3.4548802946593002, + "grad_norm": 0.2292868196964264, + "learning_rate": 7.60980111038215e-05, + "loss": 1.7716, + "step": 11256 + }, + { + "epoch": 3.455187231430325, + "grad_norm": 0.22116152942180634, + "learning_rate": 7.60937712372848e-05, + "loss": 1.773, + "step": 11257 + }, + { + "epoch": 3.4554941682013505, + "grad_norm": 0.23238304257392883, + "learning_rate": 7.608953111287416e-05, + "loss": 1.7602, + "step": 11258 + }, + { + "epoch": 3.455801104972376, + "grad_norm": 0.2810615003108978, + "learning_rate": 7.608529073063149e-05, + "loss": 1.8781, + "step": 11259 + }, + { + "epoch": 3.4561080417434007, + "grad_norm": 0.2516821324825287, + "learning_rate": 7.608105009059867e-05, + "loss": 1.835, + "step": 11260 + }, + { + "epoch": 3.456414978514426, + "grad_norm": 0.25698330998420715, + "learning_rate": 7.607680919281763e-05, + "loss": 1.7859, + "step": 11261 + }, + { + "epoch": 3.4567219152854514, + "grad_norm": 0.2597602903842926, + "learning_rate": 7.60725680373303e-05, + "loss": 1.8287, + "step": 11262 + }, + { + "epoch": 3.4570288520564763, + "grad_norm": 0.2564091980457306, + "learning_rate": 7.606832662417855e-05, + "loss": 1.8003, + "step": 11263 + }, + { + 
"epoch": 3.4573357888275016, + "grad_norm": 0.2872684597969055, + "learning_rate": 7.606408495340432e-05, + "loss": 1.8242, + "step": 11264 + }, + { + "epoch": 3.457642725598527, + "grad_norm": 0.27513590455055237, + "learning_rate": 7.605984302504952e-05, + "loss": 1.8605, + "step": 11265 + }, + { + "epoch": 3.457949662369552, + "grad_norm": 0.27768459916114807, + "learning_rate": 7.605560083915609e-05, + "loss": 1.7948, + "step": 11266 + }, + { + "epoch": 3.458256599140577, + "grad_norm": 0.23911382257938385, + "learning_rate": 7.605135839576593e-05, + "loss": 1.7575, + "step": 11267 + }, + { + "epoch": 3.458563535911602, + "grad_norm": 0.26773568987846375, + "learning_rate": 7.604711569492098e-05, + "loss": 1.752, + "step": 11268 + }, + { + "epoch": 3.4588704726826274, + "grad_norm": 0.30079394578933716, + "learning_rate": 7.604287273666316e-05, + "loss": 1.8022, + "step": 11269 + }, + { + "epoch": 3.4591774094536527, + "grad_norm": 0.27393853664398193, + "learning_rate": 7.603862952103441e-05, + "loss": 1.8054, + "step": 11270 + }, + { + "epoch": 3.4594843462246776, + "grad_norm": 0.2794870436191559, + "learning_rate": 7.603438604807667e-05, + "loss": 1.808, + "step": 11271 + }, + { + "epoch": 3.459791282995703, + "grad_norm": 0.26482146978378296, + "learning_rate": 7.603014231783185e-05, + "loss": 1.8696, + "step": 11272 + }, + { + "epoch": 3.460098219766728, + "grad_norm": 0.2755354344844818, + "learning_rate": 7.602589833034192e-05, + "loss": 1.8412, + "step": 11273 + }, + { + "epoch": 3.460405156537753, + "grad_norm": 0.2666642367839813, + "learning_rate": 7.602165408564883e-05, + "loss": 1.8333, + "step": 11274 + }, + { + "epoch": 3.4607120933087785, + "grad_norm": 0.26958519220352173, + "learning_rate": 7.601740958379448e-05, + "loss": 1.7943, + "step": 11275 + }, + { + "epoch": 3.4610190300798034, + "grad_norm": 0.2915789783000946, + "learning_rate": 7.601316482482084e-05, + "loss": 1.7519, + "step": 11276 + }, + { + "epoch": 3.4613259668508287, + 
"grad_norm": 0.2456950694322586, + "learning_rate": 7.600891980876985e-05, + "loss": 1.8064, + "step": 11277 + }, + { + "epoch": 3.461632903621854, + "grad_norm": 0.2517867088317871, + "learning_rate": 7.600467453568348e-05, + "loss": 1.7766, + "step": 11278 + }, + { + "epoch": 3.461939840392879, + "grad_norm": 0.24567969143390656, + "learning_rate": 7.600042900560368e-05, + "loss": 1.7331, + "step": 11279 + }, + { + "epoch": 3.4622467771639043, + "grad_norm": 0.23986820876598358, + "learning_rate": 7.599618321857239e-05, + "loss": 1.7477, + "step": 11280 + }, + { + "epoch": 3.4625537139349296, + "grad_norm": 0.2555375397205353, + "learning_rate": 7.599193717463158e-05, + "loss": 1.8154, + "step": 11281 + }, + { + "epoch": 3.4628606507059545, + "grad_norm": 0.2522781193256378, + "learning_rate": 7.598769087382323e-05, + "loss": 1.7821, + "step": 11282 + }, + { + "epoch": 3.46316758747698, + "grad_norm": 0.25631004571914673, + "learning_rate": 7.598344431618926e-05, + "loss": 1.8043, + "step": 11283 + }, + { + "epoch": 3.4634745242480047, + "grad_norm": 0.2611328661441803, + "learning_rate": 7.597919750177168e-05, + "loss": 1.8036, + "step": 11284 + }, + { + "epoch": 3.46378146101903, + "grad_norm": 0.255670428276062, + "learning_rate": 7.597495043061244e-05, + "loss": 1.7375, + "step": 11285 + }, + { + "epoch": 3.4640883977900554, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.597070310275353e-05, + "loss": 1.7496, + "step": 11286 + }, + { + "epoch": 3.4643953345610803, + "grad_norm": 0.2643752992153168, + "learning_rate": 7.596645551823688e-05, + "loss": 1.8444, + "step": 11287 + }, + { + "epoch": 3.4647022713321056, + "grad_norm": 0.2564511299133301, + "learning_rate": 7.596220767710452e-05, + "loss": 1.7557, + "step": 11288 + }, + { + "epoch": 3.4650092081031305, + "grad_norm": 0.2510208487510681, + "learning_rate": 7.59579595793984e-05, + "loss": 1.7234, + "step": 11289 + }, + { + "epoch": 3.465316144874156, + "grad_norm": 0.2765158712863922, + 
"learning_rate": 7.595371122516051e-05, + "loss": 1.8215, + "step": 11290 + }, + { + "epoch": 3.465623081645181, + "grad_norm": 0.28233039379119873, + "learning_rate": 7.594946261443286e-05, + "loss": 1.7752, + "step": 11291 + }, + { + "epoch": 3.465930018416206, + "grad_norm": 0.26971468329429626, + "learning_rate": 7.594521374725735e-05, + "loss": 1.7924, + "step": 11292 + }, + { + "epoch": 3.4662369551872314, + "grad_norm": 0.29425930976867676, + "learning_rate": 7.594096462367608e-05, + "loss": 1.8144, + "step": 11293 + }, + { + "epoch": 3.4665438919582567, + "grad_norm": 0.233150452375412, + "learning_rate": 7.593671524373098e-05, + "loss": 1.7741, + "step": 11294 + }, + { + "epoch": 3.4668508287292816, + "grad_norm": 0.2947762608528137, + "learning_rate": 7.593246560746406e-05, + "loss": 1.8031, + "step": 11295 + }, + { + "epoch": 3.467157765500307, + "grad_norm": 0.250552773475647, + "learning_rate": 7.59282157149173e-05, + "loss": 1.7501, + "step": 11296 + }, + { + "epoch": 3.4674647022713323, + "grad_norm": 0.26091331243515015, + "learning_rate": 7.592396556613274e-05, + "loss": 1.836, + "step": 11297 + }, + { + "epoch": 3.467771639042357, + "grad_norm": 0.28625619411468506, + "learning_rate": 7.591971516115233e-05, + "loss": 1.7555, + "step": 11298 + }, + { + "epoch": 3.4680785758133825, + "grad_norm": 0.2723398804664612, + "learning_rate": 7.591546450001811e-05, + "loss": 1.825, + "step": 11299 + }, + { + "epoch": 3.4683855125844074, + "grad_norm": 0.24289946258068085, + "learning_rate": 7.591121358277211e-05, + "loss": 1.7441, + "step": 11300 + }, + { + "epoch": 3.4686924493554327, + "grad_norm": 0.2706952691078186, + "learning_rate": 7.590696240945629e-05, + "loss": 1.8651, + "step": 11301 + }, + { + "epoch": 3.468999386126458, + "grad_norm": 0.24632862210273743, + "learning_rate": 7.590271098011268e-05, + "loss": 1.8229, + "step": 11302 + }, + { + "epoch": 3.469306322897483, + "grad_norm": 0.29275211691856384, + "learning_rate": 7.58984592947833e-05, 
+ "loss": 1.7591, + "step": 11303 + }, + { + "epoch": 3.4696132596685083, + "grad_norm": 0.29228144884109497, + "learning_rate": 7.589420735351016e-05, + "loss": 1.8395, + "step": 11304 + }, + { + "epoch": 3.4699201964395336, + "grad_norm": 0.28339114785194397, + "learning_rate": 7.588995515633528e-05, + "loss": 1.8543, + "step": 11305 + }, + { + "epoch": 3.4702271332105585, + "grad_norm": 0.2834693193435669, + "learning_rate": 7.588570270330071e-05, + "loss": 1.826, + "step": 11306 + }, + { + "epoch": 3.470534069981584, + "grad_norm": 0.26130759716033936, + "learning_rate": 7.588144999444844e-05, + "loss": 1.7887, + "step": 11307 + }, + { + "epoch": 3.470841006752609, + "grad_norm": 0.29554685950279236, + "learning_rate": 7.587719702982052e-05, + "loss": 1.819, + "step": 11308 + }, + { + "epoch": 3.471147943523634, + "grad_norm": 0.2687968611717224, + "learning_rate": 7.587294380945898e-05, + "loss": 1.7354, + "step": 11309 + }, + { + "epoch": 3.4714548802946594, + "grad_norm": 0.28795287013053894, + "learning_rate": 7.586869033340582e-05, + "loss": 1.8267, + "step": 11310 + }, + { + "epoch": 3.4717618170656843, + "grad_norm": 0.33244553208351135, + "learning_rate": 7.58644366017031e-05, + "loss": 1.86, + "step": 11311 + }, + { + "epoch": 3.4720687538367097, + "grad_norm": 0.2878025472164154, + "learning_rate": 7.586018261439288e-05, + "loss": 1.7587, + "step": 11312 + }, + { + "epoch": 3.472375690607735, + "grad_norm": 0.26856711506843567, + "learning_rate": 7.585592837151716e-05, + "loss": 1.7351, + "step": 11313 + }, + { + "epoch": 3.47268262737876, + "grad_norm": 0.2554367780685425, + "learning_rate": 7.585167387311802e-05, + "loss": 1.7664, + "step": 11314 + }, + { + "epoch": 3.472989564149785, + "grad_norm": 0.3193204700946808, + "learning_rate": 7.584741911923748e-05, + "loss": 1.7487, + "step": 11315 + }, + { + "epoch": 3.47329650092081, + "grad_norm": 0.3227958679199219, + "learning_rate": 7.584316410991759e-05, + "loss": 1.8107, + "step": 11316 + }, + { 
+ "epoch": 3.4736034376918354, + "grad_norm": 0.33891916275024414, + "learning_rate": 7.58389088452004e-05, + "loss": 1.8466, + "step": 11317 + }, + { + "epoch": 3.4739103744628608, + "grad_norm": 0.27050724625587463, + "learning_rate": 7.583465332512797e-05, + "loss": 1.7877, + "step": 11318 + }, + { + "epoch": 3.4742173112338857, + "grad_norm": 0.2935837209224701, + "learning_rate": 7.583039754974235e-05, + "loss": 1.7932, + "step": 11319 + }, + { + "epoch": 3.474524248004911, + "grad_norm": 0.27780550718307495, + "learning_rate": 7.582614151908561e-05, + "loss": 1.8374, + "step": 11320 + }, + { + "epoch": 3.4748311847759363, + "grad_norm": 0.2579033076763153, + "learning_rate": 7.58218852331998e-05, + "loss": 1.7305, + "step": 11321 + }, + { + "epoch": 3.4751381215469612, + "grad_norm": 0.2531716227531433, + "learning_rate": 7.581762869212699e-05, + "loss": 1.8136, + "step": 11322 + }, + { + "epoch": 3.4754450583179866, + "grad_norm": 0.25504544377326965, + "learning_rate": 7.581337189590924e-05, + "loss": 1.787, + "step": 11323 + }, + { + "epoch": 3.475751995089012, + "grad_norm": 0.23659855127334595, + "learning_rate": 7.580911484458861e-05, + "loss": 1.77, + "step": 11324 + }, + { + "epoch": 3.476058931860037, + "grad_norm": 0.22556856274604797, + "learning_rate": 7.580485753820721e-05, + "loss": 1.7808, + "step": 11325 + }, + { + "epoch": 3.476365868631062, + "grad_norm": 0.2860291600227356, + "learning_rate": 7.580059997680705e-05, + "loss": 1.8224, + "step": 11326 + }, + { + "epoch": 3.476672805402087, + "grad_norm": 0.3134596645832062, + "learning_rate": 7.579634216043023e-05, + "loss": 1.8278, + "step": 11327 + }, + { + "epoch": 3.4769797421731123, + "grad_norm": 0.2883087992668152, + "learning_rate": 7.579208408911887e-05, + "loss": 1.7917, + "step": 11328 + }, + { + "epoch": 3.4772866789441377, + "grad_norm": 0.2743333578109741, + "learning_rate": 7.578782576291501e-05, + "loss": 1.8228, + "step": 11329 + }, + { + "epoch": 3.4775936157151626, + 
"grad_norm": 0.25026053190231323, + "learning_rate": 7.578356718186073e-05, + "loss": 1.7717, + "step": 11330 + }, + { + "epoch": 3.477900552486188, + "grad_norm": 0.246905118227005, + "learning_rate": 7.577930834599813e-05, + "loss": 1.7979, + "step": 11331 + }, + { + "epoch": 3.478207489257213, + "grad_norm": 0.24709418416023254, + "learning_rate": 7.577504925536929e-05, + "loss": 1.8111, + "step": 11332 + }, + { + "epoch": 3.478514426028238, + "grad_norm": 0.25685814023017883, + "learning_rate": 7.577078991001632e-05, + "loss": 1.8255, + "step": 11333 + }, + { + "epoch": 3.4788213627992635, + "grad_norm": 0.23937836289405823, + "learning_rate": 7.576653030998129e-05, + "loss": 1.7254, + "step": 11334 + }, + { + "epoch": 3.4791282995702884, + "grad_norm": 0.22638650238513947, + "learning_rate": 7.57622704553063e-05, + "loss": 1.7847, + "step": 11335 + }, + { + "epoch": 3.4794352363413137, + "grad_norm": 0.26083993911743164, + "learning_rate": 7.575801034603347e-05, + "loss": 1.7947, + "step": 11336 + }, + { + "epoch": 3.479742173112339, + "grad_norm": 0.2715466022491455, + "learning_rate": 7.575374998220488e-05, + "loss": 1.848, + "step": 11337 + }, + { + "epoch": 3.480049109883364, + "grad_norm": 0.25554224848747253, + "learning_rate": 7.574948936386262e-05, + "loss": 1.7811, + "step": 11338 + }, + { + "epoch": 3.4803560466543892, + "grad_norm": 0.2689397931098938, + "learning_rate": 7.574522849104882e-05, + "loss": 1.82, + "step": 11339 + }, + { + "epoch": 3.4806629834254146, + "grad_norm": 0.25027474761009216, + "learning_rate": 7.57409673638056e-05, + "loss": 1.775, + "step": 11340 + }, + { + "epoch": 3.4809699201964395, + "grad_norm": 0.2545457184314728, + "learning_rate": 7.573670598217504e-05, + "loss": 1.8056, + "step": 11341 + }, + { + "epoch": 3.481276856967465, + "grad_norm": 0.28404027223587036, + "learning_rate": 7.573244434619928e-05, + "loss": 1.8372, + "step": 11342 + }, + { + "epoch": 3.4815837937384897, + "grad_norm": 0.28046950697898865, + 
"learning_rate": 7.572818245592041e-05, + "loss": 1.7851, + "step": 11343 + }, + { + "epoch": 3.481890730509515, + "grad_norm": 0.23005759716033936, + "learning_rate": 7.572392031138056e-05, + "loss": 1.7059, + "step": 11344 + }, + { + "epoch": 3.4821976672805404, + "grad_norm": 0.2931719124317169, + "learning_rate": 7.571965791262185e-05, + "loss": 1.84, + "step": 11345 + }, + { + "epoch": 3.4825046040515653, + "grad_norm": 0.4399266242980957, + "learning_rate": 7.571539525968642e-05, + "loss": 1.7465, + "step": 11346 + }, + { + "epoch": 3.4828115408225906, + "grad_norm": 0.48957565426826477, + "learning_rate": 7.571113235261638e-05, + "loss": 1.8494, + "step": 11347 + }, + { + "epoch": 3.4831184775936155, + "grad_norm": 0.37828895449638367, + "learning_rate": 7.570686919145385e-05, + "loss": 1.7598, + "step": 11348 + }, + { + "epoch": 3.483425414364641, + "grad_norm": 0.22943973541259766, + "learning_rate": 7.570260577624098e-05, + "loss": 1.7443, + "step": 11349 + }, + { + "epoch": 3.483732351135666, + "grad_norm": 0.3245384991168976, + "learning_rate": 7.569834210701987e-05, + "loss": 1.7232, + "step": 11350 + }, + { + "epoch": 3.484039287906691, + "grad_norm": 0.4419693648815155, + "learning_rate": 7.569407818383271e-05, + "loss": 1.841, + "step": 11351 + }, + { + "epoch": 3.4843462246777164, + "grad_norm": 0.4061864912509918, + "learning_rate": 7.568981400672159e-05, + "loss": 1.8274, + "step": 11352 + }, + { + "epoch": 3.4846531614487417, + "grad_norm": 0.2609417736530304, + "learning_rate": 7.56855495757287e-05, + "loss": 1.8631, + "step": 11353 + }, + { + "epoch": 3.4849600982197666, + "grad_norm": 0.28758567571640015, + "learning_rate": 7.568128489089612e-05, + "loss": 1.8169, + "step": 11354 + }, + { + "epoch": 3.485267034990792, + "grad_norm": 0.40643060207366943, + "learning_rate": 7.567701995226606e-05, + "loss": 1.809, + "step": 11355 + }, + { + "epoch": 3.4855739717618173, + "grad_norm": 0.37649446725845337, + "learning_rate": 7.56727547598806e-05, 
+ "loss": 1.7661, + "step": 11356 + }, + { + "epoch": 3.485880908532842, + "grad_norm": 0.22863779962062836, + "learning_rate": 7.566848931378197e-05, + "loss": 1.808, + "step": 11357 + }, + { + "epoch": 3.4861878453038675, + "grad_norm": 0.4487019181251526, + "learning_rate": 7.566422361401226e-05, + "loss": 1.7627, + "step": 11358 + }, + { + "epoch": 3.4864947820748924, + "grad_norm": 0.4583640694618225, + "learning_rate": 7.565995766061367e-05, + "loss": 1.8186, + "step": 11359 + }, + { + "epoch": 3.4868017188459177, + "grad_norm": 0.27231526374816895, + "learning_rate": 7.565569145362833e-05, + "loss": 1.8465, + "step": 11360 + }, + { + "epoch": 3.487108655616943, + "grad_norm": 0.3877887725830078, + "learning_rate": 7.565142499309841e-05, + "loss": 1.7668, + "step": 11361 + }, + { + "epoch": 3.487415592387968, + "grad_norm": 0.5511242747306824, + "learning_rate": 7.564715827906606e-05, + "loss": 1.8417, + "step": 11362 + }, + { + "epoch": 3.4877225291589933, + "grad_norm": 0.5112231373786926, + "learning_rate": 7.564289131157348e-05, + "loss": 1.8038, + "step": 11363 + }, + { + "epoch": 3.488029465930018, + "grad_norm": 0.279502809047699, + "learning_rate": 7.56386240906628e-05, + "loss": 1.7545, + "step": 11364 + }, + { + "epoch": 3.4883364027010435, + "grad_norm": 0.30080464482307434, + "learning_rate": 7.563435661637623e-05, + "loss": 1.8136, + "step": 11365 + }, + { + "epoch": 3.488643339472069, + "grad_norm": 0.4424717128276825, + "learning_rate": 7.563008888875591e-05, + "loss": 1.7542, + "step": 11366 + }, + { + "epoch": 3.4889502762430937, + "grad_norm": 0.42144715785980225, + "learning_rate": 7.562582090784403e-05, + "loss": 1.8245, + "step": 11367 + }, + { + "epoch": 3.489257213014119, + "grad_norm": 0.2533668875694275, + "learning_rate": 7.562155267368277e-05, + "loss": 1.8654, + "step": 11368 + }, + { + "epoch": 3.4895641497851444, + "grad_norm": 0.3327534794807434, + "learning_rate": 7.56172841863143e-05, + "loss": 1.7882, + "step": 11369 + }, + { 
+ "epoch": 3.4898710865561693, + "grad_norm": 0.44001486897468567, + "learning_rate": 7.561301544578081e-05, + "loss": 1.8397, + "step": 11370 + }, + { + "epoch": 3.4901780233271946, + "grad_norm": 0.2779090106487274, + "learning_rate": 7.56087464521245e-05, + "loss": 1.7398, + "step": 11371 + }, + { + "epoch": 3.49048496009822, + "grad_norm": 0.3018067479133606, + "learning_rate": 7.560447720538755e-05, + "loss": 1.8076, + "step": 11372 + }, + { + "epoch": 3.490791896869245, + "grad_norm": 0.4370935261249542, + "learning_rate": 7.560020770561216e-05, + "loss": 1.8057, + "step": 11373 + }, + { + "epoch": 3.49109883364027, + "grad_norm": 0.2936978042125702, + "learning_rate": 7.559593795284047e-05, + "loss": 1.7726, + "step": 11374 + }, + { + "epoch": 3.491405770411295, + "grad_norm": 0.28825095295906067, + "learning_rate": 7.559166794711476e-05, + "loss": 1.8039, + "step": 11375 + }, + { + "epoch": 3.4917127071823204, + "grad_norm": 0.39334073662757874, + "learning_rate": 7.55873976884772e-05, + "loss": 1.8388, + "step": 11376 + }, + { + "epoch": 3.4920196439533457, + "grad_norm": 0.33880460262298584, + "learning_rate": 7.558312717696995e-05, + "loss": 1.7791, + "step": 11377 + }, + { + "epoch": 3.4923265807243706, + "grad_norm": 0.4433762729167938, + "learning_rate": 7.557885641263524e-05, + "loss": 1.7786, + "step": 11378 + }, + { + "epoch": 3.492633517495396, + "grad_norm": 0.4710264205932617, + "learning_rate": 7.557458539551527e-05, + "loss": 1.7193, + "step": 11379 + }, + { + "epoch": 3.4929404542664213, + "grad_norm": 0.27514326572418213, + "learning_rate": 7.557031412565228e-05, + "loss": 1.823, + "step": 11380 + }, + { + "epoch": 3.493247391037446, + "grad_norm": 0.4681413471698761, + "learning_rate": 7.556604260308846e-05, + "loss": 1.7598, + "step": 11381 + }, + { + "epoch": 3.4935543278084715, + "grad_norm": 0.5032503604888916, + "learning_rate": 7.556177082786602e-05, + "loss": 1.741, + "step": 11382 + }, + { + "epoch": 3.493861264579497, + 
"grad_norm": 0.2677086889743805, + "learning_rate": 7.555749880002716e-05, + "loss": 1.8528, + "step": 11383 + }, + { + "epoch": 3.4941682013505218, + "grad_norm": 0.43870940804481506, + "learning_rate": 7.555322651961414e-05, + "loss": 1.7632, + "step": 11384 + }, + { + "epoch": 3.494475138121547, + "grad_norm": 0.5403209924697876, + "learning_rate": 7.554895398666914e-05, + "loss": 1.8181, + "step": 11385 + }, + { + "epoch": 3.494782074892572, + "grad_norm": 0.2714318335056305, + "learning_rate": 7.554468120123441e-05, + "loss": 1.8151, + "step": 11386 + }, + { + "epoch": 3.4950890116635973, + "grad_norm": 0.49661698937416077, + "learning_rate": 7.554040816335217e-05, + "loss": 1.8116, + "step": 11387 + }, + { + "epoch": 3.4953959484346226, + "grad_norm": 0.49954715371131897, + "learning_rate": 7.553613487306465e-05, + "loss": 1.8841, + "step": 11388 + }, + { + "epoch": 3.4957028852056475, + "grad_norm": 0.28189441561698914, + "learning_rate": 7.553186133041406e-05, + "loss": 1.7834, + "step": 11389 + }, + { + "epoch": 3.496009821976673, + "grad_norm": 0.36029115319252014, + "learning_rate": 7.552758753544267e-05, + "loss": 1.7796, + "step": 11390 + }, + { + "epoch": 3.4963167587476978, + "grad_norm": 0.45023465156555176, + "learning_rate": 7.552331348819268e-05, + "loss": 1.8773, + "step": 11391 + }, + { + "epoch": 3.496623695518723, + "grad_norm": 0.3235788643360138, + "learning_rate": 7.551903918870636e-05, + "loss": 1.7984, + "step": 11392 + }, + { + "epoch": 3.4969306322897484, + "grad_norm": 0.25656190514564514, + "learning_rate": 7.551476463702596e-05, + "loss": 1.8403, + "step": 11393 + }, + { + "epoch": 3.4972375690607733, + "grad_norm": 0.2866458594799042, + "learning_rate": 7.551048983319366e-05, + "loss": 1.7428, + "step": 11394 + }, + { + "epoch": 3.4975445058317987, + "grad_norm": 0.2713877856731415, + "learning_rate": 7.550621477725177e-05, + "loss": 1.8508, + "step": 11395 + }, + { + "epoch": 3.497851442602824, + "grad_norm": 0.27978867292404175, 
+ "learning_rate": 7.55019394692425e-05, + "loss": 1.8049, + "step": 11396 + }, + { + "epoch": 3.498158379373849, + "grad_norm": 0.3275020122528076, + "learning_rate": 7.549766390920814e-05, + "loss": 1.8553, + "step": 11397 + }, + { + "epoch": 3.498465316144874, + "grad_norm": 0.29947492480278015, + "learning_rate": 7.54933880971909e-05, + "loss": 1.7614, + "step": 11398 + }, + { + "epoch": 3.4987722529158995, + "grad_norm": 0.25790849328041077, + "learning_rate": 7.548911203323308e-05, + "loss": 1.8223, + "step": 11399 + }, + { + "epoch": 3.4990791896869244, + "grad_norm": 0.3145451545715332, + "learning_rate": 7.54848357173769e-05, + "loss": 1.7642, + "step": 11400 + }, + { + "epoch": 3.4993861264579498, + "grad_norm": 0.29052913188934326, + "learning_rate": 7.548055914966463e-05, + "loss": 1.7728, + "step": 11401 + }, + { + "epoch": 3.4996930632289747, + "grad_norm": 0.2741037905216217, + "learning_rate": 7.547628233013854e-05, + "loss": 1.7382, + "step": 11402 + }, + { + "epoch": 3.5, + "grad_norm": 0.2562723755836487, + "learning_rate": 7.54720052588409e-05, + "loss": 1.7455, + "step": 11403 + }, + { + "epoch": 3.5003069367710253, + "grad_norm": 0.27649983763694763, + "learning_rate": 7.546772793581398e-05, + "loss": 1.7194, + "step": 11404 + }, + { + "epoch": 3.5006138735420502, + "grad_norm": 0.27290579676628113, + "learning_rate": 7.546345036110004e-05, + "loss": 1.87, + "step": 11405 + }, + { + "epoch": 3.5009208103130756, + "grad_norm": 0.33585605025291443, + "learning_rate": 7.545917253474136e-05, + "loss": 1.7703, + "step": 11406 + }, + { + "epoch": 3.5012277470841005, + "grad_norm": 0.2592691481113434, + "learning_rate": 7.545489445678022e-05, + "loss": 1.7657, + "step": 11407 + }, + { + "epoch": 3.501534683855126, + "grad_norm": 0.3081367015838623, + "learning_rate": 7.545061612725888e-05, + "loss": 1.8067, + "step": 11408 + }, + { + "epoch": 3.501841620626151, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.544633754621965e-05, + "loss": 
1.8009, + "step": 11409 + }, + { + "epoch": 3.5021485573971765, + "grad_norm": 0.28232479095458984, + "learning_rate": 7.54420587137048e-05, + "loss": 1.8124, + "step": 11410 + }, + { + "epoch": 3.5024554941682013, + "grad_norm": 0.24079222977161407, + "learning_rate": 7.54377796297566e-05, + "loss": 1.789, + "step": 11411 + }, + { + "epoch": 3.5027624309392267, + "grad_norm": 0.27347204089164734, + "learning_rate": 7.543350029441737e-05, + "loss": 1.7704, + "step": 11412 + }, + { + "epoch": 3.5030693677102516, + "grad_norm": 0.25545811653137207, + "learning_rate": 7.542922070772935e-05, + "loss": 1.7871, + "step": 11413 + }, + { + "epoch": 3.503376304481277, + "grad_norm": 0.2507263123989105, + "learning_rate": 7.54249408697349e-05, + "loss": 1.8424, + "step": 11414 + }, + { + "epoch": 3.5036832412523022, + "grad_norm": 0.2776084244251251, + "learning_rate": 7.542066078047627e-05, + "loss": 1.8246, + "step": 11415 + }, + { + "epoch": 3.503990178023327, + "grad_norm": 0.32833749055862427, + "learning_rate": 7.541638043999577e-05, + "loss": 1.7785, + "step": 11416 + }, + { + "epoch": 3.5042971147943525, + "grad_norm": 0.258486270904541, + "learning_rate": 7.541209984833571e-05, + "loss": 1.7543, + "step": 11417 + }, + { + "epoch": 3.5046040515653774, + "grad_norm": 0.25825178623199463, + "learning_rate": 7.540781900553837e-05, + "loss": 1.7939, + "step": 11418 + }, + { + "epoch": 3.5049109883364027, + "grad_norm": 0.26980888843536377, + "learning_rate": 7.540353791164606e-05, + "loss": 1.7777, + "step": 11419 + }, + { + "epoch": 3.505217925107428, + "grad_norm": 0.24103333055973053, + "learning_rate": 7.539925656670111e-05, + "loss": 1.7565, + "step": 11420 + }, + { + "epoch": 3.505524861878453, + "grad_norm": 0.25192007422447205, + "learning_rate": 7.539497497074584e-05, + "loss": 1.7696, + "step": 11421 + }, + { + "epoch": 3.5058317986494782, + "grad_norm": 0.218489870429039, + "learning_rate": 7.539069312382252e-05, + "loss": 1.761, + "step": 11422 + }, + { + 
"epoch": 3.506138735420503, + "grad_norm": 0.27533552050590515, + "learning_rate": 7.53864110259735e-05, + "loss": 1.7374, + "step": 11423 + }, + { + "epoch": 3.5064456721915285, + "grad_norm": 0.2603490650653839, + "learning_rate": 7.538212867724108e-05, + "loss": 1.8342, + "step": 11424 + }, + { + "epoch": 3.506752608962554, + "grad_norm": 0.27340635657310486, + "learning_rate": 7.537784607766758e-05, + "loss": 1.8099, + "step": 11425 + }, + { + "epoch": 3.507059545733579, + "grad_norm": 0.25342679023742676, + "learning_rate": 7.537356322729537e-05, + "loss": 1.7949, + "step": 11426 + }, + { + "epoch": 3.507366482504604, + "grad_norm": 0.292819082736969, + "learning_rate": 7.536928012616669e-05, + "loss": 1.9049, + "step": 11427 + }, + { + "epoch": 3.5076734192756294, + "grad_norm": 0.28256532549858093, + "learning_rate": 7.536499677432393e-05, + "loss": 1.8464, + "step": 11428 + }, + { + "epoch": 3.5079803560466543, + "grad_norm": 0.2672989070415497, + "learning_rate": 7.536071317180942e-05, + "loss": 1.8301, + "step": 11429 + }, + { + "epoch": 3.5082872928176796, + "grad_norm": 0.2525518238544464, + "learning_rate": 7.535642931866546e-05, + "loss": 1.8054, + "step": 11430 + }, + { + "epoch": 3.508594229588705, + "grad_norm": 0.2622447609901428, + "learning_rate": 7.535214521493442e-05, + "loss": 1.8293, + "step": 11431 + }, + { + "epoch": 3.50890116635973, + "grad_norm": 0.27057385444641113, + "learning_rate": 7.534786086065859e-05, + "loss": 1.7426, + "step": 11432 + }, + { + "epoch": 3.509208103130755, + "grad_norm": 0.27363866567611694, + "learning_rate": 7.534357625588038e-05, + "loss": 1.7138, + "step": 11433 + }, + { + "epoch": 3.50951503990178, + "grad_norm": 0.3029060363769531, + "learning_rate": 7.533929140064207e-05, + "loss": 1.864, + "step": 11434 + }, + { + "epoch": 3.5098219766728054, + "grad_norm": 0.3144821524620056, + "learning_rate": 7.533500629498604e-05, + "loss": 1.7846, + "step": 11435 + }, + { + "epoch": 3.5101289134438307, + "grad_norm": 
0.44535213708877563, + "learning_rate": 7.533072093895461e-05, + "loss": 1.799, + "step": 11436 + }, + { + "epoch": 3.5104358502148556, + "grad_norm": 0.25344160199165344, + "learning_rate": 7.532643533259017e-05, + "loss": 1.7391, + "step": 11437 + }, + { + "epoch": 3.510742786985881, + "grad_norm": 0.286026269197464, + "learning_rate": 7.532214947593506e-05, + "loss": 1.8436, + "step": 11438 + }, + { + "epoch": 3.511049723756906, + "grad_norm": 0.3317352533340454, + "learning_rate": 7.53178633690316e-05, + "loss": 1.8507, + "step": 11439 + }, + { + "epoch": 3.511356660527931, + "grad_norm": 0.2547265589237213, + "learning_rate": 7.53135770119222e-05, + "loss": 1.7483, + "step": 11440 + }, + { + "epoch": 3.5116635972989565, + "grad_norm": 0.24281835556030273, + "learning_rate": 7.530929040464917e-05, + "loss": 1.759, + "step": 11441 + }, + { + "epoch": 3.511970534069982, + "grad_norm": 0.2935381829738617, + "learning_rate": 7.530500354725491e-05, + "loss": 1.8235, + "step": 11442 + }, + { + "epoch": 3.5122774708410067, + "grad_norm": 0.26642969250679016, + "learning_rate": 7.53007164397818e-05, + "loss": 1.8324, + "step": 11443 + }, + { + "epoch": 3.512584407612032, + "grad_norm": 0.24830882251262665, + "learning_rate": 7.529642908227215e-05, + "loss": 1.8132, + "step": 11444 + }, + { + "epoch": 3.512891344383057, + "grad_norm": 0.3100191056728363, + "learning_rate": 7.529214147476838e-05, + "loss": 1.8453, + "step": 11445 + }, + { + "epoch": 3.5131982811540823, + "grad_norm": 0.27948811650276184, + "learning_rate": 7.528785361731282e-05, + "loss": 1.7792, + "step": 11446 + }, + { + "epoch": 3.5135052179251076, + "grad_norm": 0.26978832483291626, + "learning_rate": 7.528356550994787e-05, + "loss": 1.7857, + "step": 11447 + }, + { + "epoch": 3.5138121546961325, + "grad_norm": 0.30527836084365845, + "learning_rate": 7.527927715271592e-05, + "loss": 1.807, + "step": 11448 + }, + { + "epoch": 3.514119091467158, + "grad_norm": 0.2915664315223694, + "learning_rate": 
7.527498854565934e-05, + "loss": 1.8414, + "step": 11449 + }, + { + "epoch": 3.5144260282381827, + "grad_norm": 0.2854034900665283, + "learning_rate": 7.52706996888205e-05, + "loss": 1.793, + "step": 11450 + }, + { + "epoch": 3.514732965009208, + "grad_norm": 0.30281978845596313, + "learning_rate": 7.52664105822418e-05, + "loss": 1.7896, + "step": 11451 + }, + { + "epoch": 3.5150399017802334, + "grad_norm": 0.3317166566848755, + "learning_rate": 7.526212122596561e-05, + "loss": 1.7776, + "step": 11452 + }, + { + "epoch": 3.5153468385512583, + "grad_norm": 0.3400021195411682, + "learning_rate": 7.525783162003434e-05, + "loss": 1.8411, + "step": 11453 + }, + { + "epoch": 3.5156537753222836, + "grad_norm": 0.25169485807418823, + "learning_rate": 7.525354176449037e-05, + "loss": 1.7871, + "step": 11454 + }, + { + "epoch": 3.5159607120933085, + "grad_norm": 0.3442455530166626, + "learning_rate": 7.52492516593761e-05, + "loss": 1.7644, + "step": 11455 + }, + { + "epoch": 3.516267648864334, + "grad_norm": 0.35644033551216125, + "learning_rate": 7.524496130473394e-05, + "loss": 1.801, + "step": 11456 + }, + { + "epoch": 3.516574585635359, + "grad_norm": 0.3180185854434967, + "learning_rate": 7.524067070060625e-05, + "loss": 1.7897, + "step": 11457 + }, + { + "epoch": 3.5168815224063845, + "grad_norm": 0.2417978048324585, + "learning_rate": 7.523637984703548e-05, + "loss": 1.8527, + "step": 11458 + }, + { + "epoch": 3.5171884591774094, + "grad_norm": 0.29661375284194946, + "learning_rate": 7.5232088744064e-05, + "loss": 1.8276, + "step": 11459 + }, + { + "epoch": 3.5174953959484347, + "grad_norm": 0.2467545121908188, + "learning_rate": 7.522779739173424e-05, + "loss": 1.7819, + "step": 11460 + }, + { + "epoch": 3.5178023327194596, + "grad_norm": 0.26177898049354553, + "learning_rate": 7.522350579008859e-05, + "loss": 1.8017, + "step": 11461 + }, + { + "epoch": 3.518109269490485, + "grad_norm": 0.28740498423576355, + "learning_rate": 7.521921393916948e-05, + "loss": 1.7863, 
+ "step": 11462 + }, + { + "epoch": 3.5184162062615103, + "grad_norm": 0.28685200214385986, + "learning_rate": 7.521492183901932e-05, + "loss": 1.8069, + "step": 11463 + }, + { + "epoch": 3.518723143032535, + "grad_norm": 0.24174338579177856, + "learning_rate": 7.521062948968051e-05, + "loss": 1.7523, + "step": 11464 + }, + { + "epoch": 3.5190300798035605, + "grad_norm": 0.23273243010044098, + "learning_rate": 7.520633689119548e-05, + "loss": 1.7827, + "step": 11465 + }, + { + "epoch": 3.5193370165745854, + "grad_norm": 0.22708217799663544, + "learning_rate": 7.520204404360667e-05, + "loss": 1.7377, + "step": 11466 + }, + { + "epoch": 3.5196439533456108, + "grad_norm": 0.24725353717803955, + "learning_rate": 7.519775094695649e-05, + "loss": 1.7828, + "step": 11467 + }, + { + "epoch": 3.519950890116636, + "grad_norm": 0.23046265542507172, + "learning_rate": 7.519345760128736e-05, + "loss": 1.7427, + "step": 11468 + }, + { + "epoch": 3.520257826887661, + "grad_norm": 0.2618728280067444, + "learning_rate": 7.518916400664171e-05, + "loss": 1.8133, + "step": 11469 + }, + { + "epoch": 3.5205647636586863, + "grad_norm": 0.23232363164424896, + "learning_rate": 7.5184870163062e-05, + "loss": 1.7468, + "step": 11470 + }, + { + "epoch": 3.520871700429711, + "grad_norm": 0.21993626654148102, + "learning_rate": 7.51805760705906e-05, + "loss": 1.7565, + "step": 11471 + }, + { + "epoch": 3.5211786372007365, + "grad_norm": 0.23563124239444733, + "learning_rate": 7.517628172927001e-05, + "loss": 1.7795, + "step": 11472 + }, + { + "epoch": 3.521485573971762, + "grad_norm": 0.24502862989902496, + "learning_rate": 7.517198713914266e-05, + "loss": 1.813, + "step": 11473 + }, + { + "epoch": 3.521792510742787, + "grad_norm": 0.24745969474315643, + "learning_rate": 7.516769230025097e-05, + "loss": 1.7601, + "step": 11474 + }, + { + "epoch": 3.522099447513812, + "grad_norm": 0.27686986327171326, + "learning_rate": 7.516339721263739e-05, + "loss": 1.8121, + "step": 11475 + }, + { + "epoch": 
3.5224063842848374, + "grad_norm": 0.3110332787036896, + "learning_rate": 7.515910187634439e-05, + "loss": 1.7978, + "step": 11476 + }, + { + "epoch": 3.5227133210558623, + "grad_norm": 0.3394792377948761, + "learning_rate": 7.515480629141436e-05, + "loss": 1.8427, + "step": 11477 + }, + { + "epoch": 3.5230202578268877, + "grad_norm": 0.2802537679672241, + "learning_rate": 7.515051045788984e-05, + "loss": 1.7343, + "step": 11478 + }, + { + "epoch": 3.523327194597913, + "grad_norm": 0.23687711358070374, + "learning_rate": 7.514621437581319e-05, + "loss": 1.7786, + "step": 11479 + }, + { + "epoch": 3.523634131368938, + "grad_norm": 0.31114310026168823, + "learning_rate": 7.514191804522693e-05, + "loss": 1.8137, + "step": 11480 + }, + { + "epoch": 3.523941068139963, + "grad_norm": 0.3257891833782196, + "learning_rate": 7.513762146617351e-05, + "loss": 1.8015, + "step": 11481 + }, + { + "epoch": 3.524248004910988, + "grad_norm": 0.24353443086147308, + "learning_rate": 7.513332463869536e-05, + "loss": 1.7485, + "step": 11482 + }, + { + "epoch": 3.5245549416820134, + "grad_norm": 0.29861485958099365, + "learning_rate": 7.512902756283498e-05, + "loss": 1.7993, + "step": 11483 + }, + { + "epoch": 3.5248618784530388, + "grad_norm": 0.40380924940109253, + "learning_rate": 7.51247302386348e-05, + "loss": 1.7664, + "step": 11484 + }, + { + "epoch": 3.525168815224064, + "grad_norm": 0.3365862965583801, + "learning_rate": 7.512043266613733e-05, + "loss": 1.7512, + "step": 11485 + }, + { + "epoch": 3.525475751995089, + "grad_norm": 0.2502824068069458, + "learning_rate": 7.511613484538502e-05, + "loss": 1.8414, + "step": 11486 + }, + { + "epoch": 3.5257826887661143, + "grad_norm": 0.2598603069782257, + "learning_rate": 7.511183677642034e-05, + "loss": 1.7358, + "step": 11487 + }, + { + "epoch": 3.5260896255371392, + "grad_norm": 0.30246880650520325, + "learning_rate": 7.510753845928576e-05, + "loss": 1.791, + "step": 11488 + }, + { + "epoch": 3.5263965623081646, + "grad_norm": 
0.25170832872390747, + "learning_rate": 7.510323989402378e-05, + "loss": 1.7498, + "step": 11489 + }, + { + "epoch": 3.52670349907919, + "grad_norm": 0.2925282418727875, + "learning_rate": 7.509894108067688e-05, + "loss": 1.8413, + "step": 11490 + }, + { + "epoch": 3.527010435850215, + "grad_norm": 0.2643601596355438, + "learning_rate": 7.509464201928752e-05, + "loss": 1.8052, + "step": 11491 + }, + { + "epoch": 3.52731737262124, + "grad_norm": 0.2938917279243469, + "learning_rate": 7.50903427098982e-05, + "loss": 1.7308, + "step": 11492 + }, + { + "epoch": 3.527624309392265, + "grad_norm": 0.2978343367576599, + "learning_rate": 7.508604315255142e-05, + "loss": 1.8147, + "step": 11493 + }, + { + "epoch": 3.5279312461632903, + "grad_norm": 0.2507816255092621, + "learning_rate": 7.508174334728963e-05, + "loss": 1.774, + "step": 11494 + }, + { + "epoch": 3.5282381829343157, + "grad_norm": 0.32971861958503723, + "learning_rate": 7.507744329415538e-05, + "loss": 1.7634, + "step": 11495 + }, + { + "epoch": 3.5285451197053406, + "grad_norm": 0.3149639964103699, + "learning_rate": 7.507314299319113e-05, + "loss": 1.8032, + "step": 11496 + }, + { + "epoch": 3.528852056476366, + "grad_norm": 0.2721364498138428, + "learning_rate": 7.506884244443937e-05, + "loss": 1.7702, + "step": 11497 + }, + { + "epoch": 3.529158993247391, + "grad_norm": 0.29375985264778137, + "learning_rate": 7.506454164794263e-05, + "loss": 1.8673, + "step": 11498 + }, + { + "epoch": 3.529465930018416, + "grad_norm": 0.379944384098053, + "learning_rate": 7.50602406037434e-05, + "loss": 1.883, + "step": 11499 + }, + { + "epoch": 3.5297728667894415, + "grad_norm": 0.4041840136051178, + "learning_rate": 7.505593931188417e-05, + "loss": 1.7998, + "step": 11500 + }, + { + "epoch": 3.530079803560467, + "grad_norm": 0.30013784766197205, + "learning_rate": 7.505163777240747e-05, + "loss": 1.775, + "step": 11501 + }, + { + "epoch": 3.5303867403314917, + "grad_norm": 0.25161153078079224, + "learning_rate": 
7.50473359853558e-05, + "loss": 1.8609, + "step": 11502 + }, + { + "epoch": 3.530693677102517, + "grad_norm": 0.2803831100463867, + "learning_rate": 7.504303395077168e-05, + "loss": 1.8397, + "step": 11503 + }, + { + "epoch": 3.531000613873542, + "grad_norm": 0.26678118109703064, + "learning_rate": 7.503873166869762e-05, + "loss": 1.7877, + "step": 11504 + }, + { + "epoch": 3.5313075506445673, + "grad_norm": 0.24280449748039246, + "learning_rate": 7.503442913917613e-05, + "loss": 1.7891, + "step": 11505 + }, + { + "epoch": 3.5316144874155926, + "grad_norm": 0.26461485028266907, + "learning_rate": 7.503012636224976e-05, + "loss": 1.7993, + "step": 11506 + }, + { + "epoch": 3.5319214241866175, + "grad_norm": 0.27001824975013733, + "learning_rate": 7.502582333796098e-05, + "loss": 1.7719, + "step": 11507 + }, + { + "epoch": 3.532228360957643, + "grad_norm": 0.27585846185684204, + "learning_rate": 7.502152006635237e-05, + "loss": 1.7412, + "step": 11508 + }, + { + "epoch": 3.5325352977286677, + "grad_norm": 0.24896648526191711, + "learning_rate": 7.501721654746643e-05, + "loss": 1.7459, + "step": 11509 + }, + { + "epoch": 3.532842234499693, + "grad_norm": 0.2308502197265625, + "learning_rate": 7.501291278134569e-05, + "loss": 1.7717, + "step": 11510 + }, + { + "epoch": 3.5331491712707184, + "grad_norm": 0.3026069104671478, + "learning_rate": 7.500860876803267e-05, + "loss": 1.8578, + "step": 11511 + }, + { + "epoch": 3.5334561080417433, + "grad_norm": 0.30242082476615906, + "learning_rate": 7.500430450756995e-05, + "loss": 1.7793, + "step": 11512 + }, + { + "epoch": 3.5337630448127686, + "grad_norm": 0.2583339214324951, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8388, + "step": 11513 + }, + { + "epoch": 3.5340699815837935, + "grad_norm": 0.29673871397972107, + "learning_rate": 7.499569524536542e-05, + "loss": 1.7749, + "step": 11514 + }, + { + "epoch": 3.534376918354819, + "grad_norm": 0.35199788212776184, + "learning_rate": 7.499139024370874e-05, + "loss": 
1.7863, + "step": 11515 + }, + { + "epoch": 3.534683855125844, + "grad_norm": 0.25776436924934387, + "learning_rate": 7.498708499507247e-05, + "loss": 1.7568, + "step": 11516 + }, + { + "epoch": 3.5349907918968695, + "grad_norm": 0.26081520318984985, + "learning_rate": 7.498277949949919e-05, + "loss": 1.807, + "step": 11517 + }, + { + "epoch": 3.5352977286678944, + "grad_norm": 0.29247912764549255, + "learning_rate": 7.497847375703145e-05, + "loss": 1.7568, + "step": 11518 + }, + { + "epoch": 3.5356046654389197, + "grad_norm": 0.20964498817920685, + "learning_rate": 7.497416776771178e-05, + "loss": 1.7601, + "step": 11519 + }, + { + "epoch": 3.5359116022099446, + "grad_norm": 0.28739818930625916, + "learning_rate": 7.496986153158273e-05, + "loss": 1.7915, + "step": 11520 + }, + { + "epoch": 3.53621853898097, + "grad_norm": 0.3109932839870453, + "learning_rate": 7.496555504868691e-05, + "loss": 1.8046, + "step": 11521 + }, + { + "epoch": 3.5365254757519953, + "grad_norm": 0.259284108877182, + "learning_rate": 7.496124831906681e-05, + "loss": 1.7595, + "step": 11522 + }, + { + "epoch": 3.53683241252302, + "grad_norm": 0.265909343957901, + "learning_rate": 7.495694134276504e-05, + "loss": 1.8249, + "step": 11523 + }, + { + "epoch": 3.5371393492940455, + "grad_norm": 0.2478799819946289, + "learning_rate": 7.495263411982415e-05, + "loss": 1.8531, + "step": 11524 + }, + { + "epoch": 3.5374462860650704, + "grad_norm": 0.2636432945728302, + "learning_rate": 7.494832665028671e-05, + "loss": 1.8114, + "step": 11525 + }, + { + "epoch": 3.5377532228360957, + "grad_norm": 0.25323864817619324, + "learning_rate": 7.494401893419527e-05, + "loss": 1.8271, + "step": 11526 + }, + { + "epoch": 3.538060159607121, + "grad_norm": 0.2352467179298401, + "learning_rate": 7.493971097159241e-05, + "loss": 1.7524, + "step": 11527 + }, + { + "epoch": 3.538367096378146, + "grad_norm": 0.2788623869419098, + "learning_rate": 7.493540276252072e-05, + "loss": 1.8238, + "step": 11528 + }, + { + 
"epoch": 3.5386740331491713, + "grad_norm": 0.3506326377391815, + "learning_rate": 7.493109430702277e-05, + "loss": 1.8525, + "step": 11529 + }, + { + "epoch": 3.538980969920196, + "grad_norm": 0.3685263395309448, + "learning_rate": 7.492678560514113e-05, + "loss": 1.8497, + "step": 11530 + }, + { + "epoch": 3.5392879066912215, + "grad_norm": 0.32200056314468384, + "learning_rate": 7.492247665691837e-05, + "loss": 1.7587, + "step": 11531 + }, + { + "epoch": 3.539594843462247, + "grad_norm": 0.2800062894821167, + "learning_rate": 7.49181674623971e-05, + "loss": 1.8188, + "step": 11532 + }, + { + "epoch": 3.539901780233272, + "grad_norm": 0.24137580394744873, + "learning_rate": 7.491385802161989e-05, + "loss": 1.7947, + "step": 11533 + }, + { + "epoch": 3.540208717004297, + "grad_norm": 0.21900027990341187, + "learning_rate": 7.490954833462933e-05, + "loss": 1.7722, + "step": 11534 + }, + { + "epoch": 3.5405156537753224, + "grad_norm": 0.25009945034980774, + "learning_rate": 7.490523840146803e-05, + "loss": 1.8173, + "step": 11535 + }, + { + "epoch": 3.5408225905463473, + "grad_norm": 0.2778431475162506, + "learning_rate": 7.490092822217855e-05, + "loss": 1.8368, + "step": 11536 + }, + { + "epoch": 3.5411295273173726, + "grad_norm": 0.2845982611179352, + "learning_rate": 7.48966177968035e-05, + "loss": 1.7539, + "step": 11537 + }, + { + "epoch": 3.541436464088398, + "grad_norm": 0.27480921149253845, + "learning_rate": 7.48923071253855e-05, + "loss": 1.8494, + "step": 11538 + }, + { + "epoch": 3.541743400859423, + "grad_norm": 0.2722087502479553, + "learning_rate": 7.488799620796711e-05, + "loss": 1.8422, + "step": 11539 + }, + { + "epoch": 3.542050337630448, + "grad_norm": 0.2984340190887451, + "learning_rate": 7.488368504459097e-05, + "loss": 1.8042, + "step": 11540 + }, + { + "epoch": 3.542357274401473, + "grad_norm": 0.2405850738286972, + "learning_rate": 7.487937363529966e-05, + "loss": 1.749, + "step": 11541 + }, + { + "epoch": 3.5426642111724984, + "grad_norm": 
0.24816973507404327, + "learning_rate": 7.487506198013579e-05, + "loss": 1.8671, + "step": 11542 + }, + { + "epoch": 3.5429711479435237, + "grad_norm": 0.2796473503112793, + "learning_rate": 7.487075007914199e-05, + "loss": 1.8023, + "step": 11543 + }, + { + "epoch": 3.5432780847145486, + "grad_norm": 0.2600162625312805, + "learning_rate": 7.486643793236086e-05, + "loss": 1.7997, + "step": 11544 + }, + { + "epoch": 3.543585021485574, + "grad_norm": 0.2746226489543915, + "learning_rate": 7.486212553983503e-05, + "loss": 1.7773, + "step": 11545 + }, + { + "epoch": 3.5438919582565993, + "grad_norm": 0.24142079055309296, + "learning_rate": 7.485781290160708e-05, + "loss": 1.791, + "step": 11546 + }, + { + "epoch": 3.544198895027624, + "grad_norm": 0.2472934126853943, + "learning_rate": 7.485350001771966e-05, + "loss": 1.8183, + "step": 11547 + }, + { + "epoch": 3.5445058317986495, + "grad_norm": 0.26891404390335083, + "learning_rate": 7.48491868882154e-05, + "loss": 1.7421, + "step": 11548 + }, + { + "epoch": 3.544812768569675, + "grad_norm": 0.24820464849472046, + "learning_rate": 7.48448735131369e-05, + "loss": 1.7372, + "step": 11549 + }, + { + "epoch": 3.5451197053406998, + "grad_norm": 0.2456594705581665, + "learning_rate": 7.484055989252679e-05, + "loss": 1.7883, + "step": 11550 + }, + { + "epoch": 3.545426642111725, + "grad_norm": 0.32420551776885986, + "learning_rate": 7.48362460264277e-05, + "loss": 1.8363, + "step": 11551 + }, + { + "epoch": 3.54573357888275, + "grad_norm": 0.3187662661075592, + "learning_rate": 7.483193191488229e-05, + "loss": 1.7957, + "step": 11552 + }, + { + "epoch": 3.5460405156537753, + "grad_norm": 0.2845410108566284, + "learning_rate": 7.482761755793316e-05, + "loss": 1.8288, + "step": 11553 + }, + { + "epoch": 3.5463474524248007, + "grad_norm": 0.2816021740436554, + "learning_rate": 7.482330295562298e-05, + "loss": 1.7562, + "step": 11554 + }, + { + "epoch": 3.5466543891958255, + "grad_norm": 0.28938058018684387, + "learning_rate": 
7.481898810799435e-05, + "loss": 1.8139, + "step": 11555 + }, + { + "epoch": 3.546961325966851, + "grad_norm": 0.3305707573890686, + "learning_rate": 7.481467301508995e-05, + "loss": 1.8956, + "step": 11556 + }, + { + "epoch": 3.5472682627378758, + "grad_norm": 0.3890376091003418, + "learning_rate": 7.48103576769524e-05, + "loss": 1.8552, + "step": 11557 + }, + { + "epoch": 3.547575199508901, + "grad_norm": 0.3900652825832367, + "learning_rate": 7.480604209362434e-05, + "loss": 1.7748, + "step": 11558 + }, + { + "epoch": 3.5478821362799264, + "grad_norm": 0.3297326862812042, + "learning_rate": 7.480172626514845e-05, + "loss": 1.8201, + "step": 11559 + }, + { + "epoch": 3.5481890730509518, + "grad_norm": 0.28797218203544617, + "learning_rate": 7.479741019156737e-05, + "loss": 1.7652, + "step": 11560 + }, + { + "epoch": 3.5484960098219767, + "grad_norm": 0.2764691114425659, + "learning_rate": 7.479309387292373e-05, + "loss": 1.7534, + "step": 11561 + }, + { + "epoch": 3.548802946593002, + "grad_norm": 0.25067585706710815, + "learning_rate": 7.47887773092602e-05, + "loss": 1.7849, + "step": 11562 + }, + { + "epoch": 3.549109883364027, + "grad_norm": 0.29966798424720764, + "learning_rate": 7.478446050061947e-05, + "loss": 1.8299, + "step": 11563 + }, + { + "epoch": 3.549416820135052, + "grad_norm": 0.24068406224250793, + "learning_rate": 7.478014344704416e-05, + "loss": 1.8366, + "step": 11564 + }, + { + "epoch": 3.5497237569060776, + "grad_norm": 0.2559303641319275, + "learning_rate": 7.477582614857695e-05, + "loss": 1.7665, + "step": 11565 + }, + { + "epoch": 3.5500306936771024, + "grad_norm": 0.24617858231067657, + "learning_rate": 7.47715086052605e-05, + "loss": 1.8334, + "step": 11566 + }, + { + "epoch": 3.550337630448128, + "grad_norm": 0.2433501034975052, + "learning_rate": 7.476719081713749e-05, + "loss": 1.7963, + "step": 11567 + }, + { + "epoch": 3.5506445672191527, + "grad_norm": 0.2583518326282501, + "learning_rate": 7.476287278425057e-05, + "loss": 1.8311, 
+ "step": 11568 + }, + { + "epoch": 3.550951503990178, + "grad_norm": 0.3232485055923462, + "learning_rate": 7.475855450664244e-05, + "loss": 1.9162, + "step": 11569 + }, + { + "epoch": 3.5512584407612033, + "grad_norm": 0.28247153759002686, + "learning_rate": 7.475423598435576e-05, + "loss": 1.8027, + "step": 11570 + }, + { + "epoch": 3.5515653775322282, + "grad_norm": 0.27201834321022034, + "learning_rate": 7.47499172174332e-05, + "loss": 1.7822, + "step": 11571 + }, + { + "epoch": 3.5518723143032536, + "grad_norm": 0.2408471554517746, + "learning_rate": 7.474559820591748e-05, + "loss": 1.7735, + "step": 11572 + }, + { + "epoch": 3.5521792510742785, + "grad_norm": 0.24187393486499786, + "learning_rate": 7.474127894985124e-05, + "loss": 1.7931, + "step": 11573 + }, + { + "epoch": 3.552486187845304, + "grad_norm": 0.2759699523448944, + "learning_rate": 7.473695944927717e-05, + "loss": 1.8407, + "step": 11574 + }, + { + "epoch": 3.552793124616329, + "grad_norm": 0.2503111958503723, + "learning_rate": 7.473263970423797e-05, + "loss": 1.7613, + "step": 11575 + }, + { + "epoch": 3.5531000613873545, + "grad_norm": 0.24795177578926086, + "learning_rate": 7.472831971477633e-05, + "loss": 1.8221, + "step": 11576 + }, + { + "epoch": 3.5534069981583793, + "grad_norm": 0.23190177977085114, + "learning_rate": 7.472399948093494e-05, + "loss": 1.7541, + "step": 11577 + }, + { + "epoch": 3.5537139349294047, + "grad_norm": 0.24650825560092926, + "learning_rate": 7.471967900275653e-05, + "loss": 1.8002, + "step": 11578 + }, + { + "epoch": 3.5540208717004296, + "grad_norm": 0.256598562002182, + "learning_rate": 7.471535828028372e-05, + "loss": 1.7052, + "step": 11579 + }, + { + "epoch": 3.554327808471455, + "grad_norm": 0.2715381681919098, + "learning_rate": 7.471103731355926e-05, + "loss": 1.7701, + "step": 11580 + }, + { + "epoch": 3.5546347452424802, + "grad_norm": 0.29806044697761536, + "learning_rate": 7.470671610262586e-05, + "loss": 1.7614, + "step": 11581 + }, + { + "epoch": 
3.554941682013505, + "grad_norm": 0.26364314556121826, + "learning_rate": 7.470239464752621e-05, + "loss": 1.7957, + "step": 11582 + }, + { + "epoch": 3.5552486187845305, + "grad_norm": 0.29270800948143005, + "learning_rate": 7.4698072948303e-05, + "loss": 1.8263, + "step": 11583 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.25941839814186096, + "learning_rate": 7.469375100499898e-05, + "loss": 1.8517, + "step": 11584 + }, + { + "epoch": 3.5558624923265807, + "grad_norm": 0.29509237408638, + "learning_rate": 7.468942881765681e-05, + "loss": 1.8643, + "step": 11585 + }, + { + "epoch": 3.556169429097606, + "grad_norm": 0.23090367019176483, + "learning_rate": 7.468510638631926e-05, + "loss": 1.7239, + "step": 11586 + }, + { + "epoch": 3.556476365868631, + "grad_norm": 0.2696724236011505, + "learning_rate": 7.468078371102901e-05, + "loss": 1.848, + "step": 11587 + }, + { + "epoch": 3.5567833026396563, + "grad_norm": 0.2691192626953125, + "learning_rate": 7.46764607918288e-05, + "loss": 1.8194, + "step": 11588 + }, + { + "epoch": 3.557090239410681, + "grad_norm": 0.26616501808166504, + "learning_rate": 7.467213762876131e-05, + "loss": 1.8382, + "step": 11589 + }, + { + "epoch": 3.5573971761817065, + "grad_norm": 0.30629831552505493, + "learning_rate": 7.466781422186933e-05, + "loss": 1.8417, + "step": 11590 + }, + { + "epoch": 3.557704112952732, + "grad_norm": 0.27212417125701904, + "learning_rate": 7.466349057119552e-05, + "loss": 1.7612, + "step": 11591 + }, + { + "epoch": 3.558011049723757, + "grad_norm": 0.2872084379196167, + "learning_rate": 7.465916667678266e-05, + "loss": 1.7998, + "step": 11592 + }, + { + "epoch": 3.558317986494782, + "grad_norm": 0.3017117977142334, + "learning_rate": 7.465484253867348e-05, + "loss": 1.7996, + "step": 11593 + }, + { + "epoch": 3.5586249232658074, + "grad_norm": 0.2707957327365875, + "learning_rate": 7.465051815691066e-05, + "loss": 1.7678, + "step": 11594 + }, + { + "epoch": 3.5589318600368323, + "grad_norm": 
0.28932711482048035, + "learning_rate": 7.464619353153702e-05, + "loss": 1.8576, + "step": 11595 + }, + { + "epoch": 3.5592387968078576, + "grad_norm": 0.2585125267505646, + "learning_rate": 7.464186866259519e-05, + "loss": 1.8678, + "step": 11596 + }, + { + "epoch": 3.559545733578883, + "grad_norm": 0.24386851489543915, + "learning_rate": 7.4637543550128e-05, + "loss": 1.7778, + "step": 11597 + }, + { + "epoch": 3.559852670349908, + "grad_norm": 0.2375860959291458, + "learning_rate": 7.463321819417817e-05, + "loss": 1.8096, + "step": 11598 + }, + { + "epoch": 3.560159607120933, + "grad_norm": 0.2341299206018448, + "learning_rate": 7.462889259478842e-05, + "loss": 1.7191, + "step": 11599 + }, + { + "epoch": 3.560466543891958, + "grad_norm": 0.2510595917701721, + "learning_rate": 7.462456675200154e-05, + "loss": 1.7763, + "step": 11600 + }, + { + "epoch": 3.5607734806629834, + "grad_norm": 0.2554674744606018, + "learning_rate": 7.462024066586025e-05, + "loss": 1.7578, + "step": 11601 + }, + { + "epoch": 3.5610804174340087, + "grad_norm": 0.25040730834007263, + "learning_rate": 7.46159143364073e-05, + "loss": 1.8194, + "step": 11602 + }, + { + "epoch": 3.5613873542050336, + "grad_norm": 0.24294932186603546, + "learning_rate": 7.461158776368547e-05, + "loss": 1.8063, + "step": 11603 + }, + { + "epoch": 3.561694290976059, + "grad_norm": 0.2388325333595276, + "learning_rate": 7.46072609477375e-05, + "loss": 1.7942, + "step": 11604 + }, + { + "epoch": 3.562001227747084, + "grad_norm": 0.2569502890110016, + "learning_rate": 7.460293388860615e-05, + "loss": 1.7824, + "step": 11605 + }, + { + "epoch": 3.562308164518109, + "grad_norm": 0.24004346132278442, + "learning_rate": 7.45986065863342e-05, + "loss": 1.8676, + "step": 11606 + }, + { + "epoch": 3.5626151012891345, + "grad_norm": 0.25446319580078125, + "learning_rate": 7.45942790409644e-05, + "loss": 1.7726, + "step": 11607 + }, + { + "epoch": 3.56292203806016, + "grad_norm": 0.26257482171058655, + "learning_rate": 
7.458995125253951e-05, + "loss": 1.779, + "step": 11608 + }, + { + "epoch": 3.5632289748311847, + "grad_norm": 0.27703070640563965, + "learning_rate": 7.458562322110231e-05, + "loss": 1.8247, + "step": 11609 + }, + { + "epoch": 3.56353591160221, + "grad_norm": 0.25478535890579224, + "learning_rate": 7.458129494669556e-05, + "loss": 1.7794, + "step": 11610 + }, + { + "epoch": 3.563842848373235, + "grad_norm": 0.26173365116119385, + "learning_rate": 7.457696642936207e-05, + "loss": 1.758, + "step": 11611 + }, + { + "epoch": 3.5641497851442603, + "grad_norm": 0.25077274441719055, + "learning_rate": 7.45726376691446e-05, + "loss": 1.8234, + "step": 11612 + }, + { + "epoch": 3.5644567219152856, + "grad_norm": 0.2591109275817871, + "learning_rate": 7.456830866608589e-05, + "loss": 1.7723, + "step": 11613 + }, + { + "epoch": 3.5647636586863105, + "grad_norm": 0.2653447091579437, + "learning_rate": 7.456397942022877e-05, + "loss": 1.7839, + "step": 11614 + }, + { + "epoch": 3.565070595457336, + "grad_norm": 0.3203454911708832, + "learning_rate": 7.455964993161601e-05, + "loss": 1.8548, + "step": 11615 + }, + { + "epoch": 3.5653775322283607, + "grad_norm": 0.3041793704032898, + "learning_rate": 7.455532020029039e-05, + "loss": 1.7925, + "step": 11616 + }, + { + "epoch": 3.565684468999386, + "grad_norm": 0.26066139340400696, + "learning_rate": 7.45509902262947e-05, + "loss": 1.7905, + "step": 11617 + }, + { + "epoch": 3.5659914057704114, + "grad_norm": 0.2483314871788025, + "learning_rate": 7.454666000967174e-05, + "loss": 1.7658, + "step": 11618 + }, + { + "epoch": 3.5662983425414367, + "grad_norm": 0.24285900592803955, + "learning_rate": 7.45423295504643e-05, + "loss": 1.7575, + "step": 11619 + }, + { + "epoch": 3.5666052793124616, + "grad_norm": 0.27231669425964355, + "learning_rate": 7.453799884871517e-05, + "loss": 1.8389, + "step": 11620 + }, + { + "epoch": 3.566912216083487, + "grad_norm": 0.24324406683444977, + "learning_rate": 7.453366790446717e-05, + "loss": 
1.7775, + "step": 11621 + }, + { + "epoch": 3.567219152854512, + "grad_norm": 0.2724440097808838, + "learning_rate": 7.452933671776305e-05, + "loss": 1.8135, + "step": 11622 + }, + { + "epoch": 3.567526089625537, + "grad_norm": 0.22207655012607574, + "learning_rate": 7.452500528864568e-05, + "loss": 1.722, + "step": 11623 + }, + { + "epoch": 3.5678330263965625, + "grad_norm": 0.25650298595428467, + "learning_rate": 7.452067361715782e-05, + "loss": 1.7813, + "step": 11624 + }, + { + "epoch": 3.5681399631675874, + "grad_norm": 0.2582200765609741, + "learning_rate": 7.45163417033423e-05, + "loss": 1.8253, + "step": 11625 + }, + { + "epoch": 3.5684468999386127, + "grad_norm": 0.29545384645462036, + "learning_rate": 7.451200954724188e-05, + "loss": 1.8108, + "step": 11626 + }, + { + "epoch": 3.5687538367096376, + "grad_norm": 0.30457428097724915, + "learning_rate": 7.450767714889946e-05, + "loss": 1.8257, + "step": 11627 + }, + { + "epoch": 3.569060773480663, + "grad_norm": 0.2955166697502136, + "learning_rate": 7.450334450835781e-05, + "loss": 1.8172, + "step": 11628 + }, + { + "epoch": 3.5693677102516883, + "grad_norm": 0.2793857753276825, + "learning_rate": 7.449901162565974e-05, + "loss": 1.8493, + "step": 11629 + }, + { + "epoch": 3.569674647022713, + "grad_norm": 0.27154335379600525, + "learning_rate": 7.449467850084808e-05, + "loss": 1.8306, + "step": 11630 + }, + { + "epoch": 3.5699815837937385, + "grad_norm": 0.22336189448833466, + "learning_rate": 7.449034513396564e-05, + "loss": 1.7435, + "step": 11631 + }, + { + "epoch": 3.5702885205647634, + "grad_norm": 0.22799183428287506, + "learning_rate": 7.448601152505526e-05, + "loss": 1.7818, + "step": 11632 + }, + { + "epoch": 3.5705954573357888, + "grad_norm": 0.26670658588409424, + "learning_rate": 7.448167767415976e-05, + "loss": 1.7777, + "step": 11633 + }, + { + "epoch": 3.570902394106814, + "grad_norm": 0.2848666310310364, + "learning_rate": 7.447734358132196e-05, + "loss": 1.7572, + "step": 11634 + }, + { + 
"epoch": 3.5712093308778394, + "grad_norm": 0.26843544840812683, + "learning_rate": 7.447300924658473e-05, + "loss": 1.7642, + "step": 11635 + }, + { + "epoch": 3.5715162676488643, + "grad_norm": 0.24666404724121094, + "learning_rate": 7.446867466999087e-05, + "loss": 1.7533, + "step": 11636 + }, + { + "epoch": 3.5718232044198897, + "grad_norm": 0.31111210584640503, + "learning_rate": 7.44643398515832e-05, + "loss": 1.7875, + "step": 11637 + }, + { + "epoch": 3.5721301411909145, + "grad_norm": 0.3157108724117279, + "learning_rate": 7.446000479140462e-05, + "loss": 1.7879, + "step": 11638 + }, + { + "epoch": 3.57243707796194, + "grad_norm": 0.2935558259487152, + "learning_rate": 7.445566948949792e-05, + "loss": 1.7819, + "step": 11639 + }, + { + "epoch": 3.572744014732965, + "grad_norm": 0.2265472710132599, + "learning_rate": 7.445133394590597e-05, + "loss": 1.7518, + "step": 11640 + }, + { + "epoch": 3.57305095150399, + "grad_norm": 0.2564176023006439, + "learning_rate": 7.444699816067159e-05, + "loss": 1.7281, + "step": 11641 + }, + { + "epoch": 3.5733578882750154, + "grad_norm": 0.27933555841445923, + "learning_rate": 7.444266213383766e-05, + "loss": 1.7852, + "step": 11642 + }, + { + "epoch": 3.5736648250460403, + "grad_norm": 0.29105356335639954, + "learning_rate": 7.4438325865447e-05, + "loss": 1.8056, + "step": 11643 + }, + { + "epoch": 3.5739717618170657, + "grad_norm": 0.27665549516677856, + "learning_rate": 7.443398935554249e-05, + "loss": 1.7249, + "step": 11644 + }, + { + "epoch": 3.574278698588091, + "grad_norm": 0.21899232268333435, + "learning_rate": 7.442965260416698e-05, + "loss": 1.7689, + "step": 11645 + }, + { + "epoch": 3.574585635359116, + "grad_norm": 0.3250672221183777, + "learning_rate": 7.442531561136333e-05, + "loss": 1.8058, + "step": 11646 + }, + { + "epoch": 3.574892572130141, + "grad_norm": 0.42442524433135986, + "learning_rate": 7.442097837717438e-05, + "loss": 1.7887, + "step": 11647 + }, + { + "epoch": 3.575199508901166, + 
"grad_norm": 0.33108964562416077, + "learning_rate": 7.441664090164302e-05, + "loss": 1.7628, + "step": 11648 + }, + { + "epoch": 3.5755064456721914, + "grad_norm": 0.23050357401371002, + "learning_rate": 7.44123031848121e-05, + "loss": 1.8121, + "step": 11649 + }, + { + "epoch": 3.575813382443217, + "grad_norm": 0.29251593351364136, + "learning_rate": 7.440796522672448e-05, + "loss": 1.8051, + "step": 11650 + }, + { + "epoch": 3.576120319214242, + "grad_norm": 0.3764750063419342, + "learning_rate": 7.440362702742305e-05, + "loss": 1.9002, + "step": 11651 + }, + { + "epoch": 3.576427255985267, + "grad_norm": 0.3751949071884155, + "learning_rate": 7.439928858695069e-05, + "loss": 1.821, + "step": 11652 + }, + { + "epoch": 3.5767341927562923, + "grad_norm": 0.268476665019989, + "learning_rate": 7.439494990535024e-05, + "loss": 1.8241, + "step": 11653 + }, + { + "epoch": 3.5770411295273172, + "grad_norm": 0.3072795271873474, + "learning_rate": 7.439061098266459e-05, + "loss": 1.8169, + "step": 11654 + }, + { + "epoch": 3.5773480662983426, + "grad_norm": 0.4948901832103729, + "learning_rate": 7.438627181893664e-05, + "loss": 1.7706, + "step": 11655 + }, + { + "epoch": 3.577655003069368, + "grad_norm": 0.5892601013183594, + "learning_rate": 7.438193241420926e-05, + "loss": 1.7631, + "step": 11656 + }, + { + "epoch": 3.577961939840393, + "grad_norm": 0.4599401652812958, + "learning_rate": 7.437759276852533e-05, + "loss": 1.7471, + "step": 11657 + }, + { + "epoch": 3.578268876611418, + "grad_norm": 0.2545170783996582, + "learning_rate": 7.437325288192773e-05, + "loss": 1.7945, + "step": 11658 + }, + { + "epoch": 3.578575813382443, + "grad_norm": 0.3136496841907501, + "learning_rate": 7.436891275445938e-05, + "loss": 1.828, + "step": 11659 + }, + { + "epoch": 3.5788827501534684, + "grad_norm": 0.3631688058376312, + "learning_rate": 7.436457238616313e-05, + "loss": 1.8302, + "step": 11660 + }, + { + "epoch": 3.5791896869244937, + "grad_norm": 0.3097386658191681, + 
"learning_rate": 7.436023177708192e-05, + "loss": 1.8397, + "step": 11661 + }, + { + "epoch": 3.5794966236955186, + "grad_norm": 0.20948798954486847, + "learning_rate": 7.43558909272586e-05, + "loss": 1.7844, + "step": 11662 + }, + { + "epoch": 3.579803560466544, + "grad_norm": 0.24327392876148224, + "learning_rate": 7.43515498367361e-05, + "loss": 1.7827, + "step": 11663 + }, + { + "epoch": 3.580110497237569, + "grad_norm": 0.25268325209617615, + "learning_rate": 7.434720850555731e-05, + "loss": 1.8224, + "step": 11664 + }, + { + "epoch": 3.580417434008594, + "grad_norm": 0.24883607029914856, + "learning_rate": 7.434286693376513e-05, + "loss": 1.8189, + "step": 11665 + }, + { + "epoch": 3.5807243707796195, + "grad_norm": 0.2942518889904022, + "learning_rate": 7.433852512140248e-05, + "loss": 1.8325, + "step": 11666 + }, + { + "epoch": 3.581031307550645, + "grad_norm": 0.3556186556816101, + "learning_rate": 7.433418306851225e-05, + "loss": 1.7511, + "step": 11667 + }, + { + "epoch": 3.5813382443216697, + "grad_norm": 0.421220600605011, + "learning_rate": 7.432984077513738e-05, + "loss": 1.8081, + "step": 11668 + }, + { + "epoch": 3.581645181092695, + "grad_norm": 0.3338243067264557, + "learning_rate": 7.432549824132074e-05, + "loss": 1.8274, + "step": 11669 + }, + { + "epoch": 3.58195211786372, + "grad_norm": 0.25091543793678284, + "learning_rate": 7.432115546710528e-05, + "loss": 1.7637, + "step": 11670 + }, + { + "epoch": 3.5822590546347453, + "grad_norm": 0.29870370030403137, + "learning_rate": 7.431681245253389e-05, + "loss": 1.8036, + "step": 11671 + }, + { + "epoch": 3.5825659914057706, + "grad_norm": 0.2682137191295624, + "learning_rate": 7.431246919764953e-05, + "loss": 1.8252, + "step": 11672 + }, + { + "epoch": 3.5828729281767955, + "grad_norm": 0.28790801763534546, + "learning_rate": 7.430812570249508e-05, + "loss": 1.7713, + "step": 11673 + }, + { + "epoch": 3.583179864947821, + "grad_norm": 0.26357609033584595, + "learning_rate": 7.43037819671135e-05, 
+ "loss": 1.8388, + "step": 11674 + }, + { + "epoch": 3.5834868017188457, + "grad_norm": 0.2505483031272888, + "learning_rate": 7.42994379915477e-05, + "loss": 1.7722, + "step": 11675 + }, + { + "epoch": 3.583793738489871, + "grad_norm": 0.2535844147205353, + "learning_rate": 7.42950937758406e-05, + "loss": 1.756, + "step": 11676 + }, + { + "epoch": 3.5841006752608964, + "grad_norm": 0.23045027256011963, + "learning_rate": 7.429074932003515e-05, + "loss": 1.791, + "step": 11677 + }, + { + "epoch": 3.5844076120319213, + "grad_norm": 0.22525762021541595, + "learning_rate": 7.428640462417428e-05, + "loss": 1.7234, + "step": 11678 + }, + { + "epoch": 3.5847145488029466, + "grad_norm": 0.2402270883321762, + "learning_rate": 7.428205968830094e-05, + "loss": 1.845, + "step": 11679 + }, + { + "epoch": 3.5850214855739715, + "grad_norm": 0.24909646809101105, + "learning_rate": 7.427771451245802e-05, + "loss": 1.8537, + "step": 11680 + }, + { + "epoch": 3.585328422344997, + "grad_norm": 0.25813063979148865, + "learning_rate": 7.427336909668853e-05, + "loss": 1.7353, + "step": 11681 + }, + { + "epoch": 3.585635359116022, + "grad_norm": 0.26073768734931946, + "learning_rate": 7.426902344103534e-05, + "loss": 1.8142, + "step": 11682 + }, + { + "epoch": 3.5859422958870475, + "grad_norm": 0.2498280256986618, + "learning_rate": 7.426467754554147e-05, + "loss": 1.7996, + "step": 11683 + }, + { + "epoch": 3.5862492326580724, + "grad_norm": 0.3131188154220581, + "learning_rate": 7.426033141024981e-05, + "loss": 1.7793, + "step": 11684 + }, + { + "epoch": 3.5865561694290977, + "grad_norm": 0.24118199944496155, + "learning_rate": 7.425598503520337e-05, + "loss": 1.8249, + "step": 11685 + }, + { + "epoch": 3.5868631062001226, + "grad_norm": 0.2791197597980499, + "learning_rate": 7.425163842044504e-05, + "loss": 1.7966, + "step": 11686 + }, + { + "epoch": 3.587170042971148, + "grad_norm": 0.2298576384782791, + "learning_rate": 7.424729156601781e-05, + "loss": 1.7224, + "step": 11687 + }, 
+ { + "epoch": 3.5874769797421733, + "grad_norm": 0.23113438487052917, + "learning_rate": 7.424294447196462e-05, + "loss": 1.7641, + "step": 11688 + }, + { + "epoch": 3.587783916513198, + "grad_norm": 0.3064495027065277, + "learning_rate": 7.423859713832847e-05, + "loss": 1.8688, + "step": 11689 + }, + { + "epoch": 3.5880908532842235, + "grad_norm": 0.22847676277160645, + "learning_rate": 7.423424956515228e-05, + "loss": 1.7513, + "step": 11690 + }, + { + "epoch": 3.5883977900552484, + "grad_norm": 0.2797350585460663, + "learning_rate": 7.422990175247905e-05, + "loss": 1.8268, + "step": 11691 + }, + { + "epoch": 3.5887047268262737, + "grad_norm": 0.2753821313381195, + "learning_rate": 7.422555370035171e-05, + "loss": 1.7313, + "step": 11692 + }, + { + "epoch": 3.589011663597299, + "grad_norm": 0.2981179654598236, + "learning_rate": 7.422120540881326e-05, + "loss": 1.8455, + "step": 11693 + }, + { + "epoch": 3.5893186003683244, + "grad_norm": 0.33028867840766907, + "learning_rate": 7.421685687790667e-05, + "loss": 1.8397, + "step": 11694 + }, + { + "epoch": 3.5896255371393493, + "grad_norm": 0.409173846244812, + "learning_rate": 7.421250810767487e-05, + "loss": 1.8088, + "step": 11695 + }, + { + "epoch": 3.5899324739103746, + "grad_norm": 0.4118194878101349, + "learning_rate": 7.42081590981609e-05, + "loss": 1.7719, + "step": 11696 + }, + { + "epoch": 3.5902394106813995, + "grad_norm": 0.34716179966926575, + "learning_rate": 7.420380984940773e-05, + "loss": 1.8063, + "step": 11697 + }, + { + "epoch": 3.590546347452425, + "grad_norm": 0.27763083577156067, + "learning_rate": 7.419946036145829e-05, + "loss": 1.7777, + "step": 11698 + }, + { + "epoch": 3.59085328422345, + "grad_norm": 0.3175280690193176, + "learning_rate": 7.419511063435562e-05, + "loss": 1.697, + "step": 11699 + }, + { + "epoch": 3.591160220994475, + "grad_norm": 0.3151503801345825, + "learning_rate": 7.419076066814268e-05, + "loss": 1.8067, + "step": 11700 + }, + { + "epoch": 3.5914671577655004, + 
"grad_norm": 0.26914867758750916, + "learning_rate": 7.418641046286245e-05, + "loss": 1.7797, + "step": 11701 + }, + { + "epoch": 3.5917740945365253, + "grad_norm": 0.27231964468955994, + "learning_rate": 7.418206001855797e-05, + "loss": 1.7931, + "step": 11702 + }, + { + "epoch": 3.5920810313075506, + "grad_norm": 0.3352177143096924, + "learning_rate": 7.417770933527217e-05, + "loss": 1.9187, + "step": 11703 + }, + { + "epoch": 3.592387968078576, + "grad_norm": 0.3510081470012665, + "learning_rate": 7.417335841304808e-05, + "loss": 1.7889, + "step": 11704 + }, + { + "epoch": 3.592694904849601, + "grad_norm": 0.24949313700199127, + "learning_rate": 7.41690072519287e-05, + "loss": 1.7683, + "step": 11705 + }, + { + "epoch": 3.593001841620626, + "grad_norm": 0.28442221879959106, + "learning_rate": 7.416465585195702e-05, + "loss": 1.7889, + "step": 11706 + }, + { + "epoch": 3.593308778391651, + "grad_norm": 0.3355824649333954, + "learning_rate": 7.416030421317605e-05, + "loss": 1.7637, + "step": 11707 + }, + { + "epoch": 3.5936157151626764, + "grad_norm": 0.33569446206092834, + "learning_rate": 7.415595233562878e-05, + "loss": 1.919, + "step": 11708 + }, + { + "epoch": 3.5939226519337018, + "grad_norm": 0.2488354742527008, + "learning_rate": 7.415160021935825e-05, + "loss": 1.8424, + "step": 11709 + }, + { + "epoch": 3.594229588704727, + "grad_norm": 0.2701130509376526, + "learning_rate": 7.414724786440746e-05, + "loss": 1.7586, + "step": 11710 + }, + { + "epoch": 3.594536525475752, + "grad_norm": 0.26289790868759155, + "learning_rate": 7.414289527081939e-05, + "loss": 1.7975, + "step": 11711 + }, + { + "epoch": 3.5948434622467773, + "grad_norm": 0.25382301211357117, + "learning_rate": 7.413854243863707e-05, + "loss": 1.7393, + "step": 11712 + }, + { + "epoch": 3.595150399017802, + "grad_norm": 0.28282979130744934, + "learning_rate": 7.413418936790357e-05, + "loss": 1.8048, + "step": 11713 + }, + { + "epoch": 3.5954573357888275, + "grad_norm": 0.28001347184181213, + 
"learning_rate": 7.412983605866183e-05, + "loss": 1.7864, + "step": 11714 + }, + { + "epoch": 3.595764272559853, + "grad_norm": 0.26107707619667053, + "learning_rate": 7.412548251095491e-05, + "loss": 1.8016, + "step": 11715 + }, + { + "epoch": 3.5960712093308778, + "grad_norm": 0.2518761456012726, + "learning_rate": 7.412112872482583e-05, + "loss": 1.7565, + "step": 11716 + }, + { + "epoch": 3.596378146101903, + "grad_norm": 0.25911152362823486, + "learning_rate": 7.411677470031762e-05, + "loss": 1.8333, + "step": 11717 + }, + { + "epoch": 3.596685082872928, + "grad_norm": 0.3411506414413452, + "learning_rate": 7.41124204374733e-05, + "loss": 1.8027, + "step": 11718 + }, + { + "epoch": 3.5969920196439533, + "grad_norm": 0.28535547852516174, + "learning_rate": 7.410806593633593e-05, + "loss": 1.7596, + "step": 11719 + }, + { + "epoch": 3.5972989564149787, + "grad_norm": 0.24665530025959015, + "learning_rate": 7.410371119694852e-05, + "loss": 1.7777, + "step": 11720 + }, + { + "epoch": 3.5976058931860035, + "grad_norm": 0.29162275791168213, + "learning_rate": 7.40993562193541e-05, + "loss": 1.795, + "step": 11721 + }, + { + "epoch": 3.597912829957029, + "grad_norm": 0.2712220549583435, + "learning_rate": 7.409500100359573e-05, + "loss": 1.824, + "step": 11722 + }, + { + "epoch": 3.5982197667280538, + "grad_norm": 0.239755779504776, + "learning_rate": 7.40906455497164e-05, + "loss": 1.7534, + "step": 11723 + }, + { + "epoch": 3.598526703499079, + "grad_norm": 0.26056957244873047, + "learning_rate": 7.408628985775922e-05, + "loss": 1.757, + "step": 11724 + }, + { + "epoch": 3.5988336402701044, + "grad_norm": 0.3230258822441101, + "learning_rate": 7.40819339277672e-05, + "loss": 1.8684, + "step": 11725 + }, + { + "epoch": 3.5991405770411298, + "grad_norm": 0.26070696115493774, + "learning_rate": 7.407757775978339e-05, + "loss": 1.7868, + "step": 11726 + }, + { + "epoch": 3.5994475138121547, + "grad_norm": 0.24940893054008484, + "learning_rate": 7.407322135385085e-05, + 
"loss": 1.8391, + "step": 11727 + }, + { + "epoch": 3.59975445058318, + "grad_norm": 0.2717723250389099, + "learning_rate": 7.406886471001263e-05, + "loss": 1.7567, + "step": 11728 + }, + { + "epoch": 3.600061387354205, + "grad_norm": 0.2328445315361023, + "learning_rate": 7.406450782831177e-05, + "loss": 1.7761, + "step": 11729 + }, + { + "epoch": 3.6003683241252302, + "grad_norm": 0.2740287184715271, + "learning_rate": 7.406015070879136e-05, + "loss": 1.8599, + "step": 11730 + }, + { + "epoch": 3.6006752608962556, + "grad_norm": 0.2930558919906616, + "learning_rate": 7.405579335149441e-05, + "loss": 1.852, + "step": 11731 + }, + { + "epoch": 3.6009821976672804, + "grad_norm": 0.30175161361694336, + "learning_rate": 7.405143575646403e-05, + "loss": 1.8861, + "step": 11732 + }, + { + "epoch": 3.601289134438306, + "grad_norm": 0.2617531418800354, + "learning_rate": 7.404707792374328e-05, + "loss": 1.7598, + "step": 11733 + }, + { + "epoch": 3.6015960712093307, + "grad_norm": 0.25384122133255005, + "learning_rate": 7.404271985337517e-05, + "loss": 1.7634, + "step": 11734 + }, + { + "epoch": 3.601903007980356, + "grad_norm": 0.31706711649894714, + "learning_rate": 7.403836154540284e-05, + "loss": 1.8125, + "step": 11735 + }, + { + "epoch": 3.6022099447513813, + "grad_norm": 0.299662709236145, + "learning_rate": 7.403400299986932e-05, + "loss": 1.748, + "step": 11736 + }, + { + "epoch": 3.6025168815224062, + "grad_norm": 0.23828944563865662, + "learning_rate": 7.40296442168177e-05, + "loss": 1.7473, + "step": 11737 + }, + { + "epoch": 3.6028238182934316, + "grad_norm": 0.22611604630947113, + "learning_rate": 7.402528519629106e-05, + "loss": 1.7519, + "step": 11738 + }, + { + "epoch": 3.6031307550644565, + "grad_norm": 0.28498536348342896, + "learning_rate": 7.402092593833246e-05, + "loss": 1.7792, + "step": 11739 + }, + { + "epoch": 3.603437691835482, + "grad_norm": 0.2404283881187439, + "learning_rate": 7.4016566442985e-05, + "loss": 1.7434, + "step": 11740 + }, + { + 
"epoch": 3.603744628606507, + "grad_norm": 0.2291589230298996, + "learning_rate": 7.401220671029173e-05, + "loss": 1.7623, + "step": 11741 + }, + { + "epoch": 3.6040515653775325, + "grad_norm": 0.23962698876857758, + "learning_rate": 7.400784674029578e-05, + "loss": 1.7232, + "step": 11742 + }, + { + "epoch": 3.6043585021485574, + "grad_norm": 0.3015185594558716, + "learning_rate": 7.400348653304022e-05, + "loss": 1.7808, + "step": 11743 + }, + { + "epoch": 3.6046654389195827, + "grad_norm": 0.30623099207878113, + "learning_rate": 7.399912608856813e-05, + "loss": 1.8518, + "step": 11744 + }, + { + "epoch": 3.6049723756906076, + "grad_norm": 0.2698235511779785, + "learning_rate": 7.39947654069226e-05, + "loss": 1.7829, + "step": 11745 + }, + { + "epoch": 3.605279312461633, + "grad_norm": 0.2195274829864502, + "learning_rate": 7.399040448814674e-05, + "loss": 1.7709, + "step": 11746 + }, + { + "epoch": 3.6055862492326582, + "grad_norm": 0.22962357103824615, + "learning_rate": 7.398604333228366e-05, + "loss": 1.7482, + "step": 11747 + }, + { + "epoch": 3.605893186003683, + "grad_norm": 0.2403932511806488, + "learning_rate": 7.398168193937642e-05, + "loss": 1.8063, + "step": 11748 + }, + { + "epoch": 3.6062001227747085, + "grad_norm": 0.23542718589305878, + "learning_rate": 7.397732030946816e-05, + "loss": 1.7599, + "step": 11749 + }, + { + "epoch": 3.6065070595457334, + "grad_norm": 0.2462490350008011, + "learning_rate": 7.397295844260195e-05, + "loss": 1.8183, + "step": 11750 + }, + { + "epoch": 3.6068139963167587, + "grad_norm": 0.21428349614143372, + "learning_rate": 7.396859633882091e-05, + "loss": 1.6944, + "step": 11751 + }, + { + "epoch": 3.607120933087784, + "grad_norm": 0.21240907907485962, + "learning_rate": 7.396423399816817e-05, + "loss": 1.7795, + "step": 11752 + }, + { + "epoch": 3.607427869858809, + "grad_norm": 0.23413677513599396, + "learning_rate": 7.395987142068682e-05, + "loss": 1.8015, + "step": 11753 + }, + { + "epoch": 3.6077348066298343, + 
"grad_norm": 0.26724907755851746, + "learning_rate": 7.395550860641998e-05, + "loss": 1.8174, + "step": 11754 + }, + { + "epoch": 3.608041743400859, + "grad_norm": 0.22077679634094238, + "learning_rate": 7.395114555541077e-05, + "loss": 1.7929, + "step": 11755 + }, + { + "epoch": 3.6083486801718845, + "grad_norm": 0.2475263774394989, + "learning_rate": 7.394678226770228e-05, + "loss": 1.7744, + "step": 11756 + }, + { + "epoch": 3.60865561694291, + "grad_norm": 0.22579342126846313, + "learning_rate": 7.394241874333764e-05, + "loss": 1.79, + "step": 11757 + }, + { + "epoch": 3.608962553713935, + "grad_norm": 0.26798152923583984, + "learning_rate": 7.393805498236001e-05, + "loss": 1.8087, + "step": 11758 + }, + { + "epoch": 3.60926949048496, + "grad_norm": 0.2755621373653412, + "learning_rate": 7.393369098481248e-05, + "loss": 1.7834, + "step": 11759 + }, + { + "epoch": 3.6095764272559854, + "grad_norm": 0.2741812467575073, + "learning_rate": 7.39293267507382e-05, + "loss": 1.7948, + "step": 11760 + }, + { + "epoch": 3.6098833640270103, + "grad_norm": 0.2378924936056137, + "learning_rate": 7.392496228018028e-05, + "loss": 1.8317, + "step": 11761 + }, + { + "epoch": 3.6101903007980356, + "grad_norm": 0.2628132700920105, + "learning_rate": 7.392059757318187e-05, + "loss": 1.8123, + "step": 11762 + }, + { + "epoch": 3.610497237569061, + "grad_norm": 0.2613002359867096, + "learning_rate": 7.391623262978607e-05, + "loss": 1.795, + "step": 11763 + }, + { + "epoch": 3.610804174340086, + "grad_norm": 0.27272161841392517, + "learning_rate": 7.391186745003608e-05, + "loss": 1.7808, + "step": 11764 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.21366162598133087, + "learning_rate": 7.390750203397497e-05, + "loss": 1.77, + "step": 11765 + }, + { + "epoch": 3.611418047882136, + "grad_norm": 0.25559261441230774, + "learning_rate": 7.390313638164593e-05, + "loss": 1.8442, + "step": 11766 + }, + { + "epoch": 3.6117249846531614, + "grad_norm": 0.23794838786125183, + 
"learning_rate": 7.389877049309207e-05, + "loss": 1.8237, + "step": 11767 + }, + { + "epoch": 3.6120319214241867, + "grad_norm": 0.2690154016017914, + "learning_rate": 7.389440436835656e-05, + "loss": 1.8194, + "step": 11768 + }, + { + "epoch": 3.612338858195212, + "grad_norm": 0.26148009300231934, + "learning_rate": 7.389003800748254e-05, + "loss": 1.7862, + "step": 11769 + }, + { + "epoch": 3.612645794966237, + "grad_norm": 0.26414936780929565, + "learning_rate": 7.388567141051315e-05, + "loss": 1.7815, + "step": 11770 + }, + { + "epoch": 3.6129527317372623, + "grad_norm": 0.24473857879638672, + "learning_rate": 7.388130457749157e-05, + "loss": 1.801, + "step": 11771 + }, + { + "epoch": 3.613259668508287, + "grad_norm": 0.24356001615524292, + "learning_rate": 7.387693750846094e-05, + "loss": 1.8031, + "step": 11772 + }, + { + "epoch": 3.6135666052793125, + "grad_norm": 0.26716411113739014, + "learning_rate": 7.387257020346441e-05, + "loss": 1.7999, + "step": 11773 + }, + { + "epoch": 3.613873542050338, + "grad_norm": 0.2730760872364044, + "learning_rate": 7.386820266254516e-05, + "loss": 1.8079, + "step": 11774 + }, + { + "epoch": 3.6141804788213627, + "grad_norm": 0.2570728361606598, + "learning_rate": 7.386383488574635e-05, + "loss": 1.7374, + "step": 11775 + }, + { + "epoch": 3.614487415592388, + "grad_norm": 0.24992883205413818, + "learning_rate": 7.385946687311112e-05, + "loss": 1.8432, + "step": 11776 + }, + { + "epoch": 3.614794352363413, + "grad_norm": 0.28632259368896484, + "learning_rate": 7.385509862468266e-05, + "loss": 1.8014, + "step": 11777 + }, + { + "epoch": 3.6151012891344383, + "grad_norm": 0.257303923368454, + "learning_rate": 7.385073014050412e-05, + "loss": 1.8166, + "step": 11778 + }, + { + "epoch": 3.6154082259054636, + "grad_norm": 0.2791872024536133, + "learning_rate": 7.38463614206187e-05, + "loss": 1.7865, + "step": 11779 + }, + { + "epoch": 3.6157151626764885, + "grad_norm": 0.25708603858947754, + "learning_rate": 
7.384199246506956e-05, + "loss": 1.807, + "step": 11780 + }, + { + "epoch": 3.616022099447514, + "grad_norm": 0.28693172335624695, + "learning_rate": 7.383762327389988e-05, + "loss": 1.8049, + "step": 11781 + }, + { + "epoch": 3.6163290362185387, + "grad_norm": 0.2731167674064636, + "learning_rate": 7.383325384715283e-05, + "loss": 1.8937, + "step": 11782 + }, + { + "epoch": 3.616635972989564, + "grad_norm": 0.26151663064956665, + "learning_rate": 7.38288841848716e-05, + "loss": 1.8288, + "step": 11783 + }, + { + "epoch": 3.6169429097605894, + "grad_norm": 0.2732257843017578, + "learning_rate": 7.382451428709936e-05, + "loss": 1.7668, + "step": 11784 + }, + { + "epoch": 3.6172498465316147, + "grad_norm": 0.2747575640678406, + "learning_rate": 7.38201441538793e-05, + "loss": 1.7991, + "step": 11785 + }, + { + "epoch": 3.6175567833026396, + "grad_norm": 0.2884783446788788, + "learning_rate": 7.381577378525462e-05, + "loss": 1.7798, + "step": 11786 + }, + { + "epoch": 3.617863720073665, + "grad_norm": 0.2716344892978668, + "learning_rate": 7.381140318126851e-05, + "loss": 1.7923, + "step": 11787 + }, + { + "epoch": 3.61817065684469, + "grad_norm": 0.3007747232913971, + "learning_rate": 7.380703234196416e-05, + "loss": 1.8397, + "step": 11788 + }, + { + "epoch": 3.618477593615715, + "grad_norm": 0.39218056201934814, + "learning_rate": 7.380266126738476e-05, + "loss": 1.8517, + "step": 11789 + }, + { + "epoch": 3.6187845303867405, + "grad_norm": 0.43425866961479187, + "learning_rate": 7.379828995757351e-05, + "loss": 1.7518, + "step": 11790 + }, + { + "epoch": 3.6190914671577654, + "grad_norm": 0.34399518370628357, + "learning_rate": 7.37939184125736e-05, + "loss": 1.7607, + "step": 11791 + }, + { + "epoch": 3.6193984039287908, + "grad_norm": 0.23124302923679352, + "learning_rate": 7.378954663242825e-05, + "loss": 1.7898, + "step": 11792 + }, + { + "epoch": 3.6197053406998156, + "grad_norm": 0.32839757204055786, + "learning_rate": 7.378517461718066e-05, + "loss": 
1.7472, + "step": 11793 + }, + { + "epoch": 3.620012277470841, + "grad_norm": 0.38583460450172424, + "learning_rate": 7.378080236687403e-05, + "loss": 1.7947, + "step": 11794 + }, + { + "epoch": 3.6203192142418663, + "grad_norm": 0.4622896909713745, + "learning_rate": 7.377642988155157e-05, + "loss": 1.9023, + "step": 11795 + }, + { + "epoch": 3.620626151012891, + "grad_norm": 0.3783189058303833, + "learning_rate": 7.37720571612565e-05, + "loss": 1.7813, + "step": 11796 + }, + { + "epoch": 3.6209330877839165, + "grad_norm": 0.3468814790248871, + "learning_rate": 7.376768420603204e-05, + "loss": 1.7509, + "step": 11797 + }, + { + "epoch": 3.6212400245549414, + "grad_norm": 0.2602507174015045, + "learning_rate": 7.376331101592138e-05, + "loss": 1.8158, + "step": 11798 + }, + { + "epoch": 3.6215469613259668, + "grad_norm": 0.28337883949279785, + "learning_rate": 7.375893759096775e-05, + "loss": 1.7755, + "step": 11799 + }, + { + "epoch": 3.621853898096992, + "grad_norm": 0.3644609749317169, + "learning_rate": 7.375456393121437e-05, + "loss": 1.8193, + "step": 11800 + }, + { + "epoch": 3.6221608348680174, + "grad_norm": 0.338211327791214, + "learning_rate": 7.375019003670448e-05, + "loss": 1.821, + "step": 11801 + }, + { + "epoch": 3.6224677716390423, + "grad_norm": 0.23850654065608978, + "learning_rate": 7.374581590748129e-05, + "loss": 1.7317, + "step": 11802 + }, + { + "epoch": 3.6227747084100677, + "grad_norm": 0.3496716618537903, + "learning_rate": 7.374144154358801e-05, + "loss": 1.8361, + "step": 11803 + }, + { + "epoch": 3.6230816451810925, + "grad_norm": 0.5585216283798218, + "learning_rate": 7.37370669450679e-05, + "loss": 1.7667, + "step": 11804 + }, + { + "epoch": 3.623388581952118, + "grad_norm": 0.4578089714050293, + "learning_rate": 7.373269211196418e-05, + "loss": 1.8051, + "step": 11805 + }, + { + "epoch": 3.623695518723143, + "grad_norm": 0.28195759654045105, + "learning_rate": 7.37283170443201e-05, + "loss": 1.7823, + "step": 11806 + }, + { + 
"epoch": 3.624002455494168, + "grad_norm": 0.4066108465194702, + "learning_rate": 7.372394174217887e-05, + "loss": 1.7819, + "step": 11807 + }, + { + "epoch": 3.6243093922651934, + "grad_norm": 0.5368703007698059, + "learning_rate": 7.371956620558375e-05, + "loss": 1.8121, + "step": 11808 + }, + { + "epoch": 3.6246163290362183, + "grad_norm": 0.36627063155174255, + "learning_rate": 7.371519043457795e-05, + "loss": 1.7944, + "step": 11809 + }, + { + "epoch": 3.6249232658072437, + "grad_norm": 0.3100780248641968, + "learning_rate": 7.371081442920476e-05, + "loss": 1.783, + "step": 11810 + }, + { + "epoch": 3.625230202578269, + "grad_norm": 0.3277178704738617, + "learning_rate": 7.370643818950741e-05, + "loss": 1.8105, + "step": 11811 + }, + { + "epoch": 3.625537139349294, + "grad_norm": 0.3887772560119629, + "learning_rate": 7.370206171552914e-05, + "loss": 1.8136, + "step": 11812 + }, + { + "epoch": 3.6258440761203192, + "grad_norm": 0.2770824134349823, + "learning_rate": 7.36976850073132e-05, + "loss": 1.7852, + "step": 11813 + }, + { + "epoch": 3.626151012891344, + "grad_norm": 0.26357728242874146, + "learning_rate": 7.369330806490284e-05, + "loss": 1.7621, + "step": 11814 + }, + { + "epoch": 3.6264579496623695, + "grad_norm": 0.3387344181537628, + "learning_rate": 7.368893088834135e-05, + "loss": 1.7785, + "step": 11815 + }, + { + "epoch": 3.626764886433395, + "grad_norm": 0.35155174136161804, + "learning_rate": 7.368455347767193e-05, + "loss": 1.8081, + "step": 11816 + }, + { + "epoch": 3.62707182320442, + "grad_norm": 0.2855289876461029, + "learning_rate": 7.368017583293788e-05, + "loss": 1.8245, + "step": 11817 + }, + { + "epoch": 3.627378759975445, + "grad_norm": 0.28462162613868713, + "learning_rate": 7.367579795418245e-05, + "loss": 1.8066, + "step": 11818 + }, + { + "epoch": 3.6276856967464703, + "grad_norm": 0.40696555376052856, + "learning_rate": 7.367141984144891e-05, + "loss": 1.8897, + "step": 11819 + }, + { + "epoch": 3.6279926335174952, + 
"grad_norm": 0.472782701253891, + "learning_rate": 7.366704149478054e-05, + "loss": 1.8071, + "step": 11820 + }, + { + "epoch": 3.6282995702885206, + "grad_norm": 0.27022916078567505, + "learning_rate": 7.366266291422057e-05, + "loss": 1.8574, + "step": 11821 + }, + { + "epoch": 3.628606507059546, + "grad_norm": 0.4207148253917694, + "learning_rate": 7.365828409981231e-05, + "loss": 1.7759, + "step": 11822 + }, + { + "epoch": 3.628913443830571, + "grad_norm": 0.42866072058677673, + "learning_rate": 7.365390505159902e-05, + "loss": 1.7366, + "step": 11823 + }, + { + "epoch": 3.629220380601596, + "grad_norm": 0.28288859128952026, + "learning_rate": 7.364952576962398e-05, + "loss": 1.8591, + "step": 11824 + }, + { + "epoch": 3.629527317372621, + "grad_norm": 0.30544906854629517, + "learning_rate": 7.364514625393045e-05, + "loss": 1.7965, + "step": 11825 + }, + { + "epoch": 3.6298342541436464, + "grad_norm": 0.3251616954803467, + "learning_rate": 7.364076650456173e-05, + "loss": 1.8197, + "step": 11826 + }, + { + "epoch": 3.6301411909146717, + "grad_norm": 0.3133888840675354, + "learning_rate": 7.363638652156109e-05, + "loss": 1.7978, + "step": 11827 + }, + { + "epoch": 3.630448127685697, + "grad_norm": 0.29004594683647156, + "learning_rate": 7.363200630497185e-05, + "loss": 1.8035, + "step": 11828 + }, + { + "epoch": 3.630755064456722, + "grad_norm": 0.2781279683113098, + "learning_rate": 7.362762585483725e-05, + "loss": 1.8462, + "step": 11829 + }, + { + "epoch": 3.6310620012277472, + "grad_norm": 0.29003822803497314, + "learning_rate": 7.362324517120063e-05, + "loss": 1.7952, + "step": 11830 + }, + { + "epoch": 3.631368937998772, + "grad_norm": 0.2510940134525299, + "learning_rate": 7.361886425410524e-05, + "loss": 1.7645, + "step": 11831 + }, + { + "epoch": 3.6316758747697975, + "grad_norm": 0.23798540234565735, + "learning_rate": 7.361448310359438e-05, + "loss": 1.7329, + "step": 11832 + }, + { + "epoch": 3.631982811540823, + "grad_norm": 0.2711278796195984, + 
"learning_rate": 7.361010171971137e-05, + "loss": 1.8245, + "step": 11833 + }, + { + "epoch": 3.6322897483118477, + "grad_norm": 0.2895669639110565, + "learning_rate": 7.360572010249949e-05, + "loss": 1.7668, + "step": 11834 + }, + { + "epoch": 3.632596685082873, + "grad_norm": 0.2216273844242096, + "learning_rate": 7.360133825200205e-05, + "loss": 1.8164, + "step": 11835 + }, + { + "epoch": 3.632903621853898, + "grad_norm": 0.3075082302093506, + "learning_rate": 7.359695616826236e-05, + "loss": 1.8159, + "step": 11836 + }, + { + "epoch": 3.6332105586249233, + "grad_norm": 0.3208801746368408, + "learning_rate": 7.35925738513237e-05, + "loss": 1.8385, + "step": 11837 + }, + { + "epoch": 3.6335174953959486, + "grad_norm": 0.272517591714859, + "learning_rate": 7.35881913012294e-05, + "loss": 1.7653, + "step": 11838 + }, + { + "epoch": 3.6338244321669735, + "grad_norm": 0.23105360567569733, + "learning_rate": 7.358380851802277e-05, + "loss": 1.7697, + "step": 11839 + }, + { + "epoch": 3.634131368937999, + "grad_norm": 0.2643153667449951, + "learning_rate": 7.357942550174714e-05, + "loss": 1.7885, + "step": 11840 + }, + { + "epoch": 3.6344383057090237, + "grad_norm": 0.22643202543258667, + "learning_rate": 7.357504225244579e-05, + "loss": 1.746, + "step": 11841 + }, + { + "epoch": 3.634745242480049, + "grad_norm": 0.27782970666885376, + "learning_rate": 7.357065877016207e-05, + "loss": 1.794, + "step": 11842 + }, + { + "epoch": 3.6350521792510744, + "grad_norm": 0.3035561740398407, + "learning_rate": 7.356627505493925e-05, + "loss": 1.7892, + "step": 11843 + }, + { + "epoch": 3.6353591160220997, + "grad_norm": 0.31859731674194336, + "learning_rate": 7.356189110682072e-05, + "loss": 1.7636, + "step": 11844 + }, + { + "epoch": 3.6356660527931246, + "grad_norm": 0.2960890233516693, + "learning_rate": 7.355750692584977e-05, + "loss": 1.8294, + "step": 11845 + }, + { + "epoch": 3.63597298956415, + "grad_norm": 0.2544194459915161, + "learning_rate": 7.355312251206972e-05, + 
"loss": 1.7603, + "step": 11846 + }, + { + "epoch": 3.636279926335175, + "grad_norm": 0.27864789962768555, + "learning_rate": 7.354873786552391e-05, + "loss": 1.7917, + "step": 11847 + }, + { + "epoch": 3.6365868631062, + "grad_norm": 0.32552552223205566, + "learning_rate": 7.354435298625568e-05, + "loss": 1.7769, + "step": 11848 + }, + { + "epoch": 3.6368937998772255, + "grad_norm": 0.25094640254974365, + "learning_rate": 7.353996787430833e-05, + "loss": 1.8371, + "step": 11849 + }, + { + "epoch": 3.6372007366482504, + "grad_norm": 0.26656433939933777, + "learning_rate": 7.353558252972524e-05, + "loss": 1.7686, + "step": 11850 + }, + { + "epoch": 3.6375076734192757, + "grad_norm": 0.3023635745048523, + "learning_rate": 7.353119695254973e-05, + "loss": 1.7892, + "step": 11851 + }, + { + "epoch": 3.6378146101903006, + "grad_norm": 0.2822463810443878, + "learning_rate": 7.352681114282514e-05, + "loss": 1.8221, + "step": 11852 + }, + { + "epoch": 3.638121546961326, + "grad_norm": 0.31159496307373047, + "learning_rate": 7.35224251005948e-05, + "loss": 1.803, + "step": 11853 + }, + { + "epoch": 3.6384284837323513, + "grad_norm": 0.3133087158203125, + "learning_rate": 7.351803882590207e-05, + "loss": 1.744, + "step": 11854 + }, + { + "epoch": 3.638735420503376, + "grad_norm": 0.3050002455711365, + "learning_rate": 7.351365231879029e-05, + "loss": 1.7522, + "step": 11855 + }, + { + "epoch": 3.6390423572744015, + "grad_norm": 0.2729037404060364, + "learning_rate": 7.350926557930283e-05, + "loss": 1.7629, + "step": 11856 + }, + { + "epoch": 3.6393492940454264, + "grad_norm": 0.3181995153427124, + "learning_rate": 7.350487860748303e-05, + "loss": 1.7603, + "step": 11857 + }, + { + "epoch": 3.6396562308164517, + "grad_norm": 0.352651447057724, + "learning_rate": 7.350049140337423e-05, + "loss": 1.8177, + "step": 11858 + }, + { + "epoch": 3.639963167587477, + "grad_norm": 0.22935177385807037, + "learning_rate": 7.349610396701981e-05, + "loss": 1.7421, + "step": 11859 + }, + { 
+ "epoch": 3.6402701043585024, + "grad_norm": 0.26442599296569824, + "learning_rate": 7.349171629846312e-05, + "loss": 1.8026, + "step": 11860 + }, + { + "epoch": 3.6405770411295273, + "grad_norm": 0.25357648730278015, + "learning_rate": 7.348732839774751e-05, + "loss": 1.788, + "step": 11861 + }, + { + "epoch": 3.6408839779005526, + "grad_norm": 0.26959577202796936, + "learning_rate": 7.348294026491635e-05, + "loss": 1.884, + "step": 11862 + }, + { + "epoch": 3.6411909146715775, + "grad_norm": 0.2243001013994217, + "learning_rate": 7.347855190001304e-05, + "loss": 1.7765, + "step": 11863 + }, + { + "epoch": 3.641497851442603, + "grad_norm": 0.2480708807706833, + "learning_rate": 7.34741633030809e-05, + "loss": 1.7597, + "step": 11864 + }, + { + "epoch": 3.641804788213628, + "grad_norm": 0.22512994706630707, + "learning_rate": 7.346977447416332e-05, + "loss": 1.7647, + "step": 11865 + }, + { + "epoch": 3.642111724984653, + "grad_norm": 0.24961981177330017, + "learning_rate": 7.346538541330368e-05, + "loss": 1.8178, + "step": 11866 + }, + { + "epoch": 3.6424186617556784, + "grad_norm": 0.320896714925766, + "learning_rate": 7.346099612054533e-05, + "loss": 1.85, + "step": 11867 + }, + { + "epoch": 3.6427255985267033, + "grad_norm": 0.3420880436897278, + "learning_rate": 7.345660659593167e-05, + "loss": 1.8661, + "step": 11868 + }, + { + "epoch": 3.6430325352977286, + "grad_norm": 0.2675844132900238, + "learning_rate": 7.34522168395061e-05, + "loss": 1.8177, + "step": 11869 + }, + { + "epoch": 3.643339472068754, + "grad_norm": 0.23993943631649017, + "learning_rate": 7.344782685131195e-05, + "loss": 1.7365, + "step": 11870 + }, + { + "epoch": 3.643646408839779, + "grad_norm": 0.21805813908576965, + "learning_rate": 7.344343663139264e-05, + "loss": 1.7813, + "step": 11871 + }, + { + "epoch": 3.643953345610804, + "grad_norm": 0.24334421753883362, + "learning_rate": 7.343904617979154e-05, + "loss": 1.7763, + "step": 11872 + }, + { + "epoch": 3.644260282381829, + 
"grad_norm": 0.22768431901931763, + "learning_rate": 7.343465549655206e-05, + "loss": 1.7817, + "step": 11873 + }, + { + "epoch": 3.6445672191528544, + "grad_norm": 0.23828962445259094, + "learning_rate": 7.343026458171757e-05, + "loss": 1.8391, + "step": 11874 + }, + { + "epoch": 3.6448741559238798, + "grad_norm": 0.24838197231292725, + "learning_rate": 7.342587343533149e-05, + "loss": 1.759, + "step": 11875 + }, + { + "epoch": 3.645181092694905, + "grad_norm": 0.22732019424438477, + "learning_rate": 7.342148205743718e-05, + "loss": 1.7348, + "step": 11876 + }, + { + "epoch": 3.64548802946593, + "grad_norm": 0.25106775760650635, + "learning_rate": 7.341709044807807e-05, + "loss": 1.8121, + "step": 11877 + }, + { + "epoch": 3.6457949662369553, + "grad_norm": 0.28532838821411133, + "learning_rate": 7.341269860729753e-05, + "loss": 1.7147, + "step": 11878 + }, + { + "epoch": 3.64610190300798, + "grad_norm": 0.3041890859603882, + "learning_rate": 7.340830653513899e-05, + "loss": 1.7666, + "step": 11879 + }, + { + "epoch": 3.6464088397790055, + "grad_norm": 0.3142147958278656, + "learning_rate": 7.340391423164585e-05, + "loss": 1.8707, + "step": 11880 + }, + { + "epoch": 3.646715776550031, + "grad_norm": 0.28531381487846375, + "learning_rate": 7.339952169686151e-05, + "loss": 1.7961, + "step": 11881 + }, + { + "epoch": 3.6470227133210558, + "grad_norm": 0.33779671788215637, + "learning_rate": 7.339512893082938e-05, + "loss": 1.7428, + "step": 11882 + }, + { + "epoch": 3.647329650092081, + "grad_norm": 0.29611849784851074, + "learning_rate": 7.339073593359287e-05, + "loss": 1.8803, + "step": 11883 + }, + { + "epoch": 3.647636586863106, + "grad_norm": 0.31248557567596436, + "learning_rate": 7.33863427051954e-05, + "loss": 1.7868, + "step": 11884 + }, + { + "epoch": 3.6479435236341313, + "grad_norm": 0.42829564213752747, + "learning_rate": 7.338194924568039e-05, + "loss": 1.8558, + "step": 11885 + }, + { + "epoch": 3.6482504604051567, + "grad_norm": 0.431023508310318, + 
"learning_rate": 7.337755555509126e-05, + "loss": 1.7565, + "step": 11886 + }, + { + "epoch": 3.6485573971761815, + "grad_norm": 0.2917975187301636, + "learning_rate": 7.33731616334714e-05, + "loss": 1.8067, + "step": 11887 + }, + { + "epoch": 3.648864333947207, + "grad_norm": 0.3072175085544586, + "learning_rate": 7.336876748086427e-05, + "loss": 1.782, + "step": 11888 + }, + { + "epoch": 3.6491712707182318, + "grad_norm": 0.33658862113952637, + "learning_rate": 7.336437309731327e-05, + "loss": 1.8007, + "step": 11889 + }, + { + "epoch": 3.649478207489257, + "grad_norm": 0.23774033784866333, + "learning_rate": 7.335997848286185e-05, + "loss": 1.7606, + "step": 11890 + }, + { + "epoch": 3.6497851442602824, + "grad_norm": 0.3373236358165741, + "learning_rate": 7.335558363755344e-05, + "loss": 1.7335, + "step": 11891 + }, + { + "epoch": 3.650092081031308, + "grad_norm": 0.3906517028808594, + "learning_rate": 7.335118856143145e-05, + "loss": 1.7974, + "step": 11892 + }, + { + "epoch": 3.6503990178023327, + "grad_norm": 0.37715303897857666, + "learning_rate": 7.334679325453934e-05, + "loss": 1.8875, + "step": 11893 + }, + { + "epoch": 3.650705954573358, + "grad_norm": 0.278540700674057, + "learning_rate": 7.334239771692053e-05, + "loss": 1.8165, + "step": 11894 + }, + { + "epoch": 3.651012891344383, + "grad_norm": 0.24434895813465118, + "learning_rate": 7.333800194861845e-05, + "loss": 1.7756, + "step": 11895 + }, + { + "epoch": 3.6513198281154082, + "grad_norm": 0.25057271122932434, + "learning_rate": 7.333360594967658e-05, + "loss": 1.7932, + "step": 11896 + }, + { + "epoch": 3.6516267648864336, + "grad_norm": 0.3277342617511749, + "learning_rate": 7.332920972013833e-05, + "loss": 1.7781, + "step": 11897 + }, + { + "epoch": 3.6519337016574585, + "grad_norm": 0.2754829525947571, + "learning_rate": 7.332481326004715e-05, + "loss": 1.7916, + "step": 11898 + }, + { + "epoch": 3.652240638428484, + "grad_norm": 0.24490588903427124, + "learning_rate": 7.332041656944651e-05, 
+ "loss": 1.7904, + "step": 11899 + }, + { + "epoch": 3.6525475751995087, + "grad_norm": 0.3176959455013275, + "learning_rate": 7.331601964837982e-05, + "loss": 1.7379, + "step": 11900 + }, + { + "epoch": 3.652854511970534, + "grad_norm": 0.3435784876346588, + "learning_rate": 7.331162249689057e-05, + "loss": 1.7635, + "step": 11901 + }, + { + "epoch": 3.6531614487415593, + "grad_norm": 0.335697740316391, + "learning_rate": 7.330722511502221e-05, + "loss": 1.7903, + "step": 11902 + }, + { + "epoch": 3.6534683855125847, + "grad_norm": 0.2748894691467285, + "learning_rate": 7.330282750281819e-05, + "loss": 1.8259, + "step": 11903 + }, + { + "epoch": 3.6537753222836096, + "grad_norm": 0.36754751205444336, + "learning_rate": 7.329842966032197e-05, + "loss": 1.7728, + "step": 11904 + }, + { + "epoch": 3.654082259054635, + "grad_norm": 0.4355713129043579, + "learning_rate": 7.3294031587577e-05, + "loss": 1.7447, + "step": 11905 + }, + { + "epoch": 3.65438919582566, + "grad_norm": 0.3967476487159729, + "learning_rate": 7.328963328462677e-05, + "loss": 1.8299, + "step": 11906 + }, + { + "epoch": 3.654696132596685, + "grad_norm": 0.23805755376815796, + "learning_rate": 7.328523475151472e-05, + "loss": 1.7631, + "step": 11907 + }, + { + "epoch": 3.6550030693677105, + "grad_norm": 0.40350377559661865, + "learning_rate": 7.328083598828435e-05, + "loss": 1.8693, + "step": 11908 + }, + { + "epoch": 3.6553100061387354, + "grad_norm": 0.4743673801422119, + "learning_rate": 7.32764369949791e-05, + "loss": 1.7887, + "step": 11909 + }, + { + "epoch": 3.6556169429097607, + "grad_norm": 0.33830127120018005, + "learning_rate": 7.327203777164246e-05, + "loss": 1.7527, + "step": 11910 + }, + { + "epoch": 3.6559238796807856, + "grad_norm": 0.2465003877878189, + "learning_rate": 7.326763831831791e-05, + "loss": 1.7898, + "step": 11911 + }, + { + "epoch": 3.656230816451811, + "grad_norm": 0.31647852063179016, + "learning_rate": 7.326323863504892e-05, + "loss": 1.8056, + "step": 11912 + }, + 
{ + "epoch": 3.6565377532228363, + "grad_norm": 0.31436124444007874, + "learning_rate": 7.325883872187896e-05, + "loss": 1.7972, + "step": 11913 + }, + { + "epoch": 3.656844689993861, + "grad_norm": 0.260405957698822, + "learning_rate": 7.325443857885153e-05, + "loss": 1.8109, + "step": 11914 + }, + { + "epoch": 3.6571516267648865, + "grad_norm": 0.29312583804130554, + "learning_rate": 7.325003820601011e-05, + "loss": 1.8947, + "step": 11915 + }, + { + "epoch": 3.6574585635359114, + "grad_norm": 0.2641582190990448, + "learning_rate": 7.324563760339819e-05, + "loss": 1.7737, + "step": 11916 + }, + { + "epoch": 3.6577655003069367, + "grad_norm": 0.2338121086359024, + "learning_rate": 7.324123677105923e-05, + "loss": 1.7462, + "step": 11917 + }, + { + "epoch": 3.658072437077962, + "grad_norm": 0.27877378463745117, + "learning_rate": 7.323683570903676e-05, + "loss": 1.8371, + "step": 11918 + }, + { + "epoch": 3.6583793738489874, + "grad_norm": 0.24238766729831696, + "learning_rate": 7.323243441737427e-05, + "loss": 1.7304, + "step": 11919 + }, + { + "epoch": 3.6586863106200123, + "grad_norm": 0.2349759042263031, + "learning_rate": 7.322803289611525e-05, + "loss": 1.7422, + "step": 11920 + }, + { + "epoch": 3.6589932473910376, + "grad_norm": 0.2254217565059662, + "learning_rate": 7.322363114530318e-05, + "loss": 1.7296, + "step": 11921 + }, + { + "epoch": 3.6593001841620625, + "grad_norm": 0.24533270299434662, + "learning_rate": 7.321922916498158e-05, + "loss": 1.7834, + "step": 11922 + }, + { + "epoch": 3.659607120933088, + "grad_norm": 0.24993161857128143, + "learning_rate": 7.321482695519393e-05, + "loss": 1.8502, + "step": 11923 + }, + { + "epoch": 3.659914057704113, + "grad_norm": 0.2540178894996643, + "learning_rate": 7.321042451598378e-05, + "loss": 1.8372, + "step": 11924 + }, + { + "epoch": 3.660220994475138, + "grad_norm": 0.2241390198469162, + "learning_rate": 7.32060218473946e-05, + "loss": 1.7619, + "step": 11925 + }, + { + "epoch": 3.6605279312461634, + 
"grad_norm": 0.2137840837240219, + "learning_rate": 7.32016189494699e-05, + "loss": 1.751, + "step": 11926 + }, + { + "epoch": 3.6608348680171883, + "grad_norm": 0.2596585154533386, + "learning_rate": 7.319721582225323e-05, + "loss": 1.7773, + "step": 11927 + }, + { + "epoch": 3.6611418047882136, + "grad_norm": 0.24898354709148407, + "learning_rate": 7.319281246578806e-05, + "loss": 1.7347, + "step": 11928 + }, + { + "epoch": 3.661448741559239, + "grad_norm": 0.26553863286972046, + "learning_rate": 7.31884088801179e-05, + "loss": 1.7812, + "step": 11929 + }, + { + "epoch": 3.661755678330264, + "grad_norm": 0.2494438737630844, + "learning_rate": 7.318400506528633e-05, + "loss": 1.7554, + "step": 11930 + }, + { + "epoch": 3.662062615101289, + "grad_norm": 0.2794995903968811, + "learning_rate": 7.317960102133682e-05, + "loss": 1.7495, + "step": 11931 + }, + { + "epoch": 3.662369551872314, + "grad_norm": 0.2843860983848572, + "learning_rate": 7.317519674831293e-05, + "loss": 1.7734, + "step": 11932 + }, + { + "epoch": 3.6626764886433394, + "grad_norm": 0.28261128067970276, + "learning_rate": 7.317079224625813e-05, + "loss": 1.7794, + "step": 11933 + }, + { + "epoch": 3.6629834254143647, + "grad_norm": 0.2552426755428314, + "learning_rate": 7.316638751521599e-05, + "loss": 1.8397, + "step": 11934 + }, + { + "epoch": 3.66329036218539, + "grad_norm": 0.4140608608722687, + "learning_rate": 7.316198255523002e-05, + "loss": 1.848, + "step": 11935 + }, + { + "epoch": 3.663597298956415, + "grad_norm": 0.3709854483604431, + "learning_rate": 7.315757736634377e-05, + "loss": 1.8489, + "step": 11936 + }, + { + "epoch": 3.6639042357274403, + "grad_norm": 0.23637300729751587, + "learning_rate": 7.315317194860078e-05, + "loss": 1.7549, + "step": 11937 + }, + { + "epoch": 3.664211172498465, + "grad_norm": 0.32884421944618225, + "learning_rate": 7.314876630204456e-05, + "loss": 1.8061, + "step": 11938 + }, + { + "epoch": 3.6645181092694905, + "grad_norm": 0.33354130387306213, + 
"learning_rate": 7.314436042671867e-05, + "loss": 1.8346, + "step": 11939 + }, + { + "epoch": 3.664825046040516, + "grad_norm": 0.25776317715644836, + "learning_rate": 7.313995432266663e-05, + "loss": 1.8598, + "step": 11940 + }, + { + "epoch": 3.6651319828115407, + "grad_norm": 0.2910402715206146, + "learning_rate": 7.313554798993202e-05, + "loss": 1.7613, + "step": 11941 + }, + { + "epoch": 3.665438919582566, + "grad_norm": 0.3487538695335388, + "learning_rate": 7.313114142855836e-05, + "loss": 1.8105, + "step": 11942 + }, + { + "epoch": 3.665745856353591, + "grad_norm": 0.27271291613578796, + "learning_rate": 7.312673463858918e-05, + "loss": 1.8107, + "step": 11943 + }, + { + "epoch": 3.6660527931246163, + "grad_norm": 0.2613036632537842, + "learning_rate": 7.312232762006809e-05, + "loss": 1.7871, + "step": 11944 + }, + { + "epoch": 3.6663597298956416, + "grad_norm": 0.30594903230667114, + "learning_rate": 7.311792037303859e-05, + "loss": 1.8043, + "step": 11945 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.3960847854614258, + "learning_rate": 7.311351289754425e-05, + "loss": 1.8434, + "step": 11946 + }, + { + "epoch": 3.666973603437692, + "grad_norm": 0.33369311690330505, + "learning_rate": 7.310910519362861e-05, + "loss": 1.7496, + "step": 11947 + }, + { + "epoch": 3.6672805402087167, + "grad_norm": 0.29852384328842163, + "learning_rate": 7.310469726133528e-05, + "loss": 1.858, + "step": 11948 + }, + { + "epoch": 3.667587476979742, + "grad_norm": 0.2610527276992798, + "learning_rate": 7.310028910070777e-05, + "loss": 1.7642, + "step": 11949 + }, + { + "epoch": 3.6678944137507674, + "grad_norm": 0.3606704771518707, + "learning_rate": 7.309588071178967e-05, + "loss": 1.845, + "step": 11950 + }, + { + "epoch": 3.6682013505217927, + "grad_norm": 0.3157273828983307, + "learning_rate": 7.309147209462454e-05, + "loss": 1.7864, + "step": 11951 + }, + { + "epoch": 3.6685082872928176, + "grad_norm": 0.23907925188541412, + "learning_rate": 
7.308706324925594e-05, + "loss": 1.8363, + "step": 11952 + }, + { + "epoch": 3.668815224063843, + "grad_norm": 0.3365088999271393, + "learning_rate": 7.308265417572747e-05, + "loss": 1.8755, + "step": 11953 + }, + { + "epoch": 3.669122160834868, + "grad_norm": 0.29404979944229126, + "learning_rate": 7.307824487408266e-05, + "loss": 1.8128, + "step": 11954 + }, + { + "epoch": 3.669429097605893, + "grad_norm": 0.2689574658870697, + "learning_rate": 7.307383534436511e-05, + "loss": 1.8072, + "step": 11955 + }, + { + "epoch": 3.6697360343769185, + "grad_norm": 0.28394198417663574, + "learning_rate": 7.306942558661841e-05, + "loss": 1.7919, + "step": 11956 + }, + { + "epoch": 3.6700429711479434, + "grad_norm": 0.2594783902168274, + "learning_rate": 7.306501560088612e-05, + "loss": 1.7467, + "step": 11957 + }, + { + "epoch": 3.6703499079189688, + "grad_norm": 0.24765191972255707, + "learning_rate": 7.30606053872118e-05, + "loss": 1.7876, + "step": 11958 + }, + { + "epoch": 3.6706568446899936, + "grad_norm": 0.22157172858715057, + "learning_rate": 7.305619494563909e-05, + "loss": 1.7802, + "step": 11959 + }, + { + "epoch": 3.670963781461019, + "grad_norm": 0.270151287317276, + "learning_rate": 7.305178427621155e-05, + "loss": 1.7723, + "step": 11960 + }, + { + "epoch": 3.6712707182320443, + "grad_norm": 0.3163939118385315, + "learning_rate": 7.304737337897277e-05, + "loss": 1.8488, + "step": 11961 + }, + { + "epoch": 3.671577655003069, + "grad_norm": 0.2605706453323364, + "learning_rate": 7.304296225396632e-05, + "loss": 1.7442, + "step": 11962 + }, + { + "epoch": 3.6718845917740945, + "grad_norm": 0.31179291009902954, + "learning_rate": 7.303855090123582e-05, + "loss": 1.831, + "step": 11963 + }, + { + "epoch": 3.6721915285451194, + "grad_norm": 0.33365359902381897, + "learning_rate": 7.303413932082483e-05, + "loss": 1.8376, + "step": 11964 + }, + { + "epoch": 3.6724984653161448, + "grad_norm": 0.2952130138874054, + "learning_rate": 7.302972751277701e-05, + "loss": 
1.7733, + "step": 11965 + }, + { + "epoch": 3.67280540208717, + "grad_norm": 0.24270877242088318, + "learning_rate": 7.302531547713592e-05, + "loss": 1.8367, + "step": 11966 + }, + { + "epoch": 3.6731123388581954, + "grad_norm": 0.34315919876098633, + "learning_rate": 7.302090321394517e-05, + "loss": 1.7901, + "step": 11967 + }, + { + "epoch": 3.6734192756292203, + "grad_norm": 0.33511418104171753, + "learning_rate": 7.301649072324834e-05, + "loss": 1.7929, + "step": 11968 + }, + { + "epoch": 3.6737262124002457, + "grad_norm": 0.22397933900356293, + "learning_rate": 7.301207800508907e-05, + "loss": 1.7533, + "step": 11969 + }, + { + "epoch": 3.6740331491712706, + "grad_norm": 0.2882738411426544, + "learning_rate": 7.300766505951095e-05, + "loss": 1.8071, + "step": 11970 + }, + { + "epoch": 3.674340085942296, + "grad_norm": 0.242112398147583, + "learning_rate": 7.300325188655761e-05, + "loss": 1.7739, + "step": 11971 + }, + { + "epoch": 3.674647022713321, + "grad_norm": 0.27754491567611694, + "learning_rate": 7.299883848627265e-05, + "loss": 1.8295, + "step": 11972 + }, + { + "epoch": 3.674953959484346, + "grad_norm": 0.2787899076938629, + "learning_rate": 7.29944248586997e-05, + "loss": 1.7682, + "step": 11973 + }, + { + "epoch": 3.6752608962553714, + "grad_norm": 0.24448934197425842, + "learning_rate": 7.299001100388234e-05, + "loss": 1.7826, + "step": 11974 + }, + { + "epoch": 3.6755678330263963, + "grad_norm": 0.37869495153427124, + "learning_rate": 7.298559692186421e-05, + "loss": 1.8582, + "step": 11975 + }, + { + "epoch": 3.6758747697974217, + "grad_norm": 0.3299996256828308, + "learning_rate": 7.298118261268897e-05, + "loss": 1.7716, + "step": 11976 + }, + { + "epoch": 3.676181706568447, + "grad_norm": 0.278891384601593, + "learning_rate": 7.29767680764002e-05, + "loss": 1.879, + "step": 11977 + }, + { + "epoch": 3.6764886433394723, + "grad_norm": 0.29326459765434265, + "learning_rate": 7.297235331304155e-05, + "loss": 1.804, + "step": 11978 + }, + { + 
"epoch": 3.6767955801104972, + "grad_norm": 0.2697092592716217, + "learning_rate": 7.296793832265663e-05, + "loss": 1.7842, + "step": 11979 + }, + { + "epoch": 3.6771025168815226, + "grad_norm": 0.3045118749141693, + "learning_rate": 7.296352310528909e-05, + "loss": 1.7959, + "step": 11980 + }, + { + "epoch": 3.6774094536525475, + "grad_norm": 0.278647780418396, + "learning_rate": 7.295910766098252e-05, + "loss": 1.7907, + "step": 11981 + }, + { + "epoch": 3.677716390423573, + "grad_norm": 0.2370275855064392, + "learning_rate": 7.295469198978063e-05, + "loss": 1.757, + "step": 11982 + }, + { + "epoch": 3.678023327194598, + "grad_norm": 0.3061021566390991, + "learning_rate": 7.295027609172702e-05, + "loss": 1.7927, + "step": 11983 + }, + { + "epoch": 3.678330263965623, + "grad_norm": 0.2844544053077698, + "learning_rate": 7.294585996686532e-05, + "loss": 1.7705, + "step": 11984 + }, + { + "epoch": 3.6786372007366483, + "grad_norm": 0.31121113896369934, + "learning_rate": 7.29414436152392e-05, + "loss": 1.783, + "step": 11985 + }, + { + "epoch": 3.6789441375076732, + "grad_norm": 0.2566785514354706, + "learning_rate": 7.293702703689225e-05, + "loss": 1.7781, + "step": 11986 + }, + { + "epoch": 3.6792510742786986, + "grad_norm": 0.22176961600780487, + "learning_rate": 7.293261023186818e-05, + "loss": 1.7302, + "step": 11987 + }, + { + "epoch": 3.679558011049724, + "grad_norm": 0.21547441184520721, + "learning_rate": 7.292819320021062e-05, + "loss": 1.7666, + "step": 11988 + }, + { + "epoch": 3.679864947820749, + "grad_norm": 0.26309674978256226, + "learning_rate": 7.29237759419632e-05, + "loss": 1.7817, + "step": 11989 + }, + { + "epoch": 3.680171884591774, + "grad_norm": 0.2558063864707947, + "learning_rate": 7.29193584571696e-05, + "loss": 1.8257, + "step": 11990 + }, + { + "epoch": 3.680478821362799, + "grad_norm": 0.24516844749450684, + "learning_rate": 7.291494074587347e-05, + "loss": 1.7803, + "step": 11991 + }, + { + "epoch": 3.6807857581338244, + "grad_norm": 
0.22891047596931458, + "learning_rate": 7.291052280811843e-05, + "loss": 1.7977, + "step": 11992 + }, + { + "epoch": 3.6810926949048497, + "grad_norm": 0.2776026129722595, + "learning_rate": 7.290610464394822e-05, + "loss": 1.8486, + "step": 11993 + }, + { + "epoch": 3.681399631675875, + "grad_norm": 0.31472426652908325, + "learning_rate": 7.290168625340644e-05, + "loss": 1.7841, + "step": 11994 + }, + { + "epoch": 3.6817065684469, + "grad_norm": 0.3459274470806122, + "learning_rate": 7.289726763653677e-05, + "loss": 1.7458, + "step": 11995 + }, + { + "epoch": 3.6820135052179253, + "grad_norm": 0.23645849525928497, + "learning_rate": 7.289284879338289e-05, + "loss": 1.781, + "step": 11996 + }, + { + "epoch": 3.68232044198895, + "grad_norm": 0.3257114291191101, + "learning_rate": 7.288842972398845e-05, + "loss": 1.8269, + "step": 11997 + }, + { + "epoch": 3.6826273787599755, + "grad_norm": 0.5450126528739929, + "learning_rate": 7.288401042839713e-05, + "loss": 1.8342, + "step": 11998 + }, + { + "epoch": 3.682934315531001, + "grad_norm": 0.5080512762069702, + "learning_rate": 7.287959090665262e-05, + "loss": 1.8097, + "step": 11999 + }, + { + "epoch": 3.6832412523020257, + "grad_norm": 0.3005252480506897, + "learning_rate": 7.287517115879858e-05, + "loss": 1.8271, + "step": 12000 + }, + { + "epoch": 3.683548189073051, + "grad_norm": 0.2760924994945526, + "learning_rate": 7.287075118487869e-05, + "loss": 1.8267, + "step": 12001 + }, + { + "epoch": 3.683855125844076, + "grad_norm": 0.3475865423679352, + "learning_rate": 7.286633098493663e-05, + "loss": 1.785, + "step": 12002 + }, + { + "epoch": 3.6841620626151013, + "grad_norm": 0.2905690670013428, + "learning_rate": 7.286191055901608e-05, + "loss": 1.8283, + "step": 12003 + }, + { + "epoch": 3.6844689993861266, + "grad_norm": 0.23666246235370636, + "learning_rate": 7.285748990716072e-05, + "loss": 1.7665, + "step": 12004 + }, + { + "epoch": 3.6847759361571515, + "grad_norm": 0.32329514622688293, + "learning_rate": 
7.285306902941427e-05, + "loss": 1.7267, + "step": 12005 + }, + { + "epoch": 3.685082872928177, + "grad_norm": 0.32345879077911377, + "learning_rate": 7.28486479258204e-05, + "loss": 1.7529, + "step": 12006 + }, + { + "epoch": 3.6853898096992017, + "grad_norm": 0.2727855443954468, + "learning_rate": 7.284422659642279e-05, + "loss": 1.8279, + "step": 12007 + }, + { + "epoch": 3.685696746470227, + "grad_norm": 0.37847277522087097, + "learning_rate": 7.283980504126513e-05, + "loss": 1.7809, + "step": 12008 + }, + { + "epoch": 3.6860036832412524, + "grad_norm": 0.44694215059280396, + "learning_rate": 7.283538326039113e-05, + "loss": 1.8184, + "step": 12009 + }, + { + "epoch": 3.6863106200122777, + "grad_norm": 0.2868261933326721, + "learning_rate": 7.28309612538445e-05, + "loss": 1.7461, + "step": 12010 + }, + { + "epoch": 3.6866175567833026, + "grad_norm": 0.2601351737976074, + "learning_rate": 7.282653902166894e-05, + "loss": 1.8011, + "step": 12011 + }, + { + "epoch": 3.686924493554328, + "grad_norm": 0.328185498714447, + "learning_rate": 7.282211656390813e-05, + "loss": 1.7934, + "step": 12012 + }, + { + "epoch": 3.687231430325353, + "grad_norm": 0.2712559103965759, + "learning_rate": 7.281769388060578e-05, + "loss": 1.7566, + "step": 12013 + }, + { + "epoch": 3.687538367096378, + "grad_norm": 0.2725805938243866, + "learning_rate": 7.281327097180562e-05, + "loss": 1.8024, + "step": 12014 + }, + { + "epoch": 3.6878453038674035, + "grad_norm": 0.37282630801200867, + "learning_rate": 7.280884783755133e-05, + "loss": 1.7624, + "step": 12015 + }, + { + "epoch": 3.6881522406384284, + "grad_norm": 0.36519256234169006, + "learning_rate": 7.280442447788664e-05, + "loss": 1.8691, + "step": 12016 + }, + { + "epoch": 3.6884591774094537, + "grad_norm": 0.21699345111846924, + "learning_rate": 7.280000089285528e-05, + "loss": 1.7308, + "step": 12017 + }, + { + "epoch": 3.6887661141804786, + "grad_norm": 0.3159945011138916, + "learning_rate": 7.279557708250094e-05, + "loss": 
1.8144, + "step": 12018 + }, + { + "epoch": 3.689073050951504, + "grad_norm": 0.2927449643611908, + "learning_rate": 7.279115304686735e-05, + "loss": 1.7746, + "step": 12019 + }, + { + "epoch": 3.6893799877225293, + "grad_norm": 0.279208242893219, + "learning_rate": 7.278672878599819e-05, + "loss": 1.7678, + "step": 12020 + }, + { + "epoch": 3.689686924493554, + "grad_norm": 0.40005648136138916, + "learning_rate": 7.278230429993725e-05, + "loss": 1.7876, + "step": 12021 + }, + { + "epoch": 3.6899938612645795, + "grad_norm": 0.3444392681121826, + "learning_rate": 7.277787958872824e-05, + "loss": 1.7591, + "step": 12022 + }, + { + "epoch": 3.6903007980356044, + "grad_norm": 0.21841467916965485, + "learning_rate": 7.277345465241485e-05, + "loss": 1.785, + "step": 12023 + }, + { + "epoch": 3.6906077348066297, + "grad_norm": 0.32463181018829346, + "learning_rate": 7.276902949104084e-05, + "loss": 1.8164, + "step": 12024 + }, + { + "epoch": 3.690914671577655, + "grad_norm": 0.36221247911453247, + "learning_rate": 7.276460410464994e-05, + "loss": 1.7529, + "step": 12025 + }, + { + "epoch": 3.6912216083486804, + "grad_norm": 0.24451927840709686, + "learning_rate": 7.276017849328588e-05, + "loss": 1.8031, + "step": 12026 + }, + { + "epoch": 3.6915285451197053, + "grad_norm": 0.3055694103240967, + "learning_rate": 7.275575265699239e-05, + "loss": 1.8158, + "step": 12027 + }, + { + "epoch": 3.6918354818907306, + "grad_norm": 0.4315083622932434, + "learning_rate": 7.27513265958132e-05, + "loss": 1.8322, + "step": 12028 + }, + { + "epoch": 3.6921424186617555, + "grad_norm": 0.3391095697879791, + "learning_rate": 7.274690030979209e-05, + "loss": 1.8214, + "step": 12029 + }, + { + "epoch": 3.692449355432781, + "grad_norm": 0.22714883089065552, + "learning_rate": 7.274247379897277e-05, + "loss": 1.7312, + "step": 12030 + }, + { + "epoch": 3.692756292203806, + "grad_norm": 0.24982765316963196, + "learning_rate": 7.273804706339899e-05, + "loss": 1.738, + "step": 12031 + }, + { + 
"epoch": 3.693063228974831, + "grad_norm": 0.32509860396385193, + "learning_rate": 7.273362010311451e-05, + "loss": 1.7773, + "step": 12032 + }, + { + "epoch": 3.6933701657458564, + "grad_norm": 0.2643086612224579, + "learning_rate": 7.272919291816307e-05, + "loss": 1.7545, + "step": 12033 + }, + { + "epoch": 3.6936771025168813, + "grad_norm": 0.2568800747394562, + "learning_rate": 7.272476550858842e-05, + "loss": 1.8055, + "step": 12034 + }, + { + "epoch": 3.6939840392879066, + "grad_norm": 0.27418240904808044, + "learning_rate": 7.272033787443433e-05, + "loss": 1.7769, + "step": 12035 + }, + { + "epoch": 3.694290976058932, + "grad_norm": 0.2459677755832672, + "learning_rate": 7.271591001574453e-05, + "loss": 1.7971, + "step": 12036 + }, + { + "epoch": 3.694597912829957, + "grad_norm": 0.22349393367767334, + "learning_rate": 7.27114819325628e-05, + "loss": 1.7791, + "step": 12037 + }, + { + "epoch": 3.694904849600982, + "grad_norm": 0.25321197509765625, + "learning_rate": 7.270705362493288e-05, + "loss": 1.7475, + "step": 12038 + }, + { + "epoch": 3.695211786372007, + "grad_norm": 0.2585916519165039, + "learning_rate": 7.270262509289855e-05, + "loss": 1.7801, + "step": 12039 + }, + { + "epoch": 3.6955187231430324, + "grad_norm": 0.2673574686050415, + "learning_rate": 7.269819633650359e-05, + "loss": 1.7578, + "step": 12040 + }, + { + "epoch": 3.6958256599140578, + "grad_norm": 0.2509469985961914, + "learning_rate": 7.269376735579175e-05, + "loss": 1.7994, + "step": 12041 + }, + { + "epoch": 3.696132596685083, + "grad_norm": 0.28527703881263733, + "learning_rate": 7.268933815080679e-05, + "loss": 1.7752, + "step": 12042 + }, + { + "epoch": 3.696439533456108, + "grad_norm": 0.22716578841209412, + "learning_rate": 7.268490872159248e-05, + "loss": 1.7186, + "step": 12043 + }, + { + "epoch": 3.6967464702271333, + "grad_norm": 0.24888403713703156, + "learning_rate": 7.268047906819262e-05, + "loss": 1.7882, + "step": 12044 + }, + { + "epoch": 3.697053406998158, + 
"grad_norm": 0.28976112604141235, + "learning_rate": 7.267604919065096e-05, + "loss": 1.7655, + "step": 12045 + }, + { + "epoch": 3.6973603437691835, + "grad_norm": 0.24668502807617188, + "learning_rate": 7.267161908901131e-05, + "loss": 1.8051, + "step": 12046 + }, + { + "epoch": 3.697667280540209, + "grad_norm": 0.2464776188135147, + "learning_rate": 7.266718876331742e-05, + "loss": 1.809, + "step": 12047 + }, + { + "epoch": 3.6979742173112338, + "grad_norm": 0.27648577094078064, + "learning_rate": 7.266275821361309e-05, + "loss": 1.7869, + "step": 12048 + }, + { + "epoch": 3.698281154082259, + "grad_norm": 0.26427242159843445, + "learning_rate": 7.26583274399421e-05, + "loss": 1.7681, + "step": 12049 + }, + { + "epoch": 3.698588090853284, + "grad_norm": 0.24595285952091217, + "learning_rate": 7.265389644234823e-05, + "loss": 1.7209, + "step": 12050 + }, + { + "epoch": 3.6988950276243093, + "grad_norm": 0.32514405250549316, + "learning_rate": 7.26494652208753e-05, + "loss": 1.8702, + "step": 12051 + }, + { + "epoch": 3.6992019643953347, + "grad_norm": 0.24512936174869537, + "learning_rate": 7.264503377556705e-05, + "loss": 1.784, + "step": 12052 + }, + { + "epoch": 3.69950890116636, + "grad_norm": 0.28698310256004333, + "learning_rate": 7.264060210646733e-05, + "loss": 1.905, + "step": 12053 + }, + { + "epoch": 3.699815837937385, + "grad_norm": 0.2995007336139679, + "learning_rate": 7.263617021361989e-05, + "loss": 1.7822, + "step": 12054 + }, + { + "epoch": 3.7001227747084102, + "grad_norm": 0.25869423151016235, + "learning_rate": 7.263173809706855e-05, + "loss": 1.7988, + "step": 12055 + }, + { + "epoch": 3.700429711479435, + "grad_norm": 0.350918710231781, + "learning_rate": 7.262730575685711e-05, + "loss": 1.9504, + "step": 12056 + }, + { + "epoch": 3.7007366482504604, + "grad_norm": 0.3407665491104126, + "learning_rate": 7.262287319302937e-05, + "loss": 1.8506, + "step": 12057 + }, + { + "epoch": 3.701043585021486, + "grad_norm": 0.3039441704750061, + 
"learning_rate": 7.261844040562915e-05, + "loss": 1.7841, + "step": 12058 + }, + { + "epoch": 3.7013505217925107, + "grad_norm": 0.23483428359031677, + "learning_rate": 7.261400739470023e-05, + "loss": 1.7899, + "step": 12059 + }, + { + "epoch": 3.701657458563536, + "grad_norm": 0.30779507756233215, + "learning_rate": 7.260957416028645e-05, + "loss": 1.8131, + "step": 12060 + }, + { + "epoch": 3.701964395334561, + "grad_norm": 0.29901376366615295, + "learning_rate": 7.26051407024316e-05, + "loss": 1.7861, + "step": 12061 + }, + { + "epoch": 3.7022713321055862, + "grad_norm": 0.30058762431144714, + "learning_rate": 7.260070702117949e-05, + "loss": 1.7485, + "step": 12062 + }, + { + "epoch": 3.7025782688766116, + "grad_norm": 0.24523651599884033, + "learning_rate": 7.259627311657396e-05, + "loss": 1.772, + "step": 12063 + }, + { + "epoch": 3.7028852056476365, + "grad_norm": 0.24375474452972412, + "learning_rate": 7.259183898865882e-05, + "loss": 1.7848, + "step": 12064 + }, + { + "epoch": 3.703192142418662, + "grad_norm": 0.2562403380870819, + "learning_rate": 7.258740463747788e-05, + "loss": 1.7447, + "step": 12065 + }, + { + "epoch": 3.7034990791896867, + "grad_norm": 0.265229195356369, + "learning_rate": 7.258297006307496e-05, + "loss": 1.8111, + "step": 12066 + }, + { + "epoch": 3.703806015960712, + "grad_norm": 0.2836552858352661, + "learning_rate": 7.25785352654939e-05, + "loss": 1.7952, + "step": 12067 + }, + { + "epoch": 3.7041129527317374, + "grad_norm": 0.3269572854042053, + "learning_rate": 7.257410024477852e-05, + "loss": 1.8604, + "step": 12068 + }, + { + "epoch": 3.7044198895027627, + "grad_norm": 0.2391490638256073, + "learning_rate": 7.256966500097264e-05, + "loss": 1.7417, + "step": 12069 + }, + { + "epoch": 3.7047268262737876, + "grad_norm": 0.2610675096511841, + "learning_rate": 7.256522953412011e-05, + "loss": 1.7712, + "step": 12070 + }, + { + "epoch": 3.705033763044813, + "grad_norm": 0.24954774975776672, + "learning_rate": 
7.256079384426477e-05, + "loss": 1.7506, + "step": 12071 + }, + { + "epoch": 3.705340699815838, + "grad_norm": 0.2603892385959625, + "learning_rate": 7.255635793145042e-05, + "loss": 1.8105, + "step": 12072 + }, + { + "epoch": 3.705647636586863, + "grad_norm": 0.32728591561317444, + "learning_rate": 7.255192179572092e-05, + "loss": 1.8448, + "step": 12073 + }, + { + "epoch": 3.7059545733578885, + "grad_norm": 0.4559340178966522, + "learning_rate": 7.254748543712013e-05, + "loss": 1.7232, + "step": 12074 + }, + { + "epoch": 3.7062615101289134, + "grad_norm": 0.36526206135749817, + "learning_rate": 7.254304885569186e-05, + "loss": 1.7874, + "step": 12075 + }, + { + "epoch": 3.7065684468999387, + "grad_norm": 0.21606837213039398, + "learning_rate": 7.253861205147998e-05, + "loss": 1.7266, + "step": 12076 + }, + { + "epoch": 3.7068753836709636, + "grad_norm": 0.3629585802555084, + "learning_rate": 7.253417502452831e-05, + "loss": 1.7722, + "step": 12077 + }, + { + "epoch": 3.707182320441989, + "grad_norm": 0.4224923551082611, + "learning_rate": 7.252973777488072e-05, + "loss": 1.7369, + "step": 12078 + }, + { + "epoch": 3.7074892572130143, + "grad_norm": 0.32245784997940063, + "learning_rate": 7.252530030258106e-05, + "loss": 1.7836, + "step": 12079 + }, + { + "epoch": 3.707796193984039, + "grad_norm": 0.29909494519233704, + "learning_rate": 7.252086260767317e-05, + "loss": 1.8718, + "step": 12080 + }, + { + "epoch": 3.7081031307550645, + "grad_norm": 0.21995799243450165, + "learning_rate": 7.251642469020093e-05, + "loss": 1.7103, + "step": 12081 + }, + { + "epoch": 3.7084100675260894, + "grad_norm": 0.2737572193145752, + "learning_rate": 7.251198655020818e-05, + "loss": 1.7787, + "step": 12082 + }, + { + "epoch": 3.7087170042971147, + "grad_norm": 0.22417058050632477, + "learning_rate": 7.250754818773879e-05, + "loss": 1.7782, + "step": 12083 + }, + { + "epoch": 3.70902394106814, + "grad_norm": 0.3350662887096405, + "learning_rate": 7.25031096028366e-05, + "loss": 
1.8193, + "step": 12084 + }, + { + "epoch": 3.7093308778391654, + "grad_norm": 0.3199101686477661, + "learning_rate": 7.24986707955455e-05, + "loss": 1.831, + "step": 12085 + }, + { + "epoch": 3.7096378146101903, + "grad_norm": 0.2513977289199829, + "learning_rate": 7.249423176590936e-05, + "loss": 1.8288, + "step": 12086 + }, + { + "epoch": 3.7099447513812156, + "grad_norm": 0.30411866307258606, + "learning_rate": 7.248979251397203e-05, + "loss": 1.7837, + "step": 12087 + }, + { + "epoch": 3.7102516881522405, + "grad_norm": 0.30755332112312317, + "learning_rate": 7.248535303977738e-05, + "loss": 1.8016, + "step": 12088 + }, + { + "epoch": 3.710558624923266, + "grad_norm": 0.25746986269950867, + "learning_rate": 7.248091334336929e-05, + "loss": 1.8014, + "step": 12089 + }, + { + "epoch": 3.710865561694291, + "grad_norm": 0.3327447772026062, + "learning_rate": 7.247647342479164e-05, + "loss": 1.752, + "step": 12090 + }, + { + "epoch": 3.711172498465316, + "grad_norm": 0.3101816475391388, + "learning_rate": 7.247203328408832e-05, + "loss": 1.7867, + "step": 12091 + }, + { + "epoch": 3.7114794352363414, + "grad_norm": 0.2168906182050705, + "learning_rate": 7.246759292130318e-05, + "loss": 1.7452, + "step": 12092 + }, + { + "epoch": 3.7117863720073663, + "grad_norm": 0.34260258078575134, + "learning_rate": 7.246315233648013e-05, + "loss": 1.8156, + "step": 12093 + }, + { + "epoch": 3.7120933087783916, + "grad_norm": 0.2730714976787567, + "learning_rate": 7.245871152966303e-05, + "loss": 1.7429, + "step": 12094 + }, + { + "epoch": 3.712400245549417, + "grad_norm": 0.2560936212539673, + "learning_rate": 7.245427050089578e-05, + "loss": 1.7969, + "step": 12095 + }, + { + "epoch": 3.712707182320442, + "grad_norm": 0.27510303258895874, + "learning_rate": 7.244982925022228e-05, + "loss": 1.7981, + "step": 12096 + }, + { + "epoch": 3.713014119091467, + "grad_norm": 0.29171642661094666, + "learning_rate": 7.24453877776864e-05, + "loss": 1.7913, + "step": 12097 + }, + { + 
"epoch": 3.713321055862492, + "grad_norm": 0.26431843638420105, + "learning_rate": 7.244094608333206e-05, + "loss": 1.8262, + "step": 12098 + }, + { + "epoch": 3.7136279926335174, + "grad_norm": 0.30747905373573303, + "learning_rate": 7.243650416720311e-05, + "loss": 1.7951, + "step": 12099 + }, + { + "epoch": 3.7139349294045427, + "grad_norm": 0.346443772315979, + "learning_rate": 7.24320620293435e-05, + "loss": 1.7677, + "step": 12100 + }, + { + "epoch": 3.714241866175568, + "grad_norm": 0.2910652458667755, + "learning_rate": 7.242761966979709e-05, + "loss": 1.7887, + "step": 12101 + }, + { + "epoch": 3.714548802946593, + "grad_norm": 0.22342006862163544, + "learning_rate": 7.24231770886078e-05, + "loss": 1.7678, + "step": 12102 + }, + { + "epoch": 3.7148557397176183, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.241873428581954e-05, + "loss": 1.7436, + "step": 12103 + }, + { + "epoch": 3.715162676488643, + "grad_norm": 0.23542635142803192, + "learning_rate": 7.24142912614762e-05, + "loss": 1.7942, + "step": 12104 + }, + { + "epoch": 3.7154696132596685, + "grad_norm": 0.22476384043693542, + "learning_rate": 7.240984801562169e-05, + "loss": 1.8235, + "step": 12105 + }, + { + "epoch": 3.715776550030694, + "grad_norm": 0.25123465061187744, + "learning_rate": 7.240540454829992e-05, + "loss": 1.8112, + "step": 12106 + }, + { + "epoch": 3.7160834868017187, + "grad_norm": 0.27230000495910645, + "learning_rate": 7.240096085955483e-05, + "loss": 1.8312, + "step": 12107 + }, + { + "epoch": 3.716390423572744, + "grad_norm": 0.2722976803779602, + "learning_rate": 7.239651694943031e-05, + "loss": 1.8368, + "step": 12108 + }, + { + "epoch": 3.716697360343769, + "grad_norm": 0.264138400554657, + "learning_rate": 7.239207281797028e-05, + "loss": 1.8206, + "step": 12109 + }, + { + "epoch": 3.7170042971147943, + "grad_norm": 0.28813931345939636, + "learning_rate": 7.238762846521866e-05, + "loss": 1.7391, + "step": 12110 + }, + { + "epoch": 3.7173112338858196, + 
"grad_norm": 0.2319631576538086, + "learning_rate": 7.238318389121939e-05, + "loss": 1.7574, + "step": 12111 + }, + { + "epoch": 3.717618170656845, + "grad_norm": 0.2507809102535248, + "learning_rate": 7.237873909601635e-05, + "loss": 1.7359, + "step": 12112 + }, + { + "epoch": 3.71792510742787, + "grad_norm": 0.2717304825782776, + "learning_rate": 7.237429407965351e-05, + "loss": 1.774, + "step": 12113 + }, + { + "epoch": 3.718232044198895, + "grad_norm": 0.2619280517101288, + "learning_rate": 7.236984884217478e-05, + "loss": 1.8083, + "step": 12114 + }, + { + "epoch": 3.71853898096992, + "grad_norm": 0.22268806397914886, + "learning_rate": 7.23654033836241e-05, + "loss": 1.7436, + "step": 12115 + }, + { + "epoch": 3.7188459177409454, + "grad_norm": 0.2341407984495163, + "learning_rate": 7.236095770404539e-05, + "loss": 1.7807, + "step": 12116 + }, + { + "epoch": 3.7191528545119708, + "grad_norm": 0.23519712686538696, + "learning_rate": 7.235651180348258e-05, + "loss": 1.8051, + "step": 12117 + }, + { + "epoch": 3.7194597912829956, + "grad_norm": 0.2391074150800705, + "learning_rate": 7.235206568197963e-05, + "loss": 1.8377, + "step": 12118 + }, + { + "epoch": 3.719766728054021, + "grad_norm": 0.26821592450141907, + "learning_rate": 7.234761933958045e-05, + "loss": 1.8586, + "step": 12119 + }, + { + "epoch": 3.720073664825046, + "grad_norm": 0.24971134960651398, + "learning_rate": 7.234317277632902e-05, + "loss": 1.8404, + "step": 12120 + }, + { + "epoch": 3.720380601596071, + "grad_norm": 0.20817919075489044, + "learning_rate": 7.233872599226926e-05, + "loss": 1.7204, + "step": 12121 + }, + { + "epoch": 3.7206875383670965, + "grad_norm": 0.29301291704177856, + "learning_rate": 7.233427898744509e-05, + "loss": 1.8528, + "step": 12122 + }, + { + "epoch": 3.7209944751381214, + "grad_norm": 0.22214651107788086, + "learning_rate": 7.23298317619005e-05, + "loss": 1.748, + "step": 12123 + }, + { + "epoch": 3.7213014119091468, + "grad_norm": 0.2511044442653656, + 
"learning_rate": 7.232538431567941e-05, + "loss": 1.8146, + "step": 12124 + }, + { + "epoch": 3.7216083486801717, + "grad_norm": 0.26976367831230164, + "learning_rate": 7.232093664882581e-05, + "loss": 1.8483, + "step": 12125 + }, + { + "epoch": 3.721915285451197, + "grad_norm": 0.2538089156150818, + "learning_rate": 7.231648876138361e-05, + "loss": 1.8097, + "step": 12126 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.2353016883134842, + "learning_rate": 7.231204065339682e-05, + "loss": 1.737, + "step": 12127 + }, + { + "epoch": 3.7225291589932477, + "grad_norm": 0.3205147981643677, + "learning_rate": 7.230759232490935e-05, + "loss": 1.8116, + "step": 12128 + }, + { + "epoch": 3.7228360957642725, + "grad_norm": 0.39056599140167236, + "learning_rate": 7.230314377596516e-05, + "loss": 1.7785, + "step": 12129 + }, + { + "epoch": 3.723143032535298, + "grad_norm": 0.3846863806247711, + "learning_rate": 7.229869500660825e-05, + "loss": 1.738, + "step": 12130 + }, + { + "epoch": 3.7234499693063228, + "grad_norm": 0.24412120878696442, + "learning_rate": 7.229424601688256e-05, + "loss": 1.7351, + "step": 12131 + }, + { + "epoch": 3.723756906077348, + "grad_norm": 0.2978009581565857, + "learning_rate": 7.228979680683206e-05, + "loss": 1.8037, + "step": 12132 + }, + { + "epoch": 3.7240638428483734, + "grad_norm": 0.33787262439727783, + "learning_rate": 7.228534737650074e-05, + "loss": 1.8421, + "step": 12133 + }, + { + "epoch": 3.7243707796193983, + "grad_norm": 0.2536921203136444, + "learning_rate": 7.228089772593254e-05, + "loss": 1.7472, + "step": 12134 + }, + { + "epoch": 3.7246777163904237, + "grad_norm": 0.24103601276874542, + "learning_rate": 7.227644785517144e-05, + "loss": 1.8011, + "step": 12135 + }, + { + "epoch": 3.7249846531614486, + "grad_norm": 0.3653033375740051, + "learning_rate": 7.227199776426146e-05, + "loss": 1.8018, + "step": 12136 + }, + { + "epoch": 3.725291589932474, + "grad_norm": 0.35728752613067627, + "learning_rate": 
7.226754745324652e-05, + "loss": 1.7684, + "step": 12137 + }, + { + "epoch": 3.7255985267034992, + "grad_norm": 0.262018620967865, + "learning_rate": 7.226309692217063e-05, + "loss": 1.8124, + "step": 12138 + }, + { + "epoch": 3.725905463474524, + "grad_norm": 0.3467118442058563, + "learning_rate": 7.225864617107776e-05, + "loss": 1.8761, + "step": 12139 + }, + { + "epoch": 3.7262124002455494, + "grad_norm": 0.4365626871585846, + "learning_rate": 7.22541952000119e-05, + "loss": 1.7159, + "step": 12140 + }, + { + "epoch": 3.7265193370165743, + "grad_norm": 0.2819811999797821, + "learning_rate": 7.224974400901705e-05, + "loss": 1.8051, + "step": 12141 + }, + { + "epoch": 3.7268262737875997, + "grad_norm": 0.39062437415122986, + "learning_rate": 7.224529259813719e-05, + "loss": 1.8517, + "step": 12142 + }, + { + "epoch": 3.727133210558625, + "grad_norm": 0.4383927285671234, + "learning_rate": 7.22408409674163e-05, + "loss": 1.8295, + "step": 12143 + }, + { + "epoch": 3.7274401473296503, + "grad_norm": 0.3043094575405121, + "learning_rate": 7.223638911689839e-05, + "loss": 1.7653, + "step": 12144 + }, + { + "epoch": 3.7277470841006752, + "grad_norm": 0.25198984146118164, + "learning_rate": 7.223193704662746e-05, + "loss": 1.7561, + "step": 12145 + }, + { + "epoch": 3.7280540208717006, + "grad_norm": 0.353565514087677, + "learning_rate": 7.222748475664749e-05, + "loss": 1.8077, + "step": 12146 + }, + { + "epoch": 3.7283609576427255, + "grad_norm": 0.39757224917411804, + "learning_rate": 7.222303224700248e-05, + "loss": 1.7622, + "step": 12147 + }, + { + "epoch": 3.728667894413751, + "grad_norm": 0.35595703125, + "learning_rate": 7.221857951773644e-05, + "loss": 1.8436, + "step": 12148 + }, + { + "epoch": 3.728974831184776, + "grad_norm": 0.2469715029001236, + "learning_rate": 7.221412656889338e-05, + "loss": 1.8531, + "step": 12149 + }, + { + "epoch": 3.729281767955801, + "grad_norm": 0.35324424505233765, + "learning_rate": 7.22096734005173e-05, + "loss": 1.7361, + 
"step": 12150 + }, + { + "epoch": 3.7295887047268264, + "grad_norm": 0.3783365488052368, + "learning_rate": 7.220522001265223e-05, + "loss": 1.7459, + "step": 12151 + }, + { + "epoch": 3.7298956414978512, + "grad_norm": 0.27526360750198364, + "learning_rate": 7.220076640534212e-05, + "loss": 1.8867, + "step": 12152 + }, + { + "epoch": 3.7302025782688766, + "grad_norm": 0.30863118171691895, + "learning_rate": 7.219631257863105e-05, + "loss": 1.7363, + "step": 12153 + }, + { + "epoch": 3.730509515039902, + "grad_norm": 0.38505107164382935, + "learning_rate": 7.219185853256301e-05, + "loss": 1.764, + "step": 12154 + }, + { + "epoch": 3.730816451810927, + "grad_norm": 0.2925978899002075, + "learning_rate": 7.218740426718202e-05, + "loss": 1.7693, + "step": 12155 + }, + { + "epoch": 3.731123388581952, + "grad_norm": 0.24510078132152557, + "learning_rate": 7.218294978253209e-05, + "loss": 1.8089, + "step": 12156 + }, + { + "epoch": 3.731430325352977, + "grad_norm": 0.33029109239578247, + "learning_rate": 7.217849507865724e-05, + "loss": 1.6885, + "step": 12157 + }, + { + "epoch": 3.7317372621240024, + "grad_norm": 0.333970308303833, + "learning_rate": 7.217404015560149e-05, + "loss": 1.8132, + "step": 12158 + }, + { + "epoch": 3.7320441988950277, + "grad_norm": 0.2467660754919052, + "learning_rate": 7.216958501340891e-05, + "loss": 1.8021, + "step": 12159 + }, + { + "epoch": 3.732351135666053, + "grad_norm": 0.2701449990272522, + "learning_rate": 7.216512965212348e-05, + "loss": 1.7006, + "step": 12160 + }, + { + "epoch": 3.732658072437078, + "grad_norm": 0.2784138023853302, + "learning_rate": 7.216067407178926e-05, + "loss": 1.7616, + "step": 12161 + }, + { + "epoch": 3.7329650092081033, + "grad_norm": 0.2082870900630951, + "learning_rate": 7.215621827245026e-05, + "loss": 1.7391, + "step": 12162 + }, + { + "epoch": 3.733271945979128, + "grad_norm": 0.2477869987487793, + "learning_rate": 7.215176225415053e-05, + "loss": 1.7761, + "step": 12163 + }, + { + "epoch": 
3.7335788827501535, + "grad_norm": 0.28395572304725647, + "learning_rate": 7.21473060169341e-05, + "loss": 1.8181, + "step": 12164 + }, + { + "epoch": 3.733885819521179, + "grad_norm": 0.20430058240890503, + "learning_rate": 7.2142849560845e-05, + "loss": 1.7035, + "step": 12165 + }, + { + "epoch": 3.7341927562922037, + "grad_norm": 0.30061420798301697, + "learning_rate": 7.21383928859273e-05, + "loss": 1.7703, + "step": 12166 + }, + { + "epoch": 3.734499693063229, + "grad_norm": 0.33865803480148315, + "learning_rate": 7.2133935992225e-05, + "loss": 1.8204, + "step": 12167 + }, + { + "epoch": 3.734806629834254, + "grad_norm": 0.29172980785369873, + "learning_rate": 7.212947887978221e-05, + "loss": 1.739, + "step": 12168 + }, + { + "epoch": 3.7351135666052793, + "grad_norm": 0.2799396812915802, + "learning_rate": 7.212502154864291e-05, + "loss": 1.8503, + "step": 12169 + }, + { + "epoch": 3.7354205033763046, + "grad_norm": 0.2945539355278015, + "learning_rate": 7.212056399885118e-05, + "loss": 1.7523, + "step": 12170 + }, + { + "epoch": 3.7357274401473295, + "grad_norm": 0.2395290732383728, + "learning_rate": 7.211610623045108e-05, + "loss": 1.7728, + "step": 12171 + }, + { + "epoch": 3.736034376918355, + "grad_norm": 0.24369286000728607, + "learning_rate": 7.211164824348667e-05, + "loss": 1.7725, + "step": 12172 + }, + { + "epoch": 3.7363413136893797, + "grad_norm": 0.3272435963153839, + "learning_rate": 7.210719003800197e-05, + "loss": 1.8531, + "step": 12173 + }, + { + "epoch": 3.736648250460405, + "grad_norm": 0.23954182863235474, + "learning_rate": 7.210273161404107e-05, + "loss": 1.7807, + "step": 12174 + }, + { + "epoch": 3.7369551872314304, + "grad_norm": 0.24547603726387024, + "learning_rate": 7.209827297164801e-05, + "loss": 1.8481, + "step": 12175 + }, + { + "epoch": 3.7372621240024557, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.209381411086687e-05, + "loss": 1.7496, + "step": 12176 + }, + { + "epoch": 3.7375690607734806, + "grad_norm": 
0.22948235273361206, + "learning_rate": 7.208935503174172e-05, + "loss": 1.7681, + "step": 12177 + }, + { + "epoch": 3.737875997544506, + "grad_norm": 0.2697654664516449, + "learning_rate": 7.20848957343166e-05, + "loss": 1.789, + "step": 12178 + }, + { + "epoch": 3.738182934315531, + "grad_norm": 0.235344797372818, + "learning_rate": 7.208043621863562e-05, + "loss": 1.8309, + "step": 12179 + }, + { + "epoch": 3.738489871086556, + "grad_norm": 0.2688879072666168, + "learning_rate": 7.20759764847428e-05, + "loss": 1.7898, + "step": 12180 + }, + { + "epoch": 3.7387968078575815, + "grad_norm": 0.26818978786468506, + "learning_rate": 7.207151653268226e-05, + "loss": 1.7882, + "step": 12181 + }, + { + "epoch": 3.7391037446286064, + "grad_norm": 0.2612875998020172, + "learning_rate": 7.206705636249804e-05, + "loss": 1.7352, + "step": 12182 + }, + { + "epoch": 3.7394106813996317, + "grad_norm": 0.22547565400600433, + "learning_rate": 7.206259597423425e-05, + "loss": 1.733, + "step": 12183 + }, + { + "epoch": 3.7397176181706566, + "grad_norm": 0.24645474553108215, + "learning_rate": 7.205813536793495e-05, + "loss": 1.8064, + "step": 12184 + }, + { + "epoch": 3.740024554941682, + "grad_norm": 0.25879329442977905, + "learning_rate": 7.205367454364424e-05, + "loss": 1.8134, + "step": 12185 + }, + { + "epoch": 3.7403314917127073, + "grad_norm": 0.22420097887516022, + "learning_rate": 7.204921350140617e-05, + "loss": 1.7819, + "step": 12186 + }, + { + "epoch": 3.7406384284837326, + "grad_norm": 0.2569858431816101, + "learning_rate": 7.204475224126487e-05, + "loss": 1.784, + "step": 12187 + }, + { + "epoch": 3.7409453652547575, + "grad_norm": 0.23769912123680115, + "learning_rate": 7.20402907632644e-05, + "loss": 1.7853, + "step": 12188 + }, + { + "epoch": 3.741252302025783, + "grad_norm": 0.26935988664627075, + "learning_rate": 7.203582906744885e-05, + "loss": 1.806, + "step": 12189 + }, + { + "epoch": 3.7415592387968077, + "grad_norm": 0.2544274628162384, + "learning_rate": 
7.203136715386233e-05, + "loss": 1.7988, + "step": 12190 + }, + { + "epoch": 3.741866175567833, + "grad_norm": 0.22665882110595703, + "learning_rate": 7.202690502254892e-05, + "loss": 1.7798, + "step": 12191 + }, + { + "epoch": 3.7421731123388584, + "grad_norm": 0.24512888491153717, + "learning_rate": 7.202244267355273e-05, + "loss": 1.816, + "step": 12192 + }, + { + "epoch": 3.7424800491098833, + "grad_norm": 0.2408553808927536, + "learning_rate": 7.201798010691785e-05, + "loss": 1.7417, + "step": 12193 + }, + { + "epoch": 3.7427869858809086, + "grad_norm": 0.23142600059509277, + "learning_rate": 7.201351732268838e-05, + "loss": 1.7771, + "step": 12194 + }, + { + "epoch": 3.7430939226519335, + "grad_norm": 0.245071142911911, + "learning_rate": 7.200905432090844e-05, + "loss": 1.7556, + "step": 12195 + }, + { + "epoch": 3.743400859422959, + "grad_norm": 0.2623934745788574, + "learning_rate": 7.200459110162211e-05, + "loss": 1.8042, + "step": 12196 + }, + { + "epoch": 3.743707796193984, + "grad_norm": 0.2531217038631439, + "learning_rate": 7.200012766487353e-05, + "loss": 1.7709, + "step": 12197 + }, + { + "epoch": 3.744014732965009, + "grad_norm": 0.23839864134788513, + "learning_rate": 7.19956640107068e-05, + "loss": 1.8202, + "step": 12198 + }, + { + "epoch": 3.7443216697360344, + "grad_norm": 0.2342260777950287, + "learning_rate": 7.1991200139166e-05, + "loss": 1.827, + "step": 12199 + }, + { + "epoch": 3.7446286065070593, + "grad_norm": 0.25511276721954346, + "learning_rate": 7.198673605029528e-05, + "loss": 1.7766, + "step": 12200 + }, + { + "epoch": 3.7449355432780846, + "grad_norm": 0.27601274847984314, + "learning_rate": 7.198227174413876e-05, + "loss": 1.7716, + "step": 12201 + }, + { + "epoch": 3.74524248004911, + "grad_norm": 0.3027385175228119, + "learning_rate": 7.197780722074056e-05, + "loss": 1.8007, + "step": 12202 + }, + { + "epoch": 3.7455494168201353, + "grad_norm": 0.31242382526397705, + "learning_rate": 7.197334248014477e-05, + "loss": 1.8089, 
+ "step": 12203 + }, + { + "epoch": 3.74585635359116, + "grad_norm": 0.3673859238624573, + "learning_rate": 7.196887752239551e-05, + "loss": 1.8017, + "step": 12204 + }, + { + "epoch": 3.7461632903621855, + "grad_norm": 0.3152726888656616, + "learning_rate": 7.196441234753695e-05, + "loss": 1.7108, + "step": 12205 + }, + { + "epoch": 3.7464702271332104, + "grad_norm": 0.2606927156448364, + "learning_rate": 7.195994695561319e-05, + "loss": 1.8066, + "step": 12206 + }, + { + "epoch": 3.7467771639042358, + "grad_norm": 0.37624871730804443, + "learning_rate": 7.195548134666836e-05, + "loss": 1.725, + "step": 12207 + }, + { + "epoch": 3.747084100675261, + "grad_norm": 0.4138187766075134, + "learning_rate": 7.195101552074658e-05, + "loss": 1.7838, + "step": 12208 + }, + { + "epoch": 3.747391037446286, + "grad_norm": 0.3668459951877594, + "learning_rate": 7.194654947789204e-05, + "loss": 1.7575, + "step": 12209 + }, + { + "epoch": 3.7476979742173113, + "grad_norm": 0.27947792410850525, + "learning_rate": 7.19420832181488e-05, + "loss": 1.792, + "step": 12210 + }, + { + "epoch": 3.748004910988336, + "grad_norm": 0.2507692873477936, + "learning_rate": 7.193761674156103e-05, + "loss": 1.7752, + "step": 12211 + }, + { + "epoch": 3.7483118477593615, + "grad_norm": 0.3209949731826782, + "learning_rate": 7.193315004817289e-05, + "loss": 1.8491, + "step": 12212 + }, + { + "epoch": 3.748618784530387, + "grad_norm": 0.32883042097091675, + "learning_rate": 7.192868313802849e-05, + "loss": 1.8135, + "step": 12213 + }, + { + "epoch": 3.7489257213014118, + "grad_norm": 0.2450616955757141, + "learning_rate": 7.192421601117201e-05, + "loss": 1.7722, + "step": 12214 + }, + { + "epoch": 3.749232658072437, + "grad_norm": 0.2545110285282135, + "learning_rate": 7.191974866764757e-05, + "loss": 1.7866, + "step": 12215 + }, + { + "epoch": 3.749539594843462, + "grad_norm": 0.264017790555954, + "learning_rate": 7.191528110749932e-05, + "loss": 1.778, + "step": 12216 + }, + { + "epoch": 
3.7498465316144873, + "grad_norm": 0.3156309425830841, + "learning_rate": 7.191081333077142e-05, + "loss": 1.7917, + "step": 12217 + }, + { + "epoch": 3.7501534683855127, + "grad_norm": 0.3578774631023407, + "learning_rate": 7.190634533750802e-05, + "loss": 1.8468, + "step": 12218 + }, + { + "epoch": 3.750460405156538, + "grad_norm": 0.30735981464385986, + "learning_rate": 7.19018771277533e-05, + "loss": 1.7502, + "step": 12219 + }, + { + "epoch": 3.750767341927563, + "grad_norm": 0.22870220243930817, + "learning_rate": 7.189740870155135e-05, + "loss": 1.7686, + "step": 12220 + }, + { + "epoch": 3.7510742786985882, + "grad_norm": 0.30297720432281494, + "learning_rate": 7.18929400589464e-05, + "loss": 1.826, + "step": 12221 + }, + { + "epoch": 3.751381215469613, + "grad_norm": 0.2735389173030853, + "learning_rate": 7.188847119998257e-05, + "loss": 1.8142, + "step": 12222 + }, + { + "epoch": 3.7516881522406385, + "grad_norm": 0.2823885679244995, + "learning_rate": 7.188400212470405e-05, + "loss": 1.8028, + "step": 12223 + }, + { + "epoch": 3.751995089011664, + "grad_norm": 0.4184139370918274, + "learning_rate": 7.187953283315499e-05, + "loss": 1.8467, + "step": 12224 + }, + { + "epoch": 3.7523020257826887, + "grad_norm": 0.3559226095676422, + "learning_rate": 7.187506332537957e-05, + "loss": 1.7416, + "step": 12225 + }, + { + "epoch": 3.752608962553714, + "grad_norm": 0.26055800914764404, + "learning_rate": 7.187059360142194e-05, + "loss": 1.8309, + "step": 12226 + }, + { + "epoch": 3.752915899324739, + "grad_norm": 0.28032660484313965, + "learning_rate": 7.186612366132629e-05, + "loss": 1.7926, + "step": 12227 + }, + { + "epoch": 3.7532228360957642, + "grad_norm": 0.26229965686798096, + "learning_rate": 7.18616535051368e-05, + "loss": 1.7368, + "step": 12228 + }, + { + "epoch": 3.7535297728667896, + "grad_norm": 0.2779417634010315, + "learning_rate": 7.185718313289763e-05, + "loss": 1.8418, + "step": 12229 + }, + { + "epoch": 3.7538367096378145, + "grad_norm": 
0.26164770126342773, + "learning_rate": 7.185271254465295e-05, + "loss": 1.7511, + "step": 12230 + }, + { + "epoch": 3.75414364640884, + "grad_norm": 0.30725157260894775, + "learning_rate": 7.184824174044698e-05, + "loss": 1.7661, + "step": 12231 + }, + { + "epoch": 3.7544505831798647, + "grad_norm": 0.33111417293548584, + "learning_rate": 7.184377072032386e-05, + "loss": 1.7341, + "step": 12232 + }, + { + "epoch": 3.75475751995089, + "grad_norm": 0.23978343605995178, + "learning_rate": 7.183929948432779e-05, + "loss": 1.7151, + "step": 12233 + }, + { + "epoch": 3.7550644567219154, + "grad_norm": 0.3057664632797241, + "learning_rate": 7.183482803250299e-05, + "loss": 1.8446, + "step": 12234 + }, + { + "epoch": 3.7553713934929407, + "grad_norm": 0.2629055678844452, + "learning_rate": 7.18303563648936e-05, + "loss": 1.7415, + "step": 12235 + }, + { + "epoch": 3.7556783302639656, + "grad_norm": 0.22703498601913452, + "learning_rate": 7.182588448154386e-05, + "loss": 1.8188, + "step": 12236 + }, + { + "epoch": 3.755985267034991, + "grad_norm": 0.3014034032821655, + "learning_rate": 7.182141238249792e-05, + "loss": 1.8634, + "step": 12237 + }, + { + "epoch": 3.756292203806016, + "grad_norm": 0.28859084844589233, + "learning_rate": 7.181694006779998e-05, + "loss": 1.7509, + "step": 12238 + }, + { + "epoch": 3.756599140577041, + "grad_norm": 0.293720543384552, + "learning_rate": 7.181246753749426e-05, + "loss": 1.777, + "step": 12239 + }, + { + "epoch": 3.7569060773480665, + "grad_norm": 0.2374580055475235, + "learning_rate": 7.180799479162496e-05, + "loss": 1.7492, + "step": 12240 + }, + { + "epoch": 3.7572130141190914, + "grad_norm": 0.30106452107429504, + "learning_rate": 7.180352183023627e-05, + "loss": 1.7538, + "step": 12241 + }, + { + "epoch": 3.7575199508901167, + "grad_norm": 0.3504682183265686, + "learning_rate": 7.179904865337238e-05, + "loss": 1.7477, + "step": 12242 + }, + { + "epoch": 3.7578268876611416, + "grad_norm": 0.2901679575443268, + "learning_rate": 
7.179457526107754e-05, + "loss": 1.9412, + "step": 12243 + }, + { + "epoch": 3.758133824432167, + "grad_norm": 0.37690606713294983, + "learning_rate": 7.179010165339591e-05, + "loss": 1.8222, + "step": 12244 + }, + { + "epoch": 3.7584407612031923, + "grad_norm": 0.45126965641975403, + "learning_rate": 7.178562783037172e-05, + "loss": 1.8563, + "step": 12245 + }, + { + "epoch": 3.758747697974217, + "grad_norm": 0.2747548818588257, + "learning_rate": 7.178115379204921e-05, + "loss": 1.7179, + "step": 12246 + }, + { + "epoch": 3.7590546347452425, + "grad_norm": 0.43243977427482605, + "learning_rate": 7.177667953847257e-05, + "loss": 1.8157, + "step": 12247 + }, + { + "epoch": 3.7593615715162674, + "grad_norm": 0.529448390007019, + "learning_rate": 7.177220506968602e-05, + "loss": 1.8113, + "step": 12248 + }, + { + "epoch": 3.7596685082872927, + "grad_norm": 0.3099314868450165, + "learning_rate": 7.176773038573377e-05, + "loss": 1.7833, + "step": 12249 + }, + { + "epoch": 3.759975445058318, + "grad_norm": 0.3111872375011444, + "learning_rate": 7.176325548666004e-05, + "loss": 1.7965, + "step": 12250 + }, + { + "epoch": 3.7602823818293434, + "grad_norm": 0.38437551259994507, + "learning_rate": 7.175878037250907e-05, + "loss": 1.7822, + "step": 12251 + }, + { + "epoch": 3.7605893186003683, + "grad_norm": 0.33643704652786255, + "learning_rate": 7.175430504332509e-05, + "loss": 1.7839, + "step": 12252 + }, + { + "epoch": 3.7608962553713936, + "grad_norm": 0.24705304205417633, + "learning_rate": 7.174982949915232e-05, + "loss": 1.8302, + "step": 12253 + }, + { + "epoch": 3.7612031921424185, + "grad_norm": 0.3615458309650421, + "learning_rate": 7.174535374003497e-05, + "loss": 1.7963, + "step": 12254 + }, + { + "epoch": 3.761510128913444, + "grad_norm": 0.36486589908599854, + "learning_rate": 7.17408777660173e-05, + "loss": 1.7933, + "step": 12255 + }, + { + "epoch": 3.761817065684469, + "grad_norm": 0.2566867172718048, + "learning_rate": 7.173640157714352e-05, + "loss": 
1.7254, + "step": 12256 + }, + { + "epoch": 3.762124002455494, + "grad_norm": 0.2602523863315582, + "learning_rate": 7.17319251734579e-05, + "loss": 1.7357, + "step": 12257 + }, + { + "epoch": 3.7624309392265194, + "grad_norm": 0.3626105785369873, + "learning_rate": 7.172744855500464e-05, + "loss": 1.7971, + "step": 12258 + }, + { + "epoch": 3.7627378759975443, + "grad_norm": 0.36327603459358215, + "learning_rate": 7.172297172182802e-05, + "loss": 1.7819, + "step": 12259 + }, + { + "epoch": 3.7630448127685696, + "grad_norm": 0.25935736298561096, + "learning_rate": 7.171849467397224e-05, + "loss": 1.8112, + "step": 12260 + }, + { + "epoch": 3.763351749539595, + "grad_norm": 0.2779700756072998, + "learning_rate": 7.171401741148156e-05, + "loss": 1.786, + "step": 12261 + }, + { + "epoch": 3.7636586863106203, + "grad_norm": 0.3089013695716858, + "learning_rate": 7.170953993440025e-05, + "loss": 1.7808, + "step": 12262 + }, + { + "epoch": 3.763965623081645, + "grad_norm": 0.2562308609485626, + "learning_rate": 7.170506224277253e-05, + "loss": 1.8207, + "step": 12263 + }, + { + "epoch": 3.7642725598526705, + "grad_norm": 0.2907634973526001, + "learning_rate": 7.170058433664268e-05, + "loss": 1.7638, + "step": 12264 + }, + { + "epoch": 3.7645794966236954, + "grad_norm": 0.30341312289237976, + "learning_rate": 7.169610621605493e-05, + "loss": 1.7827, + "step": 12265 + }, + { + "epoch": 3.7648864333947207, + "grad_norm": 0.27091866731643677, + "learning_rate": 7.169162788105353e-05, + "loss": 1.786, + "step": 12266 + }, + { + "epoch": 3.765193370165746, + "grad_norm": 0.234042689204216, + "learning_rate": 7.168714933168277e-05, + "loss": 1.7638, + "step": 12267 + }, + { + "epoch": 3.765500306936771, + "grad_norm": 0.2477465271949768, + "learning_rate": 7.168267056798686e-05, + "loss": 1.7275, + "step": 12268 + }, + { + "epoch": 3.7658072437077963, + "grad_norm": 0.25578543543815613, + "learning_rate": 7.167819159001012e-05, + "loss": 1.7831, + "step": 12269 + }, + { + 
"epoch": 3.766114180478821, + "grad_norm": 0.26629674434661865, + "learning_rate": 7.167371239779678e-05, + "loss": 1.7866, + "step": 12270 + }, + { + "epoch": 3.7664211172498465, + "grad_norm": 0.31350967288017273, + "learning_rate": 7.16692329913911e-05, + "loss": 1.7755, + "step": 12271 + }, + { + "epoch": 3.766728054020872, + "grad_norm": 0.2670116126537323, + "learning_rate": 7.166475337083735e-05, + "loss": 1.7524, + "step": 12272 + }, + { + "epoch": 3.7670349907918967, + "grad_norm": 0.26503682136535645, + "learning_rate": 7.166027353617983e-05, + "loss": 1.7867, + "step": 12273 + }, + { + "epoch": 3.767341927562922, + "grad_norm": 0.3674192428588867, + "learning_rate": 7.165579348746278e-05, + "loss": 1.7604, + "step": 12274 + }, + { + "epoch": 3.767648864333947, + "grad_norm": 0.4120824337005615, + "learning_rate": 7.16513132247305e-05, + "loss": 1.7905, + "step": 12275 + }, + { + "epoch": 3.7679558011049723, + "grad_norm": 0.29074826836586, + "learning_rate": 7.164683274802723e-05, + "loss": 1.7539, + "step": 12276 + }, + { + "epoch": 3.7682627378759976, + "grad_norm": 0.22223204374313354, + "learning_rate": 7.164235205739729e-05, + "loss": 1.755, + "step": 12277 + }, + { + "epoch": 3.768569674647023, + "grad_norm": 0.23997461795806885, + "learning_rate": 7.163787115288494e-05, + "loss": 1.8024, + "step": 12278 + }, + { + "epoch": 3.768876611418048, + "grad_norm": 0.2556418776512146, + "learning_rate": 7.163339003453445e-05, + "loss": 1.7717, + "step": 12279 + }, + { + "epoch": 3.769183548189073, + "grad_norm": 0.3107141852378845, + "learning_rate": 7.162890870239013e-05, + "loss": 1.8257, + "step": 12280 + }, + { + "epoch": 3.769490484960098, + "grad_norm": 0.35293644666671753, + "learning_rate": 7.162442715649627e-05, + "loss": 1.7855, + "step": 12281 + }, + { + "epoch": 3.7697974217311234, + "grad_norm": 0.25989311933517456, + "learning_rate": 7.161994539689713e-05, + "loss": 1.7816, + "step": 12282 + }, + { + "epoch": 3.7701043585021488, + 
"grad_norm": 0.25615137815475464, + "learning_rate": 7.161546342363701e-05, + "loss": 1.7738, + "step": 12283 + }, + { + "epoch": 3.7704112952731736, + "grad_norm": 0.29345229268074036, + "learning_rate": 7.161098123676023e-05, + "loss": 1.8496, + "step": 12284 + }, + { + "epoch": 3.770718232044199, + "grad_norm": 0.2975969612598419, + "learning_rate": 7.160649883631105e-05, + "loss": 1.7342, + "step": 12285 + }, + { + "epoch": 3.771025168815224, + "grad_norm": 0.28458064794540405, + "learning_rate": 7.16020162223338e-05, + "loss": 1.8253, + "step": 12286 + }, + { + "epoch": 3.771332105586249, + "grad_norm": 0.2798703908920288, + "learning_rate": 7.159753339487276e-05, + "loss": 1.746, + "step": 12287 + }, + { + "epoch": 3.7716390423572745, + "grad_norm": 0.380044549703598, + "learning_rate": 7.159305035397223e-05, + "loss": 1.769, + "step": 12288 + }, + { + "epoch": 3.7719459791282994, + "grad_norm": 0.28760263323783875, + "learning_rate": 7.158856709967654e-05, + "loss": 1.7466, + "step": 12289 + }, + { + "epoch": 3.7722529158993248, + "grad_norm": 0.23314130306243896, + "learning_rate": 7.158408363202996e-05, + "loss": 1.7545, + "step": 12290 + }, + { + "epoch": 3.7725598526703497, + "grad_norm": 0.2864209711551666, + "learning_rate": 7.15795999510768e-05, + "loss": 1.7549, + "step": 12291 + }, + { + "epoch": 3.772866789441375, + "grad_norm": 0.2605510354042053, + "learning_rate": 7.15751160568614e-05, + "loss": 1.7684, + "step": 12292 + }, + { + "epoch": 3.7731737262124003, + "grad_norm": 0.2475409358739853, + "learning_rate": 7.157063194942806e-05, + "loss": 1.7841, + "step": 12293 + }, + { + "epoch": 3.7734806629834257, + "grad_norm": 0.22479289770126343, + "learning_rate": 7.15661476288211e-05, + "loss": 1.7592, + "step": 12294 + }, + { + "epoch": 3.7737875997544506, + "grad_norm": 0.22076937556266785, + "learning_rate": 7.156166309508482e-05, + "loss": 1.7853, + "step": 12295 + }, + { + "epoch": 3.774094536525476, + "grad_norm": 0.26082465052604675, + 
"learning_rate": 7.155717834826353e-05, + "loss": 1.7828, + "step": 12296 + }, + { + "epoch": 3.7744014732965008, + "grad_norm": 0.24771755933761597, + "learning_rate": 7.15526933884016e-05, + "loss": 1.758, + "step": 12297 + }, + { + "epoch": 3.774708410067526, + "grad_norm": 0.23806311190128326, + "learning_rate": 7.15482082155433e-05, + "loss": 1.7237, + "step": 12298 + }, + { + "epoch": 3.7750153468385514, + "grad_norm": 0.24822844564914703, + "learning_rate": 7.154372282973299e-05, + "loss": 1.7828, + "step": 12299 + }, + { + "epoch": 3.7753222836095763, + "grad_norm": 0.24423740804195404, + "learning_rate": 7.153923723101496e-05, + "loss": 1.8014, + "step": 12300 + }, + { + "epoch": 3.7756292203806017, + "grad_norm": 0.24966634809970856, + "learning_rate": 7.15347514194336e-05, + "loss": 1.8005, + "step": 12301 + }, + { + "epoch": 3.7759361571516266, + "grad_norm": 0.2549348473548889, + "learning_rate": 7.153026539503317e-05, + "loss": 1.8473, + "step": 12302 + }, + { + "epoch": 3.776243093922652, + "grad_norm": 0.23709465563297272, + "learning_rate": 7.152577915785807e-05, + "loss": 1.8031, + "step": 12303 + }, + { + "epoch": 3.7765500306936772, + "grad_norm": 0.28554168343544006, + "learning_rate": 7.152129270795258e-05, + "loss": 1.7836, + "step": 12304 + }, + { + "epoch": 3.776856967464702, + "grad_norm": 0.2568756639957428, + "learning_rate": 7.151680604536107e-05, + "loss": 1.7345, + "step": 12305 + }, + { + "epoch": 3.7771639042357275, + "grad_norm": 0.23883797228336334, + "learning_rate": 7.151231917012787e-05, + "loss": 1.7342, + "step": 12306 + }, + { + "epoch": 3.7774708410067523, + "grad_norm": 0.24026677012443542, + "learning_rate": 7.150783208229732e-05, + "loss": 1.8156, + "step": 12307 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.25756222009658813, + "learning_rate": 7.150334478191376e-05, + "loss": 1.8204, + "step": 12308 + }, + { + "epoch": 3.778084714548803, + "grad_norm": 0.24917428195476532, + "learning_rate": 
7.149885726902156e-05, + "loss": 1.7867, + "step": 12309 + }, + { + "epoch": 3.7783916513198283, + "grad_norm": 0.26269277930259705, + "learning_rate": 7.149436954366504e-05, + "loss": 1.8233, + "step": 12310 + }, + { + "epoch": 3.7786985880908532, + "grad_norm": 0.2502293586730957, + "learning_rate": 7.148988160588857e-05, + "loss": 1.8329, + "step": 12311 + }, + { + "epoch": 3.7790055248618786, + "grad_norm": 0.24845796823501587, + "learning_rate": 7.14853934557365e-05, + "loss": 1.7936, + "step": 12312 + }, + { + "epoch": 3.7793124616329035, + "grad_norm": 0.2453537881374359, + "learning_rate": 7.148090509325315e-05, + "loss": 1.8149, + "step": 12313 + }, + { + "epoch": 3.779619398403929, + "grad_norm": 0.2336922138929367, + "learning_rate": 7.147641651848293e-05, + "loss": 1.7826, + "step": 12314 + }, + { + "epoch": 3.779926335174954, + "grad_norm": 0.25542667508125305, + "learning_rate": 7.147192773147017e-05, + "loss": 1.801, + "step": 12315 + }, + { + "epoch": 3.780233271945979, + "grad_norm": 0.2301866114139557, + "learning_rate": 7.146743873225923e-05, + "loss": 1.7302, + "step": 12316 + }, + { + "epoch": 3.7805402087170044, + "grad_norm": 0.25821468234062195, + "learning_rate": 7.14629495208945e-05, + "loss": 1.7704, + "step": 12317 + }, + { + "epoch": 3.7808471454880292, + "grad_norm": 0.22537970542907715, + "learning_rate": 7.145846009742029e-05, + "loss": 1.7281, + "step": 12318 + }, + { + "epoch": 3.7811540822590546, + "grad_norm": 0.2565869688987732, + "learning_rate": 7.145397046188102e-05, + "loss": 1.8077, + "step": 12319 + }, + { + "epoch": 3.78146101903008, + "grad_norm": 0.2588396966457367, + "learning_rate": 7.144948061432105e-05, + "loss": 1.7438, + "step": 12320 + }, + { + "epoch": 3.781767955801105, + "grad_norm": 0.2538135349750519, + "learning_rate": 7.144499055478472e-05, + "loss": 1.8253, + "step": 12321 + }, + { + "epoch": 3.78207489257213, + "grad_norm": 0.2272680401802063, + "learning_rate": 7.144050028331644e-05, + "loss": 1.7408, + 
"step": 12322 + }, + { + "epoch": 3.782381829343155, + "grad_norm": 0.25010406970977783, + "learning_rate": 7.143600979996055e-05, + "loss": 1.8219, + "step": 12323 + }, + { + "epoch": 3.7826887661141804, + "grad_norm": 0.2560291290283203, + "learning_rate": 7.143151910476144e-05, + "loss": 1.7734, + "step": 12324 + }, + { + "epoch": 3.7829957028852057, + "grad_norm": 0.24927431344985962, + "learning_rate": 7.142702819776352e-05, + "loss": 1.7682, + "step": 12325 + }, + { + "epoch": 3.783302639656231, + "grad_norm": 0.2501368224620819, + "learning_rate": 7.142253707901114e-05, + "loss": 1.818, + "step": 12326 + }, + { + "epoch": 3.783609576427256, + "grad_norm": 0.3132917284965515, + "learning_rate": 7.141804574854871e-05, + "loss": 1.7793, + "step": 12327 + }, + { + "epoch": 3.7839165131982813, + "grad_norm": 0.24229925870895386, + "learning_rate": 7.141355420642057e-05, + "loss": 1.7585, + "step": 12328 + }, + { + "epoch": 3.784223449969306, + "grad_norm": 0.22612906992435455, + "learning_rate": 7.140906245267116e-05, + "loss": 1.7374, + "step": 12329 + }, + { + "epoch": 3.7845303867403315, + "grad_norm": 0.26354333758354187, + "learning_rate": 7.140457048734482e-05, + "loss": 1.7751, + "step": 12330 + }, + { + "epoch": 3.784837323511357, + "grad_norm": 0.21500451862812042, + "learning_rate": 7.140007831048599e-05, + "loss": 1.7827, + "step": 12331 + }, + { + "epoch": 3.7851442602823817, + "grad_norm": 0.2826332151889801, + "learning_rate": 7.139558592213904e-05, + "loss": 1.7522, + "step": 12332 + }, + { + "epoch": 3.785451197053407, + "grad_norm": 0.3217725455760956, + "learning_rate": 7.139109332234837e-05, + "loss": 1.8758, + "step": 12333 + }, + { + "epoch": 3.785758133824432, + "grad_norm": 0.26934614777565, + "learning_rate": 7.138660051115837e-05, + "loss": 1.8322, + "step": 12334 + }, + { + "epoch": 3.7860650705954573, + "grad_norm": 0.2653827667236328, + "learning_rate": 7.138210748861346e-05, + "loss": 1.7651, + "step": 12335 + }, + { + "epoch": 
3.7863720073664826, + "grad_norm": 0.30470311641693115, + "learning_rate": 7.137761425475802e-05, + "loss": 1.855, + "step": 12336 + }, + { + "epoch": 3.786678944137508, + "grad_norm": 0.2558726370334625, + "learning_rate": 7.137312080963647e-05, + "loss": 1.7174, + "step": 12337 + }, + { + "epoch": 3.786985880908533, + "grad_norm": 0.24025602638721466, + "learning_rate": 7.136862715329322e-05, + "loss": 1.7565, + "step": 12338 + }, + { + "epoch": 3.787292817679558, + "grad_norm": 0.34205392003059387, + "learning_rate": 7.136413328577267e-05, + "loss": 1.8116, + "step": 12339 + }, + { + "epoch": 3.787599754450583, + "grad_norm": 0.4069152772426605, + "learning_rate": 7.135963920711923e-05, + "loss": 1.7662, + "step": 12340 + }, + { + "epoch": 3.7879066912216084, + "grad_norm": 0.3915627598762512, + "learning_rate": 7.13551449173773e-05, + "loss": 1.81, + "step": 12341 + }, + { + "epoch": 3.7882136279926337, + "grad_norm": 0.27136507630348206, + "learning_rate": 7.135065041659134e-05, + "loss": 1.7845, + "step": 12342 + }, + { + "epoch": 3.7885205647636586, + "grad_norm": 0.2924078106880188, + "learning_rate": 7.134615570480572e-05, + "loss": 1.8606, + "step": 12343 + }, + { + "epoch": 3.788827501534684, + "grad_norm": 0.35581526160240173, + "learning_rate": 7.134166078206488e-05, + "loss": 1.7785, + "step": 12344 + }, + { + "epoch": 3.789134438305709, + "grad_norm": 0.3003756105899811, + "learning_rate": 7.133716564841324e-05, + "loss": 1.7321, + "step": 12345 + }, + { + "epoch": 3.789441375076734, + "grad_norm": 0.2586000859737396, + "learning_rate": 7.133267030389524e-05, + "loss": 1.7889, + "step": 12346 + }, + { + "epoch": 3.7897483118477595, + "grad_norm": 0.28053075075149536, + "learning_rate": 7.132817474855527e-05, + "loss": 1.8216, + "step": 12347 + }, + { + "epoch": 3.7900552486187844, + "grad_norm": 0.3064870834350586, + "learning_rate": 7.132367898243777e-05, + "loss": 1.7528, + "step": 12348 + }, + { + "epoch": 3.7903621853898097, + "grad_norm": 
0.3045158386230469, + "learning_rate": 7.131918300558719e-05, + "loss": 1.8251, + "step": 12349 + }, + { + "epoch": 3.7906691221608346, + "grad_norm": 0.2438485324382782, + "learning_rate": 7.131468681804794e-05, + "loss": 1.7505, + "step": 12350 + }, + { + "epoch": 3.79097605893186, + "grad_norm": 0.24239958822727203, + "learning_rate": 7.131019041986447e-05, + "loss": 1.7544, + "step": 12351 + }, + { + "epoch": 3.7912829957028853, + "grad_norm": 0.24632441997528076, + "learning_rate": 7.130569381108121e-05, + "loss": 1.7485, + "step": 12352 + }, + { + "epoch": 3.7915899324739106, + "grad_norm": 0.22553624212741852, + "learning_rate": 7.13011969917426e-05, + "loss": 1.803, + "step": 12353 + }, + { + "epoch": 3.7918968692449355, + "grad_norm": 0.2164420485496521, + "learning_rate": 7.129669996189306e-05, + "loss": 1.7307, + "step": 12354 + }, + { + "epoch": 3.792203806015961, + "grad_norm": 0.25104281306266785, + "learning_rate": 7.129220272157705e-05, + "loss": 1.8154, + "step": 12355 + }, + { + "epoch": 3.7925107427869857, + "grad_norm": 0.25533202290534973, + "learning_rate": 7.128770527083903e-05, + "loss": 1.8046, + "step": 12356 + }, + { + "epoch": 3.792817679558011, + "grad_norm": 0.24428130686283112, + "learning_rate": 7.128320760972341e-05, + "loss": 1.7984, + "step": 12357 + }, + { + "epoch": 3.7931246163290364, + "grad_norm": 0.2366408109664917, + "learning_rate": 7.127870973827467e-05, + "loss": 1.7781, + "step": 12358 + }, + { + "epoch": 3.7934315531000613, + "grad_norm": 0.2558888792991638, + "learning_rate": 7.127421165653722e-05, + "loss": 1.7858, + "step": 12359 + }, + { + "epoch": 3.7937384898710866, + "grad_norm": 0.25825443863868713, + "learning_rate": 7.126971336455558e-05, + "loss": 1.8292, + "step": 12360 + }, + { + "epoch": 3.7940454266421115, + "grad_norm": 0.2554624080657959, + "learning_rate": 7.126521486237415e-05, + "loss": 1.822, + "step": 12361 + }, + { + "epoch": 3.794352363413137, + "grad_norm": 0.3030763268470764, + 
"learning_rate": 7.126071615003742e-05, + "loss": 1.8261, + "step": 12362 + }, + { + "epoch": 3.794659300184162, + "grad_norm": 0.3047907054424286, + "learning_rate": 7.125621722758981e-05, + "loss": 1.8419, + "step": 12363 + }, + { + "epoch": 3.794966236955187, + "grad_norm": 0.27782654762268066, + "learning_rate": 7.12517180950758e-05, + "loss": 1.7959, + "step": 12364 + }, + { + "epoch": 3.7952731737262124, + "grad_norm": 0.24526572227478027, + "learning_rate": 7.124721875253986e-05, + "loss": 1.7313, + "step": 12365 + }, + { + "epoch": 3.7955801104972373, + "grad_norm": 0.23718179762363434, + "learning_rate": 7.124271920002646e-05, + "loss": 1.7479, + "step": 12366 + }, + { + "epoch": 3.7958870472682626, + "grad_norm": 0.2880019247531891, + "learning_rate": 7.123821943758004e-05, + "loss": 1.7792, + "step": 12367 + }, + { + "epoch": 3.796193984039288, + "grad_norm": 0.28923723101615906, + "learning_rate": 7.123371946524511e-05, + "loss": 1.7474, + "step": 12368 + }, + { + "epoch": 3.7965009208103133, + "grad_norm": 0.2281525880098343, + "learning_rate": 7.122921928306612e-05, + "loss": 1.8106, + "step": 12369 + }, + { + "epoch": 3.796807857581338, + "grad_norm": 0.34825438261032104, + "learning_rate": 7.122471889108752e-05, + "loss": 1.8076, + "step": 12370 + }, + { + "epoch": 3.7971147943523635, + "grad_norm": 0.41145995259284973, + "learning_rate": 7.122021828935382e-05, + "loss": 1.7692, + "step": 12371 + }, + { + "epoch": 3.7974217311233884, + "grad_norm": 0.31711262464523315, + "learning_rate": 7.12157174779095e-05, + "loss": 1.8101, + "step": 12372 + }, + { + "epoch": 3.7977286678944138, + "grad_norm": 0.3044308125972748, + "learning_rate": 7.1211216456799e-05, + "loss": 1.8238, + "step": 12373 + }, + { + "epoch": 3.798035604665439, + "grad_norm": 0.3750055134296417, + "learning_rate": 7.120671522606683e-05, + "loss": 1.7323, + "step": 12374 + }, + { + "epoch": 3.798342541436464, + "grad_norm": 0.38852599263191223, + "learning_rate": 
7.120221378575749e-05, + "loss": 1.8402, + "step": 12375 + }, + { + "epoch": 3.7986494782074893, + "grad_norm": 0.3430371582508087, + "learning_rate": 7.119771213591541e-05, + "loss": 1.8369, + "step": 12376 + }, + { + "epoch": 3.798956414978514, + "grad_norm": 0.4787428677082062, + "learning_rate": 7.119321027658515e-05, + "loss": 1.7977, + "step": 12377 + }, + { + "epoch": 3.7992633517495396, + "grad_norm": 0.4263977110385895, + "learning_rate": 7.118870820781114e-05, + "loss": 1.8208, + "step": 12378 + }, + { + "epoch": 3.799570288520565, + "grad_norm": 0.28649669885635376, + "learning_rate": 7.118420592963793e-05, + "loss": 1.773, + "step": 12379 + }, + { + "epoch": 3.7998772252915898, + "grad_norm": 0.26070261001586914, + "learning_rate": 7.117970344210996e-05, + "loss": 1.6866, + "step": 12380 + }, + { + "epoch": 3.800184162062615, + "grad_norm": 0.30127593874931335, + "learning_rate": 7.117520074527173e-05, + "loss": 1.7208, + "step": 12381 + }, + { + "epoch": 3.80049109883364, + "grad_norm": 0.23639258742332458, + "learning_rate": 7.117069783916777e-05, + "loss": 1.7504, + "step": 12382 + }, + { + "epoch": 3.8007980356046653, + "grad_norm": 0.2852858901023865, + "learning_rate": 7.116619472384256e-05, + "loss": 1.7954, + "step": 12383 + }, + { + "epoch": 3.8011049723756907, + "grad_norm": 0.2673225998878479, + "learning_rate": 7.116169139934063e-05, + "loss": 1.7562, + "step": 12384 + }, + { + "epoch": 3.801411909146716, + "grad_norm": 0.21615394949913025, + "learning_rate": 7.115718786570644e-05, + "loss": 1.7126, + "step": 12385 + }, + { + "epoch": 3.801718845917741, + "grad_norm": 0.2165435254573822, + "learning_rate": 7.115268412298453e-05, + "loss": 1.7171, + "step": 12386 + }, + { + "epoch": 3.8020257826887662, + "grad_norm": 0.280564546585083, + "learning_rate": 7.114818017121939e-05, + "loss": 1.7711, + "step": 12387 + }, + { + "epoch": 3.802332719459791, + "grad_norm": 0.3023521304130554, + "learning_rate": 7.114367601045555e-05, + "loss": 1.7538, 
+ "step": 12388 + }, + { + "epoch": 3.8026396562308165, + "grad_norm": 0.27252480387687683, + "learning_rate": 7.11391716407375e-05, + "loss": 1.7604, + "step": 12389 + }, + { + "epoch": 3.802946593001842, + "grad_norm": 0.2122909128665924, + "learning_rate": 7.113466706210976e-05, + "loss": 1.716, + "step": 12390 + }, + { + "epoch": 3.8032535297728667, + "grad_norm": 0.30141574144363403, + "learning_rate": 7.113016227461686e-05, + "loss": 1.7636, + "step": 12391 + }, + { + "epoch": 3.803560466543892, + "grad_norm": 0.33359697461128235, + "learning_rate": 7.112565727830331e-05, + "loss": 1.7805, + "step": 12392 + }, + { + "epoch": 3.803867403314917, + "grad_norm": 0.3161376714706421, + "learning_rate": 7.112115207321364e-05, + "loss": 1.7974, + "step": 12393 + }, + { + "epoch": 3.8041743400859422, + "grad_norm": 0.29028698801994324, + "learning_rate": 7.111664665939235e-05, + "loss": 1.83, + "step": 12394 + }, + { + "epoch": 3.8044812768569676, + "grad_norm": 0.38829556107521057, + "learning_rate": 7.1112141036884e-05, + "loss": 1.8684, + "step": 12395 + }, + { + "epoch": 3.804788213627993, + "grad_norm": 0.4118283987045288, + "learning_rate": 7.110763520573309e-05, + "loss": 1.7812, + "step": 12396 + }, + { + "epoch": 3.805095150399018, + "grad_norm": 0.3907717168331146, + "learning_rate": 7.110312916598416e-05, + "loss": 1.7789, + "step": 12397 + }, + { + "epoch": 3.805402087170043, + "grad_norm": 0.2768644690513611, + "learning_rate": 7.109862291768173e-05, + "loss": 1.8575, + "step": 12398 + }, + { + "epoch": 3.805709023941068, + "grad_norm": 0.3234006464481354, + "learning_rate": 7.109411646087035e-05, + "loss": 1.7485, + "step": 12399 + }, + { + "epoch": 3.8060159607120934, + "grad_norm": 0.415475994348526, + "learning_rate": 7.108960979559454e-05, + "loss": 1.7363, + "step": 12400 + }, + { + "epoch": 3.8063228974831187, + "grad_norm": 0.38654613494873047, + "learning_rate": 7.108510292189884e-05, + "loss": 1.7907, + "step": 12401 + }, + { + "epoch": 
3.8066298342541436, + "grad_norm": 0.2541481852531433, + "learning_rate": 7.10805958398278e-05, + "loss": 1.8458, + "step": 12402 + }, + { + "epoch": 3.806936771025169, + "grad_norm": 0.32562851905822754, + "learning_rate": 7.107608854942597e-05, + "loss": 1.7989, + "step": 12403 + }, + { + "epoch": 3.807243707796194, + "grad_norm": 0.3628395199775696, + "learning_rate": 7.107158105073786e-05, + "loss": 1.8044, + "step": 12404 + }, + { + "epoch": 3.807550644567219, + "grad_norm": 0.3363969027996063, + "learning_rate": 7.106707334380805e-05, + "loss": 1.8078, + "step": 12405 + }, + { + "epoch": 3.8078575813382445, + "grad_norm": 0.2853989601135254, + "learning_rate": 7.106256542868108e-05, + "loss": 1.7913, + "step": 12406 + }, + { + "epoch": 3.8081645181092694, + "grad_norm": 0.33455806970596313, + "learning_rate": 7.105805730540148e-05, + "loss": 1.7252, + "step": 12407 + }, + { + "epoch": 3.8084714548802947, + "grad_norm": 0.28103405237197876, + "learning_rate": 7.105354897401382e-05, + "loss": 1.6942, + "step": 12408 + }, + { + "epoch": 3.8087783916513196, + "grad_norm": 0.23230718076229095, + "learning_rate": 7.104904043456264e-05, + "loss": 1.7723, + "step": 12409 + }, + { + "epoch": 3.809085328422345, + "grad_norm": 0.2883053421974182, + "learning_rate": 7.104453168709251e-05, + "loss": 1.8015, + "step": 12410 + }, + { + "epoch": 3.8093922651933703, + "grad_norm": 0.28462252020835876, + "learning_rate": 7.104002273164798e-05, + "loss": 1.791, + "step": 12411 + }, + { + "epoch": 3.8096992019643956, + "grad_norm": 0.3004699647426605, + "learning_rate": 7.103551356827363e-05, + "loss": 1.8401, + "step": 12412 + }, + { + "epoch": 3.8100061387354205, + "grad_norm": 0.2546156048774719, + "learning_rate": 7.1031004197014e-05, + "loss": 1.7645, + "step": 12413 + }, + { + "epoch": 3.810313075506446, + "grad_norm": 0.24532915651798248, + "learning_rate": 7.102649461791364e-05, + "loss": 1.8, + "step": 12414 + }, + { + "epoch": 3.8106200122774707, + "grad_norm": 
0.2432405799627304, + "learning_rate": 7.102198483101716e-05, + "loss": 1.7957, + "step": 12415 + }, + { + "epoch": 3.810926949048496, + "grad_norm": 0.24405215680599213, + "learning_rate": 7.101747483636908e-05, + "loss": 1.79, + "step": 12416 + }, + { + "epoch": 3.8112338858195214, + "grad_norm": 0.29519838094711304, + "learning_rate": 7.101296463401401e-05, + "loss": 1.8087, + "step": 12417 + }, + { + "epoch": 3.8115408225905463, + "grad_norm": 0.28205612301826477, + "learning_rate": 7.100845422399652e-05, + "loss": 1.7897, + "step": 12418 + }, + { + "epoch": 3.8118477593615716, + "grad_norm": 0.25014567375183105, + "learning_rate": 7.100394360636115e-05, + "loss": 1.7574, + "step": 12419 + }, + { + "epoch": 3.8121546961325965, + "grad_norm": 0.3133499026298523, + "learning_rate": 7.099943278115251e-05, + "loss": 1.7957, + "step": 12420 + }, + { + "epoch": 3.812461632903622, + "grad_norm": 0.3706473708152771, + "learning_rate": 7.099492174841516e-05, + "loss": 1.8519, + "step": 12421 + }, + { + "epoch": 3.812768569674647, + "grad_norm": 0.30085715651512146, + "learning_rate": 7.09904105081937e-05, + "loss": 1.778, + "step": 12422 + }, + { + "epoch": 3.813075506445672, + "grad_norm": 0.23897981643676758, + "learning_rate": 7.09858990605327e-05, + "loss": 1.7289, + "step": 12423 + }, + { + "epoch": 3.8133824432166974, + "grad_norm": 0.30046290159225464, + "learning_rate": 7.098138740547673e-05, + "loss": 1.8838, + "step": 12424 + }, + { + "epoch": 3.8136893799877223, + "grad_norm": 0.32126328349113464, + "learning_rate": 7.097687554307041e-05, + "loss": 1.7916, + "step": 12425 + }, + { + "epoch": 3.8139963167587476, + "grad_norm": 0.2922256886959076, + "learning_rate": 7.097236347335829e-05, + "loss": 1.8305, + "step": 12426 + }, + { + "epoch": 3.814303253529773, + "grad_norm": 0.2772706151008606, + "learning_rate": 7.0967851196385e-05, + "loss": 1.7694, + "step": 12427 + }, + { + "epoch": 3.8146101903007983, + "grad_norm": 0.25763455033302307, + "learning_rate": 
7.096333871219511e-05, + "loss": 1.8716, + "step": 12428 + }, + { + "epoch": 3.814917127071823, + "grad_norm": 0.2631739377975464, + "learning_rate": 7.095882602083322e-05, + "loss": 1.7771, + "step": 12429 + }, + { + "epoch": 3.8152240638428485, + "grad_norm": 0.29229632019996643, + "learning_rate": 7.095431312234392e-05, + "loss": 1.7865, + "step": 12430 + }, + { + "epoch": 3.8155310006138734, + "grad_norm": 0.2672729790210724, + "learning_rate": 7.094980001677181e-05, + "loss": 1.7848, + "step": 12431 + }, + { + "epoch": 3.8158379373848987, + "grad_norm": 0.2388373166322708, + "learning_rate": 7.094528670416152e-05, + "loss": 1.75, + "step": 12432 + }, + { + "epoch": 3.816144874155924, + "grad_norm": 0.2385305017232895, + "learning_rate": 7.094077318455762e-05, + "loss": 1.748, + "step": 12433 + }, + { + "epoch": 3.816451810926949, + "grad_norm": 0.25421401858329773, + "learning_rate": 7.093625945800471e-05, + "loss": 1.779, + "step": 12434 + }, + { + "epoch": 3.8167587476979743, + "grad_norm": 0.2785158157348633, + "learning_rate": 7.093174552454743e-05, + "loss": 1.8295, + "step": 12435 + }, + { + "epoch": 3.817065684468999, + "grad_norm": 0.2907472252845764, + "learning_rate": 7.092723138423036e-05, + "loss": 1.8216, + "step": 12436 + }, + { + "epoch": 3.8173726212400245, + "grad_norm": 0.253955215215683, + "learning_rate": 7.092271703709814e-05, + "loss": 1.8394, + "step": 12437 + }, + { + "epoch": 3.81767955801105, + "grad_norm": 0.32139912247657776, + "learning_rate": 7.091820248319537e-05, + "loss": 1.8634, + "step": 12438 + }, + { + "epoch": 3.8179864947820747, + "grad_norm": 0.25890466570854187, + "learning_rate": 7.091368772256664e-05, + "loss": 1.7336, + "step": 12439 + }, + { + "epoch": 3.8182934315531, + "grad_norm": 0.2823775112628937, + "learning_rate": 7.090917275525661e-05, + "loss": 1.7927, + "step": 12440 + }, + { + "epoch": 3.818600368324125, + "grad_norm": 0.28739333152770996, + "learning_rate": 7.090465758130988e-05, + "loss": 1.7807, + 
"step": 12441 + }, + { + "epoch": 3.8189073050951503, + "grad_norm": 0.36823949217796326, + "learning_rate": 7.090014220077106e-05, + "loss": 1.7288, + "step": 12442 + }, + { + "epoch": 3.8192142418661756, + "grad_norm": 0.3061312735080719, + "learning_rate": 7.089562661368479e-05, + "loss": 1.8039, + "step": 12443 + }, + { + "epoch": 3.819521178637201, + "grad_norm": 0.25867924094200134, + "learning_rate": 7.089111082009569e-05, + "loss": 1.7678, + "step": 12444 + }, + { + "epoch": 3.819828115408226, + "grad_norm": 0.26834985613822937, + "learning_rate": 7.088659482004837e-05, + "loss": 1.7592, + "step": 12445 + }, + { + "epoch": 3.820135052179251, + "grad_norm": 0.25608211755752563, + "learning_rate": 7.08820786135875e-05, + "loss": 1.7622, + "step": 12446 + }, + { + "epoch": 3.820441988950276, + "grad_norm": 0.2512456774711609, + "learning_rate": 7.087756220075769e-05, + "loss": 1.7648, + "step": 12447 + }, + { + "epoch": 3.8207489257213014, + "grad_norm": 0.2434878647327423, + "learning_rate": 7.087304558160355e-05, + "loss": 1.7435, + "step": 12448 + }, + { + "epoch": 3.8210558624923268, + "grad_norm": 0.26456570625305176, + "learning_rate": 7.086852875616978e-05, + "loss": 1.7342, + "step": 12449 + }, + { + "epoch": 3.8213627992633517, + "grad_norm": 0.2958984971046448, + "learning_rate": 7.086401172450095e-05, + "loss": 1.8532, + "step": 12450 + }, + { + "epoch": 3.821669736034377, + "grad_norm": 0.25939157605171204, + "learning_rate": 7.085949448664172e-05, + "loss": 1.7746, + "step": 12451 + }, + { + "epoch": 3.821976672805402, + "grad_norm": 0.2210223525762558, + "learning_rate": 7.085497704263675e-05, + "loss": 1.7745, + "step": 12452 + }, + { + "epoch": 3.822283609576427, + "grad_norm": 0.2409319430589676, + "learning_rate": 7.085045939253068e-05, + "loss": 1.7981, + "step": 12453 + }, + { + "epoch": 3.8225905463474525, + "grad_norm": 0.26331812143325806, + "learning_rate": 7.084594153636815e-05, + "loss": 1.8163, + "step": 12454 + }, + { + "epoch": 
3.8228974831184774, + "grad_norm": 0.2613828480243683, + "learning_rate": 7.08414234741938e-05, + "loss": 1.8362, + "step": 12455 + }, + { + "epoch": 3.8232044198895028, + "grad_norm": 0.3139529228210449, + "learning_rate": 7.083690520605228e-05, + "loss": 1.8247, + "step": 12456 + }, + { + "epoch": 3.8235113566605277, + "grad_norm": 0.2958570718765259, + "learning_rate": 7.083238673198826e-05, + "loss": 1.8011, + "step": 12457 + }, + { + "epoch": 3.823818293431553, + "grad_norm": 0.2517626881599426, + "learning_rate": 7.082786805204639e-05, + "loss": 1.7353, + "step": 12458 + }, + { + "epoch": 3.8241252302025783, + "grad_norm": 0.2443888783454895, + "learning_rate": 7.082334916627132e-05, + "loss": 1.7916, + "step": 12459 + }, + { + "epoch": 3.8244321669736037, + "grad_norm": 0.283514142036438, + "learning_rate": 7.08188300747077e-05, + "loss": 1.8048, + "step": 12460 + }, + { + "epoch": 3.8247391037446286, + "grad_norm": 0.24775351583957672, + "learning_rate": 7.08143107774002e-05, + "loss": 1.8145, + "step": 12461 + }, + { + "epoch": 3.825046040515654, + "grad_norm": 0.27904003858566284, + "learning_rate": 7.080979127439347e-05, + "loss": 1.8003, + "step": 12462 + }, + { + "epoch": 3.825352977286679, + "grad_norm": 0.24997512996196747, + "learning_rate": 7.08052715657322e-05, + "loss": 1.7962, + "step": 12463 + }, + { + "epoch": 3.825659914057704, + "grad_norm": 0.25874343514442444, + "learning_rate": 7.080075165146104e-05, + "loss": 1.7861, + "step": 12464 + }, + { + "epoch": 3.8259668508287294, + "grad_norm": 0.2964434027671814, + "learning_rate": 7.079623153162467e-05, + "loss": 1.7618, + "step": 12465 + }, + { + "epoch": 3.8262737875997543, + "grad_norm": 0.26403337717056274, + "learning_rate": 7.079171120626774e-05, + "loss": 1.8016, + "step": 12466 + }, + { + "epoch": 3.8265807243707797, + "grad_norm": 0.28369295597076416, + "learning_rate": 7.078719067543494e-05, + "loss": 1.7517, + "step": 12467 + }, + { + "epoch": 3.8268876611418046, + "grad_norm": 
0.254312127828598, + "learning_rate": 7.078266993917093e-05, + "loss": 1.8085, + "step": 12468 + }, + { + "epoch": 3.82719459791283, + "grad_norm": 0.24992622435092926, + "learning_rate": 7.077814899752038e-05, + "loss": 1.7657, + "step": 12469 + }, + { + "epoch": 3.8275015346838552, + "grad_norm": 0.26485762000083923, + "learning_rate": 7.077362785052802e-05, + "loss": 1.7303, + "step": 12470 + }, + { + "epoch": 3.8278084714548806, + "grad_norm": 0.29864901304244995, + "learning_rate": 7.076910649823846e-05, + "loss": 1.7734, + "step": 12471 + }, + { + "epoch": 3.8281154082259055, + "grad_norm": 0.2973599433898926, + "learning_rate": 7.076458494069644e-05, + "loss": 1.8055, + "step": 12472 + }, + { + "epoch": 3.828422344996931, + "grad_norm": 0.2150362730026245, + "learning_rate": 7.07600631779466e-05, + "loss": 1.7377, + "step": 12473 + }, + { + "epoch": 3.8287292817679557, + "grad_norm": 0.26443010568618774, + "learning_rate": 7.075554121003367e-05, + "loss": 1.837, + "step": 12474 + }, + { + "epoch": 3.829036218538981, + "grad_norm": 0.27365007996559143, + "learning_rate": 7.075101903700231e-05, + "loss": 1.7784, + "step": 12475 + }, + { + "epoch": 3.8293431553100064, + "grad_norm": 0.22037263214588165, + "learning_rate": 7.074649665889721e-05, + "loss": 1.8182, + "step": 12476 + }, + { + "epoch": 3.8296500920810312, + "grad_norm": 0.29614946246147156, + "learning_rate": 7.074197407576308e-05, + "loss": 1.7993, + "step": 12477 + }, + { + "epoch": 3.8299570288520566, + "grad_norm": 0.25135520100593567, + "learning_rate": 7.07374512876446e-05, + "loss": 1.8211, + "step": 12478 + }, + { + "epoch": 3.8302639656230815, + "grad_norm": 0.2711503207683563, + "learning_rate": 7.073292829458645e-05, + "loss": 1.8274, + "step": 12479 + }, + { + "epoch": 3.830570902394107, + "grad_norm": 0.38659265637397766, + "learning_rate": 7.072840509663338e-05, + "loss": 1.796, + "step": 12480 + }, + { + "epoch": 3.830877839165132, + "grad_norm": 0.39382728934288025, + 
"learning_rate": 7.072388169383005e-05, + "loss": 1.8439, + "step": 12481 + }, + { + "epoch": 3.831184775936157, + "grad_norm": 0.27570033073425293, + "learning_rate": 7.071935808622118e-05, + "loss": 1.8155, + "step": 12482 + }, + { + "epoch": 3.8314917127071824, + "grad_norm": 0.29054465889930725, + "learning_rate": 7.071483427385147e-05, + "loss": 1.754, + "step": 12483 + }, + { + "epoch": 3.8317986494782073, + "grad_norm": 0.4138031303882599, + "learning_rate": 7.071031025676562e-05, + "loss": 1.7686, + "step": 12484 + }, + { + "epoch": 3.8321055862492326, + "grad_norm": 0.3447251617908478, + "learning_rate": 7.070578603500833e-05, + "loss": 1.8135, + "step": 12485 + }, + { + "epoch": 3.832412523020258, + "grad_norm": 0.265115886926651, + "learning_rate": 7.070126160862436e-05, + "loss": 1.803, + "step": 12486 + }, + { + "epoch": 3.8327194597912833, + "grad_norm": 0.4288817346096039, + "learning_rate": 7.069673697765837e-05, + "loss": 1.7814, + "step": 12487 + }, + { + "epoch": 3.833026396562308, + "grad_norm": 0.4890103340148926, + "learning_rate": 7.06922121421551e-05, + "loss": 1.8318, + "step": 12488 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.3676142990589142, + "learning_rate": 7.068768710215928e-05, + "loss": 1.7792, + "step": 12489 + }, + { + "epoch": 3.8336402701043584, + "grad_norm": 0.23254090547561646, + "learning_rate": 7.068316185771557e-05, + "loss": 1.7154, + "step": 12490 + }, + { + "epoch": 3.8339472068753837, + "grad_norm": 0.35014036297798157, + "learning_rate": 7.067863640886876e-05, + "loss": 1.7031, + "step": 12491 + }, + { + "epoch": 3.834254143646409, + "grad_norm": 0.32155317068099976, + "learning_rate": 7.067411075566353e-05, + "loss": 1.7692, + "step": 12492 + }, + { + "epoch": 3.834561080417434, + "grad_norm": 0.260772705078125, + "learning_rate": 7.066958489814463e-05, + "loss": 1.7488, + "step": 12493 + }, + { + "epoch": 3.8348680171884593, + "grad_norm": 0.2624910771846771, + "learning_rate": 7.066505883635678e-05, + 
"loss": 1.7436, + "step": 12494 + }, + { + "epoch": 3.835174953959484, + "grad_norm": 0.2782299220561981, + "learning_rate": 7.066053257034471e-05, + "loss": 1.8219, + "step": 12495 + }, + { + "epoch": 3.8354818907305095, + "grad_norm": 0.2749497890472412, + "learning_rate": 7.065600610015312e-05, + "loss": 1.8068, + "step": 12496 + }, + { + "epoch": 3.835788827501535, + "grad_norm": 0.2730359733104706, + "learning_rate": 7.06514794258268e-05, + "loss": 1.7588, + "step": 12497 + }, + { + "epoch": 3.8360957642725597, + "grad_norm": 0.3606291711330414, + "learning_rate": 7.064695254741044e-05, + "loss": 1.8509, + "step": 12498 + }, + { + "epoch": 3.836402701043585, + "grad_norm": 0.23282989859580994, + "learning_rate": 7.064242546494879e-05, + "loss": 1.7444, + "step": 12499 + }, + { + "epoch": 3.83670963781461, + "grad_norm": 0.2554507255554199, + "learning_rate": 7.06378981784866e-05, + "loss": 1.7486, + "step": 12500 + }, + { + "epoch": 3.8370165745856353, + "grad_norm": 0.2916143834590912, + "learning_rate": 7.06333706880686e-05, + "loss": 1.8035, + "step": 12501 + }, + { + "epoch": 3.8373235113566606, + "grad_norm": 0.23719090223312378, + "learning_rate": 7.062884299373955e-05, + "loss": 1.7896, + "step": 12502 + }, + { + "epoch": 3.837630448127686, + "grad_norm": 0.2596152126789093, + "learning_rate": 7.062431509554417e-05, + "loss": 1.7944, + "step": 12503 + }, + { + "epoch": 3.837937384898711, + "grad_norm": 0.29140764474868774, + "learning_rate": 7.061978699352723e-05, + "loss": 1.7988, + "step": 12504 + }, + { + "epoch": 3.838244321669736, + "grad_norm": 0.3421068489551544, + "learning_rate": 7.061525868773347e-05, + "loss": 1.751, + "step": 12505 + }, + { + "epoch": 3.838551258440761, + "grad_norm": 0.2705349624156952, + "learning_rate": 7.061073017820764e-05, + "loss": 1.7578, + "step": 12506 + }, + { + "epoch": 3.8388581952117864, + "grad_norm": 0.2403286248445511, + "learning_rate": 7.060620146499448e-05, + "loss": 1.8422, + "step": 12507 + }, + { + 
"epoch": 3.8391651319828117, + "grad_norm": 0.3860442042350769, + "learning_rate": 7.060167254813876e-05, + "loss": 1.8168, + "step": 12508 + }, + { + "epoch": 3.8394720687538366, + "grad_norm": 0.4729512631893158, + "learning_rate": 7.059714342768526e-05, + "loss": 1.7786, + "step": 12509 + }, + { + "epoch": 3.839779005524862, + "grad_norm": 0.3522968888282776, + "learning_rate": 7.059261410367871e-05, + "loss": 1.8749, + "step": 12510 + }, + { + "epoch": 3.840085942295887, + "grad_norm": 0.28071436285972595, + "learning_rate": 7.058808457616386e-05, + "loss": 1.7959, + "step": 12511 + }, + { + "epoch": 3.840392879066912, + "grad_norm": 0.4356439411640167, + "learning_rate": 7.05835548451855e-05, + "loss": 1.8045, + "step": 12512 + }, + { + "epoch": 3.8406998158379375, + "grad_norm": 0.4051562249660492, + "learning_rate": 7.057902491078839e-05, + "loss": 1.7909, + "step": 12513 + }, + { + "epoch": 3.8410067526089624, + "grad_norm": 0.2817205488681793, + "learning_rate": 7.057449477301728e-05, + "loss": 1.8736, + "step": 12514 + }, + { + "epoch": 3.8413136893799877, + "grad_norm": 0.33369559049606323, + "learning_rate": 7.056996443191697e-05, + "loss": 1.7799, + "step": 12515 + }, + { + "epoch": 3.8416206261510126, + "grad_norm": 0.369954913854599, + "learning_rate": 7.056543388753221e-05, + "loss": 1.795, + "step": 12516 + }, + { + "epoch": 3.841927562922038, + "grad_norm": 0.289474755525589, + "learning_rate": 7.056090313990778e-05, + "loss": 1.786, + "step": 12517 + }, + { + "epoch": 3.8422344996930633, + "grad_norm": 0.2431849092245102, + "learning_rate": 7.055637218908845e-05, + "loss": 1.7363, + "step": 12518 + }, + { + "epoch": 3.8425414364640886, + "grad_norm": 0.3736060857772827, + "learning_rate": 7.0551841035119e-05, + "loss": 1.8234, + "step": 12519 + }, + { + "epoch": 3.8428483732351135, + "grad_norm": 0.34008854627609253, + "learning_rate": 7.054730967804422e-05, + "loss": 1.8001, + "step": 12520 + }, + { + "epoch": 3.843155310006139, + "grad_norm": 
0.24852876365184784, + "learning_rate": 7.054277811790887e-05, + "loss": 1.8298, + "step": 12521 + }, + { + "epoch": 3.8434622467771637, + "grad_norm": 0.3491046726703644, + "learning_rate": 7.053824635475777e-05, + "loss": 1.7336, + "step": 12522 + }, + { + "epoch": 3.843769183548189, + "grad_norm": 0.38757824897766113, + "learning_rate": 7.053371438863566e-05, + "loss": 1.8241, + "step": 12523 + }, + { + "epoch": 3.8440761203192144, + "grad_norm": 0.2607647180557251, + "learning_rate": 7.052918221958735e-05, + "loss": 1.7813, + "step": 12524 + }, + { + "epoch": 3.8443830570902393, + "grad_norm": 0.25634410977363586, + "learning_rate": 7.052464984765764e-05, + "loss": 1.7836, + "step": 12525 + }, + { + "epoch": 3.8446899938612646, + "grad_norm": 0.3113503158092499, + "learning_rate": 7.052011727289129e-05, + "loss": 1.8477, + "step": 12526 + }, + { + "epoch": 3.8449969306322895, + "grad_norm": 0.2852596044540405, + "learning_rate": 7.051558449533313e-05, + "loss": 1.7607, + "step": 12527 + }, + { + "epoch": 3.845303867403315, + "grad_norm": 0.24841541051864624, + "learning_rate": 7.051105151502795e-05, + "loss": 1.8109, + "step": 12528 + }, + { + "epoch": 3.84561080417434, + "grad_norm": 0.2231549620628357, + "learning_rate": 7.050651833202053e-05, + "loss": 1.7245, + "step": 12529 + }, + { + "epoch": 3.845917740945365, + "grad_norm": 0.21975892782211304, + "learning_rate": 7.050198494635566e-05, + "loss": 1.7512, + "step": 12530 + }, + { + "epoch": 3.8462246777163904, + "grad_norm": 0.2546280324459076, + "learning_rate": 7.049745135807816e-05, + "loss": 1.8003, + "step": 12531 + }, + { + "epoch": 3.8465316144874153, + "grad_norm": 0.21507929265499115, + "learning_rate": 7.049291756723284e-05, + "loss": 1.7616, + "step": 12532 + }, + { + "epoch": 3.8468385512584407, + "grad_norm": 0.24927987158298492, + "learning_rate": 7.04883835738645e-05, + "loss": 1.7519, + "step": 12533 + }, + { + "epoch": 3.847145488029466, + "grad_norm": 0.24988602101802826, + 
"learning_rate": 7.048384937801793e-05, + "loss": 1.7966, + "step": 12534 + }, + { + "epoch": 3.8474524248004913, + "grad_norm": 0.24039845168590546, + "learning_rate": 7.047931497973798e-05, + "loss": 1.7834, + "step": 12535 + }, + { + "epoch": 3.847759361571516, + "grad_norm": 0.22826696932315826, + "learning_rate": 7.047478037906943e-05, + "loss": 1.7334, + "step": 12536 + }, + { + "epoch": 3.8480662983425415, + "grad_norm": 0.22260744869709015, + "learning_rate": 7.047024557605708e-05, + "loss": 1.787, + "step": 12537 + }, + { + "epoch": 3.8483732351135664, + "grad_norm": 0.2457917332649231, + "learning_rate": 7.046571057074578e-05, + "loss": 1.7865, + "step": 12538 + }, + { + "epoch": 3.8486801718845918, + "grad_norm": 0.23952928185462952, + "learning_rate": 7.046117536318035e-05, + "loss": 1.7764, + "step": 12539 + }, + { + "epoch": 3.848987108655617, + "grad_norm": 0.22186748683452606, + "learning_rate": 7.045663995340557e-05, + "loss": 1.7917, + "step": 12540 + }, + { + "epoch": 3.849294045426642, + "grad_norm": 0.24234962463378906, + "learning_rate": 7.045210434146629e-05, + "loss": 1.7697, + "step": 12541 + }, + { + "epoch": 3.8496009821976673, + "grad_norm": 0.2510770857334137, + "learning_rate": 7.044756852740732e-05, + "loss": 1.8012, + "step": 12542 + }, + { + "epoch": 3.849907918968692, + "grad_norm": 0.24910703301429749, + "learning_rate": 7.044303251127349e-05, + "loss": 1.831, + "step": 12543 + }, + { + "epoch": 3.8502148557397176, + "grad_norm": 0.3159966468811035, + "learning_rate": 7.043849629310964e-05, + "loss": 1.8029, + "step": 12544 + }, + { + "epoch": 3.850521792510743, + "grad_norm": 0.3155403733253479, + "learning_rate": 7.04339598729606e-05, + "loss": 1.7429, + "step": 12545 + }, + { + "epoch": 3.8508287292817682, + "grad_norm": 0.3037515878677368, + "learning_rate": 7.042942325087117e-05, + "loss": 1.8186, + "step": 12546 + }, + { + "epoch": 3.851135666052793, + "grad_norm": 0.2319766730070114, + "learning_rate": 
7.042488642688621e-05, + "loss": 1.7853, + "step": 12547 + }, + { + "epoch": 3.8514426028238185, + "grad_norm": 0.23911969363689423, + "learning_rate": 7.042034940105055e-05, + "loss": 1.8314, + "step": 12548 + }, + { + "epoch": 3.8517495395948433, + "grad_norm": 0.2541846036911011, + "learning_rate": 7.041581217340905e-05, + "loss": 1.8289, + "step": 12549 + }, + { + "epoch": 3.8520564763658687, + "grad_norm": 0.22234943509101868, + "learning_rate": 7.04112747440065e-05, + "loss": 1.7847, + "step": 12550 + }, + { + "epoch": 3.852363413136894, + "grad_norm": 0.2747870981693268, + "learning_rate": 7.04067371128878e-05, + "loss": 1.7875, + "step": 12551 + }, + { + "epoch": 3.852670349907919, + "grad_norm": 0.28589147329330444, + "learning_rate": 7.040219928009775e-05, + "loss": 1.7289, + "step": 12552 + }, + { + "epoch": 3.8529772866789442, + "grad_norm": 0.21180351078510284, + "learning_rate": 7.039766124568119e-05, + "loss": 1.7611, + "step": 12553 + }, + { + "epoch": 3.853284223449969, + "grad_norm": 0.27751782536506653, + "learning_rate": 7.0393123009683e-05, + "loss": 1.7481, + "step": 12554 + }, + { + "epoch": 3.8535911602209945, + "grad_norm": 0.32883307337760925, + "learning_rate": 7.038858457214802e-05, + "loss": 1.7271, + "step": 12555 + }, + { + "epoch": 3.85389809699202, + "grad_norm": 0.30965641140937805, + "learning_rate": 7.03840459331211e-05, + "loss": 1.81, + "step": 12556 + }, + { + "epoch": 3.8542050337630447, + "grad_norm": 0.25184348225593567, + "learning_rate": 7.037950709264709e-05, + "loss": 1.7642, + "step": 12557 + }, + { + "epoch": 3.85451197053407, + "grad_norm": 0.2376822829246521, + "learning_rate": 7.037496805077084e-05, + "loss": 1.7774, + "step": 12558 + }, + { + "epoch": 3.854818907305095, + "grad_norm": 0.2395993024110794, + "learning_rate": 7.03704288075372e-05, + "loss": 1.8397, + "step": 12559 + }, + { + "epoch": 3.8551258440761202, + "grad_norm": 0.26460394263267517, + "learning_rate": 7.036588936299107e-05, + "loss": 1.7472, + 
"step": 12560 + }, + { + "epoch": 3.8554327808471456, + "grad_norm": 0.34742459654808044, + "learning_rate": 7.036134971717725e-05, + "loss": 1.8003, + "step": 12561 + }, + { + "epoch": 3.855739717618171, + "grad_norm": 0.2829316556453705, + "learning_rate": 7.035680987014068e-05, + "loss": 1.7765, + "step": 12562 + }, + { + "epoch": 3.856046654389196, + "grad_norm": 0.3087223172187805, + "learning_rate": 7.035226982192615e-05, + "loss": 1.8462, + "step": 12563 + }, + { + "epoch": 3.856353591160221, + "grad_norm": 0.2806380093097687, + "learning_rate": 7.034772957257858e-05, + "loss": 1.7704, + "step": 12564 + }, + { + "epoch": 3.856660527931246, + "grad_norm": 0.25598087906837463, + "learning_rate": 7.03431891221428e-05, + "loss": 1.7843, + "step": 12565 + }, + { + "epoch": 3.8569674647022714, + "grad_norm": 0.30833700299263, + "learning_rate": 7.033864847066373e-05, + "loss": 1.8404, + "step": 12566 + }, + { + "epoch": 3.8572744014732967, + "grad_norm": 0.29562532901763916, + "learning_rate": 7.03341076181862e-05, + "loss": 1.8044, + "step": 12567 + }, + { + "epoch": 3.8575813382443216, + "grad_norm": 0.2901719808578491, + "learning_rate": 7.03295665647551e-05, + "loss": 1.7789, + "step": 12568 + }, + { + "epoch": 3.857888275015347, + "grad_norm": 0.25453686714172363, + "learning_rate": 7.03250253104153e-05, + "loss": 1.6792, + "step": 12569 + }, + { + "epoch": 3.858195211786372, + "grad_norm": 0.26009416580200195, + "learning_rate": 7.03204838552117e-05, + "loss": 1.7835, + "step": 12570 + }, + { + "epoch": 3.858502148557397, + "grad_norm": 0.28074127435684204, + "learning_rate": 7.031594219918916e-05, + "loss": 1.7932, + "step": 12571 + }, + { + "epoch": 3.8588090853284225, + "grad_norm": 0.3341725170612335, + "learning_rate": 7.031140034239258e-05, + "loss": 1.7439, + "step": 12572 + }, + { + "epoch": 3.8591160220994474, + "grad_norm": 0.28142449259757996, + "learning_rate": 7.030685828486684e-05, + "loss": 1.8263, + "step": 12573 + }, + { + "epoch": 
3.8594229588704727, + "grad_norm": 0.2571438252925873, + "learning_rate": 7.030231602665681e-05, + "loss": 1.7628, + "step": 12574 + }, + { + "epoch": 3.8597298956414976, + "grad_norm": 0.3079041838645935, + "learning_rate": 7.029777356780741e-05, + "loss": 1.7879, + "step": 12575 + }, + { + "epoch": 3.860036832412523, + "grad_norm": 0.2605433464050293, + "learning_rate": 7.029323090836349e-05, + "loss": 1.7841, + "step": 12576 + }, + { + "epoch": 3.8603437691835483, + "grad_norm": 0.24069640040397644, + "learning_rate": 7.028868804836999e-05, + "loss": 1.7939, + "step": 12577 + }, + { + "epoch": 3.8606507059545736, + "grad_norm": 0.26801639795303345, + "learning_rate": 7.028414498787177e-05, + "loss": 1.8082, + "step": 12578 + }, + { + "epoch": 3.8609576427255985, + "grad_norm": 0.28828585147857666, + "learning_rate": 7.027960172691375e-05, + "loss": 1.8094, + "step": 12579 + }, + { + "epoch": 3.861264579496624, + "grad_norm": 0.22927051782608032, + "learning_rate": 7.027505826554082e-05, + "loss": 1.7758, + "step": 12580 + }, + { + "epoch": 3.8615715162676487, + "grad_norm": 0.25755998492240906, + "learning_rate": 7.027051460379788e-05, + "loss": 1.8429, + "step": 12581 + }, + { + "epoch": 3.861878453038674, + "grad_norm": 0.23636581003665924, + "learning_rate": 7.026597074172982e-05, + "loss": 1.7662, + "step": 12582 + }, + { + "epoch": 3.8621853898096994, + "grad_norm": 0.22599349915981293, + "learning_rate": 7.026142667938156e-05, + "loss": 1.7199, + "step": 12583 + }, + { + "epoch": 3.8624923265807243, + "grad_norm": 0.2504875659942627, + "learning_rate": 7.025688241679802e-05, + "loss": 1.8473, + "step": 12584 + }, + { + "epoch": 3.8627992633517496, + "grad_norm": 0.3012976348400116, + "learning_rate": 7.025233795402408e-05, + "loss": 1.8715, + "step": 12585 + }, + { + "epoch": 3.8631062001227745, + "grad_norm": 0.31703677773475647, + "learning_rate": 7.024779329110469e-05, + "loss": 1.8143, + "step": 12586 + }, + { + "epoch": 3.8634131368938, + "grad_norm": 
0.27287593483924866, + "learning_rate": 7.024324842808472e-05, + "loss": 1.7227, + "step": 12587 + }, + { + "epoch": 3.863720073664825, + "grad_norm": 0.24663801491260529, + "learning_rate": 7.02387033650091e-05, + "loss": 1.7529, + "step": 12588 + }, + { + "epoch": 3.86402701043585, + "grad_norm": 0.26127147674560547, + "learning_rate": 7.023415810192277e-05, + "loss": 1.7629, + "step": 12589 + }, + { + "epoch": 3.8643339472068754, + "grad_norm": 0.3457142114639282, + "learning_rate": 7.022961263887062e-05, + "loss": 1.8212, + "step": 12590 + }, + { + "epoch": 3.8646408839779003, + "grad_norm": 0.3296070694923401, + "learning_rate": 7.022506697589759e-05, + "loss": 1.7907, + "step": 12591 + }, + { + "epoch": 3.8649478207489256, + "grad_norm": 0.29474303126335144, + "learning_rate": 7.022052111304858e-05, + "loss": 1.7866, + "step": 12592 + }, + { + "epoch": 3.865254757519951, + "grad_norm": 0.2535403072834015, + "learning_rate": 7.021597505036852e-05, + "loss": 1.7607, + "step": 12593 + }, + { + "epoch": 3.8655616942909763, + "grad_norm": 0.26691222190856934, + "learning_rate": 7.021142878790237e-05, + "loss": 1.8063, + "step": 12594 + }, + { + "epoch": 3.865868631062001, + "grad_norm": 0.2784755229949951, + "learning_rate": 7.020688232569502e-05, + "loss": 1.8065, + "step": 12595 + }, + { + "epoch": 3.8661755678330265, + "grad_norm": 0.23714317381381989, + "learning_rate": 7.020233566379142e-05, + "loss": 1.8317, + "step": 12596 + }, + { + "epoch": 3.8664825046040514, + "grad_norm": 0.25010553002357483, + "learning_rate": 7.019778880223649e-05, + "loss": 1.8493, + "step": 12597 + }, + { + "epoch": 3.8667894413750767, + "grad_norm": 0.2798489034175873, + "learning_rate": 7.01932417410752e-05, + "loss": 1.8134, + "step": 12598 + }, + { + "epoch": 3.867096378146102, + "grad_norm": 0.26199260354042053, + "learning_rate": 7.018869448035243e-05, + "loss": 1.6931, + "step": 12599 + }, + { + "epoch": 3.867403314917127, + "grad_norm": 0.24582891166210175, + 
"learning_rate": 7.018414702011314e-05, + "loss": 1.8076, + "step": 12600 + }, + { + "epoch": 3.8677102516881523, + "grad_norm": 0.25493237376213074, + "learning_rate": 7.01795993604023e-05, + "loss": 1.7851, + "step": 12601 + }, + { + "epoch": 3.868017188459177, + "grad_norm": 0.2607674300670624, + "learning_rate": 7.017505150126483e-05, + "loss": 1.7285, + "step": 12602 + }, + { + "epoch": 3.8683241252302025, + "grad_norm": 0.23629581928253174, + "learning_rate": 7.017050344274568e-05, + "loss": 1.8254, + "step": 12603 + }, + { + "epoch": 3.868631062001228, + "grad_norm": 0.3129318058490753, + "learning_rate": 7.016595518488979e-05, + "loss": 1.7914, + "step": 12604 + }, + { + "epoch": 3.8689379987722528, + "grad_norm": 0.3178271949291229, + "learning_rate": 7.01614067277421e-05, + "loss": 1.8139, + "step": 12605 + }, + { + "epoch": 3.869244935543278, + "grad_norm": 0.3230711817741394, + "learning_rate": 7.015685807134757e-05, + "loss": 1.8203, + "step": 12606 + }, + { + "epoch": 3.869551872314303, + "grad_norm": 0.26339825987815857, + "learning_rate": 7.015230921575118e-05, + "loss": 1.8022, + "step": 12607 + }, + { + "epoch": 3.8698588090853283, + "grad_norm": 0.25337356328964233, + "learning_rate": 7.014776016099785e-05, + "loss": 1.7779, + "step": 12608 + }, + { + "epoch": 3.8701657458563536, + "grad_norm": 0.2506195306777954, + "learning_rate": 7.014321090713253e-05, + "loss": 1.7858, + "step": 12609 + }, + { + "epoch": 3.870472682627379, + "grad_norm": 0.26249951124191284, + "learning_rate": 7.013866145420021e-05, + "loss": 1.8051, + "step": 12610 + }, + { + "epoch": 3.870779619398404, + "grad_norm": 0.25666534900665283, + "learning_rate": 7.013411180224581e-05, + "loss": 1.7945, + "step": 12611 + }, + { + "epoch": 3.871086556169429, + "grad_norm": 0.23901648819446564, + "learning_rate": 7.012956195131433e-05, + "loss": 1.7844, + "step": 12612 + }, + { + "epoch": 3.871393492940454, + "grad_norm": 0.26814451813697815, + "learning_rate": 
7.012501190145071e-05, + "loss": 1.7713, + "step": 12613 + }, + { + "epoch": 3.8717004297114794, + "grad_norm": 0.28377315402030945, + "learning_rate": 7.012046165269995e-05, + "loss": 1.7866, + "step": 12614 + }, + { + "epoch": 3.8720073664825048, + "grad_norm": 0.2751680612564087, + "learning_rate": 7.011591120510699e-05, + "loss": 1.7215, + "step": 12615 + }, + { + "epoch": 3.8723143032535297, + "grad_norm": 0.21988113224506378, + "learning_rate": 7.011136055871679e-05, + "loss": 1.8009, + "step": 12616 + }, + { + "epoch": 3.872621240024555, + "grad_norm": 0.26462143659591675, + "learning_rate": 7.010680971357434e-05, + "loss": 1.7618, + "step": 12617 + }, + { + "epoch": 3.87292817679558, + "grad_norm": 0.29054632782936096, + "learning_rate": 7.010225866972462e-05, + "loss": 1.7549, + "step": 12618 + }, + { + "epoch": 3.873235113566605, + "grad_norm": 0.31341224908828735, + "learning_rate": 7.00977074272126e-05, + "loss": 1.8827, + "step": 12619 + }, + { + "epoch": 3.8735420503376305, + "grad_norm": 0.24252115190029144, + "learning_rate": 7.009315598608324e-05, + "loss": 1.7544, + "step": 12620 + }, + { + "epoch": 3.873848987108656, + "grad_norm": 0.30036893486976624, + "learning_rate": 7.008860434638154e-05, + "loss": 1.7465, + "step": 12621 + }, + { + "epoch": 3.8741559238796808, + "grad_norm": 0.3217438757419586, + "learning_rate": 7.00840525081525e-05, + "loss": 1.72, + "step": 12622 + }, + { + "epoch": 3.874462860650706, + "grad_norm": 0.22507290542125702, + "learning_rate": 7.007950047144105e-05, + "loss": 1.7177, + "step": 12623 + }, + { + "epoch": 3.874769797421731, + "grad_norm": 0.3014441728591919, + "learning_rate": 7.007494823629224e-05, + "loss": 1.7502, + "step": 12624 + }, + { + "epoch": 3.8750767341927563, + "grad_norm": 0.3836904466152191, + "learning_rate": 7.0070395802751e-05, + "loss": 1.7971, + "step": 12625 + }, + { + "epoch": 3.8753836709637817, + "grad_norm": 0.33565691113471985, + "learning_rate": 7.006584317086235e-05, + "loss": 1.7439, 
+ "step": 12626 + }, + { + "epoch": 3.8756906077348066, + "grad_norm": 0.2292134314775467, + "learning_rate": 7.006129034067128e-05, + "loss": 1.7998, + "step": 12627 + }, + { + "epoch": 3.875997544505832, + "grad_norm": 0.26385873556137085, + "learning_rate": 7.005673731222277e-05, + "loss": 1.7914, + "step": 12628 + }, + { + "epoch": 3.876304481276857, + "grad_norm": 0.2854950428009033, + "learning_rate": 7.005218408556184e-05, + "loss": 1.7761, + "step": 12629 + }, + { + "epoch": 3.876611418047882, + "grad_norm": 0.34260645508766174, + "learning_rate": 7.004763066073348e-05, + "loss": 1.8015, + "step": 12630 + }, + { + "epoch": 3.8769183548189075, + "grad_norm": 0.3223683834075928, + "learning_rate": 7.004307703778267e-05, + "loss": 1.7453, + "step": 12631 + }, + { + "epoch": 3.8772252915899323, + "grad_norm": 0.24715089797973633, + "learning_rate": 7.003852321675442e-05, + "loss": 1.7813, + "step": 12632 + }, + { + "epoch": 3.8775322283609577, + "grad_norm": 0.22822390496730804, + "learning_rate": 7.003396919769377e-05, + "loss": 1.7982, + "step": 12633 + }, + { + "epoch": 3.8778391651319826, + "grad_norm": 0.24125081300735474, + "learning_rate": 7.002941498064565e-05, + "loss": 1.8606, + "step": 12634 + }, + { + "epoch": 3.878146101903008, + "grad_norm": 0.23512506484985352, + "learning_rate": 7.002486056565513e-05, + "loss": 1.7469, + "step": 12635 + }, + { + "epoch": 3.8784530386740332, + "grad_norm": 0.2908322215080261, + "learning_rate": 7.00203059527672e-05, + "loss": 1.796, + "step": 12636 + }, + { + "epoch": 3.8787599754450586, + "grad_norm": 0.22931252419948578, + "learning_rate": 7.001575114202689e-05, + "loss": 1.7482, + "step": 12637 + }, + { + "epoch": 3.8790669122160835, + "grad_norm": 0.22574284672737122, + "learning_rate": 7.001119613347917e-05, + "loss": 1.7698, + "step": 12638 + }, + { + "epoch": 3.879373848987109, + "grad_norm": 0.23129726946353912, + "learning_rate": 7.000664092716909e-05, + "loss": 1.776, + "step": 12639 + }, + { + "epoch": 
3.8796807857581337, + "grad_norm": 0.2763366401195526, + "learning_rate": 7.000208552314165e-05, + "loss": 1.7814, + "step": 12640 + }, + { + "epoch": 3.879987722529159, + "grad_norm": 0.29870158433914185, + "learning_rate": 6.99975299214419e-05, + "loss": 1.7467, + "step": 12641 + }, + { + "epoch": 3.8802946593001844, + "grad_norm": 0.33574381470680237, + "learning_rate": 6.999297412211484e-05, + "loss": 1.8159, + "step": 12642 + }, + { + "epoch": 3.8806015960712092, + "grad_norm": 0.30309897661209106, + "learning_rate": 6.998841812520547e-05, + "loss": 1.8454, + "step": 12643 + }, + { + "epoch": 3.8809085328422346, + "grad_norm": 0.27399247884750366, + "learning_rate": 6.998386193075886e-05, + "loss": 1.7956, + "step": 12644 + }, + { + "epoch": 3.8812154696132595, + "grad_norm": 0.28649580478668213, + "learning_rate": 6.997930553881998e-05, + "loss": 1.8308, + "step": 12645 + }, + { + "epoch": 3.881522406384285, + "grad_norm": 0.2716052532196045, + "learning_rate": 6.997474894943392e-05, + "loss": 1.7698, + "step": 12646 + }, + { + "epoch": 3.88182934315531, + "grad_norm": 0.21380536258220673, + "learning_rate": 6.997019216264567e-05, + "loss": 1.7028, + "step": 12647 + }, + { + "epoch": 3.882136279926335, + "grad_norm": 0.25262731313705444, + "learning_rate": 6.996563517850028e-05, + "loss": 1.8236, + "step": 12648 + }, + { + "epoch": 3.8824432166973604, + "grad_norm": 0.21150052547454834, + "learning_rate": 6.996107799704277e-05, + "loss": 1.7437, + "step": 12649 + }, + { + "epoch": 3.8827501534683853, + "grad_norm": 0.2614554464817047, + "learning_rate": 6.995652061831821e-05, + "loss": 1.7575, + "step": 12650 + }, + { + "epoch": 3.8830570902394106, + "grad_norm": 0.214684396982193, + "learning_rate": 6.995196304237159e-05, + "loss": 1.8195, + "step": 12651 + }, + { + "epoch": 3.883364027010436, + "grad_norm": 0.2226872444152832, + "learning_rate": 6.994740526924798e-05, + "loss": 1.7556, + "step": 12652 + }, + { + "epoch": 3.8836709637814613, + "grad_norm": 
0.22270764410495758, + "learning_rate": 6.994284729899246e-05, + "loss": 1.7536, + "step": 12653 + }, + { + "epoch": 3.883977900552486, + "grad_norm": 0.20683564245700836, + "learning_rate": 6.993828913165e-05, + "loss": 1.7728, + "step": 12654 + }, + { + "epoch": 3.8842848373235115, + "grad_norm": 0.23667018115520477, + "learning_rate": 6.993373076726568e-05, + "loss": 1.7819, + "step": 12655 + }, + { + "epoch": 3.8845917740945364, + "grad_norm": 0.2265234887599945, + "learning_rate": 6.992917220588455e-05, + "loss": 1.7502, + "step": 12656 + }, + { + "epoch": 3.8848987108655617, + "grad_norm": 0.24490754306316376, + "learning_rate": 6.992461344755168e-05, + "loss": 1.7513, + "step": 12657 + }, + { + "epoch": 3.885205647636587, + "grad_norm": 0.23001348972320557, + "learning_rate": 6.992005449231208e-05, + "loss": 1.733, + "step": 12658 + }, + { + "epoch": 3.885512584407612, + "grad_norm": 0.25424695014953613, + "learning_rate": 6.991549534021084e-05, + "loss": 1.7621, + "step": 12659 + }, + { + "epoch": 3.8858195211786373, + "grad_norm": 0.25552862882614136, + "learning_rate": 6.991093599129299e-05, + "loss": 1.7974, + "step": 12660 + }, + { + "epoch": 3.886126457949662, + "grad_norm": 0.26876959204673767, + "learning_rate": 6.99063764456036e-05, + "loss": 1.7924, + "step": 12661 + }, + { + "epoch": 3.8864333947206875, + "grad_norm": 0.2754429578781128, + "learning_rate": 6.990181670318772e-05, + "loss": 1.7981, + "step": 12662 + }, + { + "epoch": 3.886740331491713, + "grad_norm": 0.281818687915802, + "learning_rate": 6.989725676409044e-05, + "loss": 1.7328, + "step": 12663 + }, + { + "epoch": 3.8870472682627377, + "grad_norm": 0.21676552295684814, + "learning_rate": 6.989269662835681e-05, + "loss": 1.7376, + "step": 12664 + }, + { + "epoch": 3.887354205033763, + "grad_norm": 0.276115745306015, + "learning_rate": 6.98881362960319e-05, + "loss": 1.7784, + "step": 12665 + }, + { + "epoch": 3.887661141804788, + "grad_norm": 0.2806364893913269, + "learning_rate": 
6.988357576716075e-05, + "loss": 1.8078, + "step": 12666 + }, + { + "epoch": 3.8879680785758133, + "grad_norm": 0.27620184421539307, + "learning_rate": 6.987901504178845e-05, + "loss": 1.8115, + "step": 12667 + }, + { + "epoch": 3.8882750153468386, + "grad_norm": 0.23845402896404266, + "learning_rate": 6.987445411996009e-05, + "loss": 1.7485, + "step": 12668 + }, + { + "epoch": 3.888581952117864, + "grad_norm": 0.25063586235046387, + "learning_rate": 6.986989300172071e-05, + "loss": 1.7663, + "step": 12669 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2417975515127182, + "learning_rate": 6.98653316871154e-05, + "loss": 1.7562, + "step": 12670 + }, + { + "epoch": 3.889195825659914, + "grad_norm": 0.24952733516693115, + "learning_rate": 6.986077017618923e-05, + "loss": 1.8063, + "step": 12671 + }, + { + "epoch": 3.889502762430939, + "grad_norm": 0.25847554206848145, + "learning_rate": 6.985620846898732e-05, + "loss": 1.7722, + "step": 12672 + }, + { + "epoch": 3.8898096992019644, + "grad_norm": 0.23762650787830353, + "learning_rate": 6.985164656555471e-05, + "loss": 1.8368, + "step": 12673 + }, + { + "epoch": 3.8901166359729897, + "grad_norm": 0.25346314907073975, + "learning_rate": 6.984708446593648e-05, + "loss": 1.7957, + "step": 12674 + }, + { + "epoch": 3.8904235727440146, + "grad_norm": 0.2466745674610138, + "learning_rate": 6.984252217017774e-05, + "loss": 1.8286, + "step": 12675 + }, + { + "epoch": 3.89073050951504, + "grad_norm": 0.25413215160369873, + "learning_rate": 6.983795967832356e-05, + "loss": 1.7711, + "step": 12676 + }, + { + "epoch": 3.891037446286065, + "grad_norm": 0.2315925806760788, + "learning_rate": 6.983339699041903e-05, + "loss": 1.7546, + "step": 12677 + }, + { + "epoch": 3.89134438305709, + "grad_norm": 0.26473405957221985, + "learning_rate": 6.982883410650925e-05, + "loss": 1.7563, + "step": 12678 + }, + { + "epoch": 3.8916513198281155, + "grad_norm": 0.24176491796970367, + "learning_rate": 6.982427102663932e-05, + "loss": 
1.7734, + "step": 12679 + }, + { + "epoch": 3.891958256599141, + "grad_norm": 0.25444844365119934, + "learning_rate": 6.98197077508543e-05, + "loss": 1.803, + "step": 12680 + }, + { + "epoch": 3.8922651933701657, + "grad_norm": 0.25234144926071167, + "learning_rate": 6.981514427919933e-05, + "loss": 1.8099, + "step": 12681 + }, + { + "epoch": 3.892572130141191, + "grad_norm": 0.2571142315864563, + "learning_rate": 6.98105806117195e-05, + "loss": 1.8618, + "step": 12682 + }, + { + "epoch": 3.892879066912216, + "grad_norm": 0.21235275268554688, + "learning_rate": 6.980601674845988e-05, + "loss": 1.7121, + "step": 12683 + }, + { + "epoch": 3.8931860036832413, + "grad_norm": 0.27078527212142944, + "learning_rate": 6.98014526894656e-05, + "loss": 1.8103, + "step": 12684 + }, + { + "epoch": 3.8934929404542666, + "grad_norm": 0.3198096454143524, + "learning_rate": 6.979688843478176e-05, + "loss": 1.7529, + "step": 12685 + }, + { + "epoch": 3.8937998772252915, + "grad_norm": 0.3170493245124817, + "learning_rate": 6.979232398445345e-05, + "loss": 1.7629, + "step": 12686 + }, + { + "epoch": 3.894106813996317, + "grad_norm": 0.2495265007019043, + "learning_rate": 6.978775933852582e-05, + "loss": 1.7407, + "step": 12687 + }, + { + "epoch": 3.8944137507673418, + "grad_norm": 0.24570141732692719, + "learning_rate": 6.978319449704395e-05, + "loss": 1.7688, + "step": 12688 + }, + { + "epoch": 3.894720687538367, + "grad_norm": 0.23956388235092163, + "learning_rate": 6.977862946005295e-05, + "loss": 1.7115, + "step": 12689 + }, + { + "epoch": 3.8950276243093924, + "grad_norm": 0.21548940241336823, + "learning_rate": 6.977406422759793e-05, + "loss": 1.7611, + "step": 12690 + }, + { + "epoch": 3.8953345610804173, + "grad_norm": 0.25797295570373535, + "learning_rate": 6.976949879972403e-05, + "loss": 1.7688, + "step": 12691 + }, + { + "epoch": 3.8956414978514426, + "grad_norm": 0.28257784247398376, + "learning_rate": 6.976493317647636e-05, + "loss": 1.7517, + "step": 12692 + }, + { + 
"epoch": 3.8959484346224675, + "grad_norm": 0.23828580975532532, + "learning_rate": 6.976036735790004e-05, + "loss": 1.7877, + "step": 12693 + }, + { + "epoch": 3.896255371393493, + "grad_norm": 0.22915001213550568, + "learning_rate": 6.975580134404017e-05, + "loss": 1.7741, + "step": 12694 + }, + { + "epoch": 3.896562308164518, + "grad_norm": 0.22975030541419983, + "learning_rate": 6.97512351349419e-05, + "loss": 1.772, + "step": 12695 + }, + { + "epoch": 3.8968692449355435, + "grad_norm": 0.29515185952186584, + "learning_rate": 6.974666873065034e-05, + "loss": 1.8001, + "step": 12696 + }, + { + "epoch": 3.8971761817065684, + "grad_norm": 0.26904794573783875, + "learning_rate": 6.974210213121064e-05, + "loss": 1.7069, + "step": 12697 + }, + { + "epoch": 3.8974831184775938, + "grad_norm": 0.2549479603767395, + "learning_rate": 6.97375353366679e-05, + "loss": 1.7419, + "step": 12698 + }, + { + "epoch": 3.8977900552486187, + "grad_norm": 0.23750101029872894, + "learning_rate": 6.973296834706729e-05, + "loss": 1.7815, + "step": 12699 + }, + { + "epoch": 3.898096992019644, + "grad_norm": 0.23529762029647827, + "learning_rate": 6.972840116245389e-05, + "loss": 1.8139, + "step": 12700 + }, + { + "epoch": 3.8984039287906693, + "grad_norm": 0.3212098777294159, + "learning_rate": 6.97238337828729e-05, + "loss": 1.7507, + "step": 12701 + }, + { + "epoch": 3.898710865561694, + "grad_norm": 0.3167687952518463, + "learning_rate": 6.971926620836941e-05, + "loss": 1.8062, + "step": 12702 + }, + { + "epoch": 3.8990178023327196, + "grad_norm": 0.31298309564590454, + "learning_rate": 6.971469843898855e-05, + "loss": 1.8127, + "step": 12703 + }, + { + "epoch": 3.8993247391037444, + "grad_norm": 0.2537378668785095, + "learning_rate": 6.971013047477551e-05, + "loss": 1.7675, + "step": 12704 + }, + { + "epoch": 3.8996316758747698, + "grad_norm": 0.24292805790901184, + "learning_rate": 6.97055623157754e-05, + "loss": 1.8004, + "step": 12705 + }, + { + "epoch": 3.899938612645795, + 
"grad_norm": 0.2929537296295166, + "learning_rate": 6.970099396203338e-05, + "loss": 1.7963, + "step": 12706 + }, + { + "epoch": 3.90024554941682, + "grad_norm": 0.30531612038612366, + "learning_rate": 6.969642541359459e-05, + "loss": 1.7347, + "step": 12707 + }, + { + "epoch": 3.9005524861878453, + "grad_norm": 0.3138202726840973, + "learning_rate": 6.969185667050417e-05, + "loss": 1.7987, + "step": 12708 + }, + { + "epoch": 3.9008594229588702, + "grad_norm": 0.2366247922182083, + "learning_rate": 6.96872877328073e-05, + "loss": 1.7671, + "step": 12709 + }, + { + "epoch": 3.9011663597298956, + "grad_norm": 0.26251721382141113, + "learning_rate": 6.96827186005491e-05, + "loss": 1.7657, + "step": 12710 + }, + { + "epoch": 3.901473296500921, + "grad_norm": 0.32497119903564453, + "learning_rate": 6.967814927377474e-05, + "loss": 1.7873, + "step": 12711 + }, + { + "epoch": 3.9017802332719462, + "grad_norm": 0.3290228843688965, + "learning_rate": 6.967357975252939e-05, + "loss": 1.8076, + "step": 12712 + }, + { + "epoch": 3.902087170042971, + "grad_norm": 0.2737300992012024, + "learning_rate": 6.966901003685817e-05, + "loss": 1.7405, + "step": 12713 + }, + { + "epoch": 3.9023941068139965, + "grad_norm": 0.25465309619903564, + "learning_rate": 6.966444012680626e-05, + "loss": 1.8063, + "step": 12714 + }, + { + "epoch": 3.9027010435850213, + "grad_norm": 0.2397255003452301, + "learning_rate": 6.965987002241885e-05, + "loss": 1.8079, + "step": 12715 + }, + { + "epoch": 3.9030079803560467, + "grad_norm": 0.23115718364715576, + "learning_rate": 6.965529972374108e-05, + "loss": 1.8032, + "step": 12716 + }, + { + "epoch": 3.903314917127072, + "grad_norm": 0.2536461055278778, + "learning_rate": 6.96507292308181e-05, + "loss": 1.7477, + "step": 12717 + }, + { + "epoch": 3.903621853898097, + "grad_norm": 0.27151185274124146, + "learning_rate": 6.96461585436951e-05, + "loss": 1.75, + "step": 12718 + }, + { + "epoch": 3.9039287906691222, + "grad_norm": 0.26894113421440125, + 
"learning_rate": 6.964158766241726e-05, + "loss": 1.7816, + "step": 12719 + }, + { + "epoch": 3.904235727440147, + "grad_norm": 0.23541375994682312, + "learning_rate": 6.963701658702972e-05, + "loss": 1.7991, + "step": 12720 + }, + { + "epoch": 3.9045426642111725, + "grad_norm": 0.22142915427684784, + "learning_rate": 6.96324453175777e-05, + "loss": 1.7245, + "step": 12721 + }, + { + "epoch": 3.904849600982198, + "grad_norm": 0.32864269614219666, + "learning_rate": 6.962787385410632e-05, + "loss": 1.7631, + "step": 12722 + }, + { + "epoch": 3.9051565377532227, + "grad_norm": 0.23657776415348053, + "learning_rate": 6.96233021966608e-05, + "loss": 1.8081, + "step": 12723 + }, + { + "epoch": 3.905463474524248, + "grad_norm": 0.24790632724761963, + "learning_rate": 6.961873034528629e-05, + "loss": 1.7193, + "step": 12724 + }, + { + "epoch": 3.905770411295273, + "grad_norm": 0.2517886459827423, + "learning_rate": 6.961415830002801e-05, + "loss": 1.7785, + "step": 12725 + }, + { + "epoch": 3.9060773480662982, + "grad_norm": 0.2340923547744751, + "learning_rate": 6.960958606093113e-05, + "loss": 1.7632, + "step": 12726 + }, + { + "epoch": 3.9063842848373236, + "grad_norm": 0.23260441422462463, + "learning_rate": 6.960501362804079e-05, + "loss": 1.7865, + "step": 12727 + }, + { + "epoch": 3.906691221608349, + "grad_norm": 0.22616329789161682, + "learning_rate": 6.960044100140224e-05, + "loss": 1.7851, + "step": 12728 + }, + { + "epoch": 3.906998158379374, + "grad_norm": 0.2849951982498169, + "learning_rate": 6.959586818106064e-05, + "loss": 1.8618, + "step": 12729 + }, + { + "epoch": 3.907305095150399, + "grad_norm": 0.3279374837875366, + "learning_rate": 6.95912951670612e-05, + "loss": 1.8563, + "step": 12730 + }, + { + "epoch": 3.907612031921424, + "grad_norm": 0.24359555542469025, + "learning_rate": 6.958672195944906e-05, + "loss": 1.7604, + "step": 12731 + }, + { + "epoch": 3.9079189686924494, + "grad_norm": 0.30881935358047485, + "learning_rate": 
6.958214855826947e-05, + "loss": 1.8463, + "step": 12732 + }, + { + "epoch": 3.9082259054634747, + "grad_norm": 0.25361543893814087, + "learning_rate": 6.957757496356763e-05, + "loss": 1.7831, + "step": 12733 + }, + { + "epoch": 3.9085328422344996, + "grad_norm": 0.26763513684272766, + "learning_rate": 6.957300117538869e-05, + "loss": 1.8383, + "step": 12734 + }, + { + "epoch": 3.908839779005525, + "grad_norm": 0.2238057255744934, + "learning_rate": 6.95684271937779e-05, + "loss": 1.7702, + "step": 12735 + }, + { + "epoch": 3.90914671577655, + "grad_norm": 0.22110232710838318, + "learning_rate": 6.956385301878045e-05, + "loss": 1.7931, + "step": 12736 + }, + { + "epoch": 3.909453652547575, + "grad_norm": 0.23765070736408234, + "learning_rate": 6.955927865044152e-05, + "loss": 1.7212, + "step": 12737 + }, + { + "epoch": 3.9097605893186005, + "grad_norm": 0.22324508428573608, + "learning_rate": 6.955470408880633e-05, + "loss": 1.7161, + "step": 12738 + }, + { + "epoch": 3.9100675260896254, + "grad_norm": 0.22485347092151642, + "learning_rate": 6.955012933392012e-05, + "loss": 1.7374, + "step": 12739 + }, + { + "epoch": 3.9103744628606507, + "grad_norm": 0.28046715259552, + "learning_rate": 6.954555438582806e-05, + "loss": 1.9264, + "step": 12740 + }, + { + "epoch": 3.9106813996316756, + "grad_norm": 0.26391276717185974, + "learning_rate": 6.954097924457536e-05, + "loss": 1.7343, + "step": 12741 + }, + { + "epoch": 3.910988336402701, + "grad_norm": 0.29596614837646484, + "learning_rate": 6.953640391020726e-05, + "loss": 1.8111, + "step": 12742 + }, + { + "epoch": 3.9112952731737263, + "grad_norm": 0.2709808051586151, + "learning_rate": 6.953182838276896e-05, + "loss": 1.7776, + "step": 12743 + }, + { + "epoch": 3.9116022099447516, + "grad_norm": 0.2585100531578064, + "learning_rate": 6.952725266230571e-05, + "loss": 1.7774, + "step": 12744 + }, + { + "epoch": 3.9119091467157765, + "grad_norm": 0.26490530371665955, + "learning_rate": 6.952267674886268e-05, + "loss": 
1.78, + "step": 12745 + }, + { + "epoch": 3.912216083486802, + "grad_norm": 0.23654767870903015, + "learning_rate": 6.951810064248512e-05, + "loss": 1.8263, + "step": 12746 + }, + { + "epoch": 3.9125230202578267, + "grad_norm": 0.2495296597480774, + "learning_rate": 6.951352434321826e-05, + "loss": 1.787, + "step": 12747 + }, + { + "epoch": 3.912829957028852, + "grad_norm": 0.24038313329219818, + "learning_rate": 6.950894785110728e-05, + "loss": 1.774, + "step": 12748 + }, + { + "epoch": 3.9131368937998774, + "grad_norm": 0.23738732933998108, + "learning_rate": 6.950437116619749e-05, + "loss": 1.7401, + "step": 12749 + }, + { + "epoch": 3.9134438305709023, + "grad_norm": 0.28192025423049927, + "learning_rate": 6.949979428853405e-05, + "loss": 1.8416, + "step": 12750 + }, + { + "epoch": 3.9137507673419276, + "grad_norm": 0.30579057335853577, + "learning_rate": 6.949521721816221e-05, + "loss": 1.7404, + "step": 12751 + }, + { + "epoch": 3.9140577041129525, + "grad_norm": 0.23972894251346588, + "learning_rate": 6.949063995512721e-05, + "loss": 1.7543, + "step": 12752 + }, + { + "epoch": 3.914364640883978, + "grad_norm": 0.2837793231010437, + "learning_rate": 6.94860624994743e-05, + "loss": 1.7779, + "step": 12753 + }, + { + "epoch": 3.914671577655003, + "grad_norm": 0.3344916105270386, + "learning_rate": 6.948148485124868e-05, + "loss": 1.7803, + "step": 12754 + }, + { + "epoch": 3.9149785144260285, + "grad_norm": 0.24271291494369507, + "learning_rate": 6.94769070104956e-05, + "loss": 1.7362, + "step": 12755 + }, + { + "epoch": 3.9152854511970534, + "grad_norm": 0.25299304723739624, + "learning_rate": 6.947232897726031e-05, + "loss": 1.7685, + "step": 12756 + }, + { + "epoch": 3.9155923879680787, + "grad_norm": 0.24766205251216888, + "learning_rate": 6.946775075158807e-05, + "loss": 1.829, + "step": 12757 + }, + { + "epoch": 3.9158993247391036, + "grad_norm": 0.2508428692817688, + "learning_rate": 6.94631723335241e-05, + "loss": 1.809, + "step": 12758 + }, + { + 
"epoch": 3.916206261510129, + "grad_norm": 0.2172096222639084, + "learning_rate": 6.945859372311365e-05, + "loss": 1.7376, + "step": 12759 + }, + { + "epoch": 3.9165131982811543, + "grad_norm": 0.28976425528526306, + "learning_rate": 6.945401492040198e-05, + "loss": 1.8229, + "step": 12760 + }, + { + "epoch": 3.916820135052179, + "grad_norm": 0.3528063893318176, + "learning_rate": 6.944943592543432e-05, + "loss": 1.7559, + "step": 12761 + }, + { + "epoch": 3.9171270718232045, + "grad_norm": 0.46312370896339417, + "learning_rate": 6.944485673825595e-05, + "loss": 1.7664, + "step": 12762 + }, + { + "epoch": 3.9174340085942294, + "grad_norm": 0.4466164708137512, + "learning_rate": 6.94402773589121e-05, + "loss": 1.7833, + "step": 12763 + }, + { + "epoch": 3.9177409453652547, + "grad_norm": 0.2637740969657898, + "learning_rate": 6.943569778744804e-05, + "loss": 1.818, + "step": 12764 + }, + { + "epoch": 3.91804788213628, + "grad_norm": 0.37515267729759216, + "learning_rate": 6.943111802390901e-05, + "loss": 1.7898, + "step": 12765 + }, + { + "epoch": 3.918354818907305, + "grad_norm": 0.45146289467811584, + "learning_rate": 6.942653806834029e-05, + "loss": 1.7797, + "step": 12766 + }, + { + "epoch": 3.9186617556783303, + "grad_norm": 0.2809859812259674, + "learning_rate": 6.942195792078712e-05, + "loss": 1.7836, + "step": 12767 + }, + { + "epoch": 3.918968692449355, + "grad_norm": 0.3606306314468384, + "learning_rate": 6.94173775812948e-05, + "loss": 1.7657, + "step": 12768 + }, + { + "epoch": 3.9192756292203805, + "grad_norm": 0.49528738856315613, + "learning_rate": 6.941279704990857e-05, + "loss": 1.7628, + "step": 12769 + }, + { + "epoch": 3.919582565991406, + "grad_norm": 0.3484322428703308, + "learning_rate": 6.940821632667371e-05, + "loss": 1.7939, + "step": 12770 + }, + { + "epoch": 3.919889502762431, + "grad_norm": 0.2479606419801712, + "learning_rate": 6.940363541163546e-05, + "loss": 1.813, + "step": 12771 + }, + { + "epoch": 3.920196439533456, + "grad_norm": 
0.3491765558719635, + "learning_rate": 6.939905430483911e-05, + "loss": 1.7338, + "step": 12772 + }, + { + "epoch": 3.9205033763044814, + "grad_norm": 0.291810005903244, + "learning_rate": 6.939447300632995e-05, + "loss": 1.7445, + "step": 12773 + }, + { + "epoch": 3.9208103130755063, + "grad_norm": 0.2467527985572815, + "learning_rate": 6.938989151615324e-05, + "loss": 1.8462, + "step": 12774 + }, + { + "epoch": 3.9211172498465316, + "grad_norm": 0.35656824707984924, + "learning_rate": 6.938530983435426e-05, + "loss": 1.7751, + "step": 12775 + }, + { + "epoch": 3.921424186617557, + "grad_norm": 0.31269776821136475, + "learning_rate": 6.938072796097828e-05, + "loss": 1.7714, + "step": 12776 + }, + { + "epoch": 3.921731123388582, + "grad_norm": 0.2082831859588623, + "learning_rate": 6.937614589607058e-05, + "loss": 1.7263, + "step": 12777 + }, + { + "epoch": 3.922038060159607, + "grad_norm": 0.27583765983581543, + "learning_rate": 6.937156363967646e-05, + "loss": 1.6822, + "step": 12778 + }, + { + "epoch": 3.922344996930632, + "grad_norm": 0.32773876190185547, + "learning_rate": 6.93669811918412e-05, + "loss": 1.7792, + "step": 12779 + }, + { + "epoch": 3.9226519337016574, + "grad_norm": 0.2583121657371521, + "learning_rate": 6.936239855261007e-05, + "loss": 1.7812, + "step": 12780 + }, + { + "epoch": 3.9229588704726828, + "grad_norm": 0.245570570230484, + "learning_rate": 6.935781572202836e-05, + "loss": 1.7252, + "step": 12781 + }, + { + "epoch": 3.9232658072437077, + "grad_norm": 0.2379419505596161, + "learning_rate": 6.935323270014138e-05, + "loss": 1.7485, + "step": 12782 + }, + { + "epoch": 3.923572744014733, + "grad_norm": 0.2239784598350525, + "learning_rate": 6.934864948699439e-05, + "loss": 1.7444, + "step": 12783 + }, + { + "epoch": 3.923879680785758, + "grad_norm": 0.2366618812084198, + "learning_rate": 6.934406608263274e-05, + "loss": 1.777, + "step": 12784 + }, + { + "epoch": 3.924186617556783, + "grad_norm": 0.22583791613578796, + "learning_rate": 
6.933948248710169e-05, + "loss": 1.7291, + "step": 12785 + }, + { + "epoch": 3.9244935543278086, + "grad_norm": 0.24141047894954681, + "learning_rate": 6.933489870044651e-05, + "loss": 1.7748, + "step": 12786 + }, + { + "epoch": 3.924800491098834, + "grad_norm": 0.2389962524175644, + "learning_rate": 6.933031472271255e-05, + "loss": 1.7957, + "step": 12787 + }, + { + "epoch": 3.925107427869859, + "grad_norm": 0.25230300426483154, + "learning_rate": 6.932573055394509e-05, + "loss": 1.7621, + "step": 12788 + }, + { + "epoch": 3.925414364640884, + "grad_norm": 0.23894043266773224, + "learning_rate": 6.932114619418941e-05, + "loss": 1.7285, + "step": 12789 + }, + { + "epoch": 3.925721301411909, + "grad_norm": 0.2650291919708252, + "learning_rate": 6.931656164349086e-05, + "loss": 1.7613, + "step": 12790 + }, + { + "epoch": 3.9260282381829343, + "grad_norm": 0.20616789162158966, + "learning_rate": 6.931197690189472e-05, + "loss": 1.7505, + "step": 12791 + }, + { + "epoch": 3.9263351749539597, + "grad_norm": 0.23915675282478333, + "learning_rate": 6.930739196944633e-05, + "loss": 1.7477, + "step": 12792 + }, + { + "epoch": 3.9266421117249846, + "grad_norm": 0.2522687613964081, + "learning_rate": 6.930280684619094e-05, + "loss": 1.8, + "step": 12793 + }, + { + "epoch": 3.92694904849601, + "grad_norm": 0.264167845249176, + "learning_rate": 6.929822153217391e-05, + "loss": 1.7516, + "step": 12794 + }, + { + "epoch": 3.927255985267035, + "grad_norm": 0.21358054876327515, + "learning_rate": 6.929363602744054e-05, + "loss": 1.7207, + "step": 12795 + }, + { + "epoch": 3.92756292203806, + "grad_norm": 0.25632721185684204, + "learning_rate": 6.928905033203617e-05, + "loss": 1.7446, + "step": 12796 + }, + { + "epoch": 3.9278698588090855, + "grad_norm": 0.2717185318470001, + "learning_rate": 6.928446444600608e-05, + "loss": 1.8555, + "step": 12797 + }, + { + "epoch": 3.9281767955801103, + "grad_norm": 0.2871767282485962, + "learning_rate": 6.927987836939561e-05, + "loss": 1.7861, + 
"step": 12798 + }, + { + "epoch": 3.9284837323511357, + "grad_norm": 0.282507061958313, + "learning_rate": 6.927529210225009e-05, + "loss": 1.7683, + "step": 12799 + }, + { + "epoch": 3.9287906691221606, + "grad_norm": 0.24870644509792328, + "learning_rate": 6.927070564461482e-05, + "loss": 1.7355, + "step": 12800 + }, + { + "epoch": 3.929097605893186, + "grad_norm": 0.2093631625175476, + "learning_rate": 6.926611899653516e-05, + "loss": 1.7691, + "step": 12801 + }, + { + "epoch": 3.9294045426642112, + "grad_norm": 0.34258076548576355, + "learning_rate": 6.926153215805642e-05, + "loss": 1.8398, + "step": 12802 + }, + { + "epoch": 3.9297114794352366, + "grad_norm": 0.39179500937461853, + "learning_rate": 6.925694512922391e-05, + "loss": 1.8229, + "step": 12803 + }, + { + "epoch": 3.9300184162062615, + "grad_norm": 0.36814743280410767, + "learning_rate": 6.9252357910083e-05, + "loss": 1.7759, + "step": 12804 + }, + { + "epoch": 3.930325352977287, + "grad_norm": 0.2659403085708618, + "learning_rate": 6.924777050067902e-05, + "loss": 1.7553, + "step": 12805 + }, + { + "epoch": 3.9306322897483117, + "grad_norm": 0.20617491006851196, + "learning_rate": 6.924318290105724e-05, + "loss": 1.7398, + "step": 12806 + }, + { + "epoch": 3.930939226519337, + "grad_norm": 0.23730522394180298, + "learning_rate": 6.923859511126309e-05, + "loss": 1.699, + "step": 12807 + }, + { + "epoch": 3.9312461632903624, + "grad_norm": 0.24865423142910004, + "learning_rate": 6.923400713134184e-05, + "loss": 1.7801, + "step": 12808 + }, + { + "epoch": 3.9315531000613873, + "grad_norm": 0.2495356798171997, + "learning_rate": 6.92294189613389e-05, + "loss": 1.803, + "step": 12809 + }, + { + "epoch": 3.9318600368324126, + "grad_norm": 0.24223244190216064, + "learning_rate": 6.922483060129955e-05, + "loss": 1.751, + "step": 12810 + }, + { + "epoch": 3.9321669736034375, + "grad_norm": 0.2541450262069702, + "learning_rate": 6.922024205126913e-05, + "loss": 1.7721, + "step": 12811 + }, + { + "epoch": 
3.932473910374463, + "grad_norm": 0.24528831243515015, + "learning_rate": 6.921565331129304e-05, + "loss": 1.792, + "step": 12812 + }, + { + "epoch": 3.932780847145488, + "grad_norm": 0.22789500653743744, + "learning_rate": 6.921106438141659e-05, + "loss": 1.8455, + "step": 12813 + }, + { + "epoch": 3.933087783916513, + "grad_norm": 0.26267170906066895, + "learning_rate": 6.920647526168515e-05, + "loss": 1.7254, + "step": 12814 + }, + { + "epoch": 3.9333947206875384, + "grad_norm": 0.23044808208942413, + "learning_rate": 6.920188595214406e-05, + "loss": 1.7217, + "step": 12815 + }, + { + "epoch": 3.9337016574585633, + "grad_norm": 0.2304011732339859, + "learning_rate": 6.919729645283867e-05, + "loss": 1.8121, + "step": 12816 + }, + { + "epoch": 3.9340085942295886, + "grad_norm": 0.21516792476177216, + "learning_rate": 6.919270676381435e-05, + "loss": 1.7305, + "step": 12817 + }, + { + "epoch": 3.934315531000614, + "grad_norm": 0.24698840081691742, + "learning_rate": 6.918811688511646e-05, + "loss": 1.7967, + "step": 12818 + }, + { + "epoch": 3.9346224677716393, + "grad_norm": 0.23132537305355072, + "learning_rate": 6.918352681679035e-05, + "loss": 1.7439, + "step": 12819 + }, + { + "epoch": 3.934929404542664, + "grad_norm": 0.2597793936729431, + "learning_rate": 6.917893655888139e-05, + "loss": 1.7882, + "step": 12820 + }, + { + "epoch": 3.9352363413136895, + "grad_norm": 0.23946607112884521, + "learning_rate": 6.917434611143493e-05, + "loss": 1.7991, + "step": 12821 + }, + { + "epoch": 3.9355432780847144, + "grad_norm": 0.25808244943618774, + "learning_rate": 6.916975547449634e-05, + "loss": 1.845, + "step": 12822 + }, + { + "epoch": 3.9358502148557397, + "grad_norm": 0.26082557439804077, + "learning_rate": 6.9165164648111e-05, + "loss": 1.7562, + "step": 12823 + }, + { + "epoch": 3.936157151626765, + "grad_norm": 0.24810053408145905, + "learning_rate": 6.916057363232425e-05, + "loss": 1.778, + "step": 12824 + }, + { + "epoch": 3.93646408839779, + "grad_norm": 
0.24168157577514648, + "learning_rate": 6.91559824271815e-05, + "loss": 1.7628, + "step": 12825 + }, + { + "epoch": 3.9367710251688153, + "grad_norm": 0.23800434172153473, + "learning_rate": 6.91513910327281e-05, + "loss": 1.8063, + "step": 12826 + }, + { + "epoch": 3.93707796193984, + "grad_norm": 0.23055073618888855, + "learning_rate": 6.914679944900944e-05, + "loss": 1.749, + "step": 12827 + }, + { + "epoch": 3.9373848987108655, + "grad_norm": 0.22455987334251404, + "learning_rate": 6.914220767607088e-05, + "loss": 1.7471, + "step": 12828 + }, + { + "epoch": 3.937691835481891, + "grad_norm": 0.21808378398418427, + "learning_rate": 6.913761571395778e-05, + "loss": 1.7503, + "step": 12829 + }, + { + "epoch": 3.937998772252916, + "grad_norm": 0.23136213421821594, + "learning_rate": 6.913302356271556e-05, + "loss": 1.752, + "step": 12830 + }, + { + "epoch": 3.938305709023941, + "grad_norm": 0.29579970240592957, + "learning_rate": 6.912843122238959e-05, + "loss": 1.8028, + "step": 12831 + }, + { + "epoch": 3.9386126457949664, + "grad_norm": 0.28578072786331177, + "learning_rate": 6.912383869302526e-05, + "loss": 1.8183, + "step": 12832 + }, + { + "epoch": 3.9389195825659913, + "grad_norm": 0.2616737186908722, + "learning_rate": 6.911924597466793e-05, + "loss": 1.8366, + "step": 12833 + }, + { + "epoch": 3.9392265193370166, + "grad_norm": 0.29275768995285034, + "learning_rate": 6.911465306736302e-05, + "loss": 1.731, + "step": 12834 + }, + { + "epoch": 3.939533456108042, + "grad_norm": 0.3300873041152954, + "learning_rate": 6.91100599711559e-05, + "loss": 1.8713, + "step": 12835 + }, + { + "epoch": 3.939840392879067, + "grad_norm": 0.2744643986225128, + "learning_rate": 6.910546668609195e-05, + "loss": 1.8479, + "step": 12836 + }, + { + "epoch": 3.940147329650092, + "grad_norm": 0.25248417258262634, + "learning_rate": 6.91008732122166e-05, + "loss": 1.7962, + "step": 12837 + }, + { + "epoch": 3.940454266421117, + "grad_norm": 0.3068546652793884, + "learning_rate": 
6.909627954957521e-05, + "loss": 1.759, + "step": 12838 + }, + { + "epoch": 3.9407612031921424, + "grad_norm": 0.3273559808731079, + "learning_rate": 6.909168569821321e-05, + "loss": 1.814, + "step": 12839 + }, + { + "epoch": 3.9410681399631677, + "grad_norm": 0.31192758679389954, + "learning_rate": 6.908709165817597e-05, + "loss": 1.7906, + "step": 12840 + }, + { + "epoch": 3.9413750767341926, + "grad_norm": 0.24487090110778809, + "learning_rate": 6.90824974295089e-05, + "loss": 1.8238, + "step": 12841 + }, + { + "epoch": 3.941682013505218, + "grad_norm": 0.24863721430301666, + "learning_rate": 6.907790301225743e-05, + "loss": 1.7651, + "step": 12842 + }, + { + "epoch": 3.941988950276243, + "grad_norm": 0.26555630564689636, + "learning_rate": 6.907330840646693e-05, + "loss": 1.8268, + "step": 12843 + }, + { + "epoch": 3.942295887047268, + "grad_norm": 0.2439817190170288, + "learning_rate": 6.906871361218281e-05, + "loss": 1.7291, + "step": 12844 + }, + { + "epoch": 3.9426028238182935, + "grad_norm": 0.2410304993391037, + "learning_rate": 6.906411862945048e-05, + "loss": 1.712, + "step": 12845 + }, + { + "epoch": 3.942909760589319, + "grad_norm": 0.28575149178504944, + "learning_rate": 6.905952345831537e-05, + "loss": 1.7269, + "step": 12846 + }, + { + "epoch": 3.9432166973603437, + "grad_norm": 0.3055815100669861, + "learning_rate": 6.905492809882286e-05, + "loss": 1.7234, + "step": 12847 + }, + { + "epoch": 3.943523634131369, + "grad_norm": 0.2762533724308014, + "learning_rate": 6.905033255101839e-05, + "loss": 1.7768, + "step": 12848 + }, + { + "epoch": 3.943830570902394, + "grad_norm": 0.22819125652313232, + "learning_rate": 6.904573681494738e-05, + "loss": 1.7416, + "step": 12849 + }, + { + "epoch": 3.9441375076734193, + "grad_norm": 0.21664194762706757, + "learning_rate": 6.904114089065523e-05, + "loss": 1.7506, + "step": 12850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.21935151517391205, + "learning_rate": 6.903654477818735e-05, + "loss": 
1.7522, + "step": 12851 + }, + { + "epoch": 3.9447513812154695, + "grad_norm": 0.2204175442457199, + "learning_rate": 6.903194847758918e-05, + "loss": 1.7753, + "step": 12852 + }, + { + "epoch": 3.945058317986495, + "grad_norm": 0.23130151629447937, + "learning_rate": 6.902735198890615e-05, + "loss": 1.7743, + "step": 12853 + }, + { + "epoch": 3.9453652547575198, + "grad_norm": 0.2548399567604065, + "learning_rate": 6.902275531218368e-05, + "loss": 1.8373, + "step": 12854 + }, + { + "epoch": 3.945672191528545, + "grad_norm": 0.2905479371547699, + "learning_rate": 6.901815844746718e-05, + "loss": 1.8336, + "step": 12855 + }, + { + "epoch": 3.9459791282995704, + "grad_norm": 0.2698945105075836, + "learning_rate": 6.90135613948021e-05, + "loss": 1.7498, + "step": 12856 + }, + { + "epoch": 3.9462860650705953, + "grad_norm": 0.24966828525066376, + "learning_rate": 6.900896415423387e-05, + "loss": 1.7664, + "step": 12857 + }, + { + "epoch": 3.9465930018416207, + "grad_norm": 0.23272784054279327, + "learning_rate": 6.90043667258079e-05, + "loss": 1.7742, + "step": 12858 + }, + { + "epoch": 3.9468999386126455, + "grad_norm": 0.2277698516845703, + "learning_rate": 6.899976910956965e-05, + "loss": 1.7465, + "step": 12859 + }, + { + "epoch": 3.947206875383671, + "grad_norm": 0.2376442402601242, + "learning_rate": 6.899517130556454e-05, + "loss": 1.7995, + "step": 12860 + }, + { + "epoch": 3.947513812154696, + "grad_norm": 0.25591593980789185, + "learning_rate": 6.899057331383802e-05, + "loss": 1.8017, + "step": 12861 + }, + { + "epoch": 3.9478207489257215, + "grad_norm": 0.2715262472629547, + "learning_rate": 6.898597513443551e-05, + "loss": 1.7967, + "step": 12862 + }, + { + "epoch": 3.9481276856967464, + "grad_norm": 0.20916256308555603, + "learning_rate": 6.898137676740246e-05, + "loss": 1.7711, + "step": 12863 + }, + { + "epoch": 3.9484346224677718, + "grad_norm": 0.2570229768753052, + "learning_rate": 6.897677821278435e-05, + "loss": 1.833, + "step": 12864 + }, + { + 
"epoch": 3.9487415592387967, + "grad_norm": 0.26343438029289246, + "learning_rate": 6.897217947062657e-05, + "loss": 1.7625, + "step": 12865 + }, + { + "epoch": 3.949048496009822, + "grad_norm": 0.23407024145126343, + "learning_rate": 6.896758054097459e-05, + "loss": 1.7211, + "step": 12866 + }, + { + "epoch": 3.9493554327808473, + "grad_norm": 0.2554715573787689, + "learning_rate": 6.896298142387387e-05, + "loss": 1.8548, + "step": 12867 + }, + { + "epoch": 3.949662369551872, + "grad_norm": 0.24143370985984802, + "learning_rate": 6.895838211936986e-05, + "loss": 1.7635, + "step": 12868 + }, + { + "epoch": 3.9499693063228976, + "grad_norm": 0.24634715914726257, + "learning_rate": 6.8953782627508e-05, + "loss": 1.8012, + "step": 12869 + }, + { + "epoch": 3.9502762430939224, + "grad_norm": 0.22740426659584045, + "learning_rate": 6.894918294833375e-05, + "loss": 1.7294, + "step": 12870 + }, + { + "epoch": 3.950583179864948, + "grad_norm": 0.2651631832122803, + "learning_rate": 6.894458308189257e-05, + "loss": 1.8289, + "step": 12871 + }, + { + "epoch": 3.950890116635973, + "grad_norm": 0.28693267703056335, + "learning_rate": 6.893998302822991e-05, + "loss": 1.8462, + "step": 12872 + }, + { + "epoch": 3.951197053406998, + "grad_norm": 0.26584213972091675, + "learning_rate": 6.893538278739125e-05, + "loss": 1.7621, + "step": 12873 + }, + { + "epoch": 3.9515039901780233, + "grad_norm": 0.29970669746398926, + "learning_rate": 6.893078235942203e-05, + "loss": 1.7659, + "step": 12874 + }, + { + "epoch": 3.9518109269490482, + "grad_norm": 0.2271152138710022, + "learning_rate": 6.892618174436771e-05, + "loss": 1.7151, + "step": 12875 + }, + { + "epoch": 3.9521178637200736, + "grad_norm": 0.24783682823181152, + "learning_rate": 6.892158094227379e-05, + "loss": 1.761, + "step": 12876 + }, + { + "epoch": 3.952424800491099, + "grad_norm": 0.2371140718460083, + "learning_rate": 6.891697995318573e-05, + "loss": 1.7557, + "step": 12877 + }, + { + "epoch": 3.9527317372621242, + 
"grad_norm": 0.29708394408226013, + "learning_rate": 6.891237877714896e-05, + "loss": 1.8629, + "step": 12878 + }, + { + "epoch": 3.953038674033149, + "grad_norm": 0.2724219262599945, + "learning_rate": 6.890777741420899e-05, + "loss": 1.7378, + "step": 12879 + }, + { + "epoch": 3.9533456108041745, + "grad_norm": 0.2227276861667633, + "learning_rate": 6.890317586441126e-05, + "loss": 1.6989, + "step": 12880 + }, + { + "epoch": 3.9536525475751993, + "grad_norm": 0.2546161413192749, + "learning_rate": 6.889857412780128e-05, + "loss": 1.8688, + "step": 12881 + }, + { + "epoch": 3.9539594843462247, + "grad_norm": 0.24882884323596954, + "learning_rate": 6.889397220442452e-05, + "loss": 1.8137, + "step": 12882 + }, + { + "epoch": 3.95426642111725, + "grad_norm": 0.2549113929271698, + "learning_rate": 6.888937009432644e-05, + "loss": 1.8366, + "step": 12883 + }, + { + "epoch": 3.954573357888275, + "grad_norm": 0.30032673478126526, + "learning_rate": 6.888476779755255e-05, + "loss": 1.8267, + "step": 12884 + }, + { + "epoch": 3.9548802946593002, + "grad_norm": 0.2887294292449951, + "learning_rate": 6.888016531414832e-05, + "loss": 1.8295, + "step": 12885 + }, + { + "epoch": 3.955187231430325, + "grad_norm": 0.2947406470775604, + "learning_rate": 6.88755626441592e-05, + "loss": 1.7713, + "step": 12886 + }, + { + "epoch": 3.9554941682013505, + "grad_norm": 0.2967108190059662, + "learning_rate": 6.887095978763072e-05, + "loss": 1.7636, + "step": 12887 + }, + { + "epoch": 3.955801104972376, + "grad_norm": 0.2495311200618744, + "learning_rate": 6.886635674460836e-05, + "loss": 1.8148, + "step": 12888 + }, + { + "epoch": 3.9561080417434007, + "grad_norm": 0.23367099463939667, + "learning_rate": 6.88617535151376e-05, + "loss": 1.7353, + "step": 12889 + }, + { + "epoch": 3.956414978514426, + "grad_norm": 0.36790570616722107, + "learning_rate": 6.885715009926395e-05, + "loss": 1.7853, + "step": 12890 + }, + { + "epoch": 3.9567219152854514, + "grad_norm": 0.5013020038604736, + 
"learning_rate": 6.885254649703287e-05, + "loss": 1.7923, + "step": 12891 + }, + { + "epoch": 3.9570288520564763, + "grad_norm": 0.4446276128292084, + "learning_rate": 6.884794270848988e-05, + "loss": 1.7504, + "step": 12892 + }, + { + "epoch": 3.9573357888275016, + "grad_norm": 0.2478526383638382, + "learning_rate": 6.88433387336805e-05, + "loss": 1.7629, + "step": 12893 + }, + { + "epoch": 3.957642725598527, + "grad_norm": 0.30111798644065857, + "learning_rate": 6.883873457265019e-05, + "loss": 1.8291, + "step": 12894 + }, + { + "epoch": 3.957949662369552, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.883413022544445e-05, + "loss": 1.7919, + "step": 12895 + }, + { + "epoch": 3.958256599140577, + "grad_norm": 0.2895318269729614, + "learning_rate": 6.882952569210881e-05, + "loss": 1.7467, + "step": 12896 + }, + { + "epoch": 3.958563535911602, + "grad_norm": 0.30391454696655273, + "learning_rate": 6.882492097268873e-05, + "loss": 1.8145, + "step": 12897 + }, + { + "epoch": 3.9588704726826274, + "grad_norm": 0.5033623576164246, + "learning_rate": 6.882031606722977e-05, + "loss": 1.8231, + "step": 12898 + }, + { + "epoch": 3.9591774094536527, + "grad_norm": 0.5351777672767639, + "learning_rate": 6.881571097577742e-05, + "loss": 1.807, + "step": 12899 + }, + { + "epoch": 3.9594843462246776, + "grad_norm": 0.35540491342544556, + "learning_rate": 6.881110569837719e-05, + "loss": 1.7626, + "step": 12900 + }, + { + "epoch": 3.959791282995703, + "grad_norm": 0.22447600960731506, + "learning_rate": 6.880650023507457e-05, + "loss": 1.7392, + "step": 12901 + }, + { + "epoch": 3.960098219766728, + "grad_norm": 0.44619202613830566, + "learning_rate": 6.88018945859151e-05, + "loss": 1.8138, + "step": 12902 + }, + { + "epoch": 3.960405156537753, + "grad_norm": 0.41381633281707764, + "learning_rate": 6.879728875094428e-05, + "loss": 1.7676, + "step": 12903 + }, + { + "epoch": 3.9607120933087785, + "grad_norm": 0.2601528465747833, + "learning_rate": 6.879268273020764e-05, 
+ "loss": 1.8406, + "step": 12904 + }, + { + "epoch": 3.961019030079804, + "grad_norm": 0.3309035003185272, + "learning_rate": 6.878807652375071e-05, + "loss": 1.7673, + "step": 12905 + }, + { + "epoch": 3.9613259668508287, + "grad_norm": 0.5281669497489929, + "learning_rate": 6.878347013161899e-05, + "loss": 1.7686, + "step": 12906 + }, + { + "epoch": 3.961632903621854, + "grad_norm": 0.5397645831108093, + "learning_rate": 6.8778863553858e-05, + "loss": 1.8575, + "step": 12907 + }, + { + "epoch": 3.961939840392879, + "grad_norm": 0.329485684633255, + "learning_rate": 6.877425679051327e-05, + "loss": 1.8185, + "step": 12908 + }, + { + "epoch": 3.9622467771639043, + "grad_norm": 0.3012789487838745, + "learning_rate": 6.876964984163034e-05, + "loss": 1.7962, + "step": 12909 + }, + { + "epoch": 3.9625537139349296, + "grad_norm": 0.5596817135810852, + "learning_rate": 6.876504270725472e-05, + "loss": 1.7972, + "step": 12910 + }, + { + "epoch": 3.9628606507059545, + "grad_norm": 0.5374729633331299, + "learning_rate": 6.876043538743197e-05, + "loss": 1.7863, + "step": 12911 + }, + { + "epoch": 3.96316758747698, + "grad_norm": 0.24617290496826172, + "learning_rate": 6.875582788220757e-05, + "loss": 1.7555, + "step": 12912 + }, + { + "epoch": 3.9634745242480047, + "grad_norm": 0.3493972420692444, + "learning_rate": 6.875122019162712e-05, + "loss": 1.8595, + "step": 12913 + }, + { + "epoch": 3.96378146101903, + "grad_norm": 0.4293089807033539, + "learning_rate": 6.874661231573609e-05, + "loss": 1.7647, + "step": 12914 + }, + { + "epoch": 3.9640883977900554, + "grad_norm": 0.30602574348449707, + "learning_rate": 6.874200425458006e-05, + "loss": 1.7122, + "step": 12915 + }, + { + "epoch": 3.9643953345610803, + "grad_norm": 0.22776013612747192, + "learning_rate": 6.873739600820457e-05, + "loss": 1.7136, + "step": 12916 + }, + { + "epoch": 3.9647022713321056, + "grad_norm": 0.3727327585220337, + "learning_rate": 6.873278757665513e-05, + "loss": 1.8314, + "step": 12917 + }, + { 
+ "epoch": 3.9650092081031305, + "grad_norm": 0.35110536217689514, + "learning_rate": 6.872817895997733e-05, + "loss": 1.7506, + "step": 12918 + }, + { + "epoch": 3.965316144874156, + "grad_norm": 0.275560587644577, + "learning_rate": 6.872357015821666e-05, + "loss": 1.7865, + "step": 12919 + }, + { + "epoch": 3.965623081645181, + "grad_norm": 0.2686980366706848, + "learning_rate": 6.871896117141873e-05, + "loss": 1.8431, + "step": 12920 + }, + { + "epoch": 3.9659300184162065, + "grad_norm": 0.3299664556980133, + "learning_rate": 6.871435199962901e-05, + "loss": 1.7988, + "step": 12921 + }, + { + "epoch": 3.9662369551872314, + "grad_norm": 0.2833637297153473, + "learning_rate": 6.870974264289313e-05, + "loss": 1.6993, + "step": 12922 + }, + { + "epoch": 3.9665438919582567, + "grad_norm": 0.25062620639801025, + "learning_rate": 6.870513310125659e-05, + "loss": 1.7814, + "step": 12923 + }, + { + "epoch": 3.9668508287292816, + "grad_norm": 0.26609909534454346, + "learning_rate": 6.870052337476498e-05, + "loss": 1.7871, + "step": 12924 + }, + { + "epoch": 3.967157765500307, + "grad_norm": 0.22760890424251556, + "learning_rate": 6.869591346346382e-05, + "loss": 1.7941, + "step": 12925 + }, + { + "epoch": 3.9674647022713323, + "grad_norm": 0.2845582067966461, + "learning_rate": 6.869130336739869e-05, + "loss": 1.8215, + "step": 12926 + }, + { + "epoch": 3.967771639042357, + "grad_norm": 0.254948228597641, + "learning_rate": 6.868669308661514e-05, + "loss": 1.7515, + "step": 12927 + }, + { + "epoch": 3.9680785758133825, + "grad_norm": 0.2372167855501175, + "learning_rate": 6.868208262115875e-05, + "loss": 1.7524, + "step": 12928 + }, + { + "epoch": 3.9683855125844074, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.867747197107506e-05, + "loss": 1.8139, + "step": 12929 + }, + { + "epoch": 3.9686924493554327, + "grad_norm": 0.2617839276790619, + "learning_rate": 6.867286113640965e-05, + "loss": 1.7388, + "step": 12930 + }, + { + "epoch": 3.968999386126458, + 
"grad_norm": 0.22749558091163635, + "learning_rate": 6.866825011720807e-05, + "loss": 1.7421, + "step": 12931 + }, + { + "epoch": 3.969306322897483, + "grad_norm": 0.27737462520599365, + "learning_rate": 6.86636389135159e-05, + "loss": 1.7977, + "step": 12932 + }, + { + "epoch": 3.9696132596685083, + "grad_norm": 0.3331063985824585, + "learning_rate": 6.865902752537871e-05, + "loss": 1.7925, + "step": 12933 + }, + { + "epoch": 3.969920196439533, + "grad_norm": 0.24229519069194794, + "learning_rate": 6.86544159528421e-05, + "loss": 1.7782, + "step": 12934 + }, + { + "epoch": 3.9702271332105585, + "grad_norm": 0.29494860768318176, + "learning_rate": 6.86498041959516e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 3.970534069981584, + "grad_norm": 0.26064008474349976, + "learning_rate": 6.86451922547528e-05, + "loss": 1.7161, + "step": 12936 + }, + { + "epoch": 3.970841006752609, + "grad_norm": 0.2656785547733307, + "learning_rate": 6.864058012929129e-05, + "loss": 1.8154, + "step": 12937 + }, + { + "epoch": 3.971147943523634, + "grad_norm": 0.21170997619628906, + "learning_rate": 6.863596781961263e-05, + "loss": 1.7614, + "step": 12938 + }, + { + "epoch": 3.9714548802946594, + "grad_norm": 0.21709072589874268, + "learning_rate": 6.863135532576241e-05, + "loss": 1.7896, + "step": 12939 + }, + { + "epoch": 3.9717618170656843, + "grad_norm": 0.2361367791891098, + "learning_rate": 6.862674264778623e-05, + "loss": 1.7775, + "step": 12940 + }, + { + "epoch": 3.9720687538367097, + "grad_norm": 0.22042550146579742, + "learning_rate": 6.862212978572967e-05, + "loss": 1.7781, + "step": 12941 + }, + { + "epoch": 3.972375690607735, + "grad_norm": 0.2535422146320343, + "learning_rate": 6.86175167396383e-05, + "loss": 1.7665, + "step": 12942 + }, + { + "epoch": 3.97268262737876, + "grad_norm": 0.23741906881332397, + "learning_rate": 6.861290350955771e-05, + "loss": 1.7829, + "step": 12943 + }, + { + "epoch": 3.972989564149785, + "grad_norm": 0.23789910972118378, + 
"learning_rate": 6.860829009553351e-05, + "loss": 1.7745, + "step": 12944 + }, + { + "epoch": 3.97329650092081, + "grad_norm": 0.26867765188217163, + "learning_rate": 6.860367649761127e-05, + "loss": 1.7239, + "step": 12945 + }, + { + "epoch": 3.9736034376918354, + "grad_norm": 0.3211663067340851, + "learning_rate": 6.85990627158366e-05, + "loss": 1.7976, + "step": 12946 + }, + { + "epoch": 3.9739103744628608, + "grad_norm": 0.26177310943603516, + "learning_rate": 6.85944487502551e-05, + "loss": 1.7446, + "step": 12947 + }, + { + "epoch": 3.9742173112338857, + "grad_norm": 0.23622745275497437, + "learning_rate": 6.858983460091234e-05, + "loss": 1.7824, + "step": 12948 + }, + { + "epoch": 3.974524248004911, + "grad_norm": 0.24372988939285278, + "learning_rate": 6.858522026785395e-05, + "loss": 1.8014, + "step": 12949 + }, + { + "epoch": 3.974831184775936, + "grad_norm": 0.2566998600959778, + "learning_rate": 6.85806057511255e-05, + "loss": 1.742, + "step": 12950 + }, + { + "epoch": 3.9751381215469612, + "grad_norm": 0.24418365955352783, + "learning_rate": 6.857599105077264e-05, + "loss": 1.7331, + "step": 12951 + }, + { + "epoch": 3.9754450583179866, + "grad_norm": 0.2260327935218811, + "learning_rate": 6.857137616684094e-05, + "loss": 1.7173, + "step": 12952 + }, + { + "epoch": 3.975751995089012, + "grad_norm": 0.277044415473938, + "learning_rate": 6.856676109937602e-05, + "loss": 1.7255, + "step": 12953 + }, + { + "epoch": 3.976058931860037, + "grad_norm": 0.228300079703331, + "learning_rate": 6.856214584842348e-05, + "loss": 1.7796, + "step": 12954 + }, + { + "epoch": 3.976365868631062, + "grad_norm": 0.2246638983488083, + "learning_rate": 6.855753041402893e-05, + "loss": 1.7458, + "step": 12955 + }, + { + "epoch": 3.976672805402087, + "grad_norm": 0.22235621511936188, + "learning_rate": 6.855291479623799e-05, + "loss": 1.7585, + "step": 12956 + }, + { + "epoch": 3.9769797421731123, + "grad_norm": 0.23710694909095764, + "learning_rate": 6.854829899509627e-05, + 
"loss": 1.767, + "step": 12957 + }, + { + "epoch": 3.9772866789441377, + "grad_norm": 0.2527346611022949, + "learning_rate": 6.854368301064939e-05, + "loss": 1.828, + "step": 12958 + }, + { + "epoch": 3.9775936157151626, + "grad_norm": 0.25032514333724976, + "learning_rate": 6.853906684294298e-05, + "loss": 1.8533, + "step": 12959 + }, + { + "epoch": 3.977900552486188, + "grad_norm": 0.2346320003271103, + "learning_rate": 6.853445049202262e-05, + "loss": 1.8046, + "step": 12960 + }, + { + "epoch": 3.978207489257213, + "grad_norm": 0.22576460242271423, + "learning_rate": 6.852983395793398e-05, + "loss": 1.7502, + "step": 12961 + }, + { + "epoch": 3.978514426028238, + "grad_norm": 0.2230147123336792, + "learning_rate": 6.852521724072266e-05, + "loss": 1.7362, + "step": 12962 + }, + { + "epoch": 3.9788213627992635, + "grad_norm": 0.2339705526828766, + "learning_rate": 6.852060034043425e-05, + "loss": 1.763, + "step": 12963 + }, + { + "epoch": 3.979128299570289, + "grad_norm": 0.24511271715164185, + "learning_rate": 6.851598325711446e-05, + "loss": 1.7988, + "step": 12964 + }, + { + "epoch": 3.9794352363413137, + "grad_norm": 0.2927285134792328, + "learning_rate": 6.851136599080885e-05, + "loss": 1.8346, + "step": 12965 + }, + { + "epoch": 3.979742173112339, + "grad_norm": 0.2593212425708771, + "learning_rate": 6.850674854156305e-05, + "loss": 1.7368, + "step": 12966 + }, + { + "epoch": 3.980049109883364, + "grad_norm": 0.3013291656970978, + "learning_rate": 6.850213090942275e-05, + "loss": 1.7911, + "step": 12967 + }, + { + "epoch": 3.9803560466543892, + "grad_norm": 0.3420047163963318, + "learning_rate": 6.849751309443352e-05, + "loss": 1.7899, + "step": 12968 + }, + { + "epoch": 3.9806629834254146, + "grad_norm": 0.2901746928691864, + "learning_rate": 6.849289509664105e-05, + "loss": 1.8244, + "step": 12969 + }, + { + "epoch": 3.9809699201964395, + "grad_norm": 0.2389298677444458, + "learning_rate": 6.848827691609093e-05, + "loss": 1.7116, + "step": 12970 + }, + { + 
"epoch": 3.981276856967465, + "grad_norm": 0.3153960704803467, + "learning_rate": 6.848365855282882e-05, + "loss": 1.7665, + "step": 12971 + }, + { + "epoch": 3.9815837937384897, + "grad_norm": 0.3162175118923187, + "learning_rate": 6.847904000690036e-05, + "loss": 1.7722, + "step": 12972 + }, + { + "epoch": 3.981890730509515, + "grad_norm": 0.27458643913269043, + "learning_rate": 6.847442127835122e-05, + "loss": 1.8095, + "step": 12973 + }, + { + "epoch": 3.9821976672805404, + "grad_norm": 0.22330710291862488, + "learning_rate": 6.846980236722699e-05, + "loss": 1.7179, + "step": 12974 + }, + { + "epoch": 3.9825046040515653, + "grad_norm": 0.2940923869609833, + "learning_rate": 6.846518327357339e-05, + "loss": 1.7363, + "step": 12975 + }, + { + "epoch": 3.9828115408225906, + "grad_norm": 0.26479849219322205, + "learning_rate": 6.846056399743599e-05, + "loss": 1.7788, + "step": 12976 + }, + { + "epoch": 3.9831184775936155, + "grad_norm": 0.24145057797431946, + "learning_rate": 6.845594453886048e-05, + "loss": 1.7825, + "step": 12977 + }, + { + "epoch": 3.983425414364641, + "grad_norm": 0.2795869708061218, + "learning_rate": 6.845132489789252e-05, + "loss": 1.7705, + "step": 12978 + }, + { + "epoch": 3.983732351135666, + "grad_norm": 0.3117202818393707, + "learning_rate": 6.844670507457776e-05, + "loss": 1.8183, + "step": 12979 + }, + { + "epoch": 3.9840392879066915, + "grad_norm": 0.2666899263858795, + "learning_rate": 6.844208506896184e-05, + "loss": 1.7434, + "step": 12980 + }, + { + "epoch": 3.9843462246777164, + "grad_norm": 0.24682332575321198, + "learning_rate": 6.843746488109042e-05, + "loss": 1.751, + "step": 12981 + }, + { + "epoch": 3.9846531614487417, + "grad_norm": 0.2558208703994751, + "learning_rate": 6.843284451100916e-05, + "loss": 1.7983, + "step": 12982 + }, + { + "epoch": 3.9849600982197666, + "grad_norm": 0.4236481189727783, + "learning_rate": 6.842822395876374e-05, + "loss": 1.8584, + "step": 12983 + }, + { + "epoch": 3.985267034990792, + 
"grad_norm": 0.4931485950946808, + "learning_rate": 6.84236032243998e-05, + "loss": 1.7617, + "step": 12984 + }, + { + "epoch": 3.9855739717618173, + "grad_norm": 0.37793654203414917, + "learning_rate": 6.841898230796302e-05, + "loss": 1.7411, + "step": 12985 + }, + { + "epoch": 3.985880908532842, + "grad_norm": 0.2093842774629593, + "learning_rate": 6.841436120949906e-05, + "loss": 1.772, + "step": 12986 + }, + { + "epoch": 3.9861878453038675, + "grad_norm": 0.4065552055835724, + "learning_rate": 6.840973992905359e-05, + "loss": 1.7675, + "step": 12987 + }, + { + "epoch": 3.9864947820748924, + "grad_norm": 0.5334183573722839, + "learning_rate": 6.840511846667228e-05, + "loss": 1.7872, + "step": 12988 + }, + { + "epoch": 3.9868017188459177, + "grad_norm": 0.378974974155426, + "learning_rate": 6.84004968224008e-05, + "loss": 1.8288, + "step": 12989 + }, + { + "epoch": 3.987108655616943, + "grad_norm": 0.22518309950828552, + "learning_rate": 6.839587499628483e-05, + "loss": 1.7715, + "step": 12990 + }, + { + "epoch": 3.987415592387968, + "grad_norm": 0.4270850718021393, + "learning_rate": 6.839125298837003e-05, + "loss": 1.7797, + "step": 12991 + }, + { + "epoch": 3.9877225291589933, + "grad_norm": 0.4629896879196167, + "learning_rate": 6.838663079870211e-05, + "loss": 1.7936, + "step": 12992 + }, + { + "epoch": 3.988029465930018, + "grad_norm": 0.29273948073387146, + "learning_rate": 6.838200842732672e-05, + "loss": 1.8264, + "step": 12993 + }, + { + "epoch": 3.9883364027010435, + "grad_norm": 0.31575852632522583, + "learning_rate": 6.837738587428954e-05, + "loss": 1.8043, + "step": 12994 + }, + { + "epoch": 3.988643339472069, + "grad_norm": 0.40602433681488037, + "learning_rate": 6.837276313963627e-05, + "loss": 1.7409, + "step": 12995 + }, + { + "epoch": 3.988950276243094, + "grad_norm": 0.23413142561912537, + "learning_rate": 6.836814022341259e-05, + "loss": 1.8585, + "step": 12996 + }, + { + "epoch": 3.989257213014119, + "grad_norm": 0.3518814444541931, + 
"learning_rate": 6.836351712566416e-05, + "loss": 1.7768, + "step": 12997 + }, + { + "epoch": 3.9895641497851444, + "grad_norm": 0.3811505436897278, + "learning_rate": 6.83588938464367e-05, + "loss": 1.7738, + "step": 12998 + }, + { + "epoch": 3.9898710865561693, + "grad_norm": 0.2516780197620392, + "learning_rate": 6.835427038577589e-05, + "loss": 1.7351, + "step": 12999 + }, + { + "epoch": 3.9901780233271946, + "grad_norm": 0.23704510927200317, + "learning_rate": 6.834964674372744e-05, + "loss": 1.7907, + "step": 13000 + }, + { + "epoch": 3.99048496009822, + "grad_norm": 0.2890201807022095, + "learning_rate": 6.8345022920337e-05, + "loss": 1.9546, + "step": 13001 + }, + { + "epoch": 3.990791896869245, + "grad_norm": 0.2678101360797882, + "learning_rate": 6.834039891565031e-05, + "loss": 1.7338, + "step": 13002 + }, + { + "epoch": 3.99109883364027, + "grad_norm": 0.31726256012916565, + "learning_rate": 6.833577472971304e-05, + "loss": 1.8464, + "step": 13003 + }, + { + "epoch": 3.991405770411295, + "grad_norm": 0.28112682700157166, + "learning_rate": 6.83311503625709e-05, + "loss": 1.7427, + "step": 13004 + }, + { + "epoch": 3.9917127071823204, + "grad_norm": 0.2651563584804535, + "learning_rate": 6.832652581426958e-05, + "loss": 1.8117, + "step": 13005 + }, + { + "epoch": 3.9920196439533457, + "grad_norm": 0.3095388114452362, + "learning_rate": 6.83219010848548e-05, + "loss": 1.8286, + "step": 13006 + }, + { + "epoch": 3.9923265807243706, + "grad_norm": 0.24704942107200623, + "learning_rate": 6.831727617437225e-05, + "loss": 1.77, + "step": 13007 + }, + { + "epoch": 3.992633517495396, + "grad_norm": 0.24868519604206085, + "learning_rate": 6.831265108286764e-05, + "loss": 1.8129, + "step": 13008 + }, + { + "epoch": 3.992940454266421, + "grad_norm": 0.26511049270629883, + "learning_rate": 6.830802581038669e-05, + "loss": 1.7539, + "step": 13009 + }, + { + "epoch": 3.993247391037446, + "grad_norm": 0.2823421061038971, + "learning_rate": 6.830340035697508e-05, + 
"loss": 1.8068, + "step": 13010 + }, + { + "epoch": 3.9935543278084715, + "grad_norm": 0.28526121377944946, + "learning_rate": 6.829877472267856e-05, + "loss": 1.764, + "step": 13011 + }, + { + "epoch": 3.993861264579497, + "grad_norm": 0.2576456069946289, + "learning_rate": 6.829414890754281e-05, + "loss": 1.728, + "step": 13012 + }, + { + "epoch": 3.9941682013505218, + "grad_norm": 0.27154842019081116, + "learning_rate": 6.828952291161356e-05, + "loss": 1.797, + "step": 13013 + }, + { + "epoch": 3.994475138121547, + "grad_norm": 0.3129710555076599, + "learning_rate": 6.828489673493652e-05, + "loss": 1.769, + "step": 13014 + }, + { + "epoch": 3.994782074892572, + "grad_norm": 0.40118902921676636, + "learning_rate": 6.828027037755742e-05, + "loss": 1.8029, + "step": 13015 + }, + { + "epoch": 3.9950890116635973, + "grad_norm": 0.33228442072868347, + "learning_rate": 6.827564383952197e-05, + "loss": 1.7295, + "step": 13016 + }, + { + "epoch": 3.9953959484346226, + "grad_norm": 0.218771830201149, + "learning_rate": 6.827101712087591e-05, + "loss": 1.7693, + "step": 13017 + }, + { + "epoch": 3.9957028852056475, + "grad_norm": 0.31354373693466187, + "learning_rate": 6.826639022166492e-05, + "loss": 1.743, + "step": 13018 + }, + { + "epoch": 3.996009821976673, + "grad_norm": 0.3584701418876648, + "learning_rate": 6.826176314193478e-05, + "loss": 1.7597, + "step": 13019 + }, + { + "epoch": 3.9963167587476978, + "grad_norm": 0.2692064344882965, + "learning_rate": 6.82571358817312e-05, + "loss": 1.7871, + "step": 13020 + }, + { + "epoch": 3.996623695518723, + "grad_norm": 0.3064020276069641, + "learning_rate": 6.825250844109987e-05, + "loss": 1.7858, + "step": 13021 + }, + { + "epoch": 3.9969306322897484, + "grad_norm": 0.29913413524627686, + "learning_rate": 6.824788082008657e-05, + "loss": 1.7773, + "step": 13022 + }, + { + "epoch": 3.9972375690607733, + "grad_norm": 0.2682165801525116, + "learning_rate": 6.824325301873703e-05, + "loss": 1.8321, + "step": 13023 + }, + { + 
"epoch": 3.9975445058317987, + "grad_norm": 0.3274376690387726, + "learning_rate": 6.823862503709694e-05, + "loss": 1.8514, + "step": 13024 + }, + { + "epoch": 3.9978514426028235, + "grad_norm": 0.29828041791915894, + "learning_rate": 6.823399687521211e-05, + "loss": 1.7923, + "step": 13025 + }, + { + "epoch": 3.998158379373849, + "grad_norm": 0.22339288890361786, + "learning_rate": 6.82293685331282e-05, + "loss": 1.756, + "step": 13026 + }, + { + "epoch": 3.998465316144874, + "grad_norm": 0.2254658192396164, + "learning_rate": 6.8224740010891e-05, + "loss": 1.7392, + "step": 13027 + }, + { + "epoch": 3.9987722529158995, + "grad_norm": 0.24932752549648285, + "learning_rate": 6.822011130854624e-05, + "loss": 1.7538, + "step": 13028 + }, + { + "epoch": 3.9990791896869244, + "grad_norm": 0.21429690718650818, + "learning_rate": 6.821548242613966e-05, + "loss": 1.7746, + "step": 13029 + }, + { + "epoch": 3.9993861264579498, + "grad_norm": 0.25503116846084595, + "learning_rate": 6.8210853363717e-05, + "loss": 1.814, + "step": 13030 + }, + { + "epoch": 3.9996930632289747, + "grad_norm": 0.23168155550956726, + "learning_rate": 6.820622412132402e-05, + "loss": 1.769, + "step": 13031 + }, + { + "epoch": 4.0, + "grad_norm": 0.2252223789691925, + "learning_rate": 6.820159469900645e-05, + "loss": 1.7782, + "step": 13032 + }, + { + "epoch": 4.000306936771025, + "grad_norm": 0.1996588408946991, + "learning_rate": 6.819696509681007e-05, + "loss": 1.6839, + "step": 13033 + }, + { + "epoch": 4.000613873542051, + "grad_norm": 0.22297053039073944, + "learning_rate": 6.81923353147806e-05, + "loss": 1.7767, + "step": 13034 + }, + { + "epoch": 4.000920810313075, + "grad_norm": 0.25867611169815063, + "learning_rate": 6.818770535296381e-05, + "loss": 1.8623, + "step": 13035 + }, + { + "epoch": 4.0012277470841005, + "grad_norm": 0.2173648178577423, + "learning_rate": 6.818307521140547e-05, + "loss": 1.8034, + "step": 13036 + }, + { + "epoch": 4.001534683855126, + "grad_norm": 
0.23634609580039978, + "learning_rate": 6.81784448901513e-05, + "loss": 1.7503, + "step": 13037 + }, + { + "epoch": 4.001841620626151, + "grad_norm": 0.2626810073852539, + "learning_rate": 6.81738143892471e-05, + "loss": 1.8116, + "step": 13038 + }, + { + "epoch": 4.0021485573971765, + "grad_norm": 0.27888983488082886, + "learning_rate": 6.816918370873861e-05, + "loss": 1.8032, + "step": 13039 + }, + { + "epoch": 4.002455494168202, + "grad_norm": 0.275038480758667, + "learning_rate": 6.816455284867162e-05, + "loss": 1.7445, + "step": 13040 + }, + { + "epoch": 4.002762430939226, + "grad_norm": 0.3475828170776367, + "learning_rate": 6.815992180909184e-05, + "loss": 1.7404, + "step": 13041 + }, + { + "epoch": 4.003069367710252, + "grad_norm": 0.27314287424087524, + "learning_rate": 6.815529059004507e-05, + "loss": 1.8333, + "step": 13042 + }, + { + "epoch": 4.003376304481277, + "grad_norm": 0.34846973419189453, + "learning_rate": 6.815065919157709e-05, + "loss": 1.7921, + "step": 13043 + }, + { + "epoch": 4.003683241252302, + "grad_norm": 0.4191788136959076, + "learning_rate": 6.814602761373365e-05, + "loss": 1.8018, + "step": 13044 + }, + { + "epoch": 4.003990178023328, + "grad_norm": 0.2655608057975769, + "learning_rate": 6.814139585656055e-05, + "loss": 1.7638, + "step": 13045 + }, + { + "epoch": 4.004297114794352, + "grad_norm": 0.25938618183135986, + "learning_rate": 6.813676392010353e-05, + "loss": 1.794, + "step": 13046 + }, + { + "epoch": 4.004604051565377, + "grad_norm": 0.3464813828468323, + "learning_rate": 6.813213180440837e-05, + "loss": 1.8662, + "step": 13047 + }, + { + "epoch": 4.004910988336403, + "grad_norm": 0.30185338854789734, + "learning_rate": 6.812749950952087e-05, + "loss": 1.8029, + "step": 13048 + }, + { + "epoch": 4.005217925107428, + "grad_norm": 0.23291908204555511, + "learning_rate": 6.812286703548678e-05, + "loss": 1.7365, + "step": 13049 + }, + { + "epoch": 4.005524861878453, + "grad_norm": 0.3542841374874115, + "learning_rate": 
6.811823438235189e-05, + "loss": 1.8674, + "step": 13050 + }, + { + "epoch": 4.005831798649478, + "grad_norm": 0.2914685606956482, + "learning_rate": 6.811360155016202e-05, + "loss": 1.8306, + "step": 13051 + }, + { + "epoch": 4.006138735420503, + "grad_norm": 0.24888737499713898, + "learning_rate": 6.810896853896289e-05, + "loss": 1.7767, + "step": 13052 + }, + { + "epoch": 4.0064456721915285, + "grad_norm": 0.2977537512779236, + "learning_rate": 6.810433534880033e-05, + "loss": 1.8227, + "step": 13053 + }, + { + "epoch": 4.006752608962554, + "grad_norm": 0.3367510735988617, + "learning_rate": 6.809970197972013e-05, + "loss": 1.734, + "step": 13054 + }, + { + "epoch": 4.007059545733579, + "grad_norm": 0.28098800778388977, + "learning_rate": 6.809506843176806e-05, + "loss": 1.7032, + "step": 13055 + }, + { + "epoch": 4.0073664825046045, + "grad_norm": 0.24016784131526947, + "learning_rate": 6.809043470498991e-05, + "loss": 1.7863, + "step": 13056 + }, + { + "epoch": 4.007673419275629, + "grad_norm": 0.2883957624435425, + "learning_rate": 6.808580079943148e-05, + "loss": 1.7342, + "step": 13057 + }, + { + "epoch": 4.007980356046654, + "grad_norm": 0.3069116473197937, + "learning_rate": 6.808116671513856e-05, + "loss": 1.8544, + "step": 13058 + }, + { + "epoch": 4.00828729281768, + "grad_norm": 0.24113236367702484, + "learning_rate": 6.807653245215697e-05, + "loss": 1.7692, + "step": 13059 + }, + { + "epoch": 4.008594229588705, + "grad_norm": 0.2651619017124176, + "learning_rate": 6.807189801053249e-05, + "loss": 1.8096, + "step": 13060 + }, + { + "epoch": 4.00890116635973, + "grad_norm": 0.2636481523513794, + "learning_rate": 6.806726339031092e-05, + "loss": 1.8062, + "step": 13061 + }, + { + "epoch": 4.009208103130755, + "grad_norm": 0.22691169381141663, + "learning_rate": 6.806262859153807e-05, + "loss": 1.7001, + "step": 13062 + }, + { + "epoch": 4.00951503990178, + "grad_norm": 0.23288170993328094, + "learning_rate": 6.805799361425972e-05, + "loss": 1.7508, + 
"step": 13063 + }, + { + "epoch": 4.009821976672805, + "grad_norm": 0.243272602558136, + "learning_rate": 6.80533584585217e-05, + "loss": 1.7797, + "step": 13064 + }, + { + "epoch": 4.010128913443831, + "grad_norm": 0.24594646692276, + "learning_rate": 6.80487231243698e-05, + "loss": 1.7894, + "step": 13065 + }, + { + "epoch": 4.010435850214856, + "grad_norm": 0.21726086735725403, + "learning_rate": 6.804408761184986e-05, + "loss": 1.7472, + "step": 13066 + }, + { + "epoch": 4.0107427869858805, + "grad_norm": 0.2262321561574936, + "learning_rate": 6.803945192100767e-05, + "loss": 1.7563, + "step": 13067 + }, + { + "epoch": 4.011049723756906, + "grad_norm": 0.2449522763490677, + "learning_rate": 6.803481605188903e-05, + "loss": 1.7282, + "step": 13068 + }, + { + "epoch": 4.011356660527931, + "grad_norm": 0.2281760573387146, + "learning_rate": 6.803018000453975e-05, + "loss": 1.8191, + "step": 13069 + }, + { + "epoch": 4.0116635972989565, + "grad_norm": 0.3039850890636444, + "learning_rate": 6.80255437790057e-05, + "loss": 1.8258, + "step": 13070 + }, + { + "epoch": 4.011970534069982, + "grad_norm": 0.3978467881679535, + "learning_rate": 6.802090737533264e-05, + "loss": 1.7338, + "step": 13071 + }, + { + "epoch": 4.012277470841007, + "grad_norm": 0.29175812005996704, + "learning_rate": 6.801627079356641e-05, + "loss": 1.7754, + "step": 13072 + }, + { + "epoch": 4.012584407612032, + "grad_norm": 0.24228449165821075, + "learning_rate": 6.801163403375285e-05, + "loss": 1.7624, + "step": 13073 + }, + { + "epoch": 4.012891344383057, + "grad_norm": 0.34527531266212463, + "learning_rate": 6.800699709593776e-05, + "loss": 1.87, + "step": 13074 + }, + { + "epoch": 4.013198281154082, + "grad_norm": 0.1995161920785904, + "learning_rate": 6.800235998016696e-05, + "loss": 1.7253, + "step": 13075 + }, + { + "epoch": 4.013505217925108, + "grad_norm": 0.3509151339530945, + "learning_rate": 6.799772268648628e-05, + "loss": 1.8013, + "step": 13076 + }, + { + "epoch": 
4.013812154696133, + "grad_norm": 0.38569679856300354, + "learning_rate": 6.799308521494156e-05, + "loss": 1.7761, + "step": 13077 + }, + { + "epoch": 4.014119091467157, + "grad_norm": 0.2636256814002991, + "learning_rate": 6.798844756557865e-05, + "loss": 1.8101, + "step": 13078 + }, + { + "epoch": 4.014426028238183, + "grad_norm": 0.2570696473121643, + "learning_rate": 6.798380973844335e-05, + "loss": 1.7561, + "step": 13079 + }, + { + "epoch": 4.014732965009208, + "grad_norm": 0.38540002703666687, + "learning_rate": 6.797917173358148e-05, + "loss": 1.7893, + "step": 13080 + }, + { + "epoch": 4.015039901780233, + "grad_norm": 0.2974525988101959, + "learning_rate": 6.79745335510389e-05, + "loss": 1.8331, + "step": 13081 + }, + { + "epoch": 4.015346838551259, + "grad_norm": 0.2563362419605255, + "learning_rate": 6.796989519086146e-05, + "loss": 1.7784, + "step": 13082 + }, + { + "epoch": 4.015653775322283, + "grad_norm": 0.37037795782089233, + "learning_rate": 6.7965256653095e-05, + "loss": 1.7947, + "step": 13083 + }, + { + "epoch": 4.0159607120933085, + "grad_norm": 0.4145336449146271, + "learning_rate": 6.796061793778531e-05, + "loss": 1.7633, + "step": 13084 + }, + { + "epoch": 4.016267648864334, + "grad_norm": 0.32278406620025635, + "learning_rate": 6.795597904497828e-05, + "loss": 1.7827, + "step": 13085 + }, + { + "epoch": 4.016574585635359, + "grad_norm": 0.26466837525367737, + "learning_rate": 6.795133997471974e-05, + "loss": 1.7441, + "step": 13086 + }, + { + "epoch": 4.0168815224063845, + "grad_norm": 0.3212043344974518, + "learning_rate": 6.794670072705553e-05, + "loss": 1.7602, + "step": 13087 + }, + { + "epoch": 4.01718845917741, + "grad_norm": 0.3054736852645874, + "learning_rate": 6.79420613020315e-05, + "loss": 1.7417, + "step": 13088 + }, + { + "epoch": 4.017495395948434, + "grad_norm": 0.22281476855278015, + "learning_rate": 6.793742169969351e-05, + "loss": 1.7675, + "step": 13089 + }, + { + "epoch": 4.01780233271946, + "grad_norm": 
0.32630839943885803, + "learning_rate": 6.793278192008742e-05, + "loss": 1.8409, + "step": 13090 + }, + { + "epoch": 4.018109269490485, + "grad_norm": 0.2658778429031372, + "learning_rate": 6.792814196325905e-05, + "loss": 1.7718, + "step": 13091 + }, + { + "epoch": 4.01841620626151, + "grad_norm": 0.24016901850700378, + "learning_rate": 6.792350182925429e-05, + "loss": 1.8393, + "step": 13092 + }, + { + "epoch": 4.018723143032536, + "grad_norm": 0.2882223427295685, + "learning_rate": 6.791886151811897e-05, + "loss": 1.7497, + "step": 13093 + }, + { + "epoch": 4.01903007980356, + "grad_norm": 0.24340751767158508, + "learning_rate": 6.791422102989895e-05, + "loss": 1.72, + "step": 13094 + }, + { + "epoch": 4.019337016574585, + "grad_norm": 0.235665962100029, + "learning_rate": 6.79095803646401e-05, + "loss": 1.7269, + "step": 13095 + }, + { + "epoch": 4.019643953345611, + "grad_norm": 0.32772955298423767, + "learning_rate": 6.79049395223883e-05, + "loss": 1.7916, + "step": 13096 + }, + { + "epoch": 4.019950890116636, + "grad_norm": 0.3189625144004822, + "learning_rate": 6.790029850318938e-05, + "loss": 1.7571, + "step": 13097 + }, + { + "epoch": 4.020257826887661, + "grad_norm": 0.2211185097694397, + "learning_rate": 6.789565730708921e-05, + "loss": 1.793, + "step": 13098 + }, + { + "epoch": 4.020564763658686, + "grad_norm": 0.2840392291545868, + "learning_rate": 6.789101593413367e-05, + "loss": 1.7434, + "step": 13099 + }, + { + "epoch": 4.020871700429711, + "grad_norm": 0.27857357263565063, + "learning_rate": 6.788637438436863e-05, + "loss": 1.742, + "step": 13100 + }, + { + "epoch": 4.0211786372007365, + "grad_norm": 0.314628005027771, + "learning_rate": 6.788173265783996e-05, + "loss": 1.7881, + "step": 13101 + }, + { + "epoch": 4.021485573971762, + "grad_norm": 0.2994774580001831, + "learning_rate": 6.787709075459352e-05, + "loss": 1.7741, + "step": 13102 + }, + { + "epoch": 4.021792510742787, + "grad_norm": 0.3256312310695648, + "learning_rate": 
6.787244867467519e-05, + "loss": 1.7758, + "step": 13103 + }, + { + "epoch": 4.0220994475138125, + "grad_norm": 0.2332412451505661, + "learning_rate": 6.786780641813083e-05, + "loss": 1.7654, + "step": 13104 + }, + { + "epoch": 4.022406384284837, + "grad_norm": 0.23226258158683777, + "learning_rate": 6.786316398500636e-05, + "loss": 1.7605, + "step": 13105 + }, + { + "epoch": 4.022713321055862, + "grad_norm": 0.24631965160369873, + "learning_rate": 6.785852137534763e-05, + "loss": 1.7469, + "step": 13106 + }, + { + "epoch": 4.023020257826888, + "grad_norm": 0.1969226449728012, + "learning_rate": 6.785387858920051e-05, + "loss": 1.8151, + "step": 13107 + }, + { + "epoch": 4.023327194597913, + "grad_norm": 0.22769485414028168, + "learning_rate": 6.784923562661091e-05, + "loss": 1.7024, + "step": 13108 + }, + { + "epoch": 4.023634131368938, + "grad_norm": 0.2174670249223709, + "learning_rate": 6.78445924876247e-05, + "loss": 1.8094, + "step": 13109 + }, + { + "epoch": 4.023941068139963, + "grad_norm": 0.2606858015060425, + "learning_rate": 6.783994917228775e-05, + "loss": 1.8043, + "step": 13110 + }, + { + "epoch": 4.024248004910988, + "grad_norm": 0.24721349775791168, + "learning_rate": 6.783530568064599e-05, + "loss": 1.842, + "step": 13111 + }, + { + "epoch": 4.024554941682013, + "grad_norm": 0.2353603094816208, + "learning_rate": 6.783066201274529e-05, + "loss": 1.76, + "step": 13112 + }, + { + "epoch": 4.024861878453039, + "grad_norm": 0.22285830974578857, + "learning_rate": 6.782601816863153e-05, + "loss": 1.8014, + "step": 13113 + }, + { + "epoch": 4.025168815224064, + "grad_norm": 0.2482440173625946, + "learning_rate": 6.782137414835061e-05, + "loss": 1.7552, + "step": 13114 + }, + { + "epoch": 4.0254757519950894, + "grad_norm": 0.19926191866397858, + "learning_rate": 6.781672995194842e-05, + "loss": 1.7549, + "step": 13115 + }, + { + "epoch": 4.025782688766114, + "grad_norm": 0.2342877984046936, + "learning_rate": 6.781208557947086e-05, + "loss": 1.8622, + 
"step": 13116 + }, + { + "epoch": 4.026089625537139, + "grad_norm": 0.24096547067165375, + "learning_rate": 6.780744103096382e-05, + "loss": 1.7795, + "step": 13117 + }, + { + "epoch": 4.026396562308165, + "grad_norm": 0.23714657127857208, + "learning_rate": 6.780279630647322e-05, + "loss": 1.799, + "step": 13118 + }, + { + "epoch": 4.02670349907919, + "grad_norm": 0.28252026438713074, + "learning_rate": 6.779815140604496e-05, + "loss": 1.7573, + "step": 13119 + }, + { + "epoch": 4.027010435850215, + "grad_norm": 0.28028404712677, + "learning_rate": 6.779350632972493e-05, + "loss": 1.8103, + "step": 13120 + }, + { + "epoch": 4.02731737262124, + "grad_norm": 0.21088312566280365, + "learning_rate": 6.778886107755904e-05, + "loss": 1.7169, + "step": 13121 + }, + { + "epoch": 4.027624309392265, + "grad_norm": 0.22282038629055023, + "learning_rate": 6.77842156495932e-05, + "loss": 1.7206, + "step": 13122 + }, + { + "epoch": 4.02793124616329, + "grad_norm": 0.3281327784061432, + "learning_rate": 6.777957004587331e-05, + "loss": 1.8664, + "step": 13123 + }, + { + "epoch": 4.028238182934316, + "grad_norm": 0.29496827721595764, + "learning_rate": 6.77749242664453e-05, + "loss": 1.7532, + "step": 13124 + }, + { + "epoch": 4.028545119705341, + "grad_norm": 0.25299328565597534, + "learning_rate": 6.777027831135508e-05, + "loss": 1.7836, + "step": 13125 + }, + { + "epoch": 4.0288520564763655, + "grad_norm": 0.3000280559062958, + "learning_rate": 6.776563218064854e-05, + "loss": 1.8079, + "step": 13126 + }, + { + "epoch": 4.029158993247391, + "grad_norm": 0.3613673448562622, + "learning_rate": 6.77609858743716e-05, + "loss": 1.7931, + "step": 13127 + }, + { + "epoch": 4.029465930018416, + "grad_norm": 0.25613468885421753, + "learning_rate": 6.77563393925702e-05, + "loss": 1.7522, + "step": 13128 + }, + { + "epoch": 4.0297728667894415, + "grad_norm": 0.24391578137874603, + "learning_rate": 6.775169273529026e-05, + "loss": 1.818, + "step": 13129 + }, + { + "epoch": 
4.030079803560467, + "grad_norm": 0.2806173264980316, + "learning_rate": 6.774704590257768e-05, + "loss": 1.7349, + "step": 13130 + }, + { + "epoch": 4.030386740331492, + "grad_norm": 0.22214172780513763, + "learning_rate": 6.774239889447838e-05, + "loss": 1.759, + "step": 13131 + }, + { + "epoch": 4.030693677102517, + "grad_norm": 0.27285513281822205, + "learning_rate": 6.773775171103828e-05, + "loss": 1.742, + "step": 13132 + }, + { + "epoch": 4.031000613873542, + "grad_norm": 0.22302402555942535, + "learning_rate": 6.773310435230334e-05, + "loss": 1.7277, + "step": 13133 + }, + { + "epoch": 4.031307550644567, + "grad_norm": 0.2350187450647354, + "learning_rate": 6.772845681831947e-05, + "loss": 1.8648, + "step": 13134 + }, + { + "epoch": 4.031614487415593, + "grad_norm": 0.2665547728538513, + "learning_rate": 6.772380910913261e-05, + "loss": 1.776, + "step": 13135 + }, + { + "epoch": 4.031921424186618, + "grad_norm": 0.30652403831481934, + "learning_rate": 6.771916122478867e-05, + "loss": 1.7884, + "step": 13136 + }, + { + "epoch": 4.032228360957642, + "grad_norm": 0.29372814297676086, + "learning_rate": 6.771451316533359e-05, + "loss": 1.8203, + "step": 13137 + }, + { + "epoch": 4.032535297728668, + "grad_norm": 0.2244873046875, + "learning_rate": 6.770986493081329e-05, + "loss": 1.7869, + "step": 13138 + }, + { + "epoch": 4.032842234499693, + "grad_norm": 0.25075265765190125, + "learning_rate": 6.770521652127375e-05, + "loss": 1.772, + "step": 13139 + }, + { + "epoch": 4.033149171270718, + "grad_norm": 0.28118211030960083, + "learning_rate": 6.770056793676087e-05, + "loss": 1.7922, + "step": 13140 + }, + { + "epoch": 4.033456108041744, + "grad_norm": 0.25199100375175476, + "learning_rate": 6.769591917732062e-05, + "loss": 1.7526, + "step": 13141 + }, + { + "epoch": 4.033763044812768, + "grad_norm": 0.2920379638671875, + "learning_rate": 6.769127024299892e-05, + "loss": 1.8365, + "step": 13142 + }, + { + "epoch": 4.0340699815837935, + "grad_norm": 
0.23018018901348114, + "learning_rate": 6.768662113384171e-05, + "loss": 1.7411, + "step": 13143 + }, + { + "epoch": 4.034376918354819, + "grad_norm": 0.23253841698169708, + "learning_rate": 6.768197184989494e-05, + "loss": 1.7921, + "step": 13144 + }, + { + "epoch": 4.034683855125844, + "grad_norm": 0.22618864476680756, + "learning_rate": 6.767732239120456e-05, + "loss": 1.7421, + "step": 13145 + }, + { + "epoch": 4.0349907918968695, + "grad_norm": 0.24552187323570251, + "learning_rate": 6.767267275781655e-05, + "loss": 1.7299, + "step": 13146 + }, + { + "epoch": 4.035297728667895, + "grad_norm": 0.22562766075134277, + "learning_rate": 6.76680229497768e-05, + "loss": 1.766, + "step": 13147 + }, + { + "epoch": 4.035604665438919, + "grad_norm": 0.28718629479408264, + "learning_rate": 6.76633729671313e-05, + "loss": 1.7366, + "step": 13148 + }, + { + "epoch": 4.035911602209945, + "grad_norm": 0.38769885897636414, + "learning_rate": 6.765872280992598e-05, + "loss": 1.8244, + "step": 13149 + }, + { + "epoch": 4.03621853898097, + "grad_norm": 0.4232725501060486, + "learning_rate": 6.765407247820683e-05, + "loss": 1.8244, + "step": 13150 + }, + { + "epoch": 4.036525475751995, + "grad_norm": 0.2771088778972626, + "learning_rate": 6.764942197201977e-05, + "loss": 1.7863, + "step": 13151 + }, + { + "epoch": 4.036832412523021, + "grad_norm": 0.2917862832546234, + "learning_rate": 6.76447712914108e-05, + "loss": 1.791, + "step": 13152 + }, + { + "epoch": 4.037139349294045, + "grad_norm": 0.37355467677116394, + "learning_rate": 6.764012043642584e-05, + "loss": 1.74, + "step": 13153 + }, + { + "epoch": 4.03744628606507, + "grad_norm": 0.35664018988609314, + "learning_rate": 6.763546940711089e-05, + "loss": 1.7734, + "step": 13154 + }, + { + "epoch": 4.037753222836096, + "grad_norm": 0.2335754930973053, + "learning_rate": 6.763081820351188e-05, + "loss": 1.7765, + "step": 13155 + }, + { + "epoch": 4.038060159607121, + "grad_norm": 0.2825562357902527, + "learning_rate": 
6.762616682567478e-05, + "loss": 1.7867, + "step": 13156 + }, + { + "epoch": 4.038367096378146, + "grad_norm": 0.3103202283382416, + "learning_rate": 6.762151527364559e-05, + "loss": 1.7331, + "step": 13157 + }, + { + "epoch": 4.038674033149171, + "grad_norm": 0.2897353172302246, + "learning_rate": 6.761686354747025e-05, + "loss": 1.7638, + "step": 13158 + }, + { + "epoch": 4.038980969920196, + "grad_norm": 0.21260851621627808, + "learning_rate": 6.761221164719474e-05, + "loss": 1.7302, + "step": 13159 + }, + { + "epoch": 4.0392879066912215, + "grad_norm": 0.2878021001815796, + "learning_rate": 6.760755957286503e-05, + "loss": 1.7368, + "step": 13160 + }, + { + "epoch": 4.039594843462247, + "grad_norm": 0.2785978317260742, + "learning_rate": 6.76029073245271e-05, + "loss": 1.7258, + "step": 13161 + }, + { + "epoch": 4.039901780233272, + "grad_norm": 0.1963953971862793, + "learning_rate": 6.759825490222692e-05, + "loss": 1.755, + "step": 13162 + }, + { + "epoch": 4.0402087170042975, + "grad_norm": 0.26776790618896484, + "learning_rate": 6.759360230601047e-05, + "loss": 1.7676, + "step": 13163 + }, + { + "epoch": 4.040515653775322, + "grad_norm": 0.2751332223415375, + "learning_rate": 6.758894953592373e-05, + "loss": 1.7313, + "step": 13164 + }, + { + "epoch": 4.040822590546347, + "grad_norm": 0.2339213341474533, + "learning_rate": 6.758429659201269e-05, + "loss": 1.714, + "step": 13165 + }, + { + "epoch": 4.041129527317373, + "grad_norm": 0.2624664008617401, + "learning_rate": 6.75796434743233e-05, + "loss": 1.8296, + "step": 13166 + }, + { + "epoch": 4.041436464088398, + "grad_norm": 0.40156883001327515, + "learning_rate": 6.757499018290159e-05, + "loss": 1.8228, + "step": 13167 + }, + { + "epoch": 4.041743400859423, + "grad_norm": 0.32976576685905457, + "learning_rate": 6.757033671779352e-05, + "loss": 1.7403, + "step": 13168 + }, + { + "epoch": 4.042050337630448, + "grad_norm": 0.2343887835741043, + "learning_rate": 6.756568307904508e-05, + "loss": 1.7837, + 
"step": 13169 + }, + { + "epoch": 4.042357274401473, + "grad_norm": 0.36174145340919495, + "learning_rate": 6.756102926670227e-05, + "loss": 1.7291, + "step": 13170 + }, + { + "epoch": 4.042664211172498, + "grad_norm": 0.3324793577194214, + "learning_rate": 6.755637528081108e-05, + "loss": 1.7414, + "step": 13171 + }, + { + "epoch": 4.042971147943524, + "grad_norm": 0.21945348381996155, + "learning_rate": 6.75517211214175e-05, + "loss": 1.7762, + "step": 13172 + }, + { + "epoch": 4.043278084714549, + "grad_norm": 0.31069812178611755, + "learning_rate": 6.75470667885675e-05, + "loss": 1.7666, + "step": 13173 + }, + { + "epoch": 4.043585021485574, + "grad_norm": 0.3931153118610382, + "learning_rate": 6.754241228230713e-05, + "loss": 1.7871, + "step": 13174 + }, + { + "epoch": 4.043891958256599, + "grad_norm": 0.25559595227241516, + "learning_rate": 6.753775760268234e-05, + "loss": 1.7916, + "step": 13175 + }, + { + "epoch": 4.044198895027624, + "grad_norm": 0.3686937391757965, + "learning_rate": 6.753310274973917e-05, + "loss": 1.7642, + "step": 13176 + }, + { + "epoch": 4.0445058317986495, + "grad_norm": 0.4793247580528259, + "learning_rate": 6.75284477235236e-05, + "loss": 1.739, + "step": 13177 + }, + { + "epoch": 4.044812768569675, + "grad_norm": 0.36179354786872864, + "learning_rate": 6.752379252408164e-05, + "loss": 1.7993, + "step": 13178 + }, + { + "epoch": 4.0451197053407, + "grad_norm": 0.22559234499931335, + "learning_rate": 6.751913715145926e-05, + "loss": 1.7401, + "step": 13179 + }, + { + "epoch": 4.045426642111725, + "grad_norm": 0.29058873653411865, + "learning_rate": 6.751448160570253e-05, + "loss": 1.8089, + "step": 13180 + }, + { + "epoch": 4.04573357888275, + "grad_norm": 0.3069808781147003, + "learning_rate": 6.750982588685742e-05, + "loss": 1.7587, + "step": 13181 + }, + { + "epoch": 4.046040515653775, + "grad_norm": 0.2292155921459198, + "learning_rate": 6.750516999496994e-05, + "loss": 1.7429, + "step": 13182 + }, + { + "epoch": 
4.046347452424801, + "grad_norm": 0.2520677149295807, + "learning_rate": 6.750051393008612e-05, + "loss": 1.7842, + "step": 13183 + }, + { + "epoch": 4.046654389195826, + "grad_norm": 0.32546502351760864, + "learning_rate": 6.749585769225194e-05, + "loss": 1.8057, + "step": 13184 + }, + { + "epoch": 4.04696132596685, + "grad_norm": 0.27634644508361816, + "learning_rate": 6.749120128151346e-05, + "loss": 1.7708, + "step": 13185 + }, + { + "epoch": 4.047268262737876, + "grad_norm": 0.2546750009059906, + "learning_rate": 6.748654469791668e-05, + "loss": 1.8744, + "step": 13186 + }, + { + "epoch": 4.047575199508901, + "grad_norm": 0.43873605132102966, + "learning_rate": 6.748188794150761e-05, + "loss": 1.8573, + "step": 13187 + }, + { + "epoch": 4.047882136279926, + "grad_norm": 0.45526960492134094, + "learning_rate": 6.747723101233227e-05, + "loss": 1.7761, + "step": 13188 + }, + { + "epoch": 4.048189073050952, + "grad_norm": 0.24995557963848114, + "learning_rate": 6.74725739104367e-05, + "loss": 1.7679, + "step": 13189 + }, + { + "epoch": 4.048496009821977, + "grad_norm": 0.3203068971633911, + "learning_rate": 6.74679166358669e-05, + "loss": 1.7772, + "step": 13190 + }, + { + "epoch": 4.0488029465930016, + "grad_norm": 0.37020671367645264, + "learning_rate": 6.746325918866893e-05, + "loss": 1.8002, + "step": 13191 + }, + { + "epoch": 4.049109883364027, + "grad_norm": 0.2543959319591522, + "learning_rate": 6.745860156888878e-05, + "loss": 1.8057, + "step": 13192 + }, + { + "epoch": 4.049416820135052, + "grad_norm": 0.2566509246826172, + "learning_rate": 6.74539437765725e-05, + "loss": 1.7853, + "step": 13193 + }, + { + "epoch": 4.0497237569060776, + "grad_norm": 0.2545804977416992, + "learning_rate": 6.744928581176612e-05, + "loss": 1.8136, + "step": 13194 + }, + { + "epoch": 4.050030693677103, + "grad_norm": 0.24307197332382202, + "learning_rate": 6.744462767451568e-05, + "loss": 1.7919, + "step": 13195 + }, + { + "epoch": 4.050337630448127, + "grad_norm": 
0.24427616596221924, + "learning_rate": 6.743996936486719e-05, + "loss": 1.8037, + "step": 13196 + }, + { + "epoch": 4.050644567219153, + "grad_norm": 0.2154439389705658, + "learning_rate": 6.743531088286673e-05, + "loss": 1.7088, + "step": 13197 + }, + { + "epoch": 4.050951503990178, + "grad_norm": 0.22251558303833008, + "learning_rate": 6.743065222856027e-05, + "loss": 1.7512, + "step": 13198 + }, + { + "epoch": 4.051258440761203, + "grad_norm": 0.2373272329568863, + "learning_rate": 6.74259934019939e-05, + "loss": 1.8056, + "step": 13199 + }, + { + "epoch": 4.051565377532229, + "grad_norm": 0.23308727145195007, + "learning_rate": 6.742133440321366e-05, + "loss": 1.731, + "step": 13200 + }, + { + "epoch": 4.051872314303253, + "grad_norm": 0.2438805252313614, + "learning_rate": 6.741667523226557e-05, + "loss": 1.7938, + "step": 13201 + }, + { + "epoch": 4.0521792510742785, + "grad_norm": 0.22354702651500702, + "learning_rate": 6.741201588919569e-05, + "loss": 1.762, + "step": 13202 + }, + { + "epoch": 4.052486187845304, + "grad_norm": 0.2505488097667694, + "learning_rate": 6.740735637405006e-05, + "loss": 1.7627, + "step": 13203 + }, + { + "epoch": 4.052793124616329, + "grad_norm": 0.21378709375858307, + "learning_rate": 6.740269668687474e-05, + "loss": 1.7598, + "step": 13204 + }, + { + "epoch": 4.0531000613873545, + "grad_norm": 0.24863660335540771, + "learning_rate": 6.739803682771577e-05, + "loss": 1.7665, + "step": 13205 + }, + { + "epoch": 4.05340699815838, + "grad_norm": 0.3041808605194092, + "learning_rate": 6.739337679661921e-05, + "loss": 1.7909, + "step": 13206 + }, + { + "epoch": 4.053713934929404, + "grad_norm": 0.2745797634124756, + "learning_rate": 6.738871659363109e-05, + "loss": 1.7547, + "step": 13207 + }, + { + "epoch": 4.05402087170043, + "grad_norm": 0.2610073387622833, + "learning_rate": 6.738405621879748e-05, + "loss": 1.7723, + "step": 13208 + }, + { + "epoch": 4.054327808471455, + "grad_norm": 0.22728075087070465, + "learning_rate": 
6.737939567216446e-05, + "loss": 1.7865, + "step": 13209 + }, + { + "epoch": 4.05463474524248, + "grad_norm": 0.2877669930458069, + "learning_rate": 6.737473495377804e-05, + "loss": 1.8352, + "step": 13210 + }, + { + "epoch": 4.054941682013506, + "grad_norm": 0.35316282510757446, + "learning_rate": 6.737007406368432e-05, + "loss": 1.8202, + "step": 13211 + }, + { + "epoch": 4.05524861878453, + "grad_norm": 0.34625691175460815, + "learning_rate": 6.736541300192936e-05, + "loss": 1.8456, + "step": 13212 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.2432134598493576, + "learning_rate": 6.736075176855917e-05, + "loss": 1.8237, + "step": 13213 + }, + { + "epoch": 4.055862492326581, + "grad_norm": 0.27446529269218445, + "learning_rate": 6.735609036361989e-05, + "loss": 1.71, + "step": 13214 + }, + { + "epoch": 4.056169429097606, + "grad_norm": 0.2870408892631531, + "learning_rate": 6.735142878715754e-05, + "loss": 1.7473, + "step": 13215 + }, + { + "epoch": 4.056476365868631, + "grad_norm": 0.22249078750610352, + "learning_rate": 6.734676703921822e-05, + "loss": 1.7462, + "step": 13216 + }, + { + "epoch": 4.056783302639656, + "grad_norm": 0.25519105792045593, + "learning_rate": 6.734210511984796e-05, + "loss": 1.7022, + "step": 13217 + }, + { + "epoch": 4.057090239410681, + "grad_norm": 0.3366561830043793, + "learning_rate": 6.733744302909285e-05, + "loss": 1.787, + "step": 13218 + }, + { + "epoch": 4.0573971761817065, + "grad_norm": 0.2443208247423172, + "learning_rate": 6.733278076699897e-05, + "loss": 1.8048, + "step": 13219 + }, + { + "epoch": 4.057704112952732, + "grad_norm": 0.2893153131008148, + "learning_rate": 6.73281183336124e-05, + "loss": 1.7805, + "step": 13220 + }, + { + "epoch": 4.058011049723757, + "grad_norm": 0.3178043067455292, + "learning_rate": 6.73234557289792e-05, + "loss": 1.8264, + "step": 13221 + }, + { + "epoch": 4.0583179864947825, + "grad_norm": 0.27355703711509705, + "learning_rate": 6.731879295314546e-05, + "loss": 1.8427, + 
"step": 13222 + }, + { + "epoch": 4.058624923265807, + "grad_norm": 0.32180166244506836, + "learning_rate": 6.731413000615726e-05, + "loss": 1.7332, + "step": 13223 + }, + { + "epoch": 4.058931860036832, + "grad_norm": 0.3736574351787567, + "learning_rate": 6.730946688806067e-05, + "loss": 1.7447, + "step": 13224 + }, + { + "epoch": 4.059238796807858, + "grad_norm": 0.2526068687438965, + "learning_rate": 6.73048035989018e-05, + "loss": 1.8104, + "step": 13225 + }, + { + "epoch": 4.059545733578883, + "grad_norm": 0.29076167941093445, + "learning_rate": 6.73001401387267e-05, + "loss": 1.7977, + "step": 13226 + }, + { + "epoch": 4.059852670349908, + "grad_norm": 0.37963762879371643, + "learning_rate": 6.729547650758148e-05, + "loss": 1.8336, + "step": 13227 + }, + { + "epoch": 4.060159607120933, + "grad_norm": 0.31584078073501587, + "learning_rate": 6.729081270551222e-05, + "loss": 1.7843, + "step": 13228 + }, + { + "epoch": 4.060466543891958, + "grad_norm": 0.22793468832969666, + "learning_rate": 6.728614873256502e-05, + "loss": 1.7444, + "step": 13229 + }, + { + "epoch": 4.060773480662983, + "grad_norm": 0.3114435076713562, + "learning_rate": 6.728148458878596e-05, + "loss": 1.8012, + "step": 13230 + }, + { + "epoch": 4.061080417434009, + "grad_norm": 0.29843854904174805, + "learning_rate": 6.727682027422116e-05, + "loss": 1.8014, + "step": 13231 + }, + { + "epoch": 4.061387354205034, + "grad_norm": 0.22745616734027863, + "learning_rate": 6.727215578891668e-05, + "loss": 1.7303, + "step": 13232 + }, + { + "epoch": 4.0616942909760585, + "grad_norm": 0.2701241970062256, + "learning_rate": 6.726749113291864e-05, + "loss": 1.7665, + "step": 13233 + }, + { + "epoch": 4.062001227747084, + "grad_norm": 0.29304635524749756, + "learning_rate": 6.726282630627313e-05, + "loss": 1.875, + "step": 13234 + }, + { + "epoch": 4.062308164518109, + "grad_norm": 0.21467708051204681, + "learning_rate": 6.725816130902625e-05, + "loss": 1.7442, + "step": 13235 + }, + { + "epoch": 
4.0626151012891345, + "grad_norm": 0.23517470061779022, + "learning_rate": 6.72534961412241e-05, + "loss": 1.7154, + "step": 13236 + }, + { + "epoch": 4.06292203806016, + "grad_norm": 0.21483808755874634, + "learning_rate": 6.724883080291278e-05, + "loss": 1.7162, + "step": 13237 + }, + { + "epoch": 4.063228974831185, + "grad_norm": 0.2274744212627411, + "learning_rate": 6.724416529413843e-05, + "loss": 1.8066, + "step": 13238 + }, + { + "epoch": 4.06353591160221, + "grad_norm": 0.24682378768920898, + "learning_rate": 6.723949961494712e-05, + "loss": 1.7905, + "step": 13239 + }, + { + "epoch": 4.063842848373235, + "grad_norm": 0.2516227066516876, + "learning_rate": 6.723483376538498e-05, + "loss": 1.7693, + "step": 13240 + }, + { + "epoch": 4.06414978514426, + "grad_norm": 0.22076398134231567, + "learning_rate": 6.723016774549808e-05, + "loss": 1.7357, + "step": 13241 + }, + { + "epoch": 4.064456721915286, + "grad_norm": 0.20741026103496552, + "learning_rate": 6.722550155533258e-05, + "loss": 1.8082, + "step": 13242 + }, + { + "epoch": 4.064763658686311, + "grad_norm": 0.2074010819196701, + "learning_rate": 6.722083519493458e-05, + "loss": 1.71, + "step": 13243 + }, + { + "epoch": 4.065070595457335, + "grad_norm": 0.2661527991294861, + "learning_rate": 6.72161686643502e-05, + "loss": 1.7448, + "step": 13244 + }, + { + "epoch": 4.065377532228361, + "grad_norm": 0.2877216935157776, + "learning_rate": 6.721150196362555e-05, + "loss": 1.7574, + "step": 13245 + }, + { + "epoch": 4.065684468999386, + "grad_norm": 0.2520955801010132, + "learning_rate": 6.720683509280675e-05, + "loss": 1.7717, + "step": 13246 + }, + { + "epoch": 4.065991405770411, + "grad_norm": 0.2219560444355011, + "learning_rate": 6.72021680519399e-05, + "loss": 1.7355, + "step": 13247 + }, + { + "epoch": 4.066298342541437, + "grad_norm": 0.24671706557273865, + "learning_rate": 6.719750084107117e-05, + "loss": 1.8204, + "step": 13248 + }, + { + "epoch": 4.066605279312462, + "grad_norm": 
0.24512135982513428, + "learning_rate": 6.719283346024664e-05, + "loss": 1.826, + "step": 13249 + }, + { + "epoch": 4.0669122160834865, + "grad_norm": 0.24370841681957245, + "learning_rate": 6.718816590951247e-05, + "loss": 1.8322, + "step": 13250 + }, + { + "epoch": 4.067219152854512, + "grad_norm": 0.2312363088130951, + "learning_rate": 6.718349818891475e-05, + "loss": 1.7621, + "step": 13251 + }, + { + "epoch": 4.067526089625537, + "grad_norm": 0.2500494420528412, + "learning_rate": 6.717883029849965e-05, + "loss": 1.829, + "step": 13252 + }, + { + "epoch": 4.0678330263965625, + "grad_norm": 0.29882633686065674, + "learning_rate": 6.717416223831324e-05, + "loss": 1.799, + "step": 13253 + }, + { + "epoch": 4.068139963167588, + "grad_norm": 0.21962928771972656, + "learning_rate": 6.716949400840172e-05, + "loss": 1.7714, + "step": 13254 + }, + { + "epoch": 4.068446899938612, + "grad_norm": 0.25544899702072144, + "learning_rate": 6.716482560881121e-05, + "loss": 1.7911, + "step": 13255 + }, + { + "epoch": 4.068753836709638, + "grad_norm": 0.24865686893463135, + "learning_rate": 6.716015703958781e-05, + "loss": 1.7107, + "step": 13256 + }, + { + "epoch": 4.069060773480663, + "grad_norm": 0.22669239342212677, + "learning_rate": 6.715548830077769e-05, + "loss": 1.8503, + "step": 13257 + }, + { + "epoch": 4.069367710251688, + "grad_norm": 0.2973819077014923, + "learning_rate": 6.715081939242698e-05, + "loss": 1.7859, + "step": 13258 + }, + { + "epoch": 4.069674647022714, + "grad_norm": 0.3178746700286865, + "learning_rate": 6.714615031458181e-05, + "loss": 1.7705, + "step": 13259 + }, + { + "epoch": 4.069981583793738, + "grad_norm": 0.20452535152435303, + "learning_rate": 6.714148106728835e-05, + "loss": 1.7386, + "step": 13260 + }, + { + "epoch": 4.070288520564763, + "grad_norm": 0.30288320779800415, + "learning_rate": 6.713681165059271e-05, + "loss": 1.7823, + "step": 13261 + }, + { + "epoch": 4.070595457335789, + "grad_norm": 0.30014416575431824, + "learning_rate": 
6.713214206454107e-05, + "loss": 1.7626, + "step": 13262 + }, + { + "epoch": 4.070902394106814, + "grad_norm": 0.25144243240356445, + "learning_rate": 6.712747230917956e-05, + "loss": 1.8359, + "step": 13263 + }, + { + "epoch": 4.071209330877839, + "grad_norm": 0.308148592710495, + "learning_rate": 6.712280238455432e-05, + "loss": 1.7226, + "step": 13264 + }, + { + "epoch": 4.071516267648865, + "grad_norm": 0.2704198658466339, + "learning_rate": 6.711813229071151e-05, + "loss": 1.7982, + "step": 13265 + }, + { + "epoch": 4.071823204419889, + "grad_norm": 0.3928656280040741, + "learning_rate": 6.711346202769729e-05, + "loss": 1.7987, + "step": 13266 + }, + { + "epoch": 4.0721301411909145, + "grad_norm": 0.3603350520133972, + "learning_rate": 6.71087915955578e-05, + "loss": 1.7963, + "step": 13267 + }, + { + "epoch": 4.07243707796194, + "grad_norm": 0.2673214077949524, + "learning_rate": 6.710412099433921e-05, + "loss": 1.8011, + "step": 13268 + }, + { + "epoch": 4.072744014732965, + "grad_norm": 0.2523653209209442, + "learning_rate": 6.709945022408768e-05, + "loss": 1.755, + "step": 13269 + }, + { + "epoch": 4.0730509515039905, + "grad_norm": 0.3818903863430023, + "learning_rate": 6.709477928484934e-05, + "loss": 1.7968, + "step": 13270 + }, + { + "epoch": 4.073357888275015, + "grad_norm": 0.31509929895401, + "learning_rate": 6.709010817667039e-05, + "loss": 1.744, + "step": 13271 + }, + { + "epoch": 4.07366482504604, + "grad_norm": 0.21875518560409546, + "learning_rate": 6.708543689959697e-05, + "loss": 1.7511, + "step": 13272 + }, + { + "epoch": 4.073971761817066, + "grad_norm": 0.25381338596343994, + "learning_rate": 6.708076545367523e-05, + "loss": 1.7523, + "step": 13273 + }, + { + "epoch": 4.074278698588091, + "grad_norm": 0.24193842709064484, + "learning_rate": 6.707609383895137e-05, + "loss": 1.7713, + "step": 13274 + }, + { + "epoch": 4.074585635359116, + "grad_norm": 0.21972359716892242, + "learning_rate": 6.707142205547154e-05, + "loss": 1.7329, + "step": 
13275 + }, + { + "epoch": 4.074892572130141, + "grad_norm": 0.22188499569892883, + "learning_rate": 6.706675010328192e-05, + "loss": 1.7507, + "step": 13276 + }, + { + "epoch": 4.075199508901166, + "grad_norm": 0.23344436287879944, + "learning_rate": 6.706207798242865e-05, + "loss": 1.771, + "step": 13277 + }, + { + "epoch": 4.0755064456721914, + "grad_norm": 0.3008805513381958, + "learning_rate": 6.705740569295795e-05, + "loss": 1.775, + "step": 13278 + }, + { + "epoch": 4.075813382443217, + "grad_norm": 0.31407982110977173, + "learning_rate": 6.705273323491595e-05, + "loss": 1.7625, + "step": 13279 + }, + { + "epoch": 4.076120319214242, + "grad_norm": 0.2430381178855896, + "learning_rate": 6.704806060834886e-05, + "loss": 1.7706, + "step": 13280 + }, + { + "epoch": 4.0764272559852675, + "grad_norm": 0.23250171542167664, + "learning_rate": 6.704338781330284e-05, + "loss": 1.7977, + "step": 13281 + }, + { + "epoch": 4.076734192756292, + "grad_norm": 0.22073723375797272, + "learning_rate": 6.703871484982407e-05, + "loss": 1.7686, + "step": 13282 + }, + { + "epoch": 4.077041129527317, + "grad_norm": 0.24987035989761353, + "learning_rate": 6.703404171795874e-05, + "loss": 1.736, + "step": 13283 + }, + { + "epoch": 4.077348066298343, + "grad_norm": 0.2697623670101166, + "learning_rate": 6.702936841775301e-05, + "loss": 1.8367, + "step": 13284 + }, + { + "epoch": 4.077655003069368, + "grad_norm": 0.21592749655246735, + "learning_rate": 6.702469494925309e-05, + "loss": 1.7467, + "step": 13285 + }, + { + "epoch": 4.077961939840393, + "grad_norm": 0.2612052261829376, + "learning_rate": 6.702002131250515e-05, + "loss": 1.7689, + "step": 13286 + }, + { + "epoch": 4.078268876611418, + "grad_norm": 0.3004797697067261, + "learning_rate": 6.701534750755539e-05, + "loss": 1.7586, + "step": 13287 + }, + { + "epoch": 4.078575813382443, + "grad_norm": 0.24615366756916046, + "learning_rate": 6.701067353444998e-05, + "loss": 1.7636, + "step": 13288 + }, + { + "epoch": 
4.078882750153468, + "grad_norm": 0.23401159048080444, + "learning_rate": 6.700599939323515e-05, + "loss": 1.8015, + "step": 13289 + }, + { + "epoch": 4.079189686924494, + "grad_norm": 0.24546295404434204, + "learning_rate": 6.700132508395705e-05, + "loss": 1.7606, + "step": 13290 + }, + { + "epoch": 4.079496623695519, + "grad_norm": 0.24664412438869476, + "learning_rate": 6.69966506066619e-05, + "loss": 1.7994, + "step": 13291 + }, + { + "epoch": 4.0798035604665435, + "grad_norm": 0.2780163288116455, + "learning_rate": 6.699197596139587e-05, + "loss": 1.7972, + "step": 13292 + }, + { + "epoch": 4.080110497237569, + "grad_norm": 0.2554188668727875, + "learning_rate": 6.698730114820517e-05, + "loss": 1.7928, + "step": 13293 + }, + { + "epoch": 4.080417434008594, + "grad_norm": 0.2471141666173935, + "learning_rate": 6.698262616713602e-05, + "loss": 1.7948, + "step": 13294 + }, + { + "epoch": 4.0807243707796195, + "grad_norm": 0.2556581199169159, + "learning_rate": 6.697795101823461e-05, + "loss": 1.7942, + "step": 13295 + }, + { + "epoch": 4.081031307550645, + "grad_norm": 0.24462421238422394, + "learning_rate": 6.697327570154712e-05, + "loss": 1.7336, + "step": 13296 + }, + { + "epoch": 4.08133824432167, + "grad_norm": 0.22378689050674438, + "learning_rate": 6.696860021711978e-05, + "loss": 1.7703, + "step": 13297 + }, + { + "epoch": 4.081645181092695, + "grad_norm": 0.23949933052062988, + "learning_rate": 6.69639245649988e-05, + "loss": 1.7651, + "step": 13298 + }, + { + "epoch": 4.08195211786372, + "grad_norm": 0.27751216292381287, + "learning_rate": 6.695924874523035e-05, + "loss": 1.7866, + "step": 13299 + }, + { + "epoch": 4.082259054634745, + "grad_norm": 0.22700226306915283, + "learning_rate": 6.695457275786068e-05, + "loss": 1.79, + "step": 13300 + }, + { + "epoch": 4.082565991405771, + "grad_norm": 0.2138090431690216, + "learning_rate": 6.694989660293598e-05, + "loss": 1.7882, + "step": 13301 + }, + { + "epoch": 4.082872928176796, + "grad_norm": 
0.2963469326496124, + "learning_rate": 6.694522028050246e-05, + "loss": 1.8779, + "step": 13302 + }, + { + "epoch": 4.08317986494782, + "grad_norm": 0.31833669543266296, + "learning_rate": 6.694054379060634e-05, + "loss": 1.7923, + "step": 13303 + }, + { + "epoch": 4.083486801718846, + "grad_norm": 0.27751585841178894, + "learning_rate": 6.693586713329385e-05, + "loss": 1.7557, + "step": 13304 + }, + { + "epoch": 4.083793738489871, + "grad_norm": 0.23790816962718964, + "learning_rate": 6.69311903086112e-05, + "loss": 1.7587, + "step": 13305 + }, + { + "epoch": 4.084100675260896, + "grad_norm": 0.24153777956962585, + "learning_rate": 6.692651331660458e-05, + "loss": 1.7573, + "step": 13306 + }, + { + "epoch": 4.084407612031922, + "grad_norm": 0.26607179641723633, + "learning_rate": 6.692183615732025e-05, + "loss": 1.7823, + "step": 13307 + }, + { + "epoch": 4.084714548802946, + "grad_norm": 0.26670268177986145, + "learning_rate": 6.691715883080442e-05, + "loss": 1.784, + "step": 13308 + }, + { + "epoch": 4.0850214855739715, + "grad_norm": 0.25980666279792786, + "learning_rate": 6.69124813371033e-05, + "loss": 1.797, + "step": 13309 + }, + { + "epoch": 4.085328422344997, + "grad_norm": 0.2805597484111786, + "learning_rate": 6.690780367626314e-05, + "loss": 1.8298, + "step": 13310 + }, + { + "epoch": 4.085635359116022, + "grad_norm": 0.27198413014411926, + "learning_rate": 6.690312584833012e-05, + "loss": 1.8104, + "step": 13311 + }, + { + "epoch": 4.0859422958870475, + "grad_norm": 0.2619116008281708, + "learning_rate": 6.689844785335054e-05, + "loss": 1.771, + "step": 13312 + }, + { + "epoch": 4.086249232658073, + "grad_norm": 0.22647863626480103, + "learning_rate": 6.689376969137057e-05, + "loss": 1.8114, + "step": 13313 + }, + { + "epoch": 4.086556169429097, + "grad_norm": 1.469475507736206, + "learning_rate": 6.68890913624365e-05, + "loss": 1.8796, + "step": 13314 + }, + { + "epoch": 4.086863106200123, + "grad_norm": 0.4577515423297882, + "learning_rate": 
6.68844128665945e-05, + "loss": 1.716, + "step": 13315 + }, + { + "epoch": 4.087170042971148, + "grad_norm": 0.5830543637275696, + "learning_rate": 6.687973420389085e-05, + "loss": 1.7692, + "step": 13316 + }, + { + "epoch": 4.087476979742173, + "grad_norm": 0.4404197037220001, + "learning_rate": 6.687505537437178e-05, + "loss": 1.7909, + "step": 13317 + }, + { + "epoch": 4.087783916513199, + "grad_norm": 0.31379908323287964, + "learning_rate": 6.68703763780835e-05, + "loss": 1.7957, + "step": 13318 + }, + { + "epoch": 4.088090853284223, + "grad_norm": 0.49588730931282043, + "learning_rate": 6.686569721507229e-05, + "loss": 1.7126, + "step": 13319 + }, + { + "epoch": 4.088397790055248, + "grad_norm": 0.3690234124660492, + "learning_rate": 6.686101788538437e-05, + "loss": 1.8233, + "step": 13320 + }, + { + "epoch": 4.088704726826274, + "grad_norm": 0.337310254573822, + "learning_rate": 6.685633838906598e-05, + "loss": 1.6886, + "step": 13321 + }, + { + "epoch": 4.089011663597299, + "grad_norm": 0.5164821147918701, + "learning_rate": 6.685165872616337e-05, + "loss": 1.7967, + "step": 13322 + }, + { + "epoch": 4.089318600368324, + "grad_norm": 0.36501309275627136, + "learning_rate": 6.68469788967228e-05, + "loss": 1.755, + "step": 13323 + }, + { + "epoch": 4.08962553713935, + "grad_norm": 0.35017216205596924, + "learning_rate": 6.684229890079052e-05, + "loss": 1.7595, + "step": 13324 + }, + { + "epoch": 4.089932473910374, + "grad_norm": 0.5622650980949402, + "learning_rate": 6.683761873841277e-05, + "loss": 1.7841, + "step": 13325 + }, + { + "epoch": 4.0902394106813995, + "grad_norm": 0.47010260820388794, + "learning_rate": 6.683293840963578e-05, + "loss": 1.7537, + "step": 13326 + }, + { + "epoch": 4.090546347452425, + "grad_norm": 0.25515374541282654, + "learning_rate": 6.682825791450584e-05, + "loss": 1.7692, + "step": 13327 + }, + { + "epoch": 4.09085328422345, + "grad_norm": 0.5063003897666931, + "learning_rate": 6.682357725306919e-05, + "loss": 1.7454, + "step": 
13328 + }, + { + "epoch": 4.0911602209944755, + "grad_norm": 0.4197622835636139, + "learning_rate": 6.681889642537209e-05, + "loss": 1.7792, + "step": 13329 + }, + { + "epoch": 4.0914671577655, + "grad_norm": 0.24038295447826385, + "learning_rate": 6.68142154314608e-05, + "loss": 1.7631, + "step": 13330 + }, + { + "epoch": 4.091774094536525, + "grad_norm": 0.42108532786369324, + "learning_rate": 6.680953427138159e-05, + "loss": 1.7784, + "step": 13331 + }, + { + "epoch": 4.092081031307551, + "grad_norm": 0.33729633688926697, + "learning_rate": 6.68048529451807e-05, + "loss": 1.8057, + "step": 13332 + }, + { + "epoch": 4.092387968078576, + "grad_norm": 0.31847241520881653, + "learning_rate": 6.68001714529044e-05, + "loss": 1.7375, + "step": 13333 + }, + { + "epoch": 4.092694904849601, + "grad_norm": 0.45276644825935364, + "learning_rate": 6.679548979459896e-05, + "loss": 1.7507, + "step": 13334 + }, + { + "epoch": 4.093001841620626, + "grad_norm": 0.3781665861606598, + "learning_rate": 6.679080797031065e-05, + "loss": 1.7718, + "step": 13335 + }, + { + "epoch": 4.093308778391651, + "grad_norm": 0.25868359208106995, + "learning_rate": 6.678612598008573e-05, + "loss": 1.8105, + "step": 13336 + }, + { + "epoch": 4.093615715162676, + "grad_norm": 0.32834702730178833, + "learning_rate": 6.678144382397048e-05, + "loss": 1.7883, + "step": 13337 + }, + { + "epoch": 4.093922651933702, + "grad_norm": 0.2830568253993988, + "learning_rate": 6.677676150201116e-05, + "loss": 1.7994, + "step": 13338 + }, + { + "epoch": 4.094229588704727, + "grad_norm": 0.219541534781456, + "learning_rate": 6.677207901425405e-05, + "loss": 1.7344, + "step": 13339 + }, + { + "epoch": 4.094536525475752, + "grad_norm": 0.2557326555252075, + "learning_rate": 6.676739636074542e-05, + "loss": 1.7734, + "step": 13340 + }, + { + "epoch": 4.094843462246777, + "grad_norm": 0.2741365432739258, + "learning_rate": 6.676271354153156e-05, + "loss": 1.7912, + "step": 13341 + }, + { + "epoch": 4.095150399017802, + 
"grad_norm": 0.31258970499038696, + "learning_rate": 6.675803055665874e-05, + "loss": 1.7798, + "step": 13342 + }, + { + "epoch": 4.0954573357888275, + "grad_norm": 0.30181947350502014, + "learning_rate": 6.675334740617322e-05, + "loss": 1.7746, + "step": 13343 + }, + { + "epoch": 4.095764272559853, + "grad_norm": 0.3000102937221527, + "learning_rate": 6.674866409012133e-05, + "loss": 1.7842, + "step": 13344 + }, + { + "epoch": 4.096071209330878, + "grad_norm": 0.22871005535125732, + "learning_rate": 6.674398060854931e-05, + "loss": 1.7473, + "step": 13345 + }, + { + "epoch": 4.096378146101903, + "grad_norm": 0.2700810432434082, + "learning_rate": 6.673929696150346e-05, + "loss": 1.7862, + "step": 13346 + }, + { + "epoch": 4.096685082872928, + "grad_norm": 0.27537551522254944, + "learning_rate": 6.673461314903007e-05, + "loss": 1.7843, + "step": 13347 + }, + { + "epoch": 4.096992019643953, + "grad_norm": 0.23700574040412903, + "learning_rate": 6.672992917117542e-05, + "loss": 1.765, + "step": 13348 + }, + { + "epoch": 4.097298956414979, + "grad_norm": 0.23331589996814728, + "learning_rate": 6.672524502798583e-05, + "loss": 1.7894, + "step": 13349 + }, + { + "epoch": 4.097605893186004, + "grad_norm": 0.28591978549957275, + "learning_rate": 6.672056071950753e-05, + "loss": 1.7736, + "step": 13350 + }, + { + "epoch": 4.097912829957028, + "grad_norm": 0.3000452518463135, + "learning_rate": 6.671587624578685e-05, + "loss": 1.7635, + "step": 13351 + }, + { + "epoch": 4.098219766728054, + "grad_norm": 0.21877998113632202, + "learning_rate": 6.67111916068701e-05, + "loss": 1.7225, + "step": 13352 + }, + { + "epoch": 4.098526703499079, + "grad_norm": 0.2598817050457001, + "learning_rate": 6.670650680280358e-05, + "loss": 1.6874, + "step": 13353 + }, + { + "epoch": 4.098833640270104, + "grad_norm": 0.3063203692436218, + "learning_rate": 6.670182183363353e-05, + "loss": 1.7821, + "step": 13354 + }, + { + "epoch": 4.09914057704113, + "grad_norm": 0.2328508347272873, + 
"learning_rate": 6.66971366994063e-05, + "loss": 1.788, + "step": 13355 + }, + { + "epoch": 4.099447513812155, + "grad_norm": 0.33936765789985657, + "learning_rate": 6.669245140016817e-05, + "loss": 1.8159, + "step": 13356 + }, + { + "epoch": 4.0997544505831796, + "grad_norm": 0.27464553713798523, + "learning_rate": 6.668776593596546e-05, + "loss": 1.7371, + "step": 13357 + }, + { + "epoch": 4.100061387354205, + "grad_norm": 0.24255812168121338, + "learning_rate": 6.668308030684447e-05, + "loss": 1.7993, + "step": 13358 + }, + { + "epoch": 4.10036832412523, + "grad_norm": 0.27203628420829773, + "learning_rate": 6.667839451285149e-05, + "loss": 1.8253, + "step": 13359 + }, + { + "epoch": 4.100675260896256, + "grad_norm": 0.2503862679004669, + "learning_rate": 6.667370855403286e-05, + "loss": 1.7927, + "step": 13360 + }, + { + "epoch": 4.100982197667281, + "grad_norm": 0.2616904377937317, + "learning_rate": 6.666902243043486e-05, + "loss": 1.8226, + "step": 13361 + }, + { + "epoch": 4.101289134438305, + "grad_norm": 0.26707521080970764, + "learning_rate": 6.666433614210379e-05, + "loss": 1.8485, + "step": 13362 + }, + { + "epoch": 4.101596071209331, + "grad_norm": 0.2427528202533722, + "learning_rate": 6.6659649689086e-05, + "loss": 1.7387, + "step": 13363 + }, + { + "epoch": 4.101903007980356, + "grad_norm": 0.2319549173116684, + "learning_rate": 6.66549630714278e-05, + "loss": 1.7396, + "step": 13364 + }, + { + "epoch": 4.102209944751381, + "grad_norm": 0.2248002141714096, + "learning_rate": 6.665027628917548e-05, + "loss": 1.7817, + "step": 13365 + }, + { + "epoch": 4.102516881522407, + "grad_norm": 0.21929535269737244, + "learning_rate": 6.664558934237538e-05, + "loss": 1.7478, + "step": 13366 + }, + { + "epoch": 4.102823818293431, + "grad_norm": 0.21144583821296692, + "learning_rate": 6.66409022310738e-05, + "loss": 1.7602, + "step": 13367 + }, + { + "epoch": 4.1031307550644565, + "grad_norm": 0.21984660625457764, + "learning_rate": 6.663621495531707e-05, + 
"loss": 1.7541, + "step": 13368 + }, + { + "epoch": 4.103437691835482, + "grad_norm": 0.2075357735157013, + "learning_rate": 6.663152751515152e-05, + "loss": 1.7362, + "step": 13369 + }, + { + "epoch": 4.103744628606507, + "grad_norm": 0.23316961526870728, + "learning_rate": 6.662683991062347e-05, + "loss": 1.8273, + "step": 13370 + }, + { + "epoch": 4.1040515653775325, + "grad_norm": 0.23142337799072266, + "learning_rate": 6.662215214177922e-05, + "loss": 1.7543, + "step": 13371 + }, + { + "epoch": 4.104358502148558, + "grad_norm": 0.24335260689258575, + "learning_rate": 6.661746420866515e-05, + "loss": 1.8328, + "step": 13372 + }, + { + "epoch": 4.104665438919582, + "grad_norm": 0.2440192997455597, + "learning_rate": 6.661277611132753e-05, + "loss": 1.8114, + "step": 13373 + }, + { + "epoch": 4.104972375690608, + "grad_norm": 0.252808541059494, + "learning_rate": 6.660808784981273e-05, + "loss": 1.8556, + "step": 13374 + }, + { + "epoch": 4.105279312461633, + "grad_norm": 0.24564477801322937, + "learning_rate": 6.660339942416708e-05, + "loss": 1.8231, + "step": 13375 + }, + { + "epoch": 4.105586249232658, + "grad_norm": 0.2371874898672104, + "learning_rate": 6.65987108344369e-05, + "loss": 1.7763, + "step": 13376 + }, + { + "epoch": 4.105893186003684, + "grad_norm": 0.22882802784442902, + "learning_rate": 6.659402208066854e-05, + "loss": 1.7388, + "step": 13377 + }, + { + "epoch": 4.106200122774708, + "grad_norm": 0.24857540428638458, + "learning_rate": 6.658933316290832e-05, + "loss": 1.7735, + "step": 13378 + }, + { + "epoch": 4.106507059545733, + "grad_norm": 0.22574029862880707, + "learning_rate": 6.658464408120257e-05, + "loss": 1.7403, + "step": 13379 + }, + { + "epoch": 4.106813996316759, + "grad_norm": 0.24944272637367249, + "learning_rate": 6.657995483559767e-05, + "loss": 1.7827, + "step": 13380 + }, + { + "epoch": 4.107120933087784, + "grad_norm": 0.27386224269866943, + "learning_rate": 6.657526542613992e-05, + "loss": 1.7673, + "step": 13381 + }, + { 
+ "epoch": 4.107427869858809, + "grad_norm": 0.29222097992897034, + "learning_rate": 6.65705758528757e-05, + "loss": 1.7958, + "step": 13382 + }, + { + "epoch": 4.107734806629834, + "grad_norm": 0.2471150904893875, + "learning_rate": 6.656588611585133e-05, + "loss": 1.7706, + "step": 13383 + }, + { + "epoch": 4.108041743400859, + "grad_norm": 0.289316862821579, + "learning_rate": 6.656119621511317e-05, + "loss": 1.7828, + "step": 13384 + }, + { + "epoch": 4.1083486801718845, + "grad_norm": 0.36710497736930847, + "learning_rate": 6.655650615070756e-05, + "loss": 1.712, + "step": 13385 + }, + { + "epoch": 4.10865561694291, + "grad_norm": 0.2999880611896515, + "learning_rate": 6.655181592268084e-05, + "loss": 1.7711, + "step": 13386 + }, + { + "epoch": 4.108962553713935, + "grad_norm": 0.332011342048645, + "learning_rate": 6.654712553107939e-05, + "loss": 1.907, + "step": 13387 + }, + { + "epoch": 4.1092694904849605, + "grad_norm": 0.43125995993614197, + "learning_rate": 6.654243497594953e-05, + "loss": 1.7819, + "step": 13388 + }, + { + "epoch": 4.109576427255985, + "grad_norm": 0.33719149231910706, + "learning_rate": 6.653774425733765e-05, + "loss": 1.797, + "step": 13389 + }, + { + "epoch": 4.10988336402701, + "grad_norm": 0.23091599345207214, + "learning_rate": 6.653305337529006e-05, + "loss": 1.7384, + "step": 13390 + }, + { + "epoch": 4.110190300798036, + "grad_norm": 0.4283982515335083, + "learning_rate": 6.652836232985317e-05, + "loss": 1.8284, + "step": 13391 + }, + { + "epoch": 4.110497237569061, + "grad_norm": 0.43575870990753174, + "learning_rate": 6.652367112107332e-05, + "loss": 1.7235, + "step": 13392 + }, + { + "epoch": 4.110804174340086, + "grad_norm": 0.246877059340477, + "learning_rate": 6.651897974899685e-05, + "loss": 1.7174, + "step": 13393 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.36063629388809204, + "learning_rate": 6.651428821367015e-05, + "loss": 1.8064, + "step": 13394 + }, + { + "epoch": 4.111418047882136, + "grad_norm": 
0.4454420804977417, + "learning_rate": 6.650959651513957e-05, + "loss": 1.7575, + "step": 13395 + }, + { + "epoch": 4.111724984653161, + "grad_norm": 0.2788856327533722, + "learning_rate": 6.650490465345149e-05, + "loss": 1.7696, + "step": 13396 + }, + { + "epoch": 4.112031921424187, + "grad_norm": 0.40281879901885986, + "learning_rate": 6.650021262865225e-05, + "loss": 1.8368, + "step": 13397 + }, + { + "epoch": 4.112338858195212, + "grad_norm": 0.5151103138923645, + "learning_rate": 6.649552044078825e-05, + "loss": 1.8224, + "step": 13398 + }, + { + "epoch": 4.112645794966237, + "grad_norm": 0.29390639066696167, + "learning_rate": 6.649082808990586e-05, + "loss": 1.7846, + "step": 13399 + }, + { + "epoch": 4.112952731737262, + "grad_norm": 0.3061942458152771, + "learning_rate": 6.648613557605142e-05, + "loss": 1.7954, + "step": 13400 + }, + { + "epoch": 4.113259668508287, + "grad_norm": 0.47628748416900635, + "learning_rate": 6.648144289927132e-05, + "loss": 1.7782, + "step": 13401 + }, + { + "epoch": 4.1135666052793125, + "grad_norm": 0.4299588203430176, + "learning_rate": 6.647675005961197e-05, + "loss": 1.7459, + "step": 13402 + }, + { + "epoch": 4.113873542050338, + "grad_norm": 0.24556589126586914, + "learning_rate": 6.64720570571197e-05, + "loss": 1.753, + "step": 13403 + }, + { + "epoch": 4.114180478821363, + "grad_norm": 0.29620522260665894, + "learning_rate": 6.646736389184092e-05, + "loss": 1.773, + "step": 13404 + }, + { + "epoch": 4.114487415592388, + "grad_norm": 0.37710070610046387, + "learning_rate": 6.646267056382199e-05, + "loss": 1.8389, + "step": 13405 + }, + { + "epoch": 4.114794352363413, + "grad_norm": 0.2562984824180603, + "learning_rate": 6.64579770731093e-05, + "loss": 1.7905, + "step": 13406 + }, + { + "epoch": 4.115101289134438, + "grad_norm": 0.3999946713447571, + "learning_rate": 6.645328341974924e-05, + "loss": 1.7734, + "step": 13407 + }, + { + "epoch": 4.115408225905464, + "grad_norm": 0.36087217926979065, + "learning_rate": 
6.644858960378817e-05, + "loss": 1.801, + "step": 13408 + }, + { + "epoch": 4.115715162676489, + "grad_norm": 0.2520254850387573, + "learning_rate": 6.644389562527251e-05, + "loss": 1.7394, + "step": 13409 + }, + { + "epoch": 4.116022099447513, + "grad_norm": 0.4321835935115814, + "learning_rate": 6.643920148424864e-05, + "loss": 1.8091, + "step": 13410 + }, + { + "epoch": 4.116329036218539, + "grad_norm": 0.40900173783302307, + "learning_rate": 6.643450718076294e-05, + "loss": 1.8198, + "step": 13411 + }, + { + "epoch": 4.116635972989564, + "grad_norm": 0.23693956434726715, + "learning_rate": 6.642981271486182e-05, + "loss": 1.6807, + "step": 13412 + }, + { + "epoch": 4.116942909760589, + "grad_norm": 0.33526891469955444, + "learning_rate": 6.642511808659164e-05, + "loss": 1.8673, + "step": 13413 + }, + { + "epoch": 4.117249846531615, + "grad_norm": 0.4037325382232666, + "learning_rate": 6.642042329599883e-05, + "loss": 1.743, + "step": 13414 + }, + { + "epoch": 4.11755678330264, + "grad_norm": 0.25629740953445435, + "learning_rate": 6.641572834312975e-05, + "loss": 1.6904, + "step": 13415 + }, + { + "epoch": 4.1178637200736645, + "grad_norm": 0.29203253984451294, + "learning_rate": 6.641103322803087e-05, + "loss": 1.7811, + "step": 13416 + }, + { + "epoch": 4.11817065684469, + "grad_norm": 0.423926442861557, + "learning_rate": 6.64063379507485e-05, + "loss": 1.7341, + "step": 13417 + }, + { + "epoch": 4.118477593615715, + "grad_norm": 0.29561251401901245, + "learning_rate": 6.64016425113291e-05, + "loss": 1.7915, + "step": 13418 + }, + { + "epoch": 4.1187845303867405, + "grad_norm": 0.2536832094192505, + "learning_rate": 6.639694690981903e-05, + "loss": 1.7628, + "step": 13419 + }, + { + "epoch": 4.119091467157766, + "grad_norm": 0.2931392192840576, + "learning_rate": 6.639225114626475e-05, + "loss": 1.7877, + "step": 13420 + }, + { + "epoch": 4.11939840392879, + "grad_norm": 0.2219499796628952, + "learning_rate": 6.638755522071263e-05, + "loss": 1.7183, + 
"step": 13421 + }, + { + "epoch": 4.119705340699816, + "grad_norm": 0.2951931953430176, + "learning_rate": 6.638285913320908e-05, + "loss": 1.7983, + "step": 13422 + }, + { + "epoch": 4.120012277470841, + "grad_norm": 0.3495960533618927, + "learning_rate": 6.63781628838005e-05, + "loss": 1.7531, + "step": 13423 + }, + { + "epoch": 4.120319214241866, + "grad_norm": 0.2389262616634369, + "learning_rate": 6.637346647253333e-05, + "loss": 1.7454, + "step": 13424 + }, + { + "epoch": 4.120626151012892, + "grad_norm": 0.28729167580604553, + "learning_rate": 6.636876989945395e-05, + "loss": 1.8105, + "step": 13425 + }, + { + "epoch": 4.120933087783916, + "grad_norm": 0.2620082199573517, + "learning_rate": 6.636407316460882e-05, + "loss": 1.7948, + "step": 13426 + }, + { + "epoch": 4.121240024554941, + "grad_norm": 0.2694189250469208, + "learning_rate": 6.635937626804432e-05, + "loss": 1.809, + "step": 13427 + }, + { + "epoch": 4.121546961325967, + "grad_norm": 0.2660866379737854, + "learning_rate": 6.635467920980687e-05, + "loss": 1.7431, + "step": 13428 + }, + { + "epoch": 4.121853898096992, + "grad_norm": 0.2579907774925232, + "learning_rate": 6.634998198994289e-05, + "loss": 1.7941, + "step": 13429 + }, + { + "epoch": 4.122160834868017, + "grad_norm": 0.28349989652633667, + "learning_rate": 6.634528460849881e-05, + "loss": 1.8142, + "step": 13430 + }, + { + "epoch": 4.122467771639043, + "grad_norm": 0.28716522455215454, + "learning_rate": 6.634058706552104e-05, + "loss": 1.7496, + "step": 13431 + }, + { + "epoch": 4.122774708410067, + "grad_norm": 0.23228077590465546, + "learning_rate": 6.633588936105601e-05, + "loss": 1.7399, + "step": 13432 + }, + { + "epoch": 4.1230816451810925, + "grad_norm": 0.3649841248989105, + "learning_rate": 6.633119149515017e-05, + "loss": 1.7696, + "step": 13433 + }, + { + "epoch": 4.123388581952118, + "grad_norm": 0.2757830321788788, + "learning_rate": 6.632649346784992e-05, + "loss": 1.8329, + "step": 13434 + }, + { + "epoch": 
4.123695518723143, + "grad_norm": 0.28163692355155945, + "learning_rate": 6.632179527920167e-05, + "loss": 1.7761, + "step": 13435 + }, + { + "epoch": 4.1240024554941686, + "grad_norm": 0.3453187048435211, + "learning_rate": 6.631709692925188e-05, + "loss": 1.7843, + "step": 13436 + }, + { + "epoch": 4.124309392265193, + "grad_norm": 0.2792697250843048, + "learning_rate": 6.631239841804698e-05, + "loss": 1.7889, + "step": 13437 + }, + { + "epoch": 4.124616329036218, + "grad_norm": 0.21881693601608276, + "learning_rate": 6.630769974563339e-05, + "loss": 1.8015, + "step": 13438 + }, + { + "epoch": 4.124923265807244, + "grad_norm": 0.4464910328388214, + "learning_rate": 6.630300091205756e-05, + "loss": 1.7851, + "step": 13439 + }, + { + "epoch": 4.125230202578269, + "grad_norm": 0.40191107988357544, + "learning_rate": 6.629830191736591e-05, + "loss": 1.8608, + "step": 13440 + }, + { + "epoch": 4.125537139349294, + "grad_norm": 0.2809060513973236, + "learning_rate": 6.62936027616049e-05, + "loss": 1.7374, + "step": 13441 + }, + { + "epoch": 4.12584407612032, + "grad_norm": 0.24980643391609192, + "learning_rate": 6.628890344482095e-05, + "loss": 1.8152, + "step": 13442 + }, + { + "epoch": 4.126151012891344, + "grad_norm": 0.24538342654705048, + "learning_rate": 6.62842039670605e-05, + "loss": 1.7687, + "step": 13443 + }, + { + "epoch": 4.1264579496623695, + "grad_norm": 0.24684634804725647, + "learning_rate": 6.627950432837002e-05, + "loss": 1.787, + "step": 13444 + }, + { + "epoch": 4.126764886433395, + "grad_norm": 0.22724607586860657, + "learning_rate": 6.627480452879593e-05, + "loss": 1.7871, + "step": 13445 + }, + { + "epoch": 4.12707182320442, + "grad_norm": 0.24724406003952026, + "learning_rate": 6.627010456838469e-05, + "loss": 1.7524, + "step": 13446 + }, + { + "epoch": 4.1273787599754455, + "grad_norm": 0.24219536781311035, + "learning_rate": 6.626540444718274e-05, + "loss": 1.7754, + "step": 13447 + }, + { + "epoch": 4.12768569674647, + "grad_norm": 
0.24857915937900543, + "learning_rate": 6.626070416523652e-05, + "loss": 1.7839, + "step": 13448 + }, + { + "epoch": 4.127992633517495, + "grad_norm": 0.2639105021953583, + "learning_rate": 6.625600372259248e-05, + "loss": 1.7546, + "step": 13449 + }, + { + "epoch": 4.128299570288521, + "grad_norm": 0.23598137497901917, + "learning_rate": 6.62513031192971e-05, + "loss": 1.7957, + "step": 13450 + }, + { + "epoch": 4.128606507059546, + "grad_norm": 0.3038909137248993, + "learning_rate": 6.624660235539682e-05, + "loss": 1.8117, + "step": 13451 + }, + { + "epoch": 4.128913443830571, + "grad_norm": 0.27671241760253906, + "learning_rate": 6.624190143093809e-05, + "loss": 1.729, + "step": 13452 + }, + { + "epoch": 4.129220380601596, + "grad_norm": 0.24638360738754272, + "learning_rate": 6.623720034596735e-05, + "loss": 1.7414, + "step": 13453 + }, + { + "epoch": 4.129527317372621, + "grad_norm": 0.24073924124240875, + "learning_rate": 6.623249910053111e-05, + "loss": 1.8046, + "step": 13454 + }, + { + "epoch": 4.129834254143646, + "grad_norm": 0.29734376072883606, + "learning_rate": 6.622779769467578e-05, + "loss": 1.8336, + "step": 13455 + }, + { + "epoch": 4.130141190914672, + "grad_norm": 0.23182810842990875, + "learning_rate": 6.622309612844785e-05, + "loss": 1.7742, + "step": 13456 + }, + { + "epoch": 4.130448127685697, + "grad_norm": 0.2179390788078308, + "learning_rate": 6.621839440189378e-05, + "loss": 1.7656, + "step": 13457 + }, + { + "epoch": 4.1307550644567215, + "grad_norm": 0.21389013528823853, + "learning_rate": 6.621369251506002e-05, + "loss": 1.7504, + "step": 13458 + }, + { + "epoch": 4.131062001227747, + "grad_norm": 0.22306203842163086, + "learning_rate": 6.620899046799305e-05, + "loss": 1.7573, + "step": 13459 + }, + { + "epoch": 4.131368937998772, + "grad_norm": 0.2699708938598633, + "learning_rate": 6.620428826073934e-05, + "loss": 1.7419, + "step": 13460 + }, + { + "epoch": 4.1316758747697975, + "grad_norm": 0.34087565541267395, + "learning_rate": 
6.619958589334534e-05, + "loss": 1.7545, + "step": 13461 + }, + { + "epoch": 4.131982811540823, + "grad_norm": 0.2934977412223816, + "learning_rate": 6.619488336585755e-05, + "loss": 1.7611, + "step": 13462 + }, + { + "epoch": 4.132289748311848, + "grad_norm": 0.22545567154884338, + "learning_rate": 6.619018067832243e-05, + "loss": 1.7562, + "step": 13463 + }, + { + "epoch": 4.132596685082873, + "grad_norm": 0.23334743082523346, + "learning_rate": 6.618547783078647e-05, + "loss": 1.7784, + "step": 13464 + }, + { + "epoch": 4.132903621853898, + "grad_norm": 0.22466403245925903, + "learning_rate": 6.618077482329612e-05, + "loss": 1.7277, + "step": 13465 + }, + { + "epoch": 4.133210558624923, + "grad_norm": 0.23504197597503662, + "learning_rate": 6.617607165589785e-05, + "loss": 1.7983, + "step": 13466 + }, + { + "epoch": 4.133517495395949, + "grad_norm": 0.2500833570957184, + "learning_rate": 6.617136832863819e-05, + "loss": 1.7826, + "step": 13467 + }, + { + "epoch": 4.133824432166974, + "grad_norm": 0.22398658096790314, + "learning_rate": 6.616666484156357e-05, + "loss": 1.7281, + "step": 13468 + }, + { + "epoch": 4.134131368937998, + "grad_norm": 0.2537873089313507, + "learning_rate": 6.616196119472052e-05, + "loss": 1.7598, + "step": 13469 + }, + { + "epoch": 4.134438305709024, + "grad_norm": 0.26881173253059387, + "learning_rate": 6.615725738815546e-05, + "loss": 1.8161, + "step": 13470 + }, + { + "epoch": 4.134745242480049, + "grad_norm": 0.3311346471309662, + "learning_rate": 6.615255342191492e-05, + "loss": 1.7954, + "step": 13471 + }, + { + "epoch": 4.135052179251074, + "grad_norm": 0.2562953233718872, + "learning_rate": 6.614784929604539e-05, + "loss": 1.7284, + "step": 13472 + }, + { + "epoch": 4.1353591160221, + "grad_norm": 0.2563154101371765, + "learning_rate": 6.614314501059334e-05, + "loss": 1.7995, + "step": 13473 + }, + { + "epoch": 4.135666052793125, + "grad_norm": 0.24861161410808563, + "learning_rate": 6.613844056560527e-05, + "loss": 1.7589, + 
"step": 13474 + }, + { + "epoch": 4.1359729895641495, + "grad_norm": 0.23815487325191498, + "learning_rate": 6.613373596112769e-05, + "loss": 1.6906, + "step": 13475 + }, + { + "epoch": 4.136279926335175, + "grad_norm": 0.25394049286842346, + "learning_rate": 6.612903119720705e-05, + "loss": 1.781, + "step": 13476 + }, + { + "epoch": 4.1365868631062, + "grad_norm": 0.24501466751098633, + "learning_rate": 6.612432627388988e-05, + "loss": 1.797, + "step": 13477 + }, + { + "epoch": 4.1368937998772255, + "grad_norm": 0.24909707903862, + "learning_rate": 6.611962119122267e-05, + "loss": 1.7643, + "step": 13478 + }, + { + "epoch": 4.137200736648251, + "grad_norm": 0.24954476952552795, + "learning_rate": 6.611491594925192e-05, + "loss": 1.8219, + "step": 13479 + }, + { + "epoch": 4.137507673419275, + "grad_norm": 0.30572372674942017, + "learning_rate": 6.611021054802411e-05, + "loss": 1.8039, + "step": 13480 + }, + { + "epoch": 4.137814610190301, + "grad_norm": 0.27466365694999695, + "learning_rate": 6.610550498758577e-05, + "loss": 1.6945, + "step": 13481 + }, + { + "epoch": 4.138121546961326, + "grad_norm": 0.2614271640777588, + "learning_rate": 6.610079926798339e-05, + "loss": 1.8648, + "step": 13482 + }, + { + "epoch": 4.138428483732351, + "grad_norm": 0.23645827174186707, + "learning_rate": 6.609609338926346e-05, + "loss": 1.7424, + "step": 13483 + }, + { + "epoch": 4.138735420503377, + "grad_norm": 0.24473626911640167, + "learning_rate": 6.609138735147253e-05, + "loss": 1.8036, + "step": 13484 + }, + { + "epoch": 4.139042357274401, + "grad_norm": 0.2472417950630188, + "learning_rate": 6.608668115465706e-05, + "loss": 1.794, + "step": 13485 + }, + { + "epoch": 4.139349294045426, + "grad_norm": 0.25330284237861633, + "learning_rate": 6.608197479886358e-05, + "loss": 1.8052, + "step": 13486 + }, + { + "epoch": 4.139656230816452, + "grad_norm": 0.24279309809207916, + "learning_rate": 6.60772682841386e-05, + "loss": 1.7375, + "step": 13487 + }, + { + "epoch": 
4.139963167587477, + "grad_norm": 0.22319461405277252, + "learning_rate": 6.607256161052862e-05, + "loss": 1.7696, + "step": 13488 + }, + { + "epoch": 4.140270104358502, + "grad_norm": 0.25261563062667847, + "learning_rate": 6.606785477808017e-05, + "loss": 1.7646, + "step": 13489 + }, + { + "epoch": 4.140577041129528, + "grad_norm": 0.3127744793891907, + "learning_rate": 6.606314778683977e-05, + "loss": 1.7899, + "step": 13490 + }, + { + "epoch": 4.140883977900552, + "grad_norm": 0.3550816774368286, + "learning_rate": 6.605844063685392e-05, + "loss": 1.7971, + "step": 13491 + }, + { + "epoch": 4.1411909146715775, + "grad_norm": 0.20977813005447388, + "learning_rate": 6.605373332816916e-05, + "loss": 1.7416, + "step": 13492 + }, + { + "epoch": 4.141497851442603, + "grad_norm": 0.26593849062919617, + "learning_rate": 6.6049025860832e-05, + "loss": 1.7586, + "step": 13493 + }, + { + "epoch": 4.141804788213628, + "grad_norm": 0.2452937364578247, + "learning_rate": 6.604431823488893e-05, + "loss": 1.757, + "step": 13494 + }, + { + "epoch": 4.1421117249846535, + "grad_norm": 0.21029168367385864, + "learning_rate": 6.603961045038652e-05, + "loss": 1.7665, + "step": 13495 + }, + { + "epoch": 4.142418661755678, + "grad_norm": 0.2396312952041626, + "learning_rate": 6.603490250737128e-05, + "loss": 1.7609, + "step": 13496 + }, + { + "epoch": 4.142725598526703, + "grad_norm": 0.23266808688640594, + "learning_rate": 6.603019440588975e-05, + "loss": 1.7893, + "step": 13497 + }, + { + "epoch": 4.143032535297729, + "grad_norm": 0.25235217809677124, + "learning_rate": 6.602548614598842e-05, + "loss": 1.7465, + "step": 13498 + }, + { + "epoch": 4.143339472068754, + "grad_norm": 0.22944024205207825, + "learning_rate": 6.602077772771386e-05, + "loss": 1.7052, + "step": 13499 + }, + { + "epoch": 4.143646408839779, + "grad_norm": 0.2116660475730896, + "learning_rate": 6.601606915111257e-05, + "loss": 1.7042, + "step": 13500 + }, + { + "epoch": 4.143953345610804, + "grad_norm": 
0.21777184307575226, + "learning_rate": 6.601136041623111e-05, + "loss": 1.7938, + "step": 13501 + }, + { + "epoch": 4.144260282381829, + "grad_norm": 0.23663075268268585, + "learning_rate": 6.600665152311601e-05, + "loss": 1.7475, + "step": 13502 + }, + { + "epoch": 4.144567219152854, + "grad_norm": 0.20644642412662506, + "learning_rate": 6.600194247181377e-05, + "loss": 1.7992, + "step": 13503 + }, + { + "epoch": 4.14487415592388, + "grad_norm": 0.21479010581970215, + "learning_rate": 6.599723326237098e-05, + "loss": 1.7877, + "step": 13504 + }, + { + "epoch": 4.145181092694905, + "grad_norm": 0.2266562283039093, + "learning_rate": 6.599252389483413e-05, + "loss": 1.8097, + "step": 13505 + }, + { + "epoch": 4.14548802946593, + "grad_norm": 0.2053738683462143, + "learning_rate": 6.59878143692498e-05, + "loss": 1.6878, + "step": 13506 + }, + { + "epoch": 4.145794966236955, + "grad_norm": 0.19583995640277863, + "learning_rate": 6.598310468566452e-05, + "loss": 1.7547, + "step": 13507 + }, + { + "epoch": 4.14610190300798, + "grad_norm": 0.23421542346477509, + "learning_rate": 6.597839484412484e-05, + "loss": 1.7926, + "step": 13508 + }, + { + "epoch": 4.1464088397790055, + "grad_norm": 0.24575260281562805, + "learning_rate": 6.597368484467728e-05, + "loss": 1.7311, + "step": 13509 + }, + { + "epoch": 4.146715776550031, + "grad_norm": 0.27519574761390686, + "learning_rate": 6.596897468736842e-05, + "loss": 1.7858, + "step": 13510 + }, + { + "epoch": 4.147022713321056, + "grad_norm": 0.26434022188186646, + "learning_rate": 6.596426437224477e-05, + "loss": 1.7387, + "step": 13511 + }, + { + "epoch": 4.147329650092081, + "grad_norm": 0.2192772775888443, + "learning_rate": 6.595955389935291e-05, + "loss": 1.7565, + "step": 13512 + }, + { + "epoch": 4.147636586863106, + "grad_norm": 0.21047350764274597, + "learning_rate": 6.595484326873938e-05, + "loss": 1.7234, + "step": 13513 + }, + { + "epoch": 4.147943523634131, + "grad_norm": 0.22838951647281647, + "learning_rate": 
6.595013248045075e-05, + "loss": 1.8205, + "step": 13514 + }, + { + "epoch": 4.148250460405157, + "grad_norm": 0.3467923402786255, + "learning_rate": 6.594542153453356e-05, + "loss": 1.7973, + "step": 13515 + }, + { + "epoch": 4.148557397176182, + "grad_norm": 0.241237074136734, + "learning_rate": 6.594071043103438e-05, + "loss": 1.7764, + "step": 13516 + }, + { + "epoch": 4.148864333947207, + "grad_norm": 0.22543516755104065, + "learning_rate": 6.593599916999973e-05, + "loss": 1.7528, + "step": 13517 + }, + { + "epoch": 4.149171270718232, + "grad_norm": 0.24590276181697845, + "learning_rate": 6.593128775147623e-05, + "loss": 1.7422, + "step": 13518 + }, + { + "epoch": 4.149478207489257, + "grad_norm": 0.2434391975402832, + "learning_rate": 6.592657617551038e-05, + "loss": 1.7523, + "step": 13519 + }, + { + "epoch": 4.149785144260282, + "grad_norm": 0.23169009387493134, + "learning_rate": 6.592186444214877e-05, + "loss": 1.8158, + "step": 13520 + }, + { + "epoch": 4.150092081031308, + "grad_norm": 0.2217840999364853, + "learning_rate": 6.591715255143798e-05, + "loss": 1.7487, + "step": 13521 + }, + { + "epoch": 4.150399017802333, + "grad_norm": 0.2405092418193817, + "learning_rate": 6.591244050342454e-05, + "loss": 1.7726, + "step": 13522 + }, + { + "epoch": 4.150705954573358, + "grad_norm": 0.29432612657546997, + "learning_rate": 6.590772829815504e-05, + "loss": 1.7841, + "step": 13523 + }, + { + "epoch": 4.151012891344383, + "grad_norm": 0.2708737850189209, + "learning_rate": 6.590301593567605e-05, + "loss": 1.8551, + "step": 13524 + }, + { + "epoch": 4.151319828115408, + "grad_norm": 0.26643216609954834, + "learning_rate": 6.589830341603413e-05, + "loss": 1.7697, + "step": 13525 + }, + { + "epoch": 4.151626764886434, + "grad_norm": 0.3672652840614319, + "learning_rate": 6.589359073927587e-05, + "loss": 1.8292, + "step": 13526 + }, + { + "epoch": 4.151933701657459, + "grad_norm": 0.2413325160741806, + "learning_rate": 6.588887790544782e-05, + "loss": 1.7514, + 
"step": 13527 + }, + { + "epoch": 4.152240638428483, + "grad_norm": 0.3248155117034912, + "learning_rate": 6.588416491459657e-05, + "loss": 1.7437, + "step": 13528 + }, + { + "epoch": 4.152547575199509, + "grad_norm": 0.40951836109161377, + "learning_rate": 6.587945176676869e-05, + "loss": 1.7779, + "step": 13529 + }, + { + "epoch": 4.152854511970534, + "grad_norm": 0.23874351382255554, + "learning_rate": 6.587473846201075e-05, + "loss": 1.8343, + "step": 13530 + }, + { + "epoch": 4.153161448741559, + "grad_norm": 0.4535207450389862, + "learning_rate": 6.587002500036936e-05, + "loss": 1.8301, + "step": 13531 + }, + { + "epoch": 4.153468385512585, + "grad_norm": 0.458003968000412, + "learning_rate": 6.586531138189108e-05, + "loss": 1.7053, + "step": 13532 + }, + { + "epoch": 4.153775322283609, + "grad_norm": 0.24350887537002563, + "learning_rate": 6.586059760662248e-05, + "loss": 1.7642, + "step": 13533 + }, + { + "epoch": 4.1540822590546345, + "grad_norm": 0.46951553225517273, + "learning_rate": 6.585588367461017e-05, + "loss": 1.7345, + "step": 13534 + }, + { + "epoch": 4.15438919582566, + "grad_norm": 0.5524527430534363, + "learning_rate": 6.585116958590072e-05, + "loss": 1.7677, + "step": 13535 + }, + { + "epoch": 4.154696132596685, + "grad_norm": 0.2887112498283386, + "learning_rate": 6.584645534054072e-05, + "loss": 1.7704, + "step": 13536 + }, + { + "epoch": 4.1550030693677105, + "grad_norm": 0.36243724822998047, + "learning_rate": 6.584174093857675e-05, + "loss": 1.8133, + "step": 13537 + }, + { + "epoch": 4.155310006138736, + "grad_norm": 0.3869550824165344, + "learning_rate": 6.583702638005543e-05, + "loss": 1.7253, + "step": 13538 + }, + { + "epoch": 4.15561694290976, + "grad_norm": 0.25859662890434265, + "learning_rate": 6.583231166502333e-05, + "loss": 1.7683, + "step": 13539 + }, + { + "epoch": 4.155923879680786, + "grad_norm": 0.3011144995689392, + "learning_rate": 6.582759679352704e-05, + "loss": 1.7139, + "step": 13540 + }, + { + "epoch": 
4.156230816451811, + "grad_norm": 0.38033372163772583, + "learning_rate": 6.582288176561316e-05, + "loss": 1.8182, + "step": 13541 + }, + { + "epoch": 4.156537753222836, + "grad_norm": 0.2224060595035553, + "learning_rate": 6.581816658132829e-05, + "loss": 1.7527, + "step": 13542 + }, + { + "epoch": 4.156844689993862, + "grad_norm": 0.4147234261035919, + "learning_rate": 6.581345124071903e-05, + "loss": 1.7339, + "step": 13543 + }, + { + "epoch": 4.157151626764886, + "grad_norm": 0.45334625244140625, + "learning_rate": 6.580873574383198e-05, + "loss": 1.8166, + "step": 13544 + }, + { + "epoch": 4.157458563535911, + "grad_norm": 0.3050530254840851, + "learning_rate": 6.580402009071372e-05, + "loss": 1.7967, + "step": 13545 + }, + { + "epoch": 4.157765500306937, + "grad_norm": 0.25901293754577637, + "learning_rate": 6.579930428141088e-05, + "loss": 1.7806, + "step": 13546 + }, + { + "epoch": 4.158072437077962, + "grad_norm": 0.3142934739589691, + "learning_rate": 6.579458831597006e-05, + "loss": 1.7724, + "step": 13547 + }, + { + "epoch": 4.158379373848987, + "grad_norm": 0.23943179845809937, + "learning_rate": 6.578987219443787e-05, + "loss": 1.7515, + "step": 13548 + }, + { + "epoch": 4.158686310620013, + "grad_norm": 0.2838635742664337, + "learning_rate": 6.578515591686089e-05, + "loss": 1.7707, + "step": 13549 + }, + { + "epoch": 4.158993247391037, + "grad_norm": 0.3064457178115845, + "learning_rate": 6.578043948328575e-05, + "loss": 1.7839, + "step": 13550 + }, + { + "epoch": 4.1593001841620625, + "grad_norm": 0.2311718463897705, + "learning_rate": 6.577572289375907e-05, + "loss": 1.8298, + "step": 13551 + }, + { + "epoch": 4.159607120933088, + "grad_norm": 0.35726481676101685, + "learning_rate": 6.577100614832743e-05, + "loss": 1.811, + "step": 13552 + }, + { + "epoch": 4.159914057704113, + "grad_norm": 0.3176140785217285, + "learning_rate": 6.576628924703749e-05, + "loss": 1.732, + "step": 13553 + }, + { + "epoch": 4.1602209944751385, + "grad_norm": 
0.2325647473335266, + "learning_rate": 6.576157218993582e-05, + "loss": 1.827, + "step": 13554 + }, + { + "epoch": 4.160527931246163, + "grad_norm": 0.32260453701019287, + "learning_rate": 6.575685497706905e-05, + "loss": 1.8218, + "step": 13555 + }, + { + "epoch": 4.160834868017188, + "grad_norm": 0.2638537287712097, + "learning_rate": 6.575213760848382e-05, + "loss": 1.7091, + "step": 13556 + }, + { + "epoch": 4.161141804788214, + "grad_norm": 0.2501799762248993, + "learning_rate": 6.574742008422671e-05, + "loss": 1.7707, + "step": 13557 + }, + { + "epoch": 4.161448741559239, + "grad_norm": 0.3212645649909973, + "learning_rate": 6.574270240434439e-05, + "loss": 1.7541, + "step": 13558 + }, + { + "epoch": 4.161755678330264, + "grad_norm": 0.25915586948394775, + "learning_rate": 6.573798456888345e-05, + "loss": 1.7597, + "step": 13559 + }, + { + "epoch": 4.162062615101289, + "grad_norm": 0.2538192868232727, + "learning_rate": 6.573326657789052e-05, + "loss": 1.8507, + "step": 13560 + }, + { + "epoch": 4.162369551872314, + "grad_norm": 0.2542131543159485, + "learning_rate": 6.572854843141223e-05, + "loss": 1.782, + "step": 13561 + }, + { + "epoch": 4.162676488643339, + "grad_norm": 0.26163414120674133, + "learning_rate": 6.572383012949521e-05, + "loss": 1.8482, + "step": 13562 + }, + { + "epoch": 4.162983425414365, + "grad_norm": 0.2566238343715668, + "learning_rate": 6.571911167218608e-05, + "loss": 1.7284, + "step": 13563 + }, + { + "epoch": 4.16329036218539, + "grad_norm": 0.28413113951683044, + "learning_rate": 6.571439305953147e-05, + "loss": 1.7473, + "step": 13564 + }, + { + "epoch": 4.163597298956415, + "grad_norm": 0.20399242639541626, + "learning_rate": 6.570967429157802e-05, + "loss": 1.6942, + "step": 13565 + }, + { + "epoch": 4.16390423572744, + "grad_norm": 0.256104439496994, + "learning_rate": 6.570495536837235e-05, + "loss": 1.7346, + "step": 13566 + }, + { + "epoch": 4.164211172498465, + "grad_norm": 0.350909560918808, + "learning_rate": 
6.570023628996112e-05, + "loss": 1.8284, + "step": 13567 + }, + { + "epoch": 4.1645181092694905, + "grad_norm": 0.23500367999076843, + "learning_rate": 6.569551705639096e-05, + "loss": 1.7504, + "step": 13568 + }, + { + "epoch": 4.164825046040516, + "grad_norm": 0.26683783531188965, + "learning_rate": 6.569079766770849e-05, + "loss": 1.7293, + "step": 13569 + }, + { + "epoch": 4.165131982811541, + "grad_norm": 0.3145855963230133, + "learning_rate": 6.568607812396037e-05, + "loss": 1.8171, + "step": 13570 + }, + { + "epoch": 4.165438919582566, + "grad_norm": 0.2354860156774521, + "learning_rate": 6.568135842519324e-05, + "loss": 1.7555, + "step": 13571 + }, + { + "epoch": 4.165745856353591, + "grad_norm": 0.2893243730068207, + "learning_rate": 6.56766385714537e-05, + "loss": 1.7636, + "step": 13572 + }, + { + "epoch": 4.166052793124616, + "grad_norm": 0.20707663893699646, + "learning_rate": 6.567191856278846e-05, + "loss": 1.7239, + "step": 13573 + }, + { + "epoch": 4.166359729895642, + "grad_norm": 0.34200331568717957, + "learning_rate": 6.566719839924412e-05, + "loss": 1.7848, + "step": 13574 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.23326615989208221, + "learning_rate": 6.566247808086734e-05, + "loss": 1.7447, + "step": 13575 + }, + { + "epoch": 4.166973603437691, + "grad_norm": 0.22375629842281342, + "learning_rate": 6.565775760770479e-05, + "loss": 1.7429, + "step": 13576 + }, + { + "epoch": 4.167280540208717, + "grad_norm": 0.2412862777709961, + "learning_rate": 6.565303697980308e-05, + "loss": 1.7671, + "step": 13577 + }, + { + "epoch": 4.167587476979742, + "grad_norm": 0.2482215315103531, + "learning_rate": 6.56483161972089e-05, + "loss": 1.812, + "step": 13578 + }, + { + "epoch": 4.167894413750767, + "grad_norm": 0.2252974659204483, + "learning_rate": 6.564359525996889e-05, + "loss": 1.8173, + "step": 13579 + }, + { + "epoch": 4.168201350521793, + "grad_norm": 0.23497292399406433, + "learning_rate": 6.563887416812969e-05, + "loss": 1.7945, + 
"step": 13580 + }, + { + "epoch": 4.168508287292818, + "grad_norm": 0.24911245703697205, + "learning_rate": 6.563415292173796e-05, + "loss": 1.7516, + "step": 13581 + }, + { + "epoch": 4.1688152240638425, + "grad_norm": 0.20920930802822113, + "learning_rate": 6.562943152084039e-05, + "loss": 1.765, + "step": 13582 + }, + { + "epoch": 4.169122160834868, + "grad_norm": 0.26001816987991333, + "learning_rate": 6.562470996548361e-05, + "loss": 1.7504, + "step": 13583 + }, + { + "epoch": 4.169429097605893, + "grad_norm": 0.2504529058933258, + "learning_rate": 6.561998825571429e-05, + "loss": 1.7689, + "step": 13584 + }, + { + "epoch": 4.1697360343769185, + "grad_norm": 0.2210187464952469, + "learning_rate": 6.561526639157908e-05, + "loss": 1.752, + "step": 13585 + }, + { + "epoch": 4.170042971147944, + "grad_norm": 0.26323240995407104, + "learning_rate": 6.561054437312467e-05, + "loss": 1.8104, + "step": 13586 + }, + { + "epoch": 4.170349907918968, + "grad_norm": 0.20436744391918182, + "learning_rate": 6.560582220039771e-05, + "loss": 1.7281, + "step": 13587 + }, + { + "epoch": 4.170656844689994, + "grad_norm": 0.2053878903388977, + "learning_rate": 6.560109987344487e-05, + "loss": 1.7192, + "step": 13588 + }, + { + "epoch": 4.170963781461019, + "grad_norm": 0.2416568547487259, + "learning_rate": 6.559637739231281e-05, + "loss": 1.7679, + "step": 13589 + }, + { + "epoch": 4.171270718232044, + "grad_norm": 0.23847989737987518, + "learning_rate": 6.55916547570482e-05, + "loss": 1.7182, + "step": 13590 + }, + { + "epoch": 4.17157765500307, + "grad_norm": 0.2057785540819168, + "learning_rate": 6.558693196769772e-05, + "loss": 1.816, + "step": 13591 + }, + { + "epoch": 4.171884591774095, + "grad_norm": 0.2270805537700653, + "learning_rate": 6.558220902430804e-05, + "loss": 1.7091, + "step": 13592 + }, + { + "epoch": 4.172191528545119, + "grad_norm": 0.22143644094467163, + "learning_rate": 6.557748592692585e-05, + "loss": 1.7446, + "step": 13593 + }, + { + "epoch": 
4.172498465316145, + "grad_norm": 0.2032770961523056, + "learning_rate": 6.557276267559781e-05, + "loss": 1.7501, + "step": 13594 + }, + { + "epoch": 4.17280540208717, + "grad_norm": 0.20851244032382965, + "learning_rate": 6.55680392703706e-05, + "loss": 1.8283, + "step": 13595 + }, + { + "epoch": 4.173112338858195, + "grad_norm": 0.2603934109210968, + "learning_rate": 6.55633157112909e-05, + "loss": 1.8523, + "step": 13596 + }, + { + "epoch": 4.173419275629221, + "grad_norm": 0.2232515811920166, + "learning_rate": 6.55585919984054e-05, + "loss": 1.7803, + "step": 13597 + }, + { + "epoch": 4.173726212400245, + "grad_norm": 0.2541115880012512, + "learning_rate": 6.555386813176075e-05, + "loss": 1.7407, + "step": 13598 + }, + { + "epoch": 4.1740331491712706, + "grad_norm": 0.3044603765010834, + "learning_rate": 6.55491441114037e-05, + "loss": 1.8257, + "step": 13599 + }, + { + "epoch": 4.174340085942296, + "grad_norm": 0.29227301478385925, + "learning_rate": 6.554441993738086e-05, + "loss": 1.7998, + "step": 13600 + }, + { + "epoch": 4.174647022713321, + "grad_norm": 0.25166594982147217, + "learning_rate": 6.553969560973896e-05, + "loss": 1.8258, + "step": 13601 + }, + { + "epoch": 4.1749539594843466, + "grad_norm": 0.22973991930484772, + "learning_rate": 6.55349711285247e-05, + "loss": 1.7871, + "step": 13602 + }, + { + "epoch": 4.175260896255371, + "grad_norm": 0.2615009844303131, + "learning_rate": 6.553024649378473e-05, + "loss": 1.7572, + "step": 13603 + }, + { + "epoch": 4.175567833026396, + "grad_norm": 0.24145473539829254, + "learning_rate": 6.552552170556576e-05, + "loss": 1.7546, + "step": 13604 + }, + { + "epoch": 4.175874769797422, + "grad_norm": 0.21989156305789948, + "learning_rate": 6.55207967639145e-05, + "loss": 1.6939, + "step": 13605 + }, + { + "epoch": 4.176181706568447, + "grad_norm": 0.206025168299675, + "learning_rate": 6.551607166887761e-05, + "loss": 1.7531, + "step": 13606 + }, + { + "epoch": 4.176488643339472, + "grad_norm": 
0.2175903469324112, + "learning_rate": 6.551134642050181e-05, + "loss": 1.7631, + "step": 13607 + }, + { + "epoch": 4.176795580110497, + "grad_norm": 0.23259282112121582, + "learning_rate": 6.550662101883379e-05, + "loss": 1.7773, + "step": 13608 + }, + { + "epoch": 4.177102516881522, + "grad_norm": 0.23955227434635162, + "learning_rate": 6.550189546392025e-05, + "loss": 1.7321, + "step": 13609 + }, + { + "epoch": 4.1774094536525475, + "grad_norm": 0.23614998161792755, + "learning_rate": 6.549716975580792e-05, + "loss": 1.7855, + "step": 13610 + }, + { + "epoch": 4.177716390423573, + "grad_norm": 0.2274426817893982, + "learning_rate": 6.549244389454345e-05, + "loss": 1.7778, + "step": 13611 + }, + { + "epoch": 4.178023327194598, + "grad_norm": 0.2204308807849884, + "learning_rate": 6.548771788017358e-05, + "loss": 1.7175, + "step": 13612 + }, + { + "epoch": 4.1783302639656235, + "grad_norm": 0.2283930778503418, + "learning_rate": 6.548299171274501e-05, + "loss": 1.8081, + "step": 13613 + }, + { + "epoch": 4.178637200736648, + "grad_norm": 0.25433486700057983, + "learning_rate": 6.547826539230442e-05, + "loss": 1.8009, + "step": 13614 + }, + { + "epoch": 4.178944137507673, + "grad_norm": 0.24452579021453857, + "learning_rate": 6.547353891889856e-05, + "loss": 1.7244, + "step": 13615 + }, + { + "epoch": 4.179251074278699, + "grad_norm": 0.20611275732517242, + "learning_rate": 6.546881229257411e-05, + "loss": 1.7566, + "step": 13616 + }, + { + "epoch": 4.179558011049724, + "grad_norm": 0.24557232856750488, + "learning_rate": 6.546408551337779e-05, + "loss": 1.7638, + "step": 13617 + }, + { + "epoch": 4.179864947820749, + "grad_norm": 0.2158801257610321, + "learning_rate": 6.545935858135631e-05, + "loss": 1.7659, + "step": 13618 + }, + { + "epoch": 4.180171884591774, + "grad_norm": 0.23800688982009888, + "learning_rate": 6.54546314965564e-05, + "loss": 1.7468, + "step": 13619 + }, + { + "epoch": 4.180478821362799, + "grad_norm": 0.2504122853279114, + "learning_rate": 
6.544990425902476e-05, + "loss": 1.7682, + "step": 13620 + }, + { + "epoch": 4.180785758133824, + "grad_norm": 0.21556814014911652, + "learning_rate": 6.54451768688081e-05, + "loss": 1.772, + "step": 13621 + }, + { + "epoch": 4.18109269490485, + "grad_norm": 0.23404552042484283, + "learning_rate": 6.544044932595315e-05, + "loss": 1.7844, + "step": 13622 + }, + { + "epoch": 4.181399631675875, + "grad_norm": 0.22129055857658386, + "learning_rate": 6.543572163050664e-05, + "loss": 1.7725, + "step": 13623 + }, + { + "epoch": 4.1817065684469, + "grad_norm": 0.2533521354198456, + "learning_rate": 6.543099378251528e-05, + "loss": 1.7908, + "step": 13624 + }, + { + "epoch": 4.182013505217925, + "grad_norm": 0.2905815541744232, + "learning_rate": 6.542626578202579e-05, + "loss": 1.7913, + "step": 13625 + }, + { + "epoch": 4.18232044198895, + "grad_norm": 0.3330783247947693, + "learning_rate": 6.54215376290849e-05, + "loss": 1.8374, + "step": 13626 + }, + { + "epoch": 4.1826273787599755, + "grad_norm": 0.29268717765808105, + "learning_rate": 6.541680932373933e-05, + "loss": 1.8714, + "step": 13627 + }, + { + "epoch": 4.182934315531001, + "grad_norm": 0.2820781171321869, + "learning_rate": 6.541208086603584e-05, + "loss": 1.8089, + "step": 13628 + }, + { + "epoch": 4.183241252302026, + "grad_norm": 0.3062323033809662, + "learning_rate": 6.54073522560211e-05, + "loss": 1.7307, + "step": 13629 + }, + { + "epoch": 4.183548189073051, + "grad_norm": 0.3010510504245758, + "learning_rate": 6.54026234937419e-05, + "loss": 1.7523, + "step": 13630 + }, + { + "epoch": 4.183855125844076, + "grad_norm": 0.21932095289230347, + "learning_rate": 6.539789457924493e-05, + "loss": 1.737, + "step": 13631 + }, + { + "epoch": 4.184162062615101, + "grad_norm": 0.2710212469100952, + "learning_rate": 6.539316551257695e-05, + "loss": 1.7228, + "step": 13632 + }, + { + "epoch": 4.184468999386127, + "grad_norm": 0.2885816991329193, + "learning_rate": 6.538843629378469e-05, + "loss": 1.8734, + "step": 
13633 + }, + { + "epoch": 4.184775936157152, + "grad_norm": 0.2621026635169983, + "learning_rate": 6.538370692291487e-05, + "loss": 1.7884, + "step": 13634 + }, + { + "epoch": 4.185082872928176, + "grad_norm": 0.30503126978874207, + "learning_rate": 6.537897740001426e-05, + "loss": 1.7833, + "step": 13635 + }, + { + "epoch": 4.185389809699202, + "grad_norm": 0.29491373896598816, + "learning_rate": 6.537424772512955e-05, + "loss": 1.7894, + "step": 13636 + }, + { + "epoch": 4.185696746470227, + "grad_norm": 0.24423296749591827, + "learning_rate": 6.536951789830754e-05, + "loss": 1.7409, + "step": 13637 + }, + { + "epoch": 4.186003683241252, + "grad_norm": 0.2184748351573944, + "learning_rate": 6.536478791959495e-05, + "loss": 1.747, + "step": 13638 + }, + { + "epoch": 4.186310620012278, + "grad_norm": 0.2348455935716629, + "learning_rate": 6.53600577890385e-05, + "loss": 1.7422, + "step": 13639 + }, + { + "epoch": 4.186617556783303, + "grad_norm": 0.2554566264152527, + "learning_rate": 6.535532750668497e-05, + "loss": 1.7623, + "step": 13640 + }, + { + "epoch": 4.1869244935543275, + "grad_norm": 0.26424553990364075, + "learning_rate": 6.535059707258109e-05, + "loss": 1.8408, + "step": 13641 + }, + { + "epoch": 4.187231430325353, + "grad_norm": 0.35363274812698364, + "learning_rate": 6.534586648677361e-05, + "loss": 1.7435, + "step": 13642 + }, + { + "epoch": 4.187538367096378, + "grad_norm": 0.3225265443325043, + "learning_rate": 6.534113574930926e-05, + "loss": 1.7181, + "step": 13643 + }, + { + "epoch": 4.1878453038674035, + "grad_norm": 0.23529650270938873, + "learning_rate": 6.533640486023485e-05, + "loss": 1.7712, + "step": 13644 + }, + { + "epoch": 4.188152240638429, + "grad_norm": 0.3490132987499237, + "learning_rate": 6.53316738195971e-05, + "loss": 1.7329, + "step": 13645 + }, + { + "epoch": 4.188459177409453, + "grad_norm": 0.3759285509586334, + "learning_rate": 6.532694262744274e-05, + "loss": 1.802, + "step": 13646 + }, + { + "epoch": 4.188766114180479, 
+ "grad_norm": 0.27383577823638916, + "learning_rate": 6.532221128381858e-05, + "loss": 1.801, + "step": 13647 + }, + { + "epoch": 4.189073050951504, + "grad_norm": 0.23240652680397034, + "learning_rate": 6.531747978877132e-05, + "loss": 1.8415, + "step": 13648 + }, + { + "epoch": 4.189379987722529, + "grad_norm": 0.3302704989910126, + "learning_rate": 6.531274814234773e-05, + "loss": 1.7765, + "step": 13649 + }, + { + "epoch": 4.189686924493555, + "grad_norm": 0.3209368586540222, + "learning_rate": 6.530801634459463e-05, + "loss": 1.6935, + "step": 13650 + }, + { + "epoch": 4.189993861264579, + "grad_norm": 0.26643648743629456, + "learning_rate": 6.530328439555872e-05, + "loss": 1.8159, + "step": 13651 + }, + { + "epoch": 4.190300798035604, + "grad_norm": 0.22594431042671204, + "learning_rate": 6.529855229528679e-05, + "loss": 1.7764, + "step": 13652 + }, + { + "epoch": 4.19060773480663, + "grad_norm": 0.3288109302520752, + "learning_rate": 6.529382004382561e-05, + "loss": 1.7963, + "step": 13653 + }, + { + "epoch": 4.190914671577655, + "grad_norm": 0.3067106604576111, + "learning_rate": 6.528908764122191e-05, + "loss": 1.7564, + "step": 13654 + }, + { + "epoch": 4.19122160834868, + "grad_norm": 0.23437078297138214, + "learning_rate": 6.528435508752249e-05, + "loss": 1.759, + "step": 13655 + }, + { + "epoch": 4.191528545119706, + "grad_norm": 0.30662333965301514, + "learning_rate": 6.527962238277413e-05, + "loss": 1.7549, + "step": 13656 + }, + { + "epoch": 4.19183548189073, + "grad_norm": 0.3545009195804596, + "learning_rate": 6.527488952702356e-05, + "loss": 1.7761, + "step": 13657 + }, + { + "epoch": 4.1921424186617555, + "grad_norm": 0.2509438991546631, + "learning_rate": 6.52701565203176e-05, + "loss": 1.7162, + "step": 13658 + }, + { + "epoch": 4.192449355432781, + "grad_norm": 0.24423806369304657, + "learning_rate": 6.5265423362703e-05, + "loss": 1.735, + "step": 13659 + }, + { + "epoch": 4.192756292203806, + "grad_norm": 0.37365156412124634, + 
"learning_rate": 6.526069005422654e-05, + "loss": 1.7697, + "step": 13660 + }, + { + "epoch": 4.1930632289748315, + "grad_norm": 0.4025731682777405, + "learning_rate": 6.525595659493499e-05, + "loss": 1.7931, + "step": 13661 + }, + { + "epoch": 4.193370165745856, + "grad_norm": 0.31360915303230286, + "learning_rate": 6.525122298487514e-05, + "loss": 1.8014, + "step": 13662 + }, + { + "epoch": 4.193677102516881, + "grad_norm": 0.2480524778366089, + "learning_rate": 6.524648922409376e-05, + "loss": 1.7753, + "step": 13663 + }, + { + "epoch": 4.193984039287907, + "grad_norm": 0.33740919828414917, + "learning_rate": 6.524175531263765e-05, + "loss": 1.7296, + "step": 13664 + }, + { + "epoch": 4.194290976058932, + "grad_norm": 0.26871639490127563, + "learning_rate": 6.523702125055358e-05, + "loss": 1.7113, + "step": 13665 + }, + { + "epoch": 4.194597912829957, + "grad_norm": 0.2687455415725708, + "learning_rate": 6.52322870378883e-05, + "loss": 1.7645, + "step": 13666 + }, + { + "epoch": 4.194904849600983, + "grad_norm": 0.4207400679588318, + "learning_rate": 6.522755267468868e-05, + "loss": 1.7758, + "step": 13667 + }, + { + "epoch": 4.195211786372007, + "grad_norm": 0.36043494939804077, + "learning_rate": 6.522281816100142e-05, + "loss": 1.7433, + "step": 13668 + }, + { + "epoch": 4.195518723143032, + "grad_norm": 0.2515890598297119, + "learning_rate": 6.52180834968734e-05, + "loss": 1.7646, + "step": 13669 + }, + { + "epoch": 4.195825659914058, + "grad_norm": 0.2871458828449249, + "learning_rate": 6.521334868235132e-05, + "loss": 1.8147, + "step": 13670 + }, + { + "epoch": 4.196132596685083, + "grad_norm": 0.28454354405403137, + "learning_rate": 6.5208613717482e-05, + "loss": 1.8576, + "step": 13671 + }, + { + "epoch": 4.196439533456108, + "grad_norm": 0.2520541548728943, + "learning_rate": 6.520387860231227e-05, + "loss": 1.7513, + "step": 13672 + }, + { + "epoch": 4.196746470227133, + "grad_norm": 0.22782307863235474, + "learning_rate": 6.51991433368889e-05, + 
"loss": 1.7737, + "step": 13673 + }, + { + "epoch": 4.197053406998158, + "grad_norm": 0.2451259195804596, + "learning_rate": 6.519440792125869e-05, + "loss": 1.7483, + "step": 13674 + }, + { + "epoch": 4.1973603437691835, + "grad_norm": 0.21915963292121887, + "learning_rate": 6.518967235546841e-05, + "loss": 1.718, + "step": 13675 + }, + { + "epoch": 4.197667280540209, + "grad_norm": 0.23005805909633636, + "learning_rate": 6.51849366395649e-05, + "loss": 1.7786, + "step": 13676 + }, + { + "epoch": 4.197974217311234, + "grad_norm": 0.25039517879486084, + "learning_rate": 6.518020077359494e-05, + "loss": 1.7785, + "step": 13677 + }, + { + "epoch": 4.198281154082259, + "grad_norm": 0.26631081104278564, + "learning_rate": 6.517546475760535e-05, + "loss": 1.7921, + "step": 13678 + }, + { + "epoch": 4.198588090853284, + "grad_norm": 0.2220793515443802, + "learning_rate": 6.517072859164292e-05, + "loss": 1.7696, + "step": 13679 + }, + { + "epoch": 4.198895027624309, + "grad_norm": 0.24681030213832855, + "learning_rate": 6.516599227575446e-05, + "loss": 1.7702, + "step": 13680 + }, + { + "epoch": 4.199201964395335, + "grad_norm": 0.2421828955411911, + "learning_rate": 6.516125580998678e-05, + "loss": 1.8058, + "step": 13681 + }, + { + "epoch": 4.19950890116636, + "grad_norm": 0.2170087695121765, + "learning_rate": 6.515651919438667e-05, + "loss": 1.7271, + "step": 13682 + }, + { + "epoch": 4.199815837937384, + "grad_norm": 0.23383566737174988, + "learning_rate": 6.515178242900096e-05, + "loss": 1.7515, + "step": 13683 + }, + { + "epoch": 4.20012277470841, + "grad_norm": 0.2522997558116913, + "learning_rate": 6.514704551387645e-05, + "loss": 1.7619, + "step": 13684 + }, + { + "epoch": 4.200429711479435, + "grad_norm": 0.20973703265190125, + "learning_rate": 6.514230844905995e-05, + "loss": 1.7326, + "step": 13685 + }, + { + "epoch": 4.2007366482504604, + "grad_norm": 0.2308073341846466, + "learning_rate": 6.513757123459832e-05, + "loss": 1.811, + "step": 13686 + }, + { + 
"epoch": 4.201043585021486, + "grad_norm": 0.21751229465007782, + "learning_rate": 6.51328338705383e-05, + "loss": 1.7795, + "step": 13687 + }, + { + "epoch": 4.201350521792511, + "grad_norm": 0.2357407957315445, + "learning_rate": 6.512809635692675e-05, + "loss": 1.8069, + "step": 13688 + }, + { + "epoch": 4.201657458563536, + "grad_norm": 0.32245033979415894, + "learning_rate": 6.51233586938105e-05, + "loss": 1.8179, + "step": 13689 + }, + { + "epoch": 4.201964395334561, + "grad_norm": 0.22740167379379272, + "learning_rate": 6.511862088123635e-05, + "loss": 1.7482, + "step": 13690 + }, + { + "epoch": 4.202271332105586, + "grad_norm": 0.26880496740341187, + "learning_rate": 6.511388291925114e-05, + "loss": 1.7919, + "step": 13691 + }, + { + "epoch": 4.202578268876612, + "grad_norm": 0.2261822521686554, + "learning_rate": 6.510914480790166e-05, + "loss": 1.7543, + "step": 13692 + }, + { + "epoch": 4.202885205647637, + "grad_norm": 0.2635782063007355, + "learning_rate": 6.510440654723477e-05, + "loss": 1.7874, + "step": 13693 + }, + { + "epoch": 4.203192142418661, + "grad_norm": 0.2505982518196106, + "learning_rate": 6.509966813729726e-05, + "loss": 1.8016, + "step": 13694 + }, + { + "epoch": 4.203499079189687, + "grad_norm": 0.23177236318588257, + "learning_rate": 6.5094929578136e-05, + "loss": 1.7582, + "step": 13695 + }, + { + "epoch": 4.203806015960712, + "grad_norm": 0.2315056324005127, + "learning_rate": 6.509019086979779e-05, + "loss": 1.7418, + "step": 13696 + }, + { + "epoch": 4.204112952731737, + "grad_norm": 0.25565484166145325, + "learning_rate": 6.508545201232947e-05, + "loss": 1.7476, + "step": 13697 + }, + { + "epoch": 4.204419889502763, + "grad_norm": 0.29210081696510315, + "learning_rate": 6.508071300577787e-05, + "loss": 1.8397, + "step": 13698 + }, + { + "epoch": 4.204726826273788, + "grad_norm": 0.2830582559108734, + "learning_rate": 6.507597385018984e-05, + "loss": 1.834, + "step": 13699 + }, + { + "epoch": 4.2050337630448125, + "grad_norm": 
0.23013398051261902, + "learning_rate": 6.507123454561217e-05, + "loss": 1.7593, + "step": 13700 + }, + { + "epoch": 4.205340699815838, + "grad_norm": 0.21970276534557343, + "learning_rate": 6.506649509209174e-05, + "loss": 1.754, + "step": 13701 + }, + { + "epoch": 4.205647636586863, + "grad_norm": 0.32052233815193176, + "learning_rate": 6.50617554896754e-05, + "loss": 1.7531, + "step": 13702 + }, + { + "epoch": 4.2059545733578885, + "grad_norm": 0.2597332000732422, + "learning_rate": 6.505701573840995e-05, + "loss": 1.7836, + "step": 13703 + }, + { + "epoch": 4.206261510128914, + "grad_norm": 0.22070355713367462, + "learning_rate": 6.505227583834224e-05, + "loss": 1.7225, + "step": 13704 + }, + { + "epoch": 4.206568446899938, + "grad_norm": 0.27219358086586, + "learning_rate": 6.50475357895191e-05, + "loss": 1.8215, + "step": 13705 + }, + { + "epoch": 4.206875383670964, + "grad_norm": 0.32541659474372864, + "learning_rate": 6.504279559198741e-05, + "loss": 1.7786, + "step": 13706 + }, + { + "epoch": 4.207182320441989, + "grad_norm": 0.25871729850769043, + "learning_rate": 6.5038055245794e-05, + "loss": 1.7621, + "step": 13707 + }, + { + "epoch": 4.207489257213014, + "grad_norm": 0.2190464735031128, + "learning_rate": 6.50333147509857e-05, + "loss": 1.7612, + "step": 13708 + }, + { + "epoch": 4.20779619398404, + "grad_norm": 0.19565832614898682, + "learning_rate": 6.50285741076094e-05, + "loss": 1.7581, + "step": 13709 + }, + { + "epoch": 4.208103130755064, + "grad_norm": 0.1889251321554184, + "learning_rate": 6.50238333157119e-05, + "loss": 1.7611, + "step": 13710 + }, + { + "epoch": 4.208410067526089, + "grad_norm": 0.2013053596019745, + "learning_rate": 6.501909237534008e-05, + "loss": 1.7393, + "step": 13711 + }, + { + "epoch": 4.208717004297115, + "grad_norm": 0.1899433434009552, + "learning_rate": 6.501435128654077e-05, + "loss": 1.7122, + "step": 13712 + }, + { + "epoch": 4.20902394106814, + "grad_norm": 0.19337882101535797, + "learning_rate": 
6.500961004936085e-05, + "loss": 1.7538, + "step": 13713 + }, + { + "epoch": 4.209330877839165, + "grad_norm": 0.20419920980930328, + "learning_rate": 6.500486866384718e-05, + "loss": 1.728, + "step": 13714 + }, + { + "epoch": 4.209637814610191, + "grad_norm": 0.20615679025650024, + "learning_rate": 6.50001271300466e-05, + "loss": 1.7843, + "step": 13715 + }, + { + "epoch": 4.209944751381215, + "grad_norm": 0.22178977727890015, + "learning_rate": 6.499538544800596e-05, + "loss": 1.7751, + "step": 13716 + }, + { + "epoch": 4.2102516881522405, + "grad_norm": 0.23703891038894653, + "learning_rate": 6.499064361777214e-05, + "loss": 1.7304, + "step": 13717 + }, + { + "epoch": 4.210558624923266, + "grad_norm": 0.2785723805427551, + "learning_rate": 6.498590163939198e-05, + "loss": 1.802, + "step": 13718 + }, + { + "epoch": 4.210865561694291, + "grad_norm": 0.23277060687541962, + "learning_rate": 6.498115951291237e-05, + "loss": 1.7316, + "step": 13719 + }, + { + "epoch": 4.2111724984653165, + "grad_norm": 0.22289474308490753, + "learning_rate": 6.497641723838017e-05, + "loss": 1.8469, + "step": 13720 + }, + { + "epoch": 4.211479435236341, + "grad_norm": 0.2715846002101898, + "learning_rate": 6.497167481584221e-05, + "loss": 1.7919, + "step": 13721 + }, + { + "epoch": 4.211786372007366, + "grad_norm": 0.29262226819992065, + "learning_rate": 6.49669322453454e-05, + "loss": 1.8379, + "step": 13722 + }, + { + "epoch": 4.212093308778392, + "grad_norm": 0.29136186838150024, + "learning_rate": 6.49621895269366e-05, + "loss": 1.789, + "step": 13723 + }, + { + "epoch": 4.212400245549417, + "grad_norm": 0.25110194087028503, + "learning_rate": 6.495744666066266e-05, + "loss": 1.7574, + "step": 13724 + }, + { + "epoch": 4.212707182320442, + "grad_norm": 0.2301366776227951, + "learning_rate": 6.495270364657048e-05, + "loss": 1.7637, + "step": 13725 + }, + { + "epoch": 4.213014119091467, + "grad_norm": 0.2556478977203369, + "learning_rate": 6.49479604847069e-05, + "loss": 1.7975, + 
"step": 13726 + }, + { + "epoch": 4.213321055862492, + "grad_norm": 0.2645667493343353, + "learning_rate": 6.494321717511884e-05, + "loss": 1.7594, + "step": 13727 + }, + { + "epoch": 4.213627992633517, + "grad_norm": 0.23664188385009766, + "learning_rate": 6.493847371785312e-05, + "loss": 1.7963, + "step": 13728 + }, + { + "epoch": 4.213934929404543, + "grad_norm": 0.2947930693626404, + "learning_rate": 6.493373011295665e-05, + "loss": 1.7477, + "step": 13729 + }, + { + "epoch": 4.214241866175568, + "grad_norm": 0.34598737955093384, + "learning_rate": 6.492898636047631e-05, + "loss": 1.7014, + "step": 13730 + }, + { + "epoch": 4.214548802946593, + "grad_norm": 0.24406935274600983, + "learning_rate": 6.4924242460459e-05, + "loss": 1.7436, + "step": 13731 + }, + { + "epoch": 4.214855739717618, + "grad_norm": 0.27176225185394287, + "learning_rate": 6.491949841295156e-05, + "loss": 1.8429, + "step": 13732 + }, + { + "epoch": 4.215162676488643, + "grad_norm": 0.2506968080997467, + "learning_rate": 6.491475421800089e-05, + "loss": 1.7519, + "step": 13733 + }, + { + "epoch": 4.2154696132596685, + "grad_norm": 0.2240980863571167, + "learning_rate": 6.491000987565387e-05, + "loss": 1.7595, + "step": 13734 + }, + { + "epoch": 4.215776550030694, + "grad_norm": 0.23201732337474823, + "learning_rate": 6.490526538595741e-05, + "loss": 1.7466, + "step": 13735 + }, + { + "epoch": 4.216083486801719, + "grad_norm": 0.24624750018119812, + "learning_rate": 6.490052074895836e-05, + "loss": 1.7364, + "step": 13736 + }, + { + "epoch": 4.216390423572744, + "grad_norm": 0.22936980426311493, + "learning_rate": 6.489577596470366e-05, + "loss": 1.7095, + "step": 13737 + }, + { + "epoch": 4.216697360343769, + "grad_norm": 0.2106638103723526, + "learning_rate": 6.489103103324016e-05, + "loss": 1.7387, + "step": 13738 + }, + { + "epoch": 4.217004297114794, + "grad_norm": 0.2936140298843384, + "learning_rate": 6.488628595461477e-05, + "loss": 1.9129, + "step": 13739 + }, + { + "epoch": 
4.21731123388582, + "grad_norm": 0.21871696412563324, + "learning_rate": 6.488154072887435e-05, + "loss": 1.7489, + "step": 13740 + }, + { + "epoch": 4.217618170656845, + "grad_norm": 0.25941070914268494, + "learning_rate": 6.487679535606583e-05, + "loss": 1.7788, + "step": 13741 + }, + { + "epoch": 4.21792510742787, + "grad_norm": 0.2540862560272217, + "learning_rate": 6.487204983623612e-05, + "loss": 1.8074, + "step": 13742 + }, + { + "epoch": 4.218232044198895, + "grad_norm": 0.25180327892303467, + "learning_rate": 6.486730416943207e-05, + "loss": 1.7503, + "step": 13743 + }, + { + "epoch": 4.21853898096992, + "grad_norm": 0.26625585556030273, + "learning_rate": 6.486255835570063e-05, + "loss": 1.8149, + "step": 13744 + }, + { + "epoch": 4.218845917740945, + "grad_norm": 0.3023914396762848, + "learning_rate": 6.485781239508867e-05, + "loss": 1.8599, + "step": 13745 + }, + { + "epoch": 4.219152854511971, + "grad_norm": 0.2683780789375305, + "learning_rate": 6.48530662876431e-05, + "loss": 1.7911, + "step": 13746 + }, + { + "epoch": 4.219459791282996, + "grad_norm": 0.20747442543506622, + "learning_rate": 6.484832003341081e-05, + "loss": 1.7343, + "step": 13747 + }, + { + "epoch": 4.2197667280540205, + "grad_norm": 0.29284465312957764, + "learning_rate": 6.484357363243873e-05, + "loss": 1.7917, + "step": 13748 + }, + { + "epoch": 4.220073664825046, + "grad_norm": 0.24303840100765228, + "learning_rate": 6.483882708477376e-05, + "loss": 1.7921, + "step": 13749 + }, + { + "epoch": 4.220380601596071, + "grad_norm": 0.26253026723861694, + "learning_rate": 6.48340803904628e-05, + "loss": 1.7971, + "step": 13750 + }, + { + "epoch": 4.2206875383670965, + "grad_norm": 0.23888511955738068, + "learning_rate": 6.482933354955275e-05, + "loss": 1.7967, + "step": 13751 + }, + { + "epoch": 4.220994475138122, + "grad_norm": 0.24966883659362793, + "learning_rate": 6.482458656209054e-05, + "loss": 1.7924, + "step": 13752 + }, + { + "epoch": 4.221301411909146, + "grad_norm": 
0.26556864380836487, + "learning_rate": 6.481983942812309e-05, + "loss": 1.8608, + "step": 13753 + }, + { + "epoch": 4.221608348680172, + "grad_norm": 0.29064711928367615, + "learning_rate": 6.48150921476973e-05, + "loss": 1.7785, + "step": 13754 + }, + { + "epoch": 4.221915285451197, + "grad_norm": 0.30876123905181885, + "learning_rate": 6.481034472086008e-05, + "loss": 1.8287, + "step": 13755 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.2622467875480652, + "learning_rate": 6.480559714765835e-05, + "loss": 1.8336, + "step": 13756 + }, + { + "epoch": 4.222529158993248, + "grad_norm": 0.2502644956111908, + "learning_rate": 6.480084942813902e-05, + "loss": 1.7803, + "step": 13757 + }, + { + "epoch": 4.222836095764273, + "grad_norm": 0.2879922688007355, + "learning_rate": 6.479610156234903e-05, + "loss": 1.7544, + "step": 13758 + }, + { + "epoch": 4.223143032535297, + "grad_norm": 0.2831384241580963, + "learning_rate": 6.47913535503353e-05, + "loss": 1.887, + "step": 13759 + }, + { + "epoch": 4.223449969306323, + "grad_norm": 0.3221064805984497, + "learning_rate": 6.478660539214474e-05, + "loss": 1.7455, + "step": 13760 + }, + { + "epoch": 4.223756906077348, + "grad_norm": 0.4231930673122406, + "learning_rate": 6.478185708782427e-05, + "loss": 1.8209, + "step": 13761 + }, + { + "epoch": 4.224063842848373, + "grad_norm": 0.34327802062034607, + "learning_rate": 6.477710863742083e-05, + "loss": 1.7754, + "step": 13762 + }, + { + "epoch": 4.224370779619399, + "grad_norm": 0.21713349223136902, + "learning_rate": 6.477236004098135e-05, + "loss": 1.7576, + "step": 13763 + }, + { + "epoch": 4.224677716390423, + "grad_norm": 0.3262602388858795, + "learning_rate": 6.476761129855275e-05, + "loss": 1.7772, + "step": 13764 + }, + { + "epoch": 4.2249846531614486, + "grad_norm": 0.3231413662433624, + "learning_rate": 6.476286241018195e-05, + "loss": 1.7821, + "step": 13765 + }, + { + "epoch": 4.225291589932474, + "grad_norm": 0.2440098226070404, + "learning_rate": 
6.475811337591588e-05, + "loss": 1.7684, + "step": 13766 + }, + { + "epoch": 4.225598526703499, + "grad_norm": 0.329949289560318, + "learning_rate": 6.475336419580151e-05, + "loss": 1.8564, + "step": 13767 + }, + { + "epoch": 4.225905463474525, + "grad_norm": 0.3567483425140381, + "learning_rate": 6.474861486988574e-05, + "loss": 1.7625, + "step": 13768 + }, + { + "epoch": 4.226212400245549, + "grad_norm": 0.25257283449172974, + "learning_rate": 6.47438653982155e-05, + "loss": 1.823, + "step": 13769 + }, + { + "epoch": 4.226519337016574, + "grad_norm": 0.31542617082595825, + "learning_rate": 6.473911578083776e-05, + "loss": 1.7817, + "step": 13770 + }, + { + "epoch": 4.2268262737876, + "grad_norm": 0.29670149087905884, + "learning_rate": 6.473436601779944e-05, + "loss": 1.7493, + "step": 13771 + }, + { + "epoch": 4.227133210558625, + "grad_norm": 0.2635453939437866, + "learning_rate": 6.472961610914745e-05, + "loss": 1.792, + "step": 13772 + }, + { + "epoch": 4.22744014732965, + "grad_norm": 0.25017979741096497, + "learning_rate": 6.472486605492878e-05, + "loss": 1.7183, + "step": 13773 + }, + { + "epoch": 4.227747084100676, + "grad_norm": 0.3766646087169647, + "learning_rate": 6.472011585519034e-05, + "loss": 1.8039, + "step": 13774 + }, + { + "epoch": 4.2280540208717, + "grad_norm": 0.29860204458236694, + "learning_rate": 6.47153655099791e-05, + "loss": 1.8016, + "step": 13775 + }, + { + "epoch": 4.2283609576427255, + "grad_norm": 0.2540898323059082, + "learning_rate": 6.4710615019342e-05, + "loss": 1.8481, + "step": 13776 + }, + { + "epoch": 4.228667894413751, + "grad_norm": 0.3677786886692047, + "learning_rate": 6.470586438332597e-05, + "loss": 1.7663, + "step": 13777 + }, + { + "epoch": 4.228974831184776, + "grad_norm": 0.35693466663360596, + "learning_rate": 6.470111360197797e-05, + "loss": 1.7733, + "step": 13778 + }, + { + "epoch": 4.2292817679558015, + "grad_norm": 0.23747926950454712, + "learning_rate": 6.469636267534496e-05, + "loss": 1.7938, + "step": 
13779 + }, + { + "epoch": 4.229588704726826, + "grad_norm": 0.32890695333480835, + "learning_rate": 6.469161160347386e-05, + "loss": 1.7233, + "step": 13780 + }, + { + "epoch": 4.229895641497851, + "grad_norm": 0.3437706530094147, + "learning_rate": 6.468686038641164e-05, + "loss": 1.7716, + "step": 13781 + }, + { + "epoch": 4.230202578268877, + "grad_norm": 0.23452162742614746, + "learning_rate": 6.468210902420527e-05, + "loss": 1.764, + "step": 13782 + }, + { + "epoch": 4.230509515039902, + "grad_norm": 0.3205265402793884, + "learning_rate": 6.46773575169017e-05, + "loss": 1.7464, + "step": 13783 + }, + { + "epoch": 4.230816451810927, + "grad_norm": 0.4234732985496521, + "learning_rate": 6.467260586454787e-05, + "loss": 1.7786, + "step": 13784 + }, + { + "epoch": 4.231123388581952, + "grad_norm": 0.2484128773212433, + "learning_rate": 6.466785406719076e-05, + "loss": 1.8125, + "step": 13785 + }, + { + "epoch": 4.231430325352977, + "grad_norm": 0.3696556091308594, + "learning_rate": 6.46631021248773e-05, + "loss": 1.7974, + "step": 13786 + }, + { + "epoch": 4.231737262124002, + "grad_norm": 0.4251437485218048, + "learning_rate": 6.465835003765449e-05, + "loss": 1.7486, + "step": 13787 + }, + { + "epoch": 4.232044198895028, + "grad_norm": 0.2507621943950653, + "learning_rate": 6.465359780556927e-05, + "loss": 1.829, + "step": 13788 + }, + { + "epoch": 4.232351135666053, + "grad_norm": 0.2911818325519562, + "learning_rate": 6.464884542866861e-05, + "loss": 1.7401, + "step": 13789 + }, + { + "epoch": 4.232658072437078, + "grad_norm": 0.35354506969451904, + "learning_rate": 6.464409290699946e-05, + "loss": 1.7848, + "step": 13790 + }, + { + "epoch": 4.232965009208103, + "grad_norm": 0.2659081518650055, + "learning_rate": 6.46393402406088e-05, + "loss": 1.7408, + "step": 13791 + }, + { + "epoch": 4.233271945979128, + "grad_norm": 0.22676481306552887, + "learning_rate": 6.46345874295436e-05, + "loss": 1.7542, + "step": 13792 + }, + { + "epoch": 4.2335788827501535, + 
"grad_norm": 0.2549789845943451, + "learning_rate": 6.462983447385085e-05, + "loss": 1.8095, + "step": 13793 + }, + { + "epoch": 4.233885819521179, + "grad_norm": 0.2157238870859146, + "learning_rate": 6.462508137357748e-05, + "loss": 1.7529, + "step": 13794 + }, + { + "epoch": 4.234192756292204, + "grad_norm": 0.2494724988937378, + "learning_rate": 6.46203281287705e-05, + "loss": 1.7839, + "step": 13795 + }, + { + "epoch": 4.234499693063229, + "grad_norm": 0.29560065269470215, + "learning_rate": 6.461557473947685e-05, + "loss": 1.7239, + "step": 13796 + }, + { + "epoch": 4.234806629834254, + "grad_norm": 0.23693916201591492, + "learning_rate": 6.461082120574354e-05, + "loss": 1.8074, + "step": 13797 + }, + { + "epoch": 4.235113566605279, + "grad_norm": 0.2538869082927704, + "learning_rate": 6.460606752761752e-05, + "loss": 1.8319, + "step": 13798 + }, + { + "epoch": 4.235420503376305, + "grad_norm": 0.3186401426792145, + "learning_rate": 6.460131370514578e-05, + "loss": 1.7877, + "step": 13799 + }, + { + "epoch": 4.23572744014733, + "grad_norm": 0.2473619133234024, + "learning_rate": 6.45965597383753e-05, + "loss": 1.8323, + "step": 13800 + }, + { + "epoch": 4.236034376918354, + "grad_norm": 0.32806503772735596, + "learning_rate": 6.459180562735307e-05, + "loss": 1.744, + "step": 13801 + }, + { + "epoch": 4.23634131368938, + "grad_norm": 0.3975784480571747, + "learning_rate": 6.458705137212606e-05, + "loss": 1.7216, + "step": 13802 + }, + { + "epoch": 4.236648250460405, + "grad_norm": 0.2946135997772217, + "learning_rate": 6.458229697274125e-05, + "loss": 1.8781, + "step": 13803 + }, + { + "epoch": 4.23695518723143, + "grad_norm": 0.25109192728996277, + "learning_rate": 6.457754242924565e-05, + "loss": 1.7458, + "step": 13804 + }, + { + "epoch": 4.237262124002456, + "grad_norm": 0.2763883173465729, + "learning_rate": 6.457278774168623e-05, + "loss": 1.7612, + "step": 13805 + }, + { + "epoch": 4.237569060773481, + "grad_norm": 0.22427856922149658, + 
"learning_rate": 6.456803291010996e-05, + "loss": 1.8049, + "step": 13806 + }, + { + "epoch": 4.2378759975445055, + "grad_norm": 0.28295788168907166, + "learning_rate": 6.456327793456387e-05, + "loss": 1.7608, + "step": 13807 + }, + { + "epoch": 4.238182934315531, + "grad_norm": 0.27857527136802673, + "learning_rate": 6.455852281509493e-05, + "loss": 1.7281, + "step": 13808 + }, + { + "epoch": 4.238489871086556, + "grad_norm": 0.24014849960803986, + "learning_rate": 6.455376755175012e-05, + "loss": 1.7247, + "step": 13809 + }, + { + "epoch": 4.2387968078575815, + "grad_norm": 0.25149038434028625, + "learning_rate": 6.454901214457646e-05, + "loss": 1.8575, + "step": 13810 + }, + { + "epoch": 4.239103744628607, + "grad_norm": 0.32072681188583374, + "learning_rate": 6.454425659362093e-05, + "loss": 1.7421, + "step": 13811 + }, + { + "epoch": 4.239410681399631, + "grad_norm": 0.28418242931365967, + "learning_rate": 6.453950089893054e-05, + "loss": 1.7031, + "step": 13812 + }, + { + "epoch": 4.239717618170657, + "grad_norm": 0.23725132644176483, + "learning_rate": 6.453474506055228e-05, + "loss": 1.7901, + "step": 13813 + }, + { + "epoch": 4.240024554941682, + "grad_norm": 0.3056317865848541, + "learning_rate": 6.452998907853315e-05, + "loss": 1.7414, + "step": 13814 + }, + { + "epoch": 4.240331491712707, + "grad_norm": 0.3111891448497772, + "learning_rate": 6.452523295292013e-05, + "loss": 1.7532, + "step": 13815 + }, + { + "epoch": 4.240638428483733, + "grad_norm": 0.2126779705286026, + "learning_rate": 6.452047668376027e-05, + "loss": 1.6779, + "step": 13816 + }, + { + "epoch": 4.240945365254758, + "grad_norm": 0.26660779118537903, + "learning_rate": 6.451572027110054e-05, + "loss": 1.7162, + "step": 13817 + }, + { + "epoch": 4.241252302025782, + "grad_norm": 0.25901922583580017, + "learning_rate": 6.451096371498794e-05, + "loss": 1.7784, + "step": 13818 + }, + { + "epoch": 4.241559238796808, + "grad_norm": 0.24091807007789612, + "learning_rate": 
6.450620701546953e-05, + "loss": 1.7928, + "step": 13819 + }, + { + "epoch": 4.241866175567833, + "grad_norm": 0.25097009539604187, + "learning_rate": 6.450145017259225e-05, + "loss": 1.761, + "step": 13820 + }, + { + "epoch": 4.242173112338858, + "grad_norm": 0.22978942096233368, + "learning_rate": 6.449669318640315e-05, + "loss": 1.7891, + "step": 13821 + }, + { + "epoch": 4.242480049109884, + "grad_norm": 0.27255937457084656, + "learning_rate": 6.449193605694923e-05, + "loss": 1.7964, + "step": 13822 + }, + { + "epoch": 4.242786985880908, + "grad_norm": 0.2210773378610611, + "learning_rate": 6.44871787842775e-05, + "loss": 1.7628, + "step": 13823 + }, + { + "epoch": 4.2430939226519335, + "grad_norm": 0.25784751772880554, + "learning_rate": 6.448242136843497e-05, + "loss": 1.7596, + "step": 13824 + }, + { + "epoch": 4.243400859422959, + "grad_norm": 0.23475486040115356, + "learning_rate": 6.447766380946868e-05, + "loss": 1.8174, + "step": 13825 + }, + { + "epoch": 4.243707796193984, + "grad_norm": 0.2567705512046814, + "learning_rate": 6.447290610742561e-05, + "loss": 1.737, + "step": 13826 + }, + { + "epoch": 4.2440147329650095, + "grad_norm": 0.23973144590854645, + "learning_rate": 6.446814826235281e-05, + "loss": 1.7881, + "step": 13827 + }, + { + "epoch": 4.244321669736034, + "grad_norm": 0.25584739446640015, + "learning_rate": 6.446339027429729e-05, + "loss": 1.7673, + "step": 13828 + }, + { + "epoch": 4.244628606507059, + "grad_norm": 0.2653748393058777, + "learning_rate": 6.445863214330608e-05, + "loss": 1.7443, + "step": 13829 + }, + { + "epoch": 4.244935543278085, + "grad_norm": 0.2492038607597351, + "learning_rate": 6.445387386942619e-05, + "loss": 1.7223, + "step": 13830 + }, + { + "epoch": 4.24524248004911, + "grad_norm": 0.2282228320837021, + "learning_rate": 6.444911545270464e-05, + "loss": 1.7577, + "step": 13831 + }, + { + "epoch": 4.245549416820135, + "grad_norm": 0.2411092072725296, + "learning_rate": 6.444435689318845e-05, + "loss": 1.7324, + 
"step": 13832 + }, + { + "epoch": 4.245856353591161, + "grad_norm": 0.21557089686393738, + "learning_rate": 6.443959819092468e-05, + "loss": 1.7355, + "step": 13833 + }, + { + "epoch": 4.246163290362185, + "grad_norm": 0.2500394880771637, + "learning_rate": 6.443483934596033e-05, + "loss": 1.775, + "step": 13834 + }, + { + "epoch": 4.24647022713321, + "grad_norm": 0.24135248363018036, + "learning_rate": 6.443008035834244e-05, + "loss": 1.7885, + "step": 13835 + }, + { + "epoch": 4.246777163904236, + "grad_norm": 0.22860904037952423, + "learning_rate": 6.442532122811803e-05, + "loss": 1.7891, + "step": 13836 + }, + { + "epoch": 4.247084100675261, + "grad_norm": 0.2277665138244629, + "learning_rate": 6.442056195533415e-05, + "loss": 1.7583, + "step": 13837 + }, + { + "epoch": 4.247391037446286, + "grad_norm": 0.22822454571723938, + "learning_rate": 6.441580254003782e-05, + "loss": 1.7777, + "step": 13838 + }, + { + "epoch": 4.247697974217311, + "grad_norm": 0.24274896085262299, + "learning_rate": 6.441104298227608e-05, + "loss": 1.7537, + "step": 13839 + }, + { + "epoch": 4.248004910988336, + "grad_norm": 0.25080999732017517, + "learning_rate": 6.440628328209598e-05, + "loss": 1.7537, + "step": 13840 + }, + { + "epoch": 4.2483118477593615, + "grad_norm": 0.22409579157829285, + "learning_rate": 6.440152343954453e-05, + "loss": 1.7652, + "step": 13841 + }, + { + "epoch": 4.248618784530387, + "grad_norm": 0.24028798937797546, + "learning_rate": 6.439676345466877e-05, + "loss": 1.7512, + "step": 13842 + }, + { + "epoch": 4.248925721301412, + "grad_norm": 0.28739503026008606, + "learning_rate": 6.439200332751576e-05, + "loss": 1.8034, + "step": 13843 + }, + { + "epoch": 4.249232658072437, + "grad_norm": 0.2244807928800583, + "learning_rate": 6.438724305813255e-05, + "loss": 1.7243, + "step": 13844 + }, + { + "epoch": 4.249539594843462, + "grad_norm": 0.24478118121623993, + "learning_rate": 6.438248264656618e-05, + "loss": 1.7754, + "step": 13845 + }, + { + "epoch": 
4.249846531614487, + "grad_norm": 0.25554370880126953, + "learning_rate": 6.437772209286368e-05, + "loss": 1.7845, + "step": 13846 + }, + { + "epoch": 4.250153468385513, + "grad_norm": 0.24478472769260406, + "learning_rate": 6.43729613970721e-05, + "loss": 1.7954, + "step": 13847 + }, + { + "epoch": 4.250460405156538, + "grad_norm": 0.22287282347679138, + "learning_rate": 6.436820055923849e-05, + "loss": 1.7379, + "step": 13848 + }, + { + "epoch": 4.250767341927563, + "grad_norm": 0.2810569703578949, + "learning_rate": 6.43634395794099e-05, + "loss": 1.8492, + "step": 13849 + }, + { + "epoch": 4.251074278698588, + "grad_norm": 0.2544163465499878, + "learning_rate": 6.435867845763337e-05, + "loss": 1.7846, + "step": 13850 + }, + { + "epoch": 4.251381215469613, + "grad_norm": 0.27879175543785095, + "learning_rate": 6.435391719395598e-05, + "loss": 1.767, + "step": 13851 + }, + { + "epoch": 4.2516881522406385, + "grad_norm": 0.2876715362071991, + "learning_rate": 6.434915578842477e-05, + "loss": 1.8048, + "step": 13852 + }, + { + "epoch": 4.251995089011664, + "grad_norm": 0.27844297885894775, + "learning_rate": 6.434439424108678e-05, + "loss": 1.7472, + "step": 13853 + }, + { + "epoch": 4.252302025782689, + "grad_norm": 0.2417020946741104, + "learning_rate": 6.43396325519891e-05, + "loss": 1.8481, + "step": 13854 + }, + { + "epoch": 4.252608962553714, + "grad_norm": 0.23828522861003876, + "learning_rate": 6.433487072117874e-05, + "loss": 1.7536, + "step": 13855 + }, + { + "epoch": 4.252915899324739, + "grad_norm": 0.22304333746433258, + "learning_rate": 6.43301087487028e-05, + "loss": 1.741, + "step": 13856 + }, + { + "epoch": 4.253222836095764, + "grad_norm": 0.27089163661003113, + "learning_rate": 6.432534663460832e-05, + "loss": 1.7974, + "step": 13857 + }, + { + "epoch": 4.25352977286679, + "grad_norm": 0.2439592182636261, + "learning_rate": 6.432058437894237e-05, + "loss": 1.7713, + "step": 13858 + }, + { + "epoch": 4.253836709637815, + "grad_norm": 
0.2368553727865219, + "learning_rate": 6.431582198175203e-05, + "loss": 1.6915, + "step": 13859 + }, + { + "epoch": 4.25414364640884, + "grad_norm": 0.25248441100120544, + "learning_rate": 6.431105944308431e-05, + "loss": 1.7286, + "step": 13860 + }, + { + "epoch": 4.254450583179865, + "grad_norm": 0.20928484201431274, + "learning_rate": 6.430629676298634e-05, + "loss": 1.79, + "step": 13861 + }, + { + "epoch": 4.25475751995089, + "grad_norm": 0.25262540578842163, + "learning_rate": 6.430153394150514e-05, + "loss": 1.7443, + "step": 13862 + }, + { + "epoch": 4.255064456721915, + "grad_norm": 0.27508237957954407, + "learning_rate": 6.429677097868783e-05, + "loss": 1.8207, + "step": 13863 + }, + { + "epoch": 4.255371393492941, + "grad_norm": 0.28129303455352783, + "learning_rate": 6.429200787458141e-05, + "loss": 1.7589, + "step": 13864 + }, + { + "epoch": 4.255678330263966, + "grad_norm": 0.3205658495426178, + "learning_rate": 6.428724462923302e-05, + "loss": 1.8037, + "step": 13865 + }, + { + "epoch": 4.2559852670349905, + "grad_norm": 0.24048078060150146, + "learning_rate": 6.428248124268969e-05, + "loss": 1.7303, + "step": 13866 + }, + { + "epoch": 4.256292203806016, + "grad_norm": 0.24742475152015686, + "learning_rate": 6.427771771499852e-05, + "loss": 1.7753, + "step": 13867 + }, + { + "epoch": 4.256599140577041, + "grad_norm": 0.3082354962825775, + "learning_rate": 6.427295404620656e-05, + "loss": 1.7275, + "step": 13868 + }, + { + "epoch": 4.2569060773480665, + "grad_norm": 0.23319822549819946, + "learning_rate": 6.426819023636093e-05, + "loss": 1.7562, + "step": 13869 + }, + { + "epoch": 4.257213014119092, + "grad_norm": 0.2611405551433563, + "learning_rate": 6.426342628550866e-05, + "loss": 1.7417, + "step": 13870 + }, + { + "epoch": 4.257519950890116, + "grad_norm": 0.2577543258666992, + "learning_rate": 6.425866219369686e-05, + "loss": 1.6906, + "step": 13871 + }, + { + "epoch": 4.257826887661142, + "grad_norm": 0.31353357434272766, + "learning_rate": 
6.42538979609726e-05, + "loss": 1.7155, + "step": 13872 + }, + { + "epoch": 4.258133824432167, + "grad_norm": 0.23280073702335358, + "learning_rate": 6.424913358738296e-05, + "loss": 1.7576, + "step": 13873 + }, + { + "epoch": 4.258440761203192, + "grad_norm": 0.24087542295455933, + "learning_rate": 6.424436907297504e-05, + "loss": 1.7622, + "step": 13874 + }, + { + "epoch": 4.258747697974218, + "grad_norm": 0.3146509826183319, + "learning_rate": 6.42396044177959e-05, + "loss": 1.769, + "step": 13875 + }, + { + "epoch": 4.259054634745242, + "grad_norm": 0.2645811438560486, + "learning_rate": 6.423483962189268e-05, + "loss": 1.7713, + "step": 13876 + }, + { + "epoch": 4.259361571516267, + "grad_norm": 0.2166455090045929, + "learning_rate": 6.423007468531238e-05, + "loss": 1.7705, + "step": 13877 + }, + { + "epoch": 4.259668508287293, + "grad_norm": 0.29142528772354126, + "learning_rate": 6.422530960810217e-05, + "loss": 1.7725, + "step": 13878 + }, + { + "epoch": 4.259975445058318, + "grad_norm": 0.28777652978897095, + "learning_rate": 6.422054439030911e-05, + "loss": 1.7853, + "step": 13879 + }, + { + "epoch": 4.260282381829343, + "grad_norm": 0.2285117357969284, + "learning_rate": 6.42157790319803e-05, + "loss": 1.7034, + "step": 13880 + }, + { + "epoch": 4.260589318600369, + "grad_norm": 0.32407644391059875, + "learning_rate": 6.421101353316282e-05, + "loss": 1.7858, + "step": 13881 + }, + { + "epoch": 4.260896255371393, + "grad_norm": 0.4803469777107239, + "learning_rate": 6.420624789390378e-05, + "loss": 1.7337, + "step": 13882 + }, + { + "epoch": 4.2612031921424185, + "grad_norm": 0.4245823919773102, + "learning_rate": 6.420148211425027e-05, + "loss": 1.8024, + "step": 13883 + }, + { + "epoch": 4.261510128913444, + "grad_norm": 0.22298674285411835, + "learning_rate": 6.419671619424938e-05, + "loss": 1.7129, + "step": 13884 + }, + { + "epoch": 4.261817065684469, + "grad_norm": 0.46955862641334534, + "learning_rate": 6.419195013394824e-05, + "loss": 1.7151, + 
"step": 13885 + }, + { + "epoch": 4.2621240024554945, + "grad_norm": 0.4809224009513855, + "learning_rate": 6.418718393339392e-05, + "loss": 1.7697, + "step": 13886 + }, + { + "epoch": 4.262430939226519, + "grad_norm": 0.2741130292415619, + "learning_rate": 6.418241759263353e-05, + "loss": 1.8133, + "step": 13887 + }, + { + "epoch": 4.262737875997544, + "grad_norm": 0.3673117756843567, + "learning_rate": 6.417765111171419e-05, + "loss": 1.7424, + "step": 13888 + }, + { + "epoch": 4.26304481276857, + "grad_norm": 0.4609327018260956, + "learning_rate": 6.417288449068299e-05, + "loss": 1.741, + "step": 13889 + }, + { + "epoch": 4.263351749539595, + "grad_norm": 0.2929460406303406, + "learning_rate": 6.416811772958702e-05, + "loss": 1.8385, + "step": 13890 + }, + { + "epoch": 4.26365868631062, + "grad_norm": 0.2727305293083191, + "learning_rate": 6.416335082847342e-05, + "loss": 1.794, + "step": 13891 + }, + { + "epoch": 4.263965623081646, + "grad_norm": 0.26089411973953247, + "learning_rate": 6.41585837873893e-05, + "loss": 1.7907, + "step": 13892 + }, + { + "epoch": 4.26427255985267, + "grad_norm": 0.24655573070049286, + "learning_rate": 6.415381660638174e-05, + "loss": 1.7481, + "step": 13893 + }, + { + "epoch": 4.264579496623695, + "grad_norm": 0.4186919629573822, + "learning_rate": 6.414904928549787e-05, + "loss": 1.8021, + "step": 13894 + }, + { + "epoch": 4.264886433394721, + "grad_norm": 0.38188236951828003, + "learning_rate": 6.414428182478478e-05, + "loss": 1.75, + "step": 13895 + }, + { + "epoch": 4.265193370165746, + "grad_norm": 0.23686440289020538, + "learning_rate": 6.413951422428963e-05, + "loss": 1.7882, + "step": 13896 + }, + { + "epoch": 4.265500306936771, + "grad_norm": 0.35963737964630127, + "learning_rate": 6.413474648405952e-05, + "loss": 1.7427, + "step": 13897 + }, + { + "epoch": 4.265807243707796, + "grad_norm": 0.38558289408683777, + "learning_rate": 6.412997860414155e-05, + "loss": 1.7622, + "step": 13898 + }, + { + "epoch": 
4.266114180478821, + "grad_norm": 0.2311459481716156, + "learning_rate": 6.412521058458285e-05, + "loss": 1.7894, + "step": 13899 + }, + { + "epoch": 4.2664211172498465, + "grad_norm": 0.2647818624973297, + "learning_rate": 6.412044242543054e-05, + "loss": 1.7399, + "step": 13900 + }, + { + "epoch": 4.266728054020872, + "grad_norm": 0.3174133002758026, + "learning_rate": 6.411567412673174e-05, + "loss": 1.7552, + "step": 13901 + }, + { + "epoch": 4.267034990791897, + "grad_norm": 0.25207316875457764, + "learning_rate": 6.411090568853358e-05, + "loss": 1.7876, + "step": 13902 + }, + { + "epoch": 4.267341927562922, + "grad_norm": 0.24549202620983124, + "learning_rate": 6.410613711088317e-05, + "loss": 1.8554, + "step": 13903 + }, + { + "epoch": 4.267648864333947, + "grad_norm": 0.26293641328811646, + "learning_rate": 6.410136839382765e-05, + "loss": 1.8553, + "step": 13904 + }, + { + "epoch": 4.267955801104972, + "grad_norm": 0.20258362591266632, + "learning_rate": 6.409659953741416e-05, + "loss": 1.7205, + "step": 13905 + }, + { + "epoch": 4.268262737875998, + "grad_norm": 0.24885907769203186, + "learning_rate": 6.409183054168979e-05, + "loss": 1.7718, + "step": 13906 + }, + { + "epoch": 4.268569674647023, + "grad_norm": 0.22737209498882294, + "learning_rate": 6.408706140670169e-05, + "loss": 1.7228, + "step": 13907 + }, + { + "epoch": 4.268876611418047, + "grad_norm": 0.2201235145330429, + "learning_rate": 6.4082292132497e-05, + "loss": 1.7451, + "step": 13908 + }, + { + "epoch": 4.269183548189073, + "grad_norm": 0.24108454585075378, + "learning_rate": 6.407752271912285e-05, + "loss": 1.7531, + "step": 13909 + }, + { + "epoch": 4.269490484960098, + "grad_norm": 0.21723641455173492, + "learning_rate": 6.407275316662636e-05, + "loss": 1.7139, + "step": 13910 + }, + { + "epoch": 4.269797421731123, + "grad_norm": 0.22557848691940308, + "learning_rate": 6.406798347505469e-05, + "loss": 1.7633, + "step": 13911 + }, + { + "epoch": 4.270104358502149, + "grad_norm": 
0.24664700031280518, + "learning_rate": 6.406321364445494e-05, + "loss": 1.7854, + "step": 13912 + }, + { + "epoch": 4.270411295273174, + "grad_norm": 0.2599056661128998, + "learning_rate": 6.405844367487428e-05, + "loss": 1.7662, + "step": 13913 + }, + { + "epoch": 4.2707182320441985, + "grad_norm": 0.2378663718700409, + "learning_rate": 6.405367356635982e-05, + "loss": 1.7477, + "step": 13914 + }, + { + "epoch": 4.271025168815224, + "grad_norm": 0.27158626914024353, + "learning_rate": 6.404890331895876e-05, + "loss": 1.7426, + "step": 13915 + }, + { + "epoch": 4.271332105586249, + "grad_norm": 0.28585317730903625, + "learning_rate": 6.404413293271818e-05, + "loss": 1.7492, + "step": 13916 + }, + { + "epoch": 4.2716390423572745, + "grad_norm": 0.2321750968694687, + "learning_rate": 6.403936240768526e-05, + "loss": 1.8594, + "step": 13917 + }, + { + "epoch": 4.2719459791283, + "grad_norm": 0.25824111700057983, + "learning_rate": 6.40345917439071e-05, + "loss": 1.7622, + "step": 13918 + }, + { + "epoch": 4.272252915899324, + "grad_norm": 0.24641194939613342, + "learning_rate": 6.40298209414309e-05, + "loss": 1.7519, + "step": 13919 + }, + { + "epoch": 4.27255985267035, + "grad_norm": 0.2132398933172226, + "learning_rate": 6.40250500003038e-05, + "loss": 1.7339, + "step": 13920 + }, + { + "epoch": 4.272866789441375, + "grad_norm": 0.22630736231803894, + "learning_rate": 6.402027892057292e-05, + "loss": 1.7396, + "step": 13921 + }, + { + "epoch": 4.2731737262124, + "grad_norm": 0.295163631439209, + "learning_rate": 6.401550770228543e-05, + "loss": 1.8063, + "step": 13922 + }, + { + "epoch": 4.273480662983426, + "grad_norm": 0.2722746729850769, + "learning_rate": 6.401073634548848e-05, + "loss": 1.7775, + "step": 13923 + }, + { + "epoch": 4.273787599754451, + "grad_norm": 0.23201976716518402, + "learning_rate": 6.400596485022922e-05, + "loss": 1.7755, + "step": 13924 + }, + { + "epoch": 4.274094536525475, + "grad_norm": 0.23880761861801147, + "learning_rate": 
6.40011932165548e-05, + "loss": 1.778, + "step": 13925 + }, + { + "epoch": 4.274401473296501, + "grad_norm": 0.22305625677108765, + "learning_rate": 6.399642144451239e-05, + "loss": 1.761, + "step": 13926 + }, + { + "epoch": 4.274708410067526, + "grad_norm": 0.21874886751174927, + "learning_rate": 6.399164953414914e-05, + "loss": 1.7148, + "step": 13927 + }, + { + "epoch": 4.2750153468385514, + "grad_norm": 0.2003604918718338, + "learning_rate": 6.398687748551221e-05, + "loss": 1.8049, + "step": 13928 + }, + { + "epoch": 4.275322283609577, + "grad_norm": 0.2443511188030243, + "learning_rate": 6.398210529864875e-05, + "loss": 1.782, + "step": 13929 + }, + { + "epoch": 4.275629220380601, + "grad_norm": 0.2297198623418808, + "learning_rate": 6.397733297360594e-05, + "loss": 1.7682, + "step": 13930 + }, + { + "epoch": 4.275936157151627, + "grad_norm": 0.23474562168121338, + "learning_rate": 6.39725605104309e-05, + "loss": 1.7809, + "step": 13931 + }, + { + "epoch": 4.276243093922652, + "grad_norm": 0.25908544659614563, + "learning_rate": 6.396778790917087e-05, + "loss": 1.7343, + "step": 13932 + }, + { + "epoch": 4.276550030693677, + "grad_norm": 0.2440379112958908, + "learning_rate": 6.396301516987295e-05, + "loss": 1.786, + "step": 13933 + }, + { + "epoch": 4.276856967464703, + "grad_norm": 0.26185858249664307, + "learning_rate": 6.395824229258435e-05, + "loss": 1.7863, + "step": 13934 + }, + { + "epoch": 4.277163904235728, + "grad_norm": 0.24470919370651245, + "learning_rate": 6.39534692773522e-05, + "loss": 1.7774, + "step": 13935 + }, + { + "epoch": 4.277470841006752, + "grad_norm": 0.2612632215023041, + "learning_rate": 6.39486961242237e-05, + "loss": 1.7536, + "step": 13936 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.26870301365852356, + "learning_rate": 6.3943922833246e-05, + "loss": 1.8177, + "step": 13937 + }, + { + "epoch": 4.278084714548803, + "grad_norm": 0.24445784091949463, + "learning_rate": 6.393914940446628e-05, + "loss": 1.7539, + "step": 
13938 + }, + { + "epoch": 4.278391651319828, + "grad_norm": 0.2622319757938385, + "learning_rate": 6.393437583793174e-05, + "loss": 1.8252, + "step": 13939 + }, + { + "epoch": 4.278698588090854, + "grad_norm": 0.2586652636528015, + "learning_rate": 6.39296021336895e-05, + "loss": 1.7975, + "step": 13940 + }, + { + "epoch": 4.279005524861878, + "grad_norm": 0.19488228857517242, + "learning_rate": 6.392482829178678e-05, + "loss": 1.7678, + "step": 13941 + }, + { + "epoch": 4.2793124616329035, + "grad_norm": 0.23956604301929474, + "learning_rate": 6.392005431227074e-05, + "loss": 1.7444, + "step": 13942 + }, + { + "epoch": 4.279619398403929, + "grad_norm": 0.24195842444896698, + "learning_rate": 6.391528019518857e-05, + "loss": 1.8116, + "step": 13943 + }, + { + "epoch": 4.279926335174954, + "grad_norm": 0.21479523181915283, + "learning_rate": 6.391050594058746e-05, + "loss": 1.7351, + "step": 13944 + }, + { + "epoch": 4.2802332719459795, + "grad_norm": 0.2309941202402115, + "learning_rate": 6.390573154851456e-05, + "loss": 1.8245, + "step": 13945 + }, + { + "epoch": 4.280540208717004, + "grad_norm": 0.2375536412000656, + "learning_rate": 6.390095701901706e-05, + "loss": 1.7921, + "step": 13946 + }, + { + "epoch": 4.280847145488029, + "grad_norm": 0.25518664717674255, + "learning_rate": 6.389618235214216e-05, + "loss": 1.7549, + "step": 13947 + }, + { + "epoch": 4.281154082259055, + "grad_norm": 0.2579016089439392, + "learning_rate": 6.389140754793705e-05, + "loss": 1.7637, + "step": 13948 + }, + { + "epoch": 4.28146101903008, + "grad_norm": 0.25350916385650635, + "learning_rate": 6.388663260644892e-05, + "loss": 1.746, + "step": 13949 + }, + { + "epoch": 4.281767955801105, + "grad_norm": 0.2994026839733124, + "learning_rate": 6.388185752772493e-05, + "loss": 1.8196, + "step": 13950 + }, + { + "epoch": 4.28207489257213, + "grad_norm": 0.29938533902168274, + "learning_rate": 6.387708231181229e-05, + "loss": 1.7187, + "step": 13951 + }, + { + "epoch": 4.282381829343155, 
+ "grad_norm": 0.23865137994289398, + "learning_rate": 6.387230695875819e-05, + "loss": 1.7317, + "step": 13952 + }, + { + "epoch": 4.28268876611418, + "grad_norm": 0.23812857270240784, + "learning_rate": 6.386753146860982e-05, + "loss": 1.7536, + "step": 13953 + }, + { + "epoch": 4.282995702885206, + "grad_norm": 0.3395650088787079, + "learning_rate": 6.386275584141438e-05, + "loss": 1.7932, + "step": 13954 + }, + { + "epoch": 4.283302639656231, + "grad_norm": 0.38207507133483887, + "learning_rate": 6.385798007721906e-05, + "loss": 1.8196, + "step": 13955 + }, + { + "epoch": 4.283609576427256, + "grad_norm": 0.32960978150367737, + "learning_rate": 6.385320417607107e-05, + "loss": 1.7898, + "step": 13956 + }, + { + "epoch": 4.283916513198281, + "grad_norm": 0.22978928685188293, + "learning_rate": 6.384842813801757e-05, + "loss": 1.7835, + "step": 13957 + }, + { + "epoch": 4.284223449969306, + "grad_norm": 0.24607588350772858, + "learning_rate": 6.38436519631058e-05, + "loss": 1.7829, + "step": 13958 + }, + { + "epoch": 4.2845303867403315, + "grad_norm": 0.2770270109176636, + "learning_rate": 6.383887565138295e-05, + "loss": 1.7294, + "step": 13959 + }, + { + "epoch": 4.284837323511357, + "grad_norm": 0.27644863724708557, + "learning_rate": 6.383409920289622e-05, + "loss": 1.829, + "step": 13960 + }, + { + "epoch": 4.285144260282382, + "grad_norm": 0.3870919942855835, + "learning_rate": 6.382932261769282e-05, + "loss": 1.8146, + "step": 13961 + }, + { + "epoch": 4.285451197053407, + "grad_norm": 0.3562348186969757, + "learning_rate": 6.382454589581994e-05, + "loss": 1.8225, + "step": 13962 + }, + { + "epoch": 4.285758133824432, + "grad_norm": 0.28444886207580566, + "learning_rate": 6.38197690373248e-05, + "loss": 1.7734, + "step": 13963 + }, + { + "epoch": 4.286065070595457, + "grad_norm": 0.27935758233070374, + "learning_rate": 6.381499204225459e-05, + "loss": 1.7402, + "step": 13964 + }, + { + "epoch": 4.286372007366483, + "grad_norm": 0.34188997745513916, + 
"learning_rate": 6.381021491065653e-05, + "loss": 1.7661, + "step": 13965 + }, + { + "epoch": 4.286678944137508, + "grad_norm": 0.28648918867111206, + "learning_rate": 6.380543764257785e-05, + "loss": 1.8312, + "step": 13966 + }, + { + "epoch": 4.286985880908533, + "grad_norm": 0.2733290493488312, + "learning_rate": 6.380066023806572e-05, + "loss": 1.7505, + "step": 13967 + }, + { + "epoch": 4.287292817679558, + "grad_norm": 0.3344273865222931, + "learning_rate": 6.37958826971674e-05, + "loss": 1.8392, + "step": 13968 + }, + { + "epoch": 4.287599754450583, + "grad_norm": 0.2655799090862274, + "learning_rate": 6.379110501993006e-05, + "loss": 1.7575, + "step": 13969 + }, + { + "epoch": 4.287906691221608, + "grad_norm": 0.2569151818752289, + "learning_rate": 6.378632720640095e-05, + "loss": 1.6619, + "step": 13970 + }, + { + "epoch": 4.288213627992634, + "grad_norm": 0.2477198988199234, + "learning_rate": 6.378154925662727e-05, + "loss": 1.7532, + "step": 13971 + }, + { + "epoch": 4.288520564763659, + "grad_norm": 0.2867630422115326, + "learning_rate": 6.377677117065624e-05, + "loss": 1.7725, + "step": 13972 + }, + { + "epoch": 4.2888275015346835, + "grad_norm": 0.28316137194633484, + "learning_rate": 6.37719929485351e-05, + "loss": 1.7628, + "step": 13973 + }, + { + "epoch": 4.289134438305709, + "grad_norm": 0.2934304475784302, + "learning_rate": 6.376721459031106e-05, + "loss": 1.7346, + "step": 13974 + }, + { + "epoch": 4.289441375076734, + "grad_norm": 0.22847147285938263, + "learning_rate": 6.376243609603129e-05, + "loss": 1.7409, + "step": 13975 + }, + { + "epoch": 4.2897483118477595, + "grad_norm": 0.360441118478775, + "learning_rate": 6.375765746574311e-05, + "loss": 1.808, + "step": 13976 + }, + { + "epoch": 4.290055248618785, + "grad_norm": 0.2750907242298126, + "learning_rate": 6.375287869949367e-05, + "loss": 1.8046, + "step": 13977 + }, + { + "epoch": 4.290362185389809, + "grad_norm": 0.26193201541900635, + "learning_rate": 6.374809979733022e-05, + 
"loss": 1.7097, + "step": 13978 + }, + { + "epoch": 4.290669122160835, + "grad_norm": 0.3282175064086914, + "learning_rate": 6.37433207593e-05, + "loss": 1.7924, + "step": 13979 + }, + { + "epoch": 4.29097605893186, + "grad_norm": 0.2845167815685272, + "learning_rate": 6.373854158545021e-05, + "loss": 1.7663, + "step": 13980 + }, + { + "epoch": 4.291282995702885, + "grad_norm": 0.21816621720790863, + "learning_rate": 6.37337622758281e-05, + "loss": 1.7368, + "step": 13981 + }, + { + "epoch": 4.291589932473911, + "grad_norm": 0.264272540807724, + "learning_rate": 6.372898283048094e-05, + "loss": 1.7377, + "step": 13982 + }, + { + "epoch": 4.291896869244935, + "grad_norm": 0.2182006686925888, + "learning_rate": 6.37242032494559e-05, + "loss": 1.8107, + "step": 13983 + }, + { + "epoch": 4.29220380601596, + "grad_norm": 0.26856422424316406, + "learning_rate": 6.371942353280023e-05, + "loss": 1.7708, + "step": 13984 + }, + { + "epoch": 4.292510742786986, + "grad_norm": 0.3025323748588562, + "learning_rate": 6.37146436805612e-05, + "loss": 1.7768, + "step": 13985 + }, + { + "epoch": 4.292817679558011, + "grad_norm": 0.2949144244194031, + "learning_rate": 6.3709863692786e-05, + "loss": 1.7848, + "step": 13986 + }, + { + "epoch": 4.293124616329036, + "grad_norm": 0.20670418441295624, + "learning_rate": 6.370508356952188e-05, + "loss": 1.7367, + "step": 13987 + }, + { + "epoch": 4.293431553100062, + "grad_norm": 0.2453860342502594, + "learning_rate": 6.370030331081611e-05, + "loss": 1.7246, + "step": 13988 + }, + { + "epoch": 4.293738489871086, + "grad_norm": 0.3413507044315338, + "learning_rate": 6.369552291671592e-05, + "loss": 1.7829, + "step": 13989 + }, + { + "epoch": 4.2940454266421115, + "grad_norm": 0.28352782130241394, + "learning_rate": 6.369074238726856e-05, + "loss": 1.7755, + "step": 13990 + }, + { + "epoch": 4.294352363413137, + "grad_norm": 0.21408751606941223, + "learning_rate": 6.368596172252124e-05, + "loss": 1.7292, + "step": 13991 + }, + { + "epoch": 
4.294659300184162, + "grad_norm": 0.28372085094451904, + "learning_rate": 6.36811809225212e-05, + "loss": 1.8197, + "step": 13992 + }, + { + "epoch": 4.2949662369551875, + "grad_norm": 0.2400829792022705, + "learning_rate": 6.367639998731573e-05, + "loss": 1.7559, + "step": 13993 + }, + { + "epoch": 4.295273173726212, + "grad_norm": 0.22853593528270721, + "learning_rate": 6.367161891695207e-05, + "loss": 1.8116, + "step": 13994 + }, + { + "epoch": 4.295580110497237, + "grad_norm": 0.22098208963871002, + "learning_rate": 6.366683771147745e-05, + "loss": 1.7269, + "step": 13995 + }, + { + "epoch": 4.295887047268263, + "grad_norm": 0.22293934226036072, + "learning_rate": 6.366205637093914e-05, + "loss": 1.7944, + "step": 13996 + }, + { + "epoch": 4.296193984039288, + "grad_norm": 0.26120004057884216, + "learning_rate": 6.365727489538437e-05, + "loss": 1.7581, + "step": 13997 + }, + { + "epoch": 4.296500920810313, + "grad_norm": 0.2568937838077545, + "learning_rate": 6.365249328486041e-05, + "loss": 1.7356, + "step": 13998 + }, + { + "epoch": 4.296807857581339, + "grad_norm": 0.2419043630361557, + "learning_rate": 6.364771153941449e-05, + "loss": 1.8127, + "step": 13999 + }, + { + "epoch": 4.297114794352363, + "grad_norm": 0.2521972060203552, + "learning_rate": 6.364292965909391e-05, + "loss": 1.7445, + "step": 14000 + }, + { + "epoch": 4.297421731123388, + "grad_norm": 0.3269292414188385, + "learning_rate": 6.363814764394589e-05, + "loss": 1.7835, + "step": 14001 + }, + { + "epoch": 4.297728667894414, + "grad_norm": 0.258405864238739, + "learning_rate": 6.36333654940177e-05, + "loss": 1.7407, + "step": 14002 + }, + { + "epoch": 4.298035604665439, + "grad_norm": 0.21527236700057983, + "learning_rate": 6.362858320935662e-05, + "loss": 1.7729, + "step": 14003 + }, + { + "epoch": 4.298342541436464, + "grad_norm": 0.25343602895736694, + "learning_rate": 6.362380079000988e-05, + "loss": 1.8087, + "step": 14004 + }, + { + "epoch": 4.298649478207489, + "grad_norm": 
0.26110637187957764, + "learning_rate": 6.361901823602474e-05, + "loss": 1.813, + "step": 14005 + }, + { + "epoch": 4.298956414978514, + "grad_norm": 0.26749926805496216, + "learning_rate": 6.361423554744851e-05, + "loss": 1.8193, + "step": 14006 + }, + { + "epoch": 4.2992633517495396, + "grad_norm": 0.22357676923274994, + "learning_rate": 6.360945272432841e-05, + "loss": 1.7498, + "step": 14007 + }, + { + "epoch": 4.299570288520565, + "grad_norm": 0.2367832362651825, + "learning_rate": 6.360466976671172e-05, + "loss": 1.7843, + "step": 14008 + }, + { + "epoch": 4.29987722529159, + "grad_norm": 0.23594366014003754, + "learning_rate": 6.35998866746457e-05, + "loss": 1.7442, + "step": 14009 + }, + { + "epoch": 4.300184162062616, + "grad_norm": 0.2660543918609619, + "learning_rate": 6.359510344817765e-05, + "loss": 1.7557, + "step": 14010 + }, + { + "epoch": 4.30049109883364, + "grad_norm": 0.191593199968338, + "learning_rate": 6.359032008735481e-05, + "loss": 1.7988, + "step": 14011 + }, + { + "epoch": 4.300798035604665, + "grad_norm": 0.2755490243434906, + "learning_rate": 6.358553659222447e-05, + "loss": 1.7551, + "step": 14012 + }, + { + "epoch": 4.301104972375691, + "grad_norm": 0.2900530993938446, + "learning_rate": 6.358075296283387e-05, + "loss": 1.7523, + "step": 14013 + }, + { + "epoch": 4.301411909146716, + "grad_norm": 0.22242774069309235, + "learning_rate": 6.357596919923033e-05, + "loss": 1.7626, + "step": 14014 + }, + { + "epoch": 4.301718845917741, + "grad_norm": 0.26636210083961487, + "learning_rate": 6.357118530146108e-05, + "loss": 1.7855, + "step": 14015 + }, + { + "epoch": 4.302025782688766, + "grad_norm": 0.3055269718170166, + "learning_rate": 6.356640126957344e-05, + "loss": 1.7528, + "step": 14016 + }, + { + "epoch": 4.302332719459791, + "grad_norm": 0.29695719480514526, + "learning_rate": 6.356161710361468e-05, + "loss": 1.7482, + "step": 14017 + }, + { + "epoch": 4.3026396562308165, + "grad_norm": 0.2369711697101593, + "learning_rate": 
6.355683280363207e-05, + "loss": 1.7635, + "step": 14018 + }, + { + "epoch": 4.302946593001842, + "grad_norm": 0.26681363582611084, + "learning_rate": 6.35520483696729e-05, + "loss": 1.8814, + "step": 14019 + }, + { + "epoch": 4.303253529772867, + "grad_norm": 0.2623308598995209, + "learning_rate": 6.354726380178442e-05, + "loss": 1.8645, + "step": 14020 + }, + { + "epoch": 4.303560466543892, + "grad_norm": 0.23326413333415985, + "learning_rate": 6.354247910001394e-05, + "loss": 1.8093, + "step": 14021 + }, + { + "epoch": 4.303867403314917, + "grad_norm": 0.3037295639514923, + "learning_rate": 6.353769426440875e-05, + "loss": 1.8556, + "step": 14022 + }, + { + "epoch": 4.304174340085942, + "grad_norm": 0.23624882102012634, + "learning_rate": 6.353290929501616e-05, + "loss": 1.803, + "step": 14023 + }, + { + "epoch": 4.304481276856968, + "grad_norm": 0.22106927633285522, + "learning_rate": 6.35281241918834e-05, + "loss": 1.7133, + "step": 14024 + }, + { + "epoch": 4.304788213627993, + "grad_norm": 0.2374040186405182, + "learning_rate": 6.352333895505778e-05, + "loss": 1.8127, + "step": 14025 + }, + { + "epoch": 4.305095150399017, + "grad_norm": 0.2782450318336487, + "learning_rate": 6.35185535845866e-05, + "loss": 1.8613, + "step": 14026 + }, + { + "epoch": 4.305402087170043, + "grad_norm": 0.2527763843536377, + "learning_rate": 6.351376808051717e-05, + "loss": 1.7533, + "step": 14027 + }, + { + "epoch": 4.305709023941068, + "grad_norm": 0.2462318390607834, + "learning_rate": 6.350898244289675e-05, + "loss": 1.8075, + "step": 14028 + }, + { + "epoch": 4.306015960712093, + "grad_norm": 0.2646189332008362, + "learning_rate": 6.350419667177265e-05, + "loss": 1.8261, + "step": 14029 + }, + { + "epoch": 4.306322897483119, + "grad_norm": 0.24918611347675323, + "learning_rate": 6.349941076719218e-05, + "loss": 1.7542, + "step": 14030 + }, + { + "epoch": 4.306629834254144, + "grad_norm": 0.22440841794013977, + "learning_rate": 6.349462472920259e-05, + "loss": 1.7897, + 
"step": 14031 + }, + { + "epoch": 4.3069367710251685, + "grad_norm": 0.28614330291748047, + "learning_rate": 6.348983855785121e-05, + "loss": 1.88, + "step": 14032 + }, + { + "epoch": 4.307243707796194, + "grad_norm": 0.25015848875045776, + "learning_rate": 6.348505225318535e-05, + "loss": 1.8008, + "step": 14033 + }, + { + "epoch": 4.307550644567219, + "grad_norm": 0.2468707263469696, + "learning_rate": 6.34802658152523e-05, + "loss": 1.8025, + "step": 14034 + }, + { + "epoch": 4.3078575813382445, + "grad_norm": 0.30504748225212097, + "learning_rate": 6.347547924409937e-05, + "loss": 1.8765, + "step": 14035 + }, + { + "epoch": 4.30816451810927, + "grad_norm": 0.35419392585754395, + "learning_rate": 6.347069253977385e-05, + "loss": 1.7807, + "step": 14036 + }, + { + "epoch": 4.308471454880294, + "grad_norm": 0.33683931827545166, + "learning_rate": 6.346590570232305e-05, + "loss": 1.7244, + "step": 14037 + }, + { + "epoch": 4.30877839165132, + "grad_norm": 0.3339467942714691, + "learning_rate": 6.346111873179427e-05, + "loss": 1.7642, + "step": 14038 + }, + { + "epoch": 4.309085328422345, + "grad_norm": 0.2369392216205597, + "learning_rate": 6.345633162823484e-05, + "loss": 1.7127, + "step": 14039 + }, + { + "epoch": 4.30939226519337, + "grad_norm": 0.26469686627388, + "learning_rate": 6.345154439169206e-05, + "loss": 1.7235, + "step": 14040 + }, + { + "epoch": 4.309699201964396, + "grad_norm": 0.2737344205379486, + "learning_rate": 6.344675702221321e-05, + "loss": 1.783, + "step": 14041 + }, + { + "epoch": 4.310006138735421, + "grad_norm": 0.2381773442029953, + "learning_rate": 6.344196951984565e-05, + "loss": 1.7172, + "step": 14042 + }, + { + "epoch": 4.310313075506445, + "grad_norm": 0.28199076652526855, + "learning_rate": 6.343718188463663e-05, + "loss": 1.8315, + "step": 14043 + }, + { + "epoch": 4.310620012277471, + "grad_norm": 0.24378590285778046, + "learning_rate": 6.343239411663353e-05, + "loss": 1.7828, + "step": 14044 + }, + { + "epoch": 
4.310926949048496, + "grad_norm": 0.26343944668769836, + "learning_rate": 6.342760621588365e-05, + "loss": 1.7679, + "step": 14045 + }, + { + "epoch": 4.311233885819521, + "grad_norm": 0.23703521490097046, + "learning_rate": 6.342281818243427e-05, + "loss": 1.7885, + "step": 14046 + }, + { + "epoch": 4.311540822590547, + "grad_norm": 0.2230173498392105, + "learning_rate": 6.341803001633276e-05, + "loss": 1.767, + "step": 14047 + }, + { + "epoch": 4.311847759361571, + "grad_norm": 0.249002143740654, + "learning_rate": 6.34132417176264e-05, + "loss": 1.8032, + "step": 14048 + }, + { + "epoch": 4.3121546961325965, + "grad_norm": 0.2383791208267212, + "learning_rate": 6.34084532863625e-05, + "loss": 1.7558, + "step": 14049 + }, + { + "epoch": 4.312461632903622, + "grad_norm": 0.2783047556877136, + "learning_rate": 6.340366472258843e-05, + "loss": 1.8389, + "step": 14050 + }, + { + "epoch": 4.312768569674647, + "grad_norm": 0.2654891312122345, + "learning_rate": 6.339887602635148e-05, + "loss": 1.7989, + "step": 14051 + }, + { + "epoch": 4.3130755064456725, + "grad_norm": 0.2638411521911621, + "learning_rate": 6.3394087197699e-05, + "loss": 1.8707, + "step": 14052 + }, + { + "epoch": 4.313382443216697, + "grad_norm": 0.3026179075241089, + "learning_rate": 6.338929823667829e-05, + "loss": 1.7892, + "step": 14053 + }, + { + "epoch": 4.313689379987722, + "grad_norm": 0.27496880292892456, + "learning_rate": 6.338450914333668e-05, + "loss": 1.7398, + "step": 14054 + }, + { + "epoch": 4.313996316758748, + "grad_norm": 0.2601073086261749, + "learning_rate": 6.337971991772151e-05, + "loss": 1.7646, + "step": 14055 + }, + { + "epoch": 4.314303253529773, + "grad_norm": 0.2061719298362732, + "learning_rate": 6.337493055988011e-05, + "loss": 1.7372, + "step": 14056 + }, + { + "epoch": 4.314610190300798, + "grad_norm": 0.23722340166568756, + "learning_rate": 6.337014106985981e-05, + "loss": 1.7457, + "step": 14057 + }, + { + "epoch": 4.314917127071823, + "grad_norm": 
0.2729428708553314, + "learning_rate": 6.336535144770793e-05, + "loss": 1.8423, + "step": 14058 + }, + { + "epoch": 4.315224063842848, + "grad_norm": 0.23520450294017792, + "learning_rate": 6.336056169347182e-05, + "loss": 1.8124, + "step": 14059 + }, + { + "epoch": 4.315531000613873, + "grad_norm": 0.25142738223075867, + "learning_rate": 6.33557718071988e-05, + "loss": 1.7285, + "step": 14060 + }, + { + "epoch": 4.315837937384899, + "grad_norm": 0.24833035469055176, + "learning_rate": 6.335098178893621e-05, + "loss": 1.766, + "step": 14061 + }, + { + "epoch": 4.316144874155924, + "grad_norm": 0.2406177669763565, + "learning_rate": 6.334619163873141e-05, + "loss": 1.8824, + "step": 14062 + }, + { + "epoch": 4.316451810926949, + "grad_norm": 0.23077574372291565, + "learning_rate": 6.334140135663172e-05, + "loss": 1.7589, + "step": 14063 + }, + { + "epoch": 4.316758747697974, + "grad_norm": 0.20476560294628143, + "learning_rate": 6.333661094268448e-05, + "loss": 1.7331, + "step": 14064 + }, + { + "epoch": 4.317065684468999, + "grad_norm": 0.207991823554039, + "learning_rate": 6.333182039693704e-05, + "loss": 1.6876, + "step": 14065 + }, + { + "epoch": 4.3173726212400245, + "grad_norm": 0.20813052356243134, + "learning_rate": 6.332702971943671e-05, + "loss": 1.775, + "step": 14066 + }, + { + "epoch": 4.31767955801105, + "grad_norm": 0.2470991462469101, + "learning_rate": 6.332223891023087e-05, + "loss": 1.7673, + "step": 14067 + }, + { + "epoch": 4.317986494782075, + "grad_norm": 0.23855723440647125, + "learning_rate": 6.331744796936687e-05, + "loss": 1.7842, + "step": 14068 + }, + { + "epoch": 4.3182934315531, + "grad_norm": 0.21852652728557587, + "learning_rate": 6.331265689689204e-05, + "loss": 1.7727, + "step": 14069 + }, + { + "epoch": 4.318600368324125, + "grad_norm": 0.284496545791626, + "learning_rate": 6.330786569285374e-05, + "loss": 1.8248, + "step": 14070 + }, + { + "epoch": 4.31890730509515, + "grad_norm": 0.21709981560707092, + "learning_rate": 
6.33030743572993e-05, + "loss": 1.7547, + "step": 14071 + }, + { + "epoch": 4.319214241866176, + "grad_norm": 0.24209457635879517, + "learning_rate": 6.329828289027608e-05, + "loss": 1.7695, + "step": 14072 + }, + { + "epoch": 4.319521178637201, + "grad_norm": 0.24869373440742493, + "learning_rate": 6.329349129183144e-05, + "loss": 1.8204, + "step": 14073 + }, + { + "epoch": 4.319828115408226, + "grad_norm": 0.21702703833580017, + "learning_rate": 6.328869956201274e-05, + "loss": 1.779, + "step": 14074 + }, + { + "epoch": 4.320135052179251, + "grad_norm": 0.22993850708007812, + "learning_rate": 6.328390770086731e-05, + "loss": 1.7935, + "step": 14075 + }, + { + "epoch": 4.320441988950276, + "grad_norm": 0.23491734266281128, + "learning_rate": 6.327911570844252e-05, + "loss": 1.7261, + "step": 14076 + }, + { + "epoch": 4.320748925721301, + "grad_norm": 0.2479303777217865, + "learning_rate": 6.327432358478571e-05, + "loss": 1.7683, + "step": 14077 + }, + { + "epoch": 4.321055862492327, + "grad_norm": 0.24261580407619476, + "learning_rate": 6.326953132994427e-05, + "loss": 1.7147, + "step": 14078 + }, + { + "epoch": 4.321362799263352, + "grad_norm": 0.24627646803855896, + "learning_rate": 6.326473894396553e-05, + "loss": 1.7976, + "step": 14079 + }, + { + "epoch": 4.3216697360343765, + "grad_norm": 0.269149512052536, + "learning_rate": 6.325994642689688e-05, + "loss": 1.7247, + "step": 14080 + }, + { + "epoch": 4.321976672805402, + "grad_norm": 0.4162158966064453, + "learning_rate": 6.325515377878566e-05, + "loss": 1.7485, + "step": 14081 + }, + { + "epoch": 4.322283609576427, + "grad_norm": 0.366459459066391, + "learning_rate": 6.325036099967925e-05, + "loss": 1.7286, + "step": 14082 + }, + { + "epoch": 4.3225905463474525, + "grad_norm": 0.2465270757675171, + "learning_rate": 6.324556808962499e-05, + "loss": 1.8097, + "step": 14083 + }, + { + "epoch": 4.322897483118478, + "grad_norm": 0.2911076843738556, + "learning_rate": 6.324077504867026e-05, + "loss": 1.7979, + 
"step": 14084 + }, + { + "epoch": 4.323204419889503, + "grad_norm": 0.33455169200897217, + "learning_rate": 6.323598187686245e-05, + "loss": 1.7988, + "step": 14085 + }, + { + "epoch": 4.323511356660528, + "grad_norm": 0.25020337104797363, + "learning_rate": 6.32311885742489e-05, + "loss": 1.7184, + "step": 14086 + }, + { + "epoch": 4.323818293431553, + "grad_norm": 0.23941513895988464, + "learning_rate": 6.322639514087699e-05, + "loss": 1.7672, + "step": 14087 + }, + { + "epoch": 4.324125230202578, + "grad_norm": 0.35258981585502625, + "learning_rate": 6.32216015767941e-05, + "loss": 1.7571, + "step": 14088 + }, + { + "epoch": 4.324432166973604, + "grad_norm": 0.2854993939399719, + "learning_rate": 6.321680788204758e-05, + "loss": 1.8096, + "step": 14089 + }, + { + "epoch": 4.324739103744629, + "grad_norm": 0.24422863125801086, + "learning_rate": 6.321201405668482e-05, + "loss": 1.778, + "step": 14090 + }, + { + "epoch": 4.3250460405156534, + "grad_norm": 0.36629122495651245, + "learning_rate": 6.320722010075321e-05, + "loss": 1.716, + "step": 14091 + }, + { + "epoch": 4.325352977286679, + "grad_norm": 0.37115517258644104, + "learning_rate": 6.32024260143001e-05, + "loss": 1.77, + "step": 14092 + }, + { + "epoch": 4.325659914057704, + "grad_norm": 0.21540327370166779, + "learning_rate": 6.319763179737288e-05, + "loss": 1.7529, + "step": 14093 + }, + { + "epoch": 4.3259668508287294, + "grad_norm": 0.2573898732662201, + "learning_rate": 6.319283745001892e-05, + "loss": 1.8101, + "step": 14094 + }, + { + "epoch": 4.326273787599755, + "grad_norm": 0.29481247067451477, + "learning_rate": 6.31880429722856e-05, + "loss": 1.7459, + "step": 14095 + }, + { + "epoch": 4.326580724370779, + "grad_norm": 0.23474647104740143, + "learning_rate": 6.318324836422031e-05, + "loss": 1.786, + "step": 14096 + }, + { + "epoch": 4.326887661141805, + "grad_norm": 0.2884673476219177, + "learning_rate": 6.317845362587045e-05, + "loss": 1.8123, + "step": 14097 + }, + { + "epoch": 
4.32719459791283, + "grad_norm": 0.39008447527885437, + "learning_rate": 6.317365875728338e-05, + "loss": 1.7729, + "step": 14098 + }, + { + "epoch": 4.327501534683855, + "grad_norm": 0.30568063259124756, + "learning_rate": 6.316886375850651e-05, + "loss": 1.7088, + "step": 14099 + }, + { + "epoch": 4.327808471454881, + "grad_norm": 0.2538018524646759, + "learning_rate": 6.316406862958718e-05, + "loss": 1.8028, + "step": 14100 + }, + { + "epoch": 4.328115408225905, + "grad_norm": 0.3815068006515503, + "learning_rate": 6.315927337057281e-05, + "loss": 1.7143, + "step": 14101 + }, + { + "epoch": 4.32842234499693, + "grad_norm": 0.3813243508338928, + "learning_rate": 6.31544779815108e-05, + "loss": 1.7072, + "step": 14102 + }, + { + "epoch": 4.328729281767956, + "grad_norm": 0.22438868880271912, + "learning_rate": 6.314968246244852e-05, + "loss": 1.7445, + "step": 14103 + }, + { + "epoch": 4.329036218538981, + "grad_norm": 0.3818886876106262, + "learning_rate": 6.314488681343337e-05, + "loss": 1.8292, + "step": 14104 + }, + { + "epoch": 4.329343155310006, + "grad_norm": 0.4376567006111145, + "learning_rate": 6.314009103451277e-05, + "loss": 1.8224, + "step": 14105 + }, + { + "epoch": 4.329650092081032, + "grad_norm": 0.2741515636444092, + "learning_rate": 6.313529512573406e-05, + "loss": 1.8078, + "step": 14106 + }, + { + "epoch": 4.329957028852056, + "grad_norm": 0.264343798160553, + "learning_rate": 6.313049908714467e-05, + "loss": 1.7314, + "step": 14107 + }, + { + "epoch": 4.3302639656230815, + "grad_norm": 0.3601943552494049, + "learning_rate": 6.312570291879201e-05, + "loss": 1.7351, + "step": 14108 + }, + { + "epoch": 4.330570902394107, + "grad_norm": 0.2931751012802124, + "learning_rate": 6.312090662072345e-05, + "loss": 1.8117, + "step": 14109 + }, + { + "epoch": 4.330877839165132, + "grad_norm": 0.27670225501060486, + "learning_rate": 6.31161101929864e-05, + "loss": 1.7707, + "step": 14110 + }, + { + "epoch": 4.3311847759361575, + "grad_norm": 
0.33669596910476685, + "learning_rate": 6.311131363562825e-05, + "loss": 1.7337, + "step": 14111 + }, + { + "epoch": 4.331491712707182, + "grad_norm": 0.232634037733078, + "learning_rate": 6.310651694869643e-05, + "loss": 1.7372, + "step": 14112 + }, + { + "epoch": 4.331798649478207, + "grad_norm": 0.28611311316490173, + "learning_rate": 6.310172013223832e-05, + "loss": 1.6977, + "step": 14113 + }, + { + "epoch": 4.332105586249233, + "grad_norm": 0.30207201838493347, + "learning_rate": 6.309692318630132e-05, + "loss": 1.7765, + "step": 14114 + }, + { + "epoch": 4.332412523020258, + "grad_norm": 0.20757484436035156, + "learning_rate": 6.309212611093287e-05, + "loss": 1.697, + "step": 14115 + }, + { + "epoch": 4.332719459791283, + "grad_norm": 0.31472963094711304, + "learning_rate": 6.308732890618034e-05, + "loss": 1.7757, + "step": 14116 + }, + { + "epoch": 4.333026396562309, + "grad_norm": 0.37042325735092163, + "learning_rate": 6.308253157209117e-05, + "loss": 1.7745, + "step": 14117 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.25001442432403564, + "learning_rate": 6.307773410871275e-05, + "loss": 1.7461, + "step": 14118 + }, + { + "epoch": 4.333640270104358, + "grad_norm": 0.2691943347454071, + "learning_rate": 6.307293651609248e-05, + "loss": 1.7539, + "step": 14119 + }, + { + "epoch": 4.333947206875384, + "grad_norm": 0.30845868587493896, + "learning_rate": 6.306813879427782e-05, + "loss": 1.7559, + "step": 14120 + }, + { + "epoch": 4.334254143646409, + "grad_norm": 0.2244730293750763, + "learning_rate": 6.306334094331613e-05, + "loss": 1.7609, + "step": 14121 + }, + { + "epoch": 4.334561080417434, + "grad_norm": 0.32132062315940857, + "learning_rate": 6.305854296325485e-05, + "loss": 1.7837, + "step": 14122 + }, + { + "epoch": 4.334868017188459, + "grad_norm": 0.3762948513031006, + "learning_rate": 6.30537448541414e-05, + "loss": 1.7631, + "step": 14123 + }, + { + "epoch": 4.335174953959484, + "grad_norm": 0.24174273014068604, + "learning_rate": 
6.30489466160232e-05, + "loss": 1.7532, + "step": 14124 + }, + { + "epoch": 4.3354818907305095, + "grad_norm": 0.23468497395515442, + "learning_rate": 6.304414824894765e-05, + "loss": 1.7731, + "step": 14125 + }, + { + "epoch": 4.335788827501535, + "grad_norm": 0.29086077213287354, + "learning_rate": 6.303934975296218e-05, + "loss": 1.7668, + "step": 14126 + }, + { + "epoch": 4.33609576427256, + "grad_norm": 0.2889879643917084, + "learning_rate": 6.303455112811422e-05, + "loss": 1.8188, + "step": 14127 + }, + { + "epoch": 4.336402701043585, + "grad_norm": 0.2335619181394577, + "learning_rate": 6.302975237445119e-05, + "loss": 1.7944, + "step": 14128 + }, + { + "epoch": 4.33670963781461, + "grad_norm": 0.29027310013771057, + "learning_rate": 6.302495349202051e-05, + "loss": 1.7771, + "step": 14129 + }, + { + "epoch": 4.337016574585635, + "grad_norm": 0.31961241364479065, + "learning_rate": 6.302015448086959e-05, + "loss": 1.8187, + "step": 14130 + }, + { + "epoch": 4.337323511356661, + "grad_norm": 0.26015788316726685, + "learning_rate": 6.301535534104587e-05, + "loss": 1.7819, + "step": 14131 + }, + { + "epoch": 4.337630448127686, + "grad_norm": 0.2440631091594696, + "learning_rate": 6.30105560725968e-05, + "loss": 1.7127, + "step": 14132 + }, + { + "epoch": 4.337937384898711, + "grad_norm": 0.304441899061203, + "learning_rate": 6.300575667556979e-05, + "loss": 1.7619, + "step": 14133 + }, + { + "epoch": 4.338244321669736, + "grad_norm": 0.3085228204727173, + "learning_rate": 6.300095715001226e-05, + "loss": 1.8287, + "step": 14134 + }, + { + "epoch": 4.338551258440761, + "grad_norm": 0.2863372564315796, + "learning_rate": 6.299615749597165e-05, + "loss": 1.8068, + "step": 14135 + }, + { + "epoch": 4.338858195211786, + "grad_norm": 0.25255265831947327, + "learning_rate": 6.299135771349537e-05, + "loss": 1.7506, + "step": 14136 + }, + { + "epoch": 4.339165131982812, + "grad_norm": 0.30224961042404175, + "learning_rate": 6.298655780263092e-05, + "loss": 1.7292, + 
"step": 14137 + }, + { + "epoch": 4.339472068753837, + "grad_norm": 0.24222104251384735, + "learning_rate": 6.298175776342567e-05, + "loss": 1.7616, + "step": 14138 + }, + { + "epoch": 4.3397790055248615, + "grad_norm": 0.3236368000507355, + "learning_rate": 6.29769575959271e-05, + "loss": 1.787, + "step": 14139 + }, + { + "epoch": 4.340085942295887, + "grad_norm": 0.26049408316612244, + "learning_rate": 6.297215730018261e-05, + "loss": 1.7108, + "step": 14140 + }, + { + "epoch": 4.340392879066912, + "grad_norm": 0.22833532094955444, + "learning_rate": 6.296735687623967e-05, + "loss": 1.7661, + "step": 14141 + }, + { + "epoch": 4.3406998158379375, + "grad_norm": 0.28397905826568604, + "learning_rate": 6.296255632414571e-05, + "loss": 1.7163, + "step": 14142 + }, + { + "epoch": 4.341006752608963, + "grad_norm": 0.3072611093521118, + "learning_rate": 6.295775564394817e-05, + "loss": 1.857, + "step": 14143 + }, + { + "epoch": 4.341313689379987, + "grad_norm": 0.22901058197021484, + "learning_rate": 6.295295483569448e-05, + "loss": 1.7325, + "step": 14144 + }, + { + "epoch": 4.341620626151013, + "grad_norm": 0.27433091402053833, + "learning_rate": 6.294815389943212e-05, + "loss": 1.8229, + "step": 14145 + }, + { + "epoch": 4.341927562922038, + "grad_norm": 0.2635616958141327, + "learning_rate": 6.29433528352085e-05, + "loss": 1.7585, + "step": 14146 + }, + { + "epoch": 4.342234499693063, + "grad_norm": 0.29129260778427124, + "learning_rate": 6.293855164307108e-05, + "loss": 1.8294, + "step": 14147 + }, + { + "epoch": 4.342541436464089, + "grad_norm": 0.3429001569747925, + "learning_rate": 6.293375032306731e-05, + "loss": 1.7725, + "step": 14148 + }, + { + "epoch": 4.342848373235114, + "grad_norm": 0.22407259047031403, + "learning_rate": 6.292894887524464e-05, + "loss": 1.7018, + "step": 14149 + }, + { + "epoch": 4.343155310006138, + "grad_norm": 0.3319321274757385, + "learning_rate": 6.292414729965053e-05, + "loss": 1.8472, + "step": 14150 + }, + { + "epoch": 
4.343462246777164, + "grad_norm": 0.42744341492652893, + "learning_rate": 6.291934559633241e-05, + "loss": 1.8118, + "step": 14151 + }, + { + "epoch": 4.343769183548189, + "grad_norm": 0.24572840332984924, + "learning_rate": 6.291454376533774e-05, + "loss": 1.7184, + "step": 14152 + }, + { + "epoch": 4.344076120319214, + "grad_norm": 0.2485980987548828, + "learning_rate": 6.290974180671397e-05, + "loss": 1.7649, + "step": 14153 + }, + { + "epoch": 4.34438305709024, + "grad_norm": 0.3911706209182739, + "learning_rate": 6.29049397205086e-05, + "loss": 1.8105, + "step": 14154 + }, + { + "epoch": 4.344689993861264, + "grad_norm": 0.3008342981338501, + "learning_rate": 6.290013750676902e-05, + "loss": 1.7671, + "step": 14155 + }, + { + "epoch": 4.3449969306322895, + "grad_norm": 0.2072051614522934, + "learning_rate": 6.289533516554274e-05, + "loss": 1.7406, + "step": 14156 + }, + { + "epoch": 4.345303867403315, + "grad_norm": 0.3047312796115875, + "learning_rate": 6.289053269687719e-05, + "loss": 1.8133, + "step": 14157 + }, + { + "epoch": 4.34561080417434, + "grad_norm": 0.28260552883148193, + "learning_rate": 6.288573010081984e-05, + "loss": 1.7253, + "step": 14158 + }, + { + "epoch": 4.3459177409453655, + "grad_norm": 0.2474137246608734, + "learning_rate": 6.288092737741815e-05, + "loss": 1.822, + "step": 14159 + }, + { + "epoch": 4.346224677716391, + "grad_norm": 0.23717878758907318, + "learning_rate": 6.287612452671961e-05, + "loss": 1.7826, + "step": 14160 + }, + { + "epoch": 4.346531614487415, + "grad_norm": 0.2646107077598572, + "learning_rate": 6.287132154877163e-05, + "loss": 1.8118, + "step": 14161 + }, + { + "epoch": 4.346838551258441, + "grad_norm": 0.22026480734348297, + "learning_rate": 6.286651844362172e-05, + "loss": 1.7767, + "step": 14162 + }, + { + "epoch": 4.347145488029466, + "grad_norm": 0.2692350447177887, + "learning_rate": 6.286171521131733e-05, + "loss": 1.8718, + "step": 14163 + }, + { + "epoch": 4.347452424800491, + "grad_norm": 
0.2749998867511749, + "learning_rate": 6.285691185190592e-05, + "loss": 1.7689, + "step": 14164 + }, + { + "epoch": 4.347759361571517, + "grad_norm": 0.24552448093891144, + "learning_rate": 6.2852108365435e-05, + "loss": 1.8049, + "step": 14165 + }, + { + "epoch": 4.348066298342541, + "grad_norm": 0.20530807971954346, + "learning_rate": 6.2847304751952e-05, + "loss": 1.7606, + "step": 14166 + }, + { + "epoch": 4.348373235113566, + "grad_norm": 0.23396088182926178, + "learning_rate": 6.28425010115044e-05, + "loss": 1.7482, + "step": 14167 + }, + { + "epoch": 4.348680171884592, + "grad_norm": 0.20512452721595764, + "learning_rate": 6.283769714413968e-05, + "loss": 1.6976, + "step": 14168 + }, + { + "epoch": 4.348987108655617, + "grad_norm": 0.20287172496318817, + "learning_rate": 6.283289314990531e-05, + "loss": 1.7439, + "step": 14169 + }, + { + "epoch": 4.349294045426642, + "grad_norm": 0.2193746268749237, + "learning_rate": 6.282808902884876e-05, + "loss": 1.763, + "step": 14170 + }, + { + "epoch": 4.349600982197667, + "grad_norm": 0.20415273308753967, + "learning_rate": 6.282328478101753e-05, + "loss": 1.7025, + "step": 14171 + }, + { + "epoch": 4.349907918968692, + "grad_norm": 0.19286803901195526, + "learning_rate": 6.281848040645907e-05, + "loss": 1.7529, + "step": 14172 + }, + { + "epoch": 4.350214855739718, + "grad_norm": 0.20908218622207642, + "learning_rate": 6.281367590522088e-05, + "loss": 1.7896, + "step": 14173 + }, + { + "epoch": 4.350521792510743, + "grad_norm": 0.2599989175796509, + "learning_rate": 6.280887127735045e-05, + "loss": 1.764, + "step": 14174 + }, + { + "epoch": 4.350828729281768, + "grad_norm": 0.23955710232257843, + "learning_rate": 6.280406652289523e-05, + "loss": 1.7321, + "step": 14175 + }, + { + "epoch": 4.351135666052793, + "grad_norm": 0.2311990112066269, + "learning_rate": 6.279926164190272e-05, + "loss": 1.7338, + "step": 14176 + }, + { + "epoch": 4.351442602823818, + "grad_norm": 0.2599658966064453, + "learning_rate": 
6.27944566344204e-05, + "loss": 1.7444, + "step": 14177 + }, + { + "epoch": 4.351749539594843, + "grad_norm": 0.23079386353492737, + "learning_rate": 6.278965150049579e-05, + "loss": 1.7011, + "step": 14178 + }, + { + "epoch": 4.352056476365869, + "grad_norm": 0.24844171106815338, + "learning_rate": 6.278484624017631e-05, + "loss": 1.7298, + "step": 14179 + }, + { + "epoch": 4.352363413136894, + "grad_norm": 0.24839860200881958, + "learning_rate": 6.27800408535095e-05, + "loss": 1.7717, + "step": 14180 + }, + { + "epoch": 4.352670349907919, + "grad_norm": 0.2652966380119324, + "learning_rate": 6.277523534054284e-05, + "loss": 1.7759, + "step": 14181 + }, + { + "epoch": 4.352977286678944, + "grad_norm": 0.2787603735923767, + "learning_rate": 6.277042970132381e-05, + "loss": 1.8981, + "step": 14182 + }, + { + "epoch": 4.353284223449969, + "grad_norm": 0.2535475194454193, + "learning_rate": 6.276562393589991e-05, + "loss": 1.7538, + "step": 14183 + }, + { + "epoch": 4.3535911602209945, + "grad_norm": 0.3210967183113098, + "learning_rate": 6.276081804431863e-05, + "loss": 1.7087, + "step": 14184 + }, + { + "epoch": 4.35389809699202, + "grad_norm": 0.29936519265174866, + "learning_rate": 6.275601202662749e-05, + "loss": 1.7647, + "step": 14185 + }, + { + "epoch": 4.354205033763045, + "grad_norm": 0.21980762481689453, + "learning_rate": 6.275120588287394e-05, + "loss": 1.7759, + "step": 14186 + }, + { + "epoch": 4.35451197053407, + "grad_norm": 0.26833051443099976, + "learning_rate": 6.274639961310549e-05, + "loss": 1.7648, + "step": 14187 + }, + { + "epoch": 4.354818907305095, + "grad_norm": 0.27998095750808716, + "learning_rate": 6.274159321736966e-05, + "loss": 1.746, + "step": 14188 + }, + { + "epoch": 4.35512584407612, + "grad_norm": 0.21354494988918304, + "learning_rate": 6.273678669571395e-05, + "loss": 1.7417, + "step": 14189 + }, + { + "epoch": 4.355432780847146, + "grad_norm": 0.2295297235250473, + "learning_rate": 6.273198004818583e-05, + "loss": 1.7805, + 
"step": 14190 + }, + { + "epoch": 4.355739717618171, + "grad_norm": 0.2416422963142395, + "learning_rate": 6.272717327483283e-05, + "loss": 1.73, + "step": 14191 + }, + { + "epoch": 4.356046654389196, + "grad_norm": 0.2685304880142212, + "learning_rate": 6.272236637570244e-05, + "loss": 1.7936, + "step": 14192 + }, + { + "epoch": 4.356353591160221, + "grad_norm": 0.32481294870376587, + "learning_rate": 6.271755935084218e-05, + "loss": 1.7192, + "step": 14193 + }, + { + "epoch": 4.356660527931246, + "grad_norm": 0.2428581267595291, + "learning_rate": 6.271275220029954e-05, + "loss": 1.7428, + "step": 14194 + }, + { + "epoch": 4.356967464702271, + "grad_norm": 0.2266654521226883, + "learning_rate": 6.270794492412203e-05, + "loss": 1.7266, + "step": 14195 + }, + { + "epoch": 4.357274401473297, + "grad_norm": 0.25062093138694763, + "learning_rate": 6.270313752235716e-05, + "loss": 1.7476, + "step": 14196 + }, + { + "epoch": 4.357581338244322, + "grad_norm": 0.24085770547389984, + "learning_rate": 6.269832999505244e-05, + "loss": 1.7981, + "step": 14197 + }, + { + "epoch": 4.3578882750153465, + "grad_norm": 0.27035796642303467, + "learning_rate": 6.269352234225536e-05, + "loss": 1.8867, + "step": 14198 + }, + { + "epoch": 4.358195211786372, + "grad_norm": 0.22464458644390106, + "learning_rate": 6.268871456401348e-05, + "loss": 1.7514, + "step": 14199 + }, + { + "epoch": 4.358502148557397, + "grad_norm": 0.22485734522342682, + "learning_rate": 6.268390666037427e-05, + "loss": 1.7558, + "step": 14200 + }, + { + "epoch": 4.3588090853284225, + "grad_norm": 0.2052135169506073, + "learning_rate": 6.267909863138527e-05, + "loss": 1.7453, + "step": 14201 + }, + { + "epoch": 4.359116022099448, + "grad_norm": 0.2130763679742813, + "learning_rate": 6.267429047709397e-05, + "loss": 1.7712, + "step": 14202 + }, + { + "epoch": 4.359422958870473, + "grad_norm": 0.23146997392177582, + "learning_rate": 6.266948219754793e-05, + "loss": 1.6978, + "step": 14203 + }, + { + "epoch": 
4.359729895641498, + "grad_norm": 0.21657225489616394, + "learning_rate": 6.266467379279463e-05, + "loss": 1.7641, + "step": 14204 + }, + { + "epoch": 4.360036832412523, + "grad_norm": 0.2598700523376465, + "learning_rate": 6.265986526288158e-05, + "loss": 1.7956, + "step": 14205 + }, + { + "epoch": 4.360343769183548, + "grad_norm": 0.23497453331947327, + "learning_rate": 6.265505660785633e-05, + "loss": 1.7835, + "step": 14206 + }, + { + "epoch": 4.360650705954574, + "grad_norm": 0.2491760104894638, + "learning_rate": 6.265024782776641e-05, + "loss": 1.8454, + "step": 14207 + }, + { + "epoch": 4.360957642725599, + "grad_norm": 0.224884033203125, + "learning_rate": 6.264543892265932e-05, + "loss": 1.8383, + "step": 14208 + }, + { + "epoch": 4.361264579496623, + "grad_norm": 0.24057646095752716, + "learning_rate": 6.264062989258259e-05, + "loss": 1.7437, + "step": 14209 + }, + { + "epoch": 4.361571516267649, + "grad_norm": 0.24661841988563538, + "learning_rate": 6.263582073758374e-05, + "loss": 1.8151, + "step": 14210 + }, + { + "epoch": 4.361878453038674, + "grad_norm": 0.24618980288505554, + "learning_rate": 6.263101145771031e-05, + "loss": 1.7955, + "step": 14211 + }, + { + "epoch": 4.362185389809699, + "grad_norm": 0.2615448236465454, + "learning_rate": 6.262620205300981e-05, + "loss": 1.7819, + "step": 14212 + }, + { + "epoch": 4.362492326580725, + "grad_norm": 0.3528309464454651, + "learning_rate": 6.26213925235298e-05, + "loss": 1.7723, + "step": 14213 + }, + { + "epoch": 4.362799263351749, + "grad_norm": 0.3099561035633087, + "learning_rate": 6.261658286931779e-05, + "loss": 1.7361, + "step": 14214 + }, + { + "epoch": 4.3631062001227745, + "grad_norm": 0.23693235218524933, + "learning_rate": 6.26117730904213e-05, + "loss": 1.8117, + "step": 14215 + }, + { + "epoch": 4.3634131368938, + "grad_norm": 0.4164150655269623, + "learning_rate": 6.260696318688786e-05, + "loss": 1.7908, + "step": 14216 + }, + { + "epoch": 4.363720073664825, + "grad_norm": 
0.39376336336135864, + "learning_rate": 6.260215315876506e-05, + "loss": 1.7832, + "step": 14217 + }, + { + "epoch": 4.3640270104358505, + "grad_norm": 0.24071799218654633, + "learning_rate": 6.259734300610037e-05, + "loss": 1.7569, + "step": 14218 + }, + { + "epoch": 4.364333947206875, + "grad_norm": 0.4305122494697571, + "learning_rate": 6.259253272894136e-05, + "loss": 1.7974, + "step": 14219 + }, + { + "epoch": 4.3646408839779, + "grad_norm": 0.3023197054862976, + "learning_rate": 6.258772232733556e-05, + "loss": 1.7589, + "step": 14220 + }, + { + "epoch": 4.364947820748926, + "grad_norm": 0.23253366351127625, + "learning_rate": 6.258291180133052e-05, + "loss": 1.7138, + "step": 14221 + }, + { + "epoch": 4.365254757519951, + "grad_norm": 0.41141277551651, + "learning_rate": 6.257810115097376e-05, + "loss": 1.7608, + "step": 14222 + }, + { + "epoch": 4.365561694290976, + "grad_norm": 0.3308235704898834, + "learning_rate": 6.257329037631284e-05, + "loss": 1.8006, + "step": 14223 + }, + { + "epoch": 4.365868631062002, + "grad_norm": 0.2635105848312378, + "learning_rate": 6.256847947739528e-05, + "loss": 1.7275, + "step": 14224 + }, + { + "epoch": 4.366175567833026, + "grad_norm": 0.45886602997779846, + "learning_rate": 6.256366845426864e-05, + "loss": 1.7701, + "step": 14225 + }, + { + "epoch": 4.366482504604051, + "grad_norm": 0.48503565788269043, + "learning_rate": 6.255885730698049e-05, + "loss": 1.7409, + "step": 14226 + }, + { + "epoch": 4.366789441375077, + "grad_norm": 0.26727184653282166, + "learning_rate": 6.255404603557833e-05, + "loss": 1.7288, + "step": 14227 + }, + { + "epoch": 4.367096378146102, + "grad_norm": 0.3343912363052368, + "learning_rate": 6.254923464010974e-05, + "loss": 1.764, + "step": 14228 + }, + { + "epoch": 4.367403314917127, + "grad_norm": 0.40050622820854187, + "learning_rate": 6.254442312062224e-05, + "loss": 1.7653, + "step": 14229 + }, + { + "epoch": 4.367710251688152, + "grad_norm": 0.23941144347190857, + "learning_rate": 
6.253961147716341e-05, + "loss": 1.6886, + "step": 14230 + }, + { + "epoch": 4.368017188459177, + "grad_norm": 0.25737255811691284, + "learning_rate": 6.253479970978079e-05, + "loss": 1.8047, + "step": 14231 + }, + { + "epoch": 4.3683241252302025, + "grad_norm": 0.28780993819236755, + "learning_rate": 6.252998781852192e-05, + "loss": 1.7453, + "step": 14232 + }, + { + "epoch": 4.368631062001228, + "grad_norm": 0.2362327128648758, + "learning_rate": 6.252517580343438e-05, + "loss": 1.7963, + "step": 14233 + }, + { + "epoch": 4.368937998772253, + "grad_norm": 0.263013631105423, + "learning_rate": 6.252036366456571e-05, + "loss": 1.7837, + "step": 14234 + }, + { + "epoch": 4.3692449355432785, + "grad_norm": 0.27674412727355957, + "learning_rate": 6.251555140196347e-05, + "loss": 1.767, + "step": 14235 + }, + { + "epoch": 4.369551872314303, + "grad_norm": 0.2360621690750122, + "learning_rate": 6.251073901567522e-05, + "loss": 1.7806, + "step": 14236 + }, + { + "epoch": 4.369858809085328, + "grad_norm": 0.2568018138408661, + "learning_rate": 6.25059265057485e-05, + "loss": 1.7672, + "step": 14237 + }, + { + "epoch": 4.370165745856354, + "grad_norm": 0.2512381374835968, + "learning_rate": 6.25011138722309e-05, + "loss": 1.7506, + "step": 14238 + }, + { + "epoch": 4.370472682627379, + "grad_norm": 0.21587291359901428, + "learning_rate": 6.249630111516994e-05, + "loss": 1.7336, + "step": 14239 + }, + { + "epoch": 4.370779619398404, + "grad_norm": 0.21791933476924896, + "learning_rate": 6.249148823461323e-05, + "loss": 1.7588, + "step": 14240 + }, + { + "epoch": 4.371086556169429, + "grad_norm": 0.23061512410640717, + "learning_rate": 6.248667523060831e-05, + "loss": 1.742, + "step": 14241 + }, + { + "epoch": 4.371393492940454, + "grad_norm": 0.2007007598876953, + "learning_rate": 6.248186210320274e-05, + "loss": 1.7227, + "step": 14242 + }, + { + "epoch": 4.371700429711479, + "grad_norm": 0.2564350366592407, + "learning_rate": 6.247704885244411e-05, + "loss": 1.7529, + 
"step": 14243 + }, + { + "epoch": 4.372007366482505, + "grad_norm": 0.21880537271499634, + "learning_rate": 6.247223547837995e-05, + "loss": 1.7828, + "step": 14244 + }, + { + "epoch": 4.37231430325353, + "grad_norm": 0.26154282689094543, + "learning_rate": 6.246742198105785e-05, + "loss": 1.7895, + "step": 14245 + }, + { + "epoch": 4.3726212400245545, + "grad_norm": 0.2652645707130432, + "learning_rate": 6.24626083605254e-05, + "loss": 1.8038, + "step": 14246 + }, + { + "epoch": 4.37292817679558, + "grad_norm": 0.21463751792907715, + "learning_rate": 6.245779461683013e-05, + "loss": 1.7139, + "step": 14247 + }, + { + "epoch": 4.373235113566605, + "grad_norm": 0.21285851299762726, + "learning_rate": 6.245298075001961e-05, + "loss": 1.7686, + "step": 14248 + }, + { + "epoch": 4.3735420503376305, + "grad_norm": 0.258602499961853, + "learning_rate": 6.244816676014149e-05, + "loss": 1.8518, + "step": 14249 + }, + { + "epoch": 4.373848987108656, + "grad_norm": 0.25747501850128174, + "learning_rate": 6.244335264724323e-05, + "loss": 1.8019, + "step": 14250 + }, + { + "epoch": 4.37415592387968, + "grad_norm": 0.24678784608840942, + "learning_rate": 6.243853841137251e-05, + "loss": 1.7846, + "step": 14251 + }, + { + "epoch": 4.374462860650706, + "grad_norm": 0.31382107734680176, + "learning_rate": 6.243372405257685e-05, + "loss": 1.8389, + "step": 14252 + }, + { + "epoch": 4.374769797421731, + "grad_norm": 0.30522868037223816, + "learning_rate": 6.242890957090383e-05, + "loss": 1.8057, + "step": 14253 + }, + { + "epoch": 4.375076734192756, + "grad_norm": 0.2449347972869873, + "learning_rate": 6.242409496640106e-05, + "loss": 1.7144, + "step": 14254 + }, + { + "epoch": 4.375383670963782, + "grad_norm": 0.3193594217300415, + "learning_rate": 6.241928023911609e-05, + "loss": 1.7404, + "step": 14255 + }, + { + "epoch": 4.375690607734807, + "grad_norm": 0.23948179185390472, + "learning_rate": 6.241446538909651e-05, + "loss": 1.7338, + "step": 14256 + }, + { + "epoch": 
4.3759975445058314, + "grad_norm": 0.35325706005096436, + "learning_rate": 6.240965041638991e-05, + "loss": 1.7673, + "step": 14257 + }, + { + "epoch": 4.376304481276857, + "grad_norm": 0.38753262162208557, + "learning_rate": 6.240483532104387e-05, + "loss": 1.769, + "step": 14258 + }, + { + "epoch": 4.376611418047882, + "grad_norm": 0.2749052941799164, + "learning_rate": 6.2400020103106e-05, + "loss": 1.8086, + "step": 14259 + }, + { + "epoch": 4.3769183548189075, + "grad_norm": 0.2553126811981201, + "learning_rate": 6.239520476262384e-05, + "loss": 1.7733, + "step": 14260 + }, + { + "epoch": 4.377225291589933, + "grad_norm": 0.2854517698287964, + "learning_rate": 6.2390389299645e-05, + "loss": 1.7926, + "step": 14261 + }, + { + "epoch": 4.377532228360957, + "grad_norm": 0.24617259204387665, + "learning_rate": 6.238557371421708e-05, + "loss": 1.7297, + "step": 14262 + }, + { + "epoch": 4.377839165131983, + "grad_norm": 0.2555331289768219, + "learning_rate": 6.238075800638765e-05, + "loss": 1.7566, + "step": 14263 + }, + { + "epoch": 4.378146101903008, + "grad_norm": 0.31666773557662964, + "learning_rate": 6.237594217620432e-05, + "loss": 1.8003, + "step": 14264 + }, + { + "epoch": 4.378453038674033, + "grad_norm": 0.24166476726531982, + "learning_rate": 6.237112622371468e-05, + "loss": 1.7425, + "step": 14265 + }, + { + "epoch": 4.378759975445059, + "grad_norm": 0.21237102150917053, + "learning_rate": 6.236631014896633e-05, + "loss": 1.73, + "step": 14266 + }, + { + "epoch": 4.379066912216084, + "grad_norm": 0.2739151120185852, + "learning_rate": 6.236149395200683e-05, + "loss": 1.7113, + "step": 14267 + }, + { + "epoch": 4.379373848987108, + "grad_norm": 0.23700746893882751, + "learning_rate": 6.23566776328838e-05, + "loss": 1.7256, + "step": 14268 + }, + { + "epoch": 4.379680785758134, + "grad_norm": 0.22366748750209808, + "learning_rate": 6.235186119164485e-05, + "loss": 1.7981, + "step": 14269 + }, + { + "epoch": 4.379987722529159, + "grad_norm": 
0.28440114855766296, + "learning_rate": 6.234704462833758e-05, + "loss": 1.8087, + "step": 14270 + }, + { + "epoch": 4.380294659300184, + "grad_norm": 0.2706616520881653, + "learning_rate": 6.234222794300957e-05, + "loss": 1.7502, + "step": 14271 + }, + { + "epoch": 4.38060159607121, + "grad_norm": 0.21666266024112701, + "learning_rate": 6.233741113570843e-05, + "loss": 1.7639, + "step": 14272 + }, + { + "epoch": 4.380908532842234, + "grad_norm": 0.26790255308151245, + "learning_rate": 6.233259420648175e-05, + "loss": 1.796, + "step": 14273 + }, + { + "epoch": 4.3812154696132595, + "grad_norm": 0.22233673930168152, + "learning_rate": 6.232777715537715e-05, + "loss": 1.7661, + "step": 14274 + }, + { + "epoch": 4.381522406384285, + "grad_norm": 0.3277546763420105, + "learning_rate": 6.232295998244223e-05, + "loss": 1.7932, + "step": 14275 + }, + { + "epoch": 4.38182934315531, + "grad_norm": 0.2907596826553345, + "learning_rate": 6.231814268772463e-05, + "loss": 1.7103, + "step": 14276 + }, + { + "epoch": 4.3821362799263355, + "grad_norm": 0.2318384349346161, + "learning_rate": 6.231332527127188e-05, + "loss": 1.7351, + "step": 14277 + }, + { + "epoch": 4.382443216697361, + "grad_norm": 0.32904061675071716, + "learning_rate": 6.230850773313163e-05, + "loss": 1.7967, + "step": 14278 + }, + { + "epoch": 4.382750153468385, + "grad_norm": 0.2455490082502365, + "learning_rate": 6.230369007335153e-05, + "loss": 1.7474, + "step": 14279 + }, + { + "epoch": 4.383057090239411, + "grad_norm": 0.23648180067539215, + "learning_rate": 6.229887229197913e-05, + "loss": 1.7106, + "step": 14280 + }, + { + "epoch": 4.383364027010436, + "grad_norm": 0.29552599787712097, + "learning_rate": 6.229405438906207e-05, + "loss": 1.7765, + "step": 14281 + }, + { + "epoch": 4.383670963781461, + "grad_norm": 0.2094641923904419, + "learning_rate": 6.228923636464796e-05, + "loss": 1.7105, + "step": 14282 + }, + { + "epoch": 4.383977900552487, + "grad_norm": 0.24632154405117035, + "learning_rate": 
6.228441821878441e-05, + "loss": 1.7913, + "step": 14283 + }, + { + "epoch": 4.384284837323511, + "grad_norm": 0.28114691376686096, + "learning_rate": 6.227959995151904e-05, + "loss": 1.7456, + "step": 14284 + }, + { + "epoch": 4.384591774094536, + "grad_norm": 0.24226875603199005, + "learning_rate": 6.227478156289946e-05, + "loss": 1.797, + "step": 14285 + }, + { + "epoch": 4.384898710865562, + "grad_norm": 0.2526854872703552, + "learning_rate": 6.22699630529733e-05, + "loss": 1.7155, + "step": 14286 + }, + { + "epoch": 4.385205647636587, + "grad_norm": 0.312916100025177, + "learning_rate": 6.226514442178818e-05, + "loss": 1.7808, + "step": 14287 + }, + { + "epoch": 4.385512584407612, + "grad_norm": 0.23087100684642792, + "learning_rate": 6.22603256693917e-05, + "loss": 1.7543, + "step": 14288 + }, + { + "epoch": 4.385819521178637, + "grad_norm": 0.3042476177215576, + "learning_rate": 6.22555067958315e-05, + "loss": 1.747, + "step": 14289 + }, + { + "epoch": 4.386126457949662, + "grad_norm": 0.2604007422924042, + "learning_rate": 6.225068780115522e-05, + "loss": 1.7262, + "step": 14290 + }, + { + "epoch": 4.3864333947206875, + "grad_norm": 0.2200118750333786, + "learning_rate": 6.224586868541044e-05, + "loss": 1.75, + "step": 14291 + }, + { + "epoch": 4.386740331491713, + "grad_norm": 0.3452017307281494, + "learning_rate": 6.224104944864481e-05, + "loss": 1.7598, + "step": 14292 + }, + { + "epoch": 4.387047268262738, + "grad_norm": 0.3169453740119934, + "learning_rate": 6.223623009090597e-05, + "loss": 1.7939, + "step": 14293 + }, + { + "epoch": 4.387354205033763, + "grad_norm": 0.23640502989292145, + "learning_rate": 6.223141061224151e-05, + "loss": 1.8005, + "step": 14294 + }, + { + "epoch": 4.387661141804788, + "grad_norm": 0.26212456822395325, + "learning_rate": 6.22265910126991e-05, + "loss": 1.7951, + "step": 14295 + }, + { + "epoch": 4.387968078575813, + "grad_norm": 0.2687644362449646, + "learning_rate": 6.222177129232634e-05, + "loss": 1.7674, + "step": 
14296 + }, + { + "epoch": 4.388275015346839, + "grad_norm": 0.2553202211856842, + "learning_rate": 6.221695145117086e-05, + "loss": 1.8142, + "step": 14297 + }, + { + "epoch": 4.388581952117864, + "grad_norm": 0.3317619264125824, + "learning_rate": 6.221213148928034e-05, + "loss": 1.7884, + "step": 14298 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.3059331476688385, + "learning_rate": 6.220731140670235e-05, + "loss": 1.7377, + "step": 14299 + }, + { + "epoch": 4.389195825659914, + "grad_norm": 0.21544015407562256, + "learning_rate": 6.220249120348457e-05, + "loss": 1.6818, + "step": 14300 + }, + { + "epoch": 4.389502762430939, + "grad_norm": 0.3112640380859375, + "learning_rate": 6.219767087967461e-05, + "loss": 1.72, + "step": 14301 + }, + { + "epoch": 4.389809699201964, + "grad_norm": 0.2572654187679291, + "learning_rate": 6.219285043532011e-05, + "loss": 1.793, + "step": 14302 + }, + { + "epoch": 4.39011663597299, + "grad_norm": 0.2621476948261261, + "learning_rate": 6.218802987046874e-05, + "loss": 1.8301, + "step": 14303 + }, + { + "epoch": 4.390423572744015, + "grad_norm": 0.2592658996582031, + "learning_rate": 6.218320918516809e-05, + "loss": 1.7219, + "step": 14304 + }, + { + "epoch": 4.3907305095150395, + "grad_norm": 0.25503265857696533, + "learning_rate": 6.217838837946584e-05, + "loss": 1.8149, + "step": 14305 + }, + { + "epoch": 4.391037446286065, + "grad_norm": 0.21944166719913483, + "learning_rate": 6.217356745340962e-05, + "loss": 1.7174, + "step": 14306 + }, + { + "epoch": 4.39134438305709, + "grad_norm": 0.2937396466732025, + "learning_rate": 6.216874640704707e-05, + "loss": 1.8562, + "step": 14307 + }, + { + "epoch": 4.3916513198281155, + "grad_norm": 0.22520211338996887, + "learning_rate": 6.216392524042581e-05, + "loss": 1.7701, + "step": 14308 + }, + { + "epoch": 4.391958256599141, + "grad_norm": 0.24397830665111542, + "learning_rate": 6.215910395359355e-05, + "loss": 1.7794, + "step": 14309 + }, + { + "epoch": 4.392265193370166, + 
"grad_norm": 0.2867623567581177, + "learning_rate": 6.215428254659788e-05, + "loss": 1.7275, + "step": 14310 + }, + { + "epoch": 4.392572130141191, + "grad_norm": 0.2632426917552948, + "learning_rate": 6.214946101948648e-05, + "loss": 1.7919, + "step": 14311 + }, + { + "epoch": 4.392879066912216, + "grad_norm": 0.23146092891693115, + "learning_rate": 6.214463937230696e-05, + "loss": 1.744, + "step": 14312 + }, + { + "epoch": 4.393186003683241, + "grad_norm": 0.21877676248550415, + "learning_rate": 6.213981760510701e-05, + "loss": 1.7577, + "step": 14313 + }, + { + "epoch": 4.393492940454267, + "grad_norm": 0.2320399284362793, + "learning_rate": 6.213499571793426e-05, + "loss": 1.7864, + "step": 14314 + }, + { + "epoch": 4.393799877225292, + "grad_norm": 0.2951548993587494, + "learning_rate": 6.213017371083638e-05, + "loss": 1.8257, + "step": 14315 + }, + { + "epoch": 4.394106813996316, + "grad_norm": 0.26062941551208496, + "learning_rate": 6.212535158386102e-05, + "loss": 1.7448, + "step": 14316 + }, + { + "epoch": 4.394413750767342, + "grad_norm": 0.24760986864566803, + "learning_rate": 6.21205293370558e-05, + "loss": 1.7902, + "step": 14317 + }, + { + "epoch": 4.394720687538367, + "grad_norm": 0.2686399221420288, + "learning_rate": 6.211570697046844e-05, + "loss": 1.8209, + "step": 14318 + }, + { + "epoch": 4.395027624309392, + "grad_norm": 0.2599134147167206, + "learning_rate": 6.211088448414653e-05, + "loss": 1.8231, + "step": 14319 + }, + { + "epoch": 4.395334561080418, + "grad_norm": 0.254044771194458, + "learning_rate": 6.210606187813778e-05, + "loss": 1.806, + "step": 14320 + }, + { + "epoch": 4.395641497851442, + "grad_norm": 0.262229323387146, + "learning_rate": 6.210123915248982e-05, + "loss": 1.7857, + "step": 14321 + }, + { + "epoch": 4.3959484346224675, + "grad_norm": 0.2849259078502655, + "learning_rate": 6.209641630725033e-05, + "loss": 1.8005, + "step": 14322 + }, + { + "epoch": 4.396255371393493, + "grad_norm": 0.35480254888534546, + 
"learning_rate": 6.209159334246697e-05, + "loss": 1.8189, + "step": 14323 + }, + { + "epoch": 4.396562308164518, + "grad_norm": 0.2599184215068817, + "learning_rate": 6.20867702581874e-05, + "loss": 1.7384, + "step": 14324 + }, + { + "epoch": 4.3968692449355435, + "grad_norm": 0.23994222283363342, + "learning_rate": 6.208194705445926e-05, + "loss": 1.7566, + "step": 14325 + }, + { + "epoch": 4.397176181706568, + "grad_norm": 0.24361753463745117, + "learning_rate": 6.207712373133024e-05, + "loss": 1.6965, + "step": 14326 + }, + { + "epoch": 4.397483118477593, + "grad_norm": 0.23925161361694336, + "learning_rate": 6.207230028884803e-05, + "loss": 1.7596, + "step": 14327 + }, + { + "epoch": 4.397790055248619, + "grad_norm": 0.24365897476673126, + "learning_rate": 6.206747672706025e-05, + "loss": 1.7951, + "step": 14328 + }, + { + "epoch": 4.398096992019644, + "grad_norm": 0.25245413184165955, + "learning_rate": 6.206265304601461e-05, + "loss": 1.8086, + "step": 14329 + }, + { + "epoch": 4.398403928790669, + "grad_norm": 0.24272513389587402, + "learning_rate": 6.205782924575874e-05, + "loss": 1.8148, + "step": 14330 + }, + { + "epoch": 4.398710865561695, + "grad_norm": 0.21299590170383453, + "learning_rate": 6.205300532634036e-05, + "loss": 1.7666, + "step": 14331 + }, + { + "epoch": 4.399017802332719, + "grad_norm": 0.23543189465999603, + "learning_rate": 6.20481812878071e-05, + "loss": 1.7629, + "step": 14332 + }, + { + "epoch": 4.399324739103744, + "grad_norm": 0.2284495085477829, + "learning_rate": 6.204335713020665e-05, + "loss": 1.768, + "step": 14333 + }, + { + "epoch": 4.39963167587477, + "grad_norm": 0.23158542811870575, + "learning_rate": 6.20385328535867e-05, + "loss": 1.7761, + "step": 14334 + }, + { + "epoch": 4.399938612645795, + "grad_norm": 0.2378150224685669, + "learning_rate": 6.20337084579949e-05, + "loss": 1.8483, + "step": 14335 + }, + { + "epoch": 4.4002455494168204, + "grad_norm": 0.2407436966896057, + "learning_rate": 6.202888394347892e-05, + 
"loss": 1.7364, + "step": 14336 + }, + { + "epoch": 4.400552486187845, + "grad_norm": 0.256259560585022, + "learning_rate": 6.202405931008649e-05, + "loss": 1.7376, + "step": 14337 + }, + { + "epoch": 4.40085942295887, + "grad_norm": 0.29293057322502136, + "learning_rate": 6.201923455786524e-05, + "loss": 1.7493, + "step": 14338 + }, + { + "epoch": 4.401166359729896, + "grad_norm": 0.24025334417819977, + "learning_rate": 6.201440968686288e-05, + "loss": 1.7522, + "step": 14339 + }, + { + "epoch": 4.401473296500921, + "grad_norm": 0.3215656280517578, + "learning_rate": 6.200958469712708e-05, + "loss": 1.7748, + "step": 14340 + }, + { + "epoch": 4.401780233271946, + "grad_norm": 0.43553170561790466, + "learning_rate": 6.200475958870553e-05, + "loss": 1.771, + "step": 14341 + }, + { + "epoch": 4.402087170042972, + "grad_norm": 0.3112131953239441, + "learning_rate": 6.19999343616459e-05, + "loss": 1.7655, + "step": 14342 + }, + { + "epoch": 4.402394106813996, + "grad_norm": 0.25197842717170715, + "learning_rate": 6.199510901599589e-05, + "loss": 1.7214, + "step": 14343 + }, + { + "epoch": 4.402701043585021, + "grad_norm": 0.33227142691612244, + "learning_rate": 6.19902835518032e-05, + "loss": 1.7332, + "step": 14344 + }, + { + "epoch": 4.403007980356047, + "grad_norm": 0.27962982654571533, + "learning_rate": 6.198545796911548e-05, + "loss": 1.6943, + "step": 14345 + }, + { + "epoch": 4.403314917127072, + "grad_norm": 0.24374182522296906, + "learning_rate": 6.198063226798044e-05, + "loss": 1.7222, + "step": 14346 + }, + { + "epoch": 4.403621853898097, + "grad_norm": 0.3101944625377655, + "learning_rate": 6.197580644844576e-05, + "loss": 1.7113, + "step": 14347 + }, + { + "epoch": 4.403928790669122, + "grad_norm": 0.25919321179389954, + "learning_rate": 6.197098051055916e-05, + "loss": 1.71, + "step": 14348 + }, + { + "epoch": 4.404235727440147, + "grad_norm": 0.23140330612659454, + "learning_rate": 6.19661544543683e-05, + "loss": 1.7472, + "step": 14349 + }, + { + 
"epoch": 4.4045426642111725, + "grad_norm": 0.3274286687374115, + "learning_rate": 6.19613282799209e-05, + "loss": 1.7093, + "step": 14350 + }, + { + "epoch": 4.404849600982198, + "grad_norm": 0.3187442123889923, + "learning_rate": 6.195650198726464e-05, + "loss": 1.7488, + "step": 14351 + }, + { + "epoch": 4.405156537753223, + "grad_norm": 0.20547433197498322, + "learning_rate": 6.195167557644722e-05, + "loss": 1.7295, + "step": 14352 + }, + { + "epoch": 4.4054634745242485, + "grad_norm": 0.2623414993286133, + "learning_rate": 6.194684904751633e-05, + "loss": 1.8258, + "step": 14353 + }, + { + "epoch": 4.405770411295273, + "grad_norm": 0.2468457818031311, + "learning_rate": 6.194202240051967e-05, + "loss": 1.6957, + "step": 14354 + }, + { + "epoch": 4.406077348066298, + "grad_norm": 0.2082364559173584, + "learning_rate": 6.193719563550496e-05, + "loss": 1.7596, + "step": 14355 + }, + { + "epoch": 4.406384284837324, + "grad_norm": 0.27072983980178833, + "learning_rate": 6.193236875251988e-05, + "loss": 1.7341, + "step": 14356 + }, + { + "epoch": 4.406691221608349, + "grad_norm": 0.2630362808704376, + "learning_rate": 6.192754175161215e-05, + "loss": 1.7664, + "step": 14357 + }, + { + "epoch": 4.406998158379374, + "grad_norm": 0.25400006771087646, + "learning_rate": 6.192271463282944e-05, + "loss": 1.7582, + "step": 14358 + }, + { + "epoch": 4.407305095150399, + "grad_norm": 0.22256311774253845, + "learning_rate": 6.191788739621949e-05, + "loss": 1.7389, + "step": 14359 + }, + { + "epoch": 4.407612031921424, + "grad_norm": 0.2160387486219406, + "learning_rate": 6.191306004182999e-05, + "loss": 1.7051, + "step": 14360 + }, + { + "epoch": 4.407918968692449, + "grad_norm": 0.20665684342384338, + "learning_rate": 6.190823256970865e-05, + "loss": 1.7606, + "step": 14361 + }, + { + "epoch": 4.408225905463475, + "grad_norm": 0.2173188328742981, + "learning_rate": 6.190340497990318e-05, + "loss": 1.7944, + "step": 14362 + }, + { + "epoch": 4.4085328422345, + "grad_norm": 
0.189287930727005, + "learning_rate": 6.189857727246127e-05, + "loss": 1.7283, + "step": 14363 + }, + { + "epoch": 4.4088397790055245, + "grad_norm": 0.2531645596027374, + "learning_rate": 6.189374944743065e-05, + "loss": 1.7554, + "step": 14364 + }, + { + "epoch": 4.40914671577655, + "grad_norm": 0.25439125299453735, + "learning_rate": 6.188892150485903e-05, + "loss": 1.8032, + "step": 14365 + }, + { + "epoch": 4.409453652547575, + "grad_norm": 0.20938685536384583, + "learning_rate": 6.188409344479412e-05, + "loss": 1.7385, + "step": 14366 + }, + { + "epoch": 4.4097605893186005, + "grad_norm": 0.20471477508544922, + "learning_rate": 6.187926526728364e-05, + "loss": 1.7487, + "step": 14367 + }, + { + "epoch": 4.410067526089626, + "grad_norm": 0.2381851226091385, + "learning_rate": 6.187443697237529e-05, + "loss": 1.7443, + "step": 14368 + }, + { + "epoch": 4.41037446286065, + "grad_norm": 0.21584098041057587, + "learning_rate": 6.18696085601168e-05, + "loss": 1.7818, + "step": 14369 + }, + { + "epoch": 4.410681399631676, + "grad_norm": 0.2575368583202362, + "learning_rate": 6.186478003055587e-05, + "loss": 1.8204, + "step": 14370 + }, + { + "epoch": 4.410988336402701, + "grad_norm": 0.21133238077163696, + "learning_rate": 6.185995138374024e-05, + "loss": 1.7274, + "step": 14371 + }, + { + "epoch": 4.411295273173726, + "grad_norm": 0.24918322265148163, + "learning_rate": 6.18551226197176e-05, + "loss": 1.8021, + "step": 14372 + }, + { + "epoch": 4.411602209944752, + "grad_norm": 0.2253655642271042, + "learning_rate": 6.185029373853572e-05, + "loss": 1.7308, + "step": 14373 + }, + { + "epoch": 4.411909146715777, + "grad_norm": 0.20098713040351868, + "learning_rate": 6.184546474024226e-05, + "loss": 1.7549, + "step": 14374 + }, + { + "epoch": 4.412216083486801, + "grad_norm": 0.25612789392471313, + "learning_rate": 6.1840635624885e-05, + "loss": 1.8305, + "step": 14375 + }, + { + "epoch": 4.412523020257827, + "grad_norm": 0.24287539720535278, + "learning_rate": 
6.183580639251164e-05, + "loss": 1.7339, + "step": 14376 + }, + { + "epoch": 4.412829957028852, + "grad_norm": 0.2304944545030594, + "learning_rate": 6.183097704316988e-05, + "loss": 1.7023, + "step": 14377 + }, + { + "epoch": 4.413136893799877, + "grad_norm": 0.21911773085594177, + "learning_rate": 6.18261475769075e-05, + "loss": 1.7305, + "step": 14378 + }, + { + "epoch": 4.413443830570903, + "grad_norm": 0.24207864701747894, + "learning_rate": 6.182131799377217e-05, + "loss": 1.7318, + "step": 14379 + }, + { + "epoch": 4.413750767341927, + "grad_norm": 0.2551634609699249, + "learning_rate": 6.181648829381165e-05, + "loss": 1.8101, + "step": 14380 + }, + { + "epoch": 4.4140577041129525, + "grad_norm": 0.4114011526107788, + "learning_rate": 6.181165847707368e-05, + "loss": 1.772, + "step": 14381 + }, + { + "epoch": 4.414364640883978, + "grad_norm": 0.4592796862125397, + "learning_rate": 6.180682854360598e-05, + "loss": 1.7359, + "step": 14382 + }, + { + "epoch": 4.414671577655003, + "grad_norm": 0.2599259614944458, + "learning_rate": 6.180199849345627e-05, + "loss": 1.7028, + "step": 14383 + }, + { + "epoch": 4.4149785144260285, + "grad_norm": 0.3489506244659424, + "learning_rate": 6.17971683266723e-05, + "loss": 1.8252, + "step": 14384 + }, + { + "epoch": 4.415285451197054, + "grad_norm": 0.44563809037208557, + "learning_rate": 6.179233804330179e-05, + "loss": 1.6894, + "step": 14385 + }, + { + "epoch": 4.415592387968078, + "grad_norm": 0.2596888542175293, + "learning_rate": 6.17875076433925e-05, + "loss": 1.8141, + "step": 14386 + }, + { + "epoch": 4.415899324739104, + "grad_norm": 0.3560626804828644, + "learning_rate": 6.178267712699213e-05, + "loss": 1.7764, + "step": 14387 + }, + { + "epoch": 4.416206261510129, + "grad_norm": 0.3746717572212219, + "learning_rate": 6.177784649414843e-05, + "loss": 1.7528, + "step": 14388 + }, + { + "epoch": 4.416513198281154, + "grad_norm": 0.23248885571956635, + "learning_rate": 6.177301574490918e-05, + "loss": 1.7148, + 
"step": 14389 + }, + { + "epoch": 4.41682013505218, + "grad_norm": 0.26936978101730347, + "learning_rate": 6.176818487932208e-05, + "loss": 1.7199, + "step": 14390 + }, + { + "epoch": 4.417127071823204, + "grad_norm": 0.3102504014968872, + "learning_rate": 6.176335389743486e-05, + "loss": 1.6886, + "step": 14391 + }, + { + "epoch": 4.417434008594229, + "grad_norm": 0.24406832456588745, + "learning_rate": 6.175852279929531e-05, + "loss": 1.7766, + "step": 14392 + }, + { + "epoch": 4.417740945365255, + "grad_norm": 0.271158903837204, + "learning_rate": 6.175369158495112e-05, + "loss": 1.8099, + "step": 14393 + }, + { + "epoch": 4.41804788213628, + "grad_norm": 0.343667209148407, + "learning_rate": 6.174886025445008e-05, + "loss": 1.779, + "step": 14394 + }, + { + "epoch": 4.418354818907305, + "grad_norm": 0.37423139810562134, + "learning_rate": 6.17440288078399e-05, + "loss": 1.7796, + "step": 14395 + }, + { + "epoch": 4.41866175567833, + "grad_norm": 0.3152335286140442, + "learning_rate": 6.173919724516836e-05, + "loss": 1.7388, + "step": 14396 + }, + { + "epoch": 4.418968692449355, + "grad_norm": 0.21467824280261993, + "learning_rate": 6.173436556648319e-05, + "loss": 1.7689, + "step": 14397 + }, + { + "epoch": 4.4192756292203805, + "grad_norm": 0.2861369848251343, + "learning_rate": 6.172953377183213e-05, + "loss": 1.819, + "step": 14398 + }, + { + "epoch": 4.419582565991406, + "grad_norm": 0.34777504205703735, + "learning_rate": 6.172470186126295e-05, + "loss": 1.7444, + "step": 14399 + }, + { + "epoch": 4.419889502762431, + "grad_norm": 0.2728833854198456, + "learning_rate": 6.171986983482339e-05, + "loss": 1.7637, + "step": 14400 + }, + { + "epoch": 4.420196439533456, + "grad_norm": 0.2593914270401001, + "learning_rate": 6.17150376925612e-05, + "loss": 1.8196, + "step": 14401 + }, + { + "epoch": 4.420503376304481, + "grad_norm": 0.29425305128097534, + "learning_rate": 6.171020543452416e-05, + "loss": 1.7511, + "step": 14402 + }, + { + "epoch": 
4.420810313075506, + "grad_norm": 0.2587110102176666, + "learning_rate": 6.170537306076e-05, + "loss": 1.8085, + "step": 14403 + }, + { + "epoch": 4.421117249846532, + "grad_norm": 0.22442933917045593, + "learning_rate": 6.170054057131648e-05, + "loss": 1.8023, + "step": 14404 + }, + { + "epoch": 4.421424186617557, + "grad_norm": 0.23302629590034485, + "learning_rate": 6.169570796624136e-05, + "loss": 1.7995, + "step": 14405 + }, + { + "epoch": 4.421731123388582, + "grad_norm": 0.2295885682106018, + "learning_rate": 6.169087524558239e-05, + "loss": 1.7948, + "step": 14406 + }, + { + "epoch": 4.422038060159607, + "grad_norm": 0.2161262482404709, + "learning_rate": 6.168604240938735e-05, + "loss": 1.7159, + "step": 14407 + }, + { + "epoch": 4.422344996930632, + "grad_norm": 0.20746205747127533, + "learning_rate": 6.1681209457704e-05, + "loss": 1.7703, + "step": 14408 + }, + { + "epoch": 4.422651933701657, + "grad_norm": 0.25677376985549927, + "learning_rate": 6.167637639058006e-05, + "loss": 1.7819, + "step": 14409 + }, + { + "epoch": 4.422958870472683, + "grad_norm": 0.226568341255188, + "learning_rate": 6.167154320806336e-05, + "loss": 1.7661, + "step": 14410 + }, + { + "epoch": 4.423265807243708, + "grad_norm": 0.22997824847698212, + "learning_rate": 6.166670991020162e-05, + "loss": 1.7364, + "step": 14411 + }, + { + "epoch": 4.4235727440147325, + "grad_norm": 0.2528770864009857, + "learning_rate": 6.166187649704261e-05, + "loss": 1.8505, + "step": 14412 + }, + { + "epoch": 4.423879680785758, + "grad_norm": 0.27278614044189453, + "learning_rate": 6.165704296863409e-05, + "loss": 1.7855, + "step": 14413 + }, + { + "epoch": 4.424186617556783, + "grad_norm": 0.23086364567279816, + "learning_rate": 6.165220932502385e-05, + "loss": 1.7489, + "step": 14414 + }, + { + "epoch": 4.4244935543278086, + "grad_norm": 0.2570587396621704, + "learning_rate": 6.164737556625965e-05, + "loss": 1.8008, + "step": 14415 + }, + { + "epoch": 4.424800491098834, + "grad_norm": 
0.2637264132499695, + "learning_rate": 6.164254169238923e-05, + "loss": 1.7563, + "step": 14416 + }, + { + "epoch": 4.425107427869859, + "grad_norm": 0.23046623170375824, + "learning_rate": 6.163770770346043e-05, + "loss": 1.7433, + "step": 14417 + }, + { + "epoch": 4.425414364640884, + "grad_norm": 0.2531467080116272, + "learning_rate": 6.163287359952095e-05, + "loss": 1.8122, + "step": 14418 + }, + { + "epoch": 4.425721301411909, + "grad_norm": 0.26507216691970825, + "learning_rate": 6.162803938061861e-05, + "loss": 1.7019, + "step": 14419 + }, + { + "epoch": 4.426028238182934, + "grad_norm": 0.229641854763031, + "learning_rate": 6.162320504680117e-05, + "loss": 1.7518, + "step": 14420 + }, + { + "epoch": 4.42633517495396, + "grad_norm": 0.22777152061462402, + "learning_rate": 6.161837059811641e-05, + "loss": 1.8094, + "step": 14421 + }, + { + "epoch": 4.426642111724985, + "grad_norm": 0.22121338546276093, + "learning_rate": 6.161353603461209e-05, + "loss": 1.7204, + "step": 14422 + }, + { + "epoch": 4.4269490484960095, + "grad_norm": 0.21914128959178925, + "learning_rate": 6.1608701356336e-05, + "loss": 1.7554, + "step": 14423 + }, + { + "epoch": 4.427255985267035, + "grad_norm": 0.22649390995502472, + "learning_rate": 6.160386656333593e-05, + "loss": 1.8058, + "step": 14424 + }, + { + "epoch": 4.42756292203806, + "grad_norm": 0.24529023468494415, + "learning_rate": 6.159903165565964e-05, + "loss": 1.7302, + "step": 14425 + }, + { + "epoch": 4.4278698588090855, + "grad_norm": 0.2726481854915619, + "learning_rate": 6.159419663335492e-05, + "loss": 1.825, + "step": 14426 + }, + { + "epoch": 4.428176795580111, + "grad_norm": 0.2772440016269684, + "learning_rate": 6.158936149646957e-05, + "loss": 1.7322, + "step": 14427 + }, + { + "epoch": 4.428483732351136, + "grad_norm": 0.29778853058815, + "learning_rate": 6.158452624505135e-05, + "loss": 1.7421, + "step": 14428 + }, + { + "epoch": 4.428790669122161, + "grad_norm": 0.21327480673789978, + "learning_rate": 
6.157969087914804e-05, + "loss": 1.7269, + "step": 14429 + }, + { + "epoch": 4.429097605893186, + "grad_norm": 0.2718868851661682, + "learning_rate": 6.157485539880744e-05, + "loss": 1.7817, + "step": 14430 + }, + { + "epoch": 4.429404542664211, + "grad_norm": 0.32242509722709656, + "learning_rate": 6.157001980407735e-05, + "loss": 1.7115, + "step": 14431 + }, + { + "epoch": 4.429711479435237, + "grad_norm": 0.2931978106498718, + "learning_rate": 6.156518409500553e-05, + "loss": 1.7822, + "step": 14432 + }, + { + "epoch": 4.430018416206262, + "grad_norm": 0.229528546333313, + "learning_rate": 6.156034827163977e-05, + "loss": 1.7623, + "step": 14433 + }, + { + "epoch": 4.430325352977286, + "grad_norm": 0.28702354431152344, + "learning_rate": 6.15555123340279e-05, + "loss": 1.8101, + "step": 14434 + }, + { + "epoch": 4.430632289748312, + "grad_norm": 0.27162131667137146, + "learning_rate": 6.155067628221766e-05, + "loss": 1.7525, + "step": 14435 + }, + { + "epoch": 4.430939226519337, + "grad_norm": 0.24290388822555542, + "learning_rate": 6.154584011625688e-05, + "loss": 1.8701, + "step": 14436 + }, + { + "epoch": 4.431246163290362, + "grad_norm": 0.3055405020713806, + "learning_rate": 6.154100383619334e-05, + "loss": 1.8659, + "step": 14437 + }, + { + "epoch": 4.431553100061388, + "grad_norm": 0.24528950452804565, + "learning_rate": 6.153616744207483e-05, + "loss": 1.8493, + "step": 14438 + }, + { + "epoch": 4.431860036832412, + "grad_norm": 0.2611897587776184, + "learning_rate": 6.153133093394917e-05, + "loss": 1.7905, + "step": 14439 + }, + { + "epoch": 4.4321669736034375, + "grad_norm": 0.2172730267047882, + "learning_rate": 6.15264943118641e-05, + "loss": 1.7087, + "step": 14440 + }, + { + "epoch": 4.432473910374463, + "grad_norm": 0.2320949286222458, + "learning_rate": 6.152165757586749e-05, + "loss": 1.7473, + "step": 14441 + }, + { + "epoch": 4.432780847145488, + "grad_norm": 0.2602086365222931, + "learning_rate": 6.15168207260071e-05, + "loss": 1.7365, + 
"step": 14442 + }, + { + "epoch": 4.4330877839165135, + "grad_norm": 0.25193190574645996, + "learning_rate": 6.151198376233074e-05, + "loss": 1.8205, + "step": 14443 + }, + { + "epoch": 4.433394720687538, + "grad_norm": 0.2894204556941986, + "learning_rate": 6.150714668488621e-05, + "loss": 1.7759, + "step": 14444 + }, + { + "epoch": 4.433701657458563, + "grad_norm": 0.24150310456752777, + "learning_rate": 6.150230949372131e-05, + "loss": 1.8415, + "step": 14445 + }, + { + "epoch": 4.434008594229589, + "grad_norm": 0.23475918173789978, + "learning_rate": 6.149747218888384e-05, + "loss": 1.7487, + "step": 14446 + }, + { + "epoch": 4.434315531000614, + "grad_norm": 0.29425546526908875, + "learning_rate": 6.149263477042162e-05, + "loss": 1.7538, + "step": 14447 + }, + { + "epoch": 4.434622467771639, + "grad_norm": 0.26241615414619446, + "learning_rate": 6.148779723838244e-05, + "loss": 1.7564, + "step": 14448 + }, + { + "epoch": 4.434929404542665, + "grad_norm": 0.23195287585258484, + "learning_rate": 6.148295959281411e-05, + "loss": 1.837, + "step": 14449 + }, + { + "epoch": 4.435236341313689, + "grad_norm": 0.34972792863845825, + "learning_rate": 6.147812183376445e-05, + "loss": 1.7632, + "step": 14450 + }, + { + "epoch": 4.435543278084714, + "grad_norm": 0.3536125719547272, + "learning_rate": 6.147328396128126e-05, + "loss": 1.8372, + "step": 14451 + }, + { + "epoch": 4.43585021485574, + "grad_norm": 0.2086079865694046, + "learning_rate": 6.146844597541235e-05, + "loss": 1.7014, + "step": 14452 + }, + { + "epoch": 4.436157151626765, + "grad_norm": 0.25547802448272705, + "learning_rate": 6.146360787620554e-05, + "loss": 1.7544, + "step": 14453 + }, + { + "epoch": 4.43646408839779, + "grad_norm": 0.26176998019218445, + "learning_rate": 6.145876966370864e-05, + "loss": 1.7617, + "step": 14454 + }, + { + "epoch": 4.436771025168815, + "grad_norm": 0.2672959566116333, + "learning_rate": 6.145393133796946e-05, + "loss": 1.8178, + "step": 14455 + }, + { + "epoch": 
4.43707796193984, + "grad_norm": 0.23373909294605255, + "learning_rate": 6.144909289903582e-05, + "loss": 1.7295, + "step": 14456 + }, + { + "epoch": 4.4373848987108655, + "grad_norm": 0.2369835078716278, + "learning_rate": 6.144425434695551e-05, + "loss": 1.8097, + "step": 14457 + }, + { + "epoch": 4.437691835481891, + "grad_norm": 0.25528979301452637, + "learning_rate": 6.14394156817764e-05, + "loss": 1.7523, + "step": 14458 + }, + { + "epoch": 4.437998772252916, + "grad_norm": 0.2541787624359131, + "learning_rate": 6.143457690354626e-05, + "loss": 1.7606, + "step": 14459 + }, + { + "epoch": 4.4383057090239415, + "grad_norm": 0.2032637745141983, + "learning_rate": 6.142973801231295e-05, + "loss": 1.7967, + "step": 14460 + }, + { + "epoch": 4.438612645794966, + "grad_norm": 0.2413996160030365, + "learning_rate": 6.142489900812426e-05, + "loss": 1.7688, + "step": 14461 + }, + { + "epoch": 4.438919582565991, + "grad_norm": 0.43451038002967834, + "learning_rate": 6.142005989102803e-05, + "loss": 1.8269, + "step": 14462 + }, + { + "epoch": 4.439226519337017, + "grad_norm": 0.23981481790542603, + "learning_rate": 6.141522066107206e-05, + "loss": 1.7628, + "step": 14463 + }, + { + "epoch": 4.439533456108042, + "grad_norm": 0.25396493077278137, + "learning_rate": 6.14103813183042e-05, + "loss": 1.7913, + "step": 14464 + }, + { + "epoch": 4.439840392879067, + "grad_norm": 0.2567536532878876, + "learning_rate": 6.140554186277225e-05, + "loss": 1.7612, + "step": 14465 + }, + { + "epoch": 4.440147329650092, + "grad_norm": 0.2201337069272995, + "learning_rate": 6.140070229452406e-05, + "loss": 1.7541, + "step": 14466 + }, + { + "epoch": 4.440454266421117, + "grad_norm": 0.24202953279018402, + "learning_rate": 6.139586261360746e-05, + "loss": 1.777, + "step": 14467 + }, + { + "epoch": 4.440761203192142, + "grad_norm": 0.23891687393188477, + "learning_rate": 6.139102282007024e-05, + "loss": 1.7509, + "step": 14468 + }, + { + "epoch": 4.441068139963168, + "grad_norm": 
0.21132555603981018, + "learning_rate": 6.138618291396026e-05, + "loss": 1.7362, + "step": 14469 + }, + { + "epoch": 4.441375076734193, + "grad_norm": 0.2731861472129822, + "learning_rate": 6.138134289532536e-05, + "loss": 1.8063, + "step": 14470 + }, + { + "epoch": 4.4416820135052175, + "grad_norm": 0.29503315687179565, + "learning_rate": 6.137650276421336e-05, + "loss": 1.7193, + "step": 14471 + }, + { + "epoch": 4.441988950276243, + "grad_norm": 0.2778526544570923, + "learning_rate": 6.137166252067208e-05, + "loss": 1.7507, + "step": 14472 + }, + { + "epoch": 4.442295887047268, + "grad_norm": 0.2907710075378418, + "learning_rate": 6.136682216474938e-05, + "loss": 1.7939, + "step": 14473 + }, + { + "epoch": 4.4426028238182935, + "grad_norm": 0.4133768379688263, + "learning_rate": 6.136198169649306e-05, + "loss": 1.8012, + "step": 14474 + }, + { + "epoch": 4.442909760589319, + "grad_norm": 0.2505052983760834, + "learning_rate": 6.135714111595099e-05, + "loss": 1.8426, + "step": 14475 + }, + { + "epoch": 4.443216697360343, + "grad_norm": 0.3884379267692566, + "learning_rate": 6.135230042317099e-05, + "loss": 1.7383, + "step": 14476 + }, + { + "epoch": 4.443523634131369, + "grad_norm": 0.42902377247810364, + "learning_rate": 6.134745961820091e-05, + "loss": 1.732, + "step": 14477 + }, + { + "epoch": 4.443830570902394, + "grad_norm": 0.21782708168029785, + "learning_rate": 6.134261870108858e-05, + "loss": 1.7369, + "step": 14478 + }, + { + "epoch": 4.444137507673419, + "grad_norm": 0.4160648286342621, + "learning_rate": 6.133777767188186e-05, + "loss": 1.8083, + "step": 14479 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.5057216882705688, + "learning_rate": 6.133293653062856e-05, + "loss": 1.8971, + "step": 14480 + }, + { + "epoch": 4.44475138121547, + "grad_norm": 0.2189750075340271, + "learning_rate": 6.132809527737654e-05, + "loss": 1.7508, + "step": 14481 + }, + { + "epoch": 4.445058317986494, + "grad_norm": 0.4415782392024994, + "learning_rate": 
6.132325391217364e-05, + "loss": 1.8548, + "step": 14482 + }, + { + "epoch": 4.44536525475752, + "grad_norm": 0.3907296359539032, + "learning_rate": 6.13184124350677e-05, + "loss": 1.7879, + "step": 14483 + }, + { + "epoch": 4.445672191528545, + "grad_norm": 0.24117955565452576, + "learning_rate": 6.131357084610659e-05, + "loss": 1.7227, + "step": 14484 + }, + { + "epoch": 4.44597912829957, + "grad_norm": 0.3083679974079132, + "learning_rate": 6.130872914533815e-05, + "loss": 1.7505, + "step": 14485 + }, + { + "epoch": 4.446286065070596, + "grad_norm": 0.27730658650398254, + "learning_rate": 6.13038873328102e-05, + "loss": 1.7485, + "step": 14486 + }, + { + "epoch": 4.44659300184162, + "grad_norm": 0.28548410534858704, + "learning_rate": 6.12990454085706e-05, + "loss": 1.8145, + "step": 14487 + }, + { + "epoch": 4.4468999386126455, + "grad_norm": 0.24743106961250305, + "learning_rate": 6.129420337266724e-05, + "loss": 1.7131, + "step": 14488 + }, + { + "epoch": 4.447206875383671, + "grad_norm": 0.2899693250656128, + "learning_rate": 6.128936122514794e-05, + "loss": 1.8567, + "step": 14489 + }, + { + "epoch": 4.447513812154696, + "grad_norm": 0.259916752576828, + "learning_rate": 6.128451896606053e-05, + "loss": 1.7563, + "step": 14490 + }, + { + "epoch": 4.4478207489257215, + "grad_norm": 0.21112586557865143, + "learning_rate": 6.12796765954529e-05, + "loss": 1.6975, + "step": 14491 + }, + { + "epoch": 4.448127685696747, + "grad_norm": 0.2890239953994751, + "learning_rate": 6.12748341133729e-05, + "loss": 1.7904, + "step": 14492 + }, + { + "epoch": 4.448434622467771, + "grad_norm": 0.23394012451171875, + "learning_rate": 6.126999151986839e-05, + "loss": 1.7559, + "step": 14493 + }, + { + "epoch": 4.448741559238797, + "grad_norm": 0.3492949903011322, + "learning_rate": 6.12651488149872e-05, + "loss": 1.7734, + "step": 14494 + }, + { + "epoch": 4.449048496009822, + "grad_norm": 0.48309218883514404, + "learning_rate": 6.126030599877723e-05, + "loss": 1.7798, + "step": 
14495 + }, + { + "epoch": 4.449355432780847, + "grad_norm": 0.341146320104599, + "learning_rate": 6.12554630712863e-05, + "loss": 1.7921, + "step": 14496 + }, + { + "epoch": 4.449662369551873, + "grad_norm": 0.223160982131958, + "learning_rate": 6.125062003256229e-05, + "loss": 1.7784, + "step": 14497 + }, + { + "epoch": 4.449969306322897, + "grad_norm": 0.32664811611175537, + "learning_rate": 6.124577688265306e-05, + "loss": 1.7353, + "step": 14498 + }, + { + "epoch": 4.4502762430939224, + "grad_norm": 0.215936541557312, + "learning_rate": 6.124093362160646e-05, + "loss": 1.68, + "step": 14499 + }, + { + "epoch": 4.450583179864948, + "grad_norm": 0.26081225275993347, + "learning_rate": 6.123609024947038e-05, + "loss": 1.7107, + "step": 14500 + }, + { + "epoch": 4.450890116635973, + "grad_norm": 0.3124069571495056, + "learning_rate": 6.123124676629267e-05, + "loss": 1.7338, + "step": 14501 + }, + { + "epoch": 4.4511970534069984, + "grad_norm": 0.23125620186328888, + "learning_rate": 6.122640317212118e-05, + "loss": 1.7842, + "step": 14502 + }, + { + "epoch": 4.451503990178024, + "grad_norm": 0.27065595984458923, + "learning_rate": 6.122155946700381e-05, + "loss": 1.7284, + "step": 14503 + }, + { + "epoch": 4.451810926949048, + "grad_norm": 0.4677436053752899, + "learning_rate": 6.121671565098841e-05, + "loss": 1.8156, + "step": 14504 + }, + { + "epoch": 4.452117863720074, + "grad_norm": 0.36325082182884216, + "learning_rate": 6.121187172412285e-05, + "loss": 1.7875, + "step": 14505 + }, + { + "epoch": 4.452424800491099, + "grad_norm": 0.23409567773342133, + "learning_rate": 6.1207027686455e-05, + "loss": 1.7421, + "step": 14506 + }, + { + "epoch": 4.452731737262124, + "grad_norm": 0.36919257044792175, + "learning_rate": 6.120218353803273e-05, + "loss": 1.7545, + "step": 14507 + }, + { + "epoch": 4.45303867403315, + "grad_norm": 0.318452388048172, + "learning_rate": 6.119733927890393e-05, + "loss": 1.7179, + "step": 14508 + }, + { + "epoch": 4.453345610804174, + 
"grad_norm": 0.21279768645763397, + "learning_rate": 6.119249490911643e-05, + "loss": 1.7534, + "step": 14509 + }, + { + "epoch": 4.453652547575199, + "grad_norm": 0.30565473437309265, + "learning_rate": 6.118765042871816e-05, + "loss": 1.7962, + "step": 14510 + }, + { + "epoch": 4.453959484346225, + "grad_norm": 0.2608480453491211, + "learning_rate": 6.118280583775697e-05, + "loss": 1.7336, + "step": 14511 + }, + { + "epoch": 4.45426642111725, + "grad_norm": 0.22978845238685608, + "learning_rate": 6.117796113628075e-05, + "loss": 1.8244, + "step": 14512 + }, + { + "epoch": 4.454573357888275, + "grad_norm": 0.26357781887054443, + "learning_rate": 6.117311632433735e-05, + "loss": 1.7425, + "step": 14513 + }, + { + "epoch": 4.4548802946593, + "grad_norm": 0.22127102315425873, + "learning_rate": 6.116827140197467e-05, + "loss": 1.7679, + "step": 14514 + }, + { + "epoch": 4.455187231430325, + "grad_norm": 0.2876584231853485, + "learning_rate": 6.116342636924058e-05, + "loss": 1.8104, + "step": 14515 + }, + { + "epoch": 4.4554941682013505, + "grad_norm": 0.28290677070617676, + "learning_rate": 6.115858122618297e-05, + "loss": 1.7485, + "step": 14516 + }, + { + "epoch": 4.455801104972376, + "grad_norm": 0.21914640069007874, + "learning_rate": 6.115373597284974e-05, + "loss": 1.7736, + "step": 14517 + }, + { + "epoch": 4.456108041743401, + "grad_norm": 0.2603909969329834, + "learning_rate": 6.114889060928873e-05, + "loss": 1.7446, + "step": 14518 + }, + { + "epoch": 4.456414978514426, + "grad_norm": 0.2157236635684967, + "learning_rate": 6.114404513554784e-05, + "loss": 1.7594, + "step": 14519 + }, + { + "epoch": 4.456721915285451, + "grad_norm": 0.27622368931770325, + "learning_rate": 6.113919955167499e-05, + "loss": 1.8154, + "step": 14520 + }, + { + "epoch": 4.457028852056476, + "grad_norm": 0.27298516035079956, + "learning_rate": 6.113435385771803e-05, + "loss": 1.7458, + "step": 14521 + }, + { + "epoch": 4.457335788827502, + "grad_norm": 0.22220586240291595, + 
"learning_rate": 6.112950805372485e-05, + "loss": 1.7102, + "step": 14522 + }, + { + "epoch": 4.457642725598527, + "grad_norm": 0.19480876624584198, + "learning_rate": 6.112466213974336e-05, + "loss": 1.7696, + "step": 14523 + }, + { + "epoch": 4.457949662369552, + "grad_norm": 0.24261653423309326, + "learning_rate": 6.111981611582144e-05, + "loss": 1.8193, + "step": 14524 + }, + { + "epoch": 4.458256599140577, + "grad_norm": 0.2502967417240143, + "learning_rate": 6.111496998200697e-05, + "loss": 1.7701, + "step": 14525 + }, + { + "epoch": 4.458563535911602, + "grad_norm": 0.25764599442481995, + "learning_rate": 6.111012373834786e-05, + "loss": 1.8055, + "step": 14526 + }, + { + "epoch": 4.458870472682627, + "grad_norm": 0.24085427820682526, + "learning_rate": 6.110527738489198e-05, + "loss": 1.7592, + "step": 14527 + }, + { + "epoch": 4.459177409453653, + "grad_norm": 0.2469809502363205, + "learning_rate": 6.110043092168727e-05, + "loss": 1.6977, + "step": 14528 + }, + { + "epoch": 4.459484346224678, + "grad_norm": 0.21888838708400726, + "learning_rate": 6.109558434878159e-05, + "loss": 1.777, + "step": 14529 + }, + { + "epoch": 4.4597912829957025, + "grad_norm": 0.2094014585018158, + "learning_rate": 6.109073766622281e-05, + "loss": 1.7041, + "step": 14530 + }, + { + "epoch": 4.460098219766728, + "grad_norm": 0.23801055550575256, + "learning_rate": 6.108589087405888e-05, + "loss": 1.8392, + "step": 14531 + }, + { + "epoch": 4.460405156537753, + "grad_norm": 0.2164965718984604, + "learning_rate": 6.108104397233769e-05, + "loss": 1.7643, + "step": 14532 + }, + { + "epoch": 4.4607120933087785, + "grad_norm": 0.21322336792945862, + "learning_rate": 6.107619696110712e-05, + "loss": 1.7063, + "step": 14533 + }, + { + "epoch": 4.461019030079804, + "grad_norm": 0.29019200801849365, + "learning_rate": 6.107134984041507e-05, + "loss": 1.8254, + "step": 14534 + }, + { + "epoch": 4.461325966850829, + "grad_norm": 0.2765025496482849, + "learning_rate": 6.106650261030947e-05, 
+ "loss": 1.7609, + "step": 14535 + }, + { + "epoch": 4.461632903621854, + "grad_norm": 0.20879749953746796, + "learning_rate": 6.106165527083818e-05, + "loss": 1.7387, + "step": 14536 + }, + { + "epoch": 4.461939840392879, + "grad_norm": 0.22295843064785004, + "learning_rate": 6.105680782204913e-05, + "loss": 1.7691, + "step": 14537 + }, + { + "epoch": 4.462246777163904, + "grad_norm": 0.23502351343631744, + "learning_rate": 6.105196026399025e-05, + "loss": 1.7335, + "step": 14538 + }, + { + "epoch": 4.46255371393493, + "grad_norm": 0.22143007814884186, + "learning_rate": 6.104711259670941e-05, + "loss": 1.7338, + "step": 14539 + }, + { + "epoch": 4.462860650705955, + "grad_norm": 0.22361041605472565, + "learning_rate": 6.104226482025453e-05, + "loss": 1.7033, + "step": 14540 + }, + { + "epoch": 4.463167587476979, + "grad_norm": 0.27104905247688293, + "learning_rate": 6.10374169346735e-05, + "loss": 1.7926, + "step": 14541 + }, + { + "epoch": 4.463474524248005, + "grad_norm": 0.23564264178276062, + "learning_rate": 6.103256894001427e-05, + "loss": 1.7522, + "step": 14542 + }, + { + "epoch": 4.46378146101903, + "grad_norm": 0.2585970163345337, + "learning_rate": 6.102772083632471e-05, + "loss": 1.7755, + "step": 14543 + }, + { + "epoch": 4.464088397790055, + "grad_norm": 0.358634889125824, + "learning_rate": 6.102287262365276e-05, + "loss": 1.8092, + "step": 14544 + }, + { + "epoch": 4.464395334561081, + "grad_norm": 0.2862946689128876, + "learning_rate": 6.1018024302046314e-05, + "loss": 1.7051, + "step": 14545 + }, + { + "epoch": 4.464702271332105, + "grad_norm": 0.21907158195972443, + "learning_rate": 6.101317587155331e-05, + "loss": 1.7882, + "step": 14546 + }, + { + "epoch": 4.4650092081031305, + "grad_norm": 0.24268488585948944, + "learning_rate": 6.100832733222164e-05, + "loss": 1.7756, + "step": 14547 + }, + { + "epoch": 4.465316144874156, + "grad_norm": 0.2350744605064392, + "learning_rate": 6.1003478684099214e-05, + "loss": 1.7483, + "step": 14548 + }, + 
{ + "epoch": 4.465623081645181, + "grad_norm": 0.22902250289916992, + "learning_rate": 6.099862992723397e-05, + "loss": 1.7687, + "step": 14549 + }, + { + "epoch": 4.4659300184162065, + "grad_norm": 0.23590944707393646, + "learning_rate": 6.099378106167382e-05, + "loss": 1.8481, + "step": 14550 + }, + { + "epoch": 4.466236955187231, + "grad_norm": 0.23644296824932098, + "learning_rate": 6.098893208746668e-05, + "loss": 1.7422, + "step": 14551 + }, + { + "epoch": 4.466543891958256, + "grad_norm": 0.23782360553741455, + "learning_rate": 6.0984083004660475e-05, + "loss": 1.7852, + "step": 14552 + }, + { + "epoch": 4.466850828729282, + "grad_norm": 0.2546575665473938, + "learning_rate": 6.097923381330313e-05, + "loss": 1.8483, + "step": 14553 + }, + { + "epoch": 4.467157765500307, + "grad_norm": 0.2555409371852875, + "learning_rate": 6.097438451344254e-05, + "loss": 1.7887, + "step": 14554 + }, + { + "epoch": 4.467464702271332, + "grad_norm": 0.28074198961257935, + "learning_rate": 6.0969535105126664e-05, + "loss": 1.7521, + "step": 14555 + }, + { + "epoch": 4.467771639042358, + "grad_norm": 0.22622554004192352, + "learning_rate": 6.096468558840341e-05, + "loss": 1.8088, + "step": 14556 + }, + { + "epoch": 4.468078575813382, + "grad_norm": 0.302749902009964, + "learning_rate": 6.095983596332071e-05, + "loss": 1.8192, + "step": 14557 + }, + { + "epoch": 4.468385512584407, + "grad_norm": 0.27925750613212585, + "learning_rate": 6.0954986229926494e-05, + "loss": 1.8453, + "step": 14558 + }, + { + "epoch": 4.468692449355433, + "grad_norm": 0.2246330976486206, + "learning_rate": 6.095013638826868e-05, + "loss": 1.744, + "step": 14559 + }, + { + "epoch": 4.468999386126458, + "grad_norm": 0.26677101850509644, + "learning_rate": 6.094528643839518e-05, + "loss": 1.708, + "step": 14560 + }, + { + "epoch": 4.469306322897483, + "grad_norm": 0.23684042692184448, + "learning_rate": 6.094043638035396e-05, + "loss": 1.713, + "step": 14561 + }, + { + "epoch": 4.469613259668508, + 
"grad_norm": 0.2470075935125351, + "learning_rate": 6.093558621419294e-05, + "loss": 1.8096, + "step": 14562 + }, + { + "epoch": 4.469920196439533, + "grad_norm": 0.2775517702102661, + "learning_rate": 6.093073593996005e-05, + "loss": 1.697, + "step": 14563 + }, + { + "epoch": 4.4702271332105585, + "grad_norm": 0.21053175628185272, + "learning_rate": 6.092588555770322e-05, + "loss": 1.6894, + "step": 14564 + }, + { + "epoch": 4.470534069981584, + "grad_norm": 0.2555869221687317, + "learning_rate": 6.0921035067470366e-05, + "loss": 1.7051, + "step": 14565 + }, + { + "epoch": 4.470841006752609, + "grad_norm": 0.34468984603881836, + "learning_rate": 6.0916184469309454e-05, + "loss": 1.7317, + "step": 14566 + }, + { + "epoch": 4.4711479435236345, + "grad_norm": 0.2517752945423126, + "learning_rate": 6.0911333763268407e-05, + "loss": 1.7524, + "step": 14567 + }, + { + "epoch": 4.471454880294659, + "grad_norm": 0.2749727666378021, + "learning_rate": 6.090648294939517e-05, + "loss": 1.7045, + "step": 14568 + }, + { + "epoch": 4.471761817065684, + "grad_norm": 0.36250773072242737, + "learning_rate": 6.0901632027737673e-05, + "loss": 1.7196, + "step": 14569 + }, + { + "epoch": 4.47206875383671, + "grad_norm": 0.2317698448896408, + "learning_rate": 6.089678099834386e-05, + "loss": 1.7318, + "step": 14570 + }, + { + "epoch": 4.472375690607735, + "grad_norm": 0.2863345444202423, + "learning_rate": 6.089192986126166e-05, + "loss": 1.7798, + "step": 14571 + }, + { + "epoch": 4.47268262737876, + "grad_norm": 0.3493366241455078, + "learning_rate": 6.088707861653904e-05, + "loss": 1.7749, + "step": 14572 + }, + { + "epoch": 4.472989564149785, + "grad_norm": 0.25718605518341064, + "learning_rate": 6.0882227264223924e-05, + "loss": 1.7683, + "step": 14573 + }, + { + "epoch": 4.47329650092081, + "grad_norm": 0.2320062816143036, + "learning_rate": 6.087737580436426e-05, + "loss": 1.8296, + "step": 14574 + }, + { + "epoch": 4.473603437691835, + "grad_norm": 0.29071560502052307, + 
"learning_rate": 6.087252423700799e-05, + "loss": 1.7428, + "step": 14575 + }, + { + "epoch": 4.473910374462861, + "grad_norm": 0.24233707785606384, + "learning_rate": 6.086767256220306e-05, + "loss": 1.7332, + "step": 14576 + }, + { + "epoch": 4.474217311233886, + "grad_norm": 0.228043332695961, + "learning_rate": 6.086282077999742e-05, + "loss": 1.7697, + "step": 14577 + }, + { + "epoch": 4.474524248004911, + "grad_norm": 0.29154402017593384, + "learning_rate": 6.085796889043902e-05, + "loss": 1.8043, + "step": 14578 + }, + { + "epoch": 4.474831184775936, + "grad_norm": 0.30543211102485657, + "learning_rate": 6.0853116893575814e-05, + "loss": 1.7665, + "step": 14579 + }, + { + "epoch": 4.475138121546961, + "grad_norm": 0.22792959213256836, + "learning_rate": 6.0848264789455754e-05, + "loss": 1.729, + "step": 14580 + }, + { + "epoch": 4.475445058317987, + "grad_norm": 0.2615707218647003, + "learning_rate": 6.084341257812677e-05, + "loss": 1.7438, + "step": 14581 + }, + { + "epoch": 4.475751995089012, + "grad_norm": 0.23342981934547424, + "learning_rate": 6.083856025963681e-05, + "loss": 1.7158, + "step": 14582 + }, + { + "epoch": 4.476058931860037, + "grad_norm": 0.22279240190982819, + "learning_rate": 6.083370783403387e-05, + "loss": 1.7413, + "step": 14583 + }, + { + "epoch": 4.476365868631062, + "grad_norm": 0.28867462277412415, + "learning_rate": 6.082885530136587e-05, + "loss": 1.7932, + "step": 14584 + }, + { + "epoch": 4.476672805402087, + "grad_norm": 0.2947152256965637, + "learning_rate": 6.082400266168078e-05, + "loss": 1.8986, + "step": 14585 + }, + { + "epoch": 4.476979742173112, + "grad_norm": 0.2948935627937317, + "learning_rate": 6.0819149915026555e-05, + "loss": 1.9134, + "step": 14586 + }, + { + "epoch": 4.477286678944138, + "grad_norm": 0.4436163902282715, + "learning_rate": 6.081429706145114e-05, + "loss": 1.7616, + "step": 14587 + }, + { + "epoch": 4.477593615715163, + "grad_norm": 0.4879693388938904, + "learning_rate": 6.080944410100249e-05, + 
"loss": 1.8155, + "step": 14588 + }, + { + "epoch": 4.4779005524861875, + "grad_norm": 0.29742667078971863, + "learning_rate": 6.08045910337286e-05, + "loss": 1.7428, + "step": 14589 + }, + { + "epoch": 4.478207489257213, + "grad_norm": 0.2994751036167145, + "learning_rate": 6.0799737859677395e-05, + "loss": 1.7764, + "step": 14590 + }, + { + "epoch": 4.478514426028238, + "grad_norm": 0.46379905939102173, + "learning_rate": 6.079488457889686e-05, + "loss": 1.7289, + "step": 14591 + }, + { + "epoch": 4.4788213627992635, + "grad_norm": 0.3511717617511749, + "learning_rate": 6.0790031191434946e-05, + "loss": 1.7658, + "step": 14592 + }, + { + "epoch": 4.479128299570289, + "grad_norm": 0.22678083181381226, + "learning_rate": 6.0785177697339626e-05, + "loss": 1.7973, + "step": 14593 + }, + { + "epoch": 4.479435236341313, + "grad_norm": 0.31201767921447754, + "learning_rate": 6.0780324096658837e-05, + "loss": 1.7542, + "step": 14594 + }, + { + "epoch": 4.479742173112339, + "grad_norm": 0.23759113252162933, + "learning_rate": 6.077547038944058e-05, + "loss": 1.7191, + "step": 14595 + }, + { + "epoch": 4.480049109883364, + "grad_norm": 0.25801756978034973, + "learning_rate": 6.077061657573282e-05, + "loss": 1.8229, + "step": 14596 + }, + { + "epoch": 4.480356046654389, + "grad_norm": 0.3435722887516022, + "learning_rate": 6.0765762655583514e-05, + "loss": 1.7633, + "step": 14597 + }, + { + "epoch": 4.480662983425415, + "grad_norm": 0.2710443437099457, + "learning_rate": 6.076090862904063e-05, + "loss": 1.8126, + "step": 14598 + }, + { + "epoch": 4.48096992019644, + "grad_norm": 0.25750285387039185, + "learning_rate": 6.075605449615212e-05, + "loss": 1.7382, + "step": 14599 + }, + { + "epoch": 4.481276856967464, + "grad_norm": 0.3638051152229309, + "learning_rate": 6.075120025696598e-05, + "loss": 1.8191, + "step": 14600 + }, + { + "epoch": 4.48158379373849, + "grad_norm": 0.24185293912887573, + "learning_rate": 6.074634591153019e-05, + "loss": 1.7637, + "step": 14601 + }, 
+ { + "epoch": 4.481890730509515, + "grad_norm": 0.317283570766449, + "learning_rate": 6.0741491459892707e-05, + "loss": 1.7805, + "step": 14602 + }, + { + "epoch": 4.48219766728054, + "grad_norm": 0.33884385228157043, + "learning_rate": 6.073663690210151e-05, + "loss": 1.7719, + "step": 14603 + }, + { + "epoch": 4.482504604051566, + "grad_norm": 0.2554258704185486, + "learning_rate": 6.073178223820457e-05, + "loss": 1.836, + "step": 14604 + }, + { + "epoch": 4.48281154082259, + "grad_norm": 0.3363535702228546, + "learning_rate": 6.072692746824987e-05, + "loss": 1.8249, + "step": 14605 + }, + { + "epoch": 4.4831184775936155, + "grad_norm": 0.36090195178985596, + "learning_rate": 6.072207259228537e-05, + "loss": 1.733, + "step": 14606 + }, + { + "epoch": 4.483425414364641, + "grad_norm": 0.21928483247756958, + "learning_rate": 6.071721761035909e-05, + "loss": 1.7413, + "step": 14607 + }, + { + "epoch": 4.483732351135666, + "grad_norm": 0.4256608486175537, + "learning_rate": 6.071236252251897e-05, + "loss": 1.7585, + "step": 14608 + }, + { + "epoch": 4.4840392879066915, + "grad_norm": 0.41980308294296265, + "learning_rate": 6.0707507328813007e-05, + "loss": 1.7584, + "step": 14609 + }, + { + "epoch": 4.484346224677717, + "grad_norm": 0.200295090675354, + "learning_rate": 6.0702652029289186e-05, + "loss": 1.7492, + "step": 14610 + }, + { + "epoch": 4.484653161448741, + "grad_norm": 0.41847771406173706, + "learning_rate": 6.069779662399549e-05, + "loss": 1.8101, + "step": 14611 + }, + { + "epoch": 4.484960098219767, + "grad_norm": 0.4846353530883789, + "learning_rate": 6.069294111297987e-05, + "loss": 1.8227, + "step": 14612 + }, + { + "epoch": 4.485267034990792, + "grad_norm": 0.23216098546981812, + "learning_rate": 6.068808549629036e-05, + "loss": 1.6811, + "step": 14613 + }, + { + "epoch": 4.485573971761817, + "grad_norm": 0.34903186559677124, + "learning_rate": 6.0683229773974934e-05, + "loss": 1.6858, + "step": 14614 + }, + { + "epoch": 4.485880908532843, + 
"grad_norm": 0.4349122941493988, + "learning_rate": 6.0678373946081556e-05, + "loss": 1.7704, + "step": 14615 + }, + { + "epoch": 4.486187845303867, + "grad_norm": 0.25738775730133057, + "learning_rate": 6.067351801265824e-05, + "loss": 1.7487, + "step": 14616 + }, + { + "epoch": 4.486494782074892, + "grad_norm": 0.3052736818790436, + "learning_rate": 6.0668661973752936e-05, + "loss": 1.7528, + "step": 14617 + }, + { + "epoch": 4.486801718845918, + "grad_norm": 0.3400498628616333, + "learning_rate": 6.066380582941368e-05, + "loss": 1.7414, + "step": 14618 + }, + { + "epoch": 4.487108655616943, + "grad_norm": 0.28251948952674866, + "learning_rate": 6.065894957968845e-05, + "loss": 1.8078, + "step": 14619 + }, + { + "epoch": 4.487415592387968, + "grad_norm": 0.26907965540885925, + "learning_rate": 6.0654093224625216e-05, + "loss": 1.8143, + "step": 14620 + }, + { + "epoch": 4.487722529158993, + "grad_norm": 0.2821955978870392, + "learning_rate": 6.064923676427201e-05, + "loss": 1.7163, + "step": 14621 + }, + { + "epoch": 4.488029465930018, + "grad_norm": 0.2223028987646103, + "learning_rate": 6.0644380198676786e-05, + "loss": 1.704, + "step": 14622 + }, + { + "epoch": 4.4883364027010435, + "grad_norm": 0.25243067741394043, + "learning_rate": 6.063952352788755e-05, + "loss": 1.7236, + "step": 14623 + }, + { + "epoch": 4.488643339472069, + "grad_norm": 0.30026015639305115, + "learning_rate": 6.063466675195233e-05, + "loss": 1.7575, + "step": 14624 + }, + { + "epoch": 4.488950276243094, + "grad_norm": 0.2055491805076599, + "learning_rate": 6.0629809870919085e-05, + "loss": 1.7294, + "step": 14625 + }, + { + "epoch": 4.4892572130141195, + "grad_norm": 0.2507593035697937, + "learning_rate": 6.0624952884835836e-05, + "loss": 1.762, + "step": 14626 + }, + { + "epoch": 4.489564149785144, + "grad_norm": 0.21385909616947174, + "learning_rate": 6.0620095793750576e-05, + "loss": 1.7396, + "step": 14627 + }, + { + "epoch": 4.489871086556169, + "grad_norm": 0.21926651895046234, + 
"learning_rate": 6.06152385977113e-05, + "loss": 1.7863, + "step": 14628 + }, + { + "epoch": 4.490178023327195, + "grad_norm": 0.21950845420360565, + "learning_rate": 6.0610381296766016e-05, + "loss": 1.7576, + "step": 14629 + }, + { + "epoch": 4.49048496009822, + "grad_norm": 0.2030971795320511, + "learning_rate": 6.0605523890962736e-05, + "loss": 1.7069, + "step": 14630 + }, + { + "epoch": 4.490791896869245, + "grad_norm": 0.23991432785987854, + "learning_rate": 6.0600666380349436e-05, + "loss": 1.7598, + "step": 14631 + }, + { + "epoch": 4.49109883364027, + "grad_norm": 0.23766861855983734, + "learning_rate": 6.059580876497415e-05, + "loss": 1.7687, + "step": 14632 + }, + { + "epoch": 4.491405770411295, + "grad_norm": 0.2361454963684082, + "learning_rate": 6.059095104488487e-05, + "loss": 1.7883, + "step": 14633 + }, + { + "epoch": 4.49171270718232, + "grad_norm": 0.3128328323364258, + "learning_rate": 6.058609322012958e-05, + "loss": 1.8087, + "step": 14634 + }, + { + "epoch": 4.492019643953346, + "grad_norm": 0.2958957850933075, + "learning_rate": 6.0581235290756335e-05, + "loss": 1.782, + "step": 14635 + }, + { + "epoch": 4.492326580724371, + "grad_norm": 0.2197243571281433, + "learning_rate": 6.057637725681312e-05, + "loss": 1.7408, + "step": 14636 + }, + { + "epoch": 4.4926335174953955, + "grad_norm": 0.22227831184864044, + "learning_rate": 6.0571519118347944e-05, + "loss": 1.734, + "step": 14637 + }, + { + "epoch": 4.492940454266421, + "grad_norm": 0.2784527540206909, + "learning_rate": 6.056666087540882e-05, + "loss": 1.8017, + "step": 14638 + }, + { + "epoch": 4.493247391037446, + "grad_norm": 0.21929821372032166, + "learning_rate": 6.056180252804377e-05, + "loss": 1.7271, + "step": 14639 + }, + { + "epoch": 4.4935543278084715, + "grad_norm": 0.2156134843826294, + "learning_rate": 6.055694407630077e-05, + "loss": 1.8082, + "step": 14640 + }, + { + "epoch": 4.493861264579497, + "grad_norm": 0.22672387957572937, + "learning_rate": 6.0552085520227875e-05, + 
"loss": 1.7506, + "step": 14641 + }, + { + "epoch": 4.494168201350522, + "grad_norm": 0.228785440325737, + "learning_rate": 6.0547226859873086e-05, + "loss": 1.7023, + "step": 14642 + }, + { + "epoch": 4.494475138121547, + "grad_norm": 0.19483685493469238, + "learning_rate": 6.054236809528443e-05, + "loss": 1.6879, + "step": 14643 + }, + { + "epoch": 4.494782074892572, + "grad_norm": 0.24911309778690338, + "learning_rate": 6.0537509226509904e-05, + "loss": 1.7856, + "step": 14644 + }, + { + "epoch": 4.495089011663597, + "grad_norm": 0.24811938405036926, + "learning_rate": 6.053265025359753e-05, + "loss": 1.7581, + "step": 14645 + }, + { + "epoch": 4.495395948434623, + "grad_norm": 0.2487260401248932, + "learning_rate": 6.052779117659534e-05, + "loss": 1.7536, + "step": 14646 + }, + { + "epoch": 4.495702885205648, + "grad_norm": 0.2594854235649109, + "learning_rate": 6.052293199555136e-05, + "loss": 1.7822, + "step": 14647 + }, + { + "epoch": 4.496009821976672, + "grad_norm": 0.22837325930595398, + "learning_rate": 6.051807271051359e-05, + "loss": 1.7542, + "step": 14648 + }, + { + "epoch": 4.496316758747698, + "grad_norm": 0.23106649518013, + "learning_rate": 6.051321332153005e-05, + "loss": 1.7758, + "step": 14649 + }, + { + "epoch": 4.496623695518723, + "grad_norm": 0.29424673318862915, + "learning_rate": 6.050835382864878e-05, + "loss": 1.8335, + "step": 14650 + }, + { + "epoch": 4.496930632289748, + "grad_norm": 0.28297343850135803, + "learning_rate": 6.050349423191779e-05, + "loss": 1.7711, + "step": 14651 + }, + { + "epoch": 4.497237569060774, + "grad_norm": 0.2001795768737793, + "learning_rate": 6.049863453138511e-05, + "loss": 1.7008, + "step": 14652 + }, + { + "epoch": 4.497544505831799, + "grad_norm": 0.35177022218704224, + "learning_rate": 6.04937747270988e-05, + "loss": 1.7763, + "step": 14653 + }, + { + "epoch": 4.4978514426028235, + "grad_norm": 0.28870898485183716, + "learning_rate": 6.0488914819106835e-05, + "loss": 1.7373, + "step": 14654 + }, + { 
+ "epoch": 4.498158379373849, + "grad_norm": 0.23962664604187012, + "learning_rate": 6.048405480745727e-05, + "loss": 1.7278, + "step": 14655 + }, + { + "epoch": 4.498465316144874, + "grad_norm": 0.324505478143692, + "learning_rate": 6.047919469219813e-05, + "loss": 1.7674, + "step": 14656 + }, + { + "epoch": 4.4987722529158995, + "grad_norm": 0.38313817977905273, + "learning_rate": 6.047433447337744e-05, + "loss": 1.789, + "step": 14657 + }, + { + "epoch": 4.499079189686925, + "grad_norm": 0.2101358324289322, + "learning_rate": 6.046947415104324e-05, + "loss": 1.7331, + "step": 14658 + }, + { + "epoch": 4.499386126457949, + "grad_norm": 0.3388524353504181, + "learning_rate": 6.046461372524357e-05, + "loss": 1.8467, + "step": 14659 + }, + { + "epoch": 4.499693063228975, + "grad_norm": 0.3360123634338379, + "learning_rate": 6.045975319602645e-05, + "loss": 1.8427, + "step": 14660 + }, + { + "epoch": 4.5, + "grad_norm": 0.27596545219421387, + "learning_rate": 6.0454892563439914e-05, + "loss": 1.7768, + "step": 14661 + }, + { + "epoch": 4.500306936771025, + "grad_norm": 0.2580861747264862, + "learning_rate": 6.0450031827532e-05, + "loss": 1.763, + "step": 14662 + }, + { + "epoch": 4.500613873542051, + "grad_norm": 0.3521091938018799, + "learning_rate": 6.044517098835074e-05, + "loss": 1.7118, + "step": 14663 + }, + { + "epoch": 4.500920810313076, + "grad_norm": 0.29412439465522766, + "learning_rate": 6.0440310045944204e-05, + "loss": 1.7252, + "step": 14664 + }, + { + "epoch": 4.5012277470841005, + "grad_norm": 0.23845252394676208, + "learning_rate": 6.043544900036039e-05, + "loss": 1.7622, + "step": 14665 + }, + { + "epoch": 4.501534683855126, + "grad_norm": 0.22957031428813934, + "learning_rate": 6.043058785164736e-05, + "loss": 1.7527, + "step": 14666 + }, + { + "epoch": 4.501841620626151, + "grad_norm": 0.2564462721347809, + "learning_rate": 6.042572659985314e-05, + "loss": 1.801, + "step": 14667 + }, + { + "epoch": 4.5021485573971765, + "grad_norm": 
0.22588051855564117, + "learning_rate": 6.042086524502576e-05, + "loss": 1.7387, + "step": 14668 + }, + { + "epoch": 4.502455494168201, + "grad_norm": 0.2609740197658539, + "learning_rate": 6.0416003787213306e-05, + "loss": 1.7615, + "step": 14669 + }, + { + "epoch": 4.502762430939226, + "grad_norm": 0.2535521984100342, + "learning_rate": 6.041114222646379e-05, + "loss": 1.7398, + "step": 14670 + }, + { + "epoch": 4.503069367710252, + "grad_norm": 0.2512127757072449, + "learning_rate": 6.040628056282527e-05, + "loss": 1.7679, + "step": 14671 + }, + { + "epoch": 4.503376304481277, + "grad_norm": 0.2438639998435974, + "learning_rate": 6.0401418796345774e-05, + "loss": 1.7, + "step": 14672 + }, + { + "epoch": 4.503683241252302, + "grad_norm": 0.23428042232990265, + "learning_rate": 6.0396556927073376e-05, + "loss": 1.7748, + "step": 14673 + }, + { + "epoch": 4.503990178023328, + "grad_norm": 0.22894345223903656, + "learning_rate": 6.03916949550561e-05, + "loss": 1.7881, + "step": 14674 + }, + { + "epoch": 4.504297114794352, + "grad_norm": 0.24813716113567352, + "learning_rate": 6.0386832880342006e-05, + "loss": 1.7676, + "step": 14675 + }, + { + "epoch": 4.504604051565377, + "grad_norm": 0.23448842763900757, + "learning_rate": 6.038197070297914e-05, + "loss": 1.7828, + "step": 14676 + }, + { + "epoch": 4.504910988336403, + "grad_norm": 0.25302332639694214, + "learning_rate": 6.037710842301556e-05, + "loss": 1.8061, + "step": 14677 + }, + { + "epoch": 4.505217925107428, + "grad_norm": 0.2411813735961914, + "learning_rate": 6.0372246040499305e-05, + "loss": 1.6901, + "step": 14678 + }, + { + "epoch": 4.505524861878453, + "grad_norm": 0.3154819905757904, + "learning_rate": 6.036738355547844e-05, + "loss": 1.7472, + "step": 14679 + }, + { + "epoch": 4.505831798649478, + "grad_norm": 0.2935639023780823, + "learning_rate": 6.0362520968001014e-05, + "loss": 1.7508, + "step": 14680 + }, + { + "epoch": 4.506138735420503, + "grad_norm": 0.27064070105552673, + "learning_rate": 
6.035765827811508e-05, + "loss": 1.8133, + "step": 14681 + }, + { + "epoch": 4.5064456721915285, + "grad_norm": 0.23748525977134705, + "learning_rate": 6.03527954858687e-05, + "loss": 1.7742, + "step": 14682 + }, + { + "epoch": 4.506752608962554, + "grad_norm": 0.216410830616951, + "learning_rate": 6.034793259130992e-05, + "loss": 1.7448, + "step": 14683 + }, + { + "epoch": 4.507059545733579, + "grad_norm": 0.23339977860450745, + "learning_rate": 6.034306959448681e-05, + "loss": 1.7437, + "step": 14684 + }, + { + "epoch": 4.5073664825046045, + "grad_norm": 0.23951120674610138, + "learning_rate": 6.0338206495447414e-05, + "loss": 1.7535, + "step": 14685 + }, + { + "epoch": 4.507673419275629, + "grad_norm": 0.22137518227100372, + "learning_rate": 6.0333343294239816e-05, + "loss": 1.7537, + "step": 14686 + }, + { + "epoch": 4.507980356046654, + "grad_norm": 0.2550075054168701, + "learning_rate": 6.032847999091206e-05, + "loss": 1.8069, + "step": 14687 + }, + { + "epoch": 4.50828729281768, + "grad_norm": 0.2166420966386795, + "learning_rate": 6.032361658551221e-05, + "loss": 1.7746, + "step": 14688 + }, + { + "epoch": 4.508594229588705, + "grad_norm": 0.21926096081733704, + "learning_rate": 6.031875307808833e-05, + "loss": 1.7848, + "step": 14689 + }, + { + "epoch": 4.50890116635973, + "grad_norm": 0.27769652009010315, + "learning_rate": 6.031388946868848e-05, + "loss": 1.7563, + "step": 14690 + }, + { + "epoch": 4.509208103130755, + "grad_norm": 0.23417410254478455, + "learning_rate": 6.030902575736074e-05, + "loss": 1.7475, + "step": 14691 + }, + { + "epoch": 4.50951503990178, + "grad_norm": 0.25454118847846985, + "learning_rate": 6.030416194415314e-05, + "loss": 1.7416, + "step": 14692 + }, + { + "epoch": 4.509821976672805, + "grad_norm": 0.3118220567703247, + "learning_rate": 6.029929802911379e-05, + "loss": 1.8001, + "step": 14693 + }, + { + "epoch": 4.510128913443831, + "grad_norm": 0.2338017225265503, + "learning_rate": 6.029443401229075e-05, + "loss": 1.7243, + 
"step": 14694 + }, + { + "epoch": 4.510435850214856, + "grad_norm": 0.2490454763174057, + "learning_rate": 6.028956989373207e-05, + "loss": 1.7866, + "step": 14695 + }, + { + "epoch": 4.510742786985881, + "grad_norm": 0.2579275369644165, + "learning_rate": 6.028470567348582e-05, + "loss": 1.7594, + "step": 14696 + }, + { + "epoch": 4.511049723756906, + "grad_norm": 0.23982174694538116, + "learning_rate": 6.0279841351600094e-05, + "loss": 1.7444, + "step": 14697 + }, + { + "epoch": 4.511356660527931, + "grad_norm": 0.2160159945487976, + "learning_rate": 6.027497692812295e-05, + "loss": 1.7002, + "step": 14698 + }, + { + "epoch": 4.5116635972989565, + "grad_norm": 0.24604511260986328, + "learning_rate": 6.0270112403102455e-05, + "loss": 1.7654, + "step": 14699 + }, + { + "epoch": 4.511970534069982, + "grad_norm": 0.21978263556957245, + "learning_rate": 6.026524777658669e-05, + "loss": 1.7278, + "step": 14700 + }, + { + "epoch": 4.512277470841006, + "grad_norm": 0.2814212441444397, + "learning_rate": 6.026038304862373e-05, + "loss": 1.7743, + "step": 14701 + }, + { + "epoch": 4.512584407612032, + "grad_norm": 0.23798944056034088, + "learning_rate": 6.025551821926165e-05, + "loss": 1.7348, + "step": 14702 + }, + { + "epoch": 4.512891344383057, + "grad_norm": 0.22415988147258759, + "learning_rate": 6.025065328854853e-05, + "loss": 1.7973, + "step": 14703 + }, + { + "epoch": 4.513198281154082, + "grad_norm": 0.34614792466163635, + "learning_rate": 6.0245788256532445e-05, + "loss": 1.7263, + "step": 14704 + }, + { + "epoch": 4.513505217925108, + "grad_norm": 0.333918958902359, + "learning_rate": 6.0240923123261485e-05, + "loss": 1.7305, + "step": 14705 + }, + { + "epoch": 4.513812154696133, + "grad_norm": 0.22231793403625488, + "learning_rate": 6.02360578887837e-05, + "loss": 1.806, + "step": 14706 + }, + { + "epoch": 4.514119091467157, + "grad_norm": 0.23323194682598114, + "learning_rate": 6.023119255314721e-05, + "loss": 1.7076, + "step": 14707 + }, + { + "epoch": 
4.514426028238183, + "grad_norm": 0.26695477962493896, + "learning_rate": 6.022632711640007e-05, + "loss": 1.775, + "step": 14708 + }, + { + "epoch": 4.514732965009208, + "grad_norm": 0.21446476876735687, + "learning_rate": 6.0221461578590364e-05, + "loss": 1.7524, + "step": 14709 + }, + { + "epoch": 4.515039901780233, + "grad_norm": 0.2677358090877533, + "learning_rate": 6.0216595939766204e-05, + "loss": 1.7513, + "step": 14710 + }, + { + "epoch": 4.515346838551259, + "grad_norm": 0.28648239374160767, + "learning_rate": 6.021173019997565e-05, + "loss": 1.7249, + "step": 14711 + }, + { + "epoch": 4.515653775322283, + "grad_norm": 0.2178548276424408, + "learning_rate": 6.020686435926678e-05, + "loss": 1.7502, + "step": 14712 + }, + { + "epoch": 4.5159607120933085, + "grad_norm": 0.3391740024089813, + "learning_rate": 6.02019984176877e-05, + "loss": 1.6828, + "step": 14713 + }, + { + "epoch": 4.516267648864334, + "grad_norm": 0.25222229957580566, + "learning_rate": 6.01971323752865e-05, + "loss": 1.6982, + "step": 14714 + }, + { + "epoch": 4.516574585635359, + "grad_norm": 0.28776636719703674, + "learning_rate": 6.019226623211125e-05, + "loss": 1.8595, + "step": 14715 + }, + { + "epoch": 4.5168815224063845, + "grad_norm": 0.3240084648132324, + "learning_rate": 6.018739998821006e-05, + "loss": 1.7461, + "step": 14716 + }, + { + "epoch": 4.51718845917741, + "grad_norm": 0.26735052466392517, + "learning_rate": 6.0182533643631015e-05, + "loss": 1.7955, + "step": 14717 + }, + { + "epoch": 4.517495395948434, + "grad_norm": 0.24573692679405212, + "learning_rate": 6.017766719842219e-05, + "loss": 1.7441, + "step": 14718 + }, + { + "epoch": 4.51780233271946, + "grad_norm": 0.27401313185691833, + "learning_rate": 6.01728006526317e-05, + "loss": 1.7399, + "step": 14719 + }, + { + "epoch": 4.518109269490485, + "grad_norm": 0.23578806221485138, + "learning_rate": 6.016793400630763e-05, + "loss": 1.7936, + "step": 14720 + }, + { + "epoch": 4.51841620626151, + "grad_norm": 
0.27763426303863525, + "learning_rate": 6.0163067259498074e-05, + "loss": 1.7263, + "step": 14721 + }, + { + "epoch": 4.518723143032536, + "grad_norm": 0.27102044224739075, + "learning_rate": 6.015820041225113e-05, + "loss": 1.7085, + "step": 14722 + }, + { + "epoch": 4.51903007980356, + "grad_norm": 0.2046152651309967, + "learning_rate": 6.01533334646149e-05, + "loss": 1.7602, + "step": 14723 + }, + { + "epoch": 4.519337016574585, + "grad_norm": 0.2645253837108612, + "learning_rate": 6.0148466416637484e-05, + "loss": 1.7729, + "step": 14724 + }, + { + "epoch": 4.519643953345611, + "grad_norm": 0.27467650175094604, + "learning_rate": 6.014359926836697e-05, + "loss": 1.7834, + "step": 14725 + }, + { + "epoch": 4.519950890116636, + "grad_norm": 0.30357635021209717, + "learning_rate": 6.013873201985145e-05, + "loss": 1.8685, + "step": 14726 + }, + { + "epoch": 4.520257826887661, + "grad_norm": 0.22923336923122406, + "learning_rate": 6.013386467113905e-05, + "loss": 1.7531, + "step": 14727 + }, + { + "epoch": 4.520564763658687, + "grad_norm": 0.2792156934738159, + "learning_rate": 6.012899722227786e-05, + "loss": 1.7927, + "step": 14728 + }, + { + "epoch": 4.520871700429711, + "grad_norm": 0.286161869764328, + "learning_rate": 6.012412967331598e-05, + "loss": 1.77, + "step": 14729 + }, + { + "epoch": 4.5211786372007365, + "grad_norm": 0.23964659869670868, + "learning_rate": 6.011926202430151e-05, + "loss": 1.7873, + "step": 14730 + }, + { + "epoch": 4.521485573971762, + "grad_norm": 0.2250162959098816, + "learning_rate": 6.011439427528258e-05, + "loss": 1.741, + "step": 14731 + }, + { + "epoch": 4.521792510742787, + "grad_norm": 0.2797175347805023, + "learning_rate": 6.010952642630726e-05, + "loss": 1.7482, + "step": 14732 + }, + { + "epoch": 4.5220994475138125, + "grad_norm": 0.22159560024738312, + "learning_rate": 6.010465847742368e-05, + "loss": 1.7591, + "step": 14733 + }, + { + "epoch": 4.522406384284837, + "grad_norm": 0.26638463139533997, + "learning_rate": 
6.009979042867995e-05, + "loss": 1.8564, + "step": 14734 + }, + { + "epoch": 4.522713321055862, + "grad_norm": 0.2972821891307831, + "learning_rate": 6.009492228012416e-05, + "loss": 1.7569, + "step": 14735 + }, + { + "epoch": 4.523020257826888, + "grad_norm": 0.28108885884284973, + "learning_rate": 6.0090054031804444e-05, + "loss": 1.7256, + "step": 14736 + }, + { + "epoch": 4.523327194597913, + "grad_norm": 0.22359851002693176, + "learning_rate": 6.008518568376888e-05, + "loss": 1.7342, + "step": 14737 + }, + { + "epoch": 4.523634131368938, + "grad_norm": 0.2620728015899658, + "learning_rate": 6.008031723606562e-05, + "loss": 1.7703, + "step": 14738 + }, + { + "epoch": 4.523941068139964, + "grad_norm": 0.2641485333442688, + "learning_rate": 6.007544868874274e-05, + "loss": 1.6944, + "step": 14739 + }, + { + "epoch": 4.524248004910988, + "grad_norm": 0.24957752227783203, + "learning_rate": 6.007058004184839e-05, + "loss": 1.7746, + "step": 14740 + }, + { + "epoch": 4.524554941682013, + "grad_norm": 0.29830998182296753, + "learning_rate": 6.006571129543065e-05, + "loss": 1.7718, + "step": 14741 + }, + { + "epoch": 4.524861878453039, + "grad_norm": 0.32740798592567444, + "learning_rate": 6.006084244953766e-05, + "loss": 1.8194, + "step": 14742 + }, + { + "epoch": 4.525168815224064, + "grad_norm": 0.2614956796169281, + "learning_rate": 6.005597350421751e-05, + "loss": 1.7078, + "step": 14743 + }, + { + "epoch": 4.525475751995089, + "grad_norm": 0.23940515518188477, + "learning_rate": 6.005110445951836e-05, + "loss": 1.7488, + "step": 14744 + }, + { + "epoch": 4.525782688766114, + "grad_norm": 0.25485914945602417, + "learning_rate": 6.004623531548829e-05, + "loss": 1.7705, + "step": 14745 + }, + { + "epoch": 4.526089625537139, + "grad_norm": 0.213532954454422, + "learning_rate": 6.0041366072175445e-05, + "loss": 1.7501, + "step": 14746 + }, + { + "epoch": 4.526396562308165, + "grad_norm": 0.2420104295015335, + "learning_rate": 6.003649672962792e-05, + "loss": 1.717, + 
"step": 14747 + }, + { + "epoch": 4.52670349907919, + "grad_norm": 0.26179102063179016, + "learning_rate": 6.0031627287893865e-05, + "loss": 1.7665, + "step": 14748 + }, + { + "epoch": 4.527010435850215, + "grad_norm": 0.22032082080841064, + "learning_rate": 6.002675774702139e-05, + "loss": 1.7555, + "step": 14749 + }, + { + "epoch": 4.52731737262124, + "grad_norm": 0.23915240168571472, + "learning_rate": 6.002188810705861e-05, + "loss": 1.8219, + "step": 14750 + }, + { + "epoch": 4.527624309392265, + "grad_norm": 0.2275150567293167, + "learning_rate": 6.0017018368053665e-05, + "loss": 1.7418, + "step": 14751 + }, + { + "epoch": 4.52793124616329, + "grad_norm": 0.2349669486284256, + "learning_rate": 6.001214853005467e-05, + "loss": 1.7814, + "step": 14752 + }, + { + "epoch": 4.528238182934316, + "grad_norm": 0.29985731840133667, + "learning_rate": 6.000727859310975e-05, + "loss": 1.7109, + "step": 14753 + }, + { + "epoch": 4.528545119705341, + "grad_norm": 0.27282044291496277, + "learning_rate": 6.0002408557267044e-05, + "loss": 1.7806, + "step": 14754 + }, + { + "epoch": 4.5288520564763655, + "grad_norm": 0.20906320214271545, + "learning_rate": 5.9997538422574675e-05, + "loss": 1.7221, + "step": 14755 + }, + { + "epoch": 4.529158993247391, + "grad_norm": 0.24553455412387848, + "learning_rate": 5.999266818908076e-05, + "loss": 1.793, + "step": 14756 + }, + { + "epoch": 4.529465930018416, + "grad_norm": 0.29730647802352905, + "learning_rate": 5.998779785683345e-05, + "loss": 1.7597, + "step": 14757 + }, + { + "epoch": 4.5297728667894415, + "grad_norm": 0.28297582268714905, + "learning_rate": 5.998292742588087e-05, + "loss": 1.7459, + "step": 14758 + }, + { + "epoch": 4.530079803560467, + "grad_norm": 0.21853844821453094, + "learning_rate": 5.997805689627115e-05, + "loss": 1.7234, + "step": 14759 + }, + { + "epoch": 4.530386740331492, + "grad_norm": 0.2997361421585083, + "learning_rate": 5.997318626805242e-05, + "loss": 1.7294, + "step": 14760 + }, + { + "epoch": 
4.530693677102517, + "grad_norm": 0.3298671543598175, + "learning_rate": 5.9968315541272804e-05, + "loss": 1.7837, + "step": 14761 + }, + { + "epoch": 4.531000613873542, + "grad_norm": 0.22812490165233612, + "learning_rate": 5.996344471598047e-05, + "loss": 1.7509, + "step": 14762 + }, + { + "epoch": 4.531307550644567, + "grad_norm": 0.3179669678211212, + "learning_rate": 5.995857379222354e-05, + "loss": 1.8354, + "step": 14763 + }, + { + "epoch": 4.531614487415593, + "grad_norm": 0.3072827458381653, + "learning_rate": 5.9953702770050135e-05, + "loss": 1.8051, + "step": 14764 + }, + { + "epoch": 4.531921424186618, + "grad_norm": 0.19386722147464752, + "learning_rate": 5.994883164950841e-05, + "loss": 1.7093, + "step": 14765 + }, + { + "epoch": 4.532228360957642, + "grad_norm": 0.2380950152873993, + "learning_rate": 5.99439604306465e-05, + "loss": 1.7547, + "step": 14766 + }, + { + "epoch": 4.532535297728668, + "grad_norm": 0.32604947686195374, + "learning_rate": 5.993908911351254e-05, + "loss": 1.8708, + "step": 14767 + }, + { + "epoch": 4.532842234499693, + "grad_norm": 0.2436954528093338, + "learning_rate": 5.993421769815468e-05, + "loss": 1.7272, + "step": 14768 + }, + { + "epoch": 4.533149171270718, + "grad_norm": 0.2470337301492691, + "learning_rate": 5.992934618462105e-05, + "loss": 1.7242, + "step": 14769 + }, + { + "epoch": 4.533456108041744, + "grad_norm": 0.25720325112342834, + "learning_rate": 5.992447457295981e-05, + "loss": 1.7219, + "step": 14770 + }, + { + "epoch": 4.533763044812769, + "grad_norm": 0.2518918812274933, + "learning_rate": 5.991960286321909e-05, + "loss": 1.7916, + "step": 14771 + }, + { + "epoch": 4.5340699815837935, + "grad_norm": 0.2561487853527069, + "learning_rate": 5.9914731055447037e-05, + "loss": 1.7695, + "step": 14772 + }, + { + "epoch": 4.534376918354819, + "grad_norm": 0.25361356139183044, + "learning_rate": 5.9909859149691804e-05, + "loss": 1.7464, + "step": 14773 + }, + { + "epoch": 4.534683855125844, + "grad_norm": 
0.22827522456645966, + "learning_rate": 5.9904987146001545e-05, + "loss": 1.7288, + "step": 14774 + }, + { + "epoch": 4.5349907918968695, + "grad_norm": 0.2417261302471161, + "learning_rate": 5.9900115044424385e-05, + "loss": 1.7311, + "step": 14775 + }, + { + "epoch": 4.535297728667894, + "grad_norm": 0.20756755769252777, + "learning_rate": 5.9895242845008495e-05, + "loss": 1.7799, + "step": 14776 + }, + { + "epoch": 4.535604665438919, + "grad_norm": 0.21999207139015198, + "learning_rate": 5.989037054780201e-05, + "loss": 1.7782, + "step": 14777 + }, + { + "epoch": 4.535911602209945, + "grad_norm": 0.22863444685935974, + "learning_rate": 5.988549815285308e-05, + "loss": 1.7869, + "step": 14778 + }, + { + "epoch": 4.53621853898097, + "grad_norm": 0.23033374547958374, + "learning_rate": 5.988062566020987e-05, + "loss": 1.7328, + "step": 14779 + }, + { + "epoch": 4.536525475751995, + "grad_norm": 0.21903404593467712, + "learning_rate": 5.987575306992053e-05, + "loss": 1.7689, + "step": 14780 + }, + { + "epoch": 4.536832412523021, + "grad_norm": 0.2433948963880539, + "learning_rate": 5.98708803820332e-05, + "loss": 1.7647, + "step": 14781 + }, + { + "epoch": 4.537139349294045, + "grad_norm": 0.2564239799976349, + "learning_rate": 5.986600759659606e-05, + "loss": 1.7958, + "step": 14782 + }, + { + "epoch": 4.53744628606507, + "grad_norm": 0.24009190499782562, + "learning_rate": 5.9861134713657244e-05, + "loss": 1.7511, + "step": 14783 + }, + { + "epoch": 4.537753222836096, + "grad_norm": 0.2578975558280945, + "learning_rate": 5.985626173326491e-05, + "loss": 1.8285, + "step": 14784 + }, + { + "epoch": 4.538060159607121, + "grad_norm": 0.24334335327148438, + "learning_rate": 5.9851388655467225e-05, + "loss": 1.7391, + "step": 14785 + }, + { + "epoch": 4.538367096378146, + "grad_norm": 0.26446983218193054, + "learning_rate": 5.9846515480312335e-05, + "loss": 1.8232, + "step": 14786 + }, + { + "epoch": 4.538674033149171, + "grad_norm": 0.3125670850276947, + 
"learning_rate": 5.9841642207848415e-05, + "loss": 1.7202, + "step": 14787 + }, + { + "epoch": 4.538980969920196, + "grad_norm": 0.2524511218070984, + "learning_rate": 5.983676883812361e-05, + "loss": 1.7653, + "step": 14788 + }, + { + "epoch": 4.5392879066912215, + "grad_norm": 0.3693946897983551, + "learning_rate": 5.98318953711861e-05, + "loss": 1.7457, + "step": 14789 + }, + { + "epoch": 4.539594843462247, + "grad_norm": 0.32625386118888855, + "learning_rate": 5.9827021807084026e-05, + "loss": 1.784, + "step": 14790 + }, + { + "epoch": 4.539901780233272, + "grad_norm": 0.24243168532848358, + "learning_rate": 5.9822148145865574e-05, + "loss": 1.7651, + "step": 14791 + }, + { + "epoch": 4.5402087170042975, + "grad_norm": 0.2950129210948944, + "learning_rate": 5.9817274387578895e-05, + "loss": 1.7316, + "step": 14792 + }, + { + "epoch": 4.540515653775322, + "grad_norm": 0.29455235600471497, + "learning_rate": 5.981240053227216e-05, + "loss": 1.7504, + "step": 14793 + }, + { + "epoch": 4.540822590546347, + "grad_norm": 0.23161925375461578, + "learning_rate": 5.980752657999352e-05, + "loss": 1.7663, + "step": 14794 + }, + { + "epoch": 4.541129527317373, + "grad_norm": 0.2725144922733307, + "learning_rate": 5.980265253079116e-05, + "loss": 1.765, + "step": 14795 + }, + { + "epoch": 4.541436464088398, + "grad_norm": 0.30911222100257874, + "learning_rate": 5.979777838471324e-05, + "loss": 1.7888, + "step": 14796 + }, + { + "epoch": 4.541743400859423, + "grad_norm": 0.2818063497543335, + "learning_rate": 5.979290414180794e-05, + "loss": 1.8047, + "step": 14797 + }, + { + "epoch": 4.542050337630448, + "grad_norm": 0.23335030674934387, + "learning_rate": 5.978802980212341e-05, + "loss": 1.8205, + "step": 14798 + }, + { + "epoch": 4.542357274401473, + "grad_norm": 0.24228201806545258, + "learning_rate": 5.9783155365707855e-05, + "loss": 1.7774, + "step": 14799 + }, + { + "epoch": 4.542664211172498, + "grad_norm": 0.2410847544670105, + "learning_rate": 5.97782808326094e-05, 
+ "loss": 1.6959, + "step": 14800 + }, + { + "epoch": 4.542971147943524, + "grad_norm": 0.24812567234039307, + "learning_rate": 5.9773406202876245e-05, + "loss": 1.8158, + "step": 14801 + }, + { + "epoch": 4.543278084714549, + "grad_norm": 0.2606147229671478, + "learning_rate": 5.9768531476556566e-05, + "loss": 1.7478, + "step": 14802 + }, + { + "epoch": 4.543585021485574, + "grad_norm": 0.24853013455867767, + "learning_rate": 5.976365665369854e-05, + "loss": 1.8158, + "step": 14803 + }, + { + "epoch": 4.543891958256599, + "grad_norm": 0.2320917695760727, + "learning_rate": 5.9758781734350334e-05, + "loss": 1.7812, + "step": 14804 + }, + { + "epoch": 4.544198895027624, + "grad_norm": 0.3460223376750946, + "learning_rate": 5.9753906718560127e-05, + "loss": 1.7562, + "step": 14805 + }, + { + "epoch": 4.5445058317986495, + "grad_norm": 0.2941136658191681, + "learning_rate": 5.9749031606376086e-05, + "loss": 1.7562, + "step": 14806 + }, + { + "epoch": 4.544812768569675, + "grad_norm": 0.2371312975883484, + "learning_rate": 5.9744156397846404e-05, + "loss": 1.7793, + "step": 14807 + }, + { + "epoch": 4.5451197053407, + "grad_norm": 0.2885094881057739, + "learning_rate": 5.973928109301926e-05, + "loss": 1.7564, + "step": 14808 + }, + { + "epoch": 4.545426642111725, + "grad_norm": 0.2369023859500885, + "learning_rate": 5.973440569194284e-05, + "loss": 1.7862, + "step": 14809 + }, + { + "epoch": 4.54573357888275, + "grad_norm": 0.26628994941711426, + "learning_rate": 5.972953019466531e-05, + "loss": 1.7828, + "step": 14810 + }, + { + "epoch": 4.546040515653775, + "grad_norm": 0.3091031610965729, + "learning_rate": 5.9724654601234864e-05, + "loss": 1.7623, + "step": 14811 + }, + { + "epoch": 4.546347452424801, + "grad_norm": 0.24652205407619476, + "learning_rate": 5.971977891169966e-05, + "loss": 1.6982, + "step": 14812 + }, + { + "epoch": 4.546654389195826, + "grad_norm": 0.21779046952724457, + "learning_rate": 5.971490312610793e-05, + "loss": 1.7363, + "step": 14813 + }, 
+ { + "epoch": 4.546961325966851, + "grad_norm": 0.24130751192569733, + "learning_rate": 5.971002724450783e-05, + "loss": 1.7014, + "step": 14814 + }, + { + "epoch": 4.547268262737876, + "grad_norm": 0.21868734061717987, + "learning_rate": 5.9705151266947534e-05, + "loss": 1.7872, + "step": 14815 + }, + { + "epoch": 4.547575199508901, + "grad_norm": 0.257376492023468, + "learning_rate": 5.9700275193475275e-05, + "loss": 1.75, + "step": 14816 + }, + { + "epoch": 4.547882136279926, + "grad_norm": 0.3182791769504547, + "learning_rate": 5.9695399024139174e-05, + "loss": 1.7965, + "step": 14817 + }, + { + "epoch": 4.548189073050952, + "grad_norm": 0.25553280115127563, + "learning_rate": 5.969052275898748e-05, + "loss": 1.8394, + "step": 14818 + }, + { + "epoch": 4.548496009821976, + "grad_norm": 0.2810833752155304, + "learning_rate": 5.9685646398068354e-05, + "loss": 1.704, + "step": 14819 + }, + { + "epoch": 4.5488029465930016, + "grad_norm": 0.21320512890815735, + "learning_rate": 5.9680769941429993e-05, + "loss": 1.7248, + "step": 14820 + }, + { + "epoch": 4.549109883364027, + "grad_norm": 0.3159593939781189, + "learning_rate": 5.96758933891206e-05, + "loss": 1.7885, + "step": 14821 + }, + { + "epoch": 4.549416820135052, + "grad_norm": 0.21894599497318268, + "learning_rate": 5.967101674118834e-05, + "loss": 1.7388, + "step": 14822 + }, + { + "epoch": 4.5497237569060776, + "grad_norm": 0.24804852902889252, + "learning_rate": 5.9666139997681424e-05, + "loss": 1.7631, + "step": 14823 + }, + { + "epoch": 4.550030693677103, + "grad_norm": 0.2678423523902893, + "learning_rate": 5.966126315864806e-05, + "loss": 1.7631, + "step": 14824 + }, + { + "epoch": 4.550337630448127, + "grad_norm": 0.229649156332016, + "learning_rate": 5.9656386224136426e-05, + "loss": 1.7292, + "step": 14825 + }, + { + "epoch": 4.550644567219153, + "grad_norm": 0.25248458981513977, + "learning_rate": 5.965150919419473e-05, + "loss": 1.8, + "step": 14826 + }, + { + "epoch": 4.550951503990178, + 
"grad_norm": 0.2583169937133789, + "learning_rate": 5.964663206887116e-05, + "loss": 1.7641, + "step": 14827 + }, + { + "epoch": 4.551258440761203, + "grad_norm": 0.21465209126472473, + "learning_rate": 5.964175484821392e-05, + "loss": 1.7475, + "step": 14828 + }, + { + "epoch": 4.551565377532229, + "grad_norm": 0.28028783202171326, + "learning_rate": 5.963687753227118e-05, + "loss": 1.7649, + "step": 14829 + }, + { + "epoch": 4.551872314303253, + "grad_norm": 0.30248284339904785, + "learning_rate": 5.9632000121091194e-05, + "loss": 1.6969, + "step": 14830 + }, + { + "epoch": 4.5521792510742785, + "grad_norm": 0.24335962533950806, + "learning_rate": 5.962712261472213e-05, + "loss": 1.7295, + "step": 14831 + }, + { + "epoch": 4.552486187845304, + "grad_norm": 0.21014504134655, + "learning_rate": 5.9622245013212206e-05, + "loss": 1.7508, + "step": 14832 + }, + { + "epoch": 4.552793124616329, + "grad_norm": 0.24892041087150574, + "learning_rate": 5.961736731660963e-05, + "loss": 1.7317, + "step": 14833 + }, + { + "epoch": 4.5531000613873545, + "grad_norm": 0.2159881740808487, + "learning_rate": 5.9612489524962556e-05, + "loss": 1.7114, + "step": 14834 + }, + { + "epoch": 4.55340699815838, + "grad_norm": 0.2952292263507843, + "learning_rate": 5.960761163831925e-05, + "loss": 1.8226, + "step": 14835 + }, + { + "epoch": 4.553713934929404, + "grad_norm": 0.3019000291824341, + "learning_rate": 5.9602733656727895e-05, + "loss": 1.7391, + "step": 14836 + }, + { + "epoch": 4.55402087170043, + "grad_norm": 0.2273966521024704, + "learning_rate": 5.9597855580236696e-05, + "loss": 1.7718, + "step": 14837 + }, + { + "epoch": 4.554327808471455, + "grad_norm": 0.2462005764245987, + "learning_rate": 5.959297740889386e-05, + "loss": 1.8428, + "step": 14838 + }, + { + "epoch": 4.55463474524248, + "grad_norm": 0.2773323059082031, + "learning_rate": 5.95880991427476e-05, + "loss": 1.6878, + "step": 14839 + }, + { + "epoch": 4.554941682013506, + "grad_norm": 0.26519861817359924, + 
"learning_rate": 5.958322078184611e-05, + "loss": 1.737, + "step": 14840 + }, + { + "epoch": 4.55524861878453, + "grad_norm": 0.20157647132873535, + "learning_rate": 5.9578342326237626e-05, + "loss": 1.7164, + "step": 14841 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.21715669333934784, + "learning_rate": 5.957346377597035e-05, + "loss": 1.705, + "step": 14842 + }, + { + "epoch": 4.555862492326581, + "grad_norm": 0.3056442439556122, + "learning_rate": 5.95685851310925e-05, + "loss": 1.7672, + "step": 14843 + }, + { + "epoch": 4.556169429097606, + "grad_norm": 0.24832262098789215, + "learning_rate": 5.956370639165228e-05, + "loss": 1.7305, + "step": 14844 + }, + { + "epoch": 4.556476365868631, + "grad_norm": 0.25814661383628845, + "learning_rate": 5.955882755769791e-05, + "loss": 1.7562, + "step": 14845 + }, + { + "epoch": 4.556783302639657, + "grad_norm": 0.38242629170417786, + "learning_rate": 5.95539486292776e-05, + "loss": 1.7077, + "step": 14846 + }, + { + "epoch": 4.557090239410681, + "grad_norm": 0.2901807427406311, + "learning_rate": 5.954906960643956e-05, + "loss": 1.7233, + "step": 14847 + }, + { + "epoch": 4.5573971761817065, + "grad_norm": 0.22636106610298157, + "learning_rate": 5.954419048923202e-05, + "loss": 1.777, + "step": 14848 + }, + { + "epoch": 4.557704112952732, + "grad_norm": 0.32392850518226624, + "learning_rate": 5.953931127770321e-05, + "loss": 1.7477, + "step": 14849 + }, + { + "epoch": 4.558011049723757, + "grad_norm": 0.3403460681438446, + "learning_rate": 5.953443197190134e-05, + "loss": 1.7712, + "step": 14850 + }, + { + "epoch": 4.558317986494782, + "grad_norm": 0.22923234105110168, + "learning_rate": 5.95295525718746e-05, + "loss": 1.8154, + "step": 14851 + }, + { + "epoch": 4.558624923265807, + "grad_norm": 0.25152841210365295, + "learning_rate": 5.952467307767124e-05, + "loss": 1.7091, + "step": 14852 + }, + { + "epoch": 4.558931860036832, + "grad_norm": 0.27743563055992126, + "learning_rate": 5.951979348933949e-05, + 
"loss": 1.7621, + "step": 14853 + }, + { + "epoch": 4.559238796807858, + "grad_norm": 0.25809308886528015, + "learning_rate": 5.951491380692756e-05, + "loss": 1.7669, + "step": 14854 + }, + { + "epoch": 4.559545733578883, + "grad_norm": 0.24863946437835693, + "learning_rate": 5.9510034030483676e-05, + "loss": 1.7354, + "step": 14855 + }, + { + "epoch": 4.559852670349908, + "grad_norm": 0.2896040380001068, + "learning_rate": 5.9505154160056066e-05, + "loss": 1.7878, + "step": 14856 + }, + { + "epoch": 4.560159607120933, + "grad_norm": 0.23814482986927032, + "learning_rate": 5.950027419569294e-05, + "loss": 1.7781, + "step": 14857 + }, + { + "epoch": 4.560466543891958, + "grad_norm": 0.2531175911426544, + "learning_rate": 5.949539413744253e-05, + "loss": 1.762, + "step": 14858 + }, + { + "epoch": 4.560773480662983, + "grad_norm": 0.2541767656803131, + "learning_rate": 5.949051398535308e-05, + "loss": 1.7722, + "step": 14859 + }, + { + "epoch": 4.561080417434009, + "grad_norm": 0.25216221809387207, + "learning_rate": 5.948563373947281e-05, + "loss": 1.754, + "step": 14860 + }, + { + "epoch": 4.561387354205034, + "grad_norm": 0.24421775341033936, + "learning_rate": 5.948075339984994e-05, + "loss": 1.7976, + "step": 14861 + }, + { + "epoch": 4.5616942909760585, + "grad_norm": 0.24435418844223022, + "learning_rate": 5.947587296653272e-05, + "loss": 1.79, + "step": 14862 + }, + { + "epoch": 4.562001227747084, + "grad_norm": 0.24471627175807953, + "learning_rate": 5.947099243956936e-05, + "loss": 1.755, + "step": 14863 + }, + { + "epoch": 4.562308164518109, + "grad_norm": 0.2762158215045929, + "learning_rate": 5.9466111819008096e-05, + "loss": 1.7695, + "step": 14864 + }, + { + "epoch": 4.5626151012891345, + "grad_norm": 0.23841319978237152, + "learning_rate": 5.9461231104897174e-05, + "loss": 1.7302, + "step": 14865 + }, + { + "epoch": 4.56292203806016, + "grad_norm": 0.260231077671051, + "learning_rate": 5.9456350297284826e-05, + "loss": 1.7917, + "step": 14866 + }, + { 
+ "epoch": 4.563228974831185, + "grad_norm": 0.2752247452735901, + "learning_rate": 5.945146939621929e-05, + "loss": 1.7953, + "step": 14867 + }, + { + "epoch": 4.56353591160221, + "grad_norm": 0.28760650753974915, + "learning_rate": 5.944658840174878e-05, + "loss": 1.8582, + "step": 14868 + }, + { + "epoch": 4.563842848373235, + "grad_norm": 0.24311676621437073, + "learning_rate": 5.944170731392153e-05, + "loss": 1.8006, + "step": 14869 + }, + { + "epoch": 4.56414978514426, + "grad_norm": 0.2692974805831909, + "learning_rate": 5.943682613278583e-05, + "loss": 1.6984, + "step": 14870 + }, + { + "epoch": 4.564456721915286, + "grad_norm": 0.2784348726272583, + "learning_rate": 5.943194485838985e-05, + "loss": 1.8082, + "step": 14871 + }, + { + "epoch": 4.564763658686311, + "grad_norm": 0.2557264268398285, + "learning_rate": 5.9427063490781885e-05, + "loss": 1.7715, + "step": 14872 + }, + { + "epoch": 4.565070595457335, + "grad_norm": 0.3738742470741272, + "learning_rate": 5.942218203001015e-05, + "loss": 1.7549, + "step": 14873 + }, + { + "epoch": 4.565377532228361, + "grad_norm": 0.2424495816230774, + "learning_rate": 5.941730047612288e-05, + "loss": 1.7388, + "step": 14874 + }, + { + "epoch": 4.565684468999386, + "grad_norm": 0.27020737528800964, + "learning_rate": 5.941241882916833e-05, + "loss": 1.752, + "step": 14875 + }, + { + "epoch": 4.565991405770411, + "grad_norm": 0.3763764798641205, + "learning_rate": 5.940753708919474e-05, + "loss": 1.7918, + "step": 14876 + }, + { + "epoch": 4.566298342541437, + "grad_norm": 0.26782163977622986, + "learning_rate": 5.940265525625036e-05, + "loss": 1.7244, + "step": 14877 + }, + { + "epoch": 4.566605279312462, + "grad_norm": 0.24978911876678467, + "learning_rate": 5.9397773330383434e-05, + "loss": 1.7706, + "step": 14878 + }, + { + "epoch": 4.5669122160834865, + "grad_norm": 0.32905304431915283, + "learning_rate": 5.93928913116422e-05, + "loss": 1.7381, + "step": 14879 + }, + { + "epoch": 4.567219152854512, + "grad_norm": 
0.2196444720029831, + "learning_rate": 5.93880092000749e-05, + "loss": 1.7605, + "step": 14880 + }, + { + "epoch": 4.567526089625537, + "grad_norm": 0.3156622350215912, + "learning_rate": 5.9383126995729786e-05, + "loss": 1.9181, + "step": 14881 + }, + { + "epoch": 4.5678330263965625, + "grad_norm": 0.2895203232765198, + "learning_rate": 5.937824469865513e-05, + "loss": 1.7967, + "step": 14882 + }, + { + "epoch": 4.568139963167588, + "grad_norm": 0.24854810535907745, + "learning_rate": 5.937336230889916e-05, + "loss": 1.7332, + "step": 14883 + }, + { + "epoch": 4.568446899938612, + "grad_norm": 0.3417081832885742, + "learning_rate": 5.936847982651013e-05, + "loss": 1.7525, + "step": 14884 + }, + { + "epoch": 4.568753836709638, + "grad_norm": 0.2874949276447296, + "learning_rate": 5.936359725153629e-05, + "loss": 1.7659, + "step": 14885 + }, + { + "epoch": 4.569060773480663, + "grad_norm": 0.25031307339668274, + "learning_rate": 5.935871458402588e-05, + "loss": 1.8061, + "step": 14886 + }, + { + "epoch": 4.569367710251688, + "grad_norm": 0.27047309279441833, + "learning_rate": 5.935383182402717e-05, + "loss": 1.7318, + "step": 14887 + }, + { + "epoch": 4.569674647022714, + "grad_norm": 0.2642819881439209, + "learning_rate": 5.9348948971588425e-05, + "loss": 1.849, + "step": 14888 + }, + { + "epoch": 4.569981583793739, + "grad_norm": 0.2452307790517807, + "learning_rate": 5.9344066026757886e-05, + "loss": 1.7491, + "step": 14889 + }, + { + "epoch": 4.570288520564763, + "grad_norm": 0.24055036902427673, + "learning_rate": 5.9339182989583795e-05, + "loss": 1.7573, + "step": 14890 + }, + { + "epoch": 4.570595457335789, + "grad_norm": 0.23036183416843414, + "learning_rate": 5.933429986011444e-05, + "loss": 1.7841, + "step": 14891 + }, + { + "epoch": 4.570902394106814, + "grad_norm": 0.27987608313560486, + "learning_rate": 5.932941663839805e-05, + "loss": 1.7835, + "step": 14892 + }, + { + "epoch": 4.571209330877839, + "grad_norm": 0.31747013330459595, + "learning_rate": 
5.93245333244829e-05, + "loss": 1.7905, + "step": 14893 + }, + { + "epoch": 4.571516267648864, + "grad_norm": 0.24841344356536865, + "learning_rate": 5.931964991841725e-05, + "loss": 1.8003, + "step": 14894 + }, + { + "epoch": 4.571823204419889, + "grad_norm": 0.2416950911283493, + "learning_rate": 5.9314766420249356e-05, + "loss": 1.7787, + "step": 14895 + }, + { + "epoch": 4.5721301411909145, + "grad_norm": 0.2322494238615036, + "learning_rate": 5.930988283002748e-05, + "loss": 1.8153, + "step": 14896 + }, + { + "epoch": 4.57243707796194, + "grad_norm": 0.22629016637802124, + "learning_rate": 5.930499914779989e-05, + "loss": 1.6743, + "step": 14897 + }, + { + "epoch": 4.572744014732965, + "grad_norm": 0.21481508016586304, + "learning_rate": 5.930011537361483e-05, + "loss": 1.7301, + "step": 14898 + }, + { + "epoch": 4.5730509515039905, + "grad_norm": 0.1993340700864792, + "learning_rate": 5.9295231507520586e-05, + "loss": 1.6796, + "step": 14899 + }, + { + "epoch": 4.573357888275015, + "grad_norm": 0.21681822836399078, + "learning_rate": 5.929034754956543e-05, + "loss": 1.7333, + "step": 14900 + }, + { + "epoch": 4.57366482504604, + "grad_norm": 0.23105305433273315, + "learning_rate": 5.928546349979761e-05, + "loss": 1.8207, + "step": 14901 + }, + { + "epoch": 4.573971761817066, + "grad_norm": 0.24656468629837036, + "learning_rate": 5.9280579358265384e-05, + "loss": 1.7805, + "step": 14902 + }, + { + "epoch": 4.574278698588091, + "grad_norm": 0.28564780950546265, + "learning_rate": 5.927569512501704e-05, + "loss": 1.7224, + "step": 14903 + }, + { + "epoch": 4.574585635359116, + "grad_norm": 0.26030251383781433, + "learning_rate": 5.927081080010084e-05, + "loss": 1.7417, + "step": 14904 + }, + { + "epoch": 4.574892572130141, + "grad_norm": 0.21427087485790253, + "learning_rate": 5.926592638356505e-05, + "loss": 1.7239, + "step": 14905 + }, + { + "epoch": 4.575199508901166, + "grad_norm": 0.2351662665605545, + "learning_rate": 5.9261041875457956e-05, + "loss": 
1.7711, + "step": 14906 + }, + { + "epoch": 4.5755064456721914, + "grad_norm": 0.27335020899772644, + "learning_rate": 5.925615727582781e-05, + "loss": 1.7496, + "step": 14907 + }, + { + "epoch": 4.575813382443217, + "grad_norm": 0.27849945425987244, + "learning_rate": 5.925127258472289e-05, + "loss": 1.7576, + "step": 14908 + }, + { + "epoch": 4.576120319214242, + "grad_norm": 0.27859339118003845, + "learning_rate": 5.924638780219147e-05, + "loss": 1.8076, + "step": 14909 + }, + { + "epoch": 4.5764272559852675, + "grad_norm": 0.24664369225502014, + "learning_rate": 5.9241502928281836e-05, + "loss": 1.7657, + "step": 14910 + }, + { + "epoch": 4.576734192756292, + "grad_norm": 0.29881149530410767, + "learning_rate": 5.923661796304224e-05, + "loss": 1.7611, + "step": 14911 + }, + { + "epoch": 4.577041129527317, + "grad_norm": 0.2672356367111206, + "learning_rate": 5.9231732906520984e-05, + "loss": 1.7605, + "step": 14912 + }, + { + "epoch": 4.577348066298343, + "grad_norm": 0.24282832443714142, + "learning_rate": 5.9226847758766336e-05, + "loss": 1.7037, + "step": 14913 + }, + { + "epoch": 4.577655003069368, + "grad_norm": 0.3822915852069855, + "learning_rate": 5.922196251982656e-05, + "loss": 1.7609, + "step": 14914 + }, + { + "epoch": 4.577961939840393, + "grad_norm": 0.30721214413642883, + "learning_rate": 5.921707718974994e-05, + "loss": 1.7398, + "step": 14915 + }, + { + "epoch": 4.578268876611418, + "grad_norm": 0.235477477312088, + "learning_rate": 5.921219176858477e-05, + "loss": 1.6869, + "step": 14916 + }, + { + "epoch": 4.578575813382443, + "grad_norm": 0.3752216100692749, + "learning_rate": 5.920730625637934e-05, + "loss": 1.7296, + "step": 14917 + }, + { + "epoch": 4.578882750153468, + "grad_norm": 0.36901310086250305, + "learning_rate": 5.920242065318189e-05, + "loss": 1.7405, + "step": 14918 + }, + { + "epoch": 4.579189686924494, + "grad_norm": 0.2308608740568161, + "learning_rate": 5.9197534959040725e-05, + "loss": 1.7953, + "step": 14919 + }, + { + 
"epoch": 4.579496623695519, + "grad_norm": 0.3286738991737366, + "learning_rate": 5.919264917400412e-05, + "loss": 1.7669, + "step": 14920 + }, + { + "epoch": 4.579803560466544, + "grad_norm": 0.3944021165370941, + "learning_rate": 5.918776329812039e-05, + "loss": 1.7165, + "step": 14921 + }, + { + "epoch": 4.580110497237569, + "grad_norm": 0.22054845094680786, + "learning_rate": 5.9182877331437795e-05, + "loss": 1.7739, + "step": 14922 + }, + { + "epoch": 4.580417434008594, + "grad_norm": 0.3467540740966797, + "learning_rate": 5.9177991274004605e-05, + "loss": 1.7713, + "step": 14923 + }, + { + "epoch": 4.5807243707796195, + "grad_norm": 0.4313695728778839, + "learning_rate": 5.917310512586914e-05, + "loss": 1.7654, + "step": 14924 + }, + { + "epoch": 4.581031307550645, + "grad_norm": 0.2723502814769745, + "learning_rate": 5.9168218887079685e-05, + "loss": 1.7314, + "step": 14925 + }, + { + "epoch": 4.581338244321669, + "grad_norm": 0.2641250789165497, + "learning_rate": 5.9163332557684504e-05, + "loss": 1.7303, + "step": 14926 + }, + { + "epoch": 4.581645181092695, + "grad_norm": 0.3780760169029236, + "learning_rate": 5.915844613773189e-05, + "loss": 1.7748, + "step": 14927 + }, + { + "epoch": 4.58195211786372, + "grad_norm": 0.23379632830619812, + "learning_rate": 5.915355962727015e-05, + "loss": 1.7482, + "step": 14928 + }, + { + "epoch": 4.582259054634745, + "grad_norm": 0.35227084159851074, + "learning_rate": 5.914867302634758e-05, + "loss": 1.8198, + "step": 14929 + }, + { + "epoch": 4.582565991405771, + "grad_norm": 0.34348124265670776, + "learning_rate": 5.914378633501245e-05, + "loss": 1.8364, + "step": 14930 + }, + { + "epoch": 4.582872928176796, + "grad_norm": 0.2446804940700531, + "learning_rate": 5.9138899553313066e-05, + "loss": 1.7779, + "step": 14931 + }, + { + "epoch": 4.58317986494782, + "grad_norm": 0.23893557488918304, + "learning_rate": 5.913401268129772e-05, + "loss": 1.7582, + "step": 14932 + }, + { + "epoch": 4.583486801718846, + 
"grad_norm": 0.3046814203262329, + "learning_rate": 5.912912571901471e-05, + "loss": 1.6871, + "step": 14933 + }, + { + "epoch": 4.583793738489871, + "grad_norm": 0.2232733964920044, + "learning_rate": 5.912423866651233e-05, + "loss": 1.7269, + "step": 14934 + }, + { + "epoch": 4.584100675260896, + "grad_norm": 0.18664126098155975, + "learning_rate": 5.911935152383888e-05, + "loss": 1.7155, + "step": 14935 + }, + { + "epoch": 4.584407612031922, + "grad_norm": 0.2573263347148895, + "learning_rate": 5.911446429104265e-05, + "loss": 1.7901, + "step": 14936 + }, + { + "epoch": 4.584714548802946, + "grad_norm": 0.2382393181324005, + "learning_rate": 5.910957696817194e-05, + "loss": 1.7407, + "step": 14937 + }, + { + "epoch": 4.5850214855739715, + "grad_norm": 0.28363972902297974, + "learning_rate": 5.910468955527504e-05, + "loss": 1.7971, + "step": 14938 + }, + { + "epoch": 4.585328422344997, + "grad_norm": 0.3173120617866516, + "learning_rate": 5.909980205240027e-05, + "loss": 1.744, + "step": 14939 + }, + { + "epoch": 4.585635359116022, + "grad_norm": 0.2281302511692047, + "learning_rate": 5.909491445959592e-05, + "loss": 1.6976, + "step": 14940 + }, + { + "epoch": 4.5859422958870475, + "grad_norm": 0.24962912499904633, + "learning_rate": 5.9090026776910304e-05, + "loss": 1.7979, + "step": 14941 + }, + { + "epoch": 4.586249232658073, + "grad_norm": 0.22330854833126068, + "learning_rate": 5.908513900439171e-05, + "loss": 1.7854, + "step": 14942 + }, + { + "epoch": 4.586556169429097, + "grad_norm": 0.20861582458019257, + "learning_rate": 5.908025114208845e-05, + "loss": 1.7133, + "step": 14943 + }, + { + "epoch": 4.586863106200123, + "grad_norm": 0.21838510036468506, + "learning_rate": 5.90753631900488e-05, + "loss": 1.6919, + "step": 14944 + }, + { + "epoch": 4.587170042971148, + "grad_norm": 0.252798467874527, + "learning_rate": 5.907047514832112e-05, + "loss": 1.838, + "step": 14945 + }, + { + "epoch": 4.587476979742173, + "grad_norm": 0.326893150806427, + 
"learning_rate": 5.906558701695369e-05, + "loss": 1.7303, + "step": 14946 + }, + { + "epoch": 4.587783916513199, + "grad_norm": 0.36489585041999817, + "learning_rate": 5.9060698795994804e-05, + "loss": 1.7631, + "step": 14947 + }, + { + "epoch": 4.588090853284223, + "grad_norm": 0.27491649985313416, + "learning_rate": 5.905581048549279e-05, + "loss": 1.7773, + "step": 14948 + }, + { + "epoch": 4.588397790055248, + "grad_norm": 0.2334890067577362, + "learning_rate": 5.905092208549595e-05, + "loss": 1.7254, + "step": 14949 + }, + { + "epoch": 4.588704726826274, + "grad_norm": 0.24383895099163055, + "learning_rate": 5.904603359605257e-05, + "loss": 1.7496, + "step": 14950 + }, + { + "epoch": 4.589011663597299, + "grad_norm": 0.2144637256860733, + "learning_rate": 5.904114501721102e-05, + "loss": 1.7028, + "step": 14951 + }, + { + "epoch": 4.589318600368324, + "grad_norm": 0.19675977528095245, + "learning_rate": 5.9036256349019555e-05, + "loss": 1.7548, + "step": 14952 + }, + { + "epoch": 4.58962553713935, + "grad_norm": 0.23712843656539917, + "learning_rate": 5.903136759152652e-05, + "loss": 1.7722, + "step": 14953 + }, + { + "epoch": 4.589932473910374, + "grad_norm": 0.20307733118534088, + "learning_rate": 5.902647874478021e-05, + "loss": 1.7177, + "step": 14954 + }, + { + "epoch": 4.5902394106813995, + "grad_norm": 0.21767669916152954, + "learning_rate": 5.9021589808828936e-05, + "loss": 1.7963, + "step": 14955 + }, + { + "epoch": 4.590546347452425, + "grad_norm": 0.2056351602077484, + "learning_rate": 5.9016700783721036e-05, + "loss": 1.7439, + "step": 14956 + }, + { + "epoch": 4.59085328422345, + "grad_norm": 0.20480911433696747, + "learning_rate": 5.90118116695048e-05, + "loss": 1.7122, + "step": 14957 + }, + { + "epoch": 4.5911602209944755, + "grad_norm": 0.24091731011867523, + "learning_rate": 5.900692246622858e-05, + "loss": 1.7862, + "step": 14958 + }, + { + "epoch": 4.5914671577655, + "grad_norm": 0.20246434211730957, + "learning_rate": 
5.900203317394066e-05, + "loss": 1.6895, + "step": 14959 + }, + { + "epoch": 4.591774094536525, + "grad_norm": 0.23771630227565765, + "learning_rate": 5.899714379268938e-05, + "loss": 1.7794, + "step": 14960 + }, + { + "epoch": 4.592081031307551, + "grad_norm": 0.2638718783855438, + "learning_rate": 5.899225432252303e-05, + "loss": 1.8059, + "step": 14961 + }, + { + "epoch": 4.592387968078576, + "grad_norm": 0.24251408874988556, + "learning_rate": 5.898736476348997e-05, + "loss": 1.8063, + "step": 14962 + }, + { + "epoch": 4.592694904849601, + "grad_norm": 0.2487735152244568, + "learning_rate": 5.8982475115638515e-05, + "loss": 1.7615, + "step": 14963 + }, + { + "epoch": 4.593001841620627, + "grad_norm": 0.23507241904735565, + "learning_rate": 5.897758537901696e-05, + "loss": 1.7496, + "step": 14964 + }, + { + "epoch": 4.593308778391651, + "grad_norm": 0.22354768216609955, + "learning_rate": 5.897269555367365e-05, + "loss": 1.7293, + "step": 14965 + }, + { + "epoch": 4.593615715162676, + "grad_norm": 0.2711353003978729, + "learning_rate": 5.89678056396569e-05, + "loss": 1.8127, + "step": 14966 + }, + { + "epoch": 4.593922651933702, + "grad_norm": 0.30061110854148865, + "learning_rate": 5.8962915637015036e-05, + "loss": 1.7653, + "step": 14967 + }, + { + "epoch": 4.594229588704727, + "grad_norm": 0.24577318131923676, + "learning_rate": 5.895802554579639e-05, + "loss": 1.7888, + "step": 14968 + }, + { + "epoch": 4.5945365254757515, + "grad_norm": 0.25568944215774536, + "learning_rate": 5.895313536604929e-05, + "loss": 1.7912, + "step": 14969 + }, + { + "epoch": 4.594843462246777, + "grad_norm": 0.2710168957710266, + "learning_rate": 5.894824509782206e-05, + "loss": 1.7681, + "step": 14970 + }, + { + "epoch": 4.595150399017802, + "grad_norm": 0.24056777358055115, + "learning_rate": 5.894335474116303e-05, + "loss": 1.7729, + "step": 14971 + }, + { + "epoch": 4.5954573357888275, + "grad_norm": 0.21956710517406464, + "learning_rate": 5.89384642961205e-05, + "loss": 
1.7576, + "step": 14972 + }, + { + "epoch": 4.595764272559853, + "grad_norm": 0.27499106526374817, + "learning_rate": 5.893357376274284e-05, + "loss": 1.7909, + "step": 14973 + }, + { + "epoch": 4.596071209330878, + "grad_norm": 0.28581273555755615, + "learning_rate": 5.8928683141078376e-05, + "loss": 1.7592, + "step": 14974 + }, + { + "epoch": 4.596378146101903, + "grad_norm": 0.23218442499637604, + "learning_rate": 5.892379243117543e-05, + "loss": 1.7142, + "step": 14975 + }, + { + "epoch": 4.596685082872928, + "grad_norm": 0.34015771746635437, + "learning_rate": 5.891890163308234e-05, + "loss": 1.7457, + "step": 14976 + }, + { + "epoch": 4.596992019643953, + "grad_norm": 0.2630012333393097, + "learning_rate": 5.8914010746847435e-05, + "loss": 1.7612, + "step": 14977 + }, + { + "epoch": 4.597298956414979, + "grad_norm": 0.2265843003988266, + "learning_rate": 5.890911977251904e-05, + "loss": 1.7272, + "step": 14978 + }, + { + "epoch": 4.597605893186004, + "grad_norm": 0.22325244545936584, + "learning_rate": 5.8904228710145505e-05, + "loss": 1.7447, + "step": 14979 + }, + { + "epoch": 4.597912829957028, + "grad_norm": 0.23512716591358185, + "learning_rate": 5.889933755977517e-05, + "loss": 1.7123, + "step": 14980 + }, + { + "epoch": 4.598219766728054, + "grad_norm": 0.22534869611263275, + "learning_rate": 5.8894446321456365e-05, + "loss": 1.785, + "step": 14981 + }, + { + "epoch": 4.598526703499079, + "grad_norm": 0.2447836697101593, + "learning_rate": 5.888955499523743e-05, + "loss": 1.7154, + "step": 14982 + }, + { + "epoch": 4.598833640270104, + "grad_norm": 0.2451140582561493, + "learning_rate": 5.88846635811667e-05, + "loss": 1.7494, + "step": 14983 + }, + { + "epoch": 4.59914057704113, + "grad_norm": 0.2253585308790207, + "learning_rate": 5.8879772079292504e-05, + "loss": 1.7591, + "step": 14984 + }, + { + "epoch": 4.599447513812155, + "grad_norm": 0.21714572608470917, + "learning_rate": 5.887488048966322e-05, + "loss": 1.7314, + "step": 14985 + }, + { + 
"epoch": 4.5997544505831796, + "grad_norm": 0.24897411465644836, + "learning_rate": 5.8869988812327145e-05, + "loss": 1.776, + "step": 14986 + }, + { + "epoch": 4.600061387354205, + "grad_norm": 0.22575093805789948, + "learning_rate": 5.8865097047332653e-05, + "loss": 1.7168, + "step": 14987 + }, + { + "epoch": 4.60036832412523, + "grad_norm": 0.22857412695884705, + "learning_rate": 5.886020519472808e-05, + "loss": 1.8262, + "step": 14988 + }, + { + "epoch": 4.600675260896256, + "grad_norm": 0.22741298377513885, + "learning_rate": 5.885531325456174e-05, + "loss": 1.6732, + "step": 14989 + }, + { + "epoch": 4.600982197667281, + "grad_norm": 0.2229645550251007, + "learning_rate": 5.885042122688202e-05, + "loss": 1.7384, + "step": 14990 + }, + { + "epoch": 4.601289134438305, + "grad_norm": 0.22609494626522064, + "learning_rate": 5.884552911173726e-05, + "loss": 1.714, + "step": 14991 + }, + { + "epoch": 4.601596071209331, + "grad_norm": 0.2629149854183197, + "learning_rate": 5.884063690917578e-05, + "loss": 1.8133, + "step": 14992 + }, + { + "epoch": 4.601903007980356, + "grad_norm": 0.220725417137146, + "learning_rate": 5.883574461924597e-05, + "loss": 1.6898, + "step": 14993 + }, + { + "epoch": 4.602209944751381, + "grad_norm": 0.207612082362175, + "learning_rate": 5.8830852241996135e-05, + "loss": 1.7302, + "step": 14994 + }, + { + "epoch": 4.602516881522407, + "grad_norm": 0.22418084740638733, + "learning_rate": 5.8825959777474625e-05, + "loss": 1.763, + "step": 14995 + }, + { + "epoch": 4.602823818293432, + "grad_norm": 0.30606865882873535, + "learning_rate": 5.882106722572983e-05, + "loss": 1.7657, + "step": 14996 + }, + { + "epoch": 4.6031307550644565, + "grad_norm": 0.2947966456413269, + "learning_rate": 5.881617458681008e-05, + "loss": 1.7796, + "step": 14997 + }, + { + "epoch": 4.603437691835482, + "grad_norm": 0.23430216312408447, + "learning_rate": 5.881128186076372e-05, + "loss": 1.78, + "step": 14998 + }, + { + "epoch": 4.603744628606507, + "grad_norm": 
0.28081849217414856, + "learning_rate": 5.880638904763911e-05, + "loss": 1.6791, + "step": 14999 + }, + { + "epoch": 4.6040515653775325, + "grad_norm": 0.25459226965904236, + "learning_rate": 5.88014961474846e-05, + "loss": 1.8064, + "step": 15000 + }, + { + "epoch": 4.604358502148557, + "grad_norm": 0.2358713001012802, + "learning_rate": 5.879660316034854e-05, + "loss": 1.763, + "step": 15001 + }, + { + "epoch": 4.604665438919582, + "grad_norm": 0.32954758405685425, + "learning_rate": 5.879171008627931e-05, + "loss": 1.7462, + "step": 15002 + }, + { + "epoch": 4.604972375690608, + "grad_norm": 0.2588615417480469, + "learning_rate": 5.878681692532523e-05, + "loss": 1.7771, + "step": 15003 + }, + { + "epoch": 4.605279312461633, + "grad_norm": 0.21216195821762085, + "learning_rate": 5.878192367753468e-05, + "loss": 1.7128, + "step": 15004 + }, + { + "epoch": 4.605586249232658, + "grad_norm": 0.26849040389060974, + "learning_rate": 5.8777030342956016e-05, + "loss": 1.7048, + "step": 15005 + }, + { + "epoch": 4.605893186003684, + "grad_norm": 0.22343295812606812, + "learning_rate": 5.877213692163759e-05, + "loss": 1.7695, + "step": 15006 + }, + { + "epoch": 4.606200122774708, + "grad_norm": 0.2794288694858551, + "learning_rate": 5.876724341362776e-05, + "loss": 1.7856, + "step": 15007 + }, + { + "epoch": 4.606507059545733, + "grad_norm": 0.3525427579879761, + "learning_rate": 5.8762349818974905e-05, + "loss": 1.7807, + "step": 15008 + }, + { + "epoch": 4.606813996316759, + "grad_norm": 0.25886499881744385, + "learning_rate": 5.875745613772736e-05, + "loss": 1.7818, + "step": 15009 + }, + { + "epoch": 4.607120933087784, + "grad_norm": 0.24822987616062164, + "learning_rate": 5.8752562369933515e-05, + "loss": 1.7369, + "step": 15010 + }, + { + "epoch": 4.607427869858809, + "grad_norm": 0.26067355275154114, + "learning_rate": 5.874766851564171e-05, + "loss": 1.7056, + "step": 15011 + }, + { + "epoch": 4.607734806629834, + "grad_norm": 0.2869747579097748, + "learning_rate": 
5.874277457490033e-05, + "loss": 1.7284, + "step": 15012 + }, + { + "epoch": 4.608041743400859, + "grad_norm": 0.23153580725193024, + "learning_rate": 5.87378805477577e-05, + "loss": 1.7331, + "step": 15013 + }, + { + "epoch": 4.6083486801718845, + "grad_norm": 0.29307299852371216, + "learning_rate": 5.873298643426223e-05, + "loss": 1.7376, + "step": 15014 + }, + { + "epoch": 4.60865561694291, + "grad_norm": 0.25638771057128906, + "learning_rate": 5.872809223446227e-05, + "loss": 1.7585, + "step": 15015 + }, + { + "epoch": 4.608962553713935, + "grad_norm": 0.2272702306509018, + "learning_rate": 5.872319794840618e-05, + "loss": 1.7482, + "step": 15016 + }, + { + "epoch": 4.6092694904849605, + "grad_norm": 0.2579486072063446, + "learning_rate": 5.8718303576142356e-05, + "loss": 1.778, + "step": 15017 + }, + { + "epoch": 4.609576427255985, + "grad_norm": 0.2216452956199646, + "learning_rate": 5.871340911771912e-05, + "loss": 1.7517, + "step": 15018 + }, + { + "epoch": 4.60988336402701, + "grad_norm": 0.22628961503505707, + "learning_rate": 5.870851457318488e-05, + "loss": 1.7579, + "step": 15019 + }, + { + "epoch": 4.610190300798036, + "grad_norm": 0.31018149852752686, + "learning_rate": 5.8703619942588e-05, + "loss": 1.7911, + "step": 15020 + }, + { + "epoch": 4.610497237569061, + "grad_norm": 0.2618122100830078, + "learning_rate": 5.869872522597683e-05, + "loss": 1.8121, + "step": 15021 + }, + { + "epoch": 4.610804174340086, + "grad_norm": 0.26085740327835083, + "learning_rate": 5.869383042339978e-05, + "loss": 1.7952, + "step": 15022 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.25237780809402466, + "learning_rate": 5.86889355349052e-05, + "loss": 1.7575, + "step": 15023 + }, + { + "epoch": 4.611418047882136, + "grad_norm": 0.27550897002220154, + "learning_rate": 5.868404056054144e-05, + "loss": 1.7816, + "step": 15024 + }, + { + "epoch": 4.611724984653161, + "grad_norm": 0.2458692342042923, + "learning_rate": 5.8679145500356926e-05, + "loss": 1.7783, + 
"step": 15025 + }, + { + "epoch": 4.612031921424187, + "grad_norm": 0.25606176257133484, + "learning_rate": 5.867425035439999e-05, + "loss": 1.7863, + "step": 15026 + }, + { + "epoch": 4.612338858195212, + "grad_norm": 0.3206995725631714, + "learning_rate": 5.866935512271905e-05, + "loss": 1.7468, + "step": 15027 + }, + { + "epoch": 4.612645794966237, + "grad_norm": 0.2754824459552765, + "learning_rate": 5.866445980536245e-05, + "loss": 1.793, + "step": 15028 + }, + { + "epoch": 4.612952731737262, + "grad_norm": 0.25168612599372864, + "learning_rate": 5.865956440237859e-05, + "loss": 1.7252, + "step": 15029 + }, + { + "epoch": 4.613259668508287, + "grad_norm": 0.3226735293865204, + "learning_rate": 5.8654668913815815e-05, + "loss": 1.7291, + "step": 15030 + }, + { + "epoch": 4.6135666052793125, + "grad_norm": 0.2580295503139496, + "learning_rate": 5.864977333972255e-05, + "loss": 1.7622, + "step": 15031 + }, + { + "epoch": 4.613873542050338, + "grad_norm": 0.21486075222492218, + "learning_rate": 5.864487768014715e-05, + "loss": 1.7662, + "step": 15032 + }, + { + "epoch": 4.614180478821363, + "grad_norm": 0.2331690639257431, + "learning_rate": 5.8639981935137996e-05, + "loss": 1.7389, + "step": 15033 + }, + { + "epoch": 4.614487415592388, + "grad_norm": 0.2573511302471161, + "learning_rate": 5.863508610474348e-05, + "loss": 1.7699, + "step": 15034 + }, + { + "epoch": 4.614794352363413, + "grad_norm": 0.2260694056749344, + "learning_rate": 5.863019018901199e-05, + "loss": 1.7784, + "step": 15035 + }, + { + "epoch": 4.615101289134438, + "grad_norm": 0.2283065915107727, + "learning_rate": 5.8625294187991895e-05, + "loss": 1.7061, + "step": 15036 + }, + { + "epoch": 4.615408225905464, + "grad_norm": 0.24772310256958008, + "learning_rate": 5.862039810173159e-05, + "loss": 1.7568, + "step": 15037 + }, + { + "epoch": 4.615715162676489, + "grad_norm": 0.2515513002872467, + "learning_rate": 5.861550193027945e-05, + "loss": 1.7445, + "step": 15038 + }, + { + "epoch": 
4.616022099447514, + "grad_norm": 0.26472151279449463, + "learning_rate": 5.8610605673683885e-05, + "loss": 1.7735, + "step": 15039 + }, + { + "epoch": 4.616329036218539, + "grad_norm": 0.24053528904914856, + "learning_rate": 5.8605709331993254e-05, + "loss": 1.8009, + "step": 15040 + }, + { + "epoch": 4.616635972989564, + "grad_norm": 0.25125381350517273, + "learning_rate": 5.860081290525596e-05, + "loss": 1.7712, + "step": 15041 + }, + { + "epoch": 4.616942909760589, + "grad_norm": 0.23056018352508545, + "learning_rate": 5.85959163935204e-05, + "loss": 1.7684, + "step": 15042 + }, + { + "epoch": 4.617249846531615, + "grad_norm": 0.2533007562160492, + "learning_rate": 5.859101979683494e-05, + "loss": 1.7793, + "step": 15043 + }, + { + "epoch": 4.617556783302639, + "grad_norm": 0.21007375419139862, + "learning_rate": 5.8586123115248e-05, + "loss": 1.7484, + "step": 15044 + }, + { + "epoch": 4.6178637200736645, + "grad_norm": 0.21329566836357117, + "learning_rate": 5.858122634880797e-05, + "loss": 1.7763, + "step": 15045 + }, + { + "epoch": 4.61817065684469, + "grad_norm": 0.2362898588180542, + "learning_rate": 5.857632949756322e-05, + "loss": 1.7484, + "step": 15046 + }, + { + "epoch": 4.618477593615715, + "grad_norm": 0.2168794423341751, + "learning_rate": 5.857143256156214e-05, + "loss": 1.7752, + "step": 15047 + }, + { + "epoch": 4.6187845303867405, + "grad_norm": 0.24761471152305603, + "learning_rate": 5.856653554085316e-05, + "loss": 1.7793, + "step": 15048 + }, + { + "epoch": 4.619091467157766, + "grad_norm": 0.23202158510684967, + "learning_rate": 5.856163843548466e-05, + "loss": 1.6862, + "step": 15049 + }, + { + "epoch": 4.61939840392879, + "grad_norm": 0.23868000507354736, + "learning_rate": 5.855674124550501e-05, + "loss": 1.8075, + "step": 15050 + }, + { + "epoch": 4.619705340699816, + "grad_norm": 0.3063114583492279, + "learning_rate": 5.855184397096265e-05, + "loss": 1.8051, + "step": 15051 + }, + { + "epoch": 4.620012277470841, + "grad_norm": 
0.22672493755817413, + "learning_rate": 5.854694661190594e-05, + "loss": 1.7478, + "step": 15052 + }, + { + "epoch": 4.620319214241866, + "grad_norm": 0.3403559923171997, + "learning_rate": 5.8542049168383296e-05, + "loss": 1.765, + "step": 15053 + }, + { + "epoch": 4.620626151012892, + "grad_norm": 0.33852189779281616, + "learning_rate": 5.853715164044312e-05, + "loss": 1.7602, + "step": 15054 + }, + { + "epoch": 4.620933087783916, + "grad_norm": 0.25166940689086914, + "learning_rate": 5.85322540281338e-05, + "loss": 1.7584, + "step": 15055 + }, + { + "epoch": 4.621240024554941, + "grad_norm": 0.3417987823486328, + "learning_rate": 5.8527356331503757e-05, + "loss": 1.8491, + "step": 15056 + }, + { + "epoch": 4.621546961325967, + "grad_norm": 0.3286994397640228, + "learning_rate": 5.852245855060138e-05, + "loss": 1.7146, + "step": 15057 + }, + { + "epoch": 4.621853898096992, + "grad_norm": 0.24394257366657257, + "learning_rate": 5.851756068547505e-05, + "loss": 1.8762, + "step": 15058 + }, + { + "epoch": 4.622160834868017, + "grad_norm": 0.34945347905158997, + "learning_rate": 5.851266273617321e-05, + "loss": 1.8086, + "step": 15059 + }, + { + "epoch": 4.622467771639043, + "grad_norm": 0.30189210176467896, + "learning_rate": 5.850776470274425e-05, + "loss": 1.7366, + "step": 15060 + }, + { + "epoch": 4.622774708410067, + "grad_norm": 0.24050579965114594, + "learning_rate": 5.850286658523657e-05, + "loss": 1.7599, + "step": 15061 + }, + { + "epoch": 4.6230816451810925, + "grad_norm": 0.33650726079940796, + "learning_rate": 5.849796838369857e-05, + "loss": 1.7343, + "step": 15062 + }, + { + "epoch": 4.623388581952118, + "grad_norm": 0.2855902910232544, + "learning_rate": 5.849307009817868e-05, + "loss": 1.7325, + "step": 15063 + }, + { + "epoch": 4.623695518723143, + "grad_norm": 0.2562592923641205, + "learning_rate": 5.8488171728725275e-05, + "loss": 1.7772, + "step": 15064 + }, + { + "epoch": 4.6240024554941686, + "grad_norm": 0.23494984209537506, + 
"learning_rate": 5.84832732753868e-05, + "loss": 1.7263, + "step": 15065 + }, + { + "epoch": 4.624309392265193, + "grad_norm": 0.23248226940631866, + "learning_rate": 5.847837473821164e-05, + "loss": 1.7441, + "step": 15066 + }, + { + "epoch": 4.624616329036218, + "grad_norm": 0.2291254848241806, + "learning_rate": 5.847347611724821e-05, + "loss": 1.7742, + "step": 15067 + }, + { + "epoch": 4.624923265807244, + "grad_norm": 0.28305280208587646, + "learning_rate": 5.8468577412544925e-05, + "loss": 1.8224, + "step": 15068 + }, + { + "epoch": 4.625230202578269, + "grad_norm": 0.25531691312789917, + "learning_rate": 5.84636786241502e-05, + "loss": 1.7458, + "step": 15069 + }, + { + "epoch": 4.625537139349294, + "grad_norm": 0.2363462746143341, + "learning_rate": 5.845877975211242e-05, + "loss": 1.7977, + "step": 15070 + }, + { + "epoch": 4.62584407612032, + "grad_norm": 0.2707001864910126, + "learning_rate": 5.845388079648004e-05, + "loss": 1.774, + "step": 15071 + }, + { + "epoch": 4.626151012891344, + "grad_norm": 0.22281844913959503, + "learning_rate": 5.844898175730146e-05, + "loss": 1.7888, + "step": 15072 + }, + { + "epoch": 4.6264579496623695, + "grad_norm": 0.24809995293617249, + "learning_rate": 5.8444082634625086e-05, + "loss": 1.7895, + "step": 15073 + }, + { + "epoch": 4.626764886433395, + "grad_norm": 0.2842096984386444, + "learning_rate": 5.843918342849933e-05, + "loss": 1.7323, + "step": 15074 + }, + { + "epoch": 4.62707182320442, + "grad_norm": 0.21343614161014557, + "learning_rate": 5.843428413897261e-05, + "loss": 1.7298, + "step": 15075 + }, + { + "epoch": 4.627378759975445, + "grad_norm": 0.2420526146888733, + "learning_rate": 5.842938476609336e-05, + "loss": 1.778, + "step": 15076 + }, + { + "epoch": 4.62768569674647, + "grad_norm": 0.22202003002166748, + "learning_rate": 5.842448530990999e-05, + "loss": 1.779, + "step": 15077 + }, + { + "epoch": 4.627992633517495, + "grad_norm": 0.26784011721611023, + "learning_rate": 5.841958577047092e-05, + 
"loss": 1.799, + "step": 15078 + }, + { + "epoch": 4.628299570288521, + "grad_norm": 0.3230212926864624, + "learning_rate": 5.841468614782457e-05, + "loss": 1.7789, + "step": 15079 + }, + { + "epoch": 4.628606507059546, + "grad_norm": 0.24062715470790863, + "learning_rate": 5.840978644201935e-05, + "loss": 1.7697, + "step": 15080 + }, + { + "epoch": 4.628913443830571, + "grad_norm": 0.2882130444049835, + "learning_rate": 5.84048866531037e-05, + "loss": 1.7946, + "step": 15081 + }, + { + "epoch": 4.629220380601596, + "grad_norm": 0.3145603537559509, + "learning_rate": 5.839998678112602e-05, + "loss": 1.7116, + "step": 15082 + }, + { + "epoch": 4.629527317372621, + "grad_norm": 0.270997017621994, + "learning_rate": 5.839508682613477e-05, + "loss": 1.8281, + "step": 15083 + }, + { + "epoch": 4.629834254143646, + "grad_norm": 0.27299395203590393, + "learning_rate": 5.839018678817834e-05, + "loss": 1.8233, + "step": 15084 + }, + { + "epoch": 4.630141190914672, + "grad_norm": 0.2684478461742401, + "learning_rate": 5.838528666730517e-05, + "loss": 1.8111, + "step": 15085 + }, + { + "epoch": 4.630448127685697, + "grad_norm": 0.2365201860666275, + "learning_rate": 5.838038646356367e-05, + "loss": 1.7475, + "step": 15086 + }, + { + "epoch": 4.6307550644567215, + "grad_norm": 0.2661258280277252, + "learning_rate": 5.8375486177002305e-05, + "loss": 1.748, + "step": 15087 + }, + { + "epoch": 4.631062001227747, + "grad_norm": 0.2865012586116791, + "learning_rate": 5.8370585807669455e-05, + "loss": 1.7525, + "step": 15088 + }, + { + "epoch": 4.631368937998772, + "grad_norm": 0.2445172518491745, + "learning_rate": 5.836568535561358e-05, + "loss": 1.7278, + "step": 15089 + }, + { + "epoch": 4.6316758747697975, + "grad_norm": 0.28192558884620667, + "learning_rate": 5.8360784820883083e-05, + "loss": 1.7371, + "step": 15090 + }, + { + "epoch": 4.631982811540823, + "grad_norm": 0.38927358388900757, + "learning_rate": 5.835588420352642e-05, + "loss": 1.8088, + "step": 15091 + }, + { + 
"epoch": 4.632289748311848, + "grad_norm": 0.3409229516983032, + "learning_rate": 5.8350983503592025e-05, + "loss": 1.8011, + "step": 15092 + }, + { + "epoch": 4.632596685082873, + "grad_norm": 0.2464994341135025, + "learning_rate": 5.8346082721128294e-05, + "loss": 1.8354, + "step": 15093 + }, + { + "epoch": 4.632903621853898, + "grad_norm": 0.38765814900398254, + "learning_rate": 5.834118185618369e-05, + "loss": 1.7811, + "step": 15094 + }, + { + "epoch": 4.633210558624923, + "grad_norm": 0.42435070872306824, + "learning_rate": 5.833628090880664e-05, + "loss": 1.7855, + "step": 15095 + }, + { + "epoch": 4.633517495395949, + "grad_norm": 0.244876891374588, + "learning_rate": 5.833137987904558e-05, + "loss": 1.7494, + "step": 15096 + }, + { + "epoch": 4.633824432166974, + "grad_norm": 0.30353477597236633, + "learning_rate": 5.8326478766948934e-05, + "loss": 1.7772, + "step": 15097 + }, + { + "epoch": 4.634131368937998, + "grad_norm": 0.38839244842529297, + "learning_rate": 5.8321577572565146e-05, + "loss": 1.7689, + "step": 15098 + }, + { + "epoch": 4.634438305709024, + "grad_norm": 0.357129842042923, + "learning_rate": 5.8316676295942644e-05, + "loss": 1.7777, + "step": 15099 + }, + { + "epoch": 4.634745242480049, + "grad_norm": 0.23458799719810486, + "learning_rate": 5.831177493712988e-05, + "loss": 1.7544, + "step": 15100 + }, + { + "epoch": 4.635052179251074, + "grad_norm": 0.23751308023929596, + "learning_rate": 5.830687349617529e-05, + "loss": 1.7491, + "step": 15101 + }, + { + "epoch": 4.6353591160221, + "grad_norm": 0.31978943943977356, + "learning_rate": 5.83019719731273e-05, + "loss": 1.7439, + "step": 15102 + }, + { + "epoch": 4.635666052793125, + "grad_norm": 0.2751142084598541, + "learning_rate": 5.829707036803438e-05, + "loss": 1.8598, + "step": 15103 + }, + { + "epoch": 4.6359729895641495, + "grad_norm": 0.23670406639575958, + "learning_rate": 5.8292168680944914e-05, + "loss": 1.7629, + "step": 15104 + }, + { + "epoch": 4.636279926335175, + 
"grad_norm": 0.2447349727153778, + "learning_rate": 5.828726691190739e-05, + "loss": 1.7606, + "step": 15105 + }, + { + "epoch": 4.6365868631062, + "grad_norm": 0.2739902436733246, + "learning_rate": 5.828236506097023e-05, + "loss": 1.707, + "step": 15106 + }, + { + "epoch": 4.6368937998772255, + "grad_norm": 0.2050863653421402, + "learning_rate": 5.82774631281819e-05, + "loss": 1.7235, + "step": 15107 + }, + { + "epoch": 4.637200736648251, + "grad_norm": 0.3005560338497162, + "learning_rate": 5.827256111359082e-05, + "loss": 1.7785, + "step": 15108 + }, + { + "epoch": 4.637507673419275, + "grad_norm": 0.27168264985084534, + "learning_rate": 5.8267659017245434e-05, + "loss": 1.7844, + "step": 15109 + }, + { + "epoch": 4.637814610190301, + "grad_norm": 0.2965840995311737, + "learning_rate": 5.82627568391942e-05, + "loss": 1.7631, + "step": 15110 + }, + { + "epoch": 4.638121546961326, + "grad_norm": 0.3114408552646637, + "learning_rate": 5.825785457948556e-05, + "loss": 1.77, + "step": 15111 + }, + { + "epoch": 4.638428483732351, + "grad_norm": 0.2638910114765167, + "learning_rate": 5.825295223816796e-05, + "loss": 1.9183, + "step": 15112 + }, + { + "epoch": 4.638735420503377, + "grad_norm": 0.3293665051460266, + "learning_rate": 5.824804981528986e-05, + "loss": 1.6779, + "step": 15113 + }, + { + "epoch": 4.639042357274402, + "grad_norm": 0.28586456179618835, + "learning_rate": 5.824314731089968e-05, + "loss": 1.7905, + "step": 15114 + }, + { + "epoch": 4.639349294045426, + "grad_norm": 0.2254554182291031, + "learning_rate": 5.8238244725045906e-05, + "loss": 1.7602, + "step": 15115 + }, + { + "epoch": 4.639656230816452, + "grad_norm": 0.2770406901836395, + "learning_rate": 5.823334205777695e-05, + "loss": 1.7789, + "step": 15116 + }, + { + "epoch": 4.639963167587477, + "grad_norm": 0.2867025136947632, + "learning_rate": 5.822843930914129e-05, + "loss": 1.7408, + "step": 15117 + }, + { + "epoch": 4.640270104358502, + "grad_norm": 0.23486989736557007, + 
"learning_rate": 5.822353647918737e-05, + "loss": 1.7489, + "step": 15118 + }, + { + "epoch": 4.640577041129527, + "grad_norm": 0.2274324595928192, + "learning_rate": 5.821863356796367e-05, + "loss": 1.768, + "step": 15119 + }, + { + "epoch": 4.640883977900552, + "grad_norm": 0.25032591819763184, + "learning_rate": 5.821373057551858e-05, + "loss": 1.7602, + "step": 15120 + }, + { + "epoch": 4.6411909146715775, + "grad_norm": 0.22332963347434998, + "learning_rate": 5.820882750190059e-05, + "loss": 1.756, + "step": 15121 + }, + { + "epoch": 4.641497851442603, + "grad_norm": 0.24975591897964478, + "learning_rate": 5.820392434715817e-05, + "loss": 1.6963, + "step": 15122 + }, + { + "epoch": 4.641804788213628, + "grad_norm": 0.27892687916755676, + "learning_rate": 5.819902111133976e-05, + "loss": 1.8295, + "step": 15123 + }, + { + "epoch": 4.6421117249846535, + "grad_norm": 0.23914897441864014, + "learning_rate": 5.819411779449381e-05, + "loss": 1.7636, + "step": 15124 + }, + { + "epoch": 4.642418661755678, + "grad_norm": 0.2349565476179123, + "learning_rate": 5.818921439666879e-05, + "loss": 1.7823, + "step": 15125 + }, + { + "epoch": 4.642725598526703, + "grad_norm": 0.2075800597667694, + "learning_rate": 5.818431091791315e-05, + "loss": 1.7282, + "step": 15126 + }, + { + "epoch": 4.643032535297729, + "grad_norm": 0.19781073927879333, + "learning_rate": 5.817940735827535e-05, + "loss": 1.7598, + "step": 15127 + }, + { + "epoch": 4.643339472068754, + "grad_norm": 0.21997439861297607, + "learning_rate": 5.8174503717803866e-05, + "loss": 1.766, + "step": 15128 + }, + { + "epoch": 4.643646408839779, + "grad_norm": 0.23971444368362427, + "learning_rate": 5.816959999654713e-05, + "loss": 1.7824, + "step": 15129 + }, + { + "epoch": 4.643953345610804, + "grad_norm": 0.23357853293418884, + "learning_rate": 5.816469619455363e-05, + "loss": 1.7353, + "step": 15130 + }, + { + "epoch": 4.644260282381829, + "grad_norm": 0.22030897438526154, + "learning_rate": 5.815979231187181e-05, 
+ "loss": 1.7413, + "step": 15131 + }, + { + "epoch": 4.644567219152854, + "grad_norm": 0.2322571873664856, + "learning_rate": 5.815488834855014e-05, + "loss": 1.7305, + "step": 15132 + }, + { + "epoch": 4.64487415592388, + "grad_norm": 0.25256821513175964, + "learning_rate": 5.814998430463709e-05, + "loss": 1.7533, + "step": 15133 + }, + { + "epoch": 4.645181092694905, + "grad_norm": 0.248504638671875, + "learning_rate": 5.81450801801811e-05, + "loss": 1.7345, + "step": 15134 + }, + { + "epoch": 4.64548802946593, + "grad_norm": 0.22850964963436127, + "learning_rate": 5.8140175975230673e-05, + "loss": 1.8308, + "step": 15135 + }, + { + "epoch": 4.645794966236955, + "grad_norm": 0.3517951965332031, + "learning_rate": 5.813527168983426e-05, + "loss": 1.811, + "step": 15136 + }, + { + "epoch": 4.64610190300798, + "grad_norm": 0.32132068276405334, + "learning_rate": 5.813036732404031e-05, + "loss": 1.7584, + "step": 15137 + }, + { + "epoch": 4.6464088397790055, + "grad_norm": 0.2349396049976349, + "learning_rate": 5.812546287789731e-05, + "loss": 1.7762, + "step": 15138 + }, + { + "epoch": 4.646715776550031, + "grad_norm": 0.23519493639469147, + "learning_rate": 5.812055835145372e-05, + "loss": 1.7428, + "step": 15139 + }, + { + "epoch": 4.647022713321056, + "grad_norm": 0.29277852177619934, + "learning_rate": 5.8115653744758016e-05, + "loss": 1.7599, + "step": 15140 + }, + { + "epoch": 4.647329650092081, + "grad_norm": 0.2347593754529953, + "learning_rate": 5.811074905785867e-05, + "loss": 1.7401, + "step": 15141 + }, + { + "epoch": 4.647636586863106, + "grad_norm": 0.23080264031887054, + "learning_rate": 5.8105844290804147e-05, + "loss": 1.7705, + "step": 15142 + }, + { + "epoch": 4.647943523634131, + "grad_norm": 0.24686801433563232, + "learning_rate": 5.810093944364291e-05, + "loss": 1.7409, + "step": 15143 + }, + { + "epoch": 4.648250460405157, + "grad_norm": 0.24098120629787445, + "learning_rate": 5.809603451642344e-05, + "loss": 1.7893, + "step": 15144 + }, + { 
+ "epoch": 4.648557397176182, + "grad_norm": 0.23020638525485992, + "learning_rate": 5.809112950919422e-05, + "loss": 1.7589, + "step": 15145 + }, + { + "epoch": 4.648864333947207, + "grad_norm": 0.3036736249923706, + "learning_rate": 5.808622442200371e-05, + "loss": 1.7964, + "step": 15146 + }, + { + "epoch": 4.649171270718232, + "grad_norm": 0.2965635657310486, + "learning_rate": 5.808131925490039e-05, + "loss": 1.7986, + "step": 15147 + }, + { + "epoch": 4.649478207489257, + "grad_norm": 0.22241640090942383, + "learning_rate": 5.8076414007932745e-05, + "loss": 1.749, + "step": 15148 + }, + { + "epoch": 4.649785144260282, + "grad_norm": 0.20304246246814728, + "learning_rate": 5.8071508681149246e-05, + "loss": 1.7374, + "step": 15149 + }, + { + "epoch": 4.650092081031308, + "grad_norm": 0.19534410536289215, + "learning_rate": 5.806660327459834e-05, + "loss": 1.7087, + "step": 15150 + }, + { + "epoch": 4.650399017802332, + "grad_norm": 0.2151753008365631, + "learning_rate": 5.806169778832856e-05, + "loss": 1.7409, + "step": 15151 + }, + { + "epoch": 4.650705954573358, + "grad_norm": 0.2180301696062088, + "learning_rate": 5.805679222238836e-05, + "loss": 1.7522, + "step": 15152 + }, + { + "epoch": 4.651012891344383, + "grad_norm": 0.19917607307434082, + "learning_rate": 5.8051886576826205e-05, + "loss": 1.768, + "step": 15153 + }, + { + "epoch": 4.651319828115408, + "grad_norm": 0.2312052994966507, + "learning_rate": 5.804698085169059e-05, + "loss": 1.7799, + "step": 15154 + }, + { + "epoch": 4.651626764886434, + "grad_norm": 0.21541514992713928, + "learning_rate": 5.804207504702999e-05, + "loss": 1.7595, + "step": 15155 + }, + { + "epoch": 4.651933701657459, + "grad_norm": 0.2029450386762619, + "learning_rate": 5.803716916289289e-05, + "loss": 1.7727, + "step": 15156 + }, + { + "epoch": 4.652240638428484, + "grad_norm": 0.21796850860118866, + "learning_rate": 5.8032263199327787e-05, + "loss": 1.7445, + "step": 15157 + }, + { + "epoch": 4.652547575199509, + 
"grad_norm": 0.20309078693389893, + "learning_rate": 5.802735715638314e-05, + "loss": 1.6971, + "step": 15158 + }, + { + "epoch": 4.652854511970534, + "grad_norm": 0.21270112693309784, + "learning_rate": 5.802245103410745e-05, + "loss": 1.7162, + "step": 15159 + }, + { + "epoch": 4.653161448741559, + "grad_norm": 0.25357750058174133, + "learning_rate": 5.8017544832549184e-05, + "loss": 1.7534, + "step": 15160 + }, + { + "epoch": 4.653468385512585, + "grad_norm": 0.24015015363693237, + "learning_rate": 5.8012638551756847e-05, + "loss": 1.7639, + "step": 15161 + }, + { + "epoch": 4.653775322283609, + "grad_norm": 0.20507018268108368, + "learning_rate": 5.800773219177893e-05, + "loss": 1.7293, + "step": 15162 + }, + { + "epoch": 4.6540822590546345, + "grad_norm": 0.23399868607521057, + "learning_rate": 5.800282575266389e-05, + "loss": 1.8286, + "step": 15163 + }, + { + "epoch": 4.65438919582566, + "grad_norm": 0.27126726508140564, + "learning_rate": 5.799791923446025e-05, + "loss": 1.8028, + "step": 15164 + }, + { + "epoch": 4.654696132596685, + "grad_norm": 0.23644569516181946, + "learning_rate": 5.7993012637216494e-05, + "loss": 1.7138, + "step": 15165 + }, + { + "epoch": 4.6550030693677105, + "grad_norm": 0.21557916700839996, + "learning_rate": 5.7988105960981086e-05, + "loss": 1.7703, + "step": 15166 + }, + { + "epoch": 4.655310006138736, + "grad_norm": 0.22030150890350342, + "learning_rate": 5.798319920580254e-05, + "loss": 1.7282, + "step": 15167 + }, + { + "epoch": 4.65561694290976, + "grad_norm": 0.2092939168214798, + "learning_rate": 5.7978292371729325e-05, + "loss": 1.7853, + "step": 15168 + }, + { + "epoch": 4.655923879680786, + "grad_norm": 0.21643707156181335, + "learning_rate": 5.797338545880997e-05, + "loss": 1.7582, + "step": 15169 + }, + { + "epoch": 4.656230816451811, + "grad_norm": 0.3064669668674469, + "learning_rate": 5.796847846709294e-05, + "loss": 1.8139, + "step": 15170 + }, + { + "epoch": 4.656537753222836, + "grad_norm": 0.3060479760169983, 
+ "learning_rate": 5.796357139662674e-05, + "loss": 1.7356, + "step": 15171 + }, + { + "epoch": 4.656844689993862, + "grad_norm": 0.23546656966209412, + "learning_rate": 5.7958664247459835e-05, + "loss": 1.7937, + "step": 15172 + }, + { + "epoch": 4.657151626764886, + "grad_norm": 0.2890888750553131, + "learning_rate": 5.795375701964077e-05, + "loss": 1.7305, + "step": 15173 + }, + { + "epoch": 4.657458563535911, + "grad_norm": 0.27948084473609924, + "learning_rate": 5.794884971321801e-05, + "loss": 1.7428, + "step": 15174 + }, + { + "epoch": 4.657765500306937, + "grad_norm": 0.2354089468717575, + "learning_rate": 5.794394232824007e-05, + "loss": 1.7622, + "step": 15175 + }, + { + "epoch": 4.658072437077962, + "grad_norm": 0.3271159827709198, + "learning_rate": 5.793903486475541e-05, + "loss": 1.7826, + "step": 15176 + }, + { + "epoch": 4.658379373848987, + "grad_norm": 0.3561338782310486, + "learning_rate": 5.793412732281257e-05, + "loss": 1.7698, + "step": 15177 + }, + { + "epoch": 4.658686310620013, + "grad_norm": 0.2913050949573517, + "learning_rate": 5.7929219702460035e-05, + "loss": 1.8156, + "step": 15178 + }, + { + "epoch": 4.658993247391037, + "grad_norm": 0.2345089465379715, + "learning_rate": 5.7924312003746294e-05, + "loss": 1.7859, + "step": 15179 + }, + { + "epoch": 4.6593001841620625, + "grad_norm": 0.3018132150173187, + "learning_rate": 5.7919404226719865e-05, + "loss": 1.7622, + "step": 15180 + }, + { + "epoch": 4.659607120933088, + "grad_norm": 0.29134172201156616, + "learning_rate": 5.791449637142924e-05, + "loss": 1.7287, + "step": 15181 + }, + { + "epoch": 4.659914057704113, + "grad_norm": 0.24126321077346802, + "learning_rate": 5.7909588437922924e-05, + "loss": 1.7969, + "step": 15182 + }, + { + "epoch": 4.6602209944751385, + "grad_norm": 0.27053284645080566, + "learning_rate": 5.7904680426249415e-05, + "loss": 1.7399, + "step": 15183 + }, + { + "epoch": 4.660527931246163, + "grad_norm": 0.2636512219905853, + "learning_rate": 
5.789977233645722e-05, + "loss": 1.7615, + "step": 15184 + }, + { + "epoch": 4.660834868017188, + "grad_norm": 0.2263207584619522, + "learning_rate": 5.789486416859484e-05, + "loss": 1.7668, + "step": 15185 + }, + { + "epoch": 4.661141804788214, + "grad_norm": 0.25387826561927795, + "learning_rate": 5.78899559227108e-05, + "loss": 1.7594, + "step": 15186 + }, + { + "epoch": 4.661448741559239, + "grad_norm": 0.2268977165222168, + "learning_rate": 5.7885047598853596e-05, + "loss": 1.75, + "step": 15187 + }, + { + "epoch": 4.661755678330264, + "grad_norm": 0.29093095660209656, + "learning_rate": 5.788013919707172e-05, + "loss": 1.7291, + "step": 15188 + }, + { + "epoch": 4.66206261510129, + "grad_norm": 0.26578736305236816, + "learning_rate": 5.7875230717413684e-05, + "loss": 1.7276, + "step": 15189 + }, + { + "epoch": 4.662369551872314, + "grad_norm": 0.2548983097076416, + "learning_rate": 5.7870322159928e-05, + "loss": 1.755, + "step": 15190 + }, + { + "epoch": 4.662676488643339, + "grad_norm": 0.2246701419353485, + "learning_rate": 5.7865413524663184e-05, + "loss": 1.751, + "step": 15191 + }, + { + "epoch": 4.662983425414365, + "grad_norm": 0.3069002032279968, + "learning_rate": 5.7860504811667747e-05, + "loss": 1.7522, + "step": 15192 + }, + { + "epoch": 4.66329036218539, + "grad_norm": 0.3081241250038147, + "learning_rate": 5.7855596020990186e-05, + "loss": 1.7152, + "step": 15193 + }, + { + "epoch": 4.6635972989564145, + "grad_norm": 0.29006731510162354, + "learning_rate": 5.7850687152679026e-05, + "loss": 1.8471, + "step": 15194 + }, + { + "epoch": 4.66390423572744, + "grad_norm": 0.24131664633750916, + "learning_rate": 5.7845778206782786e-05, + "loss": 1.763, + "step": 15195 + }, + { + "epoch": 4.664211172498465, + "grad_norm": 0.21808001399040222, + "learning_rate": 5.784086918334994e-05, + "loss": 1.6989, + "step": 15196 + }, + { + "epoch": 4.6645181092694905, + "grad_norm": 0.2413240373134613, + "learning_rate": 5.783596008242904e-05, + "loss": 1.7869, + 
"step": 15197 + }, + { + "epoch": 4.664825046040516, + "grad_norm": 0.23310934007167816, + "learning_rate": 5.7831050904068594e-05, + "loss": 1.8017, + "step": 15198 + }, + { + "epoch": 4.665131982811541, + "grad_norm": 0.2577926814556122, + "learning_rate": 5.7826141648317125e-05, + "loss": 1.6938, + "step": 15199 + }, + { + "epoch": 4.665438919582566, + "grad_norm": 0.22523443400859833, + "learning_rate": 5.782123231522312e-05, + "loss": 1.8104, + "step": 15200 + }, + { + "epoch": 4.665745856353591, + "grad_norm": 0.23603026568889618, + "learning_rate": 5.781632290483512e-05, + "loss": 1.7484, + "step": 15201 + }, + { + "epoch": 4.666052793124616, + "grad_norm": 0.23195989429950714, + "learning_rate": 5.781141341720162e-05, + "loss": 1.7786, + "step": 15202 + }, + { + "epoch": 4.666359729895642, + "grad_norm": 0.21838274598121643, + "learning_rate": 5.780650385237118e-05, + "loss": 1.7509, + "step": 15203 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.26656514406204224, + "learning_rate": 5.780159421039229e-05, + "loss": 1.7875, + "step": 15204 + }, + { + "epoch": 4.666973603437691, + "grad_norm": 0.2293243706226349, + "learning_rate": 5.7796684491313456e-05, + "loss": 1.7518, + "step": 15205 + }, + { + "epoch": 4.667280540208717, + "grad_norm": 0.24190817773342133, + "learning_rate": 5.779177469518323e-05, + "loss": 1.7593, + "step": 15206 + }, + { + "epoch": 4.667587476979742, + "grad_norm": 0.31113871932029724, + "learning_rate": 5.77868648220501e-05, + "loss": 1.7911, + "step": 15207 + }, + { + "epoch": 4.667894413750767, + "grad_norm": 0.2875262498855591, + "learning_rate": 5.778195487196263e-05, + "loss": 1.7871, + "step": 15208 + }, + { + "epoch": 4.668201350521793, + "grad_norm": 0.2172149419784546, + "learning_rate": 5.777704484496931e-05, + "loss": 1.7592, + "step": 15209 + }, + { + "epoch": 4.668508287292818, + "grad_norm": 0.3282458186149597, + "learning_rate": 5.7772134741118675e-05, + "loss": 1.7687, + "step": 15210 + }, + { + "epoch": 
4.6688152240638425, + "grad_norm": 0.36963000893592834, + "learning_rate": 5.7767224560459255e-05, + "loss": 1.812, + "step": 15211 + }, + { + "epoch": 4.669122160834868, + "grad_norm": 0.22387740015983582, + "learning_rate": 5.776231430303957e-05, + "loss": 1.7449, + "step": 15212 + }, + { + "epoch": 4.669429097605893, + "grad_norm": 0.21468734741210938, + "learning_rate": 5.775740396890813e-05, + "loss": 1.716, + "step": 15213 + }, + { + "epoch": 4.6697360343769185, + "grad_norm": 0.2478475719690323, + "learning_rate": 5.7752493558113486e-05, + "loss": 1.7182, + "step": 15214 + }, + { + "epoch": 4.670042971147944, + "grad_norm": 0.20924845337867737, + "learning_rate": 5.774758307070416e-05, + "loss": 1.784, + "step": 15215 + }, + { + "epoch": 4.670349907918968, + "grad_norm": 0.2933209538459778, + "learning_rate": 5.774267250672868e-05, + "loss": 1.8375, + "step": 15216 + }, + { + "epoch": 4.670656844689994, + "grad_norm": 0.2744538486003876, + "learning_rate": 5.7737761866235565e-05, + "loss": 1.7019, + "step": 15217 + }, + { + "epoch": 4.670963781461019, + "grad_norm": 0.20991720259189606, + "learning_rate": 5.773285114927336e-05, + "loss": 1.7189, + "step": 15218 + }, + { + "epoch": 4.671270718232044, + "grad_norm": 0.2873254716396332, + "learning_rate": 5.772794035589057e-05, + "loss": 1.7492, + "step": 15219 + }, + { + "epoch": 4.67157765500307, + "grad_norm": 0.2781519591808319, + "learning_rate": 5.772302948613576e-05, + "loss": 1.7342, + "step": 15220 + }, + { + "epoch": 4.671884591774095, + "grad_norm": 0.23288768529891968, + "learning_rate": 5.7718118540057455e-05, + "loss": 1.7245, + "step": 15221 + }, + { + "epoch": 4.672191528545119, + "grad_norm": 0.40817564725875854, + "learning_rate": 5.771320751770417e-05, + "loss": 1.7659, + "step": 15222 + }, + { + "epoch": 4.672498465316145, + "grad_norm": 0.45521771907806396, + "learning_rate": 5.770829641912444e-05, + "loss": 1.7875, + "step": 15223 + }, + { + "epoch": 4.67280540208717, + "grad_norm": 
0.22353248298168182, + "learning_rate": 5.77033852443668e-05, + "loss": 1.7098, + "step": 15224 + }, + { + "epoch": 4.673112338858195, + "grad_norm": 0.4066791534423828, + "learning_rate": 5.769847399347981e-05, + "loss": 1.7277, + "step": 15225 + }, + { + "epoch": 4.67341927562922, + "grad_norm": 0.4299545884132385, + "learning_rate": 5.769356266651198e-05, + "loss": 1.7777, + "step": 15226 + }, + { + "epoch": 4.673726212400245, + "grad_norm": 0.21037638187408447, + "learning_rate": 5.768865126351186e-05, + "loss": 1.7263, + "step": 15227 + }, + { + "epoch": 4.6740331491712706, + "grad_norm": 0.3390437066555023, + "learning_rate": 5.768373978452798e-05, + "loss": 1.7457, + "step": 15228 + }, + { + "epoch": 4.674340085942296, + "grad_norm": 0.40003323554992676, + "learning_rate": 5.767882822960887e-05, + "loss": 1.8137, + "step": 15229 + }, + { + "epoch": 4.674647022713321, + "grad_norm": 0.2212848961353302, + "learning_rate": 5.767391659880308e-05, + "loss": 1.7131, + "step": 15230 + }, + { + "epoch": 4.6749539594843466, + "grad_norm": 0.30634984374046326, + "learning_rate": 5.766900489215915e-05, + "loss": 1.7775, + "step": 15231 + }, + { + "epoch": 4.675260896255372, + "grad_norm": 0.31412798166275024, + "learning_rate": 5.766409310972563e-05, + "loss": 1.7383, + "step": 15232 + }, + { + "epoch": 4.675567833026396, + "grad_norm": 0.21125225722789764, + "learning_rate": 5.7659181251551045e-05, + "loss": 1.8046, + "step": 15233 + }, + { + "epoch": 4.675874769797422, + "grad_norm": 0.3234494924545288, + "learning_rate": 5.765426931768394e-05, + "loss": 1.7838, + "step": 15234 + }, + { + "epoch": 4.676181706568447, + "grad_norm": 0.2668779194355011, + "learning_rate": 5.764935730817286e-05, + "loss": 1.7464, + "step": 15235 + }, + { + "epoch": 4.676488643339472, + "grad_norm": 0.22423583269119263, + "learning_rate": 5.764444522306633e-05, + "loss": 1.7165, + "step": 15236 + }, + { + "epoch": 4.676795580110497, + "grad_norm": 0.29066675901412964, + "learning_rate": 
5.7639533062412945e-05, + "loss": 1.75, + "step": 15237 + }, + { + "epoch": 4.677102516881522, + "grad_norm": 0.2963598370552063, + "learning_rate": 5.76346208262612e-05, + "loss": 1.8168, + "step": 15238 + }, + { + "epoch": 4.6774094536525475, + "grad_norm": 0.21484358608722687, + "learning_rate": 5.7629708514659655e-05, + "loss": 1.71, + "step": 15239 + }, + { + "epoch": 4.677716390423573, + "grad_norm": 0.20657925307750702, + "learning_rate": 5.762479612765686e-05, + "loss": 1.7239, + "step": 15240 + }, + { + "epoch": 4.678023327194598, + "grad_norm": 0.21336235105991364, + "learning_rate": 5.761988366530136e-05, + "loss": 1.7952, + "step": 15241 + }, + { + "epoch": 4.6783302639656235, + "grad_norm": 0.24156586825847626, + "learning_rate": 5.7614971127641696e-05, + "loss": 1.7709, + "step": 15242 + }, + { + "epoch": 4.678637200736648, + "grad_norm": 0.2633824944496155, + "learning_rate": 5.761005851472643e-05, + "loss": 1.7404, + "step": 15243 + }, + { + "epoch": 4.678944137507673, + "grad_norm": 0.23302829265594482, + "learning_rate": 5.760514582660411e-05, + "loss": 1.7006, + "step": 15244 + }, + { + "epoch": 4.679251074278699, + "grad_norm": 0.22404874861240387, + "learning_rate": 5.7600233063323283e-05, + "loss": 1.7731, + "step": 15245 + }, + { + "epoch": 4.679558011049724, + "grad_norm": 0.23217839002609253, + "learning_rate": 5.7595320224932495e-05, + "loss": 1.7452, + "step": 15246 + }, + { + "epoch": 4.679864947820749, + "grad_norm": 0.23131491243839264, + "learning_rate": 5.7590407311480296e-05, + "loss": 1.7547, + "step": 15247 + }, + { + "epoch": 4.680171884591774, + "grad_norm": 0.21907350420951843, + "learning_rate": 5.7585494323015245e-05, + "loss": 1.7556, + "step": 15248 + }, + { + "epoch": 4.680478821362799, + "grad_norm": 0.22416768968105316, + "learning_rate": 5.7580581259585895e-05, + "loss": 1.7783, + "step": 15249 + }, + { + "epoch": 4.680785758133824, + "grad_norm": 0.20203055441379547, + "learning_rate": 5.75756681212408e-05, + "loss": 
1.7285, + "step": 15250 + }, + { + "epoch": 4.68109269490485, + "grad_norm": 0.27838602662086487, + "learning_rate": 5.75707549080285e-05, + "loss": 1.7489, + "step": 15251 + }, + { + "epoch": 4.681399631675875, + "grad_norm": 0.2415023297071457, + "learning_rate": 5.7565841619997586e-05, + "loss": 1.7453, + "step": 15252 + }, + { + "epoch": 4.6817065684469, + "grad_norm": 0.22986920177936554, + "learning_rate": 5.756092825719658e-05, + "loss": 1.7315, + "step": 15253 + }, + { + "epoch": 4.682013505217925, + "grad_norm": 0.2427850216627121, + "learning_rate": 5.755601481967404e-05, + "loss": 1.772, + "step": 15254 + }, + { + "epoch": 4.68232044198895, + "grad_norm": 0.24556589126586914, + "learning_rate": 5.755110130747854e-05, + "loss": 1.7475, + "step": 15255 + }, + { + "epoch": 4.6826273787599755, + "grad_norm": 0.25252529978752136, + "learning_rate": 5.754618772065864e-05, + "loss": 1.7152, + "step": 15256 + }, + { + "epoch": 4.682934315531001, + "grad_norm": 0.24599005281925201, + "learning_rate": 5.754127405926287e-05, + "loss": 1.7911, + "step": 15257 + }, + { + "epoch": 4.683241252302026, + "grad_norm": 0.18961480259895325, + "learning_rate": 5.7536360323339836e-05, + "loss": 1.681, + "step": 15258 + }, + { + "epoch": 4.683548189073051, + "grad_norm": 0.24372327327728271, + "learning_rate": 5.7531446512938035e-05, + "loss": 1.7771, + "step": 15259 + }, + { + "epoch": 4.683855125844076, + "grad_norm": 0.23239269852638245, + "learning_rate": 5.752653262810609e-05, + "loss": 1.7502, + "step": 15260 + }, + { + "epoch": 4.684162062615101, + "grad_norm": 0.25076135993003845, + "learning_rate": 5.752161866889254e-05, + "loss": 1.7974, + "step": 15261 + }, + { + "epoch": 4.684468999386127, + "grad_norm": 0.2703748941421509, + "learning_rate": 5.7516704635345945e-05, + "loss": 1.7245, + "step": 15262 + }, + { + "epoch": 4.684775936157152, + "grad_norm": 0.19247616827487946, + "learning_rate": 5.751179052751487e-05, + "loss": 1.7105, + "step": 15263 + }, + { + 
"epoch": 4.685082872928177, + "grad_norm": 0.23166817426681519, + "learning_rate": 5.750687634544787e-05, + "loss": 1.8026, + "step": 15264 + }, + { + "epoch": 4.685389809699202, + "grad_norm": 0.22434166073799133, + "learning_rate": 5.7501962089193507e-05, + "loss": 1.7779, + "step": 15265 + }, + { + "epoch": 4.685696746470227, + "grad_norm": 0.190699502825737, + "learning_rate": 5.749704775880037e-05, + "loss": 1.726, + "step": 15266 + }, + { + "epoch": 4.686003683241252, + "grad_norm": 0.22995290160179138, + "learning_rate": 5.749213335431702e-05, + "loss": 1.7495, + "step": 15267 + }, + { + "epoch": 4.686310620012278, + "grad_norm": 0.2712057828903198, + "learning_rate": 5.7487218875792016e-05, + "loss": 1.7862, + "step": 15268 + }, + { + "epoch": 4.686617556783302, + "grad_norm": 0.2524562180042267, + "learning_rate": 5.7482304323273913e-05, + "loss": 1.7092, + "step": 15269 + }, + { + "epoch": 4.6869244935543275, + "grad_norm": 0.23810559511184692, + "learning_rate": 5.747738969681131e-05, + "loss": 1.8049, + "step": 15270 + }, + { + "epoch": 4.687231430325353, + "grad_norm": 0.25521910190582275, + "learning_rate": 5.747247499645275e-05, + "loss": 1.8124, + "step": 15271 + }, + { + "epoch": 4.687538367096378, + "grad_norm": 0.27797845005989075, + "learning_rate": 5.746756022224682e-05, + "loss": 1.7694, + "step": 15272 + }, + { + "epoch": 4.6878453038674035, + "grad_norm": 0.23849260807037354, + "learning_rate": 5.746264537424208e-05, + "loss": 1.7771, + "step": 15273 + }, + { + "epoch": 4.688152240638429, + "grad_norm": 0.24368882179260254, + "learning_rate": 5.74577304524871e-05, + "loss": 1.8143, + "step": 15274 + }, + { + "epoch": 4.688459177409453, + "grad_norm": 0.2712198793888092, + "learning_rate": 5.745281545703045e-05, + "loss": 1.7683, + "step": 15275 + }, + { + "epoch": 4.688766114180479, + "grad_norm": 0.30913081765174866, + "learning_rate": 5.7447900387920716e-05, + "loss": 1.7111, + "step": 15276 + }, + { + "epoch": 4.689073050951504, + 
"grad_norm": 0.22123363614082336, + "learning_rate": 5.744298524520646e-05, + "loss": 1.7466, + "step": 15277 + }, + { + "epoch": 4.689379987722529, + "grad_norm": 0.32836318016052246, + "learning_rate": 5.743807002893628e-05, + "loss": 1.8083, + "step": 15278 + }, + { + "epoch": 4.689686924493555, + "grad_norm": 0.33319979906082153, + "learning_rate": 5.743315473915871e-05, + "loss": 1.7122, + "step": 15279 + }, + { + "epoch": 4.689993861264579, + "grad_norm": 0.252163290977478, + "learning_rate": 5.742823937592236e-05, + "loss": 1.7599, + "step": 15280 + }, + { + "epoch": 4.690300798035604, + "grad_norm": 0.23248571157455444, + "learning_rate": 5.7423323939275797e-05, + "loss": 1.7791, + "step": 15281 + }, + { + "epoch": 4.69060773480663, + "grad_norm": 0.27024057507514954, + "learning_rate": 5.741840842926759e-05, + "loss": 1.7608, + "step": 15282 + }, + { + "epoch": 4.690914671577655, + "grad_norm": 0.21888256072998047, + "learning_rate": 5.7413492845946326e-05, + "loss": 1.7407, + "step": 15283 + }, + { + "epoch": 4.69122160834868, + "grad_norm": 0.2574782073497772, + "learning_rate": 5.740857718936058e-05, + "loss": 1.707, + "step": 15284 + }, + { + "epoch": 4.691528545119706, + "grad_norm": 0.2541569769382477, + "learning_rate": 5.740366145955893e-05, + "loss": 1.7301, + "step": 15285 + }, + { + "epoch": 4.69183548189073, + "grad_norm": 0.23484647274017334, + "learning_rate": 5.7398745656589955e-05, + "loss": 1.772, + "step": 15286 + }, + { + "epoch": 4.6921424186617555, + "grad_norm": 0.2827093005180359, + "learning_rate": 5.739382978050225e-05, + "loss": 1.7745, + "step": 15287 + }, + { + "epoch": 4.692449355432781, + "grad_norm": 0.300387978553772, + "learning_rate": 5.738891383134437e-05, + "loss": 1.7966, + "step": 15288 + }, + { + "epoch": 4.692756292203806, + "grad_norm": 0.2414523959159851, + "learning_rate": 5.7383997809164926e-05, + "loss": 1.7355, + "step": 15289 + }, + { + "epoch": 4.6930632289748315, + "grad_norm": 0.21221841871738434, + 
"learning_rate": 5.737908171401248e-05, + "loss": 1.7935, + "step": 15290 + }, + { + "epoch": 4.693370165745856, + "grad_norm": 0.23488084971904755, + "learning_rate": 5.737416554593563e-05, + "loss": 1.7447, + "step": 15291 + }, + { + "epoch": 4.693677102516881, + "grad_norm": 0.26176631450653076, + "learning_rate": 5.7369249304982954e-05, + "loss": 1.769, + "step": 15292 + }, + { + "epoch": 4.693984039287907, + "grad_norm": 0.23060615360736847, + "learning_rate": 5.736433299120303e-05, + "loss": 1.7344, + "step": 15293 + }, + { + "epoch": 4.694290976058932, + "grad_norm": 0.2536846399307251, + "learning_rate": 5.7359416604644456e-05, + "loss": 1.7862, + "step": 15294 + }, + { + "epoch": 4.694597912829957, + "grad_norm": 0.23221342265605927, + "learning_rate": 5.735450014535581e-05, + "loss": 1.743, + "step": 15295 + }, + { + "epoch": 4.694904849600983, + "grad_norm": 0.25320062041282654, + "learning_rate": 5.734958361338568e-05, + "loss": 1.8001, + "step": 15296 + }, + { + "epoch": 4.695211786372007, + "grad_norm": 0.23132461309432983, + "learning_rate": 5.734466700878267e-05, + "loss": 1.7676, + "step": 15297 + }, + { + "epoch": 4.695518723143032, + "grad_norm": 0.2222728580236435, + "learning_rate": 5.7339750331595346e-05, + "loss": 1.7267, + "step": 15298 + }, + { + "epoch": 4.695825659914058, + "grad_norm": 0.2505118250846863, + "learning_rate": 5.733483358187231e-05, + "loss": 1.7467, + "step": 15299 + }, + { + "epoch": 4.696132596685083, + "grad_norm": 0.23609887063503265, + "learning_rate": 5.732991675966214e-05, + "loss": 1.7319, + "step": 15300 + }, + { + "epoch": 4.696439533456108, + "grad_norm": 0.2939738631248474, + "learning_rate": 5.732499986501345e-05, + "loss": 1.8676, + "step": 15301 + }, + { + "epoch": 4.696746470227133, + "grad_norm": 0.29868564009666443, + "learning_rate": 5.7320082897974814e-05, + "loss": 1.7541, + "step": 15302 + }, + { + "epoch": 4.697053406998158, + "grad_norm": 0.2366383820772171, + "learning_rate": 5.731516585859482e-05, 
+ "loss": 1.7531, + "step": 15303 + }, + { + "epoch": 4.6973603437691835, + "grad_norm": 0.2721317410469055, + "learning_rate": 5.731024874692208e-05, + "loss": 1.7444, + "step": 15304 + }, + { + "epoch": 4.697667280540209, + "grad_norm": 0.24925900995731354, + "learning_rate": 5.730533156300517e-05, + "loss": 1.7716, + "step": 15305 + }, + { + "epoch": 4.697974217311234, + "grad_norm": 0.23012754321098328, + "learning_rate": 5.7300414306892704e-05, + "loss": 1.7211, + "step": 15306 + }, + { + "epoch": 4.6982811540822595, + "grad_norm": 0.21274085342884064, + "learning_rate": 5.7295496978633254e-05, + "loss": 1.7853, + "step": 15307 + }, + { + "epoch": 4.698588090853284, + "grad_norm": 0.21799001097679138, + "learning_rate": 5.729057957827544e-05, + "loss": 1.7505, + "step": 15308 + }, + { + "epoch": 4.698895027624309, + "grad_norm": 0.22365793585777283, + "learning_rate": 5.728566210586783e-05, + "loss": 1.7934, + "step": 15309 + }, + { + "epoch": 4.699201964395335, + "grad_norm": 0.23325085639953613, + "learning_rate": 5.728074456145903e-05, + "loss": 1.7354, + "step": 15310 + }, + { + "epoch": 4.69950890116636, + "grad_norm": 0.2175164669752121, + "learning_rate": 5.7275826945097654e-05, + "loss": 1.7541, + "step": 15311 + }, + { + "epoch": 4.699815837937384, + "grad_norm": 0.24657388031482697, + "learning_rate": 5.727090925683231e-05, + "loss": 1.814, + "step": 15312 + }, + { + "epoch": 4.70012277470841, + "grad_norm": 0.2437550574541092, + "learning_rate": 5.726599149671156e-05, + "loss": 1.7234, + "step": 15313 + }, + { + "epoch": 4.700429711479435, + "grad_norm": 0.21053487062454224, + "learning_rate": 5.726107366478402e-05, + "loss": 1.7788, + "step": 15314 + }, + { + "epoch": 4.7007366482504604, + "grad_norm": 0.2007097452878952, + "learning_rate": 5.725615576109831e-05, + "loss": 1.7453, + "step": 15315 + }, + { + "epoch": 4.701043585021486, + "grad_norm": 0.19331564009189606, + "learning_rate": 5.725123778570299e-05, + "loss": 1.7142, + "step": 15316 + 
}, + { + "epoch": 4.701350521792511, + "grad_norm": 0.24291567504405975, + "learning_rate": 5.7246319738646706e-05, + "loss": 1.8081, + "step": 15317 + }, + { + "epoch": 4.701657458563536, + "grad_norm": 0.21423695981502533, + "learning_rate": 5.724140161997804e-05, + "loss": 1.7021, + "step": 15318 + }, + { + "epoch": 4.701964395334561, + "grad_norm": 0.20857618749141693, + "learning_rate": 5.72364834297456e-05, + "loss": 1.7447, + "step": 15319 + }, + { + "epoch": 4.702271332105586, + "grad_norm": 0.2547401487827301, + "learning_rate": 5.7231565167998e-05, + "loss": 1.7505, + "step": 15320 + }, + { + "epoch": 4.702578268876612, + "grad_norm": 0.2729472219944, + "learning_rate": 5.7226646834783825e-05, + "loss": 1.7974, + "step": 15321 + }, + { + "epoch": 4.702885205647637, + "grad_norm": 0.23258371651172638, + "learning_rate": 5.722172843015169e-05, + "loss": 1.7562, + "step": 15322 + }, + { + "epoch": 4.703192142418661, + "grad_norm": 0.23399893939495087, + "learning_rate": 5.72168099541502e-05, + "loss": 1.7674, + "step": 15323 + }, + { + "epoch": 4.703499079189687, + "grad_norm": 0.2678206264972687, + "learning_rate": 5.721189140682797e-05, + "loss": 1.7331, + "step": 15324 + }, + { + "epoch": 4.703806015960712, + "grad_norm": 0.19472146034240723, + "learning_rate": 5.7206972788233593e-05, + "loss": 1.7003, + "step": 15325 + }, + { + "epoch": 4.704112952731737, + "grad_norm": 0.2199394404888153, + "learning_rate": 5.72020540984157e-05, + "loss": 1.7072, + "step": 15326 + }, + { + "epoch": 4.704419889502763, + "grad_norm": 0.219175323843956, + "learning_rate": 5.719713533742287e-05, + "loss": 1.7591, + "step": 15327 + }, + { + "epoch": 4.704726826273788, + "grad_norm": 0.21127547323703766, + "learning_rate": 5.719221650530374e-05, + "loss": 1.8059, + "step": 15328 + }, + { + "epoch": 4.7050337630448125, + "grad_norm": 0.22189834713935852, + "learning_rate": 5.7187297602106905e-05, + "loss": 1.7529, + "step": 15329 + }, + { + "epoch": 4.705340699815838, + 
"grad_norm": 0.19945195317268372, + "learning_rate": 5.7182378627881e-05, + "loss": 1.7133, + "step": 15330 + }, + { + "epoch": 4.705647636586863, + "grad_norm": 0.2177499681711197, + "learning_rate": 5.7177459582674595e-05, + "loss": 1.7451, + "step": 15331 + }, + { + "epoch": 4.7059545733578885, + "grad_norm": 0.19489440321922302, + "learning_rate": 5.717254046653635e-05, + "loss": 1.7499, + "step": 15332 + }, + { + "epoch": 4.706261510128914, + "grad_norm": 0.21366968750953674, + "learning_rate": 5.716762127951485e-05, + "loss": 1.7683, + "step": 15333 + }, + { + "epoch": 4.706568446899938, + "grad_norm": 0.2894177734851837, + "learning_rate": 5.71627020216587e-05, + "loss": 1.8235, + "step": 15334 + }, + { + "epoch": 4.706875383670964, + "grad_norm": 0.22175677120685577, + "learning_rate": 5.7157782693016534e-05, + "loss": 1.7421, + "step": 15335 + }, + { + "epoch": 4.707182320441989, + "grad_norm": 0.23653541505336761, + "learning_rate": 5.715286329363698e-05, + "loss": 1.6937, + "step": 15336 + }, + { + "epoch": 4.707489257213014, + "grad_norm": 0.3015746772289276, + "learning_rate": 5.714794382356863e-05, + "loss": 1.7159, + "step": 15337 + }, + { + "epoch": 4.70779619398404, + "grad_norm": 0.24045881628990173, + "learning_rate": 5.714302428286011e-05, + "loss": 1.7263, + "step": 15338 + }, + { + "epoch": 4.708103130755065, + "grad_norm": 0.19836920499801636, + "learning_rate": 5.7138104671560035e-05, + "loss": 1.7604, + "step": 15339 + }, + { + "epoch": 4.708410067526089, + "grad_norm": 0.2430238276720047, + "learning_rate": 5.7133184989717036e-05, + "loss": 1.7147, + "step": 15340 + }, + { + "epoch": 4.708717004297115, + "grad_norm": 0.19388417899608612, + "learning_rate": 5.712826523737971e-05, + "loss": 1.7153, + "step": 15341 + }, + { + "epoch": 4.70902394106814, + "grad_norm": 0.19648151099681854, + "learning_rate": 5.7123345414596694e-05, + "loss": 1.7373, + "step": 15342 + }, + { + "epoch": 4.709330877839165, + "grad_norm": 0.20326325297355652, + 
"learning_rate": 5.711842552141661e-05, + "loss": 1.7012, + "step": 15343 + }, + { + "epoch": 4.70963781461019, + "grad_norm": 0.20798304677009583, + "learning_rate": 5.711350555788806e-05, + "loss": 1.7134, + "step": 15344 + }, + { + "epoch": 4.709944751381215, + "grad_norm": 0.29318806529045105, + "learning_rate": 5.7108585524059674e-05, + "loss": 1.7661, + "step": 15345 + }, + { + "epoch": 4.7102516881522405, + "grad_norm": 0.273318350315094, + "learning_rate": 5.710366541998009e-05, + "loss": 1.7329, + "step": 15346 + }, + { + "epoch": 4.710558624923266, + "grad_norm": 0.2306031584739685, + "learning_rate": 5.7098745245697925e-05, + "loss": 1.8152, + "step": 15347 + }, + { + "epoch": 4.710865561694291, + "grad_norm": 0.27630630135536194, + "learning_rate": 5.709382500126179e-05, + "loss": 1.7955, + "step": 15348 + }, + { + "epoch": 4.7111724984653165, + "grad_norm": 0.2366025298833847, + "learning_rate": 5.7088904686720326e-05, + "loss": 1.7943, + "step": 15349 + }, + { + "epoch": 4.711479435236341, + "grad_norm": 0.24196656048297882, + "learning_rate": 5.708398430212215e-05, + "loss": 1.698, + "step": 15350 + }, + { + "epoch": 4.711786372007366, + "grad_norm": 0.2770058512687683, + "learning_rate": 5.707906384751588e-05, + "loss": 1.7618, + "step": 15351 + }, + { + "epoch": 4.712093308778392, + "grad_norm": 0.20432323217391968, + "learning_rate": 5.7074143322950157e-05, + "loss": 1.7422, + "step": 15352 + }, + { + "epoch": 4.712400245549417, + "grad_norm": 0.25543150305747986, + "learning_rate": 5.70692227284736e-05, + "loss": 1.7744, + "step": 15353 + }, + { + "epoch": 4.712707182320442, + "grad_norm": 0.24315913021564484, + "learning_rate": 5.7064302064134855e-05, + "loss": 1.7127, + "step": 15354 + }, + { + "epoch": 4.713014119091467, + "grad_norm": 0.23636099696159363, + "learning_rate": 5.705938132998252e-05, + "loss": 1.7725, + "step": 15355 + }, + { + "epoch": 4.713321055862492, + "grad_norm": 0.26809820532798767, + "learning_rate": 
5.705446052606526e-05, + "loss": 1.8338, + "step": 15356 + }, + { + "epoch": 4.713627992633517, + "grad_norm": 0.24969002604484558, + "learning_rate": 5.704953965243167e-05, + "loss": 1.8225, + "step": 15357 + }, + { + "epoch": 4.713934929404543, + "grad_norm": 0.23189692199230194, + "learning_rate": 5.70446187091304e-05, + "loss": 1.7901, + "step": 15358 + }, + { + "epoch": 4.714241866175568, + "grad_norm": 0.22373750805854797, + "learning_rate": 5.703969769621008e-05, + "loss": 1.6919, + "step": 15359 + }, + { + "epoch": 4.714548802946593, + "grad_norm": 0.23963531851768494, + "learning_rate": 5.703477661371934e-05, + "loss": 1.7806, + "step": 15360 + }, + { + "epoch": 4.714855739717618, + "grad_norm": 0.20365150272846222, + "learning_rate": 5.702985546170683e-05, + "loss": 1.7207, + "step": 15361 + }, + { + "epoch": 4.715162676488643, + "grad_norm": 0.245658278465271, + "learning_rate": 5.702493424022114e-05, + "loss": 1.7589, + "step": 15362 + }, + { + "epoch": 4.7154696132596685, + "grad_norm": 0.22633756697177887, + "learning_rate": 5.702001294931094e-05, + "loss": 1.7893, + "step": 15363 + }, + { + "epoch": 4.715776550030694, + "grad_norm": 0.21587726473808289, + "learning_rate": 5.701509158902487e-05, + "loss": 1.8095, + "step": 15364 + }, + { + "epoch": 4.716083486801719, + "grad_norm": 0.22553963959217072, + "learning_rate": 5.701017015941155e-05, + "loss": 1.7419, + "step": 15365 + }, + { + "epoch": 4.716390423572744, + "grad_norm": 0.2276087999343872, + "learning_rate": 5.700524866051962e-05, + "loss": 1.7052, + "step": 15366 + }, + { + "epoch": 4.716697360343769, + "grad_norm": 0.22236761450767517, + "learning_rate": 5.700032709239771e-05, + "loss": 1.8612, + "step": 15367 + }, + { + "epoch": 4.717004297114794, + "grad_norm": 0.22816185653209686, + "learning_rate": 5.6995405455094465e-05, + "loss": 1.78, + "step": 15368 + }, + { + "epoch": 4.71731123388582, + "grad_norm": 0.21597479283809662, + "learning_rate": 5.6990483748658516e-05, + "loss": 1.8276, 
+ "step": 15369 + }, + { + "epoch": 4.717618170656845, + "grad_norm": 0.22209586203098297, + "learning_rate": 5.6985561973138533e-05, + "loss": 1.74, + "step": 15370 + }, + { + "epoch": 4.71792510742787, + "grad_norm": 0.24249997735023499, + "learning_rate": 5.6980640128583116e-05, + "loss": 1.8035, + "step": 15371 + }, + { + "epoch": 4.718232044198895, + "grad_norm": 0.23326106369495392, + "learning_rate": 5.6975718215040943e-05, + "loss": 1.7969, + "step": 15372 + }, + { + "epoch": 4.71853898096992, + "grad_norm": 0.215044766664505, + "learning_rate": 5.6970796232560596e-05, + "loss": 1.7345, + "step": 15373 + }, + { + "epoch": 4.718845917740945, + "grad_norm": 0.20231883227825165, + "learning_rate": 5.696587418119078e-05, + "loss": 1.7231, + "step": 15374 + }, + { + "epoch": 4.719152854511971, + "grad_norm": 0.2136038839817047, + "learning_rate": 5.696095206098011e-05, + "loss": 1.7421, + "step": 15375 + }, + { + "epoch": 4.719459791282996, + "grad_norm": 0.2662335932254791, + "learning_rate": 5.6956029871977235e-05, + "loss": 1.7518, + "step": 15376 + }, + { + "epoch": 4.7197667280540205, + "grad_norm": 0.25649648904800415, + "learning_rate": 5.6951107614230783e-05, + "loss": 1.8314, + "step": 15377 + }, + { + "epoch": 4.720073664825046, + "grad_norm": 0.21995560824871063, + "learning_rate": 5.6946185287789425e-05, + "loss": 1.7511, + "step": 15378 + }, + { + "epoch": 4.720380601596071, + "grad_norm": 0.3388935923576355, + "learning_rate": 5.694126289270177e-05, + "loss": 1.7975, + "step": 15379 + }, + { + "epoch": 4.7206875383670965, + "grad_norm": 0.32886409759521484, + "learning_rate": 5.693634042901651e-05, + "loss": 1.7153, + "step": 15380 + }, + { + "epoch": 4.720994475138122, + "grad_norm": 0.21727977693080902, + "learning_rate": 5.693141789678226e-05, + "loss": 1.7095, + "step": 15381 + }, + { + "epoch": 4.721301411909147, + "grad_norm": 0.2680833041667938, + "learning_rate": 5.6926495296047675e-05, + "loss": 1.696, + "step": 15382 + }, + { + "epoch": 
4.721608348680172, + "grad_norm": 0.2645499110221863, + "learning_rate": 5.692157262686141e-05, + "loss": 1.6889, + "step": 15383 + }, + { + "epoch": 4.721915285451197, + "grad_norm": 0.20362348854541779, + "learning_rate": 5.69166498892721e-05, + "loss": 1.7303, + "step": 15384 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.24259062111377716, + "learning_rate": 5.691172708332839e-05, + "loss": 1.7684, + "step": 15385 + }, + { + "epoch": 4.722529158993248, + "grad_norm": 0.24204276502132416, + "learning_rate": 5.690680420907897e-05, + "loss": 1.7728, + "step": 15386 + }, + { + "epoch": 4.722836095764272, + "grad_norm": 0.3038320243358612, + "learning_rate": 5.690188126657244e-05, + "loss": 1.7573, + "step": 15387 + }, + { + "epoch": 4.723143032535297, + "grad_norm": 0.24619868397712708, + "learning_rate": 5.689695825585749e-05, + "loss": 1.754, + "step": 15388 + }, + { + "epoch": 4.723449969306323, + "grad_norm": 0.19441325962543488, + "learning_rate": 5.689203517698276e-05, + "loss": 1.726, + "step": 15389 + }, + { + "epoch": 4.723756906077348, + "grad_norm": 0.2874276340007782, + "learning_rate": 5.688711202999688e-05, + "loss": 1.7704, + "step": 15390 + }, + { + "epoch": 4.724063842848373, + "grad_norm": 0.24488390982151031, + "learning_rate": 5.6882188814948535e-05, + "loss": 1.7477, + "step": 15391 + }, + { + "epoch": 4.724370779619399, + "grad_norm": 0.22674018144607544, + "learning_rate": 5.687726553188636e-05, + "loss": 1.7287, + "step": 15392 + }, + { + "epoch": 4.724677716390423, + "grad_norm": 0.2653258442878723, + "learning_rate": 5.687234218085902e-05, + "loss": 1.7415, + "step": 15393 + }, + { + "epoch": 4.7249846531614486, + "grad_norm": 0.20345374941825867, + "learning_rate": 5.686741876191516e-05, + "loss": 1.764, + "step": 15394 + }, + { + "epoch": 4.725291589932474, + "grad_norm": 0.23193977773189545, + "learning_rate": 5.686249527510345e-05, + "loss": 1.7557, + "step": 15395 + }, + { + "epoch": 4.725598526703499, + "grad_norm": 
0.26426708698272705, + "learning_rate": 5.685757172047253e-05, + "loss": 1.7708, + "step": 15396 + }, + { + "epoch": 4.725905463474525, + "grad_norm": 0.21377156674861908, + "learning_rate": 5.685264809807107e-05, + "loss": 1.6921, + "step": 15397 + }, + { + "epoch": 4.726212400245549, + "grad_norm": 0.21628457307815552, + "learning_rate": 5.684772440794773e-05, + "loss": 1.72, + "step": 15398 + }, + { + "epoch": 4.726519337016574, + "grad_norm": 0.19200581312179565, + "learning_rate": 5.684280065015116e-05, + "loss": 1.7311, + "step": 15399 + }, + { + "epoch": 4.7268262737876, + "grad_norm": 0.22227540612220764, + "learning_rate": 5.683787682473003e-05, + "loss": 1.7451, + "step": 15400 + }, + { + "epoch": 4.727133210558625, + "grad_norm": 0.18053604662418365, + "learning_rate": 5.683295293173299e-05, + "loss": 1.6816, + "step": 15401 + }, + { + "epoch": 4.72744014732965, + "grad_norm": 0.19827169179916382, + "learning_rate": 5.682802897120869e-05, + "loss": 1.7315, + "step": 15402 + }, + { + "epoch": 4.727747084100676, + "grad_norm": 0.2768021821975708, + "learning_rate": 5.682310494320582e-05, + "loss": 1.7714, + "step": 15403 + }, + { + "epoch": 4.7280540208717, + "grad_norm": 0.2613474428653717, + "learning_rate": 5.6818180847773027e-05, + "loss": 1.7332, + "step": 15404 + }, + { + "epoch": 4.7283609576427255, + "grad_norm": 0.21546787023544312, + "learning_rate": 5.681325668495898e-05, + "loss": 1.771, + "step": 15405 + }, + { + "epoch": 4.728667894413751, + "grad_norm": 0.24442137777805328, + "learning_rate": 5.680833245481234e-05, + "loss": 1.7296, + "step": 15406 + }, + { + "epoch": 4.728974831184776, + "grad_norm": 0.2622109055519104, + "learning_rate": 5.680340815738175e-05, + "loss": 1.7778, + "step": 15407 + }, + { + "epoch": 4.7292817679558015, + "grad_norm": 0.22379513084888458, + "learning_rate": 5.6798483792715904e-05, + "loss": 1.7953, + "step": 15408 + }, + { + "epoch": 4.729588704726826, + "grad_norm": 0.21901065111160278, + "learning_rate": 
5.679355936086346e-05, + "loss": 1.7287, + "step": 15409 + }, + { + "epoch": 4.729895641497851, + "grad_norm": 0.3023792505264282, + "learning_rate": 5.6788634861873066e-05, + "loss": 1.7851, + "step": 15410 + }, + { + "epoch": 4.730202578268877, + "grad_norm": 0.23882482945919037, + "learning_rate": 5.678371029579342e-05, + "loss": 1.7621, + "step": 15411 + }, + { + "epoch": 4.730509515039902, + "grad_norm": 0.2661043703556061, + "learning_rate": 5.6778785662673175e-05, + "loss": 1.7453, + "step": 15412 + }, + { + "epoch": 4.730816451810927, + "grad_norm": 0.330208957195282, + "learning_rate": 5.677386096256099e-05, + "loss": 1.761, + "step": 15413 + }, + { + "epoch": 4.731123388581953, + "grad_norm": 0.2686570882797241, + "learning_rate": 5.676893619550552e-05, + "loss": 1.7539, + "step": 15414 + }, + { + "epoch": 4.731430325352977, + "grad_norm": 0.24308046698570251, + "learning_rate": 5.676401136155548e-05, + "loss": 1.7345, + "step": 15415 + }, + { + "epoch": 4.731737262124002, + "grad_norm": 0.4137137830257416, + "learning_rate": 5.67590864607595e-05, + "loss": 1.7688, + "step": 15416 + }, + { + "epoch": 4.732044198895028, + "grad_norm": 0.32161539793014526, + "learning_rate": 5.675416149316628e-05, + "loss": 1.7881, + "step": 15417 + }, + { + "epoch": 4.732351135666053, + "grad_norm": 0.2336999475955963, + "learning_rate": 5.674923645882447e-05, + "loss": 1.755, + "step": 15418 + }, + { + "epoch": 4.7326580724370775, + "grad_norm": 0.32781684398651123, + "learning_rate": 5.6744311357782754e-05, + "loss": 1.8062, + "step": 15419 + }, + { + "epoch": 4.732965009208103, + "grad_norm": 0.2475704401731491, + "learning_rate": 5.6739386190089795e-05, + "loss": 1.725, + "step": 15420 + }, + { + "epoch": 4.733271945979128, + "grad_norm": 0.26295650005340576, + "learning_rate": 5.673446095579427e-05, + "loss": 1.7673, + "step": 15421 + }, + { + "epoch": 4.7335788827501535, + "grad_norm": 0.3454873859882355, + "learning_rate": 5.6729535654944864e-05, + "loss": 1.7523, + 
"step": 15422 + }, + { + "epoch": 4.733885819521179, + "grad_norm": 0.2306666374206543, + "learning_rate": 5.672461028759024e-05, + "loss": 1.7085, + "step": 15423 + }, + { + "epoch": 4.734192756292204, + "grad_norm": 0.30825871229171753, + "learning_rate": 5.671968485377908e-05, + "loss": 1.7642, + "step": 15424 + }, + { + "epoch": 4.734499693063229, + "grad_norm": 0.42611342668533325, + "learning_rate": 5.6714759353560045e-05, + "loss": 1.7832, + "step": 15425 + }, + { + "epoch": 4.734806629834254, + "grad_norm": 0.29502514004707336, + "learning_rate": 5.670983378698182e-05, + "loss": 1.8153, + "step": 15426 + }, + { + "epoch": 4.735113566605279, + "grad_norm": 0.28416305780410767, + "learning_rate": 5.6704908154093096e-05, + "loss": 1.756, + "step": 15427 + }, + { + "epoch": 4.735420503376305, + "grad_norm": 0.43111103773117065, + "learning_rate": 5.6699982454942534e-05, + "loss": 1.7797, + "step": 15428 + }, + { + "epoch": 4.73572744014733, + "grad_norm": 0.27667397260665894, + "learning_rate": 5.669505668957882e-05, + "loss": 1.7316, + "step": 15429 + }, + { + "epoch": 4.736034376918354, + "grad_norm": 0.3045295774936676, + "learning_rate": 5.669013085805063e-05, + "loss": 1.7591, + "step": 15430 + }, + { + "epoch": 4.73634131368938, + "grad_norm": 0.4494635760784149, + "learning_rate": 5.6685204960406635e-05, + "loss": 1.8295, + "step": 15431 + }, + { + "epoch": 4.736648250460405, + "grad_norm": 0.2951449453830719, + "learning_rate": 5.6680278996695544e-05, + "loss": 1.7857, + "step": 15432 + }, + { + "epoch": 4.73695518723143, + "grad_norm": 0.2714167535305023, + "learning_rate": 5.6675352966966014e-05, + "loss": 1.816, + "step": 15433 + }, + { + "epoch": 4.737262124002456, + "grad_norm": 0.32701000571250916, + "learning_rate": 5.667042687126673e-05, + "loss": 1.7637, + "step": 15434 + }, + { + "epoch": 4.737569060773481, + "grad_norm": 0.2466556429862976, + "learning_rate": 5.666550070964638e-05, + "loss": 1.7805, + "step": 15435 + }, + { + "epoch": 
4.7378759975445055, + "grad_norm": 0.3283855617046356, + "learning_rate": 5.666057448215365e-05, + "loss": 1.786, + "step": 15436 + }, + { + "epoch": 4.738182934315531, + "grad_norm": 0.35860660672187805, + "learning_rate": 5.6655648188837205e-05, + "loss": 1.8309, + "step": 15437 + }, + { + "epoch": 4.738489871086556, + "grad_norm": 0.22293898463249207, + "learning_rate": 5.665072182974576e-05, + "loss": 1.7317, + "step": 15438 + }, + { + "epoch": 4.7387968078575815, + "grad_norm": 0.3155089020729065, + "learning_rate": 5.664579540492798e-05, + "loss": 1.7202, + "step": 15439 + }, + { + "epoch": 4.739103744628607, + "grad_norm": 0.28723904490470886, + "learning_rate": 5.6640868914432566e-05, + "loss": 1.7788, + "step": 15440 + }, + { + "epoch": 4.739410681399631, + "grad_norm": 0.2461984008550644, + "learning_rate": 5.6635942358308183e-05, + "loss": 1.8504, + "step": 15441 + }, + { + "epoch": 4.739717618170657, + "grad_norm": 0.2503122091293335, + "learning_rate": 5.663101573660351e-05, + "loss": 1.7375, + "step": 15442 + }, + { + "epoch": 4.740024554941682, + "grad_norm": 0.24925372004508972, + "learning_rate": 5.662608904936727e-05, + "loss": 1.7152, + "step": 15443 + }, + { + "epoch": 4.740331491712707, + "grad_norm": 0.2734573483467102, + "learning_rate": 5.662116229664813e-05, + "loss": 1.7476, + "step": 15444 + }, + { + "epoch": 4.740638428483733, + "grad_norm": 0.38122060894966125, + "learning_rate": 5.661623547849479e-05, + "loss": 1.7682, + "step": 15445 + }, + { + "epoch": 4.740945365254758, + "grad_norm": 0.3786417245864868, + "learning_rate": 5.661130859495593e-05, + "loss": 1.7446, + "step": 15446 + }, + { + "epoch": 4.741252302025782, + "grad_norm": 0.22618255019187927, + "learning_rate": 5.6606381646080244e-05, + "loss": 1.7427, + "step": 15447 + }, + { + "epoch": 4.741559238796808, + "grad_norm": 0.3000899851322174, + "learning_rate": 5.6601454631916405e-05, + "loss": 1.7087, + "step": 15448 + }, + { + "epoch": 4.741866175567833, + "grad_norm": 
0.36542513966560364, + "learning_rate": 5.659652755251315e-05, + "loss": 1.7985, + "step": 15449 + }, + { + "epoch": 4.742173112338858, + "grad_norm": 0.23550496995449066, + "learning_rate": 5.659160040791912e-05, + "loss": 1.8163, + "step": 15450 + }, + { + "epoch": 4.742480049109884, + "grad_norm": 0.25615251064300537, + "learning_rate": 5.658667319818305e-05, + "loss": 1.7372, + "step": 15451 + }, + { + "epoch": 4.742786985880908, + "grad_norm": 0.28744083642959595, + "learning_rate": 5.6581745923353615e-05, + "loss": 1.7193, + "step": 15452 + }, + { + "epoch": 4.7430939226519335, + "grad_norm": 0.2500229775905609, + "learning_rate": 5.65768185834795e-05, + "loss": 1.7263, + "step": 15453 + }, + { + "epoch": 4.743400859422959, + "grad_norm": 0.21520425379276276, + "learning_rate": 5.6571891178609394e-05, + "loss": 1.7337, + "step": 15454 + }, + { + "epoch": 4.743707796193984, + "grad_norm": 0.212506502866745, + "learning_rate": 5.656696370879202e-05, + "loss": 1.7672, + "step": 15455 + }, + { + "epoch": 4.7440147329650095, + "grad_norm": 0.21143417060375214, + "learning_rate": 5.656203617407607e-05, + "loss": 1.7189, + "step": 15456 + }, + { + "epoch": 4.744321669736035, + "grad_norm": 0.18320922553539276, + "learning_rate": 5.6557108574510243e-05, + "loss": 1.7521, + "step": 15457 + }, + { + "epoch": 4.744628606507059, + "grad_norm": 0.19202999770641327, + "learning_rate": 5.655218091014321e-05, + "loss": 1.6756, + "step": 15458 + }, + { + "epoch": 4.744935543278085, + "grad_norm": 0.2152331918478012, + "learning_rate": 5.654725318102367e-05, + "loss": 1.7653, + "step": 15459 + }, + { + "epoch": 4.74524248004911, + "grad_norm": 0.24565903842449188, + "learning_rate": 5.6542325387200354e-05, + "loss": 1.7654, + "step": 15460 + }, + { + "epoch": 4.745549416820135, + "grad_norm": 0.2504819333553314, + "learning_rate": 5.653739752872195e-05, + "loss": 1.7073, + "step": 15461 + }, + { + "epoch": 4.74585635359116, + "grad_norm": 0.19258706271648407, + 
"learning_rate": 5.653246960563714e-05, + "loss": 1.7106, + "step": 15462 + }, + { + "epoch": 4.746163290362185, + "grad_norm": 0.22961968183517456, + "learning_rate": 5.652754161799465e-05, + "loss": 1.7868, + "step": 15463 + }, + { + "epoch": 4.74647022713321, + "grad_norm": 0.2763231098651886, + "learning_rate": 5.652261356584315e-05, + "loss": 1.7714, + "step": 15464 + }, + { + "epoch": 4.746777163904236, + "grad_norm": 0.23866096138954163, + "learning_rate": 5.651768544923136e-05, + "loss": 1.7537, + "step": 15465 + }, + { + "epoch": 4.747084100675261, + "grad_norm": 0.21851976215839386, + "learning_rate": 5.6512757268207997e-05, + "loss": 1.8109, + "step": 15466 + }, + { + "epoch": 4.747391037446286, + "grad_norm": 0.22249393165111542, + "learning_rate": 5.6507829022821745e-05, + "loss": 1.7357, + "step": 15467 + }, + { + "epoch": 4.747697974217311, + "grad_norm": 0.20202289521694183, + "learning_rate": 5.650290071312131e-05, + "loss": 1.7867, + "step": 15468 + }, + { + "epoch": 4.748004910988336, + "grad_norm": 0.20618727803230286, + "learning_rate": 5.649797233915539e-05, + "loss": 1.6904, + "step": 15469 + }, + { + "epoch": 4.7483118477593615, + "grad_norm": 0.25609052181243896, + "learning_rate": 5.649304390097272e-05, + "loss": 1.7287, + "step": 15470 + }, + { + "epoch": 4.748618784530387, + "grad_norm": 0.22966544330120087, + "learning_rate": 5.648811539862195e-05, + "loss": 1.7384, + "step": 15471 + }, + { + "epoch": 4.748925721301412, + "grad_norm": 0.24070143699645996, + "learning_rate": 5.6483186832151856e-05, + "loss": 1.7625, + "step": 15472 + }, + { + "epoch": 4.749232658072437, + "grad_norm": 0.22642426192760468, + "learning_rate": 5.647825820161109e-05, + "loss": 1.7291, + "step": 15473 + }, + { + "epoch": 4.749539594843462, + "grad_norm": 0.23255646228790283, + "learning_rate": 5.64733295070484e-05, + "loss": 1.8076, + "step": 15474 + }, + { + "epoch": 4.749846531614487, + "grad_norm": 0.20902042090892792, + "learning_rate": 
5.646840074851246e-05, + "loss": 1.6627, + "step": 15475 + }, + { + "epoch": 4.750153468385513, + "grad_norm": 0.21608836948871613, + "learning_rate": 5.646347192605198e-05, + "loss": 1.7458, + "step": 15476 + }, + { + "epoch": 4.750460405156538, + "grad_norm": 0.22368495166301727, + "learning_rate": 5.6458543039715694e-05, + "loss": 1.7601, + "step": 15477 + }, + { + "epoch": 4.750767341927563, + "grad_norm": 0.30586308240890503, + "learning_rate": 5.645361408955231e-05, + "loss": 1.8389, + "step": 15478 + }, + { + "epoch": 4.751074278698588, + "grad_norm": 0.25122150778770447, + "learning_rate": 5.644868507561052e-05, + "loss": 1.7509, + "step": 15479 + }, + { + "epoch": 4.751381215469613, + "grad_norm": 0.28435763716697693, + "learning_rate": 5.644375599793904e-05, + "loss": 1.7723, + "step": 15480 + }, + { + "epoch": 4.7516881522406385, + "grad_norm": 0.3111409842967987, + "learning_rate": 5.643882685658659e-05, + "loss": 1.7973, + "step": 15481 + }, + { + "epoch": 4.751995089011664, + "grad_norm": 0.3108380138874054, + "learning_rate": 5.6433897651601874e-05, + "loss": 1.8126, + "step": 15482 + }, + { + "epoch": 4.752302025782689, + "grad_norm": 0.25894731283187866, + "learning_rate": 5.642896838303362e-05, + "loss": 1.7849, + "step": 15483 + }, + { + "epoch": 4.752608962553714, + "grad_norm": 0.39321839809417725, + "learning_rate": 5.642403905093052e-05, + "loss": 1.7583, + "step": 15484 + }, + { + "epoch": 4.752915899324739, + "grad_norm": 0.3206121027469635, + "learning_rate": 5.6419109655341315e-05, + "loss": 1.8061, + "step": 15485 + }, + { + "epoch": 4.753222836095764, + "grad_norm": 0.2817624807357788, + "learning_rate": 5.64141801963147e-05, + "loss": 1.8252, + "step": 15486 + }, + { + "epoch": 4.75352977286679, + "grad_norm": 0.3344736397266388, + "learning_rate": 5.6409250673899405e-05, + "loss": 1.6975, + "step": 15487 + }, + { + "epoch": 4.753836709637815, + "grad_norm": 0.21873882412910461, + "learning_rate": 5.640432108814413e-05, + "loss": 
1.7126, + "step": 15488 + }, + { + "epoch": 4.75414364640884, + "grad_norm": 0.3317199945449829, + "learning_rate": 5.639939143909758e-05, + "loss": 1.7826, + "step": 15489 + }, + { + "epoch": 4.754450583179865, + "grad_norm": 0.34901630878448486, + "learning_rate": 5.639446172680854e-05, + "loss": 1.7411, + "step": 15490 + }, + { + "epoch": 4.75475751995089, + "grad_norm": 0.24015867710113525, + "learning_rate": 5.6389531951325645e-05, + "loss": 1.7514, + "step": 15491 + }, + { + "epoch": 4.755064456721915, + "grad_norm": 0.28364554047584534, + "learning_rate": 5.6384602112697674e-05, + "loss": 1.7569, + "step": 15492 + }, + { + "epoch": 4.755371393492941, + "grad_norm": 0.3561246693134308, + "learning_rate": 5.637967221097329e-05, + "loss": 1.7212, + "step": 15493 + }, + { + "epoch": 4.755678330263965, + "grad_norm": 0.3383684456348419, + "learning_rate": 5.637474224620126e-05, + "loss": 1.6866, + "step": 15494 + }, + { + "epoch": 4.7559852670349905, + "grad_norm": 0.2399235963821411, + "learning_rate": 5.63698122184303e-05, + "loss": 1.7609, + "step": 15495 + }, + { + "epoch": 4.756292203806016, + "grad_norm": 0.38559645414352417, + "learning_rate": 5.636488212770912e-05, + "loss": 1.7509, + "step": 15496 + }, + { + "epoch": 4.756599140577041, + "grad_norm": 0.365005224943161, + "learning_rate": 5.635995197408645e-05, + "loss": 1.7894, + "step": 15497 + }, + { + "epoch": 4.7569060773480665, + "grad_norm": 0.21254757046699524, + "learning_rate": 5.635502175761099e-05, + "loss": 1.6969, + "step": 15498 + }, + { + "epoch": 4.757213014119092, + "grad_norm": 0.42865821719169617, + "learning_rate": 5.635009147833149e-05, + "loss": 1.7989, + "step": 15499 + }, + { + "epoch": 4.757519950890116, + "grad_norm": 0.35717228055000305, + "learning_rate": 5.634516113629665e-05, + "loss": 1.7338, + "step": 15500 + }, + { + "epoch": 4.757826887661142, + "grad_norm": 0.21582463383674622, + "learning_rate": 5.634023073155523e-05, + "loss": 1.7429, + "step": 15501 + }, + { + 
"epoch": 4.758133824432167, + "grad_norm": 0.3376842141151428, + "learning_rate": 5.633530026415592e-05, + "loss": 1.7703, + "step": 15502 + }, + { + "epoch": 4.758440761203192, + "grad_norm": 0.2760981023311615, + "learning_rate": 5.633036973414747e-05, + "loss": 1.7389, + "step": 15503 + }, + { + "epoch": 4.758747697974218, + "grad_norm": 0.3808997571468353, + "learning_rate": 5.63254391415786e-05, + "loss": 1.7513, + "step": 15504 + }, + { + "epoch": 4.759054634745242, + "grad_norm": 0.5152496695518494, + "learning_rate": 5.6320508486498014e-05, + "loss": 1.7376, + "step": 15505 + }, + { + "epoch": 4.759361571516267, + "grad_norm": 0.33983346819877625, + "learning_rate": 5.6315577768954464e-05, + "loss": 1.7209, + "step": 15506 + }, + { + "epoch": 4.759668508287293, + "grad_norm": 0.27064043283462524, + "learning_rate": 5.631064698899669e-05, + "loss": 1.7808, + "step": 15507 + }, + { + "epoch": 4.759975445058318, + "grad_norm": 0.3659237027168274, + "learning_rate": 5.630571614667339e-05, + "loss": 1.7706, + "step": 15508 + }, + { + "epoch": 4.760282381829343, + "grad_norm": 0.246379554271698, + "learning_rate": 5.63007852420333e-05, + "loss": 1.7425, + "step": 15509 + }, + { + "epoch": 4.760589318600369, + "grad_norm": 0.2683795392513275, + "learning_rate": 5.629585427512518e-05, + "loss": 1.7332, + "step": 15510 + }, + { + "epoch": 4.760896255371393, + "grad_norm": 0.32626205682754517, + "learning_rate": 5.6290923245997704e-05, + "loss": 1.786, + "step": 15511 + }, + { + "epoch": 4.7612031921424185, + "grad_norm": 0.23723098635673523, + "learning_rate": 5.6285992154699666e-05, + "loss": 1.7305, + "step": 15512 + }, + { + "epoch": 4.761510128913444, + "grad_norm": 0.26316091418266296, + "learning_rate": 5.628106100127976e-05, + "loss": 1.7804, + "step": 15513 + }, + { + "epoch": 4.761817065684469, + "grad_norm": 0.24376356601715088, + "learning_rate": 5.6276129785786726e-05, + "loss": 1.738, + "step": 15514 + }, + { + "epoch": 4.7621240024554945, + 
"grad_norm": 0.27778422832489014, + "learning_rate": 5.627119850826931e-05, + "loss": 1.7444, + "step": 15515 + }, + { + "epoch": 4.762430939226519, + "grad_norm": 0.3134306073188782, + "learning_rate": 5.6266267168776224e-05, + "loss": 1.7696, + "step": 15516 + }, + { + "epoch": 4.762737875997544, + "grad_norm": 0.2354283481836319, + "learning_rate": 5.6261335767356195e-05, + "loss": 1.799, + "step": 15517 + }, + { + "epoch": 4.76304481276857, + "grad_norm": 0.26902756094932556, + "learning_rate": 5.6256404304058e-05, + "loss": 1.7091, + "step": 15518 + }, + { + "epoch": 4.763351749539595, + "grad_norm": 0.2760716676712036, + "learning_rate": 5.6251472778930345e-05, + "loss": 1.742, + "step": 15519 + }, + { + "epoch": 4.76365868631062, + "grad_norm": 0.2138829231262207, + "learning_rate": 5.624654119202197e-05, + "loss": 1.7093, + "step": 15520 + }, + { + "epoch": 4.763965623081646, + "grad_norm": 0.31404614448547363, + "learning_rate": 5.624160954338162e-05, + "loss": 1.7467, + "step": 15521 + }, + { + "epoch": 4.76427255985267, + "grad_norm": 0.24810083210468292, + "learning_rate": 5.623667783305803e-05, + "loss": 1.745, + "step": 15522 + }, + { + "epoch": 4.764579496623695, + "grad_norm": 0.23674242198467255, + "learning_rate": 5.6231746061099913e-05, + "loss": 1.7662, + "step": 15523 + }, + { + "epoch": 4.764886433394721, + "grad_norm": 0.264230877161026, + "learning_rate": 5.622681422755606e-05, + "loss": 1.7627, + "step": 15524 + }, + { + "epoch": 4.765193370165746, + "grad_norm": 0.2982041537761688, + "learning_rate": 5.6221882332475165e-05, + "loss": 1.7558, + "step": 15525 + }, + { + "epoch": 4.765500306936771, + "grad_norm": 0.29215967655181885, + "learning_rate": 5.6216950375905975e-05, + "loss": 1.7981, + "step": 15526 + }, + { + "epoch": 4.765807243707796, + "grad_norm": 0.20014487206935883, + "learning_rate": 5.6212018357897244e-05, + "loss": 1.7113, + "step": 15527 + }, + { + "epoch": 4.766114180478821, + "grad_norm": 0.22359825670719147, + 
"learning_rate": 5.620708627849769e-05, + "loss": 1.7356, + "step": 15528 + }, + { + "epoch": 4.7664211172498465, + "grad_norm": 0.2254783809185028, + "learning_rate": 5.620215413775609e-05, + "loss": 1.7397, + "step": 15529 + }, + { + "epoch": 4.766728054020872, + "grad_norm": 0.2827560305595398, + "learning_rate": 5.619722193572117e-05, + "loss": 1.732, + "step": 15530 + }, + { + "epoch": 4.767034990791897, + "grad_norm": 0.22591307759284973, + "learning_rate": 5.619228967244165e-05, + "loss": 1.7713, + "step": 15531 + }, + { + "epoch": 4.7673419275629225, + "grad_norm": 0.25872737169265747, + "learning_rate": 5.618735734796632e-05, + "loss": 1.7291, + "step": 15532 + }, + { + "epoch": 4.767648864333947, + "grad_norm": 0.24515275657176971, + "learning_rate": 5.6182424962343884e-05, + "loss": 1.8079, + "step": 15533 + }, + { + "epoch": 4.767955801104972, + "grad_norm": 0.2456643134355545, + "learning_rate": 5.617749251562309e-05, + "loss": 1.7082, + "step": 15534 + }, + { + "epoch": 4.768262737875998, + "grad_norm": 0.21684220433235168, + "learning_rate": 5.6172560007852716e-05, + "loss": 1.7563, + "step": 15535 + }, + { + "epoch": 4.768569674647023, + "grad_norm": 0.2141445428133011, + "learning_rate": 5.616762743908147e-05, + "loss": 1.7115, + "step": 15536 + }, + { + "epoch": 4.768876611418047, + "grad_norm": 0.22502638399600983, + "learning_rate": 5.616269480935812e-05, + "loss": 1.723, + "step": 15537 + }, + { + "epoch": 4.769183548189073, + "grad_norm": 0.23387989401817322, + "learning_rate": 5.6157762118731416e-05, + "loss": 1.7775, + "step": 15538 + }, + { + "epoch": 4.769490484960098, + "grad_norm": 0.19615057110786438, + "learning_rate": 5.6152829367250096e-05, + "loss": 1.7696, + "step": 15539 + }, + { + "epoch": 4.769797421731123, + "grad_norm": 0.2408154010772705, + "learning_rate": 5.614789655496289e-05, + "loss": 1.7758, + "step": 15540 + }, + { + "epoch": 4.770104358502149, + "grad_norm": 0.20994634926319122, + "learning_rate": 
5.614296368191859e-05, + "loss": 1.6935, + "step": 15541 + }, + { + "epoch": 4.770411295273174, + "grad_norm": 0.24135129153728485, + "learning_rate": 5.613803074816591e-05, + "loss": 1.7644, + "step": 15542 + }, + { + "epoch": 4.7707182320441985, + "grad_norm": 0.2380143105983734, + "learning_rate": 5.6133097753753625e-05, + "loss": 1.741, + "step": 15543 + }, + { + "epoch": 4.771025168815224, + "grad_norm": 0.30300623178482056, + "learning_rate": 5.6128164698730465e-05, + "loss": 1.7935, + "step": 15544 + }, + { + "epoch": 4.771332105586249, + "grad_norm": 0.2620760500431061, + "learning_rate": 5.612323158314519e-05, + "loss": 1.7436, + "step": 15545 + }, + { + "epoch": 4.7716390423572745, + "grad_norm": 0.3791491389274597, + "learning_rate": 5.6118298407046544e-05, + "loss": 1.7503, + "step": 15546 + }, + { + "epoch": 4.7719459791283, + "grad_norm": 0.3830909729003906, + "learning_rate": 5.61133651704833e-05, + "loss": 1.7651, + "step": 15547 + }, + { + "epoch": 4.772252915899324, + "grad_norm": 0.26680612564086914, + "learning_rate": 5.610843187350419e-05, + "loss": 1.8075, + "step": 15548 + }, + { + "epoch": 4.77255985267035, + "grad_norm": 0.38018953800201416, + "learning_rate": 5.610349851615798e-05, + "loss": 1.8301, + "step": 15549 + }, + { + "epoch": 4.772866789441375, + "grad_norm": 0.4514484107494354, + "learning_rate": 5.6098565098493414e-05, + "loss": 1.7709, + "step": 15550 + }, + { + "epoch": 4.7731737262124, + "grad_norm": 0.28267863392829895, + "learning_rate": 5.6093631620559254e-05, + "loss": 1.8087, + "step": 15551 + }, + { + "epoch": 4.773480662983426, + "grad_norm": 0.22541162371635437, + "learning_rate": 5.6088698082404256e-05, + "loss": 1.7457, + "step": 15552 + }, + { + "epoch": 4.773787599754451, + "grad_norm": 0.3012544512748718, + "learning_rate": 5.608376448407718e-05, + "loss": 1.7454, + "step": 15553 + }, + { + "epoch": 4.774094536525475, + "grad_norm": 0.2460169941186905, + "learning_rate": 5.607883082562677e-05, + "loss": 1.8237, + 
"step": 15554 + }, + { + "epoch": 4.774401473296501, + "grad_norm": 0.2918507158756256, + "learning_rate": 5.6073897107101804e-05, + "loss": 1.7416, + "step": 15555 + }, + { + "epoch": 4.774708410067526, + "grad_norm": 0.3104710280895233, + "learning_rate": 5.6068963328551016e-05, + "loss": 1.8162, + "step": 15556 + }, + { + "epoch": 4.7750153468385514, + "grad_norm": 0.2576459050178528, + "learning_rate": 5.606402949002317e-05, + "loss": 1.7732, + "step": 15557 + }, + { + "epoch": 4.775322283609577, + "grad_norm": 0.2373739629983902, + "learning_rate": 5.605909559156706e-05, + "loss": 1.7812, + "step": 15558 + }, + { + "epoch": 4.775629220380601, + "grad_norm": 0.30436694622039795, + "learning_rate": 5.6054161633231385e-05, + "loss": 1.7606, + "step": 15559 + }, + { + "epoch": 4.775936157151627, + "grad_norm": 0.3058558702468872, + "learning_rate": 5.604922761506495e-05, + "loss": 1.8384, + "step": 15560 + }, + { + "epoch": 4.776243093922652, + "grad_norm": 0.26421624422073364, + "learning_rate": 5.6044293537116496e-05, + "loss": 1.8041, + "step": 15561 + }, + { + "epoch": 4.776550030693677, + "grad_norm": 0.4945085346698761, + "learning_rate": 5.603935939943479e-05, + "loss": 1.7522, + "step": 15562 + }, + { + "epoch": 4.776856967464703, + "grad_norm": 0.41049134731292725, + "learning_rate": 5.6034425202068595e-05, + "loss": 1.7471, + "step": 15563 + }, + { + "epoch": 4.777163904235728, + "grad_norm": 0.22972853481769562, + "learning_rate": 5.602949094506668e-05, + "loss": 1.7041, + "step": 15564 + }, + { + "epoch": 4.777470841006752, + "grad_norm": 0.37373700737953186, + "learning_rate": 5.6024556628477785e-05, + "loss": 1.7811, + "step": 15565 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.3603375554084778, + "learning_rate": 5.6019622252350714e-05, + "loss": 1.8396, + "step": 15566 + }, + { + "epoch": 4.778084714548803, + "grad_norm": 0.2085956335067749, + "learning_rate": 5.601468781673419e-05, + "loss": 1.7453, + "step": 15567 + }, + { + "epoch": 
4.778391651319828, + "grad_norm": 0.28871405124664307, + "learning_rate": 5.6009753321677e-05, + "loss": 1.7135, + "step": 15568 + }, + { + "epoch": 4.778698588090853, + "grad_norm": 0.2378411591053009, + "learning_rate": 5.600481876722791e-05, + "loss": 1.77, + "step": 15569 + }, + { + "epoch": 4.779005524861878, + "grad_norm": 0.2902696430683136, + "learning_rate": 5.599988415343567e-05, + "loss": 1.7416, + "step": 15570 + }, + { + "epoch": 4.7793124616329035, + "grad_norm": 0.36155447363853455, + "learning_rate": 5.5994949480349066e-05, + "loss": 1.7095, + "step": 15571 + }, + { + "epoch": 4.779619398403929, + "grad_norm": 0.24867403507232666, + "learning_rate": 5.599001474801686e-05, + "loss": 1.8063, + "step": 15572 + }, + { + "epoch": 4.779926335174954, + "grad_norm": 0.24853186309337616, + "learning_rate": 5.5985079956487815e-05, + "loss": 1.7537, + "step": 15573 + }, + { + "epoch": 4.7802332719459795, + "grad_norm": 0.31984636187553406, + "learning_rate": 5.598014510581071e-05, + "loss": 1.7888, + "step": 15574 + }, + { + "epoch": 4.780540208717004, + "grad_norm": 0.23907123506069183, + "learning_rate": 5.597521019603429e-05, + "loss": 1.7157, + "step": 15575 + }, + { + "epoch": 4.780847145488029, + "grad_norm": 0.25759413838386536, + "learning_rate": 5.597027522720736e-05, + "loss": 1.7579, + "step": 15576 + }, + { + "epoch": 4.781154082259055, + "grad_norm": 0.34123921394348145, + "learning_rate": 5.5965340199378654e-05, + "loss": 1.838, + "step": 15577 + }, + { + "epoch": 4.78146101903008, + "grad_norm": 0.2769980728626251, + "learning_rate": 5.596040511259697e-05, + "loss": 1.7889, + "step": 15578 + }, + { + "epoch": 4.781767955801105, + "grad_norm": 0.21936915814876556, + "learning_rate": 5.5955469966911066e-05, + "loss": 1.7434, + "step": 15579 + }, + { + "epoch": 4.78207489257213, + "grad_norm": 0.27583181858062744, + "learning_rate": 5.59505347623697e-05, + "loss": 1.7229, + "step": 15580 + }, + { + "epoch": 4.782381829343155, + "grad_norm": 
0.24246171116828918, + "learning_rate": 5.594559949902168e-05, + "loss": 1.7368, + "step": 15581 + }, + { + "epoch": 4.78268876611418, + "grad_norm": 0.22705630958080292, + "learning_rate": 5.594066417691576e-05, + "loss": 1.7261, + "step": 15582 + }, + { + "epoch": 4.782995702885206, + "grad_norm": 0.23308728635311127, + "learning_rate": 5.593572879610072e-05, + "loss": 1.7451, + "step": 15583 + }, + { + "epoch": 4.783302639656231, + "grad_norm": 0.21654267609119415, + "learning_rate": 5.5930793356625324e-05, + "loss": 1.7133, + "step": 15584 + }, + { + "epoch": 4.783609576427256, + "grad_norm": 0.22884133458137512, + "learning_rate": 5.5925857858538347e-05, + "loss": 1.6899, + "step": 15585 + }, + { + "epoch": 4.783916513198281, + "grad_norm": 0.2396838665008545, + "learning_rate": 5.5920922301888555e-05, + "loss": 1.7837, + "step": 15586 + }, + { + "epoch": 4.784223449969306, + "grad_norm": 0.22941450774669647, + "learning_rate": 5.5915986686724765e-05, + "loss": 1.7443, + "step": 15587 + }, + { + "epoch": 4.7845303867403315, + "grad_norm": 0.23992502689361572, + "learning_rate": 5.591105101309572e-05, + "loss": 1.8054, + "step": 15588 + }, + { + "epoch": 4.784837323511357, + "grad_norm": 0.2540588974952698, + "learning_rate": 5.59061152810502e-05, + "loss": 1.855, + "step": 15589 + }, + { + "epoch": 4.785144260282382, + "grad_norm": 0.22691720724105835, + "learning_rate": 5.590117949063699e-05, + "loss": 1.7441, + "step": 15590 + }, + { + "epoch": 4.785451197053407, + "grad_norm": 0.23691289126873016, + "learning_rate": 5.5896243641904864e-05, + "loss": 1.8156, + "step": 15591 + }, + { + "epoch": 4.785758133824432, + "grad_norm": 0.2749332785606384, + "learning_rate": 5.589130773490261e-05, + "loss": 1.8157, + "step": 15592 + }, + { + "epoch": 4.786065070595457, + "grad_norm": 0.2435624748468399, + "learning_rate": 5.588637176967899e-05, + "loss": 1.7473, + "step": 15593 + }, + { + "epoch": 4.786372007366483, + "grad_norm": 0.22931383550167084, + 
"learning_rate": 5.5881435746282795e-05, + "loss": 1.7652, + "step": 15594 + }, + { + "epoch": 4.786678944137508, + "grad_norm": 0.23916593194007874, + "learning_rate": 5.587649966476282e-05, + "loss": 1.7415, + "step": 15595 + }, + { + "epoch": 4.786985880908533, + "grad_norm": 0.23483172059059143, + "learning_rate": 5.5871563525167814e-05, + "loss": 1.7308, + "step": 15596 + }, + { + "epoch": 4.787292817679558, + "grad_norm": 0.24850021302700043, + "learning_rate": 5.586662732754656e-05, + "loss": 1.8294, + "step": 15597 + }, + { + "epoch": 4.787599754450583, + "grad_norm": 0.2439260333776474, + "learning_rate": 5.586169107194788e-05, + "loss": 1.7599, + "step": 15598 + }, + { + "epoch": 4.787906691221608, + "grad_norm": 0.22379007935523987, + "learning_rate": 5.585675475842054e-05, + "loss": 1.7278, + "step": 15599 + }, + { + "epoch": 4.788213627992634, + "grad_norm": 0.2633908689022064, + "learning_rate": 5.58518183870133e-05, + "loss": 1.7318, + "step": 15600 + }, + { + "epoch": 4.788520564763659, + "grad_norm": 0.20992474257946014, + "learning_rate": 5.584688195777497e-05, + "loss": 1.7003, + "step": 15601 + }, + { + "epoch": 4.7888275015346835, + "grad_norm": 0.2460084706544876, + "learning_rate": 5.584194547075432e-05, + "loss": 1.78, + "step": 15602 + }, + { + "epoch": 4.789134438305709, + "grad_norm": 0.23955418169498444, + "learning_rate": 5.583700892600013e-05, + "loss": 1.7953, + "step": 15603 + }, + { + "epoch": 4.789441375076734, + "grad_norm": 0.2495713233947754, + "learning_rate": 5.583207232356121e-05, + "loss": 1.7874, + "step": 15604 + }, + { + "epoch": 4.7897483118477595, + "grad_norm": 0.22878028452396393, + "learning_rate": 5.5827135663486344e-05, + "loss": 1.7961, + "step": 15605 + }, + { + "epoch": 4.790055248618785, + "grad_norm": 0.2299363762140274, + "learning_rate": 5.582219894582429e-05, + "loss": 1.7497, + "step": 15606 + }, + { + "epoch": 4.79036218538981, + "grad_norm": 0.22896108031272888, + "learning_rate": 5.5817262170623865e-05, 
+ "loss": 1.7543, + "step": 15607 + }, + { + "epoch": 4.790669122160835, + "grad_norm": 0.2150495946407318, + "learning_rate": 5.581232533793383e-05, + "loss": 1.8034, + "step": 15608 + }, + { + "epoch": 4.79097605893186, + "grad_norm": 0.21317999064922333, + "learning_rate": 5.580738844780301e-05, + "loss": 1.7482, + "step": 15609 + }, + { + "epoch": 4.791282995702885, + "grad_norm": 0.21904391050338745, + "learning_rate": 5.580245150028016e-05, + "loss": 1.7647, + "step": 15610 + }, + { + "epoch": 4.791589932473911, + "grad_norm": 0.2026481032371521, + "learning_rate": 5.5797514495414095e-05, + "loss": 1.6997, + "step": 15611 + }, + { + "epoch": 4.791896869244935, + "grad_norm": 0.22508487105369568, + "learning_rate": 5.579257743325359e-05, + "loss": 1.8258, + "step": 15612 + }, + { + "epoch": 4.79220380601596, + "grad_norm": 0.2801211178302765, + "learning_rate": 5.5787640313847435e-05, + "loss": 1.6991, + "step": 15613 + }, + { + "epoch": 4.792510742786986, + "grad_norm": 0.2696724236011505, + "learning_rate": 5.578270313724442e-05, + "loss": 1.7339, + "step": 15614 + }, + { + "epoch": 4.792817679558011, + "grad_norm": 0.2909143269062042, + "learning_rate": 5.577776590349334e-05, + "loss": 1.8481, + "step": 15615 + }, + { + "epoch": 4.793124616329036, + "grad_norm": 0.21682757139205933, + "learning_rate": 5.5772828612643005e-05, + "loss": 1.759, + "step": 15616 + }, + { + "epoch": 4.793431553100062, + "grad_norm": 0.23074059188365936, + "learning_rate": 5.576789126474219e-05, + "loss": 1.7652, + "step": 15617 + }, + { + "epoch": 4.793738489871086, + "grad_norm": 0.24018999934196472, + "learning_rate": 5.576295385983969e-05, + "loss": 1.7986, + "step": 15618 + }, + { + "epoch": 4.7940454266421115, + "grad_norm": 0.23987948894500732, + "learning_rate": 5.575801639798431e-05, + "loss": 1.779, + "step": 15619 + }, + { + "epoch": 4.794352363413137, + "grad_norm": 0.2138533890247345, + "learning_rate": 5.575307887922482e-05, + "loss": 1.7097, + "step": 15620 + }, + { 
+ "epoch": 4.794659300184162, + "grad_norm": 0.1995106190443039, + "learning_rate": 5.5748141303610044e-05, + "loss": 1.6924, + "step": 15621 + }, + { + "epoch": 4.7949662369551875, + "grad_norm": 0.23547641932964325, + "learning_rate": 5.574320367118877e-05, + "loss": 1.8492, + "step": 15622 + }, + { + "epoch": 4.795273173726212, + "grad_norm": 0.22931239008903503, + "learning_rate": 5.5738265982009794e-05, + "loss": 1.8054, + "step": 15623 + }, + { + "epoch": 4.795580110497237, + "grad_norm": 0.19957222044467926, + "learning_rate": 5.573332823612191e-05, + "loss": 1.7464, + "step": 15624 + }, + { + "epoch": 4.795887047268263, + "grad_norm": 0.1990327090024948, + "learning_rate": 5.5728390433573905e-05, + "loss": 1.7438, + "step": 15625 + }, + { + "epoch": 4.796193984039288, + "grad_norm": 0.22276802361011505, + "learning_rate": 5.572345257441459e-05, + "loss": 1.7674, + "step": 15626 + }, + { + "epoch": 4.796500920810313, + "grad_norm": 0.2109617441892624, + "learning_rate": 5.571851465869277e-05, + "loss": 1.7577, + "step": 15627 + }, + { + "epoch": 4.796807857581339, + "grad_norm": 0.22917217016220093, + "learning_rate": 5.5713576686457234e-05, + "loss": 1.7478, + "step": 15628 + }, + { + "epoch": 4.797114794352363, + "grad_norm": 0.21016938984394073, + "learning_rate": 5.570863865775678e-05, + "loss": 1.8078, + "step": 15629 + }, + { + "epoch": 4.797421731123388, + "grad_norm": 0.22478216886520386, + "learning_rate": 5.5703700572640215e-05, + "loss": 1.7621, + "step": 15630 + }, + { + "epoch": 4.797728667894414, + "grad_norm": 0.26899904012680054, + "learning_rate": 5.569876243115634e-05, + "loss": 1.8065, + "step": 15631 + }, + { + "epoch": 4.798035604665439, + "grad_norm": 0.23187808692455292, + "learning_rate": 5.569382423335394e-05, + "loss": 1.7337, + "step": 15632 + }, + { + "epoch": 4.798342541436464, + "grad_norm": 0.2264855057001114, + "learning_rate": 5.568888597928185e-05, + "loss": 1.7879, + "step": 15633 + }, + { + "epoch": 4.798649478207489, + 
"grad_norm": 0.244137242436409, + "learning_rate": 5.568394766898886e-05, + "loss": 1.8307, + "step": 15634 + }, + { + "epoch": 4.798956414978514, + "grad_norm": 0.2400583177804947, + "learning_rate": 5.5679009302523744e-05, + "loss": 1.76, + "step": 15635 + }, + { + "epoch": 4.7992633517495396, + "grad_norm": 0.2324059158563614, + "learning_rate": 5.5674070879935347e-05, + "loss": 1.7594, + "step": 15636 + }, + { + "epoch": 4.799570288520565, + "grad_norm": 0.21753786504268646, + "learning_rate": 5.566913240127244e-05, + "loss": 1.7568, + "step": 15637 + }, + { + "epoch": 4.79987722529159, + "grad_norm": 0.21557624638080597, + "learning_rate": 5.566419386658386e-05, + "loss": 1.7733, + "step": 15638 + }, + { + "epoch": 4.800184162062616, + "grad_norm": 0.22795113921165466, + "learning_rate": 5.565925527591839e-05, + "loss": 1.7624, + "step": 15639 + }, + { + "epoch": 4.80049109883364, + "grad_norm": 0.23035180568695068, + "learning_rate": 5.565431662932484e-05, + "loss": 1.7436, + "step": 15640 + }, + { + "epoch": 4.800798035604665, + "grad_norm": 0.2569425404071808, + "learning_rate": 5.564937792685203e-05, + "loss": 1.7027, + "step": 15641 + }, + { + "epoch": 4.801104972375691, + "grad_norm": 0.20544980466365814, + "learning_rate": 5.564443916854875e-05, + "loss": 1.7125, + "step": 15642 + }, + { + "epoch": 4.801411909146716, + "grad_norm": 0.25040850043296814, + "learning_rate": 5.5639500354463815e-05, + "loss": 1.7646, + "step": 15643 + }, + { + "epoch": 4.8017188459177405, + "grad_norm": 0.1991344839334488, + "learning_rate": 5.563456148464602e-05, + "loss": 1.7206, + "step": 15644 + }, + { + "epoch": 4.802025782688766, + "grad_norm": 0.236537903547287, + "learning_rate": 5.56296225591442e-05, + "loss": 1.7288, + "step": 15645 + }, + { + "epoch": 4.802332719459791, + "grad_norm": 0.253619521856308, + "learning_rate": 5.562468357800714e-05, + "loss": 1.7347, + "step": 15646 + }, + { + "epoch": 4.8026396562308165, + "grad_norm": 0.22038741409778595, + 
"learning_rate": 5.561974454128367e-05, + "loss": 1.7854, + "step": 15647 + }, + { + "epoch": 4.802946593001842, + "grad_norm": 0.24848157167434692, + "learning_rate": 5.5614805449022576e-05, + "loss": 1.6904, + "step": 15648 + }, + { + "epoch": 4.803253529772867, + "grad_norm": 0.28735271096229553, + "learning_rate": 5.56098663012727e-05, + "loss": 1.7476, + "step": 15649 + }, + { + "epoch": 4.803560466543892, + "grad_norm": 0.2658432722091675, + "learning_rate": 5.5604927098082825e-05, + "loss": 1.7314, + "step": 15650 + }, + { + "epoch": 4.803867403314917, + "grad_norm": 0.20409154891967773, + "learning_rate": 5.559998783950179e-05, + "loss": 1.7698, + "step": 15651 + }, + { + "epoch": 4.804174340085942, + "grad_norm": 0.21932728588581085, + "learning_rate": 5.5595048525578384e-05, + "loss": 1.7808, + "step": 15652 + }, + { + "epoch": 4.804481276856968, + "grad_norm": 0.2549879848957062, + "learning_rate": 5.559010915636143e-05, + "loss": 1.8294, + "step": 15653 + }, + { + "epoch": 4.804788213627993, + "grad_norm": 0.2002289742231369, + "learning_rate": 5.5585169731899736e-05, + "loss": 1.732, + "step": 15654 + }, + { + "epoch": 4.805095150399017, + "grad_norm": 0.19988931715488434, + "learning_rate": 5.558023025224212e-05, + "loss": 1.7482, + "step": 15655 + }, + { + "epoch": 4.805402087170043, + "grad_norm": 0.21265259385108948, + "learning_rate": 5.55752907174374e-05, + "loss": 1.8003, + "step": 15656 + }, + { + "epoch": 4.805709023941068, + "grad_norm": 0.22365640103816986, + "learning_rate": 5.5570351127534395e-05, + "loss": 1.7536, + "step": 15657 + }, + { + "epoch": 4.806015960712093, + "grad_norm": 0.25516408681869507, + "learning_rate": 5.556541148258192e-05, + "loss": 1.7648, + "step": 15658 + }, + { + "epoch": 4.806322897483119, + "grad_norm": 0.24870765209197998, + "learning_rate": 5.5560471782628775e-05, + "loss": 1.7793, + "step": 15659 + }, + { + "epoch": 4.806629834254144, + "grad_norm": 0.22119416296482086, + "learning_rate": 
5.555553202772379e-05, + "loss": 1.7464, + "step": 15660 + }, + { + "epoch": 4.8069367710251685, + "grad_norm": 0.2781904637813568, + "learning_rate": 5.555059221791579e-05, + "loss": 1.7537, + "step": 15661 + }, + { + "epoch": 4.807243707796194, + "grad_norm": 0.2433774471282959, + "learning_rate": 5.5545652353253574e-05, + "loss": 1.74, + "step": 15662 + }, + { + "epoch": 4.807550644567219, + "grad_norm": 0.19932180643081665, + "learning_rate": 5.554071243378598e-05, + "loss": 1.75, + "step": 15663 + }, + { + "epoch": 4.8078575813382445, + "grad_norm": 0.2428865283727646, + "learning_rate": 5.553577245956182e-05, + "loss": 1.7198, + "step": 15664 + }, + { + "epoch": 4.80816451810927, + "grad_norm": 0.2914198338985443, + "learning_rate": 5.553083243062991e-05, + "loss": 1.7544, + "step": 15665 + }, + { + "epoch": 4.808471454880294, + "grad_norm": 0.2274291068315506, + "learning_rate": 5.5525892347039056e-05, + "loss": 1.8213, + "step": 15666 + }, + { + "epoch": 4.80877839165132, + "grad_norm": 0.23662471771240234, + "learning_rate": 5.552095220883811e-05, + "loss": 1.8025, + "step": 15667 + }, + { + "epoch": 4.809085328422345, + "grad_norm": 0.23062555491924286, + "learning_rate": 5.551601201607587e-05, + "loss": 1.7109, + "step": 15668 + }, + { + "epoch": 4.80939226519337, + "grad_norm": 0.19986943900585175, + "learning_rate": 5.551107176880117e-05, + "loss": 1.7442, + "step": 15669 + }, + { + "epoch": 4.809699201964396, + "grad_norm": 0.2545560300350189, + "learning_rate": 5.5506131467062836e-05, + "loss": 1.7609, + "step": 15670 + }, + { + "epoch": 4.810006138735421, + "grad_norm": 0.253296434879303, + "learning_rate": 5.550119111090968e-05, + "loss": 1.7307, + "step": 15671 + }, + { + "epoch": 4.810313075506445, + "grad_norm": 0.19617940485477448, + "learning_rate": 5.549625070039052e-05, + "loss": 1.7507, + "step": 15672 + }, + { + "epoch": 4.810620012277471, + "grad_norm": 0.2525297999382019, + "learning_rate": 5.5491310235554193e-05, + "loss": 1.8021, + 
"step": 15673 + }, + { + "epoch": 4.810926949048496, + "grad_norm": 0.20537389814853668, + "learning_rate": 5.548636971644953e-05, + "loss": 1.7432, + "step": 15674 + }, + { + "epoch": 4.811233885819521, + "grad_norm": 0.19924211502075195, + "learning_rate": 5.548142914312533e-05, + "loss": 1.7741, + "step": 15675 + }, + { + "epoch": 4.811540822590547, + "grad_norm": 0.21121448278427124, + "learning_rate": 5.547648851563046e-05, + "loss": 1.7198, + "step": 15676 + }, + { + "epoch": 4.811847759361571, + "grad_norm": 0.23504914343357086, + "learning_rate": 5.547154783401369e-05, + "loss": 1.7173, + "step": 15677 + }, + { + "epoch": 4.8121546961325965, + "grad_norm": 0.2362392097711563, + "learning_rate": 5.54666070983239e-05, + "loss": 1.7752, + "step": 15678 + }, + { + "epoch": 4.812461632903622, + "grad_norm": 0.2524966895580292, + "learning_rate": 5.5461666308609886e-05, + "loss": 1.7943, + "step": 15679 + }, + { + "epoch": 4.812768569674647, + "grad_norm": 0.2250952422618866, + "learning_rate": 5.5456725464920476e-05, + "loss": 1.7606, + "step": 15680 + }, + { + "epoch": 4.8130755064456725, + "grad_norm": 0.21753156185150146, + "learning_rate": 5.5451784567304524e-05, + "loss": 1.7846, + "step": 15681 + }, + { + "epoch": 4.813382443216698, + "grad_norm": 0.220795676112175, + "learning_rate": 5.5446843615810825e-05, + "loss": 1.7422, + "step": 15682 + }, + { + "epoch": 4.813689379987722, + "grad_norm": 0.23597733676433563, + "learning_rate": 5.544190261048823e-05, + "loss": 1.7818, + "step": 15683 + }, + { + "epoch": 4.813996316758748, + "grad_norm": 0.2625976502895355, + "learning_rate": 5.543696155138557e-05, + "loss": 1.7796, + "step": 15684 + }, + { + "epoch": 4.814303253529773, + "grad_norm": 0.20515871047973633, + "learning_rate": 5.5432020438551656e-05, + "loss": 1.7096, + "step": 15685 + }, + { + "epoch": 4.814610190300798, + "grad_norm": 0.19353924691677094, + "learning_rate": 5.542707927203536e-05, + "loss": 1.7541, + "step": 15686 + }, + { + "epoch": 
4.814917127071823, + "grad_norm": 0.21998172998428345, + "learning_rate": 5.5422138051885454e-05, + "loss": 1.7696, + "step": 15687 + }, + { + "epoch": 4.815224063842848, + "grad_norm": 0.27576857805252075, + "learning_rate": 5.5417196778150816e-05, + "loss": 1.7491, + "step": 15688 + }, + { + "epoch": 4.815531000613873, + "grad_norm": 0.28202036023139954, + "learning_rate": 5.5412255450880254e-05, + "loss": 1.8615, + "step": 15689 + }, + { + "epoch": 4.815837937384899, + "grad_norm": 0.29632845520973206, + "learning_rate": 5.540731407012263e-05, + "loss": 1.7698, + "step": 15690 + }, + { + "epoch": 4.816144874155924, + "grad_norm": 0.35393890738487244, + "learning_rate": 5.540237263592675e-05, + "loss": 1.7924, + "step": 15691 + }, + { + "epoch": 4.816451810926949, + "grad_norm": 0.23756493628025055, + "learning_rate": 5.5397431148341447e-05, + "loss": 1.8301, + "step": 15692 + }, + { + "epoch": 4.816758747697974, + "grad_norm": 0.310153603553772, + "learning_rate": 5.53924896074156e-05, + "loss": 1.8162, + "step": 15693 + }, + { + "epoch": 4.817065684468999, + "grad_norm": 0.3355565369129181, + "learning_rate": 5.538754801319797e-05, + "loss": 1.7738, + "step": 15694 + }, + { + "epoch": 4.8173726212400245, + "grad_norm": 0.2360079288482666, + "learning_rate": 5.5382606365737446e-05, + "loss": 1.6883, + "step": 15695 + }, + { + "epoch": 4.81767955801105, + "grad_norm": 0.2932819724082947, + "learning_rate": 5.537766466508286e-05, + "loss": 1.8045, + "step": 15696 + }, + { + "epoch": 4.817986494782075, + "grad_norm": 0.31298181414604187, + "learning_rate": 5.537272291128304e-05, + "loss": 1.7516, + "step": 15697 + }, + { + "epoch": 4.8182934315531, + "grad_norm": 0.22871924936771393, + "learning_rate": 5.5367781104386806e-05, + "loss": 1.7386, + "step": 15698 + }, + { + "epoch": 4.818600368324125, + "grad_norm": 0.27097782492637634, + "learning_rate": 5.5362839244443034e-05, + "loss": 1.733, + "step": 15699 + }, + { + "epoch": 4.81890730509515, + "grad_norm": 
0.23296736180782318, + "learning_rate": 5.535789733150052e-05, + "loss": 1.7735, + "step": 15700 + }, + { + "epoch": 4.819214241866176, + "grad_norm": 0.22650237381458282, + "learning_rate": 5.5352955365608125e-05, + "loss": 1.7443, + "step": 15701 + }, + { + "epoch": 4.819521178637201, + "grad_norm": 0.25525161623954773, + "learning_rate": 5.534801334681471e-05, + "loss": 1.7379, + "step": 15702 + }, + { + "epoch": 4.819828115408226, + "grad_norm": 0.2249457836151123, + "learning_rate": 5.534307127516908e-05, + "loss": 1.7393, + "step": 15703 + }, + { + "epoch": 4.820135052179251, + "grad_norm": 0.1995566338300705, + "learning_rate": 5.5338129150720084e-05, + "loss": 1.7411, + "step": 15704 + }, + { + "epoch": 4.820441988950276, + "grad_norm": 0.250851035118103, + "learning_rate": 5.533318697351657e-05, + "loss": 1.7801, + "step": 15705 + }, + { + "epoch": 4.820748925721301, + "grad_norm": 0.3175830543041229, + "learning_rate": 5.532824474360737e-05, + "loss": 1.7553, + "step": 15706 + }, + { + "epoch": 4.821055862492327, + "grad_norm": 0.22842039167881012, + "learning_rate": 5.532330246104134e-05, + "loss": 1.7489, + "step": 15707 + }, + { + "epoch": 4.821362799263352, + "grad_norm": 0.21125485002994537, + "learning_rate": 5.531836012586732e-05, + "loss": 1.7543, + "step": 15708 + }, + { + "epoch": 4.8216697360343765, + "grad_norm": 0.33028700947761536, + "learning_rate": 5.531341773813414e-05, + "loss": 1.8237, + "step": 15709 + }, + { + "epoch": 4.821976672805402, + "grad_norm": 0.324564129114151, + "learning_rate": 5.530847529789067e-05, + "loss": 1.7288, + "step": 15710 + }, + { + "epoch": 4.822283609576427, + "grad_norm": 0.3299528956413269, + "learning_rate": 5.530353280518571e-05, + "loss": 1.7536, + "step": 15711 + }, + { + "epoch": 4.8225905463474525, + "grad_norm": 0.3535030782222748, + "learning_rate": 5.5298590260068136e-05, + "loss": 1.7941, + "step": 15712 + }, + { + "epoch": 4.822897483118478, + "grad_norm": 0.2627669870853424, + "learning_rate": 
5.5293647662586804e-05, + "loss": 1.7638, + "step": 15713 + }, + { + "epoch": 4.823204419889503, + "grad_norm": 0.25569450855255127, + "learning_rate": 5.5288705012790535e-05, + "loss": 1.7396, + "step": 15714 + }, + { + "epoch": 4.823511356660528, + "grad_norm": 0.26099520921707153, + "learning_rate": 5.528376231072817e-05, + "loss": 1.7415, + "step": 15715 + }, + { + "epoch": 4.823818293431553, + "grad_norm": 0.31833693385124207, + "learning_rate": 5.527881955644858e-05, + "loss": 1.7683, + "step": 15716 + }, + { + "epoch": 4.824125230202578, + "grad_norm": 0.2753448188304901, + "learning_rate": 5.5273876750000594e-05, + "loss": 1.6653, + "step": 15717 + }, + { + "epoch": 4.824432166973604, + "grad_norm": 0.23816895484924316, + "learning_rate": 5.526893389143307e-05, + "loss": 1.7575, + "step": 15718 + }, + { + "epoch": 4.824739103744628, + "grad_norm": 0.25376051664352417, + "learning_rate": 5.5263990980794856e-05, + "loss": 1.755, + "step": 15719 + }, + { + "epoch": 4.8250460405156534, + "grad_norm": 0.2483726590871811, + "learning_rate": 5.52590480181348e-05, + "loss": 1.7566, + "step": 15720 + }, + { + "epoch": 4.825352977286679, + "grad_norm": 0.2073517143726349, + "learning_rate": 5.5254105003501746e-05, + "loss": 1.7069, + "step": 15721 + }, + { + "epoch": 4.825659914057704, + "grad_norm": 0.3166659474372864, + "learning_rate": 5.524916193694455e-05, + "loss": 1.7012, + "step": 15722 + }, + { + "epoch": 4.8259668508287294, + "grad_norm": 0.24518641829490662, + "learning_rate": 5.524421881851205e-05, + "loss": 1.7027, + "step": 15723 + }, + { + "epoch": 4.826273787599755, + "grad_norm": 0.23137906193733215, + "learning_rate": 5.523927564825311e-05, + "loss": 1.746, + "step": 15724 + }, + { + "epoch": 4.82658072437078, + "grad_norm": 0.27937051653862, + "learning_rate": 5.5234332426216586e-05, + "loss": 1.7064, + "step": 15725 + }, + { + "epoch": 4.826887661141805, + "grad_norm": 0.26408496499061584, + "learning_rate": 5.522938915245131e-05, + "loss": 
1.6598, + "step": 15726 + }, + { + "epoch": 4.82719459791283, + "grad_norm": 0.22269997000694275, + "learning_rate": 5.5224445827006164e-05, + "loss": 1.7166, + "step": 15727 + }, + { + "epoch": 4.827501534683855, + "grad_norm": 0.22687453031539917, + "learning_rate": 5.5219502449929964e-05, + "loss": 1.7156, + "step": 15728 + }, + { + "epoch": 4.827808471454881, + "grad_norm": 0.26355600357055664, + "learning_rate": 5.5214559021271585e-05, + "loss": 1.8016, + "step": 15729 + }, + { + "epoch": 4.828115408225905, + "grad_norm": 0.30103012919425964, + "learning_rate": 5.520961554107987e-05, + "loss": 1.7856, + "step": 15730 + }, + { + "epoch": 4.82842234499693, + "grad_norm": 0.22604018449783325, + "learning_rate": 5.520467200940369e-05, + "loss": 1.813, + "step": 15731 + }, + { + "epoch": 4.828729281767956, + "grad_norm": 0.25435203313827515, + "learning_rate": 5.51997284262919e-05, + "loss": 1.7511, + "step": 15732 + }, + { + "epoch": 4.829036218538981, + "grad_norm": 0.2740691304206848, + "learning_rate": 5.519478479179333e-05, + "loss": 1.7326, + "step": 15733 + }, + { + "epoch": 4.829343155310006, + "grad_norm": 0.19710861146450043, + "learning_rate": 5.5189841105956866e-05, + "loss": 1.7581, + "step": 15734 + }, + { + "epoch": 4.829650092081032, + "grad_norm": 0.2315293401479721, + "learning_rate": 5.518489736883132e-05, + "loss": 1.6796, + "step": 15735 + }, + { + "epoch": 4.829957028852056, + "grad_norm": 0.2465476542711258, + "learning_rate": 5.51799535804656e-05, + "loss": 1.7276, + "step": 15736 + }, + { + "epoch": 4.8302639656230815, + "grad_norm": 0.20438486337661743, + "learning_rate": 5.5175009740908546e-05, + "loss": 1.7188, + "step": 15737 + }, + { + "epoch": 4.830570902394107, + "grad_norm": 0.24328351020812988, + "learning_rate": 5.5170065850209016e-05, + "loss": 1.7165, + "step": 15738 + }, + { + "epoch": 4.830877839165132, + "grad_norm": 0.22486837208271027, + "learning_rate": 5.516512190841586e-05, + "loss": 1.7369, + "step": 15739 + }, + { + 
"epoch": 4.8311847759361575, + "grad_norm": 0.2065822333097458, + "learning_rate": 5.5160177915577934e-05, + "loss": 1.7125, + "step": 15740 + }, + { + "epoch": 4.831491712707182, + "grad_norm": 0.21223095059394836, + "learning_rate": 5.5155233871744104e-05, + "loss": 1.7319, + "step": 15741 + }, + { + "epoch": 4.831798649478207, + "grad_norm": 0.25712934136390686, + "learning_rate": 5.515028977696325e-05, + "loss": 1.7847, + "step": 15742 + }, + { + "epoch": 4.832105586249233, + "grad_norm": 0.21289978921413422, + "learning_rate": 5.5145345631284215e-05, + "loss": 1.7629, + "step": 15743 + }, + { + "epoch": 4.832412523020258, + "grad_norm": 0.22347134351730347, + "learning_rate": 5.514040143475585e-05, + "loss": 1.7491, + "step": 15744 + }, + { + "epoch": 4.832719459791283, + "grad_norm": 0.20660510659217834, + "learning_rate": 5.513545718742702e-05, + "loss": 1.7377, + "step": 15745 + }, + { + "epoch": 4.833026396562309, + "grad_norm": 0.21612273156642914, + "learning_rate": 5.513051288934658e-05, + "loss": 1.7973, + "step": 15746 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.22515933215618134, + "learning_rate": 5.512556854056342e-05, + "loss": 1.7774, + "step": 15747 + }, + { + "epoch": 4.833640270104358, + "grad_norm": 0.21075554192066193, + "learning_rate": 5.512062414112639e-05, + "loss": 1.7741, + "step": 15748 + }, + { + "epoch": 4.833947206875384, + "grad_norm": 0.2203720659017563, + "learning_rate": 5.511567969108436e-05, + "loss": 1.7902, + "step": 15749 + }, + { + "epoch": 4.834254143646409, + "grad_norm": 0.20247167348861694, + "learning_rate": 5.511073519048616e-05, + "loss": 1.7084, + "step": 15750 + }, + { + "epoch": 4.834561080417434, + "grad_norm": 0.247711181640625, + "learning_rate": 5.5105790639380695e-05, + "loss": 1.8465, + "step": 15751 + }, + { + "epoch": 4.834868017188459, + "grad_norm": 0.22866854071617126, + "learning_rate": 5.51008460378168e-05, + "loss": 1.7252, + "step": 15752 + }, + { + "epoch": 4.835174953959484, + 
"grad_norm": 0.2335643470287323, + "learning_rate": 5.5095901385843374e-05, + "loss": 1.703, + "step": 15753 + }, + { + "epoch": 4.8354818907305095, + "grad_norm": 0.20874348282814026, + "learning_rate": 5.509095668350926e-05, + "loss": 1.7114, + "step": 15754 + }, + { + "epoch": 4.835788827501535, + "grad_norm": 0.19156917929649353, + "learning_rate": 5.5086011930863314e-05, + "loss": 1.6975, + "step": 15755 + }, + { + "epoch": 4.83609576427256, + "grad_norm": 0.23480524122714996, + "learning_rate": 5.508106712795443e-05, + "loss": 1.8291, + "step": 15756 + }, + { + "epoch": 4.8364027010435855, + "grad_norm": 0.20430417358875275, + "learning_rate": 5.5076122274831454e-05, + "loss": 1.7605, + "step": 15757 + }, + { + "epoch": 4.83670963781461, + "grad_norm": 0.26790598034858704, + "learning_rate": 5.5071177371543256e-05, + "loss": 1.7541, + "step": 15758 + }, + { + "epoch": 4.837016574585635, + "grad_norm": 0.3339289724826813, + "learning_rate": 5.506623241813873e-05, + "loss": 1.7566, + "step": 15759 + }, + { + "epoch": 4.837323511356661, + "grad_norm": 0.30528193712234497, + "learning_rate": 5.5061287414666726e-05, + "loss": 1.7371, + "step": 15760 + }, + { + "epoch": 4.837630448127686, + "grad_norm": 0.21059657633304596, + "learning_rate": 5.5056342361176114e-05, + "loss": 1.7599, + "step": 15761 + }, + { + "epoch": 4.83793738489871, + "grad_norm": 0.27918973565101624, + "learning_rate": 5.5051397257715756e-05, + "loss": 1.7485, + "step": 15762 + }, + { + "epoch": 4.838244321669736, + "grad_norm": 0.23147793114185333, + "learning_rate": 5.5046452104334514e-05, + "loss": 1.7121, + "step": 15763 + }, + { + "epoch": 4.838551258440761, + "grad_norm": 0.22028742730617523, + "learning_rate": 5.5041506901081294e-05, + "loss": 1.803, + "step": 15764 + }, + { + "epoch": 4.838858195211786, + "grad_norm": 0.22840891778469086, + "learning_rate": 5.5036561648004946e-05, + "loss": 1.7555, + "step": 15765 + }, + { + "epoch": 4.839165131982812, + "grad_norm": 
0.2610893249511719, + "learning_rate": 5.503161634515433e-05, + "loss": 1.7873, + "step": 15766 + }, + { + "epoch": 4.839472068753837, + "grad_norm": 0.2530003786087036, + "learning_rate": 5.502667099257836e-05, + "loss": 1.7604, + "step": 15767 + }, + { + "epoch": 4.8397790055248615, + "grad_norm": 0.20120400190353394, + "learning_rate": 5.5021725590325854e-05, + "loss": 1.7476, + "step": 15768 + }, + { + "epoch": 4.840085942295887, + "grad_norm": 0.2189723700284958, + "learning_rate": 5.501678013844571e-05, + "loss": 1.7174, + "step": 15769 + }, + { + "epoch": 4.840392879066912, + "grad_norm": 0.2511899173259735, + "learning_rate": 5.501183463698683e-05, + "loss": 1.7589, + "step": 15770 + }, + { + "epoch": 4.8406998158379375, + "grad_norm": 0.24899333715438843, + "learning_rate": 5.5006889085998035e-05, + "loss": 1.7253, + "step": 15771 + }, + { + "epoch": 4.841006752608963, + "grad_norm": 0.21223559975624084, + "learning_rate": 5.5001943485528254e-05, + "loss": 1.6949, + "step": 15772 + }, + { + "epoch": 4.841313689379987, + "grad_norm": 0.21394596993923187, + "learning_rate": 5.499699783562632e-05, + "loss": 1.7827, + "step": 15773 + }, + { + "epoch": 4.841620626151013, + "grad_norm": 0.2379613220691681, + "learning_rate": 5.4992052136341134e-05, + "loss": 1.7968, + "step": 15774 + }, + { + "epoch": 4.841927562922038, + "grad_norm": 0.23748385906219482, + "learning_rate": 5.498710638772154e-05, + "loss": 1.797, + "step": 15775 + }, + { + "epoch": 4.842234499693063, + "grad_norm": 0.2502206265926361, + "learning_rate": 5.498216058981646e-05, + "loss": 1.7292, + "step": 15776 + }, + { + "epoch": 4.842541436464089, + "grad_norm": 0.23613516986370087, + "learning_rate": 5.497721474267475e-05, + "loss": 1.7353, + "step": 15777 + }, + { + "epoch": 4.842848373235114, + "grad_norm": 0.25274696946144104, + "learning_rate": 5.497226884634527e-05, + "loss": 1.7782, + "step": 15778 + }, + { + "epoch": 4.843155310006138, + "grad_norm": 0.19574183225631714, + 
"learning_rate": 5.496732290087694e-05, + "loss": 1.6926, + "step": 15779 + }, + { + "epoch": 4.843462246777164, + "grad_norm": 0.21040405333042145, + "learning_rate": 5.496237690631858e-05, + "loss": 1.7235, + "step": 15780 + }, + { + "epoch": 4.843769183548189, + "grad_norm": 0.22499679028987885, + "learning_rate": 5.495743086271913e-05, + "loss": 1.7889, + "step": 15781 + }, + { + "epoch": 4.844076120319214, + "grad_norm": 0.24623246490955353, + "learning_rate": 5.4952484770127433e-05, + "loss": 1.7357, + "step": 15782 + }, + { + "epoch": 4.84438305709024, + "grad_norm": 0.21706275641918182, + "learning_rate": 5.494753862859238e-05, + "loss": 1.7349, + "step": 15783 + }, + { + "epoch": 4.844689993861264, + "grad_norm": 0.20705166459083557, + "learning_rate": 5.4942592438162855e-05, + "loss": 1.7047, + "step": 15784 + }, + { + "epoch": 4.8449969306322895, + "grad_norm": 0.21216751635074615, + "learning_rate": 5.493764619888773e-05, + "loss": 1.7335, + "step": 15785 + }, + { + "epoch": 4.845303867403315, + "grad_norm": 0.2945895195007324, + "learning_rate": 5.493269991081588e-05, + "loss": 1.838, + "step": 15786 + }, + { + "epoch": 4.84561080417434, + "grad_norm": 0.22013652324676514, + "learning_rate": 5.492775357399621e-05, + "loss": 1.7541, + "step": 15787 + }, + { + "epoch": 4.8459177409453655, + "grad_norm": 0.25428512692451477, + "learning_rate": 5.4922807188477585e-05, + "loss": 1.7405, + "step": 15788 + }, + { + "epoch": 4.846224677716391, + "grad_norm": 0.23189012706279755, + "learning_rate": 5.49178607543089e-05, + "loss": 1.8075, + "step": 15789 + }, + { + "epoch": 4.846531614487415, + "grad_norm": 0.21637389063835144, + "learning_rate": 5.491291427153904e-05, + "loss": 1.7229, + "step": 15790 + }, + { + "epoch": 4.846838551258441, + "grad_norm": 0.20628009736537933, + "learning_rate": 5.490796774021687e-05, + "loss": 1.7605, + "step": 15791 + }, + { + "epoch": 4.847145488029466, + "grad_norm": 0.20845308899879456, + "learning_rate": 
5.4903021160391276e-05, + "loss": 1.7864, + "step": 15792 + }, + { + "epoch": 4.847452424800491, + "grad_norm": 0.20367322862148285, + "learning_rate": 5.4898074532111164e-05, + "loss": 1.733, + "step": 15793 + }, + { + "epoch": 4.847759361571516, + "grad_norm": 0.2066505253314972, + "learning_rate": 5.489312785542543e-05, + "loss": 1.7113, + "step": 15794 + }, + { + "epoch": 4.848066298342541, + "grad_norm": 0.23874987661838531, + "learning_rate": 5.488818113038292e-05, + "loss": 1.7735, + "step": 15795 + }, + { + "epoch": 4.848373235113566, + "grad_norm": 0.26583850383758545, + "learning_rate": 5.488323435703254e-05, + "loss": 1.8019, + "step": 15796 + }, + { + "epoch": 4.848680171884592, + "grad_norm": 0.25207552313804626, + "learning_rate": 5.487828753542317e-05, + "loss": 1.7491, + "step": 15797 + }, + { + "epoch": 4.848987108655617, + "grad_norm": 0.23065905272960663, + "learning_rate": 5.48733406656037e-05, + "loss": 1.7451, + "step": 15798 + }, + { + "epoch": 4.849294045426642, + "grad_norm": 0.26914483308792114, + "learning_rate": 5.486839374762304e-05, + "loss": 1.7553, + "step": 15799 + }, + { + "epoch": 4.849600982197668, + "grad_norm": 0.2509605884552002, + "learning_rate": 5.4863446781530046e-05, + "loss": 1.7124, + "step": 15800 + }, + { + "epoch": 4.849907918968692, + "grad_norm": 0.2618432343006134, + "learning_rate": 5.485849976737362e-05, + "loss": 1.7368, + "step": 15801 + }, + { + "epoch": 4.850214855739718, + "grad_norm": 0.46875160932540894, + "learning_rate": 5.485355270520266e-05, + "loss": 1.7883, + "step": 15802 + }, + { + "epoch": 4.850521792510743, + "grad_norm": 0.37585484981536865, + "learning_rate": 5.4848605595066025e-05, + "loss": 1.7894, + "step": 15803 + }, + { + "epoch": 4.850828729281768, + "grad_norm": 0.2244408279657364, + "learning_rate": 5.4843658437012646e-05, + "loss": 1.7394, + "step": 15804 + }, + { + "epoch": 4.851135666052793, + "grad_norm": 0.4061773419380188, + "learning_rate": 5.48387112310914e-05, + "loss": 
1.7703, + "step": 15805 + }, + { + "epoch": 4.851442602823818, + "grad_norm": 0.35925009846687317, + "learning_rate": 5.483376397735117e-05, + "loss": 1.7798, + "step": 15806 + }, + { + "epoch": 4.851749539594843, + "grad_norm": 0.23050184547901154, + "learning_rate": 5.482881667584084e-05, + "loss": 1.7984, + "step": 15807 + }, + { + "epoch": 4.852056476365869, + "grad_norm": 0.37308645248413086, + "learning_rate": 5.4823869326609335e-05, + "loss": 1.6747, + "step": 15808 + }, + { + "epoch": 4.852363413136894, + "grad_norm": 0.29826754331588745, + "learning_rate": 5.481892192970551e-05, + "loss": 1.7432, + "step": 15809 + }, + { + "epoch": 4.852670349907919, + "grad_norm": 0.23652370274066925, + "learning_rate": 5.4813974485178266e-05, + "loss": 1.7557, + "step": 15810 + }, + { + "epoch": 4.852977286678944, + "grad_norm": 0.40549808740615845, + "learning_rate": 5.4809026993076526e-05, + "loss": 1.7317, + "step": 15811 + }, + { + "epoch": 4.853284223449969, + "grad_norm": 0.3367961347103119, + "learning_rate": 5.4804079453449156e-05, + "loss": 1.7648, + "step": 15812 + }, + { + "epoch": 4.8535911602209945, + "grad_norm": 0.21629661321640015, + "learning_rate": 5.4799131866345055e-05, + "loss": 1.7986, + "step": 15813 + }, + { + "epoch": 4.85389809699202, + "grad_norm": 0.26381492614746094, + "learning_rate": 5.4794184231813105e-05, + "loss": 1.7401, + "step": 15814 + }, + { + "epoch": 4.854205033763045, + "grad_norm": 0.22319363057613373, + "learning_rate": 5.478923654990223e-05, + "loss": 1.7773, + "step": 15815 + }, + { + "epoch": 4.85451197053407, + "grad_norm": 0.2547159492969513, + "learning_rate": 5.4784288820661326e-05, + "loss": 1.8194, + "step": 15816 + }, + { + "epoch": 4.854818907305095, + "grad_norm": 0.29574522376060486, + "learning_rate": 5.477934104413925e-05, + "loss": 1.7351, + "step": 15817 + }, + { + "epoch": 4.85512584407612, + "grad_norm": 0.17389361560344696, + "learning_rate": 5.4774393220384945e-05, + "loss": 1.6957, + "step": 15818 + }, + { 
+ "epoch": 4.855432780847146, + "grad_norm": 0.23746751248836517, + "learning_rate": 5.476944534944728e-05, + "loss": 1.7713, + "step": 15819 + }, + { + "epoch": 4.855739717618171, + "grad_norm": 0.182356595993042, + "learning_rate": 5.476449743137516e-05, + "loss": 1.7144, + "step": 15820 + }, + { + "epoch": 4.856046654389196, + "grad_norm": 0.23716382682323456, + "learning_rate": 5.4759549466217475e-05, + "loss": 1.7451, + "step": 15821 + }, + { + "epoch": 4.856353591160221, + "grad_norm": 0.316806823015213, + "learning_rate": 5.475460145402313e-05, + "loss": 1.7823, + "step": 15822 + }, + { + "epoch": 4.856660527931246, + "grad_norm": 0.2333129197359085, + "learning_rate": 5.474965339484105e-05, + "loss": 1.7788, + "step": 15823 + }, + { + "epoch": 4.856967464702271, + "grad_norm": 0.21180212497711182, + "learning_rate": 5.47447052887201e-05, + "loss": 1.7513, + "step": 15824 + }, + { + "epoch": 4.857274401473297, + "grad_norm": 0.22641299664974213, + "learning_rate": 5.473975713570919e-05, + "loss": 1.7514, + "step": 15825 + }, + { + "epoch": 4.857581338244322, + "grad_norm": 0.3179668188095093, + "learning_rate": 5.473480893585723e-05, + "loss": 1.7939, + "step": 15826 + }, + { + "epoch": 4.8578882750153465, + "grad_norm": 0.27463147044181824, + "learning_rate": 5.472986068921309e-05, + "loss": 1.7487, + "step": 15827 + }, + { + "epoch": 4.858195211786372, + "grad_norm": 0.18621626496315002, + "learning_rate": 5.472491239582572e-05, + "loss": 1.7155, + "step": 15828 + }, + { + "epoch": 4.858502148557397, + "grad_norm": 0.2437327802181244, + "learning_rate": 5.471996405574399e-05, + "loss": 1.7586, + "step": 15829 + }, + { + "epoch": 4.8588090853284225, + "grad_norm": 0.26658934354782104, + "learning_rate": 5.47150156690168e-05, + "loss": 1.7331, + "step": 15830 + }, + { + "epoch": 4.859116022099448, + "grad_norm": 0.2257174700498581, + "learning_rate": 5.471006723569308e-05, + "loss": 1.7556, + "step": 15831 + }, + { + "epoch": 4.859422958870473, + 
"grad_norm": 0.25434550642967224, + "learning_rate": 5.470511875582168e-05, + "loss": 1.7196, + "step": 15832 + }, + { + "epoch": 4.859729895641498, + "grad_norm": 0.2251453697681427, + "learning_rate": 5.470017022945156e-05, + "loss": 1.7174, + "step": 15833 + }, + { + "epoch": 4.860036832412523, + "grad_norm": 0.2757972180843353, + "learning_rate": 5.469522165663161e-05, + "loss": 1.7701, + "step": 15834 + }, + { + "epoch": 4.860343769183548, + "grad_norm": 0.2771994173526764, + "learning_rate": 5.469027303741072e-05, + "loss": 1.8085, + "step": 15835 + }, + { + "epoch": 4.860650705954574, + "grad_norm": 0.23825454711914062, + "learning_rate": 5.468532437183781e-05, + "loss": 1.733, + "step": 15836 + }, + { + "epoch": 4.860957642725598, + "grad_norm": 0.18100066483020782, + "learning_rate": 5.468037565996177e-05, + "loss": 1.7012, + "step": 15837 + }, + { + "epoch": 4.861264579496623, + "grad_norm": 0.22552812099456787, + "learning_rate": 5.4675426901831506e-05, + "loss": 1.728, + "step": 15838 + }, + { + "epoch": 4.861571516267649, + "grad_norm": 0.2505643665790558, + "learning_rate": 5.467047809749595e-05, + "loss": 1.7219, + "step": 15839 + }, + { + "epoch": 4.861878453038674, + "grad_norm": 0.25920796394348145, + "learning_rate": 5.4665529247003975e-05, + "loss": 1.7945, + "step": 15840 + }, + { + "epoch": 4.862185389809699, + "grad_norm": 0.23549394309520721, + "learning_rate": 5.466058035040452e-05, + "loss": 1.7904, + "step": 15841 + }, + { + "epoch": 4.862492326580725, + "grad_norm": 0.26510992646217346, + "learning_rate": 5.465563140774648e-05, + "loss": 1.8051, + "step": 15842 + }, + { + "epoch": 4.862799263351749, + "grad_norm": 0.19175390899181366, + "learning_rate": 5.465068241907876e-05, + "loss": 1.6799, + "step": 15843 + }, + { + "epoch": 4.8631062001227745, + "grad_norm": 0.2588976323604584, + "learning_rate": 5.464573338445025e-05, + "loss": 1.7394, + "step": 15844 + }, + { + "epoch": 4.8634131368938, + "grad_norm": 0.28729483485221863, + 
"learning_rate": 5.464078430390991e-05, + "loss": 1.797, + "step": 15845 + }, + { + "epoch": 4.863720073664825, + "grad_norm": 0.21302445232868195, + "learning_rate": 5.463583517750661e-05, + "loss": 1.7303, + "step": 15846 + }, + { + "epoch": 4.8640270104358505, + "grad_norm": 0.2407636195421219, + "learning_rate": 5.463088600528926e-05, + "loss": 1.7175, + "step": 15847 + }, + { + "epoch": 4.864333947206875, + "grad_norm": 0.25653502345085144, + "learning_rate": 5.4625936787306784e-05, + "loss": 1.6996, + "step": 15848 + }, + { + "epoch": 4.8646408839779, + "grad_norm": 0.2100832760334015, + "learning_rate": 5.462098752360809e-05, + "loss": 1.7416, + "step": 15849 + }, + { + "epoch": 4.864947820748926, + "grad_norm": 0.2785186469554901, + "learning_rate": 5.461603821424208e-05, + "loss": 1.74, + "step": 15850 + }, + { + "epoch": 4.865254757519951, + "grad_norm": 0.2896614968776703, + "learning_rate": 5.4611088859257696e-05, + "loss": 1.7436, + "step": 15851 + }, + { + "epoch": 4.865561694290976, + "grad_norm": 0.18890418112277985, + "learning_rate": 5.460613945870382e-05, + "loss": 1.7093, + "step": 15852 + }, + { + "epoch": 4.865868631062002, + "grad_norm": 0.27681079506874084, + "learning_rate": 5.4601190012629364e-05, + "loss": 1.8772, + "step": 15853 + }, + { + "epoch": 4.866175567833026, + "grad_norm": 0.24658115208148956, + "learning_rate": 5.4596240521083265e-05, + "loss": 1.776, + "step": 15854 + }, + { + "epoch": 4.866482504604051, + "grad_norm": 0.21958144009113312, + "learning_rate": 5.459129098411441e-05, + "loss": 1.7503, + "step": 15855 + }, + { + "epoch": 4.866789441375077, + "grad_norm": 0.2778300642967224, + "learning_rate": 5.458634140177174e-05, + "loss": 1.8194, + "step": 15856 + }, + { + "epoch": 4.867096378146102, + "grad_norm": 0.28673580288887024, + "learning_rate": 5.458139177410414e-05, + "loss": 1.8033, + "step": 15857 + }, + { + "epoch": 4.867403314917127, + "grad_norm": 0.24472850561141968, + "learning_rate": 5.457644210116055e-05, + 
"loss": 1.7304, + "step": 15858 + }, + { + "epoch": 4.867710251688152, + "grad_norm": 0.24581189453601837, + "learning_rate": 5.4571492382989886e-05, + "loss": 1.7443, + "step": 15859 + }, + { + "epoch": 4.868017188459177, + "grad_norm": 0.22296221554279327, + "learning_rate": 5.4566542619641045e-05, + "loss": 1.7201, + "step": 15860 + }, + { + "epoch": 4.8683241252302025, + "grad_norm": 0.2378673404455185, + "learning_rate": 5.456159281116295e-05, + "loss": 1.7893, + "step": 15861 + }, + { + "epoch": 4.868631062001228, + "grad_norm": 0.3320823907852173, + "learning_rate": 5.4556642957604534e-05, + "loss": 1.7944, + "step": 15862 + }, + { + "epoch": 4.868937998772253, + "grad_norm": 0.3303453326225281, + "learning_rate": 5.45516930590147e-05, + "loss": 1.7267, + "step": 15863 + }, + { + "epoch": 4.8692449355432785, + "grad_norm": 0.223227858543396, + "learning_rate": 5.454674311544235e-05, + "loss": 1.7477, + "step": 15864 + }, + { + "epoch": 4.869551872314303, + "grad_norm": 0.3012549579143524, + "learning_rate": 5.454179312693643e-05, + "loss": 1.731, + "step": 15865 + }, + { + "epoch": 4.869858809085328, + "grad_norm": 0.3780311942100525, + "learning_rate": 5.453684309354585e-05, + "loss": 1.7296, + "step": 15866 + }, + { + "epoch": 4.870165745856354, + "grad_norm": 0.2753889262676239, + "learning_rate": 5.4531893015319526e-05, + "loss": 1.8024, + "step": 15867 + }, + { + "epoch": 4.870472682627379, + "grad_norm": 0.2270934134721756, + "learning_rate": 5.452694289230639e-05, + "loss": 1.7095, + "step": 15868 + }, + { + "epoch": 4.870779619398404, + "grad_norm": 0.2621576488018036, + "learning_rate": 5.452199272455534e-05, + "loss": 1.75, + "step": 15869 + }, + { + "epoch": 4.871086556169429, + "grad_norm": 0.22175776958465576, + "learning_rate": 5.45170425121153e-05, + "loss": 1.7658, + "step": 15870 + }, + { + "epoch": 4.871393492940454, + "grad_norm": 0.2038736790418625, + "learning_rate": 5.451209225503521e-05, + "loss": 1.6916, + "step": 15871 + }, + { + 
"epoch": 4.871700429711479, + "grad_norm": 0.2493467777967453, + "learning_rate": 5.450714195336397e-05, + "loss": 1.7408, + "step": 15872 + }, + { + "epoch": 4.872007366482505, + "grad_norm": 0.1966754049062729, + "learning_rate": 5.450219160715052e-05, + "loss": 1.7379, + "step": 15873 + }, + { + "epoch": 4.87231430325353, + "grad_norm": 0.23193517327308655, + "learning_rate": 5.4497241216443775e-05, + "loss": 1.7736, + "step": 15874 + }, + { + "epoch": 4.872621240024555, + "grad_norm": 0.2164391279220581, + "learning_rate": 5.4492290781292646e-05, + "loss": 1.7618, + "step": 15875 + }, + { + "epoch": 4.87292817679558, + "grad_norm": 0.286460816860199, + "learning_rate": 5.448734030174607e-05, + "loss": 1.7745, + "step": 15876 + }, + { + "epoch": 4.873235113566605, + "grad_norm": 0.3454538881778717, + "learning_rate": 5.448238977785298e-05, + "loss": 1.7605, + "step": 15877 + }, + { + "epoch": 4.8735420503376305, + "grad_norm": 0.26775062084198, + "learning_rate": 5.447743920966227e-05, + "loss": 1.7263, + "step": 15878 + }, + { + "epoch": 4.873848987108656, + "grad_norm": 0.2644907832145691, + "learning_rate": 5.447248859722289e-05, + "loss": 1.8489, + "step": 15879 + }, + { + "epoch": 4.87415592387968, + "grad_norm": 0.21646654605865479, + "learning_rate": 5.446753794058376e-05, + "loss": 1.7605, + "step": 15880 + }, + { + "epoch": 4.874462860650706, + "grad_norm": 0.23431318998336792, + "learning_rate": 5.446258723979381e-05, + "loss": 1.7209, + "step": 15881 + }, + { + "epoch": 4.874769797421731, + "grad_norm": 0.24665607511997223, + "learning_rate": 5.4457636494901934e-05, + "loss": 1.813, + "step": 15882 + }, + { + "epoch": 4.875076734192756, + "grad_norm": 0.26269975304603577, + "learning_rate": 5.445268570595708e-05, + "loss": 1.8255, + "step": 15883 + }, + { + "epoch": 4.875383670963782, + "grad_norm": 0.2722402811050415, + "learning_rate": 5.444773487300819e-05, + "loss": 1.7795, + "step": 15884 + }, + { + "epoch": 4.875690607734807, + "grad_norm": 
0.3235624134540558, + "learning_rate": 5.444278399610417e-05, + "loss": 1.7804, + "step": 15885 + }, + { + "epoch": 4.8759975445058314, + "grad_norm": 0.2647583782672882, + "learning_rate": 5.4437833075293964e-05, + "loss": 1.7359, + "step": 15886 + }, + { + "epoch": 4.876304481276857, + "grad_norm": 0.272370845079422, + "learning_rate": 5.443288211062649e-05, + "loss": 1.7605, + "step": 15887 + }, + { + "epoch": 4.876611418047882, + "grad_norm": 0.3147594630718231, + "learning_rate": 5.4427931102150675e-05, + "loss": 1.7118, + "step": 15888 + }, + { + "epoch": 4.8769183548189075, + "grad_norm": 0.22751441597938538, + "learning_rate": 5.442298004991544e-05, + "loss": 1.723, + "step": 15889 + }, + { + "epoch": 4.877225291589933, + "grad_norm": 0.2121521681547165, + "learning_rate": 5.441802895396972e-05, + "loss": 1.7485, + "step": 15890 + }, + { + "epoch": 4.877532228360957, + "grad_norm": 0.25370222330093384, + "learning_rate": 5.4413077814362466e-05, + "loss": 1.8064, + "step": 15891 + }, + { + "epoch": 4.877839165131983, + "grad_norm": 0.19492633640766144, + "learning_rate": 5.440812663114259e-05, + "loss": 1.6773, + "step": 15892 + }, + { + "epoch": 4.878146101903008, + "grad_norm": 0.2101750522851944, + "learning_rate": 5.440317540435901e-05, + "loss": 1.7215, + "step": 15893 + }, + { + "epoch": 4.878453038674033, + "grad_norm": 0.21150651574134827, + "learning_rate": 5.439822413406068e-05, + "loss": 1.7875, + "step": 15894 + }, + { + "epoch": 4.878759975445059, + "grad_norm": 0.21008379757404327, + "learning_rate": 5.439327282029651e-05, + "loss": 1.7108, + "step": 15895 + }, + { + "epoch": 4.879066912216084, + "grad_norm": 0.22885502874851227, + "learning_rate": 5.4388321463115453e-05, + "loss": 1.7899, + "step": 15896 + }, + { + "epoch": 4.879373848987108, + "grad_norm": 0.24868059158325195, + "learning_rate": 5.4383370062566444e-05, + "loss": 1.7368, + "step": 15897 + }, + { + "epoch": 4.879680785758134, + "grad_norm": 0.27225378155708313, + 
"learning_rate": 5.437841861869838e-05, + "loss": 1.7623, + "step": 15898 + }, + { + "epoch": 4.879987722529159, + "grad_norm": 0.23353120684623718, + "learning_rate": 5.437346713156023e-05, + "loss": 1.7908, + "step": 15899 + }, + { + "epoch": 4.880294659300184, + "grad_norm": 0.19032470881938934, + "learning_rate": 5.436851560120091e-05, + "loss": 1.7511, + "step": 15900 + }, + { + "epoch": 4.88060159607121, + "grad_norm": 0.23714862763881683, + "learning_rate": 5.4363564027669345e-05, + "loss": 1.7197, + "step": 15901 + }, + { + "epoch": 4.880908532842234, + "grad_norm": 0.24897022545337677, + "learning_rate": 5.4358612411014495e-05, + "loss": 1.7822, + "step": 15902 + }, + { + "epoch": 4.8812154696132595, + "grad_norm": 0.21433588862419128, + "learning_rate": 5.435366075128528e-05, + "loss": 1.7928, + "step": 15903 + }, + { + "epoch": 4.881522406384285, + "grad_norm": 0.30019649863243103, + "learning_rate": 5.4348709048530646e-05, + "loss": 1.8067, + "step": 15904 + }, + { + "epoch": 4.88182934315531, + "grad_norm": 0.20227669179439545, + "learning_rate": 5.4343757302799515e-05, + "loss": 1.7254, + "step": 15905 + }, + { + "epoch": 4.8821362799263355, + "grad_norm": 0.23447728157043457, + "learning_rate": 5.4338805514140836e-05, + "loss": 1.7314, + "step": 15906 + }, + { + "epoch": 4.882443216697361, + "grad_norm": 0.29545050859451294, + "learning_rate": 5.4333853682603506e-05, + "loss": 1.7659, + "step": 15907 + }, + { + "epoch": 4.882750153468385, + "grad_norm": 0.245390385389328, + "learning_rate": 5.432890180823652e-05, + "loss": 1.7264, + "step": 15908 + }, + { + "epoch": 4.883057090239411, + "grad_norm": 0.209987074136734, + "learning_rate": 5.432394989108879e-05, + "loss": 1.7174, + "step": 15909 + }, + { + "epoch": 4.883364027010436, + "grad_norm": 0.2402341365814209, + "learning_rate": 5.431899793120925e-05, + "loss": 1.7512, + "step": 15910 + }, + { + "epoch": 4.883670963781461, + "grad_norm": 0.26227688789367676, + "learning_rate": 
5.431404592864684e-05, + "loss": 1.7697, + "step": 15911 + }, + { + "epoch": 4.883977900552486, + "grad_norm": 0.2556503117084503, + "learning_rate": 5.4309093883450504e-05, + "loss": 1.8191, + "step": 15912 + }, + { + "epoch": 4.884284837323511, + "grad_norm": 0.24766884744167328, + "learning_rate": 5.4304141795669174e-05, + "loss": 1.7574, + "step": 15913 + }, + { + "epoch": 4.884591774094536, + "grad_norm": 0.19925951957702637, + "learning_rate": 5.429918966535179e-05, + "loss": 1.7249, + "step": 15914 + }, + { + "epoch": 4.884898710865562, + "grad_norm": 0.1899442970752716, + "learning_rate": 5.4294237492547294e-05, + "loss": 1.7446, + "step": 15915 + }, + { + "epoch": 4.885205647636587, + "grad_norm": 0.25900956988334656, + "learning_rate": 5.4289285277304636e-05, + "loss": 1.725, + "step": 15916 + }, + { + "epoch": 4.885512584407612, + "grad_norm": 0.2537781000137329, + "learning_rate": 5.428433301967274e-05, + "loss": 1.7861, + "step": 15917 + }, + { + "epoch": 4.885819521178637, + "grad_norm": 0.26432034373283386, + "learning_rate": 5.427938071970054e-05, + "loss": 1.7538, + "step": 15918 + }, + { + "epoch": 4.886126457949662, + "grad_norm": 0.22722363471984863, + "learning_rate": 5.4274428377437e-05, + "loss": 1.7631, + "step": 15919 + }, + { + "epoch": 4.8864333947206875, + "grad_norm": 0.24846172332763672, + "learning_rate": 5.426947599293106e-05, + "loss": 1.7833, + "step": 15920 + }, + { + "epoch": 4.886740331491713, + "grad_norm": 0.24821995198726654, + "learning_rate": 5.426452356623165e-05, + "loss": 1.7638, + "step": 15921 + }, + { + "epoch": 4.887047268262738, + "grad_norm": 0.2796781063079834, + "learning_rate": 5.425957109738773e-05, + "loss": 1.6982, + "step": 15922 + }, + { + "epoch": 4.887354205033763, + "grad_norm": 0.2875385284423828, + "learning_rate": 5.425461858644821e-05, + "loss": 1.7172, + "step": 15923 + }, + { + "epoch": 4.887661141804788, + "grad_norm": 0.21614491939544678, + "learning_rate": 5.424966603346207e-05, + "loss": 
1.7521, + "step": 15924 + }, + { + "epoch": 4.887968078575813, + "grad_norm": 0.22944390773773193, + "learning_rate": 5.4244713438478235e-05, + "loss": 1.772, + "step": 15925 + }, + { + "epoch": 4.888275015346839, + "grad_norm": 0.21566039323806763, + "learning_rate": 5.423976080154566e-05, + "loss": 1.734, + "step": 15926 + }, + { + "epoch": 4.888581952117864, + "grad_norm": 0.4253925383090973, + "learning_rate": 5.4234808122713275e-05, + "loss": 1.8017, + "step": 15927 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.239146426320076, + "learning_rate": 5.422985540203004e-05, + "loss": 1.7229, + "step": 15928 + }, + { + "epoch": 4.889195825659914, + "grad_norm": 0.2344054877758026, + "learning_rate": 5.42249026395449e-05, + "loss": 1.7111, + "step": 15929 + }, + { + "epoch": 4.889502762430939, + "grad_norm": 0.21717922389507294, + "learning_rate": 5.421994983530679e-05, + "loss": 1.7427, + "step": 15930 + }, + { + "epoch": 4.889809699201964, + "grad_norm": 0.26895472407341003, + "learning_rate": 5.421499698936466e-05, + "loss": 1.8402, + "step": 15931 + }, + { + "epoch": 4.89011663597299, + "grad_norm": 0.25761866569519043, + "learning_rate": 5.421004410176746e-05, + "loss": 1.7822, + "step": 15932 + }, + { + "epoch": 4.890423572744015, + "grad_norm": 0.24465128779411316, + "learning_rate": 5.420509117256415e-05, + "loss": 1.8074, + "step": 15933 + }, + { + "epoch": 4.8907305095150395, + "grad_norm": 0.2527398467063904, + "learning_rate": 5.4200138201803655e-05, + "loss": 1.7522, + "step": 15934 + }, + { + "epoch": 4.891037446286065, + "grad_norm": 0.23118112981319427, + "learning_rate": 5.4195185189534916e-05, + "loss": 1.7394, + "step": 15935 + }, + { + "epoch": 4.89134438305709, + "grad_norm": 0.2054537534713745, + "learning_rate": 5.419023213580691e-05, + "loss": 1.7096, + "step": 15936 + }, + { + "epoch": 4.8916513198281155, + "grad_norm": 0.2929638922214508, + "learning_rate": 5.418527904066858e-05, + "loss": 1.8733, + "step": 15937 + }, + { + 
"epoch": 4.891958256599141, + "grad_norm": 0.2957170009613037, + "learning_rate": 5.418032590416886e-05, + "loss": 1.7201, + "step": 15938 + }, + { + "epoch": 4.892265193370166, + "grad_norm": 0.2520081698894501, + "learning_rate": 5.417537272635672e-05, + "loss": 1.7034, + "step": 15939 + }, + { + "epoch": 4.892572130141191, + "grad_norm": 0.25217053294181824, + "learning_rate": 5.41704195072811e-05, + "loss": 1.8538, + "step": 15940 + }, + { + "epoch": 4.892879066912216, + "grad_norm": 0.23605379462242126, + "learning_rate": 5.416546624699093e-05, + "loss": 1.724, + "step": 15941 + }, + { + "epoch": 4.893186003683241, + "grad_norm": 0.321750283241272, + "learning_rate": 5.416051294553519e-05, + "loss": 1.806, + "step": 15942 + }, + { + "epoch": 4.893492940454267, + "grad_norm": 0.23800241947174072, + "learning_rate": 5.415555960296284e-05, + "loss": 1.7578, + "step": 15943 + }, + { + "epoch": 4.893799877225292, + "grad_norm": 0.3423094153404236, + "learning_rate": 5.4150606219322796e-05, + "loss": 1.7324, + "step": 15944 + }, + { + "epoch": 4.894106813996316, + "grad_norm": 0.453074187040329, + "learning_rate": 5.414565279466404e-05, + "loss": 1.7268, + "step": 15945 + }, + { + "epoch": 4.894413750767342, + "grad_norm": 0.21972697973251343, + "learning_rate": 5.4140699329035504e-05, + "loss": 1.6547, + "step": 15946 + }, + { + "epoch": 4.894720687538367, + "grad_norm": 0.32876282930374146, + "learning_rate": 5.413574582248616e-05, + "loss": 1.7527, + "step": 15947 + }, + { + "epoch": 4.895027624309392, + "grad_norm": 0.34035229682922363, + "learning_rate": 5.413079227506494e-05, + "loss": 1.7636, + "step": 15948 + }, + { + "epoch": 4.895334561080418, + "grad_norm": 0.2410411536693573, + "learning_rate": 5.412583868682082e-05, + "loss": 1.8114, + "step": 15949 + }, + { + "epoch": 4.895641497851443, + "grad_norm": 0.2787366211414337, + "learning_rate": 5.412088505780274e-05, + "loss": 1.7393, + "step": 15950 + }, + { + "epoch": 4.8959484346224675, + "grad_norm": 
0.23288428783416748, + "learning_rate": 5.411593138805966e-05, + "loss": 1.7413, + "step": 15951 + }, + { + "epoch": 4.896255371393493, + "grad_norm": 0.26302778720855713, + "learning_rate": 5.411097767764053e-05, + "loss": 1.7372, + "step": 15952 + }, + { + "epoch": 4.896562308164518, + "grad_norm": 0.31638020277023315, + "learning_rate": 5.410602392659431e-05, + "loss": 1.8114, + "step": 15953 + }, + { + "epoch": 4.8968692449355435, + "grad_norm": 0.23361825942993164, + "learning_rate": 5.410107013496996e-05, + "loss": 1.7592, + "step": 15954 + }, + { + "epoch": 4.897176181706568, + "grad_norm": 0.19887785613536835, + "learning_rate": 5.409611630281642e-05, + "loss": 1.7509, + "step": 15955 + }, + { + "epoch": 4.897483118477593, + "grad_norm": 0.22396783530712128, + "learning_rate": 5.409116243018266e-05, + "loss": 1.6841, + "step": 15956 + }, + { + "epoch": 4.897790055248619, + "grad_norm": 0.20397686958312988, + "learning_rate": 5.4086208517117645e-05, + "loss": 1.7427, + "step": 15957 + }, + { + "epoch": 4.898096992019644, + "grad_norm": 0.20848311483860016, + "learning_rate": 5.4081254563670314e-05, + "loss": 1.713, + "step": 15958 + }, + { + "epoch": 4.898403928790669, + "grad_norm": 0.2739275395870209, + "learning_rate": 5.407630056988964e-05, + "loss": 1.7673, + "step": 15959 + }, + { + "epoch": 4.898710865561695, + "grad_norm": 0.21485929191112518, + "learning_rate": 5.407134653582456e-05, + "loss": 1.7347, + "step": 15960 + }, + { + "epoch": 4.899017802332719, + "grad_norm": 0.26980286836624146, + "learning_rate": 5.406639246152406e-05, + "loss": 1.7158, + "step": 15961 + }, + { + "epoch": 4.899324739103744, + "grad_norm": 0.22327515482902527, + "learning_rate": 5.4061438347037084e-05, + "loss": 1.7387, + "step": 15962 + }, + { + "epoch": 4.89963167587477, + "grad_norm": 0.2542823553085327, + "learning_rate": 5.4056484192412603e-05, + "loss": 1.7826, + "step": 15963 + }, + { + "epoch": 4.899938612645795, + "grad_norm": 0.3248840868473053, + 
"learning_rate": 5.405152999769956e-05, + "loss": 1.7878, + "step": 15964 + }, + { + "epoch": 4.9002455494168204, + "grad_norm": 0.21210803091526031, + "learning_rate": 5.404657576294691e-05, + "loss": 1.7378, + "step": 15965 + }, + { + "epoch": 4.900552486187845, + "grad_norm": 0.25679782032966614, + "learning_rate": 5.404162148820365e-05, + "loss": 1.7493, + "step": 15966 + }, + { + "epoch": 4.90085942295887, + "grad_norm": 0.36698678135871887, + "learning_rate": 5.4036667173518704e-05, + "loss": 1.7662, + "step": 15967 + }, + { + "epoch": 4.901166359729896, + "grad_norm": 0.3396874964237213, + "learning_rate": 5.403171281894105e-05, + "loss": 1.7618, + "step": 15968 + }, + { + "epoch": 4.901473296500921, + "grad_norm": 0.2792030870914459, + "learning_rate": 5.402675842451964e-05, + "loss": 1.7858, + "step": 15969 + }, + { + "epoch": 4.901780233271946, + "grad_norm": 0.24499626457691193, + "learning_rate": 5.4021803990303454e-05, + "loss": 1.7503, + "step": 15970 + }, + { + "epoch": 4.902087170042972, + "grad_norm": 0.29185110330581665, + "learning_rate": 5.401684951634144e-05, + "loss": 1.7536, + "step": 15971 + }, + { + "epoch": 4.902394106813996, + "grad_norm": 0.2480020374059677, + "learning_rate": 5.401189500268256e-05, + "loss": 1.7877, + "step": 15972 + }, + { + "epoch": 4.902701043585021, + "grad_norm": 0.3302663564682007, + "learning_rate": 5.400694044937579e-05, + "loss": 1.8693, + "step": 15973 + }, + { + "epoch": 4.903007980356047, + "grad_norm": 0.2500915825366974, + "learning_rate": 5.400198585647008e-05, + "loss": 1.7489, + "step": 15974 + }, + { + "epoch": 4.903314917127072, + "grad_norm": 0.25079864263534546, + "learning_rate": 5.399703122401441e-05, + "loss": 1.7965, + "step": 15975 + }, + { + "epoch": 4.903621853898097, + "grad_norm": 0.2643207907676697, + "learning_rate": 5.399207655205771e-05, + "loss": 1.7696, + "step": 15976 + }, + { + "epoch": 4.903928790669122, + "grad_norm": 0.23719522356987, + "learning_rate": 5.398712184064899e-05, + 
"loss": 1.7608, + "step": 15977 + }, + { + "epoch": 4.904235727440147, + "grad_norm": 0.25226888060569763, + "learning_rate": 5.3982167089837184e-05, + "loss": 1.8055, + "step": 15978 + }, + { + "epoch": 4.9045426642111725, + "grad_norm": 0.21601852774620056, + "learning_rate": 5.39772122996713e-05, + "loss": 1.7553, + "step": 15979 + }, + { + "epoch": 4.904849600982198, + "grad_norm": 0.20275430381298065, + "learning_rate": 5.397225747020023e-05, + "loss": 1.7221, + "step": 15980 + }, + { + "epoch": 4.905156537753223, + "grad_norm": 0.24815937876701355, + "learning_rate": 5.3967302601473e-05, + "loss": 1.8098, + "step": 15981 + }, + { + "epoch": 4.9054634745242485, + "grad_norm": 0.2193612903356552, + "learning_rate": 5.3962347693538575e-05, + "loss": 1.7116, + "step": 15982 + }, + { + "epoch": 4.905770411295273, + "grad_norm": 0.21409118175506592, + "learning_rate": 5.395739274644589e-05, + "loss": 1.7503, + "step": 15983 + }, + { + "epoch": 4.906077348066298, + "grad_norm": 0.20907564461231232, + "learning_rate": 5.3952437760243935e-05, + "loss": 1.7518, + "step": 15984 + }, + { + "epoch": 4.906384284837324, + "grad_norm": 0.21193571388721466, + "learning_rate": 5.394748273498168e-05, + "loss": 1.6905, + "step": 15985 + }, + { + "epoch": 4.906691221608349, + "grad_norm": 0.19729891419410706, + "learning_rate": 5.394252767070808e-05, + "loss": 1.7398, + "step": 15986 + }, + { + "epoch": 4.906998158379373, + "grad_norm": 0.2654789686203003, + "learning_rate": 5.393757256747211e-05, + "loss": 1.7931, + "step": 15987 + }, + { + "epoch": 4.907305095150399, + "grad_norm": 0.2627345025539398, + "learning_rate": 5.3932617425322726e-05, + "loss": 1.8174, + "step": 15988 + }, + { + "epoch": 4.907612031921424, + "grad_norm": 0.27162298560142517, + "learning_rate": 5.392766224430894e-05, + "loss": 1.8015, + "step": 15989 + }, + { + "epoch": 4.907918968692449, + "grad_norm": 0.24248667061328888, + "learning_rate": 5.3922707024479676e-05, + "loss": 1.7457, + "step": 15990 + 
}, + { + "epoch": 4.908225905463475, + "grad_norm": 0.24715331196784973, + "learning_rate": 5.391775176588393e-05, + "loss": 1.7724, + "step": 15991 + }, + { + "epoch": 4.9085328422345, + "grad_norm": 0.26335644721984863, + "learning_rate": 5.3912796468570656e-05, + "loss": 1.7183, + "step": 15992 + }, + { + "epoch": 4.9088397790055245, + "grad_norm": 0.23459944128990173, + "learning_rate": 5.3907841132588843e-05, + "loss": 1.7245, + "step": 15993 + }, + { + "epoch": 4.90914671577655, + "grad_norm": 0.21779637038707733, + "learning_rate": 5.3902885757987444e-05, + "loss": 1.7485, + "step": 15994 + }, + { + "epoch": 4.909453652547575, + "grad_norm": 0.227664977312088, + "learning_rate": 5.389793034481545e-05, + "loss": 1.7418, + "step": 15995 + }, + { + "epoch": 4.9097605893186005, + "grad_norm": 0.26230278611183167, + "learning_rate": 5.389297489312183e-05, + "loss": 1.7619, + "step": 15996 + }, + { + "epoch": 4.910067526089626, + "grad_norm": 0.22563579678535461, + "learning_rate": 5.388801940295555e-05, + "loss": 1.7168, + "step": 15997 + }, + { + "epoch": 4.91037446286065, + "grad_norm": 0.24829435348510742, + "learning_rate": 5.388306387436556e-05, + "loss": 1.7422, + "step": 15998 + }, + { + "epoch": 4.910681399631676, + "grad_norm": 0.24395976960659027, + "learning_rate": 5.387810830740088e-05, + "loss": 1.7783, + "step": 15999 + }, + { + "epoch": 4.910988336402701, + "grad_norm": 0.2189297378063202, + "learning_rate": 5.387315270211044e-05, + "loss": 1.7885, + "step": 16000 + }, + { + "epoch": 4.911295273173726, + "grad_norm": 0.21750971674919128, + "learning_rate": 5.386819705854324e-05, + "loss": 1.7659, + "step": 16001 + }, + { + "epoch": 4.911602209944752, + "grad_norm": 0.21907657384872437, + "learning_rate": 5.386324137674826e-05, + "loss": 1.789, + "step": 16002 + }, + { + "epoch": 4.911909146715777, + "grad_norm": 0.18778781592845917, + "learning_rate": 5.3858285656774465e-05, + "loss": 1.7151, + "step": 16003 + }, + { + "epoch": 4.912216083486801, + 
"grad_norm": 0.24217712879180908, + "learning_rate": 5.385332989867082e-05, + "loss": 1.8108, + "step": 16004 + }, + { + "epoch": 4.912523020257827, + "grad_norm": 0.27637016773223877, + "learning_rate": 5.384837410248632e-05, + "loss": 1.8368, + "step": 16005 + }, + { + "epoch": 4.912829957028852, + "grad_norm": 0.22366084158420563, + "learning_rate": 5.3843418268269926e-05, + "loss": 1.7351, + "step": 16006 + }, + { + "epoch": 4.913136893799877, + "grad_norm": 0.2742357552051544, + "learning_rate": 5.383846239607062e-05, + "loss": 1.7599, + "step": 16007 + }, + { + "epoch": 4.913443830570903, + "grad_norm": 0.2288598269224167, + "learning_rate": 5.383350648593738e-05, + "loss": 1.7056, + "step": 16008 + }, + { + "epoch": 4.913750767341927, + "grad_norm": 0.23319020867347717, + "learning_rate": 5.382855053791919e-05, + "loss": 1.7356, + "step": 16009 + }, + { + "epoch": 4.9140577041129525, + "grad_norm": 0.2232198268175125, + "learning_rate": 5.382359455206499e-05, + "loss": 1.7375, + "step": 16010 + }, + { + "epoch": 4.914364640883978, + "grad_norm": 0.24420048296451569, + "learning_rate": 5.381863852842381e-05, + "loss": 1.8287, + "step": 16011 + }, + { + "epoch": 4.914671577655003, + "grad_norm": 0.22653080523014069, + "learning_rate": 5.381368246704461e-05, + "loss": 1.7137, + "step": 16012 + }, + { + "epoch": 4.9149785144260285, + "grad_norm": 0.20439405739307404, + "learning_rate": 5.380872636797637e-05, + "loss": 1.7688, + "step": 16013 + }, + { + "epoch": 4.915285451197054, + "grad_norm": 0.2602155804634094, + "learning_rate": 5.380377023126806e-05, + "loss": 1.7875, + "step": 16014 + }, + { + "epoch": 4.915592387968078, + "grad_norm": 0.2757892608642578, + "learning_rate": 5.3798814056968647e-05, + "loss": 1.7446, + "step": 16015 + }, + { + "epoch": 4.915899324739104, + "grad_norm": 0.25938209891319275, + "learning_rate": 5.379385784512714e-05, + "loss": 1.6997, + "step": 16016 + }, + { + "epoch": 4.916206261510129, + "grad_norm": 0.2056962549686432, + 
"learning_rate": 5.37889015957925e-05, + "loss": 1.6961, + "step": 16017 + }, + { + "epoch": 4.916513198281154, + "grad_norm": 0.24388402700424194, + "learning_rate": 5.3783945309013714e-05, + "loss": 1.712, + "step": 16018 + }, + { + "epoch": 4.91682013505218, + "grad_norm": 0.2381993532180786, + "learning_rate": 5.3778988984839775e-05, + "loss": 1.7444, + "step": 16019 + }, + { + "epoch": 4.917127071823204, + "grad_norm": 0.20201562345027924, + "learning_rate": 5.377403262331964e-05, + "loss": 1.7254, + "step": 16020 + }, + { + "epoch": 4.917434008594229, + "grad_norm": 0.24019409716129303, + "learning_rate": 5.376907622450229e-05, + "loss": 1.684, + "step": 16021 + }, + { + "epoch": 4.917740945365255, + "grad_norm": 0.2441694289445877, + "learning_rate": 5.376411978843674e-05, + "loss": 1.7334, + "step": 16022 + }, + { + "epoch": 4.91804788213628, + "grad_norm": 0.23866300284862518, + "learning_rate": 5.3759163315171945e-05, + "loss": 1.7258, + "step": 16023 + }, + { + "epoch": 4.918354818907305, + "grad_norm": 0.28068670630455017, + "learning_rate": 5.375420680475689e-05, + "loss": 1.8049, + "step": 16024 + }, + { + "epoch": 4.918661755678331, + "grad_norm": 0.2956274151802063, + "learning_rate": 5.3749250257240566e-05, + "loss": 1.8544, + "step": 16025 + }, + { + "epoch": 4.918968692449355, + "grad_norm": 0.1971627175807953, + "learning_rate": 5.374429367267196e-05, + "loss": 1.7314, + "step": 16026 + }, + { + "epoch": 4.9192756292203805, + "grad_norm": 0.28565749526023865, + "learning_rate": 5.373933705110004e-05, + "loss": 1.7587, + "step": 16027 + }, + { + "epoch": 4.919582565991406, + "grad_norm": 0.3087369501590729, + "learning_rate": 5.37343803925738e-05, + "loss": 1.7708, + "step": 16028 + }, + { + "epoch": 4.919889502762431, + "grad_norm": 0.22460010647773743, + "learning_rate": 5.372942369714223e-05, + "loss": 1.7401, + "step": 16029 + }, + { + "epoch": 4.920196439533456, + "grad_norm": 0.29492735862731934, + "learning_rate": 5.3724466964854326e-05, + 
"loss": 1.7033, + "step": 16030 + }, + { + "epoch": 4.920503376304481, + "grad_norm": 0.24452674388885498, + "learning_rate": 5.371951019575904e-05, + "loss": 1.7688, + "step": 16031 + }, + { + "epoch": 4.920810313075506, + "grad_norm": 0.24686957895755768, + "learning_rate": 5.3714553389905366e-05, + "loss": 1.7463, + "step": 16032 + }, + { + "epoch": 4.921117249846532, + "grad_norm": 0.23661597073078156, + "learning_rate": 5.37095965473423e-05, + "loss": 1.7256, + "step": 16033 + }, + { + "epoch": 4.921424186617557, + "grad_norm": 0.22861288487911224, + "learning_rate": 5.370463966811884e-05, + "loss": 1.7722, + "step": 16034 + }, + { + "epoch": 4.921731123388582, + "grad_norm": 0.2453136146068573, + "learning_rate": 5.3699682752283944e-05, + "loss": 1.7343, + "step": 16035 + }, + { + "epoch": 4.922038060159607, + "grad_norm": 0.25267064571380615, + "learning_rate": 5.369472579988663e-05, + "loss": 1.7817, + "step": 16036 + }, + { + "epoch": 4.922344996930632, + "grad_norm": 0.25301575660705566, + "learning_rate": 5.368976881097586e-05, + "loss": 1.8146, + "step": 16037 + }, + { + "epoch": 4.922651933701657, + "grad_norm": 0.23579831421375275, + "learning_rate": 5.368481178560062e-05, + "loss": 1.8089, + "step": 16038 + }, + { + "epoch": 4.922958870472683, + "grad_norm": 0.2181949019432068, + "learning_rate": 5.367985472380993e-05, + "loss": 1.7689, + "step": 16039 + }, + { + "epoch": 4.923265807243708, + "grad_norm": 0.24622827768325806, + "learning_rate": 5.367489762565276e-05, + "loss": 1.791, + "step": 16040 + }, + { + "epoch": 4.9235727440147325, + "grad_norm": 0.2545134723186493, + "learning_rate": 5.3669940491178084e-05, + "loss": 1.738, + "step": 16041 + }, + { + "epoch": 4.923879680785758, + "grad_norm": 0.258139431476593, + "learning_rate": 5.366498332043491e-05, + "loss": 1.8303, + "step": 16042 + }, + { + "epoch": 4.924186617556783, + "grad_norm": 0.23804105818271637, + "learning_rate": 5.366002611347223e-05, + "loss": 1.751, + "step": 16043 + }, + { 
+ "epoch": 4.9244935543278086, + "grad_norm": 0.2354477345943451, + "learning_rate": 5.365506887033901e-05, + "loss": 1.7911, + "step": 16044 + }, + { + "epoch": 4.924800491098834, + "grad_norm": 0.22212550044059753, + "learning_rate": 5.3650111591084276e-05, + "loss": 1.7439, + "step": 16045 + }, + { + "epoch": 4.925107427869859, + "grad_norm": 0.23621168732643127, + "learning_rate": 5.3645154275756984e-05, + "loss": 1.7339, + "step": 16046 + }, + { + "epoch": 4.925414364640884, + "grad_norm": 0.2163209468126297, + "learning_rate": 5.364019692440616e-05, + "loss": 1.7247, + "step": 16047 + }, + { + "epoch": 4.925721301411909, + "grad_norm": 0.21352291107177734, + "learning_rate": 5.3635239537080774e-05, + "loss": 1.7431, + "step": 16048 + }, + { + "epoch": 4.926028238182934, + "grad_norm": 0.3170754909515381, + "learning_rate": 5.36302821138298e-05, + "loss": 1.8075, + "step": 16049 + }, + { + "epoch": 4.92633517495396, + "grad_norm": 0.27073633670806885, + "learning_rate": 5.362532465470226e-05, + "loss": 1.7209, + "step": 16050 + }, + { + "epoch": 4.926642111724985, + "grad_norm": 0.2677803039550781, + "learning_rate": 5.362036715974714e-05, + "loss": 1.7454, + "step": 16051 + }, + { + "epoch": 4.9269490484960095, + "grad_norm": 0.3555704355239868, + "learning_rate": 5.3615409629013436e-05, + "loss": 1.7737, + "step": 16052 + }, + { + "epoch": 4.927255985267035, + "grad_norm": 0.2819947302341461, + "learning_rate": 5.3610452062550124e-05, + "loss": 1.7588, + "step": 16053 + }, + { + "epoch": 4.92756292203806, + "grad_norm": 0.26638996601104736, + "learning_rate": 5.360549446040621e-05, + "loss": 1.8078, + "step": 16054 + }, + { + "epoch": 4.9278698588090855, + "grad_norm": 0.37828773260116577, + "learning_rate": 5.360053682263069e-05, + "loss": 1.7527, + "step": 16055 + }, + { + "epoch": 4.928176795580111, + "grad_norm": 0.35836395621299744, + "learning_rate": 5.359557914927254e-05, + "loss": 1.7199, + "step": 16056 + }, + { + "epoch": 4.928483732351136, + 
"grad_norm": 0.2720802128314972, + "learning_rate": 5.359062144038078e-05, + "loss": 1.7598, + "step": 16057 + }, + { + "epoch": 4.928790669122161, + "grad_norm": 0.36662939190864563, + "learning_rate": 5.358566369600441e-05, + "loss": 1.7199, + "step": 16058 + }, + { + "epoch": 4.929097605893186, + "grad_norm": 0.42243221402168274, + "learning_rate": 5.3580705916192395e-05, + "loss": 1.7584, + "step": 16059 + }, + { + "epoch": 4.929404542664211, + "grad_norm": 0.21667765080928802, + "learning_rate": 5.357574810099375e-05, + "loss": 1.7608, + "step": 16060 + }, + { + "epoch": 4.929711479435237, + "grad_norm": 0.48101645708084106, + "learning_rate": 5.3570790250457456e-05, + "loss": 1.8157, + "step": 16061 + }, + { + "epoch": 4.930018416206261, + "grad_norm": 0.5289245843887329, + "learning_rate": 5.356583236463253e-05, + "loss": 1.7173, + "step": 16062 + }, + { + "epoch": 4.930325352977286, + "grad_norm": 0.21454930305480957, + "learning_rate": 5.356087444356795e-05, + "loss": 1.7399, + "step": 16063 + }, + { + "epoch": 4.930632289748312, + "grad_norm": 0.5648324489593506, + "learning_rate": 5.355591648731274e-05, + "loss": 1.7814, + "step": 16064 + }, + { + "epoch": 4.930939226519337, + "grad_norm": 0.5669483542442322, + "learning_rate": 5.355095849591587e-05, + "loss": 1.7769, + "step": 16065 + }, + { + "epoch": 4.931246163290362, + "grad_norm": 0.33108505606651306, + "learning_rate": 5.354600046942635e-05, + "loss": 1.7704, + "step": 16066 + }, + { + "epoch": 4.931553100061388, + "grad_norm": 0.31149306893348694, + "learning_rate": 5.3541042407893164e-05, + "loss": 1.7631, + "step": 16067 + }, + { + "epoch": 4.931860036832412, + "grad_norm": 0.30377596616744995, + "learning_rate": 5.353608431136532e-05, + "loss": 1.7888, + "step": 16068 + }, + { + "epoch": 4.9321669736034375, + "grad_norm": 0.25041452050209045, + "learning_rate": 5.3531126179891825e-05, + "loss": 1.7507, + "step": 16069 + }, + { + "epoch": 4.932473910374463, + "grad_norm": 0.33900725841522217, + 
"learning_rate": 5.352616801352167e-05, + "loss": 1.7365, + "step": 16070 + }, + { + "epoch": 4.932780847145488, + "grad_norm": 0.23939846456050873, + "learning_rate": 5.352120981230386e-05, + "loss": 1.7934, + "step": 16071 + }, + { + "epoch": 4.9330877839165135, + "grad_norm": 0.2419881969690323, + "learning_rate": 5.351625157628739e-05, + "loss": 1.7555, + "step": 16072 + }, + { + "epoch": 4.933394720687538, + "grad_norm": 0.3517596423625946, + "learning_rate": 5.351129330552125e-05, + "loss": 1.7102, + "step": 16073 + }, + { + "epoch": 4.933701657458563, + "grad_norm": 0.2660250663757324, + "learning_rate": 5.350633500005446e-05, + "loss": 1.7692, + "step": 16074 + }, + { + "epoch": 4.934008594229589, + "grad_norm": 0.20726454257965088, + "learning_rate": 5.350137665993601e-05, + "loss": 1.718, + "step": 16075 + }, + { + "epoch": 4.934315531000614, + "grad_norm": 0.28218522667884827, + "learning_rate": 5.3496418285214914e-05, + "loss": 1.8402, + "step": 16076 + }, + { + "epoch": 4.934622467771639, + "grad_norm": 0.2142515480518341, + "learning_rate": 5.349145987594015e-05, + "loss": 1.7571, + "step": 16077 + }, + { + "epoch": 4.934929404542665, + "grad_norm": 0.2777026891708374, + "learning_rate": 5.348650143216074e-05, + "loss": 1.7617, + "step": 16078 + }, + { + "epoch": 4.935236341313689, + "grad_norm": 0.24057620763778687, + "learning_rate": 5.348154295392567e-05, + "loss": 1.7149, + "step": 16079 + }, + { + "epoch": 4.935543278084714, + "grad_norm": 0.22220350801944733, + "learning_rate": 5.3476584441283964e-05, + "loss": 1.7402, + "step": 16080 + }, + { + "epoch": 4.93585021485574, + "grad_norm": 0.2451290488243103, + "learning_rate": 5.347162589428462e-05, + "loss": 1.7004, + "step": 16081 + }, + { + "epoch": 4.936157151626765, + "grad_norm": 0.25621771812438965, + "learning_rate": 5.3466667312976625e-05, + "loss": 1.7765, + "step": 16082 + }, + { + "epoch": 4.93646408839779, + "grad_norm": 0.217393159866333, + "learning_rate": 5.346170869740899e-05, + 
"loss": 1.7695, + "step": 16083 + }, + { + "epoch": 4.936771025168815, + "grad_norm": 0.21248537302017212, + "learning_rate": 5.345675004763071e-05, + "loss": 1.7277, + "step": 16084 + }, + { + "epoch": 4.93707796193984, + "grad_norm": 0.19431474804878235, + "learning_rate": 5.3451791363690805e-05, + "loss": 1.7352, + "step": 16085 + }, + { + "epoch": 4.9373848987108655, + "grad_norm": 0.20233909785747528, + "learning_rate": 5.344683264563829e-05, + "loss": 1.71, + "step": 16086 + }, + { + "epoch": 4.937691835481891, + "grad_norm": 0.2199622094631195, + "learning_rate": 5.344187389352214e-05, + "loss": 1.7443, + "step": 16087 + }, + { + "epoch": 4.937998772252916, + "grad_norm": 0.23495158553123474, + "learning_rate": 5.343691510739138e-05, + "loss": 1.7758, + "step": 16088 + }, + { + "epoch": 4.9383057090239415, + "grad_norm": 0.228348970413208, + "learning_rate": 5.3431956287295015e-05, + "loss": 1.7645, + "step": 16089 + }, + { + "epoch": 4.938612645794966, + "grad_norm": 0.2337537258863449, + "learning_rate": 5.342699743328203e-05, + "loss": 1.7353, + "step": 16090 + }, + { + "epoch": 4.938919582565991, + "grad_norm": 0.1899309754371643, + "learning_rate": 5.3422038545401454e-05, + "loss": 1.6907, + "step": 16091 + }, + { + "epoch": 4.939226519337017, + "grad_norm": 0.2479192316532135, + "learning_rate": 5.341707962370229e-05, + "loss": 1.7961, + "step": 16092 + }, + { + "epoch": 4.939533456108042, + "grad_norm": 0.2444314956665039, + "learning_rate": 5.341212066823355e-05, + "loss": 1.7768, + "step": 16093 + }, + { + "epoch": 4.939840392879067, + "grad_norm": 0.2123393714427948, + "learning_rate": 5.340716167904423e-05, + "loss": 1.7617, + "step": 16094 + }, + { + "epoch": 4.940147329650092, + "grad_norm": 0.20779116451740265, + "learning_rate": 5.340220265618334e-05, + "loss": 1.6951, + "step": 16095 + }, + { + "epoch": 4.940454266421117, + "grad_norm": 0.22189265489578247, + "learning_rate": 5.3397243599699884e-05, + "loss": 1.8368, + "step": 16096 + }, + { 
+ "epoch": 4.940761203192142, + "grad_norm": 0.22316497564315796, + "learning_rate": 5.3392284509642875e-05, + "loss": 1.7096, + "step": 16097 + }, + { + "epoch": 4.941068139963168, + "grad_norm": 0.20406664907932281, + "learning_rate": 5.3387325386061346e-05, + "loss": 1.7269, + "step": 16098 + }, + { + "epoch": 4.941375076734193, + "grad_norm": 0.263007789850235, + "learning_rate": 5.338236622900427e-05, + "loss": 1.7663, + "step": 16099 + }, + { + "epoch": 4.941682013505218, + "grad_norm": 0.24388311803340912, + "learning_rate": 5.3377407038520654e-05, + "loss": 1.7113, + "step": 16100 + }, + { + "epoch": 4.941988950276243, + "grad_norm": 0.21918313205242157, + "learning_rate": 5.3372447814659524e-05, + "loss": 1.775, + "step": 16101 + }, + { + "epoch": 4.942295887047268, + "grad_norm": 0.30842962861061096, + "learning_rate": 5.336748855746989e-05, + "loss": 1.8229, + "step": 16102 + }, + { + "epoch": 4.9426028238182935, + "grad_norm": 0.2875657379627228, + "learning_rate": 5.336252926700077e-05, + "loss": 1.7377, + "step": 16103 + }, + { + "epoch": 4.942909760589319, + "grad_norm": 0.23411425948143005, + "learning_rate": 5.3357569943301156e-05, + "loss": 1.754, + "step": 16104 + }, + { + "epoch": 4.943216697360343, + "grad_norm": 0.29758864641189575, + "learning_rate": 5.335261058642007e-05, + "loss": 1.7471, + "step": 16105 + }, + { + "epoch": 4.943523634131369, + "grad_norm": 0.31761085987091064, + "learning_rate": 5.3347651196406534e-05, + "loss": 1.7658, + "step": 16106 + }, + { + "epoch": 4.943830570902394, + "grad_norm": 0.2487023025751114, + "learning_rate": 5.334269177330952e-05, + "loss": 1.786, + "step": 16107 + }, + { + "epoch": 4.944137507673419, + "grad_norm": 0.23954913020133972, + "learning_rate": 5.333773231717808e-05, + "loss": 1.8486, + "step": 16108 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.24893096089363098, + "learning_rate": 5.3332772828061214e-05, + "loss": 1.7927, + "step": 16109 + }, + { + "epoch": 4.94475138121547, + 
"grad_norm": 0.28653839230537415, + "learning_rate": 5.332781330600795e-05, + "loss": 1.8331, + "step": 16110 + }, + { + "epoch": 4.945058317986494, + "grad_norm": 0.2597404718399048, + "learning_rate": 5.332285375106726e-05, + "loss": 1.7128, + "step": 16111 + }, + { + "epoch": 4.94536525475752, + "grad_norm": 0.23813198506832123, + "learning_rate": 5.3317894163288196e-05, + "loss": 1.7483, + "step": 16112 + }, + { + "epoch": 4.945672191528545, + "grad_norm": 0.2545793652534485, + "learning_rate": 5.331293454271974e-05, + "loss": 1.7987, + "step": 16113 + }, + { + "epoch": 4.94597912829957, + "grad_norm": 0.2453712821006775, + "learning_rate": 5.330797488941095e-05, + "loss": 1.7376, + "step": 16114 + }, + { + "epoch": 4.946286065070596, + "grad_norm": 0.20583751797676086, + "learning_rate": 5.33030152034108e-05, + "loss": 1.7038, + "step": 16115 + }, + { + "epoch": 4.94659300184162, + "grad_norm": 0.22557811439037323, + "learning_rate": 5.3298055484768313e-05, + "loss": 1.6999, + "step": 16116 + }, + { + "epoch": 4.9468999386126455, + "grad_norm": 0.23163801431655884, + "learning_rate": 5.329309573353252e-05, + "loss": 1.7575, + "step": 16117 + }, + { + "epoch": 4.947206875383671, + "grad_norm": 0.3560176491737366, + "learning_rate": 5.3288135949752394e-05, + "loss": 1.8494, + "step": 16118 + }, + { + "epoch": 4.947513812154696, + "grad_norm": 0.306379109621048, + "learning_rate": 5.328317613347701e-05, + "loss": 1.7229, + "step": 16119 + }, + { + "epoch": 4.9478207489257215, + "grad_norm": 0.24428823590278625, + "learning_rate": 5.3278216284755344e-05, + "loss": 1.7939, + "step": 16120 + }, + { + "epoch": 4.948127685696747, + "grad_norm": 0.22251521050930023, + "learning_rate": 5.327325640363643e-05, + "loss": 1.7624, + "step": 16121 + }, + { + "epoch": 4.948434622467771, + "grad_norm": 0.23310889303684235, + "learning_rate": 5.326829649016928e-05, + "loss": 1.7727, + "step": 16122 + }, + { + "epoch": 4.948741559238797, + "grad_norm": 0.22457881271839142, + 
"learning_rate": 5.326333654440291e-05, + "loss": 1.7602, + "step": 16123 + }, + { + "epoch": 4.949048496009822, + "grad_norm": 0.24032343924045563, + "learning_rate": 5.325837656638631e-05, + "loss": 1.7591, + "step": 16124 + }, + { + "epoch": 4.949355432780847, + "grad_norm": 0.25082892179489136, + "learning_rate": 5.3253416556168546e-05, + "loss": 1.7745, + "step": 16125 + }, + { + "epoch": 4.949662369551873, + "grad_norm": 0.22859038412570953, + "learning_rate": 5.3248456513798615e-05, + "loss": 1.7475, + "step": 16126 + }, + { + "epoch": 4.949969306322897, + "grad_norm": 0.27282553911209106, + "learning_rate": 5.3243496439325525e-05, + "loss": 1.7438, + "step": 16127 + }, + { + "epoch": 4.9502762430939224, + "grad_norm": 0.23622353374958038, + "learning_rate": 5.3238536332798303e-05, + "loss": 1.7625, + "step": 16128 + }, + { + "epoch": 4.950583179864948, + "grad_norm": 0.28060024976730347, + "learning_rate": 5.3233576194265975e-05, + "loss": 1.8028, + "step": 16129 + }, + { + "epoch": 4.950890116635973, + "grad_norm": 0.33281829953193665, + "learning_rate": 5.322861602377755e-05, + "loss": 1.7163, + "step": 16130 + }, + { + "epoch": 4.9511970534069984, + "grad_norm": 0.26457497477531433, + "learning_rate": 5.322365582138203e-05, + "loss": 1.7347, + "step": 16131 + }, + { + "epoch": 4.951503990178024, + "grad_norm": 0.21651674807071686, + "learning_rate": 5.3218695587128476e-05, + "loss": 1.7123, + "step": 16132 + }, + { + "epoch": 4.951810926949048, + "grad_norm": 0.2299882024526596, + "learning_rate": 5.3213735321065885e-05, + "loss": 1.775, + "step": 16133 + }, + { + "epoch": 4.952117863720074, + "grad_norm": 0.2252396047115326, + "learning_rate": 5.3208775023243265e-05, + "loss": 1.7598, + "step": 16134 + }, + { + "epoch": 4.952424800491099, + "grad_norm": 0.2263660430908203, + "learning_rate": 5.3203814693709655e-05, + "loss": 1.7519, + "step": 16135 + }, + { + "epoch": 4.952731737262124, + "grad_norm": 0.2425432950258255, + "learning_rate": 
5.3198854332514056e-05, + "loss": 1.7769, + "step": 16136 + }, + { + "epoch": 4.953038674033149, + "grad_norm": 0.22624996304512024, + "learning_rate": 5.319389393970553e-05, + "loss": 1.7686, + "step": 16137 + }, + { + "epoch": 4.953345610804174, + "grad_norm": 0.2240568846464157, + "learning_rate": 5.318893351533306e-05, + "loss": 1.7795, + "step": 16138 + }, + { + "epoch": 4.953652547575199, + "grad_norm": 0.21708132326602936, + "learning_rate": 5.318397305944568e-05, + "loss": 1.7348, + "step": 16139 + }, + { + "epoch": 4.953959484346225, + "grad_norm": 0.2263328731060028, + "learning_rate": 5.3179012572092415e-05, + "loss": 1.7645, + "step": 16140 + }, + { + "epoch": 4.95426642111725, + "grad_norm": 0.2541986107826233, + "learning_rate": 5.3174052053322274e-05, + "loss": 1.723, + "step": 16141 + }, + { + "epoch": 4.954573357888275, + "grad_norm": 0.25829461216926575, + "learning_rate": 5.316909150318429e-05, + "loss": 1.7469, + "step": 16142 + }, + { + "epoch": 4.9548802946593, + "grad_norm": 0.21251125633716583, + "learning_rate": 5.3164130921727494e-05, + "loss": 1.7699, + "step": 16143 + }, + { + "epoch": 4.955187231430325, + "grad_norm": 0.29195618629455566, + "learning_rate": 5.315917030900091e-05, + "loss": 1.7373, + "step": 16144 + }, + { + "epoch": 4.9554941682013505, + "grad_norm": 0.29457888007164, + "learning_rate": 5.315420966505355e-05, + "loss": 1.7202, + "step": 16145 + }, + { + "epoch": 4.955801104972376, + "grad_norm": 0.19679461419582367, + "learning_rate": 5.314924898993443e-05, + "loss": 1.75, + "step": 16146 + }, + { + "epoch": 4.956108041743401, + "grad_norm": 0.287955105304718, + "learning_rate": 5.314428828369259e-05, + "loss": 1.7385, + "step": 16147 + }, + { + "epoch": 4.956414978514426, + "grad_norm": 0.3081825375556946, + "learning_rate": 5.313932754637706e-05, + "loss": 1.7558, + "step": 16148 + }, + { + "epoch": 4.956721915285451, + "grad_norm": 0.25226521492004395, + "learning_rate": 5.3134366778036846e-05, + "loss": 1.8407, + 
"step": 16149 + }, + { + "epoch": 4.957028852056476, + "grad_norm": 0.43601852655410767, + "learning_rate": 5.3129405978720984e-05, + "loss": 1.7762, + "step": 16150 + }, + { + "epoch": 4.957335788827502, + "grad_norm": 0.3630274832248688, + "learning_rate": 5.31244451484785e-05, + "loss": 1.7802, + "step": 16151 + }, + { + "epoch": 4.957642725598527, + "grad_norm": 0.21337948739528656, + "learning_rate": 5.311948428735841e-05, + "loss": 1.7107, + "step": 16152 + }, + { + "epoch": 4.957949662369552, + "grad_norm": 0.38581085205078125, + "learning_rate": 5.311452339540974e-05, + "loss": 1.7583, + "step": 16153 + }, + { + "epoch": 4.958256599140577, + "grad_norm": 0.28447309136390686, + "learning_rate": 5.310956247268154e-05, + "loss": 1.6992, + "step": 16154 + }, + { + "epoch": 4.958563535911602, + "grad_norm": 0.24510730803012848, + "learning_rate": 5.310460151922283e-05, + "loss": 1.7059, + "step": 16155 + }, + { + "epoch": 4.958870472682627, + "grad_norm": 0.41670146584510803, + "learning_rate": 5.309964053508262e-05, + "loss": 1.7191, + "step": 16156 + }, + { + "epoch": 4.959177409453653, + "grad_norm": 0.3123849034309387, + "learning_rate": 5.309467952030993e-05, + "loss": 1.7161, + "step": 16157 + }, + { + "epoch": 4.959484346224678, + "grad_norm": 0.2275281697511673, + "learning_rate": 5.308971847495382e-05, + "loss": 1.722, + "step": 16158 + }, + { + "epoch": 4.9597912829957025, + "grad_norm": 0.40216436982154846, + "learning_rate": 5.308475739906329e-05, + "loss": 1.7477, + "step": 16159 + }, + { + "epoch": 4.960098219766728, + "grad_norm": 0.259981244802475, + "learning_rate": 5.307979629268739e-05, + "loss": 1.7384, + "step": 16160 + }, + { + "epoch": 4.960405156537753, + "grad_norm": 0.22969573736190796, + "learning_rate": 5.3074835155875134e-05, + "loss": 1.7328, + "step": 16161 + }, + { + "epoch": 4.9607120933087785, + "grad_norm": 0.2773746848106384, + "learning_rate": 5.3069873988675556e-05, + "loss": 1.7333, + "step": 16162 + }, + { + "epoch": 
4.961019030079804, + "grad_norm": 0.2764189541339874, + "learning_rate": 5.306491279113768e-05, + "loss": 1.7956, + "step": 16163 + }, + { + "epoch": 4.961325966850829, + "grad_norm": 0.3640958070755005, + "learning_rate": 5.305995156331054e-05, + "loss": 1.7464, + "step": 16164 + }, + { + "epoch": 4.961632903621854, + "grad_norm": 0.3573450446128845, + "learning_rate": 5.305499030524317e-05, + "loss": 1.75, + "step": 16165 + }, + { + "epoch": 4.961939840392879, + "grad_norm": 0.24313980340957642, + "learning_rate": 5.305002901698459e-05, + "loss": 1.7505, + "step": 16166 + }, + { + "epoch": 4.962246777163904, + "grad_norm": 0.3417615592479706, + "learning_rate": 5.304506769858384e-05, + "loss": 1.7387, + "step": 16167 + }, + { + "epoch": 4.96255371393493, + "grad_norm": 0.23209623992443085, + "learning_rate": 5.304010635008995e-05, + "loss": 1.7111, + "step": 16168 + }, + { + "epoch": 4.962860650705955, + "grad_norm": 0.2994776666164398, + "learning_rate": 5.3035144971551944e-05, + "loss": 1.75, + "step": 16169 + }, + { + "epoch": 4.963167587476979, + "grad_norm": 0.3147084712982178, + "learning_rate": 5.303018356301884e-05, + "loss": 1.7598, + "step": 16170 + }, + { + "epoch": 4.963474524248005, + "grad_norm": 0.20136526226997375, + "learning_rate": 5.30252221245397e-05, + "loss": 1.7217, + "step": 16171 + }, + { + "epoch": 4.96378146101903, + "grad_norm": 0.3308684229850769, + "learning_rate": 5.302026065616355e-05, + "loss": 1.7554, + "step": 16172 + }, + { + "epoch": 4.964088397790055, + "grad_norm": 0.22890877723693848, + "learning_rate": 5.30152991579394e-05, + "loss": 1.7598, + "step": 16173 + }, + { + "epoch": 4.964395334561081, + "grad_norm": 0.3036035895347595, + "learning_rate": 5.301033762991631e-05, + "loss": 1.758, + "step": 16174 + }, + { + "epoch": 4.964702271332106, + "grad_norm": 0.2983579933643341, + "learning_rate": 5.300537607214329e-05, + "loss": 1.8132, + "step": 16175 + }, + { + "epoch": 4.9650092081031305, + "grad_norm": 
0.21401815116405487, + "learning_rate": 5.300041448466937e-05, + "loss": 1.7179, + "step": 16176 + }, + { + "epoch": 4.965316144874156, + "grad_norm": 0.2939651608467102, + "learning_rate": 5.2995452867543606e-05, + "loss": 1.7928, + "step": 16177 + }, + { + "epoch": 4.965623081645181, + "grad_norm": 0.24803484976291656, + "learning_rate": 5.2990491220815034e-05, + "loss": 1.7366, + "step": 16178 + }, + { + "epoch": 4.9659300184162065, + "grad_norm": 0.1999569535255432, + "learning_rate": 5.2985529544532656e-05, + "loss": 1.6691, + "step": 16179 + }, + { + "epoch": 4.966236955187231, + "grad_norm": 0.22315269708633423, + "learning_rate": 5.298056783874553e-05, + "loss": 1.7693, + "step": 16180 + }, + { + "epoch": 4.966543891958256, + "grad_norm": 0.22688794136047363, + "learning_rate": 5.2975606103502694e-05, + "loss": 1.8401, + "step": 16181 + }, + { + "epoch": 4.966850828729282, + "grad_norm": 0.2592024505138397, + "learning_rate": 5.297064433885317e-05, + "loss": 1.8054, + "step": 16182 + }, + { + "epoch": 4.967157765500307, + "grad_norm": 0.2508920133113861, + "learning_rate": 5.2965682544846e-05, + "loss": 1.766, + "step": 16183 + }, + { + "epoch": 4.967464702271332, + "grad_norm": 0.22318799793720245, + "learning_rate": 5.296072072153022e-05, + "loss": 1.751, + "step": 16184 + }, + { + "epoch": 4.967771639042358, + "grad_norm": 0.2348448485136032, + "learning_rate": 5.2955758868954855e-05, + "loss": 1.7844, + "step": 16185 + }, + { + "epoch": 4.968078575813382, + "grad_norm": 0.23294343054294586, + "learning_rate": 5.295079698716895e-05, + "loss": 1.7685, + "step": 16186 + }, + { + "epoch": 4.968385512584407, + "grad_norm": 0.20854508876800537, + "learning_rate": 5.2945835076221526e-05, + "loss": 1.6914, + "step": 16187 + }, + { + "epoch": 4.968692449355433, + "grad_norm": 0.21952031552791595, + "learning_rate": 5.294087313616165e-05, + "loss": 1.7121, + "step": 16188 + }, + { + "epoch": 4.968999386126458, + "grad_norm": 0.24097788333892822, + 
"learning_rate": 5.2935911167038346e-05, + "loss": 1.7712, + "step": 16189 + }, + { + "epoch": 4.969306322897483, + "grad_norm": 0.24433603882789612, + "learning_rate": 5.293094916890063e-05, + "loss": 1.7608, + "step": 16190 + }, + { + "epoch": 4.969613259668508, + "grad_norm": 0.22209061682224274, + "learning_rate": 5.292598714179757e-05, + "loss": 1.7563, + "step": 16191 + }, + { + "epoch": 4.969920196439533, + "grad_norm": 0.24291595816612244, + "learning_rate": 5.29210250857782e-05, + "loss": 1.7765, + "step": 16192 + }, + { + "epoch": 4.9702271332105585, + "grad_norm": 0.3143673837184906, + "learning_rate": 5.291606300089151e-05, + "loss": 1.7945, + "step": 16193 + }, + { + "epoch": 4.970534069981584, + "grad_norm": 0.22693613171577454, + "learning_rate": 5.291110088718661e-05, + "loss": 1.7411, + "step": 16194 + }, + { + "epoch": 4.970841006752609, + "grad_norm": 0.2271365374326706, + "learning_rate": 5.2906138744712494e-05, + "loss": 1.7754, + "step": 16195 + }, + { + "epoch": 4.9711479435236345, + "grad_norm": 0.2428499162197113, + "learning_rate": 5.290117657351822e-05, + "loss": 1.8007, + "step": 16196 + }, + { + "epoch": 4.971454880294659, + "grad_norm": 0.21862711012363434, + "learning_rate": 5.289621437365281e-05, + "loss": 1.7484, + "step": 16197 + }, + { + "epoch": 4.971761817065684, + "grad_norm": 0.26744964718818665, + "learning_rate": 5.2891252145165315e-05, + "loss": 1.7759, + "step": 16198 + }, + { + "epoch": 4.97206875383671, + "grad_norm": 0.2608526647090912, + "learning_rate": 5.288628988810477e-05, + "loss": 1.8527, + "step": 16199 + }, + { + "epoch": 4.972375690607735, + "grad_norm": 0.2245805710554123, + "learning_rate": 5.2881327602520216e-05, + "loss": 1.7773, + "step": 16200 + }, + { + "epoch": 4.97268262737876, + "grad_norm": 0.22023041546344757, + "learning_rate": 5.2876365288460694e-05, + "loss": 1.7101, + "step": 16201 + }, + { + "epoch": 4.972989564149785, + "grad_norm": 0.22034525871276855, + "learning_rate": 
5.287140294597525e-05, + "loss": 1.7672, + "step": 16202 + }, + { + "epoch": 4.97329650092081, + "grad_norm": 0.23101158440113068, + "learning_rate": 5.286644057511292e-05, + "loss": 1.741, + "step": 16203 + }, + { + "epoch": 4.973603437691835, + "grad_norm": 0.23050430417060852, + "learning_rate": 5.286147817592273e-05, + "loss": 1.7727, + "step": 16204 + }, + { + "epoch": 4.973910374462861, + "grad_norm": 0.21803520619869232, + "learning_rate": 5.285651574845374e-05, + "loss": 1.7353, + "step": 16205 + }, + { + "epoch": 4.974217311233886, + "grad_norm": 0.22252169251441956, + "learning_rate": 5.2851553292754995e-05, + "loss": 1.7658, + "step": 16206 + }, + { + "epoch": 4.974524248004911, + "grad_norm": 0.22458864748477936, + "learning_rate": 5.284659080887552e-05, + "loss": 1.7157, + "step": 16207 + }, + { + "epoch": 4.974831184775936, + "grad_norm": 0.20769210159778595, + "learning_rate": 5.2841628296864376e-05, + "loss": 1.7731, + "step": 16208 + }, + { + "epoch": 4.975138121546961, + "grad_norm": 0.1952340304851532, + "learning_rate": 5.283666575677059e-05, + "loss": 1.6907, + "step": 16209 + }, + { + "epoch": 4.975445058317987, + "grad_norm": 0.21943804621696472, + "learning_rate": 5.28317031886432e-05, + "loss": 1.8007, + "step": 16210 + }, + { + "epoch": 4.975751995089012, + "grad_norm": 0.21987493336200714, + "learning_rate": 5.2826740592531276e-05, + "loss": 1.7205, + "step": 16211 + }, + { + "epoch": 4.976058931860036, + "grad_norm": 0.2076522558927536, + "learning_rate": 5.2821777968483845e-05, + "loss": 1.7063, + "step": 16212 + }, + { + "epoch": 4.976365868631062, + "grad_norm": 0.19126583635807037, + "learning_rate": 5.281681531654994e-05, + "loss": 1.7118, + "step": 16213 + }, + { + "epoch": 4.976672805402087, + "grad_norm": 0.22308050096035004, + "learning_rate": 5.2811852636778625e-05, + "loss": 1.7565, + "step": 16214 + }, + { + "epoch": 4.976979742173112, + "grad_norm": 0.23187528550624847, + "learning_rate": 5.280688992921893e-05, + "loss": 
1.8261, + "step": 16215 + }, + { + "epoch": 4.977286678944138, + "grad_norm": 0.21373791992664337, + "learning_rate": 5.28019271939199e-05, + "loss": 1.6974, + "step": 16216 + }, + { + "epoch": 4.977593615715163, + "grad_norm": 0.21647346019744873, + "learning_rate": 5.2796964430930585e-05, + "loss": 1.7967, + "step": 16217 + }, + { + "epoch": 4.9779005524861875, + "grad_norm": 0.2231660932302475, + "learning_rate": 5.279200164030002e-05, + "loss": 1.7495, + "step": 16218 + }, + { + "epoch": 4.978207489257213, + "grad_norm": 0.2810545563697815, + "learning_rate": 5.278703882207728e-05, + "loss": 1.875, + "step": 16219 + }, + { + "epoch": 4.978514426028238, + "grad_norm": 0.298984557390213, + "learning_rate": 5.2782075976311374e-05, + "loss": 1.7494, + "step": 16220 + }, + { + "epoch": 4.9788213627992635, + "grad_norm": 0.2530893385410309, + "learning_rate": 5.2777113103051365e-05, + "loss": 1.7594, + "step": 16221 + }, + { + "epoch": 4.979128299570289, + "grad_norm": 0.26165664196014404, + "learning_rate": 5.277215020234629e-05, + "loss": 1.7543, + "step": 16222 + }, + { + "epoch": 4.979435236341313, + "grad_norm": 0.25115957856178284, + "learning_rate": 5.276718727424521e-05, + "loss": 1.7925, + "step": 16223 + }, + { + "epoch": 4.979742173112339, + "grad_norm": 0.22134126722812653, + "learning_rate": 5.276222431879716e-05, + "loss": 1.8359, + "step": 16224 + }, + { + "epoch": 4.980049109883364, + "grad_norm": 0.24447613954544067, + "learning_rate": 5.275726133605119e-05, + "loss": 1.7693, + "step": 16225 + }, + { + "epoch": 4.980356046654389, + "grad_norm": 0.23025095462799072, + "learning_rate": 5.275229832605635e-05, + "loss": 1.7911, + "step": 16226 + }, + { + "epoch": 4.980662983425415, + "grad_norm": 0.23424232006072998, + "learning_rate": 5.2747335288861686e-05, + "loss": 1.7628, + "step": 16227 + }, + { + "epoch": 4.98096992019644, + "grad_norm": 0.24598535895347595, + "learning_rate": 5.2742372224516235e-05, + "loss": 1.7651, + "step": 16228 + }, + { + 
"epoch": 4.981276856967464, + "grad_norm": 0.262893944978714, + "learning_rate": 5.273740913306906e-05, + "loss": 1.7282, + "step": 16229 + }, + { + "epoch": 4.98158379373849, + "grad_norm": 0.21981783211231232, + "learning_rate": 5.2732446014569207e-05, + "loss": 1.7448, + "step": 16230 + }, + { + "epoch": 4.981890730509515, + "grad_norm": 0.24244973063468933, + "learning_rate": 5.272748286906573e-05, + "loss": 1.7216, + "step": 16231 + }, + { + "epoch": 4.98219766728054, + "grad_norm": 0.2365221232175827, + "learning_rate": 5.272251969660766e-05, + "loss": 1.7227, + "step": 16232 + }, + { + "epoch": 4.982504604051566, + "grad_norm": 0.2081129401922226, + "learning_rate": 5.271755649724405e-05, + "loss": 1.7184, + "step": 16233 + }, + { + "epoch": 4.98281154082259, + "grad_norm": 0.2256374955177307, + "learning_rate": 5.271259327102395e-05, + "loss": 1.7412, + "step": 16234 + }, + { + "epoch": 4.9831184775936155, + "grad_norm": 0.23727381229400635, + "learning_rate": 5.270763001799643e-05, + "loss": 1.8095, + "step": 16235 + }, + { + "epoch": 4.983425414364641, + "grad_norm": 0.21498435735702515, + "learning_rate": 5.2702666738210504e-05, + "loss": 1.744, + "step": 16236 + }, + { + "epoch": 4.983732351135666, + "grad_norm": 0.24772173166275024, + "learning_rate": 5.269770343171525e-05, + "loss": 1.741, + "step": 16237 + }, + { + "epoch": 4.9840392879066915, + "grad_norm": 0.2835623621940613, + "learning_rate": 5.269274009855971e-05, + "loss": 1.7765, + "step": 16238 + }, + { + "epoch": 4.984346224677717, + "grad_norm": 0.2570044696331024, + "learning_rate": 5.2687776738792926e-05, + "loss": 1.8206, + "step": 16239 + }, + { + "epoch": 4.984653161448741, + "grad_norm": 0.21549640595912933, + "learning_rate": 5.268281335246397e-05, + "loss": 1.7022, + "step": 16240 + }, + { + "epoch": 4.984960098219767, + "grad_norm": 0.23158684372901917, + "learning_rate": 5.267784993962187e-05, + "loss": 1.7882, + "step": 16241 + }, + { + "epoch": 4.985267034990792, + "grad_norm": 
0.22778423130512238, + "learning_rate": 5.26728865003157e-05, + "loss": 1.7358, + "step": 16242 + }, + { + "epoch": 4.985573971761817, + "grad_norm": 0.23197145760059357, + "learning_rate": 5.266792303459449e-05, + "loss": 1.7687, + "step": 16243 + }, + { + "epoch": 4.985880908532843, + "grad_norm": 0.19270172715187073, + "learning_rate": 5.26629595425073e-05, + "loss": 1.6999, + "step": 16244 + }, + { + "epoch": 4.986187845303867, + "grad_norm": 0.25262632966041565, + "learning_rate": 5.2657996024103175e-05, + "loss": 1.7536, + "step": 16245 + }, + { + "epoch": 4.986494782074892, + "grad_norm": 0.18620926141738892, + "learning_rate": 5.2653032479431185e-05, + "loss": 1.7033, + "step": 16246 + }, + { + "epoch": 4.986801718845918, + "grad_norm": 0.19537273049354553, + "learning_rate": 5.2648068908540374e-05, + "loss": 1.7457, + "step": 16247 + }, + { + "epoch": 4.987108655616943, + "grad_norm": 0.19447599351406097, + "learning_rate": 5.26431053114798e-05, + "loss": 1.7053, + "step": 16248 + }, + { + "epoch": 4.987415592387968, + "grad_norm": 0.20431137084960938, + "learning_rate": 5.263814168829852e-05, + "loss": 1.7695, + "step": 16249 + }, + { + "epoch": 4.987722529158994, + "grad_norm": 0.21123024821281433, + "learning_rate": 5.263317803904554e-05, + "loss": 1.7666, + "step": 16250 + }, + { + "epoch": 4.988029465930018, + "grad_norm": 0.21279335021972656, + "learning_rate": 5.262821436376998e-05, + "loss": 1.7231, + "step": 16251 + }, + { + "epoch": 4.9883364027010435, + "grad_norm": 0.22504910826683044, + "learning_rate": 5.262325066252085e-05, + "loss": 1.7657, + "step": 16252 + }, + { + "epoch": 4.988643339472069, + "grad_norm": 0.23505981266498566, + "learning_rate": 5.261828693534723e-05, + "loss": 1.7576, + "step": 16253 + }, + { + "epoch": 4.988950276243094, + "grad_norm": 0.21553601324558258, + "learning_rate": 5.261332318229817e-05, + "loss": 1.7782, + "step": 16254 + }, + { + "epoch": 4.989257213014119, + "grad_norm": 0.29189521074295044, + 
"learning_rate": 5.26083594034227e-05, + "loss": 1.7664, + "step": 16255 + }, + { + "epoch": 4.989564149785144, + "grad_norm": 0.38108906149864197, + "learning_rate": 5.26033955987699e-05, + "loss": 1.8573, + "step": 16256 + }, + { + "epoch": 4.989871086556169, + "grad_norm": 0.30329224467277527, + "learning_rate": 5.2598431768388824e-05, + "loss": 1.7584, + "step": 16257 + }, + { + "epoch": 4.990178023327195, + "grad_norm": 0.2437417358160019, + "learning_rate": 5.259346791232852e-05, + "loss": 1.7352, + "step": 16258 + }, + { + "epoch": 4.99048496009822, + "grad_norm": 0.3601737320423126, + "learning_rate": 5.258850403063804e-05, + "loss": 1.7206, + "step": 16259 + }, + { + "epoch": 4.990791896869245, + "grad_norm": 0.20259195566177368, + "learning_rate": 5.258354012336646e-05, + "loss": 1.7403, + "step": 16260 + }, + { + "epoch": 4.99109883364027, + "grad_norm": 0.38022148609161377, + "learning_rate": 5.257857619056281e-05, + "loss": 1.7783, + "step": 16261 + }, + { + "epoch": 4.991405770411295, + "grad_norm": 0.30131712555885315, + "learning_rate": 5.257361223227615e-05, + "loss": 1.7826, + "step": 16262 + }, + { + "epoch": 4.99171270718232, + "grad_norm": 0.24159663915634155, + "learning_rate": 5.2568648248555565e-05, + "loss": 1.7792, + "step": 16263 + }, + { + "epoch": 4.992019643953346, + "grad_norm": 0.4641213119029999, + "learning_rate": 5.2563684239450084e-05, + "loss": 1.7432, + "step": 16264 + }, + { + "epoch": 4.992326580724371, + "grad_norm": 0.3526865541934967, + "learning_rate": 5.255872020500877e-05, + "loss": 1.7736, + "step": 16265 + }, + { + "epoch": 4.9926335174953955, + "grad_norm": 0.2396051585674286, + "learning_rate": 5.255375614528071e-05, + "loss": 1.7505, + "step": 16266 + }, + { + "epoch": 4.992940454266421, + "grad_norm": 0.320987343788147, + "learning_rate": 5.25487920603149e-05, + "loss": 1.8229, + "step": 16267 + }, + { + "epoch": 4.993247391037446, + "grad_norm": 0.24689678847789764, + "learning_rate": 5.254382795016044e-05, + 
"loss": 1.7011, + "step": 16268 + }, + { + "epoch": 4.9935543278084715, + "grad_norm": 0.2407137155532837, + "learning_rate": 5.253886381486639e-05, + "loss": 1.741, + "step": 16269 + }, + { + "epoch": 4.993861264579497, + "grad_norm": 0.3677252531051636, + "learning_rate": 5.25338996544818e-05, + "loss": 1.7792, + "step": 16270 + }, + { + "epoch": 4.994168201350522, + "grad_norm": 0.25096553564071655, + "learning_rate": 5.252893546905573e-05, + "loss": 1.7523, + "step": 16271 + }, + { + "epoch": 4.994475138121547, + "grad_norm": 0.2966327965259552, + "learning_rate": 5.252397125863723e-05, + "loss": 1.7114, + "step": 16272 + }, + { + "epoch": 4.994782074892572, + "grad_norm": 0.36577650904655457, + "learning_rate": 5.2519007023275356e-05, + "loss": 1.7609, + "step": 16273 + }, + { + "epoch": 4.995089011663597, + "grad_norm": 0.2450687140226364, + "learning_rate": 5.25140427630192e-05, + "loss": 1.7452, + "step": 16274 + }, + { + "epoch": 4.995395948434623, + "grad_norm": 0.20782120525836945, + "learning_rate": 5.250907847791778e-05, + "loss": 1.7109, + "step": 16275 + }, + { + "epoch": 4.995702885205648, + "grad_norm": 0.2423330545425415, + "learning_rate": 5.25041141680202e-05, + "loss": 1.7234, + "step": 16276 + }, + { + "epoch": 4.996009821976672, + "grad_norm": 0.20855975151062012, + "learning_rate": 5.2499149833375484e-05, + "loss": 1.7734, + "step": 16277 + }, + { + "epoch": 4.996316758747698, + "grad_norm": 0.24400894343852997, + "learning_rate": 5.24941854740327e-05, + "loss": 1.7566, + "step": 16278 + }, + { + "epoch": 4.996623695518723, + "grad_norm": 0.4378018379211426, + "learning_rate": 5.2489221090040906e-05, + "loss": 1.7536, + "step": 16279 + }, + { + "epoch": 4.996930632289748, + "grad_norm": 0.20726722478866577, + "learning_rate": 5.248425668144918e-05, + "loss": 1.8008, + "step": 16280 + }, + { + "epoch": 4.997237569060774, + "grad_norm": 0.2506333589553833, + "learning_rate": 5.247929224830658e-05, + "loss": 1.7404, + "step": 16281 + }, + { + 
"epoch": 4.997544505831799, + "grad_norm": 0.24178004264831543, + "learning_rate": 5.247432779066216e-05, + "loss": 1.7517, + "step": 16282 + }, + { + "epoch": 4.9978514426028235, + "grad_norm": 0.2500220835208893, + "learning_rate": 5.246936330856499e-05, + "loss": 1.7705, + "step": 16283 + }, + { + "epoch": 4.998158379373849, + "grad_norm": 0.30043718218803406, + "learning_rate": 5.24643988020641e-05, + "loss": 1.8118, + "step": 16284 + }, + { + "epoch": 4.998465316144874, + "grad_norm": 0.284805566072464, + "learning_rate": 5.245943427120859e-05, + "loss": 1.7968, + "step": 16285 + }, + { + "epoch": 4.9987722529158995, + "grad_norm": 0.3652406632900238, + "learning_rate": 5.245446971604751e-05, + "loss": 1.7785, + "step": 16286 + }, + { + "epoch": 4.999079189686924, + "grad_norm": 0.24879656732082367, + "learning_rate": 5.244950513662992e-05, + "loss": 1.734, + "step": 16287 + }, + { + "epoch": 4.999386126457949, + "grad_norm": 0.2374224215745926, + "learning_rate": 5.244454053300488e-05, + "loss": 1.7394, + "step": 16288 + }, + { + "epoch": 4.999693063228975, + "grad_norm": 0.27090463042259216, + "learning_rate": 5.243957590522147e-05, + "loss": 1.7529, + "step": 16289 + }, + { + "epoch": 5.0, + "grad_norm": 0.23060791194438934, + "learning_rate": 5.243461125332873e-05, + "loss": 1.7599, + "step": 16290 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.066251937437041e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-16290/training_args.bin b/checkpoint-16290/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db7ad91da5423a229826113feb3e9db3ef40c31 
--- /dev/null +++ b/checkpoint-16290/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682b697e933b6e2693e5f9af9a0654effab1ca392c8500bf8af0eb089116a263 +size 7288 diff --git a/checkpoint-16290/zero_to_fp32.py b/checkpoint-16290/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-16290/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-19548/config.json b/checkpoint-19548/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-19548/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-19548/generation_config.json b/checkpoint-19548/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-19548/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-19548/latest b/checkpoint-19548/latest new file mode 100644 index 0000000000000000000000000000000000000000..bce88fac1241e3641515ab948731a1af54ff33d7 --- /dev/null +++ b/checkpoint-19548/latest @@ -0,0 +1 @@ +global_step19548 \ No newline at end of file diff --git a/checkpoint-19548/model-00001-of-00003.safetensors b/checkpoint-19548/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ca0b8721f91ed15686804b85f8ce1a6c96bed9ba --- /dev/null +++ b/checkpoint-19548/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4179b835cf0c249f5815ee83439099e0b61c4cfd5ac756e5f3bad67fa76c779c +size 4955415870 diff --git a/checkpoint-19548/model-00002-of-00003.safetensors b/checkpoint-19548/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-19548/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d 
+size 4971563008 diff --git a/checkpoint-19548/model-00003-of-00003.safetensors b/checkpoint-19548/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..41b150b999133715487e62741a3033a4b9401b70 --- /dev/null +++ b/checkpoint-19548/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3c31b0d39aaaaa50f7b94561a81ddb5efe84978b7a25e4bfee2b4d93bd741a9 +size 4180840856 diff --git a/checkpoint-19548/model.safetensors.index.json b/checkpoint-19548/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-19548/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-19548/rng_state_0.pth b/checkpoint-19548/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf0e175d0636efed32169981ab6b328b9f28ed6d --- /dev/null +++ b/checkpoint-19548/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebd347768dbf3bad4e344c40efcf363e3ab6ce37cc1fe02bf5fd4041ea620508 +size 15984 diff --git a/checkpoint-19548/rng_state_1.pth b/checkpoint-19548/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0662c9e4ac0fd8668363f1ae8cd6015efa4dfd3 --- /dev/null +++ b/checkpoint-19548/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bffb95e4a700baf23aa512f2b0f9fda3e207c24dc3ce811ecb4b6020e1a96a6e +size 15984 diff --git a/checkpoint-19548/rng_state_10.pth b/checkpoint-19548/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..f7eb644744e021da61c7935394fa29b6230d00fe --- /dev/null +++ b/checkpoint-19548/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d49669c69b80315f111672aed4d7db0048cf05acc88847fee75740502749fa85 +size 15997 diff --git a/checkpoint-19548/rng_state_11.pth b/checkpoint-19548/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..95738648a8e5102eb12955ca1353fafe944b7339 --- /dev/null +++ b/checkpoint-19548/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a63746761671ff2030c23ffe1c8911f810ba856cdc02bd160bb5f5aad5948fff +size 15997 diff --git a/checkpoint-19548/rng_state_12.pth b/checkpoint-19548/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cb0b40033c0816a3beddc47eb5b2bdf23624deb --- /dev/null +++ b/checkpoint-19548/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77e34dc85b7c3e85730241f78c8ba5ad8234f93c92792ccf5bda807fbaa7506 +size 15997 diff --git a/checkpoint-19548/rng_state_13.pth b/checkpoint-19548/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..e71c13ed948ce96b9d715c933e49cf297851a8ca --- /dev/null +++ b/checkpoint-19548/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1721b09c05555e020752469662263a0159796ccdf05f4f52010effbd763ac3 +size 15997 diff --git a/checkpoint-19548/rng_state_14.pth b/checkpoint-19548/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c7742636956cc7e4580e3676daffcc1c99951ec --- /dev/null +++ b/checkpoint-19548/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ccb5cd2e1b13307f70790b31a00094e4aa02750786b48e696894893a8502ee9 +size 15997 diff --git a/checkpoint-19548/rng_state_15.pth b/checkpoint-19548/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..4921b732e6f5b61cfb2db1c7d3b1f04fee0e0343 --- /dev/null +++ b/checkpoint-19548/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bf04cbc5b820b38e584633c4f09fac2e83d6a2d3820a9169bfe81bf99f4d4007 +size 15997 diff --git a/checkpoint-19548/rng_state_16.pth b/checkpoint-19548/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e915e77378251bc2a211fc5a5176b5787717fb5 --- /dev/null +++ b/checkpoint-19548/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35166887815ade29b89c1be791bfd5894728923d414fb97e29fd07dce537bbeb +size 15997 diff --git a/checkpoint-19548/rng_state_17.pth b/checkpoint-19548/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..a89decc12cbcca993468592a3cf54b45e0419471 --- /dev/null +++ b/checkpoint-19548/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d65bc2c1a02bcbd6cea53997d27ea0fda32453121bd2d79aa5c0d1ba2ddc1d8e +size 15997 diff --git a/checkpoint-19548/rng_state_18.pth b/checkpoint-19548/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..08c384744f251d967834065eb3741e26f4a726e2 --- /dev/null +++ b/checkpoint-19548/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e78a3a00910de9723f46870ecaef90af0b5e649c463af67b9c1646fa519ac5e +size 15997 diff --git a/checkpoint-19548/rng_state_19.pth b/checkpoint-19548/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..aff3bb1471af5bcd00e93eeb37cef9581f9bd7d8 --- /dev/null +++ b/checkpoint-19548/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2658fc89ce99ab0d72d8ea2e0a9a0a8729bd429c49137ea3a5d48dacfd26549 +size 15997 diff --git a/checkpoint-19548/rng_state_2.pth b/checkpoint-19548/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef3bfdb2c961cd035f3bc16569bfe40bc7a50d89 --- /dev/null +++ b/checkpoint-19548/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5e58c7db580c964ee48d27a6429486bca8ea3b229a0061d90c6729feede5161c +size 15984 diff --git a/checkpoint-19548/rng_state_20.pth b/checkpoint-19548/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f0a1a9fe8da32e961d5280693199ad8a136792d --- /dev/null +++ b/checkpoint-19548/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2228e20f9a2e6f3521bdf1425aaf993f7524a5fa1f283091592f3f0fa6c28bb9 +size 15997 diff --git a/checkpoint-19548/rng_state_21.pth b/checkpoint-19548/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..01b81fbd405bd109e858cfa98430f821b418da98 --- /dev/null +++ b/checkpoint-19548/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f5e6d98541f8e040fc55391945fd6be31895b991dde46049f6dbc8b0592998 +size 15997 diff --git a/checkpoint-19548/rng_state_22.pth b/checkpoint-19548/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e87acccf8ba49a2e589c2aca14aaa87abcdf54d --- /dev/null +++ b/checkpoint-19548/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:972e8fb4316625d05f43a10cd5dbd2f4ff2ccd9719d5436ce2c15618706f3eea +size 15997 diff --git a/checkpoint-19548/rng_state_23.pth b/checkpoint-19548/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..d50306c9a4230d19e1e937a91718499168f256e7 --- /dev/null +++ b/checkpoint-19548/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0a37ebd23b4098ccb25c413fa3ca8e917dd14f9427a4ab326b70b9b3daacca +size 15997 diff --git a/checkpoint-19548/rng_state_24.pth b/checkpoint-19548/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..9c78ee93f21295d746372b27cd6602b57f8e0582 --- /dev/null +++ b/checkpoint-19548/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:58914376ef2f22dc62b0b60a78c82799c315cdf2f77e245484b12b716487b70e +size 15997 diff --git a/checkpoint-19548/rng_state_25.pth b/checkpoint-19548/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..efd8a1085db0d1c1801fe98fbd6d4868f5d58520 --- /dev/null +++ b/checkpoint-19548/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84a400042bd1e4ad4332ceef2754a604f8f35896d5e79f1fd205da13c2234986 +size 15997 diff --git a/checkpoint-19548/rng_state_26.pth b/checkpoint-19548/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..377983fdf7009d3f2c138c719c51e39fec5cbeda --- /dev/null +++ b/checkpoint-19548/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222f4f02aadb249bbbc4f6fbb185d999d1928584bc366da38e00bbf8cec6cffc +size 15997 diff --git a/checkpoint-19548/rng_state_27.pth b/checkpoint-19548/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..795ab80b26c209950abb960c8a08d465e7700682 --- /dev/null +++ b/checkpoint-19548/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab08f4520ef13782d3f6e14299cdfc732b25c89a7d1eae28dc05c5865cde1dcb +size 15997 diff --git a/checkpoint-19548/rng_state_28.pth b/checkpoint-19548/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4db0c13f4dcd22cd97aaa7f1e2c2ebec4a43c7d --- /dev/null +++ b/checkpoint-19548/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c63980693a58c36dd1251b407ec2b4d6750727021439fc0fea0375c9e9bb6fb2 +size 15997 diff --git a/checkpoint-19548/rng_state_29.pth b/checkpoint-19548/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..835329b235eda4ed9e6bef28f35cf8d9b9e0e005 --- /dev/null +++ b/checkpoint-19548/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f6d3b8e86be8c38ec6fbca275f8d2c4e2f8218f7c50779b0302c25e2b6c7bbf8 +size 15997 diff --git a/checkpoint-19548/rng_state_3.pth b/checkpoint-19548/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7c0587be5cd76c26ae95beb819821f8540e0ace5 --- /dev/null +++ b/checkpoint-19548/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27cd4714d73263d28613e22027023759a2bb7bcee0b8ffe4983b2141f19f5929 +size 15984 diff --git a/checkpoint-19548/rng_state_30.pth b/checkpoint-19548/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..48d8fd25860fdaeafe465b5744039c2e00c9f61a --- /dev/null +++ b/checkpoint-19548/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71a27e3a659827722fca63934778d767003d2ec1dc9194c6e283df2cc59a9ea +size 15997 diff --git a/checkpoint-19548/rng_state_31.pth b/checkpoint-19548/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..20b0ff677aa1adab486c93d2f68150692c21de59 --- /dev/null +++ b/checkpoint-19548/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4530ae335986eed13434f08c541b045a9cad9b6c5d4bcb4d60ed61326329ab28 +size 15997 diff --git a/checkpoint-19548/rng_state_32.pth b/checkpoint-19548/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..24211fe088387459001324d3819ad1ac9aa2e45c --- /dev/null +++ b/checkpoint-19548/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:075752c8a525fff4a3dc91953cccdfe6f75925428fad0271526c9aa6c29391c8 +size 15997 diff --git a/checkpoint-19548/rng_state_33.pth b/checkpoint-19548/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..56156c4642e1c851918e73d7d03eb2803bb2d54a --- /dev/null +++ b/checkpoint-19548/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ff5136551a0b4f8931be47de675e0c45ead68e7ded023ce44572181b63e551fb +size 15997 diff --git a/checkpoint-19548/rng_state_34.pth b/checkpoint-19548/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..7c19752939e09269e639431829f87247d4b4e6be --- /dev/null +++ b/checkpoint-19548/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:928aff8e95c5fb193fb378ebbc00d63534a8fa5a11c88c6b7401c41e484c11be +size 15997 diff --git a/checkpoint-19548/rng_state_35.pth b/checkpoint-19548/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..33a4697611e0f341a4c7f47ebccdbfd9295cb631 --- /dev/null +++ b/checkpoint-19548/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1fd6e219dc41f240b641e4efb0e5c5531c5ac84dc55f902b95fc5bf4345ef1d +size 15997 diff --git a/checkpoint-19548/rng_state_36.pth b/checkpoint-19548/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..8da0c0896da7cd2bdd99546013bc5e05a16a3005 --- /dev/null +++ b/checkpoint-19548/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091cf39d311b51ac7f0fb7f8a42f821224ce56e1975be08e55113ace04e41035 +size 15997 diff --git a/checkpoint-19548/rng_state_37.pth b/checkpoint-19548/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..3201de500655c4bba2baeafa60d86dbd98548921 --- /dev/null +++ b/checkpoint-19548/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84a2dc9c0b1b5962ace20ec6a561cf21e7deb9017327f0f08fad97c2f63f8a38 +size 15997 diff --git a/checkpoint-19548/rng_state_38.pth b/checkpoint-19548/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..cfb97d7dcbc150c62f865d3c8d4c76bcc0301b0e --- /dev/null +++ b/checkpoint-19548/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:94698661316ac3e4278d4cae7384e941a53cf899069b76d0c0ffe12355e4c94a +size 15997 diff --git a/checkpoint-19548/rng_state_39.pth b/checkpoint-19548/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..eee1ac80d9db0a200a9a0d4f5e8c645d8e973393 --- /dev/null +++ b/checkpoint-19548/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b573d63f2f4a01ea92e1663d4acaea4d7728dfeaf72b0970ea94e9b7b197bca1 +size 15997 diff --git a/checkpoint-19548/rng_state_4.pth b/checkpoint-19548/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..09bb1a18600742d9242c571a0139ecc8be7febfb --- /dev/null +++ b/checkpoint-19548/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91c20053c71c4d0761355c7fe42ecedc71033d5e43871c7a65483d85be4dc0fb +size 15984 diff --git a/checkpoint-19548/rng_state_40.pth b/checkpoint-19548/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..bbf508b9cc82f725d1bf732bab348cfb9fdd319c --- /dev/null +++ b/checkpoint-19548/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53a0eebc7cd4b00c732ca065e5d29bf1c805f88151388cc2f2cf2e0259f23c00 +size 15997 diff --git a/checkpoint-19548/rng_state_41.pth b/checkpoint-19548/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..3bba56520b432730ef7b8a4b80fad48134582475 --- /dev/null +++ b/checkpoint-19548/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e6644513f14a79302ea50dea56c91e88640d0f0512b2f072a2ab837e8f6606 +size 15997 diff --git a/checkpoint-19548/rng_state_42.pth b/checkpoint-19548/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..3984a3db33bad30aa6a6b54fbf5aba310ef98240 --- /dev/null +++ b/checkpoint-19548/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:64940b2c351e3ae6c3f01bd5935d64447f0cd0bc4c635a270fef6cbf974f1bc4 +size 15997 diff --git a/checkpoint-19548/rng_state_43.pth b/checkpoint-19548/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..d681a5d8f4e3ea45317d2aa35343ffc924840fac --- /dev/null +++ b/checkpoint-19548/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:556c2e5cf289249d266b784c38cba256a57ded862f4e6d1bbbabb371058c40d2 +size 15997 diff --git a/checkpoint-19548/rng_state_44.pth b/checkpoint-19548/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..152037fe15c0a6c88d4326d02720e5e4ff79ea5c --- /dev/null +++ b/checkpoint-19548/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02f24ddf4aad92275d399601d61155077660827817a05ae60539c6c8c7552828 +size 15997 diff --git a/checkpoint-19548/rng_state_45.pth b/checkpoint-19548/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..df228f02650a47fce7c052e88e90f918730a5326 --- /dev/null +++ b/checkpoint-19548/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6d6b3e6da72dbd69b1dd4e87b8107a685e8895c96f1bee3f759f7f9c627ec44 +size 15997 diff --git a/checkpoint-19548/rng_state_46.pth b/checkpoint-19548/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..e215a15832edb5ad2d59ac936203f9475d014ef4 --- /dev/null +++ b/checkpoint-19548/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c588e190c4f7549705b829422b28faf010e454b050a1e4d93dbc86d1aa2498bb +size 15997 diff --git a/checkpoint-19548/rng_state_47.pth b/checkpoint-19548/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..516d7579ec053e3eff253da97b1f7f04467a7bb6 --- /dev/null +++ b/checkpoint-19548/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8f10b16a6253a6890be6bdc01d450b88e747b9d006ceb1ce499b75a5792d3374 +size 15997 diff --git a/checkpoint-19548/rng_state_48.pth b/checkpoint-19548/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c855413e9d77685e4053182500c9ced583dad16 --- /dev/null +++ b/checkpoint-19548/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc4153328c148d84f718b60ca97f81a0cbbec3dbc9b3284043a7043fedd88077 +size 15997 diff --git a/checkpoint-19548/rng_state_49.pth b/checkpoint-19548/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..23ad3f711c45c33f316f34c4390df2e28dcf8690 --- /dev/null +++ b/checkpoint-19548/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bebbb22d32c65e8ef724d05659e3b07839d66333f982f942af60c08fd5ba3694 +size 15997 diff --git a/checkpoint-19548/rng_state_5.pth b/checkpoint-19548/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..357d342c204f0ea0d47928578b078ac2b6fb838a --- /dev/null +++ b/checkpoint-19548/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2099bf31da9425b5d058158a8c293395fda83e7fea14ab6c6faef927b14a9b5b +size 15984 diff --git a/checkpoint-19548/rng_state_50.pth b/checkpoint-19548/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..f99c1f27da51a0b389ec6b68e84dbaaceec87724 --- /dev/null +++ b/checkpoint-19548/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79db4b9914bbd942f3d07a568b9fbf075ecb4c2ea351f061fa9db9d80fe73b58 +size 15997 diff --git a/checkpoint-19548/rng_state_51.pth b/checkpoint-19548/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..21bef23280bde0f71201685ae441f92e831761db --- /dev/null +++ b/checkpoint-19548/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1b32eb84e24f9d2602c421eca171aeb042aa6187f9c87b21bc57b9aa790845a3 +size 15997 diff --git a/checkpoint-19548/rng_state_52.pth b/checkpoint-19548/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..03adf8c66ed7bb2ec4095bfff9d57ddd4f45f706 --- /dev/null +++ b/checkpoint-19548/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5077ff2fed0e41dabb702e810f3741c874c52cbc3f9aef3afa15175250e4e77a +size 15997 diff --git a/checkpoint-19548/rng_state_53.pth b/checkpoint-19548/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a60db3620b51b3831314dca2dd109fb677b0a09 --- /dev/null +++ b/checkpoint-19548/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f48e811b8d67f0a0c47c63d9b3347371a22b4ee6c0ca6c519e73a766568de5d +size 15997 diff --git a/checkpoint-19548/rng_state_54.pth b/checkpoint-19548/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..84a60875ce25ef80bc032f2a5cfb8789187e6add --- /dev/null +++ b/checkpoint-19548/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21ed2aa5887afca8ad3a4632208363d0a32147657d356310739a574b3657c4cf +size 15997 diff --git a/checkpoint-19548/rng_state_55.pth b/checkpoint-19548/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..f31d855573529cd2afeae277ea487341f802389e --- /dev/null +++ b/checkpoint-19548/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5169ee3ba07795a1aa5d07632cea97ae0432f0f6263fbe4508a2677ba64fb88 +size 15997 diff --git a/checkpoint-19548/rng_state_56.pth b/checkpoint-19548/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..2638dde9673d5f16d5f3502fa32ba97031234d74 --- /dev/null +++ b/checkpoint-19548/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:09d770742db2fcd20d28e334c6617a2edbf8243e73f9a2bc7587aa1776b4e2a1 +size 15997 diff --git a/checkpoint-19548/rng_state_57.pth b/checkpoint-19548/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..4d841f0a3640c8f5f886235ce88d0b8ffe2cac64 --- /dev/null +++ b/checkpoint-19548/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd84e33e34c113d30fd479152c51573c334f079dfa1642974a4afadcf75cef7 +size 15997 diff --git a/checkpoint-19548/rng_state_58.pth b/checkpoint-19548/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..92f70e81164192d37923b84c78609f64f2d412d2 --- /dev/null +++ b/checkpoint-19548/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e39c997f3bd1307ece5a2059a982d33bec37aed3a11085f12776cb9778d406 +size 15997 diff --git a/checkpoint-19548/rng_state_59.pth b/checkpoint-19548/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..1fb18ca2ca899a34227f2f35f969f4e8cf6b5dc8 --- /dev/null +++ b/checkpoint-19548/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdc791f4cd1e1b3295368ae929a95f642fd07cb7fed152779efc05d37b9e6240 +size 15997 diff --git a/checkpoint-19548/rng_state_6.pth b/checkpoint-19548/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1bec3079b8f974d1ee0438fc2c693cf84158ea3f --- /dev/null +++ b/checkpoint-19548/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd047888d2ec6b7a81551756ddda58b86712b06fc1ea53cbcee8dbce90c46896 +size 15984 diff --git a/checkpoint-19548/rng_state_60.pth b/checkpoint-19548/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..76552137aff2c0c44ea73017ca0c1b766d8f48b9 --- /dev/null +++ b/checkpoint-19548/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2b4965e00f034170de93731dfd4b2970c4e2512c623134fc122f31039cfd6819 +size 15997 diff --git a/checkpoint-19548/rng_state_61.pth b/checkpoint-19548/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8c842a625fb88f02886dbcbb814cc1a37540ee7 --- /dev/null +++ b/checkpoint-19548/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23b547df2d4aca3458a065c42c12d001b762a28c98011d9b02b0ccbc9fd2fa9b +size 15997 diff --git a/checkpoint-19548/rng_state_62.pth b/checkpoint-19548/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..6698469006bea644cea8dedb73374d886a4a3cf8 --- /dev/null +++ b/checkpoint-19548/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7891b8e31dc65e99fcdd3e2bfcda8af862766d648c68775b7af516f287aaa2e3 +size 15997 diff --git a/checkpoint-19548/rng_state_63.pth b/checkpoint-19548/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..11d2e1b8bb684bf5b2fa905aab900f5ce5abff5f --- /dev/null +++ b/checkpoint-19548/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:580fb711eb4a2fd35439bca70a163265e9c3773c91d4352f76304f1ba1b1709f +size 15997 diff --git a/checkpoint-19548/rng_state_7.pth b/checkpoint-19548/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..821bdf523fd52e3d7866272080de077f7f18c609 --- /dev/null +++ b/checkpoint-19548/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f6f0f5fd70e40bb25c62675409890a9e1f6214004941cc3ddb87ceefb3db476 +size 15984 diff --git a/checkpoint-19548/rng_state_8.pth b/checkpoint-19548/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..c4247e4fa440bee8f16b489375163a53c3d02d2f --- /dev/null +++ b/checkpoint-19548/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a5d08c3a462be06b6732589d215ff465653de763c0963e7ee828426796bf7e15 +size 15984 diff --git a/checkpoint-19548/rng_state_9.pth b/checkpoint-19548/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..8b57ffa99ad0b4f0cd7dad20850a09ba626a374f --- /dev/null +++ b/checkpoint-19548/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:110064d704ceb8dd02360089fcbccad2a6133dbcb6d28d6263e1c54fe316b568 +size 15984 diff --git a/checkpoint-19548/scheduler.pt b/checkpoint-19548/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b17630302f9b3f929959e9f693742720f3957897 --- /dev/null +++ b/checkpoint-19548/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae130fac48c3e8c21cf996952596af5c376c350fb392b25886c2242940de3d8a +size 1064 diff --git a/checkpoint-19548/special_tokens_map.json b/checkpoint-19548/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-19548/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-19548/tokenizer.json 
b/checkpoint-19548/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-19548/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-19548/tokenizer_config.json b/checkpoint-19548/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-19548/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": 
"<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": 
"<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": 
"<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128257": { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-19548/trainer_state.json b/checkpoint-19548/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a5a07fc2048c211644a7812cc71f2edcaa1679a7 --- /dev/null +++ b/checkpoint-19548/trainer_state.json @@ -0,0 +1,136870 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": 
null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 19548, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 
0.003990178023327195, + "grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 
1.2865383625030518, + "learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 
3.885480572597138e-06, + "loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + 
}, + { + "epoch": 0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 
0.4801484942436218, + "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 
7.975460122699386e-06, + "loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 
92 + }, + { + "epoch": 0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + 
"grad_norm": 0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 
0.5069209933280945, + "learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 
1.3394683026584867e-05, + "loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + 
"step": 145 + }, + { + "epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 
0.04880294659300184, + "grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 
1.3880311250686646, + "learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + }, + { + "epoch": 3.0003069367710253, + "grad_norm": 0.43365660309791565, + "learning_rate": 8.20720621201063e-05, + "loss": 1.8074, + "step": 9775 + }, + { + "epoch": 3.0006138735420502, + "grad_norm": 0.461374968290329, + "learning_rate": 8.206824868646064e-05, + "loss": 1.9089, + "step": 9776 + }, + { + "epoch": 3.0009208103130756, + "grad_norm": 0.3747929632663727, + "learning_rate": 8.206443493589776e-05, + "loss": 1.8358, + "step": 9777 + }, + { + "epoch": 3.001227747084101, + "grad_norm": 0.28436774015426636, + "learning_rate": 8.206062086845532e-05, + "loss": 1.8527, + "step": 9778 + }, + { + "epoch": 3.001534683855126, + "grad_norm": 0.33642131090164185, + "learning_rate": 
8.205680648417106e-05, + "loss": 1.8142, + "step": 9779 + }, + { + "epoch": 3.001841620626151, + "grad_norm": 0.4283481240272522, + "learning_rate": 8.205299178308263e-05, + "loss": 1.9006, + "step": 9780 + }, + { + "epoch": 3.002148557397176, + "grad_norm": 0.34405630826950073, + "learning_rate": 8.204917676522777e-05, + "loss": 1.7988, + "step": 9781 + }, + { + "epoch": 3.0024554941682013, + "grad_norm": 0.3161070942878723, + "learning_rate": 8.204536143064414e-05, + "loss": 1.8271, + "step": 9782 + }, + { + "epoch": 3.0027624309392267, + "grad_norm": 0.42518749833106995, + "learning_rate": 8.204154577936946e-05, + "loss": 1.864, + "step": 9783 + }, + { + "epoch": 3.0030693677102516, + "grad_norm": 0.3760852813720703, + "learning_rate": 8.203772981144146e-05, + "loss": 1.8543, + "step": 9784 + }, + { + "epoch": 3.003376304481277, + "grad_norm": 0.32794755697250366, + "learning_rate": 8.203391352689784e-05, + "loss": 1.8776, + "step": 9785 + }, + { + "epoch": 3.0036832412523022, + "grad_norm": 0.3053889274597168, + "learning_rate": 8.20300969257763e-05, + "loss": 1.8064, + "step": 9786 + }, + { + "epoch": 3.003990178023327, + "grad_norm": 0.40283143520355225, + "learning_rate": 8.202628000811456e-05, + "loss": 1.8083, + "step": 9787 + }, + { + "epoch": 3.0042971147943525, + "grad_norm": 0.49270665645599365, + "learning_rate": 8.202246277395038e-05, + "loss": 1.802, + "step": 9788 + }, + { + "epoch": 3.0046040515653774, + "grad_norm": 0.4373023211956024, + "learning_rate": 8.201864522332143e-05, + "loss": 1.8429, + "step": 9789 + }, + { + "epoch": 3.0049109883364027, + "grad_norm": 0.3136310875415802, + "learning_rate": 8.201482735626547e-05, + "loss": 1.8224, + "step": 9790 + }, + { + "epoch": 3.005217925107428, + "grad_norm": 0.3306807279586792, + "learning_rate": 8.201100917282023e-05, + "loss": 1.8463, + "step": 9791 + }, + { + "epoch": 3.005524861878453, + "grad_norm": 0.45082196593284607, + "learning_rate": 8.200719067302342e-05, + "loss": 1.7587, + "step": 
9792 + }, + { + "epoch": 3.0058317986494782, + "grad_norm": 0.49246448278427124, + "learning_rate": 8.20033718569128e-05, + "loss": 1.8245, + "step": 9793 + }, + { + "epoch": 3.0061387354205036, + "grad_norm": 0.3040246367454529, + "learning_rate": 8.199955272452609e-05, + "loss": 1.8309, + "step": 9794 + }, + { + "epoch": 3.0064456721915285, + "grad_norm": 0.3909318149089813, + "learning_rate": 8.199573327590105e-05, + "loss": 1.8187, + "step": 9795 + }, + { + "epoch": 3.006752608962554, + "grad_norm": 0.5753183960914612, + "learning_rate": 8.199191351107543e-05, + "loss": 1.826, + "step": 9796 + }, + { + "epoch": 3.0070595457335787, + "grad_norm": 0.48908689618110657, + "learning_rate": 8.198809343008695e-05, + "loss": 1.8475, + "step": 9797 + }, + { + "epoch": 3.007366482504604, + "grad_norm": 0.31570208072662354, + "learning_rate": 8.198427303297341e-05, + "loss": 1.8046, + "step": 9798 + }, + { + "epoch": 3.0076734192756294, + "grad_norm": 0.39205440878868103, + "learning_rate": 8.198045231977251e-05, + "loss": 1.8413, + "step": 9799 + }, + { + "epoch": 3.0079803560466543, + "grad_norm": 0.5117597579956055, + "learning_rate": 8.197663129052204e-05, + "loss": 1.8184, + "step": 9800 + }, + { + "epoch": 3.0082872928176796, + "grad_norm": 0.3623514175415039, + "learning_rate": 8.197280994525978e-05, + "loss": 1.8292, + "step": 9801 + }, + { + "epoch": 3.008594229588705, + "grad_norm": 0.2826726734638214, + "learning_rate": 8.196898828402344e-05, + "loss": 1.8216, + "step": 9802 + }, + { + "epoch": 3.00890116635973, + "grad_norm": 0.38658398389816284, + "learning_rate": 8.196516630685085e-05, + "loss": 1.867, + "step": 9803 + }, + { + "epoch": 3.009208103130755, + "grad_norm": 0.3371698260307312, + "learning_rate": 8.196134401377973e-05, + "loss": 1.8077, + "step": 9804 + }, + { + "epoch": 3.00951503990178, + "grad_norm": 0.24108785390853882, + "learning_rate": 8.195752140484789e-05, + "loss": 1.7858, + "step": 9805 + }, + { + "epoch": 3.0098219766728054, + 
"grad_norm": 0.34410104155540466, + "learning_rate": 8.195369848009309e-05, + "loss": 1.801, + "step": 9806 + }, + { + "epoch": 3.0101289134438307, + "grad_norm": 0.3412116467952728, + "learning_rate": 8.194987523955311e-05, + "loss": 1.7905, + "step": 9807 + }, + { + "epoch": 3.0104358502148556, + "grad_norm": 0.2473030537366867, + "learning_rate": 8.194605168326573e-05, + "loss": 1.7765, + "step": 9808 + }, + { + "epoch": 3.010742786985881, + "grad_norm": 0.28590065240859985, + "learning_rate": 8.194222781126875e-05, + "loss": 1.7897, + "step": 9809 + }, + { + "epoch": 3.0110497237569063, + "grad_norm": 0.2994272708892822, + "learning_rate": 8.193840362359994e-05, + "loss": 1.7976, + "step": 9810 + }, + { + "epoch": 3.011356660527931, + "grad_norm": 0.2971307635307312, + "learning_rate": 8.193457912029713e-05, + "loss": 1.829, + "step": 9811 + }, + { + "epoch": 3.0116635972989565, + "grad_norm": 0.25149810314178467, + "learning_rate": 8.193075430139809e-05, + "loss": 1.7709, + "step": 9812 + }, + { + "epoch": 3.0119705340699814, + "grad_norm": 0.2561332583427429, + "learning_rate": 8.19269291669406e-05, + "loss": 1.7689, + "step": 9813 + }, + { + "epoch": 3.0122774708410067, + "grad_norm": 0.2658882141113281, + "learning_rate": 8.192310371696249e-05, + "loss": 1.8497, + "step": 9814 + }, + { + "epoch": 3.012584407612032, + "grad_norm": 0.2873780429363251, + "learning_rate": 8.191927795150156e-05, + "loss": 1.8217, + "step": 9815 + }, + { + "epoch": 3.012891344383057, + "grad_norm": 0.2181183248758316, + "learning_rate": 8.191545187059562e-05, + "loss": 1.7261, + "step": 9816 + }, + { + "epoch": 3.0131982811540823, + "grad_norm": 0.2414858490228653, + "learning_rate": 8.191162547428248e-05, + "loss": 1.8035, + "step": 9817 + }, + { + "epoch": 3.0135052179251076, + "grad_norm": 0.2799840271472931, + "learning_rate": 8.190779876259995e-05, + "loss": 1.8279, + "step": 9818 + }, + { + "epoch": 3.0138121546961325, + "grad_norm": 0.2669760584831238, + "learning_rate": 
8.190397173558584e-05, + "loss": 1.8155, + "step": 9819 + }, + { + "epoch": 3.014119091467158, + "grad_norm": 0.28857991099357605, + "learning_rate": 8.1900144393278e-05, + "loss": 1.8479, + "step": 9820 + }, + { + "epoch": 3.0144260282381827, + "grad_norm": 0.30534693598747253, + "learning_rate": 8.189631673571422e-05, + "loss": 1.8609, + "step": 9821 + }, + { + "epoch": 3.014732965009208, + "grad_norm": 0.3238218128681183, + "learning_rate": 8.189248876293236e-05, + "loss": 1.9292, + "step": 9822 + }, + { + "epoch": 3.0150399017802334, + "grad_norm": 0.3000536561012268, + "learning_rate": 8.188866047497022e-05, + "loss": 1.8214, + "step": 9823 + }, + { + "epoch": 3.0153468385512583, + "grad_norm": 0.2960065007209778, + "learning_rate": 8.188483187186565e-05, + "loss": 1.8316, + "step": 9824 + }, + { + "epoch": 3.0156537753222836, + "grad_norm": 0.28609779477119446, + "learning_rate": 8.188100295365648e-05, + "loss": 1.8002, + "step": 9825 + }, + { + "epoch": 3.015960712093309, + "grad_norm": 0.31390634179115295, + "learning_rate": 8.187717372038057e-05, + "loss": 1.8134, + "step": 9826 + }, + { + "epoch": 3.016267648864334, + "grad_norm": 0.28550946712493896, + "learning_rate": 8.187334417207573e-05, + "loss": 1.8359, + "step": 9827 + }, + { + "epoch": 3.016574585635359, + "grad_norm": 0.3085210621356964, + "learning_rate": 8.186951430877982e-05, + "loss": 1.813, + "step": 9828 + }, + { + "epoch": 3.016881522406384, + "grad_norm": 0.3043847978115082, + "learning_rate": 8.18656841305307e-05, + "loss": 1.8222, + "step": 9829 + }, + { + "epoch": 3.0171884591774094, + "grad_norm": 0.32524731755256653, + "learning_rate": 8.18618536373662e-05, + "loss": 1.8258, + "step": 9830 + }, + { + "epoch": 3.0174953959484347, + "grad_norm": 0.2690991461277008, + "learning_rate": 8.18580228293242e-05, + "loss": 1.8492, + "step": 9831 + }, + { + "epoch": 3.0178023327194596, + "grad_norm": 0.34936225414276123, + "learning_rate": 8.185419170644253e-05, + "loss": 1.8363, + "step": 
9832 + }, + { + "epoch": 3.018109269490485, + "grad_norm": 0.3274296820163727, + "learning_rate": 8.185036026875908e-05, + "loss": 1.7789, + "step": 9833 + }, + { + "epoch": 3.0184162062615103, + "grad_norm": 0.2729836106300354, + "learning_rate": 8.184652851631169e-05, + "loss": 1.8264, + "step": 9834 + }, + { + "epoch": 3.018723143032535, + "grad_norm": 0.28682780265808105, + "learning_rate": 8.184269644913826e-05, + "loss": 1.8399, + "step": 9835 + }, + { + "epoch": 3.0190300798035605, + "grad_norm": 0.3224826455116272, + "learning_rate": 8.183886406727662e-05, + "loss": 1.8338, + "step": 9836 + }, + { + "epoch": 3.0193370165745854, + "grad_norm": 0.30945318937301636, + "learning_rate": 8.183503137076467e-05, + "loss": 1.8248, + "step": 9837 + }, + { + "epoch": 3.0196439533456108, + "grad_norm": 0.27580398321151733, + "learning_rate": 8.183119835964029e-05, + "loss": 1.8096, + "step": 9838 + }, + { + "epoch": 3.019950890116636, + "grad_norm": 0.28927183151245117, + "learning_rate": 8.182736503394132e-05, + "loss": 1.825, + "step": 9839 + }, + { + "epoch": 3.020257826887661, + "grad_norm": 0.253000408411026, + "learning_rate": 8.182353139370571e-05, + "loss": 1.7678, + "step": 9840 + }, + { + "epoch": 3.0205647636586863, + "grad_norm": 0.2882022559642792, + "learning_rate": 8.18196974389713e-05, + "loss": 1.8895, + "step": 9841 + }, + { + "epoch": 3.0208717004297116, + "grad_norm": 0.26864609122276306, + "learning_rate": 8.1815863169776e-05, + "loss": 1.7674, + "step": 9842 + }, + { + "epoch": 3.0211786372007365, + "grad_norm": 0.27344849705696106, + "learning_rate": 8.181202858615769e-05, + "loss": 1.8146, + "step": 9843 + }, + { + "epoch": 3.021485573971762, + "grad_norm": 0.31659772992134094, + "learning_rate": 8.180819368815425e-05, + "loss": 1.8485, + "step": 9844 + }, + { + "epoch": 3.021792510742787, + "grad_norm": 0.3163176476955414, + "learning_rate": 8.18043584758036e-05, + "loss": 1.8994, + "step": 9845 + }, + { + "epoch": 3.022099447513812, + 
"grad_norm": 0.2583829462528229, + "learning_rate": 8.180052294914365e-05, + "loss": 1.764, + "step": 9846 + }, + { + "epoch": 3.0224063842848374, + "grad_norm": 0.3006649315357208, + "learning_rate": 8.179668710821227e-05, + "loss": 1.9232, + "step": 9847 + }, + { + "epoch": 3.0227133210558623, + "grad_norm": 0.35702988505363464, + "learning_rate": 8.179285095304741e-05, + "loss": 1.8403, + "step": 9848 + }, + { + "epoch": 3.0230202578268877, + "grad_norm": 0.29699379205703735, + "learning_rate": 8.178901448368697e-05, + "loss": 1.8412, + "step": 9849 + }, + { + "epoch": 3.023327194597913, + "grad_norm": 0.3022700548171997, + "learning_rate": 8.178517770016885e-05, + "loss": 1.8197, + "step": 9850 + }, + { + "epoch": 3.023634131368938, + "grad_norm": 0.2943836748600006, + "learning_rate": 8.178134060253097e-05, + "loss": 1.8127, + "step": 9851 + }, + { + "epoch": 3.023941068139963, + "grad_norm": 0.31290489435195923, + "learning_rate": 8.177750319081126e-05, + "loss": 1.821, + "step": 9852 + }, + { + "epoch": 3.0242480049109886, + "grad_norm": 0.30308374762535095, + "learning_rate": 8.177366546504763e-05, + "loss": 1.8522, + "step": 9853 + }, + { + "epoch": 3.0245549416820134, + "grad_norm": 0.301559716463089, + "learning_rate": 8.176982742527802e-05, + "loss": 1.8758, + "step": 9854 + }, + { + "epoch": 3.0248618784530388, + "grad_norm": 0.33314836025238037, + "learning_rate": 8.176598907154034e-05, + "loss": 1.8178, + "step": 9855 + }, + { + "epoch": 3.0251688152240637, + "grad_norm": 0.3567935526371002, + "learning_rate": 8.176215040387255e-05, + "loss": 1.7847, + "step": 9856 + }, + { + "epoch": 3.025475751995089, + "grad_norm": 0.27716195583343506, + "learning_rate": 8.175831142231258e-05, + "loss": 1.772, + "step": 9857 + }, + { + "epoch": 3.0257826887661143, + "grad_norm": 0.24568212032318115, + "learning_rate": 8.175447212689836e-05, + "loss": 1.8171, + "step": 9858 + }, + { + "epoch": 3.0260896255371392, + "grad_norm": 0.25368261337280273, + 
"learning_rate": 8.175063251766784e-05, + "loss": 1.852, + "step": 9859 + }, + { + "epoch": 3.0263965623081646, + "grad_norm": 0.2509497404098511, + "learning_rate": 8.174679259465894e-05, + "loss": 1.7737, + "step": 9860 + }, + { + "epoch": 3.02670349907919, + "grad_norm": 0.3539343774318695, + "learning_rate": 8.174295235790963e-05, + "loss": 1.8663, + "step": 9861 + }, + { + "epoch": 3.027010435850215, + "grad_norm": 0.36450034379959106, + "learning_rate": 8.173911180745788e-05, + "loss": 1.8179, + "step": 9862 + }, + { + "epoch": 3.02731737262124, + "grad_norm": 0.3550017178058624, + "learning_rate": 8.173527094334162e-05, + "loss": 1.8256, + "step": 9863 + }, + { + "epoch": 3.027624309392265, + "grad_norm": 0.33518701791763306, + "learning_rate": 8.17314297655988e-05, + "loss": 1.7842, + "step": 9864 + }, + { + "epoch": 3.0279312461632903, + "grad_norm": 0.2522886097431183, + "learning_rate": 8.172758827426739e-05, + "loss": 1.7688, + "step": 9865 + }, + { + "epoch": 3.0282381829343157, + "grad_norm": 0.26222914457321167, + "learning_rate": 8.172374646938536e-05, + "loss": 1.8517, + "step": 9866 + }, + { + "epoch": 3.0285451197053406, + "grad_norm": 0.3355788588523865, + "learning_rate": 8.171990435099068e-05, + "loss": 1.9002, + "step": 9867 + }, + { + "epoch": 3.028852056476366, + "grad_norm": 0.32907500863075256, + "learning_rate": 8.171606191912131e-05, + "loss": 1.7801, + "step": 9868 + }, + { + "epoch": 3.0291589932473912, + "grad_norm": 0.29234179854393005, + "learning_rate": 8.171221917381523e-05, + "loss": 1.8055, + "step": 9869 + }, + { + "epoch": 3.029465930018416, + "grad_norm": 0.26374876499176025, + "learning_rate": 8.170837611511041e-05, + "loss": 1.781, + "step": 9870 + }, + { + "epoch": 3.0297728667894415, + "grad_norm": 0.311282217502594, + "learning_rate": 8.170453274304483e-05, + "loss": 1.839, + "step": 9871 + }, + { + "epoch": 3.0300798035604664, + "grad_norm": 0.24225831031799316, + "learning_rate": 8.170068905765648e-05, + "loss": 
1.804, + "step": 9872 + }, + { + "epoch": 3.0303867403314917, + "grad_norm": 0.29383334517478943, + "learning_rate": 8.169684505898335e-05, + "loss": 1.7817, + "step": 9873 + }, + { + "epoch": 3.030693677102517, + "grad_norm": 0.2607928514480591, + "learning_rate": 8.169300074706339e-05, + "loss": 1.8379, + "step": 9874 + }, + { + "epoch": 3.031000613873542, + "grad_norm": 0.283028244972229, + "learning_rate": 8.168915612193464e-05, + "loss": 1.7797, + "step": 9875 + }, + { + "epoch": 3.0313075506445673, + "grad_norm": 0.27675309777259827, + "learning_rate": 8.168531118363508e-05, + "loss": 1.8355, + "step": 9876 + }, + { + "epoch": 3.0316144874155926, + "grad_norm": 0.2598227262496948, + "learning_rate": 8.16814659322027e-05, + "loss": 1.7898, + "step": 9877 + }, + { + "epoch": 3.0319214241866175, + "grad_norm": 0.24715003371238708, + "learning_rate": 8.16776203676755e-05, + "loss": 1.7791, + "step": 9878 + }, + { + "epoch": 3.032228360957643, + "grad_norm": 0.2749374210834503, + "learning_rate": 8.167377449009149e-05, + "loss": 1.8303, + "step": 9879 + }, + { + "epoch": 3.0325352977286677, + "grad_norm": 0.26150834560394287, + "learning_rate": 8.166992829948868e-05, + "loss": 1.8462, + "step": 9880 + }, + { + "epoch": 3.032842234499693, + "grad_norm": 0.3044755160808563, + "learning_rate": 8.166608179590506e-05, + "loss": 1.806, + "step": 9881 + }, + { + "epoch": 3.0331491712707184, + "grad_norm": 0.2949555516242981, + "learning_rate": 8.166223497937868e-05, + "loss": 1.8785, + "step": 9882 + }, + { + "epoch": 3.0334561080417433, + "grad_norm": 0.33206698298454285, + "learning_rate": 8.165838784994752e-05, + "loss": 1.8476, + "step": 9883 + }, + { + "epoch": 3.0337630448127686, + "grad_norm": 0.2720400094985962, + "learning_rate": 8.165454040764962e-05, + "loss": 1.843, + "step": 9884 + }, + { + "epoch": 3.034069981583794, + "grad_norm": 0.29340869188308716, + "learning_rate": 8.1650692652523e-05, + "loss": 1.7761, + "step": 9885 + }, + { + "epoch": 
3.034376918354819, + "grad_norm": 0.35155293345451355, + "learning_rate": 8.16468445846057e-05, + "loss": 1.8887, + "step": 9886 + }, + { + "epoch": 3.034683855125844, + "grad_norm": 0.2688990831375122, + "learning_rate": 8.164299620393571e-05, + "loss": 1.8001, + "step": 9887 + }, + { + "epoch": 3.034990791896869, + "grad_norm": 0.2921253442764282, + "learning_rate": 8.16391475105511e-05, + "loss": 1.7951, + "step": 9888 + }, + { + "epoch": 3.0352977286678944, + "grad_norm": 0.28100699186325073, + "learning_rate": 8.163529850448988e-05, + "loss": 1.8041, + "step": 9889 + }, + { + "epoch": 3.0356046654389197, + "grad_norm": 0.3155081868171692, + "learning_rate": 8.16314491857901e-05, + "loss": 1.8026, + "step": 9890 + }, + { + "epoch": 3.0359116022099446, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.16275995544898e-05, + "loss": 1.8502, + "step": 9891 + }, + { + "epoch": 3.03621853898097, + "grad_norm": 0.2732076644897461, + "learning_rate": 8.162374961062704e-05, + "loss": 1.8424, + "step": 9892 + }, + { + "epoch": 3.0365254757519953, + "grad_norm": 0.2943679690361023, + "learning_rate": 8.161989935423984e-05, + "loss": 1.7635, + "step": 9893 + }, + { + "epoch": 3.03683241252302, + "grad_norm": 0.28894683718681335, + "learning_rate": 8.161604878536626e-05, + "loss": 1.78, + "step": 9894 + }, + { + "epoch": 3.0371393492940455, + "grad_norm": 0.2718082666397095, + "learning_rate": 8.161219790404435e-05, + "loss": 1.7664, + "step": 9895 + }, + { + "epoch": 3.0374462860650704, + "grad_norm": 0.29092124104499817, + "learning_rate": 8.160834671031216e-05, + "loss": 1.8621, + "step": 9896 + }, + { + "epoch": 3.0377532228360957, + "grad_norm": 0.284665584564209, + "learning_rate": 8.160449520420779e-05, + "loss": 1.8607, + "step": 9897 + }, + { + "epoch": 3.038060159607121, + "grad_norm": 0.23676982522010803, + "learning_rate": 8.160064338576925e-05, + "loss": 1.7137, + "step": 9898 + }, + { + "epoch": 3.038367096378146, + "grad_norm": 0.2666932940483093, + 
"learning_rate": 8.159679125503466e-05, + "loss": 1.8038, + "step": 9899 + }, + { + "epoch": 3.0386740331491713, + "grad_norm": 0.36214375495910645, + "learning_rate": 8.159293881204204e-05, + "loss": 1.8902, + "step": 9900 + }, + { + "epoch": 3.0389809699201966, + "grad_norm": 0.30301332473754883, + "learning_rate": 8.158908605682948e-05, + "loss": 1.8456, + "step": 9901 + }, + { + "epoch": 3.0392879066912215, + "grad_norm": 0.32190418243408203, + "learning_rate": 8.158523298943506e-05, + "loss": 1.8246, + "step": 9902 + }, + { + "epoch": 3.039594843462247, + "grad_norm": 0.2938043475151062, + "learning_rate": 8.158137960989685e-05, + "loss": 1.8324, + "step": 9903 + }, + { + "epoch": 3.0399017802332717, + "grad_norm": 0.29493969678878784, + "learning_rate": 8.157752591825294e-05, + "loss": 1.8458, + "step": 9904 + }, + { + "epoch": 3.040208717004297, + "grad_norm": 0.2681889832019806, + "learning_rate": 8.157367191454141e-05, + "loss": 1.889, + "step": 9905 + }, + { + "epoch": 3.0405156537753224, + "grad_norm": 0.3111969232559204, + "learning_rate": 8.156981759880035e-05, + "loss": 1.8966, + "step": 9906 + }, + { + "epoch": 3.0408225905463473, + "grad_norm": 0.345262736082077, + "learning_rate": 8.156596297106784e-05, + "loss": 1.8174, + "step": 9907 + }, + { + "epoch": 3.0411295273173726, + "grad_norm": 0.30156534910202026, + "learning_rate": 8.156210803138199e-05, + "loss": 1.766, + "step": 9908 + }, + { + "epoch": 3.041436464088398, + "grad_norm": 0.28691565990448, + "learning_rate": 8.15582527797809e-05, + "loss": 1.8436, + "step": 9909 + }, + { + "epoch": 3.041743400859423, + "grad_norm": 0.33418282866477966, + "learning_rate": 8.155439721630264e-05, + "loss": 1.8939, + "step": 9910 + }, + { + "epoch": 3.042050337630448, + "grad_norm": 0.25496938824653625, + "learning_rate": 8.155054134098535e-05, + "loss": 1.8368, + "step": 9911 + }, + { + "epoch": 3.042357274401473, + "grad_norm": 0.3806788921356201, + "learning_rate": 8.154668515386711e-05, + "loss": 
1.8635, + "step": 9912 + }, + { + "epoch": 3.0426642111724984, + "grad_norm": 0.42668119072914124, + "learning_rate": 8.154282865498603e-05, + "loss": 1.76, + "step": 9913 + }, + { + "epoch": 3.0429711479435237, + "grad_norm": 0.35945314168930054, + "learning_rate": 8.153897184438024e-05, + "loss": 1.8275, + "step": 9914 + }, + { + "epoch": 3.0432780847145486, + "grad_norm": 0.3225449323654175, + "learning_rate": 8.153511472208784e-05, + "loss": 1.7901, + "step": 9915 + }, + { + "epoch": 3.043585021485574, + "grad_norm": 0.2905425727367401, + "learning_rate": 8.153125728814694e-05, + "loss": 1.8021, + "step": 9916 + }, + { + "epoch": 3.0438919582565993, + "grad_norm": 0.3315529525279999, + "learning_rate": 8.15273995425957e-05, + "loss": 1.8003, + "step": 9917 + }, + { + "epoch": 3.044198895027624, + "grad_norm": 0.30256444215774536, + "learning_rate": 8.152354148547221e-05, + "loss": 1.8243, + "step": 9918 + }, + { + "epoch": 3.0445058317986495, + "grad_norm": 0.2563035190105438, + "learning_rate": 8.15196831168146e-05, + "loss": 1.7877, + "step": 9919 + }, + { + "epoch": 3.044812768569675, + "grad_norm": 0.25705814361572266, + "learning_rate": 8.151582443666101e-05, + "loss": 1.813, + "step": 9920 + }, + { + "epoch": 3.0451197053406998, + "grad_norm": 0.3649071455001831, + "learning_rate": 8.151196544504957e-05, + "loss": 1.8114, + "step": 9921 + }, + { + "epoch": 3.045426642111725, + "grad_norm": 0.4076193571090698, + "learning_rate": 8.150810614201841e-05, + "loss": 1.7869, + "step": 9922 + }, + { + "epoch": 3.04573357888275, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.150424652760569e-05, + "loss": 1.7878, + "step": 9923 + }, + { + "epoch": 3.0460405156537753, + "grad_norm": 0.2243243157863617, + "learning_rate": 8.150038660184955e-05, + "loss": 1.8224, + "step": 9924 + }, + { + "epoch": 3.0463474524248007, + "grad_norm": 0.3295031487941742, + "learning_rate": 8.149652636478811e-05, + "loss": 1.8685, + "step": 9925 + }, + { + "epoch": 
3.0466543891958255, + "grad_norm": 0.2973531186580658, + "learning_rate": 8.149266581645954e-05, + "loss": 1.8082, + "step": 9926 + }, + { + "epoch": 3.046961325966851, + "grad_norm": 0.25648918747901917, + "learning_rate": 8.148880495690199e-05, + "loss": 1.8089, + "step": 9927 + }, + { + "epoch": 3.047268262737876, + "grad_norm": 0.2845752537250519, + "learning_rate": 8.148494378615361e-05, + "loss": 1.8726, + "step": 9928 + }, + { + "epoch": 3.047575199508901, + "grad_norm": 0.2917105555534363, + "learning_rate": 8.148108230425255e-05, + "loss": 1.8035, + "step": 9929 + }, + { + "epoch": 3.0478821362799264, + "grad_norm": 0.2775834798812866, + "learning_rate": 8.1477220511237e-05, + "loss": 1.8545, + "step": 9930 + }, + { + "epoch": 3.0481890730509513, + "grad_norm": 0.3522767424583435, + "learning_rate": 8.14733584071451e-05, + "loss": 1.8261, + "step": 9931 + }, + { + "epoch": 3.0484960098219767, + "grad_norm": 0.3759000599384308, + "learning_rate": 8.146949599201503e-05, + "loss": 1.8405, + "step": 9932 + }, + { + "epoch": 3.048802946593002, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.146563326588496e-05, + "loss": 1.7762, + "step": 9933 + }, + { + "epoch": 3.049109883364027, + "grad_norm": 0.263810932636261, + "learning_rate": 8.146177022879304e-05, + "loss": 1.7546, + "step": 9934 + }, + { + "epoch": 3.049416820135052, + "grad_norm": 0.24064256250858307, + "learning_rate": 8.14579068807775e-05, + "loss": 1.7903, + "step": 9935 + }, + { + "epoch": 3.0497237569060776, + "grad_norm": 0.3144194781780243, + "learning_rate": 8.145404322187645e-05, + "loss": 1.8011, + "step": 9936 + }, + { + "epoch": 3.0500306936771024, + "grad_norm": 0.3362879455089569, + "learning_rate": 8.145017925212812e-05, + "loss": 1.8224, + "step": 9937 + }, + { + "epoch": 3.050337630448128, + "grad_norm": 0.33979395031929016, + "learning_rate": 8.144631497157071e-05, + "loss": 1.8415, + "step": 9938 + }, + { + "epoch": 3.0506445672191527, + "grad_norm": 0.33391237258911133, + 
"learning_rate": 8.144245038024235e-05, + "loss": 1.7983, + "step": 9939 + }, + { + "epoch": 3.050951503990178, + "grad_norm": 0.34034964442253113, + "learning_rate": 8.143858547818128e-05, + "loss": 1.8635, + "step": 9940 + }, + { + "epoch": 3.0512584407612033, + "grad_norm": 0.3472529947757721, + "learning_rate": 8.143472026542569e-05, + "loss": 1.8067, + "step": 9941 + }, + { + "epoch": 3.0515653775322282, + "grad_norm": 0.3369109630584717, + "learning_rate": 8.143085474201376e-05, + "loss": 1.7933, + "step": 9942 + }, + { + "epoch": 3.0518723143032536, + "grad_norm": 0.3055182993412018, + "learning_rate": 8.14269889079837e-05, + "loss": 1.7358, + "step": 9943 + }, + { + "epoch": 3.052179251074279, + "grad_norm": 0.26729708909988403, + "learning_rate": 8.142312276337372e-05, + "loss": 1.8315, + "step": 9944 + }, + { + "epoch": 3.052486187845304, + "grad_norm": 0.3626720607280731, + "learning_rate": 8.141925630822203e-05, + "loss": 1.7593, + "step": 9945 + }, + { + "epoch": 3.052793124616329, + "grad_norm": 0.3673512637615204, + "learning_rate": 8.141538954256683e-05, + "loss": 1.8414, + "step": 9946 + }, + { + "epoch": 3.053100061387354, + "grad_norm": 0.30554768443107605, + "learning_rate": 8.141152246644632e-05, + "loss": 1.7504, + "step": 9947 + }, + { + "epoch": 3.0534069981583793, + "grad_norm": 0.41163405776023865, + "learning_rate": 8.140765507989875e-05, + "loss": 1.8794, + "step": 9948 + }, + { + "epoch": 3.0537139349294047, + "grad_norm": 0.592751145362854, + "learning_rate": 8.140378738296233e-05, + "loss": 1.8538, + "step": 9949 + }, + { + "epoch": 3.0540208717004296, + "grad_norm": 0.483828604221344, + "learning_rate": 8.139991937567527e-05, + "loss": 1.7952, + "step": 9950 + }, + { + "epoch": 3.054327808471455, + "grad_norm": 0.26665306091308594, + "learning_rate": 8.13960510580758e-05, + "loss": 1.8268, + "step": 9951 + }, + { + "epoch": 3.0546347452424802, + "grad_norm": 0.42917072772979736, + "learning_rate": 8.139218243020215e-05, + "loss": 
1.843, + "step": 9952 + }, + { + "epoch": 3.054941682013505, + "grad_norm": 0.47911396622657776, + "learning_rate": 8.138831349209256e-05, + "loss": 1.8223, + "step": 9953 + }, + { + "epoch": 3.0552486187845305, + "grad_norm": 0.4540431797504425, + "learning_rate": 8.138444424378524e-05, + "loss": 1.9198, + "step": 9954 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.29719051718711853, + "learning_rate": 8.138057468531845e-05, + "loss": 1.7873, + "step": 9955 + }, + { + "epoch": 3.0558624923265807, + "grad_norm": 0.35133618116378784, + "learning_rate": 8.137670481673045e-05, + "loss": 1.8459, + "step": 9956 + }, + { + "epoch": 3.056169429097606, + "grad_norm": 0.42896488308906555, + "learning_rate": 8.137283463805945e-05, + "loss": 1.7814, + "step": 9957 + }, + { + "epoch": 3.056476365868631, + "grad_norm": 0.38993972539901733, + "learning_rate": 8.136896414934372e-05, + "loss": 1.7636, + "step": 9958 + }, + { + "epoch": 3.0567833026396563, + "grad_norm": 0.31362372636795044, + "learning_rate": 8.13650933506215e-05, + "loss": 1.8021, + "step": 9959 + }, + { + "epoch": 3.0570902394106816, + "grad_norm": 0.27980196475982666, + "learning_rate": 8.136122224193103e-05, + "loss": 1.8445, + "step": 9960 + }, + { + "epoch": 3.0573971761817065, + "grad_norm": 0.2721461057662964, + "learning_rate": 8.135735082331059e-05, + "loss": 1.7614, + "step": 9961 + }, + { + "epoch": 3.057704112952732, + "grad_norm": 0.25157424807548523, + "learning_rate": 8.135347909479843e-05, + "loss": 1.7598, + "step": 9962 + }, + { + "epoch": 3.0580110497237567, + "grad_norm": 0.25798025727272034, + "learning_rate": 8.13496070564328e-05, + "loss": 1.7823, + "step": 9963 + }, + { + "epoch": 3.058317986494782, + "grad_norm": 0.30775198340415955, + "learning_rate": 8.134573470825199e-05, + "loss": 1.7755, + "step": 9964 + }, + { + "epoch": 3.0586249232658074, + "grad_norm": 0.28916797041893005, + "learning_rate": 8.134186205029426e-05, + "loss": 1.8189, + "step": 9965 + }, + { + "epoch": 
3.0589318600368323, + "grad_norm": 0.2829149067401886, + "learning_rate": 8.133798908259787e-05, + "loss": 1.8546, + "step": 9966 + }, + { + "epoch": 3.0592387968078576, + "grad_norm": 0.2884117662906647, + "learning_rate": 8.13341158052011e-05, + "loss": 1.7705, + "step": 9967 + }, + { + "epoch": 3.059545733578883, + "grad_norm": 0.28311973810195923, + "learning_rate": 8.133024221814225e-05, + "loss": 1.8147, + "step": 9968 + }, + { + "epoch": 3.059852670349908, + "grad_norm": 0.25405213236808777, + "learning_rate": 8.132636832145957e-05, + "loss": 1.7813, + "step": 9969 + }, + { + "epoch": 3.060159607120933, + "grad_norm": 0.3082229793071747, + "learning_rate": 8.132249411519137e-05, + "loss": 1.8536, + "step": 9970 + }, + { + "epoch": 3.060466543891958, + "grad_norm": 0.29918181896209717, + "learning_rate": 8.13186195993759e-05, + "loss": 1.8181, + "step": 9971 + }, + { + "epoch": 3.0607734806629834, + "grad_norm": 0.3025238811969757, + "learning_rate": 8.13147447740515e-05, + "loss": 1.7785, + "step": 9972 + }, + { + "epoch": 3.0610804174340087, + "grad_norm": 0.2798222303390503, + "learning_rate": 8.131086963925643e-05, + "loss": 1.7873, + "step": 9973 + }, + { + "epoch": 3.0613873542050336, + "grad_norm": 0.32636210322380066, + "learning_rate": 8.130699419502898e-05, + "loss": 1.882, + "step": 9974 + }, + { + "epoch": 3.061694290976059, + "grad_norm": 0.27722054719924927, + "learning_rate": 8.130311844140748e-05, + "loss": 1.7788, + "step": 9975 + }, + { + "epoch": 3.0620012277470843, + "grad_norm": 0.289156436920166, + "learning_rate": 8.129924237843023e-05, + "loss": 1.8591, + "step": 9976 + }, + { + "epoch": 3.062308164518109, + "grad_norm": 0.2839665412902832, + "learning_rate": 8.12953660061355e-05, + "loss": 1.8255, + "step": 9977 + }, + { + "epoch": 3.0626151012891345, + "grad_norm": 0.2650148272514343, + "learning_rate": 8.129148932456161e-05, + "loss": 1.8353, + "step": 9978 + }, + { + "epoch": 3.06292203806016, + "grad_norm": 0.2884560227394104, + 
"learning_rate": 8.128761233374691e-05, + "loss": 1.8099, + "step": 9979 + }, + { + "epoch": 3.0632289748311847, + "grad_norm": 0.2610029876232147, + "learning_rate": 8.128373503372967e-05, + "loss": 1.8173, + "step": 9980 + }, + { + "epoch": 3.06353591160221, + "grad_norm": 0.32512393593788147, + "learning_rate": 8.127985742454822e-05, + "loss": 1.8619, + "step": 9981 + }, + { + "epoch": 3.063842848373235, + "grad_norm": 0.3382968604564667, + "learning_rate": 8.127597950624091e-05, + "loss": 1.831, + "step": 9982 + }, + { + "epoch": 3.0641497851442603, + "grad_norm": 0.33773133158683777, + "learning_rate": 8.127210127884602e-05, + "loss": 1.8194, + "step": 9983 + }, + { + "epoch": 3.0644567219152856, + "grad_norm": 0.31642746925354004, + "learning_rate": 8.126822274240188e-05, + "loss": 1.8782, + "step": 9984 + }, + { + "epoch": 3.0647636586863105, + "grad_norm": 0.2476506233215332, + "learning_rate": 8.126434389694686e-05, + "loss": 1.7866, + "step": 9985 + }, + { + "epoch": 3.065070595457336, + "grad_norm": 0.27296319603919983, + "learning_rate": 8.126046474251927e-05, + "loss": 1.8276, + "step": 9986 + }, + { + "epoch": 3.0653775322283607, + "grad_norm": 0.353865385055542, + "learning_rate": 8.125658527915744e-05, + "loss": 1.9525, + "step": 9987 + }, + { + "epoch": 3.065684468999386, + "grad_norm": 0.370256632566452, + "learning_rate": 8.12527055068997e-05, + "loss": 1.8514, + "step": 9988 + }, + { + "epoch": 3.0659914057704114, + "grad_norm": 0.30738842487335205, + "learning_rate": 8.124882542578442e-05, + "loss": 1.8125, + "step": 9989 + }, + { + "epoch": 3.0662983425414363, + "grad_norm": 0.3151233494281769, + "learning_rate": 8.124494503584995e-05, + "loss": 1.8165, + "step": 9990 + }, + { + "epoch": 3.0666052793124616, + "grad_norm": 0.29071590304374695, + "learning_rate": 8.124106433713458e-05, + "loss": 1.7617, + "step": 9991 + }, + { + "epoch": 3.066912216083487, + "grad_norm": 0.2898697853088379, + "learning_rate": 8.123718332967672e-05, + "loss": 
1.7779, + "step": 9992 + }, + { + "epoch": 3.067219152854512, + "grad_norm": 0.26601701974868774, + "learning_rate": 8.123330201351471e-05, + "loss": 1.8307, + "step": 9993 + }, + { + "epoch": 3.067526089625537, + "grad_norm": 0.2622119188308716, + "learning_rate": 8.12294203886869e-05, + "loss": 1.7958, + "step": 9994 + }, + { + "epoch": 3.0678330263965625, + "grad_norm": 0.29709386825561523, + "learning_rate": 8.122553845523166e-05, + "loss": 1.7799, + "step": 9995 + }, + { + "epoch": 3.0681399631675874, + "grad_norm": 0.31267789006233215, + "learning_rate": 8.122165621318733e-05, + "loss": 1.8149, + "step": 9996 + }, + { + "epoch": 3.0684468999386127, + "grad_norm": 0.3076523244380951, + "learning_rate": 8.121777366259232e-05, + "loss": 1.7701, + "step": 9997 + }, + { + "epoch": 3.0687538367096376, + "grad_norm": 0.30096009373664856, + "learning_rate": 8.121389080348496e-05, + "loss": 1.8323, + "step": 9998 + }, + { + "epoch": 3.069060773480663, + "grad_norm": 0.25739142298698425, + "learning_rate": 8.121000763590363e-05, + "loss": 1.8105, + "step": 9999 + }, + { + "epoch": 3.0693677102516883, + "grad_norm": 0.2780844271183014, + "learning_rate": 8.120612415988671e-05, + "loss": 1.8502, + "step": 10000 + }, + { + "epoch": 3.069674647022713, + "grad_norm": 0.3316378593444824, + "learning_rate": 8.120224037547259e-05, + "loss": 1.8244, + "step": 10001 + }, + { + "epoch": 3.0699815837937385, + "grad_norm": 0.261129766702652, + "learning_rate": 8.119835628269964e-05, + "loss": 1.7769, + "step": 10002 + }, + { + "epoch": 3.070288520564764, + "grad_norm": 0.29213985800743103, + "learning_rate": 8.119447188160625e-05, + "loss": 1.7717, + "step": 10003 + }, + { + "epoch": 3.0705954573357888, + "grad_norm": 0.38545623421669006, + "learning_rate": 8.11905871722308e-05, + "loss": 1.8433, + "step": 10004 + }, + { + "epoch": 3.070902394106814, + "grad_norm": 0.3617223799228668, + "learning_rate": 8.118670215461168e-05, + "loss": 1.8172, + "step": 10005 + }, + { + "epoch": 
3.071209330877839, + "grad_norm": 0.3241543769836426, + "learning_rate": 8.11828168287873e-05, + "loss": 1.8325, + "step": 10006 + }, + { + "epoch": 3.0715162676488643, + "grad_norm": 0.3538578152656555, + "learning_rate": 8.117893119479605e-05, + "loss": 1.8188, + "step": 10007 + }, + { + "epoch": 3.0718232044198897, + "grad_norm": 0.3861970603466034, + "learning_rate": 8.117504525267632e-05, + "loss": 1.8518, + "step": 10008 + }, + { + "epoch": 3.0721301411909145, + "grad_norm": 0.35433146357536316, + "learning_rate": 8.117115900246652e-05, + "loss": 1.8601, + "step": 10009 + }, + { + "epoch": 3.07243707796194, + "grad_norm": 0.29796987771987915, + "learning_rate": 8.116727244420507e-05, + "loss": 1.7934, + "step": 10010 + }, + { + "epoch": 3.072744014732965, + "grad_norm": 0.3091779947280884, + "learning_rate": 8.116338557793035e-05, + "loss": 1.8111, + "step": 10011 + }, + { + "epoch": 3.07305095150399, + "grad_norm": 0.2741319537162781, + "learning_rate": 8.11594984036808e-05, + "loss": 1.8079, + "step": 10012 + }, + { + "epoch": 3.0733578882750154, + "grad_norm": 0.28905320167541504, + "learning_rate": 8.115561092149482e-05, + "loss": 1.8475, + "step": 10013 + }, + { + "epoch": 3.0736648250460403, + "grad_norm": 0.2897081673145294, + "learning_rate": 8.115172313141081e-05, + "loss": 1.838, + "step": 10014 + }, + { + "epoch": 3.0739717618170657, + "grad_norm": 0.2620783746242523, + "learning_rate": 8.114783503346725e-05, + "loss": 1.8024, + "step": 10015 + }, + { + "epoch": 3.074278698588091, + "grad_norm": 0.26478636264801025, + "learning_rate": 8.11439466277025e-05, + "loss": 1.8137, + "step": 10016 + }, + { + "epoch": 3.074585635359116, + "grad_norm": 0.2796174883842468, + "learning_rate": 8.114005791415502e-05, + "loss": 1.7976, + "step": 10017 + }, + { + "epoch": 3.074892572130141, + "grad_norm": 0.26813286542892456, + "learning_rate": 8.113616889286325e-05, + "loss": 1.7945, + "step": 10018 + }, + { + "epoch": 3.0751995089011666, + "grad_norm": 
0.2443828582763672, + "learning_rate": 8.11322795638656e-05, + "loss": 1.7829, + "step": 10019 + }, + { + "epoch": 3.0755064456721914, + "grad_norm": 0.2981395423412323, + "learning_rate": 8.112838992720053e-05, + "loss": 1.7928, + "step": 10020 + }, + { + "epoch": 3.075813382443217, + "grad_norm": 0.25605037808418274, + "learning_rate": 8.112449998290644e-05, + "loss": 1.8129, + "step": 10021 + }, + { + "epoch": 3.0761203192142417, + "grad_norm": 0.31180307269096375, + "learning_rate": 8.112060973102181e-05, + "loss": 1.7393, + "step": 10022 + }, + { + "epoch": 3.076427255985267, + "grad_norm": 0.3230421543121338, + "learning_rate": 8.111671917158508e-05, + "loss": 1.818, + "step": 10023 + }, + { + "epoch": 3.0767341927562923, + "grad_norm": 0.3158549964427948, + "learning_rate": 8.111282830463468e-05, + "loss": 1.7582, + "step": 10024 + }, + { + "epoch": 3.0770411295273172, + "grad_norm": 0.24524325132369995, + "learning_rate": 8.110893713020908e-05, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 3.0773480662983426, + "grad_norm": 0.2793932259082794, + "learning_rate": 8.110504564834675e-05, + "loss": 1.8551, + "step": 10026 + }, + { + "epoch": 3.077655003069368, + "grad_norm": 0.29629403352737427, + "learning_rate": 8.110115385908612e-05, + "loss": 1.8019, + "step": 10027 + }, + { + "epoch": 3.077961939840393, + "grad_norm": 0.3138490915298462, + "learning_rate": 8.109726176246564e-05, + "loss": 1.8436, + "step": 10028 + }, + { + "epoch": 3.078268876611418, + "grad_norm": 0.29802024364471436, + "learning_rate": 8.10933693585238e-05, + "loss": 1.8158, + "step": 10029 + }, + { + "epoch": 3.078575813382443, + "grad_norm": 0.30785220861434937, + "learning_rate": 8.108947664729907e-05, + "loss": 1.8674, + "step": 10030 + }, + { + "epoch": 3.0788827501534684, + "grad_norm": 0.277662992477417, + "learning_rate": 8.10855836288299e-05, + "loss": 1.8253, + "step": 10031 + }, + { + "epoch": 3.0791896869244937, + "grad_norm": 0.27399590611457825, + "learning_rate": 
8.108169030315477e-05, + "loss": 1.8587, + "step": 10032 + }, + { + "epoch": 3.0794966236955186, + "grad_norm": 0.28398239612579346, + "learning_rate": 8.107779667031217e-05, + "loss": 1.8326, + "step": 10033 + }, + { + "epoch": 3.079803560466544, + "grad_norm": 0.2882741093635559, + "learning_rate": 8.107390273034057e-05, + "loss": 1.785, + "step": 10034 + }, + { + "epoch": 3.0801104972375692, + "grad_norm": 0.271043598651886, + "learning_rate": 8.107000848327843e-05, + "loss": 1.765, + "step": 10035 + }, + { + "epoch": 3.080417434008594, + "grad_norm": 0.2589638829231262, + "learning_rate": 8.106611392916427e-05, + "loss": 1.8136, + "step": 10036 + }, + { + "epoch": 3.0807243707796195, + "grad_norm": 0.3068227469921112, + "learning_rate": 8.106221906803656e-05, + "loss": 1.8034, + "step": 10037 + }, + { + "epoch": 3.0810313075506444, + "grad_norm": 0.2714168131351471, + "learning_rate": 8.105832389993379e-05, + "loss": 1.8007, + "step": 10038 + }, + { + "epoch": 3.0813382443216697, + "grad_norm": 0.2747504711151123, + "learning_rate": 8.105442842489447e-05, + "loss": 1.8135, + "step": 10039 + }, + { + "epoch": 3.081645181092695, + "grad_norm": 0.2719285488128662, + "learning_rate": 8.105053264295708e-05, + "loss": 1.7629, + "step": 10040 + }, + { + "epoch": 3.08195211786372, + "grad_norm": 0.3119582235813141, + "learning_rate": 8.104663655416014e-05, + "loss": 1.7887, + "step": 10041 + }, + { + "epoch": 3.0822590546347453, + "grad_norm": 0.35965192317962646, + "learning_rate": 8.104274015854212e-05, + "loss": 1.8484, + "step": 10042 + }, + { + "epoch": 3.0825659914057706, + "grad_norm": 0.3045980632305145, + "learning_rate": 8.103884345614157e-05, + "loss": 1.8625, + "step": 10043 + }, + { + "epoch": 3.0828729281767955, + "grad_norm": 0.2925138473510742, + "learning_rate": 8.103494644699696e-05, + "loss": 1.9306, + "step": 10044 + }, + { + "epoch": 3.083179864947821, + "grad_norm": 0.2894277274608612, + "learning_rate": 8.103104913114681e-05, + "loss": 1.7796, + 
"step": 10045 + }, + { + "epoch": 3.0834868017188457, + "grad_norm": 0.2776826322078705, + "learning_rate": 8.102715150862967e-05, + "loss": 1.8169, + "step": 10046 + }, + { + "epoch": 3.083793738489871, + "grad_norm": 0.3315230906009674, + "learning_rate": 8.102325357948402e-05, + "loss": 1.8139, + "step": 10047 + }, + { + "epoch": 3.0841006752608964, + "grad_norm": 0.2906761169433594, + "learning_rate": 8.10193553437484e-05, + "loss": 1.8162, + "step": 10048 + }, + { + "epoch": 3.0844076120319213, + "grad_norm": 0.32681339979171753, + "learning_rate": 8.101545680146132e-05, + "loss": 1.8245, + "step": 10049 + }, + { + "epoch": 3.0847145488029466, + "grad_norm": 0.32525795698165894, + "learning_rate": 8.101155795266131e-05, + "loss": 1.8605, + "step": 10050 + }, + { + "epoch": 3.085021485573972, + "grad_norm": 0.31705379486083984, + "learning_rate": 8.100765879738692e-05, + "loss": 1.8214, + "step": 10051 + }, + { + "epoch": 3.085328422344997, + "grad_norm": 0.27772918343544006, + "learning_rate": 8.100375933567668e-05, + "loss": 1.7822, + "step": 10052 + }, + { + "epoch": 3.085635359116022, + "grad_norm": 0.2877809405326843, + "learning_rate": 8.09998595675691e-05, + "loss": 1.7935, + "step": 10053 + }, + { + "epoch": 3.0859422958870475, + "grad_norm": 0.29759806394577026, + "learning_rate": 8.099595949310276e-05, + "loss": 1.8041, + "step": 10054 + }, + { + "epoch": 3.0862492326580724, + "grad_norm": 0.2715320289134979, + "learning_rate": 8.099205911231617e-05, + "loss": 1.7923, + "step": 10055 + }, + { + "epoch": 3.0865561694290977, + "grad_norm": 0.33566340804100037, + "learning_rate": 8.098815842524789e-05, + "loss": 1.7953, + "step": 10056 + }, + { + "epoch": 3.0868631062001226, + "grad_norm": 0.3360871970653534, + "learning_rate": 8.098425743193645e-05, + "loss": 1.8275, + "step": 10057 + }, + { + "epoch": 3.087170042971148, + "grad_norm": 0.2797739803791046, + "learning_rate": 8.098035613242043e-05, + "loss": 1.7597, + "step": 10058 + }, + { + "epoch": 
3.0874769797421733, + "grad_norm": 0.25500187277793884, + "learning_rate": 8.097645452673837e-05, + "loss": 1.8059, + "step": 10059 + }, + { + "epoch": 3.087783916513198, + "grad_norm": 0.28042587637901306, + "learning_rate": 8.097255261492884e-05, + "loss": 1.7954, + "step": 10060 + }, + { + "epoch": 3.0880908532842235, + "grad_norm": 0.3616262376308441, + "learning_rate": 8.096865039703038e-05, + "loss": 1.8605, + "step": 10061 + }, + { + "epoch": 3.0883977900552484, + "grad_norm": 0.3453714847564697, + "learning_rate": 8.096474787308157e-05, + "loss": 1.7643, + "step": 10062 + }, + { + "epoch": 3.0887047268262737, + "grad_norm": 0.3192278742790222, + "learning_rate": 8.096084504312098e-05, + "loss": 1.8415, + "step": 10063 + }, + { + "epoch": 3.089011663597299, + "grad_norm": 0.2714482545852661, + "learning_rate": 8.095694190718715e-05, + "loss": 1.8204, + "step": 10064 + }, + { + "epoch": 3.089318600368324, + "grad_norm": 0.26562005281448364, + "learning_rate": 8.09530384653187e-05, + "loss": 1.7322, + "step": 10065 + }, + { + "epoch": 3.0896255371393493, + "grad_norm": 0.33727800846099854, + "learning_rate": 8.094913471755417e-05, + "loss": 1.8221, + "step": 10066 + }, + { + "epoch": 3.0899324739103746, + "grad_norm": 0.3561044931411743, + "learning_rate": 8.094523066393215e-05, + "loss": 1.8879, + "step": 10067 + }, + { + "epoch": 3.0902394106813995, + "grad_norm": 0.2568742334842682, + "learning_rate": 8.094132630449122e-05, + "loss": 1.8178, + "step": 10068 + }, + { + "epoch": 3.090546347452425, + "grad_norm": 0.4025525450706482, + "learning_rate": 8.093742163926998e-05, + "loss": 1.8186, + "step": 10069 + }, + { + "epoch": 3.09085328422345, + "grad_norm": 0.43863433599472046, + "learning_rate": 8.0933516668307e-05, + "loss": 1.8371, + "step": 10070 + }, + { + "epoch": 3.091160220994475, + "grad_norm": 0.34873950481414795, + "learning_rate": 8.092961139164087e-05, + "loss": 1.8083, + "step": 10071 + }, + { + "epoch": 3.0914671577655004, + "grad_norm": 
0.31433534622192383, + "learning_rate": 8.092570580931021e-05, + "loss": 1.8154, + "step": 10072 + }, + { + "epoch": 3.0917740945365253, + "grad_norm": 0.25523966550827026, + "learning_rate": 8.092179992135358e-05, + "loss": 1.8158, + "step": 10073 + }, + { + "epoch": 3.0920810313075506, + "grad_norm": 0.348469078540802, + "learning_rate": 8.09178937278096e-05, + "loss": 1.8358, + "step": 10074 + }, + { + "epoch": 3.092387968078576, + "grad_norm": 0.33455297350883484, + "learning_rate": 8.091398722871688e-05, + "loss": 1.7779, + "step": 10075 + }, + { + "epoch": 3.092694904849601, + "grad_norm": 0.36544880270957947, + "learning_rate": 8.091008042411403e-05, + "loss": 1.9186, + "step": 10076 + }, + { + "epoch": 3.093001841620626, + "grad_norm": 0.29165831208229065, + "learning_rate": 8.090617331403965e-05, + "loss": 1.8964, + "step": 10077 + }, + { + "epoch": 3.0933087783916515, + "grad_norm": 0.31011059880256653, + "learning_rate": 8.090226589853234e-05, + "loss": 1.8453, + "step": 10078 + }, + { + "epoch": 3.0936157151626764, + "grad_norm": 0.2835703492164612, + "learning_rate": 8.089835817763071e-05, + "loss": 1.7718, + "step": 10079 + }, + { + "epoch": 3.0939226519337018, + "grad_norm": 0.2910583019256592, + "learning_rate": 8.08944501513734e-05, + "loss": 1.7881, + "step": 10080 + }, + { + "epoch": 3.0942295887047266, + "grad_norm": 0.391303688287735, + "learning_rate": 8.089054181979905e-05, + "loss": 1.7915, + "step": 10081 + }, + { + "epoch": 3.094536525475752, + "grad_norm": 0.4119330048561096, + "learning_rate": 8.088663318294623e-05, + "loss": 1.7975, + "step": 10082 + }, + { + "epoch": 3.0948434622467773, + "grad_norm": 0.2980102002620697, + "learning_rate": 8.088272424085361e-05, + "loss": 1.805, + "step": 10083 + }, + { + "epoch": 3.095150399017802, + "grad_norm": 0.3089980483055115, + "learning_rate": 8.087881499355983e-05, + "loss": 1.8265, + "step": 10084 + }, + { + "epoch": 3.0954573357888275, + "grad_norm": 0.3851003348827362, + "learning_rate": 
8.087490544110348e-05, + "loss": 1.8174, + "step": 10085 + }, + { + "epoch": 3.095764272559853, + "grad_norm": 0.42357420921325684, + "learning_rate": 8.08709955835232e-05, + "loss": 1.8083, + "step": 10086 + }, + { + "epoch": 3.0960712093308778, + "grad_norm": 0.291777640581131, + "learning_rate": 8.086708542085768e-05, + "loss": 1.7713, + "step": 10087 + }, + { + "epoch": 3.096378146101903, + "grad_norm": 0.2563805878162384, + "learning_rate": 8.086317495314552e-05, + "loss": 1.7691, + "step": 10088 + }, + { + "epoch": 3.096685082872928, + "grad_norm": 0.3418877422809601, + "learning_rate": 8.085926418042536e-05, + "loss": 1.8547, + "step": 10089 + }, + { + "epoch": 3.0969920196439533, + "grad_norm": 0.3859385550022125, + "learning_rate": 8.085535310273589e-05, + "loss": 1.8226, + "step": 10090 + }, + { + "epoch": 3.0972989564149787, + "grad_norm": 0.3427267372608185, + "learning_rate": 8.085144172011571e-05, + "loss": 1.837, + "step": 10091 + }, + { + "epoch": 3.0976058931860035, + "grad_norm": 0.29290953278541565, + "learning_rate": 8.084753003260352e-05, + "loss": 1.8392, + "step": 10092 + }, + { + "epoch": 3.097912829957029, + "grad_norm": 0.33282020688056946, + "learning_rate": 8.084361804023795e-05, + "loss": 1.8351, + "step": 10093 + }, + { + "epoch": 3.098219766728054, + "grad_norm": 0.3802134394645691, + "learning_rate": 8.083970574305768e-05, + "loss": 1.7467, + "step": 10094 + }, + { + "epoch": 3.098526703499079, + "grad_norm": 0.3142111897468567, + "learning_rate": 8.083579314110135e-05, + "loss": 1.7966, + "step": 10095 + }, + { + "epoch": 3.0988336402701044, + "grad_norm": 0.2956278324127197, + "learning_rate": 8.083188023440765e-05, + "loss": 1.8724, + "step": 10096 + }, + { + "epoch": 3.0991405770411293, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.082796702301522e-05, + "loss": 1.8448, + "step": 10097 + }, + { + "epoch": 3.0994475138121547, + "grad_norm": 0.29358017444610596, + "learning_rate": 8.082405350696276e-05, + "loss": 1.8679, 
+ "step": 10098 + }, + { + "epoch": 3.09975445058318, + "grad_norm": 0.36439722776412964, + "learning_rate": 8.082013968628893e-05, + "loss": 1.8801, + "step": 10099 + }, + { + "epoch": 3.100061387354205, + "grad_norm": 0.3565322458744049, + "learning_rate": 8.081622556103244e-05, + "loss": 1.794, + "step": 10100 + }, + { + "epoch": 3.1003683241252302, + "grad_norm": 0.2841760814189911, + "learning_rate": 8.081231113123191e-05, + "loss": 1.7593, + "step": 10101 + }, + { + "epoch": 3.1006752608962556, + "grad_norm": 0.28589630126953125, + "learning_rate": 8.080839639692608e-05, + "loss": 1.864, + "step": 10102 + }, + { + "epoch": 3.1009821976672804, + "grad_norm": 0.3595057427883148, + "learning_rate": 8.080448135815362e-05, + "loss": 1.8067, + "step": 10103 + }, + { + "epoch": 3.101289134438306, + "grad_norm": 0.3909708261489868, + "learning_rate": 8.080056601495322e-05, + "loss": 1.8601, + "step": 10104 + }, + { + "epoch": 3.1015960712093307, + "grad_norm": 0.35180148482322693, + "learning_rate": 8.079665036736358e-05, + "loss": 1.8328, + "step": 10105 + }, + { + "epoch": 3.101903007980356, + "grad_norm": 0.3065175712108612, + "learning_rate": 8.079273441542338e-05, + "loss": 1.8449, + "step": 10106 + }, + { + "epoch": 3.1022099447513813, + "grad_norm": 0.31358617544174194, + "learning_rate": 8.078881815917134e-05, + "loss": 1.8325, + "step": 10107 + }, + { + "epoch": 3.1025168815224062, + "grad_norm": 0.4737118184566498, + "learning_rate": 8.078490159864614e-05, + "loss": 1.8232, + "step": 10108 + }, + { + "epoch": 3.1028238182934316, + "grad_norm": 0.435148686170578, + "learning_rate": 8.078098473388651e-05, + "loss": 1.8227, + "step": 10109 + }, + { + "epoch": 3.103130755064457, + "grad_norm": 0.3080987334251404, + "learning_rate": 8.077706756493115e-05, + "loss": 1.8072, + "step": 10110 + }, + { + "epoch": 3.103437691835482, + "grad_norm": 0.3225170075893402, + "learning_rate": 8.077315009181876e-05, + "loss": 1.7716, + "step": 10111 + }, + { + "epoch": 
3.103744628606507, + "grad_norm": 0.46642443537712097, + "learning_rate": 8.076923231458808e-05, + "loss": 1.8295, + "step": 10112 + }, + { + "epoch": 3.104051565377532, + "grad_norm": 0.42561766505241394, + "learning_rate": 8.07653142332778e-05, + "loss": 1.8553, + "step": 10113 + }, + { + "epoch": 3.1043585021485574, + "grad_norm": 0.27187541127204895, + "learning_rate": 8.076139584792664e-05, + "loss": 1.7937, + "step": 10114 + }, + { + "epoch": 3.1046654389195827, + "grad_norm": 0.27822238206863403, + "learning_rate": 8.075747715857335e-05, + "loss": 1.8151, + "step": 10115 + }, + { + "epoch": 3.1049723756906076, + "grad_norm": 0.40106478333473206, + "learning_rate": 8.075355816525665e-05, + "loss": 1.8637, + "step": 10116 + }, + { + "epoch": 3.105279312461633, + "grad_norm": 0.33455124497413635, + "learning_rate": 8.074963886801525e-05, + "loss": 1.8543, + "step": 10117 + }, + { + "epoch": 3.1055862492326582, + "grad_norm": 0.32246437668800354, + "learning_rate": 8.07457192668879e-05, + "loss": 1.7907, + "step": 10118 + }, + { + "epoch": 3.105893186003683, + "grad_norm": 0.45360109210014343, + "learning_rate": 8.074179936191332e-05, + "loss": 1.7404, + "step": 10119 + }, + { + "epoch": 3.1062001227747085, + "grad_norm": 0.445916086435318, + "learning_rate": 8.07378791531303e-05, + "loss": 1.778, + "step": 10120 + }, + { + "epoch": 3.1065070595457334, + "grad_norm": 0.28561538457870483, + "learning_rate": 8.073395864057751e-05, + "loss": 1.8723, + "step": 10121 + }, + { + "epoch": 3.1068139963167587, + "grad_norm": 0.3258218467235565, + "learning_rate": 8.073003782429373e-05, + "loss": 1.8106, + "step": 10122 + }, + { + "epoch": 3.107120933087784, + "grad_norm": 0.5459560751914978, + "learning_rate": 8.07261167043177e-05, + "loss": 1.8022, + "step": 10123 + }, + { + "epoch": 3.107427869858809, + "grad_norm": 0.4828549921512604, + "learning_rate": 8.072219528068819e-05, + "loss": 1.7556, + "step": 10124 + }, + { + "epoch": 3.1077348066298343, + "grad_norm": 
0.24075324833393097, + "learning_rate": 8.071827355344393e-05, + "loss": 1.7901, + "step": 10125 + }, + { + "epoch": 3.1080417434008596, + "grad_norm": 0.44677188992500305, + "learning_rate": 8.071435152262367e-05, + "loss": 1.7858, + "step": 10126 + }, + { + "epoch": 3.1083486801718845, + "grad_norm": 0.49862590432167053, + "learning_rate": 8.071042918826622e-05, + "loss": 1.805, + "step": 10127 + }, + { + "epoch": 3.10865561694291, + "grad_norm": 0.30883491039276123, + "learning_rate": 8.07065065504103e-05, + "loss": 1.7693, + "step": 10128 + }, + { + "epoch": 3.108962553713935, + "grad_norm": 0.29583030939102173, + "learning_rate": 8.070258360909467e-05, + "loss": 1.8141, + "step": 10129 + }, + { + "epoch": 3.10926949048496, + "grad_norm": 0.3595346510410309, + "learning_rate": 8.069866036435812e-05, + "loss": 1.8286, + "step": 10130 + }, + { + "epoch": 3.1095764272559854, + "grad_norm": 0.3215504288673401, + "learning_rate": 8.069473681623942e-05, + "loss": 1.8557, + "step": 10131 + }, + { + "epoch": 3.1098833640270103, + "grad_norm": 0.29734939336776733, + "learning_rate": 8.069081296477734e-05, + "loss": 1.7996, + "step": 10132 + }, + { + "epoch": 3.1101903007980356, + "grad_norm": 0.33546003699302673, + "learning_rate": 8.068688881001065e-05, + "loss": 1.8307, + "step": 10133 + }, + { + "epoch": 3.110497237569061, + "grad_norm": 0.3886832296848297, + "learning_rate": 8.068296435197814e-05, + "loss": 1.751, + "step": 10134 + }, + { + "epoch": 3.110804174340086, + "grad_norm": 0.34505394101142883, + "learning_rate": 8.06790395907186e-05, + "loss": 1.7543, + "step": 10135 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.27018141746520996, + "learning_rate": 8.06751145262708e-05, + "loss": 1.8109, + "step": 10136 + }, + { + "epoch": 3.1114180478821365, + "grad_norm": 0.3367149531841278, + "learning_rate": 8.067118915867355e-05, + "loss": 1.8025, + "step": 10137 + }, + { + "epoch": 3.1117249846531614, + "grad_norm": 0.40811091661453247, + "learning_rate": 
8.066726348796562e-05, + "loss": 1.7327, + "step": 10138 + }, + { + "epoch": 3.1120319214241867, + "grad_norm": 0.3511471152305603, + "learning_rate": 8.066333751418583e-05, + "loss": 1.8711, + "step": 10139 + }, + { + "epoch": 3.1123388581952116, + "grad_norm": 0.3112446367740631, + "learning_rate": 8.065941123737295e-05, + "loss": 1.8621, + "step": 10140 + }, + { + "epoch": 3.112645794966237, + "grad_norm": 0.3424238860607147, + "learning_rate": 8.065548465756581e-05, + "loss": 1.8383, + "step": 10141 + }, + { + "epoch": 3.1129527317372623, + "grad_norm": 0.380013108253479, + "learning_rate": 8.06515577748032e-05, + "loss": 1.8121, + "step": 10142 + }, + { + "epoch": 3.113259668508287, + "grad_norm": 0.2650558650493622, + "learning_rate": 8.064763058912393e-05, + "loss": 1.866, + "step": 10143 + }, + { + "epoch": 3.1135666052793125, + "grad_norm": 0.30580762028694153, + "learning_rate": 8.06437031005668e-05, + "loss": 1.7769, + "step": 10144 + }, + { + "epoch": 3.113873542050338, + "grad_norm": 0.29927194118499756, + "learning_rate": 8.063977530917066e-05, + "loss": 1.7897, + "step": 10145 + }, + { + "epoch": 3.1141804788213627, + "grad_norm": 0.24322012066841125, + "learning_rate": 8.063584721497429e-05, + "loss": 1.7968, + "step": 10146 + }, + { + "epoch": 3.114487415592388, + "grad_norm": 0.3082945644855499, + "learning_rate": 8.063191881801651e-05, + "loss": 1.8456, + "step": 10147 + }, + { + "epoch": 3.114794352363413, + "grad_norm": 0.3247329890727997, + "learning_rate": 8.062799011833617e-05, + "loss": 1.7436, + "step": 10148 + }, + { + "epoch": 3.1151012891344383, + "grad_norm": 0.27591946721076965, + "learning_rate": 8.062406111597207e-05, + "loss": 1.7976, + "step": 10149 + }, + { + "epoch": 3.1154082259054636, + "grad_norm": 0.2752058804035187, + "learning_rate": 8.062013181096306e-05, + "loss": 1.7814, + "step": 10150 + }, + { + "epoch": 3.1157151626764885, + "grad_norm": 0.3207196891307831, + "learning_rate": 8.061620220334795e-05, + "loss": 1.7767, 
+ "step": 10151 + }, + { + "epoch": 3.116022099447514, + "grad_norm": 0.2895309627056122, + "learning_rate": 8.061227229316559e-05, + "loss": 1.8588, + "step": 10152 + }, + { + "epoch": 3.116329036218539, + "grad_norm": 0.333843469619751, + "learning_rate": 8.060834208045481e-05, + "loss": 1.7871, + "step": 10153 + }, + { + "epoch": 3.116635972989564, + "grad_norm": 0.43877774477005005, + "learning_rate": 8.060441156525445e-05, + "loss": 1.8165, + "step": 10154 + }, + { + "epoch": 3.1169429097605894, + "grad_norm": 0.35700589418411255, + "learning_rate": 8.060048074760337e-05, + "loss": 1.777, + "step": 10155 + }, + { + "epoch": 3.1172498465316143, + "grad_norm": 0.26124534010887146, + "learning_rate": 8.059654962754039e-05, + "loss": 1.8343, + "step": 10156 + }, + { + "epoch": 3.1175567833026396, + "grad_norm": 0.331444650888443, + "learning_rate": 8.059261820510438e-05, + "loss": 1.9437, + "step": 10157 + }, + { + "epoch": 3.117863720073665, + "grad_norm": 0.31657731533050537, + "learning_rate": 8.058868648033419e-05, + "loss": 1.7621, + "step": 10158 + }, + { + "epoch": 3.11817065684469, + "grad_norm": 0.2785957455635071, + "learning_rate": 8.058475445326867e-05, + "loss": 1.9049, + "step": 10159 + }, + { + "epoch": 3.118477593615715, + "grad_norm": 0.2605743408203125, + "learning_rate": 8.058082212394667e-05, + "loss": 1.7895, + "step": 10160 + }, + { + "epoch": 3.1187845303867405, + "grad_norm": 0.2981378138065338, + "learning_rate": 8.057688949240707e-05, + "loss": 1.8373, + "step": 10161 + }, + { + "epoch": 3.1190914671577654, + "grad_norm": 0.2944273054599762, + "learning_rate": 8.057295655868873e-05, + "loss": 1.8373, + "step": 10162 + }, + { + "epoch": 3.1193984039287908, + "grad_norm": 0.2696721851825714, + "learning_rate": 8.056902332283052e-05, + "loss": 1.8023, + "step": 10163 + }, + { + "epoch": 3.1197053406998156, + "grad_norm": 0.27659857273101807, + "learning_rate": 8.056508978487128e-05, + "loss": 1.8453, + "step": 10164 + }, + { + "epoch": 
3.120012277470841, + "grad_norm": 0.2982441186904907, + "learning_rate": 8.056115594484992e-05, + "loss": 1.9072, + "step": 10165 + }, + { + "epoch": 3.1203192142418663, + "grad_norm": 0.3136404752731323, + "learning_rate": 8.055722180280531e-05, + "loss": 1.8585, + "step": 10166 + }, + { + "epoch": 3.120626151012891, + "grad_norm": 0.2979940176010132, + "learning_rate": 8.055328735877631e-05, + "loss": 1.8699, + "step": 10167 + }, + { + "epoch": 3.1209330877839165, + "grad_norm": 0.2585618793964386, + "learning_rate": 8.054935261280184e-05, + "loss": 1.8323, + "step": 10168 + }, + { + "epoch": 3.121240024554942, + "grad_norm": 0.28734859824180603, + "learning_rate": 8.054541756492075e-05, + "loss": 1.8694, + "step": 10169 + }, + { + "epoch": 3.1215469613259668, + "grad_norm": 0.30582788586616516, + "learning_rate": 8.054148221517193e-05, + "loss": 1.856, + "step": 10170 + }, + { + "epoch": 3.121853898096992, + "grad_norm": 0.3128255009651184, + "learning_rate": 8.053754656359429e-05, + "loss": 1.8329, + "step": 10171 + }, + { + "epoch": 3.122160834868017, + "grad_norm": 0.2845318615436554, + "learning_rate": 8.053361061022671e-05, + "loss": 1.8111, + "step": 10172 + }, + { + "epoch": 3.1224677716390423, + "grad_norm": 0.2994609773159027, + "learning_rate": 8.05296743551081e-05, + "loss": 1.8157, + "step": 10173 + }, + { + "epoch": 3.1227747084100677, + "grad_norm": 0.26397961378097534, + "learning_rate": 8.052573779827737e-05, + "loss": 1.8572, + "step": 10174 + }, + { + "epoch": 3.1230816451810925, + "grad_norm": 0.2911500334739685, + "learning_rate": 8.052180093977339e-05, + "loss": 1.8312, + "step": 10175 + }, + { + "epoch": 3.123388581952118, + "grad_norm": 0.33455008268356323, + "learning_rate": 8.051786377963509e-05, + "loss": 1.8748, + "step": 10176 + }, + { + "epoch": 3.123695518723143, + "grad_norm": 0.3127586841583252, + "learning_rate": 8.051392631790135e-05, + "loss": 1.8224, + "step": 10177 + }, + { + "epoch": 3.124002455494168, + "grad_norm": 
0.2910686433315277, + "learning_rate": 8.050998855461113e-05, + "loss": 1.8557, + "step": 10178 + }, + { + "epoch": 3.1243093922651934, + "grad_norm": 0.2849208414554596, + "learning_rate": 8.050605048980333e-05, + "loss": 1.82, + "step": 10179 + }, + { + "epoch": 3.1246163290362183, + "grad_norm": 0.35189691185951233, + "learning_rate": 8.050211212351683e-05, + "loss": 1.7884, + "step": 10180 + }, + { + "epoch": 3.1249232658072437, + "grad_norm": 0.3641110360622406, + "learning_rate": 8.04981734557906e-05, + "loss": 1.7984, + "step": 10181 + }, + { + "epoch": 3.125230202578269, + "grad_norm": 0.3111717700958252, + "learning_rate": 8.049423448666353e-05, + "loss": 1.8134, + "step": 10182 + }, + { + "epoch": 3.125537139349294, + "grad_norm": 0.2608453631401062, + "learning_rate": 8.049029521617457e-05, + "loss": 1.765, + "step": 10183 + }, + { + "epoch": 3.1258440761203192, + "grad_norm": 0.28779423236846924, + "learning_rate": 8.048635564436265e-05, + "loss": 1.8355, + "step": 10184 + }, + { + "epoch": 3.1261510128913446, + "grad_norm": 0.38227665424346924, + "learning_rate": 8.048241577126668e-05, + "loss": 1.8487, + "step": 10185 + }, + { + "epoch": 3.1264579496623695, + "grad_norm": 0.3603171706199646, + "learning_rate": 8.047847559692562e-05, + "loss": 1.8035, + "step": 10186 + }, + { + "epoch": 3.126764886433395, + "grad_norm": 0.21950066089630127, + "learning_rate": 8.04745351213784e-05, + "loss": 1.7399, + "step": 10187 + }, + { + "epoch": 3.12707182320442, + "grad_norm": 0.2796075642108917, + "learning_rate": 8.047059434466395e-05, + "loss": 1.8229, + "step": 10188 + }, + { + "epoch": 3.127378759975445, + "grad_norm": 0.3382907807826996, + "learning_rate": 8.046665326682125e-05, + "loss": 1.7713, + "step": 10189 + }, + { + "epoch": 3.1276856967464703, + "grad_norm": 0.36472463607788086, + "learning_rate": 8.04627118878892e-05, + "loss": 1.8129, + "step": 10190 + }, + { + "epoch": 3.1279926335174952, + "grad_norm": 0.2971884310245514, + "learning_rate": 
8.045877020790679e-05, + "loss": 1.7894, + "step": 10191 + }, + { + "epoch": 3.1282995702885206, + "grad_norm": 0.2292303442955017, + "learning_rate": 8.045482822691297e-05, + "loss": 1.7637, + "step": 10192 + }, + { + "epoch": 3.128606507059546, + "grad_norm": 0.300750732421875, + "learning_rate": 8.045088594494668e-05, + "loss": 1.7678, + "step": 10193 + }, + { + "epoch": 3.128913443830571, + "grad_norm": 0.3121531009674072, + "learning_rate": 8.044694336204688e-05, + "loss": 1.8651, + "step": 10194 + }, + { + "epoch": 3.129220380601596, + "grad_norm": 0.2456093430519104, + "learning_rate": 8.044300047825254e-05, + "loss": 1.7769, + "step": 10195 + }, + { + "epoch": 3.129527317372621, + "grad_norm": 0.25085800886154175, + "learning_rate": 8.043905729360264e-05, + "loss": 1.7723, + "step": 10196 + }, + { + "epoch": 3.1298342541436464, + "grad_norm": 0.2505287826061249, + "learning_rate": 8.043511380813612e-05, + "loss": 1.7943, + "step": 10197 + }, + { + "epoch": 3.1301411909146717, + "grad_norm": 0.27144530415534973, + "learning_rate": 8.043117002189198e-05, + "loss": 1.8119, + "step": 10198 + }, + { + "epoch": 3.1304481276856966, + "grad_norm": 0.2702989876270294, + "learning_rate": 8.042722593490916e-05, + "loss": 1.8517, + "step": 10199 + }, + { + "epoch": 3.130755064456722, + "grad_norm": 0.2585136890411377, + "learning_rate": 8.042328154722667e-05, + "loss": 1.8382, + "step": 10200 + }, + { + "epoch": 3.1310620012277472, + "grad_norm": 0.26306065917015076, + "learning_rate": 8.041933685888348e-05, + "loss": 1.8211, + "step": 10201 + }, + { + "epoch": 3.131368937998772, + "grad_norm": 0.2208927720785141, + "learning_rate": 8.041539186991858e-05, + "loss": 1.7765, + "step": 10202 + }, + { + "epoch": 3.1316758747697975, + "grad_norm": 0.2756440043449402, + "learning_rate": 8.041144658037095e-05, + "loss": 1.898, + "step": 10203 + }, + { + "epoch": 3.131982811540823, + "grad_norm": 0.29718101024627686, + "learning_rate": 8.040750099027958e-05, + "loss": 1.8226, 
+ "step": 10204 + }, + { + "epoch": 3.1322897483118477, + "grad_norm": 0.3166738748550415, + "learning_rate": 8.040355509968345e-05, + "loss": 1.8129, + "step": 10205 + }, + { + "epoch": 3.132596685082873, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.039960890862158e-05, + "loss": 1.8915, + "step": 10206 + }, + { + "epoch": 3.132903621853898, + "grad_norm": 0.3015006184577942, + "learning_rate": 8.039566241713297e-05, + "loss": 1.8389, + "step": 10207 + }, + { + "epoch": 3.1332105586249233, + "grad_norm": 0.35226619243621826, + "learning_rate": 8.039171562525659e-05, + "loss": 1.7287, + "step": 10208 + }, + { + "epoch": 3.1335174953959486, + "grad_norm": 0.4290136694908142, + "learning_rate": 8.038776853303146e-05, + "loss": 1.8768, + "step": 10209 + }, + { + "epoch": 3.1338244321669735, + "grad_norm": 0.2828960418701172, + "learning_rate": 8.03838211404966e-05, + "loss": 1.7552, + "step": 10210 + }, + { + "epoch": 3.134131368937999, + "grad_norm": 0.3781953752040863, + "learning_rate": 8.0379873447691e-05, + "loss": 1.7812, + "step": 10211 + }, + { + "epoch": 3.1344383057090237, + "grad_norm": 0.4282926023006439, + "learning_rate": 8.037592545465371e-05, + "loss": 1.84, + "step": 10212 + }, + { + "epoch": 3.134745242480049, + "grad_norm": 0.2622411251068115, + "learning_rate": 8.03719771614237e-05, + "loss": 1.8114, + "step": 10213 + }, + { + "epoch": 3.1350521792510744, + "grad_norm": 0.34881457686424255, + "learning_rate": 8.036802856804001e-05, + "loss": 1.7694, + "step": 10214 + }, + { + "epoch": 3.1353591160220993, + "grad_norm": 0.40797632932662964, + "learning_rate": 8.036407967454167e-05, + "loss": 1.7595, + "step": 10215 + }, + { + "epoch": 3.1356660527931246, + "grad_norm": 0.24902814626693726, + "learning_rate": 8.036013048096769e-05, + "loss": 1.8068, + "step": 10216 + }, + { + "epoch": 3.13597298956415, + "grad_norm": 0.3682909607887268, + "learning_rate": 8.035618098735711e-05, + "loss": 1.8519, + "step": 10217 + }, + { + "epoch": 
3.136279926335175, + "grad_norm": 0.6111233234405518, + "learning_rate": 8.035223119374895e-05, + "loss": 1.9254, + "step": 10218 + }, + { + "epoch": 3.1365868631062, + "grad_norm": 0.4793062210083008, + "learning_rate": 8.034828110018227e-05, + "loss": 1.786, + "step": 10219 + }, + { + "epoch": 3.1368937998772255, + "grad_norm": 0.3074932396411896, + "learning_rate": 8.034433070669607e-05, + "loss": 1.8495, + "step": 10220 + }, + { + "epoch": 3.1372007366482504, + "grad_norm": 0.4366479218006134, + "learning_rate": 8.034038001332942e-05, + "loss": 1.8501, + "step": 10221 + }, + { + "epoch": 3.1375076734192757, + "grad_norm": 0.4660070538520813, + "learning_rate": 8.033642902012135e-05, + "loss": 1.8317, + "step": 10222 + }, + { + "epoch": 3.1378146101903006, + "grad_norm": 0.3452899158000946, + "learning_rate": 8.03324777271109e-05, + "loss": 1.8702, + "step": 10223 + }, + { + "epoch": 3.138121546961326, + "grad_norm": 0.3658824563026428, + "learning_rate": 8.032852613433713e-05, + "loss": 1.8754, + "step": 10224 + }, + { + "epoch": 3.1384284837323513, + "grad_norm": 0.3777768909931183, + "learning_rate": 8.03245742418391e-05, + "loss": 1.8613, + "step": 10225 + }, + { + "epoch": 3.138735420503376, + "grad_norm": 0.3873192071914673, + "learning_rate": 8.032062204965582e-05, + "loss": 1.8438, + "step": 10226 + }, + { + "epoch": 3.1390423572744015, + "grad_norm": 0.30686715245246887, + "learning_rate": 8.031666955782641e-05, + "loss": 1.811, + "step": 10227 + }, + { + "epoch": 3.139349294045427, + "grad_norm": 0.2738516330718994, + "learning_rate": 8.03127167663899e-05, + "loss": 1.757, + "step": 10228 + }, + { + "epoch": 3.1396562308164517, + "grad_norm": 0.3093133270740509, + "learning_rate": 8.030876367538536e-05, + "loss": 1.8181, + "step": 10229 + }, + { + "epoch": 3.139963167587477, + "grad_norm": 0.3247159719467163, + "learning_rate": 8.030481028485185e-05, + "loss": 1.7798, + "step": 10230 + }, + { + "epoch": 3.140270104358502, + "grad_norm": 
0.2855088412761688, + "learning_rate": 8.030085659482845e-05, + "loss": 1.825, + "step": 10231 + }, + { + "epoch": 3.1405770411295273, + "grad_norm": 0.2818242907524109, + "learning_rate": 8.02969026053542e-05, + "loss": 1.7737, + "step": 10232 + }, + { + "epoch": 3.1408839779005526, + "grad_norm": 0.27074751257896423, + "learning_rate": 8.029294831646822e-05, + "loss": 1.8306, + "step": 10233 + }, + { + "epoch": 3.1411909146715775, + "grad_norm": 0.29740920662879944, + "learning_rate": 8.028899372820954e-05, + "loss": 1.8157, + "step": 10234 + }, + { + "epoch": 3.141497851442603, + "grad_norm": 0.30743202567100525, + "learning_rate": 8.028503884061731e-05, + "loss": 1.7626, + "step": 10235 + }, + { + "epoch": 3.141804788213628, + "grad_norm": 0.27812567353248596, + "learning_rate": 8.028108365373058e-05, + "loss": 1.7604, + "step": 10236 + }, + { + "epoch": 3.142111724984653, + "grad_norm": 0.26212629675865173, + "learning_rate": 8.027712816758839e-05, + "loss": 1.8161, + "step": 10237 + }, + { + "epoch": 3.1424186617556784, + "grad_norm": 0.3611658811569214, + "learning_rate": 8.02731723822299e-05, + "loss": 1.8283, + "step": 10238 + }, + { + "epoch": 3.1427255985267033, + "grad_norm": 0.31705498695373535, + "learning_rate": 8.026921629769418e-05, + "loss": 1.7986, + "step": 10239 + }, + { + "epoch": 3.1430325352977286, + "grad_norm": 0.25905972719192505, + "learning_rate": 8.026525991402032e-05, + "loss": 1.7926, + "step": 10240 + }, + { + "epoch": 3.143339472068754, + "grad_norm": 0.42376595735549927, + "learning_rate": 8.026130323124741e-05, + "loss": 1.8275, + "step": 10241 + }, + { + "epoch": 3.143646408839779, + "grad_norm": 0.415556401014328, + "learning_rate": 8.025734624941458e-05, + "loss": 1.7938, + "step": 10242 + }, + { + "epoch": 3.143953345610804, + "grad_norm": 0.3558904528617859, + "learning_rate": 8.025338896856091e-05, + "loss": 1.836, + "step": 10243 + }, + { + "epoch": 3.1442602823818295, + "grad_norm": 0.3091062307357788, + "learning_rate": 
8.024943138872553e-05, + "loss": 1.8285, + "step": 10244 + }, + { + "epoch": 3.1445672191528544, + "grad_norm": 0.2620905041694641, + "learning_rate": 8.024547350994753e-05, + "loss": 1.7115, + "step": 10245 + }, + { + "epoch": 3.1448741559238798, + "grad_norm": 0.25716835260391235, + "learning_rate": 8.024151533226604e-05, + "loss": 1.7702, + "step": 10246 + }, + { + "epoch": 3.1451810926949046, + "grad_norm": 0.250844269990921, + "learning_rate": 8.023755685572017e-05, + "loss": 1.7617, + "step": 10247 + }, + { + "epoch": 3.14548802946593, + "grad_norm": 0.23898956179618835, + "learning_rate": 8.023359808034903e-05, + "loss": 1.7872, + "step": 10248 + }, + { + "epoch": 3.1457949662369553, + "grad_norm": 0.2335387021303177, + "learning_rate": 8.022963900619176e-05, + "loss": 1.7656, + "step": 10249 + }, + { + "epoch": 3.14610190300798, + "grad_norm": 0.21822704374790192, + "learning_rate": 8.022567963328749e-05, + "loss": 1.7706, + "step": 10250 + }, + { + "epoch": 3.1464088397790055, + "grad_norm": 0.2627898156642914, + "learning_rate": 8.022171996167531e-05, + "loss": 1.8559, + "step": 10251 + }, + { + "epoch": 3.146715776550031, + "grad_norm": 0.2530064582824707, + "learning_rate": 8.021775999139441e-05, + "loss": 1.788, + "step": 10252 + }, + { + "epoch": 3.1470227133210558, + "grad_norm": 0.2293635457754135, + "learning_rate": 8.021379972248387e-05, + "loss": 1.8129, + "step": 10253 + }, + { + "epoch": 3.147329650092081, + "grad_norm": 0.27753588557243347, + "learning_rate": 8.020983915498286e-05, + "loss": 1.7957, + "step": 10254 + }, + { + "epoch": 3.147636586863106, + "grad_norm": 0.24507668614387512, + "learning_rate": 8.020587828893051e-05, + "loss": 1.7969, + "step": 10255 + }, + { + "epoch": 3.1479435236341313, + "grad_norm": 0.24818891286849976, + "learning_rate": 8.020191712436598e-05, + "loss": 1.8412, + "step": 10256 + }, + { + "epoch": 3.1482504604051567, + "grad_norm": 0.2463149130344391, + "learning_rate": 8.01979556613284e-05, + "loss": 1.8097, 
+ "step": 10257 + }, + { + "epoch": 3.1485573971761815, + "grad_norm": 0.26742151379585266, + "learning_rate": 8.019399389985692e-05, + "loss": 1.8487, + "step": 10258 + }, + { + "epoch": 3.148864333947207, + "grad_norm": 0.3078254461288452, + "learning_rate": 8.01900318399907e-05, + "loss": 1.8189, + "step": 10259 + }, + { + "epoch": 3.149171270718232, + "grad_norm": 0.3819321393966675, + "learning_rate": 8.018606948176887e-05, + "loss": 1.8019, + "step": 10260 + }, + { + "epoch": 3.149478207489257, + "grad_norm": 0.3932126462459564, + "learning_rate": 8.018210682523061e-05, + "loss": 1.787, + "step": 10261 + }, + { + "epoch": 3.1497851442602824, + "grad_norm": 0.2696186900138855, + "learning_rate": 8.017814387041511e-05, + "loss": 1.8345, + "step": 10262 + }, + { + "epoch": 3.150092081031308, + "grad_norm": 0.32631832361221313, + "learning_rate": 8.017418061736149e-05, + "loss": 1.7724, + "step": 10263 + }, + { + "epoch": 3.1503990178023327, + "grad_norm": 0.36187833547592163, + "learning_rate": 8.017021706610893e-05, + "loss": 1.7829, + "step": 10264 + }, + { + "epoch": 3.150705954573358, + "grad_norm": 0.29678142070770264, + "learning_rate": 8.01662532166966e-05, + "loss": 1.7896, + "step": 10265 + }, + { + "epoch": 3.151012891344383, + "grad_norm": 0.2997078001499176, + "learning_rate": 8.016228906916368e-05, + "loss": 1.8401, + "step": 10266 + }, + { + "epoch": 3.1513198281154082, + "grad_norm": 0.4688792824745178, + "learning_rate": 8.015832462354933e-05, + "loss": 1.8263, + "step": 10267 + }, + { + "epoch": 3.1516267648864336, + "grad_norm": 0.42710503935813904, + "learning_rate": 8.015435987989275e-05, + "loss": 1.8233, + "step": 10268 + }, + { + "epoch": 3.1519337016574585, + "grad_norm": 0.2490987628698349, + "learning_rate": 8.01503948382331e-05, + "loss": 1.7792, + "step": 10269 + }, + { + "epoch": 3.152240638428484, + "grad_norm": 0.400836706161499, + "learning_rate": 8.014642949860957e-05, + "loss": 1.8113, + "step": 10270 + }, + { + "epoch": 
3.1525475751995087, + "grad_norm": 0.47995972633361816, + "learning_rate": 8.014246386106138e-05, + "loss": 1.8754, + "step": 10271 + }, + { + "epoch": 3.152854511970534, + "grad_norm": 0.39069879055023193, + "learning_rate": 8.013849792562769e-05, + "loss": 1.8541, + "step": 10272 + }, + { + "epoch": 3.1531614487415593, + "grad_norm": 0.27174463868141174, + "learning_rate": 8.013453169234768e-05, + "loss": 1.8018, + "step": 10273 + }, + { + "epoch": 3.1534683855125842, + "grad_norm": 0.37808045744895935, + "learning_rate": 8.013056516126058e-05, + "loss": 1.8346, + "step": 10274 + }, + { + "epoch": 3.1537753222836096, + "grad_norm": 0.43864908814430237, + "learning_rate": 8.012659833240557e-05, + "loss": 1.7626, + "step": 10275 + }, + { + "epoch": 3.154082259054635, + "grad_norm": 0.3592168688774109, + "learning_rate": 8.012263120582187e-05, + "loss": 1.8261, + "step": 10276 + }, + { + "epoch": 3.15438919582566, + "grad_norm": 0.3056562542915344, + "learning_rate": 8.011866378154866e-05, + "loss": 1.903, + "step": 10277 + }, + { + "epoch": 3.154696132596685, + "grad_norm": 0.2898549735546112, + "learning_rate": 8.011469605962517e-05, + "loss": 1.7781, + "step": 10278 + }, + { + "epoch": 3.1550030693677105, + "grad_norm": 0.3498871624469757, + "learning_rate": 8.011072804009059e-05, + "loss": 1.7571, + "step": 10279 + }, + { + "epoch": 3.1553100061387354, + "grad_norm": 0.3330932557582855, + "learning_rate": 8.010675972298416e-05, + "loss": 1.8298, + "step": 10280 + }, + { + "epoch": 3.1556169429097607, + "grad_norm": 0.2540839910507202, + "learning_rate": 8.010279110834507e-05, + "loss": 1.8327, + "step": 10281 + }, + { + "epoch": 3.1559238796807856, + "grad_norm": 0.3557111322879791, + "learning_rate": 8.009882219621257e-05, + "loss": 1.7611, + "step": 10282 + }, + { + "epoch": 3.156230816451811, + "grad_norm": 0.28293952345848083, + "learning_rate": 8.009485298662584e-05, + "loss": 1.7761, + "step": 10283 + }, + { + "epoch": 3.1565377532228363, + "grad_norm": 
0.27089303731918335, + "learning_rate": 8.009088347962416e-05, + "loss": 1.8081, + "step": 10284 + }, + { + "epoch": 3.156844689993861, + "grad_norm": 0.2689332664012909, + "learning_rate": 8.008691367524673e-05, + "loss": 1.7458, + "step": 10285 + }, + { + "epoch": 3.1571516267648865, + "grad_norm": 0.2495841234922409, + "learning_rate": 8.008294357353278e-05, + "loss": 1.8307, + "step": 10286 + }, + { + "epoch": 3.1574585635359114, + "grad_norm": 0.29242852330207825, + "learning_rate": 8.007897317452156e-05, + "loss": 1.9216, + "step": 10287 + }, + { + "epoch": 3.1577655003069367, + "grad_norm": 0.26574134826660156, + "learning_rate": 8.007500247825229e-05, + "loss": 1.8392, + "step": 10288 + }, + { + "epoch": 3.158072437077962, + "grad_norm": 0.2503872811794281, + "learning_rate": 8.00710314847642e-05, + "loss": 1.7742, + "step": 10289 + }, + { + "epoch": 3.158379373848987, + "grad_norm": 0.25614771246910095, + "learning_rate": 8.006706019409658e-05, + "loss": 1.828, + "step": 10290 + }, + { + "epoch": 3.1586863106200123, + "grad_norm": 0.259369820356369, + "learning_rate": 8.006308860628863e-05, + "loss": 1.8328, + "step": 10291 + }, + { + "epoch": 3.1589932473910376, + "grad_norm": 0.28183647990226746, + "learning_rate": 8.005911672137962e-05, + "loss": 1.8269, + "step": 10292 + }, + { + "epoch": 3.1593001841620625, + "grad_norm": 0.2926514446735382, + "learning_rate": 8.005514453940881e-05, + "loss": 1.8334, + "step": 10293 + }, + { + "epoch": 3.159607120933088, + "grad_norm": 0.34313449263572693, + "learning_rate": 8.005117206041543e-05, + "loss": 1.7866, + "step": 10294 + }, + { + "epoch": 3.159914057704113, + "grad_norm": 0.30971628427505493, + "learning_rate": 8.004719928443875e-05, + "loss": 1.7827, + "step": 10295 + }, + { + "epoch": 3.160220994475138, + "grad_norm": 0.23955371975898743, + "learning_rate": 8.004322621151807e-05, + "loss": 1.7619, + "step": 10296 + }, + { + "epoch": 3.1605279312461634, + "grad_norm": 0.31311795115470886, + 
"learning_rate": 8.003925284169261e-05, + "loss": 1.8247, + "step": 10297 + }, + { + "epoch": 3.1608348680171883, + "grad_norm": 0.3408358097076416, + "learning_rate": 8.003527917500163e-05, + "loss": 1.8146, + "step": 10298 + }, + { + "epoch": 3.1611418047882136, + "grad_norm": 0.3030858337879181, + "learning_rate": 8.003130521148442e-05, + "loss": 1.857, + "step": 10299 + }, + { + "epoch": 3.161448741559239, + "grad_norm": 0.25168511271476746, + "learning_rate": 8.002733095118025e-05, + "loss": 1.8404, + "step": 10300 + }, + { + "epoch": 3.161755678330264, + "grad_norm": 0.2956216335296631, + "learning_rate": 8.002335639412839e-05, + "loss": 1.7352, + "step": 10301 + }, + { + "epoch": 3.162062615101289, + "grad_norm": 0.27791857719421387, + "learning_rate": 8.001938154036814e-05, + "loss": 1.7797, + "step": 10302 + }, + { + "epoch": 3.1623695518723145, + "grad_norm": 0.3106420040130615, + "learning_rate": 8.001540638993876e-05, + "loss": 1.8434, + "step": 10303 + }, + { + "epoch": 3.1626764886433394, + "grad_norm": 0.2940445840358734, + "learning_rate": 8.001143094287954e-05, + "loss": 1.8459, + "step": 10304 + }, + { + "epoch": 3.1629834254143647, + "grad_norm": 0.3857429325580597, + "learning_rate": 8.000745519922977e-05, + "loss": 1.7853, + "step": 10305 + }, + { + "epoch": 3.1632903621853896, + "grad_norm": 0.3585071861743927, + "learning_rate": 8.000347915902874e-05, + "loss": 1.8905, + "step": 10306 + }, + { + "epoch": 3.163597298956415, + "grad_norm": 0.320003867149353, + "learning_rate": 7.999950282231574e-05, + "loss": 1.8397, + "step": 10307 + }, + { + "epoch": 3.1639042357274403, + "grad_norm": 0.24986252188682556, + "learning_rate": 7.999552618913009e-05, + "loss": 1.7916, + "step": 10308 + }, + { + "epoch": 3.164211172498465, + "grad_norm": 0.33077237010002136, + "learning_rate": 7.999154925951104e-05, + "loss": 1.8334, + "step": 10309 + }, + { + "epoch": 3.1645181092694905, + "grad_norm": 0.35700327157974243, + "learning_rate": 
7.998757203349794e-05, + "loss": 1.7773, + "step": 10310 + }, + { + "epoch": 3.164825046040516, + "grad_norm": 0.3095493018627167, + "learning_rate": 7.998359451113007e-05, + "loss": 1.8156, + "step": 10311 + }, + { + "epoch": 3.1651319828115407, + "grad_norm": 0.3004748225212097, + "learning_rate": 7.997961669244673e-05, + "loss": 1.7862, + "step": 10312 + }, + { + "epoch": 3.165438919582566, + "grad_norm": 0.39382806420326233, + "learning_rate": 7.99756385774873e-05, + "loss": 1.764, + "step": 10313 + }, + { + "epoch": 3.165745856353591, + "grad_norm": 0.3109463155269623, + "learning_rate": 7.997166016629099e-05, + "loss": 1.8006, + "step": 10314 + }, + { + "epoch": 3.1660527931246163, + "grad_norm": 0.2896469235420227, + "learning_rate": 7.996768145889717e-05, + "loss": 1.8373, + "step": 10315 + }, + { + "epoch": 3.1663597298956416, + "grad_norm": 0.35024940967559814, + "learning_rate": 7.996370245534517e-05, + "loss": 1.797, + "step": 10316 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.3228827714920044, + "learning_rate": 7.995972315567431e-05, + "loss": 1.7757, + "step": 10317 + }, + { + "epoch": 3.166973603437692, + "grad_norm": 0.27102410793304443, + "learning_rate": 7.995574355992388e-05, + "loss": 1.7786, + "step": 10318 + }, + { + "epoch": 3.167280540208717, + "grad_norm": 0.2556116580963135, + "learning_rate": 7.995176366813325e-05, + "loss": 1.7621, + "step": 10319 + }, + { + "epoch": 3.167587476979742, + "grad_norm": 0.28279444575309753, + "learning_rate": 7.994778348034173e-05, + "loss": 1.7954, + "step": 10320 + }, + { + "epoch": 3.1678944137507674, + "grad_norm": 0.31778639554977417, + "learning_rate": 7.994380299658867e-05, + "loss": 1.7657, + "step": 10321 + }, + { + "epoch": 3.1682013505217923, + "grad_norm": 0.27935469150543213, + "learning_rate": 7.993982221691339e-05, + "loss": 1.7502, + "step": 10322 + }, + { + "epoch": 3.1685082872928176, + "grad_norm": 0.29012617468833923, + "learning_rate": 7.993584114135524e-05, + "loss": 
1.8497, + "step": 10323 + }, + { + "epoch": 3.168815224063843, + "grad_norm": 0.2674056887626648, + "learning_rate": 7.993185976995356e-05, + "loss": 1.7875, + "step": 10324 + }, + { + "epoch": 3.169122160834868, + "grad_norm": 0.2667328417301178, + "learning_rate": 7.992787810274771e-05, + "loss": 1.771, + "step": 10325 + }, + { + "epoch": 3.169429097605893, + "grad_norm": 0.25807151198387146, + "learning_rate": 7.992389613977702e-05, + "loss": 1.7638, + "step": 10326 + }, + { + "epoch": 3.1697360343769185, + "grad_norm": 0.2572930157184601, + "learning_rate": 7.991991388108084e-05, + "loss": 1.8218, + "step": 10327 + }, + { + "epoch": 3.1700429711479434, + "grad_norm": 0.3955067992210388, + "learning_rate": 7.991593132669855e-05, + "loss": 1.8458, + "step": 10328 + }, + { + "epoch": 3.1703499079189688, + "grad_norm": 0.2813466489315033, + "learning_rate": 7.991194847666948e-05, + "loss": 1.8042, + "step": 10329 + }, + { + "epoch": 3.1706568446899936, + "grad_norm": 0.2645012140274048, + "learning_rate": 7.990796533103302e-05, + "loss": 1.8241, + "step": 10330 + }, + { + "epoch": 3.170963781461019, + "grad_norm": 0.28462091088294983, + "learning_rate": 7.99039818898285e-05, + "loss": 1.8853, + "step": 10331 + }, + { + "epoch": 3.1712707182320443, + "grad_norm": 0.2727372944355011, + "learning_rate": 7.98999981530953e-05, + "loss": 1.7564, + "step": 10332 + }, + { + "epoch": 3.171577655003069, + "grad_norm": 0.2658170759677887, + "learning_rate": 7.989601412087281e-05, + "loss": 1.8344, + "step": 10333 + }, + { + "epoch": 3.1718845917740945, + "grad_norm": 0.29713502526283264, + "learning_rate": 7.989202979320039e-05, + "loss": 1.8721, + "step": 10334 + }, + { + "epoch": 3.17219152854512, + "grad_norm": 0.26609495282173157, + "learning_rate": 7.98880451701174e-05, + "loss": 1.7991, + "step": 10335 + }, + { + "epoch": 3.1724984653161448, + "grad_norm": 0.29779741168022156, + "learning_rate": 7.988406025166322e-05, + "loss": 1.8182, + "step": 10336 + }, + { + 
"epoch": 3.17280540208717, + "grad_norm": 0.2771340012550354, + "learning_rate": 7.988007503787724e-05, + "loss": 1.8034, + "step": 10337 + }, + { + "epoch": 3.1731123388581954, + "grad_norm": 0.30510422587394714, + "learning_rate": 7.987608952879886e-05, + "loss": 1.8477, + "step": 10338 + }, + { + "epoch": 3.1734192756292203, + "grad_norm": 0.3097476363182068, + "learning_rate": 7.987210372446745e-05, + "loss": 1.7572, + "step": 10339 + }, + { + "epoch": 3.1737262124002457, + "grad_norm": 0.2553942799568176, + "learning_rate": 7.986811762492239e-05, + "loss": 1.7837, + "step": 10340 + }, + { + "epoch": 3.1740331491712706, + "grad_norm": 0.26546719670295715, + "learning_rate": 7.986413123020312e-05, + "loss": 1.7893, + "step": 10341 + }, + { + "epoch": 3.174340085942296, + "grad_norm": 0.37721553444862366, + "learning_rate": 7.986014454034895e-05, + "loss": 1.8475, + "step": 10342 + }, + { + "epoch": 3.174647022713321, + "grad_norm": 0.3215494453907013, + "learning_rate": 7.985615755539937e-05, + "loss": 1.7806, + "step": 10343 + }, + { + "epoch": 3.174953959484346, + "grad_norm": 0.2662442922592163, + "learning_rate": 7.985217027539373e-05, + "loss": 1.8116, + "step": 10344 + }, + { + "epoch": 3.1752608962553714, + "grad_norm": 0.23334236443042755, + "learning_rate": 7.984818270037145e-05, + "loss": 1.7929, + "step": 10345 + }, + { + "epoch": 3.1755678330263963, + "grad_norm": 0.2873367667198181, + "learning_rate": 7.98441948303719e-05, + "loss": 1.7808, + "step": 10346 + }, + { + "epoch": 3.1758747697974217, + "grad_norm": 0.3623826801776886, + "learning_rate": 7.984020666543458e-05, + "loss": 1.8817, + "step": 10347 + }, + { + "epoch": 3.176181706568447, + "grad_norm": 0.3060589134693146, + "learning_rate": 7.983621820559881e-05, + "loss": 1.796, + "step": 10348 + }, + { + "epoch": 3.176488643339472, + "grad_norm": 0.2396882325410843, + "learning_rate": 7.983222945090407e-05, + "loss": 1.7455, + "step": 10349 + }, + { + "epoch": 3.1767955801104972, + 
"grad_norm": 0.24811476469039917, + "learning_rate": 7.982824040138974e-05, + "loss": 1.7907, + "step": 10350 + }, + { + "epoch": 3.1771025168815226, + "grad_norm": 0.32749706506729126, + "learning_rate": 7.982425105709524e-05, + "loss": 1.8553, + "step": 10351 + }, + { + "epoch": 3.1774094536525475, + "grad_norm": 0.3648095726966858, + "learning_rate": 7.982026141806003e-05, + "loss": 1.8387, + "step": 10352 + }, + { + "epoch": 3.177716390423573, + "grad_norm": 0.2749348282814026, + "learning_rate": 7.981627148432352e-05, + "loss": 1.7676, + "step": 10353 + }, + { + "epoch": 3.178023327194598, + "grad_norm": 0.2735142409801483, + "learning_rate": 7.981228125592513e-05, + "loss": 1.822, + "step": 10354 + }, + { + "epoch": 3.178330263965623, + "grad_norm": 0.28759655356407166, + "learning_rate": 7.98082907329043e-05, + "loss": 1.8113, + "step": 10355 + }, + { + "epoch": 3.1786372007366483, + "grad_norm": 0.33661654591560364, + "learning_rate": 7.980429991530048e-05, + "loss": 1.8036, + "step": 10356 + }, + { + "epoch": 3.1789441375076732, + "grad_norm": 0.2634892761707306, + "learning_rate": 7.98003088031531e-05, + "loss": 1.8323, + "step": 10357 + }, + { + "epoch": 3.1792510742786986, + "grad_norm": 0.25864094495773315, + "learning_rate": 7.979631739650158e-05, + "loss": 1.8199, + "step": 10358 + }, + { + "epoch": 3.179558011049724, + "grad_norm": 0.27368444204330444, + "learning_rate": 7.979232569538541e-05, + "loss": 1.7673, + "step": 10359 + }, + { + "epoch": 3.179864947820749, + "grad_norm": 0.2506616413593292, + "learning_rate": 7.9788333699844e-05, + "loss": 1.7912, + "step": 10360 + }, + { + "epoch": 3.180171884591774, + "grad_norm": 0.2539178133010864, + "learning_rate": 7.978434140991684e-05, + "loss": 1.7934, + "step": 10361 + }, + { + "epoch": 3.1804788213627995, + "grad_norm": 0.2605626881122589, + "learning_rate": 7.978034882564334e-05, + "loss": 1.8031, + "step": 10362 + }, + { + "epoch": 3.1807857581338244, + "grad_norm": 0.2610207796096802, + 
"learning_rate": 7.977635594706299e-05, + "loss": 1.8664, + "step": 10363 + }, + { + "epoch": 3.1810926949048497, + "grad_norm": 0.26164132356643677, + "learning_rate": 7.977236277421523e-05, + "loss": 1.7758, + "step": 10364 + }, + { + "epoch": 3.1813996316758746, + "grad_norm": 0.3122340142726898, + "learning_rate": 7.976836930713953e-05, + "loss": 1.9033, + "step": 10365 + }, + { + "epoch": 3.1817065684469, + "grad_norm": 0.3317202031612396, + "learning_rate": 7.976437554587537e-05, + "loss": 1.7899, + "step": 10366 + }, + { + "epoch": 3.1820135052179253, + "grad_norm": 0.28612568974494934, + "learning_rate": 7.97603814904622e-05, + "loss": 1.8145, + "step": 10367 + }, + { + "epoch": 3.18232044198895, + "grad_norm": 0.349917471408844, + "learning_rate": 7.975638714093949e-05, + "loss": 1.877, + "step": 10368 + }, + { + "epoch": 3.1826273787599755, + "grad_norm": 0.3737771809101105, + "learning_rate": 7.975239249734672e-05, + "loss": 1.8204, + "step": 10369 + }, + { + "epoch": 3.182934315531001, + "grad_norm": 0.3688446879386902, + "learning_rate": 7.974839755972339e-05, + "loss": 1.8487, + "step": 10370 + }, + { + "epoch": 3.1832412523020257, + "grad_norm": 0.2934897541999817, + "learning_rate": 7.974440232810894e-05, + "loss": 1.8243, + "step": 10371 + }, + { + "epoch": 3.183548189073051, + "grad_norm": 0.2596173882484436, + "learning_rate": 7.974040680254287e-05, + "loss": 1.7887, + "step": 10372 + }, + { + "epoch": 3.183855125844076, + "grad_norm": 0.35686594247817993, + "learning_rate": 7.973641098306468e-05, + "loss": 1.8653, + "step": 10373 + }, + { + "epoch": 3.1841620626151013, + "grad_norm": 0.3187713921070099, + "learning_rate": 7.973241486971383e-05, + "loss": 1.8767, + "step": 10374 + }, + { + "epoch": 3.1844689993861266, + "grad_norm": 0.2596273124217987, + "learning_rate": 7.972841846252985e-05, + "loss": 1.8028, + "step": 10375 + }, + { + "epoch": 3.1847759361571515, + "grad_norm": 0.2637474834918976, + "learning_rate": 7.972442176155221e-05, + 
"loss": 1.802, + "step": 10376 + }, + { + "epoch": 3.185082872928177, + "grad_norm": 0.2641126215457916, + "learning_rate": 7.97204247668204e-05, + "loss": 1.7931, + "step": 10377 + }, + { + "epoch": 3.185389809699202, + "grad_norm": 0.25594159960746765, + "learning_rate": 7.971642747837393e-05, + "loss": 1.818, + "step": 10378 + }, + { + "epoch": 3.185696746470227, + "grad_norm": 0.26567938923835754, + "learning_rate": 7.971242989625233e-05, + "loss": 1.8174, + "step": 10379 + }, + { + "epoch": 3.1860036832412524, + "grad_norm": 0.29580214619636536, + "learning_rate": 7.970843202049508e-05, + "loss": 1.869, + "step": 10380 + }, + { + "epoch": 3.1863106200122773, + "grad_norm": 0.2657530605792999, + "learning_rate": 7.970443385114168e-05, + "loss": 1.8352, + "step": 10381 + }, + { + "epoch": 3.1866175567833026, + "grad_norm": 0.2468358278274536, + "learning_rate": 7.970043538823165e-05, + "loss": 1.7851, + "step": 10382 + }, + { + "epoch": 3.186924493554328, + "grad_norm": 0.26464715600013733, + "learning_rate": 7.969643663180451e-05, + "loss": 1.8208, + "step": 10383 + }, + { + "epoch": 3.187231430325353, + "grad_norm": 0.26035723090171814, + "learning_rate": 7.969243758189979e-05, + "loss": 1.8089, + "step": 10384 + }, + { + "epoch": 3.187538367096378, + "grad_norm": 0.2644619941711426, + "learning_rate": 7.968843823855699e-05, + "loss": 1.8379, + "step": 10385 + }, + { + "epoch": 3.1878453038674035, + "grad_norm": 0.25576624274253845, + "learning_rate": 7.968443860181565e-05, + "loss": 1.7932, + "step": 10386 + }, + { + "epoch": 3.1881522406384284, + "grad_norm": 0.24276074767112732, + "learning_rate": 7.968043867171528e-05, + "loss": 1.8037, + "step": 10387 + }, + { + "epoch": 3.1884591774094537, + "grad_norm": 0.27156540751457214, + "learning_rate": 7.967643844829543e-05, + "loss": 1.7998, + "step": 10388 + }, + { + "epoch": 3.1887661141804786, + "grad_norm": 0.2555428743362427, + "learning_rate": 7.96724379315956e-05, + "loss": 1.7612, + "step": 10389 + }, + 
{ + "epoch": 3.189073050951504, + "grad_norm": 0.3358438014984131, + "learning_rate": 7.966843712165537e-05, + "loss": 1.8543, + "step": 10390 + }, + { + "epoch": 3.1893799877225293, + "grad_norm": 0.2799586355686188, + "learning_rate": 7.966443601851424e-05, + "loss": 1.819, + "step": 10391 + }, + { + "epoch": 3.189686924493554, + "grad_norm": 0.2364189177751541, + "learning_rate": 7.966043462221178e-05, + "loss": 1.8537, + "step": 10392 + }, + { + "epoch": 3.1899938612645795, + "grad_norm": 0.23849403858184814, + "learning_rate": 7.96564329327875e-05, + "loss": 1.8125, + "step": 10393 + }, + { + "epoch": 3.190300798035605, + "grad_norm": 0.2371583878993988, + "learning_rate": 7.965243095028098e-05, + "loss": 1.7352, + "step": 10394 + }, + { + "epoch": 3.1906077348066297, + "grad_norm": 0.2584737539291382, + "learning_rate": 7.964842867473176e-05, + "loss": 1.8801, + "step": 10395 + }, + { + "epoch": 3.190914671577655, + "grad_norm": 0.27768051624298096, + "learning_rate": 7.964442610617939e-05, + "loss": 1.8221, + "step": 10396 + }, + { + "epoch": 3.1912216083486804, + "grad_norm": 0.2680891752243042, + "learning_rate": 7.964042324466341e-05, + "loss": 1.8371, + "step": 10397 + }, + { + "epoch": 3.1915285451197053, + "grad_norm": 0.25301921367645264, + "learning_rate": 7.963642009022343e-05, + "loss": 1.7972, + "step": 10398 + }, + { + "epoch": 3.1918354818907306, + "grad_norm": 0.2589731216430664, + "learning_rate": 7.963241664289896e-05, + "loss": 1.8145, + "step": 10399 + }, + { + "epoch": 3.1921424186617555, + "grad_norm": 0.2611297369003296, + "learning_rate": 7.962841290272956e-05, + "loss": 1.8736, + "step": 10400 + }, + { + "epoch": 3.192449355432781, + "grad_norm": 0.2812272906303406, + "learning_rate": 7.962440886975483e-05, + "loss": 1.8116, + "step": 10401 + }, + { + "epoch": 3.192756292203806, + "grad_norm": 0.3261657655239105, + "learning_rate": 7.962040454401434e-05, + "loss": 1.7935, + "step": 10402 + }, + { + "epoch": 3.193063228974831, + 
"grad_norm": 0.3355373442173004, + "learning_rate": 7.961639992554764e-05, + "loss": 1.7957, + "step": 10403 + }, + { + "epoch": 3.1933701657458564, + "grad_norm": 0.2811843156814575, + "learning_rate": 7.961239501439432e-05, + "loss": 1.797, + "step": 10404 + }, + { + "epoch": 3.1936771025168813, + "grad_norm": 0.24933238327503204, + "learning_rate": 7.960838981059395e-05, + "loss": 1.7594, + "step": 10405 + }, + { + "epoch": 3.1939840392879066, + "grad_norm": 0.29110121726989746, + "learning_rate": 7.960438431418613e-05, + "loss": 1.8268, + "step": 10406 + }, + { + "epoch": 3.194290976058932, + "grad_norm": 0.3702283799648285, + "learning_rate": 7.960037852521043e-05, + "loss": 1.7629, + "step": 10407 + }, + { + "epoch": 3.194597912829957, + "grad_norm": 0.33275437355041504, + "learning_rate": 7.959637244370644e-05, + "loss": 1.8507, + "step": 10408 + }, + { + "epoch": 3.194904849600982, + "grad_norm": 0.2691981792449951, + "learning_rate": 7.959236606971375e-05, + "loss": 1.8084, + "step": 10409 + }, + { + "epoch": 3.1952117863720075, + "grad_norm": 0.30108413100242615, + "learning_rate": 7.958835940327194e-05, + "loss": 1.8525, + "step": 10410 + }, + { + "epoch": 3.1955187231430324, + "grad_norm": 0.32112306356430054, + "learning_rate": 7.958435244442064e-05, + "loss": 1.7431, + "step": 10411 + }, + { + "epoch": 3.1958256599140578, + "grad_norm": 0.2795291543006897, + "learning_rate": 7.958034519319942e-05, + "loss": 1.7985, + "step": 10412 + }, + { + "epoch": 3.196132596685083, + "grad_norm": 0.2485792338848114, + "learning_rate": 7.957633764964788e-05, + "loss": 1.7363, + "step": 10413 + }, + { + "epoch": 3.196439533456108, + "grad_norm": 0.3552432358264923, + "learning_rate": 7.957232981380565e-05, + "loss": 1.8174, + "step": 10414 + }, + { + "epoch": 3.1967464702271333, + "grad_norm": 0.3829655051231384, + "learning_rate": 7.956832168571234e-05, + "loss": 1.9249, + "step": 10415 + }, + { + "epoch": 3.197053406998158, + "grad_norm": 0.2498074769973755, + 
"learning_rate": 7.956431326540752e-05, + "loss": 1.8104, + "step": 10416 + }, + { + "epoch": 3.1973603437691835, + "grad_norm": 0.24596504867076874, + "learning_rate": 7.956030455293082e-05, + "loss": 1.8007, + "step": 10417 + }, + { + "epoch": 3.197667280540209, + "grad_norm": 0.2795363664627075, + "learning_rate": 7.95562955483219e-05, + "loss": 1.775, + "step": 10418 + }, + { + "epoch": 3.1979742173112338, + "grad_norm": 0.3581138253211975, + "learning_rate": 7.95522862516203e-05, + "loss": 1.8567, + "step": 10419 + }, + { + "epoch": 3.198281154082259, + "grad_norm": 0.36102500557899475, + "learning_rate": 7.95482766628657e-05, + "loss": 1.8509, + "step": 10420 + }, + { + "epoch": 3.198588090853284, + "grad_norm": 0.4717029929161072, + "learning_rate": 7.954426678209774e-05, + "loss": 1.8218, + "step": 10421 + }, + { + "epoch": 3.1988950276243093, + "grad_norm": 0.3211984932422638, + "learning_rate": 7.9540256609356e-05, + "loss": 1.8696, + "step": 10422 + }, + { + "epoch": 3.1992019643953347, + "grad_norm": 0.30094626545906067, + "learning_rate": 7.953624614468011e-05, + "loss": 1.8714, + "step": 10423 + }, + { + "epoch": 3.1995089011663596, + "grad_norm": 0.267578125, + "learning_rate": 7.953223538810976e-05, + "loss": 1.7903, + "step": 10424 + }, + { + "epoch": 3.199815837937385, + "grad_norm": 0.35577845573425293, + "learning_rate": 7.952822433968453e-05, + "loss": 1.7808, + "step": 10425 + }, + { + "epoch": 3.2001227747084102, + "grad_norm": 0.4117741882801056, + "learning_rate": 7.952421299944408e-05, + "loss": 1.7856, + "step": 10426 + }, + { + "epoch": 3.200429711479435, + "grad_norm": 0.35202035307884216, + "learning_rate": 7.952020136742806e-05, + "loss": 1.8112, + "step": 10427 + }, + { + "epoch": 3.2007366482504604, + "grad_norm": 0.26514917612075806, + "learning_rate": 7.951618944367611e-05, + "loss": 1.828, + "step": 10428 + }, + { + "epoch": 3.201043585021486, + "grad_norm": 0.29219159483909607, + "learning_rate": 7.951217722822786e-05, + "loss": 
1.9366, + "step": 10429 + }, + { + "epoch": 3.2013505217925107, + "grad_norm": 0.2929961383342743, + "learning_rate": 7.950816472112298e-05, + "loss": 1.8006, + "step": 10430 + }, + { + "epoch": 3.201657458563536, + "grad_norm": 0.28339722752571106, + "learning_rate": 7.950415192240114e-05, + "loss": 1.7411, + "step": 10431 + }, + { + "epoch": 3.201964395334561, + "grad_norm": 0.258884996175766, + "learning_rate": 7.950013883210196e-05, + "loss": 1.8153, + "step": 10432 + }, + { + "epoch": 3.2022713321055862, + "grad_norm": 0.3065929114818573, + "learning_rate": 7.949612545026512e-05, + "loss": 1.7918, + "step": 10433 + }, + { + "epoch": 3.2025782688766116, + "grad_norm": 0.289874404668808, + "learning_rate": 7.949211177693029e-05, + "loss": 1.7975, + "step": 10434 + }, + { + "epoch": 3.2028852056476365, + "grad_norm": 0.27025631070137024, + "learning_rate": 7.948809781213711e-05, + "loss": 1.8129, + "step": 10435 + }, + { + "epoch": 3.203192142418662, + "grad_norm": 0.2501074969768524, + "learning_rate": 7.948408355592528e-05, + "loss": 1.7653, + "step": 10436 + }, + { + "epoch": 3.203499079189687, + "grad_norm": 0.30402958393096924, + "learning_rate": 7.948006900833445e-05, + "loss": 1.8311, + "step": 10437 + }, + { + "epoch": 3.203806015960712, + "grad_norm": 0.28783223032951355, + "learning_rate": 7.94760541694043e-05, + "loss": 1.82, + "step": 10438 + }, + { + "epoch": 3.2041129527317374, + "grad_norm": 0.30428317189216614, + "learning_rate": 7.947203903917451e-05, + "loss": 1.8673, + "step": 10439 + }, + { + "epoch": 3.2044198895027622, + "grad_norm": 0.2860367000102997, + "learning_rate": 7.946802361768473e-05, + "loss": 1.824, + "step": 10440 + }, + { + "epoch": 3.2047268262737876, + "grad_norm": 0.2995273172855377, + "learning_rate": 7.946400790497469e-05, + "loss": 1.7342, + "step": 10441 + }, + { + "epoch": 3.205033763044813, + "grad_norm": 0.4374088943004608, + "learning_rate": 7.945999190108407e-05, + "loss": 1.8522, + "step": 10442 + }, + { + "epoch": 
3.205340699815838, + "grad_norm": 0.37659478187561035, + "learning_rate": 7.945597560605252e-05, + "loss": 1.7518, + "step": 10443 + }, + { + "epoch": 3.205647636586863, + "grad_norm": 0.24257932603359222, + "learning_rate": 7.945195901991975e-05, + "loss": 1.7892, + "step": 10444 + }, + { + "epoch": 3.2059545733578885, + "grad_norm": 0.3682694435119629, + "learning_rate": 7.944794214272546e-05, + "loss": 1.7757, + "step": 10445 + }, + { + "epoch": 3.2062615101289134, + "grad_norm": 0.434692919254303, + "learning_rate": 7.944392497450936e-05, + "loss": 1.8207, + "step": 10446 + }, + { + "epoch": 3.2065684468999387, + "grad_norm": 0.3982211947441101, + "learning_rate": 7.943990751531113e-05, + "loss": 1.8303, + "step": 10447 + }, + { + "epoch": 3.2068753836709636, + "grad_norm": 0.2877334654331207, + "learning_rate": 7.943588976517049e-05, + "loss": 1.8495, + "step": 10448 + }, + { + "epoch": 3.207182320441989, + "grad_norm": 0.34589654207229614, + "learning_rate": 7.943187172412712e-05, + "loss": 1.7773, + "step": 10449 + }, + { + "epoch": 3.2074892572130143, + "grad_norm": 0.4727517366409302, + "learning_rate": 7.942785339222074e-05, + "loss": 1.8702, + "step": 10450 + }, + { + "epoch": 3.207796193984039, + "grad_norm": 0.4019354581832886, + "learning_rate": 7.942383476949107e-05, + "loss": 1.8095, + "step": 10451 + }, + { + "epoch": 3.2081031307550645, + "grad_norm": 0.2726243734359741, + "learning_rate": 7.941981585597782e-05, + "loss": 1.7273, + "step": 10452 + }, + { + "epoch": 3.20841006752609, + "grad_norm": 0.2944760024547577, + "learning_rate": 7.941579665172072e-05, + "loss": 1.7507, + "step": 10453 + }, + { + "epoch": 3.2087170042971147, + "grad_norm": 0.3530777096748352, + "learning_rate": 7.941177715675945e-05, + "loss": 1.8434, + "step": 10454 + }, + { + "epoch": 3.20902394106814, + "grad_norm": 0.28612539172172546, + "learning_rate": 7.940775737113378e-05, + "loss": 1.8094, + "step": 10455 + }, + { + "epoch": 3.209330877839165, + "grad_norm": 
0.27006468176841736, + "learning_rate": 7.94037372948834e-05, + "loss": 1.7854, + "step": 10456 + }, + { + "epoch": 3.2096378146101903, + "grad_norm": 0.3027147054672241, + "learning_rate": 7.939971692804806e-05, + "loss": 1.7596, + "step": 10457 + }, + { + "epoch": 3.2099447513812156, + "grad_norm": 0.31999528408050537, + "learning_rate": 7.939569627066749e-05, + "loss": 1.8836, + "step": 10458 + }, + { + "epoch": 3.2102516881522405, + "grad_norm": 0.267600417137146, + "learning_rate": 7.939167532278142e-05, + "loss": 1.8508, + "step": 10459 + }, + { + "epoch": 3.210558624923266, + "grad_norm": 0.3171706795692444, + "learning_rate": 7.938765408442958e-05, + "loss": 1.7507, + "step": 10460 + }, + { + "epoch": 3.210865561694291, + "grad_norm": 0.2955280840396881, + "learning_rate": 7.938363255565171e-05, + "loss": 1.733, + "step": 10461 + }, + { + "epoch": 3.211172498465316, + "grad_norm": 0.3427969217300415, + "learning_rate": 7.937961073648759e-05, + "loss": 1.9208, + "step": 10462 + }, + { + "epoch": 3.2114794352363414, + "grad_norm": 0.28788647055625916, + "learning_rate": 7.937558862697692e-05, + "loss": 1.7723, + "step": 10463 + }, + { + "epoch": 3.2117863720073663, + "grad_norm": 0.26093682646751404, + "learning_rate": 7.937156622715945e-05, + "loss": 1.803, + "step": 10464 + }, + { + "epoch": 3.2120933087783916, + "grad_norm": 0.2791301906108856, + "learning_rate": 7.936754353707497e-05, + "loss": 1.7601, + "step": 10465 + }, + { + "epoch": 3.212400245549417, + "grad_norm": 0.3039831519126892, + "learning_rate": 7.93635205567632e-05, + "loss": 1.7864, + "step": 10466 + }, + { + "epoch": 3.212707182320442, + "grad_norm": 0.28498128056526184, + "learning_rate": 7.935949728626392e-05, + "loss": 1.7745, + "step": 10467 + }, + { + "epoch": 3.213014119091467, + "grad_norm": 0.2908780872821808, + "learning_rate": 7.935547372561687e-05, + "loss": 1.8281, + "step": 10468 + }, + { + "epoch": 3.2133210558624925, + "grad_norm": 0.26148509979248047, + "learning_rate": 
7.935144987486183e-05, + "loss": 1.8545, + "step": 10469 + }, + { + "epoch": 3.2136279926335174, + "grad_norm": 0.2853962481021881, + "learning_rate": 7.934742573403856e-05, + "loss": 1.7765, + "step": 10470 + }, + { + "epoch": 3.2139349294045427, + "grad_norm": 0.26497501134872437, + "learning_rate": 7.934340130318681e-05, + "loss": 1.7472, + "step": 10471 + }, + { + "epoch": 3.214241866175568, + "grad_norm": 0.2806912660598755, + "learning_rate": 7.933937658234638e-05, + "loss": 1.7879, + "step": 10472 + }, + { + "epoch": 3.214548802946593, + "grad_norm": 0.2699974477291107, + "learning_rate": 7.933535157155705e-05, + "loss": 1.7539, + "step": 10473 + }, + { + "epoch": 3.2148557397176183, + "grad_norm": 0.22714731097221375, + "learning_rate": 7.933132627085856e-05, + "loss": 1.7861, + "step": 10474 + }, + { + "epoch": 3.215162676488643, + "grad_norm": 0.291340708732605, + "learning_rate": 7.932730068029072e-05, + "loss": 1.8381, + "step": 10475 + }, + { + "epoch": 3.2154696132596685, + "grad_norm": 0.3257324695587158, + "learning_rate": 7.93232747998933e-05, + "loss": 1.8293, + "step": 10476 + }, + { + "epoch": 3.215776550030694, + "grad_norm": 0.3518911600112915, + "learning_rate": 7.93192486297061e-05, + "loss": 1.853, + "step": 10477 + }, + { + "epoch": 3.2160834868017187, + "grad_norm": 0.27663540840148926, + "learning_rate": 7.93152221697689e-05, + "loss": 1.7831, + "step": 10478 + }, + { + "epoch": 3.216390423572744, + "grad_norm": 0.3153248429298401, + "learning_rate": 7.931119542012149e-05, + "loss": 1.7443, + "step": 10479 + }, + { + "epoch": 3.216697360343769, + "grad_norm": 0.2919597029685974, + "learning_rate": 7.930716838080368e-05, + "loss": 1.8108, + "step": 10480 + }, + { + "epoch": 3.2170042971147943, + "grad_norm": 0.26892516016960144, + "learning_rate": 7.930314105185524e-05, + "loss": 1.7791, + "step": 10481 + }, + { + "epoch": 3.2173112338858196, + "grad_norm": 0.2486005276441574, + "learning_rate": 7.929911343331599e-05, + "loss": 1.8184, + 
"step": 10482 + }, + { + "epoch": 3.2176181706568445, + "grad_norm": 0.260728120803833, + "learning_rate": 7.929508552522571e-05, + "loss": 1.7933, + "step": 10483 + }, + { + "epoch": 3.21792510742787, + "grad_norm": 0.3081948757171631, + "learning_rate": 7.929105732762425e-05, + "loss": 1.7732, + "step": 10484 + }, + { + "epoch": 3.218232044198895, + "grad_norm": 0.3807671368122101, + "learning_rate": 7.928702884055138e-05, + "loss": 1.7652, + "step": 10485 + }, + { + "epoch": 3.21853898096992, + "grad_norm": 0.31637755036354065, + "learning_rate": 7.928300006404692e-05, + "loss": 1.7605, + "step": 10486 + }, + { + "epoch": 3.2188459177409454, + "grad_norm": 0.2812853455543518, + "learning_rate": 7.927897099815071e-05, + "loss": 1.7925, + "step": 10487 + }, + { + "epoch": 3.2191528545119708, + "grad_norm": 0.3472350239753723, + "learning_rate": 7.927494164290253e-05, + "loss": 1.8252, + "step": 10488 + }, + { + "epoch": 3.2194597912829956, + "grad_norm": 0.4202714264392853, + "learning_rate": 7.927091199834222e-05, + "loss": 1.7993, + "step": 10489 + }, + { + "epoch": 3.219766728054021, + "grad_norm": 0.44552353024482727, + "learning_rate": 7.92668820645096e-05, + "loss": 1.8609, + "step": 10490 + }, + { + "epoch": 3.220073664825046, + "grad_norm": 0.38964664936065674, + "learning_rate": 7.926285184144451e-05, + "loss": 1.864, + "step": 10491 + }, + { + "epoch": 3.220380601596071, + "grad_norm": 0.2978462278842926, + "learning_rate": 7.925882132918676e-05, + "loss": 1.7892, + "step": 10492 + }, + { + "epoch": 3.2206875383670965, + "grad_norm": 0.2520316243171692, + "learning_rate": 7.925479052777619e-05, + "loss": 1.7702, + "step": 10493 + }, + { + "epoch": 3.2209944751381214, + "grad_norm": 0.28151068091392517, + "learning_rate": 7.925075943725263e-05, + "loss": 1.7613, + "step": 10494 + }, + { + "epoch": 3.2213014119091468, + "grad_norm": 0.3346099555492401, + "learning_rate": 7.924672805765592e-05, + "loss": 1.894, + "step": 10495 + }, + { + "epoch": 
3.2216083486801717, + "grad_norm": 0.2981362044811249, + "learning_rate": 7.924269638902591e-05, + "loss": 1.8157, + "step": 10496 + }, + { + "epoch": 3.221915285451197, + "grad_norm": 0.2561499774456024, + "learning_rate": 7.923866443140242e-05, + "loss": 1.8259, + "step": 10497 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.26480481028556824, + "learning_rate": 7.923463218482532e-05, + "loss": 1.7856, + "step": 10498 + }, + { + "epoch": 3.222529158993247, + "grad_norm": 0.24103692173957825, + "learning_rate": 7.923059964933446e-05, + "loss": 1.7765, + "step": 10499 + }, + { + "epoch": 3.2228360957642725, + "grad_norm": 0.2399173080921173, + "learning_rate": 7.922656682496967e-05, + "loss": 1.8216, + "step": 10500 + }, + { + "epoch": 3.223143032535298, + "grad_norm": 0.24530018866062164, + "learning_rate": 7.922253371177082e-05, + "loss": 1.8155, + "step": 10501 + }, + { + "epoch": 3.2234499693063228, + "grad_norm": 0.23298653960227966, + "learning_rate": 7.921850030977775e-05, + "loss": 1.7843, + "step": 10502 + }, + { + "epoch": 3.223756906077348, + "grad_norm": 0.3053973317146301, + "learning_rate": 7.921446661903035e-05, + "loss": 1.8113, + "step": 10503 + }, + { + "epoch": 3.2240638428483734, + "grad_norm": 0.261336088180542, + "learning_rate": 7.921043263956847e-05, + "loss": 1.8073, + "step": 10504 + }, + { + "epoch": 3.2243707796193983, + "grad_norm": 0.24877268075942993, + "learning_rate": 7.920639837143195e-05, + "loss": 1.8344, + "step": 10505 + }, + { + "epoch": 3.2246777163904237, + "grad_norm": 0.26784422993659973, + "learning_rate": 7.920236381466071e-05, + "loss": 1.7757, + "step": 10506 + }, + { + "epoch": 3.2249846531614486, + "grad_norm": 0.2672121226787567, + "learning_rate": 7.919832896929458e-05, + "loss": 1.8384, + "step": 10507 + }, + { + "epoch": 3.225291589932474, + "grad_norm": 0.27254921197891235, + "learning_rate": 7.919429383537346e-05, + "loss": 1.8056, + "step": 10508 + }, + { + "epoch": 3.2255985267034992, + "grad_norm": 
0.24467822909355164, + "learning_rate": 7.91902584129372e-05, + "loss": 1.8109, + "step": 10509 + }, + { + "epoch": 3.225905463474524, + "grad_norm": 0.25966358184814453, + "learning_rate": 7.918622270202571e-05, + "loss": 1.82, + "step": 10510 + }, + { + "epoch": 3.2262124002455494, + "grad_norm": 0.28601330518722534, + "learning_rate": 7.918218670267886e-05, + "loss": 1.7266, + "step": 10511 + }, + { + "epoch": 3.226519337016575, + "grad_norm": 0.4017516076564789, + "learning_rate": 7.917815041493653e-05, + "loss": 1.8408, + "step": 10512 + }, + { + "epoch": 3.2268262737875997, + "grad_norm": 0.3995787501335144, + "learning_rate": 7.917411383883862e-05, + "loss": 1.8441, + "step": 10513 + }, + { + "epoch": 3.227133210558625, + "grad_norm": 0.26997458934783936, + "learning_rate": 7.917007697442502e-05, + "loss": 1.8078, + "step": 10514 + }, + { + "epoch": 3.22744014732965, + "grad_norm": 0.34353014826774597, + "learning_rate": 7.916603982173562e-05, + "loss": 1.7523, + "step": 10515 + }, + { + "epoch": 3.2277470841006752, + "grad_norm": 0.39522337913513184, + "learning_rate": 7.916200238081032e-05, + "loss": 1.7532, + "step": 10516 + }, + { + "epoch": 3.2280540208717006, + "grad_norm": 0.4176923334598541, + "learning_rate": 7.915796465168903e-05, + "loss": 1.8895, + "step": 10517 + }, + { + "epoch": 3.2283609576427255, + "grad_norm": 0.30232906341552734, + "learning_rate": 7.915392663441164e-05, + "loss": 1.8223, + "step": 10518 + }, + { + "epoch": 3.228667894413751, + "grad_norm": 0.230951726436615, + "learning_rate": 7.914988832901805e-05, + "loss": 1.7265, + "step": 10519 + }, + { + "epoch": 3.228974831184776, + "grad_norm": 0.26381877064704895, + "learning_rate": 7.914584973554819e-05, + "loss": 1.7858, + "step": 10520 + }, + { + "epoch": 3.229281767955801, + "grad_norm": 0.2500905394554138, + "learning_rate": 7.914181085404194e-05, + "loss": 1.7606, + "step": 10521 + }, + { + "epoch": 3.2295887047268264, + "grad_norm": 0.2585415840148926, + "learning_rate": 
7.913777168453925e-05, + "loss": 1.787, + "step": 10522 + }, + { + "epoch": 3.2298956414978512, + "grad_norm": 0.24236604571342468, + "learning_rate": 7.913373222708001e-05, + "loss": 1.7623, + "step": 10523 + }, + { + "epoch": 3.2302025782688766, + "grad_norm": 0.3113093078136444, + "learning_rate": 7.912969248170416e-05, + "loss": 1.7736, + "step": 10524 + }, + { + "epoch": 3.230509515039902, + "grad_norm": 0.3341342806816101, + "learning_rate": 7.912565244845163e-05, + "loss": 1.8583, + "step": 10525 + }, + { + "epoch": 3.230816451810927, + "grad_norm": 0.2644478678703308, + "learning_rate": 7.912161212736231e-05, + "loss": 1.7891, + "step": 10526 + }, + { + "epoch": 3.231123388581952, + "grad_norm": 0.22916561365127563, + "learning_rate": 7.911757151847616e-05, + "loss": 1.7642, + "step": 10527 + }, + { + "epoch": 3.2314303253529775, + "grad_norm": 0.24204877018928528, + "learning_rate": 7.911353062183309e-05, + "loss": 1.8522, + "step": 10528 + }, + { + "epoch": 3.2317372621240024, + "grad_norm": 0.25339365005493164, + "learning_rate": 7.910948943747307e-05, + "loss": 1.8391, + "step": 10529 + }, + { + "epoch": 3.2320441988950277, + "grad_norm": 0.2652709186077118, + "learning_rate": 7.9105447965436e-05, + "loss": 1.7735, + "step": 10530 + }, + { + "epoch": 3.2323511356660526, + "grad_norm": 0.2711019217967987, + "learning_rate": 7.910140620576183e-05, + "loss": 1.8491, + "step": 10531 + }, + { + "epoch": 3.232658072437078, + "grad_norm": 0.2598389685153961, + "learning_rate": 7.909736415849052e-05, + "loss": 1.8417, + "step": 10532 + }, + { + "epoch": 3.2329650092081033, + "grad_norm": 0.278037428855896, + "learning_rate": 7.9093321823662e-05, + "loss": 1.8774, + "step": 10533 + }, + { + "epoch": 3.233271945979128, + "grad_norm": 0.32015568017959595, + "learning_rate": 7.90892792013162e-05, + "loss": 1.8873, + "step": 10534 + }, + { + "epoch": 3.2335788827501535, + "grad_norm": 0.3098098635673523, + "learning_rate": 7.908523629149312e-05, + "loss": 1.8141, + 
"step": 10535 + }, + { + "epoch": 3.233885819521179, + "grad_norm": 0.3127266764640808, + "learning_rate": 7.908119309423267e-05, + "loss": 1.8587, + "step": 10536 + }, + { + "epoch": 3.2341927562922037, + "grad_norm": 0.3085545301437378, + "learning_rate": 7.907714960957483e-05, + "loss": 1.8544, + "step": 10537 + }, + { + "epoch": 3.234499693063229, + "grad_norm": 0.3051004409790039, + "learning_rate": 7.907310583755956e-05, + "loss": 1.8144, + "step": 10538 + }, + { + "epoch": 3.234806629834254, + "grad_norm": 0.3458186686038971, + "learning_rate": 7.906906177822682e-05, + "loss": 1.8388, + "step": 10539 + }, + { + "epoch": 3.2351135666052793, + "grad_norm": 0.37064439058303833, + "learning_rate": 7.906501743161656e-05, + "loss": 1.7574, + "step": 10540 + }, + { + "epoch": 3.2354205033763046, + "grad_norm": 0.3382316827774048, + "learning_rate": 7.906097279776876e-05, + "loss": 1.8785, + "step": 10541 + }, + { + "epoch": 3.2357274401473295, + "grad_norm": 0.254802942276001, + "learning_rate": 7.905692787672341e-05, + "loss": 1.8276, + "step": 10542 + }, + { + "epoch": 3.236034376918355, + "grad_norm": 0.3362341523170471, + "learning_rate": 7.905288266852047e-05, + "loss": 1.8057, + "step": 10543 + }, + { + "epoch": 3.23634131368938, + "grad_norm": 0.38821661472320557, + "learning_rate": 7.904883717319988e-05, + "loss": 1.7841, + "step": 10544 + }, + { + "epoch": 3.236648250460405, + "grad_norm": 0.33889076113700867, + "learning_rate": 7.90447913908017e-05, + "loss": 1.7892, + "step": 10545 + }, + { + "epoch": 3.2369551872314304, + "grad_norm": 0.2741014361381531, + "learning_rate": 7.904074532136585e-05, + "loss": 1.7611, + "step": 10546 + }, + { + "epoch": 3.2372621240024557, + "grad_norm": 0.28950995206832886, + "learning_rate": 7.903669896493233e-05, + "loss": 1.7963, + "step": 10547 + }, + { + "epoch": 3.2375690607734806, + "grad_norm": 0.30647143721580505, + "learning_rate": 7.903265232154113e-05, + "loss": 1.7522, + "step": 10548 + }, + { + "epoch": 
3.237875997544506, + "grad_norm": 0.30428263545036316, + "learning_rate": 7.902860539123225e-05, + "loss": 1.7383, + "step": 10549 + }, + { + "epoch": 3.238182934315531, + "grad_norm": 0.2357146292924881, + "learning_rate": 7.902455817404569e-05, + "loss": 1.7243, + "step": 10550 + }, + { + "epoch": 3.238489871086556, + "grad_norm": 0.3125104606151581, + "learning_rate": 7.90205106700214e-05, + "loss": 1.8542, + "step": 10551 + }, + { + "epoch": 3.2387968078575815, + "grad_norm": 0.25797244906425476, + "learning_rate": 7.901646287919944e-05, + "loss": 1.8374, + "step": 10552 + }, + { + "epoch": 3.2391037446286064, + "grad_norm": 0.3127591907978058, + "learning_rate": 7.901241480161978e-05, + "loss": 1.9457, + "step": 10553 + }, + { + "epoch": 3.2394106813996317, + "grad_norm": 0.2971835434436798, + "learning_rate": 7.900836643732243e-05, + "loss": 1.7933, + "step": 10554 + }, + { + "epoch": 3.2397176181706566, + "grad_norm": 0.28931814432144165, + "learning_rate": 7.90043177863474e-05, + "loss": 1.8201, + "step": 10555 + }, + { + "epoch": 3.240024554941682, + "grad_norm": 0.3348724842071533, + "learning_rate": 7.90002688487347e-05, + "loss": 1.8718, + "step": 10556 + }, + { + "epoch": 3.2403314917127073, + "grad_norm": 0.28566426038742065, + "learning_rate": 7.899621962452436e-05, + "loss": 1.805, + "step": 10557 + }, + { + "epoch": 3.240638428483732, + "grad_norm": 0.27074119448661804, + "learning_rate": 7.899217011375637e-05, + "loss": 1.842, + "step": 10558 + }, + { + "epoch": 3.2409453652547575, + "grad_norm": 0.27014291286468506, + "learning_rate": 7.898812031647076e-05, + "loss": 1.8156, + "step": 10559 + }, + { + "epoch": 3.241252302025783, + "grad_norm": 0.28087863326072693, + "learning_rate": 7.898407023270756e-05, + "loss": 1.8399, + "step": 10560 + }, + { + "epoch": 3.2415592387968077, + "grad_norm": 0.2641037404537201, + "learning_rate": 7.898001986250679e-05, + "loss": 1.7977, + "step": 10561 + }, + { + "epoch": 3.241866175567833, + "grad_norm": 
0.2843858301639557, + "learning_rate": 7.897596920590848e-05, + "loss": 1.834, + "step": 10562 + }, + { + "epoch": 3.2421731123388584, + "grad_norm": 0.2724611163139343, + "learning_rate": 7.897191826295266e-05, + "loss": 1.7547, + "step": 10563 + }, + { + "epoch": 3.2424800491098833, + "grad_norm": 0.2583858370780945, + "learning_rate": 7.896786703367935e-05, + "loss": 1.7658, + "step": 10564 + }, + { + "epoch": 3.2427869858809086, + "grad_norm": 0.2666650712490082, + "learning_rate": 7.896381551812861e-05, + "loss": 1.8017, + "step": 10565 + }, + { + "epoch": 3.2430939226519335, + "grad_norm": 0.23269347846508026, + "learning_rate": 7.895976371634047e-05, + "loss": 1.8267, + "step": 10566 + }, + { + "epoch": 3.243400859422959, + "grad_norm": 0.27865225076675415, + "learning_rate": 7.895571162835496e-05, + "loss": 1.8093, + "step": 10567 + }, + { + "epoch": 3.243707796193984, + "grad_norm": 0.29445022344589233, + "learning_rate": 7.895165925421216e-05, + "loss": 1.7999, + "step": 10568 + }, + { + "epoch": 3.244014732965009, + "grad_norm": 0.32135528326034546, + "learning_rate": 7.894760659395206e-05, + "loss": 1.8405, + "step": 10569 + }, + { + "epoch": 3.2443216697360344, + "grad_norm": 0.3409091532230377, + "learning_rate": 7.894355364761477e-05, + "loss": 1.7861, + "step": 10570 + }, + { + "epoch": 3.2446286065070598, + "grad_norm": 0.3379025459289551, + "learning_rate": 7.893950041524032e-05, + "loss": 1.8495, + "step": 10571 + }, + { + "epoch": 3.2449355432780846, + "grad_norm": 0.2843063473701477, + "learning_rate": 7.893544689686874e-05, + "loss": 1.7888, + "step": 10572 + }, + { + "epoch": 3.24524248004911, + "grad_norm": 0.2914074957370758, + "learning_rate": 7.893139309254013e-05, + "loss": 1.7866, + "step": 10573 + }, + { + "epoch": 3.245549416820135, + "grad_norm": 0.39855021238327026, + "learning_rate": 7.892733900229454e-05, + "loss": 1.7865, + "step": 10574 + }, + { + "epoch": 3.24585635359116, + "grad_norm": 0.4232102632522583, + "learning_rate": 
7.892328462617203e-05, + "loss": 1.8443, + "step": 10575 + }, + { + "epoch": 3.2461632903621855, + "grad_norm": 0.390794962644577, + "learning_rate": 7.891922996421267e-05, + "loss": 1.8735, + "step": 10576 + }, + { + "epoch": 3.2464702271332104, + "grad_norm": 0.3051595687866211, + "learning_rate": 7.891517501645653e-05, + "loss": 1.8654, + "step": 10577 + }, + { + "epoch": 3.2467771639042358, + "grad_norm": 0.25363096594810486, + "learning_rate": 7.891111978294367e-05, + "loss": 1.7602, + "step": 10578 + }, + { + "epoch": 3.247084100675261, + "grad_norm": 0.29785794019699097, + "learning_rate": 7.890706426371419e-05, + "loss": 1.8242, + "step": 10579 + }, + { + "epoch": 3.247391037446286, + "grad_norm": 0.346162885427475, + "learning_rate": 7.890300845880816e-05, + "loss": 1.8551, + "step": 10580 + }, + { + "epoch": 3.2476979742173113, + "grad_norm": 0.33906155824661255, + "learning_rate": 7.889895236826566e-05, + "loss": 1.765, + "step": 10581 + }, + { + "epoch": 3.248004910988336, + "grad_norm": 0.26083165407180786, + "learning_rate": 7.889489599212676e-05, + "loss": 1.8246, + "step": 10582 + }, + { + "epoch": 3.2483118477593615, + "grad_norm": 0.3042019009590149, + "learning_rate": 7.889083933043157e-05, + "loss": 1.9017, + "step": 10583 + }, + { + "epoch": 3.248618784530387, + "grad_norm": 0.34833577275276184, + "learning_rate": 7.888678238322018e-05, + "loss": 1.7863, + "step": 10584 + }, + { + "epoch": 3.2489257213014118, + "grad_norm": 0.34436655044555664, + "learning_rate": 7.888272515053267e-05, + "loss": 1.7937, + "step": 10585 + }, + { + "epoch": 3.249232658072437, + "grad_norm": 0.2550172507762909, + "learning_rate": 7.887866763240914e-05, + "loss": 1.7615, + "step": 10586 + }, + { + "epoch": 3.2495395948434624, + "grad_norm": 0.3334405720233917, + "learning_rate": 7.88746098288897e-05, + "loss": 1.7465, + "step": 10587 + }, + { + "epoch": 3.2498465316144873, + "grad_norm": 0.4668157696723938, + "learning_rate": 7.887055174001443e-05, + "loss": 
1.7836, + "step": 10588 + }, + { + "epoch": 3.2501534683855127, + "grad_norm": 0.524680495262146, + "learning_rate": 7.886649336582344e-05, + "loss": 1.844, + "step": 10589 + }, + { + "epoch": 3.250460405156538, + "grad_norm": 0.36859074234962463, + "learning_rate": 7.886243470635685e-05, + "loss": 1.8072, + "step": 10590 + }, + { + "epoch": 3.250767341927563, + "grad_norm": 0.32370296120643616, + "learning_rate": 7.885837576165478e-05, + "loss": 1.802, + "step": 10591 + }, + { + "epoch": 3.2510742786985882, + "grad_norm": 0.3506374955177307, + "learning_rate": 7.88543165317573e-05, + "loss": 1.7965, + "step": 10592 + }, + { + "epoch": 3.251381215469613, + "grad_norm": 0.39058688282966614, + "learning_rate": 7.885025701670457e-05, + "loss": 1.7987, + "step": 10593 + }, + { + "epoch": 3.2516881522406385, + "grad_norm": 0.3042154014110565, + "learning_rate": 7.884619721653669e-05, + "loss": 1.8345, + "step": 10594 + }, + { + "epoch": 3.251995089011664, + "grad_norm": 0.2249498963356018, + "learning_rate": 7.884213713129378e-05, + "loss": 1.7796, + "step": 10595 + }, + { + "epoch": 3.2523020257826887, + "grad_norm": 0.2701997458934784, + "learning_rate": 7.883807676101595e-05, + "loss": 1.8027, + "step": 10596 + }, + { + "epoch": 3.252608962553714, + "grad_norm": 0.2574785053730011, + "learning_rate": 7.883401610574336e-05, + "loss": 1.7878, + "step": 10597 + }, + { + "epoch": 3.252915899324739, + "grad_norm": 0.24964739382266998, + "learning_rate": 7.882995516551613e-05, + "loss": 1.7612, + "step": 10598 + }, + { + "epoch": 3.2532228360957642, + "grad_norm": 0.2519865930080414, + "learning_rate": 7.882589394037437e-05, + "loss": 1.7583, + "step": 10599 + }, + { + "epoch": 3.2535297728667896, + "grad_norm": 0.23174463212490082, + "learning_rate": 7.882183243035823e-05, + "loss": 1.7607, + "step": 10600 + }, + { + "epoch": 3.2538367096378145, + "grad_norm": 0.28103554248809814, + "learning_rate": 7.881777063550786e-05, + "loss": 1.904, + "step": 10601 + }, + { + 
"epoch": 3.25414364640884, + "grad_norm": 0.265677809715271, + "learning_rate": 7.881370855586339e-05, + "loss": 1.8169, + "step": 10602 + }, + { + "epoch": 3.254450583179865, + "grad_norm": 0.2539603114128113, + "learning_rate": 7.880964619146493e-05, + "loss": 1.8439, + "step": 10603 + }, + { + "epoch": 3.25475751995089, + "grad_norm": 0.2741886377334595, + "learning_rate": 7.88055835423527e-05, + "loss": 1.8737, + "step": 10604 + }, + { + "epoch": 3.2550644567219154, + "grad_norm": 0.27548348903656006, + "learning_rate": 7.88015206085668e-05, + "loss": 1.8385, + "step": 10605 + }, + { + "epoch": 3.2553713934929407, + "grad_norm": 0.2958502769470215, + "learning_rate": 7.879745739014739e-05, + "loss": 1.8603, + "step": 10606 + }, + { + "epoch": 3.2556783302639656, + "grad_norm": 0.2728644907474518, + "learning_rate": 7.879339388713462e-05, + "loss": 1.8, + "step": 10607 + }, + { + "epoch": 3.255985267034991, + "grad_norm": 0.28718289732933044, + "learning_rate": 7.878933009956866e-05, + "loss": 1.7803, + "step": 10608 + }, + { + "epoch": 3.256292203806016, + "grad_norm": 0.2989691197872162, + "learning_rate": 7.878526602748967e-05, + "loss": 1.8155, + "step": 10609 + }, + { + "epoch": 3.256599140577041, + "grad_norm": 0.24515527486801147, + "learning_rate": 7.87812016709378e-05, + "loss": 1.7623, + "step": 10610 + }, + { + "epoch": 3.2569060773480665, + "grad_norm": 0.29946041107177734, + "learning_rate": 7.877713702995324e-05, + "loss": 1.8097, + "step": 10611 + }, + { + "epoch": 3.2572130141190914, + "grad_norm": 0.2854483723640442, + "learning_rate": 7.877307210457613e-05, + "loss": 1.8088, + "step": 10612 + }, + { + "epoch": 3.2575199508901167, + "grad_norm": 0.27812930941581726, + "learning_rate": 7.876900689484668e-05, + "loss": 1.8151, + "step": 10613 + }, + { + "epoch": 3.2578268876611416, + "grad_norm": 0.2658015787601471, + "learning_rate": 7.876494140080503e-05, + "loss": 1.8314, + "step": 10614 + }, + { + "epoch": 3.258133824432167, + "grad_norm": 
0.28935661911964417, + "learning_rate": 7.876087562249137e-05, + "loss": 1.7948, + "step": 10615 + }, + { + "epoch": 3.2584407612031923, + "grad_norm": 0.27497121691703796, + "learning_rate": 7.875680955994587e-05, + "loss": 1.7964, + "step": 10616 + }, + { + "epoch": 3.258747697974217, + "grad_norm": 0.3313405513763428, + "learning_rate": 7.875274321320873e-05, + "loss": 1.8143, + "step": 10617 + }, + { + "epoch": 3.2590546347452425, + "grad_norm": 0.3217218816280365, + "learning_rate": 7.874867658232013e-05, + "loss": 1.7749, + "step": 10618 + }, + { + "epoch": 3.259361571516268, + "grad_norm": 0.25105544924736023, + "learning_rate": 7.874460966732025e-05, + "loss": 1.7834, + "step": 10619 + }, + { + "epoch": 3.2596685082872927, + "grad_norm": 0.2931382358074188, + "learning_rate": 7.874054246824931e-05, + "loss": 1.8252, + "step": 10620 + }, + { + "epoch": 3.259975445058318, + "grad_norm": 0.2803363502025604, + "learning_rate": 7.873647498514747e-05, + "loss": 1.7527, + "step": 10621 + }, + { + "epoch": 3.2602823818293434, + "grad_norm": 0.29857927560806274, + "learning_rate": 7.873240721805492e-05, + "loss": 1.8085, + "step": 10622 + }, + { + "epoch": 3.2605893186003683, + "grad_norm": 0.24864110350608826, + "learning_rate": 7.872833916701192e-05, + "loss": 1.7509, + "step": 10623 + }, + { + "epoch": 3.2608962553713936, + "grad_norm": 0.24105949699878693, + "learning_rate": 7.872427083205862e-05, + "loss": 1.7871, + "step": 10624 + }, + { + "epoch": 3.2612031921424185, + "grad_norm": 0.2429245114326477, + "learning_rate": 7.872020221323523e-05, + "loss": 1.777, + "step": 10625 + }, + { + "epoch": 3.261510128913444, + "grad_norm": 0.234287828207016, + "learning_rate": 7.871613331058197e-05, + "loss": 1.8001, + "step": 10626 + }, + { + "epoch": 3.261817065684469, + "grad_norm": 0.3463406264781952, + "learning_rate": 7.871206412413905e-05, + "loss": 1.8925, + "step": 10627 + }, + { + "epoch": 3.262124002455494, + "grad_norm": 0.26798921823501587, + 
"learning_rate": 7.87079946539467e-05, + "loss": 1.7963, + "step": 10628 + }, + { + "epoch": 3.2624309392265194, + "grad_norm": 0.28603312373161316, + "learning_rate": 7.87039249000451e-05, + "loss": 1.8308, + "step": 10629 + }, + { + "epoch": 3.2627378759975443, + "grad_norm": 0.2717527747154236, + "learning_rate": 7.86998548624745e-05, + "loss": 1.8246, + "step": 10630 + }, + { + "epoch": 3.2630448127685696, + "grad_norm": 0.32215580344200134, + "learning_rate": 7.86957845412751e-05, + "loss": 1.7278, + "step": 10631 + }, + { + "epoch": 3.263351749539595, + "grad_norm": 0.3578735589981079, + "learning_rate": 7.869171393648717e-05, + "loss": 1.7288, + "step": 10632 + }, + { + "epoch": 3.26365868631062, + "grad_norm": 0.3120707869529724, + "learning_rate": 7.868764304815089e-05, + "loss": 1.7971, + "step": 10633 + }, + { + "epoch": 3.263965623081645, + "grad_norm": 0.27419236302375793, + "learning_rate": 7.86835718763065e-05, + "loss": 1.8529, + "step": 10634 + }, + { + "epoch": 3.2642725598526705, + "grad_norm": 0.3200531601905823, + "learning_rate": 7.867950042099423e-05, + "loss": 1.7892, + "step": 10635 + }, + { + "epoch": 3.2645794966236954, + "grad_norm": 0.325706422328949, + "learning_rate": 7.867542868225435e-05, + "loss": 1.8236, + "step": 10636 + }, + { + "epoch": 3.2648864333947207, + "grad_norm": 0.2950136065483093, + "learning_rate": 7.867135666012707e-05, + "loss": 1.8163, + "step": 10637 + }, + { + "epoch": 3.265193370165746, + "grad_norm": 0.2772117257118225, + "learning_rate": 7.866728435465263e-05, + "loss": 1.8373, + "step": 10638 + }, + { + "epoch": 3.265500306936771, + "grad_norm": 0.2887401580810547, + "learning_rate": 7.866321176587129e-05, + "loss": 1.7756, + "step": 10639 + }, + { + "epoch": 3.2658072437077963, + "grad_norm": 0.3474489152431488, + "learning_rate": 7.865913889382329e-05, + "loss": 1.7539, + "step": 10640 + }, + { + "epoch": 3.266114180478821, + "grad_norm": 0.3433493971824646, + "learning_rate": 7.865506573854888e-05, + 
"loss": 1.7987, + "step": 10641 + }, + { + "epoch": 3.2664211172498465, + "grad_norm": 0.3075394630432129, + "learning_rate": 7.865099230008832e-05, + "loss": 1.7907, + "step": 10642 + }, + { + "epoch": 3.266728054020872, + "grad_norm": 0.24817697703838348, + "learning_rate": 7.864691857848187e-05, + "loss": 1.7941, + "step": 10643 + }, + { + "epoch": 3.2670349907918967, + "grad_norm": 0.290147602558136, + "learning_rate": 7.864284457376976e-05, + "loss": 1.9125, + "step": 10644 + }, + { + "epoch": 3.267341927562922, + "grad_norm": 0.253684937953949, + "learning_rate": 7.863877028599229e-05, + "loss": 1.8084, + "step": 10645 + }, + { + "epoch": 3.267648864333947, + "grad_norm": 0.26349252462387085, + "learning_rate": 7.863469571518969e-05, + "loss": 1.7548, + "step": 10646 + }, + { + "epoch": 3.2679558011049723, + "grad_norm": 0.30568864941596985, + "learning_rate": 7.863062086140224e-05, + "loss": 1.8551, + "step": 10647 + }, + { + "epoch": 3.2682627378759976, + "grad_norm": 0.2866690456867218, + "learning_rate": 7.862654572467024e-05, + "loss": 1.8145, + "step": 10648 + }, + { + "epoch": 3.2685696746470225, + "grad_norm": 0.32022854685783386, + "learning_rate": 7.862247030503391e-05, + "loss": 1.896, + "step": 10649 + }, + { + "epoch": 3.268876611418048, + "grad_norm": 0.25260284543037415, + "learning_rate": 7.861839460253356e-05, + "loss": 1.814, + "step": 10650 + }, + { + "epoch": 3.269183548189073, + "grad_norm": 0.26776066422462463, + "learning_rate": 7.861431861720947e-05, + "loss": 1.7755, + "step": 10651 + }, + { + "epoch": 3.269490484960098, + "grad_norm": 0.26514193415641785, + "learning_rate": 7.861024234910191e-05, + "loss": 1.7606, + "step": 10652 + }, + { + "epoch": 3.2697974217311234, + "grad_norm": 0.27213940024375916, + "learning_rate": 7.860616579825116e-05, + "loss": 1.8074, + "step": 10653 + }, + { + "epoch": 3.2701043585021488, + "grad_norm": 0.29192888736724854, + "learning_rate": 7.860208896469752e-05, + "loss": 1.8436, + "step": 10654 + }, 
+ { + "epoch": 3.2704112952731736, + "grad_norm": 0.3772370219230652, + "learning_rate": 7.859801184848127e-05, + "loss": 1.8096, + "step": 10655 + }, + { + "epoch": 3.270718232044199, + "grad_norm": 0.4574970006942749, + "learning_rate": 7.859393444964269e-05, + "loss": 1.7612, + "step": 10656 + }, + { + "epoch": 3.271025168815224, + "grad_norm": 0.4614393413066864, + "learning_rate": 7.858985676822211e-05, + "loss": 1.8529, + "step": 10657 + }, + { + "epoch": 3.271332105586249, + "grad_norm": 0.33567267656326294, + "learning_rate": 7.85857788042598e-05, + "loss": 1.8391, + "step": 10658 + }, + { + "epoch": 3.2716390423572745, + "grad_norm": 0.2564064860343933, + "learning_rate": 7.858170055779609e-05, + "loss": 1.7621, + "step": 10659 + }, + { + "epoch": 3.2719459791282994, + "grad_norm": 0.26769882440567017, + "learning_rate": 7.857762202887122e-05, + "loss": 1.8145, + "step": 10660 + }, + { + "epoch": 3.2722529158993248, + "grad_norm": 0.262008935213089, + "learning_rate": 7.857354321752558e-05, + "loss": 1.7513, + "step": 10661 + }, + { + "epoch": 3.27255985267035, + "grad_norm": 0.26494377851486206, + "learning_rate": 7.856946412379942e-05, + "loss": 1.8071, + "step": 10662 + }, + { + "epoch": 3.272866789441375, + "grad_norm": 0.25613999366760254, + "learning_rate": 7.856538474773307e-05, + "loss": 1.8775, + "step": 10663 + }, + { + "epoch": 3.2731737262124003, + "grad_norm": 0.24789929389953613, + "learning_rate": 7.856130508936684e-05, + "loss": 1.8055, + "step": 10664 + }, + { + "epoch": 3.2734806629834257, + "grad_norm": 0.29111939668655396, + "learning_rate": 7.855722514874107e-05, + "loss": 1.8114, + "step": 10665 + }, + { + "epoch": 3.2737875997544506, + "grad_norm": 0.30511030554771423, + "learning_rate": 7.855314492589605e-05, + "loss": 1.8131, + "step": 10666 + }, + { + "epoch": 3.274094536525476, + "grad_norm": 0.2545989453792572, + "learning_rate": 7.854906442087212e-05, + "loss": 1.7933, + "step": 10667 + }, + { + "epoch": 3.2744014732965008, + 
"grad_norm": 0.26684823632240295, + "learning_rate": 7.85449836337096e-05, + "loss": 1.7604, + "step": 10668 + }, + { + "epoch": 3.274708410067526, + "grad_norm": 0.5097808837890625, + "learning_rate": 7.854090256444881e-05, + "loss": 1.777, + "step": 10669 + }, + { + "epoch": 3.2750153468385514, + "grad_norm": 0.27828142046928406, + "learning_rate": 7.853682121313011e-05, + "loss": 1.7885, + "step": 10670 + }, + { + "epoch": 3.2753222836095763, + "grad_norm": 0.2925552725791931, + "learning_rate": 7.853273957979381e-05, + "loss": 1.7962, + "step": 10671 + }, + { + "epoch": 3.2756292203806017, + "grad_norm": 0.284574955701828, + "learning_rate": 7.852865766448025e-05, + "loss": 1.8645, + "step": 10672 + }, + { + "epoch": 3.2759361571516266, + "grad_norm": 0.23407664895057678, + "learning_rate": 7.85245754672298e-05, + "loss": 1.7106, + "step": 10673 + }, + { + "epoch": 3.276243093922652, + "grad_norm": 0.2555919885635376, + "learning_rate": 7.852049298808274e-05, + "loss": 1.8237, + "step": 10674 + }, + { + "epoch": 3.2765500306936772, + "grad_norm": 0.26703694462776184, + "learning_rate": 7.851641022707947e-05, + "loss": 1.7844, + "step": 10675 + }, + { + "epoch": 3.276856967464702, + "grad_norm": 0.24889135360717773, + "learning_rate": 7.851232718426033e-05, + "loss": 1.7783, + "step": 10676 + }, + { + "epoch": 3.2771639042357275, + "grad_norm": 0.25770726799964905, + "learning_rate": 7.850824385966564e-05, + "loss": 1.8007, + "step": 10677 + }, + { + "epoch": 3.277470841006753, + "grad_norm": 0.31806984543800354, + "learning_rate": 7.850416025333578e-05, + "loss": 1.8623, + "step": 10678 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.2906930148601532, + "learning_rate": 7.850007636531111e-05, + "loss": 1.8315, + "step": 10679 + }, + { + "epoch": 3.278084714548803, + "grad_norm": 0.2802525460720062, + "learning_rate": 7.849599219563197e-05, + "loss": 1.8488, + "step": 10680 + }, + { + "epoch": 3.2783916513198283, + "grad_norm": 0.26150405406951904, + 
"learning_rate": 7.849190774433874e-05, + "loss": 1.7967, + "step": 10681 + }, + { + "epoch": 3.2786985880908532, + "grad_norm": 0.25863370299339294, + "learning_rate": 7.848782301147178e-05, + "loss": 1.864, + "step": 10682 + }, + { + "epoch": 3.2790055248618786, + "grad_norm": 0.25381043553352356, + "learning_rate": 7.848373799707145e-05, + "loss": 1.8239, + "step": 10683 + }, + { + "epoch": 3.2793124616329035, + "grad_norm": 0.2583387792110443, + "learning_rate": 7.847965270117814e-05, + "loss": 1.8449, + "step": 10684 + }, + { + "epoch": 3.279619398403929, + "grad_norm": 0.30759841203689575, + "learning_rate": 7.84755671238322e-05, + "loss": 1.7992, + "step": 10685 + }, + { + "epoch": 3.279926335174954, + "grad_norm": 0.4316023588180542, + "learning_rate": 7.847148126507402e-05, + "loss": 1.7912, + "step": 10686 + }, + { + "epoch": 3.280233271945979, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.846739512494396e-05, + "loss": 1.8831, + "step": 10687 + }, + { + "epoch": 3.2805402087170044, + "grad_norm": 0.318934828042984, + "learning_rate": 7.846330870348244e-05, + "loss": 1.8411, + "step": 10688 + }, + { + "epoch": 3.2808471454880292, + "grad_norm": 0.27755632996559143, + "learning_rate": 7.84592220007298e-05, + "loss": 1.8763, + "step": 10689 + }, + { + "epoch": 3.2811540822590546, + "grad_norm": 0.33544883131980896, + "learning_rate": 7.845513501672646e-05, + "loss": 1.731, + "step": 10690 + }, + { + "epoch": 3.28146101903008, + "grad_norm": 0.28299057483673096, + "learning_rate": 7.845104775151278e-05, + "loss": 1.813, + "step": 10691 + }, + { + "epoch": 3.281767955801105, + "grad_norm": 0.2761382460594177, + "learning_rate": 7.844696020512918e-05, + "loss": 1.8018, + "step": 10692 + }, + { + "epoch": 3.28207489257213, + "grad_norm": 0.2919033169746399, + "learning_rate": 7.844287237761605e-05, + "loss": 1.793, + "step": 10693 + }, + { + "epoch": 3.2823818293431555, + "grad_norm": 0.32922014594078064, + "learning_rate": 7.843878426901378e-05, + 
"loss": 1.8186, + "step": 10694 + }, + { + "epoch": 3.2826887661141804, + "grad_norm": 0.2818562090396881, + "learning_rate": 7.843469587936279e-05, + "loss": 1.7794, + "step": 10695 + }, + { + "epoch": 3.2829957028852057, + "grad_norm": 0.26414254307746887, + "learning_rate": 7.843060720870345e-05, + "loss": 1.7854, + "step": 10696 + }, + { + "epoch": 3.283302639656231, + "grad_norm": 0.28345760703086853, + "learning_rate": 7.842651825707618e-05, + "loss": 1.7659, + "step": 10697 + }, + { + "epoch": 3.283609576427256, + "grad_norm": 0.3522340655326843, + "learning_rate": 7.842242902452141e-05, + "loss": 1.8427, + "step": 10698 + }, + { + "epoch": 3.2839165131982813, + "grad_norm": 0.2861590087413788, + "learning_rate": 7.841833951107954e-05, + "loss": 1.7539, + "step": 10699 + }, + { + "epoch": 3.284223449969306, + "grad_norm": 0.2596624493598938, + "learning_rate": 7.841424971679099e-05, + "loss": 1.8407, + "step": 10700 + }, + { + "epoch": 3.2845303867403315, + "grad_norm": 0.2847718298435211, + "learning_rate": 7.841015964169616e-05, + "loss": 1.8085, + "step": 10701 + }, + { + "epoch": 3.284837323511357, + "grad_norm": 0.29566115140914917, + "learning_rate": 7.840606928583547e-05, + "loss": 1.7873, + "step": 10702 + }, + { + "epoch": 3.2851442602823817, + "grad_norm": 0.2752111256122589, + "learning_rate": 7.840197864924936e-05, + "loss": 1.8186, + "step": 10703 + }, + { + "epoch": 3.285451197053407, + "grad_norm": 0.2907958924770355, + "learning_rate": 7.839788773197826e-05, + "loss": 1.8081, + "step": 10704 + }, + { + "epoch": 3.285758133824432, + "grad_norm": 0.25808724761009216, + "learning_rate": 7.839379653406258e-05, + "loss": 1.7635, + "step": 10705 + }, + { + "epoch": 3.2860650705954573, + "grad_norm": 0.2732730507850647, + "learning_rate": 7.838970505554277e-05, + "loss": 1.8061, + "step": 10706 + }, + { + "epoch": 3.2863720073664826, + "grad_norm": 0.23820067942142487, + "learning_rate": 7.838561329645923e-05, + "loss": 1.8091, + "step": 10707 + }, 
+ { + "epoch": 3.2866789441375075, + "grad_norm": 0.24179396033287048, + "learning_rate": 7.838152125685245e-05, + "loss": 1.7513, + "step": 10708 + }, + { + "epoch": 3.286985880908533, + "grad_norm": 0.2627546787261963, + "learning_rate": 7.837742893676283e-05, + "loss": 1.8741, + "step": 10709 + }, + { + "epoch": 3.287292817679558, + "grad_norm": 0.2827817499637604, + "learning_rate": 7.837333633623083e-05, + "loss": 1.8387, + "step": 10710 + }, + { + "epoch": 3.287599754450583, + "grad_norm": 0.2666749060153961, + "learning_rate": 7.836924345529688e-05, + "loss": 1.8319, + "step": 10711 + }, + { + "epoch": 3.2879066912216084, + "grad_norm": 0.3403390944004059, + "learning_rate": 7.836515029400145e-05, + "loss": 1.7827, + "step": 10712 + }, + { + "epoch": 3.2882136279926337, + "grad_norm": 0.30646705627441406, + "learning_rate": 7.836105685238497e-05, + "loss": 1.8612, + "step": 10713 + }, + { + "epoch": 3.2885205647636586, + "grad_norm": 0.2580253481864929, + "learning_rate": 7.83569631304879e-05, + "loss": 1.7332, + "step": 10714 + }, + { + "epoch": 3.288827501534684, + "grad_norm": 0.23734542727470398, + "learning_rate": 7.835286912835071e-05, + "loss": 1.7899, + "step": 10715 + }, + { + "epoch": 3.289134438305709, + "grad_norm": 0.2457810491323471, + "learning_rate": 7.834877484601384e-05, + "loss": 1.8059, + "step": 10716 + }, + { + "epoch": 3.289441375076734, + "grad_norm": 0.2558443248271942, + "learning_rate": 7.834468028351778e-05, + "loss": 1.8689, + "step": 10717 + }, + { + "epoch": 3.2897483118477595, + "grad_norm": 0.26596710085868835, + "learning_rate": 7.834058544090298e-05, + "loss": 1.816, + "step": 10718 + }, + { + "epoch": 3.2900552486187844, + "grad_norm": 0.25424903631210327, + "learning_rate": 7.833649031820987e-05, + "loss": 1.7907, + "step": 10719 + }, + { + "epoch": 3.2903621853898097, + "grad_norm": 0.23873139917850494, + "learning_rate": 7.833239491547896e-05, + "loss": 1.7666, + "step": 10720 + }, + { + "epoch": 3.2906691221608346, + 
"grad_norm": 0.23292972147464752, + "learning_rate": 7.832829923275073e-05, + "loss": 1.7674, + "step": 10721 + }, + { + "epoch": 3.29097605893186, + "grad_norm": 0.30133312940597534, + "learning_rate": 7.832420327006566e-05, + "loss": 1.8229, + "step": 10722 + }, + { + "epoch": 3.2912829957028853, + "grad_norm": 0.2882522642612457, + "learning_rate": 7.83201070274642e-05, + "loss": 1.7855, + "step": 10723 + }, + { + "epoch": 3.29158993247391, + "grad_norm": 0.2578088045120239, + "learning_rate": 7.831601050498683e-05, + "loss": 1.7276, + "step": 10724 + }, + { + "epoch": 3.2918968692449355, + "grad_norm": 0.29511600732803345, + "learning_rate": 7.831191370267406e-05, + "loss": 1.8085, + "step": 10725 + }, + { + "epoch": 3.292203806015961, + "grad_norm": 0.29557499289512634, + "learning_rate": 7.830781662056634e-05, + "loss": 1.815, + "step": 10726 + }, + { + "epoch": 3.2925107427869857, + "grad_norm": 0.32722121477127075, + "learning_rate": 7.830371925870422e-05, + "loss": 1.7889, + "step": 10727 + }, + { + "epoch": 3.292817679558011, + "grad_norm": 0.3124488592147827, + "learning_rate": 7.829962161712814e-05, + "loss": 1.8063, + "step": 10728 + }, + { + "epoch": 3.2931246163290364, + "grad_norm": 0.311334490776062, + "learning_rate": 7.829552369587861e-05, + "loss": 1.8852, + "step": 10729 + }, + { + "epoch": 3.2934315531000613, + "grad_norm": 0.28010860085487366, + "learning_rate": 7.829142549499613e-05, + "loss": 1.8274, + "step": 10730 + }, + { + "epoch": 3.2937384898710866, + "grad_norm": 0.3453529477119446, + "learning_rate": 7.828732701452119e-05, + "loss": 1.8618, + "step": 10731 + }, + { + "epoch": 3.2940454266421115, + "grad_norm": 0.2946802079677582, + "learning_rate": 7.828322825449432e-05, + "loss": 1.7123, + "step": 10732 + }, + { + "epoch": 3.294352363413137, + "grad_norm": 0.2467648684978485, + "learning_rate": 7.827912921495601e-05, + "loss": 1.7786, + "step": 10733 + }, + { + "epoch": 3.294659300184162, + "grad_norm": 0.2957034707069397, + 
"learning_rate": 7.827502989594677e-05, + "loss": 1.7817, + "step": 10734 + }, + { + "epoch": 3.294966236955187, + "grad_norm": 0.300905704498291, + "learning_rate": 7.827093029750713e-05, + "loss": 1.7582, + "step": 10735 + }, + { + "epoch": 3.2952731737262124, + "grad_norm": 0.28935131430625916, + "learning_rate": 7.826683041967757e-05, + "loss": 1.7766, + "step": 10736 + }, + { + "epoch": 3.2955801104972378, + "grad_norm": 0.26046010851860046, + "learning_rate": 7.826273026249861e-05, + "loss": 1.8152, + "step": 10737 + }, + { + "epoch": 3.2958870472682626, + "grad_norm": 0.24247924983501434, + "learning_rate": 7.82586298260108e-05, + "loss": 1.8679, + "step": 10738 + }, + { + "epoch": 3.296193984039288, + "grad_norm": 0.25977620482444763, + "learning_rate": 7.825452911025466e-05, + "loss": 1.8108, + "step": 10739 + }, + { + "epoch": 3.2965009208103133, + "grad_norm": 0.2732592821121216, + "learning_rate": 7.825042811527068e-05, + "loss": 1.7355, + "step": 10740 + }, + { + "epoch": 3.296807857581338, + "grad_norm": 0.38407859206199646, + "learning_rate": 7.824632684109941e-05, + "loss": 1.8418, + "step": 10741 + }, + { + "epoch": 3.2971147943523635, + "grad_norm": 0.4239252805709839, + "learning_rate": 7.82422252877814e-05, + "loss": 1.7655, + "step": 10742 + }, + { + "epoch": 3.2974217311233884, + "grad_norm": 0.3810526132583618, + "learning_rate": 7.823812345535716e-05, + "loss": 1.8804, + "step": 10743 + }, + { + "epoch": 3.2977286678944138, + "grad_norm": 0.29939520359039307, + "learning_rate": 7.823402134386722e-05, + "loss": 1.8207, + "step": 10744 + }, + { + "epoch": 3.298035604665439, + "grad_norm": 0.4053972065448761, + "learning_rate": 7.822991895335215e-05, + "loss": 1.7901, + "step": 10745 + }, + { + "epoch": 3.298342541436464, + "grad_norm": 0.4975005090236664, + "learning_rate": 7.822581628385247e-05, + "loss": 1.8344, + "step": 10746 + }, + { + "epoch": 3.2986494782074893, + "grad_norm": 0.4100436270236969, + "learning_rate": 
7.822171333540874e-05, + "loss": 1.7891, + "step": 10747 + }, + { + "epoch": 3.298956414978514, + "grad_norm": 0.2817644476890564, + "learning_rate": 7.821761010806147e-05, + "loss": 1.7895, + "step": 10748 + }, + { + "epoch": 3.2992633517495396, + "grad_norm": 0.332660973072052, + "learning_rate": 7.821350660185125e-05, + "loss": 1.7281, + "step": 10749 + }, + { + "epoch": 3.299570288520565, + "grad_norm": 0.42652732133865356, + "learning_rate": 7.820940281681863e-05, + "loss": 1.7855, + "step": 10750 + }, + { + "epoch": 3.2998772252915898, + "grad_norm": 0.35700714588165283, + "learning_rate": 7.820529875300415e-05, + "loss": 1.8722, + "step": 10751 + }, + { + "epoch": 3.300184162062615, + "grad_norm": 0.25305211544036865, + "learning_rate": 7.820119441044838e-05, + "loss": 1.7696, + "step": 10752 + }, + { + "epoch": 3.3004910988336404, + "grad_norm": 0.280205637216568, + "learning_rate": 7.819708978919188e-05, + "loss": 1.756, + "step": 10753 + }, + { + "epoch": 3.3007980356046653, + "grad_norm": 0.4176226854324341, + "learning_rate": 7.819298488927521e-05, + "loss": 1.7731, + "step": 10754 + }, + { + "epoch": 3.3011049723756907, + "grad_norm": 0.4264865517616272, + "learning_rate": 7.818887971073894e-05, + "loss": 1.7851, + "step": 10755 + }, + { + "epoch": 3.301411909146716, + "grad_norm": 0.2901221215724945, + "learning_rate": 7.818477425362363e-05, + "loss": 1.7356, + "step": 10756 + }, + { + "epoch": 3.301718845917741, + "grad_norm": 0.29583361744880676, + "learning_rate": 7.818066851796986e-05, + "loss": 1.8269, + "step": 10757 + }, + { + "epoch": 3.3020257826887662, + "grad_norm": 0.38592997193336487, + "learning_rate": 7.817656250381821e-05, + "loss": 1.7515, + "step": 10758 + }, + { + "epoch": 3.302332719459791, + "grad_norm": 0.29301533102989197, + "learning_rate": 7.817245621120927e-05, + "loss": 1.7955, + "step": 10759 + }, + { + "epoch": 3.3026396562308165, + "grad_norm": 0.2770880162715912, + "learning_rate": 7.816834964018359e-05, + "loss": 
1.7899, + "step": 10760 + }, + { + "epoch": 3.302946593001842, + "grad_norm": 0.32566413283348083, + "learning_rate": 7.816424279078176e-05, + "loss": 1.74, + "step": 10761 + }, + { + "epoch": 3.3032535297728667, + "grad_norm": 0.3077750504016876, + "learning_rate": 7.81601356630444e-05, + "loss": 1.8123, + "step": 10762 + }, + { + "epoch": 3.303560466543892, + "grad_norm": 0.2826370298862457, + "learning_rate": 7.815602825701206e-05, + "loss": 1.865, + "step": 10763 + }, + { + "epoch": 3.303867403314917, + "grad_norm": 0.31700822710990906, + "learning_rate": 7.815192057272534e-05, + "loss": 1.8021, + "step": 10764 + }, + { + "epoch": 3.3041743400859422, + "grad_norm": 0.33182790875434875, + "learning_rate": 7.814781261022486e-05, + "loss": 1.818, + "step": 10765 + }, + { + "epoch": 3.3044812768569676, + "grad_norm": 0.2720039486885071, + "learning_rate": 7.814370436955118e-05, + "loss": 1.8369, + "step": 10766 + }, + { + "epoch": 3.3047882136279925, + "grad_norm": 0.28134068846702576, + "learning_rate": 7.813959585074493e-05, + "loss": 1.8391, + "step": 10767 + }, + { + "epoch": 3.305095150399018, + "grad_norm": 0.25748828053474426, + "learning_rate": 7.813548705384667e-05, + "loss": 1.7987, + "step": 10768 + }, + { + "epoch": 3.305402087170043, + "grad_norm": 0.26187625527381897, + "learning_rate": 7.813137797889708e-05, + "loss": 1.7645, + "step": 10769 + }, + { + "epoch": 3.305709023941068, + "grad_norm": 0.297262579202652, + "learning_rate": 7.812726862593671e-05, + "loss": 1.771, + "step": 10770 + }, + { + "epoch": 3.3060159607120934, + "grad_norm": 0.2987872064113617, + "learning_rate": 7.812315899500618e-05, + "loss": 1.8115, + "step": 10771 + }, + { + "epoch": 3.3063228974831187, + "grad_norm": 0.31963878870010376, + "learning_rate": 7.81190490861461e-05, + "loss": 1.7685, + "step": 10772 + }, + { + "epoch": 3.3066298342541436, + "grad_norm": 0.27007177472114563, + "learning_rate": 7.81149388993971e-05, + "loss": 1.8272, + "step": 10773 + }, + { + "epoch": 
3.306936771025169, + "grad_norm": 0.26818498969078064, + "learning_rate": 7.811082843479981e-05, + "loss": 1.7894, + "step": 10774 + }, + { + "epoch": 3.307243707796194, + "grad_norm": 0.28857091069221497, + "learning_rate": 7.810671769239483e-05, + "loss": 1.8769, + "step": 10775 + }, + { + "epoch": 3.307550644567219, + "grad_norm": 0.26983144879341125, + "learning_rate": 7.810260667222277e-05, + "loss": 1.796, + "step": 10776 + }, + { + "epoch": 3.3078575813382445, + "grad_norm": 0.2566467225551605, + "learning_rate": 7.809849537432432e-05, + "loss": 1.848, + "step": 10777 + }, + { + "epoch": 3.3081645181092694, + "grad_norm": 0.25607848167419434, + "learning_rate": 7.809438379874005e-05, + "loss": 1.8072, + "step": 10778 + }, + { + "epoch": 3.3084714548802947, + "grad_norm": 0.29158470034599304, + "learning_rate": 7.809027194551059e-05, + "loss": 1.7772, + "step": 10779 + }, + { + "epoch": 3.3087783916513196, + "grad_norm": 0.360897421836853, + "learning_rate": 7.808615981467664e-05, + "loss": 1.8404, + "step": 10780 + }, + { + "epoch": 3.309085328422345, + "grad_norm": 0.31121253967285156, + "learning_rate": 7.808204740627877e-05, + "loss": 1.8137, + "step": 10781 + }, + { + "epoch": 3.3093922651933703, + "grad_norm": 0.2846451699733734, + "learning_rate": 7.807793472035765e-05, + "loss": 1.8367, + "step": 10782 + }, + { + "epoch": 3.309699201964395, + "grad_norm": 0.2711004316806793, + "learning_rate": 7.807382175695393e-05, + "loss": 1.7728, + "step": 10783 + }, + { + "epoch": 3.3100061387354205, + "grad_norm": 0.2693859338760376, + "learning_rate": 7.806970851610824e-05, + "loss": 1.7026, + "step": 10784 + }, + { + "epoch": 3.310313075506446, + "grad_norm": 0.3050517439842224, + "learning_rate": 7.806559499786125e-05, + "loss": 1.8041, + "step": 10785 + }, + { + "epoch": 3.3106200122774707, + "grad_norm": 0.27304747700691223, + "learning_rate": 7.80614812022536e-05, + "loss": 1.8182, + "step": 10786 + }, + { + "epoch": 3.310926949048496, + "grad_norm": 
0.28378555178642273, + "learning_rate": 7.805736712932594e-05, + "loss": 1.8519, + "step": 10787 + }, + { + "epoch": 3.3112338858195214, + "grad_norm": 0.30620133876800537, + "learning_rate": 7.805325277911892e-05, + "loss": 1.8594, + "step": 10788 + }, + { + "epoch": 3.3115408225905463, + "grad_norm": 0.2580169141292572, + "learning_rate": 7.804913815167325e-05, + "loss": 1.7897, + "step": 10789 + }, + { + "epoch": 3.3118477593615716, + "grad_norm": 0.28937023878097534, + "learning_rate": 7.804502324702951e-05, + "loss": 1.8362, + "step": 10790 + }, + { + "epoch": 3.3121546961325965, + "grad_norm": 0.28032705187797546, + "learning_rate": 7.804090806522844e-05, + "loss": 1.8168, + "step": 10791 + }, + { + "epoch": 3.312461632903622, + "grad_norm": 0.33712559938430786, + "learning_rate": 7.803679260631069e-05, + "loss": 1.7489, + "step": 10792 + }, + { + "epoch": 3.312768569674647, + "grad_norm": 0.40536820888519287, + "learning_rate": 7.80326768703169e-05, + "loss": 1.8413, + "step": 10793 + }, + { + "epoch": 3.313075506445672, + "grad_norm": 0.34967559576034546, + "learning_rate": 7.802856085728778e-05, + "loss": 1.8076, + "step": 10794 + }, + { + "epoch": 3.3133824432166974, + "grad_norm": 0.2429870367050171, + "learning_rate": 7.8024444567264e-05, + "loss": 1.8002, + "step": 10795 + }, + { + "epoch": 3.3136893799877223, + "grad_norm": 0.40956684947013855, + "learning_rate": 7.802032800028621e-05, + "loss": 1.8151, + "step": 10796 + }, + { + "epoch": 3.3139963167587476, + "grad_norm": 0.4908781945705414, + "learning_rate": 7.801621115639512e-05, + "loss": 1.8124, + "step": 10797 + }, + { + "epoch": 3.314303253529773, + "grad_norm": 0.3922197222709656, + "learning_rate": 7.801209403563143e-05, + "loss": 1.7911, + "step": 10798 + }, + { + "epoch": 3.314610190300798, + "grad_norm": 0.29467105865478516, + "learning_rate": 7.800797663803578e-05, + "loss": 1.8472, + "step": 10799 + }, + { + "epoch": 3.314917127071823, + "grad_norm": 0.384974867105484, + 
"learning_rate": 7.800385896364891e-05, + "loss": 1.8139, + "step": 10800 + }, + { + "epoch": 3.3152240638428485, + "grad_norm": 0.4605129063129425, + "learning_rate": 7.79997410125115e-05, + "loss": 1.7982, + "step": 10801 + }, + { + "epoch": 3.3155310006138734, + "grad_norm": 0.2982464134693146, + "learning_rate": 7.799562278466423e-05, + "loss": 1.8496, + "step": 10802 + }, + { + "epoch": 3.3158379373848987, + "grad_norm": 0.3101392984390259, + "learning_rate": 7.79915042801478e-05, + "loss": 1.8172, + "step": 10803 + }, + { + "epoch": 3.316144874155924, + "grad_norm": 0.3651282489299774, + "learning_rate": 7.798738549900292e-05, + "loss": 1.7497, + "step": 10804 + }, + { + "epoch": 3.316451810926949, + "grad_norm": 0.28504419326782227, + "learning_rate": 7.79832664412703e-05, + "loss": 1.8027, + "step": 10805 + }, + { + "epoch": 3.3167587476979743, + "grad_norm": 0.28333309292793274, + "learning_rate": 7.797914710699063e-05, + "loss": 1.8121, + "step": 10806 + }, + { + "epoch": 3.317065684468999, + "grad_norm": 0.37549784779548645, + "learning_rate": 7.797502749620462e-05, + "loss": 1.817, + "step": 10807 + }, + { + "epoch": 3.3173726212400245, + "grad_norm": 0.3864210844039917, + "learning_rate": 7.797090760895301e-05, + "loss": 1.852, + "step": 10808 + }, + { + "epoch": 3.31767955801105, + "grad_norm": 0.2422102987766266, + "learning_rate": 7.79667874452765e-05, + "loss": 1.7523, + "step": 10809 + }, + { + "epoch": 3.3179864947820747, + "grad_norm": 0.307892382144928, + "learning_rate": 7.79626670052158e-05, + "loss": 1.7436, + "step": 10810 + }, + { + "epoch": 3.3182934315531, + "grad_norm": 0.29607462882995605, + "learning_rate": 7.795854628881162e-05, + "loss": 1.768, + "step": 10811 + }, + { + "epoch": 3.3186003683241254, + "grad_norm": 0.23334427177906036, + "learning_rate": 7.795442529610471e-05, + "loss": 1.7687, + "step": 10812 + }, + { + "epoch": 3.3189073050951503, + "grad_norm": 0.26257455348968506, + "learning_rate": 7.795030402713578e-05, + 
"loss": 1.8266, + "step": 10813 + }, + { + "epoch": 3.3192142418661756, + "grad_norm": 0.3252788782119751, + "learning_rate": 7.794618248194556e-05, + "loss": 1.8645, + "step": 10814 + }, + { + "epoch": 3.319521178637201, + "grad_norm": 0.3807232975959778, + "learning_rate": 7.79420606605748e-05, + "loss": 1.8154, + "step": 10815 + }, + { + "epoch": 3.319828115408226, + "grad_norm": 0.3395625948905945, + "learning_rate": 7.793793856306422e-05, + "loss": 1.8002, + "step": 10816 + }, + { + "epoch": 3.320135052179251, + "grad_norm": 0.2896415889263153, + "learning_rate": 7.793381618945455e-05, + "loss": 1.8077, + "step": 10817 + }, + { + "epoch": 3.320441988950276, + "grad_norm": 0.27733489871025085, + "learning_rate": 7.792969353978652e-05, + "loss": 1.7976, + "step": 10818 + }, + { + "epoch": 3.3207489257213014, + "grad_norm": 0.36985141038894653, + "learning_rate": 7.79255706141009e-05, + "loss": 1.8724, + "step": 10819 + }, + { + "epoch": 3.3210558624923268, + "grad_norm": 0.37886983156204224, + "learning_rate": 7.792144741243843e-05, + "loss": 1.8249, + "step": 10820 + }, + { + "epoch": 3.3213627992633517, + "grad_norm": 0.3030721843242645, + "learning_rate": 7.791732393483986e-05, + "loss": 1.7975, + "step": 10821 + }, + { + "epoch": 3.321669736034377, + "grad_norm": 0.2637709081172943, + "learning_rate": 7.791320018134592e-05, + "loss": 1.7205, + "step": 10822 + }, + { + "epoch": 3.321976672805402, + "grad_norm": 0.35307520627975464, + "learning_rate": 7.790907615199736e-05, + "loss": 1.8786, + "step": 10823 + }, + { + "epoch": 3.322283609576427, + "grad_norm": 0.3333272635936737, + "learning_rate": 7.790495184683497e-05, + "loss": 1.7715, + "step": 10824 + }, + { + "epoch": 3.3225905463474525, + "grad_norm": 0.2597469091415405, + "learning_rate": 7.790082726589948e-05, + "loss": 1.8379, + "step": 10825 + }, + { + "epoch": 3.3228974831184774, + "grad_norm": 0.34176257252693176, + "learning_rate": 7.789670240923168e-05, + "loss": 1.8305, + "step": 10826 + }, + { 
+ "epoch": 3.3232044198895028, + "grad_norm": 0.37954533100128174, + "learning_rate": 7.789257727687229e-05, + "loss": 1.7728, + "step": 10827 + }, + { + "epoch": 3.323511356660528, + "grad_norm": 0.2840248644351959, + "learning_rate": 7.788845186886212e-05, + "loss": 1.8059, + "step": 10828 + }, + { + "epoch": 3.323818293431553, + "grad_norm": 0.3650275766849518, + "learning_rate": 7.788432618524193e-05, + "loss": 1.8127, + "step": 10829 + }, + { + "epoch": 3.3241252302025783, + "grad_norm": 0.4869692623615265, + "learning_rate": 7.788020022605247e-05, + "loss": 1.833, + "step": 10830 + }, + { + "epoch": 3.3244321669736037, + "grad_norm": 0.3419482707977295, + "learning_rate": 7.787607399133453e-05, + "loss": 1.7812, + "step": 10831 + }, + { + "epoch": 3.3247391037446286, + "grad_norm": 0.27625617384910583, + "learning_rate": 7.787194748112889e-05, + "loss": 1.8513, + "step": 10832 + }, + { + "epoch": 3.325046040515654, + "grad_norm": 0.4287806749343872, + "learning_rate": 7.786782069547633e-05, + "loss": 1.836, + "step": 10833 + }, + { + "epoch": 3.325352977286679, + "grad_norm": 0.4345545172691345, + "learning_rate": 7.786369363441763e-05, + "loss": 1.8027, + "step": 10834 + }, + { + "epoch": 3.325659914057704, + "grad_norm": 0.32976534962654114, + "learning_rate": 7.78595662979936e-05, + "loss": 1.7987, + "step": 10835 + }, + { + "epoch": 3.3259668508287294, + "grad_norm": 0.2677469849586487, + "learning_rate": 7.785543868624498e-05, + "loss": 1.8312, + "step": 10836 + }, + { + "epoch": 3.3262737875997543, + "grad_norm": 0.2547740638256073, + "learning_rate": 7.785131079921259e-05, + "loss": 1.7844, + "step": 10837 + }, + { + "epoch": 3.3265807243707797, + "grad_norm": 0.26755592226982117, + "learning_rate": 7.784718263693725e-05, + "loss": 1.8263, + "step": 10838 + }, + { + "epoch": 3.3268876611418046, + "grad_norm": 0.23884403705596924, + "learning_rate": 7.784305419945969e-05, + "loss": 1.7862, + "step": 10839 + }, + { + "epoch": 3.32719459791283, + 
"grad_norm": 0.2896903157234192, + "learning_rate": 7.783892548682077e-05, + "loss": 1.9138, + "step": 10840 + }, + { + "epoch": 3.3275015346838552, + "grad_norm": 0.3201359510421753, + "learning_rate": 7.783479649906127e-05, + "loss": 1.8382, + "step": 10841 + }, + { + "epoch": 3.32780847145488, + "grad_norm": 0.39285311102867126, + "learning_rate": 7.7830667236222e-05, + "loss": 1.7763, + "step": 10842 + }, + { + "epoch": 3.3281154082259055, + "grad_norm": 0.435007244348526, + "learning_rate": 7.782653769834376e-05, + "loss": 1.8415, + "step": 10843 + }, + { + "epoch": 3.328422344996931, + "grad_norm": 0.34605318307876587, + "learning_rate": 7.782240788546736e-05, + "loss": 1.757, + "step": 10844 + }, + { + "epoch": 3.3287292817679557, + "grad_norm": 0.26830604672431946, + "learning_rate": 7.781827779763362e-05, + "loss": 1.7779, + "step": 10845 + }, + { + "epoch": 3.329036218538981, + "grad_norm": 0.41851529479026794, + "learning_rate": 7.781414743488336e-05, + "loss": 1.8609, + "step": 10846 + }, + { + "epoch": 3.3293431553100064, + "grad_norm": 0.5058079361915588, + "learning_rate": 7.78100167972574e-05, + "loss": 1.8146, + "step": 10847 + }, + { + "epoch": 3.3296500920810312, + "grad_norm": 0.34394967555999756, + "learning_rate": 7.780588588479654e-05, + "loss": 1.8079, + "step": 10848 + }, + { + "epoch": 3.3299570288520566, + "grad_norm": 0.3033885061740875, + "learning_rate": 7.780175469754161e-05, + "loss": 1.8223, + "step": 10849 + }, + { + "epoch": 3.3302639656230815, + "grad_norm": 0.4431045651435852, + "learning_rate": 7.779762323553347e-05, + "loss": 1.8841, + "step": 10850 + }, + { + "epoch": 3.330570902394107, + "grad_norm": 0.3451448976993561, + "learning_rate": 7.77934914988129e-05, + "loss": 1.8092, + "step": 10851 + }, + { + "epoch": 3.330877839165132, + "grad_norm": 0.26580891013145447, + "learning_rate": 7.778935948742077e-05, + "loss": 1.8244, + "step": 10852 + }, + { + "epoch": 3.331184775936157, + "grad_norm": 0.32079070806503296, + 
"learning_rate": 7.778522720139792e-05, + "loss": 1.7816, + "step": 10853 + }, + { + "epoch": 3.3314917127071824, + "grad_norm": 0.35789042711257935, + "learning_rate": 7.778109464078514e-05, + "loss": 1.8211, + "step": 10854 + }, + { + "epoch": 3.3317986494782073, + "grad_norm": 0.2808612585067749, + "learning_rate": 7.77769618056233e-05, + "loss": 1.8387, + "step": 10855 + }, + { + "epoch": 3.3321055862492326, + "grad_norm": 0.24760548770427704, + "learning_rate": 7.777282869595326e-05, + "loss": 1.7795, + "step": 10856 + }, + { + "epoch": 3.332412523020258, + "grad_norm": 0.2840912640094757, + "learning_rate": 7.776869531181583e-05, + "loss": 1.7492, + "step": 10857 + }, + { + "epoch": 3.332719459791283, + "grad_norm": 0.2881413698196411, + "learning_rate": 7.77645616532519e-05, + "loss": 1.8157, + "step": 10858 + }, + { + "epoch": 3.333026396562308, + "grad_norm": 0.2508779764175415, + "learning_rate": 7.776042772030228e-05, + "loss": 1.8196, + "step": 10859 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3307822048664093, + "learning_rate": 7.775629351300785e-05, + "loss": 1.8195, + "step": 10860 + }, + { + "epoch": 3.3336402701043584, + "grad_norm": 0.34392043948173523, + "learning_rate": 7.775215903140946e-05, + "loss": 1.7775, + "step": 10861 + }, + { + "epoch": 3.3339472068753837, + "grad_norm": 0.2594252824783325, + "learning_rate": 7.774802427554796e-05, + "loss": 1.7687, + "step": 10862 + }, + { + "epoch": 3.334254143646409, + "grad_norm": 0.3109053075313568, + "learning_rate": 7.774388924546423e-05, + "loss": 1.7908, + "step": 10863 + }, + { + "epoch": 3.334561080417434, + "grad_norm": 0.4801923930644989, + "learning_rate": 7.773975394119913e-05, + "loss": 1.8316, + "step": 10864 + }, + { + "epoch": 3.3348680171884593, + "grad_norm": 0.4754973351955414, + "learning_rate": 7.77356183627935e-05, + "loss": 1.8015, + "step": 10865 + }, + { + "epoch": 3.335174953959484, + "grad_norm": 0.29624658823013306, + "learning_rate": 7.773148251028825e-05, + 
"loss": 1.8179, + "step": 10866 + }, + { + "epoch": 3.3354818907305095, + "grad_norm": 0.32207581400871277, + "learning_rate": 7.772734638372423e-05, + "loss": 1.799, + "step": 10867 + }, + { + "epoch": 3.335788827501535, + "grad_norm": 0.5227517485618591, + "learning_rate": 7.772320998314233e-05, + "loss": 1.8452, + "step": 10868 + }, + { + "epoch": 3.3360957642725597, + "grad_norm": 0.4081100523471832, + "learning_rate": 7.771907330858341e-05, + "loss": 1.8182, + "step": 10869 + }, + { + "epoch": 3.336402701043585, + "grad_norm": 0.23786653578281403, + "learning_rate": 7.771493636008838e-05, + "loss": 1.7392, + "step": 10870 + }, + { + "epoch": 3.33670963781461, + "grad_norm": 0.37913820147514343, + "learning_rate": 7.771079913769807e-05, + "loss": 1.7559, + "step": 10871 + }, + { + "epoch": 3.3370165745856353, + "grad_norm": 0.4939163625240326, + "learning_rate": 7.770666164145344e-05, + "loss": 1.8076, + "step": 10872 + }, + { + "epoch": 3.3373235113566606, + "grad_norm": 0.3322528302669525, + "learning_rate": 7.770252387139532e-05, + "loss": 1.8045, + "step": 10873 + }, + { + "epoch": 3.337630448127686, + "grad_norm": 0.3685782849788666, + "learning_rate": 7.769838582756461e-05, + "loss": 1.7703, + "step": 10874 + }, + { + "epoch": 3.337937384898711, + "grad_norm": 0.5564271807670593, + "learning_rate": 7.769424751000224e-05, + "loss": 1.7697, + "step": 10875 + }, + { + "epoch": 3.338244321669736, + "grad_norm": 0.38610726594924927, + "learning_rate": 7.769010891874906e-05, + "loss": 1.7944, + "step": 10876 + }, + { + "epoch": 3.338551258440761, + "grad_norm": 0.23838558793067932, + "learning_rate": 7.768597005384602e-05, + "loss": 1.765, + "step": 10877 + }, + { + "epoch": 3.3388581952117864, + "grad_norm": 0.4334571063518524, + "learning_rate": 7.768183091533399e-05, + "loss": 1.7854, + "step": 10878 + }, + { + "epoch": 3.3391651319828117, + "grad_norm": 0.44844719767570496, + "learning_rate": 7.767769150325386e-05, + "loss": 1.7955, + "step": 10879 + }, + { 
+ "epoch": 3.3394720687538366, + "grad_norm": 0.26543378829956055, + "learning_rate": 7.767355181764659e-05, + "loss": 1.8311, + "step": 10880 + }, + { + "epoch": 3.339779005524862, + "grad_norm": 0.39401358366012573, + "learning_rate": 7.766941185855304e-05, + "loss": 1.8264, + "step": 10881 + }, + { + "epoch": 3.340085942295887, + "grad_norm": 0.5476824045181274, + "learning_rate": 7.766527162601416e-05, + "loss": 1.8051, + "step": 10882 + }, + { + "epoch": 3.340392879066912, + "grad_norm": 0.4021138548851013, + "learning_rate": 7.766113112007084e-05, + "loss": 1.7941, + "step": 10883 + }, + { + "epoch": 3.3406998158379375, + "grad_norm": 0.3262040317058563, + "learning_rate": 7.765699034076402e-05, + "loss": 1.8317, + "step": 10884 + }, + { + "epoch": 3.3410067526089624, + "grad_norm": 0.5461146831512451, + "learning_rate": 7.765284928813459e-05, + "loss": 1.833, + "step": 10885 + }, + { + "epoch": 3.3413136893799877, + "grad_norm": 0.5067405700683594, + "learning_rate": 7.764870796222351e-05, + "loss": 1.7862, + "step": 10886 + }, + { + "epoch": 3.341620626151013, + "grad_norm": 0.2731069028377533, + "learning_rate": 7.76445663630717e-05, + "loss": 1.8173, + "step": 10887 + }, + { + "epoch": 3.341927562922038, + "grad_norm": 0.48928195238113403, + "learning_rate": 7.764042449072008e-05, + "loss": 1.7992, + "step": 10888 + }, + { + "epoch": 3.3422344996930633, + "grad_norm": 0.5338504910469055, + "learning_rate": 7.763628234520958e-05, + "loss": 1.7891, + "step": 10889 + }, + { + "epoch": 3.3425414364640886, + "grad_norm": 0.3136523365974426, + "learning_rate": 7.763213992658114e-05, + "loss": 1.8623, + "step": 10890 + }, + { + "epoch": 3.3428483732351135, + "grad_norm": 0.36551395058631897, + "learning_rate": 7.762799723487568e-05, + "loss": 1.8474, + "step": 10891 + }, + { + "epoch": 3.343155310006139, + "grad_norm": 0.35772353410720825, + "learning_rate": 7.762385427013419e-05, + "loss": 1.8625, + "step": 10892 + }, + { + "epoch": 3.3434622467771637, + 
"grad_norm": 0.29944708943367004, + "learning_rate": 7.761971103239755e-05, + "loss": 1.8181, + "step": 10893 + }, + { + "epoch": 3.343769183548189, + "grad_norm": 0.3395330309867859, + "learning_rate": 7.761556752170676e-05, + "loss": 1.7943, + "step": 10894 + }, + { + "epoch": 3.3440761203192144, + "grad_norm": 0.3624265193939209, + "learning_rate": 7.761142373810274e-05, + "loss": 1.8234, + "step": 10895 + }, + { + "epoch": 3.3443830570902393, + "grad_norm": 0.25409621000289917, + "learning_rate": 7.760727968162644e-05, + "loss": 1.7532, + "step": 10896 + }, + { + "epoch": 3.3446899938612646, + "grad_norm": 0.321437805891037, + "learning_rate": 7.760313535231883e-05, + "loss": 1.8808, + "step": 10897 + }, + { + "epoch": 3.3449969306322895, + "grad_norm": 0.2919142544269562, + "learning_rate": 7.759899075022086e-05, + "loss": 1.7677, + "step": 10898 + }, + { + "epoch": 3.345303867403315, + "grad_norm": 0.26515716314315796, + "learning_rate": 7.759484587537346e-05, + "loss": 1.8118, + "step": 10899 + }, + { + "epoch": 3.34561080417434, + "grad_norm": 0.2963240146636963, + "learning_rate": 7.759070072781764e-05, + "loss": 1.8329, + "step": 10900 + }, + { + "epoch": 3.345917740945365, + "grad_norm": 0.3186480700969696, + "learning_rate": 7.758655530759435e-05, + "loss": 1.8013, + "step": 10901 + }, + { + "epoch": 3.3462246777163904, + "grad_norm": 0.256145715713501, + "learning_rate": 7.758240961474454e-05, + "loss": 1.7865, + "step": 10902 + }, + { + "epoch": 3.3465316144874158, + "grad_norm": 0.28951629996299744, + "learning_rate": 7.757826364930921e-05, + "loss": 1.8091, + "step": 10903 + }, + { + "epoch": 3.3468385512584407, + "grad_norm": 0.2692483365535736, + "learning_rate": 7.75741174113293e-05, + "loss": 1.8308, + "step": 10904 + }, + { + "epoch": 3.347145488029466, + "grad_norm": 0.27615389227867126, + "learning_rate": 7.75699709008458e-05, + "loss": 1.7888, + "step": 10905 + }, + { + "epoch": 3.3474524248004913, + "grad_norm": 0.2819034457206726, + 
"learning_rate": 7.75658241178997e-05, + "loss": 1.7624, + "step": 10906 + }, + { + "epoch": 3.347759361571516, + "grad_norm": 0.2627592086791992, + "learning_rate": 7.756167706253196e-05, + "loss": 1.7696, + "step": 10907 + }, + { + "epoch": 3.3480662983425415, + "grad_norm": 0.3528621196746826, + "learning_rate": 7.755752973478356e-05, + "loss": 1.7725, + "step": 10908 + }, + { + "epoch": 3.3483732351135664, + "grad_norm": 0.35949698090553284, + "learning_rate": 7.755338213469552e-05, + "loss": 1.8163, + "step": 10909 + }, + { + "epoch": 3.3486801718845918, + "grad_norm": 0.25142577290534973, + "learning_rate": 7.75492342623088e-05, + "loss": 1.7879, + "step": 10910 + }, + { + "epoch": 3.348987108655617, + "grad_norm": 0.25766023993492126, + "learning_rate": 7.75450861176644e-05, + "loss": 1.8143, + "step": 10911 + }, + { + "epoch": 3.349294045426642, + "grad_norm": 0.2736956477165222, + "learning_rate": 7.754093770080331e-05, + "loss": 1.8907, + "step": 10912 + }, + { + "epoch": 3.3496009821976673, + "grad_norm": 0.23700755834579468, + "learning_rate": 7.753678901176654e-05, + "loss": 1.813, + "step": 10913 + }, + { + "epoch": 3.349907918968692, + "grad_norm": 0.245509073138237, + "learning_rate": 7.753264005059507e-05, + "loss": 1.8019, + "step": 10914 + }, + { + "epoch": 3.3502148557397176, + "grad_norm": 0.232910618185997, + "learning_rate": 7.752849081732993e-05, + "loss": 1.784, + "step": 10915 + }, + { + "epoch": 3.350521792510743, + "grad_norm": 0.22989360988140106, + "learning_rate": 7.75243413120121e-05, + "loss": 1.7597, + "step": 10916 + }, + { + "epoch": 3.350828729281768, + "grad_norm": 0.2093925178050995, + "learning_rate": 7.752019153468258e-05, + "loss": 1.7698, + "step": 10917 + }, + { + "epoch": 3.351135666052793, + "grad_norm": 0.25539630651474, + "learning_rate": 7.751604148538241e-05, + "loss": 1.8287, + "step": 10918 + }, + { + "epoch": 3.3514426028238185, + "grad_norm": 0.2731820046901703, + "learning_rate": 7.75118911641526e-05, + "loss": 
1.8862, + "step": 10919 + }, + { + "epoch": 3.3517495395948433, + "grad_norm": 0.2464541345834732, + "learning_rate": 7.750774057103416e-05, + "loss": 1.8165, + "step": 10920 + }, + { + "epoch": 3.3520564763658687, + "grad_norm": 0.26380276679992676, + "learning_rate": 7.75035897060681e-05, + "loss": 1.78, + "step": 10921 + }, + { + "epoch": 3.352363413136894, + "grad_norm": 0.3080748915672302, + "learning_rate": 7.749943856929542e-05, + "loss": 1.7925, + "step": 10922 + }, + { + "epoch": 3.352670349907919, + "grad_norm": 0.317754864692688, + "learning_rate": 7.74952871607572e-05, + "loss": 1.8248, + "step": 10923 + }, + { + "epoch": 3.3529772866789442, + "grad_norm": 0.2525196373462677, + "learning_rate": 7.749113548049442e-05, + "loss": 1.762, + "step": 10924 + }, + { + "epoch": 3.353284223449969, + "grad_norm": 0.3149549961090088, + "learning_rate": 7.748698352854814e-05, + "loss": 1.8289, + "step": 10925 + }, + { + "epoch": 3.3535911602209945, + "grad_norm": 0.35744383931159973, + "learning_rate": 7.748283130495937e-05, + "loss": 1.8132, + "step": 10926 + }, + { + "epoch": 3.35389809699202, + "grad_norm": 0.28599128127098083, + "learning_rate": 7.747867880976916e-05, + "loss": 1.7351, + "step": 10927 + }, + { + "epoch": 3.3542050337630447, + "grad_norm": 0.24428869783878326, + "learning_rate": 7.747452604301852e-05, + "loss": 1.794, + "step": 10928 + }, + { + "epoch": 3.35451197053407, + "grad_norm": 0.29067808389663696, + "learning_rate": 7.747037300474854e-05, + "loss": 1.8181, + "step": 10929 + }, + { + "epoch": 3.354818907305095, + "grad_norm": 0.32417505979537964, + "learning_rate": 7.746621969500021e-05, + "loss": 1.8338, + "step": 10930 + }, + { + "epoch": 3.3551258440761202, + "grad_norm": 0.29536551237106323, + "learning_rate": 7.746206611381462e-05, + "loss": 1.8732, + "step": 10931 + }, + { + "epoch": 3.3554327808471456, + "grad_norm": 0.3169345259666443, + "learning_rate": 7.745791226123278e-05, + "loss": 1.876, + "step": 10932 + }, + { + "epoch": 
3.3557397176181705, + "grad_norm": 0.2680271565914154, + "learning_rate": 7.745375813729576e-05, + "loss": 1.7347, + "step": 10933 + }, + { + "epoch": 3.356046654389196, + "grad_norm": 0.28339266777038574, + "learning_rate": 7.74496037420446e-05, + "loss": 1.8507, + "step": 10934 + }, + { + "epoch": 3.356353591160221, + "grad_norm": 0.2567409574985504, + "learning_rate": 7.744544907552038e-05, + "loss": 1.8244, + "step": 10935 + }, + { + "epoch": 3.356660527931246, + "grad_norm": 0.266063928604126, + "learning_rate": 7.744129413776416e-05, + "loss": 1.7864, + "step": 10936 + }, + { + "epoch": 3.3569674647022714, + "grad_norm": 0.2490999698638916, + "learning_rate": 7.743713892881696e-05, + "loss": 1.7637, + "step": 10937 + }, + { + "epoch": 3.3572744014732967, + "grad_norm": 0.25857025384902954, + "learning_rate": 7.743298344871988e-05, + "loss": 1.8101, + "step": 10938 + }, + { + "epoch": 3.3575813382443216, + "grad_norm": 0.2549006938934326, + "learning_rate": 7.742882769751398e-05, + "loss": 1.7782, + "step": 10939 + }, + { + "epoch": 3.357888275015347, + "grad_norm": 0.23915350437164307, + "learning_rate": 7.742467167524035e-05, + "loss": 1.7822, + "step": 10940 + }, + { + "epoch": 3.358195211786372, + "grad_norm": 0.25501590967178345, + "learning_rate": 7.742051538194e-05, + "loss": 1.798, + "step": 10941 + }, + { + "epoch": 3.358502148557397, + "grad_norm": 0.29332005977630615, + "learning_rate": 7.741635881765408e-05, + "loss": 1.8334, + "step": 10942 + }, + { + "epoch": 3.3588090853284225, + "grad_norm": 0.28878241777420044, + "learning_rate": 7.741220198242362e-05, + "loss": 1.8266, + "step": 10943 + }, + { + "epoch": 3.3591160220994474, + "grad_norm": 0.3068650960922241, + "learning_rate": 7.740804487628971e-05, + "loss": 1.8562, + "step": 10944 + }, + { + "epoch": 3.3594229588704727, + "grad_norm": 0.2522405683994293, + "learning_rate": 7.740388749929343e-05, + "loss": 1.8001, + "step": 10945 + }, + { + "epoch": 3.359729895641498, + "grad_norm": 
0.3073521554470062, + "learning_rate": 7.739972985147588e-05, + "loss": 1.7454, + "step": 10946 + }, + { + "epoch": 3.360036832412523, + "grad_norm": 0.3018052577972412, + "learning_rate": 7.739557193287815e-05, + "loss": 1.7888, + "step": 10947 + }, + { + "epoch": 3.3603437691835483, + "grad_norm": 0.2738604247570038, + "learning_rate": 7.73914137435413e-05, + "loss": 1.7208, + "step": 10948 + }, + { + "epoch": 3.3606507059545736, + "grad_norm": 0.37699586153030396, + "learning_rate": 7.738725528350646e-05, + "loss": 1.8175, + "step": 10949 + }, + { + "epoch": 3.3609576427255985, + "grad_norm": 0.3479778468608856, + "learning_rate": 7.738309655281471e-05, + "loss": 1.818, + "step": 10950 + }, + { + "epoch": 3.361264579496624, + "grad_norm": 0.24871166050434113, + "learning_rate": 7.737893755150715e-05, + "loss": 1.7046, + "step": 10951 + }, + { + "epoch": 3.3615715162676487, + "grad_norm": 0.45015642046928406, + "learning_rate": 7.737477827962488e-05, + "loss": 1.8517, + "step": 10952 + }, + { + "epoch": 3.361878453038674, + "grad_norm": 0.4149077534675598, + "learning_rate": 7.7370618737209e-05, + "loss": 1.7403, + "step": 10953 + }, + { + "epoch": 3.3621853898096994, + "grad_norm": 0.2556059658527374, + "learning_rate": 7.736645892430064e-05, + "loss": 1.8167, + "step": 10954 + }, + { + "epoch": 3.3624923265807243, + "grad_norm": 0.3153657615184784, + "learning_rate": 7.736229884094088e-05, + "loss": 1.8471, + "step": 10955 + }, + { + "epoch": 3.3627992633517496, + "grad_norm": 0.27943772077560425, + "learning_rate": 7.735813848717084e-05, + "loss": 1.7742, + "step": 10956 + }, + { + "epoch": 3.3631062001227745, + "grad_norm": 0.28270283341407776, + "learning_rate": 7.735397786303164e-05, + "loss": 1.8418, + "step": 10957 + }, + { + "epoch": 3.3634131368938, + "grad_norm": 0.3596261441707611, + "learning_rate": 7.734981696856442e-05, + "loss": 1.8213, + "step": 10958 + }, + { + "epoch": 3.363720073664825, + "grad_norm": 0.3678492307662964, + "learning_rate": 
7.734565580381026e-05, + "loss": 1.806, + "step": 10959 + }, + { + "epoch": 3.36402701043585, + "grad_norm": 0.27758681774139404, + "learning_rate": 7.734149436881031e-05, + "loss": 1.7832, + "step": 10960 + }, + { + "epoch": 3.3643339472068754, + "grad_norm": 0.2821379005908966, + "learning_rate": 7.733733266360568e-05, + "loss": 1.8888, + "step": 10961 + }, + { + "epoch": 3.3646408839779007, + "grad_norm": 0.33676958084106445, + "learning_rate": 7.733317068823751e-05, + "loss": 1.902, + "step": 10962 + }, + { + "epoch": 3.3649478207489256, + "grad_norm": 0.3116114139556885, + "learning_rate": 7.732900844274691e-05, + "loss": 1.8228, + "step": 10963 + }, + { + "epoch": 3.365254757519951, + "grad_norm": 0.3286324143409729, + "learning_rate": 7.732484592717506e-05, + "loss": 1.8707, + "step": 10964 + }, + { + "epoch": 3.3655616942909763, + "grad_norm": 0.2732192873954773, + "learning_rate": 7.732068314156304e-05, + "loss": 1.773, + "step": 10965 + }, + { + "epoch": 3.365868631062001, + "grad_norm": 0.26663896441459656, + "learning_rate": 7.731652008595204e-05, + "loss": 1.7837, + "step": 10966 + }, + { + "epoch": 3.3661755678330265, + "grad_norm": 0.27447745203971863, + "learning_rate": 7.731235676038317e-05, + "loss": 1.9103, + "step": 10967 + }, + { + "epoch": 3.3664825046040514, + "grad_norm": 0.30832916498184204, + "learning_rate": 7.730819316489757e-05, + "loss": 1.7552, + "step": 10968 + }, + { + "epoch": 3.3667894413750767, + "grad_norm": 0.29657161235809326, + "learning_rate": 7.73040292995364e-05, + "loss": 1.7654, + "step": 10969 + }, + { + "epoch": 3.367096378146102, + "grad_norm": 0.30434274673461914, + "learning_rate": 7.729986516434082e-05, + "loss": 1.8646, + "step": 10970 + }, + { + "epoch": 3.367403314917127, + "grad_norm": 0.25926661491394043, + "learning_rate": 7.729570075935198e-05, + "loss": 1.7555, + "step": 10971 + }, + { + "epoch": 3.3677102516881523, + "grad_norm": 0.2775980532169342, + "learning_rate": 7.729153608461102e-05, + "loss": 
1.8427, + "step": 10972 + }, + { + "epoch": 3.368017188459177, + "grad_norm": 0.23915666341781616, + "learning_rate": 7.72873711401591e-05, + "loss": 1.7902, + "step": 10973 + }, + { + "epoch": 3.3683241252302025, + "grad_norm": 0.2603691518306732, + "learning_rate": 7.728320592603737e-05, + "loss": 1.8587, + "step": 10974 + }, + { + "epoch": 3.368631062001228, + "grad_norm": 0.2579508125782013, + "learning_rate": 7.727904044228703e-05, + "loss": 1.7617, + "step": 10975 + }, + { + "epoch": 3.3689379987722528, + "grad_norm": 0.3384297788143158, + "learning_rate": 7.72748746889492e-05, + "loss": 1.8499, + "step": 10976 + }, + { + "epoch": 3.369244935543278, + "grad_norm": 0.36756646633148193, + "learning_rate": 7.727070866606509e-05, + "loss": 1.808, + "step": 10977 + }, + { + "epoch": 3.3695518723143034, + "grad_norm": 0.3212372958660126, + "learning_rate": 7.726654237367587e-05, + "loss": 1.8245, + "step": 10978 + }, + { + "epoch": 3.3698588090853283, + "grad_norm": 0.23782415688037872, + "learning_rate": 7.726237581182267e-05, + "loss": 1.7629, + "step": 10979 + }, + { + "epoch": 3.3701657458563536, + "grad_norm": 0.2782919108867645, + "learning_rate": 7.725820898054669e-05, + "loss": 1.8, + "step": 10980 + }, + { + "epoch": 3.370472682627379, + "grad_norm": 0.2973455488681793, + "learning_rate": 7.725404187988914e-05, + "loss": 1.7949, + "step": 10981 + }, + { + "epoch": 3.370779619398404, + "grad_norm": 0.2875392735004425, + "learning_rate": 7.724987450989114e-05, + "loss": 1.8019, + "step": 10982 + }, + { + "epoch": 3.371086556169429, + "grad_norm": 0.26133236289024353, + "learning_rate": 7.724570687059394e-05, + "loss": 1.7984, + "step": 10983 + }, + { + "epoch": 3.371393492940454, + "grad_norm": 0.2760173976421356, + "learning_rate": 7.724153896203867e-05, + "loss": 1.8082, + "step": 10984 + }, + { + "epoch": 3.3717004297114794, + "grad_norm": 0.26373061537742615, + "learning_rate": 7.723737078426656e-05, + "loss": 1.8408, + "step": 10985 + }, + { + "epoch": 
3.3720073664825048, + "grad_norm": 0.29425618052482605, + "learning_rate": 7.723320233731879e-05, + "loss": 1.7992, + "step": 10986 + }, + { + "epoch": 3.3723143032535297, + "grad_norm": 0.29822099208831787, + "learning_rate": 7.722903362123655e-05, + "loss": 1.8204, + "step": 10987 + }, + { + "epoch": 3.372621240024555, + "grad_norm": 0.25945618748664856, + "learning_rate": 7.722486463606104e-05, + "loss": 1.7376, + "step": 10988 + }, + { + "epoch": 3.37292817679558, + "grad_norm": 0.26367196440696716, + "learning_rate": 7.722069538183345e-05, + "loss": 1.814, + "step": 10989 + }, + { + "epoch": 3.373235113566605, + "grad_norm": 0.25015249848365784, + "learning_rate": 7.7216525858595e-05, + "loss": 1.8199, + "step": 10990 + }, + { + "epoch": 3.3735420503376305, + "grad_norm": 0.3035781681537628, + "learning_rate": 7.72123560663869e-05, + "loss": 1.739, + "step": 10991 + }, + { + "epoch": 3.3738489871086554, + "grad_norm": 0.2847912013530731, + "learning_rate": 7.720818600525033e-05, + "loss": 1.8754, + "step": 10992 + }, + { + "epoch": 3.3741559238796808, + "grad_norm": 0.2533976435661316, + "learning_rate": 7.720401567522653e-05, + "loss": 1.7616, + "step": 10993 + }, + { + "epoch": 3.374462860650706, + "grad_norm": 0.250828355550766, + "learning_rate": 7.719984507635669e-05, + "loss": 1.7973, + "step": 10994 + }, + { + "epoch": 3.374769797421731, + "grad_norm": 0.3019898235797882, + "learning_rate": 7.719567420868206e-05, + "loss": 1.7563, + "step": 10995 + }, + { + "epoch": 3.3750767341927563, + "grad_norm": 0.2703310549259186, + "learning_rate": 7.719150307224382e-05, + "loss": 1.8183, + "step": 10996 + }, + { + "epoch": 3.3753836709637817, + "grad_norm": 0.2434745579957962, + "learning_rate": 7.718733166708321e-05, + "loss": 1.7913, + "step": 10997 + }, + { + "epoch": 3.3756906077348066, + "grad_norm": 0.28036773204803467, + "learning_rate": 7.718315999324146e-05, + "loss": 1.7884, + "step": 10998 + }, + { + "epoch": 3.375997544505832, + "grad_norm": 
0.25123077630996704, + "learning_rate": 7.717898805075978e-05, + "loss": 1.7394, + "step": 10999 + }, + { + "epoch": 3.376304481276857, + "grad_norm": 0.2313947230577469, + "learning_rate": 7.717481583967943e-05, + "loss": 1.7537, + "step": 11000 + }, + { + "epoch": 3.376611418047882, + "grad_norm": 0.27152860164642334, + "learning_rate": 7.71706433600416e-05, + "loss": 1.8596, + "step": 11001 + }, + { + "epoch": 3.3769183548189075, + "grad_norm": 0.32866382598876953, + "learning_rate": 7.716647061188757e-05, + "loss": 1.9007, + "step": 11002 + }, + { + "epoch": 3.3772252915899323, + "grad_norm": 0.2842368185520172, + "learning_rate": 7.716229759525854e-05, + "loss": 1.7781, + "step": 11003 + }, + { + "epoch": 3.3775322283609577, + "grad_norm": 0.30411216616630554, + "learning_rate": 7.715812431019576e-05, + "loss": 1.7403, + "step": 11004 + }, + { + "epoch": 3.3778391651319826, + "grad_norm": 0.31848132610321045, + "learning_rate": 7.71539507567405e-05, + "loss": 1.817, + "step": 11005 + }, + { + "epoch": 3.378146101903008, + "grad_norm": 0.24206148087978363, + "learning_rate": 7.714977693493397e-05, + "loss": 1.7796, + "step": 11006 + }, + { + "epoch": 3.3784530386740332, + "grad_norm": 0.2982998490333557, + "learning_rate": 7.714560284481742e-05, + "loss": 1.7883, + "step": 11007 + }, + { + "epoch": 3.378759975445058, + "grad_norm": 0.24857483804225922, + "learning_rate": 7.714142848643213e-05, + "loss": 1.7447, + "step": 11008 + }, + { + "epoch": 3.3790669122160835, + "grad_norm": 0.2509039044380188, + "learning_rate": 7.713725385981932e-05, + "loss": 1.8362, + "step": 11009 + }, + { + "epoch": 3.379373848987109, + "grad_norm": 0.2759779095649719, + "learning_rate": 7.713307896502027e-05, + "loss": 1.8655, + "step": 11010 + }, + { + "epoch": 3.3796807857581337, + "grad_norm": 0.264776349067688, + "learning_rate": 7.712890380207623e-05, + "loss": 1.8221, + "step": 11011 + }, + { + "epoch": 3.379987722529159, + "grad_norm": 0.2771971821784973, + "learning_rate": 
7.712472837102846e-05, + "loss": 1.6992, + "step": 11012 + }, + { + "epoch": 3.3802946593001844, + "grad_norm": 0.2749316096305847, + "learning_rate": 7.712055267191822e-05, + "loss": 1.8128, + "step": 11013 + }, + { + "epoch": 3.3806015960712092, + "grad_norm": 0.256656289100647, + "learning_rate": 7.71163767047868e-05, + "loss": 1.8382, + "step": 11014 + }, + { + "epoch": 3.3809085328422346, + "grad_norm": 0.27646976709365845, + "learning_rate": 7.711220046967545e-05, + "loss": 1.8321, + "step": 11015 + }, + { + "epoch": 3.3812154696132595, + "grad_norm": 0.3083149194717407, + "learning_rate": 7.710802396662542e-05, + "loss": 1.904, + "step": 11016 + }, + { + "epoch": 3.381522406384285, + "grad_norm": 0.2750856280326843, + "learning_rate": 7.710384719567803e-05, + "loss": 1.7596, + "step": 11017 + }, + { + "epoch": 3.38182934315531, + "grad_norm": 0.3029455244541168, + "learning_rate": 7.709967015687452e-05, + "loss": 1.8542, + "step": 11018 + }, + { + "epoch": 3.382136279926335, + "grad_norm": 0.3144093453884125, + "learning_rate": 7.709549285025622e-05, + "loss": 1.7489, + "step": 11019 + }, + { + "epoch": 3.3824432166973604, + "grad_norm": 0.2675442099571228, + "learning_rate": 7.709131527586433e-05, + "loss": 1.7324, + "step": 11020 + }, + { + "epoch": 3.3827501534683857, + "grad_norm": 0.2906095087528229, + "learning_rate": 7.708713743374021e-05, + "loss": 1.7848, + "step": 11021 + }, + { + "epoch": 3.3830570902394106, + "grad_norm": 0.25141623616218567, + "learning_rate": 7.708295932392513e-05, + "loss": 1.7423, + "step": 11022 + }, + { + "epoch": 3.383364027010436, + "grad_norm": 0.25832003355026245, + "learning_rate": 7.707878094646037e-05, + "loss": 1.7792, + "step": 11023 + }, + { + "epoch": 3.3836709637814613, + "grad_norm": 0.23710070550441742, + "learning_rate": 7.70746023013872e-05, + "loss": 1.7916, + "step": 11024 + }, + { + "epoch": 3.383977900552486, + "grad_norm": 0.286735862493515, + "learning_rate": 7.707042338874697e-05, + "loss": 1.8272, + 
"step": 11025 + }, + { + "epoch": 3.3842848373235115, + "grad_norm": 0.2536577582359314, + "learning_rate": 7.706624420858094e-05, + "loss": 1.7839, + "step": 11026 + }, + { + "epoch": 3.3845917740945364, + "grad_norm": 0.5564702749252319, + "learning_rate": 7.706206476093043e-05, + "loss": 1.7832, + "step": 11027 + }, + { + "epoch": 3.3848987108655617, + "grad_norm": 0.34694772958755493, + "learning_rate": 7.705788504583671e-05, + "loss": 1.8668, + "step": 11028 + }, + { + "epoch": 3.385205647636587, + "grad_norm": 0.30388176441192627, + "learning_rate": 7.705370506334113e-05, + "loss": 1.8244, + "step": 11029 + }, + { + "epoch": 3.385512584407612, + "grad_norm": 0.2998919188976288, + "learning_rate": 7.704952481348497e-05, + "loss": 1.7927, + "step": 11030 + }, + { + "epoch": 3.3858195211786373, + "grad_norm": 0.2714936435222626, + "learning_rate": 7.704534429630955e-05, + "loss": 1.8757, + "step": 11031 + }, + { + "epoch": 3.386126457949662, + "grad_norm": 0.26670241355895996, + "learning_rate": 7.704116351185619e-05, + "loss": 1.8146, + "step": 11032 + }, + { + "epoch": 3.3864333947206875, + "grad_norm": 0.2500552833080292, + "learning_rate": 7.703698246016621e-05, + "loss": 1.7984, + "step": 11033 + }, + { + "epoch": 3.386740331491713, + "grad_norm": 0.2494918406009674, + "learning_rate": 7.703280114128091e-05, + "loss": 1.7433, + "step": 11034 + }, + { + "epoch": 3.3870472682627377, + "grad_norm": 0.25658491253852844, + "learning_rate": 7.702861955524163e-05, + "loss": 1.8487, + "step": 11035 + }, + { + "epoch": 3.387354205033763, + "grad_norm": 0.2871410548686981, + "learning_rate": 7.702443770208969e-05, + "loss": 1.7919, + "step": 11036 + }, + { + "epoch": 3.3876611418047884, + "grad_norm": 0.3347938060760498, + "learning_rate": 7.702025558186643e-05, + "loss": 1.8091, + "step": 11037 + }, + { + "epoch": 3.3879680785758133, + "grad_norm": 0.39016643166542053, + "learning_rate": 7.701607319461315e-05, + "loss": 1.7816, + "step": 11038 + }, + { + "epoch": 
3.3882750153468386, + "grad_norm": 0.3423028290271759, + "learning_rate": 7.701189054037121e-05, + "loss": 1.8454, + "step": 11039 + }, + { + "epoch": 3.388581952117864, + "grad_norm": 0.27592089772224426, + "learning_rate": 7.700770761918192e-05, + "loss": 1.8431, + "step": 11040 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.46047264337539673, + "learning_rate": 7.700352443108665e-05, + "loss": 1.8412, + "step": 11041 + }, + { + "epoch": 3.389195825659914, + "grad_norm": 0.49226754903793335, + "learning_rate": 7.699934097612673e-05, + "loss": 1.8212, + "step": 11042 + }, + { + "epoch": 3.389502762430939, + "grad_norm": 0.3958778381347656, + "learning_rate": 7.699515725434348e-05, + "loss": 1.747, + "step": 11043 + }, + { + "epoch": 3.3898096992019644, + "grad_norm": 0.26097169518470764, + "learning_rate": 7.699097326577827e-05, + "loss": 1.7631, + "step": 11044 + }, + { + "epoch": 3.3901166359729897, + "grad_norm": 0.2922612130641937, + "learning_rate": 7.698678901047245e-05, + "loss": 1.7891, + "step": 11045 + }, + { + "epoch": 3.3904235727440146, + "grad_norm": 0.4195055365562439, + "learning_rate": 7.698260448846734e-05, + "loss": 1.7765, + "step": 11046 + }, + { + "epoch": 3.39073050951504, + "grad_norm": 0.4572988450527191, + "learning_rate": 7.697841969980434e-05, + "loss": 1.8085, + "step": 11047 + }, + { + "epoch": 3.391037446286065, + "grad_norm": 0.38819587230682373, + "learning_rate": 7.697423464452478e-05, + "loss": 1.8854, + "step": 11048 + }, + { + "epoch": 3.39134438305709, + "grad_norm": 0.27421653270721436, + "learning_rate": 7.697004932267003e-05, + "loss": 1.8327, + "step": 11049 + }, + { + "epoch": 3.3916513198281155, + "grad_norm": 0.33559146523475647, + "learning_rate": 7.696586373428142e-05, + "loss": 1.8109, + "step": 11050 + }, + { + "epoch": 3.3919582565991404, + "grad_norm": 0.39438655972480774, + "learning_rate": 7.696167787940037e-05, + "loss": 1.7909, + "step": 11051 + }, + { + "epoch": 3.3922651933701657, + "grad_norm": 
0.3425842523574829, + "learning_rate": 7.695749175806819e-05, + "loss": 1.8571, + "step": 11052 + }, + { + "epoch": 3.392572130141191, + "grad_norm": 0.2860080301761627, + "learning_rate": 7.695330537032628e-05, + "loss": 1.8546, + "step": 11053 + }, + { + "epoch": 3.392879066912216, + "grad_norm": 0.35894665122032166, + "learning_rate": 7.694911871621601e-05, + "loss": 1.7895, + "step": 11054 + }, + { + "epoch": 3.3931860036832413, + "grad_norm": 0.351193904876709, + "learning_rate": 7.694493179577879e-05, + "loss": 1.7453, + "step": 11055 + }, + { + "epoch": 3.3934929404542666, + "grad_norm": 0.24812865257263184, + "learning_rate": 7.694074460905592e-05, + "loss": 1.8131, + "step": 11056 + }, + { + "epoch": 3.3937998772252915, + "grad_norm": 0.38620972633361816, + "learning_rate": 7.693655715608883e-05, + "loss": 1.8346, + "step": 11057 + }, + { + "epoch": 3.394106813996317, + "grad_norm": 0.5005692839622498, + "learning_rate": 7.69323694369189e-05, + "loss": 1.9031, + "step": 11058 + }, + { + "epoch": 3.3944137507673418, + "grad_norm": 0.4321887791156769, + "learning_rate": 7.692818145158751e-05, + "loss": 1.8783, + "step": 11059 + }, + { + "epoch": 3.394720687538367, + "grad_norm": 0.269307017326355, + "learning_rate": 7.692399320013603e-05, + "loss": 1.8075, + "step": 11060 + }, + { + "epoch": 3.3950276243093924, + "grad_norm": 0.2945556342601776, + "learning_rate": 7.69198046826059e-05, + "loss": 1.8366, + "step": 11061 + }, + { + "epoch": 3.3953345610804173, + "grad_norm": 0.30531853437423706, + "learning_rate": 7.691561589903847e-05, + "loss": 1.7665, + "step": 11062 + }, + { + "epoch": 3.3956414978514426, + "grad_norm": 0.25105199217796326, + "learning_rate": 7.691142684947513e-05, + "loss": 1.782, + "step": 11063 + }, + { + "epoch": 3.3959484346224675, + "grad_norm": 0.3373202085494995, + "learning_rate": 7.69072375339573e-05, + "loss": 1.8148, + "step": 11064 + }, + { + "epoch": 3.396255371393493, + "grad_norm": 0.34207093715667725, + "learning_rate": 
7.690304795252638e-05, + "loss": 1.8287, + "step": 11065 + }, + { + "epoch": 3.396562308164518, + "grad_norm": 0.26281681656837463, + "learning_rate": 7.68988581052238e-05, + "loss": 1.8551, + "step": 11066 + }, + { + "epoch": 3.396869244935543, + "grad_norm": 0.3091152608394623, + "learning_rate": 7.689466799209091e-05, + "loss": 1.7689, + "step": 11067 + }, + { + "epoch": 3.3971761817065684, + "grad_norm": 0.37421298027038574, + "learning_rate": 7.689047761316914e-05, + "loss": 1.7908, + "step": 11068 + }, + { + "epoch": 3.3974831184775938, + "grad_norm": 0.3745511770248413, + "learning_rate": 7.688628696849993e-05, + "loss": 1.8408, + "step": 11069 + }, + { + "epoch": 3.3977900552486187, + "grad_norm": 0.3003663122653961, + "learning_rate": 7.688209605812467e-05, + "loss": 1.9109, + "step": 11070 + }, + { + "epoch": 3.398096992019644, + "grad_norm": 0.3437681496143341, + "learning_rate": 7.687790488208478e-05, + "loss": 1.811, + "step": 11071 + }, + { + "epoch": 3.3984039287906693, + "grad_norm": 0.3480641841888428, + "learning_rate": 7.687371344042168e-05, + "loss": 1.8114, + "step": 11072 + }, + { + "epoch": 3.398710865561694, + "grad_norm": 0.24670913815498352, + "learning_rate": 7.686952173317679e-05, + "loss": 1.7959, + "step": 11073 + }, + { + "epoch": 3.3990178023327196, + "grad_norm": 0.2939499020576477, + "learning_rate": 7.686532976039154e-05, + "loss": 1.7518, + "step": 11074 + }, + { + "epoch": 3.3993247391037444, + "grad_norm": 0.3332279622554779, + "learning_rate": 7.686113752210736e-05, + "loss": 1.843, + "step": 11075 + }, + { + "epoch": 3.3996316758747698, + "grad_norm": 0.22967280447483063, + "learning_rate": 7.685694501836566e-05, + "loss": 1.7408, + "step": 11076 + }, + { + "epoch": 3.399938612645795, + "grad_norm": 0.3443470001220703, + "learning_rate": 7.685275224920789e-05, + "loss": 1.8004, + "step": 11077 + }, + { + "epoch": 3.40024554941682, + "grad_norm": 0.3725457489490509, + "learning_rate": 7.684855921467548e-05, + "loss": 1.833, + 
"step": 11078 + }, + { + "epoch": 3.4005524861878453, + "grad_norm": 0.3178638219833374, + "learning_rate": 7.68443659148099e-05, + "loss": 1.8055, + "step": 11079 + }, + { + "epoch": 3.4008594229588702, + "grad_norm": 0.2609167695045471, + "learning_rate": 7.684017234965254e-05, + "loss": 1.7881, + "step": 11080 + }, + { + "epoch": 3.4011663597298956, + "grad_norm": 0.26975762844085693, + "learning_rate": 7.683597851924486e-05, + "loss": 1.8424, + "step": 11081 + }, + { + "epoch": 3.401473296500921, + "grad_norm": 0.266661673784256, + "learning_rate": 7.683178442362832e-05, + "loss": 1.7785, + "step": 11082 + }, + { + "epoch": 3.401780233271946, + "grad_norm": 0.27915671467781067, + "learning_rate": 7.682759006284436e-05, + "loss": 1.8241, + "step": 11083 + }, + { + "epoch": 3.402087170042971, + "grad_norm": 0.25167274475097656, + "learning_rate": 7.682339543693444e-05, + "loss": 1.7637, + "step": 11084 + }, + { + "epoch": 3.4023941068139965, + "grad_norm": 0.2439529299736023, + "learning_rate": 7.681920054593999e-05, + "loss": 1.7796, + "step": 11085 + }, + { + "epoch": 3.4027010435850213, + "grad_norm": 0.26224252581596375, + "learning_rate": 7.681500538990249e-05, + "loss": 1.8018, + "step": 11086 + }, + { + "epoch": 3.4030079803560467, + "grad_norm": 0.25093868374824524, + "learning_rate": 7.681080996886336e-05, + "loss": 1.7664, + "step": 11087 + }, + { + "epoch": 3.403314917127072, + "grad_norm": 0.26393210887908936, + "learning_rate": 7.680661428286413e-05, + "loss": 1.8389, + "step": 11088 + }, + { + "epoch": 3.403621853898097, + "grad_norm": 0.24750283360481262, + "learning_rate": 7.680241833194622e-05, + "loss": 1.8358, + "step": 11089 + }, + { + "epoch": 3.4039287906691222, + "grad_norm": 0.21568982303142548, + "learning_rate": 7.67982221161511e-05, + "loss": 1.7874, + "step": 11090 + }, + { + "epoch": 3.404235727440147, + "grad_norm": 0.24407126009464264, + "learning_rate": 7.679402563552023e-05, + "loss": 1.7753, + "step": 11091 + }, + { + "epoch": 
3.4045426642111725, + "grad_norm": 0.23288260400295258, + "learning_rate": 7.67898288900951e-05, + "loss": 1.8046, + "step": 11092 + }, + { + "epoch": 3.404849600982198, + "grad_norm": 0.2548544108867645, + "learning_rate": 7.678563187991718e-05, + "loss": 1.8778, + "step": 11093 + }, + { + "epoch": 3.4051565377532227, + "grad_norm": 0.24008090794086456, + "learning_rate": 7.678143460502796e-05, + "loss": 1.7912, + "step": 11094 + }, + { + "epoch": 3.405463474524248, + "grad_norm": 0.26085031032562256, + "learning_rate": 7.677723706546889e-05, + "loss": 1.849, + "step": 11095 + }, + { + "epoch": 3.4057704112952734, + "grad_norm": 0.2830932140350342, + "learning_rate": 7.677303926128147e-05, + "loss": 1.8265, + "step": 11096 + }, + { + "epoch": 3.4060773480662982, + "grad_norm": 0.27593597769737244, + "learning_rate": 7.676884119250718e-05, + "loss": 1.8555, + "step": 11097 + }, + { + "epoch": 3.4063842848373236, + "grad_norm": 0.2403372824192047, + "learning_rate": 7.676464285918751e-05, + "loss": 1.7243, + "step": 11098 + }, + { + "epoch": 3.406691221608349, + "grad_norm": 0.28830090165138245, + "learning_rate": 7.676044426136397e-05, + "loss": 1.8108, + "step": 11099 + }, + { + "epoch": 3.406998158379374, + "grad_norm": 0.2918153405189514, + "learning_rate": 7.675624539907802e-05, + "loss": 1.7875, + "step": 11100 + }, + { + "epoch": 3.407305095150399, + "grad_norm": 0.2609013020992279, + "learning_rate": 7.675204627237117e-05, + "loss": 1.778, + "step": 11101 + }, + { + "epoch": 3.407612031921424, + "grad_norm": 0.2714763283729553, + "learning_rate": 7.674784688128494e-05, + "loss": 1.8472, + "step": 11102 + }, + { + "epoch": 3.4079189686924494, + "grad_norm": 0.25857117772102356, + "learning_rate": 7.674364722586078e-05, + "loss": 1.7495, + "step": 11103 + }, + { + "epoch": 3.4082259054634747, + "grad_norm": 0.25485143065452576, + "learning_rate": 7.673944730614023e-05, + "loss": 1.7817, + "step": 11104 + }, + { + "epoch": 3.4085328422344996, + "grad_norm": 
0.2735857665538788, + "learning_rate": 7.67352471221648e-05, + "loss": 1.7522, + "step": 11105 + }, + { + "epoch": 3.408839779005525, + "grad_norm": 0.25079572200775146, + "learning_rate": 7.6731046673976e-05, + "loss": 1.765, + "step": 11106 + }, + { + "epoch": 3.40914671577655, + "grad_norm": 0.3080148696899414, + "learning_rate": 7.672684596161532e-05, + "loss": 1.8305, + "step": 11107 + }, + { + "epoch": 3.409453652547575, + "grad_norm": 0.23771968483924866, + "learning_rate": 7.672264498512427e-05, + "loss": 1.7837, + "step": 11108 + }, + { + "epoch": 3.4097605893186005, + "grad_norm": 0.29941999912261963, + "learning_rate": 7.671844374454437e-05, + "loss": 1.8013, + "step": 11109 + }, + { + "epoch": 3.4100675260896254, + "grad_norm": 0.27871644496917725, + "learning_rate": 7.671424223991717e-05, + "loss": 1.8598, + "step": 11110 + }, + { + "epoch": 3.4103744628606507, + "grad_norm": 0.2751443684101105, + "learning_rate": 7.671004047128416e-05, + "loss": 1.8341, + "step": 11111 + }, + { + "epoch": 3.410681399631676, + "grad_norm": 0.27227312326431274, + "learning_rate": 7.670583843868688e-05, + "loss": 1.81, + "step": 11112 + }, + { + "epoch": 3.410988336402701, + "grad_norm": 0.29617756605148315, + "learning_rate": 7.670163614216685e-05, + "loss": 1.8795, + "step": 11113 + }, + { + "epoch": 3.4112952731737263, + "grad_norm": 0.268920361995697, + "learning_rate": 7.669743358176563e-05, + "loss": 1.7659, + "step": 11114 + }, + { + "epoch": 3.4116022099447516, + "grad_norm": 0.2875109314918518, + "learning_rate": 7.669323075752467e-05, + "loss": 1.8263, + "step": 11115 + }, + { + "epoch": 3.4119091467157765, + "grad_norm": 0.34703585505485535, + "learning_rate": 7.668902766948558e-05, + "loss": 1.7622, + "step": 11116 + }, + { + "epoch": 3.412216083486802, + "grad_norm": 0.3090265393257141, + "learning_rate": 7.668482431768989e-05, + "loss": 1.7381, + "step": 11117 + }, + { + "epoch": 3.4125230202578267, + "grad_norm": 0.2619737684726715, + "learning_rate": 
7.668062070217911e-05, + "loss": 1.8004, + "step": 11118 + }, + { + "epoch": 3.412829957028852, + "grad_norm": 0.289815217256546, + "learning_rate": 7.667641682299482e-05, + "loss": 1.7946, + "step": 11119 + }, + { + "epoch": 3.4131368937998774, + "grad_norm": 0.28732073307037354, + "learning_rate": 7.667221268017852e-05, + "loss": 1.8746, + "step": 11120 + }, + { + "epoch": 3.4134438305709023, + "grad_norm": 0.23232576251029968, + "learning_rate": 7.666800827377178e-05, + "loss": 1.7403, + "step": 11121 + }, + { + "epoch": 3.4137507673419276, + "grad_norm": 0.22903507947921753, + "learning_rate": 7.666380360381616e-05, + "loss": 1.7785, + "step": 11122 + }, + { + "epoch": 3.4140577041129525, + "grad_norm": 0.25023025274276733, + "learning_rate": 7.665959867035321e-05, + "loss": 1.7881, + "step": 11123 + }, + { + "epoch": 3.414364640883978, + "grad_norm": 0.2199166864156723, + "learning_rate": 7.665539347342449e-05, + "loss": 1.7522, + "step": 11124 + }, + { + "epoch": 3.414671577655003, + "grad_norm": 0.2539862394332886, + "learning_rate": 7.665118801307152e-05, + "loss": 1.7964, + "step": 11125 + }, + { + "epoch": 3.414978514426028, + "grad_norm": 0.22670161724090576, + "learning_rate": 7.664698228933591e-05, + "loss": 1.7071, + "step": 11126 + }, + { + "epoch": 3.4152854511970534, + "grad_norm": 0.24827396869659424, + "learning_rate": 7.664277630225919e-05, + "loss": 1.7897, + "step": 11127 + }, + { + "epoch": 3.4155923879680787, + "grad_norm": 0.29391366243362427, + "learning_rate": 7.663857005188296e-05, + "loss": 1.7967, + "step": 11128 + }, + { + "epoch": 3.4158993247391036, + "grad_norm": 0.3201812505722046, + "learning_rate": 7.663436353824874e-05, + "loss": 1.7681, + "step": 11129 + }, + { + "epoch": 3.416206261510129, + "grad_norm": 0.2274552583694458, + "learning_rate": 7.663015676139814e-05, + "loss": 1.7535, + "step": 11130 + }, + { + "epoch": 3.4165131982811543, + "grad_norm": 0.3955044150352478, + "learning_rate": 7.662594972137273e-05, + "loss": 
1.8175, + "step": 11131 + }, + { + "epoch": 3.416820135052179, + "grad_norm": 0.46493569016456604, + "learning_rate": 7.662174241821406e-05, + "loss": 1.7806, + "step": 11132 + }, + { + "epoch": 3.4171270718232045, + "grad_norm": 0.37731611728668213, + "learning_rate": 7.661753485196375e-05, + "loss": 1.7555, + "step": 11133 + }, + { + "epoch": 3.4174340085942294, + "grad_norm": 0.23983556032180786, + "learning_rate": 7.661332702266334e-05, + "loss": 1.7662, + "step": 11134 + }, + { + "epoch": 3.4177409453652547, + "grad_norm": 0.34964314103126526, + "learning_rate": 7.660911893035445e-05, + "loss": 1.7786, + "step": 11135 + }, + { + "epoch": 3.41804788213628, + "grad_norm": 0.44820764660835266, + "learning_rate": 7.660491057507864e-05, + "loss": 1.778, + "step": 11136 + }, + { + "epoch": 3.418354818907305, + "grad_norm": 0.32936233282089233, + "learning_rate": 7.660070195687752e-05, + "loss": 1.8181, + "step": 11137 + }, + { + "epoch": 3.4186617556783303, + "grad_norm": 0.2874850332736969, + "learning_rate": 7.659649307579266e-05, + "loss": 1.8733, + "step": 11138 + }, + { + "epoch": 3.418968692449355, + "grad_norm": 0.46269866824150085, + "learning_rate": 7.659228393186566e-05, + "loss": 1.8566, + "step": 11139 + }, + { + "epoch": 3.4192756292203805, + "grad_norm": 0.5873839855194092, + "learning_rate": 7.658807452513816e-05, + "loss": 1.8317, + "step": 11140 + }, + { + "epoch": 3.419582565991406, + "grad_norm": 0.43150341510772705, + "learning_rate": 7.65838648556517e-05, + "loss": 1.7702, + "step": 11141 + }, + { + "epoch": 3.4198895027624308, + "grad_norm": 0.2803891599178314, + "learning_rate": 7.65796549234479e-05, + "loss": 1.8043, + "step": 11142 + }, + { + "epoch": 3.420196439533456, + "grad_norm": 0.37295013666152954, + "learning_rate": 7.657544472856838e-05, + "loss": 1.7923, + "step": 11143 + }, + { + "epoch": 3.4205033763044814, + "grad_norm": 0.3922573924064636, + "learning_rate": 7.657123427105473e-05, + "loss": 1.8231, + "step": 11144 + }, + { + 
"epoch": 3.4208103130755063, + "grad_norm": 0.27254152297973633, + "learning_rate": 7.656702355094859e-05, + "loss": 1.8168, + "step": 11145 + }, + { + "epoch": 3.4211172498465316, + "grad_norm": 0.28005337715148926, + "learning_rate": 7.656281256829152e-05, + "loss": 1.8047, + "step": 11146 + }, + { + "epoch": 3.421424186617557, + "grad_norm": 0.4369073808193207, + "learning_rate": 7.655860132312519e-05, + "loss": 1.7243, + "step": 11147 + }, + { + "epoch": 3.421731123388582, + "grad_norm": 0.4127553701400757, + "learning_rate": 7.655438981549119e-05, + "loss": 1.8148, + "step": 11148 + }, + { + "epoch": 3.422038060159607, + "grad_norm": 0.3131798207759857, + "learning_rate": 7.655017804543114e-05, + "loss": 1.789, + "step": 11149 + }, + { + "epoch": 3.422344996930632, + "grad_norm": 0.2947194576263428, + "learning_rate": 7.654596601298666e-05, + "loss": 1.8221, + "step": 11150 + }, + { + "epoch": 3.4226519337016574, + "grad_norm": 0.3072497546672821, + "learning_rate": 7.654175371819941e-05, + "loss": 1.7747, + "step": 11151 + }, + { + "epoch": 3.4229588704726828, + "grad_norm": 0.29408320784568787, + "learning_rate": 7.653754116111099e-05, + "loss": 1.9009, + "step": 11152 + }, + { + "epoch": 3.4232658072437077, + "grad_norm": 0.2629215717315674, + "learning_rate": 7.653332834176303e-05, + "loss": 1.7354, + "step": 11153 + }, + { + "epoch": 3.423572744014733, + "grad_norm": 0.2850257456302643, + "learning_rate": 7.652911526019716e-05, + "loss": 1.8422, + "step": 11154 + }, + { + "epoch": 3.423879680785758, + "grad_norm": 0.29787111282348633, + "learning_rate": 7.652490191645503e-05, + "loss": 1.8122, + "step": 11155 + }, + { + "epoch": 3.424186617556783, + "grad_norm": 0.2670947015285492, + "learning_rate": 7.652068831057826e-05, + "loss": 1.7734, + "step": 11156 + }, + { + "epoch": 3.4244935543278086, + "grad_norm": 0.26415133476257324, + "learning_rate": 7.651647444260853e-05, + "loss": 1.7661, + "step": 11157 + }, + { + "epoch": 3.424800491098834, + 
"grad_norm": 0.2614886164665222, + "learning_rate": 7.651226031258745e-05, + "loss": 1.6918, + "step": 11158 + }, + { + "epoch": 3.425107427869859, + "grad_norm": 0.28485649824142456, + "learning_rate": 7.650804592055667e-05, + "loss": 1.7771, + "step": 11159 + }, + { + "epoch": 3.425414364640884, + "grad_norm": 0.26080289483070374, + "learning_rate": 7.650383126655784e-05, + "loss": 1.7637, + "step": 11160 + }, + { + "epoch": 3.425721301411909, + "grad_norm": 0.2503695487976074, + "learning_rate": 7.649961635063261e-05, + "loss": 1.7864, + "step": 11161 + }, + { + "epoch": 3.4260282381829343, + "grad_norm": 0.3165570795536041, + "learning_rate": 7.649540117282263e-05, + "loss": 1.8107, + "step": 11162 + }, + { + "epoch": 3.4263351749539597, + "grad_norm": 0.28411731123924255, + "learning_rate": 7.649118573316959e-05, + "loss": 1.7557, + "step": 11163 + }, + { + "epoch": 3.4266421117249846, + "grad_norm": 0.24469570815563202, + "learning_rate": 7.648697003171512e-05, + "loss": 1.7597, + "step": 11164 + }, + { + "epoch": 3.42694904849601, + "grad_norm": 0.31968292593955994, + "learning_rate": 7.648275406850087e-05, + "loss": 1.7796, + "step": 11165 + }, + { + "epoch": 3.427255985267035, + "grad_norm": 0.24520765244960785, + "learning_rate": 7.647853784356856e-05, + "loss": 1.7931, + "step": 11166 + }, + { + "epoch": 3.42756292203806, + "grad_norm": 0.23946821689605713, + "learning_rate": 7.647432135695977e-05, + "loss": 1.7143, + "step": 11167 + }, + { + "epoch": 3.4278698588090855, + "grad_norm": 0.321455180644989, + "learning_rate": 7.647010460871624e-05, + "loss": 1.8682, + "step": 11168 + }, + { + "epoch": 3.4281767955801103, + "grad_norm": 0.2803197503089905, + "learning_rate": 7.646588759887964e-05, + "loss": 1.8, + "step": 11169 + }, + { + "epoch": 3.4284837323511357, + "grad_norm": 0.2597559988498688, + "learning_rate": 7.64616703274916e-05, + "loss": 1.8027, + "step": 11170 + }, + { + "epoch": 3.428790669122161, + "grad_norm": 0.25055503845214844, + 
"learning_rate": 7.645745279459384e-05, + "loss": 1.7659, + "step": 11171 + }, + { + "epoch": 3.429097605893186, + "grad_norm": 0.34582629799842834, + "learning_rate": 7.645323500022803e-05, + "loss": 1.7868, + "step": 11172 + }, + { + "epoch": 3.4294045426642112, + "grad_norm": 0.32845041155815125, + "learning_rate": 7.644901694443584e-05, + "loss": 1.8247, + "step": 11173 + }, + { + "epoch": 3.4297114794352366, + "grad_norm": 0.2570398449897766, + "learning_rate": 7.644479862725896e-05, + "loss": 1.7802, + "step": 11174 + }, + { + "epoch": 3.4300184162062615, + "grad_norm": 0.23117294907569885, + "learning_rate": 7.644058004873908e-05, + "loss": 1.7575, + "step": 11175 + }, + { + "epoch": 3.430325352977287, + "grad_norm": 0.2417830377817154, + "learning_rate": 7.64363612089179e-05, + "loss": 1.7954, + "step": 11176 + }, + { + "epoch": 3.4306322897483117, + "grad_norm": 0.249378964304924, + "learning_rate": 7.643214210783708e-05, + "loss": 1.8161, + "step": 11177 + }, + { + "epoch": 3.430939226519337, + "grad_norm": 0.24494746327400208, + "learning_rate": 7.642792274553836e-05, + "loss": 1.825, + "step": 11178 + }, + { + "epoch": 3.4312461632903624, + "grad_norm": 0.2663760185241699, + "learning_rate": 7.642370312206342e-05, + "loss": 1.7589, + "step": 11179 + }, + { + "epoch": 3.4315531000613873, + "grad_norm": 0.2819322645664215, + "learning_rate": 7.641948323745395e-05, + "loss": 1.8097, + "step": 11180 + }, + { + "epoch": 3.4318600368324126, + "grad_norm": 0.26917630434036255, + "learning_rate": 7.641526309175166e-05, + "loss": 1.7934, + "step": 11181 + }, + { + "epoch": 3.4321669736034375, + "grad_norm": 0.31618112325668335, + "learning_rate": 7.641104268499826e-05, + "loss": 1.8522, + "step": 11182 + }, + { + "epoch": 3.432473910374463, + "grad_norm": 0.29209139943122864, + "learning_rate": 7.640682201723546e-05, + "loss": 1.7499, + "step": 11183 + }, + { + "epoch": 3.432780847145488, + "grad_norm": 0.24831914901733398, + "learning_rate": 
7.640260108850496e-05, + "loss": 1.7897, + "step": 11184 + }, + { + "epoch": 3.433087783916513, + "grad_norm": 0.2459818720817566, + "learning_rate": 7.639837989884849e-05, + "loss": 1.7604, + "step": 11185 + }, + { + "epoch": 3.4333947206875384, + "grad_norm": 0.27157485485076904, + "learning_rate": 7.639415844830774e-05, + "loss": 1.7776, + "step": 11186 + }, + { + "epoch": 3.4337016574585637, + "grad_norm": 0.3021515905857086, + "learning_rate": 7.638993673692445e-05, + "loss": 1.7771, + "step": 11187 + }, + { + "epoch": 3.4340085942295886, + "grad_norm": 0.2591722309589386, + "learning_rate": 7.638571476474036e-05, + "loss": 1.8333, + "step": 11188 + }, + { + "epoch": 3.434315531000614, + "grad_norm": 0.2255258709192276, + "learning_rate": 7.638149253179717e-05, + "loss": 1.7647, + "step": 11189 + }, + { + "epoch": 3.4346224677716393, + "grad_norm": 0.2585793733596802, + "learning_rate": 7.637727003813658e-05, + "loss": 1.786, + "step": 11190 + }, + { + "epoch": 3.434929404542664, + "grad_norm": 0.23649543523788452, + "learning_rate": 7.637304728380036e-05, + "loss": 1.822, + "step": 11191 + }, + { + "epoch": 3.4352363413136895, + "grad_norm": 0.2610832452774048, + "learning_rate": 7.636882426883023e-05, + "loss": 1.7925, + "step": 11192 + }, + { + "epoch": 3.4355432780847144, + "grad_norm": 0.26230642199516296, + "learning_rate": 7.636460099326793e-05, + "loss": 1.8169, + "step": 11193 + }, + { + "epoch": 3.4358502148557397, + "grad_norm": 0.2800561189651489, + "learning_rate": 7.636037745715518e-05, + "loss": 1.845, + "step": 11194 + }, + { + "epoch": 3.436157151626765, + "grad_norm": 0.27790409326553345, + "learning_rate": 7.635615366053372e-05, + "loss": 1.8141, + "step": 11195 + }, + { + "epoch": 3.43646408839779, + "grad_norm": 0.2894865870475769, + "learning_rate": 7.635192960344533e-05, + "loss": 1.7916, + "step": 11196 + }, + { + "epoch": 3.4367710251688153, + "grad_norm": 0.22310738265514374, + "learning_rate": 7.634770528593171e-05, + "loss": 1.79, + 
"step": 11197 + }, + { + "epoch": 3.43707796193984, + "grad_norm": 0.2837755084037781, + "learning_rate": 7.634348070803463e-05, + "loss": 1.8763, + "step": 11198 + }, + { + "epoch": 3.4373848987108655, + "grad_norm": 0.32488104701042175, + "learning_rate": 7.633925586979583e-05, + "loss": 1.8331, + "step": 11199 + }, + { + "epoch": 3.437691835481891, + "grad_norm": 0.2708779573440552, + "learning_rate": 7.633503077125706e-05, + "loss": 1.761, + "step": 11200 + }, + { + "epoch": 3.4379987722529157, + "grad_norm": 0.23929642140865326, + "learning_rate": 7.633080541246008e-05, + "loss": 1.8217, + "step": 11201 + }, + { + "epoch": 3.438305709023941, + "grad_norm": 0.3213331997394562, + "learning_rate": 7.632657979344667e-05, + "loss": 1.8375, + "step": 11202 + }, + { + "epoch": 3.4386126457949664, + "grad_norm": 0.38420629501342773, + "learning_rate": 7.632235391425854e-05, + "loss": 1.765, + "step": 11203 + }, + { + "epoch": 3.4389195825659913, + "grad_norm": 0.40466073155403137, + "learning_rate": 7.631812777493749e-05, + "loss": 1.8262, + "step": 11204 + }, + { + "epoch": 3.4392265193370166, + "grad_norm": 0.35904639959335327, + "learning_rate": 7.631390137552527e-05, + "loss": 1.894, + "step": 11205 + }, + { + "epoch": 3.439533456108042, + "grad_norm": 0.28880515694618225, + "learning_rate": 7.630967471606368e-05, + "loss": 1.87, + "step": 11206 + }, + { + "epoch": 3.439840392879067, + "grad_norm": 0.2878882884979248, + "learning_rate": 7.630544779659444e-05, + "loss": 1.7841, + "step": 11207 + }, + { + "epoch": 3.440147329650092, + "grad_norm": 0.36002418398857117, + "learning_rate": 7.630122061715935e-05, + "loss": 1.7318, + "step": 11208 + }, + { + "epoch": 3.440454266421117, + "grad_norm": 0.3304644227027893, + "learning_rate": 7.629699317780019e-05, + "loss": 1.8581, + "step": 11209 + }, + { + "epoch": 3.4407612031921424, + "grad_norm": 0.23396331071853638, + "learning_rate": 7.629276547855872e-05, + "loss": 1.7897, + "step": 11210 + }, + { + "epoch": 
3.4410681399631677, + "grad_norm": 0.34914183616638184, + "learning_rate": 7.628853751947674e-05, + "loss": 1.8531, + "step": 11211 + }, + { + "epoch": 3.4413750767341926, + "grad_norm": 0.3700502812862396, + "learning_rate": 7.6284309300596e-05, + "loss": 1.7884, + "step": 11212 + }, + { + "epoch": 3.441682013505218, + "grad_norm": 0.24606801569461823, + "learning_rate": 7.628008082195835e-05, + "loss": 1.7292, + "step": 11213 + }, + { + "epoch": 3.441988950276243, + "grad_norm": 0.26344993710517883, + "learning_rate": 7.627585208360551e-05, + "loss": 1.7832, + "step": 11214 + }, + { + "epoch": 3.442295887047268, + "grad_norm": 0.4034743010997772, + "learning_rate": 7.62716230855793e-05, + "loss": 1.8164, + "step": 11215 + }, + { + "epoch": 3.4426028238182935, + "grad_norm": 0.4508039355278015, + "learning_rate": 7.626739382792152e-05, + "loss": 1.7855, + "step": 11216 + }, + { + "epoch": 3.4429097605893184, + "grad_norm": 0.2963111400604248, + "learning_rate": 7.626316431067395e-05, + "loss": 1.7995, + "step": 11217 + }, + { + "epoch": 3.4432166973603437, + "grad_norm": 0.35248515009880066, + "learning_rate": 7.625893453387841e-05, + "loss": 1.8761, + "step": 11218 + }, + { + "epoch": 3.443523634131369, + "grad_norm": 0.4032224416732788, + "learning_rate": 7.625470449757668e-05, + "loss": 1.7746, + "step": 11219 + }, + { + "epoch": 3.443830570902394, + "grad_norm": 0.3505195081233978, + "learning_rate": 7.625047420181057e-05, + "loss": 1.851, + "step": 11220 + }, + { + "epoch": 3.4441375076734193, + "grad_norm": 0.288968563079834, + "learning_rate": 7.62462436466219e-05, + "loss": 1.8055, + "step": 11221 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.43141910433769226, + "learning_rate": 7.624201283205246e-05, + "loss": 1.816, + "step": 11222 + }, + { + "epoch": 3.4447513812154695, + "grad_norm": 0.46902137994766235, + "learning_rate": 7.623778175814407e-05, + "loss": 1.8478, + "step": 11223 + }, + { + "epoch": 3.445058317986495, + "grad_norm": 
0.3333328366279602, + "learning_rate": 7.623355042493854e-05, + "loss": 1.7949, + "step": 11224 + }, + { + "epoch": 3.4453652547575198, + "grad_norm": 0.2625340521335602, + "learning_rate": 7.622931883247768e-05, + "loss": 1.745, + "step": 11225 + }, + { + "epoch": 3.445672191528545, + "grad_norm": 0.4565848410129547, + "learning_rate": 7.622508698080333e-05, + "loss": 1.796, + "step": 11226 + }, + { + "epoch": 3.4459791282995704, + "grad_norm": 0.4676518738269806, + "learning_rate": 7.622085486995729e-05, + "loss": 1.8115, + "step": 11227 + }, + { + "epoch": 3.4462860650705953, + "grad_norm": 0.3828938603401184, + "learning_rate": 7.62166224999814e-05, + "loss": 1.8758, + "step": 11228 + }, + { + "epoch": 3.4465930018416207, + "grad_norm": 0.2786383628845215, + "learning_rate": 7.621238987091747e-05, + "loss": 1.7616, + "step": 11229 + }, + { + "epoch": 3.446899938612646, + "grad_norm": 0.4442835748195648, + "learning_rate": 7.620815698280734e-05, + "loss": 1.8342, + "step": 11230 + }, + { + "epoch": 3.447206875383671, + "grad_norm": 0.45760586857795715, + "learning_rate": 7.620392383569286e-05, + "loss": 1.8159, + "step": 11231 + }, + { + "epoch": 3.447513812154696, + "grad_norm": 0.2567009925842285, + "learning_rate": 7.619969042961583e-05, + "loss": 1.774, + "step": 11232 + }, + { + "epoch": 3.4478207489257215, + "grad_norm": 0.3720102310180664, + "learning_rate": 7.619545676461812e-05, + "loss": 1.8366, + "step": 11233 + }, + { + "epoch": 3.4481276856967464, + "grad_norm": 0.36436137557029724, + "learning_rate": 7.619122284074154e-05, + "loss": 1.832, + "step": 11234 + }, + { + "epoch": 3.4484346224677718, + "grad_norm": 0.310310959815979, + "learning_rate": 7.618698865802795e-05, + "loss": 1.9023, + "step": 11235 + }, + { + "epoch": 3.4487415592387967, + "grad_norm": 0.2693026661872864, + "learning_rate": 7.618275421651916e-05, + "loss": 1.7696, + "step": 11236 + }, + { + "epoch": 3.449048496009822, + "grad_norm": 0.2942425608634949, + "learning_rate": 
7.61785195162571e-05, + "loss": 1.822, + "step": 11237 + }, + { + "epoch": 3.4493554327808473, + "grad_norm": 0.22454749047756195, + "learning_rate": 7.617428455728353e-05, + "loss": 1.7011, + "step": 11238 + }, + { + "epoch": 3.449662369551872, + "grad_norm": 0.23345038294792175, + "learning_rate": 7.617004933964035e-05, + "loss": 1.7563, + "step": 11239 + }, + { + "epoch": 3.4499693063228976, + "grad_norm": 0.24990662932395935, + "learning_rate": 7.616581386336941e-05, + "loss": 1.8031, + "step": 11240 + }, + { + "epoch": 3.4502762430939224, + "grad_norm": 0.2919348478317261, + "learning_rate": 7.616157812851254e-05, + "loss": 1.7355, + "step": 11241 + }, + { + "epoch": 3.450583179864948, + "grad_norm": 0.2926909327507019, + "learning_rate": 7.615734213511165e-05, + "loss": 1.8341, + "step": 11242 + }, + { + "epoch": 3.450890116635973, + "grad_norm": 0.24316683411598206, + "learning_rate": 7.615310588320855e-05, + "loss": 1.8154, + "step": 11243 + }, + { + "epoch": 3.451197053406998, + "grad_norm": 0.23154498636722565, + "learning_rate": 7.614886937284513e-05, + "loss": 1.7904, + "step": 11244 + }, + { + "epoch": 3.4515039901780233, + "grad_norm": 0.25973939895629883, + "learning_rate": 7.614463260406327e-05, + "loss": 1.7598, + "step": 11245 + }, + { + "epoch": 3.4518109269490487, + "grad_norm": 0.22110119462013245, + "learning_rate": 7.614039557690482e-05, + "loss": 1.7903, + "step": 11246 + }, + { + "epoch": 3.4521178637200736, + "grad_norm": 0.26184993982315063, + "learning_rate": 7.613615829141165e-05, + "loss": 1.748, + "step": 11247 + }, + { + "epoch": 3.452424800491099, + "grad_norm": 0.26128727197647095, + "learning_rate": 7.613192074762565e-05, + "loss": 1.7786, + "step": 11248 + }, + { + "epoch": 3.4527317372621242, + "grad_norm": 0.23230813443660736, + "learning_rate": 7.612768294558871e-05, + "loss": 1.8114, + "step": 11249 + }, + { + "epoch": 3.453038674033149, + "grad_norm": 0.2686540186405182, + "learning_rate": 7.612344488534268e-05, + "loss": 
1.7311, + "step": 11250 + }, + { + "epoch": 3.4533456108041745, + "grad_norm": 0.25553348660469055, + "learning_rate": 7.611920656692946e-05, + "loss": 1.8468, + "step": 11251 + }, + { + "epoch": 3.4536525475751993, + "grad_norm": 0.2639308273792267, + "learning_rate": 7.611496799039092e-05, + "loss": 1.8292, + "step": 11252 + }, + { + "epoch": 3.4539594843462247, + "grad_norm": 0.2468358874320984, + "learning_rate": 7.611072915576895e-05, + "loss": 1.8173, + "step": 11253 + }, + { + "epoch": 3.45426642111725, + "grad_norm": 0.27236035466194153, + "learning_rate": 7.610649006310549e-05, + "loss": 1.8082, + "step": 11254 + }, + { + "epoch": 3.454573357888275, + "grad_norm": 0.2277914434671402, + "learning_rate": 7.610225071244237e-05, + "loss": 1.7483, + "step": 11255 + }, + { + "epoch": 3.4548802946593002, + "grad_norm": 0.2292868196964264, + "learning_rate": 7.60980111038215e-05, + "loss": 1.7716, + "step": 11256 + }, + { + "epoch": 3.455187231430325, + "grad_norm": 0.22116152942180634, + "learning_rate": 7.60937712372848e-05, + "loss": 1.773, + "step": 11257 + }, + { + "epoch": 3.4554941682013505, + "grad_norm": 0.23238304257392883, + "learning_rate": 7.608953111287416e-05, + "loss": 1.7602, + "step": 11258 + }, + { + "epoch": 3.455801104972376, + "grad_norm": 0.2810615003108978, + "learning_rate": 7.608529073063149e-05, + "loss": 1.8781, + "step": 11259 + }, + { + "epoch": 3.4561080417434007, + "grad_norm": 0.2516821324825287, + "learning_rate": 7.608105009059867e-05, + "loss": 1.835, + "step": 11260 + }, + { + "epoch": 3.456414978514426, + "grad_norm": 0.25698330998420715, + "learning_rate": 7.607680919281763e-05, + "loss": 1.7859, + "step": 11261 + }, + { + "epoch": 3.4567219152854514, + "grad_norm": 0.2597602903842926, + "learning_rate": 7.60725680373303e-05, + "loss": 1.8287, + "step": 11262 + }, + { + "epoch": 3.4570288520564763, + "grad_norm": 0.2564091980457306, + "learning_rate": 7.606832662417855e-05, + "loss": 1.8003, + "step": 11263 + }, + { + 
"epoch": 3.4573357888275016, + "grad_norm": 0.2872684597969055, + "learning_rate": 7.606408495340432e-05, + "loss": 1.8242, + "step": 11264 + }, + { + "epoch": 3.457642725598527, + "grad_norm": 0.27513590455055237, + "learning_rate": 7.605984302504952e-05, + "loss": 1.8605, + "step": 11265 + }, + { + "epoch": 3.457949662369552, + "grad_norm": 0.27768459916114807, + "learning_rate": 7.605560083915609e-05, + "loss": 1.7948, + "step": 11266 + }, + { + "epoch": 3.458256599140577, + "grad_norm": 0.23911382257938385, + "learning_rate": 7.605135839576593e-05, + "loss": 1.7575, + "step": 11267 + }, + { + "epoch": 3.458563535911602, + "grad_norm": 0.26773568987846375, + "learning_rate": 7.604711569492098e-05, + "loss": 1.752, + "step": 11268 + }, + { + "epoch": 3.4588704726826274, + "grad_norm": 0.30079394578933716, + "learning_rate": 7.604287273666316e-05, + "loss": 1.8022, + "step": 11269 + }, + { + "epoch": 3.4591774094536527, + "grad_norm": 0.27393853664398193, + "learning_rate": 7.603862952103441e-05, + "loss": 1.8054, + "step": 11270 + }, + { + "epoch": 3.4594843462246776, + "grad_norm": 0.2794870436191559, + "learning_rate": 7.603438604807667e-05, + "loss": 1.808, + "step": 11271 + }, + { + "epoch": 3.459791282995703, + "grad_norm": 0.26482146978378296, + "learning_rate": 7.603014231783185e-05, + "loss": 1.8696, + "step": 11272 + }, + { + "epoch": 3.460098219766728, + "grad_norm": 0.2755354344844818, + "learning_rate": 7.602589833034192e-05, + "loss": 1.8412, + "step": 11273 + }, + { + "epoch": 3.460405156537753, + "grad_norm": 0.2666642367839813, + "learning_rate": 7.602165408564883e-05, + "loss": 1.8333, + "step": 11274 + }, + { + "epoch": 3.4607120933087785, + "grad_norm": 0.26958519220352173, + "learning_rate": 7.601740958379448e-05, + "loss": 1.7943, + "step": 11275 + }, + { + "epoch": 3.4610190300798034, + "grad_norm": 0.2915789783000946, + "learning_rate": 7.601316482482084e-05, + "loss": 1.7519, + "step": 11276 + }, + { + "epoch": 3.4613259668508287, + 
"grad_norm": 0.2456950694322586, + "learning_rate": 7.600891980876985e-05, + "loss": 1.8064, + "step": 11277 + }, + { + "epoch": 3.461632903621854, + "grad_norm": 0.2517867088317871, + "learning_rate": 7.600467453568348e-05, + "loss": 1.7766, + "step": 11278 + }, + { + "epoch": 3.461939840392879, + "grad_norm": 0.24567969143390656, + "learning_rate": 7.600042900560368e-05, + "loss": 1.7331, + "step": 11279 + }, + { + "epoch": 3.4622467771639043, + "grad_norm": 0.23986820876598358, + "learning_rate": 7.599618321857239e-05, + "loss": 1.7477, + "step": 11280 + }, + { + "epoch": 3.4625537139349296, + "grad_norm": 0.2555375397205353, + "learning_rate": 7.599193717463158e-05, + "loss": 1.8154, + "step": 11281 + }, + { + "epoch": 3.4628606507059545, + "grad_norm": 0.2522781193256378, + "learning_rate": 7.598769087382323e-05, + "loss": 1.7821, + "step": 11282 + }, + { + "epoch": 3.46316758747698, + "grad_norm": 0.25631004571914673, + "learning_rate": 7.598344431618926e-05, + "loss": 1.8043, + "step": 11283 + }, + { + "epoch": 3.4634745242480047, + "grad_norm": 0.2611328661441803, + "learning_rate": 7.597919750177168e-05, + "loss": 1.8036, + "step": 11284 + }, + { + "epoch": 3.46378146101903, + "grad_norm": 0.255670428276062, + "learning_rate": 7.597495043061244e-05, + "loss": 1.7375, + "step": 11285 + }, + { + "epoch": 3.4640883977900554, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.597070310275353e-05, + "loss": 1.7496, + "step": 11286 + }, + { + "epoch": 3.4643953345610803, + "grad_norm": 0.2643752992153168, + "learning_rate": 7.596645551823688e-05, + "loss": 1.8444, + "step": 11287 + }, + { + "epoch": 3.4647022713321056, + "grad_norm": 0.2564511299133301, + "learning_rate": 7.596220767710452e-05, + "loss": 1.7557, + "step": 11288 + }, + { + "epoch": 3.4650092081031305, + "grad_norm": 0.2510208487510681, + "learning_rate": 7.59579595793984e-05, + "loss": 1.7234, + "step": 11289 + }, + { + "epoch": 3.465316144874156, + "grad_norm": 0.2765158712863922, + 
"learning_rate": 7.595371122516051e-05, + "loss": 1.8215, + "step": 11290 + }, + { + "epoch": 3.465623081645181, + "grad_norm": 0.28233039379119873, + "learning_rate": 7.594946261443286e-05, + "loss": 1.7752, + "step": 11291 + }, + { + "epoch": 3.465930018416206, + "grad_norm": 0.26971468329429626, + "learning_rate": 7.594521374725735e-05, + "loss": 1.7924, + "step": 11292 + }, + { + "epoch": 3.4662369551872314, + "grad_norm": 0.29425930976867676, + "learning_rate": 7.594096462367608e-05, + "loss": 1.8144, + "step": 11293 + }, + { + "epoch": 3.4665438919582567, + "grad_norm": 0.233150452375412, + "learning_rate": 7.593671524373098e-05, + "loss": 1.7741, + "step": 11294 + }, + { + "epoch": 3.4668508287292816, + "grad_norm": 0.2947762608528137, + "learning_rate": 7.593246560746406e-05, + "loss": 1.8031, + "step": 11295 + }, + { + "epoch": 3.467157765500307, + "grad_norm": 0.250552773475647, + "learning_rate": 7.59282157149173e-05, + "loss": 1.7501, + "step": 11296 + }, + { + "epoch": 3.4674647022713323, + "grad_norm": 0.26091331243515015, + "learning_rate": 7.592396556613274e-05, + "loss": 1.836, + "step": 11297 + }, + { + "epoch": 3.467771639042357, + "grad_norm": 0.28625619411468506, + "learning_rate": 7.591971516115233e-05, + "loss": 1.7555, + "step": 11298 + }, + { + "epoch": 3.4680785758133825, + "grad_norm": 0.2723398804664612, + "learning_rate": 7.591546450001811e-05, + "loss": 1.825, + "step": 11299 + }, + { + "epoch": 3.4683855125844074, + "grad_norm": 0.24289946258068085, + "learning_rate": 7.591121358277211e-05, + "loss": 1.7441, + "step": 11300 + }, + { + "epoch": 3.4686924493554327, + "grad_norm": 0.2706952691078186, + "learning_rate": 7.590696240945629e-05, + "loss": 1.8651, + "step": 11301 + }, + { + "epoch": 3.468999386126458, + "grad_norm": 0.24632862210273743, + "learning_rate": 7.590271098011268e-05, + "loss": 1.8229, + "step": 11302 + }, + { + "epoch": 3.469306322897483, + "grad_norm": 0.29275211691856384, + "learning_rate": 7.58984592947833e-05, 
+ "loss": 1.7591, + "step": 11303 + }, + { + "epoch": 3.4696132596685083, + "grad_norm": 0.29228144884109497, + "learning_rate": 7.589420735351016e-05, + "loss": 1.8395, + "step": 11304 + }, + { + "epoch": 3.4699201964395336, + "grad_norm": 0.28339114785194397, + "learning_rate": 7.588995515633528e-05, + "loss": 1.8543, + "step": 11305 + }, + { + "epoch": 3.4702271332105585, + "grad_norm": 0.2834693193435669, + "learning_rate": 7.588570270330071e-05, + "loss": 1.826, + "step": 11306 + }, + { + "epoch": 3.470534069981584, + "grad_norm": 0.26130759716033936, + "learning_rate": 7.588144999444844e-05, + "loss": 1.7887, + "step": 11307 + }, + { + "epoch": 3.470841006752609, + "grad_norm": 0.29554685950279236, + "learning_rate": 7.587719702982052e-05, + "loss": 1.819, + "step": 11308 + }, + { + "epoch": 3.471147943523634, + "grad_norm": 0.2687968611717224, + "learning_rate": 7.587294380945898e-05, + "loss": 1.7354, + "step": 11309 + }, + { + "epoch": 3.4714548802946594, + "grad_norm": 0.28795287013053894, + "learning_rate": 7.586869033340582e-05, + "loss": 1.8267, + "step": 11310 + }, + { + "epoch": 3.4717618170656843, + "grad_norm": 0.33244553208351135, + "learning_rate": 7.58644366017031e-05, + "loss": 1.86, + "step": 11311 + }, + { + "epoch": 3.4720687538367097, + "grad_norm": 0.2878025472164154, + "learning_rate": 7.586018261439288e-05, + "loss": 1.7587, + "step": 11312 + }, + { + "epoch": 3.472375690607735, + "grad_norm": 0.26856711506843567, + "learning_rate": 7.585592837151716e-05, + "loss": 1.7351, + "step": 11313 + }, + { + "epoch": 3.47268262737876, + "grad_norm": 0.2554367780685425, + "learning_rate": 7.585167387311802e-05, + "loss": 1.7664, + "step": 11314 + }, + { + "epoch": 3.472989564149785, + "grad_norm": 0.3193204700946808, + "learning_rate": 7.584741911923748e-05, + "loss": 1.7487, + "step": 11315 + }, + { + "epoch": 3.47329650092081, + "grad_norm": 0.3227958679199219, + "learning_rate": 7.584316410991759e-05, + "loss": 1.8107, + "step": 11316 + }, + { 
+ "epoch": 3.4736034376918354, + "grad_norm": 0.33891916275024414, + "learning_rate": 7.58389088452004e-05, + "loss": 1.8466, + "step": 11317 + }, + { + "epoch": 3.4739103744628608, + "grad_norm": 0.27050724625587463, + "learning_rate": 7.583465332512797e-05, + "loss": 1.7877, + "step": 11318 + }, + { + "epoch": 3.4742173112338857, + "grad_norm": 0.2935837209224701, + "learning_rate": 7.583039754974235e-05, + "loss": 1.7932, + "step": 11319 + }, + { + "epoch": 3.474524248004911, + "grad_norm": 0.27780550718307495, + "learning_rate": 7.582614151908561e-05, + "loss": 1.8374, + "step": 11320 + }, + { + "epoch": 3.4748311847759363, + "grad_norm": 0.2579033076763153, + "learning_rate": 7.58218852331998e-05, + "loss": 1.7305, + "step": 11321 + }, + { + "epoch": 3.4751381215469612, + "grad_norm": 0.2531716227531433, + "learning_rate": 7.581762869212699e-05, + "loss": 1.8136, + "step": 11322 + }, + { + "epoch": 3.4754450583179866, + "grad_norm": 0.25504544377326965, + "learning_rate": 7.581337189590924e-05, + "loss": 1.787, + "step": 11323 + }, + { + "epoch": 3.475751995089012, + "grad_norm": 0.23659855127334595, + "learning_rate": 7.580911484458861e-05, + "loss": 1.77, + "step": 11324 + }, + { + "epoch": 3.476058931860037, + "grad_norm": 0.22556856274604797, + "learning_rate": 7.580485753820721e-05, + "loss": 1.7808, + "step": 11325 + }, + { + "epoch": 3.476365868631062, + "grad_norm": 0.2860291600227356, + "learning_rate": 7.580059997680705e-05, + "loss": 1.8224, + "step": 11326 + }, + { + "epoch": 3.476672805402087, + "grad_norm": 0.3134596645832062, + "learning_rate": 7.579634216043023e-05, + "loss": 1.8278, + "step": 11327 + }, + { + "epoch": 3.4769797421731123, + "grad_norm": 0.2883087992668152, + "learning_rate": 7.579208408911887e-05, + "loss": 1.7917, + "step": 11328 + }, + { + "epoch": 3.4772866789441377, + "grad_norm": 0.2743333578109741, + "learning_rate": 7.578782576291501e-05, + "loss": 1.8228, + "step": 11329 + }, + { + "epoch": 3.4775936157151626, + 
"grad_norm": 0.25026053190231323, + "learning_rate": 7.578356718186073e-05, + "loss": 1.7717, + "step": 11330 + }, + { + "epoch": 3.477900552486188, + "grad_norm": 0.246905118227005, + "learning_rate": 7.577930834599813e-05, + "loss": 1.7979, + "step": 11331 + }, + { + "epoch": 3.478207489257213, + "grad_norm": 0.24709418416023254, + "learning_rate": 7.577504925536929e-05, + "loss": 1.8111, + "step": 11332 + }, + { + "epoch": 3.478514426028238, + "grad_norm": 0.25685814023017883, + "learning_rate": 7.577078991001632e-05, + "loss": 1.8255, + "step": 11333 + }, + { + "epoch": 3.4788213627992635, + "grad_norm": 0.23937836289405823, + "learning_rate": 7.576653030998129e-05, + "loss": 1.7254, + "step": 11334 + }, + { + "epoch": 3.4791282995702884, + "grad_norm": 0.22638650238513947, + "learning_rate": 7.57622704553063e-05, + "loss": 1.7847, + "step": 11335 + }, + { + "epoch": 3.4794352363413137, + "grad_norm": 0.26083993911743164, + "learning_rate": 7.575801034603347e-05, + "loss": 1.7947, + "step": 11336 + }, + { + "epoch": 3.479742173112339, + "grad_norm": 0.2715466022491455, + "learning_rate": 7.575374998220488e-05, + "loss": 1.848, + "step": 11337 + }, + { + "epoch": 3.480049109883364, + "grad_norm": 0.25554224848747253, + "learning_rate": 7.574948936386262e-05, + "loss": 1.7811, + "step": 11338 + }, + { + "epoch": 3.4803560466543892, + "grad_norm": 0.2689397931098938, + "learning_rate": 7.574522849104882e-05, + "loss": 1.82, + "step": 11339 + }, + { + "epoch": 3.4806629834254146, + "grad_norm": 0.25027474761009216, + "learning_rate": 7.57409673638056e-05, + "loss": 1.775, + "step": 11340 + }, + { + "epoch": 3.4809699201964395, + "grad_norm": 0.2545457184314728, + "learning_rate": 7.573670598217504e-05, + "loss": 1.8056, + "step": 11341 + }, + { + "epoch": 3.481276856967465, + "grad_norm": 0.28404027223587036, + "learning_rate": 7.573244434619928e-05, + "loss": 1.8372, + "step": 11342 + }, + { + "epoch": 3.4815837937384897, + "grad_norm": 0.28046950697898865, + 
"learning_rate": 7.572818245592041e-05, + "loss": 1.7851, + "step": 11343 + }, + { + "epoch": 3.481890730509515, + "grad_norm": 0.23005759716033936, + "learning_rate": 7.572392031138056e-05, + "loss": 1.7059, + "step": 11344 + }, + { + "epoch": 3.4821976672805404, + "grad_norm": 0.2931719124317169, + "learning_rate": 7.571965791262185e-05, + "loss": 1.84, + "step": 11345 + }, + { + "epoch": 3.4825046040515653, + "grad_norm": 0.4399266242980957, + "learning_rate": 7.571539525968642e-05, + "loss": 1.7465, + "step": 11346 + }, + { + "epoch": 3.4828115408225906, + "grad_norm": 0.48957565426826477, + "learning_rate": 7.571113235261638e-05, + "loss": 1.8494, + "step": 11347 + }, + { + "epoch": 3.4831184775936155, + "grad_norm": 0.37828895449638367, + "learning_rate": 7.570686919145385e-05, + "loss": 1.7598, + "step": 11348 + }, + { + "epoch": 3.483425414364641, + "grad_norm": 0.22943973541259766, + "learning_rate": 7.570260577624098e-05, + "loss": 1.7443, + "step": 11349 + }, + { + "epoch": 3.483732351135666, + "grad_norm": 0.3245384991168976, + "learning_rate": 7.569834210701987e-05, + "loss": 1.7232, + "step": 11350 + }, + { + "epoch": 3.484039287906691, + "grad_norm": 0.4419693648815155, + "learning_rate": 7.569407818383271e-05, + "loss": 1.841, + "step": 11351 + }, + { + "epoch": 3.4843462246777164, + "grad_norm": 0.4061864912509918, + "learning_rate": 7.568981400672159e-05, + "loss": 1.8274, + "step": 11352 + }, + { + "epoch": 3.4846531614487417, + "grad_norm": 0.2609417736530304, + "learning_rate": 7.56855495757287e-05, + "loss": 1.8631, + "step": 11353 + }, + { + "epoch": 3.4849600982197666, + "grad_norm": 0.28758567571640015, + "learning_rate": 7.568128489089612e-05, + "loss": 1.8169, + "step": 11354 + }, + { + "epoch": 3.485267034990792, + "grad_norm": 0.40643060207366943, + "learning_rate": 7.567701995226606e-05, + "loss": 1.809, + "step": 11355 + }, + { + "epoch": 3.4855739717618173, + "grad_norm": 0.37649446725845337, + "learning_rate": 7.56727547598806e-05, 
+ "loss": 1.7661, + "step": 11356 + }, + { + "epoch": 3.485880908532842, + "grad_norm": 0.22863779962062836, + "learning_rate": 7.566848931378197e-05, + "loss": 1.808, + "step": 11357 + }, + { + "epoch": 3.4861878453038675, + "grad_norm": 0.4487019181251526, + "learning_rate": 7.566422361401226e-05, + "loss": 1.7627, + "step": 11358 + }, + { + "epoch": 3.4864947820748924, + "grad_norm": 0.4583640694618225, + "learning_rate": 7.565995766061367e-05, + "loss": 1.8186, + "step": 11359 + }, + { + "epoch": 3.4868017188459177, + "grad_norm": 0.27231526374816895, + "learning_rate": 7.565569145362833e-05, + "loss": 1.8465, + "step": 11360 + }, + { + "epoch": 3.487108655616943, + "grad_norm": 0.3877887725830078, + "learning_rate": 7.565142499309841e-05, + "loss": 1.7668, + "step": 11361 + }, + { + "epoch": 3.487415592387968, + "grad_norm": 0.5511242747306824, + "learning_rate": 7.564715827906606e-05, + "loss": 1.8417, + "step": 11362 + }, + { + "epoch": 3.4877225291589933, + "grad_norm": 0.5112231373786926, + "learning_rate": 7.564289131157348e-05, + "loss": 1.8038, + "step": 11363 + }, + { + "epoch": 3.488029465930018, + "grad_norm": 0.279502809047699, + "learning_rate": 7.56386240906628e-05, + "loss": 1.7545, + "step": 11364 + }, + { + "epoch": 3.4883364027010435, + "grad_norm": 0.30080464482307434, + "learning_rate": 7.563435661637623e-05, + "loss": 1.8136, + "step": 11365 + }, + { + "epoch": 3.488643339472069, + "grad_norm": 0.4424717128276825, + "learning_rate": 7.563008888875591e-05, + "loss": 1.7542, + "step": 11366 + }, + { + "epoch": 3.4889502762430937, + "grad_norm": 0.42144715785980225, + "learning_rate": 7.562582090784403e-05, + "loss": 1.8245, + "step": 11367 + }, + { + "epoch": 3.489257213014119, + "grad_norm": 0.2533668875694275, + "learning_rate": 7.562155267368277e-05, + "loss": 1.8654, + "step": 11368 + }, + { + "epoch": 3.4895641497851444, + "grad_norm": 0.3327534794807434, + "learning_rate": 7.56172841863143e-05, + "loss": 1.7882, + "step": 11369 + }, + { 
+ "epoch": 3.4898710865561693, + "grad_norm": 0.44001486897468567, + "learning_rate": 7.561301544578081e-05, + "loss": 1.8397, + "step": 11370 + }, + { + "epoch": 3.4901780233271946, + "grad_norm": 0.2779090106487274, + "learning_rate": 7.56087464521245e-05, + "loss": 1.7398, + "step": 11371 + }, + { + "epoch": 3.49048496009822, + "grad_norm": 0.3018067479133606, + "learning_rate": 7.560447720538755e-05, + "loss": 1.8076, + "step": 11372 + }, + { + "epoch": 3.490791896869245, + "grad_norm": 0.4370935261249542, + "learning_rate": 7.560020770561216e-05, + "loss": 1.8057, + "step": 11373 + }, + { + "epoch": 3.49109883364027, + "grad_norm": 0.2936978042125702, + "learning_rate": 7.559593795284047e-05, + "loss": 1.7726, + "step": 11374 + }, + { + "epoch": 3.491405770411295, + "grad_norm": 0.28825095295906067, + "learning_rate": 7.559166794711476e-05, + "loss": 1.8039, + "step": 11375 + }, + { + "epoch": 3.4917127071823204, + "grad_norm": 0.39334073662757874, + "learning_rate": 7.55873976884772e-05, + "loss": 1.8388, + "step": 11376 + }, + { + "epoch": 3.4920196439533457, + "grad_norm": 0.33880460262298584, + "learning_rate": 7.558312717696995e-05, + "loss": 1.7791, + "step": 11377 + }, + { + "epoch": 3.4923265807243706, + "grad_norm": 0.4433762729167938, + "learning_rate": 7.557885641263524e-05, + "loss": 1.7786, + "step": 11378 + }, + { + "epoch": 3.492633517495396, + "grad_norm": 0.4710264205932617, + "learning_rate": 7.557458539551527e-05, + "loss": 1.7193, + "step": 11379 + }, + { + "epoch": 3.4929404542664213, + "grad_norm": 0.27514326572418213, + "learning_rate": 7.557031412565228e-05, + "loss": 1.823, + "step": 11380 + }, + { + "epoch": 3.493247391037446, + "grad_norm": 0.4681413471698761, + "learning_rate": 7.556604260308846e-05, + "loss": 1.7598, + "step": 11381 + }, + { + "epoch": 3.4935543278084715, + "grad_norm": 0.5032503604888916, + "learning_rate": 7.556177082786602e-05, + "loss": 1.741, + "step": 11382 + }, + { + "epoch": 3.493861264579497, + 
"grad_norm": 0.2677086889743805, + "learning_rate": 7.555749880002716e-05, + "loss": 1.8528, + "step": 11383 + }, + { + "epoch": 3.4941682013505218, + "grad_norm": 0.43870940804481506, + "learning_rate": 7.555322651961414e-05, + "loss": 1.7632, + "step": 11384 + }, + { + "epoch": 3.494475138121547, + "grad_norm": 0.5403209924697876, + "learning_rate": 7.554895398666914e-05, + "loss": 1.8181, + "step": 11385 + }, + { + "epoch": 3.494782074892572, + "grad_norm": 0.2714318335056305, + "learning_rate": 7.554468120123441e-05, + "loss": 1.8151, + "step": 11386 + }, + { + "epoch": 3.4950890116635973, + "grad_norm": 0.49661698937416077, + "learning_rate": 7.554040816335217e-05, + "loss": 1.8116, + "step": 11387 + }, + { + "epoch": 3.4953959484346226, + "grad_norm": 0.49954715371131897, + "learning_rate": 7.553613487306465e-05, + "loss": 1.8841, + "step": 11388 + }, + { + "epoch": 3.4957028852056475, + "grad_norm": 0.28189441561698914, + "learning_rate": 7.553186133041406e-05, + "loss": 1.7834, + "step": 11389 + }, + { + "epoch": 3.496009821976673, + "grad_norm": 0.36029115319252014, + "learning_rate": 7.552758753544267e-05, + "loss": 1.7796, + "step": 11390 + }, + { + "epoch": 3.4963167587476978, + "grad_norm": 0.45023465156555176, + "learning_rate": 7.552331348819268e-05, + "loss": 1.8773, + "step": 11391 + }, + { + "epoch": 3.496623695518723, + "grad_norm": 0.3235788643360138, + "learning_rate": 7.551903918870636e-05, + "loss": 1.7984, + "step": 11392 + }, + { + "epoch": 3.4969306322897484, + "grad_norm": 0.25656190514564514, + "learning_rate": 7.551476463702596e-05, + "loss": 1.8403, + "step": 11393 + }, + { + "epoch": 3.4972375690607733, + "grad_norm": 0.2866458594799042, + "learning_rate": 7.551048983319366e-05, + "loss": 1.7428, + "step": 11394 + }, + { + "epoch": 3.4975445058317987, + "grad_norm": 0.2713877856731415, + "learning_rate": 7.550621477725177e-05, + "loss": 1.8508, + "step": 11395 + }, + { + "epoch": 3.497851442602824, + "grad_norm": 0.27978867292404175, 
+ "learning_rate": 7.55019394692425e-05, + "loss": 1.8049, + "step": 11396 + }, + { + "epoch": 3.498158379373849, + "grad_norm": 0.3275020122528076, + "learning_rate": 7.549766390920814e-05, + "loss": 1.8553, + "step": 11397 + }, + { + "epoch": 3.498465316144874, + "grad_norm": 0.29947492480278015, + "learning_rate": 7.54933880971909e-05, + "loss": 1.7614, + "step": 11398 + }, + { + "epoch": 3.4987722529158995, + "grad_norm": 0.25790849328041077, + "learning_rate": 7.548911203323308e-05, + "loss": 1.8223, + "step": 11399 + }, + { + "epoch": 3.4990791896869244, + "grad_norm": 0.3145451545715332, + "learning_rate": 7.54848357173769e-05, + "loss": 1.7642, + "step": 11400 + }, + { + "epoch": 3.4993861264579498, + "grad_norm": 0.29052913188934326, + "learning_rate": 7.548055914966463e-05, + "loss": 1.7728, + "step": 11401 + }, + { + "epoch": 3.4996930632289747, + "grad_norm": 0.2741037905216217, + "learning_rate": 7.547628233013854e-05, + "loss": 1.7382, + "step": 11402 + }, + { + "epoch": 3.5, + "grad_norm": 0.2562723755836487, + "learning_rate": 7.54720052588409e-05, + "loss": 1.7455, + "step": 11403 + }, + { + "epoch": 3.5003069367710253, + "grad_norm": 0.27649983763694763, + "learning_rate": 7.546772793581398e-05, + "loss": 1.7194, + "step": 11404 + }, + { + "epoch": 3.5006138735420502, + "grad_norm": 0.27290579676628113, + "learning_rate": 7.546345036110004e-05, + "loss": 1.87, + "step": 11405 + }, + { + "epoch": 3.5009208103130756, + "grad_norm": 0.33585605025291443, + "learning_rate": 7.545917253474136e-05, + "loss": 1.7703, + "step": 11406 + }, + { + "epoch": 3.5012277470841005, + "grad_norm": 0.2592691481113434, + "learning_rate": 7.545489445678022e-05, + "loss": 1.7657, + "step": 11407 + }, + { + "epoch": 3.501534683855126, + "grad_norm": 0.3081367015838623, + "learning_rate": 7.545061612725888e-05, + "loss": 1.8067, + "step": 11408 + }, + { + "epoch": 3.501841620626151, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.544633754621965e-05, + "loss": 
1.8009, + "step": 11409 + }, + { + "epoch": 3.5021485573971765, + "grad_norm": 0.28232479095458984, + "learning_rate": 7.54420587137048e-05, + "loss": 1.8124, + "step": 11410 + }, + { + "epoch": 3.5024554941682013, + "grad_norm": 0.24079222977161407, + "learning_rate": 7.54377796297566e-05, + "loss": 1.789, + "step": 11411 + }, + { + "epoch": 3.5027624309392267, + "grad_norm": 0.27347204089164734, + "learning_rate": 7.543350029441737e-05, + "loss": 1.7704, + "step": 11412 + }, + { + "epoch": 3.5030693677102516, + "grad_norm": 0.25545811653137207, + "learning_rate": 7.542922070772935e-05, + "loss": 1.7871, + "step": 11413 + }, + { + "epoch": 3.503376304481277, + "grad_norm": 0.2507263123989105, + "learning_rate": 7.54249408697349e-05, + "loss": 1.8424, + "step": 11414 + }, + { + "epoch": 3.5036832412523022, + "grad_norm": 0.2776084244251251, + "learning_rate": 7.542066078047627e-05, + "loss": 1.8246, + "step": 11415 + }, + { + "epoch": 3.503990178023327, + "grad_norm": 0.32833749055862427, + "learning_rate": 7.541638043999577e-05, + "loss": 1.7785, + "step": 11416 + }, + { + "epoch": 3.5042971147943525, + "grad_norm": 0.258486270904541, + "learning_rate": 7.541209984833571e-05, + "loss": 1.7543, + "step": 11417 + }, + { + "epoch": 3.5046040515653774, + "grad_norm": 0.25825178623199463, + "learning_rate": 7.540781900553837e-05, + "loss": 1.7939, + "step": 11418 + }, + { + "epoch": 3.5049109883364027, + "grad_norm": 0.26980888843536377, + "learning_rate": 7.540353791164606e-05, + "loss": 1.7777, + "step": 11419 + }, + { + "epoch": 3.505217925107428, + "grad_norm": 0.24103333055973053, + "learning_rate": 7.539925656670111e-05, + "loss": 1.7565, + "step": 11420 + }, + { + "epoch": 3.505524861878453, + "grad_norm": 0.25192007422447205, + "learning_rate": 7.539497497074584e-05, + "loss": 1.7696, + "step": 11421 + }, + { + "epoch": 3.5058317986494782, + "grad_norm": 0.218489870429039, + "learning_rate": 7.539069312382252e-05, + "loss": 1.761, + "step": 11422 + }, + { + 
"epoch": 3.506138735420503, + "grad_norm": 0.27533552050590515, + "learning_rate": 7.53864110259735e-05, + "loss": 1.7374, + "step": 11423 + }, + { + "epoch": 3.5064456721915285, + "grad_norm": 0.2603490650653839, + "learning_rate": 7.538212867724108e-05, + "loss": 1.8342, + "step": 11424 + }, + { + "epoch": 3.506752608962554, + "grad_norm": 0.27340635657310486, + "learning_rate": 7.537784607766758e-05, + "loss": 1.8099, + "step": 11425 + }, + { + "epoch": 3.507059545733579, + "grad_norm": 0.25342679023742676, + "learning_rate": 7.537356322729537e-05, + "loss": 1.7949, + "step": 11426 + }, + { + "epoch": 3.507366482504604, + "grad_norm": 0.292819082736969, + "learning_rate": 7.536928012616669e-05, + "loss": 1.9049, + "step": 11427 + }, + { + "epoch": 3.5076734192756294, + "grad_norm": 0.28256532549858093, + "learning_rate": 7.536499677432393e-05, + "loss": 1.8464, + "step": 11428 + }, + { + "epoch": 3.5079803560466543, + "grad_norm": 0.2672989070415497, + "learning_rate": 7.536071317180942e-05, + "loss": 1.8301, + "step": 11429 + }, + { + "epoch": 3.5082872928176796, + "grad_norm": 0.2525518238544464, + "learning_rate": 7.535642931866546e-05, + "loss": 1.8054, + "step": 11430 + }, + { + "epoch": 3.508594229588705, + "grad_norm": 0.2622447609901428, + "learning_rate": 7.535214521493442e-05, + "loss": 1.8293, + "step": 11431 + }, + { + "epoch": 3.50890116635973, + "grad_norm": 0.27057385444641113, + "learning_rate": 7.534786086065859e-05, + "loss": 1.7426, + "step": 11432 + }, + { + "epoch": 3.509208103130755, + "grad_norm": 0.27363866567611694, + "learning_rate": 7.534357625588038e-05, + "loss": 1.7138, + "step": 11433 + }, + { + "epoch": 3.50951503990178, + "grad_norm": 0.3029060363769531, + "learning_rate": 7.533929140064207e-05, + "loss": 1.864, + "step": 11434 + }, + { + "epoch": 3.5098219766728054, + "grad_norm": 0.3144821524620056, + "learning_rate": 7.533500629498604e-05, + "loss": 1.7846, + "step": 11435 + }, + { + "epoch": 3.5101289134438307, + "grad_norm": 
0.44535213708877563, + "learning_rate": 7.533072093895461e-05, + "loss": 1.799, + "step": 11436 + }, + { + "epoch": 3.5104358502148556, + "grad_norm": 0.25344160199165344, + "learning_rate": 7.532643533259017e-05, + "loss": 1.7391, + "step": 11437 + }, + { + "epoch": 3.510742786985881, + "grad_norm": 0.286026269197464, + "learning_rate": 7.532214947593506e-05, + "loss": 1.8436, + "step": 11438 + }, + { + "epoch": 3.511049723756906, + "grad_norm": 0.3317352533340454, + "learning_rate": 7.53178633690316e-05, + "loss": 1.8507, + "step": 11439 + }, + { + "epoch": 3.511356660527931, + "grad_norm": 0.2547265589237213, + "learning_rate": 7.53135770119222e-05, + "loss": 1.7483, + "step": 11440 + }, + { + "epoch": 3.5116635972989565, + "grad_norm": 0.24281835556030273, + "learning_rate": 7.530929040464917e-05, + "loss": 1.759, + "step": 11441 + }, + { + "epoch": 3.511970534069982, + "grad_norm": 0.2935381829738617, + "learning_rate": 7.530500354725491e-05, + "loss": 1.8235, + "step": 11442 + }, + { + "epoch": 3.5122774708410067, + "grad_norm": 0.26642969250679016, + "learning_rate": 7.53007164397818e-05, + "loss": 1.8324, + "step": 11443 + }, + { + "epoch": 3.512584407612032, + "grad_norm": 0.24830882251262665, + "learning_rate": 7.529642908227215e-05, + "loss": 1.8132, + "step": 11444 + }, + { + "epoch": 3.512891344383057, + "grad_norm": 0.3100191056728363, + "learning_rate": 7.529214147476838e-05, + "loss": 1.8453, + "step": 11445 + }, + { + "epoch": 3.5131982811540823, + "grad_norm": 0.27948811650276184, + "learning_rate": 7.528785361731282e-05, + "loss": 1.7792, + "step": 11446 + }, + { + "epoch": 3.5135052179251076, + "grad_norm": 0.26978832483291626, + "learning_rate": 7.528356550994787e-05, + "loss": 1.7857, + "step": 11447 + }, + { + "epoch": 3.5138121546961325, + "grad_norm": 0.30527836084365845, + "learning_rate": 7.527927715271592e-05, + "loss": 1.807, + "step": 11448 + }, + { + "epoch": 3.514119091467158, + "grad_norm": 0.2915664315223694, + "learning_rate": 
7.527498854565934e-05, + "loss": 1.8414, + "step": 11449 + }, + { + "epoch": 3.5144260282381827, + "grad_norm": 0.2854034900665283, + "learning_rate": 7.52706996888205e-05, + "loss": 1.793, + "step": 11450 + }, + { + "epoch": 3.514732965009208, + "grad_norm": 0.30281978845596313, + "learning_rate": 7.52664105822418e-05, + "loss": 1.7896, + "step": 11451 + }, + { + "epoch": 3.5150399017802334, + "grad_norm": 0.3317166566848755, + "learning_rate": 7.526212122596561e-05, + "loss": 1.7776, + "step": 11452 + }, + { + "epoch": 3.5153468385512583, + "grad_norm": 0.3400021195411682, + "learning_rate": 7.525783162003434e-05, + "loss": 1.8411, + "step": 11453 + }, + { + "epoch": 3.5156537753222836, + "grad_norm": 0.25169485807418823, + "learning_rate": 7.525354176449037e-05, + "loss": 1.7871, + "step": 11454 + }, + { + "epoch": 3.5159607120933085, + "grad_norm": 0.3442455530166626, + "learning_rate": 7.52492516593761e-05, + "loss": 1.7644, + "step": 11455 + }, + { + "epoch": 3.516267648864334, + "grad_norm": 0.35644033551216125, + "learning_rate": 7.524496130473394e-05, + "loss": 1.801, + "step": 11456 + }, + { + "epoch": 3.516574585635359, + "grad_norm": 0.3180185854434967, + "learning_rate": 7.524067070060625e-05, + "loss": 1.7897, + "step": 11457 + }, + { + "epoch": 3.5168815224063845, + "grad_norm": 0.2417978048324585, + "learning_rate": 7.523637984703548e-05, + "loss": 1.8527, + "step": 11458 + }, + { + "epoch": 3.5171884591774094, + "grad_norm": 0.29661375284194946, + "learning_rate": 7.5232088744064e-05, + "loss": 1.8276, + "step": 11459 + }, + { + "epoch": 3.5174953959484347, + "grad_norm": 0.2467545121908188, + "learning_rate": 7.522779739173424e-05, + "loss": 1.7819, + "step": 11460 + }, + { + "epoch": 3.5178023327194596, + "grad_norm": 0.26177898049354553, + "learning_rate": 7.522350579008859e-05, + "loss": 1.8017, + "step": 11461 + }, + { + "epoch": 3.518109269490485, + "grad_norm": 0.28740498423576355, + "learning_rate": 7.521921393916948e-05, + "loss": 1.7863, 
+ "step": 11462 + }, + { + "epoch": 3.5184162062615103, + "grad_norm": 0.28685200214385986, + "learning_rate": 7.521492183901932e-05, + "loss": 1.8069, + "step": 11463 + }, + { + "epoch": 3.518723143032535, + "grad_norm": 0.24174338579177856, + "learning_rate": 7.521062948968051e-05, + "loss": 1.7523, + "step": 11464 + }, + { + "epoch": 3.5190300798035605, + "grad_norm": 0.23273243010044098, + "learning_rate": 7.520633689119548e-05, + "loss": 1.7827, + "step": 11465 + }, + { + "epoch": 3.5193370165745854, + "grad_norm": 0.22708217799663544, + "learning_rate": 7.520204404360667e-05, + "loss": 1.7377, + "step": 11466 + }, + { + "epoch": 3.5196439533456108, + "grad_norm": 0.24725353717803955, + "learning_rate": 7.519775094695649e-05, + "loss": 1.7828, + "step": 11467 + }, + { + "epoch": 3.519950890116636, + "grad_norm": 0.23046265542507172, + "learning_rate": 7.519345760128736e-05, + "loss": 1.7427, + "step": 11468 + }, + { + "epoch": 3.520257826887661, + "grad_norm": 0.2618728280067444, + "learning_rate": 7.518916400664171e-05, + "loss": 1.8133, + "step": 11469 + }, + { + "epoch": 3.5205647636586863, + "grad_norm": 0.23232363164424896, + "learning_rate": 7.5184870163062e-05, + "loss": 1.7468, + "step": 11470 + }, + { + "epoch": 3.520871700429711, + "grad_norm": 0.21993626654148102, + "learning_rate": 7.51805760705906e-05, + "loss": 1.7565, + "step": 11471 + }, + { + "epoch": 3.5211786372007365, + "grad_norm": 0.23563124239444733, + "learning_rate": 7.517628172927001e-05, + "loss": 1.7795, + "step": 11472 + }, + { + "epoch": 3.521485573971762, + "grad_norm": 0.24502862989902496, + "learning_rate": 7.517198713914266e-05, + "loss": 1.813, + "step": 11473 + }, + { + "epoch": 3.521792510742787, + "grad_norm": 0.24745969474315643, + "learning_rate": 7.516769230025097e-05, + "loss": 1.7601, + "step": 11474 + }, + { + "epoch": 3.522099447513812, + "grad_norm": 0.27686986327171326, + "learning_rate": 7.516339721263739e-05, + "loss": 1.8121, + "step": 11475 + }, + { + "epoch": 
3.5224063842848374, + "grad_norm": 0.3110332787036896, + "learning_rate": 7.515910187634439e-05, + "loss": 1.7978, + "step": 11476 + }, + { + "epoch": 3.5227133210558623, + "grad_norm": 0.3394792377948761, + "learning_rate": 7.515480629141436e-05, + "loss": 1.8427, + "step": 11477 + }, + { + "epoch": 3.5230202578268877, + "grad_norm": 0.2802537679672241, + "learning_rate": 7.515051045788984e-05, + "loss": 1.7343, + "step": 11478 + }, + { + "epoch": 3.523327194597913, + "grad_norm": 0.23687711358070374, + "learning_rate": 7.514621437581319e-05, + "loss": 1.7786, + "step": 11479 + }, + { + "epoch": 3.523634131368938, + "grad_norm": 0.31114310026168823, + "learning_rate": 7.514191804522693e-05, + "loss": 1.8137, + "step": 11480 + }, + { + "epoch": 3.523941068139963, + "grad_norm": 0.3257891833782196, + "learning_rate": 7.513762146617351e-05, + "loss": 1.8015, + "step": 11481 + }, + { + "epoch": 3.524248004910988, + "grad_norm": 0.24353443086147308, + "learning_rate": 7.513332463869536e-05, + "loss": 1.7485, + "step": 11482 + }, + { + "epoch": 3.5245549416820134, + "grad_norm": 0.29861485958099365, + "learning_rate": 7.512902756283498e-05, + "loss": 1.7993, + "step": 11483 + }, + { + "epoch": 3.5248618784530388, + "grad_norm": 0.40380924940109253, + "learning_rate": 7.51247302386348e-05, + "loss": 1.7664, + "step": 11484 + }, + { + "epoch": 3.525168815224064, + "grad_norm": 0.3365862965583801, + "learning_rate": 7.512043266613733e-05, + "loss": 1.7512, + "step": 11485 + }, + { + "epoch": 3.525475751995089, + "grad_norm": 0.2502824068069458, + "learning_rate": 7.511613484538502e-05, + "loss": 1.8414, + "step": 11486 + }, + { + "epoch": 3.5257826887661143, + "grad_norm": 0.2598603069782257, + "learning_rate": 7.511183677642034e-05, + "loss": 1.7358, + "step": 11487 + }, + { + "epoch": 3.5260896255371392, + "grad_norm": 0.30246880650520325, + "learning_rate": 7.510753845928576e-05, + "loss": 1.791, + "step": 11488 + }, + { + "epoch": 3.5263965623081646, + "grad_norm": 
0.25170832872390747, + "learning_rate": 7.510323989402378e-05, + "loss": 1.7498, + "step": 11489 + }, + { + "epoch": 3.52670349907919, + "grad_norm": 0.2925282418727875, + "learning_rate": 7.509894108067688e-05, + "loss": 1.8413, + "step": 11490 + }, + { + "epoch": 3.527010435850215, + "grad_norm": 0.2643601596355438, + "learning_rate": 7.509464201928752e-05, + "loss": 1.8052, + "step": 11491 + }, + { + "epoch": 3.52731737262124, + "grad_norm": 0.2938917279243469, + "learning_rate": 7.50903427098982e-05, + "loss": 1.7308, + "step": 11492 + }, + { + "epoch": 3.527624309392265, + "grad_norm": 0.2978343367576599, + "learning_rate": 7.508604315255142e-05, + "loss": 1.8147, + "step": 11493 + }, + { + "epoch": 3.5279312461632903, + "grad_norm": 0.2507816255092621, + "learning_rate": 7.508174334728963e-05, + "loss": 1.774, + "step": 11494 + }, + { + "epoch": 3.5282381829343157, + "grad_norm": 0.32971861958503723, + "learning_rate": 7.507744329415538e-05, + "loss": 1.7634, + "step": 11495 + }, + { + "epoch": 3.5285451197053406, + "grad_norm": 0.3149639964103699, + "learning_rate": 7.507314299319113e-05, + "loss": 1.8032, + "step": 11496 + }, + { + "epoch": 3.528852056476366, + "grad_norm": 0.2721364498138428, + "learning_rate": 7.506884244443937e-05, + "loss": 1.7702, + "step": 11497 + }, + { + "epoch": 3.529158993247391, + "grad_norm": 0.29375985264778137, + "learning_rate": 7.506454164794263e-05, + "loss": 1.8673, + "step": 11498 + }, + { + "epoch": 3.529465930018416, + "grad_norm": 0.379944384098053, + "learning_rate": 7.50602406037434e-05, + "loss": 1.883, + "step": 11499 + }, + { + "epoch": 3.5297728667894415, + "grad_norm": 0.4041840136051178, + "learning_rate": 7.505593931188417e-05, + "loss": 1.7998, + "step": 11500 + }, + { + "epoch": 3.530079803560467, + "grad_norm": 0.30013784766197205, + "learning_rate": 7.505163777240747e-05, + "loss": 1.775, + "step": 11501 + }, + { + "epoch": 3.5303867403314917, + "grad_norm": 0.25161153078079224, + "learning_rate": 
7.50473359853558e-05, + "loss": 1.8609, + "step": 11502 + }, + { + "epoch": 3.530693677102517, + "grad_norm": 0.2803831100463867, + "learning_rate": 7.504303395077168e-05, + "loss": 1.8397, + "step": 11503 + }, + { + "epoch": 3.531000613873542, + "grad_norm": 0.26678118109703064, + "learning_rate": 7.503873166869762e-05, + "loss": 1.7877, + "step": 11504 + }, + { + "epoch": 3.5313075506445673, + "grad_norm": 0.24280449748039246, + "learning_rate": 7.503442913917613e-05, + "loss": 1.7891, + "step": 11505 + }, + { + "epoch": 3.5316144874155926, + "grad_norm": 0.26461485028266907, + "learning_rate": 7.503012636224976e-05, + "loss": 1.7993, + "step": 11506 + }, + { + "epoch": 3.5319214241866175, + "grad_norm": 0.27001824975013733, + "learning_rate": 7.502582333796098e-05, + "loss": 1.7719, + "step": 11507 + }, + { + "epoch": 3.532228360957643, + "grad_norm": 0.27585846185684204, + "learning_rate": 7.502152006635237e-05, + "loss": 1.7412, + "step": 11508 + }, + { + "epoch": 3.5325352977286677, + "grad_norm": 0.24896648526191711, + "learning_rate": 7.501721654746643e-05, + "loss": 1.7459, + "step": 11509 + }, + { + "epoch": 3.532842234499693, + "grad_norm": 0.2308502197265625, + "learning_rate": 7.501291278134569e-05, + "loss": 1.7717, + "step": 11510 + }, + { + "epoch": 3.5331491712707184, + "grad_norm": 0.3026069104671478, + "learning_rate": 7.500860876803267e-05, + "loss": 1.8578, + "step": 11511 + }, + { + "epoch": 3.5334561080417433, + "grad_norm": 0.30242082476615906, + "learning_rate": 7.500430450756995e-05, + "loss": 1.7793, + "step": 11512 + }, + { + "epoch": 3.5337630448127686, + "grad_norm": 0.2583339214324951, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8388, + "step": 11513 + }, + { + "epoch": 3.5340699815837935, + "grad_norm": 0.29673871397972107, + "learning_rate": 7.499569524536542e-05, + "loss": 1.7749, + "step": 11514 + }, + { + "epoch": 3.534376918354819, + "grad_norm": 0.35199788212776184, + "learning_rate": 7.499139024370874e-05, + "loss": 
1.7863, + "step": 11515 + }, + { + "epoch": 3.534683855125844, + "grad_norm": 0.25776436924934387, + "learning_rate": 7.498708499507247e-05, + "loss": 1.7568, + "step": 11516 + }, + { + "epoch": 3.5349907918968695, + "grad_norm": 0.26081520318984985, + "learning_rate": 7.498277949949919e-05, + "loss": 1.807, + "step": 11517 + }, + { + "epoch": 3.5352977286678944, + "grad_norm": 0.29247912764549255, + "learning_rate": 7.497847375703145e-05, + "loss": 1.7568, + "step": 11518 + }, + { + "epoch": 3.5356046654389197, + "grad_norm": 0.20964498817920685, + "learning_rate": 7.497416776771178e-05, + "loss": 1.7601, + "step": 11519 + }, + { + "epoch": 3.5359116022099446, + "grad_norm": 0.28739818930625916, + "learning_rate": 7.496986153158273e-05, + "loss": 1.7915, + "step": 11520 + }, + { + "epoch": 3.53621853898097, + "grad_norm": 0.3109932839870453, + "learning_rate": 7.496555504868691e-05, + "loss": 1.8046, + "step": 11521 + }, + { + "epoch": 3.5365254757519953, + "grad_norm": 0.259284108877182, + "learning_rate": 7.496124831906681e-05, + "loss": 1.7595, + "step": 11522 + }, + { + "epoch": 3.53683241252302, + "grad_norm": 0.265909343957901, + "learning_rate": 7.495694134276504e-05, + "loss": 1.8249, + "step": 11523 + }, + { + "epoch": 3.5371393492940455, + "grad_norm": 0.2478799819946289, + "learning_rate": 7.495263411982415e-05, + "loss": 1.8531, + "step": 11524 + }, + { + "epoch": 3.5374462860650704, + "grad_norm": 0.2636432945728302, + "learning_rate": 7.494832665028671e-05, + "loss": 1.8114, + "step": 11525 + }, + { + "epoch": 3.5377532228360957, + "grad_norm": 0.25323864817619324, + "learning_rate": 7.494401893419527e-05, + "loss": 1.8271, + "step": 11526 + }, + { + "epoch": 3.538060159607121, + "grad_norm": 0.2352467179298401, + "learning_rate": 7.493971097159241e-05, + "loss": 1.7524, + "step": 11527 + }, + { + "epoch": 3.538367096378146, + "grad_norm": 0.2788623869419098, + "learning_rate": 7.493540276252072e-05, + "loss": 1.8238, + "step": 11528 + }, + { + 
"epoch": 3.5386740331491713, + "grad_norm": 0.3506326377391815, + "learning_rate": 7.493109430702277e-05, + "loss": 1.8525, + "step": 11529 + }, + { + "epoch": 3.538980969920196, + "grad_norm": 0.3685263395309448, + "learning_rate": 7.492678560514113e-05, + "loss": 1.8497, + "step": 11530 + }, + { + "epoch": 3.5392879066912215, + "grad_norm": 0.32200056314468384, + "learning_rate": 7.492247665691837e-05, + "loss": 1.7587, + "step": 11531 + }, + { + "epoch": 3.539594843462247, + "grad_norm": 0.2800062894821167, + "learning_rate": 7.49181674623971e-05, + "loss": 1.8188, + "step": 11532 + }, + { + "epoch": 3.539901780233272, + "grad_norm": 0.24137580394744873, + "learning_rate": 7.491385802161989e-05, + "loss": 1.7947, + "step": 11533 + }, + { + "epoch": 3.540208717004297, + "grad_norm": 0.21900027990341187, + "learning_rate": 7.490954833462933e-05, + "loss": 1.7722, + "step": 11534 + }, + { + "epoch": 3.5405156537753224, + "grad_norm": 0.25009945034980774, + "learning_rate": 7.490523840146803e-05, + "loss": 1.8173, + "step": 11535 + }, + { + "epoch": 3.5408225905463473, + "grad_norm": 0.2778431475162506, + "learning_rate": 7.490092822217855e-05, + "loss": 1.8368, + "step": 11536 + }, + { + "epoch": 3.5411295273173726, + "grad_norm": 0.2845982611179352, + "learning_rate": 7.48966177968035e-05, + "loss": 1.7539, + "step": 11537 + }, + { + "epoch": 3.541436464088398, + "grad_norm": 0.27480921149253845, + "learning_rate": 7.48923071253855e-05, + "loss": 1.8494, + "step": 11538 + }, + { + "epoch": 3.541743400859423, + "grad_norm": 0.2722087502479553, + "learning_rate": 7.488799620796711e-05, + "loss": 1.8422, + "step": 11539 + }, + { + "epoch": 3.542050337630448, + "grad_norm": 0.2984340190887451, + "learning_rate": 7.488368504459097e-05, + "loss": 1.8042, + "step": 11540 + }, + { + "epoch": 3.542357274401473, + "grad_norm": 0.2405850738286972, + "learning_rate": 7.487937363529966e-05, + "loss": 1.749, + "step": 11541 + }, + { + "epoch": 3.5426642111724984, + "grad_norm": 
0.24816973507404327, + "learning_rate": 7.487506198013579e-05, + "loss": 1.8671, + "step": 11542 + }, + { + "epoch": 3.5429711479435237, + "grad_norm": 0.2796473503112793, + "learning_rate": 7.487075007914199e-05, + "loss": 1.8023, + "step": 11543 + }, + { + "epoch": 3.5432780847145486, + "grad_norm": 0.2600162625312805, + "learning_rate": 7.486643793236086e-05, + "loss": 1.7997, + "step": 11544 + }, + { + "epoch": 3.543585021485574, + "grad_norm": 0.2746226489543915, + "learning_rate": 7.486212553983503e-05, + "loss": 1.7773, + "step": 11545 + }, + { + "epoch": 3.5438919582565993, + "grad_norm": 0.24142079055309296, + "learning_rate": 7.485781290160708e-05, + "loss": 1.791, + "step": 11546 + }, + { + "epoch": 3.544198895027624, + "grad_norm": 0.2472934126853943, + "learning_rate": 7.485350001771966e-05, + "loss": 1.8183, + "step": 11547 + }, + { + "epoch": 3.5445058317986495, + "grad_norm": 0.26891404390335083, + "learning_rate": 7.48491868882154e-05, + "loss": 1.7421, + "step": 11548 + }, + { + "epoch": 3.544812768569675, + "grad_norm": 0.24820464849472046, + "learning_rate": 7.48448735131369e-05, + "loss": 1.7372, + "step": 11549 + }, + { + "epoch": 3.5451197053406998, + "grad_norm": 0.2456594705581665, + "learning_rate": 7.484055989252679e-05, + "loss": 1.7883, + "step": 11550 + }, + { + "epoch": 3.545426642111725, + "grad_norm": 0.32420551776885986, + "learning_rate": 7.48362460264277e-05, + "loss": 1.8363, + "step": 11551 + }, + { + "epoch": 3.54573357888275, + "grad_norm": 0.3187662661075592, + "learning_rate": 7.483193191488229e-05, + "loss": 1.7957, + "step": 11552 + }, + { + "epoch": 3.5460405156537753, + "grad_norm": 0.2845410108566284, + "learning_rate": 7.482761755793316e-05, + "loss": 1.8288, + "step": 11553 + }, + { + "epoch": 3.5463474524248007, + "grad_norm": 0.2816021740436554, + "learning_rate": 7.482330295562298e-05, + "loss": 1.7562, + "step": 11554 + }, + { + "epoch": 3.5466543891958255, + "grad_norm": 0.28938058018684387, + "learning_rate": 
7.481898810799435e-05, + "loss": 1.8139, + "step": 11555 + }, + { + "epoch": 3.546961325966851, + "grad_norm": 0.3305707573890686, + "learning_rate": 7.481467301508995e-05, + "loss": 1.8956, + "step": 11556 + }, + { + "epoch": 3.5472682627378758, + "grad_norm": 0.3890376091003418, + "learning_rate": 7.48103576769524e-05, + "loss": 1.8552, + "step": 11557 + }, + { + "epoch": 3.547575199508901, + "grad_norm": 0.3900652825832367, + "learning_rate": 7.480604209362434e-05, + "loss": 1.7748, + "step": 11558 + }, + { + "epoch": 3.5478821362799264, + "grad_norm": 0.3297326862812042, + "learning_rate": 7.480172626514845e-05, + "loss": 1.8201, + "step": 11559 + }, + { + "epoch": 3.5481890730509518, + "grad_norm": 0.28797218203544617, + "learning_rate": 7.479741019156737e-05, + "loss": 1.7652, + "step": 11560 + }, + { + "epoch": 3.5484960098219767, + "grad_norm": 0.2764691114425659, + "learning_rate": 7.479309387292373e-05, + "loss": 1.7534, + "step": 11561 + }, + { + "epoch": 3.548802946593002, + "grad_norm": 0.25067585706710815, + "learning_rate": 7.47887773092602e-05, + "loss": 1.7849, + "step": 11562 + }, + { + "epoch": 3.549109883364027, + "grad_norm": 0.29966798424720764, + "learning_rate": 7.478446050061947e-05, + "loss": 1.8299, + "step": 11563 + }, + { + "epoch": 3.549416820135052, + "grad_norm": 0.24068406224250793, + "learning_rate": 7.478014344704416e-05, + "loss": 1.8366, + "step": 11564 + }, + { + "epoch": 3.5497237569060776, + "grad_norm": 0.2559303641319275, + "learning_rate": 7.477582614857695e-05, + "loss": 1.7665, + "step": 11565 + }, + { + "epoch": 3.5500306936771024, + "grad_norm": 0.24617858231067657, + "learning_rate": 7.47715086052605e-05, + "loss": 1.8334, + "step": 11566 + }, + { + "epoch": 3.550337630448128, + "grad_norm": 0.2433501034975052, + "learning_rate": 7.476719081713749e-05, + "loss": 1.7963, + "step": 11567 + }, + { + "epoch": 3.5506445672191527, + "grad_norm": 0.2583518326282501, + "learning_rate": 7.476287278425057e-05, + "loss": 1.8311, 
+ "step": 11568 + }, + { + "epoch": 3.550951503990178, + "grad_norm": 0.3232485055923462, + "learning_rate": 7.475855450664244e-05, + "loss": 1.9162, + "step": 11569 + }, + { + "epoch": 3.5512584407612033, + "grad_norm": 0.28247153759002686, + "learning_rate": 7.475423598435576e-05, + "loss": 1.8027, + "step": 11570 + }, + { + "epoch": 3.5515653775322282, + "grad_norm": 0.27201834321022034, + "learning_rate": 7.47499172174332e-05, + "loss": 1.7822, + "step": 11571 + }, + { + "epoch": 3.5518723143032536, + "grad_norm": 0.2408471554517746, + "learning_rate": 7.474559820591748e-05, + "loss": 1.7735, + "step": 11572 + }, + { + "epoch": 3.5521792510742785, + "grad_norm": 0.24187393486499786, + "learning_rate": 7.474127894985124e-05, + "loss": 1.7931, + "step": 11573 + }, + { + "epoch": 3.552486187845304, + "grad_norm": 0.2759699523448944, + "learning_rate": 7.473695944927717e-05, + "loss": 1.8407, + "step": 11574 + }, + { + "epoch": 3.552793124616329, + "grad_norm": 0.2503111958503723, + "learning_rate": 7.473263970423797e-05, + "loss": 1.7613, + "step": 11575 + }, + { + "epoch": 3.5531000613873545, + "grad_norm": 0.24795177578926086, + "learning_rate": 7.472831971477633e-05, + "loss": 1.8221, + "step": 11576 + }, + { + "epoch": 3.5534069981583793, + "grad_norm": 0.23190177977085114, + "learning_rate": 7.472399948093494e-05, + "loss": 1.7541, + "step": 11577 + }, + { + "epoch": 3.5537139349294047, + "grad_norm": 0.24650825560092926, + "learning_rate": 7.471967900275653e-05, + "loss": 1.8002, + "step": 11578 + }, + { + "epoch": 3.5540208717004296, + "grad_norm": 0.256598562002182, + "learning_rate": 7.471535828028372e-05, + "loss": 1.7052, + "step": 11579 + }, + { + "epoch": 3.554327808471455, + "grad_norm": 0.2715381681919098, + "learning_rate": 7.471103731355926e-05, + "loss": 1.7701, + "step": 11580 + }, + { + "epoch": 3.5546347452424802, + "grad_norm": 0.29806044697761536, + "learning_rate": 7.470671610262586e-05, + "loss": 1.7614, + "step": 11581 + }, + { + "epoch": 
3.554941682013505, + "grad_norm": 0.26364314556121826, + "learning_rate": 7.470239464752621e-05, + "loss": 1.7957, + "step": 11582 + }, + { + "epoch": 3.5552486187845305, + "grad_norm": 0.29270800948143005, + "learning_rate": 7.4698072948303e-05, + "loss": 1.8263, + "step": 11583 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.25941839814186096, + "learning_rate": 7.469375100499898e-05, + "loss": 1.8517, + "step": 11584 + }, + { + "epoch": 3.5558624923265807, + "grad_norm": 0.29509237408638, + "learning_rate": 7.468942881765681e-05, + "loss": 1.8643, + "step": 11585 + }, + { + "epoch": 3.556169429097606, + "grad_norm": 0.23090367019176483, + "learning_rate": 7.468510638631926e-05, + "loss": 1.7239, + "step": 11586 + }, + { + "epoch": 3.556476365868631, + "grad_norm": 0.2696724236011505, + "learning_rate": 7.468078371102901e-05, + "loss": 1.848, + "step": 11587 + }, + { + "epoch": 3.5567833026396563, + "grad_norm": 0.2691192626953125, + "learning_rate": 7.46764607918288e-05, + "loss": 1.8194, + "step": 11588 + }, + { + "epoch": 3.557090239410681, + "grad_norm": 0.26616501808166504, + "learning_rate": 7.467213762876131e-05, + "loss": 1.8382, + "step": 11589 + }, + { + "epoch": 3.5573971761817065, + "grad_norm": 0.30629831552505493, + "learning_rate": 7.466781422186933e-05, + "loss": 1.8417, + "step": 11590 + }, + { + "epoch": 3.557704112952732, + "grad_norm": 0.27212417125701904, + "learning_rate": 7.466349057119552e-05, + "loss": 1.7612, + "step": 11591 + }, + { + "epoch": 3.558011049723757, + "grad_norm": 0.2872084379196167, + "learning_rate": 7.465916667678266e-05, + "loss": 1.7998, + "step": 11592 + }, + { + "epoch": 3.558317986494782, + "grad_norm": 0.3017117977142334, + "learning_rate": 7.465484253867348e-05, + "loss": 1.7996, + "step": 11593 + }, + { + "epoch": 3.5586249232658074, + "grad_norm": 0.2707957327365875, + "learning_rate": 7.465051815691066e-05, + "loss": 1.7678, + "step": 11594 + }, + { + "epoch": 3.5589318600368323, + "grad_norm": 
0.28932711482048035, + "learning_rate": 7.464619353153702e-05, + "loss": 1.8576, + "step": 11595 + }, + { + "epoch": 3.5592387968078576, + "grad_norm": 0.2585125267505646, + "learning_rate": 7.464186866259519e-05, + "loss": 1.8678, + "step": 11596 + }, + { + "epoch": 3.559545733578883, + "grad_norm": 0.24386851489543915, + "learning_rate": 7.4637543550128e-05, + "loss": 1.7778, + "step": 11597 + }, + { + "epoch": 3.559852670349908, + "grad_norm": 0.2375860959291458, + "learning_rate": 7.463321819417817e-05, + "loss": 1.8096, + "step": 11598 + }, + { + "epoch": 3.560159607120933, + "grad_norm": 0.2341299206018448, + "learning_rate": 7.462889259478842e-05, + "loss": 1.7191, + "step": 11599 + }, + { + "epoch": 3.560466543891958, + "grad_norm": 0.2510595917701721, + "learning_rate": 7.462456675200154e-05, + "loss": 1.7763, + "step": 11600 + }, + { + "epoch": 3.5607734806629834, + "grad_norm": 0.2554674744606018, + "learning_rate": 7.462024066586025e-05, + "loss": 1.7578, + "step": 11601 + }, + { + "epoch": 3.5610804174340087, + "grad_norm": 0.25040730834007263, + "learning_rate": 7.46159143364073e-05, + "loss": 1.8194, + "step": 11602 + }, + { + "epoch": 3.5613873542050336, + "grad_norm": 0.24294932186603546, + "learning_rate": 7.461158776368547e-05, + "loss": 1.8063, + "step": 11603 + }, + { + "epoch": 3.561694290976059, + "grad_norm": 0.2388325333595276, + "learning_rate": 7.46072609477375e-05, + "loss": 1.7942, + "step": 11604 + }, + { + "epoch": 3.562001227747084, + "grad_norm": 0.2569502890110016, + "learning_rate": 7.460293388860615e-05, + "loss": 1.7824, + "step": 11605 + }, + { + "epoch": 3.562308164518109, + "grad_norm": 0.24004346132278442, + "learning_rate": 7.45986065863342e-05, + "loss": 1.8676, + "step": 11606 + }, + { + "epoch": 3.5626151012891345, + "grad_norm": 0.25446319580078125, + "learning_rate": 7.45942790409644e-05, + "loss": 1.7726, + "step": 11607 + }, + { + "epoch": 3.56292203806016, + "grad_norm": 0.26257482171058655, + "learning_rate": 
7.458995125253951e-05, + "loss": 1.779, + "step": 11608 + }, + { + "epoch": 3.5632289748311847, + "grad_norm": 0.27703070640563965, + "learning_rate": 7.458562322110231e-05, + "loss": 1.8247, + "step": 11609 + }, + { + "epoch": 3.56353591160221, + "grad_norm": 0.25478535890579224, + "learning_rate": 7.458129494669556e-05, + "loss": 1.7794, + "step": 11610 + }, + { + "epoch": 3.563842848373235, + "grad_norm": 0.26173365116119385, + "learning_rate": 7.457696642936207e-05, + "loss": 1.758, + "step": 11611 + }, + { + "epoch": 3.5641497851442603, + "grad_norm": 0.25077274441719055, + "learning_rate": 7.45726376691446e-05, + "loss": 1.8234, + "step": 11612 + }, + { + "epoch": 3.5644567219152856, + "grad_norm": 0.2591109275817871, + "learning_rate": 7.456830866608589e-05, + "loss": 1.7723, + "step": 11613 + }, + { + "epoch": 3.5647636586863105, + "grad_norm": 0.2653447091579437, + "learning_rate": 7.456397942022877e-05, + "loss": 1.7839, + "step": 11614 + }, + { + "epoch": 3.565070595457336, + "grad_norm": 0.3203454911708832, + "learning_rate": 7.455964993161601e-05, + "loss": 1.8548, + "step": 11615 + }, + { + "epoch": 3.5653775322283607, + "grad_norm": 0.3041793704032898, + "learning_rate": 7.455532020029039e-05, + "loss": 1.7925, + "step": 11616 + }, + { + "epoch": 3.565684468999386, + "grad_norm": 0.26066139340400696, + "learning_rate": 7.45509902262947e-05, + "loss": 1.7905, + "step": 11617 + }, + { + "epoch": 3.5659914057704114, + "grad_norm": 0.2483314871788025, + "learning_rate": 7.454666000967174e-05, + "loss": 1.7658, + "step": 11618 + }, + { + "epoch": 3.5662983425414367, + "grad_norm": 0.24285900592803955, + "learning_rate": 7.45423295504643e-05, + "loss": 1.7575, + "step": 11619 + }, + { + "epoch": 3.5666052793124616, + "grad_norm": 0.27231669425964355, + "learning_rate": 7.453799884871517e-05, + "loss": 1.8389, + "step": 11620 + }, + { + "epoch": 3.566912216083487, + "grad_norm": 0.24324406683444977, + "learning_rate": 7.453366790446717e-05, + "loss": 
1.7775, + "step": 11621 + }, + { + "epoch": 3.567219152854512, + "grad_norm": 0.2724440097808838, + "learning_rate": 7.452933671776305e-05, + "loss": 1.8135, + "step": 11622 + }, + { + "epoch": 3.567526089625537, + "grad_norm": 0.22207655012607574, + "learning_rate": 7.452500528864568e-05, + "loss": 1.722, + "step": 11623 + }, + { + "epoch": 3.5678330263965625, + "grad_norm": 0.25650298595428467, + "learning_rate": 7.452067361715782e-05, + "loss": 1.7813, + "step": 11624 + }, + { + "epoch": 3.5681399631675874, + "grad_norm": 0.2582200765609741, + "learning_rate": 7.45163417033423e-05, + "loss": 1.8253, + "step": 11625 + }, + { + "epoch": 3.5684468999386127, + "grad_norm": 0.29545384645462036, + "learning_rate": 7.451200954724188e-05, + "loss": 1.8108, + "step": 11626 + }, + { + "epoch": 3.5687538367096376, + "grad_norm": 0.30457428097724915, + "learning_rate": 7.450767714889946e-05, + "loss": 1.8257, + "step": 11627 + }, + { + "epoch": 3.569060773480663, + "grad_norm": 0.2955166697502136, + "learning_rate": 7.450334450835781e-05, + "loss": 1.8172, + "step": 11628 + }, + { + "epoch": 3.5693677102516883, + "grad_norm": 0.2793857753276825, + "learning_rate": 7.449901162565974e-05, + "loss": 1.8493, + "step": 11629 + }, + { + "epoch": 3.569674647022713, + "grad_norm": 0.27154335379600525, + "learning_rate": 7.449467850084808e-05, + "loss": 1.8306, + "step": 11630 + }, + { + "epoch": 3.5699815837937385, + "grad_norm": 0.22336189448833466, + "learning_rate": 7.449034513396564e-05, + "loss": 1.7435, + "step": 11631 + }, + { + "epoch": 3.5702885205647634, + "grad_norm": 0.22799183428287506, + "learning_rate": 7.448601152505526e-05, + "loss": 1.7818, + "step": 11632 + }, + { + "epoch": 3.5705954573357888, + "grad_norm": 0.26670658588409424, + "learning_rate": 7.448167767415976e-05, + "loss": 1.7777, + "step": 11633 + }, + { + "epoch": 3.570902394106814, + "grad_norm": 0.2848666310310364, + "learning_rate": 7.447734358132196e-05, + "loss": 1.7572, + "step": 11634 + }, + { + 
"epoch": 3.5712093308778394, + "grad_norm": 0.26843544840812683, + "learning_rate": 7.447300924658473e-05, + "loss": 1.7642, + "step": 11635 + }, + { + "epoch": 3.5715162676488643, + "grad_norm": 0.24666404724121094, + "learning_rate": 7.446867466999087e-05, + "loss": 1.7533, + "step": 11636 + }, + { + "epoch": 3.5718232044198897, + "grad_norm": 0.31111210584640503, + "learning_rate": 7.44643398515832e-05, + "loss": 1.7875, + "step": 11637 + }, + { + "epoch": 3.5721301411909145, + "grad_norm": 0.3157108724117279, + "learning_rate": 7.446000479140462e-05, + "loss": 1.7879, + "step": 11638 + }, + { + "epoch": 3.57243707796194, + "grad_norm": 0.2935558259487152, + "learning_rate": 7.445566948949792e-05, + "loss": 1.7819, + "step": 11639 + }, + { + "epoch": 3.572744014732965, + "grad_norm": 0.2265472710132599, + "learning_rate": 7.445133394590597e-05, + "loss": 1.7518, + "step": 11640 + }, + { + "epoch": 3.57305095150399, + "grad_norm": 0.2564176023006439, + "learning_rate": 7.444699816067159e-05, + "loss": 1.7281, + "step": 11641 + }, + { + "epoch": 3.5733578882750154, + "grad_norm": 0.27933555841445923, + "learning_rate": 7.444266213383766e-05, + "loss": 1.7852, + "step": 11642 + }, + { + "epoch": 3.5736648250460403, + "grad_norm": 0.29105356335639954, + "learning_rate": 7.4438325865447e-05, + "loss": 1.8056, + "step": 11643 + }, + { + "epoch": 3.5739717618170657, + "grad_norm": 0.27665549516677856, + "learning_rate": 7.443398935554249e-05, + "loss": 1.7249, + "step": 11644 + }, + { + "epoch": 3.574278698588091, + "grad_norm": 0.21899232268333435, + "learning_rate": 7.442965260416698e-05, + "loss": 1.7689, + "step": 11645 + }, + { + "epoch": 3.574585635359116, + "grad_norm": 0.3250672221183777, + "learning_rate": 7.442531561136333e-05, + "loss": 1.8058, + "step": 11646 + }, + { + "epoch": 3.574892572130141, + "grad_norm": 0.42442524433135986, + "learning_rate": 7.442097837717438e-05, + "loss": 1.7887, + "step": 11647 + }, + { + "epoch": 3.575199508901166, + 
"grad_norm": 0.33108964562416077, + "learning_rate": 7.441664090164302e-05, + "loss": 1.7628, + "step": 11648 + }, + { + "epoch": 3.5755064456721914, + "grad_norm": 0.23050357401371002, + "learning_rate": 7.44123031848121e-05, + "loss": 1.8121, + "step": 11649 + }, + { + "epoch": 3.575813382443217, + "grad_norm": 0.29251593351364136, + "learning_rate": 7.440796522672448e-05, + "loss": 1.8051, + "step": 11650 + }, + { + "epoch": 3.576120319214242, + "grad_norm": 0.3764750063419342, + "learning_rate": 7.440362702742305e-05, + "loss": 1.9002, + "step": 11651 + }, + { + "epoch": 3.576427255985267, + "grad_norm": 0.3751949071884155, + "learning_rate": 7.439928858695069e-05, + "loss": 1.821, + "step": 11652 + }, + { + "epoch": 3.5767341927562923, + "grad_norm": 0.268476665019989, + "learning_rate": 7.439494990535024e-05, + "loss": 1.8241, + "step": 11653 + }, + { + "epoch": 3.5770411295273172, + "grad_norm": 0.3072795271873474, + "learning_rate": 7.439061098266459e-05, + "loss": 1.8169, + "step": 11654 + }, + { + "epoch": 3.5773480662983426, + "grad_norm": 0.4948901832103729, + "learning_rate": 7.438627181893664e-05, + "loss": 1.7706, + "step": 11655 + }, + { + "epoch": 3.577655003069368, + "grad_norm": 0.5892601013183594, + "learning_rate": 7.438193241420926e-05, + "loss": 1.7631, + "step": 11656 + }, + { + "epoch": 3.577961939840393, + "grad_norm": 0.4599401652812958, + "learning_rate": 7.437759276852533e-05, + "loss": 1.7471, + "step": 11657 + }, + { + "epoch": 3.578268876611418, + "grad_norm": 0.2545170783996582, + "learning_rate": 7.437325288192773e-05, + "loss": 1.7945, + "step": 11658 + }, + { + "epoch": 3.578575813382443, + "grad_norm": 0.3136496841907501, + "learning_rate": 7.436891275445938e-05, + "loss": 1.828, + "step": 11659 + }, + { + "epoch": 3.5788827501534684, + "grad_norm": 0.3631688058376312, + "learning_rate": 7.436457238616313e-05, + "loss": 1.8302, + "step": 11660 + }, + { + "epoch": 3.5791896869244937, + "grad_norm": 0.3097386658191681, + 
"learning_rate": 7.436023177708192e-05, + "loss": 1.8397, + "step": 11661 + }, + { + "epoch": 3.5794966236955186, + "grad_norm": 0.20948798954486847, + "learning_rate": 7.43558909272586e-05, + "loss": 1.7844, + "step": 11662 + }, + { + "epoch": 3.579803560466544, + "grad_norm": 0.24327392876148224, + "learning_rate": 7.43515498367361e-05, + "loss": 1.7827, + "step": 11663 + }, + { + "epoch": 3.580110497237569, + "grad_norm": 0.25268325209617615, + "learning_rate": 7.434720850555731e-05, + "loss": 1.8224, + "step": 11664 + }, + { + "epoch": 3.580417434008594, + "grad_norm": 0.24883607029914856, + "learning_rate": 7.434286693376513e-05, + "loss": 1.8189, + "step": 11665 + }, + { + "epoch": 3.5807243707796195, + "grad_norm": 0.2942518889904022, + "learning_rate": 7.433852512140248e-05, + "loss": 1.8325, + "step": 11666 + }, + { + "epoch": 3.581031307550645, + "grad_norm": 0.3556186556816101, + "learning_rate": 7.433418306851225e-05, + "loss": 1.7511, + "step": 11667 + }, + { + "epoch": 3.5813382443216697, + "grad_norm": 0.421220600605011, + "learning_rate": 7.432984077513738e-05, + "loss": 1.8081, + "step": 11668 + }, + { + "epoch": 3.581645181092695, + "grad_norm": 0.3338243067264557, + "learning_rate": 7.432549824132074e-05, + "loss": 1.8274, + "step": 11669 + }, + { + "epoch": 3.58195211786372, + "grad_norm": 0.25091543793678284, + "learning_rate": 7.432115546710528e-05, + "loss": 1.7637, + "step": 11670 + }, + { + "epoch": 3.5822590546347453, + "grad_norm": 0.29870370030403137, + "learning_rate": 7.431681245253389e-05, + "loss": 1.8036, + "step": 11671 + }, + { + "epoch": 3.5825659914057706, + "grad_norm": 0.2682137191295624, + "learning_rate": 7.431246919764953e-05, + "loss": 1.8252, + "step": 11672 + }, + { + "epoch": 3.5828729281767955, + "grad_norm": 0.28790801763534546, + "learning_rate": 7.430812570249508e-05, + "loss": 1.7713, + "step": 11673 + }, + { + "epoch": 3.583179864947821, + "grad_norm": 0.26357609033584595, + "learning_rate": 7.43037819671135e-05, 
+ "loss": 1.8388, + "step": 11674 + }, + { + "epoch": 3.5834868017188457, + "grad_norm": 0.2505483031272888, + "learning_rate": 7.42994379915477e-05, + "loss": 1.7722, + "step": 11675 + }, + { + "epoch": 3.583793738489871, + "grad_norm": 0.2535844147205353, + "learning_rate": 7.42950937758406e-05, + "loss": 1.756, + "step": 11676 + }, + { + "epoch": 3.5841006752608964, + "grad_norm": 0.23045027256011963, + "learning_rate": 7.429074932003515e-05, + "loss": 1.791, + "step": 11677 + }, + { + "epoch": 3.5844076120319213, + "grad_norm": 0.22525762021541595, + "learning_rate": 7.428640462417428e-05, + "loss": 1.7234, + "step": 11678 + }, + { + "epoch": 3.5847145488029466, + "grad_norm": 0.2402270883321762, + "learning_rate": 7.428205968830094e-05, + "loss": 1.845, + "step": 11679 + }, + { + "epoch": 3.5850214855739715, + "grad_norm": 0.24909646809101105, + "learning_rate": 7.427771451245802e-05, + "loss": 1.8537, + "step": 11680 + }, + { + "epoch": 3.585328422344997, + "grad_norm": 0.25813063979148865, + "learning_rate": 7.427336909668853e-05, + "loss": 1.7353, + "step": 11681 + }, + { + "epoch": 3.585635359116022, + "grad_norm": 0.26073768734931946, + "learning_rate": 7.426902344103534e-05, + "loss": 1.8142, + "step": 11682 + }, + { + "epoch": 3.5859422958870475, + "grad_norm": 0.2498280256986618, + "learning_rate": 7.426467754554147e-05, + "loss": 1.7996, + "step": 11683 + }, + { + "epoch": 3.5862492326580724, + "grad_norm": 0.3131188154220581, + "learning_rate": 7.426033141024981e-05, + "loss": 1.7793, + "step": 11684 + }, + { + "epoch": 3.5865561694290977, + "grad_norm": 0.24118199944496155, + "learning_rate": 7.425598503520337e-05, + "loss": 1.8249, + "step": 11685 + }, + { + "epoch": 3.5868631062001226, + "grad_norm": 0.2791197597980499, + "learning_rate": 7.425163842044504e-05, + "loss": 1.7966, + "step": 11686 + }, + { + "epoch": 3.587170042971148, + "grad_norm": 0.2298576384782791, + "learning_rate": 7.424729156601781e-05, + "loss": 1.7224, + "step": 11687 + }, 
+ { + "epoch": 3.5874769797421733, + "grad_norm": 0.23113438487052917, + "learning_rate": 7.424294447196462e-05, + "loss": 1.7641, + "step": 11688 + }, + { + "epoch": 3.587783916513198, + "grad_norm": 0.3064495027065277, + "learning_rate": 7.423859713832847e-05, + "loss": 1.8688, + "step": 11689 + }, + { + "epoch": 3.5880908532842235, + "grad_norm": 0.22847676277160645, + "learning_rate": 7.423424956515228e-05, + "loss": 1.7513, + "step": 11690 + }, + { + "epoch": 3.5883977900552484, + "grad_norm": 0.2797350585460663, + "learning_rate": 7.422990175247905e-05, + "loss": 1.8268, + "step": 11691 + }, + { + "epoch": 3.5887047268262737, + "grad_norm": 0.2753821313381195, + "learning_rate": 7.422555370035171e-05, + "loss": 1.7313, + "step": 11692 + }, + { + "epoch": 3.589011663597299, + "grad_norm": 0.2981179654598236, + "learning_rate": 7.422120540881326e-05, + "loss": 1.8455, + "step": 11693 + }, + { + "epoch": 3.5893186003683244, + "grad_norm": 0.33028867840766907, + "learning_rate": 7.421685687790667e-05, + "loss": 1.8397, + "step": 11694 + }, + { + "epoch": 3.5896255371393493, + "grad_norm": 0.409173846244812, + "learning_rate": 7.421250810767487e-05, + "loss": 1.8088, + "step": 11695 + }, + { + "epoch": 3.5899324739103746, + "grad_norm": 0.4118194878101349, + "learning_rate": 7.42081590981609e-05, + "loss": 1.7719, + "step": 11696 + }, + { + "epoch": 3.5902394106813995, + "grad_norm": 0.34716179966926575, + "learning_rate": 7.420380984940773e-05, + "loss": 1.8063, + "step": 11697 + }, + { + "epoch": 3.590546347452425, + "grad_norm": 0.27763083577156067, + "learning_rate": 7.419946036145829e-05, + "loss": 1.7777, + "step": 11698 + }, + { + "epoch": 3.59085328422345, + "grad_norm": 0.3175280690193176, + "learning_rate": 7.419511063435562e-05, + "loss": 1.697, + "step": 11699 + }, + { + "epoch": 3.591160220994475, + "grad_norm": 0.3151503801345825, + "learning_rate": 7.419076066814268e-05, + "loss": 1.8067, + "step": 11700 + }, + { + "epoch": 3.5914671577655004, + 
"grad_norm": 0.26914867758750916, + "learning_rate": 7.418641046286245e-05, + "loss": 1.7797, + "step": 11701 + }, + { + "epoch": 3.5917740945365253, + "grad_norm": 0.27231964468955994, + "learning_rate": 7.418206001855797e-05, + "loss": 1.7931, + "step": 11702 + }, + { + "epoch": 3.5920810313075506, + "grad_norm": 0.3352177143096924, + "learning_rate": 7.417770933527217e-05, + "loss": 1.9187, + "step": 11703 + }, + { + "epoch": 3.592387968078576, + "grad_norm": 0.3510081470012665, + "learning_rate": 7.417335841304808e-05, + "loss": 1.7889, + "step": 11704 + }, + { + "epoch": 3.592694904849601, + "grad_norm": 0.24949313700199127, + "learning_rate": 7.41690072519287e-05, + "loss": 1.7683, + "step": 11705 + }, + { + "epoch": 3.593001841620626, + "grad_norm": 0.28442221879959106, + "learning_rate": 7.416465585195702e-05, + "loss": 1.7889, + "step": 11706 + }, + { + "epoch": 3.593308778391651, + "grad_norm": 0.3355824649333954, + "learning_rate": 7.416030421317605e-05, + "loss": 1.7637, + "step": 11707 + }, + { + "epoch": 3.5936157151626764, + "grad_norm": 0.33569446206092834, + "learning_rate": 7.415595233562878e-05, + "loss": 1.919, + "step": 11708 + }, + { + "epoch": 3.5939226519337018, + "grad_norm": 0.2488354742527008, + "learning_rate": 7.415160021935825e-05, + "loss": 1.8424, + "step": 11709 + }, + { + "epoch": 3.594229588704727, + "grad_norm": 0.2701130509376526, + "learning_rate": 7.414724786440746e-05, + "loss": 1.7586, + "step": 11710 + }, + { + "epoch": 3.594536525475752, + "grad_norm": 0.26289790868759155, + "learning_rate": 7.414289527081939e-05, + "loss": 1.7975, + "step": 11711 + }, + { + "epoch": 3.5948434622467773, + "grad_norm": 0.25382301211357117, + "learning_rate": 7.413854243863707e-05, + "loss": 1.7393, + "step": 11712 + }, + { + "epoch": 3.595150399017802, + "grad_norm": 0.28282979130744934, + "learning_rate": 7.413418936790357e-05, + "loss": 1.8048, + "step": 11713 + }, + { + "epoch": 3.5954573357888275, + "grad_norm": 0.28001347184181213, + 
"learning_rate": 7.412983605866183e-05, + "loss": 1.7864, + "step": 11714 + }, + { + "epoch": 3.595764272559853, + "grad_norm": 0.26107707619667053, + "learning_rate": 7.412548251095491e-05, + "loss": 1.8016, + "step": 11715 + }, + { + "epoch": 3.5960712093308778, + "grad_norm": 0.2518761456012726, + "learning_rate": 7.412112872482583e-05, + "loss": 1.7565, + "step": 11716 + }, + { + "epoch": 3.596378146101903, + "grad_norm": 0.25911152362823486, + "learning_rate": 7.411677470031762e-05, + "loss": 1.8333, + "step": 11717 + }, + { + "epoch": 3.596685082872928, + "grad_norm": 0.3411506414413452, + "learning_rate": 7.41124204374733e-05, + "loss": 1.8027, + "step": 11718 + }, + { + "epoch": 3.5969920196439533, + "grad_norm": 0.28535547852516174, + "learning_rate": 7.410806593633593e-05, + "loss": 1.7596, + "step": 11719 + }, + { + "epoch": 3.5972989564149787, + "grad_norm": 0.24665530025959015, + "learning_rate": 7.410371119694852e-05, + "loss": 1.7777, + "step": 11720 + }, + { + "epoch": 3.5976058931860035, + "grad_norm": 0.29162275791168213, + "learning_rate": 7.40993562193541e-05, + "loss": 1.795, + "step": 11721 + }, + { + "epoch": 3.597912829957029, + "grad_norm": 0.2712220549583435, + "learning_rate": 7.409500100359573e-05, + "loss": 1.824, + "step": 11722 + }, + { + "epoch": 3.5982197667280538, + "grad_norm": 0.239755779504776, + "learning_rate": 7.40906455497164e-05, + "loss": 1.7534, + "step": 11723 + }, + { + "epoch": 3.598526703499079, + "grad_norm": 0.26056957244873047, + "learning_rate": 7.408628985775922e-05, + "loss": 1.757, + "step": 11724 + }, + { + "epoch": 3.5988336402701044, + "grad_norm": 0.3230258822441101, + "learning_rate": 7.40819339277672e-05, + "loss": 1.8684, + "step": 11725 + }, + { + "epoch": 3.5991405770411298, + "grad_norm": 0.26070696115493774, + "learning_rate": 7.407757775978339e-05, + "loss": 1.7868, + "step": 11726 + }, + { + "epoch": 3.5994475138121547, + "grad_norm": 0.24940893054008484, + "learning_rate": 7.407322135385085e-05, + 
"loss": 1.8391, + "step": 11727 + }, + { + "epoch": 3.59975445058318, + "grad_norm": 0.2717723250389099, + "learning_rate": 7.406886471001263e-05, + "loss": 1.7567, + "step": 11728 + }, + { + "epoch": 3.600061387354205, + "grad_norm": 0.2328445315361023, + "learning_rate": 7.406450782831177e-05, + "loss": 1.7761, + "step": 11729 + }, + { + "epoch": 3.6003683241252302, + "grad_norm": 0.2740287184715271, + "learning_rate": 7.406015070879136e-05, + "loss": 1.8599, + "step": 11730 + }, + { + "epoch": 3.6006752608962556, + "grad_norm": 0.2930558919906616, + "learning_rate": 7.405579335149441e-05, + "loss": 1.852, + "step": 11731 + }, + { + "epoch": 3.6009821976672804, + "grad_norm": 0.30175161361694336, + "learning_rate": 7.405143575646403e-05, + "loss": 1.8861, + "step": 11732 + }, + { + "epoch": 3.601289134438306, + "grad_norm": 0.2617531418800354, + "learning_rate": 7.404707792374328e-05, + "loss": 1.7598, + "step": 11733 + }, + { + "epoch": 3.6015960712093307, + "grad_norm": 0.25384122133255005, + "learning_rate": 7.404271985337517e-05, + "loss": 1.7634, + "step": 11734 + }, + { + "epoch": 3.601903007980356, + "grad_norm": 0.31706711649894714, + "learning_rate": 7.403836154540284e-05, + "loss": 1.8125, + "step": 11735 + }, + { + "epoch": 3.6022099447513813, + "grad_norm": 0.299662709236145, + "learning_rate": 7.403400299986932e-05, + "loss": 1.748, + "step": 11736 + }, + { + "epoch": 3.6025168815224062, + "grad_norm": 0.23828944563865662, + "learning_rate": 7.40296442168177e-05, + "loss": 1.7473, + "step": 11737 + }, + { + "epoch": 3.6028238182934316, + "grad_norm": 0.22611604630947113, + "learning_rate": 7.402528519629106e-05, + "loss": 1.7519, + "step": 11738 + }, + { + "epoch": 3.6031307550644565, + "grad_norm": 0.28498536348342896, + "learning_rate": 7.402092593833246e-05, + "loss": 1.7792, + "step": 11739 + }, + { + "epoch": 3.603437691835482, + "grad_norm": 0.2404283881187439, + "learning_rate": 7.4016566442985e-05, + "loss": 1.7434, + "step": 11740 + }, + { + 
"epoch": 3.603744628606507, + "grad_norm": 0.2291589230298996, + "learning_rate": 7.401220671029173e-05, + "loss": 1.7623, + "step": 11741 + }, + { + "epoch": 3.6040515653775325, + "grad_norm": 0.23962698876857758, + "learning_rate": 7.400784674029578e-05, + "loss": 1.7232, + "step": 11742 + }, + { + "epoch": 3.6043585021485574, + "grad_norm": 0.3015185594558716, + "learning_rate": 7.400348653304022e-05, + "loss": 1.7808, + "step": 11743 + }, + { + "epoch": 3.6046654389195827, + "grad_norm": 0.30623099207878113, + "learning_rate": 7.399912608856813e-05, + "loss": 1.8518, + "step": 11744 + }, + { + "epoch": 3.6049723756906076, + "grad_norm": 0.2698235511779785, + "learning_rate": 7.39947654069226e-05, + "loss": 1.7829, + "step": 11745 + }, + { + "epoch": 3.605279312461633, + "grad_norm": 0.2195274829864502, + "learning_rate": 7.399040448814674e-05, + "loss": 1.7709, + "step": 11746 + }, + { + "epoch": 3.6055862492326582, + "grad_norm": 0.22962357103824615, + "learning_rate": 7.398604333228366e-05, + "loss": 1.7482, + "step": 11747 + }, + { + "epoch": 3.605893186003683, + "grad_norm": 0.2403932511806488, + "learning_rate": 7.398168193937642e-05, + "loss": 1.8063, + "step": 11748 + }, + { + "epoch": 3.6062001227747085, + "grad_norm": 0.23542718589305878, + "learning_rate": 7.397732030946816e-05, + "loss": 1.7599, + "step": 11749 + }, + { + "epoch": 3.6065070595457334, + "grad_norm": 0.2462490350008011, + "learning_rate": 7.397295844260195e-05, + "loss": 1.8183, + "step": 11750 + }, + { + "epoch": 3.6068139963167587, + "grad_norm": 0.21428349614143372, + "learning_rate": 7.396859633882091e-05, + "loss": 1.6944, + "step": 11751 + }, + { + "epoch": 3.607120933087784, + "grad_norm": 0.21240907907485962, + "learning_rate": 7.396423399816817e-05, + "loss": 1.7795, + "step": 11752 + }, + { + "epoch": 3.607427869858809, + "grad_norm": 0.23413677513599396, + "learning_rate": 7.395987142068682e-05, + "loss": 1.8015, + "step": 11753 + }, + { + "epoch": 3.6077348066298343, + 
"grad_norm": 0.26724907755851746, + "learning_rate": 7.395550860641998e-05, + "loss": 1.8174, + "step": 11754 + }, + { + "epoch": 3.608041743400859, + "grad_norm": 0.22077679634094238, + "learning_rate": 7.395114555541077e-05, + "loss": 1.7929, + "step": 11755 + }, + { + "epoch": 3.6083486801718845, + "grad_norm": 0.2475263774394989, + "learning_rate": 7.394678226770228e-05, + "loss": 1.7744, + "step": 11756 + }, + { + "epoch": 3.60865561694291, + "grad_norm": 0.22579342126846313, + "learning_rate": 7.394241874333764e-05, + "loss": 1.79, + "step": 11757 + }, + { + "epoch": 3.608962553713935, + "grad_norm": 0.26798152923583984, + "learning_rate": 7.393805498236001e-05, + "loss": 1.8087, + "step": 11758 + }, + { + "epoch": 3.60926949048496, + "grad_norm": 0.2755621373653412, + "learning_rate": 7.393369098481248e-05, + "loss": 1.7834, + "step": 11759 + }, + { + "epoch": 3.6095764272559854, + "grad_norm": 0.2741812467575073, + "learning_rate": 7.39293267507382e-05, + "loss": 1.7948, + "step": 11760 + }, + { + "epoch": 3.6098833640270103, + "grad_norm": 0.2378924936056137, + "learning_rate": 7.392496228018028e-05, + "loss": 1.8317, + "step": 11761 + }, + { + "epoch": 3.6101903007980356, + "grad_norm": 0.2628132700920105, + "learning_rate": 7.392059757318187e-05, + "loss": 1.8123, + "step": 11762 + }, + { + "epoch": 3.610497237569061, + "grad_norm": 0.2613002359867096, + "learning_rate": 7.391623262978607e-05, + "loss": 1.795, + "step": 11763 + }, + { + "epoch": 3.610804174340086, + "grad_norm": 0.27272161841392517, + "learning_rate": 7.391186745003608e-05, + "loss": 1.7808, + "step": 11764 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.21366162598133087, + "learning_rate": 7.390750203397497e-05, + "loss": 1.77, + "step": 11765 + }, + { + "epoch": 3.611418047882136, + "grad_norm": 0.25559261441230774, + "learning_rate": 7.390313638164593e-05, + "loss": 1.8442, + "step": 11766 + }, + { + "epoch": 3.6117249846531614, + "grad_norm": 0.23794838786125183, + 
"learning_rate": 7.389877049309207e-05, + "loss": 1.8237, + "step": 11767 + }, + { + "epoch": 3.6120319214241867, + "grad_norm": 0.2690154016017914, + "learning_rate": 7.389440436835656e-05, + "loss": 1.8194, + "step": 11768 + }, + { + "epoch": 3.612338858195212, + "grad_norm": 0.26148009300231934, + "learning_rate": 7.389003800748254e-05, + "loss": 1.7862, + "step": 11769 + }, + { + "epoch": 3.612645794966237, + "grad_norm": 0.26414936780929565, + "learning_rate": 7.388567141051315e-05, + "loss": 1.7815, + "step": 11770 + }, + { + "epoch": 3.6129527317372623, + "grad_norm": 0.24473857879638672, + "learning_rate": 7.388130457749157e-05, + "loss": 1.801, + "step": 11771 + }, + { + "epoch": 3.613259668508287, + "grad_norm": 0.24356001615524292, + "learning_rate": 7.387693750846094e-05, + "loss": 1.8031, + "step": 11772 + }, + { + "epoch": 3.6135666052793125, + "grad_norm": 0.26716411113739014, + "learning_rate": 7.387257020346441e-05, + "loss": 1.7999, + "step": 11773 + }, + { + "epoch": 3.613873542050338, + "grad_norm": 0.2730760872364044, + "learning_rate": 7.386820266254516e-05, + "loss": 1.8079, + "step": 11774 + }, + { + "epoch": 3.6141804788213627, + "grad_norm": 0.2570728361606598, + "learning_rate": 7.386383488574635e-05, + "loss": 1.7374, + "step": 11775 + }, + { + "epoch": 3.614487415592388, + "grad_norm": 0.24992883205413818, + "learning_rate": 7.385946687311112e-05, + "loss": 1.8432, + "step": 11776 + }, + { + "epoch": 3.614794352363413, + "grad_norm": 0.28632259368896484, + "learning_rate": 7.385509862468266e-05, + "loss": 1.8014, + "step": 11777 + }, + { + "epoch": 3.6151012891344383, + "grad_norm": 0.257303923368454, + "learning_rate": 7.385073014050412e-05, + "loss": 1.8166, + "step": 11778 + }, + { + "epoch": 3.6154082259054636, + "grad_norm": 0.2791872024536133, + "learning_rate": 7.38463614206187e-05, + "loss": 1.7865, + "step": 11779 + }, + { + "epoch": 3.6157151626764885, + "grad_norm": 0.25708603858947754, + "learning_rate": 
7.384199246506956e-05, + "loss": 1.807, + "step": 11780 + }, + { + "epoch": 3.616022099447514, + "grad_norm": 0.28693172335624695, + "learning_rate": 7.383762327389988e-05, + "loss": 1.8049, + "step": 11781 + }, + { + "epoch": 3.6163290362185387, + "grad_norm": 0.2731167674064636, + "learning_rate": 7.383325384715283e-05, + "loss": 1.8937, + "step": 11782 + }, + { + "epoch": 3.616635972989564, + "grad_norm": 0.26151663064956665, + "learning_rate": 7.38288841848716e-05, + "loss": 1.8288, + "step": 11783 + }, + { + "epoch": 3.6169429097605894, + "grad_norm": 0.2732257843017578, + "learning_rate": 7.382451428709936e-05, + "loss": 1.7668, + "step": 11784 + }, + { + "epoch": 3.6172498465316147, + "grad_norm": 0.2747575640678406, + "learning_rate": 7.38201441538793e-05, + "loss": 1.7991, + "step": 11785 + }, + { + "epoch": 3.6175567833026396, + "grad_norm": 0.2884783446788788, + "learning_rate": 7.381577378525462e-05, + "loss": 1.7798, + "step": 11786 + }, + { + "epoch": 3.617863720073665, + "grad_norm": 0.2716344892978668, + "learning_rate": 7.381140318126851e-05, + "loss": 1.7923, + "step": 11787 + }, + { + "epoch": 3.61817065684469, + "grad_norm": 0.3007747232913971, + "learning_rate": 7.380703234196416e-05, + "loss": 1.8397, + "step": 11788 + }, + { + "epoch": 3.618477593615715, + "grad_norm": 0.39218056201934814, + "learning_rate": 7.380266126738476e-05, + "loss": 1.8517, + "step": 11789 + }, + { + "epoch": 3.6187845303867405, + "grad_norm": 0.43425866961479187, + "learning_rate": 7.379828995757351e-05, + "loss": 1.7518, + "step": 11790 + }, + { + "epoch": 3.6190914671577654, + "grad_norm": 0.34399518370628357, + "learning_rate": 7.37939184125736e-05, + "loss": 1.7607, + "step": 11791 + }, + { + "epoch": 3.6193984039287908, + "grad_norm": 0.23124302923679352, + "learning_rate": 7.378954663242825e-05, + "loss": 1.7898, + "step": 11792 + }, + { + "epoch": 3.6197053406998156, + "grad_norm": 0.32839757204055786, + "learning_rate": 7.378517461718066e-05, + "loss": 
1.7472, + "step": 11793 + }, + { + "epoch": 3.620012277470841, + "grad_norm": 0.38583460450172424, + "learning_rate": 7.378080236687403e-05, + "loss": 1.7947, + "step": 11794 + }, + { + "epoch": 3.6203192142418663, + "grad_norm": 0.4622896909713745, + "learning_rate": 7.377642988155157e-05, + "loss": 1.9023, + "step": 11795 + }, + { + "epoch": 3.620626151012891, + "grad_norm": 0.3783189058303833, + "learning_rate": 7.37720571612565e-05, + "loss": 1.7813, + "step": 11796 + }, + { + "epoch": 3.6209330877839165, + "grad_norm": 0.3468814790248871, + "learning_rate": 7.376768420603204e-05, + "loss": 1.7509, + "step": 11797 + }, + { + "epoch": 3.6212400245549414, + "grad_norm": 0.2602507174015045, + "learning_rate": 7.376331101592138e-05, + "loss": 1.8158, + "step": 11798 + }, + { + "epoch": 3.6215469613259668, + "grad_norm": 0.28337883949279785, + "learning_rate": 7.375893759096775e-05, + "loss": 1.7755, + "step": 11799 + }, + { + "epoch": 3.621853898096992, + "grad_norm": 0.3644609749317169, + "learning_rate": 7.375456393121437e-05, + "loss": 1.8193, + "step": 11800 + }, + { + "epoch": 3.6221608348680174, + "grad_norm": 0.338211327791214, + "learning_rate": 7.375019003670448e-05, + "loss": 1.821, + "step": 11801 + }, + { + "epoch": 3.6224677716390423, + "grad_norm": 0.23850654065608978, + "learning_rate": 7.374581590748129e-05, + "loss": 1.7317, + "step": 11802 + }, + { + "epoch": 3.6227747084100677, + "grad_norm": 0.3496716618537903, + "learning_rate": 7.374144154358801e-05, + "loss": 1.8361, + "step": 11803 + }, + { + "epoch": 3.6230816451810925, + "grad_norm": 0.5585216283798218, + "learning_rate": 7.37370669450679e-05, + "loss": 1.7667, + "step": 11804 + }, + { + "epoch": 3.623388581952118, + "grad_norm": 0.4578089714050293, + "learning_rate": 7.373269211196418e-05, + "loss": 1.8051, + "step": 11805 + }, + { + "epoch": 3.623695518723143, + "grad_norm": 0.28195759654045105, + "learning_rate": 7.37283170443201e-05, + "loss": 1.7823, + "step": 11806 + }, + { + 
"epoch": 3.624002455494168, + "grad_norm": 0.4066108465194702, + "learning_rate": 7.372394174217887e-05, + "loss": 1.7819, + "step": 11807 + }, + { + "epoch": 3.6243093922651934, + "grad_norm": 0.5368703007698059, + "learning_rate": 7.371956620558375e-05, + "loss": 1.8121, + "step": 11808 + }, + { + "epoch": 3.6246163290362183, + "grad_norm": 0.36627063155174255, + "learning_rate": 7.371519043457795e-05, + "loss": 1.7944, + "step": 11809 + }, + { + "epoch": 3.6249232658072437, + "grad_norm": 0.3100780248641968, + "learning_rate": 7.371081442920476e-05, + "loss": 1.783, + "step": 11810 + }, + { + "epoch": 3.625230202578269, + "grad_norm": 0.3277178704738617, + "learning_rate": 7.370643818950741e-05, + "loss": 1.8105, + "step": 11811 + }, + { + "epoch": 3.625537139349294, + "grad_norm": 0.3887772560119629, + "learning_rate": 7.370206171552914e-05, + "loss": 1.8136, + "step": 11812 + }, + { + "epoch": 3.6258440761203192, + "grad_norm": 0.2770824134349823, + "learning_rate": 7.36976850073132e-05, + "loss": 1.7852, + "step": 11813 + }, + { + "epoch": 3.626151012891344, + "grad_norm": 0.26357728242874146, + "learning_rate": 7.369330806490284e-05, + "loss": 1.7621, + "step": 11814 + }, + { + "epoch": 3.6264579496623695, + "grad_norm": 0.3387344181537628, + "learning_rate": 7.368893088834135e-05, + "loss": 1.7785, + "step": 11815 + }, + { + "epoch": 3.626764886433395, + "grad_norm": 0.35155174136161804, + "learning_rate": 7.368455347767193e-05, + "loss": 1.8081, + "step": 11816 + }, + { + "epoch": 3.62707182320442, + "grad_norm": 0.2855289876461029, + "learning_rate": 7.368017583293788e-05, + "loss": 1.8245, + "step": 11817 + }, + { + "epoch": 3.627378759975445, + "grad_norm": 0.28462162613868713, + "learning_rate": 7.367579795418245e-05, + "loss": 1.8066, + "step": 11818 + }, + { + "epoch": 3.6276856967464703, + "grad_norm": 0.40696555376052856, + "learning_rate": 7.367141984144891e-05, + "loss": 1.8897, + "step": 11819 + }, + { + "epoch": 3.6279926335174952, + 
"grad_norm": 0.472782701253891, + "learning_rate": 7.366704149478054e-05, + "loss": 1.8071, + "step": 11820 + }, + { + "epoch": 3.6282995702885206, + "grad_norm": 0.27022916078567505, + "learning_rate": 7.366266291422057e-05, + "loss": 1.8574, + "step": 11821 + }, + { + "epoch": 3.628606507059546, + "grad_norm": 0.4207148253917694, + "learning_rate": 7.365828409981231e-05, + "loss": 1.7759, + "step": 11822 + }, + { + "epoch": 3.628913443830571, + "grad_norm": 0.42866072058677673, + "learning_rate": 7.365390505159902e-05, + "loss": 1.7366, + "step": 11823 + }, + { + "epoch": 3.629220380601596, + "grad_norm": 0.28288859128952026, + "learning_rate": 7.364952576962398e-05, + "loss": 1.8591, + "step": 11824 + }, + { + "epoch": 3.629527317372621, + "grad_norm": 0.30544906854629517, + "learning_rate": 7.364514625393045e-05, + "loss": 1.7965, + "step": 11825 + }, + { + "epoch": 3.6298342541436464, + "grad_norm": 0.3251616954803467, + "learning_rate": 7.364076650456173e-05, + "loss": 1.8197, + "step": 11826 + }, + { + "epoch": 3.6301411909146717, + "grad_norm": 0.3133888840675354, + "learning_rate": 7.363638652156109e-05, + "loss": 1.7978, + "step": 11827 + }, + { + "epoch": 3.630448127685697, + "grad_norm": 0.29004594683647156, + "learning_rate": 7.363200630497185e-05, + "loss": 1.8035, + "step": 11828 + }, + { + "epoch": 3.630755064456722, + "grad_norm": 0.2781279683113098, + "learning_rate": 7.362762585483725e-05, + "loss": 1.8462, + "step": 11829 + }, + { + "epoch": 3.6310620012277472, + "grad_norm": 0.29003822803497314, + "learning_rate": 7.362324517120063e-05, + "loss": 1.7952, + "step": 11830 + }, + { + "epoch": 3.631368937998772, + "grad_norm": 0.2510940134525299, + "learning_rate": 7.361886425410524e-05, + "loss": 1.7645, + "step": 11831 + }, + { + "epoch": 3.6316758747697975, + "grad_norm": 0.23798540234565735, + "learning_rate": 7.361448310359438e-05, + "loss": 1.7329, + "step": 11832 + }, + { + "epoch": 3.631982811540823, + "grad_norm": 0.2711278796195984, + 
"learning_rate": 7.361010171971137e-05, + "loss": 1.8245, + "step": 11833 + }, + { + "epoch": 3.6322897483118477, + "grad_norm": 0.2895669639110565, + "learning_rate": 7.360572010249949e-05, + "loss": 1.7668, + "step": 11834 + }, + { + "epoch": 3.632596685082873, + "grad_norm": 0.2216273844242096, + "learning_rate": 7.360133825200205e-05, + "loss": 1.8164, + "step": 11835 + }, + { + "epoch": 3.632903621853898, + "grad_norm": 0.3075082302093506, + "learning_rate": 7.359695616826236e-05, + "loss": 1.8159, + "step": 11836 + }, + { + "epoch": 3.6332105586249233, + "grad_norm": 0.3208801746368408, + "learning_rate": 7.35925738513237e-05, + "loss": 1.8385, + "step": 11837 + }, + { + "epoch": 3.6335174953959486, + "grad_norm": 0.272517591714859, + "learning_rate": 7.35881913012294e-05, + "loss": 1.7653, + "step": 11838 + }, + { + "epoch": 3.6338244321669735, + "grad_norm": 0.23105360567569733, + "learning_rate": 7.358380851802277e-05, + "loss": 1.7697, + "step": 11839 + }, + { + "epoch": 3.634131368937999, + "grad_norm": 0.2643153667449951, + "learning_rate": 7.357942550174714e-05, + "loss": 1.7885, + "step": 11840 + }, + { + "epoch": 3.6344383057090237, + "grad_norm": 0.22643202543258667, + "learning_rate": 7.357504225244579e-05, + "loss": 1.746, + "step": 11841 + }, + { + "epoch": 3.634745242480049, + "grad_norm": 0.27782970666885376, + "learning_rate": 7.357065877016207e-05, + "loss": 1.794, + "step": 11842 + }, + { + "epoch": 3.6350521792510744, + "grad_norm": 0.3035561740398407, + "learning_rate": 7.356627505493925e-05, + "loss": 1.7892, + "step": 11843 + }, + { + "epoch": 3.6353591160220997, + "grad_norm": 0.31859731674194336, + "learning_rate": 7.356189110682072e-05, + "loss": 1.7636, + "step": 11844 + }, + { + "epoch": 3.6356660527931246, + "grad_norm": 0.2960890233516693, + "learning_rate": 7.355750692584977e-05, + "loss": 1.8294, + "step": 11845 + }, + { + "epoch": 3.63597298956415, + "grad_norm": 0.2544194459915161, + "learning_rate": 7.355312251206972e-05, + 
"loss": 1.7603, + "step": 11846 + }, + { + "epoch": 3.636279926335175, + "grad_norm": 0.27864789962768555, + "learning_rate": 7.354873786552391e-05, + "loss": 1.7917, + "step": 11847 + }, + { + "epoch": 3.6365868631062, + "grad_norm": 0.32552552223205566, + "learning_rate": 7.354435298625568e-05, + "loss": 1.7769, + "step": 11848 + }, + { + "epoch": 3.6368937998772255, + "grad_norm": 0.25094640254974365, + "learning_rate": 7.353996787430833e-05, + "loss": 1.8371, + "step": 11849 + }, + { + "epoch": 3.6372007366482504, + "grad_norm": 0.26656433939933777, + "learning_rate": 7.353558252972524e-05, + "loss": 1.7686, + "step": 11850 + }, + { + "epoch": 3.6375076734192757, + "grad_norm": 0.3023635745048523, + "learning_rate": 7.353119695254973e-05, + "loss": 1.7892, + "step": 11851 + }, + { + "epoch": 3.6378146101903006, + "grad_norm": 0.2822463810443878, + "learning_rate": 7.352681114282514e-05, + "loss": 1.8221, + "step": 11852 + }, + { + "epoch": 3.638121546961326, + "grad_norm": 0.31159496307373047, + "learning_rate": 7.35224251005948e-05, + "loss": 1.803, + "step": 11853 + }, + { + "epoch": 3.6384284837323513, + "grad_norm": 0.3133087158203125, + "learning_rate": 7.351803882590207e-05, + "loss": 1.744, + "step": 11854 + }, + { + "epoch": 3.638735420503376, + "grad_norm": 0.3050002455711365, + "learning_rate": 7.351365231879029e-05, + "loss": 1.7522, + "step": 11855 + }, + { + "epoch": 3.6390423572744015, + "grad_norm": 0.2729037404060364, + "learning_rate": 7.350926557930283e-05, + "loss": 1.7629, + "step": 11856 + }, + { + "epoch": 3.6393492940454264, + "grad_norm": 0.3181995153427124, + "learning_rate": 7.350487860748303e-05, + "loss": 1.7603, + "step": 11857 + }, + { + "epoch": 3.6396562308164517, + "grad_norm": 0.352651447057724, + "learning_rate": 7.350049140337423e-05, + "loss": 1.8177, + "step": 11858 + }, + { + "epoch": 3.639963167587477, + "grad_norm": 0.22935177385807037, + "learning_rate": 7.349610396701981e-05, + "loss": 1.7421, + "step": 11859 + }, + { 
+ "epoch": 3.6402701043585024, + "grad_norm": 0.26442599296569824, + "learning_rate": 7.349171629846312e-05, + "loss": 1.8026, + "step": 11860 + }, + { + "epoch": 3.6405770411295273, + "grad_norm": 0.25357648730278015, + "learning_rate": 7.348732839774751e-05, + "loss": 1.788, + "step": 11861 + }, + { + "epoch": 3.6408839779005526, + "grad_norm": 0.26959577202796936, + "learning_rate": 7.348294026491635e-05, + "loss": 1.884, + "step": 11862 + }, + { + "epoch": 3.6411909146715775, + "grad_norm": 0.2243001013994217, + "learning_rate": 7.347855190001304e-05, + "loss": 1.7765, + "step": 11863 + }, + { + "epoch": 3.641497851442603, + "grad_norm": 0.2480708807706833, + "learning_rate": 7.34741633030809e-05, + "loss": 1.7597, + "step": 11864 + }, + { + "epoch": 3.641804788213628, + "grad_norm": 0.22512994706630707, + "learning_rate": 7.346977447416332e-05, + "loss": 1.7647, + "step": 11865 + }, + { + "epoch": 3.642111724984653, + "grad_norm": 0.24961981177330017, + "learning_rate": 7.346538541330368e-05, + "loss": 1.8178, + "step": 11866 + }, + { + "epoch": 3.6424186617556784, + "grad_norm": 0.320896714925766, + "learning_rate": 7.346099612054533e-05, + "loss": 1.85, + "step": 11867 + }, + { + "epoch": 3.6427255985267033, + "grad_norm": 0.3420880436897278, + "learning_rate": 7.345660659593167e-05, + "loss": 1.8661, + "step": 11868 + }, + { + "epoch": 3.6430325352977286, + "grad_norm": 0.2675844132900238, + "learning_rate": 7.34522168395061e-05, + "loss": 1.8177, + "step": 11869 + }, + { + "epoch": 3.643339472068754, + "grad_norm": 0.23993943631649017, + "learning_rate": 7.344782685131195e-05, + "loss": 1.7365, + "step": 11870 + }, + { + "epoch": 3.643646408839779, + "grad_norm": 0.21805813908576965, + "learning_rate": 7.344343663139264e-05, + "loss": 1.7813, + "step": 11871 + }, + { + "epoch": 3.643953345610804, + "grad_norm": 0.24334421753883362, + "learning_rate": 7.343904617979154e-05, + "loss": 1.7763, + "step": 11872 + }, + { + "epoch": 3.644260282381829, + 
"grad_norm": 0.22768431901931763, + "learning_rate": 7.343465549655206e-05, + "loss": 1.7817, + "step": 11873 + }, + { + "epoch": 3.6445672191528544, + "grad_norm": 0.23828962445259094, + "learning_rate": 7.343026458171757e-05, + "loss": 1.8391, + "step": 11874 + }, + { + "epoch": 3.6448741559238798, + "grad_norm": 0.24838197231292725, + "learning_rate": 7.342587343533149e-05, + "loss": 1.759, + "step": 11875 + }, + { + "epoch": 3.645181092694905, + "grad_norm": 0.22732019424438477, + "learning_rate": 7.342148205743718e-05, + "loss": 1.7348, + "step": 11876 + }, + { + "epoch": 3.64548802946593, + "grad_norm": 0.25106775760650635, + "learning_rate": 7.341709044807807e-05, + "loss": 1.8121, + "step": 11877 + }, + { + "epoch": 3.6457949662369553, + "grad_norm": 0.28532838821411133, + "learning_rate": 7.341269860729753e-05, + "loss": 1.7147, + "step": 11878 + }, + { + "epoch": 3.64610190300798, + "grad_norm": 0.3041890859603882, + "learning_rate": 7.340830653513899e-05, + "loss": 1.7666, + "step": 11879 + }, + { + "epoch": 3.6464088397790055, + "grad_norm": 0.3142147958278656, + "learning_rate": 7.340391423164585e-05, + "loss": 1.8707, + "step": 11880 + }, + { + "epoch": 3.646715776550031, + "grad_norm": 0.28531381487846375, + "learning_rate": 7.339952169686151e-05, + "loss": 1.7961, + "step": 11881 + }, + { + "epoch": 3.6470227133210558, + "grad_norm": 0.33779671788215637, + "learning_rate": 7.339512893082938e-05, + "loss": 1.7428, + "step": 11882 + }, + { + "epoch": 3.647329650092081, + "grad_norm": 0.29611849784851074, + "learning_rate": 7.339073593359287e-05, + "loss": 1.8803, + "step": 11883 + }, + { + "epoch": 3.647636586863106, + "grad_norm": 0.31248557567596436, + "learning_rate": 7.33863427051954e-05, + "loss": 1.7868, + "step": 11884 + }, + { + "epoch": 3.6479435236341313, + "grad_norm": 0.42829564213752747, + "learning_rate": 7.338194924568039e-05, + "loss": 1.8558, + "step": 11885 + }, + { + "epoch": 3.6482504604051567, + "grad_norm": 0.431023508310318, + 
"learning_rate": 7.337755555509126e-05, + "loss": 1.7565, + "step": 11886 + }, + { + "epoch": 3.6485573971761815, + "grad_norm": 0.2917975187301636, + "learning_rate": 7.33731616334714e-05, + "loss": 1.8067, + "step": 11887 + }, + { + "epoch": 3.648864333947207, + "grad_norm": 0.3072175085544586, + "learning_rate": 7.336876748086427e-05, + "loss": 1.782, + "step": 11888 + }, + { + "epoch": 3.6491712707182318, + "grad_norm": 0.33658862113952637, + "learning_rate": 7.336437309731327e-05, + "loss": 1.8007, + "step": 11889 + }, + { + "epoch": 3.649478207489257, + "grad_norm": 0.23774033784866333, + "learning_rate": 7.335997848286185e-05, + "loss": 1.7606, + "step": 11890 + }, + { + "epoch": 3.6497851442602824, + "grad_norm": 0.3373236358165741, + "learning_rate": 7.335558363755344e-05, + "loss": 1.7335, + "step": 11891 + }, + { + "epoch": 3.650092081031308, + "grad_norm": 0.3906517028808594, + "learning_rate": 7.335118856143145e-05, + "loss": 1.7974, + "step": 11892 + }, + { + "epoch": 3.6503990178023327, + "grad_norm": 0.37715303897857666, + "learning_rate": 7.334679325453934e-05, + "loss": 1.8875, + "step": 11893 + }, + { + "epoch": 3.650705954573358, + "grad_norm": 0.278540700674057, + "learning_rate": 7.334239771692053e-05, + "loss": 1.8165, + "step": 11894 + }, + { + "epoch": 3.651012891344383, + "grad_norm": 0.24434895813465118, + "learning_rate": 7.333800194861845e-05, + "loss": 1.7756, + "step": 11895 + }, + { + "epoch": 3.6513198281154082, + "grad_norm": 0.25057271122932434, + "learning_rate": 7.333360594967658e-05, + "loss": 1.7932, + "step": 11896 + }, + { + "epoch": 3.6516267648864336, + "grad_norm": 0.3277342617511749, + "learning_rate": 7.332920972013833e-05, + "loss": 1.7781, + "step": 11897 + }, + { + "epoch": 3.6519337016574585, + "grad_norm": 0.2754829525947571, + "learning_rate": 7.332481326004715e-05, + "loss": 1.7916, + "step": 11898 + }, + { + "epoch": 3.652240638428484, + "grad_norm": 0.24490588903427124, + "learning_rate": 7.332041656944651e-05, 
+ "loss": 1.7904, + "step": 11899 + }, + { + "epoch": 3.6525475751995087, + "grad_norm": 0.3176959455013275, + "learning_rate": 7.331601964837982e-05, + "loss": 1.7379, + "step": 11900 + }, + { + "epoch": 3.652854511970534, + "grad_norm": 0.3435784876346588, + "learning_rate": 7.331162249689057e-05, + "loss": 1.7635, + "step": 11901 + }, + { + "epoch": 3.6531614487415593, + "grad_norm": 0.335697740316391, + "learning_rate": 7.330722511502221e-05, + "loss": 1.7903, + "step": 11902 + }, + { + "epoch": 3.6534683855125847, + "grad_norm": 0.2748894691467285, + "learning_rate": 7.330282750281819e-05, + "loss": 1.8259, + "step": 11903 + }, + { + "epoch": 3.6537753222836096, + "grad_norm": 0.36754751205444336, + "learning_rate": 7.329842966032197e-05, + "loss": 1.7728, + "step": 11904 + }, + { + "epoch": 3.654082259054635, + "grad_norm": 0.4355713129043579, + "learning_rate": 7.3294031587577e-05, + "loss": 1.7447, + "step": 11905 + }, + { + "epoch": 3.65438919582566, + "grad_norm": 0.3967476487159729, + "learning_rate": 7.328963328462677e-05, + "loss": 1.8299, + "step": 11906 + }, + { + "epoch": 3.654696132596685, + "grad_norm": 0.23805755376815796, + "learning_rate": 7.328523475151472e-05, + "loss": 1.7631, + "step": 11907 + }, + { + "epoch": 3.6550030693677105, + "grad_norm": 0.40350377559661865, + "learning_rate": 7.328083598828435e-05, + "loss": 1.8693, + "step": 11908 + }, + { + "epoch": 3.6553100061387354, + "grad_norm": 0.4743673801422119, + "learning_rate": 7.32764369949791e-05, + "loss": 1.7887, + "step": 11909 + }, + { + "epoch": 3.6556169429097607, + "grad_norm": 0.33830127120018005, + "learning_rate": 7.327203777164246e-05, + "loss": 1.7527, + "step": 11910 + }, + { + "epoch": 3.6559238796807856, + "grad_norm": 0.2465003877878189, + "learning_rate": 7.326763831831791e-05, + "loss": 1.7898, + "step": 11911 + }, + { + "epoch": 3.656230816451811, + "grad_norm": 0.31647852063179016, + "learning_rate": 7.326323863504892e-05, + "loss": 1.8056, + "step": 11912 + }, + 
{ + "epoch": 3.6565377532228363, + "grad_norm": 0.31436124444007874, + "learning_rate": 7.325883872187896e-05, + "loss": 1.7972, + "step": 11913 + }, + { + "epoch": 3.656844689993861, + "grad_norm": 0.260405957698822, + "learning_rate": 7.325443857885153e-05, + "loss": 1.8109, + "step": 11914 + }, + { + "epoch": 3.6571516267648865, + "grad_norm": 0.29312583804130554, + "learning_rate": 7.325003820601011e-05, + "loss": 1.8947, + "step": 11915 + }, + { + "epoch": 3.6574585635359114, + "grad_norm": 0.2641582190990448, + "learning_rate": 7.324563760339819e-05, + "loss": 1.7737, + "step": 11916 + }, + { + "epoch": 3.6577655003069367, + "grad_norm": 0.2338121086359024, + "learning_rate": 7.324123677105923e-05, + "loss": 1.7462, + "step": 11917 + }, + { + "epoch": 3.658072437077962, + "grad_norm": 0.27877378463745117, + "learning_rate": 7.323683570903676e-05, + "loss": 1.8371, + "step": 11918 + }, + { + "epoch": 3.6583793738489874, + "grad_norm": 0.24238766729831696, + "learning_rate": 7.323243441737427e-05, + "loss": 1.7304, + "step": 11919 + }, + { + "epoch": 3.6586863106200123, + "grad_norm": 0.2349759042263031, + "learning_rate": 7.322803289611525e-05, + "loss": 1.7422, + "step": 11920 + }, + { + "epoch": 3.6589932473910376, + "grad_norm": 0.2254217565059662, + "learning_rate": 7.322363114530318e-05, + "loss": 1.7296, + "step": 11921 + }, + { + "epoch": 3.6593001841620625, + "grad_norm": 0.24533270299434662, + "learning_rate": 7.321922916498158e-05, + "loss": 1.7834, + "step": 11922 + }, + { + "epoch": 3.659607120933088, + "grad_norm": 0.24993161857128143, + "learning_rate": 7.321482695519393e-05, + "loss": 1.8502, + "step": 11923 + }, + { + "epoch": 3.659914057704113, + "grad_norm": 0.2540178894996643, + "learning_rate": 7.321042451598378e-05, + "loss": 1.8372, + "step": 11924 + }, + { + "epoch": 3.660220994475138, + "grad_norm": 0.2241390198469162, + "learning_rate": 7.32060218473946e-05, + "loss": 1.7619, + "step": 11925 + }, + { + "epoch": 3.6605279312461634, + 
"grad_norm": 0.2137840837240219, + "learning_rate": 7.32016189494699e-05, + "loss": 1.751, + "step": 11926 + }, + { + "epoch": 3.6608348680171883, + "grad_norm": 0.2596585154533386, + "learning_rate": 7.319721582225323e-05, + "loss": 1.7773, + "step": 11927 + }, + { + "epoch": 3.6611418047882136, + "grad_norm": 0.24898354709148407, + "learning_rate": 7.319281246578806e-05, + "loss": 1.7347, + "step": 11928 + }, + { + "epoch": 3.661448741559239, + "grad_norm": 0.26553863286972046, + "learning_rate": 7.31884088801179e-05, + "loss": 1.7812, + "step": 11929 + }, + { + "epoch": 3.661755678330264, + "grad_norm": 0.2494438737630844, + "learning_rate": 7.318400506528633e-05, + "loss": 1.7554, + "step": 11930 + }, + { + "epoch": 3.662062615101289, + "grad_norm": 0.2794995903968811, + "learning_rate": 7.317960102133682e-05, + "loss": 1.7495, + "step": 11931 + }, + { + "epoch": 3.662369551872314, + "grad_norm": 0.2843860983848572, + "learning_rate": 7.317519674831293e-05, + "loss": 1.7734, + "step": 11932 + }, + { + "epoch": 3.6626764886433394, + "grad_norm": 0.28261128067970276, + "learning_rate": 7.317079224625813e-05, + "loss": 1.7794, + "step": 11933 + }, + { + "epoch": 3.6629834254143647, + "grad_norm": 0.2552426755428314, + "learning_rate": 7.316638751521599e-05, + "loss": 1.8397, + "step": 11934 + }, + { + "epoch": 3.66329036218539, + "grad_norm": 0.4140608608722687, + "learning_rate": 7.316198255523002e-05, + "loss": 1.848, + "step": 11935 + }, + { + "epoch": 3.663597298956415, + "grad_norm": 0.3709854483604431, + "learning_rate": 7.315757736634377e-05, + "loss": 1.8489, + "step": 11936 + }, + { + "epoch": 3.6639042357274403, + "grad_norm": 0.23637300729751587, + "learning_rate": 7.315317194860078e-05, + "loss": 1.7549, + "step": 11937 + }, + { + "epoch": 3.664211172498465, + "grad_norm": 0.32884421944618225, + "learning_rate": 7.314876630204456e-05, + "loss": 1.8061, + "step": 11938 + }, + { + "epoch": 3.6645181092694905, + "grad_norm": 0.33354130387306213, + 
"learning_rate": 7.314436042671867e-05, + "loss": 1.8346, + "step": 11939 + }, + { + "epoch": 3.664825046040516, + "grad_norm": 0.25776317715644836, + "learning_rate": 7.313995432266663e-05, + "loss": 1.8598, + "step": 11940 + }, + { + "epoch": 3.6651319828115407, + "grad_norm": 0.2910402715206146, + "learning_rate": 7.313554798993202e-05, + "loss": 1.7613, + "step": 11941 + }, + { + "epoch": 3.665438919582566, + "grad_norm": 0.3487538695335388, + "learning_rate": 7.313114142855836e-05, + "loss": 1.8105, + "step": 11942 + }, + { + "epoch": 3.665745856353591, + "grad_norm": 0.27271291613578796, + "learning_rate": 7.312673463858918e-05, + "loss": 1.8107, + "step": 11943 + }, + { + "epoch": 3.6660527931246163, + "grad_norm": 0.2613036632537842, + "learning_rate": 7.312232762006809e-05, + "loss": 1.7871, + "step": 11944 + }, + { + "epoch": 3.6663597298956416, + "grad_norm": 0.30594903230667114, + "learning_rate": 7.311792037303859e-05, + "loss": 1.8043, + "step": 11945 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.3960847854614258, + "learning_rate": 7.311351289754425e-05, + "loss": 1.8434, + "step": 11946 + }, + { + "epoch": 3.666973603437692, + "grad_norm": 0.33369311690330505, + "learning_rate": 7.310910519362861e-05, + "loss": 1.7496, + "step": 11947 + }, + { + "epoch": 3.6672805402087167, + "grad_norm": 0.29852384328842163, + "learning_rate": 7.310469726133528e-05, + "loss": 1.858, + "step": 11948 + }, + { + "epoch": 3.667587476979742, + "grad_norm": 0.2610527276992798, + "learning_rate": 7.310028910070777e-05, + "loss": 1.7642, + "step": 11949 + }, + { + "epoch": 3.6678944137507674, + "grad_norm": 0.3606704771518707, + "learning_rate": 7.309588071178967e-05, + "loss": 1.845, + "step": 11950 + }, + { + "epoch": 3.6682013505217927, + "grad_norm": 0.3157273828983307, + "learning_rate": 7.309147209462454e-05, + "loss": 1.7864, + "step": 11951 + }, + { + "epoch": 3.6685082872928176, + "grad_norm": 0.23907925188541412, + "learning_rate": 
7.308706324925594e-05, + "loss": 1.8363, + "step": 11952 + }, + { + "epoch": 3.668815224063843, + "grad_norm": 0.3365088999271393, + "learning_rate": 7.308265417572747e-05, + "loss": 1.8755, + "step": 11953 + }, + { + "epoch": 3.669122160834868, + "grad_norm": 0.29404979944229126, + "learning_rate": 7.307824487408266e-05, + "loss": 1.8128, + "step": 11954 + }, + { + "epoch": 3.669429097605893, + "grad_norm": 0.2689574658870697, + "learning_rate": 7.307383534436511e-05, + "loss": 1.8072, + "step": 11955 + }, + { + "epoch": 3.6697360343769185, + "grad_norm": 0.28394198417663574, + "learning_rate": 7.306942558661841e-05, + "loss": 1.7919, + "step": 11956 + }, + { + "epoch": 3.6700429711479434, + "grad_norm": 0.2594783902168274, + "learning_rate": 7.306501560088612e-05, + "loss": 1.7467, + "step": 11957 + }, + { + "epoch": 3.6703499079189688, + "grad_norm": 0.24765191972255707, + "learning_rate": 7.30606053872118e-05, + "loss": 1.7876, + "step": 11958 + }, + { + "epoch": 3.6706568446899936, + "grad_norm": 0.22157172858715057, + "learning_rate": 7.305619494563909e-05, + "loss": 1.7802, + "step": 11959 + }, + { + "epoch": 3.670963781461019, + "grad_norm": 0.270151287317276, + "learning_rate": 7.305178427621155e-05, + "loss": 1.7723, + "step": 11960 + }, + { + "epoch": 3.6712707182320443, + "grad_norm": 0.3163939118385315, + "learning_rate": 7.304737337897277e-05, + "loss": 1.8488, + "step": 11961 + }, + { + "epoch": 3.671577655003069, + "grad_norm": 0.2605706453323364, + "learning_rate": 7.304296225396632e-05, + "loss": 1.7442, + "step": 11962 + }, + { + "epoch": 3.6718845917740945, + "grad_norm": 0.31179291009902954, + "learning_rate": 7.303855090123582e-05, + "loss": 1.831, + "step": 11963 + }, + { + "epoch": 3.6721915285451194, + "grad_norm": 0.33365359902381897, + "learning_rate": 7.303413932082483e-05, + "loss": 1.8376, + "step": 11964 + }, + { + "epoch": 3.6724984653161448, + "grad_norm": 0.2952130138874054, + "learning_rate": 7.302972751277701e-05, + "loss": 
1.7733, + "step": 11965 + }, + { + "epoch": 3.67280540208717, + "grad_norm": 0.24270877242088318, + "learning_rate": 7.302531547713592e-05, + "loss": 1.8367, + "step": 11966 + }, + { + "epoch": 3.6731123388581954, + "grad_norm": 0.34315919876098633, + "learning_rate": 7.302090321394517e-05, + "loss": 1.7901, + "step": 11967 + }, + { + "epoch": 3.6734192756292203, + "grad_norm": 0.33511418104171753, + "learning_rate": 7.301649072324834e-05, + "loss": 1.7929, + "step": 11968 + }, + { + "epoch": 3.6737262124002457, + "grad_norm": 0.22397933900356293, + "learning_rate": 7.301207800508907e-05, + "loss": 1.7533, + "step": 11969 + }, + { + "epoch": 3.6740331491712706, + "grad_norm": 0.2882738411426544, + "learning_rate": 7.300766505951095e-05, + "loss": 1.8071, + "step": 11970 + }, + { + "epoch": 3.674340085942296, + "grad_norm": 0.242112398147583, + "learning_rate": 7.300325188655761e-05, + "loss": 1.7739, + "step": 11971 + }, + { + "epoch": 3.674647022713321, + "grad_norm": 0.27754491567611694, + "learning_rate": 7.299883848627265e-05, + "loss": 1.8295, + "step": 11972 + }, + { + "epoch": 3.674953959484346, + "grad_norm": 0.2787899076938629, + "learning_rate": 7.29944248586997e-05, + "loss": 1.7682, + "step": 11973 + }, + { + "epoch": 3.6752608962553714, + "grad_norm": 0.24448934197425842, + "learning_rate": 7.299001100388234e-05, + "loss": 1.7826, + "step": 11974 + }, + { + "epoch": 3.6755678330263963, + "grad_norm": 0.37869495153427124, + "learning_rate": 7.298559692186421e-05, + "loss": 1.8582, + "step": 11975 + }, + { + "epoch": 3.6758747697974217, + "grad_norm": 0.3299996256828308, + "learning_rate": 7.298118261268897e-05, + "loss": 1.7716, + "step": 11976 + }, + { + "epoch": 3.676181706568447, + "grad_norm": 0.278891384601593, + "learning_rate": 7.29767680764002e-05, + "loss": 1.879, + "step": 11977 + }, + { + "epoch": 3.6764886433394723, + "grad_norm": 0.29326459765434265, + "learning_rate": 7.297235331304155e-05, + "loss": 1.804, + "step": 11978 + }, + { + 
"epoch": 3.6767955801104972, + "grad_norm": 0.2697092592716217, + "learning_rate": 7.296793832265663e-05, + "loss": 1.7842, + "step": 11979 + }, + { + "epoch": 3.6771025168815226, + "grad_norm": 0.3045118749141693, + "learning_rate": 7.296352310528909e-05, + "loss": 1.7959, + "step": 11980 + }, + { + "epoch": 3.6774094536525475, + "grad_norm": 0.278647780418396, + "learning_rate": 7.295910766098252e-05, + "loss": 1.7907, + "step": 11981 + }, + { + "epoch": 3.677716390423573, + "grad_norm": 0.2370275855064392, + "learning_rate": 7.295469198978063e-05, + "loss": 1.757, + "step": 11982 + }, + { + "epoch": 3.678023327194598, + "grad_norm": 0.3061021566390991, + "learning_rate": 7.295027609172702e-05, + "loss": 1.7927, + "step": 11983 + }, + { + "epoch": 3.678330263965623, + "grad_norm": 0.2844544053077698, + "learning_rate": 7.294585996686532e-05, + "loss": 1.7705, + "step": 11984 + }, + { + "epoch": 3.6786372007366483, + "grad_norm": 0.31121113896369934, + "learning_rate": 7.29414436152392e-05, + "loss": 1.783, + "step": 11985 + }, + { + "epoch": 3.6789441375076732, + "grad_norm": 0.2566785514354706, + "learning_rate": 7.293702703689225e-05, + "loss": 1.7781, + "step": 11986 + }, + { + "epoch": 3.6792510742786986, + "grad_norm": 0.22176961600780487, + "learning_rate": 7.293261023186818e-05, + "loss": 1.7302, + "step": 11987 + }, + { + "epoch": 3.679558011049724, + "grad_norm": 0.21547441184520721, + "learning_rate": 7.292819320021062e-05, + "loss": 1.7666, + "step": 11988 + }, + { + "epoch": 3.679864947820749, + "grad_norm": 0.26309674978256226, + "learning_rate": 7.29237759419632e-05, + "loss": 1.7817, + "step": 11989 + }, + { + "epoch": 3.680171884591774, + "grad_norm": 0.2558063864707947, + "learning_rate": 7.29193584571696e-05, + "loss": 1.8257, + "step": 11990 + }, + { + "epoch": 3.680478821362799, + "grad_norm": 0.24516844749450684, + "learning_rate": 7.291494074587347e-05, + "loss": 1.7803, + "step": 11991 + }, + { + "epoch": 3.6807857581338244, + "grad_norm": 
0.22891047596931458, + "learning_rate": 7.291052280811843e-05, + "loss": 1.7977, + "step": 11992 + }, + { + "epoch": 3.6810926949048497, + "grad_norm": 0.2776026129722595, + "learning_rate": 7.290610464394822e-05, + "loss": 1.8486, + "step": 11993 + }, + { + "epoch": 3.681399631675875, + "grad_norm": 0.31472426652908325, + "learning_rate": 7.290168625340644e-05, + "loss": 1.7841, + "step": 11994 + }, + { + "epoch": 3.6817065684469, + "grad_norm": 0.3459274470806122, + "learning_rate": 7.289726763653677e-05, + "loss": 1.7458, + "step": 11995 + }, + { + "epoch": 3.6820135052179253, + "grad_norm": 0.23645849525928497, + "learning_rate": 7.289284879338289e-05, + "loss": 1.781, + "step": 11996 + }, + { + "epoch": 3.68232044198895, + "grad_norm": 0.3257114291191101, + "learning_rate": 7.288842972398845e-05, + "loss": 1.8269, + "step": 11997 + }, + { + "epoch": 3.6826273787599755, + "grad_norm": 0.5450126528739929, + "learning_rate": 7.288401042839713e-05, + "loss": 1.8342, + "step": 11998 + }, + { + "epoch": 3.682934315531001, + "grad_norm": 0.5080512762069702, + "learning_rate": 7.287959090665262e-05, + "loss": 1.8097, + "step": 11999 + }, + { + "epoch": 3.6832412523020257, + "grad_norm": 0.3005252480506897, + "learning_rate": 7.287517115879858e-05, + "loss": 1.8271, + "step": 12000 + }, + { + "epoch": 3.683548189073051, + "grad_norm": 0.2760924994945526, + "learning_rate": 7.287075118487869e-05, + "loss": 1.8267, + "step": 12001 + }, + { + "epoch": 3.683855125844076, + "grad_norm": 0.3475865423679352, + "learning_rate": 7.286633098493663e-05, + "loss": 1.785, + "step": 12002 + }, + { + "epoch": 3.6841620626151013, + "grad_norm": 0.2905690670013428, + "learning_rate": 7.286191055901608e-05, + "loss": 1.8283, + "step": 12003 + }, + { + "epoch": 3.6844689993861266, + "grad_norm": 0.23666246235370636, + "learning_rate": 7.285748990716072e-05, + "loss": 1.7665, + "step": 12004 + }, + { + "epoch": 3.6847759361571515, + "grad_norm": 0.32329514622688293, + "learning_rate": 
7.285306902941427e-05, + "loss": 1.7267, + "step": 12005 + }, + { + "epoch": 3.685082872928177, + "grad_norm": 0.32345879077911377, + "learning_rate": 7.28486479258204e-05, + "loss": 1.7529, + "step": 12006 + }, + { + "epoch": 3.6853898096992017, + "grad_norm": 0.2727855443954468, + "learning_rate": 7.284422659642279e-05, + "loss": 1.8279, + "step": 12007 + }, + { + "epoch": 3.685696746470227, + "grad_norm": 0.37847277522087097, + "learning_rate": 7.283980504126513e-05, + "loss": 1.7809, + "step": 12008 + }, + { + "epoch": 3.6860036832412524, + "grad_norm": 0.44694215059280396, + "learning_rate": 7.283538326039113e-05, + "loss": 1.8184, + "step": 12009 + }, + { + "epoch": 3.6863106200122777, + "grad_norm": 0.2868261933326721, + "learning_rate": 7.28309612538445e-05, + "loss": 1.7461, + "step": 12010 + }, + { + "epoch": 3.6866175567833026, + "grad_norm": 0.2601351737976074, + "learning_rate": 7.282653902166894e-05, + "loss": 1.8011, + "step": 12011 + }, + { + "epoch": 3.686924493554328, + "grad_norm": 0.328185498714447, + "learning_rate": 7.282211656390813e-05, + "loss": 1.7934, + "step": 12012 + }, + { + "epoch": 3.687231430325353, + "grad_norm": 0.2712559103965759, + "learning_rate": 7.281769388060578e-05, + "loss": 1.7566, + "step": 12013 + }, + { + "epoch": 3.687538367096378, + "grad_norm": 0.2725805938243866, + "learning_rate": 7.281327097180562e-05, + "loss": 1.8024, + "step": 12014 + }, + { + "epoch": 3.6878453038674035, + "grad_norm": 0.37282630801200867, + "learning_rate": 7.280884783755133e-05, + "loss": 1.7624, + "step": 12015 + }, + { + "epoch": 3.6881522406384284, + "grad_norm": 0.36519256234169006, + "learning_rate": 7.280442447788664e-05, + "loss": 1.8691, + "step": 12016 + }, + { + "epoch": 3.6884591774094537, + "grad_norm": 0.21699345111846924, + "learning_rate": 7.280000089285528e-05, + "loss": 1.7308, + "step": 12017 + }, + { + "epoch": 3.6887661141804786, + "grad_norm": 0.3159945011138916, + "learning_rate": 7.279557708250094e-05, + "loss": 
1.8144, + "step": 12018 + }, + { + "epoch": 3.689073050951504, + "grad_norm": 0.2927449643611908, + "learning_rate": 7.279115304686735e-05, + "loss": 1.7746, + "step": 12019 + }, + { + "epoch": 3.6893799877225293, + "grad_norm": 0.279208242893219, + "learning_rate": 7.278672878599819e-05, + "loss": 1.7678, + "step": 12020 + }, + { + "epoch": 3.689686924493554, + "grad_norm": 0.40005648136138916, + "learning_rate": 7.278230429993725e-05, + "loss": 1.7876, + "step": 12021 + }, + { + "epoch": 3.6899938612645795, + "grad_norm": 0.3444392681121826, + "learning_rate": 7.277787958872824e-05, + "loss": 1.7591, + "step": 12022 + }, + { + "epoch": 3.6903007980356044, + "grad_norm": 0.21841467916965485, + "learning_rate": 7.277345465241485e-05, + "loss": 1.785, + "step": 12023 + }, + { + "epoch": 3.6906077348066297, + "grad_norm": 0.32463181018829346, + "learning_rate": 7.276902949104084e-05, + "loss": 1.8164, + "step": 12024 + }, + { + "epoch": 3.690914671577655, + "grad_norm": 0.36221247911453247, + "learning_rate": 7.276460410464994e-05, + "loss": 1.7529, + "step": 12025 + }, + { + "epoch": 3.6912216083486804, + "grad_norm": 0.24451927840709686, + "learning_rate": 7.276017849328588e-05, + "loss": 1.8031, + "step": 12026 + }, + { + "epoch": 3.6915285451197053, + "grad_norm": 0.3055694103240967, + "learning_rate": 7.275575265699239e-05, + "loss": 1.8158, + "step": 12027 + }, + { + "epoch": 3.6918354818907306, + "grad_norm": 0.4315083622932434, + "learning_rate": 7.27513265958132e-05, + "loss": 1.8322, + "step": 12028 + }, + { + "epoch": 3.6921424186617555, + "grad_norm": 0.3391095697879791, + "learning_rate": 7.274690030979209e-05, + "loss": 1.8214, + "step": 12029 + }, + { + "epoch": 3.692449355432781, + "grad_norm": 0.22714883089065552, + "learning_rate": 7.274247379897277e-05, + "loss": 1.7312, + "step": 12030 + }, + { + "epoch": 3.692756292203806, + "grad_norm": 0.24982765316963196, + "learning_rate": 7.273804706339899e-05, + "loss": 1.738, + "step": 12031 + }, + { + 
"epoch": 3.693063228974831, + "grad_norm": 0.32509860396385193, + "learning_rate": 7.273362010311451e-05, + "loss": 1.7773, + "step": 12032 + }, + { + "epoch": 3.6933701657458564, + "grad_norm": 0.2643086612224579, + "learning_rate": 7.272919291816307e-05, + "loss": 1.7545, + "step": 12033 + }, + { + "epoch": 3.6936771025168813, + "grad_norm": 0.2568800747394562, + "learning_rate": 7.272476550858842e-05, + "loss": 1.8055, + "step": 12034 + }, + { + "epoch": 3.6939840392879066, + "grad_norm": 0.27418240904808044, + "learning_rate": 7.272033787443433e-05, + "loss": 1.7769, + "step": 12035 + }, + { + "epoch": 3.694290976058932, + "grad_norm": 0.2459677755832672, + "learning_rate": 7.271591001574453e-05, + "loss": 1.7971, + "step": 12036 + }, + { + "epoch": 3.694597912829957, + "grad_norm": 0.22349393367767334, + "learning_rate": 7.27114819325628e-05, + "loss": 1.7791, + "step": 12037 + }, + { + "epoch": 3.694904849600982, + "grad_norm": 0.25321197509765625, + "learning_rate": 7.270705362493288e-05, + "loss": 1.7475, + "step": 12038 + }, + { + "epoch": 3.695211786372007, + "grad_norm": 0.2585916519165039, + "learning_rate": 7.270262509289855e-05, + "loss": 1.7801, + "step": 12039 + }, + { + "epoch": 3.6955187231430324, + "grad_norm": 0.2673574686050415, + "learning_rate": 7.269819633650359e-05, + "loss": 1.7578, + "step": 12040 + }, + { + "epoch": 3.6958256599140578, + "grad_norm": 0.2509469985961914, + "learning_rate": 7.269376735579175e-05, + "loss": 1.7994, + "step": 12041 + }, + { + "epoch": 3.696132596685083, + "grad_norm": 0.28527703881263733, + "learning_rate": 7.268933815080679e-05, + "loss": 1.7752, + "step": 12042 + }, + { + "epoch": 3.696439533456108, + "grad_norm": 0.22716578841209412, + "learning_rate": 7.268490872159248e-05, + "loss": 1.7186, + "step": 12043 + }, + { + "epoch": 3.6967464702271333, + "grad_norm": 0.24888403713703156, + "learning_rate": 7.268047906819262e-05, + "loss": 1.7882, + "step": 12044 + }, + { + "epoch": 3.697053406998158, + 
"grad_norm": 0.28976112604141235, + "learning_rate": 7.267604919065096e-05, + "loss": 1.7655, + "step": 12045 + }, + { + "epoch": 3.6973603437691835, + "grad_norm": 0.24668502807617188, + "learning_rate": 7.267161908901131e-05, + "loss": 1.8051, + "step": 12046 + }, + { + "epoch": 3.697667280540209, + "grad_norm": 0.2464776188135147, + "learning_rate": 7.266718876331742e-05, + "loss": 1.809, + "step": 12047 + }, + { + "epoch": 3.6979742173112338, + "grad_norm": 0.27648577094078064, + "learning_rate": 7.266275821361309e-05, + "loss": 1.7869, + "step": 12048 + }, + { + "epoch": 3.698281154082259, + "grad_norm": 0.26427242159843445, + "learning_rate": 7.26583274399421e-05, + "loss": 1.7681, + "step": 12049 + }, + { + "epoch": 3.698588090853284, + "grad_norm": 0.24595285952091217, + "learning_rate": 7.265389644234823e-05, + "loss": 1.7209, + "step": 12050 + }, + { + "epoch": 3.6988950276243093, + "grad_norm": 0.32514405250549316, + "learning_rate": 7.26494652208753e-05, + "loss": 1.8702, + "step": 12051 + }, + { + "epoch": 3.6992019643953347, + "grad_norm": 0.24512936174869537, + "learning_rate": 7.264503377556705e-05, + "loss": 1.784, + "step": 12052 + }, + { + "epoch": 3.69950890116636, + "grad_norm": 0.28698310256004333, + "learning_rate": 7.264060210646733e-05, + "loss": 1.905, + "step": 12053 + }, + { + "epoch": 3.699815837937385, + "grad_norm": 0.2995007336139679, + "learning_rate": 7.263617021361989e-05, + "loss": 1.7822, + "step": 12054 + }, + { + "epoch": 3.7001227747084102, + "grad_norm": 0.25869423151016235, + "learning_rate": 7.263173809706855e-05, + "loss": 1.7988, + "step": 12055 + }, + { + "epoch": 3.700429711479435, + "grad_norm": 0.350918710231781, + "learning_rate": 7.262730575685711e-05, + "loss": 1.9504, + "step": 12056 + }, + { + "epoch": 3.7007366482504604, + "grad_norm": 0.3407665491104126, + "learning_rate": 7.262287319302937e-05, + "loss": 1.8506, + "step": 12057 + }, + { + "epoch": 3.701043585021486, + "grad_norm": 0.3039441704750061, + 
"learning_rate": 7.261844040562915e-05, + "loss": 1.7841, + "step": 12058 + }, + { + "epoch": 3.7013505217925107, + "grad_norm": 0.23483428359031677, + "learning_rate": 7.261400739470023e-05, + "loss": 1.7899, + "step": 12059 + }, + { + "epoch": 3.701657458563536, + "grad_norm": 0.30779507756233215, + "learning_rate": 7.260957416028645e-05, + "loss": 1.8131, + "step": 12060 + }, + { + "epoch": 3.701964395334561, + "grad_norm": 0.29901376366615295, + "learning_rate": 7.26051407024316e-05, + "loss": 1.7861, + "step": 12061 + }, + { + "epoch": 3.7022713321055862, + "grad_norm": 0.30058762431144714, + "learning_rate": 7.260070702117949e-05, + "loss": 1.7485, + "step": 12062 + }, + { + "epoch": 3.7025782688766116, + "grad_norm": 0.24523651599884033, + "learning_rate": 7.259627311657396e-05, + "loss": 1.772, + "step": 12063 + }, + { + "epoch": 3.7028852056476365, + "grad_norm": 0.24375474452972412, + "learning_rate": 7.259183898865882e-05, + "loss": 1.7848, + "step": 12064 + }, + { + "epoch": 3.703192142418662, + "grad_norm": 0.2562403380870819, + "learning_rate": 7.258740463747788e-05, + "loss": 1.7447, + "step": 12065 + }, + { + "epoch": 3.7034990791896867, + "grad_norm": 0.265229195356369, + "learning_rate": 7.258297006307496e-05, + "loss": 1.8111, + "step": 12066 + }, + { + "epoch": 3.703806015960712, + "grad_norm": 0.2836552858352661, + "learning_rate": 7.25785352654939e-05, + "loss": 1.7952, + "step": 12067 + }, + { + "epoch": 3.7041129527317374, + "grad_norm": 0.3269572854042053, + "learning_rate": 7.257410024477852e-05, + "loss": 1.8604, + "step": 12068 + }, + { + "epoch": 3.7044198895027627, + "grad_norm": 0.2391490638256073, + "learning_rate": 7.256966500097264e-05, + "loss": 1.7417, + "step": 12069 + }, + { + "epoch": 3.7047268262737876, + "grad_norm": 0.2610675096511841, + "learning_rate": 7.256522953412011e-05, + "loss": 1.7712, + "step": 12070 + }, + { + "epoch": 3.705033763044813, + "grad_norm": 0.24954774975776672, + "learning_rate": 
7.256079384426477e-05, + "loss": 1.7506, + "step": 12071 + }, + { + "epoch": 3.705340699815838, + "grad_norm": 0.2603892385959625, + "learning_rate": 7.255635793145042e-05, + "loss": 1.8105, + "step": 12072 + }, + { + "epoch": 3.705647636586863, + "grad_norm": 0.32728591561317444, + "learning_rate": 7.255192179572092e-05, + "loss": 1.8448, + "step": 12073 + }, + { + "epoch": 3.7059545733578885, + "grad_norm": 0.4559340178966522, + "learning_rate": 7.254748543712013e-05, + "loss": 1.7232, + "step": 12074 + }, + { + "epoch": 3.7062615101289134, + "grad_norm": 0.36526206135749817, + "learning_rate": 7.254304885569186e-05, + "loss": 1.7874, + "step": 12075 + }, + { + "epoch": 3.7065684468999387, + "grad_norm": 0.21606837213039398, + "learning_rate": 7.253861205147998e-05, + "loss": 1.7266, + "step": 12076 + }, + { + "epoch": 3.7068753836709636, + "grad_norm": 0.3629585802555084, + "learning_rate": 7.253417502452831e-05, + "loss": 1.7722, + "step": 12077 + }, + { + "epoch": 3.707182320441989, + "grad_norm": 0.4224923551082611, + "learning_rate": 7.252973777488072e-05, + "loss": 1.7369, + "step": 12078 + }, + { + "epoch": 3.7074892572130143, + "grad_norm": 0.32245784997940063, + "learning_rate": 7.252530030258106e-05, + "loss": 1.7836, + "step": 12079 + }, + { + "epoch": 3.707796193984039, + "grad_norm": 0.29909494519233704, + "learning_rate": 7.252086260767317e-05, + "loss": 1.8718, + "step": 12080 + }, + { + "epoch": 3.7081031307550645, + "grad_norm": 0.21995799243450165, + "learning_rate": 7.251642469020093e-05, + "loss": 1.7103, + "step": 12081 + }, + { + "epoch": 3.7084100675260894, + "grad_norm": 0.2737572193145752, + "learning_rate": 7.251198655020818e-05, + "loss": 1.7787, + "step": 12082 + }, + { + "epoch": 3.7087170042971147, + "grad_norm": 0.22417058050632477, + "learning_rate": 7.250754818773879e-05, + "loss": 1.7782, + "step": 12083 + }, + { + "epoch": 3.70902394106814, + "grad_norm": 0.3350662887096405, + "learning_rate": 7.25031096028366e-05, + "loss": 
1.8193, + "step": 12084 + }, + { + "epoch": 3.7093308778391654, + "grad_norm": 0.3199101686477661, + "learning_rate": 7.24986707955455e-05, + "loss": 1.831, + "step": 12085 + }, + { + "epoch": 3.7096378146101903, + "grad_norm": 0.2513977289199829, + "learning_rate": 7.249423176590936e-05, + "loss": 1.8288, + "step": 12086 + }, + { + "epoch": 3.7099447513812156, + "grad_norm": 0.30411866307258606, + "learning_rate": 7.248979251397203e-05, + "loss": 1.7837, + "step": 12087 + }, + { + "epoch": 3.7102516881522405, + "grad_norm": 0.30755332112312317, + "learning_rate": 7.248535303977738e-05, + "loss": 1.8016, + "step": 12088 + }, + { + "epoch": 3.710558624923266, + "grad_norm": 0.25746986269950867, + "learning_rate": 7.248091334336929e-05, + "loss": 1.8014, + "step": 12089 + }, + { + "epoch": 3.710865561694291, + "grad_norm": 0.3327447772026062, + "learning_rate": 7.247647342479164e-05, + "loss": 1.752, + "step": 12090 + }, + { + "epoch": 3.711172498465316, + "grad_norm": 0.3101816475391388, + "learning_rate": 7.247203328408832e-05, + "loss": 1.7867, + "step": 12091 + }, + { + "epoch": 3.7114794352363414, + "grad_norm": 0.2168906182050705, + "learning_rate": 7.246759292130318e-05, + "loss": 1.7452, + "step": 12092 + }, + { + "epoch": 3.7117863720073663, + "grad_norm": 0.34260258078575134, + "learning_rate": 7.246315233648013e-05, + "loss": 1.8156, + "step": 12093 + }, + { + "epoch": 3.7120933087783916, + "grad_norm": 0.2730714976787567, + "learning_rate": 7.245871152966303e-05, + "loss": 1.7429, + "step": 12094 + }, + { + "epoch": 3.712400245549417, + "grad_norm": 0.2560936212539673, + "learning_rate": 7.245427050089578e-05, + "loss": 1.7969, + "step": 12095 + }, + { + "epoch": 3.712707182320442, + "grad_norm": 0.27510303258895874, + "learning_rate": 7.244982925022228e-05, + "loss": 1.7981, + "step": 12096 + }, + { + "epoch": 3.713014119091467, + "grad_norm": 0.29171642661094666, + "learning_rate": 7.24453877776864e-05, + "loss": 1.7913, + "step": 12097 + }, + { + 
"epoch": 3.713321055862492, + "grad_norm": 0.26431843638420105, + "learning_rate": 7.244094608333206e-05, + "loss": 1.8262, + "step": 12098 + }, + { + "epoch": 3.7136279926335174, + "grad_norm": 0.30747905373573303, + "learning_rate": 7.243650416720311e-05, + "loss": 1.7951, + "step": 12099 + }, + { + "epoch": 3.7139349294045427, + "grad_norm": 0.346443772315979, + "learning_rate": 7.24320620293435e-05, + "loss": 1.7677, + "step": 12100 + }, + { + "epoch": 3.714241866175568, + "grad_norm": 0.2910652458667755, + "learning_rate": 7.242761966979709e-05, + "loss": 1.7887, + "step": 12101 + }, + { + "epoch": 3.714548802946593, + "grad_norm": 0.22342006862163544, + "learning_rate": 7.24231770886078e-05, + "loss": 1.7678, + "step": 12102 + }, + { + "epoch": 3.7148557397176183, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.241873428581954e-05, + "loss": 1.7436, + "step": 12103 + }, + { + "epoch": 3.715162676488643, + "grad_norm": 0.23542635142803192, + "learning_rate": 7.24142912614762e-05, + "loss": 1.7942, + "step": 12104 + }, + { + "epoch": 3.7154696132596685, + "grad_norm": 0.22476384043693542, + "learning_rate": 7.240984801562169e-05, + "loss": 1.8235, + "step": 12105 + }, + { + "epoch": 3.715776550030694, + "grad_norm": 0.25123465061187744, + "learning_rate": 7.240540454829992e-05, + "loss": 1.8112, + "step": 12106 + }, + { + "epoch": 3.7160834868017187, + "grad_norm": 0.27230000495910645, + "learning_rate": 7.240096085955483e-05, + "loss": 1.8312, + "step": 12107 + }, + { + "epoch": 3.716390423572744, + "grad_norm": 0.2722976803779602, + "learning_rate": 7.239651694943031e-05, + "loss": 1.8368, + "step": 12108 + }, + { + "epoch": 3.716697360343769, + "grad_norm": 0.264138400554657, + "learning_rate": 7.239207281797028e-05, + "loss": 1.8206, + "step": 12109 + }, + { + "epoch": 3.7170042971147943, + "grad_norm": 0.28813931345939636, + "learning_rate": 7.238762846521866e-05, + "loss": 1.7391, + "step": 12110 + }, + { + "epoch": 3.7173112338858196, + 
"grad_norm": 0.2319631576538086, + "learning_rate": 7.238318389121939e-05, + "loss": 1.7574, + "step": 12111 + }, + { + "epoch": 3.717618170656845, + "grad_norm": 0.2507809102535248, + "learning_rate": 7.237873909601635e-05, + "loss": 1.7359, + "step": 12112 + }, + { + "epoch": 3.71792510742787, + "grad_norm": 0.2717304825782776, + "learning_rate": 7.237429407965351e-05, + "loss": 1.774, + "step": 12113 + }, + { + "epoch": 3.718232044198895, + "grad_norm": 0.2619280517101288, + "learning_rate": 7.236984884217478e-05, + "loss": 1.8083, + "step": 12114 + }, + { + "epoch": 3.71853898096992, + "grad_norm": 0.22268806397914886, + "learning_rate": 7.23654033836241e-05, + "loss": 1.7436, + "step": 12115 + }, + { + "epoch": 3.7188459177409454, + "grad_norm": 0.2341407984495163, + "learning_rate": 7.236095770404539e-05, + "loss": 1.7807, + "step": 12116 + }, + { + "epoch": 3.7191528545119708, + "grad_norm": 0.23519712686538696, + "learning_rate": 7.235651180348258e-05, + "loss": 1.8051, + "step": 12117 + }, + { + "epoch": 3.7194597912829956, + "grad_norm": 0.2391074150800705, + "learning_rate": 7.235206568197963e-05, + "loss": 1.8377, + "step": 12118 + }, + { + "epoch": 3.719766728054021, + "grad_norm": 0.26821592450141907, + "learning_rate": 7.234761933958045e-05, + "loss": 1.8586, + "step": 12119 + }, + { + "epoch": 3.720073664825046, + "grad_norm": 0.24971134960651398, + "learning_rate": 7.234317277632902e-05, + "loss": 1.8404, + "step": 12120 + }, + { + "epoch": 3.720380601596071, + "grad_norm": 0.20817919075489044, + "learning_rate": 7.233872599226926e-05, + "loss": 1.7204, + "step": 12121 + }, + { + "epoch": 3.7206875383670965, + "grad_norm": 0.29301291704177856, + "learning_rate": 7.233427898744509e-05, + "loss": 1.8528, + "step": 12122 + }, + { + "epoch": 3.7209944751381214, + "grad_norm": 0.22214651107788086, + "learning_rate": 7.23298317619005e-05, + "loss": 1.748, + "step": 12123 + }, + { + "epoch": 3.7213014119091468, + "grad_norm": 0.2511044442653656, + 
"learning_rate": 7.232538431567941e-05, + "loss": 1.8146, + "step": 12124 + }, + { + "epoch": 3.7216083486801717, + "grad_norm": 0.26976367831230164, + "learning_rate": 7.232093664882581e-05, + "loss": 1.8483, + "step": 12125 + }, + { + "epoch": 3.721915285451197, + "grad_norm": 0.2538089156150818, + "learning_rate": 7.231648876138361e-05, + "loss": 1.8097, + "step": 12126 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.2353016883134842, + "learning_rate": 7.231204065339682e-05, + "loss": 1.737, + "step": 12127 + }, + { + "epoch": 3.7225291589932477, + "grad_norm": 0.3205147981643677, + "learning_rate": 7.230759232490935e-05, + "loss": 1.8116, + "step": 12128 + }, + { + "epoch": 3.7228360957642725, + "grad_norm": 0.39056599140167236, + "learning_rate": 7.230314377596516e-05, + "loss": 1.7785, + "step": 12129 + }, + { + "epoch": 3.723143032535298, + "grad_norm": 0.3846863806247711, + "learning_rate": 7.229869500660825e-05, + "loss": 1.738, + "step": 12130 + }, + { + "epoch": 3.7234499693063228, + "grad_norm": 0.24412120878696442, + "learning_rate": 7.229424601688256e-05, + "loss": 1.7351, + "step": 12131 + }, + { + "epoch": 3.723756906077348, + "grad_norm": 0.2978009581565857, + "learning_rate": 7.228979680683206e-05, + "loss": 1.8037, + "step": 12132 + }, + { + "epoch": 3.7240638428483734, + "grad_norm": 0.33787262439727783, + "learning_rate": 7.228534737650074e-05, + "loss": 1.8421, + "step": 12133 + }, + { + "epoch": 3.7243707796193983, + "grad_norm": 0.2536921203136444, + "learning_rate": 7.228089772593254e-05, + "loss": 1.7472, + "step": 12134 + }, + { + "epoch": 3.7246777163904237, + "grad_norm": 0.24103601276874542, + "learning_rate": 7.227644785517144e-05, + "loss": 1.8011, + "step": 12135 + }, + { + "epoch": 3.7249846531614486, + "grad_norm": 0.3653033375740051, + "learning_rate": 7.227199776426146e-05, + "loss": 1.8018, + "step": 12136 + }, + { + "epoch": 3.725291589932474, + "grad_norm": 0.35728752613067627, + "learning_rate": 
7.226754745324652e-05, + "loss": 1.7684, + "step": 12137 + }, + { + "epoch": 3.7255985267034992, + "grad_norm": 0.262018620967865, + "learning_rate": 7.226309692217063e-05, + "loss": 1.8124, + "step": 12138 + }, + { + "epoch": 3.725905463474524, + "grad_norm": 0.3467118442058563, + "learning_rate": 7.225864617107776e-05, + "loss": 1.8761, + "step": 12139 + }, + { + "epoch": 3.7262124002455494, + "grad_norm": 0.4365626871585846, + "learning_rate": 7.22541952000119e-05, + "loss": 1.7159, + "step": 12140 + }, + { + "epoch": 3.7265193370165743, + "grad_norm": 0.2819811999797821, + "learning_rate": 7.224974400901705e-05, + "loss": 1.8051, + "step": 12141 + }, + { + "epoch": 3.7268262737875997, + "grad_norm": 0.39062437415122986, + "learning_rate": 7.224529259813719e-05, + "loss": 1.8517, + "step": 12142 + }, + { + "epoch": 3.727133210558625, + "grad_norm": 0.4383927285671234, + "learning_rate": 7.22408409674163e-05, + "loss": 1.8295, + "step": 12143 + }, + { + "epoch": 3.7274401473296503, + "grad_norm": 0.3043094575405121, + "learning_rate": 7.223638911689839e-05, + "loss": 1.7653, + "step": 12144 + }, + { + "epoch": 3.7277470841006752, + "grad_norm": 0.25198984146118164, + "learning_rate": 7.223193704662746e-05, + "loss": 1.7561, + "step": 12145 + }, + { + "epoch": 3.7280540208717006, + "grad_norm": 0.353565514087677, + "learning_rate": 7.222748475664749e-05, + "loss": 1.8077, + "step": 12146 + }, + { + "epoch": 3.7283609576427255, + "grad_norm": 0.39757224917411804, + "learning_rate": 7.222303224700248e-05, + "loss": 1.7622, + "step": 12147 + }, + { + "epoch": 3.728667894413751, + "grad_norm": 0.35595703125, + "learning_rate": 7.221857951773644e-05, + "loss": 1.8436, + "step": 12148 + }, + { + "epoch": 3.728974831184776, + "grad_norm": 0.2469715029001236, + "learning_rate": 7.221412656889338e-05, + "loss": 1.8531, + "step": 12149 + }, + { + "epoch": 3.729281767955801, + "grad_norm": 0.35324424505233765, + "learning_rate": 7.22096734005173e-05, + "loss": 1.7361, + 
"step": 12150 + }, + { + "epoch": 3.7295887047268264, + "grad_norm": 0.3783365488052368, + "learning_rate": 7.220522001265223e-05, + "loss": 1.7459, + "step": 12151 + }, + { + "epoch": 3.7298956414978512, + "grad_norm": 0.27526360750198364, + "learning_rate": 7.220076640534212e-05, + "loss": 1.8867, + "step": 12152 + }, + { + "epoch": 3.7302025782688766, + "grad_norm": 0.30863118171691895, + "learning_rate": 7.219631257863105e-05, + "loss": 1.7363, + "step": 12153 + }, + { + "epoch": 3.730509515039902, + "grad_norm": 0.38505107164382935, + "learning_rate": 7.219185853256301e-05, + "loss": 1.764, + "step": 12154 + }, + { + "epoch": 3.730816451810927, + "grad_norm": 0.2925978899002075, + "learning_rate": 7.218740426718202e-05, + "loss": 1.7693, + "step": 12155 + }, + { + "epoch": 3.731123388581952, + "grad_norm": 0.24510078132152557, + "learning_rate": 7.218294978253209e-05, + "loss": 1.8089, + "step": 12156 + }, + { + "epoch": 3.731430325352977, + "grad_norm": 0.33029109239578247, + "learning_rate": 7.217849507865724e-05, + "loss": 1.6885, + "step": 12157 + }, + { + "epoch": 3.7317372621240024, + "grad_norm": 0.333970308303833, + "learning_rate": 7.217404015560149e-05, + "loss": 1.8132, + "step": 12158 + }, + { + "epoch": 3.7320441988950277, + "grad_norm": 0.2467660754919052, + "learning_rate": 7.216958501340891e-05, + "loss": 1.8021, + "step": 12159 + }, + { + "epoch": 3.732351135666053, + "grad_norm": 0.2701449990272522, + "learning_rate": 7.216512965212348e-05, + "loss": 1.7006, + "step": 12160 + }, + { + "epoch": 3.732658072437078, + "grad_norm": 0.2784138023853302, + "learning_rate": 7.216067407178926e-05, + "loss": 1.7616, + "step": 12161 + }, + { + "epoch": 3.7329650092081033, + "grad_norm": 0.2082870900630951, + "learning_rate": 7.215621827245026e-05, + "loss": 1.7391, + "step": 12162 + }, + { + "epoch": 3.733271945979128, + "grad_norm": 0.2477869987487793, + "learning_rate": 7.215176225415053e-05, + "loss": 1.7761, + "step": 12163 + }, + { + "epoch": 
3.7335788827501535, + "grad_norm": 0.28395572304725647, + "learning_rate": 7.21473060169341e-05, + "loss": 1.8181, + "step": 12164 + }, + { + "epoch": 3.733885819521179, + "grad_norm": 0.20430058240890503, + "learning_rate": 7.2142849560845e-05, + "loss": 1.7035, + "step": 12165 + }, + { + "epoch": 3.7341927562922037, + "grad_norm": 0.30061420798301697, + "learning_rate": 7.21383928859273e-05, + "loss": 1.7703, + "step": 12166 + }, + { + "epoch": 3.734499693063229, + "grad_norm": 0.33865803480148315, + "learning_rate": 7.2133935992225e-05, + "loss": 1.8204, + "step": 12167 + }, + { + "epoch": 3.734806629834254, + "grad_norm": 0.29172980785369873, + "learning_rate": 7.212947887978221e-05, + "loss": 1.739, + "step": 12168 + }, + { + "epoch": 3.7351135666052793, + "grad_norm": 0.2799396812915802, + "learning_rate": 7.212502154864291e-05, + "loss": 1.8503, + "step": 12169 + }, + { + "epoch": 3.7354205033763046, + "grad_norm": 0.2945539355278015, + "learning_rate": 7.212056399885118e-05, + "loss": 1.7523, + "step": 12170 + }, + { + "epoch": 3.7357274401473295, + "grad_norm": 0.2395290732383728, + "learning_rate": 7.211610623045108e-05, + "loss": 1.7728, + "step": 12171 + }, + { + "epoch": 3.736034376918355, + "grad_norm": 0.24369286000728607, + "learning_rate": 7.211164824348667e-05, + "loss": 1.7725, + "step": 12172 + }, + { + "epoch": 3.7363413136893797, + "grad_norm": 0.3272435963153839, + "learning_rate": 7.210719003800197e-05, + "loss": 1.8531, + "step": 12173 + }, + { + "epoch": 3.736648250460405, + "grad_norm": 0.23954182863235474, + "learning_rate": 7.210273161404107e-05, + "loss": 1.7807, + "step": 12174 + }, + { + "epoch": 3.7369551872314304, + "grad_norm": 0.24547603726387024, + "learning_rate": 7.209827297164801e-05, + "loss": 1.8481, + "step": 12175 + }, + { + "epoch": 3.7372621240024557, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.209381411086687e-05, + "loss": 1.7496, + "step": 12176 + }, + { + "epoch": 3.7375690607734806, + "grad_norm": 
0.22948235273361206, + "learning_rate": 7.208935503174172e-05, + "loss": 1.7681, + "step": 12177 + }, + { + "epoch": 3.737875997544506, + "grad_norm": 0.2697654664516449, + "learning_rate": 7.20848957343166e-05, + "loss": 1.789, + "step": 12178 + }, + { + "epoch": 3.738182934315531, + "grad_norm": 0.235344797372818, + "learning_rate": 7.208043621863562e-05, + "loss": 1.8309, + "step": 12179 + }, + { + "epoch": 3.738489871086556, + "grad_norm": 0.2688879072666168, + "learning_rate": 7.20759764847428e-05, + "loss": 1.7898, + "step": 12180 + }, + { + "epoch": 3.7387968078575815, + "grad_norm": 0.26818978786468506, + "learning_rate": 7.207151653268226e-05, + "loss": 1.7882, + "step": 12181 + }, + { + "epoch": 3.7391037446286064, + "grad_norm": 0.2612875998020172, + "learning_rate": 7.206705636249804e-05, + "loss": 1.7352, + "step": 12182 + }, + { + "epoch": 3.7394106813996317, + "grad_norm": 0.22547565400600433, + "learning_rate": 7.206259597423425e-05, + "loss": 1.733, + "step": 12183 + }, + { + "epoch": 3.7397176181706566, + "grad_norm": 0.24645474553108215, + "learning_rate": 7.205813536793495e-05, + "loss": 1.8064, + "step": 12184 + }, + { + "epoch": 3.740024554941682, + "grad_norm": 0.25879329442977905, + "learning_rate": 7.205367454364424e-05, + "loss": 1.8134, + "step": 12185 + }, + { + "epoch": 3.7403314917127073, + "grad_norm": 0.22420097887516022, + "learning_rate": 7.204921350140617e-05, + "loss": 1.7819, + "step": 12186 + }, + { + "epoch": 3.7406384284837326, + "grad_norm": 0.2569858431816101, + "learning_rate": 7.204475224126487e-05, + "loss": 1.784, + "step": 12187 + }, + { + "epoch": 3.7409453652547575, + "grad_norm": 0.23769912123680115, + "learning_rate": 7.20402907632644e-05, + "loss": 1.7853, + "step": 12188 + }, + { + "epoch": 3.741252302025783, + "grad_norm": 0.26935988664627075, + "learning_rate": 7.203582906744885e-05, + "loss": 1.806, + "step": 12189 + }, + { + "epoch": 3.7415592387968077, + "grad_norm": 0.2544274628162384, + "learning_rate": 
7.203136715386233e-05, + "loss": 1.7988, + "step": 12190 + }, + { + "epoch": 3.741866175567833, + "grad_norm": 0.22665882110595703, + "learning_rate": 7.202690502254892e-05, + "loss": 1.7798, + "step": 12191 + }, + { + "epoch": 3.7421731123388584, + "grad_norm": 0.24512888491153717, + "learning_rate": 7.202244267355273e-05, + "loss": 1.816, + "step": 12192 + }, + { + "epoch": 3.7424800491098833, + "grad_norm": 0.2408553808927536, + "learning_rate": 7.201798010691785e-05, + "loss": 1.7417, + "step": 12193 + }, + { + "epoch": 3.7427869858809086, + "grad_norm": 0.23142600059509277, + "learning_rate": 7.201351732268838e-05, + "loss": 1.7771, + "step": 12194 + }, + { + "epoch": 3.7430939226519335, + "grad_norm": 0.245071142911911, + "learning_rate": 7.200905432090844e-05, + "loss": 1.7556, + "step": 12195 + }, + { + "epoch": 3.743400859422959, + "grad_norm": 0.2623934745788574, + "learning_rate": 7.200459110162211e-05, + "loss": 1.8042, + "step": 12196 + }, + { + "epoch": 3.743707796193984, + "grad_norm": 0.2531217038631439, + "learning_rate": 7.200012766487353e-05, + "loss": 1.7709, + "step": 12197 + }, + { + "epoch": 3.744014732965009, + "grad_norm": 0.23839864134788513, + "learning_rate": 7.19956640107068e-05, + "loss": 1.8202, + "step": 12198 + }, + { + "epoch": 3.7443216697360344, + "grad_norm": 0.2342260777950287, + "learning_rate": 7.1991200139166e-05, + "loss": 1.827, + "step": 12199 + }, + { + "epoch": 3.7446286065070593, + "grad_norm": 0.25511276721954346, + "learning_rate": 7.198673605029528e-05, + "loss": 1.7766, + "step": 12200 + }, + { + "epoch": 3.7449355432780846, + "grad_norm": 0.27601274847984314, + "learning_rate": 7.198227174413876e-05, + "loss": 1.7716, + "step": 12201 + }, + { + "epoch": 3.74524248004911, + "grad_norm": 0.3027385175228119, + "learning_rate": 7.197780722074056e-05, + "loss": 1.8007, + "step": 12202 + }, + { + "epoch": 3.7455494168201353, + "grad_norm": 0.31242382526397705, + "learning_rate": 7.197334248014477e-05, + "loss": 1.8089, 
+ "step": 12203 + }, + { + "epoch": 3.74585635359116, + "grad_norm": 0.3673859238624573, + "learning_rate": 7.196887752239551e-05, + "loss": 1.8017, + "step": 12204 + }, + { + "epoch": 3.7461632903621855, + "grad_norm": 0.3152726888656616, + "learning_rate": 7.196441234753695e-05, + "loss": 1.7108, + "step": 12205 + }, + { + "epoch": 3.7464702271332104, + "grad_norm": 0.2606927156448364, + "learning_rate": 7.195994695561319e-05, + "loss": 1.8066, + "step": 12206 + }, + { + "epoch": 3.7467771639042358, + "grad_norm": 0.37624871730804443, + "learning_rate": 7.195548134666836e-05, + "loss": 1.725, + "step": 12207 + }, + { + "epoch": 3.747084100675261, + "grad_norm": 0.4138187766075134, + "learning_rate": 7.195101552074658e-05, + "loss": 1.7838, + "step": 12208 + }, + { + "epoch": 3.747391037446286, + "grad_norm": 0.3668459951877594, + "learning_rate": 7.194654947789204e-05, + "loss": 1.7575, + "step": 12209 + }, + { + "epoch": 3.7476979742173113, + "grad_norm": 0.27947792410850525, + "learning_rate": 7.19420832181488e-05, + "loss": 1.792, + "step": 12210 + }, + { + "epoch": 3.748004910988336, + "grad_norm": 0.2507692873477936, + "learning_rate": 7.193761674156103e-05, + "loss": 1.7752, + "step": 12211 + }, + { + "epoch": 3.7483118477593615, + "grad_norm": 0.3209949731826782, + "learning_rate": 7.193315004817289e-05, + "loss": 1.8491, + "step": 12212 + }, + { + "epoch": 3.748618784530387, + "grad_norm": 0.32883042097091675, + "learning_rate": 7.192868313802849e-05, + "loss": 1.8135, + "step": 12213 + }, + { + "epoch": 3.7489257213014118, + "grad_norm": 0.2450616955757141, + "learning_rate": 7.192421601117201e-05, + "loss": 1.7722, + "step": 12214 + }, + { + "epoch": 3.749232658072437, + "grad_norm": 0.2545110285282135, + "learning_rate": 7.191974866764757e-05, + "loss": 1.7866, + "step": 12215 + }, + { + "epoch": 3.749539594843462, + "grad_norm": 0.264017790555954, + "learning_rate": 7.191528110749932e-05, + "loss": 1.778, + "step": 12216 + }, + { + "epoch": 
3.7498465316144873, + "grad_norm": 0.3156309425830841, + "learning_rate": 7.191081333077142e-05, + "loss": 1.7917, + "step": 12217 + }, + { + "epoch": 3.7501534683855127, + "grad_norm": 0.3578774631023407, + "learning_rate": 7.190634533750802e-05, + "loss": 1.8468, + "step": 12218 + }, + { + "epoch": 3.750460405156538, + "grad_norm": 0.30735981464385986, + "learning_rate": 7.19018771277533e-05, + "loss": 1.7502, + "step": 12219 + }, + { + "epoch": 3.750767341927563, + "grad_norm": 0.22870220243930817, + "learning_rate": 7.189740870155135e-05, + "loss": 1.7686, + "step": 12220 + }, + { + "epoch": 3.7510742786985882, + "grad_norm": 0.30297720432281494, + "learning_rate": 7.18929400589464e-05, + "loss": 1.826, + "step": 12221 + }, + { + "epoch": 3.751381215469613, + "grad_norm": 0.2735389173030853, + "learning_rate": 7.188847119998257e-05, + "loss": 1.8142, + "step": 12222 + }, + { + "epoch": 3.7516881522406385, + "grad_norm": 0.2823885679244995, + "learning_rate": 7.188400212470405e-05, + "loss": 1.8028, + "step": 12223 + }, + { + "epoch": 3.751995089011664, + "grad_norm": 0.4184139370918274, + "learning_rate": 7.187953283315499e-05, + "loss": 1.8467, + "step": 12224 + }, + { + "epoch": 3.7523020257826887, + "grad_norm": 0.3559226095676422, + "learning_rate": 7.187506332537957e-05, + "loss": 1.7416, + "step": 12225 + }, + { + "epoch": 3.752608962553714, + "grad_norm": 0.26055800914764404, + "learning_rate": 7.187059360142194e-05, + "loss": 1.8309, + "step": 12226 + }, + { + "epoch": 3.752915899324739, + "grad_norm": 0.28032660484313965, + "learning_rate": 7.186612366132629e-05, + "loss": 1.7926, + "step": 12227 + }, + { + "epoch": 3.7532228360957642, + "grad_norm": 0.26229965686798096, + "learning_rate": 7.18616535051368e-05, + "loss": 1.7368, + "step": 12228 + }, + { + "epoch": 3.7535297728667896, + "grad_norm": 0.2779417634010315, + "learning_rate": 7.185718313289763e-05, + "loss": 1.8418, + "step": 12229 + }, + { + "epoch": 3.7538367096378145, + "grad_norm": 
0.26164770126342773, + "learning_rate": 7.185271254465295e-05, + "loss": 1.7511, + "step": 12230 + }, + { + "epoch": 3.75414364640884, + "grad_norm": 0.30725157260894775, + "learning_rate": 7.184824174044698e-05, + "loss": 1.7661, + "step": 12231 + }, + { + "epoch": 3.7544505831798647, + "grad_norm": 0.33111417293548584, + "learning_rate": 7.184377072032386e-05, + "loss": 1.7341, + "step": 12232 + }, + { + "epoch": 3.75475751995089, + "grad_norm": 0.23978343605995178, + "learning_rate": 7.183929948432779e-05, + "loss": 1.7151, + "step": 12233 + }, + { + "epoch": 3.7550644567219154, + "grad_norm": 0.3057664632797241, + "learning_rate": 7.183482803250299e-05, + "loss": 1.8446, + "step": 12234 + }, + { + "epoch": 3.7553713934929407, + "grad_norm": 0.2629055678844452, + "learning_rate": 7.18303563648936e-05, + "loss": 1.7415, + "step": 12235 + }, + { + "epoch": 3.7556783302639656, + "grad_norm": 0.22703498601913452, + "learning_rate": 7.182588448154386e-05, + "loss": 1.8188, + "step": 12236 + }, + { + "epoch": 3.755985267034991, + "grad_norm": 0.3014034032821655, + "learning_rate": 7.182141238249792e-05, + "loss": 1.8634, + "step": 12237 + }, + { + "epoch": 3.756292203806016, + "grad_norm": 0.28859084844589233, + "learning_rate": 7.181694006779998e-05, + "loss": 1.7509, + "step": 12238 + }, + { + "epoch": 3.756599140577041, + "grad_norm": 0.293720543384552, + "learning_rate": 7.181246753749426e-05, + "loss": 1.777, + "step": 12239 + }, + { + "epoch": 3.7569060773480665, + "grad_norm": 0.2374580055475235, + "learning_rate": 7.180799479162496e-05, + "loss": 1.7492, + "step": 12240 + }, + { + "epoch": 3.7572130141190914, + "grad_norm": 0.30106452107429504, + "learning_rate": 7.180352183023627e-05, + "loss": 1.7538, + "step": 12241 + }, + { + "epoch": 3.7575199508901167, + "grad_norm": 0.3504682183265686, + "learning_rate": 7.179904865337238e-05, + "loss": 1.7477, + "step": 12242 + }, + { + "epoch": 3.7578268876611416, + "grad_norm": 0.2901679575443268, + "learning_rate": 
7.179457526107754e-05, + "loss": 1.9412, + "step": 12243 + }, + { + "epoch": 3.758133824432167, + "grad_norm": 0.37690606713294983, + "learning_rate": 7.179010165339591e-05, + "loss": 1.8222, + "step": 12244 + }, + { + "epoch": 3.7584407612031923, + "grad_norm": 0.45126965641975403, + "learning_rate": 7.178562783037172e-05, + "loss": 1.8563, + "step": 12245 + }, + { + "epoch": 3.758747697974217, + "grad_norm": 0.2747548818588257, + "learning_rate": 7.178115379204921e-05, + "loss": 1.7179, + "step": 12246 + }, + { + "epoch": 3.7590546347452425, + "grad_norm": 0.43243977427482605, + "learning_rate": 7.177667953847257e-05, + "loss": 1.8157, + "step": 12247 + }, + { + "epoch": 3.7593615715162674, + "grad_norm": 0.529448390007019, + "learning_rate": 7.177220506968602e-05, + "loss": 1.8113, + "step": 12248 + }, + { + "epoch": 3.7596685082872927, + "grad_norm": 0.3099314868450165, + "learning_rate": 7.176773038573377e-05, + "loss": 1.7833, + "step": 12249 + }, + { + "epoch": 3.759975445058318, + "grad_norm": 0.3111872375011444, + "learning_rate": 7.176325548666004e-05, + "loss": 1.7965, + "step": 12250 + }, + { + "epoch": 3.7602823818293434, + "grad_norm": 0.38437551259994507, + "learning_rate": 7.175878037250907e-05, + "loss": 1.7822, + "step": 12251 + }, + { + "epoch": 3.7605893186003683, + "grad_norm": 0.33643704652786255, + "learning_rate": 7.175430504332509e-05, + "loss": 1.7839, + "step": 12252 + }, + { + "epoch": 3.7608962553713936, + "grad_norm": 0.24705304205417633, + "learning_rate": 7.174982949915232e-05, + "loss": 1.8302, + "step": 12253 + }, + { + "epoch": 3.7612031921424185, + "grad_norm": 0.3615458309650421, + "learning_rate": 7.174535374003497e-05, + "loss": 1.7963, + "step": 12254 + }, + { + "epoch": 3.761510128913444, + "grad_norm": 0.36486589908599854, + "learning_rate": 7.17408777660173e-05, + "loss": 1.7933, + "step": 12255 + }, + { + "epoch": 3.761817065684469, + "grad_norm": 0.2566867172718048, + "learning_rate": 7.173640157714352e-05, + "loss": 
1.7254, + "step": 12256 + }, + { + "epoch": 3.762124002455494, + "grad_norm": 0.2602523863315582, + "learning_rate": 7.17319251734579e-05, + "loss": 1.7357, + "step": 12257 + }, + { + "epoch": 3.7624309392265194, + "grad_norm": 0.3626105785369873, + "learning_rate": 7.172744855500464e-05, + "loss": 1.7971, + "step": 12258 + }, + { + "epoch": 3.7627378759975443, + "grad_norm": 0.36327603459358215, + "learning_rate": 7.172297172182802e-05, + "loss": 1.7819, + "step": 12259 + }, + { + "epoch": 3.7630448127685696, + "grad_norm": 0.25935736298561096, + "learning_rate": 7.171849467397224e-05, + "loss": 1.8112, + "step": 12260 + }, + { + "epoch": 3.763351749539595, + "grad_norm": 0.2779700756072998, + "learning_rate": 7.171401741148156e-05, + "loss": 1.786, + "step": 12261 + }, + { + "epoch": 3.7636586863106203, + "grad_norm": 0.3089013695716858, + "learning_rate": 7.170953993440025e-05, + "loss": 1.7808, + "step": 12262 + }, + { + "epoch": 3.763965623081645, + "grad_norm": 0.2562308609485626, + "learning_rate": 7.170506224277253e-05, + "loss": 1.8207, + "step": 12263 + }, + { + "epoch": 3.7642725598526705, + "grad_norm": 0.2907634973526001, + "learning_rate": 7.170058433664268e-05, + "loss": 1.7638, + "step": 12264 + }, + { + "epoch": 3.7645794966236954, + "grad_norm": 0.30341312289237976, + "learning_rate": 7.169610621605493e-05, + "loss": 1.7827, + "step": 12265 + }, + { + "epoch": 3.7648864333947207, + "grad_norm": 0.27091866731643677, + "learning_rate": 7.169162788105353e-05, + "loss": 1.786, + "step": 12266 + }, + { + "epoch": 3.765193370165746, + "grad_norm": 0.234042689204216, + "learning_rate": 7.168714933168277e-05, + "loss": 1.7638, + "step": 12267 + }, + { + "epoch": 3.765500306936771, + "grad_norm": 0.2477465271949768, + "learning_rate": 7.168267056798686e-05, + "loss": 1.7275, + "step": 12268 + }, + { + "epoch": 3.7658072437077963, + "grad_norm": 0.25578543543815613, + "learning_rate": 7.167819159001012e-05, + "loss": 1.7831, + "step": 12269 + }, + { + 
"epoch": 3.766114180478821, + "grad_norm": 0.26629674434661865, + "learning_rate": 7.167371239779678e-05, + "loss": 1.7866, + "step": 12270 + }, + { + "epoch": 3.7664211172498465, + "grad_norm": 0.31350967288017273, + "learning_rate": 7.16692329913911e-05, + "loss": 1.7755, + "step": 12271 + }, + { + "epoch": 3.766728054020872, + "grad_norm": 0.2670116126537323, + "learning_rate": 7.166475337083735e-05, + "loss": 1.7524, + "step": 12272 + }, + { + "epoch": 3.7670349907918967, + "grad_norm": 0.26503682136535645, + "learning_rate": 7.166027353617983e-05, + "loss": 1.7867, + "step": 12273 + }, + { + "epoch": 3.767341927562922, + "grad_norm": 0.3674192428588867, + "learning_rate": 7.165579348746278e-05, + "loss": 1.7604, + "step": 12274 + }, + { + "epoch": 3.767648864333947, + "grad_norm": 0.4120824337005615, + "learning_rate": 7.16513132247305e-05, + "loss": 1.7905, + "step": 12275 + }, + { + "epoch": 3.7679558011049723, + "grad_norm": 0.29074826836586, + "learning_rate": 7.164683274802723e-05, + "loss": 1.7539, + "step": 12276 + }, + { + "epoch": 3.7682627378759976, + "grad_norm": 0.22223204374313354, + "learning_rate": 7.164235205739729e-05, + "loss": 1.755, + "step": 12277 + }, + { + "epoch": 3.768569674647023, + "grad_norm": 0.23997461795806885, + "learning_rate": 7.163787115288494e-05, + "loss": 1.8024, + "step": 12278 + }, + { + "epoch": 3.768876611418048, + "grad_norm": 0.2556418776512146, + "learning_rate": 7.163339003453445e-05, + "loss": 1.7717, + "step": 12279 + }, + { + "epoch": 3.769183548189073, + "grad_norm": 0.3107141852378845, + "learning_rate": 7.162890870239013e-05, + "loss": 1.8257, + "step": 12280 + }, + { + "epoch": 3.769490484960098, + "grad_norm": 0.35293644666671753, + "learning_rate": 7.162442715649627e-05, + "loss": 1.7855, + "step": 12281 + }, + { + "epoch": 3.7697974217311234, + "grad_norm": 0.25989311933517456, + "learning_rate": 7.161994539689713e-05, + "loss": 1.7816, + "step": 12282 + }, + { + "epoch": 3.7701043585021488, + 
"grad_norm": 0.25615137815475464, + "learning_rate": 7.161546342363701e-05, + "loss": 1.7738, + "step": 12283 + }, + { + "epoch": 3.7704112952731736, + "grad_norm": 0.29345229268074036, + "learning_rate": 7.161098123676023e-05, + "loss": 1.8496, + "step": 12284 + }, + { + "epoch": 3.770718232044199, + "grad_norm": 0.2975969612598419, + "learning_rate": 7.160649883631105e-05, + "loss": 1.7342, + "step": 12285 + }, + { + "epoch": 3.771025168815224, + "grad_norm": 0.28458064794540405, + "learning_rate": 7.16020162223338e-05, + "loss": 1.8253, + "step": 12286 + }, + { + "epoch": 3.771332105586249, + "grad_norm": 0.2798703908920288, + "learning_rate": 7.159753339487276e-05, + "loss": 1.746, + "step": 12287 + }, + { + "epoch": 3.7716390423572745, + "grad_norm": 0.380044549703598, + "learning_rate": 7.159305035397223e-05, + "loss": 1.769, + "step": 12288 + }, + { + "epoch": 3.7719459791282994, + "grad_norm": 0.28760263323783875, + "learning_rate": 7.158856709967654e-05, + "loss": 1.7466, + "step": 12289 + }, + { + "epoch": 3.7722529158993248, + "grad_norm": 0.23314130306243896, + "learning_rate": 7.158408363202996e-05, + "loss": 1.7545, + "step": 12290 + }, + { + "epoch": 3.7725598526703497, + "grad_norm": 0.2864209711551666, + "learning_rate": 7.15795999510768e-05, + "loss": 1.7549, + "step": 12291 + }, + { + "epoch": 3.772866789441375, + "grad_norm": 0.2605510354042053, + "learning_rate": 7.15751160568614e-05, + "loss": 1.7684, + "step": 12292 + }, + { + "epoch": 3.7731737262124003, + "grad_norm": 0.2475409358739853, + "learning_rate": 7.157063194942806e-05, + "loss": 1.7841, + "step": 12293 + }, + { + "epoch": 3.7734806629834257, + "grad_norm": 0.22479289770126343, + "learning_rate": 7.15661476288211e-05, + "loss": 1.7592, + "step": 12294 + }, + { + "epoch": 3.7737875997544506, + "grad_norm": 0.22076937556266785, + "learning_rate": 7.156166309508482e-05, + "loss": 1.7853, + "step": 12295 + }, + { + "epoch": 3.774094536525476, + "grad_norm": 0.26082465052604675, + 
"learning_rate": 7.155717834826353e-05, + "loss": 1.7828, + "step": 12296 + }, + { + "epoch": 3.7744014732965008, + "grad_norm": 0.24771755933761597, + "learning_rate": 7.15526933884016e-05, + "loss": 1.758, + "step": 12297 + }, + { + "epoch": 3.774708410067526, + "grad_norm": 0.23806311190128326, + "learning_rate": 7.15482082155433e-05, + "loss": 1.7237, + "step": 12298 + }, + { + "epoch": 3.7750153468385514, + "grad_norm": 0.24822844564914703, + "learning_rate": 7.154372282973299e-05, + "loss": 1.7828, + "step": 12299 + }, + { + "epoch": 3.7753222836095763, + "grad_norm": 0.24423740804195404, + "learning_rate": 7.153923723101496e-05, + "loss": 1.8014, + "step": 12300 + }, + { + "epoch": 3.7756292203806017, + "grad_norm": 0.24966634809970856, + "learning_rate": 7.15347514194336e-05, + "loss": 1.8005, + "step": 12301 + }, + { + "epoch": 3.7759361571516266, + "grad_norm": 0.2549348473548889, + "learning_rate": 7.153026539503317e-05, + "loss": 1.8473, + "step": 12302 + }, + { + "epoch": 3.776243093922652, + "grad_norm": 0.23709465563297272, + "learning_rate": 7.152577915785807e-05, + "loss": 1.8031, + "step": 12303 + }, + { + "epoch": 3.7765500306936772, + "grad_norm": 0.28554168343544006, + "learning_rate": 7.152129270795258e-05, + "loss": 1.7836, + "step": 12304 + }, + { + "epoch": 3.776856967464702, + "grad_norm": 0.2568756639957428, + "learning_rate": 7.151680604536107e-05, + "loss": 1.7345, + "step": 12305 + }, + { + "epoch": 3.7771639042357275, + "grad_norm": 0.23883797228336334, + "learning_rate": 7.151231917012787e-05, + "loss": 1.7342, + "step": 12306 + }, + { + "epoch": 3.7774708410067523, + "grad_norm": 0.24026677012443542, + "learning_rate": 7.150783208229732e-05, + "loss": 1.8156, + "step": 12307 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.25756222009658813, + "learning_rate": 7.150334478191376e-05, + "loss": 1.8204, + "step": 12308 + }, + { + "epoch": 3.778084714548803, + "grad_norm": 0.24917428195476532, + "learning_rate": 
7.149885726902156e-05, + "loss": 1.7867, + "step": 12309 + }, + { + "epoch": 3.7783916513198283, + "grad_norm": 0.26269277930259705, + "learning_rate": 7.149436954366504e-05, + "loss": 1.8233, + "step": 12310 + }, + { + "epoch": 3.7786985880908532, + "grad_norm": 0.2502293586730957, + "learning_rate": 7.148988160588857e-05, + "loss": 1.8329, + "step": 12311 + }, + { + "epoch": 3.7790055248618786, + "grad_norm": 0.24845796823501587, + "learning_rate": 7.14853934557365e-05, + "loss": 1.7936, + "step": 12312 + }, + { + "epoch": 3.7793124616329035, + "grad_norm": 0.2453537881374359, + "learning_rate": 7.148090509325315e-05, + "loss": 1.8149, + "step": 12313 + }, + { + "epoch": 3.779619398403929, + "grad_norm": 0.2336922138929367, + "learning_rate": 7.147641651848293e-05, + "loss": 1.7826, + "step": 12314 + }, + { + "epoch": 3.779926335174954, + "grad_norm": 0.25542667508125305, + "learning_rate": 7.147192773147017e-05, + "loss": 1.801, + "step": 12315 + }, + { + "epoch": 3.780233271945979, + "grad_norm": 0.2301866114139557, + "learning_rate": 7.146743873225923e-05, + "loss": 1.7302, + "step": 12316 + }, + { + "epoch": 3.7805402087170044, + "grad_norm": 0.25821468234062195, + "learning_rate": 7.14629495208945e-05, + "loss": 1.7704, + "step": 12317 + }, + { + "epoch": 3.7808471454880292, + "grad_norm": 0.22537970542907715, + "learning_rate": 7.145846009742029e-05, + "loss": 1.7281, + "step": 12318 + }, + { + "epoch": 3.7811540822590546, + "grad_norm": 0.2565869688987732, + "learning_rate": 7.145397046188102e-05, + "loss": 1.8077, + "step": 12319 + }, + { + "epoch": 3.78146101903008, + "grad_norm": 0.2588396966457367, + "learning_rate": 7.144948061432105e-05, + "loss": 1.7438, + "step": 12320 + }, + { + "epoch": 3.781767955801105, + "grad_norm": 0.2538135349750519, + "learning_rate": 7.144499055478472e-05, + "loss": 1.8253, + "step": 12321 + }, + { + "epoch": 3.78207489257213, + "grad_norm": 0.2272680401802063, + "learning_rate": 7.144050028331644e-05, + "loss": 1.7408, + 
"step": 12322 + }, + { + "epoch": 3.782381829343155, + "grad_norm": 0.25010406970977783, + "learning_rate": 7.143600979996055e-05, + "loss": 1.8219, + "step": 12323 + }, + { + "epoch": 3.7826887661141804, + "grad_norm": 0.2560291290283203, + "learning_rate": 7.143151910476144e-05, + "loss": 1.7734, + "step": 12324 + }, + { + "epoch": 3.7829957028852057, + "grad_norm": 0.24927431344985962, + "learning_rate": 7.142702819776352e-05, + "loss": 1.7682, + "step": 12325 + }, + { + "epoch": 3.783302639656231, + "grad_norm": 0.2501368224620819, + "learning_rate": 7.142253707901114e-05, + "loss": 1.818, + "step": 12326 + }, + { + "epoch": 3.783609576427256, + "grad_norm": 0.3132917284965515, + "learning_rate": 7.141804574854871e-05, + "loss": 1.7793, + "step": 12327 + }, + { + "epoch": 3.7839165131982813, + "grad_norm": 0.24229925870895386, + "learning_rate": 7.141355420642057e-05, + "loss": 1.7585, + "step": 12328 + }, + { + "epoch": 3.784223449969306, + "grad_norm": 0.22612906992435455, + "learning_rate": 7.140906245267116e-05, + "loss": 1.7374, + "step": 12329 + }, + { + "epoch": 3.7845303867403315, + "grad_norm": 0.26354333758354187, + "learning_rate": 7.140457048734482e-05, + "loss": 1.7751, + "step": 12330 + }, + { + "epoch": 3.784837323511357, + "grad_norm": 0.21500451862812042, + "learning_rate": 7.140007831048599e-05, + "loss": 1.7827, + "step": 12331 + }, + { + "epoch": 3.7851442602823817, + "grad_norm": 0.2826332151889801, + "learning_rate": 7.139558592213904e-05, + "loss": 1.7522, + "step": 12332 + }, + { + "epoch": 3.785451197053407, + "grad_norm": 0.3217725455760956, + "learning_rate": 7.139109332234837e-05, + "loss": 1.8758, + "step": 12333 + }, + { + "epoch": 3.785758133824432, + "grad_norm": 0.26934614777565, + "learning_rate": 7.138660051115837e-05, + "loss": 1.8322, + "step": 12334 + }, + { + "epoch": 3.7860650705954573, + "grad_norm": 0.2653827667236328, + "learning_rate": 7.138210748861346e-05, + "loss": 1.7651, + "step": 12335 + }, + { + "epoch": 
3.7863720073664826, + "grad_norm": 0.30470311641693115, + "learning_rate": 7.137761425475802e-05, + "loss": 1.855, + "step": 12336 + }, + { + "epoch": 3.786678944137508, + "grad_norm": 0.2558726370334625, + "learning_rate": 7.137312080963647e-05, + "loss": 1.7174, + "step": 12337 + }, + { + "epoch": 3.786985880908533, + "grad_norm": 0.24025602638721466, + "learning_rate": 7.136862715329322e-05, + "loss": 1.7565, + "step": 12338 + }, + { + "epoch": 3.787292817679558, + "grad_norm": 0.34205392003059387, + "learning_rate": 7.136413328577267e-05, + "loss": 1.8116, + "step": 12339 + }, + { + "epoch": 3.787599754450583, + "grad_norm": 0.4069152772426605, + "learning_rate": 7.135963920711923e-05, + "loss": 1.7662, + "step": 12340 + }, + { + "epoch": 3.7879066912216084, + "grad_norm": 0.3915627598762512, + "learning_rate": 7.13551449173773e-05, + "loss": 1.81, + "step": 12341 + }, + { + "epoch": 3.7882136279926337, + "grad_norm": 0.27136507630348206, + "learning_rate": 7.135065041659134e-05, + "loss": 1.7845, + "step": 12342 + }, + { + "epoch": 3.7885205647636586, + "grad_norm": 0.2924078106880188, + "learning_rate": 7.134615570480572e-05, + "loss": 1.8606, + "step": 12343 + }, + { + "epoch": 3.788827501534684, + "grad_norm": 0.35581526160240173, + "learning_rate": 7.134166078206488e-05, + "loss": 1.7785, + "step": 12344 + }, + { + "epoch": 3.789134438305709, + "grad_norm": 0.3003756105899811, + "learning_rate": 7.133716564841324e-05, + "loss": 1.7321, + "step": 12345 + }, + { + "epoch": 3.789441375076734, + "grad_norm": 0.2586000859737396, + "learning_rate": 7.133267030389524e-05, + "loss": 1.7889, + "step": 12346 + }, + { + "epoch": 3.7897483118477595, + "grad_norm": 0.28053075075149536, + "learning_rate": 7.132817474855527e-05, + "loss": 1.8216, + "step": 12347 + }, + { + "epoch": 3.7900552486187844, + "grad_norm": 0.3064870834350586, + "learning_rate": 7.132367898243777e-05, + "loss": 1.7528, + "step": 12348 + }, + { + "epoch": 3.7903621853898097, + "grad_norm": 
0.3045158386230469, + "learning_rate": 7.131918300558719e-05, + "loss": 1.8251, + "step": 12349 + }, + { + "epoch": 3.7906691221608346, + "grad_norm": 0.2438485324382782, + "learning_rate": 7.131468681804794e-05, + "loss": 1.7505, + "step": 12350 + }, + { + "epoch": 3.79097605893186, + "grad_norm": 0.24239958822727203, + "learning_rate": 7.131019041986447e-05, + "loss": 1.7544, + "step": 12351 + }, + { + "epoch": 3.7912829957028853, + "grad_norm": 0.24632441997528076, + "learning_rate": 7.130569381108121e-05, + "loss": 1.7485, + "step": 12352 + }, + { + "epoch": 3.7915899324739106, + "grad_norm": 0.22553624212741852, + "learning_rate": 7.13011969917426e-05, + "loss": 1.803, + "step": 12353 + }, + { + "epoch": 3.7918968692449355, + "grad_norm": 0.2164420485496521, + "learning_rate": 7.129669996189306e-05, + "loss": 1.7307, + "step": 12354 + }, + { + "epoch": 3.792203806015961, + "grad_norm": 0.25104281306266785, + "learning_rate": 7.129220272157705e-05, + "loss": 1.8154, + "step": 12355 + }, + { + "epoch": 3.7925107427869857, + "grad_norm": 0.25533202290534973, + "learning_rate": 7.128770527083903e-05, + "loss": 1.8046, + "step": 12356 + }, + { + "epoch": 3.792817679558011, + "grad_norm": 0.24428130686283112, + "learning_rate": 7.128320760972341e-05, + "loss": 1.7984, + "step": 12357 + }, + { + "epoch": 3.7931246163290364, + "grad_norm": 0.2366408109664917, + "learning_rate": 7.127870973827467e-05, + "loss": 1.7781, + "step": 12358 + }, + { + "epoch": 3.7934315531000613, + "grad_norm": 0.2558888792991638, + "learning_rate": 7.127421165653722e-05, + "loss": 1.7858, + "step": 12359 + }, + { + "epoch": 3.7937384898710866, + "grad_norm": 0.25825443863868713, + "learning_rate": 7.126971336455558e-05, + "loss": 1.8292, + "step": 12360 + }, + { + "epoch": 3.7940454266421115, + "grad_norm": 0.2554624080657959, + "learning_rate": 7.126521486237415e-05, + "loss": 1.822, + "step": 12361 + }, + { + "epoch": 3.794352363413137, + "grad_norm": 0.3030763268470764, + 
"learning_rate": 7.126071615003742e-05, + "loss": 1.8261, + "step": 12362 + }, + { + "epoch": 3.794659300184162, + "grad_norm": 0.3047907054424286, + "learning_rate": 7.125621722758981e-05, + "loss": 1.8419, + "step": 12363 + }, + { + "epoch": 3.794966236955187, + "grad_norm": 0.27782654762268066, + "learning_rate": 7.12517180950758e-05, + "loss": 1.7959, + "step": 12364 + }, + { + "epoch": 3.7952731737262124, + "grad_norm": 0.24526572227478027, + "learning_rate": 7.124721875253986e-05, + "loss": 1.7313, + "step": 12365 + }, + { + "epoch": 3.7955801104972373, + "grad_norm": 0.23718179762363434, + "learning_rate": 7.124271920002646e-05, + "loss": 1.7479, + "step": 12366 + }, + { + "epoch": 3.7958870472682626, + "grad_norm": 0.2880019247531891, + "learning_rate": 7.123821943758004e-05, + "loss": 1.7792, + "step": 12367 + }, + { + "epoch": 3.796193984039288, + "grad_norm": 0.28923723101615906, + "learning_rate": 7.123371946524511e-05, + "loss": 1.7474, + "step": 12368 + }, + { + "epoch": 3.7965009208103133, + "grad_norm": 0.2281525880098343, + "learning_rate": 7.122921928306612e-05, + "loss": 1.8106, + "step": 12369 + }, + { + "epoch": 3.796807857581338, + "grad_norm": 0.34825438261032104, + "learning_rate": 7.122471889108752e-05, + "loss": 1.8076, + "step": 12370 + }, + { + "epoch": 3.7971147943523635, + "grad_norm": 0.41145995259284973, + "learning_rate": 7.122021828935382e-05, + "loss": 1.7692, + "step": 12371 + }, + { + "epoch": 3.7974217311233884, + "grad_norm": 0.31711262464523315, + "learning_rate": 7.12157174779095e-05, + "loss": 1.8101, + "step": 12372 + }, + { + "epoch": 3.7977286678944138, + "grad_norm": 0.3044308125972748, + "learning_rate": 7.1211216456799e-05, + "loss": 1.8238, + "step": 12373 + }, + { + "epoch": 3.798035604665439, + "grad_norm": 0.3750055134296417, + "learning_rate": 7.120671522606683e-05, + "loss": 1.7323, + "step": 12374 + }, + { + "epoch": 3.798342541436464, + "grad_norm": 0.38852599263191223, + "learning_rate": 
7.120221378575749e-05, + "loss": 1.8402, + "step": 12375 + }, + { + "epoch": 3.7986494782074893, + "grad_norm": 0.3430371582508087, + "learning_rate": 7.119771213591541e-05, + "loss": 1.8369, + "step": 12376 + }, + { + "epoch": 3.798956414978514, + "grad_norm": 0.4787428677082062, + "learning_rate": 7.119321027658515e-05, + "loss": 1.7977, + "step": 12377 + }, + { + "epoch": 3.7992633517495396, + "grad_norm": 0.4263977110385895, + "learning_rate": 7.118870820781114e-05, + "loss": 1.8208, + "step": 12378 + }, + { + "epoch": 3.799570288520565, + "grad_norm": 0.28649669885635376, + "learning_rate": 7.118420592963793e-05, + "loss": 1.773, + "step": 12379 + }, + { + "epoch": 3.7998772252915898, + "grad_norm": 0.26070261001586914, + "learning_rate": 7.117970344210996e-05, + "loss": 1.6866, + "step": 12380 + }, + { + "epoch": 3.800184162062615, + "grad_norm": 0.30127593874931335, + "learning_rate": 7.117520074527173e-05, + "loss": 1.7208, + "step": 12381 + }, + { + "epoch": 3.80049109883364, + "grad_norm": 0.23639258742332458, + "learning_rate": 7.117069783916777e-05, + "loss": 1.7504, + "step": 12382 + }, + { + "epoch": 3.8007980356046653, + "grad_norm": 0.2852858901023865, + "learning_rate": 7.116619472384256e-05, + "loss": 1.7954, + "step": 12383 + }, + { + "epoch": 3.8011049723756907, + "grad_norm": 0.2673225998878479, + "learning_rate": 7.116169139934063e-05, + "loss": 1.7562, + "step": 12384 + }, + { + "epoch": 3.801411909146716, + "grad_norm": 0.21615394949913025, + "learning_rate": 7.115718786570644e-05, + "loss": 1.7126, + "step": 12385 + }, + { + "epoch": 3.801718845917741, + "grad_norm": 0.2165435254573822, + "learning_rate": 7.115268412298453e-05, + "loss": 1.7171, + "step": 12386 + }, + { + "epoch": 3.8020257826887662, + "grad_norm": 0.280564546585083, + "learning_rate": 7.114818017121939e-05, + "loss": 1.7711, + "step": 12387 + }, + { + "epoch": 3.802332719459791, + "grad_norm": 0.3023521304130554, + "learning_rate": 7.114367601045555e-05, + "loss": 1.7538, 
+ "step": 12388 + }, + { + "epoch": 3.8026396562308165, + "grad_norm": 0.27252480387687683, + "learning_rate": 7.11391716407375e-05, + "loss": 1.7604, + "step": 12389 + }, + { + "epoch": 3.802946593001842, + "grad_norm": 0.2122909128665924, + "learning_rate": 7.113466706210976e-05, + "loss": 1.716, + "step": 12390 + }, + { + "epoch": 3.8032535297728667, + "grad_norm": 0.30141574144363403, + "learning_rate": 7.113016227461686e-05, + "loss": 1.7636, + "step": 12391 + }, + { + "epoch": 3.803560466543892, + "grad_norm": 0.33359697461128235, + "learning_rate": 7.112565727830331e-05, + "loss": 1.7805, + "step": 12392 + }, + { + "epoch": 3.803867403314917, + "grad_norm": 0.3161376714706421, + "learning_rate": 7.112115207321364e-05, + "loss": 1.7974, + "step": 12393 + }, + { + "epoch": 3.8041743400859422, + "grad_norm": 0.29028698801994324, + "learning_rate": 7.111664665939235e-05, + "loss": 1.83, + "step": 12394 + }, + { + "epoch": 3.8044812768569676, + "grad_norm": 0.38829556107521057, + "learning_rate": 7.1112141036884e-05, + "loss": 1.8684, + "step": 12395 + }, + { + "epoch": 3.804788213627993, + "grad_norm": 0.4118283987045288, + "learning_rate": 7.110763520573309e-05, + "loss": 1.7812, + "step": 12396 + }, + { + "epoch": 3.805095150399018, + "grad_norm": 0.3907717168331146, + "learning_rate": 7.110312916598416e-05, + "loss": 1.7789, + "step": 12397 + }, + { + "epoch": 3.805402087170043, + "grad_norm": 0.2768644690513611, + "learning_rate": 7.109862291768173e-05, + "loss": 1.8575, + "step": 12398 + }, + { + "epoch": 3.805709023941068, + "grad_norm": 0.3234006464481354, + "learning_rate": 7.109411646087035e-05, + "loss": 1.7485, + "step": 12399 + }, + { + "epoch": 3.8060159607120934, + "grad_norm": 0.415475994348526, + "learning_rate": 7.108960979559454e-05, + "loss": 1.7363, + "step": 12400 + }, + { + "epoch": 3.8063228974831187, + "grad_norm": 0.38654613494873047, + "learning_rate": 7.108510292189884e-05, + "loss": 1.7907, + "step": 12401 + }, + { + "epoch": 
3.8066298342541436, + "grad_norm": 0.2541481852531433, + "learning_rate": 7.10805958398278e-05, + "loss": 1.8458, + "step": 12402 + }, + { + "epoch": 3.806936771025169, + "grad_norm": 0.32562851905822754, + "learning_rate": 7.107608854942597e-05, + "loss": 1.7989, + "step": 12403 + }, + { + "epoch": 3.807243707796194, + "grad_norm": 0.3628395199775696, + "learning_rate": 7.107158105073786e-05, + "loss": 1.8044, + "step": 12404 + }, + { + "epoch": 3.807550644567219, + "grad_norm": 0.3363969027996063, + "learning_rate": 7.106707334380805e-05, + "loss": 1.8078, + "step": 12405 + }, + { + "epoch": 3.8078575813382445, + "grad_norm": 0.2853989601135254, + "learning_rate": 7.106256542868108e-05, + "loss": 1.7913, + "step": 12406 + }, + { + "epoch": 3.8081645181092694, + "grad_norm": 0.33455806970596313, + "learning_rate": 7.105805730540148e-05, + "loss": 1.7252, + "step": 12407 + }, + { + "epoch": 3.8084714548802947, + "grad_norm": 0.28103405237197876, + "learning_rate": 7.105354897401382e-05, + "loss": 1.6942, + "step": 12408 + }, + { + "epoch": 3.8087783916513196, + "grad_norm": 0.23230718076229095, + "learning_rate": 7.104904043456264e-05, + "loss": 1.7723, + "step": 12409 + }, + { + "epoch": 3.809085328422345, + "grad_norm": 0.2883053421974182, + "learning_rate": 7.104453168709251e-05, + "loss": 1.8015, + "step": 12410 + }, + { + "epoch": 3.8093922651933703, + "grad_norm": 0.28462252020835876, + "learning_rate": 7.104002273164798e-05, + "loss": 1.791, + "step": 12411 + }, + { + "epoch": 3.8096992019643956, + "grad_norm": 0.3004699647426605, + "learning_rate": 7.103551356827363e-05, + "loss": 1.8401, + "step": 12412 + }, + { + "epoch": 3.8100061387354205, + "grad_norm": 0.2546156048774719, + "learning_rate": 7.1031004197014e-05, + "loss": 1.7645, + "step": 12413 + }, + { + "epoch": 3.810313075506446, + "grad_norm": 0.24532915651798248, + "learning_rate": 7.102649461791364e-05, + "loss": 1.8, + "step": 12414 + }, + { + "epoch": 3.8106200122774707, + "grad_norm": 
0.2432405799627304, + "learning_rate": 7.102198483101716e-05, + "loss": 1.7957, + "step": 12415 + }, + { + "epoch": 3.810926949048496, + "grad_norm": 0.24405215680599213, + "learning_rate": 7.101747483636908e-05, + "loss": 1.79, + "step": 12416 + }, + { + "epoch": 3.8112338858195214, + "grad_norm": 0.29519838094711304, + "learning_rate": 7.101296463401401e-05, + "loss": 1.8087, + "step": 12417 + }, + { + "epoch": 3.8115408225905463, + "grad_norm": 0.28205612301826477, + "learning_rate": 7.100845422399652e-05, + "loss": 1.7897, + "step": 12418 + }, + { + "epoch": 3.8118477593615716, + "grad_norm": 0.25014567375183105, + "learning_rate": 7.100394360636115e-05, + "loss": 1.7574, + "step": 12419 + }, + { + "epoch": 3.8121546961325965, + "grad_norm": 0.3133499026298523, + "learning_rate": 7.099943278115251e-05, + "loss": 1.7957, + "step": 12420 + }, + { + "epoch": 3.812461632903622, + "grad_norm": 0.3706473708152771, + "learning_rate": 7.099492174841516e-05, + "loss": 1.8519, + "step": 12421 + }, + { + "epoch": 3.812768569674647, + "grad_norm": 0.30085715651512146, + "learning_rate": 7.09904105081937e-05, + "loss": 1.778, + "step": 12422 + }, + { + "epoch": 3.813075506445672, + "grad_norm": 0.23897981643676758, + "learning_rate": 7.09858990605327e-05, + "loss": 1.7289, + "step": 12423 + }, + { + "epoch": 3.8133824432166974, + "grad_norm": 0.30046290159225464, + "learning_rate": 7.098138740547673e-05, + "loss": 1.8838, + "step": 12424 + }, + { + "epoch": 3.8136893799877223, + "grad_norm": 0.32126328349113464, + "learning_rate": 7.097687554307041e-05, + "loss": 1.7916, + "step": 12425 + }, + { + "epoch": 3.8139963167587476, + "grad_norm": 0.2922256886959076, + "learning_rate": 7.097236347335829e-05, + "loss": 1.8305, + "step": 12426 + }, + { + "epoch": 3.814303253529773, + "grad_norm": 0.2772706151008606, + "learning_rate": 7.0967851196385e-05, + "loss": 1.7694, + "step": 12427 + }, + { + "epoch": 3.8146101903007983, + "grad_norm": 0.25763455033302307, + "learning_rate": 
7.096333871219511e-05, + "loss": 1.8716, + "step": 12428 + }, + { + "epoch": 3.814917127071823, + "grad_norm": 0.2631739377975464, + "learning_rate": 7.095882602083322e-05, + "loss": 1.7771, + "step": 12429 + }, + { + "epoch": 3.8152240638428485, + "grad_norm": 0.29229632019996643, + "learning_rate": 7.095431312234392e-05, + "loss": 1.7865, + "step": 12430 + }, + { + "epoch": 3.8155310006138734, + "grad_norm": 0.2672729790210724, + "learning_rate": 7.094980001677181e-05, + "loss": 1.7848, + "step": 12431 + }, + { + "epoch": 3.8158379373848987, + "grad_norm": 0.2388373166322708, + "learning_rate": 7.094528670416152e-05, + "loss": 1.75, + "step": 12432 + }, + { + "epoch": 3.816144874155924, + "grad_norm": 0.2385305017232895, + "learning_rate": 7.094077318455762e-05, + "loss": 1.748, + "step": 12433 + }, + { + "epoch": 3.816451810926949, + "grad_norm": 0.25421401858329773, + "learning_rate": 7.093625945800471e-05, + "loss": 1.779, + "step": 12434 + }, + { + "epoch": 3.8167587476979743, + "grad_norm": 0.2785158157348633, + "learning_rate": 7.093174552454743e-05, + "loss": 1.8295, + "step": 12435 + }, + { + "epoch": 3.817065684468999, + "grad_norm": 0.2907472252845764, + "learning_rate": 7.092723138423036e-05, + "loss": 1.8216, + "step": 12436 + }, + { + "epoch": 3.8173726212400245, + "grad_norm": 0.253955215215683, + "learning_rate": 7.092271703709814e-05, + "loss": 1.8394, + "step": 12437 + }, + { + "epoch": 3.81767955801105, + "grad_norm": 0.32139912247657776, + "learning_rate": 7.091820248319537e-05, + "loss": 1.8634, + "step": 12438 + }, + { + "epoch": 3.8179864947820747, + "grad_norm": 0.25890466570854187, + "learning_rate": 7.091368772256664e-05, + "loss": 1.7336, + "step": 12439 + }, + { + "epoch": 3.8182934315531, + "grad_norm": 0.2823775112628937, + "learning_rate": 7.090917275525661e-05, + "loss": 1.7927, + "step": 12440 + }, + { + "epoch": 3.818600368324125, + "grad_norm": 0.28739333152770996, + "learning_rate": 7.090465758130988e-05, + "loss": 1.7807, + 
"step": 12441 + }, + { + "epoch": 3.8189073050951503, + "grad_norm": 0.36823949217796326, + "learning_rate": 7.090014220077106e-05, + "loss": 1.7288, + "step": 12442 + }, + { + "epoch": 3.8192142418661756, + "grad_norm": 0.3061312735080719, + "learning_rate": 7.089562661368479e-05, + "loss": 1.8039, + "step": 12443 + }, + { + "epoch": 3.819521178637201, + "grad_norm": 0.25867924094200134, + "learning_rate": 7.089111082009569e-05, + "loss": 1.7678, + "step": 12444 + }, + { + "epoch": 3.819828115408226, + "grad_norm": 0.26834985613822937, + "learning_rate": 7.088659482004837e-05, + "loss": 1.7592, + "step": 12445 + }, + { + "epoch": 3.820135052179251, + "grad_norm": 0.25608211755752563, + "learning_rate": 7.08820786135875e-05, + "loss": 1.7622, + "step": 12446 + }, + { + "epoch": 3.820441988950276, + "grad_norm": 0.2512456774711609, + "learning_rate": 7.087756220075769e-05, + "loss": 1.7648, + "step": 12447 + }, + { + "epoch": 3.8207489257213014, + "grad_norm": 0.2434878647327423, + "learning_rate": 7.087304558160355e-05, + "loss": 1.7435, + "step": 12448 + }, + { + "epoch": 3.8210558624923268, + "grad_norm": 0.26456570625305176, + "learning_rate": 7.086852875616978e-05, + "loss": 1.7342, + "step": 12449 + }, + { + "epoch": 3.8213627992633517, + "grad_norm": 0.2958984971046448, + "learning_rate": 7.086401172450095e-05, + "loss": 1.8532, + "step": 12450 + }, + { + "epoch": 3.821669736034377, + "grad_norm": 0.25939157605171204, + "learning_rate": 7.085949448664172e-05, + "loss": 1.7746, + "step": 12451 + }, + { + "epoch": 3.821976672805402, + "grad_norm": 0.2210223525762558, + "learning_rate": 7.085497704263675e-05, + "loss": 1.7745, + "step": 12452 + }, + { + "epoch": 3.822283609576427, + "grad_norm": 0.2409319430589676, + "learning_rate": 7.085045939253068e-05, + "loss": 1.7981, + "step": 12453 + }, + { + "epoch": 3.8225905463474525, + "grad_norm": 0.26331812143325806, + "learning_rate": 7.084594153636815e-05, + "loss": 1.8163, + "step": 12454 + }, + { + "epoch": 
3.8228974831184774, + "grad_norm": 0.2613828480243683, + "learning_rate": 7.08414234741938e-05, + "loss": 1.8362, + "step": 12455 + }, + { + "epoch": 3.8232044198895028, + "grad_norm": 0.3139529228210449, + "learning_rate": 7.083690520605228e-05, + "loss": 1.8247, + "step": 12456 + }, + { + "epoch": 3.8235113566605277, + "grad_norm": 0.2958570718765259, + "learning_rate": 7.083238673198826e-05, + "loss": 1.8011, + "step": 12457 + }, + { + "epoch": 3.823818293431553, + "grad_norm": 0.2517626881599426, + "learning_rate": 7.082786805204639e-05, + "loss": 1.7353, + "step": 12458 + }, + { + "epoch": 3.8241252302025783, + "grad_norm": 0.2443888783454895, + "learning_rate": 7.082334916627132e-05, + "loss": 1.7916, + "step": 12459 + }, + { + "epoch": 3.8244321669736037, + "grad_norm": 0.283514142036438, + "learning_rate": 7.08188300747077e-05, + "loss": 1.8048, + "step": 12460 + }, + { + "epoch": 3.8247391037446286, + "grad_norm": 0.24775351583957672, + "learning_rate": 7.08143107774002e-05, + "loss": 1.8145, + "step": 12461 + }, + { + "epoch": 3.825046040515654, + "grad_norm": 0.27904003858566284, + "learning_rate": 7.080979127439347e-05, + "loss": 1.8003, + "step": 12462 + }, + { + "epoch": 3.825352977286679, + "grad_norm": 0.24997512996196747, + "learning_rate": 7.08052715657322e-05, + "loss": 1.7962, + "step": 12463 + }, + { + "epoch": 3.825659914057704, + "grad_norm": 0.25874343514442444, + "learning_rate": 7.080075165146104e-05, + "loss": 1.7861, + "step": 12464 + }, + { + "epoch": 3.8259668508287294, + "grad_norm": 0.2964434027671814, + "learning_rate": 7.079623153162467e-05, + "loss": 1.7618, + "step": 12465 + }, + { + "epoch": 3.8262737875997543, + "grad_norm": 0.26403337717056274, + "learning_rate": 7.079171120626774e-05, + "loss": 1.8016, + "step": 12466 + }, + { + "epoch": 3.8265807243707797, + "grad_norm": 0.28369295597076416, + "learning_rate": 7.078719067543494e-05, + "loss": 1.7517, + "step": 12467 + }, + { + "epoch": 3.8268876611418046, + "grad_norm": 
0.254312127828598, + "learning_rate": 7.078266993917093e-05, + "loss": 1.8085, + "step": 12468 + }, + { + "epoch": 3.82719459791283, + "grad_norm": 0.24992622435092926, + "learning_rate": 7.077814899752038e-05, + "loss": 1.7657, + "step": 12469 + }, + { + "epoch": 3.8275015346838552, + "grad_norm": 0.26485762000083923, + "learning_rate": 7.077362785052802e-05, + "loss": 1.7303, + "step": 12470 + }, + { + "epoch": 3.8278084714548806, + "grad_norm": 0.29864901304244995, + "learning_rate": 7.076910649823846e-05, + "loss": 1.7734, + "step": 12471 + }, + { + "epoch": 3.8281154082259055, + "grad_norm": 0.2973599433898926, + "learning_rate": 7.076458494069644e-05, + "loss": 1.8055, + "step": 12472 + }, + { + "epoch": 3.828422344996931, + "grad_norm": 0.2150362730026245, + "learning_rate": 7.07600631779466e-05, + "loss": 1.7377, + "step": 12473 + }, + { + "epoch": 3.8287292817679557, + "grad_norm": 0.26443010568618774, + "learning_rate": 7.075554121003367e-05, + "loss": 1.837, + "step": 12474 + }, + { + "epoch": 3.829036218538981, + "grad_norm": 0.27365007996559143, + "learning_rate": 7.075101903700231e-05, + "loss": 1.7784, + "step": 12475 + }, + { + "epoch": 3.8293431553100064, + "grad_norm": 0.22037263214588165, + "learning_rate": 7.074649665889721e-05, + "loss": 1.8182, + "step": 12476 + }, + { + "epoch": 3.8296500920810312, + "grad_norm": 0.29614946246147156, + "learning_rate": 7.074197407576308e-05, + "loss": 1.7993, + "step": 12477 + }, + { + "epoch": 3.8299570288520566, + "grad_norm": 0.25135520100593567, + "learning_rate": 7.07374512876446e-05, + "loss": 1.8211, + "step": 12478 + }, + { + "epoch": 3.8302639656230815, + "grad_norm": 0.2711503207683563, + "learning_rate": 7.073292829458645e-05, + "loss": 1.8274, + "step": 12479 + }, + { + "epoch": 3.830570902394107, + "grad_norm": 0.38659265637397766, + "learning_rate": 7.072840509663338e-05, + "loss": 1.796, + "step": 12480 + }, + { + "epoch": 3.830877839165132, + "grad_norm": 0.39382728934288025, + 
"learning_rate": 7.072388169383005e-05, + "loss": 1.8439, + "step": 12481 + }, + { + "epoch": 3.831184775936157, + "grad_norm": 0.27570033073425293, + "learning_rate": 7.071935808622118e-05, + "loss": 1.8155, + "step": 12482 + }, + { + "epoch": 3.8314917127071824, + "grad_norm": 0.29054465889930725, + "learning_rate": 7.071483427385147e-05, + "loss": 1.754, + "step": 12483 + }, + { + "epoch": 3.8317986494782073, + "grad_norm": 0.4138031303882599, + "learning_rate": 7.071031025676562e-05, + "loss": 1.7686, + "step": 12484 + }, + { + "epoch": 3.8321055862492326, + "grad_norm": 0.3447251617908478, + "learning_rate": 7.070578603500833e-05, + "loss": 1.8135, + "step": 12485 + }, + { + "epoch": 3.832412523020258, + "grad_norm": 0.265115886926651, + "learning_rate": 7.070126160862436e-05, + "loss": 1.803, + "step": 12486 + }, + { + "epoch": 3.8327194597912833, + "grad_norm": 0.4288817346096039, + "learning_rate": 7.069673697765837e-05, + "loss": 1.7814, + "step": 12487 + }, + { + "epoch": 3.833026396562308, + "grad_norm": 0.4890103340148926, + "learning_rate": 7.06922121421551e-05, + "loss": 1.8318, + "step": 12488 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.3676142990589142, + "learning_rate": 7.068768710215928e-05, + "loss": 1.7792, + "step": 12489 + }, + { + "epoch": 3.8336402701043584, + "grad_norm": 0.23254090547561646, + "learning_rate": 7.068316185771557e-05, + "loss": 1.7154, + "step": 12490 + }, + { + "epoch": 3.8339472068753837, + "grad_norm": 0.35014036297798157, + "learning_rate": 7.067863640886876e-05, + "loss": 1.7031, + "step": 12491 + }, + { + "epoch": 3.834254143646409, + "grad_norm": 0.32155317068099976, + "learning_rate": 7.067411075566353e-05, + "loss": 1.7692, + "step": 12492 + }, + { + "epoch": 3.834561080417434, + "grad_norm": 0.260772705078125, + "learning_rate": 7.066958489814463e-05, + "loss": 1.7488, + "step": 12493 + }, + { + "epoch": 3.8348680171884593, + "grad_norm": 0.2624910771846771, + "learning_rate": 7.066505883635678e-05, + 
"loss": 1.7436, + "step": 12494 + }, + { + "epoch": 3.835174953959484, + "grad_norm": 0.2782299220561981, + "learning_rate": 7.066053257034471e-05, + "loss": 1.8219, + "step": 12495 + }, + { + "epoch": 3.8354818907305095, + "grad_norm": 0.2749497890472412, + "learning_rate": 7.065600610015312e-05, + "loss": 1.8068, + "step": 12496 + }, + { + "epoch": 3.835788827501535, + "grad_norm": 0.2730359733104706, + "learning_rate": 7.06514794258268e-05, + "loss": 1.7588, + "step": 12497 + }, + { + "epoch": 3.8360957642725597, + "grad_norm": 0.3606291711330414, + "learning_rate": 7.064695254741044e-05, + "loss": 1.8509, + "step": 12498 + }, + { + "epoch": 3.836402701043585, + "grad_norm": 0.23282989859580994, + "learning_rate": 7.064242546494879e-05, + "loss": 1.7444, + "step": 12499 + }, + { + "epoch": 3.83670963781461, + "grad_norm": 0.2554507255554199, + "learning_rate": 7.06378981784866e-05, + "loss": 1.7486, + "step": 12500 + }, + { + "epoch": 3.8370165745856353, + "grad_norm": 0.2916143834590912, + "learning_rate": 7.06333706880686e-05, + "loss": 1.8035, + "step": 12501 + }, + { + "epoch": 3.8373235113566606, + "grad_norm": 0.23719090223312378, + "learning_rate": 7.062884299373955e-05, + "loss": 1.7896, + "step": 12502 + }, + { + "epoch": 3.837630448127686, + "grad_norm": 0.2596152126789093, + "learning_rate": 7.062431509554417e-05, + "loss": 1.7944, + "step": 12503 + }, + { + "epoch": 3.837937384898711, + "grad_norm": 0.29140764474868774, + "learning_rate": 7.061978699352723e-05, + "loss": 1.7988, + "step": 12504 + }, + { + "epoch": 3.838244321669736, + "grad_norm": 0.3421068489551544, + "learning_rate": 7.061525868773347e-05, + "loss": 1.751, + "step": 12505 + }, + { + "epoch": 3.838551258440761, + "grad_norm": 0.2705349624156952, + "learning_rate": 7.061073017820764e-05, + "loss": 1.7578, + "step": 12506 + }, + { + "epoch": 3.8388581952117864, + "grad_norm": 0.2403286248445511, + "learning_rate": 7.060620146499448e-05, + "loss": 1.8422, + "step": 12507 + }, + { + 
"epoch": 3.8391651319828117, + "grad_norm": 0.3860442042350769, + "learning_rate": 7.060167254813876e-05, + "loss": 1.8168, + "step": 12508 + }, + { + "epoch": 3.8394720687538366, + "grad_norm": 0.4729512631893158, + "learning_rate": 7.059714342768526e-05, + "loss": 1.7786, + "step": 12509 + }, + { + "epoch": 3.839779005524862, + "grad_norm": 0.3522968888282776, + "learning_rate": 7.059261410367871e-05, + "loss": 1.8749, + "step": 12510 + }, + { + "epoch": 3.840085942295887, + "grad_norm": 0.28071436285972595, + "learning_rate": 7.058808457616386e-05, + "loss": 1.7959, + "step": 12511 + }, + { + "epoch": 3.840392879066912, + "grad_norm": 0.4356439411640167, + "learning_rate": 7.05835548451855e-05, + "loss": 1.8045, + "step": 12512 + }, + { + "epoch": 3.8406998158379375, + "grad_norm": 0.4051562249660492, + "learning_rate": 7.057902491078839e-05, + "loss": 1.7909, + "step": 12513 + }, + { + "epoch": 3.8410067526089624, + "grad_norm": 0.2817205488681793, + "learning_rate": 7.057449477301728e-05, + "loss": 1.8736, + "step": 12514 + }, + { + "epoch": 3.8413136893799877, + "grad_norm": 0.33369559049606323, + "learning_rate": 7.056996443191697e-05, + "loss": 1.7799, + "step": 12515 + }, + { + "epoch": 3.8416206261510126, + "grad_norm": 0.369954913854599, + "learning_rate": 7.056543388753221e-05, + "loss": 1.795, + "step": 12516 + }, + { + "epoch": 3.841927562922038, + "grad_norm": 0.289474755525589, + "learning_rate": 7.056090313990778e-05, + "loss": 1.786, + "step": 12517 + }, + { + "epoch": 3.8422344996930633, + "grad_norm": 0.2431849092245102, + "learning_rate": 7.055637218908845e-05, + "loss": 1.7363, + "step": 12518 + }, + { + "epoch": 3.8425414364640886, + "grad_norm": 0.3736060857772827, + "learning_rate": 7.0551841035119e-05, + "loss": 1.8234, + "step": 12519 + }, + { + "epoch": 3.8428483732351135, + "grad_norm": 0.34008854627609253, + "learning_rate": 7.054730967804422e-05, + "loss": 1.8001, + "step": 12520 + }, + { + "epoch": 3.843155310006139, + "grad_norm": 
0.24852876365184784, + "learning_rate": 7.054277811790887e-05, + "loss": 1.8298, + "step": 12521 + }, + { + "epoch": 3.8434622467771637, + "grad_norm": 0.3491046726703644, + "learning_rate": 7.053824635475777e-05, + "loss": 1.7336, + "step": 12522 + }, + { + "epoch": 3.843769183548189, + "grad_norm": 0.38757824897766113, + "learning_rate": 7.053371438863566e-05, + "loss": 1.8241, + "step": 12523 + }, + { + "epoch": 3.8440761203192144, + "grad_norm": 0.2607647180557251, + "learning_rate": 7.052918221958735e-05, + "loss": 1.7813, + "step": 12524 + }, + { + "epoch": 3.8443830570902393, + "grad_norm": 0.25634410977363586, + "learning_rate": 7.052464984765764e-05, + "loss": 1.7836, + "step": 12525 + }, + { + "epoch": 3.8446899938612646, + "grad_norm": 0.3113503158092499, + "learning_rate": 7.052011727289129e-05, + "loss": 1.8477, + "step": 12526 + }, + { + "epoch": 3.8449969306322895, + "grad_norm": 0.2852596044540405, + "learning_rate": 7.051558449533313e-05, + "loss": 1.7607, + "step": 12527 + }, + { + "epoch": 3.845303867403315, + "grad_norm": 0.24841541051864624, + "learning_rate": 7.051105151502795e-05, + "loss": 1.8109, + "step": 12528 + }, + { + "epoch": 3.84561080417434, + "grad_norm": 0.2231549620628357, + "learning_rate": 7.050651833202053e-05, + "loss": 1.7245, + "step": 12529 + }, + { + "epoch": 3.845917740945365, + "grad_norm": 0.21975892782211304, + "learning_rate": 7.050198494635566e-05, + "loss": 1.7512, + "step": 12530 + }, + { + "epoch": 3.8462246777163904, + "grad_norm": 0.2546280324459076, + "learning_rate": 7.049745135807816e-05, + "loss": 1.8003, + "step": 12531 + }, + { + "epoch": 3.8465316144874153, + "grad_norm": 0.21507929265499115, + "learning_rate": 7.049291756723284e-05, + "loss": 1.7616, + "step": 12532 + }, + { + "epoch": 3.8468385512584407, + "grad_norm": 0.24927987158298492, + "learning_rate": 7.04883835738645e-05, + "loss": 1.7519, + "step": 12533 + }, + { + "epoch": 3.847145488029466, + "grad_norm": 0.24988602101802826, + 
"learning_rate": 7.048384937801793e-05, + "loss": 1.7966, + "step": 12534 + }, + { + "epoch": 3.8474524248004913, + "grad_norm": 0.24039845168590546, + "learning_rate": 7.047931497973798e-05, + "loss": 1.7834, + "step": 12535 + }, + { + "epoch": 3.847759361571516, + "grad_norm": 0.22826696932315826, + "learning_rate": 7.047478037906943e-05, + "loss": 1.7334, + "step": 12536 + }, + { + "epoch": 3.8480662983425415, + "grad_norm": 0.22260744869709015, + "learning_rate": 7.047024557605708e-05, + "loss": 1.787, + "step": 12537 + }, + { + "epoch": 3.8483732351135664, + "grad_norm": 0.2457917332649231, + "learning_rate": 7.046571057074578e-05, + "loss": 1.7865, + "step": 12538 + }, + { + "epoch": 3.8486801718845918, + "grad_norm": 0.23952928185462952, + "learning_rate": 7.046117536318035e-05, + "loss": 1.7764, + "step": 12539 + }, + { + "epoch": 3.848987108655617, + "grad_norm": 0.22186748683452606, + "learning_rate": 7.045663995340557e-05, + "loss": 1.7917, + "step": 12540 + }, + { + "epoch": 3.849294045426642, + "grad_norm": 0.24234962463378906, + "learning_rate": 7.045210434146629e-05, + "loss": 1.7697, + "step": 12541 + }, + { + "epoch": 3.8496009821976673, + "grad_norm": 0.2510770857334137, + "learning_rate": 7.044756852740732e-05, + "loss": 1.8012, + "step": 12542 + }, + { + "epoch": 3.849907918968692, + "grad_norm": 0.24910703301429749, + "learning_rate": 7.044303251127349e-05, + "loss": 1.831, + "step": 12543 + }, + { + "epoch": 3.8502148557397176, + "grad_norm": 0.3159966468811035, + "learning_rate": 7.043849629310964e-05, + "loss": 1.8029, + "step": 12544 + }, + { + "epoch": 3.850521792510743, + "grad_norm": 0.3155403733253479, + "learning_rate": 7.04339598729606e-05, + "loss": 1.7429, + "step": 12545 + }, + { + "epoch": 3.8508287292817682, + "grad_norm": 0.3037515878677368, + "learning_rate": 7.042942325087117e-05, + "loss": 1.8186, + "step": 12546 + }, + { + "epoch": 3.851135666052793, + "grad_norm": 0.2319766730070114, + "learning_rate": 
7.042488642688621e-05, + "loss": 1.7853, + "step": 12547 + }, + { + "epoch": 3.8514426028238185, + "grad_norm": 0.23911969363689423, + "learning_rate": 7.042034940105055e-05, + "loss": 1.8314, + "step": 12548 + }, + { + "epoch": 3.8517495395948433, + "grad_norm": 0.2541846036911011, + "learning_rate": 7.041581217340905e-05, + "loss": 1.8289, + "step": 12549 + }, + { + "epoch": 3.8520564763658687, + "grad_norm": 0.22234943509101868, + "learning_rate": 7.04112747440065e-05, + "loss": 1.7847, + "step": 12550 + }, + { + "epoch": 3.852363413136894, + "grad_norm": 0.2747870981693268, + "learning_rate": 7.04067371128878e-05, + "loss": 1.7875, + "step": 12551 + }, + { + "epoch": 3.852670349907919, + "grad_norm": 0.28589147329330444, + "learning_rate": 7.040219928009775e-05, + "loss": 1.7289, + "step": 12552 + }, + { + "epoch": 3.8529772866789442, + "grad_norm": 0.21180351078510284, + "learning_rate": 7.039766124568119e-05, + "loss": 1.7611, + "step": 12553 + }, + { + "epoch": 3.853284223449969, + "grad_norm": 0.27751782536506653, + "learning_rate": 7.0393123009683e-05, + "loss": 1.7481, + "step": 12554 + }, + { + "epoch": 3.8535911602209945, + "grad_norm": 0.32883307337760925, + "learning_rate": 7.038858457214802e-05, + "loss": 1.7271, + "step": 12555 + }, + { + "epoch": 3.85389809699202, + "grad_norm": 0.30965641140937805, + "learning_rate": 7.03840459331211e-05, + "loss": 1.81, + "step": 12556 + }, + { + "epoch": 3.8542050337630447, + "grad_norm": 0.25184348225593567, + "learning_rate": 7.037950709264709e-05, + "loss": 1.7642, + "step": 12557 + }, + { + "epoch": 3.85451197053407, + "grad_norm": 0.2376822829246521, + "learning_rate": 7.037496805077084e-05, + "loss": 1.7774, + "step": 12558 + }, + { + "epoch": 3.854818907305095, + "grad_norm": 0.2395993024110794, + "learning_rate": 7.03704288075372e-05, + "loss": 1.8397, + "step": 12559 + }, + { + "epoch": 3.8551258440761202, + "grad_norm": 0.26460394263267517, + "learning_rate": 7.036588936299107e-05, + "loss": 1.7472, + 
"step": 12560 + }, + { + "epoch": 3.8554327808471456, + "grad_norm": 0.34742459654808044, + "learning_rate": 7.036134971717725e-05, + "loss": 1.8003, + "step": 12561 + }, + { + "epoch": 3.855739717618171, + "grad_norm": 0.2829316556453705, + "learning_rate": 7.035680987014068e-05, + "loss": 1.7765, + "step": 12562 + }, + { + "epoch": 3.856046654389196, + "grad_norm": 0.3087223172187805, + "learning_rate": 7.035226982192615e-05, + "loss": 1.8462, + "step": 12563 + }, + { + "epoch": 3.856353591160221, + "grad_norm": 0.2806380093097687, + "learning_rate": 7.034772957257858e-05, + "loss": 1.7704, + "step": 12564 + }, + { + "epoch": 3.856660527931246, + "grad_norm": 0.25598087906837463, + "learning_rate": 7.03431891221428e-05, + "loss": 1.7843, + "step": 12565 + }, + { + "epoch": 3.8569674647022714, + "grad_norm": 0.30833700299263, + "learning_rate": 7.033864847066373e-05, + "loss": 1.8404, + "step": 12566 + }, + { + "epoch": 3.8572744014732967, + "grad_norm": 0.29562532901763916, + "learning_rate": 7.03341076181862e-05, + "loss": 1.8044, + "step": 12567 + }, + { + "epoch": 3.8575813382443216, + "grad_norm": 0.2901719808578491, + "learning_rate": 7.03295665647551e-05, + "loss": 1.7789, + "step": 12568 + }, + { + "epoch": 3.857888275015347, + "grad_norm": 0.25453686714172363, + "learning_rate": 7.03250253104153e-05, + "loss": 1.6792, + "step": 12569 + }, + { + "epoch": 3.858195211786372, + "grad_norm": 0.26009416580200195, + "learning_rate": 7.03204838552117e-05, + "loss": 1.7835, + "step": 12570 + }, + { + "epoch": 3.858502148557397, + "grad_norm": 0.28074127435684204, + "learning_rate": 7.031594219918916e-05, + "loss": 1.7932, + "step": 12571 + }, + { + "epoch": 3.8588090853284225, + "grad_norm": 0.3341725170612335, + "learning_rate": 7.031140034239258e-05, + "loss": 1.7439, + "step": 12572 + }, + { + "epoch": 3.8591160220994474, + "grad_norm": 0.28142449259757996, + "learning_rate": 7.030685828486684e-05, + "loss": 1.8263, + "step": 12573 + }, + { + "epoch": 
3.8594229588704727, + "grad_norm": 0.2571438252925873, + "learning_rate": 7.030231602665681e-05, + "loss": 1.7628, + "step": 12574 + }, + { + "epoch": 3.8597298956414976, + "grad_norm": 0.3079041838645935, + "learning_rate": 7.029777356780741e-05, + "loss": 1.7879, + "step": 12575 + }, + { + "epoch": 3.860036832412523, + "grad_norm": 0.2605433464050293, + "learning_rate": 7.029323090836349e-05, + "loss": 1.7841, + "step": 12576 + }, + { + "epoch": 3.8603437691835483, + "grad_norm": 0.24069640040397644, + "learning_rate": 7.028868804836999e-05, + "loss": 1.7939, + "step": 12577 + }, + { + "epoch": 3.8606507059545736, + "grad_norm": 0.26801639795303345, + "learning_rate": 7.028414498787177e-05, + "loss": 1.8082, + "step": 12578 + }, + { + "epoch": 3.8609576427255985, + "grad_norm": 0.28828585147857666, + "learning_rate": 7.027960172691375e-05, + "loss": 1.8094, + "step": 12579 + }, + { + "epoch": 3.861264579496624, + "grad_norm": 0.22927051782608032, + "learning_rate": 7.027505826554082e-05, + "loss": 1.7758, + "step": 12580 + }, + { + "epoch": 3.8615715162676487, + "grad_norm": 0.25755998492240906, + "learning_rate": 7.027051460379788e-05, + "loss": 1.8429, + "step": 12581 + }, + { + "epoch": 3.861878453038674, + "grad_norm": 0.23636581003665924, + "learning_rate": 7.026597074172982e-05, + "loss": 1.7662, + "step": 12582 + }, + { + "epoch": 3.8621853898096994, + "grad_norm": 0.22599349915981293, + "learning_rate": 7.026142667938156e-05, + "loss": 1.7199, + "step": 12583 + }, + { + "epoch": 3.8624923265807243, + "grad_norm": 0.2504875659942627, + "learning_rate": 7.025688241679802e-05, + "loss": 1.8473, + "step": 12584 + }, + { + "epoch": 3.8627992633517496, + "grad_norm": 0.3012976348400116, + "learning_rate": 7.025233795402408e-05, + "loss": 1.8715, + "step": 12585 + }, + { + "epoch": 3.8631062001227745, + "grad_norm": 0.31703677773475647, + "learning_rate": 7.024779329110469e-05, + "loss": 1.8143, + "step": 12586 + }, + { + "epoch": 3.8634131368938, + "grad_norm": 
0.27287593483924866, + "learning_rate": 7.024324842808472e-05, + "loss": 1.7227, + "step": 12587 + }, + { + "epoch": 3.863720073664825, + "grad_norm": 0.24663801491260529, + "learning_rate": 7.02387033650091e-05, + "loss": 1.7529, + "step": 12588 + }, + { + "epoch": 3.86402701043585, + "grad_norm": 0.26127147674560547, + "learning_rate": 7.023415810192277e-05, + "loss": 1.7629, + "step": 12589 + }, + { + "epoch": 3.8643339472068754, + "grad_norm": 0.3457142114639282, + "learning_rate": 7.022961263887062e-05, + "loss": 1.8212, + "step": 12590 + }, + { + "epoch": 3.8646408839779003, + "grad_norm": 0.3296070694923401, + "learning_rate": 7.022506697589759e-05, + "loss": 1.7907, + "step": 12591 + }, + { + "epoch": 3.8649478207489256, + "grad_norm": 0.29474303126335144, + "learning_rate": 7.022052111304858e-05, + "loss": 1.7866, + "step": 12592 + }, + { + "epoch": 3.865254757519951, + "grad_norm": 0.2535403072834015, + "learning_rate": 7.021597505036852e-05, + "loss": 1.7607, + "step": 12593 + }, + { + "epoch": 3.8655616942909763, + "grad_norm": 0.26691222190856934, + "learning_rate": 7.021142878790237e-05, + "loss": 1.8063, + "step": 12594 + }, + { + "epoch": 3.865868631062001, + "grad_norm": 0.2784755229949951, + "learning_rate": 7.020688232569502e-05, + "loss": 1.8065, + "step": 12595 + }, + { + "epoch": 3.8661755678330265, + "grad_norm": 0.23714317381381989, + "learning_rate": 7.020233566379142e-05, + "loss": 1.8317, + "step": 12596 + }, + { + "epoch": 3.8664825046040514, + "grad_norm": 0.25010553002357483, + "learning_rate": 7.019778880223649e-05, + "loss": 1.8493, + "step": 12597 + }, + { + "epoch": 3.8667894413750767, + "grad_norm": 0.2798489034175873, + "learning_rate": 7.01932417410752e-05, + "loss": 1.8134, + "step": 12598 + }, + { + "epoch": 3.867096378146102, + "grad_norm": 0.26199260354042053, + "learning_rate": 7.018869448035243e-05, + "loss": 1.6931, + "step": 12599 + }, + { + "epoch": 3.867403314917127, + "grad_norm": 0.24582891166210175, + 
"learning_rate": 7.018414702011314e-05, + "loss": 1.8076, + "step": 12600 + }, + { + "epoch": 3.8677102516881523, + "grad_norm": 0.25493237376213074, + "learning_rate": 7.01795993604023e-05, + "loss": 1.7851, + "step": 12601 + }, + { + "epoch": 3.868017188459177, + "grad_norm": 0.2607674300670624, + "learning_rate": 7.017505150126483e-05, + "loss": 1.7285, + "step": 12602 + }, + { + "epoch": 3.8683241252302025, + "grad_norm": 0.23629581928253174, + "learning_rate": 7.017050344274568e-05, + "loss": 1.8254, + "step": 12603 + }, + { + "epoch": 3.868631062001228, + "grad_norm": 0.3129318058490753, + "learning_rate": 7.016595518488979e-05, + "loss": 1.7914, + "step": 12604 + }, + { + "epoch": 3.8689379987722528, + "grad_norm": 0.3178271949291229, + "learning_rate": 7.01614067277421e-05, + "loss": 1.8139, + "step": 12605 + }, + { + "epoch": 3.869244935543278, + "grad_norm": 0.3230711817741394, + "learning_rate": 7.015685807134757e-05, + "loss": 1.8203, + "step": 12606 + }, + { + "epoch": 3.869551872314303, + "grad_norm": 0.26339825987815857, + "learning_rate": 7.015230921575118e-05, + "loss": 1.8022, + "step": 12607 + }, + { + "epoch": 3.8698588090853283, + "grad_norm": 0.25337356328964233, + "learning_rate": 7.014776016099785e-05, + "loss": 1.7779, + "step": 12608 + }, + { + "epoch": 3.8701657458563536, + "grad_norm": 0.2506195306777954, + "learning_rate": 7.014321090713253e-05, + "loss": 1.7858, + "step": 12609 + }, + { + "epoch": 3.870472682627379, + "grad_norm": 0.26249951124191284, + "learning_rate": 7.013866145420021e-05, + "loss": 1.8051, + "step": 12610 + }, + { + "epoch": 3.870779619398404, + "grad_norm": 0.25666534900665283, + "learning_rate": 7.013411180224581e-05, + "loss": 1.7945, + "step": 12611 + }, + { + "epoch": 3.871086556169429, + "grad_norm": 0.23901648819446564, + "learning_rate": 7.012956195131433e-05, + "loss": 1.7844, + "step": 12612 + }, + { + "epoch": 3.871393492940454, + "grad_norm": 0.26814451813697815, + "learning_rate": 
7.012501190145071e-05, + "loss": 1.7713, + "step": 12613 + }, + { + "epoch": 3.8717004297114794, + "grad_norm": 0.28377315402030945, + "learning_rate": 7.012046165269995e-05, + "loss": 1.7866, + "step": 12614 + }, + { + "epoch": 3.8720073664825048, + "grad_norm": 0.2751680612564087, + "learning_rate": 7.011591120510699e-05, + "loss": 1.7215, + "step": 12615 + }, + { + "epoch": 3.8723143032535297, + "grad_norm": 0.21988113224506378, + "learning_rate": 7.011136055871679e-05, + "loss": 1.8009, + "step": 12616 + }, + { + "epoch": 3.872621240024555, + "grad_norm": 0.26462143659591675, + "learning_rate": 7.010680971357434e-05, + "loss": 1.7618, + "step": 12617 + }, + { + "epoch": 3.87292817679558, + "grad_norm": 0.29054632782936096, + "learning_rate": 7.010225866972462e-05, + "loss": 1.7549, + "step": 12618 + }, + { + "epoch": 3.873235113566605, + "grad_norm": 0.31341224908828735, + "learning_rate": 7.00977074272126e-05, + "loss": 1.8827, + "step": 12619 + }, + { + "epoch": 3.8735420503376305, + "grad_norm": 0.24252115190029144, + "learning_rate": 7.009315598608324e-05, + "loss": 1.7544, + "step": 12620 + }, + { + "epoch": 3.873848987108656, + "grad_norm": 0.30036893486976624, + "learning_rate": 7.008860434638154e-05, + "loss": 1.7465, + "step": 12621 + }, + { + "epoch": 3.8741559238796808, + "grad_norm": 0.3217438757419586, + "learning_rate": 7.00840525081525e-05, + "loss": 1.72, + "step": 12622 + }, + { + "epoch": 3.874462860650706, + "grad_norm": 0.22507290542125702, + "learning_rate": 7.007950047144105e-05, + "loss": 1.7177, + "step": 12623 + }, + { + "epoch": 3.874769797421731, + "grad_norm": 0.3014441728591919, + "learning_rate": 7.007494823629224e-05, + "loss": 1.7502, + "step": 12624 + }, + { + "epoch": 3.8750767341927563, + "grad_norm": 0.3836904466152191, + "learning_rate": 7.0070395802751e-05, + "loss": 1.7971, + "step": 12625 + }, + { + "epoch": 3.8753836709637817, + "grad_norm": 0.33565691113471985, + "learning_rate": 7.006584317086235e-05, + "loss": 1.7439, 
+ "step": 12626 + }, + { + "epoch": 3.8756906077348066, + "grad_norm": 0.2292134314775467, + "learning_rate": 7.006129034067128e-05, + "loss": 1.7998, + "step": 12627 + }, + { + "epoch": 3.875997544505832, + "grad_norm": 0.26385873556137085, + "learning_rate": 7.005673731222277e-05, + "loss": 1.7914, + "step": 12628 + }, + { + "epoch": 3.876304481276857, + "grad_norm": 0.2854950428009033, + "learning_rate": 7.005218408556184e-05, + "loss": 1.7761, + "step": 12629 + }, + { + "epoch": 3.876611418047882, + "grad_norm": 0.34260645508766174, + "learning_rate": 7.004763066073348e-05, + "loss": 1.8015, + "step": 12630 + }, + { + "epoch": 3.8769183548189075, + "grad_norm": 0.3223683834075928, + "learning_rate": 7.004307703778267e-05, + "loss": 1.7453, + "step": 12631 + }, + { + "epoch": 3.8772252915899323, + "grad_norm": 0.24715089797973633, + "learning_rate": 7.003852321675442e-05, + "loss": 1.7813, + "step": 12632 + }, + { + "epoch": 3.8775322283609577, + "grad_norm": 0.22822390496730804, + "learning_rate": 7.003396919769377e-05, + "loss": 1.7982, + "step": 12633 + }, + { + "epoch": 3.8778391651319826, + "grad_norm": 0.24125081300735474, + "learning_rate": 7.002941498064565e-05, + "loss": 1.8606, + "step": 12634 + }, + { + "epoch": 3.878146101903008, + "grad_norm": 0.23512506484985352, + "learning_rate": 7.002486056565513e-05, + "loss": 1.7469, + "step": 12635 + }, + { + "epoch": 3.8784530386740332, + "grad_norm": 0.2908322215080261, + "learning_rate": 7.00203059527672e-05, + "loss": 1.796, + "step": 12636 + }, + { + "epoch": 3.8787599754450586, + "grad_norm": 0.22931252419948578, + "learning_rate": 7.001575114202689e-05, + "loss": 1.7482, + "step": 12637 + }, + { + "epoch": 3.8790669122160835, + "grad_norm": 0.22574284672737122, + "learning_rate": 7.001119613347917e-05, + "loss": 1.7698, + "step": 12638 + }, + { + "epoch": 3.879373848987109, + "grad_norm": 0.23129726946353912, + "learning_rate": 7.000664092716909e-05, + "loss": 1.776, + "step": 12639 + }, + { + "epoch": 
3.8796807857581337, + "grad_norm": 0.2763366401195526, + "learning_rate": 7.000208552314165e-05, + "loss": 1.7814, + "step": 12640 + }, + { + "epoch": 3.879987722529159, + "grad_norm": 0.29870158433914185, + "learning_rate": 6.99975299214419e-05, + "loss": 1.7467, + "step": 12641 + }, + { + "epoch": 3.8802946593001844, + "grad_norm": 0.33574381470680237, + "learning_rate": 6.999297412211484e-05, + "loss": 1.8159, + "step": 12642 + }, + { + "epoch": 3.8806015960712092, + "grad_norm": 0.30309897661209106, + "learning_rate": 6.998841812520547e-05, + "loss": 1.8454, + "step": 12643 + }, + { + "epoch": 3.8809085328422346, + "grad_norm": 0.27399247884750366, + "learning_rate": 6.998386193075886e-05, + "loss": 1.7956, + "step": 12644 + }, + { + "epoch": 3.8812154696132595, + "grad_norm": 0.28649580478668213, + "learning_rate": 6.997930553881998e-05, + "loss": 1.8308, + "step": 12645 + }, + { + "epoch": 3.881522406384285, + "grad_norm": 0.2716052532196045, + "learning_rate": 6.997474894943392e-05, + "loss": 1.7698, + "step": 12646 + }, + { + "epoch": 3.88182934315531, + "grad_norm": 0.21380536258220673, + "learning_rate": 6.997019216264567e-05, + "loss": 1.7028, + "step": 12647 + }, + { + "epoch": 3.882136279926335, + "grad_norm": 0.25262731313705444, + "learning_rate": 6.996563517850028e-05, + "loss": 1.8236, + "step": 12648 + }, + { + "epoch": 3.8824432166973604, + "grad_norm": 0.21150052547454834, + "learning_rate": 6.996107799704277e-05, + "loss": 1.7437, + "step": 12649 + }, + { + "epoch": 3.8827501534683853, + "grad_norm": 0.2614554464817047, + "learning_rate": 6.995652061831821e-05, + "loss": 1.7575, + "step": 12650 + }, + { + "epoch": 3.8830570902394106, + "grad_norm": 0.214684396982193, + "learning_rate": 6.995196304237159e-05, + "loss": 1.8195, + "step": 12651 + }, + { + "epoch": 3.883364027010436, + "grad_norm": 0.2226872444152832, + "learning_rate": 6.994740526924798e-05, + "loss": 1.7556, + "step": 12652 + }, + { + "epoch": 3.8836709637814613, + "grad_norm": 
0.22270764410495758, + "learning_rate": 6.994284729899246e-05, + "loss": 1.7536, + "step": 12653 + }, + { + "epoch": 3.883977900552486, + "grad_norm": 0.20683564245700836, + "learning_rate": 6.993828913165e-05, + "loss": 1.7728, + "step": 12654 + }, + { + "epoch": 3.8842848373235115, + "grad_norm": 0.23667018115520477, + "learning_rate": 6.993373076726568e-05, + "loss": 1.7819, + "step": 12655 + }, + { + "epoch": 3.8845917740945364, + "grad_norm": 0.2265234887599945, + "learning_rate": 6.992917220588455e-05, + "loss": 1.7502, + "step": 12656 + }, + { + "epoch": 3.8848987108655617, + "grad_norm": 0.24490754306316376, + "learning_rate": 6.992461344755168e-05, + "loss": 1.7513, + "step": 12657 + }, + { + "epoch": 3.885205647636587, + "grad_norm": 0.23001348972320557, + "learning_rate": 6.992005449231208e-05, + "loss": 1.733, + "step": 12658 + }, + { + "epoch": 3.885512584407612, + "grad_norm": 0.25424695014953613, + "learning_rate": 6.991549534021084e-05, + "loss": 1.7621, + "step": 12659 + }, + { + "epoch": 3.8858195211786373, + "grad_norm": 0.25552862882614136, + "learning_rate": 6.991093599129299e-05, + "loss": 1.7974, + "step": 12660 + }, + { + "epoch": 3.886126457949662, + "grad_norm": 0.26876959204673767, + "learning_rate": 6.99063764456036e-05, + "loss": 1.7924, + "step": 12661 + }, + { + "epoch": 3.8864333947206875, + "grad_norm": 0.2754429578781128, + "learning_rate": 6.990181670318772e-05, + "loss": 1.7981, + "step": 12662 + }, + { + "epoch": 3.886740331491713, + "grad_norm": 0.281818687915802, + "learning_rate": 6.989725676409044e-05, + "loss": 1.7328, + "step": 12663 + }, + { + "epoch": 3.8870472682627377, + "grad_norm": 0.21676552295684814, + "learning_rate": 6.989269662835681e-05, + "loss": 1.7376, + "step": 12664 + }, + { + "epoch": 3.887354205033763, + "grad_norm": 0.276115745306015, + "learning_rate": 6.98881362960319e-05, + "loss": 1.7784, + "step": 12665 + }, + { + "epoch": 3.887661141804788, + "grad_norm": 0.2806364893913269, + "learning_rate": 
6.988357576716075e-05, + "loss": 1.8078, + "step": 12666 + }, + { + "epoch": 3.8879680785758133, + "grad_norm": 0.27620184421539307, + "learning_rate": 6.987901504178845e-05, + "loss": 1.8115, + "step": 12667 + }, + { + "epoch": 3.8882750153468386, + "grad_norm": 0.23845402896404266, + "learning_rate": 6.987445411996009e-05, + "loss": 1.7485, + "step": 12668 + }, + { + "epoch": 3.888581952117864, + "grad_norm": 0.25063586235046387, + "learning_rate": 6.986989300172071e-05, + "loss": 1.7663, + "step": 12669 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2417975515127182, + "learning_rate": 6.98653316871154e-05, + "loss": 1.7562, + "step": 12670 + }, + { + "epoch": 3.889195825659914, + "grad_norm": 0.24952733516693115, + "learning_rate": 6.986077017618923e-05, + "loss": 1.8063, + "step": 12671 + }, + { + "epoch": 3.889502762430939, + "grad_norm": 0.25847554206848145, + "learning_rate": 6.985620846898732e-05, + "loss": 1.7722, + "step": 12672 + }, + { + "epoch": 3.8898096992019644, + "grad_norm": 0.23762650787830353, + "learning_rate": 6.985164656555471e-05, + "loss": 1.8368, + "step": 12673 + }, + { + "epoch": 3.8901166359729897, + "grad_norm": 0.25346314907073975, + "learning_rate": 6.984708446593648e-05, + "loss": 1.7957, + "step": 12674 + }, + { + "epoch": 3.8904235727440146, + "grad_norm": 0.2466745674610138, + "learning_rate": 6.984252217017774e-05, + "loss": 1.8286, + "step": 12675 + }, + { + "epoch": 3.89073050951504, + "grad_norm": 0.25413215160369873, + "learning_rate": 6.983795967832356e-05, + "loss": 1.7711, + "step": 12676 + }, + { + "epoch": 3.891037446286065, + "grad_norm": 0.2315925806760788, + "learning_rate": 6.983339699041903e-05, + "loss": 1.7546, + "step": 12677 + }, + { + "epoch": 3.89134438305709, + "grad_norm": 0.26473405957221985, + "learning_rate": 6.982883410650925e-05, + "loss": 1.7563, + "step": 12678 + }, + { + "epoch": 3.8916513198281155, + "grad_norm": 0.24176491796970367, + "learning_rate": 6.982427102663932e-05, + "loss": 
1.7734, + "step": 12679 + }, + { + "epoch": 3.891958256599141, + "grad_norm": 0.25444844365119934, + "learning_rate": 6.98197077508543e-05, + "loss": 1.803, + "step": 12680 + }, + { + "epoch": 3.8922651933701657, + "grad_norm": 0.25234144926071167, + "learning_rate": 6.981514427919933e-05, + "loss": 1.8099, + "step": 12681 + }, + { + "epoch": 3.892572130141191, + "grad_norm": 0.2571142315864563, + "learning_rate": 6.98105806117195e-05, + "loss": 1.8618, + "step": 12682 + }, + { + "epoch": 3.892879066912216, + "grad_norm": 0.21235275268554688, + "learning_rate": 6.980601674845988e-05, + "loss": 1.7121, + "step": 12683 + }, + { + "epoch": 3.8931860036832413, + "grad_norm": 0.27078527212142944, + "learning_rate": 6.98014526894656e-05, + "loss": 1.8103, + "step": 12684 + }, + { + "epoch": 3.8934929404542666, + "grad_norm": 0.3198096454143524, + "learning_rate": 6.979688843478176e-05, + "loss": 1.7529, + "step": 12685 + }, + { + "epoch": 3.8937998772252915, + "grad_norm": 0.3170493245124817, + "learning_rate": 6.979232398445345e-05, + "loss": 1.7629, + "step": 12686 + }, + { + "epoch": 3.894106813996317, + "grad_norm": 0.2495265007019043, + "learning_rate": 6.978775933852582e-05, + "loss": 1.7407, + "step": 12687 + }, + { + "epoch": 3.8944137507673418, + "grad_norm": 0.24570141732692719, + "learning_rate": 6.978319449704395e-05, + "loss": 1.7688, + "step": 12688 + }, + { + "epoch": 3.894720687538367, + "grad_norm": 0.23956388235092163, + "learning_rate": 6.977862946005295e-05, + "loss": 1.7115, + "step": 12689 + }, + { + "epoch": 3.8950276243093924, + "grad_norm": 0.21548940241336823, + "learning_rate": 6.977406422759793e-05, + "loss": 1.7611, + "step": 12690 + }, + { + "epoch": 3.8953345610804173, + "grad_norm": 0.25797295570373535, + "learning_rate": 6.976949879972403e-05, + "loss": 1.7688, + "step": 12691 + }, + { + "epoch": 3.8956414978514426, + "grad_norm": 0.28257784247398376, + "learning_rate": 6.976493317647636e-05, + "loss": 1.7517, + "step": 12692 + }, + { + 
"epoch": 3.8959484346224675, + "grad_norm": 0.23828580975532532, + "learning_rate": 6.976036735790004e-05, + "loss": 1.7877, + "step": 12693 + }, + { + "epoch": 3.896255371393493, + "grad_norm": 0.22915001213550568, + "learning_rate": 6.975580134404017e-05, + "loss": 1.7741, + "step": 12694 + }, + { + "epoch": 3.896562308164518, + "grad_norm": 0.22975030541419983, + "learning_rate": 6.97512351349419e-05, + "loss": 1.772, + "step": 12695 + }, + { + "epoch": 3.8968692449355435, + "grad_norm": 0.29515185952186584, + "learning_rate": 6.974666873065034e-05, + "loss": 1.8001, + "step": 12696 + }, + { + "epoch": 3.8971761817065684, + "grad_norm": 0.26904794573783875, + "learning_rate": 6.974210213121064e-05, + "loss": 1.7069, + "step": 12697 + }, + { + "epoch": 3.8974831184775938, + "grad_norm": 0.2549479603767395, + "learning_rate": 6.97375353366679e-05, + "loss": 1.7419, + "step": 12698 + }, + { + "epoch": 3.8977900552486187, + "grad_norm": 0.23750101029872894, + "learning_rate": 6.973296834706729e-05, + "loss": 1.7815, + "step": 12699 + }, + { + "epoch": 3.898096992019644, + "grad_norm": 0.23529762029647827, + "learning_rate": 6.972840116245389e-05, + "loss": 1.8139, + "step": 12700 + }, + { + "epoch": 3.8984039287906693, + "grad_norm": 0.3212098777294159, + "learning_rate": 6.97238337828729e-05, + "loss": 1.7507, + "step": 12701 + }, + { + "epoch": 3.898710865561694, + "grad_norm": 0.3167687952518463, + "learning_rate": 6.971926620836941e-05, + "loss": 1.8062, + "step": 12702 + }, + { + "epoch": 3.8990178023327196, + "grad_norm": 0.31298309564590454, + "learning_rate": 6.971469843898855e-05, + "loss": 1.8127, + "step": 12703 + }, + { + "epoch": 3.8993247391037444, + "grad_norm": 0.2537378668785095, + "learning_rate": 6.971013047477551e-05, + "loss": 1.7675, + "step": 12704 + }, + { + "epoch": 3.8996316758747698, + "grad_norm": 0.24292805790901184, + "learning_rate": 6.97055623157754e-05, + "loss": 1.8004, + "step": 12705 + }, + { + "epoch": 3.899938612645795, + 
"grad_norm": 0.2929537296295166, + "learning_rate": 6.970099396203338e-05, + "loss": 1.7963, + "step": 12706 + }, + { + "epoch": 3.90024554941682, + "grad_norm": 0.30531612038612366, + "learning_rate": 6.969642541359459e-05, + "loss": 1.7347, + "step": 12707 + }, + { + "epoch": 3.9005524861878453, + "grad_norm": 0.3138202726840973, + "learning_rate": 6.969185667050417e-05, + "loss": 1.7987, + "step": 12708 + }, + { + "epoch": 3.9008594229588702, + "grad_norm": 0.2366247922182083, + "learning_rate": 6.96872877328073e-05, + "loss": 1.7671, + "step": 12709 + }, + { + "epoch": 3.9011663597298956, + "grad_norm": 0.26251721382141113, + "learning_rate": 6.96827186005491e-05, + "loss": 1.7657, + "step": 12710 + }, + { + "epoch": 3.901473296500921, + "grad_norm": 0.32497119903564453, + "learning_rate": 6.967814927377474e-05, + "loss": 1.7873, + "step": 12711 + }, + { + "epoch": 3.9017802332719462, + "grad_norm": 0.3290228843688965, + "learning_rate": 6.967357975252939e-05, + "loss": 1.8076, + "step": 12712 + }, + { + "epoch": 3.902087170042971, + "grad_norm": 0.2737300992012024, + "learning_rate": 6.966901003685817e-05, + "loss": 1.7405, + "step": 12713 + }, + { + "epoch": 3.9023941068139965, + "grad_norm": 0.25465309619903564, + "learning_rate": 6.966444012680626e-05, + "loss": 1.8063, + "step": 12714 + }, + { + "epoch": 3.9027010435850213, + "grad_norm": 0.2397255003452301, + "learning_rate": 6.965987002241885e-05, + "loss": 1.8079, + "step": 12715 + }, + { + "epoch": 3.9030079803560467, + "grad_norm": 0.23115718364715576, + "learning_rate": 6.965529972374108e-05, + "loss": 1.8032, + "step": 12716 + }, + { + "epoch": 3.903314917127072, + "grad_norm": 0.2536461055278778, + "learning_rate": 6.96507292308181e-05, + "loss": 1.7477, + "step": 12717 + }, + { + "epoch": 3.903621853898097, + "grad_norm": 0.27151185274124146, + "learning_rate": 6.96461585436951e-05, + "loss": 1.75, + "step": 12718 + }, + { + "epoch": 3.9039287906691222, + "grad_norm": 0.26894113421440125, + 
"learning_rate": 6.964158766241726e-05, + "loss": 1.7816, + "step": 12719 + }, + { + "epoch": 3.904235727440147, + "grad_norm": 0.23541375994682312, + "learning_rate": 6.963701658702972e-05, + "loss": 1.7991, + "step": 12720 + }, + { + "epoch": 3.9045426642111725, + "grad_norm": 0.22142915427684784, + "learning_rate": 6.96324453175777e-05, + "loss": 1.7245, + "step": 12721 + }, + { + "epoch": 3.904849600982198, + "grad_norm": 0.32864269614219666, + "learning_rate": 6.962787385410632e-05, + "loss": 1.7631, + "step": 12722 + }, + { + "epoch": 3.9051565377532227, + "grad_norm": 0.23657776415348053, + "learning_rate": 6.96233021966608e-05, + "loss": 1.8081, + "step": 12723 + }, + { + "epoch": 3.905463474524248, + "grad_norm": 0.24790632724761963, + "learning_rate": 6.961873034528629e-05, + "loss": 1.7193, + "step": 12724 + }, + { + "epoch": 3.905770411295273, + "grad_norm": 0.2517886459827423, + "learning_rate": 6.961415830002801e-05, + "loss": 1.7785, + "step": 12725 + }, + { + "epoch": 3.9060773480662982, + "grad_norm": 0.2340923547744751, + "learning_rate": 6.960958606093113e-05, + "loss": 1.7632, + "step": 12726 + }, + { + "epoch": 3.9063842848373236, + "grad_norm": 0.23260441422462463, + "learning_rate": 6.960501362804079e-05, + "loss": 1.7865, + "step": 12727 + }, + { + "epoch": 3.906691221608349, + "grad_norm": 0.22616329789161682, + "learning_rate": 6.960044100140224e-05, + "loss": 1.7851, + "step": 12728 + }, + { + "epoch": 3.906998158379374, + "grad_norm": 0.2849951982498169, + "learning_rate": 6.959586818106064e-05, + "loss": 1.8618, + "step": 12729 + }, + { + "epoch": 3.907305095150399, + "grad_norm": 0.3279374837875366, + "learning_rate": 6.95912951670612e-05, + "loss": 1.8563, + "step": 12730 + }, + { + "epoch": 3.907612031921424, + "grad_norm": 0.24359555542469025, + "learning_rate": 6.958672195944906e-05, + "loss": 1.7604, + "step": 12731 + }, + { + "epoch": 3.9079189686924494, + "grad_norm": 0.30881935358047485, + "learning_rate": 
6.958214855826947e-05, + "loss": 1.8463, + "step": 12732 + }, + { + "epoch": 3.9082259054634747, + "grad_norm": 0.25361543893814087, + "learning_rate": 6.957757496356763e-05, + "loss": 1.7831, + "step": 12733 + }, + { + "epoch": 3.9085328422344996, + "grad_norm": 0.26763513684272766, + "learning_rate": 6.957300117538869e-05, + "loss": 1.8383, + "step": 12734 + }, + { + "epoch": 3.908839779005525, + "grad_norm": 0.2238057255744934, + "learning_rate": 6.95684271937779e-05, + "loss": 1.7702, + "step": 12735 + }, + { + "epoch": 3.90914671577655, + "grad_norm": 0.22110232710838318, + "learning_rate": 6.956385301878045e-05, + "loss": 1.7931, + "step": 12736 + }, + { + "epoch": 3.909453652547575, + "grad_norm": 0.23765070736408234, + "learning_rate": 6.955927865044152e-05, + "loss": 1.7212, + "step": 12737 + }, + { + "epoch": 3.9097605893186005, + "grad_norm": 0.22324508428573608, + "learning_rate": 6.955470408880633e-05, + "loss": 1.7161, + "step": 12738 + }, + { + "epoch": 3.9100675260896254, + "grad_norm": 0.22485347092151642, + "learning_rate": 6.955012933392012e-05, + "loss": 1.7374, + "step": 12739 + }, + { + "epoch": 3.9103744628606507, + "grad_norm": 0.28046715259552, + "learning_rate": 6.954555438582806e-05, + "loss": 1.9264, + "step": 12740 + }, + { + "epoch": 3.9106813996316756, + "grad_norm": 0.26391276717185974, + "learning_rate": 6.954097924457536e-05, + "loss": 1.7343, + "step": 12741 + }, + { + "epoch": 3.910988336402701, + "grad_norm": 0.29596614837646484, + "learning_rate": 6.953640391020726e-05, + "loss": 1.8111, + "step": 12742 + }, + { + "epoch": 3.9112952731737263, + "grad_norm": 0.2709808051586151, + "learning_rate": 6.953182838276896e-05, + "loss": 1.7776, + "step": 12743 + }, + { + "epoch": 3.9116022099447516, + "grad_norm": 0.2585100531578064, + "learning_rate": 6.952725266230571e-05, + "loss": 1.7774, + "step": 12744 + }, + { + "epoch": 3.9119091467157765, + "grad_norm": 0.26490530371665955, + "learning_rate": 6.952267674886268e-05, + "loss": 
1.78, + "step": 12745 + }, + { + "epoch": 3.912216083486802, + "grad_norm": 0.23654767870903015, + "learning_rate": 6.951810064248512e-05, + "loss": 1.8263, + "step": 12746 + }, + { + "epoch": 3.9125230202578267, + "grad_norm": 0.2495296597480774, + "learning_rate": 6.951352434321826e-05, + "loss": 1.787, + "step": 12747 + }, + { + "epoch": 3.912829957028852, + "grad_norm": 0.24038313329219818, + "learning_rate": 6.950894785110728e-05, + "loss": 1.774, + "step": 12748 + }, + { + "epoch": 3.9131368937998774, + "grad_norm": 0.23738732933998108, + "learning_rate": 6.950437116619749e-05, + "loss": 1.7401, + "step": 12749 + }, + { + "epoch": 3.9134438305709023, + "grad_norm": 0.28192025423049927, + "learning_rate": 6.949979428853405e-05, + "loss": 1.8416, + "step": 12750 + }, + { + "epoch": 3.9137507673419276, + "grad_norm": 0.30579057335853577, + "learning_rate": 6.949521721816221e-05, + "loss": 1.7404, + "step": 12751 + }, + { + "epoch": 3.9140577041129525, + "grad_norm": 0.23972894251346588, + "learning_rate": 6.949063995512721e-05, + "loss": 1.7543, + "step": 12752 + }, + { + "epoch": 3.914364640883978, + "grad_norm": 0.2837793231010437, + "learning_rate": 6.94860624994743e-05, + "loss": 1.7779, + "step": 12753 + }, + { + "epoch": 3.914671577655003, + "grad_norm": 0.3344916105270386, + "learning_rate": 6.948148485124868e-05, + "loss": 1.7803, + "step": 12754 + }, + { + "epoch": 3.9149785144260285, + "grad_norm": 0.24271291494369507, + "learning_rate": 6.94769070104956e-05, + "loss": 1.7362, + "step": 12755 + }, + { + "epoch": 3.9152854511970534, + "grad_norm": 0.25299304723739624, + "learning_rate": 6.947232897726031e-05, + "loss": 1.7685, + "step": 12756 + }, + { + "epoch": 3.9155923879680787, + "grad_norm": 0.24766205251216888, + "learning_rate": 6.946775075158807e-05, + "loss": 1.829, + "step": 12757 + }, + { + "epoch": 3.9158993247391036, + "grad_norm": 0.2508428692817688, + "learning_rate": 6.94631723335241e-05, + "loss": 1.809, + "step": 12758 + }, + { + 
"epoch": 3.916206261510129, + "grad_norm": 0.2172096222639084, + "learning_rate": 6.945859372311365e-05, + "loss": 1.7376, + "step": 12759 + }, + { + "epoch": 3.9165131982811543, + "grad_norm": 0.28976425528526306, + "learning_rate": 6.945401492040198e-05, + "loss": 1.8229, + "step": 12760 + }, + { + "epoch": 3.916820135052179, + "grad_norm": 0.3528063893318176, + "learning_rate": 6.944943592543432e-05, + "loss": 1.7559, + "step": 12761 + }, + { + "epoch": 3.9171270718232045, + "grad_norm": 0.46312370896339417, + "learning_rate": 6.944485673825595e-05, + "loss": 1.7664, + "step": 12762 + }, + { + "epoch": 3.9174340085942294, + "grad_norm": 0.4466164708137512, + "learning_rate": 6.94402773589121e-05, + "loss": 1.7833, + "step": 12763 + }, + { + "epoch": 3.9177409453652547, + "grad_norm": 0.2637740969657898, + "learning_rate": 6.943569778744804e-05, + "loss": 1.818, + "step": 12764 + }, + { + "epoch": 3.91804788213628, + "grad_norm": 0.37515267729759216, + "learning_rate": 6.943111802390901e-05, + "loss": 1.7898, + "step": 12765 + }, + { + "epoch": 3.918354818907305, + "grad_norm": 0.45146289467811584, + "learning_rate": 6.942653806834029e-05, + "loss": 1.7797, + "step": 12766 + }, + { + "epoch": 3.9186617556783303, + "grad_norm": 0.2809859812259674, + "learning_rate": 6.942195792078712e-05, + "loss": 1.7836, + "step": 12767 + }, + { + "epoch": 3.918968692449355, + "grad_norm": 0.3606306314468384, + "learning_rate": 6.94173775812948e-05, + "loss": 1.7657, + "step": 12768 + }, + { + "epoch": 3.9192756292203805, + "grad_norm": 0.49528738856315613, + "learning_rate": 6.941279704990857e-05, + "loss": 1.7628, + "step": 12769 + }, + { + "epoch": 3.919582565991406, + "grad_norm": 0.3484322428703308, + "learning_rate": 6.940821632667371e-05, + "loss": 1.7939, + "step": 12770 + }, + { + "epoch": 3.919889502762431, + "grad_norm": 0.2479606419801712, + "learning_rate": 6.940363541163546e-05, + "loss": 1.813, + "step": 12771 + }, + { + "epoch": 3.920196439533456, + "grad_norm": 
0.3491765558719635, + "learning_rate": 6.939905430483911e-05, + "loss": 1.7338, + "step": 12772 + }, + { + "epoch": 3.9205033763044814, + "grad_norm": 0.291810005903244, + "learning_rate": 6.939447300632995e-05, + "loss": 1.7445, + "step": 12773 + }, + { + "epoch": 3.9208103130755063, + "grad_norm": 0.2467527985572815, + "learning_rate": 6.938989151615324e-05, + "loss": 1.8462, + "step": 12774 + }, + { + "epoch": 3.9211172498465316, + "grad_norm": 0.35656824707984924, + "learning_rate": 6.938530983435426e-05, + "loss": 1.7751, + "step": 12775 + }, + { + "epoch": 3.921424186617557, + "grad_norm": 0.31269776821136475, + "learning_rate": 6.938072796097828e-05, + "loss": 1.7714, + "step": 12776 + }, + { + "epoch": 3.921731123388582, + "grad_norm": 0.2082831859588623, + "learning_rate": 6.937614589607058e-05, + "loss": 1.7263, + "step": 12777 + }, + { + "epoch": 3.922038060159607, + "grad_norm": 0.27583765983581543, + "learning_rate": 6.937156363967646e-05, + "loss": 1.6822, + "step": 12778 + }, + { + "epoch": 3.922344996930632, + "grad_norm": 0.32773876190185547, + "learning_rate": 6.93669811918412e-05, + "loss": 1.7792, + "step": 12779 + }, + { + "epoch": 3.9226519337016574, + "grad_norm": 0.2583121657371521, + "learning_rate": 6.936239855261007e-05, + "loss": 1.7812, + "step": 12780 + }, + { + "epoch": 3.9229588704726828, + "grad_norm": 0.245570570230484, + "learning_rate": 6.935781572202836e-05, + "loss": 1.7252, + "step": 12781 + }, + { + "epoch": 3.9232658072437077, + "grad_norm": 0.2379419505596161, + "learning_rate": 6.935323270014138e-05, + "loss": 1.7485, + "step": 12782 + }, + { + "epoch": 3.923572744014733, + "grad_norm": 0.2239784598350525, + "learning_rate": 6.934864948699439e-05, + "loss": 1.7444, + "step": 12783 + }, + { + "epoch": 3.923879680785758, + "grad_norm": 0.2366618812084198, + "learning_rate": 6.934406608263274e-05, + "loss": 1.777, + "step": 12784 + }, + { + "epoch": 3.924186617556783, + "grad_norm": 0.22583791613578796, + "learning_rate": 
6.933948248710169e-05, + "loss": 1.7291, + "step": 12785 + }, + { + "epoch": 3.9244935543278086, + "grad_norm": 0.24141047894954681, + "learning_rate": 6.933489870044651e-05, + "loss": 1.7748, + "step": 12786 + }, + { + "epoch": 3.924800491098834, + "grad_norm": 0.2389962524175644, + "learning_rate": 6.933031472271255e-05, + "loss": 1.7957, + "step": 12787 + }, + { + "epoch": 3.925107427869859, + "grad_norm": 0.25230300426483154, + "learning_rate": 6.932573055394509e-05, + "loss": 1.7621, + "step": 12788 + }, + { + "epoch": 3.925414364640884, + "grad_norm": 0.23894043266773224, + "learning_rate": 6.932114619418941e-05, + "loss": 1.7285, + "step": 12789 + }, + { + "epoch": 3.925721301411909, + "grad_norm": 0.2650291919708252, + "learning_rate": 6.931656164349086e-05, + "loss": 1.7613, + "step": 12790 + }, + { + "epoch": 3.9260282381829343, + "grad_norm": 0.20616789162158966, + "learning_rate": 6.931197690189472e-05, + "loss": 1.7505, + "step": 12791 + }, + { + "epoch": 3.9263351749539597, + "grad_norm": 0.23915675282478333, + "learning_rate": 6.930739196944633e-05, + "loss": 1.7477, + "step": 12792 + }, + { + "epoch": 3.9266421117249846, + "grad_norm": 0.2522687613964081, + "learning_rate": 6.930280684619094e-05, + "loss": 1.8, + "step": 12793 + }, + { + "epoch": 3.92694904849601, + "grad_norm": 0.264167845249176, + "learning_rate": 6.929822153217391e-05, + "loss": 1.7516, + "step": 12794 + }, + { + "epoch": 3.927255985267035, + "grad_norm": 0.21358054876327515, + "learning_rate": 6.929363602744054e-05, + "loss": 1.7207, + "step": 12795 + }, + { + "epoch": 3.92756292203806, + "grad_norm": 0.25632721185684204, + "learning_rate": 6.928905033203617e-05, + "loss": 1.7446, + "step": 12796 + }, + { + "epoch": 3.9278698588090855, + "grad_norm": 0.2717185318470001, + "learning_rate": 6.928446444600608e-05, + "loss": 1.8555, + "step": 12797 + }, + { + "epoch": 3.9281767955801103, + "grad_norm": 0.2871767282485962, + "learning_rate": 6.927987836939561e-05, + "loss": 1.7861, + 
"step": 12798 + }, + { + "epoch": 3.9284837323511357, + "grad_norm": 0.282507061958313, + "learning_rate": 6.927529210225009e-05, + "loss": 1.7683, + "step": 12799 + }, + { + "epoch": 3.9287906691221606, + "grad_norm": 0.24870644509792328, + "learning_rate": 6.927070564461482e-05, + "loss": 1.7355, + "step": 12800 + }, + { + "epoch": 3.929097605893186, + "grad_norm": 0.2093631625175476, + "learning_rate": 6.926611899653516e-05, + "loss": 1.7691, + "step": 12801 + }, + { + "epoch": 3.9294045426642112, + "grad_norm": 0.34258076548576355, + "learning_rate": 6.926153215805642e-05, + "loss": 1.8398, + "step": 12802 + }, + { + "epoch": 3.9297114794352366, + "grad_norm": 0.39179500937461853, + "learning_rate": 6.925694512922391e-05, + "loss": 1.8229, + "step": 12803 + }, + { + "epoch": 3.9300184162062615, + "grad_norm": 0.36814743280410767, + "learning_rate": 6.9252357910083e-05, + "loss": 1.7759, + "step": 12804 + }, + { + "epoch": 3.930325352977287, + "grad_norm": 0.2659403085708618, + "learning_rate": 6.924777050067902e-05, + "loss": 1.7553, + "step": 12805 + }, + { + "epoch": 3.9306322897483117, + "grad_norm": 0.20617491006851196, + "learning_rate": 6.924318290105724e-05, + "loss": 1.7398, + "step": 12806 + }, + { + "epoch": 3.930939226519337, + "grad_norm": 0.23730522394180298, + "learning_rate": 6.923859511126309e-05, + "loss": 1.699, + "step": 12807 + }, + { + "epoch": 3.9312461632903624, + "grad_norm": 0.24865423142910004, + "learning_rate": 6.923400713134184e-05, + "loss": 1.7801, + "step": 12808 + }, + { + "epoch": 3.9315531000613873, + "grad_norm": 0.2495356798171997, + "learning_rate": 6.92294189613389e-05, + "loss": 1.803, + "step": 12809 + }, + { + "epoch": 3.9318600368324126, + "grad_norm": 0.24223244190216064, + "learning_rate": 6.922483060129955e-05, + "loss": 1.751, + "step": 12810 + }, + { + "epoch": 3.9321669736034375, + "grad_norm": 0.2541450262069702, + "learning_rate": 6.922024205126913e-05, + "loss": 1.7721, + "step": 12811 + }, + { + "epoch": 
3.932473910374463, + "grad_norm": 0.24528831243515015, + "learning_rate": 6.921565331129304e-05, + "loss": 1.792, + "step": 12812 + }, + { + "epoch": 3.932780847145488, + "grad_norm": 0.22789500653743744, + "learning_rate": 6.921106438141659e-05, + "loss": 1.8455, + "step": 12813 + }, + { + "epoch": 3.933087783916513, + "grad_norm": 0.26267170906066895, + "learning_rate": 6.920647526168515e-05, + "loss": 1.7254, + "step": 12814 + }, + { + "epoch": 3.9333947206875384, + "grad_norm": 0.23044808208942413, + "learning_rate": 6.920188595214406e-05, + "loss": 1.7217, + "step": 12815 + }, + { + "epoch": 3.9337016574585633, + "grad_norm": 0.2304011732339859, + "learning_rate": 6.919729645283867e-05, + "loss": 1.8121, + "step": 12816 + }, + { + "epoch": 3.9340085942295886, + "grad_norm": 0.21516792476177216, + "learning_rate": 6.919270676381435e-05, + "loss": 1.7305, + "step": 12817 + }, + { + "epoch": 3.934315531000614, + "grad_norm": 0.24698840081691742, + "learning_rate": 6.918811688511646e-05, + "loss": 1.7967, + "step": 12818 + }, + { + "epoch": 3.9346224677716393, + "grad_norm": 0.23132537305355072, + "learning_rate": 6.918352681679035e-05, + "loss": 1.7439, + "step": 12819 + }, + { + "epoch": 3.934929404542664, + "grad_norm": 0.2597793936729431, + "learning_rate": 6.917893655888139e-05, + "loss": 1.7882, + "step": 12820 + }, + { + "epoch": 3.9352363413136895, + "grad_norm": 0.23946607112884521, + "learning_rate": 6.917434611143493e-05, + "loss": 1.7991, + "step": 12821 + }, + { + "epoch": 3.9355432780847144, + "grad_norm": 0.25808244943618774, + "learning_rate": 6.916975547449634e-05, + "loss": 1.845, + "step": 12822 + }, + { + "epoch": 3.9358502148557397, + "grad_norm": 0.26082557439804077, + "learning_rate": 6.9165164648111e-05, + "loss": 1.7562, + "step": 12823 + }, + { + "epoch": 3.936157151626765, + "grad_norm": 0.24810053408145905, + "learning_rate": 6.916057363232425e-05, + "loss": 1.778, + "step": 12824 + }, + { + "epoch": 3.93646408839779, + "grad_norm": 
0.24168157577514648, + "learning_rate": 6.91559824271815e-05, + "loss": 1.7628, + "step": 12825 + }, + { + "epoch": 3.9367710251688153, + "grad_norm": 0.23800434172153473, + "learning_rate": 6.91513910327281e-05, + "loss": 1.8063, + "step": 12826 + }, + { + "epoch": 3.93707796193984, + "grad_norm": 0.23055073618888855, + "learning_rate": 6.914679944900944e-05, + "loss": 1.749, + "step": 12827 + }, + { + "epoch": 3.9373848987108655, + "grad_norm": 0.22455987334251404, + "learning_rate": 6.914220767607088e-05, + "loss": 1.7471, + "step": 12828 + }, + { + "epoch": 3.937691835481891, + "grad_norm": 0.21808378398418427, + "learning_rate": 6.913761571395778e-05, + "loss": 1.7503, + "step": 12829 + }, + { + "epoch": 3.937998772252916, + "grad_norm": 0.23136213421821594, + "learning_rate": 6.913302356271556e-05, + "loss": 1.752, + "step": 12830 + }, + { + "epoch": 3.938305709023941, + "grad_norm": 0.29579970240592957, + "learning_rate": 6.912843122238959e-05, + "loss": 1.8028, + "step": 12831 + }, + { + "epoch": 3.9386126457949664, + "grad_norm": 0.28578072786331177, + "learning_rate": 6.912383869302526e-05, + "loss": 1.8183, + "step": 12832 + }, + { + "epoch": 3.9389195825659913, + "grad_norm": 0.2616737186908722, + "learning_rate": 6.911924597466793e-05, + "loss": 1.8366, + "step": 12833 + }, + { + "epoch": 3.9392265193370166, + "grad_norm": 0.29275768995285034, + "learning_rate": 6.911465306736302e-05, + "loss": 1.731, + "step": 12834 + }, + { + "epoch": 3.939533456108042, + "grad_norm": 0.3300873041152954, + "learning_rate": 6.91100599711559e-05, + "loss": 1.8713, + "step": 12835 + }, + { + "epoch": 3.939840392879067, + "grad_norm": 0.2744643986225128, + "learning_rate": 6.910546668609195e-05, + "loss": 1.8479, + "step": 12836 + }, + { + "epoch": 3.940147329650092, + "grad_norm": 0.25248417258262634, + "learning_rate": 6.91008732122166e-05, + "loss": 1.7962, + "step": 12837 + }, + { + "epoch": 3.940454266421117, + "grad_norm": 0.3068546652793884, + "learning_rate": 
6.909627954957521e-05, + "loss": 1.759, + "step": 12838 + }, + { + "epoch": 3.9407612031921424, + "grad_norm": 0.3273559808731079, + "learning_rate": 6.909168569821321e-05, + "loss": 1.814, + "step": 12839 + }, + { + "epoch": 3.9410681399631677, + "grad_norm": 0.31192758679389954, + "learning_rate": 6.908709165817597e-05, + "loss": 1.7906, + "step": 12840 + }, + { + "epoch": 3.9413750767341926, + "grad_norm": 0.24487090110778809, + "learning_rate": 6.90824974295089e-05, + "loss": 1.8238, + "step": 12841 + }, + { + "epoch": 3.941682013505218, + "grad_norm": 0.24863721430301666, + "learning_rate": 6.907790301225743e-05, + "loss": 1.7651, + "step": 12842 + }, + { + "epoch": 3.941988950276243, + "grad_norm": 0.26555630564689636, + "learning_rate": 6.907330840646693e-05, + "loss": 1.8268, + "step": 12843 + }, + { + "epoch": 3.942295887047268, + "grad_norm": 0.2439817190170288, + "learning_rate": 6.906871361218281e-05, + "loss": 1.7291, + "step": 12844 + }, + { + "epoch": 3.9426028238182935, + "grad_norm": 0.2410304993391037, + "learning_rate": 6.906411862945048e-05, + "loss": 1.712, + "step": 12845 + }, + { + "epoch": 3.942909760589319, + "grad_norm": 0.28575149178504944, + "learning_rate": 6.905952345831537e-05, + "loss": 1.7269, + "step": 12846 + }, + { + "epoch": 3.9432166973603437, + "grad_norm": 0.3055815100669861, + "learning_rate": 6.905492809882286e-05, + "loss": 1.7234, + "step": 12847 + }, + { + "epoch": 3.943523634131369, + "grad_norm": 0.2762533724308014, + "learning_rate": 6.905033255101839e-05, + "loss": 1.7768, + "step": 12848 + }, + { + "epoch": 3.943830570902394, + "grad_norm": 0.22819125652313232, + "learning_rate": 6.904573681494738e-05, + "loss": 1.7416, + "step": 12849 + }, + { + "epoch": 3.9441375076734193, + "grad_norm": 0.21664194762706757, + "learning_rate": 6.904114089065523e-05, + "loss": 1.7506, + "step": 12850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.21935151517391205, + "learning_rate": 6.903654477818735e-05, + "loss": 
1.7522, + "step": 12851 + }, + { + "epoch": 3.9447513812154695, + "grad_norm": 0.2204175442457199, + "learning_rate": 6.903194847758918e-05, + "loss": 1.7753, + "step": 12852 + }, + { + "epoch": 3.945058317986495, + "grad_norm": 0.23130151629447937, + "learning_rate": 6.902735198890615e-05, + "loss": 1.7743, + "step": 12853 + }, + { + "epoch": 3.9453652547575198, + "grad_norm": 0.2548399567604065, + "learning_rate": 6.902275531218368e-05, + "loss": 1.8373, + "step": 12854 + }, + { + "epoch": 3.945672191528545, + "grad_norm": 0.2905479371547699, + "learning_rate": 6.901815844746718e-05, + "loss": 1.8336, + "step": 12855 + }, + { + "epoch": 3.9459791282995704, + "grad_norm": 0.2698945105075836, + "learning_rate": 6.90135613948021e-05, + "loss": 1.7498, + "step": 12856 + }, + { + "epoch": 3.9462860650705953, + "grad_norm": 0.24966828525066376, + "learning_rate": 6.900896415423387e-05, + "loss": 1.7664, + "step": 12857 + }, + { + "epoch": 3.9465930018416207, + "grad_norm": 0.23272784054279327, + "learning_rate": 6.90043667258079e-05, + "loss": 1.7742, + "step": 12858 + }, + { + "epoch": 3.9468999386126455, + "grad_norm": 0.2277698516845703, + "learning_rate": 6.899976910956965e-05, + "loss": 1.7465, + "step": 12859 + }, + { + "epoch": 3.947206875383671, + "grad_norm": 0.2376442402601242, + "learning_rate": 6.899517130556454e-05, + "loss": 1.7995, + "step": 12860 + }, + { + "epoch": 3.947513812154696, + "grad_norm": 0.25591593980789185, + "learning_rate": 6.899057331383802e-05, + "loss": 1.8017, + "step": 12861 + }, + { + "epoch": 3.9478207489257215, + "grad_norm": 0.2715262472629547, + "learning_rate": 6.898597513443551e-05, + "loss": 1.7967, + "step": 12862 + }, + { + "epoch": 3.9481276856967464, + "grad_norm": 0.20916256308555603, + "learning_rate": 6.898137676740246e-05, + "loss": 1.7711, + "step": 12863 + }, + { + "epoch": 3.9484346224677718, + "grad_norm": 0.2570229768753052, + "learning_rate": 6.897677821278435e-05, + "loss": 1.833, + "step": 12864 + }, + { + 
"epoch": 3.9487415592387967, + "grad_norm": 0.26343438029289246, + "learning_rate": 6.897217947062657e-05, + "loss": 1.7625, + "step": 12865 + }, + { + "epoch": 3.949048496009822, + "grad_norm": 0.23407024145126343, + "learning_rate": 6.896758054097459e-05, + "loss": 1.7211, + "step": 12866 + }, + { + "epoch": 3.9493554327808473, + "grad_norm": 0.2554715573787689, + "learning_rate": 6.896298142387387e-05, + "loss": 1.8548, + "step": 12867 + }, + { + "epoch": 3.949662369551872, + "grad_norm": 0.24143370985984802, + "learning_rate": 6.895838211936986e-05, + "loss": 1.7635, + "step": 12868 + }, + { + "epoch": 3.9499693063228976, + "grad_norm": 0.24634715914726257, + "learning_rate": 6.8953782627508e-05, + "loss": 1.8012, + "step": 12869 + }, + { + "epoch": 3.9502762430939224, + "grad_norm": 0.22740426659584045, + "learning_rate": 6.894918294833375e-05, + "loss": 1.7294, + "step": 12870 + }, + { + "epoch": 3.950583179864948, + "grad_norm": 0.2651631832122803, + "learning_rate": 6.894458308189257e-05, + "loss": 1.8289, + "step": 12871 + }, + { + "epoch": 3.950890116635973, + "grad_norm": 0.28693267703056335, + "learning_rate": 6.893998302822991e-05, + "loss": 1.8462, + "step": 12872 + }, + { + "epoch": 3.951197053406998, + "grad_norm": 0.26584213972091675, + "learning_rate": 6.893538278739125e-05, + "loss": 1.7621, + "step": 12873 + }, + { + "epoch": 3.9515039901780233, + "grad_norm": 0.29970669746398926, + "learning_rate": 6.893078235942203e-05, + "loss": 1.7659, + "step": 12874 + }, + { + "epoch": 3.9518109269490482, + "grad_norm": 0.2271152138710022, + "learning_rate": 6.892618174436771e-05, + "loss": 1.7151, + "step": 12875 + }, + { + "epoch": 3.9521178637200736, + "grad_norm": 0.24783682823181152, + "learning_rate": 6.892158094227379e-05, + "loss": 1.761, + "step": 12876 + }, + { + "epoch": 3.952424800491099, + "grad_norm": 0.2371140718460083, + "learning_rate": 6.891697995318573e-05, + "loss": 1.7557, + "step": 12877 + }, + { + "epoch": 3.9527317372621242, + 
"grad_norm": 0.29708394408226013, + "learning_rate": 6.891237877714896e-05, + "loss": 1.8629, + "step": 12878 + }, + { + "epoch": 3.953038674033149, + "grad_norm": 0.2724219262599945, + "learning_rate": 6.890777741420899e-05, + "loss": 1.7378, + "step": 12879 + }, + { + "epoch": 3.9533456108041745, + "grad_norm": 0.2227276861667633, + "learning_rate": 6.890317586441126e-05, + "loss": 1.6989, + "step": 12880 + }, + { + "epoch": 3.9536525475751993, + "grad_norm": 0.2546161413192749, + "learning_rate": 6.889857412780128e-05, + "loss": 1.8688, + "step": 12881 + }, + { + "epoch": 3.9539594843462247, + "grad_norm": 0.24882884323596954, + "learning_rate": 6.889397220442452e-05, + "loss": 1.8137, + "step": 12882 + }, + { + "epoch": 3.95426642111725, + "grad_norm": 0.2549113929271698, + "learning_rate": 6.888937009432644e-05, + "loss": 1.8366, + "step": 12883 + }, + { + "epoch": 3.954573357888275, + "grad_norm": 0.30032673478126526, + "learning_rate": 6.888476779755255e-05, + "loss": 1.8267, + "step": 12884 + }, + { + "epoch": 3.9548802946593002, + "grad_norm": 0.2887294292449951, + "learning_rate": 6.888016531414832e-05, + "loss": 1.8295, + "step": 12885 + }, + { + "epoch": 3.955187231430325, + "grad_norm": 0.2947406470775604, + "learning_rate": 6.88755626441592e-05, + "loss": 1.7713, + "step": 12886 + }, + { + "epoch": 3.9554941682013505, + "grad_norm": 0.2967108190059662, + "learning_rate": 6.887095978763072e-05, + "loss": 1.7636, + "step": 12887 + }, + { + "epoch": 3.955801104972376, + "grad_norm": 0.2495311200618744, + "learning_rate": 6.886635674460836e-05, + "loss": 1.8148, + "step": 12888 + }, + { + "epoch": 3.9561080417434007, + "grad_norm": 0.23367099463939667, + "learning_rate": 6.88617535151376e-05, + "loss": 1.7353, + "step": 12889 + }, + { + "epoch": 3.956414978514426, + "grad_norm": 0.36790570616722107, + "learning_rate": 6.885715009926395e-05, + "loss": 1.7853, + "step": 12890 + }, + { + "epoch": 3.9567219152854514, + "grad_norm": 0.5013020038604736, + 
"learning_rate": 6.885254649703287e-05, + "loss": 1.7923, + "step": 12891 + }, + { + "epoch": 3.9570288520564763, + "grad_norm": 0.4446276128292084, + "learning_rate": 6.884794270848988e-05, + "loss": 1.7504, + "step": 12892 + }, + { + "epoch": 3.9573357888275016, + "grad_norm": 0.2478526383638382, + "learning_rate": 6.88433387336805e-05, + "loss": 1.7629, + "step": 12893 + }, + { + "epoch": 3.957642725598527, + "grad_norm": 0.30111798644065857, + "learning_rate": 6.883873457265019e-05, + "loss": 1.8291, + "step": 12894 + }, + { + "epoch": 3.957949662369552, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.883413022544445e-05, + "loss": 1.7919, + "step": 12895 + }, + { + "epoch": 3.958256599140577, + "grad_norm": 0.2895318269729614, + "learning_rate": 6.882952569210881e-05, + "loss": 1.7467, + "step": 12896 + }, + { + "epoch": 3.958563535911602, + "grad_norm": 0.30391454696655273, + "learning_rate": 6.882492097268873e-05, + "loss": 1.8145, + "step": 12897 + }, + { + "epoch": 3.9588704726826274, + "grad_norm": 0.5033623576164246, + "learning_rate": 6.882031606722977e-05, + "loss": 1.8231, + "step": 12898 + }, + { + "epoch": 3.9591774094536527, + "grad_norm": 0.5351777672767639, + "learning_rate": 6.881571097577742e-05, + "loss": 1.807, + "step": 12899 + }, + { + "epoch": 3.9594843462246776, + "grad_norm": 0.35540491342544556, + "learning_rate": 6.881110569837719e-05, + "loss": 1.7626, + "step": 12900 + }, + { + "epoch": 3.959791282995703, + "grad_norm": 0.22447600960731506, + "learning_rate": 6.880650023507457e-05, + "loss": 1.7392, + "step": 12901 + }, + { + "epoch": 3.960098219766728, + "grad_norm": 0.44619202613830566, + "learning_rate": 6.88018945859151e-05, + "loss": 1.8138, + "step": 12902 + }, + { + "epoch": 3.960405156537753, + "grad_norm": 0.41381633281707764, + "learning_rate": 6.879728875094428e-05, + "loss": 1.7676, + "step": 12903 + }, + { + "epoch": 3.9607120933087785, + "grad_norm": 0.2601528465747833, + "learning_rate": 6.879268273020764e-05, 
+ "loss": 1.8406, + "step": 12904 + }, + { + "epoch": 3.961019030079804, + "grad_norm": 0.3309035003185272, + "learning_rate": 6.878807652375071e-05, + "loss": 1.7673, + "step": 12905 + }, + { + "epoch": 3.9613259668508287, + "grad_norm": 0.5281669497489929, + "learning_rate": 6.878347013161899e-05, + "loss": 1.7686, + "step": 12906 + }, + { + "epoch": 3.961632903621854, + "grad_norm": 0.5397645831108093, + "learning_rate": 6.8778863553858e-05, + "loss": 1.8575, + "step": 12907 + }, + { + "epoch": 3.961939840392879, + "grad_norm": 0.329485684633255, + "learning_rate": 6.877425679051327e-05, + "loss": 1.8185, + "step": 12908 + }, + { + "epoch": 3.9622467771639043, + "grad_norm": 0.3012789487838745, + "learning_rate": 6.876964984163034e-05, + "loss": 1.7962, + "step": 12909 + }, + { + "epoch": 3.9625537139349296, + "grad_norm": 0.5596817135810852, + "learning_rate": 6.876504270725472e-05, + "loss": 1.7972, + "step": 12910 + }, + { + "epoch": 3.9628606507059545, + "grad_norm": 0.5374729633331299, + "learning_rate": 6.876043538743197e-05, + "loss": 1.7863, + "step": 12911 + }, + { + "epoch": 3.96316758747698, + "grad_norm": 0.24617290496826172, + "learning_rate": 6.875582788220757e-05, + "loss": 1.7555, + "step": 12912 + }, + { + "epoch": 3.9634745242480047, + "grad_norm": 0.3493972420692444, + "learning_rate": 6.875122019162712e-05, + "loss": 1.8595, + "step": 12913 + }, + { + "epoch": 3.96378146101903, + "grad_norm": 0.4293089807033539, + "learning_rate": 6.874661231573609e-05, + "loss": 1.7647, + "step": 12914 + }, + { + "epoch": 3.9640883977900554, + "grad_norm": 0.30602574348449707, + "learning_rate": 6.874200425458006e-05, + "loss": 1.7122, + "step": 12915 + }, + { + "epoch": 3.9643953345610803, + "grad_norm": 0.22776013612747192, + "learning_rate": 6.873739600820457e-05, + "loss": 1.7136, + "step": 12916 + }, + { + "epoch": 3.9647022713321056, + "grad_norm": 0.3727327585220337, + "learning_rate": 6.873278757665513e-05, + "loss": 1.8314, + "step": 12917 + }, + { 
+ "epoch": 3.9650092081031305, + "grad_norm": 0.35110536217689514, + "learning_rate": 6.872817895997733e-05, + "loss": 1.7506, + "step": 12918 + }, + { + "epoch": 3.965316144874156, + "grad_norm": 0.275560587644577, + "learning_rate": 6.872357015821666e-05, + "loss": 1.7865, + "step": 12919 + }, + { + "epoch": 3.965623081645181, + "grad_norm": 0.2686980366706848, + "learning_rate": 6.871896117141873e-05, + "loss": 1.8431, + "step": 12920 + }, + { + "epoch": 3.9659300184162065, + "grad_norm": 0.3299664556980133, + "learning_rate": 6.871435199962901e-05, + "loss": 1.7988, + "step": 12921 + }, + { + "epoch": 3.9662369551872314, + "grad_norm": 0.2833637297153473, + "learning_rate": 6.870974264289313e-05, + "loss": 1.6993, + "step": 12922 + }, + { + "epoch": 3.9665438919582567, + "grad_norm": 0.25062620639801025, + "learning_rate": 6.870513310125659e-05, + "loss": 1.7814, + "step": 12923 + }, + { + "epoch": 3.9668508287292816, + "grad_norm": 0.26609909534454346, + "learning_rate": 6.870052337476498e-05, + "loss": 1.7871, + "step": 12924 + }, + { + "epoch": 3.967157765500307, + "grad_norm": 0.22760890424251556, + "learning_rate": 6.869591346346382e-05, + "loss": 1.7941, + "step": 12925 + }, + { + "epoch": 3.9674647022713323, + "grad_norm": 0.2845582067966461, + "learning_rate": 6.869130336739869e-05, + "loss": 1.8215, + "step": 12926 + }, + { + "epoch": 3.967771639042357, + "grad_norm": 0.254948228597641, + "learning_rate": 6.868669308661514e-05, + "loss": 1.7515, + "step": 12927 + }, + { + "epoch": 3.9680785758133825, + "grad_norm": 0.2372167855501175, + "learning_rate": 6.868208262115875e-05, + "loss": 1.7524, + "step": 12928 + }, + { + "epoch": 3.9683855125844074, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.867747197107506e-05, + "loss": 1.8139, + "step": 12929 + }, + { + "epoch": 3.9686924493554327, + "grad_norm": 0.2617839276790619, + "learning_rate": 6.867286113640965e-05, + "loss": 1.7388, + "step": 12930 + }, + { + "epoch": 3.968999386126458, + 
"grad_norm": 0.22749558091163635, + "learning_rate": 6.866825011720807e-05, + "loss": 1.7421, + "step": 12931 + }, + { + "epoch": 3.969306322897483, + "grad_norm": 0.27737462520599365, + "learning_rate": 6.86636389135159e-05, + "loss": 1.7977, + "step": 12932 + }, + { + "epoch": 3.9696132596685083, + "grad_norm": 0.3331063985824585, + "learning_rate": 6.865902752537871e-05, + "loss": 1.7925, + "step": 12933 + }, + { + "epoch": 3.969920196439533, + "grad_norm": 0.24229519069194794, + "learning_rate": 6.86544159528421e-05, + "loss": 1.7782, + "step": 12934 + }, + { + "epoch": 3.9702271332105585, + "grad_norm": 0.29494860768318176, + "learning_rate": 6.86498041959516e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 3.970534069981584, + "grad_norm": 0.26064008474349976, + "learning_rate": 6.86451922547528e-05, + "loss": 1.7161, + "step": 12936 + }, + { + "epoch": 3.970841006752609, + "grad_norm": 0.2656785547733307, + "learning_rate": 6.864058012929129e-05, + "loss": 1.8154, + "step": 12937 + }, + { + "epoch": 3.971147943523634, + "grad_norm": 0.21170997619628906, + "learning_rate": 6.863596781961263e-05, + "loss": 1.7614, + "step": 12938 + }, + { + "epoch": 3.9714548802946594, + "grad_norm": 0.21709072589874268, + "learning_rate": 6.863135532576241e-05, + "loss": 1.7896, + "step": 12939 + }, + { + "epoch": 3.9717618170656843, + "grad_norm": 0.2361367791891098, + "learning_rate": 6.862674264778623e-05, + "loss": 1.7775, + "step": 12940 + }, + { + "epoch": 3.9720687538367097, + "grad_norm": 0.22042550146579742, + "learning_rate": 6.862212978572967e-05, + "loss": 1.7781, + "step": 12941 + }, + { + "epoch": 3.972375690607735, + "grad_norm": 0.2535422146320343, + "learning_rate": 6.86175167396383e-05, + "loss": 1.7665, + "step": 12942 + }, + { + "epoch": 3.97268262737876, + "grad_norm": 0.23741906881332397, + "learning_rate": 6.861290350955771e-05, + "loss": 1.7829, + "step": 12943 + }, + { + "epoch": 3.972989564149785, + "grad_norm": 0.23789910972118378, + 
"learning_rate": 6.860829009553351e-05, + "loss": 1.7745, + "step": 12944 + }, + { + "epoch": 3.97329650092081, + "grad_norm": 0.26867765188217163, + "learning_rate": 6.860367649761127e-05, + "loss": 1.7239, + "step": 12945 + }, + { + "epoch": 3.9736034376918354, + "grad_norm": 0.3211663067340851, + "learning_rate": 6.85990627158366e-05, + "loss": 1.7976, + "step": 12946 + }, + { + "epoch": 3.9739103744628608, + "grad_norm": 0.26177310943603516, + "learning_rate": 6.85944487502551e-05, + "loss": 1.7446, + "step": 12947 + }, + { + "epoch": 3.9742173112338857, + "grad_norm": 0.23622745275497437, + "learning_rate": 6.858983460091234e-05, + "loss": 1.7824, + "step": 12948 + }, + { + "epoch": 3.974524248004911, + "grad_norm": 0.24372988939285278, + "learning_rate": 6.858522026785395e-05, + "loss": 1.8014, + "step": 12949 + }, + { + "epoch": 3.974831184775936, + "grad_norm": 0.2566998600959778, + "learning_rate": 6.85806057511255e-05, + "loss": 1.742, + "step": 12950 + }, + { + "epoch": 3.9751381215469612, + "grad_norm": 0.24418365955352783, + "learning_rate": 6.857599105077264e-05, + "loss": 1.7331, + "step": 12951 + }, + { + "epoch": 3.9754450583179866, + "grad_norm": 0.2260327935218811, + "learning_rate": 6.857137616684094e-05, + "loss": 1.7173, + "step": 12952 + }, + { + "epoch": 3.975751995089012, + "grad_norm": 0.277044415473938, + "learning_rate": 6.856676109937602e-05, + "loss": 1.7255, + "step": 12953 + }, + { + "epoch": 3.976058931860037, + "grad_norm": 0.228300079703331, + "learning_rate": 6.856214584842348e-05, + "loss": 1.7796, + "step": 12954 + }, + { + "epoch": 3.976365868631062, + "grad_norm": 0.2246638983488083, + "learning_rate": 6.855753041402893e-05, + "loss": 1.7458, + "step": 12955 + }, + { + "epoch": 3.976672805402087, + "grad_norm": 0.22235621511936188, + "learning_rate": 6.855291479623799e-05, + "loss": 1.7585, + "step": 12956 + }, + { + "epoch": 3.9769797421731123, + "grad_norm": 0.23710694909095764, + "learning_rate": 6.854829899509627e-05, + 
"loss": 1.767, + "step": 12957 + }, + { + "epoch": 3.9772866789441377, + "grad_norm": 0.2527346611022949, + "learning_rate": 6.854368301064939e-05, + "loss": 1.828, + "step": 12958 + }, + { + "epoch": 3.9775936157151626, + "grad_norm": 0.25032514333724976, + "learning_rate": 6.853906684294298e-05, + "loss": 1.8533, + "step": 12959 + }, + { + "epoch": 3.977900552486188, + "grad_norm": 0.2346320003271103, + "learning_rate": 6.853445049202262e-05, + "loss": 1.8046, + "step": 12960 + }, + { + "epoch": 3.978207489257213, + "grad_norm": 0.22576460242271423, + "learning_rate": 6.852983395793398e-05, + "loss": 1.7502, + "step": 12961 + }, + { + "epoch": 3.978514426028238, + "grad_norm": 0.2230147123336792, + "learning_rate": 6.852521724072266e-05, + "loss": 1.7362, + "step": 12962 + }, + { + "epoch": 3.9788213627992635, + "grad_norm": 0.2339705526828766, + "learning_rate": 6.852060034043425e-05, + "loss": 1.763, + "step": 12963 + }, + { + "epoch": 3.979128299570289, + "grad_norm": 0.24511271715164185, + "learning_rate": 6.851598325711446e-05, + "loss": 1.7988, + "step": 12964 + }, + { + "epoch": 3.9794352363413137, + "grad_norm": 0.2927285134792328, + "learning_rate": 6.851136599080885e-05, + "loss": 1.8346, + "step": 12965 + }, + { + "epoch": 3.979742173112339, + "grad_norm": 0.2593212425708771, + "learning_rate": 6.850674854156305e-05, + "loss": 1.7368, + "step": 12966 + }, + { + "epoch": 3.980049109883364, + "grad_norm": 0.3013291656970978, + "learning_rate": 6.850213090942275e-05, + "loss": 1.7911, + "step": 12967 + }, + { + "epoch": 3.9803560466543892, + "grad_norm": 0.3420047163963318, + "learning_rate": 6.849751309443352e-05, + "loss": 1.7899, + "step": 12968 + }, + { + "epoch": 3.9806629834254146, + "grad_norm": 0.2901746928691864, + "learning_rate": 6.849289509664105e-05, + "loss": 1.8244, + "step": 12969 + }, + { + "epoch": 3.9809699201964395, + "grad_norm": 0.2389298677444458, + "learning_rate": 6.848827691609093e-05, + "loss": 1.7116, + "step": 12970 + }, + { + 
"epoch": 3.981276856967465, + "grad_norm": 0.3153960704803467, + "learning_rate": 6.848365855282882e-05, + "loss": 1.7665, + "step": 12971 + }, + { + "epoch": 3.9815837937384897, + "grad_norm": 0.3162175118923187, + "learning_rate": 6.847904000690036e-05, + "loss": 1.7722, + "step": 12972 + }, + { + "epoch": 3.981890730509515, + "grad_norm": 0.27458643913269043, + "learning_rate": 6.847442127835122e-05, + "loss": 1.8095, + "step": 12973 + }, + { + "epoch": 3.9821976672805404, + "grad_norm": 0.22330710291862488, + "learning_rate": 6.846980236722699e-05, + "loss": 1.7179, + "step": 12974 + }, + { + "epoch": 3.9825046040515653, + "grad_norm": 0.2940923869609833, + "learning_rate": 6.846518327357339e-05, + "loss": 1.7363, + "step": 12975 + }, + { + "epoch": 3.9828115408225906, + "grad_norm": 0.26479849219322205, + "learning_rate": 6.846056399743599e-05, + "loss": 1.7788, + "step": 12976 + }, + { + "epoch": 3.9831184775936155, + "grad_norm": 0.24145057797431946, + "learning_rate": 6.845594453886048e-05, + "loss": 1.7825, + "step": 12977 + }, + { + "epoch": 3.983425414364641, + "grad_norm": 0.2795869708061218, + "learning_rate": 6.845132489789252e-05, + "loss": 1.7705, + "step": 12978 + }, + { + "epoch": 3.983732351135666, + "grad_norm": 0.3117202818393707, + "learning_rate": 6.844670507457776e-05, + "loss": 1.8183, + "step": 12979 + }, + { + "epoch": 3.9840392879066915, + "grad_norm": 0.2666899263858795, + "learning_rate": 6.844208506896184e-05, + "loss": 1.7434, + "step": 12980 + }, + { + "epoch": 3.9843462246777164, + "grad_norm": 0.24682332575321198, + "learning_rate": 6.843746488109042e-05, + "loss": 1.751, + "step": 12981 + }, + { + "epoch": 3.9846531614487417, + "grad_norm": 0.2558208703994751, + "learning_rate": 6.843284451100916e-05, + "loss": 1.7983, + "step": 12982 + }, + { + "epoch": 3.9849600982197666, + "grad_norm": 0.4236481189727783, + "learning_rate": 6.842822395876374e-05, + "loss": 1.8584, + "step": 12983 + }, + { + "epoch": 3.985267034990792, + 
"grad_norm": 0.4931485950946808, + "learning_rate": 6.84236032243998e-05, + "loss": 1.7617, + "step": 12984 + }, + { + "epoch": 3.9855739717618173, + "grad_norm": 0.37793654203414917, + "learning_rate": 6.841898230796302e-05, + "loss": 1.7411, + "step": 12985 + }, + { + "epoch": 3.985880908532842, + "grad_norm": 0.2093842774629593, + "learning_rate": 6.841436120949906e-05, + "loss": 1.772, + "step": 12986 + }, + { + "epoch": 3.9861878453038675, + "grad_norm": 0.4065552055835724, + "learning_rate": 6.840973992905359e-05, + "loss": 1.7675, + "step": 12987 + }, + { + "epoch": 3.9864947820748924, + "grad_norm": 0.5334183573722839, + "learning_rate": 6.840511846667228e-05, + "loss": 1.7872, + "step": 12988 + }, + { + "epoch": 3.9868017188459177, + "grad_norm": 0.378974974155426, + "learning_rate": 6.84004968224008e-05, + "loss": 1.8288, + "step": 12989 + }, + { + "epoch": 3.987108655616943, + "grad_norm": 0.22518309950828552, + "learning_rate": 6.839587499628483e-05, + "loss": 1.7715, + "step": 12990 + }, + { + "epoch": 3.987415592387968, + "grad_norm": 0.4270850718021393, + "learning_rate": 6.839125298837003e-05, + "loss": 1.7797, + "step": 12991 + }, + { + "epoch": 3.9877225291589933, + "grad_norm": 0.4629896879196167, + "learning_rate": 6.838663079870211e-05, + "loss": 1.7936, + "step": 12992 + }, + { + "epoch": 3.988029465930018, + "grad_norm": 0.29273948073387146, + "learning_rate": 6.838200842732672e-05, + "loss": 1.8264, + "step": 12993 + }, + { + "epoch": 3.9883364027010435, + "grad_norm": 0.31575852632522583, + "learning_rate": 6.837738587428954e-05, + "loss": 1.8043, + "step": 12994 + }, + { + "epoch": 3.988643339472069, + "grad_norm": 0.40602433681488037, + "learning_rate": 6.837276313963627e-05, + "loss": 1.7409, + "step": 12995 + }, + { + "epoch": 3.988950276243094, + "grad_norm": 0.23413142561912537, + "learning_rate": 6.836814022341259e-05, + "loss": 1.8585, + "step": 12996 + }, + { + "epoch": 3.989257213014119, + "grad_norm": 0.3518814444541931, + 
"learning_rate": 6.836351712566416e-05, + "loss": 1.7768, + "step": 12997 + }, + { + "epoch": 3.9895641497851444, + "grad_norm": 0.3811505436897278, + "learning_rate": 6.83588938464367e-05, + "loss": 1.7738, + "step": 12998 + }, + { + "epoch": 3.9898710865561693, + "grad_norm": 0.2516780197620392, + "learning_rate": 6.835427038577589e-05, + "loss": 1.7351, + "step": 12999 + }, + { + "epoch": 3.9901780233271946, + "grad_norm": 0.23704510927200317, + "learning_rate": 6.834964674372744e-05, + "loss": 1.7907, + "step": 13000 + }, + { + "epoch": 3.99048496009822, + "grad_norm": 0.2890201807022095, + "learning_rate": 6.8345022920337e-05, + "loss": 1.9546, + "step": 13001 + }, + { + "epoch": 3.990791896869245, + "grad_norm": 0.2678101360797882, + "learning_rate": 6.834039891565031e-05, + "loss": 1.7338, + "step": 13002 + }, + { + "epoch": 3.99109883364027, + "grad_norm": 0.31726256012916565, + "learning_rate": 6.833577472971304e-05, + "loss": 1.8464, + "step": 13003 + }, + { + "epoch": 3.991405770411295, + "grad_norm": 0.28112682700157166, + "learning_rate": 6.83311503625709e-05, + "loss": 1.7427, + "step": 13004 + }, + { + "epoch": 3.9917127071823204, + "grad_norm": 0.2651563584804535, + "learning_rate": 6.832652581426958e-05, + "loss": 1.8117, + "step": 13005 + }, + { + "epoch": 3.9920196439533457, + "grad_norm": 0.3095388114452362, + "learning_rate": 6.83219010848548e-05, + "loss": 1.8286, + "step": 13006 + }, + { + "epoch": 3.9923265807243706, + "grad_norm": 0.24704942107200623, + "learning_rate": 6.831727617437225e-05, + "loss": 1.77, + "step": 13007 + }, + { + "epoch": 3.992633517495396, + "grad_norm": 0.24868519604206085, + "learning_rate": 6.831265108286764e-05, + "loss": 1.8129, + "step": 13008 + }, + { + "epoch": 3.992940454266421, + "grad_norm": 0.26511049270629883, + "learning_rate": 6.830802581038669e-05, + "loss": 1.7539, + "step": 13009 + }, + { + "epoch": 3.993247391037446, + "grad_norm": 0.2823421061038971, + "learning_rate": 6.830340035697508e-05, + 
"loss": 1.8068, + "step": 13010 + }, + { + "epoch": 3.9935543278084715, + "grad_norm": 0.28526121377944946, + "learning_rate": 6.829877472267856e-05, + "loss": 1.764, + "step": 13011 + }, + { + "epoch": 3.993861264579497, + "grad_norm": 0.2576456069946289, + "learning_rate": 6.829414890754281e-05, + "loss": 1.728, + "step": 13012 + }, + { + "epoch": 3.9941682013505218, + "grad_norm": 0.27154842019081116, + "learning_rate": 6.828952291161356e-05, + "loss": 1.797, + "step": 13013 + }, + { + "epoch": 3.994475138121547, + "grad_norm": 0.3129710555076599, + "learning_rate": 6.828489673493652e-05, + "loss": 1.769, + "step": 13014 + }, + { + "epoch": 3.994782074892572, + "grad_norm": 0.40118902921676636, + "learning_rate": 6.828027037755742e-05, + "loss": 1.8029, + "step": 13015 + }, + { + "epoch": 3.9950890116635973, + "grad_norm": 0.33228442072868347, + "learning_rate": 6.827564383952197e-05, + "loss": 1.7295, + "step": 13016 + }, + { + "epoch": 3.9953959484346226, + "grad_norm": 0.218771830201149, + "learning_rate": 6.827101712087591e-05, + "loss": 1.7693, + "step": 13017 + }, + { + "epoch": 3.9957028852056475, + "grad_norm": 0.31354373693466187, + "learning_rate": 6.826639022166492e-05, + "loss": 1.743, + "step": 13018 + }, + { + "epoch": 3.996009821976673, + "grad_norm": 0.3584701418876648, + "learning_rate": 6.826176314193478e-05, + "loss": 1.7597, + "step": 13019 + }, + { + "epoch": 3.9963167587476978, + "grad_norm": 0.2692064344882965, + "learning_rate": 6.82571358817312e-05, + "loss": 1.7871, + "step": 13020 + }, + { + "epoch": 3.996623695518723, + "grad_norm": 0.3064020276069641, + "learning_rate": 6.825250844109987e-05, + "loss": 1.7858, + "step": 13021 + }, + { + "epoch": 3.9969306322897484, + "grad_norm": 0.29913413524627686, + "learning_rate": 6.824788082008657e-05, + "loss": 1.7773, + "step": 13022 + }, + { + "epoch": 3.9972375690607733, + "grad_norm": 0.2682165801525116, + "learning_rate": 6.824325301873703e-05, + "loss": 1.8321, + "step": 13023 + }, + { + 
"epoch": 3.9975445058317987, + "grad_norm": 0.3274376690387726, + "learning_rate": 6.823862503709694e-05, + "loss": 1.8514, + "step": 13024 + }, + { + "epoch": 3.9978514426028235, + "grad_norm": 0.29828041791915894, + "learning_rate": 6.823399687521211e-05, + "loss": 1.7923, + "step": 13025 + }, + { + "epoch": 3.998158379373849, + "grad_norm": 0.22339288890361786, + "learning_rate": 6.82293685331282e-05, + "loss": 1.756, + "step": 13026 + }, + { + "epoch": 3.998465316144874, + "grad_norm": 0.2254658192396164, + "learning_rate": 6.8224740010891e-05, + "loss": 1.7392, + "step": 13027 + }, + { + "epoch": 3.9987722529158995, + "grad_norm": 0.24932752549648285, + "learning_rate": 6.822011130854624e-05, + "loss": 1.7538, + "step": 13028 + }, + { + "epoch": 3.9990791896869244, + "grad_norm": 0.21429690718650818, + "learning_rate": 6.821548242613966e-05, + "loss": 1.7746, + "step": 13029 + }, + { + "epoch": 3.9993861264579498, + "grad_norm": 0.25503116846084595, + "learning_rate": 6.8210853363717e-05, + "loss": 1.814, + "step": 13030 + }, + { + "epoch": 3.9996930632289747, + "grad_norm": 0.23168155550956726, + "learning_rate": 6.820622412132402e-05, + "loss": 1.769, + "step": 13031 + }, + { + "epoch": 4.0, + "grad_norm": 0.2252223789691925, + "learning_rate": 6.820159469900645e-05, + "loss": 1.7782, + "step": 13032 + }, + { + "epoch": 4.000306936771025, + "grad_norm": 0.1996588408946991, + "learning_rate": 6.819696509681007e-05, + "loss": 1.6839, + "step": 13033 + }, + { + "epoch": 4.000613873542051, + "grad_norm": 0.22297053039073944, + "learning_rate": 6.81923353147806e-05, + "loss": 1.7767, + "step": 13034 + }, + { + "epoch": 4.000920810313075, + "grad_norm": 0.25867611169815063, + "learning_rate": 6.818770535296381e-05, + "loss": 1.8623, + "step": 13035 + }, + { + "epoch": 4.0012277470841005, + "grad_norm": 0.2173648178577423, + "learning_rate": 6.818307521140547e-05, + "loss": 1.8034, + "step": 13036 + }, + { + "epoch": 4.001534683855126, + "grad_norm": 
0.23634609580039978, + "learning_rate": 6.81784448901513e-05, + "loss": 1.7503, + "step": 13037 + }, + { + "epoch": 4.001841620626151, + "grad_norm": 0.2626810073852539, + "learning_rate": 6.81738143892471e-05, + "loss": 1.8116, + "step": 13038 + }, + { + "epoch": 4.0021485573971765, + "grad_norm": 0.27888983488082886, + "learning_rate": 6.816918370873861e-05, + "loss": 1.8032, + "step": 13039 + }, + { + "epoch": 4.002455494168202, + "grad_norm": 0.275038480758667, + "learning_rate": 6.816455284867162e-05, + "loss": 1.7445, + "step": 13040 + }, + { + "epoch": 4.002762430939226, + "grad_norm": 0.3475828170776367, + "learning_rate": 6.815992180909184e-05, + "loss": 1.7404, + "step": 13041 + }, + { + "epoch": 4.003069367710252, + "grad_norm": 0.27314287424087524, + "learning_rate": 6.815529059004507e-05, + "loss": 1.8333, + "step": 13042 + }, + { + "epoch": 4.003376304481277, + "grad_norm": 0.34846973419189453, + "learning_rate": 6.815065919157709e-05, + "loss": 1.7921, + "step": 13043 + }, + { + "epoch": 4.003683241252302, + "grad_norm": 0.4191788136959076, + "learning_rate": 6.814602761373365e-05, + "loss": 1.8018, + "step": 13044 + }, + { + "epoch": 4.003990178023328, + "grad_norm": 0.2655608057975769, + "learning_rate": 6.814139585656055e-05, + "loss": 1.7638, + "step": 13045 + }, + { + "epoch": 4.004297114794352, + "grad_norm": 0.25938618183135986, + "learning_rate": 6.813676392010353e-05, + "loss": 1.794, + "step": 13046 + }, + { + "epoch": 4.004604051565377, + "grad_norm": 0.3464813828468323, + "learning_rate": 6.813213180440837e-05, + "loss": 1.8662, + "step": 13047 + }, + { + "epoch": 4.004910988336403, + "grad_norm": 0.30185338854789734, + "learning_rate": 6.812749950952087e-05, + "loss": 1.8029, + "step": 13048 + }, + { + "epoch": 4.005217925107428, + "grad_norm": 0.23291908204555511, + "learning_rate": 6.812286703548678e-05, + "loss": 1.7365, + "step": 13049 + }, + { + "epoch": 4.005524861878453, + "grad_norm": 0.3542841374874115, + "learning_rate": 
6.811823438235189e-05, + "loss": 1.8674, + "step": 13050 + }, + { + "epoch": 4.005831798649478, + "grad_norm": 0.2914685606956482, + "learning_rate": 6.811360155016202e-05, + "loss": 1.8306, + "step": 13051 + }, + { + "epoch": 4.006138735420503, + "grad_norm": 0.24888737499713898, + "learning_rate": 6.810896853896289e-05, + "loss": 1.7767, + "step": 13052 + }, + { + "epoch": 4.0064456721915285, + "grad_norm": 0.2977537512779236, + "learning_rate": 6.810433534880033e-05, + "loss": 1.8227, + "step": 13053 + }, + { + "epoch": 4.006752608962554, + "grad_norm": 0.3367510735988617, + "learning_rate": 6.809970197972013e-05, + "loss": 1.734, + "step": 13054 + }, + { + "epoch": 4.007059545733579, + "grad_norm": 0.28098800778388977, + "learning_rate": 6.809506843176806e-05, + "loss": 1.7032, + "step": 13055 + }, + { + "epoch": 4.0073664825046045, + "grad_norm": 0.24016784131526947, + "learning_rate": 6.809043470498991e-05, + "loss": 1.7863, + "step": 13056 + }, + { + "epoch": 4.007673419275629, + "grad_norm": 0.2883957624435425, + "learning_rate": 6.808580079943148e-05, + "loss": 1.7342, + "step": 13057 + }, + { + "epoch": 4.007980356046654, + "grad_norm": 0.3069116473197937, + "learning_rate": 6.808116671513856e-05, + "loss": 1.8544, + "step": 13058 + }, + { + "epoch": 4.00828729281768, + "grad_norm": 0.24113236367702484, + "learning_rate": 6.807653245215697e-05, + "loss": 1.7692, + "step": 13059 + }, + { + "epoch": 4.008594229588705, + "grad_norm": 0.2651619017124176, + "learning_rate": 6.807189801053249e-05, + "loss": 1.8096, + "step": 13060 + }, + { + "epoch": 4.00890116635973, + "grad_norm": 0.2636481523513794, + "learning_rate": 6.806726339031092e-05, + "loss": 1.8062, + "step": 13061 + }, + { + "epoch": 4.009208103130755, + "grad_norm": 0.22691169381141663, + "learning_rate": 6.806262859153807e-05, + "loss": 1.7001, + "step": 13062 + }, + { + "epoch": 4.00951503990178, + "grad_norm": 0.23288170993328094, + "learning_rate": 6.805799361425972e-05, + "loss": 1.7508, + 
"step": 13063 + }, + { + "epoch": 4.009821976672805, + "grad_norm": 0.243272602558136, + "learning_rate": 6.80533584585217e-05, + "loss": 1.7797, + "step": 13064 + }, + { + "epoch": 4.010128913443831, + "grad_norm": 0.24594646692276, + "learning_rate": 6.80487231243698e-05, + "loss": 1.7894, + "step": 13065 + }, + { + "epoch": 4.010435850214856, + "grad_norm": 0.21726086735725403, + "learning_rate": 6.804408761184986e-05, + "loss": 1.7472, + "step": 13066 + }, + { + "epoch": 4.0107427869858805, + "grad_norm": 0.2262321561574936, + "learning_rate": 6.803945192100767e-05, + "loss": 1.7563, + "step": 13067 + }, + { + "epoch": 4.011049723756906, + "grad_norm": 0.2449522763490677, + "learning_rate": 6.803481605188903e-05, + "loss": 1.7282, + "step": 13068 + }, + { + "epoch": 4.011356660527931, + "grad_norm": 0.2281760573387146, + "learning_rate": 6.803018000453975e-05, + "loss": 1.8191, + "step": 13069 + }, + { + "epoch": 4.0116635972989565, + "grad_norm": 0.3039850890636444, + "learning_rate": 6.80255437790057e-05, + "loss": 1.8258, + "step": 13070 + }, + { + "epoch": 4.011970534069982, + "grad_norm": 0.3978467881679535, + "learning_rate": 6.802090737533264e-05, + "loss": 1.7338, + "step": 13071 + }, + { + "epoch": 4.012277470841007, + "grad_norm": 0.29175812005996704, + "learning_rate": 6.801627079356641e-05, + "loss": 1.7754, + "step": 13072 + }, + { + "epoch": 4.012584407612032, + "grad_norm": 0.24228449165821075, + "learning_rate": 6.801163403375285e-05, + "loss": 1.7624, + "step": 13073 + }, + { + "epoch": 4.012891344383057, + "grad_norm": 0.34527531266212463, + "learning_rate": 6.800699709593776e-05, + "loss": 1.87, + "step": 13074 + }, + { + "epoch": 4.013198281154082, + "grad_norm": 0.1995161920785904, + "learning_rate": 6.800235998016696e-05, + "loss": 1.7253, + "step": 13075 + }, + { + "epoch": 4.013505217925108, + "grad_norm": 0.3509151339530945, + "learning_rate": 6.799772268648628e-05, + "loss": 1.8013, + "step": 13076 + }, + { + "epoch": 
4.013812154696133, + "grad_norm": 0.38569679856300354, + "learning_rate": 6.799308521494156e-05, + "loss": 1.7761, + "step": 13077 + }, + { + "epoch": 4.014119091467157, + "grad_norm": 0.2636256814002991, + "learning_rate": 6.798844756557865e-05, + "loss": 1.8101, + "step": 13078 + }, + { + "epoch": 4.014426028238183, + "grad_norm": 0.2570696473121643, + "learning_rate": 6.798380973844335e-05, + "loss": 1.7561, + "step": 13079 + }, + { + "epoch": 4.014732965009208, + "grad_norm": 0.38540002703666687, + "learning_rate": 6.797917173358148e-05, + "loss": 1.7893, + "step": 13080 + }, + { + "epoch": 4.015039901780233, + "grad_norm": 0.2974525988101959, + "learning_rate": 6.79745335510389e-05, + "loss": 1.8331, + "step": 13081 + }, + { + "epoch": 4.015346838551259, + "grad_norm": 0.2563362419605255, + "learning_rate": 6.796989519086146e-05, + "loss": 1.7784, + "step": 13082 + }, + { + "epoch": 4.015653775322283, + "grad_norm": 0.37037795782089233, + "learning_rate": 6.7965256653095e-05, + "loss": 1.7947, + "step": 13083 + }, + { + "epoch": 4.0159607120933085, + "grad_norm": 0.4145336449146271, + "learning_rate": 6.796061793778531e-05, + "loss": 1.7633, + "step": 13084 + }, + { + "epoch": 4.016267648864334, + "grad_norm": 0.32278406620025635, + "learning_rate": 6.795597904497828e-05, + "loss": 1.7827, + "step": 13085 + }, + { + "epoch": 4.016574585635359, + "grad_norm": 0.26466837525367737, + "learning_rate": 6.795133997471974e-05, + "loss": 1.7441, + "step": 13086 + }, + { + "epoch": 4.0168815224063845, + "grad_norm": 0.3212043344974518, + "learning_rate": 6.794670072705553e-05, + "loss": 1.7602, + "step": 13087 + }, + { + "epoch": 4.01718845917741, + "grad_norm": 0.3054736852645874, + "learning_rate": 6.79420613020315e-05, + "loss": 1.7417, + "step": 13088 + }, + { + "epoch": 4.017495395948434, + "grad_norm": 0.22281476855278015, + "learning_rate": 6.793742169969351e-05, + "loss": 1.7675, + "step": 13089 + }, + { + "epoch": 4.01780233271946, + "grad_norm": 
0.32630839943885803, + "learning_rate": 6.793278192008742e-05, + "loss": 1.8409, + "step": 13090 + }, + { + "epoch": 4.018109269490485, + "grad_norm": 0.2658778429031372, + "learning_rate": 6.792814196325905e-05, + "loss": 1.7718, + "step": 13091 + }, + { + "epoch": 4.01841620626151, + "grad_norm": 0.24016901850700378, + "learning_rate": 6.792350182925429e-05, + "loss": 1.8393, + "step": 13092 + }, + { + "epoch": 4.018723143032536, + "grad_norm": 0.2882223427295685, + "learning_rate": 6.791886151811897e-05, + "loss": 1.7497, + "step": 13093 + }, + { + "epoch": 4.01903007980356, + "grad_norm": 0.24340751767158508, + "learning_rate": 6.791422102989895e-05, + "loss": 1.72, + "step": 13094 + }, + { + "epoch": 4.019337016574585, + "grad_norm": 0.235665962100029, + "learning_rate": 6.79095803646401e-05, + "loss": 1.7269, + "step": 13095 + }, + { + "epoch": 4.019643953345611, + "grad_norm": 0.32772955298423767, + "learning_rate": 6.79049395223883e-05, + "loss": 1.7916, + "step": 13096 + }, + { + "epoch": 4.019950890116636, + "grad_norm": 0.3189625144004822, + "learning_rate": 6.790029850318938e-05, + "loss": 1.7571, + "step": 13097 + }, + { + "epoch": 4.020257826887661, + "grad_norm": 0.2211185097694397, + "learning_rate": 6.789565730708921e-05, + "loss": 1.793, + "step": 13098 + }, + { + "epoch": 4.020564763658686, + "grad_norm": 0.2840392291545868, + "learning_rate": 6.789101593413367e-05, + "loss": 1.7434, + "step": 13099 + }, + { + "epoch": 4.020871700429711, + "grad_norm": 0.27857357263565063, + "learning_rate": 6.788637438436863e-05, + "loss": 1.742, + "step": 13100 + }, + { + "epoch": 4.0211786372007365, + "grad_norm": 0.314628005027771, + "learning_rate": 6.788173265783996e-05, + "loss": 1.7881, + "step": 13101 + }, + { + "epoch": 4.021485573971762, + "grad_norm": 0.2994774580001831, + "learning_rate": 6.787709075459352e-05, + "loss": 1.7741, + "step": 13102 + }, + { + "epoch": 4.021792510742787, + "grad_norm": 0.3256312310695648, + "learning_rate": 
6.787244867467519e-05, + "loss": 1.7758, + "step": 13103 + }, + { + "epoch": 4.0220994475138125, + "grad_norm": 0.2332412451505661, + "learning_rate": 6.786780641813083e-05, + "loss": 1.7654, + "step": 13104 + }, + { + "epoch": 4.022406384284837, + "grad_norm": 0.23226258158683777, + "learning_rate": 6.786316398500636e-05, + "loss": 1.7605, + "step": 13105 + }, + { + "epoch": 4.022713321055862, + "grad_norm": 0.24631965160369873, + "learning_rate": 6.785852137534763e-05, + "loss": 1.7469, + "step": 13106 + }, + { + "epoch": 4.023020257826888, + "grad_norm": 0.1969226449728012, + "learning_rate": 6.785387858920051e-05, + "loss": 1.8151, + "step": 13107 + }, + { + "epoch": 4.023327194597913, + "grad_norm": 0.22769485414028168, + "learning_rate": 6.784923562661091e-05, + "loss": 1.7024, + "step": 13108 + }, + { + "epoch": 4.023634131368938, + "grad_norm": 0.2174670249223709, + "learning_rate": 6.78445924876247e-05, + "loss": 1.8094, + "step": 13109 + }, + { + "epoch": 4.023941068139963, + "grad_norm": 0.2606858015060425, + "learning_rate": 6.783994917228775e-05, + "loss": 1.8043, + "step": 13110 + }, + { + "epoch": 4.024248004910988, + "grad_norm": 0.24721349775791168, + "learning_rate": 6.783530568064599e-05, + "loss": 1.842, + "step": 13111 + }, + { + "epoch": 4.024554941682013, + "grad_norm": 0.2353603094816208, + "learning_rate": 6.783066201274529e-05, + "loss": 1.76, + "step": 13112 + }, + { + "epoch": 4.024861878453039, + "grad_norm": 0.22285830974578857, + "learning_rate": 6.782601816863153e-05, + "loss": 1.8014, + "step": 13113 + }, + { + "epoch": 4.025168815224064, + "grad_norm": 0.2482440173625946, + "learning_rate": 6.782137414835061e-05, + "loss": 1.7552, + "step": 13114 + }, + { + "epoch": 4.0254757519950894, + "grad_norm": 0.19926191866397858, + "learning_rate": 6.781672995194842e-05, + "loss": 1.7549, + "step": 13115 + }, + { + "epoch": 4.025782688766114, + "grad_norm": 0.2342877984046936, + "learning_rate": 6.781208557947086e-05, + "loss": 1.8622, + 
"step": 13116 + }, + { + "epoch": 4.026089625537139, + "grad_norm": 0.24096547067165375, + "learning_rate": 6.780744103096382e-05, + "loss": 1.7795, + "step": 13117 + }, + { + "epoch": 4.026396562308165, + "grad_norm": 0.23714657127857208, + "learning_rate": 6.780279630647322e-05, + "loss": 1.799, + "step": 13118 + }, + { + "epoch": 4.02670349907919, + "grad_norm": 0.28252026438713074, + "learning_rate": 6.779815140604496e-05, + "loss": 1.7573, + "step": 13119 + }, + { + "epoch": 4.027010435850215, + "grad_norm": 0.28028404712677, + "learning_rate": 6.779350632972493e-05, + "loss": 1.8103, + "step": 13120 + }, + { + "epoch": 4.02731737262124, + "grad_norm": 0.21088312566280365, + "learning_rate": 6.778886107755904e-05, + "loss": 1.7169, + "step": 13121 + }, + { + "epoch": 4.027624309392265, + "grad_norm": 0.22282038629055023, + "learning_rate": 6.77842156495932e-05, + "loss": 1.7206, + "step": 13122 + }, + { + "epoch": 4.02793124616329, + "grad_norm": 0.3281327784061432, + "learning_rate": 6.777957004587331e-05, + "loss": 1.8664, + "step": 13123 + }, + { + "epoch": 4.028238182934316, + "grad_norm": 0.29496827721595764, + "learning_rate": 6.77749242664453e-05, + "loss": 1.7532, + "step": 13124 + }, + { + "epoch": 4.028545119705341, + "grad_norm": 0.25299328565597534, + "learning_rate": 6.777027831135508e-05, + "loss": 1.7836, + "step": 13125 + }, + { + "epoch": 4.0288520564763655, + "grad_norm": 0.3000280559062958, + "learning_rate": 6.776563218064854e-05, + "loss": 1.8079, + "step": 13126 + }, + { + "epoch": 4.029158993247391, + "grad_norm": 0.3613673448562622, + "learning_rate": 6.77609858743716e-05, + "loss": 1.7931, + "step": 13127 + }, + { + "epoch": 4.029465930018416, + "grad_norm": 0.25613468885421753, + "learning_rate": 6.77563393925702e-05, + "loss": 1.7522, + "step": 13128 + }, + { + "epoch": 4.0297728667894415, + "grad_norm": 0.24391578137874603, + "learning_rate": 6.775169273529026e-05, + "loss": 1.818, + "step": 13129 + }, + { + "epoch": 
4.030079803560467, + "grad_norm": 0.2806173264980316, + "learning_rate": 6.774704590257768e-05, + "loss": 1.7349, + "step": 13130 + }, + { + "epoch": 4.030386740331492, + "grad_norm": 0.22214172780513763, + "learning_rate": 6.774239889447838e-05, + "loss": 1.759, + "step": 13131 + }, + { + "epoch": 4.030693677102517, + "grad_norm": 0.27285513281822205, + "learning_rate": 6.773775171103828e-05, + "loss": 1.742, + "step": 13132 + }, + { + "epoch": 4.031000613873542, + "grad_norm": 0.22302402555942535, + "learning_rate": 6.773310435230334e-05, + "loss": 1.7277, + "step": 13133 + }, + { + "epoch": 4.031307550644567, + "grad_norm": 0.2350187450647354, + "learning_rate": 6.772845681831947e-05, + "loss": 1.8648, + "step": 13134 + }, + { + "epoch": 4.031614487415593, + "grad_norm": 0.2665547728538513, + "learning_rate": 6.772380910913261e-05, + "loss": 1.776, + "step": 13135 + }, + { + "epoch": 4.031921424186618, + "grad_norm": 0.30652403831481934, + "learning_rate": 6.771916122478867e-05, + "loss": 1.7884, + "step": 13136 + }, + { + "epoch": 4.032228360957642, + "grad_norm": 0.29372814297676086, + "learning_rate": 6.771451316533359e-05, + "loss": 1.8203, + "step": 13137 + }, + { + "epoch": 4.032535297728668, + "grad_norm": 0.2244873046875, + "learning_rate": 6.770986493081329e-05, + "loss": 1.7869, + "step": 13138 + }, + { + "epoch": 4.032842234499693, + "grad_norm": 0.25075265765190125, + "learning_rate": 6.770521652127375e-05, + "loss": 1.772, + "step": 13139 + }, + { + "epoch": 4.033149171270718, + "grad_norm": 0.28118211030960083, + "learning_rate": 6.770056793676087e-05, + "loss": 1.7922, + "step": 13140 + }, + { + "epoch": 4.033456108041744, + "grad_norm": 0.25199100375175476, + "learning_rate": 6.769591917732062e-05, + "loss": 1.7526, + "step": 13141 + }, + { + "epoch": 4.033763044812768, + "grad_norm": 0.2920379638671875, + "learning_rate": 6.769127024299892e-05, + "loss": 1.8365, + "step": 13142 + }, + { + "epoch": 4.0340699815837935, + "grad_norm": 
0.23018018901348114, + "learning_rate": 6.768662113384171e-05, + "loss": 1.7411, + "step": 13143 + }, + { + "epoch": 4.034376918354819, + "grad_norm": 0.23253841698169708, + "learning_rate": 6.768197184989494e-05, + "loss": 1.7921, + "step": 13144 + }, + { + "epoch": 4.034683855125844, + "grad_norm": 0.22618864476680756, + "learning_rate": 6.767732239120456e-05, + "loss": 1.7421, + "step": 13145 + }, + { + "epoch": 4.0349907918968695, + "grad_norm": 0.24552187323570251, + "learning_rate": 6.767267275781655e-05, + "loss": 1.7299, + "step": 13146 + }, + { + "epoch": 4.035297728667895, + "grad_norm": 0.22562766075134277, + "learning_rate": 6.76680229497768e-05, + "loss": 1.766, + "step": 13147 + }, + { + "epoch": 4.035604665438919, + "grad_norm": 0.28718629479408264, + "learning_rate": 6.76633729671313e-05, + "loss": 1.7366, + "step": 13148 + }, + { + "epoch": 4.035911602209945, + "grad_norm": 0.38769885897636414, + "learning_rate": 6.765872280992598e-05, + "loss": 1.8244, + "step": 13149 + }, + { + "epoch": 4.03621853898097, + "grad_norm": 0.4232725501060486, + "learning_rate": 6.765407247820683e-05, + "loss": 1.8244, + "step": 13150 + }, + { + "epoch": 4.036525475751995, + "grad_norm": 0.2771088778972626, + "learning_rate": 6.764942197201977e-05, + "loss": 1.7863, + "step": 13151 + }, + { + "epoch": 4.036832412523021, + "grad_norm": 0.2917862832546234, + "learning_rate": 6.76447712914108e-05, + "loss": 1.791, + "step": 13152 + }, + { + "epoch": 4.037139349294045, + "grad_norm": 0.37355467677116394, + "learning_rate": 6.764012043642584e-05, + "loss": 1.74, + "step": 13153 + }, + { + "epoch": 4.03744628606507, + "grad_norm": 0.35664018988609314, + "learning_rate": 6.763546940711089e-05, + "loss": 1.7734, + "step": 13154 + }, + { + "epoch": 4.037753222836096, + "grad_norm": 0.2335754930973053, + "learning_rate": 6.763081820351188e-05, + "loss": 1.7765, + "step": 13155 + }, + { + "epoch": 4.038060159607121, + "grad_norm": 0.2825562357902527, + "learning_rate": 
6.762616682567478e-05, + "loss": 1.7867, + "step": 13156 + }, + { + "epoch": 4.038367096378146, + "grad_norm": 0.3103202283382416, + "learning_rate": 6.762151527364559e-05, + "loss": 1.7331, + "step": 13157 + }, + { + "epoch": 4.038674033149171, + "grad_norm": 0.2897353172302246, + "learning_rate": 6.761686354747025e-05, + "loss": 1.7638, + "step": 13158 + }, + { + "epoch": 4.038980969920196, + "grad_norm": 0.21260851621627808, + "learning_rate": 6.761221164719474e-05, + "loss": 1.7302, + "step": 13159 + }, + { + "epoch": 4.0392879066912215, + "grad_norm": 0.2878021001815796, + "learning_rate": 6.760755957286503e-05, + "loss": 1.7368, + "step": 13160 + }, + { + "epoch": 4.039594843462247, + "grad_norm": 0.2785978317260742, + "learning_rate": 6.76029073245271e-05, + "loss": 1.7258, + "step": 13161 + }, + { + "epoch": 4.039901780233272, + "grad_norm": 0.1963953971862793, + "learning_rate": 6.759825490222692e-05, + "loss": 1.755, + "step": 13162 + }, + { + "epoch": 4.0402087170042975, + "grad_norm": 0.26776790618896484, + "learning_rate": 6.759360230601047e-05, + "loss": 1.7676, + "step": 13163 + }, + { + "epoch": 4.040515653775322, + "grad_norm": 0.2751332223415375, + "learning_rate": 6.758894953592373e-05, + "loss": 1.7313, + "step": 13164 + }, + { + "epoch": 4.040822590546347, + "grad_norm": 0.2339213341474533, + "learning_rate": 6.758429659201269e-05, + "loss": 1.714, + "step": 13165 + }, + { + "epoch": 4.041129527317373, + "grad_norm": 0.2624664008617401, + "learning_rate": 6.75796434743233e-05, + "loss": 1.8296, + "step": 13166 + }, + { + "epoch": 4.041436464088398, + "grad_norm": 0.40156883001327515, + "learning_rate": 6.757499018290159e-05, + "loss": 1.8228, + "step": 13167 + }, + { + "epoch": 4.041743400859423, + "grad_norm": 0.32976576685905457, + "learning_rate": 6.757033671779352e-05, + "loss": 1.7403, + "step": 13168 + }, + { + "epoch": 4.042050337630448, + "grad_norm": 0.2343887835741043, + "learning_rate": 6.756568307904508e-05, + "loss": 1.7837, + 
"step": 13169 + }, + { + "epoch": 4.042357274401473, + "grad_norm": 0.36174145340919495, + "learning_rate": 6.756102926670227e-05, + "loss": 1.7291, + "step": 13170 + }, + { + "epoch": 4.042664211172498, + "grad_norm": 0.3324793577194214, + "learning_rate": 6.755637528081108e-05, + "loss": 1.7414, + "step": 13171 + }, + { + "epoch": 4.042971147943524, + "grad_norm": 0.21945348381996155, + "learning_rate": 6.75517211214175e-05, + "loss": 1.7762, + "step": 13172 + }, + { + "epoch": 4.043278084714549, + "grad_norm": 0.31069812178611755, + "learning_rate": 6.75470667885675e-05, + "loss": 1.7666, + "step": 13173 + }, + { + "epoch": 4.043585021485574, + "grad_norm": 0.3931153118610382, + "learning_rate": 6.754241228230713e-05, + "loss": 1.7871, + "step": 13174 + }, + { + "epoch": 4.043891958256599, + "grad_norm": 0.25559595227241516, + "learning_rate": 6.753775760268234e-05, + "loss": 1.7916, + "step": 13175 + }, + { + "epoch": 4.044198895027624, + "grad_norm": 0.3686937391757965, + "learning_rate": 6.753310274973917e-05, + "loss": 1.7642, + "step": 13176 + }, + { + "epoch": 4.0445058317986495, + "grad_norm": 0.4793247580528259, + "learning_rate": 6.75284477235236e-05, + "loss": 1.739, + "step": 13177 + }, + { + "epoch": 4.044812768569675, + "grad_norm": 0.36179354786872864, + "learning_rate": 6.752379252408164e-05, + "loss": 1.7993, + "step": 13178 + }, + { + "epoch": 4.0451197053407, + "grad_norm": 0.22559234499931335, + "learning_rate": 6.751913715145926e-05, + "loss": 1.7401, + "step": 13179 + }, + { + "epoch": 4.045426642111725, + "grad_norm": 0.29058873653411865, + "learning_rate": 6.751448160570253e-05, + "loss": 1.8089, + "step": 13180 + }, + { + "epoch": 4.04573357888275, + "grad_norm": 0.3069808781147003, + "learning_rate": 6.750982588685742e-05, + "loss": 1.7587, + "step": 13181 + }, + { + "epoch": 4.046040515653775, + "grad_norm": 0.2292155921459198, + "learning_rate": 6.750516999496994e-05, + "loss": 1.7429, + "step": 13182 + }, + { + "epoch": 
4.046347452424801, + "grad_norm": 0.2520677149295807, + "learning_rate": 6.750051393008612e-05, + "loss": 1.7842, + "step": 13183 + }, + { + "epoch": 4.046654389195826, + "grad_norm": 0.32546502351760864, + "learning_rate": 6.749585769225194e-05, + "loss": 1.8057, + "step": 13184 + }, + { + "epoch": 4.04696132596685, + "grad_norm": 0.27634644508361816, + "learning_rate": 6.749120128151346e-05, + "loss": 1.7708, + "step": 13185 + }, + { + "epoch": 4.047268262737876, + "grad_norm": 0.2546750009059906, + "learning_rate": 6.748654469791668e-05, + "loss": 1.8744, + "step": 13186 + }, + { + "epoch": 4.047575199508901, + "grad_norm": 0.43873605132102966, + "learning_rate": 6.748188794150761e-05, + "loss": 1.8573, + "step": 13187 + }, + { + "epoch": 4.047882136279926, + "grad_norm": 0.45526960492134094, + "learning_rate": 6.747723101233227e-05, + "loss": 1.7761, + "step": 13188 + }, + { + "epoch": 4.048189073050952, + "grad_norm": 0.24995557963848114, + "learning_rate": 6.74725739104367e-05, + "loss": 1.7679, + "step": 13189 + }, + { + "epoch": 4.048496009821977, + "grad_norm": 0.3203068971633911, + "learning_rate": 6.74679166358669e-05, + "loss": 1.7772, + "step": 13190 + }, + { + "epoch": 4.0488029465930016, + "grad_norm": 0.37020671367645264, + "learning_rate": 6.746325918866893e-05, + "loss": 1.8002, + "step": 13191 + }, + { + "epoch": 4.049109883364027, + "grad_norm": 0.2543959319591522, + "learning_rate": 6.745860156888878e-05, + "loss": 1.8057, + "step": 13192 + }, + { + "epoch": 4.049416820135052, + "grad_norm": 0.2566509246826172, + "learning_rate": 6.74539437765725e-05, + "loss": 1.7853, + "step": 13193 + }, + { + "epoch": 4.0497237569060776, + "grad_norm": 0.2545804977416992, + "learning_rate": 6.744928581176612e-05, + "loss": 1.8136, + "step": 13194 + }, + { + "epoch": 4.050030693677103, + "grad_norm": 0.24307197332382202, + "learning_rate": 6.744462767451568e-05, + "loss": 1.7919, + "step": 13195 + }, + { + "epoch": 4.050337630448127, + "grad_norm": 
0.24427616596221924, + "learning_rate": 6.743996936486719e-05, + "loss": 1.8037, + "step": 13196 + }, + { + "epoch": 4.050644567219153, + "grad_norm": 0.2154439389705658, + "learning_rate": 6.743531088286673e-05, + "loss": 1.7088, + "step": 13197 + }, + { + "epoch": 4.050951503990178, + "grad_norm": 0.22251558303833008, + "learning_rate": 6.743065222856027e-05, + "loss": 1.7512, + "step": 13198 + }, + { + "epoch": 4.051258440761203, + "grad_norm": 0.2373272329568863, + "learning_rate": 6.74259934019939e-05, + "loss": 1.8056, + "step": 13199 + }, + { + "epoch": 4.051565377532229, + "grad_norm": 0.23308727145195007, + "learning_rate": 6.742133440321366e-05, + "loss": 1.731, + "step": 13200 + }, + { + "epoch": 4.051872314303253, + "grad_norm": 0.2438805252313614, + "learning_rate": 6.741667523226557e-05, + "loss": 1.7938, + "step": 13201 + }, + { + "epoch": 4.0521792510742785, + "grad_norm": 0.22354702651500702, + "learning_rate": 6.741201588919569e-05, + "loss": 1.762, + "step": 13202 + }, + { + "epoch": 4.052486187845304, + "grad_norm": 0.2505488097667694, + "learning_rate": 6.740735637405006e-05, + "loss": 1.7627, + "step": 13203 + }, + { + "epoch": 4.052793124616329, + "grad_norm": 0.21378709375858307, + "learning_rate": 6.740269668687474e-05, + "loss": 1.7598, + "step": 13204 + }, + { + "epoch": 4.0531000613873545, + "grad_norm": 0.24863660335540771, + "learning_rate": 6.739803682771577e-05, + "loss": 1.7665, + "step": 13205 + }, + { + "epoch": 4.05340699815838, + "grad_norm": 0.3041808605194092, + "learning_rate": 6.739337679661921e-05, + "loss": 1.7909, + "step": 13206 + }, + { + "epoch": 4.053713934929404, + "grad_norm": 0.2745797634124756, + "learning_rate": 6.738871659363109e-05, + "loss": 1.7547, + "step": 13207 + }, + { + "epoch": 4.05402087170043, + "grad_norm": 0.2610073387622833, + "learning_rate": 6.738405621879748e-05, + "loss": 1.7723, + "step": 13208 + }, + { + "epoch": 4.054327808471455, + "grad_norm": 0.22728075087070465, + "learning_rate": 
6.737939567216446e-05, + "loss": 1.7865, + "step": 13209 + }, + { + "epoch": 4.05463474524248, + "grad_norm": 0.2877669930458069, + "learning_rate": 6.737473495377804e-05, + "loss": 1.8352, + "step": 13210 + }, + { + "epoch": 4.054941682013506, + "grad_norm": 0.35316282510757446, + "learning_rate": 6.737007406368432e-05, + "loss": 1.8202, + "step": 13211 + }, + { + "epoch": 4.05524861878453, + "grad_norm": 0.34625691175460815, + "learning_rate": 6.736541300192936e-05, + "loss": 1.8456, + "step": 13212 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.2432134598493576, + "learning_rate": 6.736075176855917e-05, + "loss": 1.8237, + "step": 13213 + }, + { + "epoch": 4.055862492326581, + "grad_norm": 0.27446529269218445, + "learning_rate": 6.735609036361989e-05, + "loss": 1.71, + "step": 13214 + }, + { + "epoch": 4.056169429097606, + "grad_norm": 0.2870408892631531, + "learning_rate": 6.735142878715754e-05, + "loss": 1.7473, + "step": 13215 + }, + { + "epoch": 4.056476365868631, + "grad_norm": 0.22249078750610352, + "learning_rate": 6.734676703921822e-05, + "loss": 1.7462, + "step": 13216 + }, + { + "epoch": 4.056783302639656, + "grad_norm": 0.25519105792045593, + "learning_rate": 6.734210511984796e-05, + "loss": 1.7022, + "step": 13217 + }, + { + "epoch": 4.057090239410681, + "grad_norm": 0.3366561830043793, + "learning_rate": 6.733744302909285e-05, + "loss": 1.787, + "step": 13218 + }, + { + "epoch": 4.0573971761817065, + "grad_norm": 0.2443208247423172, + "learning_rate": 6.733278076699897e-05, + "loss": 1.8048, + "step": 13219 + }, + { + "epoch": 4.057704112952732, + "grad_norm": 0.2893153131008148, + "learning_rate": 6.73281183336124e-05, + "loss": 1.7805, + "step": 13220 + }, + { + "epoch": 4.058011049723757, + "grad_norm": 0.3178043067455292, + "learning_rate": 6.73234557289792e-05, + "loss": 1.8264, + "step": 13221 + }, + { + "epoch": 4.0583179864947825, + "grad_norm": 0.27355703711509705, + "learning_rate": 6.731879295314546e-05, + "loss": 1.8427, + 
"step": 13222 + }, + { + "epoch": 4.058624923265807, + "grad_norm": 0.32180166244506836, + "learning_rate": 6.731413000615726e-05, + "loss": 1.7332, + "step": 13223 + }, + { + "epoch": 4.058931860036832, + "grad_norm": 0.3736574351787567, + "learning_rate": 6.730946688806067e-05, + "loss": 1.7447, + "step": 13224 + }, + { + "epoch": 4.059238796807858, + "grad_norm": 0.2526068687438965, + "learning_rate": 6.73048035989018e-05, + "loss": 1.8104, + "step": 13225 + }, + { + "epoch": 4.059545733578883, + "grad_norm": 0.29076167941093445, + "learning_rate": 6.73001401387267e-05, + "loss": 1.7977, + "step": 13226 + }, + { + "epoch": 4.059852670349908, + "grad_norm": 0.37963762879371643, + "learning_rate": 6.729547650758148e-05, + "loss": 1.8336, + "step": 13227 + }, + { + "epoch": 4.060159607120933, + "grad_norm": 0.31584078073501587, + "learning_rate": 6.729081270551222e-05, + "loss": 1.7843, + "step": 13228 + }, + { + "epoch": 4.060466543891958, + "grad_norm": 0.22793468832969666, + "learning_rate": 6.728614873256502e-05, + "loss": 1.7444, + "step": 13229 + }, + { + "epoch": 4.060773480662983, + "grad_norm": 0.3114435076713562, + "learning_rate": 6.728148458878596e-05, + "loss": 1.8012, + "step": 13230 + }, + { + "epoch": 4.061080417434009, + "grad_norm": 0.29843854904174805, + "learning_rate": 6.727682027422116e-05, + "loss": 1.8014, + "step": 13231 + }, + { + "epoch": 4.061387354205034, + "grad_norm": 0.22745616734027863, + "learning_rate": 6.727215578891668e-05, + "loss": 1.7303, + "step": 13232 + }, + { + "epoch": 4.0616942909760585, + "grad_norm": 0.2701241970062256, + "learning_rate": 6.726749113291864e-05, + "loss": 1.7665, + "step": 13233 + }, + { + "epoch": 4.062001227747084, + "grad_norm": 0.29304635524749756, + "learning_rate": 6.726282630627313e-05, + "loss": 1.875, + "step": 13234 + }, + { + "epoch": 4.062308164518109, + "grad_norm": 0.21467708051204681, + "learning_rate": 6.725816130902625e-05, + "loss": 1.7442, + "step": 13235 + }, + { + "epoch": 
4.0626151012891345, + "grad_norm": 0.23517470061779022, + "learning_rate": 6.72534961412241e-05, + "loss": 1.7154, + "step": 13236 + }, + { + "epoch": 4.06292203806016, + "grad_norm": 0.21483808755874634, + "learning_rate": 6.724883080291278e-05, + "loss": 1.7162, + "step": 13237 + }, + { + "epoch": 4.063228974831185, + "grad_norm": 0.2274744212627411, + "learning_rate": 6.724416529413843e-05, + "loss": 1.8066, + "step": 13238 + }, + { + "epoch": 4.06353591160221, + "grad_norm": 0.24682378768920898, + "learning_rate": 6.723949961494712e-05, + "loss": 1.7905, + "step": 13239 + }, + { + "epoch": 4.063842848373235, + "grad_norm": 0.2516227066516876, + "learning_rate": 6.723483376538498e-05, + "loss": 1.7693, + "step": 13240 + }, + { + "epoch": 4.06414978514426, + "grad_norm": 0.22076398134231567, + "learning_rate": 6.723016774549808e-05, + "loss": 1.7357, + "step": 13241 + }, + { + "epoch": 4.064456721915286, + "grad_norm": 0.20741026103496552, + "learning_rate": 6.722550155533258e-05, + "loss": 1.8082, + "step": 13242 + }, + { + "epoch": 4.064763658686311, + "grad_norm": 0.2074010819196701, + "learning_rate": 6.722083519493458e-05, + "loss": 1.71, + "step": 13243 + }, + { + "epoch": 4.065070595457335, + "grad_norm": 0.2661527991294861, + "learning_rate": 6.72161686643502e-05, + "loss": 1.7448, + "step": 13244 + }, + { + "epoch": 4.065377532228361, + "grad_norm": 0.2877216935157776, + "learning_rate": 6.721150196362555e-05, + "loss": 1.7574, + "step": 13245 + }, + { + "epoch": 4.065684468999386, + "grad_norm": 0.2520955801010132, + "learning_rate": 6.720683509280675e-05, + "loss": 1.7717, + "step": 13246 + }, + { + "epoch": 4.065991405770411, + "grad_norm": 0.2219560444355011, + "learning_rate": 6.72021680519399e-05, + "loss": 1.7355, + "step": 13247 + }, + { + "epoch": 4.066298342541437, + "grad_norm": 0.24671706557273865, + "learning_rate": 6.719750084107117e-05, + "loss": 1.8204, + "step": 13248 + }, + { + "epoch": 4.066605279312462, + "grad_norm": 
0.24512135982513428, + "learning_rate": 6.719283346024664e-05, + "loss": 1.826, + "step": 13249 + }, + { + "epoch": 4.0669122160834865, + "grad_norm": 0.24370841681957245, + "learning_rate": 6.718816590951247e-05, + "loss": 1.8322, + "step": 13250 + }, + { + "epoch": 4.067219152854512, + "grad_norm": 0.2312363088130951, + "learning_rate": 6.718349818891475e-05, + "loss": 1.7621, + "step": 13251 + }, + { + "epoch": 4.067526089625537, + "grad_norm": 0.2500494420528412, + "learning_rate": 6.717883029849965e-05, + "loss": 1.829, + "step": 13252 + }, + { + "epoch": 4.0678330263965625, + "grad_norm": 0.29882633686065674, + "learning_rate": 6.717416223831324e-05, + "loss": 1.799, + "step": 13253 + }, + { + "epoch": 4.068139963167588, + "grad_norm": 0.21962928771972656, + "learning_rate": 6.716949400840172e-05, + "loss": 1.7714, + "step": 13254 + }, + { + "epoch": 4.068446899938612, + "grad_norm": 0.25544899702072144, + "learning_rate": 6.716482560881121e-05, + "loss": 1.7911, + "step": 13255 + }, + { + "epoch": 4.068753836709638, + "grad_norm": 0.24865686893463135, + "learning_rate": 6.716015703958781e-05, + "loss": 1.7107, + "step": 13256 + }, + { + "epoch": 4.069060773480663, + "grad_norm": 0.22669239342212677, + "learning_rate": 6.715548830077769e-05, + "loss": 1.8503, + "step": 13257 + }, + { + "epoch": 4.069367710251688, + "grad_norm": 0.2973819077014923, + "learning_rate": 6.715081939242698e-05, + "loss": 1.7859, + "step": 13258 + }, + { + "epoch": 4.069674647022714, + "grad_norm": 0.3178746700286865, + "learning_rate": 6.714615031458181e-05, + "loss": 1.7705, + "step": 13259 + }, + { + "epoch": 4.069981583793738, + "grad_norm": 0.20452535152435303, + "learning_rate": 6.714148106728835e-05, + "loss": 1.7386, + "step": 13260 + }, + { + "epoch": 4.070288520564763, + "grad_norm": 0.30288320779800415, + "learning_rate": 6.713681165059271e-05, + "loss": 1.7823, + "step": 13261 + }, + { + "epoch": 4.070595457335789, + "grad_norm": 0.30014416575431824, + "learning_rate": 
6.713214206454107e-05, + "loss": 1.7626, + "step": 13262 + }, + { + "epoch": 4.070902394106814, + "grad_norm": 0.25144243240356445, + "learning_rate": 6.712747230917956e-05, + "loss": 1.8359, + "step": 13263 + }, + { + "epoch": 4.071209330877839, + "grad_norm": 0.308148592710495, + "learning_rate": 6.712280238455432e-05, + "loss": 1.7226, + "step": 13264 + }, + { + "epoch": 4.071516267648865, + "grad_norm": 0.2704198658466339, + "learning_rate": 6.711813229071151e-05, + "loss": 1.7982, + "step": 13265 + }, + { + "epoch": 4.071823204419889, + "grad_norm": 0.3928656280040741, + "learning_rate": 6.711346202769729e-05, + "loss": 1.7987, + "step": 13266 + }, + { + "epoch": 4.0721301411909145, + "grad_norm": 0.3603350520133972, + "learning_rate": 6.71087915955578e-05, + "loss": 1.7963, + "step": 13267 + }, + { + "epoch": 4.07243707796194, + "grad_norm": 0.2673214077949524, + "learning_rate": 6.710412099433921e-05, + "loss": 1.8011, + "step": 13268 + }, + { + "epoch": 4.072744014732965, + "grad_norm": 0.2523653209209442, + "learning_rate": 6.709945022408768e-05, + "loss": 1.755, + "step": 13269 + }, + { + "epoch": 4.0730509515039905, + "grad_norm": 0.3818903863430023, + "learning_rate": 6.709477928484934e-05, + "loss": 1.7968, + "step": 13270 + }, + { + "epoch": 4.073357888275015, + "grad_norm": 0.31509929895401, + "learning_rate": 6.709010817667039e-05, + "loss": 1.744, + "step": 13271 + }, + { + "epoch": 4.07366482504604, + "grad_norm": 0.21875518560409546, + "learning_rate": 6.708543689959697e-05, + "loss": 1.7511, + "step": 13272 + }, + { + "epoch": 4.073971761817066, + "grad_norm": 0.25381338596343994, + "learning_rate": 6.708076545367523e-05, + "loss": 1.7523, + "step": 13273 + }, + { + "epoch": 4.074278698588091, + "grad_norm": 0.24193842709064484, + "learning_rate": 6.707609383895137e-05, + "loss": 1.7713, + "step": 13274 + }, + { + "epoch": 4.074585635359116, + "grad_norm": 0.21972359716892242, + "learning_rate": 6.707142205547154e-05, + "loss": 1.7329, + "step": 
13275 + }, + { + "epoch": 4.074892572130141, + "grad_norm": 0.22188499569892883, + "learning_rate": 6.706675010328192e-05, + "loss": 1.7507, + "step": 13276 + }, + { + "epoch": 4.075199508901166, + "grad_norm": 0.23344436287879944, + "learning_rate": 6.706207798242865e-05, + "loss": 1.771, + "step": 13277 + }, + { + "epoch": 4.0755064456721914, + "grad_norm": 0.3008805513381958, + "learning_rate": 6.705740569295795e-05, + "loss": 1.775, + "step": 13278 + }, + { + "epoch": 4.075813382443217, + "grad_norm": 0.31407982110977173, + "learning_rate": 6.705273323491595e-05, + "loss": 1.7625, + "step": 13279 + }, + { + "epoch": 4.076120319214242, + "grad_norm": 0.2430381178855896, + "learning_rate": 6.704806060834886e-05, + "loss": 1.7706, + "step": 13280 + }, + { + "epoch": 4.0764272559852675, + "grad_norm": 0.23250171542167664, + "learning_rate": 6.704338781330284e-05, + "loss": 1.7977, + "step": 13281 + }, + { + "epoch": 4.076734192756292, + "grad_norm": 0.22073723375797272, + "learning_rate": 6.703871484982407e-05, + "loss": 1.7686, + "step": 13282 + }, + { + "epoch": 4.077041129527317, + "grad_norm": 0.24987035989761353, + "learning_rate": 6.703404171795874e-05, + "loss": 1.736, + "step": 13283 + }, + { + "epoch": 4.077348066298343, + "grad_norm": 0.2697623670101166, + "learning_rate": 6.702936841775301e-05, + "loss": 1.8367, + "step": 13284 + }, + { + "epoch": 4.077655003069368, + "grad_norm": 0.21592749655246735, + "learning_rate": 6.702469494925309e-05, + "loss": 1.7467, + "step": 13285 + }, + { + "epoch": 4.077961939840393, + "grad_norm": 0.2612052261829376, + "learning_rate": 6.702002131250515e-05, + "loss": 1.7689, + "step": 13286 + }, + { + "epoch": 4.078268876611418, + "grad_norm": 0.3004797697067261, + "learning_rate": 6.701534750755539e-05, + "loss": 1.7586, + "step": 13287 + }, + { + "epoch": 4.078575813382443, + "grad_norm": 0.24615366756916046, + "learning_rate": 6.701067353444998e-05, + "loss": 1.7636, + "step": 13288 + }, + { + "epoch": 
4.078882750153468, + "grad_norm": 0.23401159048080444, + "learning_rate": 6.700599939323515e-05, + "loss": 1.8015, + "step": 13289 + }, + { + "epoch": 4.079189686924494, + "grad_norm": 0.24546295404434204, + "learning_rate": 6.700132508395705e-05, + "loss": 1.7606, + "step": 13290 + }, + { + "epoch": 4.079496623695519, + "grad_norm": 0.24664412438869476, + "learning_rate": 6.69966506066619e-05, + "loss": 1.7994, + "step": 13291 + }, + { + "epoch": 4.0798035604665435, + "grad_norm": 0.2780163288116455, + "learning_rate": 6.699197596139587e-05, + "loss": 1.7972, + "step": 13292 + }, + { + "epoch": 4.080110497237569, + "grad_norm": 0.2554188668727875, + "learning_rate": 6.698730114820517e-05, + "loss": 1.7928, + "step": 13293 + }, + { + "epoch": 4.080417434008594, + "grad_norm": 0.2471141666173935, + "learning_rate": 6.698262616713602e-05, + "loss": 1.7948, + "step": 13294 + }, + { + "epoch": 4.0807243707796195, + "grad_norm": 0.2556581199169159, + "learning_rate": 6.697795101823461e-05, + "loss": 1.7942, + "step": 13295 + }, + { + "epoch": 4.081031307550645, + "grad_norm": 0.24462421238422394, + "learning_rate": 6.697327570154712e-05, + "loss": 1.7336, + "step": 13296 + }, + { + "epoch": 4.08133824432167, + "grad_norm": 0.22378689050674438, + "learning_rate": 6.696860021711978e-05, + "loss": 1.7703, + "step": 13297 + }, + { + "epoch": 4.081645181092695, + "grad_norm": 0.23949933052062988, + "learning_rate": 6.69639245649988e-05, + "loss": 1.7651, + "step": 13298 + }, + { + "epoch": 4.08195211786372, + "grad_norm": 0.27751216292381287, + "learning_rate": 6.695924874523035e-05, + "loss": 1.7866, + "step": 13299 + }, + { + "epoch": 4.082259054634745, + "grad_norm": 0.22700226306915283, + "learning_rate": 6.695457275786068e-05, + "loss": 1.79, + "step": 13300 + }, + { + "epoch": 4.082565991405771, + "grad_norm": 0.2138090431690216, + "learning_rate": 6.694989660293598e-05, + "loss": 1.7882, + "step": 13301 + }, + { + "epoch": 4.082872928176796, + "grad_norm": 
0.2963469326496124, + "learning_rate": 6.694522028050246e-05, + "loss": 1.8779, + "step": 13302 + }, + { + "epoch": 4.08317986494782, + "grad_norm": 0.31833669543266296, + "learning_rate": 6.694054379060634e-05, + "loss": 1.7923, + "step": 13303 + }, + { + "epoch": 4.083486801718846, + "grad_norm": 0.27751585841178894, + "learning_rate": 6.693586713329385e-05, + "loss": 1.7557, + "step": 13304 + }, + { + "epoch": 4.083793738489871, + "grad_norm": 0.23790816962718964, + "learning_rate": 6.69311903086112e-05, + "loss": 1.7587, + "step": 13305 + }, + { + "epoch": 4.084100675260896, + "grad_norm": 0.24153777956962585, + "learning_rate": 6.692651331660458e-05, + "loss": 1.7573, + "step": 13306 + }, + { + "epoch": 4.084407612031922, + "grad_norm": 0.26607179641723633, + "learning_rate": 6.692183615732025e-05, + "loss": 1.7823, + "step": 13307 + }, + { + "epoch": 4.084714548802946, + "grad_norm": 0.26670268177986145, + "learning_rate": 6.691715883080442e-05, + "loss": 1.784, + "step": 13308 + }, + { + "epoch": 4.0850214855739715, + "grad_norm": 0.25980666279792786, + "learning_rate": 6.69124813371033e-05, + "loss": 1.797, + "step": 13309 + }, + { + "epoch": 4.085328422344997, + "grad_norm": 0.2805597484111786, + "learning_rate": 6.690780367626314e-05, + "loss": 1.8298, + "step": 13310 + }, + { + "epoch": 4.085635359116022, + "grad_norm": 0.27198413014411926, + "learning_rate": 6.690312584833012e-05, + "loss": 1.8104, + "step": 13311 + }, + { + "epoch": 4.0859422958870475, + "grad_norm": 0.2619116008281708, + "learning_rate": 6.689844785335054e-05, + "loss": 1.771, + "step": 13312 + }, + { + "epoch": 4.086249232658073, + "grad_norm": 0.22647863626480103, + "learning_rate": 6.689376969137057e-05, + "loss": 1.8114, + "step": 13313 + }, + { + "epoch": 4.086556169429097, + "grad_norm": 1.469475507736206, + "learning_rate": 6.68890913624365e-05, + "loss": 1.8796, + "step": 13314 + }, + { + "epoch": 4.086863106200123, + "grad_norm": 0.4577515423297882, + "learning_rate": 
6.68844128665945e-05, + "loss": 1.716, + "step": 13315 + }, + { + "epoch": 4.087170042971148, + "grad_norm": 0.5830543637275696, + "learning_rate": 6.687973420389085e-05, + "loss": 1.7692, + "step": 13316 + }, + { + "epoch": 4.087476979742173, + "grad_norm": 0.4404197037220001, + "learning_rate": 6.687505537437178e-05, + "loss": 1.7909, + "step": 13317 + }, + { + "epoch": 4.087783916513199, + "grad_norm": 0.31379908323287964, + "learning_rate": 6.68703763780835e-05, + "loss": 1.7957, + "step": 13318 + }, + { + "epoch": 4.088090853284223, + "grad_norm": 0.49588730931282043, + "learning_rate": 6.686569721507229e-05, + "loss": 1.7126, + "step": 13319 + }, + { + "epoch": 4.088397790055248, + "grad_norm": 0.3690234124660492, + "learning_rate": 6.686101788538437e-05, + "loss": 1.8233, + "step": 13320 + }, + { + "epoch": 4.088704726826274, + "grad_norm": 0.337310254573822, + "learning_rate": 6.685633838906598e-05, + "loss": 1.6886, + "step": 13321 + }, + { + "epoch": 4.089011663597299, + "grad_norm": 0.5164821147918701, + "learning_rate": 6.685165872616337e-05, + "loss": 1.7967, + "step": 13322 + }, + { + "epoch": 4.089318600368324, + "grad_norm": 0.36501309275627136, + "learning_rate": 6.68469788967228e-05, + "loss": 1.755, + "step": 13323 + }, + { + "epoch": 4.08962553713935, + "grad_norm": 0.35017216205596924, + "learning_rate": 6.684229890079052e-05, + "loss": 1.7595, + "step": 13324 + }, + { + "epoch": 4.089932473910374, + "grad_norm": 0.5622650980949402, + "learning_rate": 6.683761873841277e-05, + "loss": 1.7841, + "step": 13325 + }, + { + "epoch": 4.0902394106813995, + "grad_norm": 0.47010260820388794, + "learning_rate": 6.683293840963578e-05, + "loss": 1.7537, + "step": 13326 + }, + { + "epoch": 4.090546347452425, + "grad_norm": 0.25515374541282654, + "learning_rate": 6.682825791450584e-05, + "loss": 1.7692, + "step": 13327 + }, + { + "epoch": 4.09085328422345, + "grad_norm": 0.5063003897666931, + "learning_rate": 6.682357725306919e-05, + "loss": 1.7454, + "step": 
13328 + }, + { + "epoch": 4.0911602209944755, + "grad_norm": 0.4197622835636139, + "learning_rate": 6.681889642537209e-05, + "loss": 1.7792, + "step": 13329 + }, + { + "epoch": 4.0914671577655, + "grad_norm": 0.24038295447826385, + "learning_rate": 6.68142154314608e-05, + "loss": 1.7631, + "step": 13330 + }, + { + "epoch": 4.091774094536525, + "grad_norm": 0.42108532786369324, + "learning_rate": 6.680953427138159e-05, + "loss": 1.7784, + "step": 13331 + }, + { + "epoch": 4.092081031307551, + "grad_norm": 0.33729633688926697, + "learning_rate": 6.68048529451807e-05, + "loss": 1.8057, + "step": 13332 + }, + { + "epoch": 4.092387968078576, + "grad_norm": 0.31847241520881653, + "learning_rate": 6.68001714529044e-05, + "loss": 1.7375, + "step": 13333 + }, + { + "epoch": 4.092694904849601, + "grad_norm": 0.45276644825935364, + "learning_rate": 6.679548979459896e-05, + "loss": 1.7507, + "step": 13334 + }, + { + "epoch": 4.093001841620626, + "grad_norm": 0.3781665861606598, + "learning_rate": 6.679080797031065e-05, + "loss": 1.7718, + "step": 13335 + }, + { + "epoch": 4.093308778391651, + "grad_norm": 0.25868359208106995, + "learning_rate": 6.678612598008573e-05, + "loss": 1.8105, + "step": 13336 + }, + { + "epoch": 4.093615715162676, + "grad_norm": 0.32834702730178833, + "learning_rate": 6.678144382397048e-05, + "loss": 1.7883, + "step": 13337 + }, + { + "epoch": 4.093922651933702, + "grad_norm": 0.2830568253993988, + "learning_rate": 6.677676150201116e-05, + "loss": 1.7994, + "step": 13338 + }, + { + "epoch": 4.094229588704727, + "grad_norm": 0.219541534781456, + "learning_rate": 6.677207901425405e-05, + "loss": 1.7344, + "step": 13339 + }, + { + "epoch": 4.094536525475752, + "grad_norm": 0.2557326555252075, + "learning_rate": 6.676739636074542e-05, + "loss": 1.7734, + "step": 13340 + }, + { + "epoch": 4.094843462246777, + "grad_norm": 0.2741365432739258, + "learning_rate": 6.676271354153156e-05, + "loss": 1.7912, + "step": 13341 + }, + { + "epoch": 4.095150399017802, + 
"grad_norm": 0.31258970499038696, + "learning_rate": 6.675803055665874e-05, + "loss": 1.7798, + "step": 13342 + }, + { + "epoch": 4.0954573357888275, + "grad_norm": 0.30181947350502014, + "learning_rate": 6.675334740617322e-05, + "loss": 1.7746, + "step": 13343 + }, + { + "epoch": 4.095764272559853, + "grad_norm": 0.3000102937221527, + "learning_rate": 6.674866409012133e-05, + "loss": 1.7842, + "step": 13344 + }, + { + "epoch": 4.096071209330878, + "grad_norm": 0.22871005535125732, + "learning_rate": 6.674398060854931e-05, + "loss": 1.7473, + "step": 13345 + }, + { + "epoch": 4.096378146101903, + "grad_norm": 0.2700810432434082, + "learning_rate": 6.673929696150346e-05, + "loss": 1.7862, + "step": 13346 + }, + { + "epoch": 4.096685082872928, + "grad_norm": 0.27537551522254944, + "learning_rate": 6.673461314903007e-05, + "loss": 1.7843, + "step": 13347 + }, + { + "epoch": 4.096992019643953, + "grad_norm": 0.23700574040412903, + "learning_rate": 6.672992917117542e-05, + "loss": 1.765, + "step": 13348 + }, + { + "epoch": 4.097298956414979, + "grad_norm": 0.23331589996814728, + "learning_rate": 6.672524502798583e-05, + "loss": 1.7894, + "step": 13349 + }, + { + "epoch": 4.097605893186004, + "grad_norm": 0.28591978549957275, + "learning_rate": 6.672056071950753e-05, + "loss": 1.7736, + "step": 13350 + }, + { + "epoch": 4.097912829957028, + "grad_norm": 0.3000452518463135, + "learning_rate": 6.671587624578685e-05, + "loss": 1.7635, + "step": 13351 + }, + { + "epoch": 4.098219766728054, + "grad_norm": 0.21877998113632202, + "learning_rate": 6.67111916068701e-05, + "loss": 1.7225, + "step": 13352 + }, + { + "epoch": 4.098526703499079, + "grad_norm": 0.2598817050457001, + "learning_rate": 6.670650680280358e-05, + "loss": 1.6874, + "step": 13353 + }, + { + "epoch": 4.098833640270104, + "grad_norm": 0.3063203692436218, + "learning_rate": 6.670182183363353e-05, + "loss": 1.7821, + "step": 13354 + }, + { + "epoch": 4.09914057704113, + "grad_norm": 0.2328508347272873, + 
"learning_rate": 6.66971366994063e-05, + "loss": 1.788, + "step": 13355 + }, + { + "epoch": 4.099447513812155, + "grad_norm": 0.33936765789985657, + "learning_rate": 6.669245140016817e-05, + "loss": 1.8159, + "step": 13356 + }, + { + "epoch": 4.0997544505831796, + "grad_norm": 0.27464553713798523, + "learning_rate": 6.668776593596546e-05, + "loss": 1.7371, + "step": 13357 + }, + { + "epoch": 4.100061387354205, + "grad_norm": 0.24255812168121338, + "learning_rate": 6.668308030684447e-05, + "loss": 1.7993, + "step": 13358 + }, + { + "epoch": 4.10036832412523, + "grad_norm": 0.27203628420829773, + "learning_rate": 6.667839451285149e-05, + "loss": 1.8253, + "step": 13359 + }, + { + "epoch": 4.100675260896256, + "grad_norm": 0.2503862679004669, + "learning_rate": 6.667370855403286e-05, + "loss": 1.7927, + "step": 13360 + }, + { + "epoch": 4.100982197667281, + "grad_norm": 0.2616904377937317, + "learning_rate": 6.666902243043486e-05, + "loss": 1.8226, + "step": 13361 + }, + { + "epoch": 4.101289134438305, + "grad_norm": 0.26707521080970764, + "learning_rate": 6.666433614210379e-05, + "loss": 1.8485, + "step": 13362 + }, + { + "epoch": 4.101596071209331, + "grad_norm": 0.2427528202533722, + "learning_rate": 6.6659649689086e-05, + "loss": 1.7387, + "step": 13363 + }, + { + "epoch": 4.101903007980356, + "grad_norm": 0.2319549173116684, + "learning_rate": 6.66549630714278e-05, + "loss": 1.7396, + "step": 13364 + }, + { + "epoch": 4.102209944751381, + "grad_norm": 0.2248002141714096, + "learning_rate": 6.665027628917548e-05, + "loss": 1.7817, + "step": 13365 + }, + { + "epoch": 4.102516881522407, + "grad_norm": 0.21929535269737244, + "learning_rate": 6.664558934237538e-05, + "loss": 1.7478, + "step": 13366 + }, + { + "epoch": 4.102823818293431, + "grad_norm": 0.21144583821296692, + "learning_rate": 6.66409022310738e-05, + "loss": 1.7602, + "step": 13367 + }, + { + "epoch": 4.1031307550644565, + "grad_norm": 0.21984660625457764, + "learning_rate": 6.663621495531707e-05, + 
"loss": 1.7541, + "step": 13368 + }, + { + "epoch": 4.103437691835482, + "grad_norm": 0.2075357735157013, + "learning_rate": 6.663152751515152e-05, + "loss": 1.7362, + "step": 13369 + }, + { + "epoch": 4.103744628606507, + "grad_norm": 0.23316961526870728, + "learning_rate": 6.662683991062347e-05, + "loss": 1.8273, + "step": 13370 + }, + { + "epoch": 4.1040515653775325, + "grad_norm": 0.23142337799072266, + "learning_rate": 6.662215214177922e-05, + "loss": 1.7543, + "step": 13371 + }, + { + "epoch": 4.104358502148558, + "grad_norm": 0.24335260689258575, + "learning_rate": 6.661746420866515e-05, + "loss": 1.8328, + "step": 13372 + }, + { + "epoch": 4.104665438919582, + "grad_norm": 0.2440192997455597, + "learning_rate": 6.661277611132753e-05, + "loss": 1.8114, + "step": 13373 + }, + { + "epoch": 4.104972375690608, + "grad_norm": 0.252808541059494, + "learning_rate": 6.660808784981273e-05, + "loss": 1.8556, + "step": 13374 + }, + { + "epoch": 4.105279312461633, + "grad_norm": 0.24564477801322937, + "learning_rate": 6.660339942416708e-05, + "loss": 1.8231, + "step": 13375 + }, + { + "epoch": 4.105586249232658, + "grad_norm": 0.2371874898672104, + "learning_rate": 6.65987108344369e-05, + "loss": 1.7763, + "step": 13376 + }, + { + "epoch": 4.105893186003684, + "grad_norm": 0.22882802784442902, + "learning_rate": 6.659402208066854e-05, + "loss": 1.7388, + "step": 13377 + }, + { + "epoch": 4.106200122774708, + "grad_norm": 0.24857540428638458, + "learning_rate": 6.658933316290832e-05, + "loss": 1.7735, + "step": 13378 + }, + { + "epoch": 4.106507059545733, + "grad_norm": 0.22574029862880707, + "learning_rate": 6.658464408120257e-05, + "loss": 1.7403, + "step": 13379 + }, + { + "epoch": 4.106813996316759, + "grad_norm": 0.24944272637367249, + "learning_rate": 6.657995483559767e-05, + "loss": 1.7827, + "step": 13380 + }, + { + "epoch": 4.107120933087784, + "grad_norm": 0.27386224269866943, + "learning_rate": 6.657526542613992e-05, + "loss": 1.7673, + "step": 13381 + }, + { 
+ "epoch": 4.107427869858809, + "grad_norm": 0.29222097992897034, + "learning_rate": 6.65705758528757e-05, + "loss": 1.7958, + "step": 13382 + }, + { + "epoch": 4.107734806629834, + "grad_norm": 0.2471150904893875, + "learning_rate": 6.656588611585133e-05, + "loss": 1.7706, + "step": 13383 + }, + { + "epoch": 4.108041743400859, + "grad_norm": 0.289316862821579, + "learning_rate": 6.656119621511317e-05, + "loss": 1.7828, + "step": 13384 + }, + { + "epoch": 4.1083486801718845, + "grad_norm": 0.36710497736930847, + "learning_rate": 6.655650615070756e-05, + "loss": 1.712, + "step": 13385 + }, + { + "epoch": 4.10865561694291, + "grad_norm": 0.2999880611896515, + "learning_rate": 6.655181592268084e-05, + "loss": 1.7711, + "step": 13386 + }, + { + "epoch": 4.108962553713935, + "grad_norm": 0.332011342048645, + "learning_rate": 6.654712553107939e-05, + "loss": 1.907, + "step": 13387 + }, + { + "epoch": 4.1092694904849605, + "grad_norm": 0.43125995993614197, + "learning_rate": 6.654243497594953e-05, + "loss": 1.7819, + "step": 13388 + }, + { + "epoch": 4.109576427255985, + "grad_norm": 0.33719149231910706, + "learning_rate": 6.653774425733765e-05, + "loss": 1.797, + "step": 13389 + }, + { + "epoch": 4.10988336402701, + "grad_norm": 0.23091599345207214, + "learning_rate": 6.653305337529006e-05, + "loss": 1.7384, + "step": 13390 + }, + { + "epoch": 4.110190300798036, + "grad_norm": 0.4283982515335083, + "learning_rate": 6.652836232985317e-05, + "loss": 1.8284, + "step": 13391 + }, + { + "epoch": 4.110497237569061, + "grad_norm": 0.43575870990753174, + "learning_rate": 6.652367112107332e-05, + "loss": 1.7235, + "step": 13392 + }, + { + "epoch": 4.110804174340086, + "grad_norm": 0.246877059340477, + "learning_rate": 6.651897974899685e-05, + "loss": 1.7174, + "step": 13393 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.36063629388809204, + "learning_rate": 6.651428821367015e-05, + "loss": 1.8064, + "step": 13394 + }, + { + "epoch": 4.111418047882136, + "grad_norm": 
0.4454420804977417, + "learning_rate": 6.650959651513957e-05, + "loss": 1.7575, + "step": 13395 + }, + { + "epoch": 4.111724984653161, + "grad_norm": 0.2788856327533722, + "learning_rate": 6.650490465345149e-05, + "loss": 1.7696, + "step": 13396 + }, + { + "epoch": 4.112031921424187, + "grad_norm": 0.40281879901885986, + "learning_rate": 6.650021262865225e-05, + "loss": 1.8368, + "step": 13397 + }, + { + "epoch": 4.112338858195212, + "grad_norm": 0.5151103138923645, + "learning_rate": 6.649552044078825e-05, + "loss": 1.8224, + "step": 13398 + }, + { + "epoch": 4.112645794966237, + "grad_norm": 0.29390639066696167, + "learning_rate": 6.649082808990586e-05, + "loss": 1.7846, + "step": 13399 + }, + { + "epoch": 4.112952731737262, + "grad_norm": 0.3061942458152771, + "learning_rate": 6.648613557605142e-05, + "loss": 1.7954, + "step": 13400 + }, + { + "epoch": 4.113259668508287, + "grad_norm": 0.47628748416900635, + "learning_rate": 6.648144289927132e-05, + "loss": 1.7782, + "step": 13401 + }, + { + "epoch": 4.1135666052793125, + "grad_norm": 0.4299588203430176, + "learning_rate": 6.647675005961197e-05, + "loss": 1.7459, + "step": 13402 + }, + { + "epoch": 4.113873542050338, + "grad_norm": 0.24556589126586914, + "learning_rate": 6.64720570571197e-05, + "loss": 1.753, + "step": 13403 + }, + { + "epoch": 4.114180478821363, + "grad_norm": 0.29620522260665894, + "learning_rate": 6.646736389184092e-05, + "loss": 1.773, + "step": 13404 + }, + { + "epoch": 4.114487415592388, + "grad_norm": 0.37710070610046387, + "learning_rate": 6.646267056382199e-05, + "loss": 1.8389, + "step": 13405 + }, + { + "epoch": 4.114794352363413, + "grad_norm": 0.2562984824180603, + "learning_rate": 6.64579770731093e-05, + "loss": 1.7905, + "step": 13406 + }, + { + "epoch": 4.115101289134438, + "grad_norm": 0.3999946713447571, + "learning_rate": 6.645328341974924e-05, + "loss": 1.7734, + "step": 13407 + }, + { + "epoch": 4.115408225905464, + "grad_norm": 0.36087217926979065, + "learning_rate": 
6.644858960378817e-05, + "loss": 1.801, + "step": 13408 + }, + { + "epoch": 4.115715162676489, + "grad_norm": 0.2520254850387573, + "learning_rate": 6.644389562527251e-05, + "loss": 1.7394, + "step": 13409 + }, + { + "epoch": 4.116022099447513, + "grad_norm": 0.4321835935115814, + "learning_rate": 6.643920148424864e-05, + "loss": 1.8091, + "step": 13410 + }, + { + "epoch": 4.116329036218539, + "grad_norm": 0.40900173783302307, + "learning_rate": 6.643450718076294e-05, + "loss": 1.8198, + "step": 13411 + }, + { + "epoch": 4.116635972989564, + "grad_norm": 0.23693956434726715, + "learning_rate": 6.642981271486182e-05, + "loss": 1.6807, + "step": 13412 + }, + { + "epoch": 4.116942909760589, + "grad_norm": 0.33526891469955444, + "learning_rate": 6.642511808659164e-05, + "loss": 1.8673, + "step": 13413 + }, + { + "epoch": 4.117249846531615, + "grad_norm": 0.4037325382232666, + "learning_rate": 6.642042329599883e-05, + "loss": 1.743, + "step": 13414 + }, + { + "epoch": 4.11755678330264, + "grad_norm": 0.25629740953445435, + "learning_rate": 6.641572834312975e-05, + "loss": 1.6904, + "step": 13415 + }, + { + "epoch": 4.1178637200736645, + "grad_norm": 0.29203253984451294, + "learning_rate": 6.641103322803087e-05, + "loss": 1.7811, + "step": 13416 + }, + { + "epoch": 4.11817065684469, + "grad_norm": 0.423926442861557, + "learning_rate": 6.64063379507485e-05, + "loss": 1.7341, + "step": 13417 + }, + { + "epoch": 4.118477593615715, + "grad_norm": 0.29561251401901245, + "learning_rate": 6.64016425113291e-05, + "loss": 1.7915, + "step": 13418 + }, + { + "epoch": 4.1187845303867405, + "grad_norm": 0.2536832094192505, + "learning_rate": 6.639694690981903e-05, + "loss": 1.7628, + "step": 13419 + }, + { + "epoch": 4.119091467157766, + "grad_norm": 0.2931392192840576, + "learning_rate": 6.639225114626475e-05, + "loss": 1.7877, + "step": 13420 + }, + { + "epoch": 4.11939840392879, + "grad_norm": 0.2219499796628952, + "learning_rate": 6.638755522071263e-05, + "loss": 1.7183, + 
"step": 13421 + }, + { + "epoch": 4.119705340699816, + "grad_norm": 0.2951931953430176, + "learning_rate": 6.638285913320908e-05, + "loss": 1.7983, + "step": 13422 + }, + { + "epoch": 4.120012277470841, + "grad_norm": 0.3495960533618927, + "learning_rate": 6.63781628838005e-05, + "loss": 1.7531, + "step": 13423 + }, + { + "epoch": 4.120319214241866, + "grad_norm": 0.2389262616634369, + "learning_rate": 6.637346647253333e-05, + "loss": 1.7454, + "step": 13424 + }, + { + "epoch": 4.120626151012892, + "grad_norm": 0.28729167580604553, + "learning_rate": 6.636876989945395e-05, + "loss": 1.8105, + "step": 13425 + }, + { + "epoch": 4.120933087783916, + "grad_norm": 0.2620082199573517, + "learning_rate": 6.636407316460882e-05, + "loss": 1.7948, + "step": 13426 + }, + { + "epoch": 4.121240024554941, + "grad_norm": 0.2694189250469208, + "learning_rate": 6.635937626804432e-05, + "loss": 1.809, + "step": 13427 + }, + { + "epoch": 4.121546961325967, + "grad_norm": 0.2660866379737854, + "learning_rate": 6.635467920980687e-05, + "loss": 1.7431, + "step": 13428 + }, + { + "epoch": 4.121853898096992, + "grad_norm": 0.2579907774925232, + "learning_rate": 6.634998198994289e-05, + "loss": 1.7941, + "step": 13429 + }, + { + "epoch": 4.122160834868017, + "grad_norm": 0.28349989652633667, + "learning_rate": 6.634528460849881e-05, + "loss": 1.8142, + "step": 13430 + }, + { + "epoch": 4.122467771639043, + "grad_norm": 0.28716522455215454, + "learning_rate": 6.634058706552104e-05, + "loss": 1.7496, + "step": 13431 + }, + { + "epoch": 4.122774708410067, + "grad_norm": 0.23228077590465546, + "learning_rate": 6.633588936105601e-05, + "loss": 1.7399, + "step": 13432 + }, + { + "epoch": 4.1230816451810925, + "grad_norm": 0.3649841248989105, + "learning_rate": 6.633119149515017e-05, + "loss": 1.7696, + "step": 13433 + }, + { + "epoch": 4.123388581952118, + "grad_norm": 0.2757830321788788, + "learning_rate": 6.632649346784992e-05, + "loss": 1.8329, + "step": 13434 + }, + { + "epoch": 
4.123695518723143, + "grad_norm": 0.28163692355155945, + "learning_rate": 6.632179527920167e-05, + "loss": 1.7761, + "step": 13435 + }, + { + "epoch": 4.1240024554941686, + "grad_norm": 0.3453187048435211, + "learning_rate": 6.631709692925188e-05, + "loss": 1.7843, + "step": 13436 + }, + { + "epoch": 4.124309392265193, + "grad_norm": 0.2792697250843048, + "learning_rate": 6.631239841804698e-05, + "loss": 1.7889, + "step": 13437 + }, + { + "epoch": 4.124616329036218, + "grad_norm": 0.21881693601608276, + "learning_rate": 6.630769974563339e-05, + "loss": 1.8015, + "step": 13438 + }, + { + "epoch": 4.124923265807244, + "grad_norm": 0.4464910328388214, + "learning_rate": 6.630300091205756e-05, + "loss": 1.7851, + "step": 13439 + }, + { + "epoch": 4.125230202578269, + "grad_norm": 0.40191107988357544, + "learning_rate": 6.629830191736591e-05, + "loss": 1.8608, + "step": 13440 + }, + { + "epoch": 4.125537139349294, + "grad_norm": 0.2809060513973236, + "learning_rate": 6.62936027616049e-05, + "loss": 1.7374, + "step": 13441 + }, + { + "epoch": 4.12584407612032, + "grad_norm": 0.24980643391609192, + "learning_rate": 6.628890344482095e-05, + "loss": 1.8152, + "step": 13442 + }, + { + "epoch": 4.126151012891344, + "grad_norm": 0.24538342654705048, + "learning_rate": 6.62842039670605e-05, + "loss": 1.7687, + "step": 13443 + }, + { + "epoch": 4.1264579496623695, + "grad_norm": 0.24684634804725647, + "learning_rate": 6.627950432837002e-05, + "loss": 1.787, + "step": 13444 + }, + { + "epoch": 4.126764886433395, + "grad_norm": 0.22724607586860657, + "learning_rate": 6.627480452879593e-05, + "loss": 1.7871, + "step": 13445 + }, + { + "epoch": 4.12707182320442, + "grad_norm": 0.24724406003952026, + "learning_rate": 6.627010456838469e-05, + "loss": 1.7524, + "step": 13446 + }, + { + "epoch": 4.1273787599754455, + "grad_norm": 0.24219536781311035, + "learning_rate": 6.626540444718274e-05, + "loss": 1.7754, + "step": 13447 + }, + { + "epoch": 4.12768569674647, + "grad_norm": 
0.24857915937900543, + "learning_rate": 6.626070416523652e-05, + "loss": 1.7839, + "step": 13448 + }, + { + "epoch": 4.127992633517495, + "grad_norm": 0.2639105021953583, + "learning_rate": 6.625600372259248e-05, + "loss": 1.7546, + "step": 13449 + }, + { + "epoch": 4.128299570288521, + "grad_norm": 0.23598137497901917, + "learning_rate": 6.62513031192971e-05, + "loss": 1.7957, + "step": 13450 + }, + { + "epoch": 4.128606507059546, + "grad_norm": 0.3038909137248993, + "learning_rate": 6.624660235539682e-05, + "loss": 1.8117, + "step": 13451 + }, + { + "epoch": 4.128913443830571, + "grad_norm": 0.27671241760253906, + "learning_rate": 6.624190143093809e-05, + "loss": 1.729, + "step": 13452 + }, + { + "epoch": 4.129220380601596, + "grad_norm": 0.24638360738754272, + "learning_rate": 6.623720034596735e-05, + "loss": 1.7414, + "step": 13453 + }, + { + "epoch": 4.129527317372621, + "grad_norm": 0.24073924124240875, + "learning_rate": 6.623249910053111e-05, + "loss": 1.8046, + "step": 13454 + }, + { + "epoch": 4.129834254143646, + "grad_norm": 0.29734376072883606, + "learning_rate": 6.622779769467578e-05, + "loss": 1.8336, + "step": 13455 + }, + { + "epoch": 4.130141190914672, + "grad_norm": 0.23182810842990875, + "learning_rate": 6.622309612844785e-05, + "loss": 1.7742, + "step": 13456 + }, + { + "epoch": 4.130448127685697, + "grad_norm": 0.2179390788078308, + "learning_rate": 6.621839440189378e-05, + "loss": 1.7656, + "step": 13457 + }, + { + "epoch": 4.1307550644567215, + "grad_norm": 0.21389013528823853, + "learning_rate": 6.621369251506002e-05, + "loss": 1.7504, + "step": 13458 + }, + { + "epoch": 4.131062001227747, + "grad_norm": 0.22306203842163086, + "learning_rate": 6.620899046799305e-05, + "loss": 1.7573, + "step": 13459 + }, + { + "epoch": 4.131368937998772, + "grad_norm": 0.2699708938598633, + "learning_rate": 6.620428826073934e-05, + "loss": 1.7419, + "step": 13460 + }, + { + "epoch": 4.1316758747697975, + "grad_norm": 0.34087565541267395, + "learning_rate": 
6.619958589334534e-05, + "loss": 1.7545, + "step": 13461 + }, + { + "epoch": 4.131982811540823, + "grad_norm": 0.2934977412223816, + "learning_rate": 6.619488336585755e-05, + "loss": 1.7611, + "step": 13462 + }, + { + "epoch": 4.132289748311848, + "grad_norm": 0.22545567154884338, + "learning_rate": 6.619018067832243e-05, + "loss": 1.7562, + "step": 13463 + }, + { + "epoch": 4.132596685082873, + "grad_norm": 0.23334743082523346, + "learning_rate": 6.618547783078647e-05, + "loss": 1.7784, + "step": 13464 + }, + { + "epoch": 4.132903621853898, + "grad_norm": 0.22466403245925903, + "learning_rate": 6.618077482329612e-05, + "loss": 1.7277, + "step": 13465 + }, + { + "epoch": 4.133210558624923, + "grad_norm": 0.23504197597503662, + "learning_rate": 6.617607165589785e-05, + "loss": 1.7983, + "step": 13466 + }, + { + "epoch": 4.133517495395949, + "grad_norm": 0.2500833570957184, + "learning_rate": 6.617136832863819e-05, + "loss": 1.7826, + "step": 13467 + }, + { + "epoch": 4.133824432166974, + "grad_norm": 0.22398658096790314, + "learning_rate": 6.616666484156357e-05, + "loss": 1.7281, + "step": 13468 + }, + { + "epoch": 4.134131368937998, + "grad_norm": 0.2537873089313507, + "learning_rate": 6.616196119472052e-05, + "loss": 1.7598, + "step": 13469 + }, + { + "epoch": 4.134438305709024, + "grad_norm": 0.26881173253059387, + "learning_rate": 6.615725738815546e-05, + "loss": 1.8161, + "step": 13470 + }, + { + "epoch": 4.134745242480049, + "grad_norm": 0.3311346471309662, + "learning_rate": 6.615255342191492e-05, + "loss": 1.7954, + "step": 13471 + }, + { + "epoch": 4.135052179251074, + "grad_norm": 0.2562953233718872, + "learning_rate": 6.614784929604539e-05, + "loss": 1.7284, + "step": 13472 + }, + { + "epoch": 4.1353591160221, + "grad_norm": 0.2563154101371765, + "learning_rate": 6.614314501059334e-05, + "loss": 1.7995, + "step": 13473 + }, + { + "epoch": 4.135666052793125, + "grad_norm": 0.24861161410808563, + "learning_rate": 6.613844056560527e-05, + "loss": 1.7589, + 
"step": 13474 + }, + { + "epoch": 4.1359729895641495, + "grad_norm": 0.23815487325191498, + "learning_rate": 6.613373596112769e-05, + "loss": 1.6906, + "step": 13475 + }, + { + "epoch": 4.136279926335175, + "grad_norm": 0.25394049286842346, + "learning_rate": 6.612903119720705e-05, + "loss": 1.781, + "step": 13476 + }, + { + "epoch": 4.1365868631062, + "grad_norm": 0.24501466751098633, + "learning_rate": 6.612432627388988e-05, + "loss": 1.797, + "step": 13477 + }, + { + "epoch": 4.1368937998772255, + "grad_norm": 0.24909707903862, + "learning_rate": 6.611962119122267e-05, + "loss": 1.7643, + "step": 13478 + }, + { + "epoch": 4.137200736648251, + "grad_norm": 0.24954476952552795, + "learning_rate": 6.611491594925192e-05, + "loss": 1.8219, + "step": 13479 + }, + { + "epoch": 4.137507673419275, + "grad_norm": 0.30572372674942017, + "learning_rate": 6.611021054802411e-05, + "loss": 1.8039, + "step": 13480 + }, + { + "epoch": 4.137814610190301, + "grad_norm": 0.27466365694999695, + "learning_rate": 6.610550498758577e-05, + "loss": 1.6945, + "step": 13481 + }, + { + "epoch": 4.138121546961326, + "grad_norm": 0.2614271640777588, + "learning_rate": 6.610079926798339e-05, + "loss": 1.8648, + "step": 13482 + }, + { + "epoch": 4.138428483732351, + "grad_norm": 0.23645827174186707, + "learning_rate": 6.609609338926346e-05, + "loss": 1.7424, + "step": 13483 + }, + { + "epoch": 4.138735420503377, + "grad_norm": 0.24473626911640167, + "learning_rate": 6.609138735147253e-05, + "loss": 1.8036, + "step": 13484 + }, + { + "epoch": 4.139042357274401, + "grad_norm": 0.2472417950630188, + "learning_rate": 6.608668115465706e-05, + "loss": 1.794, + "step": 13485 + }, + { + "epoch": 4.139349294045426, + "grad_norm": 0.25330284237861633, + "learning_rate": 6.608197479886358e-05, + "loss": 1.8052, + "step": 13486 + }, + { + "epoch": 4.139656230816452, + "grad_norm": 0.24279309809207916, + "learning_rate": 6.60772682841386e-05, + "loss": 1.7375, + "step": 13487 + }, + { + "epoch": 
4.139963167587477, + "grad_norm": 0.22319461405277252, + "learning_rate": 6.607256161052862e-05, + "loss": 1.7696, + "step": 13488 + }, + { + "epoch": 4.140270104358502, + "grad_norm": 0.25261563062667847, + "learning_rate": 6.606785477808017e-05, + "loss": 1.7646, + "step": 13489 + }, + { + "epoch": 4.140577041129528, + "grad_norm": 0.3127744793891907, + "learning_rate": 6.606314778683977e-05, + "loss": 1.7899, + "step": 13490 + }, + { + "epoch": 4.140883977900552, + "grad_norm": 0.3550816774368286, + "learning_rate": 6.605844063685392e-05, + "loss": 1.7971, + "step": 13491 + }, + { + "epoch": 4.1411909146715775, + "grad_norm": 0.20977813005447388, + "learning_rate": 6.605373332816916e-05, + "loss": 1.7416, + "step": 13492 + }, + { + "epoch": 4.141497851442603, + "grad_norm": 0.26593849062919617, + "learning_rate": 6.6049025860832e-05, + "loss": 1.7586, + "step": 13493 + }, + { + "epoch": 4.141804788213628, + "grad_norm": 0.2452937364578247, + "learning_rate": 6.604431823488893e-05, + "loss": 1.757, + "step": 13494 + }, + { + "epoch": 4.1421117249846535, + "grad_norm": 0.21029168367385864, + "learning_rate": 6.603961045038652e-05, + "loss": 1.7665, + "step": 13495 + }, + { + "epoch": 4.142418661755678, + "grad_norm": 0.2396312952041626, + "learning_rate": 6.603490250737128e-05, + "loss": 1.7609, + "step": 13496 + }, + { + "epoch": 4.142725598526703, + "grad_norm": 0.23266808688640594, + "learning_rate": 6.603019440588975e-05, + "loss": 1.7893, + "step": 13497 + }, + { + "epoch": 4.143032535297729, + "grad_norm": 0.25235217809677124, + "learning_rate": 6.602548614598842e-05, + "loss": 1.7465, + "step": 13498 + }, + { + "epoch": 4.143339472068754, + "grad_norm": 0.22944024205207825, + "learning_rate": 6.602077772771386e-05, + "loss": 1.7052, + "step": 13499 + }, + { + "epoch": 4.143646408839779, + "grad_norm": 0.2116660475730896, + "learning_rate": 6.601606915111257e-05, + "loss": 1.7042, + "step": 13500 + }, + { + "epoch": 4.143953345610804, + "grad_norm": 
0.21777184307575226, + "learning_rate": 6.601136041623111e-05, + "loss": 1.7938, + "step": 13501 + }, + { + "epoch": 4.144260282381829, + "grad_norm": 0.23663075268268585, + "learning_rate": 6.600665152311601e-05, + "loss": 1.7475, + "step": 13502 + }, + { + "epoch": 4.144567219152854, + "grad_norm": 0.20644642412662506, + "learning_rate": 6.600194247181377e-05, + "loss": 1.7992, + "step": 13503 + }, + { + "epoch": 4.14487415592388, + "grad_norm": 0.21479010581970215, + "learning_rate": 6.599723326237098e-05, + "loss": 1.7877, + "step": 13504 + }, + { + "epoch": 4.145181092694905, + "grad_norm": 0.2266562283039093, + "learning_rate": 6.599252389483413e-05, + "loss": 1.8097, + "step": 13505 + }, + { + "epoch": 4.14548802946593, + "grad_norm": 0.2053738683462143, + "learning_rate": 6.59878143692498e-05, + "loss": 1.6878, + "step": 13506 + }, + { + "epoch": 4.145794966236955, + "grad_norm": 0.19583995640277863, + "learning_rate": 6.598310468566452e-05, + "loss": 1.7547, + "step": 13507 + }, + { + "epoch": 4.14610190300798, + "grad_norm": 0.23421542346477509, + "learning_rate": 6.597839484412484e-05, + "loss": 1.7926, + "step": 13508 + }, + { + "epoch": 4.1464088397790055, + "grad_norm": 0.24575260281562805, + "learning_rate": 6.597368484467728e-05, + "loss": 1.7311, + "step": 13509 + }, + { + "epoch": 4.146715776550031, + "grad_norm": 0.27519574761390686, + "learning_rate": 6.596897468736842e-05, + "loss": 1.7858, + "step": 13510 + }, + { + "epoch": 4.147022713321056, + "grad_norm": 0.26434022188186646, + "learning_rate": 6.596426437224477e-05, + "loss": 1.7387, + "step": 13511 + }, + { + "epoch": 4.147329650092081, + "grad_norm": 0.2192772775888443, + "learning_rate": 6.595955389935291e-05, + "loss": 1.7565, + "step": 13512 + }, + { + "epoch": 4.147636586863106, + "grad_norm": 0.21047350764274597, + "learning_rate": 6.595484326873938e-05, + "loss": 1.7234, + "step": 13513 + }, + { + "epoch": 4.147943523634131, + "grad_norm": 0.22838951647281647, + "learning_rate": 
6.595013248045075e-05, + "loss": 1.8205, + "step": 13514 + }, + { + "epoch": 4.148250460405157, + "grad_norm": 0.3467923402786255, + "learning_rate": 6.594542153453356e-05, + "loss": 1.7973, + "step": 13515 + }, + { + "epoch": 4.148557397176182, + "grad_norm": 0.241237074136734, + "learning_rate": 6.594071043103438e-05, + "loss": 1.7764, + "step": 13516 + }, + { + "epoch": 4.148864333947207, + "grad_norm": 0.22543516755104065, + "learning_rate": 6.593599916999973e-05, + "loss": 1.7528, + "step": 13517 + }, + { + "epoch": 4.149171270718232, + "grad_norm": 0.24590276181697845, + "learning_rate": 6.593128775147623e-05, + "loss": 1.7422, + "step": 13518 + }, + { + "epoch": 4.149478207489257, + "grad_norm": 0.2434391975402832, + "learning_rate": 6.592657617551038e-05, + "loss": 1.7523, + "step": 13519 + }, + { + "epoch": 4.149785144260282, + "grad_norm": 0.23169009387493134, + "learning_rate": 6.592186444214877e-05, + "loss": 1.8158, + "step": 13520 + }, + { + "epoch": 4.150092081031308, + "grad_norm": 0.2217840999364853, + "learning_rate": 6.591715255143798e-05, + "loss": 1.7487, + "step": 13521 + }, + { + "epoch": 4.150399017802333, + "grad_norm": 0.2405092418193817, + "learning_rate": 6.591244050342454e-05, + "loss": 1.7726, + "step": 13522 + }, + { + "epoch": 4.150705954573358, + "grad_norm": 0.29432612657546997, + "learning_rate": 6.590772829815504e-05, + "loss": 1.7841, + "step": 13523 + }, + { + "epoch": 4.151012891344383, + "grad_norm": 0.2708737850189209, + "learning_rate": 6.590301593567605e-05, + "loss": 1.8551, + "step": 13524 + }, + { + "epoch": 4.151319828115408, + "grad_norm": 0.26643216609954834, + "learning_rate": 6.589830341603413e-05, + "loss": 1.7697, + "step": 13525 + }, + { + "epoch": 4.151626764886434, + "grad_norm": 0.3672652840614319, + "learning_rate": 6.589359073927587e-05, + "loss": 1.8292, + "step": 13526 + }, + { + "epoch": 4.151933701657459, + "grad_norm": 0.2413325160741806, + "learning_rate": 6.588887790544782e-05, + "loss": 1.7514, + 
"step": 13527 + }, + { + "epoch": 4.152240638428483, + "grad_norm": 0.3248155117034912, + "learning_rate": 6.588416491459657e-05, + "loss": 1.7437, + "step": 13528 + }, + { + "epoch": 4.152547575199509, + "grad_norm": 0.40951836109161377, + "learning_rate": 6.587945176676869e-05, + "loss": 1.7779, + "step": 13529 + }, + { + "epoch": 4.152854511970534, + "grad_norm": 0.23874351382255554, + "learning_rate": 6.587473846201075e-05, + "loss": 1.8343, + "step": 13530 + }, + { + "epoch": 4.153161448741559, + "grad_norm": 0.4535207450389862, + "learning_rate": 6.587002500036936e-05, + "loss": 1.8301, + "step": 13531 + }, + { + "epoch": 4.153468385512585, + "grad_norm": 0.458003968000412, + "learning_rate": 6.586531138189108e-05, + "loss": 1.7053, + "step": 13532 + }, + { + "epoch": 4.153775322283609, + "grad_norm": 0.24350887537002563, + "learning_rate": 6.586059760662248e-05, + "loss": 1.7642, + "step": 13533 + }, + { + "epoch": 4.1540822590546345, + "grad_norm": 0.46951553225517273, + "learning_rate": 6.585588367461017e-05, + "loss": 1.7345, + "step": 13534 + }, + { + "epoch": 4.15438919582566, + "grad_norm": 0.5524527430534363, + "learning_rate": 6.585116958590072e-05, + "loss": 1.7677, + "step": 13535 + }, + { + "epoch": 4.154696132596685, + "grad_norm": 0.2887112498283386, + "learning_rate": 6.584645534054072e-05, + "loss": 1.7704, + "step": 13536 + }, + { + "epoch": 4.1550030693677105, + "grad_norm": 0.36243724822998047, + "learning_rate": 6.584174093857675e-05, + "loss": 1.8133, + "step": 13537 + }, + { + "epoch": 4.155310006138736, + "grad_norm": 0.3869550824165344, + "learning_rate": 6.583702638005543e-05, + "loss": 1.7253, + "step": 13538 + }, + { + "epoch": 4.15561694290976, + "grad_norm": 0.25859662890434265, + "learning_rate": 6.583231166502333e-05, + "loss": 1.7683, + "step": 13539 + }, + { + "epoch": 4.155923879680786, + "grad_norm": 0.3011144995689392, + "learning_rate": 6.582759679352704e-05, + "loss": 1.7139, + "step": 13540 + }, + { + "epoch": 
4.156230816451811, + "grad_norm": 0.38033372163772583, + "learning_rate": 6.582288176561316e-05, + "loss": 1.8182, + "step": 13541 + }, + { + "epoch": 4.156537753222836, + "grad_norm": 0.2224060595035553, + "learning_rate": 6.581816658132829e-05, + "loss": 1.7527, + "step": 13542 + }, + { + "epoch": 4.156844689993862, + "grad_norm": 0.4147234261035919, + "learning_rate": 6.581345124071903e-05, + "loss": 1.7339, + "step": 13543 + }, + { + "epoch": 4.157151626764886, + "grad_norm": 0.45334625244140625, + "learning_rate": 6.580873574383198e-05, + "loss": 1.8166, + "step": 13544 + }, + { + "epoch": 4.157458563535911, + "grad_norm": 0.3050530254840851, + "learning_rate": 6.580402009071372e-05, + "loss": 1.7967, + "step": 13545 + }, + { + "epoch": 4.157765500306937, + "grad_norm": 0.25901293754577637, + "learning_rate": 6.579930428141088e-05, + "loss": 1.7806, + "step": 13546 + }, + { + "epoch": 4.158072437077962, + "grad_norm": 0.3142934739589691, + "learning_rate": 6.579458831597006e-05, + "loss": 1.7724, + "step": 13547 + }, + { + "epoch": 4.158379373848987, + "grad_norm": 0.23943179845809937, + "learning_rate": 6.578987219443787e-05, + "loss": 1.7515, + "step": 13548 + }, + { + "epoch": 4.158686310620013, + "grad_norm": 0.2838635742664337, + "learning_rate": 6.578515591686089e-05, + "loss": 1.7707, + "step": 13549 + }, + { + "epoch": 4.158993247391037, + "grad_norm": 0.3064457178115845, + "learning_rate": 6.578043948328575e-05, + "loss": 1.7839, + "step": 13550 + }, + { + "epoch": 4.1593001841620625, + "grad_norm": 0.2311718463897705, + "learning_rate": 6.577572289375907e-05, + "loss": 1.8298, + "step": 13551 + }, + { + "epoch": 4.159607120933088, + "grad_norm": 0.35726481676101685, + "learning_rate": 6.577100614832743e-05, + "loss": 1.811, + "step": 13552 + }, + { + "epoch": 4.159914057704113, + "grad_norm": 0.3176140785217285, + "learning_rate": 6.576628924703749e-05, + "loss": 1.732, + "step": 13553 + }, + { + "epoch": 4.1602209944751385, + "grad_norm": 
0.2325647473335266, + "learning_rate": 6.576157218993582e-05, + "loss": 1.827, + "step": 13554 + }, + { + "epoch": 4.160527931246163, + "grad_norm": 0.32260453701019287, + "learning_rate": 6.575685497706905e-05, + "loss": 1.8218, + "step": 13555 + }, + { + "epoch": 4.160834868017188, + "grad_norm": 0.2638537287712097, + "learning_rate": 6.575213760848382e-05, + "loss": 1.7091, + "step": 13556 + }, + { + "epoch": 4.161141804788214, + "grad_norm": 0.2501799762248993, + "learning_rate": 6.574742008422671e-05, + "loss": 1.7707, + "step": 13557 + }, + { + "epoch": 4.161448741559239, + "grad_norm": 0.3212645649909973, + "learning_rate": 6.574270240434439e-05, + "loss": 1.7541, + "step": 13558 + }, + { + "epoch": 4.161755678330264, + "grad_norm": 0.25915586948394775, + "learning_rate": 6.573798456888345e-05, + "loss": 1.7597, + "step": 13559 + }, + { + "epoch": 4.162062615101289, + "grad_norm": 0.2538192868232727, + "learning_rate": 6.573326657789052e-05, + "loss": 1.8507, + "step": 13560 + }, + { + "epoch": 4.162369551872314, + "grad_norm": 0.2542131543159485, + "learning_rate": 6.572854843141223e-05, + "loss": 1.782, + "step": 13561 + }, + { + "epoch": 4.162676488643339, + "grad_norm": 0.26163414120674133, + "learning_rate": 6.572383012949521e-05, + "loss": 1.8482, + "step": 13562 + }, + { + "epoch": 4.162983425414365, + "grad_norm": 0.2566238343715668, + "learning_rate": 6.571911167218608e-05, + "loss": 1.7284, + "step": 13563 + }, + { + "epoch": 4.16329036218539, + "grad_norm": 0.28413113951683044, + "learning_rate": 6.571439305953147e-05, + "loss": 1.7473, + "step": 13564 + }, + { + "epoch": 4.163597298956415, + "grad_norm": 0.20399242639541626, + "learning_rate": 6.570967429157802e-05, + "loss": 1.6942, + "step": 13565 + }, + { + "epoch": 4.16390423572744, + "grad_norm": 0.256104439496994, + "learning_rate": 6.570495536837235e-05, + "loss": 1.7346, + "step": 13566 + }, + { + "epoch": 4.164211172498465, + "grad_norm": 0.350909560918808, + "learning_rate": 
6.570023628996112e-05, + "loss": 1.8284, + "step": 13567 + }, + { + "epoch": 4.1645181092694905, + "grad_norm": 0.23500367999076843, + "learning_rate": 6.569551705639096e-05, + "loss": 1.7504, + "step": 13568 + }, + { + "epoch": 4.164825046040516, + "grad_norm": 0.26683783531188965, + "learning_rate": 6.569079766770849e-05, + "loss": 1.7293, + "step": 13569 + }, + { + "epoch": 4.165131982811541, + "grad_norm": 0.3145855963230133, + "learning_rate": 6.568607812396037e-05, + "loss": 1.8171, + "step": 13570 + }, + { + "epoch": 4.165438919582566, + "grad_norm": 0.2354860156774521, + "learning_rate": 6.568135842519324e-05, + "loss": 1.7555, + "step": 13571 + }, + { + "epoch": 4.165745856353591, + "grad_norm": 0.2893243730068207, + "learning_rate": 6.56766385714537e-05, + "loss": 1.7636, + "step": 13572 + }, + { + "epoch": 4.166052793124616, + "grad_norm": 0.20707663893699646, + "learning_rate": 6.567191856278846e-05, + "loss": 1.7239, + "step": 13573 + }, + { + "epoch": 4.166359729895642, + "grad_norm": 0.34200331568717957, + "learning_rate": 6.566719839924412e-05, + "loss": 1.7848, + "step": 13574 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.23326615989208221, + "learning_rate": 6.566247808086734e-05, + "loss": 1.7447, + "step": 13575 + }, + { + "epoch": 4.166973603437691, + "grad_norm": 0.22375629842281342, + "learning_rate": 6.565775760770479e-05, + "loss": 1.7429, + "step": 13576 + }, + { + "epoch": 4.167280540208717, + "grad_norm": 0.2412862777709961, + "learning_rate": 6.565303697980308e-05, + "loss": 1.7671, + "step": 13577 + }, + { + "epoch": 4.167587476979742, + "grad_norm": 0.2482215315103531, + "learning_rate": 6.56483161972089e-05, + "loss": 1.812, + "step": 13578 + }, + { + "epoch": 4.167894413750767, + "grad_norm": 0.2252974659204483, + "learning_rate": 6.564359525996889e-05, + "loss": 1.8173, + "step": 13579 + }, + { + "epoch": 4.168201350521793, + "grad_norm": 0.23497292399406433, + "learning_rate": 6.563887416812969e-05, + "loss": 1.7945, + 
"step": 13580 + }, + { + "epoch": 4.168508287292818, + "grad_norm": 0.24911245703697205, + "learning_rate": 6.563415292173796e-05, + "loss": 1.7516, + "step": 13581 + }, + { + "epoch": 4.1688152240638425, + "grad_norm": 0.20920930802822113, + "learning_rate": 6.562943152084039e-05, + "loss": 1.765, + "step": 13582 + }, + { + "epoch": 4.169122160834868, + "grad_norm": 0.26001816987991333, + "learning_rate": 6.562470996548361e-05, + "loss": 1.7504, + "step": 13583 + }, + { + "epoch": 4.169429097605893, + "grad_norm": 0.2504529058933258, + "learning_rate": 6.561998825571429e-05, + "loss": 1.7689, + "step": 13584 + }, + { + "epoch": 4.1697360343769185, + "grad_norm": 0.2210187464952469, + "learning_rate": 6.561526639157908e-05, + "loss": 1.752, + "step": 13585 + }, + { + "epoch": 4.170042971147944, + "grad_norm": 0.26323240995407104, + "learning_rate": 6.561054437312467e-05, + "loss": 1.8104, + "step": 13586 + }, + { + "epoch": 4.170349907918968, + "grad_norm": 0.20436744391918182, + "learning_rate": 6.560582220039771e-05, + "loss": 1.7281, + "step": 13587 + }, + { + "epoch": 4.170656844689994, + "grad_norm": 0.2053878903388977, + "learning_rate": 6.560109987344487e-05, + "loss": 1.7192, + "step": 13588 + }, + { + "epoch": 4.170963781461019, + "grad_norm": 0.2416568547487259, + "learning_rate": 6.559637739231281e-05, + "loss": 1.7679, + "step": 13589 + }, + { + "epoch": 4.171270718232044, + "grad_norm": 0.23847989737987518, + "learning_rate": 6.55916547570482e-05, + "loss": 1.7182, + "step": 13590 + }, + { + "epoch": 4.17157765500307, + "grad_norm": 0.2057785540819168, + "learning_rate": 6.558693196769772e-05, + "loss": 1.816, + "step": 13591 + }, + { + "epoch": 4.171884591774095, + "grad_norm": 0.2270805537700653, + "learning_rate": 6.558220902430804e-05, + "loss": 1.7091, + "step": 13592 + }, + { + "epoch": 4.172191528545119, + "grad_norm": 0.22143644094467163, + "learning_rate": 6.557748592692585e-05, + "loss": 1.7446, + "step": 13593 + }, + { + "epoch": 
4.172498465316145, + "grad_norm": 0.2032770961523056, + "learning_rate": 6.557276267559781e-05, + "loss": 1.7501, + "step": 13594 + }, + { + "epoch": 4.17280540208717, + "grad_norm": 0.20851244032382965, + "learning_rate": 6.55680392703706e-05, + "loss": 1.8283, + "step": 13595 + }, + { + "epoch": 4.173112338858195, + "grad_norm": 0.2603934109210968, + "learning_rate": 6.55633157112909e-05, + "loss": 1.8523, + "step": 13596 + }, + { + "epoch": 4.173419275629221, + "grad_norm": 0.2232515811920166, + "learning_rate": 6.55585919984054e-05, + "loss": 1.7803, + "step": 13597 + }, + { + "epoch": 4.173726212400245, + "grad_norm": 0.2541115880012512, + "learning_rate": 6.555386813176075e-05, + "loss": 1.7407, + "step": 13598 + }, + { + "epoch": 4.1740331491712706, + "grad_norm": 0.3044603765010834, + "learning_rate": 6.55491441114037e-05, + "loss": 1.8257, + "step": 13599 + }, + { + "epoch": 4.174340085942296, + "grad_norm": 0.29227301478385925, + "learning_rate": 6.554441993738086e-05, + "loss": 1.7998, + "step": 13600 + }, + { + "epoch": 4.174647022713321, + "grad_norm": 0.25166594982147217, + "learning_rate": 6.553969560973896e-05, + "loss": 1.8258, + "step": 13601 + }, + { + "epoch": 4.1749539594843466, + "grad_norm": 0.22973991930484772, + "learning_rate": 6.55349711285247e-05, + "loss": 1.7871, + "step": 13602 + }, + { + "epoch": 4.175260896255371, + "grad_norm": 0.2615009844303131, + "learning_rate": 6.553024649378473e-05, + "loss": 1.7572, + "step": 13603 + }, + { + "epoch": 4.175567833026396, + "grad_norm": 0.24145473539829254, + "learning_rate": 6.552552170556576e-05, + "loss": 1.7546, + "step": 13604 + }, + { + "epoch": 4.175874769797422, + "grad_norm": 0.21989156305789948, + "learning_rate": 6.55207967639145e-05, + "loss": 1.6939, + "step": 13605 + }, + { + "epoch": 4.176181706568447, + "grad_norm": 0.206025168299675, + "learning_rate": 6.551607166887761e-05, + "loss": 1.7531, + "step": 13606 + }, + { + "epoch": 4.176488643339472, + "grad_norm": 
0.2175903469324112, + "learning_rate": 6.551134642050181e-05, + "loss": 1.7631, + "step": 13607 + }, + { + "epoch": 4.176795580110497, + "grad_norm": 0.23259282112121582, + "learning_rate": 6.550662101883379e-05, + "loss": 1.7773, + "step": 13608 + }, + { + "epoch": 4.177102516881522, + "grad_norm": 0.23955227434635162, + "learning_rate": 6.550189546392025e-05, + "loss": 1.7321, + "step": 13609 + }, + { + "epoch": 4.1774094536525475, + "grad_norm": 0.23614998161792755, + "learning_rate": 6.549716975580792e-05, + "loss": 1.7855, + "step": 13610 + }, + { + "epoch": 4.177716390423573, + "grad_norm": 0.2274426817893982, + "learning_rate": 6.549244389454345e-05, + "loss": 1.7778, + "step": 13611 + }, + { + "epoch": 4.178023327194598, + "grad_norm": 0.2204308807849884, + "learning_rate": 6.548771788017358e-05, + "loss": 1.7175, + "step": 13612 + }, + { + "epoch": 4.1783302639656235, + "grad_norm": 0.2283930778503418, + "learning_rate": 6.548299171274501e-05, + "loss": 1.8081, + "step": 13613 + }, + { + "epoch": 4.178637200736648, + "grad_norm": 0.25433486700057983, + "learning_rate": 6.547826539230442e-05, + "loss": 1.8009, + "step": 13614 + }, + { + "epoch": 4.178944137507673, + "grad_norm": 0.24452579021453857, + "learning_rate": 6.547353891889856e-05, + "loss": 1.7244, + "step": 13615 + }, + { + "epoch": 4.179251074278699, + "grad_norm": 0.20611275732517242, + "learning_rate": 6.546881229257411e-05, + "loss": 1.7566, + "step": 13616 + }, + { + "epoch": 4.179558011049724, + "grad_norm": 0.24557232856750488, + "learning_rate": 6.546408551337779e-05, + "loss": 1.7638, + "step": 13617 + }, + { + "epoch": 4.179864947820749, + "grad_norm": 0.2158801257610321, + "learning_rate": 6.545935858135631e-05, + "loss": 1.7659, + "step": 13618 + }, + { + "epoch": 4.180171884591774, + "grad_norm": 0.23800688982009888, + "learning_rate": 6.54546314965564e-05, + "loss": 1.7468, + "step": 13619 + }, + { + "epoch": 4.180478821362799, + "grad_norm": 0.2504122853279114, + "learning_rate": 
6.544990425902476e-05, + "loss": 1.7682, + "step": 13620 + }, + { + "epoch": 4.180785758133824, + "grad_norm": 0.21556814014911652, + "learning_rate": 6.54451768688081e-05, + "loss": 1.772, + "step": 13621 + }, + { + "epoch": 4.18109269490485, + "grad_norm": 0.23404552042484283, + "learning_rate": 6.544044932595315e-05, + "loss": 1.7844, + "step": 13622 + }, + { + "epoch": 4.181399631675875, + "grad_norm": 0.22129055857658386, + "learning_rate": 6.543572163050664e-05, + "loss": 1.7725, + "step": 13623 + }, + { + "epoch": 4.1817065684469, + "grad_norm": 0.2533521354198456, + "learning_rate": 6.543099378251528e-05, + "loss": 1.7908, + "step": 13624 + }, + { + "epoch": 4.182013505217925, + "grad_norm": 0.2905815541744232, + "learning_rate": 6.542626578202579e-05, + "loss": 1.7913, + "step": 13625 + }, + { + "epoch": 4.18232044198895, + "grad_norm": 0.3330783247947693, + "learning_rate": 6.54215376290849e-05, + "loss": 1.8374, + "step": 13626 + }, + { + "epoch": 4.1826273787599755, + "grad_norm": 0.29268717765808105, + "learning_rate": 6.541680932373933e-05, + "loss": 1.8714, + "step": 13627 + }, + { + "epoch": 4.182934315531001, + "grad_norm": 0.2820781171321869, + "learning_rate": 6.541208086603584e-05, + "loss": 1.8089, + "step": 13628 + }, + { + "epoch": 4.183241252302026, + "grad_norm": 0.3062323033809662, + "learning_rate": 6.54073522560211e-05, + "loss": 1.7307, + "step": 13629 + }, + { + "epoch": 4.183548189073051, + "grad_norm": 0.3010510504245758, + "learning_rate": 6.54026234937419e-05, + "loss": 1.7523, + "step": 13630 + }, + { + "epoch": 4.183855125844076, + "grad_norm": 0.21932095289230347, + "learning_rate": 6.539789457924493e-05, + "loss": 1.737, + "step": 13631 + }, + { + "epoch": 4.184162062615101, + "grad_norm": 0.2710212469100952, + "learning_rate": 6.539316551257695e-05, + "loss": 1.7228, + "step": 13632 + }, + { + "epoch": 4.184468999386127, + "grad_norm": 0.2885816991329193, + "learning_rate": 6.538843629378469e-05, + "loss": 1.8734, + "step": 
13633 + }, + { + "epoch": 4.184775936157152, + "grad_norm": 0.2621026635169983, + "learning_rate": 6.538370692291487e-05, + "loss": 1.7884, + "step": 13634 + }, + { + "epoch": 4.185082872928176, + "grad_norm": 0.30503126978874207, + "learning_rate": 6.537897740001426e-05, + "loss": 1.7833, + "step": 13635 + }, + { + "epoch": 4.185389809699202, + "grad_norm": 0.29491373896598816, + "learning_rate": 6.537424772512955e-05, + "loss": 1.7894, + "step": 13636 + }, + { + "epoch": 4.185696746470227, + "grad_norm": 0.24423296749591827, + "learning_rate": 6.536951789830754e-05, + "loss": 1.7409, + "step": 13637 + }, + { + "epoch": 4.186003683241252, + "grad_norm": 0.2184748351573944, + "learning_rate": 6.536478791959495e-05, + "loss": 1.747, + "step": 13638 + }, + { + "epoch": 4.186310620012278, + "grad_norm": 0.2348455935716629, + "learning_rate": 6.53600577890385e-05, + "loss": 1.7422, + "step": 13639 + }, + { + "epoch": 4.186617556783303, + "grad_norm": 0.2554566264152527, + "learning_rate": 6.535532750668497e-05, + "loss": 1.7623, + "step": 13640 + }, + { + "epoch": 4.1869244935543275, + "grad_norm": 0.26424553990364075, + "learning_rate": 6.535059707258109e-05, + "loss": 1.8408, + "step": 13641 + }, + { + "epoch": 4.187231430325353, + "grad_norm": 0.35363274812698364, + "learning_rate": 6.534586648677361e-05, + "loss": 1.7435, + "step": 13642 + }, + { + "epoch": 4.187538367096378, + "grad_norm": 0.3225265443325043, + "learning_rate": 6.534113574930926e-05, + "loss": 1.7181, + "step": 13643 + }, + { + "epoch": 4.1878453038674035, + "grad_norm": 0.23529650270938873, + "learning_rate": 6.533640486023485e-05, + "loss": 1.7712, + "step": 13644 + }, + { + "epoch": 4.188152240638429, + "grad_norm": 0.3490132987499237, + "learning_rate": 6.53316738195971e-05, + "loss": 1.7329, + "step": 13645 + }, + { + "epoch": 4.188459177409453, + "grad_norm": 0.3759285509586334, + "learning_rate": 6.532694262744274e-05, + "loss": 1.802, + "step": 13646 + }, + { + "epoch": 4.188766114180479, 
+ "grad_norm": 0.27383577823638916, + "learning_rate": 6.532221128381858e-05, + "loss": 1.801, + "step": 13647 + }, + { + "epoch": 4.189073050951504, + "grad_norm": 0.23240652680397034, + "learning_rate": 6.531747978877132e-05, + "loss": 1.8415, + "step": 13648 + }, + { + "epoch": 4.189379987722529, + "grad_norm": 0.3302704989910126, + "learning_rate": 6.531274814234773e-05, + "loss": 1.7765, + "step": 13649 + }, + { + "epoch": 4.189686924493555, + "grad_norm": 0.3209368586540222, + "learning_rate": 6.530801634459463e-05, + "loss": 1.6935, + "step": 13650 + }, + { + "epoch": 4.189993861264579, + "grad_norm": 0.26643648743629456, + "learning_rate": 6.530328439555872e-05, + "loss": 1.8159, + "step": 13651 + }, + { + "epoch": 4.190300798035604, + "grad_norm": 0.22594431042671204, + "learning_rate": 6.529855229528679e-05, + "loss": 1.7764, + "step": 13652 + }, + { + "epoch": 4.19060773480663, + "grad_norm": 0.3288109302520752, + "learning_rate": 6.529382004382561e-05, + "loss": 1.7963, + "step": 13653 + }, + { + "epoch": 4.190914671577655, + "grad_norm": 0.3067106604576111, + "learning_rate": 6.528908764122191e-05, + "loss": 1.7564, + "step": 13654 + }, + { + "epoch": 4.19122160834868, + "grad_norm": 0.23437078297138214, + "learning_rate": 6.528435508752249e-05, + "loss": 1.759, + "step": 13655 + }, + { + "epoch": 4.191528545119706, + "grad_norm": 0.30662333965301514, + "learning_rate": 6.527962238277413e-05, + "loss": 1.7549, + "step": 13656 + }, + { + "epoch": 4.19183548189073, + "grad_norm": 0.3545009195804596, + "learning_rate": 6.527488952702356e-05, + "loss": 1.7761, + "step": 13657 + }, + { + "epoch": 4.1921424186617555, + "grad_norm": 0.2509438991546631, + "learning_rate": 6.52701565203176e-05, + "loss": 1.7162, + "step": 13658 + }, + { + "epoch": 4.192449355432781, + "grad_norm": 0.24423806369304657, + "learning_rate": 6.5265423362703e-05, + "loss": 1.735, + "step": 13659 + }, + { + "epoch": 4.192756292203806, + "grad_norm": 0.37365156412124634, + 
"learning_rate": 6.526069005422654e-05, + "loss": 1.7697, + "step": 13660 + }, + { + "epoch": 4.1930632289748315, + "grad_norm": 0.4025731682777405, + "learning_rate": 6.525595659493499e-05, + "loss": 1.7931, + "step": 13661 + }, + { + "epoch": 4.193370165745856, + "grad_norm": 0.31360915303230286, + "learning_rate": 6.525122298487514e-05, + "loss": 1.8014, + "step": 13662 + }, + { + "epoch": 4.193677102516881, + "grad_norm": 0.2480524778366089, + "learning_rate": 6.524648922409376e-05, + "loss": 1.7753, + "step": 13663 + }, + { + "epoch": 4.193984039287907, + "grad_norm": 0.33740919828414917, + "learning_rate": 6.524175531263765e-05, + "loss": 1.7296, + "step": 13664 + }, + { + "epoch": 4.194290976058932, + "grad_norm": 0.26871639490127563, + "learning_rate": 6.523702125055358e-05, + "loss": 1.7113, + "step": 13665 + }, + { + "epoch": 4.194597912829957, + "grad_norm": 0.2687455415725708, + "learning_rate": 6.52322870378883e-05, + "loss": 1.7645, + "step": 13666 + }, + { + "epoch": 4.194904849600983, + "grad_norm": 0.4207400679588318, + "learning_rate": 6.522755267468868e-05, + "loss": 1.7758, + "step": 13667 + }, + { + "epoch": 4.195211786372007, + "grad_norm": 0.36043494939804077, + "learning_rate": 6.522281816100142e-05, + "loss": 1.7433, + "step": 13668 + }, + { + "epoch": 4.195518723143032, + "grad_norm": 0.2515890598297119, + "learning_rate": 6.52180834968734e-05, + "loss": 1.7646, + "step": 13669 + }, + { + "epoch": 4.195825659914058, + "grad_norm": 0.2871458828449249, + "learning_rate": 6.521334868235132e-05, + "loss": 1.8147, + "step": 13670 + }, + { + "epoch": 4.196132596685083, + "grad_norm": 0.28454354405403137, + "learning_rate": 6.5208613717482e-05, + "loss": 1.8576, + "step": 13671 + }, + { + "epoch": 4.196439533456108, + "grad_norm": 0.2520541548728943, + "learning_rate": 6.520387860231227e-05, + "loss": 1.7513, + "step": 13672 + }, + { + "epoch": 4.196746470227133, + "grad_norm": 0.22782307863235474, + "learning_rate": 6.51991433368889e-05, + 
"loss": 1.7737, + "step": 13673 + }, + { + "epoch": 4.197053406998158, + "grad_norm": 0.2451259195804596, + "learning_rate": 6.519440792125869e-05, + "loss": 1.7483, + "step": 13674 + }, + { + "epoch": 4.1973603437691835, + "grad_norm": 0.21915963292121887, + "learning_rate": 6.518967235546841e-05, + "loss": 1.718, + "step": 13675 + }, + { + "epoch": 4.197667280540209, + "grad_norm": 0.23005805909633636, + "learning_rate": 6.51849366395649e-05, + "loss": 1.7786, + "step": 13676 + }, + { + "epoch": 4.197974217311234, + "grad_norm": 0.25039517879486084, + "learning_rate": 6.518020077359494e-05, + "loss": 1.7785, + "step": 13677 + }, + { + "epoch": 4.198281154082259, + "grad_norm": 0.26631081104278564, + "learning_rate": 6.517546475760535e-05, + "loss": 1.7921, + "step": 13678 + }, + { + "epoch": 4.198588090853284, + "grad_norm": 0.2220793515443802, + "learning_rate": 6.517072859164292e-05, + "loss": 1.7696, + "step": 13679 + }, + { + "epoch": 4.198895027624309, + "grad_norm": 0.24681030213832855, + "learning_rate": 6.516599227575446e-05, + "loss": 1.7702, + "step": 13680 + }, + { + "epoch": 4.199201964395335, + "grad_norm": 0.2421828955411911, + "learning_rate": 6.516125580998678e-05, + "loss": 1.8058, + "step": 13681 + }, + { + "epoch": 4.19950890116636, + "grad_norm": 0.2170087695121765, + "learning_rate": 6.515651919438667e-05, + "loss": 1.7271, + "step": 13682 + }, + { + "epoch": 4.199815837937384, + "grad_norm": 0.23383566737174988, + "learning_rate": 6.515178242900096e-05, + "loss": 1.7515, + "step": 13683 + }, + { + "epoch": 4.20012277470841, + "grad_norm": 0.2522997558116913, + "learning_rate": 6.514704551387645e-05, + "loss": 1.7619, + "step": 13684 + }, + { + "epoch": 4.200429711479435, + "grad_norm": 0.20973703265190125, + "learning_rate": 6.514230844905995e-05, + "loss": 1.7326, + "step": 13685 + }, + { + "epoch": 4.2007366482504604, + "grad_norm": 0.2308073341846466, + "learning_rate": 6.513757123459832e-05, + "loss": 1.811, + "step": 13686 + }, + { + 
"epoch": 4.201043585021486, + "grad_norm": 0.21751229465007782, + "learning_rate": 6.51328338705383e-05, + "loss": 1.7795, + "step": 13687 + }, + { + "epoch": 4.201350521792511, + "grad_norm": 0.2357407957315445, + "learning_rate": 6.512809635692675e-05, + "loss": 1.8069, + "step": 13688 + }, + { + "epoch": 4.201657458563536, + "grad_norm": 0.32245033979415894, + "learning_rate": 6.51233586938105e-05, + "loss": 1.8179, + "step": 13689 + }, + { + "epoch": 4.201964395334561, + "grad_norm": 0.22740167379379272, + "learning_rate": 6.511862088123635e-05, + "loss": 1.7482, + "step": 13690 + }, + { + "epoch": 4.202271332105586, + "grad_norm": 0.26880496740341187, + "learning_rate": 6.511388291925114e-05, + "loss": 1.7919, + "step": 13691 + }, + { + "epoch": 4.202578268876612, + "grad_norm": 0.2261822521686554, + "learning_rate": 6.510914480790166e-05, + "loss": 1.7543, + "step": 13692 + }, + { + "epoch": 4.202885205647637, + "grad_norm": 0.2635782063007355, + "learning_rate": 6.510440654723477e-05, + "loss": 1.7874, + "step": 13693 + }, + { + "epoch": 4.203192142418661, + "grad_norm": 0.2505982518196106, + "learning_rate": 6.509966813729726e-05, + "loss": 1.8016, + "step": 13694 + }, + { + "epoch": 4.203499079189687, + "grad_norm": 0.23177236318588257, + "learning_rate": 6.5094929578136e-05, + "loss": 1.7582, + "step": 13695 + }, + { + "epoch": 4.203806015960712, + "grad_norm": 0.2315056324005127, + "learning_rate": 6.509019086979779e-05, + "loss": 1.7418, + "step": 13696 + }, + { + "epoch": 4.204112952731737, + "grad_norm": 0.25565484166145325, + "learning_rate": 6.508545201232947e-05, + "loss": 1.7476, + "step": 13697 + }, + { + "epoch": 4.204419889502763, + "grad_norm": 0.29210081696510315, + "learning_rate": 6.508071300577787e-05, + "loss": 1.8397, + "step": 13698 + }, + { + "epoch": 4.204726826273788, + "grad_norm": 0.2830582559108734, + "learning_rate": 6.507597385018984e-05, + "loss": 1.834, + "step": 13699 + }, + { + "epoch": 4.2050337630448125, + "grad_norm": 
0.23013398051261902, + "learning_rate": 6.507123454561217e-05, + "loss": 1.7593, + "step": 13700 + }, + { + "epoch": 4.205340699815838, + "grad_norm": 0.21970276534557343, + "learning_rate": 6.506649509209174e-05, + "loss": 1.754, + "step": 13701 + }, + { + "epoch": 4.205647636586863, + "grad_norm": 0.32052233815193176, + "learning_rate": 6.50617554896754e-05, + "loss": 1.7531, + "step": 13702 + }, + { + "epoch": 4.2059545733578885, + "grad_norm": 0.2597332000732422, + "learning_rate": 6.505701573840995e-05, + "loss": 1.7836, + "step": 13703 + }, + { + "epoch": 4.206261510128914, + "grad_norm": 0.22070355713367462, + "learning_rate": 6.505227583834224e-05, + "loss": 1.7225, + "step": 13704 + }, + { + "epoch": 4.206568446899938, + "grad_norm": 0.27219358086586, + "learning_rate": 6.50475357895191e-05, + "loss": 1.8215, + "step": 13705 + }, + { + "epoch": 4.206875383670964, + "grad_norm": 0.32541659474372864, + "learning_rate": 6.504279559198741e-05, + "loss": 1.7786, + "step": 13706 + }, + { + "epoch": 4.207182320441989, + "grad_norm": 0.25871729850769043, + "learning_rate": 6.5038055245794e-05, + "loss": 1.7621, + "step": 13707 + }, + { + "epoch": 4.207489257213014, + "grad_norm": 0.2190464735031128, + "learning_rate": 6.50333147509857e-05, + "loss": 1.7612, + "step": 13708 + }, + { + "epoch": 4.20779619398404, + "grad_norm": 0.19565832614898682, + "learning_rate": 6.50285741076094e-05, + "loss": 1.7581, + "step": 13709 + }, + { + "epoch": 4.208103130755064, + "grad_norm": 0.1889251321554184, + "learning_rate": 6.50238333157119e-05, + "loss": 1.7611, + "step": 13710 + }, + { + "epoch": 4.208410067526089, + "grad_norm": 0.2013053596019745, + "learning_rate": 6.501909237534008e-05, + "loss": 1.7393, + "step": 13711 + }, + { + "epoch": 4.208717004297115, + "grad_norm": 0.1899433434009552, + "learning_rate": 6.501435128654077e-05, + "loss": 1.7122, + "step": 13712 + }, + { + "epoch": 4.20902394106814, + "grad_norm": 0.19337882101535797, + "learning_rate": 
6.500961004936085e-05, + "loss": 1.7538, + "step": 13713 + }, + { + "epoch": 4.209330877839165, + "grad_norm": 0.20419920980930328, + "learning_rate": 6.500486866384718e-05, + "loss": 1.728, + "step": 13714 + }, + { + "epoch": 4.209637814610191, + "grad_norm": 0.20615679025650024, + "learning_rate": 6.50001271300466e-05, + "loss": 1.7843, + "step": 13715 + }, + { + "epoch": 4.209944751381215, + "grad_norm": 0.22178977727890015, + "learning_rate": 6.499538544800596e-05, + "loss": 1.7751, + "step": 13716 + }, + { + "epoch": 4.2102516881522405, + "grad_norm": 0.23703891038894653, + "learning_rate": 6.499064361777214e-05, + "loss": 1.7304, + "step": 13717 + }, + { + "epoch": 4.210558624923266, + "grad_norm": 0.2785723805427551, + "learning_rate": 6.498590163939198e-05, + "loss": 1.802, + "step": 13718 + }, + { + "epoch": 4.210865561694291, + "grad_norm": 0.23277060687541962, + "learning_rate": 6.498115951291237e-05, + "loss": 1.7316, + "step": 13719 + }, + { + "epoch": 4.2111724984653165, + "grad_norm": 0.22289474308490753, + "learning_rate": 6.497641723838017e-05, + "loss": 1.8469, + "step": 13720 + }, + { + "epoch": 4.211479435236341, + "grad_norm": 0.2715846002101898, + "learning_rate": 6.497167481584221e-05, + "loss": 1.7919, + "step": 13721 + }, + { + "epoch": 4.211786372007366, + "grad_norm": 0.29262226819992065, + "learning_rate": 6.49669322453454e-05, + "loss": 1.8379, + "step": 13722 + }, + { + "epoch": 4.212093308778392, + "grad_norm": 0.29136186838150024, + "learning_rate": 6.49621895269366e-05, + "loss": 1.789, + "step": 13723 + }, + { + "epoch": 4.212400245549417, + "grad_norm": 0.25110194087028503, + "learning_rate": 6.495744666066266e-05, + "loss": 1.7574, + "step": 13724 + }, + { + "epoch": 4.212707182320442, + "grad_norm": 0.2301366776227951, + "learning_rate": 6.495270364657048e-05, + "loss": 1.7637, + "step": 13725 + }, + { + "epoch": 4.213014119091467, + "grad_norm": 0.2556478977203369, + "learning_rate": 6.49479604847069e-05, + "loss": 1.7975, + 
"step": 13726 + }, + { + "epoch": 4.213321055862492, + "grad_norm": 0.2645667493343353, + "learning_rate": 6.494321717511884e-05, + "loss": 1.7594, + "step": 13727 + }, + { + "epoch": 4.213627992633517, + "grad_norm": 0.23664188385009766, + "learning_rate": 6.493847371785312e-05, + "loss": 1.7963, + "step": 13728 + }, + { + "epoch": 4.213934929404543, + "grad_norm": 0.2947930693626404, + "learning_rate": 6.493373011295665e-05, + "loss": 1.7477, + "step": 13729 + }, + { + "epoch": 4.214241866175568, + "grad_norm": 0.34598737955093384, + "learning_rate": 6.492898636047631e-05, + "loss": 1.7014, + "step": 13730 + }, + { + "epoch": 4.214548802946593, + "grad_norm": 0.24406935274600983, + "learning_rate": 6.4924242460459e-05, + "loss": 1.7436, + "step": 13731 + }, + { + "epoch": 4.214855739717618, + "grad_norm": 0.27176225185394287, + "learning_rate": 6.491949841295156e-05, + "loss": 1.8429, + "step": 13732 + }, + { + "epoch": 4.215162676488643, + "grad_norm": 0.2506968080997467, + "learning_rate": 6.491475421800089e-05, + "loss": 1.7519, + "step": 13733 + }, + { + "epoch": 4.2154696132596685, + "grad_norm": 0.2240980863571167, + "learning_rate": 6.491000987565387e-05, + "loss": 1.7595, + "step": 13734 + }, + { + "epoch": 4.215776550030694, + "grad_norm": 0.23201732337474823, + "learning_rate": 6.490526538595741e-05, + "loss": 1.7466, + "step": 13735 + }, + { + "epoch": 4.216083486801719, + "grad_norm": 0.24624750018119812, + "learning_rate": 6.490052074895836e-05, + "loss": 1.7364, + "step": 13736 + }, + { + "epoch": 4.216390423572744, + "grad_norm": 0.22936980426311493, + "learning_rate": 6.489577596470366e-05, + "loss": 1.7095, + "step": 13737 + }, + { + "epoch": 4.216697360343769, + "grad_norm": 0.2106638103723526, + "learning_rate": 6.489103103324016e-05, + "loss": 1.7387, + "step": 13738 + }, + { + "epoch": 4.217004297114794, + "grad_norm": 0.2936140298843384, + "learning_rate": 6.488628595461477e-05, + "loss": 1.9129, + "step": 13739 + }, + { + "epoch": 
4.21731123388582, + "grad_norm": 0.21871696412563324, + "learning_rate": 6.488154072887435e-05, + "loss": 1.7489, + "step": 13740 + }, + { + "epoch": 4.217618170656845, + "grad_norm": 0.25941070914268494, + "learning_rate": 6.487679535606583e-05, + "loss": 1.7788, + "step": 13741 + }, + { + "epoch": 4.21792510742787, + "grad_norm": 0.2540862560272217, + "learning_rate": 6.487204983623612e-05, + "loss": 1.8074, + "step": 13742 + }, + { + "epoch": 4.218232044198895, + "grad_norm": 0.25180327892303467, + "learning_rate": 6.486730416943207e-05, + "loss": 1.7503, + "step": 13743 + }, + { + "epoch": 4.21853898096992, + "grad_norm": 0.26625585556030273, + "learning_rate": 6.486255835570063e-05, + "loss": 1.8149, + "step": 13744 + }, + { + "epoch": 4.218845917740945, + "grad_norm": 0.3023914396762848, + "learning_rate": 6.485781239508867e-05, + "loss": 1.8599, + "step": 13745 + }, + { + "epoch": 4.219152854511971, + "grad_norm": 0.2683780789375305, + "learning_rate": 6.48530662876431e-05, + "loss": 1.7911, + "step": 13746 + }, + { + "epoch": 4.219459791282996, + "grad_norm": 0.20747442543506622, + "learning_rate": 6.484832003341081e-05, + "loss": 1.7343, + "step": 13747 + }, + { + "epoch": 4.2197667280540205, + "grad_norm": 0.29284465312957764, + "learning_rate": 6.484357363243873e-05, + "loss": 1.7917, + "step": 13748 + }, + { + "epoch": 4.220073664825046, + "grad_norm": 0.24303840100765228, + "learning_rate": 6.483882708477376e-05, + "loss": 1.7921, + "step": 13749 + }, + { + "epoch": 4.220380601596071, + "grad_norm": 0.26253026723861694, + "learning_rate": 6.48340803904628e-05, + "loss": 1.7971, + "step": 13750 + }, + { + "epoch": 4.2206875383670965, + "grad_norm": 0.23888511955738068, + "learning_rate": 6.482933354955275e-05, + "loss": 1.7967, + "step": 13751 + }, + { + "epoch": 4.220994475138122, + "grad_norm": 0.24966883659362793, + "learning_rate": 6.482458656209054e-05, + "loss": 1.7924, + "step": 13752 + }, + { + "epoch": 4.221301411909146, + "grad_norm": 
0.26556864380836487, + "learning_rate": 6.481983942812309e-05, + "loss": 1.8608, + "step": 13753 + }, + { + "epoch": 4.221608348680172, + "grad_norm": 0.29064711928367615, + "learning_rate": 6.48150921476973e-05, + "loss": 1.7785, + "step": 13754 + }, + { + "epoch": 4.221915285451197, + "grad_norm": 0.30876123905181885, + "learning_rate": 6.481034472086008e-05, + "loss": 1.8287, + "step": 13755 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.2622467875480652, + "learning_rate": 6.480559714765835e-05, + "loss": 1.8336, + "step": 13756 + }, + { + "epoch": 4.222529158993248, + "grad_norm": 0.2502644956111908, + "learning_rate": 6.480084942813902e-05, + "loss": 1.7803, + "step": 13757 + }, + { + "epoch": 4.222836095764273, + "grad_norm": 0.2879922688007355, + "learning_rate": 6.479610156234903e-05, + "loss": 1.7544, + "step": 13758 + }, + { + "epoch": 4.223143032535297, + "grad_norm": 0.2831384241580963, + "learning_rate": 6.47913535503353e-05, + "loss": 1.887, + "step": 13759 + }, + { + "epoch": 4.223449969306323, + "grad_norm": 0.3221064805984497, + "learning_rate": 6.478660539214474e-05, + "loss": 1.7455, + "step": 13760 + }, + { + "epoch": 4.223756906077348, + "grad_norm": 0.4231930673122406, + "learning_rate": 6.478185708782427e-05, + "loss": 1.8209, + "step": 13761 + }, + { + "epoch": 4.224063842848373, + "grad_norm": 0.34327802062034607, + "learning_rate": 6.477710863742083e-05, + "loss": 1.7754, + "step": 13762 + }, + { + "epoch": 4.224370779619399, + "grad_norm": 0.21713349223136902, + "learning_rate": 6.477236004098135e-05, + "loss": 1.7576, + "step": 13763 + }, + { + "epoch": 4.224677716390423, + "grad_norm": 0.3262602388858795, + "learning_rate": 6.476761129855275e-05, + "loss": 1.7772, + "step": 13764 + }, + { + "epoch": 4.2249846531614486, + "grad_norm": 0.3231413662433624, + "learning_rate": 6.476286241018195e-05, + "loss": 1.7821, + "step": 13765 + }, + { + "epoch": 4.225291589932474, + "grad_norm": 0.2440098226070404, + "learning_rate": 
6.475811337591588e-05, + "loss": 1.7684, + "step": 13766 + }, + { + "epoch": 4.225598526703499, + "grad_norm": 0.329949289560318, + "learning_rate": 6.475336419580151e-05, + "loss": 1.8564, + "step": 13767 + }, + { + "epoch": 4.225905463474525, + "grad_norm": 0.3567483425140381, + "learning_rate": 6.474861486988574e-05, + "loss": 1.7625, + "step": 13768 + }, + { + "epoch": 4.226212400245549, + "grad_norm": 0.25257283449172974, + "learning_rate": 6.47438653982155e-05, + "loss": 1.823, + "step": 13769 + }, + { + "epoch": 4.226519337016574, + "grad_norm": 0.31542617082595825, + "learning_rate": 6.473911578083776e-05, + "loss": 1.7817, + "step": 13770 + }, + { + "epoch": 4.2268262737876, + "grad_norm": 0.29670149087905884, + "learning_rate": 6.473436601779944e-05, + "loss": 1.7493, + "step": 13771 + }, + { + "epoch": 4.227133210558625, + "grad_norm": 0.2635453939437866, + "learning_rate": 6.472961610914745e-05, + "loss": 1.792, + "step": 13772 + }, + { + "epoch": 4.22744014732965, + "grad_norm": 0.25017979741096497, + "learning_rate": 6.472486605492878e-05, + "loss": 1.7183, + "step": 13773 + }, + { + "epoch": 4.227747084100676, + "grad_norm": 0.3766646087169647, + "learning_rate": 6.472011585519034e-05, + "loss": 1.8039, + "step": 13774 + }, + { + "epoch": 4.2280540208717, + "grad_norm": 0.29860204458236694, + "learning_rate": 6.47153655099791e-05, + "loss": 1.8016, + "step": 13775 + }, + { + "epoch": 4.2283609576427255, + "grad_norm": 0.2540898323059082, + "learning_rate": 6.4710615019342e-05, + "loss": 1.8481, + "step": 13776 + }, + { + "epoch": 4.228667894413751, + "grad_norm": 0.3677786886692047, + "learning_rate": 6.470586438332597e-05, + "loss": 1.7663, + "step": 13777 + }, + { + "epoch": 4.228974831184776, + "grad_norm": 0.35693466663360596, + "learning_rate": 6.470111360197797e-05, + "loss": 1.7733, + "step": 13778 + }, + { + "epoch": 4.2292817679558015, + "grad_norm": 0.23747926950454712, + "learning_rate": 6.469636267534496e-05, + "loss": 1.7938, + "step": 
13779 + }, + { + "epoch": 4.229588704726826, + "grad_norm": 0.32890695333480835, + "learning_rate": 6.469161160347386e-05, + "loss": 1.7233, + "step": 13780 + }, + { + "epoch": 4.229895641497851, + "grad_norm": 0.3437706530094147, + "learning_rate": 6.468686038641164e-05, + "loss": 1.7716, + "step": 13781 + }, + { + "epoch": 4.230202578268877, + "grad_norm": 0.23452162742614746, + "learning_rate": 6.468210902420527e-05, + "loss": 1.764, + "step": 13782 + }, + { + "epoch": 4.230509515039902, + "grad_norm": 0.3205265402793884, + "learning_rate": 6.46773575169017e-05, + "loss": 1.7464, + "step": 13783 + }, + { + "epoch": 4.230816451810927, + "grad_norm": 0.4234732985496521, + "learning_rate": 6.467260586454787e-05, + "loss": 1.7786, + "step": 13784 + }, + { + "epoch": 4.231123388581952, + "grad_norm": 0.2484128773212433, + "learning_rate": 6.466785406719076e-05, + "loss": 1.8125, + "step": 13785 + }, + { + "epoch": 4.231430325352977, + "grad_norm": 0.3696556091308594, + "learning_rate": 6.46631021248773e-05, + "loss": 1.7974, + "step": 13786 + }, + { + "epoch": 4.231737262124002, + "grad_norm": 0.4251437485218048, + "learning_rate": 6.465835003765449e-05, + "loss": 1.7486, + "step": 13787 + }, + { + "epoch": 4.232044198895028, + "grad_norm": 0.2507621943950653, + "learning_rate": 6.465359780556927e-05, + "loss": 1.829, + "step": 13788 + }, + { + "epoch": 4.232351135666053, + "grad_norm": 0.2911818325519562, + "learning_rate": 6.464884542866861e-05, + "loss": 1.7401, + "step": 13789 + }, + { + "epoch": 4.232658072437078, + "grad_norm": 0.35354506969451904, + "learning_rate": 6.464409290699946e-05, + "loss": 1.7848, + "step": 13790 + }, + { + "epoch": 4.232965009208103, + "grad_norm": 0.2659081518650055, + "learning_rate": 6.46393402406088e-05, + "loss": 1.7408, + "step": 13791 + }, + { + "epoch": 4.233271945979128, + "grad_norm": 0.22676481306552887, + "learning_rate": 6.46345874295436e-05, + "loss": 1.7542, + "step": 13792 + }, + { + "epoch": 4.2335788827501535, + 
"grad_norm": 0.2549789845943451, + "learning_rate": 6.462983447385085e-05, + "loss": 1.8095, + "step": 13793 + }, + { + "epoch": 4.233885819521179, + "grad_norm": 0.2157238870859146, + "learning_rate": 6.462508137357748e-05, + "loss": 1.7529, + "step": 13794 + }, + { + "epoch": 4.234192756292204, + "grad_norm": 0.2494724988937378, + "learning_rate": 6.46203281287705e-05, + "loss": 1.7839, + "step": 13795 + }, + { + "epoch": 4.234499693063229, + "grad_norm": 0.29560065269470215, + "learning_rate": 6.461557473947685e-05, + "loss": 1.7239, + "step": 13796 + }, + { + "epoch": 4.234806629834254, + "grad_norm": 0.23693916201591492, + "learning_rate": 6.461082120574354e-05, + "loss": 1.8074, + "step": 13797 + }, + { + "epoch": 4.235113566605279, + "grad_norm": 0.2538869082927704, + "learning_rate": 6.460606752761752e-05, + "loss": 1.8319, + "step": 13798 + }, + { + "epoch": 4.235420503376305, + "grad_norm": 0.3186401426792145, + "learning_rate": 6.460131370514578e-05, + "loss": 1.7877, + "step": 13799 + }, + { + "epoch": 4.23572744014733, + "grad_norm": 0.2473619133234024, + "learning_rate": 6.45965597383753e-05, + "loss": 1.8323, + "step": 13800 + }, + { + "epoch": 4.236034376918354, + "grad_norm": 0.32806503772735596, + "learning_rate": 6.459180562735307e-05, + "loss": 1.744, + "step": 13801 + }, + { + "epoch": 4.23634131368938, + "grad_norm": 0.3975784480571747, + "learning_rate": 6.458705137212606e-05, + "loss": 1.7216, + "step": 13802 + }, + { + "epoch": 4.236648250460405, + "grad_norm": 0.2946135997772217, + "learning_rate": 6.458229697274125e-05, + "loss": 1.8781, + "step": 13803 + }, + { + "epoch": 4.23695518723143, + "grad_norm": 0.25109192728996277, + "learning_rate": 6.457754242924565e-05, + "loss": 1.7458, + "step": 13804 + }, + { + "epoch": 4.237262124002456, + "grad_norm": 0.2763883173465729, + "learning_rate": 6.457278774168623e-05, + "loss": 1.7612, + "step": 13805 + }, + { + "epoch": 4.237569060773481, + "grad_norm": 0.22427856922149658, + 
"learning_rate": 6.456803291010996e-05, + "loss": 1.8049, + "step": 13806 + }, + { + "epoch": 4.2378759975445055, + "grad_norm": 0.28295788168907166, + "learning_rate": 6.456327793456387e-05, + "loss": 1.7608, + "step": 13807 + }, + { + "epoch": 4.238182934315531, + "grad_norm": 0.27857527136802673, + "learning_rate": 6.455852281509493e-05, + "loss": 1.7281, + "step": 13808 + }, + { + "epoch": 4.238489871086556, + "grad_norm": 0.24014849960803986, + "learning_rate": 6.455376755175012e-05, + "loss": 1.7247, + "step": 13809 + }, + { + "epoch": 4.2387968078575815, + "grad_norm": 0.25149038434028625, + "learning_rate": 6.454901214457646e-05, + "loss": 1.8575, + "step": 13810 + }, + { + "epoch": 4.239103744628607, + "grad_norm": 0.32072681188583374, + "learning_rate": 6.454425659362093e-05, + "loss": 1.7421, + "step": 13811 + }, + { + "epoch": 4.239410681399631, + "grad_norm": 0.28418242931365967, + "learning_rate": 6.453950089893054e-05, + "loss": 1.7031, + "step": 13812 + }, + { + "epoch": 4.239717618170657, + "grad_norm": 0.23725132644176483, + "learning_rate": 6.453474506055228e-05, + "loss": 1.7901, + "step": 13813 + }, + { + "epoch": 4.240024554941682, + "grad_norm": 0.3056317865848541, + "learning_rate": 6.452998907853315e-05, + "loss": 1.7414, + "step": 13814 + }, + { + "epoch": 4.240331491712707, + "grad_norm": 0.3111891448497772, + "learning_rate": 6.452523295292013e-05, + "loss": 1.7532, + "step": 13815 + }, + { + "epoch": 4.240638428483733, + "grad_norm": 0.2126779705286026, + "learning_rate": 6.452047668376027e-05, + "loss": 1.6779, + "step": 13816 + }, + { + "epoch": 4.240945365254758, + "grad_norm": 0.26660779118537903, + "learning_rate": 6.451572027110054e-05, + "loss": 1.7162, + "step": 13817 + }, + { + "epoch": 4.241252302025782, + "grad_norm": 0.25901922583580017, + "learning_rate": 6.451096371498794e-05, + "loss": 1.7784, + "step": 13818 + }, + { + "epoch": 4.241559238796808, + "grad_norm": 0.24091807007789612, + "learning_rate": 
6.450620701546953e-05, + "loss": 1.7928, + "step": 13819 + }, + { + "epoch": 4.241866175567833, + "grad_norm": 0.25097009539604187, + "learning_rate": 6.450145017259225e-05, + "loss": 1.761, + "step": 13820 + }, + { + "epoch": 4.242173112338858, + "grad_norm": 0.22978942096233368, + "learning_rate": 6.449669318640315e-05, + "loss": 1.7891, + "step": 13821 + }, + { + "epoch": 4.242480049109884, + "grad_norm": 0.27255937457084656, + "learning_rate": 6.449193605694923e-05, + "loss": 1.7964, + "step": 13822 + }, + { + "epoch": 4.242786985880908, + "grad_norm": 0.2210773378610611, + "learning_rate": 6.44871787842775e-05, + "loss": 1.7628, + "step": 13823 + }, + { + "epoch": 4.2430939226519335, + "grad_norm": 0.25784751772880554, + "learning_rate": 6.448242136843497e-05, + "loss": 1.7596, + "step": 13824 + }, + { + "epoch": 4.243400859422959, + "grad_norm": 0.23475486040115356, + "learning_rate": 6.447766380946868e-05, + "loss": 1.8174, + "step": 13825 + }, + { + "epoch": 4.243707796193984, + "grad_norm": 0.2567705512046814, + "learning_rate": 6.447290610742561e-05, + "loss": 1.737, + "step": 13826 + }, + { + "epoch": 4.2440147329650095, + "grad_norm": 0.23973144590854645, + "learning_rate": 6.446814826235281e-05, + "loss": 1.7881, + "step": 13827 + }, + { + "epoch": 4.244321669736034, + "grad_norm": 0.25584739446640015, + "learning_rate": 6.446339027429729e-05, + "loss": 1.7673, + "step": 13828 + }, + { + "epoch": 4.244628606507059, + "grad_norm": 0.2653748393058777, + "learning_rate": 6.445863214330608e-05, + "loss": 1.7443, + "step": 13829 + }, + { + "epoch": 4.244935543278085, + "grad_norm": 0.2492038607597351, + "learning_rate": 6.445387386942619e-05, + "loss": 1.7223, + "step": 13830 + }, + { + "epoch": 4.24524248004911, + "grad_norm": 0.2282228320837021, + "learning_rate": 6.444911545270464e-05, + "loss": 1.7577, + "step": 13831 + }, + { + "epoch": 4.245549416820135, + "grad_norm": 0.2411092072725296, + "learning_rate": 6.444435689318845e-05, + "loss": 1.7324, + 
"step": 13832 + }, + { + "epoch": 4.245856353591161, + "grad_norm": 0.21557089686393738, + "learning_rate": 6.443959819092468e-05, + "loss": 1.7355, + "step": 13833 + }, + { + "epoch": 4.246163290362185, + "grad_norm": 0.2500394880771637, + "learning_rate": 6.443483934596033e-05, + "loss": 1.775, + "step": 13834 + }, + { + "epoch": 4.24647022713321, + "grad_norm": 0.24135248363018036, + "learning_rate": 6.443008035834244e-05, + "loss": 1.7885, + "step": 13835 + }, + { + "epoch": 4.246777163904236, + "grad_norm": 0.22860904037952423, + "learning_rate": 6.442532122811803e-05, + "loss": 1.7891, + "step": 13836 + }, + { + "epoch": 4.247084100675261, + "grad_norm": 0.2277665138244629, + "learning_rate": 6.442056195533415e-05, + "loss": 1.7583, + "step": 13837 + }, + { + "epoch": 4.247391037446286, + "grad_norm": 0.22822454571723938, + "learning_rate": 6.441580254003782e-05, + "loss": 1.7777, + "step": 13838 + }, + { + "epoch": 4.247697974217311, + "grad_norm": 0.24274896085262299, + "learning_rate": 6.441104298227608e-05, + "loss": 1.7537, + "step": 13839 + }, + { + "epoch": 4.248004910988336, + "grad_norm": 0.25080999732017517, + "learning_rate": 6.440628328209598e-05, + "loss": 1.7537, + "step": 13840 + }, + { + "epoch": 4.2483118477593615, + "grad_norm": 0.22409579157829285, + "learning_rate": 6.440152343954453e-05, + "loss": 1.7652, + "step": 13841 + }, + { + "epoch": 4.248618784530387, + "grad_norm": 0.24028798937797546, + "learning_rate": 6.439676345466877e-05, + "loss": 1.7512, + "step": 13842 + }, + { + "epoch": 4.248925721301412, + "grad_norm": 0.28739503026008606, + "learning_rate": 6.439200332751576e-05, + "loss": 1.8034, + "step": 13843 + }, + { + "epoch": 4.249232658072437, + "grad_norm": 0.2244807928800583, + "learning_rate": 6.438724305813255e-05, + "loss": 1.7243, + "step": 13844 + }, + { + "epoch": 4.249539594843462, + "grad_norm": 0.24478118121623993, + "learning_rate": 6.438248264656618e-05, + "loss": 1.7754, + "step": 13845 + }, + { + "epoch": 
4.249846531614487, + "grad_norm": 0.25554370880126953, + "learning_rate": 6.437772209286368e-05, + "loss": 1.7845, + "step": 13846 + }, + { + "epoch": 4.250153468385513, + "grad_norm": 0.24478472769260406, + "learning_rate": 6.43729613970721e-05, + "loss": 1.7954, + "step": 13847 + }, + { + "epoch": 4.250460405156538, + "grad_norm": 0.22287282347679138, + "learning_rate": 6.436820055923849e-05, + "loss": 1.7379, + "step": 13848 + }, + { + "epoch": 4.250767341927563, + "grad_norm": 0.2810569703578949, + "learning_rate": 6.43634395794099e-05, + "loss": 1.8492, + "step": 13849 + }, + { + "epoch": 4.251074278698588, + "grad_norm": 0.2544163465499878, + "learning_rate": 6.435867845763337e-05, + "loss": 1.7846, + "step": 13850 + }, + { + "epoch": 4.251381215469613, + "grad_norm": 0.27879175543785095, + "learning_rate": 6.435391719395598e-05, + "loss": 1.767, + "step": 13851 + }, + { + "epoch": 4.2516881522406385, + "grad_norm": 0.2876715362071991, + "learning_rate": 6.434915578842477e-05, + "loss": 1.8048, + "step": 13852 + }, + { + "epoch": 4.251995089011664, + "grad_norm": 0.27844297885894775, + "learning_rate": 6.434439424108678e-05, + "loss": 1.7472, + "step": 13853 + }, + { + "epoch": 4.252302025782689, + "grad_norm": 0.2417020946741104, + "learning_rate": 6.43396325519891e-05, + "loss": 1.8481, + "step": 13854 + }, + { + "epoch": 4.252608962553714, + "grad_norm": 0.23828522861003876, + "learning_rate": 6.433487072117874e-05, + "loss": 1.7536, + "step": 13855 + }, + { + "epoch": 4.252915899324739, + "grad_norm": 0.22304333746433258, + "learning_rate": 6.43301087487028e-05, + "loss": 1.741, + "step": 13856 + }, + { + "epoch": 4.253222836095764, + "grad_norm": 0.27089163661003113, + "learning_rate": 6.432534663460832e-05, + "loss": 1.7974, + "step": 13857 + }, + { + "epoch": 4.25352977286679, + "grad_norm": 0.2439592182636261, + "learning_rate": 6.432058437894237e-05, + "loss": 1.7713, + "step": 13858 + }, + { + "epoch": 4.253836709637815, + "grad_norm": 
0.2368553727865219, + "learning_rate": 6.431582198175203e-05, + "loss": 1.6915, + "step": 13859 + }, + { + "epoch": 4.25414364640884, + "grad_norm": 0.25248441100120544, + "learning_rate": 6.431105944308431e-05, + "loss": 1.7286, + "step": 13860 + }, + { + "epoch": 4.254450583179865, + "grad_norm": 0.20928484201431274, + "learning_rate": 6.430629676298634e-05, + "loss": 1.79, + "step": 13861 + }, + { + "epoch": 4.25475751995089, + "grad_norm": 0.25262540578842163, + "learning_rate": 6.430153394150514e-05, + "loss": 1.7443, + "step": 13862 + }, + { + "epoch": 4.255064456721915, + "grad_norm": 0.27508237957954407, + "learning_rate": 6.429677097868783e-05, + "loss": 1.8207, + "step": 13863 + }, + { + "epoch": 4.255371393492941, + "grad_norm": 0.28129303455352783, + "learning_rate": 6.429200787458141e-05, + "loss": 1.7589, + "step": 13864 + }, + { + "epoch": 4.255678330263966, + "grad_norm": 0.3205658495426178, + "learning_rate": 6.428724462923302e-05, + "loss": 1.8037, + "step": 13865 + }, + { + "epoch": 4.2559852670349905, + "grad_norm": 0.24048078060150146, + "learning_rate": 6.428248124268969e-05, + "loss": 1.7303, + "step": 13866 + }, + { + "epoch": 4.256292203806016, + "grad_norm": 0.24742475152015686, + "learning_rate": 6.427771771499852e-05, + "loss": 1.7753, + "step": 13867 + }, + { + "epoch": 4.256599140577041, + "grad_norm": 0.3082354962825775, + "learning_rate": 6.427295404620656e-05, + "loss": 1.7275, + "step": 13868 + }, + { + "epoch": 4.2569060773480665, + "grad_norm": 0.23319822549819946, + "learning_rate": 6.426819023636093e-05, + "loss": 1.7562, + "step": 13869 + }, + { + "epoch": 4.257213014119092, + "grad_norm": 0.2611405551433563, + "learning_rate": 6.426342628550866e-05, + "loss": 1.7417, + "step": 13870 + }, + { + "epoch": 4.257519950890116, + "grad_norm": 0.2577543258666992, + "learning_rate": 6.425866219369686e-05, + "loss": 1.6906, + "step": 13871 + }, + { + "epoch": 4.257826887661142, + "grad_norm": 0.31353357434272766, + "learning_rate": 
6.42538979609726e-05, + "loss": 1.7155, + "step": 13872 + }, + { + "epoch": 4.258133824432167, + "grad_norm": 0.23280073702335358, + "learning_rate": 6.424913358738296e-05, + "loss": 1.7576, + "step": 13873 + }, + { + "epoch": 4.258440761203192, + "grad_norm": 0.24087542295455933, + "learning_rate": 6.424436907297504e-05, + "loss": 1.7622, + "step": 13874 + }, + { + "epoch": 4.258747697974218, + "grad_norm": 0.3146509826183319, + "learning_rate": 6.42396044177959e-05, + "loss": 1.769, + "step": 13875 + }, + { + "epoch": 4.259054634745242, + "grad_norm": 0.2645811438560486, + "learning_rate": 6.423483962189268e-05, + "loss": 1.7713, + "step": 13876 + }, + { + "epoch": 4.259361571516267, + "grad_norm": 0.2166455090045929, + "learning_rate": 6.423007468531238e-05, + "loss": 1.7705, + "step": 13877 + }, + { + "epoch": 4.259668508287293, + "grad_norm": 0.29142528772354126, + "learning_rate": 6.422530960810217e-05, + "loss": 1.7725, + "step": 13878 + }, + { + "epoch": 4.259975445058318, + "grad_norm": 0.28777652978897095, + "learning_rate": 6.422054439030911e-05, + "loss": 1.7853, + "step": 13879 + }, + { + "epoch": 4.260282381829343, + "grad_norm": 0.2285117357969284, + "learning_rate": 6.42157790319803e-05, + "loss": 1.7034, + "step": 13880 + }, + { + "epoch": 4.260589318600369, + "grad_norm": 0.32407644391059875, + "learning_rate": 6.421101353316282e-05, + "loss": 1.7858, + "step": 13881 + }, + { + "epoch": 4.260896255371393, + "grad_norm": 0.4803469777107239, + "learning_rate": 6.420624789390378e-05, + "loss": 1.7337, + "step": 13882 + }, + { + "epoch": 4.2612031921424185, + "grad_norm": 0.4245823919773102, + "learning_rate": 6.420148211425027e-05, + "loss": 1.8024, + "step": 13883 + }, + { + "epoch": 4.261510128913444, + "grad_norm": 0.22298674285411835, + "learning_rate": 6.419671619424938e-05, + "loss": 1.7129, + "step": 13884 + }, + { + "epoch": 4.261817065684469, + "grad_norm": 0.46955862641334534, + "learning_rate": 6.419195013394824e-05, + "loss": 1.7151, + 
"step": 13885 + }, + { + "epoch": 4.2621240024554945, + "grad_norm": 0.4809224009513855, + "learning_rate": 6.418718393339392e-05, + "loss": 1.7697, + "step": 13886 + }, + { + "epoch": 4.262430939226519, + "grad_norm": 0.2741130292415619, + "learning_rate": 6.418241759263353e-05, + "loss": 1.8133, + "step": 13887 + }, + { + "epoch": 4.262737875997544, + "grad_norm": 0.3673117756843567, + "learning_rate": 6.417765111171419e-05, + "loss": 1.7424, + "step": 13888 + }, + { + "epoch": 4.26304481276857, + "grad_norm": 0.4609327018260956, + "learning_rate": 6.417288449068299e-05, + "loss": 1.741, + "step": 13889 + }, + { + "epoch": 4.263351749539595, + "grad_norm": 0.2929460406303406, + "learning_rate": 6.416811772958702e-05, + "loss": 1.8385, + "step": 13890 + }, + { + "epoch": 4.26365868631062, + "grad_norm": 0.2727305293083191, + "learning_rate": 6.416335082847342e-05, + "loss": 1.794, + "step": 13891 + }, + { + "epoch": 4.263965623081646, + "grad_norm": 0.26089411973953247, + "learning_rate": 6.41585837873893e-05, + "loss": 1.7907, + "step": 13892 + }, + { + "epoch": 4.26427255985267, + "grad_norm": 0.24655573070049286, + "learning_rate": 6.415381660638174e-05, + "loss": 1.7481, + "step": 13893 + }, + { + "epoch": 4.264579496623695, + "grad_norm": 0.4186919629573822, + "learning_rate": 6.414904928549787e-05, + "loss": 1.8021, + "step": 13894 + }, + { + "epoch": 4.264886433394721, + "grad_norm": 0.38188236951828003, + "learning_rate": 6.414428182478478e-05, + "loss": 1.75, + "step": 13895 + }, + { + "epoch": 4.265193370165746, + "grad_norm": 0.23686440289020538, + "learning_rate": 6.413951422428963e-05, + "loss": 1.7882, + "step": 13896 + }, + { + "epoch": 4.265500306936771, + "grad_norm": 0.35963737964630127, + "learning_rate": 6.413474648405952e-05, + "loss": 1.7427, + "step": 13897 + }, + { + "epoch": 4.265807243707796, + "grad_norm": 0.38558289408683777, + "learning_rate": 6.412997860414155e-05, + "loss": 1.7622, + "step": 13898 + }, + { + "epoch": 
4.266114180478821, + "grad_norm": 0.2311459481716156, + "learning_rate": 6.412521058458285e-05, + "loss": 1.7894, + "step": 13899 + }, + { + "epoch": 4.2664211172498465, + "grad_norm": 0.2647818624973297, + "learning_rate": 6.412044242543054e-05, + "loss": 1.7399, + "step": 13900 + }, + { + "epoch": 4.266728054020872, + "grad_norm": 0.3174133002758026, + "learning_rate": 6.411567412673174e-05, + "loss": 1.7552, + "step": 13901 + }, + { + "epoch": 4.267034990791897, + "grad_norm": 0.25207316875457764, + "learning_rate": 6.411090568853358e-05, + "loss": 1.7876, + "step": 13902 + }, + { + "epoch": 4.267341927562922, + "grad_norm": 0.24549202620983124, + "learning_rate": 6.410613711088317e-05, + "loss": 1.8554, + "step": 13903 + }, + { + "epoch": 4.267648864333947, + "grad_norm": 0.26293641328811646, + "learning_rate": 6.410136839382765e-05, + "loss": 1.8553, + "step": 13904 + }, + { + "epoch": 4.267955801104972, + "grad_norm": 0.20258362591266632, + "learning_rate": 6.409659953741416e-05, + "loss": 1.7205, + "step": 13905 + }, + { + "epoch": 4.268262737875998, + "grad_norm": 0.24885907769203186, + "learning_rate": 6.409183054168979e-05, + "loss": 1.7718, + "step": 13906 + }, + { + "epoch": 4.268569674647023, + "grad_norm": 0.22737209498882294, + "learning_rate": 6.408706140670169e-05, + "loss": 1.7228, + "step": 13907 + }, + { + "epoch": 4.268876611418047, + "grad_norm": 0.2201235145330429, + "learning_rate": 6.4082292132497e-05, + "loss": 1.7451, + "step": 13908 + }, + { + "epoch": 4.269183548189073, + "grad_norm": 0.24108454585075378, + "learning_rate": 6.407752271912285e-05, + "loss": 1.7531, + "step": 13909 + }, + { + "epoch": 4.269490484960098, + "grad_norm": 0.21723641455173492, + "learning_rate": 6.407275316662636e-05, + "loss": 1.7139, + "step": 13910 + }, + { + "epoch": 4.269797421731123, + "grad_norm": 0.22557848691940308, + "learning_rate": 6.406798347505469e-05, + "loss": 1.7633, + "step": 13911 + }, + { + "epoch": 4.270104358502149, + "grad_norm": 
0.24664700031280518, + "learning_rate": 6.406321364445494e-05, + "loss": 1.7854, + "step": 13912 + }, + { + "epoch": 4.270411295273174, + "grad_norm": 0.2599056661128998, + "learning_rate": 6.405844367487428e-05, + "loss": 1.7662, + "step": 13913 + }, + { + "epoch": 4.2707182320441985, + "grad_norm": 0.2378663718700409, + "learning_rate": 6.405367356635982e-05, + "loss": 1.7477, + "step": 13914 + }, + { + "epoch": 4.271025168815224, + "grad_norm": 0.27158626914024353, + "learning_rate": 6.404890331895876e-05, + "loss": 1.7426, + "step": 13915 + }, + { + "epoch": 4.271332105586249, + "grad_norm": 0.28585317730903625, + "learning_rate": 6.404413293271818e-05, + "loss": 1.7492, + "step": 13916 + }, + { + "epoch": 4.2716390423572745, + "grad_norm": 0.2321750968694687, + "learning_rate": 6.403936240768526e-05, + "loss": 1.8594, + "step": 13917 + }, + { + "epoch": 4.2719459791283, + "grad_norm": 0.25824111700057983, + "learning_rate": 6.40345917439071e-05, + "loss": 1.7622, + "step": 13918 + }, + { + "epoch": 4.272252915899324, + "grad_norm": 0.24641194939613342, + "learning_rate": 6.40298209414309e-05, + "loss": 1.7519, + "step": 13919 + }, + { + "epoch": 4.27255985267035, + "grad_norm": 0.2132398933172226, + "learning_rate": 6.40250500003038e-05, + "loss": 1.7339, + "step": 13920 + }, + { + "epoch": 4.272866789441375, + "grad_norm": 0.22630736231803894, + "learning_rate": 6.402027892057292e-05, + "loss": 1.7396, + "step": 13921 + }, + { + "epoch": 4.2731737262124, + "grad_norm": 0.295163631439209, + "learning_rate": 6.401550770228543e-05, + "loss": 1.8063, + "step": 13922 + }, + { + "epoch": 4.273480662983426, + "grad_norm": 0.2722746729850769, + "learning_rate": 6.401073634548848e-05, + "loss": 1.7775, + "step": 13923 + }, + { + "epoch": 4.273787599754451, + "grad_norm": 0.23201976716518402, + "learning_rate": 6.400596485022922e-05, + "loss": 1.7755, + "step": 13924 + }, + { + "epoch": 4.274094536525475, + "grad_norm": 0.23880761861801147, + "learning_rate": 
6.40011932165548e-05, + "loss": 1.778, + "step": 13925 + }, + { + "epoch": 4.274401473296501, + "grad_norm": 0.22305625677108765, + "learning_rate": 6.399642144451239e-05, + "loss": 1.761, + "step": 13926 + }, + { + "epoch": 4.274708410067526, + "grad_norm": 0.21874886751174927, + "learning_rate": 6.399164953414914e-05, + "loss": 1.7148, + "step": 13927 + }, + { + "epoch": 4.2750153468385514, + "grad_norm": 0.2003604918718338, + "learning_rate": 6.398687748551221e-05, + "loss": 1.8049, + "step": 13928 + }, + { + "epoch": 4.275322283609577, + "grad_norm": 0.2443511188030243, + "learning_rate": 6.398210529864875e-05, + "loss": 1.782, + "step": 13929 + }, + { + "epoch": 4.275629220380601, + "grad_norm": 0.2297198623418808, + "learning_rate": 6.397733297360594e-05, + "loss": 1.7682, + "step": 13930 + }, + { + "epoch": 4.275936157151627, + "grad_norm": 0.23474562168121338, + "learning_rate": 6.39725605104309e-05, + "loss": 1.7809, + "step": 13931 + }, + { + "epoch": 4.276243093922652, + "grad_norm": 0.25908544659614563, + "learning_rate": 6.396778790917087e-05, + "loss": 1.7343, + "step": 13932 + }, + { + "epoch": 4.276550030693677, + "grad_norm": 0.2440379112958908, + "learning_rate": 6.396301516987295e-05, + "loss": 1.786, + "step": 13933 + }, + { + "epoch": 4.276856967464703, + "grad_norm": 0.26185858249664307, + "learning_rate": 6.395824229258435e-05, + "loss": 1.7863, + "step": 13934 + }, + { + "epoch": 4.277163904235728, + "grad_norm": 0.24470919370651245, + "learning_rate": 6.39534692773522e-05, + "loss": 1.7774, + "step": 13935 + }, + { + "epoch": 4.277470841006752, + "grad_norm": 0.2612632215023041, + "learning_rate": 6.39486961242237e-05, + "loss": 1.7536, + "step": 13936 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.26870301365852356, + "learning_rate": 6.3943922833246e-05, + "loss": 1.8177, + "step": 13937 + }, + { + "epoch": 4.278084714548803, + "grad_norm": 0.24445784091949463, + "learning_rate": 6.393914940446628e-05, + "loss": 1.7539, + "step": 
13938 + }, + { + "epoch": 4.278391651319828, + "grad_norm": 0.2622319757938385, + "learning_rate": 6.393437583793174e-05, + "loss": 1.8252, + "step": 13939 + }, + { + "epoch": 4.278698588090854, + "grad_norm": 0.2586652636528015, + "learning_rate": 6.39296021336895e-05, + "loss": 1.7975, + "step": 13940 + }, + { + "epoch": 4.279005524861878, + "grad_norm": 0.19488228857517242, + "learning_rate": 6.392482829178678e-05, + "loss": 1.7678, + "step": 13941 + }, + { + "epoch": 4.2793124616329035, + "grad_norm": 0.23956604301929474, + "learning_rate": 6.392005431227074e-05, + "loss": 1.7444, + "step": 13942 + }, + { + "epoch": 4.279619398403929, + "grad_norm": 0.24195842444896698, + "learning_rate": 6.391528019518857e-05, + "loss": 1.8116, + "step": 13943 + }, + { + "epoch": 4.279926335174954, + "grad_norm": 0.21479523181915283, + "learning_rate": 6.391050594058746e-05, + "loss": 1.7351, + "step": 13944 + }, + { + "epoch": 4.2802332719459795, + "grad_norm": 0.2309941202402115, + "learning_rate": 6.390573154851456e-05, + "loss": 1.8245, + "step": 13945 + }, + { + "epoch": 4.280540208717004, + "grad_norm": 0.2375536412000656, + "learning_rate": 6.390095701901706e-05, + "loss": 1.7921, + "step": 13946 + }, + { + "epoch": 4.280847145488029, + "grad_norm": 0.25518664717674255, + "learning_rate": 6.389618235214216e-05, + "loss": 1.7549, + "step": 13947 + }, + { + "epoch": 4.281154082259055, + "grad_norm": 0.2579016089439392, + "learning_rate": 6.389140754793705e-05, + "loss": 1.7637, + "step": 13948 + }, + { + "epoch": 4.28146101903008, + "grad_norm": 0.25350916385650635, + "learning_rate": 6.388663260644892e-05, + "loss": 1.746, + "step": 13949 + }, + { + "epoch": 4.281767955801105, + "grad_norm": 0.2994026839733124, + "learning_rate": 6.388185752772493e-05, + "loss": 1.8196, + "step": 13950 + }, + { + "epoch": 4.28207489257213, + "grad_norm": 0.29938533902168274, + "learning_rate": 6.387708231181229e-05, + "loss": 1.7187, + "step": 13951 + }, + { + "epoch": 4.282381829343155, 
+ "grad_norm": 0.23865137994289398, + "learning_rate": 6.387230695875819e-05, + "loss": 1.7317, + "step": 13952 + }, + { + "epoch": 4.28268876611418, + "grad_norm": 0.23812857270240784, + "learning_rate": 6.386753146860982e-05, + "loss": 1.7536, + "step": 13953 + }, + { + "epoch": 4.282995702885206, + "grad_norm": 0.3395650088787079, + "learning_rate": 6.386275584141438e-05, + "loss": 1.7932, + "step": 13954 + }, + { + "epoch": 4.283302639656231, + "grad_norm": 0.38207507133483887, + "learning_rate": 6.385798007721906e-05, + "loss": 1.8196, + "step": 13955 + }, + { + "epoch": 4.283609576427256, + "grad_norm": 0.32960978150367737, + "learning_rate": 6.385320417607107e-05, + "loss": 1.7898, + "step": 13956 + }, + { + "epoch": 4.283916513198281, + "grad_norm": 0.22978928685188293, + "learning_rate": 6.384842813801757e-05, + "loss": 1.7835, + "step": 13957 + }, + { + "epoch": 4.284223449969306, + "grad_norm": 0.24607588350772858, + "learning_rate": 6.38436519631058e-05, + "loss": 1.7829, + "step": 13958 + }, + { + "epoch": 4.2845303867403315, + "grad_norm": 0.2770270109176636, + "learning_rate": 6.383887565138295e-05, + "loss": 1.7294, + "step": 13959 + }, + { + "epoch": 4.284837323511357, + "grad_norm": 0.27644863724708557, + "learning_rate": 6.383409920289622e-05, + "loss": 1.829, + "step": 13960 + }, + { + "epoch": 4.285144260282382, + "grad_norm": 0.3870919942855835, + "learning_rate": 6.382932261769282e-05, + "loss": 1.8146, + "step": 13961 + }, + { + "epoch": 4.285451197053407, + "grad_norm": 0.3562348186969757, + "learning_rate": 6.382454589581994e-05, + "loss": 1.8225, + "step": 13962 + }, + { + "epoch": 4.285758133824432, + "grad_norm": 0.28444886207580566, + "learning_rate": 6.38197690373248e-05, + "loss": 1.7734, + "step": 13963 + }, + { + "epoch": 4.286065070595457, + "grad_norm": 0.27935758233070374, + "learning_rate": 6.381499204225459e-05, + "loss": 1.7402, + "step": 13964 + }, + { + "epoch": 4.286372007366483, + "grad_norm": 0.34188997745513916, + 
"learning_rate": 6.381021491065653e-05, + "loss": 1.7661, + "step": 13965 + }, + { + "epoch": 4.286678944137508, + "grad_norm": 0.28648918867111206, + "learning_rate": 6.380543764257785e-05, + "loss": 1.8312, + "step": 13966 + }, + { + "epoch": 4.286985880908533, + "grad_norm": 0.2733290493488312, + "learning_rate": 6.380066023806572e-05, + "loss": 1.7505, + "step": 13967 + }, + { + "epoch": 4.287292817679558, + "grad_norm": 0.3344273865222931, + "learning_rate": 6.37958826971674e-05, + "loss": 1.8392, + "step": 13968 + }, + { + "epoch": 4.287599754450583, + "grad_norm": 0.2655799090862274, + "learning_rate": 6.379110501993006e-05, + "loss": 1.7575, + "step": 13969 + }, + { + "epoch": 4.287906691221608, + "grad_norm": 0.2569151818752289, + "learning_rate": 6.378632720640095e-05, + "loss": 1.6619, + "step": 13970 + }, + { + "epoch": 4.288213627992634, + "grad_norm": 0.2477198988199234, + "learning_rate": 6.378154925662727e-05, + "loss": 1.7532, + "step": 13971 + }, + { + "epoch": 4.288520564763659, + "grad_norm": 0.2867630422115326, + "learning_rate": 6.377677117065624e-05, + "loss": 1.7725, + "step": 13972 + }, + { + "epoch": 4.2888275015346835, + "grad_norm": 0.28316137194633484, + "learning_rate": 6.37719929485351e-05, + "loss": 1.7628, + "step": 13973 + }, + { + "epoch": 4.289134438305709, + "grad_norm": 0.2934304475784302, + "learning_rate": 6.376721459031106e-05, + "loss": 1.7346, + "step": 13974 + }, + { + "epoch": 4.289441375076734, + "grad_norm": 0.22847147285938263, + "learning_rate": 6.376243609603129e-05, + "loss": 1.7409, + "step": 13975 + }, + { + "epoch": 4.2897483118477595, + "grad_norm": 0.360441118478775, + "learning_rate": 6.375765746574311e-05, + "loss": 1.808, + "step": 13976 + }, + { + "epoch": 4.290055248618785, + "grad_norm": 0.2750907242298126, + "learning_rate": 6.375287869949367e-05, + "loss": 1.8046, + "step": 13977 + }, + { + "epoch": 4.290362185389809, + "grad_norm": 0.26193201541900635, + "learning_rate": 6.374809979733022e-05, + 
"loss": 1.7097, + "step": 13978 + }, + { + "epoch": 4.290669122160835, + "grad_norm": 0.3282175064086914, + "learning_rate": 6.37433207593e-05, + "loss": 1.7924, + "step": 13979 + }, + { + "epoch": 4.29097605893186, + "grad_norm": 0.2845167815685272, + "learning_rate": 6.373854158545021e-05, + "loss": 1.7663, + "step": 13980 + }, + { + "epoch": 4.291282995702885, + "grad_norm": 0.21816621720790863, + "learning_rate": 6.37337622758281e-05, + "loss": 1.7368, + "step": 13981 + }, + { + "epoch": 4.291589932473911, + "grad_norm": 0.264272540807724, + "learning_rate": 6.372898283048094e-05, + "loss": 1.7377, + "step": 13982 + }, + { + "epoch": 4.291896869244935, + "grad_norm": 0.2182006686925888, + "learning_rate": 6.37242032494559e-05, + "loss": 1.8107, + "step": 13983 + }, + { + "epoch": 4.29220380601596, + "grad_norm": 0.26856422424316406, + "learning_rate": 6.371942353280023e-05, + "loss": 1.7708, + "step": 13984 + }, + { + "epoch": 4.292510742786986, + "grad_norm": 0.3025323748588562, + "learning_rate": 6.37146436805612e-05, + "loss": 1.7768, + "step": 13985 + }, + { + "epoch": 4.292817679558011, + "grad_norm": 0.2949144244194031, + "learning_rate": 6.3709863692786e-05, + "loss": 1.7848, + "step": 13986 + }, + { + "epoch": 4.293124616329036, + "grad_norm": 0.20670418441295624, + "learning_rate": 6.370508356952188e-05, + "loss": 1.7367, + "step": 13987 + }, + { + "epoch": 4.293431553100062, + "grad_norm": 0.2453860342502594, + "learning_rate": 6.370030331081611e-05, + "loss": 1.7246, + "step": 13988 + }, + { + "epoch": 4.293738489871086, + "grad_norm": 0.3413507044315338, + "learning_rate": 6.369552291671592e-05, + "loss": 1.7829, + "step": 13989 + }, + { + "epoch": 4.2940454266421115, + "grad_norm": 0.28352782130241394, + "learning_rate": 6.369074238726856e-05, + "loss": 1.7755, + "step": 13990 + }, + { + "epoch": 4.294352363413137, + "grad_norm": 0.21408751606941223, + "learning_rate": 6.368596172252124e-05, + "loss": 1.7292, + "step": 13991 + }, + { + "epoch": 
4.294659300184162, + "grad_norm": 0.28372085094451904, + "learning_rate": 6.36811809225212e-05, + "loss": 1.8197, + "step": 13992 + }, + { + "epoch": 4.2949662369551875, + "grad_norm": 0.2400829792022705, + "learning_rate": 6.367639998731573e-05, + "loss": 1.7559, + "step": 13993 + }, + { + "epoch": 4.295273173726212, + "grad_norm": 0.22853593528270721, + "learning_rate": 6.367161891695207e-05, + "loss": 1.8116, + "step": 13994 + }, + { + "epoch": 4.295580110497237, + "grad_norm": 0.22098208963871002, + "learning_rate": 6.366683771147745e-05, + "loss": 1.7269, + "step": 13995 + }, + { + "epoch": 4.295887047268263, + "grad_norm": 0.22293934226036072, + "learning_rate": 6.366205637093914e-05, + "loss": 1.7944, + "step": 13996 + }, + { + "epoch": 4.296193984039288, + "grad_norm": 0.26120004057884216, + "learning_rate": 6.365727489538437e-05, + "loss": 1.7581, + "step": 13997 + }, + { + "epoch": 4.296500920810313, + "grad_norm": 0.2568937838077545, + "learning_rate": 6.365249328486041e-05, + "loss": 1.7356, + "step": 13998 + }, + { + "epoch": 4.296807857581339, + "grad_norm": 0.2419043630361557, + "learning_rate": 6.364771153941449e-05, + "loss": 1.8127, + "step": 13999 + }, + { + "epoch": 4.297114794352363, + "grad_norm": 0.2521972060203552, + "learning_rate": 6.364292965909391e-05, + "loss": 1.7445, + "step": 14000 + }, + { + "epoch": 4.297421731123388, + "grad_norm": 0.3269292414188385, + "learning_rate": 6.363814764394589e-05, + "loss": 1.7835, + "step": 14001 + }, + { + "epoch": 4.297728667894414, + "grad_norm": 0.258405864238739, + "learning_rate": 6.36333654940177e-05, + "loss": 1.7407, + "step": 14002 + }, + { + "epoch": 4.298035604665439, + "grad_norm": 0.21527236700057983, + "learning_rate": 6.362858320935662e-05, + "loss": 1.7729, + "step": 14003 + }, + { + "epoch": 4.298342541436464, + "grad_norm": 0.25343602895736694, + "learning_rate": 6.362380079000988e-05, + "loss": 1.8087, + "step": 14004 + }, + { + "epoch": 4.298649478207489, + "grad_norm": 
0.26110637187957764, + "learning_rate": 6.361901823602474e-05, + "loss": 1.813, + "step": 14005 + }, + { + "epoch": 4.298956414978514, + "grad_norm": 0.26749926805496216, + "learning_rate": 6.361423554744851e-05, + "loss": 1.8193, + "step": 14006 + }, + { + "epoch": 4.2992633517495396, + "grad_norm": 0.22357676923274994, + "learning_rate": 6.360945272432841e-05, + "loss": 1.7498, + "step": 14007 + }, + { + "epoch": 4.299570288520565, + "grad_norm": 0.2367832362651825, + "learning_rate": 6.360466976671172e-05, + "loss": 1.7843, + "step": 14008 + }, + { + "epoch": 4.29987722529159, + "grad_norm": 0.23594366014003754, + "learning_rate": 6.35998866746457e-05, + "loss": 1.7442, + "step": 14009 + }, + { + "epoch": 4.300184162062616, + "grad_norm": 0.2660543918609619, + "learning_rate": 6.359510344817765e-05, + "loss": 1.7557, + "step": 14010 + }, + { + "epoch": 4.30049109883364, + "grad_norm": 0.191593199968338, + "learning_rate": 6.359032008735481e-05, + "loss": 1.7988, + "step": 14011 + }, + { + "epoch": 4.300798035604665, + "grad_norm": 0.2755490243434906, + "learning_rate": 6.358553659222447e-05, + "loss": 1.7551, + "step": 14012 + }, + { + "epoch": 4.301104972375691, + "grad_norm": 0.2900530993938446, + "learning_rate": 6.358075296283387e-05, + "loss": 1.7523, + "step": 14013 + }, + { + "epoch": 4.301411909146716, + "grad_norm": 0.22242774069309235, + "learning_rate": 6.357596919923033e-05, + "loss": 1.7626, + "step": 14014 + }, + { + "epoch": 4.301718845917741, + "grad_norm": 0.26636210083961487, + "learning_rate": 6.357118530146108e-05, + "loss": 1.7855, + "step": 14015 + }, + { + "epoch": 4.302025782688766, + "grad_norm": 0.3055269718170166, + "learning_rate": 6.356640126957344e-05, + "loss": 1.7528, + "step": 14016 + }, + { + "epoch": 4.302332719459791, + "grad_norm": 0.29695719480514526, + "learning_rate": 6.356161710361468e-05, + "loss": 1.7482, + "step": 14017 + }, + { + "epoch": 4.3026396562308165, + "grad_norm": 0.2369711697101593, + "learning_rate": 
6.355683280363207e-05, + "loss": 1.7635, + "step": 14018 + }, + { + "epoch": 4.302946593001842, + "grad_norm": 0.26681363582611084, + "learning_rate": 6.35520483696729e-05, + "loss": 1.8814, + "step": 14019 + }, + { + "epoch": 4.303253529772867, + "grad_norm": 0.2623308598995209, + "learning_rate": 6.354726380178442e-05, + "loss": 1.8645, + "step": 14020 + }, + { + "epoch": 4.303560466543892, + "grad_norm": 0.23326413333415985, + "learning_rate": 6.354247910001394e-05, + "loss": 1.8093, + "step": 14021 + }, + { + "epoch": 4.303867403314917, + "grad_norm": 0.3037295639514923, + "learning_rate": 6.353769426440875e-05, + "loss": 1.8556, + "step": 14022 + }, + { + "epoch": 4.304174340085942, + "grad_norm": 0.23624882102012634, + "learning_rate": 6.353290929501616e-05, + "loss": 1.803, + "step": 14023 + }, + { + "epoch": 4.304481276856968, + "grad_norm": 0.22106927633285522, + "learning_rate": 6.35281241918834e-05, + "loss": 1.7133, + "step": 14024 + }, + { + "epoch": 4.304788213627993, + "grad_norm": 0.2374040186405182, + "learning_rate": 6.352333895505778e-05, + "loss": 1.8127, + "step": 14025 + }, + { + "epoch": 4.305095150399017, + "grad_norm": 0.2782450318336487, + "learning_rate": 6.35185535845866e-05, + "loss": 1.8613, + "step": 14026 + }, + { + "epoch": 4.305402087170043, + "grad_norm": 0.2527763843536377, + "learning_rate": 6.351376808051717e-05, + "loss": 1.7533, + "step": 14027 + }, + { + "epoch": 4.305709023941068, + "grad_norm": 0.2462318390607834, + "learning_rate": 6.350898244289675e-05, + "loss": 1.8075, + "step": 14028 + }, + { + "epoch": 4.306015960712093, + "grad_norm": 0.2646189332008362, + "learning_rate": 6.350419667177265e-05, + "loss": 1.8261, + "step": 14029 + }, + { + "epoch": 4.306322897483119, + "grad_norm": 0.24918611347675323, + "learning_rate": 6.349941076719218e-05, + "loss": 1.7542, + "step": 14030 + }, + { + "epoch": 4.306629834254144, + "grad_norm": 0.22440841794013977, + "learning_rate": 6.349462472920259e-05, + "loss": 1.7897, + 
"step": 14031 + }, + { + "epoch": 4.3069367710251685, + "grad_norm": 0.28614330291748047, + "learning_rate": 6.348983855785121e-05, + "loss": 1.88, + "step": 14032 + }, + { + "epoch": 4.307243707796194, + "grad_norm": 0.25015848875045776, + "learning_rate": 6.348505225318535e-05, + "loss": 1.8008, + "step": 14033 + }, + { + "epoch": 4.307550644567219, + "grad_norm": 0.2468707263469696, + "learning_rate": 6.34802658152523e-05, + "loss": 1.8025, + "step": 14034 + }, + { + "epoch": 4.3078575813382445, + "grad_norm": 0.30504748225212097, + "learning_rate": 6.347547924409937e-05, + "loss": 1.8765, + "step": 14035 + }, + { + "epoch": 4.30816451810927, + "grad_norm": 0.35419392585754395, + "learning_rate": 6.347069253977385e-05, + "loss": 1.7807, + "step": 14036 + }, + { + "epoch": 4.308471454880294, + "grad_norm": 0.33683931827545166, + "learning_rate": 6.346590570232305e-05, + "loss": 1.7244, + "step": 14037 + }, + { + "epoch": 4.30877839165132, + "grad_norm": 0.3339467942714691, + "learning_rate": 6.346111873179427e-05, + "loss": 1.7642, + "step": 14038 + }, + { + "epoch": 4.309085328422345, + "grad_norm": 0.2369392216205597, + "learning_rate": 6.345633162823484e-05, + "loss": 1.7127, + "step": 14039 + }, + { + "epoch": 4.30939226519337, + "grad_norm": 0.26469686627388, + "learning_rate": 6.345154439169206e-05, + "loss": 1.7235, + "step": 14040 + }, + { + "epoch": 4.309699201964396, + "grad_norm": 0.2737344205379486, + "learning_rate": 6.344675702221321e-05, + "loss": 1.783, + "step": 14041 + }, + { + "epoch": 4.310006138735421, + "grad_norm": 0.2381773442029953, + "learning_rate": 6.344196951984565e-05, + "loss": 1.7172, + "step": 14042 + }, + { + "epoch": 4.310313075506445, + "grad_norm": 0.28199076652526855, + "learning_rate": 6.343718188463663e-05, + "loss": 1.8315, + "step": 14043 + }, + { + "epoch": 4.310620012277471, + "grad_norm": 0.24378590285778046, + "learning_rate": 6.343239411663353e-05, + "loss": 1.7828, + "step": 14044 + }, + { + "epoch": 
4.310926949048496, + "grad_norm": 0.26343944668769836, + "learning_rate": 6.342760621588365e-05, + "loss": 1.7679, + "step": 14045 + }, + { + "epoch": 4.311233885819521, + "grad_norm": 0.23703521490097046, + "learning_rate": 6.342281818243427e-05, + "loss": 1.7885, + "step": 14046 + }, + { + "epoch": 4.311540822590547, + "grad_norm": 0.2230173498392105, + "learning_rate": 6.341803001633276e-05, + "loss": 1.767, + "step": 14047 + }, + { + "epoch": 4.311847759361571, + "grad_norm": 0.249002143740654, + "learning_rate": 6.34132417176264e-05, + "loss": 1.8032, + "step": 14048 + }, + { + "epoch": 4.3121546961325965, + "grad_norm": 0.2383791208267212, + "learning_rate": 6.34084532863625e-05, + "loss": 1.7558, + "step": 14049 + }, + { + "epoch": 4.312461632903622, + "grad_norm": 0.2783047556877136, + "learning_rate": 6.340366472258843e-05, + "loss": 1.8389, + "step": 14050 + }, + { + "epoch": 4.312768569674647, + "grad_norm": 0.2654891312122345, + "learning_rate": 6.339887602635148e-05, + "loss": 1.7989, + "step": 14051 + }, + { + "epoch": 4.3130755064456725, + "grad_norm": 0.2638411521911621, + "learning_rate": 6.3394087197699e-05, + "loss": 1.8707, + "step": 14052 + }, + { + "epoch": 4.313382443216697, + "grad_norm": 0.3026179075241089, + "learning_rate": 6.338929823667829e-05, + "loss": 1.7892, + "step": 14053 + }, + { + "epoch": 4.313689379987722, + "grad_norm": 0.27496880292892456, + "learning_rate": 6.338450914333668e-05, + "loss": 1.7398, + "step": 14054 + }, + { + "epoch": 4.313996316758748, + "grad_norm": 0.2601073086261749, + "learning_rate": 6.337971991772151e-05, + "loss": 1.7646, + "step": 14055 + }, + { + "epoch": 4.314303253529773, + "grad_norm": 0.2061719298362732, + "learning_rate": 6.337493055988011e-05, + "loss": 1.7372, + "step": 14056 + }, + { + "epoch": 4.314610190300798, + "grad_norm": 0.23722340166568756, + "learning_rate": 6.337014106985981e-05, + "loss": 1.7457, + "step": 14057 + }, + { + "epoch": 4.314917127071823, + "grad_norm": 
0.2729428708553314, + "learning_rate": 6.336535144770793e-05, + "loss": 1.8423, + "step": 14058 + }, + { + "epoch": 4.315224063842848, + "grad_norm": 0.23520450294017792, + "learning_rate": 6.336056169347182e-05, + "loss": 1.8124, + "step": 14059 + }, + { + "epoch": 4.315531000613873, + "grad_norm": 0.25142738223075867, + "learning_rate": 6.33557718071988e-05, + "loss": 1.7285, + "step": 14060 + }, + { + "epoch": 4.315837937384899, + "grad_norm": 0.24833035469055176, + "learning_rate": 6.335098178893621e-05, + "loss": 1.766, + "step": 14061 + }, + { + "epoch": 4.316144874155924, + "grad_norm": 0.2406177669763565, + "learning_rate": 6.334619163873141e-05, + "loss": 1.8824, + "step": 14062 + }, + { + "epoch": 4.316451810926949, + "grad_norm": 0.23077574372291565, + "learning_rate": 6.334140135663172e-05, + "loss": 1.7589, + "step": 14063 + }, + { + "epoch": 4.316758747697974, + "grad_norm": 0.20476560294628143, + "learning_rate": 6.333661094268448e-05, + "loss": 1.7331, + "step": 14064 + }, + { + "epoch": 4.317065684468999, + "grad_norm": 0.207991823554039, + "learning_rate": 6.333182039693704e-05, + "loss": 1.6876, + "step": 14065 + }, + { + "epoch": 4.3173726212400245, + "grad_norm": 0.20813052356243134, + "learning_rate": 6.332702971943671e-05, + "loss": 1.775, + "step": 14066 + }, + { + "epoch": 4.31767955801105, + "grad_norm": 0.2470991462469101, + "learning_rate": 6.332223891023087e-05, + "loss": 1.7673, + "step": 14067 + }, + { + "epoch": 4.317986494782075, + "grad_norm": 0.23855723440647125, + "learning_rate": 6.331744796936687e-05, + "loss": 1.7842, + "step": 14068 + }, + { + "epoch": 4.3182934315531, + "grad_norm": 0.21852652728557587, + "learning_rate": 6.331265689689204e-05, + "loss": 1.7727, + "step": 14069 + }, + { + "epoch": 4.318600368324125, + "grad_norm": 0.284496545791626, + "learning_rate": 6.330786569285374e-05, + "loss": 1.8248, + "step": 14070 + }, + { + "epoch": 4.31890730509515, + "grad_norm": 0.21709981560707092, + "learning_rate": 
6.33030743572993e-05, + "loss": 1.7547, + "step": 14071 + }, + { + "epoch": 4.319214241866176, + "grad_norm": 0.24209457635879517, + "learning_rate": 6.329828289027608e-05, + "loss": 1.7695, + "step": 14072 + }, + { + "epoch": 4.319521178637201, + "grad_norm": 0.24869373440742493, + "learning_rate": 6.329349129183144e-05, + "loss": 1.8204, + "step": 14073 + }, + { + "epoch": 4.319828115408226, + "grad_norm": 0.21702703833580017, + "learning_rate": 6.328869956201274e-05, + "loss": 1.779, + "step": 14074 + }, + { + "epoch": 4.320135052179251, + "grad_norm": 0.22993850708007812, + "learning_rate": 6.328390770086731e-05, + "loss": 1.7935, + "step": 14075 + }, + { + "epoch": 4.320441988950276, + "grad_norm": 0.23491734266281128, + "learning_rate": 6.327911570844252e-05, + "loss": 1.7261, + "step": 14076 + }, + { + "epoch": 4.320748925721301, + "grad_norm": 0.2479303777217865, + "learning_rate": 6.327432358478571e-05, + "loss": 1.7683, + "step": 14077 + }, + { + "epoch": 4.321055862492327, + "grad_norm": 0.24261580407619476, + "learning_rate": 6.326953132994427e-05, + "loss": 1.7147, + "step": 14078 + }, + { + "epoch": 4.321362799263352, + "grad_norm": 0.24627646803855896, + "learning_rate": 6.326473894396553e-05, + "loss": 1.7976, + "step": 14079 + }, + { + "epoch": 4.3216697360343765, + "grad_norm": 0.269149512052536, + "learning_rate": 6.325994642689688e-05, + "loss": 1.7247, + "step": 14080 + }, + { + "epoch": 4.321976672805402, + "grad_norm": 0.4162158966064453, + "learning_rate": 6.325515377878566e-05, + "loss": 1.7485, + "step": 14081 + }, + { + "epoch": 4.322283609576427, + "grad_norm": 0.366459459066391, + "learning_rate": 6.325036099967925e-05, + "loss": 1.7286, + "step": 14082 + }, + { + "epoch": 4.3225905463474525, + "grad_norm": 0.2465270757675171, + "learning_rate": 6.324556808962499e-05, + "loss": 1.8097, + "step": 14083 + }, + { + "epoch": 4.322897483118478, + "grad_norm": 0.2911076843738556, + "learning_rate": 6.324077504867026e-05, + "loss": 1.7979, + 
"step": 14084 + }, + { + "epoch": 4.323204419889503, + "grad_norm": 0.33455169200897217, + "learning_rate": 6.323598187686245e-05, + "loss": 1.7988, + "step": 14085 + }, + { + "epoch": 4.323511356660528, + "grad_norm": 0.25020337104797363, + "learning_rate": 6.32311885742489e-05, + "loss": 1.7184, + "step": 14086 + }, + { + "epoch": 4.323818293431553, + "grad_norm": 0.23941513895988464, + "learning_rate": 6.322639514087699e-05, + "loss": 1.7672, + "step": 14087 + }, + { + "epoch": 4.324125230202578, + "grad_norm": 0.35258981585502625, + "learning_rate": 6.32216015767941e-05, + "loss": 1.7571, + "step": 14088 + }, + { + "epoch": 4.324432166973604, + "grad_norm": 0.2854993939399719, + "learning_rate": 6.321680788204758e-05, + "loss": 1.8096, + "step": 14089 + }, + { + "epoch": 4.324739103744629, + "grad_norm": 0.24422863125801086, + "learning_rate": 6.321201405668482e-05, + "loss": 1.778, + "step": 14090 + }, + { + "epoch": 4.3250460405156534, + "grad_norm": 0.36629122495651245, + "learning_rate": 6.320722010075321e-05, + "loss": 1.716, + "step": 14091 + }, + { + "epoch": 4.325352977286679, + "grad_norm": 0.37115517258644104, + "learning_rate": 6.32024260143001e-05, + "loss": 1.77, + "step": 14092 + }, + { + "epoch": 4.325659914057704, + "grad_norm": 0.21540327370166779, + "learning_rate": 6.319763179737288e-05, + "loss": 1.7529, + "step": 14093 + }, + { + "epoch": 4.3259668508287294, + "grad_norm": 0.2573898732662201, + "learning_rate": 6.319283745001892e-05, + "loss": 1.8101, + "step": 14094 + }, + { + "epoch": 4.326273787599755, + "grad_norm": 0.29481247067451477, + "learning_rate": 6.31880429722856e-05, + "loss": 1.7459, + "step": 14095 + }, + { + "epoch": 4.326580724370779, + "grad_norm": 0.23474647104740143, + "learning_rate": 6.318324836422031e-05, + "loss": 1.786, + "step": 14096 + }, + { + "epoch": 4.326887661141805, + "grad_norm": 0.2884673476219177, + "learning_rate": 6.317845362587045e-05, + "loss": 1.8123, + "step": 14097 + }, + { + "epoch": 
4.32719459791283, + "grad_norm": 0.39008447527885437, + "learning_rate": 6.317365875728338e-05, + "loss": 1.7729, + "step": 14098 + }, + { + "epoch": 4.327501534683855, + "grad_norm": 0.30568063259124756, + "learning_rate": 6.316886375850651e-05, + "loss": 1.7088, + "step": 14099 + }, + { + "epoch": 4.327808471454881, + "grad_norm": 0.2538018524646759, + "learning_rate": 6.316406862958718e-05, + "loss": 1.8028, + "step": 14100 + }, + { + "epoch": 4.328115408225905, + "grad_norm": 0.3815068006515503, + "learning_rate": 6.315927337057281e-05, + "loss": 1.7143, + "step": 14101 + }, + { + "epoch": 4.32842234499693, + "grad_norm": 0.3813243508338928, + "learning_rate": 6.31544779815108e-05, + "loss": 1.7072, + "step": 14102 + }, + { + "epoch": 4.328729281767956, + "grad_norm": 0.22438868880271912, + "learning_rate": 6.314968246244852e-05, + "loss": 1.7445, + "step": 14103 + }, + { + "epoch": 4.329036218538981, + "grad_norm": 0.3818886876106262, + "learning_rate": 6.314488681343337e-05, + "loss": 1.8292, + "step": 14104 + }, + { + "epoch": 4.329343155310006, + "grad_norm": 0.4376567006111145, + "learning_rate": 6.314009103451277e-05, + "loss": 1.8224, + "step": 14105 + }, + { + "epoch": 4.329650092081032, + "grad_norm": 0.2741515636444092, + "learning_rate": 6.313529512573406e-05, + "loss": 1.8078, + "step": 14106 + }, + { + "epoch": 4.329957028852056, + "grad_norm": 0.264343798160553, + "learning_rate": 6.313049908714467e-05, + "loss": 1.7314, + "step": 14107 + }, + { + "epoch": 4.3302639656230815, + "grad_norm": 0.3601943552494049, + "learning_rate": 6.312570291879201e-05, + "loss": 1.7351, + "step": 14108 + }, + { + "epoch": 4.330570902394107, + "grad_norm": 0.2931751012802124, + "learning_rate": 6.312090662072345e-05, + "loss": 1.8117, + "step": 14109 + }, + { + "epoch": 4.330877839165132, + "grad_norm": 0.27670225501060486, + "learning_rate": 6.31161101929864e-05, + "loss": 1.7707, + "step": 14110 + }, + { + "epoch": 4.3311847759361575, + "grad_norm": 
0.33669596910476685, + "learning_rate": 6.311131363562825e-05, + "loss": 1.7337, + "step": 14111 + }, + { + "epoch": 4.331491712707182, + "grad_norm": 0.232634037733078, + "learning_rate": 6.310651694869643e-05, + "loss": 1.7372, + "step": 14112 + }, + { + "epoch": 4.331798649478207, + "grad_norm": 0.28611311316490173, + "learning_rate": 6.310172013223832e-05, + "loss": 1.6977, + "step": 14113 + }, + { + "epoch": 4.332105586249233, + "grad_norm": 0.30207201838493347, + "learning_rate": 6.309692318630132e-05, + "loss": 1.7765, + "step": 14114 + }, + { + "epoch": 4.332412523020258, + "grad_norm": 0.20757484436035156, + "learning_rate": 6.309212611093287e-05, + "loss": 1.697, + "step": 14115 + }, + { + "epoch": 4.332719459791283, + "grad_norm": 0.31472963094711304, + "learning_rate": 6.308732890618034e-05, + "loss": 1.7757, + "step": 14116 + }, + { + "epoch": 4.333026396562309, + "grad_norm": 0.37042325735092163, + "learning_rate": 6.308253157209117e-05, + "loss": 1.7745, + "step": 14117 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.25001442432403564, + "learning_rate": 6.307773410871275e-05, + "loss": 1.7461, + "step": 14118 + }, + { + "epoch": 4.333640270104358, + "grad_norm": 0.2691943347454071, + "learning_rate": 6.307293651609248e-05, + "loss": 1.7539, + "step": 14119 + }, + { + "epoch": 4.333947206875384, + "grad_norm": 0.30845868587493896, + "learning_rate": 6.306813879427782e-05, + "loss": 1.7559, + "step": 14120 + }, + { + "epoch": 4.334254143646409, + "grad_norm": 0.2244730293750763, + "learning_rate": 6.306334094331613e-05, + "loss": 1.7609, + "step": 14121 + }, + { + "epoch": 4.334561080417434, + "grad_norm": 0.32132062315940857, + "learning_rate": 6.305854296325485e-05, + "loss": 1.7837, + "step": 14122 + }, + { + "epoch": 4.334868017188459, + "grad_norm": 0.3762948513031006, + "learning_rate": 6.30537448541414e-05, + "loss": 1.7631, + "step": 14123 + }, + { + "epoch": 4.335174953959484, + "grad_norm": 0.24174273014068604, + "learning_rate": 
6.30489466160232e-05, + "loss": 1.7532, + "step": 14124 + }, + { + "epoch": 4.3354818907305095, + "grad_norm": 0.23468497395515442, + "learning_rate": 6.304414824894765e-05, + "loss": 1.7731, + "step": 14125 + }, + { + "epoch": 4.335788827501535, + "grad_norm": 0.29086077213287354, + "learning_rate": 6.303934975296218e-05, + "loss": 1.7668, + "step": 14126 + }, + { + "epoch": 4.33609576427256, + "grad_norm": 0.2889879643917084, + "learning_rate": 6.303455112811422e-05, + "loss": 1.8188, + "step": 14127 + }, + { + "epoch": 4.336402701043585, + "grad_norm": 0.2335619181394577, + "learning_rate": 6.302975237445119e-05, + "loss": 1.7944, + "step": 14128 + }, + { + "epoch": 4.33670963781461, + "grad_norm": 0.29027310013771057, + "learning_rate": 6.302495349202051e-05, + "loss": 1.7771, + "step": 14129 + }, + { + "epoch": 4.337016574585635, + "grad_norm": 0.31961241364479065, + "learning_rate": 6.302015448086959e-05, + "loss": 1.8187, + "step": 14130 + }, + { + "epoch": 4.337323511356661, + "grad_norm": 0.26015788316726685, + "learning_rate": 6.301535534104587e-05, + "loss": 1.7819, + "step": 14131 + }, + { + "epoch": 4.337630448127686, + "grad_norm": 0.2440631091594696, + "learning_rate": 6.30105560725968e-05, + "loss": 1.7127, + "step": 14132 + }, + { + "epoch": 4.337937384898711, + "grad_norm": 0.304441899061203, + "learning_rate": 6.300575667556979e-05, + "loss": 1.7619, + "step": 14133 + }, + { + "epoch": 4.338244321669736, + "grad_norm": 0.3085228204727173, + "learning_rate": 6.300095715001226e-05, + "loss": 1.8287, + "step": 14134 + }, + { + "epoch": 4.338551258440761, + "grad_norm": 0.2863372564315796, + "learning_rate": 6.299615749597165e-05, + "loss": 1.8068, + "step": 14135 + }, + { + "epoch": 4.338858195211786, + "grad_norm": 0.25255265831947327, + "learning_rate": 6.299135771349537e-05, + "loss": 1.7506, + "step": 14136 + }, + { + "epoch": 4.339165131982812, + "grad_norm": 0.30224961042404175, + "learning_rate": 6.298655780263092e-05, + "loss": 1.7292, + 
"step": 14137 + }, + { + "epoch": 4.339472068753837, + "grad_norm": 0.24222104251384735, + "learning_rate": 6.298175776342567e-05, + "loss": 1.7616, + "step": 14138 + }, + { + "epoch": 4.3397790055248615, + "grad_norm": 0.3236368000507355, + "learning_rate": 6.29769575959271e-05, + "loss": 1.787, + "step": 14139 + }, + { + "epoch": 4.340085942295887, + "grad_norm": 0.26049408316612244, + "learning_rate": 6.297215730018261e-05, + "loss": 1.7108, + "step": 14140 + }, + { + "epoch": 4.340392879066912, + "grad_norm": 0.22833532094955444, + "learning_rate": 6.296735687623967e-05, + "loss": 1.7661, + "step": 14141 + }, + { + "epoch": 4.3406998158379375, + "grad_norm": 0.28397905826568604, + "learning_rate": 6.296255632414571e-05, + "loss": 1.7163, + "step": 14142 + }, + { + "epoch": 4.341006752608963, + "grad_norm": 0.3072611093521118, + "learning_rate": 6.295775564394817e-05, + "loss": 1.857, + "step": 14143 + }, + { + "epoch": 4.341313689379987, + "grad_norm": 0.22901058197021484, + "learning_rate": 6.295295483569448e-05, + "loss": 1.7325, + "step": 14144 + }, + { + "epoch": 4.341620626151013, + "grad_norm": 0.27433091402053833, + "learning_rate": 6.294815389943212e-05, + "loss": 1.8229, + "step": 14145 + }, + { + "epoch": 4.341927562922038, + "grad_norm": 0.2635616958141327, + "learning_rate": 6.29433528352085e-05, + "loss": 1.7585, + "step": 14146 + }, + { + "epoch": 4.342234499693063, + "grad_norm": 0.29129260778427124, + "learning_rate": 6.293855164307108e-05, + "loss": 1.8294, + "step": 14147 + }, + { + "epoch": 4.342541436464089, + "grad_norm": 0.3429001569747925, + "learning_rate": 6.293375032306731e-05, + "loss": 1.7725, + "step": 14148 + }, + { + "epoch": 4.342848373235114, + "grad_norm": 0.22407259047031403, + "learning_rate": 6.292894887524464e-05, + "loss": 1.7018, + "step": 14149 + }, + { + "epoch": 4.343155310006138, + "grad_norm": 0.3319321274757385, + "learning_rate": 6.292414729965053e-05, + "loss": 1.8472, + "step": 14150 + }, + { + "epoch": 
4.343462246777164, + "grad_norm": 0.42744341492652893, + "learning_rate": 6.291934559633241e-05, + "loss": 1.8118, + "step": 14151 + }, + { + "epoch": 4.343769183548189, + "grad_norm": 0.24572840332984924, + "learning_rate": 6.291454376533774e-05, + "loss": 1.7184, + "step": 14152 + }, + { + "epoch": 4.344076120319214, + "grad_norm": 0.2485980987548828, + "learning_rate": 6.290974180671397e-05, + "loss": 1.7649, + "step": 14153 + }, + { + "epoch": 4.34438305709024, + "grad_norm": 0.3911706209182739, + "learning_rate": 6.29049397205086e-05, + "loss": 1.8105, + "step": 14154 + }, + { + "epoch": 4.344689993861264, + "grad_norm": 0.3008342981338501, + "learning_rate": 6.290013750676902e-05, + "loss": 1.7671, + "step": 14155 + }, + { + "epoch": 4.3449969306322895, + "grad_norm": 0.2072051614522934, + "learning_rate": 6.289533516554274e-05, + "loss": 1.7406, + "step": 14156 + }, + { + "epoch": 4.345303867403315, + "grad_norm": 0.3047312796115875, + "learning_rate": 6.289053269687719e-05, + "loss": 1.8133, + "step": 14157 + }, + { + "epoch": 4.34561080417434, + "grad_norm": 0.28260552883148193, + "learning_rate": 6.288573010081984e-05, + "loss": 1.7253, + "step": 14158 + }, + { + "epoch": 4.3459177409453655, + "grad_norm": 0.2474137246608734, + "learning_rate": 6.288092737741815e-05, + "loss": 1.822, + "step": 14159 + }, + { + "epoch": 4.346224677716391, + "grad_norm": 0.23717878758907318, + "learning_rate": 6.287612452671961e-05, + "loss": 1.7826, + "step": 14160 + }, + { + "epoch": 4.346531614487415, + "grad_norm": 0.2646107077598572, + "learning_rate": 6.287132154877163e-05, + "loss": 1.8118, + "step": 14161 + }, + { + "epoch": 4.346838551258441, + "grad_norm": 0.22026480734348297, + "learning_rate": 6.286651844362172e-05, + "loss": 1.7767, + "step": 14162 + }, + { + "epoch": 4.347145488029466, + "grad_norm": 0.2692350447177887, + "learning_rate": 6.286171521131733e-05, + "loss": 1.8718, + "step": 14163 + }, + { + "epoch": 4.347452424800491, + "grad_norm": 
0.2749998867511749, + "learning_rate": 6.285691185190592e-05, + "loss": 1.7689, + "step": 14164 + }, + { + "epoch": 4.347759361571517, + "grad_norm": 0.24552448093891144, + "learning_rate": 6.2852108365435e-05, + "loss": 1.8049, + "step": 14165 + }, + { + "epoch": 4.348066298342541, + "grad_norm": 0.20530807971954346, + "learning_rate": 6.2847304751952e-05, + "loss": 1.7606, + "step": 14166 + }, + { + "epoch": 4.348373235113566, + "grad_norm": 0.23396088182926178, + "learning_rate": 6.28425010115044e-05, + "loss": 1.7482, + "step": 14167 + }, + { + "epoch": 4.348680171884592, + "grad_norm": 0.20512452721595764, + "learning_rate": 6.283769714413968e-05, + "loss": 1.6976, + "step": 14168 + }, + { + "epoch": 4.348987108655617, + "grad_norm": 0.20287172496318817, + "learning_rate": 6.283289314990531e-05, + "loss": 1.7439, + "step": 14169 + }, + { + "epoch": 4.349294045426642, + "grad_norm": 0.2193746268749237, + "learning_rate": 6.282808902884876e-05, + "loss": 1.763, + "step": 14170 + }, + { + "epoch": 4.349600982197667, + "grad_norm": 0.20415273308753967, + "learning_rate": 6.282328478101753e-05, + "loss": 1.7025, + "step": 14171 + }, + { + "epoch": 4.349907918968692, + "grad_norm": 0.19286803901195526, + "learning_rate": 6.281848040645907e-05, + "loss": 1.7529, + "step": 14172 + }, + { + "epoch": 4.350214855739718, + "grad_norm": 0.20908218622207642, + "learning_rate": 6.281367590522088e-05, + "loss": 1.7896, + "step": 14173 + }, + { + "epoch": 4.350521792510743, + "grad_norm": 0.2599989175796509, + "learning_rate": 6.280887127735045e-05, + "loss": 1.764, + "step": 14174 + }, + { + "epoch": 4.350828729281768, + "grad_norm": 0.23955710232257843, + "learning_rate": 6.280406652289523e-05, + "loss": 1.7321, + "step": 14175 + }, + { + "epoch": 4.351135666052793, + "grad_norm": 0.2311990112066269, + "learning_rate": 6.279926164190272e-05, + "loss": 1.7338, + "step": 14176 + }, + { + "epoch": 4.351442602823818, + "grad_norm": 0.2599658966064453, + "learning_rate": 
6.27944566344204e-05, + "loss": 1.7444, + "step": 14177 + }, + { + "epoch": 4.351749539594843, + "grad_norm": 0.23079386353492737, + "learning_rate": 6.278965150049579e-05, + "loss": 1.7011, + "step": 14178 + }, + { + "epoch": 4.352056476365869, + "grad_norm": 0.24844171106815338, + "learning_rate": 6.278484624017631e-05, + "loss": 1.7298, + "step": 14179 + }, + { + "epoch": 4.352363413136894, + "grad_norm": 0.24839860200881958, + "learning_rate": 6.27800408535095e-05, + "loss": 1.7717, + "step": 14180 + }, + { + "epoch": 4.352670349907919, + "grad_norm": 0.2652966380119324, + "learning_rate": 6.277523534054284e-05, + "loss": 1.7759, + "step": 14181 + }, + { + "epoch": 4.352977286678944, + "grad_norm": 0.2787603735923767, + "learning_rate": 6.277042970132381e-05, + "loss": 1.8981, + "step": 14182 + }, + { + "epoch": 4.353284223449969, + "grad_norm": 0.2535475194454193, + "learning_rate": 6.276562393589991e-05, + "loss": 1.7538, + "step": 14183 + }, + { + "epoch": 4.3535911602209945, + "grad_norm": 0.3210967183113098, + "learning_rate": 6.276081804431863e-05, + "loss": 1.7087, + "step": 14184 + }, + { + "epoch": 4.35389809699202, + "grad_norm": 0.29936519265174866, + "learning_rate": 6.275601202662749e-05, + "loss": 1.7647, + "step": 14185 + }, + { + "epoch": 4.354205033763045, + "grad_norm": 0.21980762481689453, + "learning_rate": 6.275120588287394e-05, + "loss": 1.7759, + "step": 14186 + }, + { + "epoch": 4.35451197053407, + "grad_norm": 0.26833051443099976, + "learning_rate": 6.274639961310549e-05, + "loss": 1.7648, + "step": 14187 + }, + { + "epoch": 4.354818907305095, + "grad_norm": 0.27998095750808716, + "learning_rate": 6.274159321736966e-05, + "loss": 1.746, + "step": 14188 + }, + { + "epoch": 4.35512584407612, + "grad_norm": 0.21354494988918304, + "learning_rate": 6.273678669571395e-05, + "loss": 1.7417, + "step": 14189 + }, + { + "epoch": 4.355432780847146, + "grad_norm": 0.2295297235250473, + "learning_rate": 6.273198004818583e-05, + "loss": 1.7805, + 
"step": 14190 + }, + { + "epoch": 4.355739717618171, + "grad_norm": 0.2416422963142395, + "learning_rate": 6.272717327483283e-05, + "loss": 1.73, + "step": 14191 + }, + { + "epoch": 4.356046654389196, + "grad_norm": 0.2685304880142212, + "learning_rate": 6.272236637570244e-05, + "loss": 1.7936, + "step": 14192 + }, + { + "epoch": 4.356353591160221, + "grad_norm": 0.32481294870376587, + "learning_rate": 6.271755935084218e-05, + "loss": 1.7192, + "step": 14193 + }, + { + "epoch": 4.356660527931246, + "grad_norm": 0.2428581267595291, + "learning_rate": 6.271275220029954e-05, + "loss": 1.7428, + "step": 14194 + }, + { + "epoch": 4.356967464702271, + "grad_norm": 0.2266654521226883, + "learning_rate": 6.270794492412203e-05, + "loss": 1.7266, + "step": 14195 + }, + { + "epoch": 4.357274401473297, + "grad_norm": 0.25062093138694763, + "learning_rate": 6.270313752235716e-05, + "loss": 1.7476, + "step": 14196 + }, + { + "epoch": 4.357581338244322, + "grad_norm": 0.24085770547389984, + "learning_rate": 6.269832999505244e-05, + "loss": 1.7981, + "step": 14197 + }, + { + "epoch": 4.3578882750153465, + "grad_norm": 0.27035796642303467, + "learning_rate": 6.269352234225536e-05, + "loss": 1.8867, + "step": 14198 + }, + { + "epoch": 4.358195211786372, + "grad_norm": 0.22464458644390106, + "learning_rate": 6.268871456401348e-05, + "loss": 1.7514, + "step": 14199 + }, + { + "epoch": 4.358502148557397, + "grad_norm": 0.22485734522342682, + "learning_rate": 6.268390666037427e-05, + "loss": 1.7558, + "step": 14200 + }, + { + "epoch": 4.3588090853284225, + "grad_norm": 0.2052135169506073, + "learning_rate": 6.267909863138527e-05, + "loss": 1.7453, + "step": 14201 + }, + { + "epoch": 4.359116022099448, + "grad_norm": 0.2130763679742813, + "learning_rate": 6.267429047709397e-05, + "loss": 1.7712, + "step": 14202 + }, + { + "epoch": 4.359422958870473, + "grad_norm": 0.23146997392177582, + "learning_rate": 6.266948219754793e-05, + "loss": 1.6978, + "step": 14203 + }, + { + "epoch": 
4.359729895641498, + "grad_norm": 0.21657225489616394, + "learning_rate": 6.266467379279463e-05, + "loss": 1.7641, + "step": 14204 + }, + { + "epoch": 4.360036832412523, + "grad_norm": 0.2598700523376465, + "learning_rate": 6.265986526288158e-05, + "loss": 1.7956, + "step": 14205 + }, + { + "epoch": 4.360343769183548, + "grad_norm": 0.23497453331947327, + "learning_rate": 6.265505660785633e-05, + "loss": 1.7835, + "step": 14206 + }, + { + "epoch": 4.360650705954574, + "grad_norm": 0.2491760104894638, + "learning_rate": 6.265024782776641e-05, + "loss": 1.8454, + "step": 14207 + }, + { + "epoch": 4.360957642725599, + "grad_norm": 0.224884033203125, + "learning_rate": 6.264543892265932e-05, + "loss": 1.8383, + "step": 14208 + }, + { + "epoch": 4.361264579496623, + "grad_norm": 0.24057646095752716, + "learning_rate": 6.264062989258259e-05, + "loss": 1.7437, + "step": 14209 + }, + { + "epoch": 4.361571516267649, + "grad_norm": 0.24661841988563538, + "learning_rate": 6.263582073758374e-05, + "loss": 1.8151, + "step": 14210 + }, + { + "epoch": 4.361878453038674, + "grad_norm": 0.24618980288505554, + "learning_rate": 6.263101145771031e-05, + "loss": 1.7955, + "step": 14211 + }, + { + "epoch": 4.362185389809699, + "grad_norm": 0.2615448236465454, + "learning_rate": 6.262620205300981e-05, + "loss": 1.7819, + "step": 14212 + }, + { + "epoch": 4.362492326580725, + "grad_norm": 0.3528309464454651, + "learning_rate": 6.26213925235298e-05, + "loss": 1.7723, + "step": 14213 + }, + { + "epoch": 4.362799263351749, + "grad_norm": 0.3099561035633087, + "learning_rate": 6.261658286931779e-05, + "loss": 1.7361, + "step": 14214 + }, + { + "epoch": 4.3631062001227745, + "grad_norm": 0.23693235218524933, + "learning_rate": 6.26117730904213e-05, + "loss": 1.8117, + "step": 14215 + }, + { + "epoch": 4.3634131368938, + "grad_norm": 0.4164150655269623, + "learning_rate": 6.260696318688786e-05, + "loss": 1.7908, + "step": 14216 + }, + { + "epoch": 4.363720073664825, + "grad_norm": 
0.39376336336135864, + "learning_rate": 6.260215315876506e-05, + "loss": 1.7832, + "step": 14217 + }, + { + "epoch": 4.3640270104358505, + "grad_norm": 0.24071799218654633, + "learning_rate": 6.259734300610037e-05, + "loss": 1.7569, + "step": 14218 + }, + { + "epoch": 4.364333947206875, + "grad_norm": 0.4305122494697571, + "learning_rate": 6.259253272894136e-05, + "loss": 1.7974, + "step": 14219 + }, + { + "epoch": 4.3646408839779, + "grad_norm": 0.3023197054862976, + "learning_rate": 6.258772232733556e-05, + "loss": 1.7589, + "step": 14220 + }, + { + "epoch": 4.364947820748926, + "grad_norm": 0.23253366351127625, + "learning_rate": 6.258291180133052e-05, + "loss": 1.7138, + "step": 14221 + }, + { + "epoch": 4.365254757519951, + "grad_norm": 0.41141277551651, + "learning_rate": 6.257810115097376e-05, + "loss": 1.7608, + "step": 14222 + }, + { + "epoch": 4.365561694290976, + "grad_norm": 0.3308235704898834, + "learning_rate": 6.257329037631284e-05, + "loss": 1.8006, + "step": 14223 + }, + { + "epoch": 4.365868631062002, + "grad_norm": 0.2635105848312378, + "learning_rate": 6.256847947739528e-05, + "loss": 1.7275, + "step": 14224 + }, + { + "epoch": 4.366175567833026, + "grad_norm": 0.45886602997779846, + "learning_rate": 6.256366845426864e-05, + "loss": 1.7701, + "step": 14225 + }, + { + "epoch": 4.366482504604051, + "grad_norm": 0.48503565788269043, + "learning_rate": 6.255885730698049e-05, + "loss": 1.7409, + "step": 14226 + }, + { + "epoch": 4.366789441375077, + "grad_norm": 0.26727184653282166, + "learning_rate": 6.255404603557833e-05, + "loss": 1.7288, + "step": 14227 + }, + { + "epoch": 4.367096378146102, + "grad_norm": 0.3343912363052368, + "learning_rate": 6.254923464010974e-05, + "loss": 1.764, + "step": 14228 + }, + { + "epoch": 4.367403314917127, + "grad_norm": 0.40050622820854187, + "learning_rate": 6.254442312062224e-05, + "loss": 1.7653, + "step": 14229 + }, + { + "epoch": 4.367710251688152, + "grad_norm": 0.23941144347190857, + "learning_rate": 
6.253961147716341e-05, + "loss": 1.6886, + "step": 14230 + }, + { + "epoch": 4.368017188459177, + "grad_norm": 0.25737255811691284, + "learning_rate": 6.253479970978079e-05, + "loss": 1.8047, + "step": 14231 + }, + { + "epoch": 4.3683241252302025, + "grad_norm": 0.28780993819236755, + "learning_rate": 6.252998781852192e-05, + "loss": 1.7453, + "step": 14232 + }, + { + "epoch": 4.368631062001228, + "grad_norm": 0.2362327128648758, + "learning_rate": 6.252517580343438e-05, + "loss": 1.7963, + "step": 14233 + }, + { + "epoch": 4.368937998772253, + "grad_norm": 0.263013631105423, + "learning_rate": 6.252036366456571e-05, + "loss": 1.7837, + "step": 14234 + }, + { + "epoch": 4.3692449355432785, + "grad_norm": 0.27674412727355957, + "learning_rate": 6.251555140196347e-05, + "loss": 1.767, + "step": 14235 + }, + { + "epoch": 4.369551872314303, + "grad_norm": 0.2360621690750122, + "learning_rate": 6.251073901567522e-05, + "loss": 1.7806, + "step": 14236 + }, + { + "epoch": 4.369858809085328, + "grad_norm": 0.2568018138408661, + "learning_rate": 6.25059265057485e-05, + "loss": 1.7672, + "step": 14237 + }, + { + "epoch": 4.370165745856354, + "grad_norm": 0.2512381374835968, + "learning_rate": 6.25011138722309e-05, + "loss": 1.7506, + "step": 14238 + }, + { + "epoch": 4.370472682627379, + "grad_norm": 0.21587291359901428, + "learning_rate": 6.249630111516994e-05, + "loss": 1.7336, + "step": 14239 + }, + { + "epoch": 4.370779619398404, + "grad_norm": 0.21791933476924896, + "learning_rate": 6.249148823461323e-05, + "loss": 1.7588, + "step": 14240 + }, + { + "epoch": 4.371086556169429, + "grad_norm": 0.23061512410640717, + "learning_rate": 6.248667523060831e-05, + "loss": 1.742, + "step": 14241 + }, + { + "epoch": 4.371393492940454, + "grad_norm": 0.2007007598876953, + "learning_rate": 6.248186210320274e-05, + "loss": 1.7227, + "step": 14242 + }, + { + "epoch": 4.371700429711479, + "grad_norm": 0.2564350366592407, + "learning_rate": 6.247704885244411e-05, + "loss": 1.7529, + 
"step": 14243 + }, + { + "epoch": 4.372007366482505, + "grad_norm": 0.21880537271499634, + "learning_rate": 6.247223547837995e-05, + "loss": 1.7828, + "step": 14244 + }, + { + "epoch": 4.37231430325353, + "grad_norm": 0.26154282689094543, + "learning_rate": 6.246742198105785e-05, + "loss": 1.7895, + "step": 14245 + }, + { + "epoch": 4.3726212400245545, + "grad_norm": 0.2652645707130432, + "learning_rate": 6.24626083605254e-05, + "loss": 1.8038, + "step": 14246 + }, + { + "epoch": 4.37292817679558, + "grad_norm": 0.21463751792907715, + "learning_rate": 6.245779461683013e-05, + "loss": 1.7139, + "step": 14247 + }, + { + "epoch": 4.373235113566605, + "grad_norm": 0.21285851299762726, + "learning_rate": 6.245298075001961e-05, + "loss": 1.7686, + "step": 14248 + }, + { + "epoch": 4.3735420503376305, + "grad_norm": 0.258602499961853, + "learning_rate": 6.244816676014149e-05, + "loss": 1.8518, + "step": 14249 + }, + { + "epoch": 4.373848987108656, + "grad_norm": 0.25747501850128174, + "learning_rate": 6.244335264724323e-05, + "loss": 1.8019, + "step": 14250 + }, + { + "epoch": 4.37415592387968, + "grad_norm": 0.24678784608840942, + "learning_rate": 6.243853841137251e-05, + "loss": 1.7846, + "step": 14251 + }, + { + "epoch": 4.374462860650706, + "grad_norm": 0.31382107734680176, + "learning_rate": 6.243372405257685e-05, + "loss": 1.8389, + "step": 14252 + }, + { + "epoch": 4.374769797421731, + "grad_norm": 0.30522868037223816, + "learning_rate": 6.242890957090383e-05, + "loss": 1.8057, + "step": 14253 + }, + { + "epoch": 4.375076734192756, + "grad_norm": 0.2449347972869873, + "learning_rate": 6.242409496640106e-05, + "loss": 1.7144, + "step": 14254 + }, + { + "epoch": 4.375383670963782, + "grad_norm": 0.3193594217300415, + "learning_rate": 6.241928023911609e-05, + "loss": 1.7404, + "step": 14255 + }, + { + "epoch": 4.375690607734807, + "grad_norm": 0.23948179185390472, + "learning_rate": 6.241446538909651e-05, + "loss": 1.7338, + "step": 14256 + }, + { + "epoch": 
4.3759975445058314, + "grad_norm": 0.35325706005096436, + "learning_rate": 6.240965041638991e-05, + "loss": 1.7673, + "step": 14257 + }, + { + "epoch": 4.376304481276857, + "grad_norm": 0.38753262162208557, + "learning_rate": 6.240483532104387e-05, + "loss": 1.769, + "step": 14258 + }, + { + "epoch": 4.376611418047882, + "grad_norm": 0.2749052941799164, + "learning_rate": 6.2400020103106e-05, + "loss": 1.8086, + "step": 14259 + }, + { + "epoch": 4.3769183548189075, + "grad_norm": 0.2553126811981201, + "learning_rate": 6.239520476262384e-05, + "loss": 1.7733, + "step": 14260 + }, + { + "epoch": 4.377225291589933, + "grad_norm": 0.2854517698287964, + "learning_rate": 6.2390389299645e-05, + "loss": 1.7926, + "step": 14261 + }, + { + "epoch": 4.377532228360957, + "grad_norm": 0.24617259204387665, + "learning_rate": 6.238557371421708e-05, + "loss": 1.7297, + "step": 14262 + }, + { + "epoch": 4.377839165131983, + "grad_norm": 0.2555331289768219, + "learning_rate": 6.238075800638765e-05, + "loss": 1.7566, + "step": 14263 + }, + { + "epoch": 4.378146101903008, + "grad_norm": 0.31666773557662964, + "learning_rate": 6.237594217620432e-05, + "loss": 1.8003, + "step": 14264 + }, + { + "epoch": 4.378453038674033, + "grad_norm": 0.24166476726531982, + "learning_rate": 6.237112622371468e-05, + "loss": 1.7425, + "step": 14265 + }, + { + "epoch": 4.378759975445059, + "grad_norm": 0.21237102150917053, + "learning_rate": 6.236631014896633e-05, + "loss": 1.73, + "step": 14266 + }, + { + "epoch": 4.379066912216084, + "grad_norm": 0.2739151120185852, + "learning_rate": 6.236149395200683e-05, + "loss": 1.7113, + "step": 14267 + }, + { + "epoch": 4.379373848987108, + "grad_norm": 0.23700746893882751, + "learning_rate": 6.23566776328838e-05, + "loss": 1.7256, + "step": 14268 + }, + { + "epoch": 4.379680785758134, + "grad_norm": 0.22366748750209808, + "learning_rate": 6.235186119164485e-05, + "loss": 1.7981, + "step": 14269 + }, + { + "epoch": 4.379987722529159, + "grad_norm": 
0.28440114855766296, + "learning_rate": 6.234704462833758e-05, + "loss": 1.8087, + "step": 14270 + }, + { + "epoch": 4.380294659300184, + "grad_norm": 0.2706616520881653, + "learning_rate": 6.234222794300957e-05, + "loss": 1.7502, + "step": 14271 + }, + { + "epoch": 4.38060159607121, + "grad_norm": 0.21666266024112701, + "learning_rate": 6.233741113570843e-05, + "loss": 1.7639, + "step": 14272 + }, + { + "epoch": 4.380908532842234, + "grad_norm": 0.26790255308151245, + "learning_rate": 6.233259420648175e-05, + "loss": 1.796, + "step": 14273 + }, + { + "epoch": 4.3812154696132595, + "grad_norm": 0.22233673930168152, + "learning_rate": 6.232777715537715e-05, + "loss": 1.7661, + "step": 14274 + }, + { + "epoch": 4.381522406384285, + "grad_norm": 0.3277546763420105, + "learning_rate": 6.232295998244223e-05, + "loss": 1.7932, + "step": 14275 + }, + { + "epoch": 4.38182934315531, + "grad_norm": 0.2907596826553345, + "learning_rate": 6.231814268772463e-05, + "loss": 1.7103, + "step": 14276 + }, + { + "epoch": 4.3821362799263355, + "grad_norm": 0.2318384349346161, + "learning_rate": 6.231332527127188e-05, + "loss": 1.7351, + "step": 14277 + }, + { + "epoch": 4.382443216697361, + "grad_norm": 0.32904061675071716, + "learning_rate": 6.230850773313163e-05, + "loss": 1.7967, + "step": 14278 + }, + { + "epoch": 4.382750153468385, + "grad_norm": 0.2455490082502365, + "learning_rate": 6.230369007335153e-05, + "loss": 1.7474, + "step": 14279 + }, + { + "epoch": 4.383057090239411, + "grad_norm": 0.23648180067539215, + "learning_rate": 6.229887229197913e-05, + "loss": 1.7106, + "step": 14280 + }, + { + "epoch": 4.383364027010436, + "grad_norm": 0.29552599787712097, + "learning_rate": 6.229405438906207e-05, + "loss": 1.7765, + "step": 14281 + }, + { + "epoch": 4.383670963781461, + "grad_norm": 0.2094641923904419, + "learning_rate": 6.228923636464796e-05, + "loss": 1.7105, + "step": 14282 + }, + { + "epoch": 4.383977900552487, + "grad_norm": 0.24632154405117035, + "learning_rate": 
6.228441821878441e-05, + "loss": 1.7913, + "step": 14283 + }, + { + "epoch": 4.384284837323511, + "grad_norm": 0.28114691376686096, + "learning_rate": 6.227959995151904e-05, + "loss": 1.7456, + "step": 14284 + }, + { + "epoch": 4.384591774094536, + "grad_norm": 0.24226875603199005, + "learning_rate": 6.227478156289946e-05, + "loss": 1.797, + "step": 14285 + }, + { + "epoch": 4.384898710865562, + "grad_norm": 0.2526854872703552, + "learning_rate": 6.22699630529733e-05, + "loss": 1.7155, + "step": 14286 + }, + { + "epoch": 4.385205647636587, + "grad_norm": 0.312916100025177, + "learning_rate": 6.226514442178818e-05, + "loss": 1.7808, + "step": 14287 + }, + { + "epoch": 4.385512584407612, + "grad_norm": 0.23087100684642792, + "learning_rate": 6.22603256693917e-05, + "loss": 1.7543, + "step": 14288 + }, + { + "epoch": 4.385819521178637, + "grad_norm": 0.3042476177215576, + "learning_rate": 6.22555067958315e-05, + "loss": 1.747, + "step": 14289 + }, + { + "epoch": 4.386126457949662, + "grad_norm": 0.2604007422924042, + "learning_rate": 6.225068780115522e-05, + "loss": 1.7262, + "step": 14290 + }, + { + "epoch": 4.3864333947206875, + "grad_norm": 0.2200118750333786, + "learning_rate": 6.224586868541044e-05, + "loss": 1.75, + "step": 14291 + }, + { + "epoch": 4.386740331491713, + "grad_norm": 0.3452017307281494, + "learning_rate": 6.224104944864481e-05, + "loss": 1.7598, + "step": 14292 + }, + { + "epoch": 4.387047268262738, + "grad_norm": 0.3169453740119934, + "learning_rate": 6.223623009090597e-05, + "loss": 1.7939, + "step": 14293 + }, + { + "epoch": 4.387354205033763, + "grad_norm": 0.23640502989292145, + "learning_rate": 6.223141061224151e-05, + "loss": 1.8005, + "step": 14294 + }, + { + "epoch": 4.387661141804788, + "grad_norm": 0.26212456822395325, + "learning_rate": 6.22265910126991e-05, + "loss": 1.7951, + "step": 14295 + }, + { + "epoch": 4.387968078575813, + "grad_norm": 0.2687644362449646, + "learning_rate": 6.222177129232634e-05, + "loss": 1.7674, + "step": 
14296 + }, + { + "epoch": 4.388275015346839, + "grad_norm": 0.2553202211856842, + "learning_rate": 6.221695145117086e-05, + "loss": 1.8142, + "step": 14297 + }, + { + "epoch": 4.388581952117864, + "grad_norm": 0.3317619264125824, + "learning_rate": 6.221213148928034e-05, + "loss": 1.7884, + "step": 14298 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.3059331476688385, + "learning_rate": 6.220731140670235e-05, + "loss": 1.7377, + "step": 14299 + }, + { + "epoch": 4.389195825659914, + "grad_norm": 0.21544015407562256, + "learning_rate": 6.220249120348457e-05, + "loss": 1.6818, + "step": 14300 + }, + { + "epoch": 4.389502762430939, + "grad_norm": 0.3112640380859375, + "learning_rate": 6.219767087967461e-05, + "loss": 1.72, + "step": 14301 + }, + { + "epoch": 4.389809699201964, + "grad_norm": 0.2572654187679291, + "learning_rate": 6.219285043532011e-05, + "loss": 1.793, + "step": 14302 + }, + { + "epoch": 4.39011663597299, + "grad_norm": 0.2621476948261261, + "learning_rate": 6.218802987046874e-05, + "loss": 1.8301, + "step": 14303 + }, + { + "epoch": 4.390423572744015, + "grad_norm": 0.2592658996582031, + "learning_rate": 6.218320918516809e-05, + "loss": 1.7219, + "step": 14304 + }, + { + "epoch": 4.3907305095150395, + "grad_norm": 0.25503265857696533, + "learning_rate": 6.217838837946584e-05, + "loss": 1.8149, + "step": 14305 + }, + { + "epoch": 4.391037446286065, + "grad_norm": 0.21944166719913483, + "learning_rate": 6.217356745340962e-05, + "loss": 1.7174, + "step": 14306 + }, + { + "epoch": 4.39134438305709, + "grad_norm": 0.2937396466732025, + "learning_rate": 6.216874640704707e-05, + "loss": 1.8562, + "step": 14307 + }, + { + "epoch": 4.3916513198281155, + "grad_norm": 0.22520211338996887, + "learning_rate": 6.216392524042581e-05, + "loss": 1.7701, + "step": 14308 + }, + { + "epoch": 4.391958256599141, + "grad_norm": 0.24397830665111542, + "learning_rate": 6.215910395359355e-05, + "loss": 1.7794, + "step": 14309 + }, + { + "epoch": 4.392265193370166, + 
"grad_norm": 0.2867623567581177, + "learning_rate": 6.215428254659788e-05, + "loss": 1.7275, + "step": 14310 + }, + { + "epoch": 4.392572130141191, + "grad_norm": 0.2632426917552948, + "learning_rate": 6.214946101948648e-05, + "loss": 1.7919, + "step": 14311 + }, + { + "epoch": 4.392879066912216, + "grad_norm": 0.23146092891693115, + "learning_rate": 6.214463937230696e-05, + "loss": 1.744, + "step": 14312 + }, + { + "epoch": 4.393186003683241, + "grad_norm": 0.21877676248550415, + "learning_rate": 6.213981760510701e-05, + "loss": 1.7577, + "step": 14313 + }, + { + "epoch": 4.393492940454267, + "grad_norm": 0.2320399284362793, + "learning_rate": 6.213499571793426e-05, + "loss": 1.7864, + "step": 14314 + }, + { + "epoch": 4.393799877225292, + "grad_norm": 0.2951548993587494, + "learning_rate": 6.213017371083638e-05, + "loss": 1.8257, + "step": 14315 + }, + { + "epoch": 4.394106813996316, + "grad_norm": 0.26062941551208496, + "learning_rate": 6.212535158386102e-05, + "loss": 1.7448, + "step": 14316 + }, + { + "epoch": 4.394413750767342, + "grad_norm": 0.24760986864566803, + "learning_rate": 6.21205293370558e-05, + "loss": 1.7902, + "step": 14317 + }, + { + "epoch": 4.394720687538367, + "grad_norm": 0.2686399221420288, + "learning_rate": 6.211570697046844e-05, + "loss": 1.8209, + "step": 14318 + }, + { + "epoch": 4.395027624309392, + "grad_norm": 0.2599134147167206, + "learning_rate": 6.211088448414653e-05, + "loss": 1.8231, + "step": 14319 + }, + { + "epoch": 4.395334561080418, + "grad_norm": 0.254044771194458, + "learning_rate": 6.210606187813778e-05, + "loss": 1.806, + "step": 14320 + }, + { + "epoch": 4.395641497851442, + "grad_norm": 0.262229323387146, + "learning_rate": 6.210123915248982e-05, + "loss": 1.7857, + "step": 14321 + }, + { + "epoch": 4.3959484346224675, + "grad_norm": 0.2849259078502655, + "learning_rate": 6.209641630725033e-05, + "loss": 1.8005, + "step": 14322 + }, + { + "epoch": 4.396255371393493, + "grad_norm": 0.35480254888534546, + 
"learning_rate": 6.209159334246697e-05, + "loss": 1.8189, + "step": 14323 + }, + { + "epoch": 4.396562308164518, + "grad_norm": 0.2599184215068817, + "learning_rate": 6.20867702581874e-05, + "loss": 1.7384, + "step": 14324 + }, + { + "epoch": 4.3968692449355435, + "grad_norm": 0.23994222283363342, + "learning_rate": 6.208194705445926e-05, + "loss": 1.7566, + "step": 14325 + }, + { + "epoch": 4.397176181706568, + "grad_norm": 0.24361753463745117, + "learning_rate": 6.207712373133024e-05, + "loss": 1.6965, + "step": 14326 + }, + { + "epoch": 4.397483118477593, + "grad_norm": 0.23925161361694336, + "learning_rate": 6.207230028884803e-05, + "loss": 1.7596, + "step": 14327 + }, + { + "epoch": 4.397790055248619, + "grad_norm": 0.24365897476673126, + "learning_rate": 6.206747672706025e-05, + "loss": 1.7951, + "step": 14328 + }, + { + "epoch": 4.398096992019644, + "grad_norm": 0.25245413184165955, + "learning_rate": 6.206265304601461e-05, + "loss": 1.8086, + "step": 14329 + }, + { + "epoch": 4.398403928790669, + "grad_norm": 0.24272513389587402, + "learning_rate": 6.205782924575874e-05, + "loss": 1.8148, + "step": 14330 + }, + { + "epoch": 4.398710865561695, + "grad_norm": 0.21299590170383453, + "learning_rate": 6.205300532634036e-05, + "loss": 1.7666, + "step": 14331 + }, + { + "epoch": 4.399017802332719, + "grad_norm": 0.23543189465999603, + "learning_rate": 6.20481812878071e-05, + "loss": 1.7629, + "step": 14332 + }, + { + "epoch": 4.399324739103744, + "grad_norm": 0.2284495085477829, + "learning_rate": 6.204335713020665e-05, + "loss": 1.768, + "step": 14333 + }, + { + "epoch": 4.39963167587477, + "grad_norm": 0.23158542811870575, + "learning_rate": 6.20385328535867e-05, + "loss": 1.7761, + "step": 14334 + }, + { + "epoch": 4.399938612645795, + "grad_norm": 0.2378150224685669, + "learning_rate": 6.20337084579949e-05, + "loss": 1.8483, + "step": 14335 + }, + { + "epoch": 4.4002455494168204, + "grad_norm": 0.2407436966896057, + "learning_rate": 6.202888394347892e-05, + 
"loss": 1.7364, + "step": 14336 + }, + { + "epoch": 4.400552486187845, + "grad_norm": 0.256259560585022, + "learning_rate": 6.202405931008649e-05, + "loss": 1.7376, + "step": 14337 + }, + { + "epoch": 4.40085942295887, + "grad_norm": 0.29293057322502136, + "learning_rate": 6.201923455786524e-05, + "loss": 1.7493, + "step": 14338 + }, + { + "epoch": 4.401166359729896, + "grad_norm": 0.24025334417819977, + "learning_rate": 6.201440968686288e-05, + "loss": 1.7522, + "step": 14339 + }, + { + "epoch": 4.401473296500921, + "grad_norm": 0.3215656280517578, + "learning_rate": 6.200958469712708e-05, + "loss": 1.7748, + "step": 14340 + }, + { + "epoch": 4.401780233271946, + "grad_norm": 0.43553170561790466, + "learning_rate": 6.200475958870553e-05, + "loss": 1.771, + "step": 14341 + }, + { + "epoch": 4.402087170042972, + "grad_norm": 0.3112131953239441, + "learning_rate": 6.19999343616459e-05, + "loss": 1.7655, + "step": 14342 + }, + { + "epoch": 4.402394106813996, + "grad_norm": 0.25197842717170715, + "learning_rate": 6.199510901599589e-05, + "loss": 1.7214, + "step": 14343 + }, + { + "epoch": 4.402701043585021, + "grad_norm": 0.33227142691612244, + "learning_rate": 6.19902835518032e-05, + "loss": 1.7332, + "step": 14344 + }, + { + "epoch": 4.403007980356047, + "grad_norm": 0.27962982654571533, + "learning_rate": 6.198545796911548e-05, + "loss": 1.6943, + "step": 14345 + }, + { + "epoch": 4.403314917127072, + "grad_norm": 0.24374182522296906, + "learning_rate": 6.198063226798044e-05, + "loss": 1.7222, + "step": 14346 + }, + { + "epoch": 4.403621853898097, + "grad_norm": 0.3101944625377655, + "learning_rate": 6.197580644844576e-05, + "loss": 1.7113, + "step": 14347 + }, + { + "epoch": 4.403928790669122, + "grad_norm": 0.25919321179389954, + "learning_rate": 6.197098051055916e-05, + "loss": 1.71, + "step": 14348 + }, + { + "epoch": 4.404235727440147, + "grad_norm": 0.23140330612659454, + "learning_rate": 6.19661544543683e-05, + "loss": 1.7472, + "step": 14349 + }, + { + 
"epoch": 4.4045426642111725, + "grad_norm": 0.3274286687374115, + "learning_rate": 6.19613282799209e-05, + "loss": 1.7093, + "step": 14350 + }, + { + "epoch": 4.404849600982198, + "grad_norm": 0.3187442123889923, + "learning_rate": 6.195650198726464e-05, + "loss": 1.7488, + "step": 14351 + }, + { + "epoch": 4.405156537753223, + "grad_norm": 0.20547433197498322, + "learning_rate": 6.195167557644722e-05, + "loss": 1.7295, + "step": 14352 + }, + { + "epoch": 4.4054634745242485, + "grad_norm": 0.2623414993286133, + "learning_rate": 6.194684904751633e-05, + "loss": 1.8258, + "step": 14353 + }, + { + "epoch": 4.405770411295273, + "grad_norm": 0.2468457818031311, + "learning_rate": 6.194202240051967e-05, + "loss": 1.6957, + "step": 14354 + }, + { + "epoch": 4.406077348066298, + "grad_norm": 0.2082364559173584, + "learning_rate": 6.193719563550496e-05, + "loss": 1.7596, + "step": 14355 + }, + { + "epoch": 4.406384284837324, + "grad_norm": 0.27072983980178833, + "learning_rate": 6.193236875251988e-05, + "loss": 1.7341, + "step": 14356 + }, + { + "epoch": 4.406691221608349, + "grad_norm": 0.2630362808704376, + "learning_rate": 6.192754175161215e-05, + "loss": 1.7664, + "step": 14357 + }, + { + "epoch": 4.406998158379374, + "grad_norm": 0.25400006771087646, + "learning_rate": 6.192271463282944e-05, + "loss": 1.7582, + "step": 14358 + }, + { + "epoch": 4.407305095150399, + "grad_norm": 0.22256311774253845, + "learning_rate": 6.191788739621949e-05, + "loss": 1.7389, + "step": 14359 + }, + { + "epoch": 4.407612031921424, + "grad_norm": 0.2160387486219406, + "learning_rate": 6.191306004182999e-05, + "loss": 1.7051, + "step": 14360 + }, + { + "epoch": 4.407918968692449, + "grad_norm": 0.20665684342384338, + "learning_rate": 6.190823256970865e-05, + "loss": 1.7606, + "step": 14361 + }, + { + "epoch": 4.408225905463475, + "grad_norm": 0.2173188328742981, + "learning_rate": 6.190340497990318e-05, + "loss": 1.7944, + "step": 14362 + }, + { + "epoch": 4.4085328422345, + "grad_norm": 
0.189287930727005, + "learning_rate": 6.189857727246127e-05, + "loss": 1.7283, + "step": 14363 + }, + { + "epoch": 4.4088397790055245, + "grad_norm": 0.2531645596027374, + "learning_rate": 6.189374944743065e-05, + "loss": 1.7554, + "step": 14364 + }, + { + "epoch": 4.40914671577655, + "grad_norm": 0.25439125299453735, + "learning_rate": 6.188892150485903e-05, + "loss": 1.8032, + "step": 14365 + }, + { + "epoch": 4.409453652547575, + "grad_norm": 0.20938685536384583, + "learning_rate": 6.188409344479412e-05, + "loss": 1.7385, + "step": 14366 + }, + { + "epoch": 4.4097605893186005, + "grad_norm": 0.20471477508544922, + "learning_rate": 6.187926526728364e-05, + "loss": 1.7487, + "step": 14367 + }, + { + "epoch": 4.410067526089626, + "grad_norm": 0.2381851226091385, + "learning_rate": 6.187443697237529e-05, + "loss": 1.7443, + "step": 14368 + }, + { + "epoch": 4.41037446286065, + "grad_norm": 0.21584098041057587, + "learning_rate": 6.18696085601168e-05, + "loss": 1.7818, + "step": 14369 + }, + { + "epoch": 4.410681399631676, + "grad_norm": 0.2575368583202362, + "learning_rate": 6.186478003055587e-05, + "loss": 1.8204, + "step": 14370 + }, + { + "epoch": 4.410988336402701, + "grad_norm": 0.21133238077163696, + "learning_rate": 6.185995138374024e-05, + "loss": 1.7274, + "step": 14371 + }, + { + "epoch": 4.411295273173726, + "grad_norm": 0.24918322265148163, + "learning_rate": 6.18551226197176e-05, + "loss": 1.8021, + "step": 14372 + }, + { + "epoch": 4.411602209944752, + "grad_norm": 0.2253655642271042, + "learning_rate": 6.185029373853572e-05, + "loss": 1.7308, + "step": 14373 + }, + { + "epoch": 4.411909146715777, + "grad_norm": 0.20098713040351868, + "learning_rate": 6.184546474024226e-05, + "loss": 1.7549, + "step": 14374 + }, + { + "epoch": 4.412216083486801, + "grad_norm": 0.25612789392471313, + "learning_rate": 6.1840635624885e-05, + "loss": 1.8305, + "step": 14375 + }, + { + "epoch": 4.412523020257827, + "grad_norm": 0.24287539720535278, + "learning_rate": 
6.183580639251164e-05, + "loss": 1.7339, + "step": 14376 + }, + { + "epoch": 4.412829957028852, + "grad_norm": 0.2304944545030594, + "learning_rate": 6.183097704316988e-05, + "loss": 1.7023, + "step": 14377 + }, + { + "epoch": 4.413136893799877, + "grad_norm": 0.21911773085594177, + "learning_rate": 6.18261475769075e-05, + "loss": 1.7305, + "step": 14378 + }, + { + "epoch": 4.413443830570903, + "grad_norm": 0.24207864701747894, + "learning_rate": 6.182131799377217e-05, + "loss": 1.7318, + "step": 14379 + }, + { + "epoch": 4.413750767341927, + "grad_norm": 0.2551634609699249, + "learning_rate": 6.181648829381165e-05, + "loss": 1.8101, + "step": 14380 + }, + { + "epoch": 4.4140577041129525, + "grad_norm": 0.4114011526107788, + "learning_rate": 6.181165847707368e-05, + "loss": 1.772, + "step": 14381 + }, + { + "epoch": 4.414364640883978, + "grad_norm": 0.4592796862125397, + "learning_rate": 6.180682854360598e-05, + "loss": 1.7359, + "step": 14382 + }, + { + "epoch": 4.414671577655003, + "grad_norm": 0.2599259614944458, + "learning_rate": 6.180199849345627e-05, + "loss": 1.7028, + "step": 14383 + }, + { + "epoch": 4.4149785144260285, + "grad_norm": 0.3489506244659424, + "learning_rate": 6.17971683266723e-05, + "loss": 1.8252, + "step": 14384 + }, + { + "epoch": 4.415285451197054, + "grad_norm": 0.44563809037208557, + "learning_rate": 6.179233804330179e-05, + "loss": 1.6894, + "step": 14385 + }, + { + "epoch": 4.415592387968078, + "grad_norm": 0.2596888542175293, + "learning_rate": 6.17875076433925e-05, + "loss": 1.8141, + "step": 14386 + }, + { + "epoch": 4.415899324739104, + "grad_norm": 0.3560626804828644, + "learning_rate": 6.178267712699213e-05, + "loss": 1.7764, + "step": 14387 + }, + { + "epoch": 4.416206261510129, + "grad_norm": 0.3746717572212219, + "learning_rate": 6.177784649414843e-05, + "loss": 1.7528, + "step": 14388 + }, + { + "epoch": 4.416513198281154, + "grad_norm": 0.23248885571956635, + "learning_rate": 6.177301574490918e-05, + "loss": 1.7148, + 
"step": 14389 + }, + { + "epoch": 4.41682013505218, + "grad_norm": 0.26936978101730347, + "learning_rate": 6.176818487932208e-05, + "loss": 1.7199, + "step": 14390 + }, + { + "epoch": 4.417127071823204, + "grad_norm": 0.3102504014968872, + "learning_rate": 6.176335389743486e-05, + "loss": 1.6886, + "step": 14391 + }, + { + "epoch": 4.417434008594229, + "grad_norm": 0.24406832456588745, + "learning_rate": 6.175852279929531e-05, + "loss": 1.7766, + "step": 14392 + }, + { + "epoch": 4.417740945365255, + "grad_norm": 0.271158903837204, + "learning_rate": 6.175369158495112e-05, + "loss": 1.8099, + "step": 14393 + }, + { + "epoch": 4.41804788213628, + "grad_norm": 0.343667209148407, + "learning_rate": 6.174886025445008e-05, + "loss": 1.779, + "step": 14394 + }, + { + "epoch": 4.418354818907305, + "grad_norm": 0.37423139810562134, + "learning_rate": 6.17440288078399e-05, + "loss": 1.7796, + "step": 14395 + }, + { + "epoch": 4.41866175567833, + "grad_norm": 0.3152335286140442, + "learning_rate": 6.173919724516836e-05, + "loss": 1.7388, + "step": 14396 + }, + { + "epoch": 4.418968692449355, + "grad_norm": 0.21467824280261993, + "learning_rate": 6.173436556648319e-05, + "loss": 1.7689, + "step": 14397 + }, + { + "epoch": 4.4192756292203805, + "grad_norm": 0.2861369848251343, + "learning_rate": 6.172953377183213e-05, + "loss": 1.819, + "step": 14398 + }, + { + "epoch": 4.419582565991406, + "grad_norm": 0.34777504205703735, + "learning_rate": 6.172470186126295e-05, + "loss": 1.7444, + "step": 14399 + }, + { + "epoch": 4.419889502762431, + "grad_norm": 0.2728833854198456, + "learning_rate": 6.171986983482339e-05, + "loss": 1.7637, + "step": 14400 + }, + { + "epoch": 4.420196439533456, + "grad_norm": 0.2593914270401001, + "learning_rate": 6.17150376925612e-05, + "loss": 1.8196, + "step": 14401 + }, + { + "epoch": 4.420503376304481, + "grad_norm": 0.29425305128097534, + "learning_rate": 6.171020543452416e-05, + "loss": 1.7511, + "step": 14402 + }, + { + "epoch": 
4.420810313075506, + "grad_norm": 0.2587110102176666, + "learning_rate": 6.170537306076e-05, + "loss": 1.8085, + "step": 14403 + }, + { + "epoch": 4.421117249846532, + "grad_norm": 0.22442933917045593, + "learning_rate": 6.170054057131648e-05, + "loss": 1.8023, + "step": 14404 + }, + { + "epoch": 4.421424186617557, + "grad_norm": 0.23302629590034485, + "learning_rate": 6.169570796624136e-05, + "loss": 1.7995, + "step": 14405 + }, + { + "epoch": 4.421731123388582, + "grad_norm": 0.2295885682106018, + "learning_rate": 6.169087524558239e-05, + "loss": 1.7948, + "step": 14406 + }, + { + "epoch": 4.422038060159607, + "grad_norm": 0.2161262482404709, + "learning_rate": 6.168604240938735e-05, + "loss": 1.7159, + "step": 14407 + }, + { + "epoch": 4.422344996930632, + "grad_norm": 0.20746205747127533, + "learning_rate": 6.1681209457704e-05, + "loss": 1.7703, + "step": 14408 + }, + { + "epoch": 4.422651933701657, + "grad_norm": 0.25677376985549927, + "learning_rate": 6.167637639058006e-05, + "loss": 1.7819, + "step": 14409 + }, + { + "epoch": 4.422958870472683, + "grad_norm": 0.226568341255188, + "learning_rate": 6.167154320806336e-05, + "loss": 1.7661, + "step": 14410 + }, + { + "epoch": 4.423265807243708, + "grad_norm": 0.22997824847698212, + "learning_rate": 6.166670991020162e-05, + "loss": 1.7364, + "step": 14411 + }, + { + "epoch": 4.4235727440147325, + "grad_norm": 0.2528770864009857, + "learning_rate": 6.166187649704261e-05, + "loss": 1.8505, + "step": 14412 + }, + { + "epoch": 4.423879680785758, + "grad_norm": 0.27278614044189453, + "learning_rate": 6.165704296863409e-05, + "loss": 1.7855, + "step": 14413 + }, + { + "epoch": 4.424186617556783, + "grad_norm": 0.23086364567279816, + "learning_rate": 6.165220932502385e-05, + "loss": 1.7489, + "step": 14414 + }, + { + "epoch": 4.4244935543278086, + "grad_norm": 0.2570587396621704, + "learning_rate": 6.164737556625965e-05, + "loss": 1.8008, + "step": 14415 + }, + { + "epoch": 4.424800491098834, + "grad_norm": 
0.2637264132499695, + "learning_rate": 6.164254169238923e-05, + "loss": 1.7563, + "step": 14416 + }, + { + "epoch": 4.425107427869859, + "grad_norm": 0.23046623170375824, + "learning_rate": 6.163770770346043e-05, + "loss": 1.7433, + "step": 14417 + }, + { + "epoch": 4.425414364640884, + "grad_norm": 0.2531467080116272, + "learning_rate": 6.163287359952095e-05, + "loss": 1.8122, + "step": 14418 + }, + { + "epoch": 4.425721301411909, + "grad_norm": 0.26507216691970825, + "learning_rate": 6.162803938061861e-05, + "loss": 1.7019, + "step": 14419 + }, + { + "epoch": 4.426028238182934, + "grad_norm": 0.229641854763031, + "learning_rate": 6.162320504680117e-05, + "loss": 1.7518, + "step": 14420 + }, + { + "epoch": 4.42633517495396, + "grad_norm": 0.22777152061462402, + "learning_rate": 6.161837059811641e-05, + "loss": 1.8094, + "step": 14421 + }, + { + "epoch": 4.426642111724985, + "grad_norm": 0.22121338546276093, + "learning_rate": 6.161353603461209e-05, + "loss": 1.7204, + "step": 14422 + }, + { + "epoch": 4.4269490484960095, + "grad_norm": 0.21914128959178925, + "learning_rate": 6.1608701356336e-05, + "loss": 1.7554, + "step": 14423 + }, + { + "epoch": 4.427255985267035, + "grad_norm": 0.22649390995502472, + "learning_rate": 6.160386656333593e-05, + "loss": 1.8058, + "step": 14424 + }, + { + "epoch": 4.42756292203806, + "grad_norm": 0.24529023468494415, + "learning_rate": 6.159903165565964e-05, + "loss": 1.7302, + "step": 14425 + }, + { + "epoch": 4.4278698588090855, + "grad_norm": 0.2726481854915619, + "learning_rate": 6.159419663335492e-05, + "loss": 1.825, + "step": 14426 + }, + { + "epoch": 4.428176795580111, + "grad_norm": 0.2772440016269684, + "learning_rate": 6.158936149646957e-05, + "loss": 1.7322, + "step": 14427 + }, + { + "epoch": 4.428483732351136, + "grad_norm": 0.29778853058815, + "learning_rate": 6.158452624505135e-05, + "loss": 1.7421, + "step": 14428 + }, + { + "epoch": 4.428790669122161, + "grad_norm": 0.21327480673789978, + "learning_rate": 
6.157969087914804e-05, + "loss": 1.7269, + "step": 14429 + }, + { + "epoch": 4.429097605893186, + "grad_norm": 0.2718868851661682, + "learning_rate": 6.157485539880744e-05, + "loss": 1.7817, + "step": 14430 + }, + { + "epoch": 4.429404542664211, + "grad_norm": 0.32242509722709656, + "learning_rate": 6.157001980407735e-05, + "loss": 1.7115, + "step": 14431 + }, + { + "epoch": 4.429711479435237, + "grad_norm": 0.2931978106498718, + "learning_rate": 6.156518409500553e-05, + "loss": 1.7822, + "step": 14432 + }, + { + "epoch": 4.430018416206262, + "grad_norm": 0.229528546333313, + "learning_rate": 6.156034827163977e-05, + "loss": 1.7623, + "step": 14433 + }, + { + "epoch": 4.430325352977286, + "grad_norm": 0.28702354431152344, + "learning_rate": 6.15555123340279e-05, + "loss": 1.8101, + "step": 14434 + }, + { + "epoch": 4.430632289748312, + "grad_norm": 0.27162131667137146, + "learning_rate": 6.155067628221766e-05, + "loss": 1.7525, + "step": 14435 + }, + { + "epoch": 4.430939226519337, + "grad_norm": 0.24290388822555542, + "learning_rate": 6.154584011625688e-05, + "loss": 1.8701, + "step": 14436 + }, + { + "epoch": 4.431246163290362, + "grad_norm": 0.3055405020713806, + "learning_rate": 6.154100383619334e-05, + "loss": 1.8659, + "step": 14437 + }, + { + "epoch": 4.431553100061388, + "grad_norm": 0.24528950452804565, + "learning_rate": 6.153616744207483e-05, + "loss": 1.8493, + "step": 14438 + }, + { + "epoch": 4.431860036832412, + "grad_norm": 0.2611897587776184, + "learning_rate": 6.153133093394917e-05, + "loss": 1.7905, + "step": 14439 + }, + { + "epoch": 4.4321669736034375, + "grad_norm": 0.2172730267047882, + "learning_rate": 6.15264943118641e-05, + "loss": 1.7087, + "step": 14440 + }, + { + "epoch": 4.432473910374463, + "grad_norm": 0.2320949286222458, + "learning_rate": 6.152165757586749e-05, + "loss": 1.7473, + "step": 14441 + }, + { + "epoch": 4.432780847145488, + "grad_norm": 0.2602086365222931, + "learning_rate": 6.15168207260071e-05, + "loss": 1.7365, + 
"step": 14442 + }, + { + "epoch": 4.4330877839165135, + "grad_norm": 0.25193190574645996, + "learning_rate": 6.151198376233074e-05, + "loss": 1.8205, + "step": 14443 + }, + { + "epoch": 4.433394720687538, + "grad_norm": 0.2894204556941986, + "learning_rate": 6.150714668488621e-05, + "loss": 1.7759, + "step": 14444 + }, + { + "epoch": 4.433701657458563, + "grad_norm": 0.24150310456752777, + "learning_rate": 6.150230949372131e-05, + "loss": 1.8415, + "step": 14445 + }, + { + "epoch": 4.434008594229589, + "grad_norm": 0.23475918173789978, + "learning_rate": 6.149747218888384e-05, + "loss": 1.7487, + "step": 14446 + }, + { + "epoch": 4.434315531000614, + "grad_norm": 0.29425546526908875, + "learning_rate": 6.149263477042162e-05, + "loss": 1.7538, + "step": 14447 + }, + { + "epoch": 4.434622467771639, + "grad_norm": 0.26241615414619446, + "learning_rate": 6.148779723838244e-05, + "loss": 1.7564, + "step": 14448 + }, + { + "epoch": 4.434929404542665, + "grad_norm": 0.23195287585258484, + "learning_rate": 6.148295959281411e-05, + "loss": 1.837, + "step": 14449 + }, + { + "epoch": 4.435236341313689, + "grad_norm": 0.34972792863845825, + "learning_rate": 6.147812183376445e-05, + "loss": 1.7632, + "step": 14450 + }, + { + "epoch": 4.435543278084714, + "grad_norm": 0.3536125719547272, + "learning_rate": 6.147328396128126e-05, + "loss": 1.8372, + "step": 14451 + }, + { + "epoch": 4.43585021485574, + "grad_norm": 0.2086079865694046, + "learning_rate": 6.146844597541235e-05, + "loss": 1.7014, + "step": 14452 + }, + { + "epoch": 4.436157151626765, + "grad_norm": 0.25547802448272705, + "learning_rate": 6.146360787620554e-05, + "loss": 1.7544, + "step": 14453 + }, + { + "epoch": 4.43646408839779, + "grad_norm": 0.26176998019218445, + "learning_rate": 6.145876966370864e-05, + "loss": 1.7617, + "step": 14454 + }, + { + "epoch": 4.436771025168815, + "grad_norm": 0.2672959566116333, + "learning_rate": 6.145393133796946e-05, + "loss": 1.8178, + "step": 14455 + }, + { + "epoch": 
4.43707796193984, + "grad_norm": 0.23373909294605255, + "learning_rate": 6.144909289903582e-05, + "loss": 1.7295, + "step": 14456 + }, + { + "epoch": 4.4373848987108655, + "grad_norm": 0.2369835078716278, + "learning_rate": 6.144425434695551e-05, + "loss": 1.8097, + "step": 14457 + }, + { + "epoch": 4.437691835481891, + "grad_norm": 0.25528979301452637, + "learning_rate": 6.14394156817764e-05, + "loss": 1.7523, + "step": 14458 + }, + { + "epoch": 4.437998772252916, + "grad_norm": 0.2541787624359131, + "learning_rate": 6.143457690354626e-05, + "loss": 1.7606, + "step": 14459 + }, + { + "epoch": 4.4383057090239415, + "grad_norm": 0.2032637745141983, + "learning_rate": 6.142973801231295e-05, + "loss": 1.7967, + "step": 14460 + }, + { + "epoch": 4.438612645794966, + "grad_norm": 0.2413996160030365, + "learning_rate": 6.142489900812426e-05, + "loss": 1.7688, + "step": 14461 + }, + { + "epoch": 4.438919582565991, + "grad_norm": 0.43451038002967834, + "learning_rate": 6.142005989102803e-05, + "loss": 1.8269, + "step": 14462 + }, + { + "epoch": 4.439226519337017, + "grad_norm": 0.23981481790542603, + "learning_rate": 6.141522066107206e-05, + "loss": 1.7628, + "step": 14463 + }, + { + "epoch": 4.439533456108042, + "grad_norm": 0.25396493077278137, + "learning_rate": 6.14103813183042e-05, + "loss": 1.7913, + "step": 14464 + }, + { + "epoch": 4.439840392879067, + "grad_norm": 0.2567536532878876, + "learning_rate": 6.140554186277225e-05, + "loss": 1.7612, + "step": 14465 + }, + { + "epoch": 4.440147329650092, + "grad_norm": 0.2201337069272995, + "learning_rate": 6.140070229452406e-05, + "loss": 1.7541, + "step": 14466 + }, + { + "epoch": 4.440454266421117, + "grad_norm": 0.24202953279018402, + "learning_rate": 6.139586261360746e-05, + "loss": 1.777, + "step": 14467 + }, + { + "epoch": 4.440761203192142, + "grad_norm": 0.23891687393188477, + "learning_rate": 6.139102282007024e-05, + "loss": 1.7509, + "step": 14468 + }, + { + "epoch": 4.441068139963168, + "grad_norm": 
0.21132555603981018, + "learning_rate": 6.138618291396026e-05, + "loss": 1.7362, + "step": 14469 + }, + { + "epoch": 4.441375076734193, + "grad_norm": 0.2731861472129822, + "learning_rate": 6.138134289532536e-05, + "loss": 1.8063, + "step": 14470 + }, + { + "epoch": 4.4416820135052175, + "grad_norm": 0.29503315687179565, + "learning_rate": 6.137650276421336e-05, + "loss": 1.7193, + "step": 14471 + }, + { + "epoch": 4.441988950276243, + "grad_norm": 0.2778526544570923, + "learning_rate": 6.137166252067208e-05, + "loss": 1.7507, + "step": 14472 + }, + { + "epoch": 4.442295887047268, + "grad_norm": 0.2907710075378418, + "learning_rate": 6.136682216474938e-05, + "loss": 1.7939, + "step": 14473 + }, + { + "epoch": 4.4426028238182935, + "grad_norm": 0.4133768379688263, + "learning_rate": 6.136198169649306e-05, + "loss": 1.8012, + "step": 14474 + }, + { + "epoch": 4.442909760589319, + "grad_norm": 0.2505052983760834, + "learning_rate": 6.135714111595099e-05, + "loss": 1.8426, + "step": 14475 + }, + { + "epoch": 4.443216697360343, + "grad_norm": 0.3884379267692566, + "learning_rate": 6.135230042317099e-05, + "loss": 1.7383, + "step": 14476 + }, + { + "epoch": 4.443523634131369, + "grad_norm": 0.42902377247810364, + "learning_rate": 6.134745961820091e-05, + "loss": 1.732, + "step": 14477 + }, + { + "epoch": 4.443830570902394, + "grad_norm": 0.21782708168029785, + "learning_rate": 6.134261870108858e-05, + "loss": 1.7369, + "step": 14478 + }, + { + "epoch": 4.444137507673419, + "grad_norm": 0.4160648286342621, + "learning_rate": 6.133777767188186e-05, + "loss": 1.8083, + "step": 14479 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.5057216882705688, + "learning_rate": 6.133293653062856e-05, + "loss": 1.8971, + "step": 14480 + }, + { + "epoch": 4.44475138121547, + "grad_norm": 0.2189750075340271, + "learning_rate": 6.132809527737654e-05, + "loss": 1.7508, + "step": 14481 + }, + { + "epoch": 4.445058317986494, + "grad_norm": 0.4415782392024994, + "learning_rate": 
6.132325391217364e-05, + "loss": 1.8548, + "step": 14482 + }, + { + "epoch": 4.44536525475752, + "grad_norm": 0.3907296359539032, + "learning_rate": 6.13184124350677e-05, + "loss": 1.7879, + "step": 14483 + }, + { + "epoch": 4.445672191528545, + "grad_norm": 0.24117955565452576, + "learning_rate": 6.131357084610659e-05, + "loss": 1.7227, + "step": 14484 + }, + { + "epoch": 4.44597912829957, + "grad_norm": 0.3083679974079132, + "learning_rate": 6.130872914533815e-05, + "loss": 1.7505, + "step": 14485 + }, + { + "epoch": 4.446286065070596, + "grad_norm": 0.27730658650398254, + "learning_rate": 6.13038873328102e-05, + "loss": 1.7485, + "step": 14486 + }, + { + "epoch": 4.44659300184162, + "grad_norm": 0.28548410534858704, + "learning_rate": 6.12990454085706e-05, + "loss": 1.8145, + "step": 14487 + }, + { + "epoch": 4.4468999386126455, + "grad_norm": 0.24743106961250305, + "learning_rate": 6.129420337266724e-05, + "loss": 1.7131, + "step": 14488 + }, + { + "epoch": 4.447206875383671, + "grad_norm": 0.2899693250656128, + "learning_rate": 6.128936122514794e-05, + "loss": 1.8567, + "step": 14489 + }, + { + "epoch": 4.447513812154696, + "grad_norm": 0.259916752576828, + "learning_rate": 6.128451896606053e-05, + "loss": 1.7563, + "step": 14490 + }, + { + "epoch": 4.4478207489257215, + "grad_norm": 0.21112586557865143, + "learning_rate": 6.12796765954529e-05, + "loss": 1.6975, + "step": 14491 + }, + { + "epoch": 4.448127685696747, + "grad_norm": 0.2890239953994751, + "learning_rate": 6.12748341133729e-05, + "loss": 1.7904, + "step": 14492 + }, + { + "epoch": 4.448434622467771, + "grad_norm": 0.23394012451171875, + "learning_rate": 6.126999151986839e-05, + "loss": 1.7559, + "step": 14493 + }, + { + "epoch": 4.448741559238797, + "grad_norm": 0.3492949903011322, + "learning_rate": 6.12651488149872e-05, + "loss": 1.7734, + "step": 14494 + }, + { + "epoch": 4.449048496009822, + "grad_norm": 0.48309218883514404, + "learning_rate": 6.126030599877723e-05, + "loss": 1.7798, + "step": 
14495 + }, + { + "epoch": 4.449355432780847, + "grad_norm": 0.341146320104599, + "learning_rate": 6.12554630712863e-05, + "loss": 1.7921, + "step": 14496 + }, + { + "epoch": 4.449662369551873, + "grad_norm": 0.223160982131958, + "learning_rate": 6.125062003256229e-05, + "loss": 1.7784, + "step": 14497 + }, + { + "epoch": 4.449969306322897, + "grad_norm": 0.32664811611175537, + "learning_rate": 6.124577688265306e-05, + "loss": 1.7353, + "step": 14498 + }, + { + "epoch": 4.4502762430939224, + "grad_norm": 0.215936541557312, + "learning_rate": 6.124093362160646e-05, + "loss": 1.68, + "step": 14499 + }, + { + "epoch": 4.450583179864948, + "grad_norm": 0.26081225275993347, + "learning_rate": 6.123609024947038e-05, + "loss": 1.7107, + "step": 14500 + }, + { + "epoch": 4.450890116635973, + "grad_norm": 0.3124069571495056, + "learning_rate": 6.123124676629267e-05, + "loss": 1.7338, + "step": 14501 + }, + { + "epoch": 4.4511970534069984, + "grad_norm": 0.23125620186328888, + "learning_rate": 6.122640317212118e-05, + "loss": 1.7842, + "step": 14502 + }, + { + "epoch": 4.451503990178024, + "grad_norm": 0.27065595984458923, + "learning_rate": 6.122155946700381e-05, + "loss": 1.7284, + "step": 14503 + }, + { + "epoch": 4.451810926949048, + "grad_norm": 0.4677436053752899, + "learning_rate": 6.121671565098841e-05, + "loss": 1.8156, + "step": 14504 + }, + { + "epoch": 4.452117863720074, + "grad_norm": 0.36325082182884216, + "learning_rate": 6.121187172412285e-05, + "loss": 1.7875, + "step": 14505 + }, + { + "epoch": 4.452424800491099, + "grad_norm": 0.23409567773342133, + "learning_rate": 6.1207027686455e-05, + "loss": 1.7421, + "step": 14506 + }, + { + "epoch": 4.452731737262124, + "grad_norm": 0.36919257044792175, + "learning_rate": 6.120218353803273e-05, + "loss": 1.7545, + "step": 14507 + }, + { + "epoch": 4.45303867403315, + "grad_norm": 0.318452388048172, + "learning_rate": 6.119733927890393e-05, + "loss": 1.7179, + "step": 14508 + }, + { + "epoch": 4.453345610804174, + 
"grad_norm": 0.21279768645763397, + "learning_rate": 6.119249490911643e-05, + "loss": 1.7534, + "step": 14509 + }, + { + "epoch": 4.453652547575199, + "grad_norm": 0.30565473437309265, + "learning_rate": 6.118765042871816e-05, + "loss": 1.7962, + "step": 14510 + }, + { + "epoch": 4.453959484346225, + "grad_norm": 0.2608480453491211, + "learning_rate": 6.118280583775697e-05, + "loss": 1.7336, + "step": 14511 + }, + { + "epoch": 4.45426642111725, + "grad_norm": 0.22978845238685608, + "learning_rate": 6.117796113628075e-05, + "loss": 1.8244, + "step": 14512 + }, + { + "epoch": 4.454573357888275, + "grad_norm": 0.26357781887054443, + "learning_rate": 6.117311632433735e-05, + "loss": 1.7425, + "step": 14513 + }, + { + "epoch": 4.4548802946593, + "grad_norm": 0.22127102315425873, + "learning_rate": 6.116827140197467e-05, + "loss": 1.7679, + "step": 14514 + }, + { + "epoch": 4.455187231430325, + "grad_norm": 0.2876584231853485, + "learning_rate": 6.116342636924058e-05, + "loss": 1.8104, + "step": 14515 + }, + { + "epoch": 4.4554941682013505, + "grad_norm": 0.28290677070617676, + "learning_rate": 6.115858122618297e-05, + "loss": 1.7485, + "step": 14516 + }, + { + "epoch": 4.455801104972376, + "grad_norm": 0.21914640069007874, + "learning_rate": 6.115373597284974e-05, + "loss": 1.7736, + "step": 14517 + }, + { + "epoch": 4.456108041743401, + "grad_norm": 0.2603909969329834, + "learning_rate": 6.114889060928873e-05, + "loss": 1.7446, + "step": 14518 + }, + { + "epoch": 4.456414978514426, + "grad_norm": 0.2157236635684967, + "learning_rate": 6.114404513554784e-05, + "loss": 1.7594, + "step": 14519 + }, + { + "epoch": 4.456721915285451, + "grad_norm": 0.27622368931770325, + "learning_rate": 6.113919955167499e-05, + "loss": 1.8154, + "step": 14520 + }, + { + "epoch": 4.457028852056476, + "grad_norm": 0.27298516035079956, + "learning_rate": 6.113435385771803e-05, + "loss": 1.7458, + "step": 14521 + }, + { + "epoch": 4.457335788827502, + "grad_norm": 0.22220586240291595, + 
"learning_rate": 6.112950805372485e-05, + "loss": 1.7102, + "step": 14522 + }, + { + "epoch": 4.457642725598527, + "grad_norm": 0.19480876624584198, + "learning_rate": 6.112466213974336e-05, + "loss": 1.7696, + "step": 14523 + }, + { + "epoch": 4.457949662369552, + "grad_norm": 0.24261653423309326, + "learning_rate": 6.111981611582144e-05, + "loss": 1.8193, + "step": 14524 + }, + { + "epoch": 4.458256599140577, + "grad_norm": 0.2502967417240143, + "learning_rate": 6.111496998200697e-05, + "loss": 1.7701, + "step": 14525 + }, + { + "epoch": 4.458563535911602, + "grad_norm": 0.25764599442481995, + "learning_rate": 6.111012373834786e-05, + "loss": 1.8055, + "step": 14526 + }, + { + "epoch": 4.458870472682627, + "grad_norm": 0.24085427820682526, + "learning_rate": 6.110527738489198e-05, + "loss": 1.7592, + "step": 14527 + }, + { + "epoch": 4.459177409453653, + "grad_norm": 0.2469809502363205, + "learning_rate": 6.110043092168727e-05, + "loss": 1.6977, + "step": 14528 + }, + { + "epoch": 4.459484346224678, + "grad_norm": 0.21888838708400726, + "learning_rate": 6.109558434878159e-05, + "loss": 1.777, + "step": 14529 + }, + { + "epoch": 4.4597912829957025, + "grad_norm": 0.2094014585018158, + "learning_rate": 6.109073766622281e-05, + "loss": 1.7041, + "step": 14530 + }, + { + "epoch": 4.460098219766728, + "grad_norm": 0.23801055550575256, + "learning_rate": 6.108589087405888e-05, + "loss": 1.8392, + "step": 14531 + }, + { + "epoch": 4.460405156537753, + "grad_norm": 0.2164965718984604, + "learning_rate": 6.108104397233769e-05, + "loss": 1.7643, + "step": 14532 + }, + { + "epoch": 4.4607120933087785, + "grad_norm": 0.21322336792945862, + "learning_rate": 6.107619696110712e-05, + "loss": 1.7063, + "step": 14533 + }, + { + "epoch": 4.461019030079804, + "grad_norm": 0.29019200801849365, + "learning_rate": 6.107134984041507e-05, + "loss": 1.8254, + "step": 14534 + }, + { + "epoch": 4.461325966850829, + "grad_norm": 0.2765025496482849, + "learning_rate": 6.106650261030947e-05, 
+ "loss": 1.7609, + "step": 14535 + }, + { + "epoch": 4.461632903621854, + "grad_norm": 0.20879749953746796, + "learning_rate": 6.106165527083818e-05, + "loss": 1.7387, + "step": 14536 + }, + { + "epoch": 4.461939840392879, + "grad_norm": 0.22295843064785004, + "learning_rate": 6.105680782204913e-05, + "loss": 1.7691, + "step": 14537 + }, + { + "epoch": 4.462246777163904, + "grad_norm": 0.23502351343631744, + "learning_rate": 6.105196026399025e-05, + "loss": 1.7335, + "step": 14538 + }, + { + "epoch": 4.46255371393493, + "grad_norm": 0.22143007814884186, + "learning_rate": 6.104711259670941e-05, + "loss": 1.7338, + "step": 14539 + }, + { + "epoch": 4.462860650705955, + "grad_norm": 0.22361041605472565, + "learning_rate": 6.104226482025453e-05, + "loss": 1.7033, + "step": 14540 + }, + { + "epoch": 4.463167587476979, + "grad_norm": 0.27104905247688293, + "learning_rate": 6.10374169346735e-05, + "loss": 1.7926, + "step": 14541 + }, + { + "epoch": 4.463474524248005, + "grad_norm": 0.23564264178276062, + "learning_rate": 6.103256894001427e-05, + "loss": 1.7522, + "step": 14542 + }, + { + "epoch": 4.46378146101903, + "grad_norm": 0.2585970163345337, + "learning_rate": 6.102772083632471e-05, + "loss": 1.7755, + "step": 14543 + }, + { + "epoch": 4.464088397790055, + "grad_norm": 0.358634889125824, + "learning_rate": 6.102287262365276e-05, + "loss": 1.8092, + "step": 14544 + }, + { + "epoch": 4.464395334561081, + "grad_norm": 0.2862946689128876, + "learning_rate": 6.1018024302046314e-05, + "loss": 1.7051, + "step": 14545 + }, + { + "epoch": 4.464702271332105, + "grad_norm": 0.21907158195972443, + "learning_rate": 6.101317587155331e-05, + "loss": 1.7882, + "step": 14546 + }, + { + "epoch": 4.4650092081031305, + "grad_norm": 0.24268488585948944, + "learning_rate": 6.100832733222164e-05, + "loss": 1.7756, + "step": 14547 + }, + { + "epoch": 4.465316144874156, + "grad_norm": 0.2350744605064392, + "learning_rate": 6.1003478684099214e-05, + "loss": 1.7483, + "step": 14548 + }, + 
{ + "epoch": 4.465623081645181, + "grad_norm": 0.22902250289916992, + "learning_rate": 6.099862992723397e-05, + "loss": 1.7687, + "step": 14549 + }, + { + "epoch": 4.4659300184162065, + "grad_norm": 0.23590944707393646, + "learning_rate": 6.099378106167382e-05, + "loss": 1.8481, + "step": 14550 + }, + { + "epoch": 4.466236955187231, + "grad_norm": 0.23644296824932098, + "learning_rate": 6.098893208746668e-05, + "loss": 1.7422, + "step": 14551 + }, + { + "epoch": 4.466543891958256, + "grad_norm": 0.23782360553741455, + "learning_rate": 6.0984083004660475e-05, + "loss": 1.7852, + "step": 14552 + }, + { + "epoch": 4.466850828729282, + "grad_norm": 0.2546575665473938, + "learning_rate": 6.097923381330313e-05, + "loss": 1.8483, + "step": 14553 + }, + { + "epoch": 4.467157765500307, + "grad_norm": 0.2555409371852875, + "learning_rate": 6.097438451344254e-05, + "loss": 1.7887, + "step": 14554 + }, + { + "epoch": 4.467464702271332, + "grad_norm": 0.28074198961257935, + "learning_rate": 6.0969535105126664e-05, + "loss": 1.7521, + "step": 14555 + }, + { + "epoch": 4.467771639042358, + "grad_norm": 0.22622554004192352, + "learning_rate": 6.096468558840341e-05, + "loss": 1.8088, + "step": 14556 + }, + { + "epoch": 4.468078575813382, + "grad_norm": 0.302749902009964, + "learning_rate": 6.095983596332071e-05, + "loss": 1.8192, + "step": 14557 + }, + { + "epoch": 4.468385512584407, + "grad_norm": 0.27925750613212585, + "learning_rate": 6.0954986229926494e-05, + "loss": 1.8453, + "step": 14558 + }, + { + "epoch": 4.468692449355433, + "grad_norm": 0.2246330976486206, + "learning_rate": 6.095013638826868e-05, + "loss": 1.744, + "step": 14559 + }, + { + "epoch": 4.468999386126458, + "grad_norm": 0.26677101850509644, + "learning_rate": 6.094528643839518e-05, + "loss": 1.708, + "step": 14560 + }, + { + "epoch": 4.469306322897483, + "grad_norm": 0.23684042692184448, + "learning_rate": 6.094043638035396e-05, + "loss": 1.713, + "step": 14561 + }, + { + "epoch": 4.469613259668508, + 
"grad_norm": 0.2470075935125351, + "learning_rate": 6.093558621419294e-05, + "loss": 1.8096, + "step": 14562 + }, + { + "epoch": 4.469920196439533, + "grad_norm": 0.2775517702102661, + "learning_rate": 6.093073593996005e-05, + "loss": 1.697, + "step": 14563 + }, + { + "epoch": 4.4702271332105585, + "grad_norm": 0.21053175628185272, + "learning_rate": 6.092588555770322e-05, + "loss": 1.6894, + "step": 14564 + }, + { + "epoch": 4.470534069981584, + "grad_norm": 0.2555869221687317, + "learning_rate": 6.0921035067470366e-05, + "loss": 1.7051, + "step": 14565 + }, + { + "epoch": 4.470841006752609, + "grad_norm": 0.34468984603881836, + "learning_rate": 6.0916184469309454e-05, + "loss": 1.7317, + "step": 14566 + }, + { + "epoch": 4.4711479435236345, + "grad_norm": 0.2517752945423126, + "learning_rate": 6.0911333763268407e-05, + "loss": 1.7524, + "step": 14567 + }, + { + "epoch": 4.471454880294659, + "grad_norm": 0.2749727666378021, + "learning_rate": 6.090648294939517e-05, + "loss": 1.7045, + "step": 14568 + }, + { + "epoch": 4.471761817065684, + "grad_norm": 0.36250773072242737, + "learning_rate": 6.0901632027737673e-05, + "loss": 1.7196, + "step": 14569 + }, + { + "epoch": 4.47206875383671, + "grad_norm": 0.2317698448896408, + "learning_rate": 6.089678099834386e-05, + "loss": 1.7318, + "step": 14570 + }, + { + "epoch": 4.472375690607735, + "grad_norm": 0.2863345444202423, + "learning_rate": 6.089192986126166e-05, + "loss": 1.7798, + "step": 14571 + }, + { + "epoch": 4.47268262737876, + "grad_norm": 0.3493366241455078, + "learning_rate": 6.088707861653904e-05, + "loss": 1.7749, + "step": 14572 + }, + { + "epoch": 4.472989564149785, + "grad_norm": 0.25718605518341064, + "learning_rate": 6.0882227264223924e-05, + "loss": 1.7683, + "step": 14573 + }, + { + "epoch": 4.47329650092081, + "grad_norm": 0.2320062816143036, + "learning_rate": 6.087737580436426e-05, + "loss": 1.8296, + "step": 14574 + }, + { + "epoch": 4.473603437691835, + "grad_norm": 0.29071560502052307, + 
"learning_rate": 6.087252423700799e-05, + "loss": 1.7428, + "step": 14575 + }, + { + "epoch": 4.473910374462861, + "grad_norm": 0.24233707785606384, + "learning_rate": 6.086767256220306e-05, + "loss": 1.7332, + "step": 14576 + }, + { + "epoch": 4.474217311233886, + "grad_norm": 0.228043332695961, + "learning_rate": 6.086282077999742e-05, + "loss": 1.7697, + "step": 14577 + }, + { + "epoch": 4.474524248004911, + "grad_norm": 0.29154402017593384, + "learning_rate": 6.085796889043902e-05, + "loss": 1.8043, + "step": 14578 + }, + { + "epoch": 4.474831184775936, + "grad_norm": 0.30543211102485657, + "learning_rate": 6.0853116893575814e-05, + "loss": 1.7665, + "step": 14579 + }, + { + "epoch": 4.475138121546961, + "grad_norm": 0.22792959213256836, + "learning_rate": 6.0848264789455754e-05, + "loss": 1.729, + "step": 14580 + }, + { + "epoch": 4.475445058317987, + "grad_norm": 0.2615707218647003, + "learning_rate": 6.084341257812677e-05, + "loss": 1.7438, + "step": 14581 + }, + { + "epoch": 4.475751995089012, + "grad_norm": 0.23342981934547424, + "learning_rate": 6.083856025963681e-05, + "loss": 1.7158, + "step": 14582 + }, + { + "epoch": 4.476058931860037, + "grad_norm": 0.22279240190982819, + "learning_rate": 6.083370783403387e-05, + "loss": 1.7413, + "step": 14583 + }, + { + "epoch": 4.476365868631062, + "grad_norm": 0.28867462277412415, + "learning_rate": 6.082885530136587e-05, + "loss": 1.7932, + "step": 14584 + }, + { + "epoch": 4.476672805402087, + "grad_norm": 0.2947152256965637, + "learning_rate": 6.082400266168078e-05, + "loss": 1.8986, + "step": 14585 + }, + { + "epoch": 4.476979742173112, + "grad_norm": 0.2948935627937317, + "learning_rate": 6.0819149915026555e-05, + "loss": 1.9134, + "step": 14586 + }, + { + "epoch": 4.477286678944138, + "grad_norm": 0.4436163902282715, + "learning_rate": 6.081429706145114e-05, + "loss": 1.7616, + "step": 14587 + }, + { + "epoch": 4.477593615715163, + "grad_norm": 0.4879693388938904, + "learning_rate": 6.080944410100249e-05, + 
"loss": 1.8155, + "step": 14588 + }, + { + "epoch": 4.4779005524861875, + "grad_norm": 0.29742667078971863, + "learning_rate": 6.08045910337286e-05, + "loss": 1.7428, + "step": 14589 + }, + { + "epoch": 4.478207489257213, + "grad_norm": 0.2994751036167145, + "learning_rate": 6.0799737859677395e-05, + "loss": 1.7764, + "step": 14590 + }, + { + "epoch": 4.478514426028238, + "grad_norm": 0.46379905939102173, + "learning_rate": 6.079488457889686e-05, + "loss": 1.7289, + "step": 14591 + }, + { + "epoch": 4.4788213627992635, + "grad_norm": 0.3511717617511749, + "learning_rate": 6.0790031191434946e-05, + "loss": 1.7658, + "step": 14592 + }, + { + "epoch": 4.479128299570289, + "grad_norm": 0.22678083181381226, + "learning_rate": 6.0785177697339626e-05, + "loss": 1.7973, + "step": 14593 + }, + { + "epoch": 4.479435236341313, + "grad_norm": 0.31201767921447754, + "learning_rate": 6.0780324096658837e-05, + "loss": 1.7542, + "step": 14594 + }, + { + "epoch": 4.479742173112339, + "grad_norm": 0.23759113252162933, + "learning_rate": 6.077547038944058e-05, + "loss": 1.7191, + "step": 14595 + }, + { + "epoch": 4.480049109883364, + "grad_norm": 0.25801756978034973, + "learning_rate": 6.077061657573282e-05, + "loss": 1.8229, + "step": 14596 + }, + { + "epoch": 4.480356046654389, + "grad_norm": 0.3435722887516022, + "learning_rate": 6.0765762655583514e-05, + "loss": 1.7633, + "step": 14597 + }, + { + "epoch": 4.480662983425415, + "grad_norm": 0.2710443437099457, + "learning_rate": 6.076090862904063e-05, + "loss": 1.8126, + "step": 14598 + }, + { + "epoch": 4.48096992019644, + "grad_norm": 0.25750285387039185, + "learning_rate": 6.075605449615212e-05, + "loss": 1.7382, + "step": 14599 + }, + { + "epoch": 4.481276856967464, + "grad_norm": 0.3638051152229309, + "learning_rate": 6.075120025696598e-05, + "loss": 1.8191, + "step": 14600 + }, + { + "epoch": 4.48158379373849, + "grad_norm": 0.24185293912887573, + "learning_rate": 6.074634591153019e-05, + "loss": 1.7637, + "step": 14601 + }, 
+ { + "epoch": 4.481890730509515, + "grad_norm": 0.317283570766449, + "learning_rate": 6.0741491459892707e-05, + "loss": 1.7805, + "step": 14602 + }, + { + "epoch": 4.48219766728054, + "grad_norm": 0.33884385228157043, + "learning_rate": 6.073663690210151e-05, + "loss": 1.7719, + "step": 14603 + }, + { + "epoch": 4.482504604051566, + "grad_norm": 0.2554258704185486, + "learning_rate": 6.073178223820457e-05, + "loss": 1.836, + "step": 14604 + }, + { + "epoch": 4.48281154082259, + "grad_norm": 0.3363535702228546, + "learning_rate": 6.072692746824987e-05, + "loss": 1.8249, + "step": 14605 + }, + { + "epoch": 4.4831184775936155, + "grad_norm": 0.36090195178985596, + "learning_rate": 6.072207259228537e-05, + "loss": 1.733, + "step": 14606 + }, + { + "epoch": 4.483425414364641, + "grad_norm": 0.21928483247756958, + "learning_rate": 6.071721761035909e-05, + "loss": 1.7413, + "step": 14607 + }, + { + "epoch": 4.483732351135666, + "grad_norm": 0.4256608486175537, + "learning_rate": 6.071236252251897e-05, + "loss": 1.7585, + "step": 14608 + }, + { + "epoch": 4.4840392879066915, + "grad_norm": 0.41980308294296265, + "learning_rate": 6.0707507328813007e-05, + "loss": 1.7584, + "step": 14609 + }, + { + "epoch": 4.484346224677717, + "grad_norm": 0.200295090675354, + "learning_rate": 6.0702652029289186e-05, + "loss": 1.7492, + "step": 14610 + }, + { + "epoch": 4.484653161448741, + "grad_norm": 0.41847771406173706, + "learning_rate": 6.069779662399549e-05, + "loss": 1.8101, + "step": 14611 + }, + { + "epoch": 4.484960098219767, + "grad_norm": 0.4846353530883789, + "learning_rate": 6.069294111297987e-05, + "loss": 1.8227, + "step": 14612 + }, + { + "epoch": 4.485267034990792, + "grad_norm": 0.23216098546981812, + "learning_rate": 6.068808549629036e-05, + "loss": 1.6811, + "step": 14613 + }, + { + "epoch": 4.485573971761817, + "grad_norm": 0.34903186559677124, + "learning_rate": 6.0683229773974934e-05, + "loss": 1.6858, + "step": 14614 + }, + { + "epoch": 4.485880908532843, + 
"grad_norm": 0.4349122941493988, + "learning_rate": 6.0678373946081556e-05, + "loss": 1.7704, + "step": 14615 + }, + { + "epoch": 4.486187845303867, + "grad_norm": 0.25738775730133057, + "learning_rate": 6.067351801265824e-05, + "loss": 1.7487, + "step": 14616 + }, + { + "epoch": 4.486494782074892, + "grad_norm": 0.3052736818790436, + "learning_rate": 6.0668661973752936e-05, + "loss": 1.7528, + "step": 14617 + }, + { + "epoch": 4.486801718845918, + "grad_norm": 0.3400498628616333, + "learning_rate": 6.066380582941368e-05, + "loss": 1.7414, + "step": 14618 + }, + { + "epoch": 4.487108655616943, + "grad_norm": 0.28251948952674866, + "learning_rate": 6.065894957968845e-05, + "loss": 1.8078, + "step": 14619 + }, + { + "epoch": 4.487415592387968, + "grad_norm": 0.26907965540885925, + "learning_rate": 6.0654093224625216e-05, + "loss": 1.8143, + "step": 14620 + }, + { + "epoch": 4.487722529158993, + "grad_norm": 0.2821955978870392, + "learning_rate": 6.064923676427201e-05, + "loss": 1.7163, + "step": 14621 + }, + { + "epoch": 4.488029465930018, + "grad_norm": 0.2223028987646103, + "learning_rate": 6.0644380198676786e-05, + "loss": 1.704, + "step": 14622 + }, + { + "epoch": 4.4883364027010435, + "grad_norm": 0.25243067741394043, + "learning_rate": 6.063952352788755e-05, + "loss": 1.7236, + "step": 14623 + }, + { + "epoch": 4.488643339472069, + "grad_norm": 0.30026015639305115, + "learning_rate": 6.063466675195233e-05, + "loss": 1.7575, + "step": 14624 + }, + { + "epoch": 4.488950276243094, + "grad_norm": 0.2055491805076599, + "learning_rate": 6.0629809870919085e-05, + "loss": 1.7294, + "step": 14625 + }, + { + "epoch": 4.4892572130141195, + "grad_norm": 0.2507593035697937, + "learning_rate": 6.0624952884835836e-05, + "loss": 1.762, + "step": 14626 + }, + { + "epoch": 4.489564149785144, + "grad_norm": 0.21385909616947174, + "learning_rate": 6.0620095793750576e-05, + "loss": 1.7396, + "step": 14627 + }, + { + "epoch": 4.489871086556169, + "grad_norm": 0.21926651895046234, + 
"learning_rate": 6.06152385977113e-05, + "loss": 1.7863, + "step": 14628 + }, + { + "epoch": 4.490178023327195, + "grad_norm": 0.21950845420360565, + "learning_rate": 6.0610381296766016e-05, + "loss": 1.7576, + "step": 14629 + }, + { + "epoch": 4.49048496009822, + "grad_norm": 0.2030971795320511, + "learning_rate": 6.0605523890962736e-05, + "loss": 1.7069, + "step": 14630 + }, + { + "epoch": 4.490791896869245, + "grad_norm": 0.23991432785987854, + "learning_rate": 6.0600666380349436e-05, + "loss": 1.7598, + "step": 14631 + }, + { + "epoch": 4.49109883364027, + "grad_norm": 0.23766861855983734, + "learning_rate": 6.059580876497415e-05, + "loss": 1.7687, + "step": 14632 + }, + { + "epoch": 4.491405770411295, + "grad_norm": 0.2361454963684082, + "learning_rate": 6.059095104488487e-05, + "loss": 1.7883, + "step": 14633 + }, + { + "epoch": 4.49171270718232, + "grad_norm": 0.3128328323364258, + "learning_rate": 6.058609322012958e-05, + "loss": 1.8087, + "step": 14634 + }, + { + "epoch": 4.492019643953346, + "grad_norm": 0.2958957850933075, + "learning_rate": 6.0581235290756335e-05, + "loss": 1.782, + "step": 14635 + }, + { + "epoch": 4.492326580724371, + "grad_norm": 0.2197243571281433, + "learning_rate": 6.057637725681312e-05, + "loss": 1.7408, + "step": 14636 + }, + { + "epoch": 4.4926335174953955, + "grad_norm": 0.22227831184864044, + "learning_rate": 6.0571519118347944e-05, + "loss": 1.734, + "step": 14637 + }, + { + "epoch": 4.492940454266421, + "grad_norm": 0.2784527540206909, + "learning_rate": 6.056666087540882e-05, + "loss": 1.8017, + "step": 14638 + }, + { + "epoch": 4.493247391037446, + "grad_norm": 0.21929821372032166, + "learning_rate": 6.056180252804377e-05, + "loss": 1.7271, + "step": 14639 + }, + { + "epoch": 4.4935543278084715, + "grad_norm": 0.2156134843826294, + "learning_rate": 6.055694407630077e-05, + "loss": 1.8082, + "step": 14640 + }, + { + "epoch": 4.493861264579497, + "grad_norm": 0.22672387957572937, + "learning_rate": 6.0552085520227875e-05, + 
"loss": 1.7506, + "step": 14641 + }, + { + "epoch": 4.494168201350522, + "grad_norm": 0.228785440325737, + "learning_rate": 6.0547226859873086e-05, + "loss": 1.7023, + "step": 14642 + }, + { + "epoch": 4.494475138121547, + "grad_norm": 0.19483685493469238, + "learning_rate": 6.054236809528443e-05, + "loss": 1.6879, + "step": 14643 + }, + { + "epoch": 4.494782074892572, + "grad_norm": 0.24911309778690338, + "learning_rate": 6.0537509226509904e-05, + "loss": 1.7856, + "step": 14644 + }, + { + "epoch": 4.495089011663597, + "grad_norm": 0.24811938405036926, + "learning_rate": 6.053265025359753e-05, + "loss": 1.7581, + "step": 14645 + }, + { + "epoch": 4.495395948434623, + "grad_norm": 0.2487260401248932, + "learning_rate": 6.052779117659534e-05, + "loss": 1.7536, + "step": 14646 + }, + { + "epoch": 4.495702885205648, + "grad_norm": 0.2594854235649109, + "learning_rate": 6.052293199555136e-05, + "loss": 1.7822, + "step": 14647 + }, + { + "epoch": 4.496009821976672, + "grad_norm": 0.22837325930595398, + "learning_rate": 6.051807271051359e-05, + "loss": 1.7542, + "step": 14648 + }, + { + "epoch": 4.496316758747698, + "grad_norm": 0.23106649518013, + "learning_rate": 6.051321332153005e-05, + "loss": 1.7758, + "step": 14649 + }, + { + "epoch": 4.496623695518723, + "grad_norm": 0.29424673318862915, + "learning_rate": 6.050835382864878e-05, + "loss": 1.8335, + "step": 14650 + }, + { + "epoch": 4.496930632289748, + "grad_norm": 0.28297343850135803, + "learning_rate": 6.050349423191779e-05, + "loss": 1.7711, + "step": 14651 + }, + { + "epoch": 4.497237569060774, + "grad_norm": 0.2001795768737793, + "learning_rate": 6.049863453138511e-05, + "loss": 1.7008, + "step": 14652 + }, + { + "epoch": 4.497544505831799, + "grad_norm": 0.35177022218704224, + "learning_rate": 6.04937747270988e-05, + "loss": 1.7763, + "step": 14653 + }, + { + "epoch": 4.4978514426028235, + "grad_norm": 0.28870898485183716, + "learning_rate": 6.0488914819106835e-05, + "loss": 1.7373, + "step": 14654 + }, + { 
+ "epoch": 4.498158379373849, + "grad_norm": 0.23962664604187012, + "learning_rate": 6.048405480745727e-05, + "loss": 1.7278, + "step": 14655 + }, + { + "epoch": 4.498465316144874, + "grad_norm": 0.324505478143692, + "learning_rate": 6.047919469219813e-05, + "loss": 1.7674, + "step": 14656 + }, + { + "epoch": 4.4987722529158995, + "grad_norm": 0.38313817977905273, + "learning_rate": 6.047433447337744e-05, + "loss": 1.789, + "step": 14657 + }, + { + "epoch": 4.499079189686925, + "grad_norm": 0.2101358324289322, + "learning_rate": 6.046947415104324e-05, + "loss": 1.7331, + "step": 14658 + }, + { + "epoch": 4.499386126457949, + "grad_norm": 0.3388524353504181, + "learning_rate": 6.046461372524357e-05, + "loss": 1.8467, + "step": 14659 + }, + { + "epoch": 4.499693063228975, + "grad_norm": 0.3360123634338379, + "learning_rate": 6.045975319602645e-05, + "loss": 1.8427, + "step": 14660 + }, + { + "epoch": 4.5, + "grad_norm": 0.27596545219421387, + "learning_rate": 6.0454892563439914e-05, + "loss": 1.7768, + "step": 14661 + }, + { + "epoch": 4.500306936771025, + "grad_norm": 0.2580861747264862, + "learning_rate": 6.0450031827532e-05, + "loss": 1.763, + "step": 14662 + }, + { + "epoch": 4.500613873542051, + "grad_norm": 0.3521091938018799, + "learning_rate": 6.044517098835074e-05, + "loss": 1.7118, + "step": 14663 + }, + { + "epoch": 4.500920810313076, + "grad_norm": 0.29412439465522766, + "learning_rate": 6.0440310045944204e-05, + "loss": 1.7252, + "step": 14664 + }, + { + "epoch": 4.5012277470841005, + "grad_norm": 0.23845252394676208, + "learning_rate": 6.043544900036039e-05, + "loss": 1.7622, + "step": 14665 + }, + { + "epoch": 4.501534683855126, + "grad_norm": 0.22957031428813934, + "learning_rate": 6.043058785164736e-05, + "loss": 1.7527, + "step": 14666 + }, + { + "epoch": 4.501841620626151, + "grad_norm": 0.2564462721347809, + "learning_rate": 6.042572659985314e-05, + "loss": 1.801, + "step": 14667 + }, + { + "epoch": 4.5021485573971765, + "grad_norm": 
0.22588051855564117, + "learning_rate": 6.042086524502576e-05, + "loss": 1.7387, + "step": 14668 + }, + { + "epoch": 4.502455494168201, + "grad_norm": 0.2609740197658539, + "learning_rate": 6.0416003787213306e-05, + "loss": 1.7615, + "step": 14669 + }, + { + "epoch": 4.502762430939226, + "grad_norm": 0.2535521984100342, + "learning_rate": 6.041114222646379e-05, + "loss": 1.7398, + "step": 14670 + }, + { + "epoch": 4.503069367710252, + "grad_norm": 0.2512127757072449, + "learning_rate": 6.040628056282527e-05, + "loss": 1.7679, + "step": 14671 + }, + { + "epoch": 4.503376304481277, + "grad_norm": 0.2438639998435974, + "learning_rate": 6.0401418796345774e-05, + "loss": 1.7, + "step": 14672 + }, + { + "epoch": 4.503683241252302, + "grad_norm": 0.23428042232990265, + "learning_rate": 6.0396556927073376e-05, + "loss": 1.7748, + "step": 14673 + }, + { + "epoch": 4.503990178023328, + "grad_norm": 0.22894345223903656, + "learning_rate": 6.03916949550561e-05, + "loss": 1.7881, + "step": 14674 + }, + { + "epoch": 4.504297114794352, + "grad_norm": 0.24813716113567352, + "learning_rate": 6.0386832880342006e-05, + "loss": 1.7676, + "step": 14675 + }, + { + "epoch": 4.504604051565377, + "grad_norm": 0.23448842763900757, + "learning_rate": 6.038197070297914e-05, + "loss": 1.7828, + "step": 14676 + }, + { + "epoch": 4.504910988336403, + "grad_norm": 0.25302332639694214, + "learning_rate": 6.037710842301556e-05, + "loss": 1.8061, + "step": 14677 + }, + { + "epoch": 4.505217925107428, + "grad_norm": 0.2411813735961914, + "learning_rate": 6.0372246040499305e-05, + "loss": 1.6901, + "step": 14678 + }, + { + "epoch": 4.505524861878453, + "grad_norm": 0.3154819905757904, + "learning_rate": 6.036738355547844e-05, + "loss": 1.7472, + "step": 14679 + }, + { + "epoch": 4.505831798649478, + "grad_norm": 0.2935639023780823, + "learning_rate": 6.0362520968001014e-05, + "loss": 1.7508, + "step": 14680 + }, + { + "epoch": 4.506138735420503, + "grad_norm": 0.27064070105552673, + "learning_rate": 
6.035765827811508e-05, + "loss": 1.8133, + "step": 14681 + }, + { + "epoch": 4.5064456721915285, + "grad_norm": 0.23748525977134705, + "learning_rate": 6.03527954858687e-05, + "loss": 1.7742, + "step": 14682 + }, + { + "epoch": 4.506752608962554, + "grad_norm": 0.216410830616951, + "learning_rate": 6.034793259130992e-05, + "loss": 1.7448, + "step": 14683 + }, + { + "epoch": 4.507059545733579, + "grad_norm": 0.23339977860450745, + "learning_rate": 6.034306959448681e-05, + "loss": 1.7437, + "step": 14684 + }, + { + "epoch": 4.5073664825046045, + "grad_norm": 0.23951120674610138, + "learning_rate": 6.0338206495447414e-05, + "loss": 1.7535, + "step": 14685 + }, + { + "epoch": 4.507673419275629, + "grad_norm": 0.22137518227100372, + "learning_rate": 6.0333343294239816e-05, + "loss": 1.7537, + "step": 14686 + }, + { + "epoch": 4.507980356046654, + "grad_norm": 0.2550075054168701, + "learning_rate": 6.032847999091206e-05, + "loss": 1.8069, + "step": 14687 + }, + { + "epoch": 4.50828729281768, + "grad_norm": 0.2166420966386795, + "learning_rate": 6.032361658551221e-05, + "loss": 1.7746, + "step": 14688 + }, + { + "epoch": 4.508594229588705, + "grad_norm": 0.21926096081733704, + "learning_rate": 6.031875307808833e-05, + "loss": 1.7848, + "step": 14689 + }, + { + "epoch": 4.50890116635973, + "grad_norm": 0.27769652009010315, + "learning_rate": 6.031388946868848e-05, + "loss": 1.7563, + "step": 14690 + }, + { + "epoch": 4.509208103130755, + "grad_norm": 0.23417410254478455, + "learning_rate": 6.030902575736074e-05, + "loss": 1.7475, + "step": 14691 + }, + { + "epoch": 4.50951503990178, + "grad_norm": 0.25454118847846985, + "learning_rate": 6.030416194415314e-05, + "loss": 1.7416, + "step": 14692 + }, + { + "epoch": 4.509821976672805, + "grad_norm": 0.3118220567703247, + "learning_rate": 6.029929802911379e-05, + "loss": 1.8001, + "step": 14693 + }, + { + "epoch": 4.510128913443831, + "grad_norm": 0.2338017225265503, + "learning_rate": 6.029443401229075e-05, + "loss": 1.7243, + 
"step": 14694 + }, + { + "epoch": 4.510435850214856, + "grad_norm": 0.2490454763174057, + "learning_rate": 6.028956989373207e-05, + "loss": 1.7866, + "step": 14695 + }, + { + "epoch": 4.510742786985881, + "grad_norm": 0.2579275369644165, + "learning_rate": 6.028470567348582e-05, + "loss": 1.7594, + "step": 14696 + }, + { + "epoch": 4.511049723756906, + "grad_norm": 0.23982174694538116, + "learning_rate": 6.0279841351600094e-05, + "loss": 1.7444, + "step": 14697 + }, + { + "epoch": 4.511356660527931, + "grad_norm": 0.2160159945487976, + "learning_rate": 6.027497692812295e-05, + "loss": 1.7002, + "step": 14698 + }, + { + "epoch": 4.5116635972989565, + "grad_norm": 0.24604511260986328, + "learning_rate": 6.0270112403102455e-05, + "loss": 1.7654, + "step": 14699 + }, + { + "epoch": 4.511970534069982, + "grad_norm": 0.21978263556957245, + "learning_rate": 6.026524777658669e-05, + "loss": 1.7278, + "step": 14700 + }, + { + "epoch": 4.512277470841006, + "grad_norm": 0.2814212441444397, + "learning_rate": 6.026038304862373e-05, + "loss": 1.7743, + "step": 14701 + }, + { + "epoch": 4.512584407612032, + "grad_norm": 0.23798944056034088, + "learning_rate": 6.025551821926165e-05, + "loss": 1.7348, + "step": 14702 + }, + { + "epoch": 4.512891344383057, + "grad_norm": 0.22415988147258759, + "learning_rate": 6.025065328854853e-05, + "loss": 1.7973, + "step": 14703 + }, + { + "epoch": 4.513198281154082, + "grad_norm": 0.34614792466163635, + "learning_rate": 6.0245788256532445e-05, + "loss": 1.7263, + "step": 14704 + }, + { + "epoch": 4.513505217925108, + "grad_norm": 0.333918958902359, + "learning_rate": 6.0240923123261485e-05, + "loss": 1.7305, + "step": 14705 + }, + { + "epoch": 4.513812154696133, + "grad_norm": 0.22231793403625488, + "learning_rate": 6.02360578887837e-05, + "loss": 1.806, + "step": 14706 + }, + { + "epoch": 4.514119091467157, + "grad_norm": 0.23323194682598114, + "learning_rate": 6.023119255314721e-05, + "loss": 1.7076, + "step": 14707 + }, + { + "epoch": 
4.514426028238183, + "grad_norm": 0.26695477962493896, + "learning_rate": 6.022632711640007e-05, + "loss": 1.775, + "step": 14708 + }, + { + "epoch": 4.514732965009208, + "grad_norm": 0.21446476876735687, + "learning_rate": 6.0221461578590364e-05, + "loss": 1.7524, + "step": 14709 + }, + { + "epoch": 4.515039901780233, + "grad_norm": 0.2677358090877533, + "learning_rate": 6.0216595939766204e-05, + "loss": 1.7513, + "step": 14710 + }, + { + "epoch": 4.515346838551259, + "grad_norm": 0.28648239374160767, + "learning_rate": 6.021173019997565e-05, + "loss": 1.7249, + "step": 14711 + }, + { + "epoch": 4.515653775322283, + "grad_norm": 0.2178548276424408, + "learning_rate": 6.020686435926678e-05, + "loss": 1.7502, + "step": 14712 + }, + { + "epoch": 4.5159607120933085, + "grad_norm": 0.3391740024089813, + "learning_rate": 6.02019984176877e-05, + "loss": 1.6828, + "step": 14713 + }, + { + "epoch": 4.516267648864334, + "grad_norm": 0.25222229957580566, + "learning_rate": 6.01971323752865e-05, + "loss": 1.6982, + "step": 14714 + }, + { + "epoch": 4.516574585635359, + "grad_norm": 0.28776636719703674, + "learning_rate": 6.019226623211125e-05, + "loss": 1.8595, + "step": 14715 + }, + { + "epoch": 4.5168815224063845, + "grad_norm": 0.3240084648132324, + "learning_rate": 6.018739998821006e-05, + "loss": 1.7461, + "step": 14716 + }, + { + "epoch": 4.51718845917741, + "grad_norm": 0.26735052466392517, + "learning_rate": 6.0182533643631015e-05, + "loss": 1.7955, + "step": 14717 + }, + { + "epoch": 4.517495395948434, + "grad_norm": 0.24573692679405212, + "learning_rate": 6.017766719842219e-05, + "loss": 1.7441, + "step": 14718 + }, + { + "epoch": 4.51780233271946, + "grad_norm": 0.27401313185691833, + "learning_rate": 6.01728006526317e-05, + "loss": 1.7399, + "step": 14719 + }, + { + "epoch": 4.518109269490485, + "grad_norm": 0.23578806221485138, + "learning_rate": 6.016793400630763e-05, + "loss": 1.7936, + "step": 14720 + }, + { + "epoch": 4.51841620626151, + "grad_norm": 
0.27763426303863525, + "learning_rate": 6.0163067259498074e-05, + "loss": 1.7263, + "step": 14721 + }, + { + "epoch": 4.518723143032536, + "grad_norm": 0.27102044224739075, + "learning_rate": 6.015820041225113e-05, + "loss": 1.7085, + "step": 14722 + }, + { + "epoch": 4.51903007980356, + "grad_norm": 0.2046152651309967, + "learning_rate": 6.01533334646149e-05, + "loss": 1.7602, + "step": 14723 + }, + { + "epoch": 4.519337016574585, + "grad_norm": 0.2645253837108612, + "learning_rate": 6.0148466416637484e-05, + "loss": 1.7729, + "step": 14724 + }, + { + "epoch": 4.519643953345611, + "grad_norm": 0.27467650175094604, + "learning_rate": 6.014359926836697e-05, + "loss": 1.7834, + "step": 14725 + }, + { + "epoch": 4.519950890116636, + "grad_norm": 0.30357635021209717, + "learning_rate": 6.013873201985145e-05, + "loss": 1.8685, + "step": 14726 + }, + { + "epoch": 4.520257826887661, + "grad_norm": 0.22923336923122406, + "learning_rate": 6.013386467113905e-05, + "loss": 1.7531, + "step": 14727 + }, + { + "epoch": 4.520564763658687, + "grad_norm": 0.2792156934738159, + "learning_rate": 6.012899722227786e-05, + "loss": 1.7927, + "step": 14728 + }, + { + "epoch": 4.520871700429711, + "grad_norm": 0.286161869764328, + "learning_rate": 6.012412967331598e-05, + "loss": 1.77, + "step": 14729 + }, + { + "epoch": 4.5211786372007365, + "grad_norm": 0.23964659869670868, + "learning_rate": 6.011926202430151e-05, + "loss": 1.7873, + "step": 14730 + }, + { + "epoch": 4.521485573971762, + "grad_norm": 0.2250162959098816, + "learning_rate": 6.011439427528258e-05, + "loss": 1.741, + "step": 14731 + }, + { + "epoch": 4.521792510742787, + "grad_norm": 0.2797175347805023, + "learning_rate": 6.010952642630726e-05, + "loss": 1.7482, + "step": 14732 + }, + { + "epoch": 4.5220994475138125, + "grad_norm": 0.22159560024738312, + "learning_rate": 6.010465847742368e-05, + "loss": 1.7591, + "step": 14733 + }, + { + "epoch": 4.522406384284837, + "grad_norm": 0.26638463139533997, + "learning_rate": 
6.009979042867995e-05, + "loss": 1.8564, + "step": 14734 + }, + { + "epoch": 4.522713321055862, + "grad_norm": 0.2972821891307831, + "learning_rate": 6.009492228012416e-05, + "loss": 1.7569, + "step": 14735 + }, + { + "epoch": 4.523020257826888, + "grad_norm": 0.28108885884284973, + "learning_rate": 6.0090054031804444e-05, + "loss": 1.7256, + "step": 14736 + }, + { + "epoch": 4.523327194597913, + "grad_norm": 0.22359851002693176, + "learning_rate": 6.008518568376888e-05, + "loss": 1.7342, + "step": 14737 + }, + { + "epoch": 4.523634131368938, + "grad_norm": 0.2620728015899658, + "learning_rate": 6.008031723606562e-05, + "loss": 1.7703, + "step": 14738 + }, + { + "epoch": 4.523941068139964, + "grad_norm": 0.2641485333442688, + "learning_rate": 6.007544868874274e-05, + "loss": 1.6944, + "step": 14739 + }, + { + "epoch": 4.524248004910988, + "grad_norm": 0.24957752227783203, + "learning_rate": 6.007058004184839e-05, + "loss": 1.7746, + "step": 14740 + }, + { + "epoch": 4.524554941682013, + "grad_norm": 0.29830998182296753, + "learning_rate": 6.006571129543065e-05, + "loss": 1.7718, + "step": 14741 + }, + { + "epoch": 4.524861878453039, + "grad_norm": 0.32740798592567444, + "learning_rate": 6.006084244953766e-05, + "loss": 1.8194, + "step": 14742 + }, + { + "epoch": 4.525168815224064, + "grad_norm": 0.2614956796169281, + "learning_rate": 6.005597350421751e-05, + "loss": 1.7078, + "step": 14743 + }, + { + "epoch": 4.525475751995089, + "grad_norm": 0.23940515518188477, + "learning_rate": 6.005110445951836e-05, + "loss": 1.7488, + "step": 14744 + }, + { + "epoch": 4.525782688766114, + "grad_norm": 0.25485914945602417, + "learning_rate": 6.004623531548829e-05, + "loss": 1.7705, + "step": 14745 + }, + { + "epoch": 4.526089625537139, + "grad_norm": 0.213532954454422, + "learning_rate": 6.0041366072175445e-05, + "loss": 1.7501, + "step": 14746 + }, + { + "epoch": 4.526396562308165, + "grad_norm": 0.2420104295015335, + "learning_rate": 6.003649672962792e-05, + "loss": 1.717, + 
"step": 14747 + }, + { + "epoch": 4.52670349907919, + "grad_norm": 0.26179102063179016, + "learning_rate": 6.0031627287893865e-05, + "loss": 1.7665, + "step": 14748 + }, + { + "epoch": 4.527010435850215, + "grad_norm": 0.22032082080841064, + "learning_rate": 6.002675774702139e-05, + "loss": 1.7555, + "step": 14749 + }, + { + "epoch": 4.52731737262124, + "grad_norm": 0.23915240168571472, + "learning_rate": 6.002188810705861e-05, + "loss": 1.8219, + "step": 14750 + }, + { + "epoch": 4.527624309392265, + "grad_norm": 0.2275150567293167, + "learning_rate": 6.0017018368053665e-05, + "loss": 1.7418, + "step": 14751 + }, + { + "epoch": 4.52793124616329, + "grad_norm": 0.2349669486284256, + "learning_rate": 6.001214853005467e-05, + "loss": 1.7814, + "step": 14752 + }, + { + "epoch": 4.528238182934316, + "grad_norm": 0.29985731840133667, + "learning_rate": 6.000727859310975e-05, + "loss": 1.7109, + "step": 14753 + }, + { + "epoch": 4.528545119705341, + "grad_norm": 0.27282044291496277, + "learning_rate": 6.0002408557267044e-05, + "loss": 1.7806, + "step": 14754 + }, + { + "epoch": 4.5288520564763655, + "grad_norm": 0.20906320214271545, + "learning_rate": 5.9997538422574675e-05, + "loss": 1.7221, + "step": 14755 + }, + { + "epoch": 4.529158993247391, + "grad_norm": 0.24553455412387848, + "learning_rate": 5.999266818908076e-05, + "loss": 1.793, + "step": 14756 + }, + { + "epoch": 4.529465930018416, + "grad_norm": 0.29730647802352905, + "learning_rate": 5.998779785683345e-05, + "loss": 1.7597, + "step": 14757 + }, + { + "epoch": 4.5297728667894415, + "grad_norm": 0.28297582268714905, + "learning_rate": 5.998292742588087e-05, + "loss": 1.7459, + "step": 14758 + }, + { + "epoch": 4.530079803560467, + "grad_norm": 0.21853844821453094, + "learning_rate": 5.997805689627115e-05, + "loss": 1.7234, + "step": 14759 + }, + { + "epoch": 4.530386740331492, + "grad_norm": 0.2997361421585083, + "learning_rate": 5.997318626805242e-05, + "loss": 1.7294, + "step": 14760 + }, + { + "epoch": 
4.530693677102517, + "grad_norm": 0.3298671543598175, + "learning_rate": 5.9968315541272804e-05, + "loss": 1.7837, + "step": 14761 + }, + { + "epoch": 4.531000613873542, + "grad_norm": 0.22812490165233612, + "learning_rate": 5.996344471598047e-05, + "loss": 1.7509, + "step": 14762 + }, + { + "epoch": 4.531307550644567, + "grad_norm": 0.3179669678211212, + "learning_rate": 5.995857379222354e-05, + "loss": 1.8354, + "step": 14763 + }, + { + "epoch": 4.531614487415593, + "grad_norm": 0.3072827458381653, + "learning_rate": 5.9953702770050135e-05, + "loss": 1.8051, + "step": 14764 + }, + { + "epoch": 4.531921424186618, + "grad_norm": 0.19386722147464752, + "learning_rate": 5.994883164950841e-05, + "loss": 1.7093, + "step": 14765 + }, + { + "epoch": 4.532228360957642, + "grad_norm": 0.2380950152873993, + "learning_rate": 5.99439604306465e-05, + "loss": 1.7547, + "step": 14766 + }, + { + "epoch": 4.532535297728668, + "grad_norm": 0.32604947686195374, + "learning_rate": 5.993908911351254e-05, + "loss": 1.8708, + "step": 14767 + }, + { + "epoch": 4.532842234499693, + "grad_norm": 0.2436954528093338, + "learning_rate": 5.993421769815468e-05, + "loss": 1.7272, + "step": 14768 + }, + { + "epoch": 4.533149171270718, + "grad_norm": 0.2470337301492691, + "learning_rate": 5.992934618462105e-05, + "loss": 1.7242, + "step": 14769 + }, + { + "epoch": 4.533456108041744, + "grad_norm": 0.25720325112342834, + "learning_rate": 5.992447457295981e-05, + "loss": 1.7219, + "step": 14770 + }, + { + "epoch": 4.533763044812769, + "grad_norm": 0.2518918812274933, + "learning_rate": 5.991960286321909e-05, + "loss": 1.7916, + "step": 14771 + }, + { + "epoch": 4.5340699815837935, + "grad_norm": 0.2561487853527069, + "learning_rate": 5.9914731055447037e-05, + "loss": 1.7695, + "step": 14772 + }, + { + "epoch": 4.534376918354819, + "grad_norm": 0.25361356139183044, + "learning_rate": 5.9909859149691804e-05, + "loss": 1.7464, + "step": 14773 + }, + { + "epoch": 4.534683855125844, + "grad_norm": 
0.22827522456645966, + "learning_rate": 5.9904987146001545e-05, + "loss": 1.7288, + "step": 14774 + }, + { + "epoch": 4.5349907918968695, + "grad_norm": 0.2417261302471161, + "learning_rate": 5.9900115044424385e-05, + "loss": 1.7311, + "step": 14775 + }, + { + "epoch": 4.535297728667894, + "grad_norm": 0.20756755769252777, + "learning_rate": 5.9895242845008495e-05, + "loss": 1.7799, + "step": 14776 + }, + { + "epoch": 4.535604665438919, + "grad_norm": 0.21999207139015198, + "learning_rate": 5.989037054780201e-05, + "loss": 1.7782, + "step": 14777 + }, + { + "epoch": 4.535911602209945, + "grad_norm": 0.22863444685935974, + "learning_rate": 5.988549815285308e-05, + "loss": 1.7869, + "step": 14778 + }, + { + "epoch": 4.53621853898097, + "grad_norm": 0.23033374547958374, + "learning_rate": 5.988062566020987e-05, + "loss": 1.7328, + "step": 14779 + }, + { + "epoch": 4.536525475751995, + "grad_norm": 0.21903404593467712, + "learning_rate": 5.987575306992053e-05, + "loss": 1.7689, + "step": 14780 + }, + { + "epoch": 4.536832412523021, + "grad_norm": 0.2433948963880539, + "learning_rate": 5.98708803820332e-05, + "loss": 1.7647, + "step": 14781 + }, + { + "epoch": 4.537139349294045, + "grad_norm": 0.2564239799976349, + "learning_rate": 5.986600759659606e-05, + "loss": 1.7958, + "step": 14782 + }, + { + "epoch": 4.53744628606507, + "grad_norm": 0.24009190499782562, + "learning_rate": 5.9861134713657244e-05, + "loss": 1.7511, + "step": 14783 + }, + { + "epoch": 4.537753222836096, + "grad_norm": 0.2578975558280945, + "learning_rate": 5.985626173326491e-05, + "loss": 1.8285, + "step": 14784 + }, + { + "epoch": 4.538060159607121, + "grad_norm": 0.24334335327148438, + "learning_rate": 5.9851388655467225e-05, + "loss": 1.7391, + "step": 14785 + }, + { + "epoch": 4.538367096378146, + "grad_norm": 0.26446983218193054, + "learning_rate": 5.9846515480312335e-05, + "loss": 1.8232, + "step": 14786 + }, + { + "epoch": 4.538674033149171, + "grad_norm": 0.3125670850276947, + 
"learning_rate": 5.9841642207848415e-05, + "loss": 1.7202, + "step": 14787 + }, + { + "epoch": 4.538980969920196, + "grad_norm": 0.2524511218070984, + "learning_rate": 5.983676883812361e-05, + "loss": 1.7653, + "step": 14788 + }, + { + "epoch": 4.5392879066912215, + "grad_norm": 0.3693946897983551, + "learning_rate": 5.98318953711861e-05, + "loss": 1.7457, + "step": 14789 + }, + { + "epoch": 4.539594843462247, + "grad_norm": 0.32625386118888855, + "learning_rate": 5.9827021807084026e-05, + "loss": 1.784, + "step": 14790 + }, + { + "epoch": 4.539901780233272, + "grad_norm": 0.24243168532848358, + "learning_rate": 5.9822148145865574e-05, + "loss": 1.7651, + "step": 14791 + }, + { + "epoch": 4.5402087170042975, + "grad_norm": 0.2950129210948944, + "learning_rate": 5.9817274387578895e-05, + "loss": 1.7316, + "step": 14792 + }, + { + "epoch": 4.540515653775322, + "grad_norm": 0.29455235600471497, + "learning_rate": 5.981240053227216e-05, + "loss": 1.7504, + "step": 14793 + }, + { + "epoch": 4.540822590546347, + "grad_norm": 0.23161925375461578, + "learning_rate": 5.980752657999352e-05, + "loss": 1.7663, + "step": 14794 + }, + { + "epoch": 4.541129527317373, + "grad_norm": 0.2725144922733307, + "learning_rate": 5.980265253079116e-05, + "loss": 1.765, + "step": 14795 + }, + { + "epoch": 4.541436464088398, + "grad_norm": 0.30911222100257874, + "learning_rate": 5.979777838471324e-05, + "loss": 1.7888, + "step": 14796 + }, + { + "epoch": 4.541743400859423, + "grad_norm": 0.2818063497543335, + "learning_rate": 5.979290414180794e-05, + "loss": 1.8047, + "step": 14797 + }, + { + "epoch": 4.542050337630448, + "grad_norm": 0.23335030674934387, + "learning_rate": 5.978802980212341e-05, + "loss": 1.8205, + "step": 14798 + }, + { + "epoch": 4.542357274401473, + "grad_norm": 0.24228201806545258, + "learning_rate": 5.9783155365707855e-05, + "loss": 1.7774, + "step": 14799 + }, + { + "epoch": 4.542664211172498, + "grad_norm": 0.2410847544670105, + "learning_rate": 5.97782808326094e-05, 
+ "loss": 1.6959, + "step": 14800 + }, + { + "epoch": 4.542971147943524, + "grad_norm": 0.24812567234039307, + "learning_rate": 5.9773406202876245e-05, + "loss": 1.8158, + "step": 14801 + }, + { + "epoch": 4.543278084714549, + "grad_norm": 0.2606147229671478, + "learning_rate": 5.9768531476556566e-05, + "loss": 1.7478, + "step": 14802 + }, + { + "epoch": 4.543585021485574, + "grad_norm": 0.24853013455867767, + "learning_rate": 5.976365665369854e-05, + "loss": 1.8158, + "step": 14803 + }, + { + "epoch": 4.543891958256599, + "grad_norm": 0.2320917695760727, + "learning_rate": 5.9758781734350334e-05, + "loss": 1.7812, + "step": 14804 + }, + { + "epoch": 4.544198895027624, + "grad_norm": 0.3460223376750946, + "learning_rate": 5.9753906718560127e-05, + "loss": 1.7562, + "step": 14805 + }, + { + "epoch": 4.5445058317986495, + "grad_norm": 0.2941136658191681, + "learning_rate": 5.9749031606376086e-05, + "loss": 1.7562, + "step": 14806 + }, + { + "epoch": 4.544812768569675, + "grad_norm": 0.2371312975883484, + "learning_rate": 5.9744156397846404e-05, + "loss": 1.7793, + "step": 14807 + }, + { + "epoch": 4.5451197053407, + "grad_norm": 0.2885094881057739, + "learning_rate": 5.973928109301926e-05, + "loss": 1.7564, + "step": 14808 + }, + { + "epoch": 4.545426642111725, + "grad_norm": 0.2369023859500885, + "learning_rate": 5.973440569194284e-05, + "loss": 1.7862, + "step": 14809 + }, + { + "epoch": 4.54573357888275, + "grad_norm": 0.26628994941711426, + "learning_rate": 5.972953019466531e-05, + "loss": 1.7828, + "step": 14810 + }, + { + "epoch": 4.546040515653775, + "grad_norm": 0.3091031610965729, + "learning_rate": 5.9724654601234864e-05, + "loss": 1.7623, + "step": 14811 + }, + { + "epoch": 4.546347452424801, + "grad_norm": 0.24652205407619476, + "learning_rate": 5.971977891169966e-05, + "loss": 1.6982, + "step": 14812 + }, + { + "epoch": 4.546654389195826, + "grad_norm": 0.21779046952724457, + "learning_rate": 5.971490312610793e-05, + "loss": 1.7363, + "step": 14813 + }, 
+ { + "epoch": 4.546961325966851, + "grad_norm": 0.24130751192569733, + "learning_rate": 5.971002724450783e-05, + "loss": 1.7014, + "step": 14814 + }, + { + "epoch": 4.547268262737876, + "grad_norm": 0.21868734061717987, + "learning_rate": 5.9705151266947534e-05, + "loss": 1.7872, + "step": 14815 + }, + { + "epoch": 4.547575199508901, + "grad_norm": 0.257376492023468, + "learning_rate": 5.9700275193475275e-05, + "loss": 1.75, + "step": 14816 + }, + { + "epoch": 4.547882136279926, + "grad_norm": 0.3182791769504547, + "learning_rate": 5.9695399024139174e-05, + "loss": 1.7965, + "step": 14817 + }, + { + "epoch": 4.548189073050952, + "grad_norm": 0.25553280115127563, + "learning_rate": 5.969052275898748e-05, + "loss": 1.8394, + "step": 14818 + }, + { + "epoch": 4.548496009821976, + "grad_norm": 0.2810833752155304, + "learning_rate": 5.9685646398068354e-05, + "loss": 1.704, + "step": 14819 + }, + { + "epoch": 4.5488029465930016, + "grad_norm": 0.21320512890815735, + "learning_rate": 5.9680769941429993e-05, + "loss": 1.7248, + "step": 14820 + }, + { + "epoch": 4.549109883364027, + "grad_norm": 0.3159593939781189, + "learning_rate": 5.96758933891206e-05, + "loss": 1.7885, + "step": 14821 + }, + { + "epoch": 4.549416820135052, + "grad_norm": 0.21894599497318268, + "learning_rate": 5.967101674118834e-05, + "loss": 1.7388, + "step": 14822 + }, + { + "epoch": 4.5497237569060776, + "grad_norm": 0.24804852902889252, + "learning_rate": 5.9666139997681424e-05, + "loss": 1.7631, + "step": 14823 + }, + { + "epoch": 4.550030693677103, + "grad_norm": 0.2678423523902893, + "learning_rate": 5.966126315864806e-05, + "loss": 1.7631, + "step": 14824 + }, + { + "epoch": 4.550337630448127, + "grad_norm": 0.229649156332016, + "learning_rate": 5.9656386224136426e-05, + "loss": 1.7292, + "step": 14825 + }, + { + "epoch": 4.550644567219153, + "grad_norm": 0.25248458981513977, + "learning_rate": 5.965150919419473e-05, + "loss": 1.8, + "step": 14826 + }, + { + "epoch": 4.550951503990178, + 
"grad_norm": 0.2583169937133789, + "learning_rate": 5.964663206887116e-05, + "loss": 1.7641, + "step": 14827 + }, + { + "epoch": 4.551258440761203, + "grad_norm": 0.21465209126472473, + "learning_rate": 5.964175484821392e-05, + "loss": 1.7475, + "step": 14828 + }, + { + "epoch": 4.551565377532229, + "grad_norm": 0.28028783202171326, + "learning_rate": 5.963687753227118e-05, + "loss": 1.7649, + "step": 14829 + }, + { + "epoch": 4.551872314303253, + "grad_norm": 0.30248284339904785, + "learning_rate": 5.9632000121091194e-05, + "loss": 1.6969, + "step": 14830 + }, + { + "epoch": 4.5521792510742785, + "grad_norm": 0.24335962533950806, + "learning_rate": 5.962712261472213e-05, + "loss": 1.7295, + "step": 14831 + }, + { + "epoch": 4.552486187845304, + "grad_norm": 0.21014504134655, + "learning_rate": 5.9622245013212206e-05, + "loss": 1.7508, + "step": 14832 + }, + { + "epoch": 4.552793124616329, + "grad_norm": 0.24892041087150574, + "learning_rate": 5.961736731660963e-05, + "loss": 1.7317, + "step": 14833 + }, + { + "epoch": 4.5531000613873545, + "grad_norm": 0.2159881740808487, + "learning_rate": 5.9612489524962556e-05, + "loss": 1.7114, + "step": 14834 + }, + { + "epoch": 4.55340699815838, + "grad_norm": 0.2952292263507843, + "learning_rate": 5.960761163831925e-05, + "loss": 1.8226, + "step": 14835 + }, + { + "epoch": 4.553713934929404, + "grad_norm": 0.3019000291824341, + "learning_rate": 5.9602733656727895e-05, + "loss": 1.7391, + "step": 14836 + }, + { + "epoch": 4.55402087170043, + "grad_norm": 0.2273966521024704, + "learning_rate": 5.9597855580236696e-05, + "loss": 1.7718, + "step": 14837 + }, + { + "epoch": 4.554327808471455, + "grad_norm": 0.2462005764245987, + "learning_rate": 5.959297740889386e-05, + "loss": 1.8428, + "step": 14838 + }, + { + "epoch": 4.55463474524248, + "grad_norm": 0.2773323059082031, + "learning_rate": 5.95880991427476e-05, + "loss": 1.6878, + "step": 14839 + }, + { + "epoch": 4.554941682013506, + "grad_norm": 0.26519861817359924, + 
"learning_rate": 5.958322078184611e-05, + "loss": 1.737, + "step": 14840 + }, + { + "epoch": 4.55524861878453, + "grad_norm": 0.20157647132873535, + "learning_rate": 5.9578342326237626e-05, + "loss": 1.7164, + "step": 14841 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.21715669333934784, + "learning_rate": 5.957346377597035e-05, + "loss": 1.705, + "step": 14842 + }, + { + "epoch": 4.555862492326581, + "grad_norm": 0.3056442439556122, + "learning_rate": 5.95685851310925e-05, + "loss": 1.7672, + "step": 14843 + }, + { + "epoch": 4.556169429097606, + "grad_norm": 0.24832262098789215, + "learning_rate": 5.956370639165228e-05, + "loss": 1.7305, + "step": 14844 + }, + { + "epoch": 4.556476365868631, + "grad_norm": 0.25814661383628845, + "learning_rate": 5.955882755769791e-05, + "loss": 1.7562, + "step": 14845 + }, + { + "epoch": 4.556783302639657, + "grad_norm": 0.38242629170417786, + "learning_rate": 5.95539486292776e-05, + "loss": 1.7077, + "step": 14846 + }, + { + "epoch": 4.557090239410681, + "grad_norm": 0.2901807427406311, + "learning_rate": 5.954906960643956e-05, + "loss": 1.7233, + "step": 14847 + }, + { + "epoch": 4.5573971761817065, + "grad_norm": 0.22636106610298157, + "learning_rate": 5.954419048923202e-05, + "loss": 1.777, + "step": 14848 + }, + { + "epoch": 4.557704112952732, + "grad_norm": 0.32392850518226624, + "learning_rate": 5.953931127770321e-05, + "loss": 1.7477, + "step": 14849 + }, + { + "epoch": 4.558011049723757, + "grad_norm": 0.3403460681438446, + "learning_rate": 5.953443197190134e-05, + "loss": 1.7712, + "step": 14850 + }, + { + "epoch": 4.558317986494782, + "grad_norm": 0.22923234105110168, + "learning_rate": 5.95295525718746e-05, + "loss": 1.8154, + "step": 14851 + }, + { + "epoch": 4.558624923265807, + "grad_norm": 0.25152841210365295, + "learning_rate": 5.952467307767124e-05, + "loss": 1.7091, + "step": 14852 + }, + { + "epoch": 4.558931860036832, + "grad_norm": 0.27743563055992126, + "learning_rate": 5.951979348933949e-05, + 
"loss": 1.7621, + "step": 14853 + }, + { + "epoch": 4.559238796807858, + "grad_norm": 0.25809308886528015, + "learning_rate": 5.951491380692756e-05, + "loss": 1.7669, + "step": 14854 + }, + { + "epoch": 4.559545733578883, + "grad_norm": 0.24863946437835693, + "learning_rate": 5.9510034030483676e-05, + "loss": 1.7354, + "step": 14855 + }, + { + "epoch": 4.559852670349908, + "grad_norm": 0.2896040380001068, + "learning_rate": 5.9505154160056066e-05, + "loss": 1.7878, + "step": 14856 + }, + { + "epoch": 4.560159607120933, + "grad_norm": 0.23814482986927032, + "learning_rate": 5.950027419569294e-05, + "loss": 1.7781, + "step": 14857 + }, + { + "epoch": 4.560466543891958, + "grad_norm": 0.2531175911426544, + "learning_rate": 5.949539413744253e-05, + "loss": 1.762, + "step": 14858 + }, + { + "epoch": 4.560773480662983, + "grad_norm": 0.2541767656803131, + "learning_rate": 5.949051398535308e-05, + "loss": 1.7722, + "step": 14859 + }, + { + "epoch": 4.561080417434009, + "grad_norm": 0.25216221809387207, + "learning_rate": 5.948563373947281e-05, + "loss": 1.754, + "step": 14860 + }, + { + "epoch": 4.561387354205034, + "grad_norm": 0.24421775341033936, + "learning_rate": 5.948075339984994e-05, + "loss": 1.7976, + "step": 14861 + }, + { + "epoch": 4.5616942909760585, + "grad_norm": 0.24435418844223022, + "learning_rate": 5.947587296653272e-05, + "loss": 1.79, + "step": 14862 + }, + { + "epoch": 4.562001227747084, + "grad_norm": 0.24471627175807953, + "learning_rate": 5.947099243956936e-05, + "loss": 1.755, + "step": 14863 + }, + { + "epoch": 4.562308164518109, + "grad_norm": 0.2762158215045929, + "learning_rate": 5.9466111819008096e-05, + "loss": 1.7695, + "step": 14864 + }, + { + "epoch": 4.5626151012891345, + "grad_norm": 0.23841319978237152, + "learning_rate": 5.9461231104897174e-05, + "loss": 1.7302, + "step": 14865 + }, + { + "epoch": 4.56292203806016, + "grad_norm": 0.260231077671051, + "learning_rate": 5.9456350297284826e-05, + "loss": 1.7917, + "step": 14866 + }, + { 
+ "epoch": 4.563228974831185, + "grad_norm": 0.2752247452735901, + "learning_rate": 5.945146939621929e-05, + "loss": 1.7953, + "step": 14867 + }, + { + "epoch": 4.56353591160221, + "grad_norm": 0.28760650753974915, + "learning_rate": 5.944658840174878e-05, + "loss": 1.8582, + "step": 14868 + }, + { + "epoch": 4.563842848373235, + "grad_norm": 0.24311676621437073, + "learning_rate": 5.944170731392153e-05, + "loss": 1.8006, + "step": 14869 + }, + { + "epoch": 4.56414978514426, + "grad_norm": 0.2692974805831909, + "learning_rate": 5.943682613278583e-05, + "loss": 1.6984, + "step": 14870 + }, + { + "epoch": 4.564456721915286, + "grad_norm": 0.2784348726272583, + "learning_rate": 5.943194485838985e-05, + "loss": 1.8082, + "step": 14871 + }, + { + "epoch": 4.564763658686311, + "grad_norm": 0.2557264268398285, + "learning_rate": 5.9427063490781885e-05, + "loss": 1.7715, + "step": 14872 + }, + { + "epoch": 4.565070595457335, + "grad_norm": 0.3738742470741272, + "learning_rate": 5.942218203001015e-05, + "loss": 1.7549, + "step": 14873 + }, + { + "epoch": 4.565377532228361, + "grad_norm": 0.2424495816230774, + "learning_rate": 5.941730047612288e-05, + "loss": 1.7388, + "step": 14874 + }, + { + "epoch": 4.565684468999386, + "grad_norm": 0.27020737528800964, + "learning_rate": 5.941241882916833e-05, + "loss": 1.752, + "step": 14875 + }, + { + "epoch": 4.565991405770411, + "grad_norm": 0.3763764798641205, + "learning_rate": 5.940753708919474e-05, + "loss": 1.7918, + "step": 14876 + }, + { + "epoch": 4.566298342541437, + "grad_norm": 0.26782163977622986, + "learning_rate": 5.940265525625036e-05, + "loss": 1.7244, + "step": 14877 + }, + { + "epoch": 4.566605279312462, + "grad_norm": 0.24978911876678467, + "learning_rate": 5.9397773330383434e-05, + "loss": 1.7706, + "step": 14878 + }, + { + "epoch": 4.5669122160834865, + "grad_norm": 0.32905304431915283, + "learning_rate": 5.93928913116422e-05, + "loss": 1.7381, + "step": 14879 + }, + { + "epoch": 4.567219152854512, + "grad_norm": 
0.2196444720029831, + "learning_rate": 5.93880092000749e-05, + "loss": 1.7605, + "step": 14880 + }, + { + "epoch": 4.567526089625537, + "grad_norm": 0.3156622350215912, + "learning_rate": 5.9383126995729786e-05, + "loss": 1.9181, + "step": 14881 + }, + { + "epoch": 4.5678330263965625, + "grad_norm": 0.2895203232765198, + "learning_rate": 5.937824469865513e-05, + "loss": 1.7967, + "step": 14882 + }, + { + "epoch": 4.568139963167588, + "grad_norm": 0.24854810535907745, + "learning_rate": 5.937336230889916e-05, + "loss": 1.7332, + "step": 14883 + }, + { + "epoch": 4.568446899938612, + "grad_norm": 0.3417081832885742, + "learning_rate": 5.936847982651013e-05, + "loss": 1.7525, + "step": 14884 + }, + { + "epoch": 4.568753836709638, + "grad_norm": 0.2874949276447296, + "learning_rate": 5.936359725153629e-05, + "loss": 1.7659, + "step": 14885 + }, + { + "epoch": 4.569060773480663, + "grad_norm": 0.25031307339668274, + "learning_rate": 5.935871458402588e-05, + "loss": 1.8061, + "step": 14886 + }, + { + "epoch": 4.569367710251688, + "grad_norm": 0.27047309279441833, + "learning_rate": 5.935383182402717e-05, + "loss": 1.7318, + "step": 14887 + }, + { + "epoch": 4.569674647022714, + "grad_norm": 0.2642819881439209, + "learning_rate": 5.9348948971588425e-05, + "loss": 1.849, + "step": 14888 + }, + { + "epoch": 4.569981583793739, + "grad_norm": 0.2452307790517807, + "learning_rate": 5.9344066026757886e-05, + "loss": 1.7491, + "step": 14889 + }, + { + "epoch": 4.570288520564763, + "grad_norm": 0.24055036902427673, + "learning_rate": 5.9339182989583795e-05, + "loss": 1.7573, + "step": 14890 + }, + { + "epoch": 4.570595457335789, + "grad_norm": 0.23036183416843414, + "learning_rate": 5.933429986011444e-05, + "loss": 1.7841, + "step": 14891 + }, + { + "epoch": 4.570902394106814, + "grad_norm": 0.27987608313560486, + "learning_rate": 5.932941663839805e-05, + "loss": 1.7835, + "step": 14892 + }, + { + "epoch": 4.571209330877839, + "grad_norm": 0.31747013330459595, + "learning_rate": 
5.93245333244829e-05, + "loss": 1.7905, + "step": 14893 + }, + { + "epoch": 4.571516267648864, + "grad_norm": 0.24841344356536865, + "learning_rate": 5.931964991841725e-05, + "loss": 1.8003, + "step": 14894 + }, + { + "epoch": 4.571823204419889, + "grad_norm": 0.2416950911283493, + "learning_rate": 5.9314766420249356e-05, + "loss": 1.7787, + "step": 14895 + }, + { + "epoch": 4.5721301411909145, + "grad_norm": 0.2322494238615036, + "learning_rate": 5.930988283002748e-05, + "loss": 1.8153, + "step": 14896 + }, + { + "epoch": 4.57243707796194, + "grad_norm": 0.22629016637802124, + "learning_rate": 5.930499914779989e-05, + "loss": 1.6743, + "step": 14897 + }, + { + "epoch": 4.572744014732965, + "grad_norm": 0.21481508016586304, + "learning_rate": 5.930011537361483e-05, + "loss": 1.7301, + "step": 14898 + }, + { + "epoch": 4.5730509515039905, + "grad_norm": 0.1993340700864792, + "learning_rate": 5.9295231507520586e-05, + "loss": 1.6796, + "step": 14899 + }, + { + "epoch": 4.573357888275015, + "grad_norm": 0.21681822836399078, + "learning_rate": 5.929034754956543e-05, + "loss": 1.7333, + "step": 14900 + }, + { + "epoch": 4.57366482504604, + "grad_norm": 0.23105305433273315, + "learning_rate": 5.928546349979761e-05, + "loss": 1.8207, + "step": 14901 + }, + { + "epoch": 4.573971761817066, + "grad_norm": 0.24656468629837036, + "learning_rate": 5.9280579358265384e-05, + "loss": 1.7805, + "step": 14902 + }, + { + "epoch": 4.574278698588091, + "grad_norm": 0.28564780950546265, + "learning_rate": 5.927569512501704e-05, + "loss": 1.7224, + "step": 14903 + }, + { + "epoch": 4.574585635359116, + "grad_norm": 0.26030251383781433, + "learning_rate": 5.927081080010084e-05, + "loss": 1.7417, + "step": 14904 + }, + { + "epoch": 4.574892572130141, + "grad_norm": 0.21427087485790253, + "learning_rate": 5.926592638356505e-05, + "loss": 1.7239, + "step": 14905 + }, + { + "epoch": 4.575199508901166, + "grad_norm": 0.2351662665605545, + "learning_rate": 5.9261041875457956e-05, + "loss": 
1.7711, + "step": 14906 + }, + { + "epoch": 4.5755064456721914, + "grad_norm": 0.27335020899772644, + "learning_rate": 5.925615727582781e-05, + "loss": 1.7496, + "step": 14907 + }, + { + "epoch": 4.575813382443217, + "grad_norm": 0.27849945425987244, + "learning_rate": 5.925127258472289e-05, + "loss": 1.7576, + "step": 14908 + }, + { + "epoch": 4.576120319214242, + "grad_norm": 0.27859339118003845, + "learning_rate": 5.924638780219147e-05, + "loss": 1.8076, + "step": 14909 + }, + { + "epoch": 4.5764272559852675, + "grad_norm": 0.24664369225502014, + "learning_rate": 5.9241502928281836e-05, + "loss": 1.7657, + "step": 14910 + }, + { + "epoch": 4.576734192756292, + "grad_norm": 0.29881149530410767, + "learning_rate": 5.923661796304224e-05, + "loss": 1.7611, + "step": 14911 + }, + { + "epoch": 4.577041129527317, + "grad_norm": 0.2672356367111206, + "learning_rate": 5.9231732906520984e-05, + "loss": 1.7605, + "step": 14912 + }, + { + "epoch": 4.577348066298343, + "grad_norm": 0.24282832443714142, + "learning_rate": 5.9226847758766336e-05, + "loss": 1.7037, + "step": 14913 + }, + { + "epoch": 4.577655003069368, + "grad_norm": 0.3822915852069855, + "learning_rate": 5.922196251982656e-05, + "loss": 1.7609, + "step": 14914 + }, + { + "epoch": 4.577961939840393, + "grad_norm": 0.30721214413642883, + "learning_rate": 5.921707718974994e-05, + "loss": 1.7398, + "step": 14915 + }, + { + "epoch": 4.578268876611418, + "grad_norm": 0.235477477312088, + "learning_rate": 5.921219176858477e-05, + "loss": 1.6869, + "step": 14916 + }, + { + "epoch": 4.578575813382443, + "grad_norm": 0.3752216100692749, + "learning_rate": 5.920730625637934e-05, + "loss": 1.7296, + "step": 14917 + }, + { + "epoch": 4.578882750153468, + "grad_norm": 0.36901310086250305, + "learning_rate": 5.920242065318189e-05, + "loss": 1.7405, + "step": 14918 + }, + { + "epoch": 4.579189686924494, + "grad_norm": 0.2308608740568161, + "learning_rate": 5.9197534959040725e-05, + "loss": 1.7953, + "step": 14919 + }, + { + 
"epoch": 4.579496623695519, + "grad_norm": 0.3286738991737366, + "learning_rate": 5.919264917400412e-05, + "loss": 1.7669, + "step": 14920 + }, + { + "epoch": 4.579803560466544, + "grad_norm": 0.3944021165370941, + "learning_rate": 5.918776329812039e-05, + "loss": 1.7165, + "step": 14921 + }, + { + "epoch": 4.580110497237569, + "grad_norm": 0.22054845094680786, + "learning_rate": 5.9182877331437795e-05, + "loss": 1.7739, + "step": 14922 + }, + { + "epoch": 4.580417434008594, + "grad_norm": 0.3467540740966797, + "learning_rate": 5.9177991274004605e-05, + "loss": 1.7713, + "step": 14923 + }, + { + "epoch": 4.5807243707796195, + "grad_norm": 0.4313695728778839, + "learning_rate": 5.917310512586914e-05, + "loss": 1.7654, + "step": 14924 + }, + { + "epoch": 4.581031307550645, + "grad_norm": 0.2723502814769745, + "learning_rate": 5.9168218887079685e-05, + "loss": 1.7314, + "step": 14925 + }, + { + "epoch": 4.581338244321669, + "grad_norm": 0.2641250789165497, + "learning_rate": 5.9163332557684504e-05, + "loss": 1.7303, + "step": 14926 + }, + { + "epoch": 4.581645181092695, + "grad_norm": 0.3780760169029236, + "learning_rate": 5.915844613773189e-05, + "loss": 1.7748, + "step": 14927 + }, + { + "epoch": 4.58195211786372, + "grad_norm": 0.23379632830619812, + "learning_rate": 5.915355962727015e-05, + "loss": 1.7482, + "step": 14928 + }, + { + "epoch": 4.582259054634745, + "grad_norm": 0.35227084159851074, + "learning_rate": 5.914867302634758e-05, + "loss": 1.8198, + "step": 14929 + }, + { + "epoch": 4.582565991405771, + "grad_norm": 0.34348124265670776, + "learning_rate": 5.914378633501245e-05, + "loss": 1.8364, + "step": 14930 + }, + { + "epoch": 4.582872928176796, + "grad_norm": 0.2446804940700531, + "learning_rate": 5.9138899553313066e-05, + "loss": 1.7779, + "step": 14931 + }, + { + "epoch": 4.58317986494782, + "grad_norm": 0.23893557488918304, + "learning_rate": 5.913401268129772e-05, + "loss": 1.7582, + "step": 14932 + }, + { + "epoch": 4.583486801718846, + 
"grad_norm": 0.3046814203262329, + "learning_rate": 5.912912571901471e-05, + "loss": 1.6871, + "step": 14933 + }, + { + "epoch": 4.583793738489871, + "grad_norm": 0.2232733964920044, + "learning_rate": 5.912423866651233e-05, + "loss": 1.7269, + "step": 14934 + }, + { + "epoch": 4.584100675260896, + "grad_norm": 0.18664126098155975, + "learning_rate": 5.911935152383888e-05, + "loss": 1.7155, + "step": 14935 + }, + { + "epoch": 4.584407612031922, + "grad_norm": 0.2573263347148895, + "learning_rate": 5.911446429104265e-05, + "loss": 1.7901, + "step": 14936 + }, + { + "epoch": 4.584714548802946, + "grad_norm": 0.2382393181324005, + "learning_rate": 5.910957696817194e-05, + "loss": 1.7407, + "step": 14937 + }, + { + "epoch": 4.5850214855739715, + "grad_norm": 0.28363972902297974, + "learning_rate": 5.910468955527504e-05, + "loss": 1.7971, + "step": 14938 + }, + { + "epoch": 4.585328422344997, + "grad_norm": 0.3173120617866516, + "learning_rate": 5.909980205240027e-05, + "loss": 1.744, + "step": 14939 + }, + { + "epoch": 4.585635359116022, + "grad_norm": 0.2281302511692047, + "learning_rate": 5.909491445959592e-05, + "loss": 1.6976, + "step": 14940 + }, + { + "epoch": 4.5859422958870475, + "grad_norm": 0.24962912499904633, + "learning_rate": 5.9090026776910304e-05, + "loss": 1.7979, + "step": 14941 + }, + { + "epoch": 4.586249232658073, + "grad_norm": 0.22330854833126068, + "learning_rate": 5.908513900439171e-05, + "loss": 1.7854, + "step": 14942 + }, + { + "epoch": 4.586556169429097, + "grad_norm": 0.20861582458019257, + "learning_rate": 5.908025114208845e-05, + "loss": 1.7133, + "step": 14943 + }, + { + "epoch": 4.586863106200123, + "grad_norm": 0.21838510036468506, + "learning_rate": 5.90753631900488e-05, + "loss": 1.6919, + "step": 14944 + }, + { + "epoch": 4.587170042971148, + "grad_norm": 0.252798467874527, + "learning_rate": 5.907047514832112e-05, + "loss": 1.838, + "step": 14945 + }, + { + "epoch": 4.587476979742173, + "grad_norm": 0.326893150806427, + 
"learning_rate": 5.906558701695369e-05, + "loss": 1.7303, + "step": 14946 + }, + { + "epoch": 4.587783916513199, + "grad_norm": 0.36489585041999817, + "learning_rate": 5.9060698795994804e-05, + "loss": 1.7631, + "step": 14947 + }, + { + "epoch": 4.588090853284223, + "grad_norm": 0.27491649985313416, + "learning_rate": 5.905581048549279e-05, + "loss": 1.7773, + "step": 14948 + }, + { + "epoch": 4.588397790055248, + "grad_norm": 0.2334890067577362, + "learning_rate": 5.905092208549595e-05, + "loss": 1.7254, + "step": 14949 + }, + { + "epoch": 4.588704726826274, + "grad_norm": 0.24383895099163055, + "learning_rate": 5.904603359605257e-05, + "loss": 1.7496, + "step": 14950 + }, + { + "epoch": 4.589011663597299, + "grad_norm": 0.2144637256860733, + "learning_rate": 5.904114501721102e-05, + "loss": 1.7028, + "step": 14951 + }, + { + "epoch": 4.589318600368324, + "grad_norm": 0.19675977528095245, + "learning_rate": 5.9036256349019555e-05, + "loss": 1.7548, + "step": 14952 + }, + { + "epoch": 4.58962553713935, + "grad_norm": 0.23712843656539917, + "learning_rate": 5.903136759152652e-05, + "loss": 1.7722, + "step": 14953 + }, + { + "epoch": 4.589932473910374, + "grad_norm": 0.20307733118534088, + "learning_rate": 5.902647874478021e-05, + "loss": 1.7177, + "step": 14954 + }, + { + "epoch": 4.5902394106813995, + "grad_norm": 0.21767669916152954, + "learning_rate": 5.9021589808828936e-05, + "loss": 1.7963, + "step": 14955 + }, + { + "epoch": 4.590546347452425, + "grad_norm": 0.2056351602077484, + "learning_rate": 5.9016700783721036e-05, + "loss": 1.7439, + "step": 14956 + }, + { + "epoch": 4.59085328422345, + "grad_norm": 0.20480911433696747, + "learning_rate": 5.90118116695048e-05, + "loss": 1.7122, + "step": 14957 + }, + { + "epoch": 4.5911602209944755, + "grad_norm": 0.24091731011867523, + "learning_rate": 5.900692246622858e-05, + "loss": 1.7862, + "step": 14958 + }, + { + "epoch": 4.5914671577655, + "grad_norm": 0.20246434211730957, + "learning_rate": 
5.900203317394066e-05, + "loss": 1.6895, + "step": 14959 + }, + { + "epoch": 4.591774094536525, + "grad_norm": 0.23771630227565765, + "learning_rate": 5.899714379268938e-05, + "loss": 1.7794, + "step": 14960 + }, + { + "epoch": 4.592081031307551, + "grad_norm": 0.2638718783855438, + "learning_rate": 5.899225432252303e-05, + "loss": 1.8059, + "step": 14961 + }, + { + "epoch": 4.592387968078576, + "grad_norm": 0.24251408874988556, + "learning_rate": 5.898736476348997e-05, + "loss": 1.8063, + "step": 14962 + }, + { + "epoch": 4.592694904849601, + "grad_norm": 0.2487735152244568, + "learning_rate": 5.8982475115638515e-05, + "loss": 1.7615, + "step": 14963 + }, + { + "epoch": 4.593001841620627, + "grad_norm": 0.23507241904735565, + "learning_rate": 5.897758537901696e-05, + "loss": 1.7496, + "step": 14964 + }, + { + "epoch": 4.593308778391651, + "grad_norm": 0.22354768216609955, + "learning_rate": 5.897269555367365e-05, + "loss": 1.7293, + "step": 14965 + }, + { + "epoch": 4.593615715162676, + "grad_norm": 0.2711353003978729, + "learning_rate": 5.89678056396569e-05, + "loss": 1.8127, + "step": 14966 + }, + { + "epoch": 4.593922651933702, + "grad_norm": 0.30061110854148865, + "learning_rate": 5.8962915637015036e-05, + "loss": 1.7653, + "step": 14967 + }, + { + "epoch": 4.594229588704727, + "grad_norm": 0.24577318131923676, + "learning_rate": 5.895802554579639e-05, + "loss": 1.7888, + "step": 14968 + }, + { + "epoch": 4.5945365254757515, + "grad_norm": 0.25568944215774536, + "learning_rate": 5.895313536604929e-05, + "loss": 1.7912, + "step": 14969 + }, + { + "epoch": 4.594843462246777, + "grad_norm": 0.2710168957710266, + "learning_rate": 5.894824509782206e-05, + "loss": 1.7681, + "step": 14970 + }, + { + "epoch": 4.595150399017802, + "grad_norm": 0.24056777358055115, + "learning_rate": 5.894335474116303e-05, + "loss": 1.7729, + "step": 14971 + }, + { + "epoch": 4.5954573357888275, + "grad_norm": 0.21956710517406464, + "learning_rate": 5.89384642961205e-05, + "loss": 
1.7576, + "step": 14972 + }, + { + "epoch": 4.595764272559853, + "grad_norm": 0.27499106526374817, + "learning_rate": 5.893357376274284e-05, + "loss": 1.7909, + "step": 14973 + }, + { + "epoch": 4.596071209330878, + "grad_norm": 0.28581273555755615, + "learning_rate": 5.8928683141078376e-05, + "loss": 1.7592, + "step": 14974 + }, + { + "epoch": 4.596378146101903, + "grad_norm": 0.23218442499637604, + "learning_rate": 5.892379243117543e-05, + "loss": 1.7142, + "step": 14975 + }, + { + "epoch": 4.596685082872928, + "grad_norm": 0.34015771746635437, + "learning_rate": 5.891890163308234e-05, + "loss": 1.7457, + "step": 14976 + }, + { + "epoch": 4.596992019643953, + "grad_norm": 0.2630012333393097, + "learning_rate": 5.8914010746847435e-05, + "loss": 1.7612, + "step": 14977 + }, + { + "epoch": 4.597298956414979, + "grad_norm": 0.2265843003988266, + "learning_rate": 5.890911977251904e-05, + "loss": 1.7272, + "step": 14978 + }, + { + "epoch": 4.597605893186004, + "grad_norm": 0.22325244545936584, + "learning_rate": 5.8904228710145505e-05, + "loss": 1.7447, + "step": 14979 + }, + { + "epoch": 4.597912829957028, + "grad_norm": 0.23512716591358185, + "learning_rate": 5.889933755977517e-05, + "loss": 1.7123, + "step": 14980 + }, + { + "epoch": 4.598219766728054, + "grad_norm": 0.22534869611263275, + "learning_rate": 5.8894446321456365e-05, + "loss": 1.785, + "step": 14981 + }, + { + "epoch": 4.598526703499079, + "grad_norm": 0.2447836697101593, + "learning_rate": 5.888955499523743e-05, + "loss": 1.7154, + "step": 14982 + }, + { + "epoch": 4.598833640270104, + "grad_norm": 0.2451140582561493, + "learning_rate": 5.88846635811667e-05, + "loss": 1.7494, + "step": 14983 + }, + { + "epoch": 4.59914057704113, + "grad_norm": 0.2253585308790207, + "learning_rate": 5.8879772079292504e-05, + "loss": 1.7591, + "step": 14984 + }, + { + "epoch": 4.599447513812155, + "grad_norm": 0.21714572608470917, + "learning_rate": 5.887488048966322e-05, + "loss": 1.7314, + "step": 14985 + }, + { + 
"epoch": 4.5997544505831796, + "grad_norm": 0.24897411465644836, + "learning_rate": 5.8869988812327145e-05, + "loss": 1.776, + "step": 14986 + }, + { + "epoch": 4.600061387354205, + "grad_norm": 0.22575093805789948, + "learning_rate": 5.8865097047332653e-05, + "loss": 1.7168, + "step": 14987 + }, + { + "epoch": 4.60036832412523, + "grad_norm": 0.22857412695884705, + "learning_rate": 5.886020519472808e-05, + "loss": 1.8262, + "step": 14988 + }, + { + "epoch": 4.600675260896256, + "grad_norm": 0.22741298377513885, + "learning_rate": 5.885531325456174e-05, + "loss": 1.6732, + "step": 14989 + }, + { + "epoch": 4.600982197667281, + "grad_norm": 0.2229645550251007, + "learning_rate": 5.885042122688202e-05, + "loss": 1.7384, + "step": 14990 + }, + { + "epoch": 4.601289134438305, + "grad_norm": 0.22609494626522064, + "learning_rate": 5.884552911173726e-05, + "loss": 1.714, + "step": 14991 + }, + { + "epoch": 4.601596071209331, + "grad_norm": 0.2629149854183197, + "learning_rate": 5.884063690917578e-05, + "loss": 1.8133, + "step": 14992 + }, + { + "epoch": 4.601903007980356, + "grad_norm": 0.220725417137146, + "learning_rate": 5.883574461924597e-05, + "loss": 1.6898, + "step": 14993 + }, + { + "epoch": 4.602209944751381, + "grad_norm": 0.207612082362175, + "learning_rate": 5.8830852241996135e-05, + "loss": 1.7302, + "step": 14994 + }, + { + "epoch": 4.602516881522407, + "grad_norm": 0.22418084740638733, + "learning_rate": 5.8825959777474625e-05, + "loss": 1.763, + "step": 14995 + }, + { + "epoch": 4.602823818293432, + "grad_norm": 0.30606865882873535, + "learning_rate": 5.882106722572983e-05, + "loss": 1.7657, + "step": 14996 + }, + { + "epoch": 4.6031307550644565, + "grad_norm": 0.2947966456413269, + "learning_rate": 5.881617458681008e-05, + "loss": 1.7796, + "step": 14997 + }, + { + "epoch": 4.603437691835482, + "grad_norm": 0.23430216312408447, + "learning_rate": 5.881128186076372e-05, + "loss": 1.78, + "step": 14998 + }, + { + "epoch": 4.603744628606507, + "grad_norm": 
0.28081849217414856, + "learning_rate": 5.880638904763911e-05, + "loss": 1.6791, + "step": 14999 + }, + { + "epoch": 4.6040515653775325, + "grad_norm": 0.25459226965904236, + "learning_rate": 5.88014961474846e-05, + "loss": 1.8064, + "step": 15000 + }, + { + "epoch": 4.604358502148557, + "grad_norm": 0.2358713001012802, + "learning_rate": 5.879660316034854e-05, + "loss": 1.763, + "step": 15001 + }, + { + "epoch": 4.604665438919582, + "grad_norm": 0.32954758405685425, + "learning_rate": 5.879171008627931e-05, + "loss": 1.7462, + "step": 15002 + }, + { + "epoch": 4.604972375690608, + "grad_norm": 0.2588615417480469, + "learning_rate": 5.878681692532523e-05, + "loss": 1.7771, + "step": 15003 + }, + { + "epoch": 4.605279312461633, + "grad_norm": 0.21216195821762085, + "learning_rate": 5.878192367753468e-05, + "loss": 1.7128, + "step": 15004 + }, + { + "epoch": 4.605586249232658, + "grad_norm": 0.26849040389060974, + "learning_rate": 5.8777030342956016e-05, + "loss": 1.7048, + "step": 15005 + }, + { + "epoch": 4.605893186003684, + "grad_norm": 0.22343295812606812, + "learning_rate": 5.877213692163759e-05, + "loss": 1.7695, + "step": 15006 + }, + { + "epoch": 4.606200122774708, + "grad_norm": 0.2794288694858551, + "learning_rate": 5.876724341362776e-05, + "loss": 1.7856, + "step": 15007 + }, + { + "epoch": 4.606507059545733, + "grad_norm": 0.3525427579879761, + "learning_rate": 5.8762349818974905e-05, + "loss": 1.7807, + "step": 15008 + }, + { + "epoch": 4.606813996316759, + "grad_norm": 0.25886499881744385, + "learning_rate": 5.875745613772736e-05, + "loss": 1.7818, + "step": 15009 + }, + { + "epoch": 4.607120933087784, + "grad_norm": 0.24822987616062164, + "learning_rate": 5.8752562369933515e-05, + "loss": 1.7369, + "step": 15010 + }, + { + "epoch": 4.607427869858809, + "grad_norm": 0.26067355275154114, + "learning_rate": 5.874766851564171e-05, + "loss": 1.7056, + "step": 15011 + }, + { + "epoch": 4.607734806629834, + "grad_norm": 0.2869747579097748, + "learning_rate": 
5.874277457490033e-05, + "loss": 1.7284, + "step": 15012 + }, + { + "epoch": 4.608041743400859, + "grad_norm": 0.23153580725193024, + "learning_rate": 5.87378805477577e-05, + "loss": 1.7331, + "step": 15013 + }, + { + "epoch": 4.6083486801718845, + "grad_norm": 0.29307299852371216, + "learning_rate": 5.873298643426223e-05, + "loss": 1.7376, + "step": 15014 + }, + { + "epoch": 4.60865561694291, + "grad_norm": 0.25638771057128906, + "learning_rate": 5.872809223446227e-05, + "loss": 1.7585, + "step": 15015 + }, + { + "epoch": 4.608962553713935, + "grad_norm": 0.2272702306509018, + "learning_rate": 5.872319794840618e-05, + "loss": 1.7482, + "step": 15016 + }, + { + "epoch": 4.6092694904849605, + "grad_norm": 0.2579486072063446, + "learning_rate": 5.8718303576142356e-05, + "loss": 1.778, + "step": 15017 + }, + { + "epoch": 4.609576427255985, + "grad_norm": 0.2216452956199646, + "learning_rate": 5.871340911771912e-05, + "loss": 1.7517, + "step": 15018 + }, + { + "epoch": 4.60988336402701, + "grad_norm": 0.22628961503505707, + "learning_rate": 5.870851457318488e-05, + "loss": 1.7579, + "step": 15019 + }, + { + "epoch": 4.610190300798036, + "grad_norm": 0.31018149852752686, + "learning_rate": 5.8703619942588e-05, + "loss": 1.7911, + "step": 15020 + }, + { + "epoch": 4.610497237569061, + "grad_norm": 0.2618122100830078, + "learning_rate": 5.869872522597683e-05, + "loss": 1.8121, + "step": 15021 + }, + { + "epoch": 4.610804174340086, + "grad_norm": 0.26085740327835083, + "learning_rate": 5.869383042339978e-05, + "loss": 1.7952, + "step": 15022 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.25237780809402466, + "learning_rate": 5.86889355349052e-05, + "loss": 1.7575, + "step": 15023 + }, + { + "epoch": 4.611418047882136, + "grad_norm": 0.27550897002220154, + "learning_rate": 5.868404056054144e-05, + "loss": 1.7816, + "step": 15024 + }, + { + "epoch": 4.611724984653161, + "grad_norm": 0.2458692342042923, + "learning_rate": 5.8679145500356926e-05, + "loss": 1.7783, + 
"step": 15025 + }, + { + "epoch": 4.612031921424187, + "grad_norm": 0.25606176257133484, + "learning_rate": 5.867425035439999e-05, + "loss": 1.7863, + "step": 15026 + }, + { + "epoch": 4.612338858195212, + "grad_norm": 0.3206995725631714, + "learning_rate": 5.866935512271905e-05, + "loss": 1.7468, + "step": 15027 + }, + { + "epoch": 4.612645794966237, + "grad_norm": 0.2754824459552765, + "learning_rate": 5.866445980536245e-05, + "loss": 1.793, + "step": 15028 + }, + { + "epoch": 4.612952731737262, + "grad_norm": 0.25168612599372864, + "learning_rate": 5.865956440237859e-05, + "loss": 1.7252, + "step": 15029 + }, + { + "epoch": 4.613259668508287, + "grad_norm": 0.3226735293865204, + "learning_rate": 5.8654668913815815e-05, + "loss": 1.7291, + "step": 15030 + }, + { + "epoch": 4.6135666052793125, + "grad_norm": 0.2580295503139496, + "learning_rate": 5.864977333972255e-05, + "loss": 1.7622, + "step": 15031 + }, + { + "epoch": 4.613873542050338, + "grad_norm": 0.21486075222492218, + "learning_rate": 5.864487768014715e-05, + "loss": 1.7662, + "step": 15032 + }, + { + "epoch": 4.614180478821363, + "grad_norm": 0.2331690639257431, + "learning_rate": 5.8639981935137996e-05, + "loss": 1.7389, + "step": 15033 + }, + { + "epoch": 4.614487415592388, + "grad_norm": 0.2573511302471161, + "learning_rate": 5.863508610474348e-05, + "loss": 1.7699, + "step": 15034 + }, + { + "epoch": 4.614794352363413, + "grad_norm": 0.2260694056749344, + "learning_rate": 5.863019018901199e-05, + "loss": 1.7784, + "step": 15035 + }, + { + "epoch": 4.615101289134438, + "grad_norm": 0.2283065915107727, + "learning_rate": 5.8625294187991895e-05, + "loss": 1.7061, + "step": 15036 + }, + { + "epoch": 4.615408225905464, + "grad_norm": 0.24772310256958008, + "learning_rate": 5.862039810173159e-05, + "loss": 1.7568, + "step": 15037 + }, + { + "epoch": 4.615715162676489, + "grad_norm": 0.2515513002872467, + "learning_rate": 5.861550193027945e-05, + "loss": 1.7445, + "step": 15038 + }, + { + "epoch": 
4.616022099447514, + "grad_norm": 0.26472151279449463, + "learning_rate": 5.8610605673683885e-05, + "loss": 1.7735, + "step": 15039 + }, + { + "epoch": 4.616329036218539, + "grad_norm": 0.24053528904914856, + "learning_rate": 5.8605709331993254e-05, + "loss": 1.8009, + "step": 15040 + }, + { + "epoch": 4.616635972989564, + "grad_norm": 0.25125381350517273, + "learning_rate": 5.860081290525596e-05, + "loss": 1.7712, + "step": 15041 + }, + { + "epoch": 4.616942909760589, + "grad_norm": 0.23056018352508545, + "learning_rate": 5.85959163935204e-05, + "loss": 1.7684, + "step": 15042 + }, + { + "epoch": 4.617249846531615, + "grad_norm": 0.2533007562160492, + "learning_rate": 5.859101979683494e-05, + "loss": 1.7793, + "step": 15043 + }, + { + "epoch": 4.617556783302639, + "grad_norm": 0.21007375419139862, + "learning_rate": 5.8586123115248e-05, + "loss": 1.7484, + "step": 15044 + }, + { + "epoch": 4.6178637200736645, + "grad_norm": 0.21329566836357117, + "learning_rate": 5.858122634880797e-05, + "loss": 1.7763, + "step": 15045 + }, + { + "epoch": 4.61817065684469, + "grad_norm": 0.2362898588180542, + "learning_rate": 5.857632949756322e-05, + "loss": 1.7484, + "step": 15046 + }, + { + "epoch": 4.618477593615715, + "grad_norm": 0.2168794423341751, + "learning_rate": 5.857143256156214e-05, + "loss": 1.7752, + "step": 15047 + }, + { + "epoch": 4.6187845303867405, + "grad_norm": 0.24761471152305603, + "learning_rate": 5.856653554085316e-05, + "loss": 1.7793, + "step": 15048 + }, + { + "epoch": 4.619091467157766, + "grad_norm": 0.23202158510684967, + "learning_rate": 5.856163843548466e-05, + "loss": 1.6862, + "step": 15049 + }, + { + "epoch": 4.61939840392879, + "grad_norm": 0.23868000507354736, + "learning_rate": 5.855674124550501e-05, + "loss": 1.8075, + "step": 15050 + }, + { + "epoch": 4.619705340699816, + "grad_norm": 0.3063114583492279, + "learning_rate": 5.855184397096265e-05, + "loss": 1.8051, + "step": 15051 + }, + { + "epoch": 4.620012277470841, + "grad_norm": 
0.22672493755817413, + "learning_rate": 5.854694661190594e-05, + "loss": 1.7478, + "step": 15052 + }, + { + "epoch": 4.620319214241866, + "grad_norm": 0.3403559923171997, + "learning_rate": 5.8542049168383296e-05, + "loss": 1.765, + "step": 15053 + }, + { + "epoch": 4.620626151012892, + "grad_norm": 0.33852189779281616, + "learning_rate": 5.853715164044312e-05, + "loss": 1.7602, + "step": 15054 + }, + { + "epoch": 4.620933087783916, + "grad_norm": 0.25166940689086914, + "learning_rate": 5.85322540281338e-05, + "loss": 1.7584, + "step": 15055 + }, + { + "epoch": 4.621240024554941, + "grad_norm": 0.3417987823486328, + "learning_rate": 5.8527356331503757e-05, + "loss": 1.8491, + "step": 15056 + }, + { + "epoch": 4.621546961325967, + "grad_norm": 0.3286994397640228, + "learning_rate": 5.852245855060138e-05, + "loss": 1.7146, + "step": 15057 + }, + { + "epoch": 4.621853898096992, + "grad_norm": 0.24394257366657257, + "learning_rate": 5.851756068547505e-05, + "loss": 1.8762, + "step": 15058 + }, + { + "epoch": 4.622160834868017, + "grad_norm": 0.34945347905158997, + "learning_rate": 5.851266273617321e-05, + "loss": 1.8086, + "step": 15059 + }, + { + "epoch": 4.622467771639043, + "grad_norm": 0.30189210176467896, + "learning_rate": 5.850776470274425e-05, + "loss": 1.7366, + "step": 15060 + }, + { + "epoch": 4.622774708410067, + "grad_norm": 0.24050579965114594, + "learning_rate": 5.850286658523657e-05, + "loss": 1.7599, + "step": 15061 + }, + { + "epoch": 4.6230816451810925, + "grad_norm": 0.33650726079940796, + "learning_rate": 5.849796838369857e-05, + "loss": 1.7343, + "step": 15062 + }, + { + "epoch": 4.623388581952118, + "grad_norm": 0.2855902910232544, + "learning_rate": 5.849307009817868e-05, + "loss": 1.7325, + "step": 15063 + }, + { + "epoch": 4.623695518723143, + "grad_norm": 0.2562592923641205, + "learning_rate": 5.8488171728725275e-05, + "loss": 1.7772, + "step": 15064 + }, + { + "epoch": 4.6240024554941686, + "grad_norm": 0.23494984209537506, + 
"learning_rate": 5.84832732753868e-05, + "loss": 1.7263, + "step": 15065 + }, + { + "epoch": 4.624309392265193, + "grad_norm": 0.23248226940631866, + "learning_rate": 5.847837473821164e-05, + "loss": 1.7441, + "step": 15066 + }, + { + "epoch": 4.624616329036218, + "grad_norm": 0.2291254848241806, + "learning_rate": 5.847347611724821e-05, + "loss": 1.7742, + "step": 15067 + }, + { + "epoch": 4.624923265807244, + "grad_norm": 0.28305280208587646, + "learning_rate": 5.8468577412544925e-05, + "loss": 1.8224, + "step": 15068 + }, + { + "epoch": 4.625230202578269, + "grad_norm": 0.25531691312789917, + "learning_rate": 5.84636786241502e-05, + "loss": 1.7458, + "step": 15069 + }, + { + "epoch": 4.625537139349294, + "grad_norm": 0.2363462746143341, + "learning_rate": 5.845877975211242e-05, + "loss": 1.7977, + "step": 15070 + }, + { + "epoch": 4.62584407612032, + "grad_norm": 0.2707001864910126, + "learning_rate": 5.845388079648004e-05, + "loss": 1.774, + "step": 15071 + }, + { + "epoch": 4.626151012891344, + "grad_norm": 0.22281844913959503, + "learning_rate": 5.844898175730146e-05, + "loss": 1.7888, + "step": 15072 + }, + { + "epoch": 4.6264579496623695, + "grad_norm": 0.24809995293617249, + "learning_rate": 5.8444082634625086e-05, + "loss": 1.7895, + "step": 15073 + }, + { + "epoch": 4.626764886433395, + "grad_norm": 0.2842096984386444, + "learning_rate": 5.843918342849933e-05, + "loss": 1.7323, + "step": 15074 + }, + { + "epoch": 4.62707182320442, + "grad_norm": 0.21343614161014557, + "learning_rate": 5.843428413897261e-05, + "loss": 1.7298, + "step": 15075 + }, + { + "epoch": 4.627378759975445, + "grad_norm": 0.2420526146888733, + "learning_rate": 5.842938476609336e-05, + "loss": 1.778, + "step": 15076 + }, + { + "epoch": 4.62768569674647, + "grad_norm": 0.22202003002166748, + "learning_rate": 5.842448530990999e-05, + "loss": 1.779, + "step": 15077 + }, + { + "epoch": 4.627992633517495, + "grad_norm": 0.26784011721611023, + "learning_rate": 5.841958577047092e-05, + 
"loss": 1.799, + "step": 15078 + }, + { + "epoch": 4.628299570288521, + "grad_norm": 0.3230212926864624, + "learning_rate": 5.841468614782457e-05, + "loss": 1.7789, + "step": 15079 + }, + { + "epoch": 4.628606507059546, + "grad_norm": 0.24062715470790863, + "learning_rate": 5.840978644201935e-05, + "loss": 1.7697, + "step": 15080 + }, + { + "epoch": 4.628913443830571, + "grad_norm": 0.2882130444049835, + "learning_rate": 5.84048866531037e-05, + "loss": 1.7946, + "step": 15081 + }, + { + "epoch": 4.629220380601596, + "grad_norm": 0.3145603537559509, + "learning_rate": 5.839998678112602e-05, + "loss": 1.7116, + "step": 15082 + }, + { + "epoch": 4.629527317372621, + "grad_norm": 0.270997017621994, + "learning_rate": 5.839508682613477e-05, + "loss": 1.8281, + "step": 15083 + }, + { + "epoch": 4.629834254143646, + "grad_norm": 0.27299395203590393, + "learning_rate": 5.839018678817834e-05, + "loss": 1.8233, + "step": 15084 + }, + { + "epoch": 4.630141190914672, + "grad_norm": 0.2684478461742401, + "learning_rate": 5.838528666730517e-05, + "loss": 1.8111, + "step": 15085 + }, + { + "epoch": 4.630448127685697, + "grad_norm": 0.2365201860666275, + "learning_rate": 5.838038646356367e-05, + "loss": 1.7475, + "step": 15086 + }, + { + "epoch": 4.6307550644567215, + "grad_norm": 0.2661258280277252, + "learning_rate": 5.8375486177002305e-05, + "loss": 1.748, + "step": 15087 + }, + { + "epoch": 4.631062001227747, + "grad_norm": 0.2865012586116791, + "learning_rate": 5.8370585807669455e-05, + "loss": 1.7525, + "step": 15088 + }, + { + "epoch": 4.631368937998772, + "grad_norm": 0.2445172518491745, + "learning_rate": 5.836568535561358e-05, + "loss": 1.7278, + "step": 15089 + }, + { + "epoch": 4.6316758747697975, + "grad_norm": 0.28192558884620667, + "learning_rate": 5.8360784820883083e-05, + "loss": 1.7371, + "step": 15090 + }, + { + "epoch": 4.631982811540823, + "grad_norm": 0.38927358388900757, + "learning_rate": 5.835588420352642e-05, + "loss": 1.8088, + "step": 15091 + }, + { + 
"epoch": 4.632289748311848, + "grad_norm": 0.3409229516983032, + "learning_rate": 5.8350983503592025e-05, + "loss": 1.8011, + "step": 15092 + }, + { + "epoch": 4.632596685082873, + "grad_norm": 0.2464994341135025, + "learning_rate": 5.8346082721128294e-05, + "loss": 1.8354, + "step": 15093 + }, + { + "epoch": 4.632903621853898, + "grad_norm": 0.38765814900398254, + "learning_rate": 5.834118185618369e-05, + "loss": 1.7811, + "step": 15094 + }, + { + "epoch": 4.633210558624923, + "grad_norm": 0.42435070872306824, + "learning_rate": 5.833628090880664e-05, + "loss": 1.7855, + "step": 15095 + }, + { + "epoch": 4.633517495395949, + "grad_norm": 0.244876891374588, + "learning_rate": 5.833137987904558e-05, + "loss": 1.7494, + "step": 15096 + }, + { + "epoch": 4.633824432166974, + "grad_norm": 0.30353477597236633, + "learning_rate": 5.8326478766948934e-05, + "loss": 1.7772, + "step": 15097 + }, + { + "epoch": 4.634131368937998, + "grad_norm": 0.38839244842529297, + "learning_rate": 5.8321577572565146e-05, + "loss": 1.7689, + "step": 15098 + }, + { + "epoch": 4.634438305709024, + "grad_norm": 0.357129842042923, + "learning_rate": 5.8316676295942644e-05, + "loss": 1.7777, + "step": 15099 + }, + { + "epoch": 4.634745242480049, + "grad_norm": 0.23458799719810486, + "learning_rate": 5.831177493712988e-05, + "loss": 1.7544, + "step": 15100 + }, + { + "epoch": 4.635052179251074, + "grad_norm": 0.23751308023929596, + "learning_rate": 5.830687349617529e-05, + "loss": 1.7491, + "step": 15101 + }, + { + "epoch": 4.6353591160221, + "grad_norm": 0.31978943943977356, + "learning_rate": 5.83019719731273e-05, + "loss": 1.7439, + "step": 15102 + }, + { + "epoch": 4.635666052793125, + "grad_norm": 0.2751142084598541, + "learning_rate": 5.829707036803438e-05, + "loss": 1.8598, + "step": 15103 + }, + { + "epoch": 4.6359729895641495, + "grad_norm": 0.23670406639575958, + "learning_rate": 5.8292168680944914e-05, + "loss": 1.7629, + "step": 15104 + }, + { + "epoch": 4.636279926335175, + 
"grad_norm": 0.2447349727153778, + "learning_rate": 5.828726691190739e-05, + "loss": 1.7606, + "step": 15105 + }, + { + "epoch": 4.6365868631062, + "grad_norm": 0.2739902436733246, + "learning_rate": 5.828236506097023e-05, + "loss": 1.707, + "step": 15106 + }, + { + "epoch": 4.6368937998772255, + "grad_norm": 0.2050863653421402, + "learning_rate": 5.82774631281819e-05, + "loss": 1.7235, + "step": 15107 + }, + { + "epoch": 4.637200736648251, + "grad_norm": 0.3005560338497162, + "learning_rate": 5.827256111359082e-05, + "loss": 1.7785, + "step": 15108 + }, + { + "epoch": 4.637507673419275, + "grad_norm": 0.27168264985084534, + "learning_rate": 5.8267659017245434e-05, + "loss": 1.7844, + "step": 15109 + }, + { + "epoch": 4.637814610190301, + "grad_norm": 0.2965840995311737, + "learning_rate": 5.82627568391942e-05, + "loss": 1.7631, + "step": 15110 + }, + { + "epoch": 4.638121546961326, + "grad_norm": 0.3114408552646637, + "learning_rate": 5.825785457948556e-05, + "loss": 1.77, + "step": 15111 + }, + { + "epoch": 4.638428483732351, + "grad_norm": 0.2638910114765167, + "learning_rate": 5.825295223816796e-05, + "loss": 1.9183, + "step": 15112 + }, + { + "epoch": 4.638735420503377, + "grad_norm": 0.3293665051460266, + "learning_rate": 5.824804981528986e-05, + "loss": 1.6779, + "step": 15113 + }, + { + "epoch": 4.639042357274402, + "grad_norm": 0.28586456179618835, + "learning_rate": 5.824314731089968e-05, + "loss": 1.7905, + "step": 15114 + }, + { + "epoch": 4.639349294045426, + "grad_norm": 0.2254554182291031, + "learning_rate": 5.8238244725045906e-05, + "loss": 1.7602, + "step": 15115 + }, + { + "epoch": 4.639656230816452, + "grad_norm": 0.2770406901836395, + "learning_rate": 5.823334205777695e-05, + "loss": 1.7789, + "step": 15116 + }, + { + "epoch": 4.639963167587477, + "grad_norm": 0.2867025136947632, + "learning_rate": 5.822843930914129e-05, + "loss": 1.7408, + "step": 15117 + }, + { + "epoch": 4.640270104358502, + "grad_norm": 0.23486989736557007, + 
"learning_rate": 5.822353647918737e-05, + "loss": 1.7489, + "step": 15118 + }, + { + "epoch": 4.640577041129527, + "grad_norm": 0.2274324595928192, + "learning_rate": 5.821863356796367e-05, + "loss": 1.768, + "step": 15119 + }, + { + "epoch": 4.640883977900552, + "grad_norm": 0.25032591819763184, + "learning_rate": 5.821373057551858e-05, + "loss": 1.7602, + "step": 15120 + }, + { + "epoch": 4.6411909146715775, + "grad_norm": 0.22332963347434998, + "learning_rate": 5.820882750190059e-05, + "loss": 1.756, + "step": 15121 + }, + { + "epoch": 4.641497851442603, + "grad_norm": 0.24975591897964478, + "learning_rate": 5.820392434715817e-05, + "loss": 1.6963, + "step": 15122 + }, + { + "epoch": 4.641804788213628, + "grad_norm": 0.27892687916755676, + "learning_rate": 5.819902111133976e-05, + "loss": 1.8295, + "step": 15123 + }, + { + "epoch": 4.6421117249846535, + "grad_norm": 0.23914897441864014, + "learning_rate": 5.819411779449381e-05, + "loss": 1.7636, + "step": 15124 + }, + { + "epoch": 4.642418661755678, + "grad_norm": 0.2349565476179123, + "learning_rate": 5.818921439666879e-05, + "loss": 1.7823, + "step": 15125 + }, + { + "epoch": 4.642725598526703, + "grad_norm": 0.2075800597667694, + "learning_rate": 5.818431091791315e-05, + "loss": 1.7282, + "step": 15126 + }, + { + "epoch": 4.643032535297729, + "grad_norm": 0.19781073927879333, + "learning_rate": 5.817940735827535e-05, + "loss": 1.7598, + "step": 15127 + }, + { + "epoch": 4.643339472068754, + "grad_norm": 0.21997439861297607, + "learning_rate": 5.8174503717803866e-05, + "loss": 1.766, + "step": 15128 + }, + { + "epoch": 4.643646408839779, + "grad_norm": 0.23971444368362427, + "learning_rate": 5.816959999654713e-05, + "loss": 1.7824, + "step": 15129 + }, + { + "epoch": 4.643953345610804, + "grad_norm": 0.23357853293418884, + "learning_rate": 5.816469619455363e-05, + "loss": 1.7353, + "step": 15130 + }, + { + "epoch": 4.644260282381829, + "grad_norm": 0.22030897438526154, + "learning_rate": 5.815979231187181e-05, 
+ "loss": 1.7413, + "step": 15131 + }, + { + "epoch": 4.644567219152854, + "grad_norm": 0.2322571873664856, + "learning_rate": 5.815488834855014e-05, + "loss": 1.7305, + "step": 15132 + }, + { + "epoch": 4.64487415592388, + "grad_norm": 0.25256821513175964, + "learning_rate": 5.814998430463709e-05, + "loss": 1.7533, + "step": 15133 + }, + { + "epoch": 4.645181092694905, + "grad_norm": 0.248504638671875, + "learning_rate": 5.81450801801811e-05, + "loss": 1.7345, + "step": 15134 + }, + { + "epoch": 4.64548802946593, + "grad_norm": 0.22850964963436127, + "learning_rate": 5.8140175975230673e-05, + "loss": 1.8308, + "step": 15135 + }, + { + "epoch": 4.645794966236955, + "grad_norm": 0.3517951965332031, + "learning_rate": 5.813527168983426e-05, + "loss": 1.811, + "step": 15136 + }, + { + "epoch": 4.64610190300798, + "grad_norm": 0.32132068276405334, + "learning_rate": 5.813036732404031e-05, + "loss": 1.7584, + "step": 15137 + }, + { + "epoch": 4.6464088397790055, + "grad_norm": 0.2349396049976349, + "learning_rate": 5.812546287789731e-05, + "loss": 1.7762, + "step": 15138 + }, + { + "epoch": 4.646715776550031, + "grad_norm": 0.23519493639469147, + "learning_rate": 5.812055835145372e-05, + "loss": 1.7428, + "step": 15139 + }, + { + "epoch": 4.647022713321056, + "grad_norm": 0.29277852177619934, + "learning_rate": 5.8115653744758016e-05, + "loss": 1.7599, + "step": 15140 + }, + { + "epoch": 4.647329650092081, + "grad_norm": 0.2347593754529953, + "learning_rate": 5.811074905785867e-05, + "loss": 1.7401, + "step": 15141 + }, + { + "epoch": 4.647636586863106, + "grad_norm": 0.23080264031887054, + "learning_rate": 5.8105844290804147e-05, + "loss": 1.7705, + "step": 15142 + }, + { + "epoch": 4.647943523634131, + "grad_norm": 0.24686801433563232, + "learning_rate": 5.810093944364291e-05, + "loss": 1.7409, + "step": 15143 + }, + { + "epoch": 4.648250460405157, + "grad_norm": 0.24098120629787445, + "learning_rate": 5.809603451642344e-05, + "loss": 1.7893, + "step": 15144 + }, + { 
+ "epoch": 4.648557397176182, + "grad_norm": 0.23020638525485992, + "learning_rate": 5.809112950919422e-05, + "loss": 1.7589, + "step": 15145 + }, + { + "epoch": 4.648864333947207, + "grad_norm": 0.3036736249923706, + "learning_rate": 5.808622442200371e-05, + "loss": 1.7964, + "step": 15146 + }, + { + "epoch": 4.649171270718232, + "grad_norm": 0.2965635657310486, + "learning_rate": 5.808131925490039e-05, + "loss": 1.7986, + "step": 15147 + }, + { + "epoch": 4.649478207489257, + "grad_norm": 0.22241640090942383, + "learning_rate": 5.8076414007932745e-05, + "loss": 1.749, + "step": 15148 + }, + { + "epoch": 4.649785144260282, + "grad_norm": 0.20304246246814728, + "learning_rate": 5.8071508681149246e-05, + "loss": 1.7374, + "step": 15149 + }, + { + "epoch": 4.650092081031308, + "grad_norm": 0.19534410536289215, + "learning_rate": 5.806660327459834e-05, + "loss": 1.7087, + "step": 15150 + }, + { + "epoch": 4.650399017802332, + "grad_norm": 0.2151753008365631, + "learning_rate": 5.806169778832856e-05, + "loss": 1.7409, + "step": 15151 + }, + { + "epoch": 4.650705954573358, + "grad_norm": 0.2180301696062088, + "learning_rate": 5.805679222238836e-05, + "loss": 1.7522, + "step": 15152 + }, + { + "epoch": 4.651012891344383, + "grad_norm": 0.19917607307434082, + "learning_rate": 5.8051886576826205e-05, + "loss": 1.768, + "step": 15153 + }, + { + "epoch": 4.651319828115408, + "grad_norm": 0.2312052994966507, + "learning_rate": 5.804698085169059e-05, + "loss": 1.7799, + "step": 15154 + }, + { + "epoch": 4.651626764886434, + "grad_norm": 0.21541514992713928, + "learning_rate": 5.804207504702999e-05, + "loss": 1.7595, + "step": 15155 + }, + { + "epoch": 4.651933701657459, + "grad_norm": 0.2029450386762619, + "learning_rate": 5.803716916289289e-05, + "loss": 1.7727, + "step": 15156 + }, + { + "epoch": 4.652240638428484, + "grad_norm": 0.21796850860118866, + "learning_rate": 5.8032263199327787e-05, + "loss": 1.7445, + "step": 15157 + }, + { + "epoch": 4.652547575199509, + 
"grad_norm": 0.20309078693389893, + "learning_rate": 5.802735715638314e-05, + "loss": 1.6971, + "step": 15158 + }, + { + "epoch": 4.652854511970534, + "grad_norm": 0.21270112693309784, + "learning_rate": 5.802245103410745e-05, + "loss": 1.7162, + "step": 15159 + }, + { + "epoch": 4.653161448741559, + "grad_norm": 0.25357750058174133, + "learning_rate": 5.8017544832549184e-05, + "loss": 1.7534, + "step": 15160 + }, + { + "epoch": 4.653468385512585, + "grad_norm": 0.24015015363693237, + "learning_rate": 5.8012638551756847e-05, + "loss": 1.7639, + "step": 15161 + }, + { + "epoch": 4.653775322283609, + "grad_norm": 0.20507018268108368, + "learning_rate": 5.800773219177893e-05, + "loss": 1.7293, + "step": 15162 + }, + { + "epoch": 4.6540822590546345, + "grad_norm": 0.23399868607521057, + "learning_rate": 5.800282575266389e-05, + "loss": 1.8286, + "step": 15163 + }, + { + "epoch": 4.65438919582566, + "grad_norm": 0.27126726508140564, + "learning_rate": 5.799791923446025e-05, + "loss": 1.8028, + "step": 15164 + }, + { + "epoch": 4.654696132596685, + "grad_norm": 0.23644569516181946, + "learning_rate": 5.7993012637216494e-05, + "loss": 1.7138, + "step": 15165 + }, + { + "epoch": 4.6550030693677105, + "grad_norm": 0.21557916700839996, + "learning_rate": 5.7988105960981086e-05, + "loss": 1.7703, + "step": 15166 + }, + { + "epoch": 4.655310006138736, + "grad_norm": 0.22030150890350342, + "learning_rate": 5.798319920580254e-05, + "loss": 1.7282, + "step": 15167 + }, + { + "epoch": 4.65561694290976, + "grad_norm": 0.2092939168214798, + "learning_rate": 5.7978292371729325e-05, + "loss": 1.7853, + "step": 15168 + }, + { + "epoch": 4.655923879680786, + "grad_norm": 0.21643707156181335, + "learning_rate": 5.797338545880997e-05, + "loss": 1.7582, + "step": 15169 + }, + { + "epoch": 4.656230816451811, + "grad_norm": 0.3064669668674469, + "learning_rate": 5.796847846709294e-05, + "loss": 1.8139, + "step": 15170 + }, + { + "epoch": 4.656537753222836, + "grad_norm": 0.3060479760169983, 
+ "learning_rate": 5.796357139662674e-05, + "loss": 1.7356, + "step": 15171 + }, + { + "epoch": 4.656844689993862, + "grad_norm": 0.23546656966209412, + "learning_rate": 5.7958664247459835e-05, + "loss": 1.7937, + "step": 15172 + }, + { + "epoch": 4.657151626764886, + "grad_norm": 0.2890888750553131, + "learning_rate": 5.795375701964077e-05, + "loss": 1.7305, + "step": 15173 + }, + { + "epoch": 4.657458563535911, + "grad_norm": 0.27948084473609924, + "learning_rate": 5.794884971321801e-05, + "loss": 1.7428, + "step": 15174 + }, + { + "epoch": 4.657765500306937, + "grad_norm": 0.2354089468717575, + "learning_rate": 5.794394232824007e-05, + "loss": 1.7622, + "step": 15175 + }, + { + "epoch": 4.658072437077962, + "grad_norm": 0.3271159827709198, + "learning_rate": 5.793903486475541e-05, + "loss": 1.7826, + "step": 15176 + }, + { + "epoch": 4.658379373848987, + "grad_norm": 0.3561338782310486, + "learning_rate": 5.793412732281257e-05, + "loss": 1.7698, + "step": 15177 + }, + { + "epoch": 4.658686310620013, + "grad_norm": 0.2913050949573517, + "learning_rate": 5.7929219702460035e-05, + "loss": 1.8156, + "step": 15178 + }, + { + "epoch": 4.658993247391037, + "grad_norm": 0.2345089465379715, + "learning_rate": 5.7924312003746294e-05, + "loss": 1.7859, + "step": 15179 + }, + { + "epoch": 4.6593001841620625, + "grad_norm": 0.3018132150173187, + "learning_rate": 5.7919404226719865e-05, + "loss": 1.7622, + "step": 15180 + }, + { + "epoch": 4.659607120933088, + "grad_norm": 0.29134172201156616, + "learning_rate": 5.791449637142924e-05, + "loss": 1.7287, + "step": 15181 + }, + { + "epoch": 4.659914057704113, + "grad_norm": 0.24126321077346802, + "learning_rate": 5.7909588437922924e-05, + "loss": 1.7969, + "step": 15182 + }, + { + "epoch": 4.6602209944751385, + "grad_norm": 0.27053284645080566, + "learning_rate": 5.7904680426249415e-05, + "loss": 1.7399, + "step": 15183 + }, + { + "epoch": 4.660527931246163, + "grad_norm": 0.2636512219905853, + "learning_rate": 
5.789977233645722e-05, + "loss": 1.7615, + "step": 15184 + }, + { + "epoch": 4.660834868017188, + "grad_norm": 0.2263207584619522, + "learning_rate": 5.789486416859484e-05, + "loss": 1.7668, + "step": 15185 + }, + { + "epoch": 4.661141804788214, + "grad_norm": 0.25387826561927795, + "learning_rate": 5.78899559227108e-05, + "loss": 1.7594, + "step": 15186 + }, + { + "epoch": 4.661448741559239, + "grad_norm": 0.2268977165222168, + "learning_rate": 5.7885047598853596e-05, + "loss": 1.75, + "step": 15187 + }, + { + "epoch": 4.661755678330264, + "grad_norm": 0.29093095660209656, + "learning_rate": 5.788013919707172e-05, + "loss": 1.7291, + "step": 15188 + }, + { + "epoch": 4.66206261510129, + "grad_norm": 0.26578736305236816, + "learning_rate": 5.7875230717413684e-05, + "loss": 1.7276, + "step": 15189 + }, + { + "epoch": 4.662369551872314, + "grad_norm": 0.2548983097076416, + "learning_rate": 5.7870322159928e-05, + "loss": 1.755, + "step": 15190 + }, + { + "epoch": 4.662676488643339, + "grad_norm": 0.2246701419353485, + "learning_rate": 5.7865413524663184e-05, + "loss": 1.751, + "step": 15191 + }, + { + "epoch": 4.662983425414365, + "grad_norm": 0.3069002032279968, + "learning_rate": 5.7860504811667747e-05, + "loss": 1.7522, + "step": 15192 + }, + { + "epoch": 4.66329036218539, + "grad_norm": 0.3081241250038147, + "learning_rate": 5.7855596020990186e-05, + "loss": 1.7152, + "step": 15193 + }, + { + "epoch": 4.6635972989564145, + "grad_norm": 0.29006731510162354, + "learning_rate": 5.7850687152679026e-05, + "loss": 1.8471, + "step": 15194 + }, + { + "epoch": 4.66390423572744, + "grad_norm": 0.24131664633750916, + "learning_rate": 5.7845778206782786e-05, + "loss": 1.763, + "step": 15195 + }, + { + "epoch": 4.664211172498465, + "grad_norm": 0.21808001399040222, + "learning_rate": 5.784086918334994e-05, + "loss": 1.6989, + "step": 15196 + }, + { + "epoch": 4.6645181092694905, + "grad_norm": 0.2413240373134613, + "learning_rate": 5.783596008242904e-05, + "loss": 1.7869, + 
"step": 15197 + }, + { + "epoch": 4.664825046040516, + "grad_norm": 0.23310934007167816, + "learning_rate": 5.7831050904068594e-05, + "loss": 1.8017, + "step": 15198 + }, + { + "epoch": 4.665131982811541, + "grad_norm": 0.2577926814556122, + "learning_rate": 5.7826141648317125e-05, + "loss": 1.6938, + "step": 15199 + }, + { + "epoch": 4.665438919582566, + "grad_norm": 0.22523443400859833, + "learning_rate": 5.782123231522312e-05, + "loss": 1.8104, + "step": 15200 + }, + { + "epoch": 4.665745856353591, + "grad_norm": 0.23603026568889618, + "learning_rate": 5.781632290483512e-05, + "loss": 1.7484, + "step": 15201 + }, + { + "epoch": 4.666052793124616, + "grad_norm": 0.23195989429950714, + "learning_rate": 5.781141341720162e-05, + "loss": 1.7786, + "step": 15202 + }, + { + "epoch": 4.666359729895642, + "grad_norm": 0.21838274598121643, + "learning_rate": 5.780650385237118e-05, + "loss": 1.7509, + "step": 15203 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.26656514406204224, + "learning_rate": 5.780159421039229e-05, + "loss": 1.7875, + "step": 15204 + }, + { + "epoch": 4.666973603437691, + "grad_norm": 0.2293243706226349, + "learning_rate": 5.7796684491313456e-05, + "loss": 1.7518, + "step": 15205 + }, + { + "epoch": 4.667280540208717, + "grad_norm": 0.24190817773342133, + "learning_rate": 5.779177469518323e-05, + "loss": 1.7593, + "step": 15206 + }, + { + "epoch": 4.667587476979742, + "grad_norm": 0.31113871932029724, + "learning_rate": 5.77868648220501e-05, + "loss": 1.7911, + "step": 15207 + }, + { + "epoch": 4.667894413750767, + "grad_norm": 0.2875262498855591, + "learning_rate": 5.778195487196263e-05, + "loss": 1.7871, + "step": 15208 + }, + { + "epoch": 4.668201350521793, + "grad_norm": 0.2172149419784546, + "learning_rate": 5.777704484496931e-05, + "loss": 1.7592, + "step": 15209 + }, + { + "epoch": 4.668508287292818, + "grad_norm": 0.3282458186149597, + "learning_rate": 5.7772134741118675e-05, + "loss": 1.7687, + "step": 15210 + }, + { + "epoch": 
4.6688152240638425, + "grad_norm": 0.36963000893592834, + "learning_rate": 5.7767224560459255e-05, + "loss": 1.812, + "step": 15211 + }, + { + "epoch": 4.669122160834868, + "grad_norm": 0.22387740015983582, + "learning_rate": 5.776231430303957e-05, + "loss": 1.7449, + "step": 15212 + }, + { + "epoch": 4.669429097605893, + "grad_norm": 0.21468734741210938, + "learning_rate": 5.775740396890813e-05, + "loss": 1.716, + "step": 15213 + }, + { + "epoch": 4.6697360343769185, + "grad_norm": 0.2478475719690323, + "learning_rate": 5.7752493558113486e-05, + "loss": 1.7182, + "step": 15214 + }, + { + "epoch": 4.670042971147944, + "grad_norm": 0.20924845337867737, + "learning_rate": 5.774758307070416e-05, + "loss": 1.784, + "step": 15215 + }, + { + "epoch": 4.670349907918968, + "grad_norm": 0.2933209538459778, + "learning_rate": 5.774267250672868e-05, + "loss": 1.8375, + "step": 15216 + }, + { + "epoch": 4.670656844689994, + "grad_norm": 0.2744538486003876, + "learning_rate": 5.7737761866235565e-05, + "loss": 1.7019, + "step": 15217 + }, + { + "epoch": 4.670963781461019, + "grad_norm": 0.20991720259189606, + "learning_rate": 5.773285114927336e-05, + "loss": 1.7189, + "step": 15218 + }, + { + "epoch": 4.671270718232044, + "grad_norm": 0.2873254716396332, + "learning_rate": 5.772794035589057e-05, + "loss": 1.7492, + "step": 15219 + }, + { + "epoch": 4.67157765500307, + "grad_norm": 0.2781519591808319, + "learning_rate": 5.772302948613576e-05, + "loss": 1.7342, + "step": 15220 + }, + { + "epoch": 4.671884591774095, + "grad_norm": 0.23288768529891968, + "learning_rate": 5.7718118540057455e-05, + "loss": 1.7245, + "step": 15221 + }, + { + "epoch": 4.672191528545119, + "grad_norm": 0.40817564725875854, + "learning_rate": 5.771320751770417e-05, + "loss": 1.7659, + "step": 15222 + }, + { + "epoch": 4.672498465316145, + "grad_norm": 0.45521771907806396, + "learning_rate": 5.770829641912444e-05, + "loss": 1.7875, + "step": 15223 + }, + { + "epoch": 4.67280540208717, + "grad_norm": 
0.22353248298168182, + "learning_rate": 5.77033852443668e-05, + "loss": 1.7098, + "step": 15224 + }, + { + "epoch": 4.673112338858195, + "grad_norm": 0.4066791534423828, + "learning_rate": 5.769847399347981e-05, + "loss": 1.7277, + "step": 15225 + }, + { + "epoch": 4.67341927562922, + "grad_norm": 0.4299545884132385, + "learning_rate": 5.769356266651198e-05, + "loss": 1.7777, + "step": 15226 + }, + { + "epoch": 4.673726212400245, + "grad_norm": 0.21037638187408447, + "learning_rate": 5.768865126351186e-05, + "loss": 1.7263, + "step": 15227 + }, + { + "epoch": 4.6740331491712706, + "grad_norm": 0.3390437066555023, + "learning_rate": 5.768373978452798e-05, + "loss": 1.7457, + "step": 15228 + }, + { + "epoch": 4.674340085942296, + "grad_norm": 0.40003323554992676, + "learning_rate": 5.767882822960887e-05, + "loss": 1.8137, + "step": 15229 + }, + { + "epoch": 4.674647022713321, + "grad_norm": 0.2212848961353302, + "learning_rate": 5.767391659880308e-05, + "loss": 1.7131, + "step": 15230 + }, + { + "epoch": 4.6749539594843466, + "grad_norm": 0.30634984374046326, + "learning_rate": 5.766900489215915e-05, + "loss": 1.7775, + "step": 15231 + }, + { + "epoch": 4.675260896255372, + "grad_norm": 0.31412798166275024, + "learning_rate": 5.766409310972563e-05, + "loss": 1.7383, + "step": 15232 + }, + { + "epoch": 4.675567833026396, + "grad_norm": 0.21125225722789764, + "learning_rate": 5.7659181251551045e-05, + "loss": 1.8046, + "step": 15233 + }, + { + "epoch": 4.675874769797422, + "grad_norm": 0.3234494924545288, + "learning_rate": 5.765426931768394e-05, + "loss": 1.7838, + "step": 15234 + }, + { + "epoch": 4.676181706568447, + "grad_norm": 0.2668779194355011, + "learning_rate": 5.764935730817286e-05, + "loss": 1.7464, + "step": 15235 + }, + { + "epoch": 4.676488643339472, + "grad_norm": 0.22423583269119263, + "learning_rate": 5.764444522306633e-05, + "loss": 1.7165, + "step": 15236 + }, + { + "epoch": 4.676795580110497, + "grad_norm": 0.29066675901412964, + "learning_rate": 
5.7639533062412945e-05, + "loss": 1.75, + "step": 15237 + }, + { + "epoch": 4.677102516881522, + "grad_norm": 0.2963598370552063, + "learning_rate": 5.76346208262612e-05, + "loss": 1.8168, + "step": 15238 + }, + { + "epoch": 4.6774094536525475, + "grad_norm": 0.21484358608722687, + "learning_rate": 5.7629708514659655e-05, + "loss": 1.71, + "step": 15239 + }, + { + "epoch": 4.677716390423573, + "grad_norm": 0.20657925307750702, + "learning_rate": 5.762479612765686e-05, + "loss": 1.7239, + "step": 15240 + }, + { + "epoch": 4.678023327194598, + "grad_norm": 0.21336235105991364, + "learning_rate": 5.761988366530136e-05, + "loss": 1.7952, + "step": 15241 + }, + { + "epoch": 4.6783302639656235, + "grad_norm": 0.24156586825847626, + "learning_rate": 5.7614971127641696e-05, + "loss": 1.7709, + "step": 15242 + }, + { + "epoch": 4.678637200736648, + "grad_norm": 0.2633824944496155, + "learning_rate": 5.761005851472643e-05, + "loss": 1.7404, + "step": 15243 + }, + { + "epoch": 4.678944137507673, + "grad_norm": 0.23302829265594482, + "learning_rate": 5.760514582660411e-05, + "loss": 1.7006, + "step": 15244 + }, + { + "epoch": 4.679251074278699, + "grad_norm": 0.22404874861240387, + "learning_rate": 5.7600233063323283e-05, + "loss": 1.7731, + "step": 15245 + }, + { + "epoch": 4.679558011049724, + "grad_norm": 0.23217839002609253, + "learning_rate": 5.7595320224932495e-05, + "loss": 1.7452, + "step": 15246 + }, + { + "epoch": 4.679864947820749, + "grad_norm": 0.23131491243839264, + "learning_rate": 5.7590407311480296e-05, + "loss": 1.7547, + "step": 15247 + }, + { + "epoch": 4.680171884591774, + "grad_norm": 0.21907350420951843, + "learning_rate": 5.7585494323015245e-05, + "loss": 1.7556, + "step": 15248 + }, + { + "epoch": 4.680478821362799, + "grad_norm": 0.22416768968105316, + "learning_rate": 5.7580581259585895e-05, + "loss": 1.7783, + "step": 15249 + }, + { + "epoch": 4.680785758133824, + "grad_norm": 0.20203055441379547, + "learning_rate": 5.75756681212408e-05, + "loss": 
1.7285, + "step": 15250 + }, + { + "epoch": 4.68109269490485, + "grad_norm": 0.27838602662086487, + "learning_rate": 5.75707549080285e-05, + "loss": 1.7489, + "step": 15251 + }, + { + "epoch": 4.681399631675875, + "grad_norm": 0.2415023297071457, + "learning_rate": 5.7565841619997586e-05, + "loss": 1.7453, + "step": 15252 + }, + { + "epoch": 4.6817065684469, + "grad_norm": 0.22986920177936554, + "learning_rate": 5.756092825719658e-05, + "loss": 1.7315, + "step": 15253 + }, + { + "epoch": 4.682013505217925, + "grad_norm": 0.2427850216627121, + "learning_rate": 5.755601481967404e-05, + "loss": 1.772, + "step": 15254 + }, + { + "epoch": 4.68232044198895, + "grad_norm": 0.24556589126586914, + "learning_rate": 5.755110130747854e-05, + "loss": 1.7475, + "step": 15255 + }, + { + "epoch": 4.6826273787599755, + "grad_norm": 0.25252529978752136, + "learning_rate": 5.754618772065864e-05, + "loss": 1.7152, + "step": 15256 + }, + { + "epoch": 4.682934315531001, + "grad_norm": 0.24599005281925201, + "learning_rate": 5.754127405926287e-05, + "loss": 1.7911, + "step": 15257 + }, + { + "epoch": 4.683241252302026, + "grad_norm": 0.18961480259895325, + "learning_rate": 5.7536360323339836e-05, + "loss": 1.681, + "step": 15258 + }, + { + "epoch": 4.683548189073051, + "grad_norm": 0.24372327327728271, + "learning_rate": 5.7531446512938035e-05, + "loss": 1.7771, + "step": 15259 + }, + { + "epoch": 4.683855125844076, + "grad_norm": 0.23239269852638245, + "learning_rate": 5.752653262810609e-05, + "loss": 1.7502, + "step": 15260 + }, + { + "epoch": 4.684162062615101, + "grad_norm": 0.25076135993003845, + "learning_rate": 5.752161866889254e-05, + "loss": 1.7974, + "step": 15261 + }, + { + "epoch": 4.684468999386127, + "grad_norm": 0.2703748941421509, + "learning_rate": 5.7516704635345945e-05, + "loss": 1.7245, + "step": 15262 + }, + { + "epoch": 4.684775936157152, + "grad_norm": 0.19247616827487946, + "learning_rate": 5.751179052751487e-05, + "loss": 1.7105, + "step": 15263 + }, + { + 
"epoch": 4.685082872928177, + "grad_norm": 0.23166817426681519, + "learning_rate": 5.750687634544787e-05, + "loss": 1.8026, + "step": 15264 + }, + { + "epoch": 4.685389809699202, + "grad_norm": 0.22434166073799133, + "learning_rate": 5.7501962089193507e-05, + "loss": 1.7779, + "step": 15265 + }, + { + "epoch": 4.685696746470227, + "grad_norm": 0.190699502825737, + "learning_rate": 5.749704775880037e-05, + "loss": 1.726, + "step": 15266 + }, + { + "epoch": 4.686003683241252, + "grad_norm": 0.22995290160179138, + "learning_rate": 5.749213335431702e-05, + "loss": 1.7495, + "step": 15267 + }, + { + "epoch": 4.686310620012278, + "grad_norm": 0.2712057828903198, + "learning_rate": 5.7487218875792016e-05, + "loss": 1.7862, + "step": 15268 + }, + { + "epoch": 4.686617556783302, + "grad_norm": 0.2524562180042267, + "learning_rate": 5.7482304323273913e-05, + "loss": 1.7092, + "step": 15269 + }, + { + "epoch": 4.6869244935543275, + "grad_norm": 0.23810559511184692, + "learning_rate": 5.747738969681131e-05, + "loss": 1.8049, + "step": 15270 + }, + { + "epoch": 4.687231430325353, + "grad_norm": 0.25521910190582275, + "learning_rate": 5.747247499645275e-05, + "loss": 1.8124, + "step": 15271 + }, + { + "epoch": 4.687538367096378, + "grad_norm": 0.27797845005989075, + "learning_rate": 5.746756022224682e-05, + "loss": 1.7694, + "step": 15272 + }, + { + "epoch": 4.6878453038674035, + "grad_norm": 0.23849260807037354, + "learning_rate": 5.746264537424208e-05, + "loss": 1.7771, + "step": 15273 + }, + { + "epoch": 4.688152240638429, + "grad_norm": 0.24368882179260254, + "learning_rate": 5.74577304524871e-05, + "loss": 1.8143, + "step": 15274 + }, + { + "epoch": 4.688459177409453, + "grad_norm": 0.2712198793888092, + "learning_rate": 5.745281545703045e-05, + "loss": 1.7683, + "step": 15275 + }, + { + "epoch": 4.688766114180479, + "grad_norm": 0.30913081765174866, + "learning_rate": 5.7447900387920716e-05, + "loss": 1.7111, + "step": 15276 + }, + { + "epoch": 4.689073050951504, + 
"grad_norm": 0.22123363614082336, + "learning_rate": 5.744298524520646e-05, + "loss": 1.7466, + "step": 15277 + }, + { + "epoch": 4.689379987722529, + "grad_norm": 0.32836318016052246, + "learning_rate": 5.743807002893628e-05, + "loss": 1.8083, + "step": 15278 + }, + { + "epoch": 4.689686924493555, + "grad_norm": 0.33319979906082153, + "learning_rate": 5.743315473915871e-05, + "loss": 1.7122, + "step": 15279 + }, + { + "epoch": 4.689993861264579, + "grad_norm": 0.252163290977478, + "learning_rate": 5.742823937592236e-05, + "loss": 1.7599, + "step": 15280 + }, + { + "epoch": 4.690300798035604, + "grad_norm": 0.23248571157455444, + "learning_rate": 5.7423323939275797e-05, + "loss": 1.7791, + "step": 15281 + }, + { + "epoch": 4.69060773480663, + "grad_norm": 0.27024057507514954, + "learning_rate": 5.741840842926759e-05, + "loss": 1.7608, + "step": 15282 + }, + { + "epoch": 4.690914671577655, + "grad_norm": 0.21888256072998047, + "learning_rate": 5.7413492845946326e-05, + "loss": 1.7407, + "step": 15283 + }, + { + "epoch": 4.69122160834868, + "grad_norm": 0.2574782073497772, + "learning_rate": 5.740857718936058e-05, + "loss": 1.707, + "step": 15284 + }, + { + "epoch": 4.691528545119706, + "grad_norm": 0.2541569769382477, + "learning_rate": 5.740366145955893e-05, + "loss": 1.7301, + "step": 15285 + }, + { + "epoch": 4.69183548189073, + "grad_norm": 0.23484647274017334, + "learning_rate": 5.7398745656589955e-05, + "loss": 1.772, + "step": 15286 + }, + { + "epoch": 4.6921424186617555, + "grad_norm": 0.2827093005180359, + "learning_rate": 5.739382978050225e-05, + "loss": 1.7745, + "step": 15287 + }, + { + "epoch": 4.692449355432781, + "grad_norm": 0.300387978553772, + "learning_rate": 5.738891383134437e-05, + "loss": 1.7966, + "step": 15288 + }, + { + "epoch": 4.692756292203806, + "grad_norm": 0.2414523959159851, + "learning_rate": 5.7383997809164926e-05, + "loss": 1.7355, + "step": 15289 + }, + { + "epoch": 4.6930632289748315, + "grad_norm": 0.21221841871738434, + 
"learning_rate": 5.737908171401248e-05, + "loss": 1.7935, + "step": 15290 + }, + { + "epoch": 4.693370165745856, + "grad_norm": 0.23488084971904755, + "learning_rate": 5.737416554593563e-05, + "loss": 1.7447, + "step": 15291 + }, + { + "epoch": 4.693677102516881, + "grad_norm": 0.26176631450653076, + "learning_rate": 5.7369249304982954e-05, + "loss": 1.769, + "step": 15292 + }, + { + "epoch": 4.693984039287907, + "grad_norm": 0.23060615360736847, + "learning_rate": 5.736433299120303e-05, + "loss": 1.7344, + "step": 15293 + }, + { + "epoch": 4.694290976058932, + "grad_norm": 0.2536846399307251, + "learning_rate": 5.7359416604644456e-05, + "loss": 1.7862, + "step": 15294 + }, + { + "epoch": 4.694597912829957, + "grad_norm": 0.23221342265605927, + "learning_rate": 5.735450014535581e-05, + "loss": 1.743, + "step": 15295 + }, + { + "epoch": 4.694904849600983, + "grad_norm": 0.25320062041282654, + "learning_rate": 5.734958361338568e-05, + "loss": 1.8001, + "step": 15296 + }, + { + "epoch": 4.695211786372007, + "grad_norm": 0.23132461309432983, + "learning_rate": 5.734466700878267e-05, + "loss": 1.7676, + "step": 15297 + }, + { + "epoch": 4.695518723143032, + "grad_norm": 0.2222728580236435, + "learning_rate": 5.7339750331595346e-05, + "loss": 1.7267, + "step": 15298 + }, + { + "epoch": 4.695825659914058, + "grad_norm": 0.2505118250846863, + "learning_rate": 5.733483358187231e-05, + "loss": 1.7467, + "step": 15299 + }, + { + "epoch": 4.696132596685083, + "grad_norm": 0.23609887063503265, + "learning_rate": 5.732991675966214e-05, + "loss": 1.7319, + "step": 15300 + }, + { + "epoch": 4.696439533456108, + "grad_norm": 0.2939738631248474, + "learning_rate": 5.732499986501345e-05, + "loss": 1.8676, + "step": 15301 + }, + { + "epoch": 4.696746470227133, + "grad_norm": 0.29868564009666443, + "learning_rate": 5.7320082897974814e-05, + "loss": 1.7541, + "step": 15302 + }, + { + "epoch": 4.697053406998158, + "grad_norm": 0.2366383820772171, + "learning_rate": 5.731516585859482e-05, 
+ "loss": 1.7531, + "step": 15303 + }, + { + "epoch": 4.6973603437691835, + "grad_norm": 0.2721317410469055, + "learning_rate": 5.731024874692208e-05, + "loss": 1.7444, + "step": 15304 + }, + { + "epoch": 4.697667280540209, + "grad_norm": 0.24925900995731354, + "learning_rate": 5.730533156300517e-05, + "loss": 1.7716, + "step": 15305 + }, + { + "epoch": 4.697974217311234, + "grad_norm": 0.23012754321098328, + "learning_rate": 5.7300414306892704e-05, + "loss": 1.7211, + "step": 15306 + }, + { + "epoch": 4.6982811540822595, + "grad_norm": 0.21274085342884064, + "learning_rate": 5.7295496978633254e-05, + "loss": 1.7853, + "step": 15307 + }, + { + "epoch": 4.698588090853284, + "grad_norm": 0.21799001097679138, + "learning_rate": 5.729057957827544e-05, + "loss": 1.7505, + "step": 15308 + }, + { + "epoch": 4.698895027624309, + "grad_norm": 0.22365793585777283, + "learning_rate": 5.728566210586783e-05, + "loss": 1.7934, + "step": 15309 + }, + { + "epoch": 4.699201964395335, + "grad_norm": 0.23325085639953613, + "learning_rate": 5.728074456145903e-05, + "loss": 1.7354, + "step": 15310 + }, + { + "epoch": 4.69950890116636, + "grad_norm": 0.2175164669752121, + "learning_rate": 5.7275826945097654e-05, + "loss": 1.7541, + "step": 15311 + }, + { + "epoch": 4.699815837937384, + "grad_norm": 0.24657388031482697, + "learning_rate": 5.727090925683231e-05, + "loss": 1.814, + "step": 15312 + }, + { + "epoch": 4.70012277470841, + "grad_norm": 0.2437550574541092, + "learning_rate": 5.726599149671156e-05, + "loss": 1.7234, + "step": 15313 + }, + { + "epoch": 4.700429711479435, + "grad_norm": 0.21053487062454224, + "learning_rate": 5.726107366478402e-05, + "loss": 1.7788, + "step": 15314 + }, + { + "epoch": 4.7007366482504604, + "grad_norm": 0.2007097452878952, + "learning_rate": 5.725615576109831e-05, + "loss": 1.7453, + "step": 15315 + }, + { + "epoch": 4.701043585021486, + "grad_norm": 0.19331564009189606, + "learning_rate": 5.725123778570299e-05, + "loss": 1.7142, + "step": 15316 + 
}, + { + "epoch": 4.701350521792511, + "grad_norm": 0.24291567504405975, + "learning_rate": 5.7246319738646706e-05, + "loss": 1.8081, + "step": 15317 + }, + { + "epoch": 4.701657458563536, + "grad_norm": 0.21423695981502533, + "learning_rate": 5.724140161997804e-05, + "loss": 1.7021, + "step": 15318 + }, + { + "epoch": 4.701964395334561, + "grad_norm": 0.20857618749141693, + "learning_rate": 5.72364834297456e-05, + "loss": 1.7447, + "step": 15319 + }, + { + "epoch": 4.702271332105586, + "grad_norm": 0.2547401487827301, + "learning_rate": 5.7231565167998e-05, + "loss": 1.7505, + "step": 15320 + }, + { + "epoch": 4.702578268876612, + "grad_norm": 0.2729472219944, + "learning_rate": 5.7226646834783825e-05, + "loss": 1.7974, + "step": 15321 + }, + { + "epoch": 4.702885205647637, + "grad_norm": 0.23258371651172638, + "learning_rate": 5.722172843015169e-05, + "loss": 1.7562, + "step": 15322 + }, + { + "epoch": 4.703192142418661, + "grad_norm": 0.23399893939495087, + "learning_rate": 5.72168099541502e-05, + "loss": 1.7674, + "step": 15323 + }, + { + "epoch": 4.703499079189687, + "grad_norm": 0.2678206264972687, + "learning_rate": 5.721189140682797e-05, + "loss": 1.7331, + "step": 15324 + }, + { + "epoch": 4.703806015960712, + "grad_norm": 0.19472146034240723, + "learning_rate": 5.7206972788233593e-05, + "loss": 1.7003, + "step": 15325 + }, + { + "epoch": 4.704112952731737, + "grad_norm": 0.2199394404888153, + "learning_rate": 5.72020540984157e-05, + "loss": 1.7072, + "step": 15326 + }, + { + "epoch": 4.704419889502763, + "grad_norm": 0.219175323843956, + "learning_rate": 5.719713533742287e-05, + "loss": 1.7591, + "step": 15327 + }, + { + "epoch": 4.704726826273788, + "grad_norm": 0.21127547323703766, + "learning_rate": 5.719221650530374e-05, + "loss": 1.8059, + "step": 15328 + }, + { + "epoch": 4.7050337630448125, + "grad_norm": 0.22189834713935852, + "learning_rate": 5.7187297602106905e-05, + "loss": 1.7529, + "step": 15329 + }, + { + "epoch": 4.705340699815838, + 
"grad_norm": 0.19945195317268372, + "learning_rate": 5.7182378627881e-05, + "loss": 1.7133, + "step": 15330 + }, + { + "epoch": 4.705647636586863, + "grad_norm": 0.2177499681711197, + "learning_rate": 5.7177459582674595e-05, + "loss": 1.7451, + "step": 15331 + }, + { + "epoch": 4.7059545733578885, + "grad_norm": 0.19489440321922302, + "learning_rate": 5.717254046653635e-05, + "loss": 1.7499, + "step": 15332 + }, + { + "epoch": 4.706261510128914, + "grad_norm": 0.21366968750953674, + "learning_rate": 5.716762127951485e-05, + "loss": 1.7683, + "step": 15333 + }, + { + "epoch": 4.706568446899938, + "grad_norm": 0.2894177734851837, + "learning_rate": 5.71627020216587e-05, + "loss": 1.8235, + "step": 15334 + }, + { + "epoch": 4.706875383670964, + "grad_norm": 0.22175677120685577, + "learning_rate": 5.7157782693016534e-05, + "loss": 1.7421, + "step": 15335 + }, + { + "epoch": 4.707182320441989, + "grad_norm": 0.23653541505336761, + "learning_rate": 5.715286329363698e-05, + "loss": 1.6937, + "step": 15336 + }, + { + "epoch": 4.707489257213014, + "grad_norm": 0.3015746772289276, + "learning_rate": 5.714794382356863e-05, + "loss": 1.7159, + "step": 15337 + }, + { + "epoch": 4.70779619398404, + "grad_norm": 0.24045881628990173, + "learning_rate": 5.714302428286011e-05, + "loss": 1.7263, + "step": 15338 + }, + { + "epoch": 4.708103130755065, + "grad_norm": 0.19836920499801636, + "learning_rate": 5.7138104671560035e-05, + "loss": 1.7604, + "step": 15339 + }, + { + "epoch": 4.708410067526089, + "grad_norm": 0.2430238276720047, + "learning_rate": 5.7133184989717036e-05, + "loss": 1.7147, + "step": 15340 + }, + { + "epoch": 4.708717004297115, + "grad_norm": 0.19388417899608612, + "learning_rate": 5.712826523737971e-05, + "loss": 1.7153, + "step": 15341 + }, + { + "epoch": 4.70902394106814, + "grad_norm": 0.19648151099681854, + "learning_rate": 5.7123345414596694e-05, + "loss": 1.7373, + "step": 15342 + }, + { + "epoch": 4.709330877839165, + "grad_norm": 0.20326325297355652, + 
"learning_rate": 5.711842552141661e-05, + "loss": 1.7012, + "step": 15343 + }, + { + "epoch": 4.70963781461019, + "grad_norm": 0.20798304677009583, + "learning_rate": 5.711350555788806e-05, + "loss": 1.7134, + "step": 15344 + }, + { + "epoch": 4.709944751381215, + "grad_norm": 0.29318806529045105, + "learning_rate": 5.7108585524059674e-05, + "loss": 1.7661, + "step": 15345 + }, + { + "epoch": 4.7102516881522405, + "grad_norm": 0.273318350315094, + "learning_rate": 5.710366541998009e-05, + "loss": 1.7329, + "step": 15346 + }, + { + "epoch": 4.710558624923266, + "grad_norm": 0.2306031584739685, + "learning_rate": 5.7098745245697925e-05, + "loss": 1.8152, + "step": 15347 + }, + { + "epoch": 4.710865561694291, + "grad_norm": 0.27630630135536194, + "learning_rate": 5.709382500126179e-05, + "loss": 1.7955, + "step": 15348 + }, + { + "epoch": 4.7111724984653165, + "grad_norm": 0.2366025298833847, + "learning_rate": 5.7088904686720326e-05, + "loss": 1.7943, + "step": 15349 + }, + { + "epoch": 4.711479435236341, + "grad_norm": 0.24196656048297882, + "learning_rate": 5.708398430212215e-05, + "loss": 1.698, + "step": 15350 + }, + { + "epoch": 4.711786372007366, + "grad_norm": 0.2770058512687683, + "learning_rate": 5.707906384751588e-05, + "loss": 1.7618, + "step": 15351 + }, + { + "epoch": 4.712093308778392, + "grad_norm": 0.20432323217391968, + "learning_rate": 5.7074143322950157e-05, + "loss": 1.7422, + "step": 15352 + }, + { + "epoch": 4.712400245549417, + "grad_norm": 0.25543150305747986, + "learning_rate": 5.70692227284736e-05, + "loss": 1.7744, + "step": 15353 + }, + { + "epoch": 4.712707182320442, + "grad_norm": 0.24315913021564484, + "learning_rate": 5.7064302064134855e-05, + "loss": 1.7127, + "step": 15354 + }, + { + "epoch": 4.713014119091467, + "grad_norm": 0.23636099696159363, + "learning_rate": 5.705938132998252e-05, + "loss": 1.7725, + "step": 15355 + }, + { + "epoch": 4.713321055862492, + "grad_norm": 0.26809820532798767, + "learning_rate": 
5.705446052606526e-05, + "loss": 1.8338, + "step": 15356 + }, + { + "epoch": 4.713627992633517, + "grad_norm": 0.24969002604484558, + "learning_rate": 5.704953965243167e-05, + "loss": 1.8225, + "step": 15357 + }, + { + "epoch": 4.713934929404543, + "grad_norm": 0.23189692199230194, + "learning_rate": 5.70446187091304e-05, + "loss": 1.7901, + "step": 15358 + }, + { + "epoch": 4.714241866175568, + "grad_norm": 0.22373750805854797, + "learning_rate": 5.703969769621008e-05, + "loss": 1.6919, + "step": 15359 + }, + { + "epoch": 4.714548802946593, + "grad_norm": 0.23963531851768494, + "learning_rate": 5.703477661371934e-05, + "loss": 1.7806, + "step": 15360 + }, + { + "epoch": 4.714855739717618, + "grad_norm": 0.20365150272846222, + "learning_rate": 5.702985546170683e-05, + "loss": 1.7207, + "step": 15361 + }, + { + "epoch": 4.715162676488643, + "grad_norm": 0.245658278465271, + "learning_rate": 5.702493424022114e-05, + "loss": 1.7589, + "step": 15362 + }, + { + "epoch": 4.7154696132596685, + "grad_norm": 0.22633756697177887, + "learning_rate": 5.702001294931094e-05, + "loss": 1.7893, + "step": 15363 + }, + { + "epoch": 4.715776550030694, + "grad_norm": 0.21587726473808289, + "learning_rate": 5.701509158902487e-05, + "loss": 1.8095, + "step": 15364 + }, + { + "epoch": 4.716083486801719, + "grad_norm": 0.22553963959217072, + "learning_rate": 5.701017015941155e-05, + "loss": 1.7419, + "step": 15365 + }, + { + "epoch": 4.716390423572744, + "grad_norm": 0.2276087999343872, + "learning_rate": 5.700524866051962e-05, + "loss": 1.7052, + "step": 15366 + }, + { + "epoch": 4.716697360343769, + "grad_norm": 0.22236761450767517, + "learning_rate": 5.700032709239771e-05, + "loss": 1.8612, + "step": 15367 + }, + { + "epoch": 4.717004297114794, + "grad_norm": 0.22816185653209686, + "learning_rate": 5.6995405455094465e-05, + "loss": 1.78, + "step": 15368 + }, + { + "epoch": 4.71731123388582, + "grad_norm": 0.21597479283809662, + "learning_rate": 5.6990483748658516e-05, + "loss": 1.8276, 
+ "step": 15369 + }, + { + "epoch": 4.717618170656845, + "grad_norm": 0.22209586203098297, + "learning_rate": 5.6985561973138533e-05, + "loss": 1.74, + "step": 15370 + }, + { + "epoch": 4.71792510742787, + "grad_norm": 0.24249997735023499, + "learning_rate": 5.6980640128583116e-05, + "loss": 1.8035, + "step": 15371 + }, + { + "epoch": 4.718232044198895, + "grad_norm": 0.23326106369495392, + "learning_rate": 5.6975718215040943e-05, + "loss": 1.7969, + "step": 15372 + }, + { + "epoch": 4.71853898096992, + "grad_norm": 0.215044766664505, + "learning_rate": 5.6970796232560596e-05, + "loss": 1.7345, + "step": 15373 + }, + { + "epoch": 4.718845917740945, + "grad_norm": 0.20231883227825165, + "learning_rate": 5.696587418119078e-05, + "loss": 1.7231, + "step": 15374 + }, + { + "epoch": 4.719152854511971, + "grad_norm": 0.2136038839817047, + "learning_rate": 5.696095206098011e-05, + "loss": 1.7421, + "step": 15375 + }, + { + "epoch": 4.719459791282996, + "grad_norm": 0.2662335932254791, + "learning_rate": 5.6956029871977235e-05, + "loss": 1.7518, + "step": 15376 + }, + { + "epoch": 4.7197667280540205, + "grad_norm": 0.25649648904800415, + "learning_rate": 5.6951107614230783e-05, + "loss": 1.8314, + "step": 15377 + }, + { + "epoch": 4.720073664825046, + "grad_norm": 0.21995560824871063, + "learning_rate": 5.6946185287789425e-05, + "loss": 1.7511, + "step": 15378 + }, + { + "epoch": 4.720380601596071, + "grad_norm": 0.3388935923576355, + "learning_rate": 5.694126289270177e-05, + "loss": 1.7975, + "step": 15379 + }, + { + "epoch": 4.7206875383670965, + "grad_norm": 0.32886409759521484, + "learning_rate": 5.693634042901651e-05, + "loss": 1.7153, + "step": 15380 + }, + { + "epoch": 4.720994475138122, + "grad_norm": 0.21727977693080902, + "learning_rate": 5.693141789678226e-05, + "loss": 1.7095, + "step": 15381 + }, + { + "epoch": 4.721301411909147, + "grad_norm": 0.2680833041667938, + "learning_rate": 5.6926495296047675e-05, + "loss": 1.696, + "step": 15382 + }, + { + "epoch": 
4.721608348680172, + "grad_norm": 0.2645499110221863, + "learning_rate": 5.692157262686141e-05, + "loss": 1.6889, + "step": 15383 + }, + { + "epoch": 4.721915285451197, + "grad_norm": 0.20362348854541779, + "learning_rate": 5.69166498892721e-05, + "loss": 1.7303, + "step": 15384 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.24259062111377716, + "learning_rate": 5.691172708332839e-05, + "loss": 1.7684, + "step": 15385 + }, + { + "epoch": 4.722529158993248, + "grad_norm": 0.24204276502132416, + "learning_rate": 5.690680420907897e-05, + "loss": 1.7728, + "step": 15386 + }, + { + "epoch": 4.722836095764272, + "grad_norm": 0.3038320243358612, + "learning_rate": 5.690188126657244e-05, + "loss": 1.7573, + "step": 15387 + }, + { + "epoch": 4.723143032535297, + "grad_norm": 0.24619868397712708, + "learning_rate": 5.689695825585749e-05, + "loss": 1.754, + "step": 15388 + }, + { + "epoch": 4.723449969306323, + "grad_norm": 0.19441325962543488, + "learning_rate": 5.689203517698276e-05, + "loss": 1.726, + "step": 15389 + }, + { + "epoch": 4.723756906077348, + "grad_norm": 0.2874276340007782, + "learning_rate": 5.688711202999688e-05, + "loss": 1.7704, + "step": 15390 + }, + { + "epoch": 4.724063842848373, + "grad_norm": 0.24488390982151031, + "learning_rate": 5.6882188814948535e-05, + "loss": 1.7477, + "step": 15391 + }, + { + "epoch": 4.724370779619399, + "grad_norm": 0.22674018144607544, + "learning_rate": 5.687726553188636e-05, + "loss": 1.7287, + "step": 15392 + }, + { + "epoch": 4.724677716390423, + "grad_norm": 0.2653258442878723, + "learning_rate": 5.687234218085902e-05, + "loss": 1.7415, + "step": 15393 + }, + { + "epoch": 4.7249846531614486, + "grad_norm": 0.20345374941825867, + "learning_rate": 5.686741876191516e-05, + "loss": 1.764, + "step": 15394 + }, + { + "epoch": 4.725291589932474, + "grad_norm": 0.23193977773189545, + "learning_rate": 5.686249527510345e-05, + "loss": 1.7557, + "step": 15395 + }, + { + "epoch": 4.725598526703499, + "grad_norm": 
0.26426708698272705, + "learning_rate": 5.685757172047253e-05, + "loss": 1.7708, + "step": 15396 + }, + { + "epoch": 4.725905463474525, + "grad_norm": 0.21377156674861908, + "learning_rate": 5.685264809807107e-05, + "loss": 1.6921, + "step": 15397 + }, + { + "epoch": 4.726212400245549, + "grad_norm": 0.21628457307815552, + "learning_rate": 5.684772440794773e-05, + "loss": 1.72, + "step": 15398 + }, + { + "epoch": 4.726519337016574, + "grad_norm": 0.19200581312179565, + "learning_rate": 5.684280065015116e-05, + "loss": 1.7311, + "step": 15399 + }, + { + "epoch": 4.7268262737876, + "grad_norm": 0.22227540612220764, + "learning_rate": 5.683787682473003e-05, + "loss": 1.7451, + "step": 15400 + }, + { + "epoch": 4.727133210558625, + "grad_norm": 0.18053604662418365, + "learning_rate": 5.683295293173299e-05, + "loss": 1.6816, + "step": 15401 + }, + { + "epoch": 4.72744014732965, + "grad_norm": 0.19827169179916382, + "learning_rate": 5.682802897120869e-05, + "loss": 1.7315, + "step": 15402 + }, + { + "epoch": 4.727747084100676, + "grad_norm": 0.2768021821975708, + "learning_rate": 5.682310494320582e-05, + "loss": 1.7714, + "step": 15403 + }, + { + "epoch": 4.7280540208717, + "grad_norm": 0.2613474428653717, + "learning_rate": 5.6818180847773027e-05, + "loss": 1.7332, + "step": 15404 + }, + { + "epoch": 4.7283609576427255, + "grad_norm": 0.21546787023544312, + "learning_rate": 5.681325668495898e-05, + "loss": 1.771, + "step": 15405 + }, + { + "epoch": 4.728667894413751, + "grad_norm": 0.24442137777805328, + "learning_rate": 5.680833245481234e-05, + "loss": 1.7296, + "step": 15406 + }, + { + "epoch": 4.728974831184776, + "grad_norm": 0.2622109055519104, + "learning_rate": 5.680340815738175e-05, + "loss": 1.7778, + "step": 15407 + }, + { + "epoch": 4.7292817679558015, + "grad_norm": 0.22379513084888458, + "learning_rate": 5.6798483792715904e-05, + "loss": 1.7953, + "step": 15408 + }, + { + "epoch": 4.729588704726826, + "grad_norm": 0.21901065111160278, + "learning_rate": 
5.679355936086346e-05, + "loss": 1.7287, + "step": 15409 + }, + { + "epoch": 4.729895641497851, + "grad_norm": 0.3023792505264282, + "learning_rate": 5.6788634861873066e-05, + "loss": 1.7851, + "step": 15410 + }, + { + "epoch": 4.730202578268877, + "grad_norm": 0.23882482945919037, + "learning_rate": 5.678371029579342e-05, + "loss": 1.7621, + "step": 15411 + }, + { + "epoch": 4.730509515039902, + "grad_norm": 0.2661043703556061, + "learning_rate": 5.6778785662673175e-05, + "loss": 1.7453, + "step": 15412 + }, + { + "epoch": 4.730816451810927, + "grad_norm": 0.330208957195282, + "learning_rate": 5.677386096256099e-05, + "loss": 1.761, + "step": 15413 + }, + { + "epoch": 4.731123388581953, + "grad_norm": 0.2686570882797241, + "learning_rate": 5.676893619550552e-05, + "loss": 1.7539, + "step": 15414 + }, + { + "epoch": 4.731430325352977, + "grad_norm": 0.24308046698570251, + "learning_rate": 5.676401136155548e-05, + "loss": 1.7345, + "step": 15415 + }, + { + "epoch": 4.731737262124002, + "grad_norm": 0.4137137830257416, + "learning_rate": 5.67590864607595e-05, + "loss": 1.7688, + "step": 15416 + }, + { + "epoch": 4.732044198895028, + "grad_norm": 0.32161539793014526, + "learning_rate": 5.675416149316628e-05, + "loss": 1.7881, + "step": 15417 + }, + { + "epoch": 4.732351135666053, + "grad_norm": 0.2336999475955963, + "learning_rate": 5.674923645882447e-05, + "loss": 1.755, + "step": 15418 + }, + { + "epoch": 4.7326580724370775, + "grad_norm": 0.32781684398651123, + "learning_rate": 5.6744311357782754e-05, + "loss": 1.8062, + "step": 15419 + }, + { + "epoch": 4.732965009208103, + "grad_norm": 0.2475704401731491, + "learning_rate": 5.6739386190089795e-05, + "loss": 1.725, + "step": 15420 + }, + { + "epoch": 4.733271945979128, + "grad_norm": 0.26295650005340576, + "learning_rate": 5.673446095579427e-05, + "loss": 1.7673, + "step": 15421 + }, + { + "epoch": 4.7335788827501535, + "grad_norm": 0.3454873859882355, + "learning_rate": 5.6729535654944864e-05, + "loss": 1.7523, + 
"step": 15422 + }, + { + "epoch": 4.733885819521179, + "grad_norm": 0.2306666374206543, + "learning_rate": 5.672461028759024e-05, + "loss": 1.7085, + "step": 15423 + }, + { + "epoch": 4.734192756292204, + "grad_norm": 0.30825871229171753, + "learning_rate": 5.671968485377908e-05, + "loss": 1.7642, + "step": 15424 + }, + { + "epoch": 4.734499693063229, + "grad_norm": 0.42611342668533325, + "learning_rate": 5.6714759353560045e-05, + "loss": 1.7832, + "step": 15425 + }, + { + "epoch": 4.734806629834254, + "grad_norm": 0.29502514004707336, + "learning_rate": 5.670983378698182e-05, + "loss": 1.8153, + "step": 15426 + }, + { + "epoch": 4.735113566605279, + "grad_norm": 0.28416305780410767, + "learning_rate": 5.6704908154093096e-05, + "loss": 1.756, + "step": 15427 + }, + { + "epoch": 4.735420503376305, + "grad_norm": 0.43111103773117065, + "learning_rate": 5.6699982454942534e-05, + "loss": 1.7797, + "step": 15428 + }, + { + "epoch": 4.73572744014733, + "grad_norm": 0.27667397260665894, + "learning_rate": 5.669505668957882e-05, + "loss": 1.7316, + "step": 15429 + }, + { + "epoch": 4.736034376918354, + "grad_norm": 0.3045295774936676, + "learning_rate": 5.669013085805063e-05, + "loss": 1.7591, + "step": 15430 + }, + { + "epoch": 4.73634131368938, + "grad_norm": 0.4494635760784149, + "learning_rate": 5.6685204960406635e-05, + "loss": 1.8295, + "step": 15431 + }, + { + "epoch": 4.736648250460405, + "grad_norm": 0.2951449453830719, + "learning_rate": 5.6680278996695544e-05, + "loss": 1.7857, + "step": 15432 + }, + { + "epoch": 4.73695518723143, + "grad_norm": 0.2714167535305023, + "learning_rate": 5.6675352966966014e-05, + "loss": 1.816, + "step": 15433 + }, + { + "epoch": 4.737262124002456, + "grad_norm": 0.32701000571250916, + "learning_rate": 5.667042687126673e-05, + "loss": 1.7637, + "step": 15434 + }, + { + "epoch": 4.737569060773481, + "grad_norm": 0.2466556429862976, + "learning_rate": 5.666550070964638e-05, + "loss": 1.7805, + "step": 15435 + }, + { + "epoch": 
4.7378759975445055, + "grad_norm": 0.3283855617046356, + "learning_rate": 5.666057448215365e-05, + "loss": 1.786, + "step": 15436 + }, + { + "epoch": 4.738182934315531, + "grad_norm": 0.35860660672187805, + "learning_rate": 5.6655648188837205e-05, + "loss": 1.8309, + "step": 15437 + }, + { + "epoch": 4.738489871086556, + "grad_norm": 0.22293898463249207, + "learning_rate": 5.665072182974576e-05, + "loss": 1.7317, + "step": 15438 + }, + { + "epoch": 4.7387968078575815, + "grad_norm": 0.3155089020729065, + "learning_rate": 5.664579540492798e-05, + "loss": 1.7202, + "step": 15439 + }, + { + "epoch": 4.739103744628607, + "grad_norm": 0.28723904490470886, + "learning_rate": 5.6640868914432566e-05, + "loss": 1.7788, + "step": 15440 + }, + { + "epoch": 4.739410681399631, + "grad_norm": 0.2461984008550644, + "learning_rate": 5.6635942358308183e-05, + "loss": 1.8504, + "step": 15441 + }, + { + "epoch": 4.739717618170657, + "grad_norm": 0.2503122091293335, + "learning_rate": 5.663101573660351e-05, + "loss": 1.7375, + "step": 15442 + }, + { + "epoch": 4.740024554941682, + "grad_norm": 0.24925372004508972, + "learning_rate": 5.662608904936727e-05, + "loss": 1.7152, + "step": 15443 + }, + { + "epoch": 4.740331491712707, + "grad_norm": 0.2734573483467102, + "learning_rate": 5.662116229664813e-05, + "loss": 1.7476, + "step": 15444 + }, + { + "epoch": 4.740638428483733, + "grad_norm": 0.38122060894966125, + "learning_rate": 5.661623547849479e-05, + "loss": 1.7682, + "step": 15445 + }, + { + "epoch": 4.740945365254758, + "grad_norm": 0.3786417245864868, + "learning_rate": 5.661130859495593e-05, + "loss": 1.7446, + "step": 15446 + }, + { + "epoch": 4.741252302025782, + "grad_norm": 0.22618255019187927, + "learning_rate": 5.6606381646080244e-05, + "loss": 1.7427, + "step": 15447 + }, + { + "epoch": 4.741559238796808, + "grad_norm": 0.3000899851322174, + "learning_rate": 5.6601454631916405e-05, + "loss": 1.7087, + "step": 15448 + }, + { + "epoch": 4.741866175567833, + "grad_norm": 
0.36542513966560364, + "learning_rate": 5.659652755251315e-05, + "loss": 1.7985, + "step": 15449 + }, + { + "epoch": 4.742173112338858, + "grad_norm": 0.23550496995449066, + "learning_rate": 5.659160040791912e-05, + "loss": 1.8163, + "step": 15450 + }, + { + "epoch": 4.742480049109884, + "grad_norm": 0.25615251064300537, + "learning_rate": 5.658667319818305e-05, + "loss": 1.7372, + "step": 15451 + }, + { + "epoch": 4.742786985880908, + "grad_norm": 0.28744083642959595, + "learning_rate": 5.6581745923353615e-05, + "loss": 1.7193, + "step": 15452 + }, + { + "epoch": 4.7430939226519335, + "grad_norm": 0.2500229775905609, + "learning_rate": 5.65768185834795e-05, + "loss": 1.7263, + "step": 15453 + }, + { + "epoch": 4.743400859422959, + "grad_norm": 0.21520425379276276, + "learning_rate": 5.6571891178609394e-05, + "loss": 1.7337, + "step": 15454 + }, + { + "epoch": 4.743707796193984, + "grad_norm": 0.212506502866745, + "learning_rate": 5.656696370879202e-05, + "loss": 1.7672, + "step": 15455 + }, + { + "epoch": 4.7440147329650095, + "grad_norm": 0.21143417060375214, + "learning_rate": 5.656203617407607e-05, + "loss": 1.7189, + "step": 15456 + }, + { + "epoch": 4.744321669736035, + "grad_norm": 0.18320922553539276, + "learning_rate": 5.6557108574510243e-05, + "loss": 1.7521, + "step": 15457 + }, + { + "epoch": 4.744628606507059, + "grad_norm": 0.19202999770641327, + "learning_rate": 5.655218091014321e-05, + "loss": 1.6756, + "step": 15458 + }, + { + "epoch": 4.744935543278085, + "grad_norm": 0.2152331918478012, + "learning_rate": 5.654725318102367e-05, + "loss": 1.7653, + "step": 15459 + }, + { + "epoch": 4.74524248004911, + "grad_norm": 0.24565903842449188, + "learning_rate": 5.6542325387200354e-05, + "loss": 1.7654, + "step": 15460 + }, + { + "epoch": 4.745549416820135, + "grad_norm": 0.2504819333553314, + "learning_rate": 5.653739752872195e-05, + "loss": 1.7073, + "step": 15461 + }, + { + "epoch": 4.74585635359116, + "grad_norm": 0.19258706271648407, + 
"learning_rate": 5.653246960563714e-05, + "loss": 1.7106, + "step": 15462 + }, + { + "epoch": 4.746163290362185, + "grad_norm": 0.22961968183517456, + "learning_rate": 5.652754161799465e-05, + "loss": 1.7868, + "step": 15463 + }, + { + "epoch": 4.74647022713321, + "grad_norm": 0.2763231098651886, + "learning_rate": 5.652261356584315e-05, + "loss": 1.7714, + "step": 15464 + }, + { + "epoch": 4.746777163904236, + "grad_norm": 0.23866096138954163, + "learning_rate": 5.651768544923136e-05, + "loss": 1.7537, + "step": 15465 + }, + { + "epoch": 4.747084100675261, + "grad_norm": 0.21851976215839386, + "learning_rate": 5.6512757268207997e-05, + "loss": 1.8109, + "step": 15466 + }, + { + "epoch": 4.747391037446286, + "grad_norm": 0.22249393165111542, + "learning_rate": 5.6507829022821745e-05, + "loss": 1.7357, + "step": 15467 + }, + { + "epoch": 4.747697974217311, + "grad_norm": 0.20202289521694183, + "learning_rate": 5.650290071312131e-05, + "loss": 1.7867, + "step": 15468 + }, + { + "epoch": 4.748004910988336, + "grad_norm": 0.20618727803230286, + "learning_rate": 5.649797233915539e-05, + "loss": 1.6904, + "step": 15469 + }, + { + "epoch": 4.7483118477593615, + "grad_norm": 0.25609052181243896, + "learning_rate": 5.649304390097272e-05, + "loss": 1.7287, + "step": 15470 + }, + { + "epoch": 4.748618784530387, + "grad_norm": 0.22966544330120087, + "learning_rate": 5.648811539862195e-05, + "loss": 1.7384, + "step": 15471 + }, + { + "epoch": 4.748925721301412, + "grad_norm": 0.24070143699645996, + "learning_rate": 5.6483186832151856e-05, + "loss": 1.7625, + "step": 15472 + }, + { + "epoch": 4.749232658072437, + "grad_norm": 0.22642426192760468, + "learning_rate": 5.647825820161109e-05, + "loss": 1.7291, + "step": 15473 + }, + { + "epoch": 4.749539594843462, + "grad_norm": 0.23255646228790283, + "learning_rate": 5.64733295070484e-05, + "loss": 1.8076, + "step": 15474 + }, + { + "epoch": 4.749846531614487, + "grad_norm": 0.20902042090892792, + "learning_rate": 
5.646840074851246e-05, + "loss": 1.6627, + "step": 15475 + }, + { + "epoch": 4.750153468385513, + "grad_norm": 0.21608836948871613, + "learning_rate": 5.646347192605198e-05, + "loss": 1.7458, + "step": 15476 + }, + { + "epoch": 4.750460405156538, + "grad_norm": 0.22368495166301727, + "learning_rate": 5.6458543039715694e-05, + "loss": 1.7601, + "step": 15477 + }, + { + "epoch": 4.750767341927563, + "grad_norm": 0.30586308240890503, + "learning_rate": 5.645361408955231e-05, + "loss": 1.8389, + "step": 15478 + }, + { + "epoch": 4.751074278698588, + "grad_norm": 0.25122150778770447, + "learning_rate": 5.644868507561052e-05, + "loss": 1.7509, + "step": 15479 + }, + { + "epoch": 4.751381215469613, + "grad_norm": 0.28435763716697693, + "learning_rate": 5.644375599793904e-05, + "loss": 1.7723, + "step": 15480 + }, + { + "epoch": 4.7516881522406385, + "grad_norm": 0.3111409842967987, + "learning_rate": 5.643882685658659e-05, + "loss": 1.7973, + "step": 15481 + }, + { + "epoch": 4.751995089011664, + "grad_norm": 0.3108380138874054, + "learning_rate": 5.6433897651601874e-05, + "loss": 1.8126, + "step": 15482 + }, + { + "epoch": 4.752302025782689, + "grad_norm": 0.25894731283187866, + "learning_rate": 5.642896838303362e-05, + "loss": 1.7849, + "step": 15483 + }, + { + "epoch": 4.752608962553714, + "grad_norm": 0.39321839809417725, + "learning_rate": 5.642403905093052e-05, + "loss": 1.7583, + "step": 15484 + }, + { + "epoch": 4.752915899324739, + "grad_norm": 0.3206121027469635, + "learning_rate": 5.6419109655341315e-05, + "loss": 1.8061, + "step": 15485 + }, + { + "epoch": 4.753222836095764, + "grad_norm": 0.2817624807357788, + "learning_rate": 5.64141801963147e-05, + "loss": 1.8252, + "step": 15486 + }, + { + "epoch": 4.75352977286679, + "grad_norm": 0.3344736397266388, + "learning_rate": 5.6409250673899405e-05, + "loss": 1.6975, + "step": 15487 + }, + { + "epoch": 4.753836709637815, + "grad_norm": 0.21873882412910461, + "learning_rate": 5.640432108814413e-05, + "loss": 
1.7126, + "step": 15488 + }, + { + "epoch": 4.75414364640884, + "grad_norm": 0.3317199945449829, + "learning_rate": 5.639939143909758e-05, + "loss": 1.7826, + "step": 15489 + }, + { + "epoch": 4.754450583179865, + "grad_norm": 0.34901630878448486, + "learning_rate": 5.639446172680854e-05, + "loss": 1.7411, + "step": 15490 + }, + { + "epoch": 4.75475751995089, + "grad_norm": 0.24015867710113525, + "learning_rate": 5.6389531951325645e-05, + "loss": 1.7514, + "step": 15491 + }, + { + "epoch": 4.755064456721915, + "grad_norm": 0.28364554047584534, + "learning_rate": 5.6384602112697674e-05, + "loss": 1.7569, + "step": 15492 + }, + { + "epoch": 4.755371393492941, + "grad_norm": 0.3561246693134308, + "learning_rate": 5.637967221097329e-05, + "loss": 1.7212, + "step": 15493 + }, + { + "epoch": 4.755678330263965, + "grad_norm": 0.3383684456348419, + "learning_rate": 5.637474224620126e-05, + "loss": 1.6866, + "step": 15494 + }, + { + "epoch": 4.7559852670349905, + "grad_norm": 0.2399235963821411, + "learning_rate": 5.63698122184303e-05, + "loss": 1.7609, + "step": 15495 + }, + { + "epoch": 4.756292203806016, + "grad_norm": 0.38559645414352417, + "learning_rate": 5.636488212770912e-05, + "loss": 1.7509, + "step": 15496 + }, + { + "epoch": 4.756599140577041, + "grad_norm": 0.365005224943161, + "learning_rate": 5.635995197408645e-05, + "loss": 1.7894, + "step": 15497 + }, + { + "epoch": 4.7569060773480665, + "grad_norm": 0.21254757046699524, + "learning_rate": 5.635502175761099e-05, + "loss": 1.6969, + "step": 15498 + }, + { + "epoch": 4.757213014119092, + "grad_norm": 0.42865821719169617, + "learning_rate": 5.635009147833149e-05, + "loss": 1.7989, + "step": 15499 + }, + { + "epoch": 4.757519950890116, + "grad_norm": 0.35717228055000305, + "learning_rate": 5.634516113629665e-05, + "loss": 1.7338, + "step": 15500 + }, + { + "epoch": 4.757826887661142, + "grad_norm": 0.21582463383674622, + "learning_rate": 5.634023073155523e-05, + "loss": 1.7429, + "step": 15501 + }, + { + 
"epoch": 4.758133824432167, + "grad_norm": 0.3376842141151428, + "learning_rate": 5.633530026415592e-05, + "loss": 1.7703, + "step": 15502 + }, + { + "epoch": 4.758440761203192, + "grad_norm": 0.2760981023311615, + "learning_rate": 5.633036973414747e-05, + "loss": 1.7389, + "step": 15503 + }, + { + "epoch": 4.758747697974218, + "grad_norm": 0.3808997571468353, + "learning_rate": 5.63254391415786e-05, + "loss": 1.7513, + "step": 15504 + }, + { + "epoch": 4.759054634745242, + "grad_norm": 0.5152496695518494, + "learning_rate": 5.6320508486498014e-05, + "loss": 1.7376, + "step": 15505 + }, + { + "epoch": 4.759361571516267, + "grad_norm": 0.33983346819877625, + "learning_rate": 5.6315577768954464e-05, + "loss": 1.7209, + "step": 15506 + }, + { + "epoch": 4.759668508287293, + "grad_norm": 0.27064043283462524, + "learning_rate": 5.631064698899669e-05, + "loss": 1.7808, + "step": 15507 + }, + { + "epoch": 4.759975445058318, + "grad_norm": 0.3659237027168274, + "learning_rate": 5.630571614667339e-05, + "loss": 1.7706, + "step": 15508 + }, + { + "epoch": 4.760282381829343, + "grad_norm": 0.246379554271698, + "learning_rate": 5.63007852420333e-05, + "loss": 1.7425, + "step": 15509 + }, + { + "epoch": 4.760589318600369, + "grad_norm": 0.2683795392513275, + "learning_rate": 5.629585427512518e-05, + "loss": 1.7332, + "step": 15510 + }, + { + "epoch": 4.760896255371393, + "grad_norm": 0.32626205682754517, + "learning_rate": 5.6290923245997704e-05, + "loss": 1.786, + "step": 15511 + }, + { + "epoch": 4.7612031921424185, + "grad_norm": 0.23723098635673523, + "learning_rate": 5.6285992154699666e-05, + "loss": 1.7305, + "step": 15512 + }, + { + "epoch": 4.761510128913444, + "grad_norm": 0.26316091418266296, + "learning_rate": 5.628106100127976e-05, + "loss": 1.7804, + "step": 15513 + }, + { + "epoch": 4.761817065684469, + "grad_norm": 0.24376356601715088, + "learning_rate": 5.6276129785786726e-05, + "loss": 1.738, + "step": 15514 + }, + { + "epoch": 4.7621240024554945, + 
"grad_norm": 0.27778422832489014, + "learning_rate": 5.627119850826931e-05, + "loss": 1.7444, + "step": 15515 + }, + { + "epoch": 4.762430939226519, + "grad_norm": 0.3134306073188782, + "learning_rate": 5.6266267168776224e-05, + "loss": 1.7696, + "step": 15516 + }, + { + "epoch": 4.762737875997544, + "grad_norm": 0.2354283481836319, + "learning_rate": 5.6261335767356195e-05, + "loss": 1.799, + "step": 15517 + }, + { + "epoch": 4.76304481276857, + "grad_norm": 0.26902756094932556, + "learning_rate": 5.6256404304058e-05, + "loss": 1.7091, + "step": 15518 + }, + { + "epoch": 4.763351749539595, + "grad_norm": 0.2760716676712036, + "learning_rate": 5.6251472778930345e-05, + "loss": 1.742, + "step": 15519 + }, + { + "epoch": 4.76365868631062, + "grad_norm": 0.2138829231262207, + "learning_rate": 5.624654119202197e-05, + "loss": 1.7093, + "step": 15520 + }, + { + "epoch": 4.763965623081646, + "grad_norm": 0.31404614448547363, + "learning_rate": 5.624160954338162e-05, + "loss": 1.7467, + "step": 15521 + }, + { + "epoch": 4.76427255985267, + "grad_norm": 0.24810083210468292, + "learning_rate": 5.623667783305803e-05, + "loss": 1.745, + "step": 15522 + }, + { + "epoch": 4.764579496623695, + "grad_norm": 0.23674242198467255, + "learning_rate": 5.6231746061099913e-05, + "loss": 1.7662, + "step": 15523 + }, + { + "epoch": 4.764886433394721, + "grad_norm": 0.264230877161026, + "learning_rate": 5.622681422755606e-05, + "loss": 1.7627, + "step": 15524 + }, + { + "epoch": 4.765193370165746, + "grad_norm": 0.2982041537761688, + "learning_rate": 5.6221882332475165e-05, + "loss": 1.7558, + "step": 15525 + }, + { + "epoch": 4.765500306936771, + "grad_norm": 0.29215967655181885, + "learning_rate": 5.6216950375905975e-05, + "loss": 1.7981, + "step": 15526 + }, + { + "epoch": 4.765807243707796, + "grad_norm": 0.20014487206935883, + "learning_rate": 5.6212018357897244e-05, + "loss": 1.7113, + "step": 15527 + }, + { + "epoch": 4.766114180478821, + "grad_norm": 0.22359825670719147, + 
"learning_rate": 5.620708627849769e-05, + "loss": 1.7356, + "step": 15528 + }, + { + "epoch": 4.7664211172498465, + "grad_norm": 0.2254783809185028, + "learning_rate": 5.620215413775609e-05, + "loss": 1.7397, + "step": 15529 + }, + { + "epoch": 4.766728054020872, + "grad_norm": 0.2827560305595398, + "learning_rate": 5.619722193572117e-05, + "loss": 1.732, + "step": 15530 + }, + { + "epoch": 4.767034990791897, + "grad_norm": 0.22591307759284973, + "learning_rate": 5.619228967244165e-05, + "loss": 1.7713, + "step": 15531 + }, + { + "epoch": 4.7673419275629225, + "grad_norm": 0.25872737169265747, + "learning_rate": 5.618735734796632e-05, + "loss": 1.7291, + "step": 15532 + }, + { + "epoch": 4.767648864333947, + "grad_norm": 0.24515275657176971, + "learning_rate": 5.6182424962343884e-05, + "loss": 1.8079, + "step": 15533 + }, + { + "epoch": 4.767955801104972, + "grad_norm": 0.2456643134355545, + "learning_rate": 5.617749251562309e-05, + "loss": 1.7082, + "step": 15534 + }, + { + "epoch": 4.768262737875998, + "grad_norm": 0.21684220433235168, + "learning_rate": 5.6172560007852716e-05, + "loss": 1.7563, + "step": 15535 + }, + { + "epoch": 4.768569674647023, + "grad_norm": 0.2141445428133011, + "learning_rate": 5.616762743908147e-05, + "loss": 1.7115, + "step": 15536 + }, + { + "epoch": 4.768876611418047, + "grad_norm": 0.22502638399600983, + "learning_rate": 5.616269480935812e-05, + "loss": 1.723, + "step": 15537 + }, + { + "epoch": 4.769183548189073, + "grad_norm": 0.23387989401817322, + "learning_rate": 5.6157762118731416e-05, + "loss": 1.7775, + "step": 15538 + }, + { + "epoch": 4.769490484960098, + "grad_norm": 0.19615057110786438, + "learning_rate": 5.6152829367250096e-05, + "loss": 1.7696, + "step": 15539 + }, + { + "epoch": 4.769797421731123, + "grad_norm": 0.2408154010772705, + "learning_rate": 5.614789655496289e-05, + "loss": 1.7758, + "step": 15540 + }, + { + "epoch": 4.770104358502149, + "grad_norm": 0.20994634926319122, + "learning_rate": 
5.614296368191859e-05, + "loss": 1.6935, + "step": 15541 + }, + { + "epoch": 4.770411295273174, + "grad_norm": 0.24135129153728485, + "learning_rate": 5.613803074816591e-05, + "loss": 1.7644, + "step": 15542 + }, + { + "epoch": 4.7707182320441985, + "grad_norm": 0.2380143105983734, + "learning_rate": 5.6133097753753625e-05, + "loss": 1.741, + "step": 15543 + }, + { + "epoch": 4.771025168815224, + "grad_norm": 0.30300623178482056, + "learning_rate": 5.6128164698730465e-05, + "loss": 1.7935, + "step": 15544 + }, + { + "epoch": 4.771332105586249, + "grad_norm": 0.2620760500431061, + "learning_rate": 5.612323158314519e-05, + "loss": 1.7436, + "step": 15545 + }, + { + "epoch": 4.7716390423572745, + "grad_norm": 0.3791491389274597, + "learning_rate": 5.6118298407046544e-05, + "loss": 1.7503, + "step": 15546 + }, + { + "epoch": 4.7719459791283, + "grad_norm": 0.3830909729003906, + "learning_rate": 5.61133651704833e-05, + "loss": 1.7651, + "step": 15547 + }, + { + "epoch": 4.772252915899324, + "grad_norm": 0.26680612564086914, + "learning_rate": 5.610843187350419e-05, + "loss": 1.8075, + "step": 15548 + }, + { + "epoch": 4.77255985267035, + "grad_norm": 0.38018953800201416, + "learning_rate": 5.610349851615798e-05, + "loss": 1.8301, + "step": 15549 + }, + { + "epoch": 4.772866789441375, + "grad_norm": 0.4514484107494354, + "learning_rate": 5.6098565098493414e-05, + "loss": 1.7709, + "step": 15550 + }, + { + "epoch": 4.7731737262124, + "grad_norm": 0.28267863392829895, + "learning_rate": 5.6093631620559254e-05, + "loss": 1.8087, + "step": 15551 + }, + { + "epoch": 4.773480662983426, + "grad_norm": 0.22541162371635437, + "learning_rate": 5.6088698082404256e-05, + "loss": 1.7457, + "step": 15552 + }, + { + "epoch": 4.773787599754451, + "grad_norm": 0.3012544512748718, + "learning_rate": 5.608376448407718e-05, + "loss": 1.7454, + "step": 15553 + }, + { + "epoch": 4.774094536525475, + "grad_norm": 0.2460169941186905, + "learning_rate": 5.607883082562677e-05, + "loss": 1.8237, + 
"step": 15554 + }, + { + "epoch": 4.774401473296501, + "grad_norm": 0.2918507158756256, + "learning_rate": 5.6073897107101804e-05, + "loss": 1.7416, + "step": 15555 + }, + { + "epoch": 4.774708410067526, + "grad_norm": 0.3104710280895233, + "learning_rate": 5.6068963328551016e-05, + "loss": 1.8162, + "step": 15556 + }, + { + "epoch": 4.7750153468385514, + "grad_norm": 0.2576459050178528, + "learning_rate": 5.606402949002317e-05, + "loss": 1.7732, + "step": 15557 + }, + { + "epoch": 4.775322283609577, + "grad_norm": 0.2373739629983902, + "learning_rate": 5.605909559156706e-05, + "loss": 1.7812, + "step": 15558 + }, + { + "epoch": 4.775629220380601, + "grad_norm": 0.30436694622039795, + "learning_rate": 5.6054161633231385e-05, + "loss": 1.7606, + "step": 15559 + }, + { + "epoch": 4.775936157151627, + "grad_norm": 0.3058558702468872, + "learning_rate": 5.604922761506495e-05, + "loss": 1.8384, + "step": 15560 + }, + { + "epoch": 4.776243093922652, + "grad_norm": 0.26421624422073364, + "learning_rate": 5.6044293537116496e-05, + "loss": 1.8041, + "step": 15561 + }, + { + "epoch": 4.776550030693677, + "grad_norm": 0.4945085346698761, + "learning_rate": 5.603935939943479e-05, + "loss": 1.7522, + "step": 15562 + }, + { + "epoch": 4.776856967464703, + "grad_norm": 0.41049134731292725, + "learning_rate": 5.6034425202068595e-05, + "loss": 1.7471, + "step": 15563 + }, + { + "epoch": 4.777163904235728, + "grad_norm": 0.22972853481769562, + "learning_rate": 5.602949094506668e-05, + "loss": 1.7041, + "step": 15564 + }, + { + "epoch": 4.777470841006752, + "grad_norm": 0.37373700737953186, + "learning_rate": 5.6024556628477785e-05, + "loss": 1.7811, + "step": 15565 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.3603375554084778, + "learning_rate": 5.6019622252350714e-05, + "loss": 1.8396, + "step": 15566 + }, + { + "epoch": 4.778084714548803, + "grad_norm": 0.2085956335067749, + "learning_rate": 5.601468781673419e-05, + "loss": 1.7453, + "step": 15567 + }, + { + "epoch": 
4.778391651319828, + "grad_norm": 0.28871405124664307, + "learning_rate": 5.6009753321677e-05, + "loss": 1.7135, + "step": 15568 + }, + { + "epoch": 4.778698588090853, + "grad_norm": 0.2378411591053009, + "learning_rate": 5.600481876722791e-05, + "loss": 1.77, + "step": 15569 + }, + { + "epoch": 4.779005524861878, + "grad_norm": 0.2902696430683136, + "learning_rate": 5.599988415343567e-05, + "loss": 1.7416, + "step": 15570 + }, + { + "epoch": 4.7793124616329035, + "grad_norm": 0.36155447363853455, + "learning_rate": 5.5994949480349066e-05, + "loss": 1.7095, + "step": 15571 + }, + { + "epoch": 4.779619398403929, + "grad_norm": 0.24867403507232666, + "learning_rate": 5.599001474801686e-05, + "loss": 1.8063, + "step": 15572 + }, + { + "epoch": 4.779926335174954, + "grad_norm": 0.24853186309337616, + "learning_rate": 5.5985079956487815e-05, + "loss": 1.7537, + "step": 15573 + }, + { + "epoch": 4.7802332719459795, + "grad_norm": 0.31984636187553406, + "learning_rate": 5.598014510581071e-05, + "loss": 1.7888, + "step": 15574 + }, + { + "epoch": 4.780540208717004, + "grad_norm": 0.23907123506069183, + "learning_rate": 5.597521019603429e-05, + "loss": 1.7157, + "step": 15575 + }, + { + "epoch": 4.780847145488029, + "grad_norm": 0.25759413838386536, + "learning_rate": 5.597027522720736e-05, + "loss": 1.7579, + "step": 15576 + }, + { + "epoch": 4.781154082259055, + "grad_norm": 0.34123921394348145, + "learning_rate": 5.5965340199378654e-05, + "loss": 1.838, + "step": 15577 + }, + { + "epoch": 4.78146101903008, + "grad_norm": 0.2769980728626251, + "learning_rate": 5.596040511259697e-05, + "loss": 1.7889, + "step": 15578 + }, + { + "epoch": 4.781767955801105, + "grad_norm": 0.21936915814876556, + "learning_rate": 5.5955469966911066e-05, + "loss": 1.7434, + "step": 15579 + }, + { + "epoch": 4.78207489257213, + "grad_norm": 0.27583181858062744, + "learning_rate": 5.59505347623697e-05, + "loss": 1.7229, + "step": 15580 + }, + { + "epoch": 4.782381829343155, + "grad_norm": 
0.24246171116828918, + "learning_rate": 5.594559949902168e-05, + "loss": 1.7368, + "step": 15581 + }, + { + "epoch": 4.78268876611418, + "grad_norm": 0.22705630958080292, + "learning_rate": 5.594066417691576e-05, + "loss": 1.7261, + "step": 15582 + }, + { + "epoch": 4.782995702885206, + "grad_norm": 0.23308728635311127, + "learning_rate": 5.593572879610072e-05, + "loss": 1.7451, + "step": 15583 + }, + { + "epoch": 4.783302639656231, + "grad_norm": 0.21654267609119415, + "learning_rate": 5.5930793356625324e-05, + "loss": 1.7133, + "step": 15584 + }, + { + "epoch": 4.783609576427256, + "grad_norm": 0.22884133458137512, + "learning_rate": 5.5925857858538347e-05, + "loss": 1.6899, + "step": 15585 + }, + { + "epoch": 4.783916513198281, + "grad_norm": 0.2396838665008545, + "learning_rate": 5.5920922301888555e-05, + "loss": 1.7837, + "step": 15586 + }, + { + "epoch": 4.784223449969306, + "grad_norm": 0.22941450774669647, + "learning_rate": 5.5915986686724765e-05, + "loss": 1.7443, + "step": 15587 + }, + { + "epoch": 4.7845303867403315, + "grad_norm": 0.23992502689361572, + "learning_rate": 5.591105101309572e-05, + "loss": 1.8054, + "step": 15588 + }, + { + "epoch": 4.784837323511357, + "grad_norm": 0.2540588974952698, + "learning_rate": 5.59061152810502e-05, + "loss": 1.855, + "step": 15589 + }, + { + "epoch": 4.785144260282382, + "grad_norm": 0.22691720724105835, + "learning_rate": 5.590117949063699e-05, + "loss": 1.7441, + "step": 15590 + }, + { + "epoch": 4.785451197053407, + "grad_norm": 0.23691289126873016, + "learning_rate": 5.5896243641904864e-05, + "loss": 1.8156, + "step": 15591 + }, + { + "epoch": 4.785758133824432, + "grad_norm": 0.2749332785606384, + "learning_rate": 5.589130773490261e-05, + "loss": 1.8157, + "step": 15592 + }, + { + "epoch": 4.786065070595457, + "grad_norm": 0.2435624748468399, + "learning_rate": 5.588637176967899e-05, + "loss": 1.7473, + "step": 15593 + }, + { + "epoch": 4.786372007366483, + "grad_norm": 0.22931383550167084, + 
"learning_rate": 5.5881435746282795e-05, + "loss": 1.7652, + "step": 15594 + }, + { + "epoch": 4.786678944137508, + "grad_norm": 0.23916593194007874, + "learning_rate": 5.587649966476282e-05, + "loss": 1.7415, + "step": 15595 + }, + { + "epoch": 4.786985880908533, + "grad_norm": 0.23483172059059143, + "learning_rate": 5.5871563525167814e-05, + "loss": 1.7308, + "step": 15596 + }, + { + "epoch": 4.787292817679558, + "grad_norm": 0.24850021302700043, + "learning_rate": 5.586662732754656e-05, + "loss": 1.8294, + "step": 15597 + }, + { + "epoch": 4.787599754450583, + "grad_norm": 0.2439260333776474, + "learning_rate": 5.586169107194788e-05, + "loss": 1.7599, + "step": 15598 + }, + { + "epoch": 4.787906691221608, + "grad_norm": 0.22379007935523987, + "learning_rate": 5.585675475842054e-05, + "loss": 1.7278, + "step": 15599 + }, + { + "epoch": 4.788213627992634, + "grad_norm": 0.2633908689022064, + "learning_rate": 5.58518183870133e-05, + "loss": 1.7318, + "step": 15600 + }, + { + "epoch": 4.788520564763659, + "grad_norm": 0.20992474257946014, + "learning_rate": 5.584688195777497e-05, + "loss": 1.7003, + "step": 15601 + }, + { + "epoch": 4.7888275015346835, + "grad_norm": 0.2460084706544876, + "learning_rate": 5.584194547075432e-05, + "loss": 1.78, + "step": 15602 + }, + { + "epoch": 4.789134438305709, + "grad_norm": 0.23955418169498444, + "learning_rate": 5.583700892600013e-05, + "loss": 1.7953, + "step": 15603 + }, + { + "epoch": 4.789441375076734, + "grad_norm": 0.2495713233947754, + "learning_rate": 5.583207232356121e-05, + "loss": 1.7874, + "step": 15604 + }, + { + "epoch": 4.7897483118477595, + "grad_norm": 0.22878028452396393, + "learning_rate": 5.5827135663486344e-05, + "loss": 1.7961, + "step": 15605 + }, + { + "epoch": 4.790055248618785, + "grad_norm": 0.2299363762140274, + "learning_rate": 5.582219894582429e-05, + "loss": 1.7497, + "step": 15606 + }, + { + "epoch": 4.79036218538981, + "grad_norm": 0.22896108031272888, + "learning_rate": 5.5817262170623865e-05, 
+ "loss": 1.7543, + "step": 15607 + }, + { + "epoch": 4.790669122160835, + "grad_norm": 0.2150495946407318, + "learning_rate": 5.581232533793383e-05, + "loss": 1.8034, + "step": 15608 + }, + { + "epoch": 4.79097605893186, + "grad_norm": 0.21317999064922333, + "learning_rate": 5.580738844780301e-05, + "loss": 1.7482, + "step": 15609 + }, + { + "epoch": 4.791282995702885, + "grad_norm": 0.21904391050338745, + "learning_rate": 5.580245150028016e-05, + "loss": 1.7647, + "step": 15610 + }, + { + "epoch": 4.791589932473911, + "grad_norm": 0.2026481032371521, + "learning_rate": 5.5797514495414095e-05, + "loss": 1.6997, + "step": 15611 + }, + { + "epoch": 4.791896869244935, + "grad_norm": 0.22508487105369568, + "learning_rate": 5.579257743325359e-05, + "loss": 1.8258, + "step": 15612 + }, + { + "epoch": 4.79220380601596, + "grad_norm": 0.2801211178302765, + "learning_rate": 5.5787640313847435e-05, + "loss": 1.6991, + "step": 15613 + }, + { + "epoch": 4.792510742786986, + "grad_norm": 0.2696724236011505, + "learning_rate": 5.578270313724442e-05, + "loss": 1.7339, + "step": 15614 + }, + { + "epoch": 4.792817679558011, + "grad_norm": 0.2909143269062042, + "learning_rate": 5.577776590349334e-05, + "loss": 1.8481, + "step": 15615 + }, + { + "epoch": 4.793124616329036, + "grad_norm": 0.21682757139205933, + "learning_rate": 5.5772828612643005e-05, + "loss": 1.759, + "step": 15616 + }, + { + "epoch": 4.793431553100062, + "grad_norm": 0.23074059188365936, + "learning_rate": 5.576789126474219e-05, + "loss": 1.7652, + "step": 15617 + }, + { + "epoch": 4.793738489871086, + "grad_norm": 0.24018999934196472, + "learning_rate": 5.576295385983969e-05, + "loss": 1.7986, + "step": 15618 + }, + { + "epoch": 4.7940454266421115, + "grad_norm": 0.23987948894500732, + "learning_rate": 5.575801639798431e-05, + "loss": 1.779, + "step": 15619 + }, + { + "epoch": 4.794352363413137, + "grad_norm": 0.2138533890247345, + "learning_rate": 5.575307887922482e-05, + "loss": 1.7097, + "step": 15620 + }, + { 
+ "epoch": 4.794659300184162, + "grad_norm": 0.1995106190443039, + "learning_rate": 5.5748141303610044e-05, + "loss": 1.6924, + "step": 15621 + }, + { + "epoch": 4.7949662369551875, + "grad_norm": 0.23547641932964325, + "learning_rate": 5.574320367118877e-05, + "loss": 1.8492, + "step": 15622 + }, + { + "epoch": 4.795273173726212, + "grad_norm": 0.22931239008903503, + "learning_rate": 5.5738265982009794e-05, + "loss": 1.8054, + "step": 15623 + }, + { + "epoch": 4.795580110497237, + "grad_norm": 0.19957222044467926, + "learning_rate": 5.573332823612191e-05, + "loss": 1.7464, + "step": 15624 + }, + { + "epoch": 4.795887047268263, + "grad_norm": 0.1990327090024948, + "learning_rate": 5.5728390433573905e-05, + "loss": 1.7438, + "step": 15625 + }, + { + "epoch": 4.796193984039288, + "grad_norm": 0.22276802361011505, + "learning_rate": 5.572345257441459e-05, + "loss": 1.7674, + "step": 15626 + }, + { + "epoch": 4.796500920810313, + "grad_norm": 0.2109617441892624, + "learning_rate": 5.571851465869277e-05, + "loss": 1.7577, + "step": 15627 + }, + { + "epoch": 4.796807857581339, + "grad_norm": 0.22917217016220093, + "learning_rate": 5.5713576686457234e-05, + "loss": 1.7478, + "step": 15628 + }, + { + "epoch": 4.797114794352363, + "grad_norm": 0.21016938984394073, + "learning_rate": 5.570863865775678e-05, + "loss": 1.8078, + "step": 15629 + }, + { + "epoch": 4.797421731123388, + "grad_norm": 0.22478216886520386, + "learning_rate": 5.5703700572640215e-05, + "loss": 1.7621, + "step": 15630 + }, + { + "epoch": 4.797728667894414, + "grad_norm": 0.26899904012680054, + "learning_rate": 5.569876243115634e-05, + "loss": 1.8065, + "step": 15631 + }, + { + "epoch": 4.798035604665439, + "grad_norm": 0.23187808692455292, + "learning_rate": 5.569382423335394e-05, + "loss": 1.7337, + "step": 15632 + }, + { + "epoch": 4.798342541436464, + "grad_norm": 0.2264855057001114, + "learning_rate": 5.568888597928185e-05, + "loss": 1.7879, + "step": 15633 + }, + { + "epoch": 4.798649478207489, + 
"grad_norm": 0.244137242436409, + "learning_rate": 5.568394766898886e-05, + "loss": 1.8307, + "step": 15634 + }, + { + "epoch": 4.798956414978514, + "grad_norm": 0.2400583177804947, + "learning_rate": 5.5679009302523744e-05, + "loss": 1.76, + "step": 15635 + }, + { + "epoch": 4.7992633517495396, + "grad_norm": 0.2324059158563614, + "learning_rate": 5.5674070879935347e-05, + "loss": 1.7594, + "step": 15636 + }, + { + "epoch": 4.799570288520565, + "grad_norm": 0.21753786504268646, + "learning_rate": 5.566913240127244e-05, + "loss": 1.7568, + "step": 15637 + }, + { + "epoch": 4.79987722529159, + "grad_norm": 0.21557624638080597, + "learning_rate": 5.566419386658386e-05, + "loss": 1.7733, + "step": 15638 + }, + { + "epoch": 4.800184162062616, + "grad_norm": 0.22795113921165466, + "learning_rate": 5.565925527591839e-05, + "loss": 1.7624, + "step": 15639 + }, + { + "epoch": 4.80049109883364, + "grad_norm": 0.23035180568695068, + "learning_rate": 5.565431662932484e-05, + "loss": 1.7436, + "step": 15640 + }, + { + "epoch": 4.800798035604665, + "grad_norm": 0.2569425404071808, + "learning_rate": 5.564937792685203e-05, + "loss": 1.7027, + "step": 15641 + }, + { + "epoch": 4.801104972375691, + "grad_norm": 0.20544980466365814, + "learning_rate": 5.564443916854875e-05, + "loss": 1.7125, + "step": 15642 + }, + { + "epoch": 4.801411909146716, + "grad_norm": 0.25040850043296814, + "learning_rate": 5.5639500354463815e-05, + "loss": 1.7646, + "step": 15643 + }, + { + "epoch": 4.8017188459177405, + "grad_norm": 0.1991344839334488, + "learning_rate": 5.563456148464602e-05, + "loss": 1.7206, + "step": 15644 + }, + { + "epoch": 4.802025782688766, + "grad_norm": 0.236537903547287, + "learning_rate": 5.56296225591442e-05, + "loss": 1.7288, + "step": 15645 + }, + { + "epoch": 4.802332719459791, + "grad_norm": 0.253619521856308, + "learning_rate": 5.562468357800714e-05, + "loss": 1.7347, + "step": 15646 + }, + { + "epoch": 4.8026396562308165, + "grad_norm": 0.22038741409778595, + 
"learning_rate": 5.561974454128367e-05, + "loss": 1.7854, + "step": 15647 + }, + { + "epoch": 4.802946593001842, + "grad_norm": 0.24848157167434692, + "learning_rate": 5.5614805449022576e-05, + "loss": 1.6904, + "step": 15648 + }, + { + "epoch": 4.803253529772867, + "grad_norm": 0.28735271096229553, + "learning_rate": 5.56098663012727e-05, + "loss": 1.7476, + "step": 15649 + }, + { + "epoch": 4.803560466543892, + "grad_norm": 0.2658432722091675, + "learning_rate": 5.5604927098082825e-05, + "loss": 1.7314, + "step": 15650 + }, + { + "epoch": 4.803867403314917, + "grad_norm": 0.20409154891967773, + "learning_rate": 5.559998783950179e-05, + "loss": 1.7698, + "step": 15651 + }, + { + "epoch": 4.804174340085942, + "grad_norm": 0.21932728588581085, + "learning_rate": 5.5595048525578384e-05, + "loss": 1.7808, + "step": 15652 + }, + { + "epoch": 4.804481276856968, + "grad_norm": 0.2549879848957062, + "learning_rate": 5.559010915636143e-05, + "loss": 1.8294, + "step": 15653 + }, + { + "epoch": 4.804788213627993, + "grad_norm": 0.2002289742231369, + "learning_rate": 5.5585169731899736e-05, + "loss": 1.732, + "step": 15654 + }, + { + "epoch": 4.805095150399017, + "grad_norm": 0.19988931715488434, + "learning_rate": 5.558023025224212e-05, + "loss": 1.7482, + "step": 15655 + }, + { + "epoch": 4.805402087170043, + "grad_norm": 0.21265259385108948, + "learning_rate": 5.55752907174374e-05, + "loss": 1.8003, + "step": 15656 + }, + { + "epoch": 4.805709023941068, + "grad_norm": 0.22365640103816986, + "learning_rate": 5.5570351127534395e-05, + "loss": 1.7536, + "step": 15657 + }, + { + "epoch": 4.806015960712093, + "grad_norm": 0.25516408681869507, + "learning_rate": 5.556541148258192e-05, + "loss": 1.7648, + "step": 15658 + }, + { + "epoch": 4.806322897483119, + "grad_norm": 0.24870765209197998, + "learning_rate": 5.5560471782628775e-05, + "loss": 1.7793, + "step": 15659 + }, + { + "epoch": 4.806629834254144, + "grad_norm": 0.22119416296482086, + "learning_rate": 
5.555553202772379e-05, + "loss": 1.7464, + "step": 15660 + }, + { + "epoch": 4.8069367710251685, + "grad_norm": 0.2781904637813568, + "learning_rate": 5.555059221791579e-05, + "loss": 1.7537, + "step": 15661 + }, + { + "epoch": 4.807243707796194, + "grad_norm": 0.2433774471282959, + "learning_rate": 5.5545652353253574e-05, + "loss": 1.74, + "step": 15662 + }, + { + "epoch": 4.807550644567219, + "grad_norm": 0.19932180643081665, + "learning_rate": 5.554071243378598e-05, + "loss": 1.75, + "step": 15663 + }, + { + "epoch": 4.8078575813382445, + "grad_norm": 0.2428865283727646, + "learning_rate": 5.553577245956182e-05, + "loss": 1.7198, + "step": 15664 + }, + { + "epoch": 4.80816451810927, + "grad_norm": 0.2914198338985443, + "learning_rate": 5.553083243062991e-05, + "loss": 1.7544, + "step": 15665 + }, + { + "epoch": 4.808471454880294, + "grad_norm": 0.2274291068315506, + "learning_rate": 5.5525892347039056e-05, + "loss": 1.8213, + "step": 15666 + }, + { + "epoch": 4.80877839165132, + "grad_norm": 0.23662471771240234, + "learning_rate": 5.552095220883811e-05, + "loss": 1.8025, + "step": 15667 + }, + { + "epoch": 4.809085328422345, + "grad_norm": 0.23062555491924286, + "learning_rate": 5.551601201607587e-05, + "loss": 1.7109, + "step": 15668 + }, + { + "epoch": 4.80939226519337, + "grad_norm": 0.19986943900585175, + "learning_rate": 5.551107176880117e-05, + "loss": 1.7442, + "step": 15669 + }, + { + "epoch": 4.809699201964396, + "grad_norm": 0.2545560300350189, + "learning_rate": 5.5506131467062836e-05, + "loss": 1.7609, + "step": 15670 + }, + { + "epoch": 4.810006138735421, + "grad_norm": 0.253296434879303, + "learning_rate": 5.550119111090968e-05, + "loss": 1.7307, + "step": 15671 + }, + { + "epoch": 4.810313075506445, + "grad_norm": 0.19617940485477448, + "learning_rate": 5.549625070039052e-05, + "loss": 1.7507, + "step": 15672 + }, + { + "epoch": 4.810620012277471, + "grad_norm": 0.2525297999382019, + "learning_rate": 5.5491310235554193e-05, + "loss": 1.8021, + 
"step": 15673 + }, + { + "epoch": 4.810926949048496, + "grad_norm": 0.20537389814853668, + "learning_rate": 5.548636971644953e-05, + "loss": 1.7432, + "step": 15674 + }, + { + "epoch": 4.811233885819521, + "grad_norm": 0.19924211502075195, + "learning_rate": 5.548142914312533e-05, + "loss": 1.7741, + "step": 15675 + }, + { + "epoch": 4.811540822590547, + "grad_norm": 0.21121448278427124, + "learning_rate": 5.547648851563046e-05, + "loss": 1.7198, + "step": 15676 + }, + { + "epoch": 4.811847759361571, + "grad_norm": 0.23504914343357086, + "learning_rate": 5.547154783401369e-05, + "loss": 1.7173, + "step": 15677 + }, + { + "epoch": 4.8121546961325965, + "grad_norm": 0.2362392097711563, + "learning_rate": 5.54666070983239e-05, + "loss": 1.7752, + "step": 15678 + }, + { + "epoch": 4.812461632903622, + "grad_norm": 0.2524966895580292, + "learning_rate": 5.5461666308609886e-05, + "loss": 1.7943, + "step": 15679 + }, + { + "epoch": 4.812768569674647, + "grad_norm": 0.2250952422618866, + "learning_rate": 5.5456725464920476e-05, + "loss": 1.7606, + "step": 15680 + }, + { + "epoch": 4.8130755064456725, + "grad_norm": 0.21753156185150146, + "learning_rate": 5.5451784567304524e-05, + "loss": 1.7846, + "step": 15681 + }, + { + "epoch": 4.813382443216698, + "grad_norm": 0.220795676112175, + "learning_rate": 5.5446843615810825e-05, + "loss": 1.7422, + "step": 15682 + }, + { + "epoch": 4.813689379987722, + "grad_norm": 0.23597733676433563, + "learning_rate": 5.544190261048823e-05, + "loss": 1.7818, + "step": 15683 + }, + { + "epoch": 4.813996316758748, + "grad_norm": 0.2625976502895355, + "learning_rate": 5.543696155138557e-05, + "loss": 1.7796, + "step": 15684 + }, + { + "epoch": 4.814303253529773, + "grad_norm": 0.20515871047973633, + "learning_rate": 5.5432020438551656e-05, + "loss": 1.7096, + "step": 15685 + }, + { + "epoch": 4.814610190300798, + "grad_norm": 0.19353924691677094, + "learning_rate": 5.542707927203536e-05, + "loss": 1.7541, + "step": 15686 + }, + { + "epoch": 
4.814917127071823, + "grad_norm": 0.21998172998428345, + "learning_rate": 5.5422138051885454e-05, + "loss": 1.7696, + "step": 15687 + }, + { + "epoch": 4.815224063842848, + "grad_norm": 0.27576857805252075, + "learning_rate": 5.5417196778150816e-05, + "loss": 1.7491, + "step": 15688 + }, + { + "epoch": 4.815531000613873, + "grad_norm": 0.28202036023139954, + "learning_rate": 5.5412255450880254e-05, + "loss": 1.8615, + "step": 15689 + }, + { + "epoch": 4.815837937384899, + "grad_norm": 0.29632845520973206, + "learning_rate": 5.540731407012263e-05, + "loss": 1.7698, + "step": 15690 + }, + { + "epoch": 4.816144874155924, + "grad_norm": 0.35393890738487244, + "learning_rate": 5.540237263592675e-05, + "loss": 1.7924, + "step": 15691 + }, + { + "epoch": 4.816451810926949, + "grad_norm": 0.23756493628025055, + "learning_rate": 5.5397431148341447e-05, + "loss": 1.8301, + "step": 15692 + }, + { + "epoch": 4.816758747697974, + "grad_norm": 0.310153603553772, + "learning_rate": 5.53924896074156e-05, + "loss": 1.8162, + "step": 15693 + }, + { + "epoch": 4.817065684468999, + "grad_norm": 0.3355565369129181, + "learning_rate": 5.538754801319797e-05, + "loss": 1.7738, + "step": 15694 + }, + { + "epoch": 4.8173726212400245, + "grad_norm": 0.2360079288482666, + "learning_rate": 5.5382606365737446e-05, + "loss": 1.6883, + "step": 15695 + }, + { + "epoch": 4.81767955801105, + "grad_norm": 0.2932819724082947, + "learning_rate": 5.537766466508286e-05, + "loss": 1.8045, + "step": 15696 + }, + { + "epoch": 4.817986494782075, + "grad_norm": 0.31298181414604187, + "learning_rate": 5.537272291128304e-05, + "loss": 1.7516, + "step": 15697 + }, + { + "epoch": 4.8182934315531, + "grad_norm": 0.22871924936771393, + "learning_rate": 5.5367781104386806e-05, + "loss": 1.7386, + "step": 15698 + }, + { + "epoch": 4.818600368324125, + "grad_norm": 0.27097782492637634, + "learning_rate": 5.5362839244443034e-05, + "loss": 1.733, + "step": 15699 + }, + { + "epoch": 4.81890730509515, + "grad_norm": 
0.23296736180782318, + "learning_rate": 5.535789733150052e-05, + "loss": 1.7735, + "step": 15700 + }, + { + "epoch": 4.819214241866176, + "grad_norm": 0.22650237381458282, + "learning_rate": 5.5352955365608125e-05, + "loss": 1.7443, + "step": 15701 + }, + { + "epoch": 4.819521178637201, + "grad_norm": 0.25525161623954773, + "learning_rate": 5.534801334681471e-05, + "loss": 1.7379, + "step": 15702 + }, + { + "epoch": 4.819828115408226, + "grad_norm": 0.2249457836151123, + "learning_rate": 5.534307127516908e-05, + "loss": 1.7393, + "step": 15703 + }, + { + "epoch": 4.820135052179251, + "grad_norm": 0.1995566338300705, + "learning_rate": 5.5338129150720084e-05, + "loss": 1.7411, + "step": 15704 + }, + { + "epoch": 4.820441988950276, + "grad_norm": 0.250851035118103, + "learning_rate": 5.533318697351657e-05, + "loss": 1.7801, + "step": 15705 + }, + { + "epoch": 4.820748925721301, + "grad_norm": 0.3175830543041229, + "learning_rate": 5.532824474360737e-05, + "loss": 1.7553, + "step": 15706 + }, + { + "epoch": 4.821055862492327, + "grad_norm": 0.22842039167881012, + "learning_rate": 5.532330246104134e-05, + "loss": 1.7489, + "step": 15707 + }, + { + "epoch": 4.821362799263352, + "grad_norm": 0.21125485002994537, + "learning_rate": 5.531836012586732e-05, + "loss": 1.7543, + "step": 15708 + }, + { + "epoch": 4.8216697360343765, + "grad_norm": 0.33028700947761536, + "learning_rate": 5.531341773813414e-05, + "loss": 1.8237, + "step": 15709 + }, + { + "epoch": 4.821976672805402, + "grad_norm": 0.324564129114151, + "learning_rate": 5.530847529789067e-05, + "loss": 1.7288, + "step": 15710 + }, + { + "epoch": 4.822283609576427, + "grad_norm": 0.3299528956413269, + "learning_rate": 5.530353280518571e-05, + "loss": 1.7536, + "step": 15711 + }, + { + "epoch": 4.8225905463474525, + "grad_norm": 0.3535030782222748, + "learning_rate": 5.5298590260068136e-05, + "loss": 1.7941, + "step": 15712 + }, + { + "epoch": 4.822897483118478, + "grad_norm": 0.2627669870853424, + "learning_rate": 
5.5293647662586804e-05, + "loss": 1.7638, + "step": 15713 + }, + { + "epoch": 4.823204419889503, + "grad_norm": 0.25569450855255127, + "learning_rate": 5.5288705012790535e-05, + "loss": 1.7396, + "step": 15714 + }, + { + "epoch": 4.823511356660528, + "grad_norm": 0.26099520921707153, + "learning_rate": 5.528376231072817e-05, + "loss": 1.7415, + "step": 15715 + }, + { + "epoch": 4.823818293431553, + "grad_norm": 0.31833693385124207, + "learning_rate": 5.527881955644858e-05, + "loss": 1.7683, + "step": 15716 + }, + { + "epoch": 4.824125230202578, + "grad_norm": 0.2753448188304901, + "learning_rate": 5.5273876750000594e-05, + "loss": 1.6653, + "step": 15717 + }, + { + "epoch": 4.824432166973604, + "grad_norm": 0.23816895484924316, + "learning_rate": 5.526893389143307e-05, + "loss": 1.7575, + "step": 15718 + }, + { + "epoch": 4.824739103744628, + "grad_norm": 0.25376051664352417, + "learning_rate": 5.5263990980794856e-05, + "loss": 1.755, + "step": 15719 + }, + { + "epoch": 4.8250460405156534, + "grad_norm": 0.2483726590871811, + "learning_rate": 5.52590480181348e-05, + "loss": 1.7566, + "step": 15720 + }, + { + "epoch": 4.825352977286679, + "grad_norm": 0.2073517143726349, + "learning_rate": 5.5254105003501746e-05, + "loss": 1.7069, + "step": 15721 + }, + { + "epoch": 4.825659914057704, + "grad_norm": 0.3166659474372864, + "learning_rate": 5.524916193694455e-05, + "loss": 1.7012, + "step": 15722 + }, + { + "epoch": 4.8259668508287294, + "grad_norm": 0.24518641829490662, + "learning_rate": 5.524421881851205e-05, + "loss": 1.7027, + "step": 15723 + }, + { + "epoch": 4.826273787599755, + "grad_norm": 0.23137906193733215, + "learning_rate": 5.523927564825311e-05, + "loss": 1.746, + "step": 15724 + }, + { + "epoch": 4.82658072437078, + "grad_norm": 0.27937051653862, + "learning_rate": 5.5234332426216586e-05, + "loss": 1.7064, + "step": 15725 + }, + { + "epoch": 4.826887661141805, + "grad_norm": 0.26408496499061584, + "learning_rate": 5.522938915245131e-05, + "loss": 
1.6598, + "step": 15726 + }, + { + "epoch": 4.82719459791283, + "grad_norm": 0.22269997000694275, + "learning_rate": 5.5224445827006164e-05, + "loss": 1.7166, + "step": 15727 + }, + { + "epoch": 4.827501534683855, + "grad_norm": 0.22687453031539917, + "learning_rate": 5.5219502449929964e-05, + "loss": 1.7156, + "step": 15728 + }, + { + "epoch": 4.827808471454881, + "grad_norm": 0.26355600357055664, + "learning_rate": 5.5214559021271585e-05, + "loss": 1.8016, + "step": 15729 + }, + { + "epoch": 4.828115408225905, + "grad_norm": 0.30103012919425964, + "learning_rate": 5.520961554107987e-05, + "loss": 1.7856, + "step": 15730 + }, + { + "epoch": 4.82842234499693, + "grad_norm": 0.22604018449783325, + "learning_rate": 5.520467200940369e-05, + "loss": 1.813, + "step": 15731 + }, + { + "epoch": 4.828729281767956, + "grad_norm": 0.25435203313827515, + "learning_rate": 5.51997284262919e-05, + "loss": 1.7511, + "step": 15732 + }, + { + "epoch": 4.829036218538981, + "grad_norm": 0.2740691304206848, + "learning_rate": 5.519478479179333e-05, + "loss": 1.7326, + "step": 15733 + }, + { + "epoch": 4.829343155310006, + "grad_norm": 0.19710861146450043, + "learning_rate": 5.5189841105956866e-05, + "loss": 1.7581, + "step": 15734 + }, + { + "epoch": 4.829650092081032, + "grad_norm": 0.2315293401479721, + "learning_rate": 5.518489736883132e-05, + "loss": 1.6796, + "step": 15735 + }, + { + "epoch": 4.829957028852056, + "grad_norm": 0.2465476542711258, + "learning_rate": 5.51799535804656e-05, + "loss": 1.7276, + "step": 15736 + }, + { + "epoch": 4.8302639656230815, + "grad_norm": 0.20438486337661743, + "learning_rate": 5.5175009740908546e-05, + "loss": 1.7188, + "step": 15737 + }, + { + "epoch": 4.830570902394107, + "grad_norm": 0.24328351020812988, + "learning_rate": 5.5170065850209016e-05, + "loss": 1.7165, + "step": 15738 + }, + { + "epoch": 4.830877839165132, + "grad_norm": 0.22486837208271027, + "learning_rate": 5.516512190841586e-05, + "loss": 1.7369, + "step": 15739 + }, + { + 
"epoch": 4.8311847759361575, + "grad_norm": 0.2065822333097458, + "learning_rate": 5.5160177915577934e-05, + "loss": 1.7125, + "step": 15740 + }, + { + "epoch": 4.831491712707182, + "grad_norm": 0.21223095059394836, + "learning_rate": 5.5155233871744104e-05, + "loss": 1.7319, + "step": 15741 + }, + { + "epoch": 4.831798649478207, + "grad_norm": 0.25712934136390686, + "learning_rate": 5.515028977696325e-05, + "loss": 1.7847, + "step": 15742 + }, + { + "epoch": 4.832105586249233, + "grad_norm": 0.21289978921413422, + "learning_rate": 5.5145345631284215e-05, + "loss": 1.7629, + "step": 15743 + }, + { + "epoch": 4.832412523020258, + "grad_norm": 0.22347134351730347, + "learning_rate": 5.514040143475585e-05, + "loss": 1.7491, + "step": 15744 + }, + { + "epoch": 4.832719459791283, + "grad_norm": 0.20660510659217834, + "learning_rate": 5.513545718742702e-05, + "loss": 1.7377, + "step": 15745 + }, + { + "epoch": 4.833026396562309, + "grad_norm": 0.21612273156642914, + "learning_rate": 5.513051288934658e-05, + "loss": 1.7973, + "step": 15746 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.22515933215618134, + "learning_rate": 5.512556854056342e-05, + "loss": 1.7774, + "step": 15747 + }, + { + "epoch": 4.833640270104358, + "grad_norm": 0.21075554192066193, + "learning_rate": 5.512062414112639e-05, + "loss": 1.7741, + "step": 15748 + }, + { + "epoch": 4.833947206875384, + "grad_norm": 0.2203720659017563, + "learning_rate": 5.511567969108436e-05, + "loss": 1.7902, + "step": 15749 + }, + { + "epoch": 4.834254143646409, + "grad_norm": 0.20247167348861694, + "learning_rate": 5.511073519048616e-05, + "loss": 1.7084, + "step": 15750 + }, + { + "epoch": 4.834561080417434, + "grad_norm": 0.247711181640625, + "learning_rate": 5.5105790639380695e-05, + "loss": 1.8465, + "step": 15751 + }, + { + "epoch": 4.834868017188459, + "grad_norm": 0.22866854071617126, + "learning_rate": 5.51008460378168e-05, + "loss": 1.7252, + "step": 15752 + }, + { + "epoch": 4.835174953959484, + 
"grad_norm": 0.2335643470287323, + "learning_rate": 5.5095901385843374e-05, + "loss": 1.703, + "step": 15753 + }, + { + "epoch": 4.8354818907305095, + "grad_norm": 0.20874348282814026, + "learning_rate": 5.509095668350926e-05, + "loss": 1.7114, + "step": 15754 + }, + { + "epoch": 4.835788827501535, + "grad_norm": 0.19156917929649353, + "learning_rate": 5.5086011930863314e-05, + "loss": 1.6975, + "step": 15755 + }, + { + "epoch": 4.83609576427256, + "grad_norm": 0.23480524122714996, + "learning_rate": 5.508106712795443e-05, + "loss": 1.8291, + "step": 15756 + }, + { + "epoch": 4.8364027010435855, + "grad_norm": 0.20430417358875275, + "learning_rate": 5.5076122274831454e-05, + "loss": 1.7605, + "step": 15757 + }, + { + "epoch": 4.83670963781461, + "grad_norm": 0.26790598034858704, + "learning_rate": 5.5071177371543256e-05, + "loss": 1.7541, + "step": 15758 + }, + { + "epoch": 4.837016574585635, + "grad_norm": 0.3339289724826813, + "learning_rate": 5.506623241813873e-05, + "loss": 1.7566, + "step": 15759 + }, + { + "epoch": 4.837323511356661, + "grad_norm": 0.30528193712234497, + "learning_rate": 5.5061287414666726e-05, + "loss": 1.7371, + "step": 15760 + }, + { + "epoch": 4.837630448127686, + "grad_norm": 0.21059657633304596, + "learning_rate": 5.5056342361176114e-05, + "loss": 1.7599, + "step": 15761 + }, + { + "epoch": 4.83793738489871, + "grad_norm": 0.27918973565101624, + "learning_rate": 5.5051397257715756e-05, + "loss": 1.7485, + "step": 15762 + }, + { + "epoch": 4.838244321669736, + "grad_norm": 0.23147793114185333, + "learning_rate": 5.5046452104334514e-05, + "loss": 1.7121, + "step": 15763 + }, + { + "epoch": 4.838551258440761, + "grad_norm": 0.22028742730617523, + "learning_rate": 5.5041506901081294e-05, + "loss": 1.803, + "step": 15764 + }, + { + "epoch": 4.838858195211786, + "grad_norm": 0.22840891778469086, + "learning_rate": 5.5036561648004946e-05, + "loss": 1.7555, + "step": 15765 + }, + { + "epoch": 4.839165131982812, + "grad_norm": 
0.2610893249511719, + "learning_rate": 5.503161634515433e-05, + "loss": 1.7873, + "step": 15766 + }, + { + "epoch": 4.839472068753837, + "grad_norm": 0.2530003786087036, + "learning_rate": 5.502667099257836e-05, + "loss": 1.7604, + "step": 15767 + }, + { + "epoch": 4.8397790055248615, + "grad_norm": 0.20120400190353394, + "learning_rate": 5.5021725590325854e-05, + "loss": 1.7476, + "step": 15768 + }, + { + "epoch": 4.840085942295887, + "grad_norm": 0.2189723700284958, + "learning_rate": 5.501678013844571e-05, + "loss": 1.7174, + "step": 15769 + }, + { + "epoch": 4.840392879066912, + "grad_norm": 0.2511899173259735, + "learning_rate": 5.501183463698683e-05, + "loss": 1.7589, + "step": 15770 + }, + { + "epoch": 4.8406998158379375, + "grad_norm": 0.24899333715438843, + "learning_rate": 5.5006889085998035e-05, + "loss": 1.7253, + "step": 15771 + }, + { + "epoch": 4.841006752608963, + "grad_norm": 0.21223559975624084, + "learning_rate": 5.5001943485528254e-05, + "loss": 1.6949, + "step": 15772 + }, + { + "epoch": 4.841313689379987, + "grad_norm": 0.21394596993923187, + "learning_rate": 5.499699783562632e-05, + "loss": 1.7827, + "step": 15773 + }, + { + "epoch": 4.841620626151013, + "grad_norm": 0.2379613220691681, + "learning_rate": 5.4992052136341134e-05, + "loss": 1.7968, + "step": 15774 + }, + { + "epoch": 4.841927562922038, + "grad_norm": 0.23748385906219482, + "learning_rate": 5.498710638772154e-05, + "loss": 1.797, + "step": 15775 + }, + { + "epoch": 4.842234499693063, + "grad_norm": 0.2502206265926361, + "learning_rate": 5.498216058981646e-05, + "loss": 1.7292, + "step": 15776 + }, + { + "epoch": 4.842541436464089, + "grad_norm": 0.23613516986370087, + "learning_rate": 5.497721474267475e-05, + "loss": 1.7353, + "step": 15777 + }, + { + "epoch": 4.842848373235114, + "grad_norm": 0.25274696946144104, + "learning_rate": 5.497226884634527e-05, + "loss": 1.7782, + "step": 15778 + }, + { + "epoch": 4.843155310006138, + "grad_norm": 0.19574183225631714, + 
"learning_rate": 5.496732290087694e-05, + "loss": 1.6926, + "step": 15779 + }, + { + "epoch": 4.843462246777164, + "grad_norm": 0.21040405333042145, + "learning_rate": 5.496237690631858e-05, + "loss": 1.7235, + "step": 15780 + }, + { + "epoch": 4.843769183548189, + "grad_norm": 0.22499679028987885, + "learning_rate": 5.495743086271913e-05, + "loss": 1.7889, + "step": 15781 + }, + { + "epoch": 4.844076120319214, + "grad_norm": 0.24623246490955353, + "learning_rate": 5.4952484770127433e-05, + "loss": 1.7357, + "step": 15782 + }, + { + "epoch": 4.84438305709024, + "grad_norm": 0.21706275641918182, + "learning_rate": 5.494753862859238e-05, + "loss": 1.7349, + "step": 15783 + }, + { + "epoch": 4.844689993861264, + "grad_norm": 0.20705166459083557, + "learning_rate": 5.4942592438162855e-05, + "loss": 1.7047, + "step": 15784 + }, + { + "epoch": 4.8449969306322895, + "grad_norm": 0.21216751635074615, + "learning_rate": 5.493764619888773e-05, + "loss": 1.7335, + "step": 15785 + }, + { + "epoch": 4.845303867403315, + "grad_norm": 0.2945895195007324, + "learning_rate": 5.493269991081588e-05, + "loss": 1.838, + "step": 15786 + }, + { + "epoch": 4.84561080417434, + "grad_norm": 0.22013652324676514, + "learning_rate": 5.492775357399621e-05, + "loss": 1.7541, + "step": 15787 + }, + { + "epoch": 4.8459177409453655, + "grad_norm": 0.25428512692451477, + "learning_rate": 5.4922807188477585e-05, + "loss": 1.7405, + "step": 15788 + }, + { + "epoch": 4.846224677716391, + "grad_norm": 0.23189012706279755, + "learning_rate": 5.49178607543089e-05, + "loss": 1.8075, + "step": 15789 + }, + { + "epoch": 4.846531614487415, + "grad_norm": 0.21637389063835144, + "learning_rate": 5.491291427153904e-05, + "loss": 1.7229, + "step": 15790 + }, + { + "epoch": 4.846838551258441, + "grad_norm": 0.20628009736537933, + "learning_rate": 5.490796774021687e-05, + "loss": 1.7605, + "step": 15791 + }, + { + "epoch": 4.847145488029466, + "grad_norm": 0.20845308899879456, + "learning_rate": 
5.4903021160391276e-05, + "loss": 1.7864, + "step": 15792 + }, + { + "epoch": 4.847452424800491, + "grad_norm": 0.20367322862148285, + "learning_rate": 5.4898074532111164e-05, + "loss": 1.733, + "step": 15793 + }, + { + "epoch": 4.847759361571516, + "grad_norm": 0.2066505253314972, + "learning_rate": 5.489312785542543e-05, + "loss": 1.7113, + "step": 15794 + }, + { + "epoch": 4.848066298342541, + "grad_norm": 0.23874987661838531, + "learning_rate": 5.488818113038292e-05, + "loss": 1.7735, + "step": 15795 + }, + { + "epoch": 4.848373235113566, + "grad_norm": 0.26583850383758545, + "learning_rate": 5.488323435703254e-05, + "loss": 1.8019, + "step": 15796 + }, + { + "epoch": 4.848680171884592, + "grad_norm": 0.25207552313804626, + "learning_rate": 5.487828753542317e-05, + "loss": 1.7491, + "step": 15797 + }, + { + "epoch": 4.848987108655617, + "grad_norm": 0.23065905272960663, + "learning_rate": 5.48733406656037e-05, + "loss": 1.7451, + "step": 15798 + }, + { + "epoch": 4.849294045426642, + "grad_norm": 0.26914483308792114, + "learning_rate": 5.486839374762304e-05, + "loss": 1.7553, + "step": 15799 + }, + { + "epoch": 4.849600982197668, + "grad_norm": 0.2509605884552002, + "learning_rate": 5.4863446781530046e-05, + "loss": 1.7124, + "step": 15800 + }, + { + "epoch": 4.849907918968692, + "grad_norm": 0.2618432343006134, + "learning_rate": 5.485849976737362e-05, + "loss": 1.7368, + "step": 15801 + }, + { + "epoch": 4.850214855739718, + "grad_norm": 0.46875160932540894, + "learning_rate": 5.485355270520266e-05, + "loss": 1.7883, + "step": 15802 + }, + { + "epoch": 4.850521792510743, + "grad_norm": 0.37585484981536865, + "learning_rate": 5.4848605595066025e-05, + "loss": 1.7894, + "step": 15803 + }, + { + "epoch": 4.850828729281768, + "grad_norm": 0.2244408279657364, + "learning_rate": 5.4843658437012646e-05, + "loss": 1.7394, + "step": 15804 + }, + { + "epoch": 4.851135666052793, + "grad_norm": 0.4061773419380188, + "learning_rate": 5.48387112310914e-05, + "loss": 
1.7703, + "step": 15805 + }, + { + "epoch": 4.851442602823818, + "grad_norm": 0.35925009846687317, + "learning_rate": 5.483376397735117e-05, + "loss": 1.7798, + "step": 15806 + }, + { + "epoch": 4.851749539594843, + "grad_norm": 0.23050184547901154, + "learning_rate": 5.482881667584084e-05, + "loss": 1.7984, + "step": 15807 + }, + { + "epoch": 4.852056476365869, + "grad_norm": 0.37308645248413086, + "learning_rate": 5.4823869326609335e-05, + "loss": 1.6747, + "step": 15808 + }, + { + "epoch": 4.852363413136894, + "grad_norm": 0.29826754331588745, + "learning_rate": 5.481892192970551e-05, + "loss": 1.7432, + "step": 15809 + }, + { + "epoch": 4.852670349907919, + "grad_norm": 0.23652370274066925, + "learning_rate": 5.4813974485178266e-05, + "loss": 1.7557, + "step": 15810 + }, + { + "epoch": 4.852977286678944, + "grad_norm": 0.40549808740615845, + "learning_rate": 5.4809026993076526e-05, + "loss": 1.7317, + "step": 15811 + }, + { + "epoch": 4.853284223449969, + "grad_norm": 0.3367961347103119, + "learning_rate": 5.4804079453449156e-05, + "loss": 1.7648, + "step": 15812 + }, + { + "epoch": 4.8535911602209945, + "grad_norm": 0.21629661321640015, + "learning_rate": 5.4799131866345055e-05, + "loss": 1.7986, + "step": 15813 + }, + { + "epoch": 4.85389809699202, + "grad_norm": 0.26381492614746094, + "learning_rate": 5.4794184231813105e-05, + "loss": 1.7401, + "step": 15814 + }, + { + "epoch": 4.854205033763045, + "grad_norm": 0.22319363057613373, + "learning_rate": 5.478923654990223e-05, + "loss": 1.7773, + "step": 15815 + }, + { + "epoch": 4.85451197053407, + "grad_norm": 0.2547159492969513, + "learning_rate": 5.4784288820661326e-05, + "loss": 1.8194, + "step": 15816 + }, + { + "epoch": 4.854818907305095, + "grad_norm": 0.29574522376060486, + "learning_rate": 5.477934104413925e-05, + "loss": 1.7351, + "step": 15817 + }, + { + "epoch": 4.85512584407612, + "grad_norm": 0.17389361560344696, + "learning_rate": 5.4774393220384945e-05, + "loss": 1.6957, + "step": 15818 + }, + { 
+ "epoch": 4.855432780847146, + "grad_norm": 0.23746751248836517, + "learning_rate": 5.476944534944728e-05, + "loss": 1.7713, + "step": 15819 + }, + { + "epoch": 4.855739717618171, + "grad_norm": 0.182356595993042, + "learning_rate": 5.476449743137516e-05, + "loss": 1.7144, + "step": 15820 + }, + { + "epoch": 4.856046654389196, + "grad_norm": 0.23716382682323456, + "learning_rate": 5.4759549466217475e-05, + "loss": 1.7451, + "step": 15821 + }, + { + "epoch": 4.856353591160221, + "grad_norm": 0.316806823015213, + "learning_rate": 5.475460145402313e-05, + "loss": 1.7823, + "step": 15822 + }, + { + "epoch": 4.856660527931246, + "grad_norm": 0.2333129197359085, + "learning_rate": 5.474965339484105e-05, + "loss": 1.7788, + "step": 15823 + }, + { + "epoch": 4.856967464702271, + "grad_norm": 0.21180212497711182, + "learning_rate": 5.47447052887201e-05, + "loss": 1.7513, + "step": 15824 + }, + { + "epoch": 4.857274401473297, + "grad_norm": 0.22641299664974213, + "learning_rate": 5.473975713570919e-05, + "loss": 1.7514, + "step": 15825 + }, + { + "epoch": 4.857581338244322, + "grad_norm": 0.3179668188095093, + "learning_rate": 5.473480893585723e-05, + "loss": 1.7939, + "step": 15826 + }, + { + "epoch": 4.8578882750153465, + "grad_norm": 0.27463147044181824, + "learning_rate": 5.472986068921309e-05, + "loss": 1.7487, + "step": 15827 + }, + { + "epoch": 4.858195211786372, + "grad_norm": 0.18621626496315002, + "learning_rate": 5.472491239582572e-05, + "loss": 1.7155, + "step": 15828 + }, + { + "epoch": 4.858502148557397, + "grad_norm": 0.2437327802181244, + "learning_rate": 5.471996405574399e-05, + "loss": 1.7586, + "step": 15829 + }, + { + "epoch": 4.8588090853284225, + "grad_norm": 0.26658934354782104, + "learning_rate": 5.47150156690168e-05, + "loss": 1.7331, + "step": 15830 + }, + { + "epoch": 4.859116022099448, + "grad_norm": 0.2257174700498581, + "learning_rate": 5.471006723569308e-05, + "loss": 1.7556, + "step": 15831 + }, + { + "epoch": 4.859422958870473, + 
"grad_norm": 0.25434550642967224, + "learning_rate": 5.470511875582168e-05, + "loss": 1.7196, + "step": 15832 + }, + { + "epoch": 4.859729895641498, + "grad_norm": 0.2251453697681427, + "learning_rate": 5.470017022945156e-05, + "loss": 1.7174, + "step": 15833 + }, + { + "epoch": 4.860036832412523, + "grad_norm": 0.2757972180843353, + "learning_rate": 5.469522165663161e-05, + "loss": 1.7701, + "step": 15834 + }, + { + "epoch": 4.860343769183548, + "grad_norm": 0.2771994173526764, + "learning_rate": 5.469027303741072e-05, + "loss": 1.8085, + "step": 15835 + }, + { + "epoch": 4.860650705954574, + "grad_norm": 0.23825454711914062, + "learning_rate": 5.468532437183781e-05, + "loss": 1.733, + "step": 15836 + }, + { + "epoch": 4.860957642725598, + "grad_norm": 0.18100066483020782, + "learning_rate": 5.468037565996177e-05, + "loss": 1.7012, + "step": 15837 + }, + { + "epoch": 4.861264579496623, + "grad_norm": 0.22552812099456787, + "learning_rate": 5.4675426901831506e-05, + "loss": 1.728, + "step": 15838 + }, + { + "epoch": 4.861571516267649, + "grad_norm": 0.2505643665790558, + "learning_rate": 5.467047809749595e-05, + "loss": 1.7219, + "step": 15839 + }, + { + "epoch": 4.861878453038674, + "grad_norm": 0.25920796394348145, + "learning_rate": 5.4665529247003975e-05, + "loss": 1.7945, + "step": 15840 + }, + { + "epoch": 4.862185389809699, + "grad_norm": 0.23549394309520721, + "learning_rate": 5.466058035040452e-05, + "loss": 1.7904, + "step": 15841 + }, + { + "epoch": 4.862492326580725, + "grad_norm": 0.26510992646217346, + "learning_rate": 5.465563140774648e-05, + "loss": 1.8051, + "step": 15842 + }, + { + "epoch": 4.862799263351749, + "grad_norm": 0.19175390899181366, + "learning_rate": 5.465068241907876e-05, + "loss": 1.6799, + "step": 15843 + }, + { + "epoch": 4.8631062001227745, + "grad_norm": 0.2588976323604584, + "learning_rate": 5.464573338445025e-05, + "loss": 1.7394, + "step": 15844 + }, + { + "epoch": 4.8634131368938, + "grad_norm": 0.28729483485221863, + 
"learning_rate": 5.464078430390991e-05, + "loss": 1.797, + "step": 15845 + }, + { + "epoch": 4.863720073664825, + "grad_norm": 0.21302445232868195, + "learning_rate": 5.463583517750661e-05, + "loss": 1.7303, + "step": 15846 + }, + { + "epoch": 4.8640270104358505, + "grad_norm": 0.2407636195421219, + "learning_rate": 5.463088600528926e-05, + "loss": 1.7175, + "step": 15847 + }, + { + "epoch": 4.864333947206875, + "grad_norm": 0.25653502345085144, + "learning_rate": 5.4625936787306784e-05, + "loss": 1.6996, + "step": 15848 + }, + { + "epoch": 4.8646408839779, + "grad_norm": 0.2100832760334015, + "learning_rate": 5.462098752360809e-05, + "loss": 1.7416, + "step": 15849 + }, + { + "epoch": 4.864947820748926, + "grad_norm": 0.2785186469554901, + "learning_rate": 5.461603821424208e-05, + "loss": 1.74, + "step": 15850 + }, + { + "epoch": 4.865254757519951, + "grad_norm": 0.2896614968776703, + "learning_rate": 5.4611088859257696e-05, + "loss": 1.7436, + "step": 15851 + }, + { + "epoch": 4.865561694290976, + "grad_norm": 0.18890418112277985, + "learning_rate": 5.460613945870382e-05, + "loss": 1.7093, + "step": 15852 + }, + { + "epoch": 4.865868631062002, + "grad_norm": 0.27681079506874084, + "learning_rate": 5.4601190012629364e-05, + "loss": 1.8772, + "step": 15853 + }, + { + "epoch": 4.866175567833026, + "grad_norm": 0.24658115208148956, + "learning_rate": 5.4596240521083265e-05, + "loss": 1.776, + "step": 15854 + }, + { + "epoch": 4.866482504604051, + "grad_norm": 0.21958144009113312, + "learning_rate": 5.459129098411441e-05, + "loss": 1.7503, + "step": 15855 + }, + { + "epoch": 4.866789441375077, + "grad_norm": 0.2778300642967224, + "learning_rate": 5.458634140177174e-05, + "loss": 1.8194, + "step": 15856 + }, + { + "epoch": 4.867096378146102, + "grad_norm": 0.28673580288887024, + "learning_rate": 5.458139177410414e-05, + "loss": 1.8033, + "step": 15857 + }, + { + "epoch": 4.867403314917127, + "grad_norm": 0.24472850561141968, + "learning_rate": 5.457644210116055e-05, + 
"loss": 1.7304, + "step": 15858 + }, + { + "epoch": 4.867710251688152, + "grad_norm": 0.24581189453601837, + "learning_rate": 5.4571492382989886e-05, + "loss": 1.7443, + "step": 15859 + }, + { + "epoch": 4.868017188459177, + "grad_norm": 0.22296221554279327, + "learning_rate": 5.4566542619641045e-05, + "loss": 1.7201, + "step": 15860 + }, + { + "epoch": 4.8683241252302025, + "grad_norm": 0.2378673404455185, + "learning_rate": 5.456159281116295e-05, + "loss": 1.7893, + "step": 15861 + }, + { + "epoch": 4.868631062001228, + "grad_norm": 0.3320823907852173, + "learning_rate": 5.4556642957604534e-05, + "loss": 1.7944, + "step": 15862 + }, + { + "epoch": 4.868937998772253, + "grad_norm": 0.3303453326225281, + "learning_rate": 5.45516930590147e-05, + "loss": 1.7267, + "step": 15863 + }, + { + "epoch": 4.8692449355432785, + "grad_norm": 0.223227858543396, + "learning_rate": 5.454674311544235e-05, + "loss": 1.7477, + "step": 15864 + }, + { + "epoch": 4.869551872314303, + "grad_norm": 0.3012549579143524, + "learning_rate": 5.454179312693643e-05, + "loss": 1.731, + "step": 15865 + }, + { + "epoch": 4.869858809085328, + "grad_norm": 0.3780311942100525, + "learning_rate": 5.453684309354585e-05, + "loss": 1.7296, + "step": 15866 + }, + { + "epoch": 4.870165745856354, + "grad_norm": 0.2753889262676239, + "learning_rate": 5.4531893015319526e-05, + "loss": 1.8024, + "step": 15867 + }, + { + "epoch": 4.870472682627379, + "grad_norm": 0.2270934134721756, + "learning_rate": 5.452694289230639e-05, + "loss": 1.7095, + "step": 15868 + }, + { + "epoch": 4.870779619398404, + "grad_norm": 0.2621576488018036, + "learning_rate": 5.452199272455534e-05, + "loss": 1.75, + "step": 15869 + }, + { + "epoch": 4.871086556169429, + "grad_norm": 0.22175776958465576, + "learning_rate": 5.45170425121153e-05, + "loss": 1.7658, + "step": 15870 + }, + { + "epoch": 4.871393492940454, + "grad_norm": 0.2038736790418625, + "learning_rate": 5.451209225503521e-05, + "loss": 1.6916, + "step": 15871 + }, + { + 
"epoch": 4.871700429711479, + "grad_norm": 0.2493467777967453, + "learning_rate": 5.450714195336397e-05, + "loss": 1.7408, + "step": 15872 + }, + { + "epoch": 4.872007366482505, + "grad_norm": 0.1966754049062729, + "learning_rate": 5.450219160715052e-05, + "loss": 1.7379, + "step": 15873 + }, + { + "epoch": 4.87231430325353, + "grad_norm": 0.23193517327308655, + "learning_rate": 5.4497241216443775e-05, + "loss": 1.7736, + "step": 15874 + }, + { + "epoch": 4.872621240024555, + "grad_norm": 0.2164391279220581, + "learning_rate": 5.4492290781292646e-05, + "loss": 1.7618, + "step": 15875 + }, + { + "epoch": 4.87292817679558, + "grad_norm": 0.286460816860199, + "learning_rate": 5.448734030174607e-05, + "loss": 1.7745, + "step": 15876 + }, + { + "epoch": 4.873235113566605, + "grad_norm": 0.3454538881778717, + "learning_rate": 5.448238977785298e-05, + "loss": 1.7605, + "step": 15877 + }, + { + "epoch": 4.8735420503376305, + "grad_norm": 0.26775062084198, + "learning_rate": 5.447743920966227e-05, + "loss": 1.7263, + "step": 15878 + }, + { + "epoch": 4.873848987108656, + "grad_norm": 0.2644907832145691, + "learning_rate": 5.447248859722289e-05, + "loss": 1.8489, + "step": 15879 + }, + { + "epoch": 4.87415592387968, + "grad_norm": 0.21646654605865479, + "learning_rate": 5.446753794058376e-05, + "loss": 1.7605, + "step": 15880 + }, + { + "epoch": 4.874462860650706, + "grad_norm": 0.23431318998336792, + "learning_rate": 5.446258723979381e-05, + "loss": 1.7209, + "step": 15881 + }, + { + "epoch": 4.874769797421731, + "grad_norm": 0.24665607511997223, + "learning_rate": 5.4457636494901934e-05, + "loss": 1.813, + "step": 15882 + }, + { + "epoch": 4.875076734192756, + "grad_norm": 0.26269975304603577, + "learning_rate": 5.445268570595708e-05, + "loss": 1.8255, + "step": 15883 + }, + { + "epoch": 4.875383670963782, + "grad_norm": 0.2722402811050415, + "learning_rate": 5.444773487300819e-05, + "loss": 1.7795, + "step": 15884 + }, + { + "epoch": 4.875690607734807, + "grad_norm": 
0.3235624134540558, + "learning_rate": 5.444278399610417e-05, + "loss": 1.7804, + "step": 15885 + }, + { + "epoch": 4.8759975445058314, + "grad_norm": 0.2647583782672882, + "learning_rate": 5.4437833075293964e-05, + "loss": 1.7359, + "step": 15886 + }, + { + "epoch": 4.876304481276857, + "grad_norm": 0.272370845079422, + "learning_rate": 5.443288211062649e-05, + "loss": 1.7605, + "step": 15887 + }, + { + "epoch": 4.876611418047882, + "grad_norm": 0.3147594630718231, + "learning_rate": 5.4427931102150675e-05, + "loss": 1.7118, + "step": 15888 + }, + { + "epoch": 4.8769183548189075, + "grad_norm": 0.22751441597938538, + "learning_rate": 5.442298004991544e-05, + "loss": 1.723, + "step": 15889 + }, + { + "epoch": 4.877225291589933, + "grad_norm": 0.2121521681547165, + "learning_rate": 5.441802895396972e-05, + "loss": 1.7485, + "step": 15890 + }, + { + "epoch": 4.877532228360957, + "grad_norm": 0.25370222330093384, + "learning_rate": 5.4413077814362466e-05, + "loss": 1.8064, + "step": 15891 + }, + { + "epoch": 4.877839165131983, + "grad_norm": 0.19492633640766144, + "learning_rate": 5.440812663114259e-05, + "loss": 1.6773, + "step": 15892 + }, + { + "epoch": 4.878146101903008, + "grad_norm": 0.2101750522851944, + "learning_rate": 5.440317540435901e-05, + "loss": 1.7215, + "step": 15893 + }, + { + "epoch": 4.878453038674033, + "grad_norm": 0.21150651574134827, + "learning_rate": 5.439822413406068e-05, + "loss": 1.7875, + "step": 15894 + }, + { + "epoch": 4.878759975445059, + "grad_norm": 0.21008379757404327, + "learning_rate": 5.439327282029651e-05, + "loss": 1.7108, + "step": 15895 + }, + { + "epoch": 4.879066912216084, + "grad_norm": 0.22885502874851227, + "learning_rate": 5.4388321463115453e-05, + "loss": 1.7899, + "step": 15896 + }, + { + "epoch": 4.879373848987108, + "grad_norm": 0.24868059158325195, + "learning_rate": 5.4383370062566444e-05, + "loss": 1.7368, + "step": 15897 + }, + { + "epoch": 4.879680785758134, + "grad_norm": 0.27225378155708313, + 
"learning_rate": 5.437841861869838e-05, + "loss": 1.7623, + "step": 15898 + }, + { + "epoch": 4.879987722529159, + "grad_norm": 0.23353120684623718, + "learning_rate": 5.437346713156023e-05, + "loss": 1.7908, + "step": 15899 + }, + { + "epoch": 4.880294659300184, + "grad_norm": 0.19032470881938934, + "learning_rate": 5.436851560120091e-05, + "loss": 1.7511, + "step": 15900 + }, + { + "epoch": 4.88060159607121, + "grad_norm": 0.23714862763881683, + "learning_rate": 5.4363564027669345e-05, + "loss": 1.7197, + "step": 15901 + }, + { + "epoch": 4.880908532842234, + "grad_norm": 0.24897022545337677, + "learning_rate": 5.4358612411014495e-05, + "loss": 1.7822, + "step": 15902 + }, + { + "epoch": 4.8812154696132595, + "grad_norm": 0.21433588862419128, + "learning_rate": 5.435366075128528e-05, + "loss": 1.7928, + "step": 15903 + }, + { + "epoch": 4.881522406384285, + "grad_norm": 0.30019649863243103, + "learning_rate": 5.4348709048530646e-05, + "loss": 1.8067, + "step": 15904 + }, + { + "epoch": 4.88182934315531, + "grad_norm": 0.20227669179439545, + "learning_rate": 5.4343757302799515e-05, + "loss": 1.7254, + "step": 15905 + }, + { + "epoch": 4.8821362799263355, + "grad_norm": 0.23447728157043457, + "learning_rate": 5.4338805514140836e-05, + "loss": 1.7314, + "step": 15906 + }, + { + "epoch": 4.882443216697361, + "grad_norm": 0.29545050859451294, + "learning_rate": 5.4333853682603506e-05, + "loss": 1.7659, + "step": 15907 + }, + { + "epoch": 4.882750153468385, + "grad_norm": 0.245390385389328, + "learning_rate": 5.432890180823652e-05, + "loss": 1.7264, + "step": 15908 + }, + { + "epoch": 4.883057090239411, + "grad_norm": 0.209987074136734, + "learning_rate": 5.432394989108879e-05, + "loss": 1.7174, + "step": 15909 + }, + { + "epoch": 4.883364027010436, + "grad_norm": 0.2402341365814209, + "learning_rate": 5.431899793120925e-05, + "loss": 1.7512, + "step": 15910 + }, + { + "epoch": 4.883670963781461, + "grad_norm": 0.26227688789367676, + "learning_rate": 
5.431404592864684e-05, + "loss": 1.7697, + "step": 15911 + }, + { + "epoch": 4.883977900552486, + "grad_norm": 0.2556503117084503, + "learning_rate": 5.4309093883450504e-05, + "loss": 1.8191, + "step": 15912 + }, + { + "epoch": 4.884284837323511, + "grad_norm": 0.24766884744167328, + "learning_rate": 5.4304141795669174e-05, + "loss": 1.7574, + "step": 15913 + }, + { + "epoch": 4.884591774094536, + "grad_norm": 0.19925951957702637, + "learning_rate": 5.429918966535179e-05, + "loss": 1.7249, + "step": 15914 + }, + { + "epoch": 4.884898710865562, + "grad_norm": 0.1899442970752716, + "learning_rate": 5.4294237492547294e-05, + "loss": 1.7446, + "step": 15915 + }, + { + "epoch": 4.885205647636587, + "grad_norm": 0.25900956988334656, + "learning_rate": 5.4289285277304636e-05, + "loss": 1.725, + "step": 15916 + }, + { + "epoch": 4.885512584407612, + "grad_norm": 0.2537781000137329, + "learning_rate": 5.428433301967274e-05, + "loss": 1.7861, + "step": 15917 + }, + { + "epoch": 4.885819521178637, + "grad_norm": 0.26432034373283386, + "learning_rate": 5.427938071970054e-05, + "loss": 1.7538, + "step": 15918 + }, + { + "epoch": 4.886126457949662, + "grad_norm": 0.22722363471984863, + "learning_rate": 5.4274428377437e-05, + "loss": 1.7631, + "step": 15919 + }, + { + "epoch": 4.8864333947206875, + "grad_norm": 0.24846172332763672, + "learning_rate": 5.426947599293106e-05, + "loss": 1.7833, + "step": 15920 + }, + { + "epoch": 4.886740331491713, + "grad_norm": 0.24821995198726654, + "learning_rate": 5.426452356623165e-05, + "loss": 1.7638, + "step": 15921 + }, + { + "epoch": 4.887047268262738, + "grad_norm": 0.2796781063079834, + "learning_rate": 5.425957109738773e-05, + "loss": 1.6982, + "step": 15922 + }, + { + "epoch": 4.887354205033763, + "grad_norm": 0.2875385284423828, + "learning_rate": 5.425461858644821e-05, + "loss": 1.7172, + "step": 15923 + }, + { + "epoch": 4.887661141804788, + "grad_norm": 0.21614491939544678, + "learning_rate": 5.424966603346207e-05, + "loss": 
1.7521, + "step": 15924 + }, + { + "epoch": 4.887968078575813, + "grad_norm": 0.22944390773773193, + "learning_rate": 5.4244713438478235e-05, + "loss": 1.772, + "step": 15925 + }, + { + "epoch": 4.888275015346839, + "grad_norm": 0.21566039323806763, + "learning_rate": 5.423976080154566e-05, + "loss": 1.734, + "step": 15926 + }, + { + "epoch": 4.888581952117864, + "grad_norm": 0.4253925383090973, + "learning_rate": 5.4234808122713275e-05, + "loss": 1.8017, + "step": 15927 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.239146426320076, + "learning_rate": 5.422985540203004e-05, + "loss": 1.7229, + "step": 15928 + }, + { + "epoch": 4.889195825659914, + "grad_norm": 0.2344054877758026, + "learning_rate": 5.42249026395449e-05, + "loss": 1.7111, + "step": 15929 + }, + { + "epoch": 4.889502762430939, + "grad_norm": 0.21717922389507294, + "learning_rate": 5.421994983530679e-05, + "loss": 1.7427, + "step": 15930 + }, + { + "epoch": 4.889809699201964, + "grad_norm": 0.26895472407341003, + "learning_rate": 5.421499698936466e-05, + "loss": 1.8402, + "step": 15931 + }, + { + "epoch": 4.89011663597299, + "grad_norm": 0.25761866569519043, + "learning_rate": 5.421004410176746e-05, + "loss": 1.7822, + "step": 15932 + }, + { + "epoch": 4.890423572744015, + "grad_norm": 0.24465128779411316, + "learning_rate": 5.420509117256415e-05, + "loss": 1.8074, + "step": 15933 + }, + { + "epoch": 4.8907305095150395, + "grad_norm": 0.2527398467063904, + "learning_rate": 5.4200138201803655e-05, + "loss": 1.7522, + "step": 15934 + }, + { + "epoch": 4.891037446286065, + "grad_norm": 0.23118112981319427, + "learning_rate": 5.4195185189534916e-05, + "loss": 1.7394, + "step": 15935 + }, + { + "epoch": 4.89134438305709, + "grad_norm": 0.2054537534713745, + "learning_rate": 5.419023213580691e-05, + "loss": 1.7096, + "step": 15936 + }, + { + "epoch": 4.8916513198281155, + "grad_norm": 0.2929638922214508, + "learning_rate": 5.418527904066858e-05, + "loss": 1.8733, + "step": 15937 + }, + { + 
"epoch": 4.891958256599141, + "grad_norm": 0.2957170009613037, + "learning_rate": 5.418032590416886e-05, + "loss": 1.7201, + "step": 15938 + }, + { + "epoch": 4.892265193370166, + "grad_norm": 0.2520081698894501, + "learning_rate": 5.417537272635672e-05, + "loss": 1.7034, + "step": 15939 + }, + { + "epoch": 4.892572130141191, + "grad_norm": 0.25217053294181824, + "learning_rate": 5.41704195072811e-05, + "loss": 1.8538, + "step": 15940 + }, + { + "epoch": 4.892879066912216, + "grad_norm": 0.23605379462242126, + "learning_rate": 5.416546624699093e-05, + "loss": 1.724, + "step": 15941 + }, + { + "epoch": 4.893186003683241, + "grad_norm": 0.321750283241272, + "learning_rate": 5.416051294553519e-05, + "loss": 1.806, + "step": 15942 + }, + { + "epoch": 4.893492940454267, + "grad_norm": 0.23800241947174072, + "learning_rate": 5.415555960296284e-05, + "loss": 1.7578, + "step": 15943 + }, + { + "epoch": 4.893799877225292, + "grad_norm": 0.3423094153404236, + "learning_rate": 5.4150606219322796e-05, + "loss": 1.7324, + "step": 15944 + }, + { + "epoch": 4.894106813996316, + "grad_norm": 0.453074187040329, + "learning_rate": 5.414565279466404e-05, + "loss": 1.7268, + "step": 15945 + }, + { + "epoch": 4.894413750767342, + "grad_norm": 0.21972697973251343, + "learning_rate": 5.4140699329035504e-05, + "loss": 1.6547, + "step": 15946 + }, + { + "epoch": 4.894720687538367, + "grad_norm": 0.32876282930374146, + "learning_rate": 5.413574582248616e-05, + "loss": 1.7527, + "step": 15947 + }, + { + "epoch": 4.895027624309392, + "grad_norm": 0.34035229682922363, + "learning_rate": 5.413079227506494e-05, + "loss": 1.7636, + "step": 15948 + }, + { + "epoch": 4.895334561080418, + "grad_norm": 0.2410411536693573, + "learning_rate": 5.412583868682082e-05, + "loss": 1.8114, + "step": 15949 + }, + { + "epoch": 4.895641497851443, + "grad_norm": 0.2787366211414337, + "learning_rate": 5.412088505780274e-05, + "loss": 1.7393, + "step": 15950 + }, + { + "epoch": 4.8959484346224675, + "grad_norm": 
0.23288428783416748, + "learning_rate": 5.411593138805966e-05, + "loss": 1.7413, + "step": 15951 + }, + { + "epoch": 4.896255371393493, + "grad_norm": 0.26302778720855713, + "learning_rate": 5.411097767764053e-05, + "loss": 1.7372, + "step": 15952 + }, + { + "epoch": 4.896562308164518, + "grad_norm": 0.31638020277023315, + "learning_rate": 5.410602392659431e-05, + "loss": 1.8114, + "step": 15953 + }, + { + "epoch": 4.8968692449355435, + "grad_norm": 0.23361825942993164, + "learning_rate": 5.410107013496996e-05, + "loss": 1.7592, + "step": 15954 + }, + { + "epoch": 4.897176181706568, + "grad_norm": 0.19887785613536835, + "learning_rate": 5.409611630281642e-05, + "loss": 1.7509, + "step": 15955 + }, + { + "epoch": 4.897483118477593, + "grad_norm": 0.22396783530712128, + "learning_rate": 5.409116243018266e-05, + "loss": 1.6841, + "step": 15956 + }, + { + "epoch": 4.897790055248619, + "grad_norm": 0.20397686958312988, + "learning_rate": 5.4086208517117645e-05, + "loss": 1.7427, + "step": 15957 + }, + { + "epoch": 4.898096992019644, + "grad_norm": 0.20848311483860016, + "learning_rate": 5.4081254563670314e-05, + "loss": 1.713, + "step": 15958 + }, + { + "epoch": 4.898403928790669, + "grad_norm": 0.2739275395870209, + "learning_rate": 5.407630056988964e-05, + "loss": 1.7673, + "step": 15959 + }, + { + "epoch": 4.898710865561695, + "grad_norm": 0.21485929191112518, + "learning_rate": 5.407134653582456e-05, + "loss": 1.7347, + "step": 15960 + }, + { + "epoch": 4.899017802332719, + "grad_norm": 0.26980286836624146, + "learning_rate": 5.406639246152406e-05, + "loss": 1.7158, + "step": 15961 + }, + { + "epoch": 4.899324739103744, + "grad_norm": 0.22327515482902527, + "learning_rate": 5.4061438347037084e-05, + "loss": 1.7387, + "step": 15962 + }, + { + "epoch": 4.89963167587477, + "grad_norm": 0.2542823553085327, + "learning_rate": 5.4056484192412603e-05, + "loss": 1.7826, + "step": 15963 + }, + { + "epoch": 4.899938612645795, + "grad_norm": 0.3248840868473053, + 
"learning_rate": 5.405152999769956e-05, + "loss": 1.7878, + "step": 15964 + }, + { + "epoch": 4.9002455494168204, + "grad_norm": 0.21210803091526031, + "learning_rate": 5.404657576294691e-05, + "loss": 1.7378, + "step": 15965 + }, + { + "epoch": 4.900552486187845, + "grad_norm": 0.25679782032966614, + "learning_rate": 5.404162148820365e-05, + "loss": 1.7493, + "step": 15966 + }, + { + "epoch": 4.90085942295887, + "grad_norm": 0.36698678135871887, + "learning_rate": 5.4036667173518704e-05, + "loss": 1.7662, + "step": 15967 + }, + { + "epoch": 4.901166359729896, + "grad_norm": 0.3396874964237213, + "learning_rate": 5.403171281894105e-05, + "loss": 1.7618, + "step": 15968 + }, + { + "epoch": 4.901473296500921, + "grad_norm": 0.2792030870914459, + "learning_rate": 5.402675842451964e-05, + "loss": 1.7858, + "step": 15969 + }, + { + "epoch": 4.901780233271946, + "grad_norm": 0.24499626457691193, + "learning_rate": 5.4021803990303454e-05, + "loss": 1.7503, + "step": 15970 + }, + { + "epoch": 4.902087170042972, + "grad_norm": 0.29185110330581665, + "learning_rate": 5.401684951634144e-05, + "loss": 1.7536, + "step": 15971 + }, + { + "epoch": 4.902394106813996, + "grad_norm": 0.2480020374059677, + "learning_rate": 5.401189500268256e-05, + "loss": 1.7877, + "step": 15972 + }, + { + "epoch": 4.902701043585021, + "grad_norm": 0.3302663564682007, + "learning_rate": 5.400694044937579e-05, + "loss": 1.8693, + "step": 15973 + }, + { + "epoch": 4.903007980356047, + "grad_norm": 0.2500915825366974, + "learning_rate": 5.400198585647008e-05, + "loss": 1.7489, + "step": 15974 + }, + { + "epoch": 4.903314917127072, + "grad_norm": 0.25079864263534546, + "learning_rate": 5.399703122401441e-05, + "loss": 1.7965, + "step": 15975 + }, + { + "epoch": 4.903621853898097, + "grad_norm": 0.2643207907676697, + "learning_rate": 5.399207655205771e-05, + "loss": 1.7696, + "step": 15976 + }, + { + "epoch": 4.903928790669122, + "grad_norm": 0.23719522356987, + "learning_rate": 5.398712184064899e-05, + 
"loss": 1.7608, + "step": 15977 + }, + { + "epoch": 4.904235727440147, + "grad_norm": 0.25226888060569763, + "learning_rate": 5.3982167089837184e-05, + "loss": 1.8055, + "step": 15978 + }, + { + "epoch": 4.9045426642111725, + "grad_norm": 0.21601852774620056, + "learning_rate": 5.39772122996713e-05, + "loss": 1.7553, + "step": 15979 + }, + { + "epoch": 4.904849600982198, + "grad_norm": 0.20275430381298065, + "learning_rate": 5.397225747020023e-05, + "loss": 1.7221, + "step": 15980 + }, + { + "epoch": 4.905156537753223, + "grad_norm": 0.24815937876701355, + "learning_rate": 5.3967302601473e-05, + "loss": 1.8098, + "step": 15981 + }, + { + "epoch": 4.9054634745242485, + "grad_norm": 0.2193612903356552, + "learning_rate": 5.3962347693538575e-05, + "loss": 1.7116, + "step": 15982 + }, + { + "epoch": 4.905770411295273, + "grad_norm": 0.21409118175506592, + "learning_rate": 5.395739274644589e-05, + "loss": 1.7503, + "step": 15983 + }, + { + "epoch": 4.906077348066298, + "grad_norm": 0.20907564461231232, + "learning_rate": 5.3952437760243935e-05, + "loss": 1.7518, + "step": 15984 + }, + { + "epoch": 4.906384284837324, + "grad_norm": 0.21193571388721466, + "learning_rate": 5.394748273498168e-05, + "loss": 1.6905, + "step": 15985 + }, + { + "epoch": 4.906691221608349, + "grad_norm": 0.19729891419410706, + "learning_rate": 5.394252767070808e-05, + "loss": 1.7398, + "step": 15986 + }, + { + "epoch": 4.906998158379373, + "grad_norm": 0.2654789686203003, + "learning_rate": 5.393757256747211e-05, + "loss": 1.7931, + "step": 15987 + }, + { + "epoch": 4.907305095150399, + "grad_norm": 0.2627345025539398, + "learning_rate": 5.3932617425322726e-05, + "loss": 1.8174, + "step": 15988 + }, + { + "epoch": 4.907612031921424, + "grad_norm": 0.27162298560142517, + "learning_rate": 5.392766224430894e-05, + "loss": 1.8015, + "step": 15989 + }, + { + "epoch": 4.907918968692449, + "grad_norm": 0.24248667061328888, + "learning_rate": 5.3922707024479676e-05, + "loss": 1.7457, + "step": 15990 + 
}, + { + "epoch": 4.908225905463475, + "grad_norm": 0.24715331196784973, + "learning_rate": 5.391775176588393e-05, + "loss": 1.7724, + "step": 15991 + }, + { + "epoch": 4.9085328422345, + "grad_norm": 0.26335644721984863, + "learning_rate": 5.3912796468570656e-05, + "loss": 1.7183, + "step": 15992 + }, + { + "epoch": 4.9088397790055245, + "grad_norm": 0.23459944128990173, + "learning_rate": 5.3907841132588843e-05, + "loss": 1.7245, + "step": 15993 + }, + { + "epoch": 4.90914671577655, + "grad_norm": 0.21779637038707733, + "learning_rate": 5.3902885757987444e-05, + "loss": 1.7485, + "step": 15994 + }, + { + "epoch": 4.909453652547575, + "grad_norm": 0.227664977312088, + "learning_rate": 5.389793034481545e-05, + "loss": 1.7418, + "step": 15995 + }, + { + "epoch": 4.9097605893186005, + "grad_norm": 0.26230278611183167, + "learning_rate": 5.389297489312183e-05, + "loss": 1.7619, + "step": 15996 + }, + { + "epoch": 4.910067526089626, + "grad_norm": 0.22563579678535461, + "learning_rate": 5.388801940295555e-05, + "loss": 1.7168, + "step": 15997 + }, + { + "epoch": 4.91037446286065, + "grad_norm": 0.24829435348510742, + "learning_rate": 5.388306387436556e-05, + "loss": 1.7422, + "step": 15998 + }, + { + "epoch": 4.910681399631676, + "grad_norm": 0.24395976960659027, + "learning_rate": 5.387810830740088e-05, + "loss": 1.7783, + "step": 15999 + }, + { + "epoch": 4.910988336402701, + "grad_norm": 0.2189297378063202, + "learning_rate": 5.387315270211044e-05, + "loss": 1.7885, + "step": 16000 + }, + { + "epoch": 4.911295273173726, + "grad_norm": 0.21750971674919128, + "learning_rate": 5.386819705854324e-05, + "loss": 1.7659, + "step": 16001 + }, + { + "epoch": 4.911602209944752, + "grad_norm": 0.21907657384872437, + "learning_rate": 5.386324137674826e-05, + "loss": 1.789, + "step": 16002 + }, + { + "epoch": 4.911909146715777, + "grad_norm": 0.18778781592845917, + "learning_rate": 5.3858285656774465e-05, + "loss": 1.7151, + "step": 16003 + }, + { + "epoch": 4.912216083486801, + 
"grad_norm": 0.24217712879180908, + "learning_rate": 5.385332989867082e-05, + "loss": 1.8108, + "step": 16004 + }, + { + "epoch": 4.912523020257827, + "grad_norm": 0.27637016773223877, + "learning_rate": 5.384837410248632e-05, + "loss": 1.8368, + "step": 16005 + }, + { + "epoch": 4.912829957028852, + "grad_norm": 0.22366084158420563, + "learning_rate": 5.3843418268269926e-05, + "loss": 1.7351, + "step": 16006 + }, + { + "epoch": 4.913136893799877, + "grad_norm": 0.2742357552051544, + "learning_rate": 5.383846239607062e-05, + "loss": 1.7599, + "step": 16007 + }, + { + "epoch": 4.913443830570903, + "grad_norm": 0.2288598269224167, + "learning_rate": 5.383350648593738e-05, + "loss": 1.7056, + "step": 16008 + }, + { + "epoch": 4.913750767341927, + "grad_norm": 0.23319020867347717, + "learning_rate": 5.382855053791919e-05, + "loss": 1.7356, + "step": 16009 + }, + { + "epoch": 4.9140577041129525, + "grad_norm": 0.2232198268175125, + "learning_rate": 5.382359455206499e-05, + "loss": 1.7375, + "step": 16010 + }, + { + "epoch": 4.914364640883978, + "grad_norm": 0.24420048296451569, + "learning_rate": 5.381863852842381e-05, + "loss": 1.8287, + "step": 16011 + }, + { + "epoch": 4.914671577655003, + "grad_norm": 0.22653080523014069, + "learning_rate": 5.381368246704461e-05, + "loss": 1.7137, + "step": 16012 + }, + { + "epoch": 4.9149785144260285, + "grad_norm": 0.20439405739307404, + "learning_rate": 5.380872636797637e-05, + "loss": 1.7688, + "step": 16013 + }, + { + "epoch": 4.915285451197054, + "grad_norm": 0.2602155804634094, + "learning_rate": 5.380377023126806e-05, + "loss": 1.7875, + "step": 16014 + }, + { + "epoch": 4.915592387968078, + "grad_norm": 0.2757892608642578, + "learning_rate": 5.3798814056968647e-05, + "loss": 1.7446, + "step": 16015 + }, + { + "epoch": 4.915899324739104, + "grad_norm": 0.25938209891319275, + "learning_rate": 5.379385784512714e-05, + "loss": 1.6997, + "step": 16016 + }, + { + "epoch": 4.916206261510129, + "grad_norm": 0.2056962549686432, + 
"learning_rate": 5.37889015957925e-05, + "loss": 1.6961, + "step": 16017 + }, + { + "epoch": 4.916513198281154, + "grad_norm": 0.24388402700424194, + "learning_rate": 5.3783945309013714e-05, + "loss": 1.712, + "step": 16018 + }, + { + "epoch": 4.91682013505218, + "grad_norm": 0.2381993532180786, + "learning_rate": 5.3778988984839775e-05, + "loss": 1.7444, + "step": 16019 + }, + { + "epoch": 4.917127071823204, + "grad_norm": 0.20201562345027924, + "learning_rate": 5.377403262331964e-05, + "loss": 1.7254, + "step": 16020 + }, + { + "epoch": 4.917434008594229, + "grad_norm": 0.24019409716129303, + "learning_rate": 5.376907622450229e-05, + "loss": 1.684, + "step": 16021 + }, + { + "epoch": 4.917740945365255, + "grad_norm": 0.2441694289445877, + "learning_rate": 5.376411978843674e-05, + "loss": 1.7334, + "step": 16022 + }, + { + "epoch": 4.91804788213628, + "grad_norm": 0.23866300284862518, + "learning_rate": 5.3759163315171945e-05, + "loss": 1.7258, + "step": 16023 + }, + { + "epoch": 4.918354818907305, + "grad_norm": 0.28068670630455017, + "learning_rate": 5.375420680475689e-05, + "loss": 1.8049, + "step": 16024 + }, + { + "epoch": 4.918661755678331, + "grad_norm": 0.2956274151802063, + "learning_rate": 5.3749250257240566e-05, + "loss": 1.8544, + "step": 16025 + }, + { + "epoch": 4.918968692449355, + "grad_norm": 0.1971627175807953, + "learning_rate": 5.374429367267196e-05, + "loss": 1.7314, + "step": 16026 + }, + { + "epoch": 4.9192756292203805, + "grad_norm": 0.28565749526023865, + "learning_rate": 5.373933705110004e-05, + "loss": 1.7587, + "step": 16027 + }, + { + "epoch": 4.919582565991406, + "grad_norm": 0.3087369501590729, + "learning_rate": 5.37343803925738e-05, + "loss": 1.7708, + "step": 16028 + }, + { + "epoch": 4.919889502762431, + "grad_norm": 0.22460010647773743, + "learning_rate": 5.372942369714223e-05, + "loss": 1.7401, + "step": 16029 + }, + { + "epoch": 4.920196439533456, + "grad_norm": 0.29492735862731934, + "learning_rate": 5.3724466964854326e-05, + 
"loss": 1.7033, + "step": 16030 + }, + { + "epoch": 4.920503376304481, + "grad_norm": 0.24452674388885498, + "learning_rate": 5.371951019575904e-05, + "loss": 1.7688, + "step": 16031 + }, + { + "epoch": 4.920810313075506, + "grad_norm": 0.24686957895755768, + "learning_rate": 5.3714553389905366e-05, + "loss": 1.7463, + "step": 16032 + }, + { + "epoch": 4.921117249846532, + "grad_norm": 0.23661597073078156, + "learning_rate": 5.37095965473423e-05, + "loss": 1.7256, + "step": 16033 + }, + { + "epoch": 4.921424186617557, + "grad_norm": 0.22861288487911224, + "learning_rate": 5.370463966811884e-05, + "loss": 1.7722, + "step": 16034 + }, + { + "epoch": 4.921731123388582, + "grad_norm": 0.2453136146068573, + "learning_rate": 5.3699682752283944e-05, + "loss": 1.7343, + "step": 16035 + }, + { + "epoch": 4.922038060159607, + "grad_norm": 0.25267064571380615, + "learning_rate": 5.369472579988663e-05, + "loss": 1.7817, + "step": 16036 + }, + { + "epoch": 4.922344996930632, + "grad_norm": 0.25301575660705566, + "learning_rate": 5.368976881097586e-05, + "loss": 1.8146, + "step": 16037 + }, + { + "epoch": 4.922651933701657, + "grad_norm": 0.23579831421375275, + "learning_rate": 5.368481178560062e-05, + "loss": 1.8089, + "step": 16038 + }, + { + "epoch": 4.922958870472683, + "grad_norm": 0.2181949019432068, + "learning_rate": 5.367985472380993e-05, + "loss": 1.7689, + "step": 16039 + }, + { + "epoch": 4.923265807243708, + "grad_norm": 0.24622827768325806, + "learning_rate": 5.367489762565276e-05, + "loss": 1.791, + "step": 16040 + }, + { + "epoch": 4.9235727440147325, + "grad_norm": 0.2545134723186493, + "learning_rate": 5.3669940491178084e-05, + "loss": 1.738, + "step": 16041 + }, + { + "epoch": 4.923879680785758, + "grad_norm": 0.258139431476593, + "learning_rate": 5.366498332043491e-05, + "loss": 1.8303, + "step": 16042 + }, + { + "epoch": 4.924186617556783, + "grad_norm": 0.23804105818271637, + "learning_rate": 5.366002611347223e-05, + "loss": 1.751, + "step": 16043 + }, + { 
+ "epoch": 4.9244935543278086, + "grad_norm": 0.2354477345943451, + "learning_rate": 5.365506887033901e-05, + "loss": 1.7911, + "step": 16044 + }, + { + "epoch": 4.924800491098834, + "grad_norm": 0.22212550044059753, + "learning_rate": 5.3650111591084276e-05, + "loss": 1.7439, + "step": 16045 + }, + { + "epoch": 4.925107427869859, + "grad_norm": 0.23621168732643127, + "learning_rate": 5.3645154275756984e-05, + "loss": 1.7339, + "step": 16046 + }, + { + "epoch": 4.925414364640884, + "grad_norm": 0.2163209468126297, + "learning_rate": 5.364019692440616e-05, + "loss": 1.7247, + "step": 16047 + }, + { + "epoch": 4.925721301411909, + "grad_norm": 0.21352291107177734, + "learning_rate": 5.3635239537080774e-05, + "loss": 1.7431, + "step": 16048 + }, + { + "epoch": 4.926028238182934, + "grad_norm": 0.3170754909515381, + "learning_rate": 5.36302821138298e-05, + "loss": 1.8075, + "step": 16049 + }, + { + "epoch": 4.92633517495396, + "grad_norm": 0.27073633670806885, + "learning_rate": 5.362532465470226e-05, + "loss": 1.7209, + "step": 16050 + }, + { + "epoch": 4.926642111724985, + "grad_norm": 0.2677803039550781, + "learning_rate": 5.362036715974714e-05, + "loss": 1.7454, + "step": 16051 + }, + { + "epoch": 4.9269490484960095, + "grad_norm": 0.3555704355239868, + "learning_rate": 5.3615409629013436e-05, + "loss": 1.7737, + "step": 16052 + }, + { + "epoch": 4.927255985267035, + "grad_norm": 0.2819947302341461, + "learning_rate": 5.3610452062550124e-05, + "loss": 1.7588, + "step": 16053 + }, + { + "epoch": 4.92756292203806, + "grad_norm": 0.26638996601104736, + "learning_rate": 5.360549446040621e-05, + "loss": 1.8078, + "step": 16054 + }, + { + "epoch": 4.9278698588090855, + "grad_norm": 0.37828773260116577, + "learning_rate": 5.360053682263069e-05, + "loss": 1.7527, + "step": 16055 + }, + { + "epoch": 4.928176795580111, + "grad_norm": 0.35836395621299744, + "learning_rate": 5.359557914927254e-05, + "loss": 1.7199, + "step": 16056 + }, + { + "epoch": 4.928483732351136, + 
"grad_norm": 0.2720802128314972, + "learning_rate": 5.359062144038078e-05, + "loss": 1.7598, + "step": 16057 + }, + { + "epoch": 4.928790669122161, + "grad_norm": 0.36662939190864563, + "learning_rate": 5.358566369600441e-05, + "loss": 1.7199, + "step": 16058 + }, + { + "epoch": 4.929097605893186, + "grad_norm": 0.42243221402168274, + "learning_rate": 5.3580705916192395e-05, + "loss": 1.7584, + "step": 16059 + }, + { + "epoch": 4.929404542664211, + "grad_norm": 0.21667765080928802, + "learning_rate": 5.357574810099375e-05, + "loss": 1.7608, + "step": 16060 + }, + { + "epoch": 4.929711479435237, + "grad_norm": 0.48101645708084106, + "learning_rate": 5.3570790250457456e-05, + "loss": 1.8157, + "step": 16061 + }, + { + "epoch": 4.930018416206261, + "grad_norm": 0.5289245843887329, + "learning_rate": 5.356583236463253e-05, + "loss": 1.7173, + "step": 16062 + }, + { + "epoch": 4.930325352977286, + "grad_norm": 0.21454930305480957, + "learning_rate": 5.356087444356795e-05, + "loss": 1.7399, + "step": 16063 + }, + { + "epoch": 4.930632289748312, + "grad_norm": 0.5648324489593506, + "learning_rate": 5.355591648731274e-05, + "loss": 1.7814, + "step": 16064 + }, + { + "epoch": 4.930939226519337, + "grad_norm": 0.5669483542442322, + "learning_rate": 5.355095849591587e-05, + "loss": 1.7769, + "step": 16065 + }, + { + "epoch": 4.931246163290362, + "grad_norm": 0.33108505606651306, + "learning_rate": 5.354600046942635e-05, + "loss": 1.7704, + "step": 16066 + }, + { + "epoch": 4.931553100061388, + "grad_norm": 0.31149306893348694, + "learning_rate": 5.3541042407893164e-05, + "loss": 1.7631, + "step": 16067 + }, + { + "epoch": 4.931860036832412, + "grad_norm": 0.30377596616744995, + "learning_rate": 5.353608431136532e-05, + "loss": 1.7888, + "step": 16068 + }, + { + "epoch": 4.9321669736034375, + "grad_norm": 0.25041452050209045, + "learning_rate": 5.3531126179891825e-05, + "loss": 1.7507, + "step": 16069 + }, + { + "epoch": 4.932473910374463, + "grad_norm": 0.33900725841522217, + 
"learning_rate": 5.352616801352167e-05, + "loss": 1.7365, + "step": 16070 + }, + { + "epoch": 4.932780847145488, + "grad_norm": 0.23939846456050873, + "learning_rate": 5.352120981230386e-05, + "loss": 1.7934, + "step": 16071 + }, + { + "epoch": 4.9330877839165135, + "grad_norm": 0.2419881969690323, + "learning_rate": 5.351625157628739e-05, + "loss": 1.7555, + "step": 16072 + }, + { + "epoch": 4.933394720687538, + "grad_norm": 0.3517596423625946, + "learning_rate": 5.351129330552125e-05, + "loss": 1.7102, + "step": 16073 + }, + { + "epoch": 4.933701657458563, + "grad_norm": 0.2660250663757324, + "learning_rate": 5.350633500005446e-05, + "loss": 1.7692, + "step": 16074 + }, + { + "epoch": 4.934008594229589, + "grad_norm": 0.20726454257965088, + "learning_rate": 5.350137665993601e-05, + "loss": 1.718, + "step": 16075 + }, + { + "epoch": 4.934315531000614, + "grad_norm": 0.28218522667884827, + "learning_rate": 5.3496418285214914e-05, + "loss": 1.8402, + "step": 16076 + }, + { + "epoch": 4.934622467771639, + "grad_norm": 0.2142515480518341, + "learning_rate": 5.349145987594015e-05, + "loss": 1.7571, + "step": 16077 + }, + { + "epoch": 4.934929404542665, + "grad_norm": 0.2777026891708374, + "learning_rate": 5.348650143216074e-05, + "loss": 1.7617, + "step": 16078 + }, + { + "epoch": 4.935236341313689, + "grad_norm": 0.24057620763778687, + "learning_rate": 5.348154295392567e-05, + "loss": 1.7149, + "step": 16079 + }, + { + "epoch": 4.935543278084714, + "grad_norm": 0.22220350801944733, + "learning_rate": 5.3476584441283964e-05, + "loss": 1.7402, + "step": 16080 + }, + { + "epoch": 4.93585021485574, + "grad_norm": 0.2451290488243103, + "learning_rate": 5.347162589428462e-05, + "loss": 1.7004, + "step": 16081 + }, + { + "epoch": 4.936157151626765, + "grad_norm": 0.25621771812438965, + "learning_rate": 5.3466667312976625e-05, + "loss": 1.7765, + "step": 16082 + }, + { + "epoch": 4.93646408839779, + "grad_norm": 0.217393159866333, + "learning_rate": 5.346170869740899e-05, + 
"loss": 1.7695, + "step": 16083 + }, + { + "epoch": 4.936771025168815, + "grad_norm": 0.21248537302017212, + "learning_rate": 5.345675004763071e-05, + "loss": 1.7277, + "step": 16084 + }, + { + "epoch": 4.93707796193984, + "grad_norm": 0.19431474804878235, + "learning_rate": 5.3451791363690805e-05, + "loss": 1.7352, + "step": 16085 + }, + { + "epoch": 4.9373848987108655, + "grad_norm": 0.20233909785747528, + "learning_rate": 5.344683264563829e-05, + "loss": 1.71, + "step": 16086 + }, + { + "epoch": 4.937691835481891, + "grad_norm": 0.2199622094631195, + "learning_rate": 5.344187389352214e-05, + "loss": 1.7443, + "step": 16087 + }, + { + "epoch": 4.937998772252916, + "grad_norm": 0.23495158553123474, + "learning_rate": 5.343691510739138e-05, + "loss": 1.7758, + "step": 16088 + }, + { + "epoch": 4.9383057090239415, + "grad_norm": 0.228348970413208, + "learning_rate": 5.3431956287295015e-05, + "loss": 1.7645, + "step": 16089 + }, + { + "epoch": 4.938612645794966, + "grad_norm": 0.2337537258863449, + "learning_rate": 5.342699743328203e-05, + "loss": 1.7353, + "step": 16090 + }, + { + "epoch": 4.938919582565991, + "grad_norm": 0.1899309754371643, + "learning_rate": 5.3422038545401454e-05, + "loss": 1.6907, + "step": 16091 + }, + { + "epoch": 4.939226519337017, + "grad_norm": 0.2479192316532135, + "learning_rate": 5.341707962370229e-05, + "loss": 1.7961, + "step": 16092 + }, + { + "epoch": 4.939533456108042, + "grad_norm": 0.2444314956665039, + "learning_rate": 5.341212066823355e-05, + "loss": 1.7768, + "step": 16093 + }, + { + "epoch": 4.939840392879067, + "grad_norm": 0.2123393714427948, + "learning_rate": 5.340716167904423e-05, + "loss": 1.7617, + "step": 16094 + }, + { + "epoch": 4.940147329650092, + "grad_norm": 0.20779116451740265, + "learning_rate": 5.340220265618334e-05, + "loss": 1.6951, + "step": 16095 + }, + { + "epoch": 4.940454266421117, + "grad_norm": 0.22189265489578247, + "learning_rate": 5.3397243599699884e-05, + "loss": 1.8368, + "step": 16096 + }, + { 
+ "epoch": 4.940761203192142, + "grad_norm": 0.22316497564315796, + "learning_rate": 5.3392284509642875e-05, + "loss": 1.7096, + "step": 16097 + }, + { + "epoch": 4.941068139963168, + "grad_norm": 0.20406664907932281, + "learning_rate": 5.3387325386061346e-05, + "loss": 1.7269, + "step": 16098 + }, + { + "epoch": 4.941375076734193, + "grad_norm": 0.263007789850235, + "learning_rate": 5.338236622900427e-05, + "loss": 1.7663, + "step": 16099 + }, + { + "epoch": 4.941682013505218, + "grad_norm": 0.24388311803340912, + "learning_rate": 5.3377407038520654e-05, + "loss": 1.7113, + "step": 16100 + }, + { + "epoch": 4.941988950276243, + "grad_norm": 0.21918313205242157, + "learning_rate": 5.3372447814659524e-05, + "loss": 1.775, + "step": 16101 + }, + { + "epoch": 4.942295887047268, + "grad_norm": 0.30842962861061096, + "learning_rate": 5.336748855746989e-05, + "loss": 1.8229, + "step": 16102 + }, + { + "epoch": 4.9426028238182935, + "grad_norm": 0.2875657379627228, + "learning_rate": 5.336252926700077e-05, + "loss": 1.7377, + "step": 16103 + }, + { + "epoch": 4.942909760589319, + "grad_norm": 0.23411425948143005, + "learning_rate": 5.3357569943301156e-05, + "loss": 1.754, + "step": 16104 + }, + { + "epoch": 4.943216697360343, + "grad_norm": 0.29758864641189575, + "learning_rate": 5.335261058642007e-05, + "loss": 1.7471, + "step": 16105 + }, + { + "epoch": 4.943523634131369, + "grad_norm": 0.31761085987091064, + "learning_rate": 5.3347651196406534e-05, + "loss": 1.7658, + "step": 16106 + }, + { + "epoch": 4.943830570902394, + "grad_norm": 0.2487023025751114, + "learning_rate": 5.334269177330952e-05, + "loss": 1.786, + "step": 16107 + }, + { + "epoch": 4.944137507673419, + "grad_norm": 0.23954913020133972, + "learning_rate": 5.333773231717808e-05, + "loss": 1.8486, + "step": 16108 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.24893096089363098, + "learning_rate": 5.3332772828061214e-05, + "loss": 1.7927, + "step": 16109 + }, + { + "epoch": 4.94475138121547, + 
"grad_norm": 0.28653839230537415, + "learning_rate": 5.332781330600795e-05, + "loss": 1.8331, + "step": 16110 + }, + { + "epoch": 4.945058317986494, + "grad_norm": 0.2597404718399048, + "learning_rate": 5.332285375106726e-05, + "loss": 1.7128, + "step": 16111 + }, + { + "epoch": 4.94536525475752, + "grad_norm": 0.23813198506832123, + "learning_rate": 5.3317894163288196e-05, + "loss": 1.7483, + "step": 16112 + }, + { + "epoch": 4.945672191528545, + "grad_norm": 0.2545793652534485, + "learning_rate": 5.331293454271974e-05, + "loss": 1.7987, + "step": 16113 + }, + { + "epoch": 4.94597912829957, + "grad_norm": 0.2453712821006775, + "learning_rate": 5.330797488941095e-05, + "loss": 1.7376, + "step": 16114 + }, + { + "epoch": 4.946286065070596, + "grad_norm": 0.20583751797676086, + "learning_rate": 5.33030152034108e-05, + "loss": 1.7038, + "step": 16115 + }, + { + "epoch": 4.94659300184162, + "grad_norm": 0.22557811439037323, + "learning_rate": 5.3298055484768313e-05, + "loss": 1.6999, + "step": 16116 + }, + { + "epoch": 4.9468999386126455, + "grad_norm": 0.23163801431655884, + "learning_rate": 5.329309573353252e-05, + "loss": 1.7575, + "step": 16117 + }, + { + "epoch": 4.947206875383671, + "grad_norm": 0.3560176491737366, + "learning_rate": 5.3288135949752394e-05, + "loss": 1.8494, + "step": 16118 + }, + { + "epoch": 4.947513812154696, + "grad_norm": 0.306379109621048, + "learning_rate": 5.328317613347701e-05, + "loss": 1.7229, + "step": 16119 + }, + { + "epoch": 4.9478207489257215, + "grad_norm": 0.24428823590278625, + "learning_rate": 5.3278216284755344e-05, + "loss": 1.7939, + "step": 16120 + }, + { + "epoch": 4.948127685696747, + "grad_norm": 0.22251521050930023, + "learning_rate": 5.327325640363643e-05, + "loss": 1.7624, + "step": 16121 + }, + { + "epoch": 4.948434622467771, + "grad_norm": 0.23310889303684235, + "learning_rate": 5.326829649016928e-05, + "loss": 1.7727, + "step": 16122 + }, + { + "epoch": 4.948741559238797, + "grad_norm": 0.22457881271839142, + 
"learning_rate": 5.326333654440291e-05, + "loss": 1.7602, + "step": 16123 + }, + { + "epoch": 4.949048496009822, + "grad_norm": 0.24032343924045563, + "learning_rate": 5.325837656638631e-05, + "loss": 1.7591, + "step": 16124 + }, + { + "epoch": 4.949355432780847, + "grad_norm": 0.25082892179489136, + "learning_rate": 5.3253416556168546e-05, + "loss": 1.7745, + "step": 16125 + }, + { + "epoch": 4.949662369551873, + "grad_norm": 0.22859038412570953, + "learning_rate": 5.3248456513798615e-05, + "loss": 1.7475, + "step": 16126 + }, + { + "epoch": 4.949969306322897, + "grad_norm": 0.27282553911209106, + "learning_rate": 5.3243496439325525e-05, + "loss": 1.7438, + "step": 16127 + }, + { + "epoch": 4.9502762430939224, + "grad_norm": 0.23622353374958038, + "learning_rate": 5.3238536332798303e-05, + "loss": 1.7625, + "step": 16128 + }, + { + "epoch": 4.950583179864948, + "grad_norm": 0.28060024976730347, + "learning_rate": 5.3233576194265975e-05, + "loss": 1.8028, + "step": 16129 + }, + { + "epoch": 4.950890116635973, + "grad_norm": 0.33281829953193665, + "learning_rate": 5.322861602377755e-05, + "loss": 1.7163, + "step": 16130 + }, + { + "epoch": 4.9511970534069984, + "grad_norm": 0.26457497477531433, + "learning_rate": 5.322365582138203e-05, + "loss": 1.7347, + "step": 16131 + }, + { + "epoch": 4.951503990178024, + "grad_norm": 0.21651674807071686, + "learning_rate": 5.3218695587128476e-05, + "loss": 1.7123, + "step": 16132 + }, + { + "epoch": 4.951810926949048, + "grad_norm": 0.2299882024526596, + "learning_rate": 5.3213735321065885e-05, + "loss": 1.775, + "step": 16133 + }, + { + "epoch": 4.952117863720074, + "grad_norm": 0.2252396047115326, + "learning_rate": 5.3208775023243265e-05, + "loss": 1.7598, + "step": 16134 + }, + { + "epoch": 4.952424800491099, + "grad_norm": 0.2263660430908203, + "learning_rate": 5.3203814693709655e-05, + "loss": 1.7519, + "step": 16135 + }, + { + "epoch": 4.952731737262124, + "grad_norm": 0.2425432950258255, + "learning_rate": 
5.3198854332514056e-05, + "loss": 1.7769, + "step": 16136 + }, + { + "epoch": 4.953038674033149, + "grad_norm": 0.22624996304512024, + "learning_rate": 5.319389393970553e-05, + "loss": 1.7686, + "step": 16137 + }, + { + "epoch": 4.953345610804174, + "grad_norm": 0.2240568846464157, + "learning_rate": 5.318893351533306e-05, + "loss": 1.7795, + "step": 16138 + }, + { + "epoch": 4.953652547575199, + "grad_norm": 0.21708132326602936, + "learning_rate": 5.318397305944568e-05, + "loss": 1.7348, + "step": 16139 + }, + { + "epoch": 4.953959484346225, + "grad_norm": 0.2263328731060028, + "learning_rate": 5.3179012572092415e-05, + "loss": 1.7645, + "step": 16140 + }, + { + "epoch": 4.95426642111725, + "grad_norm": 0.2541986107826233, + "learning_rate": 5.3174052053322274e-05, + "loss": 1.723, + "step": 16141 + }, + { + "epoch": 4.954573357888275, + "grad_norm": 0.25829461216926575, + "learning_rate": 5.316909150318429e-05, + "loss": 1.7469, + "step": 16142 + }, + { + "epoch": 4.9548802946593, + "grad_norm": 0.21251125633716583, + "learning_rate": 5.3164130921727494e-05, + "loss": 1.7699, + "step": 16143 + }, + { + "epoch": 4.955187231430325, + "grad_norm": 0.29195618629455566, + "learning_rate": 5.315917030900091e-05, + "loss": 1.7373, + "step": 16144 + }, + { + "epoch": 4.9554941682013505, + "grad_norm": 0.29457888007164, + "learning_rate": 5.315420966505355e-05, + "loss": 1.7202, + "step": 16145 + }, + { + "epoch": 4.955801104972376, + "grad_norm": 0.19679461419582367, + "learning_rate": 5.314924898993443e-05, + "loss": 1.75, + "step": 16146 + }, + { + "epoch": 4.956108041743401, + "grad_norm": 0.287955105304718, + "learning_rate": 5.314428828369259e-05, + "loss": 1.7385, + "step": 16147 + }, + { + "epoch": 4.956414978514426, + "grad_norm": 0.3081825375556946, + "learning_rate": 5.313932754637706e-05, + "loss": 1.7558, + "step": 16148 + }, + { + "epoch": 4.956721915285451, + "grad_norm": 0.25226521492004395, + "learning_rate": 5.3134366778036846e-05, + "loss": 1.8407, + 
"step": 16149 + }, + { + "epoch": 4.957028852056476, + "grad_norm": 0.43601852655410767, + "learning_rate": 5.3129405978720984e-05, + "loss": 1.7762, + "step": 16150 + }, + { + "epoch": 4.957335788827502, + "grad_norm": 0.3630274832248688, + "learning_rate": 5.31244451484785e-05, + "loss": 1.7802, + "step": 16151 + }, + { + "epoch": 4.957642725598527, + "grad_norm": 0.21337948739528656, + "learning_rate": 5.311948428735841e-05, + "loss": 1.7107, + "step": 16152 + }, + { + "epoch": 4.957949662369552, + "grad_norm": 0.38581085205078125, + "learning_rate": 5.311452339540974e-05, + "loss": 1.7583, + "step": 16153 + }, + { + "epoch": 4.958256599140577, + "grad_norm": 0.28447309136390686, + "learning_rate": 5.310956247268154e-05, + "loss": 1.6992, + "step": 16154 + }, + { + "epoch": 4.958563535911602, + "grad_norm": 0.24510730803012848, + "learning_rate": 5.310460151922283e-05, + "loss": 1.7059, + "step": 16155 + }, + { + "epoch": 4.958870472682627, + "grad_norm": 0.41670146584510803, + "learning_rate": 5.309964053508262e-05, + "loss": 1.7191, + "step": 16156 + }, + { + "epoch": 4.959177409453653, + "grad_norm": 0.3123849034309387, + "learning_rate": 5.309467952030993e-05, + "loss": 1.7161, + "step": 16157 + }, + { + "epoch": 4.959484346224678, + "grad_norm": 0.2275281697511673, + "learning_rate": 5.308971847495382e-05, + "loss": 1.722, + "step": 16158 + }, + { + "epoch": 4.9597912829957025, + "grad_norm": 0.40216436982154846, + "learning_rate": 5.308475739906329e-05, + "loss": 1.7477, + "step": 16159 + }, + { + "epoch": 4.960098219766728, + "grad_norm": 0.259981244802475, + "learning_rate": 5.307979629268739e-05, + "loss": 1.7384, + "step": 16160 + }, + { + "epoch": 4.960405156537753, + "grad_norm": 0.22969573736190796, + "learning_rate": 5.3074835155875134e-05, + "loss": 1.7328, + "step": 16161 + }, + { + "epoch": 4.9607120933087785, + "grad_norm": 0.2773746848106384, + "learning_rate": 5.3069873988675556e-05, + "loss": 1.7333, + "step": 16162 + }, + { + "epoch": 
4.961019030079804, + "grad_norm": 0.2764189541339874, + "learning_rate": 5.306491279113768e-05, + "loss": 1.7956, + "step": 16163 + }, + { + "epoch": 4.961325966850829, + "grad_norm": 0.3640958070755005, + "learning_rate": 5.305995156331054e-05, + "loss": 1.7464, + "step": 16164 + }, + { + "epoch": 4.961632903621854, + "grad_norm": 0.3573450446128845, + "learning_rate": 5.305499030524317e-05, + "loss": 1.75, + "step": 16165 + }, + { + "epoch": 4.961939840392879, + "grad_norm": 0.24313980340957642, + "learning_rate": 5.305002901698459e-05, + "loss": 1.7505, + "step": 16166 + }, + { + "epoch": 4.962246777163904, + "grad_norm": 0.3417615592479706, + "learning_rate": 5.304506769858384e-05, + "loss": 1.7387, + "step": 16167 + }, + { + "epoch": 4.96255371393493, + "grad_norm": 0.23209623992443085, + "learning_rate": 5.304010635008995e-05, + "loss": 1.7111, + "step": 16168 + }, + { + "epoch": 4.962860650705955, + "grad_norm": 0.2994776666164398, + "learning_rate": 5.3035144971551944e-05, + "loss": 1.75, + "step": 16169 + }, + { + "epoch": 4.963167587476979, + "grad_norm": 0.3147084712982178, + "learning_rate": 5.303018356301884e-05, + "loss": 1.7598, + "step": 16170 + }, + { + "epoch": 4.963474524248005, + "grad_norm": 0.20136526226997375, + "learning_rate": 5.30252221245397e-05, + "loss": 1.7217, + "step": 16171 + }, + { + "epoch": 4.96378146101903, + "grad_norm": 0.3308684229850769, + "learning_rate": 5.302026065616355e-05, + "loss": 1.7554, + "step": 16172 + }, + { + "epoch": 4.964088397790055, + "grad_norm": 0.22890877723693848, + "learning_rate": 5.30152991579394e-05, + "loss": 1.7598, + "step": 16173 + }, + { + "epoch": 4.964395334561081, + "grad_norm": 0.3036035895347595, + "learning_rate": 5.301033762991631e-05, + "loss": 1.758, + "step": 16174 + }, + { + "epoch": 4.964702271332106, + "grad_norm": 0.2983579933643341, + "learning_rate": 5.300537607214329e-05, + "loss": 1.8132, + "step": 16175 + }, + { + "epoch": 4.9650092081031305, + "grad_norm": 
0.21401815116405487, + "learning_rate": 5.300041448466937e-05, + "loss": 1.7179, + "step": 16176 + }, + { + "epoch": 4.965316144874156, + "grad_norm": 0.2939651608467102, + "learning_rate": 5.2995452867543606e-05, + "loss": 1.7928, + "step": 16177 + }, + { + "epoch": 4.965623081645181, + "grad_norm": 0.24803484976291656, + "learning_rate": 5.2990491220815034e-05, + "loss": 1.7366, + "step": 16178 + }, + { + "epoch": 4.9659300184162065, + "grad_norm": 0.1999569535255432, + "learning_rate": 5.2985529544532656e-05, + "loss": 1.6691, + "step": 16179 + }, + { + "epoch": 4.966236955187231, + "grad_norm": 0.22315269708633423, + "learning_rate": 5.298056783874553e-05, + "loss": 1.7693, + "step": 16180 + }, + { + "epoch": 4.966543891958256, + "grad_norm": 0.22688794136047363, + "learning_rate": 5.2975606103502694e-05, + "loss": 1.8401, + "step": 16181 + }, + { + "epoch": 4.966850828729282, + "grad_norm": 0.2592024505138397, + "learning_rate": 5.297064433885317e-05, + "loss": 1.8054, + "step": 16182 + }, + { + "epoch": 4.967157765500307, + "grad_norm": 0.2508920133113861, + "learning_rate": 5.2965682544846e-05, + "loss": 1.766, + "step": 16183 + }, + { + "epoch": 4.967464702271332, + "grad_norm": 0.22318799793720245, + "learning_rate": 5.296072072153022e-05, + "loss": 1.751, + "step": 16184 + }, + { + "epoch": 4.967771639042358, + "grad_norm": 0.2348448485136032, + "learning_rate": 5.2955758868954855e-05, + "loss": 1.7844, + "step": 16185 + }, + { + "epoch": 4.968078575813382, + "grad_norm": 0.23294343054294586, + "learning_rate": 5.295079698716895e-05, + "loss": 1.7685, + "step": 16186 + }, + { + "epoch": 4.968385512584407, + "grad_norm": 0.20854508876800537, + "learning_rate": 5.2945835076221526e-05, + "loss": 1.6914, + "step": 16187 + }, + { + "epoch": 4.968692449355433, + "grad_norm": 0.21952031552791595, + "learning_rate": 5.294087313616165e-05, + "loss": 1.7121, + "step": 16188 + }, + { + "epoch": 4.968999386126458, + "grad_norm": 0.24097788333892822, + 
"learning_rate": 5.2935911167038346e-05, + "loss": 1.7712, + "step": 16189 + }, + { + "epoch": 4.969306322897483, + "grad_norm": 0.24433603882789612, + "learning_rate": 5.293094916890063e-05, + "loss": 1.7608, + "step": 16190 + }, + { + "epoch": 4.969613259668508, + "grad_norm": 0.22209061682224274, + "learning_rate": 5.292598714179757e-05, + "loss": 1.7563, + "step": 16191 + }, + { + "epoch": 4.969920196439533, + "grad_norm": 0.24291595816612244, + "learning_rate": 5.29210250857782e-05, + "loss": 1.7765, + "step": 16192 + }, + { + "epoch": 4.9702271332105585, + "grad_norm": 0.3143673837184906, + "learning_rate": 5.291606300089151e-05, + "loss": 1.7945, + "step": 16193 + }, + { + "epoch": 4.970534069981584, + "grad_norm": 0.22693613171577454, + "learning_rate": 5.291110088718661e-05, + "loss": 1.7411, + "step": 16194 + }, + { + "epoch": 4.970841006752609, + "grad_norm": 0.2271365374326706, + "learning_rate": 5.2906138744712494e-05, + "loss": 1.7754, + "step": 16195 + }, + { + "epoch": 4.9711479435236345, + "grad_norm": 0.2428499162197113, + "learning_rate": 5.290117657351822e-05, + "loss": 1.8007, + "step": 16196 + }, + { + "epoch": 4.971454880294659, + "grad_norm": 0.21862711012363434, + "learning_rate": 5.289621437365281e-05, + "loss": 1.7484, + "step": 16197 + }, + { + "epoch": 4.971761817065684, + "grad_norm": 0.26744964718818665, + "learning_rate": 5.2891252145165315e-05, + "loss": 1.7759, + "step": 16198 + }, + { + "epoch": 4.97206875383671, + "grad_norm": 0.2608526647090912, + "learning_rate": 5.288628988810477e-05, + "loss": 1.8527, + "step": 16199 + }, + { + "epoch": 4.972375690607735, + "grad_norm": 0.2245805710554123, + "learning_rate": 5.2881327602520216e-05, + "loss": 1.7773, + "step": 16200 + }, + { + "epoch": 4.97268262737876, + "grad_norm": 0.22023041546344757, + "learning_rate": 5.2876365288460694e-05, + "loss": 1.7101, + "step": 16201 + }, + { + "epoch": 4.972989564149785, + "grad_norm": 0.22034525871276855, + "learning_rate": 
5.287140294597525e-05, + "loss": 1.7672, + "step": 16202 + }, + { + "epoch": 4.97329650092081, + "grad_norm": 0.23101158440113068, + "learning_rate": 5.286644057511292e-05, + "loss": 1.741, + "step": 16203 + }, + { + "epoch": 4.973603437691835, + "grad_norm": 0.23050430417060852, + "learning_rate": 5.286147817592273e-05, + "loss": 1.7727, + "step": 16204 + }, + { + "epoch": 4.973910374462861, + "grad_norm": 0.21803520619869232, + "learning_rate": 5.285651574845374e-05, + "loss": 1.7353, + "step": 16205 + }, + { + "epoch": 4.974217311233886, + "grad_norm": 0.22252169251441956, + "learning_rate": 5.2851553292754995e-05, + "loss": 1.7658, + "step": 16206 + }, + { + "epoch": 4.974524248004911, + "grad_norm": 0.22458864748477936, + "learning_rate": 5.284659080887552e-05, + "loss": 1.7157, + "step": 16207 + }, + { + "epoch": 4.974831184775936, + "grad_norm": 0.20769210159778595, + "learning_rate": 5.2841628296864376e-05, + "loss": 1.7731, + "step": 16208 + }, + { + "epoch": 4.975138121546961, + "grad_norm": 0.1952340304851532, + "learning_rate": 5.283666575677059e-05, + "loss": 1.6907, + "step": 16209 + }, + { + "epoch": 4.975445058317987, + "grad_norm": 0.21943804621696472, + "learning_rate": 5.28317031886432e-05, + "loss": 1.8007, + "step": 16210 + }, + { + "epoch": 4.975751995089012, + "grad_norm": 0.21987493336200714, + "learning_rate": 5.2826740592531276e-05, + "loss": 1.7205, + "step": 16211 + }, + { + "epoch": 4.976058931860036, + "grad_norm": 0.2076522558927536, + "learning_rate": 5.2821777968483845e-05, + "loss": 1.7063, + "step": 16212 + }, + { + "epoch": 4.976365868631062, + "grad_norm": 0.19126583635807037, + "learning_rate": 5.281681531654994e-05, + "loss": 1.7118, + "step": 16213 + }, + { + "epoch": 4.976672805402087, + "grad_norm": 0.22308050096035004, + "learning_rate": 5.2811852636778625e-05, + "loss": 1.7565, + "step": 16214 + }, + { + "epoch": 4.976979742173112, + "grad_norm": 0.23187528550624847, + "learning_rate": 5.280688992921893e-05, + "loss": 
1.8261, + "step": 16215 + }, + { + "epoch": 4.977286678944138, + "grad_norm": 0.21373791992664337, + "learning_rate": 5.28019271939199e-05, + "loss": 1.6974, + "step": 16216 + }, + { + "epoch": 4.977593615715163, + "grad_norm": 0.21647346019744873, + "learning_rate": 5.2796964430930585e-05, + "loss": 1.7967, + "step": 16217 + }, + { + "epoch": 4.9779005524861875, + "grad_norm": 0.2231660932302475, + "learning_rate": 5.279200164030002e-05, + "loss": 1.7495, + "step": 16218 + }, + { + "epoch": 4.978207489257213, + "grad_norm": 0.2810545563697815, + "learning_rate": 5.278703882207728e-05, + "loss": 1.875, + "step": 16219 + }, + { + "epoch": 4.978514426028238, + "grad_norm": 0.298984557390213, + "learning_rate": 5.2782075976311374e-05, + "loss": 1.7494, + "step": 16220 + }, + { + "epoch": 4.9788213627992635, + "grad_norm": 0.2530893385410309, + "learning_rate": 5.2777113103051365e-05, + "loss": 1.7594, + "step": 16221 + }, + { + "epoch": 4.979128299570289, + "grad_norm": 0.26165664196014404, + "learning_rate": 5.277215020234629e-05, + "loss": 1.7543, + "step": 16222 + }, + { + "epoch": 4.979435236341313, + "grad_norm": 0.25115957856178284, + "learning_rate": 5.276718727424521e-05, + "loss": 1.7925, + "step": 16223 + }, + { + "epoch": 4.979742173112339, + "grad_norm": 0.22134126722812653, + "learning_rate": 5.276222431879716e-05, + "loss": 1.8359, + "step": 16224 + }, + { + "epoch": 4.980049109883364, + "grad_norm": 0.24447613954544067, + "learning_rate": 5.275726133605119e-05, + "loss": 1.7693, + "step": 16225 + }, + { + "epoch": 4.980356046654389, + "grad_norm": 0.23025095462799072, + "learning_rate": 5.275229832605635e-05, + "loss": 1.7911, + "step": 16226 + }, + { + "epoch": 4.980662983425415, + "grad_norm": 0.23424232006072998, + "learning_rate": 5.2747335288861686e-05, + "loss": 1.7628, + "step": 16227 + }, + { + "epoch": 4.98096992019644, + "grad_norm": 0.24598535895347595, + "learning_rate": 5.2742372224516235e-05, + "loss": 1.7651, + "step": 16228 + }, + { + 
"epoch": 4.981276856967464, + "grad_norm": 0.262893944978714, + "learning_rate": 5.273740913306906e-05, + "loss": 1.7282, + "step": 16229 + }, + { + "epoch": 4.98158379373849, + "grad_norm": 0.21981783211231232, + "learning_rate": 5.2732446014569207e-05, + "loss": 1.7448, + "step": 16230 + }, + { + "epoch": 4.981890730509515, + "grad_norm": 0.24244973063468933, + "learning_rate": 5.272748286906573e-05, + "loss": 1.7216, + "step": 16231 + }, + { + "epoch": 4.98219766728054, + "grad_norm": 0.2365221232175827, + "learning_rate": 5.272251969660766e-05, + "loss": 1.7227, + "step": 16232 + }, + { + "epoch": 4.982504604051566, + "grad_norm": 0.2081129401922226, + "learning_rate": 5.271755649724405e-05, + "loss": 1.7184, + "step": 16233 + }, + { + "epoch": 4.98281154082259, + "grad_norm": 0.2256374955177307, + "learning_rate": 5.271259327102395e-05, + "loss": 1.7412, + "step": 16234 + }, + { + "epoch": 4.9831184775936155, + "grad_norm": 0.23727381229400635, + "learning_rate": 5.270763001799643e-05, + "loss": 1.8095, + "step": 16235 + }, + { + "epoch": 4.983425414364641, + "grad_norm": 0.21498435735702515, + "learning_rate": 5.2702666738210504e-05, + "loss": 1.744, + "step": 16236 + }, + { + "epoch": 4.983732351135666, + "grad_norm": 0.24772173166275024, + "learning_rate": 5.269770343171525e-05, + "loss": 1.741, + "step": 16237 + }, + { + "epoch": 4.9840392879066915, + "grad_norm": 0.2835623621940613, + "learning_rate": 5.269274009855971e-05, + "loss": 1.7765, + "step": 16238 + }, + { + "epoch": 4.984346224677717, + "grad_norm": 0.2570044696331024, + "learning_rate": 5.2687776738792926e-05, + "loss": 1.8206, + "step": 16239 + }, + { + "epoch": 4.984653161448741, + "grad_norm": 0.21549640595912933, + "learning_rate": 5.268281335246397e-05, + "loss": 1.7022, + "step": 16240 + }, + { + "epoch": 4.984960098219767, + "grad_norm": 0.23158684372901917, + "learning_rate": 5.267784993962187e-05, + "loss": 1.7882, + "step": 16241 + }, + { + "epoch": 4.985267034990792, + "grad_norm": 
0.22778423130512238, + "learning_rate": 5.26728865003157e-05, + "loss": 1.7358, + "step": 16242 + }, + { + "epoch": 4.985573971761817, + "grad_norm": 0.23197145760059357, + "learning_rate": 5.266792303459449e-05, + "loss": 1.7687, + "step": 16243 + }, + { + "epoch": 4.985880908532843, + "grad_norm": 0.19270172715187073, + "learning_rate": 5.26629595425073e-05, + "loss": 1.6999, + "step": 16244 + }, + { + "epoch": 4.986187845303867, + "grad_norm": 0.25262632966041565, + "learning_rate": 5.2657996024103175e-05, + "loss": 1.7536, + "step": 16245 + }, + { + "epoch": 4.986494782074892, + "grad_norm": 0.18620926141738892, + "learning_rate": 5.2653032479431185e-05, + "loss": 1.7033, + "step": 16246 + }, + { + "epoch": 4.986801718845918, + "grad_norm": 0.19537273049354553, + "learning_rate": 5.2648068908540374e-05, + "loss": 1.7457, + "step": 16247 + }, + { + "epoch": 4.987108655616943, + "grad_norm": 0.19447599351406097, + "learning_rate": 5.26431053114798e-05, + "loss": 1.7053, + "step": 16248 + }, + { + "epoch": 4.987415592387968, + "grad_norm": 0.20431137084960938, + "learning_rate": 5.263814168829852e-05, + "loss": 1.7695, + "step": 16249 + }, + { + "epoch": 4.987722529158994, + "grad_norm": 0.21123024821281433, + "learning_rate": 5.263317803904554e-05, + "loss": 1.7666, + "step": 16250 + }, + { + "epoch": 4.988029465930018, + "grad_norm": 0.21279335021972656, + "learning_rate": 5.262821436376998e-05, + "loss": 1.7231, + "step": 16251 + }, + { + "epoch": 4.9883364027010435, + "grad_norm": 0.22504910826683044, + "learning_rate": 5.262325066252085e-05, + "loss": 1.7657, + "step": 16252 + }, + { + "epoch": 4.988643339472069, + "grad_norm": 0.23505981266498566, + "learning_rate": 5.261828693534723e-05, + "loss": 1.7576, + "step": 16253 + }, + { + "epoch": 4.988950276243094, + "grad_norm": 0.21553601324558258, + "learning_rate": 5.261332318229817e-05, + "loss": 1.7782, + "step": 16254 + }, + { + "epoch": 4.989257213014119, + "grad_norm": 0.29189521074295044, + 
"learning_rate": 5.26083594034227e-05, + "loss": 1.7664, + "step": 16255 + }, + { + "epoch": 4.989564149785144, + "grad_norm": 0.38108906149864197, + "learning_rate": 5.26033955987699e-05, + "loss": 1.8573, + "step": 16256 + }, + { + "epoch": 4.989871086556169, + "grad_norm": 0.30329224467277527, + "learning_rate": 5.2598431768388824e-05, + "loss": 1.7584, + "step": 16257 + }, + { + "epoch": 4.990178023327195, + "grad_norm": 0.2437417358160019, + "learning_rate": 5.259346791232852e-05, + "loss": 1.7352, + "step": 16258 + }, + { + "epoch": 4.99048496009822, + "grad_norm": 0.3601737320423126, + "learning_rate": 5.258850403063804e-05, + "loss": 1.7206, + "step": 16259 + }, + { + "epoch": 4.990791896869245, + "grad_norm": 0.20259195566177368, + "learning_rate": 5.258354012336646e-05, + "loss": 1.7403, + "step": 16260 + }, + { + "epoch": 4.99109883364027, + "grad_norm": 0.38022148609161377, + "learning_rate": 5.257857619056281e-05, + "loss": 1.7783, + "step": 16261 + }, + { + "epoch": 4.991405770411295, + "grad_norm": 0.30131712555885315, + "learning_rate": 5.257361223227615e-05, + "loss": 1.7826, + "step": 16262 + }, + { + "epoch": 4.99171270718232, + "grad_norm": 0.24159663915634155, + "learning_rate": 5.2568648248555565e-05, + "loss": 1.7792, + "step": 16263 + }, + { + "epoch": 4.992019643953346, + "grad_norm": 0.4641213119029999, + "learning_rate": 5.2563684239450084e-05, + "loss": 1.7432, + "step": 16264 + }, + { + "epoch": 4.992326580724371, + "grad_norm": 0.3526865541934967, + "learning_rate": 5.255872020500877e-05, + "loss": 1.7736, + "step": 16265 + }, + { + "epoch": 4.9926335174953955, + "grad_norm": 0.2396051585674286, + "learning_rate": 5.255375614528071e-05, + "loss": 1.7505, + "step": 16266 + }, + { + "epoch": 4.992940454266421, + "grad_norm": 0.320987343788147, + "learning_rate": 5.25487920603149e-05, + "loss": 1.8229, + "step": 16267 + }, + { + "epoch": 4.993247391037446, + "grad_norm": 0.24689678847789764, + "learning_rate": 5.254382795016044e-05, + 
"loss": 1.7011, + "step": 16268 + }, + { + "epoch": 4.9935543278084715, + "grad_norm": 0.2407137155532837, + "learning_rate": 5.253886381486639e-05, + "loss": 1.741, + "step": 16269 + }, + { + "epoch": 4.993861264579497, + "grad_norm": 0.3677252531051636, + "learning_rate": 5.25338996544818e-05, + "loss": 1.7792, + "step": 16270 + }, + { + "epoch": 4.994168201350522, + "grad_norm": 0.25096553564071655, + "learning_rate": 5.252893546905573e-05, + "loss": 1.7523, + "step": 16271 + }, + { + "epoch": 4.994475138121547, + "grad_norm": 0.2966327965259552, + "learning_rate": 5.252397125863723e-05, + "loss": 1.7114, + "step": 16272 + }, + { + "epoch": 4.994782074892572, + "grad_norm": 0.36577650904655457, + "learning_rate": 5.2519007023275356e-05, + "loss": 1.7609, + "step": 16273 + }, + { + "epoch": 4.995089011663597, + "grad_norm": 0.2450687140226364, + "learning_rate": 5.25140427630192e-05, + "loss": 1.7452, + "step": 16274 + }, + { + "epoch": 4.995395948434623, + "grad_norm": 0.20782120525836945, + "learning_rate": 5.250907847791778e-05, + "loss": 1.7109, + "step": 16275 + }, + { + "epoch": 4.995702885205648, + "grad_norm": 0.2423330545425415, + "learning_rate": 5.25041141680202e-05, + "loss": 1.7234, + "step": 16276 + }, + { + "epoch": 4.996009821976672, + "grad_norm": 0.20855975151062012, + "learning_rate": 5.2499149833375484e-05, + "loss": 1.7734, + "step": 16277 + }, + { + "epoch": 4.996316758747698, + "grad_norm": 0.24400894343852997, + "learning_rate": 5.24941854740327e-05, + "loss": 1.7566, + "step": 16278 + }, + { + "epoch": 4.996623695518723, + "grad_norm": 0.4378018379211426, + "learning_rate": 5.2489221090040906e-05, + "loss": 1.7536, + "step": 16279 + }, + { + "epoch": 4.996930632289748, + "grad_norm": 0.20726722478866577, + "learning_rate": 5.248425668144918e-05, + "loss": 1.8008, + "step": 16280 + }, + { + "epoch": 4.997237569060774, + "grad_norm": 0.2506333589553833, + "learning_rate": 5.247929224830658e-05, + "loss": 1.7404, + "step": 16281 + }, + { + 
"epoch": 4.997544505831799, + "grad_norm": 0.24178004264831543, + "learning_rate": 5.247432779066216e-05, + "loss": 1.7517, + "step": 16282 + }, + { + "epoch": 4.9978514426028235, + "grad_norm": 0.2500220835208893, + "learning_rate": 5.246936330856499e-05, + "loss": 1.7705, + "step": 16283 + }, + { + "epoch": 4.998158379373849, + "grad_norm": 0.30043718218803406, + "learning_rate": 5.24643988020641e-05, + "loss": 1.8118, + "step": 16284 + }, + { + "epoch": 4.998465316144874, + "grad_norm": 0.284805566072464, + "learning_rate": 5.245943427120859e-05, + "loss": 1.7968, + "step": 16285 + }, + { + "epoch": 4.9987722529158995, + "grad_norm": 0.3652406632900238, + "learning_rate": 5.245446971604751e-05, + "loss": 1.7785, + "step": 16286 + }, + { + "epoch": 4.999079189686924, + "grad_norm": 0.24879656732082367, + "learning_rate": 5.244950513662992e-05, + "loss": 1.734, + "step": 16287 + }, + { + "epoch": 4.999386126457949, + "grad_norm": 0.2374224215745926, + "learning_rate": 5.244454053300488e-05, + "loss": 1.7394, + "step": 16288 + }, + { + "epoch": 4.999693063228975, + "grad_norm": 0.27090463042259216, + "learning_rate": 5.243957590522147e-05, + "loss": 1.7529, + "step": 16289 + }, + { + "epoch": 5.0, + "grad_norm": 0.23060791194438934, + "learning_rate": 5.243461125332873e-05, + "loss": 1.7599, + "step": 16290 + }, + { + "epoch": 5.000306936771025, + "grad_norm": 0.21159487962722778, + "learning_rate": 5.242964657737572e-05, + "loss": 1.747, + "step": 16291 + }, + { + "epoch": 5.000613873542051, + "grad_norm": 0.21556304395198822, + "learning_rate": 5.242468187741154e-05, + "loss": 1.7653, + "step": 16292 + }, + { + "epoch": 5.000920810313075, + "grad_norm": 0.2569669783115387, + "learning_rate": 5.241971715348524e-05, + "loss": 1.7284, + "step": 16293 + }, + { + "epoch": 5.0012277470841005, + "grad_norm": 0.2827381491661072, + "learning_rate": 5.241475240564586e-05, + "loss": 1.7765, + "step": 16294 + }, + { + "epoch": 5.001534683855126, + "grad_norm": 
0.22498267889022827, + "learning_rate": 5.240978763394249e-05, + "loss": 1.729, + "step": 16295 + }, + { + "epoch": 5.001841620626151, + "grad_norm": 0.23975814878940582, + "learning_rate": 5.240482283842418e-05, + "loss": 1.7968, + "step": 16296 + }, + { + "epoch": 5.0021485573971765, + "grad_norm": 0.20811420679092407, + "learning_rate": 5.239985801914e-05, + "loss": 1.6931, + "step": 16297 + }, + { + "epoch": 5.002455494168202, + "grad_norm": 0.22985060513019562, + "learning_rate": 5.2394893176139014e-05, + "loss": 1.7724, + "step": 16298 + }, + { + "epoch": 5.002762430939226, + "grad_norm": 0.22867995500564575, + "learning_rate": 5.2389928309470305e-05, + "loss": 1.7179, + "step": 16299 + }, + { + "epoch": 5.003069367710252, + "grad_norm": 0.2543974220752716, + "learning_rate": 5.238496341918293e-05, + "loss": 1.7859, + "step": 16300 + }, + { + "epoch": 5.003376304481277, + "grad_norm": 0.226583793759346, + "learning_rate": 5.237999850532592e-05, + "loss": 1.7567, + "step": 16301 + }, + { + "epoch": 5.003683241252302, + "grad_norm": 0.21744728088378906, + "learning_rate": 5.237503356794838e-05, + "loss": 1.7345, + "step": 16302 + }, + { + "epoch": 5.003990178023328, + "grad_norm": 0.25915467739105225, + "learning_rate": 5.2370068607099373e-05, + "loss": 1.7179, + "step": 16303 + }, + { + "epoch": 5.004297114794352, + "grad_norm": 0.20572461187839508, + "learning_rate": 5.236510362282796e-05, + "loss": 1.7211, + "step": 16304 + }, + { + "epoch": 5.004604051565377, + "grad_norm": 0.2821461856365204, + "learning_rate": 5.236013861518321e-05, + "loss": 1.7894, + "step": 16305 + }, + { + "epoch": 5.004910988336403, + "grad_norm": 0.22273759543895721, + "learning_rate": 5.235517358421417e-05, + "loss": 1.7919, + "step": 16306 + }, + { + "epoch": 5.005217925107428, + "grad_norm": 0.23875468969345093, + "learning_rate": 5.2350208529969935e-05, + "loss": 1.7558, + "step": 16307 + }, + { + "epoch": 5.005524861878453, + "grad_norm": 0.24673783779144287, + "learning_rate": 
5.234524345249955e-05, + "loss": 1.7705, + "step": 16308 + }, + { + "epoch": 5.005831798649478, + "grad_norm": 0.21992872655391693, + "learning_rate": 5.234027835185211e-05, + "loss": 1.7059, + "step": 16309 + }, + { + "epoch": 5.006138735420503, + "grad_norm": 0.19214966893196106, + "learning_rate": 5.233531322807667e-05, + "loss": 1.6647, + "step": 16310 + }, + { + "epoch": 5.0064456721915285, + "grad_norm": 0.18525120615959167, + "learning_rate": 5.233034808122228e-05, + "loss": 1.719, + "step": 16311 + }, + { + "epoch": 5.006752608962554, + "grad_norm": 0.25996243953704834, + "learning_rate": 5.232538291133804e-05, + "loss": 1.7227, + "step": 16312 + }, + { + "epoch": 5.007059545733579, + "grad_norm": 0.2163757085800171, + "learning_rate": 5.232041771847299e-05, + "loss": 1.6962, + "step": 16313 + }, + { + "epoch": 5.0073664825046045, + "grad_norm": 0.23484158515930176, + "learning_rate": 5.231545250267621e-05, + "loss": 1.7816, + "step": 16314 + }, + { + "epoch": 5.007673419275629, + "grad_norm": 0.2188636213541031, + "learning_rate": 5.2310487263996776e-05, + "loss": 1.7477, + "step": 16315 + }, + { + "epoch": 5.007980356046654, + "grad_norm": 0.1950213611125946, + "learning_rate": 5.230552200248377e-05, + "loss": 1.7165, + "step": 16316 + }, + { + "epoch": 5.00828729281768, + "grad_norm": 0.25340089201927185, + "learning_rate": 5.230055671818623e-05, + "loss": 1.7764, + "step": 16317 + }, + { + "epoch": 5.008594229588705, + "grad_norm": 0.23749271035194397, + "learning_rate": 5.2295591411153245e-05, + "loss": 1.7193, + "step": 16318 + }, + { + "epoch": 5.00890116635973, + "grad_norm": 0.2317294180393219, + "learning_rate": 5.229062608143387e-05, + "loss": 1.7607, + "step": 16319 + }, + { + "epoch": 5.009208103130755, + "grad_norm": 0.2751505672931671, + "learning_rate": 5.228566072907719e-05, + "loss": 1.7562, + "step": 16320 + }, + { + "epoch": 5.00951503990178, + "grad_norm": 0.29476025700569153, + "learning_rate": 5.2280695354132267e-05, + "loss": 1.687, 
+ "step": 16321 + }, + { + "epoch": 5.009821976672805, + "grad_norm": 0.20734120905399323, + "learning_rate": 5.227572995664819e-05, + "loss": 1.7608, + "step": 16322 + }, + { + "epoch": 5.010128913443831, + "grad_norm": 0.2537878155708313, + "learning_rate": 5.227076453667401e-05, + "loss": 1.7947, + "step": 16323 + }, + { + "epoch": 5.010435850214856, + "grad_norm": 0.23516076803207397, + "learning_rate": 5.2265799094258796e-05, + "loss": 1.7545, + "step": 16324 + }, + { + "epoch": 5.0107427869858805, + "grad_norm": 0.2581529915332794, + "learning_rate": 5.226083362945162e-05, + "loss": 1.7529, + "step": 16325 + }, + { + "epoch": 5.011049723756906, + "grad_norm": 0.2982035279273987, + "learning_rate": 5.225586814230158e-05, + "loss": 1.74, + "step": 16326 + }, + { + "epoch": 5.011356660527931, + "grad_norm": 0.2773981988430023, + "learning_rate": 5.225090263285772e-05, + "loss": 1.7562, + "step": 16327 + }, + { + "epoch": 5.0116635972989565, + "grad_norm": 0.19992689788341522, + "learning_rate": 5.2245937101169116e-05, + "loss": 1.6896, + "step": 16328 + }, + { + "epoch": 5.011970534069982, + "grad_norm": 0.2913428246974945, + "learning_rate": 5.224097154728486e-05, + "loss": 1.7574, + "step": 16329 + }, + { + "epoch": 5.012277470841007, + "grad_norm": 0.23173104226589203, + "learning_rate": 5.2236005971254e-05, + "loss": 1.6954, + "step": 16330 + }, + { + "epoch": 5.012584407612032, + "grad_norm": 0.2019525170326233, + "learning_rate": 5.2231040373125614e-05, + "loss": 1.7711, + "step": 16331 + }, + { + "epoch": 5.012891344383057, + "grad_norm": 0.29070746898651123, + "learning_rate": 5.222607475294878e-05, + "loss": 1.8201, + "step": 16332 + }, + { + "epoch": 5.013198281154082, + "grad_norm": 0.22005079686641693, + "learning_rate": 5.222110911077258e-05, + "loss": 1.7421, + "step": 16333 + }, + { + "epoch": 5.013505217925108, + "grad_norm": 0.24422192573547363, + "learning_rate": 5.2216143446646085e-05, + "loss": 1.7074, + "step": 16334 + }, + { + "epoch": 
5.013812154696133, + "grad_norm": 0.2417927384376526, + "learning_rate": 5.221117776061836e-05, + "loss": 1.7726, + "step": 16335 + }, + { + "epoch": 5.014119091467157, + "grad_norm": 0.245828777551651, + "learning_rate": 5.2206212052738454e-05, + "loss": 1.7932, + "step": 16336 + }, + { + "epoch": 5.014426028238183, + "grad_norm": 0.24054239690303802, + "learning_rate": 5.220124632305548e-05, + "loss": 1.727, + "step": 16337 + }, + { + "epoch": 5.014732965009208, + "grad_norm": 0.2572494149208069, + "learning_rate": 5.21962805716185e-05, + "loss": 1.7234, + "step": 16338 + }, + { + "epoch": 5.015039901780233, + "grad_norm": 0.33624622225761414, + "learning_rate": 5.2191314798476595e-05, + "loss": 1.7499, + "step": 16339 + }, + { + "epoch": 5.015346838551259, + "grad_norm": 0.22321413457393646, + "learning_rate": 5.218634900367883e-05, + "loss": 1.7155, + "step": 16340 + }, + { + "epoch": 5.015653775322283, + "grad_norm": 0.26709917187690735, + "learning_rate": 5.218138318727429e-05, + "loss": 1.8346, + "step": 16341 + }, + { + "epoch": 5.0159607120933085, + "grad_norm": 0.27600952982902527, + "learning_rate": 5.217641734931202e-05, + "loss": 1.789, + "step": 16342 + }, + { + "epoch": 5.016267648864334, + "grad_norm": 0.21392405033111572, + "learning_rate": 5.217145148984114e-05, + "loss": 1.7266, + "step": 16343 + }, + { + "epoch": 5.016574585635359, + "grad_norm": 0.3215450942516327, + "learning_rate": 5.2166485608910696e-05, + "loss": 1.7453, + "step": 16344 + }, + { + "epoch": 5.0168815224063845, + "grad_norm": 0.22328032553195953, + "learning_rate": 5.2161519706569776e-05, + "loss": 1.7209, + "step": 16345 + }, + { + "epoch": 5.01718845917741, + "grad_norm": 0.2438887059688568, + "learning_rate": 5.215655378286744e-05, + "loss": 1.7289, + "step": 16346 + }, + { + "epoch": 5.017495395948434, + "grad_norm": 0.30078747868537903, + "learning_rate": 5.2151587837852786e-05, + "loss": 1.7483, + "step": 16347 + }, + { + "epoch": 5.01780233271946, + "grad_norm": 
0.21723167598247528, + "learning_rate": 5.214662187157488e-05, + "loss": 1.7654, + "step": 16348 + }, + { + "epoch": 5.018109269490485, + "grad_norm": 0.26358669996261597, + "learning_rate": 5.2141655884082784e-05, + "loss": 1.7563, + "step": 16349 + }, + { + "epoch": 5.01841620626151, + "grad_norm": 0.24285505712032318, + "learning_rate": 5.2136689875425615e-05, + "loss": 1.7377, + "step": 16350 + }, + { + "epoch": 5.018723143032536, + "grad_norm": 0.2401108294725418, + "learning_rate": 5.2131723845652416e-05, + "loss": 1.7445, + "step": 16351 + }, + { + "epoch": 5.01903007980356, + "grad_norm": 0.3347793519496918, + "learning_rate": 5.212675779481226e-05, + "loss": 1.7872, + "step": 16352 + }, + { + "epoch": 5.019337016574585, + "grad_norm": 0.306728720664978, + "learning_rate": 5.212179172295424e-05, + "loss": 1.8051, + "step": 16353 + }, + { + "epoch": 5.019643953345611, + "grad_norm": 0.22297725081443787, + "learning_rate": 5.211682563012743e-05, + "loss": 1.7082, + "step": 16354 + }, + { + "epoch": 5.019950890116636, + "grad_norm": 0.24047277867794037, + "learning_rate": 5.211185951638091e-05, + "loss": 1.7024, + "step": 16355 + }, + { + "epoch": 5.020257826887661, + "grad_norm": 0.19570080935955048, + "learning_rate": 5.210689338176377e-05, + "loss": 1.6947, + "step": 16356 + }, + { + "epoch": 5.020564763658686, + "grad_norm": 0.2024889886379242, + "learning_rate": 5.2101927226325066e-05, + "loss": 1.7168, + "step": 16357 + }, + { + "epoch": 5.020871700429711, + "grad_norm": 0.23546278476715088, + "learning_rate": 5.209696105011388e-05, + "loss": 1.7697, + "step": 16358 + }, + { + "epoch": 5.0211786372007365, + "grad_norm": 0.21003498136997223, + "learning_rate": 5.209199485317928e-05, + "loss": 1.7198, + "step": 16359 + }, + { + "epoch": 5.021485573971762, + "grad_norm": 0.21375493705272675, + "learning_rate": 5.208702863557039e-05, + "loss": 1.7689, + "step": 16360 + }, + { + "epoch": 5.021792510742787, + "grad_norm": 0.21549762785434723, + 
"learning_rate": 5.2082062397336254e-05, + "loss": 1.6936, + "step": 16361 + }, + { + "epoch": 5.0220994475138125, + "grad_norm": 0.22633691132068634, + "learning_rate": 5.207709613852595e-05, + "loss": 1.7512, + "step": 16362 + }, + { + "epoch": 5.022406384284837, + "grad_norm": 0.21888238191604614, + "learning_rate": 5.2072129859188566e-05, + "loss": 1.7082, + "step": 16363 + }, + { + "epoch": 5.022713321055862, + "grad_norm": 0.2416619062423706, + "learning_rate": 5.206716355937318e-05, + "loss": 1.7938, + "step": 16364 + }, + { + "epoch": 5.023020257826888, + "grad_norm": 0.22451527416706085, + "learning_rate": 5.206219723912886e-05, + "loss": 1.7372, + "step": 16365 + }, + { + "epoch": 5.023327194597913, + "grad_norm": 0.19698494672775269, + "learning_rate": 5.2057230898504716e-05, + "loss": 1.7205, + "step": 16366 + }, + { + "epoch": 5.023634131368938, + "grad_norm": 0.2441127747297287, + "learning_rate": 5.205226453754982e-05, + "loss": 1.7625, + "step": 16367 + }, + { + "epoch": 5.023941068139963, + "grad_norm": 0.21940121054649353, + "learning_rate": 5.204729815631323e-05, + "loss": 1.7985, + "step": 16368 + }, + { + "epoch": 5.024248004910988, + "grad_norm": 0.21751399338245392, + "learning_rate": 5.204233175484403e-05, + "loss": 1.7759, + "step": 16369 + }, + { + "epoch": 5.024554941682013, + "grad_norm": 0.20261377096176147, + "learning_rate": 5.2037365333191315e-05, + "loss": 1.746, + "step": 16370 + }, + { + "epoch": 5.024861878453039, + "grad_norm": 0.2628774046897888, + "learning_rate": 5.2032398891404166e-05, + "loss": 1.8178, + "step": 16371 + }, + { + "epoch": 5.025168815224064, + "grad_norm": 0.20626378059387207, + "learning_rate": 5.2027432429531665e-05, + "loss": 1.7456, + "step": 16372 + }, + { + "epoch": 5.0254757519950894, + "grad_norm": 0.25548869371414185, + "learning_rate": 5.2022465947622876e-05, + "loss": 1.8098, + "step": 16373 + }, + { + "epoch": 5.025782688766114, + "grad_norm": 0.1978374719619751, + "learning_rate": 
5.20174994457269e-05, + "loss": 1.685, + "step": 16374 + }, + { + "epoch": 5.026089625537139, + "grad_norm": 0.2708980143070221, + "learning_rate": 5.201253292389282e-05, + "loss": 1.7464, + "step": 16375 + }, + { + "epoch": 5.026396562308165, + "grad_norm": 0.2730494737625122, + "learning_rate": 5.2007566382169706e-05, + "loss": 1.7391, + "step": 16376 + }, + { + "epoch": 5.02670349907919, + "grad_norm": 0.243557408452034, + "learning_rate": 5.2002599820606624e-05, + "loss": 1.7439, + "step": 16377 + }, + { + "epoch": 5.027010435850215, + "grad_norm": 0.2208259105682373, + "learning_rate": 5.19976332392527e-05, + "loss": 1.7612, + "step": 16378 + }, + { + "epoch": 5.02731737262124, + "grad_norm": 0.21288715302944183, + "learning_rate": 5.199266663815698e-05, + "loss": 1.7546, + "step": 16379 + }, + { + "epoch": 5.027624309392265, + "grad_norm": 0.2106054425239563, + "learning_rate": 5.198770001736857e-05, + "loss": 1.7281, + "step": 16380 + }, + { + "epoch": 5.02793124616329, + "grad_norm": 0.2247164249420166, + "learning_rate": 5.198273337693654e-05, + "loss": 1.8405, + "step": 16381 + }, + { + "epoch": 5.028238182934316, + "grad_norm": 0.21713724732398987, + "learning_rate": 5.197776671690998e-05, + "loss": 1.7333, + "step": 16382 + }, + { + "epoch": 5.028545119705341, + "grad_norm": 0.24063727259635925, + "learning_rate": 5.1972800037337956e-05, + "loss": 1.7608, + "step": 16383 + }, + { + "epoch": 5.0288520564763655, + "grad_norm": 0.22022177278995514, + "learning_rate": 5.196783333826959e-05, + "loss": 1.7045, + "step": 16384 + }, + { + "epoch": 5.029158993247391, + "grad_norm": 0.21348948776721954, + "learning_rate": 5.1962866619753927e-05, + "loss": 1.7516, + "step": 16385 + }, + { + "epoch": 5.029465930018416, + "grad_norm": 0.289315789937973, + "learning_rate": 5.195789988184007e-05, + "loss": 1.8555, + "step": 16386 + }, + { + "epoch": 5.0297728667894415, + "grad_norm": 0.30966848134994507, + "learning_rate": 5.19529331245771e-05, + "loss": 1.7245, + 
"step": 16387 + }, + { + "epoch": 5.030079803560467, + "grad_norm": 0.24625633656978607, + "learning_rate": 5.194796634801409e-05, + "loss": 1.7788, + "step": 16388 + }, + { + "epoch": 5.030386740331492, + "grad_norm": 0.25937986373901367, + "learning_rate": 5.1942999552200136e-05, + "loss": 1.7655, + "step": 16389 + }, + { + "epoch": 5.030693677102517, + "grad_norm": 0.3056741952896118, + "learning_rate": 5.1938032737184325e-05, + "loss": 1.7167, + "step": 16390 + }, + { + "epoch": 5.031000613873542, + "grad_norm": 0.29773563146591187, + "learning_rate": 5.1933065903015743e-05, + "loss": 1.7247, + "step": 16391 + }, + { + "epoch": 5.031307550644567, + "grad_norm": 0.26433971524238586, + "learning_rate": 5.192809904974347e-05, + "loss": 1.7779, + "step": 16392 + }, + { + "epoch": 5.031614487415593, + "grad_norm": 0.3308073580265045, + "learning_rate": 5.192313217741659e-05, + "loss": 1.7782, + "step": 16393 + }, + { + "epoch": 5.031921424186618, + "grad_norm": 0.2584165632724762, + "learning_rate": 5.1918165286084176e-05, + "loss": 1.7812, + "step": 16394 + }, + { + "epoch": 5.032228360957642, + "grad_norm": 0.31678953766822815, + "learning_rate": 5.1913198375795346e-05, + "loss": 1.7341, + "step": 16395 + }, + { + "epoch": 5.032535297728668, + "grad_norm": 0.3527325391769409, + "learning_rate": 5.190823144659916e-05, + "loss": 1.7844, + "step": 16396 + }, + { + "epoch": 5.032842234499693, + "grad_norm": 0.29233935475349426, + "learning_rate": 5.1903264498544724e-05, + "loss": 1.7993, + "step": 16397 + }, + { + "epoch": 5.033149171270718, + "grad_norm": 0.24549467861652374, + "learning_rate": 5.1898297531681106e-05, + "loss": 1.7294, + "step": 16398 + }, + { + "epoch": 5.033456108041744, + "grad_norm": 0.3446930944919586, + "learning_rate": 5.18933305460574e-05, + "loss": 1.6818, + "step": 16399 + }, + { + "epoch": 5.033763044812768, + "grad_norm": 0.2628229856491089, + "learning_rate": 5.188836354172268e-05, + "loss": 1.7867, + "step": 16400 + }, + { + "epoch": 
5.0340699815837935, + "grad_norm": 0.26548629999160767, + "learning_rate": 5.188339651872607e-05, + "loss": 1.7448, + "step": 16401 + }, + { + "epoch": 5.034376918354819, + "grad_norm": 0.29242032766342163, + "learning_rate": 5.187842947711662e-05, + "loss": 1.7103, + "step": 16402 + }, + { + "epoch": 5.034683855125844, + "grad_norm": 0.2515408992767334, + "learning_rate": 5.187346241694343e-05, + "loss": 1.7865, + "step": 16403 + }, + { + "epoch": 5.0349907918968695, + "grad_norm": 0.2253103256225586, + "learning_rate": 5.186849533825559e-05, + "loss": 1.6993, + "step": 16404 + }, + { + "epoch": 5.035297728667895, + "grad_norm": 0.2743360102176666, + "learning_rate": 5.1863528241102154e-05, + "loss": 1.7532, + "step": 16405 + }, + { + "epoch": 5.035604665438919, + "grad_norm": 0.22807851433753967, + "learning_rate": 5.185856112553227e-05, + "loss": 1.7873, + "step": 16406 + }, + { + "epoch": 5.035911602209945, + "grad_norm": 0.23719090223312378, + "learning_rate": 5.1853593991594985e-05, + "loss": 1.7555, + "step": 16407 + }, + { + "epoch": 5.03621853898097, + "grad_norm": 0.2964477241039276, + "learning_rate": 5.184862683933941e-05, + "loss": 1.7204, + "step": 16408 + }, + { + "epoch": 5.036525475751995, + "grad_norm": 0.23717865347862244, + "learning_rate": 5.18436596688146e-05, + "loss": 1.7239, + "step": 16409 + }, + { + "epoch": 5.036832412523021, + "grad_norm": 0.22650085389614105, + "learning_rate": 5.1838692480069686e-05, + "loss": 1.7148, + "step": 16410 + }, + { + "epoch": 5.037139349294045, + "grad_norm": 0.25606781244277954, + "learning_rate": 5.183372527315371e-05, + "loss": 1.7916, + "step": 16411 + }, + { + "epoch": 5.03744628606507, + "grad_norm": 0.22266390919685364, + "learning_rate": 5.182875804811581e-05, + "loss": 1.7481, + "step": 16412 + }, + { + "epoch": 5.037753222836096, + "grad_norm": 0.23481780290603638, + "learning_rate": 5.1823790805005045e-05, + "loss": 1.8014, + "step": 16413 + }, + { + "epoch": 5.038060159607121, + "grad_norm": 
0.2629338800907135, + "learning_rate": 5.1818823543870506e-05, + "loss": 1.81, + "step": 16414 + }, + { + "epoch": 5.038367096378146, + "grad_norm": 0.22891482710838318, + "learning_rate": 5.18138562647613e-05, + "loss": 1.757, + "step": 16415 + }, + { + "epoch": 5.038674033149171, + "grad_norm": 0.2666641175746918, + "learning_rate": 5.180888896772649e-05, + "loss": 1.7457, + "step": 16416 + }, + { + "epoch": 5.038980969920196, + "grad_norm": 0.37610310316085815, + "learning_rate": 5.180392165281517e-05, + "loss": 1.8214, + "step": 16417 + }, + { + "epoch": 5.0392879066912215, + "grad_norm": 0.2521277964115143, + "learning_rate": 5.1798954320076455e-05, + "loss": 1.7731, + "step": 16418 + }, + { + "epoch": 5.039594843462247, + "grad_norm": 0.25097090005874634, + "learning_rate": 5.1793986969559415e-05, + "loss": 1.8029, + "step": 16419 + }, + { + "epoch": 5.039901780233272, + "grad_norm": 0.2946726381778717, + "learning_rate": 5.178901960131315e-05, + "loss": 1.7483, + "step": 16420 + }, + { + "epoch": 5.0402087170042975, + "grad_norm": 0.24240419268608093, + "learning_rate": 5.1784052215386736e-05, + "loss": 1.731, + "step": 16421 + }, + { + "epoch": 5.040515653775322, + "grad_norm": 0.2403198480606079, + "learning_rate": 5.177908481182926e-05, + "loss": 1.722, + "step": 16422 + }, + { + "epoch": 5.040822590546347, + "grad_norm": 0.3451874554157257, + "learning_rate": 5.177411739068985e-05, + "loss": 1.7562, + "step": 16423 + }, + { + "epoch": 5.041129527317373, + "grad_norm": 0.3244951069355011, + "learning_rate": 5.176914995201756e-05, + "loss": 1.7321, + "step": 16424 + }, + { + "epoch": 5.041436464088398, + "grad_norm": 0.2346230000257492, + "learning_rate": 5.176418249586149e-05, + "loss": 1.7839, + "step": 16425 + }, + { + "epoch": 5.041743400859423, + "grad_norm": 0.357022225856781, + "learning_rate": 5.1759215022270744e-05, + "loss": 1.7776, + "step": 16426 + }, + { + "epoch": 5.042050337630448, + "grad_norm": 0.259007066488266, + "learning_rate": 
5.17542475312944e-05, + "loss": 1.7544, + "step": 16427 + }, + { + "epoch": 5.042357274401473, + "grad_norm": 0.2516533136367798, + "learning_rate": 5.174928002298154e-05, + "loss": 1.7269, + "step": 16428 + }, + { + "epoch": 5.042664211172498, + "grad_norm": 0.3393619954586029, + "learning_rate": 5.174431249738129e-05, + "loss": 1.7487, + "step": 16429 + }, + { + "epoch": 5.042971147943524, + "grad_norm": 0.2730594873428345, + "learning_rate": 5.1739344954542714e-05, + "loss": 1.7468, + "step": 16430 + }, + { + "epoch": 5.043278084714549, + "grad_norm": 0.21233965456485748, + "learning_rate": 5.1734377394514914e-05, + "loss": 1.783, + "step": 16431 + }, + { + "epoch": 5.043585021485574, + "grad_norm": 0.3460896909236908, + "learning_rate": 5.1729409817346974e-05, + "loss": 1.7497, + "step": 16432 + }, + { + "epoch": 5.043891958256599, + "grad_norm": 0.31918221712112427, + "learning_rate": 5.1724442223088e-05, + "loss": 1.7834, + "step": 16433 + }, + { + "epoch": 5.044198895027624, + "grad_norm": 0.23016802966594696, + "learning_rate": 5.171947461178706e-05, + "loss": 1.7348, + "step": 16434 + }, + { + "epoch": 5.0445058317986495, + "grad_norm": 0.35758304595947266, + "learning_rate": 5.171450698349329e-05, + "loss": 1.7734, + "step": 16435 + }, + { + "epoch": 5.044812768569675, + "grad_norm": 0.279725581407547, + "learning_rate": 5.170953933825574e-05, + "loss": 1.7283, + "step": 16436 + }, + { + "epoch": 5.0451197053407, + "grad_norm": 0.23965120315551758, + "learning_rate": 5.170457167612354e-05, + "loss": 1.7606, + "step": 16437 + }, + { + "epoch": 5.045426642111725, + "grad_norm": 0.28026309609413147, + "learning_rate": 5.169960399714574e-05, + "loss": 1.7872, + "step": 16438 + }, + { + "epoch": 5.04573357888275, + "grad_norm": 0.3262448012828827, + "learning_rate": 5.169463630137146e-05, + "loss": 1.8654, + "step": 16439 + }, + { + "epoch": 5.046040515653775, + "grad_norm": 0.4249584674835205, + "learning_rate": 5.168966858884979e-05, + "loss": 1.7244, + 
"step": 16440 + }, + { + "epoch": 5.046347452424801, + "grad_norm": 0.3385370969772339, + "learning_rate": 5.168470085962984e-05, + "loss": 1.7745, + "step": 16441 + }, + { + "epoch": 5.046654389195826, + "grad_norm": 0.2321811318397522, + "learning_rate": 5.1679733113760675e-05, + "loss": 1.8093, + "step": 16442 + }, + { + "epoch": 5.04696132596685, + "grad_norm": 0.3426755368709564, + "learning_rate": 5.167476535129141e-05, + "loss": 1.7752, + "step": 16443 + }, + { + "epoch": 5.047268262737876, + "grad_norm": 0.27672505378723145, + "learning_rate": 5.166979757227114e-05, + "loss": 1.7619, + "step": 16444 + }, + { + "epoch": 5.047575199508901, + "grad_norm": 0.4111184775829315, + "learning_rate": 5.1664829776748925e-05, + "loss": 1.7672, + "step": 16445 + }, + { + "epoch": 5.047882136279926, + "grad_norm": 0.40139874815940857, + "learning_rate": 5.1659861964773905e-05, + "loss": 1.7753, + "step": 16446 + }, + { + "epoch": 5.048189073050952, + "grad_norm": 0.28931725025177, + "learning_rate": 5.165489413639516e-05, + "loss": 1.7607, + "step": 16447 + }, + { + "epoch": 5.048496009821977, + "grad_norm": 0.297538161277771, + "learning_rate": 5.1649926291661775e-05, + "loss": 1.7661, + "step": 16448 + }, + { + "epoch": 5.0488029465930016, + "grad_norm": 0.4299027621746063, + "learning_rate": 5.1644958430622846e-05, + "loss": 1.6998, + "step": 16449 + }, + { + "epoch": 5.049109883364027, + "grad_norm": 0.2554767429828644, + "learning_rate": 5.163999055332749e-05, + "loss": 1.7716, + "step": 16450 + }, + { + "epoch": 5.049416820135052, + "grad_norm": 0.3561006486415863, + "learning_rate": 5.163502265982477e-05, + "loss": 1.7493, + "step": 16451 + }, + { + "epoch": 5.0497237569060776, + "grad_norm": 0.3839687407016754, + "learning_rate": 5.1630054750163806e-05, + "loss": 1.7314, + "step": 16452 + }, + { + "epoch": 5.050030693677103, + "grad_norm": 0.20022284984588623, + "learning_rate": 5.1625086824393684e-05, + "loss": 1.6992, + "step": 16453 + }, + { + "epoch": 
5.050337630448127, + "grad_norm": 0.36830398440361023, + "learning_rate": 5.162011888256349e-05, + "loss": 1.7339, + "step": 16454 + }, + { + "epoch": 5.050644567219153, + "grad_norm": 0.31947389245033264, + "learning_rate": 5.161515092472236e-05, + "loss": 1.7254, + "step": 16455 + }, + { + "epoch": 5.050951503990178, + "grad_norm": 0.2779252827167511, + "learning_rate": 5.161018295091933e-05, + "loss": 1.7941, + "step": 16456 + }, + { + "epoch": 5.051258440761203, + "grad_norm": 0.3796578347682953, + "learning_rate": 5.160521496120354e-05, + "loss": 1.7389, + "step": 16457 + }, + { + "epoch": 5.051565377532229, + "grad_norm": 0.23569442331790924, + "learning_rate": 5.1600246955624076e-05, + "loss": 1.7149, + "step": 16458 + }, + { + "epoch": 5.051872314303253, + "grad_norm": 0.27342507243156433, + "learning_rate": 5.159527893423004e-05, + "loss": 1.699, + "step": 16459 + }, + { + "epoch": 5.0521792510742785, + "grad_norm": 0.2877296209335327, + "learning_rate": 5.159031089707052e-05, + "loss": 1.7668, + "step": 16460 + }, + { + "epoch": 5.052486187845304, + "grad_norm": 0.21482446789741516, + "learning_rate": 5.1585342844194605e-05, + "loss": 1.7132, + "step": 16461 + }, + { + "epoch": 5.052793124616329, + "grad_norm": 0.23588669300079346, + "learning_rate": 5.158037477565142e-05, + "loss": 1.7267, + "step": 16462 + }, + { + "epoch": 5.0531000613873545, + "grad_norm": 0.20188623666763306, + "learning_rate": 5.157540669149003e-05, + "loss": 1.7486, + "step": 16463 + }, + { + "epoch": 5.05340699815838, + "grad_norm": 0.2012643963098526, + "learning_rate": 5.157043859175955e-05, + "loss": 1.718, + "step": 16464 + }, + { + "epoch": 5.053713934929404, + "grad_norm": 0.23133818805217743, + "learning_rate": 5.156547047650908e-05, + "loss": 1.7892, + "step": 16465 + }, + { + "epoch": 5.05402087170043, + "grad_norm": 0.2524542510509491, + "learning_rate": 5.156050234578771e-05, + "loss": 1.8034, + "step": 16466 + }, + { + "epoch": 5.054327808471455, + "grad_norm": 
0.20992529392242432, + "learning_rate": 5.155553419964454e-05, + "loss": 1.7158, + "step": 16467 + }, + { + "epoch": 5.05463474524248, + "grad_norm": 0.23815447092056274, + "learning_rate": 5.155056603812868e-05, + "loss": 1.7632, + "step": 16468 + }, + { + "epoch": 5.054941682013506, + "grad_norm": 0.3306051790714264, + "learning_rate": 5.1545597861289205e-05, + "loss": 1.7719, + "step": 16469 + }, + { + "epoch": 5.05524861878453, + "grad_norm": 0.287541925907135, + "learning_rate": 5.154062966917523e-05, + "loss": 1.7092, + "step": 16470 + }, + { + "epoch": 5.055555555555555, + "grad_norm": 0.28186658024787903, + "learning_rate": 5.153566146183586e-05, + "loss": 1.8548, + "step": 16471 + }, + { + "epoch": 5.055862492326581, + "grad_norm": 0.3511136472225189, + "learning_rate": 5.153069323932017e-05, + "loss": 1.8029, + "step": 16472 + }, + { + "epoch": 5.056169429097606, + "grad_norm": 0.32083824276924133, + "learning_rate": 5.152572500167728e-05, + "loss": 1.7321, + "step": 16473 + }, + { + "epoch": 5.056476365868631, + "grad_norm": 0.22571051120758057, + "learning_rate": 5.1520756748956265e-05, + "loss": 1.7218, + "step": 16474 + }, + { + "epoch": 5.056783302639656, + "grad_norm": 0.2902646064758301, + "learning_rate": 5.151578848120626e-05, + "loss": 1.7231, + "step": 16475 + }, + { + "epoch": 5.057090239410681, + "grad_norm": 0.20447610318660736, + "learning_rate": 5.1510820198476336e-05, + "loss": 1.6998, + "step": 16476 + }, + { + "epoch": 5.0573971761817065, + "grad_norm": 0.29436638951301575, + "learning_rate": 5.1505851900815606e-05, + "loss": 1.6793, + "step": 16477 + }, + { + "epoch": 5.057704112952732, + "grad_norm": 0.29718565940856934, + "learning_rate": 5.1500883588273164e-05, + "loss": 1.8322, + "step": 16478 + }, + { + "epoch": 5.058011049723757, + "grad_norm": 0.23530519008636475, + "learning_rate": 5.149591526089811e-05, + "loss": 1.7408, + "step": 16479 + }, + { + "epoch": 5.0583179864947825, + "grad_norm": 0.30735042691230774, + 
"learning_rate": 5.1490946918739536e-05, + "loss": 1.7454, + "step": 16480 + }, + { + "epoch": 5.058624923265807, + "grad_norm": 0.26151445508003235, + "learning_rate": 5.148597856184656e-05, + "loss": 1.7728, + "step": 16481 + }, + { + "epoch": 5.058931860036832, + "grad_norm": 0.2657756209373474, + "learning_rate": 5.1481010190268263e-05, + "loss": 1.7905, + "step": 16482 + }, + { + "epoch": 5.059238796807858, + "grad_norm": 0.25418251752853394, + "learning_rate": 5.147604180405376e-05, + "loss": 1.7676, + "step": 16483 + }, + { + "epoch": 5.059545733578883, + "grad_norm": 0.25486254692077637, + "learning_rate": 5.1471073403252154e-05, + "loss": 1.8347, + "step": 16484 + }, + { + "epoch": 5.059852670349908, + "grad_norm": 0.22693100571632385, + "learning_rate": 5.146610498791255e-05, + "loss": 1.7308, + "step": 16485 + }, + { + "epoch": 5.060159607120933, + "grad_norm": 0.22056837379932404, + "learning_rate": 5.146113655808401e-05, + "loss": 1.7158, + "step": 16486 + }, + { + "epoch": 5.060466543891958, + "grad_norm": 0.221246138215065, + "learning_rate": 5.1456168113815685e-05, + "loss": 1.6985, + "step": 16487 + }, + { + "epoch": 5.060773480662983, + "grad_norm": 0.2149408906698227, + "learning_rate": 5.145119965515664e-05, + "loss": 1.716, + "step": 16488 + }, + { + "epoch": 5.061080417434009, + "grad_norm": 0.23958513140678406, + "learning_rate": 5.144623118215599e-05, + "loss": 1.8092, + "step": 16489 + }, + { + "epoch": 5.061387354205034, + "grad_norm": 0.2870621085166931, + "learning_rate": 5.1441262694862836e-05, + "loss": 1.75, + "step": 16490 + }, + { + "epoch": 5.0616942909760585, + "grad_norm": 0.26755061745643616, + "learning_rate": 5.1436294193326276e-05, + "loss": 1.7848, + "step": 16491 + }, + { + "epoch": 5.062001227747084, + "grad_norm": 0.2434249073266983, + "learning_rate": 5.143132567759542e-05, + "loss": 1.7487, + "step": 16492 + }, + { + "epoch": 5.062308164518109, + "grad_norm": 0.3044668138027191, + "learning_rate": 5.142635714771936e-05, 
+ "loss": 1.741, + "step": 16493 + }, + { + "epoch": 5.0626151012891345, + "grad_norm": 0.2166958749294281, + "learning_rate": 5.142138860374721e-05, + "loss": 1.7232, + "step": 16494 + }, + { + "epoch": 5.06292203806016, + "grad_norm": 0.34558552503585815, + "learning_rate": 5.141642004572806e-05, + "loss": 1.7663, + "step": 16495 + }, + { + "epoch": 5.063228974831185, + "grad_norm": 0.330751895904541, + "learning_rate": 5.141145147371102e-05, + "loss": 1.6818, + "step": 16496 + }, + { + "epoch": 5.06353591160221, + "grad_norm": 0.21613973379135132, + "learning_rate": 5.140648288774518e-05, + "loss": 1.7914, + "step": 16497 + }, + { + "epoch": 5.063842848373235, + "grad_norm": 0.32759732007980347, + "learning_rate": 5.140151428787966e-05, + "loss": 1.7543, + "step": 16498 + }, + { + "epoch": 5.06414978514426, + "grad_norm": 0.3180293142795563, + "learning_rate": 5.1396545674163556e-05, + "loss": 1.8163, + "step": 16499 + }, + { + "epoch": 5.064456721915286, + "grad_norm": 0.19757944345474243, + "learning_rate": 5.1391577046645964e-05, + "loss": 1.71, + "step": 16500 + }, + { + "epoch": 5.064763658686311, + "grad_norm": 0.253366619348526, + "learning_rate": 5.1386608405376005e-05, + "loss": 1.7266, + "step": 16501 + }, + { + "epoch": 5.065070595457335, + "grad_norm": 0.24577608704566956, + "learning_rate": 5.1381639750402754e-05, + "loss": 1.7218, + "step": 16502 + }, + { + "epoch": 5.065377532228361, + "grad_norm": 0.22847014665603638, + "learning_rate": 5.137667108177533e-05, + "loss": 1.8025, + "step": 16503 + }, + { + "epoch": 5.065684468999386, + "grad_norm": 0.2089833766222, + "learning_rate": 5.137170239954284e-05, + "loss": 1.8032, + "step": 16504 + }, + { + "epoch": 5.065991405770411, + "grad_norm": 0.21528512239456177, + "learning_rate": 5.136673370375439e-05, + "loss": 1.7227, + "step": 16505 + }, + { + "epoch": 5.066298342541437, + "grad_norm": 0.2099117785692215, + "learning_rate": 5.1361764994459074e-05, + "loss": 1.7176, + "step": 16506 + }, + { + 
"epoch": 5.066605279312462, + "grad_norm": 0.2140430212020874, + "learning_rate": 5.135679627170599e-05, + "loss": 1.8195, + "step": 16507 + }, + { + "epoch": 5.0669122160834865, + "grad_norm": 0.20253533124923706, + "learning_rate": 5.135182753554424e-05, + "loss": 1.7284, + "step": 16508 + }, + { + "epoch": 5.067219152854512, + "grad_norm": 0.19945639371871948, + "learning_rate": 5.134685878602295e-05, + "loss": 1.6915, + "step": 16509 + }, + { + "epoch": 5.067526089625537, + "grad_norm": 0.20138494670391083, + "learning_rate": 5.1341890023191216e-05, + "loss": 1.7856, + "step": 16510 + }, + { + "epoch": 5.0678330263965625, + "grad_norm": 0.22124232351779938, + "learning_rate": 5.1336921247098136e-05, + "loss": 1.7674, + "step": 16511 + }, + { + "epoch": 5.068139963167588, + "grad_norm": 0.21564216911792755, + "learning_rate": 5.133195245779282e-05, + "loss": 1.6998, + "step": 16512 + }, + { + "epoch": 5.068446899938612, + "grad_norm": 0.21836799383163452, + "learning_rate": 5.1326983655324365e-05, + "loss": 1.7468, + "step": 16513 + }, + { + "epoch": 5.068753836709638, + "grad_norm": 0.2412201464176178, + "learning_rate": 5.132201483974187e-05, + "loss": 1.7433, + "step": 16514 + }, + { + "epoch": 5.069060773480663, + "grad_norm": 0.262054979801178, + "learning_rate": 5.131704601109446e-05, + "loss": 1.8315, + "step": 16515 + }, + { + "epoch": 5.069367710251688, + "grad_norm": 0.21573080122470856, + "learning_rate": 5.1312077169431225e-05, + "loss": 1.7668, + "step": 16516 + }, + { + "epoch": 5.069674647022714, + "grad_norm": 0.21407057344913483, + "learning_rate": 5.130710831480129e-05, + "loss": 1.7486, + "step": 16517 + }, + { + "epoch": 5.069981583793738, + "grad_norm": 0.2128407508134842, + "learning_rate": 5.130213944725373e-05, + "loss": 1.7618, + "step": 16518 + }, + { + "epoch": 5.070288520564763, + "grad_norm": 0.2034141719341278, + "learning_rate": 5.129717056683767e-05, + "loss": 1.726, + "step": 16519 + }, + { + "epoch": 5.070595457335789, + 
"grad_norm": 0.21474458277225494, + "learning_rate": 5.1292201673602205e-05, + "loss": 1.7883, + "step": 16520 + }, + { + "epoch": 5.070902394106814, + "grad_norm": 0.2102673202753067, + "learning_rate": 5.128723276759645e-05, + "loss": 1.7826, + "step": 16521 + }, + { + "epoch": 5.071209330877839, + "grad_norm": 0.21342496573925018, + "learning_rate": 5.1282263848869505e-05, + "loss": 1.7561, + "step": 16522 + }, + { + "epoch": 5.071516267648865, + "grad_norm": 0.21749620139598846, + "learning_rate": 5.1277294917470474e-05, + "loss": 1.7814, + "step": 16523 + }, + { + "epoch": 5.071823204419889, + "grad_norm": 0.20006774365901947, + "learning_rate": 5.1272325973448476e-05, + "loss": 1.6965, + "step": 16524 + }, + { + "epoch": 5.0721301411909145, + "grad_norm": 0.20878590643405914, + "learning_rate": 5.1267357016852593e-05, + "loss": 1.7426, + "step": 16525 + }, + { + "epoch": 5.07243707796194, + "grad_norm": 0.21824820339679718, + "learning_rate": 5.1262388047731946e-05, + "loss": 1.7704, + "step": 16526 + }, + { + "epoch": 5.072744014732965, + "grad_norm": 0.1992526650428772, + "learning_rate": 5.125741906613565e-05, + "loss": 1.7874, + "step": 16527 + }, + { + "epoch": 5.0730509515039905, + "grad_norm": 0.21028028428554535, + "learning_rate": 5.12524500721128e-05, + "loss": 1.7483, + "step": 16528 + }, + { + "epoch": 5.073357888275015, + "grad_norm": 0.21840833127498627, + "learning_rate": 5.12474810657125e-05, + "loss": 1.7763, + "step": 16529 + }, + { + "epoch": 5.07366482504604, + "grad_norm": 0.249269038438797, + "learning_rate": 5.124251204698387e-05, + "loss": 1.7451, + "step": 16530 + }, + { + "epoch": 5.073971761817066, + "grad_norm": 0.2176963835954666, + "learning_rate": 5.1237543015975986e-05, + "loss": 1.7079, + "step": 16531 + }, + { + "epoch": 5.074278698588091, + "grad_norm": 0.20284616947174072, + "learning_rate": 5.1232573972738e-05, + "loss": 1.7235, + "step": 16532 + }, + { + "epoch": 5.074585635359116, + "grad_norm": 0.20140530169010162, + 
"learning_rate": 5.1227604917318984e-05, + "loss": 1.7014, + "step": 16533 + }, + { + "epoch": 5.074892572130141, + "grad_norm": 0.2407023161649704, + "learning_rate": 5.1222635849768066e-05, + "loss": 1.7493, + "step": 16534 + }, + { + "epoch": 5.075199508901166, + "grad_norm": 0.2013770490884781, + "learning_rate": 5.121766677013433e-05, + "loss": 1.7601, + "step": 16535 + }, + { + "epoch": 5.0755064456721914, + "grad_norm": 0.23889221251010895, + "learning_rate": 5.1212697678466916e-05, + "loss": 1.7282, + "step": 16536 + }, + { + "epoch": 5.075813382443217, + "grad_norm": 0.2411198765039444, + "learning_rate": 5.120772857481489e-05, + "loss": 1.8138, + "step": 16537 + }, + { + "epoch": 5.076120319214242, + "grad_norm": 0.24521365761756897, + "learning_rate": 5.12027594592274e-05, + "loss": 1.7659, + "step": 16538 + }, + { + "epoch": 5.0764272559852675, + "grad_norm": 0.2841372787952423, + "learning_rate": 5.119779033175354e-05, + "loss": 1.7973, + "step": 16539 + }, + { + "epoch": 5.076734192756292, + "grad_norm": 0.21796928346157074, + "learning_rate": 5.1192821192442395e-05, + "loss": 1.6985, + "step": 16540 + }, + { + "epoch": 5.077041129527317, + "grad_norm": 0.2244848757982254, + "learning_rate": 5.118785204134311e-05, + "loss": 1.7413, + "step": 16541 + }, + { + "epoch": 5.077348066298343, + "grad_norm": 0.22581063210964203, + "learning_rate": 5.1182882878504766e-05, + "loss": 1.7706, + "step": 16542 + }, + { + "epoch": 5.077655003069368, + "grad_norm": 0.24478016793727875, + "learning_rate": 5.117791370397647e-05, + "loss": 1.7628, + "step": 16543 + }, + { + "epoch": 5.077961939840393, + "grad_norm": 0.31270188093185425, + "learning_rate": 5.117294451780734e-05, + "loss": 1.8254, + "step": 16544 + }, + { + "epoch": 5.078268876611418, + "grad_norm": 0.3547368049621582, + "learning_rate": 5.11679753200465e-05, + "loss": 1.781, + "step": 16545 + }, + { + "epoch": 5.078575813382443, + "grad_norm": 0.24920180439949036, + "learning_rate": 
5.116300611074304e-05, + "loss": 1.7748, + "step": 16546 + }, + { + "epoch": 5.078882750153468, + "grad_norm": 0.2368776649236679, + "learning_rate": 5.115803688994607e-05, + "loss": 1.7459, + "step": 16547 + }, + { + "epoch": 5.079189686924494, + "grad_norm": 0.28341975808143616, + "learning_rate": 5.115306765770471e-05, + "loss": 1.6694, + "step": 16548 + }, + { + "epoch": 5.079496623695519, + "grad_norm": 0.2521432936191559, + "learning_rate": 5.114809841406804e-05, + "loss": 1.7544, + "step": 16549 + }, + { + "epoch": 5.0798035604665435, + "grad_norm": 0.21199844777584076, + "learning_rate": 5.11431291590852e-05, + "loss": 1.7215, + "step": 16550 + }, + { + "epoch": 5.080110497237569, + "grad_norm": 0.25157347321510315, + "learning_rate": 5.113815989280528e-05, + "loss": 1.8021, + "step": 16551 + }, + { + "epoch": 5.080417434008594, + "grad_norm": 0.2284129559993744, + "learning_rate": 5.1133190615277414e-05, + "loss": 1.7125, + "step": 16552 + }, + { + "epoch": 5.0807243707796195, + "grad_norm": 0.2297726720571518, + "learning_rate": 5.11282213265507e-05, + "loss": 1.7602, + "step": 16553 + }, + { + "epoch": 5.081031307550645, + "grad_norm": 0.22392617166042328, + "learning_rate": 5.112325202667421e-05, + "loss": 1.7251, + "step": 16554 + }, + { + "epoch": 5.08133824432167, + "grad_norm": 0.22406147420406342, + "learning_rate": 5.11182827156971e-05, + "loss": 1.7232, + "step": 16555 + }, + { + "epoch": 5.081645181092695, + "grad_norm": 0.2547284960746765, + "learning_rate": 5.111331339366846e-05, + "loss": 1.7335, + "step": 16556 + }, + { + "epoch": 5.08195211786372, + "grad_norm": 0.216146782040596, + "learning_rate": 5.1108344060637415e-05, + "loss": 1.7469, + "step": 16557 + }, + { + "epoch": 5.082259054634745, + "grad_norm": 0.1926967352628708, + "learning_rate": 5.110337471665306e-05, + "loss": 1.7492, + "step": 16558 + }, + { + "epoch": 5.082565991405771, + "grad_norm": 0.30311331152915955, + "learning_rate": 5.109840536176451e-05, + "loss": 1.8129, + 
"step": 16559 + }, + { + "epoch": 5.082872928176796, + "grad_norm": 0.24273787438869476, + "learning_rate": 5.109343599602087e-05, + "loss": 1.7206, + "step": 16560 + }, + { + "epoch": 5.08317986494782, + "grad_norm": 0.22736592590808868, + "learning_rate": 5.1088466619471255e-05, + "loss": 1.732, + "step": 16561 + }, + { + "epoch": 5.083486801718846, + "grad_norm": 0.21457640826702118, + "learning_rate": 5.1083497232164777e-05, + "loss": 1.726, + "step": 16562 + }, + { + "epoch": 5.083793738489871, + "grad_norm": 0.20968590676784515, + "learning_rate": 5.107852783415055e-05, + "loss": 1.8095, + "step": 16563 + }, + { + "epoch": 5.084100675260896, + "grad_norm": 0.2846728265285492, + "learning_rate": 5.107355842547768e-05, + "loss": 1.7524, + "step": 16564 + }, + { + "epoch": 5.084407612031922, + "grad_norm": 0.21162885427474976, + "learning_rate": 5.106858900619526e-05, + "loss": 1.753, + "step": 16565 + }, + { + "epoch": 5.084714548802946, + "grad_norm": 0.24349012970924377, + "learning_rate": 5.106361957635242e-05, + "loss": 1.7003, + "step": 16566 + }, + { + "epoch": 5.0850214855739715, + "grad_norm": 0.24532537162303925, + "learning_rate": 5.105865013599828e-05, + "loss": 1.7818, + "step": 16567 + }, + { + "epoch": 5.085328422344997, + "grad_norm": 0.22788558900356293, + "learning_rate": 5.1053680685181926e-05, + "loss": 1.7291, + "step": 16568 + }, + { + "epoch": 5.085635359116022, + "grad_norm": 0.22402508556842804, + "learning_rate": 5.10487112239525e-05, + "loss": 1.8292, + "step": 16569 + }, + { + "epoch": 5.0859422958870475, + "grad_norm": 0.2396162748336792, + "learning_rate": 5.1043741752359085e-05, + "loss": 1.7441, + "step": 16570 + }, + { + "epoch": 5.086249232658073, + "grad_norm": 0.22364887595176697, + "learning_rate": 5.1038772270450796e-05, + "loss": 1.7356, + "step": 16571 + }, + { + "epoch": 5.086556169429097, + "grad_norm": 0.20385414361953735, + "learning_rate": 5.103380277827676e-05, + "loss": 1.774, + "step": 16572 + }, + { + "epoch": 
5.086863106200123, + "grad_norm": 0.2050715535879135, + "learning_rate": 5.102883327588608e-05, + "loss": 1.7217, + "step": 16573 + }, + { + "epoch": 5.087170042971148, + "grad_norm": 0.23750410974025726, + "learning_rate": 5.102386376332786e-05, + "loss": 1.7605, + "step": 16574 + }, + { + "epoch": 5.087476979742173, + "grad_norm": 0.24313338100910187, + "learning_rate": 5.101889424065122e-05, + "loss": 1.7498, + "step": 16575 + }, + { + "epoch": 5.087783916513199, + "grad_norm": 0.22145850956439972, + "learning_rate": 5.101392470790527e-05, + "loss": 1.7827, + "step": 16576 + }, + { + "epoch": 5.088090853284223, + "grad_norm": 0.23073779046535492, + "learning_rate": 5.100895516513912e-05, + "loss": 1.7722, + "step": 16577 + }, + { + "epoch": 5.088397790055248, + "grad_norm": 0.2112295925617218, + "learning_rate": 5.100398561240188e-05, + "loss": 1.7755, + "step": 16578 + }, + { + "epoch": 5.088704726826274, + "grad_norm": 0.23263800144195557, + "learning_rate": 5.0999016049742675e-05, + "loss": 1.7593, + "step": 16579 + }, + { + "epoch": 5.089011663597299, + "grad_norm": 0.23011381924152374, + "learning_rate": 5.09940464772106e-05, + "loss": 1.704, + "step": 16580 + }, + { + "epoch": 5.089318600368324, + "grad_norm": 0.1930779367685318, + "learning_rate": 5.0989076894854785e-05, + "loss": 1.7038, + "step": 16581 + }, + { + "epoch": 5.08962553713935, + "grad_norm": 0.2100505381822586, + "learning_rate": 5.098410730272433e-05, + "loss": 1.7671, + "step": 16582 + }, + { + "epoch": 5.089932473910374, + "grad_norm": 0.1919277459383011, + "learning_rate": 5.097913770086833e-05, + "loss": 1.651, + "step": 16583 + }, + { + "epoch": 5.0902394106813995, + "grad_norm": 0.23310615122318268, + "learning_rate": 5.097416808933594e-05, + "loss": 1.8294, + "step": 16584 + }, + { + "epoch": 5.090546347452425, + "grad_norm": 0.26191771030426025, + "learning_rate": 5.096919846817624e-05, + "loss": 1.7522, + "step": 16585 + }, + { + "epoch": 5.09085328422345, + "grad_norm": 
0.2508419156074524, + "learning_rate": 5.096422883743835e-05, + "loss": 1.8025, + "step": 16586 + }, + { + "epoch": 5.0911602209944755, + "grad_norm": 0.23192499577999115, + "learning_rate": 5.0959259197171414e-05, + "loss": 1.7885, + "step": 16587 + }, + { + "epoch": 5.0914671577655, + "grad_norm": 0.2164602279663086, + "learning_rate": 5.095428954742448e-05, + "loss": 1.7299, + "step": 16588 + }, + { + "epoch": 5.091774094536525, + "grad_norm": 0.21431668102741241, + "learning_rate": 5.094931988824671e-05, + "loss": 1.7122, + "step": 16589 + }, + { + "epoch": 5.092081031307551, + "grad_norm": 0.20563583076000214, + "learning_rate": 5.094435021968722e-05, + "loss": 1.7118, + "step": 16590 + }, + { + "epoch": 5.092387968078576, + "grad_norm": 0.20916326344013214, + "learning_rate": 5.093938054179509e-05, + "loss": 1.7639, + "step": 16591 + }, + { + "epoch": 5.092694904849601, + "grad_norm": 0.21197481453418732, + "learning_rate": 5.0934410854619454e-05, + "loss": 1.7357, + "step": 16592 + }, + { + "epoch": 5.093001841620626, + "grad_norm": 0.21085995435714722, + "learning_rate": 5.092944115820942e-05, + "loss": 1.6921, + "step": 16593 + }, + { + "epoch": 5.093308778391651, + "grad_norm": 0.2608145773410797, + "learning_rate": 5.09244714526141e-05, + "loss": 1.7541, + "step": 16594 + }, + { + "epoch": 5.093615715162676, + "grad_norm": 0.2138587087392807, + "learning_rate": 5.0919501737882624e-05, + "loss": 1.727, + "step": 16595 + }, + { + "epoch": 5.093922651933702, + "grad_norm": 0.230251282453537, + "learning_rate": 5.0914532014064084e-05, + "loss": 1.7828, + "step": 16596 + }, + { + "epoch": 5.094229588704727, + "grad_norm": 0.2162851244211197, + "learning_rate": 5.0909562281207614e-05, + "loss": 1.6905, + "step": 16597 + }, + { + "epoch": 5.094536525475752, + "grad_norm": 0.20637664198875427, + "learning_rate": 5.090459253936231e-05, + "loss": 1.7484, + "step": 16598 + }, + { + "epoch": 5.094843462246777, + "grad_norm": 0.19427815079689026, + "learning_rate": 
5.089962278857728e-05, + "loss": 1.7379, + "step": 16599 + }, + { + "epoch": 5.095150399017802, + "grad_norm": 0.1877593845129013, + "learning_rate": 5.089465302890165e-05, + "loss": 1.7017, + "step": 16600 + }, + { + "epoch": 5.0954573357888275, + "grad_norm": 0.19219037890434265, + "learning_rate": 5.0889683260384543e-05, + "loss": 1.7379, + "step": 16601 + }, + { + "epoch": 5.095764272559853, + "grad_norm": 0.19855685532093048, + "learning_rate": 5.088471348307507e-05, + "loss": 1.7171, + "step": 16602 + }, + { + "epoch": 5.096071209330878, + "grad_norm": 0.19119660556316376, + "learning_rate": 5.087974369702235e-05, + "loss": 1.6912, + "step": 16603 + }, + { + "epoch": 5.096378146101903, + "grad_norm": 0.2102670818567276, + "learning_rate": 5.0874773902275476e-05, + "loss": 1.6825, + "step": 16604 + }, + { + "epoch": 5.096685082872928, + "grad_norm": 0.2120765596628189, + "learning_rate": 5.0869804098883564e-05, + "loss": 1.7055, + "step": 16605 + }, + { + "epoch": 5.096992019643953, + "grad_norm": 0.25874772667884827, + "learning_rate": 5.0864834286895745e-05, + "loss": 1.7193, + "step": 16606 + }, + { + "epoch": 5.097298956414979, + "grad_norm": 0.20822012424468994, + "learning_rate": 5.085986446636113e-05, + "loss": 1.6748, + "step": 16607 + }, + { + "epoch": 5.097605893186004, + "grad_norm": 0.21364718675613403, + "learning_rate": 5.085489463732883e-05, + "loss": 1.7762, + "step": 16608 + }, + { + "epoch": 5.097912829957028, + "grad_norm": 0.21961788833141327, + "learning_rate": 5.084992479984796e-05, + "loss": 1.7243, + "step": 16609 + }, + { + "epoch": 5.098219766728054, + "grad_norm": 0.22056026756763458, + "learning_rate": 5.0844954953967624e-05, + "loss": 1.6983, + "step": 16610 + }, + { + "epoch": 5.098526703499079, + "grad_norm": 0.21347738802433014, + "learning_rate": 5.083998509973695e-05, + "loss": 1.7319, + "step": 16611 + }, + { + "epoch": 5.098833640270104, + "grad_norm": 0.23593664169311523, + "learning_rate": 5.083501523720506e-05, + "loss": 
1.7121, + "step": 16612 + }, + { + "epoch": 5.09914057704113, + "grad_norm": 0.2088623344898224, + "learning_rate": 5.0830045366421055e-05, + "loss": 1.72, + "step": 16613 + }, + { + "epoch": 5.099447513812155, + "grad_norm": 0.2293832004070282, + "learning_rate": 5.082507548743406e-05, + "loss": 1.7548, + "step": 16614 + }, + { + "epoch": 5.0997544505831796, + "grad_norm": 0.2509057819843292, + "learning_rate": 5.082010560029319e-05, + "loss": 1.7729, + "step": 16615 + }, + { + "epoch": 5.100061387354205, + "grad_norm": 0.1925390362739563, + "learning_rate": 5.081513570504755e-05, + "loss": 1.7109, + "step": 16616 + }, + { + "epoch": 5.10036832412523, + "grad_norm": 0.20876559615135193, + "learning_rate": 5.081016580174626e-05, + "loss": 1.7031, + "step": 16617 + }, + { + "epoch": 5.100675260896256, + "grad_norm": 0.2038683146238327, + "learning_rate": 5.080519589043842e-05, + "loss": 1.7489, + "step": 16618 + }, + { + "epoch": 5.100982197667281, + "grad_norm": 0.25018224120140076, + "learning_rate": 5.080022597117318e-05, + "loss": 1.7884, + "step": 16619 + }, + { + "epoch": 5.101289134438305, + "grad_norm": 0.24430342018604279, + "learning_rate": 5.079525604399965e-05, + "loss": 1.7558, + "step": 16620 + }, + { + "epoch": 5.101596071209331, + "grad_norm": 0.22151432931423187, + "learning_rate": 5.079028610896692e-05, + "loss": 1.7543, + "step": 16621 + }, + { + "epoch": 5.101903007980356, + "grad_norm": 0.2313055694103241, + "learning_rate": 5.0785316166124107e-05, + "loss": 1.7755, + "step": 16622 + }, + { + "epoch": 5.102209944751381, + "grad_norm": 0.27405816316604614, + "learning_rate": 5.0780346215520355e-05, + "loss": 1.7006, + "step": 16623 + }, + { + "epoch": 5.102516881522407, + "grad_norm": 0.2209920734167099, + "learning_rate": 5.077537625720476e-05, + "loss": 1.6877, + "step": 16624 + }, + { + "epoch": 5.102823818293431, + "grad_norm": 0.20993784070014954, + "learning_rate": 5.077040629122645e-05, + "loss": 1.7558, + "step": 16625 + }, + { + "epoch": 
5.1031307550644565, + "grad_norm": 0.25554344058036804, + "learning_rate": 5.076543631763453e-05, + "loss": 1.7142, + "step": 16626 + }, + { + "epoch": 5.103437691835482, + "grad_norm": 0.28980588912963867, + "learning_rate": 5.0760466336478116e-05, + "loss": 1.7632, + "step": 16627 + }, + { + "epoch": 5.103744628606507, + "grad_norm": 0.20144744217395782, + "learning_rate": 5.075549634780633e-05, + "loss": 1.7472, + "step": 16628 + }, + { + "epoch": 5.1040515653775325, + "grad_norm": 0.30335596203804016, + "learning_rate": 5.075052635166827e-05, + "loss": 1.7283, + "step": 16629 + }, + { + "epoch": 5.104358502148558, + "grad_norm": 0.3014097213745117, + "learning_rate": 5.074555634811309e-05, + "loss": 1.7273, + "step": 16630 + }, + { + "epoch": 5.104665438919582, + "grad_norm": 0.20123563706874847, + "learning_rate": 5.074058633718988e-05, + "loss": 1.7119, + "step": 16631 + }, + { + "epoch": 5.104972375690608, + "grad_norm": 0.3375137746334076, + "learning_rate": 5.073561631894776e-05, + "loss": 1.7594, + "step": 16632 + }, + { + "epoch": 5.105279312461633, + "grad_norm": 0.3471776247024536, + "learning_rate": 5.0730646293435846e-05, + "loss": 1.729, + "step": 16633 + }, + { + "epoch": 5.105586249232658, + "grad_norm": 0.26405471563339233, + "learning_rate": 5.072567626070327e-05, + "loss": 1.7472, + "step": 16634 + }, + { + "epoch": 5.105893186003684, + "grad_norm": 0.2339334636926651, + "learning_rate": 5.072070622079911e-05, + "loss": 1.7285, + "step": 16635 + }, + { + "epoch": 5.106200122774708, + "grad_norm": 0.26267752051353455, + "learning_rate": 5.0715736173772534e-05, + "loss": 1.7171, + "step": 16636 + }, + { + "epoch": 5.106507059545733, + "grad_norm": 0.22254765033721924, + "learning_rate": 5.0710766119672626e-05, + "loss": 1.7702, + "step": 16637 + }, + { + "epoch": 5.106813996316759, + "grad_norm": 0.2457888424396515, + "learning_rate": 5.070579605854852e-05, + "loss": 1.7987, + "step": 16638 + }, + { + "epoch": 5.107120933087784, + "grad_norm": 
0.24500930309295654, + "learning_rate": 5.070082599044931e-05, + "loss": 1.8103, + "step": 16639 + }, + { + "epoch": 5.107427869858809, + "grad_norm": 0.24446405470371246, + "learning_rate": 5.0695855915424116e-05, + "loss": 1.7058, + "step": 16640 + }, + { + "epoch": 5.107734806629834, + "grad_norm": 0.22352534532546997, + "learning_rate": 5.0690885833522086e-05, + "loss": 1.7503, + "step": 16641 + }, + { + "epoch": 5.108041743400859, + "grad_norm": 0.2308795005083084, + "learning_rate": 5.068591574479231e-05, + "loss": 1.8064, + "step": 16642 + }, + { + "epoch": 5.1083486801718845, + "grad_norm": 0.23804180324077606, + "learning_rate": 5.068094564928392e-05, + "loss": 1.7603, + "step": 16643 + }, + { + "epoch": 5.10865561694291, + "grad_norm": 0.1956508308649063, + "learning_rate": 5.0675975547046016e-05, + "loss": 1.7448, + "step": 16644 + }, + { + "epoch": 5.108962553713935, + "grad_norm": 0.24438725411891937, + "learning_rate": 5.067100543812773e-05, + "loss": 1.7706, + "step": 16645 + }, + { + "epoch": 5.1092694904849605, + "grad_norm": 0.26129621267318726, + "learning_rate": 5.066603532257817e-05, + "loss": 1.7321, + "step": 16646 + }, + { + "epoch": 5.109576427255985, + "grad_norm": 0.2024240493774414, + "learning_rate": 5.066106520044646e-05, + "loss": 1.7033, + "step": 16647 + }, + { + "epoch": 5.10988336402701, + "grad_norm": 0.2096802294254303, + "learning_rate": 5.0656095071781716e-05, + "loss": 1.716, + "step": 16648 + }, + { + "epoch": 5.110190300798036, + "grad_norm": 0.20643317699432373, + "learning_rate": 5.0651124936633054e-05, + "loss": 1.7473, + "step": 16649 + }, + { + "epoch": 5.110497237569061, + "grad_norm": 0.2268853783607483, + "learning_rate": 5.0646154795049604e-05, + "loss": 1.7844, + "step": 16650 + }, + { + "epoch": 5.110804174340086, + "grad_norm": 0.20215095579624176, + "learning_rate": 5.064118464708046e-05, + "loss": 1.7138, + "step": 16651 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.19411569833755493, + 
"learning_rate": 5.063621449277476e-05, + "loss": 1.7526, + "step": 16652 + }, + { + "epoch": 5.111418047882136, + "grad_norm": 0.20199783146381378, + "learning_rate": 5.063124433218161e-05, + "loss": 1.806, + "step": 16653 + }, + { + "epoch": 5.111724984653161, + "grad_norm": 0.23351836204528809, + "learning_rate": 5.0626274165350165e-05, + "loss": 1.7529, + "step": 16654 + }, + { + "epoch": 5.112031921424187, + "grad_norm": 0.21098989248275757, + "learning_rate": 5.062130399232948e-05, + "loss": 1.7647, + "step": 16655 + }, + { + "epoch": 5.112338858195212, + "grad_norm": 0.21959169209003448, + "learning_rate": 5.0616333813168714e-05, + "loss": 1.7462, + "step": 16656 + }, + { + "epoch": 5.112645794966237, + "grad_norm": 0.21173696219921112, + "learning_rate": 5.061136362791696e-05, + "loss": 1.7413, + "step": 16657 + }, + { + "epoch": 5.112952731737262, + "grad_norm": 0.22357577085494995, + "learning_rate": 5.0606393436623365e-05, + "loss": 1.7163, + "step": 16658 + }, + { + "epoch": 5.113259668508287, + "grad_norm": 0.24364936351776123, + "learning_rate": 5.060142323933704e-05, + "loss": 1.8139, + "step": 16659 + }, + { + "epoch": 5.1135666052793125, + "grad_norm": 0.21646073460578918, + "learning_rate": 5.05964530361071e-05, + "loss": 1.741, + "step": 16660 + }, + { + "epoch": 5.113873542050338, + "grad_norm": 0.24261775612831116, + "learning_rate": 5.059148282698265e-05, + "loss": 1.7162, + "step": 16661 + }, + { + "epoch": 5.114180478821363, + "grad_norm": 0.22883281111717224, + "learning_rate": 5.058651261201283e-05, + "loss": 1.7342, + "step": 16662 + }, + { + "epoch": 5.114487415592388, + "grad_norm": 0.2616727352142334, + "learning_rate": 5.058154239124674e-05, + "loss": 1.8054, + "step": 16663 + }, + { + "epoch": 5.114794352363413, + "grad_norm": 0.21293358504772186, + "learning_rate": 5.0576572164733505e-05, + "loss": 1.742, + "step": 16664 + }, + { + "epoch": 5.115101289134438, + "grad_norm": 0.20037685334682465, + "learning_rate": 
5.057160193252225e-05, + "loss": 1.7518, + "step": 16665 + }, + { + "epoch": 5.115408225905464, + "grad_norm": 0.19102689623832703, + "learning_rate": 5.056663169466209e-05, + "loss": 1.6892, + "step": 16666 + }, + { + "epoch": 5.115715162676489, + "grad_norm": 0.22261591255664825, + "learning_rate": 5.056166145120216e-05, + "loss": 1.7744, + "step": 16667 + }, + { + "epoch": 5.116022099447513, + "grad_norm": 0.23966702818870544, + "learning_rate": 5.055669120219154e-05, + "loss": 1.7786, + "step": 16668 + }, + { + "epoch": 5.116329036218539, + "grad_norm": 0.22008271515369415, + "learning_rate": 5.055172094767937e-05, + "loss": 1.7501, + "step": 16669 + }, + { + "epoch": 5.116635972989564, + "grad_norm": 0.21643415093421936, + "learning_rate": 5.054675068771478e-05, + "loss": 1.7548, + "step": 16670 + }, + { + "epoch": 5.116942909760589, + "grad_norm": 0.24661116302013397, + "learning_rate": 5.0541780422346894e-05, + "loss": 1.8117, + "step": 16671 + }, + { + "epoch": 5.117249846531615, + "grad_norm": 0.21393093466758728, + "learning_rate": 5.05368101516248e-05, + "loss": 1.7341, + "step": 16672 + }, + { + "epoch": 5.11755678330264, + "grad_norm": 0.30949896574020386, + "learning_rate": 5.053183987559763e-05, + "loss": 1.7703, + "step": 16673 + }, + { + "epoch": 5.1178637200736645, + "grad_norm": 0.22236786782741547, + "learning_rate": 5.052686959431451e-05, + "loss": 1.719, + "step": 16674 + }, + { + "epoch": 5.11817065684469, + "grad_norm": 0.26826921105384827, + "learning_rate": 5.052189930782455e-05, + "loss": 1.741, + "step": 16675 + }, + { + "epoch": 5.118477593615715, + "grad_norm": 0.2608947455883026, + "learning_rate": 5.051692901617688e-05, + "loss": 1.7062, + "step": 16676 + }, + { + "epoch": 5.1187845303867405, + "grad_norm": 0.20709002017974854, + "learning_rate": 5.051195871942063e-05, + "loss": 1.703, + "step": 16677 + }, + { + "epoch": 5.119091467157766, + "grad_norm": 0.18957734107971191, + "learning_rate": 5.0506988417604885e-05, + "loss": 1.762, 
+ "step": 16678 + }, + { + "epoch": 5.11939840392879, + "grad_norm": 0.21578781306743622, + "learning_rate": 5.050201811077879e-05, + "loss": 1.7167, + "step": 16679 + }, + { + "epoch": 5.119705340699816, + "grad_norm": 0.2253631353378296, + "learning_rate": 5.049704779899145e-05, + "loss": 1.7374, + "step": 16680 + }, + { + "epoch": 5.120012277470841, + "grad_norm": 0.1977664828300476, + "learning_rate": 5.049207748229199e-05, + "loss": 1.7399, + "step": 16681 + }, + { + "epoch": 5.120319214241866, + "grad_norm": 0.2964428663253784, + "learning_rate": 5.048710716072954e-05, + "loss": 1.8359, + "step": 16682 + }, + { + "epoch": 5.120626151012892, + "grad_norm": 0.24788637459278107, + "learning_rate": 5.0482136834353224e-05, + "loss": 1.7593, + "step": 16683 + }, + { + "epoch": 5.120933087783916, + "grad_norm": 0.21537743508815765, + "learning_rate": 5.0477166503212135e-05, + "loss": 1.7472, + "step": 16684 + }, + { + "epoch": 5.121240024554941, + "grad_norm": 0.2055196613073349, + "learning_rate": 5.047219616735541e-05, + "loss": 1.7106, + "step": 16685 + }, + { + "epoch": 5.121546961325967, + "grad_norm": 0.19770687818527222, + "learning_rate": 5.046722582683215e-05, + "loss": 1.6887, + "step": 16686 + }, + { + "epoch": 5.121853898096992, + "grad_norm": 0.20407389104366302, + "learning_rate": 5.046225548169151e-05, + "loss": 1.7412, + "step": 16687 + }, + { + "epoch": 5.122160834868017, + "grad_norm": 0.20153474807739258, + "learning_rate": 5.045728513198259e-05, + "loss": 1.7643, + "step": 16688 + }, + { + "epoch": 5.122467771639043, + "grad_norm": 0.18737752735614777, + "learning_rate": 5.045231477775452e-05, + "loss": 1.763, + "step": 16689 + }, + { + "epoch": 5.122774708410067, + "grad_norm": 0.19790658354759216, + "learning_rate": 5.0447344419056385e-05, + "loss": 1.7446, + "step": 16690 + }, + { + "epoch": 5.1230816451810925, + "grad_norm": 0.21496973931789398, + "learning_rate": 5.0442374055937336e-05, + "loss": 1.7756, + "step": 16691 + }, + { + "epoch": 
5.123388581952118, + "grad_norm": 0.19318655133247375, + "learning_rate": 5.043740368844649e-05, + "loss": 1.7687, + "step": 16692 + }, + { + "epoch": 5.123695518723143, + "grad_norm": 0.2237338423728943, + "learning_rate": 5.0432433316632976e-05, + "loss": 1.7258, + "step": 16693 + }, + { + "epoch": 5.1240024554941686, + "grad_norm": 0.2257162630558014, + "learning_rate": 5.042746294054589e-05, + "loss": 1.7462, + "step": 16694 + }, + { + "epoch": 5.124309392265193, + "grad_norm": 0.25666359066963196, + "learning_rate": 5.0422492560234366e-05, + "loss": 1.7318, + "step": 16695 + }, + { + "epoch": 5.124616329036218, + "grad_norm": 0.2615324556827545, + "learning_rate": 5.0417522175747536e-05, + "loss": 1.7533, + "step": 16696 + }, + { + "epoch": 5.124923265807244, + "grad_norm": 0.2372874766588211, + "learning_rate": 5.0412551787134475e-05, + "loss": 1.7361, + "step": 16697 + }, + { + "epoch": 5.125230202578269, + "grad_norm": 0.25976815819740295, + "learning_rate": 5.040758139444436e-05, + "loss": 1.7542, + "step": 16698 + }, + { + "epoch": 5.125537139349294, + "grad_norm": 0.36173003911972046, + "learning_rate": 5.040261099772629e-05, + "loss": 1.7421, + "step": 16699 + }, + { + "epoch": 5.12584407612032, + "grad_norm": 0.2767728269100189, + "learning_rate": 5.039764059702937e-05, + "loss": 1.7341, + "step": 16700 + }, + { + "epoch": 5.126151012891344, + "grad_norm": 0.20185241103172302, + "learning_rate": 5.039267019240275e-05, + "loss": 1.7068, + "step": 16701 + }, + { + "epoch": 5.1264579496623695, + "grad_norm": 0.26872581243515015, + "learning_rate": 5.0387699783895514e-05, + "loss": 1.7404, + "step": 16702 + }, + { + "epoch": 5.126764886433395, + "grad_norm": 0.2867858111858368, + "learning_rate": 5.038272937155682e-05, + "loss": 1.7702, + "step": 16703 + }, + { + "epoch": 5.12707182320442, + "grad_norm": 0.20939521491527557, + "learning_rate": 5.037775895543574e-05, + "loss": 1.7653, + "step": 16704 + }, + { + "epoch": 5.1273787599754455, + "grad_norm": 
0.2674047648906708, + "learning_rate": 5.037278853558146e-05, + "loss": 1.701, + "step": 16705 + }, + { + "epoch": 5.12768569674647, + "grad_norm": 0.20776906609535217, + "learning_rate": 5.036781811204304e-05, + "loss": 1.7476, + "step": 16706 + }, + { + "epoch": 5.127992633517495, + "grad_norm": 0.2695952355861664, + "learning_rate": 5.036284768486964e-05, + "loss": 1.7206, + "step": 16707 + }, + { + "epoch": 5.128299570288521, + "grad_norm": 0.30661383271217346, + "learning_rate": 5.0357877254110363e-05, + "loss": 1.72, + "step": 16708 + }, + { + "epoch": 5.128606507059546, + "grad_norm": 0.2527785003185272, + "learning_rate": 5.0352906819814316e-05, + "loss": 1.6936, + "step": 16709 + }, + { + "epoch": 5.128913443830571, + "grad_norm": 0.23000696301460266, + "learning_rate": 5.034793638203066e-05, + "loss": 1.7634, + "step": 16710 + }, + { + "epoch": 5.129220380601596, + "grad_norm": 0.33594760298728943, + "learning_rate": 5.0342965940808486e-05, + "loss": 1.6952, + "step": 16711 + }, + { + "epoch": 5.129527317372621, + "grad_norm": 0.22834168374538422, + "learning_rate": 5.033799549619692e-05, + "loss": 1.7537, + "step": 16712 + }, + { + "epoch": 5.129834254143646, + "grad_norm": 0.26585114002227783, + "learning_rate": 5.033302504824509e-05, + "loss": 1.7554, + "step": 16713 + }, + { + "epoch": 5.130141190914672, + "grad_norm": 0.25632211565971375, + "learning_rate": 5.032805459700211e-05, + "loss": 1.8141, + "step": 16714 + }, + { + "epoch": 5.130448127685697, + "grad_norm": 0.256523996591568, + "learning_rate": 5.0323084142517084e-05, + "loss": 1.777, + "step": 16715 + }, + { + "epoch": 5.1307550644567215, + "grad_norm": 0.31409457325935364, + "learning_rate": 5.0318113684839166e-05, + "loss": 1.7414, + "step": 16716 + }, + { + "epoch": 5.131062001227747, + "grad_norm": 0.21156816184520721, + "learning_rate": 5.0313143224017455e-05, + "loss": 1.7397, + "step": 16717 + }, + { + "epoch": 5.131368937998772, + "grad_norm": 0.23596547544002533, + "learning_rate": 
5.030817276010109e-05, + "loss": 1.752, + "step": 16718 + }, + { + "epoch": 5.1316758747697975, + "grad_norm": 0.2587638199329376, + "learning_rate": 5.0303202293139186e-05, + "loss": 1.7645, + "step": 16719 + }, + { + "epoch": 5.131982811540823, + "grad_norm": 0.2006666213274002, + "learning_rate": 5.029823182318084e-05, + "loss": 1.7009, + "step": 16720 + }, + { + "epoch": 5.132289748311848, + "grad_norm": 0.3075694739818573, + "learning_rate": 5.029326135027521e-05, + "loss": 1.749, + "step": 16721 + }, + { + "epoch": 5.132596685082873, + "grad_norm": 0.3116205334663391, + "learning_rate": 5.028829087447139e-05, + "loss": 1.7458, + "step": 16722 + }, + { + "epoch": 5.132903621853898, + "grad_norm": 0.17925913631916046, + "learning_rate": 5.028332039581851e-05, + "loss": 1.6502, + "step": 16723 + }, + { + "epoch": 5.133210558624923, + "grad_norm": 0.21779952943325043, + "learning_rate": 5.0278349914365694e-05, + "loss": 1.7656, + "step": 16724 + }, + { + "epoch": 5.133517495395949, + "grad_norm": 0.20085318386554718, + "learning_rate": 5.027337943016207e-05, + "loss": 1.7662, + "step": 16725 + }, + { + "epoch": 5.133824432166974, + "grad_norm": 0.19975553452968597, + "learning_rate": 5.026840894325673e-05, + "loss": 1.7392, + "step": 16726 + }, + { + "epoch": 5.134131368937998, + "grad_norm": 0.20610745251178741, + "learning_rate": 5.026343845369883e-05, + "loss": 1.7221, + "step": 16727 + }, + { + "epoch": 5.134438305709024, + "grad_norm": 0.21451768279075623, + "learning_rate": 5.025846796153747e-05, + "loss": 1.8381, + "step": 16728 + }, + { + "epoch": 5.134745242480049, + "grad_norm": 0.19518613815307617, + "learning_rate": 5.0253497466821786e-05, + "loss": 1.7483, + "step": 16729 + }, + { + "epoch": 5.135052179251074, + "grad_norm": 0.24284996092319489, + "learning_rate": 5.024852696960088e-05, + "loss": 1.7895, + "step": 16730 + }, + { + "epoch": 5.1353591160221, + "grad_norm": 0.23962461948394775, + "learning_rate": 5.0243556469923905e-05, + "loss": 
1.8468, + "step": 16731 + }, + { + "epoch": 5.135666052793125, + "grad_norm": 0.20455054938793182, + "learning_rate": 5.023858596783993e-05, + "loss": 1.6973, + "step": 16732 + }, + { + "epoch": 5.1359729895641495, + "grad_norm": 0.20629842579364777, + "learning_rate": 5.023361546339813e-05, + "loss": 1.7608, + "step": 16733 + }, + { + "epoch": 5.136279926335175, + "grad_norm": 0.19375818967819214, + "learning_rate": 5.0228644956647606e-05, + "loss": 1.7327, + "step": 16734 + }, + { + "epoch": 5.1365868631062, + "grad_norm": 0.20960548520088196, + "learning_rate": 5.022367444763748e-05, + "loss": 1.7227, + "step": 16735 + }, + { + "epoch": 5.1368937998772255, + "grad_norm": 0.24732786417007446, + "learning_rate": 5.021870393641687e-05, + "loss": 1.8144, + "step": 16736 + }, + { + "epoch": 5.137200736648251, + "grad_norm": 0.22190099954605103, + "learning_rate": 5.021373342303489e-05, + "loss": 1.705, + "step": 16737 + }, + { + "epoch": 5.137507673419275, + "grad_norm": 0.2091664969921112, + "learning_rate": 5.020876290754069e-05, + "loss": 1.7926, + "step": 16738 + }, + { + "epoch": 5.137814610190301, + "grad_norm": 0.22298938035964966, + "learning_rate": 5.020379238998335e-05, + "loss": 1.7782, + "step": 16739 + }, + { + "epoch": 5.138121546961326, + "grad_norm": 0.20843006670475006, + "learning_rate": 5.019882187041203e-05, + "loss": 1.7245, + "step": 16740 + }, + { + "epoch": 5.138428483732351, + "grad_norm": 0.23383544385433197, + "learning_rate": 5.019385134887583e-05, + "loss": 1.6834, + "step": 16741 + }, + { + "epoch": 5.138735420503377, + "grad_norm": 0.3015683889389038, + "learning_rate": 5.018888082542388e-05, + "loss": 1.7636, + "step": 16742 + }, + { + "epoch": 5.139042357274401, + "grad_norm": 0.2253810614347458, + "learning_rate": 5.0183910300105284e-05, + "loss": 1.7375, + "step": 16743 + }, + { + "epoch": 5.139349294045426, + "grad_norm": 0.2064623087644577, + "learning_rate": 5.01789397729692e-05, + "loss": 1.7683, + "step": 16744 + }, + { + 
"epoch": 5.139656230816452, + "grad_norm": 0.2106693685054779, + "learning_rate": 5.0173969244064724e-05, + "loss": 1.7432, + "step": 16745 + }, + { + "epoch": 5.139963167587477, + "grad_norm": 0.19944638013839722, + "learning_rate": 5.016899871344097e-05, + "loss": 1.701, + "step": 16746 + }, + { + "epoch": 5.140270104358502, + "grad_norm": 0.23210744559764862, + "learning_rate": 5.016402818114708e-05, + "loss": 1.8008, + "step": 16747 + }, + { + "epoch": 5.140577041129528, + "grad_norm": 0.26014089584350586, + "learning_rate": 5.015905764723217e-05, + "loss": 1.7131, + "step": 16748 + }, + { + "epoch": 5.140883977900552, + "grad_norm": 0.25526607036590576, + "learning_rate": 5.015408711174535e-05, + "loss": 1.7525, + "step": 16749 + }, + { + "epoch": 5.1411909146715775, + "grad_norm": 0.2092386782169342, + "learning_rate": 5.0149116574735756e-05, + "loss": 1.7502, + "step": 16750 + }, + { + "epoch": 5.141497851442603, + "grad_norm": 0.21560105681419373, + "learning_rate": 5.01441460362525e-05, + "loss": 1.7903, + "step": 16751 + }, + { + "epoch": 5.141804788213628, + "grad_norm": 0.23538467288017273, + "learning_rate": 5.013917549634471e-05, + "loss": 1.6995, + "step": 16752 + }, + { + "epoch": 5.1421117249846535, + "grad_norm": 0.26545262336730957, + "learning_rate": 5.0134204955061526e-05, + "loss": 1.7511, + "step": 16753 + }, + { + "epoch": 5.142418661755678, + "grad_norm": 0.23030948638916016, + "learning_rate": 5.012923441245203e-05, + "loss": 1.7271, + "step": 16754 + }, + { + "epoch": 5.142725598526703, + "grad_norm": 0.22395408153533936, + "learning_rate": 5.012426386856537e-05, + "loss": 1.7273, + "step": 16755 + }, + { + "epoch": 5.143032535297729, + "grad_norm": 0.21355997025966644, + "learning_rate": 5.011929332345066e-05, + "loss": 1.7347, + "step": 16756 + }, + { + "epoch": 5.143339472068754, + "grad_norm": 0.2355809509754181, + "learning_rate": 5.011432277715702e-05, + "loss": 1.8289, + "step": 16757 + }, + { + "epoch": 5.143646408839779, + 
"grad_norm": 0.24319802224636078, + "learning_rate": 5.0109352229733584e-05, + "loss": 1.7621, + "step": 16758 + }, + { + "epoch": 5.143953345610804, + "grad_norm": 0.2591453492641449, + "learning_rate": 5.010438168122946e-05, + "loss": 1.8043, + "step": 16759 + }, + { + "epoch": 5.144260282381829, + "grad_norm": 0.22595751285552979, + "learning_rate": 5.009941113169376e-05, + "loss": 1.8137, + "step": 16760 + }, + { + "epoch": 5.144567219152854, + "grad_norm": 0.220921128988266, + "learning_rate": 5.009444058117564e-05, + "loss": 1.7105, + "step": 16761 + }, + { + "epoch": 5.14487415592388, + "grad_norm": 0.25713789463043213, + "learning_rate": 5.0089470029724195e-05, + "loss": 1.8184, + "step": 16762 + }, + { + "epoch": 5.145181092694905, + "grad_norm": 0.19849328696727753, + "learning_rate": 5.008449947738856e-05, + "loss": 1.7331, + "step": 16763 + }, + { + "epoch": 5.14548802946593, + "grad_norm": 0.2073405385017395, + "learning_rate": 5.007952892421785e-05, + "loss": 1.7053, + "step": 16764 + }, + { + "epoch": 5.145794966236955, + "grad_norm": 0.22307951748371124, + "learning_rate": 5.007455837026119e-05, + "loss": 1.7724, + "step": 16765 + }, + { + "epoch": 5.14610190300798, + "grad_norm": 0.22160649299621582, + "learning_rate": 5.006958781556769e-05, + "loss": 1.7191, + "step": 16766 + }, + { + "epoch": 5.1464088397790055, + "grad_norm": 0.2202252298593521, + "learning_rate": 5.0064617260186487e-05, + "loss": 1.7339, + "step": 16767 + }, + { + "epoch": 5.146715776550031, + "grad_norm": 0.23693829774856567, + "learning_rate": 5.005964670416671e-05, + "loss": 1.7143, + "step": 16768 + }, + { + "epoch": 5.147022713321056, + "grad_norm": 0.22675764560699463, + "learning_rate": 5.005467614755746e-05, + "loss": 1.7913, + "step": 16769 + }, + { + "epoch": 5.147329650092081, + "grad_norm": 0.21288467943668365, + "learning_rate": 5.0049705590407866e-05, + "loss": 1.7581, + "step": 16770 + }, + { + "epoch": 5.147636586863106, + "grad_norm": 0.216839998960495, + 
"learning_rate": 5.0044735032767064e-05, + "loss": 1.7305, + "step": 16771 + }, + { + "epoch": 5.147943523634131, + "grad_norm": 0.2111063450574875, + "learning_rate": 5.003976447468416e-05, + "loss": 1.7444, + "step": 16772 + }, + { + "epoch": 5.148250460405157, + "grad_norm": 0.2536773085594177, + "learning_rate": 5.003479391620827e-05, + "loss": 1.6952, + "step": 16773 + }, + { + "epoch": 5.148557397176182, + "grad_norm": 0.23585477471351624, + "learning_rate": 5.002982335738854e-05, + "loss": 1.6921, + "step": 16774 + }, + { + "epoch": 5.148864333947207, + "grad_norm": 0.1927027702331543, + "learning_rate": 5.002485279827407e-05, + "loss": 1.7781, + "step": 16775 + }, + { + "epoch": 5.149171270718232, + "grad_norm": 0.22545355558395386, + "learning_rate": 5.001988223891399e-05, + "loss": 1.7582, + "step": 16776 + }, + { + "epoch": 5.149478207489257, + "grad_norm": 0.20837660133838654, + "learning_rate": 5.001491167935741e-05, + "loss": 1.7379, + "step": 16777 + }, + { + "epoch": 5.149785144260282, + "grad_norm": 0.20510734617710114, + "learning_rate": 5.000994111965348e-05, + "loss": 1.7568, + "step": 16778 + }, + { + "epoch": 5.150092081031308, + "grad_norm": 0.2629711329936981, + "learning_rate": 5.00049705598513e-05, + "loss": 1.7613, + "step": 16779 + }, + { + "epoch": 5.150399017802333, + "grad_norm": 0.2390555888414383, + "learning_rate": 5e-05, + "loss": 1.7099, + "step": 16780 + }, + { + "epoch": 5.150705954573358, + "grad_norm": 0.19643893837928772, + "learning_rate": 4.9995029440148715e-05, + "loss": 1.7012, + "step": 16781 + }, + { + "epoch": 5.151012891344383, + "grad_norm": 0.1881607472896576, + "learning_rate": 4.999005888034653e-05, + "loss": 1.705, + "step": 16782 + }, + { + "epoch": 5.151319828115408, + "grad_norm": 0.3219485282897949, + "learning_rate": 4.99850883206426e-05, + "loss": 1.8089, + "step": 16783 + }, + { + "epoch": 5.151626764886434, + "grad_norm": 0.22285562753677368, + "learning_rate": 4.998011776108602e-05, + "loss": 1.7343, + 
"step": 16784 + }, + { + "epoch": 5.151933701657459, + "grad_norm": 0.1981910616159439, + "learning_rate": 4.9975147201725955e-05, + "loss": 1.6939, + "step": 16785 + }, + { + "epoch": 5.152240638428483, + "grad_norm": 0.2338661551475525, + "learning_rate": 4.997017664261148e-05, + "loss": 1.6833, + "step": 16786 + }, + { + "epoch": 5.152547575199509, + "grad_norm": 0.2613268792629242, + "learning_rate": 4.996520608379175e-05, + "loss": 1.7251, + "step": 16787 + }, + { + "epoch": 5.152854511970534, + "grad_norm": 0.26063668727874756, + "learning_rate": 4.996023552531586e-05, + "loss": 1.8444, + "step": 16788 + }, + { + "epoch": 5.153161448741559, + "grad_norm": 0.2711321711540222, + "learning_rate": 4.9955264967232954e-05, + "loss": 1.7257, + "step": 16789 + }, + { + "epoch": 5.153468385512585, + "grad_norm": 0.30134227871894836, + "learning_rate": 4.995029440959213e-05, + "loss": 1.7599, + "step": 16790 + }, + { + "epoch": 5.153775322283609, + "grad_norm": 0.22983741760253906, + "learning_rate": 4.994532385244255e-05, + "loss": 1.7944, + "step": 16791 + }, + { + "epoch": 5.1540822590546345, + "grad_norm": 0.2992973327636719, + "learning_rate": 4.994035329583329e-05, + "loss": 1.7507, + "step": 16792 + }, + { + "epoch": 5.15438919582566, + "grad_norm": 0.2659669518470764, + "learning_rate": 4.993538273981352e-05, + "loss": 1.7246, + "step": 16793 + }, + { + "epoch": 5.154696132596685, + "grad_norm": 0.24235470592975616, + "learning_rate": 4.9930412184432315e-05, + "loss": 1.8378, + "step": 16794 + }, + { + "epoch": 5.1550030693677105, + "grad_norm": 0.30005061626434326, + "learning_rate": 4.992544162973882e-05, + "loss": 1.7526, + "step": 16795 + }, + { + "epoch": 5.155310006138736, + "grad_norm": 0.2183740884065628, + "learning_rate": 4.992047107578215e-05, + "loss": 1.7197, + "step": 16796 + }, + { + "epoch": 5.15561694290976, + "grad_norm": 0.35874706506729126, + "learning_rate": 4.991550052261145e-05, + "loss": 1.8196, + "step": 16797 + }, + { + "epoch": 
5.155923879680786, + "grad_norm": 0.42146921157836914, + "learning_rate": 4.991052997027583e-05, + "loss": 1.7165, + "step": 16798 + }, + { + "epoch": 5.156230816451811, + "grad_norm": 0.2738321125507355, + "learning_rate": 4.990555941882437e-05, + "loss": 1.7042, + "step": 16799 + }, + { + "epoch": 5.156537753222836, + "grad_norm": 0.26304566860198975, + "learning_rate": 4.990058886830625e-05, + "loss": 1.7551, + "step": 16800 + }, + { + "epoch": 5.156844689993862, + "grad_norm": 0.4301520586013794, + "learning_rate": 4.9895618318770556e-05, + "loss": 1.7219, + "step": 16801 + }, + { + "epoch": 5.157151626764886, + "grad_norm": 0.3316499590873718, + "learning_rate": 4.989064777026644e-05, + "loss": 1.8034, + "step": 16802 + }, + { + "epoch": 5.157458563535911, + "grad_norm": 0.30105581879615784, + "learning_rate": 4.9885677222842984e-05, + "loss": 1.7022, + "step": 16803 + }, + { + "epoch": 5.157765500306937, + "grad_norm": 0.3830905854701996, + "learning_rate": 4.988070667654937e-05, + "loss": 1.7898, + "step": 16804 + }, + { + "epoch": 5.158072437077962, + "grad_norm": 0.2204640656709671, + "learning_rate": 4.9875736131434644e-05, + "loss": 1.7081, + "step": 16805 + }, + { + "epoch": 5.158379373848987, + "grad_norm": 0.3620772063732147, + "learning_rate": 4.9870765587547976e-05, + "loss": 1.7345, + "step": 16806 + }, + { + "epoch": 5.158686310620013, + "grad_norm": 0.3268207907676697, + "learning_rate": 4.986579504493848e-05, + "loss": 1.7364, + "step": 16807 + }, + { + "epoch": 5.158993247391037, + "grad_norm": 0.2499808967113495, + "learning_rate": 4.986082450365529e-05, + "loss": 1.7836, + "step": 16808 + }, + { + "epoch": 5.1593001841620625, + "grad_norm": 0.3696226477622986, + "learning_rate": 4.98558539637475e-05, + "loss": 1.8094, + "step": 16809 + }, + { + "epoch": 5.159607120933088, + "grad_norm": 0.3239068388938904, + "learning_rate": 4.9850883425264256e-05, + "loss": 1.7448, + "step": 16810 + }, + { + "epoch": 5.159914057704113, + "grad_norm": 
0.19875772297382355, + "learning_rate": 4.9845912888254655e-05, + "loss": 1.6945, + "step": 16811 + }, + { + "epoch": 5.1602209944751385, + "grad_norm": 0.3952203691005707, + "learning_rate": 4.984094235276784e-05, + "loss": 1.8457, + "step": 16812 + }, + { + "epoch": 5.160527931246163, + "grad_norm": 0.3052334785461426, + "learning_rate": 4.9835971818852916e-05, + "loss": 1.7371, + "step": 16813 + }, + { + "epoch": 5.160834868017188, + "grad_norm": 0.2874486446380615, + "learning_rate": 4.983100128655904e-05, + "loss": 1.7194, + "step": 16814 + }, + { + "epoch": 5.161141804788214, + "grad_norm": 0.39117491245269775, + "learning_rate": 4.98260307559353e-05, + "loss": 1.7919, + "step": 16815 + }, + { + "epoch": 5.161448741559239, + "grad_norm": 0.2532150149345398, + "learning_rate": 4.982106022703081e-05, + "loss": 1.8103, + "step": 16816 + }, + { + "epoch": 5.161755678330264, + "grad_norm": 0.3545167148113251, + "learning_rate": 4.981608969989473e-05, + "loss": 1.8093, + "step": 16817 + }, + { + "epoch": 5.162062615101289, + "grad_norm": 0.397806316614151, + "learning_rate": 4.981111917457613e-05, + "loss": 1.7885, + "step": 16818 + }, + { + "epoch": 5.162369551872314, + "grad_norm": 0.2523536682128906, + "learning_rate": 4.980614865112419e-05, + "loss": 1.797, + "step": 16819 + }, + { + "epoch": 5.162676488643339, + "grad_norm": 0.3666839301586151, + "learning_rate": 4.980117812958798e-05, + "loss": 1.7859, + "step": 16820 + }, + { + "epoch": 5.162983425414365, + "grad_norm": 0.3392138183116913, + "learning_rate": 4.9796207610016664e-05, + "loss": 1.7717, + "step": 16821 + }, + { + "epoch": 5.16329036218539, + "grad_norm": 0.21040666103363037, + "learning_rate": 4.9791237092459325e-05, + "loss": 1.7447, + "step": 16822 + }, + { + "epoch": 5.163597298956415, + "grad_norm": 0.3140225112438202, + "learning_rate": 4.978626657696512e-05, + "loss": 1.7405, + "step": 16823 + }, + { + "epoch": 5.16390423572744, + "grad_norm": 0.23963581025600433, + "learning_rate": 
4.978129606358313e-05, + "loss": 1.7041, + "step": 16824 + }, + { + "epoch": 5.164211172498465, + "grad_norm": 0.32476937770843506, + "learning_rate": 4.977632555236253e-05, + "loss": 1.736, + "step": 16825 + }, + { + "epoch": 5.1645181092694905, + "grad_norm": 0.4362463653087616, + "learning_rate": 4.977135504335239e-05, + "loss": 1.7657, + "step": 16826 + }, + { + "epoch": 5.164825046040516, + "grad_norm": 0.26118260622024536, + "learning_rate": 4.976638453660188e-05, + "loss": 1.7339, + "step": 16827 + }, + { + "epoch": 5.165131982811541, + "grad_norm": 0.27284330129623413, + "learning_rate": 4.9761414032160065e-05, + "loss": 1.8086, + "step": 16828 + }, + { + "epoch": 5.165438919582566, + "grad_norm": 0.2942579388618469, + "learning_rate": 4.975644353007611e-05, + "loss": 1.7869, + "step": 16829 + }, + { + "epoch": 5.165745856353591, + "grad_norm": 0.23257993161678314, + "learning_rate": 4.975147303039912e-05, + "loss": 1.8048, + "step": 16830 + }, + { + "epoch": 5.166052793124616, + "grad_norm": 0.28638842701911926, + "learning_rate": 4.9746502533178225e-05, + "loss": 1.7744, + "step": 16831 + }, + { + "epoch": 5.166359729895642, + "grad_norm": 0.21571335196495056, + "learning_rate": 4.974153203846255e-05, + "loss": 1.7842, + "step": 16832 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.268883615732193, + "learning_rate": 4.9736561546301185e-05, + "loss": 1.7194, + "step": 16833 + }, + { + "epoch": 5.166973603437691, + "grad_norm": 0.22934168577194214, + "learning_rate": 4.9731591056743285e-05, + "loss": 1.757, + "step": 16834 + }, + { + "epoch": 5.167280540208717, + "grad_norm": 0.26321718096733093, + "learning_rate": 4.9726620569837946e-05, + "loss": 1.7675, + "step": 16835 + }, + { + "epoch": 5.167587476979742, + "grad_norm": 0.2893882393836975, + "learning_rate": 4.9721650085634325e-05, + "loss": 1.7134, + "step": 16836 + }, + { + "epoch": 5.167894413750767, + "grad_norm": 0.24130617082118988, + "learning_rate": 4.97166796041815e-05, + "loss": 
1.7119, + "step": 16837 + }, + { + "epoch": 5.168201350521793, + "grad_norm": 0.23614190518856049, + "learning_rate": 4.9711709125528635e-05, + "loss": 1.7556, + "step": 16838 + }, + { + "epoch": 5.168508287292818, + "grad_norm": 0.2031065821647644, + "learning_rate": 4.97067386497248e-05, + "loss": 1.7678, + "step": 16839 + }, + { + "epoch": 5.1688152240638425, + "grad_norm": 0.30695948004722595, + "learning_rate": 4.970176817681917e-05, + "loss": 1.7907, + "step": 16840 + }, + { + "epoch": 5.169122160834868, + "grad_norm": 0.31256723403930664, + "learning_rate": 4.969679770686082e-05, + "loss": 1.7448, + "step": 16841 + }, + { + "epoch": 5.169429097605893, + "grad_norm": 0.24183644354343414, + "learning_rate": 4.969182723989892e-05, + "loss": 1.7259, + "step": 16842 + }, + { + "epoch": 5.1697360343769185, + "grad_norm": 0.22440548241138458, + "learning_rate": 4.9686856775982536e-05, + "loss": 1.7949, + "step": 16843 + }, + { + "epoch": 5.170042971147944, + "grad_norm": 0.29006195068359375, + "learning_rate": 4.9681886315160846e-05, + "loss": 1.7128, + "step": 16844 + }, + { + "epoch": 5.170349907918968, + "grad_norm": 0.2189658135175705, + "learning_rate": 4.967691585748292e-05, + "loss": 1.7375, + "step": 16845 + }, + { + "epoch": 5.170656844689994, + "grad_norm": 0.289909690618515, + "learning_rate": 4.967194540299791e-05, + "loss": 1.779, + "step": 16846 + }, + { + "epoch": 5.170963781461019, + "grad_norm": 0.28279590606689453, + "learning_rate": 4.966697495175492e-05, + "loss": 1.7368, + "step": 16847 + }, + { + "epoch": 5.171270718232044, + "grad_norm": 0.2056259959936142, + "learning_rate": 4.966200450380309e-05, + "loss": 1.7548, + "step": 16848 + }, + { + "epoch": 5.17157765500307, + "grad_norm": 0.2607482969760895, + "learning_rate": 4.965703405919154e-05, + "loss": 1.7178, + "step": 16849 + }, + { + "epoch": 5.171884591774095, + "grad_norm": 0.26085609197616577, + "learning_rate": 4.965206361796935e-05, + "loss": 1.751, + "step": 16850 + }, + { + 
"epoch": 5.172191528545119, + "grad_norm": 0.17960335314273834, + "learning_rate": 4.964709318018569e-05, + "loss": 1.6932, + "step": 16851 + }, + { + "epoch": 5.172498465316145, + "grad_norm": 0.2617340385913849, + "learning_rate": 4.964212274588965e-05, + "loss": 1.7753, + "step": 16852 + }, + { + "epoch": 5.17280540208717, + "grad_norm": 0.2454555630683899, + "learning_rate": 4.9637152315130383e-05, + "loss": 1.7587, + "step": 16853 + }, + { + "epoch": 5.173112338858195, + "grad_norm": 0.19221605360507965, + "learning_rate": 4.963218188795696e-05, + "loss": 1.7337, + "step": 16854 + }, + { + "epoch": 5.173419275629221, + "grad_norm": 0.24314738810062408, + "learning_rate": 4.9627211464418565e-05, + "loss": 1.725, + "step": 16855 + }, + { + "epoch": 5.173726212400245, + "grad_norm": 0.2533986568450928, + "learning_rate": 4.962224104456426e-05, + "loss": 1.7502, + "step": 16856 + }, + { + "epoch": 5.1740331491712706, + "grad_norm": 0.21800079941749573, + "learning_rate": 4.9617270628443195e-05, + "loss": 1.7622, + "step": 16857 + }, + { + "epoch": 5.174340085942296, + "grad_norm": 0.22742362320423126, + "learning_rate": 4.96123002161045e-05, + "loss": 1.7078, + "step": 16858 + }, + { + "epoch": 5.174647022713321, + "grad_norm": 0.22729982435703278, + "learning_rate": 4.960732980759727e-05, + "loss": 1.8349, + "step": 16859 + }, + { + "epoch": 5.1749539594843466, + "grad_norm": 0.28869518637657166, + "learning_rate": 4.9602359402970625e-05, + "loss": 1.8932, + "step": 16860 + }, + { + "epoch": 5.175260896255371, + "grad_norm": 0.21931354701519012, + "learning_rate": 4.9597389002273725e-05, + "loss": 1.6989, + "step": 16861 + }, + { + "epoch": 5.175567833026396, + "grad_norm": 0.2130192667245865, + "learning_rate": 4.959241860555564e-05, + "loss": 1.752, + "step": 16862 + }, + { + "epoch": 5.175874769797422, + "grad_norm": 0.21272781491279602, + "learning_rate": 4.958744821286553e-05, + "loss": 1.7402, + "step": 16863 + }, + { + "epoch": 5.176181706568447, + 
"grad_norm": 0.20279285311698914, + "learning_rate": 4.958247782425248e-05, + "loss": 1.7103, + "step": 16864 + }, + { + "epoch": 5.176488643339472, + "grad_norm": 0.23561790585517883, + "learning_rate": 4.957750743976564e-05, + "loss": 1.7742, + "step": 16865 + }, + { + "epoch": 5.176795580110497, + "grad_norm": 0.27608510851860046, + "learning_rate": 4.957253705945413e-05, + "loss": 1.7505, + "step": 16866 + }, + { + "epoch": 5.177102516881522, + "grad_norm": 0.20624001324176788, + "learning_rate": 4.956756668336704e-05, + "loss": 1.7032, + "step": 16867 + }, + { + "epoch": 5.1774094536525475, + "grad_norm": 0.23743939399719238, + "learning_rate": 4.956259631155352e-05, + "loss": 1.7469, + "step": 16868 + }, + { + "epoch": 5.177716390423573, + "grad_norm": 0.27421119809150696, + "learning_rate": 4.9557625944062675e-05, + "loss": 1.7028, + "step": 16869 + }, + { + "epoch": 5.178023327194598, + "grad_norm": 0.23788046836853027, + "learning_rate": 4.955265558094363e-05, + "loss": 1.7468, + "step": 16870 + }, + { + "epoch": 5.1783302639656235, + "grad_norm": 0.24712958931922913, + "learning_rate": 4.95476852222455e-05, + "loss": 1.7348, + "step": 16871 + }, + { + "epoch": 5.178637200736648, + "grad_norm": 0.21558570861816406, + "learning_rate": 4.9542714868017424e-05, + "loss": 1.7599, + "step": 16872 + }, + { + "epoch": 5.178944137507673, + "grad_norm": 0.2561664283275604, + "learning_rate": 4.953774451830849e-05, + "loss": 1.7673, + "step": 16873 + }, + { + "epoch": 5.179251074278699, + "grad_norm": 0.19761815667152405, + "learning_rate": 4.953277417316786e-05, + "loss": 1.743, + "step": 16874 + }, + { + "epoch": 5.179558011049724, + "grad_norm": 0.24140769243240356, + "learning_rate": 4.95278038326446e-05, + "loss": 1.8229, + "step": 16875 + }, + { + "epoch": 5.179864947820749, + "grad_norm": 0.21686211228370667, + "learning_rate": 4.9522833496787876e-05, + "loss": 1.7914, + "step": 16876 + }, + { + "epoch": 5.180171884591774, + "grad_norm": 0.2537819743156433, + 
"learning_rate": 4.951786316564678e-05, + "loss": 1.7532, + "step": 16877 + }, + { + "epoch": 5.180478821362799, + "grad_norm": 0.24567632377147675, + "learning_rate": 4.951289283927046e-05, + "loss": 1.7528, + "step": 16878 + }, + { + "epoch": 5.180785758133824, + "grad_norm": 0.1958467960357666, + "learning_rate": 4.9507922517708e-05, + "loss": 1.6922, + "step": 16879 + }, + { + "epoch": 5.18109269490485, + "grad_norm": 0.2012091726064682, + "learning_rate": 4.950295220100857e-05, + "loss": 1.7509, + "step": 16880 + }, + { + "epoch": 5.181399631675875, + "grad_norm": 0.2416311800479889, + "learning_rate": 4.9497981889221226e-05, + "loss": 1.7341, + "step": 16881 + }, + { + "epoch": 5.1817065684469, + "grad_norm": 0.21407842636108398, + "learning_rate": 4.949301158239513e-05, + "loss": 1.7493, + "step": 16882 + }, + { + "epoch": 5.182013505217925, + "grad_norm": 0.2354930192232132, + "learning_rate": 4.94880412805794e-05, + "loss": 1.7726, + "step": 16883 + }, + { + "epoch": 5.18232044198895, + "grad_norm": 0.2168428748846054, + "learning_rate": 4.948307098382313e-05, + "loss": 1.77, + "step": 16884 + }, + { + "epoch": 5.1826273787599755, + "grad_norm": 0.19605880975723267, + "learning_rate": 4.947810069217547e-05, + "loss": 1.7292, + "step": 16885 + }, + { + "epoch": 5.182934315531001, + "grad_norm": 0.23066702485084534, + "learning_rate": 4.947313040568551e-05, + "loss": 1.7265, + "step": 16886 + }, + { + "epoch": 5.183241252302026, + "grad_norm": 0.20139534771442413, + "learning_rate": 4.9468160124402386e-05, + "loss": 1.7443, + "step": 16887 + }, + { + "epoch": 5.183548189073051, + "grad_norm": 0.25097572803497314, + "learning_rate": 4.946318984837521e-05, + "loss": 1.7537, + "step": 16888 + }, + { + "epoch": 5.183855125844076, + "grad_norm": 0.26215067505836487, + "learning_rate": 4.945821957765313e-05, + "loss": 1.8397, + "step": 16889 + }, + { + "epoch": 5.184162062615101, + "grad_norm": 0.22072140872478485, + "learning_rate": 4.9453249312285215e-05, + 
"loss": 1.7052, + "step": 16890 + }, + { + "epoch": 5.184468999386127, + "grad_norm": 0.20372305810451508, + "learning_rate": 4.944827905232064e-05, + "loss": 1.7228, + "step": 16891 + }, + { + "epoch": 5.184775936157152, + "grad_norm": 0.20383495092391968, + "learning_rate": 4.944330879780847e-05, + "loss": 1.7063, + "step": 16892 + }, + { + "epoch": 5.185082872928176, + "grad_norm": 0.1903693675994873, + "learning_rate": 4.943833854879786e-05, + "loss": 1.6435, + "step": 16893 + }, + { + "epoch": 5.185389809699202, + "grad_norm": 0.20357775688171387, + "learning_rate": 4.94333683053379e-05, + "loss": 1.7485, + "step": 16894 + }, + { + "epoch": 5.185696746470227, + "grad_norm": 0.24776104092597961, + "learning_rate": 4.942839806747775e-05, + "loss": 1.718, + "step": 16895 + }, + { + "epoch": 5.186003683241252, + "grad_norm": 0.2455051839351654, + "learning_rate": 4.942342783526649e-05, + "loss": 1.7124, + "step": 16896 + }, + { + "epoch": 5.186310620012278, + "grad_norm": 0.2102014273405075, + "learning_rate": 4.941845760875328e-05, + "loss": 1.7584, + "step": 16897 + }, + { + "epoch": 5.186617556783303, + "grad_norm": 0.2177651822566986, + "learning_rate": 4.941348738798718e-05, + "loss": 1.7019, + "step": 16898 + }, + { + "epoch": 5.1869244935543275, + "grad_norm": 0.21296697854995728, + "learning_rate": 4.9408517173017355e-05, + "loss": 1.7299, + "step": 16899 + }, + { + "epoch": 5.187231430325353, + "grad_norm": 0.23485495150089264, + "learning_rate": 4.940354696389292e-05, + "loss": 1.7271, + "step": 16900 + }, + { + "epoch": 5.187538367096378, + "grad_norm": 0.27287766337394714, + "learning_rate": 4.939857676066297e-05, + "loss": 1.7601, + "step": 16901 + }, + { + "epoch": 5.1878453038674035, + "grad_norm": 0.2060246467590332, + "learning_rate": 4.939360656337665e-05, + "loss": 1.7064, + "step": 16902 + }, + { + "epoch": 5.188152240638429, + "grad_norm": 0.25422418117523193, + "learning_rate": 4.938863637208305e-05, + "loss": 1.7423, + "step": 16903 + }, + { 
+ "epoch": 5.188459177409453, + "grad_norm": 0.2798483669757843, + "learning_rate": 4.9383666186831304e-05, + "loss": 1.7132, + "step": 16904 + }, + { + "epoch": 5.188766114180479, + "grad_norm": 0.23505693674087524, + "learning_rate": 4.9378696007670525e-05, + "loss": 1.7759, + "step": 16905 + }, + { + "epoch": 5.189073050951504, + "grad_norm": 0.23761989176273346, + "learning_rate": 4.937372583464987e-05, + "loss": 1.7076, + "step": 16906 + }, + { + "epoch": 5.189379987722529, + "grad_norm": 0.3005945086479187, + "learning_rate": 4.9368755667818385e-05, + "loss": 1.6957, + "step": 16907 + }, + { + "epoch": 5.189686924493555, + "grad_norm": 0.2502881586551666, + "learning_rate": 4.936378550722525e-05, + "loss": 1.7352, + "step": 16908 + }, + { + "epoch": 5.189993861264579, + "grad_norm": 0.24194179475307465, + "learning_rate": 4.9358815352919544e-05, + "loss": 1.738, + "step": 16909 + }, + { + "epoch": 5.190300798035604, + "grad_norm": 0.27478742599487305, + "learning_rate": 4.935384520495041e-05, + "loss": 1.7118, + "step": 16910 + }, + { + "epoch": 5.19060773480663, + "grad_norm": 0.22327560186386108, + "learning_rate": 4.9348875063366944e-05, + "loss": 1.7697, + "step": 16911 + }, + { + "epoch": 5.190914671577655, + "grad_norm": 0.21844418346881866, + "learning_rate": 4.9343904928218295e-05, + "loss": 1.7733, + "step": 16912 + }, + { + "epoch": 5.19122160834868, + "grad_norm": 0.25267866253852844, + "learning_rate": 4.933893479955354e-05, + "loss": 1.7313, + "step": 16913 + }, + { + "epoch": 5.191528545119706, + "grad_norm": 0.22045068442821503, + "learning_rate": 4.933396467742185e-05, + "loss": 1.7856, + "step": 16914 + }, + { + "epoch": 5.19183548189073, + "grad_norm": 0.22642305493354797, + "learning_rate": 4.932899456187229e-05, + "loss": 1.7326, + "step": 16915 + }, + { + "epoch": 5.1921424186617555, + "grad_norm": 0.20601733028888702, + "learning_rate": 4.9324024452953995e-05, + "loss": 1.7743, + "step": 16916 + }, + { + "epoch": 5.192449355432781, + 
"grad_norm": 0.25580671429634094, + "learning_rate": 4.931905435071611e-05, + "loss": 1.7705, + "step": 16917 + }, + { + "epoch": 5.192756292203806, + "grad_norm": 0.38173142075538635, + "learning_rate": 4.9314084255207706e-05, + "loss": 1.7504, + "step": 16918 + }, + { + "epoch": 5.1930632289748315, + "grad_norm": 0.2254420667886734, + "learning_rate": 4.930911416647794e-05, + "loss": 1.7344, + "step": 16919 + }, + { + "epoch": 5.193370165745856, + "grad_norm": 0.2354312688112259, + "learning_rate": 4.9304144084575896e-05, + "loss": 1.7607, + "step": 16920 + }, + { + "epoch": 5.193677102516881, + "grad_norm": 0.23879510164260864, + "learning_rate": 4.9299174009550716e-05, + "loss": 1.683, + "step": 16921 + }, + { + "epoch": 5.193984039287907, + "grad_norm": 0.228669211268425, + "learning_rate": 4.9294203941451494e-05, + "loss": 1.7776, + "step": 16922 + }, + { + "epoch": 5.194290976058932, + "grad_norm": 0.2266843616962433, + "learning_rate": 4.928923388032739e-05, + "loss": 1.7563, + "step": 16923 + }, + { + "epoch": 5.194597912829957, + "grad_norm": 0.2581404745578766, + "learning_rate": 4.928426382622747e-05, + "loss": 1.8112, + "step": 16924 + }, + { + "epoch": 5.194904849600983, + "grad_norm": 0.25179803371429443, + "learning_rate": 4.92792937792009e-05, + "loss": 1.7661, + "step": 16925 + }, + { + "epoch": 5.195211786372007, + "grad_norm": 0.23408514261245728, + "learning_rate": 4.9274323739296746e-05, + "loss": 1.7618, + "step": 16926 + }, + { + "epoch": 5.195518723143032, + "grad_norm": 0.23110872507095337, + "learning_rate": 4.926935370656416e-05, + "loss": 1.6945, + "step": 16927 + }, + { + "epoch": 5.195825659914058, + "grad_norm": 0.2863025665283203, + "learning_rate": 4.926438368105224e-05, + "loss": 1.8659, + "step": 16928 + }, + { + "epoch": 5.196132596685083, + "grad_norm": 0.2156454175710678, + "learning_rate": 4.925941366281013e-05, + "loss": 1.7281, + "step": 16929 + }, + { + "epoch": 5.196439533456108, + "grad_norm": 0.2338300198316574, + 
"learning_rate": 4.925444365188691e-05, + "loss": 1.7271, + "step": 16930 + }, + { + "epoch": 5.196746470227133, + "grad_norm": 0.21434102952480316, + "learning_rate": 4.924947364833173e-05, + "loss": 1.7342, + "step": 16931 + }, + { + "epoch": 5.197053406998158, + "grad_norm": 0.21619778871536255, + "learning_rate": 4.924450365219369e-05, + "loss": 1.7493, + "step": 16932 + }, + { + "epoch": 5.1973603437691835, + "grad_norm": 0.24532032012939453, + "learning_rate": 4.9239533663521896e-05, + "loss": 1.7707, + "step": 16933 + }, + { + "epoch": 5.197667280540209, + "grad_norm": 0.21795547008514404, + "learning_rate": 4.923456368236549e-05, + "loss": 1.7642, + "step": 16934 + }, + { + "epoch": 5.197974217311234, + "grad_norm": 0.2070101797580719, + "learning_rate": 4.922959370877356e-05, + "loss": 1.7377, + "step": 16935 + }, + { + "epoch": 5.198281154082259, + "grad_norm": 0.22546489536762238, + "learning_rate": 4.9224623742795256e-05, + "loss": 1.7766, + "step": 16936 + }, + { + "epoch": 5.198588090853284, + "grad_norm": 0.20723624527454376, + "learning_rate": 4.921965378447965e-05, + "loss": 1.7316, + "step": 16937 + }, + { + "epoch": 5.198895027624309, + "grad_norm": 0.21870547533035278, + "learning_rate": 4.9214683833875905e-05, + "loss": 1.7653, + "step": 16938 + }, + { + "epoch": 5.199201964395335, + "grad_norm": 0.19606490433216095, + "learning_rate": 4.920971389103309e-05, + "loss": 1.7181, + "step": 16939 + }, + { + "epoch": 5.19950890116636, + "grad_norm": 0.18372730910778046, + "learning_rate": 4.920474395600037e-05, + "loss": 1.7041, + "step": 16940 + }, + { + "epoch": 5.199815837937384, + "grad_norm": 0.22051765024662018, + "learning_rate": 4.919977402882682e-05, + "loss": 1.7172, + "step": 16941 + }, + { + "epoch": 5.20012277470841, + "grad_norm": 0.2135835587978363, + "learning_rate": 4.919480410956159e-05, + "loss": 1.6918, + "step": 16942 + }, + { + "epoch": 5.200429711479435, + "grad_norm": 0.19619768857955933, + "learning_rate": 
4.918983419825376e-05, + "loss": 1.7005, + "step": 16943 + }, + { + "epoch": 5.2007366482504604, + "grad_norm": 0.22726574540138245, + "learning_rate": 4.918486429495246e-05, + "loss": 1.6775, + "step": 16944 + }, + { + "epoch": 5.201043585021486, + "grad_norm": 0.21471361815929413, + "learning_rate": 4.9179894399706815e-05, + "loss": 1.7102, + "step": 16945 + }, + { + "epoch": 5.201350521792511, + "grad_norm": 0.20113740861415863, + "learning_rate": 4.917492451256595e-05, + "loss": 1.7548, + "step": 16946 + }, + { + "epoch": 5.201657458563536, + "grad_norm": 0.2337827831506729, + "learning_rate": 4.916995463357894e-05, + "loss": 1.818, + "step": 16947 + }, + { + "epoch": 5.201964395334561, + "grad_norm": 0.2649554908275604, + "learning_rate": 4.9164984762794955e-05, + "loss": 1.7784, + "step": 16948 + }, + { + "epoch": 5.202271332105586, + "grad_norm": 0.2297617793083191, + "learning_rate": 4.916001490026306e-05, + "loss": 1.7484, + "step": 16949 + }, + { + "epoch": 5.202578268876612, + "grad_norm": 0.20791979134082794, + "learning_rate": 4.915504504603238e-05, + "loss": 1.7164, + "step": 16950 + }, + { + "epoch": 5.202885205647637, + "grad_norm": 0.21769596636295319, + "learning_rate": 4.915007520015207e-05, + "loss": 1.7783, + "step": 16951 + }, + { + "epoch": 5.203192142418661, + "grad_norm": 0.21038469672203064, + "learning_rate": 4.914510536267118e-05, + "loss": 1.6863, + "step": 16952 + }, + { + "epoch": 5.203499079189687, + "grad_norm": 0.20725449919700623, + "learning_rate": 4.914013553363889e-05, + "loss": 1.6855, + "step": 16953 + }, + { + "epoch": 5.203806015960712, + "grad_norm": 0.23879854381084442, + "learning_rate": 4.9135165713104266e-05, + "loss": 1.6986, + "step": 16954 + }, + { + "epoch": 5.204112952731737, + "grad_norm": 0.20515915751457214, + "learning_rate": 4.913019590111645e-05, + "loss": 1.6912, + "step": 16955 + }, + { + "epoch": 5.204419889502763, + "grad_norm": 0.2252528965473175, + "learning_rate": 4.912522609772453e-05, + "loss": 
1.6974, + "step": 16956 + }, + { + "epoch": 5.204726826273788, + "grad_norm": 0.1946130096912384, + "learning_rate": 4.9120256302977665e-05, + "loss": 1.7009, + "step": 16957 + }, + { + "epoch": 5.2050337630448125, + "grad_norm": 0.21323645114898682, + "learning_rate": 4.9115286516924925e-05, + "loss": 1.7746, + "step": 16958 + }, + { + "epoch": 5.205340699815838, + "grad_norm": 0.20721712708473206, + "learning_rate": 4.911031673961546e-05, + "loss": 1.7103, + "step": 16959 + }, + { + "epoch": 5.205647636586863, + "grad_norm": 0.19630689918994904, + "learning_rate": 4.910534697109834e-05, + "loss": 1.7042, + "step": 16960 + }, + { + "epoch": 5.2059545733578885, + "grad_norm": 0.2036786526441574, + "learning_rate": 4.910037721142273e-05, + "loss": 1.7713, + "step": 16961 + }, + { + "epoch": 5.206261510128914, + "grad_norm": 0.20518352091312408, + "learning_rate": 4.9095407460637696e-05, + "loss": 1.7456, + "step": 16962 + }, + { + "epoch": 5.206568446899938, + "grad_norm": 0.199858620762825, + "learning_rate": 4.9090437718792404e-05, + "loss": 1.7598, + "step": 16963 + }, + { + "epoch": 5.206875383670964, + "grad_norm": 0.22860252857208252, + "learning_rate": 4.9085467985935914e-05, + "loss": 1.7947, + "step": 16964 + }, + { + "epoch": 5.207182320441989, + "grad_norm": 0.22179929912090302, + "learning_rate": 4.9080498262117395e-05, + "loss": 1.7537, + "step": 16965 + }, + { + "epoch": 5.207489257213014, + "grad_norm": 0.24737581610679626, + "learning_rate": 4.9075528547385906e-05, + "loss": 1.7932, + "step": 16966 + }, + { + "epoch": 5.20779619398404, + "grad_norm": 0.2653762400150299, + "learning_rate": 4.907055884179059e-05, + "loss": 1.7683, + "step": 16967 + }, + { + "epoch": 5.208103130755064, + "grad_norm": 0.2891876697540283, + "learning_rate": 4.9065589145380564e-05, + "loss": 1.7867, + "step": 16968 + }, + { + "epoch": 5.208410067526089, + "grad_norm": 0.23162086308002472, + "learning_rate": 4.906061945820492e-05, + "loss": 1.7981, + "step": 16969 + }, + { 
+ "epoch": 5.208717004297115, + "grad_norm": 0.2746187150478363, + "learning_rate": 4.9055649780312805e-05, + "loss": 1.7215, + "step": 16970 + }, + { + "epoch": 5.20902394106814, + "grad_norm": 0.3217853605747223, + "learning_rate": 4.905068011175329e-05, + "loss": 1.8027, + "step": 16971 + }, + { + "epoch": 5.209330877839165, + "grad_norm": 0.21517686545848846, + "learning_rate": 4.904571045257553e-05, + "loss": 1.7055, + "step": 16972 + }, + { + "epoch": 5.209637814610191, + "grad_norm": 0.23613709211349487, + "learning_rate": 4.90407408028286e-05, + "loss": 1.751, + "step": 16973 + }, + { + "epoch": 5.209944751381215, + "grad_norm": 0.35093945264816284, + "learning_rate": 4.903577116256165e-05, + "loss": 1.7749, + "step": 16974 + }, + { + "epoch": 5.2102516881522405, + "grad_norm": 0.3289217948913574, + "learning_rate": 4.903080153182376e-05, + "loss": 1.7722, + "step": 16975 + }, + { + "epoch": 5.210558624923266, + "grad_norm": 0.29387256503105164, + "learning_rate": 4.9025831910664074e-05, + "loss": 1.8121, + "step": 16976 + }, + { + "epoch": 5.210865561694291, + "grad_norm": 0.44418805837631226, + "learning_rate": 4.9020862299131664e-05, + "loss": 1.7744, + "step": 16977 + }, + { + "epoch": 5.2111724984653165, + "grad_norm": 0.39242252707481384, + "learning_rate": 4.901589269727568e-05, + "loss": 1.7183, + "step": 16978 + }, + { + "epoch": 5.211479435236341, + "grad_norm": 0.2028690129518509, + "learning_rate": 4.901092310514522e-05, + "loss": 1.7101, + "step": 16979 + }, + { + "epoch": 5.211786372007366, + "grad_norm": 0.4025843143463135, + "learning_rate": 4.900595352278941e-05, + "loss": 1.7545, + "step": 16980 + }, + { + "epoch": 5.212093308778392, + "grad_norm": 0.284568727016449, + "learning_rate": 4.900098395025733e-05, + "loss": 1.7758, + "step": 16981 + }, + { + "epoch": 5.212400245549417, + "grad_norm": 0.2527516484260559, + "learning_rate": 4.899601438759813e-05, + "loss": 1.695, + "step": 16982 + }, + { + "epoch": 5.212707182320442, + 
"grad_norm": 0.3063630759716034, + "learning_rate": 4.89910448348609e-05, + "loss": 1.714, + "step": 16983 + }, + { + "epoch": 5.213014119091467, + "grad_norm": 0.22754468023777008, + "learning_rate": 4.898607529209474e-05, + "loss": 1.8315, + "step": 16984 + }, + { + "epoch": 5.213321055862492, + "grad_norm": 0.29594969749450684, + "learning_rate": 4.89811057593488e-05, + "loss": 1.6669, + "step": 16985 + }, + { + "epoch": 5.213627992633517, + "grad_norm": 0.21486569941043854, + "learning_rate": 4.897613623667215e-05, + "loss": 1.7425, + "step": 16986 + }, + { + "epoch": 5.213934929404543, + "grad_norm": 0.30908775329589844, + "learning_rate": 4.897116672411395e-05, + "loss": 1.7915, + "step": 16987 + }, + { + "epoch": 5.214241866175568, + "grad_norm": 0.23515601456165314, + "learning_rate": 4.896619722172325e-05, + "loss": 1.7226, + "step": 16988 + }, + { + "epoch": 5.214548802946593, + "grad_norm": 0.2847287952899933, + "learning_rate": 4.8961227729549215e-05, + "loss": 1.7641, + "step": 16989 + }, + { + "epoch": 5.214855739717618, + "grad_norm": 0.2986287772655487, + "learning_rate": 4.895625824764092e-05, + "loss": 1.8025, + "step": 16990 + }, + { + "epoch": 5.215162676488643, + "grad_norm": 0.23454971611499786, + "learning_rate": 4.8951288776047514e-05, + "loss": 1.7057, + "step": 16991 + }, + { + "epoch": 5.2154696132596685, + "grad_norm": 0.2578633725643158, + "learning_rate": 4.894631931481807e-05, + "loss": 1.7267, + "step": 16992 + }, + { + "epoch": 5.215776550030694, + "grad_norm": 0.29975566267967224, + "learning_rate": 4.894134986400174e-05, + "loss": 1.7452, + "step": 16993 + }, + { + "epoch": 5.216083486801719, + "grad_norm": 0.22313638031482697, + "learning_rate": 4.893638042364758e-05, + "loss": 1.6917, + "step": 16994 + }, + { + "epoch": 5.216390423572744, + "grad_norm": 0.258297860622406, + "learning_rate": 4.893141099380475e-05, + "loss": 1.7816, + "step": 16995 + }, + { + "epoch": 5.216697360343769, + "grad_norm": 0.2656872272491455, + 
"learning_rate": 4.892644157452233e-05, + "loss": 1.7248, + "step": 16996 + }, + { + "epoch": 5.217004297114794, + "grad_norm": 0.20239698886871338, + "learning_rate": 4.8921472165849464e-05, + "loss": 1.7629, + "step": 16997 + }, + { + "epoch": 5.21731123388582, + "grad_norm": 0.2575492262840271, + "learning_rate": 4.891650276783523e-05, + "loss": 1.719, + "step": 16998 + }, + { + "epoch": 5.217618170656845, + "grad_norm": 0.27563637495040894, + "learning_rate": 4.8911533380528756e-05, + "loss": 1.718, + "step": 16999 + }, + { + "epoch": 5.21792510742787, + "grad_norm": 0.1969723105430603, + "learning_rate": 4.890656400397915e-05, + "loss": 1.7557, + "step": 17000 + }, + { + "epoch": 5.218232044198895, + "grad_norm": 0.24336831271648407, + "learning_rate": 4.89015946382355e-05, + "loss": 1.6861, + "step": 17001 + }, + { + "epoch": 5.21853898096992, + "grad_norm": 0.2804388403892517, + "learning_rate": 4.889662528334696e-05, + "loss": 1.7411, + "step": 17002 + }, + { + "epoch": 5.218845917740945, + "grad_norm": 0.21116352081298828, + "learning_rate": 4.8891655939362596e-05, + "loss": 1.7135, + "step": 17003 + }, + { + "epoch": 5.219152854511971, + "grad_norm": 0.21042904257774353, + "learning_rate": 4.8886686606331556e-05, + "loss": 1.7224, + "step": 17004 + }, + { + "epoch": 5.219459791282996, + "grad_norm": 0.22463755309581757, + "learning_rate": 4.888171728430291e-05, + "loss": 1.8272, + "step": 17005 + }, + { + "epoch": 5.2197667280540205, + "grad_norm": 0.25604158639907837, + "learning_rate": 4.8876747973325805e-05, + "loss": 1.674, + "step": 17006 + }, + { + "epoch": 5.220073664825046, + "grad_norm": 0.3108421564102173, + "learning_rate": 4.887177867344932e-05, + "loss": 1.761, + "step": 17007 + }, + { + "epoch": 5.220380601596071, + "grad_norm": 0.25135359168052673, + "learning_rate": 4.88668093847226e-05, + "loss": 1.7455, + "step": 17008 + }, + { + "epoch": 5.2206875383670965, + "grad_norm": 0.24508307874202728, + "learning_rate": 4.886184010719471e-05, + 
"loss": 1.7632, + "step": 17009 + }, + { + "epoch": 5.220994475138122, + "grad_norm": 0.26777148246765137, + "learning_rate": 4.8856870840914816e-05, + "loss": 1.7814, + "step": 17010 + }, + { + "epoch": 5.221301411909146, + "grad_norm": 0.22404739260673523, + "learning_rate": 4.8851901585931967e-05, + "loss": 1.7441, + "step": 17011 + }, + { + "epoch": 5.221608348680172, + "grad_norm": 0.2406606674194336, + "learning_rate": 4.884693234229531e-05, + "loss": 1.7789, + "step": 17012 + }, + { + "epoch": 5.221915285451197, + "grad_norm": 0.27320384979248047, + "learning_rate": 4.884196311005394e-05, + "loss": 1.8046, + "step": 17013 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.3393586277961731, + "learning_rate": 4.8836993889256965e-05, + "loss": 1.7155, + "step": 17014 + }, + { + "epoch": 5.222529158993248, + "grad_norm": 0.3069504499435425, + "learning_rate": 4.88320246799535e-05, + "loss": 1.6985, + "step": 17015 + }, + { + "epoch": 5.222836095764273, + "grad_norm": 0.22184616327285767, + "learning_rate": 4.8827055482192664e-05, + "loss": 1.7996, + "step": 17016 + }, + { + "epoch": 5.223143032535297, + "grad_norm": 0.2791864573955536, + "learning_rate": 4.8822086296023544e-05, + "loss": 1.7223, + "step": 17017 + }, + { + "epoch": 5.223449969306323, + "grad_norm": 0.259726345539093, + "learning_rate": 4.8817117121495245e-05, + "loss": 1.7481, + "step": 17018 + }, + { + "epoch": 5.223756906077348, + "grad_norm": 0.19968681037425995, + "learning_rate": 4.8812147958656916e-05, + "loss": 1.702, + "step": 17019 + }, + { + "epoch": 5.224063842848373, + "grad_norm": 0.20161856710910797, + "learning_rate": 4.8807178807557616e-05, + "loss": 1.6689, + "step": 17020 + }, + { + "epoch": 5.224370779619399, + "grad_norm": 0.2365240454673767, + "learning_rate": 4.880220966824649e-05, + "loss": 1.7742, + "step": 17021 + }, + { + "epoch": 5.224677716390423, + "grad_norm": 0.20116381347179413, + "learning_rate": 4.879724054077261e-05, + "loss": 1.7584, + "step": 17022 + }, 
+ { + "epoch": 5.2249846531614486, + "grad_norm": 0.22845037281513214, + "learning_rate": 4.879227142518511e-05, + "loss": 1.7794, + "step": 17023 + }, + { + "epoch": 5.225291589932474, + "grad_norm": 0.251724511384964, + "learning_rate": 4.87873023215331e-05, + "loss": 1.7722, + "step": 17024 + }, + { + "epoch": 5.225598526703499, + "grad_norm": 0.206145241856575, + "learning_rate": 4.878233322986568e-05, + "loss": 1.7452, + "step": 17025 + }, + { + "epoch": 5.225905463474525, + "grad_norm": 0.24065247178077698, + "learning_rate": 4.877736415023194e-05, + "loss": 1.8144, + "step": 17026 + }, + { + "epoch": 5.226212400245549, + "grad_norm": 0.2255484163761139, + "learning_rate": 4.877239508268103e-05, + "loss": 1.706, + "step": 17027 + }, + { + "epoch": 5.226519337016574, + "grad_norm": 0.21035850048065186, + "learning_rate": 4.8767426027262e-05, + "loss": 1.7167, + "step": 17028 + }, + { + "epoch": 5.2268262737876, + "grad_norm": 0.19618964195251465, + "learning_rate": 4.8762456984024025e-05, + "loss": 1.7063, + "step": 17029 + }, + { + "epoch": 5.227133210558625, + "grad_norm": 0.19595398008823395, + "learning_rate": 4.875748795301614e-05, + "loss": 1.7452, + "step": 17030 + }, + { + "epoch": 5.22744014732965, + "grad_norm": 0.22870996594429016, + "learning_rate": 4.8752518934287506e-05, + "loss": 1.8169, + "step": 17031 + }, + { + "epoch": 5.227747084100676, + "grad_norm": 0.24048443138599396, + "learning_rate": 4.87475499278872e-05, + "loss": 1.6988, + "step": 17032 + }, + { + "epoch": 5.2280540208717, + "grad_norm": 0.24177183210849762, + "learning_rate": 4.8742580933864356e-05, + "loss": 1.77, + "step": 17033 + }, + { + "epoch": 5.2283609576427255, + "grad_norm": 0.2023085057735443, + "learning_rate": 4.873761195226806e-05, + "loss": 1.7, + "step": 17034 + }, + { + "epoch": 5.228667894413751, + "grad_norm": 0.2614101767539978, + "learning_rate": 4.873264298314742e-05, + "loss": 1.767, + "step": 17035 + }, + { + "epoch": 5.228974831184776, + "grad_norm": 
0.19607602059841156, + "learning_rate": 4.872767402655154e-05, + "loss": 1.7391, + "step": 17036 + }, + { + "epoch": 5.2292817679558015, + "grad_norm": 0.2053994983434677, + "learning_rate": 4.872270508252953e-05, + "loss": 1.7155, + "step": 17037 + }, + { + "epoch": 5.229588704726826, + "grad_norm": 0.18256273865699768, + "learning_rate": 4.871773615113051e-05, + "loss": 1.6999, + "step": 17038 + }, + { + "epoch": 5.229895641497851, + "grad_norm": 0.21956393122673035, + "learning_rate": 4.871276723240356e-05, + "loss": 1.7946, + "step": 17039 + }, + { + "epoch": 5.230202578268877, + "grad_norm": 0.23779109120368958, + "learning_rate": 4.870779832639781e-05, + "loss": 1.8063, + "step": 17040 + }, + { + "epoch": 5.230509515039902, + "grad_norm": 0.21662941575050354, + "learning_rate": 4.8702829433162346e-05, + "loss": 1.7276, + "step": 17041 + }, + { + "epoch": 5.230816451810927, + "grad_norm": 0.21578755974769592, + "learning_rate": 4.869786055274628e-05, + "loss": 1.7577, + "step": 17042 + }, + { + "epoch": 5.231123388581952, + "grad_norm": 0.23229347169399261, + "learning_rate": 4.8692891685198715e-05, + "loss": 1.7884, + "step": 17043 + }, + { + "epoch": 5.231430325352977, + "grad_norm": 0.2302366942167282, + "learning_rate": 4.868792283056878e-05, + "loss": 1.7823, + "step": 17044 + }, + { + "epoch": 5.231737262124002, + "grad_norm": 0.2181033343076706, + "learning_rate": 4.868295398890554e-05, + "loss": 1.7027, + "step": 17045 + }, + { + "epoch": 5.232044198895028, + "grad_norm": 0.20863409340381622, + "learning_rate": 4.8677985160258135e-05, + "loss": 1.7247, + "step": 17046 + }, + { + "epoch": 5.232351135666053, + "grad_norm": 0.2242976278066635, + "learning_rate": 4.867301634467564e-05, + "loss": 1.7799, + "step": 17047 + }, + { + "epoch": 5.232658072437078, + "grad_norm": 0.19934964179992676, + "learning_rate": 4.866804754220719e-05, + "loss": 1.6973, + "step": 17048 + }, + { + "epoch": 5.232965009208103, + "grad_norm": 0.22056198120117188, + 
"learning_rate": 4.8663078752901855e-05, + "loss": 1.7677, + "step": 17049 + }, + { + "epoch": 5.233271945979128, + "grad_norm": 0.2303200513124466, + "learning_rate": 4.865810997680879e-05, + "loss": 1.7517, + "step": 17050 + }, + { + "epoch": 5.2335788827501535, + "grad_norm": 0.21193410456180573, + "learning_rate": 4.8653141213977066e-05, + "loss": 1.7478, + "step": 17051 + }, + { + "epoch": 5.233885819521179, + "grad_norm": 0.18498395383358002, + "learning_rate": 4.864817246445577e-05, + "loss": 1.6891, + "step": 17052 + }, + { + "epoch": 5.234192756292204, + "grad_norm": 0.22879233956336975, + "learning_rate": 4.8643203728294036e-05, + "loss": 1.7166, + "step": 17053 + }, + { + "epoch": 5.234499693063229, + "grad_norm": 0.2128525823354721, + "learning_rate": 4.8638235005540944e-05, + "loss": 1.7993, + "step": 17054 + }, + { + "epoch": 5.234806629834254, + "grad_norm": 0.21245025098323822, + "learning_rate": 4.8633266296245634e-05, + "loss": 1.7436, + "step": 17055 + }, + { + "epoch": 5.235113566605279, + "grad_norm": 0.20301629602909088, + "learning_rate": 4.8628297600457165e-05, + "loss": 1.7774, + "step": 17056 + }, + { + "epoch": 5.235420503376305, + "grad_norm": 0.23251961171627045, + "learning_rate": 4.8623328918224687e-05, + "loss": 1.7897, + "step": 17057 + }, + { + "epoch": 5.23572744014733, + "grad_norm": 0.2272956669330597, + "learning_rate": 4.861836024959726e-05, + "loss": 1.7668, + "step": 17058 + }, + { + "epoch": 5.236034376918354, + "grad_norm": 0.20540569722652435, + "learning_rate": 4.8613391594624013e-05, + "loss": 1.7549, + "step": 17059 + }, + { + "epoch": 5.23634131368938, + "grad_norm": 0.20306967198848724, + "learning_rate": 4.8608422953354034e-05, + "loss": 1.6993, + "step": 17060 + }, + { + "epoch": 5.236648250460405, + "grad_norm": 0.19415293633937836, + "learning_rate": 4.8603454325836455e-05, + "loss": 1.7313, + "step": 17061 + }, + { + "epoch": 5.23695518723143, + "grad_norm": 0.2058337777853012, + "learning_rate": 
4.859848571212034e-05, + "loss": 1.7994, + "step": 17062 + }, + { + "epoch": 5.237262124002456, + "grad_norm": 0.24489709734916687, + "learning_rate": 4.859351711225483e-05, + "loss": 1.7555, + "step": 17063 + }, + { + "epoch": 5.237569060773481, + "grad_norm": 0.22589795291423798, + "learning_rate": 4.858854852628899e-05, + "loss": 1.7136, + "step": 17064 + }, + { + "epoch": 5.2378759975445055, + "grad_norm": 0.21404492855072021, + "learning_rate": 4.858357995427195e-05, + "loss": 1.7598, + "step": 17065 + }, + { + "epoch": 5.238182934315531, + "grad_norm": 0.24936965107917786, + "learning_rate": 4.8578611396252786e-05, + "loss": 1.8027, + "step": 17066 + }, + { + "epoch": 5.238489871086556, + "grad_norm": 0.23391515016555786, + "learning_rate": 4.857364285228065e-05, + "loss": 1.7704, + "step": 17067 + }, + { + "epoch": 5.2387968078575815, + "grad_norm": 0.22633357346057892, + "learning_rate": 4.85686743224046e-05, + "loss": 1.7075, + "step": 17068 + }, + { + "epoch": 5.239103744628607, + "grad_norm": 0.221492201089859, + "learning_rate": 4.8563705806673736e-05, + "loss": 1.7755, + "step": 17069 + }, + { + "epoch": 5.239410681399631, + "grad_norm": 0.2381046712398529, + "learning_rate": 4.855873730513719e-05, + "loss": 1.7971, + "step": 17070 + }, + { + "epoch": 5.239717618170657, + "grad_norm": 0.21930988132953644, + "learning_rate": 4.855376881784402e-05, + "loss": 1.7295, + "step": 17071 + }, + { + "epoch": 5.240024554941682, + "grad_norm": 0.20897921919822693, + "learning_rate": 4.854880034484339e-05, + "loss": 1.7796, + "step": 17072 + }, + { + "epoch": 5.240331491712707, + "grad_norm": 0.26616254448890686, + "learning_rate": 4.8543831886184334e-05, + "loss": 1.7095, + "step": 17073 + }, + { + "epoch": 5.240638428483733, + "grad_norm": 0.19513870775699615, + "learning_rate": 4.853886344191601e-05, + "loss": 1.7181, + "step": 17074 + }, + { + "epoch": 5.240945365254758, + "grad_norm": 0.23476530611515045, + "learning_rate": 4.853389501208747e-05, + "loss": 
1.7928, + "step": 17075 + }, + { + "epoch": 5.241252302025782, + "grad_norm": 0.18197014927864075, + "learning_rate": 4.852892659674785e-05, + "loss": 1.6888, + "step": 17076 + }, + { + "epoch": 5.241559238796808, + "grad_norm": 0.20317208766937256, + "learning_rate": 4.852395819594623e-05, + "loss": 1.7828, + "step": 17077 + }, + { + "epoch": 5.241866175567833, + "grad_norm": 0.1953772008419037, + "learning_rate": 4.851898980973175e-05, + "loss": 1.7394, + "step": 17078 + }, + { + "epoch": 5.242173112338858, + "grad_norm": 0.19714407622814178, + "learning_rate": 4.851402143815345e-05, + "loss": 1.7261, + "step": 17079 + }, + { + "epoch": 5.242480049109884, + "grad_norm": 0.2196008861064911, + "learning_rate": 4.850905308126048e-05, + "loss": 1.7387, + "step": 17080 + }, + { + "epoch": 5.242786985880908, + "grad_norm": 0.2337818443775177, + "learning_rate": 4.85040847391019e-05, + "loss": 1.7448, + "step": 17081 + }, + { + "epoch": 5.2430939226519335, + "grad_norm": 0.20940040051937103, + "learning_rate": 4.849911641172685e-05, + "loss": 1.7354, + "step": 17082 + }, + { + "epoch": 5.243400859422959, + "grad_norm": 0.2242170125246048, + "learning_rate": 4.849414809918439e-05, + "loss": 1.7325, + "step": 17083 + }, + { + "epoch": 5.243707796193984, + "grad_norm": 0.2322687953710556, + "learning_rate": 4.8489179801523675e-05, + "loss": 1.7557, + "step": 17084 + }, + { + "epoch": 5.2440147329650095, + "grad_norm": 0.20303767919540405, + "learning_rate": 4.8484211518793764e-05, + "loss": 1.7063, + "step": 17085 + }, + { + "epoch": 5.244321669736034, + "grad_norm": 0.2446853369474411, + "learning_rate": 4.8479243251043746e-05, + "loss": 1.7587, + "step": 17086 + }, + { + "epoch": 5.244628606507059, + "grad_norm": 0.22901636362075806, + "learning_rate": 4.8474274998322735e-05, + "loss": 1.7992, + "step": 17087 + }, + { + "epoch": 5.244935543278085, + "grad_norm": 0.29676303267478943, + "learning_rate": 4.846930676067984e-05, + "loss": 1.7688, + "step": 17088 + }, + { + 
"epoch": 5.24524248004911, + "grad_norm": 0.24160240590572357, + "learning_rate": 4.846433853816416e-05, + "loss": 1.7367, + "step": 17089 + }, + { + "epoch": 5.245549416820135, + "grad_norm": 0.2097402662038803, + "learning_rate": 4.8459370330824774e-05, + "loss": 1.721, + "step": 17090 + }, + { + "epoch": 5.245856353591161, + "grad_norm": 0.26451143622398376, + "learning_rate": 4.8454402138710814e-05, + "loss": 1.7707, + "step": 17091 + }, + { + "epoch": 5.246163290362185, + "grad_norm": 0.30428358912467957, + "learning_rate": 4.844943396187133e-05, + "loss": 1.7232, + "step": 17092 + }, + { + "epoch": 5.24647022713321, + "grad_norm": 0.24332918226718903, + "learning_rate": 4.8444465800355466e-05, + "loss": 1.8215, + "step": 17093 + }, + { + "epoch": 5.246777163904236, + "grad_norm": 0.292703777551651, + "learning_rate": 4.843949765421229e-05, + "loss": 1.7199, + "step": 17094 + }, + { + "epoch": 5.247084100675261, + "grad_norm": 0.2458789199590683, + "learning_rate": 4.843452952349094e-05, + "loss": 1.7615, + "step": 17095 + }, + { + "epoch": 5.247391037446286, + "grad_norm": 0.22538037598133087, + "learning_rate": 4.842956140824045e-05, + "loss": 1.7279, + "step": 17096 + }, + { + "epoch": 5.247697974217311, + "grad_norm": 0.2959176003932953, + "learning_rate": 4.842459330850999e-05, + "loss": 1.767, + "step": 17097 + }, + { + "epoch": 5.248004910988336, + "grad_norm": 0.26158571243286133, + "learning_rate": 4.84196252243486e-05, + "loss": 1.7387, + "step": 17098 + }, + { + "epoch": 5.2483118477593615, + "grad_norm": 0.22855687141418457, + "learning_rate": 4.84146571558054e-05, + "loss": 1.7497, + "step": 17099 + }, + { + "epoch": 5.248618784530387, + "grad_norm": 0.22470593452453613, + "learning_rate": 4.840968910292949e-05, + "loss": 1.7705, + "step": 17100 + }, + { + "epoch": 5.248925721301412, + "grad_norm": 0.24680538475513458, + "learning_rate": 4.840472106576998e-05, + "loss": 1.7426, + "step": 17101 + }, + { + "epoch": 5.249232658072437, + "grad_norm": 
0.23919185996055603, + "learning_rate": 4.839975304437594e-05, + "loss": 1.78, + "step": 17102 + }, + { + "epoch": 5.249539594843462, + "grad_norm": 0.24717366695404053, + "learning_rate": 4.839478503879647e-05, + "loss": 1.7373, + "step": 17103 + }, + { + "epoch": 5.249846531614487, + "grad_norm": 0.20463785529136658, + "learning_rate": 4.838981704908068e-05, + "loss": 1.702, + "step": 17104 + }, + { + "epoch": 5.250153468385513, + "grad_norm": 0.19791419804096222, + "learning_rate": 4.838484907527766e-05, + "loss": 1.746, + "step": 17105 + }, + { + "epoch": 5.250460405156538, + "grad_norm": 0.26169353723526, + "learning_rate": 4.837988111743652e-05, + "loss": 1.7227, + "step": 17106 + }, + { + "epoch": 5.250767341927563, + "grad_norm": 0.23545648157596588, + "learning_rate": 4.837491317560633e-05, + "loss": 1.7104, + "step": 17107 + }, + { + "epoch": 5.251074278698588, + "grad_norm": 0.21569804847240448, + "learning_rate": 4.836994524983622e-05, + "loss": 1.7883, + "step": 17108 + }, + { + "epoch": 5.251381215469613, + "grad_norm": 0.2730300724506378, + "learning_rate": 4.836497734017524e-05, + "loss": 1.7105, + "step": 17109 + }, + { + "epoch": 5.2516881522406385, + "grad_norm": 0.2834697663784027, + "learning_rate": 4.836000944667253e-05, + "loss": 1.8041, + "step": 17110 + }, + { + "epoch": 5.251995089011664, + "grad_norm": 0.31536951661109924, + "learning_rate": 4.835504156937715e-05, + "loss": 1.7708, + "step": 17111 + }, + { + "epoch": 5.252302025782689, + "grad_norm": 0.3830285668373108, + "learning_rate": 4.835007370833824e-05, + "loss": 1.7464, + "step": 17112 + }, + { + "epoch": 5.252608962553714, + "grad_norm": 0.23248349130153656, + "learning_rate": 4.834510586360485e-05, + "loss": 1.7274, + "step": 17113 + }, + { + "epoch": 5.252915899324739, + "grad_norm": 0.4755091071128845, + "learning_rate": 4.834013803522611e-05, + "loss": 1.7853, + "step": 17114 + }, + { + "epoch": 5.253222836095764, + "grad_norm": 0.4267823398113251, + "learning_rate": 
4.8335170223251073e-05, + "loss": 1.7424, + "step": 17115 + }, + { + "epoch": 5.25352977286679, + "grad_norm": 0.17621731758117676, + "learning_rate": 4.8330202427728876e-05, + "loss": 1.7415, + "step": 17116 + }, + { + "epoch": 5.253836709637815, + "grad_norm": 0.37484630942344666, + "learning_rate": 4.832523464870859e-05, + "loss": 1.7357, + "step": 17117 + }, + { + "epoch": 5.25414364640884, + "grad_norm": 0.27773791551589966, + "learning_rate": 4.832026688623933e-05, + "loss": 1.717, + "step": 17118 + }, + { + "epoch": 5.254450583179865, + "grad_norm": 0.31190845370292664, + "learning_rate": 4.8315299140370183e-05, + "loss": 1.7226, + "step": 17119 + }, + { + "epoch": 5.25475751995089, + "grad_norm": 0.4321303367614746, + "learning_rate": 4.8310331411150215e-05, + "loss": 1.8003, + "step": 17120 + }, + { + "epoch": 5.255064456721915, + "grad_norm": 0.31622835993766785, + "learning_rate": 4.830536369862855e-05, + "loss": 1.8462, + "step": 17121 + }, + { + "epoch": 5.255371393492941, + "grad_norm": 0.2144850194454193, + "learning_rate": 4.830039600285427e-05, + "loss": 1.8153, + "step": 17122 + }, + { + "epoch": 5.255678330263966, + "grad_norm": 0.3107511103153229, + "learning_rate": 4.829542832387649e-05, + "loss": 1.7271, + "step": 17123 + }, + { + "epoch": 5.2559852670349905, + "grad_norm": 0.24607159197330475, + "learning_rate": 4.8290460661744265e-05, + "loss": 1.7946, + "step": 17124 + }, + { + "epoch": 5.256292203806016, + "grad_norm": 0.226362943649292, + "learning_rate": 4.828549301650673e-05, + "loss": 1.7338, + "step": 17125 + }, + { + "epoch": 5.256599140577041, + "grad_norm": 0.29993724822998047, + "learning_rate": 4.828052538821294e-05, + "loss": 1.8, + "step": 17126 + }, + { + "epoch": 5.2569060773480665, + "grad_norm": 0.25639984011650085, + "learning_rate": 4.8275557776912014e-05, + "loss": 1.8009, + "step": 17127 + }, + { + "epoch": 5.257213014119092, + "grad_norm": 0.2308105081319809, + "learning_rate": 4.8270590182653024e-05, + "loss": 1.7468, 
+ "step": 17128 + }, + { + "epoch": 5.257519950890116, + "grad_norm": 0.27337542176246643, + "learning_rate": 4.82656226054851e-05, + "loss": 1.7725, + "step": 17129 + }, + { + "epoch": 5.257826887661142, + "grad_norm": 0.24848094582557678, + "learning_rate": 4.826065504545729e-05, + "loss": 1.8084, + "step": 17130 + }, + { + "epoch": 5.258133824432167, + "grad_norm": 0.35026392340660095, + "learning_rate": 4.825568750261872e-05, + "loss": 1.7705, + "step": 17131 + }, + { + "epoch": 5.258440761203192, + "grad_norm": 0.3207968473434448, + "learning_rate": 4.825071997701846e-05, + "loss": 1.7329, + "step": 17132 + }, + { + "epoch": 5.258747697974218, + "grad_norm": 0.20949263870716095, + "learning_rate": 4.8245752468705614e-05, + "loss": 1.7658, + "step": 17133 + }, + { + "epoch": 5.259054634745242, + "grad_norm": 0.3158881366252899, + "learning_rate": 4.824078497772926e-05, + "loss": 1.7249, + "step": 17134 + }, + { + "epoch": 5.259361571516267, + "grad_norm": 0.2283414602279663, + "learning_rate": 4.823581750413852e-05, + "loss": 1.7177, + "step": 17135 + }, + { + "epoch": 5.259668508287293, + "grad_norm": 0.24753578007221222, + "learning_rate": 4.823085004798247e-05, + "loss": 1.7232, + "step": 17136 + }, + { + "epoch": 5.259975445058318, + "grad_norm": 0.20381587743759155, + "learning_rate": 4.822588260931017e-05, + "loss": 1.7049, + "step": 17137 + }, + { + "epoch": 5.260282381829343, + "grad_norm": 0.21220643818378448, + "learning_rate": 4.8220915188170746e-05, + "loss": 1.7221, + "step": 17138 + }, + { + "epoch": 5.260589318600369, + "grad_norm": 0.19324758648872375, + "learning_rate": 4.8215947784613276e-05, + "loss": 1.7168, + "step": 17139 + }, + { + "epoch": 5.260896255371393, + "grad_norm": 0.26500338315963745, + "learning_rate": 4.821098039868688e-05, + "loss": 1.7627, + "step": 17140 + }, + { + "epoch": 5.2612031921424185, + "grad_norm": 0.19597655534744263, + "learning_rate": 4.82060130304406e-05, + "loss": 1.7214, + "step": 17141 + }, + { + "epoch": 
5.261510128913444, + "grad_norm": 0.2105483114719391, + "learning_rate": 4.820104567992357e-05, + "loss": 1.6742, + "step": 17142 + }, + { + "epoch": 5.261817065684469, + "grad_norm": 0.20020028948783875, + "learning_rate": 4.8196078347184837e-05, + "loss": 1.7721, + "step": 17143 + }, + { + "epoch": 5.2621240024554945, + "grad_norm": 0.2313549965620041, + "learning_rate": 4.819111103227353e-05, + "loss": 1.7644, + "step": 17144 + }, + { + "epoch": 5.262430939226519, + "grad_norm": 0.31893789768218994, + "learning_rate": 4.818614373523871e-05, + "loss": 1.747, + "step": 17145 + }, + { + "epoch": 5.262737875997544, + "grad_norm": 0.2531197667121887, + "learning_rate": 4.8181176456129505e-05, + "loss": 1.7713, + "step": 17146 + }, + { + "epoch": 5.26304481276857, + "grad_norm": 0.2063976377248764, + "learning_rate": 4.817620919499496e-05, + "loss": 1.7254, + "step": 17147 + }, + { + "epoch": 5.263351749539595, + "grad_norm": 0.22220590710639954, + "learning_rate": 4.8171241951884204e-05, + "loss": 1.7345, + "step": 17148 + }, + { + "epoch": 5.26365868631062, + "grad_norm": 0.24240384995937347, + "learning_rate": 4.8166274726846286e-05, + "loss": 1.7302, + "step": 17149 + }, + { + "epoch": 5.263965623081646, + "grad_norm": 0.215829998254776, + "learning_rate": 4.8161307519930326e-05, + "loss": 1.7725, + "step": 17150 + }, + { + "epoch": 5.26427255985267, + "grad_norm": 0.2697906494140625, + "learning_rate": 4.815634033118541e-05, + "loss": 1.7156, + "step": 17151 + }, + { + "epoch": 5.264579496623695, + "grad_norm": 0.21649456024169922, + "learning_rate": 4.815137316066061e-05, + "loss": 1.745, + "step": 17152 + }, + { + "epoch": 5.264886433394721, + "grad_norm": 0.22773787379264832, + "learning_rate": 4.8146406008405033e-05, + "loss": 1.7592, + "step": 17153 + }, + { + "epoch": 5.265193370165746, + "grad_norm": 0.2920280396938324, + "learning_rate": 4.8141438874467745e-05, + "loss": 1.8301, + "step": 17154 + }, + { + "epoch": 5.265500306936771, + "grad_norm": 
0.23919162154197693, + "learning_rate": 4.813647175889785e-05, + "loss": 1.7687, + "step": 17155 + }, + { + "epoch": 5.265807243707796, + "grad_norm": 0.24617896974086761, + "learning_rate": 4.8131504661744425e-05, + "loss": 1.8279, + "step": 17156 + }, + { + "epoch": 5.266114180478821, + "grad_norm": 0.22756172716617584, + "learning_rate": 4.812653758305659e-05, + "loss": 1.7595, + "step": 17157 + }, + { + "epoch": 5.2664211172498465, + "grad_norm": 0.22939376533031464, + "learning_rate": 4.812157052288339e-05, + "loss": 1.7445, + "step": 17158 + }, + { + "epoch": 5.266728054020872, + "grad_norm": 0.21021319925785065, + "learning_rate": 4.811660348127395e-05, + "loss": 1.7875, + "step": 17159 + }, + { + "epoch": 5.267034990791897, + "grad_norm": 0.2271810919046402, + "learning_rate": 4.811163645827732e-05, + "loss": 1.74, + "step": 17160 + }, + { + "epoch": 5.267341927562922, + "grad_norm": 0.238374263048172, + "learning_rate": 4.81066694539426e-05, + "loss": 1.7717, + "step": 17161 + }, + { + "epoch": 5.267648864333947, + "grad_norm": 0.20655091106891632, + "learning_rate": 4.8101702468318885e-05, + "loss": 1.7447, + "step": 17162 + }, + { + "epoch": 5.267955801104972, + "grad_norm": 0.24652259051799774, + "learning_rate": 4.809673550145528e-05, + "loss": 1.7755, + "step": 17163 + }, + { + "epoch": 5.268262737875998, + "grad_norm": 0.20256781578063965, + "learning_rate": 4.809176855340083e-05, + "loss": 1.7689, + "step": 17164 + }, + { + "epoch": 5.268569674647023, + "grad_norm": 0.27023112773895264, + "learning_rate": 4.8086801624204665e-05, + "loss": 1.8364, + "step": 17165 + }, + { + "epoch": 5.268876611418047, + "grad_norm": 0.251638799905777, + "learning_rate": 4.808183471391582e-05, + "loss": 1.7924, + "step": 17166 + }, + { + "epoch": 5.269183548189073, + "grad_norm": 0.22897782921791077, + "learning_rate": 4.807686782258342e-05, + "loss": 1.7378, + "step": 17167 + }, + { + "epoch": 5.269490484960098, + "grad_norm": 0.19141456484794617, + "learning_rate": 
4.807190095025655e-05, + "loss": 1.6911, + "step": 17168 + }, + { + "epoch": 5.269797421731123, + "grad_norm": 0.19960568845272064, + "learning_rate": 4.806693409698427e-05, + "loss": 1.71, + "step": 17169 + }, + { + "epoch": 5.270104358502149, + "grad_norm": 0.23332087695598602, + "learning_rate": 4.8061967262815694e-05, + "loss": 1.7993, + "step": 17170 + }, + { + "epoch": 5.270411295273174, + "grad_norm": 0.24831432104110718, + "learning_rate": 4.8057000447799876e-05, + "loss": 1.7459, + "step": 17171 + }, + { + "epoch": 5.2707182320441985, + "grad_norm": 0.24735838174819946, + "learning_rate": 4.805203365198593e-05, + "loss": 1.7751, + "step": 17172 + }, + { + "epoch": 5.271025168815224, + "grad_norm": 0.32630103826522827, + "learning_rate": 4.804706687542291e-05, + "loss": 1.7885, + "step": 17173 + }, + { + "epoch": 5.271332105586249, + "grad_norm": 0.29055842757225037, + "learning_rate": 4.804210011815995e-05, + "loss": 1.6819, + "step": 17174 + }, + { + "epoch": 5.2716390423572745, + "grad_norm": 0.22968806326389313, + "learning_rate": 4.803713338024608e-05, + "loss": 1.8146, + "step": 17175 + }, + { + "epoch": 5.2719459791283, + "grad_norm": 0.23430144786834717, + "learning_rate": 4.8032166661730434e-05, + "loss": 1.7401, + "step": 17176 + }, + { + "epoch": 5.272252915899324, + "grad_norm": 0.26312723755836487, + "learning_rate": 4.802719996266204e-05, + "loss": 1.8319, + "step": 17177 + }, + { + "epoch": 5.27255985267035, + "grad_norm": 0.23715369403362274, + "learning_rate": 4.802223328309003e-05, + "loss": 1.8014, + "step": 17178 + }, + { + "epoch": 5.272866789441375, + "grad_norm": 0.23943877220153809, + "learning_rate": 4.801726662306347e-05, + "loss": 1.7181, + "step": 17179 + }, + { + "epoch": 5.2731737262124, + "grad_norm": 0.2366543412208557, + "learning_rate": 4.8012299982631435e-05, + "loss": 1.6685, + "step": 17180 + }, + { + "epoch": 5.273480662983426, + "grad_norm": 0.20688587427139282, + "learning_rate": 4.8007333361843016e-05, + "loss": 
1.7089, + "step": 17181 + }, + { + "epoch": 5.273787599754451, + "grad_norm": 0.2069951444864273, + "learning_rate": 4.8002366760747314e-05, + "loss": 1.7447, + "step": 17182 + }, + { + "epoch": 5.274094536525475, + "grad_norm": 0.26072344183921814, + "learning_rate": 4.7997400179393374e-05, + "loss": 1.7346, + "step": 17183 + }, + { + "epoch": 5.274401473296501, + "grad_norm": 0.2397938072681427, + "learning_rate": 4.799243361783031e-05, + "loss": 1.7556, + "step": 17184 + }, + { + "epoch": 5.274708410067526, + "grad_norm": 0.23606348037719727, + "learning_rate": 4.798746707610721e-05, + "loss": 1.732, + "step": 17185 + }, + { + "epoch": 5.2750153468385514, + "grad_norm": 0.21078252792358398, + "learning_rate": 4.798250055427311e-05, + "loss": 1.7571, + "step": 17186 + }, + { + "epoch": 5.275322283609577, + "grad_norm": 0.21331414580345154, + "learning_rate": 4.797753405237714e-05, + "loss": 1.732, + "step": 17187 + }, + { + "epoch": 5.275629220380601, + "grad_norm": 0.23700307309627533, + "learning_rate": 4.7972567570468354e-05, + "loss": 1.7354, + "step": 17188 + }, + { + "epoch": 5.275936157151627, + "grad_norm": 0.20519722998142242, + "learning_rate": 4.7967601108595845e-05, + "loss": 1.7435, + "step": 17189 + }, + { + "epoch": 5.276243093922652, + "grad_norm": 0.22358302772045135, + "learning_rate": 4.79626346668087e-05, + "loss": 1.7891, + "step": 17190 + }, + { + "epoch": 5.276550030693677, + "grad_norm": 0.2434413880109787, + "learning_rate": 4.795766824515598e-05, + "loss": 1.814, + "step": 17191 + }, + { + "epoch": 5.276856967464703, + "grad_norm": 0.2198423594236374, + "learning_rate": 4.795270184368678e-05, + "loss": 1.7212, + "step": 17192 + }, + { + "epoch": 5.277163904235728, + "grad_norm": 0.23587806522846222, + "learning_rate": 4.7947735462450205e-05, + "loss": 1.8337, + "step": 17193 + }, + { + "epoch": 5.277470841006752, + "grad_norm": 0.234666645526886, + "learning_rate": 4.794276910149528e-05, + "loss": 1.7548, + "step": 17194 + }, + { + 
"epoch": 5.277777777777778, + "grad_norm": 0.23363247513771057, + "learning_rate": 4.793780276087115e-05, + "loss": 1.7587, + "step": 17195 + }, + { + "epoch": 5.278084714548803, + "grad_norm": 0.23191119730472565, + "learning_rate": 4.793283644062683e-05, + "loss": 1.7691, + "step": 17196 + }, + { + "epoch": 5.278391651319828, + "grad_norm": 0.2363097071647644, + "learning_rate": 4.7927870140811445e-05, + "loss": 1.8139, + "step": 17197 + }, + { + "epoch": 5.278698588090854, + "grad_norm": 0.2852413058280945, + "learning_rate": 4.7922903861474056e-05, + "loss": 1.7905, + "step": 17198 + }, + { + "epoch": 5.279005524861878, + "grad_norm": 0.23633842170238495, + "learning_rate": 4.7917937602663764e-05, + "loss": 1.8014, + "step": 17199 + }, + { + "epoch": 5.2793124616329035, + "grad_norm": 0.27007919549942017, + "learning_rate": 4.791297136442961e-05, + "loss": 1.7242, + "step": 17200 + }, + { + "epoch": 5.279619398403929, + "grad_norm": 0.29482147097587585, + "learning_rate": 4.790800514682072e-05, + "loss": 1.7154, + "step": 17201 + }, + { + "epoch": 5.279926335174954, + "grad_norm": 0.27772340178489685, + "learning_rate": 4.790303894988614e-05, + "loss": 1.7771, + "step": 17202 + }, + { + "epoch": 5.2802332719459795, + "grad_norm": 0.21761848032474518, + "learning_rate": 4.789807277367495e-05, + "loss": 1.6983, + "step": 17203 + }, + { + "epoch": 5.280540208717004, + "grad_norm": 0.22621290385723114, + "learning_rate": 4.789310661823626e-05, + "loss": 1.7667, + "step": 17204 + }, + { + "epoch": 5.280847145488029, + "grad_norm": 0.2284683883190155, + "learning_rate": 4.7888140483619095e-05, + "loss": 1.7419, + "step": 17205 + }, + { + "epoch": 5.281154082259055, + "grad_norm": 0.20145639777183533, + "learning_rate": 4.788317436987259e-05, + "loss": 1.7068, + "step": 17206 + }, + { + "epoch": 5.28146101903008, + "grad_norm": 0.23146072030067444, + "learning_rate": 4.7878208277045775e-05, + "loss": 1.7195, + "step": 17207 + }, + { + "epoch": 5.281767955801105, + 
"grad_norm": 0.24014149606227875, + "learning_rate": 4.787324220518776e-05, + "loss": 1.8148, + "step": 17208 + }, + { + "epoch": 5.28207489257213, + "grad_norm": 0.21067874133586884, + "learning_rate": 4.7868276154347595e-05, + "loss": 1.7754, + "step": 17209 + }, + { + "epoch": 5.282381829343155, + "grad_norm": 0.2313496321439743, + "learning_rate": 4.786331012457441e-05, + "loss": 1.7693, + "step": 17210 + }, + { + "epoch": 5.28268876611418, + "grad_norm": 0.24190983176231384, + "learning_rate": 4.7858344115917214e-05, + "loss": 1.7342, + "step": 17211 + }, + { + "epoch": 5.282995702885206, + "grad_norm": 0.24541905522346497, + "learning_rate": 4.785337812842514e-05, + "loss": 1.7721, + "step": 17212 + }, + { + "epoch": 5.283302639656231, + "grad_norm": 0.21989032626152039, + "learning_rate": 4.784841216214722e-05, + "loss": 1.7522, + "step": 17213 + }, + { + "epoch": 5.283609576427256, + "grad_norm": 0.20637241005897522, + "learning_rate": 4.784344621713256e-05, + "loss": 1.7418, + "step": 17214 + }, + { + "epoch": 5.283916513198281, + "grad_norm": 0.22538220882415771, + "learning_rate": 4.783848029343023e-05, + "loss": 1.8287, + "step": 17215 + }, + { + "epoch": 5.284223449969306, + "grad_norm": 0.24478071928024292, + "learning_rate": 4.7833514391089315e-05, + "loss": 1.7419, + "step": 17216 + }, + { + "epoch": 5.2845303867403315, + "grad_norm": 0.22707650065422058, + "learning_rate": 4.782854851015886e-05, + "loss": 1.7831, + "step": 17217 + }, + { + "epoch": 5.284837323511357, + "grad_norm": 0.2843529284000397, + "learning_rate": 4.7823582650687984e-05, + "loss": 1.7704, + "step": 17218 + }, + { + "epoch": 5.285144260282382, + "grad_norm": 0.21647678315639496, + "learning_rate": 4.781861681272573e-05, + "loss": 1.7514, + "step": 17219 + }, + { + "epoch": 5.285451197053407, + "grad_norm": 0.2279205620288849, + "learning_rate": 4.781365099632117e-05, + "loss": 1.6803, + "step": 17220 + }, + { + "epoch": 5.285758133824432, + "grad_norm": 0.2287401556968689, + 
"learning_rate": 4.7808685201523417e-05, + "loss": 1.7278, + "step": 17221 + }, + { + "epoch": 5.286065070595457, + "grad_norm": 0.2103174477815628, + "learning_rate": 4.78037194283815e-05, + "loss": 1.7667, + "step": 17222 + }, + { + "epoch": 5.286372007366483, + "grad_norm": 0.24339279532432556, + "learning_rate": 4.7798753676944536e-05, + "loss": 1.7828, + "step": 17223 + }, + { + "epoch": 5.286678944137508, + "grad_norm": 0.2343035340309143, + "learning_rate": 4.779378794726156e-05, + "loss": 1.7277, + "step": 17224 + }, + { + "epoch": 5.286985880908533, + "grad_norm": 0.22456331551074982, + "learning_rate": 4.778882223938167e-05, + "loss": 1.756, + "step": 17225 + }, + { + "epoch": 5.287292817679558, + "grad_norm": 0.2211158126592636, + "learning_rate": 4.778385655335392e-05, + "loss": 1.7733, + "step": 17226 + }, + { + "epoch": 5.287599754450583, + "grad_norm": 0.2731948792934418, + "learning_rate": 4.777889088922743e-05, + "loss": 1.787, + "step": 17227 + }, + { + "epoch": 5.287906691221608, + "grad_norm": 0.19578024744987488, + "learning_rate": 4.7773925247051215e-05, + "loss": 1.7474, + "step": 17228 + }, + { + "epoch": 5.288213627992634, + "grad_norm": 0.277332067489624, + "learning_rate": 4.77689596268744e-05, + "loss": 1.7432, + "step": 17229 + }, + { + "epoch": 5.288520564763659, + "grad_norm": 0.2979765832424164, + "learning_rate": 4.7763994028746003e-05, + "loss": 1.8198, + "step": 17230 + }, + { + "epoch": 5.2888275015346835, + "grad_norm": 0.23176288604736328, + "learning_rate": 4.775902845271515e-05, + "loss": 1.7317, + "step": 17231 + }, + { + "epoch": 5.289134438305709, + "grad_norm": 0.35821911692619324, + "learning_rate": 4.7754062898830876e-05, + "loss": 1.7287, + "step": 17232 + }, + { + "epoch": 5.289441375076734, + "grad_norm": 0.2881525158882141, + "learning_rate": 4.7749097367142296e-05, + "loss": 1.7391, + "step": 17233 + }, + { + "epoch": 5.2897483118477595, + "grad_norm": 0.22021767497062683, + "learning_rate": 4.774413185769842e-05, 
+ "loss": 1.7462, + "step": 17234 + }, + { + "epoch": 5.290055248618785, + "grad_norm": 0.3286842703819275, + "learning_rate": 4.7739166370548385e-05, + "loss": 1.7749, + "step": 17235 + }, + { + "epoch": 5.290362185389809, + "grad_norm": 0.3298519253730774, + "learning_rate": 4.773420090574122e-05, + "loss": 1.7548, + "step": 17236 + }, + { + "epoch": 5.290669122160835, + "grad_norm": 0.20910575985908508, + "learning_rate": 4.7729235463326005e-05, + "loss": 1.7308, + "step": 17237 + }, + { + "epoch": 5.29097605893186, + "grad_norm": 0.3324633240699768, + "learning_rate": 4.7724270043351835e-05, + "loss": 1.7328, + "step": 17238 + }, + { + "epoch": 5.291282995702885, + "grad_norm": 0.21235628426074982, + "learning_rate": 4.771930464586774e-05, + "loss": 1.7186, + "step": 17239 + }, + { + "epoch": 5.291589932473911, + "grad_norm": 0.2971087694168091, + "learning_rate": 4.771433927092283e-05, + "loss": 1.7947, + "step": 17240 + }, + { + "epoch": 5.291896869244935, + "grad_norm": 0.3637695908546448, + "learning_rate": 4.770937391856614e-05, + "loss": 1.7753, + "step": 17241 + }, + { + "epoch": 5.29220380601596, + "grad_norm": 0.2503713369369507, + "learning_rate": 4.770440858884678e-05, + "loss": 1.684, + "step": 17242 + }, + { + "epoch": 5.292510742786986, + "grad_norm": 0.25510790944099426, + "learning_rate": 4.7699443281813774e-05, + "loss": 1.7517, + "step": 17243 + }, + { + "epoch": 5.292817679558011, + "grad_norm": 0.3189590871334076, + "learning_rate": 4.7694477997516244e-05, + "loss": 1.7488, + "step": 17244 + }, + { + "epoch": 5.293124616329036, + "grad_norm": 0.2807229161262512, + "learning_rate": 4.7689512736003215e-05, + "loss": 1.7962, + "step": 17245 + }, + { + "epoch": 5.293431553100062, + "grad_norm": 0.2166406810283661, + "learning_rate": 4.76845474973238e-05, + "loss": 1.7423, + "step": 17246 + }, + { + "epoch": 5.293738489871086, + "grad_norm": 0.29000815749168396, + "learning_rate": 4.767958228152702e-05, + "loss": 1.7508, + "step": 17247 + }, + { 
+ "epoch": 5.2940454266421115, + "grad_norm": 0.19301612675189972, + "learning_rate": 4.767461708866198e-05, + "loss": 1.7223, + "step": 17248 + }, + { + "epoch": 5.294352363413137, + "grad_norm": 0.2828899323940277, + "learning_rate": 4.766965191877772e-05, + "loss": 1.8139, + "step": 17249 + }, + { + "epoch": 5.294659300184162, + "grad_norm": 0.32610374689102173, + "learning_rate": 4.766468677192335e-05, + "loss": 1.7744, + "step": 17250 + }, + { + "epoch": 5.2949662369551875, + "grad_norm": 0.2175719439983368, + "learning_rate": 4.7659721648147895e-05, + "loss": 1.7345, + "step": 17251 + }, + { + "epoch": 5.295273173726212, + "grad_norm": 0.24777816236019135, + "learning_rate": 4.7654756547500457e-05, + "loss": 1.7382, + "step": 17252 + }, + { + "epoch": 5.295580110497237, + "grad_norm": 0.25927749276161194, + "learning_rate": 4.764979147003008e-05, + "loss": 1.7625, + "step": 17253 + }, + { + "epoch": 5.295887047268263, + "grad_norm": 0.2271798849105835, + "learning_rate": 4.7644826415785834e-05, + "loss": 1.6928, + "step": 17254 + }, + { + "epoch": 5.296193984039288, + "grad_norm": 0.30804958939552307, + "learning_rate": 4.763986138481682e-05, + "loss": 1.743, + "step": 17255 + }, + { + "epoch": 5.296500920810313, + "grad_norm": 0.2247130572795868, + "learning_rate": 4.763489637717205e-05, + "loss": 1.7593, + "step": 17256 + }, + { + "epoch": 5.296807857581339, + "grad_norm": 0.22203052043914795, + "learning_rate": 4.7629931392900645e-05, + "loss": 1.6923, + "step": 17257 + }, + { + "epoch": 5.297114794352363, + "grad_norm": 0.23044714331626892, + "learning_rate": 4.7624966432051624e-05, + "loss": 1.7676, + "step": 17258 + }, + { + "epoch": 5.297421731123388, + "grad_norm": 0.2824070155620575, + "learning_rate": 4.7620001494674096e-05, + "loss": 1.8272, + "step": 17259 + }, + { + "epoch": 5.297728667894414, + "grad_norm": 0.27077800035476685, + "learning_rate": 4.761503658081709e-05, + "loss": 1.8106, + "step": 17260 + }, + { + "epoch": 5.298035604665439, + 
"grad_norm": 0.2333833873271942, + "learning_rate": 4.7610071690529706e-05, + "loss": 1.6841, + "step": 17261 + }, + { + "epoch": 5.298342541436464, + "grad_norm": 0.2542032301425934, + "learning_rate": 4.760510682386098e-05, + "loss": 1.7656, + "step": 17262 + }, + { + "epoch": 5.298649478207489, + "grad_norm": 0.30680081248283386, + "learning_rate": 4.760014198086002e-05, + "loss": 1.7443, + "step": 17263 + }, + { + "epoch": 5.298956414978514, + "grad_norm": 0.21580225229263306, + "learning_rate": 4.759517716157583e-05, + "loss": 1.7907, + "step": 17264 + }, + { + "epoch": 5.2992633517495396, + "grad_norm": 0.2644323408603668, + "learning_rate": 4.7590212366057516e-05, + "loss": 1.6835, + "step": 17265 + }, + { + "epoch": 5.299570288520565, + "grad_norm": 0.23600110411643982, + "learning_rate": 4.758524759435414e-05, + "loss": 1.7481, + "step": 17266 + }, + { + "epoch": 5.29987722529159, + "grad_norm": 0.23825959861278534, + "learning_rate": 4.758028284651477e-05, + "loss": 1.7267, + "step": 17267 + }, + { + "epoch": 5.300184162062616, + "grad_norm": 0.2659476101398468, + "learning_rate": 4.757531812258845e-05, + "loss": 1.7303, + "step": 17268 + }, + { + "epoch": 5.30049109883364, + "grad_norm": 0.30770114064216614, + "learning_rate": 4.757035342262428e-05, + "loss": 1.7636, + "step": 17269 + }, + { + "epoch": 5.300798035604665, + "grad_norm": 0.27921241521835327, + "learning_rate": 4.756538874667129e-05, + "loss": 1.7736, + "step": 17270 + }, + { + "epoch": 5.301104972375691, + "grad_norm": 0.2518016993999481, + "learning_rate": 4.756042409477855e-05, + "loss": 1.7942, + "step": 17271 + }, + { + "epoch": 5.301411909146716, + "grad_norm": 0.2678029537200928, + "learning_rate": 4.755545946699514e-05, + "loss": 1.7179, + "step": 17272 + }, + { + "epoch": 5.301718845917741, + "grad_norm": 0.3082284927368164, + "learning_rate": 4.7550494863370094e-05, + "loss": 1.7282, + "step": 17273 + }, + { + "epoch": 5.302025782688766, + "grad_norm": 0.23269952833652496, + 
"learning_rate": 4.754553028395251e-05, + "loss": 1.755, + "step": 17274 + }, + { + "epoch": 5.302332719459791, + "grad_norm": 0.2273751199245453, + "learning_rate": 4.754056572879142e-05, + "loss": 1.7661, + "step": 17275 + }, + { + "epoch": 5.3026396562308165, + "grad_norm": 0.2175082415342331, + "learning_rate": 4.7535601197935915e-05, + "loss": 1.7034, + "step": 17276 + }, + { + "epoch": 5.302946593001842, + "grad_norm": 0.20551301538944244, + "learning_rate": 4.753063669143503e-05, + "loss": 1.7329, + "step": 17277 + }, + { + "epoch": 5.303253529772867, + "grad_norm": 0.2350638061761856, + "learning_rate": 4.752567220933785e-05, + "loss": 1.8361, + "step": 17278 + }, + { + "epoch": 5.303560466543892, + "grad_norm": 0.20268140733242035, + "learning_rate": 4.752070775169342e-05, + "loss": 1.6736, + "step": 17279 + }, + { + "epoch": 5.303867403314917, + "grad_norm": 0.1891544908285141, + "learning_rate": 4.7515743318550823e-05, + "loss": 1.7241, + "step": 17280 + }, + { + "epoch": 5.304174340085942, + "grad_norm": 0.22900860011577606, + "learning_rate": 4.751077890995909e-05, + "loss": 1.7321, + "step": 17281 + }, + { + "epoch": 5.304481276856968, + "grad_norm": 0.25827866792678833, + "learning_rate": 4.7505814525967304e-05, + "loss": 1.8021, + "step": 17282 + }, + { + "epoch": 5.304788213627993, + "grad_norm": 0.22459273040294647, + "learning_rate": 4.7500850166624514e-05, + "loss": 1.7845, + "step": 17283 + }, + { + "epoch": 5.305095150399017, + "grad_norm": 0.23737964034080505, + "learning_rate": 4.7495885831979816e-05, + "loss": 1.7274, + "step": 17284 + }, + { + "epoch": 5.305402087170043, + "grad_norm": 0.2267502397298813, + "learning_rate": 4.749092152208221e-05, + "loss": 1.7747, + "step": 17285 + }, + { + "epoch": 5.305709023941068, + "grad_norm": 0.31811007857322693, + "learning_rate": 4.748595723698081e-05, + "loss": 1.7852, + "step": 17286 + }, + { + "epoch": 5.306015960712093, + "grad_norm": 0.42865583300590515, + "learning_rate": 
4.7480992976724655e-05, + "loss": 1.7711, + "step": 17287 + }, + { + "epoch": 5.306322897483119, + "grad_norm": 0.3211027979850769, + "learning_rate": 4.747602874136278e-05, + "loss": 1.7813, + "step": 17288 + }, + { + "epoch": 5.306629834254144, + "grad_norm": 0.22552837431430817, + "learning_rate": 4.7471064530944295e-05, + "loss": 1.7407, + "step": 17289 + }, + { + "epoch": 5.3069367710251685, + "grad_norm": 0.3119906485080719, + "learning_rate": 4.746610034551821e-05, + "loss": 1.7255, + "step": 17290 + }, + { + "epoch": 5.307243707796194, + "grad_norm": 0.26405754685401917, + "learning_rate": 4.7461136185133623e-05, + "loss": 1.6945, + "step": 17291 + }, + { + "epoch": 5.307550644567219, + "grad_norm": 0.21759621798992157, + "learning_rate": 4.7456172049839566e-05, + "loss": 1.7319, + "step": 17292 + }, + { + "epoch": 5.3078575813382445, + "grad_norm": 0.26193925738334656, + "learning_rate": 4.745120793968511e-05, + "loss": 1.7508, + "step": 17293 + }, + { + "epoch": 5.30816451810927, + "grad_norm": 0.2549780011177063, + "learning_rate": 4.74462438547193e-05, + "loss": 1.7153, + "step": 17294 + }, + { + "epoch": 5.308471454880294, + "grad_norm": 0.21164020895957947, + "learning_rate": 4.7441279794991235e-05, + "loss": 1.7315, + "step": 17295 + }, + { + "epoch": 5.30877839165132, + "grad_norm": 0.20548345148563385, + "learning_rate": 4.7436315760549914e-05, + "loss": 1.68, + "step": 17296 + }, + { + "epoch": 5.309085328422345, + "grad_norm": 0.23997166752815247, + "learning_rate": 4.7431351751444446e-05, + "loss": 1.8528, + "step": 17297 + }, + { + "epoch": 5.30939226519337, + "grad_norm": 0.2639109194278717, + "learning_rate": 4.7426387767723845e-05, + "loss": 1.8041, + "step": 17298 + }, + { + "epoch": 5.309699201964396, + "grad_norm": 0.2285986840724945, + "learning_rate": 4.7421423809437196e-05, + "loss": 1.8188, + "step": 17299 + }, + { + "epoch": 5.310006138735421, + "grad_norm": 0.22183369100093842, + "learning_rate": 4.741645987663355e-05, + "loss": 
1.7581, + "step": 17300 + }, + { + "epoch": 5.310313075506445, + "grad_norm": 0.22716040909290314, + "learning_rate": 4.741149596936197e-05, + "loss": 1.7438, + "step": 17301 + }, + { + "epoch": 5.310620012277471, + "grad_norm": 0.24641327559947968, + "learning_rate": 4.740653208767148e-05, + "loss": 1.761, + "step": 17302 + }, + { + "epoch": 5.310926949048496, + "grad_norm": 0.28470689058303833, + "learning_rate": 4.7401568231611194e-05, + "loss": 1.7512, + "step": 17303 + }, + { + "epoch": 5.311233885819521, + "grad_norm": 0.23279942572116852, + "learning_rate": 4.739660440123012e-05, + "loss": 1.7797, + "step": 17304 + }, + { + "epoch": 5.311540822590547, + "grad_norm": 0.26397696137428284, + "learning_rate": 4.739164059657731e-05, + "loss": 1.748, + "step": 17305 + }, + { + "epoch": 5.311847759361571, + "grad_norm": 0.25072020292282104, + "learning_rate": 4.7386676817701856e-05, + "loss": 1.7571, + "step": 17306 + }, + { + "epoch": 5.3121546961325965, + "grad_norm": 0.20815810561180115, + "learning_rate": 4.7381713064652774e-05, + "loss": 1.7566, + "step": 17307 + }, + { + "epoch": 5.312461632903622, + "grad_norm": 0.23104289174079895, + "learning_rate": 4.7376749337479174e-05, + "loss": 1.7308, + "step": 17308 + }, + { + "epoch": 5.312768569674647, + "grad_norm": 0.21978867053985596, + "learning_rate": 4.737178563623004e-05, + "loss": 1.7997, + "step": 17309 + }, + { + "epoch": 5.3130755064456725, + "grad_norm": 0.34588614106178284, + "learning_rate": 4.736682196095447e-05, + "loss": 1.8414, + "step": 17310 + }, + { + "epoch": 5.313382443216697, + "grad_norm": 0.3475342094898224, + "learning_rate": 4.73618583117015e-05, + "loss": 1.7823, + "step": 17311 + }, + { + "epoch": 5.313689379987722, + "grad_norm": 0.1965305358171463, + "learning_rate": 4.7356894688520215e-05, + "loss": 1.7597, + "step": 17312 + }, + { + "epoch": 5.313996316758748, + "grad_norm": 0.3035048246383667, + "learning_rate": 4.7351931091459624e-05, + "loss": 1.6803, + "step": 17313 + }, + { + 
"epoch": 5.314303253529773, + "grad_norm": 0.27722910046577454, + "learning_rate": 4.7346967520568827e-05, + "loss": 1.7472, + "step": 17314 + }, + { + "epoch": 5.314610190300798, + "grad_norm": 0.21481415629386902, + "learning_rate": 4.734200397589682e-05, + "loss": 1.7319, + "step": 17315 + }, + { + "epoch": 5.314917127071823, + "grad_norm": 0.2570357918739319, + "learning_rate": 4.733704045749271e-05, + "loss": 1.7392, + "step": 17316 + }, + { + "epoch": 5.315224063842848, + "grad_norm": 0.2404400259256363, + "learning_rate": 4.733207696540551e-05, + "loss": 1.7231, + "step": 17317 + }, + { + "epoch": 5.315531000613873, + "grad_norm": 0.222911074757576, + "learning_rate": 4.732711349968432e-05, + "loss": 1.7584, + "step": 17318 + }, + { + "epoch": 5.315837937384899, + "grad_norm": 0.22908064723014832, + "learning_rate": 4.732215006037813e-05, + "loss": 1.7242, + "step": 17319 + }, + { + "epoch": 5.316144874155924, + "grad_norm": 0.2432398796081543, + "learning_rate": 4.7317186647536044e-05, + "loss": 1.7056, + "step": 17320 + }, + { + "epoch": 5.316451810926949, + "grad_norm": 0.1994420737028122, + "learning_rate": 4.7312223261207086e-05, + "loss": 1.6667, + "step": 17321 + }, + { + "epoch": 5.316758747697974, + "grad_norm": 0.22314350306987762, + "learning_rate": 4.73072599014403e-05, + "loss": 1.7945, + "step": 17322 + }, + { + "epoch": 5.317065684468999, + "grad_norm": 0.2309068888425827, + "learning_rate": 4.730229656828477e-05, + "loss": 1.7099, + "step": 17323 + }, + { + "epoch": 5.3173726212400245, + "grad_norm": 0.22388015687465668, + "learning_rate": 4.729733326178951e-05, + "loss": 1.7053, + "step": 17324 + }, + { + "epoch": 5.31767955801105, + "grad_norm": 0.20203040540218353, + "learning_rate": 4.72923699820036e-05, + "loss": 1.6992, + "step": 17325 + }, + { + "epoch": 5.317986494782075, + "grad_norm": 0.24416297674179077, + "learning_rate": 4.728740672897606e-05, + "loss": 1.7455, + "step": 17326 + }, + { + "epoch": 5.3182934315531, + "grad_norm": 
0.2501862049102783, + "learning_rate": 4.728244350275597e-05, + "loss": 1.7609, + "step": 17327 + }, + { + "epoch": 5.318600368324125, + "grad_norm": 0.21482665836811066, + "learning_rate": 4.727748030339235e-05, + "loss": 1.7614, + "step": 17328 + }, + { + "epoch": 5.31890730509515, + "grad_norm": 0.2241419404745102, + "learning_rate": 4.727251713093429e-05, + "loss": 1.736, + "step": 17329 + }, + { + "epoch": 5.319214241866176, + "grad_norm": 0.1757260262966156, + "learning_rate": 4.726755398543079e-05, + "loss": 1.6646, + "step": 17330 + }, + { + "epoch": 5.319521178637201, + "grad_norm": 0.18697243928909302, + "learning_rate": 4.726259086693095e-05, + "loss": 1.7512, + "step": 17331 + }, + { + "epoch": 5.319828115408226, + "grad_norm": 0.22584228217601776, + "learning_rate": 4.725762777548376e-05, + "loss": 1.7439, + "step": 17332 + }, + { + "epoch": 5.320135052179251, + "grad_norm": 0.18673470616340637, + "learning_rate": 4.725266471113832e-05, + "loss": 1.7007, + "step": 17333 + }, + { + "epoch": 5.320441988950276, + "grad_norm": 0.23030288517475128, + "learning_rate": 4.7247701673943656e-05, + "loss": 1.8021, + "step": 17334 + }, + { + "epoch": 5.320748925721301, + "grad_norm": 0.19333480298519135, + "learning_rate": 4.7242738663948813e-05, + "loss": 1.6659, + "step": 17335 + }, + { + "epoch": 5.321055862492327, + "grad_norm": 0.278097003698349, + "learning_rate": 4.723777568120284e-05, + "loss": 1.7302, + "step": 17336 + }, + { + "epoch": 5.321362799263352, + "grad_norm": 0.2146742343902588, + "learning_rate": 4.72328127257548e-05, + "loss": 1.7644, + "step": 17337 + }, + { + "epoch": 5.3216697360343765, + "grad_norm": 0.25582969188690186, + "learning_rate": 4.722784979765372e-05, + "loss": 1.7872, + "step": 17338 + }, + { + "epoch": 5.321976672805402, + "grad_norm": 0.20411577820777893, + "learning_rate": 4.722288689694864e-05, + "loss": 1.7167, + "step": 17339 + }, + { + "epoch": 5.322283609576427, + "grad_norm": 0.20894703269004822, + "learning_rate": 
4.7217924023688645e-05, + "loss": 1.7526, + "step": 17340 + }, + { + "epoch": 5.3225905463474525, + "grad_norm": 0.20197831094264984, + "learning_rate": 4.721296117792273e-05, + "loss": 1.711, + "step": 17341 + }, + { + "epoch": 5.322897483118478, + "grad_norm": 0.20490549504756927, + "learning_rate": 4.720799835969999e-05, + "loss": 1.7303, + "step": 17342 + }, + { + "epoch": 5.323204419889503, + "grad_norm": 0.20666229724884033, + "learning_rate": 4.720303556906943e-05, + "loss": 1.6738, + "step": 17343 + }, + { + "epoch": 5.323511356660528, + "grad_norm": 0.21899856626987457, + "learning_rate": 4.719807280608011e-05, + "loss": 1.7632, + "step": 17344 + }, + { + "epoch": 5.323818293431553, + "grad_norm": 0.2310410887002945, + "learning_rate": 4.719311007078108e-05, + "loss": 1.7568, + "step": 17345 + }, + { + "epoch": 5.324125230202578, + "grad_norm": 0.20057427883148193, + "learning_rate": 4.7188147363221394e-05, + "loss": 1.6716, + "step": 17346 + }, + { + "epoch": 5.324432166973604, + "grad_norm": 0.21361050009727478, + "learning_rate": 4.718318468345006e-05, + "loss": 1.7224, + "step": 17347 + }, + { + "epoch": 5.324739103744629, + "grad_norm": 0.28389376401901245, + "learning_rate": 4.7178222031516173e-05, + "loss": 1.8519, + "step": 17348 + }, + { + "epoch": 5.3250460405156534, + "grad_norm": 0.2094416618347168, + "learning_rate": 4.717325940746872e-05, + "loss": 1.7763, + "step": 17349 + }, + { + "epoch": 5.325352977286679, + "grad_norm": 0.2263312190771103, + "learning_rate": 4.716829681135681e-05, + "loss": 1.7961, + "step": 17350 + }, + { + "epoch": 5.325659914057704, + "grad_norm": 0.2685631811618805, + "learning_rate": 4.7163334243229417e-05, + "loss": 1.7763, + "step": 17351 + }, + { + "epoch": 5.3259668508287294, + "grad_norm": 0.2029418647289276, + "learning_rate": 4.7158371703135636e-05, + "loss": 1.7662, + "step": 17352 + }, + { + "epoch": 5.326273787599755, + "grad_norm": 0.3109094798564911, + "learning_rate": 4.715340919112447e-05, + "loss": 
1.7064, + "step": 17353 + }, + { + "epoch": 5.326580724370779, + "grad_norm": 0.24679912626743317, + "learning_rate": 4.714844670724502e-05, + "loss": 1.6903, + "step": 17354 + }, + { + "epoch": 5.326887661141805, + "grad_norm": 0.2004890739917755, + "learning_rate": 4.714348425154627e-05, + "loss": 1.7242, + "step": 17355 + }, + { + "epoch": 5.32719459791283, + "grad_norm": 0.27442196011543274, + "learning_rate": 4.7138521824077284e-05, + "loss": 1.826, + "step": 17356 + }, + { + "epoch": 5.327501534683855, + "grad_norm": 0.19933666288852692, + "learning_rate": 4.713355942488711e-05, + "loss": 1.748, + "step": 17357 + }, + { + "epoch": 5.327808471454881, + "grad_norm": 0.2306378185749054, + "learning_rate": 4.712859705402476e-05, + "loss": 1.7426, + "step": 17358 + }, + { + "epoch": 5.328115408225905, + "grad_norm": 0.22484014928340912, + "learning_rate": 4.7123634711539324e-05, + "loss": 1.7355, + "step": 17359 + }, + { + "epoch": 5.32842234499693, + "grad_norm": 0.2501749098300934, + "learning_rate": 4.711867239747979e-05, + "loss": 1.7502, + "step": 17360 + }, + { + "epoch": 5.328729281767956, + "grad_norm": 0.1940663903951645, + "learning_rate": 4.711371011189525e-05, + "loss": 1.7423, + "step": 17361 + }, + { + "epoch": 5.329036218538981, + "grad_norm": 0.28115448355674744, + "learning_rate": 4.71087478548347e-05, + "loss": 1.7134, + "step": 17362 + }, + { + "epoch": 5.329343155310006, + "grad_norm": 0.29717928171157837, + "learning_rate": 4.71037856263472e-05, + "loss": 1.8145, + "step": 17363 + }, + { + "epoch": 5.329650092081032, + "grad_norm": 0.24278375506401062, + "learning_rate": 4.709882342648179e-05, + "loss": 1.689, + "step": 17364 + }, + { + "epoch": 5.329957028852056, + "grad_norm": 0.26382890343666077, + "learning_rate": 4.709386125528751e-05, + "loss": 1.801, + "step": 17365 + }, + { + "epoch": 5.3302639656230815, + "grad_norm": 0.237087219953537, + "learning_rate": 4.708889911281339e-05, + "loss": 1.7019, + "step": 17366 + }, + { + "epoch": 
5.330570902394107, + "grad_norm": 0.21994253993034363, + "learning_rate": 4.7083936999108494e-05, + "loss": 1.707, + "step": 17367 + }, + { + "epoch": 5.330877839165132, + "grad_norm": 0.3028903901576996, + "learning_rate": 4.707897491422182e-05, + "loss": 1.7992, + "step": 17368 + }, + { + "epoch": 5.3311847759361575, + "grad_norm": 0.24991434812545776, + "learning_rate": 4.7074012858202435e-05, + "loss": 1.7894, + "step": 17369 + }, + { + "epoch": 5.331491712707182, + "grad_norm": 0.20631250739097595, + "learning_rate": 4.706905083109936e-05, + "loss": 1.6816, + "step": 17370 + }, + { + "epoch": 5.331798649478207, + "grad_norm": 0.23300573229789734, + "learning_rate": 4.7064088832961666e-05, + "loss": 1.7101, + "step": 17371 + }, + { + "epoch": 5.332105586249233, + "grad_norm": 0.22331316769123077, + "learning_rate": 4.705912686383837e-05, + "loss": 1.861, + "step": 17372 + }, + { + "epoch": 5.332412523020258, + "grad_norm": 0.204593226313591, + "learning_rate": 4.7054164923778485e-05, + "loss": 1.7062, + "step": 17373 + }, + { + "epoch": 5.332719459791283, + "grad_norm": 0.22207681834697723, + "learning_rate": 4.704920301283107e-05, + "loss": 1.7546, + "step": 17374 + }, + { + "epoch": 5.333026396562309, + "grad_norm": 0.2508530020713806, + "learning_rate": 4.7044241131045157e-05, + "loss": 1.7881, + "step": 17375 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.26084616780281067, + "learning_rate": 4.7039279278469804e-05, + "loss": 1.7292, + "step": 17376 + }, + { + "epoch": 5.333640270104358, + "grad_norm": 0.2122940719127655, + "learning_rate": 4.7034317455154006e-05, + "loss": 1.7493, + "step": 17377 + }, + { + "epoch": 5.333947206875384, + "grad_norm": 0.2627449333667755, + "learning_rate": 4.702935566114685e-05, + "loss": 1.759, + "step": 17378 + }, + { + "epoch": 5.334254143646409, + "grad_norm": 0.20637977123260498, + "learning_rate": 4.702439389649732e-05, + "loss": 1.8043, + "step": 17379 + }, + { + "epoch": 5.334561080417434, + "grad_norm": 
0.28783395886421204, + "learning_rate": 4.701943216125447e-05, + "loss": 1.7256, + "step": 17380 + }, + { + "epoch": 5.334868017188459, + "grad_norm": 0.21130618453025818, + "learning_rate": 4.701447045546734e-05, + "loss": 1.7161, + "step": 17381 + }, + { + "epoch": 5.335174953959484, + "grad_norm": 0.2793416678905487, + "learning_rate": 4.7009508779184984e-05, + "loss": 1.7659, + "step": 17382 + }, + { + "epoch": 5.3354818907305095, + "grad_norm": 0.3088020384311676, + "learning_rate": 4.700454713245639e-05, + "loss": 1.6877, + "step": 17383 + }, + { + "epoch": 5.335788827501535, + "grad_norm": 0.19697681069374084, + "learning_rate": 4.6999585515330646e-05, + "loss": 1.7111, + "step": 17384 + }, + { + "epoch": 5.33609576427256, + "grad_norm": 0.29234182834625244, + "learning_rate": 4.699462392785673e-05, + "loss": 1.7136, + "step": 17385 + }, + { + "epoch": 5.336402701043585, + "grad_norm": 0.2593611776828766, + "learning_rate": 4.698966237008371e-05, + "loss": 1.7531, + "step": 17386 + }, + { + "epoch": 5.33670963781461, + "grad_norm": 0.20024444162845612, + "learning_rate": 4.6984700842060604e-05, + "loss": 1.7035, + "step": 17387 + }, + { + "epoch": 5.337016574585635, + "grad_norm": 0.2929787039756775, + "learning_rate": 4.697973934383647e-05, + "loss": 1.7212, + "step": 17388 + }, + { + "epoch": 5.337323511356661, + "grad_norm": 0.2425665408372879, + "learning_rate": 4.697477787546032e-05, + "loss": 1.7191, + "step": 17389 + }, + { + "epoch": 5.337630448127686, + "grad_norm": 0.19175556302070618, + "learning_rate": 4.6969816436981176e-05, + "loss": 1.7291, + "step": 17390 + }, + { + "epoch": 5.337937384898711, + "grad_norm": 0.2602384686470032, + "learning_rate": 4.696485502844809e-05, + "loss": 1.7035, + "step": 17391 + }, + { + "epoch": 5.338244321669736, + "grad_norm": 0.19117408990859985, + "learning_rate": 4.695989364991006e-05, + "loss": 1.707, + "step": 17392 + }, + { + "epoch": 5.338551258440761, + "grad_norm": 0.31086108088493347, + "learning_rate": 
4.6954932301416174e-05, + "loss": 1.7397, + "step": 17393 + }, + { + "epoch": 5.338858195211786, + "grad_norm": 0.27402472496032715, + "learning_rate": 4.694997098301542e-05, + "loss": 1.7144, + "step": 17394 + }, + { + "epoch": 5.339165131982812, + "grad_norm": 0.20345155894756317, + "learning_rate": 4.694500969475685e-05, + "loss": 1.7492, + "step": 17395 + }, + { + "epoch": 5.339472068753837, + "grad_norm": 0.23786045610904694, + "learning_rate": 4.694004843668947e-05, + "loss": 1.7781, + "step": 17396 + }, + { + "epoch": 5.3397790055248615, + "grad_norm": 0.19747424125671387, + "learning_rate": 4.6935087208862335e-05, + "loss": 1.7353, + "step": 17397 + }, + { + "epoch": 5.340085942295887, + "grad_norm": 0.224543035030365, + "learning_rate": 4.693012601132445e-05, + "loss": 1.7229, + "step": 17398 + }, + { + "epoch": 5.340392879066912, + "grad_norm": 0.20840135216712952, + "learning_rate": 4.692516484412488e-05, + "loss": 1.7557, + "step": 17399 + }, + { + "epoch": 5.3406998158379375, + "grad_norm": 0.21019098162651062, + "learning_rate": 4.692020370731261e-05, + "loss": 1.7793, + "step": 17400 + }, + { + "epoch": 5.341006752608963, + "grad_norm": 0.20540091395378113, + "learning_rate": 4.691524260093672e-05, + "loss": 1.6925, + "step": 17401 + }, + { + "epoch": 5.341313689379987, + "grad_norm": 0.2414131462574005, + "learning_rate": 4.691028152504619e-05, + "loss": 1.7706, + "step": 17402 + }, + { + "epoch": 5.341620626151013, + "grad_norm": 0.19627155363559723, + "learning_rate": 4.6905320479690073e-05, + "loss": 1.6356, + "step": 17403 + }, + { + "epoch": 5.341927562922038, + "grad_norm": 0.20978952944278717, + "learning_rate": 4.690035946491741e-05, + "loss": 1.7487, + "step": 17404 + }, + { + "epoch": 5.342234499693063, + "grad_norm": 0.2524566054344177, + "learning_rate": 4.689539848077719e-05, + "loss": 1.7713, + "step": 17405 + }, + { + "epoch": 5.342541436464089, + "grad_norm": 0.1967654973268509, + "learning_rate": 4.689043752731847e-05, + "loss": 
1.7358, + "step": 17406 + }, + { + "epoch": 5.342848373235114, + "grad_norm": 0.2085377424955368, + "learning_rate": 4.688547660459026e-05, + "loss": 1.7104, + "step": 17407 + }, + { + "epoch": 5.343155310006138, + "grad_norm": 0.21294310688972473, + "learning_rate": 4.688051571264161e-05, + "loss": 1.7349, + "step": 17408 + }, + { + "epoch": 5.343462246777164, + "grad_norm": 0.23702891170978546, + "learning_rate": 4.6875554851521514e-05, + "loss": 1.8048, + "step": 17409 + }, + { + "epoch": 5.343769183548189, + "grad_norm": 0.2513964772224426, + "learning_rate": 4.687059402127904e-05, + "loss": 1.6669, + "step": 17410 + }, + { + "epoch": 5.344076120319214, + "grad_norm": 0.259540855884552, + "learning_rate": 4.6865633221963165e-05, + "loss": 1.7763, + "step": 17411 + }, + { + "epoch": 5.34438305709024, + "grad_norm": 0.28354617953300476, + "learning_rate": 4.6860672453622966e-05, + "loss": 1.7912, + "step": 17412 + }, + { + "epoch": 5.344689993861264, + "grad_norm": 0.2503860592842102, + "learning_rate": 4.685571171630742e-05, + "loss": 1.6817, + "step": 17413 + }, + { + "epoch": 5.3449969306322895, + "grad_norm": 0.2317555695772171, + "learning_rate": 4.685075101006558e-05, + "loss": 1.7652, + "step": 17414 + }, + { + "epoch": 5.345303867403315, + "grad_norm": 0.23333363234996796, + "learning_rate": 4.684579033494646e-05, + "loss": 1.722, + "step": 17415 + }, + { + "epoch": 5.34561080417434, + "grad_norm": 0.22507359087467194, + "learning_rate": 4.6840829690999104e-05, + "loss": 1.7522, + "step": 17416 + }, + { + "epoch": 5.3459177409453655, + "grad_norm": 0.2298288643360138, + "learning_rate": 4.6835869078272504e-05, + "loss": 1.7425, + "step": 17417 + }, + { + "epoch": 5.346224677716391, + "grad_norm": 0.2829224765300751, + "learning_rate": 4.683090849681572e-05, + "loss": 1.7798, + "step": 17418 + }, + { + "epoch": 5.346531614487415, + "grad_norm": 0.18153807520866394, + "learning_rate": 4.682594794667773e-05, + "loss": 1.6846, + "step": 17419 + }, + { + 
"epoch": 5.346838551258441, + "grad_norm": 0.24153028428554535, + "learning_rate": 4.6820987427907596e-05, + "loss": 1.7474, + "step": 17420 + }, + { + "epoch": 5.347145488029466, + "grad_norm": 0.2529772222042084, + "learning_rate": 4.681602694055434e-05, + "loss": 1.7465, + "step": 17421 + }, + { + "epoch": 5.347452424800491, + "grad_norm": 0.20414131879806519, + "learning_rate": 4.681106648466696e-05, + "loss": 1.7704, + "step": 17422 + }, + { + "epoch": 5.347759361571517, + "grad_norm": 0.27280452847480774, + "learning_rate": 4.68061060602945e-05, + "loss": 1.791, + "step": 17423 + }, + { + "epoch": 5.348066298342541, + "grad_norm": 0.20767468214035034, + "learning_rate": 4.680114566748595e-05, + "loss": 1.7744, + "step": 17424 + }, + { + "epoch": 5.348373235113566, + "grad_norm": 0.2661697566509247, + "learning_rate": 4.679618530629036e-05, + "loss": 1.7999, + "step": 17425 + }, + { + "epoch": 5.348680171884592, + "grad_norm": 0.23666872084140778, + "learning_rate": 4.679122497675674e-05, + "loss": 1.7204, + "step": 17426 + }, + { + "epoch": 5.348987108655617, + "grad_norm": 0.2688015401363373, + "learning_rate": 4.678626467893414e-05, + "loss": 1.7619, + "step": 17427 + }, + { + "epoch": 5.349294045426642, + "grad_norm": 0.23924420773983002, + "learning_rate": 4.678130441287153e-05, + "loss": 1.7754, + "step": 17428 + }, + { + "epoch": 5.349600982197667, + "grad_norm": 0.25724148750305176, + "learning_rate": 4.677634417861798e-05, + "loss": 1.761, + "step": 17429 + }, + { + "epoch": 5.349907918968692, + "grad_norm": 0.2633780241012573, + "learning_rate": 4.6771383976222464e-05, + "loss": 1.8705, + "step": 17430 + }, + { + "epoch": 5.350214855739718, + "grad_norm": 0.24774575233459473, + "learning_rate": 4.6766423805734036e-05, + "loss": 1.7127, + "step": 17431 + }, + { + "epoch": 5.350521792510743, + "grad_norm": 0.29887545108795166, + "learning_rate": 4.6761463667201695e-05, + "loss": 1.7651, + "step": 17432 + }, + { + "epoch": 5.350828729281768, + 
"grad_norm": 0.2231605499982834, + "learning_rate": 4.6756503560674486e-05, + "loss": 1.7636, + "step": 17433 + }, + { + "epoch": 5.351135666052793, + "grad_norm": 0.27977073192596436, + "learning_rate": 4.675154348620139e-05, + "loss": 1.7108, + "step": 17434 + }, + { + "epoch": 5.351442602823818, + "grad_norm": 0.26866039633750916, + "learning_rate": 4.674658344383146e-05, + "loss": 1.7593, + "step": 17435 + }, + { + "epoch": 5.351749539594843, + "grad_norm": 0.2154620885848999, + "learning_rate": 4.6741623433613685e-05, + "loss": 1.7536, + "step": 17436 + }, + { + "epoch": 5.352056476365869, + "grad_norm": 0.276656836271286, + "learning_rate": 4.673666345559711e-05, + "loss": 1.803, + "step": 17437 + }, + { + "epoch": 5.352363413136894, + "grad_norm": 0.22247640788555145, + "learning_rate": 4.6731703509830744e-05, + "loss": 1.7273, + "step": 17438 + }, + { + "epoch": 5.352670349907919, + "grad_norm": 0.2399090677499771, + "learning_rate": 4.6726743596363574e-05, + "loss": 1.7708, + "step": 17439 + }, + { + "epoch": 5.352977286678944, + "grad_norm": 0.2550101578235626, + "learning_rate": 4.6721783715244674e-05, + "loss": 1.7016, + "step": 17440 + }, + { + "epoch": 5.353284223449969, + "grad_norm": 0.19929546117782593, + "learning_rate": 4.6716823866523e-05, + "loss": 1.7417, + "step": 17441 + }, + { + "epoch": 5.3535911602209945, + "grad_norm": 0.2496672421693802, + "learning_rate": 4.671186405024761e-05, + "loss": 1.72, + "step": 17442 + }, + { + "epoch": 5.35389809699202, + "grad_norm": 0.19827665388584137, + "learning_rate": 4.67069042664675e-05, + "loss": 1.7515, + "step": 17443 + }, + { + "epoch": 5.354205033763045, + "grad_norm": 0.2528775930404663, + "learning_rate": 4.670194451523171e-05, + "loss": 1.7429, + "step": 17444 + }, + { + "epoch": 5.35451197053407, + "grad_norm": 0.19569729268550873, + "learning_rate": 4.6696984796589215e-05, + "loss": 1.7314, + "step": 17445 + }, + { + "epoch": 5.354818907305095, + "grad_norm": 0.21892370283603668, + 
"learning_rate": 4.669202511058908e-05, + "loss": 1.7331, + "step": 17446 + }, + { + "epoch": 5.35512584407612, + "grad_norm": 0.21609409153461456, + "learning_rate": 4.668706545728026e-05, + "loss": 1.7267, + "step": 17447 + }, + { + "epoch": 5.355432780847146, + "grad_norm": 0.2631370425224304, + "learning_rate": 4.668210583671182e-05, + "loss": 1.7513, + "step": 17448 + }, + { + "epoch": 5.355739717618171, + "grad_norm": 0.31327441334724426, + "learning_rate": 4.667714624893274e-05, + "loss": 1.7936, + "step": 17449 + }, + { + "epoch": 5.356046654389196, + "grad_norm": 0.21602430939674377, + "learning_rate": 4.667218669399207e-05, + "loss": 1.7387, + "step": 17450 + }, + { + "epoch": 5.356353591160221, + "grad_norm": 0.2895040214061737, + "learning_rate": 4.6667227171938784e-05, + "loss": 1.7293, + "step": 17451 + }, + { + "epoch": 5.356660527931246, + "grad_norm": 0.35150307416915894, + "learning_rate": 4.666226768282193e-05, + "loss": 1.8215, + "step": 17452 + }, + { + "epoch": 5.356967464702271, + "grad_norm": 0.19034281373023987, + "learning_rate": 4.665730822669048e-05, + "loss": 1.702, + "step": 17453 + }, + { + "epoch": 5.357274401473297, + "grad_norm": 0.25586241483688354, + "learning_rate": 4.6652348803593484e-05, + "loss": 1.7809, + "step": 17454 + }, + { + "epoch": 5.357581338244322, + "grad_norm": 0.23919305205345154, + "learning_rate": 4.6647389413579944e-05, + "loss": 1.7555, + "step": 17455 + }, + { + "epoch": 5.3578882750153465, + "grad_norm": 0.22707165777683258, + "learning_rate": 4.664243005669885e-05, + "loss": 1.7633, + "step": 17456 + }, + { + "epoch": 5.358195211786372, + "grad_norm": 0.20666839182376862, + "learning_rate": 4.663747073299925e-05, + "loss": 1.6522, + "step": 17457 + }, + { + "epoch": 5.358502148557397, + "grad_norm": 0.20557542145252228, + "learning_rate": 4.663251144253012e-05, + "loss": 1.73, + "step": 17458 + }, + { + "epoch": 5.3588090853284225, + "grad_norm": 0.22375571727752686, + "learning_rate": 
4.662755218534049e-05, + "loss": 1.7189, + "step": 17459 + }, + { + "epoch": 5.359116022099448, + "grad_norm": 0.261393278837204, + "learning_rate": 4.662259296147936e-05, + "loss": 1.6863, + "step": 17460 + }, + { + "epoch": 5.359422958870473, + "grad_norm": 0.2279379516839981, + "learning_rate": 4.6617633770995764e-05, + "loss": 1.7332, + "step": 17461 + }, + { + "epoch": 5.359729895641498, + "grad_norm": 0.2194606065750122, + "learning_rate": 4.6612674613938666e-05, + "loss": 1.7324, + "step": 17462 + }, + { + "epoch": 5.360036832412523, + "grad_norm": 0.27714410424232483, + "learning_rate": 4.660771549035713e-05, + "loss": 1.7386, + "step": 17463 + }, + { + "epoch": 5.360343769183548, + "grad_norm": 0.2118787169456482, + "learning_rate": 4.660275640030012e-05, + "loss": 1.7587, + "step": 17464 + }, + { + "epoch": 5.360650705954574, + "grad_norm": 0.2546979784965515, + "learning_rate": 4.6597797343816665e-05, + "loss": 1.7756, + "step": 17465 + }, + { + "epoch": 5.360957642725599, + "grad_norm": 0.194237619638443, + "learning_rate": 4.659283832095577e-05, + "loss": 1.7351, + "step": 17466 + }, + { + "epoch": 5.361264579496623, + "grad_norm": 0.23448583483695984, + "learning_rate": 4.658787933176646e-05, + "loss": 1.7051, + "step": 17467 + }, + { + "epoch": 5.361571516267649, + "grad_norm": 0.22796298563480377, + "learning_rate": 4.65829203762977e-05, + "loss": 1.7395, + "step": 17468 + }, + { + "epoch": 5.361878453038674, + "grad_norm": 0.22674904763698578, + "learning_rate": 4.657796145459855e-05, + "loss": 1.714, + "step": 17469 + }, + { + "epoch": 5.362185389809699, + "grad_norm": 0.2697311341762543, + "learning_rate": 4.657300256671797e-05, + "loss": 1.8271, + "step": 17470 + }, + { + "epoch": 5.362492326580725, + "grad_norm": 0.28040480613708496, + "learning_rate": 4.6568043712705004e-05, + "loss": 1.8192, + "step": 17471 + }, + { + "epoch": 5.362799263351749, + "grad_norm": 0.21100232005119324, + "learning_rate": 4.6563084892608644e-05, + "loss": 1.7285, + 
"step": 17472 + }, + { + "epoch": 5.3631062001227745, + "grad_norm": 0.23545897006988525, + "learning_rate": 4.655812610647787e-05, + "loss": 1.7302, + "step": 17473 + }, + { + "epoch": 5.3634131368938, + "grad_norm": 0.23278315365314484, + "learning_rate": 4.655316735436174e-05, + "loss": 1.7749, + "step": 17474 + }, + { + "epoch": 5.363720073664825, + "grad_norm": 0.333763986825943, + "learning_rate": 4.65482086363092e-05, + "loss": 1.7393, + "step": 17475 + }, + { + "epoch": 5.3640270104358505, + "grad_norm": 0.2743878662586212, + "learning_rate": 4.6543249952369306e-05, + "loss": 1.7274, + "step": 17476 + }, + { + "epoch": 5.364333947206875, + "grad_norm": 0.234402596950531, + "learning_rate": 4.6538291302591024e-05, + "loss": 1.7848, + "step": 17477 + }, + { + "epoch": 5.3646408839779, + "grad_norm": 0.29100897908210754, + "learning_rate": 4.65333326870234e-05, + "loss": 1.7698, + "step": 17478 + }, + { + "epoch": 5.364947820748926, + "grad_norm": 0.24178378283977509, + "learning_rate": 4.652837410571539e-05, + "loss": 1.8142, + "step": 17479 + }, + { + "epoch": 5.365254757519951, + "grad_norm": 0.4189155101776123, + "learning_rate": 4.652341555871605e-05, + "loss": 1.7435, + "step": 17480 + }, + { + "epoch": 5.365561694290976, + "grad_norm": 0.40106773376464844, + "learning_rate": 4.651845704607433e-05, + "loss": 1.837, + "step": 17481 + }, + { + "epoch": 5.365868631062002, + "grad_norm": 0.24127443134784698, + "learning_rate": 4.651349856783927e-05, + "loss": 1.7257, + "step": 17482 + }, + { + "epoch": 5.366175567833026, + "grad_norm": 0.412812739610672, + "learning_rate": 4.650854012405985e-05, + "loss": 1.762, + "step": 17483 + }, + { + "epoch": 5.366482504604051, + "grad_norm": 0.2636469602584839, + "learning_rate": 4.65035817147851e-05, + "loss": 1.7995, + "step": 17484 + }, + { + "epoch": 5.366789441375077, + "grad_norm": 0.282186895608902, + "learning_rate": 4.649862334006399e-05, + "loss": 1.75, + "step": 17485 + }, + { + "epoch": 5.367096378146102, + 
"grad_norm": 0.3280154764652252, + "learning_rate": 4.649366499994555e-05, + "loss": 1.7668, + "step": 17486 + }, + { + "epoch": 5.367403314917127, + "grad_norm": 0.24608035385608673, + "learning_rate": 4.648870669447875e-05, + "loss": 1.8332, + "step": 17487 + }, + { + "epoch": 5.367710251688152, + "grad_norm": 0.21927174925804138, + "learning_rate": 4.648374842371262e-05, + "loss": 1.7365, + "step": 17488 + }, + { + "epoch": 5.368017188459177, + "grad_norm": 0.2658425569534302, + "learning_rate": 4.6478790187696164e-05, + "loss": 1.841, + "step": 17489 + }, + { + "epoch": 5.3683241252302025, + "grad_norm": 0.2302858531475067, + "learning_rate": 4.647383198647834e-05, + "loss": 1.7882, + "step": 17490 + }, + { + "epoch": 5.368631062001228, + "grad_norm": 0.2562740743160248, + "learning_rate": 4.64688738201082e-05, + "loss": 1.7188, + "step": 17491 + }, + { + "epoch": 5.368937998772253, + "grad_norm": 0.28140220046043396, + "learning_rate": 4.646391568863469e-05, + "loss": 1.7482, + "step": 17492 + }, + { + "epoch": 5.3692449355432785, + "grad_norm": 0.21040008962154388, + "learning_rate": 4.6458957592106855e-05, + "loss": 1.7695, + "step": 17493 + }, + { + "epoch": 5.369551872314303, + "grad_norm": 0.25322291254997253, + "learning_rate": 4.645399953057367e-05, + "loss": 1.7127, + "step": 17494 + }, + { + "epoch": 5.369858809085328, + "grad_norm": 0.2239738404750824, + "learning_rate": 4.644904150408415e-05, + "loss": 1.7376, + "step": 17495 + }, + { + "epoch": 5.370165745856354, + "grad_norm": 0.21432901918888092, + "learning_rate": 4.644408351268727e-05, + "loss": 1.7156, + "step": 17496 + }, + { + "epoch": 5.370472682627379, + "grad_norm": 0.3057272732257843, + "learning_rate": 4.643912555643205e-05, + "loss": 1.7706, + "step": 17497 + }, + { + "epoch": 5.370779619398404, + "grad_norm": 0.2826928496360779, + "learning_rate": 4.643416763536748e-05, + "loss": 1.8298, + "step": 17498 + }, + { + "epoch": 5.371086556169429, + "grad_norm": 0.2395278513431549, + 
"learning_rate": 4.642920974954255e-05, + "loss": 1.7357, + "step": 17499 + }, + { + "epoch": 5.371393492940454, + "grad_norm": 0.21004743874073029, + "learning_rate": 4.642425189900626e-05, + "loss": 1.7263, + "step": 17500 + }, + { + "epoch": 5.371700429711479, + "grad_norm": 0.23981697857379913, + "learning_rate": 4.641929408380761e-05, + "loss": 1.7341, + "step": 17501 + }, + { + "epoch": 5.372007366482505, + "grad_norm": 0.1984727531671524, + "learning_rate": 4.641433630399559e-05, + "loss": 1.7133, + "step": 17502 + }, + { + "epoch": 5.37231430325353, + "grad_norm": 0.22153446078300476, + "learning_rate": 4.640937855961922e-05, + "loss": 1.8028, + "step": 17503 + }, + { + "epoch": 5.3726212400245545, + "grad_norm": 0.24257974326610565, + "learning_rate": 4.6404420850727455e-05, + "loss": 1.7842, + "step": 17504 + }, + { + "epoch": 5.37292817679558, + "grad_norm": 0.19444705545902252, + "learning_rate": 4.6399463177369316e-05, + "loss": 1.7296, + "step": 17505 + }, + { + "epoch": 5.373235113566605, + "grad_norm": 0.2068849354982376, + "learning_rate": 4.6394505539593806e-05, + "loss": 1.6949, + "step": 17506 + }, + { + "epoch": 5.3735420503376305, + "grad_norm": 0.21762309968471527, + "learning_rate": 4.638954793744989e-05, + "loss": 1.7556, + "step": 17507 + }, + { + "epoch": 5.373848987108656, + "grad_norm": 0.20791584253311157, + "learning_rate": 4.638459037098659e-05, + "loss": 1.7442, + "step": 17508 + }, + { + "epoch": 5.37415592387968, + "grad_norm": 0.27774497866630554, + "learning_rate": 4.6379632840252875e-05, + "loss": 1.7834, + "step": 17509 + }, + { + "epoch": 5.374462860650706, + "grad_norm": 0.24211421608924866, + "learning_rate": 4.637467534529775e-05, + "loss": 1.819, + "step": 17510 + }, + { + "epoch": 5.374769797421731, + "grad_norm": 0.24857789278030396, + "learning_rate": 4.636971788617022e-05, + "loss": 1.7483, + "step": 17511 + }, + { + "epoch": 5.375076734192756, + "grad_norm": 0.25142937898635864, + "learning_rate": 
4.636476046291925e-05, + "loss": 1.7405, + "step": 17512 + }, + { + "epoch": 5.375383670963782, + "grad_norm": 0.25860801339149475, + "learning_rate": 4.6359803075593846e-05, + "loss": 1.7821, + "step": 17513 + }, + { + "epoch": 5.375690607734807, + "grad_norm": 0.25223109126091003, + "learning_rate": 4.635484572424302e-05, + "loss": 1.738, + "step": 17514 + }, + { + "epoch": 5.3759975445058314, + "grad_norm": 0.22931768000125885, + "learning_rate": 4.634988840891573e-05, + "loss": 1.7717, + "step": 17515 + }, + { + "epoch": 5.376304481276857, + "grad_norm": 0.21371231973171234, + "learning_rate": 4.6344931129661e-05, + "loss": 1.7741, + "step": 17516 + }, + { + "epoch": 5.376611418047882, + "grad_norm": 0.2653632164001465, + "learning_rate": 4.633997388652778e-05, + "loss": 1.7548, + "step": 17517 + }, + { + "epoch": 5.3769183548189075, + "grad_norm": 0.2559951841831207, + "learning_rate": 4.6335016679565094e-05, + "loss": 1.7833, + "step": 17518 + }, + { + "epoch": 5.377225291589933, + "grad_norm": 0.22560031712055206, + "learning_rate": 4.6330059508821914e-05, + "loss": 1.6929, + "step": 17519 + }, + { + "epoch": 5.377532228360957, + "grad_norm": 0.3084852695465088, + "learning_rate": 4.6325102374347255e-05, + "loss": 1.8107, + "step": 17520 + }, + { + "epoch": 5.377839165131983, + "grad_norm": 0.3329267203807831, + "learning_rate": 4.632014527619007e-05, + "loss": 1.6791, + "step": 17521 + }, + { + "epoch": 5.378146101903008, + "grad_norm": 0.26274019479751587, + "learning_rate": 4.631518821439939e-05, + "loss": 1.7187, + "step": 17522 + }, + { + "epoch": 5.378453038674033, + "grad_norm": 0.3769492208957672, + "learning_rate": 4.6310231189024165e-05, + "loss": 1.8366, + "step": 17523 + }, + { + "epoch": 5.378759975445059, + "grad_norm": 0.2503921687602997, + "learning_rate": 4.6305274200113385e-05, + "loss": 1.7281, + "step": 17524 + }, + { + "epoch": 5.379066912216084, + "grad_norm": 0.26305708289146423, + "learning_rate": 4.6300317247716074e-05, + "loss": 
1.7231, + "step": 17525 + }, + { + "epoch": 5.379373848987108, + "grad_norm": 0.31899142265319824, + "learning_rate": 4.629536033188118e-05, + "loss": 1.8025, + "step": 17526 + }, + { + "epoch": 5.379680785758134, + "grad_norm": 0.21400104463100433, + "learning_rate": 4.629040345265772e-05, + "loss": 1.7481, + "step": 17527 + }, + { + "epoch": 5.379987722529159, + "grad_norm": 0.23147371411323547, + "learning_rate": 4.628544661009465e-05, + "loss": 1.7049, + "step": 17528 + }, + { + "epoch": 5.380294659300184, + "grad_norm": 0.21156759560108185, + "learning_rate": 4.628048980424099e-05, + "loss": 1.806, + "step": 17529 + }, + { + "epoch": 5.38060159607121, + "grad_norm": 0.22061556577682495, + "learning_rate": 4.6275533035145685e-05, + "loss": 1.7606, + "step": 17530 + }, + { + "epoch": 5.380908532842234, + "grad_norm": 0.23379987478256226, + "learning_rate": 4.6270576302857774e-05, + "loss": 1.7874, + "step": 17531 + }, + { + "epoch": 5.3812154696132595, + "grad_norm": 0.24738669395446777, + "learning_rate": 4.62656196074262e-05, + "loss": 1.7611, + "step": 17532 + }, + { + "epoch": 5.381522406384285, + "grad_norm": 0.19738905131816864, + "learning_rate": 4.6260662948899974e-05, + "loss": 1.7375, + "step": 17533 + }, + { + "epoch": 5.38182934315531, + "grad_norm": 0.2327810823917389, + "learning_rate": 4.6255706327328044e-05, + "loss": 1.7188, + "step": 17534 + }, + { + "epoch": 5.3821362799263355, + "grad_norm": 0.18944145739078522, + "learning_rate": 4.625074974275944e-05, + "loss": 1.6672, + "step": 17535 + }, + { + "epoch": 5.382443216697361, + "grad_norm": 0.20943734049797058, + "learning_rate": 4.624579319524311e-05, + "loss": 1.7238, + "step": 17536 + }, + { + "epoch": 5.382750153468385, + "grad_norm": 0.2060960829257965, + "learning_rate": 4.6240836684828074e-05, + "loss": 1.744, + "step": 17537 + }, + { + "epoch": 5.383057090239411, + "grad_norm": 0.19089816510677338, + "learning_rate": 4.6235880211563264e-05, + "loss": 1.6884, + "step": 17538 + }, + { + 
"epoch": 5.383364027010436, + "grad_norm": 0.22362665832042694, + "learning_rate": 4.623092377549772e-05, + "loss": 1.7076, + "step": 17539 + }, + { + "epoch": 5.383670963781461, + "grad_norm": 0.19429968297481537, + "learning_rate": 4.622596737668039e-05, + "loss": 1.7315, + "step": 17540 + }, + { + "epoch": 5.383977900552487, + "grad_norm": 0.20481903851032257, + "learning_rate": 4.622101101516024e-05, + "loss": 1.711, + "step": 17541 + }, + { + "epoch": 5.384284837323511, + "grad_norm": 0.19181163609027863, + "learning_rate": 4.6216054690986304e-05, + "loss": 1.6879, + "step": 17542 + }, + { + "epoch": 5.384591774094536, + "grad_norm": 0.23105846345424652, + "learning_rate": 4.6211098404207514e-05, + "loss": 1.7797, + "step": 17543 + }, + { + "epoch": 5.384898710865562, + "grad_norm": 0.2742008864879608, + "learning_rate": 4.6206142154872886e-05, + "loss": 1.7404, + "step": 17544 + }, + { + "epoch": 5.385205647636587, + "grad_norm": 0.2256750613451004, + "learning_rate": 4.6201185943031365e-05, + "loss": 1.7616, + "step": 17545 + }, + { + "epoch": 5.385512584407612, + "grad_norm": 0.23230868577957153, + "learning_rate": 4.6196229768731964e-05, + "loss": 1.7457, + "step": 17546 + }, + { + "epoch": 5.385819521178637, + "grad_norm": 0.2200126200914383, + "learning_rate": 4.6191273632023634e-05, + "loss": 1.7835, + "step": 17547 + }, + { + "epoch": 5.386126457949662, + "grad_norm": 0.21903863549232483, + "learning_rate": 4.6186317532955395e-05, + "loss": 1.7315, + "step": 17548 + }, + { + "epoch": 5.3864333947206875, + "grad_norm": 0.1915556788444519, + "learning_rate": 4.6181361471576186e-05, + "loss": 1.6786, + "step": 17549 + }, + { + "epoch": 5.386740331491713, + "grad_norm": 0.20177799463272095, + "learning_rate": 4.617640544793501e-05, + "loss": 1.7453, + "step": 17550 + }, + { + "epoch": 5.387047268262738, + "grad_norm": 0.2598256766796112, + "learning_rate": 4.617144946208083e-05, + "loss": 1.7931, + "step": 17551 + }, + { + "epoch": 5.387354205033763, + 
"grad_norm": 0.2357153594493866, + "learning_rate": 4.616649351406263e-05, + "loss": 1.7932, + "step": 17552 + }, + { + "epoch": 5.387661141804788, + "grad_norm": 0.2228964865207672, + "learning_rate": 4.616153760392938e-05, + "loss": 1.7725, + "step": 17553 + }, + { + "epoch": 5.387968078575813, + "grad_norm": 0.20811811089515686, + "learning_rate": 4.6156581731730085e-05, + "loss": 1.744, + "step": 17554 + }, + { + "epoch": 5.388275015346839, + "grad_norm": 0.20008429884910583, + "learning_rate": 4.615162589751369e-05, + "loss": 1.6973, + "step": 17555 + }, + { + "epoch": 5.388581952117864, + "grad_norm": 0.20487523078918457, + "learning_rate": 4.614667010132919e-05, + "loss": 1.7712, + "step": 17556 + }, + { + "epoch": 5.388888888888889, + "grad_norm": 0.21279677748680115, + "learning_rate": 4.6141714343225554e-05, + "loss": 1.7783, + "step": 17557 + }, + { + "epoch": 5.389195825659914, + "grad_norm": 0.28035736083984375, + "learning_rate": 4.613675862325174e-05, + "loss": 1.767, + "step": 17558 + }, + { + "epoch": 5.389502762430939, + "grad_norm": 0.27426794171333313, + "learning_rate": 4.613180294145677e-05, + "loss": 1.7909, + "step": 17559 + }, + { + "epoch": 5.389809699201964, + "grad_norm": 0.22420327365398407, + "learning_rate": 4.612684729788957e-05, + "loss": 1.6902, + "step": 17560 + }, + { + "epoch": 5.39011663597299, + "grad_norm": 0.19799382984638214, + "learning_rate": 4.612189169259915e-05, + "loss": 1.7276, + "step": 17561 + }, + { + "epoch": 5.390423572744015, + "grad_norm": 0.2508823573589325, + "learning_rate": 4.611693612563445e-05, + "loss": 1.7445, + "step": 17562 + }, + { + "epoch": 5.3907305095150395, + "grad_norm": 0.20835694670677185, + "learning_rate": 4.611198059704448e-05, + "loss": 1.696, + "step": 17563 + }, + { + "epoch": 5.391037446286065, + "grad_norm": 0.22136010229587555, + "learning_rate": 4.6107025106878176e-05, + "loss": 1.7701, + "step": 17564 + }, + { + "epoch": 5.39134438305709, + "grad_norm": 0.23835612833499908, + 
"learning_rate": 4.610206965518456e-05, + "loss": 1.7494, + "step": 17565 + }, + { + "epoch": 5.3916513198281155, + "grad_norm": 0.26142916083335876, + "learning_rate": 4.6097114242012554e-05, + "loss": 1.7616, + "step": 17566 + }, + { + "epoch": 5.391958256599141, + "grad_norm": 0.3366851806640625, + "learning_rate": 4.6092158867411175e-05, + "loss": 1.7409, + "step": 17567 + }, + { + "epoch": 5.392265193370166, + "grad_norm": 0.2592991292476654, + "learning_rate": 4.608720353142935e-05, + "loss": 1.7469, + "step": 17568 + }, + { + "epoch": 5.392572130141191, + "grad_norm": 0.25810322165489197, + "learning_rate": 4.608224823411608e-05, + "loss": 1.7345, + "step": 17569 + }, + { + "epoch": 5.392879066912216, + "grad_norm": 0.26776888966560364, + "learning_rate": 4.607729297552032e-05, + "loss": 1.7698, + "step": 17570 + }, + { + "epoch": 5.393186003683241, + "grad_norm": 0.21023939549922943, + "learning_rate": 4.607233775569107e-05, + "loss": 1.7681, + "step": 17571 + }, + { + "epoch": 5.393492940454267, + "grad_norm": 0.24452096223831177, + "learning_rate": 4.6067382574677265e-05, + "loss": 1.8154, + "step": 17572 + }, + { + "epoch": 5.393799877225292, + "grad_norm": 0.27084338665008545, + "learning_rate": 4.606242743252791e-05, + "loss": 1.7106, + "step": 17573 + }, + { + "epoch": 5.394106813996316, + "grad_norm": 0.24783825874328613, + "learning_rate": 4.605747232929195e-05, + "loss": 1.713, + "step": 17574 + }, + { + "epoch": 5.394413750767342, + "grad_norm": 0.2528151869773865, + "learning_rate": 4.6052517265018333e-05, + "loss": 1.8475, + "step": 17575 + }, + { + "epoch": 5.394720687538367, + "grad_norm": 0.24361065030097961, + "learning_rate": 4.604756223975609e-05, + "loss": 1.7414, + "step": 17576 + }, + { + "epoch": 5.395027624309392, + "grad_norm": 0.2751234769821167, + "learning_rate": 4.604260725355412e-05, + "loss": 1.7603, + "step": 17577 + }, + { + "epoch": 5.395334561080418, + "grad_norm": 0.23183637857437134, + "learning_rate": 
4.603765230646146e-05, + "loss": 1.7053, + "step": 17578 + }, + { + "epoch": 5.395641497851442, + "grad_norm": 0.27462145686149597, + "learning_rate": 4.6032697398527005e-05, + "loss": 1.746, + "step": 17579 + }, + { + "epoch": 5.3959484346224675, + "grad_norm": 0.3665321171283722, + "learning_rate": 4.602774252979978e-05, + "loss": 1.6883, + "step": 17580 + }, + { + "epoch": 5.396255371393493, + "grad_norm": 0.22438424825668335, + "learning_rate": 4.602278770032872e-05, + "loss": 1.7473, + "step": 17581 + }, + { + "epoch": 5.396562308164518, + "grad_norm": 0.38713687658309937, + "learning_rate": 4.601783291016282e-05, + "loss": 1.7993, + "step": 17582 + }, + { + "epoch": 5.3968692449355435, + "grad_norm": 0.3399868905544281, + "learning_rate": 4.6012878159351015e-05, + "loss": 1.7709, + "step": 17583 + }, + { + "epoch": 5.397176181706568, + "grad_norm": 0.21916119754314423, + "learning_rate": 4.60079234479423e-05, + "loss": 1.7351, + "step": 17584 + }, + { + "epoch": 5.397483118477593, + "grad_norm": 0.3796394467353821, + "learning_rate": 4.600296877598561e-05, + "loss": 1.7534, + "step": 17585 + }, + { + "epoch": 5.397790055248619, + "grad_norm": 0.27824562788009644, + "learning_rate": 4.599801414352993e-05, + "loss": 1.6962, + "step": 17586 + }, + { + "epoch": 5.398096992019644, + "grad_norm": 0.21037112176418304, + "learning_rate": 4.599305955062421e-05, + "loss": 1.7062, + "step": 17587 + }, + { + "epoch": 5.398403928790669, + "grad_norm": 0.3373035192489624, + "learning_rate": 4.598810499731745e-05, + "loss": 1.8263, + "step": 17588 + }, + { + "epoch": 5.398710865561695, + "grad_norm": 0.2560507357120514, + "learning_rate": 4.5983150483658564e-05, + "loss": 1.7232, + "step": 17589 + }, + { + "epoch": 5.399017802332719, + "grad_norm": 0.23010993003845215, + "learning_rate": 4.5978196009696564e-05, + "loss": 1.805, + "step": 17590 + }, + { + "epoch": 5.399324739103744, + "grad_norm": 0.32955634593963623, + "learning_rate": 4.597324157548037e-05, + "loss": 
1.7018, + "step": 17591 + }, + { + "epoch": 5.39963167587477, + "grad_norm": 0.2534363865852356, + "learning_rate": 4.5968287181058953e-05, + "loss": 1.6919, + "step": 17592 + }, + { + "epoch": 5.399938612645795, + "grad_norm": 0.23179130256175995, + "learning_rate": 4.5963332826481314e-05, + "loss": 1.7237, + "step": 17593 + }, + { + "epoch": 5.4002455494168204, + "grad_norm": 0.37712663412094116, + "learning_rate": 4.5958378511796365e-05, + "loss": 1.7694, + "step": 17594 + }, + { + "epoch": 5.400552486187845, + "grad_norm": 0.21228717267513275, + "learning_rate": 4.59534242370531e-05, + "loss": 1.7528, + "step": 17595 + }, + { + "epoch": 5.40085942295887, + "grad_norm": 0.2818812429904938, + "learning_rate": 4.5948470002300454e-05, + "loss": 1.8214, + "step": 17596 + }, + { + "epoch": 5.401166359729896, + "grad_norm": 0.24916675686836243, + "learning_rate": 4.5943515807587415e-05, + "loss": 1.7792, + "step": 17597 + }, + { + "epoch": 5.401473296500921, + "grad_norm": 0.2096913456916809, + "learning_rate": 4.593856165296291e-05, + "loss": 1.6983, + "step": 17598 + }, + { + "epoch": 5.401780233271946, + "grad_norm": 0.271124005317688, + "learning_rate": 4.593360753847595e-05, + "loss": 1.7534, + "step": 17599 + }, + { + "epoch": 5.402087170042972, + "grad_norm": 0.24798092246055603, + "learning_rate": 4.5928653464175435e-05, + "loss": 1.7783, + "step": 17600 + }, + { + "epoch": 5.402394106813996, + "grad_norm": 0.3531748056411743, + "learning_rate": 4.592369943011038e-05, + "loss": 1.7834, + "step": 17601 + }, + { + "epoch": 5.402701043585021, + "grad_norm": 0.29650232195854187, + "learning_rate": 4.591874543632969e-05, + "loss": 1.7186, + "step": 17602 + }, + { + "epoch": 5.403007980356047, + "grad_norm": 0.25578248500823975, + "learning_rate": 4.591379148288236e-05, + "loss": 1.7849, + "step": 17603 + }, + { + "epoch": 5.403314917127072, + "grad_norm": 0.3790532946586609, + "learning_rate": 4.590883756981733e-05, + "loss": 1.7192, + "step": 17604 + }, + { + 
"epoch": 5.403621853898097, + "grad_norm": 0.23684249818325043, + "learning_rate": 4.590388369718359e-05, + "loss": 1.7171, + "step": 17605 + }, + { + "epoch": 5.403928790669122, + "grad_norm": 0.267702579498291, + "learning_rate": 4.589892986503005e-05, + "loss": 1.7181, + "step": 17606 + }, + { + "epoch": 5.404235727440147, + "grad_norm": 0.29105648398399353, + "learning_rate": 4.5893976073405704e-05, + "loss": 1.7395, + "step": 17607 + }, + { + "epoch": 5.4045426642111725, + "grad_norm": 0.2266589254140854, + "learning_rate": 4.588902232235949e-05, + "loss": 1.7244, + "step": 17608 + }, + { + "epoch": 5.404849600982198, + "grad_norm": 0.24065524339675903, + "learning_rate": 4.588406861194035e-05, + "loss": 1.7398, + "step": 17609 + }, + { + "epoch": 5.405156537753223, + "grad_norm": 0.23166650533676147, + "learning_rate": 4.587911494219728e-05, + "loss": 1.7592, + "step": 17610 + }, + { + "epoch": 5.4054634745242485, + "grad_norm": 0.19882038235664368, + "learning_rate": 4.5874161313179186e-05, + "loss": 1.7087, + "step": 17611 + }, + { + "epoch": 5.405770411295273, + "grad_norm": 0.2688273787498474, + "learning_rate": 4.5869207724935076e-05, + "loss": 1.7791, + "step": 17612 + }, + { + "epoch": 5.406077348066298, + "grad_norm": 0.1970982402563095, + "learning_rate": 4.5864254177513855e-05, + "loss": 1.7079, + "step": 17613 + }, + { + "epoch": 5.406384284837324, + "grad_norm": 0.2531265318393707, + "learning_rate": 4.585930067096451e-05, + "loss": 1.716, + "step": 17614 + }, + { + "epoch": 5.406691221608349, + "grad_norm": 0.2610352337360382, + "learning_rate": 4.585434720533596e-05, + "loss": 1.7133, + "step": 17615 + }, + { + "epoch": 5.406998158379374, + "grad_norm": 0.2420870065689087, + "learning_rate": 4.5849393780677216e-05, + "loss": 1.7044, + "step": 17616 + }, + { + "epoch": 5.407305095150399, + "grad_norm": 0.24078647792339325, + "learning_rate": 4.584444039703717e-05, + "loss": 1.7486, + "step": 17617 + }, + { + "epoch": 5.407612031921424, + 
"grad_norm": 0.19324539601802826, + "learning_rate": 4.583948705446481e-05, + "loss": 1.7439, + "step": 17618 + }, + { + "epoch": 5.407918968692449, + "grad_norm": 0.2311750054359436, + "learning_rate": 4.5834533753009065e-05, + "loss": 1.7794, + "step": 17619 + }, + { + "epoch": 5.408225905463475, + "grad_norm": 0.2554466128349304, + "learning_rate": 4.5829580492718914e-05, + "loss": 1.7146, + "step": 17620 + }, + { + "epoch": 5.4085328422345, + "grad_norm": 0.2679688334465027, + "learning_rate": 4.582462727364328e-05, + "loss": 1.7677, + "step": 17621 + }, + { + "epoch": 5.4088397790055245, + "grad_norm": 0.19292913377285004, + "learning_rate": 4.5819674095831146e-05, + "loss": 1.7544, + "step": 17622 + }, + { + "epoch": 5.40914671577655, + "grad_norm": 0.2146623730659485, + "learning_rate": 4.5814720959331425e-05, + "loss": 1.7182, + "step": 17623 + }, + { + "epoch": 5.409453652547575, + "grad_norm": 0.23098216950893402, + "learning_rate": 4.5809767864193096e-05, + "loss": 1.6844, + "step": 17624 + }, + { + "epoch": 5.4097605893186005, + "grad_norm": 0.22482910752296448, + "learning_rate": 4.5804814810465096e-05, + "loss": 1.7921, + "step": 17625 + }, + { + "epoch": 5.410067526089626, + "grad_norm": 0.22098569571971893, + "learning_rate": 4.579986179819636e-05, + "loss": 1.7419, + "step": 17626 + }, + { + "epoch": 5.41037446286065, + "grad_norm": 0.2131706178188324, + "learning_rate": 4.579490882743588e-05, + "loss": 1.7587, + "step": 17627 + }, + { + "epoch": 5.410681399631676, + "grad_norm": 0.22448734939098358, + "learning_rate": 4.578995589823254e-05, + "loss": 1.6959, + "step": 17628 + }, + { + "epoch": 5.410988336402701, + "grad_norm": 0.22372964024543762, + "learning_rate": 4.578500301063536e-05, + "loss": 1.7462, + "step": 17629 + }, + { + "epoch": 5.411295273173726, + "grad_norm": 0.22140730917453766, + "learning_rate": 4.578005016469322e-05, + "loss": 1.8348, + "step": 17630 + }, + { + "epoch": 5.411602209944752, + "grad_norm": 0.21697622537612915, + 
"learning_rate": 4.577509736045511e-05, + "loss": 1.7634, + "step": 17631 + }, + { + "epoch": 5.411909146715777, + "grad_norm": 0.2044363021850586, + "learning_rate": 4.5770144597969954e-05, + "loss": 1.7095, + "step": 17632 + }, + { + "epoch": 5.412216083486801, + "grad_norm": 0.1910451501607895, + "learning_rate": 4.576519187728674e-05, + "loss": 1.7022, + "step": 17633 + }, + { + "epoch": 5.412523020257827, + "grad_norm": 0.21787554025650024, + "learning_rate": 4.576023919845434e-05, + "loss": 1.7206, + "step": 17634 + }, + { + "epoch": 5.412829957028852, + "grad_norm": 0.2363428920507431, + "learning_rate": 4.575528656152178e-05, + "loss": 1.8052, + "step": 17635 + }, + { + "epoch": 5.413136893799877, + "grad_norm": 0.22830195724964142, + "learning_rate": 4.575033396653793e-05, + "loss": 1.7432, + "step": 17636 + }, + { + "epoch": 5.413443830570903, + "grad_norm": 0.24867239594459534, + "learning_rate": 4.5745381413551794e-05, + "loss": 1.7011, + "step": 17637 + }, + { + "epoch": 5.413750767341927, + "grad_norm": 0.19329775869846344, + "learning_rate": 4.574042890261228e-05, + "loss": 1.7749, + "step": 17638 + }, + { + "epoch": 5.4140577041129525, + "grad_norm": 0.22917115688323975, + "learning_rate": 4.573547643376836e-05, + "loss": 1.7478, + "step": 17639 + }, + { + "epoch": 5.414364640883978, + "grad_norm": 0.23882724344730377, + "learning_rate": 4.573052400706894e-05, + "loss": 1.7396, + "step": 17640 + }, + { + "epoch": 5.414671577655003, + "grad_norm": 0.19127070903778076, + "learning_rate": 4.572557162256301e-05, + "loss": 1.6791, + "step": 17641 + }, + { + "epoch": 5.4149785144260285, + "grad_norm": 0.18385560810565948, + "learning_rate": 4.5720619280299475e-05, + "loss": 1.7288, + "step": 17642 + }, + { + "epoch": 5.415285451197054, + "grad_norm": 0.19845189154148102, + "learning_rate": 4.571566698032728e-05, + "loss": 1.7525, + "step": 17643 + }, + { + "epoch": 5.415592387968078, + "grad_norm": 0.18987210094928741, + "learning_rate": 
4.571071472269539e-05, + "loss": 1.7253, + "step": 17644 + }, + { + "epoch": 5.415899324739104, + "grad_norm": 0.18257199227809906, + "learning_rate": 4.570576250745271e-05, + "loss": 1.7051, + "step": 17645 + }, + { + "epoch": 5.416206261510129, + "grad_norm": 0.22803467512130737, + "learning_rate": 4.570081033464823e-05, + "loss": 1.7478, + "step": 17646 + }, + { + "epoch": 5.416513198281154, + "grad_norm": 0.18763841688632965, + "learning_rate": 4.569585820433084e-05, + "loss": 1.7316, + "step": 17647 + }, + { + "epoch": 5.41682013505218, + "grad_norm": 0.23974654078483582, + "learning_rate": 4.56909061165495e-05, + "loss": 1.7566, + "step": 17648 + }, + { + "epoch": 5.417127071823204, + "grad_norm": 0.24336253106594086, + "learning_rate": 4.568595407135315e-05, + "loss": 1.7468, + "step": 17649 + }, + { + "epoch": 5.417434008594229, + "grad_norm": 0.23891226947307587, + "learning_rate": 4.5681002068790755e-05, + "loss": 1.7201, + "step": 17650 + }, + { + "epoch": 5.417740945365255, + "grad_norm": 0.19209685921669006, + "learning_rate": 4.56760501089112e-05, + "loss": 1.713, + "step": 17651 + }, + { + "epoch": 5.41804788213628, + "grad_norm": 0.2407880276441574, + "learning_rate": 4.567109819176349e-05, + "loss": 1.7073, + "step": 17652 + }, + { + "epoch": 5.418354818907305, + "grad_norm": 0.2385055273771286, + "learning_rate": 4.5666146317396485e-05, + "loss": 1.7387, + "step": 17653 + }, + { + "epoch": 5.41866175567833, + "grad_norm": 0.22068475186824799, + "learning_rate": 4.566119448585918e-05, + "loss": 1.7116, + "step": 17654 + }, + { + "epoch": 5.418968692449355, + "grad_norm": 0.318375825881958, + "learning_rate": 4.5656242697200496e-05, + "loss": 1.7659, + "step": 17655 + }, + { + "epoch": 5.4192756292203805, + "grad_norm": 0.25311973690986633, + "learning_rate": 4.5651290951469366e-05, + "loss": 1.7814, + "step": 17656 + }, + { + "epoch": 5.419582565991406, + "grad_norm": 0.18701443076133728, + "learning_rate": 4.5646339248714735e-05, + "loss": 1.6993, 
+ "step": 17657 + }, + { + "epoch": 5.419889502762431, + "grad_norm": 0.2964496314525604, + "learning_rate": 4.5641387588985516e-05, + "loss": 1.8254, + "step": 17658 + }, + { + "epoch": 5.420196439533456, + "grad_norm": 0.19447220861911774, + "learning_rate": 4.563643597233067e-05, + "loss": 1.7208, + "step": 17659 + }, + { + "epoch": 5.420503376304481, + "grad_norm": 0.21666039526462555, + "learning_rate": 4.5631484398799105e-05, + "loss": 1.6695, + "step": 17660 + }, + { + "epoch": 5.420810313075506, + "grad_norm": 0.23104412853717804, + "learning_rate": 4.5626532868439796e-05, + "loss": 1.7449, + "step": 17661 + }, + { + "epoch": 5.421117249846532, + "grad_norm": 0.20463459193706512, + "learning_rate": 4.562158138130163e-05, + "loss": 1.6714, + "step": 17662 + }, + { + "epoch": 5.421424186617557, + "grad_norm": 0.21948079764842987, + "learning_rate": 4.561662993743359e-05, + "loss": 1.6957, + "step": 17663 + }, + { + "epoch": 5.421731123388582, + "grad_norm": 0.2672746777534485, + "learning_rate": 4.561167853688455e-05, + "loss": 1.7137, + "step": 17664 + }, + { + "epoch": 5.422038060159607, + "grad_norm": 0.2652325928211212, + "learning_rate": 4.5606727179703493e-05, + "loss": 1.7943, + "step": 17665 + }, + { + "epoch": 5.422344996930632, + "grad_norm": 0.17761313915252686, + "learning_rate": 4.560177586593933e-05, + "loss": 1.7072, + "step": 17666 + }, + { + "epoch": 5.422651933701657, + "grad_norm": 0.24759770929813385, + "learning_rate": 4.5596824595641e-05, + "loss": 1.7807, + "step": 17667 + }, + { + "epoch": 5.422958870472683, + "grad_norm": 0.22191929817199707, + "learning_rate": 4.5591873368857416e-05, + "loss": 1.7668, + "step": 17668 + }, + { + "epoch": 5.423265807243708, + "grad_norm": 0.21293842792510986, + "learning_rate": 4.5586922185637546e-05, + "loss": 1.7304, + "step": 17669 + }, + { + "epoch": 5.4235727440147325, + "grad_norm": 0.2646051049232483, + "learning_rate": 4.5581971046030277e-05, + "loss": 1.7258, + "step": 17670 + }, + { + 
"epoch": 5.423879680785758, + "grad_norm": 0.1894550621509552, + "learning_rate": 4.5577019950084574e-05, + "loss": 1.7066, + "step": 17671 + }, + { + "epoch": 5.424186617556783, + "grad_norm": 0.2533467710018158, + "learning_rate": 4.557206889784934e-05, + "loss": 1.7668, + "step": 17672 + }, + { + "epoch": 5.4244935543278086, + "grad_norm": 0.1972150355577469, + "learning_rate": 4.556711788937352e-05, + "loss": 1.7306, + "step": 17673 + }, + { + "epoch": 5.424800491098834, + "grad_norm": 0.2726735472679138, + "learning_rate": 4.5562166924706054e-05, + "loss": 1.7281, + "step": 17674 + }, + { + "epoch": 5.425107427869859, + "grad_norm": 0.2244454175233841, + "learning_rate": 4.555721600389584e-05, + "loss": 1.7461, + "step": 17675 + }, + { + "epoch": 5.425414364640884, + "grad_norm": 0.19486510753631592, + "learning_rate": 4.555226512699182e-05, + "loss": 1.7361, + "step": 17676 + }, + { + "epoch": 5.425721301411909, + "grad_norm": 0.18128283321857452, + "learning_rate": 4.554731429404293e-05, + "loss": 1.7637, + "step": 17677 + }, + { + "epoch": 5.426028238182934, + "grad_norm": 0.24709749221801758, + "learning_rate": 4.5542363505098084e-05, + "loss": 1.7928, + "step": 17678 + }, + { + "epoch": 5.42633517495396, + "grad_norm": 0.2236633151769638, + "learning_rate": 4.553741276020621e-05, + "loss": 1.8262, + "step": 17679 + }, + { + "epoch": 5.426642111724985, + "grad_norm": 0.2592087984085083, + "learning_rate": 4.553246205941626e-05, + "loss": 1.675, + "step": 17680 + }, + { + "epoch": 5.4269490484960095, + "grad_norm": 0.27751871943473816, + "learning_rate": 4.552751140277712e-05, + "loss": 1.7344, + "step": 17681 + }, + { + "epoch": 5.427255985267035, + "grad_norm": 0.23752287030220032, + "learning_rate": 4.5522560790337746e-05, + "loss": 1.7748, + "step": 17682 + }, + { + "epoch": 5.42756292203806, + "grad_norm": 0.3259925842285156, + "learning_rate": 4.5517610222147035e-05, + "loss": 1.7855, + "step": 17683 + }, + { + "epoch": 5.4278698588090855, + 
"grad_norm": 0.2579646706581116, + "learning_rate": 4.551265969825394e-05, + "loss": 1.7978, + "step": 17684 + }, + { + "epoch": 5.428176795580111, + "grad_norm": 0.3217744827270508, + "learning_rate": 4.550770921870735e-05, + "loss": 1.7793, + "step": 17685 + }, + { + "epoch": 5.428483732351136, + "grad_norm": 0.2930903434753418, + "learning_rate": 4.550275878355624e-05, + "loss": 1.7226, + "step": 17686 + }, + { + "epoch": 5.428790669122161, + "grad_norm": 0.1982879489660263, + "learning_rate": 4.549780839284948e-05, + "loss": 1.6841, + "step": 17687 + }, + { + "epoch": 5.429097605893186, + "grad_norm": 0.20843900740146637, + "learning_rate": 4.5492858046636046e-05, + "loss": 1.7201, + "step": 17688 + }, + { + "epoch": 5.429404542664211, + "grad_norm": 0.23116534948349, + "learning_rate": 4.5487907744964794e-05, + "loss": 1.7565, + "step": 17689 + }, + { + "epoch": 5.429711479435237, + "grad_norm": 0.19177772104740143, + "learning_rate": 4.548295748788471e-05, + "loss": 1.7479, + "step": 17690 + }, + { + "epoch": 5.430018416206262, + "grad_norm": 0.22261449694633484, + "learning_rate": 4.547800727544469e-05, + "loss": 1.7785, + "step": 17691 + }, + { + "epoch": 5.430325352977286, + "grad_norm": 0.20073406398296356, + "learning_rate": 4.547305710769363e-05, + "loss": 1.741, + "step": 17692 + }, + { + "epoch": 5.430632289748312, + "grad_norm": 0.21662208437919617, + "learning_rate": 4.546810698468049e-05, + "loss": 1.7269, + "step": 17693 + }, + { + "epoch": 5.430939226519337, + "grad_norm": 0.19540879130363464, + "learning_rate": 4.546315690645416e-05, + "loss": 1.7141, + "step": 17694 + }, + { + "epoch": 5.431246163290362, + "grad_norm": 0.20063656568527222, + "learning_rate": 4.545820687306358e-05, + "loss": 1.7244, + "step": 17695 + }, + { + "epoch": 5.431553100061388, + "grad_norm": 0.2172660082578659, + "learning_rate": 4.545325688455765e-05, + "loss": 1.7172, + "step": 17696 + }, + { + "epoch": 5.431860036832412, + "grad_norm": 0.2480388581752777, + 
"learning_rate": 4.5448306940985326e-05, + "loss": 1.6994, + "step": 17697 + }, + { + "epoch": 5.4321669736034375, + "grad_norm": 0.22499477863311768, + "learning_rate": 4.544335704239547e-05, + "loss": 1.7405, + "step": 17698 + }, + { + "epoch": 5.432473910374463, + "grad_norm": 0.20655590295791626, + "learning_rate": 4.5438407188837065e-05, + "loss": 1.6867, + "step": 17699 + }, + { + "epoch": 5.432780847145488, + "grad_norm": 0.2045906037092209, + "learning_rate": 4.543345738035896e-05, + "loss": 1.7752, + "step": 17700 + }, + { + "epoch": 5.4330877839165135, + "grad_norm": 0.2092052847146988, + "learning_rate": 4.542850761701013e-05, + "loss": 1.7389, + "step": 17701 + }, + { + "epoch": 5.433394720687538, + "grad_norm": 0.1943730264902115, + "learning_rate": 4.5423557898839446e-05, + "loss": 1.7276, + "step": 17702 + }, + { + "epoch": 5.433701657458563, + "grad_norm": 0.23487289249897003, + "learning_rate": 4.541860822589587e-05, + "loss": 1.8119, + "step": 17703 + }, + { + "epoch": 5.434008594229589, + "grad_norm": 0.204689159989357, + "learning_rate": 4.541365859822827e-05, + "loss": 1.7865, + "step": 17704 + }, + { + "epoch": 5.434315531000614, + "grad_norm": 0.20850931107997894, + "learning_rate": 4.5408709015885604e-05, + "loss": 1.7733, + "step": 17705 + }, + { + "epoch": 5.434622467771639, + "grad_norm": 0.18685877323150635, + "learning_rate": 4.540375947891675e-05, + "loss": 1.7526, + "step": 17706 + }, + { + "epoch": 5.434929404542665, + "grad_norm": 0.2009890079498291, + "learning_rate": 4.539880998737064e-05, + "loss": 1.6904, + "step": 17707 + }, + { + "epoch": 5.435236341313689, + "grad_norm": 0.16602718830108643, + "learning_rate": 4.5393860541296205e-05, + "loss": 1.689, + "step": 17708 + }, + { + "epoch": 5.435543278084714, + "grad_norm": 0.24318818747997284, + "learning_rate": 4.5388911140742315e-05, + "loss": 1.7993, + "step": 17709 + }, + { + "epoch": 5.43585021485574, + "grad_norm": 0.24094417691230774, + "learning_rate": 
4.538396178575793e-05, + "loss": 1.7235, + "step": 17710 + }, + { + "epoch": 5.436157151626765, + "grad_norm": 0.20361751317977905, + "learning_rate": 4.537901247639192e-05, + "loss": 1.7198, + "step": 17711 + }, + { + "epoch": 5.43646408839779, + "grad_norm": 0.2563718259334564, + "learning_rate": 4.537406321269323e-05, + "loss": 1.795, + "step": 17712 + }, + { + "epoch": 5.436771025168815, + "grad_norm": 0.29895591735839844, + "learning_rate": 4.536911399471075e-05, + "loss": 1.7515, + "step": 17713 + }, + { + "epoch": 5.43707796193984, + "grad_norm": 0.22535841166973114, + "learning_rate": 4.536416482249342e-05, + "loss": 1.6998, + "step": 17714 + }, + { + "epoch": 5.4373848987108655, + "grad_norm": 0.26025068759918213, + "learning_rate": 4.53592156960901e-05, + "loss": 1.7821, + "step": 17715 + }, + { + "epoch": 5.437691835481891, + "grad_norm": 0.3473168611526489, + "learning_rate": 4.535426661554975e-05, + "loss": 1.7035, + "step": 17716 + }, + { + "epoch": 5.437998772252916, + "grad_norm": 0.22207199037075043, + "learning_rate": 4.534931758092126e-05, + "loss": 1.7485, + "step": 17717 + }, + { + "epoch": 5.4383057090239415, + "grad_norm": 0.26839709281921387, + "learning_rate": 4.534436859225353e-05, + "loss": 1.7272, + "step": 17718 + }, + { + "epoch": 5.438612645794966, + "grad_norm": 0.37715891003608704, + "learning_rate": 4.5339419649595476e-05, + "loss": 1.7254, + "step": 17719 + }, + { + "epoch": 5.438919582565991, + "grad_norm": 0.21485768258571625, + "learning_rate": 4.533447075299603e-05, + "loss": 1.7349, + "step": 17720 + }, + { + "epoch": 5.439226519337017, + "grad_norm": 0.29502415657043457, + "learning_rate": 4.5329521902504055e-05, + "loss": 1.7325, + "step": 17721 + }, + { + "epoch": 5.439533456108042, + "grad_norm": 0.29448410868644714, + "learning_rate": 4.5324573098168505e-05, + "loss": 1.768, + "step": 17722 + }, + { + "epoch": 5.439840392879067, + "grad_norm": 0.1892058402299881, + "learning_rate": 4.5319624340038244e-05, + "loss": 
1.6866, + "step": 17723 + }, + { + "epoch": 5.440147329650092, + "grad_norm": 0.3365040123462677, + "learning_rate": 4.531467562816221e-05, + "loss": 1.7662, + "step": 17724 + }, + { + "epoch": 5.440454266421117, + "grad_norm": 0.2960789203643799, + "learning_rate": 4.53097269625893e-05, + "loss": 1.746, + "step": 17725 + }, + { + "epoch": 5.440761203192142, + "grad_norm": 0.21623700857162476, + "learning_rate": 4.530477834336841e-05, + "loss": 1.7619, + "step": 17726 + }, + { + "epoch": 5.441068139963168, + "grad_norm": 0.29010120034217834, + "learning_rate": 4.5299829770548456e-05, + "loss": 1.717, + "step": 17727 + }, + { + "epoch": 5.441375076734193, + "grad_norm": 0.18467605113983154, + "learning_rate": 4.529488124417833e-05, + "loss": 1.6938, + "step": 17728 + }, + { + "epoch": 5.4416820135052175, + "grad_norm": 0.2875411808490753, + "learning_rate": 4.528993276430695e-05, + "loss": 1.7633, + "step": 17729 + }, + { + "epoch": 5.441988950276243, + "grad_norm": 0.24252675473690033, + "learning_rate": 4.528498433098321e-05, + "loss": 1.6477, + "step": 17730 + }, + { + "epoch": 5.442295887047268, + "grad_norm": 0.18885886669158936, + "learning_rate": 4.5280035944256035e-05, + "loss": 1.7241, + "step": 17731 + }, + { + "epoch": 5.4426028238182935, + "grad_norm": 0.2594204246997833, + "learning_rate": 4.527508760417429e-05, + "loss": 1.6697, + "step": 17732 + }, + { + "epoch": 5.442909760589319, + "grad_norm": 0.23796287178993225, + "learning_rate": 4.527013931078692e-05, + "loss": 1.7035, + "step": 17733 + }, + { + "epoch": 5.443216697360343, + "grad_norm": 0.2591552436351776, + "learning_rate": 4.5265191064142787e-05, + "loss": 1.8014, + "step": 17734 + }, + { + "epoch": 5.443523634131369, + "grad_norm": 0.3316073417663574, + "learning_rate": 4.526024286429082e-05, + "loss": 1.752, + "step": 17735 + }, + { + "epoch": 5.443830570902394, + "grad_norm": 0.2409597635269165, + "learning_rate": 4.52552947112799e-05, + "loss": 1.7662, + "step": 17736 + }, + { + "epoch": 
5.444137507673419, + "grad_norm": 0.2896713614463806, + "learning_rate": 4.5250346605158964e-05, + "loss": 1.7168, + "step": 17737 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.30870527029037476, + "learning_rate": 4.524539854597686e-05, + "loss": 1.704, + "step": 17738 + }, + { + "epoch": 5.44475138121547, + "grad_norm": 0.2476932406425476, + "learning_rate": 4.524045053378254e-05, + "loss": 1.7649, + "step": 17739 + }, + { + "epoch": 5.445058317986494, + "grad_norm": 0.2937077283859253, + "learning_rate": 4.5235502568624855e-05, + "loss": 1.7028, + "step": 17740 + }, + { + "epoch": 5.44536525475752, + "grad_norm": 0.22881117463111877, + "learning_rate": 4.523055465055273e-05, + "loss": 1.7539, + "step": 17741 + }, + { + "epoch": 5.445672191528545, + "grad_norm": 0.2551842927932739, + "learning_rate": 4.522560677961508e-05, + "loss": 1.7601, + "step": 17742 + }, + { + "epoch": 5.44597912829957, + "grad_norm": 0.27533504366874695, + "learning_rate": 4.5220658955860754e-05, + "loss": 1.7695, + "step": 17743 + }, + { + "epoch": 5.446286065070596, + "grad_norm": 0.23387418687343597, + "learning_rate": 4.5215711179338706e-05, + "loss": 1.7218, + "step": 17744 + }, + { + "epoch": 5.44659300184162, + "grad_norm": 0.37932485342025757, + "learning_rate": 4.521076345009777e-05, + "loss": 1.7685, + "step": 17745 + }, + { + "epoch": 5.4468999386126455, + "grad_norm": 0.2668898105621338, + "learning_rate": 4.520581576818691e-05, + "loss": 1.7217, + "step": 17746 + }, + { + "epoch": 5.447206875383671, + "grad_norm": 0.2417856752872467, + "learning_rate": 4.520086813365496e-05, + "loss": 1.692, + "step": 17747 + }, + { + "epoch": 5.447513812154696, + "grad_norm": 0.3170008063316345, + "learning_rate": 4.519592054655086e-05, + "loss": 1.7565, + "step": 17748 + }, + { + "epoch": 5.4478207489257215, + "grad_norm": 0.20711660385131836, + "learning_rate": 4.519097300692348e-05, + "loss": 1.6708, + "step": 17749 + }, + { + "epoch": 5.448127685696747, + "grad_norm": 
0.2196272760629654, + "learning_rate": 4.5186025514821746e-05, + "loss": 1.7335, + "step": 17750 + }, + { + "epoch": 5.448434622467771, + "grad_norm": 0.27563074231147766, + "learning_rate": 4.5181078070294505e-05, + "loss": 1.7383, + "step": 17751 + }, + { + "epoch": 5.448741559238797, + "grad_norm": 0.185418501496315, + "learning_rate": 4.517613067339068e-05, + "loss": 1.6841, + "step": 17752 + }, + { + "epoch": 5.449048496009822, + "grad_norm": 0.26787856221199036, + "learning_rate": 4.517118332415915e-05, + "loss": 1.7733, + "step": 17753 + }, + { + "epoch": 5.449355432780847, + "grad_norm": 0.22114823758602142, + "learning_rate": 4.516623602264885e-05, + "loss": 1.7153, + "step": 17754 + }, + { + "epoch": 5.449662369551873, + "grad_norm": 0.23090483248233795, + "learning_rate": 4.51612887689086e-05, + "loss": 1.7063, + "step": 17755 + }, + { + "epoch": 5.449969306322897, + "grad_norm": 0.3227362632751465, + "learning_rate": 4.515634156298736e-05, + "loss": 1.7528, + "step": 17756 + }, + { + "epoch": 5.4502762430939224, + "grad_norm": 0.24202494323253632, + "learning_rate": 4.515139440493397e-05, + "loss": 1.8119, + "step": 17757 + }, + { + "epoch": 5.450583179864948, + "grad_norm": 0.3778383731842041, + "learning_rate": 4.5146447294797356e-05, + "loss": 1.7589, + "step": 17758 + }, + { + "epoch": 5.450890116635973, + "grad_norm": 0.3726772964000702, + "learning_rate": 4.51415002326264e-05, + "loss": 1.7095, + "step": 17759 + }, + { + "epoch": 5.4511970534069984, + "grad_norm": 0.2424323409795761, + "learning_rate": 4.5136553218469966e-05, + "loss": 1.7374, + "step": 17760 + }, + { + "epoch": 5.451503990178024, + "grad_norm": 0.4347550570964813, + "learning_rate": 4.513160625237699e-05, + "loss": 1.8339, + "step": 17761 + }, + { + "epoch": 5.451810926949048, + "grad_norm": 0.2556018829345703, + "learning_rate": 4.512665933439631e-05, + "loss": 1.7024, + "step": 17762 + }, + { + "epoch": 5.452117863720074, + "grad_norm": 0.36380240321159363, + "learning_rate": 
4.512171246457685e-05, + "loss": 1.7706, + "step": 17763 + }, + { + "epoch": 5.452424800491099, + "grad_norm": 0.42120790481567383, + "learning_rate": 4.5116765642967476e-05, + "loss": 1.7609, + "step": 17764 + }, + { + "epoch": 5.452731737262124, + "grad_norm": 0.20573028922080994, + "learning_rate": 4.51118188696171e-05, + "loss": 1.7521, + "step": 17765 + }, + { + "epoch": 5.45303867403315, + "grad_norm": 0.39001402258872986, + "learning_rate": 4.510687214457458e-05, + "loss": 1.7097, + "step": 17766 + }, + { + "epoch": 5.453345610804174, + "grad_norm": 0.2778739333152771, + "learning_rate": 4.510192546788884e-05, + "loss": 1.7677, + "step": 17767 + }, + { + "epoch": 5.453652547575199, + "grad_norm": 0.2500934600830078, + "learning_rate": 4.509697883960872e-05, + "loss": 1.7322, + "step": 17768 + }, + { + "epoch": 5.453959484346225, + "grad_norm": 0.23733557760715485, + "learning_rate": 4.509203225978314e-05, + "loss": 1.7426, + "step": 17769 + }, + { + "epoch": 5.45426642111725, + "grad_norm": 0.20033739507198334, + "learning_rate": 4.508708572846096e-05, + "loss": 1.7093, + "step": 17770 + }, + { + "epoch": 5.454573357888275, + "grad_norm": 0.202667698264122, + "learning_rate": 4.508213924569111e-05, + "loss": 1.6807, + "step": 17771 + }, + { + "epoch": 5.4548802946593, + "grad_norm": 0.1980566531419754, + "learning_rate": 4.507719281152241e-05, + "loss": 1.7102, + "step": 17772 + }, + { + "epoch": 5.455187231430325, + "grad_norm": 0.20612162351608276, + "learning_rate": 4.507224642600381e-05, + "loss": 1.7692, + "step": 17773 + }, + { + "epoch": 5.4554941682013505, + "grad_norm": 0.22859175503253937, + "learning_rate": 4.506730008918412e-05, + "loss": 1.7887, + "step": 17774 + }, + { + "epoch": 5.455801104972376, + "grad_norm": 0.19720709323883057, + "learning_rate": 4.5062353801112285e-05, + "loss": 1.7557, + "step": 17775 + }, + { + "epoch": 5.456108041743401, + "grad_norm": 0.23289217054843903, + "learning_rate": 4.505740756183717e-05, + "loss": 1.7023, + 
"step": 17776 + }, + { + "epoch": 5.456414978514426, + "grad_norm": 0.2120361477136612, + "learning_rate": 4.505246137140763e-05, + "loss": 1.7249, + "step": 17777 + }, + { + "epoch": 5.456721915285451, + "grad_norm": 0.2094341218471527, + "learning_rate": 4.504751522987259e-05, + "loss": 1.7586, + "step": 17778 + }, + { + "epoch": 5.457028852056476, + "grad_norm": 0.22361092269420624, + "learning_rate": 4.504256913728088e-05, + "loss": 1.737, + "step": 17779 + }, + { + "epoch": 5.457335788827502, + "grad_norm": 0.2100353240966797, + "learning_rate": 4.5037623093681424e-05, + "loss": 1.704, + "step": 17780 + }, + { + "epoch": 5.457642725598527, + "grad_norm": 0.20550231635570526, + "learning_rate": 4.503267709912308e-05, + "loss": 1.7732, + "step": 17781 + }, + { + "epoch": 5.457949662369552, + "grad_norm": 0.22843749821186066, + "learning_rate": 4.502773115365474e-05, + "loss": 1.6916, + "step": 17782 + }, + { + "epoch": 5.458256599140577, + "grad_norm": 0.2351907640695572, + "learning_rate": 4.502278525732526e-05, + "loss": 1.8043, + "step": 17783 + }, + { + "epoch": 5.458563535911602, + "grad_norm": 0.271028071641922, + "learning_rate": 4.501783941018355e-05, + "loss": 1.7665, + "step": 17784 + }, + { + "epoch": 5.458870472682627, + "grad_norm": 0.1974802166223526, + "learning_rate": 4.501289361227846e-05, + "loss": 1.718, + "step": 17785 + }, + { + "epoch": 5.459177409453653, + "grad_norm": 0.23726068437099457, + "learning_rate": 4.5007947863658884e-05, + "loss": 1.7507, + "step": 17786 + }, + { + "epoch": 5.459484346224678, + "grad_norm": 0.2112259715795517, + "learning_rate": 4.5003002164373684e-05, + "loss": 1.8116, + "step": 17787 + }, + { + "epoch": 5.4597912829957025, + "grad_norm": 0.2676105201244354, + "learning_rate": 4.4998056514471764e-05, + "loss": 1.7013, + "step": 17788 + }, + { + "epoch": 5.460098219766728, + "grad_norm": 0.2735576033592224, + "learning_rate": 4.4993110914001956e-05, + "loss": 1.7516, + "step": 17789 + }, + { + "epoch": 
5.460405156537753, + "grad_norm": 0.1925152987241745, + "learning_rate": 4.498816536301319e-05, + "loss": 1.7018, + "step": 17790 + }, + { + "epoch": 5.4607120933087785, + "grad_norm": 0.25037717819213867, + "learning_rate": 4.498321986155429e-05, + "loss": 1.7207, + "step": 17791 + }, + { + "epoch": 5.461019030079804, + "grad_norm": 0.20481008291244507, + "learning_rate": 4.497827440967415e-05, + "loss": 1.6988, + "step": 17792 + }, + { + "epoch": 5.461325966850829, + "grad_norm": 0.19434049725532532, + "learning_rate": 4.4973329007421673e-05, + "loss": 1.7363, + "step": 17793 + }, + { + "epoch": 5.461632903621854, + "grad_norm": 0.21797434985637665, + "learning_rate": 4.496838365484567e-05, + "loss": 1.7218, + "step": 17794 + }, + { + "epoch": 5.461939840392879, + "grad_norm": 0.18477453291416168, + "learning_rate": 4.496343835199508e-05, + "loss": 1.7204, + "step": 17795 + }, + { + "epoch": 5.462246777163904, + "grad_norm": 0.21657803654670715, + "learning_rate": 4.495849309891872e-05, + "loss": 1.7671, + "step": 17796 + }, + { + "epoch": 5.46255371393493, + "grad_norm": 0.21027342975139618, + "learning_rate": 4.495354789566549e-05, + "loss": 1.7424, + "step": 17797 + }, + { + "epoch": 5.462860650705955, + "grad_norm": 0.2016189992427826, + "learning_rate": 4.4948602742284256e-05, + "loss": 1.7706, + "step": 17798 + }, + { + "epoch": 5.463167587476979, + "grad_norm": 0.2155935913324356, + "learning_rate": 4.494365763882391e-05, + "loss": 1.7314, + "step": 17799 + }, + { + "epoch": 5.463474524248005, + "grad_norm": 0.22079701721668243, + "learning_rate": 4.493871258533328e-05, + "loss": 1.7938, + "step": 17800 + }, + { + "epoch": 5.46378146101903, + "grad_norm": 0.1907699704170227, + "learning_rate": 4.4933767581861283e-05, + "loss": 1.6958, + "step": 17801 + }, + { + "epoch": 5.464088397790055, + "grad_norm": 0.2784879207611084, + "learning_rate": 4.4928822628456735e-05, + "loss": 1.7285, + "step": 17802 + }, + { + "epoch": 5.464395334561081, + "grad_norm": 
0.29470255970954895, + "learning_rate": 4.492387772516855e-05, + "loss": 1.7363, + "step": 17803 + }, + { + "epoch": 5.464702271332105, + "grad_norm": 0.21387436985969543, + "learning_rate": 4.4918932872045575e-05, + "loss": 1.7414, + "step": 17804 + }, + { + "epoch": 5.4650092081031305, + "grad_norm": 0.3102552890777588, + "learning_rate": 4.49139880691367e-05, + "loss": 1.7359, + "step": 17805 + }, + { + "epoch": 5.465316144874156, + "grad_norm": 0.2312939465045929, + "learning_rate": 4.490904331649075e-05, + "loss": 1.7609, + "step": 17806 + }, + { + "epoch": 5.465623081645181, + "grad_norm": 0.323913037776947, + "learning_rate": 4.4904098614156645e-05, + "loss": 1.7693, + "step": 17807 + }, + { + "epoch": 5.4659300184162065, + "grad_norm": 0.2975599467754364, + "learning_rate": 4.48991539621832e-05, + "loss": 1.7506, + "step": 17808 + }, + { + "epoch": 5.466236955187231, + "grad_norm": 0.24702571332454681, + "learning_rate": 4.4894209360619316e-05, + "loss": 1.8258, + "step": 17809 + }, + { + "epoch": 5.466543891958256, + "grad_norm": 0.29016581177711487, + "learning_rate": 4.488926480951386e-05, + "loss": 1.7096, + "step": 17810 + }, + { + "epoch": 5.466850828729282, + "grad_norm": 0.2194555252790451, + "learning_rate": 4.488432030891566e-05, + "loss": 1.788, + "step": 17811 + }, + { + "epoch": 5.467157765500307, + "grad_norm": 0.2504041790962219, + "learning_rate": 4.487937585887363e-05, + "loss": 1.7672, + "step": 17812 + }, + { + "epoch": 5.467464702271332, + "grad_norm": 0.2362445741891861, + "learning_rate": 4.487443145943659e-05, + "loss": 1.7426, + "step": 17813 + }, + { + "epoch": 5.467771639042358, + "grad_norm": 0.20075896382331848, + "learning_rate": 4.486948711065343e-05, + "loss": 1.7406, + "step": 17814 + }, + { + "epoch": 5.468078575813382, + "grad_norm": 0.2219153791666031, + "learning_rate": 4.486454281257299e-05, + "loss": 1.683, + "step": 17815 + }, + { + "epoch": 5.468385512584407, + "grad_norm": 0.22551953792572021, + "learning_rate": 
4.4859598565244176e-05, + "loss": 1.7896, + "step": 17816 + }, + { + "epoch": 5.468692449355433, + "grad_norm": 0.2385476976633072, + "learning_rate": 4.48546543687158e-05, + "loss": 1.7799, + "step": 17817 + }, + { + "epoch": 5.468999386126458, + "grad_norm": 0.24263370037078857, + "learning_rate": 4.4849710223036764e-05, + "loss": 1.682, + "step": 17818 + }, + { + "epoch": 5.469306322897483, + "grad_norm": 0.24301160871982574, + "learning_rate": 4.484476612825589e-05, + "loss": 1.8121, + "step": 17819 + }, + { + "epoch": 5.469613259668508, + "grad_norm": 0.2516932487487793, + "learning_rate": 4.483982208442207e-05, + "loss": 1.7344, + "step": 17820 + }, + { + "epoch": 5.469920196439533, + "grad_norm": 0.24309395253658295, + "learning_rate": 4.4834878091584156e-05, + "loss": 1.7746, + "step": 17821 + }, + { + "epoch": 5.4702271332105585, + "grad_norm": 0.24711866676807404, + "learning_rate": 4.4829934149790996e-05, + "loss": 1.7887, + "step": 17822 + }, + { + "epoch": 5.470534069981584, + "grad_norm": 0.2923797369003296, + "learning_rate": 4.4824990259091445e-05, + "loss": 1.7017, + "step": 17823 + }, + { + "epoch": 5.470841006752609, + "grad_norm": 0.21658629179000854, + "learning_rate": 4.482004641953441e-05, + "loss": 1.725, + "step": 17824 + }, + { + "epoch": 5.4711479435236345, + "grad_norm": 0.233424574136734, + "learning_rate": 4.481510263116868e-05, + "loss": 1.74, + "step": 17825 + }, + { + "epoch": 5.471454880294659, + "grad_norm": 0.28997600078582764, + "learning_rate": 4.481015889404315e-05, + "loss": 1.8418, + "step": 17826 + }, + { + "epoch": 5.471761817065684, + "grad_norm": 0.2245558649301529, + "learning_rate": 4.480521520820669e-05, + "loss": 1.7519, + "step": 17827 + }, + { + "epoch": 5.47206875383671, + "grad_norm": 0.21008887887001038, + "learning_rate": 4.480027157370812e-05, + "loss": 1.6977, + "step": 17828 + }, + { + "epoch": 5.472375690607735, + "grad_norm": 0.1990261971950531, + "learning_rate": 4.479532799059633e-05, + "loss": 1.7004, + 
"step": 17829 + }, + { + "epoch": 5.47268262737876, + "grad_norm": 0.2354540079832077, + "learning_rate": 4.479038445892014e-05, + "loss": 1.7755, + "step": 17830 + }, + { + "epoch": 5.472989564149785, + "grad_norm": 0.21904973685741425, + "learning_rate": 4.478544097872843e-05, + "loss": 1.8328, + "step": 17831 + }, + { + "epoch": 5.47329650092081, + "grad_norm": 0.21188503503799438, + "learning_rate": 4.4780497550070055e-05, + "loss": 1.7105, + "step": 17832 + }, + { + "epoch": 5.473603437691835, + "grad_norm": 0.2196870595216751, + "learning_rate": 4.477555417299386e-05, + "loss": 1.7261, + "step": 17833 + }, + { + "epoch": 5.473910374462861, + "grad_norm": 0.24522331357002258, + "learning_rate": 4.477061084754869e-05, + "loss": 1.8101, + "step": 17834 + }, + { + "epoch": 5.474217311233886, + "grad_norm": 0.24073927104473114, + "learning_rate": 4.476566757378343e-05, + "loss": 1.8295, + "step": 17835 + }, + { + "epoch": 5.474524248004911, + "grad_norm": 0.3724605143070221, + "learning_rate": 4.476072435174689e-05, + "loss": 1.7785, + "step": 17836 + }, + { + "epoch": 5.474831184775936, + "grad_norm": 0.25552257895469666, + "learning_rate": 4.475578118148797e-05, + "loss": 1.6978, + "step": 17837 + }, + { + "epoch": 5.475138121546961, + "grad_norm": 0.22402255237102509, + "learning_rate": 4.475083806305546e-05, + "loss": 1.697, + "step": 17838 + }, + { + "epoch": 5.475445058317987, + "grad_norm": 0.25869324803352356, + "learning_rate": 4.474589499649826e-05, + "loss": 1.7026, + "step": 17839 + }, + { + "epoch": 5.475751995089012, + "grad_norm": 0.249742329120636, + "learning_rate": 4.47409519818652e-05, + "loss": 1.7738, + "step": 17840 + }, + { + "epoch": 5.476058931860037, + "grad_norm": 0.28722140192985535, + "learning_rate": 4.473600901920515e-05, + "loss": 1.7555, + "step": 17841 + }, + { + "epoch": 5.476365868631062, + "grad_norm": 0.250964879989624, + "learning_rate": 4.4731066108566926e-05, + "loss": 1.6951, + "step": 17842 + }, + { + "epoch": 
5.476672805402087, + "grad_norm": 0.20562006533145905, + "learning_rate": 4.472612324999942e-05, + "loss": 1.7109, + "step": 17843 + }, + { + "epoch": 5.476979742173112, + "grad_norm": 0.26964858174324036, + "learning_rate": 4.472118044355144e-05, + "loss": 1.7468, + "step": 17844 + }, + { + "epoch": 5.477286678944138, + "grad_norm": 0.25700438022613525, + "learning_rate": 4.471623768927184e-05, + "loss": 1.7046, + "step": 17845 + }, + { + "epoch": 5.477593615715163, + "grad_norm": 0.2152809500694275, + "learning_rate": 4.47112949872095e-05, + "loss": 1.7464, + "step": 17846 + }, + { + "epoch": 5.4779005524861875, + "grad_norm": 0.26429688930511475, + "learning_rate": 4.470635233741321e-05, + "loss": 1.7629, + "step": 17847 + }, + { + "epoch": 5.478207489257213, + "grad_norm": 0.18546637892723083, + "learning_rate": 4.470140973993188e-05, + "loss": 1.7143, + "step": 17848 + }, + { + "epoch": 5.478514426028238, + "grad_norm": 0.1927761435508728, + "learning_rate": 4.46964671948143e-05, + "loss": 1.6919, + "step": 17849 + }, + { + "epoch": 5.4788213627992635, + "grad_norm": 0.21581199765205383, + "learning_rate": 4.469152470210935e-05, + "loss": 1.7596, + "step": 17850 + }, + { + "epoch": 5.479128299570289, + "grad_norm": 0.20244133472442627, + "learning_rate": 4.468658226186586e-05, + "loss": 1.7372, + "step": 17851 + }, + { + "epoch": 5.479435236341313, + "grad_norm": 0.2467198520898819, + "learning_rate": 4.468163987413269e-05, + "loss": 1.7361, + "step": 17852 + }, + { + "epoch": 5.479742173112339, + "grad_norm": 0.22134411334991455, + "learning_rate": 4.467669753895866e-05, + "loss": 1.7276, + "step": 17853 + }, + { + "epoch": 5.480049109883364, + "grad_norm": 0.1953750103712082, + "learning_rate": 4.4671755256392636e-05, + "loss": 1.6931, + "step": 17854 + }, + { + "epoch": 5.480356046654389, + "grad_norm": 0.21492068469524384, + "learning_rate": 4.466681302648343e-05, + "loss": 1.7437, + "step": 17855 + }, + { + "epoch": 5.480662983425415, + "grad_norm": 
0.24377848207950592, + "learning_rate": 4.466187084927993e-05, + "loss": 1.7869, + "step": 17856 + }, + { + "epoch": 5.48096992019644, + "grad_norm": 0.23674219846725464, + "learning_rate": 4.465692872483093e-05, + "loss": 1.8142, + "step": 17857 + }, + { + "epoch": 5.481276856967464, + "grad_norm": 0.25036486983299255, + "learning_rate": 4.4651986653185304e-05, + "loss": 1.8075, + "step": 17858 + }, + { + "epoch": 5.48158379373849, + "grad_norm": 0.32649150490760803, + "learning_rate": 4.4647044634391867e-05, + "loss": 1.7177, + "step": 17859 + }, + { + "epoch": 5.481890730509515, + "grad_norm": 0.20300604403018951, + "learning_rate": 4.46421026684995e-05, + "loss": 1.6912, + "step": 17860 + }, + { + "epoch": 5.48219766728054, + "grad_norm": 0.24630679190158844, + "learning_rate": 4.4637160755557e-05, + "loss": 1.8312, + "step": 17861 + }, + { + "epoch": 5.482504604051566, + "grad_norm": 0.2263093739748001, + "learning_rate": 4.46322188956132e-05, + "loss": 1.7214, + "step": 17862 + }, + { + "epoch": 5.48281154082259, + "grad_norm": 0.22949177026748657, + "learning_rate": 4.462727708871699e-05, + "loss": 1.6882, + "step": 17863 + }, + { + "epoch": 5.4831184775936155, + "grad_norm": 0.23389381170272827, + "learning_rate": 4.4622335334917156e-05, + "loss": 1.7613, + "step": 17864 + }, + { + "epoch": 5.483425414364641, + "grad_norm": 0.2259683907032013, + "learning_rate": 4.461739363426257e-05, + "loss": 1.7021, + "step": 17865 + }, + { + "epoch": 5.483732351135666, + "grad_norm": 0.3213486969470978, + "learning_rate": 4.4612451986802036e-05, + "loss": 1.7469, + "step": 17866 + }, + { + "epoch": 5.4840392879066915, + "grad_norm": 0.3415670096874237, + "learning_rate": 4.4607510392584426e-05, + "loss": 1.7605, + "step": 17867 + }, + { + "epoch": 5.484346224677717, + "grad_norm": 0.2079494297504425, + "learning_rate": 4.460256885165855e-05, + "loss": 1.7832, + "step": 17868 + }, + { + "epoch": 5.484653161448741, + "grad_norm": 0.30334988236427307, + "learning_rate": 
4.459762736407327e-05, + "loss": 1.6825, + "step": 17869 + }, + { + "epoch": 5.484960098219767, + "grad_norm": 0.22320730984210968, + "learning_rate": 4.4592685929877374e-05, + "loss": 1.7452, + "step": 17870 + }, + { + "epoch": 5.485267034990792, + "grad_norm": 0.25325682759284973, + "learning_rate": 4.458774454911975e-05, + "loss": 1.7359, + "step": 17871 + }, + { + "epoch": 5.485573971761817, + "grad_norm": 0.305501788854599, + "learning_rate": 4.458280322184919e-05, + "loss": 1.7161, + "step": 17872 + }, + { + "epoch": 5.485880908532843, + "grad_norm": 0.19486182928085327, + "learning_rate": 4.457786194811455e-05, + "loss": 1.7097, + "step": 17873 + }, + { + "epoch": 5.486187845303867, + "grad_norm": 0.3306363821029663, + "learning_rate": 4.457292072796465e-05, + "loss": 1.7653, + "step": 17874 + }, + { + "epoch": 5.486494782074892, + "grad_norm": 0.25172874331474304, + "learning_rate": 4.456797956144835e-05, + "loss": 1.7289, + "step": 17875 + }, + { + "epoch": 5.486801718845918, + "grad_norm": 0.24508661031723022, + "learning_rate": 4.456303844861444e-05, + "loss": 1.7255, + "step": 17876 + }, + { + "epoch": 5.487108655616943, + "grad_norm": 0.3043360114097595, + "learning_rate": 4.455809738951178e-05, + "loss": 1.7852, + "step": 17877 + }, + { + "epoch": 5.487415592387968, + "grad_norm": 0.22181758284568787, + "learning_rate": 4.4553156384189186e-05, + "loss": 1.7887, + "step": 17878 + }, + { + "epoch": 5.487722529158993, + "grad_norm": 0.2174321413040161, + "learning_rate": 4.454821543269549e-05, + "loss": 1.7024, + "step": 17879 + }, + { + "epoch": 5.488029465930018, + "grad_norm": 0.19634750485420227, + "learning_rate": 4.4543274535079535e-05, + "loss": 1.7451, + "step": 17880 + }, + { + "epoch": 5.4883364027010435, + "grad_norm": 0.20481908321380615, + "learning_rate": 4.4538333691390125e-05, + "loss": 1.7068, + "step": 17881 + }, + { + "epoch": 5.488643339472069, + "grad_norm": 0.2025458663702011, + "learning_rate": 4.453339290167612e-05, + "loss": 
1.72, + "step": 17882 + }, + { + "epoch": 5.488950276243094, + "grad_norm": 0.21013019979000092, + "learning_rate": 4.452845216598632e-05, + "loss": 1.7113, + "step": 17883 + }, + { + "epoch": 5.4892572130141195, + "grad_norm": 0.2057499885559082, + "learning_rate": 4.452351148436956e-05, + "loss": 1.7007, + "step": 17884 + }, + { + "epoch": 5.489564149785144, + "grad_norm": 0.19957664608955383, + "learning_rate": 4.4518570856874666e-05, + "loss": 1.6999, + "step": 17885 + }, + { + "epoch": 5.489871086556169, + "grad_norm": 0.22609412670135498, + "learning_rate": 4.451363028355048e-05, + "loss": 1.8124, + "step": 17886 + }, + { + "epoch": 5.490178023327195, + "grad_norm": 0.27350863814353943, + "learning_rate": 4.4508689764445805e-05, + "loss": 1.8042, + "step": 17887 + }, + { + "epoch": 5.49048496009822, + "grad_norm": 0.23416854441165924, + "learning_rate": 4.450374929960949e-05, + "loss": 1.7607, + "step": 17888 + }, + { + "epoch": 5.490791896869245, + "grad_norm": 0.2891421318054199, + "learning_rate": 4.449880888909033e-05, + "loss": 1.7419, + "step": 17889 + }, + { + "epoch": 5.49109883364027, + "grad_norm": 0.2458745837211609, + "learning_rate": 4.449386853293717e-05, + "loss": 1.7234, + "step": 17890 + }, + { + "epoch": 5.491405770411295, + "grad_norm": 0.23390449583530426, + "learning_rate": 4.4488928231198826e-05, + "loss": 1.7482, + "step": 17891 + }, + { + "epoch": 5.49171270718232, + "grad_norm": 0.3509657084941864, + "learning_rate": 4.448398798392414e-05, + "loss": 1.7639, + "step": 17892 + }, + { + "epoch": 5.492019643953346, + "grad_norm": 0.2487955242395401, + "learning_rate": 4.4479047791161916e-05, + "loss": 1.7163, + "step": 17893 + }, + { + "epoch": 5.492326580724371, + "grad_norm": 0.22630274295806885, + "learning_rate": 4.4474107652960956e-05, + "loss": 1.7449, + "step": 17894 + }, + { + "epoch": 5.4926335174953955, + "grad_norm": 0.25909537076950073, + "learning_rate": 4.446916756937012e-05, + "loss": 1.7396, + "step": 17895 + }, + { + 
"epoch": 5.492940454266421, + "grad_norm": 0.29732683300971985, + "learning_rate": 4.446422754043819e-05, + "loss": 1.8109, + "step": 17896 + }, + { + "epoch": 5.493247391037446, + "grad_norm": 0.22436772286891937, + "learning_rate": 4.4459287566214035e-05, + "loss": 1.7657, + "step": 17897 + }, + { + "epoch": 5.4935543278084715, + "grad_norm": 0.24584892392158508, + "learning_rate": 4.445434764674643e-05, + "loss": 1.73, + "step": 17898 + }, + { + "epoch": 5.493861264579497, + "grad_norm": 0.27446454763412476, + "learning_rate": 4.444940778208423e-05, + "loss": 1.7428, + "step": 17899 + }, + { + "epoch": 5.494168201350522, + "grad_norm": 0.20442110300064087, + "learning_rate": 4.4444467972276215e-05, + "loss": 1.6911, + "step": 17900 + }, + { + "epoch": 5.494475138121547, + "grad_norm": 0.23089268803596497, + "learning_rate": 4.4439528217371236e-05, + "loss": 1.7192, + "step": 17901 + }, + { + "epoch": 5.494782074892572, + "grad_norm": 0.19402450323104858, + "learning_rate": 4.443458851741808e-05, + "loss": 1.7304, + "step": 17902 + }, + { + "epoch": 5.495089011663597, + "grad_norm": 0.2310219705104828, + "learning_rate": 4.442964887246561e-05, + "loss": 1.6963, + "step": 17903 + }, + { + "epoch": 5.495395948434623, + "grad_norm": 0.25573140382766724, + "learning_rate": 4.44247092825626e-05, + "loss": 1.7781, + "step": 17904 + }, + { + "epoch": 5.495702885205648, + "grad_norm": 0.20298753678798676, + "learning_rate": 4.4419769747757894e-05, + "loss": 1.763, + "step": 17905 + }, + { + "epoch": 5.496009821976672, + "grad_norm": 0.22243307530879974, + "learning_rate": 4.441483026810027e-05, + "loss": 1.7345, + "step": 17906 + }, + { + "epoch": 5.496316758747698, + "grad_norm": 0.19801411032676697, + "learning_rate": 4.4409890843638584e-05, + "loss": 1.7504, + "step": 17907 + }, + { + "epoch": 5.496623695518723, + "grad_norm": 0.2804374396800995, + "learning_rate": 4.440495147442162e-05, + "loss": 1.7985, + "step": 17908 + }, + { + "epoch": 5.496930632289748, + 
"grad_norm": 0.21824021637439728, + "learning_rate": 4.440001216049822e-05, + "loss": 1.6703, + "step": 17909 + }, + { + "epoch": 5.497237569060774, + "grad_norm": 0.23335935175418854, + "learning_rate": 4.439507290191719e-05, + "loss": 1.7426, + "step": 17910 + }, + { + "epoch": 5.497544505831799, + "grad_norm": 0.2093769609928131, + "learning_rate": 4.4390133698727315e-05, + "loss": 1.7178, + "step": 17911 + }, + { + "epoch": 5.4978514426028235, + "grad_norm": 0.18354324996471405, + "learning_rate": 4.438519455097743e-05, + "loss": 1.6849, + "step": 17912 + }, + { + "epoch": 5.498158379373849, + "grad_norm": 0.26826491951942444, + "learning_rate": 4.438025545871633e-05, + "loss": 1.7804, + "step": 17913 + }, + { + "epoch": 5.498465316144874, + "grad_norm": 0.29171738028526306, + "learning_rate": 4.437531642199288e-05, + "loss": 1.764, + "step": 17914 + }, + { + "epoch": 5.4987722529158995, + "grad_norm": 0.17870590090751648, + "learning_rate": 4.437037744085581e-05, + "loss": 1.6789, + "step": 17915 + }, + { + "epoch": 5.499079189686925, + "grad_norm": 0.25412192940711975, + "learning_rate": 4.4365438515354e-05, + "loss": 1.7536, + "step": 17916 + }, + { + "epoch": 5.499386126457949, + "grad_norm": 0.24465163052082062, + "learning_rate": 4.4360499645536203e-05, + "loss": 1.7582, + "step": 17917 + }, + { + "epoch": 5.499693063228975, + "grad_norm": 0.21248452365398407, + "learning_rate": 4.4355560831451264e-05, + "loss": 1.7209, + "step": 17918 + }, + { + "epoch": 5.5, + "grad_norm": 0.21018685400485992, + "learning_rate": 4.435062207314797e-05, + "loss": 1.7461, + "step": 17919 + }, + { + "epoch": 5.500306936771025, + "grad_norm": 0.1880551278591156, + "learning_rate": 4.434568337067517e-05, + "loss": 1.6818, + "step": 17920 + }, + { + "epoch": 5.500613873542051, + "grad_norm": 0.2224894016981125, + "learning_rate": 4.434074472408161e-05, + "loss": 1.8211, + "step": 17921 + }, + { + "epoch": 5.500920810313076, + "grad_norm": 0.19419749081134796, + 
"learning_rate": 4.433580613341615e-05, + "loss": 1.7625, + "step": 17922 + }, + { + "epoch": 5.5012277470841005, + "grad_norm": 0.2167430967092514, + "learning_rate": 4.433086759872756e-05, + "loss": 1.745, + "step": 17923 + }, + { + "epoch": 5.501534683855126, + "grad_norm": 0.1926383525133133, + "learning_rate": 4.4325929120064665e-05, + "loss": 1.7353, + "step": 17924 + }, + { + "epoch": 5.501841620626151, + "grad_norm": 0.22943224012851715, + "learning_rate": 4.432099069747625e-05, + "loss": 1.6903, + "step": 17925 + }, + { + "epoch": 5.5021485573971765, + "grad_norm": 0.18218693137168884, + "learning_rate": 4.431605233101116e-05, + "loss": 1.742, + "step": 17926 + }, + { + "epoch": 5.502455494168201, + "grad_norm": 0.2660788893699646, + "learning_rate": 4.431111402071817e-05, + "loss": 1.7208, + "step": 17927 + }, + { + "epoch": 5.502762430939226, + "grad_norm": 0.20015788078308105, + "learning_rate": 4.430617576664606e-05, + "loss": 1.721, + "step": 17928 + }, + { + "epoch": 5.503069367710252, + "grad_norm": 0.20011179149150848, + "learning_rate": 4.430123756884368e-05, + "loss": 1.7488, + "step": 17929 + }, + { + "epoch": 5.503376304481277, + "grad_norm": 0.22541452944278717, + "learning_rate": 4.429629942735979e-05, + "loss": 1.7997, + "step": 17930 + }, + { + "epoch": 5.503683241252302, + "grad_norm": 0.21067193150520325, + "learning_rate": 4.4291361342243236e-05, + "loss": 1.6652, + "step": 17931 + }, + { + "epoch": 5.503990178023328, + "grad_norm": 0.38401395082473755, + "learning_rate": 4.428642331354278e-05, + "loss": 1.815, + "step": 17932 + }, + { + "epoch": 5.504297114794352, + "grad_norm": 0.22600100934505463, + "learning_rate": 4.428148534130725e-05, + "loss": 1.7593, + "step": 17933 + }, + { + "epoch": 5.504604051565377, + "grad_norm": 0.21340666711330414, + "learning_rate": 4.427654742558542e-05, + "loss": 1.7447, + "step": 17934 + }, + { + "epoch": 5.504910988336403, + "grad_norm": 0.20676501095294952, + "learning_rate": 4.427160956642611e-05, 
+ "loss": 1.7174, + "step": 17935 + }, + { + "epoch": 5.505217925107428, + "grad_norm": 0.2374252825975418, + "learning_rate": 4.42666717638781e-05, + "loss": 1.703, + "step": 17936 + }, + { + "epoch": 5.505524861878453, + "grad_norm": 0.20975756645202637, + "learning_rate": 4.426173401799022e-05, + "loss": 1.7076, + "step": 17937 + }, + { + "epoch": 5.505831798649478, + "grad_norm": 0.23778517544269562, + "learning_rate": 4.4256796328811226e-05, + "loss": 1.7647, + "step": 17938 + }, + { + "epoch": 5.506138735420503, + "grad_norm": 0.2088557481765747, + "learning_rate": 4.425185869638996e-05, + "loss": 1.764, + "step": 17939 + }, + { + "epoch": 5.5064456721915285, + "grad_norm": 0.26953455805778503, + "learning_rate": 4.424692112077518e-05, + "loss": 1.7351, + "step": 17940 + }, + { + "epoch": 5.506752608962554, + "grad_norm": 0.2762589454650879, + "learning_rate": 4.42419836020157e-05, + "loss": 1.7051, + "step": 17941 + }, + { + "epoch": 5.507059545733579, + "grad_norm": 0.19611702859401703, + "learning_rate": 4.4237046140160306e-05, + "loss": 1.7445, + "step": 17942 + }, + { + "epoch": 5.5073664825046045, + "grad_norm": 0.2708270251750946, + "learning_rate": 4.4232108735257824e-05, + "loss": 1.7284, + "step": 17943 + }, + { + "epoch": 5.507673419275629, + "grad_norm": 0.24194146692752838, + "learning_rate": 4.422717138735701e-05, + "loss": 1.7302, + "step": 17944 + }, + { + "epoch": 5.507980356046654, + "grad_norm": 0.21558286249637604, + "learning_rate": 4.422223409650666e-05, + "loss": 1.7435, + "step": 17945 + }, + { + "epoch": 5.50828729281768, + "grad_norm": 0.1842707246541977, + "learning_rate": 4.4217296862755597e-05, + "loss": 1.6579, + "step": 17946 + }, + { + "epoch": 5.508594229588705, + "grad_norm": 0.20211941003799438, + "learning_rate": 4.4212359686152576e-05, + "loss": 1.8017, + "step": 17947 + }, + { + "epoch": 5.50890116635973, + "grad_norm": 0.23749016225337982, + "learning_rate": 4.420742256674644e-05, + "loss": 1.6721, + "step": 17948 + }, + 
{ + "epoch": 5.509208103130755, + "grad_norm": 0.2076852172613144, + "learning_rate": 4.420248550458592e-05, + "loss": 1.7102, + "step": 17949 + }, + { + "epoch": 5.50951503990178, + "grad_norm": 0.2599447965621948, + "learning_rate": 4.419754849971986e-05, + "loss": 1.7819, + "step": 17950 + }, + { + "epoch": 5.509821976672805, + "grad_norm": 0.2017187476158142, + "learning_rate": 4.4192611552197e-05, + "loss": 1.6812, + "step": 17951 + }, + { + "epoch": 5.510128913443831, + "grad_norm": 0.21972116827964783, + "learning_rate": 4.418767466206617e-05, + "loss": 1.7122, + "step": 17952 + }, + { + "epoch": 5.510435850214856, + "grad_norm": 0.21750569343566895, + "learning_rate": 4.418273782937613e-05, + "loss": 1.7285, + "step": 17953 + }, + { + "epoch": 5.510742786985881, + "grad_norm": 0.19349125027656555, + "learning_rate": 4.417780105417572e-05, + "loss": 1.7383, + "step": 17954 + }, + { + "epoch": 5.511049723756906, + "grad_norm": 0.2094268798828125, + "learning_rate": 4.417286433651366e-05, + "loss": 1.7107, + "step": 17955 + }, + { + "epoch": 5.511356660527931, + "grad_norm": 0.2684331238269806, + "learning_rate": 4.41679276764388e-05, + "loss": 1.7336, + "step": 17956 + }, + { + "epoch": 5.5116635972989565, + "grad_norm": 0.27616915106773376, + "learning_rate": 4.416299107399987e-05, + "loss": 1.7439, + "step": 17957 + }, + { + "epoch": 5.511970534069982, + "grad_norm": 0.23874540627002716, + "learning_rate": 4.415805452924569e-05, + "loss": 1.7979, + "step": 17958 + }, + { + "epoch": 5.512277470841006, + "grad_norm": 0.21870921552181244, + "learning_rate": 4.415311804222503e-05, + "loss": 1.6674, + "step": 17959 + }, + { + "epoch": 5.512584407612032, + "grad_norm": 0.23042429983615875, + "learning_rate": 4.414818161298671e-05, + "loss": 1.7588, + "step": 17960 + }, + { + "epoch": 5.512891344383057, + "grad_norm": 0.2957153916358948, + "learning_rate": 4.4143245241579486e-05, + "loss": 1.8412, + "step": 17961 + }, + { + "epoch": 5.513198281154082, + 
"grad_norm": 0.28292644023895264, + "learning_rate": 4.413830892805213e-05, + "loss": 1.7915, + "step": 17962 + }, + { + "epoch": 5.513505217925108, + "grad_norm": 0.26526281237602234, + "learning_rate": 4.413337267245344e-05, + "loss": 1.7199, + "step": 17963 + }, + { + "epoch": 5.513812154696133, + "grad_norm": 0.41243693232536316, + "learning_rate": 4.4128436474832204e-05, + "loss": 1.7419, + "step": 17964 + }, + { + "epoch": 5.514119091467157, + "grad_norm": 0.2747771739959717, + "learning_rate": 4.4123500335237214e-05, + "loss": 1.7449, + "step": 17965 + }, + { + "epoch": 5.514426028238183, + "grad_norm": 0.25944122672080994, + "learning_rate": 4.4118564253717216e-05, + "loss": 1.7667, + "step": 17966 + }, + { + "epoch": 5.514732965009208, + "grad_norm": 0.32558533549308777, + "learning_rate": 4.411362823032103e-05, + "loss": 1.7292, + "step": 17967 + }, + { + "epoch": 5.515039901780233, + "grad_norm": 0.20190958678722382, + "learning_rate": 4.4108692265097404e-05, + "loss": 1.7529, + "step": 17968 + }, + { + "epoch": 5.515346838551259, + "grad_norm": 0.35485807061195374, + "learning_rate": 4.410375635809514e-05, + "loss": 1.7335, + "step": 17969 + }, + { + "epoch": 5.515653775322283, + "grad_norm": 0.2670159935951233, + "learning_rate": 4.409882050936301e-05, + "loss": 1.6789, + "step": 17970 + }, + { + "epoch": 5.5159607120933085, + "grad_norm": 0.19106578826904297, + "learning_rate": 4.409388471894981e-05, + "loss": 1.708, + "step": 17971 + }, + { + "epoch": 5.516267648864334, + "grad_norm": 0.2707268297672272, + "learning_rate": 4.4088948986904286e-05, + "loss": 1.7917, + "step": 17972 + }, + { + "epoch": 5.516574585635359, + "grad_norm": 0.2329230159521103, + "learning_rate": 4.408401331327525e-05, + "loss": 1.7378, + "step": 17973 + }, + { + "epoch": 5.5168815224063845, + "grad_norm": 0.22164998948574066, + "learning_rate": 4.4079077698111436e-05, + "loss": 1.7287, + "step": 17974 + }, + { + "epoch": 5.51718845917741, + "grad_norm": 0.25895699858665466, 
+ "learning_rate": 4.4074142141461665e-05, + "loss": 1.7158, + "step": 17975 + }, + { + "epoch": 5.517495395948434, + "grad_norm": 0.2617860436439514, + "learning_rate": 4.4069206643374695e-05, + "loss": 1.7767, + "step": 17976 + }, + { + "epoch": 5.51780233271946, + "grad_norm": 0.20443588495254517, + "learning_rate": 4.40642712038993e-05, + "loss": 1.7371, + "step": 17977 + }, + { + "epoch": 5.518109269490485, + "grad_norm": 0.26251545548439026, + "learning_rate": 4.4059335823084266e-05, + "loss": 1.8154, + "step": 17978 + }, + { + "epoch": 5.51841620626151, + "grad_norm": 0.2315993458032608, + "learning_rate": 4.405440050097833e-05, + "loss": 1.7426, + "step": 17979 + }, + { + "epoch": 5.518723143032536, + "grad_norm": 0.19467706978321075, + "learning_rate": 4.404946523763031e-05, + "loss": 1.7418, + "step": 17980 + }, + { + "epoch": 5.51903007980356, + "grad_norm": 0.2387837916612625, + "learning_rate": 4.4044530033088946e-05, + "loss": 1.7648, + "step": 17981 + }, + { + "epoch": 5.519337016574585, + "grad_norm": 0.21097531914710999, + "learning_rate": 4.403959488740306e-05, + "loss": 1.7198, + "step": 17982 + }, + { + "epoch": 5.519643953345611, + "grad_norm": 0.22303247451782227, + "learning_rate": 4.403465980062136e-05, + "loss": 1.7679, + "step": 17983 + }, + { + "epoch": 5.519950890116636, + "grad_norm": 0.19705620408058167, + "learning_rate": 4.4029724772792666e-05, + "loss": 1.7747, + "step": 17984 + }, + { + "epoch": 5.520257826887661, + "grad_norm": 0.20864570140838623, + "learning_rate": 4.4024789803965715e-05, + "loss": 1.6797, + "step": 17985 + }, + { + "epoch": 5.520564763658687, + "grad_norm": 0.1917724758386612, + "learning_rate": 4.401985489418931e-05, + "loss": 1.7246, + "step": 17986 + }, + { + "epoch": 5.520871700429711, + "grad_norm": 0.25668975710868835, + "learning_rate": 4.401492004351219e-05, + "loss": 1.7245, + "step": 17987 + }, + { + "epoch": 5.5211786372007365, + "grad_norm": 0.22576093673706055, + "learning_rate": 
4.4009985251983146e-05, + "loss": 1.6766, + "step": 17988 + }, + { + "epoch": 5.521485573971762, + "grad_norm": 0.18614664673805237, + "learning_rate": 4.400505051965093e-05, + "loss": 1.7379, + "step": 17989 + }, + { + "epoch": 5.521792510742787, + "grad_norm": 0.21472783386707306, + "learning_rate": 4.4000115846564335e-05, + "loss": 1.7203, + "step": 17990 + }, + { + "epoch": 5.5220994475138125, + "grad_norm": 0.201142817735672, + "learning_rate": 4.39951812327721e-05, + "loss": 1.7049, + "step": 17991 + }, + { + "epoch": 5.522406384284837, + "grad_norm": 0.193614661693573, + "learning_rate": 4.3990246678323e-05, + "loss": 1.6938, + "step": 17992 + }, + { + "epoch": 5.522713321055862, + "grad_norm": 0.23343239724636078, + "learning_rate": 4.398531218326582e-05, + "loss": 1.744, + "step": 17993 + }, + { + "epoch": 5.523020257826888, + "grad_norm": 0.26271605491638184, + "learning_rate": 4.3980377747649305e-05, + "loss": 1.7458, + "step": 17994 + }, + { + "epoch": 5.523327194597913, + "grad_norm": 0.2048577219247818, + "learning_rate": 4.397544337152223e-05, + "loss": 1.763, + "step": 17995 + }, + { + "epoch": 5.523634131368938, + "grad_norm": 0.27748194336891174, + "learning_rate": 4.397050905493334e-05, + "loss": 1.7346, + "step": 17996 + }, + { + "epoch": 5.523941068139964, + "grad_norm": 0.3040253520011902, + "learning_rate": 4.3965574797931417e-05, + "loss": 1.7396, + "step": 17997 + }, + { + "epoch": 5.524248004910988, + "grad_norm": 0.3310317397117615, + "learning_rate": 4.396064060056523e-05, + "loss": 1.8094, + "step": 17998 + }, + { + "epoch": 5.524554941682013, + "grad_norm": 0.21845392882823944, + "learning_rate": 4.395570646288352e-05, + "loss": 1.7013, + "step": 17999 + }, + { + "epoch": 5.524861878453039, + "grad_norm": 0.319876492023468, + "learning_rate": 4.395077238493506e-05, + "loss": 1.7985, + "step": 18000 + }, + { + "epoch": 5.525168815224064, + "grad_norm": 0.28261950612068176, + "learning_rate": 4.394583836676863e-05, + "loss": 1.7979, + 
"step": 18001 + }, + { + "epoch": 5.525475751995089, + "grad_norm": 0.20874030888080597, + "learning_rate": 4.394090440843296e-05, + "loss": 1.7363, + "step": 18002 + }, + { + "epoch": 5.525782688766114, + "grad_norm": 0.28587406873703003, + "learning_rate": 4.393597050997684e-05, + "loss": 1.6787, + "step": 18003 + }, + { + "epoch": 5.526089625537139, + "grad_norm": 0.2719021439552307, + "learning_rate": 4.393103667144899e-05, + "loss": 1.7625, + "step": 18004 + }, + { + "epoch": 5.526396562308165, + "grad_norm": 0.22485414147377014, + "learning_rate": 4.392610289289821e-05, + "loss": 1.6847, + "step": 18005 + }, + { + "epoch": 5.52670349907919, + "grad_norm": 0.3500347435474396, + "learning_rate": 4.392116917437322e-05, + "loss": 1.7244, + "step": 18006 + }, + { + "epoch": 5.527010435850215, + "grad_norm": 0.26308783888816833, + "learning_rate": 4.3916235515922836e-05, + "loss": 1.7738, + "step": 18007 + }, + { + "epoch": 5.52731737262124, + "grad_norm": 0.27030646800994873, + "learning_rate": 4.391130191759574e-05, + "loss": 1.7149, + "step": 18008 + }, + { + "epoch": 5.527624309392265, + "grad_norm": 0.4137318730354309, + "learning_rate": 4.390636837944076e-05, + "loss": 1.7581, + "step": 18009 + }, + { + "epoch": 5.52793124616329, + "grad_norm": 0.2462068647146225, + "learning_rate": 4.390143490150659e-05, + "loss": 1.7767, + "step": 18010 + }, + { + "epoch": 5.528238182934316, + "grad_norm": 0.27424392104148865, + "learning_rate": 4.3896501483842036e-05, + "loss": 1.7701, + "step": 18011 + }, + { + "epoch": 5.528545119705341, + "grad_norm": 0.31268683075904846, + "learning_rate": 4.389156812649583e-05, + "loss": 1.7342, + "step": 18012 + }, + { + "epoch": 5.5288520564763655, + "grad_norm": 0.20428471267223358, + "learning_rate": 4.388663482951671e-05, + "loss": 1.7083, + "step": 18013 + }, + { + "epoch": 5.529158993247391, + "grad_norm": 0.322344034910202, + "learning_rate": 4.3881701592953475e-05, + "loss": 1.7423, + "step": 18014 + }, + { + "epoch": 
5.529465930018416, + "grad_norm": 0.2267894744873047, + "learning_rate": 4.387676841685483e-05, + "loss": 1.7309, + "step": 18015 + }, + { + "epoch": 5.5297728667894415, + "grad_norm": 0.23041954636573792, + "learning_rate": 4.387183530126955e-05, + "loss": 1.7352, + "step": 18016 + }, + { + "epoch": 5.530079803560467, + "grad_norm": 0.31139662861824036, + "learning_rate": 4.386690224624638e-05, + "loss": 1.7223, + "step": 18017 + }, + { + "epoch": 5.530386740331492, + "grad_norm": 0.20144063234329224, + "learning_rate": 4.38619692518341e-05, + "loss": 1.7607, + "step": 18018 + }, + { + "epoch": 5.530693677102517, + "grad_norm": 0.23812296986579895, + "learning_rate": 4.385703631808142e-05, + "loss": 1.7599, + "step": 18019 + }, + { + "epoch": 5.531000613873542, + "grad_norm": 0.2442231923341751, + "learning_rate": 4.385210344503712e-05, + "loss": 1.7094, + "step": 18020 + }, + { + "epoch": 5.531307550644567, + "grad_norm": 0.19497406482696533, + "learning_rate": 4.384717063274992e-05, + "loss": 1.7686, + "step": 18021 + }, + { + "epoch": 5.531614487415593, + "grad_norm": 0.29085835814476013, + "learning_rate": 4.38422378812686e-05, + "loss": 1.7454, + "step": 18022 + }, + { + "epoch": 5.531921424186618, + "grad_norm": 0.2701610028743744, + "learning_rate": 4.3837305190641876e-05, + "loss": 1.7376, + "step": 18023 + }, + { + "epoch": 5.532228360957642, + "grad_norm": 0.21232132613658905, + "learning_rate": 4.383237256091854e-05, + "loss": 1.7773, + "step": 18024 + }, + { + "epoch": 5.532535297728668, + "grad_norm": 0.24131610989570618, + "learning_rate": 4.382743999214729e-05, + "loss": 1.7899, + "step": 18025 + }, + { + "epoch": 5.532842234499693, + "grad_norm": 0.2752540409564972, + "learning_rate": 4.382250748437692e-05, + "loss": 1.7603, + "step": 18026 + }, + { + "epoch": 5.533149171270718, + "grad_norm": 0.2007865607738495, + "learning_rate": 4.381757503765613e-05, + "loss": 1.7553, + "step": 18027 + }, + { + "epoch": 5.533456108041744, + "grad_norm": 
0.23768723011016846, + "learning_rate": 4.38126426520337e-05, + "loss": 1.757, + "step": 18028 + }, + { + "epoch": 5.533763044812769, + "grad_norm": 0.22198502719402313, + "learning_rate": 4.3807710327558366e-05, + "loss": 1.7578, + "step": 18029 + }, + { + "epoch": 5.5340699815837935, + "grad_norm": 0.22432352602481842, + "learning_rate": 4.380277806427885e-05, + "loss": 1.75, + "step": 18030 + }, + { + "epoch": 5.534376918354819, + "grad_norm": 0.23029591143131256, + "learning_rate": 4.379784586224394e-05, + "loss": 1.7829, + "step": 18031 + }, + { + "epoch": 5.534683855125844, + "grad_norm": 0.23901896178722382, + "learning_rate": 4.379291372150232e-05, + "loss": 1.7461, + "step": 18032 + }, + { + "epoch": 5.5349907918968695, + "grad_norm": 0.20958681404590607, + "learning_rate": 4.378798164210278e-05, + "loss": 1.7224, + "step": 18033 + }, + { + "epoch": 5.535297728667894, + "grad_norm": 0.21619680523872375, + "learning_rate": 4.3783049624094036e-05, + "loss": 1.7605, + "step": 18034 + }, + { + "epoch": 5.535604665438919, + "grad_norm": 0.22988620400428772, + "learning_rate": 4.3778117667524867e-05, + "loss": 1.7668, + "step": 18035 + }, + { + "epoch": 5.535911602209945, + "grad_norm": 0.20107243955135345, + "learning_rate": 4.377318577244395e-05, + "loss": 1.7932, + "step": 18036 + }, + { + "epoch": 5.53621853898097, + "grad_norm": 0.25803956389427185, + "learning_rate": 4.376825393890009e-05, + "loss": 1.7409, + "step": 18037 + }, + { + "epoch": 5.536525475751995, + "grad_norm": 0.34292399883270264, + "learning_rate": 4.376332216694198e-05, + "loss": 1.8554, + "step": 18038 + }, + { + "epoch": 5.536832412523021, + "grad_norm": 0.23147790133953094, + "learning_rate": 4.375839045661839e-05, + "loss": 1.7918, + "step": 18039 + }, + { + "epoch": 5.537139349294045, + "grad_norm": 0.2387644350528717, + "learning_rate": 4.375345880797802e-05, + "loss": 1.7391, + "step": 18040 + }, + { + "epoch": 5.53744628606507, + "grad_norm": 0.21463727951049805, + 
"learning_rate": 4.374852722106966e-05, + "loss": 1.6812, + "step": 18041 + }, + { + "epoch": 5.537753222836096, + "grad_norm": 0.21994563937187195, + "learning_rate": 4.3743595695941994e-05, + "loss": 1.7727, + "step": 18042 + }, + { + "epoch": 5.538060159607121, + "grad_norm": 0.21102699637413025, + "learning_rate": 4.373866423264381e-05, + "loss": 1.7854, + "step": 18043 + }, + { + "epoch": 5.538367096378146, + "grad_norm": 0.21742786467075348, + "learning_rate": 4.3733732831223794e-05, + "loss": 1.7352, + "step": 18044 + }, + { + "epoch": 5.538674033149171, + "grad_norm": 0.20080791413784027, + "learning_rate": 4.372880149173071e-05, + "loss": 1.7264, + "step": 18045 + }, + { + "epoch": 5.538980969920196, + "grad_norm": 0.21027569472789764, + "learning_rate": 4.372387021421329e-05, + "loss": 1.766, + "step": 18046 + }, + { + "epoch": 5.5392879066912215, + "grad_norm": 0.22870683670043945, + "learning_rate": 4.371893899872025e-05, + "loss": 1.7746, + "step": 18047 + }, + { + "epoch": 5.539594843462247, + "grad_norm": 0.21248690783977509, + "learning_rate": 4.371400784530036e-05, + "loss": 1.7447, + "step": 18048 + }, + { + "epoch": 5.539901780233272, + "grad_norm": 0.23059454560279846, + "learning_rate": 4.37090767540023e-05, + "loss": 1.7827, + "step": 18049 + }, + { + "epoch": 5.5402087170042975, + "grad_norm": 0.2519036531448364, + "learning_rate": 4.370414572487485e-05, + "loss": 1.7984, + "step": 18050 + }, + { + "epoch": 5.540515653775322, + "grad_norm": 0.23621398210525513, + "learning_rate": 4.36992147579667e-05, + "loss": 1.7517, + "step": 18051 + }, + { + "epoch": 5.540822590546347, + "grad_norm": 0.24267609417438507, + "learning_rate": 4.3694283853326625e-05, + "loss": 1.8285, + "step": 18052 + }, + { + "epoch": 5.541129527317373, + "grad_norm": 0.23209960758686066, + "learning_rate": 4.368935301100332e-05, + "loss": 1.7765, + "step": 18053 + }, + { + "epoch": 5.541436464088398, + "grad_norm": 0.21277187764644623, + "learning_rate": 
4.368442223104555e-05, + "loss": 1.7182, + "step": 18054 + }, + { + "epoch": 5.541743400859423, + "grad_norm": 0.20821616053581238, + "learning_rate": 4.367949151350199e-05, + "loss": 1.6766, + "step": 18055 + }, + { + "epoch": 5.542050337630448, + "grad_norm": 0.23019999265670776, + "learning_rate": 4.3674560858421414e-05, + "loss": 1.7438, + "step": 18056 + }, + { + "epoch": 5.542357274401473, + "grad_norm": 0.21547134220600128, + "learning_rate": 4.366963026585253e-05, + "loss": 1.7003, + "step": 18057 + }, + { + "epoch": 5.542664211172498, + "grad_norm": 0.22454513609409332, + "learning_rate": 4.3664699735844084e-05, + "loss": 1.7072, + "step": 18058 + }, + { + "epoch": 5.542971147943524, + "grad_norm": 0.22228482365608215, + "learning_rate": 4.365976926844477e-05, + "loss": 1.7557, + "step": 18059 + }, + { + "epoch": 5.543278084714549, + "grad_norm": 0.25762560963630676, + "learning_rate": 4.365483886370335e-05, + "loss": 1.7751, + "step": 18060 + }, + { + "epoch": 5.543585021485574, + "grad_norm": 0.2086205631494522, + "learning_rate": 4.3649908521668516e-05, + "loss": 1.7399, + "step": 18061 + }, + { + "epoch": 5.543891958256599, + "grad_norm": 0.2759089767932892, + "learning_rate": 4.3644978242389014e-05, + "loss": 1.7503, + "step": 18062 + }, + { + "epoch": 5.544198895027624, + "grad_norm": 0.2235182225704193, + "learning_rate": 4.364004802591358e-05, + "loss": 1.7313, + "step": 18063 + }, + { + "epoch": 5.5445058317986495, + "grad_norm": 0.23074570298194885, + "learning_rate": 4.3635117872290885e-05, + "loss": 1.7649, + "step": 18064 + }, + { + "epoch": 5.544812768569675, + "grad_norm": 0.24929538369178772, + "learning_rate": 4.363018778156972e-05, + "loss": 1.732, + "step": 18065 + }, + { + "epoch": 5.5451197053407, + "grad_norm": 0.26422035694122314, + "learning_rate": 4.362525775379874e-05, + "loss": 1.7276, + "step": 18066 + }, + { + "epoch": 5.545426642111725, + "grad_norm": 0.3160388767719269, + "learning_rate": 4.362032778902672e-05, + "loss": 
1.7777, + "step": 18067 + }, + { + "epoch": 5.54573357888275, + "grad_norm": 0.20791196823120117, + "learning_rate": 4.3615397887302345e-05, + "loss": 1.7058, + "step": 18068 + }, + { + "epoch": 5.546040515653775, + "grad_norm": 0.31438156962394714, + "learning_rate": 4.361046804867437e-05, + "loss": 1.8102, + "step": 18069 + }, + { + "epoch": 5.546347452424801, + "grad_norm": 0.3008113205432892, + "learning_rate": 4.3605538273191475e-05, + "loss": 1.7297, + "step": 18070 + }, + { + "epoch": 5.546654389195826, + "grad_norm": 0.21147282421588898, + "learning_rate": 4.3600608560902425e-05, + "loss": 1.776, + "step": 18071 + }, + { + "epoch": 5.546961325966851, + "grad_norm": 0.25202393531799316, + "learning_rate": 4.3595678911855884e-05, + "loss": 1.7273, + "step": 18072 + }, + { + "epoch": 5.547268262737876, + "grad_norm": 0.18881210684776306, + "learning_rate": 4.3590749326100614e-05, + "loss": 1.7026, + "step": 18073 + }, + { + "epoch": 5.547575199508901, + "grad_norm": 0.25075671076774597, + "learning_rate": 4.3585819803685295e-05, + "loss": 1.7694, + "step": 18074 + }, + { + "epoch": 5.547882136279926, + "grad_norm": 0.2625887989997864, + "learning_rate": 4.358089034465869e-05, + "loss": 1.7338, + "step": 18075 + }, + { + "epoch": 5.548189073050952, + "grad_norm": 0.27278679609298706, + "learning_rate": 4.357596094906947e-05, + "loss": 1.7684, + "step": 18076 + }, + { + "epoch": 5.548496009821976, + "grad_norm": 0.283964604139328, + "learning_rate": 4.3571031616966396e-05, + "loss": 1.7539, + "step": 18077 + }, + { + "epoch": 5.5488029465930016, + "grad_norm": 0.2702009975910187, + "learning_rate": 4.3566102348398124e-05, + "loss": 1.8064, + "step": 18078 + }, + { + "epoch": 5.549109883364027, + "grad_norm": 0.449733167886734, + "learning_rate": 4.356117314341342e-05, + "loss": 1.7258, + "step": 18079 + }, + { + "epoch": 5.549416820135052, + "grad_norm": 0.3199995160102844, + "learning_rate": 4.3556244002060975e-05, + "loss": 1.7526, + "step": 18080 + }, + { + 
"epoch": 5.5497237569060776, + "grad_norm": 0.2803747355937958, + "learning_rate": 4.3551314924389494e-05, + "loss": 1.764, + "step": 18081 + }, + { + "epoch": 5.550030693677103, + "grad_norm": 0.28995978832244873, + "learning_rate": 4.3546385910447715e-05, + "loss": 1.7617, + "step": 18082 + }, + { + "epoch": 5.550337630448127, + "grad_norm": 0.24313311278820038, + "learning_rate": 4.354145696028431e-05, + "loss": 1.7515, + "step": 18083 + }, + { + "epoch": 5.550644567219153, + "grad_norm": 0.2668032944202423, + "learning_rate": 4.3536528073948025e-05, + "loss": 1.743, + "step": 18084 + }, + { + "epoch": 5.550951503990178, + "grad_norm": 0.22831310331821442, + "learning_rate": 4.353159925148755e-05, + "loss": 1.7971, + "step": 18085 + }, + { + "epoch": 5.551258440761203, + "grad_norm": 0.22047942876815796, + "learning_rate": 4.352667049295162e-05, + "loss": 1.6983, + "step": 18086 + }, + { + "epoch": 5.551565377532229, + "grad_norm": 0.22895069420337677, + "learning_rate": 4.35217417983889e-05, + "loss": 1.7866, + "step": 18087 + }, + { + "epoch": 5.551872314303253, + "grad_norm": 0.19946368038654327, + "learning_rate": 4.3516813167848156e-05, + "loss": 1.7129, + "step": 18088 + }, + { + "epoch": 5.5521792510742785, + "grad_norm": 0.21508903801441193, + "learning_rate": 4.351188460137804e-05, + "loss": 1.7154, + "step": 18089 + }, + { + "epoch": 5.552486187845304, + "grad_norm": 0.24813953042030334, + "learning_rate": 4.3506956099027294e-05, + "loss": 1.8326, + "step": 18090 + }, + { + "epoch": 5.552793124616329, + "grad_norm": 0.21306444704532623, + "learning_rate": 4.35020276608446e-05, + "loss": 1.7651, + "step": 18091 + }, + { + "epoch": 5.5531000613873545, + "grad_norm": 0.22041217982769012, + "learning_rate": 4.34970992868787e-05, + "loss": 1.6852, + "step": 18092 + }, + { + "epoch": 5.55340699815838, + "grad_norm": 0.21699896454811096, + "learning_rate": 4.349217097717826e-05, + "loss": 1.7524, + "step": 18093 + }, + { + "epoch": 5.553713934929404, + 
"grad_norm": 0.23086662590503693, + "learning_rate": 4.3487242731792015e-05, + "loss": 1.7441, + "step": 18094 + }, + { + "epoch": 5.55402087170043, + "grad_norm": 0.21898184716701508, + "learning_rate": 4.348231455076864e-05, + "loss": 1.7131, + "step": 18095 + }, + { + "epoch": 5.554327808471455, + "grad_norm": 0.17392560839653015, + "learning_rate": 4.3477386434156854e-05, + "loss": 1.7049, + "step": 18096 + }, + { + "epoch": 5.55463474524248, + "grad_norm": 0.1984172910451889, + "learning_rate": 4.3472458382005374e-05, + "loss": 1.7136, + "step": 18097 + }, + { + "epoch": 5.554941682013506, + "grad_norm": 0.19227837026119232, + "learning_rate": 4.3467530394362866e-05, + "loss": 1.7468, + "step": 18098 + }, + { + "epoch": 5.55524861878453, + "grad_norm": 0.2307087779045105, + "learning_rate": 4.346260247127807e-05, + "loss": 1.7004, + "step": 18099 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.21496252715587616, + "learning_rate": 4.345767461279965e-05, + "loss": 1.7508, + "step": 18100 + }, + { + "epoch": 5.555862492326581, + "grad_norm": 0.21119998395442963, + "learning_rate": 4.3452746818976333e-05, + "loss": 1.7965, + "step": 18101 + }, + { + "epoch": 5.556169429097606, + "grad_norm": 0.2416355311870575, + "learning_rate": 4.34478190898568e-05, + "loss": 1.7006, + "step": 18102 + }, + { + "epoch": 5.556476365868631, + "grad_norm": 0.2009642869234085, + "learning_rate": 4.344289142548978e-05, + "loss": 1.7567, + "step": 18103 + }, + { + "epoch": 5.556783302639657, + "grad_norm": 0.2387058436870575, + "learning_rate": 4.343796382592393e-05, + "loss": 1.7898, + "step": 18104 + }, + { + "epoch": 5.557090239410681, + "grad_norm": 0.19835951924324036, + "learning_rate": 4.343303629120798e-05, + "loss": 1.7888, + "step": 18105 + }, + { + "epoch": 5.5573971761817065, + "grad_norm": 0.23324637115001678, + "learning_rate": 4.3428108821390604e-05, + "loss": 1.7923, + "step": 18106 + }, + { + "epoch": 5.557704112952732, + "grad_norm": 0.22334477305412292, + 
"learning_rate": 4.342318141652052e-05, + "loss": 1.7234, + "step": 18107 + }, + { + "epoch": 5.558011049723757, + "grad_norm": 0.20220427215099335, + "learning_rate": 4.341825407664639e-05, + "loss": 1.7639, + "step": 18108 + }, + { + "epoch": 5.558317986494782, + "grad_norm": 0.23658546805381775, + "learning_rate": 4.3413326801816964e-05, + "loss": 1.7505, + "step": 18109 + }, + { + "epoch": 5.558624923265807, + "grad_norm": 0.21157726645469666, + "learning_rate": 4.3408399592080875e-05, + "loss": 1.7655, + "step": 18110 + }, + { + "epoch": 5.558931860036832, + "grad_norm": 0.2139829397201538, + "learning_rate": 4.340347244748687e-05, + "loss": 1.767, + "step": 18111 + }, + { + "epoch": 5.559238796807858, + "grad_norm": 0.17811299860477448, + "learning_rate": 4.339854536808359e-05, + "loss": 1.6629, + "step": 18112 + }, + { + "epoch": 5.559545733578883, + "grad_norm": 0.2005898356437683, + "learning_rate": 4.339361835391977e-05, + "loss": 1.7269, + "step": 18113 + }, + { + "epoch": 5.559852670349908, + "grad_norm": 0.21514086425304413, + "learning_rate": 4.338869140504409e-05, + "loss": 1.7806, + "step": 18114 + }, + { + "epoch": 5.560159607120933, + "grad_norm": 0.23163840174674988, + "learning_rate": 4.338376452150522e-05, + "loss": 1.7259, + "step": 18115 + }, + { + "epoch": 5.560466543891958, + "grad_norm": 0.23657509684562683, + "learning_rate": 4.337883770335189e-05, + "loss": 1.7778, + "step": 18116 + }, + { + "epoch": 5.560773480662983, + "grad_norm": 0.20135201513767242, + "learning_rate": 4.337391095063274e-05, + "loss": 1.7359, + "step": 18117 + }, + { + "epoch": 5.561080417434009, + "grad_norm": 0.22871774435043335, + "learning_rate": 4.33689842633965e-05, + "loss": 1.7658, + "step": 18118 + }, + { + "epoch": 5.561387354205034, + "grad_norm": 0.21755221486091614, + "learning_rate": 4.3364057641691835e-05, + "loss": 1.7408, + "step": 18119 + }, + { + "epoch": 5.5616942909760585, + "grad_norm": 0.215267151594162, + "learning_rate": 
4.335913108556746e-05, + "loss": 1.7175, + "step": 18120 + }, + { + "epoch": 5.562001227747084, + "grad_norm": 0.25724974274635315, + "learning_rate": 4.335420459507202e-05, + "loss": 1.7197, + "step": 18121 + }, + { + "epoch": 5.562308164518109, + "grad_norm": 0.25375521183013916, + "learning_rate": 4.3349278170254254e-05, + "loss": 1.7251, + "step": 18122 + }, + { + "epoch": 5.5626151012891345, + "grad_norm": 0.24768905341625214, + "learning_rate": 4.334435181116279e-05, + "loss": 1.7405, + "step": 18123 + }, + { + "epoch": 5.56292203806016, + "grad_norm": 0.21281081438064575, + "learning_rate": 4.333942551784636e-05, + "loss": 1.7131, + "step": 18124 + }, + { + "epoch": 5.563228974831185, + "grad_norm": 0.2129398137331009, + "learning_rate": 4.333449929035361e-05, + "loss": 1.7049, + "step": 18125 + }, + { + "epoch": 5.56353591160221, + "grad_norm": 0.24582397937774658, + "learning_rate": 4.332957312873328e-05, + "loss": 1.7205, + "step": 18126 + }, + { + "epoch": 5.563842848373235, + "grad_norm": 0.21282973885536194, + "learning_rate": 4.332464703303399e-05, + "loss": 1.7655, + "step": 18127 + }, + { + "epoch": 5.56414978514426, + "grad_norm": 0.2302251160144806, + "learning_rate": 4.331972100330447e-05, + "loss": 1.7597, + "step": 18128 + }, + { + "epoch": 5.564456721915286, + "grad_norm": 0.23453226685523987, + "learning_rate": 4.331479503959336e-05, + "loss": 1.7028, + "step": 18129 + }, + { + "epoch": 5.564763658686311, + "grad_norm": 0.19723562896251678, + "learning_rate": 4.330986914194938e-05, + "loss": 1.7101, + "step": 18130 + }, + { + "epoch": 5.565070595457335, + "grad_norm": 0.22021643817424774, + "learning_rate": 4.33049433104212e-05, + "loss": 1.7123, + "step": 18131 + }, + { + "epoch": 5.565377532228361, + "grad_norm": 0.25540977716445923, + "learning_rate": 4.3300017545057484e-05, + "loss": 1.7392, + "step": 18132 + }, + { + "epoch": 5.565684468999386, + "grad_norm": 0.23482176661491394, + "learning_rate": 4.329509184590693e-05, + "loss": 
1.7175, + "step": 18133 + }, + { + "epoch": 5.565991405770411, + "grad_norm": 0.19537311792373657, + "learning_rate": 4.329016621301819e-05, + "loss": 1.7583, + "step": 18134 + }, + { + "epoch": 5.566298342541437, + "grad_norm": 0.21828842163085938, + "learning_rate": 4.328524064643997e-05, + "loss": 1.7411, + "step": 18135 + }, + { + "epoch": 5.566605279312462, + "grad_norm": 0.24589122831821442, + "learning_rate": 4.328031514622093e-05, + "loss": 1.7769, + "step": 18136 + }, + { + "epoch": 5.5669122160834865, + "grad_norm": 0.20964545011520386, + "learning_rate": 4.327538971240978e-05, + "loss": 1.7743, + "step": 18137 + }, + { + "epoch": 5.567219152854512, + "grad_norm": 0.2210713028907776, + "learning_rate": 4.327046434505514e-05, + "loss": 1.7671, + "step": 18138 + }, + { + "epoch": 5.567526089625537, + "grad_norm": 0.21382687985897064, + "learning_rate": 4.3265539044205736e-05, + "loss": 1.793, + "step": 18139 + }, + { + "epoch": 5.5678330263965625, + "grad_norm": 0.23289678990840912, + "learning_rate": 4.326061380991021e-05, + "loss": 1.738, + "step": 18140 + }, + { + "epoch": 5.568139963167588, + "grad_norm": 0.23789258301258087, + "learning_rate": 4.325568864221725e-05, + "loss": 1.8315, + "step": 18141 + }, + { + "epoch": 5.568446899938612, + "grad_norm": 0.1925022453069687, + "learning_rate": 4.325076354117554e-05, + "loss": 1.6956, + "step": 18142 + }, + { + "epoch": 5.568753836709638, + "grad_norm": 0.22522561252117157, + "learning_rate": 4.324583850683373e-05, + "loss": 1.7957, + "step": 18143 + }, + { + "epoch": 5.569060773480663, + "grad_norm": 0.2787671387195587, + "learning_rate": 4.324091353924049e-05, + "loss": 1.7325, + "step": 18144 + }, + { + "epoch": 5.569367710251688, + "grad_norm": 0.2723194658756256, + "learning_rate": 4.3235988638444536e-05, + "loss": 1.7668, + "step": 18145 + }, + { + "epoch": 5.569674647022714, + "grad_norm": 0.2241704910993576, + "learning_rate": 4.3231063804494484e-05, + "loss": 1.7977, + "step": 18146 + }, + { + 
"epoch": 5.569981583793739, + "grad_norm": 0.2627747356891632, + "learning_rate": 4.322613903743903e-05, + "loss": 1.6775, + "step": 18147 + }, + { + "epoch": 5.570288520564763, + "grad_norm": 0.2644255757331848, + "learning_rate": 4.322121433732686e-05, + "loss": 1.7404, + "step": 18148 + }, + { + "epoch": 5.570595457335789, + "grad_norm": 0.2386743575334549, + "learning_rate": 4.321628970420659e-05, + "loss": 1.7386, + "step": 18149 + }, + { + "epoch": 5.570902394106814, + "grad_norm": 0.22444583475589752, + "learning_rate": 4.3211365138126945e-05, + "loss": 1.7482, + "step": 18150 + }, + { + "epoch": 5.571209330877839, + "grad_norm": 0.21770013868808746, + "learning_rate": 4.3206440639136554e-05, + "loss": 1.7322, + "step": 18151 + }, + { + "epoch": 5.571516267648864, + "grad_norm": 0.22356587648391724, + "learning_rate": 4.320151620728411e-05, + "loss": 1.751, + "step": 18152 + }, + { + "epoch": 5.571823204419889, + "grad_norm": 0.2040669322013855, + "learning_rate": 4.319659184261826e-05, + "loss": 1.712, + "step": 18153 + }, + { + "epoch": 5.5721301411909145, + "grad_norm": 0.20951713621616364, + "learning_rate": 4.319166754518768e-05, + "loss": 1.7308, + "step": 18154 + }, + { + "epoch": 5.57243707796194, + "grad_norm": 0.186195969581604, + "learning_rate": 4.3186743315041025e-05, + "loss": 1.7133, + "step": 18155 + }, + { + "epoch": 5.572744014732965, + "grad_norm": 0.2098865509033203, + "learning_rate": 4.318181915222698e-05, + "loss": 1.7645, + "step": 18156 + }, + { + "epoch": 5.5730509515039905, + "grad_norm": 0.20552097260951996, + "learning_rate": 4.317689505679418e-05, + "loss": 1.7156, + "step": 18157 + }, + { + "epoch": 5.573357888275015, + "grad_norm": 0.22506964206695557, + "learning_rate": 4.3171971028791314e-05, + "loss": 1.7192, + "step": 18158 + }, + { + "epoch": 5.57366482504604, + "grad_norm": 0.2296760082244873, + "learning_rate": 4.316704706826702e-05, + "loss": 1.7534, + "step": 18159 + }, + { + "epoch": 5.573971761817066, + "grad_norm": 
0.20140253007411957, + "learning_rate": 4.316212317526998e-05, + "loss": 1.6906, + "step": 18160 + }, + { + "epoch": 5.574278698588091, + "grad_norm": 0.23313316702842712, + "learning_rate": 4.315719934984884e-05, + "loss": 1.6929, + "step": 18161 + }, + { + "epoch": 5.574585635359116, + "grad_norm": 0.23398169875144958, + "learning_rate": 4.315227559205228e-05, + "loss": 1.7254, + "step": 18162 + }, + { + "epoch": 5.574892572130141, + "grad_norm": 0.20836731791496277, + "learning_rate": 4.314735190192894e-05, + "loss": 1.7335, + "step": 18163 + }, + { + "epoch": 5.575199508901166, + "grad_norm": 0.19899079203605652, + "learning_rate": 4.3142428279527485e-05, + "loss": 1.69, + "step": 18164 + }, + { + "epoch": 5.5755064456721914, + "grad_norm": 0.24623680114746094, + "learning_rate": 4.313750472489657e-05, + "loss": 1.7413, + "step": 18165 + }, + { + "epoch": 5.575813382443217, + "grad_norm": 0.2432616949081421, + "learning_rate": 4.313258123808484e-05, + "loss": 1.7426, + "step": 18166 + }, + { + "epoch": 5.576120319214242, + "grad_norm": 0.22773970663547516, + "learning_rate": 4.3127657819141006e-05, + "loss": 1.7986, + "step": 18167 + }, + { + "epoch": 5.5764272559852675, + "grad_norm": 0.19891540706157684, + "learning_rate": 4.312273446811366e-05, + "loss": 1.7007, + "step": 18168 + }, + { + "epoch": 5.576734192756292, + "grad_norm": 0.23402714729309082, + "learning_rate": 4.311781118505149e-05, + "loss": 1.7774, + "step": 18169 + }, + { + "epoch": 5.577041129527317, + "grad_norm": 0.2248220294713974, + "learning_rate": 4.3112887970003134e-05, + "loss": 1.7079, + "step": 18170 + }, + { + "epoch": 5.577348066298343, + "grad_norm": 0.20901209115982056, + "learning_rate": 4.310796482301726e-05, + "loss": 1.7336, + "step": 18171 + }, + { + "epoch": 5.577655003069368, + "grad_norm": 0.21872754395008087, + "learning_rate": 4.3103041744142516e-05, + "loss": 1.7742, + "step": 18172 + }, + { + "epoch": 5.577961939840393, + "grad_norm": 0.2567403018474579, + 
"learning_rate": 4.309811873342757e-05, + "loss": 1.7894, + "step": 18173 + }, + { + "epoch": 5.578268876611418, + "grad_norm": 0.219998300075531, + "learning_rate": 4.3093195790921035e-05, + "loss": 1.7283, + "step": 18174 + }, + { + "epoch": 5.578575813382443, + "grad_norm": 0.1944747269153595, + "learning_rate": 4.3088272916671614e-05, + "loss": 1.7129, + "step": 18175 + }, + { + "epoch": 5.578882750153468, + "grad_norm": 0.19492141902446747, + "learning_rate": 4.308335011072791e-05, + "loss": 1.7286, + "step": 18176 + }, + { + "epoch": 5.579189686924494, + "grad_norm": 0.22383002936840057, + "learning_rate": 4.3078427373138604e-05, + "loss": 1.733, + "step": 18177 + }, + { + "epoch": 5.579496623695519, + "grad_norm": 0.20238643884658813, + "learning_rate": 4.307350470395232e-05, + "loss": 1.7522, + "step": 18178 + }, + { + "epoch": 5.579803560466544, + "grad_norm": 0.21456125378608704, + "learning_rate": 4.3068582103217755e-05, + "loss": 1.7298, + "step": 18179 + }, + { + "epoch": 5.580110497237569, + "grad_norm": 0.28084230422973633, + "learning_rate": 4.3063659570983514e-05, + "loss": 1.7805, + "step": 18180 + }, + { + "epoch": 5.580417434008594, + "grad_norm": 0.21319706737995148, + "learning_rate": 4.305873710729824e-05, + "loss": 1.6801, + "step": 18181 + }, + { + "epoch": 5.5807243707796195, + "grad_norm": 0.2279660850763321, + "learning_rate": 4.30538147122106e-05, + "loss": 1.752, + "step": 18182 + }, + { + "epoch": 5.581031307550645, + "grad_norm": 0.1958594173192978, + "learning_rate": 4.304889238576922e-05, + "loss": 1.7487, + "step": 18183 + }, + { + "epoch": 5.581338244321669, + "grad_norm": 0.19484321773052216, + "learning_rate": 4.304397012802279e-05, + "loss": 1.7222, + "step": 18184 + }, + { + "epoch": 5.581645181092695, + "grad_norm": 0.19863305985927582, + "learning_rate": 4.3039047939019906e-05, + "loss": 1.7296, + "step": 18185 + }, + { + "epoch": 5.58195211786372, + "grad_norm": 0.18674087524414062, + "learning_rate": 
4.303412581880924e-05, + "loss": 1.6753, + "step": 18186 + }, + { + "epoch": 5.582259054634745, + "grad_norm": 0.22263208031654358, + "learning_rate": 4.302920376743941e-05, + "loss": 1.7431, + "step": 18187 + }, + { + "epoch": 5.582565991405771, + "grad_norm": 0.1926872879266739, + "learning_rate": 4.302428178495909e-05, + "loss": 1.7662, + "step": 18188 + }, + { + "epoch": 5.582872928176796, + "grad_norm": 0.23190459609031677, + "learning_rate": 4.301935987141689e-05, + "loss": 1.7271, + "step": 18189 + }, + { + "epoch": 5.58317986494782, + "grad_norm": 0.30057230591773987, + "learning_rate": 4.301443802686148e-05, + "loss": 1.7957, + "step": 18190 + }, + { + "epoch": 5.583486801718846, + "grad_norm": 0.2520695626735687, + "learning_rate": 4.3009516251341475e-05, + "loss": 1.7501, + "step": 18191 + }, + { + "epoch": 5.583793738489871, + "grad_norm": 0.19143317639827728, + "learning_rate": 4.300459454490555e-05, + "loss": 1.7091, + "step": 18192 + }, + { + "epoch": 5.584100675260896, + "grad_norm": 0.2064475119113922, + "learning_rate": 4.299967290760229e-05, + "loss": 1.6849, + "step": 18193 + }, + { + "epoch": 5.584407612031922, + "grad_norm": 0.3093598484992981, + "learning_rate": 4.299475133948039e-05, + "loss": 1.8479, + "step": 18194 + }, + { + "epoch": 5.584714548802946, + "grad_norm": 0.2875300943851471, + "learning_rate": 4.298982984058845e-05, + "loss": 1.7296, + "step": 18195 + }, + { + "epoch": 5.5850214855739715, + "grad_norm": 0.33194443583488464, + "learning_rate": 4.298490841097514e-05, + "loss": 1.7668, + "step": 18196 + }, + { + "epoch": 5.585328422344997, + "grad_norm": 0.20940829813480377, + "learning_rate": 4.297998705068908e-05, + "loss": 1.7316, + "step": 18197 + }, + { + "epoch": 5.585635359116022, + "grad_norm": 0.32381999492645264, + "learning_rate": 4.297506575977887e-05, + "loss": 1.7212, + "step": 18198 + }, + { + "epoch": 5.5859422958870475, + "grad_norm": 0.31585511565208435, + "learning_rate": 4.29701445382932e-05, + "loss": 1.7695, 
+ "step": 18199 + }, + { + "epoch": 5.586249232658073, + "grad_norm": 0.2272588014602661, + "learning_rate": 4.2965223386280664e-05, + "loss": 1.7105, + "step": 18200 + }, + { + "epoch": 5.586556169429097, + "grad_norm": 0.2949761152267456, + "learning_rate": 4.296030230378993e-05, + "loss": 1.803, + "step": 18201 + }, + { + "epoch": 5.586863106200123, + "grad_norm": 0.20512579381465912, + "learning_rate": 4.29553812908696e-05, + "loss": 1.759, + "step": 18202 + }, + { + "epoch": 5.587170042971148, + "grad_norm": 0.21143598854541779, + "learning_rate": 4.295046034756835e-05, + "loss": 1.7286, + "step": 18203 + }, + { + "epoch": 5.587476979742173, + "grad_norm": 0.22148001194000244, + "learning_rate": 4.294553947393476e-05, + "loss": 1.7258, + "step": 18204 + }, + { + "epoch": 5.587783916513199, + "grad_norm": 0.17245957255363464, + "learning_rate": 4.2940618670017484e-05, + "loss": 1.6863, + "step": 18205 + }, + { + "epoch": 5.588090853284223, + "grad_norm": 0.20260390639305115, + "learning_rate": 4.293569793586515e-05, + "loss": 1.6866, + "step": 18206 + }, + { + "epoch": 5.588397790055248, + "grad_norm": 0.20671936869621277, + "learning_rate": 4.293077727152641e-05, + "loss": 1.7849, + "step": 18207 + }, + { + "epoch": 5.588704726826274, + "grad_norm": 0.21415838599205017, + "learning_rate": 4.292585667704984e-05, + "loss": 1.7279, + "step": 18208 + }, + { + "epoch": 5.589011663597299, + "grad_norm": 0.18668091297149658, + "learning_rate": 4.2920936152484134e-05, + "loss": 1.7087, + "step": 18209 + }, + { + "epoch": 5.589318600368324, + "grad_norm": 0.2253870815038681, + "learning_rate": 4.291601569787786e-05, + "loss": 1.769, + "step": 18210 + }, + { + "epoch": 5.58962553713935, + "grad_norm": 0.22426939010620117, + "learning_rate": 4.291109531327968e-05, + "loss": 1.7382, + "step": 18211 + }, + { + "epoch": 5.589932473910374, + "grad_norm": 0.21552452445030212, + "learning_rate": 4.29061749987382e-05, + "loss": 1.7316, + "step": 18212 + }, + { + "epoch": 
5.5902394106813995, + "grad_norm": 0.2337147295475006, + "learning_rate": 4.290125475430209e-05, + "loss": 1.7836, + "step": 18213 + }, + { + "epoch": 5.590546347452425, + "grad_norm": 0.21780124306678772, + "learning_rate": 4.289633458001992e-05, + "loss": 1.6923, + "step": 18214 + }, + { + "epoch": 5.59085328422345, + "grad_norm": 0.20009608566761017, + "learning_rate": 4.289141447594033e-05, + "loss": 1.719, + "step": 18215 + }, + { + "epoch": 5.5911602209944755, + "grad_norm": 0.18165744841098785, + "learning_rate": 4.288649444211196e-05, + "loss": 1.6825, + "step": 18216 + }, + { + "epoch": 5.5914671577655, + "grad_norm": 0.2244826704263687, + "learning_rate": 4.288157447858341e-05, + "loss": 1.7323, + "step": 18217 + }, + { + "epoch": 5.591774094536525, + "grad_norm": 0.16875946521759033, + "learning_rate": 4.2876654585403325e-05, + "loss": 1.6787, + "step": 18218 + }, + { + "epoch": 5.592081031307551, + "grad_norm": 0.19244243204593658, + "learning_rate": 4.28717347626203e-05, + "loss": 1.7225, + "step": 18219 + }, + { + "epoch": 5.592387968078576, + "grad_norm": 0.21081633865833282, + "learning_rate": 4.286681501028299e-05, + "loss": 1.7063, + "step": 18220 + }, + { + "epoch": 5.592694904849601, + "grad_norm": 0.20926406979560852, + "learning_rate": 4.286189532843997e-05, + "loss": 1.7307, + "step": 18221 + }, + { + "epoch": 5.593001841620627, + "grad_norm": 0.20258775353431702, + "learning_rate": 4.28569757171399e-05, + "loss": 1.6917, + "step": 18222 + }, + { + "epoch": 5.593308778391651, + "grad_norm": 0.21956230700016022, + "learning_rate": 4.285205617643137e-05, + "loss": 1.7127, + "step": 18223 + }, + { + "epoch": 5.593615715162676, + "grad_norm": 0.2071436047554016, + "learning_rate": 4.284713670636303e-05, + "loss": 1.7487, + "step": 18224 + }, + { + "epoch": 5.593922651933702, + "grad_norm": 0.2002478390932083, + "learning_rate": 4.2842217306983464e-05, + "loss": 1.6544, + "step": 18225 + }, + { + "epoch": 5.594229588704727, + "grad_norm": 
0.20691382884979248, + "learning_rate": 4.283729797834132e-05, + "loss": 1.768, + "step": 18226 + }, + { + "epoch": 5.5945365254757515, + "grad_norm": 0.18423563241958618, + "learning_rate": 4.283237872048517e-05, + "loss": 1.7563, + "step": 18227 + }, + { + "epoch": 5.594843462246777, + "grad_norm": 0.23055453598499298, + "learning_rate": 4.2827459533463665e-05, + "loss": 1.8083, + "step": 18228 + }, + { + "epoch": 5.595150399017802, + "grad_norm": 0.20735648274421692, + "learning_rate": 4.2822540417325396e-05, + "loss": 1.7761, + "step": 18229 + }, + { + "epoch": 5.5954573357888275, + "grad_norm": 0.2919909656047821, + "learning_rate": 4.281762137211902e-05, + "loss": 1.7836, + "step": 18230 + }, + { + "epoch": 5.595764272559853, + "grad_norm": 0.22636881470680237, + "learning_rate": 4.2812702397893113e-05, + "loss": 1.7389, + "step": 18231 + }, + { + "epoch": 5.596071209330878, + "grad_norm": 0.23788630962371826, + "learning_rate": 4.280778349469627e-05, + "loss": 1.7536, + "step": 18232 + }, + { + "epoch": 5.596378146101903, + "grad_norm": 0.22089426219463348, + "learning_rate": 4.280286466257715e-05, + "loss": 1.7584, + "step": 18233 + }, + { + "epoch": 5.596685082872928, + "grad_norm": 0.20486171543598175, + "learning_rate": 4.279794590158431e-05, + "loss": 1.7182, + "step": 18234 + }, + { + "epoch": 5.596992019643953, + "grad_norm": 0.2343701422214508, + "learning_rate": 4.2793027211766425e-05, + "loss": 1.751, + "step": 18235 + }, + { + "epoch": 5.597298956414979, + "grad_norm": 0.21734023094177246, + "learning_rate": 4.2788108593172036e-05, + "loss": 1.7084, + "step": 18236 + }, + { + "epoch": 5.597605893186004, + "grad_norm": 0.20593903958797455, + "learning_rate": 4.278319004584982e-05, + "loss": 1.6805, + "step": 18237 + }, + { + "epoch": 5.597912829957028, + "grad_norm": 0.20877878367900848, + "learning_rate": 4.2778271569848324e-05, + "loss": 1.7011, + "step": 18238 + }, + { + "epoch": 5.598219766728054, + "grad_norm": 0.23915995657444, + 
"learning_rate": 4.277335316521619e-05, + "loss": 1.732, + "step": 18239 + }, + { + "epoch": 5.598526703499079, + "grad_norm": 0.24310529232025146, + "learning_rate": 4.2768434832002004e-05, + "loss": 1.7859, + "step": 18240 + }, + { + "epoch": 5.598833640270104, + "grad_norm": 0.23189407587051392, + "learning_rate": 4.27635165702544e-05, + "loss": 1.7237, + "step": 18241 + }, + { + "epoch": 5.59914057704113, + "grad_norm": 0.2708875834941864, + "learning_rate": 4.275859838002195e-05, + "loss": 1.7046, + "step": 18242 + }, + { + "epoch": 5.599447513812155, + "grad_norm": 0.23692840337753296, + "learning_rate": 4.27536802613533e-05, + "loss": 1.8556, + "step": 18243 + }, + { + "epoch": 5.5997544505831796, + "grad_norm": 0.28285983204841614, + "learning_rate": 4.274876221429701e-05, + "loss": 1.6734, + "step": 18244 + }, + { + "epoch": 5.600061387354205, + "grad_norm": 0.20602203905582428, + "learning_rate": 4.27438442389017e-05, + "loss": 1.7113, + "step": 18245 + }, + { + "epoch": 5.60036832412523, + "grad_norm": 0.19719314575195312, + "learning_rate": 4.273892633521598e-05, + "loss": 1.7229, + "step": 18246 + }, + { + "epoch": 5.600675260896256, + "grad_norm": 0.2396705001592636, + "learning_rate": 4.273400850328846e-05, + "loss": 1.6986, + "step": 18247 + }, + { + "epoch": 5.600982197667281, + "grad_norm": 0.1974172443151474, + "learning_rate": 4.2729090743167724e-05, + "loss": 1.7445, + "step": 18248 + }, + { + "epoch": 5.601289134438305, + "grad_norm": 0.2193709760904312, + "learning_rate": 4.272417305490235e-05, + "loss": 1.7657, + "step": 18249 + }, + { + "epoch": 5.601596071209331, + "grad_norm": 0.24138681590557098, + "learning_rate": 4.271925543854098e-05, + "loss": 1.7388, + "step": 18250 + }, + { + "epoch": 5.601903007980356, + "grad_norm": 0.19056223332881927, + "learning_rate": 4.271433789413219e-05, + "loss": 1.6897, + "step": 18251 + }, + { + "epoch": 5.602209944751381, + "grad_norm": 0.20533505082130432, + "learning_rate": 4.270942042172459e-05, + 
"loss": 1.7222, + "step": 18252 + }, + { + "epoch": 5.602516881522407, + "grad_norm": 0.20570224523544312, + "learning_rate": 4.270450302136675e-05, + "loss": 1.8089, + "step": 18253 + }, + { + "epoch": 5.602823818293432, + "grad_norm": 0.2822209298610687, + "learning_rate": 4.269958569310732e-05, + "loss": 1.7523, + "step": 18254 + }, + { + "epoch": 5.6031307550644565, + "grad_norm": 0.2994859218597412, + "learning_rate": 4.269466843699484e-05, + "loss": 1.7538, + "step": 18255 + }, + { + "epoch": 5.603437691835482, + "grad_norm": 0.24851159751415253, + "learning_rate": 4.2689751253077925e-05, + "loss": 1.8162, + "step": 18256 + }, + { + "epoch": 5.603744628606507, + "grad_norm": 0.20387138426303864, + "learning_rate": 4.268483414140517e-05, + "loss": 1.6803, + "step": 18257 + }, + { + "epoch": 5.6040515653775325, + "grad_norm": 0.21620385348796844, + "learning_rate": 4.2679917102025204e-05, + "loss": 1.7236, + "step": 18258 + }, + { + "epoch": 5.604358502148557, + "grad_norm": 0.1925734579563141, + "learning_rate": 4.267500013498655e-05, + "loss": 1.7295, + "step": 18259 + }, + { + "epoch": 5.604665438919582, + "grad_norm": 0.22216086089611053, + "learning_rate": 4.267008324033787e-05, + "loss": 1.6844, + "step": 18260 + }, + { + "epoch": 5.604972375690608, + "grad_norm": 0.20293502509593964, + "learning_rate": 4.26651664181277e-05, + "loss": 1.7065, + "step": 18261 + }, + { + "epoch": 5.605279312461633, + "grad_norm": 0.21269507706165314, + "learning_rate": 4.266024966840466e-05, + "loss": 1.7573, + "step": 18262 + }, + { + "epoch": 5.605586249232658, + "grad_norm": 0.23574227094650269, + "learning_rate": 4.2655332991217334e-05, + "loss": 1.7625, + "step": 18263 + }, + { + "epoch": 5.605893186003684, + "grad_norm": 0.1875103861093521, + "learning_rate": 4.265041638661433e-05, + "loss": 1.7266, + "step": 18264 + }, + { + "epoch": 5.606200122774708, + "grad_norm": 0.20348483324050903, + "learning_rate": 4.264549985464421e-05, + "loss": 1.731, + "step": 18265 + }, 
+ { + "epoch": 5.606507059545733, + "grad_norm": 0.2345927655696869, + "learning_rate": 4.264058339535556e-05, + "loss": 1.7809, + "step": 18266 + }, + { + "epoch": 5.606813996316759, + "grad_norm": 0.21142496168613434, + "learning_rate": 4.2635667008796985e-05, + "loss": 1.7362, + "step": 18267 + }, + { + "epoch": 5.607120933087784, + "grad_norm": 0.19670210778713226, + "learning_rate": 4.263075069501705e-05, + "loss": 1.7029, + "step": 18268 + }, + { + "epoch": 5.607427869858809, + "grad_norm": 0.20985090732574463, + "learning_rate": 4.262583445406439e-05, + "loss": 1.7478, + "step": 18269 + }, + { + "epoch": 5.607734806629834, + "grad_norm": 0.20972272753715515, + "learning_rate": 4.262091828598752e-05, + "loss": 1.7561, + "step": 18270 + }, + { + "epoch": 5.608041743400859, + "grad_norm": 0.20006676018238068, + "learning_rate": 4.261600219083509e-05, + "loss": 1.7584, + "step": 18271 + }, + { + "epoch": 5.6083486801718845, + "grad_norm": 0.21590086817741394, + "learning_rate": 4.2611086168655635e-05, + "loss": 1.7405, + "step": 18272 + }, + { + "epoch": 5.60865561694291, + "grad_norm": 0.19330906867980957, + "learning_rate": 4.260617021949776e-05, + "loss": 1.6797, + "step": 18273 + }, + { + "epoch": 5.608962553713935, + "grad_norm": 0.1955050528049469, + "learning_rate": 4.260125434341004e-05, + "loss": 1.7174, + "step": 18274 + }, + { + "epoch": 5.6092694904849605, + "grad_norm": 0.2117784321308136, + "learning_rate": 4.2596338540441086e-05, + "loss": 1.743, + "step": 18275 + }, + { + "epoch": 5.609576427255985, + "grad_norm": 0.21788950264453888, + "learning_rate": 4.2591422810639425e-05, + "loss": 1.7603, + "step": 18276 + }, + { + "epoch": 5.60988336402701, + "grad_norm": 0.2092670351266861, + "learning_rate": 4.258650715405369e-05, + "loss": 1.7379, + "step": 18277 + }, + { + "epoch": 5.610190300798036, + "grad_norm": 0.1941552758216858, + "learning_rate": 4.2581591570732414e-05, + "loss": 1.7547, + "step": 18278 + }, + { + "epoch": 5.610497237569061, + 
"grad_norm": 0.21306751668453217, + "learning_rate": 4.2576676060724215e-05, + "loss": 1.7284, + "step": 18279 + }, + { + "epoch": 5.610804174340086, + "grad_norm": 0.18618693947792053, + "learning_rate": 4.2571760624077635e-05, + "loss": 1.7268, + "step": 18280 + }, + { + "epoch": 5.611111111111111, + "grad_norm": 0.21530354022979736, + "learning_rate": 4.256684526084129e-05, + "loss": 1.7036, + "step": 18281 + }, + { + "epoch": 5.611418047882136, + "grad_norm": 0.23363792896270752, + "learning_rate": 4.256192997106375e-05, + "loss": 1.7797, + "step": 18282 + }, + { + "epoch": 5.611724984653161, + "grad_norm": 0.1786416620016098, + "learning_rate": 4.2557014754793544e-05, + "loss": 1.7008, + "step": 18283 + }, + { + "epoch": 5.612031921424187, + "grad_norm": 0.2042730301618576, + "learning_rate": 4.25520996120793e-05, + "loss": 1.7667, + "step": 18284 + }, + { + "epoch": 5.612338858195212, + "grad_norm": 0.2275264412164688, + "learning_rate": 4.2547184542969554e-05, + "loss": 1.8277, + "step": 18285 + }, + { + "epoch": 5.612645794966237, + "grad_norm": 0.21252553164958954, + "learning_rate": 4.2542269547512925e-05, + "loss": 1.7272, + "step": 18286 + }, + { + "epoch": 5.612952731737262, + "grad_norm": 0.20384398102760315, + "learning_rate": 4.2537354625757934e-05, + "loss": 1.6707, + "step": 18287 + }, + { + "epoch": 5.613259668508287, + "grad_norm": 0.19805553555488586, + "learning_rate": 4.253243977775321e-05, + "loss": 1.7443, + "step": 18288 + }, + { + "epoch": 5.6135666052793125, + "grad_norm": 0.20447707176208496, + "learning_rate": 4.2527525003547256e-05, + "loss": 1.7392, + "step": 18289 + }, + { + "epoch": 5.613873542050338, + "grad_norm": 0.21025662124156952, + "learning_rate": 4.25226103031887e-05, + "loss": 1.7856, + "step": 18290 + }, + { + "epoch": 5.614180478821363, + "grad_norm": 0.2131013125181198, + "learning_rate": 4.2517695676726085e-05, + "loss": 1.7521, + "step": 18291 + }, + { + "epoch": 5.614487415592388, + "grad_norm": 0.2511558532714844, 
+ "learning_rate": 4.2512781124208e-05, + "loss": 1.6873, + "step": 18292 + }, + { + "epoch": 5.614794352363413, + "grad_norm": 0.19668610394001007, + "learning_rate": 4.2507866645682984e-05, + "loss": 1.6808, + "step": 18293 + }, + { + "epoch": 5.615101289134438, + "grad_norm": 0.22313621640205383, + "learning_rate": 4.2502952241199637e-05, + "loss": 1.7794, + "step": 18294 + }, + { + "epoch": 5.615408225905464, + "grad_norm": 0.2053089439868927, + "learning_rate": 4.249803791080649e-05, + "loss": 1.7405, + "step": 18295 + }, + { + "epoch": 5.615715162676489, + "grad_norm": 0.2052931934595108, + "learning_rate": 4.249312365455215e-05, + "loss": 1.6698, + "step": 18296 + }, + { + "epoch": 5.616022099447514, + "grad_norm": 0.223783478140831, + "learning_rate": 4.248820947248515e-05, + "loss": 1.7696, + "step": 18297 + }, + { + "epoch": 5.616329036218539, + "grad_norm": 0.3424001932144165, + "learning_rate": 4.248329536465407e-05, + "loss": 1.7724, + "step": 18298 + }, + { + "epoch": 5.616635972989564, + "grad_norm": 0.25015103816986084, + "learning_rate": 4.247838133110749e-05, + "loss": 1.7188, + "step": 18299 + }, + { + "epoch": 5.616942909760589, + "grad_norm": 0.239765465259552, + "learning_rate": 4.247346737189392e-05, + "loss": 1.695, + "step": 18300 + }, + { + "epoch": 5.617249846531615, + "grad_norm": 0.42259401082992554, + "learning_rate": 4.246855348706197e-05, + "loss": 1.6882, + "step": 18301 + }, + { + "epoch": 5.617556783302639, + "grad_norm": 0.2985959053039551, + "learning_rate": 4.246363967666018e-05, + "loss": 1.7236, + "step": 18302 + }, + { + "epoch": 5.6178637200736645, + "grad_norm": 0.22437956929206848, + "learning_rate": 4.245872594073714e-05, + "loss": 1.7158, + "step": 18303 + }, + { + "epoch": 5.61817065684469, + "grad_norm": 0.3165835440158844, + "learning_rate": 4.245381227934138e-05, + "loss": 1.7543, + "step": 18304 + }, + { + "epoch": 5.618477593615715, + "grad_norm": 0.2565564513206482, + "learning_rate": 4.244889869252148e-05, + 
"loss": 1.7863, + "step": 18305 + }, + { + "epoch": 5.6187845303867405, + "grad_norm": 0.25741446018218994, + "learning_rate": 4.244398518032597e-05, + "loss": 1.721, + "step": 18306 + }, + { + "epoch": 5.619091467157766, + "grad_norm": 0.26492297649383545, + "learning_rate": 4.2439071742803435e-05, + "loss": 1.7697, + "step": 18307 + }, + { + "epoch": 5.61939840392879, + "grad_norm": 0.2086823433637619, + "learning_rate": 4.243415838000243e-05, + "loss": 1.7072, + "step": 18308 + }, + { + "epoch": 5.619705340699816, + "grad_norm": 0.26784422993659973, + "learning_rate": 4.24292450919715e-05, + "loss": 1.7826, + "step": 18309 + }, + { + "epoch": 5.620012277470841, + "grad_norm": 0.21774251759052277, + "learning_rate": 4.242433187875921e-05, + "loss": 1.7204, + "step": 18310 + }, + { + "epoch": 5.620319214241866, + "grad_norm": 0.29547446966171265, + "learning_rate": 4.241941874041412e-05, + "loss": 1.7303, + "step": 18311 + }, + { + "epoch": 5.620626151012892, + "grad_norm": 0.20278988778591156, + "learning_rate": 4.241450567698476e-05, + "loss": 1.692, + "step": 18312 + }, + { + "epoch": 5.620933087783916, + "grad_norm": 0.2084289938211441, + "learning_rate": 4.240959268851971e-05, + "loss": 1.7069, + "step": 18313 + }, + { + "epoch": 5.621240024554941, + "grad_norm": 0.19901904463768005, + "learning_rate": 4.240467977506752e-05, + "loss": 1.6798, + "step": 18314 + }, + { + "epoch": 5.621546961325967, + "grad_norm": 0.24629411101341248, + "learning_rate": 4.2399766936676735e-05, + "loss": 1.775, + "step": 18315 + }, + { + "epoch": 5.621853898096992, + "grad_norm": 0.2532403767108917, + "learning_rate": 4.239485417339591e-05, + "loss": 1.7669, + "step": 18316 + }, + { + "epoch": 5.622160834868017, + "grad_norm": 0.22495722770690918, + "learning_rate": 4.2389941485273576e-05, + "loss": 1.7772, + "step": 18317 + }, + { + "epoch": 5.622467771639043, + "grad_norm": 0.2789733111858368, + "learning_rate": 4.2385028872358316e-05, + "loss": 1.751, + "step": 18318 + }, + { 
+ "epoch": 5.622774708410067, + "grad_norm": 0.2266954481601715, + "learning_rate": 4.238011633469866e-05, + "loss": 1.7213, + "step": 18319 + }, + { + "epoch": 5.6230816451810925, + "grad_norm": 0.2163502722978592, + "learning_rate": 4.237520387234316e-05, + "loss": 1.7781, + "step": 18320 + }, + { + "epoch": 5.623388581952118, + "grad_norm": 0.25249144434928894, + "learning_rate": 4.237029148534036e-05, + "loss": 1.7293, + "step": 18321 + }, + { + "epoch": 5.623695518723143, + "grad_norm": 0.2320011854171753, + "learning_rate": 4.2365379173738826e-05, + "loss": 1.7909, + "step": 18322 + }, + { + "epoch": 5.6240024554941686, + "grad_norm": 0.22074681520462036, + "learning_rate": 4.2360466937587074e-05, + "loss": 1.743, + "step": 18323 + }, + { + "epoch": 5.624309392265193, + "grad_norm": 0.20864775776863098, + "learning_rate": 4.235555477693368e-05, + "loss": 1.726, + "step": 18324 + }, + { + "epoch": 5.624616329036218, + "grad_norm": 0.24547792971134186, + "learning_rate": 4.235064269182716e-05, + "loss": 1.7646, + "step": 18325 + }, + { + "epoch": 5.624923265807244, + "grad_norm": 0.29965806007385254, + "learning_rate": 4.234573068231607e-05, + "loss": 1.7789, + "step": 18326 + }, + { + "epoch": 5.625230202578269, + "grad_norm": 0.20844583213329315, + "learning_rate": 4.234081874844896e-05, + "loss": 1.7007, + "step": 18327 + }, + { + "epoch": 5.625537139349294, + "grad_norm": 0.2455398142337799, + "learning_rate": 4.2335906890274385e-05, + "loss": 1.7094, + "step": 18328 + }, + { + "epoch": 5.62584407612032, + "grad_norm": 0.17839518189430237, + "learning_rate": 4.233099510784085e-05, + "loss": 1.6849, + "step": 18329 + }, + { + "epoch": 5.626151012891344, + "grad_norm": 0.20219004154205322, + "learning_rate": 4.232608340119693e-05, + "loss": 1.716, + "step": 18330 + }, + { + "epoch": 5.6264579496623695, + "grad_norm": 0.23570619523525238, + "learning_rate": 4.232117177039114e-05, + "loss": 1.7622, + "step": 18331 + }, + { + "epoch": 5.626764886433395, + 
"grad_norm": 0.23534397780895233, + "learning_rate": 4.231626021547204e-05, + "loss": 1.7758, + "step": 18332 + }, + { + "epoch": 5.62707182320442, + "grad_norm": 0.2177352011203766, + "learning_rate": 4.231134873648817e-05, + "loss": 1.7102, + "step": 18333 + }, + { + "epoch": 5.627378759975445, + "grad_norm": 0.22886058688163757, + "learning_rate": 4.230643733348803e-05, + "loss": 1.7766, + "step": 18334 + }, + { + "epoch": 5.62768569674647, + "grad_norm": 0.20723696053028107, + "learning_rate": 4.2301526006520215e-05, + "loss": 1.7287, + "step": 18335 + }, + { + "epoch": 5.627992633517495, + "grad_norm": 0.18612104654312134, + "learning_rate": 4.229661475563321e-05, + "loss": 1.7255, + "step": 18336 + }, + { + "epoch": 5.628299570288521, + "grad_norm": 0.26456236839294434, + "learning_rate": 4.229170358087558e-05, + "loss": 1.7388, + "step": 18337 + }, + { + "epoch": 5.628606507059546, + "grad_norm": 0.25253555178642273, + "learning_rate": 4.2286792482295845e-05, + "loss": 1.7031, + "step": 18338 + }, + { + "epoch": 5.628913443830571, + "grad_norm": 0.23093348741531372, + "learning_rate": 4.228188145994257e-05, + "loss": 1.8032, + "step": 18339 + }, + { + "epoch": 5.629220380601596, + "grad_norm": 0.24142487347126007, + "learning_rate": 4.227697051386424e-05, + "loss": 1.6621, + "step": 18340 + }, + { + "epoch": 5.629527317372621, + "grad_norm": 0.2883392572402954, + "learning_rate": 4.227205964410944e-05, + "loss": 1.7125, + "step": 18341 + }, + { + "epoch": 5.629834254143646, + "grad_norm": 0.22670713067054749, + "learning_rate": 4.226714885072665e-05, + "loss": 1.7659, + "step": 18342 + }, + { + "epoch": 5.630141190914672, + "grad_norm": 0.2795337438583374, + "learning_rate": 4.226223813376444e-05, + "loss": 1.7559, + "step": 18343 + }, + { + "epoch": 5.630448127685697, + "grad_norm": 0.2513083219528198, + "learning_rate": 4.225732749327132e-05, + "loss": 1.6969, + "step": 18344 + }, + { + "epoch": 5.6307550644567215, + "grad_norm": 0.24588467180728912, + 
"learning_rate": 4.225241692929585e-05, + "loss": 1.7724, + "step": 18345 + }, + { + "epoch": 5.631062001227747, + "grad_norm": 0.41726353764533997, + "learning_rate": 4.224750644188651e-05, + "loss": 1.7308, + "step": 18346 + }, + { + "epoch": 5.631368937998772, + "grad_norm": 0.2512385845184326, + "learning_rate": 4.2242596031091886e-05, + "loss": 1.7068, + "step": 18347 + }, + { + "epoch": 5.6316758747697975, + "grad_norm": 0.3077464997768402, + "learning_rate": 4.223768569696044e-05, + "loss": 1.7383, + "step": 18348 + }, + { + "epoch": 5.631982811540823, + "grad_norm": 0.3460720479488373, + "learning_rate": 4.2232775439540756e-05, + "loss": 1.7317, + "step": 18349 + }, + { + "epoch": 5.632289748311848, + "grad_norm": 0.24827539920806885, + "learning_rate": 4.222786525888134e-05, + "loss": 1.6871, + "step": 18350 + }, + { + "epoch": 5.632596685082873, + "grad_norm": 0.24851584434509277, + "learning_rate": 4.22229551550307e-05, + "loss": 1.7058, + "step": 18351 + }, + { + "epoch": 5.632903621853898, + "grad_norm": 0.31132519245147705, + "learning_rate": 4.2218045128037396e-05, + "loss": 1.7523, + "step": 18352 + }, + { + "epoch": 5.633210558624923, + "grad_norm": 0.3104027807712555, + "learning_rate": 4.2213135177949906e-05, + "loss": 1.7669, + "step": 18353 + }, + { + "epoch": 5.633517495395949, + "grad_norm": 0.31351104378700256, + "learning_rate": 4.2208225304816795e-05, + "loss": 1.7031, + "step": 18354 + }, + { + "epoch": 5.633824432166974, + "grad_norm": 0.3217851221561432, + "learning_rate": 4.2203315508686555e-05, + "loss": 1.7694, + "step": 18355 + }, + { + "epoch": 5.634131368937998, + "grad_norm": 0.22287796437740326, + "learning_rate": 4.2198405789607745e-05, + "loss": 1.7742, + "step": 18356 + }, + { + "epoch": 5.634438305709024, + "grad_norm": 0.20288340747356415, + "learning_rate": 4.219349614762883e-05, + "loss": 1.7113, + "step": 18357 + }, + { + "epoch": 5.634745242480049, + "grad_norm": 0.19823449850082397, + "learning_rate": 
4.218858658279839e-05, + "loss": 1.7433, + "step": 18358 + }, + { + "epoch": 5.635052179251074, + "grad_norm": 0.2756347358226776, + "learning_rate": 4.2183677095164895e-05, + "loss": 1.8278, + "step": 18359 + }, + { + "epoch": 5.6353591160221, + "grad_norm": 0.2303706556558609, + "learning_rate": 4.2178767684776895e-05, + "loss": 1.6943, + "step": 18360 + }, + { + "epoch": 5.635666052793125, + "grad_norm": 0.25089216232299805, + "learning_rate": 4.217385835168288e-05, + "loss": 1.6562, + "step": 18361 + }, + { + "epoch": 5.6359729895641495, + "grad_norm": 0.3013486862182617, + "learning_rate": 4.216894909593141e-05, + "loss": 1.7323, + "step": 18362 + }, + { + "epoch": 5.636279926335175, + "grad_norm": 0.19471928477287292, + "learning_rate": 4.2164039917570956e-05, + "loss": 1.7301, + "step": 18363 + }, + { + "epoch": 5.6365868631062, + "grad_norm": 0.3257733881473541, + "learning_rate": 4.2159130816650075e-05, + "loss": 1.7522, + "step": 18364 + }, + { + "epoch": 5.6368937998772255, + "grad_norm": 0.3065868020057678, + "learning_rate": 4.215422179321723e-05, + "loss": 1.7077, + "step": 18365 + }, + { + "epoch": 5.637200736648251, + "grad_norm": 0.20643819868564606, + "learning_rate": 4.214931284732098e-05, + "loss": 1.8033, + "step": 18366 + }, + { + "epoch": 5.637507673419275, + "grad_norm": 0.23551981151103973, + "learning_rate": 4.2144403979009826e-05, + "loss": 1.7391, + "step": 18367 + }, + { + "epoch": 5.637814610190301, + "grad_norm": 0.20602314174175262, + "learning_rate": 4.2139495188332265e-05, + "loss": 1.7593, + "step": 18368 + }, + { + "epoch": 5.638121546961326, + "grad_norm": 0.27911239862442017, + "learning_rate": 4.2134586475336834e-05, + "loss": 1.7212, + "step": 18369 + }, + { + "epoch": 5.638428483732351, + "grad_norm": 0.2700496017932892, + "learning_rate": 4.212967784007201e-05, + "loss": 1.7755, + "step": 18370 + }, + { + "epoch": 5.638735420503377, + "grad_norm": 0.24988985061645508, + "learning_rate": 4.2124769282586334e-05, + "loss": 
1.7364, + "step": 18371 + }, + { + "epoch": 5.639042357274402, + "grad_norm": 0.20491284132003784, + "learning_rate": 4.211986080292829e-05, + "loss": 1.7477, + "step": 18372 + }, + { + "epoch": 5.639349294045426, + "grad_norm": 0.24953459203243256, + "learning_rate": 4.211495240114643e-05, + "loss": 1.7712, + "step": 18373 + }, + { + "epoch": 5.639656230816452, + "grad_norm": 0.2028491199016571, + "learning_rate": 4.2110044077289204e-05, + "loss": 1.701, + "step": 18374 + }, + { + "epoch": 5.639963167587477, + "grad_norm": 0.22320568561553955, + "learning_rate": 4.210513583140517e-05, + "loss": 1.7818, + "step": 18375 + }, + { + "epoch": 5.640270104358502, + "grad_norm": 0.22680947184562683, + "learning_rate": 4.210022766354278e-05, + "loss": 1.7631, + "step": 18376 + }, + { + "epoch": 5.640577041129527, + "grad_norm": 0.20724014937877655, + "learning_rate": 4.2095319573750596e-05, + "loss": 1.7757, + "step": 18377 + }, + { + "epoch": 5.640883977900552, + "grad_norm": 0.21785953640937805, + "learning_rate": 4.209041156207708e-05, + "loss": 1.7161, + "step": 18378 + }, + { + "epoch": 5.6411909146715775, + "grad_norm": 0.21751803159713745, + "learning_rate": 4.208550362857078e-05, + "loss": 1.7449, + "step": 18379 + }, + { + "epoch": 5.641497851442603, + "grad_norm": 0.1765962839126587, + "learning_rate": 4.208059577328014e-05, + "loss": 1.7191, + "step": 18380 + }, + { + "epoch": 5.641804788213628, + "grad_norm": 0.22720913589000702, + "learning_rate": 4.2075687996253724e-05, + "loss": 1.7037, + "step": 18381 + }, + { + "epoch": 5.6421117249846535, + "grad_norm": 0.23589655756950378, + "learning_rate": 4.2070780297539976e-05, + "loss": 1.8147, + "step": 18382 + }, + { + "epoch": 5.642418661755678, + "grad_norm": 0.21187056601047516, + "learning_rate": 4.2065872677187435e-05, + "loss": 1.7655, + "step": 18383 + }, + { + "epoch": 5.642725598526703, + "grad_norm": 0.24153946340084076, + "learning_rate": 4.2060965135244606e-05, + "loss": 1.7841, + "step": 18384 + }, + 
{ + "epoch": 5.643032535297729, + "grad_norm": 0.2059229612350464, + "learning_rate": 4.205605767175995e-05, + "loss": 1.6718, + "step": 18385 + }, + { + "epoch": 5.643339472068754, + "grad_norm": 0.20235973596572876, + "learning_rate": 4.205115028678201e-05, + "loss": 1.6931, + "step": 18386 + }, + { + "epoch": 5.643646408839779, + "grad_norm": 0.25149911642074585, + "learning_rate": 4.204624298035924e-05, + "loss": 1.7465, + "step": 18387 + }, + { + "epoch": 5.643953345610804, + "grad_norm": 0.2050812691450119, + "learning_rate": 4.204133575254017e-05, + "loss": 1.7147, + "step": 18388 + }, + { + "epoch": 5.644260282381829, + "grad_norm": 0.20906420052051544, + "learning_rate": 4.2036428603373274e-05, + "loss": 1.6762, + "step": 18389 + }, + { + "epoch": 5.644567219152854, + "grad_norm": 0.20150595903396606, + "learning_rate": 4.2031521532907075e-05, + "loss": 1.678, + "step": 18390 + }, + { + "epoch": 5.64487415592388, + "grad_norm": 0.2141568511724472, + "learning_rate": 4.202661454119004e-05, + "loss": 1.7274, + "step": 18391 + }, + { + "epoch": 5.645181092694905, + "grad_norm": 0.2641741931438446, + "learning_rate": 4.202170762827069e-05, + "loss": 1.7975, + "step": 18392 + }, + { + "epoch": 5.64548802946593, + "grad_norm": 0.22928468883037567, + "learning_rate": 4.201680079419747e-05, + "loss": 1.7687, + "step": 18393 + }, + { + "epoch": 5.645794966236955, + "grad_norm": 0.22713731229305267, + "learning_rate": 4.2011894039018925e-05, + "loss": 1.7475, + "step": 18394 + }, + { + "epoch": 5.64610190300798, + "grad_norm": 0.25602981448173523, + "learning_rate": 4.200698736278351e-05, + "loss": 1.7356, + "step": 18395 + }, + { + "epoch": 5.6464088397790055, + "grad_norm": 0.2619759738445282, + "learning_rate": 4.200208076553975e-05, + "loss": 1.7334, + "step": 18396 + }, + { + "epoch": 5.646715776550031, + "grad_norm": 0.24756783246994019, + "learning_rate": 4.19971742473361e-05, + "loss": 1.7253, + "step": 18397 + }, + { + "epoch": 5.647022713321056, + 
"grad_norm": 0.2068249136209488, + "learning_rate": 4.199226780822109e-05, + "loss": 1.7246, + "step": 18398 + }, + { + "epoch": 5.647329650092081, + "grad_norm": 0.23219087719917297, + "learning_rate": 4.1987361448243165e-05, + "loss": 1.7388, + "step": 18399 + }, + { + "epoch": 5.647636586863106, + "grad_norm": 0.2051403522491455, + "learning_rate": 4.198245516745082e-05, + "loss": 1.7775, + "step": 18400 + }, + { + "epoch": 5.647943523634131, + "grad_norm": 0.26408639550209045, + "learning_rate": 4.1977548965892575e-05, + "loss": 1.8069, + "step": 18401 + }, + { + "epoch": 5.648250460405157, + "grad_norm": 0.2104891538619995, + "learning_rate": 4.197264284361687e-05, + "loss": 1.7335, + "step": 18402 + }, + { + "epoch": 5.648557397176182, + "grad_norm": 0.23963849246501923, + "learning_rate": 4.196773680067224e-05, + "loss": 1.7254, + "step": 18403 + }, + { + "epoch": 5.648864333947207, + "grad_norm": 0.2770128846168518, + "learning_rate": 4.1962830837107117e-05, + "loss": 1.7848, + "step": 18404 + }, + { + "epoch": 5.649171270718232, + "grad_norm": 0.23342710733413696, + "learning_rate": 4.195792495297002e-05, + "loss": 1.7818, + "step": 18405 + }, + { + "epoch": 5.649478207489257, + "grad_norm": 0.23835061490535736, + "learning_rate": 4.195301914830941e-05, + "loss": 1.7453, + "step": 18406 + }, + { + "epoch": 5.649785144260282, + "grad_norm": 0.21896767616271973, + "learning_rate": 4.194811342317381e-05, + "loss": 1.7205, + "step": 18407 + }, + { + "epoch": 5.650092081031308, + "grad_norm": 0.20222818851470947, + "learning_rate": 4.1943207777611646e-05, + "loss": 1.6833, + "step": 18408 + }, + { + "epoch": 5.650399017802332, + "grad_norm": 0.2182089239358902, + "learning_rate": 4.193830221167146e-05, + "loss": 1.7296, + "step": 18409 + }, + { + "epoch": 5.650705954573358, + "grad_norm": 0.19981688261032104, + "learning_rate": 4.1933396725401655e-05, + "loss": 1.7327, + "step": 18410 + }, + { + "epoch": 5.651012891344383, + "grad_norm": 0.23925067484378815, + 
"learning_rate": 4.192849131885077e-05, + "loss": 1.7545, + "step": 18411 + }, + { + "epoch": 5.651319828115408, + "grad_norm": 0.21967993676662445, + "learning_rate": 4.192358599206725e-05, + "loss": 1.6973, + "step": 18412 + }, + { + "epoch": 5.651626764886434, + "grad_norm": 0.2273840606212616, + "learning_rate": 4.1918680745099614e-05, + "loss": 1.8229, + "step": 18413 + }, + { + "epoch": 5.651933701657459, + "grad_norm": 0.26950231194496155, + "learning_rate": 4.1913775577996286e-05, + "loss": 1.7666, + "step": 18414 + }, + { + "epoch": 5.652240638428484, + "grad_norm": 0.26608848571777344, + "learning_rate": 4.190887049080579e-05, + "loss": 1.8279, + "step": 18415 + }, + { + "epoch": 5.652547575199509, + "grad_norm": 0.20856785774230957, + "learning_rate": 4.190396548357658e-05, + "loss": 1.7224, + "step": 18416 + }, + { + "epoch": 5.652854511970534, + "grad_norm": 0.2894255816936493, + "learning_rate": 4.18990605563571e-05, + "loss": 1.7308, + "step": 18417 + }, + { + "epoch": 5.653161448741559, + "grad_norm": 0.2047591209411621, + "learning_rate": 4.189415570919588e-05, + "loss": 1.758, + "step": 18418 + }, + { + "epoch": 5.653468385512585, + "grad_norm": 0.37161269783973694, + "learning_rate": 4.1889250942141346e-05, + "loss": 1.7926, + "step": 18419 + }, + { + "epoch": 5.653775322283609, + "grad_norm": 0.37338340282440186, + "learning_rate": 4.1884346255242e-05, + "loss": 1.7491, + "step": 18420 + }, + { + "epoch": 5.6540822590546345, + "grad_norm": 0.24279838800430298, + "learning_rate": 4.187944164854629e-05, + "loss": 1.7103, + "step": 18421 + }, + { + "epoch": 5.65438919582566, + "grad_norm": 0.219639852643013, + "learning_rate": 4.18745371221027e-05, + "loss": 1.7824, + "step": 18422 + }, + { + "epoch": 5.654696132596685, + "grad_norm": 0.22248409688472748, + "learning_rate": 4.186963267595969e-05, + "loss": 1.8098, + "step": 18423 + }, + { + "epoch": 5.6550030693677105, + "grad_norm": 0.2115657478570938, + "learning_rate": 4.1864728310165755e-05, + 
"loss": 1.72, + "step": 18424 + }, + { + "epoch": 5.655310006138736, + "grad_norm": 0.19723005592823029, + "learning_rate": 4.1859824024769325e-05, + "loss": 1.6818, + "step": 18425 + }, + { + "epoch": 5.65561694290976, + "grad_norm": 0.1828317642211914, + "learning_rate": 4.185491981981891e-05, + "loss": 1.7243, + "step": 18426 + }, + { + "epoch": 5.655923879680786, + "grad_norm": 0.271781861782074, + "learning_rate": 4.185001569536292e-05, + "loss": 1.7688, + "step": 18427 + }, + { + "epoch": 5.656230816451811, + "grad_norm": 0.3140811324119568, + "learning_rate": 4.184511165144986e-05, + "loss": 1.7319, + "step": 18428 + }, + { + "epoch": 5.656537753222836, + "grad_norm": 0.20013047754764557, + "learning_rate": 4.184020768812818e-05, + "loss": 1.7104, + "step": 18429 + }, + { + "epoch": 5.656844689993862, + "grad_norm": 0.2615044414997101, + "learning_rate": 4.183530380544638e-05, + "loss": 1.7314, + "step": 18430 + }, + { + "epoch": 5.657151626764886, + "grad_norm": 0.2645856440067291, + "learning_rate": 4.183040000345287e-05, + "loss": 1.7431, + "step": 18431 + }, + { + "epoch": 5.657458563535911, + "grad_norm": 0.1916145384311676, + "learning_rate": 4.182549628219615e-05, + "loss": 1.7013, + "step": 18432 + }, + { + "epoch": 5.657765500306937, + "grad_norm": 0.2647114396095276, + "learning_rate": 4.182059264172466e-05, + "loss": 1.7278, + "step": 18433 + }, + { + "epoch": 5.658072437077962, + "grad_norm": 0.20201756060123444, + "learning_rate": 4.1815689082086854e-05, + "loss": 1.7065, + "step": 18434 + }, + { + "epoch": 5.658379373848987, + "grad_norm": 0.23892022669315338, + "learning_rate": 4.181078560333123e-05, + "loss": 1.7365, + "step": 18435 + }, + { + "epoch": 5.658686310620013, + "grad_norm": 0.3125975728034973, + "learning_rate": 4.18058822055062e-05, + "loss": 1.7152, + "step": 18436 + }, + { + "epoch": 5.658993247391037, + "grad_norm": 0.18924804031848907, + "learning_rate": 4.180097888866027e-05, + "loss": 1.7763, + "step": 18437 + }, + { + 
"epoch": 5.6593001841620625, + "grad_norm": 0.28476929664611816, + "learning_rate": 4.1796075652841845e-05, + "loss": 1.7517, + "step": 18438 + }, + { + "epoch": 5.659607120933088, + "grad_norm": 0.30616337060928345, + "learning_rate": 4.1791172498099416e-05, + "loss": 1.7446, + "step": 18439 + }, + { + "epoch": 5.659914057704113, + "grad_norm": 0.3219330608844757, + "learning_rate": 4.1786269424481426e-05, + "loss": 1.8374, + "step": 18440 + }, + { + "epoch": 5.6602209944751385, + "grad_norm": 0.34074151515960693, + "learning_rate": 4.1781366432036364e-05, + "loss": 1.7915, + "step": 18441 + }, + { + "epoch": 5.660527931246163, + "grad_norm": 0.2321610003709793, + "learning_rate": 4.177646352081263e-05, + "loss": 1.7361, + "step": 18442 + }, + { + "epoch": 5.660834868017188, + "grad_norm": 0.34283575415611267, + "learning_rate": 4.1771560690858716e-05, + "loss": 1.6859, + "step": 18443 + }, + { + "epoch": 5.661141804788214, + "grad_norm": 0.32274290919303894, + "learning_rate": 4.1766657942223055e-05, + "loss": 1.7376, + "step": 18444 + }, + { + "epoch": 5.661448741559239, + "grad_norm": 0.23960906267166138, + "learning_rate": 4.1761755274954105e-05, + "loss": 1.7198, + "step": 18445 + }, + { + "epoch": 5.661755678330264, + "grad_norm": 0.2622305154800415, + "learning_rate": 4.175685268910031e-05, + "loss": 1.6997, + "step": 18446 + }, + { + "epoch": 5.66206261510129, + "grad_norm": 0.19836951792240143, + "learning_rate": 4.1751950184710157e-05, + "loss": 1.6612, + "step": 18447 + }, + { + "epoch": 5.662369551872314, + "grad_norm": 0.29541507363319397, + "learning_rate": 4.174704776183204e-05, + "loss": 1.7606, + "step": 18448 + }, + { + "epoch": 5.662676488643339, + "grad_norm": 0.21632203459739685, + "learning_rate": 4.174214542051445e-05, + "loss": 1.7108, + "step": 18449 + }, + { + "epoch": 5.662983425414365, + "grad_norm": 0.2851164638996124, + "learning_rate": 4.173724316080582e-05, + "loss": 1.747, + "step": 18450 + }, + { + "epoch": 5.66329036218539, + 
"grad_norm": 0.30293309688568115, + "learning_rate": 4.173234098275458e-05, + "loss": 1.7549, + "step": 18451 + }, + { + "epoch": 5.6635972989564145, + "grad_norm": 0.2131963074207306, + "learning_rate": 4.172743888640921e-05, + "loss": 1.7804, + "step": 18452 + }, + { + "epoch": 5.66390423572744, + "grad_norm": 0.234910249710083, + "learning_rate": 4.172253687181812e-05, + "loss": 1.7149, + "step": 18453 + }, + { + "epoch": 5.664211172498465, + "grad_norm": 0.21238654851913452, + "learning_rate": 4.171763493902979e-05, + "loss": 1.7272, + "step": 18454 + }, + { + "epoch": 5.6645181092694905, + "grad_norm": 0.20571236312389374, + "learning_rate": 4.171273308809263e-05, + "loss": 1.713, + "step": 18455 + }, + { + "epoch": 5.664825046040516, + "grad_norm": 0.24867361783981323, + "learning_rate": 4.1707831319055104e-05, + "loss": 1.682, + "step": 18456 + }, + { + "epoch": 5.665131982811541, + "grad_norm": 0.20556440949440002, + "learning_rate": 4.170292963196564e-05, + "loss": 1.7126, + "step": 18457 + }, + { + "epoch": 5.665438919582566, + "grad_norm": 0.26431065797805786, + "learning_rate": 4.169802802687271e-05, + "loss": 1.8142, + "step": 18458 + }, + { + "epoch": 5.665745856353591, + "grad_norm": 0.26041486859321594, + "learning_rate": 4.169312650382471e-05, + "loss": 1.7206, + "step": 18459 + }, + { + "epoch": 5.666052793124616, + "grad_norm": 0.2190525084733963, + "learning_rate": 4.1688225062870126e-05, + "loss": 1.787, + "step": 18460 + }, + { + "epoch": 5.666359729895642, + "grad_norm": 0.24726425111293793, + "learning_rate": 4.1683323704057354e-05, + "loss": 1.7677, + "step": 18461 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.22206442058086395, + "learning_rate": 4.167842242743486e-05, + "loss": 1.73, + "step": 18462 + }, + { + "epoch": 5.666973603437691, + "grad_norm": 0.22501195967197418, + "learning_rate": 4.167352123305108e-05, + "loss": 1.7213, + "step": 18463 + }, + { + "epoch": 5.667280540208717, + "grad_norm": 0.26164770126342773, + 
"learning_rate": 4.166862012095443e-05, + "loss": 1.7839, + "step": 18464 + }, + { + "epoch": 5.667587476979742, + "grad_norm": 0.19480809569358826, + "learning_rate": 4.166371909119336e-05, + "loss": 1.7562, + "step": 18465 + }, + { + "epoch": 5.667894413750767, + "grad_norm": 0.26677292585372925, + "learning_rate": 4.165881814381632e-05, + "loss": 1.776, + "step": 18466 + }, + { + "epoch": 5.668201350521793, + "grad_norm": 0.22019581496715546, + "learning_rate": 4.165391727887172e-05, + "loss": 1.7575, + "step": 18467 + }, + { + "epoch": 5.668508287292818, + "grad_norm": 0.23851899802684784, + "learning_rate": 4.1649016496407986e-05, + "loss": 1.7346, + "step": 18468 + }, + { + "epoch": 5.6688152240638425, + "grad_norm": 0.3118130564689636, + "learning_rate": 4.1644115796473596e-05, + "loss": 1.7808, + "step": 18469 + }, + { + "epoch": 5.669122160834868, + "grad_norm": 0.22783879935741425, + "learning_rate": 4.163921517911692e-05, + "loss": 1.831, + "step": 18470 + }, + { + "epoch": 5.669429097605893, + "grad_norm": 0.2203773707151413, + "learning_rate": 4.163431464438645e-05, + "loss": 1.7034, + "step": 18471 + }, + { + "epoch": 5.6697360343769185, + "grad_norm": 0.21838103234767914, + "learning_rate": 4.162941419233056e-05, + "loss": 1.7553, + "step": 18472 + }, + { + "epoch": 5.670042971147944, + "grad_norm": 0.18453563749790192, + "learning_rate": 4.162451382299771e-05, + "loss": 1.7139, + "step": 18473 + }, + { + "epoch": 5.670349907918968, + "grad_norm": 0.25308313965797424, + "learning_rate": 4.161961353643633e-05, + "loss": 1.7291, + "step": 18474 + }, + { + "epoch": 5.670656844689994, + "grad_norm": 0.2528827488422394, + "learning_rate": 4.1614713332694845e-05, + "loss": 1.781, + "step": 18475 + }, + { + "epoch": 5.670963781461019, + "grad_norm": 0.24774135649204254, + "learning_rate": 4.160981321182166e-05, + "loss": 1.7808, + "step": 18476 + }, + { + "epoch": 5.671270718232044, + "grad_norm": 0.25225830078125, + "learning_rate": 4.160491317386524e-05, 
+ "loss": 1.739, + "step": 18477 + }, + { + "epoch": 5.67157765500307, + "grad_norm": 0.2095808982849121, + "learning_rate": 4.160001321887397e-05, + "loss": 1.7242, + "step": 18478 + }, + { + "epoch": 5.671884591774095, + "grad_norm": 0.23906216025352478, + "learning_rate": 4.159511334689631e-05, + "loss": 1.7071, + "step": 18479 + }, + { + "epoch": 5.672191528545119, + "grad_norm": 0.21851155161857605, + "learning_rate": 4.159021355798065e-05, + "loss": 1.7171, + "step": 18480 + }, + { + "epoch": 5.672498465316145, + "grad_norm": 0.2005140632390976, + "learning_rate": 4.158531385217544e-05, + "loss": 1.7483, + "step": 18481 + }, + { + "epoch": 5.67280540208717, + "grad_norm": 0.2230832278728485, + "learning_rate": 4.1580414229529074e-05, + "loss": 1.7386, + "step": 18482 + }, + { + "epoch": 5.673112338858195, + "grad_norm": 0.22402967512607574, + "learning_rate": 4.1575514690090014e-05, + "loss": 1.7989, + "step": 18483 + }, + { + "epoch": 5.67341927562922, + "grad_norm": 0.20350080728530884, + "learning_rate": 4.157061523390665e-05, + "loss": 1.6856, + "step": 18484 + }, + { + "epoch": 5.673726212400245, + "grad_norm": 0.2039422243833542, + "learning_rate": 4.15657158610274e-05, + "loss": 1.7262, + "step": 18485 + }, + { + "epoch": 5.6740331491712706, + "grad_norm": 0.20411522686481476, + "learning_rate": 4.156081657150069e-05, + "loss": 1.738, + "step": 18486 + }, + { + "epoch": 5.674340085942296, + "grad_norm": 0.2693086862564087, + "learning_rate": 4.155591736537493e-05, + "loss": 1.731, + "step": 18487 + }, + { + "epoch": 5.674647022713321, + "grad_norm": 0.20745019614696503, + "learning_rate": 4.1551018242698567e-05, + "loss": 1.7138, + "step": 18488 + }, + { + "epoch": 5.6749539594843466, + "grad_norm": 0.22033964097499847, + "learning_rate": 4.1546119203519964e-05, + "loss": 1.8144, + "step": 18489 + }, + { + "epoch": 5.675260896255372, + "grad_norm": 0.22859029471874237, + "learning_rate": 4.154122024788759e-05, + "loss": 1.6724, + "step": 18490 + }, + { 
+ "epoch": 5.675567833026396, + "grad_norm": 0.2226465791463852, + "learning_rate": 4.153632137584982e-05, + "loss": 1.731, + "step": 18491 + }, + { + "epoch": 5.675874769797422, + "grad_norm": 0.19657716155052185, + "learning_rate": 4.1531422587455086e-05, + "loss": 1.6937, + "step": 18492 + }, + { + "epoch": 5.676181706568447, + "grad_norm": 0.23167578876018524, + "learning_rate": 4.152652388275179e-05, + "loss": 1.7444, + "step": 18493 + }, + { + "epoch": 5.676488643339472, + "grad_norm": 0.24468563497066498, + "learning_rate": 4.1521625261788374e-05, + "loss": 1.7173, + "step": 18494 + }, + { + "epoch": 5.676795580110497, + "grad_norm": 0.27125802636146545, + "learning_rate": 4.1516726724613206e-05, + "loss": 1.7424, + "step": 18495 + }, + { + "epoch": 5.677102516881522, + "grad_norm": 0.23816901445388794, + "learning_rate": 4.151182827127473e-05, + "loss": 1.6911, + "step": 18496 + }, + { + "epoch": 5.6774094536525475, + "grad_norm": 0.26058733463287354, + "learning_rate": 4.150692990182133e-05, + "loss": 1.7142, + "step": 18497 + }, + { + "epoch": 5.677716390423573, + "grad_norm": 0.20207929611206055, + "learning_rate": 4.150203161630143e-05, + "loss": 1.7506, + "step": 18498 + }, + { + "epoch": 5.678023327194598, + "grad_norm": 0.259857714176178, + "learning_rate": 4.1497133414763435e-05, + "loss": 1.7181, + "step": 18499 + }, + { + "epoch": 5.6783302639656235, + "grad_norm": 0.2607496380805969, + "learning_rate": 4.149223529725577e-05, + "loss": 1.7829, + "step": 18500 + }, + { + "epoch": 5.678637200736648, + "grad_norm": 0.23265719413757324, + "learning_rate": 4.148733726382681e-05, + "loss": 1.7028, + "step": 18501 + }, + { + "epoch": 5.678944137507673, + "grad_norm": 0.26610276103019714, + "learning_rate": 4.1482439314524964e-05, + "loss": 1.8604, + "step": 18502 + }, + { + "epoch": 5.679251074278699, + "grad_norm": 0.24022582173347473, + "learning_rate": 4.147754144939865e-05, + "loss": 1.7142, + "step": 18503 + }, + { + "epoch": 5.679558011049724, + 
"grad_norm": 0.2849755585193634, + "learning_rate": 4.1472643668496255e-05, + "loss": 1.6956, + "step": 18504 + }, + { + "epoch": 5.679864947820749, + "grad_norm": 0.24330341815948486, + "learning_rate": 4.1467745971866216e-05, + "loss": 1.7617, + "step": 18505 + }, + { + "epoch": 5.680171884591774, + "grad_norm": 0.21072770655155182, + "learning_rate": 4.146284835955689e-05, + "loss": 1.6999, + "step": 18506 + }, + { + "epoch": 5.680478821362799, + "grad_norm": 0.1971336454153061, + "learning_rate": 4.145795083161673e-05, + "loss": 1.6756, + "step": 18507 + }, + { + "epoch": 5.680785758133824, + "grad_norm": 0.18576614558696747, + "learning_rate": 4.1453053388094073e-05, + "loss": 1.6885, + "step": 18508 + }, + { + "epoch": 5.68109269490485, + "grad_norm": 0.21335965394973755, + "learning_rate": 4.144815602903737e-05, + "loss": 1.7278, + "step": 18509 + }, + { + "epoch": 5.681399631675875, + "grad_norm": 0.21756233274936676, + "learning_rate": 4.1443258754494986e-05, + "loss": 1.7549, + "step": 18510 + }, + { + "epoch": 5.6817065684469, + "grad_norm": 0.2214142084121704, + "learning_rate": 4.143836156451536e-05, + "loss": 1.6654, + "step": 18511 + }, + { + "epoch": 5.682013505217925, + "grad_norm": 0.2230863869190216, + "learning_rate": 4.143346445914684e-05, + "loss": 1.7286, + "step": 18512 + }, + { + "epoch": 5.68232044198895, + "grad_norm": 0.2283746749162674, + "learning_rate": 4.142856743843787e-05, + "loss": 1.7652, + "step": 18513 + }, + { + "epoch": 5.6826273787599755, + "grad_norm": 0.20059749484062195, + "learning_rate": 4.142367050243679e-05, + "loss": 1.6854, + "step": 18514 + }, + { + "epoch": 5.682934315531001, + "grad_norm": 0.17887794971466064, + "learning_rate": 4.141877365119204e-05, + "loss": 1.6975, + "step": 18515 + }, + { + "epoch": 5.683241252302026, + "grad_norm": 0.21266087889671326, + "learning_rate": 4.141387688475199e-05, + "loss": 1.7361, + "step": 18516 + }, + { + "epoch": 5.683548189073051, + "grad_norm": 0.20075422525405884, + 
"learning_rate": 4.140898020316506e-05, + "loss": 1.7496, + "step": 18517 + }, + { + "epoch": 5.683855125844076, + "grad_norm": 0.21430443227291107, + "learning_rate": 4.140408360647963e-05, + "loss": 1.7481, + "step": 18518 + }, + { + "epoch": 5.684162062615101, + "grad_norm": 0.1951984018087387, + "learning_rate": 4.139918709474405e-05, + "loss": 1.713, + "step": 18519 + }, + { + "epoch": 5.684468999386127, + "grad_norm": 0.21636274456977844, + "learning_rate": 4.1394290668006764e-05, + "loss": 1.8169, + "step": 18520 + }, + { + "epoch": 5.684775936157152, + "grad_norm": 0.21003715693950653, + "learning_rate": 4.138939432631613e-05, + "loss": 1.7453, + "step": 18521 + }, + { + "epoch": 5.685082872928177, + "grad_norm": 0.23559699952602386, + "learning_rate": 4.138449806972057e-05, + "loss": 1.7534, + "step": 18522 + }, + { + "epoch": 5.685389809699202, + "grad_norm": 0.23322029411792755, + "learning_rate": 4.137960189826843e-05, + "loss": 1.7535, + "step": 18523 + }, + { + "epoch": 5.685696746470227, + "grad_norm": 0.1998462826013565, + "learning_rate": 4.137470581200813e-05, + "loss": 1.7025, + "step": 18524 + }, + { + "epoch": 5.686003683241252, + "grad_norm": 0.22321350872516632, + "learning_rate": 4.1369809810988025e-05, + "loss": 1.7666, + "step": 18525 + }, + { + "epoch": 5.686310620012278, + "grad_norm": 0.20851604640483856, + "learning_rate": 4.136491389525653e-05, + "loss": 1.6958, + "step": 18526 + }, + { + "epoch": 5.686617556783302, + "grad_norm": 0.21494868397712708, + "learning_rate": 4.136001806486201e-05, + "loss": 1.7703, + "step": 18527 + }, + { + "epoch": 5.6869244935543275, + "grad_norm": 0.19872798025608063, + "learning_rate": 4.135512231985287e-05, + "loss": 1.7451, + "step": 18528 + }, + { + "epoch": 5.687231430325353, + "grad_norm": 0.2424371987581253, + "learning_rate": 4.1350226660277456e-05, + "loss": 1.8153, + "step": 18529 + }, + { + "epoch": 5.687538367096378, + "grad_norm": 0.20388297736644745, + "learning_rate": 
4.1345331086184196e-05, + "loss": 1.6882, + "step": 18530 + }, + { + "epoch": 5.6878453038674035, + "grad_norm": 0.22662605345249176, + "learning_rate": 4.134043559762143e-05, + "loss": 1.7532, + "step": 18531 + }, + { + "epoch": 5.688152240638429, + "grad_norm": 0.2281452864408493, + "learning_rate": 4.133554019463756e-05, + "loss": 1.769, + "step": 18532 + }, + { + "epoch": 5.688459177409453, + "grad_norm": 0.2303505390882492, + "learning_rate": 4.1330644877280955e-05, + "loss": 1.7176, + "step": 18533 + }, + { + "epoch": 5.688766114180479, + "grad_norm": 0.24411743879318237, + "learning_rate": 4.132574964560001e-05, + "loss": 1.7557, + "step": 18534 + }, + { + "epoch": 5.689073050951504, + "grad_norm": 0.2674088776111603, + "learning_rate": 4.13208544996431e-05, + "loss": 1.6997, + "step": 18535 + }, + { + "epoch": 5.689379987722529, + "grad_norm": 0.22232958674430847, + "learning_rate": 4.1315959439458565e-05, + "loss": 1.7731, + "step": 18536 + }, + { + "epoch": 5.689686924493555, + "grad_norm": 0.23894453048706055, + "learning_rate": 4.131106446509483e-05, + "loss": 1.7454, + "step": 18537 + }, + { + "epoch": 5.689993861264579, + "grad_norm": 0.19710026681423187, + "learning_rate": 4.1306169576600226e-05, + "loss": 1.6872, + "step": 18538 + }, + { + "epoch": 5.690300798035604, + "grad_norm": 0.1879546344280243, + "learning_rate": 4.130127477402318e-05, + "loss": 1.6929, + "step": 18539 + }, + { + "epoch": 5.69060773480663, + "grad_norm": 0.1964653730392456, + "learning_rate": 4.129638005741201e-05, + "loss": 1.7778, + "step": 18540 + }, + { + "epoch": 5.690914671577655, + "grad_norm": 0.20161493122577667, + "learning_rate": 4.129148542681513e-05, + "loss": 1.7388, + "step": 18541 + }, + { + "epoch": 5.69122160834868, + "grad_norm": 0.26742830872535706, + "learning_rate": 4.1286590882280886e-05, + "loss": 1.7472, + "step": 18542 + }, + { + "epoch": 5.691528545119706, + "grad_norm": 0.2613312900066376, + "learning_rate": 4.128169642385766e-05, + "loss": 1.7656, 
+ "step": 18543 + }, + { + "epoch": 5.69183548189073, + "grad_norm": 0.17979474365711212, + "learning_rate": 4.127680205159381e-05, + "loss": 1.6992, + "step": 18544 + }, + { + "epoch": 5.6921424186617555, + "grad_norm": 0.23575037717819214, + "learning_rate": 4.1271907765537745e-05, + "loss": 1.7399, + "step": 18545 + }, + { + "epoch": 5.692449355432781, + "grad_norm": 0.19461458921432495, + "learning_rate": 4.126701356573777e-05, + "loss": 1.709, + "step": 18546 + }, + { + "epoch": 5.692756292203806, + "grad_norm": 0.19715365767478943, + "learning_rate": 4.1262119452242306e-05, + "loss": 1.7634, + "step": 18547 + }, + { + "epoch": 5.6930632289748315, + "grad_norm": 0.21454904973506927, + "learning_rate": 4.125722542509969e-05, + "loss": 1.7663, + "step": 18548 + }, + { + "epoch": 5.693370165745856, + "grad_norm": 0.19884896278381348, + "learning_rate": 4.12523314843583e-05, + "loss": 1.7618, + "step": 18549 + }, + { + "epoch": 5.693677102516881, + "grad_norm": 0.2080020159482956, + "learning_rate": 4.124743763006648e-05, + "loss": 1.7379, + "step": 18550 + }, + { + "epoch": 5.693984039287907, + "grad_norm": 0.18780875205993652, + "learning_rate": 4.124254386227264e-05, + "loss": 1.7036, + "step": 18551 + }, + { + "epoch": 5.694290976058932, + "grad_norm": 0.2114439308643341, + "learning_rate": 4.123765018102512e-05, + "loss": 1.6873, + "step": 18552 + }, + { + "epoch": 5.694597912829957, + "grad_norm": 0.1712789535522461, + "learning_rate": 4.123275658637225e-05, + "loss": 1.6772, + "step": 18553 + }, + { + "epoch": 5.694904849600983, + "grad_norm": 0.2435859888792038, + "learning_rate": 4.122786307836243e-05, + "loss": 1.7946, + "step": 18554 + }, + { + "epoch": 5.695211786372007, + "grad_norm": 0.20587889850139618, + "learning_rate": 4.122296965704399e-05, + "loss": 1.7459, + "step": 18555 + }, + { + "epoch": 5.695518723143032, + "grad_norm": 0.2183443009853363, + "learning_rate": 4.121807632246534e-05, + "loss": 1.7036, + "step": 18556 + }, + { + "epoch": 
5.695825659914058, + "grad_norm": 0.19276869297027588, + "learning_rate": 4.121318307467478e-05, + "loss": 1.7371, + "step": 18557 + }, + { + "epoch": 5.696132596685083, + "grad_norm": 0.19815512001514435, + "learning_rate": 4.120828991372072e-05, + "loss": 1.7038, + "step": 18558 + }, + { + "epoch": 5.696439533456108, + "grad_norm": 0.18509675562381744, + "learning_rate": 4.120339683965146e-05, + "loss": 1.6936, + "step": 18559 + }, + { + "epoch": 5.696746470227133, + "grad_norm": 0.2296193689107895, + "learning_rate": 4.1198503852515416e-05, + "loss": 1.7626, + "step": 18560 + }, + { + "epoch": 5.697053406998158, + "grad_norm": 0.2064799964427948, + "learning_rate": 4.11936109523609e-05, + "loss": 1.7387, + "step": 18561 + }, + { + "epoch": 5.6973603437691835, + "grad_norm": 0.20171360671520233, + "learning_rate": 4.1188718139236296e-05, + "loss": 1.7372, + "step": 18562 + }, + { + "epoch": 5.697667280540209, + "grad_norm": 0.19421936571598053, + "learning_rate": 4.118382541318993e-05, + "loss": 1.7187, + "step": 18563 + }, + { + "epoch": 5.697974217311234, + "grad_norm": 0.22517532110214233, + "learning_rate": 4.117893277427018e-05, + "loss": 1.7503, + "step": 18564 + }, + { + "epoch": 5.6982811540822595, + "grad_norm": 0.2293393909931183, + "learning_rate": 4.1174040222525366e-05, + "loss": 1.7174, + "step": 18565 + }, + { + "epoch": 5.698588090853284, + "grad_norm": 0.24003073573112488, + "learning_rate": 4.1169147758003876e-05, + "loss": 1.7829, + "step": 18566 + }, + { + "epoch": 5.698895027624309, + "grad_norm": 0.21476133167743683, + "learning_rate": 4.1164255380754034e-05, + "loss": 1.7906, + "step": 18567 + }, + { + "epoch": 5.699201964395335, + "grad_norm": 0.21347576379776, + "learning_rate": 4.115936309082422e-05, + "loss": 1.6986, + "step": 18568 + }, + { + "epoch": 5.69950890116636, + "grad_norm": 0.22650402784347534, + "learning_rate": 4.115447088826276e-05, + "loss": 1.7949, + "step": 18569 + }, + { + "epoch": 5.699815837937384, + "grad_norm": 
0.25815197825431824, + "learning_rate": 4.114957877311799e-05, + "loss": 1.7499, + "step": 18570 + }, + { + "epoch": 5.70012277470841, + "grad_norm": 0.22644442319869995, + "learning_rate": 4.1144686745438265e-05, + "loss": 1.7689, + "step": 18571 + }, + { + "epoch": 5.700429711479435, + "grad_norm": 0.241188645362854, + "learning_rate": 4.113979480527194e-05, + "loss": 1.7341, + "step": 18572 + }, + { + "epoch": 5.7007366482504604, + "grad_norm": 0.20984862744808197, + "learning_rate": 4.1134902952667365e-05, + "loss": 1.7091, + "step": 18573 + }, + { + "epoch": 5.701043585021486, + "grad_norm": 0.25150877237319946, + "learning_rate": 4.113001118767286e-05, + "loss": 1.723, + "step": 18574 + }, + { + "epoch": 5.701350521792511, + "grad_norm": 0.21693028509616852, + "learning_rate": 4.1125119510336804e-05, + "loss": 1.7483, + "step": 18575 + }, + { + "epoch": 5.701657458563536, + "grad_norm": 0.2620212733745575, + "learning_rate": 4.11202279207075e-05, + "loss": 1.8159, + "step": 18576 + }, + { + "epoch": 5.701964395334561, + "grad_norm": 0.18722239136695862, + "learning_rate": 4.111533641883332e-05, + "loss": 1.7197, + "step": 18577 + }, + { + "epoch": 5.702271332105586, + "grad_norm": 0.21321091055870056, + "learning_rate": 4.111044500476258e-05, + "loss": 1.7408, + "step": 18578 + }, + { + "epoch": 5.702578268876612, + "grad_norm": 0.24459265172481537, + "learning_rate": 4.110555367854365e-05, + "loss": 1.8304, + "step": 18579 + }, + { + "epoch": 5.702885205647637, + "grad_norm": 0.24987100064754486, + "learning_rate": 4.110066244022483e-05, + "loss": 1.7051, + "step": 18580 + }, + { + "epoch": 5.703192142418661, + "grad_norm": 0.19059090316295624, + "learning_rate": 4.1095771289854506e-05, + "loss": 1.7489, + "step": 18581 + }, + { + "epoch": 5.703499079189687, + "grad_norm": 0.23020480573177338, + "learning_rate": 4.1090880227480966e-05, + "loss": 1.7101, + "step": 18582 + }, + { + "epoch": 5.703806015960712, + "grad_norm": 0.18733634054660797, + 
"learning_rate": 4.108598925315258e-05, + "loss": 1.7116, + "step": 18583 + }, + { + "epoch": 5.704112952731737, + "grad_norm": 0.1959095001220703, + "learning_rate": 4.108109836691766e-05, + "loss": 1.7283, + "step": 18584 + }, + { + "epoch": 5.704419889502763, + "grad_norm": 0.22685091197490692, + "learning_rate": 4.107620756882457e-05, + "loss": 1.7588, + "step": 18585 + }, + { + "epoch": 5.704726826273788, + "grad_norm": 0.1998603790998459, + "learning_rate": 4.107131685892164e-05, + "loss": 1.7071, + "step": 18586 + }, + { + "epoch": 5.7050337630448125, + "grad_norm": 0.2018733024597168, + "learning_rate": 4.106642623725717e-05, + "loss": 1.6782, + "step": 18587 + }, + { + "epoch": 5.705340699815838, + "grad_norm": 0.21826615929603577, + "learning_rate": 4.106153570387951e-05, + "loss": 1.736, + "step": 18588 + }, + { + "epoch": 5.705647636586863, + "grad_norm": 0.20197603106498718, + "learning_rate": 4.105664525883699e-05, + "loss": 1.6921, + "step": 18589 + }, + { + "epoch": 5.7059545733578885, + "grad_norm": 0.20943905413150787, + "learning_rate": 4.105175490217796e-05, + "loss": 1.665, + "step": 18590 + }, + { + "epoch": 5.706261510128914, + "grad_norm": 0.202060267329216, + "learning_rate": 4.104686463395071e-05, + "loss": 1.714, + "step": 18591 + }, + { + "epoch": 5.706568446899938, + "grad_norm": 0.220698744058609, + "learning_rate": 4.1041974454203623e-05, + "loss": 1.8076, + "step": 18592 + }, + { + "epoch": 5.706875383670964, + "grad_norm": 0.21536946296691895, + "learning_rate": 4.103708436298497e-05, + "loss": 1.6801, + "step": 18593 + }, + { + "epoch": 5.707182320441989, + "grad_norm": 0.21442468464374542, + "learning_rate": 4.103219436034311e-05, + "loss": 1.6921, + "step": 18594 + }, + { + "epoch": 5.707489257213014, + "grad_norm": 0.2047559767961502, + "learning_rate": 4.1027304446326356e-05, + "loss": 1.7861, + "step": 18595 + }, + { + "epoch": 5.70779619398404, + "grad_norm": 0.20304669439792633, + "learning_rate": 4.102241462098305e-05, + 
"loss": 1.7751, + "step": 18596 + }, + { + "epoch": 5.708103130755065, + "grad_norm": 0.18702620267868042, + "learning_rate": 4.101752488436149e-05, + "loss": 1.6951, + "step": 18597 + }, + { + "epoch": 5.708410067526089, + "grad_norm": 0.1821923404932022, + "learning_rate": 4.1012635236510034e-05, + "loss": 1.711, + "step": 18598 + }, + { + "epoch": 5.708717004297115, + "grad_norm": 0.19422096014022827, + "learning_rate": 4.100774567747696e-05, + "loss": 1.7202, + "step": 18599 + }, + { + "epoch": 5.70902394106814, + "grad_norm": 0.20800530910491943, + "learning_rate": 4.100285620731063e-05, + "loss": 1.7403, + "step": 18600 + }, + { + "epoch": 5.709330877839165, + "grad_norm": 0.221746027469635, + "learning_rate": 4.099796682605934e-05, + "loss": 1.7769, + "step": 18601 + }, + { + "epoch": 5.70963781461019, + "grad_norm": 0.19284313917160034, + "learning_rate": 4.099307753377143e-05, + "loss": 1.692, + "step": 18602 + }, + { + "epoch": 5.709944751381215, + "grad_norm": 0.17635129392147064, + "learning_rate": 4.0988188330495216e-05, + "loss": 1.7212, + "step": 18603 + }, + { + "epoch": 5.7102516881522405, + "grad_norm": 0.17728061974048615, + "learning_rate": 4.098329921627898e-05, + "loss": 1.7217, + "step": 18604 + }, + { + "epoch": 5.710558624923266, + "grad_norm": 0.19998152554035187, + "learning_rate": 4.097841019117108e-05, + "loss": 1.7583, + "step": 18605 + }, + { + "epoch": 5.710865561694291, + "grad_norm": 0.18840095400810242, + "learning_rate": 4.09735212552198e-05, + "loss": 1.7353, + "step": 18606 + }, + { + "epoch": 5.7111724984653165, + "grad_norm": 0.2528367042541504, + "learning_rate": 4.09686324084735e-05, + "loss": 1.7576, + "step": 18607 + }, + { + "epoch": 5.711479435236341, + "grad_norm": 0.27240338921546936, + "learning_rate": 4.096374365098045e-05, + "loss": 1.7303, + "step": 18608 + }, + { + "epoch": 5.711786372007366, + "grad_norm": 0.20187151432037354, + "learning_rate": 4.0958854982789e-05, + "loss": 1.7599, + "step": 18609 + }, + { + 
"epoch": 5.712093308778392, + "grad_norm": 0.24890528619289398, + "learning_rate": 4.095396640394742e-05, + "loss": 1.7737, + "step": 18610 + }, + { + "epoch": 5.712400245549417, + "grad_norm": 0.21524454653263092, + "learning_rate": 4.094907791450406e-05, + "loss": 1.7704, + "step": 18611 + }, + { + "epoch": 5.712707182320442, + "grad_norm": 0.20070379972457886, + "learning_rate": 4.094418951450721e-05, + "loss": 1.7358, + "step": 18612 + }, + { + "epoch": 5.713014119091467, + "grad_norm": 0.2252196967601776, + "learning_rate": 4.09393012040052e-05, + "loss": 1.7262, + "step": 18613 + }, + { + "epoch": 5.713321055862492, + "grad_norm": 0.19511987268924713, + "learning_rate": 4.093441298304631e-05, + "loss": 1.7146, + "step": 18614 + }, + { + "epoch": 5.713627992633517, + "grad_norm": 0.2047072798013687, + "learning_rate": 4.092952485167888e-05, + "loss": 1.7864, + "step": 18615 + }, + { + "epoch": 5.713934929404543, + "grad_norm": 0.21794871985912323, + "learning_rate": 4.092463680995119e-05, + "loss": 1.7759, + "step": 18616 + }, + { + "epoch": 5.714241866175568, + "grad_norm": 0.23863841593265533, + "learning_rate": 4.0919748857911566e-05, + "loss": 1.7207, + "step": 18617 + }, + { + "epoch": 5.714548802946593, + "grad_norm": 0.19706958532333374, + "learning_rate": 4.09148609956083e-05, + "loss": 1.7247, + "step": 18618 + }, + { + "epoch": 5.714855739717618, + "grad_norm": 0.23663771152496338, + "learning_rate": 4.090997322308971e-05, + "loss": 1.7929, + "step": 18619 + }, + { + "epoch": 5.715162676488643, + "grad_norm": 0.23079079389572144, + "learning_rate": 4.09050855404041e-05, + "loss": 1.763, + "step": 18620 + }, + { + "epoch": 5.7154696132596685, + "grad_norm": 0.23883379995822906, + "learning_rate": 4.0900197947599736e-05, + "loss": 1.7995, + "step": 18621 + }, + { + "epoch": 5.715776550030694, + "grad_norm": 0.2125123143196106, + "learning_rate": 4.0895310444724974e-05, + "loss": 1.8045, + "step": 18622 + }, + { + "epoch": 5.716083486801719, + 
"grad_norm": 0.21062424778938293, + "learning_rate": 4.0890423031828076e-05, + "loss": 1.7348, + "step": 18623 + }, + { + "epoch": 5.716390423572744, + "grad_norm": 0.24079614877700806, + "learning_rate": 4.088553570895737e-05, + "loss": 1.7462, + "step": 18624 + }, + { + "epoch": 5.716697360343769, + "grad_norm": 0.2120666354894638, + "learning_rate": 4.088064847616113e-05, + "loss": 1.7235, + "step": 18625 + }, + { + "epoch": 5.717004297114794, + "grad_norm": 0.19663050770759583, + "learning_rate": 4.0875761333487685e-05, + "loss": 1.6743, + "step": 18626 + }, + { + "epoch": 5.71731123388582, + "grad_norm": 0.24010685086250305, + "learning_rate": 4.0870874280985295e-05, + "loss": 1.6742, + "step": 18627 + }, + { + "epoch": 5.717618170656845, + "grad_norm": 0.22140294313430786, + "learning_rate": 4.086598731870228e-05, + "loss": 1.7601, + "step": 18628 + }, + { + "epoch": 5.71792510742787, + "grad_norm": 0.2876693308353424, + "learning_rate": 4.086110044668694e-05, + "loss": 1.7601, + "step": 18629 + }, + { + "epoch": 5.718232044198895, + "grad_norm": 0.3103853464126587, + "learning_rate": 4.085621366498756e-05, + "loss": 1.6824, + "step": 18630 + }, + { + "epoch": 5.71853898096992, + "grad_norm": 0.18194396793842316, + "learning_rate": 4.0851326973652424e-05, + "loss": 1.6976, + "step": 18631 + }, + { + "epoch": 5.718845917740945, + "grad_norm": 0.28400903940200806, + "learning_rate": 4.0846440372729854e-05, + "loss": 1.7352, + "step": 18632 + }, + { + "epoch": 5.719152854511971, + "grad_norm": 0.23753583431243896, + "learning_rate": 4.084155386226811e-05, + "loss": 1.7418, + "step": 18633 + }, + { + "epoch": 5.719459791282996, + "grad_norm": 0.215620756149292, + "learning_rate": 4.0836667442315514e-05, + "loss": 1.7602, + "step": 18634 + }, + { + "epoch": 5.7197667280540205, + "grad_norm": 0.21057941019535065, + "learning_rate": 4.083178111292034e-05, + "loss": 1.6818, + "step": 18635 + }, + { + "epoch": 5.720073664825046, + "grad_norm": 0.2169445902109146, + 
"learning_rate": 4.0826894874130863e-05, + "loss": 1.7942, + "step": 18636 + }, + { + "epoch": 5.720380601596071, + "grad_norm": 0.2779453992843628, + "learning_rate": 4.082200872599541e-05, + "loss": 1.7432, + "step": 18637 + }, + { + "epoch": 5.7206875383670965, + "grad_norm": 0.22556698322296143, + "learning_rate": 4.0817122668562224e-05, + "loss": 1.7748, + "step": 18638 + }, + { + "epoch": 5.720994475138122, + "grad_norm": 0.2570365071296692, + "learning_rate": 4.081223670187962e-05, + "loss": 1.7314, + "step": 18639 + }, + { + "epoch": 5.721301411909147, + "grad_norm": 0.266176700592041, + "learning_rate": 4.080735082599588e-05, + "loss": 1.689, + "step": 18640 + }, + { + "epoch": 5.721608348680172, + "grad_norm": 0.20190037786960602, + "learning_rate": 4.080246504095929e-05, + "loss": 1.7467, + "step": 18641 + }, + { + "epoch": 5.721915285451197, + "grad_norm": 0.2498215138912201, + "learning_rate": 4.079757934681813e-05, + "loss": 1.7063, + "step": 18642 + }, + { + "epoch": 5.722222222222222, + "grad_norm": 0.25594204664230347, + "learning_rate": 4.0792693743620695e-05, + "loss": 1.7096, + "step": 18643 + }, + { + "epoch": 5.722529158993248, + "grad_norm": 0.22674626111984253, + "learning_rate": 4.0787808231415233e-05, + "loss": 1.715, + "step": 18644 + }, + { + "epoch": 5.722836095764272, + "grad_norm": 0.267140656709671, + "learning_rate": 4.078292281025007e-05, + "loss": 1.7747, + "step": 18645 + }, + { + "epoch": 5.723143032535297, + "grad_norm": 0.21161147952079773, + "learning_rate": 4.077803748017345e-05, + "loss": 1.7312, + "step": 18646 + }, + { + "epoch": 5.723449969306323, + "grad_norm": 0.2580260634422302, + "learning_rate": 4.077315224123368e-05, + "loss": 1.7246, + "step": 18647 + }, + { + "epoch": 5.723756906077348, + "grad_norm": 0.23766927421092987, + "learning_rate": 4.076826709347902e-05, + "loss": 1.7147, + "step": 18648 + }, + { + "epoch": 5.724063842848373, + "grad_norm": 0.22764286398887634, + "learning_rate": 4.076338203695776e-05, + 
"loss": 1.7034, + "step": 18649 + }, + { + "epoch": 5.724370779619399, + "grad_norm": 0.28205159306526184, + "learning_rate": 4.075849707171817e-05, + "loss": 1.7472, + "step": 18650 + }, + { + "epoch": 5.724677716390423, + "grad_norm": 0.2091183066368103, + "learning_rate": 4.075361219780854e-05, + "loss": 1.7693, + "step": 18651 + }, + { + "epoch": 5.7249846531614486, + "grad_norm": 0.29513829946517944, + "learning_rate": 4.074872741527713e-05, + "loss": 1.7286, + "step": 18652 + }, + { + "epoch": 5.725291589932474, + "grad_norm": 0.226357102394104, + "learning_rate": 4.07438427241722e-05, + "loss": 1.7658, + "step": 18653 + }, + { + "epoch": 5.725598526703499, + "grad_norm": 0.23732580244541168, + "learning_rate": 4.073895812454207e-05, + "loss": 1.7591, + "step": 18654 + }, + { + "epoch": 5.725905463474525, + "grad_norm": 0.2835488021373749, + "learning_rate": 4.0734073616434956e-05, + "loss": 1.757, + "step": 18655 + }, + { + "epoch": 5.726212400245549, + "grad_norm": 0.1986306756734848, + "learning_rate": 4.0729189199899186e-05, + "loss": 1.714, + "step": 18656 + }, + { + "epoch": 5.726519337016574, + "grad_norm": 0.25071820616722107, + "learning_rate": 4.072430487498298e-05, + "loss": 1.7334, + "step": 18657 + }, + { + "epoch": 5.7268262737876, + "grad_norm": 0.19989889860153198, + "learning_rate": 4.0719420641734634e-05, + "loss": 1.7472, + "step": 18658 + }, + { + "epoch": 5.727133210558625, + "grad_norm": 0.30006101727485657, + "learning_rate": 4.071453650020241e-05, + "loss": 1.7846, + "step": 18659 + }, + { + "epoch": 5.72744014732965, + "grad_norm": 0.19856922328472137, + "learning_rate": 4.070965245043459e-05, + "loss": 1.6965, + "step": 18660 + }, + { + "epoch": 5.727747084100676, + "grad_norm": 0.20139823853969574, + "learning_rate": 4.070476849247941e-05, + "loss": 1.7265, + "step": 18661 + }, + { + "epoch": 5.7280540208717, + "grad_norm": 0.21507953107357025, + "learning_rate": 4.0699884626385184e-05, + "loss": 1.762, + "step": 18662 + }, + { + 
"epoch": 5.7283609576427255, + "grad_norm": 0.1885843127965927, + "learning_rate": 4.069500085220013e-05, + "loss": 1.6721, + "step": 18663 + }, + { + "epoch": 5.728667894413751, + "grad_norm": 0.2076897919178009, + "learning_rate": 4.069011716997253e-05, + "loss": 1.7399, + "step": 18664 + }, + { + "epoch": 5.728974831184776, + "grad_norm": 0.21482045948505402, + "learning_rate": 4.068523357975065e-05, + "loss": 1.7105, + "step": 18665 + }, + { + "epoch": 5.7292817679558015, + "grad_norm": 0.20438800752162933, + "learning_rate": 4.0680350081582765e-05, + "loss": 1.7408, + "step": 18666 + }, + { + "epoch": 5.729588704726826, + "grad_norm": 0.2137845903635025, + "learning_rate": 4.0675466675517104e-05, + "loss": 1.7814, + "step": 18667 + }, + { + "epoch": 5.729895641497851, + "grad_norm": 0.23009657859802246, + "learning_rate": 4.067058336160197e-05, + "loss": 1.7311, + "step": 18668 + }, + { + "epoch": 5.730202578268877, + "grad_norm": 0.20602397620677948, + "learning_rate": 4.066570013988558e-05, + "loss": 1.741, + "step": 18669 + }, + { + "epoch": 5.730509515039902, + "grad_norm": 0.24884814023971558, + "learning_rate": 4.066081701041621e-05, + "loss": 1.7222, + "step": 18670 + }, + { + "epoch": 5.730816451810927, + "grad_norm": 0.17906342446804047, + "learning_rate": 4.065593397324214e-05, + "loss": 1.6879, + "step": 18671 + }, + { + "epoch": 5.731123388581953, + "grad_norm": 0.20345427095890045, + "learning_rate": 4.0651051028411586e-05, + "loss": 1.7713, + "step": 18672 + }, + { + "epoch": 5.731430325352977, + "grad_norm": 0.21115002036094666, + "learning_rate": 4.0646168175972846e-05, + "loss": 1.7666, + "step": 18673 + }, + { + "epoch": 5.731737262124002, + "grad_norm": 0.22189734876155853, + "learning_rate": 4.064128541597413e-05, + "loss": 1.6989, + "step": 18674 + }, + { + "epoch": 5.732044198895028, + "grad_norm": 0.24036027491092682, + "learning_rate": 4.063640274846373e-05, + "loss": 1.707, + "step": 18675 + }, + { + "epoch": 5.732351135666053, + 
"grad_norm": 0.23091022670269012, + "learning_rate": 4.063152017348988e-05, + "loss": 1.7072, + "step": 18676 + }, + { + "epoch": 5.7326580724370775, + "grad_norm": 0.3142668306827545, + "learning_rate": 4.062663769110085e-05, + "loss": 1.7641, + "step": 18677 + }, + { + "epoch": 5.732965009208103, + "grad_norm": 0.2634848356246948, + "learning_rate": 4.0621755301344875e-05, + "loss": 1.7007, + "step": 18678 + }, + { + "epoch": 5.733271945979128, + "grad_norm": 0.21296904981136322, + "learning_rate": 4.061687300427022e-05, + "loss": 1.7201, + "step": 18679 + }, + { + "epoch": 5.7335788827501535, + "grad_norm": 0.24943144619464874, + "learning_rate": 4.0611990799925104e-05, + "loss": 1.7186, + "step": 18680 + }, + { + "epoch": 5.733885819521179, + "grad_norm": 0.2574152946472168, + "learning_rate": 4.060710868835781e-05, + "loss": 1.8671, + "step": 18681 + }, + { + "epoch": 5.734192756292204, + "grad_norm": 0.26023826003074646, + "learning_rate": 4.0602226669616564e-05, + "loss": 1.7618, + "step": 18682 + }, + { + "epoch": 5.734499693063229, + "grad_norm": 0.21078336238861084, + "learning_rate": 4.0597344743749645e-05, + "loss": 1.7548, + "step": 18683 + }, + { + "epoch": 5.734806629834254, + "grad_norm": 0.2195056676864624, + "learning_rate": 4.059246291080525e-05, + "loss": 1.6843, + "step": 18684 + }, + { + "epoch": 5.735113566605279, + "grad_norm": 0.20719893276691437, + "learning_rate": 4.058758117083168e-05, + "loss": 1.692, + "step": 18685 + }, + { + "epoch": 5.735420503376305, + "grad_norm": 0.23012077808380127, + "learning_rate": 4.058269952387713e-05, + "loss": 1.7072, + "step": 18686 + }, + { + "epoch": 5.73572744014733, + "grad_norm": 0.18598411977291107, + "learning_rate": 4.057781796998986e-05, + "loss": 1.6983, + "step": 18687 + }, + { + "epoch": 5.736034376918354, + "grad_norm": 0.20211926102638245, + "learning_rate": 4.057293650921813e-05, + "loss": 1.6818, + "step": 18688 + }, + { + "epoch": 5.73634131368938, + "grad_norm": 0.1957080215215683, + 
"learning_rate": 4.056805514161015e-05, + "loss": 1.7154, + "step": 18689 + }, + { + "epoch": 5.736648250460405, + "grad_norm": 0.23581798374652863, + "learning_rate": 4.0563173867214196e-05, + "loss": 1.7724, + "step": 18690 + }, + { + "epoch": 5.73695518723143, + "grad_norm": 0.22706671059131622, + "learning_rate": 4.055829268607847e-05, + "loss": 1.7387, + "step": 18691 + }, + { + "epoch": 5.737262124002456, + "grad_norm": 0.20050427317619324, + "learning_rate": 4.055341159825124e-05, + "loss": 1.7585, + "step": 18692 + }, + { + "epoch": 5.737569060773481, + "grad_norm": 0.18666231632232666, + "learning_rate": 4.054853060378072e-05, + "loss": 1.6996, + "step": 18693 + }, + { + "epoch": 5.7378759975445055, + "grad_norm": 0.23018911480903625, + "learning_rate": 4.0543649702715186e-05, + "loss": 1.7167, + "step": 18694 + }, + { + "epoch": 5.738182934315531, + "grad_norm": 0.21207039058208466, + "learning_rate": 4.053876889510282e-05, + "loss": 1.7539, + "step": 18695 + }, + { + "epoch": 5.738489871086556, + "grad_norm": 0.22042523324489594, + "learning_rate": 4.0533888180991915e-05, + "loss": 1.8145, + "step": 18696 + }, + { + "epoch": 5.7387968078575815, + "grad_norm": 0.20705139636993408, + "learning_rate": 4.0529007560430646e-05, + "loss": 1.7612, + "step": 18697 + }, + { + "epoch": 5.739103744628607, + "grad_norm": 0.20673857629299164, + "learning_rate": 4.052412703346729e-05, + "loss": 1.7338, + "step": 18698 + }, + { + "epoch": 5.739410681399631, + "grad_norm": 0.20742641389369965, + "learning_rate": 4.051924660015005e-05, + "loss": 1.7497, + "step": 18699 + }, + { + "epoch": 5.739717618170657, + "grad_norm": 0.22352617979049683, + "learning_rate": 4.05143662605272e-05, + "loss": 1.7568, + "step": 18700 + }, + { + "epoch": 5.740024554941682, + "grad_norm": 0.20306691527366638, + "learning_rate": 4.050948601464692e-05, + "loss": 1.7416, + "step": 18701 + }, + { + "epoch": 5.740331491712707, + "grad_norm": 0.22972522675991058, + "learning_rate": 
4.050460586255748e-05, + "loss": 1.7907, + "step": 18702 + }, + { + "epoch": 5.740638428483733, + "grad_norm": 0.2056068629026413, + "learning_rate": 4.0499725804307084e-05, + "loss": 1.7584, + "step": 18703 + }, + { + "epoch": 5.740945365254758, + "grad_norm": 0.2150508463382721, + "learning_rate": 4.049484583994395e-05, + "loss": 1.7695, + "step": 18704 + }, + { + "epoch": 5.741252302025782, + "grad_norm": 0.20274797081947327, + "learning_rate": 4.048996596951634e-05, + "loss": 1.7398, + "step": 18705 + }, + { + "epoch": 5.741559238796808, + "grad_norm": 0.20521290600299835, + "learning_rate": 4.0485086193072444e-05, + "loss": 1.7529, + "step": 18706 + }, + { + "epoch": 5.741866175567833, + "grad_norm": 0.22344307601451874, + "learning_rate": 4.0480206510660527e-05, + "loss": 1.6729, + "step": 18707 + }, + { + "epoch": 5.742173112338858, + "grad_norm": 0.20007841289043427, + "learning_rate": 4.047532692232876e-05, + "loss": 1.7004, + "step": 18708 + }, + { + "epoch": 5.742480049109884, + "grad_norm": 0.2455853819847107, + "learning_rate": 4.047044742812541e-05, + "loss": 1.7324, + "step": 18709 + }, + { + "epoch": 5.742786985880908, + "grad_norm": 0.29901546239852905, + "learning_rate": 4.046556802809867e-05, + "loss": 1.7138, + "step": 18710 + }, + { + "epoch": 5.7430939226519335, + "grad_norm": 0.19636842608451843, + "learning_rate": 4.04606887222968e-05, + "loss": 1.7098, + "step": 18711 + }, + { + "epoch": 5.743400859422959, + "grad_norm": 0.24916070699691772, + "learning_rate": 4.045580951076797e-05, + "loss": 1.7073, + "step": 18712 + }, + { + "epoch": 5.743707796193984, + "grad_norm": 0.2122841477394104, + "learning_rate": 4.0450930393560453e-05, + "loss": 1.7608, + "step": 18713 + }, + { + "epoch": 5.7440147329650095, + "grad_norm": 0.25119176506996155, + "learning_rate": 4.044605137072241e-05, + "loss": 1.7528, + "step": 18714 + }, + { + "epoch": 5.744321669736035, + "grad_norm": 0.2128097116947174, + "learning_rate": 4.0441172442302104e-05, + "loss": 
1.6834, + "step": 18715 + }, + { + "epoch": 5.744628606507059, + "grad_norm": 0.1771443784236908, + "learning_rate": 4.043629360834772e-05, + "loss": 1.6699, + "step": 18716 + }, + { + "epoch": 5.744935543278085, + "grad_norm": 0.2360549122095108, + "learning_rate": 4.043141486890751e-05, + "loss": 1.7704, + "step": 18717 + }, + { + "epoch": 5.74524248004911, + "grad_norm": 0.22453519701957703, + "learning_rate": 4.0426536224029645e-05, + "loss": 1.7305, + "step": 18718 + }, + { + "epoch": 5.745549416820135, + "grad_norm": 0.2170165628194809, + "learning_rate": 4.042165767376238e-05, + "loss": 1.7859, + "step": 18719 + }, + { + "epoch": 5.74585635359116, + "grad_norm": 0.233921617269516, + "learning_rate": 4.0416779218153896e-05, + "loss": 1.7622, + "step": 18720 + }, + { + "epoch": 5.746163290362185, + "grad_norm": 0.2698482871055603, + "learning_rate": 4.041190085725242e-05, + "loss": 1.7419, + "step": 18721 + }, + { + "epoch": 5.74647022713321, + "grad_norm": 0.28437280654907227, + "learning_rate": 4.0407022591106165e-05, + "loss": 1.7242, + "step": 18722 + }, + { + "epoch": 5.746777163904236, + "grad_norm": 0.2087356448173523, + "learning_rate": 4.040214441976332e-05, + "loss": 1.747, + "step": 18723 + }, + { + "epoch": 5.747084100675261, + "grad_norm": 0.2028181403875351, + "learning_rate": 4.039726634327213e-05, + "loss": 1.7843, + "step": 18724 + }, + { + "epoch": 5.747391037446286, + "grad_norm": 0.18513897061347961, + "learning_rate": 4.039238836168076e-05, + "loss": 1.692, + "step": 18725 + }, + { + "epoch": 5.747697974217311, + "grad_norm": 0.2308989316225052, + "learning_rate": 4.038751047503745e-05, + "loss": 1.6625, + "step": 18726 + }, + { + "epoch": 5.748004910988336, + "grad_norm": 0.23922030627727509, + "learning_rate": 4.0382632683390386e-05, + "loss": 1.7407, + "step": 18727 + }, + { + "epoch": 5.7483118477593615, + "grad_norm": 0.17225340008735657, + "learning_rate": 4.0377754986787806e-05, + "loss": 1.6888, + "step": 18728 + }, + { + "epoch": 
5.748618784530387, + "grad_norm": 0.1898551732301712, + "learning_rate": 4.037287738527786e-05, + "loss": 1.6931, + "step": 18729 + }, + { + "epoch": 5.748925721301412, + "grad_norm": 0.22900012135505676, + "learning_rate": 4.036799987890881e-05, + "loss": 1.751, + "step": 18730 + }, + { + "epoch": 5.749232658072437, + "grad_norm": 0.21106193959712982, + "learning_rate": 4.0363122467728815e-05, + "loss": 1.6919, + "step": 18731 + }, + { + "epoch": 5.749539594843462, + "grad_norm": 0.19944290816783905, + "learning_rate": 4.03582451517861e-05, + "loss": 1.7232, + "step": 18732 + }, + { + "epoch": 5.749846531614487, + "grad_norm": 0.1833256036043167, + "learning_rate": 4.035336793112885e-05, + "loss": 1.7199, + "step": 18733 + }, + { + "epoch": 5.750153468385513, + "grad_norm": 0.2596902847290039, + "learning_rate": 4.0348490805805287e-05, + "loss": 1.7386, + "step": 18734 + }, + { + "epoch": 5.750460405156538, + "grad_norm": 0.23708637058734894, + "learning_rate": 4.034361377586357e-05, + "loss": 1.7697, + "step": 18735 + }, + { + "epoch": 5.750767341927563, + "grad_norm": 0.20476554334163666, + "learning_rate": 4.033873684135195e-05, + "loss": 1.7804, + "step": 18736 + }, + { + "epoch": 5.751074278698588, + "grad_norm": 0.2625868320465088, + "learning_rate": 4.033386000231858e-05, + "loss": 1.7046, + "step": 18737 + }, + { + "epoch": 5.751381215469613, + "grad_norm": 0.23011820018291473, + "learning_rate": 4.032898325881166e-05, + "loss": 1.7758, + "step": 18738 + }, + { + "epoch": 5.7516881522406385, + "grad_norm": 0.23972748219966888, + "learning_rate": 4.032410661087943e-05, + "loss": 1.7165, + "step": 18739 + }, + { + "epoch": 5.751995089011664, + "grad_norm": 0.2241208404302597, + "learning_rate": 4.031923005857001e-05, + "loss": 1.713, + "step": 18740 + }, + { + "epoch": 5.752302025782689, + "grad_norm": 0.22316952049732208, + "learning_rate": 4.0314353601931665e-05, + "loss": 1.7655, + "step": 18741 + }, + { + "epoch": 5.752608962553714, + "grad_norm": 
0.2177707403898239, + "learning_rate": 4.030947724101253e-05, + "loss": 1.7517, + "step": 18742 + }, + { + "epoch": 5.752915899324739, + "grad_norm": 0.21731823682785034, + "learning_rate": 4.030460097586083e-05, + "loss": 1.718, + "step": 18743 + }, + { + "epoch": 5.753222836095764, + "grad_norm": 0.1700165718793869, + "learning_rate": 4.0299724806524744e-05, + "loss": 1.6536, + "step": 18744 + }, + { + "epoch": 5.75352977286679, + "grad_norm": 0.21920062601566315, + "learning_rate": 4.029484873305247e-05, + "loss": 1.7298, + "step": 18745 + }, + { + "epoch": 5.753836709637815, + "grad_norm": 0.22648905217647552, + "learning_rate": 4.028997275549218e-05, + "loss": 1.7878, + "step": 18746 + }, + { + "epoch": 5.75414364640884, + "grad_norm": 0.19443005323410034, + "learning_rate": 4.028509687389208e-05, + "loss": 1.7582, + "step": 18747 + }, + { + "epoch": 5.754450583179865, + "grad_norm": 0.21973860263824463, + "learning_rate": 4.028022108830034e-05, + "loss": 1.8215, + "step": 18748 + }, + { + "epoch": 5.75475751995089, + "grad_norm": 0.2215481847524643, + "learning_rate": 4.0275345398765155e-05, + "loss": 1.7092, + "step": 18749 + }, + { + "epoch": 5.755064456721915, + "grad_norm": 0.18789733946323395, + "learning_rate": 4.0270469805334696e-05, + "loss": 1.7089, + "step": 18750 + }, + { + "epoch": 5.755371393492941, + "grad_norm": 0.2423657774925232, + "learning_rate": 4.0265594308057175e-05, + "loss": 1.7412, + "step": 18751 + }, + { + "epoch": 5.755678330263965, + "grad_norm": 0.22020475566387177, + "learning_rate": 4.026071890698074e-05, + "loss": 1.7644, + "step": 18752 + }, + { + "epoch": 5.7559852670349905, + "grad_norm": 0.31772032380104065, + "learning_rate": 4.025584360215361e-05, + "loss": 1.7326, + "step": 18753 + }, + { + "epoch": 5.756292203806016, + "grad_norm": 0.23786257207393646, + "learning_rate": 4.025096839362393e-05, + "loss": 1.7652, + "step": 18754 + }, + { + "epoch": 5.756599140577041, + "grad_norm": 0.24288083612918854, + "learning_rate": 
4.024609328143989e-05, + "loss": 1.6797, + "step": 18755 + }, + { + "epoch": 5.7569060773480665, + "grad_norm": 0.30519670248031616, + "learning_rate": 4.024121826564969e-05, + "loss": 1.7442, + "step": 18756 + }, + { + "epoch": 5.757213014119092, + "grad_norm": 0.218281090259552, + "learning_rate": 4.023634334630147e-05, + "loss": 1.7498, + "step": 18757 + }, + { + "epoch": 5.757519950890116, + "grad_norm": 0.215846985578537, + "learning_rate": 4.023146852344345e-05, + "loss": 1.7728, + "step": 18758 + }, + { + "epoch": 5.757826887661142, + "grad_norm": 0.2883944511413574, + "learning_rate": 4.022659379712376e-05, + "loss": 1.8098, + "step": 18759 + }, + { + "epoch": 5.758133824432167, + "grad_norm": 0.25141629576683044, + "learning_rate": 4.022171916739062e-05, + "loss": 1.6574, + "step": 18760 + }, + { + "epoch": 5.758440761203192, + "grad_norm": 0.22118757665157318, + "learning_rate": 4.021684463429216e-05, + "loss": 1.7542, + "step": 18761 + }, + { + "epoch": 5.758747697974218, + "grad_norm": 0.2437646985054016, + "learning_rate": 4.02119701978766e-05, + "loss": 1.7182, + "step": 18762 + }, + { + "epoch": 5.759054634745242, + "grad_norm": 0.24247203767299652, + "learning_rate": 4.020709585819206e-05, + "loss": 1.7134, + "step": 18763 + }, + { + "epoch": 5.759361571516267, + "grad_norm": 0.208528533577919, + "learning_rate": 4.020222161528677e-05, + "loss": 1.6966, + "step": 18764 + }, + { + "epoch": 5.759668508287293, + "grad_norm": 0.19645826518535614, + "learning_rate": 4.0197347469208843e-05, + "loss": 1.7261, + "step": 18765 + }, + { + "epoch": 5.759975445058318, + "grad_norm": 0.20066291093826294, + "learning_rate": 4.019247342000648e-05, + "loss": 1.7197, + "step": 18766 + }, + { + "epoch": 5.760282381829343, + "grad_norm": 0.25344669818878174, + "learning_rate": 4.0187599467727845e-05, + "loss": 1.7957, + "step": 18767 + }, + { + "epoch": 5.760589318600369, + "grad_norm": 0.1917620301246643, + "learning_rate": 4.018272561242111e-05, + "loss": 1.6868, + 
"step": 18768 + }, + { + "epoch": 5.760896255371393, + "grad_norm": 0.21996566653251648, + "learning_rate": 4.0177851854134424e-05, + "loss": 1.7128, + "step": 18769 + }, + { + "epoch": 5.7612031921424185, + "grad_norm": 0.23226283490657806, + "learning_rate": 4.017297819291598e-05, + "loss": 1.7079, + "step": 18770 + }, + { + "epoch": 5.761510128913444, + "grad_norm": 0.30606213212013245, + "learning_rate": 4.016810462881391e-05, + "loss": 1.8087, + "step": 18771 + }, + { + "epoch": 5.761817065684469, + "grad_norm": 0.2171698361635208, + "learning_rate": 4.016323116187639e-05, + "loss": 1.7377, + "step": 18772 + }, + { + "epoch": 5.7621240024554945, + "grad_norm": 0.24234412610530853, + "learning_rate": 4.01583577921516e-05, + "loss": 1.734, + "step": 18773 + }, + { + "epoch": 5.762430939226519, + "grad_norm": 0.2648961544036865, + "learning_rate": 4.015348451968767e-05, + "loss": 1.7423, + "step": 18774 + }, + { + "epoch": 5.762737875997544, + "grad_norm": 0.18316571414470673, + "learning_rate": 4.01486113445328e-05, + "loss": 1.6708, + "step": 18775 + }, + { + "epoch": 5.76304481276857, + "grad_norm": 0.241583451628685, + "learning_rate": 4.0143738266735104e-05, + "loss": 1.708, + "step": 18776 + }, + { + "epoch": 5.763351749539595, + "grad_norm": 0.2268480360507965, + "learning_rate": 4.0138865286342775e-05, + "loss": 1.7106, + "step": 18777 + }, + { + "epoch": 5.76365868631062, + "grad_norm": 0.2038748860359192, + "learning_rate": 4.0133992403403944e-05, + "loss": 1.7349, + "step": 18778 + }, + { + "epoch": 5.763965623081646, + "grad_norm": 0.24422483146190643, + "learning_rate": 4.0129119617966805e-05, + "loss": 1.659, + "step": 18779 + }, + { + "epoch": 5.76427255985267, + "grad_norm": 0.19925715029239655, + "learning_rate": 4.0124246930079476e-05, + "loss": 1.6983, + "step": 18780 + }, + { + "epoch": 5.764579496623695, + "grad_norm": 0.29671359062194824, + "learning_rate": 4.0119374339790136e-05, + "loss": 1.7188, + "step": 18781 + }, + { + "epoch": 
5.764886433394721, + "grad_norm": 0.2752140760421753, + "learning_rate": 4.011450184714692e-05, + "loss": 1.738, + "step": 18782 + }, + { + "epoch": 5.765193370165746, + "grad_norm": 0.2112676352262497, + "learning_rate": 4.0109629452198e-05, + "loss": 1.7529, + "step": 18783 + }, + { + "epoch": 5.765500306936771, + "grad_norm": 0.2091330885887146, + "learning_rate": 4.010475715499151e-05, + "loss": 1.6771, + "step": 18784 + }, + { + "epoch": 5.765807243707796, + "grad_norm": 0.26556238532066345, + "learning_rate": 4.009988495557562e-05, + "loss": 1.7721, + "step": 18785 + }, + { + "epoch": 5.766114180478821, + "grad_norm": 0.20728638768196106, + "learning_rate": 4.009501285399846e-05, + "loss": 1.6893, + "step": 18786 + }, + { + "epoch": 5.7664211172498465, + "grad_norm": 0.213730126619339, + "learning_rate": 4.00901408503082e-05, + "loss": 1.704, + "step": 18787 + }, + { + "epoch": 5.766728054020872, + "grad_norm": 0.21422363817691803, + "learning_rate": 4.0085268944552975e-05, + "loss": 1.7571, + "step": 18788 + }, + { + "epoch": 5.767034990791897, + "grad_norm": 0.20936815440654755, + "learning_rate": 4.0080397136780915e-05, + "loss": 1.7423, + "step": 18789 + }, + { + "epoch": 5.7673419275629225, + "grad_norm": 0.26223674416542053, + "learning_rate": 4.007552542704021e-05, + "loss": 1.7687, + "step": 18790 + }, + { + "epoch": 5.767648864333947, + "grad_norm": 0.3524645268917084, + "learning_rate": 4.0070653815378954e-05, + "loss": 1.7754, + "step": 18791 + }, + { + "epoch": 5.767955801104972, + "grad_norm": 0.20238324999809265, + "learning_rate": 4.006578230184534e-05, + "loss": 1.7043, + "step": 18792 + }, + { + "epoch": 5.768262737875998, + "grad_norm": 0.2739984393119812, + "learning_rate": 4.006091088648747e-05, + "loss": 1.7596, + "step": 18793 + }, + { + "epoch": 5.768569674647023, + "grad_norm": 0.29209306836128235, + "learning_rate": 4.0056039569353515e-05, + "loss": 1.6857, + "step": 18794 + }, + { + "epoch": 5.768876611418047, + "grad_norm": 
0.21838447451591492, + "learning_rate": 4.005116835049161e-05, + "loss": 1.7531, + "step": 18795 + }, + { + "epoch": 5.769183548189073, + "grad_norm": 0.21940091252326965, + "learning_rate": 4.0046297229949884e-05, + "loss": 1.7363, + "step": 18796 + }, + { + "epoch": 5.769490484960098, + "grad_norm": 0.22679758071899414, + "learning_rate": 4.004142620777647e-05, + "loss": 1.7586, + "step": 18797 + }, + { + "epoch": 5.769797421731123, + "grad_norm": 0.23782022297382355, + "learning_rate": 4.003655528401954e-05, + "loss": 1.7154, + "step": 18798 + }, + { + "epoch": 5.770104358502149, + "grad_norm": 0.20452092587947845, + "learning_rate": 4.0031684458727194e-05, + "loss": 1.7078, + "step": 18799 + }, + { + "epoch": 5.770411295273174, + "grad_norm": 0.22733618319034576, + "learning_rate": 4.0026813731947594e-05, + "loss": 1.6989, + "step": 18800 + }, + { + "epoch": 5.7707182320441985, + "grad_norm": 0.2322154939174652, + "learning_rate": 4.002194310372886e-05, + "loss": 1.7508, + "step": 18801 + }, + { + "epoch": 5.771025168815224, + "grad_norm": 0.24573352932929993, + "learning_rate": 4.001707257411914e-05, + "loss": 1.7245, + "step": 18802 + }, + { + "epoch": 5.771332105586249, + "grad_norm": 0.19692079722881317, + "learning_rate": 4.001220214316655e-05, + "loss": 1.7116, + "step": 18803 + }, + { + "epoch": 5.7716390423572745, + "grad_norm": 0.20525199174880981, + "learning_rate": 4.000733181091925e-05, + "loss": 1.7503, + "step": 18804 + }, + { + "epoch": 5.7719459791283, + "grad_norm": 0.2097626030445099, + "learning_rate": 4.0002461577425344e-05, + "loss": 1.8204, + "step": 18805 + }, + { + "epoch": 5.772252915899324, + "grad_norm": 0.23059608042240143, + "learning_rate": 3.9997591442732975e-05, + "loss": 1.7747, + "step": 18806 + }, + { + "epoch": 5.77255985267035, + "grad_norm": 0.22085745632648468, + "learning_rate": 3.9992721406890265e-05, + "loss": 1.7579, + "step": 18807 + }, + { + "epoch": 5.772866789441375, + "grad_norm": 0.21529869735240936, + 
"learning_rate": 3.9987851469945334e-05, + "loss": 1.711, + "step": 18808 + }, + { + "epoch": 5.7731737262124, + "grad_norm": 0.20563572645187378, + "learning_rate": 3.998298163194636e-05, + "loss": 1.761, + "step": 18809 + }, + { + "epoch": 5.773480662983426, + "grad_norm": 0.2081122100353241, + "learning_rate": 3.9978111892941394e-05, + "loss": 1.7112, + "step": 18810 + }, + { + "epoch": 5.773787599754451, + "grad_norm": 0.2373751550912857, + "learning_rate": 3.9973242252978635e-05, + "loss": 1.7726, + "step": 18811 + }, + { + "epoch": 5.774094536525475, + "grad_norm": 0.2742944359779358, + "learning_rate": 3.996837271210615e-05, + "loss": 1.7743, + "step": 18812 + }, + { + "epoch": 5.774401473296501, + "grad_norm": 0.20724992454051971, + "learning_rate": 3.996350327037208e-05, + "loss": 1.7052, + "step": 18813 + }, + { + "epoch": 5.774708410067526, + "grad_norm": 0.22324968874454498, + "learning_rate": 3.995863392782456e-05, + "loss": 1.7865, + "step": 18814 + }, + { + "epoch": 5.7750153468385514, + "grad_norm": 0.22314245998859406, + "learning_rate": 3.995376468451172e-05, + "loss": 1.7705, + "step": 18815 + }, + { + "epoch": 5.775322283609577, + "grad_norm": 0.20793841779232025, + "learning_rate": 3.994889554048165e-05, + "loss": 1.739, + "step": 18816 + }, + { + "epoch": 5.775629220380601, + "grad_norm": 0.20117145776748657, + "learning_rate": 3.994402649578249e-05, + "loss": 1.7256, + "step": 18817 + }, + { + "epoch": 5.775936157151627, + "grad_norm": 0.24406170845031738, + "learning_rate": 3.993915755046235e-05, + "loss": 1.8015, + "step": 18818 + }, + { + "epoch": 5.776243093922652, + "grad_norm": 0.20912545919418335, + "learning_rate": 3.993428870456935e-05, + "loss": 1.7038, + "step": 18819 + }, + { + "epoch": 5.776550030693677, + "grad_norm": 0.2587272822856903, + "learning_rate": 3.992941995815162e-05, + "loss": 1.7918, + "step": 18820 + }, + { + "epoch": 5.776856967464703, + "grad_norm": 0.2996658980846405, + "learning_rate": 3.9924551311257266e-05, + 
"loss": 1.7513, + "step": 18821 + }, + { + "epoch": 5.777163904235728, + "grad_norm": 0.24603547155857086, + "learning_rate": 3.991968276393441e-05, + "loss": 1.7329, + "step": 18822 + }, + { + "epoch": 5.777470841006752, + "grad_norm": 0.2321038693189621, + "learning_rate": 3.991481431623113e-05, + "loss": 1.7406, + "step": 18823 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.3397100269794464, + "learning_rate": 3.990994596819558e-05, + "loss": 1.8129, + "step": 18824 + }, + { + "epoch": 5.778084714548803, + "grad_norm": 0.2807735800743103, + "learning_rate": 3.990507771987584e-05, + "loss": 1.7579, + "step": 18825 + }, + { + "epoch": 5.778391651319828, + "grad_norm": 0.1952899694442749, + "learning_rate": 3.990020957132007e-05, + "loss": 1.7153, + "step": 18826 + }, + { + "epoch": 5.778698588090853, + "grad_norm": 0.28998714685440063, + "learning_rate": 3.989534152257632e-05, + "loss": 1.7844, + "step": 18827 + }, + { + "epoch": 5.779005524861878, + "grad_norm": 0.20929136872291565, + "learning_rate": 3.989047357369275e-05, + "loss": 1.7499, + "step": 18828 + }, + { + "epoch": 5.7793124616329035, + "grad_norm": 0.31144043803215027, + "learning_rate": 3.9885605724717436e-05, + "loss": 1.7745, + "step": 18829 + }, + { + "epoch": 5.779619398403929, + "grad_norm": 0.22598792612552643, + "learning_rate": 3.988073797569849e-05, + "loss": 1.7226, + "step": 18830 + }, + { + "epoch": 5.779926335174954, + "grad_norm": 0.1971752643585205, + "learning_rate": 3.987587032668402e-05, + "loss": 1.7033, + "step": 18831 + }, + { + "epoch": 5.7802332719459795, + "grad_norm": 0.221087247133255, + "learning_rate": 3.9871002777722156e-05, + "loss": 1.7281, + "step": 18832 + }, + { + "epoch": 5.780540208717004, + "grad_norm": 0.21678583323955536, + "learning_rate": 3.986613532886095e-05, + "loss": 1.7207, + "step": 18833 + }, + { + "epoch": 5.780847145488029, + "grad_norm": 0.2511122226715088, + "learning_rate": 3.9861267980148566e-05, + "loss": 1.7091, + "step": 18834 + }, + 
{ + "epoch": 5.781154082259055, + "grad_norm": 0.2883855104446411, + "learning_rate": 3.985640073163304e-05, + "loss": 1.7963, + "step": 18835 + }, + { + "epoch": 5.78146101903008, + "grad_norm": 0.21786242723464966, + "learning_rate": 3.985153358336253e-05, + "loss": 1.6883, + "step": 18836 + }, + { + "epoch": 5.781767955801105, + "grad_norm": 0.18529155850410461, + "learning_rate": 3.98466665353851e-05, + "loss": 1.7194, + "step": 18837 + }, + { + "epoch": 5.78207489257213, + "grad_norm": 0.20535743236541748, + "learning_rate": 3.984179958774888e-05, + "loss": 1.6943, + "step": 18838 + }, + { + "epoch": 5.782381829343155, + "grad_norm": 0.19377392530441284, + "learning_rate": 3.983693274050195e-05, + "loss": 1.6732, + "step": 18839 + }, + { + "epoch": 5.78268876611418, + "grad_norm": 0.22373615205287933, + "learning_rate": 3.983206599369239e-05, + "loss": 1.7668, + "step": 18840 + }, + { + "epoch": 5.782995702885206, + "grad_norm": 0.2132388800382614, + "learning_rate": 3.982719934736832e-05, + "loss": 1.7155, + "step": 18841 + }, + { + "epoch": 5.783302639656231, + "grad_norm": 0.24871744215488434, + "learning_rate": 3.982233280157782e-05, + "loss": 1.7232, + "step": 18842 + }, + { + "epoch": 5.783609576427256, + "grad_norm": 0.1861848086118698, + "learning_rate": 3.981746635636902e-05, + "loss": 1.707, + "step": 18843 + }, + { + "epoch": 5.783916513198281, + "grad_norm": 0.21882779896259308, + "learning_rate": 3.981260001178995e-05, + "loss": 1.7165, + "step": 18844 + }, + { + "epoch": 5.784223449969306, + "grad_norm": 0.22144648432731628, + "learning_rate": 3.980773376788877e-05, + "loss": 1.7799, + "step": 18845 + }, + { + "epoch": 5.7845303867403315, + "grad_norm": 0.210894376039505, + "learning_rate": 3.980286762471351e-05, + "loss": 1.7539, + "step": 18846 + }, + { + "epoch": 5.784837323511357, + "grad_norm": 0.20435640215873718, + "learning_rate": 3.9798001582312305e-05, + "loss": 1.6736, + "step": 18847 + }, + { + "epoch": 5.785144260282382, + 
"grad_norm": 0.18998762965202332, + "learning_rate": 3.979313564073322e-05, + "loss": 1.7045, + "step": 18848 + }, + { + "epoch": 5.785451197053407, + "grad_norm": 0.19869361817836761, + "learning_rate": 3.978826980002437e-05, + "loss": 1.7444, + "step": 18849 + }, + { + "epoch": 5.785758133824432, + "grad_norm": 0.2175174504518509, + "learning_rate": 3.97834040602338e-05, + "loss": 1.7565, + "step": 18850 + }, + { + "epoch": 5.786065070595457, + "grad_norm": 0.22726793587207794, + "learning_rate": 3.977853842140964e-05, + "loss": 1.713, + "step": 18851 + }, + { + "epoch": 5.786372007366483, + "grad_norm": 0.26518720388412476, + "learning_rate": 3.9773672883599934e-05, + "loss": 1.6892, + "step": 18852 + }, + { + "epoch": 5.786678944137508, + "grad_norm": 0.20721858739852905, + "learning_rate": 3.97688074468528e-05, + "loss": 1.724, + "step": 18853 + }, + { + "epoch": 5.786985880908533, + "grad_norm": 0.22739483416080475, + "learning_rate": 3.976394211121629e-05, + "loss": 1.762, + "step": 18854 + }, + { + "epoch": 5.787292817679558, + "grad_norm": 0.21918894350528717, + "learning_rate": 3.975907687673853e-05, + "loss": 1.6812, + "step": 18855 + }, + { + "epoch": 5.787599754450583, + "grad_norm": 0.20931273698806763, + "learning_rate": 3.9754211743467574e-05, + "loss": 1.6874, + "step": 18856 + }, + { + "epoch": 5.787906691221608, + "grad_norm": 0.2015041708946228, + "learning_rate": 3.974934671145148e-05, + "loss": 1.7248, + "step": 18857 + }, + { + "epoch": 5.788213627992634, + "grad_norm": 0.21632663905620575, + "learning_rate": 3.974448178073836e-05, + "loss": 1.7313, + "step": 18858 + }, + { + "epoch": 5.788520564763659, + "grad_norm": 0.18995213508605957, + "learning_rate": 3.973961695137627e-05, + "loss": 1.6761, + "step": 18859 + }, + { + "epoch": 5.7888275015346835, + "grad_norm": 0.18678395450115204, + "learning_rate": 3.973475222341333e-05, + "loss": 1.7082, + "step": 18860 + }, + { + "epoch": 5.789134438305709, + "grad_norm": 0.1889343559741974, + 
"learning_rate": 3.972988759689756e-05, + "loss": 1.7296, + "step": 18861 + }, + { + "epoch": 5.789441375076734, + "grad_norm": 0.20196790993213654, + "learning_rate": 3.9725023071877074e-05, + "loss": 1.6876, + "step": 18862 + }, + { + "epoch": 5.7897483118477595, + "grad_norm": 0.198349729180336, + "learning_rate": 3.972015864839992e-05, + "loss": 1.6826, + "step": 18863 + }, + { + "epoch": 5.790055248618785, + "grad_norm": 0.21323837339878082, + "learning_rate": 3.9715294326514185e-05, + "loss": 1.7444, + "step": 18864 + }, + { + "epoch": 5.79036218538981, + "grad_norm": 0.18581731617450714, + "learning_rate": 3.9710430106267934e-05, + "loss": 1.7731, + "step": 18865 + }, + { + "epoch": 5.790669122160835, + "grad_norm": 0.21925146877765656, + "learning_rate": 3.970556598770927e-05, + "loss": 1.7505, + "step": 18866 + }, + { + "epoch": 5.79097605893186, + "grad_norm": 0.20773115754127502, + "learning_rate": 3.970070197088621e-05, + "loss": 1.7408, + "step": 18867 + }, + { + "epoch": 5.791282995702885, + "grad_norm": 0.1805189698934555, + "learning_rate": 3.9695838055846865e-05, + "loss": 1.6871, + "step": 18868 + }, + { + "epoch": 5.791589932473911, + "grad_norm": 0.24685314297676086, + "learning_rate": 3.969097424263928e-05, + "loss": 1.7186, + "step": 18869 + }, + { + "epoch": 5.791896869244935, + "grad_norm": 0.18801769614219666, + "learning_rate": 3.9686110531311526e-05, + "loss": 1.7196, + "step": 18870 + }, + { + "epoch": 5.79220380601596, + "grad_norm": 0.22717779874801636, + "learning_rate": 3.968124692191168e-05, + "loss": 1.7309, + "step": 18871 + }, + { + "epoch": 5.792510742786986, + "grad_norm": 0.23058642446994781, + "learning_rate": 3.9676383414487806e-05, + "loss": 1.6993, + "step": 18872 + }, + { + "epoch": 5.792817679558011, + "grad_norm": 0.24307532608509064, + "learning_rate": 3.967152000908796e-05, + "loss": 1.6986, + "step": 18873 + }, + { + "epoch": 5.793124616329036, + "grad_norm": 0.3032459318637848, + "learning_rate": 
3.9666656705760195e-05, + "loss": 1.677, + "step": 18874 + }, + { + "epoch": 5.793431553100062, + "grad_norm": 0.22669538855552673, + "learning_rate": 3.966179350455259e-05, + "loss": 1.7361, + "step": 18875 + }, + { + "epoch": 5.793738489871086, + "grad_norm": 0.27729150652885437, + "learning_rate": 3.96569304055132e-05, + "loss": 1.746, + "step": 18876 + }, + { + "epoch": 5.7940454266421115, + "grad_norm": 0.3422098755836487, + "learning_rate": 3.96520674086901e-05, + "loss": 1.783, + "step": 18877 + }, + { + "epoch": 5.794352363413137, + "grad_norm": 0.2114052176475525, + "learning_rate": 3.964720451413131e-05, + "loss": 1.7127, + "step": 18878 + }, + { + "epoch": 5.794659300184162, + "grad_norm": 0.22928549349308014, + "learning_rate": 3.964234172188494e-05, + "loss": 1.6579, + "step": 18879 + }, + { + "epoch": 5.7949662369551875, + "grad_norm": 0.24813635647296906, + "learning_rate": 3.9637479031999e-05, + "loss": 1.728, + "step": 18880 + }, + { + "epoch": 5.795273173726212, + "grad_norm": 0.19779744744300842, + "learning_rate": 3.963261644452158e-05, + "loss": 1.7338, + "step": 18881 + }, + { + "epoch": 5.795580110497237, + "grad_norm": 0.2424263060092926, + "learning_rate": 3.96277539595007e-05, + "loss": 1.7762, + "step": 18882 + }, + { + "epoch": 5.795887047268263, + "grad_norm": 0.24621224403381348, + "learning_rate": 3.9622891576984456e-05, + "loss": 1.7746, + "step": 18883 + }, + { + "epoch": 5.796193984039288, + "grad_norm": 0.1973372846841812, + "learning_rate": 3.961802929702086e-05, + "loss": 1.7243, + "step": 18884 + }, + { + "epoch": 5.796500920810313, + "grad_norm": 0.22170570492744446, + "learning_rate": 3.961316711965801e-05, + "loss": 1.764, + "step": 18885 + }, + { + "epoch": 5.796807857581339, + "grad_norm": 0.22319282591342926, + "learning_rate": 3.9608305044943906e-05, + "loss": 1.6795, + "step": 18886 + }, + { + "epoch": 5.797114794352363, + "grad_norm": 0.20000022649765015, + "learning_rate": 3.9603443072926635e-05, + "loss": 1.7587, + 
"step": 18887 + }, + { + "epoch": 5.797421731123388, + "grad_norm": 0.25041815638542175, + "learning_rate": 3.959858120365424e-05, + "loss": 1.7631, + "step": 18888 + }, + { + "epoch": 5.797728667894414, + "grad_norm": 0.23383729159832, + "learning_rate": 3.959371943717474e-05, + "loss": 1.741, + "step": 18889 + }, + { + "epoch": 5.798035604665439, + "grad_norm": 0.18609663844108582, + "learning_rate": 3.958885777353623e-05, + "loss": 1.6981, + "step": 18890 + }, + { + "epoch": 5.798342541436464, + "grad_norm": 0.29523593187332153, + "learning_rate": 3.9583996212786706e-05, + "loss": 1.8018, + "step": 18891 + }, + { + "epoch": 5.798649478207489, + "grad_norm": 0.20356589555740356, + "learning_rate": 3.9579134754974244e-05, + "loss": 1.7157, + "step": 18892 + }, + { + "epoch": 5.798956414978514, + "grad_norm": 0.2901862561702728, + "learning_rate": 3.957427340014688e-05, + "loss": 1.7249, + "step": 18893 + }, + { + "epoch": 5.7992633517495396, + "grad_norm": 0.24768278002738953, + "learning_rate": 3.956941214835267e-05, + "loss": 1.6894, + "step": 18894 + }, + { + "epoch": 5.799570288520565, + "grad_norm": 0.2417999804019928, + "learning_rate": 3.956455099963962e-05, + "loss": 1.7203, + "step": 18895 + }, + { + "epoch": 5.79987722529159, + "grad_norm": 0.2889639437198639, + "learning_rate": 3.9559689954055814e-05, + "loss": 1.7531, + "step": 18896 + }, + { + "epoch": 5.800184162062616, + "grad_norm": 0.21204611659049988, + "learning_rate": 3.955482901164926e-05, + "loss": 1.7521, + "step": 18897 + }, + { + "epoch": 5.80049109883364, + "grad_norm": 0.2961438298225403, + "learning_rate": 3.954996817246801e-05, + "loss": 1.8102, + "step": 18898 + }, + { + "epoch": 5.800798035604665, + "grad_norm": 0.36562761664390564, + "learning_rate": 3.9545107436560084e-05, + "loss": 1.6722, + "step": 18899 + }, + { + "epoch": 5.801104972375691, + "grad_norm": 0.22423696517944336, + "learning_rate": 3.954024680397357e-05, + "loss": 1.7101, + "step": 18900 + }, + { + "epoch": 
5.801411909146716, + "grad_norm": 0.3122335970401764, + "learning_rate": 3.953538627475644e-05, + "loss": 1.7314, + "step": 18901 + }, + { + "epoch": 5.8017188459177405, + "grad_norm": 0.39004257321357727, + "learning_rate": 3.953052584895677e-05, + "loss": 1.762, + "step": 18902 + }, + { + "epoch": 5.802025782688766, + "grad_norm": 0.1827487200498581, + "learning_rate": 3.952566552662256e-05, + "loss": 1.6935, + "step": 18903 + }, + { + "epoch": 5.802332719459791, + "grad_norm": 0.3025164306163788, + "learning_rate": 3.952080530780188e-05, + "loss": 1.7448, + "step": 18904 + }, + { + "epoch": 5.8026396562308165, + "grad_norm": 0.2313300520181656, + "learning_rate": 3.9515945192542754e-05, + "loss": 1.7686, + "step": 18905 + }, + { + "epoch": 5.802946593001842, + "grad_norm": 0.3501042425632477, + "learning_rate": 3.9511085180893184e-05, + "loss": 1.775, + "step": 18906 + }, + { + "epoch": 5.803253529772867, + "grad_norm": 0.4111124873161316, + "learning_rate": 3.950622527290123e-05, + "loss": 1.7561, + "step": 18907 + }, + { + "epoch": 5.803560466543892, + "grad_norm": 0.20877736806869507, + "learning_rate": 3.950136546861489e-05, + "loss": 1.7356, + "step": 18908 + }, + { + "epoch": 5.803867403314917, + "grad_norm": 0.33404025435447693, + "learning_rate": 3.949650576808222e-05, + "loss": 1.7289, + "step": 18909 + }, + { + "epoch": 5.804174340085942, + "grad_norm": 0.2183927446603775, + "learning_rate": 3.9491646171351234e-05, + "loss": 1.7136, + "step": 18910 + }, + { + "epoch": 5.804481276856968, + "grad_norm": 0.27149543166160583, + "learning_rate": 3.948678667846997e-05, + "loss": 1.7516, + "step": 18911 + }, + { + "epoch": 5.804788213627993, + "grad_norm": 0.2369886338710785, + "learning_rate": 3.948192728948643e-05, + "loss": 1.6767, + "step": 18912 + }, + { + "epoch": 5.805095150399017, + "grad_norm": 0.20671069622039795, + "learning_rate": 3.947706800444867e-05, + "loss": 1.7831, + "step": 18913 + }, + { + "epoch": 5.805402087170043, + "grad_norm": 
0.23622260987758636, + "learning_rate": 3.9472208823404665e-05, + "loss": 1.7121, + "step": 18914 + }, + { + "epoch": 5.805709023941068, + "grad_norm": 0.21099595725536346, + "learning_rate": 3.946734974640247e-05, + "loss": 1.7137, + "step": 18915 + }, + { + "epoch": 5.806015960712093, + "grad_norm": 0.2205580472946167, + "learning_rate": 3.9462490773490094e-05, + "loss": 1.713, + "step": 18916 + }, + { + "epoch": 5.806322897483119, + "grad_norm": 0.20183326303958893, + "learning_rate": 3.9457631904715584e-05, + "loss": 1.7316, + "step": 18917 + }, + { + "epoch": 5.806629834254144, + "grad_norm": 0.27381497621536255, + "learning_rate": 3.9452773140126906e-05, + "loss": 1.7577, + "step": 18918 + }, + { + "epoch": 5.8069367710251685, + "grad_norm": 0.29962384700775146, + "learning_rate": 3.944791447977214e-05, + "loss": 1.7579, + "step": 18919 + }, + { + "epoch": 5.807243707796194, + "grad_norm": 0.22385326027870178, + "learning_rate": 3.944305592369923e-05, + "loss": 1.7795, + "step": 18920 + }, + { + "epoch": 5.807550644567219, + "grad_norm": 0.2954902648925781, + "learning_rate": 3.943819747195625e-05, + "loss": 1.6655, + "step": 18921 + }, + { + "epoch": 5.8078575813382445, + "grad_norm": 0.18947024643421173, + "learning_rate": 3.94333391245912e-05, + "loss": 1.6803, + "step": 18922 + }, + { + "epoch": 5.80816451810927, + "grad_norm": 0.26797959208488464, + "learning_rate": 3.942848088165206e-05, + "loss": 1.7671, + "step": 18923 + }, + { + "epoch": 5.808471454880294, + "grad_norm": 0.23453201353549957, + "learning_rate": 3.94236227431869e-05, + "loss": 1.7472, + "step": 18924 + }, + { + "epoch": 5.80877839165132, + "grad_norm": 0.24471673369407654, + "learning_rate": 3.941876470924367e-05, + "loss": 1.7482, + "step": 18925 + }, + { + "epoch": 5.809085328422345, + "grad_norm": 0.22249098122119904, + "learning_rate": 3.9413906779870426e-05, + "loss": 1.6794, + "step": 18926 + }, + { + "epoch": 5.80939226519337, + "grad_norm": 0.1985001564025879, + 
"learning_rate": 3.9409048955115144e-05, + "loss": 1.7278, + "step": 18927 + }, + { + "epoch": 5.809699201964396, + "grad_norm": 0.22482000291347504, + "learning_rate": 3.940419123502587e-05, + "loss": 1.7658, + "step": 18928 + }, + { + "epoch": 5.810006138735421, + "grad_norm": 0.18513578176498413, + "learning_rate": 3.939933361965057e-05, + "loss": 1.7154, + "step": 18929 + }, + { + "epoch": 5.810313075506445, + "grad_norm": 0.1984710991382599, + "learning_rate": 3.939447610903729e-05, + "loss": 1.7324, + "step": 18930 + }, + { + "epoch": 5.810620012277471, + "grad_norm": 0.26089081168174744, + "learning_rate": 3.938961870323399e-05, + "loss": 1.774, + "step": 18931 + }, + { + "epoch": 5.810926949048496, + "grad_norm": 0.2059585452079773, + "learning_rate": 3.9384761402288706e-05, + "loss": 1.7059, + "step": 18932 + }, + { + "epoch": 5.811233885819521, + "grad_norm": 0.1887979656457901, + "learning_rate": 3.937990420624942e-05, + "loss": 1.6829, + "step": 18933 + }, + { + "epoch": 5.811540822590547, + "grad_norm": 0.2589145600795746, + "learning_rate": 3.937504711516417e-05, + "loss": 1.7301, + "step": 18934 + }, + { + "epoch": 5.811847759361571, + "grad_norm": 0.209516704082489, + "learning_rate": 3.9370190129080907e-05, + "loss": 1.7716, + "step": 18935 + }, + { + "epoch": 5.8121546961325965, + "grad_norm": 0.3321632146835327, + "learning_rate": 3.936533324804768e-05, + "loss": 1.7754, + "step": 18936 + }, + { + "epoch": 5.812461632903622, + "grad_norm": 0.236944317817688, + "learning_rate": 3.9360476472112446e-05, + "loss": 1.7546, + "step": 18937 + }, + { + "epoch": 5.812768569674647, + "grad_norm": 0.29667431116104126, + "learning_rate": 3.9355619801323226e-05, + "loss": 1.7712, + "step": 18938 + }, + { + "epoch": 5.8130755064456725, + "grad_norm": 0.3071129620075226, + "learning_rate": 3.935076323572802e-05, + "loss": 1.7351, + "step": 18939 + }, + { + "epoch": 5.813382443216698, + "grad_norm": 0.22747032344341278, + "learning_rate": 3.934590677537479e-05, 
+ "loss": 1.7788, + "step": 18940 + }, + { + "epoch": 5.813689379987722, + "grad_norm": 0.2575854957103729, + "learning_rate": 3.934105042031158e-05, + "loss": 1.705, + "step": 18941 + }, + { + "epoch": 5.813996316758748, + "grad_norm": 0.2561504542827606, + "learning_rate": 3.9336194170586325e-05, + "loss": 1.7309, + "step": 18942 + }, + { + "epoch": 5.814303253529773, + "grad_norm": 0.21570482850074768, + "learning_rate": 3.933133802624707e-05, + "loss": 1.7408, + "step": 18943 + }, + { + "epoch": 5.814610190300798, + "grad_norm": 0.29227179288864136, + "learning_rate": 3.932648198734177e-05, + "loss": 1.7415, + "step": 18944 + }, + { + "epoch": 5.814917127071823, + "grad_norm": 0.17847758531570435, + "learning_rate": 3.9321626053918456e-05, + "loss": 1.7926, + "step": 18945 + }, + { + "epoch": 5.815224063842848, + "grad_norm": 0.24604015052318573, + "learning_rate": 3.931677022602507e-05, + "loss": 1.7519, + "step": 18946 + }, + { + "epoch": 5.815531000613873, + "grad_norm": 0.23843185603618622, + "learning_rate": 3.931191450370965e-05, + "loss": 1.7206, + "step": 18947 + }, + { + "epoch": 5.815837937384899, + "grad_norm": 0.23431400954723358, + "learning_rate": 3.9307058887020126e-05, + "loss": 1.7743, + "step": 18948 + }, + { + "epoch": 5.816144874155924, + "grad_norm": 0.23685097694396973, + "learning_rate": 3.9302203376004525e-05, + "loss": 1.7485, + "step": 18949 + }, + { + "epoch": 5.816451810926949, + "grad_norm": 0.2129819542169571, + "learning_rate": 3.929734797071082e-05, + "loss": 1.6897, + "step": 18950 + }, + { + "epoch": 5.816758747697974, + "grad_norm": 0.24736030399799347, + "learning_rate": 3.9292492671187e-05, + "loss": 1.7292, + "step": 18951 + }, + { + "epoch": 5.817065684468999, + "grad_norm": 0.28659793734550476, + "learning_rate": 3.9287637477481025e-05, + "loss": 1.6772, + "step": 18952 + }, + { + "epoch": 5.8173726212400245, + "grad_norm": 0.22304075956344604, + "learning_rate": 3.928278238964092e-05, + "loss": 1.7991, + "step": 18953 + 
}, + { + "epoch": 5.81767955801105, + "grad_norm": 0.25354304909706116, + "learning_rate": 3.927792740771462e-05, + "loss": 1.7407, + "step": 18954 + }, + { + "epoch": 5.817986494782075, + "grad_norm": 0.3014552593231201, + "learning_rate": 3.927307253175014e-05, + "loss": 1.7714, + "step": 18955 + }, + { + "epoch": 5.8182934315531, + "grad_norm": 0.20537856221199036, + "learning_rate": 3.926821776179545e-05, + "loss": 1.6992, + "step": 18956 + }, + { + "epoch": 5.818600368324125, + "grad_norm": 0.29656440019607544, + "learning_rate": 3.92633630978985e-05, + "loss": 1.7476, + "step": 18957 + }, + { + "epoch": 5.81890730509515, + "grad_norm": 0.20956869423389435, + "learning_rate": 3.925850854010732e-05, + "loss": 1.808, + "step": 18958 + }, + { + "epoch": 5.819214241866176, + "grad_norm": 0.29395633935928345, + "learning_rate": 3.925365408846983e-05, + "loss": 1.7787, + "step": 18959 + }, + { + "epoch": 5.819521178637201, + "grad_norm": 0.31101030111312866, + "learning_rate": 3.9248799743034025e-05, + "loss": 1.7685, + "step": 18960 + }, + { + "epoch": 5.819828115408226, + "grad_norm": 0.2109794020652771, + "learning_rate": 3.9243945503847894e-05, + "loss": 1.7307, + "step": 18961 + }, + { + "epoch": 5.820135052179251, + "grad_norm": 0.2503393292427063, + "learning_rate": 3.9239091370959405e-05, + "loss": 1.763, + "step": 18962 + }, + { + "epoch": 5.820441988950276, + "grad_norm": 0.21757015585899353, + "learning_rate": 3.92342373444165e-05, + "loss": 1.7862, + "step": 18963 + }, + { + "epoch": 5.820748925721301, + "grad_norm": 0.22108088433742523, + "learning_rate": 3.9229383424267197e-05, + "loss": 1.6845, + "step": 18964 + }, + { + "epoch": 5.821055862492327, + "grad_norm": 0.20059655606746674, + "learning_rate": 3.922452961055941e-05, + "loss": 1.7523, + "step": 18965 + }, + { + "epoch": 5.821362799263352, + "grad_norm": 0.22009585797786713, + "learning_rate": 3.921967590334117e-05, + "loss": 1.7802, + "step": 18966 + }, + { + "epoch": 5.8216697360343765, + 
"grad_norm": 0.22554142773151398, + "learning_rate": 3.9214822302660386e-05, + "loss": 1.7911, + "step": 18967 + }, + { + "epoch": 5.821976672805402, + "grad_norm": 0.23434770107269287, + "learning_rate": 3.920996880856506e-05, + "loss": 1.6755, + "step": 18968 + }, + { + "epoch": 5.822283609576427, + "grad_norm": 0.2162926346063614, + "learning_rate": 3.920511542110314e-05, + "loss": 1.7145, + "step": 18969 + }, + { + "epoch": 5.8225905463474525, + "grad_norm": 0.18654806911945343, + "learning_rate": 3.9200262140322616e-05, + "loss": 1.7076, + "step": 18970 + }, + { + "epoch": 5.822897483118478, + "grad_norm": 0.22357499599456787, + "learning_rate": 3.9195408966271404e-05, + "loss": 1.791, + "step": 18971 + }, + { + "epoch": 5.823204419889503, + "grad_norm": 0.21073313057422638, + "learning_rate": 3.919055589899752e-05, + "loss": 1.7976, + "step": 18972 + }, + { + "epoch": 5.823511356660528, + "grad_norm": 0.21481956541538239, + "learning_rate": 3.9185702938548886e-05, + "loss": 1.7468, + "step": 18973 + }, + { + "epoch": 5.823818293431553, + "grad_norm": 0.22051872313022614, + "learning_rate": 3.9180850084973464e-05, + "loss": 1.7201, + "step": 18974 + }, + { + "epoch": 5.824125230202578, + "grad_norm": 0.24410493671894073, + "learning_rate": 3.917599733831924e-05, + "loss": 1.7774, + "step": 18975 + }, + { + "epoch": 5.824432166973604, + "grad_norm": 0.19711458683013916, + "learning_rate": 3.917114469863414e-05, + "loss": 1.7907, + "step": 18976 + }, + { + "epoch": 5.824739103744628, + "grad_norm": 0.2045203000307083, + "learning_rate": 3.9166292165966155e-05, + "loss": 1.7105, + "step": 18977 + }, + { + "epoch": 5.8250460405156534, + "grad_norm": 0.21570880711078644, + "learning_rate": 3.9161439740363196e-05, + "loss": 1.7312, + "step": 18978 + }, + { + "epoch": 5.825352977286679, + "grad_norm": 0.21203923225402832, + "learning_rate": 3.915658742187325e-05, + "loss": 1.7869, + "step": 18979 + }, + { + "epoch": 5.825659914057704, + "grad_norm": 
0.26233312487602234, + "learning_rate": 3.915173521054426e-05, + "loss": 1.7453, + "step": 18980 + }, + { + "epoch": 5.8259668508287294, + "grad_norm": 0.23792949318885803, + "learning_rate": 3.91468831064242e-05, + "loss": 1.6886, + "step": 18981 + }, + { + "epoch": 5.826273787599755, + "grad_norm": 0.20325250923633575, + "learning_rate": 3.914203110956098e-05, + "loss": 1.7538, + "step": 18982 + }, + { + "epoch": 5.82658072437078, + "grad_norm": 0.28146329522132874, + "learning_rate": 3.9137179220002596e-05, + "loss": 1.7674, + "step": 18983 + }, + { + "epoch": 5.826887661141805, + "grad_norm": 0.2319503277540207, + "learning_rate": 3.9132327437796946e-05, + "loss": 1.7864, + "step": 18984 + }, + { + "epoch": 5.82719459791283, + "grad_norm": 0.22653794288635254, + "learning_rate": 3.9127475762992025e-05, + "loss": 1.7424, + "step": 18985 + }, + { + "epoch": 5.827501534683855, + "grad_norm": 0.26855236291885376, + "learning_rate": 3.912262419563574e-05, + "loss": 1.762, + "step": 18986 + }, + { + "epoch": 5.827808471454881, + "grad_norm": 0.18356221914291382, + "learning_rate": 3.9117772735776095e-05, + "loss": 1.7199, + "step": 18987 + }, + { + "epoch": 5.828115408225905, + "grad_norm": 0.2802455425262451, + "learning_rate": 3.911292138346096e-05, + "loss": 1.7142, + "step": 18988 + }, + { + "epoch": 5.82842234499693, + "grad_norm": 0.2638777494430542, + "learning_rate": 3.910807013873835e-05, + "loss": 1.6759, + "step": 18989 + }, + { + "epoch": 5.828729281767956, + "grad_norm": 0.18397162854671478, + "learning_rate": 3.910321900165615e-05, + "loss": 1.693, + "step": 18990 + }, + { + "epoch": 5.829036218538981, + "grad_norm": 0.20967607200145721, + "learning_rate": 3.909836797226233e-05, + "loss": 1.6908, + "step": 18991 + }, + { + "epoch": 5.829343155310006, + "grad_norm": 0.21123014390468597, + "learning_rate": 3.909351705060485e-05, + "loss": 1.7875, + "step": 18992 + }, + { + "epoch": 5.829650092081032, + "grad_norm": 0.1988777220249176, + "learning_rate": 
3.90886662367316e-05, + "loss": 1.7254, + "step": 18993 + }, + { + "epoch": 5.829957028852056, + "grad_norm": 0.17793473601341248, + "learning_rate": 3.9083815530690564e-05, + "loss": 1.7233, + "step": 18994 + }, + { + "epoch": 5.8302639656230815, + "grad_norm": 0.2289644330739975, + "learning_rate": 3.9078964932529645e-05, + "loss": 1.7739, + "step": 18995 + }, + { + "epoch": 5.830570902394107, + "grad_norm": 0.18145552277565002, + "learning_rate": 3.9074114442296804e-05, + "loss": 1.6989, + "step": 18996 + }, + { + "epoch": 5.830877839165132, + "grad_norm": 0.1941588670015335, + "learning_rate": 3.9069264060039956e-05, + "loss": 1.6981, + "step": 18997 + }, + { + "epoch": 5.8311847759361575, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.9064413785807075e-05, + "loss": 1.7163, + "step": 18998 + }, + { + "epoch": 5.831491712707182, + "grad_norm": 0.19494447112083435, + "learning_rate": 3.905956361964604e-05, + "loss": 1.7481, + "step": 18999 + }, + { + "epoch": 5.831798649478207, + "grad_norm": 0.2127624899148941, + "learning_rate": 3.9054713561604826e-05, + "loss": 1.7494, + "step": 19000 + }, + { + "epoch": 5.832105586249233, + "grad_norm": 0.20107653737068176, + "learning_rate": 3.9049863611731334e-05, + "loss": 1.7483, + "step": 19001 + }, + { + "epoch": 5.832412523020258, + "grad_norm": 0.22574639320373535, + "learning_rate": 3.904501377007352e-05, + "loss": 1.8184, + "step": 19002 + }, + { + "epoch": 5.832719459791283, + "grad_norm": 0.20027579367160797, + "learning_rate": 3.9040164036679285e-05, + "loss": 1.6995, + "step": 19003 + }, + { + "epoch": 5.833026396562309, + "grad_norm": 0.21599887311458588, + "learning_rate": 3.90353144115966e-05, + "loss": 1.7487, + "step": 19004 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.21122781932353973, + "learning_rate": 3.9030464894873334e-05, + "loss": 1.7332, + "step": 19005 + }, + { + "epoch": 5.833640270104358, + "grad_norm": 0.19006453454494476, + "learning_rate": 3.902561548655747e-05, + 
"loss": 1.688, + "step": 19006 + }, + { + "epoch": 5.833947206875384, + "grad_norm": 0.22979344427585602, + "learning_rate": 3.9020766186696895e-05, + "loss": 1.7495, + "step": 19007 + }, + { + "epoch": 5.834254143646409, + "grad_norm": 0.18405365943908691, + "learning_rate": 3.901591699533953e-05, + "loss": 1.7395, + "step": 19008 + }, + { + "epoch": 5.834561080417434, + "grad_norm": 0.26198676228523254, + "learning_rate": 3.901106791253334e-05, + "loss": 1.8286, + "step": 19009 + }, + { + "epoch": 5.834868017188459, + "grad_norm": 0.2535797357559204, + "learning_rate": 3.900621893832619e-05, + "loss": 1.757, + "step": 19010 + }, + { + "epoch": 5.835174953959484, + "grad_norm": 0.24599581956863403, + "learning_rate": 3.900137007276605e-05, + "loss": 1.7266, + "step": 19011 + }, + { + "epoch": 5.8354818907305095, + "grad_norm": 0.25688427686691284, + "learning_rate": 3.8996521315900805e-05, + "loss": 1.7255, + "step": 19012 + }, + { + "epoch": 5.835788827501535, + "grad_norm": 0.24668128788471222, + "learning_rate": 3.8991672667778385e-05, + "loss": 1.737, + "step": 19013 + }, + { + "epoch": 5.83609576427256, + "grad_norm": 0.28365740180015564, + "learning_rate": 3.8986824128446695e-05, + "loss": 1.7129, + "step": 19014 + }, + { + "epoch": 5.8364027010435855, + "grad_norm": 0.2543952465057373, + "learning_rate": 3.89819756979537e-05, + "loss": 1.7249, + "step": 19015 + }, + { + "epoch": 5.83670963781461, + "grad_norm": 0.2868666350841522, + "learning_rate": 3.8977127376347245e-05, + "loss": 1.6985, + "step": 19016 + }, + { + "epoch": 5.837016574585635, + "grad_norm": 0.3818367123603821, + "learning_rate": 3.897227916367531e-05, + "loss": 1.6954, + "step": 19017 + }, + { + "epoch": 5.837323511356661, + "grad_norm": 0.20922113955020905, + "learning_rate": 3.896743105998574e-05, + "loss": 1.7571, + "step": 19018 + }, + { + "epoch": 5.837630448127686, + "grad_norm": 0.3669843375682831, + "learning_rate": 3.89625830653265e-05, + "loss": 1.8041, + "step": 19019 + }, + { 
+ "epoch": 5.83793738489871, + "grad_norm": 0.2889872193336487, + "learning_rate": 3.895773517974548e-05, + "loss": 1.7775, + "step": 19020 + }, + { + "epoch": 5.838244321669736, + "grad_norm": 0.22619491815567017, + "learning_rate": 3.89528874032906e-05, + "loss": 1.7019, + "step": 19021 + }, + { + "epoch": 5.838551258440761, + "grad_norm": 0.4169046878814697, + "learning_rate": 3.894803973600976e-05, + "loss": 1.8282, + "step": 19022 + }, + { + "epoch": 5.838858195211786, + "grad_norm": 0.2567043900489807, + "learning_rate": 3.894319217795087e-05, + "loss": 1.733, + "step": 19023 + }, + { + "epoch": 5.839165131982812, + "grad_norm": 0.2435060739517212, + "learning_rate": 3.8938344729161834e-05, + "loss": 1.7208, + "step": 19024 + }, + { + "epoch": 5.839472068753837, + "grad_norm": 0.2941838204860687, + "learning_rate": 3.893349738969055e-05, + "loss": 1.7202, + "step": 19025 + }, + { + "epoch": 5.8397790055248615, + "grad_norm": 0.23542317748069763, + "learning_rate": 3.892865015958495e-05, + "loss": 1.7571, + "step": 19026 + }, + { + "epoch": 5.840085942295887, + "grad_norm": 0.3248259723186493, + "learning_rate": 3.8923803038892897e-05, + "loss": 1.7118, + "step": 19027 + }, + { + "epoch": 5.840392879066912, + "grad_norm": 0.24359026551246643, + "learning_rate": 3.891895602766234e-05, + "loss": 1.8126, + "step": 19028 + }, + { + "epoch": 5.8406998158379375, + "grad_norm": 0.3053695559501648, + "learning_rate": 3.8914109125941126e-05, + "loss": 1.6632, + "step": 19029 + }, + { + "epoch": 5.841006752608963, + "grad_norm": 0.3194943368434906, + "learning_rate": 3.8909262333777195e-05, + "loss": 1.8432, + "step": 19030 + }, + { + "epoch": 5.841313689379987, + "grad_norm": 0.23532693088054657, + "learning_rate": 3.8904415651218426e-05, + "loss": 1.716, + "step": 19031 + }, + { + "epoch": 5.841620626151013, + "grad_norm": 0.2941347062587738, + "learning_rate": 3.889956907831275e-05, + "loss": 1.7737, + "step": 19032 + }, + { + "epoch": 5.841927562922038, + 
"grad_norm": 0.2265428602695465, + "learning_rate": 3.889472261510801e-05, + "loss": 1.7111, + "step": 19033 + }, + { + "epoch": 5.842234499693063, + "grad_norm": 0.3023710548877716, + "learning_rate": 3.888987626165216e-05, + "loss": 1.7845, + "step": 19034 + }, + { + "epoch": 5.842541436464089, + "grad_norm": 0.2855348289012909, + "learning_rate": 3.8885030017993026e-05, + "loss": 1.8009, + "step": 19035 + }, + { + "epoch": 5.842848373235114, + "grad_norm": 0.23046357929706573, + "learning_rate": 3.888018388417857e-05, + "loss": 1.8225, + "step": 19036 + }, + { + "epoch": 5.843155310006138, + "grad_norm": 0.23732341825962067, + "learning_rate": 3.8875337860256634e-05, + "loss": 1.7542, + "step": 19037 + }, + { + "epoch": 5.843462246777164, + "grad_norm": 0.18987004458904266, + "learning_rate": 3.887049194627516e-05, + "loss": 1.7327, + "step": 19038 + }, + { + "epoch": 5.843769183548189, + "grad_norm": 0.21539908647537231, + "learning_rate": 3.8865646142281974e-05, + "loss": 1.715, + "step": 19039 + }, + { + "epoch": 5.844076120319214, + "grad_norm": 0.2991954982280731, + "learning_rate": 3.8860800448325024e-05, + "loss": 1.7728, + "step": 19040 + }, + { + "epoch": 5.84438305709024, + "grad_norm": 0.19066409766674042, + "learning_rate": 3.885595486445216e-05, + "loss": 1.7128, + "step": 19041 + }, + { + "epoch": 5.844689993861264, + "grad_norm": 0.21643762290477753, + "learning_rate": 3.885110939071128e-05, + "loss": 1.7584, + "step": 19042 + }, + { + "epoch": 5.8449969306322895, + "grad_norm": 0.20227304100990295, + "learning_rate": 3.884626402715029e-05, + "loss": 1.7053, + "step": 19043 + }, + { + "epoch": 5.845303867403315, + "grad_norm": 0.20429107546806335, + "learning_rate": 3.884141877381703e-05, + "loss": 1.761, + "step": 19044 + }, + { + "epoch": 5.84561080417434, + "grad_norm": 0.1873873621225357, + "learning_rate": 3.8836573630759435e-05, + "loss": 1.7251, + "step": 19045 + }, + { + "epoch": 5.8459177409453655, + "grad_norm": 0.18025323748588562, + 
"learning_rate": 3.883172859802534e-05, + "loss": 1.6696, + "step": 19046 + }, + { + "epoch": 5.846224677716391, + "grad_norm": 0.22011777758598328, + "learning_rate": 3.8826883675662664e-05, + "loss": 1.7148, + "step": 19047 + }, + { + "epoch": 5.846531614487415, + "grad_norm": 0.17827673256397247, + "learning_rate": 3.882203886371925e-05, + "loss": 1.69, + "step": 19048 + }, + { + "epoch": 5.846838551258441, + "grad_norm": 0.200766459107399, + "learning_rate": 3.881719416224303e-05, + "loss": 1.7773, + "step": 19049 + }, + { + "epoch": 5.847145488029466, + "grad_norm": 0.22770950198173523, + "learning_rate": 3.8812349571281834e-05, + "loss": 1.7156, + "step": 19050 + }, + { + "epoch": 5.847452424800491, + "grad_norm": 0.19483895599842072, + "learning_rate": 3.880750509088357e-05, + "loss": 1.7304, + "step": 19051 + }, + { + "epoch": 5.847759361571516, + "grad_norm": 0.1988774836063385, + "learning_rate": 3.8802660721096086e-05, + "loss": 1.7428, + "step": 19052 + }, + { + "epoch": 5.848066298342541, + "grad_norm": 0.19881510734558105, + "learning_rate": 3.879781646196727e-05, + "loss": 1.7268, + "step": 19053 + }, + { + "epoch": 5.848373235113566, + "grad_norm": 0.21257543563842773, + "learning_rate": 3.8792972313545e-05, + "loss": 1.7532, + "step": 19054 + }, + { + "epoch": 5.848680171884592, + "grad_norm": 0.21000613272190094, + "learning_rate": 3.878812827587716e-05, + "loss": 1.7782, + "step": 19055 + }, + { + "epoch": 5.848987108655617, + "grad_norm": 0.2136746346950531, + "learning_rate": 3.878328434901159e-05, + "loss": 1.6875, + "step": 19056 + }, + { + "epoch": 5.849294045426642, + "grad_norm": 0.20291505753993988, + "learning_rate": 3.8778440532996204e-05, + "loss": 1.74, + "step": 19057 + }, + { + "epoch": 5.849600982197668, + "grad_norm": 0.22568103671073914, + "learning_rate": 3.877359682787883e-05, + "loss": 1.7074, + "step": 19058 + }, + { + "epoch": 5.849907918968692, + "grad_norm": 0.24398963153362274, + "learning_rate": 3.876875323370734e-05, + 
"loss": 1.6825, + "step": 19059 + }, + { + "epoch": 5.850214855739718, + "grad_norm": 0.19684453308582306, + "learning_rate": 3.876390975052964e-05, + "loss": 1.7143, + "step": 19060 + }, + { + "epoch": 5.850521792510743, + "grad_norm": 0.2786783277988434, + "learning_rate": 3.8759066378393544e-05, + "loss": 1.8339, + "step": 19061 + }, + { + "epoch": 5.850828729281768, + "grad_norm": 0.1977633833885193, + "learning_rate": 3.875422311734697e-05, + "loss": 1.742, + "step": 19062 + }, + { + "epoch": 5.851135666052793, + "grad_norm": 0.260643869638443, + "learning_rate": 3.874937996743772e-05, + "loss": 1.7728, + "step": 19063 + }, + { + "epoch": 5.851442602823818, + "grad_norm": 0.20998433232307434, + "learning_rate": 3.874453692871372e-05, + "loss": 1.768, + "step": 19064 + }, + { + "epoch": 5.851749539594843, + "grad_norm": 0.2603224217891693, + "learning_rate": 3.873969400122278e-05, + "loss": 1.8015, + "step": 19065 + }, + { + "epoch": 5.852056476365869, + "grad_norm": 0.24428118765354156, + "learning_rate": 3.87348511850128e-05, + "loss": 1.8133, + "step": 19066 + }, + { + "epoch": 5.852363413136894, + "grad_norm": 0.19380085170269012, + "learning_rate": 3.873000848013161e-05, + "loss": 1.7331, + "step": 19067 + }, + { + "epoch": 5.852670349907919, + "grad_norm": 0.20088011026382446, + "learning_rate": 3.87251658866271e-05, + "loss": 1.7501, + "step": 19068 + }, + { + "epoch": 5.852977286678944, + "grad_norm": 0.21920672059059143, + "learning_rate": 3.8720323404547095e-05, + "loss": 1.6848, + "step": 19069 + }, + { + "epoch": 5.853284223449969, + "grad_norm": 0.21692565083503723, + "learning_rate": 3.871548103393947e-05, + "loss": 1.7132, + "step": 19070 + }, + { + "epoch": 5.8535911602209945, + "grad_norm": 0.19463133811950684, + "learning_rate": 3.871063877485207e-05, + "loss": 1.7263, + "step": 19071 + }, + { + "epoch": 5.85389809699202, + "grad_norm": 0.21563300490379333, + "learning_rate": 3.870579662733277e-05, + "loss": 1.7271, + "step": 19072 + }, + { + 
"epoch": 5.854205033763045, + "grad_norm": 0.19901902973651886, + "learning_rate": 3.870095459142939e-05, + "loss": 1.7153, + "step": 19073 + }, + { + "epoch": 5.85451197053407, + "grad_norm": 0.2053879052400589, + "learning_rate": 3.869611266718982e-05, + "loss": 1.7769, + "step": 19074 + }, + { + "epoch": 5.854818907305095, + "grad_norm": 0.18877504765987396, + "learning_rate": 3.869127085466188e-05, + "loss": 1.7427, + "step": 19075 + }, + { + "epoch": 5.85512584407612, + "grad_norm": 0.2000892460346222, + "learning_rate": 3.8686429153893414e-05, + "loss": 1.7245, + "step": 19076 + }, + { + "epoch": 5.855432780847146, + "grad_norm": 0.23791030049324036, + "learning_rate": 3.868158756493231e-05, + "loss": 1.7128, + "step": 19077 + }, + { + "epoch": 5.855739717618171, + "grad_norm": 0.20807631313800812, + "learning_rate": 3.8676746087826374e-05, + "loss": 1.7235, + "step": 19078 + }, + { + "epoch": 5.856046654389196, + "grad_norm": 0.2603290379047394, + "learning_rate": 3.867190472262349e-05, + "loss": 1.7272, + "step": 19079 + }, + { + "epoch": 5.856353591160221, + "grad_norm": 0.25234153866767883, + "learning_rate": 3.8667063469371456e-05, + "loss": 1.7818, + "step": 19080 + }, + { + "epoch": 5.856660527931246, + "grad_norm": 0.20621159672737122, + "learning_rate": 3.866222232811816e-05, + "loss": 1.7318, + "step": 19081 + }, + { + "epoch": 5.856967464702271, + "grad_norm": 0.19565562903881073, + "learning_rate": 3.865738129891141e-05, + "loss": 1.6364, + "step": 19082 + }, + { + "epoch": 5.857274401473297, + "grad_norm": 0.2090953141450882, + "learning_rate": 3.86525403817991e-05, + "loss": 1.7763, + "step": 19083 + }, + { + "epoch": 5.857581338244322, + "grad_norm": 0.21286322176456451, + "learning_rate": 3.864769957682901e-05, + "loss": 1.7652, + "step": 19084 + }, + { + "epoch": 5.8578882750153465, + "grad_norm": 0.20606130361557007, + "learning_rate": 3.864285888404902e-05, + "loss": 1.7267, + "step": 19085 + }, + { + "epoch": 5.858195211786372, + 
"grad_norm": 0.18837152421474457, + "learning_rate": 3.863801830350694e-05, + "loss": 1.7013, + "step": 19086 + }, + { + "epoch": 5.858502148557397, + "grad_norm": 0.19374001026153564, + "learning_rate": 3.8633177835250636e-05, + "loss": 1.7462, + "step": 19087 + }, + { + "epoch": 5.8588090853284225, + "grad_norm": 0.19090552628040314, + "learning_rate": 3.8628337479327914e-05, + "loss": 1.7321, + "step": 19088 + }, + { + "epoch": 5.859116022099448, + "grad_norm": 0.19487829506397247, + "learning_rate": 3.8623497235786656e-05, + "loss": 1.7323, + "step": 19089 + }, + { + "epoch": 5.859422958870473, + "grad_norm": 0.23836077749729156, + "learning_rate": 3.861865710467464e-05, + "loss": 1.7277, + "step": 19090 + }, + { + "epoch": 5.859729895641498, + "grad_norm": 0.22283829748630524, + "learning_rate": 3.861381708603974e-05, + "loss": 1.7521, + "step": 19091 + }, + { + "epoch": 5.860036832412523, + "grad_norm": 0.2094828337430954, + "learning_rate": 3.8608977179929774e-05, + "loss": 1.763, + "step": 19092 + }, + { + "epoch": 5.860343769183548, + "grad_norm": 0.30857667326927185, + "learning_rate": 3.860413738639256e-05, + "loss": 1.7112, + "step": 19093 + }, + { + "epoch": 5.860650705954574, + "grad_norm": 0.22634989023208618, + "learning_rate": 3.8599297705475954e-05, + "loss": 1.7076, + "step": 19094 + }, + { + "epoch": 5.860957642725598, + "grad_norm": 0.20488132536411285, + "learning_rate": 3.8594458137227757e-05, + "loss": 1.6821, + "step": 19095 + }, + { + "epoch": 5.861264579496623, + "grad_norm": 0.22760719060897827, + "learning_rate": 3.8589618681695826e-05, + "loss": 1.6981, + "step": 19096 + }, + { + "epoch": 5.861571516267649, + "grad_norm": 0.21168997883796692, + "learning_rate": 3.858477933892795e-05, + "loss": 1.7396, + "step": 19097 + }, + { + "epoch": 5.861878453038674, + "grad_norm": 0.24725143611431122, + "learning_rate": 3.8579940108971984e-05, + "loss": 1.791, + "step": 19098 + }, + { + "epoch": 5.862185389809699, + "grad_norm": 
0.2245369702577591, + "learning_rate": 3.857510099187573e-05, + "loss": 1.7643, + "step": 19099 + }, + { + "epoch": 5.862492326580725, + "grad_norm": 0.20065639913082123, + "learning_rate": 3.8570261987687056e-05, + "loss": 1.715, + "step": 19100 + }, + { + "epoch": 5.862799263351749, + "grad_norm": 0.1857454925775528, + "learning_rate": 3.856542309645373e-05, + "loss": 1.6833, + "step": 19101 + }, + { + "epoch": 5.8631062001227745, + "grad_norm": 0.18816804885864258, + "learning_rate": 3.856058431822361e-05, + "loss": 1.7049, + "step": 19102 + }, + { + "epoch": 5.8634131368938, + "grad_norm": 0.2861626148223877, + "learning_rate": 3.855574565304448e-05, + "loss": 1.8275, + "step": 19103 + }, + { + "epoch": 5.863720073664825, + "grad_norm": 0.19937226176261902, + "learning_rate": 3.8550907100964196e-05, + "loss": 1.7137, + "step": 19104 + }, + { + "epoch": 5.8640270104358505, + "grad_norm": 0.2040586620569229, + "learning_rate": 3.854606866203055e-05, + "loss": 1.725, + "step": 19105 + }, + { + "epoch": 5.864333947206875, + "grad_norm": 0.21082650125026703, + "learning_rate": 3.854123033629137e-05, + "loss": 1.7143, + "step": 19106 + }, + { + "epoch": 5.8646408839779, + "grad_norm": 0.1977517306804657, + "learning_rate": 3.853639212379446e-05, + "loss": 1.7482, + "step": 19107 + }, + { + "epoch": 5.864947820748926, + "grad_norm": 0.2272191196680069, + "learning_rate": 3.8531554024587655e-05, + "loss": 1.7678, + "step": 19108 + }, + { + "epoch": 5.865254757519951, + "grad_norm": 0.22765736281871796, + "learning_rate": 3.852671603871876e-05, + "loss": 1.7721, + "step": 19109 + }, + { + "epoch": 5.865561694290976, + "grad_norm": 0.20707197487354279, + "learning_rate": 3.852187816623556e-05, + "loss": 1.7509, + "step": 19110 + }, + { + "epoch": 5.865868631062002, + "grad_norm": 0.2699931561946869, + "learning_rate": 3.851704040718591e-05, + "loss": 1.6845, + "step": 19111 + }, + { + "epoch": 5.866175567833026, + "grad_norm": 0.24394196271896362, + "learning_rate": 
3.8512202761617575e-05, + "loss": 1.6895, + "step": 19112 + }, + { + "epoch": 5.866482504604051, + "grad_norm": 0.21921835839748383, + "learning_rate": 3.850736522957841e-05, + "loss": 1.7739, + "step": 19113 + }, + { + "epoch": 5.866789441375077, + "grad_norm": 0.2268306314945221, + "learning_rate": 3.8502527811116175e-05, + "loss": 1.7773, + "step": 19114 + }, + { + "epoch": 5.867096378146102, + "grad_norm": 0.2165728509426117, + "learning_rate": 3.84976905062787e-05, + "loss": 1.7567, + "step": 19115 + }, + { + "epoch": 5.867403314917127, + "grad_norm": 0.188106968998909, + "learning_rate": 3.8492853315113804e-05, + "loss": 1.7209, + "step": 19116 + }, + { + "epoch": 5.867710251688152, + "grad_norm": 0.20750530064105988, + "learning_rate": 3.848801623766927e-05, + "loss": 1.6999, + "step": 19117 + }, + { + "epoch": 5.868017188459177, + "grad_norm": 0.2475438266992569, + "learning_rate": 3.84831792739929e-05, + "loss": 1.7535, + "step": 19118 + }, + { + "epoch": 5.8683241252302025, + "grad_norm": 0.23291872441768646, + "learning_rate": 3.847834242413252e-05, + "loss": 1.7137, + "step": 19119 + }, + { + "epoch": 5.868631062001228, + "grad_norm": 0.18381048738956451, + "learning_rate": 3.847350568813589e-05, + "loss": 1.7657, + "step": 19120 + }, + { + "epoch": 5.868937998772253, + "grad_norm": 0.19330385327339172, + "learning_rate": 3.8468669066050845e-05, + "loss": 1.7109, + "step": 19121 + }, + { + "epoch": 5.8692449355432785, + "grad_norm": 0.22503000497817993, + "learning_rate": 3.846383255792517e-05, + "loss": 1.7668, + "step": 19122 + }, + { + "epoch": 5.869551872314303, + "grad_norm": 0.2147306352853775, + "learning_rate": 3.845899616380667e-05, + "loss": 1.74, + "step": 19123 + }, + { + "epoch": 5.869858809085328, + "grad_norm": 0.18493011593818665, + "learning_rate": 3.845415988374312e-05, + "loss": 1.7066, + "step": 19124 + }, + { + "epoch": 5.870165745856354, + "grad_norm": 0.28276753425598145, + "learning_rate": 3.844932371778235e-05, + "loss": 1.7925, 
+ "step": 19125 + }, + { + "epoch": 5.870472682627379, + "grad_norm": 0.23486676812171936, + "learning_rate": 3.844448766597212e-05, + "loss": 1.8216, + "step": 19126 + }, + { + "epoch": 5.870779619398404, + "grad_norm": 0.24370723962783813, + "learning_rate": 3.843965172836024e-05, + "loss": 1.709, + "step": 19127 + }, + { + "epoch": 5.871086556169429, + "grad_norm": 0.22540852427482605, + "learning_rate": 3.843481590499449e-05, + "loss": 1.7608, + "step": 19128 + }, + { + "epoch": 5.871393492940454, + "grad_norm": 0.20578467845916748, + "learning_rate": 3.8429980195922666e-05, + "loss": 1.7288, + "step": 19129 + }, + { + "epoch": 5.871700429711479, + "grad_norm": 0.265325129032135, + "learning_rate": 3.842514460119258e-05, + "loss": 1.7711, + "step": 19130 + }, + { + "epoch": 5.872007366482505, + "grad_norm": 0.20076121389865875, + "learning_rate": 3.842030912085197e-05, + "loss": 1.6764, + "step": 19131 + }, + { + "epoch": 5.87231430325353, + "grad_norm": 0.23941899836063385, + "learning_rate": 3.841547375494868e-05, + "loss": 1.8157, + "step": 19132 + }, + { + "epoch": 5.872621240024555, + "grad_norm": 0.23184041678905487, + "learning_rate": 3.841063850353044e-05, + "loss": 1.6948, + "step": 19133 + }, + { + "epoch": 5.87292817679558, + "grad_norm": 0.20299546420574188, + "learning_rate": 3.840580336664508e-05, + "loss": 1.7812, + "step": 19134 + }, + { + "epoch": 5.873235113566605, + "grad_norm": 0.24654673039913177, + "learning_rate": 3.840096834434036e-05, + "loss": 1.7999, + "step": 19135 + }, + { + "epoch": 5.8735420503376305, + "grad_norm": 0.21144285798072815, + "learning_rate": 3.8396133436664085e-05, + "loss": 1.7033, + "step": 19136 + }, + { + "epoch": 5.873848987108656, + "grad_norm": 0.22186708450317383, + "learning_rate": 3.8391298643663997e-05, + "loss": 1.7292, + "step": 19137 + }, + { + "epoch": 5.87415592387968, + "grad_norm": 0.21017275750637054, + "learning_rate": 3.838646396538793e-05, + "loss": 1.6989, + "step": 19138 + }, + { + "epoch": 
5.874462860650706, + "grad_norm": 0.19430704414844513, + "learning_rate": 3.83816294018836e-05, + "loss": 1.7446, + "step": 19139 + }, + { + "epoch": 5.874769797421731, + "grad_norm": 0.25048547983169556, + "learning_rate": 3.8376794953198836e-05, + "loss": 1.7358, + "step": 19140 + }, + { + "epoch": 5.875076734192756, + "grad_norm": 0.21869583427906036, + "learning_rate": 3.8371960619381406e-05, + "loss": 1.7017, + "step": 19141 + }, + { + "epoch": 5.875383670963782, + "grad_norm": 0.2053002119064331, + "learning_rate": 3.836712640047905e-05, + "loss": 1.7077, + "step": 19142 + }, + { + "epoch": 5.875690607734807, + "grad_norm": 0.2222425490617752, + "learning_rate": 3.83622922965396e-05, + "loss": 1.7259, + "step": 19143 + }, + { + "epoch": 5.8759975445058314, + "grad_norm": 0.20682495832443237, + "learning_rate": 3.8357458307610774e-05, + "loss": 1.7597, + "step": 19144 + }, + { + "epoch": 5.876304481276857, + "grad_norm": 0.2001802772283554, + "learning_rate": 3.835262443374038e-05, + "loss": 1.7546, + "step": 19145 + }, + { + "epoch": 5.876611418047882, + "grad_norm": 0.20499882102012634, + "learning_rate": 3.8347790674976166e-05, + "loss": 1.6741, + "step": 19146 + }, + { + "epoch": 5.8769183548189075, + "grad_norm": 0.17830348014831543, + "learning_rate": 3.834295703136593e-05, + "loss": 1.7067, + "step": 19147 + }, + { + "epoch": 5.877225291589933, + "grad_norm": 0.25055429339408875, + "learning_rate": 3.833812350295741e-05, + "loss": 1.753, + "step": 19148 + }, + { + "epoch": 5.877532228360957, + "grad_norm": 0.19037213921546936, + "learning_rate": 3.8333290089798415e-05, + "loss": 1.7336, + "step": 19149 + }, + { + "epoch": 5.877839165131983, + "grad_norm": 0.18041233718395233, + "learning_rate": 3.8328456791936656e-05, + "loss": 1.7172, + "step": 19150 + }, + { + "epoch": 5.878146101903008, + "grad_norm": 0.21531802415847778, + "learning_rate": 3.832362360941994e-05, + "loss": 1.7328, + "step": 19151 + }, + { + "epoch": 5.878453038674033, + "grad_norm": 
0.23101283609867096, + "learning_rate": 3.831879054229601e-05, + "loss": 1.7548, + "step": 19152 + }, + { + "epoch": 5.878759975445059, + "grad_norm": 0.19029635190963745, + "learning_rate": 3.831395759061266e-05, + "loss": 1.6852, + "step": 19153 + }, + { + "epoch": 5.879066912216084, + "grad_norm": 0.20305602252483368, + "learning_rate": 3.830912475441761e-05, + "loss": 1.6982, + "step": 19154 + }, + { + "epoch": 5.879373848987108, + "grad_norm": 0.19752593338489532, + "learning_rate": 3.830429203375866e-05, + "loss": 1.7726, + "step": 19155 + }, + { + "epoch": 5.879680785758134, + "grad_norm": 0.2109406590461731, + "learning_rate": 3.8299459428683526e-05, + "loss": 1.7629, + "step": 19156 + }, + { + "epoch": 5.879987722529159, + "grad_norm": 0.19448740780353546, + "learning_rate": 3.829462693924001e-05, + "loss": 1.6981, + "step": 19157 + }, + { + "epoch": 5.880294659300184, + "grad_norm": 0.19344154000282288, + "learning_rate": 3.828979456547586e-05, + "loss": 1.6822, + "step": 19158 + }, + { + "epoch": 5.88060159607121, + "grad_norm": 0.24466145038604736, + "learning_rate": 3.82849623074388e-05, + "loss": 1.7575, + "step": 19159 + }, + { + "epoch": 5.880908532842234, + "grad_norm": 0.20174476504325867, + "learning_rate": 3.828013016517663e-05, + "loss": 1.7267, + "step": 19160 + }, + { + "epoch": 5.8812154696132595, + "grad_norm": 0.23560820519924164, + "learning_rate": 3.827529813873706e-05, + "loss": 1.7125, + "step": 19161 + }, + { + "epoch": 5.881522406384285, + "grad_norm": 0.18118280172348022, + "learning_rate": 3.827046622816789e-05, + "loss": 1.7436, + "step": 19162 + }, + { + "epoch": 5.88182934315531, + "grad_norm": 0.27250152826309204, + "learning_rate": 3.8265634433516824e-05, + "loss": 1.7249, + "step": 19163 + }, + { + "epoch": 5.8821362799263355, + "grad_norm": 0.23510734736919403, + "learning_rate": 3.826080275483166e-05, + "loss": 1.7502, + "step": 19164 + }, + { + "epoch": 5.882443216697361, + "grad_norm": 0.22708909213542938, + 
"learning_rate": 3.82559711921601e-05, + "loss": 1.7478, + "step": 19165 + }, + { + "epoch": 5.882750153468385, + "grad_norm": 0.292584627866745, + "learning_rate": 3.825113974554995e-05, + "loss": 1.6757, + "step": 19166 + }, + { + "epoch": 5.883057090239411, + "grad_norm": 0.22186334431171417, + "learning_rate": 3.8246308415048884e-05, + "loss": 1.7061, + "step": 19167 + }, + { + "epoch": 5.883364027010436, + "grad_norm": 0.23995520174503326, + "learning_rate": 3.8241477200704714e-05, + "loss": 1.6962, + "step": 19168 + }, + { + "epoch": 5.883670963781461, + "grad_norm": 0.25545260310173035, + "learning_rate": 3.823664610256513e-05, + "loss": 1.7582, + "step": 19169 + }, + { + "epoch": 5.883977900552486, + "grad_norm": 0.2209167629480362, + "learning_rate": 3.823181512067794e-05, + "loss": 1.7212, + "step": 19170 + }, + { + "epoch": 5.884284837323511, + "grad_norm": 0.24626508355140686, + "learning_rate": 3.8226984255090824e-05, + "loss": 1.7356, + "step": 19171 + }, + { + "epoch": 5.884591774094536, + "grad_norm": 0.22982320189476013, + "learning_rate": 3.822215350585157e-05, + "loss": 1.7516, + "step": 19172 + }, + { + "epoch": 5.884898710865562, + "grad_norm": 0.19458627700805664, + "learning_rate": 3.8217322873007874e-05, + "loss": 1.7097, + "step": 19173 + }, + { + "epoch": 5.885205647636587, + "grad_norm": 0.2030913233757019, + "learning_rate": 3.8212492356607524e-05, + "loss": 1.7273, + "step": 19174 + }, + { + "epoch": 5.885512584407612, + "grad_norm": 0.20174767076969147, + "learning_rate": 3.820766195669823e-05, + "loss": 1.7167, + "step": 19175 + }, + { + "epoch": 5.885819521178637, + "grad_norm": 0.22572553157806396, + "learning_rate": 3.820283167332772e-05, + "loss": 1.8034, + "step": 19176 + }, + { + "epoch": 5.886126457949662, + "grad_norm": 0.24423041939735413, + "learning_rate": 3.819800150654376e-05, + "loss": 1.7188, + "step": 19177 + }, + { + "epoch": 5.8864333947206875, + "grad_norm": 0.20805509388446808, + "learning_rate": 
3.819317145639404e-05, + "loss": 1.7252, + "step": 19178 + }, + { + "epoch": 5.886740331491713, + "grad_norm": 0.2731400728225708, + "learning_rate": 3.8188341522926334e-05, + "loss": 1.7778, + "step": 19179 + }, + { + "epoch": 5.887047268262738, + "grad_norm": 0.2604491412639618, + "learning_rate": 3.818351170618835e-05, + "loss": 1.7524, + "step": 19180 + }, + { + "epoch": 5.887354205033763, + "grad_norm": 0.20043112337589264, + "learning_rate": 3.817868200622785e-05, + "loss": 1.7176, + "step": 19181 + }, + { + "epoch": 5.887661141804788, + "grad_norm": 0.2224988341331482, + "learning_rate": 3.817385242309253e-05, + "loss": 1.7267, + "step": 19182 + }, + { + "epoch": 5.887968078575813, + "grad_norm": 0.24603894352912903, + "learning_rate": 3.8169022956830135e-05, + "loss": 1.716, + "step": 19183 + }, + { + "epoch": 5.888275015346839, + "grad_norm": 0.19959969818592072, + "learning_rate": 3.816419360748839e-05, + "loss": 1.7461, + "step": 19184 + }, + { + "epoch": 5.888581952117864, + "grad_norm": 0.21907947957515717, + "learning_rate": 3.815936437511501e-05, + "loss": 1.6982, + "step": 19185 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.1920289248228073, + "learning_rate": 3.8154535259757735e-05, + "loss": 1.7213, + "step": 19186 + }, + { + "epoch": 5.889195825659914, + "grad_norm": 0.21930737793445587, + "learning_rate": 3.81497062614643e-05, + "loss": 1.7389, + "step": 19187 + }, + { + "epoch": 5.889502762430939, + "grad_norm": 0.1972137838602066, + "learning_rate": 3.814487738028239e-05, + "loss": 1.7317, + "step": 19188 + }, + { + "epoch": 5.889809699201964, + "grad_norm": 0.20000529289245605, + "learning_rate": 3.8140048616259785e-05, + "loss": 1.7148, + "step": 19189 + }, + { + "epoch": 5.89011663597299, + "grad_norm": 0.18828663229942322, + "learning_rate": 3.8135219969444135e-05, + "loss": 1.725, + "step": 19190 + }, + { + "epoch": 5.890423572744015, + "grad_norm": 0.2237224131822586, + "learning_rate": 3.8130391439883216e-05, + "loss": 1.7252, 
+ "step": 19191 + }, + { + "epoch": 5.8907305095150395, + "grad_norm": 0.19954712688922882, + "learning_rate": 3.812556302762473e-05, + "loss": 1.7071, + "step": 19192 + }, + { + "epoch": 5.891037446286065, + "grad_norm": 0.23509685695171356, + "learning_rate": 3.812073473271637e-05, + "loss": 1.7603, + "step": 19193 + }, + { + "epoch": 5.89134438305709, + "grad_norm": 0.28477707505226135, + "learning_rate": 3.81159065552059e-05, + "loss": 1.8193, + "step": 19194 + }, + { + "epoch": 5.8916513198281155, + "grad_norm": 0.1936045140028, + "learning_rate": 3.811107849514098e-05, + "loss": 1.7438, + "step": 19195 + }, + { + "epoch": 5.891958256599141, + "grad_norm": 0.288253515958786, + "learning_rate": 3.810625055256936e-05, + "loss": 1.8042, + "step": 19196 + }, + { + "epoch": 5.892265193370166, + "grad_norm": 0.19256485998630524, + "learning_rate": 3.810142272753873e-05, + "loss": 1.6997, + "step": 19197 + }, + { + "epoch": 5.892572130141191, + "grad_norm": 0.2823546826839447, + "learning_rate": 3.809659502009684e-05, + "loss": 1.7133, + "step": 19198 + }, + { + "epoch": 5.892879066912216, + "grad_norm": 0.25116851925849915, + "learning_rate": 3.809176743029136e-05, + "loss": 1.7402, + "step": 19199 + }, + { + "epoch": 5.893186003683241, + "grad_norm": 0.19840675592422485, + "learning_rate": 3.808693995817003e-05, + "loss": 1.7009, + "step": 19200 + }, + { + "epoch": 5.893492940454267, + "grad_norm": 0.2703700363636017, + "learning_rate": 3.808211260378051e-05, + "loss": 1.741, + "step": 19201 + }, + { + "epoch": 5.893799877225292, + "grad_norm": 0.25683698058128357, + "learning_rate": 3.807728536717056e-05, + "loss": 1.7431, + "step": 19202 + }, + { + "epoch": 5.894106813996316, + "grad_norm": 0.19033822417259216, + "learning_rate": 3.8072458248387855e-05, + "loss": 1.7423, + "step": 19203 + }, + { + "epoch": 5.894413750767342, + "grad_norm": 0.2771024703979492, + "learning_rate": 3.806763124748012e-05, + "loss": 1.7376, + "step": 19204 + }, + { + "epoch": 
5.894720687538367, + "grad_norm": 0.30265524983406067, + "learning_rate": 3.806280436449504e-05, + "loss": 1.7124, + "step": 19205 + }, + { + "epoch": 5.895027624309392, + "grad_norm": 0.21838776767253876, + "learning_rate": 3.805797759948033e-05, + "loss": 1.7319, + "step": 19206 + }, + { + "epoch": 5.895334561080418, + "grad_norm": 0.22244395315647125, + "learning_rate": 3.805315095248368e-05, + "loss": 1.7034, + "step": 19207 + }, + { + "epoch": 5.895641497851443, + "grad_norm": 0.20621941983699799, + "learning_rate": 3.8048324423552786e-05, + "loss": 1.7231, + "step": 19208 + }, + { + "epoch": 5.8959484346224675, + "grad_norm": 0.23735111951828003, + "learning_rate": 3.804349801273538e-05, + "loss": 1.7484, + "step": 19209 + }, + { + "epoch": 5.896255371393493, + "grad_norm": 0.33221447467803955, + "learning_rate": 3.803867172007911e-05, + "loss": 1.7782, + "step": 19210 + }, + { + "epoch": 5.896562308164518, + "grad_norm": 0.20859810709953308, + "learning_rate": 3.803384554563172e-05, + "loss": 1.688, + "step": 19211 + }, + { + "epoch": 5.8968692449355435, + "grad_norm": 0.25731268525123596, + "learning_rate": 3.8029019489440855e-05, + "loss": 1.7463, + "step": 19212 + }, + { + "epoch": 5.897176181706568, + "grad_norm": 0.26556700468063354, + "learning_rate": 3.802419355155425e-05, + "loss": 1.7251, + "step": 19213 + }, + { + "epoch": 5.897483118477593, + "grad_norm": 0.20397205650806427, + "learning_rate": 3.801936773201957e-05, + "loss": 1.6785, + "step": 19214 + }, + { + "epoch": 5.897790055248619, + "grad_norm": 0.2198234349489212, + "learning_rate": 3.8014542030884544e-05, + "loss": 1.7608, + "step": 19215 + }, + { + "epoch": 5.898096992019644, + "grad_norm": 0.22619546949863434, + "learning_rate": 3.800971644819681e-05, + "loss": 1.8034, + "step": 19216 + }, + { + "epoch": 5.898403928790669, + "grad_norm": 0.22074444591999054, + "learning_rate": 3.800489098400412e-05, + "loss": 1.777, + "step": 19217 + }, + { + "epoch": 5.898710865561695, + "grad_norm": 
0.2555946707725525, + "learning_rate": 3.80000656383541e-05, + "loss": 1.7578, + "step": 19218 + }, + { + "epoch": 5.899017802332719, + "grad_norm": 0.2130863517522812, + "learning_rate": 3.7995240411294474e-05, + "loss": 1.7312, + "step": 19219 + }, + { + "epoch": 5.899324739103744, + "grad_norm": 0.2574099898338318, + "learning_rate": 3.799041530287291e-05, + "loss": 1.7509, + "step": 19220 + }, + { + "epoch": 5.89963167587477, + "grad_norm": 0.2556573152542114, + "learning_rate": 3.798559031313712e-05, + "loss": 1.7624, + "step": 19221 + }, + { + "epoch": 5.899938612645795, + "grad_norm": 0.19909335672855377, + "learning_rate": 3.798076544213475e-05, + "loss": 1.7466, + "step": 19222 + }, + { + "epoch": 5.9002455494168204, + "grad_norm": 0.19832594692707062, + "learning_rate": 3.7975940689913526e-05, + "loss": 1.6896, + "step": 19223 + }, + { + "epoch": 5.900552486187845, + "grad_norm": 0.18473665416240692, + "learning_rate": 3.7971116056521076e-05, + "loss": 1.7167, + "step": 19224 + }, + { + "epoch": 5.90085942295887, + "grad_norm": 0.21106892824172974, + "learning_rate": 3.796629154200512e-05, + "loss": 1.8071, + "step": 19225 + }, + { + "epoch": 5.901166359729896, + "grad_norm": 0.20903728902339935, + "learning_rate": 3.796146714641333e-05, + "loss": 1.6946, + "step": 19226 + }, + { + "epoch": 5.901473296500921, + "grad_norm": 0.21518728137016296, + "learning_rate": 3.795664286979336e-05, + "loss": 1.6899, + "step": 19227 + }, + { + "epoch": 5.901780233271946, + "grad_norm": 0.1948135644197464, + "learning_rate": 3.7951818712192926e-05, + "loss": 1.7568, + "step": 19228 + }, + { + "epoch": 5.902087170042972, + "grad_norm": 0.2222091257572174, + "learning_rate": 3.7946994673659667e-05, + "loss": 1.8118, + "step": 19229 + }, + { + "epoch": 5.902394106813996, + "grad_norm": 0.2173513025045395, + "learning_rate": 3.794217075424127e-05, + "loss": 1.7194, + "step": 19230 + }, + { + "epoch": 5.902701043585021, + "grad_norm": 0.2026323676109314, + "learning_rate": 
3.79373469539854e-05, + "loss": 1.6944, + "step": 19231 + }, + { + "epoch": 5.903007980356047, + "grad_norm": 0.22178098559379578, + "learning_rate": 3.7932523272939765e-05, + "loss": 1.7328, + "step": 19232 + }, + { + "epoch": 5.903314917127072, + "grad_norm": 0.22846719622612, + "learning_rate": 3.792769971115198e-05, + "loss": 1.8065, + "step": 19233 + }, + { + "epoch": 5.903621853898097, + "grad_norm": 0.2086053490638733, + "learning_rate": 3.792287626866977e-05, + "loss": 1.7511, + "step": 19234 + }, + { + "epoch": 5.903928790669122, + "grad_norm": 0.22444705665111542, + "learning_rate": 3.791805294554075e-05, + "loss": 1.742, + "step": 19235 + }, + { + "epoch": 5.904235727440147, + "grad_norm": 0.24630236625671387, + "learning_rate": 3.7913229741812625e-05, + "loss": 1.7531, + "step": 19236 + }, + { + "epoch": 5.9045426642111725, + "grad_norm": 0.2618274986743927, + "learning_rate": 3.7908406657533036e-05, + "loss": 1.7387, + "step": 19237 + }, + { + "epoch": 5.904849600982198, + "grad_norm": 0.25871509313583374, + "learning_rate": 3.790358369274968e-05, + "loss": 1.7822, + "step": 19238 + }, + { + "epoch": 5.905156537753223, + "grad_norm": 0.22675062716007233, + "learning_rate": 3.789876084751018e-05, + "loss": 1.7788, + "step": 19239 + }, + { + "epoch": 5.9054634745242485, + "grad_norm": 0.26623663306236267, + "learning_rate": 3.789393812186224e-05, + "loss": 1.7092, + "step": 19240 + }, + { + "epoch": 5.905770411295273, + "grad_norm": 0.19448868930339813, + "learning_rate": 3.788911551585348e-05, + "loss": 1.7164, + "step": 19241 + }, + { + "epoch": 5.906077348066298, + "grad_norm": 0.22451938688755035, + "learning_rate": 3.788429302953158e-05, + "loss": 1.667, + "step": 19242 + }, + { + "epoch": 5.906384284837324, + "grad_norm": 0.2323608547449112, + "learning_rate": 3.7879470662944214e-05, + "loss": 1.7992, + "step": 19243 + }, + { + "epoch": 5.906691221608349, + "grad_norm": 0.2508258819580078, + "learning_rate": 3.7874648416139e-05, + "loss": 1.7681, + 
"step": 19244 + }, + { + "epoch": 5.906998158379373, + "grad_norm": 0.22333547472953796, + "learning_rate": 3.786982628916364e-05, + "loss": 1.7006, + "step": 19245 + }, + { + "epoch": 5.907305095150399, + "grad_norm": 0.19816327095031738, + "learning_rate": 3.786500428206575e-05, + "loss": 1.7458, + "step": 19246 + }, + { + "epoch": 5.907612031921424, + "grad_norm": 0.2047683447599411, + "learning_rate": 3.7860182394893006e-05, + "loss": 1.7385, + "step": 19247 + }, + { + "epoch": 5.907918968692449, + "grad_norm": 0.2124621719121933, + "learning_rate": 3.785536062769304e-05, + "loss": 1.7373, + "step": 19248 + }, + { + "epoch": 5.908225905463475, + "grad_norm": 0.200453981757164, + "learning_rate": 3.785053898051355e-05, + "loss": 1.7754, + "step": 19249 + }, + { + "epoch": 5.9085328422345, + "grad_norm": 0.19543224573135376, + "learning_rate": 3.784571745340212e-05, + "loss": 1.724, + "step": 19250 + }, + { + "epoch": 5.9088397790055245, + "grad_norm": 0.17079658806324005, + "learning_rate": 3.784089604640647e-05, + "loss": 1.6843, + "step": 19251 + }, + { + "epoch": 5.90914671577655, + "grad_norm": 0.22792236506938934, + "learning_rate": 3.783607475957418e-05, + "loss": 1.7442, + "step": 19252 + }, + { + "epoch": 5.909453652547575, + "grad_norm": 0.20699752867221832, + "learning_rate": 3.783125359295294e-05, + "loss": 1.7868, + "step": 19253 + }, + { + "epoch": 5.9097605893186005, + "grad_norm": 0.2156144678592682, + "learning_rate": 3.782643254659038e-05, + "loss": 1.7443, + "step": 19254 + }, + { + "epoch": 5.910067526089626, + "grad_norm": 0.2021300345659256, + "learning_rate": 3.782161162053417e-05, + "loss": 1.7749, + "step": 19255 + }, + { + "epoch": 5.91037446286065, + "grad_norm": 0.17613129317760468, + "learning_rate": 3.7816790814831905e-05, + "loss": 1.7001, + "step": 19256 + }, + { + "epoch": 5.910681399631676, + "grad_norm": 0.18911564350128174, + "learning_rate": 3.781197012953128e-05, + "loss": 1.6817, + "step": 19257 + }, + { + "epoch": 
5.910988336402701, + "grad_norm": 0.18920689821243286, + "learning_rate": 3.780714956467989e-05, + "loss": 1.7554, + "step": 19258 + }, + { + "epoch": 5.911295273173726, + "grad_norm": 0.22030571103096008, + "learning_rate": 3.7802329120325396e-05, + "loss": 1.7554, + "step": 19259 + }, + { + "epoch": 5.911602209944752, + "grad_norm": 0.21164962649345398, + "learning_rate": 3.779750879651545e-05, + "loss": 1.74, + "step": 19260 + }, + { + "epoch": 5.911909146715777, + "grad_norm": 0.2205103188753128, + "learning_rate": 3.779268859329766e-05, + "loss": 1.7424, + "step": 19261 + }, + { + "epoch": 5.912216083486801, + "grad_norm": 0.19262658059597015, + "learning_rate": 3.7787868510719685e-05, + "loss": 1.7157, + "step": 19262 + }, + { + "epoch": 5.912523020257827, + "grad_norm": 0.19583287835121155, + "learning_rate": 3.778304854882914e-05, + "loss": 1.7343, + "step": 19263 + }, + { + "epoch": 5.912829957028852, + "grad_norm": 0.18275529146194458, + "learning_rate": 3.777822870767368e-05, + "loss": 1.6938, + "step": 19264 + }, + { + "epoch": 5.913136893799877, + "grad_norm": 0.21268916130065918, + "learning_rate": 3.7773408987300914e-05, + "loss": 1.7546, + "step": 19265 + }, + { + "epoch": 5.913443830570903, + "grad_norm": 0.20878887176513672, + "learning_rate": 3.77685893877585e-05, + "loss": 1.8109, + "step": 19266 + }, + { + "epoch": 5.913750767341927, + "grad_norm": 0.2326175421476364, + "learning_rate": 3.776376990909404e-05, + "loss": 1.7248, + "step": 19267 + }, + { + "epoch": 5.9140577041129525, + "grad_norm": 0.28189611434936523, + "learning_rate": 3.7758950551355204e-05, + "loss": 1.7796, + "step": 19268 + }, + { + "epoch": 5.914364640883978, + "grad_norm": 0.1922682821750641, + "learning_rate": 3.775413131458957e-05, + "loss": 1.7096, + "step": 19269 + }, + { + "epoch": 5.914671577655003, + "grad_norm": 0.2839193642139435, + "learning_rate": 3.774931219884479e-05, + "loss": 1.7341, + "step": 19270 + }, + { + "epoch": 5.9149785144260285, + "grad_norm": 
0.2075256109237671, + "learning_rate": 3.7744493204168495e-05, + "loss": 1.7565, + "step": 19271 + }, + { + "epoch": 5.915285451197054, + "grad_norm": 0.2780497372150421, + "learning_rate": 3.7739674330608306e-05, + "loss": 1.7186, + "step": 19272 + }, + { + "epoch": 5.915592387968078, + "grad_norm": 0.26129212975502014, + "learning_rate": 3.773485557821182e-05, + "loss": 1.8468, + "step": 19273 + }, + { + "epoch": 5.915899324739104, + "grad_norm": 0.3299194276332855, + "learning_rate": 3.773003694702671e-05, + "loss": 1.7705, + "step": 19274 + }, + { + "epoch": 5.916206261510129, + "grad_norm": 0.3011106848716736, + "learning_rate": 3.772521843710054e-05, + "loss": 1.748, + "step": 19275 + }, + { + "epoch": 5.916513198281154, + "grad_norm": 0.21370603144168854, + "learning_rate": 3.7720400048480966e-05, + "loss": 1.7709, + "step": 19276 + }, + { + "epoch": 5.91682013505218, + "grad_norm": 0.29374879598617554, + "learning_rate": 3.771558178121561e-05, + "loss": 1.6948, + "step": 19277 + }, + { + "epoch": 5.917127071823204, + "grad_norm": 0.2545807659626007, + "learning_rate": 3.771076363535205e-05, + "loss": 1.7974, + "step": 19278 + }, + { + "epoch": 5.917434008594229, + "grad_norm": 0.24210263788700104, + "learning_rate": 3.7705945610937954e-05, + "loss": 1.7438, + "step": 19279 + }, + { + "epoch": 5.917740945365255, + "grad_norm": 0.26224827766418457, + "learning_rate": 3.770112770802088e-05, + "loss": 1.7294, + "step": 19280 + }, + { + "epoch": 5.91804788213628, + "grad_norm": 0.23358991742134094, + "learning_rate": 3.7696309926648486e-05, + "loss": 1.7973, + "step": 19281 + }, + { + "epoch": 5.918354818907305, + "grad_norm": 0.3466563820838928, + "learning_rate": 3.769149226686837e-05, + "loss": 1.784, + "step": 19282 + }, + { + "epoch": 5.918661755678331, + "grad_norm": 0.2416994869709015, + "learning_rate": 3.768667472872814e-05, + "loss": 1.6957, + "step": 19283 + }, + { + "epoch": 5.918968692449355, + "grad_norm": 0.2285085767507553, + "learning_rate": 
3.768185731227539e-05, + "loss": 1.71, + "step": 19284 + }, + { + "epoch": 5.9192756292203805, + "grad_norm": 0.2566430866718292, + "learning_rate": 3.7677040017557775e-05, + "loss": 1.792, + "step": 19285 + }, + { + "epoch": 5.919582565991406, + "grad_norm": 0.21566689014434814, + "learning_rate": 3.767222284462285e-05, + "loss": 1.8085, + "step": 19286 + }, + { + "epoch": 5.919889502762431, + "grad_norm": 0.24078889191150665, + "learning_rate": 3.7667405793518264e-05, + "loss": 1.7221, + "step": 19287 + }, + { + "epoch": 5.920196439533456, + "grad_norm": 0.22127531468868256, + "learning_rate": 3.7662588864291584e-05, + "loss": 1.7173, + "step": 19288 + }, + { + "epoch": 5.920503376304481, + "grad_norm": 0.18165946006774902, + "learning_rate": 3.765777205699045e-05, + "loss": 1.7518, + "step": 19289 + }, + { + "epoch": 5.920810313075506, + "grad_norm": 0.2569290101528168, + "learning_rate": 3.765295537166242e-05, + "loss": 1.7716, + "step": 19290 + }, + { + "epoch": 5.921117249846532, + "grad_norm": 0.19010202586650848, + "learning_rate": 3.764813880835515e-05, + "loss": 1.7146, + "step": 19291 + }, + { + "epoch": 5.921424186617557, + "grad_norm": 0.2882116436958313, + "learning_rate": 3.7643322367116195e-05, + "loss": 1.7677, + "step": 19292 + }, + { + "epoch": 5.921731123388582, + "grad_norm": 0.30711185932159424, + "learning_rate": 3.763850604799319e-05, + "loss": 1.7506, + "step": 19293 + }, + { + "epoch": 5.922038060159607, + "grad_norm": 0.19295164942741394, + "learning_rate": 3.76336898510337e-05, + "loss": 1.715, + "step": 19294 + }, + { + "epoch": 5.922344996930632, + "grad_norm": 0.24849168956279755, + "learning_rate": 3.762887377628533e-05, + "loss": 1.6807, + "step": 19295 + }, + { + "epoch": 5.922651933701657, + "grad_norm": 0.23573634028434753, + "learning_rate": 3.7624057823795696e-05, + "loss": 1.7363, + "step": 19296 + }, + { + "epoch": 5.922958870472683, + "grad_norm": 0.24384267628192902, + "learning_rate": 3.761924199361235e-05, + "loss": 
1.726, + "step": 19297 + }, + { + "epoch": 5.923265807243708, + "grad_norm": 0.2589210271835327, + "learning_rate": 3.761442628578294e-05, + "loss": 1.7771, + "step": 19298 + }, + { + "epoch": 5.9235727440147325, + "grad_norm": 0.23527951538562775, + "learning_rate": 3.760961070035501e-05, + "loss": 1.6561, + "step": 19299 + }, + { + "epoch": 5.923879680785758, + "grad_norm": 0.20286870002746582, + "learning_rate": 3.7604795237376175e-05, + "loss": 1.7464, + "step": 19300 + }, + { + "epoch": 5.924186617556783, + "grad_norm": 0.22705033421516418, + "learning_rate": 3.759997989689401e-05, + "loss": 1.7814, + "step": 19301 + }, + { + "epoch": 5.9244935543278086, + "grad_norm": 0.21780981123447418, + "learning_rate": 3.7595164678956135e-05, + "loss": 1.7601, + "step": 19302 + }, + { + "epoch": 5.924800491098834, + "grad_norm": 0.2030021697282791, + "learning_rate": 3.759034958361009e-05, + "loss": 1.7222, + "step": 19303 + }, + { + "epoch": 5.925107427869859, + "grad_norm": 0.22956500947475433, + "learning_rate": 3.758553461090351e-05, + "loss": 1.674, + "step": 19304 + }, + { + "epoch": 5.925414364640884, + "grad_norm": 0.2368287444114685, + "learning_rate": 3.758071976088392e-05, + "loss": 1.7483, + "step": 19305 + }, + { + "epoch": 5.925721301411909, + "grad_norm": 0.22852632403373718, + "learning_rate": 3.757590503359896e-05, + "loss": 1.7561, + "step": 19306 + }, + { + "epoch": 5.926028238182934, + "grad_norm": 0.21657361090183258, + "learning_rate": 3.757109042909617e-05, + "loss": 1.7814, + "step": 19307 + }, + { + "epoch": 5.92633517495396, + "grad_norm": 0.21996551752090454, + "learning_rate": 3.756627594742317e-05, + "loss": 1.732, + "step": 19308 + }, + { + "epoch": 5.926642111724985, + "grad_norm": 0.23319712281227112, + "learning_rate": 3.75614615886275e-05, + "loss": 1.6807, + "step": 19309 + }, + { + "epoch": 5.9269490484960095, + "grad_norm": 0.17926698923110962, + "learning_rate": 3.755664735275677e-05, + "loss": 1.6925, + "step": 19310 + }, + { + 
"epoch": 5.927255985267035, + "grad_norm": 0.18986931443214417, + "learning_rate": 3.755183323985855e-05, + "loss": 1.7002, + "step": 19311 + }, + { + "epoch": 5.92756292203806, + "grad_norm": 0.18753086030483246, + "learning_rate": 3.7547019249980385e-05, + "loss": 1.695, + "step": 19312 + }, + { + "epoch": 5.9278698588090855, + "grad_norm": 0.21354973316192627, + "learning_rate": 3.7542205383169904e-05, + "loss": 1.6629, + "step": 19313 + }, + { + "epoch": 5.928176795580111, + "grad_norm": 0.19713245332241058, + "learning_rate": 3.753739163947463e-05, + "loss": 1.707, + "step": 19314 + }, + { + "epoch": 5.928483732351136, + "grad_norm": 0.2122458517551422, + "learning_rate": 3.753257801894217e-05, + "loss": 1.7309, + "step": 19315 + }, + { + "epoch": 5.928790669122161, + "grad_norm": 0.20360666513442993, + "learning_rate": 3.7527764521620065e-05, + "loss": 1.6861, + "step": 19316 + }, + { + "epoch": 5.929097605893186, + "grad_norm": 0.2652932405471802, + "learning_rate": 3.752295114755592e-05, + "loss": 1.7662, + "step": 19317 + }, + { + "epoch": 5.929404542664211, + "grad_norm": 0.18292152881622314, + "learning_rate": 3.751813789679726e-05, + "loss": 1.6691, + "step": 19318 + }, + { + "epoch": 5.929711479435237, + "grad_norm": 0.25630465149879456, + "learning_rate": 3.75133247693917e-05, + "loss": 1.7647, + "step": 19319 + }, + { + "epoch": 5.930018416206261, + "grad_norm": 0.2463291883468628, + "learning_rate": 3.750851176538677e-05, + "loss": 1.7252, + "step": 19320 + }, + { + "epoch": 5.930325352977286, + "grad_norm": 0.19977931678295135, + "learning_rate": 3.750369888483007e-05, + "loss": 1.7694, + "step": 19321 + }, + { + "epoch": 5.930632289748312, + "grad_norm": 0.19523118436336517, + "learning_rate": 3.7498886127769116e-05, + "loss": 1.7095, + "step": 19322 + }, + { + "epoch": 5.930939226519337, + "grad_norm": 0.19273912906646729, + "learning_rate": 3.749407349425151e-05, + "loss": 1.7009, + "step": 19323 + }, + { + "epoch": 5.931246163290362, + 
"grad_norm": 0.2419402152299881, + "learning_rate": 3.748926098432479e-05, + "loss": 1.7167, + "step": 19324 + }, + { + "epoch": 5.931553100061388, + "grad_norm": 0.22429771721363068, + "learning_rate": 3.7484448598036534e-05, + "loss": 1.6957, + "step": 19325 + }, + { + "epoch": 5.931860036832412, + "grad_norm": 0.23211807012557983, + "learning_rate": 3.747963633543429e-05, + "loss": 1.767, + "step": 19326 + }, + { + "epoch": 5.9321669736034375, + "grad_norm": 0.23204533755779266, + "learning_rate": 3.7474824196565625e-05, + "loss": 1.7405, + "step": 19327 + }, + { + "epoch": 5.932473910374463, + "grad_norm": 0.24068887531757355, + "learning_rate": 3.747001218147809e-05, + "loss": 1.7539, + "step": 19328 + }, + { + "epoch": 5.932780847145488, + "grad_norm": 0.18140049278736115, + "learning_rate": 3.746520029021922e-05, + "loss": 1.6956, + "step": 19329 + }, + { + "epoch": 5.9330877839165135, + "grad_norm": 0.28421929478645325, + "learning_rate": 3.746038852283661e-05, + "loss": 1.8539, + "step": 19330 + }, + { + "epoch": 5.933394720687538, + "grad_norm": 0.21984805166721344, + "learning_rate": 3.745557687937777e-05, + "loss": 1.7469, + "step": 19331 + }, + { + "epoch": 5.933701657458563, + "grad_norm": 0.2500358819961548, + "learning_rate": 3.7450765359890294e-05, + "loss": 1.7184, + "step": 19332 + }, + { + "epoch": 5.934008594229589, + "grad_norm": 0.2608816623687744, + "learning_rate": 3.744595396442169e-05, + "loss": 1.6825, + "step": 19333 + }, + { + "epoch": 5.934315531000614, + "grad_norm": 0.20359274744987488, + "learning_rate": 3.7441142693019526e-05, + "loss": 1.7535, + "step": 19334 + }, + { + "epoch": 5.934622467771639, + "grad_norm": 0.24795760214328766, + "learning_rate": 3.743633154573135e-05, + "loss": 1.7829, + "step": 19335 + }, + { + "epoch": 5.934929404542665, + "grad_norm": 0.20762503147125244, + "learning_rate": 3.7431520522604736e-05, + "loss": 1.7657, + "step": 19336 + }, + { + "epoch": 5.935236341313689, + "grad_norm": 0.24349527060985565, 
+ "learning_rate": 3.7426709623687174e-05, + "loss": 1.7037, + "step": 19337 + }, + { + "epoch": 5.935543278084714, + "grad_norm": 0.2138780951499939, + "learning_rate": 3.742189884902626e-05, + "loss": 1.7302, + "step": 19338 + }, + { + "epoch": 5.93585021485574, + "grad_norm": 0.24776574969291687, + "learning_rate": 3.741708819866949e-05, + "loss": 1.7293, + "step": 19339 + }, + { + "epoch": 5.936157151626765, + "grad_norm": 0.297888845205307, + "learning_rate": 3.7412277672664444e-05, + "loss": 1.8341, + "step": 19340 + }, + { + "epoch": 5.93646408839779, + "grad_norm": 0.2811104953289032, + "learning_rate": 3.740746727105864e-05, + "loss": 1.7188, + "step": 19341 + }, + { + "epoch": 5.936771025168815, + "grad_norm": 0.37908127903938293, + "learning_rate": 3.740265699389964e-05, + "loss": 1.765, + "step": 19342 + }, + { + "epoch": 5.93707796193984, + "grad_norm": 0.24403691291809082, + "learning_rate": 3.739784684123495e-05, + "loss": 1.6897, + "step": 19343 + }, + { + "epoch": 5.9373848987108655, + "grad_norm": 0.2393181174993515, + "learning_rate": 3.7393036813112135e-05, + "loss": 1.6843, + "step": 19344 + }, + { + "epoch": 5.937691835481891, + "grad_norm": 0.2927580177783966, + "learning_rate": 3.738822690957872e-05, + "loss": 1.6946, + "step": 19345 + }, + { + "epoch": 5.937998772252916, + "grad_norm": 0.23423373699188232, + "learning_rate": 3.738341713068223e-05, + "loss": 1.7409, + "step": 19346 + }, + { + "epoch": 5.9383057090239415, + "grad_norm": 0.2544272840023041, + "learning_rate": 3.7378607476470216e-05, + "loss": 1.698, + "step": 19347 + }, + { + "epoch": 5.938612645794966, + "grad_norm": 0.2120404839515686, + "learning_rate": 3.737379794699019e-05, + "loss": 1.7412, + "step": 19348 + }, + { + "epoch": 5.938919582565991, + "grad_norm": 0.2076033353805542, + "learning_rate": 3.736898854228971e-05, + "loss": 1.752, + "step": 19349 + }, + { + "epoch": 5.939226519337017, + "grad_norm": 0.20122376084327698, + "learning_rate": 3.736417926241627e-05, + 
"loss": 1.6741, + "step": 19350 + }, + { + "epoch": 5.939533456108042, + "grad_norm": 0.1856858730316162, + "learning_rate": 3.735937010741742e-05, + "loss": 1.6959, + "step": 19351 + }, + { + "epoch": 5.939840392879067, + "grad_norm": 0.22192558646202087, + "learning_rate": 3.7354561077340684e-05, + "loss": 1.7597, + "step": 19352 + }, + { + "epoch": 5.940147329650092, + "grad_norm": 0.2653545141220093, + "learning_rate": 3.73497521722336e-05, + "loss": 1.7324, + "step": 19353 + }, + { + "epoch": 5.940454266421117, + "grad_norm": 0.1975676715373993, + "learning_rate": 3.734494339214366e-05, + "loss": 1.6852, + "step": 19354 + }, + { + "epoch": 5.940761203192142, + "grad_norm": 0.26949796080589294, + "learning_rate": 3.734013473711843e-05, + "loss": 1.7695, + "step": 19355 + }, + { + "epoch": 5.941068139963168, + "grad_norm": 0.2272176742553711, + "learning_rate": 3.733532620720539e-05, + "loss": 1.745, + "step": 19356 + }, + { + "epoch": 5.941375076734193, + "grad_norm": 0.25740066170692444, + "learning_rate": 3.733051780245208e-05, + "loss": 1.7701, + "step": 19357 + }, + { + "epoch": 5.941682013505218, + "grad_norm": 0.1910635381937027, + "learning_rate": 3.732570952290602e-05, + "loss": 1.7276, + "step": 19358 + }, + { + "epoch": 5.941988950276243, + "grad_norm": 0.24896447360515594, + "learning_rate": 3.732090136861474e-05, + "loss": 1.7717, + "step": 19359 + }, + { + "epoch": 5.942295887047268, + "grad_norm": 0.20696721971035004, + "learning_rate": 3.731609333962572e-05, + "loss": 1.7053, + "step": 19360 + }, + { + "epoch": 5.9426028238182935, + "grad_norm": 0.18822510540485382, + "learning_rate": 3.731128543598653e-05, + "loss": 1.6869, + "step": 19361 + }, + { + "epoch": 5.942909760589319, + "grad_norm": 0.20757299661636353, + "learning_rate": 3.730647765774464e-05, + "loss": 1.7214, + "step": 19362 + }, + { + "epoch": 5.943216697360343, + "grad_norm": 0.21238471567630768, + "learning_rate": 3.7301670004947574e-05, + "loss": 1.6953, + "step": 19363 + }, + { 
+ "epoch": 5.943523634131369, + "grad_norm": 0.19326119124889374, + "learning_rate": 3.729686247764286e-05, + "loss": 1.7224, + "step": 19364 + }, + { + "epoch": 5.943830570902394, + "grad_norm": 0.17631326615810394, + "learning_rate": 3.729205507587798e-05, + "loss": 1.6471, + "step": 19365 + }, + { + "epoch": 5.944137507673419, + "grad_norm": 0.1741493195295334, + "learning_rate": 3.728724779970048e-05, + "loss": 1.7169, + "step": 19366 + }, + { + "epoch": 5.944444444444445, + "grad_norm": 0.18203428387641907, + "learning_rate": 3.728244064915782e-05, + "loss": 1.7301, + "step": 19367 + }, + { + "epoch": 5.94475138121547, + "grad_norm": 0.2063162475824356, + "learning_rate": 3.727763362429756e-05, + "loss": 1.7274, + "step": 19368 + }, + { + "epoch": 5.945058317986494, + "grad_norm": 0.17239537835121155, + "learning_rate": 3.7272826725167164e-05, + "loss": 1.7194, + "step": 19369 + }, + { + "epoch": 5.94536525475752, + "grad_norm": 0.1910972148180008, + "learning_rate": 3.726801995181418e-05, + "loss": 1.7017, + "step": 19370 + }, + { + "epoch": 5.945672191528545, + "grad_norm": 0.18822111189365387, + "learning_rate": 3.726321330428606e-05, + "loss": 1.723, + "step": 19371 + }, + { + "epoch": 5.94597912829957, + "grad_norm": 0.19680333137512207, + "learning_rate": 3.725840678263035e-05, + "loss": 1.685, + "step": 19372 + }, + { + "epoch": 5.946286065070596, + "grad_norm": 0.19016215205192566, + "learning_rate": 3.725360038689451e-05, + "loss": 1.7148, + "step": 19373 + }, + { + "epoch": 5.94659300184162, + "grad_norm": 0.1992037147283554, + "learning_rate": 3.7248794117126075e-05, + "loss": 1.7278, + "step": 19374 + }, + { + "epoch": 5.9468999386126455, + "grad_norm": 0.1892910748720169, + "learning_rate": 3.724398797337252e-05, + "loss": 1.7093, + "step": 19375 + }, + { + "epoch": 5.947206875383671, + "grad_norm": 0.23379561305046082, + "learning_rate": 3.723918195568137e-05, + "loss": 1.768, + "step": 19376 + }, + { + "epoch": 5.947513812154696, + "grad_norm": 
0.1986081600189209, + "learning_rate": 3.7234376064100104e-05, + "loss": 1.719, + "step": 19377 + }, + { + "epoch": 5.9478207489257215, + "grad_norm": 0.20901642739772797, + "learning_rate": 3.7229570298676195e-05, + "loss": 1.7066, + "step": 19378 + }, + { + "epoch": 5.948127685696747, + "grad_norm": 0.2102847546339035, + "learning_rate": 3.722476465945718e-05, + "loss": 1.7354, + "step": 19379 + }, + { + "epoch": 5.948434622467771, + "grad_norm": 0.1857316792011261, + "learning_rate": 3.72199591464905e-05, + "loss": 1.7159, + "step": 19380 + }, + { + "epoch": 5.948741559238797, + "grad_norm": 0.3045661151409149, + "learning_rate": 3.721515375982371e-05, + "loss": 1.8782, + "step": 19381 + }, + { + "epoch": 5.949048496009822, + "grad_norm": 0.24114711582660675, + "learning_rate": 3.7210348499504236e-05, + "loss": 1.6819, + "step": 19382 + }, + { + "epoch": 5.949355432780847, + "grad_norm": 0.20186996459960938, + "learning_rate": 3.720554336557961e-05, + "loss": 1.8028, + "step": 19383 + }, + { + "epoch": 5.949662369551873, + "grad_norm": 0.25385335087776184, + "learning_rate": 3.7200738358097295e-05, + "loss": 1.7278, + "step": 19384 + }, + { + "epoch": 5.949969306322897, + "grad_norm": 0.23390468955039978, + "learning_rate": 3.719593347710478e-05, + "loss": 1.7775, + "step": 19385 + }, + { + "epoch": 5.9502762430939224, + "grad_norm": 0.22577936947345734, + "learning_rate": 3.719112872264956e-05, + "loss": 1.7567, + "step": 19386 + }, + { + "epoch": 5.950583179864948, + "grad_norm": 0.2540932297706604, + "learning_rate": 3.718632409477912e-05, + "loss": 1.6749, + "step": 19387 + }, + { + "epoch": 5.950890116635973, + "grad_norm": 0.1994820535182953, + "learning_rate": 3.718151959354093e-05, + "loss": 1.6809, + "step": 19388 + }, + { + "epoch": 5.9511970534069984, + "grad_norm": 0.27669432759284973, + "learning_rate": 3.717671521898249e-05, + "loss": 1.7633, + "step": 19389 + }, + { + "epoch": 5.951503990178024, + "grad_norm": 0.2533062994480133, + 
"learning_rate": 3.717191097115125e-05, + "loss": 1.7536, + "step": 19390 + }, + { + "epoch": 5.951810926949048, + "grad_norm": 0.22249148786067963, + "learning_rate": 3.716710685009471e-05, + "loss": 1.7325, + "step": 19391 + }, + { + "epoch": 5.952117863720074, + "grad_norm": 0.3085922598838806, + "learning_rate": 3.716230285586033e-05, + "loss": 1.7046, + "step": 19392 + }, + { + "epoch": 5.952424800491099, + "grad_norm": 0.2591574192047119, + "learning_rate": 3.715749898849562e-05, + "loss": 1.7165, + "step": 19393 + }, + { + "epoch": 5.952731737262124, + "grad_norm": 0.24586348235607147, + "learning_rate": 3.715269524804803e-05, + "loss": 1.749, + "step": 19394 + }, + { + "epoch": 5.953038674033149, + "grad_norm": 0.3424640893936157, + "learning_rate": 3.714789163456502e-05, + "loss": 1.7143, + "step": 19395 + }, + { + "epoch": 5.953345610804174, + "grad_norm": 0.24856910109519958, + "learning_rate": 3.714308814809408e-05, + "loss": 1.868, + "step": 19396 + }, + { + "epoch": 5.953652547575199, + "grad_norm": 0.2758113145828247, + "learning_rate": 3.7138284788682676e-05, + "loss": 1.6722, + "step": 19397 + }, + { + "epoch": 5.953959484346225, + "grad_norm": 0.25981786847114563, + "learning_rate": 3.71334815563783e-05, + "loss": 1.764, + "step": 19398 + }, + { + "epoch": 5.95426642111725, + "grad_norm": 0.27885568141937256, + "learning_rate": 3.7128678451228385e-05, + "loss": 1.7422, + "step": 19399 + }, + { + "epoch": 5.954573357888275, + "grad_norm": 0.2909421920776367, + "learning_rate": 3.712387547328042e-05, + "loss": 1.7862, + "step": 19400 + }, + { + "epoch": 5.9548802946593, + "grad_norm": 0.2288074642419815, + "learning_rate": 3.711907262258185e-05, + "loss": 1.7054, + "step": 19401 + }, + { + "epoch": 5.955187231430325, + "grad_norm": 0.2986883819103241, + "learning_rate": 3.711426989918017e-05, + "loss": 1.7555, + "step": 19402 + }, + { + "epoch": 5.9554941682013505, + "grad_norm": 0.23201194405555725, + "learning_rate": 3.710946730312281e-05, + 
"loss": 1.8186, + "step": 19403 + }, + { + "epoch": 5.955801104972376, + "grad_norm": 0.2609403431415558, + "learning_rate": 3.710466483445728e-05, + "loss": 1.7743, + "step": 19404 + }, + { + "epoch": 5.956108041743401, + "grad_norm": 0.31131741404533386, + "learning_rate": 3.709986249323098e-05, + "loss": 1.7938, + "step": 19405 + }, + { + "epoch": 5.956414978514426, + "grad_norm": 0.20544753968715668, + "learning_rate": 3.7095060279491424e-05, + "loss": 1.7278, + "step": 19406 + }, + { + "epoch": 5.956721915285451, + "grad_norm": 0.3063479959964752, + "learning_rate": 3.709025819328602e-05, + "loss": 1.7544, + "step": 19407 + }, + { + "epoch": 5.957028852056476, + "grad_norm": 0.34868693351745605, + "learning_rate": 3.708545623466227e-05, + "loss": 1.7536, + "step": 19408 + }, + { + "epoch": 5.957335788827502, + "grad_norm": 0.20847822725772858, + "learning_rate": 3.70806544036676e-05, + "loss": 1.7003, + "step": 19409 + }, + { + "epoch": 5.957642725598527, + "grad_norm": 0.3250095844268799, + "learning_rate": 3.707585270034949e-05, + "loss": 1.6815, + "step": 19410 + }, + { + "epoch": 5.957949662369552, + "grad_norm": 0.24854284524917603, + "learning_rate": 3.707105112475539e-05, + "loss": 1.7665, + "step": 19411 + }, + { + "epoch": 5.958256599140577, + "grad_norm": 0.2921455502510071, + "learning_rate": 3.706624967693271e-05, + "loss": 1.7039, + "step": 19412 + }, + { + "epoch": 5.958563535911602, + "grad_norm": 0.2659071385860443, + "learning_rate": 3.706144835692894e-05, + "loss": 1.7641, + "step": 19413 + }, + { + "epoch": 5.958870472682627, + "grad_norm": 0.30329519510269165, + "learning_rate": 3.7056647164791516e-05, + "loss": 1.7962, + "step": 19414 + }, + { + "epoch": 5.959177409453653, + "grad_norm": 0.4023756682872772, + "learning_rate": 3.7051846100567906e-05, + "loss": 1.7624, + "step": 19415 + }, + { + "epoch": 5.959484346224678, + "grad_norm": 0.24528828263282776, + "learning_rate": 3.704704516430553e-05, + "loss": 1.8156, + "step": 19416 + }, + { 
+ "epoch": 5.9597912829957025, + "grad_norm": 0.46833130717277527, + "learning_rate": 3.704224435605186e-05, + "loss": 1.798, + "step": 19417 + }, + { + "epoch": 5.960098219766728, + "grad_norm": 0.26952674984931946, + "learning_rate": 3.70374436758543e-05, + "loss": 1.743, + "step": 19418 + }, + { + "epoch": 5.960405156537753, + "grad_norm": 0.3126155734062195, + "learning_rate": 3.703264312376034e-05, + "loss": 1.8003, + "step": 19419 + }, + { + "epoch": 5.9607120933087785, + "grad_norm": 0.2833348512649536, + "learning_rate": 3.702784269981738e-05, + "loss": 1.7524, + "step": 19420 + }, + { + "epoch": 5.961019030079804, + "grad_norm": 0.25425654649734497, + "learning_rate": 3.7023042404072916e-05, + "loss": 1.7241, + "step": 19421 + }, + { + "epoch": 5.961325966850829, + "grad_norm": 0.29460933804512024, + "learning_rate": 3.701824223657433e-05, + "loss": 1.676, + "step": 19422 + }, + { + "epoch": 5.961632903621854, + "grad_norm": 0.21040670573711395, + "learning_rate": 3.7013442197369094e-05, + "loss": 1.71, + "step": 19423 + }, + { + "epoch": 5.961939840392879, + "grad_norm": 0.3200007379055023, + "learning_rate": 3.7008642286504624e-05, + "loss": 1.7108, + "step": 19424 + }, + { + "epoch": 5.962246777163904, + "grad_norm": 0.20397430658340454, + "learning_rate": 3.7003842504028366e-05, + "loss": 1.7472, + "step": 19425 + }, + { + "epoch": 5.96255371393493, + "grad_norm": 0.24811354279518127, + "learning_rate": 3.699904284998776e-05, + "loss": 1.7116, + "step": 19426 + }, + { + "epoch": 5.962860650705955, + "grad_norm": 0.20980580151081085, + "learning_rate": 3.699424332443023e-05, + "loss": 1.786, + "step": 19427 + }, + { + "epoch": 5.963167587476979, + "grad_norm": 0.1967400163412094, + "learning_rate": 3.698944392740322e-05, + "loss": 1.7141, + "step": 19428 + }, + { + "epoch": 5.963474524248005, + "grad_norm": 0.21907822787761688, + "learning_rate": 3.698464465895414e-05, + "loss": 1.6983, + "step": 19429 + }, + { + "epoch": 5.96378146101903, + 
"grad_norm": 0.19938960671424866, + "learning_rate": 3.697984551913043e-05, + "loss": 1.6811, + "step": 19430 + }, + { + "epoch": 5.964088397790055, + "grad_norm": 0.22280220687389374, + "learning_rate": 3.6975046507979506e-05, + "loss": 1.6838, + "step": 19431 + }, + { + "epoch": 5.964395334561081, + "grad_norm": 0.2530672550201416, + "learning_rate": 3.697024762554883e-05, + "loss": 1.8116, + "step": 19432 + }, + { + "epoch": 5.964702271332106, + "grad_norm": 0.21853135526180267, + "learning_rate": 3.696544887188579e-05, + "loss": 1.692, + "step": 19433 + }, + { + "epoch": 5.9650092081031305, + "grad_norm": 0.18738535046577454, + "learning_rate": 3.696065024703783e-05, + "loss": 1.6971, + "step": 19434 + }, + { + "epoch": 5.965316144874156, + "grad_norm": 0.21199190616607666, + "learning_rate": 3.695585175105236e-05, + "loss": 1.7526, + "step": 19435 + }, + { + "epoch": 5.965623081645181, + "grad_norm": 0.22184251248836517, + "learning_rate": 3.695105338397681e-05, + "loss": 1.8075, + "step": 19436 + }, + { + "epoch": 5.9659300184162065, + "grad_norm": 0.20191644132137299, + "learning_rate": 3.6946255145858605e-05, + "loss": 1.7427, + "step": 19437 + }, + { + "epoch": 5.966236955187231, + "grad_norm": 0.2113640457391739, + "learning_rate": 3.694145703674515e-05, + "loss": 1.7556, + "step": 19438 + }, + { + "epoch": 5.966543891958256, + "grad_norm": 0.21834735572338104, + "learning_rate": 3.693665905668387e-05, + "loss": 1.7673, + "step": 19439 + }, + { + "epoch": 5.966850828729282, + "grad_norm": 0.2260274887084961, + "learning_rate": 3.6931861205722197e-05, + "loss": 1.8168, + "step": 19440 + }, + { + "epoch": 5.967157765500307, + "grad_norm": 0.24090524017810822, + "learning_rate": 3.692706348390751e-05, + "loss": 1.821, + "step": 19441 + }, + { + "epoch": 5.967464702271332, + "grad_norm": 0.27469882369041443, + "learning_rate": 3.6922265891287256e-05, + "loss": 1.7114, + "step": 19442 + }, + { + "epoch": 5.967771639042358, + "grad_norm": 0.23479801416397095, + 
"learning_rate": 3.6917468427908833e-05, + "loss": 1.7334, + "step": 19443 + }, + { + "epoch": 5.968078575813382, + "grad_norm": 0.21109704673290253, + "learning_rate": 3.6912671093819663e-05, + "loss": 1.7047, + "step": 19444 + }, + { + "epoch": 5.968385512584407, + "grad_norm": 0.21141986548900604, + "learning_rate": 3.690787388906715e-05, + "loss": 1.6868, + "step": 19445 + }, + { + "epoch": 5.968692449355433, + "grad_norm": 0.21836397051811218, + "learning_rate": 3.690307681369868e-05, + "loss": 1.6923, + "step": 19446 + }, + { + "epoch": 5.968999386126458, + "grad_norm": 0.21733662486076355, + "learning_rate": 3.6898279867761695e-05, + "loss": 1.7699, + "step": 19447 + }, + { + "epoch": 5.969306322897483, + "grad_norm": 0.19220437109470367, + "learning_rate": 3.689348305130359e-05, + "loss": 1.7002, + "step": 19448 + }, + { + "epoch": 5.969613259668508, + "grad_norm": 0.22644726932048798, + "learning_rate": 3.688868636437176e-05, + "loss": 1.7024, + "step": 19449 + }, + { + "epoch": 5.969920196439533, + "grad_norm": 0.1832779198884964, + "learning_rate": 3.688388980701361e-05, + "loss": 1.699, + "step": 19450 + }, + { + "epoch": 5.9702271332105585, + "grad_norm": 0.20793284475803375, + "learning_rate": 3.687909337927658e-05, + "loss": 1.7557, + "step": 19451 + }, + { + "epoch": 5.970534069981584, + "grad_norm": 0.19485175609588623, + "learning_rate": 3.6874297081207995e-05, + "loss": 1.7641, + "step": 19452 + }, + { + "epoch": 5.970841006752609, + "grad_norm": 0.20980949699878693, + "learning_rate": 3.686950091285534e-05, + "loss": 1.7542, + "step": 19453 + }, + { + "epoch": 5.9711479435236345, + "grad_norm": 0.24902600049972534, + "learning_rate": 3.686470487426594e-05, + "loss": 1.7342, + "step": 19454 + }, + { + "epoch": 5.971454880294659, + "grad_norm": 0.20191124081611633, + "learning_rate": 3.685990896548724e-05, + "loss": 1.6844, + "step": 19455 + }, + { + "epoch": 5.971761817065684, + "grad_norm": 0.23217806220054626, + "learning_rate": 
3.685511318656662e-05, + "loss": 1.7054, + "step": 19456 + }, + { + "epoch": 5.97206875383671, + "grad_norm": 0.23383383452892303, + "learning_rate": 3.6850317537551484e-05, + "loss": 1.6903, + "step": 19457 + }, + { + "epoch": 5.972375690607735, + "grad_norm": 0.2147756665945053, + "learning_rate": 3.6845522018489196e-05, + "loss": 1.736, + "step": 19458 + }, + { + "epoch": 5.97268262737876, + "grad_norm": 0.23864400386810303, + "learning_rate": 3.68407266294272e-05, + "loss": 1.7483, + "step": 19459 + }, + { + "epoch": 5.972989564149785, + "grad_norm": 0.18702742457389832, + "learning_rate": 3.6835931370412836e-05, + "loss": 1.6874, + "step": 19460 + }, + { + "epoch": 5.97329650092081, + "grad_norm": 0.2167401760816574, + "learning_rate": 3.683113624149351e-05, + "loss": 1.652, + "step": 19461 + }, + { + "epoch": 5.973603437691835, + "grad_norm": 0.17105139791965485, + "learning_rate": 3.6826341242716636e-05, + "loss": 1.7029, + "step": 19462 + }, + { + "epoch": 5.973910374462861, + "grad_norm": 0.2189798206090927, + "learning_rate": 3.682154637412956e-05, + "loss": 1.7203, + "step": 19463 + }, + { + "epoch": 5.974217311233886, + "grad_norm": 0.17864444851875305, + "learning_rate": 3.68167516357797e-05, + "loss": 1.7176, + "step": 19464 + }, + { + "epoch": 5.974524248004911, + "grad_norm": 0.22356030344963074, + "learning_rate": 3.681195702771442e-05, + "loss": 1.7492, + "step": 19465 + }, + { + "epoch": 5.974831184775936, + "grad_norm": 0.19020728766918182, + "learning_rate": 3.68071625499811e-05, + "loss": 1.6925, + "step": 19466 + }, + { + "epoch": 5.975138121546961, + "grad_norm": 0.19092151522636414, + "learning_rate": 3.680236820262714e-05, + "loss": 1.7253, + "step": 19467 + }, + { + "epoch": 5.975445058317987, + "grad_norm": 0.20842085778713226, + "learning_rate": 3.6797573985699926e-05, + "loss": 1.7251, + "step": 19468 + }, + { + "epoch": 5.975751995089012, + "grad_norm": 0.2245844155550003, + "learning_rate": 3.6792779899246796e-05, + "loss": 1.7351, + 
"step": 19469 + }, + { + "epoch": 5.976058931860036, + "grad_norm": 0.18867328763008118, + "learning_rate": 3.678798594331519e-05, + "loss": 1.6646, + "step": 19470 + }, + { + "epoch": 5.976365868631062, + "grad_norm": 0.2892500162124634, + "learning_rate": 3.678319211795242e-05, + "loss": 1.7146, + "step": 19471 + }, + { + "epoch": 5.976672805402087, + "grad_norm": 0.22490514814853668, + "learning_rate": 3.677839842320591e-05, + "loss": 1.7147, + "step": 19472 + }, + { + "epoch": 5.976979742173112, + "grad_norm": 0.296724796295166, + "learning_rate": 3.677360485912301e-05, + "loss": 1.7714, + "step": 19473 + }, + { + "epoch": 5.977286678944138, + "grad_norm": 0.2784444987773895, + "learning_rate": 3.676881142575111e-05, + "loss": 1.7198, + "step": 19474 + }, + { + "epoch": 5.977593615715163, + "grad_norm": 0.20270293951034546, + "learning_rate": 3.676401812313755e-05, + "loss": 1.7336, + "step": 19475 + }, + { + "epoch": 5.9779005524861875, + "grad_norm": 0.23352907598018646, + "learning_rate": 3.6759224951329745e-05, + "loss": 1.7428, + "step": 19476 + }, + { + "epoch": 5.978207489257213, + "grad_norm": 0.1892426460981369, + "learning_rate": 3.675443191037502e-05, + "loss": 1.6636, + "step": 19477 + }, + { + "epoch": 5.978514426028238, + "grad_norm": 0.22216783463954926, + "learning_rate": 3.6749639000320766e-05, + "loss": 1.7446, + "step": 19478 + }, + { + "epoch": 5.9788213627992635, + "grad_norm": 0.19465389847755432, + "learning_rate": 3.6744846221214364e-05, + "loss": 1.7403, + "step": 19479 + }, + { + "epoch": 5.979128299570289, + "grad_norm": 0.1918177455663681, + "learning_rate": 3.674005357310314e-05, + "loss": 1.6974, + "step": 19480 + }, + { + "epoch": 5.979435236341313, + "grad_norm": 0.19065791368484497, + "learning_rate": 3.673526105603449e-05, + "loss": 1.7299, + "step": 19481 + }, + { + "epoch": 5.979742173112339, + "grad_norm": 0.24036844074726105, + "learning_rate": 3.673046867005575e-05, + "loss": 1.7441, + "step": 19482 + }, + { + "epoch": 
5.980049109883364, + "grad_norm": 0.22352568805217743, + "learning_rate": 3.6725676415214305e-05, + "loss": 1.7556, + "step": 19483 + }, + { + "epoch": 5.980356046654389, + "grad_norm": 0.2492935210466385, + "learning_rate": 3.67208842915575e-05, + "loss": 1.6833, + "step": 19484 + }, + { + "epoch": 5.980662983425415, + "grad_norm": 0.2554415762424469, + "learning_rate": 3.671609229913272e-05, + "loss": 1.7426, + "step": 19485 + }, + { + "epoch": 5.98096992019644, + "grad_norm": 0.24076475203037262, + "learning_rate": 3.671130043798728e-05, + "loss": 1.7362, + "step": 19486 + }, + { + "epoch": 5.981276856967464, + "grad_norm": 0.24297118186950684, + "learning_rate": 3.670650870816858e-05, + "loss": 1.7493, + "step": 19487 + }, + { + "epoch": 5.98158379373849, + "grad_norm": 0.19533030688762665, + "learning_rate": 3.6701717109723924e-05, + "loss": 1.7397, + "step": 19488 + }, + { + "epoch": 5.981890730509515, + "grad_norm": 0.24731193482875824, + "learning_rate": 3.669692564270071e-05, + "loss": 1.7483, + "step": 19489 + }, + { + "epoch": 5.98219766728054, + "grad_norm": 0.23274390399456024, + "learning_rate": 3.669213430714626e-05, + "loss": 1.7677, + "step": 19490 + }, + { + "epoch": 5.982504604051566, + "grad_norm": 0.180234894156456, + "learning_rate": 3.668734310310796e-05, + "loss": 1.7065, + "step": 19491 + }, + { + "epoch": 5.98281154082259, + "grad_norm": 0.19045281410217285, + "learning_rate": 3.6682552030633125e-05, + "loss": 1.7089, + "step": 19492 + }, + { + "epoch": 5.9831184775936155, + "grad_norm": 0.17261318862438202, + "learning_rate": 3.667776108976914e-05, + "loss": 1.7227, + "step": 19493 + }, + { + "epoch": 5.983425414364641, + "grad_norm": 0.2156316339969635, + "learning_rate": 3.667297028056329e-05, + "loss": 1.7025, + "step": 19494 + }, + { + "epoch": 5.983732351135666, + "grad_norm": 0.22288112342357635, + "learning_rate": 3.666817960306298e-05, + "loss": 1.7123, + "step": 19495 + }, + { + "epoch": 5.9840392879066915, + "grad_norm": 
0.21983082592487335, + "learning_rate": 3.6663389057315543e-05, + "loss": 1.7688, + "step": 19496 + }, + { + "epoch": 5.984346224677717, + "grad_norm": 0.1804746687412262, + "learning_rate": 3.665859864336829e-05, + "loss": 1.759, + "step": 19497 + }, + { + "epoch": 5.984653161448741, + "grad_norm": 0.22762230038642883, + "learning_rate": 3.6653808361268605e-05, + "loss": 1.8128, + "step": 19498 + }, + { + "epoch": 5.984960098219767, + "grad_norm": 0.21779340505599976, + "learning_rate": 3.664901821106379e-05, + "loss": 1.7316, + "step": 19499 + }, + { + "epoch": 5.985267034990792, + "grad_norm": 0.18899449706077576, + "learning_rate": 3.664422819280121e-05, + "loss": 1.7535, + "step": 19500 + }, + { + "epoch": 5.985573971761817, + "grad_norm": 0.22799427807331085, + "learning_rate": 3.663943830652819e-05, + "loss": 1.7626, + "step": 19501 + }, + { + "epoch": 5.985880908532843, + "grad_norm": 0.19936929643154144, + "learning_rate": 3.6634648552292086e-05, + "loss": 1.6887, + "step": 19502 + }, + { + "epoch": 5.986187845303867, + "grad_norm": 0.22482532262802124, + "learning_rate": 3.6629858930140206e-05, + "loss": 1.6867, + "step": 19503 + }, + { + "epoch": 5.986494782074892, + "grad_norm": 0.23543842136859894, + "learning_rate": 3.662506944011991e-05, + "loss": 1.7715, + "step": 19504 + }, + { + "epoch": 5.986801718845918, + "grad_norm": 0.230603888630867, + "learning_rate": 3.6620280082278495e-05, + "loss": 1.7514, + "step": 19505 + }, + { + "epoch": 5.987108655616943, + "grad_norm": 0.26767033338546753, + "learning_rate": 3.6615490856663334e-05, + "loss": 1.6862, + "step": 19506 + }, + { + "epoch": 5.987415592387968, + "grad_norm": 0.18282492458820343, + "learning_rate": 3.661070176332172e-05, + "loss": 1.6569, + "step": 19507 + }, + { + "epoch": 5.987722529158994, + "grad_norm": 0.255426824092865, + "learning_rate": 3.6605912802301016e-05, + "loss": 1.7623, + "step": 19508 + }, + { + "epoch": 5.988029465930018, + "grad_norm": 0.25026118755340576, + 
"learning_rate": 3.6601123973648524e-05, + "loss": 1.6907, + "step": 19509 + }, + { + "epoch": 5.9883364027010435, + "grad_norm": 0.19193407893180847, + "learning_rate": 3.659633527741159e-05, + "loss": 1.7647, + "step": 19510 + }, + { + "epoch": 5.988643339472069, + "grad_norm": 0.25562727451324463, + "learning_rate": 3.6591546713637506e-05, + "loss": 1.6806, + "step": 19511 + }, + { + "epoch": 5.988950276243094, + "grad_norm": 0.2296016663312912, + "learning_rate": 3.6586758282373624e-05, + "loss": 1.7747, + "step": 19512 + }, + { + "epoch": 5.989257213014119, + "grad_norm": 0.22875753045082092, + "learning_rate": 3.6581969983667275e-05, + "loss": 1.7847, + "step": 19513 + }, + { + "epoch": 5.989564149785144, + "grad_norm": 0.24469317495822906, + "learning_rate": 3.6577181817565736e-05, + "loss": 1.6784, + "step": 19514 + }, + { + "epoch": 5.989871086556169, + "grad_norm": 0.22855928540229797, + "learning_rate": 3.657239378411638e-05, + "loss": 1.788, + "step": 19515 + }, + { + "epoch": 5.990178023327195, + "grad_norm": 0.28745612502098083, + "learning_rate": 3.656760588336647e-05, + "loss": 1.6836, + "step": 19516 + }, + { + "epoch": 5.99048496009822, + "grad_norm": 0.18221193552017212, + "learning_rate": 3.656281811536337e-05, + "loss": 1.6687, + "step": 19517 + }, + { + "epoch": 5.990791896869245, + "grad_norm": 0.2556660771369934, + "learning_rate": 3.655803048015437e-05, + "loss": 1.7351, + "step": 19518 + }, + { + "epoch": 5.99109883364027, + "grad_norm": 0.18791422247886658, + "learning_rate": 3.6553242977786803e-05, + "loss": 1.6749, + "step": 19519 + }, + { + "epoch": 5.991405770411295, + "grad_norm": 0.28149592876434326, + "learning_rate": 3.654845560830796e-05, + "loss": 1.7333, + "step": 19520 + }, + { + "epoch": 5.99171270718232, + "grad_norm": 0.24631322920322418, + "learning_rate": 3.654366837176517e-05, + "loss": 1.7672, + "step": 19521 + }, + { + "epoch": 5.992019643953346, + "grad_norm": 0.22054782509803772, + "learning_rate": 
3.653888126820573e-05, + "loss": 1.7499, + "step": 19522 + }, + { + "epoch": 5.992326580724371, + "grad_norm": 0.23334862291812897, + "learning_rate": 3.653409429767696e-05, + "loss": 1.7133, + "step": 19523 + }, + { + "epoch": 5.9926335174953955, + "grad_norm": 0.19809292256832123, + "learning_rate": 3.6529307460226145e-05, + "loss": 1.6965, + "step": 19524 + }, + { + "epoch": 5.992940454266421, + "grad_norm": 0.23769772052764893, + "learning_rate": 3.652452075590064e-05, + "loss": 1.699, + "step": 19525 + }, + { + "epoch": 5.993247391037446, + "grad_norm": 0.19045031070709229, + "learning_rate": 3.6519734184747686e-05, + "loss": 1.7043, + "step": 19526 + }, + { + "epoch": 5.9935543278084715, + "grad_norm": 0.20795129239559174, + "learning_rate": 3.651494774681465e-05, + "loss": 1.7159, + "step": 19527 + }, + { + "epoch": 5.993861264579497, + "grad_norm": 0.1933370679616928, + "learning_rate": 3.651016144214878e-05, + "loss": 1.6999, + "step": 19528 + }, + { + "epoch": 5.994168201350522, + "grad_norm": 0.18360544741153717, + "learning_rate": 3.650537527079742e-05, + "loss": 1.7525, + "step": 19529 + }, + { + "epoch": 5.994475138121547, + "grad_norm": 0.21080785989761353, + "learning_rate": 3.650058923280786e-05, + "loss": 1.6832, + "step": 19530 + }, + { + "epoch": 5.994782074892572, + "grad_norm": 0.19701606035232544, + "learning_rate": 3.649580332822736e-05, + "loss": 1.7104, + "step": 19531 + }, + { + "epoch": 5.995089011663597, + "grad_norm": 0.24208703637123108, + "learning_rate": 3.6491017557103266e-05, + "loss": 1.726, + "step": 19532 + }, + { + "epoch": 5.995395948434623, + "grad_norm": 0.25981345772743225, + "learning_rate": 3.648623191948284e-05, + "loss": 1.7644, + "step": 19533 + }, + { + "epoch": 5.995702885205648, + "grad_norm": 0.24137455224990845, + "learning_rate": 3.64814464154134e-05, + "loss": 1.7354, + "step": 19534 + }, + { + "epoch": 5.996009821976672, + "grad_norm": 0.2140759378671646, + "learning_rate": 3.647666104494222e-05, + "loss": 
1.7244, + "step": 19535 + }, + { + "epoch": 5.996316758747698, + "grad_norm": 0.2801622748374939, + "learning_rate": 3.647187580811663e-05, + "loss": 1.6996, + "step": 19536 + }, + { + "epoch": 5.996623695518723, + "grad_norm": 0.21048817038536072, + "learning_rate": 3.6467090704983856e-05, + "loss": 1.7378, + "step": 19537 + }, + { + "epoch": 5.996930632289748, + "grad_norm": 0.2935819625854492, + "learning_rate": 3.6462305735591254e-05, + "loss": 1.7066, + "step": 19538 + }, + { + "epoch": 5.997237569060774, + "grad_norm": 0.22473880648612976, + "learning_rate": 3.645752089998606e-05, + "loss": 1.7539, + "step": 19539 + }, + { + "epoch": 5.997544505831799, + "grad_norm": 0.20606113970279694, + "learning_rate": 3.6452736198215585e-05, + "loss": 1.7338, + "step": 19540 + }, + { + "epoch": 5.9978514426028235, + "grad_norm": 0.2702842950820923, + "learning_rate": 3.6447951630327116e-05, + "loss": 1.7171, + "step": 19541 + }, + { + "epoch": 5.998158379373849, + "grad_norm": 0.19971637427806854, + "learning_rate": 3.6443167196367946e-05, + "loss": 1.7132, + "step": 19542 + }, + { + "epoch": 5.998465316144874, + "grad_norm": 0.2352653592824936, + "learning_rate": 3.643838289638531e-05, + "loss": 1.787, + "step": 19543 + }, + { + "epoch": 5.9987722529158995, + "grad_norm": 0.2324669510126114, + "learning_rate": 3.643359873042656e-05, + "loss": 1.7039, + "step": 19544 + }, + { + "epoch": 5.999079189686924, + "grad_norm": 0.1935029774904251, + "learning_rate": 3.6428814698538914e-05, + "loss": 1.6846, + "step": 19545 + }, + { + "epoch": 5.999386126457949, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.642403080076968e-05, + "loss": 1.7018, + "step": 19546 + }, + { + "epoch": 5.999693063228975, + "grad_norm": 0.19364693760871887, + "learning_rate": 3.6419247037166146e-05, + "loss": 1.6901, + "step": 19547 + }, + { + "epoch": 6.0, + "grad_norm": 0.23718556761741638, + "learning_rate": 3.641446340777556e-05, + "loss": 1.7743, + "step": 19548 + } + ], + 
"logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.27953555091171e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-19548/training_args.bin b/checkpoint-19548/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db7ad91da5423a229826113feb3e9db3ef40c31 --- /dev/null +++ b/checkpoint-19548/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682b697e933b6e2693e5f9af9a0654effab1ca392c8500bf8af0eb089116a263 +size 7288 diff --git a/checkpoint-19548/zero_to_fp32.py b/checkpoint-19548/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-19548/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-22806/config.json b/checkpoint-22806/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-22806/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-22806/generation_config.json b/checkpoint-22806/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-22806/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-22806/latest b/checkpoint-22806/latest new file mode 100644 index 0000000000000000000000000000000000000000..b315bedddd05f07f899cf4e7b21f6e69720dd29c --- /dev/null +++ b/checkpoint-22806/latest @@ -0,0 +1 @@ +global_step22806 \ No newline at end of file diff --git a/checkpoint-22806/model-00001-of-00003.safetensors b/checkpoint-22806/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5aada0709439822a1a4e34255b922dc98a09ea7 --- /dev/null +++ b/checkpoint-22806/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed1e1b7ea7fe0ad2d3ed667f307ba381ddf9ba4003bffcd28160382d5536355 +size 4955415870 diff --git a/checkpoint-22806/model-00002-of-00003.safetensors b/checkpoint-22806/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-22806/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d 
+size 4971563008 diff --git a/checkpoint-22806/model-00003-of-00003.safetensors b/checkpoint-22806/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2e76c6da7bf9f36e8c046dd31427fbcff43fc4a6 --- /dev/null +++ b/checkpoint-22806/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e3ed5802c0d1d2f1b9a0ffa785dd7923a1d91c1ba72bd4f9277ab1d6b78f38b +size 4180840856 diff --git a/checkpoint-22806/model.safetensors.index.json b/checkpoint-22806/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-22806/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-22806/rng_state_0.pth b/checkpoint-22806/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..77ef9dc82d21affd079d1d315977687bee4bdc43 --- /dev/null +++ b/checkpoint-22806/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f862ba716aaa5a4f47c312f4fc7963f22d1d02aa1bb72fac83aad5e94a786e9a +size 15984 diff --git a/checkpoint-22806/rng_state_1.pth b/checkpoint-22806/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3fd561b19e756639355460512b518eb297f71279 --- /dev/null +++ b/checkpoint-22806/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:613fadaa6eae357229dc35e1f7d00a71e08bee04ebf6a91e926a87ebd334ccf3 +size 15984 diff --git a/checkpoint-22806/rng_state_10.pth b/checkpoint-22806/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..e5cd20a1c148c3c9ba1b16abb24e45f5c84efef2 --- /dev/null +++ b/checkpoint-22806/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:46fd9342552ab089a9cba3ebb2144c22237f96134ce4db750bd6cac7ec2654b8 +size 15997 diff --git a/checkpoint-22806/rng_state_11.pth b/checkpoint-22806/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c5bf971ff27dc40bdaf0d27b2f4c9e1f53cd294 --- /dev/null +++ b/checkpoint-22806/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b792096d9c72b572e70194fb65c20bbb9f7a966bd7033c4d9c745f8b54b0dbe +size 15997 diff --git a/checkpoint-22806/rng_state_12.pth b/checkpoint-22806/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..58198b8c1ab39fc94961f7a7df7ad8908bc63f3f --- /dev/null +++ b/checkpoint-22806/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b611794e3108e38242cef75cf195709144b7ea66a82c5d5d87ae93c71f901ecc +size 15997 diff --git a/checkpoint-22806/rng_state_13.pth b/checkpoint-22806/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..0092bcfb436355c14e85ae80d9301a2d643d27b7 --- /dev/null +++ b/checkpoint-22806/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02f7a9d984b7aa91e87dbac6443300652435a68cbc2010f70a5e42818e90770d +size 15997 diff --git a/checkpoint-22806/rng_state_14.pth b/checkpoint-22806/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..84fbb19176a56dea6579f713808da8e2fe55a91c --- /dev/null +++ b/checkpoint-22806/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b5b7498e0cb12ec48895f9409a6cc4ab8d94a6778e25081c2bca44af48b9e99 +size 15997 diff --git a/checkpoint-22806/rng_state_15.pth b/checkpoint-22806/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..0576fd3d273ecac8bcdcc28aa2010cc5790245fe --- /dev/null +++ b/checkpoint-22806/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:76ff807457723e2a176a0c959674c0edf65f53e3e4bc0f7d23072035746ac0a6 +size 15997 diff --git a/checkpoint-22806/rng_state_16.pth b/checkpoint-22806/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b6d8120525e7c7f9fb5f1bd24362751b4e2e492 --- /dev/null +++ b/checkpoint-22806/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a31b379a80614ee890ca9c5574ff76bac5e79073a87d56d9cdbf208b5315185d +size 15997 diff --git a/checkpoint-22806/rng_state_17.pth b/checkpoint-22806/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..b425cd4579bc83fa10a139ebbd067e948d0dea47 --- /dev/null +++ b/checkpoint-22806/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae85247687e55deae47051ec1912d3c44c45dec191c2a54327d16bd60b6aad35 +size 15997 diff --git a/checkpoint-22806/rng_state_18.pth b/checkpoint-22806/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..2771343797ffad6ab5775226ac6dab54d4afddff --- /dev/null +++ b/checkpoint-22806/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4181d548fe86c360ed6fc0082f8920575f3f4c19e0f825d55e9703f889573518 +size 15997 diff --git a/checkpoint-22806/rng_state_19.pth b/checkpoint-22806/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..33969c12e5fb13adbf0220ab989050faa6196302 --- /dev/null +++ b/checkpoint-22806/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e757b204339e4c8d22c601d206308145efe56ae1ce491d4bef0c5e73e732c6f4 +size 15997 diff --git a/checkpoint-22806/rng_state_2.pth b/checkpoint-22806/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6dfcebc7da01f1961296e64a65d45a372bf6d70 --- /dev/null +++ b/checkpoint-22806/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:63c74adb73ae47f5175b2c43020c42e40655418e34e6468cbf82a8cff36f5ce6 +size 15984 diff --git a/checkpoint-22806/rng_state_20.pth b/checkpoint-22806/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4be1e06b5e8527a3f3bec55a74e53202640ea4e --- /dev/null +++ b/checkpoint-22806/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82d60ac6d38b7fd0d441671387b795076d1dc7c43b02e56a40e95db458ebe873 +size 15997 diff --git a/checkpoint-22806/rng_state_21.pth b/checkpoint-22806/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9164d58f2ab8d0296ba5bb49e6311c387006049 --- /dev/null +++ b/checkpoint-22806/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d631cbd7d5a6079757825ef9c565e4eca32f8c844e818bae6810ed02bd25061 +size 15997 diff --git a/checkpoint-22806/rng_state_22.pth b/checkpoint-22806/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..58c361bfb0a9272bb60873d52cdfcdaef532bb37 --- /dev/null +++ b/checkpoint-22806/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c724f617c7b35929f25d541674149c959f9c47a4e0c042207fbb68d0e46d1813 +size 15997 diff --git a/checkpoint-22806/rng_state_23.pth b/checkpoint-22806/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..87cbe4853634d803954b499e287124526bfc8bd5 --- /dev/null +++ b/checkpoint-22806/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1739506c6e5e34b80a1267f12e1776a622f3fce3054af9ce2c1399dcef5d23ac +size 15997 diff --git a/checkpoint-22806/rng_state_24.pth b/checkpoint-22806/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ddee40f028cec6c8ac09d8cddacb77f0ba747dd --- /dev/null +++ b/checkpoint-22806/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a7b8555f9dbadae006e8262901db0651d7b0f2eae902123cc9dadff94e1d4df1 +size 15997 diff --git a/checkpoint-22806/rng_state_25.pth b/checkpoint-22806/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..c0afe817ab656a98bc7f2def884929c97753b303 --- /dev/null +++ b/checkpoint-22806/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527e5c4f0997746bd1a87e8528b0b873d937ea6b75123f01bd52e8b01acb4df0 +size 15997 diff --git a/checkpoint-22806/rng_state_26.pth b/checkpoint-22806/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..6df84e08e276be98360f027de829d2d90ae0d97b --- /dev/null +++ b/checkpoint-22806/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f25d403ee079017a3a27ad6b0fd1caef932bcdc764968f48ce5ef3ec32cc05 +size 15997 diff --git a/checkpoint-22806/rng_state_27.pth b/checkpoint-22806/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..639950e6a7a3409a70806dfeecba4c5d1c99b1a6 --- /dev/null +++ b/checkpoint-22806/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee75ed2e14b4c82e0c62c10dc2c515acb931f954c72dcc5137e0d102b8dc12ac +size 15997 diff --git a/checkpoint-22806/rng_state_28.pth b/checkpoint-22806/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..39955fa4081e37529509bb9d2e66040382815443 --- /dev/null +++ b/checkpoint-22806/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12bf81b9d916cc6e1f01d75e04575c01d80b92ceb5de55f43a82b8843a211e80 +size 15997 diff --git a/checkpoint-22806/rng_state_29.pth b/checkpoint-22806/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e4f739d8da6af2b7f0c5f4b20c5ec00852c41fc --- /dev/null +++ b/checkpoint-22806/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:63f2260ac7997aa2e3f607860109993e31e61e5a1ab5188e6a052094baf1b2ba +size 15997 diff --git a/checkpoint-22806/rng_state_3.pth b/checkpoint-22806/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..214b8871644338e2b119024acf3b1227c9502a16 --- /dev/null +++ b/checkpoint-22806/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:261c7069abd4e1956ce369cae1b2507b003a266444383cae8e0291dca43f9180 +size 15984 diff --git a/checkpoint-22806/rng_state_30.pth b/checkpoint-22806/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..6132a1ae114c6b13533988e4660b38f4763aa34e --- /dev/null +++ b/checkpoint-22806/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d6669eeb5237fc5760b9e484f928e6c979f31c594301ed240cefbf843d92d00 +size 15997 diff --git a/checkpoint-22806/rng_state_31.pth b/checkpoint-22806/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..677fb96c7be52d0ff92280e841594723008cc743 --- /dev/null +++ b/checkpoint-22806/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdab8ee3bbcc1df115b988af22e9278736a168840e3f6af185d5a91b2565925f +size 15997 diff --git a/checkpoint-22806/rng_state_32.pth b/checkpoint-22806/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc392edbec77c986604342f62b2dd48264d9328e --- /dev/null +++ b/checkpoint-22806/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cd94949e55e5af98ff08bcd0fe79a281d347ca82feedbac2aa45e07d8042c12 +size 15997 diff --git a/checkpoint-22806/rng_state_33.pth b/checkpoint-22806/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d9b4d7efa38b91044464e035adc499ec95149e4 --- /dev/null +++ b/checkpoint-22806/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ed32910738ff9cf0b6b577af0bb5e35d26ec0a3b81b84a54a27e217089421045 +size 15997 diff --git a/checkpoint-22806/rng_state_34.pth b/checkpoint-22806/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..65c3a465dc362ab3b26c8091900fcdb8226030a7 --- /dev/null +++ b/checkpoint-22806/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7a9ca886de7b8dcfff1ac6ac22afe78a4be94deb06012a79b9d9896ac30e863 +size 15997 diff --git a/checkpoint-22806/rng_state_35.pth b/checkpoint-22806/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..61b47eeaedc2b55ea73fab0cbfcae4a46c7f8857 --- /dev/null +++ b/checkpoint-22806/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9d6d856a130c859f5341638ff564737834058dfcefcbe182469594b99a96fc +size 15997 diff --git a/checkpoint-22806/rng_state_36.pth b/checkpoint-22806/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..72e541a6603dfc212ca68d320e425a983e94292d --- /dev/null +++ b/checkpoint-22806/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15c18d3c5bd8f80ccef4092319417b6a6b8bc926a2e558f3e4cad389d83af372 +size 15997 diff --git a/checkpoint-22806/rng_state_37.pth b/checkpoint-22806/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9a16a81c65df376e10c242e7aabfa0b19400442 --- /dev/null +++ b/checkpoint-22806/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67525b674abe6ceed536ba640c9511edc7326bdc05dd0eb45a782dd93b971c12 +size 15997 diff --git a/checkpoint-22806/rng_state_38.pth b/checkpoint-22806/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..1cb7c29ef3d538ec15c86af2e110958dcd35b9d7 --- /dev/null +++ b/checkpoint-22806/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ca26ef511901a7e2a0d0c9dfdb447c807dccd027c7ff9a5bd3d858c5edbc9a36 +size 15997 diff --git a/checkpoint-22806/rng_state_39.pth b/checkpoint-22806/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..7996ff4e8679394915ea7354bc2d1ecbda2f2e81 --- /dev/null +++ b/checkpoint-22806/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43396ad5428358be9c1344392da8805a132263d2df176d51b85cb06dee6defd5 +size 15997 diff --git a/checkpoint-22806/rng_state_4.pth b/checkpoint-22806/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..102dfad453ab0966026bffc53ce1eea4925339dc --- /dev/null +++ b/checkpoint-22806/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cd420c919c676c092eab0be7c35304a3a82fb7becf73c200346cad4caf920f2 +size 15984 diff --git a/checkpoint-22806/rng_state_40.pth b/checkpoint-22806/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..a598c80ba9351af426832dae0a90d04828ea399e --- /dev/null +++ b/checkpoint-22806/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9395ac789fabf9b2d20d0104a86f2ccfdb6231a600d432a46ff4eba9d28d4053 +size 15997 diff --git a/checkpoint-22806/rng_state_41.pth b/checkpoint-22806/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..742cf5344afc664cccdbac499e91e08ab2792f22 --- /dev/null +++ b/checkpoint-22806/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8cf6670558dc7f69d6528a88a47fd900d38bad5d482463af92becb902e025ea +size 15997 diff --git a/checkpoint-22806/rng_state_42.pth b/checkpoint-22806/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..8b71b871ce6a08eeeb4efa8e61543f194f169580 --- /dev/null +++ b/checkpoint-22806/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3bc6660ba0fc12a5486738ad5494eddb76d223a9f617b9a4bc35e30c3223eae6 +size 15997 diff --git a/checkpoint-22806/rng_state_43.pth b/checkpoint-22806/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..1054c196cf7f8c8019f49f342daa8fda17e1fcb9 --- /dev/null +++ b/checkpoint-22806/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aea3b90c68d9d3304cc6c9ecf8363d4bc0961536cd7e8587e6b2b07f602197b +size 15997 diff --git a/checkpoint-22806/rng_state_44.pth b/checkpoint-22806/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..95822b690ea02c53acc2629afac981860ee84e6a --- /dev/null +++ b/checkpoint-22806/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddded3425dfedf8fea42ac9f2261021914f95d5b366a72b81aa51b27ac7a55e9 +size 15997 diff --git a/checkpoint-22806/rng_state_45.pth b/checkpoint-22806/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..9914514eb56160962598987af965d8a719a597f0 --- /dev/null +++ b/checkpoint-22806/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b6a22be526159cf9d1f854ef0b7ed649d9047153f155013fe4d6cf4ebd59821 +size 15997 diff --git a/checkpoint-22806/rng_state_46.pth b/checkpoint-22806/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..6024b0514cb07e26c470c962f5d8b6314fb7c6b8 --- /dev/null +++ b/checkpoint-22806/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a658c09126c693d00cf51c3f081f5f5fef86d5963e48fc0d50a9be136a5ee1 +size 15997 diff --git a/checkpoint-22806/rng_state_47.pth b/checkpoint-22806/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..39916854efa0b2c28df4bf5523886d083f0d9d03 --- /dev/null +++ b/checkpoint-22806/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3dadb40afd5933d12e3ae835ac243d00a91124b5c939e18e7a6b2ea799af2eb0 +size 15997 diff --git a/checkpoint-22806/rng_state_48.pth b/checkpoint-22806/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..7fd40932fda89c615957c3530e70a41ee5cf3bc1 --- /dev/null +++ b/checkpoint-22806/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85db10a6e7209b0e6af11957690a408805cb6f9971db81b35ab52a30d883a513 +size 15997 diff --git a/checkpoint-22806/rng_state_49.pth b/checkpoint-22806/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..a6dec2baa39c8b0dd0a2260203392f973ee25a65 --- /dev/null +++ b/checkpoint-22806/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb7f8c6e75737562b2f921f53376daceeb229c0316a2938dd6c6649c08e63788 +size 15997 diff --git a/checkpoint-22806/rng_state_5.pth b/checkpoint-22806/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ef0cb92ac891ddc4b803ad4a4061204219259fe --- /dev/null +++ b/checkpoint-22806/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e46deb0da0db6e8549402477d92eca61b83f6dac8c66562fa310713be80bcdc3 +size 15984 diff --git a/checkpoint-22806/rng_state_50.pth b/checkpoint-22806/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..2da603b0eee03553233b9d4b03b41b28789615ff --- /dev/null +++ b/checkpoint-22806/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d8c456d7b2ae2c463b3c623da337574dcaf4f7498a661c9025b0b972c04c1f +size 15997 diff --git a/checkpoint-22806/rng_state_51.pth b/checkpoint-22806/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f98cf58e3a7dcabe809d9ab88612cd5a99cbfc3 --- /dev/null +++ b/checkpoint-22806/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6dbec398dadab22536d8a1b4d9355608eb6820f853c99d19e46a4f1c1d55e6ef +size 15997 diff --git a/checkpoint-22806/rng_state_52.pth b/checkpoint-22806/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd6babb5967fbbbc42cfb080360c39c85290cda5 --- /dev/null +++ b/checkpoint-22806/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44488e44615a2f8353d6c2253f525f597c949296b509ffcf525f9c70b2ba52bb +size 15997 diff --git a/checkpoint-22806/rng_state_53.pth b/checkpoint-22806/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..768279f3abbd223440905679f2f17a8a31c625e3 --- /dev/null +++ b/checkpoint-22806/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26a73a36d3a6b295102d905fdd44125189cd1d1023c8854ec6fe742c23d9dfbf +size 15997 diff --git a/checkpoint-22806/rng_state_54.pth b/checkpoint-22806/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..631e8714c6c6b07aa3c6f6288bb08be6578e8cfa --- /dev/null +++ b/checkpoint-22806/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a95d15a6a1e11c722d8628080b36173d7ddf0f90e231d2e26b5ec993cea61b6 +size 15997 diff --git a/checkpoint-22806/rng_state_55.pth b/checkpoint-22806/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..999d367ec271be0b5498576eb8a62c4fef27db8d --- /dev/null +++ b/checkpoint-22806/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b13358d4732e4cf8fe8f196e590132ef84e8eea989b4ac1ed752d367422ba8b7 +size 15997 diff --git a/checkpoint-22806/rng_state_56.pth b/checkpoint-22806/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf651ec3afdb087587a636c90a57d3d084208ff8 --- /dev/null +++ b/checkpoint-22806/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f6879dfe721ca554e861183d1b0a7cc0f450cbb5d31184e85cf5056ac6706b0c +size 15997 diff --git a/checkpoint-22806/rng_state_57.pth b/checkpoint-22806/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0f82f2326d9e2296627de1d6e53518a96147529 --- /dev/null +++ b/checkpoint-22806/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e770c7ff73c5cb59b94697f1810fff5c6745f288c478637d70d30d2e6c7dbe6 +size 15997 diff --git a/checkpoint-22806/rng_state_58.pth b/checkpoint-22806/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..a5e68bb1c8077afabc23265e3ffe9502983fd1d4 --- /dev/null +++ b/checkpoint-22806/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d6bfcecd7c145f676b0fbc946f3e5b72838868bc1ef40b39a82d57c4055e58 +size 15997 diff --git a/checkpoint-22806/rng_state_59.pth b/checkpoint-22806/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32c302d3484cd04e95228925ce05031bfee5274 --- /dev/null +++ b/checkpoint-22806/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea85709be06b8667fdc9a7bd312a38e0b64b2667d4f8f03dc6682b398f998ad8 +size 15997 diff --git a/checkpoint-22806/rng_state_6.pth b/checkpoint-22806/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..61788422c099e1bdc6f2ea7fae8cc1cefaeb9445 --- /dev/null +++ b/checkpoint-22806/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67fdfd4ea2ff5f9100a4b623218a5b38ed3b942896857d3efbe387efa97c3eea +size 15984 diff --git a/checkpoint-22806/rng_state_60.pth b/checkpoint-22806/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..4cca102c2de89532606b67ae0a4d32da78fd60a7 --- /dev/null +++ b/checkpoint-22806/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:03b324a41e7b13ac7e065a9aa07ada66abdaacacd50fcb39fc81b3d74a473199 +size 15997 diff --git a/checkpoint-22806/rng_state_61.pth b/checkpoint-22806/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..00d06236fe8c83e0e6cdf9603158567379d7cd06 --- /dev/null +++ b/checkpoint-22806/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa883a7fd33ba27836c2b1efc8921b1b5fe673424d0aeea485aa2c2fce0a66a +size 15997 diff --git a/checkpoint-22806/rng_state_62.pth b/checkpoint-22806/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..3fda4207e3e288ffe6ac43033ffe6388fc51aeaa --- /dev/null +++ b/checkpoint-22806/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114fccfda43799417b7e77cdf93161a1077e3fe40f10ced7862d966cf4e7c9fe +size 15997 diff --git a/checkpoint-22806/rng_state_63.pth b/checkpoint-22806/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d27d6051562dabd08efb4f4ba8a65c5ebc8f7c7 --- /dev/null +++ b/checkpoint-22806/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d283b3b036df9da0e3a6d3811c85f97dcc653a53333f33c201945d6ba9d8b32b +size 15997 diff --git a/checkpoint-22806/rng_state_7.pth b/checkpoint-22806/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ee076f6faa33cab9dc01b8c267da0017d327c11 --- /dev/null +++ b/checkpoint-22806/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268b3dd750e731c44eaf62ff09eb72d75e006a06c461d08b82ac0339f4f3919b +size 15984 diff --git a/checkpoint-22806/rng_state_8.pth b/checkpoint-22806/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..4448d42f4dce70c3b7f02a79dbad331d45440a2d --- /dev/null +++ b/checkpoint-22806/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:17b6c2bb389388ac7003c0bc39dc8e199abd8e4399820c5bd2105b8cabc8825c +size 15984 diff --git a/checkpoint-22806/rng_state_9.pth b/checkpoint-22806/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..57de83a33940cb2ffae1ce3666e6185b384003cb --- /dev/null +++ b/checkpoint-22806/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0fa8798e8c4851d934e7d6e45fd3c32de9ba8cba67cf2ac2f68a4dd795999b5 +size 15984 diff --git a/checkpoint-22806/scheduler.pt b/checkpoint-22806/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe9bda44b3afcb17267e1ca129c6d01e8b57f8be --- /dev/null +++ b/checkpoint-22806/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17068b41b5295f58f0cc3530ac7a7186451ad029bef1787159b540c6104ccb9 +size 1064 diff --git a/checkpoint-22806/special_tokens_map.json b/checkpoint-22806/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-22806/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-22806/tokenizer.json 
b/checkpoint-22806/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-22806/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-22806/tokenizer_config.json b/checkpoint-22806/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-22806/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": 
"<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": 
"<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": 
"<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128257": { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-22806/trainer_state.json b/checkpoint-22806/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..28f2e1658225415327490a3adae9858bdf1f6acc --- /dev/null +++ b/checkpoint-22806/trainer_state.json @@ -0,0 +1,159676 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": 
null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 22806, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 
0.003990178023327195, + "grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 
1.2865383625030518, + "learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 
3.885480572597138e-06, + "loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + 
}, + { + "epoch": 0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 
0.4801484942436218, + "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 
7.975460122699386e-06, + "loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 
92 + }, + { + "epoch": 0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + 
"grad_norm": 0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 
0.5069209933280945, + "learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 
1.3394683026584867e-05, + "loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + 
"step": 145 + }, + { + "epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 
0.04880294659300184, + "grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 
1.3880311250686646, + "learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + }, + { + "epoch": 3.0003069367710253, + "grad_norm": 0.43365660309791565, + "learning_rate": 8.20720621201063e-05, + "loss": 1.8074, + "step": 9775 + }, + { + "epoch": 3.0006138735420502, + "grad_norm": 0.461374968290329, + "learning_rate": 8.206824868646064e-05, + "loss": 1.9089, + "step": 9776 + }, + { + "epoch": 3.0009208103130756, + "grad_norm": 0.3747929632663727, + "learning_rate": 8.206443493589776e-05, + "loss": 1.8358, + "step": 9777 + }, + { + "epoch": 3.001227747084101, + "grad_norm": 0.28436774015426636, + "learning_rate": 8.206062086845532e-05, + "loss": 1.8527, + "step": 9778 + }, + { + "epoch": 3.001534683855126, + "grad_norm": 0.33642131090164185, + "learning_rate": 
8.205680648417106e-05, + "loss": 1.8142, + "step": 9779 + }, + { + "epoch": 3.001841620626151, + "grad_norm": 0.4283481240272522, + "learning_rate": 8.205299178308263e-05, + "loss": 1.9006, + "step": 9780 + }, + { + "epoch": 3.002148557397176, + "grad_norm": 0.34405630826950073, + "learning_rate": 8.204917676522777e-05, + "loss": 1.7988, + "step": 9781 + }, + { + "epoch": 3.0024554941682013, + "grad_norm": 0.3161070942878723, + "learning_rate": 8.204536143064414e-05, + "loss": 1.8271, + "step": 9782 + }, + { + "epoch": 3.0027624309392267, + "grad_norm": 0.42518749833106995, + "learning_rate": 8.204154577936946e-05, + "loss": 1.864, + "step": 9783 + }, + { + "epoch": 3.0030693677102516, + "grad_norm": 0.3760852813720703, + "learning_rate": 8.203772981144146e-05, + "loss": 1.8543, + "step": 9784 + }, + { + "epoch": 3.003376304481277, + "grad_norm": 0.32794755697250366, + "learning_rate": 8.203391352689784e-05, + "loss": 1.8776, + "step": 9785 + }, + { + "epoch": 3.0036832412523022, + "grad_norm": 0.3053889274597168, + "learning_rate": 8.20300969257763e-05, + "loss": 1.8064, + "step": 9786 + }, + { + "epoch": 3.003990178023327, + "grad_norm": 0.40283143520355225, + "learning_rate": 8.202628000811456e-05, + "loss": 1.8083, + "step": 9787 + }, + { + "epoch": 3.0042971147943525, + "grad_norm": 0.49270665645599365, + "learning_rate": 8.202246277395038e-05, + "loss": 1.802, + "step": 9788 + }, + { + "epoch": 3.0046040515653774, + "grad_norm": 0.4373023211956024, + "learning_rate": 8.201864522332143e-05, + "loss": 1.8429, + "step": 9789 + }, + { + "epoch": 3.0049109883364027, + "grad_norm": 0.3136310875415802, + "learning_rate": 8.201482735626547e-05, + "loss": 1.8224, + "step": 9790 + }, + { + "epoch": 3.005217925107428, + "grad_norm": 0.3306807279586792, + "learning_rate": 8.201100917282023e-05, + "loss": 1.8463, + "step": 9791 + }, + { + "epoch": 3.005524861878453, + "grad_norm": 0.45082196593284607, + "learning_rate": 8.200719067302342e-05, + "loss": 1.7587, + "step": 
9792 + }, + { + "epoch": 3.0058317986494782, + "grad_norm": 0.49246448278427124, + "learning_rate": 8.20033718569128e-05, + "loss": 1.8245, + "step": 9793 + }, + { + "epoch": 3.0061387354205036, + "grad_norm": 0.3040246367454529, + "learning_rate": 8.199955272452609e-05, + "loss": 1.8309, + "step": 9794 + }, + { + "epoch": 3.0064456721915285, + "grad_norm": 0.3909318149089813, + "learning_rate": 8.199573327590105e-05, + "loss": 1.8187, + "step": 9795 + }, + { + "epoch": 3.006752608962554, + "grad_norm": 0.5753183960914612, + "learning_rate": 8.199191351107543e-05, + "loss": 1.826, + "step": 9796 + }, + { + "epoch": 3.0070595457335787, + "grad_norm": 0.48908689618110657, + "learning_rate": 8.198809343008695e-05, + "loss": 1.8475, + "step": 9797 + }, + { + "epoch": 3.007366482504604, + "grad_norm": 0.31570208072662354, + "learning_rate": 8.198427303297341e-05, + "loss": 1.8046, + "step": 9798 + }, + { + "epoch": 3.0076734192756294, + "grad_norm": 0.39205440878868103, + "learning_rate": 8.198045231977251e-05, + "loss": 1.8413, + "step": 9799 + }, + { + "epoch": 3.0079803560466543, + "grad_norm": 0.5117597579956055, + "learning_rate": 8.197663129052204e-05, + "loss": 1.8184, + "step": 9800 + }, + { + "epoch": 3.0082872928176796, + "grad_norm": 0.3623514175415039, + "learning_rate": 8.197280994525978e-05, + "loss": 1.8292, + "step": 9801 + }, + { + "epoch": 3.008594229588705, + "grad_norm": 0.2826726734638214, + "learning_rate": 8.196898828402344e-05, + "loss": 1.8216, + "step": 9802 + }, + { + "epoch": 3.00890116635973, + "grad_norm": 0.38658398389816284, + "learning_rate": 8.196516630685085e-05, + "loss": 1.867, + "step": 9803 + }, + { + "epoch": 3.009208103130755, + "grad_norm": 0.3371698260307312, + "learning_rate": 8.196134401377973e-05, + "loss": 1.8077, + "step": 9804 + }, + { + "epoch": 3.00951503990178, + "grad_norm": 0.24108785390853882, + "learning_rate": 8.195752140484789e-05, + "loss": 1.7858, + "step": 9805 + }, + { + "epoch": 3.0098219766728054, + 
"grad_norm": 0.34410104155540466, + "learning_rate": 8.195369848009309e-05, + "loss": 1.801, + "step": 9806 + }, + { + "epoch": 3.0101289134438307, + "grad_norm": 0.3412116467952728, + "learning_rate": 8.194987523955311e-05, + "loss": 1.7905, + "step": 9807 + }, + { + "epoch": 3.0104358502148556, + "grad_norm": 0.2473030537366867, + "learning_rate": 8.194605168326573e-05, + "loss": 1.7765, + "step": 9808 + }, + { + "epoch": 3.010742786985881, + "grad_norm": 0.28590065240859985, + "learning_rate": 8.194222781126875e-05, + "loss": 1.7897, + "step": 9809 + }, + { + "epoch": 3.0110497237569063, + "grad_norm": 0.2994272708892822, + "learning_rate": 8.193840362359994e-05, + "loss": 1.7976, + "step": 9810 + }, + { + "epoch": 3.011356660527931, + "grad_norm": 0.2971307635307312, + "learning_rate": 8.193457912029713e-05, + "loss": 1.829, + "step": 9811 + }, + { + "epoch": 3.0116635972989565, + "grad_norm": 0.25149810314178467, + "learning_rate": 8.193075430139809e-05, + "loss": 1.7709, + "step": 9812 + }, + { + "epoch": 3.0119705340699814, + "grad_norm": 0.2561332583427429, + "learning_rate": 8.19269291669406e-05, + "loss": 1.7689, + "step": 9813 + }, + { + "epoch": 3.0122774708410067, + "grad_norm": 0.2658882141113281, + "learning_rate": 8.192310371696249e-05, + "loss": 1.8497, + "step": 9814 + }, + { + "epoch": 3.012584407612032, + "grad_norm": 0.2873780429363251, + "learning_rate": 8.191927795150156e-05, + "loss": 1.8217, + "step": 9815 + }, + { + "epoch": 3.012891344383057, + "grad_norm": 0.2181183248758316, + "learning_rate": 8.191545187059562e-05, + "loss": 1.7261, + "step": 9816 + }, + { + "epoch": 3.0131982811540823, + "grad_norm": 0.2414858490228653, + "learning_rate": 8.191162547428248e-05, + "loss": 1.8035, + "step": 9817 + }, + { + "epoch": 3.0135052179251076, + "grad_norm": 0.2799840271472931, + "learning_rate": 8.190779876259995e-05, + "loss": 1.8279, + "step": 9818 + }, + { + "epoch": 3.0138121546961325, + "grad_norm": 0.2669760584831238, + "learning_rate": 
8.190397173558584e-05, + "loss": 1.8155, + "step": 9819 + }, + { + "epoch": 3.014119091467158, + "grad_norm": 0.28857991099357605, + "learning_rate": 8.1900144393278e-05, + "loss": 1.8479, + "step": 9820 + }, + { + "epoch": 3.0144260282381827, + "grad_norm": 0.30534693598747253, + "learning_rate": 8.189631673571422e-05, + "loss": 1.8609, + "step": 9821 + }, + { + "epoch": 3.014732965009208, + "grad_norm": 0.3238218128681183, + "learning_rate": 8.189248876293236e-05, + "loss": 1.9292, + "step": 9822 + }, + { + "epoch": 3.0150399017802334, + "grad_norm": 0.3000536561012268, + "learning_rate": 8.188866047497022e-05, + "loss": 1.8214, + "step": 9823 + }, + { + "epoch": 3.0153468385512583, + "grad_norm": 0.2960065007209778, + "learning_rate": 8.188483187186565e-05, + "loss": 1.8316, + "step": 9824 + }, + { + "epoch": 3.0156537753222836, + "grad_norm": 0.28609779477119446, + "learning_rate": 8.188100295365648e-05, + "loss": 1.8002, + "step": 9825 + }, + { + "epoch": 3.015960712093309, + "grad_norm": 0.31390634179115295, + "learning_rate": 8.187717372038057e-05, + "loss": 1.8134, + "step": 9826 + }, + { + "epoch": 3.016267648864334, + "grad_norm": 0.28550946712493896, + "learning_rate": 8.187334417207573e-05, + "loss": 1.8359, + "step": 9827 + }, + { + "epoch": 3.016574585635359, + "grad_norm": 0.3085210621356964, + "learning_rate": 8.186951430877982e-05, + "loss": 1.813, + "step": 9828 + }, + { + "epoch": 3.016881522406384, + "grad_norm": 0.3043847978115082, + "learning_rate": 8.18656841305307e-05, + "loss": 1.8222, + "step": 9829 + }, + { + "epoch": 3.0171884591774094, + "grad_norm": 0.32524731755256653, + "learning_rate": 8.18618536373662e-05, + "loss": 1.8258, + "step": 9830 + }, + { + "epoch": 3.0174953959484347, + "grad_norm": 0.2690991461277008, + "learning_rate": 8.18580228293242e-05, + "loss": 1.8492, + "step": 9831 + }, + { + "epoch": 3.0178023327194596, + "grad_norm": 0.34936225414276123, + "learning_rate": 8.185419170644253e-05, + "loss": 1.8363, + "step": 
9832 + }, + { + "epoch": 3.018109269490485, + "grad_norm": 0.3274296820163727, + "learning_rate": 8.185036026875908e-05, + "loss": 1.7789, + "step": 9833 + }, + { + "epoch": 3.0184162062615103, + "grad_norm": 0.2729836106300354, + "learning_rate": 8.184652851631169e-05, + "loss": 1.8264, + "step": 9834 + }, + { + "epoch": 3.018723143032535, + "grad_norm": 0.28682780265808105, + "learning_rate": 8.184269644913826e-05, + "loss": 1.8399, + "step": 9835 + }, + { + "epoch": 3.0190300798035605, + "grad_norm": 0.3224826455116272, + "learning_rate": 8.183886406727662e-05, + "loss": 1.8338, + "step": 9836 + }, + { + "epoch": 3.0193370165745854, + "grad_norm": 0.30945318937301636, + "learning_rate": 8.183503137076467e-05, + "loss": 1.8248, + "step": 9837 + }, + { + "epoch": 3.0196439533456108, + "grad_norm": 0.27580398321151733, + "learning_rate": 8.183119835964029e-05, + "loss": 1.8096, + "step": 9838 + }, + { + "epoch": 3.019950890116636, + "grad_norm": 0.28927183151245117, + "learning_rate": 8.182736503394132e-05, + "loss": 1.825, + "step": 9839 + }, + { + "epoch": 3.020257826887661, + "grad_norm": 0.253000408411026, + "learning_rate": 8.182353139370571e-05, + "loss": 1.7678, + "step": 9840 + }, + { + "epoch": 3.0205647636586863, + "grad_norm": 0.2882022559642792, + "learning_rate": 8.18196974389713e-05, + "loss": 1.8895, + "step": 9841 + }, + { + "epoch": 3.0208717004297116, + "grad_norm": 0.26864609122276306, + "learning_rate": 8.1815863169776e-05, + "loss": 1.7674, + "step": 9842 + }, + { + "epoch": 3.0211786372007365, + "grad_norm": 0.27344849705696106, + "learning_rate": 8.181202858615769e-05, + "loss": 1.8146, + "step": 9843 + }, + { + "epoch": 3.021485573971762, + "grad_norm": 0.31659772992134094, + "learning_rate": 8.180819368815425e-05, + "loss": 1.8485, + "step": 9844 + }, + { + "epoch": 3.021792510742787, + "grad_norm": 0.3163176476955414, + "learning_rate": 8.18043584758036e-05, + "loss": 1.8994, + "step": 9845 + }, + { + "epoch": 3.022099447513812, + 
"grad_norm": 0.2583829462528229, + "learning_rate": 8.180052294914365e-05, + "loss": 1.764, + "step": 9846 + }, + { + "epoch": 3.0224063842848374, + "grad_norm": 0.3006649315357208, + "learning_rate": 8.179668710821227e-05, + "loss": 1.9232, + "step": 9847 + }, + { + "epoch": 3.0227133210558623, + "grad_norm": 0.35702988505363464, + "learning_rate": 8.179285095304741e-05, + "loss": 1.8403, + "step": 9848 + }, + { + "epoch": 3.0230202578268877, + "grad_norm": 0.29699379205703735, + "learning_rate": 8.178901448368697e-05, + "loss": 1.8412, + "step": 9849 + }, + { + "epoch": 3.023327194597913, + "grad_norm": 0.3022700548171997, + "learning_rate": 8.178517770016885e-05, + "loss": 1.8197, + "step": 9850 + }, + { + "epoch": 3.023634131368938, + "grad_norm": 0.2943836748600006, + "learning_rate": 8.178134060253097e-05, + "loss": 1.8127, + "step": 9851 + }, + { + "epoch": 3.023941068139963, + "grad_norm": 0.31290489435195923, + "learning_rate": 8.177750319081126e-05, + "loss": 1.821, + "step": 9852 + }, + { + "epoch": 3.0242480049109886, + "grad_norm": 0.30308374762535095, + "learning_rate": 8.177366546504763e-05, + "loss": 1.8522, + "step": 9853 + }, + { + "epoch": 3.0245549416820134, + "grad_norm": 0.301559716463089, + "learning_rate": 8.176982742527802e-05, + "loss": 1.8758, + "step": 9854 + }, + { + "epoch": 3.0248618784530388, + "grad_norm": 0.33314836025238037, + "learning_rate": 8.176598907154034e-05, + "loss": 1.8178, + "step": 9855 + }, + { + "epoch": 3.0251688152240637, + "grad_norm": 0.3567935526371002, + "learning_rate": 8.176215040387255e-05, + "loss": 1.7847, + "step": 9856 + }, + { + "epoch": 3.025475751995089, + "grad_norm": 0.27716195583343506, + "learning_rate": 8.175831142231258e-05, + "loss": 1.772, + "step": 9857 + }, + { + "epoch": 3.0257826887661143, + "grad_norm": 0.24568212032318115, + "learning_rate": 8.175447212689836e-05, + "loss": 1.8171, + "step": 9858 + }, + { + "epoch": 3.0260896255371392, + "grad_norm": 0.25368261337280273, + 
"learning_rate": 8.175063251766784e-05, + "loss": 1.852, + "step": 9859 + }, + { + "epoch": 3.0263965623081646, + "grad_norm": 0.2509497404098511, + "learning_rate": 8.174679259465894e-05, + "loss": 1.7737, + "step": 9860 + }, + { + "epoch": 3.02670349907919, + "grad_norm": 0.3539343774318695, + "learning_rate": 8.174295235790963e-05, + "loss": 1.8663, + "step": 9861 + }, + { + "epoch": 3.027010435850215, + "grad_norm": 0.36450034379959106, + "learning_rate": 8.173911180745788e-05, + "loss": 1.8179, + "step": 9862 + }, + { + "epoch": 3.02731737262124, + "grad_norm": 0.3550017178058624, + "learning_rate": 8.173527094334162e-05, + "loss": 1.8256, + "step": 9863 + }, + { + "epoch": 3.027624309392265, + "grad_norm": 0.33518701791763306, + "learning_rate": 8.17314297655988e-05, + "loss": 1.7842, + "step": 9864 + }, + { + "epoch": 3.0279312461632903, + "grad_norm": 0.2522886097431183, + "learning_rate": 8.172758827426739e-05, + "loss": 1.7688, + "step": 9865 + }, + { + "epoch": 3.0282381829343157, + "grad_norm": 0.26222914457321167, + "learning_rate": 8.172374646938536e-05, + "loss": 1.8517, + "step": 9866 + }, + { + "epoch": 3.0285451197053406, + "grad_norm": 0.3355788588523865, + "learning_rate": 8.171990435099068e-05, + "loss": 1.9002, + "step": 9867 + }, + { + "epoch": 3.028852056476366, + "grad_norm": 0.32907500863075256, + "learning_rate": 8.171606191912131e-05, + "loss": 1.7801, + "step": 9868 + }, + { + "epoch": 3.0291589932473912, + "grad_norm": 0.29234179854393005, + "learning_rate": 8.171221917381523e-05, + "loss": 1.8055, + "step": 9869 + }, + { + "epoch": 3.029465930018416, + "grad_norm": 0.26374876499176025, + "learning_rate": 8.170837611511041e-05, + "loss": 1.781, + "step": 9870 + }, + { + "epoch": 3.0297728667894415, + "grad_norm": 0.311282217502594, + "learning_rate": 8.170453274304483e-05, + "loss": 1.839, + "step": 9871 + }, + { + "epoch": 3.0300798035604664, + "grad_norm": 0.24225831031799316, + "learning_rate": 8.170068905765648e-05, + "loss": 
1.804, + "step": 9872 + }, + { + "epoch": 3.0303867403314917, + "grad_norm": 0.29383334517478943, + "learning_rate": 8.169684505898335e-05, + "loss": 1.7817, + "step": 9873 + }, + { + "epoch": 3.030693677102517, + "grad_norm": 0.2607928514480591, + "learning_rate": 8.169300074706339e-05, + "loss": 1.8379, + "step": 9874 + }, + { + "epoch": 3.031000613873542, + "grad_norm": 0.283028244972229, + "learning_rate": 8.168915612193464e-05, + "loss": 1.7797, + "step": 9875 + }, + { + "epoch": 3.0313075506445673, + "grad_norm": 0.27675309777259827, + "learning_rate": 8.168531118363508e-05, + "loss": 1.8355, + "step": 9876 + }, + { + "epoch": 3.0316144874155926, + "grad_norm": 0.2598227262496948, + "learning_rate": 8.16814659322027e-05, + "loss": 1.7898, + "step": 9877 + }, + { + "epoch": 3.0319214241866175, + "grad_norm": 0.24715003371238708, + "learning_rate": 8.16776203676755e-05, + "loss": 1.7791, + "step": 9878 + }, + { + "epoch": 3.032228360957643, + "grad_norm": 0.2749374210834503, + "learning_rate": 8.167377449009149e-05, + "loss": 1.8303, + "step": 9879 + }, + { + "epoch": 3.0325352977286677, + "grad_norm": 0.26150834560394287, + "learning_rate": 8.166992829948868e-05, + "loss": 1.8462, + "step": 9880 + }, + { + "epoch": 3.032842234499693, + "grad_norm": 0.3044755160808563, + "learning_rate": 8.166608179590506e-05, + "loss": 1.806, + "step": 9881 + }, + { + "epoch": 3.0331491712707184, + "grad_norm": 0.2949555516242981, + "learning_rate": 8.166223497937868e-05, + "loss": 1.8785, + "step": 9882 + }, + { + "epoch": 3.0334561080417433, + "grad_norm": 0.33206698298454285, + "learning_rate": 8.165838784994752e-05, + "loss": 1.8476, + "step": 9883 + }, + { + "epoch": 3.0337630448127686, + "grad_norm": 0.2720400094985962, + "learning_rate": 8.165454040764962e-05, + "loss": 1.843, + "step": 9884 + }, + { + "epoch": 3.034069981583794, + "grad_norm": 0.29340869188308716, + "learning_rate": 8.1650692652523e-05, + "loss": 1.7761, + "step": 9885 + }, + { + "epoch": 
3.034376918354819, + "grad_norm": 0.35155293345451355, + "learning_rate": 8.16468445846057e-05, + "loss": 1.8887, + "step": 9886 + }, + { + "epoch": 3.034683855125844, + "grad_norm": 0.2688990831375122, + "learning_rate": 8.164299620393571e-05, + "loss": 1.8001, + "step": 9887 + }, + { + "epoch": 3.034990791896869, + "grad_norm": 0.2921253442764282, + "learning_rate": 8.16391475105511e-05, + "loss": 1.7951, + "step": 9888 + }, + { + "epoch": 3.0352977286678944, + "grad_norm": 0.28100699186325073, + "learning_rate": 8.163529850448988e-05, + "loss": 1.8041, + "step": 9889 + }, + { + "epoch": 3.0356046654389197, + "grad_norm": 0.3155081868171692, + "learning_rate": 8.16314491857901e-05, + "loss": 1.8026, + "step": 9890 + }, + { + "epoch": 3.0359116022099446, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.16275995544898e-05, + "loss": 1.8502, + "step": 9891 + }, + { + "epoch": 3.03621853898097, + "grad_norm": 0.2732076644897461, + "learning_rate": 8.162374961062704e-05, + "loss": 1.8424, + "step": 9892 + }, + { + "epoch": 3.0365254757519953, + "grad_norm": 0.2943679690361023, + "learning_rate": 8.161989935423984e-05, + "loss": 1.7635, + "step": 9893 + }, + { + "epoch": 3.03683241252302, + "grad_norm": 0.28894683718681335, + "learning_rate": 8.161604878536626e-05, + "loss": 1.78, + "step": 9894 + }, + { + "epoch": 3.0371393492940455, + "grad_norm": 0.2718082666397095, + "learning_rate": 8.161219790404435e-05, + "loss": 1.7664, + "step": 9895 + }, + { + "epoch": 3.0374462860650704, + "grad_norm": 0.29092124104499817, + "learning_rate": 8.160834671031216e-05, + "loss": 1.8621, + "step": 9896 + }, + { + "epoch": 3.0377532228360957, + "grad_norm": 0.284665584564209, + "learning_rate": 8.160449520420779e-05, + "loss": 1.8607, + "step": 9897 + }, + { + "epoch": 3.038060159607121, + "grad_norm": 0.23676982522010803, + "learning_rate": 8.160064338576925e-05, + "loss": 1.7137, + "step": 9898 + }, + { + "epoch": 3.038367096378146, + "grad_norm": 0.2666932940483093, + 
"learning_rate": 8.159679125503466e-05, + "loss": 1.8038, + "step": 9899 + }, + { + "epoch": 3.0386740331491713, + "grad_norm": 0.36214375495910645, + "learning_rate": 8.159293881204204e-05, + "loss": 1.8902, + "step": 9900 + }, + { + "epoch": 3.0389809699201966, + "grad_norm": 0.30301332473754883, + "learning_rate": 8.158908605682948e-05, + "loss": 1.8456, + "step": 9901 + }, + { + "epoch": 3.0392879066912215, + "grad_norm": 0.32190418243408203, + "learning_rate": 8.158523298943506e-05, + "loss": 1.8246, + "step": 9902 + }, + { + "epoch": 3.039594843462247, + "grad_norm": 0.2938043475151062, + "learning_rate": 8.158137960989685e-05, + "loss": 1.8324, + "step": 9903 + }, + { + "epoch": 3.0399017802332717, + "grad_norm": 0.29493969678878784, + "learning_rate": 8.157752591825294e-05, + "loss": 1.8458, + "step": 9904 + }, + { + "epoch": 3.040208717004297, + "grad_norm": 0.2681889832019806, + "learning_rate": 8.157367191454141e-05, + "loss": 1.889, + "step": 9905 + }, + { + "epoch": 3.0405156537753224, + "grad_norm": 0.3111969232559204, + "learning_rate": 8.156981759880035e-05, + "loss": 1.8966, + "step": 9906 + }, + { + "epoch": 3.0408225905463473, + "grad_norm": 0.345262736082077, + "learning_rate": 8.156596297106784e-05, + "loss": 1.8174, + "step": 9907 + }, + { + "epoch": 3.0411295273173726, + "grad_norm": 0.30156534910202026, + "learning_rate": 8.156210803138199e-05, + "loss": 1.766, + "step": 9908 + }, + { + "epoch": 3.041436464088398, + "grad_norm": 0.28691565990448, + "learning_rate": 8.15582527797809e-05, + "loss": 1.8436, + "step": 9909 + }, + { + "epoch": 3.041743400859423, + "grad_norm": 0.33418282866477966, + "learning_rate": 8.155439721630264e-05, + "loss": 1.8939, + "step": 9910 + }, + { + "epoch": 3.042050337630448, + "grad_norm": 0.25496938824653625, + "learning_rate": 8.155054134098535e-05, + "loss": 1.8368, + "step": 9911 + }, + { + "epoch": 3.042357274401473, + "grad_norm": 0.3806788921356201, + "learning_rate": 8.154668515386711e-05, + "loss": 
1.8635, + "step": 9912 + }, + { + "epoch": 3.0426642111724984, + "grad_norm": 0.42668119072914124, + "learning_rate": 8.154282865498603e-05, + "loss": 1.76, + "step": 9913 + }, + { + "epoch": 3.0429711479435237, + "grad_norm": 0.35945314168930054, + "learning_rate": 8.153897184438024e-05, + "loss": 1.8275, + "step": 9914 + }, + { + "epoch": 3.0432780847145486, + "grad_norm": 0.3225449323654175, + "learning_rate": 8.153511472208784e-05, + "loss": 1.7901, + "step": 9915 + }, + { + "epoch": 3.043585021485574, + "grad_norm": 0.2905425727367401, + "learning_rate": 8.153125728814694e-05, + "loss": 1.8021, + "step": 9916 + }, + { + "epoch": 3.0438919582565993, + "grad_norm": 0.3315529525279999, + "learning_rate": 8.15273995425957e-05, + "loss": 1.8003, + "step": 9917 + }, + { + "epoch": 3.044198895027624, + "grad_norm": 0.30256444215774536, + "learning_rate": 8.152354148547221e-05, + "loss": 1.8243, + "step": 9918 + }, + { + "epoch": 3.0445058317986495, + "grad_norm": 0.2563035190105438, + "learning_rate": 8.15196831168146e-05, + "loss": 1.7877, + "step": 9919 + }, + { + "epoch": 3.044812768569675, + "grad_norm": 0.25705814361572266, + "learning_rate": 8.151582443666101e-05, + "loss": 1.813, + "step": 9920 + }, + { + "epoch": 3.0451197053406998, + "grad_norm": 0.3649071455001831, + "learning_rate": 8.151196544504957e-05, + "loss": 1.8114, + "step": 9921 + }, + { + "epoch": 3.045426642111725, + "grad_norm": 0.4076193571090698, + "learning_rate": 8.150810614201841e-05, + "loss": 1.7869, + "step": 9922 + }, + { + "epoch": 3.04573357888275, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.150424652760569e-05, + "loss": 1.7878, + "step": 9923 + }, + { + "epoch": 3.0460405156537753, + "grad_norm": 0.2243243157863617, + "learning_rate": 8.150038660184955e-05, + "loss": 1.8224, + "step": 9924 + }, + { + "epoch": 3.0463474524248007, + "grad_norm": 0.3295031487941742, + "learning_rate": 8.149652636478811e-05, + "loss": 1.8685, + "step": 9925 + }, + { + "epoch": 
3.0466543891958255, + "grad_norm": 0.2973531186580658, + "learning_rate": 8.149266581645954e-05, + "loss": 1.8082, + "step": 9926 + }, + { + "epoch": 3.046961325966851, + "grad_norm": 0.25648918747901917, + "learning_rate": 8.148880495690199e-05, + "loss": 1.8089, + "step": 9927 + }, + { + "epoch": 3.047268262737876, + "grad_norm": 0.2845752537250519, + "learning_rate": 8.148494378615361e-05, + "loss": 1.8726, + "step": 9928 + }, + { + "epoch": 3.047575199508901, + "grad_norm": 0.2917105555534363, + "learning_rate": 8.148108230425255e-05, + "loss": 1.8035, + "step": 9929 + }, + { + "epoch": 3.0478821362799264, + "grad_norm": 0.2775834798812866, + "learning_rate": 8.1477220511237e-05, + "loss": 1.8545, + "step": 9930 + }, + { + "epoch": 3.0481890730509513, + "grad_norm": 0.3522767424583435, + "learning_rate": 8.14733584071451e-05, + "loss": 1.8261, + "step": 9931 + }, + { + "epoch": 3.0484960098219767, + "grad_norm": 0.3759000599384308, + "learning_rate": 8.146949599201503e-05, + "loss": 1.8405, + "step": 9932 + }, + { + "epoch": 3.048802946593002, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.146563326588496e-05, + "loss": 1.7762, + "step": 9933 + }, + { + "epoch": 3.049109883364027, + "grad_norm": 0.263810932636261, + "learning_rate": 8.146177022879304e-05, + "loss": 1.7546, + "step": 9934 + }, + { + "epoch": 3.049416820135052, + "grad_norm": 0.24064256250858307, + "learning_rate": 8.14579068807775e-05, + "loss": 1.7903, + "step": 9935 + }, + { + "epoch": 3.0497237569060776, + "grad_norm": 0.3144194781780243, + "learning_rate": 8.145404322187645e-05, + "loss": 1.8011, + "step": 9936 + }, + { + "epoch": 3.0500306936771024, + "grad_norm": 0.3362879455089569, + "learning_rate": 8.145017925212812e-05, + "loss": 1.8224, + "step": 9937 + }, + { + "epoch": 3.050337630448128, + "grad_norm": 0.33979395031929016, + "learning_rate": 8.144631497157071e-05, + "loss": 1.8415, + "step": 9938 + }, + { + "epoch": 3.0506445672191527, + "grad_norm": 0.33391237258911133, + 
"learning_rate": 8.144245038024235e-05, + "loss": 1.7983, + "step": 9939 + }, + { + "epoch": 3.050951503990178, + "grad_norm": 0.34034964442253113, + "learning_rate": 8.143858547818128e-05, + "loss": 1.8635, + "step": 9940 + }, + { + "epoch": 3.0512584407612033, + "grad_norm": 0.3472529947757721, + "learning_rate": 8.143472026542569e-05, + "loss": 1.8067, + "step": 9941 + }, + { + "epoch": 3.0515653775322282, + "grad_norm": 0.3369109630584717, + "learning_rate": 8.143085474201376e-05, + "loss": 1.7933, + "step": 9942 + }, + { + "epoch": 3.0518723143032536, + "grad_norm": 0.3055182993412018, + "learning_rate": 8.14269889079837e-05, + "loss": 1.7358, + "step": 9943 + }, + { + "epoch": 3.052179251074279, + "grad_norm": 0.26729708909988403, + "learning_rate": 8.142312276337372e-05, + "loss": 1.8315, + "step": 9944 + }, + { + "epoch": 3.052486187845304, + "grad_norm": 0.3626720607280731, + "learning_rate": 8.141925630822203e-05, + "loss": 1.7593, + "step": 9945 + }, + { + "epoch": 3.052793124616329, + "grad_norm": 0.3673512637615204, + "learning_rate": 8.141538954256683e-05, + "loss": 1.8414, + "step": 9946 + }, + { + "epoch": 3.053100061387354, + "grad_norm": 0.30554768443107605, + "learning_rate": 8.141152246644632e-05, + "loss": 1.7504, + "step": 9947 + }, + { + "epoch": 3.0534069981583793, + "grad_norm": 0.41163405776023865, + "learning_rate": 8.140765507989875e-05, + "loss": 1.8794, + "step": 9948 + }, + { + "epoch": 3.0537139349294047, + "grad_norm": 0.592751145362854, + "learning_rate": 8.140378738296233e-05, + "loss": 1.8538, + "step": 9949 + }, + { + "epoch": 3.0540208717004296, + "grad_norm": 0.483828604221344, + "learning_rate": 8.139991937567527e-05, + "loss": 1.7952, + "step": 9950 + }, + { + "epoch": 3.054327808471455, + "grad_norm": 0.26665306091308594, + "learning_rate": 8.13960510580758e-05, + "loss": 1.8268, + "step": 9951 + }, + { + "epoch": 3.0546347452424802, + "grad_norm": 0.42917072772979736, + "learning_rate": 8.139218243020215e-05, + "loss": 
1.843, + "step": 9952 + }, + { + "epoch": 3.054941682013505, + "grad_norm": 0.47911396622657776, + "learning_rate": 8.138831349209256e-05, + "loss": 1.8223, + "step": 9953 + }, + { + "epoch": 3.0552486187845305, + "grad_norm": 0.4540431797504425, + "learning_rate": 8.138444424378524e-05, + "loss": 1.9198, + "step": 9954 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.29719051718711853, + "learning_rate": 8.138057468531845e-05, + "loss": 1.7873, + "step": 9955 + }, + { + "epoch": 3.0558624923265807, + "grad_norm": 0.35133618116378784, + "learning_rate": 8.137670481673045e-05, + "loss": 1.8459, + "step": 9956 + }, + { + "epoch": 3.056169429097606, + "grad_norm": 0.42896488308906555, + "learning_rate": 8.137283463805945e-05, + "loss": 1.7814, + "step": 9957 + }, + { + "epoch": 3.056476365868631, + "grad_norm": 0.38993972539901733, + "learning_rate": 8.136896414934372e-05, + "loss": 1.7636, + "step": 9958 + }, + { + "epoch": 3.0567833026396563, + "grad_norm": 0.31362372636795044, + "learning_rate": 8.13650933506215e-05, + "loss": 1.8021, + "step": 9959 + }, + { + "epoch": 3.0570902394106816, + "grad_norm": 0.27980196475982666, + "learning_rate": 8.136122224193103e-05, + "loss": 1.8445, + "step": 9960 + }, + { + "epoch": 3.0573971761817065, + "grad_norm": 0.2721461057662964, + "learning_rate": 8.135735082331059e-05, + "loss": 1.7614, + "step": 9961 + }, + { + "epoch": 3.057704112952732, + "grad_norm": 0.25157424807548523, + "learning_rate": 8.135347909479843e-05, + "loss": 1.7598, + "step": 9962 + }, + { + "epoch": 3.0580110497237567, + "grad_norm": 0.25798025727272034, + "learning_rate": 8.13496070564328e-05, + "loss": 1.7823, + "step": 9963 + }, + { + "epoch": 3.058317986494782, + "grad_norm": 0.30775198340415955, + "learning_rate": 8.134573470825199e-05, + "loss": 1.7755, + "step": 9964 + }, + { + "epoch": 3.0586249232658074, + "grad_norm": 0.28916797041893005, + "learning_rate": 8.134186205029426e-05, + "loss": 1.8189, + "step": 9965 + }, + { + "epoch": 
3.0589318600368323, + "grad_norm": 0.2829149067401886, + "learning_rate": 8.133798908259787e-05, + "loss": 1.8546, + "step": 9966 + }, + { + "epoch": 3.0592387968078576, + "grad_norm": 0.2884117662906647, + "learning_rate": 8.13341158052011e-05, + "loss": 1.7705, + "step": 9967 + }, + { + "epoch": 3.059545733578883, + "grad_norm": 0.28311973810195923, + "learning_rate": 8.133024221814225e-05, + "loss": 1.8147, + "step": 9968 + }, + { + "epoch": 3.059852670349908, + "grad_norm": 0.25405213236808777, + "learning_rate": 8.132636832145957e-05, + "loss": 1.7813, + "step": 9969 + }, + { + "epoch": 3.060159607120933, + "grad_norm": 0.3082229793071747, + "learning_rate": 8.132249411519137e-05, + "loss": 1.8536, + "step": 9970 + }, + { + "epoch": 3.060466543891958, + "grad_norm": 0.29918181896209717, + "learning_rate": 8.13186195993759e-05, + "loss": 1.8181, + "step": 9971 + }, + { + "epoch": 3.0607734806629834, + "grad_norm": 0.3025238811969757, + "learning_rate": 8.13147447740515e-05, + "loss": 1.7785, + "step": 9972 + }, + { + "epoch": 3.0610804174340087, + "grad_norm": 0.2798222303390503, + "learning_rate": 8.131086963925643e-05, + "loss": 1.7873, + "step": 9973 + }, + { + "epoch": 3.0613873542050336, + "grad_norm": 0.32636210322380066, + "learning_rate": 8.130699419502898e-05, + "loss": 1.882, + "step": 9974 + }, + { + "epoch": 3.061694290976059, + "grad_norm": 0.27722054719924927, + "learning_rate": 8.130311844140748e-05, + "loss": 1.7788, + "step": 9975 + }, + { + "epoch": 3.0620012277470843, + "grad_norm": 0.289156436920166, + "learning_rate": 8.129924237843023e-05, + "loss": 1.8591, + "step": 9976 + }, + { + "epoch": 3.062308164518109, + "grad_norm": 0.2839665412902832, + "learning_rate": 8.12953660061355e-05, + "loss": 1.8255, + "step": 9977 + }, + { + "epoch": 3.0626151012891345, + "grad_norm": 0.2650148272514343, + "learning_rate": 8.129148932456161e-05, + "loss": 1.8353, + "step": 9978 + }, + { + "epoch": 3.06292203806016, + "grad_norm": 0.2884560227394104, + 
"learning_rate": 8.128761233374691e-05, + "loss": 1.8099, + "step": 9979 + }, + { + "epoch": 3.0632289748311847, + "grad_norm": 0.2610029876232147, + "learning_rate": 8.128373503372967e-05, + "loss": 1.8173, + "step": 9980 + }, + { + "epoch": 3.06353591160221, + "grad_norm": 0.32512393593788147, + "learning_rate": 8.127985742454822e-05, + "loss": 1.8619, + "step": 9981 + }, + { + "epoch": 3.063842848373235, + "grad_norm": 0.3382968604564667, + "learning_rate": 8.127597950624091e-05, + "loss": 1.831, + "step": 9982 + }, + { + "epoch": 3.0641497851442603, + "grad_norm": 0.33773133158683777, + "learning_rate": 8.127210127884602e-05, + "loss": 1.8194, + "step": 9983 + }, + { + "epoch": 3.0644567219152856, + "grad_norm": 0.31642746925354004, + "learning_rate": 8.126822274240188e-05, + "loss": 1.8782, + "step": 9984 + }, + { + "epoch": 3.0647636586863105, + "grad_norm": 0.2476506233215332, + "learning_rate": 8.126434389694686e-05, + "loss": 1.7866, + "step": 9985 + }, + { + "epoch": 3.065070595457336, + "grad_norm": 0.27296319603919983, + "learning_rate": 8.126046474251927e-05, + "loss": 1.8276, + "step": 9986 + }, + { + "epoch": 3.0653775322283607, + "grad_norm": 0.353865385055542, + "learning_rate": 8.125658527915744e-05, + "loss": 1.9525, + "step": 9987 + }, + { + "epoch": 3.065684468999386, + "grad_norm": 0.370256632566452, + "learning_rate": 8.12527055068997e-05, + "loss": 1.8514, + "step": 9988 + }, + { + "epoch": 3.0659914057704114, + "grad_norm": 0.30738842487335205, + "learning_rate": 8.124882542578442e-05, + "loss": 1.8125, + "step": 9989 + }, + { + "epoch": 3.0662983425414363, + "grad_norm": 0.3151233494281769, + "learning_rate": 8.124494503584995e-05, + "loss": 1.8165, + "step": 9990 + }, + { + "epoch": 3.0666052793124616, + "grad_norm": 0.29071590304374695, + "learning_rate": 8.124106433713458e-05, + "loss": 1.7617, + "step": 9991 + }, + { + "epoch": 3.066912216083487, + "grad_norm": 0.2898697853088379, + "learning_rate": 8.123718332967672e-05, + "loss": 
1.7779, + "step": 9992 + }, + { + "epoch": 3.067219152854512, + "grad_norm": 0.26601701974868774, + "learning_rate": 8.123330201351471e-05, + "loss": 1.8307, + "step": 9993 + }, + { + "epoch": 3.067526089625537, + "grad_norm": 0.2622119188308716, + "learning_rate": 8.12294203886869e-05, + "loss": 1.7958, + "step": 9994 + }, + { + "epoch": 3.0678330263965625, + "grad_norm": 0.29709386825561523, + "learning_rate": 8.122553845523166e-05, + "loss": 1.7799, + "step": 9995 + }, + { + "epoch": 3.0681399631675874, + "grad_norm": 0.31267789006233215, + "learning_rate": 8.122165621318733e-05, + "loss": 1.8149, + "step": 9996 + }, + { + "epoch": 3.0684468999386127, + "grad_norm": 0.3076523244380951, + "learning_rate": 8.121777366259232e-05, + "loss": 1.7701, + "step": 9997 + }, + { + "epoch": 3.0687538367096376, + "grad_norm": 0.30096009373664856, + "learning_rate": 8.121389080348496e-05, + "loss": 1.8323, + "step": 9998 + }, + { + "epoch": 3.069060773480663, + "grad_norm": 0.25739142298698425, + "learning_rate": 8.121000763590363e-05, + "loss": 1.8105, + "step": 9999 + }, + { + "epoch": 3.0693677102516883, + "grad_norm": 0.2780844271183014, + "learning_rate": 8.120612415988671e-05, + "loss": 1.8502, + "step": 10000 + }, + { + "epoch": 3.069674647022713, + "grad_norm": 0.3316378593444824, + "learning_rate": 8.120224037547259e-05, + "loss": 1.8244, + "step": 10001 + }, + { + "epoch": 3.0699815837937385, + "grad_norm": 0.261129766702652, + "learning_rate": 8.119835628269964e-05, + "loss": 1.7769, + "step": 10002 + }, + { + "epoch": 3.070288520564764, + "grad_norm": 0.29213985800743103, + "learning_rate": 8.119447188160625e-05, + "loss": 1.7717, + "step": 10003 + }, + { + "epoch": 3.0705954573357888, + "grad_norm": 0.38545623421669006, + "learning_rate": 8.11905871722308e-05, + "loss": 1.8433, + "step": 10004 + }, + { + "epoch": 3.070902394106814, + "grad_norm": 0.3617223799228668, + "learning_rate": 8.118670215461168e-05, + "loss": 1.8172, + "step": 10005 + }, + { + "epoch": 
3.071209330877839, + "grad_norm": 0.3241543769836426, + "learning_rate": 8.11828168287873e-05, + "loss": 1.8325, + "step": 10006 + }, + { + "epoch": 3.0715162676488643, + "grad_norm": 0.3538578152656555, + "learning_rate": 8.117893119479605e-05, + "loss": 1.8188, + "step": 10007 + }, + { + "epoch": 3.0718232044198897, + "grad_norm": 0.3861970603466034, + "learning_rate": 8.117504525267632e-05, + "loss": 1.8518, + "step": 10008 + }, + { + "epoch": 3.0721301411909145, + "grad_norm": 0.35433146357536316, + "learning_rate": 8.117115900246652e-05, + "loss": 1.8601, + "step": 10009 + }, + { + "epoch": 3.07243707796194, + "grad_norm": 0.29796987771987915, + "learning_rate": 8.116727244420507e-05, + "loss": 1.7934, + "step": 10010 + }, + { + "epoch": 3.072744014732965, + "grad_norm": 0.3091779947280884, + "learning_rate": 8.116338557793035e-05, + "loss": 1.8111, + "step": 10011 + }, + { + "epoch": 3.07305095150399, + "grad_norm": 0.2741319537162781, + "learning_rate": 8.11594984036808e-05, + "loss": 1.8079, + "step": 10012 + }, + { + "epoch": 3.0733578882750154, + "grad_norm": 0.28905320167541504, + "learning_rate": 8.115561092149482e-05, + "loss": 1.8475, + "step": 10013 + }, + { + "epoch": 3.0736648250460403, + "grad_norm": 0.2897081673145294, + "learning_rate": 8.115172313141081e-05, + "loss": 1.838, + "step": 10014 + }, + { + "epoch": 3.0739717618170657, + "grad_norm": 0.2620783746242523, + "learning_rate": 8.114783503346725e-05, + "loss": 1.8024, + "step": 10015 + }, + { + "epoch": 3.074278698588091, + "grad_norm": 0.26478636264801025, + "learning_rate": 8.11439466277025e-05, + "loss": 1.8137, + "step": 10016 + }, + { + "epoch": 3.074585635359116, + "grad_norm": 0.2796174883842468, + "learning_rate": 8.114005791415502e-05, + "loss": 1.7976, + "step": 10017 + }, + { + "epoch": 3.074892572130141, + "grad_norm": 0.26813286542892456, + "learning_rate": 8.113616889286325e-05, + "loss": 1.7945, + "step": 10018 + }, + { + "epoch": 3.0751995089011666, + "grad_norm": 
0.2443828582763672, + "learning_rate": 8.11322795638656e-05, + "loss": 1.7829, + "step": 10019 + }, + { + "epoch": 3.0755064456721914, + "grad_norm": 0.2981395423412323, + "learning_rate": 8.112838992720053e-05, + "loss": 1.7928, + "step": 10020 + }, + { + "epoch": 3.075813382443217, + "grad_norm": 0.25605037808418274, + "learning_rate": 8.112449998290644e-05, + "loss": 1.8129, + "step": 10021 + }, + { + "epoch": 3.0761203192142417, + "grad_norm": 0.31180307269096375, + "learning_rate": 8.112060973102181e-05, + "loss": 1.7393, + "step": 10022 + }, + { + "epoch": 3.076427255985267, + "grad_norm": 0.3230421543121338, + "learning_rate": 8.111671917158508e-05, + "loss": 1.818, + "step": 10023 + }, + { + "epoch": 3.0767341927562923, + "grad_norm": 0.3158549964427948, + "learning_rate": 8.111282830463468e-05, + "loss": 1.7582, + "step": 10024 + }, + { + "epoch": 3.0770411295273172, + "grad_norm": 0.24524325132369995, + "learning_rate": 8.110893713020908e-05, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 3.0773480662983426, + "grad_norm": 0.2793932259082794, + "learning_rate": 8.110504564834675e-05, + "loss": 1.8551, + "step": 10026 + }, + { + "epoch": 3.077655003069368, + "grad_norm": 0.29629403352737427, + "learning_rate": 8.110115385908612e-05, + "loss": 1.8019, + "step": 10027 + }, + { + "epoch": 3.077961939840393, + "grad_norm": 0.3138490915298462, + "learning_rate": 8.109726176246564e-05, + "loss": 1.8436, + "step": 10028 + }, + { + "epoch": 3.078268876611418, + "grad_norm": 0.29802024364471436, + "learning_rate": 8.10933693585238e-05, + "loss": 1.8158, + "step": 10029 + }, + { + "epoch": 3.078575813382443, + "grad_norm": 0.30785220861434937, + "learning_rate": 8.108947664729907e-05, + "loss": 1.8674, + "step": 10030 + }, + { + "epoch": 3.0788827501534684, + "grad_norm": 0.277662992477417, + "learning_rate": 8.10855836288299e-05, + "loss": 1.8253, + "step": 10031 + }, + { + "epoch": 3.0791896869244937, + "grad_norm": 0.27399590611457825, + "learning_rate": 
8.108169030315477e-05, + "loss": 1.8587, + "step": 10032 + }, + { + "epoch": 3.0794966236955186, + "grad_norm": 0.28398239612579346, + "learning_rate": 8.107779667031217e-05, + "loss": 1.8326, + "step": 10033 + }, + { + "epoch": 3.079803560466544, + "grad_norm": 0.2882741093635559, + "learning_rate": 8.107390273034057e-05, + "loss": 1.785, + "step": 10034 + }, + { + "epoch": 3.0801104972375692, + "grad_norm": 0.271043598651886, + "learning_rate": 8.107000848327843e-05, + "loss": 1.765, + "step": 10035 + }, + { + "epoch": 3.080417434008594, + "grad_norm": 0.2589638829231262, + "learning_rate": 8.106611392916427e-05, + "loss": 1.8136, + "step": 10036 + }, + { + "epoch": 3.0807243707796195, + "grad_norm": 0.3068227469921112, + "learning_rate": 8.106221906803656e-05, + "loss": 1.8034, + "step": 10037 + }, + { + "epoch": 3.0810313075506444, + "grad_norm": 0.2714168131351471, + "learning_rate": 8.105832389993379e-05, + "loss": 1.8007, + "step": 10038 + }, + { + "epoch": 3.0813382443216697, + "grad_norm": 0.2747504711151123, + "learning_rate": 8.105442842489447e-05, + "loss": 1.8135, + "step": 10039 + }, + { + "epoch": 3.081645181092695, + "grad_norm": 0.2719285488128662, + "learning_rate": 8.105053264295708e-05, + "loss": 1.7629, + "step": 10040 + }, + { + "epoch": 3.08195211786372, + "grad_norm": 0.3119582235813141, + "learning_rate": 8.104663655416014e-05, + "loss": 1.7887, + "step": 10041 + }, + { + "epoch": 3.0822590546347453, + "grad_norm": 0.35965192317962646, + "learning_rate": 8.104274015854212e-05, + "loss": 1.8484, + "step": 10042 + }, + { + "epoch": 3.0825659914057706, + "grad_norm": 0.3045980632305145, + "learning_rate": 8.103884345614157e-05, + "loss": 1.8625, + "step": 10043 + }, + { + "epoch": 3.0828729281767955, + "grad_norm": 0.2925138473510742, + "learning_rate": 8.103494644699696e-05, + "loss": 1.9306, + "step": 10044 + }, + { + "epoch": 3.083179864947821, + "grad_norm": 0.2894277274608612, + "learning_rate": 8.103104913114681e-05, + "loss": 1.7796, + 
"step": 10045 + }, + { + "epoch": 3.0834868017188457, + "grad_norm": 0.2776826322078705, + "learning_rate": 8.102715150862967e-05, + "loss": 1.8169, + "step": 10046 + }, + { + "epoch": 3.083793738489871, + "grad_norm": 0.3315230906009674, + "learning_rate": 8.102325357948402e-05, + "loss": 1.8139, + "step": 10047 + }, + { + "epoch": 3.0841006752608964, + "grad_norm": 0.2906761169433594, + "learning_rate": 8.10193553437484e-05, + "loss": 1.8162, + "step": 10048 + }, + { + "epoch": 3.0844076120319213, + "grad_norm": 0.32681339979171753, + "learning_rate": 8.101545680146132e-05, + "loss": 1.8245, + "step": 10049 + }, + { + "epoch": 3.0847145488029466, + "grad_norm": 0.32525795698165894, + "learning_rate": 8.101155795266131e-05, + "loss": 1.8605, + "step": 10050 + }, + { + "epoch": 3.085021485573972, + "grad_norm": 0.31705379486083984, + "learning_rate": 8.100765879738692e-05, + "loss": 1.8214, + "step": 10051 + }, + { + "epoch": 3.085328422344997, + "grad_norm": 0.27772918343544006, + "learning_rate": 8.100375933567668e-05, + "loss": 1.7822, + "step": 10052 + }, + { + "epoch": 3.085635359116022, + "grad_norm": 0.2877809405326843, + "learning_rate": 8.09998595675691e-05, + "loss": 1.7935, + "step": 10053 + }, + { + "epoch": 3.0859422958870475, + "grad_norm": 0.29759806394577026, + "learning_rate": 8.099595949310276e-05, + "loss": 1.8041, + "step": 10054 + }, + { + "epoch": 3.0862492326580724, + "grad_norm": 0.2715320289134979, + "learning_rate": 8.099205911231617e-05, + "loss": 1.7923, + "step": 10055 + }, + { + "epoch": 3.0865561694290977, + "grad_norm": 0.33566340804100037, + "learning_rate": 8.098815842524789e-05, + "loss": 1.7953, + "step": 10056 + }, + { + "epoch": 3.0868631062001226, + "grad_norm": 0.3360871970653534, + "learning_rate": 8.098425743193645e-05, + "loss": 1.8275, + "step": 10057 + }, + { + "epoch": 3.087170042971148, + "grad_norm": 0.2797739803791046, + "learning_rate": 8.098035613242043e-05, + "loss": 1.7597, + "step": 10058 + }, + { + "epoch": 
3.0874769797421733, + "grad_norm": 0.25500187277793884, + "learning_rate": 8.097645452673837e-05, + "loss": 1.8059, + "step": 10059 + }, + { + "epoch": 3.087783916513198, + "grad_norm": 0.28042587637901306, + "learning_rate": 8.097255261492884e-05, + "loss": 1.7954, + "step": 10060 + }, + { + "epoch": 3.0880908532842235, + "grad_norm": 0.3616262376308441, + "learning_rate": 8.096865039703038e-05, + "loss": 1.8605, + "step": 10061 + }, + { + "epoch": 3.0883977900552484, + "grad_norm": 0.3453714847564697, + "learning_rate": 8.096474787308157e-05, + "loss": 1.7643, + "step": 10062 + }, + { + "epoch": 3.0887047268262737, + "grad_norm": 0.3192278742790222, + "learning_rate": 8.096084504312098e-05, + "loss": 1.8415, + "step": 10063 + }, + { + "epoch": 3.089011663597299, + "grad_norm": 0.2714482545852661, + "learning_rate": 8.095694190718715e-05, + "loss": 1.8204, + "step": 10064 + }, + { + "epoch": 3.089318600368324, + "grad_norm": 0.26562005281448364, + "learning_rate": 8.09530384653187e-05, + "loss": 1.7322, + "step": 10065 + }, + { + "epoch": 3.0896255371393493, + "grad_norm": 0.33727800846099854, + "learning_rate": 8.094913471755417e-05, + "loss": 1.8221, + "step": 10066 + }, + { + "epoch": 3.0899324739103746, + "grad_norm": 0.3561044931411743, + "learning_rate": 8.094523066393215e-05, + "loss": 1.8879, + "step": 10067 + }, + { + "epoch": 3.0902394106813995, + "grad_norm": 0.2568742334842682, + "learning_rate": 8.094132630449122e-05, + "loss": 1.8178, + "step": 10068 + }, + { + "epoch": 3.090546347452425, + "grad_norm": 0.4025525450706482, + "learning_rate": 8.093742163926998e-05, + "loss": 1.8186, + "step": 10069 + }, + { + "epoch": 3.09085328422345, + "grad_norm": 0.43863433599472046, + "learning_rate": 8.0933516668307e-05, + "loss": 1.8371, + "step": 10070 + }, + { + "epoch": 3.091160220994475, + "grad_norm": 0.34873950481414795, + "learning_rate": 8.092961139164087e-05, + "loss": 1.8083, + "step": 10071 + }, + { + "epoch": 3.0914671577655004, + "grad_norm": 
0.31433534622192383, + "learning_rate": 8.092570580931021e-05, + "loss": 1.8154, + "step": 10072 + }, + { + "epoch": 3.0917740945365253, + "grad_norm": 0.25523966550827026, + "learning_rate": 8.092179992135358e-05, + "loss": 1.8158, + "step": 10073 + }, + { + "epoch": 3.0920810313075506, + "grad_norm": 0.348469078540802, + "learning_rate": 8.09178937278096e-05, + "loss": 1.8358, + "step": 10074 + }, + { + "epoch": 3.092387968078576, + "grad_norm": 0.33455297350883484, + "learning_rate": 8.091398722871688e-05, + "loss": 1.7779, + "step": 10075 + }, + { + "epoch": 3.092694904849601, + "grad_norm": 0.36544880270957947, + "learning_rate": 8.091008042411403e-05, + "loss": 1.9186, + "step": 10076 + }, + { + "epoch": 3.093001841620626, + "grad_norm": 0.29165831208229065, + "learning_rate": 8.090617331403965e-05, + "loss": 1.8964, + "step": 10077 + }, + { + "epoch": 3.0933087783916515, + "grad_norm": 0.31011059880256653, + "learning_rate": 8.090226589853234e-05, + "loss": 1.8453, + "step": 10078 + }, + { + "epoch": 3.0936157151626764, + "grad_norm": 0.2835703492164612, + "learning_rate": 8.089835817763071e-05, + "loss": 1.7718, + "step": 10079 + }, + { + "epoch": 3.0939226519337018, + "grad_norm": 0.2910583019256592, + "learning_rate": 8.08944501513734e-05, + "loss": 1.7881, + "step": 10080 + }, + { + "epoch": 3.0942295887047266, + "grad_norm": 0.391303688287735, + "learning_rate": 8.089054181979905e-05, + "loss": 1.7915, + "step": 10081 + }, + { + "epoch": 3.094536525475752, + "grad_norm": 0.4119330048561096, + "learning_rate": 8.088663318294623e-05, + "loss": 1.7975, + "step": 10082 + }, + { + "epoch": 3.0948434622467773, + "grad_norm": 0.2980102002620697, + "learning_rate": 8.088272424085361e-05, + "loss": 1.805, + "step": 10083 + }, + { + "epoch": 3.095150399017802, + "grad_norm": 0.3089980483055115, + "learning_rate": 8.087881499355983e-05, + "loss": 1.8265, + "step": 10084 + }, + { + "epoch": 3.0954573357888275, + "grad_norm": 0.3851003348827362, + "learning_rate": 
8.087490544110348e-05, + "loss": 1.8174, + "step": 10085 + }, + { + "epoch": 3.095764272559853, + "grad_norm": 0.42357420921325684, + "learning_rate": 8.08709955835232e-05, + "loss": 1.8083, + "step": 10086 + }, + { + "epoch": 3.0960712093308778, + "grad_norm": 0.291777640581131, + "learning_rate": 8.086708542085768e-05, + "loss": 1.7713, + "step": 10087 + }, + { + "epoch": 3.096378146101903, + "grad_norm": 0.2563805878162384, + "learning_rate": 8.086317495314552e-05, + "loss": 1.7691, + "step": 10088 + }, + { + "epoch": 3.096685082872928, + "grad_norm": 0.3418877422809601, + "learning_rate": 8.085926418042536e-05, + "loss": 1.8547, + "step": 10089 + }, + { + "epoch": 3.0969920196439533, + "grad_norm": 0.3859385550022125, + "learning_rate": 8.085535310273589e-05, + "loss": 1.8226, + "step": 10090 + }, + { + "epoch": 3.0972989564149787, + "grad_norm": 0.3427267372608185, + "learning_rate": 8.085144172011571e-05, + "loss": 1.837, + "step": 10091 + }, + { + "epoch": 3.0976058931860035, + "grad_norm": 0.29290953278541565, + "learning_rate": 8.084753003260352e-05, + "loss": 1.8392, + "step": 10092 + }, + { + "epoch": 3.097912829957029, + "grad_norm": 0.33282020688056946, + "learning_rate": 8.084361804023795e-05, + "loss": 1.8351, + "step": 10093 + }, + { + "epoch": 3.098219766728054, + "grad_norm": 0.3802134394645691, + "learning_rate": 8.083970574305768e-05, + "loss": 1.7467, + "step": 10094 + }, + { + "epoch": 3.098526703499079, + "grad_norm": 0.3142111897468567, + "learning_rate": 8.083579314110135e-05, + "loss": 1.7966, + "step": 10095 + }, + { + "epoch": 3.0988336402701044, + "grad_norm": 0.2956278324127197, + "learning_rate": 8.083188023440765e-05, + "loss": 1.8724, + "step": 10096 + }, + { + "epoch": 3.0991405770411293, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.082796702301522e-05, + "loss": 1.8448, + "step": 10097 + }, + { + "epoch": 3.0994475138121547, + "grad_norm": 0.29358017444610596, + "learning_rate": 8.082405350696276e-05, + "loss": 1.8679, 
+ "step": 10098 + }, + { + "epoch": 3.09975445058318, + "grad_norm": 0.36439722776412964, + "learning_rate": 8.082013968628893e-05, + "loss": 1.8801, + "step": 10099 + }, + { + "epoch": 3.100061387354205, + "grad_norm": 0.3565322458744049, + "learning_rate": 8.081622556103244e-05, + "loss": 1.794, + "step": 10100 + }, + { + "epoch": 3.1003683241252302, + "grad_norm": 0.2841760814189911, + "learning_rate": 8.081231113123191e-05, + "loss": 1.7593, + "step": 10101 + }, + { + "epoch": 3.1006752608962556, + "grad_norm": 0.28589630126953125, + "learning_rate": 8.080839639692608e-05, + "loss": 1.864, + "step": 10102 + }, + { + "epoch": 3.1009821976672804, + "grad_norm": 0.3595057427883148, + "learning_rate": 8.080448135815362e-05, + "loss": 1.8067, + "step": 10103 + }, + { + "epoch": 3.101289134438306, + "grad_norm": 0.3909708261489868, + "learning_rate": 8.080056601495322e-05, + "loss": 1.8601, + "step": 10104 + }, + { + "epoch": 3.1015960712093307, + "grad_norm": 0.35180148482322693, + "learning_rate": 8.079665036736358e-05, + "loss": 1.8328, + "step": 10105 + }, + { + "epoch": 3.101903007980356, + "grad_norm": 0.3065175712108612, + "learning_rate": 8.079273441542338e-05, + "loss": 1.8449, + "step": 10106 + }, + { + "epoch": 3.1022099447513813, + "grad_norm": 0.31358617544174194, + "learning_rate": 8.078881815917134e-05, + "loss": 1.8325, + "step": 10107 + }, + { + "epoch": 3.1025168815224062, + "grad_norm": 0.4737118184566498, + "learning_rate": 8.078490159864614e-05, + "loss": 1.8232, + "step": 10108 + }, + { + "epoch": 3.1028238182934316, + "grad_norm": 0.435148686170578, + "learning_rate": 8.078098473388651e-05, + "loss": 1.8227, + "step": 10109 + }, + { + "epoch": 3.103130755064457, + "grad_norm": 0.3080987334251404, + "learning_rate": 8.077706756493115e-05, + "loss": 1.8072, + "step": 10110 + }, + { + "epoch": 3.103437691835482, + "grad_norm": 0.3225170075893402, + "learning_rate": 8.077315009181876e-05, + "loss": 1.7716, + "step": 10111 + }, + { + "epoch": 
3.103744628606507, + "grad_norm": 0.46642443537712097, + "learning_rate": 8.076923231458808e-05, + "loss": 1.8295, + "step": 10112 + }, + { + "epoch": 3.104051565377532, + "grad_norm": 0.42561766505241394, + "learning_rate": 8.07653142332778e-05, + "loss": 1.8553, + "step": 10113 + }, + { + "epoch": 3.1043585021485574, + "grad_norm": 0.27187541127204895, + "learning_rate": 8.076139584792664e-05, + "loss": 1.7937, + "step": 10114 + }, + { + "epoch": 3.1046654389195827, + "grad_norm": 0.27822238206863403, + "learning_rate": 8.075747715857335e-05, + "loss": 1.8151, + "step": 10115 + }, + { + "epoch": 3.1049723756906076, + "grad_norm": 0.40106478333473206, + "learning_rate": 8.075355816525665e-05, + "loss": 1.8637, + "step": 10116 + }, + { + "epoch": 3.105279312461633, + "grad_norm": 0.33455124497413635, + "learning_rate": 8.074963886801525e-05, + "loss": 1.8543, + "step": 10117 + }, + { + "epoch": 3.1055862492326582, + "grad_norm": 0.32246437668800354, + "learning_rate": 8.07457192668879e-05, + "loss": 1.7907, + "step": 10118 + }, + { + "epoch": 3.105893186003683, + "grad_norm": 0.45360109210014343, + "learning_rate": 8.074179936191332e-05, + "loss": 1.7404, + "step": 10119 + }, + { + "epoch": 3.1062001227747085, + "grad_norm": 0.445916086435318, + "learning_rate": 8.07378791531303e-05, + "loss": 1.778, + "step": 10120 + }, + { + "epoch": 3.1065070595457334, + "grad_norm": 0.28561538457870483, + "learning_rate": 8.073395864057751e-05, + "loss": 1.8723, + "step": 10121 + }, + { + "epoch": 3.1068139963167587, + "grad_norm": 0.3258218467235565, + "learning_rate": 8.073003782429373e-05, + "loss": 1.8106, + "step": 10122 + }, + { + "epoch": 3.107120933087784, + "grad_norm": 0.5459560751914978, + "learning_rate": 8.07261167043177e-05, + "loss": 1.8022, + "step": 10123 + }, + { + "epoch": 3.107427869858809, + "grad_norm": 0.4828549921512604, + "learning_rate": 8.072219528068819e-05, + "loss": 1.7556, + "step": 10124 + }, + { + "epoch": 3.1077348066298343, + "grad_norm": 
0.24075324833393097, + "learning_rate": 8.071827355344393e-05, + "loss": 1.7901, + "step": 10125 + }, + { + "epoch": 3.1080417434008596, + "grad_norm": 0.44677188992500305, + "learning_rate": 8.071435152262367e-05, + "loss": 1.7858, + "step": 10126 + }, + { + "epoch": 3.1083486801718845, + "grad_norm": 0.49862590432167053, + "learning_rate": 8.071042918826622e-05, + "loss": 1.805, + "step": 10127 + }, + { + "epoch": 3.10865561694291, + "grad_norm": 0.30883491039276123, + "learning_rate": 8.07065065504103e-05, + "loss": 1.7693, + "step": 10128 + }, + { + "epoch": 3.108962553713935, + "grad_norm": 0.29583030939102173, + "learning_rate": 8.070258360909467e-05, + "loss": 1.8141, + "step": 10129 + }, + { + "epoch": 3.10926949048496, + "grad_norm": 0.3595346510410309, + "learning_rate": 8.069866036435812e-05, + "loss": 1.8286, + "step": 10130 + }, + { + "epoch": 3.1095764272559854, + "grad_norm": 0.3215504288673401, + "learning_rate": 8.069473681623942e-05, + "loss": 1.8557, + "step": 10131 + }, + { + "epoch": 3.1098833640270103, + "grad_norm": 0.29734939336776733, + "learning_rate": 8.069081296477734e-05, + "loss": 1.7996, + "step": 10132 + }, + { + "epoch": 3.1101903007980356, + "grad_norm": 0.33546003699302673, + "learning_rate": 8.068688881001065e-05, + "loss": 1.8307, + "step": 10133 + }, + { + "epoch": 3.110497237569061, + "grad_norm": 0.3886832296848297, + "learning_rate": 8.068296435197814e-05, + "loss": 1.751, + "step": 10134 + }, + { + "epoch": 3.110804174340086, + "grad_norm": 0.34505394101142883, + "learning_rate": 8.06790395907186e-05, + "loss": 1.7543, + "step": 10135 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.27018141746520996, + "learning_rate": 8.06751145262708e-05, + "loss": 1.8109, + "step": 10136 + }, + { + "epoch": 3.1114180478821365, + "grad_norm": 0.3367149531841278, + "learning_rate": 8.067118915867355e-05, + "loss": 1.8025, + "step": 10137 + }, + { + "epoch": 3.1117249846531614, + "grad_norm": 0.40811091661453247, + "learning_rate": 
8.066726348796562e-05, + "loss": 1.7327, + "step": 10138 + }, + { + "epoch": 3.1120319214241867, + "grad_norm": 0.3511471152305603, + "learning_rate": 8.066333751418583e-05, + "loss": 1.8711, + "step": 10139 + }, + { + "epoch": 3.1123388581952116, + "grad_norm": 0.3112446367740631, + "learning_rate": 8.065941123737295e-05, + "loss": 1.8621, + "step": 10140 + }, + { + "epoch": 3.112645794966237, + "grad_norm": 0.3424238860607147, + "learning_rate": 8.065548465756581e-05, + "loss": 1.8383, + "step": 10141 + }, + { + "epoch": 3.1129527317372623, + "grad_norm": 0.380013108253479, + "learning_rate": 8.06515577748032e-05, + "loss": 1.8121, + "step": 10142 + }, + { + "epoch": 3.113259668508287, + "grad_norm": 0.2650558650493622, + "learning_rate": 8.064763058912393e-05, + "loss": 1.866, + "step": 10143 + }, + { + "epoch": 3.1135666052793125, + "grad_norm": 0.30580762028694153, + "learning_rate": 8.06437031005668e-05, + "loss": 1.7769, + "step": 10144 + }, + { + "epoch": 3.113873542050338, + "grad_norm": 0.29927194118499756, + "learning_rate": 8.063977530917066e-05, + "loss": 1.7897, + "step": 10145 + }, + { + "epoch": 3.1141804788213627, + "grad_norm": 0.24322012066841125, + "learning_rate": 8.063584721497429e-05, + "loss": 1.7968, + "step": 10146 + }, + { + "epoch": 3.114487415592388, + "grad_norm": 0.3082945644855499, + "learning_rate": 8.063191881801651e-05, + "loss": 1.8456, + "step": 10147 + }, + { + "epoch": 3.114794352363413, + "grad_norm": 0.3247329890727997, + "learning_rate": 8.062799011833617e-05, + "loss": 1.7436, + "step": 10148 + }, + { + "epoch": 3.1151012891344383, + "grad_norm": 0.27591946721076965, + "learning_rate": 8.062406111597207e-05, + "loss": 1.7976, + "step": 10149 + }, + { + "epoch": 3.1154082259054636, + "grad_norm": 0.2752058804035187, + "learning_rate": 8.062013181096306e-05, + "loss": 1.7814, + "step": 10150 + }, + { + "epoch": 3.1157151626764885, + "grad_norm": 0.3207196891307831, + "learning_rate": 8.061620220334795e-05, + "loss": 1.7767, 
+ "step": 10151 + }, + { + "epoch": 3.116022099447514, + "grad_norm": 0.2895309627056122, + "learning_rate": 8.061227229316559e-05, + "loss": 1.8588, + "step": 10152 + }, + { + "epoch": 3.116329036218539, + "grad_norm": 0.333843469619751, + "learning_rate": 8.060834208045481e-05, + "loss": 1.7871, + "step": 10153 + }, + { + "epoch": 3.116635972989564, + "grad_norm": 0.43877774477005005, + "learning_rate": 8.060441156525445e-05, + "loss": 1.8165, + "step": 10154 + }, + { + "epoch": 3.1169429097605894, + "grad_norm": 0.35700589418411255, + "learning_rate": 8.060048074760337e-05, + "loss": 1.777, + "step": 10155 + }, + { + "epoch": 3.1172498465316143, + "grad_norm": 0.26124534010887146, + "learning_rate": 8.059654962754039e-05, + "loss": 1.8343, + "step": 10156 + }, + { + "epoch": 3.1175567833026396, + "grad_norm": 0.331444650888443, + "learning_rate": 8.059261820510438e-05, + "loss": 1.9437, + "step": 10157 + }, + { + "epoch": 3.117863720073665, + "grad_norm": 0.31657731533050537, + "learning_rate": 8.058868648033419e-05, + "loss": 1.7621, + "step": 10158 + }, + { + "epoch": 3.11817065684469, + "grad_norm": 0.2785957455635071, + "learning_rate": 8.058475445326867e-05, + "loss": 1.9049, + "step": 10159 + }, + { + "epoch": 3.118477593615715, + "grad_norm": 0.2605743408203125, + "learning_rate": 8.058082212394667e-05, + "loss": 1.7895, + "step": 10160 + }, + { + "epoch": 3.1187845303867405, + "grad_norm": 0.2981378138065338, + "learning_rate": 8.057688949240707e-05, + "loss": 1.8373, + "step": 10161 + }, + { + "epoch": 3.1190914671577654, + "grad_norm": 0.2944273054599762, + "learning_rate": 8.057295655868873e-05, + "loss": 1.8373, + "step": 10162 + }, + { + "epoch": 3.1193984039287908, + "grad_norm": 0.2696721851825714, + "learning_rate": 8.056902332283052e-05, + "loss": 1.8023, + "step": 10163 + }, + { + "epoch": 3.1197053406998156, + "grad_norm": 0.27659857273101807, + "learning_rate": 8.056508978487128e-05, + "loss": 1.8453, + "step": 10164 + }, + { + "epoch": 
3.120012277470841, + "grad_norm": 0.2982441186904907, + "learning_rate": 8.056115594484992e-05, + "loss": 1.9072, + "step": 10165 + }, + { + "epoch": 3.1203192142418663, + "grad_norm": 0.3136404752731323, + "learning_rate": 8.055722180280531e-05, + "loss": 1.8585, + "step": 10166 + }, + { + "epoch": 3.120626151012891, + "grad_norm": 0.2979940176010132, + "learning_rate": 8.055328735877631e-05, + "loss": 1.8699, + "step": 10167 + }, + { + "epoch": 3.1209330877839165, + "grad_norm": 0.2585618793964386, + "learning_rate": 8.054935261280184e-05, + "loss": 1.8323, + "step": 10168 + }, + { + "epoch": 3.121240024554942, + "grad_norm": 0.28734859824180603, + "learning_rate": 8.054541756492075e-05, + "loss": 1.8694, + "step": 10169 + }, + { + "epoch": 3.1215469613259668, + "grad_norm": 0.30582788586616516, + "learning_rate": 8.054148221517193e-05, + "loss": 1.856, + "step": 10170 + }, + { + "epoch": 3.121853898096992, + "grad_norm": 0.3128255009651184, + "learning_rate": 8.053754656359429e-05, + "loss": 1.8329, + "step": 10171 + }, + { + "epoch": 3.122160834868017, + "grad_norm": 0.2845318615436554, + "learning_rate": 8.053361061022671e-05, + "loss": 1.8111, + "step": 10172 + }, + { + "epoch": 3.1224677716390423, + "grad_norm": 0.2994609773159027, + "learning_rate": 8.05296743551081e-05, + "loss": 1.8157, + "step": 10173 + }, + { + "epoch": 3.1227747084100677, + "grad_norm": 0.26397961378097534, + "learning_rate": 8.052573779827737e-05, + "loss": 1.8572, + "step": 10174 + }, + { + "epoch": 3.1230816451810925, + "grad_norm": 0.2911500334739685, + "learning_rate": 8.052180093977339e-05, + "loss": 1.8312, + "step": 10175 + }, + { + "epoch": 3.123388581952118, + "grad_norm": 0.33455008268356323, + "learning_rate": 8.051786377963509e-05, + "loss": 1.8748, + "step": 10176 + }, + { + "epoch": 3.123695518723143, + "grad_norm": 0.3127586841583252, + "learning_rate": 8.051392631790135e-05, + "loss": 1.8224, + "step": 10177 + }, + { + "epoch": 3.124002455494168, + "grad_norm": 
0.2910686433315277, + "learning_rate": 8.050998855461113e-05, + "loss": 1.8557, + "step": 10178 + }, + { + "epoch": 3.1243093922651934, + "grad_norm": 0.2849208414554596, + "learning_rate": 8.050605048980333e-05, + "loss": 1.82, + "step": 10179 + }, + { + "epoch": 3.1246163290362183, + "grad_norm": 0.35189691185951233, + "learning_rate": 8.050211212351683e-05, + "loss": 1.7884, + "step": 10180 + }, + { + "epoch": 3.1249232658072437, + "grad_norm": 0.3641110360622406, + "learning_rate": 8.04981734557906e-05, + "loss": 1.7984, + "step": 10181 + }, + { + "epoch": 3.125230202578269, + "grad_norm": 0.3111717700958252, + "learning_rate": 8.049423448666353e-05, + "loss": 1.8134, + "step": 10182 + }, + { + "epoch": 3.125537139349294, + "grad_norm": 0.2608453631401062, + "learning_rate": 8.049029521617457e-05, + "loss": 1.765, + "step": 10183 + }, + { + "epoch": 3.1258440761203192, + "grad_norm": 0.28779423236846924, + "learning_rate": 8.048635564436265e-05, + "loss": 1.8355, + "step": 10184 + }, + { + "epoch": 3.1261510128913446, + "grad_norm": 0.38227665424346924, + "learning_rate": 8.048241577126668e-05, + "loss": 1.8487, + "step": 10185 + }, + { + "epoch": 3.1264579496623695, + "grad_norm": 0.3603171706199646, + "learning_rate": 8.047847559692562e-05, + "loss": 1.8035, + "step": 10186 + }, + { + "epoch": 3.126764886433395, + "grad_norm": 0.21950066089630127, + "learning_rate": 8.04745351213784e-05, + "loss": 1.7399, + "step": 10187 + }, + { + "epoch": 3.12707182320442, + "grad_norm": 0.2796075642108917, + "learning_rate": 8.047059434466395e-05, + "loss": 1.8229, + "step": 10188 + }, + { + "epoch": 3.127378759975445, + "grad_norm": 0.3382907807826996, + "learning_rate": 8.046665326682125e-05, + "loss": 1.7713, + "step": 10189 + }, + { + "epoch": 3.1276856967464703, + "grad_norm": 0.36472463607788086, + "learning_rate": 8.04627118878892e-05, + "loss": 1.8129, + "step": 10190 + }, + { + "epoch": 3.1279926335174952, + "grad_norm": 0.2971884310245514, + "learning_rate": 
8.045877020790679e-05, + "loss": 1.7894, + "step": 10191 + }, + { + "epoch": 3.1282995702885206, + "grad_norm": 0.2292303442955017, + "learning_rate": 8.045482822691297e-05, + "loss": 1.7637, + "step": 10192 + }, + { + "epoch": 3.128606507059546, + "grad_norm": 0.300750732421875, + "learning_rate": 8.045088594494668e-05, + "loss": 1.7678, + "step": 10193 + }, + { + "epoch": 3.128913443830571, + "grad_norm": 0.3121531009674072, + "learning_rate": 8.044694336204688e-05, + "loss": 1.8651, + "step": 10194 + }, + { + "epoch": 3.129220380601596, + "grad_norm": 0.2456093430519104, + "learning_rate": 8.044300047825254e-05, + "loss": 1.7769, + "step": 10195 + }, + { + "epoch": 3.129527317372621, + "grad_norm": 0.25085800886154175, + "learning_rate": 8.043905729360264e-05, + "loss": 1.7723, + "step": 10196 + }, + { + "epoch": 3.1298342541436464, + "grad_norm": 0.2505287826061249, + "learning_rate": 8.043511380813612e-05, + "loss": 1.7943, + "step": 10197 + }, + { + "epoch": 3.1301411909146717, + "grad_norm": 0.27144530415534973, + "learning_rate": 8.043117002189198e-05, + "loss": 1.8119, + "step": 10198 + }, + { + "epoch": 3.1304481276856966, + "grad_norm": 0.2702989876270294, + "learning_rate": 8.042722593490916e-05, + "loss": 1.8517, + "step": 10199 + }, + { + "epoch": 3.130755064456722, + "grad_norm": 0.2585136890411377, + "learning_rate": 8.042328154722667e-05, + "loss": 1.8382, + "step": 10200 + }, + { + "epoch": 3.1310620012277472, + "grad_norm": 0.26306065917015076, + "learning_rate": 8.041933685888348e-05, + "loss": 1.8211, + "step": 10201 + }, + { + "epoch": 3.131368937998772, + "grad_norm": 0.2208927720785141, + "learning_rate": 8.041539186991858e-05, + "loss": 1.7765, + "step": 10202 + }, + { + "epoch": 3.1316758747697975, + "grad_norm": 0.2756440043449402, + "learning_rate": 8.041144658037095e-05, + "loss": 1.898, + "step": 10203 + }, + { + "epoch": 3.131982811540823, + "grad_norm": 0.29718101024627686, + "learning_rate": 8.040750099027958e-05, + "loss": 1.8226, 
+ "step": 10204 + }, + { + "epoch": 3.1322897483118477, + "grad_norm": 0.3166738748550415, + "learning_rate": 8.040355509968345e-05, + "loss": 1.8129, + "step": 10205 + }, + { + "epoch": 3.132596685082873, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.039960890862158e-05, + "loss": 1.8915, + "step": 10206 + }, + { + "epoch": 3.132903621853898, + "grad_norm": 0.3015006184577942, + "learning_rate": 8.039566241713297e-05, + "loss": 1.8389, + "step": 10207 + }, + { + "epoch": 3.1332105586249233, + "grad_norm": 0.35226619243621826, + "learning_rate": 8.039171562525659e-05, + "loss": 1.7287, + "step": 10208 + }, + { + "epoch": 3.1335174953959486, + "grad_norm": 0.4290136694908142, + "learning_rate": 8.038776853303146e-05, + "loss": 1.8768, + "step": 10209 + }, + { + "epoch": 3.1338244321669735, + "grad_norm": 0.2828960418701172, + "learning_rate": 8.03838211404966e-05, + "loss": 1.7552, + "step": 10210 + }, + { + "epoch": 3.134131368937999, + "grad_norm": 0.3781953752040863, + "learning_rate": 8.0379873447691e-05, + "loss": 1.7812, + "step": 10211 + }, + { + "epoch": 3.1344383057090237, + "grad_norm": 0.4282926023006439, + "learning_rate": 8.037592545465371e-05, + "loss": 1.84, + "step": 10212 + }, + { + "epoch": 3.134745242480049, + "grad_norm": 0.2622411251068115, + "learning_rate": 8.03719771614237e-05, + "loss": 1.8114, + "step": 10213 + }, + { + "epoch": 3.1350521792510744, + "grad_norm": 0.34881457686424255, + "learning_rate": 8.036802856804001e-05, + "loss": 1.7694, + "step": 10214 + }, + { + "epoch": 3.1353591160220993, + "grad_norm": 0.40797632932662964, + "learning_rate": 8.036407967454167e-05, + "loss": 1.7595, + "step": 10215 + }, + { + "epoch": 3.1356660527931246, + "grad_norm": 0.24902814626693726, + "learning_rate": 8.036013048096769e-05, + "loss": 1.8068, + "step": 10216 + }, + { + "epoch": 3.13597298956415, + "grad_norm": 0.3682909607887268, + "learning_rate": 8.035618098735711e-05, + "loss": 1.8519, + "step": 10217 + }, + { + "epoch": 
3.136279926335175, + "grad_norm": 0.6111233234405518, + "learning_rate": 8.035223119374895e-05, + "loss": 1.9254, + "step": 10218 + }, + { + "epoch": 3.1365868631062, + "grad_norm": 0.4793062210083008, + "learning_rate": 8.034828110018227e-05, + "loss": 1.786, + "step": 10219 + }, + { + "epoch": 3.1368937998772255, + "grad_norm": 0.3074932396411896, + "learning_rate": 8.034433070669607e-05, + "loss": 1.8495, + "step": 10220 + }, + { + "epoch": 3.1372007366482504, + "grad_norm": 0.4366479218006134, + "learning_rate": 8.034038001332942e-05, + "loss": 1.8501, + "step": 10221 + }, + { + "epoch": 3.1375076734192757, + "grad_norm": 0.4660070538520813, + "learning_rate": 8.033642902012135e-05, + "loss": 1.8317, + "step": 10222 + }, + { + "epoch": 3.1378146101903006, + "grad_norm": 0.3452899158000946, + "learning_rate": 8.03324777271109e-05, + "loss": 1.8702, + "step": 10223 + }, + { + "epoch": 3.138121546961326, + "grad_norm": 0.3658824563026428, + "learning_rate": 8.032852613433713e-05, + "loss": 1.8754, + "step": 10224 + }, + { + "epoch": 3.1384284837323513, + "grad_norm": 0.3777768909931183, + "learning_rate": 8.03245742418391e-05, + "loss": 1.8613, + "step": 10225 + }, + { + "epoch": 3.138735420503376, + "grad_norm": 0.3873192071914673, + "learning_rate": 8.032062204965582e-05, + "loss": 1.8438, + "step": 10226 + }, + { + "epoch": 3.1390423572744015, + "grad_norm": 0.30686715245246887, + "learning_rate": 8.031666955782641e-05, + "loss": 1.811, + "step": 10227 + }, + { + "epoch": 3.139349294045427, + "grad_norm": 0.2738516330718994, + "learning_rate": 8.03127167663899e-05, + "loss": 1.757, + "step": 10228 + }, + { + "epoch": 3.1396562308164517, + "grad_norm": 0.3093133270740509, + "learning_rate": 8.030876367538536e-05, + "loss": 1.8181, + "step": 10229 + }, + { + "epoch": 3.139963167587477, + "grad_norm": 0.3247159719467163, + "learning_rate": 8.030481028485185e-05, + "loss": 1.7798, + "step": 10230 + }, + { + "epoch": 3.140270104358502, + "grad_norm": 
0.2855088412761688, + "learning_rate": 8.030085659482845e-05, + "loss": 1.825, + "step": 10231 + }, + { + "epoch": 3.1405770411295273, + "grad_norm": 0.2818242907524109, + "learning_rate": 8.02969026053542e-05, + "loss": 1.7737, + "step": 10232 + }, + { + "epoch": 3.1408839779005526, + "grad_norm": 0.27074751257896423, + "learning_rate": 8.029294831646822e-05, + "loss": 1.8306, + "step": 10233 + }, + { + "epoch": 3.1411909146715775, + "grad_norm": 0.29740920662879944, + "learning_rate": 8.028899372820954e-05, + "loss": 1.8157, + "step": 10234 + }, + { + "epoch": 3.141497851442603, + "grad_norm": 0.30743202567100525, + "learning_rate": 8.028503884061731e-05, + "loss": 1.7626, + "step": 10235 + }, + { + "epoch": 3.141804788213628, + "grad_norm": 0.27812567353248596, + "learning_rate": 8.028108365373058e-05, + "loss": 1.7604, + "step": 10236 + }, + { + "epoch": 3.142111724984653, + "grad_norm": 0.26212629675865173, + "learning_rate": 8.027712816758839e-05, + "loss": 1.8161, + "step": 10237 + }, + { + "epoch": 3.1424186617556784, + "grad_norm": 0.3611658811569214, + "learning_rate": 8.02731723822299e-05, + "loss": 1.8283, + "step": 10238 + }, + { + "epoch": 3.1427255985267033, + "grad_norm": 0.31705498695373535, + "learning_rate": 8.026921629769418e-05, + "loss": 1.7986, + "step": 10239 + }, + { + "epoch": 3.1430325352977286, + "grad_norm": 0.25905972719192505, + "learning_rate": 8.026525991402032e-05, + "loss": 1.7926, + "step": 10240 + }, + { + "epoch": 3.143339472068754, + "grad_norm": 0.42376595735549927, + "learning_rate": 8.026130323124741e-05, + "loss": 1.8275, + "step": 10241 + }, + { + "epoch": 3.143646408839779, + "grad_norm": 0.415556401014328, + "learning_rate": 8.025734624941458e-05, + "loss": 1.7938, + "step": 10242 + }, + { + "epoch": 3.143953345610804, + "grad_norm": 0.3558904528617859, + "learning_rate": 8.025338896856091e-05, + "loss": 1.836, + "step": 10243 + }, + { + "epoch": 3.1442602823818295, + "grad_norm": 0.3091062307357788, + "learning_rate": 
8.024943138872553e-05, + "loss": 1.8285, + "step": 10244 + }, + { + "epoch": 3.1445672191528544, + "grad_norm": 0.2620905041694641, + "learning_rate": 8.024547350994753e-05, + "loss": 1.7115, + "step": 10245 + }, + { + "epoch": 3.1448741559238798, + "grad_norm": 0.25716835260391235, + "learning_rate": 8.024151533226604e-05, + "loss": 1.7702, + "step": 10246 + }, + { + "epoch": 3.1451810926949046, + "grad_norm": 0.250844269990921, + "learning_rate": 8.023755685572017e-05, + "loss": 1.7617, + "step": 10247 + }, + { + "epoch": 3.14548802946593, + "grad_norm": 0.23898956179618835, + "learning_rate": 8.023359808034903e-05, + "loss": 1.7872, + "step": 10248 + }, + { + "epoch": 3.1457949662369553, + "grad_norm": 0.2335387021303177, + "learning_rate": 8.022963900619176e-05, + "loss": 1.7656, + "step": 10249 + }, + { + "epoch": 3.14610190300798, + "grad_norm": 0.21822704374790192, + "learning_rate": 8.022567963328749e-05, + "loss": 1.7706, + "step": 10250 + }, + { + "epoch": 3.1464088397790055, + "grad_norm": 0.2627898156642914, + "learning_rate": 8.022171996167531e-05, + "loss": 1.8559, + "step": 10251 + }, + { + "epoch": 3.146715776550031, + "grad_norm": 0.2530064582824707, + "learning_rate": 8.021775999139441e-05, + "loss": 1.788, + "step": 10252 + }, + { + "epoch": 3.1470227133210558, + "grad_norm": 0.2293635457754135, + "learning_rate": 8.021379972248387e-05, + "loss": 1.8129, + "step": 10253 + }, + { + "epoch": 3.147329650092081, + "grad_norm": 0.27753588557243347, + "learning_rate": 8.020983915498286e-05, + "loss": 1.7957, + "step": 10254 + }, + { + "epoch": 3.147636586863106, + "grad_norm": 0.24507668614387512, + "learning_rate": 8.020587828893051e-05, + "loss": 1.7969, + "step": 10255 + }, + { + "epoch": 3.1479435236341313, + "grad_norm": 0.24818891286849976, + "learning_rate": 8.020191712436598e-05, + "loss": 1.8412, + "step": 10256 + }, + { + "epoch": 3.1482504604051567, + "grad_norm": 0.2463149130344391, + "learning_rate": 8.01979556613284e-05, + "loss": 1.8097, 
+ "step": 10257 + }, + { + "epoch": 3.1485573971761815, + "grad_norm": 0.26742151379585266, + "learning_rate": 8.019399389985692e-05, + "loss": 1.8487, + "step": 10258 + }, + { + "epoch": 3.148864333947207, + "grad_norm": 0.3078254461288452, + "learning_rate": 8.01900318399907e-05, + "loss": 1.8189, + "step": 10259 + }, + { + "epoch": 3.149171270718232, + "grad_norm": 0.3819321393966675, + "learning_rate": 8.018606948176887e-05, + "loss": 1.8019, + "step": 10260 + }, + { + "epoch": 3.149478207489257, + "grad_norm": 0.3932126462459564, + "learning_rate": 8.018210682523061e-05, + "loss": 1.787, + "step": 10261 + }, + { + "epoch": 3.1497851442602824, + "grad_norm": 0.2696186900138855, + "learning_rate": 8.017814387041511e-05, + "loss": 1.8345, + "step": 10262 + }, + { + "epoch": 3.150092081031308, + "grad_norm": 0.32631832361221313, + "learning_rate": 8.017418061736149e-05, + "loss": 1.7724, + "step": 10263 + }, + { + "epoch": 3.1503990178023327, + "grad_norm": 0.36187833547592163, + "learning_rate": 8.017021706610893e-05, + "loss": 1.7829, + "step": 10264 + }, + { + "epoch": 3.150705954573358, + "grad_norm": 0.29678142070770264, + "learning_rate": 8.01662532166966e-05, + "loss": 1.7896, + "step": 10265 + }, + { + "epoch": 3.151012891344383, + "grad_norm": 0.2997078001499176, + "learning_rate": 8.016228906916368e-05, + "loss": 1.8401, + "step": 10266 + }, + { + "epoch": 3.1513198281154082, + "grad_norm": 0.4688792824745178, + "learning_rate": 8.015832462354933e-05, + "loss": 1.8263, + "step": 10267 + }, + { + "epoch": 3.1516267648864336, + "grad_norm": 0.42710503935813904, + "learning_rate": 8.015435987989275e-05, + "loss": 1.8233, + "step": 10268 + }, + { + "epoch": 3.1519337016574585, + "grad_norm": 0.2490987628698349, + "learning_rate": 8.01503948382331e-05, + "loss": 1.7792, + "step": 10269 + }, + { + "epoch": 3.152240638428484, + "grad_norm": 0.400836706161499, + "learning_rate": 8.014642949860957e-05, + "loss": 1.8113, + "step": 10270 + }, + { + "epoch": 
3.1525475751995087, + "grad_norm": 0.47995972633361816, + "learning_rate": 8.014246386106138e-05, + "loss": 1.8754, + "step": 10271 + }, + { + "epoch": 3.152854511970534, + "grad_norm": 0.39069879055023193, + "learning_rate": 8.013849792562769e-05, + "loss": 1.8541, + "step": 10272 + }, + { + "epoch": 3.1531614487415593, + "grad_norm": 0.27174463868141174, + "learning_rate": 8.013453169234768e-05, + "loss": 1.8018, + "step": 10273 + }, + { + "epoch": 3.1534683855125842, + "grad_norm": 0.37808045744895935, + "learning_rate": 8.013056516126058e-05, + "loss": 1.8346, + "step": 10274 + }, + { + "epoch": 3.1537753222836096, + "grad_norm": 0.43864908814430237, + "learning_rate": 8.012659833240557e-05, + "loss": 1.7626, + "step": 10275 + }, + { + "epoch": 3.154082259054635, + "grad_norm": 0.3592168688774109, + "learning_rate": 8.012263120582187e-05, + "loss": 1.8261, + "step": 10276 + }, + { + "epoch": 3.15438919582566, + "grad_norm": 0.3056562542915344, + "learning_rate": 8.011866378154866e-05, + "loss": 1.903, + "step": 10277 + }, + { + "epoch": 3.154696132596685, + "grad_norm": 0.2898549735546112, + "learning_rate": 8.011469605962517e-05, + "loss": 1.7781, + "step": 10278 + }, + { + "epoch": 3.1550030693677105, + "grad_norm": 0.3498871624469757, + "learning_rate": 8.011072804009059e-05, + "loss": 1.7571, + "step": 10279 + }, + { + "epoch": 3.1553100061387354, + "grad_norm": 0.3330932557582855, + "learning_rate": 8.010675972298416e-05, + "loss": 1.8298, + "step": 10280 + }, + { + "epoch": 3.1556169429097607, + "grad_norm": 0.2540839910507202, + "learning_rate": 8.010279110834507e-05, + "loss": 1.8327, + "step": 10281 + }, + { + "epoch": 3.1559238796807856, + "grad_norm": 0.3557111322879791, + "learning_rate": 8.009882219621257e-05, + "loss": 1.7611, + "step": 10282 + }, + { + "epoch": 3.156230816451811, + "grad_norm": 0.28293952345848083, + "learning_rate": 8.009485298662584e-05, + "loss": 1.7761, + "step": 10283 + }, + { + "epoch": 3.1565377532228363, + "grad_norm": 
0.27089303731918335, + "learning_rate": 8.009088347962416e-05, + "loss": 1.8081, + "step": 10284 + }, + { + "epoch": 3.156844689993861, + "grad_norm": 0.2689332664012909, + "learning_rate": 8.008691367524673e-05, + "loss": 1.7458, + "step": 10285 + }, + { + "epoch": 3.1571516267648865, + "grad_norm": 0.2495841234922409, + "learning_rate": 8.008294357353278e-05, + "loss": 1.8307, + "step": 10286 + }, + { + "epoch": 3.1574585635359114, + "grad_norm": 0.29242852330207825, + "learning_rate": 8.007897317452156e-05, + "loss": 1.9216, + "step": 10287 + }, + { + "epoch": 3.1577655003069367, + "grad_norm": 0.26574134826660156, + "learning_rate": 8.007500247825229e-05, + "loss": 1.8392, + "step": 10288 + }, + { + "epoch": 3.158072437077962, + "grad_norm": 0.2503872811794281, + "learning_rate": 8.00710314847642e-05, + "loss": 1.7742, + "step": 10289 + }, + { + "epoch": 3.158379373848987, + "grad_norm": 0.25614771246910095, + "learning_rate": 8.006706019409658e-05, + "loss": 1.828, + "step": 10290 + }, + { + "epoch": 3.1586863106200123, + "grad_norm": 0.259369820356369, + "learning_rate": 8.006308860628863e-05, + "loss": 1.8328, + "step": 10291 + }, + { + "epoch": 3.1589932473910376, + "grad_norm": 0.28183647990226746, + "learning_rate": 8.005911672137962e-05, + "loss": 1.8269, + "step": 10292 + }, + { + "epoch": 3.1593001841620625, + "grad_norm": 0.2926514446735382, + "learning_rate": 8.005514453940881e-05, + "loss": 1.8334, + "step": 10293 + }, + { + "epoch": 3.159607120933088, + "grad_norm": 0.34313449263572693, + "learning_rate": 8.005117206041543e-05, + "loss": 1.7866, + "step": 10294 + }, + { + "epoch": 3.159914057704113, + "grad_norm": 0.30971628427505493, + "learning_rate": 8.004719928443875e-05, + "loss": 1.7827, + "step": 10295 + }, + { + "epoch": 3.160220994475138, + "grad_norm": 0.23955371975898743, + "learning_rate": 8.004322621151807e-05, + "loss": 1.7619, + "step": 10296 + }, + { + "epoch": 3.1605279312461634, + "grad_norm": 0.31311795115470886, + 
"learning_rate": 8.003925284169261e-05, + "loss": 1.8247, + "step": 10297 + }, + { + "epoch": 3.1608348680171883, + "grad_norm": 0.3408358097076416, + "learning_rate": 8.003527917500163e-05, + "loss": 1.8146, + "step": 10298 + }, + { + "epoch": 3.1611418047882136, + "grad_norm": 0.3030858337879181, + "learning_rate": 8.003130521148442e-05, + "loss": 1.857, + "step": 10299 + }, + { + "epoch": 3.161448741559239, + "grad_norm": 0.25168511271476746, + "learning_rate": 8.002733095118025e-05, + "loss": 1.8404, + "step": 10300 + }, + { + "epoch": 3.161755678330264, + "grad_norm": 0.2956216335296631, + "learning_rate": 8.002335639412839e-05, + "loss": 1.7352, + "step": 10301 + }, + { + "epoch": 3.162062615101289, + "grad_norm": 0.27791857719421387, + "learning_rate": 8.001938154036814e-05, + "loss": 1.7797, + "step": 10302 + }, + { + "epoch": 3.1623695518723145, + "grad_norm": 0.3106420040130615, + "learning_rate": 8.001540638993876e-05, + "loss": 1.8434, + "step": 10303 + }, + { + "epoch": 3.1626764886433394, + "grad_norm": 0.2940445840358734, + "learning_rate": 8.001143094287954e-05, + "loss": 1.8459, + "step": 10304 + }, + { + "epoch": 3.1629834254143647, + "grad_norm": 0.3857429325580597, + "learning_rate": 8.000745519922977e-05, + "loss": 1.7853, + "step": 10305 + }, + { + "epoch": 3.1632903621853896, + "grad_norm": 0.3585071861743927, + "learning_rate": 8.000347915902874e-05, + "loss": 1.8905, + "step": 10306 + }, + { + "epoch": 3.163597298956415, + "grad_norm": 0.320003867149353, + "learning_rate": 7.999950282231574e-05, + "loss": 1.8397, + "step": 10307 + }, + { + "epoch": 3.1639042357274403, + "grad_norm": 0.24986252188682556, + "learning_rate": 7.999552618913009e-05, + "loss": 1.7916, + "step": 10308 + }, + { + "epoch": 3.164211172498465, + "grad_norm": 0.33077237010002136, + "learning_rate": 7.999154925951104e-05, + "loss": 1.8334, + "step": 10309 + }, + { + "epoch": 3.1645181092694905, + "grad_norm": 0.35700327157974243, + "learning_rate": 
7.998757203349794e-05, + "loss": 1.7773, + "step": 10310 + }, + { + "epoch": 3.164825046040516, + "grad_norm": 0.3095493018627167, + "learning_rate": 7.998359451113007e-05, + "loss": 1.8156, + "step": 10311 + }, + { + "epoch": 3.1651319828115407, + "grad_norm": 0.3004748225212097, + "learning_rate": 7.997961669244673e-05, + "loss": 1.7862, + "step": 10312 + }, + { + "epoch": 3.165438919582566, + "grad_norm": 0.39382806420326233, + "learning_rate": 7.99756385774873e-05, + "loss": 1.764, + "step": 10313 + }, + { + "epoch": 3.165745856353591, + "grad_norm": 0.3109463155269623, + "learning_rate": 7.997166016629099e-05, + "loss": 1.8006, + "step": 10314 + }, + { + "epoch": 3.1660527931246163, + "grad_norm": 0.2896469235420227, + "learning_rate": 7.996768145889717e-05, + "loss": 1.8373, + "step": 10315 + }, + { + "epoch": 3.1663597298956416, + "grad_norm": 0.35024940967559814, + "learning_rate": 7.996370245534517e-05, + "loss": 1.797, + "step": 10316 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.3228827714920044, + "learning_rate": 7.995972315567431e-05, + "loss": 1.7757, + "step": 10317 + }, + { + "epoch": 3.166973603437692, + "grad_norm": 0.27102410793304443, + "learning_rate": 7.995574355992388e-05, + "loss": 1.7786, + "step": 10318 + }, + { + "epoch": 3.167280540208717, + "grad_norm": 0.2556116580963135, + "learning_rate": 7.995176366813325e-05, + "loss": 1.7621, + "step": 10319 + }, + { + "epoch": 3.167587476979742, + "grad_norm": 0.28279444575309753, + "learning_rate": 7.994778348034173e-05, + "loss": 1.7954, + "step": 10320 + }, + { + "epoch": 3.1678944137507674, + "grad_norm": 0.31778639554977417, + "learning_rate": 7.994380299658867e-05, + "loss": 1.7657, + "step": 10321 + }, + { + "epoch": 3.1682013505217923, + "grad_norm": 0.27935469150543213, + "learning_rate": 7.993982221691339e-05, + "loss": 1.7502, + "step": 10322 + }, + { + "epoch": 3.1685082872928176, + "grad_norm": 0.29012617468833923, + "learning_rate": 7.993584114135524e-05, + "loss": 
1.8497, + "step": 10323 + }, + { + "epoch": 3.168815224063843, + "grad_norm": 0.2674056887626648, + "learning_rate": 7.993185976995356e-05, + "loss": 1.7875, + "step": 10324 + }, + { + "epoch": 3.169122160834868, + "grad_norm": 0.2667328417301178, + "learning_rate": 7.992787810274771e-05, + "loss": 1.771, + "step": 10325 + }, + { + "epoch": 3.169429097605893, + "grad_norm": 0.25807151198387146, + "learning_rate": 7.992389613977702e-05, + "loss": 1.7638, + "step": 10326 + }, + { + "epoch": 3.1697360343769185, + "grad_norm": 0.2572930157184601, + "learning_rate": 7.991991388108084e-05, + "loss": 1.8218, + "step": 10327 + }, + { + "epoch": 3.1700429711479434, + "grad_norm": 0.3955067992210388, + "learning_rate": 7.991593132669855e-05, + "loss": 1.8458, + "step": 10328 + }, + { + "epoch": 3.1703499079189688, + "grad_norm": 0.2813466489315033, + "learning_rate": 7.991194847666948e-05, + "loss": 1.8042, + "step": 10329 + }, + { + "epoch": 3.1706568446899936, + "grad_norm": 0.2645012140274048, + "learning_rate": 7.990796533103302e-05, + "loss": 1.8241, + "step": 10330 + }, + { + "epoch": 3.170963781461019, + "grad_norm": 0.28462091088294983, + "learning_rate": 7.99039818898285e-05, + "loss": 1.8853, + "step": 10331 + }, + { + "epoch": 3.1712707182320443, + "grad_norm": 0.2727372944355011, + "learning_rate": 7.98999981530953e-05, + "loss": 1.7564, + "step": 10332 + }, + { + "epoch": 3.171577655003069, + "grad_norm": 0.2658170759677887, + "learning_rate": 7.989601412087281e-05, + "loss": 1.8344, + "step": 10333 + }, + { + "epoch": 3.1718845917740945, + "grad_norm": 0.29713502526283264, + "learning_rate": 7.989202979320039e-05, + "loss": 1.8721, + "step": 10334 + }, + { + "epoch": 3.17219152854512, + "grad_norm": 0.26609495282173157, + "learning_rate": 7.98880451701174e-05, + "loss": 1.7991, + "step": 10335 + }, + { + "epoch": 3.1724984653161448, + "grad_norm": 0.29779741168022156, + "learning_rate": 7.988406025166322e-05, + "loss": 1.8182, + "step": 10336 + }, + { + 
"epoch": 3.17280540208717, + "grad_norm": 0.2771340012550354, + "learning_rate": 7.988007503787724e-05, + "loss": 1.8034, + "step": 10337 + }, + { + "epoch": 3.1731123388581954, + "grad_norm": 0.30510422587394714, + "learning_rate": 7.987608952879886e-05, + "loss": 1.8477, + "step": 10338 + }, + { + "epoch": 3.1734192756292203, + "grad_norm": 0.3097476363182068, + "learning_rate": 7.987210372446745e-05, + "loss": 1.7572, + "step": 10339 + }, + { + "epoch": 3.1737262124002457, + "grad_norm": 0.2553942799568176, + "learning_rate": 7.986811762492239e-05, + "loss": 1.7837, + "step": 10340 + }, + { + "epoch": 3.1740331491712706, + "grad_norm": 0.26546719670295715, + "learning_rate": 7.986413123020312e-05, + "loss": 1.7893, + "step": 10341 + }, + { + "epoch": 3.174340085942296, + "grad_norm": 0.37721553444862366, + "learning_rate": 7.986014454034895e-05, + "loss": 1.8475, + "step": 10342 + }, + { + "epoch": 3.174647022713321, + "grad_norm": 0.3215494453907013, + "learning_rate": 7.985615755539937e-05, + "loss": 1.7806, + "step": 10343 + }, + { + "epoch": 3.174953959484346, + "grad_norm": 0.2662442922592163, + "learning_rate": 7.985217027539373e-05, + "loss": 1.8116, + "step": 10344 + }, + { + "epoch": 3.1752608962553714, + "grad_norm": 0.23334236443042755, + "learning_rate": 7.984818270037145e-05, + "loss": 1.7929, + "step": 10345 + }, + { + "epoch": 3.1755678330263963, + "grad_norm": 0.2873367667198181, + "learning_rate": 7.98441948303719e-05, + "loss": 1.7808, + "step": 10346 + }, + { + "epoch": 3.1758747697974217, + "grad_norm": 0.3623826801776886, + "learning_rate": 7.984020666543458e-05, + "loss": 1.8817, + "step": 10347 + }, + { + "epoch": 3.176181706568447, + "grad_norm": 0.3060589134693146, + "learning_rate": 7.983621820559881e-05, + "loss": 1.796, + "step": 10348 + }, + { + "epoch": 3.176488643339472, + "grad_norm": 0.2396882325410843, + "learning_rate": 7.983222945090407e-05, + "loss": 1.7455, + "step": 10349 + }, + { + "epoch": 3.1767955801104972, + 
"grad_norm": 0.24811476469039917, + "learning_rate": 7.982824040138974e-05, + "loss": 1.7907, + "step": 10350 + }, + { + "epoch": 3.1771025168815226, + "grad_norm": 0.32749706506729126, + "learning_rate": 7.982425105709524e-05, + "loss": 1.8553, + "step": 10351 + }, + { + "epoch": 3.1774094536525475, + "grad_norm": 0.3648095726966858, + "learning_rate": 7.982026141806003e-05, + "loss": 1.8387, + "step": 10352 + }, + { + "epoch": 3.177716390423573, + "grad_norm": 0.2749348282814026, + "learning_rate": 7.981627148432352e-05, + "loss": 1.7676, + "step": 10353 + }, + { + "epoch": 3.178023327194598, + "grad_norm": 0.2735142409801483, + "learning_rate": 7.981228125592513e-05, + "loss": 1.822, + "step": 10354 + }, + { + "epoch": 3.178330263965623, + "grad_norm": 0.28759655356407166, + "learning_rate": 7.98082907329043e-05, + "loss": 1.8113, + "step": 10355 + }, + { + "epoch": 3.1786372007366483, + "grad_norm": 0.33661654591560364, + "learning_rate": 7.980429991530048e-05, + "loss": 1.8036, + "step": 10356 + }, + { + "epoch": 3.1789441375076732, + "grad_norm": 0.2634892761707306, + "learning_rate": 7.98003088031531e-05, + "loss": 1.8323, + "step": 10357 + }, + { + "epoch": 3.1792510742786986, + "grad_norm": 0.25864094495773315, + "learning_rate": 7.979631739650158e-05, + "loss": 1.8199, + "step": 10358 + }, + { + "epoch": 3.179558011049724, + "grad_norm": 0.27368444204330444, + "learning_rate": 7.979232569538541e-05, + "loss": 1.7673, + "step": 10359 + }, + { + "epoch": 3.179864947820749, + "grad_norm": 0.2506616413593292, + "learning_rate": 7.9788333699844e-05, + "loss": 1.7912, + "step": 10360 + }, + { + "epoch": 3.180171884591774, + "grad_norm": 0.2539178133010864, + "learning_rate": 7.978434140991684e-05, + "loss": 1.7934, + "step": 10361 + }, + { + "epoch": 3.1804788213627995, + "grad_norm": 0.2605626881122589, + "learning_rate": 7.978034882564334e-05, + "loss": 1.8031, + "step": 10362 + }, + { + "epoch": 3.1807857581338244, + "grad_norm": 0.2610207796096802, + 
"learning_rate": 7.977635594706299e-05, + "loss": 1.8664, + "step": 10363 + }, + { + "epoch": 3.1810926949048497, + "grad_norm": 0.26164132356643677, + "learning_rate": 7.977236277421523e-05, + "loss": 1.7758, + "step": 10364 + }, + { + "epoch": 3.1813996316758746, + "grad_norm": 0.3122340142726898, + "learning_rate": 7.976836930713953e-05, + "loss": 1.9033, + "step": 10365 + }, + { + "epoch": 3.1817065684469, + "grad_norm": 0.3317202031612396, + "learning_rate": 7.976437554587537e-05, + "loss": 1.7899, + "step": 10366 + }, + { + "epoch": 3.1820135052179253, + "grad_norm": 0.28612568974494934, + "learning_rate": 7.97603814904622e-05, + "loss": 1.8145, + "step": 10367 + }, + { + "epoch": 3.18232044198895, + "grad_norm": 0.349917471408844, + "learning_rate": 7.975638714093949e-05, + "loss": 1.877, + "step": 10368 + }, + { + "epoch": 3.1826273787599755, + "grad_norm": 0.3737771809101105, + "learning_rate": 7.975239249734672e-05, + "loss": 1.8204, + "step": 10369 + }, + { + "epoch": 3.182934315531001, + "grad_norm": 0.3688446879386902, + "learning_rate": 7.974839755972339e-05, + "loss": 1.8487, + "step": 10370 + }, + { + "epoch": 3.1832412523020257, + "grad_norm": 0.2934897541999817, + "learning_rate": 7.974440232810894e-05, + "loss": 1.8243, + "step": 10371 + }, + { + "epoch": 3.183548189073051, + "grad_norm": 0.2596173882484436, + "learning_rate": 7.974040680254287e-05, + "loss": 1.7887, + "step": 10372 + }, + { + "epoch": 3.183855125844076, + "grad_norm": 0.35686594247817993, + "learning_rate": 7.973641098306468e-05, + "loss": 1.8653, + "step": 10373 + }, + { + "epoch": 3.1841620626151013, + "grad_norm": 0.3187713921070099, + "learning_rate": 7.973241486971383e-05, + "loss": 1.8767, + "step": 10374 + }, + { + "epoch": 3.1844689993861266, + "grad_norm": 0.2596273124217987, + "learning_rate": 7.972841846252985e-05, + "loss": 1.8028, + "step": 10375 + }, + { + "epoch": 3.1847759361571515, + "grad_norm": 0.2637474834918976, + "learning_rate": 7.972442176155221e-05, + 
"loss": 1.802, + "step": 10376 + }, + { + "epoch": 3.185082872928177, + "grad_norm": 0.2641126215457916, + "learning_rate": 7.97204247668204e-05, + "loss": 1.7931, + "step": 10377 + }, + { + "epoch": 3.185389809699202, + "grad_norm": 0.25594159960746765, + "learning_rate": 7.971642747837393e-05, + "loss": 1.818, + "step": 10378 + }, + { + "epoch": 3.185696746470227, + "grad_norm": 0.26567938923835754, + "learning_rate": 7.971242989625233e-05, + "loss": 1.8174, + "step": 10379 + }, + { + "epoch": 3.1860036832412524, + "grad_norm": 0.29580214619636536, + "learning_rate": 7.970843202049508e-05, + "loss": 1.869, + "step": 10380 + }, + { + "epoch": 3.1863106200122773, + "grad_norm": 0.2657530605792999, + "learning_rate": 7.970443385114168e-05, + "loss": 1.8352, + "step": 10381 + }, + { + "epoch": 3.1866175567833026, + "grad_norm": 0.2468358278274536, + "learning_rate": 7.970043538823165e-05, + "loss": 1.7851, + "step": 10382 + }, + { + "epoch": 3.186924493554328, + "grad_norm": 0.26464715600013733, + "learning_rate": 7.969643663180451e-05, + "loss": 1.8208, + "step": 10383 + }, + { + "epoch": 3.187231430325353, + "grad_norm": 0.26035723090171814, + "learning_rate": 7.969243758189979e-05, + "loss": 1.8089, + "step": 10384 + }, + { + "epoch": 3.187538367096378, + "grad_norm": 0.2644619941711426, + "learning_rate": 7.968843823855699e-05, + "loss": 1.8379, + "step": 10385 + }, + { + "epoch": 3.1878453038674035, + "grad_norm": 0.25576624274253845, + "learning_rate": 7.968443860181565e-05, + "loss": 1.7932, + "step": 10386 + }, + { + "epoch": 3.1881522406384284, + "grad_norm": 0.24276074767112732, + "learning_rate": 7.968043867171528e-05, + "loss": 1.8037, + "step": 10387 + }, + { + "epoch": 3.1884591774094537, + "grad_norm": 0.27156540751457214, + "learning_rate": 7.967643844829543e-05, + "loss": 1.7998, + "step": 10388 + }, + { + "epoch": 3.1887661141804786, + "grad_norm": 0.2555428743362427, + "learning_rate": 7.96724379315956e-05, + "loss": 1.7612, + "step": 10389 + }, + 
{ + "epoch": 3.189073050951504, + "grad_norm": 0.3358438014984131, + "learning_rate": 7.966843712165537e-05, + "loss": 1.8543, + "step": 10390 + }, + { + "epoch": 3.1893799877225293, + "grad_norm": 0.2799586355686188, + "learning_rate": 7.966443601851424e-05, + "loss": 1.819, + "step": 10391 + }, + { + "epoch": 3.189686924493554, + "grad_norm": 0.2364189177751541, + "learning_rate": 7.966043462221178e-05, + "loss": 1.8537, + "step": 10392 + }, + { + "epoch": 3.1899938612645795, + "grad_norm": 0.23849403858184814, + "learning_rate": 7.96564329327875e-05, + "loss": 1.8125, + "step": 10393 + }, + { + "epoch": 3.190300798035605, + "grad_norm": 0.2371583878993988, + "learning_rate": 7.965243095028098e-05, + "loss": 1.7352, + "step": 10394 + }, + { + "epoch": 3.1906077348066297, + "grad_norm": 0.2584737539291382, + "learning_rate": 7.964842867473176e-05, + "loss": 1.8801, + "step": 10395 + }, + { + "epoch": 3.190914671577655, + "grad_norm": 0.27768051624298096, + "learning_rate": 7.964442610617939e-05, + "loss": 1.8221, + "step": 10396 + }, + { + "epoch": 3.1912216083486804, + "grad_norm": 0.2680891752243042, + "learning_rate": 7.964042324466341e-05, + "loss": 1.8371, + "step": 10397 + }, + { + "epoch": 3.1915285451197053, + "grad_norm": 0.25301921367645264, + "learning_rate": 7.963642009022343e-05, + "loss": 1.7972, + "step": 10398 + }, + { + "epoch": 3.1918354818907306, + "grad_norm": 0.2589731216430664, + "learning_rate": 7.963241664289896e-05, + "loss": 1.8145, + "step": 10399 + }, + { + "epoch": 3.1921424186617555, + "grad_norm": 0.2611297369003296, + "learning_rate": 7.962841290272956e-05, + "loss": 1.8736, + "step": 10400 + }, + { + "epoch": 3.192449355432781, + "grad_norm": 0.2812272906303406, + "learning_rate": 7.962440886975483e-05, + "loss": 1.8116, + "step": 10401 + }, + { + "epoch": 3.192756292203806, + "grad_norm": 0.3261657655239105, + "learning_rate": 7.962040454401434e-05, + "loss": 1.7935, + "step": 10402 + }, + { + "epoch": 3.193063228974831, + 
"grad_norm": 0.3355373442173004, + "learning_rate": 7.961639992554764e-05, + "loss": 1.7957, + "step": 10403 + }, + { + "epoch": 3.1933701657458564, + "grad_norm": 0.2811843156814575, + "learning_rate": 7.961239501439432e-05, + "loss": 1.797, + "step": 10404 + }, + { + "epoch": 3.1936771025168813, + "grad_norm": 0.24933238327503204, + "learning_rate": 7.960838981059395e-05, + "loss": 1.7594, + "step": 10405 + }, + { + "epoch": 3.1939840392879066, + "grad_norm": 0.29110121726989746, + "learning_rate": 7.960438431418613e-05, + "loss": 1.8268, + "step": 10406 + }, + { + "epoch": 3.194290976058932, + "grad_norm": 0.3702283799648285, + "learning_rate": 7.960037852521043e-05, + "loss": 1.7629, + "step": 10407 + }, + { + "epoch": 3.194597912829957, + "grad_norm": 0.33275437355041504, + "learning_rate": 7.959637244370644e-05, + "loss": 1.8507, + "step": 10408 + }, + { + "epoch": 3.194904849600982, + "grad_norm": 0.2691981792449951, + "learning_rate": 7.959236606971375e-05, + "loss": 1.8084, + "step": 10409 + }, + { + "epoch": 3.1952117863720075, + "grad_norm": 0.30108413100242615, + "learning_rate": 7.958835940327194e-05, + "loss": 1.8525, + "step": 10410 + }, + { + "epoch": 3.1955187231430324, + "grad_norm": 0.32112306356430054, + "learning_rate": 7.958435244442064e-05, + "loss": 1.7431, + "step": 10411 + }, + { + "epoch": 3.1958256599140578, + "grad_norm": 0.2795291543006897, + "learning_rate": 7.958034519319942e-05, + "loss": 1.7985, + "step": 10412 + }, + { + "epoch": 3.196132596685083, + "grad_norm": 0.2485792338848114, + "learning_rate": 7.957633764964788e-05, + "loss": 1.7363, + "step": 10413 + }, + { + "epoch": 3.196439533456108, + "grad_norm": 0.3552432358264923, + "learning_rate": 7.957232981380565e-05, + "loss": 1.8174, + "step": 10414 + }, + { + "epoch": 3.1967464702271333, + "grad_norm": 0.3829655051231384, + "learning_rate": 7.956832168571234e-05, + "loss": 1.9249, + "step": 10415 + }, + { + "epoch": 3.197053406998158, + "grad_norm": 0.2498074769973755, + 
"learning_rate": 7.956431326540752e-05, + "loss": 1.8104, + "step": 10416 + }, + { + "epoch": 3.1973603437691835, + "grad_norm": 0.24596504867076874, + "learning_rate": 7.956030455293082e-05, + "loss": 1.8007, + "step": 10417 + }, + { + "epoch": 3.197667280540209, + "grad_norm": 0.2795363664627075, + "learning_rate": 7.95562955483219e-05, + "loss": 1.775, + "step": 10418 + }, + { + "epoch": 3.1979742173112338, + "grad_norm": 0.3581138253211975, + "learning_rate": 7.95522862516203e-05, + "loss": 1.8567, + "step": 10419 + }, + { + "epoch": 3.198281154082259, + "grad_norm": 0.36102500557899475, + "learning_rate": 7.95482766628657e-05, + "loss": 1.8509, + "step": 10420 + }, + { + "epoch": 3.198588090853284, + "grad_norm": 0.4717029929161072, + "learning_rate": 7.954426678209774e-05, + "loss": 1.8218, + "step": 10421 + }, + { + "epoch": 3.1988950276243093, + "grad_norm": 0.3211984932422638, + "learning_rate": 7.9540256609356e-05, + "loss": 1.8696, + "step": 10422 + }, + { + "epoch": 3.1992019643953347, + "grad_norm": 0.30094626545906067, + "learning_rate": 7.953624614468011e-05, + "loss": 1.8714, + "step": 10423 + }, + { + "epoch": 3.1995089011663596, + "grad_norm": 0.267578125, + "learning_rate": 7.953223538810976e-05, + "loss": 1.7903, + "step": 10424 + }, + { + "epoch": 3.199815837937385, + "grad_norm": 0.35577845573425293, + "learning_rate": 7.952822433968453e-05, + "loss": 1.7808, + "step": 10425 + }, + { + "epoch": 3.2001227747084102, + "grad_norm": 0.4117741882801056, + "learning_rate": 7.952421299944408e-05, + "loss": 1.7856, + "step": 10426 + }, + { + "epoch": 3.200429711479435, + "grad_norm": 0.35202035307884216, + "learning_rate": 7.952020136742806e-05, + "loss": 1.8112, + "step": 10427 + }, + { + "epoch": 3.2007366482504604, + "grad_norm": 0.26514917612075806, + "learning_rate": 7.951618944367611e-05, + "loss": 1.828, + "step": 10428 + }, + { + "epoch": 3.201043585021486, + "grad_norm": 0.29219159483909607, + "learning_rate": 7.951217722822786e-05, + "loss": 
1.9366, + "step": 10429 + }, + { + "epoch": 3.2013505217925107, + "grad_norm": 0.2929961383342743, + "learning_rate": 7.950816472112298e-05, + "loss": 1.8006, + "step": 10430 + }, + { + "epoch": 3.201657458563536, + "grad_norm": 0.28339722752571106, + "learning_rate": 7.950415192240114e-05, + "loss": 1.7411, + "step": 10431 + }, + { + "epoch": 3.201964395334561, + "grad_norm": 0.258884996175766, + "learning_rate": 7.950013883210196e-05, + "loss": 1.8153, + "step": 10432 + }, + { + "epoch": 3.2022713321055862, + "grad_norm": 0.3065929114818573, + "learning_rate": 7.949612545026512e-05, + "loss": 1.7918, + "step": 10433 + }, + { + "epoch": 3.2025782688766116, + "grad_norm": 0.289874404668808, + "learning_rate": 7.949211177693029e-05, + "loss": 1.7975, + "step": 10434 + }, + { + "epoch": 3.2028852056476365, + "grad_norm": 0.27025631070137024, + "learning_rate": 7.948809781213711e-05, + "loss": 1.8129, + "step": 10435 + }, + { + "epoch": 3.203192142418662, + "grad_norm": 0.2501074969768524, + "learning_rate": 7.948408355592528e-05, + "loss": 1.7653, + "step": 10436 + }, + { + "epoch": 3.203499079189687, + "grad_norm": 0.30402958393096924, + "learning_rate": 7.948006900833445e-05, + "loss": 1.8311, + "step": 10437 + }, + { + "epoch": 3.203806015960712, + "grad_norm": 0.28783223032951355, + "learning_rate": 7.94760541694043e-05, + "loss": 1.82, + "step": 10438 + }, + { + "epoch": 3.2041129527317374, + "grad_norm": 0.30428317189216614, + "learning_rate": 7.947203903917451e-05, + "loss": 1.8673, + "step": 10439 + }, + { + "epoch": 3.2044198895027622, + "grad_norm": 0.2860367000102997, + "learning_rate": 7.946802361768473e-05, + "loss": 1.824, + "step": 10440 + }, + { + "epoch": 3.2047268262737876, + "grad_norm": 0.2995273172855377, + "learning_rate": 7.946400790497469e-05, + "loss": 1.7342, + "step": 10441 + }, + { + "epoch": 3.205033763044813, + "grad_norm": 0.4374088943004608, + "learning_rate": 7.945999190108407e-05, + "loss": 1.8522, + "step": 10442 + }, + { + "epoch": 
3.205340699815838, + "grad_norm": 0.37659478187561035, + "learning_rate": 7.945597560605252e-05, + "loss": 1.7518, + "step": 10443 + }, + { + "epoch": 3.205647636586863, + "grad_norm": 0.24257932603359222, + "learning_rate": 7.945195901991975e-05, + "loss": 1.7892, + "step": 10444 + }, + { + "epoch": 3.2059545733578885, + "grad_norm": 0.3682694435119629, + "learning_rate": 7.944794214272546e-05, + "loss": 1.7757, + "step": 10445 + }, + { + "epoch": 3.2062615101289134, + "grad_norm": 0.434692919254303, + "learning_rate": 7.944392497450936e-05, + "loss": 1.8207, + "step": 10446 + }, + { + "epoch": 3.2065684468999387, + "grad_norm": 0.3982211947441101, + "learning_rate": 7.943990751531113e-05, + "loss": 1.8303, + "step": 10447 + }, + { + "epoch": 3.2068753836709636, + "grad_norm": 0.2877334654331207, + "learning_rate": 7.943588976517049e-05, + "loss": 1.8495, + "step": 10448 + }, + { + "epoch": 3.207182320441989, + "grad_norm": 0.34589654207229614, + "learning_rate": 7.943187172412712e-05, + "loss": 1.7773, + "step": 10449 + }, + { + "epoch": 3.2074892572130143, + "grad_norm": 0.4727517366409302, + "learning_rate": 7.942785339222074e-05, + "loss": 1.8702, + "step": 10450 + }, + { + "epoch": 3.207796193984039, + "grad_norm": 0.4019354581832886, + "learning_rate": 7.942383476949107e-05, + "loss": 1.8095, + "step": 10451 + }, + { + "epoch": 3.2081031307550645, + "grad_norm": 0.2726243734359741, + "learning_rate": 7.941981585597782e-05, + "loss": 1.7273, + "step": 10452 + }, + { + "epoch": 3.20841006752609, + "grad_norm": 0.2944760024547577, + "learning_rate": 7.941579665172072e-05, + "loss": 1.7507, + "step": 10453 + }, + { + "epoch": 3.2087170042971147, + "grad_norm": 0.3530777096748352, + "learning_rate": 7.941177715675945e-05, + "loss": 1.8434, + "step": 10454 + }, + { + "epoch": 3.20902394106814, + "grad_norm": 0.28612539172172546, + "learning_rate": 7.940775737113378e-05, + "loss": 1.8094, + "step": 10455 + }, + { + "epoch": 3.209330877839165, + "grad_norm": 
0.27006468176841736, + "learning_rate": 7.94037372948834e-05, + "loss": 1.7854, + "step": 10456 + }, + { + "epoch": 3.2096378146101903, + "grad_norm": 0.3027147054672241, + "learning_rate": 7.939971692804806e-05, + "loss": 1.7596, + "step": 10457 + }, + { + "epoch": 3.2099447513812156, + "grad_norm": 0.31999528408050537, + "learning_rate": 7.939569627066749e-05, + "loss": 1.8836, + "step": 10458 + }, + { + "epoch": 3.2102516881522405, + "grad_norm": 0.267600417137146, + "learning_rate": 7.939167532278142e-05, + "loss": 1.8508, + "step": 10459 + }, + { + "epoch": 3.210558624923266, + "grad_norm": 0.3171706795692444, + "learning_rate": 7.938765408442958e-05, + "loss": 1.7507, + "step": 10460 + }, + { + "epoch": 3.210865561694291, + "grad_norm": 0.2955280840396881, + "learning_rate": 7.938363255565171e-05, + "loss": 1.733, + "step": 10461 + }, + { + "epoch": 3.211172498465316, + "grad_norm": 0.3427969217300415, + "learning_rate": 7.937961073648759e-05, + "loss": 1.9208, + "step": 10462 + }, + { + "epoch": 3.2114794352363414, + "grad_norm": 0.28788647055625916, + "learning_rate": 7.937558862697692e-05, + "loss": 1.7723, + "step": 10463 + }, + { + "epoch": 3.2117863720073663, + "grad_norm": 0.26093682646751404, + "learning_rate": 7.937156622715945e-05, + "loss": 1.803, + "step": 10464 + }, + { + "epoch": 3.2120933087783916, + "grad_norm": 0.2791301906108856, + "learning_rate": 7.936754353707497e-05, + "loss": 1.7601, + "step": 10465 + }, + { + "epoch": 3.212400245549417, + "grad_norm": 0.3039831519126892, + "learning_rate": 7.93635205567632e-05, + "loss": 1.7864, + "step": 10466 + }, + { + "epoch": 3.212707182320442, + "grad_norm": 0.28498128056526184, + "learning_rate": 7.935949728626392e-05, + "loss": 1.7745, + "step": 10467 + }, + { + "epoch": 3.213014119091467, + "grad_norm": 0.2908780872821808, + "learning_rate": 7.935547372561687e-05, + "loss": 1.8281, + "step": 10468 + }, + { + "epoch": 3.2133210558624925, + "grad_norm": 0.26148509979248047, + "learning_rate": 
7.935144987486183e-05, + "loss": 1.8545, + "step": 10469 + }, + { + "epoch": 3.2136279926335174, + "grad_norm": 0.2853962481021881, + "learning_rate": 7.934742573403856e-05, + "loss": 1.7765, + "step": 10470 + }, + { + "epoch": 3.2139349294045427, + "grad_norm": 0.26497501134872437, + "learning_rate": 7.934340130318681e-05, + "loss": 1.7472, + "step": 10471 + }, + { + "epoch": 3.214241866175568, + "grad_norm": 0.2806912660598755, + "learning_rate": 7.933937658234638e-05, + "loss": 1.7879, + "step": 10472 + }, + { + "epoch": 3.214548802946593, + "grad_norm": 0.2699974477291107, + "learning_rate": 7.933535157155705e-05, + "loss": 1.7539, + "step": 10473 + }, + { + "epoch": 3.2148557397176183, + "grad_norm": 0.22714731097221375, + "learning_rate": 7.933132627085856e-05, + "loss": 1.7861, + "step": 10474 + }, + { + "epoch": 3.215162676488643, + "grad_norm": 0.291340708732605, + "learning_rate": 7.932730068029072e-05, + "loss": 1.8381, + "step": 10475 + }, + { + "epoch": 3.2154696132596685, + "grad_norm": 0.3257324695587158, + "learning_rate": 7.93232747998933e-05, + "loss": 1.8293, + "step": 10476 + }, + { + "epoch": 3.215776550030694, + "grad_norm": 0.3518911600112915, + "learning_rate": 7.93192486297061e-05, + "loss": 1.853, + "step": 10477 + }, + { + "epoch": 3.2160834868017187, + "grad_norm": 0.27663540840148926, + "learning_rate": 7.93152221697689e-05, + "loss": 1.7831, + "step": 10478 + }, + { + "epoch": 3.216390423572744, + "grad_norm": 0.3153248429298401, + "learning_rate": 7.931119542012149e-05, + "loss": 1.7443, + "step": 10479 + }, + { + "epoch": 3.216697360343769, + "grad_norm": 0.2919597029685974, + "learning_rate": 7.930716838080368e-05, + "loss": 1.8108, + "step": 10480 + }, + { + "epoch": 3.2170042971147943, + "grad_norm": 0.26892516016960144, + "learning_rate": 7.930314105185524e-05, + "loss": 1.7791, + "step": 10481 + }, + { + "epoch": 3.2173112338858196, + "grad_norm": 0.2486005276441574, + "learning_rate": 7.929911343331599e-05, + "loss": 1.8184, + 
"step": 10482 + }, + { + "epoch": 3.2176181706568445, + "grad_norm": 0.260728120803833, + "learning_rate": 7.929508552522571e-05, + "loss": 1.7933, + "step": 10483 + }, + { + "epoch": 3.21792510742787, + "grad_norm": 0.3081948757171631, + "learning_rate": 7.929105732762425e-05, + "loss": 1.7732, + "step": 10484 + }, + { + "epoch": 3.218232044198895, + "grad_norm": 0.3807671368122101, + "learning_rate": 7.928702884055138e-05, + "loss": 1.7652, + "step": 10485 + }, + { + "epoch": 3.21853898096992, + "grad_norm": 0.31637755036354065, + "learning_rate": 7.928300006404692e-05, + "loss": 1.7605, + "step": 10486 + }, + { + "epoch": 3.2188459177409454, + "grad_norm": 0.2812853455543518, + "learning_rate": 7.927897099815071e-05, + "loss": 1.7925, + "step": 10487 + }, + { + "epoch": 3.2191528545119708, + "grad_norm": 0.3472350239753723, + "learning_rate": 7.927494164290253e-05, + "loss": 1.8252, + "step": 10488 + }, + { + "epoch": 3.2194597912829956, + "grad_norm": 0.4202714264392853, + "learning_rate": 7.927091199834222e-05, + "loss": 1.7993, + "step": 10489 + }, + { + "epoch": 3.219766728054021, + "grad_norm": 0.44552353024482727, + "learning_rate": 7.92668820645096e-05, + "loss": 1.8609, + "step": 10490 + }, + { + "epoch": 3.220073664825046, + "grad_norm": 0.38964664936065674, + "learning_rate": 7.926285184144451e-05, + "loss": 1.864, + "step": 10491 + }, + { + "epoch": 3.220380601596071, + "grad_norm": 0.2978462278842926, + "learning_rate": 7.925882132918676e-05, + "loss": 1.7892, + "step": 10492 + }, + { + "epoch": 3.2206875383670965, + "grad_norm": 0.2520316243171692, + "learning_rate": 7.925479052777619e-05, + "loss": 1.7702, + "step": 10493 + }, + { + "epoch": 3.2209944751381214, + "grad_norm": 0.28151068091392517, + "learning_rate": 7.925075943725263e-05, + "loss": 1.7613, + "step": 10494 + }, + { + "epoch": 3.2213014119091468, + "grad_norm": 0.3346099555492401, + "learning_rate": 7.924672805765592e-05, + "loss": 1.894, + "step": 10495 + }, + { + "epoch": 
3.2216083486801717, + "grad_norm": 0.2981362044811249, + "learning_rate": 7.924269638902591e-05, + "loss": 1.8157, + "step": 10496 + }, + { + "epoch": 3.221915285451197, + "grad_norm": 0.2561499774456024, + "learning_rate": 7.923866443140242e-05, + "loss": 1.8259, + "step": 10497 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.26480481028556824, + "learning_rate": 7.923463218482532e-05, + "loss": 1.7856, + "step": 10498 + }, + { + "epoch": 3.222529158993247, + "grad_norm": 0.24103692173957825, + "learning_rate": 7.923059964933446e-05, + "loss": 1.7765, + "step": 10499 + }, + { + "epoch": 3.2228360957642725, + "grad_norm": 0.2399173080921173, + "learning_rate": 7.922656682496967e-05, + "loss": 1.8216, + "step": 10500 + }, + { + "epoch": 3.223143032535298, + "grad_norm": 0.24530018866062164, + "learning_rate": 7.922253371177082e-05, + "loss": 1.8155, + "step": 10501 + }, + { + "epoch": 3.2234499693063228, + "grad_norm": 0.23298653960227966, + "learning_rate": 7.921850030977775e-05, + "loss": 1.7843, + "step": 10502 + }, + { + "epoch": 3.223756906077348, + "grad_norm": 0.3053973317146301, + "learning_rate": 7.921446661903035e-05, + "loss": 1.8113, + "step": 10503 + }, + { + "epoch": 3.2240638428483734, + "grad_norm": 0.261336088180542, + "learning_rate": 7.921043263956847e-05, + "loss": 1.8073, + "step": 10504 + }, + { + "epoch": 3.2243707796193983, + "grad_norm": 0.24877268075942993, + "learning_rate": 7.920639837143195e-05, + "loss": 1.8344, + "step": 10505 + }, + { + "epoch": 3.2246777163904237, + "grad_norm": 0.26784422993659973, + "learning_rate": 7.920236381466071e-05, + "loss": 1.7757, + "step": 10506 + }, + { + "epoch": 3.2249846531614486, + "grad_norm": 0.2672121226787567, + "learning_rate": 7.919832896929458e-05, + "loss": 1.8384, + "step": 10507 + }, + { + "epoch": 3.225291589932474, + "grad_norm": 0.27254921197891235, + "learning_rate": 7.919429383537346e-05, + "loss": 1.8056, + "step": 10508 + }, + { + "epoch": 3.2255985267034992, + "grad_norm": 
0.24467822909355164, + "learning_rate": 7.91902584129372e-05, + "loss": 1.8109, + "step": 10509 + }, + { + "epoch": 3.225905463474524, + "grad_norm": 0.25966358184814453, + "learning_rate": 7.918622270202571e-05, + "loss": 1.82, + "step": 10510 + }, + { + "epoch": 3.2262124002455494, + "grad_norm": 0.28601330518722534, + "learning_rate": 7.918218670267886e-05, + "loss": 1.7266, + "step": 10511 + }, + { + "epoch": 3.226519337016575, + "grad_norm": 0.4017516076564789, + "learning_rate": 7.917815041493653e-05, + "loss": 1.8408, + "step": 10512 + }, + { + "epoch": 3.2268262737875997, + "grad_norm": 0.3995787501335144, + "learning_rate": 7.917411383883862e-05, + "loss": 1.8441, + "step": 10513 + }, + { + "epoch": 3.227133210558625, + "grad_norm": 0.26997458934783936, + "learning_rate": 7.917007697442502e-05, + "loss": 1.8078, + "step": 10514 + }, + { + "epoch": 3.22744014732965, + "grad_norm": 0.34353014826774597, + "learning_rate": 7.916603982173562e-05, + "loss": 1.7523, + "step": 10515 + }, + { + "epoch": 3.2277470841006752, + "grad_norm": 0.39522337913513184, + "learning_rate": 7.916200238081032e-05, + "loss": 1.7532, + "step": 10516 + }, + { + "epoch": 3.2280540208717006, + "grad_norm": 0.4176923334598541, + "learning_rate": 7.915796465168903e-05, + "loss": 1.8895, + "step": 10517 + }, + { + "epoch": 3.2283609576427255, + "grad_norm": 0.30232906341552734, + "learning_rate": 7.915392663441164e-05, + "loss": 1.8223, + "step": 10518 + }, + { + "epoch": 3.228667894413751, + "grad_norm": 0.230951726436615, + "learning_rate": 7.914988832901805e-05, + "loss": 1.7265, + "step": 10519 + }, + { + "epoch": 3.228974831184776, + "grad_norm": 0.26381877064704895, + "learning_rate": 7.914584973554819e-05, + "loss": 1.7858, + "step": 10520 + }, + { + "epoch": 3.229281767955801, + "grad_norm": 0.2500905394554138, + "learning_rate": 7.914181085404194e-05, + "loss": 1.7606, + "step": 10521 + }, + { + "epoch": 3.2295887047268264, + "grad_norm": 0.2585415840148926, + "learning_rate": 
7.913777168453925e-05, + "loss": 1.787, + "step": 10522 + }, + { + "epoch": 3.2298956414978512, + "grad_norm": 0.24236604571342468, + "learning_rate": 7.913373222708001e-05, + "loss": 1.7623, + "step": 10523 + }, + { + "epoch": 3.2302025782688766, + "grad_norm": 0.3113093078136444, + "learning_rate": 7.912969248170416e-05, + "loss": 1.7736, + "step": 10524 + }, + { + "epoch": 3.230509515039902, + "grad_norm": 0.3341342806816101, + "learning_rate": 7.912565244845163e-05, + "loss": 1.8583, + "step": 10525 + }, + { + "epoch": 3.230816451810927, + "grad_norm": 0.2644478678703308, + "learning_rate": 7.912161212736231e-05, + "loss": 1.7891, + "step": 10526 + }, + { + "epoch": 3.231123388581952, + "grad_norm": 0.22916561365127563, + "learning_rate": 7.911757151847616e-05, + "loss": 1.7642, + "step": 10527 + }, + { + "epoch": 3.2314303253529775, + "grad_norm": 0.24204877018928528, + "learning_rate": 7.911353062183309e-05, + "loss": 1.8522, + "step": 10528 + }, + { + "epoch": 3.2317372621240024, + "grad_norm": 0.25339365005493164, + "learning_rate": 7.910948943747307e-05, + "loss": 1.8391, + "step": 10529 + }, + { + "epoch": 3.2320441988950277, + "grad_norm": 0.2652709186077118, + "learning_rate": 7.9105447965436e-05, + "loss": 1.7735, + "step": 10530 + }, + { + "epoch": 3.2323511356660526, + "grad_norm": 0.2711019217967987, + "learning_rate": 7.910140620576183e-05, + "loss": 1.8491, + "step": 10531 + }, + { + "epoch": 3.232658072437078, + "grad_norm": 0.2598389685153961, + "learning_rate": 7.909736415849052e-05, + "loss": 1.8417, + "step": 10532 + }, + { + "epoch": 3.2329650092081033, + "grad_norm": 0.278037428855896, + "learning_rate": 7.9093321823662e-05, + "loss": 1.8774, + "step": 10533 + }, + { + "epoch": 3.233271945979128, + "grad_norm": 0.32015568017959595, + "learning_rate": 7.90892792013162e-05, + "loss": 1.8873, + "step": 10534 + }, + { + "epoch": 3.2335788827501535, + "grad_norm": 0.3098098635673523, + "learning_rate": 7.908523629149312e-05, + "loss": 1.8141, + 
"step": 10535 + }, + { + "epoch": 3.233885819521179, + "grad_norm": 0.3127266764640808, + "learning_rate": 7.908119309423267e-05, + "loss": 1.8587, + "step": 10536 + }, + { + "epoch": 3.2341927562922037, + "grad_norm": 0.3085545301437378, + "learning_rate": 7.907714960957483e-05, + "loss": 1.8544, + "step": 10537 + }, + { + "epoch": 3.234499693063229, + "grad_norm": 0.3051004409790039, + "learning_rate": 7.907310583755956e-05, + "loss": 1.8144, + "step": 10538 + }, + { + "epoch": 3.234806629834254, + "grad_norm": 0.3458186686038971, + "learning_rate": 7.906906177822682e-05, + "loss": 1.8388, + "step": 10539 + }, + { + "epoch": 3.2351135666052793, + "grad_norm": 0.37064439058303833, + "learning_rate": 7.906501743161656e-05, + "loss": 1.7574, + "step": 10540 + }, + { + "epoch": 3.2354205033763046, + "grad_norm": 0.3382316827774048, + "learning_rate": 7.906097279776876e-05, + "loss": 1.8785, + "step": 10541 + }, + { + "epoch": 3.2357274401473295, + "grad_norm": 0.254802942276001, + "learning_rate": 7.905692787672341e-05, + "loss": 1.8276, + "step": 10542 + }, + { + "epoch": 3.236034376918355, + "grad_norm": 0.3362341523170471, + "learning_rate": 7.905288266852047e-05, + "loss": 1.8057, + "step": 10543 + }, + { + "epoch": 3.23634131368938, + "grad_norm": 0.38821661472320557, + "learning_rate": 7.904883717319988e-05, + "loss": 1.7841, + "step": 10544 + }, + { + "epoch": 3.236648250460405, + "grad_norm": 0.33889076113700867, + "learning_rate": 7.90447913908017e-05, + "loss": 1.7892, + "step": 10545 + }, + { + "epoch": 3.2369551872314304, + "grad_norm": 0.2741014361381531, + "learning_rate": 7.904074532136585e-05, + "loss": 1.7611, + "step": 10546 + }, + { + "epoch": 3.2372621240024557, + "grad_norm": 0.28950995206832886, + "learning_rate": 7.903669896493233e-05, + "loss": 1.7963, + "step": 10547 + }, + { + "epoch": 3.2375690607734806, + "grad_norm": 0.30647143721580505, + "learning_rate": 7.903265232154113e-05, + "loss": 1.7522, + "step": 10548 + }, + { + "epoch": 
3.237875997544506, + "grad_norm": 0.30428263545036316, + "learning_rate": 7.902860539123225e-05, + "loss": 1.7383, + "step": 10549 + }, + { + "epoch": 3.238182934315531, + "grad_norm": 0.2357146292924881, + "learning_rate": 7.902455817404569e-05, + "loss": 1.7243, + "step": 10550 + }, + { + "epoch": 3.238489871086556, + "grad_norm": 0.3125104606151581, + "learning_rate": 7.90205106700214e-05, + "loss": 1.8542, + "step": 10551 + }, + { + "epoch": 3.2387968078575815, + "grad_norm": 0.25797244906425476, + "learning_rate": 7.901646287919944e-05, + "loss": 1.8374, + "step": 10552 + }, + { + "epoch": 3.2391037446286064, + "grad_norm": 0.3127591907978058, + "learning_rate": 7.901241480161978e-05, + "loss": 1.9457, + "step": 10553 + }, + { + "epoch": 3.2394106813996317, + "grad_norm": 0.2971835434436798, + "learning_rate": 7.900836643732243e-05, + "loss": 1.7933, + "step": 10554 + }, + { + "epoch": 3.2397176181706566, + "grad_norm": 0.28931814432144165, + "learning_rate": 7.90043177863474e-05, + "loss": 1.8201, + "step": 10555 + }, + { + "epoch": 3.240024554941682, + "grad_norm": 0.3348724842071533, + "learning_rate": 7.90002688487347e-05, + "loss": 1.8718, + "step": 10556 + }, + { + "epoch": 3.2403314917127073, + "grad_norm": 0.28566426038742065, + "learning_rate": 7.899621962452436e-05, + "loss": 1.805, + "step": 10557 + }, + { + "epoch": 3.240638428483732, + "grad_norm": 0.27074119448661804, + "learning_rate": 7.899217011375637e-05, + "loss": 1.842, + "step": 10558 + }, + { + "epoch": 3.2409453652547575, + "grad_norm": 0.27014291286468506, + "learning_rate": 7.898812031647076e-05, + "loss": 1.8156, + "step": 10559 + }, + { + "epoch": 3.241252302025783, + "grad_norm": 0.28087863326072693, + "learning_rate": 7.898407023270756e-05, + "loss": 1.8399, + "step": 10560 + }, + { + "epoch": 3.2415592387968077, + "grad_norm": 0.2641037404537201, + "learning_rate": 7.898001986250679e-05, + "loss": 1.7977, + "step": 10561 + }, + { + "epoch": 3.241866175567833, + "grad_norm": 
0.2843858301639557, + "learning_rate": 7.897596920590848e-05, + "loss": 1.834, + "step": 10562 + }, + { + "epoch": 3.2421731123388584, + "grad_norm": 0.2724611163139343, + "learning_rate": 7.897191826295266e-05, + "loss": 1.7547, + "step": 10563 + }, + { + "epoch": 3.2424800491098833, + "grad_norm": 0.2583858370780945, + "learning_rate": 7.896786703367935e-05, + "loss": 1.7658, + "step": 10564 + }, + { + "epoch": 3.2427869858809086, + "grad_norm": 0.2666650712490082, + "learning_rate": 7.896381551812861e-05, + "loss": 1.8017, + "step": 10565 + }, + { + "epoch": 3.2430939226519335, + "grad_norm": 0.23269347846508026, + "learning_rate": 7.895976371634047e-05, + "loss": 1.8267, + "step": 10566 + }, + { + "epoch": 3.243400859422959, + "grad_norm": 0.27865225076675415, + "learning_rate": 7.895571162835496e-05, + "loss": 1.8093, + "step": 10567 + }, + { + "epoch": 3.243707796193984, + "grad_norm": 0.29445022344589233, + "learning_rate": 7.895165925421216e-05, + "loss": 1.7999, + "step": 10568 + }, + { + "epoch": 3.244014732965009, + "grad_norm": 0.32135528326034546, + "learning_rate": 7.894760659395206e-05, + "loss": 1.8405, + "step": 10569 + }, + { + "epoch": 3.2443216697360344, + "grad_norm": 0.3409091532230377, + "learning_rate": 7.894355364761477e-05, + "loss": 1.7861, + "step": 10570 + }, + { + "epoch": 3.2446286065070598, + "grad_norm": 0.3379025459289551, + "learning_rate": 7.893950041524032e-05, + "loss": 1.8495, + "step": 10571 + }, + { + "epoch": 3.2449355432780846, + "grad_norm": 0.2843063473701477, + "learning_rate": 7.893544689686874e-05, + "loss": 1.7888, + "step": 10572 + }, + { + "epoch": 3.24524248004911, + "grad_norm": 0.2914074957370758, + "learning_rate": 7.893139309254013e-05, + "loss": 1.7866, + "step": 10573 + }, + { + "epoch": 3.245549416820135, + "grad_norm": 0.39855021238327026, + "learning_rate": 7.892733900229454e-05, + "loss": 1.7865, + "step": 10574 + }, + { + "epoch": 3.24585635359116, + "grad_norm": 0.4232102632522583, + "learning_rate": 
7.892328462617203e-05, + "loss": 1.8443, + "step": 10575 + }, + { + "epoch": 3.2461632903621855, + "grad_norm": 0.390794962644577, + "learning_rate": 7.891922996421267e-05, + "loss": 1.8735, + "step": 10576 + }, + { + "epoch": 3.2464702271332104, + "grad_norm": 0.3051595687866211, + "learning_rate": 7.891517501645653e-05, + "loss": 1.8654, + "step": 10577 + }, + { + "epoch": 3.2467771639042358, + "grad_norm": 0.25363096594810486, + "learning_rate": 7.891111978294367e-05, + "loss": 1.7602, + "step": 10578 + }, + { + "epoch": 3.247084100675261, + "grad_norm": 0.29785794019699097, + "learning_rate": 7.890706426371419e-05, + "loss": 1.8242, + "step": 10579 + }, + { + "epoch": 3.247391037446286, + "grad_norm": 0.346162885427475, + "learning_rate": 7.890300845880816e-05, + "loss": 1.8551, + "step": 10580 + }, + { + "epoch": 3.2476979742173113, + "grad_norm": 0.33906155824661255, + "learning_rate": 7.889895236826566e-05, + "loss": 1.765, + "step": 10581 + }, + { + "epoch": 3.248004910988336, + "grad_norm": 0.26083165407180786, + "learning_rate": 7.889489599212676e-05, + "loss": 1.8246, + "step": 10582 + }, + { + "epoch": 3.2483118477593615, + "grad_norm": 0.3042019009590149, + "learning_rate": 7.889083933043157e-05, + "loss": 1.9017, + "step": 10583 + }, + { + "epoch": 3.248618784530387, + "grad_norm": 0.34833577275276184, + "learning_rate": 7.888678238322018e-05, + "loss": 1.7863, + "step": 10584 + }, + { + "epoch": 3.2489257213014118, + "grad_norm": 0.34436655044555664, + "learning_rate": 7.888272515053267e-05, + "loss": 1.7937, + "step": 10585 + }, + { + "epoch": 3.249232658072437, + "grad_norm": 0.2550172507762909, + "learning_rate": 7.887866763240914e-05, + "loss": 1.7615, + "step": 10586 + }, + { + "epoch": 3.2495395948434624, + "grad_norm": 0.3334405720233917, + "learning_rate": 7.88746098288897e-05, + "loss": 1.7465, + "step": 10587 + }, + { + "epoch": 3.2498465316144873, + "grad_norm": 0.4668157696723938, + "learning_rate": 7.887055174001443e-05, + "loss": 
1.7836, + "step": 10588 + }, + { + "epoch": 3.2501534683855127, + "grad_norm": 0.524680495262146, + "learning_rate": 7.886649336582344e-05, + "loss": 1.844, + "step": 10589 + }, + { + "epoch": 3.250460405156538, + "grad_norm": 0.36859074234962463, + "learning_rate": 7.886243470635685e-05, + "loss": 1.8072, + "step": 10590 + }, + { + "epoch": 3.250767341927563, + "grad_norm": 0.32370296120643616, + "learning_rate": 7.885837576165478e-05, + "loss": 1.802, + "step": 10591 + }, + { + "epoch": 3.2510742786985882, + "grad_norm": 0.3506374955177307, + "learning_rate": 7.88543165317573e-05, + "loss": 1.7965, + "step": 10592 + }, + { + "epoch": 3.251381215469613, + "grad_norm": 0.39058688282966614, + "learning_rate": 7.885025701670457e-05, + "loss": 1.7987, + "step": 10593 + }, + { + "epoch": 3.2516881522406385, + "grad_norm": 0.3042154014110565, + "learning_rate": 7.884619721653669e-05, + "loss": 1.8345, + "step": 10594 + }, + { + "epoch": 3.251995089011664, + "grad_norm": 0.2249498963356018, + "learning_rate": 7.884213713129378e-05, + "loss": 1.7796, + "step": 10595 + }, + { + "epoch": 3.2523020257826887, + "grad_norm": 0.2701997458934784, + "learning_rate": 7.883807676101595e-05, + "loss": 1.8027, + "step": 10596 + }, + { + "epoch": 3.252608962553714, + "grad_norm": 0.2574785053730011, + "learning_rate": 7.883401610574336e-05, + "loss": 1.7878, + "step": 10597 + }, + { + "epoch": 3.252915899324739, + "grad_norm": 0.24964739382266998, + "learning_rate": 7.882995516551613e-05, + "loss": 1.7612, + "step": 10598 + }, + { + "epoch": 3.2532228360957642, + "grad_norm": 0.2519865930080414, + "learning_rate": 7.882589394037437e-05, + "loss": 1.7583, + "step": 10599 + }, + { + "epoch": 3.2535297728667896, + "grad_norm": 0.23174463212490082, + "learning_rate": 7.882183243035823e-05, + "loss": 1.7607, + "step": 10600 + }, + { + "epoch": 3.2538367096378145, + "grad_norm": 0.28103554248809814, + "learning_rate": 7.881777063550786e-05, + "loss": 1.904, + "step": 10601 + }, + { + 
"epoch": 3.25414364640884, + "grad_norm": 0.265677809715271, + "learning_rate": 7.881370855586339e-05, + "loss": 1.8169, + "step": 10602 + }, + { + "epoch": 3.254450583179865, + "grad_norm": 0.2539603114128113, + "learning_rate": 7.880964619146493e-05, + "loss": 1.8439, + "step": 10603 + }, + { + "epoch": 3.25475751995089, + "grad_norm": 0.2741886377334595, + "learning_rate": 7.88055835423527e-05, + "loss": 1.8737, + "step": 10604 + }, + { + "epoch": 3.2550644567219154, + "grad_norm": 0.27548348903656006, + "learning_rate": 7.88015206085668e-05, + "loss": 1.8385, + "step": 10605 + }, + { + "epoch": 3.2553713934929407, + "grad_norm": 0.2958502769470215, + "learning_rate": 7.879745739014739e-05, + "loss": 1.8603, + "step": 10606 + }, + { + "epoch": 3.2556783302639656, + "grad_norm": 0.2728644907474518, + "learning_rate": 7.879339388713462e-05, + "loss": 1.8, + "step": 10607 + }, + { + "epoch": 3.255985267034991, + "grad_norm": 0.28718289732933044, + "learning_rate": 7.878933009956866e-05, + "loss": 1.7803, + "step": 10608 + }, + { + "epoch": 3.256292203806016, + "grad_norm": 0.2989691197872162, + "learning_rate": 7.878526602748967e-05, + "loss": 1.8155, + "step": 10609 + }, + { + "epoch": 3.256599140577041, + "grad_norm": 0.24515527486801147, + "learning_rate": 7.87812016709378e-05, + "loss": 1.7623, + "step": 10610 + }, + { + "epoch": 3.2569060773480665, + "grad_norm": 0.29946041107177734, + "learning_rate": 7.877713702995324e-05, + "loss": 1.8097, + "step": 10611 + }, + { + "epoch": 3.2572130141190914, + "grad_norm": 0.2854483723640442, + "learning_rate": 7.877307210457613e-05, + "loss": 1.8088, + "step": 10612 + }, + { + "epoch": 3.2575199508901167, + "grad_norm": 0.27812930941581726, + "learning_rate": 7.876900689484668e-05, + "loss": 1.8151, + "step": 10613 + }, + { + "epoch": 3.2578268876611416, + "grad_norm": 0.2658015787601471, + "learning_rate": 7.876494140080503e-05, + "loss": 1.8314, + "step": 10614 + }, + { + "epoch": 3.258133824432167, + "grad_norm": 
0.28935661911964417, + "learning_rate": 7.876087562249137e-05, + "loss": 1.7948, + "step": 10615 + }, + { + "epoch": 3.2584407612031923, + "grad_norm": 0.27497121691703796, + "learning_rate": 7.875680955994587e-05, + "loss": 1.7964, + "step": 10616 + }, + { + "epoch": 3.258747697974217, + "grad_norm": 0.3313405513763428, + "learning_rate": 7.875274321320873e-05, + "loss": 1.8143, + "step": 10617 + }, + { + "epoch": 3.2590546347452425, + "grad_norm": 0.3217218816280365, + "learning_rate": 7.874867658232013e-05, + "loss": 1.7749, + "step": 10618 + }, + { + "epoch": 3.259361571516268, + "grad_norm": 0.25105544924736023, + "learning_rate": 7.874460966732025e-05, + "loss": 1.7834, + "step": 10619 + }, + { + "epoch": 3.2596685082872927, + "grad_norm": 0.2931382358074188, + "learning_rate": 7.874054246824931e-05, + "loss": 1.8252, + "step": 10620 + }, + { + "epoch": 3.259975445058318, + "grad_norm": 0.2803363502025604, + "learning_rate": 7.873647498514747e-05, + "loss": 1.7527, + "step": 10621 + }, + { + "epoch": 3.2602823818293434, + "grad_norm": 0.29857927560806274, + "learning_rate": 7.873240721805492e-05, + "loss": 1.8085, + "step": 10622 + }, + { + "epoch": 3.2605893186003683, + "grad_norm": 0.24864110350608826, + "learning_rate": 7.872833916701192e-05, + "loss": 1.7509, + "step": 10623 + }, + { + "epoch": 3.2608962553713936, + "grad_norm": 0.24105949699878693, + "learning_rate": 7.872427083205862e-05, + "loss": 1.7871, + "step": 10624 + }, + { + "epoch": 3.2612031921424185, + "grad_norm": 0.2429245114326477, + "learning_rate": 7.872020221323523e-05, + "loss": 1.777, + "step": 10625 + }, + { + "epoch": 3.261510128913444, + "grad_norm": 0.234287828207016, + "learning_rate": 7.871613331058197e-05, + "loss": 1.8001, + "step": 10626 + }, + { + "epoch": 3.261817065684469, + "grad_norm": 0.3463406264781952, + "learning_rate": 7.871206412413905e-05, + "loss": 1.8925, + "step": 10627 + }, + { + "epoch": 3.262124002455494, + "grad_norm": 0.26798921823501587, + 
"learning_rate": 7.87079946539467e-05, + "loss": 1.7963, + "step": 10628 + }, + { + "epoch": 3.2624309392265194, + "grad_norm": 0.28603312373161316, + "learning_rate": 7.87039249000451e-05, + "loss": 1.8308, + "step": 10629 + }, + { + "epoch": 3.2627378759975443, + "grad_norm": 0.2717527747154236, + "learning_rate": 7.86998548624745e-05, + "loss": 1.8246, + "step": 10630 + }, + { + "epoch": 3.2630448127685696, + "grad_norm": 0.32215580344200134, + "learning_rate": 7.86957845412751e-05, + "loss": 1.7278, + "step": 10631 + }, + { + "epoch": 3.263351749539595, + "grad_norm": 0.3578735589981079, + "learning_rate": 7.869171393648717e-05, + "loss": 1.7288, + "step": 10632 + }, + { + "epoch": 3.26365868631062, + "grad_norm": 0.3120707869529724, + "learning_rate": 7.868764304815089e-05, + "loss": 1.7971, + "step": 10633 + }, + { + "epoch": 3.263965623081645, + "grad_norm": 0.27419236302375793, + "learning_rate": 7.86835718763065e-05, + "loss": 1.8529, + "step": 10634 + }, + { + "epoch": 3.2642725598526705, + "grad_norm": 0.3200531601905823, + "learning_rate": 7.867950042099423e-05, + "loss": 1.7892, + "step": 10635 + }, + { + "epoch": 3.2645794966236954, + "grad_norm": 0.325706422328949, + "learning_rate": 7.867542868225435e-05, + "loss": 1.8236, + "step": 10636 + }, + { + "epoch": 3.2648864333947207, + "grad_norm": 0.2950136065483093, + "learning_rate": 7.867135666012707e-05, + "loss": 1.8163, + "step": 10637 + }, + { + "epoch": 3.265193370165746, + "grad_norm": 0.2772117257118225, + "learning_rate": 7.866728435465263e-05, + "loss": 1.8373, + "step": 10638 + }, + { + "epoch": 3.265500306936771, + "grad_norm": 0.2887401580810547, + "learning_rate": 7.866321176587129e-05, + "loss": 1.7756, + "step": 10639 + }, + { + "epoch": 3.2658072437077963, + "grad_norm": 0.3474489152431488, + "learning_rate": 7.865913889382329e-05, + "loss": 1.7539, + "step": 10640 + }, + { + "epoch": 3.266114180478821, + "grad_norm": 0.3433493971824646, + "learning_rate": 7.865506573854888e-05, + 
"loss": 1.7987, + "step": 10641 + }, + { + "epoch": 3.2664211172498465, + "grad_norm": 0.3075394630432129, + "learning_rate": 7.865099230008832e-05, + "loss": 1.7907, + "step": 10642 + }, + { + "epoch": 3.266728054020872, + "grad_norm": 0.24817697703838348, + "learning_rate": 7.864691857848187e-05, + "loss": 1.7941, + "step": 10643 + }, + { + "epoch": 3.2670349907918967, + "grad_norm": 0.290147602558136, + "learning_rate": 7.864284457376976e-05, + "loss": 1.9125, + "step": 10644 + }, + { + "epoch": 3.267341927562922, + "grad_norm": 0.253684937953949, + "learning_rate": 7.863877028599229e-05, + "loss": 1.8084, + "step": 10645 + }, + { + "epoch": 3.267648864333947, + "grad_norm": 0.26349252462387085, + "learning_rate": 7.863469571518969e-05, + "loss": 1.7548, + "step": 10646 + }, + { + "epoch": 3.2679558011049723, + "grad_norm": 0.30568864941596985, + "learning_rate": 7.863062086140224e-05, + "loss": 1.8551, + "step": 10647 + }, + { + "epoch": 3.2682627378759976, + "grad_norm": 0.2866690456867218, + "learning_rate": 7.862654572467024e-05, + "loss": 1.8145, + "step": 10648 + }, + { + "epoch": 3.2685696746470225, + "grad_norm": 0.32022854685783386, + "learning_rate": 7.862247030503391e-05, + "loss": 1.896, + "step": 10649 + }, + { + "epoch": 3.268876611418048, + "grad_norm": 0.25260284543037415, + "learning_rate": 7.861839460253356e-05, + "loss": 1.814, + "step": 10650 + }, + { + "epoch": 3.269183548189073, + "grad_norm": 0.26776066422462463, + "learning_rate": 7.861431861720947e-05, + "loss": 1.7755, + "step": 10651 + }, + { + "epoch": 3.269490484960098, + "grad_norm": 0.26514193415641785, + "learning_rate": 7.861024234910191e-05, + "loss": 1.7606, + "step": 10652 + }, + { + "epoch": 3.2697974217311234, + "grad_norm": 0.27213940024375916, + "learning_rate": 7.860616579825116e-05, + "loss": 1.8074, + "step": 10653 + }, + { + "epoch": 3.2701043585021488, + "grad_norm": 0.29192888736724854, + "learning_rate": 7.860208896469752e-05, + "loss": 1.8436, + "step": 10654 + }, 
+ { + "epoch": 3.2704112952731736, + "grad_norm": 0.3772370219230652, + "learning_rate": 7.859801184848127e-05, + "loss": 1.8096, + "step": 10655 + }, + { + "epoch": 3.270718232044199, + "grad_norm": 0.4574970006942749, + "learning_rate": 7.859393444964269e-05, + "loss": 1.7612, + "step": 10656 + }, + { + "epoch": 3.271025168815224, + "grad_norm": 0.4614393413066864, + "learning_rate": 7.858985676822211e-05, + "loss": 1.8529, + "step": 10657 + }, + { + "epoch": 3.271332105586249, + "grad_norm": 0.33567267656326294, + "learning_rate": 7.85857788042598e-05, + "loss": 1.8391, + "step": 10658 + }, + { + "epoch": 3.2716390423572745, + "grad_norm": 0.2564064860343933, + "learning_rate": 7.858170055779609e-05, + "loss": 1.7621, + "step": 10659 + }, + { + "epoch": 3.2719459791282994, + "grad_norm": 0.26769882440567017, + "learning_rate": 7.857762202887122e-05, + "loss": 1.8145, + "step": 10660 + }, + { + "epoch": 3.2722529158993248, + "grad_norm": 0.262008935213089, + "learning_rate": 7.857354321752558e-05, + "loss": 1.7513, + "step": 10661 + }, + { + "epoch": 3.27255985267035, + "grad_norm": 0.26494377851486206, + "learning_rate": 7.856946412379942e-05, + "loss": 1.8071, + "step": 10662 + }, + { + "epoch": 3.272866789441375, + "grad_norm": 0.25613999366760254, + "learning_rate": 7.856538474773307e-05, + "loss": 1.8775, + "step": 10663 + }, + { + "epoch": 3.2731737262124003, + "grad_norm": 0.24789929389953613, + "learning_rate": 7.856130508936684e-05, + "loss": 1.8055, + "step": 10664 + }, + { + "epoch": 3.2734806629834257, + "grad_norm": 0.29111939668655396, + "learning_rate": 7.855722514874107e-05, + "loss": 1.8114, + "step": 10665 + }, + { + "epoch": 3.2737875997544506, + "grad_norm": 0.30511030554771423, + "learning_rate": 7.855314492589605e-05, + "loss": 1.8131, + "step": 10666 + }, + { + "epoch": 3.274094536525476, + "grad_norm": 0.2545989453792572, + "learning_rate": 7.854906442087212e-05, + "loss": 1.7933, + "step": 10667 + }, + { + "epoch": 3.2744014732965008, + 
"grad_norm": 0.26684823632240295, + "learning_rate": 7.85449836337096e-05, + "loss": 1.7604, + "step": 10668 + }, + { + "epoch": 3.274708410067526, + "grad_norm": 0.5097808837890625, + "learning_rate": 7.854090256444881e-05, + "loss": 1.777, + "step": 10669 + }, + { + "epoch": 3.2750153468385514, + "grad_norm": 0.27828142046928406, + "learning_rate": 7.853682121313011e-05, + "loss": 1.7885, + "step": 10670 + }, + { + "epoch": 3.2753222836095763, + "grad_norm": 0.2925552725791931, + "learning_rate": 7.853273957979381e-05, + "loss": 1.7962, + "step": 10671 + }, + { + "epoch": 3.2756292203806017, + "grad_norm": 0.284574955701828, + "learning_rate": 7.852865766448025e-05, + "loss": 1.8645, + "step": 10672 + }, + { + "epoch": 3.2759361571516266, + "grad_norm": 0.23407664895057678, + "learning_rate": 7.85245754672298e-05, + "loss": 1.7106, + "step": 10673 + }, + { + "epoch": 3.276243093922652, + "grad_norm": 0.2555919885635376, + "learning_rate": 7.852049298808274e-05, + "loss": 1.8237, + "step": 10674 + }, + { + "epoch": 3.2765500306936772, + "grad_norm": 0.26703694462776184, + "learning_rate": 7.851641022707947e-05, + "loss": 1.7844, + "step": 10675 + }, + { + "epoch": 3.276856967464702, + "grad_norm": 0.24889135360717773, + "learning_rate": 7.851232718426033e-05, + "loss": 1.7783, + "step": 10676 + }, + { + "epoch": 3.2771639042357275, + "grad_norm": 0.25770726799964905, + "learning_rate": 7.850824385966564e-05, + "loss": 1.8007, + "step": 10677 + }, + { + "epoch": 3.277470841006753, + "grad_norm": 0.31806984543800354, + "learning_rate": 7.850416025333578e-05, + "loss": 1.8623, + "step": 10678 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.2906930148601532, + "learning_rate": 7.850007636531111e-05, + "loss": 1.8315, + "step": 10679 + }, + { + "epoch": 3.278084714548803, + "grad_norm": 0.2802525460720062, + "learning_rate": 7.849599219563197e-05, + "loss": 1.8488, + "step": 10680 + }, + { + "epoch": 3.2783916513198283, + "grad_norm": 0.26150405406951904, + 
"learning_rate": 7.849190774433874e-05, + "loss": 1.7967, + "step": 10681 + }, + { + "epoch": 3.2786985880908532, + "grad_norm": 0.25863370299339294, + "learning_rate": 7.848782301147178e-05, + "loss": 1.864, + "step": 10682 + }, + { + "epoch": 3.2790055248618786, + "grad_norm": 0.25381043553352356, + "learning_rate": 7.848373799707145e-05, + "loss": 1.8239, + "step": 10683 + }, + { + "epoch": 3.2793124616329035, + "grad_norm": 0.2583387792110443, + "learning_rate": 7.847965270117814e-05, + "loss": 1.8449, + "step": 10684 + }, + { + "epoch": 3.279619398403929, + "grad_norm": 0.30759841203689575, + "learning_rate": 7.84755671238322e-05, + "loss": 1.7992, + "step": 10685 + }, + { + "epoch": 3.279926335174954, + "grad_norm": 0.4316023588180542, + "learning_rate": 7.847148126507402e-05, + "loss": 1.7912, + "step": 10686 + }, + { + "epoch": 3.280233271945979, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.846739512494396e-05, + "loss": 1.8831, + "step": 10687 + }, + { + "epoch": 3.2805402087170044, + "grad_norm": 0.318934828042984, + "learning_rate": 7.846330870348244e-05, + "loss": 1.8411, + "step": 10688 + }, + { + "epoch": 3.2808471454880292, + "grad_norm": 0.27755632996559143, + "learning_rate": 7.84592220007298e-05, + "loss": 1.8763, + "step": 10689 + }, + { + "epoch": 3.2811540822590546, + "grad_norm": 0.33544883131980896, + "learning_rate": 7.845513501672646e-05, + "loss": 1.731, + "step": 10690 + }, + { + "epoch": 3.28146101903008, + "grad_norm": 0.28299057483673096, + "learning_rate": 7.845104775151278e-05, + "loss": 1.813, + "step": 10691 + }, + { + "epoch": 3.281767955801105, + "grad_norm": 0.2761382460594177, + "learning_rate": 7.844696020512918e-05, + "loss": 1.8018, + "step": 10692 + }, + { + "epoch": 3.28207489257213, + "grad_norm": 0.2919033169746399, + "learning_rate": 7.844287237761605e-05, + "loss": 1.793, + "step": 10693 + }, + { + "epoch": 3.2823818293431555, + "grad_norm": 0.32922014594078064, + "learning_rate": 7.843878426901378e-05, + 
"loss": 1.8186, + "step": 10694 + }, + { + "epoch": 3.2826887661141804, + "grad_norm": 0.2818562090396881, + "learning_rate": 7.843469587936279e-05, + "loss": 1.7794, + "step": 10695 + }, + { + "epoch": 3.2829957028852057, + "grad_norm": 0.26414254307746887, + "learning_rate": 7.843060720870345e-05, + "loss": 1.7854, + "step": 10696 + }, + { + "epoch": 3.283302639656231, + "grad_norm": 0.28345760703086853, + "learning_rate": 7.842651825707618e-05, + "loss": 1.7659, + "step": 10697 + }, + { + "epoch": 3.283609576427256, + "grad_norm": 0.3522340655326843, + "learning_rate": 7.842242902452141e-05, + "loss": 1.8427, + "step": 10698 + }, + { + "epoch": 3.2839165131982813, + "grad_norm": 0.2861590087413788, + "learning_rate": 7.841833951107954e-05, + "loss": 1.7539, + "step": 10699 + }, + { + "epoch": 3.284223449969306, + "grad_norm": 0.2596624493598938, + "learning_rate": 7.841424971679099e-05, + "loss": 1.8407, + "step": 10700 + }, + { + "epoch": 3.2845303867403315, + "grad_norm": 0.2847718298435211, + "learning_rate": 7.841015964169616e-05, + "loss": 1.8085, + "step": 10701 + }, + { + "epoch": 3.284837323511357, + "grad_norm": 0.29566115140914917, + "learning_rate": 7.840606928583547e-05, + "loss": 1.7873, + "step": 10702 + }, + { + "epoch": 3.2851442602823817, + "grad_norm": 0.2752111256122589, + "learning_rate": 7.840197864924936e-05, + "loss": 1.8186, + "step": 10703 + }, + { + "epoch": 3.285451197053407, + "grad_norm": 0.2907958924770355, + "learning_rate": 7.839788773197826e-05, + "loss": 1.8081, + "step": 10704 + }, + { + "epoch": 3.285758133824432, + "grad_norm": 0.25808724761009216, + "learning_rate": 7.839379653406258e-05, + "loss": 1.7635, + "step": 10705 + }, + { + "epoch": 3.2860650705954573, + "grad_norm": 0.2732730507850647, + "learning_rate": 7.838970505554277e-05, + "loss": 1.8061, + "step": 10706 + }, + { + "epoch": 3.2863720073664826, + "grad_norm": 0.23820067942142487, + "learning_rate": 7.838561329645923e-05, + "loss": 1.8091, + "step": 10707 + }, 
+ { + "epoch": 3.2866789441375075, + "grad_norm": 0.24179396033287048, + "learning_rate": 7.838152125685245e-05, + "loss": 1.7513, + "step": 10708 + }, + { + "epoch": 3.286985880908533, + "grad_norm": 0.2627546787261963, + "learning_rate": 7.837742893676283e-05, + "loss": 1.8741, + "step": 10709 + }, + { + "epoch": 3.287292817679558, + "grad_norm": 0.2827817499637604, + "learning_rate": 7.837333633623083e-05, + "loss": 1.8387, + "step": 10710 + }, + { + "epoch": 3.287599754450583, + "grad_norm": 0.2666749060153961, + "learning_rate": 7.836924345529688e-05, + "loss": 1.8319, + "step": 10711 + }, + { + "epoch": 3.2879066912216084, + "grad_norm": 0.3403390944004059, + "learning_rate": 7.836515029400145e-05, + "loss": 1.7827, + "step": 10712 + }, + { + "epoch": 3.2882136279926337, + "grad_norm": 0.30646705627441406, + "learning_rate": 7.836105685238497e-05, + "loss": 1.8612, + "step": 10713 + }, + { + "epoch": 3.2885205647636586, + "grad_norm": 0.2580253481864929, + "learning_rate": 7.83569631304879e-05, + "loss": 1.7332, + "step": 10714 + }, + { + "epoch": 3.288827501534684, + "grad_norm": 0.23734542727470398, + "learning_rate": 7.835286912835071e-05, + "loss": 1.7899, + "step": 10715 + }, + { + "epoch": 3.289134438305709, + "grad_norm": 0.2457810491323471, + "learning_rate": 7.834877484601384e-05, + "loss": 1.8059, + "step": 10716 + }, + { + "epoch": 3.289441375076734, + "grad_norm": 0.2558443248271942, + "learning_rate": 7.834468028351778e-05, + "loss": 1.8689, + "step": 10717 + }, + { + "epoch": 3.2897483118477595, + "grad_norm": 0.26596710085868835, + "learning_rate": 7.834058544090298e-05, + "loss": 1.816, + "step": 10718 + }, + { + "epoch": 3.2900552486187844, + "grad_norm": 0.25424903631210327, + "learning_rate": 7.833649031820987e-05, + "loss": 1.7907, + "step": 10719 + }, + { + "epoch": 3.2903621853898097, + "grad_norm": 0.23873139917850494, + "learning_rate": 7.833239491547896e-05, + "loss": 1.7666, + "step": 10720 + }, + { + "epoch": 3.2906691221608346, + 
"grad_norm": 0.23292972147464752, + "learning_rate": 7.832829923275073e-05, + "loss": 1.7674, + "step": 10721 + }, + { + "epoch": 3.29097605893186, + "grad_norm": 0.30133312940597534, + "learning_rate": 7.832420327006566e-05, + "loss": 1.8229, + "step": 10722 + }, + { + "epoch": 3.2912829957028853, + "grad_norm": 0.2882522642612457, + "learning_rate": 7.83201070274642e-05, + "loss": 1.7855, + "step": 10723 + }, + { + "epoch": 3.29158993247391, + "grad_norm": 0.2578088045120239, + "learning_rate": 7.831601050498683e-05, + "loss": 1.7276, + "step": 10724 + }, + { + "epoch": 3.2918968692449355, + "grad_norm": 0.29511600732803345, + "learning_rate": 7.831191370267406e-05, + "loss": 1.8085, + "step": 10725 + }, + { + "epoch": 3.292203806015961, + "grad_norm": 0.29557499289512634, + "learning_rate": 7.830781662056634e-05, + "loss": 1.815, + "step": 10726 + }, + { + "epoch": 3.2925107427869857, + "grad_norm": 0.32722121477127075, + "learning_rate": 7.830371925870422e-05, + "loss": 1.7889, + "step": 10727 + }, + { + "epoch": 3.292817679558011, + "grad_norm": 0.3124488592147827, + "learning_rate": 7.829962161712814e-05, + "loss": 1.8063, + "step": 10728 + }, + { + "epoch": 3.2931246163290364, + "grad_norm": 0.311334490776062, + "learning_rate": 7.829552369587861e-05, + "loss": 1.8852, + "step": 10729 + }, + { + "epoch": 3.2934315531000613, + "grad_norm": 0.28010860085487366, + "learning_rate": 7.829142549499613e-05, + "loss": 1.8274, + "step": 10730 + }, + { + "epoch": 3.2937384898710866, + "grad_norm": 0.3453529477119446, + "learning_rate": 7.828732701452119e-05, + "loss": 1.8618, + "step": 10731 + }, + { + "epoch": 3.2940454266421115, + "grad_norm": 0.2946802079677582, + "learning_rate": 7.828322825449432e-05, + "loss": 1.7123, + "step": 10732 + }, + { + "epoch": 3.294352363413137, + "grad_norm": 0.2467648684978485, + "learning_rate": 7.827912921495601e-05, + "loss": 1.7786, + "step": 10733 + }, + { + "epoch": 3.294659300184162, + "grad_norm": 0.2957034707069397, + 
"learning_rate": 7.827502989594677e-05, + "loss": 1.7817, + "step": 10734 + }, + { + "epoch": 3.294966236955187, + "grad_norm": 0.300905704498291, + "learning_rate": 7.827093029750713e-05, + "loss": 1.7582, + "step": 10735 + }, + { + "epoch": 3.2952731737262124, + "grad_norm": 0.28935131430625916, + "learning_rate": 7.826683041967757e-05, + "loss": 1.7766, + "step": 10736 + }, + { + "epoch": 3.2955801104972378, + "grad_norm": 0.26046010851860046, + "learning_rate": 7.826273026249861e-05, + "loss": 1.8152, + "step": 10737 + }, + { + "epoch": 3.2958870472682626, + "grad_norm": 0.24247924983501434, + "learning_rate": 7.82586298260108e-05, + "loss": 1.8679, + "step": 10738 + }, + { + "epoch": 3.296193984039288, + "grad_norm": 0.25977620482444763, + "learning_rate": 7.825452911025466e-05, + "loss": 1.8108, + "step": 10739 + }, + { + "epoch": 3.2965009208103133, + "grad_norm": 0.2732592821121216, + "learning_rate": 7.825042811527068e-05, + "loss": 1.7355, + "step": 10740 + }, + { + "epoch": 3.296807857581338, + "grad_norm": 0.38407859206199646, + "learning_rate": 7.824632684109941e-05, + "loss": 1.8418, + "step": 10741 + }, + { + "epoch": 3.2971147943523635, + "grad_norm": 0.4239252805709839, + "learning_rate": 7.82422252877814e-05, + "loss": 1.7655, + "step": 10742 + }, + { + "epoch": 3.2974217311233884, + "grad_norm": 0.3810526132583618, + "learning_rate": 7.823812345535716e-05, + "loss": 1.8804, + "step": 10743 + }, + { + "epoch": 3.2977286678944138, + "grad_norm": 0.29939520359039307, + "learning_rate": 7.823402134386722e-05, + "loss": 1.8207, + "step": 10744 + }, + { + "epoch": 3.298035604665439, + "grad_norm": 0.4053972065448761, + "learning_rate": 7.822991895335215e-05, + "loss": 1.7901, + "step": 10745 + }, + { + "epoch": 3.298342541436464, + "grad_norm": 0.4975005090236664, + "learning_rate": 7.822581628385247e-05, + "loss": 1.8344, + "step": 10746 + }, + { + "epoch": 3.2986494782074893, + "grad_norm": 0.4100436270236969, + "learning_rate": 
7.822171333540874e-05, + "loss": 1.7891, + "step": 10747 + }, + { + "epoch": 3.298956414978514, + "grad_norm": 0.2817644476890564, + "learning_rate": 7.821761010806147e-05, + "loss": 1.7895, + "step": 10748 + }, + { + "epoch": 3.2992633517495396, + "grad_norm": 0.332660973072052, + "learning_rate": 7.821350660185125e-05, + "loss": 1.7281, + "step": 10749 + }, + { + "epoch": 3.299570288520565, + "grad_norm": 0.42652732133865356, + "learning_rate": 7.820940281681863e-05, + "loss": 1.7855, + "step": 10750 + }, + { + "epoch": 3.2998772252915898, + "grad_norm": 0.35700714588165283, + "learning_rate": 7.820529875300415e-05, + "loss": 1.8722, + "step": 10751 + }, + { + "epoch": 3.300184162062615, + "grad_norm": 0.25305211544036865, + "learning_rate": 7.820119441044838e-05, + "loss": 1.7696, + "step": 10752 + }, + { + "epoch": 3.3004910988336404, + "grad_norm": 0.280205637216568, + "learning_rate": 7.819708978919188e-05, + "loss": 1.756, + "step": 10753 + }, + { + "epoch": 3.3007980356046653, + "grad_norm": 0.4176226854324341, + "learning_rate": 7.819298488927521e-05, + "loss": 1.7731, + "step": 10754 + }, + { + "epoch": 3.3011049723756907, + "grad_norm": 0.4264865517616272, + "learning_rate": 7.818887971073894e-05, + "loss": 1.7851, + "step": 10755 + }, + { + "epoch": 3.301411909146716, + "grad_norm": 0.2901221215724945, + "learning_rate": 7.818477425362363e-05, + "loss": 1.7356, + "step": 10756 + }, + { + "epoch": 3.301718845917741, + "grad_norm": 0.29583361744880676, + "learning_rate": 7.818066851796986e-05, + "loss": 1.8269, + "step": 10757 + }, + { + "epoch": 3.3020257826887662, + "grad_norm": 0.38592997193336487, + "learning_rate": 7.817656250381821e-05, + "loss": 1.7515, + "step": 10758 + }, + { + "epoch": 3.302332719459791, + "grad_norm": 0.29301533102989197, + "learning_rate": 7.817245621120927e-05, + "loss": 1.7955, + "step": 10759 + }, + { + "epoch": 3.3026396562308165, + "grad_norm": 0.2770880162715912, + "learning_rate": 7.816834964018359e-05, + "loss": 
1.7899, + "step": 10760 + }, + { + "epoch": 3.302946593001842, + "grad_norm": 0.32566413283348083, + "learning_rate": 7.816424279078176e-05, + "loss": 1.74, + "step": 10761 + }, + { + "epoch": 3.3032535297728667, + "grad_norm": 0.3077750504016876, + "learning_rate": 7.81601356630444e-05, + "loss": 1.8123, + "step": 10762 + }, + { + "epoch": 3.303560466543892, + "grad_norm": 0.2826370298862457, + "learning_rate": 7.815602825701206e-05, + "loss": 1.865, + "step": 10763 + }, + { + "epoch": 3.303867403314917, + "grad_norm": 0.31700822710990906, + "learning_rate": 7.815192057272534e-05, + "loss": 1.8021, + "step": 10764 + }, + { + "epoch": 3.3041743400859422, + "grad_norm": 0.33182790875434875, + "learning_rate": 7.814781261022486e-05, + "loss": 1.818, + "step": 10765 + }, + { + "epoch": 3.3044812768569676, + "grad_norm": 0.2720039486885071, + "learning_rate": 7.814370436955118e-05, + "loss": 1.8369, + "step": 10766 + }, + { + "epoch": 3.3047882136279925, + "grad_norm": 0.28134068846702576, + "learning_rate": 7.813959585074493e-05, + "loss": 1.8391, + "step": 10767 + }, + { + "epoch": 3.305095150399018, + "grad_norm": 0.25748828053474426, + "learning_rate": 7.813548705384667e-05, + "loss": 1.7987, + "step": 10768 + }, + { + "epoch": 3.305402087170043, + "grad_norm": 0.26187625527381897, + "learning_rate": 7.813137797889708e-05, + "loss": 1.7645, + "step": 10769 + }, + { + "epoch": 3.305709023941068, + "grad_norm": 0.297262579202652, + "learning_rate": 7.812726862593671e-05, + "loss": 1.771, + "step": 10770 + }, + { + "epoch": 3.3060159607120934, + "grad_norm": 0.2987872064113617, + "learning_rate": 7.812315899500618e-05, + "loss": 1.8115, + "step": 10771 + }, + { + "epoch": 3.3063228974831187, + "grad_norm": 0.31963878870010376, + "learning_rate": 7.81190490861461e-05, + "loss": 1.7685, + "step": 10772 + }, + { + "epoch": 3.3066298342541436, + "grad_norm": 0.27007177472114563, + "learning_rate": 7.81149388993971e-05, + "loss": 1.8272, + "step": 10773 + }, + { + "epoch": 
3.306936771025169, + "grad_norm": 0.26818498969078064, + "learning_rate": 7.811082843479981e-05, + "loss": 1.7894, + "step": 10774 + }, + { + "epoch": 3.307243707796194, + "grad_norm": 0.28857091069221497, + "learning_rate": 7.810671769239483e-05, + "loss": 1.8769, + "step": 10775 + }, + { + "epoch": 3.307550644567219, + "grad_norm": 0.26983144879341125, + "learning_rate": 7.810260667222277e-05, + "loss": 1.796, + "step": 10776 + }, + { + "epoch": 3.3078575813382445, + "grad_norm": 0.2566467225551605, + "learning_rate": 7.809849537432432e-05, + "loss": 1.848, + "step": 10777 + }, + { + "epoch": 3.3081645181092694, + "grad_norm": 0.25607848167419434, + "learning_rate": 7.809438379874005e-05, + "loss": 1.8072, + "step": 10778 + }, + { + "epoch": 3.3084714548802947, + "grad_norm": 0.29158470034599304, + "learning_rate": 7.809027194551059e-05, + "loss": 1.7772, + "step": 10779 + }, + { + "epoch": 3.3087783916513196, + "grad_norm": 0.360897421836853, + "learning_rate": 7.808615981467664e-05, + "loss": 1.8404, + "step": 10780 + }, + { + "epoch": 3.309085328422345, + "grad_norm": 0.31121253967285156, + "learning_rate": 7.808204740627877e-05, + "loss": 1.8137, + "step": 10781 + }, + { + "epoch": 3.3093922651933703, + "grad_norm": 0.2846451699733734, + "learning_rate": 7.807793472035765e-05, + "loss": 1.8367, + "step": 10782 + }, + { + "epoch": 3.309699201964395, + "grad_norm": 0.2711004316806793, + "learning_rate": 7.807382175695393e-05, + "loss": 1.7728, + "step": 10783 + }, + { + "epoch": 3.3100061387354205, + "grad_norm": 0.2693859338760376, + "learning_rate": 7.806970851610824e-05, + "loss": 1.7026, + "step": 10784 + }, + { + "epoch": 3.310313075506446, + "grad_norm": 0.3050517439842224, + "learning_rate": 7.806559499786125e-05, + "loss": 1.8041, + "step": 10785 + }, + { + "epoch": 3.3106200122774707, + "grad_norm": 0.27304747700691223, + "learning_rate": 7.80614812022536e-05, + "loss": 1.8182, + "step": 10786 + }, + { + "epoch": 3.310926949048496, + "grad_norm": 
0.28378555178642273, + "learning_rate": 7.805736712932594e-05, + "loss": 1.8519, + "step": 10787 + }, + { + "epoch": 3.3112338858195214, + "grad_norm": 0.30620133876800537, + "learning_rate": 7.805325277911892e-05, + "loss": 1.8594, + "step": 10788 + }, + { + "epoch": 3.3115408225905463, + "grad_norm": 0.2580169141292572, + "learning_rate": 7.804913815167325e-05, + "loss": 1.7897, + "step": 10789 + }, + { + "epoch": 3.3118477593615716, + "grad_norm": 0.28937023878097534, + "learning_rate": 7.804502324702951e-05, + "loss": 1.8362, + "step": 10790 + }, + { + "epoch": 3.3121546961325965, + "grad_norm": 0.28032705187797546, + "learning_rate": 7.804090806522844e-05, + "loss": 1.8168, + "step": 10791 + }, + { + "epoch": 3.312461632903622, + "grad_norm": 0.33712559938430786, + "learning_rate": 7.803679260631069e-05, + "loss": 1.7489, + "step": 10792 + }, + { + "epoch": 3.312768569674647, + "grad_norm": 0.40536820888519287, + "learning_rate": 7.80326768703169e-05, + "loss": 1.8413, + "step": 10793 + }, + { + "epoch": 3.313075506445672, + "grad_norm": 0.34967559576034546, + "learning_rate": 7.802856085728778e-05, + "loss": 1.8076, + "step": 10794 + }, + { + "epoch": 3.3133824432166974, + "grad_norm": 0.2429870367050171, + "learning_rate": 7.8024444567264e-05, + "loss": 1.8002, + "step": 10795 + }, + { + "epoch": 3.3136893799877223, + "grad_norm": 0.40956684947013855, + "learning_rate": 7.802032800028621e-05, + "loss": 1.8151, + "step": 10796 + }, + { + "epoch": 3.3139963167587476, + "grad_norm": 0.4908781945705414, + "learning_rate": 7.801621115639512e-05, + "loss": 1.8124, + "step": 10797 + }, + { + "epoch": 3.314303253529773, + "grad_norm": 0.3922197222709656, + "learning_rate": 7.801209403563143e-05, + "loss": 1.7911, + "step": 10798 + }, + { + "epoch": 3.314610190300798, + "grad_norm": 0.29467105865478516, + "learning_rate": 7.800797663803578e-05, + "loss": 1.8472, + "step": 10799 + }, + { + "epoch": 3.314917127071823, + "grad_norm": 0.384974867105484, + 
"learning_rate": 7.800385896364891e-05, + "loss": 1.8139, + "step": 10800 + }, + { + "epoch": 3.3152240638428485, + "grad_norm": 0.4605129063129425, + "learning_rate": 7.79997410125115e-05, + "loss": 1.7982, + "step": 10801 + }, + { + "epoch": 3.3155310006138734, + "grad_norm": 0.2982464134693146, + "learning_rate": 7.799562278466423e-05, + "loss": 1.8496, + "step": 10802 + }, + { + "epoch": 3.3158379373848987, + "grad_norm": 0.3101392984390259, + "learning_rate": 7.79915042801478e-05, + "loss": 1.8172, + "step": 10803 + }, + { + "epoch": 3.316144874155924, + "grad_norm": 0.3651282489299774, + "learning_rate": 7.798738549900292e-05, + "loss": 1.7497, + "step": 10804 + }, + { + "epoch": 3.316451810926949, + "grad_norm": 0.28504419326782227, + "learning_rate": 7.79832664412703e-05, + "loss": 1.8027, + "step": 10805 + }, + { + "epoch": 3.3167587476979743, + "grad_norm": 0.28333309292793274, + "learning_rate": 7.797914710699063e-05, + "loss": 1.8121, + "step": 10806 + }, + { + "epoch": 3.317065684468999, + "grad_norm": 0.37549784779548645, + "learning_rate": 7.797502749620462e-05, + "loss": 1.817, + "step": 10807 + }, + { + "epoch": 3.3173726212400245, + "grad_norm": 0.3864210844039917, + "learning_rate": 7.797090760895301e-05, + "loss": 1.852, + "step": 10808 + }, + { + "epoch": 3.31767955801105, + "grad_norm": 0.2422102987766266, + "learning_rate": 7.79667874452765e-05, + "loss": 1.7523, + "step": 10809 + }, + { + "epoch": 3.3179864947820747, + "grad_norm": 0.307892382144928, + "learning_rate": 7.79626670052158e-05, + "loss": 1.7436, + "step": 10810 + }, + { + "epoch": 3.3182934315531, + "grad_norm": 0.29607462882995605, + "learning_rate": 7.795854628881162e-05, + "loss": 1.768, + "step": 10811 + }, + { + "epoch": 3.3186003683241254, + "grad_norm": 0.23334427177906036, + "learning_rate": 7.795442529610471e-05, + "loss": 1.7687, + "step": 10812 + }, + { + "epoch": 3.3189073050951503, + "grad_norm": 0.26257455348968506, + "learning_rate": 7.795030402713578e-05, + 
"loss": 1.8266, + "step": 10813 + }, + { + "epoch": 3.3192142418661756, + "grad_norm": 0.3252788782119751, + "learning_rate": 7.794618248194556e-05, + "loss": 1.8645, + "step": 10814 + }, + { + "epoch": 3.319521178637201, + "grad_norm": 0.3807232975959778, + "learning_rate": 7.79420606605748e-05, + "loss": 1.8154, + "step": 10815 + }, + { + "epoch": 3.319828115408226, + "grad_norm": 0.3395625948905945, + "learning_rate": 7.793793856306422e-05, + "loss": 1.8002, + "step": 10816 + }, + { + "epoch": 3.320135052179251, + "grad_norm": 0.2896415889263153, + "learning_rate": 7.793381618945455e-05, + "loss": 1.8077, + "step": 10817 + }, + { + "epoch": 3.320441988950276, + "grad_norm": 0.27733489871025085, + "learning_rate": 7.792969353978652e-05, + "loss": 1.7976, + "step": 10818 + }, + { + "epoch": 3.3207489257213014, + "grad_norm": 0.36985141038894653, + "learning_rate": 7.79255706141009e-05, + "loss": 1.8724, + "step": 10819 + }, + { + "epoch": 3.3210558624923268, + "grad_norm": 0.37886983156204224, + "learning_rate": 7.792144741243843e-05, + "loss": 1.8249, + "step": 10820 + }, + { + "epoch": 3.3213627992633517, + "grad_norm": 0.3030721843242645, + "learning_rate": 7.791732393483986e-05, + "loss": 1.7975, + "step": 10821 + }, + { + "epoch": 3.321669736034377, + "grad_norm": 0.2637709081172943, + "learning_rate": 7.791320018134592e-05, + "loss": 1.7205, + "step": 10822 + }, + { + "epoch": 3.321976672805402, + "grad_norm": 0.35307520627975464, + "learning_rate": 7.790907615199736e-05, + "loss": 1.8786, + "step": 10823 + }, + { + "epoch": 3.322283609576427, + "grad_norm": 0.3333272635936737, + "learning_rate": 7.790495184683497e-05, + "loss": 1.7715, + "step": 10824 + }, + { + "epoch": 3.3225905463474525, + "grad_norm": 0.2597469091415405, + "learning_rate": 7.790082726589948e-05, + "loss": 1.8379, + "step": 10825 + }, + { + "epoch": 3.3228974831184774, + "grad_norm": 0.34176257252693176, + "learning_rate": 7.789670240923168e-05, + "loss": 1.8305, + "step": 10826 + }, + { 
+ "epoch": 3.3232044198895028, + "grad_norm": 0.37954533100128174, + "learning_rate": 7.789257727687229e-05, + "loss": 1.7728, + "step": 10827 + }, + { + "epoch": 3.323511356660528, + "grad_norm": 0.2840248644351959, + "learning_rate": 7.788845186886212e-05, + "loss": 1.8059, + "step": 10828 + }, + { + "epoch": 3.323818293431553, + "grad_norm": 0.3650275766849518, + "learning_rate": 7.788432618524193e-05, + "loss": 1.8127, + "step": 10829 + }, + { + "epoch": 3.3241252302025783, + "grad_norm": 0.4869692623615265, + "learning_rate": 7.788020022605247e-05, + "loss": 1.833, + "step": 10830 + }, + { + "epoch": 3.3244321669736037, + "grad_norm": 0.3419482707977295, + "learning_rate": 7.787607399133453e-05, + "loss": 1.7812, + "step": 10831 + }, + { + "epoch": 3.3247391037446286, + "grad_norm": 0.27625617384910583, + "learning_rate": 7.787194748112889e-05, + "loss": 1.8513, + "step": 10832 + }, + { + "epoch": 3.325046040515654, + "grad_norm": 0.4287806749343872, + "learning_rate": 7.786782069547633e-05, + "loss": 1.836, + "step": 10833 + }, + { + "epoch": 3.325352977286679, + "grad_norm": 0.4345545172691345, + "learning_rate": 7.786369363441763e-05, + "loss": 1.8027, + "step": 10834 + }, + { + "epoch": 3.325659914057704, + "grad_norm": 0.32976534962654114, + "learning_rate": 7.78595662979936e-05, + "loss": 1.7987, + "step": 10835 + }, + { + "epoch": 3.3259668508287294, + "grad_norm": 0.2677469849586487, + "learning_rate": 7.785543868624498e-05, + "loss": 1.8312, + "step": 10836 + }, + { + "epoch": 3.3262737875997543, + "grad_norm": 0.2547740638256073, + "learning_rate": 7.785131079921259e-05, + "loss": 1.7844, + "step": 10837 + }, + { + "epoch": 3.3265807243707797, + "grad_norm": 0.26755592226982117, + "learning_rate": 7.784718263693725e-05, + "loss": 1.8263, + "step": 10838 + }, + { + "epoch": 3.3268876611418046, + "grad_norm": 0.23884403705596924, + "learning_rate": 7.784305419945969e-05, + "loss": 1.7862, + "step": 10839 + }, + { + "epoch": 3.32719459791283, + 
"grad_norm": 0.2896903157234192, + "learning_rate": 7.783892548682077e-05, + "loss": 1.9138, + "step": 10840 + }, + { + "epoch": 3.3275015346838552, + "grad_norm": 0.3201359510421753, + "learning_rate": 7.783479649906127e-05, + "loss": 1.8382, + "step": 10841 + }, + { + "epoch": 3.32780847145488, + "grad_norm": 0.39285311102867126, + "learning_rate": 7.7830667236222e-05, + "loss": 1.7763, + "step": 10842 + }, + { + "epoch": 3.3281154082259055, + "grad_norm": 0.435007244348526, + "learning_rate": 7.782653769834376e-05, + "loss": 1.8415, + "step": 10843 + }, + { + "epoch": 3.328422344996931, + "grad_norm": 0.34605318307876587, + "learning_rate": 7.782240788546736e-05, + "loss": 1.757, + "step": 10844 + }, + { + "epoch": 3.3287292817679557, + "grad_norm": 0.26830604672431946, + "learning_rate": 7.781827779763362e-05, + "loss": 1.7779, + "step": 10845 + }, + { + "epoch": 3.329036218538981, + "grad_norm": 0.41851529479026794, + "learning_rate": 7.781414743488336e-05, + "loss": 1.8609, + "step": 10846 + }, + { + "epoch": 3.3293431553100064, + "grad_norm": 0.5058079361915588, + "learning_rate": 7.78100167972574e-05, + "loss": 1.8146, + "step": 10847 + }, + { + "epoch": 3.3296500920810312, + "grad_norm": 0.34394967555999756, + "learning_rate": 7.780588588479654e-05, + "loss": 1.8079, + "step": 10848 + }, + { + "epoch": 3.3299570288520566, + "grad_norm": 0.3033885061740875, + "learning_rate": 7.780175469754161e-05, + "loss": 1.8223, + "step": 10849 + }, + { + "epoch": 3.3302639656230815, + "grad_norm": 0.4431045651435852, + "learning_rate": 7.779762323553347e-05, + "loss": 1.8841, + "step": 10850 + }, + { + "epoch": 3.330570902394107, + "grad_norm": 0.3451448976993561, + "learning_rate": 7.77934914988129e-05, + "loss": 1.8092, + "step": 10851 + }, + { + "epoch": 3.330877839165132, + "grad_norm": 0.26580891013145447, + "learning_rate": 7.778935948742077e-05, + "loss": 1.8244, + "step": 10852 + }, + { + "epoch": 3.331184775936157, + "grad_norm": 0.32079070806503296, + 
"learning_rate": 7.778522720139792e-05, + "loss": 1.7816, + "step": 10853 + }, + { + "epoch": 3.3314917127071824, + "grad_norm": 0.35789042711257935, + "learning_rate": 7.778109464078514e-05, + "loss": 1.8211, + "step": 10854 + }, + { + "epoch": 3.3317986494782073, + "grad_norm": 0.2808612585067749, + "learning_rate": 7.77769618056233e-05, + "loss": 1.8387, + "step": 10855 + }, + { + "epoch": 3.3321055862492326, + "grad_norm": 0.24760548770427704, + "learning_rate": 7.777282869595326e-05, + "loss": 1.7795, + "step": 10856 + }, + { + "epoch": 3.332412523020258, + "grad_norm": 0.2840912640094757, + "learning_rate": 7.776869531181583e-05, + "loss": 1.7492, + "step": 10857 + }, + { + "epoch": 3.332719459791283, + "grad_norm": 0.2881413698196411, + "learning_rate": 7.77645616532519e-05, + "loss": 1.8157, + "step": 10858 + }, + { + "epoch": 3.333026396562308, + "grad_norm": 0.2508779764175415, + "learning_rate": 7.776042772030228e-05, + "loss": 1.8196, + "step": 10859 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3307822048664093, + "learning_rate": 7.775629351300785e-05, + "loss": 1.8195, + "step": 10860 + }, + { + "epoch": 3.3336402701043584, + "grad_norm": 0.34392043948173523, + "learning_rate": 7.775215903140946e-05, + "loss": 1.7775, + "step": 10861 + }, + { + "epoch": 3.3339472068753837, + "grad_norm": 0.2594252824783325, + "learning_rate": 7.774802427554796e-05, + "loss": 1.7687, + "step": 10862 + }, + { + "epoch": 3.334254143646409, + "grad_norm": 0.3109053075313568, + "learning_rate": 7.774388924546423e-05, + "loss": 1.7908, + "step": 10863 + }, + { + "epoch": 3.334561080417434, + "grad_norm": 0.4801923930644989, + "learning_rate": 7.773975394119913e-05, + "loss": 1.8316, + "step": 10864 + }, + { + "epoch": 3.3348680171884593, + "grad_norm": 0.4754973351955414, + "learning_rate": 7.77356183627935e-05, + "loss": 1.8015, + "step": 10865 + }, + { + "epoch": 3.335174953959484, + "grad_norm": 0.29624658823013306, + "learning_rate": 7.773148251028825e-05, + 
"loss": 1.8179, + "step": 10866 + }, + { + "epoch": 3.3354818907305095, + "grad_norm": 0.32207581400871277, + "learning_rate": 7.772734638372423e-05, + "loss": 1.799, + "step": 10867 + }, + { + "epoch": 3.335788827501535, + "grad_norm": 0.5227517485618591, + "learning_rate": 7.772320998314233e-05, + "loss": 1.8452, + "step": 10868 + }, + { + "epoch": 3.3360957642725597, + "grad_norm": 0.4081100523471832, + "learning_rate": 7.771907330858341e-05, + "loss": 1.8182, + "step": 10869 + }, + { + "epoch": 3.336402701043585, + "grad_norm": 0.23786653578281403, + "learning_rate": 7.771493636008838e-05, + "loss": 1.7392, + "step": 10870 + }, + { + "epoch": 3.33670963781461, + "grad_norm": 0.37913820147514343, + "learning_rate": 7.771079913769807e-05, + "loss": 1.7559, + "step": 10871 + }, + { + "epoch": 3.3370165745856353, + "grad_norm": 0.4939163625240326, + "learning_rate": 7.770666164145344e-05, + "loss": 1.8076, + "step": 10872 + }, + { + "epoch": 3.3373235113566606, + "grad_norm": 0.3322528302669525, + "learning_rate": 7.770252387139532e-05, + "loss": 1.8045, + "step": 10873 + }, + { + "epoch": 3.337630448127686, + "grad_norm": 0.3685782849788666, + "learning_rate": 7.769838582756461e-05, + "loss": 1.7703, + "step": 10874 + }, + { + "epoch": 3.337937384898711, + "grad_norm": 0.5564271807670593, + "learning_rate": 7.769424751000224e-05, + "loss": 1.7697, + "step": 10875 + }, + { + "epoch": 3.338244321669736, + "grad_norm": 0.38610726594924927, + "learning_rate": 7.769010891874906e-05, + "loss": 1.7944, + "step": 10876 + }, + { + "epoch": 3.338551258440761, + "grad_norm": 0.23838558793067932, + "learning_rate": 7.768597005384602e-05, + "loss": 1.765, + "step": 10877 + }, + { + "epoch": 3.3388581952117864, + "grad_norm": 0.4334571063518524, + "learning_rate": 7.768183091533399e-05, + "loss": 1.7854, + "step": 10878 + }, + { + "epoch": 3.3391651319828117, + "grad_norm": 0.44844719767570496, + "learning_rate": 7.767769150325386e-05, + "loss": 1.7955, + "step": 10879 + }, + { 
+ "epoch": 3.3394720687538366, + "grad_norm": 0.26543378829956055, + "learning_rate": 7.767355181764659e-05, + "loss": 1.8311, + "step": 10880 + }, + { + "epoch": 3.339779005524862, + "grad_norm": 0.39401358366012573, + "learning_rate": 7.766941185855304e-05, + "loss": 1.8264, + "step": 10881 + }, + { + "epoch": 3.340085942295887, + "grad_norm": 0.5476824045181274, + "learning_rate": 7.766527162601416e-05, + "loss": 1.8051, + "step": 10882 + }, + { + "epoch": 3.340392879066912, + "grad_norm": 0.4021138548851013, + "learning_rate": 7.766113112007084e-05, + "loss": 1.7941, + "step": 10883 + }, + { + "epoch": 3.3406998158379375, + "grad_norm": 0.3262040317058563, + "learning_rate": 7.765699034076402e-05, + "loss": 1.8317, + "step": 10884 + }, + { + "epoch": 3.3410067526089624, + "grad_norm": 0.5461146831512451, + "learning_rate": 7.765284928813459e-05, + "loss": 1.833, + "step": 10885 + }, + { + "epoch": 3.3413136893799877, + "grad_norm": 0.5067405700683594, + "learning_rate": 7.764870796222351e-05, + "loss": 1.7862, + "step": 10886 + }, + { + "epoch": 3.341620626151013, + "grad_norm": 0.2731069028377533, + "learning_rate": 7.76445663630717e-05, + "loss": 1.8173, + "step": 10887 + }, + { + "epoch": 3.341927562922038, + "grad_norm": 0.48928195238113403, + "learning_rate": 7.764042449072008e-05, + "loss": 1.7992, + "step": 10888 + }, + { + "epoch": 3.3422344996930633, + "grad_norm": 0.5338504910469055, + "learning_rate": 7.763628234520958e-05, + "loss": 1.7891, + "step": 10889 + }, + { + "epoch": 3.3425414364640886, + "grad_norm": 0.3136523365974426, + "learning_rate": 7.763213992658114e-05, + "loss": 1.8623, + "step": 10890 + }, + { + "epoch": 3.3428483732351135, + "grad_norm": 0.36551395058631897, + "learning_rate": 7.762799723487568e-05, + "loss": 1.8474, + "step": 10891 + }, + { + "epoch": 3.343155310006139, + "grad_norm": 0.35772353410720825, + "learning_rate": 7.762385427013419e-05, + "loss": 1.8625, + "step": 10892 + }, + { + "epoch": 3.3434622467771637, + 
"grad_norm": 0.29944708943367004, + "learning_rate": 7.761971103239755e-05, + "loss": 1.8181, + "step": 10893 + }, + { + "epoch": 3.343769183548189, + "grad_norm": 0.3395330309867859, + "learning_rate": 7.761556752170676e-05, + "loss": 1.7943, + "step": 10894 + }, + { + "epoch": 3.3440761203192144, + "grad_norm": 0.3624265193939209, + "learning_rate": 7.761142373810274e-05, + "loss": 1.8234, + "step": 10895 + }, + { + "epoch": 3.3443830570902393, + "grad_norm": 0.25409621000289917, + "learning_rate": 7.760727968162644e-05, + "loss": 1.7532, + "step": 10896 + }, + { + "epoch": 3.3446899938612646, + "grad_norm": 0.321437805891037, + "learning_rate": 7.760313535231883e-05, + "loss": 1.8808, + "step": 10897 + }, + { + "epoch": 3.3449969306322895, + "grad_norm": 0.2919142544269562, + "learning_rate": 7.759899075022086e-05, + "loss": 1.7677, + "step": 10898 + }, + { + "epoch": 3.345303867403315, + "grad_norm": 0.26515716314315796, + "learning_rate": 7.759484587537346e-05, + "loss": 1.8118, + "step": 10899 + }, + { + "epoch": 3.34561080417434, + "grad_norm": 0.2963240146636963, + "learning_rate": 7.759070072781764e-05, + "loss": 1.8329, + "step": 10900 + }, + { + "epoch": 3.345917740945365, + "grad_norm": 0.3186480700969696, + "learning_rate": 7.758655530759435e-05, + "loss": 1.8013, + "step": 10901 + }, + { + "epoch": 3.3462246777163904, + "grad_norm": 0.256145715713501, + "learning_rate": 7.758240961474454e-05, + "loss": 1.7865, + "step": 10902 + }, + { + "epoch": 3.3465316144874158, + "grad_norm": 0.28951629996299744, + "learning_rate": 7.757826364930921e-05, + "loss": 1.8091, + "step": 10903 + }, + { + "epoch": 3.3468385512584407, + "grad_norm": 0.2692483365535736, + "learning_rate": 7.75741174113293e-05, + "loss": 1.8308, + "step": 10904 + }, + { + "epoch": 3.347145488029466, + "grad_norm": 0.27615389227867126, + "learning_rate": 7.75699709008458e-05, + "loss": 1.7888, + "step": 10905 + }, + { + "epoch": 3.3474524248004913, + "grad_norm": 0.2819034457206726, + 
"learning_rate": 7.75658241178997e-05, + "loss": 1.7624, + "step": 10906 + }, + { + "epoch": 3.347759361571516, + "grad_norm": 0.2627592086791992, + "learning_rate": 7.756167706253196e-05, + "loss": 1.7696, + "step": 10907 + }, + { + "epoch": 3.3480662983425415, + "grad_norm": 0.3528621196746826, + "learning_rate": 7.755752973478356e-05, + "loss": 1.7725, + "step": 10908 + }, + { + "epoch": 3.3483732351135664, + "grad_norm": 0.35949698090553284, + "learning_rate": 7.755338213469552e-05, + "loss": 1.8163, + "step": 10909 + }, + { + "epoch": 3.3486801718845918, + "grad_norm": 0.25142577290534973, + "learning_rate": 7.75492342623088e-05, + "loss": 1.7879, + "step": 10910 + }, + { + "epoch": 3.348987108655617, + "grad_norm": 0.25766023993492126, + "learning_rate": 7.75450861176644e-05, + "loss": 1.8143, + "step": 10911 + }, + { + "epoch": 3.349294045426642, + "grad_norm": 0.2736956477165222, + "learning_rate": 7.754093770080331e-05, + "loss": 1.8907, + "step": 10912 + }, + { + "epoch": 3.3496009821976673, + "grad_norm": 0.23700755834579468, + "learning_rate": 7.753678901176654e-05, + "loss": 1.813, + "step": 10913 + }, + { + "epoch": 3.349907918968692, + "grad_norm": 0.245509073138237, + "learning_rate": 7.753264005059507e-05, + "loss": 1.8019, + "step": 10914 + }, + { + "epoch": 3.3502148557397176, + "grad_norm": 0.232910618185997, + "learning_rate": 7.752849081732993e-05, + "loss": 1.784, + "step": 10915 + }, + { + "epoch": 3.350521792510743, + "grad_norm": 0.22989360988140106, + "learning_rate": 7.75243413120121e-05, + "loss": 1.7597, + "step": 10916 + }, + { + "epoch": 3.350828729281768, + "grad_norm": 0.2093925178050995, + "learning_rate": 7.752019153468258e-05, + "loss": 1.7698, + "step": 10917 + }, + { + "epoch": 3.351135666052793, + "grad_norm": 0.25539630651474, + "learning_rate": 7.751604148538241e-05, + "loss": 1.8287, + "step": 10918 + }, + { + "epoch": 3.3514426028238185, + "grad_norm": 0.2731820046901703, + "learning_rate": 7.75118911641526e-05, + "loss": 
1.8862, + "step": 10919 + }, + { + "epoch": 3.3517495395948433, + "grad_norm": 0.2464541345834732, + "learning_rate": 7.750774057103416e-05, + "loss": 1.8165, + "step": 10920 + }, + { + "epoch": 3.3520564763658687, + "grad_norm": 0.26380276679992676, + "learning_rate": 7.75035897060681e-05, + "loss": 1.78, + "step": 10921 + }, + { + "epoch": 3.352363413136894, + "grad_norm": 0.3080748915672302, + "learning_rate": 7.749943856929542e-05, + "loss": 1.7925, + "step": 10922 + }, + { + "epoch": 3.352670349907919, + "grad_norm": 0.317754864692688, + "learning_rate": 7.74952871607572e-05, + "loss": 1.8248, + "step": 10923 + }, + { + "epoch": 3.3529772866789442, + "grad_norm": 0.2525196373462677, + "learning_rate": 7.749113548049442e-05, + "loss": 1.762, + "step": 10924 + }, + { + "epoch": 3.353284223449969, + "grad_norm": 0.3149549961090088, + "learning_rate": 7.748698352854814e-05, + "loss": 1.8289, + "step": 10925 + }, + { + "epoch": 3.3535911602209945, + "grad_norm": 0.35744383931159973, + "learning_rate": 7.748283130495937e-05, + "loss": 1.8132, + "step": 10926 + }, + { + "epoch": 3.35389809699202, + "grad_norm": 0.28599128127098083, + "learning_rate": 7.747867880976916e-05, + "loss": 1.7351, + "step": 10927 + }, + { + "epoch": 3.3542050337630447, + "grad_norm": 0.24428869783878326, + "learning_rate": 7.747452604301852e-05, + "loss": 1.794, + "step": 10928 + }, + { + "epoch": 3.35451197053407, + "grad_norm": 0.29067808389663696, + "learning_rate": 7.747037300474854e-05, + "loss": 1.8181, + "step": 10929 + }, + { + "epoch": 3.354818907305095, + "grad_norm": 0.32417505979537964, + "learning_rate": 7.746621969500021e-05, + "loss": 1.8338, + "step": 10930 + }, + { + "epoch": 3.3551258440761202, + "grad_norm": 0.29536551237106323, + "learning_rate": 7.746206611381462e-05, + "loss": 1.8732, + "step": 10931 + }, + { + "epoch": 3.3554327808471456, + "grad_norm": 0.3169345259666443, + "learning_rate": 7.745791226123278e-05, + "loss": 1.876, + "step": 10932 + }, + { + "epoch": 
3.3557397176181705, + "grad_norm": 0.2680271565914154, + "learning_rate": 7.745375813729576e-05, + "loss": 1.7347, + "step": 10933 + }, + { + "epoch": 3.356046654389196, + "grad_norm": 0.28339266777038574, + "learning_rate": 7.74496037420446e-05, + "loss": 1.8507, + "step": 10934 + }, + { + "epoch": 3.356353591160221, + "grad_norm": 0.2567409574985504, + "learning_rate": 7.744544907552038e-05, + "loss": 1.8244, + "step": 10935 + }, + { + "epoch": 3.356660527931246, + "grad_norm": 0.266063928604126, + "learning_rate": 7.744129413776416e-05, + "loss": 1.7864, + "step": 10936 + }, + { + "epoch": 3.3569674647022714, + "grad_norm": 0.2490999698638916, + "learning_rate": 7.743713892881696e-05, + "loss": 1.7637, + "step": 10937 + }, + { + "epoch": 3.3572744014732967, + "grad_norm": 0.25857025384902954, + "learning_rate": 7.743298344871988e-05, + "loss": 1.8101, + "step": 10938 + }, + { + "epoch": 3.3575813382443216, + "grad_norm": 0.2549006938934326, + "learning_rate": 7.742882769751398e-05, + "loss": 1.7782, + "step": 10939 + }, + { + "epoch": 3.357888275015347, + "grad_norm": 0.23915350437164307, + "learning_rate": 7.742467167524035e-05, + "loss": 1.7822, + "step": 10940 + }, + { + "epoch": 3.358195211786372, + "grad_norm": 0.25501590967178345, + "learning_rate": 7.742051538194e-05, + "loss": 1.798, + "step": 10941 + }, + { + "epoch": 3.358502148557397, + "grad_norm": 0.29332005977630615, + "learning_rate": 7.741635881765408e-05, + "loss": 1.8334, + "step": 10942 + }, + { + "epoch": 3.3588090853284225, + "grad_norm": 0.28878241777420044, + "learning_rate": 7.741220198242362e-05, + "loss": 1.8266, + "step": 10943 + }, + { + "epoch": 3.3591160220994474, + "grad_norm": 0.3068650960922241, + "learning_rate": 7.740804487628971e-05, + "loss": 1.8562, + "step": 10944 + }, + { + "epoch": 3.3594229588704727, + "grad_norm": 0.2522405683994293, + "learning_rate": 7.740388749929343e-05, + "loss": 1.8001, + "step": 10945 + }, + { + "epoch": 3.359729895641498, + "grad_norm": 
0.3073521554470062, + "learning_rate": 7.739972985147588e-05, + "loss": 1.7454, + "step": 10946 + }, + { + "epoch": 3.360036832412523, + "grad_norm": 0.3018052577972412, + "learning_rate": 7.739557193287815e-05, + "loss": 1.7888, + "step": 10947 + }, + { + "epoch": 3.3603437691835483, + "grad_norm": 0.2738604247570038, + "learning_rate": 7.73914137435413e-05, + "loss": 1.7208, + "step": 10948 + }, + { + "epoch": 3.3606507059545736, + "grad_norm": 0.37699586153030396, + "learning_rate": 7.738725528350646e-05, + "loss": 1.8175, + "step": 10949 + }, + { + "epoch": 3.3609576427255985, + "grad_norm": 0.3479778468608856, + "learning_rate": 7.738309655281471e-05, + "loss": 1.818, + "step": 10950 + }, + { + "epoch": 3.361264579496624, + "grad_norm": 0.24871166050434113, + "learning_rate": 7.737893755150715e-05, + "loss": 1.7046, + "step": 10951 + }, + { + "epoch": 3.3615715162676487, + "grad_norm": 0.45015642046928406, + "learning_rate": 7.737477827962488e-05, + "loss": 1.8517, + "step": 10952 + }, + { + "epoch": 3.361878453038674, + "grad_norm": 0.4149077534675598, + "learning_rate": 7.7370618737209e-05, + "loss": 1.7403, + "step": 10953 + }, + { + "epoch": 3.3621853898096994, + "grad_norm": 0.2556059658527374, + "learning_rate": 7.736645892430064e-05, + "loss": 1.8167, + "step": 10954 + }, + { + "epoch": 3.3624923265807243, + "grad_norm": 0.3153657615184784, + "learning_rate": 7.736229884094088e-05, + "loss": 1.8471, + "step": 10955 + }, + { + "epoch": 3.3627992633517496, + "grad_norm": 0.27943772077560425, + "learning_rate": 7.735813848717084e-05, + "loss": 1.7742, + "step": 10956 + }, + { + "epoch": 3.3631062001227745, + "grad_norm": 0.28270283341407776, + "learning_rate": 7.735397786303164e-05, + "loss": 1.8418, + "step": 10957 + }, + { + "epoch": 3.3634131368938, + "grad_norm": 0.3596261441707611, + "learning_rate": 7.734981696856442e-05, + "loss": 1.8213, + "step": 10958 + }, + { + "epoch": 3.363720073664825, + "grad_norm": 0.3678492307662964, + "learning_rate": 
7.734565580381026e-05, + "loss": 1.806, + "step": 10959 + }, + { + "epoch": 3.36402701043585, + "grad_norm": 0.27758681774139404, + "learning_rate": 7.734149436881031e-05, + "loss": 1.7832, + "step": 10960 + }, + { + "epoch": 3.3643339472068754, + "grad_norm": 0.2821379005908966, + "learning_rate": 7.733733266360568e-05, + "loss": 1.8888, + "step": 10961 + }, + { + "epoch": 3.3646408839779007, + "grad_norm": 0.33676958084106445, + "learning_rate": 7.733317068823751e-05, + "loss": 1.902, + "step": 10962 + }, + { + "epoch": 3.3649478207489256, + "grad_norm": 0.3116114139556885, + "learning_rate": 7.732900844274691e-05, + "loss": 1.8228, + "step": 10963 + }, + { + "epoch": 3.365254757519951, + "grad_norm": 0.3286324143409729, + "learning_rate": 7.732484592717506e-05, + "loss": 1.8707, + "step": 10964 + }, + { + "epoch": 3.3655616942909763, + "grad_norm": 0.2732192873954773, + "learning_rate": 7.732068314156304e-05, + "loss": 1.773, + "step": 10965 + }, + { + "epoch": 3.365868631062001, + "grad_norm": 0.26663896441459656, + "learning_rate": 7.731652008595204e-05, + "loss": 1.7837, + "step": 10966 + }, + { + "epoch": 3.3661755678330265, + "grad_norm": 0.27447745203971863, + "learning_rate": 7.731235676038317e-05, + "loss": 1.9103, + "step": 10967 + }, + { + "epoch": 3.3664825046040514, + "grad_norm": 0.30832916498184204, + "learning_rate": 7.730819316489757e-05, + "loss": 1.7552, + "step": 10968 + }, + { + "epoch": 3.3667894413750767, + "grad_norm": 0.29657161235809326, + "learning_rate": 7.73040292995364e-05, + "loss": 1.7654, + "step": 10969 + }, + { + "epoch": 3.367096378146102, + "grad_norm": 0.30434274673461914, + "learning_rate": 7.729986516434082e-05, + "loss": 1.8646, + "step": 10970 + }, + { + "epoch": 3.367403314917127, + "grad_norm": 0.25926661491394043, + "learning_rate": 7.729570075935198e-05, + "loss": 1.7555, + "step": 10971 + }, + { + "epoch": 3.3677102516881523, + "grad_norm": 0.2775980532169342, + "learning_rate": 7.729153608461102e-05, + "loss": 
1.8427, + "step": 10972 + }, + { + "epoch": 3.368017188459177, + "grad_norm": 0.23915666341781616, + "learning_rate": 7.72873711401591e-05, + "loss": 1.7902, + "step": 10973 + }, + { + "epoch": 3.3683241252302025, + "grad_norm": 0.2603691518306732, + "learning_rate": 7.728320592603737e-05, + "loss": 1.8587, + "step": 10974 + }, + { + "epoch": 3.368631062001228, + "grad_norm": 0.2579508125782013, + "learning_rate": 7.727904044228703e-05, + "loss": 1.7617, + "step": 10975 + }, + { + "epoch": 3.3689379987722528, + "grad_norm": 0.3384297788143158, + "learning_rate": 7.72748746889492e-05, + "loss": 1.8499, + "step": 10976 + }, + { + "epoch": 3.369244935543278, + "grad_norm": 0.36756646633148193, + "learning_rate": 7.727070866606509e-05, + "loss": 1.808, + "step": 10977 + }, + { + "epoch": 3.3695518723143034, + "grad_norm": 0.3212372958660126, + "learning_rate": 7.726654237367587e-05, + "loss": 1.8245, + "step": 10978 + }, + { + "epoch": 3.3698588090853283, + "grad_norm": 0.23782415688037872, + "learning_rate": 7.726237581182267e-05, + "loss": 1.7629, + "step": 10979 + }, + { + "epoch": 3.3701657458563536, + "grad_norm": 0.2782919108867645, + "learning_rate": 7.725820898054669e-05, + "loss": 1.8, + "step": 10980 + }, + { + "epoch": 3.370472682627379, + "grad_norm": 0.2973455488681793, + "learning_rate": 7.725404187988914e-05, + "loss": 1.7949, + "step": 10981 + }, + { + "epoch": 3.370779619398404, + "grad_norm": 0.2875392735004425, + "learning_rate": 7.724987450989114e-05, + "loss": 1.8019, + "step": 10982 + }, + { + "epoch": 3.371086556169429, + "grad_norm": 0.26133236289024353, + "learning_rate": 7.724570687059394e-05, + "loss": 1.7984, + "step": 10983 + }, + { + "epoch": 3.371393492940454, + "grad_norm": 0.2760173976421356, + "learning_rate": 7.724153896203867e-05, + "loss": 1.8082, + "step": 10984 + }, + { + "epoch": 3.3717004297114794, + "grad_norm": 0.26373061537742615, + "learning_rate": 7.723737078426656e-05, + "loss": 1.8408, + "step": 10985 + }, + { + "epoch": 
3.3720073664825048, + "grad_norm": 0.29425618052482605, + "learning_rate": 7.723320233731879e-05, + "loss": 1.7992, + "step": 10986 + }, + { + "epoch": 3.3723143032535297, + "grad_norm": 0.29822099208831787, + "learning_rate": 7.722903362123655e-05, + "loss": 1.8204, + "step": 10987 + }, + { + "epoch": 3.372621240024555, + "grad_norm": 0.25945618748664856, + "learning_rate": 7.722486463606104e-05, + "loss": 1.7376, + "step": 10988 + }, + { + "epoch": 3.37292817679558, + "grad_norm": 0.26367196440696716, + "learning_rate": 7.722069538183345e-05, + "loss": 1.814, + "step": 10989 + }, + { + "epoch": 3.373235113566605, + "grad_norm": 0.25015249848365784, + "learning_rate": 7.7216525858595e-05, + "loss": 1.8199, + "step": 10990 + }, + { + "epoch": 3.3735420503376305, + "grad_norm": 0.3035781681537628, + "learning_rate": 7.72123560663869e-05, + "loss": 1.739, + "step": 10991 + }, + { + "epoch": 3.3738489871086554, + "grad_norm": 0.2847912013530731, + "learning_rate": 7.720818600525033e-05, + "loss": 1.8754, + "step": 10992 + }, + { + "epoch": 3.3741559238796808, + "grad_norm": 0.2533976435661316, + "learning_rate": 7.720401567522653e-05, + "loss": 1.7616, + "step": 10993 + }, + { + "epoch": 3.374462860650706, + "grad_norm": 0.250828355550766, + "learning_rate": 7.719984507635669e-05, + "loss": 1.7973, + "step": 10994 + }, + { + "epoch": 3.374769797421731, + "grad_norm": 0.3019898235797882, + "learning_rate": 7.719567420868206e-05, + "loss": 1.7563, + "step": 10995 + }, + { + "epoch": 3.3750767341927563, + "grad_norm": 0.2703310549259186, + "learning_rate": 7.719150307224382e-05, + "loss": 1.8183, + "step": 10996 + }, + { + "epoch": 3.3753836709637817, + "grad_norm": 0.2434745579957962, + "learning_rate": 7.718733166708321e-05, + "loss": 1.7913, + "step": 10997 + }, + { + "epoch": 3.3756906077348066, + "grad_norm": 0.28036773204803467, + "learning_rate": 7.718315999324146e-05, + "loss": 1.7884, + "step": 10998 + }, + { + "epoch": 3.375997544505832, + "grad_norm": 
0.25123077630996704, + "learning_rate": 7.717898805075978e-05, + "loss": 1.7394, + "step": 10999 + }, + { + "epoch": 3.376304481276857, + "grad_norm": 0.2313947230577469, + "learning_rate": 7.717481583967943e-05, + "loss": 1.7537, + "step": 11000 + }, + { + "epoch": 3.376611418047882, + "grad_norm": 0.27152860164642334, + "learning_rate": 7.71706433600416e-05, + "loss": 1.8596, + "step": 11001 + }, + { + "epoch": 3.3769183548189075, + "grad_norm": 0.32866382598876953, + "learning_rate": 7.716647061188757e-05, + "loss": 1.9007, + "step": 11002 + }, + { + "epoch": 3.3772252915899323, + "grad_norm": 0.2842368185520172, + "learning_rate": 7.716229759525854e-05, + "loss": 1.7781, + "step": 11003 + }, + { + "epoch": 3.3775322283609577, + "grad_norm": 0.30411216616630554, + "learning_rate": 7.715812431019576e-05, + "loss": 1.7403, + "step": 11004 + }, + { + "epoch": 3.3778391651319826, + "grad_norm": 0.31848132610321045, + "learning_rate": 7.71539507567405e-05, + "loss": 1.817, + "step": 11005 + }, + { + "epoch": 3.378146101903008, + "grad_norm": 0.24206148087978363, + "learning_rate": 7.714977693493397e-05, + "loss": 1.7796, + "step": 11006 + }, + { + "epoch": 3.3784530386740332, + "grad_norm": 0.2982998490333557, + "learning_rate": 7.714560284481742e-05, + "loss": 1.7883, + "step": 11007 + }, + { + "epoch": 3.378759975445058, + "grad_norm": 0.24857483804225922, + "learning_rate": 7.714142848643213e-05, + "loss": 1.7447, + "step": 11008 + }, + { + "epoch": 3.3790669122160835, + "grad_norm": 0.2509039044380188, + "learning_rate": 7.713725385981932e-05, + "loss": 1.8362, + "step": 11009 + }, + { + "epoch": 3.379373848987109, + "grad_norm": 0.2759779095649719, + "learning_rate": 7.713307896502027e-05, + "loss": 1.8655, + "step": 11010 + }, + { + "epoch": 3.3796807857581337, + "grad_norm": 0.264776349067688, + "learning_rate": 7.712890380207623e-05, + "loss": 1.8221, + "step": 11011 + }, + { + "epoch": 3.379987722529159, + "grad_norm": 0.2771971821784973, + "learning_rate": 
7.712472837102846e-05, + "loss": 1.6992, + "step": 11012 + }, + { + "epoch": 3.3802946593001844, + "grad_norm": 0.2749316096305847, + "learning_rate": 7.712055267191822e-05, + "loss": 1.8128, + "step": 11013 + }, + { + "epoch": 3.3806015960712092, + "grad_norm": 0.256656289100647, + "learning_rate": 7.71163767047868e-05, + "loss": 1.8382, + "step": 11014 + }, + { + "epoch": 3.3809085328422346, + "grad_norm": 0.27646976709365845, + "learning_rate": 7.711220046967545e-05, + "loss": 1.8321, + "step": 11015 + }, + { + "epoch": 3.3812154696132595, + "grad_norm": 0.3083149194717407, + "learning_rate": 7.710802396662542e-05, + "loss": 1.904, + "step": 11016 + }, + { + "epoch": 3.381522406384285, + "grad_norm": 0.2750856280326843, + "learning_rate": 7.710384719567803e-05, + "loss": 1.7596, + "step": 11017 + }, + { + "epoch": 3.38182934315531, + "grad_norm": 0.3029455244541168, + "learning_rate": 7.709967015687452e-05, + "loss": 1.8542, + "step": 11018 + }, + { + "epoch": 3.382136279926335, + "grad_norm": 0.3144093453884125, + "learning_rate": 7.709549285025622e-05, + "loss": 1.7489, + "step": 11019 + }, + { + "epoch": 3.3824432166973604, + "grad_norm": 0.2675442099571228, + "learning_rate": 7.709131527586433e-05, + "loss": 1.7324, + "step": 11020 + }, + { + "epoch": 3.3827501534683857, + "grad_norm": 0.2906095087528229, + "learning_rate": 7.708713743374021e-05, + "loss": 1.7848, + "step": 11021 + }, + { + "epoch": 3.3830570902394106, + "grad_norm": 0.25141623616218567, + "learning_rate": 7.708295932392513e-05, + "loss": 1.7423, + "step": 11022 + }, + { + "epoch": 3.383364027010436, + "grad_norm": 0.25832003355026245, + "learning_rate": 7.707878094646037e-05, + "loss": 1.7792, + "step": 11023 + }, + { + "epoch": 3.3836709637814613, + "grad_norm": 0.23710070550441742, + "learning_rate": 7.70746023013872e-05, + "loss": 1.7916, + "step": 11024 + }, + { + "epoch": 3.383977900552486, + "grad_norm": 0.286735862493515, + "learning_rate": 7.707042338874697e-05, + "loss": 1.8272, + 
"step": 11025 + }, + { + "epoch": 3.3842848373235115, + "grad_norm": 0.2536577582359314, + "learning_rate": 7.706624420858094e-05, + "loss": 1.7839, + "step": 11026 + }, + { + "epoch": 3.3845917740945364, + "grad_norm": 0.5564702749252319, + "learning_rate": 7.706206476093043e-05, + "loss": 1.7832, + "step": 11027 + }, + { + "epoch": 3.3848987108655617, + "grad_norm": 0.34694772958755493, + "learning_rate": 7.705788504583671e-05, + "loss": 1.8668, + "step": 11028 + }, + { + "epoch": 3.385205647636587, + "grad_norm": 0.30388176441192627, + "learning_rate": 7.705370506334113e-05, + "loss": 1.8244, + "step": 11029 + }, + { + "epoch": 3.385512584407612, + "grad_norm": 0.2998919188976288, + "learning_rate": 7.704952481348497e-05, + "loss": 1.7927, + "step": 11030 + }, + { + "epoch": 3.3858195211786373, + "grad_norm": 0.2714936435222626, + "learning_rate": 7.704534429630955e-05, + "loss": 1.8757, + "step": 11031 + }, + { + "epoch": 3.386126457949662, + "grad_norm": 0.26670241355895996, + "learning_rate": 7.704116351185619e-05, + "loss": 1.8146, + "step": 11032 + }, + { + "epoch": 3.3864333947206875, + "grad_norm": 0.2500552833080292, + "learning_rate": 7.703698246016621e-05, + "loss": 1.7984, + "step": 11033 + }, + { + "epoch": 3.386740331491713, + "grad_norm": 0.2494918406009674, + "learning_rate": 7.703280114128091e-05, + "loss": 1.7433, + "step": 11034 + }, + { + "epoch": 3.3870472682627377, + "grad_norm": 0.25658491253852844, + "learning_rate": 7.702861955524163e-05, + "loss": 1.8487, + "step": 11035 + }, + { + "epoch": 3.387354205033763, + "grad_norm": 0.2871410548686981, + "learning_rate": 7.702443770208969e-05, + "loss": 1.7919, + "step": 11036 + }, + { + "epoch": 3.3876611418047884, + "grad_norm": 0.3347938060760498, + "learning_rate": 7.702025558186643e-05, + "loss": 1.8091, + "step": 11037 + }, + { + "epoch": 3.3879680785758133, + "grad_norm": 0.39016643166542053, + "learning_rate": 7.701607319461315e-05, + "loss": 1.7816, + "step": 11038 + }, + { + "epoch": 
3.3882750153468386, + "grad_norm": 0.3423028290271759, + "learning_rate": 7.701189054037121e-05, + "loss": 1.8454, + "step": 11039 + }, + { + "epoch": 3.388581952117864, + "grad_norm": 0.27592089772224426, + "learning_rate": 7.700770761918192e-05, + "loss": 1.8431, + "step": 11040 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.46047264337539673, + "learning_rate": 7.700352443108665e-05, + "loss": 1.8412, + "step": 11041 + }, + { + "epoch": 3.389195825659914, + "grad_norm": 0.49226754903793335, + "learning_rate": 7.699934097612673e-05, + "loss": 1.8212, + "step": 11042 + }, + { + "epoch": 3.389502762430939, + "grad_norm": 0.3958778381347656, + "learning_rate": 7.699515725434348e-05, + "loss": 1.747, + "step": 11043 + }, + { + "epoch": 3.3898096992019644, + "grad_norm": 0.26097169518470764, + "learning_rate": 7.699097326577827e-05, + "loss": 1.7631, + "step": 11044 + }, + { + "epoch": 3.3901166359729897, + "grad_norm": 0.2922612130641937, + "learning_rate": 7.698678901047245e-05, + "loss": 1.7891, + "step": 11045 + }, + { + "epoch": 3.3904235727440146, + "grad_norm": 0.4195055365562439, + "learning_rate": 7.698260448846734e-05, + "loss": 1.7765, + "step": 11046 + }, + { + "epoch": 3.39073050951504, + "grad_norm": 0.4572988450527191, + "learning_rate": 7.697841969980434e-05, + "loss": 1.8085, + "step": 11047 + }, + { + "epoch": 3.391037446286065, + "grad_norm": 0.38819587230682373, + "learning_rate": 7.697423464452478e-05, + "loss": 1.8854, + "step": 11048 + }, + { + "epoch": 3.39134438305709, + "grad_norm": 0.27421653270721436, + "learning_rate": 7.697004932267003e-05, + "loss": 1.8327, + "step": 11049 + }, + { + "epoch": 3.3916513198281155, + "grad_norm": 0.33559146523475647, + "learning_rate": 7.696586373428142e-05, + "loss": 1.8109, + "step": 11050 + }, + { + "epoch": 3.3919582565991404, + "grad_norm": 0.39438655972480774, + "learning_rate": 7.696167787940037e-05, + "loss": 1.7909, + "step": 11051 + }, + { + "epoch": 3.3922651933701657, + "grad_norm": 
0.3425842523574829, + "learning_rate": 7.695749175806819e-05, + "loss": 1.8571, + "step": 11052 + }, + { + "epoch": 3.392572130141191, + "grad_norm": 0.2860080301761627, + "learning_rate": 7.695330537032628e-05, + "loss": 1.8546, + "step": 11053 + }, + { + "epoch": 3.392879066912216, + "grad_norm": 0.35894665122032166, + "learning_rate": 7.694911871621601e-05, + "loss": 1.7895, + "step": 11054 + }, + { + "epoch": 3.3931860036832413, + "grad_norm": 0.351193904876709, + "learning_rate": 7.694493179577879e-05, + "loss": 1.7453, + "step": 11055 + }, + { + "epoch": 3.3934929404542666, + "grad_norm": 0.24812865257263184, + "learning_rate": 7.694074460905592e-05, + "loss": 1.8131, + "step": 11056 + }, + { + "epoch": 3.3937998772252915, + "grad_norm": 0.38620972633361816, + "learning_rate": 7.693655715608883e-05, + "loss": 1.8346, + "step": 11057 + }, + { + "epoch": 3.394106813996317, + "grad_norm": 0.5005692839622498, + "learning_rate": 7.69323694369189e-05, + "loss": 1.9031, + "step": 11058 + }, + { + "epoch": 3.3944137507673418, + "grad_norm": 0.4321887791156769, + "learning_rate": 7.692818145158751e-05, + "loss": 1.8783, + "step": 11059 + }, + { + "epoch": 3.394720687538367, + "grad_norm": 0.269307017326355, + "learning_rate": 7.692399320013603e-05, + "loss": 1.8075, + "step": 11060 + }, + { + "epoch": 3.3950276243093924, + "grad_norm": 0.2945556342601776, + "learning_rate": 7.69198046826059e-05, + "loss": 1.8366, + "step": 11061 + }, + { + "epoch": 3.3953345610804173, + "grad_norm": 0.30531853437423706, + "learning_rate": 7.691561589903847e-05, + "loss": 1.7665, + "step": 11062 + }, + { + "epoch": 3.3956414978514426, + "grad_norm": 0.25105199217796326, + "learning_rate": 7.691142684947513e-05, + "loss": 1.782, + "step": 11063 + }, + { + "epoch": 3.3959484346224675, + "grad_norm": 0.3373202085494995, + "learning_rate": 7.69072375339573e-05, + "loss": 1.8148, + "step": 11064 + }, + { + "epoch": 3.396255371393493, + "grad_norm": 0.34207093715667725, + "learning_rate": 
7.690304795252638e-05, + "loss": 1.8287, + "step": 11065 + }, + { + "epoch": 3.396562308164518, + "grad_norm": 0.26281681656837463, + "learning_rate": 7.68988581052238e-05, + "loss": 1.8551, + "step": 11066 + }, + { + "epoch": 3.396869244935543, + "grad_norm": 0.3091152608394623, + "learning_rate": 7.689466799209091e-05, + "loss": 1.7689, + "step": 11067 + }, + { + "epoch": 3.3971761817065684, + "grad_norm": 0.37421298027038574, + "learning_rate": 7.689047761316914e-05, + "loss": 1.7908, + "step": 11068 + }, + { + "epoch": 3.3974831184775938, + "grad_norm": 0.3745511770248413, + "learning_rate": 7.688628696849993e-05, + "loss": 1.8408, + "step": 11069 + }, + { + "epoch": 3.3977900552486187, + "grad_norm": 0.3003663122653961, + "learning_rate": 7.688209605812467e-05, + "loss": 1.9109, + "step": 11070 + }, + { + "epoch": 3.398096992019644, + "grad_norm": 0.3437681496143341, + "learning_rate": 7.687790488208478e-05, + "loss": 1.811, + "step": 11071 + }, + { + "epoch": 3.3984039287906693, + "grad_norm": 0.3480641841888428, + "learning_rate": 7.687371344042168e-05, + "loss": 1.8114, + "step": 11072 + }, + { + "epoch": 3.398710865561694, + "grad_norm": 0.24670913815498352, + "learning_rate": 7.686952173317679e-05, + "loss": 1.7959, + "step": 11073 + }, + { + "epoch": 3.3990178023327196, + "grad_norm": 0.2939499020576477, + "learning_rate": 7.686532976039154e-05, + "loss": 1.7518, + "step": 11074 + }, + { + "epoch": 3.3993247391037444, + "grad_norm": 0.3332279622554779, + "learning_rate": 7.686113752210736e-05, + "loss": 1.843, + "step": 11075 + }, + { + "epoch": 3.3996316758747698, + "grad_norm": 0.22967280447483063, + "learning_rate": 7.685694501836566e-05, + "loss": 1.7408, + "step": 11076 + }, + { + "epoch": 3.399938612645795, + "grad_norm": 0.3443470001220703, + "learning_rate": 7.685275224920789e-05, + "loss": 1.8004, + "step": 11077 + }, + { + "epoch": 3.40024554941682, + "grad_norm": 0.3725457489490509, + "learning_rate": 7.684855921467548e-05, + "loss": 1.833, + 
"step": 11078 + }, + { + "epoch": 3.4005524861878453, + "grad_norm": 0.3178638219833374, + "learning_rate": 7.68443659148099e-05, + "loss": 1.8055, + "step": 11079 + }, + { + "epoch": 3.4008594229588702, + "grad_norm": 0.2609167695045471, + "learning_rate": 7.684017234965254e-05, + "loss": 1.7881, + "step": 11080 + }, + { + "epoch": 3.4011663597298956, + "grad_norm": 0.26975762844085693, + "learning_rate": 7.683597851924486e-05, + "loss": 1.8424, + "step": 11081 + }, + { + "epoch": 3.401473296500921, + "grad_norm": 0.266661673784256, + "learning_rate": 7.683178442362832e-05, + "loss": 1.7785, + "step": 11082 + }, + { + "epoch": 3.401780233271946, + "grad_norm": 0.27915671467781067, + "learning_rate": 7.682759006284436e-05, + "loss": 1.8241, + "step": 11083 + }, + { + "epoch": 3.402087170042971, + "grad_norm": 0.25167274475097656, + "learning_rate": 7.682339543693444e-05, + "loss": 1.7637, + "step": 11084 + }, + { + "epoch": 3.4023941068139965, + "grad_norm": 0.2439529299736023, + "learning_rate": 7.681920054593999e-05, + "loss": 1.7796, + "step": 11085 + }, + { + "epoch": 3.4027010435850213, + "grad_norm": 0.26224252581596375, + "learning_rate": 7.681500538990249e-05, + "loss": 1.8018, + "step": 11086 + }, + { + "epoch": 3.4030079803560467, + "grad_norm": 0.25093868374824524, + "learning_rate": 7.681080996886336e-05, + "loss": 1.7664, + "step": 11087 + }, + { + "epoch": 3.403314917127072, + "grad_norm": 0.26393210887908936, + "learning_rate": 7.680661428286413e-05, + "loss": 1.8389, + "step": 11088 + }, + { + "epoch": 3.403621853898097, + "grad_norm": 0.24750283360481262, + "learning_rate": 7.680241833194622e-05, + "loss": 1.8358, + "step": 11089 + }, + { + "epoch": 3.4039287906691222, + "grad_norm": 0.21568982303142548, + "learning_rate": 7.67982221161511e-05, + "loss": 1.7874, + "step": 11090 + }, + { + "epoch": 3.404235727440147, + "grad_norm": 0.24407126009464264, + "learning_rate": 7.679402563552023e-05, + "loss": 1.7753, + "step": 11091 + }, + { + "epoch": 
3.4045426642111725, + "grad_norm": 0.23288260400295258, + "learning_rate": 7.67898288900951e-05, + "loss": 1.8046, + "step": 11092 + }, + { + "epoch": 3.404849600982198, + "grad_norm": 0.2548544108867645, + "learning_rate": 7.678563187991718e-05, + "loss": 1.8778, + "step": 11093 + }, + { + "epoch": 3.4051565377532227, + "grad_norm": 0.24008090794086456, + "learning_rate": 7.678143460502796e-05, + "loss": 1.7912, + "step": 11094 + }, + { + "epoch": 3.405463474524248, + "grad_norm": 0.26085031032562256, + "learning_rate": 7.677723706546889e-05, + "loss": 1.849, + "step": 11095 + }, + { + "epoch": 3.4057704112952734, + "grad_norm": 0.2830932140350342, + "learning_rate": 7.677303926128147e-05, + "loss": 1.8265, + "step": 11096 + }, + { + "epoch": 3.4060773480662982, + "grad_norm": 0.27593597769737244, + "learning_rate": 7.676884119250718e-05, + "loss": 1.8555, + "step": 11097 + }, + { + "epoch": 3.4063842848373236, + "grad_norm": 0.2403372824192047, + "learning_rate": 7.676464285918751e-05, + "loss": 1.7243, + "step": 11098 + }, + { + "epoch": 3.406691221608349, + "grad_norm": 0.28830090165138245, + "learning_rate": 7.676044426136397e-05, + "loss": 1.8108, + "step": 11099 + }, + { + "epoch": 3.406998158379374, + "grad_norm": 0.2918153405189514, + "learning_rate": 7.675624539907802e-05, + "loss": 1.7875, + "step": 11100 + }, + { + "epoch": 3.407305095150399, + "grad_norm": 0.2609013020992279, + "learning_rate": 7.675204627237117e-05, + "loss": 1.778, + "step": 11101 + }, + { + "epoch": 3.407612031921424, + "grad_norm": 0.2714763283729553, + "learning_rate": 7.674784688128494e-05, + "loss": 1.8472, + "step": 11102 + }, + { + "epoch": 3.4079189686924494, + "grad_norm": 0.25857117772102356, + "learning_rate": 7.674364722586078e-05, + "loss": 1.7495, + "step": 11103 + }, + { + "epoch": 3.4082259054634747, + "grad_norm": 0.25485143065452576, + "learning_rate": 7.673944730614023e-05, + "loss": 1.7817, + "step": 11104 + }, + { + "epoch": 3.4085328422344996, + "grad_norm": 
0.2735857665538788, + "learning_rate": 7.67352471221648e-05, + "loss": 1.7522, + "step": 11105 + }, + { + "epoch": 3.408839779005525, + "grad_norm": 0.25079572200775146, + "learning_rate": 7.6731046673976e-05, + "loss": 1.765, + "step": 11106 + }, + { + "epoch": 3.40914671577655, + "grad_norm": 0.3080148696899414, + "learning_rate": 7.672684596161532e-05, + "loss": 1.8305, + "step": 11107 + }, + { + "epoch": 3.409453652547575, + "grad_norm": 0.23771968483924866, + "learning_rate": 7.672264498512427e-05, + "loss": 1.7837, + "step": 11108 + }, + { + "epoch": 3.4097605893186005, + "grad_norm": 0.29941999912261963, + "learning_rate": 7.671844374454437e-05, + "loss": 1.8013, + "step": 11109 + }, + { + "epoch": 3.4100675260896254, + "grad_norm": 0.27871644496917725, + "learning_rate": 7.671424223991717e-05, + "loss": 1.8598, + "step": 11110 + }, + { + "epoch": 3.4103744628606507, + "grad_norm": 0.2751443684101105, + "learning_rate": 7.671004047128416e-05, + "loss": 1.8341, + "step": 11111 + }, + { + "epoch": 3.410681399631676, + "grad_norm": 0.27227312326431274, + "learning_rate": 7.670583843868688e-05, + "loss": 1.81, + "step": 11112 + }, + { + "epoch": 3.410988336402701, + "grad_norm": 0.29617756605148315, + "learning_rate": 7.670163614216685e-05, + "loss": 1.8795, + "step": 11113 + }, + { + "epoch": 3.4112952731737263, + "grad_norm": 0.268920361995697, + "learning_rate": 7.669743358176563e-05, + "loss": 1.7659, + "step": 11114 + }, + { + "epoch": 3.4116022099447516, + "grad_norm": 0.2875109314918518, + "learning_rate": 7.669323075752467e-05, + "loss": 1.8263, + "step": 11115 + }, + { + "epoch": 3.4119091467157765, + "grad_norm": 0.34703585505485535, + "learning_rate": 7.668902766948558e-05, + "loss": 1.7622, + "step": 11116 + }, + { + "epoch": 3.412216083486802, + "grad_norm": 0.3090265393257141, + "learning_rate": 7.668482431768989e-05, + "loss": 1.7381, + "step": 11117 + }, + { + "epoch": 3.4125230202578267, + "grad_norm": 0.2619737684726715, + "learning_rate": 
7.668062070217911e-05, + "loss": 1.8004, + "step": 11118 + }, + { + "epoch": 3.412829957028852, + "grad_norm": 0.289815217256546, + "learning_rate": 7.667641682299482e-05, + "loss": 1.7946, + "step": 11119 + }, + { + "epoch": 3.4131368937998774, + "grad_norm": 0.28732073307037354, + "learning_rate": 7.667221268017852e-05, + "loss": 1.8746, + "step": 11120 + }, + { + "epoch": 3.4134438305709023, + "grad_norm": 0.23232576251029968, + "learning_rate": 7.666800827377178e-05, + "loss": 1.7403, + "step": 11121 + }, + { + "epoch": 3.4137507673419276, + "grad_norm": 0.22903507947921753, + "learning_rate": 7.666380360381616e-05, + "loss": 1.7785, + "step": 11122 + }, + { + "epoch": 3.4140577041129525, + "grad_norm": 0.25023025274276733, + "learning_rate": 7.665959867035321e-05, + "loss": 1.7881, + "step": 11123 + }, + { + "epoch": 3.414364640883978, + "grad_norm": 0.2199166864156723, + "learning_rate": 7.665539347342449e-05, + "loss": 1.7522, + "step": 11124 + }, + { + "epoch": 3.414671577655003, + "grad_norm": 0.2539862394332886, + "learning_rate": 7.665118801307152e-05, + "loss": 1.7964, + "step": 11125 + }, + { + "epoch": 3.414978514426028, + "grad_norm": 0.22670161724090576, + "learning_rate": 7.664698228933591e-05, + "loss": 1.7071, + "step": 11126 + }, + { + "epoch": 3.4152854511970534, + "grad_norm": 0.24827396869659424, + "learning_rate": 7.664277630225919e-05, + "loss": 1.7897, + "step": 11127 + }, + { + "epoch": 3.4155923879680787, + "grad_norm": 0.29391366243362427, + "learning_rate": 7.663857005188296e-05, + "loss": 1.7967, + "step": 11128 + }, + { + "epoch": 3.4158993247391036, + "grad_norm": 0.3201812505722046, + "learning_rate": 7.663436353824874e-05, + "loss": 1.7681, + "step": 11129 + }, + { + "epoch": 3.416206261510129, + "grad_norm": 0.2274552583694458, + "learning_rate": 7.663015676139814e-05, + "loss": 1.7535, + "step": 11130 + }, + { + "epoch": 3.4165131982811543, + "grad_norm": 0.3955044150352478, + "learning_rate": 7.662594972137273e-05, + "loss": 
1.8175, + "step": 11131 + }, + { + "epoch": 3.416820135052179, + "grad_norm": 0.46493569016456604, + "learning_rate": 7.662174241821406e-05, + "loss": 1.7806, + "step": 11132 + }, + { + "epoch": 3.4171270718232045, + "grad_norm": 0.37731611728668213, + "learning_rate": 7.661753485196375e-05, + "loss": 1.7555, + "step": 11133 + }, + { + "epoch": 3.4174340085942294, + "grad_norm": 0.23983556032180786, + "learning_rate": 7.661332702266334e-05, + "loss": 1.7662, + "step": 11134 + }, + { + "epoch": 3.4177409453652547, + "grad_norm": 0.34964314103126526, + "learning_rate": 7.660911893035445e-05, + "loss": 1.7786, + "step": 11135 + }, + { + "epoch": 3.41804788213628, + "grad_norm": 0.44820764660835266, + "learning_rate": 7.660491057507864e-05, + "loss": 1.778, + "step": 11136 + }, + { + "epoch": 3.418354818907305, + "grad_norm": 0.32936233282089233, + "learning_rate": 7.660070195687752e-05, + "loss": 1.8181, + "step": 11137 + }, + { + "epoch": 3.4186617556783303, + "grad_norm": 0.2874850332736969, + "learning_rate": 7.659649307579266e-05, + "loss": 1.8733, + "step": 11138 + }, + { + "epoch": 3.418968692449355, + "grad_norm": 0.46269866824150085, + "learning_rate": 7.659228393186566e-05, + "loss": 1.8566, + "step": 11139 + }, + { + "epoch": 3.4192756292203805, + "grad_norm": 0.5873839855194092, + "learning_rate": 7.658807452513816e-05, + "loss": 1.8317, + "step": 11140 + }, + { + "epoch": 3.419582565991406, + "grad_norm": 0.43150341510772705, + "learning_rate": 7.65838648556517e-05, + "loss": 1.7702, + "step": 11141 + }, + { + "epoch": 3.4198895027624308, + "grad_norm": 0.2803891599178314, + "learning_rate": 7.65796549234479e-05, + "loss": 1.8043, + "step": 11142 + }, + { + "epoch": 3.420196439533456, + "grad_norm": 0.37295013666152954, + "learning_rate": 7.657544472856838e-05, + "loss": 1.7923, + "step": 11143 + }, + { + "epoch": 3.4205033763044814, + "grad_norm": 0.3922573924064636, + "learning_rate": 7.657123427105473e-05, + "loss": 1.8231, + "step": 11144 + }, + { + 
"epoch": 3.4208103130755063, + "grad_norm": 0.27254152297973633, + "learning_rate": 7.656702355094859e-05, + "loss": 1.8168, + "step": 11145 + }, + { + "epoch": 3.4211172498465316, + "grad_norm": 0.28005337715148926, + "learning_rate": 7.656281256829152e-05, + "loss": 1.8047, + "step": 11146 + }, + { + "epoch": 3.421424186617557, + "grad_norm": 0.4369073808193207, + "learning_rate": 7.655860132312519e-05, + "loss": 1.7243, + "step": 11147 + }, + { + "epoch": 3.421731123388582, + "grad_norm": 0.4127553701400757, + "learning_rate": 7.655438981549119e-05, + "loss": 1.8148, + "step": 11148 + }, + { + "epoch": 3.422038060159607, + "grad_norm": 0.3131798207759857, + "learning_rate": 7.655017804543114e-05, + "loss": 1.789, + "step": 11149 + }, + { + "epoch": 3.422344996930632, + "grad_norm": 0.2947194576263428, + "learning_rate": 7.654596601298666e-05, + "loss": 1.8221, + "step": 11150 + }, + { + "epoch": 3.4226519337016574, + "grad_norm": 0.3072497546672821, + "learning_rate": 7.654175371819941e-05, + "loss": 1.7747, + "step": 11151 + }, + { + "epoch": 3.4229588704726828, + "grad_norm": 0.29408320784568787, + "learning_rate": 7.653754116111099e-05, + "loss": 1.9009, + "step": 11152 + }, + { + "epoch": 3.4232658072437077, + "grad_norm": 0.2629215717315674, + "learning_rate": 7.653332834176303e-05, + "loss": 1.7354, + "step": 11153 + }, + { + "epoch": 3.423572744014733, + "grad_norm": 0.2850257456302643, + "learning_rate": 7.652911526019716e-05, + "loss": 1.8422, + "step": 11154 + }, + { + "epoch": 3.423879680785758, + "grad_norm": 0.29787111282348633, + "learning_rate": 7.652490191645503e-05, + "loss": 1.8122, + "step": 11155 + }, + { + "epoch": 3.424186617556783, + "grad_norm": 0.2670947015285492, + "learning_rate": 7.652068831057826e-05, + "loss": 1.7734, + "step": 11156 + }, + { + "epoch": 3.4244935543278086, + "grad_norm": 0.26415133476257324, + "learning_rate": 7.651647444260853e-05, + "loss": 1.7661, + "step": 11157 + }, + { + "epoch": 3.424800491098834, + 
"grad_norm": 0.2614886164665222, + "learning_rate": 7.651226031258745e-05, + "loss": 1.6918, + "step": 11158 + }, + { + "epoch": 3.425107427869859, + "grad_norm": 0.28485649824142456, + "learning_rate": 7.650804592055667e-05, + "loss": 1.7771, + "step": 11159 + }, + { + "epoch": 3.425414364640884, + "grad_norm": 0.26080289483070374, + "learning_rate": 7.650383126655784e-05, + "loss": 1.7637, + "step": 11160 + }, + { + "epoch": 3.425721301411909, + "grad_norm": 0.2503695487976074, + "learning_rate": 7.649961635063261e-05, + "loss": 1.7864, + "step": 11161 + }, + { + "epoch": 3.4260282381829343, + "grad_norm": 0.3165570795536041, + "learning_rate": 7.649540117282263e-05, + "loss": 1.8107, + "step": 11162 + }, + { + "epoch": 3.4263351749539597, + "grad_norm": 0.28411731123924255, + "learning_rate": 7.649118573316959e-05, + "loss": 1.7557, + "step": 11163 + }, + { + "epoch": 3.4266421117249846, + "grad_norm": 0.24469570815563202, + "learning_rate": 7.648697003171512e-05, + "loss": 1.7597, + "step": 11164 + }, + { + "epoch": 3.42694904849601, + "grad_norm": 0.31968292593955994, + "learning_rate": 7.648275406850087e-05, + "loss": 1.7796, + "step": 11165 + }, + { + "epoch": 3.427255985267035, + "grad_norm": 0.24520765244960785, + "learning_rate": 7.647853784356856e-05, + "loss": 1.7931, + "step": 11166 + }, + { + "epoch": 3.42756292203806, + "grad_norm": 0.23946821689605713, + "learning_rate": 7.647432135695977e-05, + "loss": 1.7143, + "step": 11167 + }, + { + "epoch": 3.4278698588090855, + "grad_norm": 0.321455180644989, + "learning_rate": 7.647010460871624e-05, + "loss": 1.8682, + "step": 11168 + }, + { + "epoch": 3.4281767955801103, + "grad_norm": 0.2803197503089905, + "learning_rate": 7.646588759887964e-05, + "loss": 1.8, + "step": 11169 + }, + { + "epoch": 3.4284837323511357, + "grad_norm": 0.2597559988498688, + "learning_rate": 7.64616703274916e-05, + "loss": 1.8027, + "step": 11170 + }, + { + "epoch": 3.428790669122161, + "grad_norm": 0.25055503845214844, + 
"learning_rate": 7.645745279459384e-05, + "loss": 1.7659, + "step": 11171 + }, + { + "epoch": 3.429097605893186, + "grad_norm": 0.34582629799842834, + "learning_rate": 7.645323500022803e-05, + "loss": 1.7868, + "step": 11172 + }, + { + "epoch": 3.4294045426642112, + "grad_norm": 0.32845041155815125, + "learning_rate": 7.644901694443584e-05, + "loss": 1.8247, + "step": 11173 + }, + { + "epoch": 3.4297114794352366, + "grad_norm": 0.2570398449897766, + "learning_rate": 7.644479862725896e-05, + "loss": 1.7802, + "step": 11174 + }, + { + "epoch": 3.4300184162062615, + "grad_norm": 0.23117294907569885, + "learning_rate": 7.644058004873908e-05, + "loss": 1.7575, + "step": 11175 + }, + { + "epoch": 3.430325352977287, + "grad_norm": 0.2417830377817154, + "learning_rate": 7.64363612089179e-05, + "loss": 1.7954, + "step": 11176 + }, + { + "epoch": 3.4306322897483117, + "grad_norm": 0.249378964304924, + "learning_rate": 7.643214210783708e-05, + "loss": 1.8161, + "step": 11177 + }, + { + "epoch": 3.430939226519337, + "grad_norm": 0.24494746327400208, + "learning_rate": 7.642792274553836e-05, + "loss": 1.825, + "step": 11178 + }, + { + "epoch": 3.4312461632903624, + "grad_norm": 0.2663760185241699, + "learning_rate": 7.642370312206342e-05, + "loss": 1.7589, + "step": 11179 + }, + { + "epoch": 3.4315531000613873, + "grad_norm": 0.2819322645664215, + "learning_rate": 7.641948323745395e-05, + "loss": 1.8097, + "step": 11180 + }, + { + "epoch": 3.4318600368324126, + "grad_norm": 0.26917630434036255, + "learning_rate": 7.641526309175166e-05, + "loss": 1.7934, + "step": 11181 + }, + { + "epoch": 3.4321669736034375, + "grad_norm": 0.31618112325668335, + "learning_rate": 7.641104268499826e-05, + "loss": 1.8522, + "step": 11182 + }, + { + "epoch": 3.432473910374463, + "grad_norm": 0.29209139943122864, + "learning_rate": 7.640682201723546e-05, + "loss": 1.7499, + "step": 11183 + }, + { + "epoch": 3.432780847145488, + "grad_norm": 0.24831914901733398, + "learning_rate": 
7.640260108850496e-05, + "loss": 1.7897, + "step": 11184 + }, + { + "epoch": 3.433087783916513, + "grad_norm": 0.2459818720817566, + "learning_rate": 7.639837989884849e-05, + "loss": 1.7604, + "step": 11185 + }, + { + "epoch": 3.4333947206875384, + "grad_norm": 0.27157485485076904, + "learning_rate": 7.639415844830774e-05, + "loss": 1.7776, + "step": 11186 + }, + { + "epoch": 3.4337016574585637, + "grad_norm": 0.3021515905857086, + "learning_rate": 7.638993673692445e-05, + "loss": 1.7771, + "step": 11187 + }, + { + "epoch": 3.4340085942295886, + "grad_norm": 0.2591722309589386, + "learning_rate": 7.638571476474036e-05, + "loss": 1.8333, + "step": 11188 + }, + { + "epoch": 3.434315531000614, + "grad_norm": 0.2255258709192276, + "learning_rate": 7.638149253179717e-05, + "loss": 1.7647, + "step": 11189 + }, + { + "epoch": 3.4346224677716393, + "grad_norm": 0.2585793733596802, + "learning_rate": 7.637727003813658e-05, + "loss": 1.786, + "step": 11190 + }, + { + "epoch": 3.434929404542664, + "grad_norm": 0.23649543523788452, + "learning_rate": 7.637304728380036e-05, + "loss": 1.822, + "step": 11191 + }, + { + "epoch": 3.4352363413136895, + "grad_norm": 0.2610832452774048, + "learning_rate": 7.636882426883023e-05, + "loss": 1.7925, + "step": 11192 + }, + { + "epoch": 3.4355432780847144, + "grad_norm": 0.26230642199516296, + "learning_rate": 7.636460099326793e-05, + "loss": 1.8169, + "step": 11193 + }, + { + "epoch": 3.4358502148557397, + "grad_norm": 0.2800561189651489, + "learning_rate": 7.636037745715518e-05, + "loss": 1.845, + "step": 11194 + }, + { + "epoch": 3.436157151626765, + "grad_norm": 0.27790409326553345, + "learning_rate": 7.635615366053372e-05, + "loss": 1.8141, + "step": 11195 + }, + { + "epoch": 3.43646408839779, + "grad_norm": 0.2894865870475769, + "learning_rate": 7.635192960344533e-05, + "loss": 1.7916, + "step": 11196 + }, + { + "epoch": 3.4367710251688153, + "grad_norm": 0.22310738265514374, + "learning_rate": 7.634770528593171e-05, + "loss": 1.79, + 
"step": 11197 + }, + { + "epoch": 3.43707796193984, + "grad_norm": 0.2837755084037781, + "learning_rate": 7.634348070803463e-05, + "loss": 1.8763, + "step": 11198 + }, + { + "epoch": 3.4373848987108655, + "grad_norm": 0.32488104701042175, + "learning_rate": 7.633925586979583e-05, + "loss": 1.8331, + "step": 11199 + }, + { + "epoch": 3.437691835481891, + "grad_norm": 0.2708779573440552, + "learning_rate": 7.633503077125706e-05, + "loss": 1.761, + "step": 11200 + }, + { + "epoch": 3.4379987722529157, + "grad_norm": 0.23929642140865326, + "learning_rate": 7.633080541246008e-05, + "loss": 1.8217, + "step": 11201 + }, + { + "epoch": 3.438305709023941, + "grad_norm": 0.3213331997394562, + "learning_rate": 7.632657979344667e-05, + "loss": 1.8375, + "step": 11202 + }, + { + "epoch": 3.4386126457949664, + "grad_norm": 0.38420629501342773, + "learning_rate": 7.632235391425854e-05, + "loss": 1.765, + "step": 11203 + }, + { + "epoch": 3.4389195825659913, + "grad_norm": 0.40466073155403137, + "learning_rate": 7.631812777493749e-05, + "loss": 1.8262, + "step": 11204 + }, + { + "epoch": 3.4392265193370166, + "grad_norm": 0.35904639959335327, + "learning_rate": 7.631390137552527e-05, + "loss": 1.894, + "step": 11205 + }, + { + "epoch": 3.439533456108042, + "grad_norm": 0.28880515694618225, + "learning_rate": 7.630967471606368e-05, + "loss": 1.87, + "step": 11206 + }, + { + "epoch": 3.439840392879067, + "grad_norm": 0.2878882884979248, + "learning_rate": 7.630544779659444e-05, + "loss": 1.7841, + "step": 11207 + }, + { + "epoch": 3.440147329650092, + "grad_norm": 0.36002418398857117, + "learning_rate": 7.630122061715935e-05, + "loss": 1.7318, + "step": 11208 + }, + { + "epoch": 3.440454266421117, + "grad_norm": 0.3304644227027893, + "learning_rate": 7.629699317780019e-05, + "loss": 1.8581, + "step": 11209 + }, + { + "epoch": 3.4407612031921424, + "grad_norm": 0.23396331071853638, + "learning_rate": 7.629276547855872e-05, + "loss": 1.7897, + "step": 11210 + }, + { + "epoch": 
3.4410681399631677, + "grad_norm": 0.34914183616638184, + "learning_rate": 7.628853751947674e-05, + "loss": 1.8531, + "step": 11211 + }, + { + "epoch": 3.4413750767341926, + "grad_norm": 0.3700502812862396, + "learning_rate": 7.6284309300596e-05, + "loss": 1.7884, + "step": 11212 + }, + { + "epoch": 3.441682013505218, + "grad_norm": 0.24606801569461823, + "learning_rate": 7.628008082195835e-05, + "loss": 1.7292, + "step": 11213 + }, + { + "epoch": 3.441988950276243, + "grad_norm": 0.26344993710517883, + "learning_rate": 7.627585208360551e-05, + "loss": 1.7832, + "step": 11214 + }, + { + "epoch": 3.442295887047268, + "grad_norm": 0.4034743010997772, + "learning_rate": 7.62716230855793e-05, + "loss": 1.8164, + "step": 11215 + }, + { + "epoch": 3.4426028238182935, + "grad_norm": 0.4508039355278015, + "learning_rate": 7.626739382792152e-05, + "loss": 1.7855, + "step": 11216 + }, + { + "epoch": 3.4429097605893184, + "grad_norm": 0.2963111400604248, + "learning_rate": 7.626316431067395e-05, + "loss": 1.7995, + "step": 11217 + }, + { + "epoch": 3.4432166973603437, + "grad_norm": 0.35248515009880066, + "learning_rate": 7.625893453387841e-05, + "loss": 1.8761, + "step": 11218 + }, + { + "epoch": 3.443523634131369, + "grad_norm": 0.4032224416732788, + "learning_rate": 7.625470449757668e-05, + "loss": 1.7746, + "step": 11219 + }, + { + "epoch": 3.443830570902394, + "grad_norm": 0.3505195081233978, + "learning_rate": 7.625047420181057e-05, + "loss": 1.851, + "step": 11220 + }, + { + "epoch": 3.4441375076734193, + "grad_norm": 0.288968563079834, + "learning_rate": 7.62462436466219e-05, + "loss": 1.8055, + "step": 11221 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.43141910433769226, + "learning_rate": 7.624201283205246e-05, + "loss": 1.816, + "step": 11222 + }, + { + "epoch": 3.4447513812154695, + "grad_norm": 0.46902137994766235, + "learning_rate": 7.623778175814407e-05, + "loss": 1.8478, + "step": 11223 + }, + { + "epoch": 3.445058317986495, + "grad_norm": 
0.3333328366279602, + "learning_rate": 7.623355042493854e-05, + "loss": 1.7949, + "step": 11224 + }, + { + "epoch": 3.4453652547575198, + "grad_norm": 0.2625340521335602, + "learning_rate": 7.622931883247768e-05, + "loss": 1.745, + "step": 11225 + }, + { + "epoch": 3.445672191528545, + "grad_norm": 0.4565848410129547, + "learning_rate": 7.622508698080333e-05, + "loss": 1.796, + "step": 11226 + }, + { + "epoch": 3.4459791282995704, + "grad_norm": 0.4676518738269806, + "learning_rate": 7.622085486995729e-05, + "loss": 1.8115, + "step": 11227 + }, + { + "epoch": 3.4462860650705953, + "grad_norm": 0.3828938603401184, + "learning_rate": 7.62166224999814e-05, + "loss": 1.8758, + "step": 11228 + }, + { + "epoch": 3.4465930018416207, + "grad_norm": 0.2786383628845215, + "learning_rate": 7.621238987091747e-05, + "loss": 1.7616, + "step": 11229 + }, + { + "epoch": 3.446899938612646, + "grad_norm": 0.4442835748195648, + "learning_rate": 7.620815698280734e-05, + "loss": 1.8342, + "step": 11230 + }, + { + "epoch": 3.447206875383671, + "grad_norm": 0.45760586857795715, + "learning_rate": 7.620392383569286e-05, + "loss": 1.8159, + "step": 11231 + }, + { + "epoch": 3.447513812154696, + "grad_norm": 0.2567009925842285, + "learning_rate": 7.619969042961583e-05, + "loss": 1.774, + "step": 11232 + }, + { + "epoch": 3.4478207489257215, + "grad_norm": 0.3720102310180664, + "learning_rate": 7.619545676461812e-05, + "loss": 1.8366, + "step": 11233 + }, + { + "epoch": 3.4481276856967464, + "grad_norm": 0.36436137557029724, + "learning_rate": 7.619122284074154e-05, + "loss": 1.832, + "step": 11234 + }, + { + "epoch": 3.4484346224677718, + "grad_norm": 0.310310959815979, + "learning_rate": 7.618698865802795e-05, + "loss": 1.9023, + "step": 11235 + }, + { + "epoch": 3.4487415592387967, + "grad_norm": 0.2693026661872864, + "learning_rate": 7.618275421651916e-05, + "loss": 1.7696, + "step": 11236 + }, + { + "epoch": 3.449048496009822, + "grad_norm": 0.2942425608634949, + "learning_rate": 
7.61785195162571e-05, + "loss": 1.822, + "step": 11237 + }, + { + "epoch": 3.4493554327808473, + "grad_norm": 0.22454749047756195, + "learning_rate": 7.617428455728353e-05, + "loss": 1.7011, + "step": 11238 + }, + { + "epoch": 3.449662369551872, + "grad_norm": 0.23345038294792175, + "learning_rate": 7.617004933964035e-05, + "loss": 1.7563, + "step": 11239 + }, + { + "epoch": 3.4499693063228976, + "grad_norm": 0.24990662932395935, + "learning_rate": 7.616581386336941e-05, + "loss": 1.8031, + "step": 11240 + }, + { + "epoch": 3.4502762430939224, + "grad_norm": 0.2919348478317261, + "learning_rate": 7.616157812851254e-05, + "loss": 1.7355, + "step": 11241 + }, + { + "epoch": 3.450583179864948, + "grad_norm": 0.2926909327507019, + "learning_rate": 7.615734213511165e-05, + "loss": 1.8341, + "step": 11242 + }, + { + "epoch": 3.450890116635973, + "grad_norm": 0.24316683411598206, + "learning_rate": 7.615310588320855e-05, + "loss": 1.8154, + "step": 11243 + }, + { + "epoch": 3.451197053406998, + "grad_norm": 0.23154498636722565, + "learning_rate": 7.614886937284513e-05, + "loss": 1.7904, + "step": 11244 + }, + { + "epoch": 3.4515039901780233, + "grad_norm": 0.25973939895629883, + "learning_rate": 7.614463260406327e-05, + "loss": 1.7598, + "step": 11245 + }, + { + "epoch": 3.4518109269490487, + "grad_norm": 0.22110119462013245, + "learning_rate": 7.614039557690482e-05, + "loss": 1.7903, + "step": 11246 + }, + { + "epoch": 3.4521178637200736, + "grad_norm": 0.26184993982315063, + "learning_rate": 7.613615829141165e-05, + "loss": 1.748, + "step": 11247 + }, + { + "epoch": 3.452424800491099, + "grad_norm": 0.26128727197647095, + "learning_rate": 7.613192074762565e-05, + "loss": 1.7786, + "step": 11248 + }, + { + "epoch": 3.4527317372621242, + "grad_norm": 0.23230813443660736, + "learning_rate": 7.612768294558871e-05, + "loss": 1.8114, + "step": 11249 + }, + { + "epoch": 3.453038674033149, + "grad_norm": 0.2686540186405182, + "learning_rate": 7.612344488534268e-05, + "loss": 
1.7311, + "step": 11250 + }, + { + "epoch": 3.4533456108041745, + "grad_norm": 0.25553348660469055, + "learning_rate": 7.611920656692946e-05, + "loss": 1.8468, + "step": 11251 + }, + { + "epoch": 3.4536525475751993, + "grad_norm": 0.2639308273792267, + "learning_rate": 7.611496799039092e-05, + "loss": 1.8292, + "step": 11252 + }, + { + "epoch": 3.4539594843462247, + "grad_norm": 0.2468358874320984, + "learning_rate": 7.611072915576895e-05, + "loss": 1.8173, + "step": 11253 + }, + { + "epoch": 3.45426642111725, + "grad_norm": 0.27236035466194153, + "learning_rate": 7.610649006310549e-05, + "loss": 1.8082, + "step": 11254 + }, + { + "epoch": 3.454573357888275, + "grad_norm": 0.2277914434671402, + "learning_rate": 7.610225071244237e-05, + "loss": 1.7483, + "step": 11255 + }, + { + "epoch": 3.4548802946593002, + "grad_norm": 0.2292868196964264, + "learning_rate": 7.60980111038215e-05, + "loss": 1.7716, + "step": 11256 + }, + { + "epoch": 3.455187231430325, + "grad_norm": 0.22116152942180634, + "learning_rate": 7.60937712372848e-05, + "loss": 1.773, + "step": 11257 + }, + { + "epoch": 3.4554941682013505, + "grad_norm": 0.23238304257392883, + "learning_rate": 7.608953111287416e-05, + "loss": 1.7602, + "step": 11258 + }, + { + "epoch": 3.455801104972376, + "grad_norm": 0.2810615003108978, + "learning_rate": 7.608529073063149e-05, + "loss": 1.8781, + "step": 11259 + }, + { + "epoch": 3.4561080417434007, + "grad_norm": 0.2516821324825287, + "learning_rate": 7.608105009059867e-05, + "loss": 1.835, + "step": 11260 + }, + { + "epoch": 3.456414978514426, + "grad_norm": 0.25698330998420715, + "learning_rate": 7.607680919281763e-05, + "loss": 1.7859, + "step": 11261 + }, + { + "epoch": 3.4567219152854514, + "grad_norm": 0.2597602903842926, + "learning_rate": 7.60725680373303e-05, + "loss": 1.8287, + "step": 11262 + }, + { + "epoch": 3.4570288520564763, + "grad_norm": 0.2564091980457306, + "learning_rate": 7.606832662417855e-05, + "loss": 1.8003, + "step": 11263 + }, + { + 
"epoch": 3.4573357888275016, + "grad_norm": 0.2872684597969055, + "learning_rate": 7.606408495340432e-05, + "loss": 1.8242, + "step": 11264 + }, + { + "epoch": 3.457642725598527, + "grad_norm": 0.27513590455055237, + "learning_rate": 7.605984302504952e-05, + "loss": 1.8605, + "step": 11265 + }, + { + "epoch": 3.457949662369552, + "grad_norm": 0.27768459916114807, + "learning_rate": 7.605560083915609e-05, + "loss": 1.7948, + "step": 11266 + }, + { + "epoch": 3.458256599140577, + "grad_norm": 0.23911382257938385, + "learning_rate": 7.605135839576593e-05, + "loss": 1.7575, + "step": 11267 + }, + { + "epoch": 3.458563535911602, + "grad_norm": 0.26773568987846375, + "learning_rate": 7.604711569492098e-05, + "loss": 1.752, + "step": 11268 + }, + { + "epoch": 3.4588704726826274, + "grad_norm": 0.30079394578933716, + "learning_rate": 7.604287273666316e-05, + "loss": 1.8022, + "step": 11269 + }, + { + "epoch": 3.4591774094536527, + "grad_norm": 0.27393853664398193, + "learning_rate": 7.603862952103441e-05, + "loss": 1.8054, + "step": 11270 + }, + { + "epoch": 3.4594843462246776, + "grad_norm": 0.2794870436191559, + "learning_rate": 7.603438604807667e-05, + "loss": 1.808, + "step": 11271 + }, + { + "epoch": 3.459791282995703, + "grad_norm": 0.26482146978378296, + "learning_rate": 7.603014231783185e-05, + "loss": 1.8696, + "step": 11272 + }, + { + "epoch": 3.460098219766728, + "grad_norm": 0.2755354344844818, + "learning_rate": 7.602589833034192e-05, + "loss": 1.8412, + "step": 11273 + }, + { + "epoch": 3.460405156537753, + "grad_norm": 0.2666642367839813, + "learning_rate": 7.602165408564883e-05, + "loss": 1.8333, + "step": 11274 + }, + { + "epoch": 3.4607120933087785, + "grad_norm": 0.26958519220352173, + "learning_rate": 7.601740958379448e-05, + "loss": 1.7943, + "step": 11275 + }, + { + "epoch": 3.4610190300798034, + "grad_norm": 0.2915789783000946, + "learning_rate": 7.601316482482084e-05, + "loss": 1.7519, + "step": 11276 + }, + { + "epoch": 3.4613259668508287, + 
"grad_norm": 0.2456950694322586, + "learning_rate": 7.600891980876985e-05, + "loss": 1.8064, + "step": 11277 + }, + { + "epoch": 3.461632903621854, + "grad_norm": 0.2517867088317871, + "learning_rate": 7.600467453568348e-05, + "loss": 1.7766, + "step": 11278 + }, + { + "epoch": 3.461939840392879, + "grad_norm": 0.24567969143390656, + "learning_rate": 7.600042900560368e-05, + "loss": 1.7331, + "step": 11279 + }, + { + "epoch": 3.4622467771639043, + "grad_norm": 0.23986820876598358, + "learning_rate": 7.599618321857239e-05, + "loss": 1.7477, + "step": 11280 + }, + { + "epoch": 3.4625537139349296, + "grad_norm": 0.2555375397205353, + "learning_rate": 7.599193717463158e-05, + "loss": 1.8154, + "step": 11281 + }, + { + "epoch": 3.4628606507059545, + "grad_norm": 0.2522781193256378, + "learning_rate": 7.598769087382323e-05, + "loss": 1.7821, + "step": 11282 + }, + { + "epoch": 3.46316758747698, + "grad_norm": 0.25631004571914673, + "learning_rate": 7.598344431618926e-05, + "loss": 1.8043, + "step": 11283 + }, + { + "epoch": 3.4634745242480047, + "grad_norm": 0.2611328661441803, + "learning_rate": 7.597919750177168e-05, + "loss": 1.8036, + "step": 11284 + }, + { + "epoch": 3.46378146101903, + "grad_norm": 0.255670428276062, + "learning_rate": 7.597495043061244e-05, + "loss": 1.7375, + "step": 11285 + }, + { + "epoch": 3.4640883977900554, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.597070310275353e-05, + "loss": 1.7496, + "step": 11286 + }, + { + "epoch": 3.4643953345610803, + "grad_norm": 0.2643752992153168, + "learning_rate": 7.596645551823688e-05, + "loss": 1.8444, + "step": 11287 + }, + { + "epoch": 3.4647022713321056, + "grad_norm": 0.2564511299133301, + "learning_rate": 7.596220767710452e-05, + "loss": 1.7557, + "step": 11288 + }, + { + "epoch": 3.4650092081031305, + "grad_norm": 0.2510208487510681, + "learning_rate": 7.59579595793984e-05, + "loss": 1.7234, + "step": 11289 + }, + { + "epoch": 3.465316144874156, + "grad_norm": 0.2765158712863922, + 
"learning_rate": 7.595371122516051e-05, + "loss": 1.8215, + "step": 11290 + }, + { + "epoch": 3.465623081645181, + "grad_norm": 0.28233039379119873, + "learning_rate": 7.594946261443286e-05, + "loss": 1.7752, + "step": 11291 + }, + { + "epoch": 3.465930018416206, + "grad_norm": 0.26971468329429626, + "learning_rate": 7.594521374725735e-05, + "loss": 1.7924, + "step": 11292 + }, + { + "epoch": 3.4662369551872314, + "grad_norm": 0.29425930976867676, + "learning_rate": 7.594096462367608e-05, + "loss": 1.8144, + "step": 11293 + }, + { + "epoch": 3.4665438919582567, + "grad_norm": 0.233150452375412, + "learning_rate": 7.593671524373098e-05, + "loss": 1.7741, + "step": 11294 + }, + { + "epoch": 3.4668508287292816, + "grad_norm": 0.2947762608528137, + "learning_rate": 7.593246560746406e-05, + "loss": 1.8031, + "step": 11295 + }, + { + "epoch": 3.467157765500307, + "grad_norm": 0.250552773475647, + "learning_rate": 7.59282157149173e-05, + "loss": 1.7501, + "step": 11296 + }, + { + "epoch": 3.4674647022713323, + "grad_norm": 0.26091331243515015, + "learning_rate": 7.592396556613274e-05, + "loss": 1.836, + "step": 11297 + }, + { + "epoch": 3.467771639042357, + "grad_norm": 0.28625619411468506, + "learning_rate": 7.591971516115233e-05, + "loss": 1.7555, + "step": 11298 + }, + { + "epoch": 3.4680785758133825, + "grad_norm": 0.2723398804664612, + "learning_rate": 7.591546450001811e-05, + "loss": 1.825, + "step": 11299 + }, + { + "epoch": 3.4683855125844074, + "grad_norm": 0.24289946258068085, + "learning_rate": 7.591121358277211e-05, + "loss": 1.7441, + "step": 11300 + }, + { + "epoch": 3.4686924493554327, + "grad_norm": 0.2706952691078186, + "learning_rate": 7.590696240945629e-05, + "loss": 1.8651, + "step": 11301 + }, + { + "epoch": 3.468999386126458, + "grad_norm": 0.24632862210273743, + "learning_rate": 7.590271098011268e-05, + "loss": 1.8229, + "step": 11302 + }, + { + "epoch": 3.469306322897483, + "grad_norm": 0.29275211691856384, + "learning_rate": 7.58984592947833e-05, 
+ "loss": 1.7591, + "step": 11303 + }, + { + "epoch": 3.4696132596685083, + "grad_norm": 0.29228144884109497, + "learning_rate": 7.589420735351016e-05, + "loss": 1.8395, + "step": 11304 + }, + { + "epoch": 3.4699201964395336, + "grad_norm": 0.28339114785194397, + "learning_rate": 7.588995515633528e-05, + "loss": 1.8543, + "step": 11305 + }, + { + "epoch": 3.4702271332105585, + "grad_norm": 0.2834693193435669, + "learning_rate": 7.588570270330071e-05, + "loss": 1.826, + "step": 11306 + }, + { + "epoch": 3.470534069981584, + "grad_norm": 0.26130759716033936, + "learning_rate": 7.588144999444844e-05, + "loss": 1.7887, + "step": 11307 + }, + { + "epoch": 3.470841006752609, + "grad_norm": 0.29554685950279236, + "learning_rate": 7.587719702982052e-05, + "loss": 1.819, + "step": 11308 + }, + { + "epoch": 3.471147943523634, + "grad_norm": 0.2687968611717224, + "learning_rate": 7.587294380945898e-05, + "loss": 1.7354, + "step": 11309 + }, + { + "epoch": 3.4714548802946594, + "grad_norm": 0.28795287013053894, + "learning_rate": 7.586869033340582e-05, + "loss": 1.8267, + "step": 11310 + }, + { + "epoch": 3.4717618170656843, + "grad_norm": 0.33244553208351135, + "learning_rate": 7.58644366017031e-05, + "loss": 1.86, + "step": 11311 + }, + { + "epoch": 3.4720687538367097, + "grad_norm": 0.2878025472164154, + "learning_rate": 7.586018261439288e-05, + "loss": 1.7587, + "step": 11312 + }, + { + "epoch": 3.472375690607735, + "grad_norm": 0.26856711506843567, + "learning_rate": 7.585592837151716e-05, + "loss": 1.7351, + "step": 11313 + }, + { + "epoch": 3.47268262737876, + "grad_norm": 0.2554367780685425, + "learning_rate": 7.585167387311802e-05, + "loss": 1.7664, + "step": 11314 + }, + { + "epoch": 3.472989564149785, + "grad_norm": 0.3193204700946808, + "learning_rate": 7.584741911923748e-05, + "loss": 1.7487, + "step": 11315 + }, + { + "epoch": 3.47329650092081, + "grad_norm": 0.3227958679199219, + "learning_rate": 7.584316410991759e-05, + "loss": 1.8107, + "step": 11316 + }, + { 
+ "epoch": 3.4736034376918354, + "grad_norm": 0.33891916275024414, + "learning_rate": 7.58389088452004e-05, + "loss": 1.8466, + "step": 11317 + }, + { + "epoch": 3.4739103744628608, + "grad_norm": 0.27050724625587463, + "learning_rate": 7.583465332512797e-05, + "loss": 1.7877, + "step": 11318 + }, + { + "epoch": 3.4742173112338857, + "grad_norm": 0.2935837209224701, + "learning_rate": 7.583039754974235e-05, + "loss": 1.7932, + "step": 11319 + }, + { + "epoch": 3.474524248004911, + "grad_norm": 0.27780550718307495, + "learning_rate": 7.582614151908561e-05, + "loss": 1.8374, + "step": 11320 + }, + { + "epoch": 3.4748311847759363, + "grad_norm": 0.2579033076763153, + "learning_rate": 7.58218852331998e-05, + "loss": 1.7305, + "step": 11321 + }, + { + "epoch": 3.4751381215469612, + "grad_norm": 0.2531716227531433, + "learning_rate": 7.581762869212699e-05, + "loss": 1.8136, + "step": 11322 + }, + { + "epoch": 3.4754450583179866, + "grad_norm": 0.25504544377326965, + "learning_rate": 7.581337189590924e-05, + "loss": 1.787, + "step": 11323 + }, + { + "epoch": 3.475751995089012, + "grad_norm": 0.23659855127334595, + "learning_rate": 7.580911484458861e-05, + "loss": 1.77, + "step": 11324 + }, + { + "epoch": 3.476058931860037, + "grad_norm": 0.22556856274604797, + "learning_rate": 7.580485753820721e-05, + "loss": 1.7808, + "step": 11325 + }, + { + "epoch": 3.476365868631062, + "grad_norm": 0.2860291600227356, + "learning_rate": 7.580059997680705e-05, + "loss": 1.8224, + "step": 11326 + }, + { + "epoch": 3.476672805402087, + "grad_norm": 0.3134596645832062, + "learning_rate": 7.579634216043023e-05, + "loss": 1.8278, + "step": 11327 + }, + { + "epoch": 3.4769797421731123, + "grad_norm": 0.2883087992668152, + "learning_rate": 7.579208408911887e-05, + "loss": 1.7917, + "step": 11328 + }, + { + "epoch": 3.4772866789441377, + "grad_norm": 0.2743333578109741, + "learning_rate": 7.578782576291501e-05, + "loss": 1.8228, + "step": 11329 + }, + { + "epoch": 3.4775936157151626, + 
"grad_norm": 0.25026053190231323, + "learning_rate": 7.578356718186073e-05, + "loss": 1.7717, + "step": 11330 + }, + { + "epoch": 3.477900552486188, + "grad_norm": 0.246905118227005, + "learning_rate": 7.577930834599813e-05, + "loss": 1.7979, + "step": 11331 + }, + { + "epoch": 3.478207489257213, + "grad_norm": 0.24709418416023254, + "learning_rate": 7.577504925536929e-05, + "loss": 1.8111, + "step": 11332 + }, + { + "epoch": 3.478514426028238, + "grad_norm": 0.25685814023017883, + "learning_rate": 7.577078991001632e-05, + "loss": 1.8255, + "step": 11333 + }, + { + "epoch": 3.4788213627992635, + "grad_norm": 0.23937836289405823, + "learning_rate": 7.576653030998129e-05, + "loss": 1.7254, + "step": 11334 + }, + { + "epoch": 3.4791282995702884, + "grad_norm": 0.22638650238513947, + "learning_rate": 7.57622704553063e-05, + "loss": 1.7847, + "step": 11335 + }, + { + "epoch": 3.4794352363413137, + "grad_norm": 0.26083993911743164, + "learning_rate": 7.575801034603347e-05, + "loss": 1.7947, + "step": 11336 + }, + { + "epoch": 3.479742173112339, + "grad_norm": 0.2715466022491455, + "learning_rate": 7.575374998220488e-05, + "loss": 1.848, + "step": 11337 + }, + { + "epoch": 3.480049109883364, + "grad_norm": 0.25554224848747253, + "learning_rate": 7.574948936386262e-05, + "loss": 1.7811, + "step": 11338 + }, + { + "epoch": 3.4803560466543892, + "grad_norm": 0.2689397931098938, + "learning_rate": 7.574522849104882e-05, + "loss": 1.82, + "step": 11339 + }, + { + "epoch": 3.4806629834254146, + "grad_norm": 0.25027474761009216, + "learning_rate": 7.57409673638056e-05, + "loss": 1.775, + "step": 11340 + }, + { + "epoch": 3.4809699201964395, + "grad_norm": 0.2545457184314728, + "learning_rate": 7.573670598217504e-05, + "loss": 1.8056, + "step": 11341 + }, + { + "epoch": 3.481276856967465, + "grad_norm": 0.28404027223587036, + "learning_rate": 7.573244434619928e-05, + "loss": 1.8372, + "step": 11342 + }, + { + "epoch": 3.4815837937384897, + "grad_norm": 0.28046950697898865, + 
"learning_rate": 7.572818245592041e-05, + "loss": 1.7851, + "step": 11343 + }, + { + "epoch": 3.481890730509515, + "grad_norm": 0.23005759716033936, + "learning_rate": 7.572392031138056e-05, + "loss": 1.7059, + "step": 11344 + }, + { + "epoch": 3.4821976672805404, + "grad_norm": 0.2931719124317169, + "learning_rate": 7.571965791262185e-05, + "loss": 1.84, + "step": 11345 + }, + { + "epoch": 3.4825046040515653, + "grad_norm": 0.4399266242980957, + "learning_rate": 7.571539525968642e-05, + "loss": 1.7465, + "step": 11346 + }, + { + "epoch": 3.4828115408225906, + "grad_norm": 0.48957565426826477, + "learning_rate": 7.571113235261638e-05, + "loss": 1.8494, + "step": 11347 + }, + { + "epoch": 3.4831184775936155, + "grad_norm": 0.37828895449638367, + "learning_rate": 7.570686919145385e-05, + "loss": 1.7598, + "step": 11348 + }, + { + "epoch": 3.483425414364641, + "grad_norm": 0.22943973541259766, + "learning_rate": 7.570260577624098e-05, + "loss": 1.7443, + "step": 11349 + }, + { + "epoch": 3.483732351135666, + "grad_norm": 0.3245384991168976, + "learning_rate": 7.569834210701987e-05, + "loss": 1.7232, + "step": 11350 + }, + { + "epoch": 3.484039287906691, + "grad_norm": 0.4419693648815155, + "learning_rate": 7.569407818383271e-05, + "loss": 1.841, + "step": 11351 + }, + { + "epoch": 3.4843462246777164, + "grad_norm": 0.4061864912509918, + "learning_rate": 7.568981400672159e-05, + "loss": 1.8274, + "step": 11352 + }, + { + "epoch": 3.4846531614487417, + "grad_norm": 0.2609417736530304, + "learning_rate": 7.56855495757287e-05, + "loss": 1.8631, + "step": 11353 + }, + { + "epoch": 3.4849600982197666, + "grad_norm": 0.28758567571640015, + "learning_rate": 7.568128489089612e-05, + "loss": 1.8169, + "step": 11354 + }, + { + "epoch": 3.485267034990792, + "grad_norm": 0.40643060207366943, + "learning_rate": 7.567701995226606e-05, + "loss": 1.809, + "step": 11355 + }, + { + "epoch": 3.4855739717618173, + "grad_norm": 0.37649446725845337, + "learning_rate": 7.56727547598806e-05, 
+ "loss": 1.7661, + "step": 11356 + }, + { + "epoch": 3.485880908532842, + "grad_norm": 0.22863779962062836, + "learning_rate": 7.566848931378197e-05, + "loss": 1.808, + "step": 11357 + }, + { + "epoch": 3.4861878453038675, + "grad_norm": 0.4487019181251526, + "learning_rate": 7.566422361401226e-05, + "loss": 1.7627, + "step": 11358 + }, + { + "epoch": 3.4864947820748924, + "grad_norm": 0.4583640694618225, + "learning_rate": 7.565995766061367e-05, + "loss": 1.8186, + "step": 11359 + }, + { + "epoch": 3.4868017188459177, + "grad_norm": 0.27231526374816895, + "learning_rate": 7.565569145362833e-05, + "loss": 1.8465, + "step": 11360 + }, + { + "epoch": 3.487108655616943, + "grad_norm": 0.3877887725830078, + "learning_rate": 7.565142499309841e-05, + "loss": 1.7668, + "step": 11361 + }, + { + "epoch": 3.487415592387968, + "grad_norm": 0.5511242747306824, + "learning_rate": 7.564715827906606e-05, + "loss": 1.8417, + "step": 11362 + }, + { + "epoch": 3.4877225291589933, + "grad_norm": 0.5112231373786926, + "learning_rate": 7.564289131157348e-05, + "loss": 1.8038, + "step": 11363 + }, + { + "epoch": 3.488029465930018, + "grad_norm": 0.279502809047699, + "learning_rate": 7.56386240906628e-05, + "loss": 1.7545, + "step": 11364 + }, + { + "epoch": 3.4883364027010435, + "grad_norm": 0.30080464482307434, + "learning_rate": 7.563435661637623e-05, + "loss": 1.8136, + "step": 11365 + }, + { + "epoch": 3.488643339472069, + "grad_norm": 0.4424717128276825, + "learning_rate": 7.563008888875591e-05, + "loss": 1.7542, + "step": 11366 + }, + { + "epoch": 3.4889502762430937, + "grad_norm": 0.42144715785980225, + "learning_rate": 7.562582090784403e-05, + "loss": 1.8245, + "step": 11367 + }, + { + "epoch": 3.489257213014119, + "grad_norm": 0.2533668875694275, + "learning_rate": 7.562155267368277e-05, + "loss": 1.8654, + "step": 11368 + }, + { + "epoch": 3.4895641497851444, + "grad_norm": 0.3327534794807434, + "learning_rate": 7.56172841863143e-05, + "loss": 1.7882, + "step": 11369 + }, + { 
+ "epoch": 3.4898710865561693, + "grad_norm": 0.44001486897468567, + "learning_rate": 7.561301544578081e-05, + "loss": 1.8397, + "step": 11370 + }, + { + "epoch": 3.4901780233271946, + "grad_norm": 0.2779090106487274, + "learning_rate": 7.56087464521245e-05, + "loss": 1.7398, + "step": 11371 + }, + { + "epoch": 3.49048496009822, + "grad_norm": 0.3018067479133606, + "learning_rate": 7.560447720538755e-05, + "loss": 1.8076, + "step": 11372 + }, + { + "epoch": 3.490791896869245, + "grad_norm": 0.4370935261249542, + "learning_rate": 7.560020770561216e-05, + "loss": 1.8057, + "step": 11373 + }, + { + "epoch": 3.49109883364027, + "grad_norm": 0.2936978042125702, + "learning_rate": 7.559593795284047e-05, + "loss": 1.7726, + "step": 11374 + }, + { + "epoch": 3.491405770411295, + "grad_norm": 0.28825095295906067, + "learning_rate": 7.559166794711476e-05, + "loss": 1.8039, + "step": 11375 + }, + { + "epoch": 3.4917127071823204, + "grad_norm": 0.39334073662757874, + "learning_rate": 7.55873976884772e-05, + "loss": 1.8388, + "step": 11376 + }, + { + "epoch": 3.4920196439533457, + "grad_norm": 0.33880460262298584, + "learning_rate": 7.558312717696995e-05, + "loss": 1.7791, + "step": 11377 + }, + { + "epoch": 3.4923265807243706, + "grad_norm": 0.4433762729167938, + "learning_rate": 7.557885641263524e-05, + "loss": 1.7786, + "step": 11378 + }, + { + "epoch": 3.492633517495396, + "grad_norm": 0.4710264205932617, + "learning_rate": 7.557458539551527e-05, + "loss": 1.7193, + "step": 11379 + }, + { + "epoch": 3.4929404542664213, + "grad_norm": 0.27514326572418213, + "learning_rate": 7.557031412565228e-05, + "loss": 1.823, + "step": 11380 + }, + { + "epoch": 3.493247391037446, + "grad_norm": 0.4681413471698761, + "learning_rate": 7.556604260308846e-05, + "loss": 1.7598, + "step": 11381 + }, + { + "epoch": 3.4935543278084715, + "grad_norm": 0.5032503604888916, + "learning_rate": 7.556177082786602e-05, + "loss": 1.741, + "step": 11382 + }, + { + "epoch": 3.493861264579497, + 
"grad_norm": 0.2677086889743805, + "learning_rate": 7.555749880002716e-05, + "loss": 1.8528, + "step": 11383 + }, + { + "epoch": 3.4941682013505218, + "grad_norm": 0.43870940804481506, + "learning_rate": 7.555322651961414e-05, + "loss": 1.7632, + "step": 11384 + }, + { + "epoch": 3.494475138121547, + "grad_norm": 0.5403209924697876, + "learning_rate": 7.554895398666914e-05, + "loss": 1.8181, + "step": 11385 + }, + { + "epoch": 3.494782074892572, + "grad_norm": 0.2714318335056305, + "learning_rate": 7.554468120123441e-05, + "loss": 1.8151, + "step": 11386 + }, + { + "epoch": 3.4950890116635973, + "grad_norm": 0.49661698937416077, + "learning_rate": 7.554040816335217e-05, + "loss": 1.8116, + "step": 11387 + }, + { + "epoch": 3.4953959484346226, + "grad_norm": 0.49954715371131897, + "learning_rate": 7.553613487306465e-05, + "loss": 1.8841, + "step": 11388 + }, + { + "epoch": 3.4957028852056475, + "grad_norm": 0.28189441561698914, + "learning_rate": 7.553186133041406e-05, + "loss": 1.7834, + "step": 11389 + }, + { + "epoch": 3.496009821976673, + "grad_norm": 0.36029115319252014, + "learning_rate": 7.552758753544267e-05, + "loss": 1.7796, + "step": 11390 + }, + { + "epoch": 3.4963167587476978, + "grad_norm": 0.45023465156555176, + "learning_rate": 7.552331348819268e-05, + "loss": 1.8773, + "step": 11391 + }, + { + "epoch": 3.496623695518723, + "grad_norm": 0.3235788643360138, + "learning_rate": 7.551903918870636e-05, + "loss": 1.7984, + "step": 11392 + }, + { + "epoch": 3.4969306322897484, + "grad_norm": 0.25656190514564514, + "learning_rate": 7.551476463702596e-05, + "loss": 1.8403, + "step": 11393 + }, + { + "epoch": 3.4972375690607733, + "grad_norm": 0.2866458594799042, + "learning_rate": 7.551048983319366e-05, + "loss": 1.7428, + "step": 11394 + }, + { + "epoch": 3.4975445058317987, + "grad_norm": 0.2713877856731415, + "learning_rate": 7.550621477725177e-05, + "loss": 1.8508, + "step": 11395 + }, + { + "epoch": 3.497851442602824, + "grad_norm": 0.27978867292404175, 
+ "learning_rate": 7.55019394692425e-05, + "loss": 1.8049, + "step": 11396 + }, + { + "epoch": 3.498158379373849, + "grad_norm": 0.3275020122528076, + "learning_rate": 7.549766390920814e-05, + "loss": 1.8553, + "step": 11397 + }, + { + "epoch": 3.498465316144874, + "grad_norm": 0.29947492480278015, + "learning_rate": 7.54933880971909e-05, + "loss": 1.7614, + "step": 11398 + }, + { + "epoch": 3.4987722529158995, + "grad_norm": 0.25790849328041077, + "learning_rate": 7.548911203323308e-05, + "loss": 1.8223, + "step": 11399 + }, + { + "epoch": 3.4990791896869244, + "grad_norm": 0.3145451545715332, + "learning_rate": 7.54848357173769e-05, + "loss": 1.7642, + "step": 11400 + }, + { + "epoch": 3.4993861264579498, + "grad_norm": 0.29052913188934326, + "learning_rate": 7.548055914966463e-05, + "loss": 1.7728, + "step": 11401 + }, + { + "epoch": 3.4996930632289747, + "grad_norm": 0.2741037905216217, + "learning_rate": 7.547628233013854e-05, + "loss": 1.7382, + "step": 11402 + }, + { + "epoch": 3.5, + "grad_norm": 0.2562723755836487, + "learning_rate": 7.54720052588409e-05, + "loss": 1.7455, + "step": 11403 + }, + { + "epoch": 3.5003069367710253, + "grad_norm": 0.27649983763694763, + "learning_rate": 7.546772793581398e-05, + "loss": 1.7194, + "step": 11404 + }, + { + "epoch": 3.5006138735420502, + "grad_norm": 0.27290579676628113, + "learning_rate": 7.546345036110004e-05, + "loss": 1.87, + "step": 11405 + }, + { + "epoch": 3.5009208103130756, + "grad_norm": 0.33585605025291443, + "learning_rate": 7.545917253474136e-05, + "loss": 1.7703, + "step": 11406 + }, + { + "epoch": 3.5012277470841005, + "grad_norm": 0.2592691481113434, + "learning_rate": 7.545489445678022e-05, + "loss": 1.7657, + "step": 11407 + }, + { + "epoch": 3.501534683855126, + "grad_norm": 0.3081367015838623, + "learning_rate": 7.545061612725888e-05, + "loss": 1.8067, + "step": 11408 + }, + { + "epoch": 3.501841620626151, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.544633754621965e-05, + "loss": 
1.8009, + "step": 11409 + }, + { + "epoch": 3.5021485573971765, + "grad_norm": 0.28232479095458984, + "learning_rate": 7.54420587137048e-05, + "loss": 1.8124, + "step": 11410 + }, + { + "epoch": 3.5024554941682013, + "grad_norm": 0.24079222977161407, + "learning_rate": 7.54377796297566e-05, + "loss": 1.789, + "step": 11411 + }, + { + "epoch": 3.5027624309392267, + "grad_norm": 0.27347204089164734, + "learning_rate": 7.543350029441737e-05, + "loss": 1.7704, + "step": 11412 + }, + { + "epoch": 3.5030693677102516, + "grad_norm": 0.25545811653137207, + "learning_rate": 7.542922070772935e-05, + "loss": 1.7871, + "step": 11413 + }, + { + "epoch": 3.503376304481277, + "grad_norm": 0.2507263123989105, + "learning_rate": 7.54249408697349e-05, + "loss": 1.8424, + "step": 11414 + }, + { + "epoch": 3.5036832412523022, + "grad_norm": 0.2776084244251251, + "learning_rate": 7.542066078047627e-05, + "loss": 1.8246, + "step": 11415 + }, + { + "epoch": 3.503990178023327, + "grad_norm": 0.32833749055862427, + "learning_rate": 7.541638043999577e-05, + "loss": 1.7785, + "step": 11416 + }, + { + "epoch": 3.5042971147943525, + "grad_norm": 0.258486270904541, + "learning_rate": 7.541209984833571e-05, + "loss": 1.7543, + "step": 11417 + }, + { + "epoch": 3.5046040515653774, + "grad_norm": 0.25825178623199463, + "learning_rate": 7.540781900553837e-05, + "loss": 1.7939, + "step": 11418 + }, + { + "epoch": 3.5049109883364027, + "grad_norm": 0.26980888843536377, + "learning_rate": 7.540353791164606e-05, + "loss": 1.7777, + "step": 11419 + }, + { + "epoch": 3.505217925107428, + "grad_norm": 0.24103333055973053, + "learning_rate": 7.539925656670111e-05, + "loss": 1.7565, + "step": 11420 + }, + { + "epoch": 3.505524861878453, + "grad_norm": 0.25192007422447205, + "learning_rate": 7.539497497074584e-05, + "loss": 1.7696, + "step": 11421 + }, + { + "epoch": 3.5058317986494782, + "grad_norm": 0.218489870429039, + "learning_rate": 7.539069312382252e-05, + "loss": 1.761, + "step": 11422 + }, + { + 
"epoch": 3.506138735420503, + "grad_norm": 0.27533552050590515, + "learning_rate": 7.53864110259735e-05, + "loss": 1.7374, + "step": 11423 + }, + { + "epoch": 3.5064456721915285, + "grad_norm": 0.2603490650653839, + "learning_rate": 7.538212867724108e-05, + "loss": 1.8342, + "step": 11424 + }, + { + "epoch": 3.506752608962554, + "grad_norm": 0.27340635657310486, + "learning_rate": 7.537784607766758e-05, + "loss": 1.8099, + "step": 11425 + }, + { + "epoch": 3.507059545733579, + "grad_norm": 0.25342679023742676, + "learning_rate": 7.537356322729537e-05, + "loss": 1.7949, + "step": 11426 + }, + { + "epoch": 3.507366482504604, + "grad_norm": 0.292819082736969, + "learning_rate": 7.536928012616669e-05, + "loss": 1.9049, + "step": 11427 + }, + { + "epoch": 3.5076734192756294, + "grad_norm": 0.28256532549858093, + "learning_rate": 7.536499677432393e-05, + "loss": 1.8464, + "step": 11428 + }, + { + "epoch": 3.5079803560466543, + "grad_norm": 0.2672989070415497, + "learning_rate": 7.536071317180942e-05, + "loss": 1.8301, + "step": 11429 + }, + { + "epoch": 3.5082872928176796, + "grad_norm": 0.2525518238544464, + "learning_rate": 7.535642931866546e-05, + "loss": 1.8054, + "step": 11430 + }, + { + "epoch": 3.508594229588705, + "grad_norm": 0.2622447609901428, + "learning_rate": 7.535214521493442e-05, + "loss": 1.8293, + "step": 11431 + }, + { + "epoch": 3.50890116635973, + "grad_norm": 0.27057385444641113, + "learning_rate": 7.534786086065859e-05, + "loss": 1.7426, + "step": 11432 + }, + { + "epoch": 3.509208103130755, + "grad_norm": 0.27363866567611694, + "learning_rate": 7.534357625588038e-05, + "loss": 1.7138, + "step": 11433 + }, + { + "epoch": 3.50951503990178, + "grad_norm": 0.3029060363769531, + "learning_rate": 7.533929140064207e-05, + "loss": 1.864, + "step": 11434 + }, + { + "epoch": 3.5098219766728054, + "grad_norm": 0.3144821524620056, + "learning_rate": 7.533500629498604e-05, + "loss": 1.7846, + "step": 11435 + }, + { + "epoch": 3.5101289134438307, + "grad_norm": 
0.44535213708877563, + "learning_rate": 7.533072093895461e-05, + "loss": 1.799, + "step": 11436 + }, + { + "epoch": 3.5104358502148556, + "grad_norm": 0.25344160199165344, + "learning_rate": 7.532643533259017e-05, + "loss": 1.7391, + "step": 11437 + }, + { + "epoch": 3.510742786985881, + "grad_norm": 0.286026269197464, + "learning_rate": 7.532214947593506e-05, + "loss": 1.8436, + "step": 11438 + }, + { + "epoch": 3.511049723756906, + "grad_norm": 0.3317352533340454, + "learning_rate": 7.53178633690316e-05, + "loss": 1.8507, + "step": 11439 + }, + { + "epoch": 3.511356660527931, + "grad_norm": 0.2547265589237213, + "learning_rate": 7.53135770119222e-05, + "loss": 1.7483, + "step": 11440 + }, + { + "epoch": 3.5116635972989565, + "grad_norm": 0.24281835556030273, + "learning_rate": 7.530929040464917e-05, + "loss": 1.759, + "step": 11441 + }, + { + "epoch": 3.511970534069982, + "grad_norm": 0.2935381829738617, + "learning_rate": 7.530500354725491e-05, + "loss": 1.8235, + "step": 11442 + }, + { + "epoch": 3.5122774708410067, + "grad_norm": 0.26642969250679016, + "learning_rate": 7.53007164397818e-05, + "loss": 1.8324, + "step": 11443 + }, + { + "epoch": 3.512584407612032, + "grad_norm": 0.24830882251262665, + "learning_rate": 7.529642908227215e-05, + "loss": 1.8132, + "step": 11444 + }, + { + "epoch": 3.512891344383057, + "grad_norm": 0.3100191056728363, + "learning_rate": 7.529214147476838e-05, + "loss": 1.8453, + "step": 11445 + }, + { + "epoch": 3.5131982811540823, + "grad_norm": 0.27948811650276184, + "learning_rate": 7.528785361731282e-05, + "loss": 1.7792, + "step": 11446 + }, + { + "epoch": 3.5135052179251076, + "grad_norm": 0.26978832483291626, + "learning_rate": 7.528356550994787e-05, + "loss": 1.7857, + "step": 11447 + }, + { + "epoch": 3.5138121546961325, + "grad_norm": 0.30527836084365845, + "learning_rate": 7.527927715271592e-05, + "loss": 1.807, + "step": 11448 + }, + { + "epoch": 3.514119091467158, + "grad_norm": 0.2915664315223694, + "learning_rate": 
7.527498854565934e-05, + "loss": 1.8414, + "step": 11449 + }, + { + "epoch": 3.5144260282381827, + "grad_norm": 0.2854034900665283, + "learning_rate": 7.52706996888205e-05, + "loss": 1.793, + "step": 11450 + }, + { + "epoch": 3.514732965009208, + "grad_norm": 0.30281978845596313, + "learning_rate": 7.52664105822418e-05, + "loss": 1.7896, + "step": 11451 + }, + { + "epoch": 3.5150399017802334, + "grad_norm": 0.3317166566848755, + "learning_rate": 7.526212122596561e-05, + "loss": 1.7776, + "step": 11452 + }, + { + "epoch": 3.5153468385512583, + "grad_norm": 0.3400021195411682, + "learning_rate": 7.525783162003434e-05, + "loss": 1.8411, + "step": 11453 + }, + { + "epoch": 3.5156537753222836, + "grad_norm": 0.25169485807418823, + "learning_rate": 7.525354176449037e-05, + "loss": 1.7871, + "step": 11454 + }, + { + "epoch": 3.5159607120933085, + "grad_norm": 0.3442455530166626, + "learning_rate": 7.52492516593761e-05, + "loss": 1.7644, + "step": 11455 + }, + { + "epoch": 3.516267648864334, + "grad_norm": 0.35644033551216125, + "learning_rate": 7.524496130473394e-05, + "loss": 1.801, + "step": 11456 + }, + { + "epoch": 3.516574585635359, + "grad_norm": 0.3180185854434967, + "learning_rate": 7.524067070060625e-05, + "loss": 1.7897, + "step": 11457 + }, + { + "epoch": 3.5168815224063845, + "grad_norm": 0.2417978048324585, + "learning_rate": 7.523637984703548e-05, + "loss": 1.8527, + "step": 11458 + }, + { + "epoch": 3.5171884591774094, + "grad_norm": 0.29661375284194946, + "learning_rate": 7.5232088744064e-05, + "loss": 1.8276, + "step": 11459 + }, + { + "epoch": 3.5174953959484347, + "grad_norm": 0.2467545121908188, + "learning_rate": 7.522779739173424e-05, + "loss": 1.7819, + "step": 11460 + }, + { + "epoch": 3.5178023327194596, + "grad_norm": 0.26177898049354553, + "learning_rate": 7.522350579008859e-05, + "loss": 1.8017, + "step": 11461 + }, + { + "epoch": 3.518109269490485, + "grad_norm": 0.28740498423576355, + "learning_rate": 7.521921393916948e-05, + "loss": 1.7863, 
+ "step": 11462 + }, + { + "epoch": 3.5184162062615103, + "grad_norm": 0.28685200214385986, + "learning_rate": 7.521492183901932e-05, + "loss": 1.8069, + "step": 11463 + }, + { + "epoch": 3.518723143032535, + "grad_norm": 0.24174338579177856, + "learning_rate": 7.521062948968051e-05, + "loss": 1.7523, + "step": 11464 + }, + { + "epoch": 3.5190300798035605, + "grad_norm": 0.23273243010044098, + "learning_rate": 7.520633689119548e-05, + "loss": 1.7827, + "step": 11465 + }, + { + "epoch": 3.5193370165745854, + "grad_norm": 0.22708217799663544, + "learning_rate": 7.520204404360667e-05, + "loss": 1.7377, + "step": 11466 + }, + { + "epoch": 3.5196439533456108, + "grad_norm": 0.24725353717803955, + "learning_rate": 7.519775094695649e-05, + "loss": 1.7828, + "step": 11467 + }, + { + "epoch": 3.519950890116636, + "grad_norm": 0.23046265542507172, + "learning_rate": 7.519345760128736e-05, + "loss": 1.7427, + "step": 11468 + }, + { + "epoch": 3.520257826887661, + "grad_norm": 0.2618728280067444, + "learning_rate": 7.518916400664171e-05, + "loss": 1.8133, + "step": 11469 + }, + { + "epoch": 3.5205647636586863, + "grad_norm": 0.23232363164424896, + "learning_rate": 7.5184870163062e-05, + "loss": 1.7468, + "step": 11470 + }, + { + "epoch": 3.520871700429711, + "grad_norm": 0.21993626654148102, + "learning_rate": 7.51805760705906e-05, + "loss": 1.7565, + "step": 11471 + }, + { + "epoch": 3.5211786372007365, + "grad_norm": 0.23563124239444733, + "learning_rate": 7.517628172927001e-05, + "loss": 1.7795, + "step": 11472 + }, + { + "epoch": 3.521485573971762, + "grad_norm": 0.24502862989902496, + "learning_rate": 7.517198713914266e-05, + "loss": 1.813, + "step": 11473 + }, + { + "epoch": 3.521792510742787, + "grad_norm": 0.24745969474315643, + "learning_rate": 7.516769230025097e-05, + "loss": 1.7601, + "step": 11474 + }, + { + "epoch": 3.522099447513812, + "grad_norm": 0.27686986327171326, + "learning_rate": 7.516339721263739e-05, + "loss": 1.8121, + "step": 11475 + }, + { + "epoch": 
3.5224063842848374, + "grad_norm": 0.3110332787036896, + "learning_rate": 7.515910187634439e-05, + "loss": 1.7978, + "step": 11476 + }, + { + "epoch": 3.5227133210558623, + "grad_norm": 0.3394792377948761, + "learning_rate": 7.515480629141436e-05, + "loss": 1.8427, + "step": 11477 + }, + { + "epoch": 3.5230202578268877, + "grad_norm": 0.2802537679672241, + "learning_rate": 7.515051045788984e-05, + "loss": 1.7343, + "step": 11478 + }, + { + "epoch": 3.523327194597913, + "grad_norm": 0.23687711358070374, + "learning_rate": 7.514621437581319e-05, + "loss": 1.7786, + "step": 11479 + }, + { + "epoch": 3.523634131368938, + "grad_norm": 0.31114310026168823, + "learning_rate": 7.514191804522693e-05, + "loss": 1.8137, + "step": 11480 + }, + { + "epoch": 3.523941068139963, + "grad_norm": 0.3257891833782196, + "learning_rate": 7.513762146617351e-05, + "loss": 1.8015, + "step": 11481 + }, + { + "epoch": 3.524248004910988, + "grad_norm": 0.24353443086147308, + "learning_rate": 7.513332463869536e-05, + "loss": 1.7485, + "step": 11482 + }, + { + "epoch": 3.5245549416820134, + "grad_norm": 0.29861485958099365, + "learning_rate": 7.512902756283498e-05, + "loss": 1.7993, + "step": 11483 + }, + { + "epoch": 3.5248618784530388, + "grad_norm": 0.40380924940109253, + "learning_rate": 7.51247302386348e-05, + "loss": 1.7664, + "step": 11484 + }, + { + "epoch": 3.525168815224064, + "grad_norm": 0.3365862965583801, + "learning_rate": 7.512043266613733e-05, + "loss": 1.7512, + "step": 11485 + }, + { + "epoch": 3.525475751995089, + "grad_norm": 0.2502824068069458, + "learning_rate": 7.511613484538502e-05, + "loss": 1.8414, + "step": 11486 + }, + { + "epoch": 3.5257826887661143, + "grad_norm": 0.2598603069782257, + "learning_rate": 7.511183677642034e-05, + "loss": 1.7358, + "step": 11487 + }, + { + "epoch": 3.5260896255371392, + "grad_norm": 0.30246880650520325, + "learning_rate": 7.510753845928576e-05, + "loss": 1.791, + "step": 11488 + }, + { + "epoch": 3.5263965623081646, + "grad_norm": 
0.25170832872390747, + "learning_rate": 7.510323989402378e-05, + "loss": 1.7498, + "step": 11489 + }, + { + "epoch": 3.52670349907919, + "grad_norm": 0.2925282418727875, + "learning_rate": 7.509894108067688e-05, + "loss": 1.8413, + "step": 11490 + }, + { + "epoch": 3.527010435850215, + "grad_norm": 0.2643601596355438, + "learning_rate": 7.509464201928752e-05, + "loss": 1.8052, + "step": 11491 + }, + { + "epoch": 3.52731737262124, + "grad_norm": 0.2938917279243469, + "learning_rate": 7.50903427098982e-05, + "loss": 1.7308, + "step": 11492 + }, + { + "epoch": 3.527624309392265, + "grad_norm": 0.2978343367576599, + "learning_rate": 7.508604315255142e-05, + "loss": 1.8147, + "step": 11493 + }, + { + "epoch": 3.5279312461632903, + "grad_norm": 0.2507816255092621, + "learning_rate": 7.508174334728963e-05, + "loss": 1.774, + "step": 11494 + }, + { + "epoch": 3.5282381829343157, + "grad_norm": 0.32971861958503723, + "learning_rate": 7.507744329415538e-05, + "loss": 1.7634, + "step": 11495 + }, + { + "epoch": 3.5285451197053406, + "grad_norm": 0.3149639964103699, + "learning_rate": 7.507314299319113e-05, + "loss": 1.8032, + "step": 11496 + }, + { + "epoch": 3.528852056476366, + "grad_norm": 0.2721364498138428, + "learning_rate": 7.506884244443937e-05, + "loss": 1.7702, + "step": 11497 + }, + { + "epoch": 3.529158993247391, + "grad_norm": 0.29375985264778137, + "learning_rate": 7.506454164794263e-05, + "loss": 1.8673, + "step": 11498 + }, + { + "epoch": 3.529465930018416, + "grad_norm": 0.379944384098053, + "learning_rate": 7.50602406037434e-05, + "loss": 1.883, + "step": 11499 + }, + { + "epoch": 3.5297728667894415, + "grad_norm": 0.4041840136051178, + "learning_rate": 7.505593931188417e-05, + "loss": 1.7998, + "step": 11500 + }, + { + "epoch": 3.530079803560467, + "grad_norm": 0.30013784766197205, + "learning_rate": 7.505163777240747e-05, + "loss": 1.775, + "step": 11501 + }, + { + "epoch": 3.5303867403314917, + "grad_norm": 0.25161153078079224, + "learning_rate": 
7.50473359853558e-05, + "loss": 1.8609, + "step": 11502 + }, + { + "epoch": 3.530693677102517, + "grad_norm": 0.2803831100463867, + "learning_rate": 7.504303395077168e-05, + "loss": 1.8397, + "step": 11503 + }, + { + "epoch": 3.531000613873542, + "grad_norm": 0.26678118109703064, + "learning_rate": 7.503873166869762e-05, + "loss": 1.7877, + "step": 11504 + }, + { + "epoch": 3.5313075506445673, + "grad_norm": 0.24280449748039246, + "learning_rate": 7.503442913917613e-05, + "loss": 1.7891, + "step": 11505 + }, + { + "epoch": 3.5316144874155926, + "grad_norm": 0.26461485028266907, + "learning_rate": 7.503012636224976e-05, + "loss": 1.7993, + "step": 11506 + }, + { + "epoch": 3.5319214241866175, + "grad_norm": 0.27001824975013733, + "learning_rate": 7.502582333796098e-05, + "loss": 1.7719, + "step": 11507 + }, + { + "epoch": 3.532228360957643, + "grad_norm": 0.27585846185684204, + "learning_rate": 7.502152006635237e-05, + "loss": 1.7412, + "step": 11508 + }, + { + "epoch": 3.5325352977286677, + "grad_norm": 0.24896648526191711, + "learning_rate": 7.501721654746643e-05, + "loss": 1.7459, + "step": 11509 + }, + { + "epoch": 3.532842234499693, + "grad_norm": 0.2308502197265625, + "learning_rate": 7.501291278134569e-05, + "loss": 1.7717, + "step": 11510 + }, + { + "epoch": 3.5331491712707184, + "grad_norm": 0.3026069104671478, + "learning_rate": 7.500860876803267e-05, + "loss": 1.8578, + "step": 11511 + }, + { + "epoch": 3.5334561080417433, + "grad_norm": 0.30242082476615906, + "learning_rate": 7.500430450756995e-05, + "loss": 1.7793, + "step": 11512 + }, + { + "epoch": 3.5337630448127686, + "grad_norm": 0.2583339214324951, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8388, + "step": 11513 + }, + { + "epoch": 3.5340699815837935, + "grad_norm": 0.29673871397972107, + "learning_rate": 7.499569524536542e-05, + "loss": 1.7749, + "step": 11514 + }, + { + "epoch": 3.534376918354819, + "grad_norm": 0.35199788212776184, + "learning_rate": 7.499139024370874e-05, + "loss": 
1.7863, + "step": 11515 + }, + { + "epoch": 3.534683855125844, + "grad_norm": 0.25776436924934387, + "learning_rate": 7.498708499507247e-05, + "loss": 1.7568, + "step": 11516 + }, + { + "epoch": 3.5349907918968695, + "grad_norm": 0.26081520318984985, + "learning_rate": 7.498277949949919e-05, + "loss": 1.807, + "step": 11517 + }, + { + "epoch": 3.5352977286678944, + "grad_norm": 0.29247912764549255, + "learning_rate": 7.497847375703145e-05, + "loss": 1.7568, + "step": 11518 + }, + { + "epoch": 3.5356046654389197, + "grad_norm": 0.20964498817920685, + "learning_rate": 7.497416776771178e-05, + "loss": 1.7601, + "step": 11519 + }, + { + "epoch": 3.5359116022099446, + "grad_norm": 0.28739818930625916, + "learning_rate": 7.496986153158273e-05, + "loss": 1.7915, + "step": 11520 + }, + { + "epoch": 3.53621853898097, + "grad_norm": 0.3109932839870453, + "learning_rate": 7.496555504868691e-05, + "loss": 1.8046, + "step": 11521 + }, + { + "epoch": 3.5365254757519953, + "grad_norm": 0.259284108877182, + "learning_rate": 7.496124831906681e-05, + "loss": 1.7595, + "step": 11522 + }, + { + "epoch": 3.53683241252302, + "grad_norm": 0.265909343957901, + "learning_rate": 7.495694134276504e-05, + "loss": 1.8249, + "step": 11523 + }, + { + "epoch": 3.5371393492940455, + "grad_norm": 0.2478799819946289, + "learning_rate": 7.495263411982415e-05, + "loss": 1.8531, + "step": 11524 + }, + { + "epoch": 3.5374462860650704, + "grad_norm": 0.2636432945728302, + "learning_rate": 7.494832665028671e-05, + "loss": 1.8114, + "step": 11525 + }, + { + "epoch": 3.5377532228360957, + "grad_norm": 0.25323864817619324, + "learning_rate": 7.494401893419527e-05, + "loss": 1.8271, + "step": 11526 + }, + { + "epoch": 3.538060159607121, + "grad_norm": 0.2352467179298401, + "learning_rate": 7.493971097159241e-05, + "loss": 1.7524, + "step": 11527 + }, + { + "epoch": 3.538367096378146, + "grad_norm": 0.2788623869419098, + "learning_rate": 7.493540276252072e-05, + "loss": 1.8238, + "step": 11528 + }, + { + 
"epoch": 3.5386740331491713, + "grad_norm": 0.3506326377391815, + "learning_rate": 7.493109430702277e-05, + "loss": 1.8525, + "step": 11529 + }, + { + "epoch": 3.538980969920196, + "grad_norm": 0.3685263395309448, + "learning_rate": 7.492678560514113e-05, + "loss": 1.8497, + "step": 11530 + }, + { + "epoch": 3.5392879066912215, + "grad_norm": 0.32200056314468384, + "learning_rate": 7.492247665691837e-05, + "loss": 1.7587, + "step": 11531 + }, + { + "epoch": 3.539594843462247, + "grad_norm": 0.2800062894821167, + "learning_rate": 7.49181674623971e-05, + "loss": 1.8188, + "step": 11532 + }, + { + "epoch": 3.539901780233272, + "grad_norm": 0.24137580394744873, + "learning_rate": 7.491385802161989e-05, + "loss": 1.7947, + "step": 11533 + }, + { + "epoch": 3.540208717004297, + "grad_norm": 0.21900027990341187, + "learning_rate": 7.490954833462933e-05, + "loss": 1.7722, + "step": 11534 + }, + { + "epoch": 3.5405156537753224, + "grad_norm": 0.25009945034980774, + "learning_rate": 7.490523840146803e-05, + "loss": 1.8173, + "step": 11535 + }, + { + "epoch": 3.5408225905463473, + "grad_norm": 0.2778431475162506, + "learning_rate": 7.490092822217855e-05, + "loss": 1.8368, + "step": 11536 + }, + { + "epoch": 3.5411295273173726, + "grad_norm": 0.2845982611179352, + "learning_rate": 7.48966177968035e-05, + "loss": 1.7539, + "step": 11537 + }, + { + "epoch": 3.541436464088398, + "grad_norm": 0.27480921149253845, + "learning_rate": 7.48923071253855e-05, + "loss": 1.8494, + "step": 11538 + }, + { + "epoch": 3.541743400859423, + "grad_norm": 0.2722087502479553, + "learning_rate": 7.488799620796711e-05, + "loss": 1.8422, + "step": 11539 + }, + { + "epoch": 3.542050337630448, + "grad_norm": 0.2984340190887451, + "learning_rate": 7.488368504459097e-05, + "loss": 1.8042, + "step": 11540 + }, + { + "epoch": 3.542357274401473, + "grad_norm": 0.2405850738286972, + "learning_rate": 7.487937363529966e-05, + "loss": 1.749, + "step": 11541 + }, + { + "epoch": 3.5426642111724984, + "grad_norm": 
0.24816973507404327, + "learning_rate": 7.487506198013579e-05, + "loss": 1.8671, + "step": 11542 + }, + { + "epoch": 3.5429711479435237, + "grad_norm": 0.2796473503112793, + "learning_rate": 7.487075007914199e-05, + "loss": 1.8023, + "step": 11543 + }, + { + "epoch": 3.5432780847145486, + "grad_norm": 0.2600162625312805, + "learning_rate": 7.486643793236086e-05, + "loss": 1.7997, + "step": 11544 + }, + { + "epoch": 3.543585021485574, + "grad_norm": 0.2746226489543915, + "learning_rate": 7.486212553983503e-05, + "loss": 1.7773, + "step": 11545 + }, + { + "epoch": 3.5438919582565993, + "grad_norm": 0.24142079055309296, + "learning_rate": 7.485781290160708e-05, + "loss": 1.791, + "step": 11546 + }, + { + "epoch": 3.544198895027624, + "grad_norm": 0.2472934126853943, + "learning_rate": 7.485350001771966e-05, + "loss": 1.8183, + "step": 11547 + }, + { + "epoch": 3.5445058317986495, + "grad_norm": 0.26891404390335083, + "learning_rate": 7.48491868882154e-05, + "loss": 1.7421, + "step": 11548 + }, + { + "epoch": 3.544812768569675, + "grad_norm": 0.24820464849472046, + "learning_rate": 7.48448735131369e-05, + "loss": 1.7372, + "step": 11549 + }, + { + "epoch": 3.5451197053406998, + "grad_norm": 0.2456594705581665, + "learning_rate": 7.484055989252679e-05, + "loss": 1.7883, + "step": 11550 + }, + { + "epoch": 3.545426642111725, + "grad_norm": 0.32420551776885986, + "learning_rate": 7.48362460264277e-05, + "loss": 1.8363, + "step": 11551 + }, + { + "epoch": 3.54573357888275, + "grad_norm": 0.3187662661075592, + "learning_rate": 7.483193191488229e-05, + "loss": 1.7957, + "step": 11552 + }, + { + "epoch": 3.5460405156537753, + "grad_norm": 0.2845410108566284, + "learning_rate": 7.482761755793316e-05, + "loss": 1.8288, + "step": 11553 + }, + { + "epoch": 3.5463474524248007, + "grad_norm": 0.2816021740436554, + "learning_rate": 7.482330295562298e-05, + "loss": 1.7562, + "step": 11554 + }, + { + "epoch": 3.5466543891958255, + "grad_norm": 0.28938058018684387, + "learning_rate": 
7.481898810799435e-05, + "loss": 1.8139, + "step": 11555 + }, + { + "epoch": 3.546961325966851, + "grad_norm": 0.3305707573890686, + "learning_rate": 7.481467301508995e-05, + "loss": 1.8956, + "step": 11556 + }, + { + "epoch": 3.5472682627378758, + "grad_norm": 0.3890376091003418, + "learning_rate": 7.48103576769524e-05, + "loss": 1.8552, + "step": 11557 + }, + { + "epoch": 3.547575199508901, + "grad_norm": 0.3900652825832367, + "learning_rate": 7.480604209362434e-05, + "loss": 1.7748, + "step": 11558 + }, + { + "epoch": 3.5478821362799264, + "grad_norm": 0.3297326862812042, + "learning_rate": 7.480172626514845e-05, + "loss": 1.8201, + "step": 11559 + }, + { + "epoch": 3.5481890730509518, + "grad_norm": 0.28797218203544617, + "learning_rate": 7.479741019156737e-05, + "loss": 1.7652, + "step": 11560 + }, + { + "epoch": 3.5484960098219767, + "grad_norm": 0.2764691114425659, + "learning_rate": 7.479309387292373e-05, + "loss": 1.7534, + "step": 11561 + }, + { + "epoch": 3.548802946593002, + "grad_norm": 0.25067585706710815, + "learning_rate": 7.47887773092602e-05, + "loss": 1.7849, + "step": 11562 + }, + { + "epoch": 3.549109883364027, + "grad_norm": 0.29966798424720764, + "learning_rate": 7.478446050061947e-05, + "loss": 1.8299, + "step": 11563 + }, + { + "epoch": 3.549416820135052, + "grad_norm": 0.24068406224250793, + "learning_rate": 7.478014344704416e-05, + "loss": 1.8366, + "step": 11564 + }, + { + "epoch": 3.5497237569060776, + "grad_norm": 0.2559303641319275, + "learning_rate": 7.477582614857695e-05, + "loss": 1.7665, + "step": 11565 + }, + { + "epoch": 3.5500306936771024, + "grad_norm": 0.24617858231067657, + "learning_rate": 7.47715086052605e-05, + "loss": 1.8334, + "step": 11566 + }, + { + "epoch": 3.550337630448128, + "grad_norm": 0.2433501034975052, + "learning_rate": 7.476719081713749e-05, + "loss": 1.7963, + "step": 11567 + }, + { + "epoch": 3.5506445672191527, + "grad_norm": 0.2583518326282501, + "learning_rate": 7.476287278425057e-05, + "loss": 1.8311, 
+ "step": 11568 + }, + { + "epoch": 3.550951503990178, + "grad_norm": 0.3232485055923462, + "learning_rate": 7.475855450664244e-05, + "loss": 1.9162, + "step": 11569 + }, + { + "epoch": 3.5512584407612033, + "grad_norm": 0.28247153759002686, + "learning_rate": 7.475423598435576e-05, + "loss": 1.8027, + "step": 11570 + }, + { + "epoch": 3.5515653775322282, + "grad_norm": 0.27201834321022034, + "learning_rate": 7.47499172174332e-05, + "loss": 1.7822, + "step": 11571 + }, + { + "epoch": 3.5518723143032536, + "grad_norm": 0.2408471554517746, + "learning_rate": 7.474559820591748e-05, + "loss": 1.7735, + "step": 11572 + }, + { + "epoch": 3.5521792510742785, + "grad_norm": 0.24187393486499786, + "learning_rate": 7.474127894985124e-05, + "loss": 1.7931, + "step": 11573 + }, + { + "epoch": 3.552486187845304, + "grad_norm": 0.2759699523448944, + "learning_rate": 7.473695944927717e-05, + "loss": 1.8407, + "step": 11574 + }, + { + "epoch": 3.552793124616329, + "grad_norm": 0.2503111958503723, + "learning_rate": 7.473263970423797e-05, + "loss": 1.7613, + "step": 11575 + }, + { + "epoch": 3.5531000613873545, + "grad_norm": 0.24795177578926086, + "learning_rate": 7.472831971477633e-05, + "loss": 1.8221, + "step": 11576 + }, + { + "epoch": 3.5534069981583793, + "grad_norm": 0.23190177977085114, + "learning_rate": 7.472399948093494e-05, + "loss": 1.7541, + "step": 11577 + }, + { + "epoch": 3.5537139349294047, + "grad_norm": 0.24650825560092926, + "learning_rate": 7.471967900275653e-05, + "loss": 1.8002, + "step": 11578 + }, + { + "epoch": 3.5540208717004296, + "grad_norm": 0.256598562002182, + "learning_rate": 7.471535828028372e-05, + "loss": 1.7052, + "step": 11579 + }, + { + "epoch": 3.554327808471455, + "grad_norm": 0.2715381681919098, + "learning_rate": 7.471103731355926e-05, + "loss": 1.7701, + "step": 11580 + }, + { + "epoch": 3.5546347452424802, + "grad_norm": 0.29806044697761536, + "learning_rate": 7.470671610262586e-05, + "loss": 1.7614, + "step": 11581 + }, + { + "epoch": 
3.554941682013505, + "grad_norm": 0.26364314556121826, + "learning_rate": 7.470239464752621e-05, + "loss": 1.7957, + "step": 11582 + }, + { + "epoch": 3.5552486187845305, + "grad_norm": 0.29270800948143005, + "learning_rate": 7.4698072948303e-05, + "loss": 1.8263, + "step": 11583 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.25941839814186096, + "learning_rate": 7.469375100499898e-05, + "loss": 1.8517, + "step": 11584 + }, + { + "epoch": 3.5558624923265807, + "grad_norm": 0.29509237408638, + "learning_rate": 7.468942881765681e-05, + "loss": 1.8643, + "step": 11585 + }, + { + "epoch": 3.556169429097606, + "grad_norm": 0.23090367019176483, + "learning_rate": 7.468510638631926e-05, + "loss": 1.7239, + "step": 11586 + }, + { + "epoch": 3.556476365868631, + "grad_norm": 0.2696724236011505, + "learning_rate": 7.468078371102901e-05, + "loss": 1.848, + "step": 11587 + }, + { + "epoch": 3.5567833026396563, + "grad_norm": 0.2691192626953125, + "learning_rate": 7.46764607918288e-05, + "loss": 1.8194, + "step": 11588 + }, + { + "epoch": 3.557090239410681, + "grad_norm": 0.26616501808166504, + "learning_rate": 7.467213762876131e-05, + "loss": 1.8382, + "step": 11589 + }, + { + "epoch": 3.5573971761817065, + "grad_norm": 0.30629831552505493, + "learning_rate": 7.466781422186933e-05, + "loss": 1.8417, + "step": 11590 + }, + { + "epoch": 3.557704112952732, + "grad_norm": 0.27212417125701904, + "learning_rate": 7.466349057119552e-05, + "loss": 1.7612, + "step": 11591 + }, + { + "epoch": 3.558011049723757, + "grad_norm": 0.2872084379196167, + "learning_rate": 7.465916667678266e-05, + "loss": 1.7998, + "step": 11592 + }, + { + "epoch": 3.558317986494782, + "grad_norm": 0.3017117977142334, + "learning_rate": 7.465484253867348e-05, + "loss": 1.7996, + "step": 11593 + }, + { + "epoch": 3.5586249232658074, + "grad_norm": 0.2707957327365875, + "learning_rate": 7.465051815691066e-05, + "loss": 1.7678, + "step": 11594 + }, + { + "epoch": 3.5589318600368323, + "grad_norm": 
0.28932711482048035, + "learning_rate": 7.464619353153702e-05, + "loss": 1.8576, + "step": 11595 + }, + { + "epoch": 3.5592387968078576, + "grad_norm": 0.2585125267505646, + "learning_rate": 7.464186866259519e-05, + "loss": 1.8678, + "step": 11596 + }, + { + "epoch": 3.559545733578883, + "grad_norm": 0.24386851489543915, + "learning_rate": 7.4637543550128e-05, + "loss": 1.7778, + "step": 11597 + }, + { + "epoch": 3.559852670349908, + "grad_norm": 0.2375860959291458, + "learning_rate": 7.463321819417817e-05, + "loss": 1.8096, + "step": 11598 + }, + { + "epoch": 3.560159607120933, + "grad_norm": 0.2341299206018448, + "learning_rate": 7.462889259478842e-05, + "loss": 1.7191, + "step": 11599 + }, + { + "epoch": 3.560466543891958, + "grad_norm": 0.2510595917701721, + "learning_rate": 7.462456675200154e-05, + "loss": 1.7763, + "step": 11600 + }, + { + "epoch": 3.5607734806629834, + "grad_norm": 0.2554674744606018, + "learning_rate": 7.462024066586025e-05, + "loss": 1.7578, + "step": 11601 + }, + { + "epoch": 3.5610804174340087, + "grad_norm": 0.25040730834007263, + "learning_rate": 7.46159143364073e-05, + "loss": 1.8194, + "step": 11602 + }, + { + "epoch": 3.5613873542050336, + "grad_norm": 0.24294932186603546, + "learning_rate": 7.461158776368547e-05, + "loss": 1.8063, + "step": 11603 + }, + { + "epoch": 3.561694290976059, + "grad_norm": 0.2388325333595276, + "learning_rate": 7.46072609477375e-05, + "loss": 1.7942, + "step": 11604 + }, + { + "epoch": 3.562001227747084, + "grad_norm": 0.2569502890110016, + "learning_rate": 7.460293388860615e-05, + "loss": 1.7824, + "step": 11605 + }, + { + "epoch": 3.562308164518109, + "grad_norm": 0.24004346132278442, + "learning_rate": 7.45986065863342e-05, + "loss": 1.8676, + "step": 11606 + }, + { + "epoch": 3.5626151012891345, + "grad_norm": 0.25446319580078125, + "learning_rate": 7.45942790409644e-05, + "loss": 1.7726, + "step": 11607 + }, + { + "epoch": 3.56292203806016, + "grad_norm": 0.26257482171058655, + "learning_rate": 
7.458995125253951e-05, + "loss": 1.779, + "step": 11608 + }, + { + "epoch": 3.5632289748311847, + "grad_norm": 0.27703070640563965, + "learning_rate": 7.458562322110231e-05, + "loss": 1.8247, + "step": 11609 + }, + { + "epoch": 3.56353591160221, + "grad_norm": 0.25478535890579224, + "learning_rate": 7.458129494669556e-05, + "loss": 1.7794, + "step": 11610 + }, + { + "epoch": 3.563842848373235, + "grad_norm": 0.26173365116119385, + "learning_rate": 7.457696642936207e-05, + "loss": 1.758, + "step": 11611 + }, + { + "epoch": 3.5641497851442603, + "grad_norm": 0.25077274441719055, + "learning_rate": 7.45726376691446e-05, + "loss": 1.8234, + "step": 11612 + }, + { + "epoch": 3.5644567219152856, + "grad_norm": 0.2591109275817871, + "learning_rate": 7.456830866608589e-05, + "loss": 1.7723, + "step": 11613 + }, + { + "epoch": 3.5647636586863105, + "grad_norm": 0.2653447091579437, + "learning_rate": 7.456397942022877e-05, + "loss": 1.7839, + "step": 11614 + }, + { + "epoch": 3.565070595457336, + "grad_norm": 0.3203454911708832, + "learning_rate": 7.455964993161601e-05, + "loss": 1.8548, + "step": 11615 + }, + { + "epoch": 3.5653775322283607, + "grad_norm": 0.3041793704032898, + "learning_rate": 7.455532020029039e-05, + "loss": 1.7925, + "step": 11616 + }, + { + "epoch": 3.565684468999386, + "grad_norm": 0.26066139340400696, + "learning_rate": 7.45509902262947e-05, + "loss": 1.7905, + "step": 11617 + }, + { + "epoch": 3.5659914057704114, + "grad_norm": 0.2483314871788025, + "learning_rate": 7.454666000967174e-05, + "loss": 1.7658, + "step": 11618 + }, + { + "epoch": 3.5662983425414367, + "grad_norm": 0.24285900592803955, + "learning_rate": 7.45423295504643e-05, + "loss": 1.7575, + "step": 11619 + }, + { + "epoch": 3.5666052793124616, + "grad_norm": 0.27231669425964355, + "learning_rate": 7.453799884871517e-05, + "loss": 1.8389, + "step": 11620 + }, + { + "epoch": 3.566912216083487, + "grad_norm": 0.24324406683444977, + "learning_rate": 7.453366790446717e-05, + "loss": 
1.7775, + "step": 11621 + }, + { + "epoch": 3.567219152854512, + "grad_norm": 0.2724440097808838, + "learning_rate": 7.452933671776305e-05, + "loss": 1.8135, + "step": 11622 + }, + { + "epoch": 3.567526089625537, + "grad_norm": 0.22207655012607574, + "learning_rate": 7.452500528864568e-05, + "loss": 1.722, + "step": 11623 + }, + { + "epoch": 3.5678330263965625, + "grad_norm": 0.25650298595428467, + "learning_rate": 7.452067361715782e-05, + "loss": 1.7813, + "step": 11624 + }, + { + "epoch": 3.5681399631675874, + "grad_norm": 0.2582200765609741, + "learning_rate": 7.45163417033423e-05, + "loss": 1.8253, + "step": 11625 + }, + { + "epoch": 3.5684468999386127, + "grad_norm": 0.29545384645462036, + "learning_rate": 7.451200954724188e-05, + "loss": 1.8108, + "step": 11626 + }, + { + "epoch": 3.5687538367096376, + "grad_norm": 0.30457428097724915, + "learning_rate": 7.450767714889946e-05, + "loss": 1.8257, + "step": 11627 + }, + { + "epoch": 3.569060773480663, + "grad_norm": 0.2955166697502136, + "learning_rate": 7.450334450835781e-05, + "loss": 1.8172, + "step": 11628 + }, + { + "epoch": 3.5693677102516883, + "grad_norm": 0.2793857753276825, + "learning_rate": 7.449901162565974e-05, + "loss": 1.8493, + "step": 11629 + }, + { + "epoch": 3.569674647022713, + "grad_norm": 0.27154335379600525, + "learning_rate": 7.449467850084808e-05, + "loss": 1.8306, + "step": 11630 + }, + { + "epoch": 3.5699815837937385, + "grad_norm": 0.22336189448833466, + "learning_rate": 7.449034513396564e-05, + "loss": 1.7435, + "step": 11631 + }, + { + "epoch": 3.5702885205647634, + "grad_norm": 0.22799183428287506, + "learning_rate": 7.448601152505526e-05, + "loss": 1.7818, + "step": 11632 + }, + { + "epoch": 3.5705954573357888, + "grad_norm": 0.26670658588409424, + "learning_rate": 7.448167767415976e-05, + "loss": 1.7777, + "step": 11633 + }, + { + "epoch": 3.570902394106814, + "grad_norm": 0.2848666310310364, + "learning_rate": 7.447734358132196e-05, + "loss": 1.7572, + "step": 11634 + }, + { + 
"epoch": 3.5712093308778394, + "grad_norm": 0.26843544840812683, + "learning_rate": 7.447300924658473e-05, + "loss": 1.7642, + "step": 11635 + }, + { + "epoch": 3.5715162676488643, + "grad_norm": 0.24666404724121094, + "learning_rate": 7.446867466999087e-05, + "loss": 1.7533, + "step": 11636 + }, + { + "epoch": 3.5718232044198897, + "grad_norm": 0.31111210584640503, + "learning_rate": 7.44643398515832e-05, + "loss": 1.7875, + "step": 11637 + }, + { + "epoch": 3.5721301411909145, + "grad_norm": 0.3157108724117279, + "learning_rate": 7.446000479140462e-05, + "loss": 1.7879, + "step": 11638 + }, + { + "epoch": 3.57243707796194, + "grad_norm": 0.2935558259487152, + "learning_rate": 7.445566948949792e-05, + "loss": 1.7819, + "step": 11639 + }, + { + "epoch": 3.572744014732965, + "grad_norm": 0.2265472710132599, + "learning_rate": 7.445133394590597e-05, + "loss": 1.7518, + "step": 11640 + }, + { + "epoch": 3.57305095150399, + "grad_norm": 0.2564176023006439, + "learning_rate": 7.444699816067159e-05, + "loss": 1.7281, + "step": 11641 + }, + { + "epoch": 3.5733578882750154, + "grad_norm": 0.27933555841445923, + "learning_rate": 7.444266213383766e-05, + "loss": 1.7852, + "step": 11642 + }, + { + "epoch": 3.5736648250460403, + "grad_norm": 0.29105356335639954, + "learning_rate": 7.4438325865447e-05, + "loss": 1.8056, + "step": 11643 + }, + { + "epoch": 3.5739717618170657, + "grad_norm": 0.27665549516677856, + "learning_rate": 7.443398935554249e-05, + "loss": 1.7249, + "step": 11644 + }, + { + "epoch": 3.574278698588091, + "grad_norm": 0.21899232268333435, + "learning_rate": 7.442965260416698e-05, + "loss": 1.7689, + "step": 11645 + }, + { + "epoch": 3.574585635359116, + "grad_norm": 0.3250672221183777, + "learning_rate": 7.442531561136333e-05, + "loss": 1.8058, + "step": 11646 + }, + { + "epoch": 3.574892572130141, + "grad_norm": 0.42442524433135986, + "learning_rate": 7.442097837717438e-05, + "loss": 1.7887, + "step": 11647 + }, + { + "epoch": 3.575199508901166, + 
"grad_norm": 0.33108964562416077, + "learning_rate": 7.441664090164302e-05, + "loss": 1.7628, + "step": 11648 + }, + { + "epoch": 3.5755064456721914, + "grad_norm": 0.23050357401371002, + "learning_rate": 7.44123031848121e-05, + "loss": 1.8121, + "step": 11649 + }, + { + "epoch": 3.575813382443217, + "grad_norm": 0.29251593351364136, + "learning_rate": 7.440796522672448e-05, + "loss": 1.8051, + "step": 11650 + }, + { + "epoch": 3.576120319214242, + "grad_norm": 0.3764750063419342, + "learning_rate": 7.440362702742305e-05, + "loss": 1.9002, + "step": 11651 + }, + { + "epoch": 3.576427255985267, + "grad_norm": 0.3751949071884155, + "learning_rate": 7.439928858695069e-05, + "loss": 1.821, + "step": 11652 + }, + { + "epoch": 3.5767341927562923, + "grad_norm": 0.268476665019989, + "learning_rate": 7.439494990535024e-05, + "loss": 1.8241, + "step": 11653 + }, + { + "epoch": 3.5770411295273172, + "grad_norm": 0.3072795271873474, + "learning_rate": 7.439061098266459e-05, + "loss": 1.8169, + "step": 11654 + }, + { + "epoch": 3.5773480662983426, + "grad_norm": 0.4948901832103729, + "learning_rate": 7.438627181893664e-05, + "loss": 1.7706, + "step": 11655 + }, + { + "epoch": 3.577655003069368, + "grad_norm": 0.5892601013183594, + "learning_rate": 7.438193241420926e-05, + "loss": 1.7631, + "step": 11656 + }, + { + "epoch": 3.577961939840393, + "grad_norm": 0.4599401652812958, + "learning_rate": 7.437759276852533e-05, + "loss": 1.7471, + "step": 11657 + }, + { + "epoch": 3.578268876611418, + "grad_norm": 0.2545170783996582, + "learning_rate": 7.437325288192773e-05, + "loss": 1.7945, + "step": 11658 + }, + { + "epoch": 3.578575813382443, + "grad_norm": 0.3136496841907501, + "learning_rate": 7.436891275445938e-05, + "loss": 1.828, + "step": 11659 + }, + { + "epoch": 3.5788827501534684, + "grad_norm": 0.3631688058376312, + "learning_rate": 7.436457238616313e-05, + "loss": 1.8302, + "step": 11660 + }, + { + "epoch": 3.5791896869244937, + "grad_norm": 0.3097386658191681, + 
"learning_rate": 7.436023177708192e-05, + "loss": 1.8397, + "step": 11661 + }, + { + "epoch": 3.5794966236955186, + "grad_norm": 0.20948798954486847, + "learning_rate": 7.43558909272586e-05, + "loss": 1.7844, + "step": 11662 + }, + { + "epoch": 3.579803560466544, + "grad_norm": 0.24327392876148224, + "learning_rate": 7.43515498367361e-05, + "loss": 1.7827, + "step": 11663 + }, + { + "epoch": 3.580110497237569, + "grad_norm": 0.25268325209617615, + "learning_rate": 7.434720850555731e-05, + "loss": 1.8224, + "step": 11664 + }, + { + "epoch": 3.580417434008594, + "grad_norm": 0.24883607029914856, + "learning_rate": 7.434286693376513e-05, + "loss": 1.8189, + "step": 11665 + }, + { + "epoch": 3.5807243707796195, + "grad_norm": 0.2942518889904022, + "learning_rate": 7.433852512140248e-05, + "loss": 1.8325, + "step": 11666 + }, + { + "epoch": 3.581031307550645, + "grad_norm": 0.3556186556816101, + "learning_rate": 7.433418306851225e-05, + "loss": 1.7511, + "step": 11667 + }, + { + "epoch": 3.5813382443216697, + "grad_norm": 0.421220600605011, + "learning_rate": 7.432984077513738e-05, + "loss": 1.8081, + "step": 11668 + }, + { + "epoch": 3.581645181092695, + "grad_norm": 0.3338243067264557, + "learning_rate": 7.432549824132074e-05, + "loss": 1.8274, + "step": 11669 + }, + { + "epoch": 3.58195211786372, + "grad_norm": 0.25091543793678284, + "learning_rate": 7.432115546710528e-05, + "loss": 1.7637, + "step": 11670 + }, + { + "epoch": 3.5822590546347453, + "grad_norm": 0.29870370030403137, + "learning_rate": 7.431681245253389e-05, + "loss": 1.8036, + "step": 11671 + }, + { + "epoch": 3.5825659914057706, + "grad_norm": 0.2682137191295624, + "learning_rate": 7.431246919764953e-05, + "loss": 1.8252, + "step": 11672 + }, + { + "epoch": 3.5828729281767955, + "grad_norm": 0.28790801763534546, + "learning_rate": 7.430812570249508e-05, + "loss": 1.7713, + "step": 11673 + }, + { + "epoch": 3.583179864947821, + "grad_norm": 0.26357609033584595, + "learning_rate": 7.43037819671135e-05, 
+ "loss": 1.8388, + "step": 11674 + }, + { + "epoch": 3.5834868017188457, + "grad_norm": 0.2505483031272888, + "learning_rate": 7.42994379915477e-05, + "loss": 1.7722, + "step": 11675 + }, + { + "epoch": 3.583793738489871, + "grad_norm": 0.2535844147205353, + "learning_rate": 7.42950937758406e-05, + "loss": 1.756, + "step": 11676 + }, + { + "epoch": 3.5841006752608964, + "grad_norm": 0.23045027256011963, + "learning_rate": 7.429074932003515e-05, + "loss": 1.791, + "step": 11677 + }, + { + "epoch": 3.5844076120319213, + "grad_norm": 0.22525762021541595, + "learning_rate": 7.428640462417428e-05, + "loss": 1.7234, + "step": 11678 + }, + { + "epoch": 3.5847145488029466, + "grad_norm": 0.2402270883321762, + "learning_rate": 7.428205968830094e-05, + "loss": 1.845, + "step": 11679 + }, + { + "epoch": 3.5850214855739715, + "grad_norm": 0.24909646809101105, + "learning_rate": 7.427771451245802e-05, + "loss": 1.8537, + "step": 11680 + }, + { + "epoch": 3.585328422344997, + "grad_norm": 0.25813063979148865, + "learning_rate": 7.427336909668853e-05, + "loss": 1.7353, + "step": 11681 + }, + { + "epoch": 3.585635359116022, + "grad_norm": 0.26073768734931946, + "learning_rate": 7.426902344103534e-05, + "loss": 1.8142, + "step": 11682 + }, + { + "epoch": 3.5859422958870475, + "grad_norm": 0.2498280256986618, + "learning_rate": 7.426467754554147e-05, + "loss": 1.7996, + "step": 11683 + }, + { + "epoch": 3.5862492326580724, + "grad_norm": 0.3131188154220581, + "learning_rate": 7.426033141024981e-05, + "loss": 1.7793, + "step": 11684 + }, + { + "epoch": 3.5865561694290977, + "grad_norm": 0.24118199944496155, + "learning_rate": 7.425598503520337e-05, + "loss": 1.8249, + "step": 11685 + }, + { + "epoch": 3.5868631062001226, + "grad_norm": 0.2791197597980499, + "learning_rate": 7.425163842044504e-05, + "loss": 1.7966, + "step": 11686 + }, + { + "epoch": 3.587170042971148, + "grad_norm": 0.2298576384782791, + "learning_rate": 7.424729156601781e-05, + "loss": 1.7224, + "step": 11687 + }, 
+ { + "epoch": 3.5874769797421733, + "grad_norm": 0.23113438487052917, + "learning_rate": 7.424294447196462e-05, + "loss": 1.7641, + "step": 11688 + }, + { + "epoch": 3.587783916513198, + "grad_norm": 0.3064495027065277, + "learning_rate": 7.423859713832847e-05, + "loss": 1.8688, + "step": 11689 + }, + { + "epoch": 3.5880908532842235, + "grad_norm": 0.22847676277160645, + "learning_rate": 7.423424956515228e-05, + "loss": 1.7513, + "step": 11690 + }, + { + "epoch": 3.5883977900552484, + "grad_norm": 0.2797350585460663, + "learning_rate": 7.422990175247905e-05, + "loss": 1.8268, + "step": 11691 + }, + { + "epoch": 3.5887047268262737, + "grad_norm": 0.2753821313381195, + "learning_rate": 7.422555370035171e-05, + "loss": 1.7313, + "step": 11692 + }, + { + "epoch": 3.589011663597299, + "grad_norm": 0.2981179654598236, + "learning_rate": 7.422120540881326e-05, + "loss": 1.8455, + "step": 11693 + }, + { + "epoch": 3.5893186003683244, + "grad_norm": 0.33028867840766907, + "learning_rate": 7.421685687790667e-05, + "loss": 1.8397, + "step": 11694 + }, + { + "epoch": 3.5896255371393493, + "grad_norm": 0.409173846244812, + "learning_rate": 7.421250810767487e-05, + "loss": 1.8088, + "step": 11695 + }, + { + "epoch": 3.5899324739103746, + "grad_norm": 0.4118194878101349, + "learning_rate": 7.42081590981609e-05, + "loss": 1.7719, + "step": 11696 + }, + { + "epoch": 3.5902394106813995, + "grad_norm": 0.34716179966926575, + "learning_rate": 7.420380984940773e-05, + "loss": 1.8063, + "step": 11697 + }, + { + "epoch": 3.590546347452425, + "grad_norm": 0.27763083577156067, + "learning_rate": 7.419946036145829e-05, + "loss": 1.7777, + "step": 11698 + }, + { + "epoch": 3.59085328422345, + "grad_norm": 0.3175280690193176, + "learning_rate": 7.419511063435562e-05, + "loss": 1.697, + "step": 11699 + }, + { + "epoch": 3.591160220994475, + "grad_norm": 0.3151503801345825, + "learning_rate": 7.419076066814268e-05, + "loss": 1.8067, + "step": 11700 + }, + { + "epoch": 3.5914671577655004, + 
"grad_norm": 0.26914867758750916, + "learning_rate": 7.418641046286245e-05, + "loss": 1.7797, + "step": 11701 + }, + { + "epoch": 3.5917740945365253, + "grad_norm": 0.27231964468955994, + "learning_rate": 7.418206001855797e-05, + "loss": 1.7931, + "step": 11702 + }, + { + "epoch": 3.5920810313075506, + "grad_norm": 0.3352177143096924, + "learning_rate": 7.417770933527217e-05, + "loss": 1.9187, + "step": 11703 + }, + { + "epoch": 3.592387968078576, + "grad_norm": 0.3510081470012665, + "learning_rate": 7.417335841304808e-05, + "loss": 1.7889, + "step": 11704 + }, + { + "epoch": 3.592694904849601, + "grad_norm": 0.24949313700199127, + "learning_rate": 7.41690072519287e-05, + "loss": 1.7683, + "step": 11705 + }, + { + "epoch": 3.593001841620626, + "grad_norm": 0.28442221879959106, + "learning_rate": 7.416465585195702e-05, + "loss": 1.7889, + "step": 11706 + }, + { + "epoch": 3.593308778391651, + "grad_norm": 0.3355824649333954, + "learning_rate": 7.416030421317605e-05, + "loss": 1.7637, + "step": 11707 + }, + { + "epoch": 3.5936157151626764, + "grad_norm": 0.33569446206092834, + "learning_rate": 7.415595233562878e-05, + "loss": 1.919, + "step": 11708 + }, + { + "epoch": 3.5939226519337018, + "grad_norm": 0.2488354742527008, + "learning_rate": 7.415160021935825e-05, + "loss": 1.8424, + "step": 11709 + }, + { + "epoch": 3.594229588704727, + "grad_norm": 0.2701130509376526, + "learning_rate": 7.414724786440746e-05, + "loss": 1.7586, + "step": 11710 + }, + { + "epoch": 3.594536525475752, + "grad_norm": 0.26289790868759155, + "learning_rate": 7.414289527081939e-05, + "loss": 1.7975, + "step": 11711 + }, + { + "epoch": 3.5948434622467773, + "grad_norm": 0.25382301211357117, + "learning_rate": 7.413854243863707e-05, + "loss": 1.7393, + "step": 11712 + }, + { + "epoch": 3.595150399017802, + "grad_norm": 0.28282979130744934, + "learning_rate": 7.413418936790357e-05, + "loss": 1.8048, + "step": 11713 + }, + { + "epoch": 3.5954573357888275, + "grad_norm": 0.28001347184181213, + 
"learning_rate": 7.412983605866183e-05, + "loss": 1.7864, + "step": 11714 + }, + { + "epoch": 3.595764272559853, + "grad_norm": 0.26107707619667053, + "learning_rate": 7.412548251095491e-05, + "loss": 1.8016, + "step": 11715 + }, + { + "epoch": 3.5960712093308778, + "grad_norm": 0.2518761456012726, + "learning_rate": 7.412112872482583e-05, + "loss": 1.7565, + "step": 11716 + }, + { + "epoch": 3.596378146101903, + "grad_norm": 0.25911152362823486, + "learning_rate": 7.411677470031762e-05, + "loss": 1.8333, + "step": 11717 + }, + { + "epoch": 3.596685082872928, + "grad_norm": 0.3411506414413452, + "learning_rate": 7.41124204374733e-05, + "loss": 1.8027, + "step": 11718 + }, + { + "epoch": 3.5969920196439533, + "grad_norm": 0.28535547852516174, + "learning_rate": 7.410806593633593e-05, + "loss": 1.7596, + "step": 11719 + }, + { + "epoch": 3.5972989564149787, + "grad_norm": 0.24665530025959015, + "learning_rate": 7.410371119694852e-05, + "loss": 1.7777, + "step": 11720 + }, + { + "epoch": 3.5976058931860035, + "grad_norm": 0.29162275791168213, + "learning_rate": 7.40993562193541e-05, + "loss": 1.795, + "step": 11721 + }, + { + "epoch": 3.597912829957029, + "grad_norm": 0.2712220549583435, + "learning_rate": 7.409500100359573e-05, + "loss": 1.824, + "step": 11722 + }, + { + "epoch": 3.5982197667280538, + "grad_norm": 0.239755779504776, + "learning_rate": 7.40906455497164e-05, + "loss": 1.7534, + "step": 11723 + }, + { + "epoch": 3.598526703499079, + "grad_norm": 0.26056957244873047, + "learning_rate": 7.408628985775922e-05, + "loss": 1.757, + "step": 11724 + }, + { + "epoch": 3.5988336402701044, + "grad_norm": 0.3230258822441101, + "learning_rate": 7.40819339277672e-05, + "loss": 1.8684, + "step": 11725 + }, + { + "epoch": 3.5991405770411298, + "grad_norm": 0.26070696115493774, + "learning_rate": 7.407757775978339e-05, + "loss": 1.7868, + "step": 11726 + }, + { + "epoch": 3.5994475138121547, + "grad_norm": 0.24940893054008484, + "learning_rate": 7.407322135385085e-05, + 
"loss": 1.8391, + "step": 11727 + }, + { + "epoch": 3.59975445058318, + "grad_norm": 0.2717723250389099, + "learning_rate": 7.406886471001263e-05, + "loss": 1.7567, + "step": 11728 + }, + { + "epoch": 3.600061387354205, + "grad_norm": 0.2328445315361023, + "learning_rate": 7.406450782831177e-05, + "loss": 1.7761, + "step": 11729 + }, + { + "epoch": 3.6003683241252302, + "grad_norm": 0.2740287184715271, + "learning_rate": 7.406015070879136e-05, + "loss": 1.8599, + "step": 11730 + }, + { + "epoch": 3.6006752608962556, + "grad_norm": 0.2930558919906616, + "learning_rate": 7.405579335149441e-05, + "loss": 1.852, + "step": 11731 + }, + { + "epoch": 3.6009821976672804, + "grad_norm": 0.30175161361694336, + "learning_rate": 7.405143575646403e-05, + "loss": 1.8861, + "step": 11732 + }, + { + "epoch": 3.601289134438306, + "grad_norm": 0.2617531418800354, + "learning_rate": 7.404707792374328e-05, + "loss": 1.7598, + "step": 11733 + }, + { + "epoch": 3.6015960712093307, + "grad_norm": 0.25384122133255005, + "learning_rate": 7.404271985337517e-05, + "loss": 1.7634, + "step": 11734 + }, + { + "epoch": 3.601903007980356, + "grad_norm": 0.31706711649894714, + "learning_rate": 7.403836154540284e-05, + "loss": 1.8125, + "step": 11735 + }, + { + "epoch": 3.6022099447513813, + "grad_norm": 0.299662709236145, + "learning_rate": 7.403400299986932e-05, + "loss": 1.748, + "step": 11736 + }, + { + "epoch": 3.6025168815224062, + "grad_norm": 0.23828944563865662, + "learning_rate": 7.40296442168177e-05, + "loss": 1.7473, + "step": 11737 + }, + { + "epoch": 3.6028238182934316, + "grad_norm": 0.22611604630947113, + "learning_rate": 7.402528519629106e-05, + "loss": 1.7519, + "step": 11738 + }, + { + "epoch": 3.6031307550644565, + "grad_norm": 0.28498536348342896, + "learning_rate": 7.402092593833246e-05, + "loss": 1.7792, + "step": 11739 + }, + { + "epoch": 3.603437691835482, + "grad_norm": 0.2404283881187439, + "learning_rate": 7.4016566442985e-05, + "loss": 1.7434, + "step": 11740 + }, + { + 
"epoch": 3.603744628606507, + "grad_norm": 0.2291589230298996, + "learning_rate": 7.401220671029173e-05, + "loss": 1.7623, + "step": 11741 + }, + { + "epoch": 3.6040515653775325, + "grad_norm": 0.23962698876857758, + "learning_rate": 7.400784674029578e-05, + "loss": 1.7232, + "step": 11742 + }, + { + "epoch": 3.6043585021485574, + "grad_norm": 0.3015185594558716, + "learning_rate": 7.400348653304022e-05, + "loss": 1.7808, + "step": 11743 + }, + { + "epoch": 3.6046654389195827, + "grad_norm": 0.30623099207878113, + "learning_rate": 7.399912608856813e-05, + "loss": 1.8518, + "step": 11744 + }, + { + "epoch": 3.6049723756906076, + "grad_norm": 0.2698235511779785, + "learning_rate": 7.39947654069226e-05, + "loss": 1.7829, + "step": 11745 + }, + { + "epoch": 3.605279312461633, + "grad_norm": 0.2195274829864502, + "learning_rate": 7.399040448814674e-05, + "loss": 1.7709, + "step": 11746 + }, + { + "epoch": 3.6055862492326582, + "grad_norm": 0.22962357103824615, + "learning_rate": 7.398604333228366e-05, + "loss": 1.7482, + "step": 11747 + }, + { + "epoch": 3.605893186003683, + "grad_norm": 0.2403932511806488, + "learning_rate": 7.398168193937642e-05, + "loss": 1.8063, + "step": 11748 + }, + { + "epoch": 3.6062001227747085, + "grad_norm": 0.23542718589305878, + "learning_rate": 7.397732030946816e-05, + "loss": 1.7599, + "step": 11749 + }, + { + "epoch": 3.6065070595457334, + "grad_norm": 0.2462490350008011, + "learning_rate": 7.397295844260195e-05, + "loss": 1.8183, + "step": 11750 + }, + { + "epoch": 3.6068139963167587, + "grad_norm": 0.21428349614143372, + "learning_rate": 7.396859633882091e-05, + "loss": 1.6944, + "step": 11751 + }, + { + "epoch": 3.607120933087784, + "grad_norm": 0.21240907907485962, + "learning_rate": 7.396423399816817e-05, + "loss": 1.7795, + "step": 11752 + }, + { + "epoch": 3.607427869858809, + "grad_norm": 0.23413677513599396, + "learning_rate": 7.395987142068682e-05, + "loss": 1.8015, + "step": 11753 + }, + { + "epoch": 3.6077348066298343, + 
"grad_norm": 0.26724907755851746, + "learning_rate": 7.395550860641998e-05, + "loss": 1.8174, + "step": 11754 + }, + { + "epoch": 3.608041743400859, + "grad_norm": 0.22077679634094238, + "learning_rate": 7.395114555541077e-05, + "loss": 1.7929, + "step": 11755 + }, + { + "epoch": 3.6083486801718845, + "grad_norm": 0.2475263774394989, + "learning_rate": 7.394678226770228e-05, + "loss": 1.7744, + "step": 11756 + }, + { + "epoch": 3.60865561694291, + "grad_norm": 0.22579342126846313, + "learning_rate": 7.394241874333764e-05, + "loss": 1.79, + "step": 11757 + }, + { + "epoch": 3.608962553713935, + "grad_norm": 0.26798152923583984, + "learning_rate": 7.393805498236001e-05, + "loss": 1.8087, + "step": 11758 + }, + { + "epoch": 3.60926949048496, + "grad_norm": 0.2755621373653412, + "learning_rate": 7.393369098481248e-05, + "loss": 1.7834, + "step": 11759 + }, + { + "epoch": 3.6095764272559854, + "grad_norm": 0.2741812467575073, + "learning_rate": 7.39293267507382e-05, + "loss": 1.7948, + "step": 11760 + }, + { + "epoch": 3.6098833640270103, + "grad_norm": 0.2378924936056137, + "learning_rate": 7.392496228018028e-05, + "loss": 1.8317, + "step": 11761 + }, + { + "epoch": 3.6101903007980356, + "grad_norm": 0.2628132700920105, + "learning_rate": 7.392059757318187e-05, + "loss": 1.8123, + "step": 11762 + }, + { + "epoch": 3.610497237569061, + "grad_norm": 0.2613002359867096, + "learning_rate": 7.391623262978607e-05, + "loss": 1.795, + "step": 11763 + }, + { + "epoch": 3.610804174340086, + "grad_norm": 0.27272161841392517, + "learning_rate": 7.391186745003608e-05, + "loss": 1.7808, + "step": 11764 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.21366162598133087, + "learning_rate": 7.390750203397497e-05, + "loss": 1.77, + "step": 11765 + }, + { + "epoch": 3.611418047882136, + "grad_norm": 0.25559261441230774, + "learning_rate": 7.390313638164593e-05, + "loss": 1.8442, + "step": 11766 + }, + { + "epoch": 3.6117249846531614, + "grad_norm": 0.23794838786125183, + 
"learning_rate": 7.389877049309207e-05, + "loss": 1.8237, + "step": 11767 + }, + { + "epoch": 3.6120319214241867, + "grad_norm": 0.2690154016017914, + "learning_rate": 7.389440436835656e-05, + "loss": 1.8194, + "step": 11768 + }, + { + "epoch": 3.612338858195212, + "grad_norm": 0.26148009300231934, + "learning_rate": 7.389003800748254e-05, + "loss": 1.7862, + "step": 11769 + }, + { + "epoch": 3.612645794966237, + "grad_norm": 0.26414936780929565, + "learning_rate": 7.388567141051315e-05, + "loss": 1.7815, + "step": 11770 + }, + { + "epoch": 3.6129527317372623, + "grad_norm": 0.24473857879638672, + "learning_rate": 7.388130457749157e-05, + "loss": 1.801, + "step": 11771 + }, + { + "epoch": 3.613259668508287, + "grad_norm": 0.24356001615524292, + "learning_rate": 7.387693750846094e-05, + "loss": 1.8031, + "step": 11772 + }, + { + "epoch": 3.6135666052793125, + "grad_norm": 0.26716411113739014, + "learning_rate": 7.387257020346441e-05, + "loss": 1.7999, + "step": 11773 + }, + { + "epoch": 3.613873542050338, + "grad_norm": 0.2730760872364044, + "learning_rate": 7.386820266254516e-05, + "loss": 1.8079, + "step": 11774 + }, + { + "epoch": 3.6141804788213627, + "grad_norm": 0.2570728361606598, + "learning_rate": 7.386383488574635e-05, + "loss": 1.7374, + "step": 11775 + }, + { + "epoch": 3.614487415592388, + "grad_norm": 0.24992883205413818, + "learning_rate": 7.385946687311112e-05, + "loss": 1.8432, + "step": 11776 + }, + { + "epoch": 3.614794352363413, + "grad_norm": 0.28632259368896484, + "learning_rate": 7.385509862468266e-05, + "loss": 1.8014, + "step": 11777 + }, + { + "epoch": 3.6151012891344383, + "grad_norm": 0.257303923368454, + "learning_rate": 7.385073014050412e-05, + "loss": 1.8166, + "step": 11778 + }, + { + "epoch": 3.6154082259054636, + "grad_norm": 0.2791872024536133, + "learning_rate": 7.38463614206187e-05, + "loss": 1.7865, + "step": 11779 + }, + { + "epoch": 3.6157151626764885, + "grad_norm": 0.25708603858947754, + "learning_rate": 
7.384199246506956e-05, + "loss": 1.807, + "step": 11780 + }, + { + "epoch": 3.616022099447514, + "grad_norm": 0.28693172335624695, + "learning_rate": 7.383762327389988e-05, + "loss": 1.8049, + "step": 11781 + }, + { + "epoch": 3.6163290362185387, + "grad_norm": 0.2731167674064636, + "learning_rate": 7.383325384715283e-05, + "loss": 1.8937, + "step": 11782 + }, + { + "epoch": 3.616635972989564, + "grad_norm": 0.26151663064956665, + "learning_rate": 7.38288841848716e-05, + "loss": 1.8288, + "step": 11783 + }, + { + "epoch": 3.6169429097605894, + "grad_norm": 0.2732257843017578, + "learning_rate": 7.382451428709936e-05, + "loss": 1.7668, + "step": 11784 + }, + { + "epoch": 3.6172498465316147, + "grad_norm": 0.2747575640678406, + "learning_rate": 7.38201441538793e-05, + "loss": 1.7991, + "step": 11785 + }, + { + "epoch": 3.6175567833026396, + "grad_norm": 0.2884783446788788, + "learning_rate": 7.381577378525462e-05, + "loss": 1.7798, + "step": 11786 + }, + { + "epoch": 3.617863720073665, + "grad_norm": 0.2716344892978668, + "learning_rate": 7.381140318126851e-05, + "loss": 1.7923, + "step": 11787 + }, + { + "epoch": 3.61817065684469, + "grad_norm": 0.3007747232913971, + "learning_rate": 7.380703234196416e-05, + "loss": 1.8397, + "step": 11788 + }, + { + "epoch": 3.618477593615715, + "grad_norm": 0.39218056201934814, + "learning_rate": 7.380266126738476e-05, + "loss": 1.8517, + "step": 11789 + }, + { + "epoch": 3.6187845303867405, + "grad_norm": 0.43425866961479187, + "learning_rate": 7.379828995757351e-05, + "loss": 1.7518, + "step": 11790 + }, + { + "epoch": 3.6190914671577654, + "grad_norm": 0.34399518370628357, + "learning_rate": 7.37939184125736e-05, + "loss": 1.7607, + "step": 11791 + }, + { + "epoch": 3.6193984039287908, + "grad_norm": 0.23124302923679352, + "learning_rate": 7.378954663242825e-05, + "loss": 1.7898, + "step": 11792 + }, + { + "epoch": 3.6197053406998156, + "grad_norm": 0.32839757204055786, + "learning_rate": 7.378517461718066e-05, + "loss": 
1.7472, + "step": 11793 + }, + { + "epoch": 3.620012277470841, + "grad_norm": 0.38583460450172424, + "learning_rate": 7.378080236687403e-05, + "loss": 1.7947, + "step": 11794 + }, + { + "epoch": 3.6203192142418663, + "grad_norm": 0.4622896909713745, + "learning_rate": 7.377642988155157e-05, + "loss": 1.9023, + "step": 11795 + }, + { + "epoch": 3.620626151012891, + "grad_norm": 0.3783189058303833, + "learning_rate": 7.37720571612565e-05, + "loss": 1.7813, + "step": 11796 + }, + { + "epoch": 3.6209330877839165, + "grad_norm": 0.3468814790248871, + "learning_rate": 7.376768420603204e-05, + "loss": 1.7509, + "step": 11797 + }, + { + "epoch": 3.6212400245549414, + "grad_norm": 0.2602507174015045, + "learning_rate": 7.376331101592138e-05, + "loss": 1.8158, + "step": 11798 + }, + { + "epoch": 3.6215469613259668, + "grad_norm": 0.28337883949279785, + "learning_rate": 7.375893759096775e-05, + "loss": 1.7755, + "step": 11799 + }, + { + "epoch": 3.621853898096992, + "grad_norm": 0.3644609749317169, + "learning_rate": 7.375456393121437e-05, + "loss": 1.8193, + "step": 11800 + }, + { + "epoch": 3.6221608348680174, + "grad_norm": 0.338211327791214, + "learning_rate": 7.375019003670448e-05, + "loss": 1.821, + "step": 11801 + }, + { + "epoch": 3.6224677716390423, + "grad_norm": 0.23850654065608978, + "learning_rate": 7.374581590748129e-05, + "loss": 1.7317, + "step": 11802 + }, + { + "epoch": 3.6227747084100677, + "grad_norm": 0.3496716618537903, + "learning_rate": 7.374144154358801e-05, + "loss": 1.8361, + "step": 11803 + }, + { + "epoch": 3.6230816451810925, + "grad_norm": 0.5585216283798218, + "learning_rate": 7.37370669450679e-05, + "loss": 1.7667, + "step": 11804 + }, + { + "epoch": 3.623388581952118, + "grad_norm": 0.4578089714050293, + "learning_rate": 7.373269211196418e-05, + "loss": 1.8051, + "step": 11805 + }, + { + "epoch": 3.623695518723143, + "grad_norm": 0.28195759654045105, + "learning_rate": 7.37283170443201e-05, + "loss": 1.7823, + "step": 11806 + }, + { + 
"epoch": 3.624002455494168, + "grad_norm": 0.4066108465194702, + "learning_rate": 7.372394174217887e-05, + "loss": 1.7819, + "step": 11807 + }, + { + "epoch": 3.6243093922651934, + "grad_norm": 0.5368703007698059, + "learning_rate": 7.371956620558375e-05, + "loss": 1.8121, + "step": 11808 + }, + { + "epoch": 3.6246163290362183, + "grad_norm": 0.36627063155174255, + "learning_rate": 7.371519043457795e-05, + "loss": 1.7944, + "step": 11809 + }, + { + "epoch": 3.6249232658072437, + "grad_norm": 0.3100780248641968, + "learning_rate": 7.371081442920476e-05, + "loss": 1.783, + "step": 11810 + }, + { + "epoch": 3.625230202578269, + "grad_norm": 0.3277178704738617, + "learning_rate": 7.370643818950741e-05, + "loss": 1.8105, + "step": 11811 + }, + { + "epoch": 3.625537139349294, + "grad_norm": 0.3887772560119629, + "learning_rate": 7.370206171552914e-05, + "loss": 1.8136, + "step": 11812 + }, + { + "epoch": 3.6258440761203192, + "grad_norm": 0.2770824134349823, + "learning_rate": 7.36976850073132e-05, + "loss": 1.7852, + "step": 11813 + }, + { + "epoch": 3.626151012891344, + "grad_norm": 0.26357728242874146, + "learning_rate": 7.369330806490284e-05, + "loss": 1.7621, + "step": 11814 + }, + { + "epoch": 3.6264579496623695, + "grad_norm": 0.3387344181537628, + "learning_rate": 7.368893088834135e-05, + "loss": 1.7785, + "step": 11815 + }, + { + "epoch": 3.626764886433395, + "grad_norm": 0.35155174136161804, + "learning_rate": 7.368455347767193e-05, + "loss": 1.8081, + "step": 11816 + }, + { + "epoch": 3.62707182320442, + "grad_norm": 0.2855289876461029, + "learning_rate": 7.368017583293788e-05, + "loss": 1.8245, + "step": 11817 + }, + { + "epoch": 3.627378759975445, + "grad_norm": 0.28462162613868713, + "learning_rate": 7.367579795418245e-05, + "loss": 1.8066, + "step": 11818 + }, + { + "epoch": 3.6276856967464703, + "grad_norm": 0.40696555376052856, + "learning_rate": 7.367141984144891e-05, + "loss": 1.8897, + "step": 11819 + }, + { + "epoch": 3.6279926335174952, + 
"grad_norm": 0.472782701253891, + "learning_rate": 7.366704149478054e-05, + "loss": 1.8071, + "step": 11820 + }, + { + "epoch": 3.6282995702885206, + "grad_norm": 0.27022916078567505, + "learning_rate": 7.366266291422057e-05, + "loss": 1.8574, + "step": 11821 + }, + { + "epoch": 3.628606507059546, + "grad_norm": 0.4207148253917694, + "learning_rate": 7.365828409981231e-05, + "loss": 1.7759, + "step": 11822 + }, + { + "epoch": 3.628913443830571, + "grad_norm": 0.42866072058677673, + "learning_rate": 7.365390505159902e-05, + "loss": 1.7366, + "step": 11823 + }, + { + "epoch": 3.629220380601596, + "grad_norm": 0.28288859128952026, + "learning_rate": 7.364952576962398e-05, + "loss": 1.8591, + "step": 11824 + }, + { + "epoch": 3.629527317372621, + "grad_norm": 0.30544906854629517, + "learning_rate": 7.364514625393045e-05, + "loss": 1.7965, + "step": 11825 + }, + { + "epoch": 3.6298342541436464, + "grad_norm": 0.3251616954803467, + "learning_rate": 7.364076650456173e-05, + "loss": 1.8197, + "step": 11826 + }, + { + "epoch": 3.6301411909146717, + "grad_norm": 0.3133888840675354, + "learning_rate": 7.363638652156109e-05, + "loss": 1.7978, + "step": 11827 + }, + { + "epoch": 3.630448127685697, + "grad_norm": 0.29004594683647156, + "learning_rate": 7.363200630497185e-05, + "loss": 1.8035, + "step": 11828 + }, + { + "epoch": 3.630755064456722, + "grad_norm": 0.2781279683113098, + "learning_rate": 7.362762585483725e-05, + "loss": 1.8462, + "step": 11829 + }, + { + "epoch": 3.6310620012277472, + "grad_norm": 0.29003822803497314, + "learning_rate": 7.362324517120063e-05, + "loss": 1.7952, + "step": 11830 + }, + { + "epoch": 3.631368937998772, + "grad_norm": 0.2510940134525299, + "learning_rate": 7.361886425410524e-05, + "loss": 1.7645, + "step": 11831 + }, + { + "epoch": 3.6316758747697975, + "grad_norm": 0.23798540234565735, + "learning_rate": 7.361448310359438e-05, + "loss": 1.7329, + "step": 11832 + }, + { + "epoch": 3.631982811540823, + "grad_norm": 0.2711278796195984, + 
"learning_rate": 7.361010171971137e-05, + "loss": 1.8245, + "step": 11833 + }, + { + "epoch": 3.6322897483118477, + "grad_norm": 0.2895669639110565, + "learning_rate": 7.360572010249949e-05, + "loss": 1.7668, + "step": 11834 + }, + { + "epoch": 3.632596685082873, + "grad_norm": 0.2216273844242096, + "learning_rate": 7.360133825200205e-05, + "loss": 1.8164, + "step": 11835 + }, + { + "epoch": 3.632903621853898, + "grad_norm": 0.3075082302093506, + "learning_rate": 7.359695616826236e-05, + "loss": 1.8159, + "step": 11836 + }, + { + "epoch": 3.6332105586249233, + "grad_norm": 0.3208801746368408, + "learning_rate": 7.35925738513237e-05, + "loss": 1.8385, + "step": 11837 + }, + { + "epoch": 3.6335174953959486, + "grad_norm": 0.272517591714859, + "learning_rate": 7.35881913012294e-05, + "loss": 1.7653, + "step": 11838 + }, + { + "epoch": 3.6338244321669735, + "grad_norm": 0.23105360567569733, + "learning_rate": 7.358380851802277e-05, + "loss": 1.7697, + "step": 11839 + }, + { + "epoch": 3.634131368937999, + "grad_norm": 0.2643153667449951, + "learning_rate": 7.357942550174714e-05, + "loss": 1.7885, + "step": 11840 + }, + { + "epoch": 3.6344383057090237, + "grad_norm": 0.22643202543258667, + "learning_rate": 7.357504225244579e-05, + "loss": 1.746, + "step": 11841 + }, + { + "epoch": 3.634745242480049, + "grad_norm": 0.27782970666885376, + "learning_rate": 7.357065877016207e-05, + "loss": 1.794, + "step": 11842 + }, + { + "epoch": 3.6350521792510744, + "grad_norm": 0.3035561740398407, + "learning_rate": 7.356627505493925e-05, + "loss": 1.7892, + "step": 11843 + }, + { + "epoch": 3.6353591160220997, + "grad_norm": 0.31859731674194336, + "learning_rate": 7.356189110682072e-05, + "loss": 1.7636, + "step": 11844 + }, + { + "epoch": 3.6356660527931246, + "grad_norm": 0.2960890233516693, + "learning_rate": 7.355750692584977e-05, + "loss": 1.8294, + "step": 11845 + }, + { + "epoch": 3.63597298956415, + "grad_norm": 0.2544194459915161, + "learning_rate": 7.355312251206972e-05, + 
"loss": 1.7603, + "step": 11846 + }, + { + "epoch": 3.636279926335175, + "grad_norm": 0.27864789962768555, + "learning_rate": 7.354873786552391e-05, + "loss": 1.7917, + "step": 11847 + }, + { + "epoch": 3.6365868631062, + "grad_norm": 0.32552552223205566, + "learning_rate": 7.354435298625568e-05, + "loss": 1.7769, + "step": 11848 + }, + { + "epoch": 3.6368937998772255, + "grad_norm": 0.25094640254974365, + "learning_rate": 7.353996787430833e-05, + "loss": 1.8371, + "step": 11849 + }, + { + "epoch": 3.6372007366482504, + "grad_norm": 0.26656433939933777, + "learning_rate": 7.353558252972524e-05, + "loss": 1.7686, + "step": 11850 + }, + { + "epoch": 3.6375076734192757, + "grad_norm": 0.3023635745048523, + "learning_rate": 7.353119695254973e-05, + "loss": 1.7892, + "step": 11851 + }, + { + "epoch": 3.6378146101903006, + "grad_norm": 0.2822463810443878, + "learning_rate": 7.352681114282514e-05, + "loss": 1.8221, + "step": 11852 + }, + { + "epoch": 3.638121546961326, + "grad_norm": 0.31159496307373047, + "learning_rate": 7.35224251005948e-05, + "loss": 1.803, + "step": 11853 + }, + { + "epoch": 3.6384284837323513, + "grad_norm": 0.3133087158203125, + "learning_rate": 7.351803882590207e-05, + "loss": 1.744, + "step": 11854 + }, + { + "epoch": 3.638735420503376, + "grad_norm": 0.3050002455711365, + "learning_rate": 7.351365231879029e-05, + "loss": 1.7522, + "step": 11855 + }, + { + "epoch": 3.6390423572744015, + "grad_norm": 0.2729037404060364, + "learning_rate": 7.350926557930283e-05, + "loss": 1.7629, + "step": 11856 + }, + { + "epoch": 3.6393492940454264, + "grad_norm": 0.3181995153427124, + "learning_rate": 7.350487860748303e-05, + "loss": 1.7603, + "step": 11857 + }, + { + "epoch": 3.6396562308164517, + "grad_norm": 0.352651447057724, + "learning_rate": 7.350049140337423e-05, + "loss": 1.8177, + "step": 11858 + }, + { + "epoch": 3.639963167587477, + "grad_norm": 0.22935177385807037, + "learning_rate": 7.349610396701981e-05, + "loss": 1.7421, + "step": 11859 + }, + { 
+ "epoch": 3.6402701043585024, + "grad_norm": 0.26442599296569824, + "learning_rate": 7.349171629846312e-05, + "loss": 1.8026, + "step": 11860 + }, + { + "epoch": 3.6405770411295273, + "grad_norm": 0.25357648730278015, + "learning_rate": 7.348732839774751e-05, + "loss": 1.788, + "step": 11861 + }, + { + "epoch": 3.6408839779005526, + "grad_norm": 0.26959577202796936, + "learning_rate": 7.348294026491635e-05, + "loss": 1.884, + "step": 11862 + }, + { + "epoch": 3.6411909146715775, + "grad_norm": 0.2243001013994217, + "learning_rate": 7.347855190001304e-05, + "loss": 1.7765, + "step": 11863 + }, + { + "epoch": 3.641497851442603, + "grad_norm": 0.2480708807706833, + "learning_rate": 7.34741633030809e-05, + "loss": 1.7597, + "step": 11864 + }, + { + "epoch": 3.641804788213628, + "grad_norm": 0.22512994706630707, + "learning_rate": 7.346977447416332e-05, + "loss": 1.7647, + "step": 11865 + }, + { + "epoch": 3.642111724984653, + "grad_norm": 0.24961981177330017, + "learning_rate": 7.346538541330368e-05, + "loss": 1.8178, + "step": 11866 + }, + { + "epoch": 3.6424186617556784, + "grad_norm": 0.320896714925766, + "learning_rate": 7.346099612054533e-05, + "loss": 1.85, + "step": 11867 + }, + { + "epoch": 3.6427255985267033, + "grad_norm": 0.3420880436897278, + "learning_rate": 7.345660659593167e-05, + "loss": 1.8661, + "step": 11868 + }, + { + "epoch": 3.6430325352977286, + "grad_norm": 0.2675844132900238, + "learning_rate": 7.34522168395061e-05, + "loss": 1.8177, + "step": 11869 + }, + { + "epoch": 3.643339472068754, + "grad_norm": 0.23993943631649017, + "learning_rate": 7.344782685131195e-05, + "loss": 1.7365, + "step": 11870 + }, + { + "epoch": 3.643646408839779, + "grad_norm": 0.21805813908576965, + "learning_rate": 7.344343663139264e-05, + "loss": 1.7813, + "step": 11871 + }, + { + "epoch": 3.643953345610804, + "grad_norm": 0.24334421753883362, + "learning_rate": 7.343904617979154e-05, + "loss": 1.7763, + "step": 11872 + }, + { + "epoch": 3.644260282381829, + 
"grad_norm": 0.22768431901931763, + "learning_rate": 7.343465549655206e-05, + "loss": 1.7817, + "step": 11873 + }, + { + "epoch": 3.6445672191528544, + "grad_norm": 0.23828962445259094, + "learning_rate": 7.343026458171757e-05, + "loss": 1.8391, + "step": 11874 + }, + { + "epoch": 3.6448741559238798, + "grad_norm": 0.24838197231292725, + "learning_rate": 7.342587343533149e-05, + "loss": 1.759, + "step": 11875 + }, + { + "epoch": 3.645181092694905, + "grad_norm": 0.22732019424438477, + "learning_rate": 7.342148205743718e-05, + "loss": 1.7348, + "step": 11876 + }, + { + "epoch": 3.64548802946593, + "grad_norm": 0.25106775760650635, + "learning_rate": 7.341709044807807e-05, + "loss": 1.8121, + "step": 11877 + }, + { + "epoch": 3.6457949662369553, + "grad_norm": 0.28532838821411133, + "learning_rate": 7.341269860729753e-05, + "loss": 1.7147, + "step": 11878 + }, + { + "epoch": 3.64610190300798, + "grad_norm": 0.3041890859603882, + "learning_rate": 7.340830653513899e-05, + "loss": 1.7666, + "step": 11879 + }, + { + "epoch": 3.6464088397790055, + "grad_norm": 0.3142147958278656, + "learning_rate": 7.340391423164585e-05, + "loss": 1.8707, + "step": 11880 + }, + { + "epoch": 3.646715776550031, + "grad_norm": 0.28531381487846375, + "learning_rate": 7.339952169686151e-05, + "loss": 1.7961, + "step": 11881 + }, + { + "epoch": 3.6470227133210558, + "grad_norm": 0.33779671788215637, + "learning_rate": 7.339512893082938e-05, + "loss": 1.7428, + "step": 11882 + }, + { + "epoch": 3.647329650092081, + "grad_norm": 0.29611849784851074, + "learning_rate": 7.339073593359287e-05, + "loss": 1.8803, + "step": 11883 + }, + { + "epoch": 3.647636586863106, + "grad_norm": 0.31248557567596436, + "learning_rate": 7.33863427051954e-05, + "loss": 1.7868, + "step": 11884 + }, + { + "epoch": 3.6479435236341313, + "grad_norm": 0.42829564213752747, + "learning_rate": 7.338194924568039e-05, + "loss": 1.8558, + "step": 11885 + }, + { + "epoch": 3.6482504604051567, + "grad_norm": 0.431023508310318, + 
"learning_rate": 7.337755555509126e-05, + "loss": 1.7565, + "step": 11886 + }, + { + "epoch": 3.6485573971761815, + "grad_norm": 0.2917975187301636, + "learning_rate": 7.33731616334714e-05, + "loss": 1.8067, + "step": 11887 + }, + { + "epoch": 3.648864333947207, + "grad_norm": 0.3072175085544586, + "learning_rate": 7.336876748086427e-05, + "loss": 1.782, + "step": 11888 + }, + { + "epoch": 3.6491712707182318, + "grad_norm": 0.33658862113952637, + "learning_rate": 7.336437309731327e-05, + "loss": 1.8007, + "step": 11889 + }, + { + "epoch": 3.649478207489257, + "grad_norm": 0.23774033784866333, + "learning_rate": 7.335997848286185e-05, + "loss": 1.7606, + "step": 11890 + }, + { + "epoch": 3.6497851442602824, + "grad_norm": 0.3373236358165741, + "learning_rate": 7.335558363755344e-05, + "loss": 1.7335, + "step": 11891 + }, + { + "epoch": 3.650092081031308, + "grad_norm": 0.3906517028808594, + "learning_rate": 7.335118856143145e-05, + "loss": 1.7974, + "step": 11892 + }, + { + "epoch": 3.6503990178023327, + "grad_norm": 0.37715303897857666, + "learning_rate": 7.334679325453934e-05, + "loss": 1.8875, + "step": 11893 + }, + { + "epoch": 3.650705954573358, + "grad_norm": 0.278540700674057, + "learning_rate": 7.334239771692053e-05, + "loss": 1.8165, + "step": 11894 + }, + { + "epoch": 3.651012891344383, + "grad_norm": 0.24434895813465118, + "learning_rate": 7.333800194861845e-05, + "loss": 1.7756, + "step": 11895 + }, + { + "epoch": 3.6513198281154082, + "grad_norm": 0.25057271122932434, + "learning_rate": 7.333360594967658e-05, + "loss": 1.7932, + "step": 11896 + }, + { + "epoch": 3.6516267648864336, + "grad_norm": 0.3277342617511749, + "learning_rate": 7.332920972013833e-05, + "loss": 1.7781, + "step": 11897 + }, + { + "epoch": 3.6519337016574585, + "grad_norm": 0.2754829525947571, + "learning_rate": 7.332481326004715e-05, + "loss": 1.7916, + "step": 11898 + }, + { + "epoch": 3.652240638428484, + "grad_norm": 0.24490588903427124, + "learning_rate": 7.332041656944651e-05, 
+ "loss": 1.7904, + "step": 11899 + }, + { + "epoch": 3.6525475751995087, + "grad_norm": 0.3176959455013275, + "learning_rate": 7.331601964837982e-05, + "loss": 1.7379, + "step": 11900 + }, + { + "epoch": 3.652854511970534, + "grad_norm": 0.3435784876346588, + "learning_rate": 7.331162249689057e-05, + "loss": 1.7635, + "step": 11901 + }, + { + "epoch": 3.6531614487415593, + "grad_norm": 0.335697740316391, + "learning_rate": 7.330722511502221e-05, + "loss": 1.7903, + "step": 11902 + }, + { + "epoch": 3.6534683855125847, + "grad_norm": 0.2748894691467285, + "learning_rate": 7.330282750281819e-05, + "loss": 1.8259, + "step": 11903 + }, + { + "epoch": 3.6537753222836096, + "grad_norm": 0.36754751205444336, + "learning_rate": 7.329842966032197e-05, + "loss": 1.7728, + "step": 11904 + }, + { + "epoch": 3.654082259054635, + "grad_norm": 0.4355713129043579, + "learning_rate": 7.3294031587577e-05, + "loss": 1.7447, + "step": 11905 + }, + { + "epoch": 3.65438919582566, + "grad_norm": 0.3967476487159729, + "learning_rate": 7.328963328462677e-05, + "loss": 1.8299, + "step": 11906 + }, + { + "epoch": 3.654696132596685, + "grad_norm": 0.23805755376815796, + "learning_rate": 7.328523475151472e-05, + "loss": 1.7631, + "step": 11907 + }, + { + "epoch": 3.6550030693677105, + "grad_norm": 0.40350377559661865, + "learning_rate": 7.328083598828435e-05, + "loss": 1.8693, + "step": 11908 + }, + { + "epoch": 3.6553100061387354, + "grad_norm": 0.4743673801422119, + "learning_rate": 7.32764369949791e-05, + "loss": 1.7887, + "step": 11909 + }, + { + "epoch": 3.6556169429097607, + "grad_norm": 0.33830127120018005, + "learning_rate": 7.327203777164246e-05, + "loss": 1.7527, + "step": 11910 + }, + { + "epoch": 3.6559238796807856, + "grad_norm": 0.2465003877878189, + "learning_rate": 7.326763831831791e-05, + "loss": 1.7898, + "step": 11911 + }, + { + "epoch": 3.656230816451811, + "grad_norm": 0.31647852063179016, + "learning_rate": 7.326323863504892e-05, + "loss": 1.8056, + "step": 11912 + }, + 
{ + "epoch": 3.6565377532228363, + "grad_norm": 0.31436124444007874, + "learning_rate": 7.325883872187896e-05, + "loss": 1.7972, + "step": 11913 + }, + { + "epoch": 3.656844689993861, + "grad_norm": 0.260405957698822, + "learning_rate": 7.325443857885153e-05, + "loss": 1.8109, + "step": 11914 + }, + { + "epoch": 3.6571516267648865, + "grad_norm": 0.29312583804130554, + "learning_rate": 7.325003820601011e-05, + "loss": 1.8947, + "step": 11915 + }, + { + "epoch": 3.6574585635359114, + "grad_norm": 0.2641582190990448, + "learning_rate": 7.324563760339819e-05, + "loss": 1.7737, + "step": 11916 + }, + { + "epoch": 3.6577655003069367, + "grad_norm": 0.2338121086359024, + "learning_rate": 7.324123677105923e-05, + "loss": 1.7462, + "step": 11917 + }, + { + "epoch": 3.658072437077962, + "grad_norm": 0.27877378463745117, + "learning_rate": 7.323683570903676e-05, + "loss": 1.8371, + "step": 11918 + }, + { + "epoch": 3.6583793738489874, + "grad_norm": 0.24238766729831696, + "learning_rate": 7.323243441737427e-05, + "loss": 1.7304, + "step": 11919 + }, + { + "epoch": 3.6586863106200123, + "grad_norm": 0.2349759042263031, + "learning_rate": 7.322803289611525e-05, + "loss": 1.7422, + "step": 11920 + }, + { + "epoch": 3.6589932473910376, + "grad_norm": 0.2254217565059662, + "learning_rate": 7.322363114530318e-05, + "loss": 1.7296, + "step": 11921 + }, + { + "epoch": 3.6593001841620625, + "grad_norm": 0.24533270299434662, + "learning_rate": 7.321922916498158e-05, + "loss": 1.7834, + "step": 11922 + }, + { + "epoch": 3.659607120933088, + "grad_norm": 0.24993161857128143, + "learning_rate": 7.321482695519393e-05, + "loss": 1.8502, + "step": 11923 + }, + { + "epoch": 3.659914057704113, + "grad_norm": 0.2540178894996643, + "learning_rate": 7.321042451598378e-05, + "loss": 1.8372, + "step": 11924 + }, + { + "epoch": 3.660220994475138, + "grad_norm": 0.2241390198469162, + "learning_rate": 7.32060218473946e-05, + "loss": 1.7619, + "step": 11925 + }, + { + "epoch": 3.6605279312461634, + 
"grad_norm": 0.2137840837240219, + "learning_rate": 7.32016189494699e-05, + "loss": 1.751, + "step": 11926 + }, + { + "epoch": 3.6608348680171883, + "grad_norm": 0.2596585154533386, + "learning_rate": 7.319721582225323e-05, + "loss": 1.7773, + "step": 11927 + }, + { + "epoch": 3.6611418047882136, + "grad_norm": 0.24898354709148407, + "learning_rate": 7.319281246578806e-05, + "loss": 1.7347, + "step": 11928 + }, + { + "epoch": 3.661448741559239, + "grad_norm": 0.26553863286972046, + "learning_rate": 7.31884088801179e-05, + "loss": 1.7812, + "step": 11929 + }, + { + "epoch": 3.661755678330264, + "grad_norm": 0.2494438737630844, + "learning_rate": 7.318400506528633e-05, + "loss": 1.7554, + "step": 11930 + }, + { + "epoch": 3.662062615101289, + "grad_norm": 0.2794995903968811, + "learning_rate": 7.317960102133682e-05, + "loss": 1.7495, + "step": 11931 + }, + { + "epoch": 3.662369551872314, + "grad_norm": 0.2843860983848572, + "learning_rate": 7.317519674831293e-05, + "loss": 1.7734, + "step": 11932 + }, + { + "epoch": 3.6626764886433394, + "grad_norm": 0.28261128067970276, + "learning_rate": 7.317079224625813e-05, + "loss": 1.7794, + "step": 11933 + }, + { + "epoch": 3.6629834254143647, + "grad_norm": 0.2552426755428314, + "learning_rate": 7.316638751521599e-05, + "loss": 1.8397, + "step": 11934 + }, + { + "epoch": 3.66329036218539, + "grad_norm": 0.4140608608722687, + "learning_rate": 7.316198255523002e-05, + "loss": 1.848, + "step": 11935 + }, + { + "epoch": 3.663597298956415, + "grad_norm": 0.3709854483604431, + "learning_rate": 7.315757736634377e-05, + "loss": 1.8489, + "step": 11936 + }, + { + "epoch": 3.6639042357274403, + "grad_norm": 0.23637300729751587, + "learning_rate": 7.315317194860078e-05, + "loss": 1.7549, + "step": 11937 + }, + { + "epoch": 3.664211172498465, + "grad_norm": 0.32884421944618225, + "learning_rate": 7.314876630204456e-05, + "loss": 1.8061, + "step": 11938 + }, + { + "epoch": 3.6645181092694905, + "grad_norm": 0.33354130387306213, + 
"learning_rate": 7.314436042671867e-05, + "loss": 1.8346, + "step": 11939 + }, + { + "epoch": 3.664825046040516, + "grad_norm": 0.25776317715644836, + "learning_rate": 7.313995432266663e-05, + "loss": 1.8598, + "step": 11940 + }, + { + "epoch": 3.6651319828115407, + "grad_norm": 0.2910402715206146, + "learning_rate": 7.313554798993202e-05, + "loss": 1.7613, + "step": 11941 + }, + { + "epoch": 3.665438919582566, + "grad_norm": 0.3487538695335388, + "learning_rate": 7.313114142855836e-05, + "loss": 1.8105, + "step": 11942 + }, + { + "epoch": 3.665745856353591, + "grad_norm": 0.27271291613578796, + "learning_rate": 7.312673463858918e-05, + "loss": 1.8107, + "step": 11943 + }, + { + "epoch": 3.6660527931246163, + "grad_norm": 0.2613036632537842, + "learning_rate": 7.312232762006809e-05, + "loss": 1.7871, + "step": 11944 + }, + { + "epoch": 3.6663597298956416, + "grad_norm": 0.30594903230667114, + "learning_rate": 7.311792037303859e-05, + "loss": 1.8043, + "step": 11945 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.3960847854614258, + "learning_rate": 7.311351289754425e-05, + "loss": 1.8434, + "step": 11946 + }, + { + "epoch": 3.666973603437692, + "grad_norm": 0.33369311690330505, + "learning_rate": 7.310910519362861e-05, + "loss": 1.7496, + "step": 11947 + }, + { + "epoch": 3.6672805402087167, + "grad_norm": 0.29852384328842163, + "learning_rate": 7.310469726133528e-05, + "loss": 1.858, + "step": 11948 + }, + { + "epoch": 3.667587476979742, + "grad_norm": 0.2610527276992798, + "learning_rate": 7.310028910070777e-05, + "loss": 1.7642, + "step": 11949 + }, + { + "epoch": 3.6678944137507674, + "grad_norm": 0.3606704771518707, + "learning_rate": 7.309588071178967e-05, + "loss": 1.845, + "step": 11950 + }, + { + "epoch": 3.6682013505217927, + "grad_norm": 0.3157273828983307, + "learning_rate": 7.309147209462454e-05, + "loss": 1.7864, + "step": 11951 + }, + { + "epoch": 3.6685082872928176, + "grad_norm": 0.23907925188541412, + "learning_rate": 
7.308706324925594e-05, + "loss": 1.8363, + "step": 11952 + }, + { + "epoch": 3.668815224063843, + "grad_norm": 0.3365088999271393, + "learning_rate": 7.308265417572747e-05, + "loss": 1.8755, + "step": 11953 + }, + { + "epoch": 3.669122160834868, + "grad_norm": 0.29404979944229126, + "learning_rate": 7.307824487408266e-05, + "loss": 1.8128, + "step": 11954 + }, + { + "epoch": 3.669429097605893, + "grad_norm": 0.2689574658870697, + "learning_rate": 7.307383534436511e-05, + "loss": 1.8072, + "step": 11955 + }, + { + "epoch": 3.6697360343769185, + "grad_norm": 0.28394198417663574, + "learning_rate": 7.306942558661841e-05, + "loss": 1.7919, + "step": 11956 + }, + { + "epoch": 3.6700429711479434, + "grad_norm": 0.2594783902168274, + "learning_rate": 7.306501560088612e-05, + "loss": 1.7467, + "step": 11957 + }, + { + "epoch": 3.6703499079189688, + "grad_norm": 0.24765191972255707, + "learning_rate": 7.30606053872118e-05, + "loss": 1.7876, + "step": 11958 + }, + { + "epoch": 3.6706568446899936, + "grad_norm": 0.22157172858715057, + "learning_rate": 7.305619494563909e-05, + "loss": 1.7802, + "step": 11959 + }, + { + "epoch": 3.670963781461019, + "grad_norm": 0.270151287317276, + "learning_rate": 7.305178427621155e-05, + "loss": 1.7723, + "step": 11960 + }, + { + "epoch": 3.6712707182320443, + "grad_norm": 0.3163939118385315, + "learning_rate": 7.304737337897277e-05, + "loss": 1.8488, + "step": 11961 + }, + { + "epoch": 3.671577655003069, + "grad_norm": 0.2605706453323364, + "learning_rate": 7.304296225396632e-05, + "loss": 1.7442, + "step": 11962 + }, + { + "epoch": 3.6718845917740945, + "grad_norm": 0.31179291009902954, + "learning_rate": 7.303855090123582e-05, + "loss": 1.831, + "step": 11963 + }, + { + "epoch": 3.6721915285451194, + "grad_norm": 0.33365359902381897, + "learning_rate": 7.303413932082483e-05, + "loss": 1.8376, + "step": 11964 + }, + { + "epoch": 3.6724984653161448, + "grad_norm": 0.2952130138874054, + "learning_rate": 7.302972751277701e-05, + "loss": 
1.7733, + "step": 11965 + }, + { + "epoch": 3.67280540208717, + "grad_norm": 0.24270877242088318, + "learning_rate": 7.302531547713592e-05, + "loss": 1.8367, + "step": 11966 + }, + { + "epoch": 3.6731123388581954, + "grad_norm": 0.34315919876098633, + "learning_rate": 7.302090321394517e-05, + "loss": 1.7901, + "step": 11967 + }, + { + "epoch": 3.6734192756292203, + "grad_norm": 0.33511418104171753, + "learning_rate": 7.301649072324834e-05, + "loss": 1.7929, + "step": 11968 + }, + { + "epoch": 3.6737262124002457, + "grad_norm": 0.22397933900356293, + "learning_rate": 7.301207800508907e-05, + "loss": 1.7533, + "step": 11969 + }, + { + "epoch": 3.6740331491712706, + "grad_norm": 0.2882738411426544, + "learning_rate": 7.300766505951095e-05, + "loss": 1.8071, + "step": 11970 + }, + { + "epoch": 3.674340085942296, + "grad_norm": 0.242112398147583, + "learning_rate": 7.300325188655761e-05, + "loss": 1.7739, + "step": 11971 + }, + { + "epoch": 3.674647022713321, + "grad_norm": 0.27754491567611694, + "learning_rate": 7.299883848627265e-05, + "loss": 1.8295, + "step": 11972 + }, + { + "epoch": 3.674953959484346, + "grad_norm": 0.2787899076938629, + "learning_rate": 7.29944248586997e-05, + "loss": 1.7682, + "step": 11973 + }, + { + "epoch": 3.6752608962553714, + "grad_norm": 0.24448934197425842, + "learning_rate": 7.299001100388234e-05, + "loss": 1.7826, + "step": 11974 + }, + { + "epoch": 3.6755678330263963, + "grad_norm": 0.37869495153427124, + "learning_rate": 7.298559692186421e-05, + "loss": 1.8582, + "step": 11975 + }, + { + "epoch": 3.6758747697974217, + "grad_norm": 0.3299996256828308, + "learning_rate": 7.298118261268897e-05, + "loss": 1.7716, + "step": 11976 + }, + { + "epoch": 3.676181706568447, + "grad_norm": 0.278891384601593, + "learning_rate": 7.29767680764002e-05, + "loss": 1.879, + "step": 11977 + }, + { + "epoch": 3.6764886433394723, + "grad_norm": 0.29326459765434265, + "learning_rate": 7.297235331304155e-05, + "loss": 1.804, + "step": 11978 + }, + { + 
"epoch": 3.6767955801104972, + "grad_norm": 0.2697092592716217, + "learning_rate": 7.296793832265663e-05, + "loss": 1.7842, + "step": 11979 + }, + { + "epoch": 3.6771025168815226, + "grad_norm": 0.3045118749141693, + "learning_rate": 7.296352310528909e-05, + "loss": 1.7959, + "step": 11980 + }, + { + "epoch": 3.6774094536525475, + "grad_norm": 0.278647780418396, + "learning_rate": 7.295910766098252e-05, + "loss": 1.7907, + "step": 11981 + }, + { + "epoch": 3.677716390423573, + "grad_norm": 0.2370275855064392, + "learning_rate": 7.295469198978063e-05, + "loss": 1.757, + "step": 11982 + }, + { + "epoch": 3.678023327194598, + "grad_norm": 0.3061021566390991, + "learning_rate": 7.295027609172702e-05, + "loss": 1.7927, + "step": 11983 + }, + { + "epoch": 3.678330263965623, + "grad_norm": 0.2844544053077698, + "learning_rate": 7.294585996686532e-05, + "loss": 1.7705, + "step": 11984 + }, + { + "epoch": 3.6786372007366483, + "grad_norm": 0.31121113896369934, + "learning_rate": 7.29414436152392e-05, + "loss": 1.783, + "step": 11985 + }, + { + "epoch": 3.6789441375076732, + "grad_norm": 0.2566785514354706, + "learning_rate": 7.293702703689225e-05, + "loss": 1.7781, + "step": 11986 + }, + { + "epoch": 3.6792510742786986, + "grad_norm": 0.22176961600780487, + "learning_rate": 7.293261023186818e-05, + "loss": 1.7302, + "step": 11987 + }, + { + "epoch": 3.679558011049724, + "grad_norm": 0.21547441184520721, + "learning_rate": 7.292819320021062e-05, + "loss": 1.7666, + "step": 11988 + }, + { + "epoch": 3.679864947820749, + "grad_norm": 0.26309674978256226, + "learning_rate": 7.29237759419632e-05, + "loss": 1.7817, + "step": 11989 + }, + { + "epoch": 3.680171884591774, + "grad_norm": 0.2558063864707947, + "learning_rate": 7.29193584571696e-05, + "loss": 1.8257, + "step": 11990 + }, + { + "epoch": 3.680478821362799, + "grad_norm": 0.24516844749450684, + "learning_rate": 7.291494074587347e-05, + "loss": 1.7803, + "step": 11991 + }, + { + "epoch": 3.6807857581338244, + "grad_norm": 
0.22891047596931458, + "learning_rate": 7.291052280811843e-05, + "loss": 1.7977, + "step": 11992 + }, + { + "epoch": 3.6810926949048497, + "grad_norm": 0.2776026129722595, + "learning_rate": 7.290610464394822e-05, + "loss": 1.8486, + "step": 11993 + }, + { + "epoch": 3.681399631675875, + "grad_norm": 0.31472426652908325, + "learning_rate": 7.290168625340644e-05, + "loss": 1.7841, + "step": 11994 + }, + { + "epoch": 3.6817065684469, + "grad_norm": 0.3459274470806122, + "learning_rate": 7.289726763653677e-05, + "loss": 1.7458, + "step": 11995 + }, + { + "epoch": 3.6820135052179253, + "grad_norm": 0.23645849525928497, + "learning_rate": 7.289284879338289e-05, + "loss": 1.781, + "step": 11996 + }, + { + "epoch": 3.68232044198895, + "grad_norm": 0.3257114291191101, + "learning_rate": 7.288842972398845e-05, + "loss": 1.8269, + "step": 11997 + }, + { + "epoch": 3.6826273787599755, + "grad_norm": 0.5450126528739929, + "learning_rate": 7.288401042839713e-05, + "loss": 1.8342, + "step": 11998 + }, + { + "epoch": 3.682934315531001, + "grad_norm": 0.5080512762069702, + "learning_rate": 7.287959090665262e-05, + "loss": 1.8097, + "step": 11999 + }, + { + "epoch": 3.6832412523020257, + "grad_norm": 0.3005252480506897, + "learning_rate": 7.287517115879858e-05, + "loss": 1.8271, + "step": 12000 + }, + { + "epoch": 3.683548189073051, + "grad_norm": 0.2760924994945526, + "learning_rate": 7.287075118487869e-05, + "loss": 1.8267, + "step": 12001 + }, + { + "epoch": 3.683855125844076, + "grad_norm": 0.3475865423679352, + "learning_rate": 7.286633098493663e-05, + "loss": 1.785, + "step": 12002 + }, + { + "epoch": 3.6841620626151013, + "grad_norm": 0.2905690670013428, + "learning_rate": 7.286191055901608e-05, + "loss": 1.8283, + "step": 12003 + }, + { + "epoch": 3.6844689993861266, + "grad_norm": 0.23666246235370636, + "learning_rate": 7.285748990716072e-05, + "loss": 1.7665, + "step": 12004 + }, + { + "epoch": 3.6847759361571515, + "grad_norm": 0.32329514622688293, + "learning_rate": 
7.285306902941427e-05, + "loss": 1.7267, + "step": 12005 + }, + { + "epoch": 3.685082872928177, + "grad_norm": 0.32345879077911377, + "learning_rate": 7.28486479258204e-05, + "loss": 1.7529, + "step": 12006 + }, + { + "epoch": 3.6853898096992017, + "grad_norm": 0.2727855443954468, + "learning_rate": 7.284422659642279e-05, + "loss": 1.8279, + "step": 12007 + }, + { + "epoch": 3.685696746470227, + "grad_norm": 0.37847277522087097, + "learning_rate": 7.283980504126513e-05, + "loss": 1.7809, + "step": 12008 + }, + { + "epoch": 3.6860036832412524, + "grad_norm": 0.44694215059280396, + "learning_rate": 7.283538326039113e-05, + "loss": 1.8184, + "step": 12009 + }, + { + "epoch": 3.6863106200122777, + "grad_norm": 0.2868261933326721, + "learning_rate": 7.28309612538445e-05, + "loss": 1.7461, + "step": 12010 + }, + { + "epoch": 3.6866175567833026, + "grad_norm": 0.2601351737976074, + "learning_rate": 7.282653902166894e-05, + "loss": 1.8011, + "step": 12011 + }, + { + "epoch": 3.686924493554328, + "grad_norm": 0.328185498714447, + "learning_rate": 7.282211656390813e-05, + "loss": 1.7934, + "step": 12012 + }, + { + "epoch": 3.687231430325353, + "grad_norm": 0.2712559103965759, + "learning_rate": 7.281769388060578e-05, + "loss": 1.7566, + "step": 12013 + }, + { + "epoch": 3.687538367096378, + "grad_norm": 0.2725805938243866, + "learning_rate": 7.281327097180562e-05, + "loss": 1.8024, + "step": 12014 + }, + { + "epoch": 3.6878453038674035, + "grad_norm": 0.37282630801200867, + "learning_rate": 7.280884783755133e-05, + "loss": 1.7624, + "step": 12015 + }, + { + "epoch": 3.6881522406384284, + "grad_norm": 0.36519256234169006, + "learning_rate": 7.280442447788664e-05, + "loss": 1.8691, + "step": 12016 + }, + { + "epoch": 3.6884591774094537, + "grad_norm": 0.21699345111846924, + "learning_rate": 7.280000089285528e-05, + "loss": 1.7308, + "step": 12017 + }, + { + "epoch": 3.6887661141804786, + "grad_norm": 0.3159945011138916, + "learning_rate": 7.279557708250094e-05, + "loss": 
1.8144, + "step": 12018 + }, + { + "epoch": 3.689073050951504, + "grad_norm": 0.2927449643611908, + "learning_rate": 7.279115304686735e-05, + "loss": 1.7746, + "step": 12019 + }, + { + "epoch": 3.6893799877225293, + "grad_norm": 0.279208242893219, + "learning_rate": 7.278672878599819e-05, + "loss": 1.7678, + "step": 12020 + }, + { + "epoch": 3.689686924493554, + "grad_norm": 0.40005648136138916, + "learning_rate": 7.278230429993725e-05, + "loss": 1.7876, + "step": 12021 + }, + { + "epoch": 3.6899938612645795, + "grad_norm": 0.3444392681121826, + "learning_rate": 7.277787958872824e-05, + "loss": 1.7591, + "step": 12022 + }, + { + "epoch": 3.6903007980356044, + "grad_norm": 0.21841467916965485, + "learning_rate": 7.277345465241485e-05, + "loss": 1.785, + "step": 12023 + }, + { + "epoch": 3.6906077348066297, + "grad_norm": 0.32463181018829346, + "learning_rate": 7.276902949104084e-05, + "loss": 1.8164, + "step": 12024 + }, + { + "epoch": 3.690914671577655, + "grad_norm": 0.36221247911453247, + "learning_rate": 7.276460410464994e-05, + "loss": 1.7529, + "step": 12025 + }, + { + "epoch": 3.6912216083486804, + "grad_norm": 0.24451927840709686, + "learning_rate": 7.276017849328588e-05, + "loss": 1.8031, + "step": 12026 + }, + { + "epoch": 3.6915285451197053, + "grad_norm": 0.3055694103240967, + "learning_rate": 7.275575265699239e-05, + "loss": 1.8158, + "step": 12027 + }, + { + "epoch": 3.6918354818907306, + "grad_norm": 0.4315083622932434, + "learning_rate": 7.27513265958132e-05, + "loss": 1.8322, + "step": 12028 + }, + { + "epoch": 3.6921424186617555, + "grad_norm": 0.3391095697879791, + "learning_rate": 7.274690030979209e-05, + "loss": 1.8214, + "step": 12029 + }, + { + "epoch": 3.692449355432781, + "grad_norm": 0.22714883089065552, + "learning_rate": 7.274247379897277e-05, + "loss": 1.7312, + "step": 12030 + }, + { + "epoch": 3.692756292203806, + "grad_norm": 0.24982765316963196, + "learning_rate": 7.273804706339899e-05, + "loss": 1.738, + "step": 12031 + }, + { + 
"epoch": 3.693063228974831, + "grad_norm": 0.32509860396385193, + "learning_rate": 7.273362010311451e-05, + "loss": 1.7773, + "step": 12032 + }, + { + "epoch": 3.6933701657458564, + "grad_norm": 0.2643086612224579, + "learning_rate": 7.272919291816307e-05, + "loss": 1.7545, + "step": 12033 + }, + { + "epoch": 3.6936771025168813, + "grad_norm": 0.2568800747394562, + "learning_rate": 7.272476550858842e-05, + "loss": 1.8055, + "step": 12034 + }, + { + "epoch": 3.6939840392879066, + "grad_norm": 0.27418240904808044, + "learning_rate": 7.272033787443433e-05, + "loss": 1.7769, + "step": 12035 + }, + { + "epoch": 3.694290976058932, + "grad_norm": 0.2459677755832672, + "learning_rate": 7.271591001574453e-05, + "loss": 1.7971, + "step": 12036 + }, + { + "epoch": 3.694597912829957, + "grad_norm": 0.22349393367767334, + "learning_rate": 7.27114819325628e-05, + "loss": 1.7791, + "step": 12037 + }, + { + "epoch": 3.694904849600982, + "grad_norm": 0.25321197509765625, + "learning_rate": 7.270705362493288e-05, + "loss": 1.7475, + "step": 12038 + }, + { + "epoch": 3.695211786372007, + "grad_norm": 0.2585916519165039, + "learning_rate": 7.270262509289855e-05, + "loss": 1.7801, + "step": 12039 + }, + { + "epoch": 3.6955187231430324, + "grad_norm": 0.2673574686050415, + "learning_rate": 7.269819633650359e-05, + "loss": 1.7578, + "step": 12040 + }, + { + "epoch": 3.6958256599140578, + "grad_norm": 0.2509469985961914, + "learning_rate": 7.269376735579175e-05, + "loss": 1.7994, + "step": 12041 + }, + { + "epoch": 3.696132596685083, + "grad_norm": 0.28527703881263733, + "learning_rate": 7.268933815080679e-05, + "loss": 1.7752, + "step": 12042 + }, + { + "epoch": 3.696439533456108, + "grad_norm": 0.22716578841209412, + "learning_rate": 7.268490872159248e-05, + "loss": 1.7186, + "step": 12043 + }, + { + "epoch": 3.6967464702271333, + "grad_norm": 0.24888403713703156, + "learning_rate": 7.268047906819262e-05, + "loss": 1.7882, + "step": 12044 + }, + { + "epoch": 3.697053406998158, + 
"grad_norm": 0.28976112604141235, + "learning_rate": 7.267604919065096e-05, + "loss": 1.7655, + "step": 12045 + }, + { + "epoch": 3.6973603437691835, + "grad_norm": 0.24668502807617188, + "learning_rate": 7.267161908901131e-05, + "loss": 1.8051, + "step": 12046 + }, + { + "epoch": 3.697667280540209, + "grad_norm": 0.2464776188135147, + "learning_rate": 7.266718876331742e-05, + "loss": 1.809, + "step": 12047 + }, + { + "epoch": 3.6979742173112338, + "grad_norm": 0.27648577094078064, + "learning_rate": 7.266275821361309e-05, + "loss": 1.7869, + "step": 12048 + }, + { + "epoch": 3.698281154082259, + "grad_norm": 0.26427242159843445, + "learning_rate": 7.26583274399421e-05, + "loss": 1.7681, + "step": 12049 + }, + { + "epoch": 3.698588090853284, + "grad_norm": 0.24595285952091217, + "learning_rate": 7.265389644234823e-05, + "loss": 1.7209, + "step": 12050 + }, + { + "epoch": 3.6988950276243093, + "grad_norm": 0.32514405250549316, + "learning_rate": 7.26494652208753e-05, + "loss": 1.8702, + "step": 12051 + }, + { + "epoch": 3.6992019643953347, + "grad_norm": 0.24512936174869537, + "learning_rate": 7.264503377556705e-05, + "loss": 1.784, + "step": 12052 + }, + { + "epoch": 3.69950890116636, + "grad_norm": 0.28698310256004333, + "learning_rate": 7.264060210646733e-05, + "loss": 1.905, + "step": 12053 + }, + { + "epoch": 3.699815837937385, + "grad_norm": 0.2995007336139679, + "learning_rate": 7.263617021361989e-05, + "loss": 1.7822, + "step": 12054 + }, + { + "epoch": 3.7001227747084102, + "grad_norm": 0.25869423151016235, + "learning_rate": 7.263173809706855e-05, + "loss": 1.7988, + "step": 12055 + }, + { + "epoch": 3.700429711479435, + "grad_norm": 0.350918710231781, + "learning_rate": 7.262730575685711e-05, + "loss": 1.9504, + "step": 12056 + }, + { + "epoch": 3.7007366482504604, + "grad_norm": 0.3407665491104126, + "learning_rate": 7.262287319302937e-05, + "loss": 1.8506, + "step": 12057 + }, + { + "epoch": 3.701043585021486, + "grad_norm": 0.3039441704750061, + 
"learning_rate": 7.261844040562915e-05, + "loss": 1.7841, + "step": 12058 + }, + { + "epoch": 3.7013505217925107, + "grad_norm": 0.23483428359031677, + "learning_rate": 7.261400739470023e-05, + "loss": 1.7899, + "step": 12059 + }, + { + "epoch": 3.701657458563536, + "grad_norm": 0.30779507756233215, + "learning_rate": 7.260957416028645e-05, + "loss": 1.8131, + "step": 12060 + }, + { + "epoch": 3.701964395334561, + "grad_norm": 0.29901376366615295, + "learning_rate": 7.26051407024316e-05, + "loss": 1.7861, + "step": 12061 + }, + { + "epoch": 3.7022713321055862, + "grad_norm": 0.30058762431144714, + "learning_rate": 7.260070702117949e-05, + "loss": 1.7485, + "step": 12062 + }, + { + "epoch": 3.7025782688766116, + "grad_norm": 0.24523651599884033, + "learning_rate": 7.259627311657396e-05, + "loss": 1.772, + "step": 12063 + }, + { + "epoch": 3.7028852056476365, + "grad_norm": 0.24375474452972412, + "learning_rate": 7.259183898865882e-05, + "loss": 1.7848, + "step": 12064 + }, + { + "epoch": 3.703192142418662, + "grad_norm": 0.2562403380870819, + "learning_rate": 7.258740463747788e-05, + "loss": 1.7447, + "step": 12065 + }, + { + "epoch": 3.7034990791896867, + "grad_norm": 0.265229195356369, + "learning_rate": 7.258297006307496e-05, + "loss": 1.8111, + "step": 12066 + }, + { + "epoch": 3.703806015960712, + "grad_norm": 0.2836552858352661, + "learning_rate": 7.25785352654939e-05, + "loss": 1.7952, + "step": 12067 + }, + { + "epoch": 3.7041129527317374, + "grad_norm": 0.3269572854042053, + "learning_rate": 7.257410024477852e-05, + "loss": 1.8604, + "step": 12068 + }, + { + "epoch": 3.7044198895027627, + "grad_norm": 0.2391490638256073, + "learning_rate": 7.256966500097264e-05, + "loss": 1.7417, + "step": 12069 + }, + { + "epoch": 3.7047268262737876, + "grad_norm": 0.2610675096511841, + "learning_rate": 7.256522953412011e-05, + "loss": 1.7712, + "step": 12070 + }, + { + "epoch": 3.705033763044813, + "grad_norm": 0.24954774975776672, + "learning_rate": 
7.256079384426477e-05, + "loss": 1.7506, + "step": 12071 + }, + { + "epoch": 3.705340699815838, + "grad_norm": 0.2603892385959625, + "learning_rate": 7.255635793145042e-05, + "loss": 1.8105, + "step": 12072 + }, + { + "epoch": 3.705647636586863, + "grad_norm": 0.32728591561317444, + "learning_rate": 7.255192179572092e-05, + "loss": 1.8448, + "step": 12073 + }, + { + "epoch": 3.7059545733578885, + "grad_norm": 0.4559340178966522, + "learning_rate": 7.254748543712013e-05, + "loss": 1.7232, + "step": 12074 + }, + { + "epoch": 3.7062615101289134, + "grad_norm": 0.36526206135749817, + "learning_rate": 7.254304885569186e-05, + "loss": 1.7874, + "step": 12075 + }, + { + "epoch": 3.7065684468999387, + "grad_norm": 0.21606837213039398, + "learning_rate": 7.253861205147998e-05, + "loss": 1.7266, + "step": 12076 + }, + { + "epoch": 3.7068753836709636, + "grad_norm": 0.3629585802555084, + "learning_rate": 7.253417502452831e-05, + "loss": 1.7722, + "step": 12077 + }, + { + "epoch": 3.707182320441989, + "grad_norm": 0.4224923551082611, + "learning_rate": 7.252973777488072e-05, + "loss": 1.7369, + "step": 12078 + }, + { + "epoch": 3.7074892572130143, + "grad_norm": 0.32245784997940063, + "learning_rate": 7.252530030258106e-05, + "loss": 1.7836, + "step": 12079 + }, + { + "epoch": 3.707796193984039, + "grad_norm": 0.29909494519233704, + "learning_rate": 7.252086260767317e-05, + "loss": 1.8718, + "step": 12080 + }, + { + "epoch": 3.7081031307550645, + "grad_norm": 0.21995799243450165, + "learning_rate": 7.251642469020093e-05, + "loss": 1.7103, + "step": 12081 + }, + { + "epoch": 3.7084100675260894, + "grad_norm": 0.2737572193145752, + "learning_rate": 7.251198655020818e-05, + "loss": 1.7787, + "step": 12082 + }, + { + "epoch": 3.7087170042971147, + "grad_norm": 0.22417058050632477, + "learning_rate": 7.250754818773879e-05, + "loss": 1.7782, + "step": 12083 + }, + { + "epoch": 3.70902394106814, + "grad_norm": 0.3350662887096405, + "learning_rate": 7.25031096028366e-05, + "loss": 
1.8193, + "step": 12084 + }, + { + "epoch": 3.7093308778391654, + "grad_norm": 0.3199101686477661, + "learning_rate": 7.24986707955455e-05, + "loss": 1.831, + "step": 12085 + }, + { + "epoch": 3.7096378146101903, + "grad_norm": 0.2513977289199829, + "learning_rate": 7.249423176590936e-05, + "loss": 1.8288, + "step": 12086 + }, + { + "epoch": 3.7099447513812156, + "grad_norm": 0.30411866307258606, + "learning_rate": 7.248979251397203e-05, + "loss": 1.7837, + "step": 12087 + }, + { + "epoch": 3.7102516881522405, + "grad_norm": 0.30755332112312317, + "learning_rate": 7.248535303977738e-05, + "loss": 1.8016, + "step": 12088 + }, + { + "epoch": 3.710558624923266, + "grad_norm": 0.25746986269950867, + "learning_rate": 7.248091334336929e-05, + "loss": 1.8014, + "step": 12089 + }, + { + "epoch": 3.710865561694291, + "grad_norm": 0.3327447772026062, + "learning_rate": 7.247647342479164e-05, + "loss": 1.752, + "step": 12090 + }, + { + "epoch": 3.711172498465316, + "grad_norm": 0.3101816475391388, + "learning_rate": 7.247203328408832e-05, + "loss": 1.7867, + "step": 12091 + }, + { + "epoch": 3.7114794352363414, + "grad_norm": 0.2168906182050705, + "learning_rate": 7.246759292130318e-05, + "loss": 1.7452, + "step": 12092 + }, + { + "epoch": 3.7117863720073663, + "grad_norm": 0.34260258078575134, + "learning_rate": 7.246315233648013e-05, + "loss": 1.8156, + "step": 12093 + }, + { + "epoch": 3.7120933087783916, + "grad_norm": 0.2730714976787567, + "learning_rate": 7.245871152966303e-05, + "loss": 1.7429, + "step": 12094 + }, + { + "epoch": 3.712400245549417, + "grad_norm": 0.2560936212539673, + "learning_rate": 7.245427050089578e-05, + "loss": 1.7969, + "step": 12095 + }, + { + "epoch": 3.712707182320442, + "grad_norm": 0.27510303258895874, + "learning_rate": 7.244982925022228e-05, + "loss": 1.7981, + "step": 12096 + }, + { + "epoch": 3.713014119091467, + "grad_norm": 0.29171642661094666, + "learning_rate": 7.24453877776864e-05, + "loss": 1.7913, + "step": 12097 + }, + { + 
"epoch": 3.713321055862492, + "grad_norm": 0.26431843638420105, + "learning_rate": 7.244094608333206e-05, + "loss": 1.8262, + "step": 12098 + }, + { + "epoch": 3.7136279926335174, + "grad_norm": 0.30747905373573303, + "learning_rate": 7.243650416720311e-05, + "loss": 1.7951, + "step": 12099 + }, + { + "epoch": 3.7139349294045427, + "grad_norm": 0.346443772315979, + "learning_rate": 7.24320620293435e-05, + "loss": 1.7677, + "step": 12100 + }, + { + "epoch": 3.714241866175568, + "grad_norm": 0.2910652458667755, + "learning_rate": 7.242761966979709e-05, + "loss": 1.7887, + "step": 12101 + }, + { + "epoch": 3.714548802946593, + "grad_norm": 0.22342006862163544, + "learning_rate": 7.24231770886078e-05, + "loss": 1.7678, + "step": 12102 + }, + { + "epoch": 3.7148557397176183, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.241873428581954e-05, + "loss": 1.7436, + "step": 12103 + }, + { + "epoch": 3.715162676488643, + "grad_norm": 0.23542635142803192, + "learning_rate": 7.24142912614762e-05, + "loss": 1.7942, + "step": 12104 + }, + { + "epoch": 3.7154696132596685, + "grad_norm": 0.22476384043693542, + "learning_rate": 7.240984801562169e-05, + "loss": 1.8235, + "step": 12105 + }, + { + "epoch": 3.715776550030694, + "grad_norm": 0.25123465061187744, + "learning_rate": 7.240540454829992e-05, + "loss": 1.8112, + "step": 12106 + }, + { + "epoch": 3.7160834868017187, + "grad_norm": 0.27230000495910645, + "learning_rate": 7.240096085955483e-05, + "loss": 1.8312, + "step": 12107 + }, + { + "epoch": 3.716390423572744, + "grad_norm": 0.2722976803779602, + "learning_rate": 7.239651694943031e-05, + "loss": 1.8368, + "step": 12108 + }, + { + "epoch": 3.716697360343769, + "grad_norm": 0.264138400554657, + "learning_rate": 7.239207281797028e-05, + "loss": 1.8206, + "step": 12109 + }, + { + "epoch": 3.7170042971147943, + "grad_norm": 0.28813931345939636, + "learning_rate": 7.238762846521866e-05, + "loss": 1.7391, + "step": 12110 + }, + { + "epoch": 3.7173112338858196, + 
"grad_norm": 0.2319631576538086, + "learning_rate": 7.238318389121939e-05, + "loss": 1.7574, + "step": 12111 + }, + { + "epoch": 3.717618170656845, + "grad_norm": 0.2507809102535248, + "learning_rate": 7.237873909601635e-05, + "loss": 1.7359, + "step": 12112 + }, + { + "epoch": 3.71792510742787, + "grad_norm": 0.2717304825782776, + "learning_rate": 7.237429407965351e-05, + "loss": 1.774, + "step": 12113 + }, + { + "epoch": 3.718232044198895, + "grad_norm": 0.2619280517101288, + "learning_rate": 7.236984884217478e-05, + "loss": 1.8083, + "step": 12114 + }, + { + "epoch": 3.71853898096992, + "grad_norm": 0.22268806397914886, + "learning_rate": 7.23654033836241e-05, + "loss": 1.7436, + "step": 12115 + }, + { + "epoch": 3.7188459177409454, + "grad_norm": 0.2341407984495163, + "learning_rate": 7.236095770404539e-05, + "loss": 1.7807, + "step": 12116 + }, + { + "epoch": 3.7191528545119708, + "grad_norm": 0.23519712686538696, + "learning_rate": 7.235651180348258e-05, + "loss": 1.8051, + "step": 12117 + }, + { + "epoch": 3.7194597912829956, + "grad_norm": 0.2391074150800705, + "learning_rate": 7.235206568197963e-05, + "loss": 1.8377, + "step": 12118 + }, + { + "epoch": 3.719766728054021, + "grad_norm": 0.26821592450141907, + "learning_rate": 7.234761933958045e-05, + "loss": 1.8586, + "step": 12119 + }, + { + "epoch": 3.720073664825046, + "grad_norm": 0.24971134960651398, + "learning_rate": 7.234317277632902e-05, + "loss": 1.8404, + "step": 12120 + }, + { + "epoch": 3.720380601596071, + "grad_norm": 0.20817919075489044, + "learning_rate": 7.233872599226926e-05, + "loss": 1.7204, + "step": 12121 + }, + { + "epoch": 3.7206875383670965, + "grad_norm": 0.29301291704177856, + "learning_rate": 7.233427898744509e-05, + "loss": 1.8528, + "step": 12122 + }, + { + "epoch": 3.7209944751381214, + "grad_norm": 0.22214651107788086, + "learning_rate": 7.23298317619005e-05, + "loss": 1.748, + "step": 12123 + }, + { + "epoch": 3.7213014119091468, + "grad_norm": 0.2511044442653656, + 
"learning_rate": 7.232538431567941e-05, + "loss": 1.8146, + "step": 12124 + }, + { + "epoch": 3.7216083486801717, + "grad_norm": 0.26976367831230164, + "learning_rate": 7.232093664882581e-05, + "loss": 1.8483, + "step": 12125 + }, + { + "epoch": 3.721915285451197, + "grad_norm": 0.2538089156150818, + "learning_rate": 7.231648876138361e-05, + "loss": 1.8097, + "step": 12126 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.2353016883134842, + "learning_rate": 7.231204065339682e-05, + "loss": 1.737, + "step": 12127 + }, + { + "epoch": 3.7225291589932477, + "grad_norm": 0.3205147981643677, + "learning_rate": 7.230759232490935e-05, + "loss": 1.8116, + "step": 12128 + }, + { + "epoch": 3.7228360957642725, + "grad_norm": 0.39056599140167236, + "learning_rate": 7.230314377596516e-05, + "loss": 1.7785, + "step": 12129 + }, + { + "epoch": 3.723143032535298, + "grad_norm": 0.3846863806247711, + "learning_rate": 7.229869500660825e-05, + "loss": 1.738, + "step": 12130 + }, + { + "epoch": 3.7234499693063228, + "grad_norm": 0.24412120878696442, + "learning_rate": 7.229424601688256e-05, + "loss": 1.7351, + "step": 12131 + }, + { + "epoch": 3.723756906077348, + "grad_norm": 0.2978009581565857, + "learning_rate": 7.228979680683206e-05, + "loss": 1.8037, + "step": 12132 + }, + { + "epoch": 3.7240638428483734, + "grad_norm": 0.33787262439727783, + "learning_rate": 7.228534737650074e-05, + "loss": 1.8421, + "step": 12133 + }, + { + "epoch": 3.7243707796193983, + "grad_norm": 0.2536921203136444, + "learning_rate": 7.228089772593254e-05, + "loss": 1.7472, + "step": 12134 + }, + { + "epoch": 3.7246777163904237, + "grad_norm": 0.24103601276874542, + "learning_rate": 7.227644785517144e-05, + "loss": 1.8011, + "step": 12135 + }, + { + "epoch": 3.7249846531614486, + "grad_norm": 0.3653033375740051, + "learning_rate": 7.227199776426146e-05, + "loss": 1.8018, + "step": 12136 + }, + { + "epoch": 3.725291589932474, + "grad_norm": 0.35728752613067627, + "learning_rate": 
7.226754745324652e-05, + "loss": 1.7684, + "step": 12137 + }, + { + "epoch": 3.7255985267034992, + "grad_norm": 0.262018620967865, + "learning_rate": 7.226309692217063e-05, + "loss": 1.8124, + "step": 12138 + }, + { + "epoch": 3.725905463474524, + "grad_norm": 0.3467118442058563, + "learning_rate": 7.225864617107776e-05, + "loss": 1.8761, + "step": 12139 + }, + { + "epoch": 3.7262124002455494, + "grad_norm": 0.4365626871585846, + "learning_rate": 7.22541952000119e-05, + "loss": 1.7159, + "step": 12140 + }, + { + "epoch": 3.7265193370165743, + "grad_norm": 0.2819811999797821, + "learning_rate": 7.224974400901705e-05, + "loss": 1.8051, + "step": 12141 + }, + { + "epoch": 3.7268262737875997, + "grad_norm": 0.39062437415122986, + "learning_rate": 7.224529259813719e-05, + "loss": 1.8517, + "step": 12142 + }, + { + "epoch": 3.727133210558625, + "grad_norm": 0.4383927285671234, + "learning_rate": 7.22408409674163e-05, + "loss": 1.8295, + "step": 12143 + }, + { + "epoch": 3.7274401473296503, + "grad_norm": 0.3043094575405121, + "learning_rate": 7.223638911689839e-05, + "loss": 1.7653, + "step": 12144 + }, + { + "epoch": 3.7277470841006752, + "grad_norm": 0.25198984146118164, + "learning_rate": 7.223193704662746e-05, + "loss": 1.7561, + "step": 12145 + }, + { + "epoch": 3.7280540208717006, + "grad_norm": 0.353565514087677, + "learning_rate": 7.222748475664749e-05, + "loss": 1.8077, + "step": 12146 + }, + { + "epoch": 3.7283609576427255, + "grad_norm": 0.39757224917411804, + "learning_rate": 7.222303224700248e-05, + "loss": 1.7622, + "step": 12147 + }, + { + "epoch": 3.728667894413751, + "grad_norm": 0.35595703125, + "learning_rate": 7.221857951773644e-05, + "loss": 1.8436, + "step": 12148 + }, + { + "epoch": 3.728974831184776, + "grad_norm": 0.2469715029001236, + "learning_rate": 7.221412656889338e-05, + "loss": 1.8531, + "step": 12149 + }, + { + "epoch": 3.729281767955801, + "grad_norm": 0.35324424505233765, + "learning_rate": 7.22096734005173e-05, + "loss": 1.7361, + 
"step": 12150 + }, + { + "epoch": 3.7295887047268264, + "grad_norm": 0.3783365488052368, + "learning_rate": 7.220522001265223e-05, + "loss": 1.7459, + "step": 12151 + }, + { + "epoch": 3.7298956414978512, + "grad_norm": 0.27526360750198364, + "learning_rate": 7.220076640534212e-05, + "loss": 1.8867, + "step": 12152 + }, + { + "epoch": 3.7302025782688766, + "grad_norm": 0.30863118171691895, + "learning_rate": 7.219631257863105e-05, + "loss": 1.7363, + "step": 12153 + }, + { + "epoch": 3.730509515039902, + "grad_norm": 0.38505107164382935, + "learning_rate": 7.219185853256301e-05, + "loss": 1.764, + "step": 12154 + }, + { + "epoch": 3.730816451810927, + "grad_norm": 0.2925978899002075, + "learning_rate": 7.218740426718202e-05, + "loss": 1.7693, + "step": 12155 + }, + { + "epoch": 3.731123388581952, + "grad_norm": 0.24510078132152557, + "learning_rate": 7.218294978253209e-05, + "loss": 1.8089, + "step": 12156 + }, + { + "epoch": 3.731430325352977, + "grad_norm": 0.33029109239578247, + "learning_rate": 7.217849507865724e-05, + "loss": 1.6885, + "step": 12157 + }, + { + "epoch": 3.7317372621240024, + "grad_norm": 0.333970308303833, + "learning_rate": 7.217404015560149e-05, + "loss": 1.8132, + "step": 12158 + }, + { + "epoch": 3.7320441988950277, + "grad_norm": 0.2467660754919052, + "learning_rate": 7.216958501340891e-05, + "loss": 1.8021, + "step": 12159 + }, + { + "epoch": 3.732351135666053, + "grad_norm": 0.2701449990272522, + "learning_rate": 7.216512965212348e-05, + "loss": 1.7006, + "step": 12160 + }, + { + "epoch": 3.732658072437078, + "grad_norm": 0.2784138023853302, + "learning_rate": 7.216067407178926e-05, + "loss": 1.7616, + "step": 12161 + }, + { + "epoch": 3.7329650092081033, + "grad_norm": 0.2082870900630951, + "learning_rate": 7.215621827245026e-05, + "loss": 1.7391, + "step": 12162 + }, + { + "epoch": 3.733271945979128, + "grad_norm": 0.2477869987487793, + "learning_rate": 7.215176225415053e-05, + "loss": 1.7761, + "step": 12163 + }, + { + "epoch": 
3.7335788827501535, + "grad_norm": 0.28395572304725647, + "learning_rate": 7.21473060169341e-05, + "loss": 1.8181, + "step": 12164 + }, + { + "epoch": 3.733885819521179, + "grad_norm": 0.20430058240890503, + "learning_rate": 7.2142849560845e-05, + "loss": 1.7035, + "step": 12165 + }, + { + "epoch": 3.7341927562922037, + "grad_norm": 0.30061420798301697, + "learning_rate": 7.21383928859273e-05, + "loss": 1.7703, + "step": 12166 + }, + { + "epoch": 3.734499693063229, + "grad_norm": 0.33865803480148315, + "learning_rate": 7.2133935992225e-05, + "loss": 1.8204, + "step": 12167 + }, + { + "epoch": 3.734806629834254, + "grad_norm": 0.29172980785369873, + "learning_rate": 7.212947887978221e-05, + "loss": 1.739, + "step": 12168 + }, + { + "epoch": 3.7351135666052793, + "grad_norm": 0.2799396812915802, + "learning_rate": 7.212502154864291e-05, + "loss": 1.8503, + "step": 12169 + }, + { + "epoch": 3.7354205033763046, + "grad_norm": 0.2945539355278015, + "learning_rate": 7.212056399885118e-05, + "loss": 1.7523, + "step": 12170 + }, + { + "epoch": 3.7357274401473295, + "grad_norm": 0.2395290732383728, + "learning_rate": 7.211610623045108e-05, + "loss": 1.7728, + "step": 12171 + }, + { + "epoch": 3.736034376918355, + "grad_norm": 0.24369286000728607, + "learning_rate": 7.211164824348667e-05, + "loss": 1.7725, + "step": 12172 + }, + { + "epoch": 3.7363413136893797, + "grad_norm": 0.3272435963153839, + "learning_rate": 7.210719003800197e-05, + "loss": 1.8531, + "step": 12173 + }, + { + "epoch": 3.736648250460405, + "grad_norm": 0.23954182863235474, + "learning_rate": 7.210273161404107e-05, + "loss": 1.7807, + "step": 12174 + }, + { + "epoch": 3.7369551872314304, + "grad_norm": 0.24547603726387024, + "learning_rate": 7.209827297164801e-05, + "loss": 1.8481, + "step": 12175 + }, + { + "epoch": 3.7372621240024557, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.209381411086687e-05, + "loss": 1.7496, + "step": 12176 + }, + { + "epoch": 3.7375690607734806, + "grad_norm": 
0.22948235273361206, + "learning_rate": 7.208935503174172e-05, + "loss": 1.7681, + "step": 12177 + }, + { + "epoch": 3.737875997544506, + "grad_norm": 0.2697654664516449, + "learning_rate": 7.20848957343166e-05, + "loss": 1.789, + "step": 12178 + }, + { + "epoch": 3.738182934315531, + "grad_norm": 0.235344797372818, + "learning_rate": 7.208043621863562e-05, + "loss": 1.8309, + "step": 12179 + }, + { + "epoch": 3.738489871086556, + "grad_norm": 0.2688879072666168, + "learning_rate": 7.20759764847428e-05, + "loss": 1.7898, + "step": 12180 + }, + { + "epoch": 3.7387968078575815, + "grad_norm": 0.26818978786468506, + "learning_rate": 7.207151653268226e-05, + "loss": 1.7882, + "step": 12181 + }, + { + "epoch": 3.7391037446286064, + "grad_norm": 0.2612875998020172, + "learning_rate": 7.206705636249804e-05, + "loss": 1.7352, + "step": 12182 + }, + { + "epoch": 3.7394106813996317, + "grad_norm": 0.22547565400600433, + "learning_rate": 7.206259597423425e-05, + "loss": 1.733, + "step": 12183 + }, + { + "epoch": 3.7397176181706566, + "grad_norm": 0.24645474553108215, + "learning_rate": 7.205813536793495e-05, + "loss": 1.8064, + "step": 12184 + }, + { + "epoch": 3.740024554941682, + "grad_norm": 0.25879329442977905, + "learning_rate": 7.205367454364424e-05, + "loss": 1.8134, + "step": 12185 + }, + { + "epoch": 3.7403314917127073, + "grad_norm": 0.22420097887516022, + "learning_rate": 7.204921350140617e-05, + "loss": 1.7819, + "step": 12186 + }, + { + "epoch": 3.7406384284837326, + "grad_norm": 0.2569858431816101, + "learning_rate": 7.204475224126487e-05, + "loss": 1.784, + "step": 12187 + }, + { + "epoch": 3.7409453652547575, + "grad_norm": 0.23769912123680115, + "learning_rate": 7.20402907632644e-05, + "loss": 1.7853, + "step": 12188 + }, + { + "epoch": 3.741252302025783, + "grad_norm": 0.26935988664627075, + "learning_rate": 7.203582906744885e-05, + "loss": 1.806, + "step": 12189 + }, + { + "epoch": 3.7415592387968077, + "grad_norm": 0.2544274628162384, + "learning_rate": 
7.203136715386233e-05, + "loss": 1.7988, + "step": 12190 + }, + { + "epoch": 3.741866175567833, + "grad_norm": 0.22665882110595703, + "learning_rate": 7.202690502254892e-05, + "loss": 1.7798, + "step": 12191 + }, + { + "epoch": 3.7421731123388584, + "grad_norm": 0.24512888491153717, + "learning_rate": 7.202244267355273e-05, + "loss": 1.816, + "step": 12192 + }, + { + "epoch": 3.7424800491098833, + "grad_norm": 0.2408553808927536, + "learning_rate": 7.201798010691785e-05, + "loss": 1.7417, + "step": 12193 + }, + { + "epoch": 3.7427869858809086, + "grad_norm": 0.23142600059509277, + "learning_rate": 7.201351732268838e-05, + "loss": 1.7771, + "step": 12194 + }, + { + "epoch": 3.7430939226519335, + "grad_norm": 0.245071142911911, + "learning_rate": 7.200905432090844e-05, + "loss": 1.7556, + "step": 12195 + }, + { + "epoch": 3.743400859422959, + "grad_norm": 0.2623934745788574, + "learning_rate": 7.200459110162211e-05, + "loss": 1.8042, + "step": 12196 + }, + { + "epoch": 3.743707796193984, + "grad_norm": 0.2531217038631439, + "learning_rate": 7.200012766487353e-05, + "loss": 1.7709, + "step": 12197 + }, + { + "epoch": 3.744014732965009, + "grad_norm": 0.23839864134788513, + "learning_rate": 7.19956640107068e-05, + "loss": 1.8202, + "step": 12198 + }, + { + "epoch": 3.7443216697360344, + "grad_norm": 0.2342260777950287, + "learning_rate": 7.1991200139166e-05, + "loss": 1.827, + "step": 12199 + }, + { + "epoch": 3.7446286065070593, + "grad_norm": 0.25511276721954346, + "learning_rate": 7.198673605029528e-05, + "loss": 1.7766, + "step": 12200 + }, + { + "epoch": 3.7449355432780846, + "grad_norm": 0.27601274847984314, + "learning_rate": 7.198227174413876e-05, + "loss": 1.7716, + "step": 12201 + }, + { + "epoch": 3.74524248004911, + "grad_norm": 0.3027385175228119, + "learning_rate": 7.197780722074056e-05, + "loss": 1.8007, + "step": 12202 + }, + { + "epoch": 3.7455494168201353, + "grad_norm": 0.31242382526397705, + "learning_rate": 7.197334248014477e-05, + "loss": 1.8089, 
+ "step": 12203 + }, + { + "epoch": 3.74585635359116, + "grad_norm": 0.3673859238624573, + "learning_rate": 7.196887752239551e-05, + "loss": 1.8017, + "step": 12204 + }, + { + "epoch": 3.7461632903621855, + "grad_norm": 0.3152726888656616, + "learning_rate": 7.196441234753695e-05, + "loss": 1.7108, + "step": 12205 + }, + { + "epoch": 3.7464702271332104, + "grad_norm": 0.2606927156448364, + "learning_rate": 7.195994695561319e-05, + "loss": 1.8066, + "step": 12206 + }, + { + "epoch": 3.7467771639042358, + "grad_norm": 0.37624871730804443, + "learning_rate": 7.195548134666836e-05, + "loss": 1.725, + "step": 12207 + }, + { + "epoch": 3.747084100675261, + "grad_norm": 0.4138187766075134, + "learning_rate": 7.195101552074658e-05, + "loss": 1.7838, + "step": 12208 + }, + { + "epoch": 3.747391037446286, + "grad_norm": 0.3668459951877594, + "learning_rate": 7.194654947789204e-05, + "loss": 1.7575, + "step": 12209 + }, + { + "epoch": 3.7476979742173113, + "grad_norm": 0.27947792410850525, + "learning_rate": 7.19420832181488e-05, + "loss": 1.792, + "step": 12210 + }, + { + "epoch": 3.748004910988336, + "grad_norm": 0.2507692873477936, + "learning_rate": 7.193761674156103e-05, + "loss": 1.7752, + "step": 12211 + }, + { + "epoch": 3.7483118477593615, + "grad_norm": 0.3209949731826782, + "learning_rate": 7.193315004817289e-05, + "loss": 1.8491, + "step": 12212 + }, + { + "epoch": 3.748618784530387, + "grad_norm": 0.32883042097091675, + "learning_rate": 7.192868313802849e-05, + "loss": 1.8135, + "step": 12213 + }, + { + "epoch": 3.7489257213014118, + "grad_norm": 0.2450616955757141, + "learning_rate": 7.192421601117201e-05, + "loss": 1.7722, + "step": 12214 + }, + { + "epoch": 3.749232658072437, + "grad_norm": 0.2545110285282135, + "learning_rate": 7.191974866764757e-05, + "loss": 1.7866, + "step": 12215 + }, + { + "epoch": 3.749539594843462, + "grad_norm": 0.264017790555954, + "learning_rate": 7.191528110749932e-05, + "loss": 1.778, + "step": 12216 + }, + { + "epoch": 
3.7498465316144873, + "grad_norm": 0.3156309425830841, + "learning_rate": 7.191081333077142e-05, + "loss": 1.7917, + "step": 12217 + }, + { + "epoch": 3.7501534683855127, + "grad_norm": 0.3578774631023407, + "learning_rate": 7.190634533750802e-05, + "loss": 1.8468, + "step": 12218 + }, + { + "epoch": 3.750460405156538, + "grad_norm": 0.30735981464385986, + "learning_rate": 7.19018771277533e-05, + "loss": 1.7502, + "step": 12219 + }, + { + "epoch": 3.750767341927563, + "grad_norm": 0.22870220243930817, + "learning_rate": 7.189740870155135e-05, + "loss": 1.7686, + "step": 12220 + }, + { + "epoch": 3.7510742786985882, + "grad_norm": 0.30297720432281494, + "learning_rate": 7.18929400589464e-05, + "loss": 1.826, + "step": 12221 + }, + { + "epoch": 3.751381215469613, + "grad_norm": 0.2735389173030853, + "learning_rate": 7.188847119998257e-05, + "loss": 1.8142, + "step": 12222 + }, + { + "epoch": 3.7516881522406385, + "grad_norm": 0.2823885679244995, + "learning_rate": 7.188400212470405e-05, + "loss": 1.8028, + "step": 12223 + }, + { + "epoch": 3.751995089011664, + "grad_norm": 0.4184139370918274, + "learning_rate": 7.187953283315499e-05, + "loss": 1.8467, + "step": 12224 + }, + { + "epoch": 3.7523020257826887, + "grad_norm": 0.3559226095676422, + "learning_rate": 7.187506332537957e-05, + "loss": 1.7416, + "step": 12225 + }, + { + "epoch": 3.752608962553714, + "grad_norm": 0.26055800914764404, + "learning_rate": 7.187059360142194e-05, + "loss": 1.8309, + "step": 12226 + }, + { + "epoch": 3.752915899324739, + "grad_norm": 0.28032660484313965, + "learning_rate": 7.186612366132629e-05, + "loss": 1.7926, + "step": 12227 + }, + { + "epoch": 3.7532228360957642, + "grad_norm": 0.26229965686798096, + "learning_rate": 7.18616535051368e-05, + "loss": 1.7368, + "step": 12228 + }, + { + "epoch": 3.7535297728667896, + "grad_norm": 0.2779417634010315, + "learning_rate": 7.185718313289763e-05, + "loss": 1.8418, + "step": 12229 + }, + { + "epoch": 3.7538367096378145, + "grad_norm": 
0.26164770126342773, + "learning_rate": 7.185271254465295e-05, + "loss": 1.7511, + "step": 12230 + }, + { + "epoch": 3.75414364640884, + "grad_norm": 0.30725157260894775, + "learning_rate": 7.184824174044698e-05, + "loss": 1.7661, + "step": 12231 + }, + { + "epoch": 3.7544505831798647, + "grad_norm": 0.33111417293548584, + "learning_rate": 7.184377072032386e-05, + "loss": 1.7341, + "step": 12232 + }, + { + "epoch": 3.75475751995089, + "grad_norm": 0.23978343605995178, + "learning_rate": 7.183929948432779e-05, + "loss": 1.7151, + "step": 12233 + }, + { + "epoch": 3.7550644567219154, + "grad_norm": 0.3057664632797241, + "learning_rate": 7.183482803250299e-05, + "loss": 1.8446, + "step": 12234 + }, + { + "epoch": 3.7553713934929407, + "grad_norm": 0.2629055678844452, + "learning_rate": 7.18303563648936e-05, + "loss": 1.7415, + "step": 12235 + }, + { + "epoch": 3.7556783302639656, + "grad_norm": 0.22703498601913452, + "learning_rate": 7.182588448154386e-05, + "loss": 1.8188, + "step": 12236 + }, + { + "epoch": 3.755985267034991, + "grad_norm": 0.3014034032821655, + "learning_rate": 7.182141238249792e-05, + "loss": 1.8634, + "step": 12237 + }, + { + "epoch": 3.756292203806016, + "grad_norm": 0.28859084844589233, + "learning_rate": 7.181694006779998e-05, + "loss": 1.7509, + "step": 12238 + }, + { + "epoch": 3.756599140577041, + "grad_norm": 0.293720543384552, + "learning_rate": 7.181246753749426e-05, + "loss": 1.777, + "step": 12239 + }, + { + "epoch": 3.7569060773480665, + "grad_norm": 0.2374580055475235, + "learning_rate": 7.180799479162496e-05, + "loss": 1.7492, + "step": 12240 + }, + { + "epoch": 3.7572130141190914, + "grad_norm": 0.30106452107429504, + "learning_rate": 7.180352183023627e-05, + "loss": 1.7538, + "step": 12241 + }, + { + "epoch": 3.7575199508901167, + "grad_norm": 0.3504682183265686, + "learning_rate": 7.179904865337238e-05, + "loss": 1.7477, + "step": 12242 + }, + { + "epoch": 3.7578268876611416, + "grad_norm": 0.2901679575443268, + "learning_rate": 
7.179457526107754e-05, + "loss": 1.9412, + "step": 12243 + }, + { + "epoch": 3.758133824432167, + "grad_norm": 0.37690606713294983, + "learning_rate": 7.179010165339591e-05, + "loss": 1.8222, + "step": 12244 + }, + { + "epoch": 3.7584407612031923, + "grad_norm": 0.45126965641975403, + "learning_rate": 7.178562783037172e-05, + "loss": 1.8563, + "step": 12245 + }, + { + "epoch": 3.758747697974217, + "grad_norm": 0.2747548818588257, + "learning_rate": 7.178115379204921e-05, + "loss": 1.7179, + "step": 12246 + }, + { + "epoch": 3.7590546347452425, + "grad_norm": 0.43243977427482605, + "learning_rate": 7.177667953847257e-05, + "loss": 1.8157, + "step": 12247 + }, + { + "epoch": 3.7593615715162674, + "grad_norm": 0.529448390007019, + "learning_rate": 7.177220506968602e-05, + "loss": 1.8113, + "step": 12248 + }, + { + "epoch": 3.7596685082872927, + "grad_norm": 0.3099314868450165, + "learning_rate": 7.176773038573377e-05, + "loss": 1.7833, + "step": 12249 + }, + { + "epoch": 3.759975445058318, + "grad_norm": 0.3111872375011444, + "learning_rate": 7.176325548666004e-05, + "loss": 1.7965, + "step": 12250 + }, + { + "epoch": 3.7602823818293434, + "grad_norm": 0.38437551259994507, + "learning_rate": 7.175878037250907e-05, + "loss": 1.7822, + "step": 12251 + }, + { + "epoch": 3.7605893186003683, + "grad_norm": 0.33643704652786255, + "learning_rate": 7.175430504332509e-05, + "loss": 1.7839, + "step": 12252 + }, + { + "epoch": 3.7608962553713936, + "grad_norm": 0.24705304205417633, + "learning_rate": 7.174982949915232e-05, + "loss": 1.8302, + "step": 12253 + }, + { + "epoch": 3.7612031921424185, + "grad_norm": 0.3615458309650421, + "learning_rate": 7.174535374003497e-05, + "loss": 1.7963, + "step": 12254 + }, + { + "epoch": 3.761510128913444, + "grad_norm": 0.36486589908599854, + "learning_rate": 7.17408777660173e-05, + "loss": 1.7933, + "step": 12255 + }, + { + "epoch": 3.761817065684469, + "grad_norm": 0.2566867172718048, + "learning_rate": 7.173640157714352e-05, + "loss": 
1.7254, + "step": 12256 + }, + { + "epoch": 3.762124002455494, + "grad_norm": 0.2602523863315582, + "learning_rate": 7.17319251734579e-05, + "loss": 1.7357, + "step": 12257 + }, + { + "epoch": 3.7624309392265194, + "grad_norm": 0.3626105785369873, + "learning_rate": 7.172744855500464e-05, + "loss": 1.7971, + "step": 12258 + }, + { + "epoch": 3.7627378759975443, + "grad_norm": 0.36327603459358215, + "learning_rate": 7.172297172182802e-05, + "loss": 1.7819, + "step": 12259 + }, + { + "epoch": 3.7630448127685696, + "grad_norm": 0.25935736298561096, + "learning_rate": 7.171849467397224e-05, + "loss": 1.8112, + "step": 12260 + }, + { + "epoch": 3.763351749539595, + "grad_norm": 0.2779700756072998, + "learning_rate": 7.171401741148156e-05, + "loss": 1.786, + "step": 12261 + }, + { + "epoch": 3.7636586863106203, + "grad_norm": 0.3089013695716858, + "learning_rate": 7.170953993440025e-05, + "loss": 1.7808, + "step": 12262 + }, + { + "epoch": 3.763965623081645, + "grad_norm": 0.2562308609485626, + "learning_rate": 7.170506224277253e-05, + "loss": 1.8207, + "step": 12263 + }, + { + "epoch": 3.7642725598526705, + "grad_norm": 0.2907634973526001, + "learning_rate": 7.170058433664268e-05, + "loss": 1.7638, + "step": 12264 + }, + { + "epoch": 3.7645794966236954, + "grad_norm": 0.30341312289237976, + "learning_rate": 7.169610621605493e-05, + "loss": 1.7827, + "step": 12265 + }, + { + "epoch": 3.7648864333947207, + "grad_norm": 0.27091866731643677, + "learning_rate": 7.169162788105353e-05, + "loss": 1.786, + "step": 12266 + }, + { + "epoch": 3.765193370165746, + "grad_norm": 0.234042689204216, + "learning_rate": 7.168714933168277e-05, + "loss": 1.7638, + "step": 12267 + }, + { + "epoch": 3.765500306936771, + "grad_norm": 0.2477465271949768, + "learning_rate": 7.168267056798686e-05, + "loss": 1.7275, + "step": 12268 + }, + { + "epoch": 3.7658072437077963, + "grad_norm": 0.25578543543815613, + "learning_rate": 7.167819159001012e-05, + "loss": 1.7831, + "step": 12269 + }, + { + 
"epoch": 3.766114180478821, + "grad_norm": 0.26629674434661865, + "learning_rate": 7.167371239779678e-05, + "loss": 1.7866, + "step": 12270 + }, + { + "epoch": 3.7664211172498465, + "grad_norm": 0.31350967288017273, + "learning_rate": 7.16692329913911e-05, + "loss": 1.7755, + "step": 12271 + }, + { + "epoch": 3.766728054020872, + "grad_norm": 0.2670116126537323, + "learning_rate": 7.166475337083735e-05, + "loss": 1.7524, + "step": 12272 + }, + { + "epoch": 3.7670349907918967, + "grad_norm": 0.26503682136535645, + "learning_rate": 7.166027353617983e-05, + "loss": 1.7867, + "step": 12273 + }, + { + "epoch": 3.767341927562922, + "grad_norm": 0.3674192428588867, + "learning_rate": 7.165579348746278e-05, + "loss": 1.7604, + "step": 12274 + }, + { + "epoch": 3.767648864333947, + "grad_norm": 0.4120824337005615, + "learning_rate": 7.16513132247305e-05, + "loss": 1.7905, + "step": 12275 + }, + { + "epoch": 3.7679558011049723, + "grad_norm": 0.29074826836586, + "learning_rate": 7.164683274802723e-05, + "loss": 1.7539, + "step": 12276 + }, + { + "epoch": 3.7682627378759976, + "grad_norm": 0.22223204374313354, + "learning_rate": 7.164235205739729e-05, + "loss": 1.755, + "step": 12277 + }, + { + "epoch": 3.768569674647023, + "grad_norm": 0.23997461795806885, + "learning_rate": 7.163787115288494e-05, + "loss": 1.8024, + "step": 12278 + }, + { + "epoch": 3.768876611418048, + "grad_norm": 0.2556418776512146, + "learning_rate": 7.163339003453445e-05, + "loss": 1.7717, + "step": 12279 + }, + { + "epoch": 3.769183548189073, + "grad_norm": 0.3107141852378845, + "learning_rate": 7.162890870239013e-05, + "loss": 1.8257, + "step": 12280 + }, + { + "epoch": 3.769490484960098, + "grad_norm": 0.35293644666671753, + "learning_rate": 7.162442715649627e-05, + "loss": 1.7855, + "step": 12281 + }, + { + "epoch": 3.7697974217311234, + "grad_norm": 0.25989311933517456, + "learning_rate": 7.161994539689713e-05, + "loss": 1.7816, + "step": 12282 + }, + { + "epoch": 3.7701043585021488, + 
"grad_norm": 0.25615137815475464, + "learning_rate": 7.161546342363701e-05, + "loss": 1.7738, + "step": 12283 + }, + { + "epoch": 3.7704112952731736, + "grad_norm": 0.29345229268074036, + "learning_rate": 7.161098123676023e-05, + "loss": 1.8496, + "step": 12284 + }, + { + "epoch": 3.770718232044199, + "grad_norm": 0.2975969612598419, + "learning_rate": 7.160649883631105e-05, + "loss": 1.7342, + "step": 12285 + }, + { + "epoch": 3.771025168815224, + "grad_norm": 0.28458064794540405, + "learning_rate": 7.16020162223338e-05, + "loss": 1.8253, + "step": 12286 + }, + { + "epoch": 3.771332105586249, + "grad_norm": 0.2798703908920288, + "learning_rate": 7.159753339487276e-05, + "loss": 1.746, + "step": 12287 + }, + { + "epoch": 3.7716390423572745, + "grad_norm": 0.380044549703598, + "learning_rate": 7.159305035397223e-05, + "loss": 1.769, + "step": 12288 + }, + { + "epoch": 3.7719459791282994, + "grad_norm": 0.28760263323783875, + "learning_rate": 7.158856709967654e-05, + "loss": 1.7466, + "step": 12289 + }, + { + "epoch": 3.7722529158993248, + "grad_norm": 0.23314130306243896, + "learning_rate": 7.158408363202996e-05, + "loss": 1.7545, + "step": 12290 + }, + { + "epoch": 3.7725598526703497, + "grad_norm": 0.2864209711551666, + "learning_rate": 7.15795999510768e-05, + "loss": 1.7549, + "step": 12291 + }, + { + "epoch": 3.772866789441375, + "grad_norm": 0.2605510354042053, + "learning_rate": 7.15751160568614e-05, + "loss": 1.7684, + "step": 12292 + }, + { + "epoch": 3.7731737262124003, + "grad_norm": 0.2475409358739853, + "learning_rate": 7.157063194942806e-05, + "loss": 1.7841, + "step": 12293 + }, + { + "epoch": 3.7734806629834257, + "grad_norm": 0.22479289770126343, + "learning_rate": 7.15661476288211e-05, + "loss": 1.7592, + "step": 12294 + }, + { + "epoch": 3.7737875997544506, + "grad_norm": 0.22076937556266785, + "learning_rate": 7.156166309508482e-05, + "loss": 1.7853, + "step": 12295 + }, + { + "epoch": 3.774094536525476, + "grad_norm": 0.26082465052604675, + 
"learning_rate": 7.155717834826353e-05, + "loss": 1.7828, + "step": 12296 + }, + { + "epoch": 3.7744014732965008, + "grad_norm": 0.24771755933761597, + "learning_rate": 7.15526933884016e-05, + "loss": 1.758, + "step": 12297 + }, + { + "epoch": 3.774708410067526, + "grad_norm": 0.23806311190128326, + "learning_rate": 7.15482082155433e-05, + "loss": 1.7237, + "step": 12298 + }, + { + "epoch": 3.7750153468385514, + "grad_norm": 0.24822844564914703, + "learning_rate": 7.154372282973299e-05, + "loss": 1.7828, + "step": 12299 + }, + { + "epoch": 3.7753222836095763, + "grad_norm": 0.24423740804195404, + "learning_rate": 7.153923723101496e-05, + "loss": 1.8014, + "step": 12300 + }, + { + "epoch": 3.7756292203806017, + "grad_norm": 0.24966634809970856, + "learning_rate": 7.15347514194336e-05, + "loss": 1.8005, + "step": 12301 + }, + { + "epoch": 3.7759361571516266, + "grad_norm": 0.2549348473548889, + "learning_rate": 7.153026539503317e-05, + "loss": 1.8473, + "step": 12302 + }, + { + "epoch": 3.776243093922652, + "grad_norm": 0.23709465563297272, + "learning_rate": 7.152577915785807e-05, + "loss": 1.8031, + "step": 12303 + }, + { + "epoch": 3.7765500306936772, + "grad_norm": 0.28554168343544006, + "learning_rate": 7.152129270795258e-05, + "loss": 1.7836, + "step": 12304 + }, + { + "epoch": 3.776856967464702, + "grad_norm": 0.2568756639957428, + "learning_rate": 7.151680604536107e-05, + "loss": 1.7345, + "step": 12305 + }, + { + "epoch": 3.7771639042357275, + "grad_norm": 0.23883797228336334, + "learning_rate": 7.151231917012787e-05, + "loss": 1.7342, + "step": 12306 + }, + { + "epoch": 3.7774708410067523, + "grad_norm": 0.24026677012443542, + "learning_rate": 7.150783208229732e-05, + "loss": 1.8156, + "step": 12307 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.25756222009658813, + "learning_rate": 7.150334478191376e-05, + "loss": 1.8204, + "step": 12308 + }, + { + "epoch": 3.778084714548803, + "grad_norm": 0.24917428195476532, + "learning_rate": 
7.149885726902156e-05, + "loss": 1.7867, + "step": 12309 + }, + { + "epoch": 3.7783916513198283, + "grad_norm": 0.26269277930259705, + "learning_rate": 7.149436954366504e-05, + "loss": 1.8233, + "step": 12310 + }, + { + "epoch": 3.7786985880908532, + "grad_norm": 0.2502293586730957, + "learning_rate": 7.148988160588857e-05, + "loss": 1.8329, + "step": 12311 + }, + { + "epoch": 3.7790055248618786, + "grad_norm": 0.24845796823501587, + "learning_rate": 7.14853934557365e-05, + "loss": 1.7936, + "step": 12312 + }, + { + "epoch": 3.7793124616329035, + "grad_norm": 0.2453537881374359, + "learning_rate": 7.148090509325315e-05, + "loss": 1.8149, + "step": 12313 + }, + { + "epoch": 3.779619398403929, + "grad_norm": 0.2336922138929367, + "learning_rate": 7.147641651848293e-05, + "loss": 1.7826, + "step": 12314 + }, + { + "epoch": 3.779926335174954, + "grad_norm": 0.25542667508125305, + "learning_rate": 7.147192773147017e-05, + "loss": 1.801, + "step": 12315 + }, + { + "epoch": 3.780233271945979, + "grad_norm": 0.2301866114139557, + "learning_rate": 7.146743873225923e-05, + "loss": 1.7302, + "step": 12316 + }, + { + "epoch": 3.7805402087170044, + "grad_norm": 0.25821468234062195, + "learning_rate": 7.14629495208945e-05, + "loss": 1.7704, + "step": 12317 + }, + { + "epoch": 3.7808471454880292, + "grad_norm": 0.22537970542907715, + "learning_rate": 7.145846009742029e-05, + "loss": 1.7281, + "step": 12318 + }, + { + "epoch": 3.7811540822590546, + "grad_norm": 0.2565869688987732, + "learning_rate": 7.145397046188102e-05, + "loss": 1.8077, + "step": 12319 + }, + { + "epoch": 3.78146101903008, + "grad_norm": 0.2588396966457367, + "learning_rate": 7.144948061432105e-05, + "loss": 1.7438, + "step": 12320 + }, + { + "epoch": 3.781767955801105, + "grad_norm": 0.2538135349750519, + "learning_rate": 7.144499055478472e-05, + "loss": 1.8253, + "step": 12321 + }, + { + "epoch": 3.78207489257213, + "grad_norm": 0.2272680401802063, + "learning_rate": 7.144050028331644e-05, + "loss": 1.7408, + 
"step": 12322 + }, + { + "epoch": 3.782381829343155, + "grad_norm": 0.25010406970977783, + "learning_rate": 7.143600979996055e-05, + "loss": 1.8219, + "step": 12323 + }, + { + "epoch": 3.7826887661141804, + "grad_norm": 0.2560291290283203, + "learning_rate": 7.143151910476144e-05, + "loss": 1.7734, + "step": 12324 + }, + { + "epoch": 3.7829957028852057, + "grad_norm": 0.24927431344985962, + "learning_rate": 7.142702819776352e-05, + "loss": 1.7682, + "step": 12325 + }, + { + "epoch": 3.783302639656231, + "grad_norm": 0.2501368224620819, + "learning_rate": 7.142253707901114e-05, + "loss": 1.818, + "step": 12326 + }, + { + "epoch": 3.783609576427256, + "grad_norm": 0.3132917284965515, + "learning_rate": 7.141804574854871e-05, + "loss": 1.7793, + "step": 12327 + }, + { + "epoch": 3.7839165131982813, + "grad_norm": 0.24229925870895386, + "learning_rate": 7.141355420642057e-05, + "loss": 1.7585, + "step": 12328 + }, + { + "epoch": 3.784223449969306, + "grad_norm": 0.22612906992435455, + "learning_rate": 7.140906245267116e-05, + "loss": 1.7374, + "step": 12329 + }, + { + "epoch": 3.7845303867403315, + "grad_norm": 0.26354333758354187, + "learning_rate": 7.140457048734482e-05, + "loss": 1.7751, + "step": 12330 + }, + { + "epoch": 3.784837323511357, + "grad_norm": 0.21500451862812042, + "learning_rate": 7.140007831048599e-05, + "loss": 1.7827, + "step": 12331 + }, + { + "epoch": 3.7851442602823817, + "grad_norm": 0.2826332151889801, + "learning_rate": 7.139558592213904e-05, + "loss": 1.7522, + "step": 12332 + }, + { + "epoch": 3.785451197053407, + "grad_norm": 0.3217725455760956, + "learning_rate": 7.139109332234837e-05, + "loss": 1.8758, + "step": 12333 + }, + { + "epoch": 3.785758133824432, + "grad_norm": 0.26934614777565, + "learning_rate": 7.138660051115837e-05, + "loss": 1.8322, + "step": 12334 + }, + { + "epoch": 3.7860650705954573, + "grad_norm": 0.2653827667236328, + "learning_rate": 7.138210748861346e-05, + "loss": 1.7651, + "step": 12335 + }, + { + "epoch": 
3.7863720073664826, + "grad_norm": 0.30470311641693115, + "learning_rate": 7.137761425475802e-05, + "loss": 1.855, + "step": 12336 + }, + { + "epoch": 3.786678944137508, + "grad_norm": 0.2558726370334625, + "learning_rate": 7.137312080963647e-05, + "loss": 1.7174, + "step": 12337 + }, + { + "epoch": 3.786985880908533, + "grad_norm": 0.24025602638721466, + "learning_rate": 7.136862715329322e-05, + "loss": 1.7565, + "step": 12338 + }, + { + "epoch": 3.787292817679558, + "grad_norm": 0.34205392003059387, + "learning_rate": 7.136413328577267e-05, + "loss": 1.8116, + "step": 12339 + }, + { + "epoch": 3.787599754450583, + "grad_norm": 0.4069152772426605, + "learning_rate": 7.135963920711923e-05, + "loss": 1.7662, + "step": 12340 + }, + { + "epoch": 3.7879066912216084, + "grad_norm": 0.3915627598762512, + "learning_rate": 7.13551449173773e-05, + "loss": 1.81, + "step": 12341 + }, + { + "epoch": 3.7882136279926337, + "grad_norm": 0.27136507630348206, + "learning_rate": 7.135065041659134e-05, + "loss": 1.7845, + "step": 12342 + }, + { + "epoch": 3.7885205647636586, + "grad_norm": 0.2924078106880188, + "learning_rate": 7.134615570480572e-05, + "loss": 1.8606, + "step": 12343 + }, + { + "epoch": 3.788827501534684, + "grad_norm": 0.35581526160240173, + "learning_rate": 7.134166078206488e-05, + "loss": 1.7785, + "step": 12344 + }, + { + "epoch": 3.789134438305709, + "grad_norm": 0.3003756105899811, + "learning_rate": 7.133716564841324e-05, + "loss": 1.7321, + "step": 12345 + }, + { + "epoch": 3.789441375076734, + "grad_norm": 0.2586000859737396, + "learning_rate": 7.133267030389524e-05, + "loss": 1.7889, + "step": 12346 + }, + { + "epoch": 3.7897483118477595, + "grad_norm": 0.28053075075149536, + "learning_rate": 7.132817474855527e-05, + "loss": 1.8216, + "step": 12347 + }, + { + "epoch": 3.7900552486187844, + "grad_norm": 0.3064870834350586, + "learning_rate": 7.132367898243777e-05, + "loss": 1.7528, + "step": 12348 + }, + { + "epoch": 3.7903621853898097, + "grad_norm": 
0.3045158386230469, + "learning_rate": 7.131918300558719e-05, + "loss": 1.8251, + "step": 12349 + }, + { + "epoch": 3.7906691221608346, + "grad_norm": 0.2438485324382782, + "learning_rate": 7.131468681804794e-05, + "loss": 1.7505, + "step": 12350 + }, + { + "epoch": 3.79097605893186, + "grad_norm": 0.24239958822727203, + "learning_rate": 7.131019041986447e-05, + "loss": 1.7544, + "step": 12351 + }, + { + "epoch": 3.7912829957028853, + "grad_norm": 0.24632441997528076, + "learning_rate": 7.130569381108121e-05, + "loss": 1.7485, + "step": 12352 + }, + { + "epoch": 3.7915899324739106, + "grad_norm": 0.22553624212741852, + "learning_rate": 7.13011969917426e-05, + "loss": 1.803, + "step": 12353 + }, + { + "epoch": 3.7918968692449355, + "grad_norm": 0.2164420485496521, + "learning_rate": 7.129669996189306e-05, + "loss": 1.7307, + "step": 12354 + }, + { + "epoch": 3.792203806015961, + "grad_norm": 0.25104281306266785, + "learning_rate": 7.129220272157705e-05, + "loss": 1.8154, + "step": 12355 + }, + { + "epoch": 3.7925107427869857, + "grad_norm": 0.25533202290534973, + "learning_rate": 7.128770527083903e-05, + "loss": 1.8046, + "step": 12356 + }, + { + "epoch": 3.792817679558011, + "grad_norm": 0.24428130686283112, + "learning_rate": 7.128320760972341e-05, + "loss": 1.7984, + "step": 12357 + }, + { + "epoch": 3.7931246163290364, + "grad_norm": 0.2366408109664917, + "learning_rate": 7.127870973827467e-05, + "loss": 1.7781, + "step": 12358 + }, + { + "epoch": 3.7934315531000613, + "grad_norm": 0.2558888792991638, + "learning_rate": 7.127421165653722e-05, + "loss": 1.7858, + "step": 12359 + }, + { + "epoch": 3.7937384898710866, + "grad_norm": 0.25825443863868713, + "learning_rate": 7.126971336455558e-05, + "loss": 1.8292, + "step": 12360 + }, + { + "epoch": 3.7940454266421115, + "grad_norm": 0.2554624080657959, + "learning_rate": 7.126521486237415e-05, + "loss": 1.822, + "step": 12361 + }, + { + "epoch": 3.794352363413137, + "grad_norm": 0.3030763268470764, + 
"learning_rate": 7.126071615003742e-05, + "loss": 1.8261, + "step": 12362 + }, + { + "epoch": 3.794659300184162, + "grad_norm": 0.3047907054424286, + "learning_rate": 7.125621722758981e-05, + "loss": 1.8419, + "step": 12363 + }, + { + "epoch": 3.794966236955187, + "grad_norm": 0.27782654762268066, + "learning_rate": 7.12517180950758e-05, + "loss": 1.7959, + "step": 12364 + }, + { + "epoch": 3.7952731737262124, + "grad_norm": 0.24526572227478027, + "learning_rate": 7.124721875253986e-05, + "loss": 1.7313, + "step": 12365 + }, + { + "epoch": 3.7955801104972373, + "grad_norm": 0.23718179762363434, + "learning_rate": 7.124271920002646e-05, + "loss": 1.7479, + "step": 12366 + }, + { + "epoch": 3.7958870472682626, + "grad_norm": 0.2880019247531891, + "learning_rate": 7.123821943758004e-05, + "loss": 1.7792, + "step": 12367 + }, + { + "epoch": 3.796193984039288, + "grad_norm": 0.28923723101615906, + "learning_rate": 7.123371946524511e-05, + "loss": 1.7474, + "step": 12368 + }, + { + "epoch": 3.7965009208103133, + "grad_norm": 0.2281525880098343, + "learning_rate": 7.122921928306612e-05, + "loss": 1.8106, + "step": 12369 + }, + { + "epoch": 3.796807857581338, + "grad_norm": 0.34825438261032104, + "learning_rate": 7.122471889108752e-05, + "loss": 1.8076, + "step": 12370 + }, + { + "epoch": 3.7971147943523635, + "grad_norm": 0.41145995259284973, + "learning_rate": 7.122021828935382e-05, + "loss": 1.7692, + "step": 12371 + }, + { + "epoch": 3.7974217311233884, + "grad_norm": 0.31711262464523315, + "learning_rate": 7.12157174779095e-05, + "loss": 1.8101, + "step": 12372 + }, + { + "epoch": 3.7977286678944138, + "grad_norm": 0.3044308125972748, + "learning_rate": 7.1211216456799e-05, + "loss": 1.8238, + "step": 12373 + }, + { + "epoch": 3.798035604665439, + "grad_norm": 0.3750055134296417, + "learning_rate": 7.120671522606683e-05, + "loss": 1.7323, + "step": 12374 + }, + { + "epoch": 3.798342541436464, + "grad_norm": 0.38852599263191223, + "learning_rate": 
7.120221378575749e-05, + "loss": 1.8402, + "step": 12375 + }, + { + "epoch": 3.7986494782074893, + "grad_norm": 0.3430371582508087, + "learning_rate": 7.119771213591541e-05, + "loss": 1.8369, + "step": 12376 + }, + { + "epoch": 3.798956414978514, + "grad_norm": 0.4787428677082062, + "learning_rate": 7.119321027658515e-05, + "loss": 1.7977, + "step": 12377 + }, + { + "epoch": 3.7992633517495396, + "grad_norm": 0.4263977110385895, + "learning_rate": 7.118870820781114e-05, + "loss": 1.8208, + "step": 12378 + }, + { + "epoch": 3.799570288520565, + "grad_norm": 0.28649669885635376, + "learning_rate": 7.118420592963793e-05, + "loss": 1.773, + "step": 12379 + }, + { + "epoch": 3.7998772252915898, + "grad_norm": 0.26070261001586914, + "learning_rate": 7.117970344210996e-05, + "loss": 1.6866, + "step": 12380 + }, + { + "epoch": 3.800184162062615, + "grad_norm": 0.30127593874931335, + "learning_rate": 7.117520074527173e-05, + "loss": 1.7208, + "step": 12381 + }, + { + "epoch": 3.80049109883364, + "grad_norm": 0.23639258742332458, + "learning_rate": 7.117069783916777e-05, + "loss": 1.7504, + "step": 12382 + }, + { + "epoch": 3.8007980356046653, + "grad_norm": 0.2852858901023865, + "learning_rate": 7.116619472384256e-05, + "loss": 1.7954, + "step": 12383 + }, + { + "epoch": 3.8011049723756907, + "grad_norm": 0.2673225998878479, + "learning_rate": 7.116169139934063e-05, + "loss": 1.7562, + "step": 12384 + }, + { + "epoch": 3.801411909146716, + "grad_norm": 0.21615394949913025, + "learning_rate": 7.115718786570644e-05, + "loss": 1.7126, + "step": 12385 + }, + { + "epoch": 3.801718845917741, + "grad_norm": 0.2165435254573822, + "learning_rate": 7.115268412298453e-05, + "loss": 1.7171, + "step": 12386 + }, + { + "epoch": 3.8020257826887662, + "grad_norm": 0.280564546585083, + "learning_rate": 7.114818017121939e-05, + "loss": 1.7711, + "step": 12387 + }, + { + "epoch": 3.802332719459791, + "grad_norm": 0.3023521304130554, + "learning_rate": 7.114367601045555e-05, + "loss": 1.7538, 
+ "step": 12388 + }, + { + "epoch": 3.8026396562308165, + "grad_norm": 0.27252480387687683, + "learning_rate": 7.11391716407375e-05, + "loss": 1.7604, + "step": 12389 + }, + { + "epoch": 3.802946593001842, + "grad_norm": 0.2122909128665924, + "learning_rate": 7.113466706210976e-05, + "loss": 1.716, + "step": 12390 + }, + { + "epoch": 3.8032535297728667, + "grad_norm": 0.30141574144363403, + "learning_rate": 7.113016227461686e-05, + "loss": 1.7636, + "step": 12391 + }, + { + "epoch": 3.803560466543892, + "grad_norm": 0.33359697461128235, + "learning_rate": 7.112565727830331e-05, + "loss": 1.7805, + "step": 12392 + }, + { + "epoch": 3.803867403314917, + "grad_norm": 0.3161376714706421, + "learning_rate": 7.112115207321364e-05, + "loss": 1.7974, + "step": 12393 + }, + { + "epoch": 3.8041743400859422, + "grad_norm": 0.29028698801994324, + "learning_rate": 7.111664665939235e-05, + "loss": 1.83, + "step": 12394 + }, + { + "epoch": 3.8044812768569676, + "grad_norm": 0.38829556107521057, + "learning_rate": 7.1112141036884e-05, + "loss": 1.8684, + "step": 12395 + }, + { + "epoch": 3.804788213627993, + "grad_norm": 0.4118283987045288, + "learning_rate": 7.110763520573309e-05, + "loss": 1.7812, + "step": 12396 + }, + { + "epoch": 3.805095150399018, + "grad_norm": 0.3907717168331146, + "learning_rate": 7.110312916598416e-05, + "loss": 1.7789, + "step": 12397 + }, + { + "epoch": 3.805402087170043, + "grad_norm": 0.2768644690513611, + "learning_rate": 7.109862291768173e-05, + "loss": 1.8575, + "step": 12398 + }, + { + "epoch": 3.805709023941068, + "grad_norm": 0.3234006464481354, + "learning_rate": 7.109411646087035e-05, + "loss": 1.7485, + "step": 12399 + }, + { + "epoch": 3.8060159607120934, + "grad_norm": 0.415475994348526, + "learning_rate": 7.108960979559454e-05, + "loss": 1.7363, + "step": 12400 + }, + { + "epoch": 3.8063228974831187, + "grad_norm": 0.38654613494873047, + "learning_rate": 7.108510292189884e-05, + "loss": 1.7907, + "step": 12401 + }, + { + "epoch": 
3.8066298342541436, + "grad_norm": 0.2541481852531433, + "learning_rate": 7.10805958398278e-05, + "loss": 1.8458, + "step": 12402 + }, + { + "epoch": 3.806936771025169, + "grad_norm": 0.32562851905822754, + "learning_rate": 7.107608854942597e-05, + "loss": 1.7989, + "step": 12403 + }, + { + "epoch": 3.807243707796194, + "grad_norm": 0.3628395199775696, + "learning_rate": 7.107158105073786e-05, + "loss": 1.8044, + "step": 12404 + }, + { + "epoch": 3.807550644567219, + "grad_norm": 0.3363969027996063, + "learning_rate": 7.106707334380805e-05, + "loss": 1.8078, + "step": 12405 + }, + { + "epoch": 3.8078575813382445, + "grad_norm": 0.2853989601135254, + "learning_rate": 7.106256542868108e-05, + "loss": 1.7913, + "step": 12406 + }, + { + "epoch": 3.8081645181092694, + "grad_norm": 0.33455806970596313, + "learning_rate": 7.105805730540148e-05, + "loss": 1.7252, + "step": 12407 + }, + { + "epoch": 3.8084714548802947, + "grad_norm": 0.28103405237197876, + "learning_rate": 7.105354897401382e-05, + "loss": 1.6942, + "step": 12408 + }, + { + "epoch": 3.8087783916513196, + "grad_norm": 0.23230718076229095, + "learning_rate": 7.104904043456264e-05, + "loss": 1.7723, + "step": 12409 + }, + { + "epoch": 3.809085328422345, + "grad_norm": 0.2883053421974182, + "learning_rate": 7.104453168709251e-05, + "loss": 1.8015, + "step": 12410 + }, + { + "epoch": 3.8093922651933703, + "grad_norm": 0.28462252020835876, + "learning_rate": 7.104002273164798e-05, + "loss": 1.791, + "step": 12411 + }, + { + "epoch": 3.8096992019643956, + "grad_norm": 0.3004699647426605, + "learning_rate": 7.103551356827363e-05, + "loss": 1.8401, + "step": 12412 + }, + { + "epoch": 3.8100061387354205, + "grad_norm": 0.2546156048774719, + "learning_rate": 7.1031004197014e-05, + "loss": 1.7645, + "step": 12413 + }, + { + "epoch": 3.810313075506446, + "grad_norm": 0.24532915651798248, + "learning_rate": 7.102649461791364e-05, + "loss": 1.8, + "step": 12414 + }, + { + "epoch": 3.8106200122774707, + "grad_norm": 
0.2432405799627304, + "learning_rate": 7.102198483101716e-05, + "loss": 1.7957, + "step": 12415 + }, + { + "epoch": 3.810926949048496, + "grad_norm": 0.24405215680599213, + "learning_rate": 7.101747483636908e-05, + "loss": 1.79, + "step": 12416 + }, + { + "epoch": 3.8112338858195214, + "grad_norm": 0.29519838094711304, + "learning_rate": 7.101296463401401e-05, + "loss": 1.8087, + "step": 12417 + }, + { + "epoch": 3.8115408225905463, + "grad_norm": 0.28205612301826477, + "learning_rate": 7.100845422399652e-05, + "loss": 1.7897, + "step": 12418 + }, + { + "epoch": 3.8118477593615716, + "grad_norm": 0.25014567375183105, + "learning_rate": 7.100394360636115e-05, + "loss": 1.7574, + "step": 12419 + }, + { + "epoch": 3.8121546961325965, + "grad_norm": 0.3133499026298523, + "learning_rate": 7.099943278115251e-05, + "loss": 1.7957, + "step": 12420 + }, + { + "epoch": 3.812461632903622, + "grad_norm": 0.3706473708152771, + "learning_rate": 7.099492174841516e-05, + "loss": 1.8519, + "step": 12421 + }, + { + "epoch": 3.812768569674647, + "grad_norm": 0.30085715651512146, + "learning_rate": 7.09904105081937e-05, + "loss": 1.778, + "step": 12422 + }, + { + "epoch": 3.813075506445672, + "grad_norm": 0.23897981643676758, + "learning_rate": 7.09858990605327e-05, + "loss": 1.7289, + "step": 12423 + }, + { + "epoch": 3.8133824432166974, + "grad_norm": 0.30046290159225464, + "learning_rate": 7.098138740547673e-05, + "loss": 1.8838, + "step": 12424 + }, + { + "epoch": 3.8136893799877223, + "grad_norm": 0.32126328349113464, + "learning_rate": 7.097687554307041e-05, + "loss": 1.7916, + "step": 12425 + }, + { + "epoch": 3.8139963167587476, + "grad_norm": 0.2922256886959076, + "learning_rate": 7.097236347335829e-05, + "loss": 1.8305, + "step": 12426 + }, + { + "epoch": 3.814303253529773, + "grad_norm": 0.2772706151008606, + "learning_rate": 7.0967851196385e-05, + "loss": 1.7694, + "step": 12427 + }, + { + "epoch": 3.8146101903007983, + "grad_norm": 0.25763455033302307, + "learning_rate": 
7.096333871219511e-05, + "loss": 1.8716, + "step": 12428 + }, + { + "epoch": 3.814917127071823, + "grad_norm": 0.2631739377975464, + "learning_rate": 7.095882602083322e-05, + "loss": 1.7771, + "step": 12429 + }, + { + "epoch": 3.8152240638428485, + "grad_norm": 0.29229632019996643, + "learning_rate": 7.095431312234392e-05, + "loss": 1.7865, + "step": 12430 + }, + { + "epoch": 3.8155310006138734, + "grad_norm": 0.2672729790210724, + "learning_rate": 7.094980001677181e-05, + "loss": 1.7848, + "step": 12431 + }, + { + "epoch": 3.8158379373848987, + "grad_norm": 0.2388373166322708, + "learning_rate": 7.094528670416152e-05, + "loss": 1.75, + "step": 12432 + }, + { + "epoch": 3.816144874155924, + "grad_norm": 0.2385305017232895, + "learning_rate": 7.094077318455762e-05, + "loss": 1.748, + "step": 12433 + }, + { + "epoch": 3.816451810926949, + "grad_norm": 0.25421401858329773, + "learning_rate": 7.093625945800471e-05, + "loss": 1.779, + "step": 12434 + }, + { + "epoch": 3.8167587476979743, + "grad_norm": 0.2785158157348633, + "learning_rate": 7.093174552454743e-05, + "loss": 1.8295, + "step": 12435 + }, + { + "epoch": 3.817065684468999, + "grad_norm": 0.2907472252845764, + "learning_rate": 7.092723138423036e-05, + "loss": 1.8216, + "step": 12436 + }, + { + "epoch": 3.8173726212400245, + "grad_norm": 0.253955215215683, + "learning_rate": 7.092271703709814e-05, + "loss": 1.8394, + "step": 12437 + }, + { + "epoch": 3.81767955801105, + "grad_norm": 0.32139912247657776, + "learning_rate": 7.091820248319537e-05, + "loss": 1.8634, + "step": 12438 + }, + { + "epoch": 3.8179864947820747, + "grad_norm": 0.25890466570854187, + "learning_rate": 7.091368772256664e-05, + "loss": 1.7336, + "step": 12439 + }, + { + "epoch": 3.8182934315531, + "grad_norm": 0.2823775112628937, + "learning_rate": 7.090917275525661e-05, + "loss": 1.7927, + "step": 12440 + }, + { + "epoch": 3.818600368324125, + "grad_norm": 0.28739333152770996, + "learning_rate": 7.090465758130988e-05, + "loss": 1.7807, + 
"step": 12441 + }, + { + "epoch": 3.8189073050951503, + "grad_norm": 0.36823949217796326, + "learning_rate": 7.090014220077106e-05, + "loss": 1.7288, + "step": 12442 + }, + { + "epoch": 3.8192142418661756, + "grad_norm": 0.3061312735080719, + "learning_rate": 7.089562661368479e-05, + "loss": 1.8039, + "step": 12443 + }, + { + "epoch": 3.819521178637201, + "grad_norm": 0.25867924094200134, + "learning_rate": 7.089111082009569e-05, + "loss": 1.7678, + "step": 12444 + }, + { + "epoch": 3.819828115408226, + "grad_norm": 0.26834985613822937, + "learning_rate": 7.088659482004837e-05, + "loss": 1.7592, + "step": 12445 + }, + { + "epoch": 3.820135052179251, + "grad_norm": 0.25608211755752563, + "learning_rate": 7.08820786135875e-05, + "loss": 1.7622, + "step": 12446 + }, + { + "epoch": 3.820441988950276, + "grad_norm": 0.2512456774711609, + "learning_rate": 7.087756220075769e-05, + "loss": 1.7648, + "step": 12447 + }, + { + "epoch": 3.8207489257213014, + "grad_norm": 0.2434878647327423, + "learning_rate": 7.087304558160355e-05, + "loss": 1.7435, + "step": 12448 + }, + { + "epoch": 3.8210558624923268, + "grad_norm": 0.26456570625305176, + "learning_rate": 7.086852875616978e-05, + "loss": 1.7342, + "step": 12449 + }, + { + "epoch": 3.8213627992633517, + "grad_norm": 0.2958984971046448, + "learning_rate": 7.086401172450095e-05, + "loss": 1.8532, + "step": 12450 + }, + { + "epoch": 3.821669736034377, + "grad_norm": 0.25939157605171204, + "learning_rate": 7.085949448664172e-05, + "loss": 1.7746, + "step": 12451 + }, + { + "epoch": 3.821976672805402, + "grad_norm": 0.2210223525762558, + "learning_rate": 7.085497704263675e-05, + "loss": 1.7745, + "step": 12452 + }, + { + "epoch": 3.822283609576427, + "grad_norm": 0.2409319430589676, + "learning_rate": 7.085045939253068e-05, + "loss": 1.7981, + "step": 12453 + }, + { + "epoch": 3.8225905463474525, + "grad_norm": 0.26331812143325806, + "learning_rate": 7.084594153636815e-05, + "loss": 1.8163, + "step": 12454 + }, + { + "epoch": 
3.8228974831184774, + "grad_norm": 0.2613828480243683, + "learning_rate": 7.08414234741938e-05, + "loss": 1.8362, + "step": 12455 + }, + { + "epoch": 3.8232044198895028, + "grad_norm": 0.3139529228210449, + "learning_rate": 7.083690520605228e-05, + "loss": 1.8247, + "step": 12456 + }, + { + "epoch": 3.8235113566605277, + "grad_norm": 0.2958570718765259, + "learning_rate": 7.083238673198826e-05, + "loss": 1.8011, + "step": 12457 + }, + { + "epoch": 3.823818293431553, + "grad_norm": 0.2517626881599426, + "learning_rate": 7.082786805204639e-05, + "loss": 1.7353, + "step": 12458 + }, + { + "epoch": 3.8241252302025783, + "grad_norm": 0.2443888783454895, + "learning_rate": 7.082334916627132e-05, + "loss": 1.7916, + "step": 12459 + }, + { + "epoch": 3.8244321669736037, + "grad_norm": 0.283514142036438, + "learning_rate": 7.08188300747077e-05, + "loss": 1.8048, + "step": 12460 + }, + { + "epoch": 3.8247391037446286, + "grad_norm": 0.24775351583957672, + "learning_rate": 7.08143107774002e-05, + "loss": 1.8145, + "step": 12461 + }, + { + "epoch": 3.825046040515654, + "grad_norm": 0.27904003858566284, + "learning_rate": 7.080979127439347e-05, + "loss": 1.8003, + "step": 12462 + }, + { + "epoch": 3.825352977286679, + "grad_norm": 0.24997512996196747, + "learning_rate": 7.08052715657322e-05, + "loss": 1.7962, + "step": 12463 + }, + { + "epoch": 3.825659914057704, + "grad_norm": 0.25874343514442444, + "learning_rate": 7.080075165146104e-05, + "loss": 1.7861, + "step": 12464 + }, + { + "epoch": 3.8259668508287294, + "grad_norm": 0.2964434027671814, + "learning_rate": 7.079623153162467e-05, + "loss": 1.7618, + "step": 12465 + }, + { + "epoch": 3.8262737875997543, + "grad_norm": 0.26403337717056274, + "learning_rate": 7.079171120626774e-05, + "loss": 1.8016, + "step": 12466 + }, + { + "epoch": 3.8265807243707797, + "grad_norm": 0.28369295597076416, + "learning_rate": 7.078719067543494e-05, + "loss": 1.7517, + "step": 12467 + }, + { + "epoch": 3.8268876611418046, + "grad_norm": 
0.254312127828598, + "learning_rate": 7.078266993917093e-05, + "loss": 1.8085, + "step": 12468 + }, + { + "epoch": 3.82719459791283, + "grad_norm": 0.24992622435092926, + "learning_rate": 7.077814899752038e-05, + "loss": 1.7657, + "step": 12469 + }, + { + "epoch": 3.8275015346838552, + "grad_norm": 0.26485762000083923, + "learning_rate": 7.077362785052802e-05, + "loss": 1.7303, + "step": 12470 + }, + { + "epoch": 3.8278084714548806, + "grad_norm": 0.29864901304244995, + "learning_rate": 7.076910649823846e-05, + "loss": 1.7734, + "step": 12471 + }, + { + "epoch": 3.8281154082259055, + "grad_norm": 0.2973599433898926, + "learning_rate": 7.076458494069644e-05, + "loss": 1.8055, + "step": 12472 + }, + { + "epoch": 3.828422344996931, + "grad_norm": 0.2150362730026245, + "learning_rate": 7.07600631779466e-05, + "loss": 1.7377, + "step": 12473 + }, + { + "epoch": 3.8287292817679557, + "grad_norm": 0.26443010568618774, + "learning_rate": 7.075554121003367e-05, + "loss": 1.837, + "step": 12474 + }, + { + "epoch": 3.829036218538981, + "grad_norm": 0.27365007996559143, + "learning_rate": 7.075101903700231e-05, + "loss": 1.7784, + "step": 12475 + }, + { + "epoch": 3.8293431553100064, + "grad_norm": 0.22037263214588165, + "learning_rate": 7.074649665889721e-05, + "loss": 1.8182, + "step": 12476 + }, + { + "epoch": 3.8296500920810312, + "grad_norm": 0.29614946246147156, + "learning_rate": 7.074197407576308e-05, + "loss": 1.7993, + "step": 12477 + }, + { + "epoch": 3.8299570288520566, + "grad_norm": 0.25135520100593567, + "learning_rate": 7.07374512876446e-05, + "loss": 1.8211, + "step": 12478 + }, + { + "epoch": 3.8302639656230815, + "grad_norm": 0.2711503207683563, + "learning_rate": 7.073292829458645e-05, + "loss": 1.8274, + "step": 12479 + }, + { + "epoch": 3.830570902394107, + "grad_norm": 0.38659265637397766, + "learning_rate": 7.072840509663338e-05, + "loss": 1.796, + "step": 12480 + }, + { + "epoch": 3.830877839165132, + "grad_norm": 0.39382728934288025, + 
"learning_rate": 7.072388169383005e-05, + "loss": 1.8439, + "step": 12481 + }, + { + "epoch": 3.831184775936157, + "grad_norm": 0.27570033073425293, + "learning_rate": 7.071935808622118e-05, + "loss": 1.8155, + "step": 12482 + }, + { + "epoch": 3.8314917127071824, + "grad_norm": 0.29054465889930725, + "learning_rate": 7.071483427385147e-05, + "loss": 1.754, + "step": 12483 + }, + { + "epoch": 3.8317986494782073, + "grad_norm": 0.4138031303882599, + "learning_rate": 7.071031025676562e-05, + "loss": 1.7686, + "step": 12484 + }, + { + "epoch": 3.8321055862492326, + "grad_norm": 0.3447251617908478, + "learning_rate": 7.070578603500833e-05, + "loss": 1.8135, + "step": 12485 + }, + { + "epoch": 3.832412523020258, + "grad_norm": 0.265115886926651, + "learning_rate": 7.070126160862436e-05, + "loss": 1.803, + "step": 12486 + }, + { + "epoch": 3.8327194597912833, + "grad_norm": 0.4288817346096039, + "learning_rate": 7.069673697765837e-05, + "loss": 1.7814, + "step": 12487 + }, + { + "epoch": 3.833026396562308, + "grad_norm": 0.4890103340148926, + "learning_rate": 7.06922121421551e-05, + "loss": 1.8318, + "step": 12488 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.3676142990589142, + "learning_rate": 7.068768710215928e-05, + "loss": 1.7792, + "step": 12489 + }, + { + "epoch": 3.8336402701043584, + "grad_norm": 0.23254090547561646, + "learning_rate": 7.068316185771557e-05, + "loss": 1.7154, + "step": 12490 + }, + { + "epoch": 3.8339472068753837, + "grad_norm": 0.35014036297798157, + "learning_rate": 7.067863640886876e-05, + "loss": 1.7031, + "step": 12491 + }, + { + "epoch": 3.834254143646409, + "grad_norm": 0.32155317068099976, + "learning_rate": 7.067411075566353e-05, + "loss": 1.7692, + "step": 12492 + }, + { + "epoch": 3.834561080417434, + "grad_norm": 0.260772705078125, + "learning_rate": 7.066958489814463e-05, + "loss": 1.7488, + "step": 12493 + }, + { + "epoch": 3.8348680171884593, + "grad_norm": 0.2624910771846771, + "learning_rate": 7.066505883635678e-05, + 
"loss": 1.7436, + "step": 12494 + }, + { + "epoch": 3.835174953959484, + "grad_norm": 0.2782299220561981, + "learning_rate": 7.066053257034471e-05, + "loss": 1.8219, + "step": 12495 + }, + { + "epoch": 3.8354818907305095, + "grad_norm": 0.2749497890472412, + "learning_rate": 7.065600610015312e-05, + "loss": 1.8068, + "step": 12496 + }, + { + "epoch": 3.835788827501535, + "grad_norm": 0.2730359733104706, + "learning_rate": 7.06514794258268e-05, + "loss": 1.7588, + "step": 12497 + }, + { + "epoch": 3.8360957642725597, + "grad_norm": 0.3606291711330414, + "learning_rate": 7.064695254741044e-05, + "loss": 1.8509, + "step": 12498 + }, + { + "epoch": 3.836402701043585, + "grad_norm": 0.23282989859580994, + "learning_rate": 7.064242546494879e-05, + "loss": 1.7444, + "step": 12499 + }, + { + "epoch": 3.83670963781461, + "grad_norm": 0.2554507255554199, + "learning_rate": 7.06378981784866e-05, + "loss": 1.7486, + "step": 12500 + }, + { + "epoch": 3.8370165745856353, + "grad_norm": 0.2916143834590912, + "learning_rate": 7.06333706880686e-05, + "loss": 1.8035, + "step": 12501 + }, + { + "epoch": 3.8373235113566606, + "grad_norm": 0.23719090223312378, + "learning_rate": 7.062884299373955e-05, + "loss": 1.7896, + "step": 12502 + }, + { + "epoch": 3.837630448127686, + "grad_norm": 0.2596152126789093, + "learning_rate": 7.062431509554417e-05, + "loss": 1.7944, + "step": 12503 + }, + { + "epoch": 3.837937384898711, + "grad_norm": 0.29140764474868774, + "learning_rate": 7.061978699352723e-05, + "loss": 1.7988, + "step": 12504 + }, + { + "epoch": 3.838244321669736, + "grad_norm": 0.3421068489551544, + "learning_rate": 7.061525868773347e-05, + "loss": 1.751, + "step": 12505 + }, + { + "epoch": 3.838551258440761, + "grad_norm": 0.2705349624156952, + "learning_rate": 7.061073017820764e-05, + "loss": 1.7578, + "step": 12506 + }, + { + "epoch": 3.8388581952117864, + "grad_norm": 0.2403286248445511, + "learning_rate": 7.060620146499448e-05, + "loss": 1.8422, + "step": 12507 + }, + { + 
"epoch": 3.8391651319828117, + "grad_norm": 0.3860442042350769, + "learning_rate": 7.060167254813876e-05, + "loss": 1.8168, + "step": 12508 + }, + { + "epoch": 3.8394720687538366, + "grad_norm": 0.4729512631893158, + "learning_rate": 7.059714342768526e-05, + "loss": 1.7786, + "step": 12509 + }, + { + "epoch": 3.839779005524862, + "grad_norm": 0.3522968888282776, + "learning_rate": 7.059261410367871e-05, + "loss": 1.8749, + "step": 12510 + }, + { + "epoch": 3.840085942295887, + "grad_norm": 0.28071436285972595, + "learning_rate": 7.058808457616386e-05, + "loss": 1.7959, + "step": 12511 + }, + { + "epoch": 3.840392879066912, + "grad_norm": 0.4356439411640167, + "learning_rate": 7.05835548451855e-05, + "loss": 1.8045, + "step": 12512 + }, + { + "epoch": 3.8406998158379375, + "grad_norm": 0.4051562249660492, + "learning_rate": 7.057902491078839e-05, + "loss": 1.7909, + "step": 12513 + }, + { + "epoch": 3.8410067526089624, + "grad_norm": 0.2817205488681793, + "learning_rate": 7.057449477301728e-05, + "loss": 1.8736, + "step": 12514 + }, + { + "epoch": 3.8413136893799877, + "grad_norm": 0.33369559049606323, + "learning_rate": 7.056996443191697e-05, + "loss": 1.7799, + "step": 12515 + }, + { + "epoch": 3.8416206261510126, + "grad_norm": 0.369954913854599, + "learning_rate": 7.056543388753221e-05, + "loss": 1.795, + "step": 12516 + }, + { + "epoch": 3.841927562922038, + "grad_norm": 0.289474755525589, + "learning_rate": 7.056090313990778e-05, + "loss": 1.786, + "step": 12517 + }, + { + "epoch": 3.8422344996930633, + "grad_norm": 0.2431849092245102, + "learning_rate": 7.055637218908845e-05, + "loss": 1.7363, + "step": 12518 + }, + { + "epoch": 3.8425414364640886, + "grad_norm": 0.3736060857772827, + "learning_rate": 7.0551841035119e-05, + "loss": 1.8234, + "step": 12519 + }, + { + "epoch": 3.8428483732351135, + "grad_norm": 0.34008854627609253, + "learning_rate": 7.054730967804422e-05, + "loss": 1.8001, + "step": 12520 + }, + { + "epoch": 3.843155310006139, + "grad_norm": 
0.24852876365184784, + "learning_rate": 7.054277811790887e-05, + "loss": 1.8298, + "step": 12521 + }, + { + "epoch": 3.8434622467771637, + "grad_norm": 0.3491046726703644, + "learning_rate": 7.053824635475777e-05, + "loss": 1.7336, + "step": 12522 + }, + { + "epoch": 3.843769183548189, + "grad_norm": 0.38757824897766113, + "learning_rate": 7.053371438863566e-05, + "loss": 1.8241, + "step": 12523 + }, + { + "epoch": 3.8440761203192144, + "grad_norm": 0.2607647180557251, + "learning_rate": 7.052918221958735e-05, + "loss": 1.7813, + "step": 12524 + }, + { + "epoch": 3.8443830570902393, + "grad_norm": 0.25634410977363586, + "learning_rate": 7.052464984765764e-05, + "loss": 1.7836, + "step": 12525 + }, + { + "epoch": 3.8446899938612646, + "grad_norm": 0.3113503158092499, + "learning_rate": 7.052011727289129e-05, + "loss": 1.8477, + "step": 12526 + }, + { + "epoch": 3.8449969306322895, + "grad_norm": 0.2852596044540405, + "learning_rate": 7.051558449533313e-05, + "loss": 1.7607, + "step": 12527 + }, + { + "epoch": 3.845303867403315, + "grad_norm": 0.24841541051864624, + "learning_rate": 7.051105151502795e-05, + "loss": 1.8109, + "step": 12528 + }, + { + "epoch": 3.84561080417434, + "grad_norm": 0.2231549620628357, + "learning_rate": 7.050651833202053e-05, + "loss": 1.7245, + "step": 12529 + }, + { + "epoch": 3.845917740945365, + "grad_norm": 0.21975892782211304, + "learning_rate": 7.050198494635566e-05, + "loss": 1.7512, + "step": 12530 + }, + { + "epoch": 3.8462246777163904, + "grad_norm": 0.2546280324459076, + "learning_rate": 7.049745135807816e-05, + "loss": 1.8003, + "step": 12531 + }, + { + "epoch": 3.8465316144874153, + "grad_norm": 0.21507929265499115, + "learning_rate": 7.049291756723284e-05, + "loss": 1.7616, + "step": 12532 + }, + { + "epoch": 3.8468385512584407, + "grad_norm": 0.24927987158298492, + "learning_rate": 7.04883835738645e-05, + "loss": 1.7519, + "step": 12533 + }, + { + "epoch": 3.847145488029466, + "grad_norm": 0.24988602101802826, + 
"learning_rate": 7.048384937801793e-05, + "loss": 1.7966, + "step": 12534 + }, + { + "epoch": 3.8474524248004913, + "grad_norm": 0.24039845168590546, + "learning_rate": 7.047931497973798e-05, + "loss": 1.7834, + "step": 12535 + }, + { + "epoch": 3.847759361571516, + "grad_norm": 0.22826696932315826, + "learning_rate": 7.047478037906943e-05, + "loss": 1.7334, + "step": 12536 + }, + { + "epoch": 3.8480662983425415, + "grad_norm": 0.22260744869709015, + "learning_rate": 7.047024557605708e-05, + "loss": 1.787, + "step": 12537 + }, + { + "epoch": 3.8483732351135664, + "grad_norm": 0.2457917332649231, + "learning_rate": 7.046571057074578e-05, + "loss": 1.7865, + "step": 12538 + }, + { + "epoch": 3.8486801718845918, + "grad_norm": 0.23952928185462952, + "learning_rate": 7.046117536318035e-05, + "loss": 1.7764, + "step": 12539 + }, + { + "epoch": 3.848987108655617, + "grad_norm": 0.22186748683452606, + "learning_rate": 7.045663995340557e-05, + "loss": 1.7917, + "step": 12540 + }, + { + "epoch": 3.849294045426642, + "grad_norm": 0.24234962463378906, + "learning_rate": 7.045210434146629e-05, + "loss": 1.7697, + "step": 12541 + }, + { + "epoch": 3.8496009821976673, + "grad_norm": 0.2510770857334137, + "learning_rate": 7.044756852740732e-05, + "loss": 1.8012, + "step": 12542 + }, + { + "epoch": 3.849907918968692, + "grad_norm": 0.24910703301429749, + "learning_rate": 7.044303251127349e-05, + "loss": 1.831, + "step": 12543 + }, + { + "epoch": 3.8502148557397176, + "grad_norm": 0.3159966468811035, + "learning_rate": 7.043849629310964e-05, + "loss": 1.8029, + "step": 12544 + }, + { + "epoch": 3.850521792510743, + "grad_norm": 0.3155403733253479, + "learning_rate": 7.04339598729606e-05, + "loss": 1.7429, + "step": 12545 + }, + { + "epoch": 3.8508287292817682, + "grad_norm": 0.3037515878677368, + "learning_rate": 7.042942325087117e-05, + "loss": 1.8186, + "step": 12546 + }, + { + "epoch": 3.851135666052793, + "grad_norm": 0.2319766730070114, + "learning_rate": 
7.042488642688621e-05, + "loss": 1.7853, + "step": 12547 + }, + { + "epoch": 3.8514426028238185, + "grad_norm": 0.23911969363689423, + "learning_rate": 7.042034940105055e-05, + "loss": 1.8314, + "step": 12548 + }, + { + "epoch": 3.8517495395948433, + "grad_norm": 0.2541846036911011, + "learning_rate": 7.041581217340905e-05, + "loss": 1.8289, + "step": 12549 + }, + { + "epoch": 3.8520564763658687, + "grad_norm": 0.22234943509101868, + "learning_rate": 7.04112747440065e-05, + "loss": 1.7847, + "step": 12550 + }, + { + "epoch": 3.852363413136894, + "grad_norm": 0.2747870981693268, + "learning_rate": 7.04067371128878e-05, + "loss": 1.7875, + "step": 12551 + }, + { + "epoch": 3.852670349907919, + "grad_norm": 0.28589147329330444, + "learning_rate": 7.040219928009775e-05, + "loss": 1.7289, + "step": 12552 + }, + { + "epoch": 3.8529772866789442, + "grad_norm": 0.21180351078510284, + "learning_rate": 7.039766124568119e-05, + "loss": 1.7611, + "step": 12553 + }, + { + "epoch": 3.853284223449969, + "grad_norm": 0.27751782536506653, + "learning_rate": 7.0393123009683e-05, + "loss": 1.7481, + "step": 12554 + }, + { + "epoch": 3.8535911602209945, + "grad_norm": 0.32883307337760925, + "learning_rate": 7.038858457214802e-05, + "loss": 1.7271, + "step": 12555 + }, + { + "epoch": 3.85389809699202, + "grad_norm": 0.30965641140937805, + "learning_rate": 7.03840459331211e-05, + "loss": 1.81, + "step": 12556 + }, + { + "epoch": 3.8542050337630447, + "grad_norm": 0.25184348225593567, + "learning_rate": 7.037950709264709e-05, + "loss": 1.7642, + "step": 12557 + }, + { + "epoch": 3.85451197053407, + "grad_norm": 0.2376822829246521, + "learning_rate": 7.037496805077084e-05, + "loss": 1.7774, + "step": 12558 + }, + { + "epoch": 3.854818907305095, + "grad_norm": 0.2395993024110794, + "learning_rate": 7.03704288075372e-05, + "loss": 1.8397, + "step": 12559 + }, + { + "epoch": 3.8551258440761202, + "grad_norm": 0.26460394263267517, + "learning_rate": 7.036588936299107e-05, + "loss": 1.7472, + 
"step": 12560 + }, + { + "epoch": 3.8554327808471456, + "grad_norm": 0.34742459654808044, + "learning_rate": 7.036134971717725e-05, + "loss": 1.8003, + "step": 12561 + }, + { + "epoch": 3.855739717618171, + "grad_norm": 0.2829316556453705, + "learning_rate": 7.035680987014068e-05, + "loss": 1.7765, + "step": 12562 + }, + { + "epoch": 3.856046654389196, + "grad_norm": 0.3087223172187805, + "learning_rate": 7.035226982192615e-05, + "loss": 1.8462, + "step": 12563 + }, + { + "epoch": 3.856353591160221, + "grad_norm": 0.2806380093097687, + "learning_rate": 7.034772957257858e-05, + "loss": 1.7704, + "step": 12564 + }, + { + "epoch": 3.856660527931246, + "grad_norm": 0.25598087906837463, + "learning_rate": 7.03431891221428e-05, + "loss": 1.7843, + "step": 12565 + }, + { + "epoch": 3.8569674647022714, + "grad_norm": 0.30833700299263, + "learning_rate": 7.033864847066373e-05, + "loss": 1.8404, + "step": 12566 + }, + { + "epoch": 3.8572744014732967, + "grad_norm": 0.29562532901763916, + "learning_rate": 7.03341076181862e-05, + "loss": 1.8044, + "step": 12567 + }, + { + "epoch": 3.8575813382443216, + "grad_norm": 0.2901719808578491, + "learning_rate": 7.03295665647551e-05, + "loss": 1.7789, + "step": 12568 + }, + { + "epoch": 3.857888275015347, + "grad_norm": 0.25453686714172363, + "learning_rate": 7.03250253104153e-05, + "loss": 1.6792, + "step": 12569 + }, + { + "epoch": 3.858195211786372, + "grad_norm": 0.26009416580200195, + "learning_rate": 7.03204838552117e-05, + "loss": 1.7835, + "step": 12570 + }, + { + "epoch": 3.858502148557397, + "grad_norm": 0.28074127435684204, + "learning_rate": 7.031594219918916e-05, + "loss": 1.7932, + "step": 12571 + }, + { + "epoch": 3.8588090853284225, + "grad_norm": 0.3341725170612335, + "learning_rate": 7.031140034239258e-05, + "loss": 1.7439, + "step": 12572 + }, + { + "epoch": 3.8591160220994474, + "grad_norm": 0.28142449259757996, + "learning_rate": 7.030685828486684e-05, + "loss": 1.8263, + "step": 12573 + }, + { + "epoch": 
3.8594229588704727, + "grad_norm": 0.2571438252925873, + "learning_rate": 7.030231602665681e-05, + "loss": 1.7628, + "step": 12574 + }, + { + "epoch": 3.8597298956414976, + "grad_norm": 0.3079041838645935, + "learning_rate": 7.029777356780741e-05, + "loss": 1.7879, + "step": 12575 + }, + { + "epoch": 3.860036832412523, + "grad_norm": 0.2605433464050293, + "learning_rate": 7.029323090836349e-05, + "loss": 1.7841, + "step": 12576 + }, + { + "epoch": 3.8603437691835483, + "grad_norm": 0.24069640040397644, + "learning_rate": 7.028868804836999e-05, + "loss": 1.7939, + "step": 12577 + }, + { + "epoch": 3.8606507059545736, + "grad_norm": 0.26801639795303345, + "learning_rate": 7.028414498787177e-05, + "loss": 1.8082, + "step": 12578 + }, + { + "epoch": 3.8609576427255985, + "grad_norm": 0.28828585147857666, + "learning_rate": 7.027960172691375e-05, + "loss": 1.8094, + "step": 12579 + }, + { + "epoch": 3.861264579496624, + "grad_norm": 0.22927051782608032, + "learning_rate": 7.027505826554082e-05, + "loss": 1.7758, + "step": 12580 + }, + { + "epoch": 3.8615715162676487, + "grad_norm": 0.25755998492240906, + "learning_rate": 7.027051460379788e-05, + "loss": 1.8429, + "step": 12581 + }, + { + "epoch": 3.861878453038674, + "grad_norm": 0.23636581003665924, + "learning_rate": 7.026597074172982e-05, + "loss": 1.7662, + "step": 12582 + }, + { + "epoch": 3.8621853898096994, + "grad_norm": 0.22599349915981293, + "learning_rate": 7.026142667938156e-05, + "loss": 1.7199, + "step": 12583 + }, + { + "epoch": 3.8624923265807243, + "grad_norm": 0.2504875659942627, + "learning_rate": 7.025688241679802e-05, + "loss": 1.8473, + "step": 12584 + }, + { + "epoch": 3.8627992633517496, + "grad_norm": 0.3012976348400116, + "learning_rate": 7.025233795402408e-05, + "loss": 1.8715, + "step": 12585 + }, + { + "epoch": 3.8631062001227745, + "grad_norm": 0.31703677773475647, + "learning_rate": 7.024779329110469e-05, + "loss": 1.8143, + "step": 12586 + }, + { + "epoch": 3.8634131368938, + "grad_norm": 
0.27287593483924866, + "learning_rate": 7.024324842808472e-05, + "loss": 1.7227, + "step": 12587 + }, + { + "epoch": 3.863720073664825, + "grad_norm": 0.24663801491260529, + "learning_rate": 7.02387033650091e-05, + "loss": 1.7529, + "step": 12588 + }, + { + "epoch": 3.86402701043585, + "grad_norm": 0.26127147674560547, + "learning_rate": 7.023415810192277e-05, + "loss": 1.7629, + "step": 12589 + }, + { + "epoch": 3.8643339472068754, + "grad_norm": 0.3457142114639282, + "learning_rate": 7.022961263887062e-05, + "loss": 1.8212, + "step": 12590 + }, + { + "epoch": 3.8646408839779003, + "grad_norm": 0.3296070694923401, + "learning_rate": 7.022506697589759e-05, + "loss": 1.7907, + "step": 12591 + }, + { + "epoch": 3.8649478207489256, + "grad_norm": 0.29474303126335144, + "learning_rate": 7.022052111304858e-05, + "loss": 1.7866, + "step": 12592 + }, + { + "epoch": 3.865254757519951, + "grad_norm": 0.2535403072834015, + "learning_rate": 7.021597505036852e-05, + "loss": 1.7607, + "step": 12593 + }, + { + "epoch": 3.8655616942909763, + "grad_norm": 0.26691222190856934, + "learning_rate": 7.021142878790237e-05, + "loss": 1.8063, + "step": 12594 + }, + { + "epoch": 3.865868631062001, + "grad_norm": 0.2784755229949951, + "learning_rate": 7.020688232569502e-05, + "loss": 1.8065, + "step": 12595 + }, + { + "epoch": 3.8661755678330265, + "grad_norm": 0.23714317381381989, + "learning_rate": 7.020233566379142e-05, + "loss": 1.8317, + "step": 12596 + }, + { + "epoch": 3.8664825046040514, + "grad_norm": 0.25010553002357483, + "learning_rate": 7.019778880223649e-05, + "loss": 1.8493, + "step": 12597 + }, + { + "epoch": 3.8667894413750767, + "grad_norm": 0.2798489034175873, + "learning_rate": 7.01932417410752e-05, + "loss": 1.8134, + "step": 12598 + }, + { + "epoch": 3.867096378146102, + "grad_norm": 0.26199260354042053, + "learning_rate": 7.018869448035243e-05, + "loss": 1.6931, + "step": 12599 + }, + { + "epoch": 3.867403314917127, + "grad_norm": 0.24582891166210175, + 
"learning_rate": 7.018414702011314e-05, + "loss": 1.8076, + "step": 12600 + }, + { + "epoch": 3.8677102516881523, + "grad_norm": 0.25493237376213074, + "learning_rate": 7.01795993604023e-05, + "loss": 1.7851, + "step": 12601 + }, + { + "epoch": 3.868017188459177, + "grad_norm": 0.2607674300670624, + "learning_rate": 7.017505150126483e-05, + "loss": 1.7285, + "step": 12602 + }, + { + "epoch": 3.8683241252302025, + "grad_norm": 0.23629581928253174, + "learning_rate": 7.017050344274568e-05, + "loss": 1.8254, + "step": 12603 + }, + { + "epoch": 3.868631062001228, + "grad_norm": 0.3129318058490753, + "learning_rate": 7.016595518488979e-05, + "loss": 1.7914, + "step": 12604 + }, + { + "epoch": 3.8689379987722528, + "grad_norm": 0.3178271949291229, + "learning_rate": 7.01614067277421e-05, + "loss": 1.8139, + "step": 12605 + }, + { + "epoch": 3.869244935543278, + "grad_norm": 0.3230711817741394, + "learning_rate": 7.015685807134757e-05, + "loss": 1.8203, + "step": 12606 + }, + { + "epoch": 3.869551872314303, + "grad_norm": 0.26339825987815857, + "learning_rate": 7.015230921575118e-05, + "loss": 1.8022, + "step": 12607 + }, + { + "epoch": 3.8698588090853283, + "grad_norm": 0.25337356328964233, + "learning_rate": 7.014776016099785e-05, + "loss": 1.7779, + "step": 12608 + }, + { + "epoch": 3.8701657458563536, + "grad_norm": 0.2506195306777954, + "learning_rate": 7.014321090713253e-05, + "loss": 1.7858, + "step": 12609 + }, + { + "epoch": 3.870472682627379, + "grad_norm": 0.26249951124191284, + "learning_rate": 7.013866145420021e-05, + "loss": 1.8051, + "step": 12610 + }, + { + "epoch": 3.870779619398404, + "grad_norm": 0.25666534900665283, + "learning_rate": 7.013411180224581e-05, + "loss": 1.7945, + "step": 12611 + }, + { + "epoch": 3.871086556169429, + "grad_norm": 0.23901648819446564, + "learning_rate": 7.012956195131433e-05, + "loss": 1.7844, + "step": 12612 + }, + { + "epoch": 3.871393492940454, + "grad_norm": 0.26814451813697815, + "learning_rate": 
7.012501190145071e-05, + "loss": 1.7713, + "step": 12613 + }, + { + "epoch": 3.8717004297114794, + "grad_norm": 0.28377315402030945, + "learning_rate": 7.012046165269995e-05, + "loss": 1.7866, + "step": 12614 + }, + { + "epoch": 3.8720073664825048, + "grad_norm": 0.2751680612564087, + "learning_rate": 7.011591120510699e-05, + "loss": 1.7215, + "step": 12615 + }, + { + "epoch": 3.8723143032535297, + "grad_norm": 0.21988113224506378, + "learning_rate": 7.011136055871679e-05, + "loss": 1.8009, + "step": 12616 + }, + { + "epoch": 3.872621240024555, + "grad_norm": 0.26462143659591675, + "learning_rate": 7.010680971357434e-05, + "loss": 1.7618, + "step": 12617 + }, + { + "epoch": 3.87292817679558, + "grad_norm": 0.29054632782936096, + "learning_rate": 7.010225866972462e-05, + "loss": 1.7549, + "step": 12618 + }, + { + "epoch": 3.873235113566605, + "grad_norm": 0.31341224908828735, + "learning_rate": 7.00977074272126e-05, + "loss": 1.8827, + "step": 12619 + }, + { + "epoch": 3.8735420503376305, + "grad_norm": 0.24252115190029144, + "learning_rate": 7.009315598608324e-05, + "loss": 1.7544, + "step": 12620 + }, + { + "epoch": 3.873848987108656, + "grad_norm": 0.30036893486976624, + "learning_rate": 7.008860434638154e-05, + "loss": 1.7465, + "step": 12621 + }, + { + "epoch": 3.8741559238796808, + "grad_norm": 0.3217438757419586, + "learning_rate": 7.00840525081525e-05, + "loss": 1.72, + "step": 12622 + }, + { + "epoch": 3.874462860650706, + "grad_norm": 0.22507290542125702, + "learning_rate": 7.007950047144105e-05, + "loss": 1.7177, + "step": 12623 + }, + { + "epoch": 3.874769797421731, + "grad_norm": 0.3014441728591919, + "learning_rate": 7.007494823629224e-05, + "loss": 1.7502, + "step": 12624 + }, + { + "epoch": 3.8750767341927563, + "grad_norm": 0.3836904466152191, + "learning_rate": 7.0070395802751e-05, + "loss": 1.7971, + "step": 12625 + }, + { + "epoch": 3.8753836709637817, + "grad_norm": 0.33565691113471985, + "learning_rate": 7.006584317086235e-05, + "loss": 1.7439, 
+ "step": 12626 + }, + { + "epoch": 3.8756906077348066, + "grad_norm": 0.2292134314775467, + "learning_rate": 7.006129034067128e-05, + "loss": 1.7998, + "step": 12627 + }, + { + "epoch": 3.875997544505832, + "grad_norm": 0.26385873556137085, + "learning_rate": 7.005673731222277e-05, + "loss": 1.7914, + "step": 12628 + }, + { + "epoch": 3.876304481276857, + "grad_norm": 0.2854950428009033, + "learning_rate": 7.005218408556184e-05, + "loss": 1.7761, + "step": 12629 + }, + { + "epoch": 3.876611418047882, + "grad_norm": 0.34260645508766174, + "learning_rate": 7.004763066073348e-05, + "loss": 1.8015, + "step": 12630 + }, + { + "epoch": 3.8769183548189075, + "grad_norm": 0.3223683834075928, + "learning_rate": 7.004307703778267e-05, + "loss": 1.7453, + "step": 12631 + }, + { + "epoch": 3.8772252915899323, + "grad_norm": 0.24715089797973633, + "learning_rate": 7.003852321675442e-05, + "loss": 1.7813, + "step": 12632 + }, + { + "epoch": 3.8775322283609577, + "grad_norm": 0.22822390496730804, + "learning_rate": 7.003396919769377e-05, + "loss": 1.7982, + "step": 12633 + }, + { + "epoch": 3.8778391651319826, + "grad_norm": 0.24125081300735474, + "learning_rate": 7.002941498064565e-05, + "loss": 1.8606, + "step": 12634 + }, + { + "epoch": 3.878146101903008, + "grad_norm": 0.23512506484985352, + "learning_rate": 7.002486056565513e-05, + "loss": 1.7469, + "step": 12635 + }, + { + "epoch": 3.8784530386740332, + "grad_norm": 0.2908322215080261, + "learning_rate": 7.00203059527672e-05, + "loss": 1.796, + "step": 12636 + }, + { + "epoch": 3.8787599754450586, + "grad_norm": 0.22931252419948578, + "learning_rate": 7.001575114202689e-05, + "loss": 1.7482, + "step": 12637 + }, + { + "epoch": 3.8790669122160835, + "grad_norm": 0.22574284672737122, + "learning_rate": 7.001119613347917e-05, + "loss": 1.7698, + "step": 12638 + }, + { + "epoch": 3.879373848987109, + "grad_norm": 0.23129726946353912, + "learning_rate": 7.000664092716909e-05, + "loss": 1.776, + "step": 12639 + }, + { + "epoch": 
3.8796807857581337, + "grad_norm": 0.2763366401195526, + "learning_rate": 7.000208552314165e-05, + "loss": 1.7814, + "step": 12640 + }, + { + "epoch": 3.879987722529159, + "grad_norm": 0.29870158433914185, + "learning_rate": 6.99975299214419e-05, + "loss": 1.7467, + "step": 12641 + }, + { + "epoch": 3.8802946593001844, + "grad_norm": 0.33574381470680237, + "learning_rate": 6.999297412211484e-05, + "loss": 1.8159, + "step": 12642 + }, + { + "epoch": 3.8806015960712092, + "grad_norm": 0.30309897661209106, + "learning_rate": 6.998841812520547e-05, + "loss": 1.8454, + "step": 12643 + }, + { + "epoch": 3.8809085328422346, + "grad_norm": 0.27399247884750366, + "learning_rate": 6.998386193075886e-05, + "loss": 1.7956, + "step": 12644 + }, + { + "epoch": 3.8812154696132595, + "grad_norm": 0.28649580478668213, + "learning_rate": 6.997930553881998e-05, + "loss": 1.8308, + "step": 12645 + }, + { + "epoch": 3.881522406384285, + "grad_norm": 0.2716052532196045, + "learning_rate": 6.997474894943392e-05, + "loss": 1.7698, + "step": 12646 + }, + { + "epoch": 3.88182934315531, + "grad_norm": 0.21380536258220673, + "learning_rate": 6.997019216264567e-05, + "loss": 1.7028, + "step": 12647 + }, + { + "epoch": 3.882136279926335, + "grad_norm": 0.25262731313705444, + "learning_rate": 6.996563517850028e-05, + "loss": 1.8236, + "step": 12648 + }, + { + "epoch": 3.8824432166973604, + "grad_norm": 0.21150052547454834, + "learning_rate": 6.996107799704277e-05, + "loss": 1.7437, + "step": 12649 + }, + { + "epoch": 3.8827501534683853, + "grad_norm": 0.2614554464817047, + "learning_rate": 6.995652061831821e-05, + "loss": 1.7575, + "step": 12650 + }, + { + "epoch": 3.8830570902394106, + "grad_norm": 0.214684396982193, + "learning_rate": 6.995196304237159e-05, + "loss": 1.8195, + "step": 12651 + }, + { + "epoch": 3.883364027010436, + "grad_norm": 0.2226872444152832, + "learning_rate": 6.994740526924798e-05, + "loss": 1.7556, + "step": 12652 + }, + { + "epoch": 3.8836709637814613, + "grad_norm": 
0.22270764410495758, + "learning_rate": 6.994284729899246e-05, + "loss": 1.7536, + "step": 12653 + }, + { + "epoch": 3.883977900552486, + "grad_norm": 0.20683564245700836, + "learning_rate": 6.993828913165e-05, + "loss": 1.7728, + "step": 12654 + }, + { + "epoch": 3.8842848373235115, + "grad_norm": 0.23667018115520477, + "learning_rate": 6.993373076726568e-05, + "loss": 1.7819, + "step": 12655 + }, + { + "epoch": 3.8845917740945364, + "grad_norm": 0.2265234887599945, + "learning_rate": 6.992917220588455e-05, + "loss": 1.7502, + "step": 12656 + }, + { + "epoch": 3.8848987108655617, + "grad_norm": 0.24490754306316376, + "learning_rate": 6.992461344755168e-05, + "loss": 1.7513, + "step": 12657 + }, + { + "epoch": 3.885205647636587, + "grad_norm": 0.23001348972320557, + "learning_rate": 6.992005449231208e-05, + "loss": 1.733, + "step": 12658 + }, + { + "epoch": 3.885512584407612, + "grad_norm": 0.25424695014953613, + "learning_rate": 6.991549534021084e-05, + "loss": 1.7621, + "step": 12659 + }, + { + "epoch": 3.8858195211786373, + "grad_norm": 0.25552862882614136, + "learning_rate": 6.991093599129299e-05, + "loss": 1.7974, + "step": 12660 + }, + { + "epoch": 3.886126457949662, + "grad_norm": 0.26876959204673767, + "learning_rate": 6.99063764456036e-05, + "loss": 1.7924, + "step": 12661 + }, + { + "epoch": 3.8864333947206875, + "grad_norm": 0.2754429578781128, + "learning_rate": 6.990181670318772e-05, + "loss": 1.7981, + "step": 12662 + }, + { + "epoch": 3.886740331491713, + "grad_norm": 0.281818687915802, + "learning_rate": 6.989725676409044e-05, + "loss": 1.7328, + "step": 12663 + }, + { + "epoch": 3.8870472682627377, + "grad_norm": 0.21676552295684814, + "learning_rate": 6.989269662835681e-05, + "loss": 1.7376, + "step": 12664 + }, + { + "epoch": 3.887354205033763, + "grad_norm": 0.276115745306015, + "learning_rate": 6.98881362960319e-05, + "loss": 1.7784, + "step": 12665 + }, + { + "epoch": 3.887661141804788, + "grad_norm": 0.2806364893913269, + "learning_rate": 
6.988357576716075e-05, + "loss": 1.8078, + "step": 12666 + }, + { + "epoch": 3.8879680785758133, + "grad_norm": 0.27620184421539307, + "learning_rate": 6.987901504178845e-05, + "loss": 1.8115, + "step": 12667 + }, + { + "epoch": 3.8882750153468386, + "grad_norm": 0.23845402896404266, + "learning_rate": 6.987445411996009e-05, + "loss": 1.7485, + "step": 12668 + }, + { + "epoch": 3.888581952117864, + "grad_norm": 0.25063586235046387, + "learning_rate": 6.986989300172071e-05, + "loss": 1.7663, + "step": 12669 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2417975515127182, + "learning_rate": 6.98653316871154e-05, + "loss": 1.7562, + "step": 12670 + }, + { + "epoch": 3.889195825659914, + "grad_norm": 0.24952733516693115, + "learning_rate": 6.986077017618923e-05, + "loss": 1.8063, + "step": 12671 + }, + { + "epoch": 3.889502762430939, + "grad_norm": 0.25847554206848145, + "learning_rate": 6.985620846898732e-05, + "loss": 1.7722, + "step": 12672 + }, + { + "epoch": 3.8898096992019644, + "grad_norm": 0.23762650787830353, + "learning_rate": 6.985164656555471e-05, + "loss": 1.8368, + "step": 12673 + }, + { + "epoch": 3.8901166359729897, + "grad_norm": 0.25346314907073975, + "learning_rate": 6.984708446593648e-05, + "loss": 1.7957, + "step": 12674 + }, + { + "epoch": 3.8904235727440146, + "grad_norm": 0.2466745674610138, + "learning_rate": 6.984252217017774e-05, + "loss": 1.8286, + "step": 12675 + }, + { + "epoch": 3.89073050951504, + "grad_norm": 0.25413215160369873, + "learning_rate": 6.983795967832356e-05, + "loss": 1.7711, + "step": 12676 + }, + { + "epoch": 3.891037446286065, + "grad_norm": 0.2315925806760788, + "learning_rate": 6.983339699041903e-05, + "loss": 1.7546, + "step": 12677 + }, + { + "epoch": 3.89134438305709, + "grad_norm": 0.26473405957221985, + "learning_rate": 6.982883410650925e-05, + "loss": 1.7563, + "step": 12678 + }, + { + "epoch": 3.8916513198281155, + "grad_norm": 0.24176491796970367, + "learning_rate": 6.982427102663932e-05, + "loss": 
1.7734, + "step": 12679 + }, + { + "epoch": 3.891958256599141, + "grad_norm": 0.25444844365119934, + "learning_rate": 6.98197077508543e-05, + "loss": 1.803, + "step": 12680 + }, + { + "epoch": 3.8922651933701657, + "grad_norm": 0.25234144926071167, + "learning_rate": 6.981514427919933e-05, + "loss": 1.8099, + "step": 12681 + }, + { + "epoch": 3.892572130141191, + "grad_norm": 0.2571142315864563, + "learning_rate": 6.98105806117195e-05, + "loss": 1.8618, + "step": 12682 + }, + { + "epoch": 3.892879066912216, + "grad_norm": 0.21235275268554688, + "learning_rate": 6.980601674845988e-05, + "loss": 1.7121, + "step": 12683 + }, + { + "epoch": 3.8931860036832413, + "grad_norm": 0.27078527212142944, + "learning_rate": 6.98014526894656e-05, + "loss": 1.8103, + "step": 12684 + }, + { + "epoch": 3.8934929404542666, + "grad_norm": 0.3198096454143524, + "learning_rate": 6.979688843478176e-05, + "loss": 1.7529, + "step": 12685 + }, + { + "epoch": 3.8937998772252915, + "grad_norm": 0.3170493245124817, + "learning_rate": 6.979232398445345e-05, + "loss": 1.7629, + "step": 12686 + }, + { + "epoch": 3.894106813996317, + "grad_norm": 0.2495265007019043, + "learning_rate": 6.978775933852582e-05, + "loss": 1.7407, + "step": 12687 + }, + { + "epoch": 3.8944137507673418, + "grad_norm": 0.24570141732692719, + "learning_rate": 6.978319449704395e-05, + "loss": 1.7688, + "step": 12688 + }, + { + "epoch": 3.894720687538367, + "grad_norm": 0.23956388235092163, + "learning_rate": 6.977862946005295e-05, + "loss": 1.7115, + "step": 12689 + }, + { + "epoch": 3.8950276243093924, + "grad_norm": 0.21548940241336823, + "learning_rate": 6.977406422759793e-05, + "loss": 1.7611, + "step": 12690 + }, + { + "epoch": 3.8953345610804173, + "grad_norm": 0.25797295570373535, + "learning_rate": 6.976949879972403e-05, + "loss": 1.7688, + "step": 12691 + }, + { + "epoch": 3.8956414978514426, + "grad_norm": 0.28257784247398376, + "learning_rate": 6.976493317647636e-05, + "loss": 1.7517, + "step": 12692 + }, + { + 
"epoch": 3.8959484346224675, + "grad_norm": 0.23828580975532532, + "learning_rate": 6.976036735790004e-05, + "loss": 1.7877, + "step": 12693 + }, + { + "epoch": 3.896255371393493, + "grad_norm": 0.22915001213550568, + "learning_rate": 6.975580134404017e-05, + "loss": 1.7741, + "step": 12694 + }, + { + "epoch": 3.896562308164518, + "grad_norm": 0.22975030541419983, + "learning_rate": 6.97512351349419e-05, + "loss": 1.772, + "step": 12695 + }, + { + "epoch": 3.8968692449355435, + "grad_norm": 0.29515185952186584, + "learning_rate": 6.974666873065034e-05, + "loss": 1.8001, + "step": 12696 + }, + { + "epoch": 3.8971761817065684, + "grad_norm": 0.26904794573783875, + "learning_rate": 6.974210213121064e-05, + "loss": 1.7069, + "step": 12697 + }, + { + "epoch": 3.8974831184775938, + "grad_norm": 0.2549479603767395, + "learning_rate": 6.97375353366679e-05, + "loss": 1.7419, + "step": 12698 + }, + { + "epoch": 3.8977900552486187, + "grad_norm": 0.23750101029872894, + "learning_rate": 6.973296834706729e-05, + "loss": 1.7815, + "step": 12699 + }, + { + "epoch": 3.898096992019644, + "grad_norm": 0.23529762029647827, + "learning_rate": 6.972840116245389e-05, + "loss": 1.8139, + "step": 12700 + }, + { + "epoch": 3.8984039287906693, + "grad_norm": 0.3212098777294159, + "learning_rate": 6.97238337828729e-05, + "loss": 1.7507, + "step": 12701 + }, + { + "epoch": 3.898710865561694, + "grad_norm": 0.3167687952518463, + "learning_rate": 6.971926620836941e-05, + "loss": 1.8062, + "step": 12702 + }, + { + "epoch": 3.8990178023327196, + "grad_norm": 0.31298309564590454, + "learning_rate": 6.971469843898855e-05, + "loss": 1.8127, + "step": 12703 + }, + { + "epoch": 3.8993247391037444, + "grad_norm": 0.2537378668785095, + "learning_rate": 6.971013047477551e-05, + "loss": 1.7675, + "step": 12704 + }, + { + "epoch": 3.8996316758747698, + "grad_norm": 0.24292805790901184, + "learning_rate": 6.97055623157754e-05, + "loss": 1.8004, + "step": 12705 + }, + { + "epoch": 3.899938612645795, + 
"grad_norm": 0.2929537296295166, + "learning_rate": 6.970099396203338e-05, + "loss": 1.7963, + "step": 12706 + }, + { + "epoch": 3.90024554941682, + "grad_norm": 0.30531612038612366, + "learning_rate": 6.969642541359459e-05, + "loss": 1.7347, + "step": 12707 + }, + { + "epoch": 3.9005524861878453, + "grad_norm": 0.3138202726840973, + "learning_rate": 6.969185667050417e-05, + "loss": 1.7987, + "step": 12708 + }, + { + "epoch": 3.9008594229588702, + "grad_norm": 0.2366247922182083, + "learning_rate": 6.96872877328073e-05, + "loss": 1.7671, + "step": 12709 + }, + { + "epoch": 3.9011663597298956, + "grad_norm": 0.26251721382141113, + "learning_rate": 6.96827186005491e-05, + "loss": 1.7657, + "step": 12710 + }, + { + "epoch": 3.901473296500921, + "grad_norm": 0.32497119903564453, + "learning_rate": 6.967814927377474e-05, + "loss": 1.7873, + "step": 12711 + }, + { + "epoch": 3.9017802332719462, + "grad_norm": 0.3290228843688965, + "learning_rate": 6.967357975252939e-05, + "loss": 1.8076, + "step": 12712 + }, + { + "epoch": 3.902087170042971, + "grad_norm": 0.2737300992012024, + "learning_rate": 6.966901003685817e-05, + "loss": 1.7405, + "step": 12713 + }, + { + "epoch": 3.9023941068139965, + "grad_norm": 0.25465309619903564, + "learning_rate": 6.966444012680626e-05, + "loss": 1.8063, + "step": 12714 + }, + { + "epoch": 3.9027010435850213, + "grad_norm": 0.2397255003452301, + "learning_rate": 6.965987002241885e-05, + "loss": 1.8079, + "step": 12715 + }, + { + "epoch": 3.9030079803560467, + "grad_norm": 0.23115718364715576, + "learning_rate": 6.965529972374108e-05, + "loss": 1.8032, + "step": 12716 + }, + { + "epoch": 3.903314917127072, + "grad_norm": 0.2536461055278778, + "learning_rate": 6.96507292308181e-05, + "loss": 1.7477, + "step": 12717 + }, + { + "epoch": 3.903621853898097, + "grad_norm": 0.27151185274124146, + "learning_rate": 6.96461585436951e-05, + "loss": 1.75, + "step": 12718 + }, + { + "epoch": 3.9039287906691222, + "grad_norm": 0.26894113421440125, + 
"learning_rate": 6.964158766241726e-05, + "loss": 1.7816, + "step": 12719 + }, + { + "epoch": 3.904235727440147, + "grad_norm": 0.23541375994682312, + "learning_rate": 6.963701658702972e-05, + "loss": 1.7991, + "step": 12720 + }, + { + "epoch": 3.9045426642111725, + "grad_norm": 0.22142915427684784, + "learning_rate": 6.96324453175777e-05, + "loss": 1.7245, + "step": 12721 + }, + { + "epoch": 3.904849600982198, + "grad_norm": 0.32864269614219666, + "learning_rate": 6.962787385410632e-05, + "loss": 1.7631, + "step": 12722 + }, + { + "epoch": 3.9051565377532227, + "grad_norm": 0.23657776415348053, + "learning_rate": 6.96233021966608e-05, + "loss": 1.8081, + "step": 12723 + }, + { + "epoch": 3.905463474524248, + "grad_norm": 0.24790632724761963, + "learning_rate": 6.961873034528629e-05, + "loss": 1.7193, + "step": 12724 + }, + { + "epoch": 3.905770411295273, + "grad_norm": 0.2517886459827423, + "learning_rate": 6.961415830002801e-05, + "loss": 1.7785, + "step": 12725 + }, + { + "epoch": 3.9060773480662982, + "grad_norm": 0.2340923547744751, + "learning_rate": 6.960958606093113e-05, + "loss": 1.7632, + "step": 12726 + }, + { + "epoch": 3.9063842848373236, + "grad_norm": 0.23260441422462463, + "learning_rate": 6.960501362804079e-05, + "loss": 1.7865, + "step": 12727 + }, + { + "epoch": 3.906691221608349, + "grad_norm": 0.22616329789161682, + "learning_rate": 6.960044100140224e-05, + "loss": 1.7851, + "step": 12728 + }, + { + "epoch": 3.906998158379374, + "grad_norm": 0.2849951982498169, + "learning_rate": 6.959586818106064e-05, + "loss": 1.8618, + "step": 12729 + }, + { + "epoch": 3.907305095150399, + "grad_norm": 0.3279374837875366, + "learning_rate": 6.95912951670612e-05, + "loss": 1.8563, + "step": 12730 + }, + { + "epoch": 3.907612031921424, + "grad_norm": 0.24359555542469025, + "learning_rate": 6.958672195944906e-05, + "loss": 1.7604, + "step": 12731 + }, + { + "epoch": 3.9079189686924494, + "grad_norm": 0.30881935358047485, + "learning_rate": 
6.958214855826947e-05, + "loss": 1.8463, + "step": 12732 + }, + { + "epoch": 3.9082259054634747, + "grad_norm": 0.25361543893814087, + "learning_rate": 6.957757496356763e-05, + "loss": 1.7831, + "step": 12733 + }, + { + "epoch": 3.9085328422344996, + "grad_norm": 0.26763513684272766, + "learning_rate": 6.957300117538869e-05, + "loss": 1.8383, + "step": 12734 + }, + { + "epoch": 3.908839779005525, + "grad_norm": 0.2238057255744934, + "learning_rate": 6.95684271937779e-05, + "loss": 1.7702, + "step": 12735 + }, + { + "epoch": 3.90914671577655, + "grad_norm": 0.22110232710838318, + "learning_rate": 6.956385301878045e-05, + "loss": 1.7931, + "step": 12736 + }, + { + "epoch": 3.909453652547575, + "grad_norm": 0.23765070736408234, + "learning_rate": 6.955927865044152e-05, + "loss": 1.7212, + "step": 12737 + }, + { + "epoch": 3.9097605893186005, + "grad_norm": 0.22324508428573608, + "learning_rate": 6.955470408880633e-05, + "loss": 1.7161, + "step": 12738 + }, + { + "epoch": 3.9100675260896254, + "grad_norm": 0.22485347092151642, + "learning_rate": 6.955012933392012e-05, + "loss": 1.7374, + "step": 12739 + }, + { + "epoch": 3.9103744628606507, + "grad_norm": 0.28046715259552, + "learning_rate": 6.954555438582806e-05, + "loss": 1.9264, + "step": 12740 + }, + { + "epoch": 3.9106813996316756, + "grad_norm": 0.26391276717185974, + "learning_rate": 6.954097924457536e-05, + "loss": 1.7343, + "step": 12741 + }, + { + "epoch": 3.910988336402701, + "grad_norm": 0.29596614837646484, + "learning_rate": 6.953640391020726e-05, + "loss": 1.8111, + "step": 12742 + }, + { + "epoch": 3.9112952731737263, + "grad_norm": 0.2709808051586151, + "learning_rate": 6.953182838276896e-05, + "loss": 1.7776, + "step": 12743 + }, + { + "epoch": 3.9116022099447516, + "grad_norm": 0.2585100531578064, + "learning_rate": 6.952725266230571e-05, + "loss": 1.7774, + "step": 12744 + }, + { + "epoch": 3.9119091467157765, + "grad_norm": 0.26490530371665955, + "learning_rate": 6.952267674886268e-05, + "loss": 
1.78, + "step": 12745 + }, + { + "epoch": 3.912216083486802, + "grad_norm": 0.23654767870903015, + "learning_rate": 6.951810064248512e-05, + "loss": 1.8263, + "step": 12746 + }, + { + "epoch": 3.9125230202578267, + "grad_norm": 0.2495296597480774, + "learning_rate": 6.951352434321826e-05, + "loss": 1.787, + "step": 12747 + }, + { + "epoch": 3.912829957028852, + "grad_norm": 0.24038313329219818, + "learning_rate": 6.950894785110728e-05, + "loss": 1.774, + "step": 12748 + }, + { + "epoch": 3.9131368937998774, + "grad_norm": 0.23738732933998108, + "learning_rate": 6.950437116619749e-05, + "loss": 1.7401, + "step": 12749 + }, + { + "epoch": 3.9134438305709023, + "grad_norm": 0.28192025423049927, + "learning_rate": 6.949979428853405e-05, + "loss": 1.8416, + "step": 12750 + }, + { + "epoch": 3.9137507673419276, + "grad_norm": 0.30579057335853577, + "learning_rate": 6.949521721816221e-05, + "loss": 1.7404, + "step": 12751 + }, + { + "epoch": 3.9140577041129525, + "grad_norm": 0.23972894251346588, + "learning_rate": 6.949063995512721e-05, + "loss": 1.7543, + "step": 12752 + }, + { + "epoch": 3.914364640883978, + "grad_norm": 0.2837793231010437, + "learning_rate": 6.94860624994743e-05, + "loss": 1.7779, + "step": 12753 + }, + { + "epoch": 3.914671577655003, + "grad_norm": 0.3344916105270386, + "learning_rate": 6.948148485124868e-05, + "loss": 1.7803, + "step": 12754 + }, + { + "epoch": 3.9149785144260285, + "grad_norm": 0.24271291494369507, + "learning_rate": 6.94769070104956e-05, + "loss": 1.7362, + "step": 12755 + }, + { + "epoch": 3.9152854511970534, + "grad_norm": 0.25299304723739624, + "learning_rate": 6.947232897726031e-05, + "loss": 1.7685, + "step": 12756 + }, + { + "epoch": 3.9155923879680787, + "grad_norm": 0.24766205251216888, + "learning_rate": 6.946775075158807e-05, + "loss": 1.829, + "step": 12757 + }, + { + "epoch": 3.9158993247391036, + "grad_norm": 0.2508428692817688, + "learning_rate": 6.94631723335241e-05, + "loss": 1.809, + "step": 12758 + }, + { + 
"epoch": 3.916206261510129, + "grad_norm": 0.2172096222639084, + "learning_rate": 6.945859372311365e-05, + "loss": 1.7376, + "step": 12759 + }, + { + "epoch": 3.9165131982811543, + "grad_norm": 0.28976425528526306, + "learning_rate": 6.945401492040198e-05, + "loss": 1.8229, + "step": 12760 + }, + { + "epoch": 3.916820135052179, + "grad_norm": 0.3528063893318176, + "learning_rate": 6.944943592543432e-05, + "loss": 1.7559, + "step": 12761 + }, + { + "epoch": 3.9171270718232045, + "grad_norm": 0.46312370896339417, + "learning_rate": 6.944485673825595e-05, + "loss": 1.7664, + "step": 12762 + }, + { + "epoch": 3.9174340085942294, + "grad_norm": 0.4466164708137512, + "learning_rate": 6.94402773589121e-05, + "loss": 1.7833, + "step": 12763 + }, + { + "epoch": 3.9177409453652547, + "grad_norm": 0.2637740969657898, + "learning_rate": 6.943569778744804e-05, + "loss": 1.818, + "step": 12764 + }, + { + "epoch": 3.91804788213628, + "grad_norm": 0.37515267729759216, + "learning_rate": 6.943111802390901e-05, + "loss": 1.7898, + "step": 12765 + }, + { + "epoch": 3.918354818907305, + "grad_norm": 0.45146289467811584, + "learning_rate": 6.942653806834029e-05, + "loss": 1.7797, + "step": 12766 + }, + { + "epoch": 3.9186617556783303, + "grad_norm": 0.2809859812259674, + "learning_rate": 6.942195792078712e-05, + "loss": 1.7836, + "step": 12767 + }, + { + "epoch": 3.918968692449355, + "grad_norm": 0.3606306314468384, + "learning_rate": 6.94173775812948e-05, + "loss": 1.7657, + "step": 12768 + }, + { + "epoch": 3.9192756292203805, + "grad_norm": 0.49528738856315613, + "learning_rate": 6.941279704990857e-05, + "loss": 1.7628, + "step": 12769 + }, + { + "epoch": 3.919582565991406, + "grad_norm": 0.3484322428703308, + "learning_rate": 6.940821632667371e-05, + "loss": 1.7939, + "step": 12770 + }, + { + "epoch": 3.919889502762431, + "grad_norm": 0.2479606419801712, + "learning_rate": 6.940363541163546e-05, + "loss": 1.813, + "step": 12771 + }, + { + "epoch": 3.920196439533456, + "grad_norm": 
0.3491765558719635, + "learning_rate": 6.939905430483911e-05, + "loss": 1.7338, + "step": 12772 + }, + { + "epoch": 3.9205033763044814, + "grad_norm": 0.291810005903244, + "learning_rate": 6.939447300632995e-05, + "loss": 1.7445, + "step": 12773 + }, + { + "epoch": 3.9208103130755063, + "grad_norm": 0.2467527985572815, + "learning_rate": 6.938989151615324e-05, + "loss": 1.8462, + "step": 12774 + }, + { + "epoch": 3.9211172498465316, + "grad_norm": 0.35656824707984924, + "learning_rate": 6.938530983435426e-05, + "loss": 1.7751, + "step": 12775 + }, + { + "epoch": 3.921424186617557, + "grad_norm": 0.31269776821136475, + "learning_rate": 6.938072796097828e-05, + "loss": 1.7714, + "step": 12776 + }, + { + "epoch": 3.921731123388582, + "grad_norm": 0.2082831859588623, + "learning_rate": 6.937614589607058e-05, + "loss": 1.7263, + "step": 12777 + }, + { + "epoch": 3.922038060159607, + "grad_norm": 0.27583765983581543, + "learning_rate": 6.937156363967646e-05, + "loss": 1.6822, + "step": 12778 + }, + { + "epoch": 3.922344996930632, + "grad_norm": 0.32773876190185547, + "learning_rate": 6.93669811918412e-05, + "loss": 1.7792, + "step": 12779 + }, + { + "epoch": 3.9226519337016574, + "grad_norm": 0.2583121657371521, + "learning_rate": 6.936239855261007e-05, + "loss": 1.7812, + "step": 12780 + }, + { + "epoch": 3.9229588704726828, + "grad_norm": 0.245570570230484, + "learning_rate": 6.935781572202836e-05, + "loss": 1.7252, + "step": 12781 + }, + { + "epoch": 3.9232658072437077, + "grad_norm": 0.2379419505596161, + "learning_rate": 6.935323270014138e-05, + "loss": 1.7485, + "step": 12782 + }, + { + "epoch": 3.923572744014733, + "grad_norm": 0.2239784598350525, + "learning_rate": 6.934864948699439e-05, + "loss": 1.7444, + "step": 12783 + }, + { + "epoch": 3.923879680785758, + "grad_norm": 0.2366618812084198, + "learning_rate": 6.934406608263274e-05, + "loss": 1.777, + "step": 12784 + }, + { + "epoch": 3.924186617556783, + "grad_norm": 0.22583791613578796, + "learning_rate": 
6.933948248710169e-05, + "loss": 1.7291, + "step": 12785 + }, + { + "epoch": 3.9244935543278086, + "grad_norm": 0.24141047894954681, + "learning_rate": 6.933489870044651e-05, + "loss": 1.7748, + "step": 12786 + }, + { + "epoch": 3.924800491098834, + "grad_norm": 0.2389962524175644, + "learning_rate": 6.933031472271255e-05, + "loss": 1.7957, + "step": 12787 + }, + { + "epoch": 3.925107427869859, + "grad_norm": 0.25230300426483154, + "learning_rate": 6.932573055394509e-05, + "loss": 1.7621, + "step": 12788 + }, + { + "epoch": 3.925414364640884, + "grad_norm": 0.23894043266773224, + "learning_rate": 6.932114619418941e-05, + "loss": 1.7285, + "step": 12789 + }, + { + "epoch": 3.925721301411909, + "grad_norm": 0.2650291919708252, + "learning_rate": 6.931656164349086e-05, + "loss": 1.7613, + "step": 12790 + }, + { + "epoch": 3.9260282381829343, + "grad_norm": 0.20616789162158966, + "learning_rate": 6.931197690189472e-05, + "loss": 1.7505, + "step": 12791 + }, + { + "epoch": 3.9263351749539597, + "grad_norm": 0.23915675282478333, + "learning_rate": 6.930739196944633e-05, + "loss": 1.7477, + "step": 12792 + }, + { + "epoch": 3.9266421117249846, + "grad_norm": 0.2522687613964081, + "learning_rate": 6.930280684619094e-05, + "loss": 1.8, + "step": 12793 + }, + { + "epoch": 3.92694904849601, + "grad_norm": 0.264167845249176, + "learning_rate": 6.929822153217391e-05, + "loss": 1.7516, + "step": 12794 + }, + { + "epoch": 3.927255985267035, + "grad_norm": 0.21358054876327515, + "learning_rate": 6.929363602744054e-05, + "loss": 1.7207, + "step": 12795 + }, + { + "epoch": 3.92756292203806, + "grad_norm": 0.25632721185684204, + "learning_rate": 6.928905033203617e-05, + "loss": 1.7446, + "step": 12796 + }, + { + "epoch": 3.9278698588090855, + "grad_norm": 0.2717185318470001, + "learning_rate": 6.928446444600608e-05, + "loss": 1.8555, + "step": 12797 + }, + { + "epoch": 3.9281767955801103, + "grad_norm": 0.2871767282485962, + "learning_rate": 6.927987836939561e-05, + "loss": 1.7861, + 
"step": 12798 + }, + { + "epoch": 3.9284837323511357, + "grad_norm": 0.282507061958313, + "learning_rate": 6.927529210225009e-05, + "loss": 1.7683, + "step": 12799 + }, + { + "epoch": 3.9287906691221606, + "grad_norm": 0.24870644509792328, + "learning_rate": 6.927070564461482e-05, + "loss": 1.7355, + "step": 12800 + }, + { + "epoch": 3.929097605893186, + "grad_norm": 0.2093631625175476, + "learning_rate": 6.926611899653516e-05, + "loss": 1.7691, + "step": 12801 + }, + { + "epoch": 3.9294045426642112, + "grad_norm": 0.34258076548576355, + "learning_rate": 6.926153215805642e-05, + "loss": 1.8398, + "step": 12802 + }, + { + "epoch": 3.9297114794352366, + "grad_norm": 0.39179500937461853, + "learning_rate": 6.925694512922391e-05, + "loss": 1.8229, + "step": 12803 + }, + { + "epoch": 3.9300184162062615, + "grad_norm": 0.36814743280410767, + "learning_rate": 6.9252357910083e-05, + "loss": 1.7759, + "step": 12804 + }, + { + "epoch": 3.930325352977287, + "grad_norm": 0.2659403085708618, + "learning_rate": 6.924777050067902e-05, + "loss": 1.7553, + "step": 12805 + }, + { + "epoch": 3.9306322897483117, + "grad_norm": 0.20617491006851196, + "learning_rate": 6.924318290105724e-05, + "loss": 1.7398, + "step": 12806 + }, + { + "epoch": 3.930939226519337, + "grad_norm": 0.23730522394180298, + "learning_rate": 6.923859511126309e-05, + "loss": 1.699, + "step": 12807 + }, + { + "epoch": 3.9312461632903624, + "grad_norm": 0.24865423142910004, + "learning_rate": 6.923400713134184e-05, + "loss": 1.7801, + "step": 12808 + }, + { + "epoch": 3.9315531000613873, + "grad_norm": 0.2495356798171997, + "learning_rate": 6.92294189613389e-05, + "loss": 1.803, + "step": 12809 + }, + { + "epoch": 3.9318600368324126, + "grad_norm": 0.24223244190216064, + "learning_rate": 6.922483060129955e-05, + "loss": 1.751, + "step": 12810 + }, + { + "epoch": 3.9321669736034375, + "grad_norm": 0.2541450262069702, + "learning_rate": 6.922024205126913e-05, + "loss": 1.7721, + "step": 12811 + }, + { + "epoch": 
3.932473910374463, + "grad_norm": 0.24528831243515015, + "learning_rate": 6.921565331129304e-05, + "loss": 1.792, + "step": 12812 + }, + { + "epoch": 3.932780847145488, + "grad_norm": 0.22789500653743744, + "learning_rate": 6.921106438141659e-05, + "loss": 1.8455, + "step": 12813 + }, + { + "epoch": 3.933087783916513, + "grad_norm": 0.26267170906066895, + "learning_rate": 6.920647526168515e-05, + "loss": 1.7254, + "step": 12814 + }, + { + "epoch": 3.9333947206875384, + "grad_norm": 0.23044808208942413, + "learning_rate": 6.920188595214406e-05, + "loss": 1.7217, + "step": 12815 + }, + { + "epoch": 3.9337016574585633, + "grad_norm": 0.2304011732339859, + "learning_rate": 6.919729645283867e-05, + "loss": 1.8121, + "step": 12816 + }, + { + "epoch": 3.9340085942295886, + "grad_norm": 0.21516792476177216, + "learning_rate": 6.919270676381435e-05, + "loss": 1.7305, + "step": 12817 + }, + { + "epoch": 3.934315531000614, + "grad_norm": 0.24698840081691742, + "learning_rate": 6.918811688511646e-05, + "loss": 1.7967, + "step": 12818 + }, + { + "epoch": 3.9346224677716393, + "grad_norm": 0.23132537305355072, + "learning_rate": 6.918352681679035e-05, + "loss": 1.7439, + "step": 12819 + }, + { + "epoch": 3.934929404542664, + "grad_norm": 0.2597793936729431, + "learning_rate": 6.917893655888139e-05, + "loss": 1.7882, + "step": 12820 + }, + { + "epoch": 3.9352363413136895, + "grad_norm": 0.23946607112884521, + "learning_rate": 6.917434611143493e-05, + "loss": 1.7991, + "step": 12821 + }, + { + "epoch": 3.9355432780847144, + "grad_norm": 0.25808244943618774, + "learning_rate": 6.916975547449634e-05, + "loss": 1.845, + "step": 12822 + }, + { + "epoch": 3.9358502148557397, + "grad_norm": 0.26082557439804077, + "learning_rate": 6.9165164648111e-05, + "loss": 1.7562, + "step": 12823 + }, + { + "epoch": 3.936157151626765, + "grad_norm": 0.24810053408145905, + "learning_rate": 6.916057363232425e-05, + "loss": 1.778, + "step": 12824 + }, + { + "epoch": 3.93646408839779, + "grad_norm": 
0.24168157577514648, + "learning_rate": 6.91559824271815e-05, + "loss": 1.7628, + "step": 12825 + }, + { + "epoch": 3.9367710251688153, + "grad_norm": 0.23800434172153473, + "learning_rate": 6.91513910327281e-05, + "loss": 1.8063, + "step": 12826 + }, + { + "epoch": 3.93707796193984, + "grad_norm": 0.23055073618888855, + "learning_rate": 6.914679944900944e-05, + "loss": 1.749, + "step": 12827 + }, + { + "epoch": 3.9373848987108655, + "grad_norm": 0.22455987334251404, + "learning_rate": 6.914220767607088e-05, + "loss": 1.7471, + "step": 12828 + }, + { + "epoch": 3.937691835481891, + "grad_norm": 0.21808378398418427, + "learning_rate": 6.913761571395778e-05, + "loss": 1.7503, + "step": 12829 + }, + { + "epoch": 3.937998772252916, + "grad_norm": 0.23136213421821594, + "learning_rate": 6.913302356271556e-05, + "loss": 1.752, + "step": 12830 + }, + { + "epoch": 3.938305709023941, + "grad_norm": 0.29579970240592957, + "learning_rate": 6.912843122238959e-05, + "loss": 1.8028, + "step": 12831 + }, + { + "epoch": 3.9386126457949664, + "grad_norm": 0.28578072786331177, + "learning_rate": 6.912383869302526e-05, + "loss": 1.8183, + "step": 12832 + }, + { + "epoch": 3.9389195825659913, + "grad_norm": 0.2616737186908722, + "learning_rate": 6.911924597466793e-05, + "loss": 1.8366, + "step": 12833 + }, + { + "epoch": 3.9392265193370166, + "grad_norm": 0.29275768995285034, + "learning_rate": 6.911465306736302e-05, + "loss": 1.731, + "step": 12834 + }, + { + "epoch": 3.939533456108042, + "grad_norm": 0.3300873041152954, + "learning_rate": 6.91100599711559e-05, + "loss": 1.8713, + "step": 12835 + }, + { + "epoch": 3.939840392879067, + "grad_norm": 0.2744643986225128, + "learning_rate": 6.910546668609195e-05, + "loss": 1.8479, + "step": 12836 + }, + { + "epoch": 3.940147329650092, + "grad_norm": 0.25248417258262634, + "learning_rate": 6.91008732122166e-05, + "loss": 1.7962, + "step": 12837 + }, + { + "epoch": 3.940454266421117, + "grad_norm": 0.3068546652793884, + "learning_rate": 
6.909627954957521e-05, + "loss": 1.759, + "step": 12838 + }, + { + "epoch": 3.9407612031921424, + "grad_norm": 0.3273559808731079, + "learning_rate": 6.909168569821321e-05, + "loss": 1.814, + "step": 12839 + }, + { + "epoch": 3.9410681399631677, + "grad_norm": 0.31192758679389954, + "learning_rate": 6.908709165817597e-05, + "loss": 1.7906, + "step": 12840 + }, + { + "epoch": 3.9413750767341926, + "grad_norm": 0.24487090110778809, + "learning_rate": 6.90824974295089e-05, + "loss": 1.8238, + "step": 12841 + }, + { + "epoch": 3.941682013505218, + "grad_norm": 0.24863721430301666, + "learning_rate": 6.907790301225743e-05, + "loss": 1.7651, + "step": 12842 + }, + { + "epoch": 3.941988950276243, + "grad_norm": 0.26555630564689636, + "learning_rate": 6.907330840646693e-05, + "loss": 1.8268, + "step": 12843 + }, + { + "epoch": 3.942295887047268, + "grad_norm": 0.2439817190170288, + "learning_rate": 6.906871361218281e-05, + "loss": 1.7291, + "step": 12844 + }, + { + "epoch": 3.9426028238182935, + "grad_norm": 0.2410304993391037, + "learning_rate": 6.906411862945048e-05, + "loss": 1.712, + "step": 12845 + }, + { + "epoch": 3.942909760589319, + "grad_norm": 0.28575149178504944, + "learning_rate": 6.905952345831537e-05, + "loss": 1.7269, + "step": 12846 + }, + { + "epoch": 3.9432166973603437, + "grad_norm": 0.3055815100669861, + "learning_rate": 6.905492809882286e-05, + "loss": 1.7234, + "step": 12847 + }, + { + "epoch": 3.943523634131369, + "grad_norm": 0.2762533724308014, + "learning_rate": 6.905033255101839e-05, + "loss": 1.7768, + "step": 12848 + }, + { + "epoch": 3.943830570902394, + "grad_norm": 0.22819125652313232, + "learning_rate": 6.904573681494738e-05, + "loss": 1.7416, + "step": 12849 + }, + { + "epoch": 3.9441375076734193, + "grad_norm": 0.21664194762706757, + "learning_rate": 6.904114089065523e-05, + "loss": 1.7506, + "step": 12850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.21935151517391205, + "learning_rate": 6.903654477818735e-05, + "loss": 
1.7522, + "step": 12851 + }, + { + "epoch": 3.9447513812154695, + "grad_norm": 0.2204175442457199, + "learning_rate": 6.903194847758918e-05, + "loss": 1.7753, + "step": 12852 + }, + { + "epoch": 3.945058317986495, + "grad_norm": 0.23130151629447937, + "learning_rate": 6.902735198890615e-05, + "loss": 1.7743, + "step": 12853 + }, + { + "epoch": 3.9453652547575198, + "grad_norm": 0.2548399567604065, + "learning_rate": 6.902275531218368e-05, + "loss": 1.8373, + "step": 12854 + }, + { + "epoch": 3.945672191528545, + "grad_norm": 0.2905479371547699, + "learning_rate": 6.901815844746718e-05, + "loss": 1.8336, + "step": 12855 + }, + { + "epoch": 3.9459791282995704, + "grad_norm": 0.2698945105075836, + "learning_rate": 6.90135613948021e-05, + "loss": 1.7498, + "step": 12856 + }, + { + "epoch": 3.9462860650705953, + "grad_norm": 0.24966828525066376, + "learning_rate": 6.900896415423387e-05, + "loss": 1.7664, + "step": 12857 + }, + { + "epoch": 3.9465930018416207, + "grad_norm": 0.23272784054279327, + "learning_rate": 6.90043667258079e-05, + "loss": 1.7742, + "step": 12858 + }, + { + "epoch": 3.9468999386126455, + "grad_norm": 0.2277698516845703, + "learning_rate": 6.899976910956965e-05, + "loss": 1.7465, + "step": 12859 + }, + { + "epoch": 3.947206875383671, + "grad_norm": 0.2376442402601242, + "learning_rate": 6.899517130556454e-05, + "loss": 1.7995, + "step": 12860 + }, + { + "epoch": 3.947513812154696, + "grad_norm": 0.25591593980789185, + "learning_rate": 6.899057331383802e-05, + "loss": 1.8017, + "step": 12861 + }, + { + "epoch": 3.9478207489257215, + "grad_norm": 0.2715262472629547, + "learning_rate": 6.898597513443551e-05, + "loss": 1.7967, + "step": 12862 + }, + { + "epoch": 3.9481276856967464, + "grad_norm": 0.20916256308555603, + "learning_rate": 6.898137676740246e-05, + "loss": 1.7711, + "step": 12863 + }, + { + "epoch": 3.9484346224677718, + "grad_norm": 0.2570229768753052, + "learning_rate": 6.897677821278435e-05, + "loss": 1.833, + "step": 12864 + }, + { + 
"epoch": 3.9487415592387967, + "grad_norm": 0.26343438029289246, + "learning_rate": 6.897217947062657e-05, + "loss": 1.7625, + "step": 12865 + }, + { + "epoch": 3.949048496009822, + "grad_norm": 0.23407024145126343, + "learning_rate": 6.896758054097459e-05, + "loss": 1.7211, + "step": 12866 + }, + { + "epoch": 3.9493554327808473, + "grad_norm": 0.2554715573787689, + "learning_rate": 6.896298142387387e-05, + "loss": 1.8548, + "step": 12867 + }, + { + "epoch": 3.949662369551872, + "grad_norm": 0.24143370985984802, + "learning_rate": 6.895838211936986e-05, + "loss": 1.7635, + "step": 12868 + }, + { + "epoch": 3.9499693063228976, + "grad_norm": 0.24634715914726257, + "learning_rate": 6.8953782627508e-05, + "loss": 1.8012, + "step": 12869 + }, + { + "epoch": 3.9502762430939224, + "grad_norm": 0.22740426659584045, + "learning_rate": 6.894918294833375e-05, + "loss": 1.7294, + "step": 12870 + }, + { + "epoch": 3.950583179864948, + "grad_norm": 0.2651631832122803, + "learning_rate": 6.894458308189257e-05, + "loss": 1.8289, + "step": 12871 + }, + { + "epoch": 3.950890116635973, + "grad_norm": 0.28693267703056335, + "learning_rate": 6.893998302822991e-05, + "loss": 1.8462, + "step": 12872 + }, + { + "epoch": 3.951197053406998, + "grad_norm": 0.26584213972091675, + "learning_rate": 6.893538278739125e-05, + "loss": 1.7621, + "step": 12873 + }, + { + "epoch": 3.9515039901780233, + "grad_norm": 0.29970669746398926, + "learning_rate": 6.893078235942203e-05, + "loss": 1.7659, + "step": 12874 + }, + { + "epoch": 3.9518109269490482, + "grad_norm": 0.2271152138710022, + "learning_rate": 6.892618174436771e-05, + "loss": 1.7151, + "step": 12875 + }, + { + "epoch": 3.9521178637200736, + "grad_norm": 0.24783682823181152, + "learning_rate": 6.892158094227379e-05, + "loss": 1.761, + "step": 12876 + }, + { + "epoch": 3.952424800491099, + "grad_norm": 0.2371140718460083, + "learning_rate": 6.891697995318573e-05, + "loss": 1.7557, + "step": 12877 + }, + { + "epoch": 3.9527317372621242, + 
"grad_norm": 0.29708394408226013, + "learning_rate": 6.891237877714896e-05, + "loss": 1.8629, + "step": 12878 + }, + { + "epoch": 3.953038674033149, + "grad_norm": 0.2724219262599945, + "learning_rate": 6.890777741420899e-05, + "loss": 1.7378, + "step": 12879 + }, + { + "epoch": 3.9533456108041745, + "grad_norm": 0.2227276861667633, + "learning_rate": 6.890317586441126e-05, + "loss": 1.6989, + "step": 12880 + }, + { + "epoch": 3.9536525475751993, + "grad_norm": 0.2546161413192749, + "learning_rate": 6.889857412780128e-05, + "loss": 1.8688, + "step": 12881 + }, + { + "epoch": 3.9539594843462247, + "grad_norm": 0.24882884323596954, + "learning_rate": 6.889397220442452e-05, + "loss": 1.8137, + "step": 12882 + }, + { + "epoch": 3.95426642111725, + "grad_norm": 0.2549113929271698, + "learning_rate": 6.888937009432644e-05, + "loss": 1.8366, + "step": 12883 + }, + { + "epoch": 3.954573357888275, + "grad_norm": 0.30032673478126526, + "learning_rate": 6.888476779755255e-05, + "loss": 1.8267, + "step": 12884 + }, + { + "epoch": 3.9548802946593002, + "grad_norm": 0.2887294292449951, + "learning_rate": 6.888016531414832e-05, + "loss": 1.8295, + "step": 12885 + }, + { + "epoch": 3.955187231430325, + "grad_norm": 0.2947406470775604, + "learning_rate": 6.88755626441592e-05, + "loss": 1.7713, + "step": 12886 + }, + { + "epoch": 3.9554941682013505, + "grad_norm": 0.2967108190059662, + "learning_rate": 6.887095978763072e-05, + "loss": 1.7636, + "step": 12887 + }, + { + "epoch": 3.955801104972376, + "grad_norm": 0.2495311200618744, + "learning_rate": 6.886635674460836e-05, + "loss": 1.8148, + "step": 12888 + }, + { + "epoch": 3.9561080417434007, + "grad_norm": 0.23367099463939667, + "learning_rate": 6.88617535151376e-05, + "loss": 1.7353, + "step": 12889 + }, + { + "epoch": 3.956414978514426, + "grad_norm": 0.36790570616722107, + "learning_rate": 6.885715009926395e-05, + "loss": 1.7853, + "step": 12890 + }, + { + "epoch": 3.9567219152854514, + "grad_norm": 0.5013020038604736, + 
"learning_rate": 6.885254649703287e-05, + "loss": 1.7923, + "step": 12891 + }, + { + "epoch": 3.9570288520564763, + "grad_norm": 0.4446276128292084, + "learning_rate": 6.884794270848988e-05, + "loss": 1.7504, + "step": 12892 + }, + { + "epoch": 3.9573357888275016, + "grad_norm": 0.2478526383638382, + "learning_rate": 6.88433387336805e-05, + "loss": 1.7629, + "step": 12893 + }, + { + "epoch": 3.957642725598527, + "grad_norm": 0.30111798644065857, + "learning_rate": 6.883873457265019e-05, + "loss": 1.8291, + "step": 12894 + }, + { + "epoch": 3.957949662369552, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.883413022544445e-05, + "loss": 1.7919, + "step": 12895 + }, + { + "epoch": 3.958256599140577, + "grad_norm": 0.2895318269729614, + "learning_rate": 6.882952569210881e-05, + "loss": 1.7467, + "step": 12896 + }, + { + "epoch": 3.958563535911602, + "grad_norm": 0.30391454696655273, + "learning_rate": 6.882492097268873e-05, + "loss": 1.8145, + "step": 12897 + }, + { + "epoch": 3.9588704726826274, + "grad_norm": 0.5033623576164246, + "learning_rate": 6.882031606722977e-05, + "loss": 1.8231, + "step": 12898 + }, + { + "epoch": 3.9591774094536527, + "grad_norm": 0.5351777672767639, + "learning_rate": 6.881571097577742e-05, + "loss": 1.807, + "step": 12899 + }, + { + "epoch": 3.9594843462246776, + "grad_norm": 0.35540491342544556, + "learning_rate": 6.881110569837719e-05, + "loss": 1.7626, + "step": 12900 + }, + { + "epoch": 3.959791282995703, + "grad_norm": 0.22447600960731506, + "learning_rate": 6.880650023507457e-05, + "loss": 1.7392, + "step": 12901 + }, + { + "epoch": 3.960098219766728, + "grad_norm": 0.44619202613830566, + "learning_rate": 6.88018945859151e-05, + "loss": 1.8138, + "step": 12902 + }, + { + "epoch": 3.960405156537753, + "grad_norm": 0.41381633281707764, + "learning_rate": 6.879728875094428e-05, + "loss": 1.7676, + "step": 12903 + }, + { + "epoch": 3.9607120933087785, + "grad_norm": 0.2601528465747833, + "learning_rate": 6.879268273020764e-05, 
+ "loss": 1.8406, + "step": 12904 + }, + { + "epoch": 3.961019030079804, + "grad_norm": 0.3309035003185272, + "learning_rate": 6.878807652375071e-05, + "loss": 1.7673, + "step": 12905 + }, + { + "epoch": 3.9613259668508287, + "grad_norm": 0.5281669497489929, + "learning_rate": 6.878347013161899e-05, + "loss": 1.7686, + "step": 12906 + }, + { + "epoch": 3.961632903621854, + "grad_norm": 0.5397645831108093, + "learning_rate": 6.8778863553858e-05, + "loss": 1.8575, + "step": 12907 + }, + { + "epoch": 3.961939840392879, + "grad_norm": 0.329485684633255, + "learning_rate": 6.877425679051327e-05, + "loss": 1.8185, + "step": 12908 + }, + { + "epoch": 3.9622467771639043, + "grad_norm": 0.3012789487838745, + "learning_rate": 6.876964984163034e-05, + "loss": 1.7962, + "step": 12909 + }, + { + "epoch": 3.9625537139349296, + "grad_norm": 0.5596817135810852, + "learning_rate": 6.876504270725472e-05, + "loss": 1.7972, + "step": 12910 + }, + { + "epoch": 3.9628606507059545, + "grad_norm": 0.5374729633331299, + "learning_rate": 6.876043538743197e-05, + "loss": 1.7863, + "step": 12911 + }, + { + "epoch": 3.96316758747698, + "grad_norm": 0.24617290496826172, + "learning_rate": 6.875582788220757e-05, + "loss": 1.7555, + "step": 12912 + }, + { + "epoch": 3.9634745242480047, + "grad_norm": 0.3493972420692444, + "learning_rate": 6.875122019162712e-05, + "loss": 1.8595, + "step": 12913 + }, + { + "epoch": 3.96378146101903, + "grad_norm": 0.4293089807033539, + "learning_rate": 6.874661231573609e-05, + "loss": 1.7647, + "step": 12914 + }, + { + "epoch": 3.9640883977900554, + "grad_norm": 0.30602574348449707, + "learning_rate": 6.874200425458006e-05, + "loss": 1.7122, + "step": 12915 + }, + { + "epoch": 3.9643953345610803, + "grad_norm": 0.22776013612747192, + "learning_rate": 6.873739600820457e-05, + "loss": 1.7136, + "step": 12916 + }, + { + "epoch": 3.9647022713321056, + "grad_norm": 0.3727327585220337, + "learning_rate": 6.873278757665513e-05, + "loss": 1.8314, + "step": 12917 + }, + { 
+ "epoch": 3.9650092081031305, + "grad_norm": 0.35110536217689514, + "learning_rate": 6.872817895997733e-05, + "loss": 1.7506, + "step": 12918 + }, + { + "epoch": 3.965316144874156, + "grad_norm": 0.275560587644577, + "learning_rate": 6.872357015821666e-05, + "loss": 1.7865, + "step": 12919 + }, + { + "epoch": 3.965623081645181, + "grad_norm": 0.2686980366706848, + "learning_rate": 6.871896117141873e-05, + "loss": 1.8431, + "step": 12920 + }, + { + "epoch": 3.9659300184162065, + "grad_norm": 0.3299664556980133, + "learning_rate": 6.871435199962901e-05, + "loss": 1.7988, + "step": 12921 + }, + { + "epoch": 3.9662369551872314, + "grad_norm": 0.2833637297153473, + "learning_rate": 6.870974264289313e-05, + "loss": 1.6993, + "step": 12922 + }, + { + "epoch": 3.9665438919582567, + "grad_norm": 0.25062620639801025, + "learning_rate": 6.870513310125659e-05, + "loss": 1.7814, + "step": 12923 + }, + { + "epoch": 3.9668508287292816, + "grad_norm": 0.26609909534454346, + "learning_rate": 6.870052337476498e-05, + "loss": 1.7871, + "step": 12924 + }, + { + "epoch": 3.967157765500307, + "grad_norm": 0.22760890424251556, + "learning_rate": 6.869591346346382e-05, + "loss": 1.7941, + "step": 12925 + }, + { + "epoch": 3.9674647022713323, + "grad_norm": 0.2845582067966461, + "learning_rate": 6.869130336739869e-05, + "loss": 1.8215, + "step": 12926 + }, + { + "epoch": 3.967771639042357, + "grad_norm": 0.254948228597641, + "learning_rate": 6.868669308661514e-05, + "loss": 1.7515, + "step": 12927 + }, + { + "epoch": 3.9680785758133825, + "grad_norm": 0.2372167855501175, + "learning_rate": 6.868208262115875e-05, + "loss": 1.7524, + "step": 12928 + }, + { + "epoch": 3.9683855125844074, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.867747197107506e-05, + "loss": 1.8139, + "step": 12929 + }, + { + "epoch": 3.9686924493554327, + "grad_norm": 0.2617839276790619, + "learning_rate": 6.867286113640965e-05, + "loss": 1.7388, + "step": 12930 + }, + { + "epoch": 3.968999386126458, + 
"grad_norm": 0.22749558091163635, + "learning_rate": 6.866825011720807e-05, + "loss": 1.7421, + "step": 12931 + }, + { + "epoch": 3.969306322897483, + "grad_norm": 0.27737462520599365, + "learning_rate": 6.86636389135159e-05, + "loss": 1.7977, + "step": 12932 + }, + { + "epoch": 3.9696132596685083, + "grad_norm": 0.3331063985824585, + "learning_rate": 6.865902752537871e-05, + "loss": 1.7925, + "step": 12933 + }, + { + "epoch": 3.969920196439533, + "grad_norm": 0.24229519069194794, + "learning_rate": 6.86544159528421e-05, + "loss": 1.7782, + "step": 12934 + }, + { + "epoch": 3.9702271332105585, + "grad_norm": 0.29494860768318176, + "learning_rate": 6.86498041959516e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 3.970534069981584, + "grad_norm": 0.26064008474349976, + "learning_rate": 6.86451922547528e-05, + "loss": 1.7161, + "step": 12936 + }, + { + "epoch": 3.970841006752609, + "grad_norm": 0.2656785547733307, + "learning_rate": 6.864058012929129e-05, + "loss": 1.8154, + "step": 12937 + }, + { + "epoch": 3.971147943523634, + "grad_norm": 0.21170997619628906, + "learning_rate": 6.863596781961263e-05, + "loss": 1.7614, + "step": 12938 + }, + { + "epoch": 3.9714548802946594, + "grad_norm": 0.21709072589874268, + "learning_rate": 6.863135532576241e-05, + "loss": 1.7896, + "step": 12939 + }, + { + "epoch": 3.9717618170656843, + "grad_norm": 0.2361367791891098, + "learning_rate": 6.862674264778623e-05, + "loss": 1.7775, + "step": 12940 + }, + { + "epoch": 3.9720687538367097, + "grad_norm": 0.22042550146579742, + "learning_rate": 6.862212978572967e-05, + "loss": 1.7781, + "step": 12941 + }, + { + "epoch": 3.972375690607735, + "grad_norm": 0.2535422146320343, + "learning_rate": 6.86175167396383e-05, + "loss": 1.7665, + "step": 12942 + }, + { + "epoch": 3.97268262737876, + "grad_norm": 0.23741906881332397, + "learning_rate": 6.861290350955771e-05, + "loss": 1.7829, + "step": 12943 + }, + { + "epoch": 3.972989564149785, + "grad_norm": 0.23789910972118378, + 
"learning_rate": 6.860829009553351e-05, + "loss": 1.7745, + "step": 12944 + }, + { + "epoch": 3.97329650092081, + "grad_norm": 0.26867765188217163, + "learning_rate": 6.860367649761127e-05, + "loss": 1.7239, + "step": 12945 + }, + { + "epoch": 3.9736034376918354, + "grad_norm": 0.3211663067340851, + "learning_rate": 6.85990627158366e-05, + "loss": 1.7976, + "step": 12946 + }, + { + "epoch": 3.9739103744628608, + "grad_norm": 0.26177310943603516, + "learning_rate": 6.85944487502551e-05, + "loss": 1.7446, + "step": 12947 + }, + { + "epoch": 3.9742173112338857, + "grad_norm": 0.23622745275497437, + "learning_rate": 6.858983460091234e-05, + "loss": 1.7824, + "step": 12948 + }, + { + "epoch": 3.974524248004911, + "grad_norm": 0.24372988939285278, + "learning_rate": 6.858522026785395e-05, + "loss": 1.8014, + "step": 12949 + }, + { + "epoch": 3.974831184775936, + "grad_norm": 0.2566998600959778, + "learning_rate": 6.85806057511255e-05, + "loss": 1.742, + "step": 12950 + }, + { + "epoch": 3.9751381215469612, + "grad_norm": 0.24418365955352783, + "learning_rate": 6.857599105077264e-05, + "loss": 1.7331, + "step": 12951 + }, + { + "epoch": 3.9754450583179866, + "grad_norm": 0.2260327935218811, + "learning_rate": 6.857137616684094e-05, + "loss": 1.7173, + "step": 12952 + }, + { + "epoch": 3.975751995089012, + "grad_norm": 0.277044415473938, + "learning_rate": 6.856676109937602e-05, + "loss": 1.7255, + "step": 12953 + }, + { + "epoch": 3.976058931860037, + "grad_norm": 0.228300079703331, + "learning_rate": 6.856214584842348e-05, + "loss": 1.7796, + "step": 12954 + }, + { + "epoch": 3.976365868631062, + "grad_norm": 0.2246638983488083, + "learning_rate": 6.855753041402893e-05, + "loss": 1.7458, + "step": 12955 + }, + { + "epoch": 3.976672805402087, + "grad_norm": 0.22235621511936188, + "learning_rate": 6.855291479623799e-05, + "loss": 1.7585, + "step": 12956 + }, + { + "epoch": 3.9769797421731123, + "grad_norm": 0.23710694909095764, + "learning_rate": 6.854829899509627e-05, + 
"loss": 1.767, + "step": 12957 + }, + { + "epoch": 3.9772866789441377, + "grad_norm": 0.2527346611022949, + "learning_rate": 6.854368301064939e-05, + "loss": 1.828, + "step": 12958 + }, + { + "epoch": 3.9775936157151626, + "grad_norm": 0.25032514333724976, + "learning_rate": 6.853906684294298e-05, + "loss": 1.8533, + "step": 12959 + }, + { + "epoch": 3.977900552486188, + "grad_norm": 0.2346320003271103, + "learning_rate": 6.853445049202262e-05, + "loss": 1.8046, + "step": 12960 + }, + { + "epoch": 3.978207489257213, + "grad_norm": 0.22576460242271423, + "learning_rate": 6.852983395793398e-05, + "loss": 1.7502, + "step": 12961 + }, + { + "epoch": 3.978514426028238, + "grad_norm": 0.2230147123336792, + "learning_rate": 6.852521724072266e-05, + "loss": 1.7362, + "step": 12962 + }, + { + "epoch": 3.9788213627992635, + "grad_norm": 0.2339705526828766, + "learning_rate": 6.852060034043425e-05, + "loss": 1.763, + "step": 12963 + }, + { + "epoch": 3.979128299570289, + "grad_norm": 0.24511271715164185, + "learning_rate": 6.851598325711446e-05, + "loss": 1.7988, + "step": 12964 + }, + { + "epoch": 3.9794352363413137, + "grad_norm": 0.2927285134792328, + "learning_rate": 6.851136599080885e-05, + "loss": 1.8346, + "step": 12965 + }, + { + "epoch": 3.979742173112339, + "grad_norm": 0.2593212425708771, + "learning_rate": 6.850674854156305e-05, + "loss": 1.7368, + "step": 12966 + }, + { + "epoch": 3.980049109883364, + "grad_norm": 0.3013291656970978, + "learning_rate": 6.850213090942275e-05, + "loss": 1.7911, + "step": 12967 + }, + { + "epoch": 3.9803560466543892, + "grad_norm": 0.3420047163963318, + "learning_rate": 6.849751309443352e-05, + "loss": 1.7899, + "step": 12968 + }, + { + "epoch": 3.9806629834254146, + "grad_norm": 0.2901746928691864, + "learning_rate": 6.849289509664105e-05, + "loss": 1.8244, + "step": 12969 + }, + { + "epoch": 3.9809699201964395, + "grad_norm": 0.2389298677444458, + "learning_rate": 6.848827691609093e-05, + "loss": 1.7116, + "step": 12970 + }, + { + 
"epoch": 3.981276856967465, + "grad_norm": 0.3153960704803467, + "learning_rate": 6.848365855282882e-05, + "loss": 1.7665, + "step": 12971 + }, + { + "epoch": 3.9815837937384897, + "grad_norm": 0.3162175118923187, + "learning_rate": 6.847904000690036e-05, + "loss": 1.7722, + "step": 12972 + }, + { + "epoch": 3.981890730509515, + "grad_norm": 0.27458643913269043, + "learning_rate": 6.847442127835122e-05, + "loss": 1.8095, + "step": 12973 + }, + { + "epoch": 3.9821976672805404, + "grad_norm": 0.22330710291862488, + "learning_rate": 6.846980236722699e-05, + "loss": 1.7179, + "step": 12974 + }, + { + "epoch": 3.9825046040515653, + "grad_norm": 0.2940923869609833, + "learning_rate": 6.846518327357339e-05, + "loss": 1.7363, + "step": 12975 + }, + { + "epoch": 3.9828115408225906, + "grad_norm": 0.26479849219322205, + "learning_rate": 6.846056399743599e-05, + "loss": 1.7788, + "step": 12976 + }, + { + "epoch": 3.9831184775936155, + "grad_norm": 0.24145057797431946, + "learning_rate": 6.845594453886048e-05, + "loss": 1.7825, + "step": 12977 + }, + { + "epoch": 3.983425414364641, + "grad_norm": 0.2795869708061218, + "learning_rate": 6.845132489789252e-05, + "loss": 1.7705, + "step": 12978 + }, + { + "epoch": 3.983732351135666, + "grad_norm": 0.3117202818393707, + "learning_rate": 6.844670507457776e-05, + "loss": 1.8183, + "step": 12979 + }, + { + "epoch": 3.9840392879066915, + "grad_norm": 0.2666899263858795, + "learning_rate": 6.844208506896184e-05, + "loss": 1.7434, + "step": 12980 + }, + { + "epoch": 3.9843462246777164, + "grad_norm": 0.24682332575321198, + "learning_rate": 6.843746488109042e-05, + "loss": 1.751, + "step": 12981 + }, + { + "epoch": 3.9846531614487417, + "grad_norm": 0.2558208703994751, + "learning_rate": 6.843284451100916e-05, + "loss": 1.7983, + "step": 12982 + }, + { + "epoch": 3.9849600982197666, + "grad_norm": 0.4236481189727783, + "learning_rate": 6.842822395876374e-05, + "loss": 1.8584, + "step": 12983 + }, + { + "epoch": 3.985267034990792, + 
"grad_norm": 0.4931485950946808, + "learning_rate": 6.84236032243998e-05, + "loss": 1.7617, + "step": 12984 + }, + { + "epoch": 3.9855739717618173, + "grad_norm": 0.37793654203414917, + "learning_rate": 6.841898230796302e-05, + "loss": 1.7411, + "step": 12985 + }, + { + "epoch": 3.985880908532842, + "grad_norm": 0.2093842774629593, + "learning_rate": 6.841436120949906e-05, + "loss": 1.772, + "step": 12986 + }, + { + "epoch": 3.9861878453038675, + "grad_norm": 0.4065552055835724, + "learning_rate": 6.840973992905359e-05, + "loss": 1.7675, + "step": 12987 + }, + { + "epoch": 3.9864947820748924, + "grad_norm": 0.5334183573722839, + "learning_rate": 6.840511846667228e-05, + "loss": 1.7872, + "step": 12988 + }, + { + "epoch": 3.9868017188459177, + "grad_norm": 0.378974974155426, + "learning_rate": 6.84004968224008e-05, + "loss": 1.8288, + "step": 12989 + }, + { + "epoch": 3.987108655616943, + "grad_norm": 0.22518309950828552, + "learning_rate": 6.839587499628483e-05, + "loss": 1.7715, + "step": 12990 + }, + { + "epoch": 3.987415592387968, + "grad_norm": 0.4270850718021393, + "learning_rate": 6.839125298837003e-05, + "loss": 1.7797, + "step": 12991 + }, + { + "epoch": 3.9877225291589933, + "grad_norm": 0.4629896879196167, + "learning_rate": 6.838663079870211e-05, + "loss": 1.7936, + "step": 12992 + }, + { + "epoch": 3.988029465930018, + "grad_norm": 0.29273948073387146, + "learning_rate": 6.838200842732672e-05, + "loss": 1.8264, + "step": 12993 + }, + { + "epoch": 3.9883364027010435, + "grad_norm": 0.31575852632522583, + "learning_rate": 6.837738587428954e-05, + "loss": 1.8043, + "step": 12994 + }, + { + "epoch": 3.988643339472069, + "grad_norm": 0.40602433681488037, + "learning_rate": 6.837276313963627e-05, + "loss": 1.7409, + "step": 12995 + }, + { + "epoch": 3.988950276243094, + "grad_norm": 0.23413142561912537, + "learning_rate": 6.836814022341259e-05, + "loss": 1.8585, + "step": 12996 + }, + { + "epoch": 3.989257213014119, + "grad_norm": 0.3518814444541931, + 
"learning_rate": 6.836351712566416e-05, + "loss": 1.7768, + "step": 12997 + }, + { + "epoch": 3.9895641497851444, + "grad_norm": 0.3811505436897278, + "learning_rate": 6.83588938464367e-05, + "loss": 1.7738, + "step": 12998 + }, + { + "epoch": 3.9898710865561693, + "grad_norm": 0.2516780197620392, + "learning_rate": 6.835427038577589e-05, + "loss": 1.7351, + "step": 12999 + }, + { + "epoch": 3.9901780233271946, + "grad_norm": 0.23704510927200317, + "learning_rate": 6.834964674372744e-05, + "loss": 1.7907, + "step": 13000 + }, + { + "epoch": 3.99048496009822, + "grad_norm": 0.2890201807022095, + "learning_rate": 6.8345022920337e-05, + "loss": 1.9546, + "step": 13001 + }, + { + "epoch": 3.990791896869245, + "grad_norm": 0.2678101360797882, + "learning_rate": 6.834039891565031e-05, + "loss": 1.7338, + "step": 13002 + }, + { + "epoch": 3.99109883364027, + "grad_norm": 0.31726256012916565, + "learning_rate": 6.833577472971304e-05, + "loss": 1.8464, + "step": 13003 + }, + { + "epoch": 3.991405770411295, + "grad_norm": 0.28112682700157166, + "learning_rate": 6.83311503625709e-05, + "loss": 1.7427, + "step": 13004 + }, + { + "epoch": 3.9917127071823204, + "grad_norm": 0.2651563584804535, + "learning_rate": 6.832652581426958e-05, + "loss": 1.8117, + "step": 13005 + }, + { + "epoch": 3.9920196439533457, + "grad_norm": 0.3095388114452362, + "learning_rate": 6.83219010848548e-05, + "loss": 1.8286, + "step": 13006 + }, + { + "epoch": 3.9923265807243706, + "grad_norm": 0.24704942107200623, + "learning_rate": 6.831727617437225e-05, + "loss": 1.77, + "step": 13007 + }, + { + "epoch": 3.992633517495396, + "grad_norm": 0.24868519604206085, + "learning_rate": 6.831265108286764e-05, + "loss": 1.8129, + "step": 13008 + }, + { + "epoch": 3.992940454266421, + "grad_norm": 0.26511049270629883, + "learning_rate": 6.830802581038669e-05, + "loss": 1.7539, + "step": 13009 + }, + { + "epoch": 3.993247391037446, + "grad_norm": 0.2823421061038971, + "learning_rate": 6.830340035697508e-05, + 
"loss": 1.8068, + "step": 13010 + }, + { + "epoch": 3.9935543278084715, + "grad_norm": 0.28526121377944946, + "learning_rate": 6.829877472267856e-05, + "loss": 1.764, + "step": 13011 + }, + { + "epoch": 3.993861264579497, + "grad_norm": 0.2576456069946289, + "learning_rate": 6.829414890754281e-05, + "loss": 1.728, + "step": 13012 + }, + { + "epoch": 3.9941682013505218, + "grad_norm": 0.27154842019081116, + "learning_rate": 6.828952291161356e-05, + "loss": 1.797, + "step": 13013 + }, + { + "epoch": 3.994475138121547, + "grad_norm": 0.3129710555076599, + "learning_rate": 6.828489673493652e-05, + "loss": 1.769, + "step": 13014 + }, + { + "epoch": 3.994782074892572, + "grad_norm": 0.40118902921676636, + "learning_rate": 6.828027037755742e-05, + "loss": 1.8029, + "step": 13015 + }, + { + "epoch": 3.9950890116635973, + "grad_norm": 0.33228442072868347, + "learning_rate": 6.827564383952197e-05, + "loss": 1.7295, + "step": 13016 + }, + { + "epoch": 3.9953959484346226, + "grad_norm": 0.218771830201149, + "learning_rate": 6.827101712087591e-05, + "loss": 1.7693, + "step": 13017 + }, + { + "epoch": 3.9957028852056475, + "grad_norm": 0.31354373693466187, + "learning_rate": 6.826639022166492e-05, + "loss": 1.743, + "step": 13018 + }, + { + "epoch": 3.996009821976673, + "grad_norm": 0.3584701418876648, + "learning_rate": 6.826176314193478e-05, + "loss": 1.7597, + "step": 13019 + }, + { + "epoch": 3.9963167587476978, + "grad_norm": 0.2692064344882965, + "learning_rate": 6.82571358817312e-05, + "loss": 1.7871, + "step": 13020 + }, + { + "epoch": 3.996623695518723, + "grad_norm": 0.3064020276069641, + "learning_rate": 6.825250844109987e-05, + "loss": 1.7858, + "step": 13021 + }, + { + "epoch": 3.9969306322897484, + "grad_norm": 0.29913413524627686, + "learning_rate": 6.824788082008657e-05, + "loss": 1.7773, + "step": 13022 + }, + { + "epoch": 3.9972375690607733, + "grad_norm": 0.2682165801525116, + "learning_rate": 6.824325301873703e-05, + "loss": 1.8321, + "step": 13023 + }, + { + 
"epoch": 3.9975445058317987, + "grad_norm": 0.3274376690387726, + "learning_rate": 6.823862503709694e-05, + "loss": 1.8514, + "step": 13024 + }, + { + "epoch": 3.9978514426028235, + "grad_norm": 0.29828041791915894, + "learning_rate": 6.823399687521211e-05, + "loss": 1.7923, + "step": 13025 + }, + { + "epoch": 3.998158379373849, + "grad_norm": 0.22339288890361786, + "learning_rate": 6.82293685331282e-05, + "loss": 1.756, + "step": 13026 + }, + { + "epoch": 3.998465316144874, + "grad_norm": 0.2254658192396164, + "learning_rate": 6.8224740010891e-05, + "loss": 1.7392, + "step": 13027 + }, + { + "epoch": 3.9987722529158995, + "grad_norm": 0.24932752549648285, + "learning_rate": 6.822011130854624e-05, + "loss": 1.7538, + "step": 13028 + }, + { + "epoch": 3.9990791896869244, + "grad_norm": 0.21429690718650818, + "learning_rate": 6.821548242613966e-05, + "loss": 1.7746, + "step": 13029 + }, + { + "epoch": 3.9993861264579498, + "grad_norm": 0.25503116846084595, + "learning_rate": 6.8210853363717e-05, + "loss": 1.814, + "step": 13030 + }, + { + "epoch": 3.9996930632289747, + "grad_norm": 0.23168155550956726, + "learning_rate": 6.820622412132402e-05, + "loss": 1.769, + "step": 13031 + }, + { + "epoch": 4.0, + "grad_norm": 0.2252223789691925, + "learning_rate": 6.820159469900645e-05, + "loss": 1.7782, + "step": 13032 + }, + { + "epoch": 4.000306936771025, + "grad_norm": 0.1996588408946991, + "learning_rate": 6.819696509681007e-05, + "loss": 1.6839, + "step": 13033 + }, + { + "epoch": 4.000613873542051, + "grad_norm": 0.22297053039073944, + "learning_rate": 6.81923353147806e-05, + "loss": 1.7767, + "step": 13034 + }, + { + "epoch": 4.000920810313075, + "grad_norm": 0.25867611169815063, + "learning_rate": 6.818770535296381e-05, + "loss": 1.8623, + "step": 13035 + }, + { + "epoch": 4.0012277470841005, + "grad_norm": 0.2173648178577423, + "learning_rate": 6.818307521140547e-05, + "loss": 1.8034, + "step": 13036 + }, + { + "epoch": 4.001534683855126, + "grad_norm": 
0.23634609580039978, + "learning_rate": 6.81784448901513e-05, + "loss": 1.7503, + "step": 13037 + }, + { + "epoch": 4.001841620626151, + "grad_norm": 0.2626810073852539, + "learning_rate": 6.81738143892471e-05, + "loss": 1.8116, + "step": 13038 + }, + { + "epoch": 4.0021485573971765, + "grad_norm": 0.27888983488082886, + "learning_rate": 6.816918370873861e-05, + "loss": 1.8032, + "step": 13039 + }, + { + "epoch": 4.002455494168202, + "grad_norm": 0.275038480758667, + "learning_rate": 6.816455284867162e-05, + "loss": 1.7445, + "step": 13040 + }, + { + "epoch": 4.002762430939226, + "grad_norm": 0.3475828170776367, + "learning_rate": 6.815992180909184e-05, + "loss": 1.7404, + "step": 13041 + }, + { + "epoch": 4.003069367710252, + "grad_norm": 0.27314287424087524, + "learning_rate": 6.815529059004507e-05, + "loss": 1.8333, + "step": 13042 + }, + { + "epoch": 4.003376304481277, + "grad_norm": 0.34846973419189453, + "learning_rate": 6.815065919157709e-05, + "loss": 1.7921, + "step": 13043 + }, + { + "epoch": 4.003683241252302, + "grad_norm": 0.4191788136959076, + "learning_rate": 6.814602761373365e-05, + "loss": 1.8018, + "step": 13044 + }, + { + "epoch": 4.003990178023328, + "grad_norm": 0.2655608057975769, + "learning_rate": 6.814139585656055e-05, + "loss": 1.7638, + "step": 13045 + }, + { + "epoch": 4.004297114794352, + "grad_norm": 0.25938618183135986, + "learning_rate": 6.813676392010353e-05, + "loss": 1.794, + "step": 13046 + }, + { + "epoch": 4.004604051565377, + "grad_norm": 0.3464813828468323, + "learning_rate": 6.813213180440837e-05, + "loss": 1.8662, + "step": 13047 + }, + { + "epoch": 4.004910988336403, + "grad_norm": 0.30185338854789734, + "learning_rate": 6.812749950952087e-05, + "loss": 1.8029, + "step": 13048 + }, + { + "epoch": 4.005217925107428, + "grad_norm": 0.23291908204555511, + "learning_rate": 6.812286703548678e-05, + "loss": 1.7365, + "step": 13049 + }, + { + "epoch": 4.005524861878453, + "grad_norm": 0.3542841374874115, + "learning_rate": 
6.811823438235189e-05, + "loss": 1.8674, + "step": 13050 + }, + { + "epoch": 4.005831798649478, + "grad_norm": 0.2914685606956482, + "learning_rate": 6.811360155016202e-05, + "loss": 1.8306, + "step": 13051 + }, + { + "epoch": 4.006138735420503, + "grad_norm": 0.24888737499713898, + "learning_rate": 6.810896853896289e-05, + "loss": 1.7767, + "step": 13052 + }, + { + "epoch": 4.0064456721915285, + "grad_norm": 0.2977537512779236, + "learning_rate": 6.810433534880033e-05, + "loss": 1.8227, + "step": 13053 + }, + { + "epoch": 4.006752608962554, + "grad_norm": 0.3367510735988617, + "learning_rate": 6.809970197972013e-05, + "loss": 1.734, + "step": 13054 + }, + { + "epoch": 4.007059545733579, + "grad_norm": 0.28098800778388977, + "learning_rate": 6.809506843176806e-05, + "loss": 1.7032, + "step": 13055 + }, + { + "epoch": 4.0073664825046045, + "grad_norm": 0.24016784131526947, + "learning_rate": 6.809043470498991e-05, + "loss": 1.7863, + "step": 13056 + }, + { + "epoch": 4.007673419275629, + "grad_norm": 0.2883957624435425, + "learning_rate": 6.808580079943148e-05, + "loss": 1.7342, + "step": 13057 + }, + { + "epoch": 4.007980356046654, + "grad_norm": 0.3069116473197937, + "learning_rate": 6.808116671513856e-05, + "loss": 1.8544, + "step": 13058 + }, + { + "epoch": 4.00828729281768, + "grad_norm": 0.24113236367702484, + "learning_rate": 6.807653245215697e-05, + "loss": 1.7692, + "step": 13059 + }, + { + "epoch": 4.008594229588705, + "grad_norm": 0.2651619017124176, + "learning_rate": 6.807189801053249e-05, + "loss": 1.8096, + "step": 13060 + }, + { + "epoch": 4.00890116635973, + "grad_norm": 0.2636481523513794, + "learning_rate": 6.806726339031092e-05, + "loss": 1.8062, + "step": 13061 + }, + { + "epoch": 4.009208103130755, + "grad_norm": 0.22691169381141663, + "learning_rate": 6.806262859153807e-05, + "loss": 1.7001, + "step": 13062 + }, + { + "epoch": 4.00951503990178, + "grad_norm": 0.23288170993328094, + "learning_rate": 6.805799361425972e-05, + "loss": 1.7508, + 
"step": 13063 + }, + { + "epoch": 4.009821976672805, + "grad_norm": 0.243272602558136, + "learning_rate": 6.80533584585217e-05, + "loss": 1.7797, + "step": 13064 + }, + { + "epoch": 4.010128913443831, + "grad_norm": 0.24594646692276, + "learning_rate": 6.80487231243698e-05, + "loss": 1.7894, + "step": 13065 + }, + { + "epoch": 4.010435850214856, + "grad_norm": 0.21726086735725403, + "learning_rate": 6.804408761184986e-05, + "loss": 1.7472, + "step": 13066 + }, + { + "epoch": 4.0107427869858805, + "grad_norm": 0.2262321561574936, + "learning_rate": 6.803945192100767e-05, + "loss": 1.7563, + "step": 13067 + }, + { + "epoch": 4.011049723756906, + "grad_norm": 0.2449522763490677, + "learning_rate": 6.803481605188903e-05, + "loss": 1.7282, + "step": 13068 + }, + { + "epoch": 4.011356660527931, + "grad_norm": 0.2281760573387146, + "learning_rate": 6.803018000453975e-05, + "loss": 1.8191, + "step": 13069 + }, + { + "epoch": 4.0116635972989565, + "grad_norm": 0.3039850890636444, + "learning_rate": 6.80255437790057e-05, + "loss": 1.8258, + "step": 13070 + }, + { + "epoch": 4.011970534069982, + "grad_norm": 0.3978467881679535, + "learning_rate": 6.802090737533264e-05, + "loss": 1.7338, + "step": 13071 + }, + { + "epoch": 4.012277470841007, + "grad_norm": 0.29175812005996704, + "learning_rate": 6.801627079356641e-05, + "loss": 1.7754, + "step": 13072 + }, + { + "epoch": 4.012584407612032, + "grad_norm": 0.24228449165821075, + "learning_rate": 6.801163403375285e-05, + "loss": 1.7624, + "step": 13073 + }, + { + "epoch": 4.012891344383057, + "grad_norm": 0.34527531266212463, + "learning_rate": 6.800699709593776e-05, + "loss": 1.87, + "step": 13074 + }, + { + "epoch": 4.013198281154082, + "grad_norm": 0.1995161920785904, + "learning_rate": 6.800235998016696e-05, + "loss": 1.7253, + "step": 13075 + }, + { + "epoch": 4.013505217925108, + "grad_norm": 0.3509151339530945, + "learning_rate": 6.799772268648628e-05, + "loss": 1.8013, + "step": 13076 + }, + { + "epoch": 
4.013812154696133, + "grad_norm": 0.38569679856300354, + "learning_rate": 6.799308521494156e-05, + "loss": 1.7761, + "step": 13077 + }, + { + "epoch": 4.014119091467157, + "grad_norm": 0.2636256814002991, + "learning_rate": 6.798844756557865e-05, + "loss": 1.8101, + "step": 13078 + }, + { + "epoch": 4.014426028238183, + "grad_norm": 0.2570696473121643, + "learning_rate": 6.798380973844335e-05, + "loss": 1.7561, + "step": 13079 + }, + { + "epoch": 4.014732965009208, + "grad_norm": 0.38540002703666687, + "learning_rate": 6.797917173358148e-05, + "loss": 1.7893, + "step": 13080 + }, + { + "epoch": 4.015039901780233, + "grad_norm": 0.2974525988101959, + "learning_rate": 6.79745335510389e-05, + "loss": 1.8331, + "step": 13081 + }, + { + "epoch": 4.015346838551259, + "grad_norm": 0.2563362419605255, + "learning_rate": 6.796989519086146e-05, + "loss": 1.7784, + "step": 13082 + }, + { + "epoch": 4.015653775322283, + "grad_norm": 0.37037795782089233, + "learning_rate": 6.7965256653095e-05, + "loss": 1.7947, + "step": 13083 + }, + { + "epoch": 4.0159607120933085, + "grad_norm": 0.4145336449146271, + "learning_rate": 6.796061793778531e-05, + "loss": 1.7633, + "step": 13084 + }, + { + "epoch": 4.016267648864334, + "grad_norm": 0.32278406620025635, + "learning_rate": 6.795597904497828e-05, + "loss": 1.7827, + "step": 13085 + }, + { + "epoch": 4.016574585635359, + "grad_norm": 0.26466837525367737, + "learning_rate": 6.795133997471974e-05, + "loss": 1.7441, + "step": 13086 + }, + { + "epoch": 4.0168815224063845, + "grad_norm": 0.3212043344974518, + "learning_rate": 6.794670072705553e-05, + "loss": 1.7602, + "step": 13087 + }, + { + "epoch": 4.01718845917741, + "grad_norm": 0.3054736852645874, + "learning_rate": 6.79420613020315e-05, + "loss": 1.7417, + "step": 13088 + }, + { + "epoch": 4.017495395948434, + "grad_norm": 0.22281476855278015, + "learning_rate": 6.793742169969351e-05, + "loss": 1.7675, + "step": 13089 + }, + { + "epoch": 4.01780233271946, + "grad_norm": 
0.32630839943885803, + "learning_rate": 6.793278192008742e-05, + "loss": 1.8409, + "step": 13090 + }, + { + "epoch": 4.018109269490485, + "grad_norm": 0.2658778429031372, + "learning_rate": 6.792814196325905e-05, + "loss": 1.7718, + "step": 13091 + }, + { + "epoch": 4.01841620626151, + "grad_norm": 0.24016901850700378, + "learning_rate": 6.792350182925429e-05, + "loss": 1.8393, + "step": 13092 + }, + { + "epoch": 4.018723143032536, + "grad_norm": 0.2882223427295685, + "learning_rate": 6.791886151811897e-05, + "loss": 1.7497, + "step": 13093 + }, + { + "epoch": 4.01903007980356, + "grad_norm": 0.24340751767158508, + "learning_rate": 6.791422102989895e-05, + "loss": 1.72, + "step": 13094 + }, + { + "epoch": 4.019337016574585, + "grad_norm": 0.235665962100029, + "learning_rate": 6.79095803646401e-05, + "loss": 1.7269, + "step": 13095 + }, + { + "epoch": 4.019643953345611, + "grad_norm": 0.32772955298423767, + "learning_rate": 6.79049395223883e-05, + "loss": 1.7916, + "step": 13096 + }, + { + "epoch": 4.019950890116636, + "grad_norm": 0.3189625144004822, + "learning_rate": 6.790029850318938e-05, + "loss": 1.7571, + "step": 13097 + }, + { + "epoch": 4.020257826887661, + "grad_norm": 0.2211185097694397, + "learning_rate": 6.789565730708921e-05, + "loss": 1.793, + "step": 13098 + }, + { + "epoch": 4.020564763658686, + "grad_norm": 0.2840392291545868, + "learning_rate": 6.789101593413367e-05, + "loss": 1.7434, + "step": 13099 + }, + { + "epoch": 4.020871700429711, + "grad_norm": 0.27857357263565063, + "learning_rate": 6.788637438436863e-05, + "loss": 1.742, + "step": 13100 + }, + { + "epoch": 4.0211786372007365, + "grad_norm": 0.314628005027771, + "learning_rate": 6.788173265783996e-05, + "loss": 1.7881, + "step": 13101 + }, + { + "epoch": 4.021485573971762, + "grad_norm": 0.2994774580001831, + "learning_rate": 6.787709075459352e-05, + "loss": 1.7741, + "step": 13102 + }, + { + "epoch": 4.021792510742787, + "grad_norm": 0.3256312310695648, + "learning_rate": 
6.787244867467519e-05, + "loss": 1.7758, + "step": 13103 + }, + { + "epoch": 4.0220994475138125, + "grad_norm": 0.2332412451505661, + "learning_rate": 6.786780641813083e-05, + "loss": 1.7654, + "step": 13104 + }, + { + "epoch": 4.022406384284837, + "grad_norm": 0.23226258158683777, + "learning_rate": 6.786316398500636e-05, + "loss": 1.7605, + "step": 13105 + }, + { + "epoch": 4.022713321055862, + "grad_norm": 0.24631965160369873, + "learning_rate": 6.785852137534763e-05, + "loss": 1.7469, + "step": 13106 + }, + { + "epoch": 4.023020257826888, + "grad_norm": 0.1969226449728012, + "learning_rate": 6.785387858920051e-05, + "loss": 1.8151, + "step": 13107 + }, + { + "epoch": 4.023327194597913, + "grad_norm": 0.22769485414028168, + "learning_rate": 6.784923562661091e-05, + "loss": 1.7024, + "step": 13108 + }, + { + "epoch": 4.023634131368938, + "grad_norm": 0.2174670249223709, + "learning_rate": 6.78445924876247e-05, + "loss": 1.8094, + "step": 13109 + }, + { + "epoch": 4.023941068139963, + "grad_norm": 0.2606858015060425, + "learning_rate": 6.783994917228775e-05, + "loss": 1.8043, + "step": 13110 + }, + { + "epoch": 4.024248004910988, + "grad_norm": 0.24721349775791168, + "learning_rate": 6.783530568064599e-05, + "loss": 1.842, + "step": 13111 + }, + { + "epoch": 4.024554941682013, + "grad_norm": 0.2353603094816208, + "learning_rate": 6.783066201274529e-05, + "loss": 1.76, + "step": 13112 + }, + { + "epoch": 4.024861878453039, + "grad_norm": 0.22285830974578857, + "learning_rate": 6.782601816863153e-05, + "loss": 1.8014, + "step": 13113 + }, + { + "epoch": 4.025168815224064, + "grad_norm": 0.2482440173625946, + "learning_rate": 6.782137414835061e-05, + "loss": 1.7552, + "step": 13114 + }, + { + "epoch": 4.0254757519950894, + "grad_norm": 0.19926191866397858, + "learning_rate": 6.781672995194842e-05, + "loss": 1.7549, + "step": 13115 + }, + { + "epoch": 4.025782688766114, + "grad_norm": 0.2342877984046936, + "learning_rate": 6.781208557947086e-05, + "loss": 1.8622, + 
"step": 13116 + }, + { + "epoch": 4.026089625537139, + "grad_norm": 0.24096547067165375, + "learning_rate": 6.780744103096382e-05, + "loss": 1.7795, + "step": 13117 + }, + { + "epoch": 4.026396562308165, + "grad_norm": 0.23714657127857208, + "learning_rate": 6.780279630647322e-05, + "loss": 1.799, + "step": 13118 + }, + { + "epoch": 4.02670349907919, + "grad_norm": 0.28252026438713074, + "learning_rate": 6.779815140604496e-05, + "loss": 1.7573, + "step": 13119 + }, + { + "epoch": 4.027010435850215, + "grad_norm": 0.28028404712677, + "learning_rate": 6.779350632972493e-05, + "loss": 1.8103, + "step": 13120 + }, + { + "epoch": 4.02731737262124, + "grad_norm": 0.21088312566280365, + "learning_rate": 6.778886107755904e-05, + "loss": 1.7169, + "step": 13121 + }, + { + "epoch": 4.027624309392265, + "grad_norm": 0.22282038629055023, + "learning_rate": 6.77842156495932e-05, + "loss": 1.7206, + "step": 13122 + }, + { + "epoch": 4.02793124616329, + "grad_norm": 0.3281327784061432, + "learning_rate": 6.777957004587331e-05, + "loss": 1.8664, + "step": 13123 + }, + { + "epoch": 4.028238182934316, + "grad_norm": 0.29496827721595764, + "learning_rate": 6.77749242664453e-05, + "loss": 1.7532, + "step": 13124 + }, + { + "epoch": 4.028545119705341, + "grad_norm": 0.25299328565597534, + "learning_rate": 6.777027831135508e-05, + "loss": 1.7836, + "step": 13125 + }, + { + "epoch": 4.0288520564763655, + "grad_norm": 0.3000280559062958, + "learning_rate": 6.776563218064854e-05, + "loss": 1.8079, + "step": 13126 + }, + { + "epoch": 4.029158993247391, + "grad_norm": 0.3613673448562622, + "learning_rate": 6.77609858743716e-05, + "loss": 1.7931, + "step": 13127 + }, + { + "epoch": 4.029465930018416, + "grad_norm": 0.25613468885421753, + "learning_rate": 6.77563393925702e-05, + "loss": 1.7522, + "step": 13128 + }, + { + "epoch": 4.0297728667894415, + "grad_norm": 0.24391578137874603, + "learning_rate": 6.775169273529026e-05, + "loss": 1.818, + "step": 13129 + }, + { + "epoch": 
4.030079803560467, + "grad_norm": 0.2806173264980316, + "learning_rate": 6.774704590257768e-05, + "loss": 1.7349, + "step": 13130 + }, + { + "epoch": 4.030386740331492, + "grad_norm": 0.22214172780513763, + "learning_rate": 6.774239889447838e-05, + "loss": 1.759, + "step": 13131 + }, + { + "epoch": 4.030693677102517, + "grad_norm": 0.27285513281822205, + "learning_rate": 6.773775171103828e-05, + "loss": 1.742, + "step": 13132 + }, + { + "epoch": 4.031000613873542, + "grad_norm": 0.22302402555942535, + "learning_rate": 6.773310435230334e-05, + "loss": 1.7277, + "step": 13133 + }, + { + "epoch": 4.031307550644567, + "grad_norm": 0.2350187450647354, + "learning_rate": 6.772845681831947e-05, + "loss": 1.8648, + "step": 13134 + }, + { + "epoch": 4.031614487415593, + "grad_norm": 0.2665547728538513, + "learning_rate": 6.772380910913261e-05, + "loss": 1.776, + "step": 13135 + }, + { + "epoch": 4.031921424186618, + "grad_norm": 0.30652403831481934, + "learning_rate": 6.771916122478867e-05, + "loss": 1.7884, + "step": 13136 + }, + { + "epoch": 4.032228360957642, + "grad_norm": 0.29372814297676086, + "learning_rate": 6.771451316533359e-05, + "loss": 1.8203, + "step": 13137 + }, + { + "epoch": 4.032535297728668, + "grad_norm": 0.2244873046875, + "learning_rate": 6.770986493081329e-05, + "loss": 1.7869, + "step": 13138 + }, + { + "epoch": 4.032842234499693, + "grad_norm": 0.25075265765190125, + "learning_rate": 6.770521652127375e-05, + "loss": 1.772, + "step": 13139 + }, + { + "epoch": 4.033149171270718, + "grad_norm": 0.28118211030960083, + "learning_rate": 6.770056793676087e-05, + "loss": 1.7922, + "step": 13140 + }, + { + "epoch": 4.033456108041744, + "grad_norm": 0.25199100375175476, + "learning_rate": 6.769591917732062e-05, + "loss": 1.7526, + "step": 13141 + }, + { + "epoch": 4.033763044812768, + "grad_norm": 0.2920379638671875, + "learning_rate": 6.769127024299892e-05, + "loss": 1.8365, + "step": 13142 + }, + { + "epoch": 4.0340699815837935, + "grad_norm": 
0.23018018901348114, + "learning_rate": 6.768662113384171e-05, + "loss": 1.7411, + "step": 13143 + }, + { + "epoch": 4.034376918354819, + "grad_norm": 0.23253841698169708, + "learning_rate": 6.768197184989494e-05, + "loss": 1.7921, + "step": 13144 + }, + { + "epoch": 4.034683855125844, + "grad_norm": 0.22618864476680756, + "learning_rate": 6.767732239120456e-05, + "loss": 1.7421, + "step": 13145 + }, + { + "epoch": 4.0349907918968695, + "grad_norm": 0.24552187323570251, + "learning_rate": 6.767267275781655e-05, + "loss": 1.7299, + "step": 13146 + }, + { + "epoch": 4.035297728667895, + "grad_norm": 0.22562766075134277, + "learning_rate": 6.76680229497768e-05, + "loss": 1.766, + "step": 13147 + }, + { + "epoch": 4.035604665438919, + "grad_norm": 0.28718629479408264, + "learning_rate": 6.76633729671313e-05, + "loss": 1.7366, + "step": 13148 + }, + { + "epoch": 4.035911602209945, + "grad_norm": 0.38769885897636414, + "learning_rate": 6.765872280992598e-05, + "loss": 1.8244, + "step": 13149 + }, + { + "epoch": 4.03621853898097, + "grad_norm": 0.4232725501060486, + "learning_rate": 6.765407247820683e-05, + "loss": 1.8244, + "step": 13150 + }, + { + "epoch": 4.036525475751995, + "grad_norm": 0.2771088778972626, + "learning_rate": 6.764942197201977e-05, + "loss": 1.7863, + "step": 13151 + }, + { + "epoch": 4.036832412523021, + "grad_norm": 0.2917862832546234, + "learning_rate": 6.76447712914108e-05, + "loss": 1.791, + "step": 13152 + }, + { + "epoch": 4.037139349294045, + "grad_norm": 0.37355467677116394, + "learning_rate": 6.764012043642584e-05, + "loss": 1.74, + "step": 13153 + }, + { + "epoch": 4.03744628606507, + "grad_norm": 0.35664018988609314, + "learning_rate": 6.763546940711089e-05, + "loss": 1.7734, + "step": 13154 + }, + { + "epoch": 4.037753222836096, + "grad_norm": 0.2335754930973053, + "learning_rate": 6.763081820351188e-05, + "loss": 1.7765, + "step": 13155 + }, + { + "epoch": 4.038060159607121, + "grad_norm": 0.2825562357902527, + "learning_rate": 
6.762616682567478e-05, + "loss": 1.7867, + "step": 13156 + }, + { + "epoch": 4.038367096378146, + "grad_norm": 0.3103202283382416, + "learning_rate": 6.762151527364559e-05, + "loss": 1.7331, + "step": 13157 + }, + { + "epoch": 4.038674033149171, + "grad_norm": 0.2897353172302246, + "learning_rate": 6.761686354747025e-05, + "loss": 1.7638, + "step": 13158 + }, + { + "epoch": 4.038980969920196, + "grad_norm": 0.21260851621627808, + "learning_rate": 6.761221164719474e-05, + "loss": 1.7302, + "step": 13159 + }, + { + "epoch": 4.0392879066912215, + "grad_norm": 0.2878021001815796, + "learning_rate": 6.760755957286503e-05, + "loss": 1.7368, + "step": 13160 + }, + { + "epoch": 4.039594843462247, + "grad_norm": 0.2785978317260742, + "learning_rate": 6.76029073245271e-05, + "loss": 1.7258, + "step": 13161 + }, + { + "epoch": 4.039901780233272, + "grad_norm": 0.1963953971862793, + "learning_rate": 6.759825490222692e-05, + "loss": 1.755, + "step": 13162 + }, + { + "epoch": 4.0402087170042975, + "grad_norm": 0.26776790618896484, + "learning_rate": 6.759360230601047e-05, + "loss": 1.7676, + "step": 13163 + }, + { + "epoch": 4.040515653775322, + "grad_norm": 0.2751332223415375, + "learning_rate": 6.758894953592373e-05, + "loss": 1.7313, + "step": 13164 + }, + { + "epoch": 4.040822590546347, + "grad_norm": 0.2339213341474533, + "learning_rate": 6.758429659201269e-05, + "loss": 1.714, + "step": 13165 + }, + { + "epoch": 4.041129527317373, + "grad_norm": 0.2624664008617401, + "learning_rate": 6.75796434743233e-05, + "loss": 1.8296, + "step": 13166 + }, + { + "epoch": 4.041436464088398, + "grad_norm": 0.40156883001327515, + "learning_rate": 6.757499018290159e-05, + "loss": 1.8228, + "step": 13167 + }, + { + "epoch": 4.041743400859423, + "grad_norm": 0.32976576685905457, + "learning_rate": 6.757033671779352e-05, + "loss": 1.7403, + "step": 13168 + }, + { + "epoch": 4.042050337630448, + "grad_norm": 0.2343887835741043, + "learning_rate": 6.756568307904508e-05, + "loss": 1.7837, + 
"step": 13169 + }, + { + "epoch": 4.042357274401473, + "grad_norm": 0.36174145340919495, + "learning_rate": 6.756102926670227e-05, + "loss": 1.7291, + "step": 13170 + }, + { + "epoch": 4.042664211172498, + "grad_norm": 0.3324793577194214, + "learning_rate": 6.755637528081108e-05, + "loss": 1.7414, + "step": 13171 + }, + { + "epoch": 4.042971147943524, + "grad_norm": 0.21945348381996155, + "learning_rate": 6.75517211214175e-05, + "loss": 1.7762, + "step": 13172 + }, + { + "epoch": 4.043278084714549, + "grad_norm": 0.31069812178611755, + "learning_rate": 6.75470667885675e-05, + "loss": 1.7666, + "step": 13173 + }, + { + "epoch": 4.043585021485574, + "grad_norm": 0.3931153118610382, + "learning_rate": 6.754241228230713e-05, + "loss": 1.7871, + "step": 13174 + }, + { + "epoch": 4.043891958256599, + "grad_norm": 0.25559595227241516, + "learning_rate": 6.753775760268234e-05, + "loss": 1.7916, + "step": 13175 + }, + { + "epoch": 4.044198895027624, + "grad_norm": 0.3686937391757965, + "learning_rate": 6.753310274973917e-05, + "loss": 1.7642, + "step": 13176 + }, + { + "epoch": 4.0445058317986495, + "grad_norm": 0.4793247580528259, + "learning_rate": 6.75284477235236e-05, + "loss": 1.739, + "step": 13177 + }, + { + "epoch": 4.044812768569675, + "grad_norm": 0.36179354786872864, + "learning_rate": 6.752379252408164e-05, + "loss": 1.7993, + "step": 13178 + }, + { + "epoch": 4.0451197053407, + "grad_norm": 0.22559234499931335, + "learning_rate": 6.751913715145926e-05, + "loss": 1.7401, + "step": 13179 + }, + { + "epoch": 4.045426642111725, + "grad_norm": 0.29058873653411865, + "learning_rate": 6.751448160570253e-05, + "loss": 1.8089, + "step": 13180 + }, + { + "epoch": 4.04573357888275, + "grad_norm": 0.3069808781147003, + "learning_rate": 6.750982588685742e-05, + "loss": 1.7587, + "step": 13181 + }, + { + "epoch": 4.046040515653775, + "grad_norm": 0.2292155921459198, + "learning_rate": 6.750516999496994e-05, + "loss": 1.7429, + "step": 13182 + }, + { + "epoch": 
4.046347452424801, + "grad_norm": 0.2520677149295807, + "learning_rate": 6.750051393008612e-05, + "loss": 1.7842, + "step": 13183 + }, + { + "epoch": 4.046654389195826, + "grad_norm": 0.32546502351760864, + "learning_rate": 6.749585769225194e-05, + "loss": 1.8057, + "step": 13184 + }, + { + "epoch": 4.04696132596685, + "grad_norm": 0.27634644508361816, + "learning_rate": 6.749120128151346e-05, + "loss": 1.7708, + "step": 13185 + }, + { + "epoch": 4.047268262737876, + "grad_norm": 0.2546750009059906, + "learning_rate": 6.748654469791668e-05, + "loss": 1.8744, + "step": 13186 + }, + { + "epoch": 4.047575199508901, + "grad_norm": 0.43873605132102966, + "learning_rate": 6.748188794150761e-05, + "loss": 1.8573, + "step": 13187 + }, + { + "epoch": 4.047882136279926, + "grad_norm": 0.45526960492134094, + "learning_rate": 6.747723101233227e-05, + "loss": 1.7761, + "step": 13188 + }, + { + "epoch": 4.048189073050952, + "grad_norm": 0.24995557963848114, + "learning_rate": 6.74725739104367e-05, + "loss": 1.7679, + "step": 13189 + }, + { + "epoch": 4.048496009821977, + "grad_norm": 0.3203068971633911, + "learning_rate": 6.74679166358669e-05, + "loss": 1.7772, + "step": 13190 + }, + { + "epoch": 4.0488029465930016, + "grad_norm": 0.37020671367645264, + "learning_rate": 6.746325918866893e-05, + "loss": 1.8002, + "step": 13191 + }, + { + "epoch": 4.049109883364027, + "grad_norm": 0.2543959319591522, + "learning_rate": 6.745860156888878e-05, + "loss": 1.8057, + "step": 13192 + }, + { + "epoch": 4.049416820135052, + "grad_norm": 0.2566509246826172, + "learning_rate": 6.74539437765725e-05, + "loss": 1.7853, + "step": 13193 + }, + { + "epoch": 4.0497237569060776, + "grad_norm": 0.2545804977416992, + "learning_rate": 6.744928581176612e-05, + "loss": 1.8136, + "step": 13194 + }, + { + "epoch": 4.050030693677103, + "grad_norm": 0.24307197332382202, + "learning_rate": 6.744462767451568e-05, + "loss": 1.7919, + "step": 13195 + }, + { + "epoch": 4.050337630448127, + "grad_norm": 
0.24427616596221924, + "learning_rate": 6.743996936486719e-05, + "loss": 1.8037, + "step": 13196 + }, + { + "epoch": 4.050644567219153, + "grad_norm": 0.2154439389705658, + "learning_rate": 6.743531088286673e-05, + "loss": 1.7088, + "step": 13197 + }, + { + "epoch": 4.050951503990178, + "grad_norm": 0.22251558303833008, + "learning_rate": 6.743065222856027e-05, + "loss": 1.7512, + "step": 13198 + }, + { + "epoch": 4.051258440761203, + "grad_norm": 0.2373272329568863, + "learning_rate": 6.74259934019939e-05, + "loss": 1.8056, + "step": 13199 + }, + { + "epoch": 4.051565377532229, + "grad_norm": 0.23308727145195007, + "learning_rate": 6.742133440321366e-05, + "loss": 1.731, + "step": 13200 + }, + { + "epoch": 4.051872314303253, + "grad_norm": 0.2438805252313614, + "learning_rate": 6.741667523226557e-05, + "loss": 1.7938, + "step": 13201 + }, + { + "epoch": 4.0521792510742785, + "grad_norm": 0.22354702651500702, + "learning_rate": 6.741201588919569e-05, + "loss": 1.762, + "step": 13202 + }, + { + "epoch": 4.052486187845304, + "grad_norm": 0.2505488097667694, + "learning_rate": 6.740735637405006e-05, + "loss": 1.7627, + "step": 13203 + }, + { + "epoch": 4.052793124616329, + "grad_norm": 0.21378709375858307, + "learning_rate": 6.740269668687474e-05, + "loss": 1.7598, + "step": 13204 + }, + { + "epoch": 4.0531000613873545, + "grad_norm": 0.24863660335540771, + "learning_rate": 6.739803682771577e-05, + "loss": 1.7665, + "step": 13205 + }, + { + "epoch": 4.05340699815838, + "grad_norm": 0.3041808605194092, + "learning_rate": 6.739337679661921e-05, + "loss": 1.7909, + "step": 13206 + }, + { + "epoch": 4.053713934929404, + "grad_norm": 0.2745797634124756, + "learning_rate": 6.738871659363109e-05, + "loss": 1.7547, + "step": 13207 + }, + { + "epoch": 4.05402087170043, + "grad_norm": 0.2610073387622833, + "learning_rate": 6.738405621879748e-05, + "loss": 1.7723, + "step": 13208 + }, + { + "epoch": 4.054327808471455, + "grad_norm": 0.22728075087070465, + "learning_rate": 
6.737939567216446e-05, + "loss": 1.7865, + "step": 13209 + }, + { + "epoch": 4.05463474524248, + "grad_norm": 0.2877669930458069, + "learning_rate": 6.737473495377804e-05, + "loss": 1.8352, + "step": 13210 + }, + { + "epoch": 4.054941682013506, + "grad_norm": 0.35316282510757446, + "learning_rate": 6.737007406368432e-05, + "loss": 1.8202, + "step": 13211 + }, + { + "epoch": 4.05524861878453, + "grad_norm": 0.34625691175460815, + "learning_rate": 6.736541300192936e-05, + "loss": 1.8456, + "step": 13212 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.2432134598493576, + "learning_rate": 6.736075176855917e-05, + "loss": 1.8237, + "step": 13213 + }, + { + "epoch": 4.055862492326581, + "grad_norm": 0.27446529269218445, + "learning_rate": 6.735609036361989e-05, + "loss": 1.71, + "step": 13214 + }, + { + "epoch": 4.056169429097606, + "grad_norm": 0.2870408892631531, + "learning_rate": 6.735142878715754e-05, + "loss": 1.7473, + "step": 13215 + }, + { + "epoch": 4.056476365868631, + "grad_norm": 0.22249078750610352, + "learning_rate": 6.734676703921822e-05, + "loss": 1.7462, + "step": 13216 + }, + { + "epoch": 4.056783302639656, + "grad_norm": 0.25519105792045593, + "learning_rate": 6.734210511984796e-05, + "loss": 1.7022, + "step": 13217 + }, + { + "epoch": 4.057090239410681, + "grad_norm": 0.3366561830043793, + "learning_rate": 6.733744302909285e-05, + "loss": 1.787, + "step": 13218 + }, + { + "epoch": 4.0573971761817065, + "grad_norm": 0.2443208247423172, + "learning_rate": 6.733278076699897e-05, + "loss": 1.8048, + "step": 13219 + }, + { + "epoch": 4.057704112952732, + "grad_norm": 0.2893153131008148, + "learning_rate": 6.73281183336124e-05, + "loss": 1.7805, + "step": 13220 + }, + { + "epoch": 4.058011049723757, + "grad_norm": 0.3178043067455292, + "learning_rate": 6.73234557289792e-05, + "loss": 1.8264, + "step": 13221 + }, + { + "epoch": 4.0583179864947825, + "grad_norm": 0.27355703711509705, + "learning_rate": 6.731879295314546e-05, + "loss": 1.8427, + 
"step": 13222 + }, + { + "epoch": 4.058624923265807, + "grad_norm": 0.32180166244506836, + "learning_rate": 6.731413000615726e-05, + "loss": 1.7332, + "step": 13223 + }, + { + "epoch": 4.058931860036832, + "grad_norm": 0.3736574351787567, + "learning_rate": 6.730946688806067e-05, + "loss": 1.7447, + "step": 13224 + }, + { + "epoch": 4.059238796807858, + "grad_norm": 0.2526068687438965, + "learning_rate": 6.73048035989018e-05, + "loss": 1.8104, + "step": 13225 + }, + { + "epoch": 4.059545733578883, + "grad_norm": 0.29076167941093445, + "learning_rate": 6.73001401387267e-05, + "loss": 1.7977, + "step": 13226 + }, + { + "epoch": 4.059852670349908, + "grad_norm": 0.37963762879371643, + "learning_rate": 6.729547650758148e-05, + "loss": 1.8336, + "step": 13227 + }, + { + "epoch": 4.060159607120933, + "grad_norm": 0.31584078073501587, + "learning_rate": 6.729081270551222e-05, + "loss": 1.7843, + "step": 13228 + }, + { + "epoch": 4.060466543891958, + "grad_norm": 0.22793468832969666, + "learning_rate": 6.728614873256502e-05, + "loss": 1.7444, + "step": 13229 + }, + { + "epoch": 4.060773480662983, + "grad_norm": 0.3114435076713562, + "learning_rate": 6.728148458878596e-05, + "loss": 1.8012, + "step": 13230 + }, + { + "epoch": 4.061080417434009, + "grad_norm": 0.29843854904174805, + "learning_rate": 6.727682027422116e-05, + "loss": 1.8014, + "step": 13231 + }, + { + "epoch": 4.061387354205034, + "grad_norm": 0.22745616734027863, + "learning_rate": 6.727215578891668e-05, + "loss": 1.7303, + "step": 13232 + }, + { + "epoch": 4.0616942909760585, + "grad_norm": 0.2701241970062256, + "learning_rate": 6.726749113291864e-05, + "loss": 1.7665, + "step": 13233 + }, + { + "epoch": 4.062001227747084, + "grad_norm": 0.29304635524749756, + "learning_rate": 6.726282630627313e-05, + "loss": 1.875, + "step": 13234 + }, + { + "epoch": 4.062308164518109, + "grad_norm": 0.21467708051204681, + "learning_rate": 6.725816130902625e-05, + "loss": 1.7442, + "step": 13235 + }, + { + "epoch": 
4.0626151012891345, + "grad_norm": 0.23517470061779022, + "learning_rate": 6.72534961412241e-05, + "loss": 1.7154, + "step": 13236 + }, + { + "epoch": 4.06292203806016, + "grad_norm": 0.21483808755874634, + "learning_rate": 6.724883080291278e-05, + "loss": 1.7162, + "step": 13237 + }, + { + "epoch": 4.063228974831185, + "grad_norm": 0.2274744212627411, + "learning_rate": 6.724416529413843e-05, + "loss": 1.8066, + "step": 13238 + }, + { + "epoch": 4.06353591160221, + "grad_norm": 0.24682378768920898, + "learning_rate": 6.723949961494712e-05, + "loss": 1.7905, + "step": 13239 + }, + { + "epoch": 4.063842848373235, + "grad_norm": 0.2516227066516876, + "learning_rate": 6.723483376538498e-05, + "loss": 1.7693, + "step": 13240 + }, + { + "epoch": 4.06414978514426, + "grad_norm": 0.22076398134231567, + "learning_rate": 6.723016774549808e-05, + "loss": 1.7357, + "step": 13241 + }, + { + "epoch": 4.064456721915286, + "grad_norm": 0.20741026103496552, + "learning_rate": 6.722550155533258e-05, + "loss": 1.8082, + "step": 13242 + }, + { + "epoch": 4.064763658686311, + "grad_norm": 0.2074010819196701, + "learning_rate": 6.722083519493458e-05, + "loss": 1.71, + "step": 13243 + }, + { + "epoch": 4.065070595457335, + "grad_norm": 0.2661527991294861, + "learning_rate": 6.72161686643502e-05, + "loss": 1.7448, + "step": 13244 + }, + { + "epoch": 4.065377532228361, + "grad_norm": 0.2877216935157776, + "learning_rate": 6.721150196362555e-05, + "loss": 1.7574, + "step": 13245 + }, + { + "epoch": 4.065684468999386, + "grad_norm": 0.2520955801010132, + "learning_rate": 6.720683509280675e-05, + "loss": 1.7717, + "step": 13246 + }, + { + "epoch": 4.065991405770411, + "grad_norm": 0.2219560444355011, + "learning_rate": 6.72021680519399e-05, + "loss": 1.7355, + "step": 13247 + }, + { + "epoch": 4.066298342541437, + "grad_norm": 0.24671706557273865, + "learning_rate": 6.719750084107117e-05, + "loss": 1.8204, + "step": 13248 + }, + { + "epoch": 4.066605279312462, + "grad_norm": 
0.24512135982513428, + "learning_rate": 6.719283346024664e-05, + "loss": 1.826, + "step": 13249 + }, + { + "epoch": 4.0669122160834865, + "grad_norm": 0.24370841681957245, + "learning_rate": 6.718816590951247e-05, + "loss": 1.8322, + "step": 13250 + }, + { + "epoch": 4.067219152854512, + "grad_norm": 0.2312363088130951, + "learning_rate": 6.718349818891475e-05, + "loss": 1.7621, + "step": 13251 + }, + { + "epoch": 4.067526089625537, + "grad_norm": 0.2500494420528412, + "learning_rate": 6.717883029849965e-05, + "loss": 1.829, + "step": 13252 + }, + { + "epoch": 4.0678330263965625, + "grad_norm": 0.29882633686065674, + "learning_rate": 6.717416223831324e-05, + "loss": 1.799, + "step": 13253 + }, + { + "epoch": 4.068139963167588, + "grad_norm": 0.21962928771972656, + "learning_rate": 6.716949400840172e-05, + "loss": 1.7714, + "step": 13254 + }, + { + "epoch": 4.068446899938612, + "grad_norm": 0.25544899702072144, + "learning_rate": 6.716482560881121e-05, + "loss": 1.7911, + "step": 13255 + }, + { + "epoch": 4.068753836709638, + "grad_norm": 0.24865686893463135, + "learning_rate": 6.716015703958781e-05, + "loss": 1.7107, + "step": 13256 + }, + { + "epoch": 4.069060773480663, + "grad_norm": 0.22669239342212677, + "learning_rate": 6.715548830077769e-05, + "loss": 1.8503, + "step": 13257 + }, + { + "epoch": 4.069367710251688, + "grad_norm": 0.2973819077014923, + "learning_rate": 6.715081939242698e-05, + "loss": 1.7859, + "step": 13258 + }, + { + "epoch": 4.069674647022714, + "grad_norm": 0.3178746700286865, + "learning_rate": 6.714615031458181e-05, + "loss": 1.7705, + "step": 13259 + }, + { + "epoch": 4.069981583793738, + "grad_norm": 0.20452535152435303, + "learning_rate": 6.714148106728835e-05, + "loss": 1.7386, + "step": 13260 + }, + { + "epoch": 4.070288520564763, + "grad_norm": 0.30288320779800415, + "learning_rate": 6.713681165059271e-05, + "loss": 1.7823, + "step": 13261 + }, + { + "epoch": 4.070595457335789, + "grad_norm": 0.30014416575431824, + "learning_rate": 
6.713214206454107e-05, + "loss": 1.7626, + "step": 13262 + }, + { + "epoch": 4.070902394106814, + "grad_norm": 0.25144243240356445, + "learning_rate": 6.712747230917956e-05, + "loss": 1.8359, + "step": 13263 + }, + { + "epoch": 4.071209330877839, + "grad_norm": 0.308148592710495, + "learning_rate": 6.712280238455432e-05, + "loss": 1.7226, + "step": 13264 + }, + { + "epoch": 4.071516267648865, + "grad_norm": 0.2704198658466339, + "learning_rate": 6.711813229071151e-05, + "loss": 1.7982, + "step": 13265 + }, + { + "epoch": 4.071823204419889, + "grad_norm": 0.3928656280040741, + "learning_rate": 6.711346202769729e-05, + "loss": 1.7987, + "step": 13266 + }, + { + "epoch": 4.0721301411909145, + "grad_norm": 0.3603350520133972, + "learning_rate": 6.71087915955578e-05, + "loss": 1.7963, + "step": 13267 + }, + { + "epoch": 4.07243707796194, + "grad_norm": 0.2673214077949524, + "learning_rate": 6.710412099433921e-05, + "loss": 1.8011, + "step": 13268 + }, + { + "epoch": 4.072744014732965, + "grad_norm": 0.2523653209209442, + "learning_rate": 6.709945022408768e-05, + "loss": 1.755, + "step": 13269 + }, + { + "epoch": 4.0730509515039905, + "grad_norm": 0.3818903863430023, + "learning_rate": 6.709477928484934e-05, + "loss": 1.7968, + "step": 13270 + }, + { + "epoch": 4.073357888275015, + "grad_norm": 0.31509929895401, + "learning_rate": 6.709010817667039e-05, + "loss": 1.744, + "step": 13271 + }, + { + "epoch": 4.07366482504604, + "grad_norm": 0.21875518560409546, + "learning_rate": 6.708543689959697e-05, + "loss": 1.7511, + "step": 13272 + }, + { + "epoch": 4.073971761817066, + "grad_norm": 0.25381338596343994, + "learning_rate": 6.708076545367523e-05, + "loss": 1.7523, + "step": 13273 + }, + { + "epoch": 4.074278698588091, + "grad_norm": 0.24193842709064484, + "learning_rate": 6.707609383895137e-05, + "loss": 1.7713, + "step": 13274 + }, + { + "epoch": 4.074585635359116, + "grad_norm": 0.21972359716892242, + "learning_rate": 6.707142205547154e-05, + "loss": 1.7329, + "step": 
13275 + }, + { + "epoch": 4.074892572130141, + "grad_norm": 0.22188499569892883, + "learning_rate": 6.706675010328192e-05, + "loss": 1.7507, + "step": 13276 + }, + { + "epoch": 4.075199508901166, + "grad_norm": 0.23344436287879944, + "learning_rate": 6.706207798242865e-05, + "loss": 1.771, + "step": 13277 + }, + { + "epoch": 4.0755064456721914, + "grad_norm": 0.3008805513381958, + "learning_rate": 6.705740569295795e-05, + "loss": 1.775, + "step": 13278 + }, + { + "epoch": 4.075813382443217, + "grad_norm": 0.31407982110977173, + "learning_rate": 6.705273323491595e-05, + "loss": 1.7625, + "step": 13279 + }, + { + "epoch": 4.076120319214242, + "grad_norm": 0.2430381178855896, + "learning_rate": 6.704806060834886e-05, + "loss": 1.7706, + "step": 13280 + }, + { + "epoch": 4.0764272559852675, + "grad_norm": 0.23250171542167664, + "learning_rate": 6.704338781330284e-05, + "loss": 1.7977, + "step": 13281 + }, + { + "epoch": 4.076734192756292, + "grad_norm": 0.22073723375797272, + "learning_rate": 6.703871484982407e-05, + "loss": 1.7686, + "step": 13282 + }, + { + "epoch": 4.077041129527317, + "grad_norm": 0.24987035989761353, + "learning_rate": 6.703404171795874e-05, + "loss": 1.736, + "step": 13283 + }, + { + "epoch": 4.077348066298343, + "grad_norm": 0.2697623670101166, + "learning_rate": 6.702936841775301e-05, + "loss": 1.8367, + "step": 13284 + }, + { + "epoch": 4.077655003069368, + "grad_norm": 0.21592749655246735, + "learning_rate": 6.702469494925309e-05, + "loss": 1.7467, + "step": 13285 + }, + { + "epoch": 4.077961939840393, + "grad_norm": 0.2612052261829376, + "learning_rate": 6.702002131250515e-05, + "loss": 1.7689, + "step": 13286 + }, + { + "epoch": 4.078268876611418, + "grad_norm": 0.3004797697067261, + "learning_rate": 6.701534750755539e-05, + "loss": 1.7586, + "step": 13287 + }, + { + "epoch": 4.078575813382443, + "grad_norm": 0.24615366756916046, + "learning_rate": 6.701067353444998e-05, + "loss": 1.7636, + "step": 13288 + }, + { + "epoch": 
4.078882750153468, + "grad_norm": 0.23401159048080444, + "learning_rate": 6.700599939323515e-05, + "loss": 1.8015, + "step": 13289 + }, + { + "epoch": 4.079189686924494, + "grad_norm": 0.24546295404434204, + "learning_rate": 6.700132508395705e-05, + "loss": 1.7606, + "step": 13290 + }, + { + "epoch": 4.079496623695519, + "grad_norm": 0.24664412438869476, + "learning_rate": 6.69966506066619e-05, + "loss": 1.7994, + "step": 13291 + }, + { + "epoch": 4.0798035604665435, + "grad_norm": 0.2780163288116455, + "learning_rate": 6.699197596139587e-05, + "loss": 1.7972, + "step": 13292 + }, + { + "epoch": 4.080110497237569, + "grad_norm": 0.2554188668727875, + "learning_rate": 6.698730114820517e-05, + "loss": 1.7928, + "step": 13293 + }, + { + "epoch": 4.080417434008594, + "grad_norm": 0.2471141666173935, + "learning_rate": 6.698262616713602e-05, + "loss": 1.7948, + "step": 13294 + }, + { + "epoch": 4.0807243707796195, + "grad_norm": 0.2556581199169159, + "learning_rate": 6.697795101823461e-05, + "loss": 1.7942, + "step": 13295 + }, + { + "epoch": 4.081031307550645, + "grad_norm": 0.24462421238422394, + "learning_rate": 6.697327570154712e-05, + "loss": 1.7336, + "step": 13296 + }, + { + "epoch": 4.08133824432167, + "grad_norm": 0.22378689050674438, + "learning_rate": 6.696860021711978e-05, + "loss": 1.7703, + "step": 13297 + }, + { + "epoch": 4.081645181092695, + "grad_norm": 0.23949933052062988, + "learning_rate": 6.69639245649988e-05, + "loss": 1.7651, + "step": 13298 + }, + { + "epoch": 4.08195211786372, + "grad_norm": 0.27751216292381287, + "learning_rate": 6.695924874523035e-05, + "loss": 1.7866, + "step": 13299 + }, + { + "epoch": 4.082259054634745, + "grad_norm": 0.22700226306915283, + "learning_rate": 6.695457275786068e-05, + "loss": 1.79, + "step": 13300 + }, + { + "epoch": 4.082565991405771, + "grad_norm": 0.2138090431690216, + "learning_rate": 6.694989660293598e-05, + "loss": 1.7882, + "step": 13301 + }, + { + "epoch": 4.082872928176796, + "grad_norm": 
0.2963469326496124, + "learning_rate": 6.694522028050246e-05, + "loss": 1.8779, + "step": 13302 + }, + { + "epoch": 4.08317986494782, + "grad_norm": 0.31833669543266296, + "learning_rate": 6.694054379060634e-05, + "loss": 1.7923, + "step": 13303 + }, + { + "epoch": 4.083486801718846, + "grad_norm": 0.27751585841178894, + "learning_rate": 6.693586713329385e-05, + "loss": 1.7557, + "step": 13304 + }, + { + "epoch": 4.083793738489871, + "grad_norm": 0.23790816962718964, + "learning_rate": 6.69311903086112e-05, + "loss": 1.7587, + "step": 13305 + }, + { + "epoch": 4.084100675260896, + "grad_norm": 0.24153777956962585, + "learning_rate": 6.692651331660458e-05, + "loss": 1.7573, + "step": 13306 + }, + { + "epoch": 4.084407612031922, + "grad_norm": 0.26607179641723633, + "learning_rate": 6.692183615732025e-05, + "loss": 1.7823, + "step": 13307 + }, + { + "epoch": 4.084714548802946, + "grad_norm": 0.26670268177986145, + "learning_rate": 6.691715883080442e-05, + "loss": 1.784, + "step": 13308 + }, + { + "epoch": 4.0850214855739715, + "grad_norm": 0.25980666279792786, + "learning_rate": 6.69124813371033e-05, + "loss": 1.797, + "step": 13309 + }, + { + "epoch": 4.085328422344997, + "grad_norm": 0.2805597484111786, + "learning_rate": 6.690780367626314e-05, + "loss": 1.8298, + "step": 13310 + }, + { + "epoch": 4.085635359116022, + "grad_norm": 0.27198413014411926, + "learning_rate": 6.690312584833012e-05, + "loss": 1.8104, + "step": 13311 + }, + { + "epoch": 4.0859422958870475, + "grad_norm": 0.2619116008281708, + "learning_rate": 6.689844785335054e-05, + "loss": 1.771, + "step": 13312 + }, + { + "epoch": 4.086249232658073, + "grad_norm": 0.22647863626480103, + "learning_rate": 6.689376969137057e-05, + "loss": 1.8114, + "step": 13313 + }, + { + "epoch": 4.086556169429097, + "grad_norm": 1.469475507736206, + "learning_rate": 6.68890913624365e-05, + "loss": 1.8796, + "step": 13314 + }, + { + "epoch": 4.086863106200123, + "grad_norm": 0.4577515423297882, + "learning_rate": 
6.68844128665945e-05, + "loss": 1.716, + "step": 13315 + }, + { + "epoch": 4.087170042971148, + "grad_norm": 0.5830543637275696, + "learning_rate": 6.687973420389085e-05, + "loss": 1.7692, + "step": 13316 + }, + { + "epoch": 4.087476979742173, + "grad_norm": 0.4404197037220001, + "learning_rate": 6.687505537437178e-05, + "loss": 1.7909, + "step": 13317 + }, + { + "epoch": 4.087783916513199, + "grad_norm": 0.31379908323287964, + "learning_rate": 6.68703763780835e-05, + "loss": 1.7957, + "step": 13318 + }, + { + "epoch": 4.088090853284223, + "grad_norm": 0.49588730931282043, + "learning_rate": 6.686569721507229e-05, + "loss": 1.7126, + "step": 13319 + }, + { + "epoch": 4.088397790055248, + "grad_norm": 0.3690234124660492, + "learning_rate": 6.686101788538437e-05, + "loss": 1.8233, + "step": 13320 + }, + { + "epoch": 4.088704726826274, + "grad_norm": 0.337310254573822, + "learning_rate": 6.685633838906598e-05, + "loss": 1.6886, + "step": 13321 + }, + { + "epoch": 4.089011663597299, + "grad_norm": 0.5164821147918701, + "learning_rate": 6.685165872616337e-05, + "loss": 1.7967, + "step": 13322 + }, + { + "epoch": 4.089318600368324, + "grad_norm": 0.36501309275627136, + "learning_rate": 6.68469788967228e-05, + "loss": 1.755, + "step": 13323 + }, + { + "epoch": 4.08962553713935, + "grad_norm": 0.35017216205596924, + "learning_rate": 6.684229890079052e-05, + "loss": 1.7595, + "step": 13324 + }, + { + "epoch": 4.089932473910374, + "grad_norm": 0.5622650980949402, + "learning_rate": 6.683761873841277e-05, + "loss": 1.7841, + "step": 13325 + }, + { + "epoch": 4.0902394106813995, + "grad_norm": 0.47010260820388794, + "learning_rate": 6.683293840963578e-05, + "loss": 1.7537, + "step": 13326 + }, + { + "epoch": 4.090546347452425, + "grad_norm": 0.25515374541282654, + "learning_rate": 6.682825791450584e-05, + "loss": 1.7692, + "step": 13327 + }, + { + "epoch": 4.09085328422345, + "grad_norm": 0.5063003897666931, + "learning_rate": 6.682357725306919e-05, + "loss": 1.7454, + "step": 
13328 + }, + { + "epoch": 4.0911602209944755, + "grad_norm": 0.4197622835636139, + "learning_rate": 6.681889642537209e-05, + "loss": 1.7792, + "step": 13329 + }, + { + "epoch": 4.0914671577655, + "grad_norm": 0.24038295447826385, + "learning_rate": 6.68142154314608e-05, + "loss": 1.7631, + "step": 13330 + }, + { + "epoch": 4.091774094536525, + "grad_norm": 0.42108532786369324, + "learning_rate": 6.680953427138159e-05, + "loss": 1.7784, + "step": 13331 + }, + { + "epoch": 4.092081031307551, + "grad_norm": 0.33729633688926697, + "learning_rate": 6.68048529451807e-05, + "loss": 1.8057, + "step": 13332 + }, + { + "epoch": 4.092387968078576, + "grad_norm": 0.31847241520881653, + "learning_rate": 6.68001714529044e-05, + "loss": 1.7375, + "step": 13333 + }, + { + "epoch": 4.092694904849601, + "grad_norm": 0.45276644825935364, + "learning_rate": 6.679548979459896e-05, + "loss": 1.7507, + "step": 13334 + }, + { + "epoch": 4.093001841620626, + "grad_norm": 0.3781665861606598, + "learning_rate": 6.679080797031065e-05, + "loss": 1.7718, + "step": 13335 + }, + { + "epoch": 4.093308778391651, + "grad_norm": 0.25868359208106995, + "learning_rate": 6.678612598008573e-05, + "loss": 1.8105, + "step": 13336 + }, + { + "epoch": 4.093615715162676, + "grad_norm": 0.32834702730178833, + "learning_rate": 6.678144382397048e-05, + "loss": 1.7883, + "step": 13337 + }, + { + "epoch": 4.093922651933702, + "grad_norm": 0.2830568253993988, + "learning_rate": 6.677676150201116e-05, + "loss": 1.7994, + "step": 13338 + }, + { + "epoch": 4.094229588704727, + "grad_norm": 0.219541534781456, + "learning_rate": 6.677207901425405e-05, + "loss": 1.7344, + "step": 13339 + }, + { + "epoch": 4.094536525475752, + "grad_norm": 0.2557326555252075, + "learning_rate": 6.676739636074542e-05, + "loss": 1.7734, + "step": 13340 + }, + { + "epoch": 4.094843462246777, + "grad_norm": 0.2741365432739258, + "learning_rate": 6.676271354153156e-05, + "loss": 1.7912, + "step": 13341 + }, + { + "epoch": 4.095150399017802, + 
"grad_norm": 0.31258970499038696, + "learning_rate": 6.675803055665874e-05, + "loss": 1.7798, + "step": 13342 + }, + { + "epoch": 4.0954573357888275, + "grad_norm": 0.30181947350502014, + "learning_rate": 6.675334740617322e-05, + "loss": 1.7746, + "step": 13343 + }, + { + "epoch": 4.095764272559853, + "grad_norm": 0.3000102937221527, + "learning_rate": 6.674866409012133e-05, + "loss": 1.7842, + "step": 13344 + }, + { + "epoch": 4.096071209330878, + "grad_norm": 0.22871005535125732, + "learning_rate": 6.674398060854931e-05, + "loss": 1.7473, + "step": 13345 + }, + { + "epoch": 4.096378146101903, + "grad_norm": 0.2700810432434082, + "learning_rate": 6.673929696150346e-05, + "loss": 1.7862, + "step": 13346 + }, + { + "epoch": 4.096685082872928, + "grad_norm": 0.27537551522254944, + "learning_rate": 6.673461314903007e-05, + "loss": 1.7843, + "step": 13347 + }, + { + "epoch": 4.096992019643953, + "grad_norm": 0.23700574040412903, + "learning_rate": 6.672992917117542e-05, + "loss": 1.765, + "step": 13348 + }, + { + "epoch": 4.097298956414979, + "grad_norm": 0.23331589996814728, + "learning_rate": 6.672524502798583e-05, + "loss": 1.7894, + "step": 13349 + }, + { + "epoch": 4.097605893186004, + "grad_norm": 0.28591978549957275, + "learning_rate": 6.672056071950753e-05, + "loss": 1.7736, + "step": 13350 + }, + { + "epoch": 4.097912829957028, + "grad_norm": 0.3000452518463135, + "learning_rate": 6.671587624578685e-05, + "loss": 1.7635, + "step": 13351 + }, + { + "epoch": 4.098219766728054, + "grad_norm": 0.21877998113632202, + "learning_rate": 6.67111916068701e-05, + "loss": 1.7225, + "step": 13352 + }, + { + "epoch": 4.098526703499079, + "grad_norm": 0.2598817050457001, + "learning_rate": 6.670650680280358e-05, + "loss": 1.6874, + "step": 13353 + }, + { + "epoch": 4.098833640270104, + "grad_norm": 0.3063203692436218, + "learning_rate": 6.670182183363353e-05, + "loss": 1.7821, + "step": 13354 + }, + { + "epoch": 4.09914057704113, + "grad_norm": 0.2328508347272873, + 
"learning_rate": 6.66971366994063e-05, + "loss": 1.788, + "step": 13355 + }, + { + "epoch": 4.099447513812155, + "grad_norm": 0.33936765789985657, + "learning_rate": 6.669245140016817e-05, + "loss": 1.8159, + "step": 13356 + }, + { + "epoch": 4.0997544505831796, + "grad_norm": 0.27464553713798523, + "learning_rate": 6.668776593596546e-05, + "loss": 1.7371, + "step": 13357 + }, + { + "epoch": 4.100061387354205, + "grad_norm": 0.24255812168121338, + "learning_rate": 6.668308030684447e-05, + "loss": 1.7993, + "step": 13358 + }, + { + "epoch": 4.10036832412523, + "grad_norm": 0.27203628420829773, + "learning_rate": 6.667839451285149e-05, + "loss": 1.8253, + "step": 13359 + }, + { + "epoch": 4.100675260896256, + "grad_norm": 0.2503862679004669, + "learning_rate": 6.667370855403286e-05, + "loss": 1.7927, + "step": 13360 + }, + { + "epoch": 4.100982197667281, + "grad_norm": 0.2616904377937317, + "learning_rate": 6.666902243043486e-05, + "loss": 1.8226, + "step": 13361 + }, + { + "epoch": 4.101289134438305, + "grad_norm": 0.26707521080970764, + "learning_rate": 6.666433614210379e-05, + "loss": 1.8485, + "step": 13362 + }, + { + "epoch": 4.101596071209331, + "grad_norm": 0.2427528202533722, + "learning_rate": 6.6659649689086e-05, + "loss": 1.7387, + "step": 13363 + }, + { + "epoch": 4.101903007980356, + "grad_norm": 0.2319549173116684, + "learning_rate": 6.66549630714278e-05, + "loss": 1.7396, + "step": 13364 + }, + { + "epoch": 4.102209944751381, + "grad_norm": 0.2248002141714096, + "learning_rate": 6.665027628917548e-05, + "loss": 1.7817, + "step": 13365 + }, + { + "epoch": 4.102516881522407, + "grad_norm": 0.21929535269737244, + "learning_rate": 6.664558934237538e-05, + "loss": 1.7478, + "step": 13366 + }, + { + "epoch": 4.102823818293431, + "grad_norm": 0.21144583821296692, + "learning_rate": 6.66409022310738e-05, + "loss": 1.7602, + "step": 13367 + }, + { + "epoch": 4.1031307550644565, + "grad_norm": 0.21984660625457764, + "learning_rate": 6.663621495531707e-05, + 
"loss": 1.7541, + "step": 13368 + }, + { + "epoch": 4.103437691835482, + "grad_norm": 0.2075357735157013, + "learning_rate": 6.663152751515152e-05, + "loss": 1.7362, + "step": 13369 + }, + { + "epoch": 4.103744628606507, + "grad_norm": 0.23316961526870728, + "learning_rate": 6.662683991062347e-05, + "loss": 1.8273, + "step": 13370 + }, + { + "epoch": 4.1040515653775325, + "grad_norm": 0.23142337799072266, + "learning_rate": 6.662215214177922e-05, + "loss": 1.7543, + "step": 13371 + }, + { + "epoch": 4.104358502148558, + "grad_norm": 0.24335260689258575, + "learning_rate": 6.661746420866515e-05, + "loss": 1.8328, + "step": 13372 + }, + { + "epoch": 4.104665438919582, + "grad_norm": 0.2440192997455597, + "learning_rate": 6.661277611132753e-05, + "loss": 1.8114, + "step": 13373 + }, + { + "epoch": 4.104972375690608, + "grad_norm": 0.252808541059494, + "learning_rate": 6.660808784981273e-05, + "loss": 1.8556, + "step": 13374 + }, + { + "epoch": 4.105279312461633, + "grad_norm": 0.24564477801322937, + "learning_rate": 6.660339942416708e-05, + "loss": 1.8231, + "step": 13375 + }, + { + "epoch": 4.105586249232658, + "grad_norm": 0.2371874898672104, + "learning_rate": 6.65987108344369e-05, + "loss": 1.7763, + "step": 13376 + }, + { + "epoch": 4.105893186003684, + "grad_norm": 0.22882802784442902, + "learning_rate": 6.659402208066854e-05, + "loss": 1.7388, + "step": 13377 + }, + { + "epoch": 4.106200122774708, + "grad_norm": 0.24857540428638458, + "learning_rate": 6.658933316290832e-05, + "loss": 1.7735, + "step": 13378 + }, + { + "epoch": 4.106507059545733, + "grad_norm": 0.22574029862880707, + "learning_rate": 6.658464408120257e-05, + "loss": 1.7403, + "step": 13379 + }, + { + "epoch": 4.106813996316759, + "grad_norm": 0.24944272637367249, + "learning_rate": 6.657995483559767e-05, + "loss": 1.7827, + "step": 13380 + }, + { + "epoch": 4.107120933087784, + "grad_norm": 0.27386224269866943, + "learning_rate": 6.657526542613992e-05, + "loss": 1.7673, + "step": 13381 + }, + { 
+ "epoch": 4.107427869858809, + "grad_norm": 0.29222097992897034, + "learning_rate": 6.65705758528757e-05, + "loss": 1.7958, + "step": 13382 + }, + { + "epoch": 4.107734806629834, + "grad_norm": 0.2471150904893875, + "learning_rate": 6.656588611585133e-05, + "loss": 1.7706, + "step": 13383 + }, + { + "epoch": 4.108041743400859, + "grad_norm": 0.289316862821579, + "learning_rate": 6.656119621511317e-05, + "loss": 1.7828, + "step": 13384 + }, + { + "epoch": 4.1083486801718845, + "grad_norm": 0.36710497736930847, + "learning_rate": 6.655650615070756e-05, + "loss": 1.712, + "step": 13385 + }, + { + "epoch": 4.10865561694291, + "grad_norm": 0.2999880611896515, + "learning_rate": 6.655181592268084e-05, + "loss": 1.7711, + "step": 13386 + }, + { + "epoch": 4.108962553713935, + "grad_norm": 0.332011342048645, + "learning_rate": 6.654712553107939e-05, + "loss": 1.907, + "step": 13387 + }, + { + "epoch": 4.1092694904849605, + "grad_norm": 0.43125995993614197, + "learning_rate": 6.654243497594953e-05, + "loss": 1.7819, + "step": 13388 + }, + { + "epoch": 4.109576427255985, + "grad_norm": 0.33719149231910706, + "learning_rate": 6.653774425733765e-05, + "loss": 1.797, + "step": 13389 + }, + { + "epoch": 4.10988336402701, + "grad_norm": 0.23091599345207214, + "learning_rate": 6.653305337529006e-05, + "loss": 1.7384, + "step": 13390 + }, + { + "epoch": 4.110190300798036, + "grad_norm": 0.4283982515335083, + "learning_rate": 6.652836232985317e-05, + "loss": 1.8284, + "step": 13391 + }, + { + "epoch": 4.110497237569061, + "grad_norm": 0.43575870990753174, + "learning_rate": 6.652367112107332e-05, + "loss": 1.7235, + "step": 13392 + }, + { + "epoch": 4.110804174340086, + "grad_norm": 0.246877059340477, + "learning_rate": 6.651897974899685e-05, + "loss": 1.7174, + "step": 13393 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.36063629388809204, + "learning_rate": 6.651428821367015e-05, + "loss": 1.8064, + "step": 13394 + }, + { + "epoch": 4.111418047882136, + "grad_norm": 
0.4454420804977417, + "learning_rate": 6.650959651513957e-05, + "loss": 1.7575, + "step": 13395 + }, + { + "epoch": 4.111724984653161, + "grad_norm": 0.2788856327533722, + "learning_rate": 6.650490465345149e-05, + "loss": 1.7696, + "step": 13396 + }, + { + "epoch": 4.112031921424187, + "grad_norm": 0.40281879901885986, + "learning_rate": 6.650021262865225e-05, + "loss": 1.8368, + "step": 13397 + }, + { + "epoch": 4.112338858195212, + "grad_norm": 0.5151103138923645, + "learning_rate": 6.649552044078825e-05, + "loss": 1.8224, + "step": 13398 + }, + { + "epoch": 4.112645794966237, + "grad_norm": 0.29390639066696167, + "learning_rate": 6.649082808990586e-05, + "loss": 1.7846, + "step": 13399 + }, + { + "epoch": 4.112952731737262, + "grad_norm": 0.3061942458152771, + "learning_rate": 6.648613557605142e-05, + "loss": 1.7954, + "step": 13400 + }, + { + "epoch": 4.113259668508287, + "grad_norm": 0.47628748416900635, + "learning_rate": 6.648144289927132e-05, + "loss": 1.7782, + "step": 13401 + }, + { + "epoch": 4.1135666052793125, + "grad_norm": 0.4299588203430176, + "learning_rate": 6.647675005961197e-05, + "loss": 1.7459, + "step": 13402 + }, + { + "epoch": 4.113873542050338, + "grad_norm": 0.24556589126586914, + "learning_rate": 6.64720570571197e-05, + "loss": 1.753, + "step": 13403 + }, + { + "epoch": 4.114180478821363, + "grad_norm": 0.29620522260665894, + "learning_rate": 6.646736389184092e-05, + "loss": 1.773, + "step": 13404 + }, + { + "epoch": 4.114487415592388, + "grad_norm": 0.37710070610046387, + "learning_rate": 6.646267056382199e-05, + "loss": 1.8389, + "step": 13405 + }, + { + "epoch": 4.114794352363413, + "grad_norm": 0.2562984824180603, + "learning_rate": 6.64579770731093e-05, + "loss": 1.7905, + "step": 13406 + }, + { + "epoch": 4.115101289134438, + "grad_norm": 0.3999946713447571, + "learning_rate": 6.645328341974924e-05, + "loss": 1.7734, + "step": 13407 + }, + { + "epoch": 4.115408225905464, + "grad_norm": 0.36087217926979065, + "learning_rate": 
6.644858960378817e-05, + "loss": 1.801, + "step": 13408 + }, + { + "epoch": 4.115715162676489, + "grad_norm": 0.2520254850387573, + "learning_rate": 6.644389562527251e-05, + "loss": 1.7394, + "step": 13409 + }, + { + "epoch": 4.116022099447513, + "grad_norm": 0.4321835935115814, + "learning_rate": 6.643920148424864e-05, + "loss": 1.8091, + "step": 13410 + }, + { + "epoch": 4.116329036218539, + "grad_norm": 0.40900173783302307, + "learning_rate": 6.643450718076294e-05, + "loss": 1.8198, + "step": 13411 + }, + { + "epoch": 4.116635972989564, + "grad_norm": 0.23693956434726715, + "learning_rate": 6.642981271486182e-05, + "loss": 1.6807, + "step": 13412 + }, + { + "epoch": 4.116942909760589, + "grad_norm": 0.33526891469955444, + "learning_rate": 6.642511808659164e-05, + "loss": 1.8673, + "step": 13413 + }, + { + "epoch": 4.117249846531615, + "grad_norm": 0.4037325382232666, + "learning_rate": 6.642042329599883e-05, + "loss": 1.743, + "step": 13414 + }, + { + "epoch": 4.11755678330264, + "grad_norm": 0.25629740953445435, + "learning_rate": 6.641572834312975e-05, + "loss": 1.6904, + "step": 13415 + }, + { + "epoch": 4.1178637200736645, + "grad_norm": 0.29203253984451294, + "learning_rate": 6.641103322803087e-05, + "loss": 1.7811, + "step": 13416 + }, + { + "epoch": 4.11817065684469, + "grad_norm": 0.423926442861557, + "learning_rate": 6.64063379507485e-05, + "loss": 1.7341, + "step": 13417 + }, + { + "epoch": 4.118477593615715, + "grad_norm": 0.29561251401901245, + "learning_rate": 6.64016425113291e-05, + "loss": 1.7915, + "step": 13418 + }, + { + "epoch": 4.1187845303867405, + "grad_norm": 0.2536832094192505, + "learning_rate": 6.639694690981903e-05, + "loss": 1.7628, + "step": 13419 + }, + { + "epoch": 4.119091467157766, + "grad_norm": 0.2931392192840576, + "learning_rate": 6.639225114626475e-05, + "loss": 1.7877, + "step": 13420 + }, + { + "epoch": 4.11939840392879, + "grad_norm": 0.2219499796628952, + "learning_rate": 6.638755522071263e-05, + "loss": 1.7183, + 
"step": 13421 + }, + { + "epoch": 4.119705340699816, + "grad_norm": 0.2951931953430176, + "learning_rate": 6.638285913320908e-05, + "loss": 1.7983, + "step": 13422 + }, + { + "epoch": 4.120012277470841, + "grad_norm": 0.3495960533618927, + "learning_rate": 6.63781628838005e-05, + "loss": 1.7531, + "step": 13423 + }, + { + "epoch": 4.120319214241866, + "grad_norm": 0.2389262616634369, + "learning_rate": 6.637346647253333e-05, + "loss": 1.7454, + "step": 13424 + }, + { + "epoch": 4.120626151012892, + "grad_norm": 0.28729167580604553, + "learning_rate": 6.636876989945395e-05, + "loss": 1.8105, + "step": 13425 + }, + { + "epoch": 4.120933087783916, + "grad_norm": 0.2620082199573517, + "learning_rate": 6.636407316460882e-05, + "loss": 1.7948, + "step": 13426 + }, + { + "epoch": 4.121240024554941, + "grad_norm": 0.2694189250469208, + "learning_rate": 6.635937626804432e-05, + "loss": 1.809, + "step": 13427 + }, + { + "epoch": 4.121546961325967, + "grad_norm": 0.2660866379737854, + "learning_rate": 6.635467920980687e-05, + "loss": 1.7431, + "step": 13428 + }, + { + "epoch": 4.121853898096992, + "grad_norm": 0.2579907774925232, + "learning_rate": 6.634998198994289e-05, + "loss": 1.7941, + "step": 13429 + }, + { + "epoch": 4.122160834868017, + "grad_norm": 0.28349989652633667, + "learning_rate": 6.634528460849881e-05, + "loss": 1.8142, + "step": 13430 + }, + { + "epoch": 4.122467771639043, + "grad_norm": 0.28716522455215454, + "learning_rate": 6.634058706552104e-05, + "loss": 1.7496, + "step": 13431 + }, + { + "epoch": 4.122774708410067, + "grad_norm": 0.23228077590465546, + "learning_rate": 6.633588936105601e-05, + "loss": 1.7399, + "step": 13432 + }, + { + "epoch": 4.1230816451810925, + "grad_norm": 0.3649841248989105, + "learning_rate": 6.633119149515017e-05, + "loss": 1.7696, + "step": 13433 + }, + { + "epoch": 4.123388581952118, + "grad_norm": 0.2757830321788788, + "learning_rate": 6.632649346784992e-05, + "loss": 1.8329, + "step": 13434 + }, + { + "epoch": 
4.123695518723143, + "grad_norm": 0.28163692355155945, + "learning_rate": 6.632179527920167e-05, + "loss": 1.7761, + "step": 13435 + }, + { + "epoch": 4.1240024554941686, + "grad_norm": 0.3453187048435211, + "learning_rate": 6.631709692925188e-05, + "loss": 1.7843, + "step": 13436 + }, + { + "epoch": 4.124309392265193, + "grad_norm": 0.2792697250843048, + "learning_rate": 6.631239841804698e-05, + "loss": 1.7889, + "step": 13437 + }, + { + "epoch": 4.124616329036218, + "grad_norm": 0.21881693601608276, + "learning_rate": 6.630769974563339e-05, + "loss": 1.8015, + "step": 13438 + }, + { + "epoch": 4.124923265807244, + "grad_norm": 0.4464910328388214, + "learning_rate": 6.630300091205756e-05, + "loss": 1.7851, + "step": 13439 + }, + { + "epoch": 4.125230202578269, + "grad_norm": 0.40191107988357544, + "learning_rate": 6.629830191736591e-05, + "loss": 1.8608, + "step": 13440 + }, + { + "epoch": 4.125537139349294, + "grad_norm": 0.2809060513973236, + "learning_rate": 6.62936027616049e-05, + "loss": 1.7374, + "step": 13441 + }, + { + "epoch": 4.12584407612032, + "grad_norm": 0.24980643391609192, + "learning_rate": 6.628890344482095e-05, + "loss": 1.8152, + "step": 13442 + }, + { + "epoch": 4.126151012891344, + "grad_norm": 0.24538342654705048, + "learning_rate": 6.62842039670605e-05, + "loss": 1.7687, + "step": 13443 + }, + { + "epoch": 4.1264579496623695, + "grad_norm": 0.24684634804725647, + "learning_rate": 6.627950432837002e-05, + "loss": 1.787, + "step": 13444 + }, + { + "epoch": 4.126764886433395, + "grad_norm": 0.22724607586860657, + "learning_rate": 6.627480452879593e-05, + "loss": 1.7871, + "step": 13445 + }, + { + "epoch": 4.12707182320442, + "grad_norm": 0.24724406003952026, + "learning_rate": 6.627010456838469e-05, + "loss": 1.7524, + "step": 13446 + }, + { + "epoch": 4.1273787599754455, + "grad_norm": 0.24219536781311035, + "learning_rate": 6.626540444718274e-05, + "loss": 1.7754, + "step": 13447 + }, + { + "epoch": 4.12768569674647, + "grad_norm": 
0.24857915937900543, + "learning_rate": 6.626070416523652e-05, + "loss": 1.7839, + "step": 13448 + }, + { + "epoch": 4.127992633517495, + "grad_norm": 0.2639105021953583, + "learning_rate": 6.625600372259248e-05, + "loss": 1.7546, + "step": 13449 + }, + { + "epoch": 4.128299570288521, + "grad_norm": 0.23598137497901917, + "learning_rate": 6.62513031192971e-05, + "loss": 1.7957, + "step": 13450 + }, + { + "epoch": 4.128606507059546, + "grad_norm": 0.3038909137248993, + "learning_rate": 6.624660235539682e-05, + "loss": 1.8117, + "step": 13451 + }, + { + "epoch": 4.128913443830571, + "grad_norm": 0.27671241760253906, + "learning_rate": 6.624190143093809e-05, + "loss": 1.729, + "step": 13452 + }, + { + "epoch": 4.129220380601596, + "grad_norm": 0.24638360738754272, + "learning_rate": 6.623720034596735e-05, + "loss": 1.7414, + "step": 13453 + }, + { + "epoch": 4.129527317372621, + "grad_norm": 0.24073924124240875, + "learning_rate": 6.623249910053111e-05, + "loss": 1.8046, + "step": 13454 + }, + { + "epoch": 4.129834254143646, + "grad_norm": 0.29734376072883606, + "learning_rate": 6.622779769467578e-05, + "loss": 1.8336, + "step": 13455 + }, + { + "epoch": 4.130141190914672, + "grad_norm": 0.23182810842990875, + "learning_rate": 6.622309612844785e-05, + "loss": 1.7742, + "step": 13456 + }, + { + "epoch": 4.130448127685697, + "grad_norm": 0.2179390788078308, + "learning_rate": 6.621839440189378e-05, + "loss": 1.7656, + "step": 13457 + }, + { + "epoch": 4.1307550644567215, + "grad_norm": 0.21389013528823853, + "learning_rate": 6.621369251506002e-05, + "loss": 1.7504, + "step": 13458 + }, + { + "epoch": 4.131062001227747, + "grad_norm": 0.22306203842163086, + "learning_rate": 6.620899046799305e-05, + "loss": 1.7573, + "step": 13459 + }, + { + "epoch": 4.131368937998772, + "grad_norm": 0.2699708938598633, + "learning_rate": 6.620428826073934e-05, + "loss": 1.7419, + "step": 13460 + }, + { + "epoch": 4.1316758747697975, + "grad_norm": 0.34087565541267395, + "learning_rate": 
6.619958589334534e-05, + "loss": 1.7545, + "step": 13461 + }, + { + "epoch": 4.131982811540823, + "grad_norm": 0.2934977412223816, + "learning_rate": 6.619488336585755e-05, + "loss": 1.7611, + "step": 13462 + }, + { + "epoch": 4.132289748311848, + "grad_norm": 0.22545567154884338, + "learning_rate": 6.619018067832243e-05, + "loss": 1.7562, + "step": 13463 + }, + { + "epoch": 4.132596685082873, + "grad_norm": 0.23334743082523346, + "learning_rate": 6.618547783078647e-05, + "loss": 1.7784, + "step": 13464 + }, + { + "epoch": 4.132903621853898, + "grad_norm": 0.22466403245925903, + "learning_rate": 6.618077482329612e-05, + "loss": 1.7277, + "step": 13465 + }, + { + "epoch": 4.133210558624923, + "grad_norm": 0.23504197597503662, + "learning_rate": 6.617607165589785e-05, + "loss": 1.7983, + "step": 13466 + }, + { + "epoch": 4.133517495395949, + "grad_norm": 0.2500833570957184, + "learning_rate": 6.617136832863819e-05, + "loss": 1.7826, + "step": 13467 + }, + { + "epoch": 4.133824432166974, + "grad_norm": 0.22398658096790314, + "learning_rate": 6.616666484156357e-05, + "loss": 1.7281, + "step": 13468 + }, + { + "epoch": 4.134131368937998, + "grad_norm": 0.2537873089313507, + "learning_rate": 6.616196119472052e-05, + "loss": 1.7598, + "step": 13469 + }, + { + "epoch": 4.134438305709024, + "grad_norm": 0.26881173253059387, + "learning_rate": 6.615725738815546e-05, + "loss": 1.8161, + "step": 13470 + }, + { + "epoch": 4.134745242480049, + "grad_norm": 0.3311346471309662, + "learning_rate": 6.615255342191492e-05, + "loss": 1.7954, + "step": 13471 + }, + { + "epoch": 4.135052179251074, + "grad_norm": 0.2562953233718872, + "learning_rate": 6.614784929604539e-05, + "loss": 1.7284, + "step": 13472 + }, + { + "epoch": 4.1353591160221, + "grad_norm": 0.2563154101371765, + "learning_rate": 6.614314501059334e-05, + "loss": 1.7995, + "step": 13473 + }, + { + "epoch": 4.135666052793125, + "grad_norm": 0.24861161410808563, + "learning_rate": 6.613844056560527e-05, + "loss": 1.7589, + 
"step": 13474 + }, + { + "epoch": 4.1359729895641495, + "grad_norm": 0.23815487325191498, + "learning_rate": 6.613373596112769e-05, + "loss": 1.6906, + "step": 13475 + }, + { + "epoch": 4.136279926335175, + "grad_norm": 0.25394049286842346, + "learning_rate": 6.612903119720705e-05, + "loss": 1.781, + "step": 13476 + }, + { + "epoch": 4.1365868631062, + "grad_norm": 0.24501466751098633, + "learning_rate": 6.612432627388988e-05, + "loss": 1.797, + "step": 13477 + }, + { + "epoch": 4.1368937998772255, + "grad_norm": 0.24909707903862, + "learning_rate": 6.611962119122267e-05, + "loss": 1.7643, + "step": 13478 + }, + { + "epoch": 4.137200736648251, + "grad_norm": 0.24954476952552795, + "learning_rate": 6.611491594925192e-05, + "loss": 1.8219, + "step": 13479 + }, + { + "epoch": 4.137507673419275, + "grad_norm": 0.30572372674942017, + "learning_rate": 6.611021054802411e-05, + "loss": 1.8039, + "step": 13480 + }, + { + "epoch": 4.137814610190301, + "grad_norm": 0.27466365694999695, + "learning_rate": 6.610550498758577e-05, + "loss": 1.6945, + "step": 13481 + }, + { + "epoch": 4.138121546961326, + "grad_norm": 0.2614271640777588, + "learning_rate": 6.610079926798339e-05, + "loss": 1.8648, + "step": 13482 + }, + { + "epoch": 4.138428483732351, + "grad_norm": 0.23645827174186707, + "learning_rate": 6.609609338926346e-05, + "loss": 1.7424, + "step": 13483 + }, + { + "epoch": 4.138735420503377, + "grad_norm": 0.24473626911640167, + "learning_rate": 6.609138735147253e-05, + "loss": 1.8036, + "step": 13484 + }, + { + "epoch": 4.139042357274401, + "grad_norm": 0.2472417950630188, + "learning_rate": 6.608668115465706e-05, + "loss": 1.794, + "step": 13485 + }, + { + "epoch": 4.139349294045426, + "grad_norm": 0.25330284237861633, + "learning_rate": 6.608197479886358e-05, + "loss": 1.8052, + "step": 13486 + }, + { + "epoch": 4.139656230816452, + "grad_norm": 0.24279309809207916, + "learning_rate": 6.60772682841386e-05, + "loss": 1.7375, + "step": 13487 + }, + { + "epoch": 
4.139963167587477, + "grad_norm": 0.22319461405277252, + "learning_rate": 6.607256161052862e-05, + "loss": 1.7696, + "step": 13488 + }, + { + "epoch": 4.140270104358502, + "grad_norm": 0.25261563062667847, + "learning_rate": 6.606785477808017e-05, + "loss": 1.7646, + "step": 13489 + }, + { + "epoch": 4.140577041129528, + "grad_norm": 0.3127744793891907, + "learning_rate": 6.606314778683977e-05, + "loss": 1.7899, + "step": 13490 + }, + { + "epoch": 4.140883977900552, + "grad_norm": 0.3550816774368286, + "learning_rate": 6.605844063685392e-05, + "loss": 1.7971, + "step": 13491 + }, + { + "epoch": 4.1411909146715775, + "grad_norm": 0.20977813005447388, + "learning_rate": 6.605373332816916e-05, + "loss": 1.7416, + "step": 13492 + }, + { + "epoch": 4.141497851442603, + "grad_norm": 0.26593849062919617, + "learning_rate": 6.6049025860832e-05, + "loss": 1.7586, + "step": 13493 + }, + { + "epoch": 4.141804788213628, + "grad_norm": 0.2452937364578247, + "learning_rate": 6.604431823488893e-05, + "loss": 1.757, + "step": 13494 + }, + { + "epoch": 4.1421117249846535, + "grad_norm": 0.21029168367385864, + "learning_rate": 6.603961045038652e-05, + "loss": 1.7665, + "step": 13495 + }, + { + "epoch": 4.142418661755678, + "grad_norm": 0.2396312952041626, + "learning_rate": 6.603490250737128e-05, + "loss": 1.7609, + "step": 13496 + }, + { + "epoch": 4.142725598526703, + "grad_norm": 0.23266808688640594, + "learning_rate": 6.603019440588975e-05, + "loss": 1.7893, + "step": 13497 + }, + { + "epoch": 4.143032535297729, + "grad_norm": 0.25235217809677124, + "learning_rate": 6.602548614598842e-05, + "loss": 1.7465, + "step": 13498 + }, + { + "epoch": 4.143339472068754, + "grad_norm": 0.22944024205207825, + "learning_rate": 6.602077772771386e-05, + "loss": 1.7052, + "step": 13499 + }, + { + "epoch": 4.143646408839779, + "grad_norm": 0.2116660475730896, + "learning_rate": 6.601606915111257e-05, + "loss": 1.7042, + "step": 13500 + }, + { + "epoch": 4.143953345610804, + "grad_norm": 
0.21777184307575226, + "learning_rate": 6.601136041623111e-05, + "loss": 1.7938, + "step": 13501 + }, + { + "epoch": 4.144260282381829, + "grad_norm": 0.23663075268268585, + "learning_rate": 6.600665152311601e-05, + "loss": 1.7475, + "step": 13502 + }, + { + "epoch": 4.144567219152854, + "grad_norm": 0.20644642412662506, + "learning_rate": 6.600194247181377e-05, + "loss": 1.7992, + "step": 13503 + }, + { + "epoch": 4.14487415592388, + "grad_norm": 0.21479010581970215, + "learning_rate": 6.599723326237098e-05, + "loss": 1.7877, + "step": 13504 + }, + { + "epoch": 4.145181092694905, + "grad_norm": 0.2266562283039093, + "learning_rate": 6.599252389483413e-05, + "loss": 1.8097, + "step": 13505 + }, + { + "epoch": 4.14548802946593, + "grad_norm": 0.2053738683462143, + "learning_rate": 6.59878143692498e-05, + "loss": 1.6878, + "step": 13506 + }, + { + "epoch": 4.145794966236955, + "grad_norm": 0.19583995640277863, + "learning_rate": 6.598310468566452e-05, + "loss": 1.7547, + "step": 13507 + }, + { + "epoch": 4.14610190300798, + "grad_norm": 0.23421542346477509, + "learning_rate": 6.597839484412484e-05, + "loss": 1.7926, + "step": 13508 + }, + { + "epoch": 4.1464088397790055, + "grad_norm": 0.24575260281562805, + "learning_rate": 6.597368484467728e-05, + "loss": 1.7311, + "step": 13509 + }, + { + "epoch": 4.146715776550031, + "grad_norm": 0.27519574761390686, + "learning_rate": 6.596897468736842e-05, + "loss": 1.7858, + "step": 13510 + }, + { + "epoch": 4.147022713321056, + "grad_norm": 0.26434022188186646, + "learning_rate": 6.596426437224477e-05, + "loss": 1.7387, + "step": 13511 + }, + { + "epoch": 4.147329650092081, + "grad_norm": 0.2192772775888443, + "learning_rate": 6.595955389935291e-05, + "loss": 1.7565, + "step": 13512 + }, + { + "epoch": 4.147636586863106, + "grad_norm": 0.21047350764274597, + "learning_rate": 6.595484326873938e-05, + "loss": 1.7234, + "step": 13513 + }, + { + "epoch": 4.147943523634131, + "grad_norm": 0.22838951647281647, + "learning_rate": 
6.595013248045075e-05, + "loss": 1.8205, + "step": 13514 + }, + { + "epoch": 4.148250460405157, + "grad_norm": 0.3467923402786255, + "learning_rate": 6.594542153453356e-05, + "loss": 1.7973, + "step": 13515 + }, + { + "epoch": 4.148557397176182, + "grad_norm": 0.241237074136734, + "learning_rate": 6.594071043103438e-05, + "loss": 1.7764, + "step": 13516 + }, + { + "epoch": 4.148864333947207, + "grad_norm": 0.22543516755104065, + "learning_rate": 6.593599916999973e-05, + "loss": 1.7528, + "step": 13517 + }, + { + "epoch": 4.149171270718232, + "grad_norm": 0.24590276181697845, + "learning_rate": 6.593128775147623e-05, + "loss": 1.7422, + "step": 13518 + }, + { + "epoch": 4.149478207489257, + "grad_norm": 0.2434391975402832, + "learning_rate": 6.592657617551038e-05, + "loss": 1.7523, + "step": 13519 + }, + { + "epoch": 4.149785144260282, + "grad_norm": 0.23169009387493134, + "learning_rate": 6.592186444214877e-05, + "loss": 1.8158, + "step": 13520 + }, + { + "epoch": 4.150092081031308, + "grad_norm": 0.2217840999364853, + "learning_rate": 6.591715255143798e-05, + "loss": 1.7487, + "step": 13521 + }, + { + "epoch": 4.150399017802333, + "grad_norm": 0.2405092418193817, + "learning_rate": 6.591244050342454e-05, + "loss": 1.7726, + "step": 13522 + }, + { + "epoch": 4.150705954573358, + "grad_norm": 0.29432612657546997, + "learning_rate": 6.590772829815504e-05, + "loss": 1.7841, + "step": 13523 + }, + { + "epoch": 4.151012891344383, + "grad_norm": 0.2708737850189209, + "learning_rate": 6.590301593567605e-05, + "loss": 1.8551, + "step": 13524 + }, + { + "epoch": 4.151319828115408, + "grad_norm": 0.26643216609954834, + "learning_rate": 6.589830341603413e-05, + "loss": 1.7697, + "step": 13525 + }, + { + "epoch": 4.151626764886434, + "grad_norm": 0.3672652840614319, + "learning_rate": 6.589359073927587e-05, + "loss": 1.8292, + "step": 13526 + }, + { + "epoch": 4.151933701657459, + "grad_norm": 0.2413325160741806, + "learning_rate": 6.588887790544782e-05, + "loss": 1.7514, + 
"step": 13527 + }, + { + "epoch": 4.152240638428483, + "grad_norm": 0.3248155117034912, + "learning_rate": 6.588416491459657e-05, + "loss": 1.7437, + "step": 13528 + }, + { + "epoch": 4.152547575199509, + "grad_norm": 0.40951836109161377, + "learning_rate": 6.587945176676869e-05, + "loss": 1.7779, + "step": 13529 + }, + { + "epoch": 4.152854511970534, + "grad_norm": 0.23874351382255554, + "learning_rate": 6.587473846201075e-05, + "loss": 1.8343, + "step": 13530 + }, + { + "epoch": 4.153161448741559, + "grad_norm": 0.4535207450389862, + "learning_rate": 6.587002500036936e-05, + "loss": 1.8301, + "step": 13531 + }, + { + "epoch": 4.153468385512585, + "grad_norm": 0.458003968000412, + "learning_rate": 6.586531138189108e-05, + "loss": 1.7053, + "step": 13532 + }, + { + "epoch": 4.153775322283609, + "grad_norm": 0.24350887537002563, + "learning_rate": 6.586059760662248e-05, + "loss": 1.7642, + "step": 13533 + }, + { + "epoch": 4.1540822590546345, + "grad_norm": 0.46951553225517273, + "learning_rate": 6.585588367461017e-05, + "loss": 1.7345, + "step": 13534 + }, + { + "epoch": 4.15438919582566, + "grad_norm": 0.5524527430534363, + "learning_rate": 6.585116958590072e-05, + "loss": 1.7677, + "step": 13535 + }, + { + "epoch": 4.154696132596685, + "grad_norm": 0.2887112498283386, + "learning_rate": 6.584645534054072e-05, + "loss": 1.7704, + "step": 13536 + }, + { + "epoch": 4.1550030693677105, + "grad_norm": 0.36243724822998047, + "learning_rate": 6.584174093857675e-05, + "loss": 1.8133, + "step": 13537 + }, + { + "epoch": 4.155310006138736, + "grad_norm": 0.3869550824165344, + "learning_rate": 6.583702638005543e-05, + "loss": 1.7253, + "step": 13538 + }, + { + "epoch": 4.15561694290976, + "grad_norm": 0.25859662890434265, + "learning_rate": 6.583231166502333e-05, + "loss": 1.7683, + "step": 13539 + }, + { + "epoch": 4.155923879680786, + "grad_norm": 0.3011144995689392, + "learning_rate": 6.582759679352704e-05, + "loss": 1.7139, + "step": 13540 + }, + { + "epoch": 
4.156230816451811, + "grad_norm": 0.38033372163772583, + "learning_rate": 6.582288176561316e-05, + "loss": 1.8182, + "step": 13541 + }, + { + "epoch": 4.156537753222836, + "grad_norm": 0.2224060595035553, + "learning_rate": 6.581816658132829e-05, + "loss": 1.7527, + "step": 13542 + }, + { + "epoch": 4.156844689993862, + "grad_norm": 0.4147234261035919, + "learning_rate": 6.581345124071903e-05, + "loss": 1.7339, + "step": 13543 + }, + { + "epoch": 4.157151626764886, + "grad_norm": 0.45334625244140625, + "learning_rate": 6.580873574383198e-05, + "loss": 1.8166, + "step": 13544 + }, + { + "epoch": 4.157458563535911, + "grad_norm": 0.3050530254840851, + "learning_rate": 6.580402009071372e-05, + "loss": 1.7967, + "step": 13545 + }, + { + "epoch": 4.157765500306937, + "grad_norm": 0.25901293754577637, + "learning_rate": 6.579930428141088e-05, + "loss": 1.7806, + "step": 13546 + }, + { + "epoch": 4.158072437077962, + "grad_norm": 0.3142934739589691, + "learning_rate": 6.579458831597006e-05, + "loss": 1.7724, + "step": 13547 + }, + { + "epoch": 4.158379373848987, + "grad_norm": 0.23943179845809937, + "learning_rate": 6.578987219443787e-05, + "loss": 1.7515, + "step": 13548 + }, + { + "epoch": 4.158686310620013, + "grad_norm": 0.2838635742664337, + "learning_rate": 6.578515591686089e-05, + "loss": 1.7707, + "step": 13549 + }, + { + "epoch": 4.158993247391037, + "grad_norm": 0.3064457178115845, + "learning_rate": 6.578043948328575e-05, + "loss": 1.7839, + "step": 13550 + }, + { + "epoch": 4.1593001841620625, + "grad_norm": 0.2311718463897705, + "learning_rate": 6.577572289375907e-05, + "loss": 1.8298, + "step": 13551 + }, + { + "epoch": 4.159607120933088, + "grad_norm": 0.35726481676101685, + "learning_rate": 6.577100614832743e-05, + "loss": 1.811, + "step": 13552 + }, + { + "epoch": 4.159914057704113, + "grad_norm": 0.3176140785217285, + "learning_rate": 6.576628924703749e-05, + "loss": 1.732, + "step": 13553 + }, + { + "epoch": 4.1602209944751385, + "grad_norm": 
0.2325647473335266, + "learning_rate": 6.576157218993582e-05, + "loss": 1.827, + "step": 13554 + }, + { + "epoch": 4.160527931246163, + "grad_norm": 0.32260453701019287, + "learning_rate": 6.575685497706905e-05, + "loss": 1.8218, + "step": 13555 + }, + { + "epoch": 4.160834868017188, + "grad_norm": 0.2638537287712097, + "learning_rate": 6.575213760848382e-05, + "loss": 1.7091, + "step": 13556 + }, + { + "epoch": 4.161141804788214, + "grad_norm": 0.2501799762248993, + "learning_rate": 6.574742008422671e-05, + "loss": 1.7707, + "step": 13557 + }, + { + "epoch": 4.161448741559239, + "grad_norm": 0.3212645649909973, + "learning_rate": 6.574270240434439e-05, + "loss": 1.7541, + "step": 13558 + }, + { + "epoch": 4.161755678330264, + "grad_norm": 0.25915586948394775, + "learning_rate": 6.573798456888345e-05, + "loss": 1.7597, + "step": 13559 + }, + { + "epoch": 4.162062615101289, + "grad_norm": 0.2538192868232727, + "learning_rate": 6.573326657789052e-05, + "loss": 1.8507, + "step": 13560 + }, + { + "epoch": 4.162369551872314, + "grad_norm": 0.2542131543159485, + "learning_rate": 6.572854843141223e-05, + "loss": 1.782, + "step": 13561 + }, + { + "epoch": 4.162676488643339, + "grad_norm": 0.26163414120674133, + "learning_rate": 6.572383012949521e-05, + "loss": 1.8482, + "step": 13562 + }, + { + "epoch": 4.162983425414365, + "grad_norm": 0.2566238343715668, + "learning_rate": 6.571911167218608e-05, + "loss": 1.7284, + "step": 13563 + }, + { + "epoch": 4.16329036218539, + "grad_norm": 0.28413113951683044, + "learning_rate": 6.571439305953147e-05, + "loss": 1.7473, + "step": 13564 + }, + { + "epoch": 4.163597298956415, + "grad_norm": 0.20399242639541626, + "learning_rate": 6.570967429157802e-05, + "loss": 1.6942, + "step": 13565 + }, + { + "epoch": 4.16390423572744, + "grad_norm": 0.256104439496994, + "learning_rate": 6.570495536837235e-05, + "loss": 1.7346, + "step": 13566 + }, + { + "epoch": 4.164211172498465, + "grad_norm": 0.350909560918808, + "learning_rate": 
6.570023628996112e-05, + "loss": 1.8284, + "step": 13567 + }, + { + "epoch": 4.1645181092694905, + "grad_norm": 0.23500367999076843, + "learning_rate": 6.569551705639096e-05, + "loss": 1.7504, + "step": 13568 + }, + { + "epoch": 4.164825046040516, + "grad_norm": 0.26683783531188965, + "learning_rate": 6.569079766770849e-05, + "loss": 1.7293, + "step": 13569 + }, + { + "epoch": 4.165131982811541, + "grad_norm": 0.3145855963230133, + "learning_rate": 6.568607812396037e-05, + "loss": 1.8171, + "step": 13570 + }, + { + "epoch": 4.165438919582566, + "grad_norm": 0.2354860156774521, + "learning_rate": 6.568135842519324e-05, + "loss": 1.7555, + "step": 13571 + }, + { + "epoch": 4.165745856353591, + "grad_norm": 0.2893243730068207, + "learning_rate": 6.56766385714537e-05, + "loss": 1.7636, + "step": 13572 + }, + { + "epoch": 4.166052793124616, + "grad_norm": 0.20707663893699646, + "learning_rate": 6.567191856278846e-05, + "loss": 1.7239, + "step": 13573 + }, + { + "epoch": 4.166359729895642, + "grad_norm": 0.34200331568717957, + "learning_rate": 6.566719839924412e-05, + "loss": 1.7848, + "step": 13574 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.23326615989208221, + "learning_rate": 6.566247808086734e-05, + "loss": 1.7447, + "step": 13575 + }, + { + "epoch": 4.166973603437691, + "grad_norm": 0.22375629842281342, + "learning_rate": 6.565775760770479e-05, + "loss": 1.7429, + "step": 13576 + }, + { + "epoch": 4.167280540208717, + "grad_norm": 0.2412862777709961, + "learning_rate": 6.565303697980308e-05, + "loss": 1.7671, + "step": 13577 + }, + { + "epoch": 4.167587476979742, + "grad_norm": 0.2482215315103531, + "learning_rate": 6.56483161972089e-05, + "loss": 1.812, + "step": 13578 + }, + { + "epoch": 4.167894413750767, + "grad_norm": 0.2252974659204483, + "learning_rate": 6.564359525996889e-05, + "loss": 1.8173, + "step": 13579 + }, + { + "epoch": 4.168201350521793, + "grad_norm": 0.23497292399406433, + "learning_rate": 6.563887416812969e-05, + "loss": 1.7945, + 
"step": 13580 + }, + { + "epoch": 4.168508287292818, + "grad_norm": 0.24911245703697205, + "learning_rate": 6.563415292173796e-05, + "loss": 1.7516, + "step": 13581 + }, + { + "epoch": 4.1688152240638425, + "grad_norm": 0.20920930802822113, + "learning_rate": 6.562943152084039e-05, + "loss": 1.765, + "step": 13582 + }, + { + "epoch": 4.169122160834868, + "grad_norm": 0.26001816987991333, + "learning_rate": 6.562470996548361e-05, + "loss": 1.7504, + "step": 13583 + }, + { + "epoch": 4.169429097605893, + "grad_norm": 0.2504529058933258, + "learning_rate": 6.561998825571429e-05, + "loss": 1.7689, + "step": 13584 + }, + { + "epoch": 4.1697360343769185, + "grad_norm": 0.2210187464952469, + "learning_rate": 6.561526639157908e-05, + "loss": 1.752, + "step": 13585 + }, + { + "epoch": 4.170042971147944, + "grad_norm": 0.26323240995407104, + "learning_rate": 6.561054437312467e-05, + "loss": 1.8104, + "step": 13586 + }, + { + "epoch": 4.170349907918968, + "grad_norm": 0.20436744391918182, + "learning_rate": 6.560582220039771e-05, + "loss": 1.7281, + "step": 13587 + }, + { + "epoch": 4.170656844689994, + "grad_norm": 0.2053878903388977, + "learning_rate": 6.560109987344487e-05, + "loss": 1.7192, + "step": 13588 + }, + { + "epoch": 4.170963781461019, + "grad_norm": 0.2416568547487259, + "learning_rate": 6.559637739231281e-05, + "loss": 1.7679, + "step": 13589 + }, + { + "epoch": 4.171270718232044, + "grad_norm": 0.23847989737987518, + "learning_rate": 6.55916547570482e-05, + "loss": 1.7182, + "step": 13590 + }, + { + "epoch": 4.17157765500307, + "grad_norm": 0.2057785540819168, + "learning_rate": 6.558693196769772e-05, + "loss": 1.816, + "step": 13591 + }, + { + "epoch": 4.171884591774095, + "grad_norm": 0.2270805537700653, + "learning_rate": 6.558220902430804e-05, + "loss": 1.7091, + "step": 13592 + }, + { + "epoch": 4.172191528545119, + "grad_norm": 0.22143644094467163, + "learning_rate": 6.557748592692585e-05, + "loss": 1.7446, + "step": 13593 + }, + { + "epoch": 
4.172498465316145, + "grad_norm": 0.2032770961523056, + "learning_rate": 6.557276267559781e-05, + "loss": 1.7501, + "step": 13594 + }, + { + "epoch": 4.17280540208717, + "grad_norm": 0.20851244032382965, + "learning_rate": 6.55680392703706e-05, + "loss": 1.8283, + "step": 13595 + }, + { + "epoch": 4.173112338858195, + "grad_norm": 0.2603934109210968, + "learning_rate": 6.55633157112909e-05, + "loss": 1.8523, + "step": 13596 + }, + { + "epoch": 4.173419275629221, + "grad_norm": 0.2232515811920166, + "learning_rate": 6.55585919984054e-05, + "loss": 1.7803, + "step": 13597 + }, + { + "epoch": 4.173726212400245, + "grad_norm": 0.2541115880012512, + "learning_rate": 6.555386813176075e-05, + "loss": 1.7407, + "step": 13598 + }, + { + "epoch": 4.1740331491712706, + "grad_norm": 0.3044603765010834, + "learning_rate": 6.55491441114037e-05, + "loss": 1.8257, + "step": 13599 + }, + { + "epoch": 4.174340085942296, + "grad_norm": 0.29227301478385925, + "learning_rate": 6.554441993738086e-05, + "loss": 1.7998, + "step": 13600 + }, + { + "epoch": 4.174647022713321, + "grad_norm": 0.25166594982147217, + "learning_rate": 6.553969560973896e-05, + "loss": 1.8258, + "step": 13601 + }, + { + "epoch": 4.1749539594843466, + "grad_norm": 0.22973991930484772, + "learning_rate": 6.55349711285247e-05, + "loss": 1.7871, + "step": 13602 + }, + { + "epoch": 4.175260896255371, + "grad_norm": 0.2615009844303131, + "learning_rate": 6.553024649378473e-05, + "loss": 1.7572, + "step": 13603 + }, + { + "epoch": 4.175567833026396, + "grad_norm": 0.24145473539829254, + "learning_rate": 6.552552170556576e-05, + "loss": 1.7546, + "step": 13604 + }, + { + "epoch": 4.175874769797422, + "grad_norm": 0.21989156305789948, + "learning_rate": 6.55207967639145e-05, + "loss": 1.6939, + "step": 13605 + }, + { + "epoch": 4.176181706568447, + "grad_norm": 0.206025168299675, + "learning_rate": 6.551607166887761e-05, + "loss": 1.7531, + "step": 13606 + }, + { + "epoch": 4.176488643339472, + "grad_norm": 
0.2175903469324112, + "learning_rate": 6.551134642050181e-05, + "loss": 1.7631, + "step": 13607 + }, + { + "epoch": 4.176795580110497, + "grad_norm": 0.23259282112121582, + "learning_rate": 6.550662101883379e-05, + "loss": 1.7773, + "step": 13608 + }, + { + "epoch": 4.177102516881522, + "grad_norm": 0.23955227434635162, + "learning_rate": 6.550189546392025e-05, + "loss": 1.7321, + "step": 13609 + }, + { + "epoch": 4.1774094536525475, + "grad_norm": 0.23614998161792755, + "learning_rate": 6.549716975580792e-05, + "loss": 1.7855, + "step": 13610 + }, + { + "epoch": 4.177716390423573, + "grad_norm": 0.2274426817893982, + "learning_rate": 6.549244389454345e-05, + "loss": 1.7778, + "step": 13611 + }, + { + "epoch": 4.178023327194598, + "grad_norm": 0.2204308807849884, + "learning_rate": 6.548771788017358e-05, + "loss": 1.7175, + "step": 13612 + }, + { + "epoch": 4.1783302639656235, + "grad_norm": 0.2283930778503418, + "learning_rate": 6.548299171274501e-05, + "loss": 1.8081, + "step": 13613 + }, + { + "epoch": 4.178637200736648, + "grad_norm": 0.25433486700057983, + "learning_rate": 6.547826539230442e-05, + "loss": 1.8009, + "step": 13614 + }, + { + "epoch": 4.178944137507673, + "grad_norm": 0.24452579021453857, + "learning_rate": 6.547353891889856e-05, + "loss": 1.7244, + "step": 13615 + }, + { + "epoch": 4.179251074278699, + "grad_norm": 0.20611275732517242, + "learning_rate": 6.546881229257411e-05, + "loss": 1.7566, + "step": 13616 + }, + { + "epoch": 4.179558011049724, + "grad_norm": 0.24557232856750488, + "learning_rate": 6.546408551337779e-05, + "loss": 1.7638, + "step": 13617 + }, + { + "epoch": 4.179864947820749, + "grad_norm": 0.2158801257610321, + "learning_rate": 6.545935858135631e-05, + "loss": 1.7659, + "step": 13618 + }, + { + "epoch": 4.180171884591774, + "grad_norm": 0.23800688982009888, + "learning_rate": 6.54546314965564e-05, + "loss": 1.7468, + "step": 13619 + }, + { + "epoch": 4.180478821362799, + "grad_norm": 0.2504122853279114, + "learning_rate": 
6.544990425902476e-05, + "loss": 1.7682, + "step": 13620 + }, + { + "epoch": 4.180785758133824, + "grad_norm": 0.21556814014911652, + "learning_rate": 6.54451768688081e-05, + "loss": 1.772, + "step": 13621 + }, + { + "epoch": 4.18109269490485, + "grad_norm": 0.23404552042484283, + "learning_rate": 6.544044932595315e-05, + "loss": 1.7844, + "step": 13622 + }, + { + "epoch": 4.181399631675875, + "grad_norm": 0.22129055857658386, + "learning_rate": 6.543572163050664e-05, + "loss": 1.7725, + "step": 13623 + }, + { + "epoch": 4.1817065684469, + "grad_norm": 0.2533521354198456, + "learning_rate": 6.543099378251528e-05, + "loss": 1.7908, + "step": 13624 + }, + { + "epoch": 4.182013505217925, + "grad_norm": 0.2905815541744232, + "learning_rate": 6.542626578202579e-05, + "loss": 1.7913, + "step": 13625 + }, + { + "epoch": 4.18232044198895, + "grad_norm": 0.3330783247947693, + "learning_rate": 6.54215376290849e-05, + "loss": 1.8374, + "step": 13626 + }, + { + "epoch": 4.1826273787599755, + "grad_norm": 0.29268717765808105, + "learning_rate": 6.541680932373933e-05, + "loss": 1.8714, + "step": 13627 + }, + { + "epoch": 4.182934315531001, + "grad_norm": 0.2820781171321869, + "learning_rate": 6.541208086603584e-05, + "loss": 1.8089, + "step": 13628 + }, + { + "epoch": 4.183241252302026, + "grad_norm": 0.3062323033809662, + "learning_rate": 6.54073522560211e-05, + "loss": 1.7307, + "step": 13629 + }, + { + "epoch": 4.183548189073051, + "grad_norm": 0.3010510504245758, + "learning_rate": 6.54026234937419e-05, + "loss": 1.7523, + "step": 13630 + }, + { + "epoch": 4.183855125844076, + "grad_norm": 0.21932095289230347, + "learning_rate": 6.539789457924493e-05, + "loss": 1.737, + "step": 13631 + }, + { + "epoch": 4.184162062615101, + "grad_norm": 0.2710212469100952, + "learning_rate": 6.539316551257695e-05, + "loss": 1.7228, + "step": 13632 + }, + { + "epoch": 4.184468999386127, + "grad_norm": 0.2885816991329193, + "learning_rate": 6.538843629378469e-05, + "loss": 1.8734, + "step": 
13633 + }, + { + "epoch": 4.184775936157152, + "grad_norm": 0.2621026635169983, + "learning_rate": 6.538370692291487e-05, + "loss": 1.7884, + "step": 13634 + }, + { + "epoch": 4.185082872928176, + "grad_norm": 0.30503126978874207, + "learning_rate": 6.537897740001426e-05, + "loss": 1.7833, + "step": 13635 + }, + { + "epoch": 4.185389809699202, + "grad_norm": 0.29491373896598816, + "learning_rate": 6.537424772512955e-05, + "loss": 1.7894, + "step": 13636 + }, + { + "epoch": 4.185696746470227, + "grad_norm": 0.24423296749591827, + "learning_rate": 6.536951789830754e-05, + "loss": 1.7409, + "step": 13637 + }, + { + "epoch": 4.186003683241252, + "grad_norm": 0.2184748351573944, + "learning_rate": 6.536478791959495e-05, + "loss": 1.747, + "step": 13638 + }, + { + "epoch": 4.186310620012278, + "grad_norm": 0.2348455935716629, + "learning_rate": 6.53600577890385e-05, + "loss": 1.7422, + "step": 13639 + }, + { + "epoch": 4.186617556783303, + "grad_norm": 0.2554566264152527, + "learning_rate": 6.535532750668497e-05, + "loss": 1.7623, + "step": 13640 + }, + { + "epoch": 4.1869244935543275, + "grad_norm": 0.26424553990364075, + "learning_rate": 6.535059707258109e-05, + "loss": 1.8408, + "step": 13641 + }, + { + "epoch": 4.187231430325353, + "grad_norm": 0.35363274812698364, + "learning_rate": 6.534586648677361e-05, + "loss": 1.7435, + "step": 13642 + }, + { + "epoch": 4.187538367096378, + "grad_norm": 0.3225265443325043, + "learning_rate": 6.534113574930926e-05, + "loss": 1.7181, + "step": 13643 + }, + { + "epoch": 4.1878453038674035, + "grad_norm": 0.23529650270938873, + "learning_rate": 6.533640486023485e-05, + "loss": 1.7712, + "step": 13644 + }, + { + "epoch": 4.188152240638429, + "grad_norm": 0.3490132987499237, + "learning_rate": 6.53316738195971e-05, + "loss": 1.7329, + "step": 13645 + }, + { + "epoch": 4.188459177409453, + "grad_norm": 0.3759285509586334, + "learning_rate": 6.532694262744274e-05, + "loss": 1.802, + "step": 13646 + }, + { + "epoch": 4.188766114180479, 
+ "grad_norm": 0.27383577823638916, + "learning_rate": 6.532221128381858e-05, + "loss": 1.801, + "step": 13647 + }, + { + "epoch": 4.189073050951504, + "grad_norm": 0.23240652680397034, + "learning_rate": 6.531747978877132e-05, + "loss": 1.8415, + "step": 13648 + }, + { + "epoch": 4.189379987722529, + "grad_norm": 0.3302704989910126, + "learning_rate": 6.531274814234773e-05, + "loss": 1.7765, + "step": 13649 + }, + { + "epoch": 4.189686924493555, + "grad_norm": 0.3209368586540222, + "learning_rate": 6.530801634459463e-05, + "loss": 1.6935, + "step": 13650 + }, + { + "epoch": 4.189993861264579, + "grad_norm": 0.26643648743629456, + "learning_rate": 6.530328439555872e-05, + "loss": 1.8159, + "step": 13651 + }, + { + "epoch": 4.190300798035604, + "grad_norm": 0.22594431042671204, + "learning_rate": 6.529855229528679e-05, + "loss": 1.7764, + "step": 13652 + }, + { + "epoch": 4.19060773480663, + "grad_norm": 0.3288109302520752, + "learning_rate": 6.529382004382561e-05, + "loss": 1.7963, + "step": 13653 + }, + { + "epoch": 4.190914671577655, + "grad_norm": 0.3067106604576111, + "learning_rate": 6.528908764122191e-05, + "loss": 1.7564, + "step": 13654 + }, + { + "epoch": 4.19122160834868, + "grad_norm": 0.23437078297138214, + "learning_rate": 6.528435508752249e-05, + "loss": 1.759, + "step": 13655 + }, + { + "epoch": 4.191528545119706, + "grad_norm": 0.30662333965301514, + "learning_rate": 6.527962238277413e-05, + "loss": 1.7549, + "step": 13656 + }, + { + "epoch": 4.19183548189073, + "grad_norm": 0.3545009195804596, + "learning_rate": 6.527488952702356e-05, + "loss": 1.7761, + "step": 13657 + }, + { + "epoch": 4.1921424186617555, + "grad_norm": 0.2509438991546631, + "learning_rate": 6.52701565203176e-05, + "loss": 1.7162, + "step": 13658 + }, + { + "epoch": 4.192449355432781, + "grad_norm": 0.24423806369304657, + "learning_rate": 6.5265423362703e-05, + "loss": 1.735, + "step": 13659 + }, + { + "epoch": 4.192756292203806, + "grad_norm": 0.37365156412124634, + 
"learning_rate": 6.526069005422654e-05, + "loss": 1.7697, + "step": 13660 + }, + { + "epoch": 4.1930632289748315, + "grad_norm": 0.4025731682777405, + "learning_rate": 6.525595659493499e-05, + "loss": 1.7931, + "step": 13661 + }, + { + "epoch": 4.193370165745856, + "grad_norm": 0.31360915303230286, + "learning_rate": 6.525122298487514e-05, + "loss": 1.8014, + "step": 13662 + }, + { + "epoch": 4.193677102516881, + "grad_norm": 0.2480524778366089, + "learning_rate": 6.524648922409376e-05, + "loss": 1.7753, + "step": 13663 + }, + { + "epoch": 4.193984039287907, + "grad_norm": 0.33740919828414917, + "learning_rate": 6.524175531263765e-05, + "loss": 1.7296, + "step": 13664 + }, + { + "epoch": 4.194290976058932, + "grad_norm": 0.26871639490127563, + "learning_rate": 6.523702125055358e-05, + "loss": 1.7113, + "step": 13665 + }, + { + "epoch": 4.194597912829957, + "grad_norm": 0.2687455415725708, + "learning_rate": 6.52322870378883e-05, + "loss": 1.7645, + "step": 13666 + }, + { + "epoch": 4.194904849600983, + "grad_norm": 0.4207400679588318, + "learning_rate": 6.522755267468868e-05, + "loss": 1.7758, + "step": 13667 + }, + { + "epoch": 4.195211786372007, + "grad_norm": 0.36043494939804077, + "learning_rate": 6.522281816100142e-05, + "loss": 1.7433, + "step": 13668 + }, + { + "epoch": 4.195518723143032, + "grad_norm": 0.2515890598297119, + "learning_rate": 6.52180834968734e-05, + "loss": 1.7646, + "step": 13669 + }, + { + "epoch": 4.195825659914058, + "grad_norm": 0.2871458828449249, + "learning_rate": 6.521334868235132e-05, + "loss": 1.8147, + "step": 13670 + }, + { + "epoch": 4.196132596685083, + "grad_norm": 0.28454354405403137, + "learning_rate": 6.5208613717482e-05, + "loss": 1.8576, + "step": 13671 + }, + { + "epoch": 4.196439533456108, + "grad_norm": 0.2520541548728943, + "learning_rate": 6.520387860231227e-05, + "loss": 1.7513, + "step": 13672 + }, + { + "epoch": 4.196746470227133, + "grad_norm": 0.22782307863235474, + "learning_rate": 6.51991433368889e-05, + 
"loss": 1.7737, + "step": 13673 + }, + { + "epoch": 4.197053406998158, + "grad_norm": 0.2451259195804596, + "learning_rate": 6.519440792125869e-05, + "loss": 1.7483, + "step": 13674 + }, + { + "epoch": 4.1973603437691835, + "grad_norm": 0.21915963292121887, + "learning_rate": 6.518967235546841e-05, + "loss": 1.718, + "step": 13675 + }, + { + "epoch": 4.197667280540209, + "grad_norm": 0.23005805909633636, + "learning_rate": 6.51849366395649e-05, + "loss": 1.7786, + "step": 13676 + }, + { + "epoch": 4.197974217311234, + "grad_norm": 0.25039517879486084, + "learning_rate": 6.518020077359494e-05, + "loss": 1.7785, + "step": 13677 + }, + { + "epoch": 4.198281154082259, + "grad_norm": 0.26631081104278564, + "learning_rate": 6.517546475760535e-05, + "loss": 1.7921, + "step": 13678 + }, + { + "epoch": 4.198588090853284, + "grad_norm": 0.2220793515443802, + "learning_rate": 6.517072859164292e-05, + "loss": 1.7696, + "step": 13679 + }, + { + "epoch": 4.198895027624309, + "grad_norm": 0.24681030213832855, + "learning_rate": 6.516599227575446e-05, + "loss": 1.7702, + "step": 13680 + }, + { + "epoch": 4.199201964395335, + "grad_norm": 0.2421828955411911, + "learning_rate": 6.516125580998678e-05, + "loss": 1.8058, + "step": 13681 + }, + { + "epoch": 4.19950890116636, + "grad_norm": 0.2170087695121765, + "learning_rate": 6.515651919438667e-05, + "loss": 1.7271, + "step": 13682 + }, + { + "epoch": 4.199815837937384, + "grad_norm": 0.23383566737174988, + "learning_rate": 6.515178242900096e-05, + "loss": 1.7515, + "step": 13683 + }, + { + "epoch": 4.20012277470841, + "grad_norm": 0.2522997558116913, + "learning_rate": 6.514704551387645e-05, + "loss": 1.7619, + "step": 13684 + }, + { + "epoch": 4.200429711479435, + "grad_norm": 0.20973703265190125, + "learning_rate": 6.514230844905995e-05, + "loss": 1.7326, + "step": 13685 + }, + { + "epoch": 4.2007366482504604, + "grad_norm": 0.2308073341846466, + "learning_rate": 6.513757123459832e-05, + "loss": 1.811, + "step": 13686 + }, + { + 
"epoch": 4.201043585021486, + "grad_norm": 0.21751229465007782, + "learning_rate": 6.51328338705383e-05, + "loss": 1.7795, + "step": 13687 + }, + { + "epoch": 4.201350521792511, + "grad_norm": 0.2357407957315445, + "learning_rate": 6.512809635692675e-05, + "loss": 1.8069, + "step": 13688 + }, + { + "epoch": 4.201657458563536, + "grad_norm": 0.32245033979415894, + "learning_rate": 6.51233586938105e-05, + "loss": 1.8179, + "step": 13689 + }, + { + "epoch": 4.201964395334561, + "grad_norm": 0.22740167379379272, + "learning_rate": 6.511862088123635e-05, + "loss": 1.7482, + "step": 13690 + }, + { + "epoch": 4.202271332105586, + "grad_norm": 0.26880496740341187, + "learning_rate": 6.511388291925114e-05, + "loss": 1.7919, + "step": 13691 + }, + { + "epoch": 4.202578268876612, + "grad_norm": 0.2261822521686554, + "learning_rate": 6.510914480790166e-05, + "loss": 1.7543, + "step": 13692 + }, + { + "epoch": 4.202885205647637, + "grad_norm": 0.2635782063007355, + "learning_rate": 6.510440654723477e-05, + "loss": 1.7874, + "step": 13693 + }, + { + "epoch": 4.203192142418661, + "grad_norm": 0.2505982518196106, + "learning_rate": 6.509966813729726e-05, + "loss": 1.8016, + "step": 13694 + }, + { + "epoch": 4.203499079189687, + "grad_norm": 0.23177236318588257, + "learning_rate": 6.5094929578136e-05, + "loss": 1.7582, + "step": 13695 + }, + { + "epoch": 4.203806015960712, + "grad_norm": 0.2315056324005127, + "learning_rate": 6.509019086979779e-05, + "loss": 1.7418, + "step": 13696 + }, + { + "epoch": 4.204112952731737, + "grad_norm": 0.25565484166145325, + "learning_rate": 6.508545201232947e-05, + "loss": 1.7476, + "step": 13697 + }, + { + "epoch": 4.204419889502763, + "grad_norm": 0.29210081696510315, + "learning_rate": 6.508071300577787e-05, + "loss": 1.8397, + "step": 13698 + }, + { + "epoch": 4.204726826273788, + "grad_norm": 0.2830582559108734, + "learning_rate": 6.507597385018984e-05, + "loss": 1.834, + "step": 13699 + }, + { + "epoch": 4.2050337630448125, + "grad_norm": 
0.23013398051261902, + "learning_rate": 6.507123454561217e-05, + "loss": 1.7593, + "step": 13700 + }, + { + "epoch": 4.205340699815838, + "grad_norm": 0.21970276534557343, + "learning_rate": 6.506649509209174e-05, + "loss": 1.754, + "step": 13701 + }, + { + "epoch": 4.205647636586863, + "grad_norm": 0.32052233815193176, + "learning_rate": 6.50617554896754e-05, + "loss": 1.7531, + "step": 13702 + }, + { + "epoch": 4.2059545733578885, + "grad_norm": 0.2597332000732422, + "learning_rate": 6.505701573840995e-05, + "loss": 1.7836, + "step": 13703 + }, + { + "epoch": 4.206261510128914, + "grad_norm": 0.22070355713367462, + "learning_rate": 6.505227583834224e-05, + "loss": 1.7225, + "step": 13704 + }, + { + "epoch": 4.206568446899938, + "grad_norm": 0.27219358086586, + "learning_rate": 6.50475357895191e-05, + "loss": 1.8215, + "step": 13705 + }, + { + "epoch": 4.206875383670964, + "grad_norm": 0.32541659474372864, + "learning_rate": 6.504279559198741e-05, + "loss": 1.7786, + "step": 13706 + }, + { + "epoch": 4.207182320441989, + "grad_norm": 0.25871729850769043, + "learning_rate": 6.5038055245794e-05, + "loss": 1.7621, + "step": 13707 + }, + { + "epoch": 4.207489257213014, + "grad_norm": 0.2190464735031128, + "learning_rate": 6.50333147509857e-05, + "loss": 1.7612, + "step": 13708 + }, + { + "epoch": 4.20779619398404, + "grad_norm": 0.19565832614898682, + "learning_rate": 6.50285741076094e-05, + "loss": 1.7581, + "step": 13709 + }, + { + "epoch": 4.208103130755064, + "grad_norm": 0.1889251321554184, + "learning_rate": 6.50238333157119e-05, + "loss": 1.7611, + "step": 13710 + }, + { + "epoch": 4.208410067526089, + "grad_norm": 0.2013053596019745, + "learning_rate": 6.501909237534008e-05, + "loss": 1.7393, + "step": 13711 + }, + { + "epoch": 4.208717004297115, + "grad_norm": 0.1899433434009552, + "learning_rate": 6.501435128654077e-05, + "loss": 1.7122, + "step": 13712 + }, + { + "epoch": 4.20902394106814, + "grad_norm": 0.19337882101535797, + "learning_rate": 
6.500961004936085e-05, + "loss": 1.7538, + "step": 13713 + }, + { + "epoch": 4.209330877839165, + "grad_norm": 0.20419920980930328, + "learning_rate": 6.500486866384718e-05, + "loss": 1.728, + "step": 13714 + }, + { + "epoch": 4.209637814610191, + "grad_norm": 0.20615679025650024, + "learning_rate": 6.50001271300466e-05, + "loss": 1.7843, + "step": 13715 + }, + { + "epoch": 4.209944751381215, + "grad_norm": 0.22178977727890015, + "learning_rate": 6.499538544800596e-05, + "loss": 1.7751, + "step": 13716 + }, + { + "epoch": 4.2102516881522405, + "grad_norm": 0.23703891038894653, + "learning_rate": 6.499064361777214e-05, + "loss": 1.7304, + "step": 13717 + }, + { + "epoch": 4.210558624923266, + "grad_norm": 0.2785723805427551, + "learning_rate": 6.498590163939198e-05, + "loss": 1.802, + "step": 13718 + }, + { + "epoch": 4.210865561694291, + "grad_norm": 0.23277060687541962, + "learning_rate": 6.498115951291237e-05, + "loss": 1.7316, + "step": 13719 + }, + { + "epoch": 4.2111724984653165, + "grad_norm": 0.22289474308490753, + "learning_rate": 6.497641723838017e-05, + "loss": 1.8469, + "step": 13720 + }, + { + "epoch": 4.211479435236341, + "grad_norm": 0.2715846002101898, + "learning_rate": 6.497167481584221e-05, + "loss": 1.7919, + "step": 13721 + }, + { + "epoch": 4.211786372007366, + "grad_norm": 0.29262226819992065, + "learning_rate": 6.49669322453454e-05, + "loss": 1.8379, + "step": 13722 + }, + { + "epoch": 4.212093308778392, + "grad_norm": 0.29136186838150024, + "learning_rate": 6.49621895269366e-05, + "loss": 1.789, + "step": 13723 + }, + { + "epoch": 4.212400245549417, + "grad_norm": 0.25110194087028503, + "learning_rate": 6.495744666066266e-05, + "loss": 1.7574, + "step": 13724 + }, + { + "epoch": 4.212707182320442, + "grad_norm": 0.2301366776227951, + "learning_rate": 6.495270364657048e-05, + "loss": 1.7637, + "step": 13725 + }, + { + "epoch": 4.213014119091467, + "grad_norm": 0.2556478977203369, + "learning_rate": 6.49479604847069e-05, + "loss": 1.7975, + 
"step": 13726 + }, + { + "epoch": 4.213321055862492, + "grad_norm": 0.2645667493343353, + "learning_rate": 6.494321717511884e-05, + "loss": 1.7594, + "step": 13727 + }, + { + "epoch": 4.213627992633517, + "grad_norm": 0.23664188385009766, + "learning_rate": 6.493847371785312e-05, + "loss": 1.7963, + "step": 13728 + }, + { + "epoch": 4.213934929404543, + "grad_norm": 0.2947930693626404, + "learning_rate": 6.493373011295665e-05, + "loss": 1.7477, + "step": 13729 + }, + { + "epoch": 4.214241866175568, + "grad_norm": 0.34598737955093384, + "learning_rate": 6.492898636047631e-05, + "loss": 1.7014, + "step": 13730 + }, + { + "epoch": 4.214548802946593, + "grad_norm": 0.24406935274600983, + "learning_rate": 6.4924242460459e-05, + "loss": 1.7436, + "step": 13731 + }, + { + "epoch": 4.214855739717618, + "grad_norm": 0.27176225185394287, + "learning_rate": 6.491949841295156e-05, + "loss": 1.8429, + "step": 13732 + }, + { + "epoch": 4.215162676488643, + "grad_norm": 0.2506968080997467, + "learning_rate": 6.491475421800089e-05, + "loss": 1.7519, + "step": 13733 + }, + { + "epoch": 4.2154696132596685, + "grad_norm": 0.2240980863571167, + "learning_rate": 6.491000987565387e-05, + "loss": 1.7595, + "step": 13734 + }, + { + "epoch": 4.215776550030694, + "grad_norm": 0.23201732337474823, + "learning_rate": 6.490526538595741e-05, + "loss": 1.7466, + "step": 13735 + }, + { + "epoch": 4.216083486801719, + "grad_norm": 0.24624750018119812, + "learning_rate": 6.490052074895836e-05, + "loss": 1.7364, + "step": 13736 + }, + { + "epoch": 4.216390423572744, + "grad_norm": 0.22936980426311493, + "learning_rate": 6.489577596470366e-05, + "loss": 1.7095, + "step": 13737 + }, + { + "epoch": 4.216697360343769, + "grad_norm": 0.2106638103723526, + "learning_rate": 6.489103103324016e-05, + "loss": 1.7387, + "step": 13738 + }, + { + "epoch": 4.217004297114794, + "grad_norm": 0.2936140298843384, + "learning_rate": 6.488628595461477e-05, + "loss": 1.9129, + "step": 13739 + }, + { + "epoch": 
4.21731123388582, + "grad_norm": 0.21871696412563324, + "learning_rate": 6.488154072887435e-05, + "loss": 1.7489, + "step": 13740 + }, + { + "epoch": 4.217618170656845, + "grad_norm": 0.25941070914268494, + "learning_rate": 6.487679535606583e-05, + "loss": 1.7788, + "step": 13741 + }, + { + "epoch": 4.21792510742787, + "grad_norm": 0.2540862560272217, + "learning_rate": 6.487204983623612e-05, + "loss": 1.8074, + "step": 13742 + }, + { + "epoch": 4.218232044198895, + "grad_norm": 0.25180327892303467, + "learning_rate": 6.486730416943207e-05, + "loss": 1.7503, + "step": 13743 + }, + { + "epoch": 4.21853898096992, + "grad_norm": 0.26625585556030273, + "learning_rate": 6.486255835570063e-05, + "loss": 1.8149, + "step": 13744 + }, + { + "epoch": 4.218845917740945, + "grad_norm": 0.3023914396762848, + "learning_rate": 6.485781239508867e-05, + "loss": 1.8599, + "step": 13745 + }, + { + "epoch": 4.219152854511971, + "grad_norm": 0.2683780789375305, + "learning_rate": 6.48530662876431e-05, + "loss": 1.7911, + "step": 13746 + }, + { + "epoch": 4.219459791282996, + "grad_norm": 0.20747442543506622, + "learning_rate": 6.484832003341081e-05, + "loss": 1.7343, + "step": 13747 + }, + { + "epoch": 4.2197667280540205, + "grad_norm": 0.29284465312957764, + "learning_rate": 6.484357363243873e-05, + "loss": 1.7917, + "step": 13748 + }, + { + "epoch": 4.220073664825046, + "grad_norm": 0.24303840100765228, + "learning_rate": 6.483882708477376e-05, + "loss": 1.7921, + "step": 13749 + }, + { + "epoch": 4.220380601596071, + "grad_norm": 0.26253026723861694, + "learning_rate": 6.48340803904628e-05, + "loss": 1.7971, + "step": 13750 + }, + { + "epoch": 4.2206875383670965, + "grad_norm": 0.23888511955738068, + "learning_rate": 6.482933354955275e-05, + "loss": 1.7967, + "step": 13751 + }, + { + "epoch": 4.220994475138122, + "grad_norm": 0.24966883659362793, + "learning_rate": 6.482458656209054e-05, + "loss": 1.7924, + "step": 13752 + }, + { + "epoch": 4.221301411909146, + "grad_norm": 
0.26556864380836487, + "learning_rate": 6.481983942812309e-05, + "loss": 1.8608, + "step": 13753 + }, + { + "epoch": 4.221608348680172, + "grad_norm": 0.29064711928367615, + "learning_rate": 6.48150921476973e-05, + "loss": 1.7785, + "step": 13754 + }, + { + "epoch": 4.221915285451197, + "grad_norm": 0.30876123905181885, + "learning_rate": 6.481034472086008e-05, + "loss": 1.8287, + "step": 13755 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.2622467875480652, + "learning_rate": 6.480559714765835e-05, + "loss": 1.8336, + "step": 13756 + }, + { + "epoch": 4.222529158993248, + "grad_norm": 0.2502644956111908, + "learning_rate": 6.480084942813902e-05, + "loss": 1.7803, + "step": 13757 + }, + { + "epoch": 4.222836095764273, + "grad_norm": 0.2879922688007355, + "learning_rate": 6.479610156234903e-05, + "loss": 1.7544, + "step": 13758 + }, + { + "epoch": 4.223143032535297, + "grad_norm": 0.2831384241580963, + "learning_rate": 6.47913535503353e-05, + "loss": 1.887, + "step": 13759 + }, + { + "epoch": 4.223449969306323, + "grad_norm": 0.3221064805984497, + "learning_rate": 6.478660539214474e-05, + "loss": 1.7455, + "step": 13760 + }, + { + "epoch": 4.223756906077348, + "grad_norm": 0.4231930673122406, + "learning_rate": 6.478185708782427e-05, + "loss": 1.8209, + "step": 13761 + }, + { + "epoch": 4.224063842848373, + "grad_norm": 0.34327802062034607, + "learning_rate": 6.477710863742083e-05, + "loss": 1.7754, + "step": 13762 + }, + { + "epoch": 4.224370779619399, + "grad_norm": 0.21713349223136902, + "learning_rate": 6.477236004098135e-05, + "loss": 1.7576, + "step": 13763 + }, + { + "epoch": 4.224677716390423, + "grad_norm": 0.3262602388858795, + "learning_rate": 6.476761129855275e-05, + "loss": 1.7772, + "step": 13764 + }, + { + "epoch": 4.2249846531614486, + "grad_norm": 0.3231413662433624, + "learning_rate": 6.476286241018195e-05, + "loss": 1.7821, + "step": 13765 + }, + { + "epoch": 4.225291589932474, + "grad_norm": 0.2440098226070404, + "learning_rate": 
6.475811337591588e-05, + "loss": 1.7684, + "step": 13766 + }, + { + "epoch": 4.225598526703499, + "grad_norm": 0.329949289560318, + "learning_rate": 6.475336419580151e-05, + "loss": 1.8564, + "step": 13767 + }, + { + "epoch": 4.225905463474525, + "grad_norm": 0.3567483425140381, + "learning_rate": 6.474861486988574e-05, + "loss": 1.7625, + "step": 13768 + }, + { + "epoch": 4.226212400245549, + "grad_norm": 0.25257283449172974, + "learning_rate": 6.47438653982155e-05, + "loss": 1.823, + "step": 13769 + }, + { + "epoch": 4.226519337016574, + "grad_norm": 0.31542617082595825, + "learning_rate": 6.473911578083776e-05, + "loss": 1.7817, + "step": 13770 + }, + { + "epoch": 4.2268262737876, + "grad_norm": 0.29670149087905884, + "learning_rate": 6.473436601779944e-05, + "loss": 1.7493, + "step": 13771 + }, + { + "epoch": 4.227133210558625, + "grad_norm": 0.2635453939437866, + "learning_rate": 6.472961610914745e-05, + "loss": 1.792, + "step": 13772 + }, + { + "epoch": 4.22744014732965, + "grad_norm": 0.25017979741096497, + "learning_rate": 6.472486605492878e-05, + "loss": 1.7183, + "step": 13773 + }, + { + "epoch": 4.227747084100676, + "grad_norm": 0.3766646087169647, + "learning_rate": 6.472011585519034e-05, + "loss": 1.8039, + "step": 13774 + }, + { + "epoch": 4.2280540208717, + "grad_norm": 0.29860204458236694, + "learning_rate": 6.47153655099791e-05, + "loss": 1.8016, + "step": 13775 + }, + { + "epoch": 4.2283609576427255, + "grad_norm": 0.2540898323059082, + "learning_rate": 6.4710615019342e-05, + "loss": 1.8481, + "step": 13776 + }, + { + "epoch": 4.228667894413751, + "grad_norm": 0.3677786886692047, + "learning_rate": 6.470586438332597e-05, + "loss": 1.7663, + "step": 13777 + }, + { + "epoch": 4.228974831184776, + "grad_norm": 0.35693466663360596, + "learning_rate": 6.470111360197797e-05, + "loss": 1.7733, + "step": 13778 + }, + { + "epoch": 4.2292817679558015, + "grad_norm": 0.23747926950454712, + "learning_rate": 6.469636267534496e-05, + "loss": 1.7938, + "step": 
13779 + }, + { + "epoch": 4.229588704726826, + "grad_norm": 0.32890695333480835, + "learning_rate": 6.469161160347386e-05, + "loss": 1.7233, + "step": 13780 + }, + { + "epoch": 4.229895641497851, + "grad_norm": 0.3437706530094147, + "learning_rate": 6.468686038641164e-05, + "loss": 1.7716, + "step": 13781 + }, + { + "epoch": 4.230202578268877, + "grad_norm": 0.23452162742614746, + "learning_rate": 6.468210902420527e-05, + "loss": 1.764, + "step": 13782 + }, + { + "epoch": 4.230509515039902, + "grad_norm": 0.3205265402793884, + "learning_rate": 6.46773575169017e-05, + "loss": 1.7464, + "step": 13783 + }, + { + "epoch": 4.230816451810927, + "grad_norm": 0.4234732985496521, + "learning_rate": 6.467260586454787e-05, + "loss": 1.7786, + "step": 13784 + }, + { + "epoch": 4.231123388581952, + "grad_norm": 0.2484128773212433, + "learning_rate": 6.466785406719076e-05, + "loss": 1.8125, + "step": 13785 + }, + { + "epoch": 4.231430325352977, + "grad_norm": 0.3696556091308594, + "learning_rate": 6.46631021248773e-05, + "loss": 1.7974, + "step": 13786 + }, + { + "epoch": 4.231737262124002, + "grad_norm": 0.4251437485218048, + "learning_rate": 6.465835003765449e-05, + "loss": 1.7486, + "step": 13787 + }, + { + "epoch": 4.232044198895028, + "grad_norm": 0.2507621943950653, + "learning_rate": 6.465359780556927e-05, + "loss": 1.829, + "step": 13788 + }, + { + "epoch": 4.232351135666053, + "grad_norm": 0.2911818325519562, + "learning_rate": 6.464884542866861e-05, + "loss": 1.7401, + "step": 13789 + }, + { + "epoch": 4.232658072437078, + "grad_norm": 0.35354506969451904, + "learning_rate": 6.464409290699946e-05, + "loss": 1.7848, + "step": 13790 + }, + { + "epoch": 4.232965009208103, + "grad_norm": 0.2659081518650055, + "learning_rate": 6.46393402406088e-05, + "loss": 1.7408, + "step": 13791 + }, + { + "epoch": 4.233271945979128, + "grad_norm": 0.22676481306552887, + "learning_rate": 6.46345874295436e-05, + "loss": 1.7542, + "step": 13792 + }, + { + "epoch": 4.2335788827501535, + 
"grad_norm": 0.2549789845943451, + "learning_rate": 6.462983447385085e-05, + "loss": 1.8095, + "step": 13793 + }, + { + "epoch": 4.233885819521179, + "grad_norm": 0.2157238870859146, + "learning_rate": 6.462508137357748e-05, + "loss": 1.7529, + "step": 13794 + }, + { + "epoch": 4.234192756292204, + "grad_norm": 0.2494724988937378, + "learning_rate": 6.46203281287705e-05, + "loss": 1.7839, + "step": 13795 + }, + { + "epoch": 4.234499693063229, + "grad_norm": 0.29560065269470215, + "learning_rate": 6.461557473947685e-05, + "loss": 1.7239, + "step": 13796 + }, + { + "epoch": 4.234806629834254, + "grad_norm": 0.23693916201591492, + "learning_rate": 6.461082120574354e-05, + "loss": 1.8074, + "step": 13797 + }, + { + "epoch": 4.235113566605279, + "grad_norm": 0.2538869082927704, + "learning_rate": 6.460606752761752e-05, + "loss": 1.8319, + "step": 13798 + }, + { + "epoch": 4.235420503376305, + "grad_norm": 0.3186401426792145, + "learning_rate": 6.460131370514578e-05, + "loss": 1.7877, + "step": 13799 + }, + { + "epoch": 4.23572744014733, + "grad_norm": 0.2473619133234024, + "learning_rate": 6.45965597383753e-05, + "loss": 1.8323, + "step": 13800 + }, + { + "epoch": 4.236034376918354, + "grad_norm": 0.32806503772735596, + "learning_rate": 6.459180562735307e-05, + "loss": 1.744, + "step": 13801 + }, + { + "epoch": 4.23634131368938, + "grad_norm": 0.3975784480571747, + "learning_rate": 6.458705137212606e-05, + "loss": 1.7216, + "step": 13802 + }, + { + "epoch": 4.236648250460405, + "grad_norm": 0.2946135997772217, + "learning_rate": 6.458229697274125e-05, + "loss": 1.8781, + "step": 13803 + }, + { + "epoch": 4.23695518723143, + "grad_norm": 0.25109192728996277, + "learning_rate": 6.457754242924565e-05, + "loss": 1.7458, + "step": 13804 + }, + { + "epoch": 4.237262124002456, + "grad_norm": 0.2763883173465729, + "learning_rate": 6.457278774168623e-05, + "loss": 1.7612, + "step": 13805 + }, + { + "epoch": 4.237569060773481, + "grad_norm": 0.22427856922149658, + 
"learning_rate": 6.456803291010996e-05, + "loss": 1.8049, + "step": 13806 + }, + { + "epoch": 4.2378759975445055, + "grad_norm": 0.28295788168907166, + "learning_rate": 6.456327793456387e-05, + "loss": 1.7608, + "step": 13807 + }, + { + "epoch": 4.238182934315531, + "grad_norm": 0.27857527136802673, + "learning_rate": 6.455852281509493e-05, + "loss": 1.7281, + "step": 13808 + }, + { + "epoch": 4.238489871086556, + "grad_norm": 0.24014849960803986, + "learning_rate": 6.455376755175012e-05, + "loss": 1.7247, + "step": 13809 + }, + { + "epoch": 4.2387968078575815, + "grad_norm": 0.25149038434028625, + "learning_rate": 6.454901214457646e-05, + "loss": 1.8575, + "step": 13810 + }, + { + "epoch": 4.239103744628607, + "grad_norm": 0.32072681188583374, + "learning_rate": 6.454425659362093e-05, + "loss": 1.7421, + "step": 13811 + }, + { + "epoch": 4.239410681399631, + "grad_norm": 0.28418242931365967, + "learning_rate": 6.453950089893054e-05, + "loss": 1.7031, + "step": 13812 + }, + { + "epoch": 4.239717618170657, + "grad_norm": 0.23725132644176483, + "learning_rate": 6.453474506055228e-05, + "loss": 1.7901, + "step": 13813 + }, + { + "epoch": 4.240024554941682, + "grad_norm": 0.3056317865848541, + "learning_rate": 6.452998907853315e-05, + "loss": 1.7414, + "step": 13814 + }, + { + "epoch": 4.240331491712707, + "grad_norm": 0.3111891448497772, + "learning_rate": 6.452523295292013e-05, + "loss": 1.7532, + "step": 13815 + }, + { + "epoch": 4.240638428483733, + "grad_norm": 0.2126779705286026, + "learning_rate": 6.452047668376027e-05, + "loss": 1.6779, + "step": 13816 + }, + { + "epoch": 4.240945365254758, + "grad_norm": 0.26660779118537903, + "learning_rate": 6.451572027110054e-05, + "loss": 1.7162, + "step": 13817 + }, + { + "epoch": 4.241252302025782, + "grad_norm": 0.25901922583580017, + "learning_rate": 6.451096371498794e-05, + "loss": 1.7784, + "step": 13818 + }, + { + "epoch": 4.241559238796808, + "grad_norm": 0.24091807007789612, + "learning_rate": 
6.450620701546953e-05, + "loss": 1.7928, + "step": 13819 + }, + { + "epoch": 4.241866175567833, + "grad_norm": 0.25097009539604187, + "learning_rate": 6.450145017259225e-05, + "loss": 1.761, + "step": 13820 + }, + { + "epoch": 4.242173112338858, + "grad_norm": 0.22978942096233368, + "learning_rate": 6.449669318640315e-05, + "loss": 1.7891, + "step": 13821 + }, + { + "epoch": 4.242480049109884, + "grad_norm": 0.27255937457084656, + "learning_rate": 6.449193605694923e-05, + "loss": 1.7964, + "step": 13822 + }, + { + "epoch": 4.242786985880908, + "grad_norm": 0.2210773378610611, + "learning_rate": 6.44871787842775e-05, + "loss": 1.7628, + "step": 13823 + }, + { + "epoch": 4.2430939226519335, + "grad_norm": 0.25784751772880554, + "learning_rate": 6.448242136843497e-05, + "loss": 1.7596, + "step": 13824 + }, + { + "epoch": 4.243400859422959, + "grad_norm": 0.23475486040115356, + "learning_rate": 6.447766380946868e-05, + "loss": 1.8174, + "step": 13825 + }, + { + "epoch": 4.243707796193984, + "grad_norm": 0.2567705512046814, + "learning_rate": 6.447290610742561e-05, + "loss": 1.737, + "step": 13826 + }, + { + "epoch": 4.2440147329650095, + "grad_norm": 0.23973144590854645, + "learning_rate": 6.446814826235281e-05, + "loss": 1.7881, + "step": 13827 + }, + { + "epoch": 4.244321669736034, + "grad_norm": 0.25584739446640015, + "learning_rate": 6.446339027429729e-05, + "loss": 1.7673, + "step": 13828 + }, + { + "epoch": 4.244628606507059, + "grad_norm": 0.2653748393058777, + "learning_rate": 6.445863214330608e-05, + "loss": 1.7443, + "step": 13829 + }, + { + "epoch": 4.244935543278085, + "grad_norm": 0.2492038607597351, + "learning_rate": 6.445387386942619e-05, + "loss": 1.7223, + "step": 13830 + }, + { + "epoch": 4.24524248004911, + "grad_norm": 0.2282228320837021, + "learning_rate": 6.444911545270464e-05, + "loss": 1.7577, + "step": 13831 + }, + { + "epoch": 4.245549416820135, + "grad_norm": 0.2411092072725296, + "learning_rate": 6.444435689318845e-05, + "loss": 1.7324, + 
"step": 13832 + }, + { + "epoch": 4.245856353591161, + "grad_norm": 0.21557089686393738, + "learning_rate": 6.443959819092468e-05, + "loss": 1.7355, + "step": 13833 + }, + { + "epoch": 4.246163290362185, + "grad_norm": 0.2500394880771637, + "learning_rate": 6.443483934596033e-05, + "loss": 1.775, + "step": 13834 + }, + { + "epoch": 4.24647022713321, + "grad_norm": 0.24135248363018036, + "learning_rate": 6.443008035834244e-05, + "loss": 1.7885, + "step": 13835 + }, + { + "epoch": 4.246777163904236, + "grad_norm": 0.22860904037952423, + "learning_rate": 6.442532122811803e-05, + "loss": 1.7891, + "step": 13836 + }, + { + "epoch": 4.247084100675261, + "grad_norm": 0.2277665138244629, + "learning_rate": 6.442056195533415e-05, + "loss": 1.7583, + "step": 13837 + }, + { + "epoch": 4.247391037446286, + "grad_norm": 0.22822454571723938, + "learning_rate": 6.441580254003782e-05, + "loss": 1.7777, + "step": 13838 + }, + { + "epoch": 4.247697974217311, + "grad_norm": 0.24274896085262299, + "learning_rate": 6.441104298227608e-05, + "loss": 1.7537, + "step": 13839 + }, + { + "epoch": 4.248004910988336, + "grad_norm": 0.25080999732017517, + "learning_rate": 6.440628328209598e-05, + "loss": 1.7537, + "step": 13840 + }, + { + "epoch": 4.2483118477593615, + "grad_norm": 0.22409579157829285, + "learning_rate": 6.440152343954453e-05, + "loss": 1.7652, + "step": 13841 + }, + { + "epoch": 4.248618784530387, + "grad_norm": 0.24028798937797546, + "learning_rate": 6.439676345466877e-05, + "loss": 1.7512, + "step": 13842 + }, + { + "epoch": 4.248925721301412, + "grad_norm": 0.28739503026008606, + "learning_rate": 6.439200332751576e-05, + "loss": 1.8034, + "step": 13843 + }, + { + "epoch": 4.249232658072437, + "grad_norm": 0.2244807928800583, + "learning_rate": 6.438724305813255e-05, + "loss": 1.7243, + "step": 13844 + }, + { + "epoch": 4.249539594843462, + "grad_norm": 0.24478118121623993, + "learning_rate": 6.438248264656618e-05, + "loss": 1.7754, + "step": 13845 + }, + { + "epoch": 
4.249846531614487, + "grad_norm": 0.25554370880126953, + "learning_rate": 6.437772209286368e-05, + "loss": 1.7845, + "step": 13846 + }, + { + "epoch": 4.250153468385513, + "grad_norm": 0.24478472769260406, + "learning_rate": 6.43729613970721e-05, + "loss": 1.7954, + "step": 13847 + }, + { + "epoch": 4.250460405156538, + "grad_norm": 0.22287282347679138, + "learning_rate": 6.436820055923849e-05, + "loss": 1.7379, + "step": 13848 + }, + { + "epoch": 4.250767341927563, + "grad_norm": 0.2810569703578949, + "learning_rate": 6.43634395794099e-05, + "loss": 1.8492, + "step": 13849 + }, + { + "epoch": 4.251074278698588, + "grad_norm": 0.2544163465499878, + "learning_rate": 6.435867845763337e-05, + "loss": 1.7846, + "step": 13850 + }, + { + "epoch": 4.251381215469613, + "grad_norm": 0.27879175543785095, + "learning_rate": 6.435391719395598e-05, + "loss": 1.767, + "step": 13851 + }, + { + "epoch": 4.2516881522406385, + "grad_norm": 0.2876715362071991, + "learning_rate": 6.434915578842477e-05, + "loss": 1.8048, + "step": 13852 + }, + { + "epoch": 4.251995089011664, + "grad_norm": 0.27844297885894775, + "learning_rate": 6.434439424108678e-05, + "loss": 1.7472, + "step": 13853 + }, + { + "epoch": 4.252302025782689, + "grad_norm": 0.2417020946741104, + "learning_rate": 6.43396325519891e-05, + "loss": 1.8481, + "step": 13854 + }, + { + "epoch": 4.252608962553714, + "grad_norm": 0.23828522861003876, + "learning_rate": 6.433487072117874e-05, + "loss": 1.7536, + "step": 13855 + }, + { + "epoch": 4.252915899324739, + "grad_norm": 0.22304333746433258, + "learning_rate": 6.43301087487028e-05, + "loss": 1.741, + "step": 13856 + }, + { + "epoch": 4.253222836095764, + "grad_norm": 0.27089163661003113, + "learning_rate": 6.432534663460832e-05, + "loss": 1.7974, + "step": 13857 + }, + { + "epoch": 4.25352977286679, + "grad_norm": 0.2439592182636261, + "learning_rate": 6.432058437894237e-05, + "loss": 1.7713, + "step": 13858 + }, + { + "epoch": 4.253836709637815, + "grad_norm": 
0.2368553727865219, + "learning_rate": 6.431582198175203e-05, + "loss": 1.6915, + "step": 13859 + }, + { + "epoch": 4.25414364640884, + "grad_norm": 0.25248441100120544, + "learning_rate": 6.431105944308431e-05, + "loss": 1.7286, + "step": 13860 + }, + { + "epoch": 4.254450583179865, + "grad_norm": 0.20928484201431274, + "learning_rate": 6.430629676298634e-05, + "loss": 1.79, + "step": 13861 + }, + { + "epoch": 4.25475751995089, + "grad_norm": 0.25262540578842163, + "learning_rate": 6.430153394150514e-05, + "loss": 1.7443, + "step": 13862 + }, + { + "epoch": 4.255064456721915, + "grad_norm": 0.27508237957954407, + "learning_rate": 6.429677097868783e-05, + "loss": 1.8207, + "step": 13863 + }, + { + "epoch": 4.255371393492941, + "grad_norm": 0.28129303455352783, + "learning_rate": 6.429200787458141e-05, + "loss": 1.7589, + "step": 13864 + }, + { + "epoch": 4.255678330263966, + "grad_norm": 0.3205658495426178, + "learning_rate": 6.428724462923302e-05, + "loss": 1.8037, + "step": 13865 + }, + { + "epoch": 4.2559852670349905, + "grad_norm": 0.24048078060150146, + "learning_rate": 6.428248124268969e-05, + "loss": 1.7303, + "step": 13866 + }, + { + "epoch": 4.256292203806016, + "grad_norm": 0.24742475152015686, + "learning_rate": 6.427771771499852e-05, + "loss": 1.7753, + "step": 13867 + }, + { + "epoch": 4.256599140577041, + "grad_norm": 0.3082354962825775, + "learning_rate": 6.427295404620656e-05, + "loss": 1.7275, + "step": 13868 + }, + { + "epoch": 4.2569060773480665, + "grad_norm": 0.23319822549819946, + "learning_rate": 6.426819023636093e-05, + "loss": 1.7562, + "step": 13869 + }, + { + "epoch": 4.257213014119092, + "grad_norm": 0.2611405551433563, + "learning_rate": 6.426342628550866e-05, + "loss": 1.7417, + "step": 13870 + }, + { + "epoch": 4.257519950890116, + "grad_norm": 0.2577543258666992, + "learning_rate": 6.425866219369686e-05, + "loss": 1.6906, + "step": 13871 + }, + { + "epoch": 4.257826887661142, + "grad_norm": 0.31353357434272766, + "learning_rate": 
6.42538979609726e-05, + "loss": 1.7155, + "step": 13872 + }, + { + "epoch": 4.258133824432167, + "grad_norm": 0.23280073702335358, + "learning_rate": 6.424913358738296e-05, + "loss": 1.7576, + "step": 13873 + }, + { + "epoch": 4.258440761203192, + "grad_norm": 0.24087542295455933, + "learning_rate": 6.424436907297504e-05, + "loss": 1.7622, + "step": 13874 + }, + { + "epoch": 4.258747697974218, + "grad_norm": 0.3146509826183319, + "learning_rate": 6.42396044177959e-05, + "loss": 1.769, + "step": 13875 + }, + { + "epoch": 4.259054634745242, + "grad_norm": 0.2645811438560486, + "learning_rate": 6.423483962189268e-05, + "loss": 1.7713, + "step": 13876 + }, + { + "epoch": 4.259361571516267, + "grad_norm": 0.2166455090045929, + "learning_rate": 6.423007468531238e-05, + "loss": 1.7705, + "step": 13877 + }, + { + "epoch": 4.259668508287293, + "grad_norm": 0.29142528772354126, + "learning_rate": 6.422530960810217e-05, + "loss": 1.7725, + "step": 13878 + }, + { + "epoch": 4.259975445058318, + "grad_norm": 0.28777652978897095, + "learning_rate": 6.422054439030911e-05, + "loss": 1.7853, + "step": 13879 + }, + { + "epoch": 4.260282381829343, + "grad_norm": 0.2285117357969284, + "learning_rate": 6.42157790319803e-05, + "loss": 1.7034, + "step": 13880 + }, + { + "epoch": 4.260589318600369, + "grad_norm": 0.32407644391059875, + "learning_rate": 6.421101353316282e-05, + "loss": 1.7858, + "step": 13881 + }, + { + "epoch": 4.260896255371393, + "grad_norm": 0.4803469777107239, + "learning_rate": 6.420624789390378e-05, + "loss": 1.7337, + "step": 13882 + }, + { + "epoch": 4.2612031921424185, + "grad_norm": 0.4245823919773102, + "learning_rate": 6.420148211425027e-05, + "loss": 1.8024, + "step": 13883 + }, + { + "epoch": 4.261510128913444, + "grad_norm": 0.22298674285411835, + "learning_rate": 6.419671619424938e-05, + "loss": 1.7129, + "step": 13884 + }, + { + "epoch": 4.261817065684469, + "grad_norm": 0.46955862641334534, + "learning_rate": 6.419195013394824e-05, + "loss": 1.7151, + 
"step": 13885 + }, + { + "epoch": 4.2621240024554945, + "grad_norm": 0.4809224009513855, + "learning_rate": 6.418718393339392e-05, + "loss": 1.7697, + "step": 13886 + }, + { + "epoch": 4.262430939226519, + "grad_norm": 0.2741130292415619, + "learning_rate": 6.418241759263353e-05, + "loss": 1.8133, + "step": 13887 + }, + { + "epoch": 4.262737875997544, + "grad_norm": 0.3673117756843567, + "learning_rate": 6.417765111171419e-05, + "loss": 1.7424, + "step": 13888 + }, + { + "epoch": 4.26304481276857, + "grad_norm": 0.4609327018260956, + "learning_rate": 6.417288449068299e-05, + "loss": 1.741, + "step": 13889 + }, + { + "epoch": 4.263351749539595, + "grad_norm": 0.2929460406303406, + "learning_rate": 6.416811772958702e-05, + "loss": 1.8385, + "step": 13890 + }, + { + "epoch": 4.26365868631062, + "grad_norm": 0.2727305293083191, + "learning_rate": 6.416335082847342e-05, + "loss": 1.794, + "step": 13891 + }, + { + "epoch": 4.263965623081646, + "grad_norm": 0.26089411973953247, + "learning_rate": 6.41585837873893e-05, + "loss": 1.7907, + "step": 13892 + }, + { + "epoch": 4.26427255985267, + "grad_norm": 0.24655573070049286, + "learning_rate": 6.415381660638174e-05, + "loss": 1.7481, + "step": 13893 + }, + { + "epoch": 4.264579496623695, + "grad_norm": 0.4186919629573822, + "learning_rate": 6.414904928549787e-05, + "loss": 1.8021, + "step": 13894 + }, + { + "epoch": 4.264886433394721, + "grad_norm": 0.38188236951828003, + "learning_rate": 6.414428182478478e-05, + "loss": 1.75, + "step": 13895 + }, + { + "epoch": 4.265193370165746, + "grad_norm": 0.23686440289020538, + "learning_rate": 6.413951422428963e-05, + "loss": 1.7882, + "step": 13896 + }, + { + "epoch": 4.265500306936771, + "grad_norm": 0.35963737964630127, + "learning_rate": 6.413474648405952e-05, + "loss": 1.7427, + "step": 13897 + }, + { + "epoch": 4.265807243707796, + "grad_norm": 0.38558289408683777, + "learning_rate": 6.412997860414155e-05, + "loss": 1.7622, + "step": 13898 + }, + { + "epoch": 
4.266114180478821, + "grad_norm": 0.2311459481716156, + "learning_rate": 6.412521058458285e-05, + "loss": 1.7894, + "step": 13899 + }, + { + "epoch": 4.2664211172498465, + "grad_norm": 0.2647818624973297, + "learning_rate": 6.412044242543054e-05, + "loss": 1.7399, + "step": 13900 + }, + { + "epoch": 4.266728054020872, + "grad_norm": 0.3174133002758026, + "learning_rate": 6.411567412673174e-05, + "loss": 1.7552, + "step": 13901 + }, + { + "epoch": 4.267034990791897, + "grad_norm": 0.25207316875457764, + "learning_rate": 6.411090568853358e-05, + "loss": 1.7876, + "step": 13902 + }, + { + "epoch": 4.267341927562922, + "grad_norm": 0.24549202620983124, + "learning_rate": 6.410613711088317e-05, + "loss": 1.8554, + "step": 13903 + }, + { + "epoch": 4.267648864333947, + "grad_norm": 0.26293641328811646, + "learning_rate": 6.410136839382765e-05, + "loss": 1.8553, + "step": 13904 + }, + { + "epoch": 4.267955801104972, + "grad_norm": 0.20258362591266632, + "learning_rate": 6.409659953741416e-05, + "loss": 1.7205, + "step": 13905 + }, + { + "epoch": 4.268262737875998, + "grad_norm": 0.24885907769203186, + "learning_rate": 6.409183054168979e-05, + "loss": 1.7718, + "step": 13906 + }, + { + "epoch": 4.268569674647023, + "grad_norm": 0.22737209498882294, + "learning_rate": 6.408706140670169e-05, + "loss": 1.7228, + "step": 13907 + }, + { + "epoch": 4.268876611418047, + "grad_norm": 0.2201235145330429, + "learning_rate": 6.4082292132497e-05, + "loss": 1.7451, + "step": 13908 + }, + { + "epoch": 4.269183548189073, + "grad_norm": 0.24108454585075378, + "learning_rate": 6.407752271912285e-05, + "loss": 1.7531, + "step": 13909 + }, + { + "epoch": 4.269490484960098, + "grad_norm": 0.21723641455173492, + "learning_rate": 6.407275316662636e-05, + "loss": 1.7139, + "step": 13910 + }, + { + "epoch": 4.269797421731123, + "grad_norm": 0.22557848691940308, + "learning_rate": 6.406798347505469e-05, + "loss": 1.7633, + "step": 13911 + }, + { + "epoch": 4.270104358502149, + "grad_norm": 
0.24664700031280518, + "learning_rate": 6.406321364445494e-05, + "loss": 1.7854, + "step": 13912 + }, + { + "epoch": 4.270411295273174, + "grad_norm": 0.2599056661128998, + "learning_rate": 6.405844367487428e-05, + "loss": 1.7662, + "step": 13913 + }, + { + "epoch": 4.2707182320441985, + "grad_norm": 0.2378663718700409, + "learning_rate": 6.405367356635982e-05, + "loss": 1.7477, + "step": 13914 + }, + { + "epoch": 4.271025168815224, + "grad_norm": 0.27158626914024353, + "learning_rate": 6.404890331895876e-05, + "loss": 1.7426, + "step": 13915 + }, + { + "epoch": 4.271332105586249, + "grad_norm": 0.28585317730903625, + "learning_rate": 6.404413293271818e-05, + "loss": 1.7492, + "step": 13916 + }, + { + "epoch": 4.2716390423572745, + "grad_norm": 0.2321750968694687, + "learning_rate": 6.403936240768526e-05, + "loss": 1.8594, + "step": 13917 + }, + { + "epoch": 4.2719459791283, + "grad_norm": 0.25824111700057983, + "learning_rate": 6.40345917439071e-05, + "loss": 1.7622, + "step": 13918 + }, + { + "epoch": 4.272252915899324, + "grad_norm": 0.24641194939613342, + "learning_rate": 6.40298209414309e-05, + "loss": 1.7519, + "step": 13919 + }, + { + "epoch": 4.27255985267035, + "grad_norm": 0.2132398933172226, + "learning_rate": 6.40250500003038e-05, + "loss": 1.7339, + "step": 13920 + }, + { + "epoch": 4.272866789441375, + "grad_norm": 0.22630736231803894, + "learning_rate": 6.402027892057292e-05, + "loss": 1.7396, + "step": 13921 + }, + { + "epoch": 4.2731737262124, + "grad_norm": 0.295163631439209, + "learning_rate": 6.401550770228543e-05, + "loss": 1.8063, + "step": 13922 + }, + { + "epoch": 4.273480662983426, + "grad_norm": 0.2722746729850769, + "learning_rate": 6.401073634548848e-05, + "loss": 1.7775, + "step": 13923 + }, + { + "epoch": 4.273787599754451, + "grad_norm": 0.23201976716518402, + "learning_rate": 6.400596485022922e-05, + "loss": 1.7755, + "step": 13924 + }, + { + "epoch": 4.274094536525475, + "grad_norm": 0.23880761861801147, + "learning_rate": 
6.40011932165548e-05, + "loss": 1.778, + "step": 13925 + }, + { + "epoch": 4.274401473296501, + "grad_norm": 0.22305625677108765, + "learning_rate": 6.399642144451239e-05, + "loss": 1.761, + "step": 13926 + }, + { + "epoch": 4.274708410067526, + "grad_norm": 0.21874886751174927, + "learning_rate": 6.399164953414914e-05, + "loss": 1.7148, + "step": 13927 + }, + { + "epoch": 4.2750153468385514, + "grad_norm": 0.2003604918718338, + "learning_rate": 6.398687748551221e-05, + "loss": 1.8049, + "step": 13928 + }, + { + "epoch": 4.275322283609577, + "grad_norm": 0.2443511188030243, + "learning_rate": 6.398210529864875e-05, + "loss": 1.782, + "step": 13929 + }, + { + "epoch": 4.275629220380601, + "grad_norm": 0.2297198623418808, + "learning_rate": 6.397733297360594e-05, + "loss": 1.7682, + "step": 13930 + }, + { + "epoch": 4.275936157151627, + "grad_norm": 0.23474562168121338, + "learning_rate": 6.39725605104309e-05, + "loss": 1.7809, + "step": 13931 + }, + { + "epoch": 4.276243093922652, + "grad_norm": 0.25908544659614563, + "learning_rate": 6.396778790917087e-05, + "loss": 1.7343, + "step": 13932 + }, + { + "epoch": 4.276550030693677, + "grad_norm": 0.2440379112958908, + "learning_rate": 6.396301516987295e-05, + "loss": 1.786, + "step": 13933 + }, + { + "epoch": 4.276856967464703, + "grad_norm": 0.26185858249664307, + "learning_rate": 6.395824229258435e-05, + "loss": 1.7863, + "step": 13934 + }, + { + "epoch": 4.277163904235728, + "grad_norm": 0.24470919370651245, + "learning_rate": 6.39534692773522e-05, + "loss": 1.7774, + "step": 13935 + }, + { + "epoch": 4.277470841006752, + "grad_norm": 0.2612632215023041, + "learning_rate": 6.39486961242237e-05, + "loss": 1.7536, + "step": 13936 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.26870301365852356, + "learning_rate": 6.3943922833246e-05, + "loss": 1.8177, + "step": 13937 + }, + { + "epoch": 4.278084714548803, + "grad_norm": 0.24445784091949463, + "learning_rate": 6.393914940446628e-05, + "loss": 1.7539, + "step": 
13938 + }, + { + "epoch": 4.278391651319828, + "grad_norm": 0.2622319757938385, + "learning_rate": 6.393437583793174e-05, + "loss": 1.8252, + "step": 13939 + }, + { + "epoch": 4.278698588090854, + "grad_norm": 0.2586652636528015, + "learning_rate": 6.39296021336895e-05, + "loss": 1.7975, + "step": 13940 + }, + { + "epoch": 4.279005524861878, + "grad_norm": 0.19488228857517242, + "learning_rate": 6.392482829178678e-05, + "loss": 1.7678, + "step": 13941 + }, + { + "epoch": 4.2793124616329035, + "grad_norm": 0.23956604301929474, + "learning_rate": 6.392005431227074e-05, + "loss": 1.7444, + "step": 13942 + }, + { + "epoch": 4.279619398403929, + "grad_norm": 0.24195842444896698, + "learning_rate": 6.391528019518857e-05, + "loss": 1.8116, + "step": 13943 + }, + { + "epoch": 4.279926335174954, + "grad_norm": 0.21479523181915283, + "learning_rate": 6.391050594058746e-05, + "loss": 1.7351, + "step": 13944 + }, + { + "epoch": 4.2802332719459795, + "grad_norm": 0.2309941202402115, + "learning_rate": 6.390573154851456e-05, + "loss": 1.8245, + "step": 13945 + }, + { + "epoch": 4.280540208717004, + "grad_norm": 0.2375536412000656, + "learning_rate": 6.390095701901706e-05, + "loss": 1.7921, + "step": 13946 + }, + { + "epoch": 4.280847145488029, + "grad_norm": 0.25518664717674255, + "learning_rate": 6.389618235214216e-05, + "loss": 1.7549, + "step": 13947 + }, + { + "epoch": 4.281154082259055, + "grad_norm": 0.2579016089439392, + "learning_rate": 6.389140754793705e-05, + "loss": 1.7637, + "step": 13948 + }, + { + "epoch": 4.28146101903008, + "grad_norm": 0.25350916385650635, + "learning_rate": 6.388663260644892e-05, + "loss": 1.746, + "step": 13949 + }, + { + "epoch": 4.281767955801105, + "grad_norm": 0.2994026839733124, + "learning_rate": 6.388185752772493e-05, + "loss": 1.8196, + "step": 13950 + }, + { + "epoch": 4.28207489257213, + "grad_norm": 0.29938533902168274, + "learning_rate": 6.387708231181229e-05, + "loss": 1.7187, + "step": 13951 + }, + { + "epoch": 4.282381829343155, 
+ "grad_norm": 0.23865137994289398, + "learning_rate": 6.387230695875819e-05, + "loss": 1.7317, + "step": 13952 + }, + { + "epoch": 4.28268876611418, + "grad_norm": 0.23812857270240784, + "learning_rate": 6.386753146860982e-05, + "loss": 1.7536, + "step": 13953 + }, + { + "epoch": 4.282995702885206, + "grad_norm": 0.3395650088787079, + "learning_rate": 6.386275584141438e-05, + "loss": 1.7932, + "step": 13954 + }, + { + "epoch": 4.283302639656231, + "grad_norm": 0.38207507133483887, + "learning_rate": 6.385798007721906e-05, + "loss": 1.8196, + "step": 13955 + }, + { + "epoch": 4.283609576427256, + "grad_norm": 0.32960978150367737, + "learning_rate": 6.385320417607107e-05, + "loss": 1.7898, + "step": 13956 + }, + { + "epoch": 4.283916513198281, + "grad_norm": 0.22978928685188293, + "learning_rate": 6.384842813801757e-05, + "loss": 1.7835, + "step": 13957 + }, + { + "epoch": 4.284223449969306, + "grad_norm": 0.24607588350772858, + "learning_rate": 6.38436519631058e-05, + "loss": 1.7829, + "step": 13958 + }, + { + "epoch": 4.2845303867403315, + "grad_norm": 0.2770270109176636, + "learning_rate": 6.383887565138295e-05, + "loss": 1.7294, + "step": 13959 + }, + { + "epoch": 4.284837323511357, + "grad_norm": 0.27644863724708557, + "learning_rate": 6.383409920289622e-05, + "loss": 1.829, + "step": 13960 + }, + { + "epoch": 4.285144260282382, + "grad_norm": 0.3870919942855835, + "learning_rate": 6.382932261769282e-05, + "loss": 1.8146, + "step": 13961 + }, + { + "epoch": 4.285451197053407, + "grad_norm": 0.3562348186969757, + "learning_rate": 6.382454589581994e-05, + "loss": 1.8225, + "step": 13962 + }, + { + "epoch": 4.285758133824432, + "grad_norm": 0.28444886207580566, + "learning_rate": 6.38197690373248e-05, + "loss": 1.7734, + "step": 13963 + }, + { + "epoch": 4.286065070595457, + "grad_norm": 0.27935758233070374, + "learning_rate": 6.381499204225459e-05, + "loss": 1.7402, + "step": 13964 + }, + { + "epoch": 4.286372007366483, + "grad_norm": 0.34188997745513916, + 
"learning_rate": 6.381021491065653e-05, + "loss": 1.7661, + "step": 13965 + }, + { + "epoch": 4.286678944137508, + "grad_norm": 0.28648918867111206, + "learning_rate": 6.380543764257785e-05, + "loss": 1.8312, + "step": 13966 + }, + { + "epoch": 4.286985880908533, + "grad_norm": 0.2733290493488312, + "learning_rate": 6.380066023806572e-05, + "loss": 1.7505, + "step": 13967 + }, + { + "epoch": 4.287292817679558, + "grad_norm": 0.3344273865222931, + "learning_rate": 6.37958826971674e-05, + "loss": 1.8392, + "step": 13968 + }, + { + "epoch": 4.287599754450583, + "grad_norm": 0.2655799090862274, + "learning_rate": 6.379110501993006e-05, + "loss": 1.7575, + "step": 13969 + }, + { + "epoch": 4.287906691221608, + "grad_norm": 0.2569151818752289, + "learning_rate": 6.378632720640095e-05, + "loss": 1.6619, + "step": 13970 + }, + { + "epoch": 4.288213627992634, + "grad_norm": 0.2477198988199234, + "learning_rate": 6.378154925662727e-05, + "loss": 1.7532, + "step": 13971 + }, + { + "epoch": 4.288520564763659, + "grad_norm": 0.2867630422115326, + "learning_rate": 6.377677117065624e-05, + "loss": 1.7725, + "step": 13972 + }, + { + "epoch": 4.2888275015346835, + "grad_norm": 0.28316137194633484, + "learning_rate": 6.37719929485351e-05, + "loss": 1.7628, + "step": 13973 + }, + { + "epoch": 4.289134438305709, + "grad_norm": 0.2934304475784302, + "learning_rate": 6.376721459031106e-05, + "loss": 1.7346, + "step": 13974 + }, + { + "epoch": 4.289441375076734, + "grad_norm": 0.22847147285938263, + "learning_rate": 6.376243609603129e-05, + "loss": 1.7409, + "step": 13975 + }, + { + "epoch": 4.2897483118477595, + "grad_norm": 0.360441118478775, + "learning_rate": 6.375765746574311e-05, + "loss": 1.808, + "step": 13976 + }, + { + "epoch": 4.290055248618785, + "grad_norm": 0.2750907242298126, + "learning_rate": 6.375287869949367e-05, + "loss": 1.8046, + "step": 13977 + }, + { + "epoch": 4.290362185389809, + "grad_norm": 0.26193201541900635, + "learning_rate": 6.374809979733022e-05, + 
"loss": 1.7097, + "step": 13978 + }, + { + "epoch": 4.290669122160835, + "grad_norm": 0.3282175064086914, + "learning_rate": 6.37433207593e-05, + "loss": 1.7924, + "step": 13979 + }, + { + "epoch": 4.29097605893186, + "grad_norm": 0.2845167815685272, + "learning_rate": 6.373854158545021e-05, + "loss": 1.7663, + "step": 13980 + }, + { + "epoch": 4.291282995702885, + "grad_norm": 0.21816621720790863, + "learning_rate": 6.37337622758281e-05, + "loss": 1.7368, + "step": 13981 + }, + { + "epoch": 4.291589932473911, + "grad_norm": 0.264272540807724, + "learning_rate": 6.372898283048094e-05, + "loss": 1.7377, + "step": 13982 + }, + { + "epoch": 4.291896869244935, + "grad_norm": 0.2182006686925888, + "learning_rate": 6.37242032494559e-05, + "loss": 1.8107, + "step": 13983 + }, + { + "epoch": 4.29220380601596, + "grad_norm": 0.26856422424316406, + "learning_rate": 6.371942353280023e-05, + "loss": 1.7708, + "step": 13984 + }, + { + "epoch": 4.292510742786986, + "grad_norm": 0.3025323748588562, + "learning_rate": 6.37146436805612e-05, + "loss": 1.7768, + "step": 13985 + }, + { + "epoch": 4.292817679558011, + "grad_norm": 0.2949144244194031, + "learning_rate": 6.3709863692786e-05, + "loss": 1.7848, + "step": 13986 + }, + { + "epoch": 4.293124616329036, + "grad_norm": 0.20670418441295624, + "learning_rate": 6.370508356952188e-05, + "loss": 1.7367, + "step": 13987 + }, + { + "epoch": 4.293431553100062, + "grad_norm": 0.2453860342502594, + "learning_rate": 6.370030331081611e-05, + "loss": 1.7246, + "step": 13988 + }, + { + "epoch": 4.293738489871086, + "grad_norm": 0.3413507044315338, + "learning_rate": 6.369552291671592e-05, + "loss": 1.7829, + "step": 13989 + }, + { + "epoch": 4.2940454266421115, + "grad_norm": 0.28352782130241394, + "learning_rate": 6.369074238726856e-05, + "loss": 1.7755, + "step": 13990 + }, + { + "epoch": 4.294352363413137, + "grad_norm": 0.21408751606941223, + "learning_rate": 6.368596172252124e-05, + "loss": 1.7292, + "step": 13991 + }, + { + "epoch": 
4.294659300184162, + "grad_norm": 0.28372085094451904, + "learning_rate": 6.36811809225212e-05, + "loss": 1.8197, + "step": 13992 + }, + { + "epoch": 4.2949662369551875, + "grad_norm": 0.2400829792022705, + "learning_rate": 6.367639998731573e-05, + "loss": 1.7559, + "step": 13993 + }, + { + "epoch": 4.295273173726212, + "grad_norm": 0.22853593528270721, + "learning_rate": 6.367161891695207e-05, + "loss": 1.8116, + "step": 13994 + }, + { + "epoch": 4.295580110497237, + "grad_norm": 0.22098208963871002, + "learning_rate": 6.366683771147745e-05, + "loss": 1.7269, + "step": 13995 + }, + { + "epoch": 4.295887047268263, + "grad_norm": 0.22293934226036072, + "learning_rate": 6.366205637093914e-05, + "loss": 1.7944, + "step": 13996 + }, + { + "epoch": 4.296193984039288, + "grad_norm": 0.26120004057884216, + "learning_rate": 6.365727489538437e-05, + "loss": 1.7581, + "step": 13997 + }, + { + "epoch": 4.296500920810313, + "grad_norm": 0.2568937838077545, + "learning_rate": 6.365249328486041e-05, + "loss": 1.7356, + "step": 13998 + }, + { + "epoch": 4.296807857581339, + "grad_norm": 0.2419043630361557, + "learning_rate": 6.364771153941449e-05, + "loss": 1.8127, + "step": 13999 + }, + { + "epoch": 4.297114794352363, + "grad_norm": 0.2521972060203552, + "learning_rate": 6.364292965909391e-05, + "loss": 1.7445, + "step": 14000 + }, + { + "epoch": 4.297421731123388, + "grad_norm": 0.3269292414188385, + "learning_rate": 6.363814764394589e-05, + "loss": 1.7835, + "step": 14001 + }, + { + "epoch": 4.297728667894414, + "grad_norm": 0.258405864238739, + "learning_rate": 6.36333654940177e-05, + "loss": 1.7407, + "step": 14002 + }, + { + "epoch": 4.298035604665439, + "grad_norm": 0.21527236700057983, + "learning_rate": 6.362858320935662e-05, + "loss": 1.7729, + "step": 14003 + }, + { + "epoch": 4.298342541436464, + "grad_norm": 0.25343602895736694, + "learning_rate": 6.362380079000988e-05, + "loss": 1.8087, + "step": 14004 + }, + { + "epoch": 4.298649478207489, + "grad_norm": 
0.26110637187957764, + "learning_rate": 6.361901823602474e-05, + "loss": 1.813, + "step": 14005 + }, + { + "epoch": 4.298956414978514, + "grad_norm": 0.26749926805496216, + "learning_rate": 6.361423554744851e-05, + "loss": 1.8193, + "step": 14006 + }, + { + "epoch": 4.2992633517495396, + "grad_norm": 0.22357676923274994, + "learning_rate": 6.360945272432841e-05, + "loss": 1.7498, + "step": 14007 + }, + { + "epoch": 4.299570288520565, + "grad_norm": 0.2367832362651825, + "learning_rate": 6.360466976671172e-05, + "loss": 1.7843, + "step": 14008 + }, + { + "epoch": 4.29987722529159, + "grad_norm": 0.23594366014003754, + "learning_rate": 6.35998866746457e-05, + "loss": 1.7442, + "step": 14009 + }, + { + "epoch": 4.300184162062616, + "grad_norm": 0.2660543918609619, + "learning_rate": 6.359510344817765e-05, + "loss": 1.7557, + "step": 14010 + }, + { + "epoch": 4.30049109883364, + "grad_norm": 0.191593199968338, + "learning_rate": 6.359032008735481e-05, + "loss": 1.7988, + "step": 14011 + }, + { + "epoch": 4.300798035604665, + "grad_norm": 0.2755490243434906, + "learning_rate": 6.358553659222447e-05, + "loss": 1.7551, + "step": 14012 + }, + { + "epoch": 4.301104972375691, + "grad_norm": 0.2900530993938446, + "learning_rate": 6.358075296283387e-05, + "loss": 1.7523, + "step": 14013 + }, + { + "epoch": 4.301411909146716, + "grad_norm": 0.22242774069309235, + "learning_rate": 6.357596919923033e-05, + "loss": 1.7626, + "step": 14014 + }, + { + "epoch": 4.301718845917741, + "grad_norm": 0.26636210083961487, + "learning_rate": 6.357118530146108e-05, + "loss": 1.7855, + "step": 14015 + }, + { + "epoch": 4.302025782688766, + "grad_norm": 0.3055269718170166, + "learning_rate": 6.356640126957344e-05, + "loss": 1.7528, + "step": 14016 + }, + { + "epoch": 4.302332719459791, + "grad_norm": 0.29695719480514526, + "learning_rate": 6.356161710361468e-05, + "loss": 1.7482, + "step": 14017 + }, + { + "epoch": 4.3026396562308165, + "grad_norm": 0.2369711697101593, + "learning_rate": 
6.355683280363207e-05, + "loss": 1.7635, + "step": 14018 + }, + { + "epoch": 4.302946593001842, + "grad_norm": 0.26681363582611084, + "learning_rate": 6.35520483696729e-05, + "loss": 1.8814, + "step": 14019 + }, + { + "epoch": 4.303253529772867, + "grad_norm": 0.2623308598995209, + "learning_rate": 6.354726380178442e-05, + "loss": 1.8645, + "step": 14020 + }, + { + "epoch": 4.303560466543892, + "grad_norm": 0.23326413333415985, + "learning_rate": 6.354247910001394e-05, + "loss": 1.8093, + "step": 14021 + }, + { + "epoch": 4.303867403314917, + "grad_norm": 0.3037295639514923, + "learning_rate": 6.353769426440875e-05, + "loss": 1.8556, + "step": 14022 + }, + { + "epoch": 4.304174340085942, + "grad_norm": 0.23624882102012634, + "learning_rate": 6.353290929501616e-05, + "loss": 1.803, + "step": 14023 + }, + { + "epoch": 4.304481276856968, + "grad_norm": 0.22106927633285522, + "learning_rate": 6.35281241918834e-05, + "loss": 1.7133, + "step": 14024 + }, + { + "epoch": 4.304788213627993, + "grad_norm": 0.2374040186405182, + "learning_rate": 6.352333895505778e-05, + "loss": 1.8127, + "step": 14025 + }, + { + "epoch": 4.305095150399017, + "grad_norm": 0.2782450318336487, + "learning_rate": 6.35185535845866e-05, + "loss": 1.8613, + "step": 14026 + }, + { + "epoch": 4.305402087170043, + "grad_norm": 0.2527763843536377, + "learning_rate": 6.351376808051717e-05, + "loss": 1.7533, + "step": 14027 + }, + { + "epoch": 4.305709023941068, + "grad_norm": 0.2462318390607834, + "learning_rate": 6.350898244289675e-05, + "loss": 1.8075, + "step": 14028 + }, + { + "epoch": 4.306015960712093, + "grad_norm": 0.2646189332008362, + "learning_rate": 6.350419667177265e-05, + "loss": 1.8261, + "step": 14029 + }, + { + "epoch": 4.306322897483119, + "grad_norm": 0.24918611347675323, + "learning_rate": 6.349941076719218e-05, + "loss": 1.7542, + "step": 14030 + }, + { + "epoch": 4.306629834254144, + "grad_norm": 0.22440841794013977, + "learning_rate": 6.349462472920259e-05, + "loss": 1.7897, + 
"step": 14031 + }, + { + "epoch": 4.3069367710251685, + "grad_norm": 0.28614330291748047, + "learning_rate": 6.348983855785121e-05, + "loss": 1.88, + "step": 14032 + }, + { + "epoch": 4.307243707796194, + "grad_norm": 0.25015848875045776, + "learning_rate": 6.348505225318535e-05, + "loss": 1.8008, + "step": 14033 + }, + { + "epoch": 4.307550644567219, + "grad_norm": 0.2468707263469696, + "learning_rate": 6.34802658152523e-05, + "loss": 1.8025, + "step": 14034 + }, + { + "epoch": 4.3078575813382445, + "grad_norm": 0.30504748225212097, + "learning_rate": 6.347547924409937e-05, + "loss": 1.8765, + "step": 14035 + }, + { + "epoch": 4.30816451810927, + "grad_norm": 0.35419392585754395, + "learning_rate": 6.347069253977385e-05, + "loss": 1.7807, + "step": 14036 + }, + { + "epoch": 4.308471454880294, + "grad_norm": 0.33683931827545166, + "learning_rate": 6.346590570232305e-05, + "loss": 1.7244, + "step": 14037 + }, + { + "epoch": 4.30877839165132, + "grad_norm": 0.3339467942714691, + "learning_rate": 6.346111873179427e-05, + "loss": 1.7642, + "step": 14038 + }, + { + "epoch": 4.309085328422345, + "grad_norm": 0.2369392216205597, + "learning_rate": 6.345633162823484e-05, + "loss": 1.7127, + "step": 14039 + }, + { + "epoch": 4.30939226519337, + "grad_norm": 0.26469686627388, + "learning_rate": 6.345154439169206e-05, + "loss": 1.7235, + "step": 14040 + }, + { + "epoch": 4.309699201964396, + "grad_norm": 0.2737344205379486, + "learning_rate": 6.344675702221321e-05, + "loss": 1.783, + "step": 14041 + }, + { + "epoch": 4.310006138735421, + "grad_norm": 0.2381773442029953, + "learning_rate": 6.344196951984565e-05, + "loss": 1.7172, + "step": 14042 + }, + { + "epoch": 4.310313075506445, + "grad_norm": 0.28199076652526855, + "learning_rate": 6.343718188463663e-05, + "loss": 1.8315, + "step": 14043 + }, + { + "epoch": 4.310620012277471, + "grad_norm": 0.24378590285778046, + "learning_rate": 6.343239411663353e-05, + "loss": 1.7828, + "step": 14044 + }, + { + "epoch": 
4.310926949048496, + "grad_norm": 0.26343944668769836, + "learning_rate": 6.342760621588365e-05, + "loss": 1.7679, + "step": 14045 + }, + { + "epoch": 4.311233885819521, + "grad_norm": 0.23703521490097046, + "learning_rate": 6.342281818243427e-05, + "loss": 1.7885, + "step": 14046 + }, + { + "epoch": 4.311540822590547, + "grad_norm": 0.2230173498392105, + "learning_rate": 6.341803001633276e-05, + "loss": 1.767, + "step": 14047 + }, + { + "epoch": 4.311847759361571, + "grad_norm": 0.249002143740654, + "learning_rate": 6.34132417176264e-05, + "loss": 1.8032, + "step": 14048 + }, + { + "epoch": 4.3121546961325965, + "grad_norm": 0.2383791208267212, + "learning_rate": 6.34084532863625e-05, + "loss": 1.7558, + "step": 14049 + }, + { + "epoch": 4.312461632903622, + "grad_norm": 0.2783047556877136, + "learning_rate": 6.340366472258843e-05, + "loss": 1.8389, + "step": 14050 + }, + { + "epoch": 4.312768569674647, + "grad_norm": 0.2654891312122345, + "learning_rate": 6.339887602635148e-05, + "loss": 1.7989, + "step": 14051 + }, + { + "epoch": 4.3130755064456725, + "grad_norm": 0.2638411521911621, + "learning_rate": 6.3394087197699e-05, + "loss": 1.8707, + "step": 14052 + }, + { + "epoch": 4.313382443216697, + "grad_norm": 0.3026179075241089, + "learning_rate": 6.338929823667829e-05, + "loss": 1.7892, + "step": 14053 + }, + { + "epoch": 4.313689379987722, + "grad_norm": 0.27496880292892456, + "learning_rate": 6.338450914333668e-05, + "loss": 1.7398, + "step": 14054 + }, + { + "epoch": 4.313996316758748, + "grad_norm": 0.2601073086261749, + "learning_rate": 6.337971991772151e-05, + "loss": 1.7646, + "step": 14055 + }, + { + "epoch": 4.314303253529773, + "grad_norm": 0.2061719298362732, + "learning_rate": 6.337493055988011e-05, + "loss": 1.7372, + "step": 14056 + }, + { + "epoch": 4.314610190300798, + "grad_norm": 0.23722340166568756, + "learning_rate": 6.337014106985981e-05, + "loss": 1.7457, + "step": 14057 + }, + { + "epoch": 4.314917127071823, + "grad_norm": 
0.2729428708553314, + "learning_rate": 6.336535144770793e-05, + "loss": 1.8423, + "step": 14058 + }, + { + "epoch": 4.315224063842848, + "grad_norm": 0.23520450294017792, + "learning_rate": 6.336056169347182e-05, + "loss": 1.8124, + "step": 14059 + }, + { + "epoch": 4.315531000613873, + "grad_norm": 0.25142738223075867, + "learning_rate": 6.33557718071988e-05, + "loss": 1.7285, + "step": 14060 + }, + { + "epoch": 4.315837937384899, + "grad_norm": 0.24833035469055176, + "learning_rate": 6.335098178893621e-05, + "loss": 1.766, + "step": 14061 + }, + { + "epoch": 4.316144874155924, + "grad_norm": 0.2406177669763565, + "learning_rate": 6.334619163873141e-05, + "loss": 1.8824, + "step": 14062 + }, + { + "epoch": 4.316451810926949, + "grad_norm": 0.23077574372291565, + "learning_rate": 6.334140135663172e-05, + "loss": 1.7589, + "step": 14063 + }, + { + "epoch": 4.316758747697974, + "grad_norm": 0.20476560294628143, + "learning_rate": 6.333661094268448e-05, + "loss": 1.7331, + "step": 14064 + }, + { + "epoch": 4.317065684468999, + "grad_norm": 0.207991823554039, + "learning_rate": 6.333182039693704e-05, + "loss": 1.6876, + "step": 14065 + }, + { + "epoch": 4.3173726212400245, + "grad_norm": 0.20813052356243134, + "learning_rate": 6.332702971943671e-05, + "loss": 1.775, + "step": 14066 + }, + { + "epoch": 4.31767955801105, + "grad_norm": 0.2470991462469101, + "learning_rate": 6.332223891023087e-05, + "loss": 1.7673, + "step": 14067 + }, + { + "epoch": 4.317986494782075, + "grad_norm": 0.23855723440647125, + "learning_rate": 6.331744796936687e-05, + "loss": 1.7842, + "step": 14068 + }, + { + "epoch": 4.3182934315531, + "grad_norm": 0.21852652728557587, + "learning_rate": 6.331265689689204e-05, + "loss": 1.7727, + "step": 14069 + }, + { + "epoch": 4.318600368324125, + "grad_norm": 0.284496545791626, + "learning_rate": 6.330786569285374e-05, + "loss": 1.8248, + "step": 14070 + }, + { + "epoch": 4.31890730509515, + "grad_norm": 0.21709981560707092, + "learning_rate": 
6.33030743572993e-05, + "loss": 1.7547, + "step": 14071 + }, + { + "epoch": 4.319214241866176, + "grad_norm": 0.24209457635879517, + "learning_rate": 6.329828289027608e-05, + "loss": 1.7695, + "step": 14072 + }, + { + "epoch": 4.319521178637201, + "grad_norm": 0.24869373440742493, + "learning_rate": 6.329349129183144e-05, + "loss": 1.8204, + "step": 14073 + }, + { + "epoch": 4.319828115408226, + "grad_norm": 0.21702703833580017, + "learning_rate": 6.328869956201274e-05, + "loss": 1.779, + "step": 14074 + }, + { + "epoch": 4.320135052179251, + "grad_norm": 0.22993850708007812, + "learning_rate": 6.328390770086731e-05, + "loss": 1.7935, + "step": 14075 + }, + { + "epoch": 4.320441988950276, + "grad_norm": 0.23491734266281128, + "learning_rate": 6.327911570844252e-05, + "loss": 1.7261, + "step": 14076 + }, + { + "epoch": 4.320748925721301, + "grad_norm": 0.2479303777217865, + "learning_rate": 6.327432358478571e-05, + "loss": 1.7683, + "step": 14077 + }, + { + "epoch": 4.321055862492327, + "grad_norm": 0.24261580407619476, + "learning_rate": 6.326953132994427e-05, + "loss": 1.7147, + "step": 14078 + }, + { + "epoch": 4.321362799263352, + "grad_norm": 0.24627646803855896, + "learning_rate": 6.326473894396553e-05, + "loss": 1.7976, + "step": 14079 + }, + { + "epoch": 4.3216697360343765, + "grad_norm": 0.269149512052536, + "learning_rate": 6.325994642689688e-05, + "loss": 1.7247, + "step": 14080 + }, + { + "epoch": 4.321976672805402, + "grad_norm": 0.4162158966064453, + "learning_rate": 6.325515377878566e-05, + "loss": 1.7485, + "step": 14081 + }, + { + "epoch": 4.322283609576427, + "grad_norm": 0.366459459066391, + "learning_rate": 6.325036099967925e-05, + "loss": 1.7286, + "step": 14082 + }, + { + "epoch": 4.3225905463474525, + "grad_norm": 0.2465270757675171, + "learning_rate": 6.324556808962499e-05, + "loss": 1.8097, + "step": 14083 + }, + { + "epoch": 4.322897483118478, + "grad_norm": 0.2911076843738556, + "learning_rate": 6.324077504867026e-05, + "loss": 1.7979, + 
"step": 14084 + }, + { + "epoch": 4.323204419889503, + "grad_norm": 0.33455169200897217, + "learning_rate": 6.323598187686245e-05, + "loss": 1.7988, + "step": 14085 + }, + { + "epoch": 4.323511356660528, + "grad_norm": 0.25020337104797363, + "learning_rate": 6.32311885742489e-05, + "loss": 1.7184, + "step": 14086 + }, + { + "epoch": 4.323818293431553, + "grad_norm": 0.23941513895988464, + "learning_rate": 6.322639514087699e-05, + "loss": 1.7672, + "step": 14087 + }, + { + "epoch": 4.324125230202578, + "grad_norm": 0.35258981585502625, + "learning_rate": 6.32216015767941e-05, + "loss": 1.7571, + "step": 14088 + }, + { + "epoch": 4.324432166973604, + "grad_norm": 0.2854993939399719, + "learning_rate": 6.321680788204758e-05, + "loss": 1.8096, + "step": 14089 + }, + { + "epoch": 4.324739103744629, + "grad_norm": 0.24422863125801086, + "learning_rate": 6.321201405668482e-05, + "loss": 1.778, + "step": 14090 + }, + { + "epoch": 4.3250460405156534, + "grad_norm": 0.36629122495651245, + "learning_rate": 6.320722010075321e-05, + "loss": 1.716, + "step": 14091 + }, + { + "epoch": 4.325352977286679, + "grad_norm": 0.37115517258644104, + "learning_rate": 6.32024260143001e-05, + "loss": 1.77, + "step": 14092 + }, + { + "epoch": 4.325659914057704, + "grad_norm": 0.21540327370166779, + "learning_rate": 6.319763179737288e-05, + "loss": 1.7529, + "step": 14093 + }, + { + "epoch": 4.3259668508287294, + "grad_norm": 0.2573898732662201, + "learning_rate": 6.319283745001892e-05, + "loss": 1.8101, + "step": 14094 + }, + { + "epoch": 4.326273787599755, + "grad_norm": 0.29481247067451477, + "learning_rate": 6.31880429722856e-05, + "loss": 1.7459, + "step": 14095 + }, + { + "epoch": 4.326580724370779, + "grad_norm": 0.23474647104740143, + "learning_rate": 6.318324836422031e-05, + "loss": 1.786, + "step": 14096 + }, + { + "epoch": 4.326887661141805, + "grad_norm": 0.2884673476219177, + "learning_rate": 6.317845362587045e-05, + "loss": 1.8123, + "step": 14097 + }, + { + "epoch": 
4.32719459791283, + "grad_norm": 0.39008447527885437, + "learning_rate": 6.317365875728338e-05, + "loss": 1.7729, + "step": 14098 + }, + { + "epoch": 4.327501534683855, + "grad_norm": 0.30568063259124756, + "learning_rate": 6.316886375850651e-05, + "loss": 1.7088, + "step": 14099 + }, + { + "epoch": 4.327808471454881, + "grad_norm": 0.2538018524646759, + "learning_rate": 6.316406862958718e-05, + "loss": 1.8028, + "step": 14100 + }, + { + "epoch": 4.328115408225905, + "grad_norm": 0.3815068006515503, + "learning_rate": 6.315927337057281e-05, + "loss": 1.7143, + "step": 14101 + }, + { + "epoch": 4.32842234499693, + "grad_norm": 0.3813243508338928, + "learning_rate": 6.31544779815108e-05, + "loss": 1.7072, + "step": 14102 + }, + { + "epoch": 4.328729281767956, + "grad_norm": 0.22438868880271912, + "learning_rate": 6.314968246244852e-05, + "loss": 1.7445, + "step": 14103 + }, + { + "epoch": 4.329036218538981, + "grad_norm": 0.3818886876106262, + "learning_rate": 6.314488681343337e-05, + "loss": 1.8292, + "step": 14104 + }, + { + "epoch": 4.329343155310006, + "grad_norm": 0.4376567006111145, + "learning_rate": 6.314009103451277e-05, + "loss": 1.8224, + "step": 14105 + }, + { + "epoch": 4.329650092081032, + "grad_norm": 0.2741515636444092, + "learning_rate": 6.313529512573406e-05, + "loss": 1.8078, + "step": 14106 + }, + { + "epoch": 4.329957028852056, + "grad_norm": 0.264343798160553, + "learning_rate": 6.313049908714467e-05, + "loss": 1.7314, + "step": 14107 + }, + { + "epoch": 4.3302639656230815, + "grad_norm": 0.3601943552494049, + "learning_rate": 6.312570291879201e-05, + "loss": 1.7351, + "step": 14108 + }, + { + "epoch": 4.330570902394107, + "grad_norm": 0.2931751012802124, + "learning_rate": 6.312090662072345e-05, + "loss": 1.8117, + "step": 14109 + }, + { + "epoch": 4.330877839165132, + "grad_norm": 0.27670225501060486, + "learning_rate": 6.31161101929864e-05, + "loss": 1.7707, + "step": 14110 + }, + { + "epoch": 4.3311847759361575, + "grad_norm": 
0.33669596910476685, + "learning_rate": 6.311131363562825e-05, + "loss": 1.7337, + "step": 14111 + }, + { + "epoch": 4.331491712707182, + "grad_norm": 0.232634037733078, + "learning_rate": 6.310651694869643e-05, + "loss": 1.7372, + "step": 14112 + }, + { + "epoch": 4.331798649478207, + "grad_norm": 0.28611311316490173, + "learning_rate": 6.310172013223832e-05, + "loss": 1.6977, + "step": 14113 + }, + { + "epoch": 4.332105586249233, + "grad_norm": 0.30207201838493347, + "learning_rate": 6.309692318630132e-05, + "loss": 1.7765, + "step": 14114 + }, + { + "epoch": 4.332412523020258, + "grad_norm": 0.20757484436035156, + "learning_rate": 6.309212611093287e-05, + "loss": 1.697, + "step": 14115 + }, + { + "epoch": 4.332719459791283, + "grad_norm": 0.31472963094711304, + "learning_rate": 6.308732890618034e-05, + "loss": 1.7757, + "step": 14116 + }, + { + "epoch": 4.333026396562309, + "grad_norm": 0.37042325735092163, + "learning_rate": 6.308253157209117e-05, + "loss": 1.7745, + "step": 14117 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.25001442432403564, + "learning_rate": 6.307773410871275e-05, + "loss": 1.7461, + "step": 14118 + }, + { + "epoch": 4.333640270104358, + "grad_norm": 0.2691943347454071, + "learning_rate": 6.307293651609248e-05, + "loss": 1.7539, + "step": 14119 + }, + { + "epoch": 4.333947206875384, + "grad_norm": 0.30845868587493896, + "learning_rate": 6.306813879427782e-05, + "loss": 1.7559, + "step": 14120 + }, + { + "epoch": 4.334254143646409, + "grad_norm": 0.2244730293750763, + "learning_rate": 6.306334094331613e-05, + "loss": 1.7609, + "step": 14121 + }, + { + "epoch": 4.334561080417434, + "grad_norm": 0.32132062315940857, + "learning_rate": 6.305854296325485e-05, + "loss": 1.7837, + "step": 14122 + }, + { + "epoch": 4.334868017188459, + "grad_norm": 0.3762948513031006, + "learning_rate": 6.30537448541414e-05, + "loss": 1.7631, + "step": 14123 + }, + { + "epoch": 4.335174953959484, + "grad_norm": 0.24174273014068604, + "learning_rate": 
6.30489466160232e-05, + "loss": 1.7532, + "step": 14124 + }, + { + "epoch": 4.3354818907305095, + "grad_norm": 0.23468497395515442, + "learning_rate": 6.304414824894765e-05, + "loss": 1.7731, + "step": 14125 + }, + { + "epoch": 4.335788827501535, + "grad_norm": 0.29086077213287354, + "learning_rate": 6.303934975296218e-05, + "loss": 1.7668, + "step": 14126 + }, + { + "epoch": 4.33609576427256, + "grad_norm": 0.2889879643917084, + "learning_rate": 6.303455112811422e-05, + "loss": 1.8188, + "step": 14127 + }, + { + "epoch": 4.336402701043585, + "grad_norm": 0.2335619181394577, + "learning_rate": 6.302975237445119e-05, + "loss": 1.7944, + "step": 14128 + }, + { + "epoch": 4.33670963781461, + "grad_norm": 0.29027310013771057, + "learning_rate": 6.302495349202051e-05, + "loss": 1.7771, + "step": 14129 + }, + { + "epoch": 4.337016574585635, + "grad_norm": 0.31961241364479065, + "learning_rate": 6.302015448086959e-05, + "loss": 1.8187, + "step": 14130 + }, + { + "epoch": 4.337323511356661, + "grad_norm": 0.26015788316726685, + "learning_rate": 6.301535534104587e-05, + "loss": 1.7819, + "step": 14131 + }, + { + "epoch": 4.337630448127686, + "grad_norm": 0.2440631091594696, + "learning_rate": 6.30105560725968e-05, + "loss": 1.7127, + "step": 14132 + }, + { + "epoch": 4.337937384898711, + "grad_norm": 0.304441899061203, + "learning_rate": 6.300575667556979e-05, + "loss": 1.7619, + "step": 14133 + }, + { + "epoch": 4.338244321669736, + "grad_norm": 0.3085228204727173, + "learning_rate": 6.300095715001226e-05, + "loss": 1.8287, + "step": 14134 + }, + { + "epoch": 4.338551258440761, + "grad_norm": 0.2863372564315796, + "learning_rate": 6.299615749597165e-05, + "loss": 1.8068, + "step": 14135 + }, + { + "epoch": 4.338858195211786, + "grad_norm": 0.25255265831947327, + "learning_rate": 6.299135771349537e-05, + "loss": 1.7506, + "step": 14136 + }, + { + "epoch": 4.339165131982812, + "grad_norm": 0.30224961042404175, + "learning_rate": 6.298655780263092e-05, + "loss": 1.7292, + 
"step": 14137 + }, + { + "epoch": 4.339472068753837, + "grad_norm": 0.24222104251384735, + "learning_rate": 6.298175776342567e-05, + "loss": 1.7616, + "step": 14138 + }, + { + "epoch": 4.3397790055248615, + "grad_norm": 0.3236368000507355, + "learning_rate": 6.29769575959271e-05, + "loss": 1.787, + "step": 14139 + }, + { + "epoch": 4.340085942295887, + "grad_norm": 0.26049408316612244, + "learning_rate": 6.297215730018261e-05, + "loss": 1.7108, + "step": 14140 + }, + { + "epoch": 4.340392879066912, + "grad_norm": 0.22833532094955444, + "learning_rate": 6.296735687623967e-05, + "loss": 1.7661, + "step": 14141 + }, + { + "epoch": 4.3406998158379375, + "grad_norm": 0.28397905826568604, + "learning_rate": 6.296255632414571e-05, + "loss": 1.7163, + "step": 14142 + }, + { + "epoch": 4.341006752608963, + "grad_norm": 0.3072611093521118, + "learning_rate": 6.295775564394817e-05, + "loss": 1.857, + "step": 14143 + }, + { + "epoch": 4.341313689379987, + "grad_norm": 0.22901058197021484, + "learning_rate": 6.295295483569448e-05, + "loss": 1.7325, + "step": 14144 + }, + { + "epoch": 4.341620626151013, + "grad_norm": 0.27433091402053833, + "learning_rate": 6.294815389943212e-05, + "loss": 1.8229, + "step": 14145 + }, + { + "epoch": 4.341927562922038, + "grad_norm": 0.2635616958141327, + "learning_rate": 6.29433528352085e-05, + "loss": 1.7585, + "step": 14146 + }, + { + "epoch": 4.342234499693063, + "grad_norm": 0.29129260778427124, + "learning_rate": 6.293855164307108e-05, + "loss": 1.8294, + "step": 14147 + }, + { + "epoch": 4.342541436464089, + "grad_norm": 0.3429001569747925, + "learning_rate": 6.293375032306731e-05, + "loss": 1.7725, + "step": 14148 + }, + { + "epoch": 4.342848373235114, + "grad_norm": 0.22407259047031403, + "learning_rate": 6.292894887524464e-05, + "loss": 1.7018, + "step": 14149 + }, + { + "epoch": 4.343155310006138, + "grad_norm": 0.3319321274757385, + "learning_rate": 6.292414729965053e-05, + "loss": 1.8472, + "step": 14150 + }, + { + "epoch": 
4.343462246777164, + "grad_norm": 0.42744341492652893, + "learning_rate": 6.291934559633241e-05, + "loss": 1.8118, + "step": 14151 + }, + { + "epoch": 4.343769183548189, + "grad_norm": 0.24572840332984924, + "learning_rate": 6.291454376533774e-05, + "loss": 1.7184, + "step": 14152 + }, + { + "epoch": 4.344076120319214, + "grad_norm": 0.2485980987548828, + "learning_rate": 6.290974180671397e-05, + "loss": 1.7649, + "step": 14153 + }, + { + "epoch": 4.34438305709024, + "grad_norm": 0.3911706209182739, + "learning_rate": 6.29049397205086e-05, + "loss": 1.8105, + "step": 14154 + }, + { + "epoch": 4.344689993861264, + "grad_norm": 0.3008342981338501, + "learning_rate": 6.290013750676902e-05, + "loss": 1.7671, + "step": 14155 + }, + { + "epoch": 4.3449969306322895, + "grad_norm": 0.2072051614522934, + "learning_rate": 6.289533516554274e-05, + "loss": 1.7406, + "step": 14156 + }, + { + "epoch": 4.345303867403315, + "grad_norm": 0.3047312796115875, + "learning_rate": 6.289053269687719e-05, + "loss": 1.8133, + "step": 14157 + }, + { + "epoch": 4.34561080417434, + "grad_norm": 0.28260552883148193, + "learning_rate": 6.288573010081984e-05, + "loss": 1.7253, + "step": 14158 + }, + { + "epoch": 4.3459177409453655, + "grad_norm": 0.2474137246608734, + "learning_rate": 6.288092737741815e-05, + "loss": 1.822, + "step": 14159 + }, + { + "epoch": 4.346224677716391, + "grad_norm": 0.23717878758907318, + "learning_rate": 6.287612452671961e-05, + "loss": 1.7826, + "step": 14160 + }, + { + "epoch": 4.346531614487415, + "grad_norm": 0.2646107077598572, + "learning_rate": 6.287132154877163e-05, + "loss": 1.8118, + "step": 14161 + }, + { + "epoch": 4.346838551258441, + "grad_norm": 0.22026480734348297, + "learning_rate": 6.286651844362172e-05, + "loss": 1.7767, + "step": 14162 + }, + { + "epoch": 4.347145488029466, + "grad_norm": 0.2692350447177887, + "learning_rate": 6.286171521131733e-05, + "loss": 1.8718, + "step": 14163 + }, + { + "epoch": 4.347452424800491, + "grad_norm": 
0.2749998867511749, + "learning_rate": 6.285691185190592e-05, + "loss": 1.7689, + "step": 14164 + }, + { + "epoch": 4.347759361571517, + "grad_norm": 0.24552448093891144, + "learning_rate": 6.2852108365435e-05, + "loss": 1.8049, + "step": 14165 + }, + { + "epoch": 4.348066298342541, + "grad_norm": 0.20530807971954346, + "learning_rate": 6.2847304751952e-05, + "loss": 1.7606, + "step": 14166 + }, + { + "epoch": 4.348373235113566, + "grad_norm": 0.23396088182926178, + "learning_rate": 6.28425010115044e-05, + "loss": 1.7482, + "step": 14167 + }, + { + "epoch": 4.348680171884592, + "grad_norm": 0.20512452721595764, + "learning_rate": 6.283769714413968e-05, + "loss": 1.6976, + "step": 14168 + }, + { + "epoch": 4.348987108655617, + "grad_norm": 0.20287172496318817, + "learning_rate": 6.283289314990531e-05, + "loss": 1.7439, + "step": 14169 + }, + { + "epoch": 4.349294045426642, + "grad_norm": 0.2193746268749237, + "learning_rate": 6.282808902884876e-05, + "loss": 1.763, + "step": 14170 + }, + { + "epoch": 4.349600982197667, + "grad_norm": 0.20415273308753967, + "learning_rate": 6.282328478101753e-05, + "loss": 1.7025, + "step": 14171 + }, + { + "epoch": 4.349907918968692, + "grad_norm": 0.19286803901195526, + "learning_rate": 6.281848040645907e-05, + "loss": 1.7529, + "step": 14172 + }, + { + "epoch": 4.350214855739718, + "grad_norm": 0.20908218622207642, + "learning_rate": 6.281367590522088e-05, + "loss": 1.7896, + "step": 14173 + }, + { + "epoch": 4.350521792510743, + "grad_norm": 0.2599989175796509, + "learning_rate": 6.280887127735045e-05, + "loss": 1.764, + "step": 14174 + }, + { + "epoch": 4.350828729281768, + "grad_norm": 0.23955710232257843, + "learning_rate": 6.280406652289523e-05, + "loss": 1.7321, + "step": 14175 + }, + { + "epoch": 4.351135666052793, + "grad_norm": 0.2311990112066269, + "learning_rate": 6.279926164190272e-05, + "loss": 1.7338, + "step": 14176 + }, + { + "epoch": 4.351442602823818, + "grad_norm": 0.2599658966064453, + "learning_rate": 
6.27944566344204e-05, + "loss": 1.7444, + "step": 14177 + }, + { + "epoch": 4.351749539594843, + "grad_norm": 0.23079386353492737, + "learning_rate": 6.278965150049579e-05, + "loss": 1.7011, + "step": 14178 + }, + { + "epoch": 4.352056476365869, + "grad_norm": 0.24844171106815338, + "learning_rate": 6.278484624017631e-05, + "loss": 1.7298, + "step": 14179 + }, + { + "epoch": 4.352363413136894, + "grad_norm": 0.24839860200881958, + "learning_rate": 6.27800408535095e-05, + "loss": 1.7717, + "step": 14180 + }, + { + "epoch": 4.352670349907919, + "grad_norm": 0.2652966380119324, + "learning_rate": 6.277523534054284e-05, + "loss": 1.7759, + "step": 14181 + }, + { + "epoch": 4.352977286678944, + "grad_norm": 0.2787603735923767, + "learning_rate": 6.277042970132381e-05, + "loss": 1.8981, + "step": 14182 + }, + { + "epoch": 4.353284223449969, + "grad_norm": 0.2535475194454193, + "learning_rate": 6.276562393589991e-05, + "loss": 1.7538, + "step": 14183 + }, + { + "epoch": 4.3535911602209945, + "grad_norm": 0.3210967183113098, + "learning_rate": 6.276081804431863e-05, + "loss": 1.7087, + "step": 14184 + }, + { + "epoch": 4.35389809699202, + "grad_norm": 0.29936519265174866, + "learning_rate": 6.275601202662749e-05, + "loss": 1.7647, + "step": 14185 + }, + { + "epoch": 4.354205033763045, + "grad_norm": 0.21980762481689453, + "learning_rate": 6.275120588287394e-05, + "loss": 1.7759, + "step": 14186 + }, + { + "epoch": 4.35451197053407, + "grad_norm": 0.26833051443099976, + "learning_rate": 6.274639961310549e-05, + "loss": 1.7648, + "step": 14187 + }, + { + "epoch": 4.354818907305095, + "grad_norm": 0.27998095750808716, + "learning_rate": 6.274159321736966e-05, + "loss": 1.746, + "step": 14188 + }, + { + "epoch": 4.35512584407612, + "grad_norm": 0.21354494988918304, + "learning_rate": 6.273678669571395e-05, + "loss": 1.7417, + "step": 14189 + }, + { + "epoch": 4.355432780847146, + "grad_norm": 0.2295297235250473, + "learning_rate": 6.273198004818583e-05, + "loss": 1.7805, + 
"step": 14190 + }, + { + "epoch": 4.355739717618171, + "grad_norm": 0.2416422963142395, + "learning_rate": 6.272717327483283e-05, + "loss": 1.73, + "step": 14191 + }, + { + "epoch": 4.356046654389196, + "grad_norm": 0.2685304880142212, + "learning_rate": 6.272236637570244e-05, + "loss": 1.7936, + "step": 14192 + }, + { + "epoch": 4.356353591160221, + "grad_norm": 0.32481294870376587, + "learning_rate": 6.271755935084218e-05, + "loss": 1.7192, + "step": 14193 + }, + { + "epoch": 4.356660527931246, + "grad_norm": 0.2428581267595291, + "learning_rate": 6.271275220029954e-05, + "loss": 1.7428, + "step": 14194 + }, + { + "epoch": 4.356967464702271, + "grad_norm": 0.2266654521226883, + "learning_rate": 6.270794492412203e-05, + "loss": 1.7266, + "step": 14195 + }, + { + "epoch": 4.357274401473297, + "grad_norm": 0.25062093138694763, + "learning_rate": 6.270313752235716e-05, + "loss": 1.7476, + "step": 14196 + }, + { + "epoch": 4.357581338244322, + "grad_norm": 0.24085770547389984, + "learning_rate": 6.269832999505244e-05, + "loss": 1.7981, + "step": 14197 + }, + { + "epoch": 4.3578882750153465, + "grad_norm": 0.27035796642303467, + "learning_rate": 6.269352234225536e-05, + "loss": 1.8867, + "step": 14198 + }, + { + "epoch": 4.358195211786372, + "grad_norm": 0.22464458644390106, + "learning_rate": 6.268871456401348e-05, + "loss": 1.7514, + "step": 14199 + }, + { + "epoch": 4.358502148557397, + "grad_norm": 0.22485734522342682, + "learning_rate": 6.268390666037427e-05, + "loss": 1.7558, + "step": 14200 + }, + { + "epoch": 4.3588090853284225, + "grad_norm": 0.2052135169506073, + "learning_rate": 6.267909863138527e-05, + "loss": 1.7453, + "step": 14201 + }, + { + "epoch": 4.359116022099448, + "grad_norm": 0.2130763679742813, + "learning_rate": 6.267429047709397e-05, + "loss": 1.7712, + "step": 14202 + }, + { + "epoch": 4.359422958870473, + "grad_norm": 0.23146997392177582, + "learning_rate": 6.266948219754793e-05, + "loss": 1.6978, + "step": 14203 + }, + { + "epoch": 
4.359729895641498, + "grad_norm": 0.21657225489616394, + "learning_rate": 6.266467379279463e-05, + "loss": 1.7641, + "step": 14204 + }, + { + "epoch": 4.360036832412523, + "grad_norm": 0.2598700523376465, + "learning_rate": 6.265986526288158e-05, + "loss": 1.7956, + "step": 14205 + }, + { + "epoch": 4.360343769183548, + "grad_norm": 0.23497453331947327, + "learning_rate": 6.265505660785633e-05, + "loss": 1.7835, + "step": 14206 + }, + { + "epoch": 4.360650705954574, + "grad_norm": 0.2491760104894638, + "learning_rate": 6.265024782776641e-05, + "loss": 1.8454, + "step": 14207 + }, + { + "epoch": 4.360957642725599, + "grad_norm": 0.224884033203125, + "learning_rate": 6.264543892265932e-05, + "loss": 1.8383, + "step": 14208 + }, + { + "epoch": 4.361264579496623, + "grad_norm": 0.24057646095752716, + "learning_rate": 6.264062989258259e-05, + "loss": 1.7437, + "step": 14209 + }, + { + "epoch": 4.361571516267649, + "grad_norm": 0.24661841988563538, + "learning_rate": 6.263582073758374e-05, + "loss": 1.8151, + "step": 14210 + }, + { + "epoch": 4.361878453038674, + "grad_norm": 0.24618980288505554, + "learning_rate": 6.263101145771031e-05, + "loss": 1.7955, + "step": 14211 + }, + { + "epoch": 4.362185389809699, + "grad_norm": 0.2615448236465454, + "learning_rate": 6.262620205300981e-05, + "loss": 1.7819, + "step": 14212 + }, + { + "epoch": 4.362492326580725, + "grad_norm": 0.3528309464454651, + "learning_rate": 6.26213925235298e-05, + "loss": 1.7723, + "step": 14213 + }, + { + "epoch": 4.362799263351749, + "grad_norm": 0.3099561035633087, + "learning_rate": 6.261658286931779e-05, + "loss": 1.7361, + "step": 14214 + }, + { + "epoch": 4.3631062001227745, + "grad_norm": 0.23693235218524933, + "learning_rate": 6.26117730904213e-05, + "loss": 1.8117, + "step": 14215 + }, + { + "epoch": 4.3634131368938, + "grad_norm": 0.4164150655269623, + "learning_rate": 6.260696318688786e-05, + "loss": 1.7908, + "step": 14216 + }, + { + "epoch": 4.363720073664825, + "grad_norm": 
0.39376336336135864, + "learning_rate": 6.260215315876506e-05, + "loss": 1.7832, + "step": 14217 + }, + { + "epoch": 4.3640270104358505, + "grad_norm": 0.24071799218654633, + "learning_rate": 6.259734300610037e-05, + "loss": 1.7569, + "step": 14218 + }, + { + "epoch": 4.364333947206875, + "grad_norm": 0.4305122494697571, + "learning_rate": 6.259253272894136e-05, + "loss": 1.7974, + "step": 14219 + }, + { + "epoch": 4.3646408839779, + "grad_norm": 0.3023197054862976, + "learning_rate": 6.258772232733556e-05, + "loss": 1.7589, + "step": 14220 + }, + { + "epoch": 4.364947820748926, + "grad_norm": 0.23253366351127625, + "learning_rate": 6.258291180133052e-05, + "loss": 1.7138, + "step": 14221 + }, + { + "epoch": 4.365254757519951, + "grad_norm": 0.41141277551651, + "learning_rate": 6.257810115097376e-05, + "loss": 1.7608, + "step": 14222 + }, + { + "epoch": 4.365561694290976, + "grad_norm": 0.3308235704898834, + "learning_rate": 6.257329037631284e-05, + "loss": 1.8006, + "step": 14223 + }, + { + "epoch": 4.365868631062002, + "grad_norm": 0.2635105848312378, + "learning_rate": 6.256847947739528e-05, + "loss": 1.7275, + "step": 14224 + }, + { + "epoch": 4.366175567833026, + "grad_norm": 0.45886602997779846, + "learning_rate": 6.256366845426864e-05, + "loss": 1.7701, + "step": 14225 + }, + { + "epoch": 4.366482504604051, + "grad_norm": 0.48503565788269043, + "learning_rate": 6.255885730698049e-05, + "loss": 1.7409, + "step": 14226 + }, + { + "epoch": 4.366789441375077, + "grad_norm": 0.26727184653282166, + "learning_rate": 6.255404603557833e-05, + "loss": 1.7288, + "step": 14227 + }, + { + "epoch": 4.367096378146102, + "grad_norm": 0.3343912363052368, + "learning_rate": 6.254923464010974e-05, + "loss": 1.764, + "step": 14228 + }, + { + "epoch": 4.367403314917127, + "grad_norm": 0.40050622820854187, + "learning_rate": 6.254442312062224e-05, + "loss": 1.7653, + "step": 14229 + }, + { + "epoch": 4.367710251688152, + "grad_norm": 0.23941144347190857, + "learning_rate": 
6.253961147716341e-05, + "loss": 1.6886, + "step": 14230 + }, + { + "epoch": 4.368017188459177, + "grad_norm": 0.25737255811691284, + "learning_rate": 6.253479970978079e-05, + "loss": 1.8047, + "step": 14231 + }, + { + "epoch": 4.3683241252302025, + "grad_norm": 0.28780993819236755, + "learning_rate": 6.252998781852192e-05, + "loss": 1.7453, + "step": 14232 + }, + { + "epoch": 4.368631062001228, + "grad_norm": 0.2362327128648758, + "learning_rate": 6.252517580343438e-05, + "loss": 1.7963, + "step": 14233 + }, + { + "epoch": 4.368937998772253, + "grad_norm": 0.263013631105423, + "learning_rate": 6.252036366456571e-05, + "loss": 1.7837, + "step": 14234 + }, + { + "epoch": 4.3692449355432785, + "grad_norm": 0.27674412727355957, + "learning_rate": 6.251555140196347e-05, + "loss": 1.767, + "step": 14235 + }, + { + "epoch": 4.369551872314303, + "grad_norm": 0.2360621690750122, + "learning_rate": 6.251073901567522e-05, + "loss": 1.7806, + "step": 14236 + }, + { + "epoch": 4.369858809085328, + "grad_norm": 0.2568018138408661, + "learning_rate": 6.25059265057485e-05, + "loss": 1.7672, + "step": 14237 + }, + { + "epoch": 4.370165745856354, + "grad_norm": 0.2512381374835968, + "learning_rate": 6.25011138722309e-05, + "loss": 1.7506, + "step": 14238 + }, + { + "epoch": 4.370472682627379, + "grad_norm": 0.21587291359901428, + "learning_rate": 6.249630111516994e-05, + "loss": 1.7336, + "step": 14239 + }, + { + "epoch": 4.370779619398404, + "grad_norm": 0.21791933476924896, + "learning_rate": 6.249148823461323e-05, + "loss": 1.7588, + "step": 14240 + }, + { + "epoch": 4.371086556169429, + "grad_norm": 0.23061512410640717, + "learning_rate": 6.248667523060831e-05, + "loss": 1.742, + "step": 14241 + }, + { + "epoch": 4.371393492940454, + "grad_norm": 0.2007007598876953, + "learning_rate": 6.248186210320274e-05, + "loss": 1.7227, + "step": 14242 + }, + { + "epoch": 4.371700429711479, + "grad_norm": 0.2564350366592407, + "learning_rate": 6.247704885244411e-05, + "loss": 1.7529, + 
"step": 14243 + }, + { + "epoch": 4.372007366482505, + "grad_norm": 0.21880537271499634, + "learning_rate": 6.247223547837995e-05, + "loss": 1.7828, + "step": 14244 + }, + { + "epoch": 4.37231430325353, + "grad_norm": 0.26154282689094543, + "learning_rate": 6.246742198105785e-05, + "loss": 1.7895, + "step": 14245 + }, + { + "epoch": 4.3726212400245545, + "grad_norm": 0.2652645707130432, + "learning_rate": 6.24626083605254e-05, + "loss": 1.8038, + "step": 14246 + }, + { + "epoch": 4.37292817679558, + "grad_norm": 0.21463751792907715, + "learning_rate": 6.245779461683013e-05, + "loss": 1.7139, + "step": 14247 + }, + { + "epoch": 4.373235113566605, + "grad_norm": 0.21285851299762726, + "learning_rate": 6.245298075001961e-05, + "loss": 1.7686, + "step": 14248 + }, + { + "epoch": 4.3735420503376305, + "grad_norm": 0.258602499961853, + "learning_rate": 6.244816676014149e-05, + "loss": 1.8518, + "step": 14249 + }, + { + "epoch": 4.373848987108656, + "grad_norm": 0.25747501850128174, + "learning_rate": 6.244335264724323e-05, + "loss": 1.8019, + "step": 14250 + }, + { + "epoch": 4.37415592387968, + "grad_norm": 0.24678784608840942, + "learning_rate": 6.243853841137251e-05, + "loss": 1.7846, + "step": 14251 + }, + { + "epoch": 4.374462860650706, + "grad_norm": 0.31382107734680176, + "learning_rate": 6.243372405257685e-05, + "loss": 1.8389, + "step": 14252 + }, + { + "epoch": 4.374769797421731, + "grad_norm": 0.30522868037223816, + "learning_rate": 6.242890957090383e-05, + "loss": 1.8057, + "step": 14253 + }, + { + "epoch": 4.375076734192756, + "grad_norm": 0.2449347972869873, + "learning_rate": 6.242409496640106e-05, + "loss": 1.7144, + "step": 14254 + }, + { + "epoch": 4.375383670963782, + "grad_norm": 0.3193594217300415, + "learning_rate": 6.241928023911609e-05, + "loss": 1.7404, + "step": 14255 + }, + { + "epoch": 4.375690607734807, + "grad_norm": 0.23948179185390472, + "learning_rate": 6.241446538909651e-05, + "loss": 1.7338, + "step": 14256 + }, + { + "epoch": 
4.3759975445058314, + "grad_norm": 0.35325706005096436, + "learning_rate": 6.240965041638991e-05, + "loss": 1.7673, + "step": 14257 + }, + { + "epoch": 4.376304481276857, + "grad_norm": 0.38753262162208557, + "learning_rate": 6.240483532104387e-05, + "loss": 1.769, + "step": 14258 + }, + { + "epoch": 4.376611418047882, + "grad_norm": 0.2749052941799164, + "learning_rate": 6.2400020103106e-05, + "loss": 1.8086, + "step": 14259 + }, + { + "epoch": 4.3769183548189075, + "grad_norm": 0.2553126811981201, + "learning_rate": 6.239520476262384e-05, + "loss": 1.7733, + "step": 14260 + }, + { + "epoch": 4.377225291589933, + "grad_norm": 0.2854517698287964, + "learning_rate": 6.2390389299645e-05, + "loss": 1.7926, + "step": 14261 + }, + { + "epoch": 4.377532228360957, + "grad_norm": 0.24617259204387665, + "learning_rate": 6.238557371421708e-05, + "loss": 1.7297, + "step": 14262 + }, + { + "epoch": 4.377839165131983, + "grad_norm": 0.2555331289768219, + "learning_rate": 6.238075800638765e-05, + "loss": 1.7566, + "step": 14263 + }, + { + "epoch": 4.378146101903008, + "grad_norm": 0.31666773557662964, + "learning_rate": 6.237594217620432e-05, + "loss": 1.8003, + "step": 14264 + }, + { + "epoch": 4.378453038674033, + "grad_norm": 0.24166476726531982, + "learning_rate": 6.237112622371468e-05, + "loss": 1.7425, + "step": 14265 + }, + { + "epoch": 4.378759975445059, + "grad_norm": 0.21237102150917053, + "learning_rate": 6.236631014896633e-05, + "loss": 1.73, + "step": 14266 + }, + { + "epoch": 4.379066912216084, + "grad_norm": 0.2739151120185852, + "learning_rate": 6.236149395200683e-05, + "loss": 1.7113, + "step": 14267 + }, + { + "epoch": 4.379373848987108, + "grad_norm": 0.23700746893882751, + "learning_rate": 6.23566776328838e-05, + "loss": 1.7256, + "step": 14268 + }, + { + "epoch": 4.379680785758134, + "grad_norm": 0.22366748750209808, + "learning_rate": 6.235186119164485e-05, + "loss": 1.7981, + "step": 14269 + }, + { + "epoch": 4.379987722529159, + "grad_norm": 
0.28440114855766296, + "learning_rate": 6.234704462833758e-05, + "loss": 1.8087, + "step": 14270 + }, + { + "epoch": 4.380294659300184, + "grad_norm": 0.2706616520881653, + "learning_rate": 6.234222794300957e-05, + "loss": 1.7502, + "step": 14271 + }, + { + "epoch": 4.38060159607121, + "grad_norm": 0.21666266024112701, + "learning_rate": 6.233741113570843e-05, + "loss": 1.7639, + "step": 14272 + }, + { + "epoch": 4.380908532842234, + "grad_norm": 0.26790255308151245, + "learning_rate": 6.233259420648175e-05, + "loss": 1.796, + "step": 14273 + }, + { + "epoch": 4.3812154696132595, + "grad_norm": 0.22233673930168152, + "learning_rate": 6.232777715537715e-05, + "loss": 1.7661, + "step": 14274 + }, + { + "epoch": 4.381522406384285, + "grad_norm": 0.3277546763420105, + "learning_rate": 6.232295998244223e-05, + "loss": 1.7932, + "step": 14275 + }, + { + "epoch": 4.38182934315531, + "grad_norm": 0.2907596826553345, + "learning_rate": 6.231814268772463e-05, + "loss": 1.7103, + "step": 14276 + }, + { + "epoch": 4.3821362799263355, + "grad_norm": 0.2318384349346161, + "learning_rate": 6.231332527127188e-05, + "loss": 1.7351, + "step": 14277 + }, + { + "epoch": 4.382443216697361, + "grad_norm": 0.32904061675071716, + "learning_rate": 6.230850773313163e-05, + "loss": 1.7967, + "step": 14278 + }, + { + "epoch": 4.382750153468385, + "grad_norm": 0.2455490082502365, + "learning_rate": 6.230369007335153e-05, + "loss": 1.7474, + "step": 14279 + }, + { + "epoch": 4.383057090239411, + "grad_norm": 0.23648180067539215, + "learning_rate": 6.229887229197913e-05, + "loss": 1.7106, + "step": 14280 + }, + { + "epoch": 4.383364027010436, + "grad_norm": 0.29552599787712097, + "learning_rate": 6.229405438906207e-05, + "loss": 1.7765, + "step": 14281 + }, + { + "epoch": 4.383670963781461, + "grad_norm": 0.2094641923904419, + "learning_rate": 6.228923636464796e-05, + "loss": 1.7105, + "step": 14282 + }, + { + "epoch": 4.383977900552487, + "grad_norm": 0.24632154405117035, + "learning_rate": 
6.228441821878441e-05, + "loss": 1.7913, + "step": 14283 + }, + { + "epoch": 4.384284837323511, + "grad_norm": 0.28114691376686096, + "learning_rate": 6.227959995151904e-05, + "loss": 1.7456, + "step": 14284 + }, + { + "epoch": 4.384591774094536, + "grad_norm": 0.24226875603199005, + "learning_rate": 6.227478156289946e-05, + "loss": 1.797, + "step": 14285 + }, + { + "epoch": 4.384898710865562, + "grad_norm": 0.2526854872703552, + "learning_rate": 6.22699630529733e-05, + "loss": 1.7155, + "step": 14286 + }, + { + "epoch": 4.385205647636587, + "grad_norm": 0.312916100025177, + "learning_rate": 6.226514442178818e-05, + "loss": 1.7808, + "step": 14287 + }, + { + "epoch": 4.385512584407612, + "grad_norm": 0.23087100684642792, + "learning_rate": 6.22603256693917e-05, + "loss": 1.7543, + "step": 14288 + }, + { + "epoch": 4.385819521178637, + "grad_norm": 0.3042476177215576, + "learning_rate": 6.22555067958315e-05, + "loss": 1.747, + "step": 14289 + }, + { + "epoch": 4.386126457949662, + "grad_norm": 0.2604007422924042, + "learning_rate": 6.225068780115522e-05, + "loss": 1.7262, + "step": 14290 + }, + { + "epoch": 4.3864333947206875, + "grad_norm": 0.2200118750333786, + "learning_rate": 6.224586868541044e-05, + "loss": 1.75, + "step": 14291 + }, + { + "epoch": 4.386740331491713, + "grad_norm": 0.3452017307281494, + "learning_rate": 6.224104944864481e-05, + "loss": 1.7598, + "step": 14292 + }, + { + "epoch": 4.387047268262738, + "grad_norm": 0.3169453740119934, + "learning_rate": 6.223623009090597e-05, + "loss": 1.7939, + "step": 14293 + }, + { + "epoch": 4.387354205033763, + "grad_norm": 0.23640502989292145, + "learning_rate": 6.223141061224151e-05, + "loss": 1.8005, + "step": 14294 + }, + { + "epoch": 4.387661141804788, + "grad_norm": 0.26212456822395325, + "learning_rate": 6.22265910126991e-05, + "loss": 1.7951, + "step": 14295 + }, + { + "epoch": 4.387968078575813, + "grad_norm": 0.2687644362449646, + "learning_rate": 6.222177129232634e-05, + "loss": 1.7674, + "step": 
14296 + }, + { + "epoch": 4.388275015346839, + "grad_norm": 0.2553202211856842, + "learning_rate": 6.221695145117086e-05, + "loss": 1.8142, + "step": 14297 + }, + { + "epoch": 4.388581952117864, + "grad_norm": 0.3317619264125824, + "learning_rate": 6.221213148928034e-05, + "loss": 1.7884, + "step": 14298 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.3059331476688385, + "learning_rate": 6.220731140670235e-05, + "loss": 1.7377, + "step": 14299 + }, + { + "epoch": 4.389195825659914, + "grad_norm": 0.21544015407562256, + "learning_rate": 6.220249120348457e-05, + "loss": 1.6818, + "step": 14300 + }, + { + "epoch": 4.389502762430939, + "grad_norm": 0.3112640380859375, + "learning_rate": 6.219767087967461e-05, + "loss": 1.72, + "step": 14301 + }, + { + "epoch": 4.389809699201964, + "grad_norm": 0.2572654187679291, + "learning_rate": 6.219285043532011e-05, + "loss": 1.793, + "step": 14302 + }, + { + "epoch": 4.39011663597299, + "grad_norm": 0.2621476948261261, + "learning_rate": 6.218802987046874e-05, + "loss": 1.8301, + "step": 14303 + }, + { + "epoch": 4.390423572744015, + "grad_norm": 0.2592658996582031, + "learning_rate": 6.218320918516809e-05, + "loss": 1.7219, + "step": 14304 + }, + { + "epoch": 4.3907305095150395, + "grad_norm": 0.25503265857696533, + "learning_rate": 6.217838837946584e-05, + "loss": 1.8149, + "step": 14305 + }, + { + "epoch": 4.391037446286065, + "grad_norm": 0.21944166719913483, + "learning_rate": 6.217356745340962e-05, + "loss": 1.7174, + "step": 14306 + }, + { + "epoch": 4.39134438305709, + "grad_norm": 0.2937396466732025, + "learning_rate": 6.216874640704707e-05, + "loss": 1.8562, + "step": 14307 + }, + { + "epoch": 4.3916513198281155, + "grad_norm": 0.22520211338996887, + "learning_rate": 6.216392524042581e-05, + "loss": 1.7701, + "step": 14308 + }, + { + "epoch": 4.391958256599141, + "grad_norm": 0.24397830665111542, + "learning_rate": 6.215910395359355e-05, + "loss": 1.7794, + "step": 14309 + }, + { + "epoch": 4.392265193370166, + 
"grad_norm": 0.2867623567581177, + "learning_rate": 6.215428254659788e-05, + "loss": 1.7275, + "step": 14310 + }, + { + "epoch": 4.392572130141191, + "grad_norm": 0.2632426917552948, + "learning_rate": 6.214946101948648e-05, + "loss": 1.7919, + "step": 14311 + }, + { + "epoch": 4.392879066912216, + "grad_norm": 0.23146092891693115, + "learning_rate": 6.214463937230696e-05, + "loss": 1.744, + "step": 14312 + }, + { + "epoch": 4.393186003683241, + "grad_norm": 0.21877676248550415, + "learning_rate": 6.213981760510701e-05, + "loss": 1.7577, + "step": 14313 + }, + { + "epoch": 4.393492940454267, + "grad_norm": 0.2320399284362793, + "learning_rate": 6.213499571793426e-05, + "loss": 1.7864, + "step": 14314 + }, + { + "epoch": 4.393799877225292, + "grad_norm": 0.2951548993587494, + "learning_rate": 6.213017371083638e-05, + "loss": 1.8257, + "step": 14315 + }, + { + "epoch": 4.394106813996316, + "grad_norm": 0.26062941551208496, + "learning_rate": 6.212535158386102e-05, + "loss": 1.7448, + "step": 14316 + }, + { + "epoch": 4.394413750767342, + "grad_norm": 0.24760986864566803, + "learning_rate": 6.21205293370558e-05, + "loss": 1.7902, + "step": 14317 + }, + { + "epoch": 4.394720687538367, + "grad_norm": 0.2686399221420288, + "learning_rate": 6.211570697046844e-05, + "loss": 1.8209, + "step": 14318 + }, + { + "epoch": 4.395027624309392, + "grad_norm": 0.2599134147167206, + "learning_rate": 6.211088448414653e-05, + "loss": 1.8231, + "step": 14319 + }, + { + "epoch": 4.395334561080418, + "grad_norm": 0.254044771194458, + "learning_rate": 6.210606187813778e-05, + "loss": 1.806, + "step": 14320 + }, + { + "epoch": 4.395641497851442, + "grad_norm": 0.262229323387146, + "learning_rate": 6.210123915248982e-05, + "loss": 1.7857, + "step": 14321 + }, + { + "epoch": 4.3959484346224675, + "grad_norm": 0.2849259078502655, + "learning_rate": 6.209641630725033e-05, + "loss": 1.8005, + "step": 14322 + }, + { + "epoch": 4.396255371393493, + "grad_norm": 0.35480254888534546, + 
"learning_rate": 6.209159334246697e-05, + "loss": 1.8189, + "step": 14323 + }, + { + "epoch": 4.396562308164518, + "grad_norm": 0.2599184215068817, + "learning_rate": 6.20867702581874e-05, + "loss": 1.7384, + "step": 14324 + }, + { + "epoch": 4.3968692449355435, + "grad_norm": 0.23994222283363342, + "learning_rate": 6.208194705445926e-05, + "loss": 1.7566, + "step": 14325 + }, + { + "epoch": 4.397176181706568, + "grad_norm": 0.24361753463745117, + "learning_rate": 6.207712373133024e-05, + "loss": 1.6965, + "step": 14326 + }, + { + "epoch": 4.397483118477593, + "grad_norm": 0.23925161361694336, + "learning_rate": 6.207230028884803e-05, + "loss": 1.7596, + "step": 14327 + }, + { + "epoch": 4.397790055248619, + "grad_norm": 0.24365897476673126, + "learning_rate": 6.206747672706025e-05, + "loss": 1.7951, + "step": 14328 + }, + { + "epoch": 4.398096992019644, + "grad_norm": 0.25245413184165955, + "learning_rate": 6.206265304601461e-05, + "loss": 1.8086, + "step": 14329 + }, + { + "epoch": 4.398403928790669, + "grad_norm": 0.24272513389587402, + "learning_rate": 6.205782924575874e-05, + "loss": 1.8148, + "step": 14330 + }, + { + "epoch": 4.398710865561695, + "grad_norm": 0.21299590170383453, + "learning_rate": 6.205300532634036e-05, + "loss": 1.7666, + "step": 14331 + }, + { + "epoch": 4.399017802332719, + "grad_norm": 0.23543189465999603, + "learning_rate": 6.20481812878071e-05, + "loss": 1.7629, + "step": 14332 + }, + { + "epoch": 4.399324739103744, + "grad_norm": 0.2284495085477829, + "learning_rate": 6.204335713020665e-05, + "loss": 1.768, + "step": 14333 + }, + { + "epoch": 4.39963167587477, + "grad_norm": 0.23158542811870575, + "learning_rate": 6.20385328535867e-05, + "loss": 1.7761, + "step": 14334 + }, + { + "epoch": 4.399938612645795, + "grad_norm": 0.2378150224685669, + "learning_rate": 6.20337084579949e-05, + "loss": 1.8483, + "step": 14335 + }, + { + "epoch": 4.4002455494168204, + "grad_norm": 0.2407436966896057, + "learning_rate": 6.202888394347892e-05, + 
"loss": 1.7364, + "step": 14336 + }, + { + "epoch": 4.400552486187845, + "grad_norm": 0.256259560585022, + "learning_rate": 6.202405931008649e-05, + "loss": 1.7376, + "step": 14337 + }, + { + "epoch": 4.40085942295887, + "grad_norm": 0.29293057322502136, + "learning_rate": 6.201923455786524e-05, + "loss": 1.7493, + "step": 14338 + }, + { + "epoch": 4.401166359729896, + "grad_norm": 0.24025334417819977, + "learning_rate": 6.201440968686288e-05, + "loss": 1.7522, + "step": 14339 + }, + { + "epoch": 4.401473296500921, + "grad_norm": 0.3215656280517578, + "learning_rate": 6.200958469712708e-05, + "loss": 1.7748, + "step": 14340 + }, + { + "epoch": 4.401780233271946, + "grad_norm": 0.43553170561790466, + "learning_rate": 6.200475958870553e-05, + "loss": 1.771, + "step": 14341 + }, + { + "epoch": 4.402087170042972, + "grad_norm": 0.3112131953239441, + "learning_rate": 6.19999343616459e-05, + "loss": 1.7655, + "step": 14342 + }, + { + "epoch": 4.402394106813996, + "grad_norm": 0.25197842717170715, + "learning_rate": 6.199510901599589e-05, + "loss": 1.7214, + "step": 14343 + }, + { + "epoch": 4.402701043585021, + "grad_norm": 0.33227142691612244, + "learning_rate": 6.19902835518032e-05, + "loss": 1.7332, + "step": 14344 + }, + { + "epoch": 4.403007980356047, + "grad_norm": 0.27962982654571533, + "learning_rate": 6.198545796911548e-05, + "loss": 1.6943, + "step": 14345 + }, + { + "epoch": 4.403314917127072, + "grad_norm": 0.24374182522296906, + "learning_rate": 6.198063226798044e-05, + "loss": 1.7222, + "step": 14346 + }, + { + "epoch": 4.403621853898097, + "grad_norm": 0.3101944625377655, + "learning_rate": 6.197580644844576e-05, + "loss": 1.7113, + "step": 14347 + }, + { + "epoch": 4.403928790669122, + "grad_norm": 0.25919321179389954, + "learning_rate": 6.197098051055916e-05, + "loss": 1.71, + "step": 14348 + }, + { + "epoch": 4.404235727440147, + "grad_norm": 0.23140330612659454, + "learning_rate": 6.19661544543683e-05, + "loss": 1.7472, + "step": 14349 + }, + { + 
"epoch": 4.4045426642111725, + "grad_norm": 0.3274286687374115, + "learning_rate": 6.19613282799209e-05, + "loss": 1.7093, + "step": 14350 + }, + { + "epoch": 4.404849600982198, + "grad_norm": 0.3187442123889923, + "learning_rate": 6.195650198726464e-05, + "loss": 1.7488, + "step": 14351 + }, + { + "epoch": 4.405156537753223, + "grad_norm": 0.20547433197498322, + "learning_rate": 6.195167557644722e-05, + "loss": 1.7295, + "step": 14352 + }, + { + "epoch": 4.4054634745242485, + "grad_norm": 0.2623414993286133, + "learning_rate": 6.194684904751633e-05, + "loss": 1.8258, + "step": 14353 + }, + { + "epoch": 4.405770411295273, + "grad_norm": 0.2468457818031311, + "learning_rate": 6.194202240051967e-05, + "loss": 1.6957, + "step": 14354 + }, + { + "epoch": 4.406077348066298, + "grad_norm": 0.2082364559173584, + "learning_rate": 6.193719563550496e-05, + "loss": 1.7596, + "step": 14355 + }, + { + "epoch": 4.406384284837324, + "grad_norm": 0.27072983980178833, + "learning_rate": 6.193236875251988e-05, + "loss": 1.7341, + "step": 14356 + }, + { + "epoch": 4.406691221608349, + "grad_norm": 0.2630362808704376, + "learning_rate": 6.192754175161215e-05, + "loss": 1.7664, + "step": 14357 + }, + { + "epoch": 4.406998158379374, + "grad_norm": 0.25400006771087646, + "learning_rate": 6.192271463282944e-05, + "loss": 1.7582, + "step": 14358 + }, + { + "epoch": 4.407305095150399, + "grad_norm": 0.22256311774253845, + "learning_rate": 6.191788739621949e-05, + "loss": 1.7389, + "step": 14359 + }, + { + "epoch": 4.407612031921424, + "grad_norm": 0.2160387486219406, + "learning_rate": 6.191306004182999e-05, + "loss": 1.7051, + "step": 14360 + }, + { + "epoch": 4.407918968692449, + "grad_norm": 0.20665684342384338, + "learning_rate": 6.190823256970865e-05, + "loss": 1.7606, + "step": 14361 + }, + { + "epoch": 4.408225905463475, + "grad_norm": 0.2173188328742981, + "learning_rate": 6.190340497990318e-05, + "loss": 1.7944, + "step": 14362 + }, + { + "epoch": 4.4085328422345, + "grad_norm": 
0.189287930727005, + "learning_rate": 6.189857727246127e-05, + "loss": 1.7283, + "step": 14363 + }, + { + "epoch": 4.4088397790055245, + "grad_norm": 0.2531645596027374, + "learning_rate": 6.189374944743065e-05, + "loss": 1.7554, + "step": 14364 + }, + { + "epoch": 4.40914671577655, + "grad_norm": 0.25439125299453735, + "learning_rate": 6.188892150485903e-05, + "loss": 1.8032, + "step": 14365 + }, + { + "epoch": 4.409453652547575, + "grad_norm": 0.20938685536384583, + "learning_rate": 6.188409344479412e-05, + "loss": 1.7385, + "step": 14366 + }, + { + "epoch": 4.4097605893186005, + "grad_norm": 0.20471477508544922, + "learning_rate": 6.187926526728364e-05, + "loss": 1.7487, + "step": 14367 + }, + { + "epoch": 4.410067526089626, + "grad_norm": 0.2381851226091385, + "learning_rate": 6.187443697237529e-05, + "loss": 1.7443, + "step": 14368 + }, + { + "epoch": 4.41037446286065, + "grad_norm": 0.21584098041057587, + "learning_rate": 6.18696085601168e-05, + "loss": 1.7818, + "step": 14369 + }, + { + "epoch": 4.410681399631676, + "grad_norm": 0.2575368583202362, + "learning_rate": 6.186478003055587e-05, + "loss": 1.8204, + "step": 14370 + }, + { + "epoch": 4.410988336402701, + "grad_norm": 0.21133238077163696, + "learning_rate": 6.185995138374024e-05, + "loss": 1.7274, + "step": 14371 + }, + { + "epoch": 4.411295273173726, + "grad_norm": 0.24918322265148163, + "learning_rate": 6.18551226197176e-05, + "loss": 1.8021, + "step": 14372 + }, + { + "epoch": 4.411602209944752, + "grad_norm": 0.2253655642271042, + "learning_rate": 6.185029373853572e-05, + "loss": 1.7308, + "step": 14373 + }, + { + "epoch": 4.411909146715777, + "grad_norm": 0.20098713040351868, + "learning_rate": 6.184546474024226e-05, + "loss": 1.7549, + "step": 14374 + }, + { + "epoch": 4.412216083486801, + "grad_norm": 0.25612789392471313, + "learning_rate": 6.1840635624885e-05, + "loss": 1.8305, + "step": 14375 + }, + { + "epoch": 4.412523020257827, + "grad_norm": 0.24287539720535278, + "learning_rate": 
6.183580639251164e-05, + "loss": 1.7339, + "step": 14376 + }, + { + "epoch": 4.412829957028852, + "grad_norm": 0.2304944545030594, + "learning_rate": 6.183097704316988e-05, + "loss": 1.7023, + "step": 14377 + }, + { + "epoch": 4.413136893799877, + "grad_norm": 0.21911773085594177, + "learning_rate": 6.18261475769075e-05, + "loss": 1.7305, + "step": 14378 + }, + { + "epoch": 4.413443830570903, + "grad_norm": 0.24207864701747894, + "learning_rate": 6.182131799377217e-05, + "loss": 1.7318, + "step": 14379 + }, + { + "epoch": 4.413750767341927, + "grad_norm": 0.2551634609699249, + "learning_rate": 6.181648829381165e-05, + "loss": 1.8101, + "step": 14380 + }, + { + "epoch": 4.4140577041129525, + "grad_norm": 0.4114011526107788, + "learning_rate": 6.181165847707368e-05, + "loss": 1.772, + "step": 14381 + }, + { + "epoch": 4.414364640883978, + "grad_norm": 0.4592796862125397, + "learning_rate": 6.180682854360598e-05, + "loss": 1.7359, + "step": 14382 + }, + { + "epoch": 4.414671577655003, + "grad_norm": 0.2599259614944458, + "learning_rate": 6.180199849345627e-05, + "loss": 1.7028, + "step": 14383 + }, + { + "epoch": 4.4149785144260285, + "grad_norm": 0.3489506244659424, + "learning_rate": 6.17971683266723e-05, + "loss": 1.8252, + "step": 14384 + }, + { + "epoch": 4.415285451197054, + "grad_norm": 0.44563809037208557, + "learning_rate": 6.179233804330179e-05, + "loss": 1.6894, + "step": 14385 + }, + { + "epoch": 4.415592387968078, + "grad_norm": 0.2596888542175293, + "learning_rate": 6.17875076433925e-05, + "loss": 1.8141, + "step": 14386 + }, + { + "epoch": 4.415899324739104, + "grad_norm": 0.3560626804828644, + "learning_rate": 6.178267712699213e-05, + "loss": 1.7764, + "step": 14387 + }, + { + "epoch": 4.416206261510129, + "grad_norm": 0.3746717572212219, + "learning_rate": 6.177784649414843e-05, + "loss": 1.7528, + "step": 14388 + }, + { + "epoch": 4.416513198281154, + "grad_norm": 0.23248885571956635, + "learning_rate": 6.177301574490918e-05, + "loss": 1.7148, + 
"step": 14389 + }, + { + "epoch": 4.41682013505218, + "grad_norm": 0.26936978101730347, + "learning_rate": 6.176818487932208e-05, + "loss": 1.7199, + "step": 14390 + }, + { + "epoch": 4.417127071823204, + "grad_norm": 0.3102504014968872, + "learning_rate": 6.176335389743486e-05, + "loss": 1.6886, + "step": 14391 + }, + { + "epoch": 4.417434008594229, + "grad_norm": 0.24406832456588745, + "learning_rate": 6.175852279929531e-05, + "loss": 1.7766, + "step": 14392 + }, + { + "epoch": 4.417740945365255, + "grad_norm": 0.271158903837204, + "learning_rate": 6.175369158495112e-05, + "loss": 1.8099, + "step": 14393 + }, + { + "epoch": 4.41804788213628, + "grad_norm": 0.343667209148407, + "learning_rate": 6.174886025445008e-05, + "loss": 1.779, + "step": 14394 + }, + { + "epoch": 4.418354818907305, + "grad_norm": 0.37423139810562134, + "learning_rate": 6.17440288078399e-05, + "loss": 1.7796, + "step": 14395 + }, + { + "epoch": 4.41866175567833, + "grad_norm": 0.3152335286140442, + "learning_rate": 6.173919724516836e-05, + "loss": 1.7388, + "step": 14396 + }, + { + "epoch": 4.418968692449355, + "grad_norm": 0.21467824280261993, + "learning_rate": 6.173436556648319e-05, + "loss": 1.7689, + "step": 14397 + }, + { + "epoch": 4.4192756292203805, + "grad_norm": 0.2861369848251343, + "learning_rate": 6.172953377183213e-05, + "loss": 1.819, + "step": 14398 + }, + { + "epoch": 4.419582565991406, + "grad_norm": 0.34777504205703735, + "learning_rate": 6.172470186126295e-05, + "loss": 1.7444, + "step": 14399 + }, + { + "epoch": 4.419889502762431, + "grad_norm": 0.2728833854198456, + "learning_rate": 6.171986983482339e-05, + "loss": 1.7637, + "step": 14400 + }, + { + "epoch": 4.420196439533456, + "grad_norm": 0.2593914270401001, + "learning_rate": 6.17150376925612e-05, + "loss": 1.8196, + "step": 14401 + }, + { + "epoch": 4.420503376304481, + "grad_norm": 0.29425305128097534, + "learning_rate": 6.171020543452416e-05, + "loss": 1.7511, + "step": 14402 + }, + { + "epoch": 
4.420810313075506, + "grad_norm": 0.2587110102176666, + "learning_rate": 6.170537306076e-05, + "loss": 1.8085, + "step": 14403 + }, + { + "epoch": 4.421117249846532, + "grad_norm": 0.22442933917045593, + "learning_rate": 6.170054057131648e-05, + "loss": 1.8023, + "step": 14404 + }, + { + "epoch": 4.421424186617557, + "grad_norm": 0.23302629590034485, + "learning_rate": 6.169570796624136e-05, + "loss": 1.7995, + "step": 14405 + }, + { + "epoch": 4.421731123388582, + "grad_norm": 0.2295885682106018, + "learning_rate": 6.169087524558239e-05, + "loss": 1.7948, + "step": 14406 + }, + { + "epoch": 4.422038060159607, + "grad_norm": 0.2161262482404709, + "learning_rate": 6.168604240938735e-05, + "loss": 1.7159, + "step": 14407 + }, + { + "epoch": 4.422344996930632, + "grad_norm": 0.20746205747127533, + "learning_rate": 6.1681209457704e-05, + "loss": 1.7703, + "step": 14408 + }, + { + "epoch": 4.422651933701657, + "grad_norm": 0.25677376985549927, + "learning_rate": 6.167637639058006e-05, + "loss": 1.7819, + "step": 14409 + }, + { + "epoch": 4.422958870472683, + "grad_norm": 0.226568341255188, + "learning_rate": 6.167154320806336e-05, + "loss": 1.7661, + "step": 14410 + }, + { + "epoch": 4.423265807243708, + "grad_norm": 0.22997824847698212, + "learning_rate": 6.166670991020162e-05, + "loss": 1.7364, + "step": 14411 + }, + { + "epoch": 4.4235727440147325, + "grad_norm": 0.2528770864009857, + "learning_rate": 6.166187649704261e-05, + "loss": 1.8505, + "step": 14412 + }, + { + "epoch": 4.423879680785758, + "grad_norm": 0.27278614044189453, + "learning_rate": 6.165704296863409e-05, + "loss": 1.7855, + "step": 14413 + }, + { + "epoch": 4.424186617556783, + "grad_norm": 0.23086364567279816, + "learning_rate": 6.165220932502385e-05, + "loss": 1.7489, + "step": 14414 + }, + { + "epoch": 4.4244935543278086, + "grad_norm": 0.2570587396621704, + "learning_rate": 6.164737556625965e-05, + "loss": 1.8008, + "step": 14415 + }, + { + "epoch": 4.424800491098834, + "grad_norm": 
0.2637264132499695, + "learning_rate": 6.164254169238923e-05, + "loss": 1.7563, + "step": 14416 + }, + { + "epoch": 4.425107427869859, + "grad_norm": 0.23046623170375824, + "learning_rate": 6.163770770346043e-05, + "loss": 1.7433, + "step": 14417 + }, + { + "epoch": 4.425414364640884, + "grad_norm": 0.2531467080116272, + "learning_rate": 6.163287359952095e-05, + "loss": 1.8122, + "step": 14418 + }, + { + "epoch": 4.425721301411909, + "grad_norm": 0.26507216691970825, + "learning_rate": 6.162803938061861e-05, + "loss": 1.7019, + "step": 14419 + }, + { + "epoch": 4.426028238182934, + "grad_norm": 0.229641854763031, + "learning_rate": 6.162320504680117e-05, + "loss": 1.7518, + "step": 14420 + }, + { + "epoch": 4.42633517495396, + "grad_norm": 0.22777152061462402, + "learning_rate": 6.161837059811641e-05, + "loss": 1.8094, + "step": 14421 + }, + { + "epoch": 4.426642111724985, + "grad_norm": 0.22121338546276093, + "learning_rate": 6.161353603461209e-05, + "loss": 1.7204, + "step": 14422 + }, + { + "epoch": 4.4269490484960095, + "grad_norm": 0.21914128959178925, + "learning_rate": 6.1608701356336e-05, + "loss": 1.7554, + "step": 14423 + }, + { + "epoch": 4.427255985267035, + "grad_norm": 0.22649390995502472, + "learning_rate": 6.160386656333593e-05, + "loss": 1.8058, + "step": 14424 + }, + { + "epoch": 4.42756292203806, + "grad_norm": 0.24529023468494415, + "learning_rate": 6.159903165565964e-05, + "loss": 1.7302, + "step": 14425 + }, + { + "epoch": 4.4278698588090855, + "grad_norm": 0.2726481854915619, + "learning_rate": 6.159419663335492e-05, + "loss": 1.825, + "step": 14426 + }, + { + "epoch": 4.428176795580111, + "grad_norm": 0.2772440016269684, + "learning_rate": 6.158936149646957e-05, + "loss": 1.7322, + "step": 14427 + }, + { + "epoch": 4.428483732351136, + "grad_norm": 0.29778853058815, + "learning_rate": 6.158452624505135e-05, + "loss": 1.7421, + "step": 14428 + }, + { + "epoch": 4.428790669122161, + "grad_norm": 0.21327480673789978, + "learning_rate": 
6.157969087914804e-05, + "loss": 1.7269, + "step": 14429 + }, + { + "epoch": 4.429097605893186, + "grad_norm": 0.2718868851661682, + "learning_rate": 6.157485539880744e-05, + "loss": 1.7817, + "step": 14430 + }, + { + "epoch": 4.429404542664211, + "grad_norm": 0.32242509722709656, + "learning_rate": 6.157001980407735e-05, + "loss": 1.7115, + "step": 14431 + }, + { + "epoch": 4.429711479435237, + "grad_norm": 0.2931978106498718, + "learning_rate": 6.156518409500553e-05, + "loss": 1.7822, + "step": 14432 + }, + { + "epoch": 4.430018416206262, + "grad_norm": 0.229528546333313, + "learning_rate": 6.156034827163977e-05, + "loss": 1.7623, + "step": 14433 + }, + { + "epoch": 4.430325352977286, + "grad_norm": 0.28702354431152344, + "learning_rate": 6.15555123340279e-05, + "loss": 1.8101, + "step": 14434 + }, + { + "epoch": 4.430632289748312, + "grad_norm": 0.27162131667137146, + "learning_rate": 6.155067628221766e-05, + "loss": 1.7525, + "step": 14435 + }, + { + "epoch": 4.430939226519337, + "grad_norm": 0.24290388822555542, + "learning_rate": 6.154584011625688e-05, + "loss": 1.8701, + "step": 14436 + }, + { + "epoch": 4.431246163290362, + "grad_norm": 0.3055405020713806, + "learning_rate": 6.154100383619334e-05, + "loss": 1.8659, + "step": 14437 + }, + { + "epoch": 4.431553100061388, + "grad_norm": 0.24528950452804565, + "learning_rate": 6.153616744207483e-05, + "loss": 1.8493, + "step": 14438 + }, + { + "epoch": 4.431860036832412, + "grad_norm": 0.2611897587776184, + "learning_rate": 6.153133093394917e-05, + "loss": 1.7905, + "step": 14439 + }, + { + "epoch": 4.4321669736034375, + "grad_norm": 0.2172730267047882, + "learning_rate": 6.15264943118641e-05, + "loss": 1.7087, + "step": 14440 + }, + { + "epoch": 4.432473910374463, + "grad_norm": 0.2320949286222458, + "learning_rate": 6.152165757586749e-05, + "loss": 1.7473, + "step": 14441 + }, + { + "epoch": 4.432780847145488, + "grad_norm": 0.2602086365222931, + "learning_rate": 6.15168207260071e-05, + "loss": 1.7365, + 
"step": 14442 + }, + { + "epoch": 4.4330877839165135, + "grad_norm": 0.25193190574645996, + "learning_rate": 6.151198376233074e-05, + "loss": 1.8205, + "step": 14443 + }, + { + "epoch": 4.433394720687538, + "grad_norm": 0.2894204556941986, + "learning_rate": 6.150714668488621e-05, + "loss": 1.7759, + "step": 14444 + }, + { + "epoch": 4.433701657458563, + "grad_norm": 0.24150310456752777, + "learning_rate": 6.150230949372131e-05, + "loss": 1.8415, + "step": 14445 + }, + { + "epoch": 4.434008594229589, + "grad_norm": 0.23475918173789978, + "learning_rate": 6.149747218888384e-05, + "loss": 1.7487, + "step": 14446 + }, + { + "epoch": 4.434315531000614, + "grad_norm": 0.29425546526908875, + "learning_rate": 6.149263477042162e-05, + "loss": 1.7538, + "step": 14447 + }, + { + "epoch": 4.434622467771639, + "grad_norm": 0.26241615414619446, + "learning_rate": 6.148779723838244e-05, + "loss": 1.7564, + "step": 14448 + }, + { + "epoch": 4.434929404542665, + "grad_norm": 0.23195287585258484, + "learning_rate": 6.148295959281411e-05, + "loss": 1.837, + "step": 14449 + }, + { + "epoch": 4.435236341313689, + "grad_norm": 0.34972792863845825, + "learning_rate": 6.147812183376445e-05, + "loss": 1.7632, + "step": 14450 + }, + { + "epoch": 4.435543278084714, + "grad_norm": 0.3536125719547272, + "learning_rate": 6.147328396128126e-05, + "loss": 1.8372, + "step": 14451 + }, + { + "epoch": 4.43585021485574, + "grad_norm": 0.2086079865694046, + "learning_rate": 6.146844597541235e-05, + "loss": 1.7014, + "step": 14452 + }, + { + "epoch": 4.436157151626765, + "grad_norm": 0.25547802448272705, + "learning_rate": 6.146360787620554e-05, + "loss": 1.7544, + "step": 14453 + }, + { + "epoch": 4.43646408839779, + "grad_norm": 0.26176998019218445, + "learning_rate": 6.145876966370864e-05, + "loss": 1.7617, + "step": 14454 + }, + { + "epoch": 4.436771025168815, + "grad_norm": 0.2672959566116333, + "learning_rate": 6.145393133796946e-05, + "loss": 1.8178, + "step": 14455 + }, + { + "epoch": 
4.43707796193984, + "grad_norm": 0.23373909294605255, + "learning_rate": 6.144909289903582e-05, + "loss": 1.7295, + "step": 14456 + }, + { + "epoch": 4.4373848987108655, + "grad_norm": 0.2369835078716278, + "learning_rate": 6.144425434695551e-05, + "loss": 1.8097, + "step": 14457 + }, + { + "epoch": 4.437691835481891, + "grad_norm": 0.25528979301452637, + "learning_rate": 6.14394156817764e-05, + "loss": 1.7523, + "step": 14458 + }, + { + "epoch": 4.437998772252916, + "grad_norm": 0.2541787624359131, + "learning_rate": 6.143457690354626e-05, + "loss": 1.7606, + "step": 14459 + }, + { + "epoch": 4.4383057090239415, + "grad_norm": 0.2032637745141983, + "learning_rate": 6.142973801231295e-05, + "loss": 1.7967, + "step": 14460 + }, + { + "epoch": 4.438612645794966, + "grad_norm": 0.2413996160030365, + "learning_rate": 6.142489900812426e-05, + "loss": 1.7688, + "step": 14461 + }, + { + "epoch": 4.438919582565991, + "grad_norm": 0.43451038002967834, + "learning_rate": 6.142005989102803e-05, + "loss": 1.8269, + "step": 14462 + }, + { + "epoch": 4.439226519337017, + "grad_norm": 0.23981481790542603, + "learning_rate": 6.141522066107206e-05, + "loss": 1.7628, + "step": 14463 + }, + { + "epoch": 4.439533456108042, + "grad_norm": 0.25396493077278137, + "learning_rate": 6.14103813183042e-05, + "loss": 1.7913, + "step": 14464 + }, + { + "epoch": 4.439840392879067, + "grad_norm": 0.2567536532878876, + "learning_rate": 6.140554186277225e-05, + "loss": 1.7612, + "step": 14465 + }, + { + "epoch": 4.440147329650092, + "grad_norm": 0.2201337069272995, + "learning_rate": 6.140070229452406e-05, + "loss": 1.7541, + "step": 14466 + }, + { + "epoch": 4.440454266421117, + "grad_norm": 0.24202953279018402, + "learning_rate": 6.139586261360746e-05, + "loss": 1.777, + "step": 14467 + }, + { + "epoch": 4.440761203192142, + "grad_norm": 0.23891687393188477, + "learning_rate": 6.139102282007024e-05, + "loss": 1.7509, + "step": 14468 + }, + { + "epoch": 4.441068139963168, + "grad_norm": 
0.21132555603981018, + "learning_rate": 6.138618291396026e-05, + "loss": 1.7362, + "step": 14469 + }, + { + "epoch": 4.441375076734193, + "grad_norm": 0.2731861472129822, + "learning_rate": 6.138134289532536e-05, + "loss": 1.8063, + "step": 14470 + }, + { + "epoch": 4.4416820135052175, + "grad_norm": 0.29503315687179565, + "learning_rate": 6.137650276421336e-05, + "loss": 1.7193, + "step": 14471 + }, + { + "epoch": 4.441988950276243, + "grad_norm": 0.2778526544570923, + "learning_rate": 6.137166252067208e-05, + "loss": 1.7507, + "step": 14472 + }, + { + "epoch": 4.442295887047268, + "grad_norm": 0.2907710075378418, + "learning_rate": 6.136682216474938e-05, + "loss": 1.7939, + "step": 14473 + }, + { + "epoch": 4.4426028238182935, + "grad_norm": 0.4133768379688263, + "learning_rate": 6.136198169649306e-05, + "loss": 1.8012, + "step": 14474 + }, + { + "epoch": 4.442909760589319, + "grad_norm": 0.2505052983760834, + "learning_rate": 6.135714111595099e-05, + "loss": 1.8426, + "step": 14475 + }, + { + "epoch": 4.443216697360343, + "grad_norm": 0.3884379267692566, + "learning_rate": 6.135230042317099e-05, + "loss": 1.7383, + "step": 14476 + }, + { + "epoch": 4.443523634131369, + "grad_norm": 0.42902377247810364, + "learning_rate": 6.134745961820091e-05, + "loss": 1.732, + "step": 14477 + }, + { + "epoch": 4.443830570902394, + "grad_norm": 0.21782708168029785, + "learning_rate": 6.134261870108858e-05, + "loss": 1.7369, + "step": 14478 + }, + { + "epoch": 4.444137507673419, + "grad_norm": 0.4160648286342621, + "learning_rate": 6.133777767188186e-05, + "loss": 1.8083, + "step": 14479 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.5057216882705688, + "learning_rate": 6.133293653062856e-05, + "loss": 1.8971, + "step": 14480 + }, + { + "epoch": 4.44475138121547, + "grad_norm": 0.2189750075340271, + "learning_rate": 6.132809527737654e-05, + "loss": 1.7508, + "step": 14481 + }, + { + "epoch": 4.445058317986494, + "grad_norm": 0.4415782392024994, + "learning_rate": 
6.132325391217364e-05, + "loss": 1.8548, + "step": 14482 + }, + { + "epoch": 4.44536525475752, + "grad_norm": 0.3907296359539032, + "learning_rate": 6.13184124350677e-05, + "loss": 1.7879, + "step": 14483 + }, + { + "epoch": 4.445672191528545, + "grad_norm": 0.24117955565452576, + "learning_rate": 6.131357084610659e-05, + "loss": 1.7227, + "step": 14484 + }, + { + "epoch": 4.44597912829957, + "grad_norm": 0.3083679974079132, + "learning_rate": 6.130872914533815e-05, + "loss": 1.7505, + "step": 14485 + }, + { + "epoch": 4.446286065070596, + "grad_norm": 0.27730658650398254, + "learning_rate": 6.13038873328102e-05, + "loss": 1.7485, + "step": 14486 + }, + { + "epoch": 4.44659300184162, + "grad_norm": 0.28548410534858704, + "learning_rate": 6.12990454085706e-05, + "loss": 1.8145, + "step": 14487 + }, + { + "epoch": 4.4468999386126455, + "grad_norm": 0.24743106961250305, + "learning_rate": 6.129420337266724e-05, + "loss": 1.7131, + "step": 14488 + }, + { + "epoch": 4.447206875383671, + "grad_norm": 0.2899693250656128, + "learning_rate": 6.128936122514794e-05, + "loss": 1.8567, + "step": 14489 + }, + { + "epoch": 4.447513812154696, + "grad_norm": 0.259916752576828, + "learning_rate": 6.128451896606053e-05, + "loss": 1.7563, + "step": 14490 + }, + { + "epoch": 4.4478207489257215, + "grad_norm": 0.21112586557865143, + "learning_rate": 6.12796765954529e-05, + "loss": 1.6975, + "step": 14491 + }, + { + "epoch": 4.448127685696747, + "grad_norm": 0.2890239953994751, + "learning_rate": 6.12748341133729e-05, + "loss": 1.7904, + "step": 14492 + }, + { + "epoch": 4.448434622467771, + "grad_norm": 0.23394012451171875, + "learning_rate": 6.126999151986839e-05, + "loss": 1.7559, + "step": 14493 + }, + { + "epoch": 4.448741559238797, + "grad_norm": 0.3492949903011322, + "learning_rate": 6.12651488149872e-05, + "loss": 1.7734, + "step": 14494 + }, + { + "epoch": 4.449048496009822, + "grad_norm": 0.48309218883514404, + "learning_rate": 6.126030599877723e-05, + "loss": 1.7798, + "step": 
14495 + }, + { + "epoch": 4.449355432780847, + "grad_norm": 0.341146320104599, + "learning_rate": 6.12554630712863e-05, + "loss": 1.7921, + "step": 14496 + }, + { + "epoch": 4.449662369551873, + "grad_norm": 0.223160982131958, + "learning_rate": 6.125062003256229e-05, + "loss": 1.7784, + "step": 14497 + }, + { + "epoch": 4.449969306322897, + "grad_norm": 0.32664811611175537, + "learning_rate": 6.124577688265306e-05, + "loss": 1.7353, + "step": 14498 + }, + { + "epoch": 4.4502762430939224, + "grad_norm": 0.215936541557312, + "learning_rate": 6.124093362160646e-05, + "loss": 1.68, + "step": 14499 + }, + { + "epoch": 4.450583179864948, + "grad_norm": 0.26081225275993347, + "learning_rate": 6.123609024947038e-05, + "loss": 1.7107, + "step": 14500 + }, + { + "epoch": 4.450890116635973, + "grad_norm": 0.3124069571495056, + "learning_rate": 6.123124676629267e-05, + "loss": 1.7338, + "step": 14501 + }, + { + "epoch": 4.4511970534069984, + "grad_norm": 0.23125620186328888, + "learning_rate": 6.122640317212118e-05, + "loss": 1.7842, + "step": 14502 + }, + { + "epoch": 4.451503990178024, + "grad_norm": 0.27065595984458923, + "learning_rate": 6.122155946700381e-05, + "loss": 1.7284, + "step": 14503 + }, + { + "epoch": 4.451810926949048, + "grad_norm": 0.4677436053752899, + "learning_rate": 6.121671565098841e-05, + "loss": 1.8156, + "step": 14504 + }, + { + "epoch": 4.452117863720074, + "grad_norm": 0.36325082182884216, + "learning_rate": 6.121187172412285e-05, + "loss": 1.7875, + "step": 14505 + }, + { + "epoch": 4.452424800491099, + "grad_norm": 0.23409567773342133, + "learning_rate": 6.1207027686455e-05, + "loss": 1.7421, + "step": 14506 + }, + { + "epoch": 4.452731737262124, + "grad_norm": 0.36919257044792175, + "learning_rate": 6.120218353803273e-05, + "loss": 1.7545, + "step": 14507 + }, + { + "epoch": 4.45303867403315, + "grad_norm": 0.318452388048172, + "learning_rate": 6.119733927890393e-05, + "loss": 1.7179, + "step": 14508 + }, + { + "epoch": 4.453345610804174, + 
"grad_norm": 0.21279768645763397, + "learning_rate": 6.119249490911643e-05, + "loss": 1.7534, + "step": 14509 + }, + { + "epoch": 4.453652547575199, + "grad_norm": 0.30565473437309265, + "learning_rate": 6.118765042871816e-05, + "loss": 1.7962, + "step": 14510 + }, + { + "epoch": 4.453959484346225, + "grad_norm": 0.2608480453491211, + "learning_rate": 6.118280583775697e-05, + "loss": 1.7336, + "step": 14511 + }, + { + "epoch": 4.45426642111725, + "grad_norm": 0.22978845238685608, + "learning_rate": 6.117796113628075e-05, + "loss": 1.8244, + "step": 14512 + }, + { + "epoch": 4.454573357888275, + "grad_norm": 0.26357781887054443, + "learning_rate": 6.117311632433735e-05, + "loss": 1.7425, + "step": 14513 + }, + { + "epoch": 4.4548802946593, + "grad_norm": 0.22127102315425873, + "learning_rate": 6.116827140197467e-05, + "loss": 1.7679, + "step": 14514 + }, + { + "epoch": 4.455187231430325, + "grad_norm": 0.2876584231853485, + "learning_rate": 6.116342636924058e-05, + "loss": 1.8104, + "step": 14515 + }, + { + "epoch": 4.4554941682013505, + "grad_norm": 0.28290677070617676, + "learning_rate": 6.115858122618297e-05, + "loss": 1.7485, + "step": 14516 + }, + { + "epoch": 4.455801104972376, + "grad_norm": 0.21914640069007874, + "learning_rate": 6.115373597284974e-05, + "loss": 1.7736, + "step": 14517 + }, + { + "epoch": 4.456108041743401, + "grad_norm": 0.2603909969329834, + "learning_rate": 6.114889060928873e-05, + "loss": 1.7446, + "step": 14518 + }, + { + "epoch": 4.456414978514426, + "grad_norm": 0.2157236635684967, + "learning_rate": 6.114404513554784e-05, + "loss": 1.7594, + "step": 14519 + }, + { + "epoch": 4.456721915285451, + "grad_norm": 0.27622368931770325, + "learning_rate": 6.113919955167499e-05, + "loss": 1.8154, + "step": 14520 + }, + { + "epoch": 4.457028852056476, + "grad_norm": 0.27298516035079956, + "learning_rate": 6.113435385771803e-05, + "loss": 1.7458, + "step": 14521 + }, + { + "epoch": 4.457335788827502, + "grad_norm": 0.22220586240291595, + 
"learning_rate": 6.112950805372485e-05, + "loss": 1.7102, + "step": 14522 + }, + { + "epoch": 4.457642725598527, + "grad_norm": 0.19480876624584198, + "learning_rate": 6.112466213974336e-05, + "loss": 1.7696, + "step": 14523 + }, + { + "epoch": 4.457949662369552, + "grad_norm": 0.24261653423309326, + "learning_rate": 6.111981611582144e-05, + "loss": 1.8193, + "step": 14524 + }, + { + "epoch": 4.458256599140577, + "grad_norm": 0.2502967417240143, + "learning_rate": 6.111496998200697e-05, + "loss": 1.7701, + "step": 14525 + }, + { + "epoch": 4.458563535911602, + "grad_norm": 0.25764599442481995, + "learning_rate": 6.111012373834786e-05, + "loss": 1.8055, + "step": 14526 + }, + { + "epoch": 4.458870472682627, + "grad_norm": 0.24085427820682526, + "learning_rate": 6.110527738489198e-05, + "loss": 1.7592, + "step": 14527 + }, + { + "epoch": 4.459177409453653, + "grad_norm": 0.2469809502363205, + "learning_rate": 6.110043092168727e-05, + "loss": 1.6977, + "step": 14528 + }, + { + "epoch": 4.459484346224678, + "grad_norm": 0.21888838708400726, + "learning_rate": 6.109558434878159e-05, + "loss": 1.777, + "step": 14529 + }, + { + "epoch": 4.4597912829957025, + "grad_norm": 0.2094014585018158, + "learning_rate": 6.109073766622281e-05, + "loss": 1.7041, + "step": 14530 + }, + { + "epoch": 4.460098219766728, + "grad_norm": 0.23801055550575256, + "learning_rate": 6.108589087405888e-05, + "loss": 1.8392, + "step": 14531 + }, + { + "epoch": 4.460405156537753, + "grad_norm": 0.2164965718984604, + "learning_rate": 6.108104397233769e-05, + "loss": 1.7643, + "step": 14532 + }, + { + "epoch": 4.4607120933087785, + "grad_norm": 0.21322336792945862, + "learning_rate": 6.107619696110712e-05, + "loss": 1.7063, + "step": 14533 + }, + { + "epoch": 4.461019030079804, + "grad_norm": 0.29019200801849365, + "learning_rate": 6.107134984041507e-05, + "loss": 1.8254, + "step": 14534 + }, + { + "epoch": 4.461325966850829, + "grad_norm": 0.2765025496482849, + "learning_rate": 6.106650261030947e-05, 
+ "loss": 1.7609, + "step": 14535 + }, + { + "epoch": 4.461632903621854, + "grad_norm": 0.20879749953746796, + "learning_rate": 6.106165527083818e-05, + "loss": 1.7387, + "step": 14536 + }, + { + "epoch": 4.461939840392879, + "grad_norm": 0.22295843064785004, + "learning_rate": 6.105680782204913e-05, + "loss": 1.7691, + "step": 14537 + }, + { + "epoch": 4.462246777163904, + "grad_norm": 0.23502351343631744, + "learning_rate": 6.105196026399025e-05, + "loss": 1.7335, + "step": 14538 + }, + { + "epoch": 4.46255371393493, + "grad_norm": 0.22143007814884186, + "learning_rate": 6.104711259670941e-05, + "loss": 1.7338, + "step": 14539 + }, + { + "epoch": 4.462860650705955, + "grad_norm": 0.22361041605472565, + "learning_rate": 6.104226482025453e-05, + "loss": 1.7033, + "step": 14540 + }, + { + "epoch": 4.463167587476979, + "grad_norm": 0.27104905247688293, + "learning_rate": 6.10374169346735e-05, + "loss": 1.7926, + "step": 14541 + }, + { + "epoch": 4.463474524248005, + "grad_norm": 0.23564264178276062, + "learning_rate": 6.103256894001427e-05, + "loss": 1.7522, + "step": 14542 + }, + { + "epoch": 4.46378146101903, + "grad_norm": 0.2585970163345337, + "learning_rate": 6.102772083632471e-05, + "loss": 1.7755, + "step": 14543 + }, + { + "epoch": 4.464088397790055, + "grad_norm": 0.358634889125824, + "learning_rate": 6.102287262365276e-05, + "loss": 1.8092, + "step": 14544 + }, + { + "epoch": 4.464395334561081, + "grad_norm": 0.2862946689128876, + "learning_rate": 6.1018024302046314e-05, + "loss": 1.7051, + "step": 14545 + }, + { + "epoch": 4.464702271332105, + "grad_norm": 0.21907158195972443, + "learning_rate": 6.101317587155331e-05, + "loss": 1.7882, + "step": 14546 + }, + { + "epoch": 4.4650092081031305, + "grad_norm": 0.24268488585948944, + "learning_rate": 6.100832733222164e-05, + "loss": 1.7756, + "step": 14547 + }, + { + "epoch": 4.465316144874156, + "grad_norm": 0.2350744605064392, + "learning_rate": 6.1003478684099214e-05, + "loss": 1.7483, + "step": 14548 + }, + 
{ + "epoch": 4.465623081645181, + "grad_norm": 0.22902250289916992, + "learning_rate": 6.099862992723397e-05, + "loss": 1.7687, + "step": 14549 + }, + { + "epoch": 4.4659300184162065, + "grad_norm": 0.23590944707393646, + "learning_rate": 6.099378106167382e-05, + "loss": 1.8481, + "step": 14550 + }, + { + "epoch": 4.466236955187231, + "grad_norm": 0.23644296824932098, + "learning_rate": 6.098893208746668e-05, + "loss": 1.7422, + "step": 14551 + }, + { + "epoch": 4.466543891958256, + "grad_norm": 0.23782360553741455, + "learning_rate": 6.0984083004660475e-05, + "loss": 1.7852, + "step": 14552 + }, + { + "epoch": 4.466850828729282, + "grad_norm": 0.2546575665473938, + "learning_rate": 6.097923381330313e-05, + "loss": 1.8483, + "step": 14553 + }, + { + "epoch": 4.467157765500307, + "grad_norm": 0.2555409371852875, + "learning_rate": 6.097438451344254e-05, + "loss": 1.7887, + "step": 14554 + }, + { + "epoch": 4.467464702271332, + "grad_norm": 0.28074198961257935, + "learning_rate": 6.0969535105126664e-05, + "loss": 1.7521, + "step": 14555 + }, + { + "epoch": 4.467771639042358, + "grad_norm": 0.22622554004192352, + "learning_rate": 6.096468558840341e-05, + "loss": 1.8088, + "step": 14556 + }, + { + "epoch": 4.468078575813382, + "grad_norm": 0.302749902009964, + "learning_rate": 6.095983596332071e-05, + "loss": 1.8192, + "step": 14557 + }, + { + "epoch": 4.468385512584407, + "grad_norm": 0.27925750613212585, + "learning_rate": 6.0954986229926494e-05, + "loss": 1.8453, + "step": 14558 + }, + { + "epoch": 4.468692449355433, + "grad_norm": 0.2246330976486206, + "learning_rate": 6.095013638826868e-05, + "loss": 1.744, + "step": 14559 + }, + { + "epoch": 4.468999386126458, + "grad_norm": 0.26677101850509644, + "learning_rate": 6.094528643839518e-05, + "loss": 1.708, + "step": 14560 + }, + { + "epoch": 4.469306322897483, + "grad_norm": 0.23684042692184448, + "learning_rate": 6.094043638035396e-05, + "loss": 1.713, + "step": 14561 + }, + { + "epoch": 4.469613259668508, + 
"grad_norm": 0.2470075935125351, + "learning_rate": 6.093558621419294e-05, + "loss": 1.8096, + "step": 14562 + }, + { + "epoch": 4.469920196439533, + "grad_norm": 0.2775517702102661, + "learning_rate": 6.093073593996005e-05, + "loss": 1.697, + "step": 14563 + }, + { + "epoch": 4.4702271332105585, + "grad_norm": 0.21053175628185272, + "learning_rate": 6.092588555770322e-05, + "loss": 1.6894, + "step": 14564 + }, + { + "epoch": 4.470534069981584, + "grad_norm": 0.2555869221687317, + "learning_rate": 6.0921035067470366e-05, + "loss": 1.7051, + "step": 14565 + }, + { + "epoch": 4.470841006752609, + "grad_norm": 0.34468984603881836, + "learning_rate": 6.0916184469309454e-05, + "loss": 1.7317, + "step": 14566 + }, + { + "epoch": 4.4711479435236345, + "grad_norm": 0.2517752945423126, + "learning_rate": 6.0911333763268407e-05, + "loss": 1.7524, + "step": 14567 + }, + { + "epoch": 4.471454880294659, + "grad_norm": 0.2749727666378021, + "learning_rate": 6.090648294939517e-05, + "loss": 1.7045, + "step": 14568 + }, + { + "epoch": 4.471761817065684, + "grad_norm": 0.36250773072242737, + "learning_rate": 6.0901632027737673e-05, + "loss": 1.7196, + "step": 14569 + }, + { + "epoch": 4.47206875383671, + "grad_norm": 0.2317698448896408, + "learning_rate": 6.089678099834386e-05, + "loss": 1.7318, + "step": 14570 + }, + { + "epoch": 4.472375690607735, + "grad_norm": 0.2863345444202423, + "learning_rate": 6.089192986126166e-05, + "loss": 1.7798, + "step": 14571 + }, + { + "epoch": 4.47268262737876, + "grad_norm": 0.3493366241455078, + "learning_rate": 6.088707861653904e-05, + "loss": 1.7749, + "step": 14572 + }, + { + "epoch": 4.472989564149785, + "grad_norm": 0.25718605518341064, + "learning_rate": 6.0882227264223924e-05, + "loss": 1.7683, + "step": 14573 + }, + { + "epoch": 4.47329650092081, + "grad_norm": 0.2320062816143036, + "learning_rate": 6.087737580436426e-05, + "loss": 1.8296, + "step": 14574 + }, + { + "epoch": 4.473603437691835, + "grad_norm": 0.29071560502052307, + 
"learning_rate": 6.087252423700799e-05, + "loss": 1.7428, + "step": 14575 + }, + { + "epoch": 4.473910374462861, + "grad_norm": 0.24233707785606384, + "learning_rate": 6.086767256220306e-05, + "loss": 1.7332, + "step": 14576 + }, + { + "epoch": 4.474217311233886, + "grad_norm": 0.228043332695961, + "learning_rate": 6.086282077999742e-05, + "loss": 1.7697, + "step": 14577 + }, + { + "epoch": 4.474524248004911, + "grad_norm": 0.29154402017593384, + "learning_rate": 6.085796889043902e-05, + "loss": 1.8043, + "step": 14578 + }, + { + "epoch": 4.474831184775936, + "grad_norm": 0.30543211102485657, + "learning_rate": 6.0853116893575814e-05, + "loss": 1.7665, + "step": 14579 + }, + { + "epoch": 4.475138121546961, + "grad_norm": 0.22792959213256836, + "learning_rate": 6.0848264789455754e-05, + "loss": 1.729, + "step": 14580 + }, + { + "epoch": 4.475445058317987, + "grad_norm": 0.2615707218647003, + "learning_rate": 6.084341257812677e-05, + "loss": 1.7438, + "step": 14581 + }, + { + "epoch": 4.475751995089012, + "grad_norm": 0.23342981934547424, + "learning_rate": 6.083856025963681e-05, + "loss": 1.7158, + "step": 14582 + }, + { + "epoch": 4.476058931860037, + "grad_norm": 0.22279240190982819, + "learning_rate": 6.083370783403387e-05, + "loss": 1.7413, + "step": 14583 + }, + { + "epoch": 4.476365868631062, + "grad_norm": 0.28867462277412415, + "learning_rate": 6.082885530136587e-05, + "loss": 1.7932, + "step": 14584 + }, + { + "epoch": 4.476672805402087, + "grad_norm": 0.2947152256965637, + "learning_rate": 6.082400266168078e-05, + "loss": 1.8986, + "step": 14585 + }, + { + "epoch": 4.476979742173112, + "grad_norm": 0.2948935627937317, + "learning_rate": 6.0819149915026555e-05, + "loss": 1.9134, + "step": 14586 + }, + { + "epoch": 4.477286678944138, + "grad_norm": 0.4436163902282715, + "learning_rate": 6.081429706145114e-05, + "loss": 1.7616, + "step": 14587 + }, + { + "epoch": 4.477593615715163, + "grad_norm": 0.4879693388938904, + "learning_rate": 6.080944410100249e-05, + 
"loss": 1.8155, + "step": 14588 + }, + { + "epoch": 4.4779005524861875, + "grad_norm": 0.29742667078971863, + "learning_rate": 6.08045910337286e-05, + "loss": 1.7428, + "step": 14589 + }, + { + "epoch": 4.478207489257213, + "grad_norm": 0.2994751036167145, + "learning_rate": 6.0799737859677395e-05, + "loss": 1.7764, + "step": 14590 + }, + { + "epoch": 4.478514426028238, + "grad_norm": 0.46379905939102173, + "learning_rate": 6.079488457889686e-05, + "loss": 1.7289, + "step": 14591 + }, + { + "epoch": 4.4788213627992635, + "grad_norm": 0.3511717617511749, + "learning_rate": 6.0790031191434946e-05, + "loss": 1.7658, + "step": 14592 + }, + { + "epoch": 4.479128299570289, + "grad_norm": 0.22678083181381226, + "learning_rate": 6.0785177697339626e-05, + "loss": 1.7973, + "step": 14593 + }, + { + "epoch": 4.479435236341313, + "grad_norm": 0.31201767921447754, + "learning_rate": 6.0780324096658837e-05, + "loss": 1.7542, + "step": 14594 + }, + { + "epoch": 4.479742173112339, + "grad_norm": 0.23759113252162933, + "learning_rate": 6.077547038944058e-05, + "loss": 1.7191, + "step": 14595 + }, + { + "epoch": 4.480049109883364, + "grad_norm": 0.25801756978034973, + "learning_rate": 6.077061657573282e-05, + "loss": 1.8229, + "step": 14596 + }, + { + "epoch": 4.480356046654389, + "grad_norm": 0.3435722887516022, + "learning_rate": 6.0765762655583514e-05, + "loss": 1.7633, + "step": 14597 + }, + { + "epoch": 4.480662983425415, + "grad_norm": 0.2710443437099457, + "learning_rate": 6.076090862904063e-05, + "loss": 1.8126, + "step": 14598 + }, + { + "epoch": 4.48096992019644, + "grad_norm": 0.25750285387039185, + "learning_rate": 6.075605449615212e-05, + "loss": 1.7382, + "step": 14599 + }, + { + "epoch": 4.481276856967464, + "grad_norm": 0.3638051152229309, + "learning_rate": 6.075120025696598e-05, + "loss": 1.8191, + "step": 14600 + }, + { + "epoch": 4.48158379373849, + "grad_norm": 0.24185293912887573, + "learning_rate": 6.074634591153019e-05, + "loss": 1.7637, + "step": 14601 + }, 
+ { + "epoch": 4.481890730509515, + "grad_norm": 0.317283570766449, + "learning_rate": 6.0741491459892707e-05, + "loss": 1.7805, + "step": 14602 + }, + { + "epoch": 4.48219766728054, + "grad_norm": 0.33884385228157043, + "learning_rate": 6.073663690210151e-05, + "loss": 1.7719, + "step": 14603 + }, + { + "epoch": 4.482504604051566, + "grad_norm": 0.2554258704185486, + "learning_rate": 6.073178223820457e-05, + "loss": 1.836, + "step": 14604 + }, + { + "epoch": 4.48281154082259, + "grad_norm": 0.3363535702228546, + "learning_rate": 6.072692746824987e-05, + "loss": 1.8249, + "step": 14605 + }, + { + "epoch": 4.4831184775936155, + "grad_norm": 0.36090195178985596, + "learning_rate": 6.072207259228537e-05, + "loss": 1.733, + "step": 14606 + }, + { + "epoch": 4.483425414364641, + "grad_norm": 0.21928483247756958, + "learning_rate": 6.071721761035909e-05, + "loss": 1.7413, + "step": 14607 + }, + { + "epoch": 4.483732351135666, + "grad_norm": 0.4256608486175537, + "learning_rate": 6.071236252251897e-05, + "loss": 1.7585, + "step": 14608 + }, + { + "epoch": 4.4840392879066915, + "grad_norm": 0.41980308294296265, + "learning_rate": 6.0707507328813007e-05, + "loss": 1.7584, + "step": 14609 + }, + { + "epoch": 4.484346224677717, + "grad_norm": 0.200295090675354, + "learning_rate": 6.0702652029289186e-05, + "loss": 1.7492, + "step": 14610 + }, + { + "epoch": 4.484653161448741, + "grad_norm": 0.41847771406173706, + "learning_rate": 6.069779662399549e-05, + "loss": 1.8101, + "step": 14611 + }, + { + "epoch": 4.484960098219767, + "grad_norm": 0.4846353530883789, + "learning_rate": 6.069294111297987e-05, + "loss": 1.8227, + "step": 14612 + }, + { + "epoch": 4.485267034990792, + "grad_norm": 0.23216098546981812, + "learning_rate": 6.068808549629036e-05, + "loss": 1.6811, + "step": 14613 + }, + { + "epoch": 4.485573971761817, + "grad_norm": 0.34903186559677124, + "learning_rate": 6.0683229773974934e-05, + "loss": 1.6858, + "step": 14614 + }, + { + "epoch": 4.485880908532843, + 
"grad_norm": 0.4349122941493988, + "learning_rate": 6.0678373946081556e-05, + "loss": 1.7704, + "step": 14615 + }, + { + "epoch": 4.486187845303867, + "grad_norm": 0.25738775730133057, + "learning_rate": 6.067351801265824e-05, + "loss": 1.7487, + "step": 14616 + }, + { + "epoch": 4.486494782074892, + "grad_norm": 0.3052736818790436, + "learning_rate": 6.0668661973752936e-05, + "loss": 1.7528, + "step": 14617 + }, + { + "epoch": 4.486801718845918, + "grad_norm": 0.3400498628616333, + "learning_rate": 6.066380582941368e-05, + "loss": 1.7414, + "step": 14618 + }, + { + "epoch": 4.487108655616943, + "grad_norm": 0.28251948952674866, + "learning_rate": 6.065894957968845e-05, + "loss": 1.8078, + "step": 14619 + }, + { + "epoch": 4.487415592387968, + "grad_norm": 0.26907965540885925, + "learning_rate": 6.0654093224625216e-05, + "loss": 1.8143, + "step": 14620 + }, + { + "epoch": 4.487722529158993, + "grad_norm": 0.2821955978870392, + "learning_rate": 6.064923676427201e-05, + "loss": 1.7163, + "step": 14621 + }, + { + "epoch": 4.488029465930018, + "grad_norm": 0.2223028987646103, + "learning_rate": 6.0644380198676786e-05, + "loss": 1.704, + "step": 14622 + }, + { + "epoch": 4.4883364027010435, + "grad_norm": 0.25243067741394043, + "learning_rate": 6.063952352788755e-05, + "loss": 1.7236, + "step": 14623 + }, + { + "epoch": 4.488643339472069, + "grad_norm": 0.30026015639305115, + "learning_rate": 6.063466675195233e-05, + "loss": 1.7575, + "step": 14624 + }, + { + "epoch": 4.488950276243094, + "grad_norm": 0.2055491805076599, + "learning_rate": 6.0629809870919085e-05, + "loss": 1.7294, + "step": 14625 + }, + { + "epoch": 4.4892572130141195, + "grad_norm": 0.2507593035697937, + "learning_rate": 6.0624952884835836e-05, + "loss": 1.762, + "step": 14626 + }, + { + "epoch": 4.489564149785144, + "grad_norm": 0.21385909616947174, + "learning_rate": 6.0620095793750576e-05, + "loss": 1.7396, + "step": 14627 + }, + { + "epoch": 4.489871086556169, + "grad_norm": 0.21926651895046234, + 
"learning_rate": 6.06152385977113e-05, + "loss": 1.7863, + "step": 14628 + }, + { + "epoch": 4.490178023327195, + "grad_norm": 0.21950845420360565, + "learning_rate": 6.0610381296766016e-05, + "loss": 1.7576, + "step": 14629 + }, + { + "epoch": 4.49048496009822, + "grad_norm": 0.2030971795320511, + "learning_rate": 6.0605523890962736e-05, + "loss": 1.7069, + "step": 14630 + }, + { + "epoch": 4.490791896869245, + "grad_norm": 0.23991432785987854, + "learning_rate": 6.0600666380349436e-05, + "loss": 1.7598, + "step": 14631 + }, + { + "epoch": 4.49109883364027, + "grad_norm": 0.23766861855983734, + "learning_rate": 6.059580876497415e-05, + "loss": 1.7687, + "step": 14632 + }, + { + "epoch": 4.491405770411295, + "grad_norm": 0.2361454963684082, + "learning_rate": 6.059095104488487e-05, + "loss": 1.7883, + "step": 14633 + }, + { + "epoch": 4.49171270718232, + "grad_norm": 0.3128328323364258, + "learning_rate": 6.058609322012958e-05, + "loss": 1.8087, + "step": 14634 + }, + { + "epoch": 4.492019643953346, + "grad_norm": 0.2958957850933075, + "learning_rate": 6.0581235290756335e-05, + "loss": 1.782, + "step": 14635 + }, + { + "epoch": 4.492326580724371, + "grad_norm": 0.2197243571281433, + "learning_rate": 6.057637725681312e-05, + "loss": 1.7408, + "step": 14636 + }, + { + "epoch": 4.4926335174953955, + "grad_norm": 0.22227831184864044, + "learning_rate": 6.0571519118347944e-05, + "loss": 1.734, + "step": 14637 + }, + { + "epoch": 4.492940454266421, + "grad_norm": 0.2784527540206909, + "learning_rate": 6.056666087540882e-05, + "loss": 1.8017, + "step": 14638 + }, + { + "epoch": 4.493247391037446, + "grad_norm": 0.21929821372032166, + "learning_rate": 6.056180252804377e-05, + "loss": 1.7271, + "step": 14639 + }, + { + "epoch": 4.4935543278084715, + "grad_norm": 0.2156134843826294, + "learning_rate": 6.055694407630077e-05, + "loss": 1.8082, + "step": 14640 + }, + { + "epoch": 4.493861264579497, + "grad_norm": 0.22672387957572937, + "learning_rate": 6.0552085520227875e-05, + 
"loss": 1.7506, + "step": 14641 + }, + { + "epoch": 4.494168201350522, + "grad_norm": 0.228785440325737, + "learning_rate": 6.0547226859873086e-05, + "loss": 1.7023, + "step": 14642 + }, + { + "epoch": 4.494475138121547, + "grad_norm": 0.19483685493469238, + "learning_rate": 6.054236809528443e-05, + "loss": 1.6879, + "step": 14643 + }, + { + "epoch": 4.494782074892572, + "grad_norm": 0.24911309778690338, + "learning_rate": 6.0537509226509904e-05, + "loss": 1.7856, + "step": 14644 + }, + { + "epoch": 4.495089011663597, + "grad_norm": 0.24811938405036926, + "learning_rate": 6.053265025359753e-05, + "loss": 1.7581, + "step": 14645 + }, + { + "epoch": 4.495395948434623, + "grad_norm": 0.2487260401248932, + "learning_rate": 6.052779117659534e-05, + "loss": 1.7536, + "step": 14646 + }, + { + "epoch": 4.495702885205648, + "grad_norm": 0.2594854235649109, + "learning_rate": 6.052293199555136e-05, + "loss": 1.7822, + "step": 14647 + }, + { + "epoch": 4.496009821976672, + "grad_norm": 0.22837325930595398, + "learning_rate": 6.051807271051359e-05, + "loss": 1.7542, + "step": 14648 + }, + { + "epoch": 4.496316758747698, + "grad_norm": 0.23106649518013, + "learning_rate": 6.051321332153005e-05, + "loss": 1.7758, + "step": 14649 + }, + { + "epoch": 4.496623695518723, + "grad_norm": 0.29424673318862915, + "learning_rate": 6.050835382864878e-05, + "loss": 1.8335, + "step": 14650 + }, + { + "epoch": 4.496930632289748, + "grad_norm": 0.28297343850135803, + "learning_rate": 6.050349423191779e-05, + "loss": 1.7711, + "step": 14651 + }, + { + "epoch": 4.497237569060774, + "grad_norm": 0.2001795768737793, + "learning_rate": 6.049863453138511e-05, + "loss": 1.7008, + "step": 14652 + }, + { + "epoch": 4.497544505831799, + "grad_norm": 0.35177022218704224, + "learning_rate": 6.04937747270988e-05, + "loss": 1.7763, + "step": 14653 + }, + { + "epoch": 4.4978514426028235, + "grad_norm": 0.28870898485183716, + "learning_rate": 6.0488914819106835e-05, + "loss": 1.7373, + "step": 14654 + }, + { 
+ "epoch": 4.498158379373849, + "grad_norm": 0.23962664604187012, + "learning_rate": 6.048405480745727e-05, + "loss": 1.7278, + "step": 14655 + }, + { + "epoch": 4.498465316144874, + "grad_norm": 0.324505478143692, + "learning_rate": 6.047919469219813e-05, + "loss": 1.7674, + "step": 14656 + }, + { + "epoch": 4.4987722529158995, + "grad_norm": 0.38313817977905273, + "learning_rate": 6.047433447337744e-05, + "loss": 1.789, + "step": 14657 + }, + { + "epoch": 4.499079189686925, + "grad_norm": 0.2101358324289322, + "learning_rate": 6.046947415104324e-05, + "loss": 1.7331, + "step": 14658 + }, + { + "epoch": 4.499386126457949, + "grad_norm": 0.3388524353504181, + "learning_rate": 6.046461372524357e-05, + "loss": 1.8467, + "step": 14659 + }, + { + "epoch": 4.499693063228975, + "grad_norm": 0.3360123634338379, + "learning_rate": 6.045975319602645e-05, + "loss": 1.8427, + "step": 14660 + }, + { + "epoch": 4.5, + "grad_norm": 0.27596545219421387, + "learning_rate": 6.0454892563439914e-05, + "loss": 1.7768, + "step": 14661 + }, + { + "epoch": 4.500306936771025, + "grad_norm": 0.2580861747264862, + "learning_rate": 6.0450031827532e-05, + "loss": 1.763, + "step": 14662 + }, + { + "epoch": 4.500613873542051, + "grad_norm": 0.3521091938018799, + "learning_rate": 6.044517098835074e-05, + "loss": 1.7118, + "step": 14663 + }, + { + "epoch": 4.500920810313076, + "grad_norm": 0.29412439465522766, + "learning_rate": 6.0440310045944204e-05, + "loss": 1.7252, + "step": 14664 + }, + { + "epoch": 4.5012277470841005, + "grad_norm": 0.23845252394676208, + "learning_rate": 6.043544900036039e-05, + "loss": 1.7622, + "step": 14665 + }, + { + "epoch": 4.501534683855126, + "grad_norm": 0.22957031428813934, + "learning_rate": 6.043058785164736e-05, + "loss": 1.7527, + "step": 14666 + }, + { + "epoch": 4.501841620626151, + "grad_norm": 0.2564462721347809, + "learning_rate": 6.042572659985314e-05, + "loss": 1.801, + "step": 14667 + }, + { + "epoch": 4.5021485573971765, + "grad_norm": 
0.22588051855564117, + "learning_rate": 6.042086524502576e-05, + "loss": 1.7387, + "step": 14668 + }, + { + "epoch": 4.502455494168201, + "grad_norm": 0.2609740197658539, + "learning_rate": 6.0416003787213306e-05, + "loss": 1.7615, + "step": 14669 + }, + { + "epoch": 4.502762430939226, + "grad_norm": 0.2535521984100342, + "learning_rate": 6.041114222646379e-05, + "loss": 1.7398, + "step": 14670 + }, + { + "epoch": 4.503069367710252, + "grad_norm": 0.2512127757072449, + "learning_rate": 6.040628056282527e-05, + "loss": 1.7679, + "step": 14671 + }, + { + "epoch": 4.503376304481277, + "grad_norm": 0.2438639998435974, + "learning_rate": 6.0401418796345774e-05, + "loss": 1.7, + "step": 14672 + }, + { + "epoch": 4.503683241252302, + "grad_norm": 0.23428042232990265, + "learning_rate": 6.0396556927073376e-05, + "loss": 1.7748, + "step": 14673 + }, + { + "epoch": 4.503990178023328, + "grad_norm": 0.22894345223903656, + "learning_rate": 6.03916949550561e-05, + "loss": 1.7881, + "step": 14674 + }, + { + "epoch": 4.504297114794352, + "grad_norm": 0.24813716113567352, + "learning_rate": 6.0386832880342006e-05, + "loss": 1.7676, + "step": 14675 + }, + { + "epoch": 4.504604051565377, + "grad_norm": 0.23448842763900757, + "learning_rate": 6.038197070297914e-05, + "loss": 1.7828, + "step": 14676 + }, + { + "epoch": 4.504910988336403, + "grad_norm": 0.25302332639694214, + "learning_rate": 6.037710842301556e-05, + "loss": 1.8061, + "step": 14677 + }, + { + "epoch": 4.505217925107428, + "grad_norm": 0.2411813735961914, + "learning_rate": 6.0372246040499305e-05, + "loss": 1.6901, + "step": 14678 + }, + { + "epoch": 4.505524861878453, + "grad_norm": 0.3154819905757904, + "learning_rate": 6.036738355547844e-05, + "loss": 1.7472, + "step": 14679 + }, + { + "epoch": 4.505831798649478, + "grad_norm": 0.2935639023780823, + "learning_rate": 6.0362520968001014e-05, + "loss": 1.7508, + "step": 14680 + }, + { + "epoch": 4.506138735420503, + "grad_norm": 0.27064070105552673, + "learning_rate": 
6.035765827811508e-05, + "loss": 1.8133, + "step": 14681 + }, + { + "epoch": 4.5064456721915285, + "grad_norm": 0.23748525977134705, + "learning_rate": 6.03527954858687e-05, + "loss": 1.7742, + "step": 14682 + }, + { + "epoch": 4.506752608962554, + "grad_norm": 0.216410830616951, + "learning_rate": 6.034793259130992e-05, + "loss": 1.7448, + "step": 14683 + }, + { + "epoch": 4.507059545733579, + "grad_norm": 0.23339977860450745, + "learning_rate": 6.034306959448681e-05, + "loss": 1.7437, + "step": 14684 + }, + { + "epoch": 4.5073664825046045, + "grad_norm": 0.23951120674610138, + "learning_rate": 6.0338206495447414e-05, + "loss": 1.7535, + "step": 14685 + }, + { + "epoch": 4.507673419275629, + "grad_norm": 0.22137518227100372, + "learning_rate": 6.0333343294239816e-05, + "loss": 1.7537, + "step": 14686 + }, + { + "epoch": 4.507980356046654, + "grad_norm": 0.2550075054168701, + "learning_rate": 6.032847999091206e-05, + "loss": 1.8069, + "step": 14687 + }, + { + "epoch": 4.50828729281768, + "grad_norm": 0.2166420966386795, + "learning_rate": 6.032361658551221e-05, + "loss": 1.7746, + "step": 14688 + }, + { + "epoch": 4.508594229588705, + "grad_norm": 0.21926096081733704, + "learning_rate": 6.031875307808833e-05, + "loss": 1.7848, + "step": 14689 + }, + { + "epoch": 4.50890116635973, + "grad_norm": 0.27769652009010315, + "learning_rate": 6.031388946868848e-05, + "loss": 1.7563, + "step": 14690 + }, + { + "epoch": 4.509208103130755, + "grad_norm": 0.23417410254478455, + "learning_rate": 6.030902575736074e-05, + "loss": 1.7475, + "step": 14691 + }, + { + "epoch": 4.50951503990178, + "grad_norm": 0.25454118847846985, + "learning_rate": 6.030416194415314e-05, + "loss": 1.7416, + "step": 14692 + }, + { + "epoch": 4.509821976672805, + "grad_norm": 0.3118220567703247, + "learning_rate": 6.029929802911379e-05, + "loss": 1.8001, + "step": 14693 + }, + { + "epoch": 4.510128913443831, + "grad_norm": 0.2338017225265503, + "learning_rate": 6.029443401229075e-05, + "loss": 1.7243, + 
"step": 14694 + }, + { + "epoch": 4.510435850214856, + "grad_norm": 0.2490454763174057, + "learning_rate": 6.028956989373207e-05, + "loss": 1.7866, + "step": 14695 + }, + { + "epoch": 4.510742786985881, + "grad_norm": 0.2579275369644165, + "learning_rate": 6.028470567348582e-05, + "loss": 1.7594, + "step": 14696 + }, + { + "epoch": 4.511049723756906, + "grad_norm": 0.23982174694538116, + "learning_rate": 6.0279841351600094e-05, + "loss": 1.7444, + "step": 14697 + }, + { + "epoch": 4.511356660527931, + "grad_norm": 0.2160159945487976, + "learning_rate": 6.027497692812295e-05, + "loss": 1.7002, + "step": 14698 + }, + { + "epoch": 4.5116635972989565, + "grad_norm": 0.24604511260986328, + "learning_rate": 6.0270112403102455e-05, + "loss": 1.7654, + "step": 14699 + }, + { + "epoch": 4.511970534069982, + "grad_norm": 0.21978263556957245, + "learning_rate": 6.026524777658669e-05, + "loss": 1.7278, + "step": 14700 + }, + { + "epoch": 4.512277470841006, + "grad_norm": 0.2814212441444397, + "learning_rate": 6.026038304862373e-05, + "loss": 1.7743, + "step": 14701 + }, + { + "epoch": 4.512584407612032, + "grad_norm": 0.23798944056034088, + "learning_rate": 6.025551821926165e-05, + "loss": 1.7348, + "step": 14702 + }, + { + "epoch": 4.512891344383057, + "grad_norm": 0.22415988147258759, + "learning_rate": 6.025065328854853e-05, + "loss": 1.7973, + "step": 14703 + }, + { + "epoch": 4.513198281154082, + "grad_norm": 0.34614792466163635, + "learning_rate": 6.0245788256532445e-05, + "loss": 1.7263, + "step": 14704 + }, + { + "epoch": 4.513505217925108, + "grad_norm": 0.333918958902359, + "learning_rate": 6.0240923123261485e-05, + "loss": 1.7305, + "step": 14705 + }, + { + "epoch": 4.513812154696133, + "grad_norm": 0.22231793403625488, + "learning_rate": 6.02360578887837e-05, + "loss": 1.806, + "step": 14706 + }, + { + "epoch": 4.514119091467157, + "grad_norm": 0.23323194682598114, + "learning_rate": 6.023119255314721e-05, + "loss": 1.7076, + "step": 14707 + }, + { + "epoch": 
4.514426028238183, + "grad_norm": 0.26695477962493896, + "learning_rate": 6.022632711640007e-05, + "loss": 1.775, + "step": 14708 + }, + { + "epoch": 4.514732965009208, + "grad_norm": 0.21446476876735687, + "learning_rate": 6.0221461578590364e-05, + "loss": 1.7524, + "step": 14709 + }, + { + "epoch": 4.515039901780233, + "grad_norm": 0.2677358090877533, + "learning_rate": 6.0216595939766204e-05, + "loss": 1.7513, + "step": 14710 + }, + { + "epoch": 4.515346838551259, + "grad_norm": 0.28648239374160767, + "learning_rate": 6.021173019997565e-05, + "loss": 1.7249, + "step": 14711 + }, + { + "epoch": 4.515653775322283, + "grad_norm": 0.2178548276424408, + "learning_rate": 6.020686435926678e-05, + "loss": 1.7502, + "step": 14712 + }, + { + "epoch": 4.5159607120933085, + "grad_norm": 0.3391740024089813, + "learning_rate": 6.02019984176877e-05, + "loss": 1.6828, + "step": 14713 + }, + { + "epoch": 4.516267648864334, + "grad_norm": 0.25222229957580566, + "learning_rate": 6.01971323752865e-05, + "loss": 1.6982, + "step": 14714 + }, + { + "epoch": 4.516574585635359, + "grad_norm": 0.28776636719703674, + "learning_rate": 6.019226623211125e-05, + "loss": 1.8595, + "step": 14715 + }, + { + "epoch": 4.5168815224063845, + "grad_norm": 0.3240084648132324, + "learning_rate": 6.018739998821006e-05, + "loss": 1.7461, + "step": 14716 + }, + { + "epoch": 4.51718845917741, + "grad_norm": 0.26735052466392517, + "learning_rate": 6.0182533643631015e-05, + "loss": 1.7955, + "step": 14717 + }, + { + "epoch": 4.517495395948434, + "grad_norm": 0.24573692679405212, + "learning_rate": 6.017766719842219e-05, + "loss": 1.7441, + "step": 14718 + }, + { + "epoch": 4.51780233271946, + "grad_norm": 0.27401313185691833, + "learning_rate": 6.01728006526317e-05, + "loss": 1.7399, + "step": 14719 + }, + { + "epoch": 4.518109269490485, + "grad_norm": 0.23578806221485138, + "learning_rate": 6.016793400630763e-05, + "loss": 1.7936, + "step": 14720 + }, + { + "epoch": 4.51841620626151, + "grad_norm": 
0.27763426303863525, + "learning_rate": 6.0163067259498074e-05, + "loss": 1.7263, + "step": 14721 + }, + { + "epoch": 4.518723143032536, + "grad_norm": 0.27102044224739075, + "learning_rate": 6.015820041225113e-05, + "loss": 1.7085, + "step": 14722 + }, + { + "epoch": 4.51903007980356, + "grad_norm": 0.2046152651309967, + "learning_rate": 6.01533334646149e-05, + "loss": 1.7602, + "step": 14723 + }, + { + "epoch": 4.519337016574585, + "grad_norm": 0.2645253837108612, + "learning_rate": 6.0148466416637484e-05, + "loss": 1.7729, + "step": 14724 + }, + { + "epoch": 4.519643953345611, + "grad_norm": 0.27467650175094604, + "learning_rate": 6.014359926836697e-05, + "loss": 1.7834, + "step": 14725 + }, + { + "epoch": 4.519950890116636, + "grad_norm": 0.30357635021209717, + "learning_rate": 6.013873201985145e-05, + "loss": 1.8685, + "step": 14726 + }, + { + "epoch": 4.520257826887661, + "grad_norm": 0.22923336923122406, + "learning_rate": 6.013386467113905e-05, + "loss": 1.7531, + "step": 14727 + }, + { + "epoch": 4.520564763658687, + "grad_norm": 0.2792156934738159, + "learning_rate": 6.012899722227786e-05, + "loss": 1.7927, + "step": 14728 + }, + { + "epoch": 4.520871700429711, + "grad_norm": 0.286161869764328, + "learning_rate": 6.012412967331598e-05, + "loss": 1.77, + "step": 14729 + }, + { + "epoch": 4.5211786372007365, + "grad_norm": 0.23964659869670868, + "learning_rate": 6.011926202430151e-05, + "loss": 1.7873, + "step": 14730 + }, + { + "epoch": 4.521485573971762, + "grad_norm": 0.2250162959098816, + "learning_rate": 6.011439427528258e-05, + "loss": 1.741, + "step": 14731 + }, + { + "epoch": 4.521792510742787, + "grad_norm": 0.2797175347805023, + "learning_rate": 6.010952642630726e-05, + "loss": 1.7482, + "step": 14732 + }, + { + "epoch": 4.5220994475138125, + "grad_norm": 0.22159560024738312, + "learning_rate": 6.010465847742368e-05, + "loss": 1.7591, + "step": 14733 + }, + { + "epoch": 4.522406384284837, + "grad_norm": 0.26638463139533997, + "learning_rate": 
6.009979042867995e-05, + "loss": 1.8564, + "step": 14734 + }, + { + "epoch": 4.522713321055862, + "grad_norm": 0.2972821891307831, + "learning_rate": 6.009492228012416e-05, + "loss": 1.7569, + "step": 14735 + }, + { + "epoch": 4.523020257826888, + "grad_norm": 0.28108885884284973, + "learning_rate": 6.0090054031804444e-05, + "loss": 1.7256, + "step": 14736 + }, + { + "epoch": 4.523327194597913, + "grad_norm": 0.22359851002693176, + "learning_rate": 6.008518568376888e-05, + "loss": 1.7342, + "step": 14737 + }, + { + "epoch": 4.523634131368938, + "grad_norm": 0.2620728015899658, + "learning_rate": 6.008031723606562e-05, + "loss": 1.7703, + "step": 14738 + }, + { + "epoch": 4.523941068139964, + "grad_norm": 0.2641485333442688, + "learning_rate": 6.007544868874274e-05, + "loss": 1.6944, + "step": 14739 + }, + { + "epoch": 4.524248004910988, + "grad_norm": 0.24957752227783203, + "learning_rate": 6.007058004184839e-05, + "loss": 1.7746, + "step": 14740 + }, + { + "epoch": 4.524554941682013, + "grad_norm": 0.29830998182296753, + "learning_rate": 6.006571129543065e-05, + "loss": 1.7718, + "step": 14741 + }, + { + "epoch": 4.524861878453039, + "grad_norm": 0.32740798592567444, + "learning_rate": 6.006084244953766e-05, + "loss": 1.8194, + "step": 14742 + }, + { + "epoch": 4.525168815224064, + "grad_norm": 0.2614956796169281, + "learning_rate": 6.005597350421751e-05, + "loss": 1.7078, + "step": 14743 + }, + { + "epoch": 4.525475751995089, + "grad_norm": 0.23940515518188477, + "learning_rate": 6.005110445951836e-05, + "loss": 1.7488, + "step": 14744 + }, + { + "epoch": 4.525782688766114, + "grad_norm": 0.25485914945602417, + "learning_rate": 6.004623531548829e-05, + "loss": 1.7705, + "step": 14745 + }, + { + "epoch": 4.526089625537139, + "grad_norm": 0.213532954454422, + "learning_rate": 6.0041366072175445e-05, + "loss": 1.7501, + "step": 14746 + }, + { + "epoch": 4.526396562308165, + "grad_norm": 0.2420104295015335, + "learning_rate": 6.003649672962792e-05, + "loss": 1.717, + 
"step": 14747 + }, + { + "epoch": 4.52670349907919, + "grad_norm": 0.26179102063179016, + "learning_rate": 6.0031627287893865e-05, + "loss": 1.7665, + "step": 14748 + }, + { + "epoch": 4.527010435850215, + "grad_norm": 0.22032082080841064, + "learning_rate": 6.002675774702139e-05, + "loss": 1.7555, + "step": 14749 + }, + { + "epoch": 4.52731737262124, + "grad_norm": 0.23915240168571472, + "learning_rate": 6.002188810705861e-05, + "loss": 1.8219, + "step": 14750 + }, + { + "epoch": 4.527624309392265, + "grad_norm": 0.2275150567293167, + "learning_rate": 6.0017018368053665e-05, + "loss": 1.7418, + "step": 14751 + }, + { + "epoch": 4.52793124616329, + "grad_norm": 0.2349669486284256, + "learning_rate": 6.001214853005467e-05, + "loss": 1.7814, + "step": 14752 + }, + { + "epoch": 4.528238182934316, + "grad_norm": 0.29985731840133667, + "learning_rate": 6.000727859310975e-05, + "loss": 1.7109, + "step": 14753 + }, + { + "epoch": 4.528545119705341, + "grad_norm": 0.27282044291496277, + "learning_rate": 6.0002408557267044e-05, + "loss": 1.7806, + "step": 14754 + }, + { + "epoch": 4.5288520564763655, + "grad_norm": 0.20906320214271545, + "learning_rate": 5.9997538422574675e-05, + "loss": 1.7221, + "step": 14755 + }, + { + "epoch": 4.529158993247391, + "grad_norm": 0.24553455412387848, + "learning_rate": 5.999266818908076e-05, + "loss": 1.793, + "step": 14756 + }, + { + "epoch": 4.529465930018416, + "grad_norm": 0.29730647802352905, + "learning_rate": 5.998779785683345e-05, + "loss": 1.7597, + "step": 14757 + }, + { + "epoch": 4.5297728667894415, + "grad_norm": 0.28297582268714905, + "learning_rate": 5.998292742588087e-05, + "loss": 1.7459, + "step": 14758 + }, + { + "epoch": 4.530079803560467, + "grad_norm": 0.21853844821453094, + "learning_rate": 5.997805689627115e-05, + "loss": 1.7234, + "step": 14759 + }, + { + "epoch": 4.530386740331492, + "grad_norm": 0.2997361421585083, + "learning_rate": 5.997318626805242e-05, + "loss": 1.7294, + "step": 14760 + }, + { + "epoch": 
4.530693677102517, + "grad_norm": 0.3298671543598175, + "learning_rate": 5.9968315541272804e-05, + "loss": 1.7837, + "step": 14761 + }, + { + "epoch": 4.531000613873542, + "grad_norm": 0.22812490165233612, + "learning_rate": 5.996344471598047e-05, + "loss": 1.7509, + "step": 14762 + }, + { + "epoch": 4.531307550644567, + "grad_norm": 0.3179669678211212, + "learning_rate": 5.995857379222354e-05, + "loss": 1.8354, + "step": 14763 + }, + { + "epoch": 4.531614487415593, + "grad_norm": 0.3072827458381653, + "learning_rate": 5.9953702770050135e-05, + "loss": 1.8051, + "step": 14764 + }, + { + "epoch": 4.531921424186618, + "grad_norm": 0.19386722147464752, + "learning_rate": 5.994883164950841e-05, + "loss": 1.7093, + "step": 14765 + }, + { + "epoch": 4.532228360957642, + "grad_norm": 0.2380950152873993, + "learning_rate": 5.99439604306465e-05, + "loss": 1.7547, + "step": 14766 + }, + { + "epoch": 4.532535297728668, + "grad_norm": 0.32604947686195374, + "learning_rate": 5.993908911351254e-05, + "loss": 1.8708, + "step": 14767 + }, + { + "epoch": 4.532842234499693, + "grad_norm": 0.2436954528093338, + "learning_rate": 5.993421769815468e-05, + "loss": 1.7272, + "step": 14768 + }, + { + "epoch": 4.533149171270718, + "grad_norm": 0.2470337301492691, + "learning_rate": 5.992934618462105e-05, + "loss": 1.7242, + "step": 14769 + }, + { + "epoch": 4.533456108041744, + "grad_norm": 0.25720325112342834, + "learning_rate": 5.992447457295981e-05, + "loss": 1.7219, + "step": 14770 + }, + { + "epoch": 4.533763044812769, + "grad_norm": 0.2518918812274933, + "learning_rate": 5.991960286321909e-05, + "loss": 1.7916, + "step": 14771 + }, + { + "epoch": 4.5340699815837935, + "grad_norm": 0.2561487853527069, + "learning_rate": 5.9914731055447037e-05, + "loss": 1.7695, + "step": 14772 + }, + { + "epoch": 4.534376918354819, + "grad_norm": 0.25361356139183044, + "learning_rate": 5.9909859149691804e-05, + "loss": 1.7464, + "step": 14773 + }, + { + "epoch": 4.534683855125844, + "grad_norm": 
0.22827522456645966, + "learning_rate": 5.9904987146001545e-05, + "loss": 1.7288, + "step": 14774 + }, + { + "epoch": 4.5349907918968695, + "grad_norm": 0.2417261302471161, + "learning_rate": 5.9900115044424385e-05, + "loss": 1.7311, + "step": 14775 + }, + { + "epoch": 4.535297728667894, + "grad_norm": 0.20756755769252777, + "learning_rate": 5.9895242845008495e-05, + "loss": 1.7799, + "step": 14776 + }, + { + "epoch": 4.535604665438919, + "grad_norm": 0.21999207139015198, + "learning_rate": 5.989037054780201e-05, + "loss": 1.7782, + "step": 14777 + }, + { + "epoch": 4.535911602209945, + "grad_norm": 0.22863444685935974, + "learning_rate": 5.988549815285308e-05, + "loss": 1.7869, + "step": 14778 + }, + { + "epoch": 4.53621853898097, + "grad_norm": 0.23033374547958374, + "learning_rate": 5.988062566020987e-05, + "loss": 1.7328, + "step": 14779 + }, + { + "epoch": 4.536525475751995, + "grad_norm": 0.21903404593467712, + "learning_rate": 5.987575306992053e-05, + "loss": 1.7689, + "step": 14780 + }, + { + "epoch": 4.536832412523021, + "grad_norm": 0.2433948963880539, + "learning_rate": 5.98708803820332e-05, + "loss": 1.7647, + "step": 14781 + }, + { + "epoch": 4.537139349294045, + "grad_norm": 0.2564239799976349, + "learning_rate": 5.986600759659606e-05, + "loss": 1.7958, + "step": 14782 + }, + { + "epoch": 4.53744628606507, + "grad_norm": 0.24009190499782562, + "learning_rate": 5.9861134713657244e-05, + "loss": 1.7511, + "step": 14783 + }, + { + "epoch": 4.537753222836096, + "grad_norm": 0.2578975558280945, + "learning_rate": 5.985626173326491e-05, + "loss": 1.8285, + "step": 14784 + }, + { + "epoch": 4.538060159607121, + "grad_norm": 0.24334335327148438, + "learning_rate": 5.9851388655467225e-05, + "loss": 1.7391, + "step": 14785 + }, + { + "epoch": 4.538367096378146, + "grad_norm": 0.26446983218193054, + "learning_rate": 5.9846515480312335e-05, + "loss": 1.8232, + "step": 14786 + }, + { + "epoch": 4.538674033149171, + "grad_norm": 0.3125670850276947, + 
"learning_rate": 5.9841642207848415e-05, + "loss": 1.7202, + "step": 14787 + }, + { + "epoch": 4.538980969920196, + "grad_norm": 0.2524511218070984, + "learning_rate": 5.983676883812361e-05, + "loss": 1.7653, + "step": 14788 + }, + { + "epoch": 4.5392879066912215, + "grad_norm": 0.3693946897983551, + "learning_rate": 5.98318953711861e-05, + "loss": 1.7457, + "step": 14789 + }, + { + "epoch": 4.539594843462247, + "grad_norm": 0.32625386118888855, + "learning_rate": 5.9827021807084026e-05, + "loss": 1.784, + "step": 14790 + }, + { + "epoch": 4.539901780233272, + "grad_norm": 0.24243168532848358, + "learning_rate": 5.9822148145865574e-05, + "loss": 1.7651, + "step": 14791 + }, + { + "epoch": 4.5402087170042975, + "grad_norm": 0.2950129210948944, + "learning_rate": 5.9817274387578895e-05, + "loss": 1.7316, + "step": 14792 + }, + { + "epoch": 4.540515653775322, + "grad_norm": 0.29455235600471497, + "learning_rate": 5.981240053227216e-05, + "loss": 1.7504, + "step": 14793 + }, + { + "epoch": 4.540822590546347, + "grad_norm": 0.23161925375461578, + "learning_rate": 5.980752657999352e-05, + "loss": 1.7663, + "step": 14794 + }, + { + "epoch": 4.541129527317373, + "grad_norm": 0.2725144922733307, + "learning_rate": 5.980265253079116e-05, + "loss": 1.765, + "step": 14795 + }, + { + "epoch": 4.541436464088398, + "grad_norm": 0.30911222100257874, + "learning_rate": 5.979777838471324e-05, + "loss": 1.7888, + "step": 14796 + }, + { + "epoch": 4.541743400859423, + "grad_norm": 0.2818063497543335, + "learning_rate": 5.979290414180794e-05, + "loss": 1.8047, + "step": 14797 + }, + { + "epoch": 4.542050337630448, + "grad_norm": 0.23335030674934387, + "learning_rate": 5.978802980212341e-05, + "loss": 1.8205, + "step": 14798 + }, + { + "epoch": 4.542357274401473, + "grad_norm": 0.24228201806545258, + "learning_rate": 5.9783155365707855e-05, + "loss": 1.7774, + "step": 14799 + }, + { + "epoch": 4.542664211172498, + "grad_norm": 0.2410847544670105, + "learning_rate": 5.97782808326094e-05, 
+ "loss": 1.6959, + "step": 14800 + }, + { + "epoch": 4.542971147943524, + "grad_norm": 0.24812567234039307, + "learning_rate": 5.9773406202876245e-05, + "loss": 1.8158, + "step": 14801 + }, + { + "epoch": 4.543278084714549, + "grad_norm": 0.2606147229671478, + "learning_rate": 5.9768531476556566e-05, + "loss": 1.7478, + "step": 14802 + }, + { + "epoch": 4.543585021485574, + "grad_norm": 0.24853013455867767, + "learning_rate": 5.976365665369854e-05, + "loss": 1.8158, + "step": 14803 + }, + { + "epoch": 4.543891958256599, + "grad_norm": 0.2320917695760727, + "learning_rate": 5.9758781734350334e-05, + "loss": 1.7812, + "step": 14804 + }, + { + "epoch": 4.544198895027624, + "grad_norm": 0.3460223376750946, + "learning_rate": 5.9753906718560127e-05, + "loss": 1.7562, + "step": 14805 + }, + { + "epoch": 4.5445058317986495, + "grad_norm": 0.2941136658191681, + "learning_rate": 5.9749031606376086e-05, + "loss": 1.7562, + "step": 14806 + }, + { + "epoch": 4.544812768569675, + "grad_norm": 0.2371312975883484, + "learning_rate": 5.9744156397846404e-05, + "loss": 1.7793, + "step": 14807 + }, + { + "epoch": 4.5451197053407, + "grad_norm": 0.2885094881057739, + "learning_rate": 5.973928109301926e-05, + "loss": 1.7564, + "step": 14808 + }, + { + "epoch": 4.545426642111725, + "grad_norm": 0.2369023859500885, + "learning_rate": 5.973440569194284e-05, + "loss": 1.7862, + "step": 14809 + }, + { + "epoch": 4.54573357888275, + "grad_norm": 0.26628994941711426, + "learning_rate": 5.972953019466531e-05, + "loss": 1.7828, + "step": 14810 + }, + { + "epoch": 4.546040515653775, + "grad_norm": 0.3091031610965729, + "learning_rate": 5.9724654601234864e-05, + "loss": 1.7623, + "step": 14811 + }, + { + "epoch": 4.546347452424801, + "grad_norm": 0.24652205407619476, + "learning_rate": 5.971977891169966e-05, + "loss": 1.6982, + "step": 14812 + }, + { + "epoch": 4.546654389195826, + "grad_norm": 0.21779046952724457, + "learning_rate": 5.971490312610793e-05, + "loss": 1.7363, + "step": 14813 + }, 
+ { + "epoch": 4.546961325966851, + "grad_norm": 0.24130751192569733, + "learning_rate": 5.971002724450783e-05, + "loss": 1.7014, + "step": 14814 + }, + { + "epoch": 4.547268262737876, + "grad_norm": 0.21868734061717987, + "learning_rate": 5.9705151266947534e-05, + "loss": 1.7872, + "step": 14815 + }, + { + "epoch": 4.547575199508901, + "grad_norm": 0.257376492023468, + "learning_rate": 5.9700275193475275e-05, + "loss": 1.75, + "step": 14816 + }, + { + "epoch": 4.547882136279926, + "grad_norm": 0.3182791769504547, + "learning_rate": 5.9695399024139174e-05, + "loss": 1.7965, + "step": 14817 + }, + { + "epoch": 4.548189073050952, + "grad_norm": 0.25553280115127563, + "learning_rate": 5.969052275898748e-05, + "loss": 1.8394, + "step": 14818 + }, + { + "epoch": 4.548496009821976, + "grad_norm": 0.2810833752155304, + "learning_rate": 5.9685646398068354e-05, + "loss": 1.704, + "step": 14819 + }, + { + "epoch": 4.5488029465930016, + "grad_norm": 0.21320512890815735, + "learning_rate": 5.9680769941429993e-05, + "loss": 1.7248, + "step": 14820 + }, + { + "epoch": 4.549109883364027, + "grad_norm": 0.3159593939781189, + "learning_rate": 5.96758933891206e-05, + "loss": 1.7885, + "step": 14821 + }, + { + "epoch": 4.549416820135052, + "grad_norm": 0.21894599497318268, + "learning_rate": 5.967101674118834e-05, + "loss": 1.7388, + "step": 14822 + }, + { + "epoch": 4.5497237569060776, + "grad_norm": 0.24804852902889252, + "learning_rate": 5.9666139997681424e-05, + "loss": 1.7631, + "step": 14823 + }, + { + "epoch": 4.550030693677103, + "grad_norm": 0.2678423523902893, + "learning_rate": 5.966126315864806e-05, + "loss": 1.7631, + "step": 14824 + }, + { + "epoch": 4.550337630448127, + "grad_norm": 0.229649156332016, + "learning_rate": 5.9656386224136426e-05, + "loss": 1.7292, + "step": 14825 + }, + { + "epoch": 4.550644567219153, + "grad_norm": 0.25248458981513977, + "learning_rate": 5.965150919419473e-05, + "loss": 1.8, + "step": 14826 + }, + { + "epoch": 4.550951503990178, + 
"grad_norm": 0.2583169937133789, + "learning_rate": 5.964663206887116e-05, + "loss": 1.7641, + "step": 14827 + }, + { + "epoch": 4.551258440761203, + "grad_norm": 0.21465209126472473, + "learning_rate": 5.964175484821392e-05, + "loss": 1.7475, + "step": 14828 + }, + { + "epoch": 4.551565377532229, + "grad_norm": 0.28028783202171326, + "learning_rate": 5.963687753227118e-05, + "loss": 1.7649, + "step": 14829 + }, + { + "epoch": 4.551872314303253, + "grad_norm": 0.30248284339904785, + "learning_rate": 5.9632000121091194e-05, + "loss": 1.6969, + "step": 14830 + }, + { + "epoch": 4.5521792510742785, + "grad_norm": 0.24335962533950806, + "learning_rate": 5.962712261472213e-05, + "loss": 1.7295, + "step": 14831 + }, + { + "epoch": 4.552486187845304, + "grad_norm": 0.21014504134655, + "learning_rate": 5.9622245013212206e-05, + "loss": 1.7508, + "step": 14832 + }, + { + "epoch": 4.552793124616329, + "grad_norm": 0.24892041087150574, + "learning_rate": 5.961736731660963e-05, + "loss": 1.7317, + "step": 14833 + }, + { + "epoch": 4.5531000613873545, + "grad_norm": 0.2159881740808487, + "learning_rate": 5.9612489524962556e-05, + "loss": 1.7114, + "step": 14834 + }, + { + "epoch": 4.55340699815838, + "grad_norm": 0.2952292263507843, + "learning_rate": 5.960761163831925e-05, + "loss": 1.8226, + "step": 14835 + }, + { + "epoch": 4.553713934929404, + "grad_norm": 0.3019000291824341, + "learning_rate": 5.9602733656727895e-05, + "loss": 1.7391, + "step": 14836 + }, + { + "epoch": 4.55402087170043, + "grad_norm": 0.2273966521024704, + "learning_rate": 5.9597855580236696e-05, + "loss": 1.7718, + "step": 14837 + }, + { + "epoch": 4.554327808471455, + "grad_norm": 0.2462005764245987, + "learning_rate": 5.959297740889386e-05, + "loss": 1.8428, + "step": 14838 + }, + { + "epoch": 4.55463474524248, + "grad_norm": 0.2773323059082031, + "learning_rate": 5.95880991427476e-05, + "loss": 1.6878, + "step": 14839 + }, + { + "epoch": 4.554941682013506, + "grad_norm": 0.26519861817359924, + 
"learning_rate": 5.958322078184611e-05, + "loss": 1.737, + "step": 14840 + }, + { + "epoch": 4.55524861878453, + "grad_norm": 0.20157647132873535, + "learning_rate": 5.9578342326237626e-05, + "loss": 1.7164, + "step": 14841 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.21715669333934784, + "learning_rate": 5.957346377597035e-05, + "loss": 1.705, + "step": 14842 + }, + { + "epoch": 4.555862492326581, + "grad_norm": 0.3056442439556122, + "learning_rate": 5.95685851310925e-05, + "loss": 1.7672, + "step": 14843 + }, + { + "epoch": 4.556169429097606, + "grad_norm": 0.24832262098789215, + "learning_rate": 5.956370639165228e-05, + "loss": 1.7305, + "step": 14844 + }, + { + "epoch": 4.556476365868631, + "grad_norm": 0.25814661383628845, + "learning_rate": 5.955882755769791e-05, + "loss": 1.7562, + "step": 14845 + }, + { + "epoch": 4.556783302639657, + "grad_norm": 0.38242629170417786, + "learning_rate": 5.95539486292776e-05, + "loss": 1.7077, + "step": 14846 + }, + { + "epoch": 4.557090239410681, + "grad_norm": 0.2901807427406311, + "learning_rate": 5.954906960643956e-05, + "loss": 1.7233, + "step": 14847 + }, + { + "epoch": 4.5573971761817065, + "grad_norm": 0.22636106610298157, + "learning_rate": 5.954419048923202e-05, + "loss": 1.777, + "step": 14848 + }, + { + "epoch": 4.557704112952732, + "grad_norm": 0.32392850518226624, + "learning_rate": 5.953931127770321e-05, + "loss": 1.7477, + "step": 14849 + }, + { + "epoch": 4.558011049723757, + "grad_norm": 0.3403460681438446, + "learning_rate": 5.953443197190134e-05, + "loss": 1.7712, + "step": 14850 + }, + { + "epoch": 4.558317986494782, + "grad_norm": 0.22923234105110168, + "learning_rate": 5.95295525718746e-05, + "loss": 1.8154, + "step": 14851 + }, + { + "epoch": 4.558624923265807, + "grad_norm": 0.25152841210365295, + "learning_rate": 5.952467307767124e-05, + "loss": 1.7091, + "step": 14852 + }, + { + "epoch": 4.558931860036832, + "grad_norm": 0.27743563055992126, + "learning_rate": 5.951979348933949e-05, + 
"loss": 1.7621, + "step": 14853 + }, + { + "epoch": 4.559238796807858, + "grad_norm": 0.25809308886528015, + "learning_rate": 5.951491380692756e-05, + "loss": 1.7669, + "step": 14854 + }, + { + "epoch": 4.559545733578883, + "grad_norm": 0.24863946437835693, + "learning_rate": 5.9510034030483676e-05, + "loss": 1.7354, + "step": 14855 + }, + { + "epoch": 4.559852670349908, + "grad_norm": 0.2896040380001068, + "learning_rate": 5.9505154160056066e-05, + "loss": 1.7878, + "step": 14856 + }, + { + "epoch": 4.560159607120933, + "grad_norm": 0.23814482986927032, + "learning_rate": 5.950027419569294e-05, + "loss": 1.7781, + "step": 14857 + }, + { + "epoch": 4.560466543891958, + "grad_norm": 0.2531175911426544, + "learning_rate": 5.949539413744253e-05, + "loss": 1.762, + "step": 14858 + }, + { + "epoch": 4.560773480662983, + "grad_norm": 0.2541767656803131, + "learning_rate": 5.949051398535308e-05, + "loss": 1.7722, + "step": 14859 + }, + { + "epoch": 4.561080417434009, + "grad_norm": 0.25216221809387207, + "learning_rate": 5.948563373947281e-05, + "loss": 1.754, + "step": 14860 + }, + { + "epoch": 4.561387354205034, + "grad_norm": 0.24421775341033936, + "learning_rate": 5.948075339984994e-05, + "loss": 1.7976, + "step": 14861 + }, + { + "epoch": 4.5616942909760585, + "grad_norm": 0.24435418844223022, + "learning_rate": 5.947587296653272e-05, + "loss": 1.79, + "step": 14862 + }, + { + "epoch": 4.562001227747084, + "grad_norm": 0.24471627175807953, + "learning_rate": 5.947099243956936e-05, + "loss": 1.755, + "step": 14863 + }, + { + "epoch": 4.562308164518109, + "grad_norm": 0.2762158215045929, + "learning_rate": 5.9466111819008096e-05, + "loss": 1.7695, + "step": 14864 + }, + { + "epoch": 4.5626151012891345, + "grad_norm": 0.23841319978237152, + "learning_rate": 5.9461231104897174e-05, + "loss": 1.7302, + "step": 14865 + }, + { + "epoch": 4.56292203806016, + "grad_norm": 0.260231077671051, + "learning_rate": 5.9456350297284826e-05, + "loss": 1.7917, + "step": 14866 + }, + { 
+ "epoch": 4.563228974831185, + "grad_norm": 0.2752247452735901, + "learning_rate": 5.945146939621929e-05, + "loss": 1.7953, + "step": 14867 + }, + { + "epoch": 4.56353591160221, + "grad_norm": 0.28760650753974915, + "learning_rate": 5.944658840174878e-05, + "loss": 1.8582, + "step": 14868 + }, + { + "epoch": 4.563842848373235, + "grad_norm": 0.24311676621437073, + "learning_rate": 5.944170731392153e-05, + "loss": 1.8006, + "step": 14869 + }, + { + "epoch": 4.56414978514426, + "grad_norm": 0.2692974805831909, + "learning_rate": 5.943682613278583e-05, + "loss": 1.6984, + "step": 14870 + }, + { + "epoch": 4.564456721915286, + "grad_norm": 0.2784348726272583, + "learning_rate": 5.943194485838985e-05, + "loss": 1.8082, + "step": 14871 + }, + { + "epoch": 4.564763658686311, + "grad_norm": 0.2557264268398285, + "learning_rate": 5.9427063490781885e-05, + "loss": 1.7715, + "step": 14872 + }, + { + "epoch": 4.565070595457335, + "grad_norm": 0.3738742470741272, + "learning_rate": 5.942218203001015e-05, + "loss": 1.7549, + "step": 14873 + }, + { + "epoch": 4.565377532228361, + "grad_norm": 0.2424495816230774, + "learning_rate": 5.941730047612288e-05, + "loss": 1.7388, + "step": 14874 + }, + { + "epoch": 4.565684468999386, + "grad_norm": 0.27020737528800964, + "learning_rate": 5.941241882916833e-05, + "loss": 1.752, + "step": 14875 + }, + { + "epoch": 4.565991405770411, + "grad_norm": 0.3763764798641205, + "learning_rate": 5.940753708919474e-05, + "loss": 1.7918, + "step": 14876 + }, + { + "epoch": 4.566298342541437, + "grad_norm": 0.26782163977622986, + "learning_rate": 5.940265525625036e-05, + "loss": 1.7244, + "step": 14877 + }, + { + "epoch": 4.566605279312462, + "grad_norm": 0.24978911876678467, + "learning_rate": 5.9397773330383434e-05, + "loss": 1.7706, + "step": 14878 + }, + { + "epoch": 4.5669122160834865, + "grad_norm": 0.32905304431915283, + "learning_rate": 5.93928913116422e-05, + "loss": 1.7381, + "step": 14879 + }, + { + "epoch": 4.567219152854512, + "grad_norm": 
0.2196444720029831, + "learning_rate": 5.93880092000749e-05, + "loss": 1.7605, + "step": 14880 + }, + { + "epoch": 4.567526089625537, + "grad_norm": 0.3156622350215912, + "learning_rate": 5.9383126995729786e-05, + "loss": 1.9181, + "step": 14881 + }, + { + "epoch": 4.5678330263965625, + "grad_norm": 0.2895203232765198, + "learning_rate": 5.937824469865513e-05, + "loss": 1.7967, + "step": 14882 + }, + { + "epoch": 4.568139963167588, + "grad_norm": 0.24854810535907745, + "learning_rate": 5.937336230889916e-05, + "loss": 1.7332, + "step": 14883 + }, + { + "epoch": 4.568446899938612, + "grad_norm": 0.3417081832885742, + "learning_rate": 5.936847982651013e-05, + "loss": 1.7525, + "step": 14884 + }, + { + "epoch": 4.568753836709638, + "grad_norm": 0.2874949276447296, + "learning_rate": 5.936359725153629e-05, + "loss": 1.7659, + "step": 14885 + }, + { + "epoch": 4.569060773480663, + "grad_norm": 0.25031307339668274, + "learning_rate": 5.935871458402588e-05, + "loss": 1.8061, + "step": 14886 + }, + { + "epoch": 4.569367710251688, + "grad_norm": 0.27047309279441833, + "learning_rate": 5.935383182402717e-05, + "loss": 1.7318, + "step": 14887 + }, + { + "epoch": 4.569674647022714, + "grad_norm": 0.2642819881439209, + "learning_rate": 5.9348948971588425e-05, + "loss": 1.849, + "step": 14888 + }, + { + "epoch": 4.569981583793739, + "grad_norm": 0.2452307790517807, + "learning_rate": 5.9344066026757886e-05, + "loss": 1.7491, + "step": 14889 + }, + { + "epoch": 4.570288520564763, + "grad_norm": 0.24055036902427673, + "learning_rate": 5.9339182989583795e-05, + "loss": 1.7573, + "step": 14890 + }, + { + "epoch": 4.570595457335789, + "grad_norm": 0.23036183416843414, + "learning_rate": 5.933429986011444e-05, + "loss": 1.7841, + "step": 14891 + }, + { + "epoch": 4.570902394106814, + "grad_norm": 0.27987608313560486, + "learning_rate": 5.932941663839805e-05, + "loss": 1.7835, + "step": 14892 + }, + { + "epoch": 4.571209330877839, + "grad_norm": 0.31747013330459595, + "learning_rate": 
5.93245333244829e-05, + "loss": 1.7905, + "step": 14893 + }, + { + "epoch": 4.571516267648864, + "grad_norm": 0.24841344356536865, + "learning_rate": 5.931964991841725e-05, + "loss": 1.8003, + "step": 14894 + }, + { + "epoch": 4.571823204419889, + "grad_norm": 0.2416950911283493, + "learning_rate": 5.9314766420249356e-05, + "loss": 1.7787, + "step": 14895 + }, + { + "epoch": 4.5721301411909145, + "grad_norm": 0.2322494238615036, + "learning_rate": 5.930988283002748e-05, + "loss": 1.8153, + "step": 14896 + }, + { + "epoch": 4.57243707796194, + "grad_norm": 0.22629016637802124, + "learning_rate": 5.930499914779989e-05, + "loss": 1.6743, + "step": 14897 + }, + { + "epoch": 4.572744014732965, + "grad_norm": 0.21481508016586304, + "learning_rate": 5.930011537361483e-05, + "loss": 1.7301, + "step": 14898 + }, + { + "epoch": 4.5730509515039905, + "grad_norm": 0.1993340700864792, + "learning_rate": 5.9295231507520586e-05, + "loss": 1.6796, + "step": 14899 + }, + { + "epoch": 4.573357888275015, + "grad_norm": 0.21681822836399078, + "learning_rate": 5.929034754956543e-05, + "loss": 1.7333, + "step": 14900 + }, + { + "epoch": 4.57366482504604, + "grad_norm": 0.23105305433273315, + "learning_rate": 5.928546349979761e-05, + "loss": 1.8207, + "step": 14901 + }, + { + "epoch": 4.573971761817066, + "grad_norm": 0.24656468629837036, + "learning_rate": 5.9280579358265384e-05, + "loss": 1.7805, + "step": 14902 + }, + { + "epoch": 4.574278698588091, + "grad_norm": 0.28564780950546265, + "learning_rate": 5.927569512501704e-05, + "loss": 1.7224, + "step": 14903 + }, + { + "epoch": 4.574585635359116, + "grad_norm": 0.26030251383781433, + "learning_rate": 5.927081080010084e-05, + "loss": 1.7417, + "step": 14904 + }, + { + "epoch": 4.574892572130141, + "grad_norm": 0.21427087485790253, + "learning_rate": 5.926592638356505e-05, + "loss": 1.7239, + "step": 14905 + }, + { + "epoch": 4.575199508901166, + "grad_norm": 0.2351662665605545, + "learning_rate": 5.9261041875457956e-05, + "loss": 
1.7711, + "step": 14906 + }, + { + "epoch": 4.5755064456721914, + "grad_norm": 0.27335020899772644, + "learning_rate": 5.925615727582781e-05, + "loss": 1.7496, + "step": 14907 + }, + { + "epoch": 4.575813382443217, + "grad_norm": 0.27849945425987244, + "learning_rate": 5.925127258472289e-05, + "loss": 1.7576, + "step": 14908 + }, + { + "epoch": 4.576120319214242, + "grad_norm": 0.27859339118003845, + "learning_rate": 5.924638780219147e-05, + "loss": 1.8076, + "step": 14909 + }, + { + "epoch": 4.5764272559852675, + "grad_norm": 0.24664369225502014, + "learning_rate": 5.9241502928281836e-05, + "loss": 1.7657, + "step": 14910 + }, + { + "epoch": 4.576734192756292, + "grad_norm": 0.29881149530410767, + "learning_rate": 5.923661796304224e-05, + "loss": 1.7611, + "step": 14911 + }, + { + "epoch": 4.577041129527317, + "grad_norm": 0.2672356367111206, + "learning_rate": 5.9231732906520984e-05, + "loss": 1.7605, + "step": 14912 + }, + { + "epoch": 4.577348066298343, + "grad_norm": 0.24282832443714142, + "learning_rate": 5.9226847758766336e-05, + "loss": 1.7037, + "step": 14913 + }, + { + "epoch": 4.577655003069368, + "grad_norm": 0.3822915852069855, + "learning_rate": 5.922196251982656e-05, + "loss": 1.7609, + "step": 14914 + }, + { + "epoch": 4.577961939840393, + "grad_norm": 0.30721214413642883, + "learning_rate": 5.921707718974994e-05, + "loss": 1.7398, + "step": 14915 + }, + { + "epoch": 4.578268876611418, + "grad_norm": 0.235477477312088, + "learning_rate": 5.921219176858477e-05, + "loss": 1.6869, + "step": 14916 + }, + { + "epoch": 4.578575813382443, + "grad_norm": 0.3752216100692749, + "learning_rate": 5.920730625637934e-05, + "loss": 1.7296, + "step": 14917 + }, + { + "epoch": 4.578882750153468, + "grad_norm": 0.36901310086250305, + "learning_rate": 5.920242065318189e-05, + "loss": 1.7405, + "step": 14918 + }, + { + "epoch": 4.579189686924494, + "grad_norm": 0.2308608740568161, + "learning_rate": 5.9197534959040725e-05, + "loss": 1.7953, + "step": 14919 + }, + { + 
"epoch": 4.579496623695519, + "grad_norm": 0.3286738991737366, + "learning_rate": 5.919264917400412e-05, + "loss": 1.7669, + "step": 14920 + }, + { + "epoch": 4.579803560466544, + "grad_norm": 0.3944021165370941, + "learning_rate": 5.918776329812039e-05, + "loss": 1.7165, + "step": 14921 + }, + { + "epoch": 4.580110497237569, + "grad_norm": 0.22054845094680786, + "learning_rate": 5.9182877331437795e-05, + "loss": 1.7739, + "step": 14922 + }, + { + "epoch": 4.580417434008594, + "grad_norm": 0.3467540740966797, + "learning_rate": 5.9177991274004605e-05, + "loss": 1.7713, + "step": 14923 + }, + { + "epoch": 4.5807243707796195, + "grad_norm": 0.4313695728778839, + "learning_rate": 5.917310512586914e-05, + "loss": 1.7654, + "step": 14924 + }, + { + "epoch": 4.581031307550645, + "grad_norm": 0.2723502814769745, + "learning_rate": 5.9168218887079685e-05, + "loss": 1.7314, + "step": 14925 + }, + { + "epoch": 4.581338244321669, + "grad_norm": 0.2641250789165497, + "learning_rate": 5.9163332557684504e-05, + "loss": 1.7303, + "step": 14926 + }, + { + "epoch": 4.581645181092695, + "grad_norm": 0.3780760169029236, + "learning_rate": 5.915844613773189e-05, + "loss": 1.7748, + "step": 14927 + }, + { + "epoch": 4.58195211786372, + "grad_norm": 0.23379632830619812, + "learning_rate": 5.915355962727015e-05, + "loss": 1.7482, + "step": 14928 + }, + { + "epoch": 4.582259054634745, + "grad_norm": 0.35227084159851074, + "learning_rate": 5.914867302634758e-05, + "loss": 1.8198, + "step": 14929 + }, + { + "epoch": 4.582565991405771, + "grad_norm": 0.34348124265670776, + "learning_rate": 5.914378633501245e-05, + "loss": 1.8364, + "step": 14930 + }, + { + "epoch": 4.582872928176796, + "grad_norm": 0.2446804940700531, + "learning_rate": 5.9138899553313066e-05, + "loss": 1.7779, + "step": 14931 + }, + { + "epoch": 4.58317986494782, + "grad_norm": 0.23893557488918304, + "learning_rate": 5.913401268129772e-05, + "loss": 1.7582, + "step": 14932 + }, + { + "epoch": 4.583486801718846, + 
"grad_norm": 0.3046814203262329, + "learning_rate": 5.912912571901471e-05, + "loss": 1.6871, + "step": 14933 + }, + { + "epoch": 4.583793738489871, + "grad_norm": 0.2232733964920044, + "learning_rate": 5.912423866651233e-05, + "loss": 1.7269, + "step": 14934 + }, + { + "epoch": 4.584100675260896, + "grad_norm": 0.18664126098155975, + "learning_rate": 5.911935152383888e-05, + "loss": 1.7155, + "step": 14935 + }, + { + "epoch": 4.584407612031922, + "grad_norm": 0.2573263347148895, + "learning_rate": 5.911446429104265e-05, + "loss": 1.7901, + "step": 14936 + }, + { + "epoch": 4.584714548802946, + "grad_norm": 0.2382393181324005, + "learning_rate": 5.910957696817194e-05, + "loss": 1.7407, + "step": 14937 + }, + { + "epoch": 4.5850214855739715, + "grad_norm": 0.28363972902297974, + "learning_rate": 5.910468955527504e-05, + "loss": 1.7971, + "step": 14938 + }, + { + "epoch": 4.585328422344997, + "grad_norm": 0.3173120617866516, + "learning_rate": 5.909980205240027e-05, + "loss": 1.744, + "step": 14939 + }, + { + "epoch": 4.585635359116022, + "grad_norm": 0.2281302511692047, + "learning_rate": 5.909491445959592e-05, + "loss": 1.6976, + "step": 14940 + }, + { + "epoch": 4.5859422958870475, + "grad_norm": 0.24962912499904633, + "learning_rate": 5.9090026776910304e-05, + "loss": 1.7979, + "step": 14941 + }, + { + "epoch": 4.586249232658073, + "grad_norm": 0.22330854833126068, + "learning_rate": 5.908513900439171e-05, + "loss": 1.7854, + "step": 14942 + }, + { + "epoch": 4.586556169429097, + "grad_norm": 0.20861582458019257, + "learning_rate": 5.908025114208845e-05, + "loss": 1.7133, + "step": 14943 + }, + { + "epoch": 4.586863106200123, + "grad_norm": 0.21838510036468506, + "learning_rate": 5.90753631900488e-05, + "loss": 1.6919, + "step": 14944 + }, + { + "epoch": 4.587170042971148, + "grad_norm": 0.252798467874527, + "learning_rate": 5.907047514832112e-05, + "loss": 1.838, + "step": 14945 + }, + { + "epoch": 4.587476979742173, + "grad_norm": 0.326893150806427, + 
"learning_rate": 5.906558701695369e-05, + "loss": 1.7303, + "step": 14946 + }, + { + "epoch": 4.587783916513199, + "grad_norm": 0.36489585041999817, + "learning_rate": 5.9060698795994804e-05, + "loss": 1.7631, + "step": 14947 + }, + { + "epoch": 4.588090853284223, + "grad_norm": 0.27491649985313416, + "learning_rate": 5.905581048549279e-05, + "loss": 1.7773, + "step": 14948 + }, + { + "epoch": 4.588397790055248, + "grad_norm": 0.2334890067577362, + "learning_rate": 5.905092208549595e-05, + "loss": 1.7254, + "step": 14949 + }, + { + "epoch": 4.588704726826274, + "grad_norm": 0.24383895099163055, + "learning_rate": 5.904603359605257e-05, + "loss": 1.7496, + "step": 14950 + }, + { + "epoch": 4.589011663597299, + "grad_norm": 0.2144637256860733, + "learning_rate": 5.904114501721102e-05, + "loss": 1.7028, + "step": 14951 + }, + { + "epoch": 4.589318600368324, + "grad_norm": 0.19675977528095245, + "learning_rate": 5.9036256349019555e-05, + "loss": 1.7548, + "step": 14952 + }, + { + "epoch": 4.58962553713935, + "grad_norm": 0.23712843656539917, + "learning_rate": 5.903136759152652e-05, + "loss": 1.7722, + "step": 14953 + }, + { + "epoch": 4.589932473910374, + "grad_norm": 0.20307733118534088, + "learning_rate": 5.902647874478021e-05, + "loss": 1.7177, + "step": 14954 + }, + { + "epoch": 4.5902394106813995, + "grad_norm": 0.21767669916152954, + "learning_rate": 5.9021589808828936e-05, + "loss": 1.7963, + "step": 14955 + }, + { + "epoch": 4.590546347452425, + "grad_norm": 0.2056351602077484, + "learning_rate": 5.9016700783721036e-05, + "loss": 1.7439, + "step": 14956 + }, + { + "epoch": 4.59085328422345, + "grad_norm": 0.20480911433696747, + "learning_rate": 5.90118116695048e-05, + "loss": 1.7122, + "step": 14957 + }, + { + "epoch": 4.5911602209944755, + "grad_norm": 0.24091731011867523, + "learning_rate": 5.900692246622858e-05, + "loss": 1.7862, + "step": 14958 + }, + { + "epoch": 4.5914671577655, + "grad_norm": 0.20246434211730957, + "learning_rate": 
5.900203317394066e-05, + "loss": 1.6895, + "step": 14959 + }, + { + "epoch": 4.591774094536525, + "grad_norm": 0.23771630227565765, + "learning_rate": 5.899714379268938e-05, + "loss": 1.7794, + "step": 14960 + }, + { + "epoch": 4.592081031307551, + "grad_norm": 0.2638718783855438, + "learning_rate": 5.899225432252303e-05, + "loss": 1.8059, + "step": 14961 + }, + { + "epoch": 4.592387968078576, + "grad_norm": 0.24251408874988556, + "learning_rate": 5.898736476348997e-05, + "loss": 1.8063, + "step": 14962 + }, + { + "epoch": 4.592694904849601, + "grad_norm": 0.2487735152244568, + "learning_rate": 5.8982475115638515e-05, + "loss": 1.7615, + "step": 14963 + }, + { + "epoch": 4.593001841620627, + "grad_norm": 0.23507241904735565, + "learning_rate": 5.897758537901696e-05, + "loss": 1.7496, + "step": 14964 + }, + { + "epoch": 4.593308778391651, + "grad_norm": 0.22354768216609955, + "learning_rate": 5.897269555367365e-05, + "loss": 1.7293, + "step": 14965 + }, + { + "epoch": 4.593615715162676, + "grad_norm": 0.2711353003978729, + "learning_rate": 5.89678056396569e-05, + "loss": 1.8127, + "step": 14966 + }, + { + "epoch": 4.593922651933702, + "grad_norm": 0.30061110854148865, + "learning_rate": 5.8962915637015036e-05, + "loss": 1.7653, + "step": 14967 + }, + { + "epoch": 4.594229588704727, + "grad_norm": 0.24577318131923676, + "learning_rate": 5.895802554579639e-05, + "loss": 1.7888, + "step": 14968 + }, + { + "epoch": 4.5945365254757515, + "grad_norm": 0.25568944215774536, + "learning_rate": 5.895313536604929e-05, + "loss": 1.7912, + "step": 14969 + }, + { + "epoch": 4.594843462246777, + "grad_norm": 0.2710168957710266, + "learning_rate": 5.894824509782206e-05, + "loss": 1.7681, + "step": 14970 + }, + { + "epoch": 4.595150399017802, + "grad_norm": 0.24056777358055115, + "learning_rate": 5.894335474116303e-05, + "loss": 1.7729, + "step": 14971 + }, + { + "epoch": 4.5954573357888275, + "grad_norm": 0.21956710517406464, + "learning_rate": 5.89384642961205e-05, + "loss": 
1.7576, + "step": 14972 + }, + { + "epoch": 4.595764272559853, + "grad_norm": 0.27499106526374817, + "learning_rate": 5.893357376274284e-05, + "loss": 1.7909, + "step": 14973 + }, + { + "epoch": 4.596071209330878, + "grad_norm": 0.28581273555755615, + "learning_rate": 5.8928683141078376e-05, + "loss": 1.7592, + "step": 14974 + }, + { + "epoch": 4.596378146101903, + "grad_norm": 0.23218442499637604, + "learning_rate": 5.892379243117543e-05, + "loss": 1.7142, + "step": 14975 + }, + { + "epoch": 4.596685082872928, + "grad_norm": 0.34015771746635437, + "learning_rate": 5.891890163308234e-05, + "loss": 1.7457, + "step": 14976 + }, + { + "epoch": 4.596992019643953, + "grad_norm": 0.2630012333393097, + "learning_rate": 5.8914010746847435e-05, + "loss": 1.7612, + "step": 14977 + }, + { + "epoch": 4.597298956414979, + "grad_norm": 0.2265843003988266, + "learning_rate": 5.890911977251904e-05, + "loss": 1.7272, + "step": 14978 + }, + { + "epoch": 4.597605893186004, + "grad_norm": 0.22325244545936584, + "learning_rate": 5.8904228710145505e-05, + "loss": 1.7447, + "step": 14979 + }, + { + "epoch": 4.597912829957028, + "grad_norm": 0.23512716591358185, + "learning_rate": 5.889933755977517e-05, + "loss": 1.7123, + "step": 14980 + }, + { + "epoch": 4.598219766728054, + "grad_norm": 0.22534869611263275, + "learning_rate": 5.8894446321456365e-05, + "loss": 1.785, + "step": 14981 + }, + { + "epoch": 4.598526703499079, + "grad_norm": 0.2447836697101593, + "learning_rate": 5.888955499523743e-05, + "loss": 1.7154, + "step": 14982 + }, + { + "epoch": 4.598833640270104, + "grad_norm": 0.2451140582561493, + "learning_rate": 5.88846635811667e-05, + "loss": 1.7494, + "step": 14983 + }, + { + "epoch": 4.59914057704113, + "grad_norm": 0.2253585308790207, + "learning_rate": 5.8879772079292504e-05, + "loss": 1.7591, + "step": 14984 + }, + { + "epoch": 4.599447513812155, + "grad_norm": 0.21714572608470917, + "learning_rate": 5.887488048966322e-05, + "loss": 1.7314, + "step": 14985 + }, + { + 
"epoch": 4.5997544505831796, + "grad_norm": 0.24897411465644836, + "learning_rate": 5.8869988812327145e-05, + "loss": 1.776, + "step": 14986 + }, + { + "epoch": 4.600061387354205, + "grad_norm": 0.22575093805789948, + "learning_rate": 5.8865097047332653e-05, + "loss": 1.7168, + "step": 14987 + }, + { + "epoch": 4.60036832412523, + "grad_norm": 0.22857412695884705, + "learning_rate": 5.886020519472808e-05, + "loss": 1.8262, + "step": 14988 + }, + { + "epoch": 4.600675260896256, + "grad_norm": 0.22741298377513885, + "learning_rate": 5.885531325456174e-05, + "loss": 1.6732, + "step": 14989 + }, + { + "epoch": 4.600982197667281, + "grad_norm": 0.2229645550251007, + "learning_rate": 5.885042122688202e-05, + "loss": 1.7384, + "step": 14990 + }, + { + "epoch": 4.601289134438305, + "grad_norm": 0.22609494626522064, + "learning_rate": 5.884552911173726e-05, + "loss": 1.714, + "step": 14991 + }, + { + "epoch": 4.601596071209331, + "grad_norm": 0.2629149854183197, + "learning_rate": 5.884063690917578e-05, + "loss": 1.8133, + "step": 14992 + }, + { + "epoch": 4.601903007980356, + "grad_norm": 0.220725417137146, + "learning_rate": 5.883574461924597e-05, + "loss": 1.6898, + "step": 14993 + }, + { + "epoch": 4.602209944751381, + "grad_norm": 0.207612082362175, + "learning_rate": 5.8830852241996135e-05, + "loss": 1.7302, + "step": 14994 + }, + { + "epoch": 4.602516881522407, + "grad_norm": 0.22418084740638733, + "learning_rate": 5.8825959777474625e-05, + "loss": 1.763, + "step": 14995 + }, + { + "epoch": 4.602823818293432, + "grad_norm": 0.30606865882873535, + "learning_rate": 5.882106722572983e-05, + "loss": 1.7657, + "step": 14996 + }, + { + "epoch": 4.6031307550644565, + "grad_norm": 0.2947966456413269, + "learning_rate": 5.881617458681008e-05, + "loss": 1.7796, + "step": 14997 + }, + { + "epoch": 4.603437691835482, + "grad_norm": 0.23430216312408447, + "learning_rate": 5.881128186076372e-05, + "loss": 1.78, + "step": 14998 + }, + { + "epoch": 4.603744628606507, + "grad_norm": 
0.28081849217414856, + "learning_rate": 5.880638904763911e-05, + "loss": 1.6791, + "step": 14999 + }, + { + "epoch": 4.6040515653775325, + "grad_norm": 0.25459226965904236, + "learning_rate": 5.88014961474846e-05, + "loss": 1.8064, + "step": 15000 + }, + { + "epoch": 4.604358502148557, + "grad_norm": 0.2358713001012802, + "learning_rate": 5.879660316034854e-05, + "loss": 1.763, + "step": 15001 + }, + { + "epoch": 4.604665438919582, + "grad_norm": 0.32954758405685425, + "learning_rate": 5.879171008627931e-05, + "loss": 1.7462, + "step": 15002 + }, + { + "epoch": 4.604972375690608, + "grad_norm": 0.2588615417480469, + "learning_rate": 5.878681692532523e-05, + "loss": 1.7771, + "step": 15003 + }, + { + "epoch": 4.605279312461633, + "grad_norm": 0.21216195821762085, + "learning_rate": 5.878192367753468e-05, + "loss": 1.7128, + "step": 15004 + }, + { + "epoch": 4.605586249232658, + "grad_norm": 0.26849040389060974, + "learning_rate": 5.8777030342956016e-05, + "loss": 1.7048, + "step": 15005 + }, + { + "epoch": 4.605893186003684, + "grad_norm": 0.22343295812606812, + "learning_rate": 5.877213692163759e-05, + "loss": 1.7695, + "step": 15006 + }, + { + "epoch": 4.606200122774708, + "grad_norm": 0.2794288694858551, + "learning_rate": 5.876724341362776e-05, + "loss": 1.7856, + "step": 15007 + }, + { + "epoch": 4.606507059545733, + "grad_norm": 0.3525427579879761, + "learning_rate": 5.8762349818974905e-05, + "loss": 1.7807, + "step": 15008 + }, + { + "epoch": 4.606813996316759, + "grad_norm": 0.25886499881744385, + "learning_rate": 5.875745613772736e-05, + "loss": 1.7818, + "step": 15009 + }, + { + "epoch": 4.607120933087784, + "grad_norm": 0.24822987616062164, + "learning_rate": 5.8752562369933515e-05, + "loss": 1.7369, + "step": 15010 + }, + { + "epoch": 4.607427869858809, + "grad_norm": 0.26067355275154114, + "learning_rate": 5.874766851564171e-05, + "loss": 1.7056, + "step": 15011 + }, + { + "epoch": 4.607734806629834, + "grad_norm": 0.2869747579097748, + "learning_rate": 
5.874277457490033e-05, + "loss": 1.7284, + "step": 15012 + }, + { + "epoch": 4.608041743400859, + "grad_norm": 0.23153580725193024, + "learning_rate": 5.87378805477577e-05, + "loss": 1.7331, + "step": 15013 + }, + { + "epoch": 4.6083486801718845, + "grad_norm": 0.29307299852371216, + "learning_rate": 5.873298643426223e-05, + "loss": 1.7376, + "step": 15014 + }, + { + "epoch": 4.60865561694291, + "grad_norm": 0.25638771057128906, + "learning_rate": 5.872809223446227e-05, + "loss": 1.7585, + "step": 15015 + }, + { + "epoch": 4.608962553713935, + "grad_norm": 0.2272702306509018, + "learning_rate": 5.872319794840618e-05, + "loss": 1.7482, + "step": 15016 + }, + { + "epoch": 4.6092694904849605, + "grad_norm": 0.2579486072063446, + "learning_rate": 5.8718303576142356e-05, + "loss": 1.778, + "step": 15017 + }, + { + "epoch": 4.609576427255985, + "grad_norm": 0.2216452956199646, + "learning_rate": 5.871340911771912e-05, + "loss": 1.7517, + "step": 15018 + }, + { + "epoch": 4.60988336402701, + "grad_norm": 0.22628961503505707, + "learning_rate": 5.870851457318488e-05, + "loss": 1.7579, + "step": 15019 + }, + { + "epoch": 4.610190300798036, + "grad_norm": 0.31018149852752686, + "learning_rate": 5.8703619942588e-05, + "loss": 1.7911, + "step": 15020 + }, + { + "epoch": 4.610497237569061, + "grad_norm": 0.2618122100830078, + "learning_rate": 5.869872522597683e-05, + "loss": 1.8121, + "step": 15021 + }, + { + "epoch": 4.610804174340086, + "grad_norm": 0.26085740327835083, + "learning_rate": 5.869383042339978e-05, + "loss": 1.7952, + "step": 15022 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.25237780809402466, + "learning_rate": 5.86889355349052e-05, + "loss": 1.7575, + "step": 15023 + }, + { + "epoch": 4.611418047882136, + "grad_norm": 0.27550897002220154, + "learning_rate": 5.868404056054144e-05, + "loss": 1.7816, + "step": 15024 + }, + { + "epoch": 4.611724984653161, + "grad_norm": 0.2458692342042923, + "learning_rate": 5.8679145500356926e-05, + "loss": 1.7783, + 
"step": 15025 + }, + { + "epoch": 4.612031921424187, + "grad_norm": 0.25606176257133484, + "learning_rate": 5.867425035439999e-05, + "loss": 1.7863, + "step": 15026 + }, + { + "epoch": 4.612338858195212, + "grad_norm": 0.3206995725631714, + "learning_rate": 5.866935512271905e-05, + "loss": 1.7468, + "step": 15027 + }, + { + "epoch": 4.612645794966237, + "grad_norm": 0.2754824459552765, + "learning_rate": 5.866445980536245e-05, + "loss": 1.793, + "step": 15028 + }, + { + "epoch": 4.612952731737262, + "grad_norm": 0.25168612599372864, + "learning_rate": 5.865956440237859e-05, + "loss": 1.7252, + "step": 15029 + }, + { + "epoch": 4.613259668508287, + "grad_norm": 0.3226735293865204, + "learning_rate": 5.8654668913815815e-05, + "loss": 1.7291, + "step": 15030 + }, + { + "epoch": 4.6135666052793125, + "grad_norm": 0.2580295503139496, + "learning_rate": 5.864977333972255e-05, + "loss": 1.7622, + "step": 15031 + }, + { + "epoch": 4.613873542050338, + "grad_norm": 0.21486075222492218, + "learning_rate": 5.864487768014715e-05, + "loss": 1.7662, + "step": 15032 + }, + { + "epoch": 4.614180478821363, + "grad_norm": 0.2331690639257431, + "learning_rate": 5.8639981935137996e-05, + "loss": 1.7389, + "step": 15033 + }, + { + "epoch": 4.614487415592388, + "grad_norm": 0.2573511302471161, + "learning_rate": 5.863508610474348e-05, + "loss": 1.7699, + "step": 15034 + }, + { + "epoch": 4.614794352363413, + "grad_norm": 0.2260694056749344, + "learning_rate": 5.863019018901199e-05, + "loss": 1.7784, + "step": 15035 + }, + { + "epoch": 4.615101289134438, + "grad_norm": 0.2283065915107727, + "learning_rate": 5.8625294187991895e-05, + "loss": 1.7061, + "step": 15036 + }, + { + "epoch": 4.615408225905464, + "grad_norm": 0.24772310256958008, + "learning_rate": 5.862039810173159e-05, + "loss": 1.7568, + "step": 15037 + }, + { + "epoch": 4.615715162676489, + "grad_norm": 0.2515513002872467, + "learning_rate": 5.861550193027945e-05, + "loss": 1.7445, + "step": 15038 + }, + { + "epoch": 
4.616022099447514, + "grad_norm": 0.26472151279449463, + "learning_rate": 5.8610605673683885e-05, + "loss": 1.7735, + "step": 15039 + }, + { + "epoch": 4.616329036218539, + "grad_norm": 0.24053528904914856, + "learning_rate": 5.8605709331993254e-05, + "loss": 1.8009, + "step": 15040 + }, + { + "epoch": 4.616635972989564, + "grad_norm": 0.25125381350517273, + "learning_rate": 5.860081290525596e-05, + "loss": 1.7712, + "step": 15041 + }, + { + "epoch": 4.616942909760589, + "grad_norm": 0.23056018352508545, + "learning_rate": 5.85959163935204e-05, + "loss": 1.7684, + "step": 15042 + }, + { + "epoch": 4.617249846531615, + "grad_norm": 0.2533007562160492, + "learning_rate": 5.859101979683494e-05, + "loss": 1.7793, + "step": 15043 + }, + { + "epoch": 4.617556783302639, + "grad_norm": 0.21007375419139862, + "learning_rate": 5.8586123115248e-05, + "loss": 1.7484, + "step": 15044 + }, + { + "epoch": 4.6178637200736645, + "grad_norm": 0.21329566836357117, + "learning_rate": 5.858122634880797e-05, + "loss": 1.7763, + "step": 15045 + }, + { + "epoch": 4.61817065684469, + "grad_norm": 0.2362898588180542, + "learning_rate": 5.857632949756322e-05, + "loss": 1.7484, + "step": 15046 + }, + { + "epoch": 4.618477593615715, + "grad_norm": 0.2168794423341751, + "learning_rate": 5.857143256156214e-05, + "loss": 1.7752, + "step": 15047 + }, + { + "epoch": 4.6187845303867405, + "grad_norm": 0.24761471152305603, + "learning_rate": 5.856653554085316e-05, + "loss": 1.7793, + "step": 15048 + }, + { + "epoch": 4.619091467157766, + "grad_norm": 0.23202158510684967, + "learning_rate": 5.856163843548466e-05, + "loss": 1.6862, + "step": 15049 + }, + { + "epoch": 4.61939840392879, + "grad_norm": 0.23868000507354736, + "learning_rate": 5.855674124550501e-05, + "loss": 1.8075, + "step": 15050 + }, + { + "epoch": 4.619705340699816, + "grad_norm": 0.3063114583492279, + "learning_rate": 5.855184397096265e-05, + "loss": 1.8051, + "step": 15051 + }, + { + "epoch": 4.620012277470841, + "grad_norm": 
0.22672493755817413, + "learning_rate": 5.854694661190594e-05, + "loss": 1.7478, + "step": 15052 + }, + { + "epoch": 4.620319214241866, + "grad_norm": 0.3403559923171997, + "learning_rate": 5.8542049168383296e-05, + "loss": 1.765, + "step": 15053 + }, + { + "epoch": 4.620626151012892, + "grad_norm": 0.33852189779281616, + "learning_rate": 5.853715164044312e-05, + "loss": 1.7602, + "step": 15054 + }, + { + "epoch": 4.620933087783916, + "grad_norm": 0.25166940689086914, + "learning_rate": 5.85322540281338e-05, + "loss": 1.7584, + "step": 15055 + }, + { + "epoch": 4.621240024554941, + "grad_norm": 0.3417987823486328, + "learning_rate": 5.8527356331503757e-05, + "loss": 1.8491, + "step": 15056 + }, + { + "epoch": 4.621546961325967, + "grad_norm": 0.3286994397640228, + "learning_rate": 5.852245855060138e-05, + "loss": 1.7146, + "step": 15057 + }, + { + "epoch": 4.621853898096992, + "grad_norm": 0.24394257366657257, + "learning_rate": 5.851756068547505e-05, + "loss": 1.8762, + "step": 15058 + }, + { + "epoch": 4.622160834868017, + "grad_norm": 0.34945347905158997, + "learning_rate": 5.851266273617321e-05, + "loss": 1.8086, + "step": 15059 + }, + { + "epoch": 4.622467771639043, + "grad_norm": 0.30189210176467896, + "learning_rate": 5.850776470274425e-05, + "loss": 1.7366, + "step": 15060 + }, + { + "epoch": 4.622774708410067, + "grad_norm": 0.24050579965114594, + "learning_rate": 5.850286658523657e-05, + "loss": 1.7599, + "step": 15061 + }, + { + "epoch": 4.6230816451810925, + "grad_norm": 0.33650726079940796, + "learning_rate": 5.849796838369857e-05, + "loss": 1.7343, + "step": 15062 + }, + { + "epoch": 4.623388581952118, + "grad_norm": 0.2855902910232544, + "learning_rate": 5.849307009817868e-05, + "loss": 1.7325, + "step": 15063 + }, + { + "epoch": 4.623695518723143, + "grad_norm": 0.2562592923641205, + "learning_rate": 5.8488171728725275e-05, + "loss": 1.7772, + "step": 15064 + }, + { + "epoch": 4.6240024554941686, + "grad_norm": 0.23494984209537506, + 
"learning_rate": 5.84832732753868e-05, + "loss": 1.7263, + "step": 15065 + }, + { + "epoch": 4.624309392265193, + "grad_norm": 0.23248226940631866, + "learning_rate": 5.847837473821164e-05, + "loss": 1.7441, + "step": 15066 + }, + { + "epoch": 4.624616329036218, + "grad_norm": 0.2291254848241806, + "learning_rate": 5.847347611724821e-05, + "loss": 1.7742, + "step": 15067 + }, + { + "epoch": 4.624923265807244, + "grad_norm": 0.28305280208587646, + "learning_rate": 5.8468577412544925e-05, + "loss": 1.8224, + "step": 15068 + }, + { + "epoch": 4.625230202578269, + "grad_norm": 0.25531691312789917, + "learning_rate": 5.84636786241502e-05, + "loss": 1.7458, + "step": 15069 + }, + { + "epoch": 4.625537139349294, + "grad_norm": 0.2363462746143341, + "learning_rate": 5.845877975211242e-05, + "loss": 1.7977, + "step": 15070 + }, + { + "epoch": 4.62584407612032, + "grad_norm": 0.2707001864910126, + "learning_rate": 5.845388079648004e-05, + "loss": 1.774, + "step": 15071 + }, + { + "epoch": 4.626151012891344, + "grad_norm": 0.22281844913959503, + "learning_rate": 5.844898175730146e-05, + "loss": 1.7888, + "step": 15072 + }, + { + "epoch": 4.6264579496623695, + "grad_norm": 0.24809995293617249, + "learning_rate": 5.8444082634625086e-05, + "loss": 1.7895, + "step": 15073 + }, + { + "epoch": 4.626764886433395, + "grad_norm": 0.2842096984386444, + "learning_rate": 5.843918342849933e-05, + "loss": 1.7323, + "step": 15074 + }, + { + "epoch": 4.62707182320442, + "grad_norm": 0.21343614161014557, + "learning_rate": 5.843428413897261e-05, + "loss": 1.7298, + "step": 15075 + }, + { + "epoch": 4.627378759975445, + "grad_norm": 0.2420526146888733, + "learning_rate": 5.842938476609336e-05, + "loss": 1.778, + "step": 15076 + }, + { + "epoch": 4.62768569674647, + "grad_norm": 0.22202003002166748, + "learning_rate": 5.842448530990999e-05, + "loss": 1.779, + "step": 15077 + }, + { + "epoch": 4.627992633517495, + "grad_norm": 0.26784011721611023, + "learning_rate": 5.841958577047092e-05, + 
"loss": 1.799, + "step": 15078 + }, + { + "epoch": 4.628299570288521, + "grad_norm": 0.3230212926864624, + "learning_rate": 5.841468614782457e-05, + "loss": 1.7789, + "step": 15079 + }, + { + "epoch": 4.628606507059546, + "grad_norm": 0.24062715470790863, + "learning_rate": 5.840978644201935e-05, + "loss": 1.7697, + "step": 15080 + }, + { + "epoch": 4.628913443830571, + "grad_norm": 0.2882130444049835, + "learning_rate": 5.84048866531037e-05, + "loss": 1.7946, + "step": 15081 + }, + { + "epoch": 4.629220380601596, + "grad_norm": 0.3145603537559509, + "learning_rate": 5.839998678112602e-05, + "loss": 1.7116, + "step": 15082 + }, + { + "epoch": 4.629527317372621, + "grad_norm": 0.270997017621994, + "learning_rate": 5.839508682613477e-05, + "loss": 1.8281, + "step": 15083 + }, + { + "epoch": 4.629834254143646, + "grad_norm": 0.27299395203590393, + "learning_rate": 5.839018678817834e-05, + "loss": 1.8233, + "step": 15084 + }, + { + "epoch": 4.630141190914672, + "grad_norm": 0.2684478461742401, + "learning_rate": 5.838528666730517e-05, + "loss": 1.8111, + "step": 15085 + }, + { + "epoch": 4.630448127685697, + "grad_norm": 0.2365201860666275, + "learning_rate": 5.838038646356367e-05, + "loss": 1.7475, + "step": 15086 + }, + { + "epoch": 4.6307550644567215, + "grad_norm": 0.2661258280277252, + "learning_rate": 5.8375486177002305e-05, + "loss": 1.748, + "step": 15087 + }, + { + "epoch": 4.631062001227747, + "grad_norm": 0.2865012586116791, + "learning_rate": 5.8370585807669455e-05, + "loss": 1.7525, + "step": 15088 + }, + { + "epoch": 4.631368937998772, + "grad_norm": 0.2445172518491745, + "learning_rate": 5.836568535561358e-05, + "loss": 1.7278, + "step": 15089 + }, + { + "epoch": 4.6316758747697975, + "grad_norm": 0.28192558884620667, + "learning_rate": 5.8360784820883083e-05, + "loss": 1.7371, + "step": 15090 + }, + { + "epoch": 4.631982811540823, + "grad_norm": 0.38927358388900757, + "learning_rate": 5.835588420352642e-05, + "loss": 1.8088, + "step": 15091 + }, + { + 
"epoch": 4.632289748311848, + "grad_norm": 0.3409229516983032, + "learning_rate": 5.8350983503592025e-05, + "loss": 1.8011, + "step": 15092 + }, + { + "epoch": 4.632596685082873, + "grad_norm": 0.2464994341135025, + "learning_rate": 5.8346082721128294e-05, + "loss": 1.8354, + "step": 15093 + }, + { + "epoch": 4.632903621853898, + "grad_norm": 0.38765814900398254, + "learning_rate": 5.834118185618369e-05, + "loss": 1.7811, + "step": 15094 + }, + { + "epoch": 4.633210558624923, + "grad_norm": 0.42435070872306824, + "learning_rate": 5.833628090880664e-05, + "loss": 1.7855, + "step": 15095 + }, + { + "epoch": 4.633517495395949, + "grad_norm": 0.244876891374588, + "learning_rate": 5.833137987904558e-05, + "loss": 1.7494, + "step": 15096 + }, + { + "epoch": 4.633824432166974, + "grad_norm": 0.30353477597236633, + "learning_rate": 5.8326478766948934e-05, + "loss": 1.7772, + "step": 15097 + }, + { + "epoch": 4.634131368937998, + "grad_norm": 0.38839244842529297, + "learning_rate": 5.8321577572565146e-05, + "loss": 1.7689, + "step": 15098 + }, + { + "epoch": 4.634438305709024, + "grad_norm": 0.357129842042923, + "learning_rate": 5.8316676295942644e-05, + "loss": 1.7777, + "step": 15099 + }, + { + "epoch": 4.634745242480049, + "grad_norm": 0.23458799719810486, + "learning_rate": 5.831177493712988e-05, + "loss": 1.7544, + "step": 15100 + }, + { + "epoch": 4.635052179251074, + "grad_norm": 0.23751308023929596, + "learning_rate": 5.830687349617529e-05, + "loss": 1.7491, + "step": 15101 + }, + { + "epoch": 4.6353591160221, + "grad_norm": 0.31978943943977356, + "learning_rate": 5.83019719731273e-05, + "loss": 1.7439, + "step": 15102 + }, + { + "epoch": 4.635666052793125, + "grad_norm": 0.2751142084598541, + "learning_rate": 5.829707036803438e-05, + "loss": 1.8598, + "step": 15103 + }, + { + "epoch": 4.6359729895641495, + "grad_norm": 0.23670406639575958, + "learning_rate": 5.8292168680944914e-05, + "loss": 1.7629, + "step": 15104 + }, + { + "epoch": 4.636279926335175, + 
"grad_norm": 0.2447349727153778, + "learning_rate": 5.828726691190739e-05, + "loss": 1.7606, + "step": 15105 + }, + { + "epoch": 4.6365868631062, + "grad_norm": 0.2739902436733246, + "learning_rate": 5.828236506097023e-05, + "loss": 1.707, + "step": 15106 + }, + { + "epoch": 4.6368937998772255, + "grad_norm": 0.2050863653421402, + "learning_rate": 5.82774631281819e-05, + "loss": 1.7235, + "step": 15107 + }, + { + "epoch": 4.637200736648251, + "grad_norm": 0.3005560338497162, + "learning_rate": 5.827256111359082e-05, + "loss": 1.7785, + "step": 15108 + }, + { + "epoch": 4.637507673419275, + "grad_norm": 0.27168264985084534, + "learning_rate": 5.8267659017245434e-05, + "loss": 1.7844, + "step": 15109 + }, + { + "epoch": 4.637814610190301, + "grad_norm": 0.2965840995311737, + "learning_rate": 5.82627568391942e-05, + "loss": 1.7631, + "step": 15110 + }, + { + "epoch": 4.638121546961326, + "grad_norm": 0.3114408552646637, + "learning_rate": 5.825785457948556e-05, + "loss": 1.77, + "step": 15111 + }, + { + "epoch": 4.638428483732351, + "grad_norm": 0.2638910114765167, + "learning_rate": 5.825295223816796e-05, + "loss": 1.9183, + "step": 15112 + }, + { + "epoch": 4.638735420503377, + "grad_norm": 0.3293665051460266, + "learning_rate": 5.824804981528986e-05, + "loss": 1.6779, + "step": 15113 + }, + { + "epoch": 4.639042357274402, + "grad_norm": 0.28586456179618835, + "learning_rate": 5.824314731089968e-05, + "loss": 1.7905, + "step": 15114 + }, + { + "epoch": 4.639349294045426, + "grad_norm": 0.2254554182291031, + "learning_rate": 5.8238244725045906e-05, + "loss": 1.7602, + "step": 15115 + }, + { + "epoch": 4.639656230816452, + "grad_norm": 0.2770406901836395, + "learning_rate": 5.823334205777695e-05, + "loss": 1.7789, + "step": 15116 + }, + { + "epoch": 4.639963167587477, + "grad_norm": 0.2867025136947632, + "learning_rate": 5.822843930914129e-05, + "loss": 1.7408, + "step": 15117 + }, + { + "epoch": 4.640270104358502, + "grad_norm": 0.23486989736557007, + 
"learning_rate": 5.822353647918737e-05, + "loss": 1.7489, + "step": 15118 + }, + { + "epoch": 4.640577041129527, + "grad_norm": 0.2274324595928192, + "learning_rate": 5.821863356796367e-05, + "loss": 1.768, + "step": 15119 + }, + { + "epoch": 4.640883977900552, + "grad_norm": 0.25032591819763184, + "learning_rate": 5.821373057551858e-05, + "loss": 1.7602, + "step": 15120 + }, + { + "epoch": 4.6411909146715775, + "grad_norm": 0.22332963347434998, + "learning_rate": 5.820882750190059e-05, + "loss": 1.756, + "step": 15121 + }, + { + "epoch": 4.641497851442603, + "grad_norm": 0.24975591897964478, + "learning_rate": 5.820392434715817e-05, + "loss": 1.6963, + "step": 15122 + }, + { + "epoch": 4.641804788213628, + "grad_norm": 0.27892687916755676, + "learning_rate": 5.819902111133976e-05, + "loss": 1.8295, + "step": 15123 + }, + { + "epoch": 4.6421117249846535, + "grad_norm": 0.23914897441864014, + "learning_rate": 5.819411779449381e-05, + "loss": 1.7636, + "step": 15124 + }, + { + "epoch": 4.642418661755678, + "grad_norm": 0.2349565476179123, + "learning_rate": 5.818921439666879e-05, + "loss": 1.7823, + "step": 15125 + }, + { + "epoch": 4.642725598526703, + "grad_norm": 0.2075800597667694, + "learning_rate": 5.818431091791315e-05, + "loss": 1.7282, + "step": 15126 + }, + { + "epoch": 4.643032535297729, + "grad_norm": 0.19781073927879333, + "learning_rate": 5.817940735827535e-05, + "loss": 1.7598, + "step": 15127 + }, + { + "epoch": 4.643339472068754, + "grad_norm": 0.21997439861297607, + "learning_rate": 5.8174503717803866e-05, + "loss": 1.766, + "step": 15128 + }, + { + "epoch": 4.643646408839779, + "grad_norm": 0.23971444368362427, + "learning_rate": 5.816959999654713e-05, + "loss": 1.7824, + "step": 15129 + }, + { + "epoch": 4.643953345610804, + "grad_norm": 0.23357853293418884, + "learning_rate": 5.816469619455363e-05, + "loss": 1.7353, + "step": 15130 + }, + { + "epoch": 4.644260282381829, + "grad_norm": 0.22030897438526154, + "learning_rate": 5.815979231187181e-05, 
+ "loss": 1.7413, + "step": 15131 + }, + { + "epoch": 4.644567219152854, + "grad_norm": 0.2322571873664856, + "learning_rate": 5.815488834855014e-05, + "loss": 1.7305, + "step": 15132 + }, + { + "epoch": 4.64487415592388, + "grad_norm": 0.25256821513175964, + "learning_rate": 5.814998430463709e-05, + "loss": 1.7533, + "step": 15133 + }, + { + "epoch": 4.645181092694905, + "grad_norm": 0.248504638671875, + "learning_rate": 5.81450801801811e-05, + "loss": 1.7345, + "step": 15134 + }, + { + "epoch": 4.64548802946593, + "grad_norm": 0.22850964963436127, + "learning_rate": 5.8140175975230673e-05, + "loss": 1.8308, + "step": 15135 + }, + { + "epoch": 4.645794966236955, + "grad_norm": 0.3517951965332031, + "learning_rate": 5.813527168983426e-05, + "loss": 1.811, + "step": 15136 + }, + { + "epoch": 4.64610190300798, + "grad_norm": 0.32132068276405334, + "learning_rate": 5.813036732404031e-05, + "loss": 1.7584, + "step": 15137 + }, + { + "epoch": 4.6464088397790055, + "grad_norm": 0.2349396049976349, + "learning_rate": 5.812546287789731e-05, + "loss": 1.7762, + "step": 15138 + }, + { + "epoch": 4.646715776550031, + "grad_norm": 0.23519493639469147, + "learning_rate": 5.812055835145372e-05, + "loss": 1.7428, + "step": 15139 + }, + { + "epoch": 4.647022713321056, + "grad_norm": 0.29277852177619934, + "learning_rate": 5.8115653744758016e-05, + "loss": 1.7599, + "step": 15140 + }, + { + "epoch": 4.647329650092081, + "grad_norm": 0.2347593754529953, + "learning_rate": 5.811074905785867e-05, + "loss": 1.7401, + "step": 15141 + }, + { + "epoch": 4.647636586863106, + "grad_norm": 0.23080264031887054, + "learning_rate": 5.8105844290804147e-05, + "loss": 1.7705, + "step": 15142 + }, + { + "epoch": 4.647943523634131, + "grad_norm": 0.24686801433563232, + "learning_rate": 5.810093944364291e-05, + "loss": 1.7409, + "step": 15143 + }, + { + "epoch": 4.648250460405157, + "grad_norm": 0.24098120629787445, + "learning_rate": 5.809603451642344e-05, + "loss": 1.7893, + "step": 15144 + }, + { 
+ "epoch": 4.648557397176182, + "grad_norm": 0.23020638525485992, + "learning_rate": 5.809112950919422e-05, + "loss": 1.7589, + "step": 15145 + }, + { + "epoch": 4.648864333947207, + "grad_norm": 0.3036736249923706, + "learning_rate": 5.808622442200371e-05, + "loss": 1.7964, + "step": 15146 + }, + { + "epoch": 4.649171270718232, + "grad_norm": 0.2965635657310486, + "learning_rate": 5.808131925490039e-05, + "loss": 1.7986, + "step": 15147 + }, + { + "epoch": 4.649478207489257, + "grad_norm": 0.22241640090942383, + "learning_rate": 5.8076414007932745e-05, + "loss": 1.749, + "step": 15148 + }, + { + "epoch": 4.649785144260282, + "grad_norm": 0.20304246246814728, + "learning_rate": 5.8071508681149246e-05, + "loss": 1.7374, + "step": 15149 + }, + { + "epoch": 4.650092081031308, + "grad_norm": 0.19534410536289215, + "learning_rate": 5.806660327459834e-05, + "loss": 1.7087, + "step": 15150 + }, + { + "epoch": 4.650399017802332, + "grad_norm": 0.2151753008365631, + "learning_rate": 5.806169778832856e-05, + "loss": 1.7409, + "step": 15151 + }, + { + "epoch": 4.650705954573358, + "grad_norm": 0.2180301696062088, + "learning_rate": 5.805679222238836e-05, + "loss": 1.7522, + "step": 15152 + }, + { + "epoch": 4.651012891344383, + "grad_norm": 0.19917607307434082, + "learning_rate": 5.8051886576826205e-05, + "loss": 1.768, + "step": 15153 + }, + { + "epoch": 4.651319828115408, + "grad_norm": 0.2312052994966507, + "learning_rate": 5.804698085169059e-05, + "loss": 1.7799, + "step": 15154 + }, + { + "epoch": 4.651626764886434, + "grad_norm": 0.21541514992713928, + "learning_rate": 5.804207504702999e-05, + "loss": 1.7595, + "step": 15155 + }, + { + "epoch": 4.651933701657459, + "grad_norm": 0.2029450386762619, + "learning_rate": 5.803716916289289e-05, + "loss": 1.7727, + "step": 15156 + }, + { + "epoch": 4.652240638428484, + "grad_norm": 0.21796850860118866, + "learning_rate": 5.8032263199327787e-05, + "loss": 1.7445, + "step": 15157 + }, + { + "epoch": 4.652547575199509, + 
"grad_norm": 0.20309078693389893, + "learning_rate": 5.802735715638314e-05, + "loss": 1.6971, + "step": 15158 + }, + { + "epoch": 4.652854511970534, + "grad_norm": 0.21270112693309784, + "learning_rate": 5.802245103410745e-05, + "loss": 1.7162, + "step": 15159 + }, + { + "epoch": 4.653161448741559, + "grad_norm": 0.25357750058174133, + "learning_rate": 5.8017544832549184e-05, + "loss": 1.7534, + "step": 15160 + }, + { + "epoch": 4.653468385512585, + "grad_norm": 0.24015015363693237, + "learning_rate": 5.8012638551756847e-05, + "loss": 1.7639, + "step": 15161 + }, + { + "epoch": 4.653775322283609, + "grad_norm": 0.20507018268108368, + "learning_rate": 5.800773219177893e-05, + "loss": 1.7293, + "step": 15162 + }, + { + "epoch": 4.6540822590546345, + "grad_norm": 0.23399868607521057, + "learning_rate": 5.800282575266389e-05, + "loss": 1.8286, + "step": 15163 + }, + { + "epoch": 4.65438919582566, + "grad_norm": 0.27126726508140564, + "learning_rate": 5.799791923446025e-05, + "loss": 1.8028, + "step": 15164 + }, + { + "epoch": 4.654696132596685, + "grad_norm": 0.23644569516181946, + "learning_rate": 5.7993012637216494e-05, + "loss": 1.7138, + "step": 15165 + }, + { + "epoch": 4.6550030693677105, + "grad_norm": 0.21557916700839996, + "learning_rate": 5.7988105960981086e-05, + "loss": 1.7703, + "step": 15166 + }, + { + "epoch": 4.655310006138736, + "grad_norm": 0.22030150890350342, + "learning_rate": 5.798319920580254e-05, + "loss": 1.7282, + "step": 15167 + }, + { + "epoch": 4.65561694290976, + "grad_norm": 0.2092939168214798, + "learning_rate": 5.7978292371729325e-05, + "loss": 1.7853, + "step": 15168 + }, + { + "epoch": 4.655923879680786, + "grad_norm": 0.21643707156181335, + "learning_rate": 5.797338545880997e-05, + "loss": 1.7582, + "step": 15169 + }, + { + "epoch": 4.656230816451811, + "grad_norm": 0.3064669668674469, + "learning_rate": 5.796847846709294e-05, + "loss": 1.8139, + "step": 15170 + }, + { + "epoch": 4.656537753222836, + "grad_norm": 0.3060479760169983, 
+ "learning_rate": 5.796357139662674e-05, + "loss": 1.7356, + "step": 15171 + }, + { + "epoch": 4.656844689993862, + "grad_norm": 0.23546656966209412, + "learning_rate": 5.7958664247459835e-05, + "loss": 1.7937, + "step": 15172 + }, + { + "epoch": 4.657151626764886, + "grad_norm": 0.2890888750553131, + "learning_rate": 5.795375701964077e-05, + "loss": 1.7305, + "step": 15173 + }, + { + "epoch": 4.657458563535911, + "grad_norm": 0.27948084473609924, + "learning_rate": 5.794884971321801e-05, + "loss": 1.7428, + "step": 15174 + }, + { + "epoch": 4.657765500306937, + "grad_norm": 0.2354089468717575, + "learning_rate": 5.794394232824007e-05, + "loss": 1.7622, + "step": 15175 + }, + { + "epoch": 4.658072437077962, + "grad_norm": 0.3271159827709198, + "learning_rate": 5.793903486475541e-05, + "loss": 1.7826, + "step": 15176 + }, + { + "epoch": 4.658379373848987, + "grad_norm": 0.3561338782310486, + "learning_rate": 5.793412732281257e-05, + "loss": 1.7698, + "step": 15177 + }, + { + "epoch": 4.658686310620013, + "grad_norm": 0.2913050949573517, + "learning_rate": 5.7929219702460035e-05, + "loss": 1.8156, + "step": 15178 + }, + { + "epoch": 4.658993247391037, + "grad_norm": 0.2345089465379715, + "learning_rate": 5.7924312003746294e-05, + "loss": 1.7859, + "step": 15179 + }, + { + "epoch": 4.6593001841620625, + "grad_norm": 0.3018132150173187, + "learning_rate": 5.7919404226719865e-05, + "loss": 1.7622, + "step": 15180 + }, + { + "epoch": 4.659607120933088, + "grad_norm": 0.29134172201156616, + "learning_rate": 5.791449637142924e-05, + "loss": 1.7287, + "step": 15181 + }, + { + "epoch": 4.659914057704113, + "grad_norm": 0.24126321077346802, + "learning_rate": 5.7909588437922924e-05, + "loss": 1.7969, + "step": 15182 + }, + { + "epoch": 4.6602209944751385, + "grad_norm": 0.27053284645080566, + "learning_rate": 5.7904680426249415e-05, + "loss": 1.7399, + "step": 15183 + }, + { + "epoch": 4.660527931246163, + "grad_norm": 0.2636512219905853, + "learning_rate": 
5.789977233645722e-05, + "loss": 1.7615, + "step": 15184 + }, + { + "epoch": 4.660834868017188, + "grad_norm": 0.2263207584619522, + "learning_rate": 5.789486416859484e-05, + "loss": 1.7668, + "step": 15185 + }, + { + "epoch": 4.661141804788214, + "grad_norm": 0.25387826561927795, + "learning_rate": 5.78899559227108e-05, + "loss": 1.7594, + "step": 15186 + }, + { + "epoch": 4.661448741559239, + "grad_norm": 0.2268977165222168, + "learning_rate": 5.7885047598853596e-05, + "loss": 1.75, + "step": 15187 + }, + { + "epoch": 4.661755678330264, + "grad_norm": 0.29093095660209656, + "learning_rate": 5.788013919707172e-05, + "loss": 1.7291, + "step": 15188 + }, + { + "epoch": 4.66206261510129, + "grad_norm": 0.26578736305236816, + "learning_rate": 5.7875230717413684e-05, + "loss": 1.7276, + "step": 15189 + }, + { + "epoch": 4.662369551872314, + "grad_norm": 0.2548983097076416, + "learning_rate": 5.7870322159928e-05, + "loss": 1.755, + "step": 15190 + }, + { + "epoch": 4.662676488643339, + "grad_norm": 0.2246701419353485, + "learning_rate": 5.7865413524663184e-05, + "loss": 1.751, + "step": 15191 + }, + { + "epoch": 4.662983425414365, + "grad_norm": 0.3069002032279968, + "learning_rate": 5.7860504811667747e-05, + "loss": 1.7522, + "step": 15192 + }, + { + "epoch": 4.66329036218539, + "grad_norm": 0.3081241250038147, + "learning_rate": 5.7855596020990186e-05, + "loss": 1.7152, + "step": 15193 + }, + { + "epoch": 4.6635972989564145, + "grad_norm": 0.29006731510162354, + "learning_rate": 5.7850687152679026e-05, + "loss": 1.8471, + "step": 15194 + }, + { + "epoch": 4.66390423572744, + "grad_norm": 0.24131664633750916, + "learning_rate": 5.7845778206782786e-05, + "loss": 1.763, + "step": 15195 + }, + { + "epoch": 4.664211172498465, + "grad_norm": 0.21808001399040222, + "learning_rate": 5.784086918334994e-05, + "loss": 1.6989, + "step": 15196 + }, + { + "epoch": 4.6645181092694905, + "grad_norm": 0.2413240373134613, + "learning_rate": 5.783596008242904e-05, + "loss": 1.7869, + 
"step": 15197 + }, + { + "epoch": 4.664825046040516, + "grad_norm": 0.23310934007167816, + "learning_rate": 5.7831050904068594e-05, + "loss": 1.8017, + "step": 15198 + }, + { + "epoch": 4.665131982811541, + "grad_norm": 0.2577926814556122, + "learning_rate": 5.7826141648317125e-05, + "loss": 1.6938, + "step": 15199 + }, + { + "epoch": 4.665438919582566, + "grad_norm": 0.22523443400859833, + "learning_rate": 5.782123231522312e-05, + "loss": 1.8104, + "step": 15200 + }, + { + "epoch": 4.665745856353591, + "grad_norm": 0.23603026568889618, + "learning_rate": 5.781632290483512e-05, + "loss": 1.7484, + "step": 15201 + }, + { + "epoch": 4.666052793124616, + "grad_norm": 0.23195989429950714, + "learning_rate": 5.781141341720162e-05, + "loss": 1.7786, + "step": 15202 + }, + { + "epoch": 4.666359729895642, + "grad_norm": 0.21838274598121643, + "learning_rate": 5.780650385237118e-05, + "loss": 1.7509, + "step": 15203 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.26656514406204224, + "learning_rate": 5.780159421039229e-05, + "loss": 1.7875, + "step": 15204 + }, + { + "epoch": 4.666973603437691, + "grad_norm": 0.2293243706226349, + "learning_rate": 5.7796684491313456e-05, + "loss": 1.7518, + "step": 15205 + }, + { + "epoch": 4.667280540208717, + "grad_norm": 0.24190817773342133, + "learning_rate": 5.779177469518323e-05, + "loss": 1.7593, + "step": 15206 + }, + { + "epoch": 4.667587476979742, + "grad_norm": 0.31113871932029724, + "learning_rate": 5.77868648220501e-05, + "loss": 1.7911, + "step": 15207 + }, + { + "epoch": 4.667894413750767, + "grad_norm": 0.2875262498855591, + "learning_rate": 5.778195487196263e-05, + "loss": 1.7871, + "step": 15208 + }, + { + "epoch": 4.668201350521793, + "grad_norm": 0.2172149419784546, + "learning_rate": 5.777704484496931e-05, + "loss": 1.7592, + "step": 15209 + }, + { + "epoch": 4.668508287292818, + "grad_norm": 0.3282458186149597, + "learning_rate": 5.7772134741118675e-05, + "loss": 1.7687, + "step": 15210 + }, + { + "epoch": 
4.6688152240638425, + "grad_norm": 0.36963000893592834, + "learning_rate": 5.7767224560459255e-05, + "loss": 1.812, + "step": 15211 + }, + { + "epoch": 4.669122160834868, + "grad_norm": 0.22387740015983582, + "learning_rate": 5.776231430303957e-05, + "loss": 1.7449, + "step": 15212 + }, + { + "epoch": 4.669429097605893, + "grad_norm": 0.21468734741210938, + "learning_rate": 5.775740396890813e-05, + "loss": 1.716, + "step": 15213 + }, + { + "epoch": 4.6697360343769185, + "grad_norm": 0.2478475719690323, + "learning_rate": 5.7752493558113486e-05, + "loss": 1.7182, + "step": 15214 + }, + { + "epoch": 4.670042971147944, + "grad_norm": 0.20924845337867737, + "learning_rate": 5.774758307070416e-05, + "loss": 1.784, + "step": 15215 + }, + { + "epoch": 4.670349907918968, + "grad_norm": 0.2933209538459778, + "learning_rate": 5.774267250672868e-05, + "loss": 1.8375, + "step": 15216 + }, + { + "epoch": 4.670656844689994, + "grad_norm": 0.2744538486003876, + "learning_rate": 5.7737761866235565e-05, + "loss": 1.7019, + "step": 15217 + }, + { + "epoch": 4.670963781461019, + "grad_norm": 0.20991720259189606, + "learning_rate": 5.773285114927336e-05, + "loss": 1.7189, + "step": 15218 + }, + { + "epoch": 4.671270718232044, + "grad_norm": 0.2873254716396332, + "learning_rate": 5.772794035589057e-05, + "loss": 1.7492, + "step": 15219 + }, + { + "epoch": 4.67157765500307, + "grad_norm": 0.2781519591808319, + "learning_rate": 5.772302948613576e-05, + "loss": 1.7342, + "step": 15220 + }, + { + "epoch": 4.671884591774095, + "grad_norm": 0.23288768529891968, + "learning_rate": 5.7718118540057455e-05, + "loss": 1.7245, + "step": 15221 + }, + { + "epoch": 4.672191528545119, + "grad_norm": 0.40817564725875854, + "learning_rate": 5.771320751770417e-05, + "loss": 1.7659, + "step": 15222 + }, + { + "epoch": 4.672498465316145, + "grad_norm": 0.45521771907806396, + "learning_rate": 5.770829641912444e-05, + "loss": 1.7875, + "step": 15223 + }, + { + "epoch": 4.67280540208717, + "grad_norm": 
0.22353248298168182, + "learning_rate": 5.77033852443668e-05, + "loss": 1.7098, + "step": 15224 + }, + { + "epoch": 4.673112338858195, + "grad_norm": 0.4066791534423828, + "learning_rate": 5.769847399347981e-05, + "loss": 1.7277, + "step": 15225 + }, + { + "epoch": 4.67341927562922, + "grad_norm": 0.4299545884132385, + "learning_rate": 5.769356266651198e-05, + "loss": 1.7777, + "step": 15226 + }, + { + "epoch": 4.673726212400245, + "grad_norm": 0.21037638187408447, + "learning_rate": 5.768865126351186e-05, + "loss": 1.7263, + "step": 15227 + }, + { + "epoch": 4.6740331491712706, + "grad_norm": 0.3390437066555023, + "learning_rate": 5.768373978452798e-05, + "loss": 1.7457, + "step": 15228 + }, + { + "epoch": 4.674340085942296, + "grad_norm": 0.40003323554992676, + "learning_rate": 5.767882822960887e-05, + "loss": 1.8137, + "step": 15229 + }, + { + "epoch": 4.674647022713321, + "grad_norm": 0.2212848961353302, + "learning_rate": 5.767391659880308e-05, + "loss": 1.7131, + "step": 15230 + }, + { + "epoch": 4.6749539594843466, + "grad_norm": 0.30634984374046326, + "learning_rate": 5.766900489215915e-05, + "loss": 1.7775, + "step": 15231 + }, + { + "epoch": 4.675260896255372, + "grad_norm": 0.31412798166275024, + "learning_rate": 5.766409310972563e-05, + "loss": 1.7383, + "step": 15232 + }, + { + "epoch": 4.675567833026396, + "grad_norm": 0.21125225722789764, + "learning_rate": 5.7659181251551045e-05, + "loss": 1.8046, + "step": 15233 + }, + { + "epoch": 4.675874769797422, + "grad_norm": 0.3234494924545288, + "learning_rate": 5.765426931768394e-05, + "loss": 1.7838, + "step": 15234 + }, + { + "epoch": 4.676181706568447, + "grad_norm": 0.2668779194355011, + "learning_rate": 5.764935730817286e-05, + "loss": 1.7464, + "step": 15235 + }, + { + "epoch": 4.676488643339472, + "grad_norm": 0.22423583269119263, + "learning_rate": 5.764444522306633e-05, + "loss": 1.7165, + "step": 15236 + }, + { + "epoch": 4.676795580110497, + "grad_norm": 0.29066675901412964, + "learning_rate": 
5.7639533062412945e-05, + "loss": 1.75, + "step": 15237 + }, + { + "epoch": 4.677102516881522, + "grad_norm": 0.2963598370552063, + "learning_rate": 5.76346208262612e-05, + "loss": 1.8168, + "step": 15238 + }, + { + "epoch": 4.6774094536525475, + "grad_norm": 0.21484358608722687, + "learning_rate": 5.7629708514659655e-05, + "loss": 1.71, + "step": 15239 + }, + { + "epoch": 4.677716390423573, + "grad_norm": 0.20657925307750702, + "learning_rate": 5.762479612765686e-05, + "loss": 1.7239, + "step": 15240 + }, + { + "epoch": 4.678023327194598, + "grad_norm": 0.21336235105991364, + "learning_rate": 5.761988366530136e-05, + "loss": 1.7952, + "step": 15241 + }, + { + "epoch": 4.6783302639656235, + "grad_norm": 0.24156586825847626, + "learning_rate": 5.7614971127641696e-05, + "loss": 1.7709, + "step": 15242 + }, + { + "epoch": 4.678637200736648, + "grad_norm": 0.2633824944496155, + "learning_rate": 5.761005851472643e-05, + "loss": 1.7404, + "step": 15243 + }, + { + "epoch": 4.678944137507673, + "grad_norm": 0.23302829265594482, + "learning_rate": 5.760514582660411e-05, + "loss": 1.7006, + "step": 15244 + }, + { + "epoch": 4.679251074278699, + "grad_norm": 0.22404874861240387, + "learning_rate": 5.7600233063323283e-05, + "loss": 1.7731, + "step": 15245 + }, + { + "epoch": 4.679558011049724, + "grad_norm": 0.23217839002609253, + "learning_rate": 5.7595320224932495e-05, + "loss": 1.7452, + "step": 15246 + }, + { + "epoch": 4.679864947820749, + "grad_norm": 0.23131491243839264, + "learning_rate": 5.7590407311480296e-05, + "loss": 1.7547, + "step": 15247 + }, + { + "epoch": 4.680171884591774, + "grad_norm": 0.21907350420951843, + "learning_rate": 5.7585494323015245e-05, + "loss": 1.7556, + "step": 15248 + }, + { + "epoch": 4.680478821362799, + "grad_norm": 0.22416768968105316, + "learning_rate": 5.7580581259585895e-05, + "loss": 1.7783, + "step": 15249 + }, + { + "epoch": 4.680785758133824, + "grad_norm": 0.20203055441379547, + "learning_rate": 5.75756681212408e-05, + "loss": 
1.7285, + "step": 15250 + }, + { + "epoch": 4.68109269490485, + "grad_norm": 0.27838602662086487, + "learning_rate": 5.75707549080285e-05, + "loss": 1.7489, + "step": 15251 + }, + { + "epoch": 4.681399631675875, + "grad_norm": 0.2415023297071457, + "learning_rate": 5.7565841619997586e-05, + "loss": 1.7453, + "step": 15252 + }, + { + "epoch": 4.6817065684469, + "grad_norm": 0.22986920177936554, + "learning_rate": 5.756092825719658e-05, + "loss": 1.7315, + "step": 15253 + }, + { + "epoch": 4.682013505217925, + "grad_norm": 0.2427850216627121, + "learning_rate": 5.755601481967404e-05, + "loss": 1.772, + "step": 15254 + }, + { + "epoch": 4.68232044198895, + "grad_norm": 0.24556589126586914, + "learning_rate": 5.755110130747854e-05, + "loss": 1.7475, + "step": 15255 + }, + { + "epoch": 4.6826273787599755, + "grad_norm": 0.25252529978752136, + "learning_rate": 5.754618772065864e-05, + "loss": 1.7152, + "step": 15256 + }, + { + "epoch": 4.682934315531001, + "grad_norm": 0.24599005281925201, + "learning_rate": 5.754127405926287e-05, + "loss": 1.7911, + "step": 15257 + }, + { + "epoch": 4.683241252302026, + "grad_norm": 0.18961480259895325, + "learning_rate": 5.7536360323339836e-05, + "loss": 1.681, + "step": 15258 + }, + { + "epoch": 4.683548189073051, + "grad_norm": 0.24372327327728271, + "learning_rate": 5.7531446512938035e-05, + "loss": 1.7771, + "step": 15259 + }, + { + "epoch": 4.683855125844076, + "grad_norm": 0.23239269852638245, + "learning_rate": 5.752653262810609e-05, + "loss": 1.7502, + "step": 15260 + }, + { + "epoch": 4.684162062615101, + "grad_norm": 0.25076135993003845, + "learning_rate": 5.752161866889254e-05, + "loss": 1.7974, + "step": 15261 + }, + { + "epoch": 4.684468999386127, + "grad_norm": 0.2703748941421509, + "learning_rate": 5.7516704635345945e-05, + "loss": 1.7245, + "step": 15262 + }, + { + "epoch": 4.684775936157152, + "grad_norm": 0.19247616827487946, + "learning_rate": 5.751179052751487e-05, + "loss": 1.7105, + "step": 15263 + }, + { + 
"epoch": 4.685082872928177, + "grad_norm": 0.23166817426681519, + "learning_rate": 5.750687634544787e-05, + "loss": 1.8026, + "step": 15264 + }, + { + "epoch": 4.685389809699202, + "grad_norm": 0.22434166073799133, + "learning_rate": 5.7501962089193507e-05, + "loss": 1.7779, + "step": 15265 + }, + { + "epoch": 4.685696746470227, + "grad_norm": 0.190699502825737, + "learning_rate": 5.749704775880037e-05, + "loss": 1.726, + "step": 15266 + }, + { + "epoch": 4.686003683241252, + "grad_norm": 0.22995290160179138, + "learning_rate": 5.749213335431702e-05, + "loss": 1.7495, + "step": 15267 + }, + { + "epoch": 4.686310620012278, + "grad_norm": 0.2712057828903198, + "learning_rate": 5.7487218875792016e-05, + "loss": 1.7862, + "step": 15268 + }, + { + "epoch": 4.686617556783302, + "grad_norm": 0.2524562180042267, + "learning_rate": 5.7482304323273913e-05, + "loss": 1.7092, + "step": 15269 + }, + { + "epoch": 4.6869244935543275, + "grad_norm": 0.23810559511184692, + "learning_rate": 5.747738969681131e-05, + "loss": 1.8049, + "step": 15270 + }, + { + "epoch": 4.687231430325353, + "grad_norm": 0.25521910190582275, + "learning_rate": 5.747247499645275e-05, + "loss": 1.8124, + "step": 15271 + }, + { + "epoch": 4.687538367096378, + "grad_norm": 0.27797845005989075, + "learning_rate": 5.746756022224682e-05, + "loss": 1.7694, + "step": 15272 + }, + { + "epoch": 4.6878453038674035, + "grad_norm": 0.23849260807037354, + "learning_rate": 5.746264537424208e-05, + "loss": 1.7771, + "step": 15273 + }, + { + "epoch": 4.688152240638429, + "grad_norm": 0.24368882179260254, + "learning_rate": 5.74577304524871e-05, + "loss": 1.8143, + "step": 15274 + }, + { + "epoch": 4.688459177409453, + "grad_norm": 0.2712198793888092, + "learning_rate": 5.745281545703045e-05, + "loss": 1.7683, + "step": 15275 + }, + { + "epoch": 4.688766114180479, + "grad_norm": 0.30913081765174866, + "learning_rate": 5.7447900387920716e-05, + "loss": 1.7111, + "step": 15276 + }, + { + "epoch": 4.689073050951504, + 
"grad_norm": 0.22123363614082336, + "learning_rate": 5.744298524520646e-05, + "loss": 1.7466, + "step": 15277 + }, + { + "epoch": 4.689379987722529, + "grad_norm": 0.32836318016052246, + "learning_rate": 5.743807002893628e-05, + "loss": 1.8083, + "step": 15278 + }, + { + "epoch": 4.689686924493555, + "grad_norm": 0.33319979906082153, + "learning_rate": 5.743315473915871e-05, + "loss": 1.7122, + "step": 15279 + }, + { + "epoch": 4.689993861264579, + "grad_norm": 0.252163290977478, + "learning_rate": 5.742823937592236e-05, + "loss": 1.7599, + "step": 15280 + }, + { + "epoch": 4.690300798035604, + "grad_norm": 0.23248571157455444, + "learning_rate": 5.7423323939275797e-05, + "loss": 1.7791, + "step": 15281 + }, + { + "epoch": 4.69060773480663, + "grad_norm": 0.27024057507514954, + "learning_rate": 5.741840842926759e-05, + "loss": 1.7608, + "step": 15282 + }, + { + "epoch": 4.690914671577655, + "grad_norm": 0.21888256072998047, + "learning_rate": 5.7413492845946326e-05, + "loss": 1.7407, + "step": 15283 + }, + { + "epoch": 4.69122160834868, + "grad_norm": 0.2574782073497772, + "learning_rate": 5.740857718936058e-05, + "loss": 1.707, + "step": 15284 + }, + { + "epoch": 4.691528545119706, + "grad_norm": 0.2541569769382477, + "learning_rate": 5.740366145955893e-05, + "loss": 1.7301, + "step": 15285 + }, + { + "epoch": 4.69183548189073, + "grad_norm": 0.23484647274017334, + "learning_rate": 5.7398745656589955e-05, + "loss": 1.772, + "step": 15286 + }, + { + "epoch": 4.6921424186617555, + "grad_norm": 0.2827093005180359, + "learning_rate": 5.739382978050225e-05, + "loss": 1.7745, + "step": 15287 + }, + { + "epoch": 4.692449355432781, + "grad_norm": 0.300387978553772, + "learning_rate": 5.738891383134437e-05, + "loss": 1.7966, + "step": 15288 + }, + { + "epoch": 4.692756292203806, + "grad_norm": 0.2414523959159851, + "learning_rate": 5.7383997809164926e-05, + "loss": 1.7355, + "step": 15289 + }, + { + "epoch": 4.6930632289748315, + "grad_norm": 0.21221841871738434, + 
"learning_rate": 5.737908171401248e-05, + "loss": 1.7935, + "step": 15290 + }, + { + "epoch": 4.693370165745856, + "grad_norm": 0.23488084971904755, + "learning_rate": 5.737416554593563e-05, + "loss": 1.7447, + "step": 15291 + }, + { + "epoch": 4.693677102516881, + "grad_norm": 0.26176631450653076, + "learning_rate": 5.7369249304982954e-05, + "loss": 1.769, + "step": 15292 + }, + { + "epoch": 4.693984039287907, + "grad_norm": 0.23060615360736847, + "learning_rate": 5.736433299120303e-05, + "loss": 1.7344, + "step": 15293 + }, + { + "epoch": 4.694290976058932, + "grad_norm": 0.2536846399307251, + "learning_rate": 5.7359416604644456e-05, + "loss": 1.7862, + "step": 15294 + }, + { + "epoch": 4.694597912829957, + "grad_norm": 0.23221342265605927, + "learning_rate": 5.735450014535581e-05, + "loss": 1.743, + "step": 15295 + }, + { + "epoch": 4.694904849600983, + "grad_norm": 0.25320062041282654, + "learning_rate": 5.734958361338568e-05, + "loss": 1.8001, + "step": 15296 + }, + { + "epoch": 4.695211786372007, + "grad_norm": 0.23132461309432983, + "learning_rate": 5.734466700878267e-05, + "loss": 1.7676, + "step": 15297 + }, + { + "epoch": 4.695518723143032, + "grad_norm": 0.2222728580236435, + "learning_rate": 5.7339750331595346e-05, + "loss": 1.7267, + "step": 15298 + }, + { + "epoch": 4.695825659914058, + "grad_norm": 0.2505118250846863, + "learning_rate": 5.733483358187231e-05, + "loss": 1.7467, + "step": 15299 + }, + { + "epoch": 4.696132596685083, + "grad_norm": 0.23609887063503265, + "learning_rate": 5.732991675966214e-05, + "loss": 1.7319, + "step": 15300 + }, + { + "epoch": 4.696439533456108, + "grad_norm": 0.2939738631248474, + "learning_rate": 5.732499986501345e-05, + "loss": 1.8676, + "step": 15301 + }, + { + "epoch": 4.696746470227133, + "grad_norm": 0.29868564009666443, + "learning_rate": 5.7320082897974814e-05, + "loss": 1.7541, + "step": 15302 + }, + { + "epoch": 4.697053406998158, + "grad_norm": 0.2366383820772171, + "learning_rate": 5.731516585859482e-05, 
+ "loss": 1.7531, + "step": 15303 + }, + { + "epoch": 4.6973603437691835, + "grad_norm": 0.2721317410469055, + "learning_rate": 5.731024874692208e-05, + "loss": 1.7444, + "step": 15304 + }, + { + "epoch": 4.697667280540209, + "grad_norm": 0.24925900995731354, + "learning_rate": 5.730533156300517e-05, + "loss": 1.7716, + "step": 15305 + }, + { + "epoch": 4.697974217311234, + "grad_norm": 0.23012754321098328, + "learning_rate": 5.7300414306892704e-05, + "loss": 1.7211, + "step": 15306 + }, + { + "epoch": 4.6982811540822595, + "grad_norm": 0.21274085342884064, + "learning_rate": 5.7295496978633254e-05, + "loss": 1.7853, + "step": 15307 + }, + { + "epoch": 4.698588090853284, + "grad_norm": 0.21799001097679138, + "learning_rate": 5.729057957827544e-05, + "loss": 1.7505, + "step": 15308 + }, + { + "epoch": 4.698895027624309, + "grad_norm": 0.22365793585777283, + "learning_rate": 5.728566210586783e-05, + "loss": 1.7934, + "step": 15309 + }, + { + "epoch": 4.699201964395335, + "grad_norm": 0.23325085639953613, + "learning_rate": 5.728074456145903e-05, + "loss": 1.7354, + "step": 15310 + }, + { + "epoch": 4.69950890116636, + "grad_norm": 0.2175164669752121, + "learning_rate": 5.7275826945097654e-05, + "loss": 1.7541, + "step": 15311 + }, + { + "epoch": 4.699815837937384, + "grad_norm": 0.24657388031482697, + "learning_rate": 5.727090925683231e-05, + "loss": 1.814, + "step": 15312 + }, + { + "epoch": 4.70012277470841, + "grad_norm": 0.2437550574541092, + "learning_rate": 5.726599149671156e-05, + "loss": 1.7234, + "step": 15313 + }, + { + "epoch": 4.700429711479435, + "grad_norm": 0.21053487062454224, + "learning_rate": 5.726107366478402e-05, + "loss": 1.7788, + "step": 15314 + }, + { + "epoch": 4.7007366482504604, + "grad_norm": 0.2007097452878952, + "learning_rate": 5.725615576109831e-05, + "loss": 1.7453, + "step": 15315 + }, + { + "epoch": 4.701043585021486, + "grad_norm": 0.19331564009189606, + "learning_rate": 5.725123778570299e-05, + "loss": 1.7142, + "step": 15316 + 
}, + { + "epoch": 4.701350521792511, + "grad_norm": 0.24291567504405975, + "learning_rate": 5.7246319738646706e-05, + "loss": 1.8081, + "step": 15317 + }, + { + "epoch": 4.701657458563536, + "grad_norm": 0.21423695981502533, + "learning_rate": 5.724140161997804e-05, + "loss": 1.7021, + "step": 15318 + }, + { + "epoch": 4.701964395334561, + "grad_norm": 0.20857618749141693, + "learning_rate": 5.72364834297456e-05, + "loss": 1.7447, + "step": 15319 + }, + { + "epoch": 4.702271332105586, + "grad_norm": 0.2547401487827301, + "learning_rate": 5.7231565167998e-05, + "loss": 1.7505, + "step": 15320 + }, + { + "epoch": 4.702578268876612, + "grad_norm": 0.2729472219944, + "learning_rate": 5.7226646834783825e-05, + "loss": 1.7974, + "step": 15321 + }, + { + "epoch": 4.702885205647637, + "grad_norm": 0.23258371651172638, + "learning_rate": 5.722172843015169e-05, + "loss": 1.7562, + "step": 15322 + }, + { + "epoch": 4.703192142418661, + "grad_norm": 0.23399893939495087, + "learning_rate": 5.72168099541502e-05, + "loss": 1.7674, + "step": 15323 + }, + { + "epoch": 4.703499079189687, + "grad_norm": 0.2678206264972687, + "learning_rate": 5.721189140682797e-05, + "loss": 1.7331, + "step": 15324 + }, + { + "epoch": 4.703806015960712, + "grad_norm": 0.19472146034240723, + "learning_rate": 5.7206972788233593e-05, + "loss": 1.7003, + "step": 15325 + }, + { + "epoch": 4.704112952731737, + "grad_norm": 0.2199394404888153, + "learning_rate": 5.72020540984157e-05, + "loss": 1.7072, + "step": 15326 + }, + { + "epoch": 4.704419889502763, + "grad_norm": 0.219175323843956, + "learning_rate": 5.719713533742287e-05, + "loss": 1.7591, + "step": 15327 + }, + { + "epoch": 4.704726826273788, + "grad_norm": 0.21127547323703766, + "learning_rate": 5.719221650530374e-05, + "loss": 1.8059, + "step": 15328 + }, + { + "epoch": 4.7050337630448125, + "grad_norm": 0.22189834713935852, + "learning_rate": 5.7187297602106905e-05, + "loss": 1.7529, + "step": 15329 + }, + { + "epoch": 4.705340699815838, + 
"grad_norm": 0.19945195317268372, + "learning_rate": 5.7182378627881e-05, + "loss": 1.7133, + "step": 15330 + }, + { + "epoch": 4.705647636586863, + "grad_norm": 0.2177499681711197, + "learning_rate": 5.7177459582674595e-05, + "loss": 1.7451, + "step": 15331 + }, + { + "epoch": 4.7059545733578885, + "grad_norm": 0.19489440321922302, + "learning_rate": 5.717254046653635e-05, + "loss": 1.7499, + "step": 15332 + }, + { + "epoch": 4.706261510128914, + "grad_norm": 0.21366968750953674, + "learning_rate": 5.716762127951485e-05, + "loss": 1.7683, + "step": 15333 + }, + { + "epoch": 4.706568446899938, + "grad_norm": 0.2894177734851837, + "learning_rate": 5.71627020216587e-05, + "loss": 1.8235, + "step": 15334 + }, + { + "epoch": 4.706875383670964, + "grad_norm": 0.22175677120685577, + "learning_rate": 5.7157782693016534e-05, + "loss": 1.7421, + "step": 15335 + }, + { + "epoch": 4.707182320441989, + "grad_norm": 0.23653541505336761, + "learning_rate": 5.715286329363698e-05, + "loss": 1.6937, + "step": 15336 + }, + { + "epoch": 4.707489257213014, + "grad_norm": 0.3015746772289276, + "learning_rate": 5.714794382356863e-05, + "loss": 1.7159, + "step": 15337 + }, + { + "epoch": 4.70779619398404, + "grad_norm": 0.24045881628990173, + "learning_rate": 5.714302428286011e-05, + "loss": 1.7263, + "step": 15338 + }, + { + "epoch": 4.708103130755065, + "grad_norm": 0.19836920499801636, + "learning_rate": 5.7138104671560035e-05, + "loss": 1.7604, + "step": 15339 + }, + { + "epoch": 4.708410067526089, + "grad_norm": 0.2430238276720047, + "learning_rate": 5.7133184989717036e-05, + "loss": 1.7147, + "step": 15340 + }, + { + "epoch": 4.708717004297115, + "grad_norm": 0.19388417899608612, + "learning_rate": 5.712826523737971e-05, + "loss": 1.7153, + "step": 15341 + }, + { + "epoch": 4.70902394106814, + "grad_norm": 0.19648151099681854, + "learning_rate": 5.7123345414596694e-05, + "loss": 1.7373, + "step": 15342 + }, + { + "epoch": 4.709330877839165, + "grad_norm": 0.20326325297355652, + 
"learning_rate": 5.711842552141661e-05, + "loss": 1.7012, + "step": 15343 + }, + { + "epoch": 4.70963781461019, + "grad_norm": 0.20798304677009583, + "learning_rate": 5.711350555788806e-05, + "loss": 1.7134, + "step": 15344 + }, + { + "epoch": 4.709944751381215, + "grad_norm": 0.29318806529045105, + "learning_rate": 5.7108585524059674e-05, + "loss": 1.7661, + "step": 15345 + }, + { + "epoch": 4.7102516881522405, + "grad_norm": 0.273318350315094, + "learning_rate": 5.710366541998009e-05, + "loss": 1.7329, + "step": 15346 + }, + { + "epoch": 4.710558624923266, + "grad_norm": 0.2306031584739685, + "learning_rate": 5.7098745245697925e-05, + "loss": 1.8152, + "step": 15347 + }, + { + "epoch": 4.710865561694291, + "grad_norm": 0.27630630135536194, + "learning_rate": 5.709382500126179e-05, + "loss": 1.7955, + "step": 15348 + }, + { + "epoch": 4.7111724984653165, + "grad_norm": 0.2366025298833847, + "learning_rate": 5.7088904686720326e-05, + "loss": 1.7943, + "step": 15349 + }, + { + "epoch": 4.711479435236341, + "grad_norm": 0.24196656048297882, + "learning_rate": 5.708398430212215e-05, + "loss": 1.698, + "step": 15350 + }, + { + "epoch": 4.711786372007366, + "grad_norm": 0.2770058512687683, + "learning_rate": 5.707906384751588e-05, + "loss": 1.7618, + "step": 15351 + }, + { + "epoch": 4.712093308778392, + "grad_norm": 0.20432323217391968, + "learning_rate": 5.7074143322950157e-05, + "loss": 1.7422, + "step": 15352 + }, + { + "epoch": 4.712400245549417, + "grad_norm": 0.25543150305747986, + "learning_rate": 5.70692227284736e-05, + "loss": 1.7744, + "step": 15353 + }, + { + "epoch": 4.712707182320442, + "grad_norm": 0.24315913021564484, + "learning_rate": 5.7064302064134855e-05, + "loss": 1.7127, + "step": 15354 + }, + { + "epoch": 4.713014119091467, + "grad_norm": 0.23636099696159363, + "learning_rate": 5.705938132998252e-05, + "loss": 1.7725, + "step": 15355 + }, + { + "epoch": 4.713321055862492, + "grad_norm": 0.26809820532798767, + "learning_rate": 
5.705446052606526e-05, + "loss": 1.8338, + "step": 15356 + }, + { + "epoch": 4.713627992633517, + "grad_norm": 0.24969002604484558, + "learning_rate": 5.704953965243167e-05, + "loss": 1.8225, + "step": 15357 + }, + { + "epoch": 4.713934929404543, + "grad_norm": 0.23189692199230194, + "learning_rate": 5.70446187091304e-05, + "loss": 1.7901, + "step": 15358 + }, + { + "epoch": 4.714241866175568, + "grad_norm": 0.22373750805854797, + "learning_rate": 5.703969769621008e-05, + "loss": 1.6919, + "step": 15359 + }, + { + "epoch": 4.714548802946593, + "grad_norm": 0.23963531851768494, + "learning_rate": 5.703477661371934e-05, + "loss": 1.7806, + "step": 15360 + }, + { + "epoch": 4.714855739717618, + "grad_norm": 0.20365150272846222, + "learning_rate": 5.702985546170683e-05, + "loss": 1.7207, + "step": 15361 + }, + { + "epoch": 4.715162676488643, + "grad_norm": 0.245658278465271, + "learning_rate": 5.702493424022114e-05, + "loss": 1.7589, + "step": 15362 + }, + { + "epoch": 4.7154696132596685, + "grad_norm": 0.22633756697177887, + "learning_rate": 5.702001294931094e-05, + "loss": 1.7893, + "step": 15363 + }, + { + "epoch": 4.715776550030694, + "grad_norm": 0.21587726473808289, + "learning_rate": 5.701509158902487e-05, + "loss": 1.8095, + "step": 15364 + }, + { + "epoch": 4.716083486801719, + "grad_norm": 0.22553963959217072, + "learning_rate": 5.701017015941155e-05, + "loss": 1.7419, + "step": 15365 + }, + { + "epoch": 4.716390423572744, + "grad_norm": 0.2276087999343872, + "learning_rate": 5.700524866051962e-05, + "loss": 1.7052, + "step": 15366 + }, + { + "epoch": 4.716697360343769, + "grad_norm": 0.22236761450767517, + "learning_rate": 5.700032709239771e-05, + "loss": 1.8612, + "step": 15367 + }, + { + "epoch": 4.717004297114794, + "grad_norm": 0.22816185653209686, + "learning_rate": 5.6995405455094465e-05, + "loss": 1.78, + "step": 15368 + }, + { + "epoch": 4.71731123388582, + "grad_norm": 0.21597479283809662, + "learning_rate": 5.6990483748658516e-05, + "loss": 1.8276, 
+ "step": 15369 + }, + { + "epoch": 4.717618170656845, + "grad_norm": 0.22209586203098297, + "learning_rate": 5.6985561973138533e-05, + "loss": 1.74, + "step": 15370 + }, + { + "epoch": 4.71792510742787, + "grad_norm": 0.24249997735023499, + "learning_rate": 5.6980640128583116e-05, + "loss": 1.8035, + "step": 15371 + }, + { + "epoch": 4.718232044198895, + "grad_norm": 0.23326106369495392, + "learning_rate": 5.6975718215040943e-05, + "loss": 1.7969, + "step": 15372 + }, + { + "epoch": 4.71853898096992, + "grad_norm": 0.215044766664505, + "learning_rate": 5.6970796232560596e-05, + "loss": 1.7345, + "step": 15373 + }, + { + "epoch": 4.718845917740945, + "grad_norm": 0.20231883227825165, + "learning_rate": 5.696587418119078e-05, + "loss": 1.7231, + "step": 15374 + }, + { + "epoch": 4.719152854511971, + "grad_norm": 0.2136038839817047, + "learning_rate": 5.696095206098011e-05, + "loss": 1.7421, + "step": 15375 + }, + { + "epoch": 4.719459791282996, + "grad_norm": 0.2662335932254791, + "learning_rate": 5.6956029871977235e-05, + "loss": 1.7518, + "step": 15376 + }, + { + "epoch": 4.7197667280540205, + "grad_norm": 0.25649648904800415, + "learning_rate": 5.6951107614230783e-05, + "loss": 1.8314, + "step": 15377 + }, + { + "epoch": 4.720073664825046, + "grad_norm": 0.21995560824871063, + "learning_rate": 5.6946185287789425e-05, + "loss": 1.7511, + "step": 15378 + }, + { + "epoch": 4.720380601596071, + "grad_norm": 0.3388935923576355, + "learning_rate": 5.694126289270177e-05, + "loss": 1.7975, + "step": 15379 + }, + { + "epoch": 4.7206875383670965, + "grad_norm": 0.32886409759521484, + "learning_rate": 5.693634042901651e-05, + "loss": 1.7153, + "step": 15380 + }, + { + "epoch": 4.720994475138122, + "grad_norm": 0.21727977693080902, + "learning_rate": 5.693141789678226e-05, + "loss": 1.7095, + "step": 15381 + }, + { + "epoch": 4.721301411909147, + "grad_norm": 0.2680833041667938, + "learning_rate": 5.6926495296047675e-05, + "loss": 1.696, + "step": 15382 + }, + { + "epoch": 
4.721608348680172, + "grad_norm": 0.2645499110221863, + "learning_rate": 5.692157262686141e-05, + "loss": 1.6889, + "step": 15383 + }, + { + "epoch": 4.721915285451197, + "grad_norm": 0.20362348854541779, + "learning_rate": 5.69166498892721e-05, + "loss": 1.7303, + "step": 15384 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.24259062111377716, + "learning_rate": 5.691172708332839e-05, + "loss": 1.7684, + "step": 15385 + }, + { + "epoch": 4.722529158993248, + "grad_norm": 0.24204276502132416, + "learning_rate": 5.690680420907897e-05, + "loss": 1.7728, + "step": 15386 + }, + { + "epoch": 4.722836095764272, + "grad_norm": 0.3038320243358612, + "learning_rate": 5.690188126657244e-05, + "loss": 1.7573, + "step": 15387 + }, + { + "epoch": 4.723143032535297, + "grad_norm": 0.24619868397712708, + "learning_rate": 5.689695825585749e-05, + "loss": 1.754, + "step": 15388 + }, + { + "epoch": 4.723449969306323, + "grad_norm": 0.19441325962543488, + "learning_rate": 5.689203517698276e-05, + "loss": 1.726, + "step": 15389 + }, + { + "epoch": 4.723756906077348, + "grad_norm": 0.2874276340007782, + "learning_rate": 5.688711202999688e-05, + "loss": 1.7704, + "step": 15390 + }, + { + "epoch": 4.724063842848373, + "grad_norm": 0.24488390982151031, + "learning_rate": 5.6882188814948535e-05, + "loss": 1.7477, + "step": 15391 + }, + { + "epoch": 4.724370779619399, + "grad_norm": 0.22674018144607544, + "learning_rate": 5.687726553188636e-05, + "loss": 1.7287, + "step": 15392 + }, + { + "epoch": 4.724677716390423, + "grad_norm": 0.2653258442878723, + "learning_rate": 5.687234218085902e-05, + "loss": 1.7415, + "step": 15393 + }, + { + "epoch": 4.7249846531614486, + "grad_norm": 0.20345374941825867, + "learning_rate": 5.686741876191516e-05, + "loss": 1.764, + "step": 15394 + }, + { + "epoch": 4.725291589932474, + "grad_norm": 0.23193977773189545, + "learning_rate": 5.686249527510345e-05, + "loss": 1.7557, + "step": 15395 + }, + { + "epoch": 4.725598526703499, + "grad_norm": 
0.26426708698272705, + "learning_rate": 5.685757172047253e-05, + "loss": 1.7708, + "step": 15396 + }, + { + "epoch": 4.725905463474525, + "grad_norm": 0.21377156674861908, + "learning_rate": 5.685264809807107e-05, + "loss": 1.6921, + "step": 15397 + }, + { + "epoch": 4.726212400245549, + "grad_norm": 0.21628457307815552, + "learning_rate": 5.684772440794773e-05, + "loss": 1.72, + "step": 15398 + }, + { + "epoch": 4.726519337016574, + "grad_norm": 0.19200581312179565, + "learning_rate": 5.684280065015116e-05, + "loss": 1.7311, + "step": 15399 + }, + { + "epoch": 4.7268262737876, + "grad_norm": 0.22227540612220764, + "learning_rate": 5.683787682473003e-05, + "loss": 1.7451, + "step": 15400 + }, + { + "epoch": 4.727133210558625, + "grad_norm": 0.18053604662418365, + "learning_rate": 5.683295293173299e-05, + "loss": 1.6816, + "step": 15401 + }, + { + "epoch": 4.72744014732965, + "grad_norm": 0.19827169179916382, + "learning_rate": 5.682802897120869e-05, + "loss": 1.7315, + "step": 15402 + }, + { + "epoch": 4.727747084100676, + "grad_norm": 0.2768021821975708, + "learning_rate": 5.682310494320582e-05, + "loss": 1.7714, + "step": 15403 + }, + { + "epoch": 4.7280540208717, + "grad_norm": 0.2613474428653717, + "learning_rate": 5.6818180847773027e-05, + "loss": 1.7332, + "step": 15404 + }, + { + "epoch": 4.7283609576427255, + "grad_norm": 0.21546787023544312, + "learning_rate": 5.681325668495898e-05, + "loss": 1.771, + "step": 15405 + }, + { + "epoch": 4.728667894413751, + "grad_norm": 0.24442137777805328, + "learning_rate": 5.680833245481234e-05, + "loss": 1.7296, + "step": 15406 + }, + { + "epoch": 4.728974831184776, + "grad_norm": 0.2622109055519104, + "learning_rate": 5.680340815738175e-05, + "loss": 1.7778, + "step": 15407 + }, + { + "epoch": 4.7292817679558015, + "grad_norm": 0.22379513084888458, + "learning_rate": 5.6798483792715904e-05, + "loss": 1.7953, + "step": 15408 + }, + { + "epoch": 4.729588704726826, + "grad_norm": 0.21901065111160278, + "learning_rate": 
5.679355936086346e-05, + "loss": 1.7287, + "step": 15409 + }, + { + "epoch": 4.729895641497851, + "grad_norm": 0.3023792505264282, + "learning_rate": 5.6788634861873066e-05, + "loss": 1.7851, + "step": 15410 + }, + { + "epoch": 4.730202578268877, + "grad_norm": 0.23882482945919037, + "learning_rate": 5.678371029579342e-05, + "loss": 1.7621, + "step": 15411 + }, + { + "epoch": 4.730509515039902, + "grad_norm": 0.2661043703556061, + "learning_rate": 5.6778785662673175e-05, + "loss": 1.7453, + "step": 15412 + }, + { + "epoch": 4.730816451810927, + "grad_norm": 0.330208957195282, + "learning_rate": 5.677386096256099e-05, + "loss": 1.761, + "step": 15413 + }, + { + "epoch": 4.731123388581953, + "grad_norm": 0.2686570882797241, + "learning_rate": 5.676893619550552e-05, + "loss": 1.7539, + "step": 15414 + }, + { + "epoch": 4.731430325352977, + "grad_norm": 0.24308046698570251, + "learning_rate": 5.676401136155548e-05, + "loss": 1.7345, + "step": 15415 + }, + { + "epoch": 4.731737262124002, + "grad_norm": 0.4137137830257416, + "learning_rate": 5.67590864607595e-05, + "loss": 1.7688, + "step": 15416 + }, + { + "epoch": 4.732044198895028, + "grad_norm": 0.32161539793014526, + "learning_rate": 5.675416149316628e-05, + "loss": 1.7881, + "step": 15417 + }, + { + "epoch": 4.732351135666053, + "grad_norm": 0.2336999475955963, + "learning_rate": 5.674923645882447e-05, + "loss": 1.755, + "step": 15418 + }, + { + "epoch": 4.7326580724370775, + "grad_norm": 0.32781684398651123, + "learning_rate": 5.6744311357782754e-05, + "loss": 1.8062, + "step": 15419 + }, + { + "epoch": 4.732965009208103, + "grad_norm": 0.2475704401731491, + "learning_rate": 5.6739386190089795e-05, + "loss": 1.725, + "step": 15420 + }, + { + "epoch": 4.733271945979128, + "grad_norm": 0.26295650005340576, + "learning_rate": 5.673446095579427e-05, + "loss": 1.7673, + "step": 15421 + }, + { + "epoch": 4.7335788827501535, + "grad_norm": 0.3454873859882355, + "learning_rate": 5.6729535654944864e-05, + "loss": 1.7523, + 
"step": 15422 + }, + { + "epoch": 4.733885819521179, + "grad_norm": 0.2306666374206543, + "learning_rate": 5.672461028759024e-05, + "loss": 1.7085, + "step": 15423 + }, + { + "epoch": 4.734192756292204, + "grad_norm": 0.30825871229171753, + "learning_rate": 5.671968485377908e-05, + "loss": 1.7642, + "step": 15424 + }, + { + "epoch": 4.734499693063229, + "grad_norm": 0.42611342668533325, + "learning_rate": 5.6714759353560045e-05, + "loss": 1.7832, + "step": 15425 + }, + { + "epoch": 4.734806629834254, + "grad_norm": 0.29502514004707336, + "learning_rate": 5.670983378698182e-05, + "loss": 1.8153, + "step": 15426 + }, + { + "epoch": 4.735113566605279, + "grad_norm": 0.28416305780410767, + "learning_rate": 5.6704908154093096e-05, + "loss": 1.756, + "step": 15427 + }, + { + "epoch": 4.735420503376305, + "grad_norm": 0.43111103773117065, + "learning_rate": 5.6699982454942534e-05, + "loss": 1.7797, + "step": 15428 + }, + { + "epoch": 4.73572744014733, + "grad_norm": 0.27667397260665894, + "learning_rate": 5.669505668957882e-05, + "loss": 1.7316, + "step": 15429 + }, + { + "epoch": 4.736034376918354, + "grad_norm": 0.3045295774936676, + "learning_rate": 5.669013085805063e-05, + "loss": 1.7591, + "step": 15430 + }, + { + "epoch": 4.73634131368938, + "grad_norm": 0.4494635760784149, + "learning_rate": 5.6685204960406635e-05, + "loss": 1.8295, + "step": 15431 + }, + { + "epoch": 4.736648250460405, + "grad_norm": 0.2951449453830719, + "learning_rate": 5.6680278996695544e-05, + "loss": 1.7857, + "step": 15432 + }, + { + "epoch": 4.73695518723143, + "grad_norm": 0.2714167535305023, + "learning_rate": 5.6675352966966014e-05, + "loss": 1.816, + "step": 15433 + }, + { + "epoch": 4.737262124002456, + "grad_norm": 0.32701000571250916, + "learning_rate": 5.667042687126673e-05, + "loss": 1.7637, + "step": 15434 + }, + { + "epoch": 4.737569060773481, + "grad_norm": 0.2466556429862976, + "learning_rate": 5.666550070964638e-05, + "loss": 1.7805, + "step": 15435 + }, + { + "epoch": 
4.7378759975445055, + "grad_norm": 0.3283855617046356, + "learning_rate": 5.666057448215365e-05, + "loss": 1.786, + "step": 15436 + }, + { + "epoch": 4.738182934315531, + "grad_norm": 0.35860660672187805, + "learning_rate": 5.6655648188837205e-05, + "loss": 1.8309, + "step": 15437 + }, + { + "epoch": 4.738489871086556, + "grad_norm": 0.22293898463249207, + "learning_rate": 5.665072182974576e-05, + "loss": 1.7317, + "step": 15438 + }, + { + "epoch": 4.7387968078575815, + "grad_norm": 0.3155089020729065, + "learning_rate": 5.664579540492798e-05, + "loss": 1.7202, + "step": 15439 + }, + { + "epoch": 4.739103744628607, + "grad_norm": 0.28723904490470886, + "learning_rate": 5.6640868914432566e-05, + "loss": 1.7788, + "step": 15440 + }, + { + "epoch": 4.739410681399631, + "grad_norm": 0.2461984008550644, + "learning_rate": 5.6635942358308183e-05, + "loss": 1.8504, + "step": 15441 + }, + { + "epoch": 4.739717618170657, + "grad_norm": 0.2503122091293335, + "learning_rate": 5.663101573660351e-05, + "loss": 1.7375, + "step": 15442 + }, + { + "epoch": 4.740024554941682, + "grad_norm": 0.24925372004508972, + "learning_rate": 5.662608904936727e-05, + "loss": 1.7152, + "step": 15443 + }, + { + "epoch": 4.740331491712707, + "grad_norm": 0.2734573483467102, + "learning_rate": 5.662116229664813e-05, + "loss": 1.7476, + "step": 15444 + }, + { + "epoch": 4.740638428483733, + "grad_norm": 0.38122060894966125, + "learning_rate": 5.661623547849479e-05, + "loss": 1.7682, + "step": 15445 + }, + { + "epoch": 4.740945365254758, + "grad_norm": 0.3786417245864868, + "learning_rate": 5.661130859495593e-05, + "loss": 1.7446, + "step": 15446 + }, + { + "epoch": 4.741252302025782, + "grad_norm": 0.22618255019187927, + "learning_rate": 5.6606381646080244e-05, + "loss": 1.7427, + "step": 15447 + }, + { + "epoch": 4.741559238796808, + "grad_norm": 0.3000899851322174, + "learning_rate": 5.6601454631916405e-05, + "loss": 1.7087, + "step": 15448 + }, + { + "epoch": 4.741866175567833, + "grad_norm": 
0.36542513966560364, + "learning_rate": 5.659652755251315e-05, + "loss": 1.7985, + "step": 15449 + }, + { + "epoch": 4.742173112338858, + "grad_norm": 0.23550496995449066, + "learning_rate": 5.659160040791912e-05, + "loss": 1.8163, + "step": 15450 + }, + { + "epoch": 4.742480049109884, + "grad_norm": 0.25615251064300537, + "learning_rate": 5.658667319818305e-05, + "loss": 1.7372, + "step": 15451 + }, + { + "epoch": 4.742786985880908, + "grad_norm": 0.28744083642959595, + "learning_rate": 5.6581745923353615e-05, + "loss": 1.7193, + "step": 15452 + }, + { + "epoch": 4.7430939226519335, + "grad_norm": 0.2500229775905609, + "learning_rate": 5.65768185834795e-05, + "loss": 1.7263, + "step": 15453 + }, + { + "epoch": 4.743400859422959, + "grad_norm": 0.21520425379276276, + "learning_rate": 5.6571891178609394e-05, + "loss": 1.7337, + "step": 15454 + }, + { + "epoch": 4.743707796193984, + "grad_norm": 0.212506502866745, + "learning_rate": 5.656696370879202e-05, + "loss": 1.7672, + "step": 15455 + }, + { + "epoch": 4.7440147329650095, + "grad_norm": 0.21143417060375214, + "learning_rate": 5.656203617407607e-05, + "loss": 1.7189, + "step": 15456 + }, + { + "epoch": 4.744321669736035, + "grad_norm": 0.18320922553539276, + "learning_rate": 5.6557108574510243e-05, + "loss": 1.7521, + "step": 15457 + }, + { + "epoch": 4.744628606507059, + "grad_norm": 0.19202999770641327, + "learning_rate": 5.655218091014321e-05, + "loss": 1.6756, + "step": 15458 + }, + { + "epoch": 4.744935543278085, + "grad_norm": 0.2152331918478012, + "learning_rate": 5.654725318102367e-05, + "loss": 1.7653, + "step": 15459 + }, + { + "epoch": 4.74524248004911, + "grad_norm": 0.24565903842449188, + "learning_rate": 5.6542325387200354e-05, + "loss": 1.7654, + "step": 15460 + }, + { + "epoch": 4.745549416820135, + "grad_norm": 0.2504819333553314, + "learning_rate": 5.653739752872195e-05, + "loss": 1.7073, + "step": 15461 + }, + { + "epoch": 4.74585635359116, + "grad_norm": 0.19258706271648407, + 
"learning_rate": 5.653246960563714e-05, + "loss": 1.7106, + "step": 15462 + }, + { + "epoch": 4.746163290362185, + "grad_norm": 0.22961968183517456, + "learning_rate": 5.652754161799465e-05, + "loss": 1.7868, + "step": 15463 + }, + { + "epoch": 4.74647022713321, + "grad_norm": 0.2763231098651886, + "learning_rate": 5.652261356584315e-05, + "loss": 1.7714, + "step": 15464 + }, + { + "epoch": 4.746777163904236, + "grad_norm": 0.23866096138954163, + "learning_rate": 5.651768544923136e-05, + "loss": 1.7537, + "step": 15465 + }, + { + "epoch": 4.747084100675261, + "grad_norm": 0.21851976215839386, + "learning_rate": 5.6512757268207997e-05, + "loss": 1.8109, + "step": 15466 + }, + { + "epoch": 4.747391037446286, + "grad_norm": 0.22249393165111542, + "learning_rate": 5.6507829022821745e-05, + "loss": 1.7357, + "step": 15467 + }, + { + "epoch": 4.747697974217311, + "grad_norm": 0.20202289521694183, + "learning_rate": 5.650290071312131e-05, + "loss": 1.7867, + "step": 15468 + }, + { + "epoch": 4.748004910988336, + "grad_norm": 0.20618727803230286, + "learning_rate": 5.649797233915539e-05, + "loss": 1.6904, + "step": 15469 + }, + { + "epoch": 4.7483118477593615, + "grad_norm": 0.25609052181243896, + "learning_rate": 5.649304390097272e-05, + "loss": 1.7287, + "step": 15470 + }, + { + "epoch": 4.748618784530387, + "grad_norm": 0.22966544330120087, + "learning_rate": 5.648811539862195e-05, + "loss": 1.7384, + "step": 15471 + }, + { + "epoch": 4.748925721301412, + "grad_norm": 0.24070143699645996, + "learning_rate": 5.6483186832151856e-05, + "loss": 1.7625, + "step": 15472 + }, + { + "epoch": 4.749232658072437, + "grad_norm": 0.22642426192760468, + "learning_rate": 5.647825820161109e-05, + "loss": 1.7291, + "step": 15473 + }, + { + "epoch": 4.749539594843462, + "grad_norm": 0.23255646228790283, + "learning_rate": 5.64733295070484e-05, + "loss": 1.8076, + "step": 15474 + }, + { + "epoch": 4.749846531614487, + "grad_norm": 0.20902042090892792, + "learning_rate": 
5.646840074851246e-05, + "loss": 1.6627, + "step": 15475 + }, + { + "epoch": 4.750153468385513, + "grad_norm": 0.21608836948871613, + "learning_rate": 5.646347192605198e-05, + "loss": 1.7458, + "step": 15476 + }, + { + "epoch": 4.750460405156538, + "grad_norm": 0.22368495166301727, + "learning_rate": 5.6458543039715694e-05, + "loss": 1.7601, + "step": 15477 + }, + { + "epoch": 4.750767341927563, + "grad_norm": 0.30586308240890503, + "learning_rate": 5.645361408955231e-05, + "loss": 1.8389, + "step": 15478 + }, + { + "epoch": 4.751074278698588, + "grad_norm": 0.25122150778770447, + "learning_rate": 5.644868507561052e-05, + "loss": 1.7509, + "step": 15479 + }, + { + "epoch": 4.751381215469613, + "grad_norm": 0.28435763716697693, + "learning_rate": 5.644375599793904e-05, + "loss": 1.7723, + "step": 15480 + }, + { + "epoch": 4.7516881522406385, + "grad_norm": 0.3111409842967987, + "learning_rate": 5.643882685658659e-05, + "loss": 1.7973, + "step": 15481 + }, + { + "epoch": 4.751995089011664, + "grad_norm": 0.3108380138874054, + "learning_rate": 5.6433897651601874e-05, + "loss": 1.8126, + "step": 15482 + }, + { + "epoch": 4.752302025782689, + "grad_norm": 0.25894731283187866, + "learning_rate": 5.642896838303362e-05, + "loss": 1.7849, + "step": 15483 + }, + { + "epoch": 4.752608962553714, + "grad_norm": 0.39321839809417725, + "learning_rate": 5.642403905093052e-05, + "loss": 1.7583, + "step": 15484 + }, + { + "epoch": 4.752915899324739, + "grad_norm": 0.3206121027469635, + "learning_rate": 5.6419109655341315e-05, + "loss": 1.8061, + "step": 15485 + }, + { + "epoch": 4.753222836095764, + "grad_norm": 0.2817624807357788, + "learning_rate": 5.64141801963147e-05, + "loss": 1.8252, + "step": 15486 + }, + { + "epoch": 4.75352977286679, + "grad_norm": 0.3344736397266388, + "learning_rate": 5.6409250673899405e-05, + "loss": 1.6975, + "step": 15487 + }, + { + "epoch": 4.753836709637815, + "grad_norm": 0.21873882412910461, + "learning_rate": 5.640432108814413e-05, + "loss": 
1.7126, + "step": 15488 + }, + { + "epoch": 4.75414364640884, + "grad_norm": 0.3317199945449829, + "learning_rate": 5.639939143909758e-05, + "loss": 1.7826, + "step": 15489 + }, + { + "epoch": 4.754450583179865, + "grad_norm": 0.34901630878448486, + "learning_rate": 5.639446172680854e-05, + "loss": 1.7411, + "step": 15490 + }, + { + "epoch": 4.75475751995089, + "grad_norm": 0.24015867710113525, + "learning_rate": 5.6389531951325645e-05, + "loss": 1.7514, + "step": 15491 + }, + { + "epoch": 4.755064456721915, + "grad_norm": 0.28364554047584534, + "learning_rate": 5.6384602112697674e-05, + "loss": 1.7569, + "step": 15492 + }, + { + "epoch": 4.755371393492941, + "grad_norm": 0.3561246693134308, + "learning_rate": 5.637967221097329e-05, + "loss": 1.7212, + "step": 15493 + }, + { + "epoch": 4.755678330263965, + "grad_norm": 0.3383684456348419, + "learning_rate": 5.637474224620126e-05, + "loss": 1.6866, + "step": 15494 + }, + { + "epoch": 4.7559852670349905, + "grad_norm": 0.2399235963821411, + "learning_rate": 5.63698122184303e-05, + "loss": 1.7609, + "step": 15495 + }, + { + "epoch": 4.756292203806016, + "grad_norm": 0.38559645414352417, + "learning_rate": 5.636488212770912e-05, + "loss": 1.7509, + "step": 15496 + }, + { + "epoch": 4.756599140577041, + "grad_norm": 0.365005224943161, + "learning_rate": 5.635995197408645e-05, + "loss": 1.7894, + "step": 15497 + }, + { + "epoch": 4.7569060773480665, + "grad_norm": 0.21254757046699524, + "learning_rate": 5.635502175761099e-05, + "loss": 1.6969, + "step": 15498 + }, + { + "epoch": 4.757213014119092, + "grad_norm": 0.42865821719169617, + "learning_rate": 5.635009147833149e-05, + "loss": 1.7989, + "step": 15499 + }, + { + "epoch": 4.757519950890116, + "grad_norm": 0.35717228055000305, + "learning_rate": 5.634516113629665e-05, + "loss": 1.7338, + "step": 15500 + }, + { + "epoch": 4.757826887661142, + "grad_norm": 0.21582463383674622, + "learning_rate": 5.634023073155523e-05, + "loss": 1.7429, + "step": 15501 + }, + { + 
"epoch": 4.758133824432167, + "grad_norm": 0.3376842141151428, + "learning_rate": 5.633530026415592e-05, + "loss": 1.7703, + "step": 15502 + }, + { + "epoch": 4.758440761203192, + "grad_norm": 0.2760981023311615, + "learning_rate": 5.633036973414747e-05, + "loss": 1.7389, + "step": 15503 + }, + { + "epoch": 4.758747697974218, + "grad_norm": 0.3808997571468353, + "learning_rate": 5.63254391415786e-05, + "loss": 1.7513, + "step": 15504 + }, + { + "epoch": 4.759054634745242, + "grad_norm": 0.5152496695518494, + "learning_rate": 5.6320508486498014e-05, + "loss": 1.7376, + "step": 15505 + }, + { + "epoch": 4.759361571516267, + "grad_norm": 0.33983346819877625, + "learning_rate": 5.6315577768954464e-05, + "loss": 1.7209, + "step": 15506 + }, + { + "epoch": 4.759668508287293, + "grad_norm": 0.27064043283462524, + "learning_rate": 5.631064698899669e-05, + "loss": 1.7808, + "step": 15507 + }, + { + "epoch": 4.759975445058318, + "grad_norm": 0.3659237027168274, + "learning_rate": 5.630571614667339e-05, + "loss": 1.7706, + "step": 15508 + }, + { + "epoch": 4.760282381829343, + "grad_norm": 0.246379554271698, + "learning_rate": 5.63007852420333e-05, + "loss": 1.7425, + "step": 15509 + }, + { + "epoch": 4.760589318600369, + "grad_norm": 0.2683795392513275, + "learning_rate": 5.629585427512518e-05, + "loss": 1.7332, + "step": 15510 + }, + { + "epoch": 4.760896255371393, + "grad_norm": 0.32626205682754517, + "learning_rate": 5.6290923245997704e-05, + "loss": 1.786, + "step": 15511 + }, + { + "epoch": 4.7612031921424185, + "grad_norm": 0.23723098635673523, + "learning_rate": 5.6285992154699666e-05, + "loss": 1.7305, + "step": 15512 + }, + { + "epoch": 4.761510128913444, + "grad_norm": 0.26316091418266296, + "learning_rate": 5.628106100127976e-05, + "loss": 1.7804, + "step": 15513 + }, + { + "epoch": 4.761817065684469, + "grad_norm": 0.24376356601715088, + "learning_rate": 5.6276129785786726e-05, + "loss": 1.738, + "step": 15514 + }, + { + "epoch": 4.7621240024554945, + 
"grad_norm": 0.27778422832489014, + "learning_rate": 5.627119850826931e-05, + "loss": 1.7444, + "step": 15515 + }, + { + "epoch": 4.762430939226519, + "grad_norm": 0.3134306073188782, + "learning_rate": 5.6266267168776224e-05, + "loss": 1.7696, + "step": 15516 + }, + { + "epoch": 4.762737875997544, + "grad_norm": 0.2354283481836319, + "learning_rate": 5.6261335767356195e-05, + "loss": 1.799, + "step": 15517 + }, + { + "epoch": 4.76304481276857, + "grad_norm": 0.26902756094932556, + "learning_rate": 5.6256404304058e-05, + "loss": 1.7091, + "step": 15518 + }, + { + "epoch": 4.763351749539595, + "grad_norm": 0.2760716676712036, + "learning_rate": 5.6251472778930345e-05, + "loss": 1.742, + "step": 15519 + }, + { + "epoch": 4.76365868631062, + "grad_norm": 0.2138829231262207, + "learning_rate": 5.624654119202197e-05, + "loss": 1.7093, + "step": 15520 + }, + { + "epoch": 4.763965623081646, + "grad_norm": 0.31404614448547363, + "learning_rate": 5.624160954338162e-05, + "loss": 1.7467, + "step": 15521 + }, + { + "epoch": 4.76427255985267, + "grad_norm": 0.24810083210468292, + "learning_rate": 5.623667783305803e-05, + "loss": 1.745, + "step": 15522 + }, + { + "epoch": 4.764579496623695, + "grad_norm": 0.23674242198467255, + "learning_rate": 5.6231746061099913e-05, + "loss": 1.7662, + "step": 15523 + }, + { + "epoch": 4.764886433394721, + "grad_norm": 0.264230877161026, + "learning_rate": 5.622681422755606e-05, + "loss": 1.7627, + "step": 15524 + }, + { + "epoch": 4.765193370165746, + "grad_norm": 0.2982041537761688, + "learning_rate": 5.6221882332475165e-05, + "loss": 1.7558, + "step": 15525 + }, + { + "epoch": 4.765500306936771, + "grad_norm": 0.29215967655181885, + "learning_rate": 5.6216950375905975e-05, + "loss": 1.7981, + "step": 15526 + }, + { + "epoch": 4.765807243707796, + "grad_norm": 0.20014487206935883, + "learning_rate": 5.6212018357897244e-05, + "loss": 1.7113, + "step": 15527 + }, + { + "epoch": 4.766114180478821, + "grad_norm": 0.22359825670719147, + 
"learning_rate": 5.620708627849769e-05, + "loss": 1.7356, + "step": 15528 + }, + { + "epoch": 4.7664211172498465, + "grad_norm": 0.2254783809185028, + "learning_rate": 5.620215413775609e-05, + "loss": 1.7397, + "step": 15529 + }, + { + "epoch": 4.766728054020872, + "grad_norm": 0.2827560305595398, + "learning_rate": 5.619722193572117e-05, + "loss": 1.732, + "step": 15530 + }, + { + "epoch": 4.767034990791897, + "grad_norm": 0.22591307759284973, + "learning_rate": 5.619228967244165e-05, + "loss": 1.7713, + "step": 15531 + }, + { + "epoch": 4.7673419275629225, + "grad_norm": 0.25872737169265747, + "learning_rate": 5.618735734796632e-05, + "loss": 1.7291, + "step": 15532 + }, + { + "epoch": 4.767648864333947, + "grad_norm": 0.24515275657176971, + "learning_rate": 5.6182424962343884e-05, + "loss": 1.8079, + "step": 15533 + }, + { + "epoch": 4.767955801104972, + "grad_norm": 0.2456643134355545, + "learning_rate": 5.617749251562309e-05, + "loss": 1.7082, + "step": 15534 + }, + { + "epoch": 4.768262737875998, + "grad_norm": 0.21684220433235168, + "learning_rate": 5.6172560007852716e-05, + "loss": 1.7563, + "step": 15535 + }, + { + "epoch": 4.768569674647023, + "grad_norm": 0.2141445428133011, + "learning_rate": 5.616762743908147e-05, + "loss": 1.7115, + "step": 15536 + }, + { + "epoch": 4.768876611418047, + "grad_norm": 0.22502638399600983, + "learning_rate": 5.616269480935812e-05, + "loss": 1.723, + "step": 15537 + }, + { + "epoch": 4.769183548189073, + "grad_norm": 0.23387989401817322, + "learning_rate": 5.6157762118731416e-05, + "loss": 1.7775, + "step": 15538 + }, + { + "epoch": 4.769490484960098, + "grad_norm": 0.19615057110786438, + "learning_rate": 5.6152829367250096e-05, + "loss": 1.7696, + "step": 15539 + }, + { + "epoch": 4.769797421731123, + "grad_norm": 0.2408154010772705, + "learning_rate": 5.614789655496289e-05, + "loss": 1.7758, + "step": 15540 + }, + { + "epoch": 4.770104358502149, + "grad_norm": 0.20994634926319122, + "learning_rate": 
5.614296368191859e-05, + "loss": 1.6935, + "step": 15541 + }, + { + "epoch": 4.770411295273174, + "grad_norm": 0.24135129153728485, + "learning_rate": 5.613803074816591e-05, + "loss": 1.7644, + "step": 15542 + }, + { + "epoch": 4.7707182320441985, + "grad_norm": 0.2380143105983734, + "learning_rate": 5.6133097753753625e-05, + "loss": 1.741, + "step": 15543 + }, + { + "epoch": 4.771025168815224, + "grad_norm": 0.30300623178482056, + "learning_rate": 5.6128164698730465e-05, + "loss": 1.7935, + "step": 15544 + }, + { + "epoch": 4.771332105586249, + "grad_norm": 0.2620760500431061, + "learning_rate": 5.612323158314519e-05, + "loss": 1.7436, + "step": 15545 + }, + { + "epoch": 4.7716390423572745, + "grad_norm": 0.3791491389274597, + "learning_rate": 5.6118298407046544e-05, + "loss": 1.7503, + "step": 15546 + }, + { + "epoch": 4.7719459791283, + "grad_norm": 0.3830909729003906, + "learning_rate": 5.61133651704833e-05, + "loss": 1.7651, + "step": 15547 + }, + { + "epoch": 4.772252915899324, + "grad_norm": 0.26680612564086914, + "learning_rate": 5.610843187350419e-05, + "loss": 1.8075, + "step": 15548 + }, + { + "epoch": 4.77255985267035, + "grad_norm": 0.38018953800201416, + "learning_rate": 5.610349851615798e-05, + "loss": 1.8301, + "step": 15549 + }, + { + "epoch": 4.772866789441375, + "grad_norm": 0.4514484107494354, + "learning_rate": 5.6098565098493414e-05, + "loss": 1.7709, + "step": 15550 + }, + { + "epoch": 4.7731737262124, + "grad_norm": 0.28267863392829895, + "learning_rate": 5.6093631620559254e-05, + "loss": 1.8087, + "step": 15551 + }, + { + "epoch": 4.773480662983426, + "grad_norm": 0.22541162371635437, + "learning_rate": 5.6088698082404256e-05, + "loss": 1.7457, + "step": 15552 + }, + { + "epoch": 4.773787599754451, + "grad_norm": 0.3012544512748718, + "learning_rate": 5.608376448407718e-05, + "loss": 1.7454, + "step": 15553 + }, + { + "epoch": 4.774094536525475, + "grad_norm": 0.2460169941186905, + "learning_rate": 5.607883082562677e-05, + "loss": 1.8237, + 
"step": 15554 + }, + { + "epoch": 4.774401473296501, + "grad_norm": 0.2918507158756256, + "learning_rate": 5.6073897107101804e-05, + "loss": 1.7416, + "step": 15555 + }, + { + "epoch": 4.774708410067526, + "grad_norm": 0.3104710280895233, + "learning_rate": 5.6068963328551016e-05, + "loss": 1.8162, + "step": 15556 + }, + { + "epoch": 4.7750153468385514, + "grad_norm": 0.2576459050178528, + "learning_rate": 5.606402949002317e-05, + "loss": 1.7732, + "step": 15557 + }, + { + "epoch": 4.775322283609577, + "grad_norm": 0.2373739629983902, + "learning_rate": 5.605909559156706e-05, + "loss": 1.7812, + "step": 15558 + }, + { + "epoch": 4.775629220380601, + "grad_norm": 0.30436694622039795, + "learning_rate": 5.6054161633231385e-05, + "loss": 1.7606, + "step": 15559 + }, + { + "epoch": 4.775936157151627, + "grad_norm": 0.3058558702468872, + "learning_rate": 5.604922761506495e-05, + "loss": 1.8384, + "step": 15560 + }, + { + "epoch": 4.776243093922652, + "grad_norm": 0.26421624422073364, + "learning_rate": 5.6044293537116496e-05, + "loss": 1.8041, + "step": 15561 + }, + { + "epoch": 4.776550030693677, + "grad_norm": 0.4945085346698761, + "learning_rate": 5.603935939943479e-05, + "loss": 1.7522, + "step": 15562 + }, + { + "epoch": 4.776856967464703, + "grad_norm": 0.41049134731292725, + "learning_rate": 5.6034425202068595e-05, + "loss": 1.7471, + "step": 15563 + }, + { + "epoch": 4.777163904235728, + "grad_norm": 0.22972853481769562, + "learning_rate": 5.602949094506668e-05, + "loss": 1.7041, + "step": 15564 + }, + { + "epoch": 4.777470841006752, + "grad_norm": 0.37373700737953186, + "learning_rate": 5.6024556628477785e-05, + "loss": 1.7811, + "step": 15565 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.3603375554084778, + "learning_rate": 5.6019622252350714e-05, + "loss": 1.8396, + "step": 15566 + }, + { + "epoch": 4.778084714548803, + "grad_norm": 0.2085956335067749, + "learning_rate": 5.601468781673419e-05, + "loss": 1.7453, + "step": 15567 + }, + { + "epoch": 
4.778391651319828, + "grad_norm": 0.28871405124664307, + "learning_rate": 5.6009753321677e-05, + "loss": 1.7135, + "step": 15568 + }, + { + "epoch": 4.778698588090853, + "grad_norm": 0.2378411591053009, + "learning_rate": 5.600481876722791e-05, + "loss": 1.77, + "step": 15569 + }, + { + "epoch": 4.779005524861878, + "grad_norm": 0.2902696430683136, + "learning_rate": 5.599988415343567e-05, + "loss": 1.7416, + "step": 15570 + }, + { + "epoch": 4.7793124616329035, + "grad_norm": 0.36155447363853455, + "learning_rate": 5.5994949480349066e-05, + "loss": 1.7095, + "step": 15571 + }, + { + "epoch": 4.779619398403929, + "grad_norm": 0.24867403507232666, + "learning_rate": 5.599001474801686e-05, + "loss": 1.8063, + "step": 15572 + }, + { + "epoch": 4.779926335174954, + "grad_norm": 0.24853186309337616, + "learning_rate": 5.5985079956487815e-05, + "loss": 1.7537, + "step": 15573 + }, + { + "epoch": 4.7802332719459795, + "grad_norm": 0.31984636187553406, + "learning_rate": 5.598014510581071e-05, + "loss": 1.7888, + "step": 15574 + }, + { + "epoch": 4.780540208717004, + "grad_norm": 0.23907123506069183, + "learning_rate": 5.597521019603429e-05, + "loss": 1.7157, + "step": 15575 + }, + { + "epoch": 4.780847145488029, + "grad_norm": 0.25759413838386536, + "learning_rate": 5.597027522720736e-05, + "loss": 1.7579, + "step": 15576 + }, + { + "epoch": 4.781154082259055, + "grad_norm": 0.34123921394348145, + "learning_rate": 5.5965340199378654e-05, + "loss": 1.838, + "step": 15577 + }, + { + "epoch": 4.78146101903008, + "grad_norm": 0.2769980728626251, + "learning_rate": 5.596040511259697e-05, + "loss": 1.7889, + "step": 15578 + }, + { + "epoch": 4.781767955801105, + "grad_norm": 0.21936915814876556, + "learning_rate": 5.5955469966911066e-05, + "loss": 1.7434, + "step": 15579 + }, + { + "epoch": 4.78207489257213, + "grad_norm": 0.27583181858062744, + "learning_rate": 5.59505347623697e-05, + "loss": 1.7229, + "step": 15580 + }, + { + "epoch": 4.782381829343155, + "grad_norm": 
0.24246171116828918, + "learning_rate": 5.594559949902168e-05, + "loss": 1.7368, + "step": 15581 + }, + { + "epoch": 4.78268876611418, + "grad_norm": 0.22705630958080292, + "learning_rate": 5.594066417691576e-05, + "loss": 1.7261, + "step": 15582 + }, + { + "epoch": 4.782995702885206, + "grad_norm": 0.23308728635311127, + "learning_rate": 5.593572879610072e-05, + "loss": 1.7451, + "step": 15583 + }, + { + "epoch": 4.783302639656231, + "grad_norm": 0.21654267609119415, + "learning_rate": 5.5930793356625324e-05, + "loss": 1.7133, + "step": 15584 + }, + { + "epoch": 4.783609576427256, + "grad_norm": 0.22884133458137512, + "learning_rate": 5.5925857858538347e-05, + "loss": 1.6899, + "step": 15585 + }, + { + "epoch": 4.783916513198281, + "grad_norm": 0.2396838665008545, + "learning_rate": 5.5920922301888555e-05, + "loss": 1.7837, + "step": 15586 + }, + { + "epoch": 4.784223449969306, + "grad_norm": 0.22941450774669647, + "learning_rate": 5.5915986686724765e-05, + "loss": 1.7443, + "step": 15587 + }, + { + "epoch": 4.7845303867403315, + "grad_norm": 0.23992502689361572, + "learning_rate": 5.591105101309572e-05, + "loss": 1.8054, + "step": 15588 + }, + { + "epoch": 4.784837323511357, + "grad_norm": 0.2540588974952698, + "learning_rate": 5.59061152810502e-05, + "loss": 1.855, + "step": 15589 + }, + { + "epoch": 4.785144260282382, + "grad_norm": 0.22691720724105835, + "learning_rate": 5.590117949063699e-05, + "loss": 1.7441, + "step": 15590 + }, + { + "epoch": 4.785451197053407, + "grad_norm": 0.23691289126873016, + "learning_rate": 5.5896243641904864e-05, + "loss": 1.8156, + "step": 15591 + }, + { + "epoch": 4.785758133824432, + "grad_norm": 0.2749332785606384, + "learning_rate": 5.589130773490261e-05, + "loss": 1.8157, + "step": 15592 + }, + { + "epoch": 4.786065070595457, + "grad_norm": 0.2435624748468399, + "learning_rate": 5.588637176967899e-05, + "loss": 1.7473, + "step": 15593 + }, + { + "epoch": 4.786372007366483, + "grad_norm": 0.22931383550167084, + 
"learning_rate": 5.5881435746282795e-05, + "loss": 1.7652, + "step": 15594 + }, + { + "epoch": 4.786678944137508, + "grad_norm": 0.23916593194007874, + "learning_rate": 5.587649966476282e-05, + "loss": 1.7415, + "step": 15595 + }, + { + "epoch": 4.786985880908533, + "grad_norm": 0.23483172059059143, + "learning_rate": 5.5871563525167814e-05, + "loss": 1.7308, + "step": 15596 + }, + { + "epoch": 4.787292817679558, + "grad_norm": 0.24850021302700043, + "learning_rate": 5.586662732754656e-05, + "loss": 1.8294, + "step": 15597 + }, + { + "epoch": 4.787599754450583, + "grad_norm": 0.2439260333776474, + "learning_rate": 5.586169107194788e-05, + "loss": 1.7599, + "step": 15598 + }, + { + "epoch": 4.787906691221608, + "grad_norm": 0.22379007935523987, + "learning_rate": 5.585675475842054e-05, + "loss": 1.7278, + "step": 15599 + }, + { + "epoch": 4.788213627992634, + "grad_norm": 0.2633908689022064, + "learning_rate": 5.58518183870133e-05, + "loss": 1.7318, + "step": 15600 + }, + { + "epoch": 4.788520564763659, + "grad_norm": 0.20992474257946014, + "learning_rate": 5.584688195777497e-05, + "loss": 1.7003, + "step": 15601 + }, + { + "epoch": 4.7888275015346835, + "grad_norm": 0.2460084706544876, + "learning_rate": 5.584194547075432e-05, + "loss": 1.78, + "step": 15602 + }, + { + "epoch": 4.789134438305709, + "grad_norm": 0.23955418169498444, + "learning_rate": 5.583700892600013e-05, + "loss": 1.7953, + "step": 15603 + }, + { + "epoch": 4.789441375076734, + "grad_norm": 0.2495713233947754, + "learning_rate": 5.583207232356121e-05, + "loss": 1.7874, + "step": 15604 + }, + { + "epoch": 4.7897483118477595, + "grad_norm": 0.22878028452396393, + "learning_rate": 5.5827135663486344e-05, + "loss": 1.7961, + "step": 15605 + }, + { + "epoch": 4.790055248618785, + "grad_norm": 0.2299363762140274, + "learning_rate": 5.582219894582429e-05, + "loss": 1.7497, + "step": 15606 + }, + { + "epoch": 4.79036218538981, + "grad_norm": 0.22896108031272888, + "learning_rate": 5.5817262170623865e-05, 
+ "loss": 1.7543, + "step": 15607 + }, + { + "epoch": 4.790669122160835, + "grad_norm": 0.2150495946407318, + "learning_rate": 5.581232533793383e-05, + "loss": 1.8034, + "step": 15608 + }, + { + "epoch": 4.79097605893186, + "grad_norm": 0.21317999064922333, + "learning_rate": 5.580738844780301e-05, + "loss": 1.7482, + "step": 15609 + }, + { + "epoch": 4.791282995702885, + "grad_norm": 0.21904391050338745, + "learning_rate": 5.580245150028016e-05, + "loss": 1.7647, + "step": 15610 + }, + { + "epoch": 4.791589932473911, + "grad_norm": 0.2026481032371521, + "learning_rate": 5.5797514495414095e-05, + "loss": 1.6997, + "step": 15611 + }, + { + "epoch": 4.791896869244935, + "grad_norm": 0.22508487105369568, + "learning_rate": 5.579257743325359e-05, + "loss": 1.8258, + "step": 15612 + }, + { + "epoch": 4.79220380601596, + "grad_norm": 0.2801211178302765, + "learning_rate": 5.5787640313847435e-05, + "loss": 1.6991, + "step": 15613 + }, + { + "epoch": 4.792510742786986, + "grad_norm": 0.2696724236011505, + "learning_rate": 5.578270313724442e-05, + "loss": 1.7339, + "step": 15614 + }, + { + "epoch": 4.792817679558011, + "grad_norm": 0.2909143269062042, + "learning_rate": 5.577776590349334e-05, + "loss": 1.8481, + "step": 15615 + }, + { + "epoch": 4.793124616329036, + "grad_norm": 0.21682757139205933, + "learning_rate": 5.5772828612643005e-05, + "loss": 1.759, + "step": 15616 + }, + { + "epoch": 4.793431553100062, + "grad_norm": 0.23074059188365936, + "learning_rate": 5.576789126474219e-05, + "loss": 1.7652, + "step": 15617 + }, + { + "epoch": 4.793738489871086, + "grad_norm": 0.24018999934196472, + "learning_rate": 5.576295385983969e-05, + "loss": 1.7986, + "step": 15618 + }, + { + "epoch": 4.7940454266421115, + "grad_norm": 0.23987948894500732, + "learning_rate": 5.575801639798431e-05, + "loss": 1.779, + "step": 15619 + }, + { + "epoch": 4.794352363413137, + "grad_norm": 0.2138533890247345, + "learning_rate": 5.575307887922482e-05, + "loss": 1.7097, + "step": 15620 + }, + { 
+ "epoch": 4.794659300184162, + "grad_norm": 0.1995106190443039, + "learning_rate": 5.5748141303610044e-05, + "loss": 1.6924, + "step": 15621 + }, + { + "epoch": 4.7949662369551875, + "grad_norm": 0.23547641932964325, + "learning_rate": 5.574320367118877e-05, + "loss": 1.8492, + "step": 15622 + }, + { + "epoch": 4.795273173726212, + "grad_norm": 0.22931239008903503, + "learning_rate": 5.5738265982009794e-05, + "loss": 1.8054, + "step": 15623 + }, + { + "epoch": 4.795580110497237, + "grad_norm": 0.19957222044467926, + "learning_rate": 5.573332823612191e-05, + "loss": 1.7464, + "step": 15624 + }, + { + "epoch": 4.795887047268263, + "grad_norm": 0.1990327090024948, + "learning_rate": 5.5728390433573905e-05, + "loss": 1.7438, + "step": 15625 + }, + { + "epoch": 4.796193984039288, + "grad_norm": 0.22276802361011505, + "learning_rate": 5.572345257441459e-05, + "loss": 1.7674, + "step": 15626 + }, + { + "epoch": 4.796500920810313, + "grad_norm": 0.2109617441892624, + "learning_rate": 5.571851465869277e-05, + "loss": 1.7577, + "step": 15627 + }, + { + "epoch": 4.796807857581339, + "grad_norm": 0.22917217016220093, + "learning_rate": 5.5713576686457234e-05, + "loss": 1.7478, + "step": 15628 + }, + { + "epoch": 4.797114794352363, + "grad_norm": 0.21016938984394073, + "learning_rate": 5.570863865775678e-05, + "loss": 1.8078, + "step": 15629 + }, + { + "epoch": 4.797421731123388, + "grad_norm": 0.22478216886520386, + "learning_rate": 5.5703700572640215e-05, + "loss": 1.7621, + "step": 15630 + }, + { + "epoch": 4.797728667894414, + "grad_norm": 0.26899904012680054, + "learning_rate": 5.569876243115634e-05, + "loss": 1.8065, + "step": 15631 + }, + { + "epoch": 4.798035604665439, + "grad_norm": 0.23187808692455292, + "learning_rate": 5.569382423335394e-05, + "loss": 1.7337, + "step": 15632 + }, + { + "epoch": 4.798342541436464, + "grad_norm": 0.2264855057001114, + "learning_rate": 5.568888597928185e-05, + "loss": 1.7879, + "step": 15633 + }, + { + "epoch": 4.798649478207489, + 
"grad_norm": 0.244137242436409, + "learning_rate": 5.568394766898886e-05, + "loss": 1.8307, + "step": 15634 + }, + { + "epoch": 4.798956414978514, + "grad_norm": 0.2400583177804947, + "learning_rate": 5.5679009302523744e-05, + "loss": 1.76, + "step": 15635 + }, + { + "epoch": 4.7992633517495396, + "grad_norm": 0.2324059158563614, + "learning_rate": 5.5674070879935347e-05, + "loss": 1.7594, + "step": 15636 + }, + { + "epoch": 4.799570288520565, + "grad_norm": 0.21753786504268646, + "learning_rate": 5.566913240127244e-05, + "loss": 1.7568, + "step": 15637 + }, + { + "epoch": 4.79987722529159, + "grad_norm": 0.21557624638080597, + "learning_rate": 5.566419386658386e-05, + "loss": 1.7733, + "step": 15638 + }, + { + "epoch": 4.800184162062616, + "grad_norm": 0.22795113921165466, + "learning_rate": 5.565925527591839e-05, + "loss": 1.7624, + "step": 15639 + }, + { + "epoch": 4.80049109883364, + "grad_norm": 0.23035180568695068, + "learning_rate": 5.565431662932484e-05, + "loss": 1.7436, + "step": 15640 + }, + { + "epoch": 4.800798035604665, + "grad_norm": 0.2569425404071808, + "learning_rate": 5.564937792685203e-05, + "loss": 1.7027, + "step": 15641 + }, + { + "epoch": 4.801104972375691, + "grad_norm": 0.20544980466365814, + "learning_rate": 5.564443916854875e-05, + "loss": 1.7125, + "step": 15642 + }, + { + "epoch": 4.801411909146716, + "grad_norm": 0.25040850043296814, + "learning_rate": 5.5639500354463815e-05, + "loss": 1.7646, + "step": 15643 + }, + { + "epoch": 4.8017188459177405, + "grad_norm": 0.1991344839334488, + "learning_rate": 5.563456148464602e-05, + "loss": 1.7206, + "step": 15644 + }, + { + "epoch": 4.802025782688766, + "grad_norm": 0.236537903547287, + "learning_rate": 5.56296225591442e-05, + "loss": 1.7288, + "step": 15645 + }, + { + "epoch": 4.802332719459791, + "grad_norm": 0.253619521856308, + "learning_rate": 5.562468357800714e-05, + "loss": 1.7347, + "step": 15646 + }, + { + "epoch": 4.8026396562308165, + "grad_norm": 0.22038741409778595, + 
"learning_rate": 5.561974454128367e-05, + "loss": 1.7854, + "step": 15647 + }, + { + "epoch": 4.802946593001842, + "grad_norm": 0.24848157167434692, + "learning_rate": 5.5614805449022576e-05, + "loss": 1.6904, + "step": 15648 + }, + { + "epoch": 4.803253529772867, + "grad_norm": 0.28735271096229553, + "learning_rate": 5.56098663012727e-05, + "loss": 1.7476, + "step": 15649 + }, + { + "epoch": 4.803560466543892, + "grad_norm": 0.2658432722091675, + "learning_rate": 5.5604927098082825e-05, + "loss": 1.7314, + "step": 15650 + }, + { + "epoch": 4.803867403314917, + "grad_norm": 0.20409154891967773, + "learning_rate": 5.559998783950179e-05, + "loss": 1.7698, + "step": 15651 + }, + { + "epoch": 4.804174340085942, + "grad_norm": 0.21932728588581085, + "learning_rate": 5.5595048525578384e-05, + "loss": 1.7808, + "step": 15652 + }, + { + "epoch": 4.804481276856968, + "grad_norm": 0.2549879848957062, + "learning_rate": 5.559010915636143e-05, + "loss": 1.8294, + "step": 15653 + }, + { + "epoch": 4.804788213627993, + "grad_norm": 0.2002289742231369, + "learning_rate": 5.5585169731899736e-05, + "loss": 1.732, + "step": 15654 + }, + { + "epoch": 4.805095150399017, + "grad_norm": 0.19988931715488434, + "learning_rate": 5.558023025224212e-05, + "loss": 1.7482, + "step": 15655 + }, + { + "epoch": 4.805402087170043, + "grad_norm": 0.21265259385108948, + "learning_rate": 5.55752907174374e-05, + "loss": 1.8003, + "step": 15656 + }, + { + "epoch": 4.805709023941068, + "grad_norm": 0.22365640103816986, + "learning_rate": 5.5570351127534395e-05, + "loss": 1.7536, + "step": 15657 + }, + { + "epoch": 4.806015960712093, + "grad_norm": 0.25516408681869507, + "learning_rate": 5.556541148258192e-05, + "loss": 1.7648, + "step": 15658 + }, + { + "epoch": 4.806322897483119, + "grad_norm": 0.24870765209197998, + "learning_rate": 5.5560471782628775e-05, + "loss": 1.7793, + "step": 15659 + }, + { + "epoch": 4.806629834254144, + "grad_norm": 0.22119416296482086, + "learning_rate": 
5.555553202772379e-05, + "loss": 1.7464, + "step": 15660 + }, + { + "epoch": 4.8069367710251685, + "grad_norm": 0.2781904637813568, + "learning_rate": 5.555059221791579e-05, + "loss": 1.7537, + "step": 15661 + }, + { + "epoch": 4.807243707796194, + "grad_norm": 0.2433774471282959, + "learning_rate": 5.5545652353253574e-05, + "loss": 1.74, + "step": 15662 + }, + { + "epoch": 4.807550644567219, + "grad_norm": 0.19932180643081665, + "learning_rate": 5.554071243378598e-05, + "loss": 1.75, + "step": 15663 + }, + { + "epoch": 4.8078575813382445, + "grad_norm": 0.2428865283727646, + "learning_rate": 5.553577245956182e-05, + "loss": 1.7198, + "step": 15664 + }, + { + "epoch": 4.80816451810927, + "grad_norm": 0.2914198338985443, + "learning_rate": 5.553083243062991e-05, + "loss": 1.7544, + "step": 15665 + }, + { + "epoch": 4.808471454880294, + "grad_norm": 0.2274291068315506, + "learning_rate": 5.5525892347039056e-05, + "loss": 1.8213, + "step": 15666 + }, + { + "epoch": 4.80877839165132, + "grad_norm": 0.23662471771240234, + "learning_rate": 5.552095220883811e-05, + "loss": 1.8025, + "step": 15667 + }, + { + "epoch": 4.809085328422345, + "grad_norm": 0.23062555491924286, + "learning_rate": 5.551601201607587e-05, + "loss": 1.7109, + "step": 15668 + }, + { + "epoch": 4.80939226519337, + "grad_norm": 0.19986943900585175, + "learning_rate": 5.551107176880117e-05, + "loss": 1.7442, + "step": 15669 + }, + { + "epoch": 4.809699201964396, + "grad_norm": 0.2545560300350189, + "learning_rate": 5.5506131467062836e-05, + "loss": 1.7609, + "step": 15670 + }, + { + "epoch": 4.810006138735421, + "grad_norm": 0.253296434879303, + "learning_rate": 5.550119111090968e-05, + "loss": 1.7307, + "step": 15671 + }, + { + "epoch": 4.810313075506445, + "grad_norm": 0.19617940485477448, + "learning_rate": 5.549625070039052e-05, + "loss": 1.7507, + "step": 15672 + }, + { + "epoch": 4.810620012277471, + "grad_norm": 0.2525297999382019, + "learning_rate": 5.5491310235554193e-05, + "loss": 1.8021, + 
"step": 15673 + }, + { + "epoch": 4.810926949048496, + "grad_norm": 0.20537389814853668, + "learning_rate": 5.548636971644953e-05, + "loss": 1.7432, + "step": 15674 + }, + { + "epoch": 4.811233885819521, + "grad_norm": 0.19924211502075195, + "learning_rate": 5.548142914312533e-05, + "loss": 1.7741, + "step": 15675 + }, + { + "epoch": 4.811540822590547, + "grad_norm": 0.21121448278427124, + "learning_rate": 5.547648851563046e-05, + "loss": 1.7198, + "step": 15676 + }, + { + "epoch": 4.811847759361571, + "grad_norm": 0.23504914343357086, + "learning_rate": 5.547154783401369e-05, + "loss": 1.7173, + "step": 15677 + }, + { + "epoch": 4.8121546961325965, + "grad_norm": 0.2362392097711563, + "learning_rate": 5.54666070983239e-05, + "loss": 1.7752, + "step": 15678 + }, + { + "epoch": 4.812461632903622, + "grad_norm": 0.2524966895580292, + "learning_rate": 5.5461666308609886e-05, + "loss": 1.7943, + "step": 15679 + }, + { + "epoch": 4.812768569674647, + "grad_norm": 0.2250952422618866, + "learning_rate": 5.5456725464920476e-05, + "loss": 1.7606, + "step": 15680 + }, + { + "epoch": 4.8130755064456725, + "grad_norm": 0.21753156185150146, + "learning_rate": 5.5451784567304524e-05, + "loss": 1.7846, + "step": 15681 + }, + { + "epoch": 4.813382443216698, + "grad_norm": 0.220795676112175, + "learning_rate": 5.5446843615810825e-05, + "loss": 1.7422, + "step": 15682 + }, + { + "epoch": 4.813689379987722, + "grad_norm": 0.23597733676433563, + "learning_rate": 5.544190261048823e-05, + "loss": 1.7818, + "step": 15683 + }, + { + "epoch": 4.813996316758748, + "grad_norm": 0.2625976502895355, + "learning_rate": 5.543696155138557e-05, + "loss": 1.7796, + "step": 15684 + }, + { + "epoch": 4.814303253529773, + "grad_norm": 0.20515871047973633, + "learning_rate": 5.5432020438551656e-05, + "loss": 1.7096, + "step": 15685 + }, + { + "epoch": 4.814610190300798, + "grad_norm": 0.19353924691677094, + "learning_rate": 5.542707927203536e-05, + "loss": 1.7541, + "step": 15686 + }, + { + "epoch": 
4.814917127071823, + "grad_norm": 0.21998172998428345, + "learning_rate": 5.5422138051885454e-05, + "loss": 1.7696, + "step": 15687 + }, + { + "epoch": 4.815224063842848, + "grad_norm": 0.27576857805252075, + "learning_rate": 5.5417196778150816e-05, + "loss": 1.7491, + "step": 15688 + }, + { + "epoch": 4.815531000613873, + "grad_norm": 0.28202036023139954, + "learning_rate": 5.5412255450880254e-05, + "loss": 1.8615, + "step": 15689 + }, + { + "epoch": 4.815837937384899, + "grad_norm": 0.29632845520973206, + "learning_rate": 5.540731407012263e-05, + "loss": 1.7698, + "step": 15690 + }, + { + "epoch": 4.816144874155924, + "grad_norm": 0.35393890738487244, + "learning_rate": 5.540237263592675e-05, + "loss": 1.7924, + "step": 15691 + }, + { + "epoch": 4.816451810926949, + "grad_norm": 0.23756493628025055, + "learning_rate": 5.5397431148341447e-05, + "loss": 1.8301, + "step": 15692 + }, + { + "epoch": 4.816758747697974, + "grad_norm": 0.310153603553772, + "learning_rate": 5.53924896074156e-05, + "loss": 1.8162, + "step": 15693 + }, + { + "epoch": 4.817065684468999, + "grad_norm": 0.3355565369129181, + "learning_rate": 5.538754801319797e-05, + "loss": 1.7738, + "step": 15694 + }, + { + "epoch": 4.8173726212400245, + "grad_norm": 0.2360079288482666, + "learning_rate": 5.5382606365737446e-05, + "loss": 1.6883, + "step": 15695 + }, + { + "epoch": 4.81767955801105, + "grad_norm": 0.2932819724082947, + "learning_rate": 5.537766466508286e-05, + "loss": 1.8045, + "step": 15696 + }, + { + "epoch": 4.817986494782075, + "grad_norm": 0.31298181414604187, + "learning_rate": 5.537272291128304e-05, + "loss": 1.7516, + "step": 15697 + }, + { + "epoch": 4.8182934315531, + "grad_norm": 0.22871924936771393, + "learning_rate": 5.5367781104386806e-05, + "loss": 1.7386, + "step": 15698 + }, + { + "epoch": 4.818600368324125, + "grad_norm": 0.27097782492637634, + "learning_rate": 5.5362839244443034e-05, + "loss": 1.733, + "step": 15699 + }, + { + "epoch": 4.81890730509515, + "grad_norm": 
0.23296736180782318, + "learning_rate": 5.535789733150052e-05, + "loss": 1.7735, + "step": 15700 + }, + { + "epoch": 4.819214241866176, + "grad_norm": 0.22650237381458282, + "learning_rate": 5.5352955365608125e-05, + "loss": 1.7443, + "step": 15701 + }, + { + "epoch": 4.819521178637201, + "grad_norm": 0.25525161623954773, + "learning_rate": 5.534801334681471e-05, + "loss": 1.7379, + "step": 15702 + }, + { + "epoch": 4.819828115408226, + "grad_norm": 0.2249457836151123, + "learning_rate": 5.534307127516908e-05, + "loss": 1.7393, + "step": 15703 + }, + { + "epoch": 4.820135052179251, + "grad_norm": 0.1995566338300705, + "learning_rate": 5.5338129150720084e-05, + "loss": 1.7411, + "step": 15704 + }, + { + "epoch": 4.820441988950276, + "grad_norm": 0.250851035118103, + "learning_rate": 5.533318697351657e-05, + "loss": 1.7801, + "step": 15705 + }, + { + "epoch": 4.820748925721301, + "grad_norm": 0.3175830543041229, + "learning_rate": 5.532824474360737e-05, + "loss": 1.7553, + "step": 15706 + }, + { + "epoch": 4.821055862492327, + "grad_norm": 0.22842039167881012, + "learning_rate": 5.532330246104134e-05, + "loss": 1.7489, + "step": 15707 + }, + { + "epoch": 4.821362799263352, + "grad_norm": 0.21125485002994537, + "learning_rate": 5.531836012586732e-05, + "loss": 1.7543, + "step": 15708 + }, + { + "epoch": 4.8216697360343765, + "grad_norm": 0.33028700947761536, + "learning_rate": 5.531341773813414e-05, + "loss": 1.8237, + "step": 15709 + }, + { + "epoch": 4.821976672805402, + "grad_norm": 0.324564129114151, + "learning_rate": 5.530847529789067e-05, + "loss": 1.7288, + "step": 15710 + }, + { + "epoch": 4.822283609576427, + "grad_norm": 0.3299528956413269, + "learning_rate": 5.530353280518571e-05, + "loss": 1.7536, + "step": 15711 + }, + { + "epoch": 4.8225905463474525, + "grad_norm": 0.3535030782222748, + "learning_rate": 5.5298590260068136e-05, + "loss": 1.7941, + "step": 15712 + }, + { + "epoch": 4.822897483118478, + "grad_norm": 0.2627669870853424, + "learning_rate": 
5.5293647662586804e-05, + "loss": 1.7638, + "step": 15713 + }, + { + "epoch": 4.823204419889503, + "grad_norm": 0.25569450855255127, + "learning_rate": 5.5288705012790535e-05, + "loss": 1.7396, + "step": 15714 + }, + { + "epoch": 4.823511356660528, + "grad_norm": 0.26099520921707153, + "learning_rate": 5.528376231072817e-05, + "loss": 1.7415, + "step": 15715 + }, + { + "epoch": 4.823818293431553, + "grad_norm": 0.31833693385124207, + "learning_rate": 5.527881955644858e-05, + "loss": 1.7683, + "step": 15716 + }, + { + "epoch": 4.824125230202578, + "grad_norm": 0.2753448188304901, + "learning_rate": 5.5273876750000594e-05, + "loss": 1.6653, + "step": 15717 + }, + { + "epoch": 4.824432166973604, + "grad_norm": 0.23816895484924316, + "learning_rate": 5.526893389143307e-05, + "loss": 1.7575, + "step": 15718 + }, + { + "epoch": 4.824739103744628, + "grad_norm": 0.25376051664352417, + "learning_rate": 5.5263990980794856e-05, + "loss": 1.755, + "step": 15719 + }, + { + "epoch": 4.8250460405156534, + "grad_norm": 0.2483726590871811, + "learning_rate": 5.52590480181348e-05, + "loss": 1.7566, + "step": 15720 + }, + { + "epoch": 4.825352977286679, + "grad_norm": 0.2073517143726349, + "learning_rate": 5.5254105003501746e-05, + "loss": 1.7069, + "step": 15721 + }, + { + "epoch": 4.825659914057704, + "grad_norm": 0.3166659474372864, + "learning_rate": 5.524916193694455e-05, + "loss": 1.7012, + "step": 15722 + }, + { + "epoch": 4.8259668508287294, + "grad_norm": 0.24518641829490662, + "learning_rate": 5.524421881851205e-05, + "loss": 1.7027, + "step": 15723 + }, + { + "epoch": 4.826273787599755, + "grad_norm": 0.23137906193733215, + "learning_rate": 5.523927564825311e-05, + "loss": 1.746, + "step": 15724 + }, + { + "epoch": 4.82658072437078, + "grad_norm": 0.27937051653862, + "learning_rate": 5.5234332426216586e-05, + "loss": 1.7064, + "step": 15725 + }, + { + "epoch": 4.826887661141805, + "grad_norm": 0.26408496499061584, + "learning_rate": 5.522938915245131e-05, + "loss": 
1.6598, + "step": 15726 + }, + { + "epoch": 4.82719459791283, + "grad_norm": 0.22269997000694275, + "learning_rate": 5.5224445827006164e-05, + "loss": 1.7166, + "step": 15727 + }, + { + "epoch": 4.827501534683855, + "grad_norm": 0.22687453031539917, + "learning_rate": 5.5219502449929964e-05, + "loss": 1.7156, + "step": 15728 + }, + { + "epoch": 4.827808471454881, + "grad_norm": 0.26355600357055664, + "learning_rate": 5.5214559021271585e-05, + "loss": 1.8016, + "step": 15729 + }, + { + "epoch": 4.828115408225905, + "grad_norm": 0.30103012919425964, + "learning_rate": 5.520961554107987e-05, + "loss": 1.7856, + "step": 15730 + }, + { + "epoch": 4.82842234499693, + "grad_norm": 0.22604018449783325, + "learning_rate": 5.520467200940369e-05, + "loss": 1.813, + "step": 15731 + }, + { + "epoch": 4.828729281767956, + "grad_norm": 0.25435203313827515, + "learning_rate": 5.51997284262919e-05, + "loss": 1.7511, + "step": 15732 + }, + { + "epoch": 4.829036218538981, + "grad_norm": 0.2740691304206848, + "learning_rate": 5.519478479179333e-05, + "loss": 1.7326, + "step": 15733 + }, + { + "epoch": 4.829343155310006, + "grad_norm": 0.19710861146450043, + "learning_rate": 5.5189841105956866e-05, + "loss": 1.7581, + "step": 15734 + }, + { + "epoch": 4.829650092081032, + "grad_norm": 0.2315293401479721, + "learning_rate": 5.518489736883132e-05, + "loss": 1.6796, + "step": 15735 + }, + { + "epoch": 4.829957028852056, + "grad_norm": 0.2465476542711258, + "learning_rate": 5.51799535804656e-05, + "loss": 1.7276, + "step": 15736 + }, + { + "epoch": 4.8302639656230815, + "grad_norm": 0.20438486337661743, + "learning_rate": 5.5175009740908546e-05, + "loss": 1.7188, + "step": 15737 + }, + { + "epoch": 4.830570902394107, + "grad_norm": 0.24328351020812988, + "learning_rate": 5.5170065850209016e-05, + "loss": 1.7165, + "step": 15738 + }, + { + "epoch": 4.830877839165132, + "grad_norm": 0.22486837208271027, + "learning_rate": 5.516512190841586e-05, + "loss": 1.7369, + "step": 15739 + }, + { + 
"epoch": 4.8311847759361575, + "grad_norm": 0.2065822333097458, + "learning_rate": 5.5160177915577934e-05, + "loss": 1.7125, + "step": 15740 + }, + { + "epoch": 4.831491712707182, + "grad_norm": 0.21223095059394836, + "learning_rate": 5.5155233871744104e-05, + "loss": 1.7319, + "step": 15741 + }, + { + "epoch": 4.831798649478207, + "grad_norm": 0.25712934136390686, + "learning_rate": 5.515028977696325e-05, + "loss": 1.7847, + "step": 15742 + }, + { + "epoch": 4.832105586249233, + "grad_norm": 0.21289978921413422, + "learning_rate": 5.5145345631284215e-05, + "loss": 1.7629, + "step": 15743 + }, + { + "epoch": 4.832412523020258, + "grad_norm": 0.22347134351730347, + "learning_rate": 5.514040143475585e-05, + "loss": 1.7491, + "step": 15744 + }, + { + "epoch": 4.832719459791283, + "grad_norm": 0.20660510659217834, + "learning_rate": 5.513545718742702e-05, + "loss": 1.7377, + "step": 15745 + }, + { + "epoch": 4.833026396562309, + "grad_norm": 0.21612273156642914, + "learning_rate": 5.513051288934658e-05, + "loss": 1.7973, + "step": 15746 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.22515933215618134, + "learning_rate": 5.512556854056342e-05, + "loss": 1.7774, + "step": 15747 + }, + { + "epoch": 4.833640270104358, + "grad_norm": 0.21075554192066193, + "learning_rate": 5.512062414112639e-05, + "loss": 1.7741, + "step": 15748 + }, + { + "epoch": 4.833947206875384, + "grad_norm": 0.2203720659017563, + "learning_rate": 5.511567969108436e-05, + "loss": 1.7902, + "step": 15749 + }, + { + "epoch": 4.834254143646409, + "grad_norm": 0.20247167348861694, + "learning_rate": 5.511073519048616e-05, + "loss": 1.7084, + "step": 15750 + }, + { + "epoch": 4.834561080417434, + "grad_norm": 0.247711181640625, + "learning_rate": 5.5105790639380695e-05, + "loss": 1.8465, + "step": 15751 + }, + { + "epoch": 4.834868017188459, + "grad_norm": 0.22866854071617126, + "learning_rate": 5.51008460378168e-05, + "loss": 1.7252, + "step": 15752 + }, + { + "epoch": 4.835174953959484, + 
"grad_norm": 0.2335643470287323, + "learning_rate": 5.5095901385843374e-05, + "loss": 1.703, + "step": 15753 + }, + { + "epoch": 4.8354818907305095, + "grad_norm": 0.20874348282814026, + "learning_rate": 5.509095668350926e-05, + "loss": 1.7114, + "step": 15754 + }, + { + "epoch": 4.835788827501535, + "grad_norm": 0.19156917929649353, + "learning_rate": 5.5086011930863314e-05, + "loss": 1.6975, + "step": 15755 + }, + { + "epoch": 4.83609576427256, + "grad_norm": 0.23480524122714996, + "learning_rate": 5.508106712795443e-05, + "loss": 1.8291, + "step": 15756 + }, + { + "epoch": 4.8364027010435855, + "grad_norm": 0.20430417358875275, + "learning_rate": 5.5076122274831454e-05, + "loss": 1.7605, + "step": 15757 + }, + { + "epoch": 4.83670963781461, + "grad_norm": 0.26790598034858704, + "learning_rate": 5.5071177371543256e-05, + "loss": 1.7541, + "step": 15758 + }, + { + "epoch": 4.837016574585635, + "grad_norm": 0.3339289724826813, + "learning_rate": 5.506623241813873e-05, + "loss": 1.7566, + "step": 15759 + }, + { + "epoch": 4.837323511356661, + "grad_norm": 0.30528193712234497, + "learning_rate": 5.5061287414666726e-05, + "loss": 1.7371, + "step": 15760 + }, + { + "epoch": 4.837630448127686, + "grad_norm": 0.21059657633304596, + "learning_rate": 5.5056342361176114e-05, + "loss": 1.7599, + "step": 15761 + }, + { + "epoch": 4.83793738489871, + "grad_norm": 0.27918973565101624, + "learning_rate": 5.5051397257715756e-05, + "loss": 1.7485, + "step": 15762 + }, + { + "epoch": 4.838244321669736, + "grad_norm": 0.23147793114185333, + "learning_rate": 5.5046452104334514e-05, + "loss": 1.7121, + "step": 15763 + }, + { + "epoch": 4.838551258440761, + "grad_norm": 0.22028742730617523, + "learning_rate": 5.5041506901081294e-05, + "loss": 1.803, + "step": 15764 + }, + { + "epoch": 4.838858195211786, + "grad_norm": 0.22840891778469086, + "learning_rate": 5.5036561648004946e-05, + "loss": 1.7555, + "step": 15765 + }, + { + "epoch": 4.839165131982812, + "grad_norm": 
0.2610893249511719, + "learning_rate": 5.503161634515433e-05, + "loss": 1.7873, + "step": 15766 + }, + { + "epoch": 4.839472068753837, + "grad_norm": 0.2530003786087036, + "learning_rate": 5.502667099257836e-05, + "loss": 1.7604, + "step": 15767 + }, + { + "epoch": 4.8397790055248615, + "grad_norm": 0.20120400190353394, + "learning_rate": 5.5021725590325854e-05, + "loss": 1.7476, + "step": 15768 + }, + { + "epoch": 4.840085942295887, + "grad_norm": 0.2189723700284958, + "learning_rate": 5.501678013844571e-05, + "loss": 1.7174, + "step": 15769 + }, + { + "epoch": 4.840392879066912, + "grad_norm": 0.2511899173259735, + "learning_rate": 5.501183463698683e-05, + "loss": 1.7589, + "step": 15770 + }, + { + "epoch": 4.8406998158379375, + "grad_norm": 0.24899333715438843, + "learning_rate": 5.5006889085998035e-05, + "loss": 1.7253, + "step": 15771 + }, + { + "epoch": 4.841006752608963, + "grad_norm": 0.21223559975624084, + "learning_rate": 5.5001943485528254e-05, + "loss": 1.6949, + "step": 15772 + }, + { + "epoch": 4.841313689379987, + "grad_norm": 0.21394596993923187, + "learning_rate": 5.499699783562632e-05, + "loss": 1.7827, + "step": 15773 + }, + { + "epoch": 4.841620626151013, + "grad_norm": 0.2379613220691681, + "learning_rate": 5.4992052136341134e-05, + "loss": 1.7968, + "step": 15774 + }, + { + "epoch": 4.841927562922038, + "grad_norm": 0.23748385906219482, + "learning_rate": 5.498710638772154e-05, + "loss": 1.797, + "step": 15775 + }, + { + "epoch": 4.842234499693063, + "grad_norm": 0.2502206265926361, + "learning_rate": 5.498216058981646e-05, + "loss": 1.7292, + "step": 15776 + }, + { + "epoch": 4.842541436464089, + "grad_norm": 0.23613516986370087, + "learning_rate": 5.497721474267475e-05, + "loss": 1.7353, + "step": 15777 + }, + { + "epoch": 4.842848373235114, + "grad_norm": 0.25274696946144104, + "learning_rate": 5.497226884634527e-05, + "loss": 1.7782, + "step": 15778 + }, + { + "epoch": 4.843155310006138, + "grad_norm": 0.19574183225631714, + 
"learning_rate": 5.496732290087694e-05, + "loss": 1.6926, + "step": 15779 + }, + { + "epoch": 4.843462246777164, + "grad_norm": 0.21040405333042145, + "learning_rate": 5.496237690631858e-05, + "loss": 1.7235, + "step": 15780 + }, + { + "epoch": 4.843769183548189, + "grad_norm": 0.22499679028987885, + "learning_rate": 5.495743086271913e-05, + "loss": 1.7889, + "step": 15781 + }, + { + "epoch": 4.844076120319214, + "grad_norm": 0.24623246490955353, + "learning_rate": 5.4952484770127433e-05, + "loss": 1.7357, + "step": 15782 + }, + { + "epoch": 4.84438305709024, + "grad_norm": 0.21706275641918182, + "learning_rate": 5.494753862859238e-05, + "loss": 1.7349, + "step": 15783 + }, + { + "epoch": 4.844689993861264, + "grad_norm": 0.20705166459083557, + "learning_rate": 5.4942592438162855e-05, + "loss": 1.7047, + "step": 15784 + }, + { + "epoch": 4.8449969306322895, + "grad_norm": 0.21216751635074615, + "learning_rate": 5.493764619888773e-05, + "loss": 1.7335, + "step": 15785 + }, + { + "epoch": 4.845303867403315, + "grad_norm": 0.2945895195007324, + "learning_rate": 5.493269991081588e-05, + "loss": 1.838, + "step": 15786 + }, + { + "epoch": 4.84561080417434, + "grad_norm": 0.22013652324676514, + "learning_rate": 5.492775357399621e-05, + "loss": 1.7541, + "step": 15787 + }, + { + "epoch": 4.8459177409453655, + "grad_norm": 0.25428512692451477, + "learning_rate": 5.4922807188477585e-05, + "loss": 1.7405, + "step": 15788 + }, + { + "epoch": 4.846224677716391, + "grad_norm": 0.23189012706279755, + "learning_rate": 5.49178607543089e-05, + "loss": 1.8075, + "step": 15789 + }, + { + "epoch": 4.846531614487415, + "grad_norm": 0.21637389063835144, + "learning_rate": 5.491291427153904e-05, + "loss": 1.7229, + "step": 15790 + }, + { + "epoch": 4.846838551258441, + "grad_norm": 0.20628009736537933, + "learning_rate": 5.490796774021687e-05, + "loss": 1.7605, + "step": 15791 + }, + { + "epoch": 4.847145488029466, + "grad_norm": 0.20845308899879456, + "learning_rate": 
5.4903021160391276e-05, + "loss": 1.7864, + "step": 15792 + }, + { + "epoch": 4.847452424800491, + "grad_norm": 0.20367322862148285, + "learning_rate": 5.4898074532111164e-05, + "loss": 1.733, + "step": 15793 + }, + { + "epoch": 4.847759361571516, + "grad_norm": 0.2066505253314972, + "learning_rate": 5.489312785542543e-05, + "loss": 1.7113, + "step": 15794 + }, + { + "epoch": 4.848066298342541, + "grad_norm": 0.23874987661838531, + "learning_rate": 5.488818113038292e-05, + "loss": 1.7735, + "step": 15795 + }, + { + "epoch": 4.848373235113566, + "grad_norm": 0.26583850383758545, + "learning_rate": 5.488323435703254e-05, + "loss": 1.8019, + "step": 15796 + }, + { + "epoch": 4.848680171884592, + "grad_norm": 0.25207552313804626, + "learning_rate": 5.487828753542317e-05, + "loss": 1.7491, + "step": 15797 + }, + { + "epoch": 4.848987108655617, + "grad_norm": 0.23065905272960663, + "learning_rate": 5.48733406656037e-05, + "loss": 1.7451, + "step": 15798 + }, + { + "epoch": 4.849294045426642, + "grad_norm": 0.26914483308792114, + "learning_rate": 5.486839374762304e-05, + "loss": 1.7553, + "step": 15799 + }, + { + "epoch": 4.849600982197668, + "grad_norm": 0.2509605884552002, + "learning_rate": 5.4863446781530046e-05, + "loss": 1.7124, + "step": 15800 + }, + { + "epoch": 4.849907918968692, + "grad_norm": 0.2618432343006134, + "learning_rate": 5.485849976737362e-05, + "loss": 1.7368, + "step": 15801 + }, + { + "epoch": 4.850214855739718, + "grad_norm": 0.46875160932540894, + "learning_rate": 5.485355270520266e-05, + "loss": 1.7883, + "step": 15802 + }, + { + "epoch": 4.850521792510743, + "grad_norm": 0.37585484981536865, + "learning_rate": 5.4848605595066025e-05, + "loss": 1.7894, + "step": 15803 + }, + { + "epoch": 4.850828729281768, + "grad_norm": 0.2244408279657364, + "learning_rate": 5.4843658437012646e-05, + "loss": 1.7394, + "step": 15804 + }, + { + "epoch": 4.851135666052793, + "grad_norm": 0.4061773419380188, + "learning_rate": 5.48387112310914e-05, + "loss": 
1.7703, + "step": 15805 + }, + { + "epoch": 4.851442602823818, + "grad_norm": 0.35925009846687317, + "learning_rate": 5.483376397735117e-05, + "loss": 1.7798, + "step": 15806 + }, + { + "epoch": 4.851749539594843, + "grad_norm": 0.23050184547901154, + "learning_rate": 5.482881667584084e-05, + "loss": 1.7984, + "step": 15807 + }, + { + "epoch": 4.852056476365869, + "grad_norm": 0.37308645248413086, + "learning_rate": 5.4823869326609335e-05, + "loss": 1.6747, + "step": 15808 + }, + { + "epoch": 4.852363413136894, + "grad_norm": 0.29826754331588745, + "learning_rate": 5.481892192970551e-05, + "loss": 1.7432, + "step": 15809 + }, + { + "epoch": 4.852670349907919, + "grad_norm": 0.23652370274066925, + "learning_rate": 5.4813974485178266e-05, + "loss": 1.7557, + "step": 15810 + }, + { + "epoch": 4.852977286678944, + "grad_norm": 0.40549808740615845, + "learning_rate": 5.4809026993076526e-05, + "loss": 1.7317, + "step": 15811 + }, + { + "epoch": 4.853284223449969, + "grad_norm": 0.3367961347103119, + "learning_rate": 5.4804079453449156e-05, + "loss": 1.7648, + "step": 15812 + }, + { + "epoch": 4.8535911602209945, + "grad_norm": 0.21629661321640015, + "learning_rate": 5.4799131866345055e-05, + "loss": 1.7986, + "step": 15813 + }, + { + "epoch": 4.85389809699202, + "grad_norm": 0.26381492614746094, + "learning_rate": 5.4794184231813105e-05, + "loss": 1.7401, + "step": 15814 + }, + { + "epoch": 4.854205033763045, + "grad_norm": 0.22319363057613373, + "learning_rate": 5.478923654990223e-05, + "loss": 1.7773, + "step": 15815 + }, + { + "epoch": 4.85451197053407, + "grad_norm": 0.2547159492969513, + "learning_rate": 5.4784288820661326e-05, + "loss": 1.8194, + "step": 15816 + }, + { + "epoch": 4.854818907305095, + "grad_norm": 0.29574522376060486, + "learning_rate": 5.477934104413925e-05, + "loss": 1.7351, + "step": 15817 + }, + { + "epoch": 4.85512584407612, + "grad_norm": 0.17389361560344696, + "learning_rate": 5.4774393220384945e-05, + "loss": 1.6957, + "step": 15818 + }, + { 
+ "epoch": 4.855432780847146, + "grad_norm": 0.23746751248836517, + "learning_rate": 5.476944534944728e-05, + "loss": 1.7713, + "step": 15819 + }, + { + "epoch": 4.855739717618171, + "grad_norm": 0.182356595993042, + "learning_rate": 5.476449743137516e-05, + "loss": 1.7144, + "step": 15820 + }, + { + "epoch": 4.856046654389196, + "grad_norm": 0.23716382682323456, + "learning_rate": 5.4759549466217475e-05, + "loss": 1.7451, + "step": 15821 + }, + { + "epoch": 4.856353591160221, + "grad_norm": 0.316806823015213, + "learning_rate": 5.475460145402313e-05, + "loss": 1.7823, + "step": 15822 + }, + { + "epoch": 4.856660527931246, + "grad_norm": 0.2333129197359085, + "learning_rate": 5.474965339484105e-05, + "loss": 1.7788, + "step": 15823 + }, + { + "epoch": 4.856967464702271, + "grad_norm": 0.21180212497711182, + "learning_rate": 5.47447052887201e-05, + "loss": 1.7513, + "step": 15824 + }, + { + "epoch": 4.857274401473297, + "grad_norm": 0.22641299664974213, + "learning_rate": 5.473975713570919e-05, + "loss": 1.7514, + "step": 15825 + }, + { + "epoch": 4.857581338244322, + "grad_norm": 0.3179668188095093, + "learning_rate": 5.473480893585723e-05, + "loss": 1.7939, + "step": 15826 + }, + { + "epoch": 4.8578882750153465, + "grad_norm": 0.27463147044181824, + "learning_rate": 5.472986068921309e-05, + "loss": 1.7487, + "step": 15827 + }, + { + "epoch": 4.858195211786372, + "grad_norm": 0.18621626496315002, + "learning_rate": 5.472491239582572e-05, + "loss": 1.7155, + "step": 15828 + }, + { + "epoch": 4.858502148557397, + "grad_norm": 0.2437327802181244, + "learning_rate": 5.471996405574399e-05, + "loss": 1.7586, + "step": 15829 + }, + { + "epoch": 4.8588090853284225, + "grad_norm": 0.26658934354782104, + "learning_rate": 5.47150156690168e-05, + "loss": 1.7331, + "step": 15830 + }, + { + "epoch": 4.859116022099448, + "grad_norm": 0.2257174700498581, + "learning_rate": 5.471006723569308e-05, + "loss": 1.7556, + "step": 15831 + }, + { + "epoch": 4.859422958870473, + 
"grad_norm": 0.25434550642967224, + "learning_rate": 5.470511875582168e-05, + "loss": 1.7196, + "step": 15832 + }, + { + "epoch": 4.859729895641498, + "grad_norm": 0.2251453697681427, + "learning_rate": 5.470017022945156e-05, + "loss": 1.7174, + "step": 15833 + }, + { + "epoch": 4.860036832412523, + "grad_norm": 0.2757972180843353, + "learning_rate": 5.469522165663161e-05, + "loss": 1.7701, + "step": 15834 + }, + { + "epoch": 4.860343769183548, + "grad_norm": 0.2771994173526764, + "learning_rate": 5.469027303741072e-05, + "loss": 1.8085, + "step": 15835 + }, + { + "epoch": 4.860650705954574, + "grad_norm": 0.23825454711914062, + "learning_rate": 5.468532437183781e-05, + "loss": 1.733, + "step": 15836 + }, + { + "epoch": 4.860957642725598, + "grad_norm": 0.18100066483020782, + "learning_rate": 5.468037565996177e-05, + "loss": 1.7012, + "step": 15837 + }, + { + "epoch": 4.861264579496623, + "grad_norm": 0.22552812099456787, + "learning_rate": 5.4675426901831506e-05, + "loss": 1.728, + "step": 15838 + }, + { + "epoch": 4.861571516267649, + "grad_norm": 0.2505643665790558, + "learning_rate": 5.467047809749595e-05, + "loss": 1.7219, + "step": 15839 + }, + { + "epoch": 4.861878453038674, + "grad_norm": 0.25920796394348145, + "learning_rate": 5.4665529247003975e-05, + "loss": 1.7945, + "step": 15840 + }, + { + "epoch": 4.862185389809699, + "grad_norm": 0.23549394309520721, + "learning_rate": 5.466058035040452e-05, + "loss": 1.7904, + "step": 15841 + }, + { + "epoch": 4.862492326580725, + "grad_norm": 0.26510992646217346, + "learning_rate": 5.465563140774648e-05, + "loss": 1.8051, + "step": 15842 + }, + { + "epoch": 4.862799263351749, + "grad_norm": 0.19175390899181366, + "learning_rate": 5.465068241907876e-05, + "loss": 1.6799, + "step": 15843 + }, + { + "epoch": 4.8631062001227745, + "grad_norm": 0.2588976323604584, + "learning_rate": 5.464573338445025e-05, + "loss": 1.7394, + "step": 15844 + }, + { + "epoch": 4.8634131368938, + "grad_norm": 0.28729483485221863, + 
"learning_rate": 5.464078430390991e-05, + "loss": 1.797, + "step": 15845 + }, + { + "epoch": 4.863720073664825, + "grad_norm": 0.21302445232868195, + "learning_rate": 5.463583517750661e-05, + "loss": 1.7303, + "step": 15846 + }, + { + "epoch": 4.8640270104358505, + "grad_norm": 0.2407636195421219, + "learning_rate": 5.463088600528926e-05, + "loss": 1.7175, + "step": 15847 + }, + { + "epoch": 4.864333947206875, + "grad_norm": 0.25653502345085144, + "learning_rate": 5.4625936787306784e-05, + "loss": 1.6996, + "step": 15848 + }, + { + "epoch": 4.8646408839779, + "grad_norm": 0.2100832760334015, + "learning_rate": 5.462098752360809e-05, + "loss": 1.7416, + "step": 15849 + }, + { + "epoch": 4.864947820748926, + "grad_norm": 0.2785186469554901, + "learning_rate": 5.461603821424208e-05, + "loss": 1.74, + "step": 15850 + }, + { + "epoch": 4.865254757519951, + "grad_norm": 0.2896614968776703, + "learning_rate": 5.4611088859257696e-05, + "loss": 1.7436, + "step": 15851 + }, + { + "epoch": 4.865561694290976, + "grad_norm": 0.18890418112277985, + "learning_rate": 5.460613945870382e-05, + "loss": 1.7093, + "step": 15852 + }, + { + "epoch": 4.865868631062002, + "grad_norm": 0.27681079506874084, + "learning_rate": 5.4601190012629364e-05, + "loss": 1.8772, + "step": 15853 + }, + { + "epoch": 4.866175567833026, + "grad_norm": 0.24658115208148956, + "learning_rate": 5.4596240521083265e-05, + "loss": 1.776, + "step": 15854 + }, + { + "epoch": 4.866482504604051, + "grad_norm": 0.21958144009113312, + "learning_rate": 5.459129098411441e-05, + "loss": 1.7503, + "step": 15855 + }, + { + "epoch": 4.866789441375077, + "grad_norm": 0.2778300642967224, + "learning_rate": 5.458634140177174e-05, + "loss": 1.8194, + "step": 15856 + }, + { + "epoch": 4.867096378146102, + "grad_norm": 0.28673580288887024, + "learning_rate": 5.458139177410414e-05, + "loss": 1.8033, + "step": 15857 + }, + { + "epoch": 4.867403314917127, + "grad_norm": 0.24472850561141968, + "learning_rate": 5.457644210116055e-05, + 
"loss": 1.7304, + "step": 15858 + }, + { + "epoch": 4.867710251688152, + "grad_norm": 0.24581189453601837, + "learning_rate": 5.4571492382989886e-05, + "loss": 1.7443, + "step": 15859 + }, + { + "epoch": 4.868017188459177, + "grad_norm": 0.22296221554279327, + "learning_rate": 5.4566542619641045e-05, + "loss": 1.7201, + "step": 15860 + }, + { + "epoch": 4.8683241252302025, + "grad_norm": 0.2378673404455185, + "learning_rate": 5.456159281116295e-05, + "loss": 1.7893, + "step": 15861 + }, + { + "epoch": 4.868631062001228, + "grad_norm": 0.3320823907852173, + "learning_rate": 5.4556642957604534e-05, + "loss": 1.7944, + "step": 15862 + }, + { + "epoch": 4.868937998772253, + "grad_norm": 0.3303453326225281, + "learning_rate": 5.45516930590147e-05, + "loss": 1.7267, + "step": 15863 + }, + { + "epoch": 4.8692449355432785, + "grad_norm": 0.223227858543396, + "learning_rate": 5.454674311544235e-05, + "loss": 1.7477, + "step": 15864 + }, + { + "epoch": 4.869551872314303, + "grad_norm": 0.3012549579143524, + "learning_rate": 5.454179312693643e-05, + "loss": 1.731, + "step": 15865 + }, + { + "epoch": 4.869858809085328, + "grad_norm": 0.3780311942100525, + "learning_rate": 5.453684309354585e-05, + "loss": 1.7296, + "step": 15866 + }, + { + "epoch": 4.870165745856354, + "grad_norm": 0.2753889262676239, + "learning_rate": 5.4531893015319526e-05, + "loss": 1.8024, + "step": 15867 + }, + { + "epoch": 4.870472682627379, + "grad_norm": 0.2270934134721756, + "learning_rate": 5.452694289230639e-05, + "loss": 1.7095, + "step": 15868 + }, + { + "epoch": 4.870779619398404, + "grad_norm": 0.2621576488018036, + "learning_rate": 5.452199272455534e-05, + "loss": 1.75, + "step": 15869 + }, + { + "epoch": 4.871086556169429, + "grad_norm": 0.22175776958465576, + "learning_rate": 5.45170425121153e-05, + "loss": 1.7658, + "step": 15870 + }, + { + "epoch": 4.871393492940454, + "grad_norm": 0.2038736790418625, + "learning_rate": 5.451209225503521e-05, + "loss": 1.6916, + "step": 15871 + }, + { + 
"epoch": 4.871700429711479, + "grad_norm": 0.2493467777967453, + "learning_rate": 5.450714195336397e-05, + "loss": 1.7408, + "step": 15872 + }, + { + "epoch": 4.872007366482505, + "grad_norm": 0.1966754049062729, + "learning_rate": 5.450219160715052e-05, + "loss": 1.7379, + "step": 15873 + }, + { + "epoch": 4.87231430325353, + "grad_norm": 0.23193517327308655, + "learning_rate": 5.4497241216443775e-05, + "loss": 1.7736, + "step": 15874 + }, + { + "epoch": 4.872621240024555, + "grad_norm": 0.2164391279220581, + "learning_rate": 5.4492290781292646e-05, + "loss": 1.7618, + "step": 15875 + }, + { + "epoch": 4.87292817679558, + "grad_norm": 0.286460816860199, + "learning_rate": 5.448734030174607e-05, + "loss": 1.7745, + "step": 15876 + }, + { + "epoch": 4.873235113566605, + "grad_norm": 0.3454538881778717, + "learning_rate": 5.448238977785298e-05, + "loss": 1.7605, + "step": 15877 + }, + { + "epoch": 4.8735420503376305, + "grad_norm": 0.26775062084198, + "learning_rate": 5.447743920966227e-05, + "loss": 1.7263, + "step": 15878 + }, + { + "epoch": 4.873848987108656, + "grad_norm": 0.2644907832145691, + "learning_rate": 5.447248859722289e-05, + "loss": 1.8489, + "step": 15879 + }, + { + "epoch": 4.87415592387968, + "grad_norm": 0.21646654605865479, + "learning_rate": 5.446753794058376e-05, + "loss": 1.7605, + "step": 15880 + }, + { + "epoch": 4.874462860650706, + "grad_norm": 0.23431318998336792, + "learning_rate": 5.446258723979381e-05, + "loss": 1.7209, + "step": 15881 + }, + { + "epoch": 4.874769797421731, + "grad_norm": 0.24665607511997223, + "learning_rate": 5.4457636494901934e-05, + "loss": 1.813, + "step": 15882 + }, + { + "epoch": 4.875076734192756, + "grad_norm": 0.26269975304603577, + "learning_rate": 5.445268570595708e-05, + "loss": 1.8255, + "step": 15883 + }, + { + "epoch": 4.875383670963782, + "grad_norm": 0.2722402811050415, + "learning_rate": 5.444773487300819e-05, + "loss": 1.7795, + "step": 15884 + }, + { + "epoch": 4.875690607734807, + "grad_norm": 
0.3235624134540558, + "learning_rate": 5.444278399610417e-05, + "loss": 1.7804, + "step": 15885 + }, + { + "epoch": 4.8759975445058314, + "grad_norm": 0.2647583782672882, + "learning_rate": 5.4437833075293964e-05, + "loss": 1.7359, + "step": 15886 + }, + { + "epoch": 4.876304481276857, + "grad_norm": 0.272370845079422, + "learning_rate": 5.443288211062649e-05, + "loss": 1.7605, + "step": 15887 + }, + { + "epoch": 4.876611418047882, + "grad_norm": 0.3147594630718231, + "learning_rate": 5.4427931102150675e-05, + "loss": 1.7118, + "step": 15888 + }, + { + "epoch": 4.8769183548189075, + "grad_norm": 0.22751441597938538, + "learning_rate": 5.442298004991544e-05, + "loss": 1.723, + "step": 15889 + }, + { + "epoch": 4.877225291589933, + "grad_norm": 0.2121521681547165, + "learning_rate": 5.441802895396972e-05, + "loss": 1.7485, + "step": 15890 + }, + { + "epoch": 4.877532228360957, + "grad_norm": 0.25370222330093384, + "learning_rate": 5.4413077814362466e-05, + "loss": 1.8064, + "step": 15891 + }, + { + "epoch": 4.877839165131983, + "grad_norm": 0.19492633640766144, + "learning_rate": 5.440812663114259e-05, + "loss": 1.6773, + "step": 15892 + }, + { + "epoch": 4.878146101903008, + "grad_norm": 0.2101750522851944, + "learning_rate": 5.440317540435901e-05, + "loss": 1.7215, + "step": 15893 + }, + { + "epoch": 4.878453038674033, + "grad_norm": 0.21150651574134827, + "learning_rate": 5.439822413406068e-05, + "loss": 1.7875, + "step": 15894 + }, + { + "epoch": 4.878759975445059, + "grad_norm": 0.21008379757404327, + "learning_rate": 5.439327282029651e-05, + "loss": 1.7108, + "step": 15895 + }, + { + "epoch": 4.879066912216084, + "grad_norm": 0.22885502874851227, + "learning_rate": 5.4388321463115453e-05, + "loss": 1.7899, + "step": 15896 + }, + { + "epoch": 4.879373848987108, + "grad_norm": 0.24868059158325195, + "learning_rate": 5.4383370062566444e-05, + "loss": 1.7368, + "step": 15897 + }, + { + "epoch": 4.879680785758134, + "grad_norm": 0.27225378155708313, + 
"learning_rate": 5.437841861869838e-05, + "loss": 1.7623, + "step": 15898 + }, + { + "epoch": 4.879987722529159, + "grad_norm": 0.23353120684623718, + "learning_rate": 5.437346713156023e-05, + "loss": 1.7908, + "step": 15899 + }, + { + "epoch": 4.880294659300184, + "grad_norm": 0.19032470881938934, + "learning_rate": 5.436851560120091e-05, + "loss": 1.7511, + "step": 15900 + }, + { + "epoch": 4.88060159607121, + "grad_norm": 0.23714862763881683, + "learning_rate": 5.4363564027669345e-05, + "loss": 1.7197, + "step": 15901 + }, + { + "epoch": 4.880908532842234, + "grad_norm": 0.24897022545337677, + "learning_rate": 5.4358612411014495e-05, + "loss": 1.7822, + "step": 15902 + }, + { + "epoch": 4.8812154696132595, + "grad_norm": 0.21433588862419128, + "learning_rate": 5.435366075128528e-05, + "loss": 1.7928, + "step": 15903 + }, + { + "epoch": 4.881522406384285, + "grad_norm": 0.30019649863243103, + "learning_rate": 5.4348709048530646e-05, + "loss": 1.8067, + "step": 15904 + }, + { + "epoch": 4.88182934315531, + "grad_norm": 0.20227669179439545, + "learning_rate": 5.4343757302799515e-05, + "loss": 1.7254, + "step": 15905 + }, + { + "epoch": 4.8821362799263355, + "grad_norm": 0.23447728157043457, + "learning_rate": 5.4338805514140836e-05, + "loss": 1.7314, + "step": 15906 + }, + { + "epoch": 4.882443216697361, + "grad_norm": 0.29545050859451294, + "learning_rate": 5.4333853682603506e-05, + "loss": 1.7659, + "step": 15907 + }, + { + "epoch": 4.882750153468385, + "grad_norm": 0.245390385389328, + "learning_rate": 5.432890180823652e-05, + "loss": 1.7264, + "step": 15908 + }, + { + "epoch": 4.883057090239411, + "grad_norm": 0.209987074136734, + "learning_rate": 5.432394989108879e-05, + "loss": 1.7174, + "step": 15909 + }, + { + "epoch": 4.883364027010436, + "grad_norm": 0.2402341365814209, + "learning_rate": 5.431899793120925e-05, + "loss": 1.7512, + "step": 15910 + }, + { + "epoch": 4.883670963781461, + "grad_norm": 0.26227688789367676, + "learning_rate": 
5.431404592864684e-05, + "loss": 1.7697, + "step": 15911 + }, + { + "epoch": 4.883977900552486, + "grad_norm": 0.2556503117084503, + "learning_rate": 5.4309093883450504e-05, + "loss": 1.8191, + "step": 15912 + }, + { + "epoch": 4.884284837323511, + "grad_norm": 0.24766884744167328, + "learning_rate": 5.4304141795669174e-05, + "loss": 1.7574, + "step": 15913 + }, + { + "epoch": 4.884591774094536, + "grad_norm": 0.19925951957702637, + "learning_rate": 5.429918966535179e-05, + "loss": 1.7249, + "step": 15914 + }, + { + "epoch": 4.884898710865562, + "grad_norm": 0.1899442970752716, + "learning_rate": 5.4294237492547294e-05, + "loss": 1.7446, + "step": 15915 + }, + { + "epoch": 4.885205647636587, + "grad_norm": 0.25900956988334656, + "learning_rate": 5.4289285277304636e-05, + "loss": 1.725, + "step": 15916 + }, + { + "epoch": 4.885512584407612, + "grad_norm": 0.2537781000137329, + "learning_rate": 5.428433301967274e-05, + "loss": 1.7861, + "step": 15917 + }, + { + "epoch": 4.885819521178637, + "grad_norm": 0.26432034373283386, + "learning_rate": 5.427938071970054e-05, + "loss": 1.7538, + "step": 15918 + }, + { + "epoch": 4.886126457949662, + "grad_norm": 0.22722363471984863, + "learning_rate": 5.4274428377437e-05, + "loss": 1.7631, + "step": 15919 + }, + { + "epoch": 4.8864333947206875, + "grad_norm": 0.24846172332763672, + "learning_rate": 5.426947599293106e-05, + "loss": 1.7833, + "step": 15920 + }, + { + "epoch": 4.886740331491713, + "grad_norm": 0.24821995198726654, + "learning_rate": 5.426452356623165e-05, + "loss": 1.7638, + "step": 15921 + }, + { + "epoch": 4.887047268262738, + "grad_norm": 0.2796781063079834, + "learning_rate": 5.425957109738773e-05, + "loss": 1.6982, + "step": 15922 + }, + { + "epoch": 4.887354205033763, + "grad_norm": 0.2875385284423828, + "learning_rate": 5.425461858644821e-05, + "loss": 1.7172, + "step": 15923 + }, + { + "epoch": 4.887661141804788, + "grad_norm": 0.21614491939544678, + "learning_rate": 5.424966603346207e-05, + "loss": 
1.7521, + "step": 15924 + }, + { + "epoch": 4.887968078575813, + "grad_norm": 0.22944390773773193, + "learning_rate": 5.4244713438478235e-05, + "loss": 1.772, + "step": 15925 + }, + { + "epoch": 4.888275015346839, + "grad_norm": 0.21566039323806763, + "learning_rate": 5.423976080154566e-05, + "loss": 1.734, + "step": 15926 + }, + { + "epoch": 4.888581952117864, + "grad_norm": 0.4253925383090973, + "learning_rate": 5.4234808122713275e-05, + "loss": 1.8017, + "step": 15927 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.239146426320076, + "learning_rate": 5.422985540203004e-05, + "loss": 1.7229, + "step": 15928 + }, + { + "epoch": 4.889195825659914, + "grad_norm": 0.2344054877758026, + "learning_rate": 5.42249026395449e-05, + "loss": 1.7111, + "step": 15929 + }, + { + "epoch": 4.889502762430939, + "grad_norm": 0.21717922389507294, + "learning_rate": 5.421994983530679e-05, + "loss": 1.7427, + "step": 15930 + }, + { + "epoch": 4.889809699201964, + "grad_norm": 0.26895472407341003, + "learning_rate": 5.421499698936466e-05, + "loss": 1.8402, + "step": 15931 + }, + { + "epoch": 4.89011663597299, + "grad_norm": 0.25761866569519043, + "learning_rate": 5.421004410176746e-05, + "loss": 1.7822, + "step": 15932 + }, + { + "epoch": 4.890423572744015, + "grad_norm": 0.24465128779411316, + "learning_rate": 5.420509117256415e-05, + "loss": 1.8074, + "step": 15933 + }, + { + "epoch": 4.8907305095150395, + "grad_norm": 0.2527398467063904, + "learning_rate": 5.4200138201803655e-05, + "loss": 1.7522, + "step": 15934 + }, + { + "epoch": 4.891037446286065, + "grad_norm": 0.23118112981319427, + "learning_rate": 5.4195185189534916e-05, + "loss": 1.7394, + "step": 15935 + }, + { + "epoch": 4.89134438305709, + "grad_norm": 0.2054537534713745, + "learning_rate": 5.419023213580691e-05, + "loss": 1.7096, + "step": 15936 + }, + { + "epoch": 4.8916513198281155, + "grad_norm": 0.2929638922214508, + "learning_rate": 5.418527904066858e-05, + "loss": 1.8733, + "step": 15937 + }, + { + 
"epoch": 4.891958256599141, + "grad_norm": 0.2957170009613037, + "learning_rate": 5.418032590416886e-05, + "loss": 1.7201, + "step": 15938 + }, + { + "epoch": 4.892265193370166, + "grad_norm": 0.2520081698894501, + "learning_rate": 5.417537272635672e-05, + "loss": 1.7034, + "step": 15939 + }, + { + "epoch": 4.892572130141191, + "grad_norm": 0.25217053294181824, + "learning_rate": 5.41704195072811e-05, + "loss": 1.8538, + "step": 15940 + }, + { + "epoch": 4.892879066912216, + "grad_norm": 0.23605379462242126, + "learning_rate": 5.416546624699093e-05, + "loss": 1.724, + "step": 15941 + }, + { + "epoch": 4.893186003683241, + "grad_norm": 0.321750283241272, + "learning_rate": 5.416051294553519e-05, + "loss": 1.806, + "step": 15942 + }, + { + "epoch": 4.893492940454267, + "grad_norm": 0.23800241947174072, + "learning_rate": 5.415555960296284e-05, + "loss": 1.7578, + "step": 15943 + }, + { + "epoch": 4.893799877225292, + "grad_norm": 0.3423094153404236, + "learning_rate": 5.4150606219322796e-05, + "loss": 1.7324, + "step": 15944 + }, + { + "epoch": 4.894106813996316, + "grad_norm": 0.453074187040329, + "learning_rate": 5.414565279466404e-05, + "loss": 1.7268, + "step": 15945 + }, + { + "epoch": 4.894413750767342, + "grad_norm": 0.21972697973251343, + "learning_rate": 5.4140699329035504e-05, + "loss": 1.6547, + "step": 15946 + }, + { + "epoch": 4.894720687538367, + "grad_norm": 0.32876282930374146, + "learning_rate": 5.413574582248616e-05, + "loss": 1.7527, + "step": 15947 + }, + { + "epoch": 4.895027624309392, + "grad_norm": 0.34035229682922363, + "learning_rate": 5.413079227506494e-05, + "loss": 1.7636, + "step": 15948 + }, + { + "epoch": 4.895334561080418, + "grad_norm": 0.2410411536693573, + "learning_rate": 5.412583868682082e-05, + "loss": 1.8114, + "step": 15949 + }, + { + "epoch": 4.895641497851443, + "grad_norm": 0.2787366211414337, + "learning_rate": 5.412088505780274e-05, + "loss": 1.7393, + "step": 15950 + }, + { + "epoch": 4.8959484346224675, + "grad_norm": 
0.23288428783416748, + "learning_rate": 5.411593138805966e-05, + "loss": 1.7413, + "step": 15951 + }, + { + "epoch": 4.896255371393493, + "grad_norm": 0.26302778720855713, + "learning_rate": 5.411097767764053e-05, + "loss": 1.7372, + "step": 15952 + }, + { + "epoch": 4.896562308164518, + "grad_norm": 0.31638020277023315, + "learning_rate": 5.410602392659431e-05, + "loss": 1.8114, + "step": 15953 + }, + { + "epoch": 4.8968692449355435, + "grad_norm": 0.23361825942993164, + "learning_rate": 5.410107013496996e-05, + "loss": 1.7592, + "step": 15954 + }, + { + "epoch": 4.897176181706568, + "grad_norm": 0.19887785613536835, + "learning_rate": 5.409611630281642e-05, + "loss": 1.7509, + "step": 15955 + }, + { + "epoch": 4.897483118477593, + "grad_norm": 0.22396783530712128, + "learning_rate": 5.409116243018266e-05, + "loss": 1.6841, + "step": 15956 + }, + { + "epoch": 4.897790055248619, + "grad_norm": 0.20397686958312988, + "learning_rate": 5.4086208517117645e-05, + "loss": 1.7427, + "step": 15957 + }, + { + "epoch": 4.898096992019644, + "grad_norm": 0.20848311483860016, + "learning_rate": 5.4081254563670314e-05, + "loss": 1.713, + "step": 15958 + }, + { + "epoch": 4.898403928790669, + "grad_norm": 0.2739275395870209, + "learning_rate": 5.407630056988964e-05, + "loss": 1.7673, + "step": 15959 + }, + { + "epoch": 4.898710865561695, + "grad_norm": 0.21485929191112518, + "learning_rate": 5.407134653582456e-05, + "loss": 1.7347, + "step": 15960 + }, + { + "epoch": 4.899017802332719, + "grad_norm": 0.26980286836624146, + "learning_rate": 5.406639246152406e-05, + "loss": 1.7158, + "step": 15961 + }, + { + "epoch": 4.899324739103744, + "grad_norm": 0.22327515482902527, + "learning_rate": 5.4061438347037084e-05, + "loss": 1.7387, + "step": 15962 + }, + { + "epoch": 4.89963167587477, + "grad_norm": 0.2542823553085327, + "learning_rate": 5.4056484192412603e-05, + "loss": 1.7826, + "step": 15963 + }, + { + "epoch": 4.899938612645795, + "grad_norm": 0.3248840868473053, + 
"learning_rate": 5.405152999769956e-05, + "loss": 1.7878, + "step": 15964 + }, + { + "epoch": 4.9002455494168204, + "grad_norm": 0.21210803091526031, + "learning_rate": 5.404657576294691e-05, + "loss": 1.7378, + "step": 15965 + }, + { + "epoch": 4.900552486187845, + "grad_norm": 0.25679782032966614, + "learning_rate": 5.404162148820365e-05, + "loss": 1.7493, + "step": 15966 + }, + { + "epoch": 4.90085942295887, + "grad_norm": 0.36698678135871887, + "learning_rate": 5.4036667173518704e-05, + "loss": 1.7662, + "step": 15967 + }, + { + "epoch": 4.901166359729896, + "grad_norm": 0.3396874964237213, + "learning_rate": 5.403171281894105e-05, + "loss": 1.7618, + "step": 15968 + }, + { + "epoch": 4.901473296500921, + "grad_norm": 0.2792030870914459, + "learning_rate": 5.402675842451964e-05, + "loss": 1.7858, + "step": 15969 + }, + { + "epoch": 4.901780233271946, + "grad_norm": 0.24499626457691193, + "learning_rate": 5.4021803990303454e-05, + "loss": 1.7503, + "step": 15970 + }, + { + "epoch": 4.902087170042972, + "grad_norm": 0.29185110330581665, + "learning_rate": 5.401684951634144e-05, + "loss": 1.7536, + "step": 15971 + }, + { + "epoch": 4.902394106813996, + "grad_norm": 0.2480020374059677, + "learning_rate": 5.401189500268256e-05, + "loss": 1.7877, + "step": 15972 + }, + { + "epoch": 4.902701043585021, + "grad_norm": 0.3302663564682007, + "learning_rate": 5.400694044937579e-05, + "loss": 1.8693, + "step": 15973 + }, + { + "epoch": 4.903007980356047, + "grad_norm": 0.2500915825366974, + "learning_rate": 5.400198585647008e-05, + "loss": 1.7489, + "step": 15974 + }, + { + "epoch": 4.903314917127072, + "grad_norm": 0.25079864263534546, + "learning_rate": 5.399703122401441e-05, + "loss": 1.7965, + "step": 15975 + }, + { + "epoch": 4.903621853898097, + "grad_norm": 0.2643207907676697, + "learning_rate": 5.399207655205771e-05, + "loss": 1.7696, + "step": 15976 + }, + { + "epoch": 4.903928790669122, + "grad_norm": 0.23719522356987, + "learning_rate": 5.398712184064899e-05, + 
"loss": 1.7608, + "step": 15977 + }, + { + "epoch": 4.904235727440147, + "grad_norm": 0.25226888060569763, + "learning_rate": 5.3982167089837184e-05, + "loss": 1.8055, + "step": 15978 + }, + { + "epoch": 4.9045426642111725, + "grad_norm": 0.21601852774620056, + "learning_rate": 5.39772122996713e-05, + "loss": 1.7553, + "step": 15979 + }, + { + "epoch": 4.904849600982198, + "grad_norm": 0.20275430381298065, + "learning_rate": 5.397225747020023e-05, + "loss": 1.7221, + "step": 15980 + }, + { + "epoch": 4.905156537753223, + "grad_norm": 0.24815937876701355, + "learning_rate": 5.3967302601473e-05, + "loss": 1.8098, + "step": 15981 + }, + { + "epoch": 4.9054634745242485, + "grad_norm": 0.2193612903356552, + "learning_rate": 5.3962347693538575e-05, + "loss": 1.7116, + "step": 15982 + }, + { + "epoch": 4.905770411295273, + "grad_norm": 0.21409118175506592, + "learning_rate": 5.395739274644589e-05, + "loss": 1.7503, + "step": 15983 + }, + { + "epoch": 4.906077348066298, + "grad_norm": 0.20907564461231232, + "learning_rate": 5.3952437760243935e-05, + "loss": 1.7518, + "step": 15984 + }, + { + "epoch": 4.906384284837324, + "grad_norm": 0.21193571388721466, + "learning_rate": 5.394748273498168e-05, + "loss": 1.6905, + "step": 15985 + }, + { + "epoch": 4.906691221608349, + "grad_norm": 0.19729891419410706, + "learning_rate": 5.394252767070808e-05, + "loss": 1.7398, + "step": 15986 + }, + { + "epoch": 4.906998158379373, + "grad_norm": 0.2654789686203003, + "learning_rate": 5.393757256747211e-05, + "loss": 1.7931, + "step": 15987 + }, + { + "epoch": 4.907305095150399, + "grad_norm": 0.2627345025539398, + "learning_rate": 5.3932617425322726e-05, + "loss": 1.8174, + "step": 15988 + }, + { + "epoch": 4.907612031921424, + "grad_norm": 0.27162298560142517, + "learning_rate": 5.392766224430894e-05, + "loss": 1.8015, + "step": 15989 + }, + { + "epoch": 4.907918968692449, + "grad_norm": 0.24248667061328888, + "learning_rate": 5.3922707024479676e-05, + "loss": 1.7457, + "step": 15990 + 
}, + { + "epoch": 4.908225905463475, + "grad_norm": 0.24715331196784973, + "learning_rate": 5.391775176588393e-05, + "loss": 1.7724, + "step": 15991 + }, + { + "epoch": 4.9085328422345, + "grad_norm": 0.26335644721984863, + "learning_rate": 5.3912796468570656e-05, + "loss": 1.7183, + "step": 15992 + }, + { + "epoch": 4.9088397790055245, + "grad_norm": 0.23459944128990173, + "learning_rate": 5.3907841132588843e-05, + "loss": 1.7245, + "step": 15993 + }, + { + "epoch": 4.90914671577655, + "grad_norm": 0.21779637038707733, + "learning_rate": 5.3902885757987444e-05, + "loss": 1.7485, + "step": 15994 + }, + { + "epoch": 4.909453652547575, + "grad_norm": 0.227664977312088, + "learning_rate": 5.389793034481545e-05, + "loss": 1.7418, + "step": 15995 + }, + { + "epoch": 4.9097605893186005, + "grad_norm": 0.26230278611183167, + "learning_rate": 5.389297489312183e-05, + "loss": 1.7619, + "step": 15996 + }, + { + "epoch": 4.910067526089626, + "grad_norm": 0.22563579678535461, + "learning_rate": 5.388801940295555e-05, + "loss": 1.7168, + "step": 15997 + }, + { + "epoch": 4.91037446286065, + "grad_norm": 0.24829435348510742, + "learning_rate": 5.388306387436556e-05, + "loss": 1.7422, + "step": 15998 + }, + { + "epoch": 4.910681399631676, + "grad_norm": 0.24395976960659027, + "learning_rate": 5.387810830740088e-05, + "loss": 1.7783, + "step": 15999 + }, + { + "epoch": 4.910988336402701, + "grad_norm": 0.2189297378063202, + "learning_rate": 5.387315270211044e-05, + "loss": 1.7885, + "step": 16000 + }, + { + "epoch": 4.911295273173726, + "grad_norm": 0.21750971674919128, + "learning_rate": 5.386819705854324e-05, + "loss": 1.7659, + "step": 16001 + }, + { + "epoch": 4.911602209944752, + "grad_norm": 0.21907657384872437, + "learning_rate": 5.386324137674826e-05, + "loss": 1.789, + "step": 16002 + }, + { + "epoch": 4.911909146715777, + "grad_norm": 0.18778781592845917, + "learning_rate": 5.3858285656774465e-05, + "loss": 1.7151, + "step": 16003 + }, + { + "epoch": 4.912216083486801, + 
"grad_norm": 0.24217712879180908, + "learning_rate": 5.385332989867082e-05, + "loss": 1.8108, + "step": 16004 + }, + { + "epoch": 4.912523020257827, + "grad_norm": 0.27637016773223877, + "learning_rate": 5.384837410248632e-05, + "loss": 1.8368, + "step": 16005 + }, + { + "epoch": 4.912829957028852, + "grad_norm": 0.22366084158420563, + "learning_rate": 5.3843418268269926e-05, + "loss": 1.7351, + "step": 16006 + }, + { + "epoch": 4.913136893799877, + "grad_norm": 0.2742357552051544, + "learning_rate": 5.383846239607062e-05, + "loss": 1.7599, + "step": 16007 + }, + { + "epoch": 4.913443830570903, + "grad_norm": 0.2288598269224167, + "learning_rate": 5.383350648593738e-05, + "loss": 1.7056, + "step": 16008 + }, + { + "epoch": 4.913750767341927, + "grad_norm": 0.23319020867347717, + "learning_rate": 5.382855053791919e-05, + "loss": 1.7356, + "step": 16009 + }, + { + "epoch": 4.9140577041129525, + "grad_norm": 0.2232198268175125, + "learning_rate": 5.382359455206499e-05, + "loss": 1.7375, + "step": 16010 + }, + { + "epoch": 4.914364640883978, + "grad_norm": 0.24420048296451569, + "learning_rate": 5.381863852842381e-05, + "loss": 1.8287, + "step": 16011 + }, + { + "epoch": 4.914671577655003, + "grad_norm": 0.22653080523014069, + "learning_rate": 5.381368246704461e-05, + "loss": 1.7137, + "step": 16012 + }, + { + "epoch": 4.9149785144260285, + "grad_norm": 0.20439405739307404, + "learning_rate": 5.380872636797637e-05, + "loss": 1.7688, + "step": 16013 + }, + { + "epoch": 4.915285451197054, + "grad_norm": 0.2602155804634094, + "learning_rate": 5.380377023126806e-05, + "loss": 1.7875, + "step": 16014 + }, + { + "epoch": 4.915592387968078, + "grad_norm": 0.2757892608642578, + "learning_rate": 5.3798814056968647e-05, + "loss": 1.7446, + "step": 16015 + }, + { + "epoch": 4.915899324739104, + "grad_norm": 0.25938209891319275, + "learning_rate": 5.379385784512714e-05, + "loss": 1.6997, + "step": 16016 + }, + { + "epoch": 4.916206261510129, + "grad_norm": 0.2056962549686432, + 
"learning_rate": 5.37889015957925e-05, + "loss": 1.6961, + "step": 16017 + }, + { + "epoch": 4.916513198281154, + "grad_norm": 0.24388402700424194, + "learning_rate": 5.3783945309013714e-05, + "loss": 1.712, + "step": 16018 + }, + { + "epoch": 4.91682013505218, + "grad_norm": 0.2381993532180786, + "learning_rate": 5.3778988984839775e-05, + "loss": 1.7444, + "step": 16019 + }, + { + "epoch": 4.917127071823204, + "grad_norm": 0.20201562345027924, + "learning_rate": 5.377403262331964e-05, + "loss": 1.7254, + "step": 16020 + }, + { + "epoch": 4.917434008594229, + "grad_norm": 0.24019409716129303, + "learning_rate": 5.376907622450229e-05, + "loss": 1.684, + "step": 16021 + }, + { + "epoch": 4.917740945365255, + "grad_norm": 0.2441694289445877, + "learning_rate": 5.376411978843674e-05, + "loss": 1.7334, + "step": 16022 + }, + { + "epoch": 4.91804788213628, + "grad_norm": 0.23866300284862518, + "learning_rate": 5.3759163315171945e-05, + "loss": 1.7258, + "step": 16023 + }, + { + "epoch": 4.918354818907305, + "grad_norm": 0.28068670630455017, + "learning_rate": 5.375420680475689e-05, + "loss": 1.8049, + "step": 16024 + }, + { + "epoch": 4.918661755678331, + "grad_norm": 0.2956274151802063, + "learning_rate": 5.3749250257240566e-05, + "loss": 1.8544, + "step": 16025 + }, + { + "epoch": 4.918968692449355, + "grad_norm": 0.1971627175807953, + "learning_rate": 5.374429367267196e-05, + "loss": 1.7314, + "step": 16026 + }, + { + "epoch": 4.9192756292203805, + "grad_norm": 0.28565749526023865, + "learning_rate": 5.373933705110004e-05, + "loss": 1.7587, + "step": 16027 + }, + { + "epoch": 4.919582565991406, + "grad_norm": 0.3087369501590729, + "learning_rate": 5.37343803925738e-05, + "loss": 1.7708, + "step": 16028 + }, + { + "epoch": 4.919889502762431, + "grad_norm": 0.22460010647773743, + "learning_rate": 5.372942369714223e-05, + "loss": 1.7401, + "step": 16029 + }, + { + "epoch": 4.920196439533456, + "grad_norm": 0.29492735862731934, + "learning_rate": 5.3724466964854326e-05, + 
"loss": 1.7033, + "step": 16030 + }, + { + "epoch": 4.920503376304481, + "grad_norm": 0.24452674388885498, + "learning_rate": 5.371951019575904e-05, + "loss": 1.7688, + "step": 16031 + }, + { + "epoch": 4.920810313075506, + "grad_norm": 0.24686957895755768, + "learning_rate": 5.3714553389905366e-05, + "loss": 1.7463, + "step": 16032 + }, + { + "epoch": 4.921117249846532, + "grad_norm": 0.23661597073078156, + "learning_rate": 5.37095965473423e-05, + "loss": 1.7256, + "step": 16033 + }, + { + "epoch": 4.921424186617557, + "grad_norm": 0.22861288487911224, + "learning_rate": 5.370463966811884e-05, + "loss": 1.7722, + "step": 16034 + }, + { + "epoch": 4.921731123388582, + "grad_norm": 0.2453136146068573, + "learning_rate": 5.3699682752283944e-05, + "loss": 1.7343, + "step": 16035 + }, + { + "epoch": 4.922038060159607, + "grad_norm": 0.25267064571380615, + "learning_rate": 5.369472579988663e-05, + "loss": 1.7817, + "step": 16036 + }, + { + "epoch": 4.922344996930632, + "grad_norm": 0.25301575660705566, + "learning_rate": 5.368976881097586e-05, + "loss": 1.8146, + "step": 16037 + }, + { + "epoch": 4.922651933701657, + "grad_norm": 0.23579831421375275, + "learning_rate": 5.368481178560062e-05, + "loss": 1.8089, + "step": 16038 + }, + { + "epoch": 4.922958870472683, + "grad_norm": 0.2181949019432068, + "learning_rate": 5.367985472380993e-05, + "loss": 1.7689, + "step": 16039 + }, + { + "epoch": 4.923265807243708, + "grad_norm": 0.24622827768325806, + "learning_rate": 5.367489762565276e-05, + "loss": 1.791, + "step": 16040 + }, + { + "epoch": 4.9235727440147325, + "grad_norm": 0.2545134723186493, + "learning_rate": 5.3669940491178084e-05, + "loss": 1.738, + "step": 16041 + }, + { + "epoch": 4.923879680785758, + "grad_norm": 0.258139431476593, + "learning_rate": 5.366498332043491e-05, + "loss": 1.8303, + "step": 16042 + }, + { + "epoch": 4.924186617556783, + "grad_norm": 0.23804105818271637, + "learning_rate": 5.366002611347223e-05, + "loss": 1.751, + "step": 16043 + }, + { 
+ "epoch": 4.9244935543278086, + "grad_norm": 0.2354477345943451, + "learning_rate": 5.365506887033901e-05, + "loss": 1.7911, + "step": 16044 + }, + { + "epoch": 4.924800491098834, + "grad_norm": 0.22212550044059753, + "learning_rate": 5.3650111591084276e-05, + "loss": 1.7439, + "step": 16045 + }, + { + "epoch": 4.925107427869859, + "grad_norm": 0.23621168732643127, + "learning_rate": 5.3645154275756984e-05, + "loss": 1.7339, + "step": 16046 + }, + { + "epoch": 4.925414364640884, + "grad_norm": 0.2163209468126297, + "learning_rate": 5.364019692440616e-05, + "loss": 1.7247, + "step": 16047 + }, + { + "epoch": 4.925721301411909, + "grad_norm": 0.21352291107177734, + "learning_rate": 5.3635239537080774e-05, + "loss": 1.7431, + "step": 16048 + }, + { + "epoch": 4.926028238182934, + "grad_norm": 0.3170754909515381, + "learning_rate": 5.36302821138298e-05, + "loss": 1.8075, + "step": 16049 + }, + { + "epoch": 4.92633517495396, + "grad_norm": 0.27073633670806885, + "learning_rate": 5.362532465470226e-05, + "loss": 1.7209, + "step": 16050 + }, + { + "epoch": 4.926642111724985, + "grad_norm": 0.2677803039550781, + "learning_rate": 5.362036715974714e-05, + "loss": 1.7454, + "step": 16051 + }, + { + "epoch": 4.9269490484960095, + "grad_norm": 0.3555704355239868, + "learning_rate": 5.3615409629013436e-05, + "loss": 1.7737, + "step": 16052 + }, + { + "epoch": 4.927255985267035, + "grad_norm": 0.2819947302341461, + "learning_rate": 5.3610452062550124e-05, + "loss": 1.7588, + "step": 16053 + }, + { + "epoch": 4.92756292203806, + "grad_norm": 0.26638996601104736, + "learning_rate": 5.360549446040621e-05, + "loss": 1.8078, + "step": 16054 + }, + { + "epoch": 4.9278698588090855, + "grad_norm": 0.37828773260116577, + "learning_rate": 5.360053682263069e-05, + "loss": 1.7527, + "step": 16055 + }, + { + "epoch": 4.928176795580111, + "grad_norm": 0.35836395621299744, + "learning_rate": 5.359557914927254e-05, + "loss": 1.7199, + "step": 16056 + }, + { + "epoch": 4.928483732351136, + 
"grad_norm": 0.2720802128314972, + "learning_rate": 5.359062144038078e-05, + "loss": 1.7598, + "step": 16057 + }, + { + "epoch": 4.928790669122161, + "grad_norm": 0.36662939190864563, + "learning_rate": 5.358566369600441e-05, + "loss": 1.7199, + "step": 16058 + }, + { + "epoch": 4.929097605893186, + "grad_norm": 0.42243221402168274, + "learning_rate": 5.3580705916192395e-05, + "loss": 1.7584, + "step": 16059 + }, + { + "epoch": 4.929404542664211, + "grad_norm": 0.21667765080928802, + "learning_rate": 5.357574810099375e-05, + "loss": 1.7608, + "step": 16060 + }, + { + "epoch": 4.929711479435237, + "grad_norm": 0.48101645708084106, + "learning_rate": 5.3570790250457456e-05, + "loss": 1.8157, + "step": 16061 + }, + { + "epoch": 4.930018416206261, + "grad_norm": 0.5289245843887329, + "learning_rate": 5.356583236463253e-05, + "loss": 1.7173, + "step": 16062 + }, + { + "epoch": 4.930325352977286, + "grad_norm": 0.21454930305480957, + "learning_rate": 5.356087444356795e-05, + "loss": 1.7399, + "step": 16063 + }, + { + "epoch": 4.930632289748312, + "grad_norm": 0.5648324489593506, + "learning_rate": 5.355591648731274e-05, + "loss": 1.7814, + "step": 16064 + }, + { + "epoch": 4.930939226519337, + "grad_norm": 0.5669483542442322, + "learning_rate": 5.355095849591587e-05, + "loss": 1.7769, + "step": 16065 + }, + { + "epoch": 4.931246163290362, + "grad_norm": 0.33108505606651306, + "learning_rate": 5.354600046942635e-05, + "loss": 1.7704, + "step": 16066 + }, + { + "epoch": 4.931553100061388, + "grad_norm": 0.31149306893348694, + "learning_rate": 5.3541042407893164e-05, + "loss": 1.7631, + "step": 16067 + }, + { + "epoch": 4.931860036832412, + "grad_norm": 0.30377596616744995, + "learning_rate": 5.353608431136532e-05, + "loss": 1.7888, + "step": 16068 + }, + { + "epoch": 4.9321669736034375, + "grad_norm": 0.25041452050209045, + "learning_rate": 5.3531126179891825e-05, + "loss": 1.7507, + "step": 16069 + }, + { + "epoch": 4.932473910374463, + "grad_norm": 0.33900725841522217, + 
"learning_rate": 5.352616801352167e-05, + "loss": 1.7365, + "step": 16070 + }, + { + "epoch": 4.932780847145488, + "grad_norm": 0.23939846456050873, + "learning_rate": 5.352120981230386e-05, + "loss": 1.7934, + "step": 16071 + }, + { + "epoch": 4.9330877839165135, + "grad_norm": 0.2419881969690323, + "learning_rate": 5.351625157628739e-05, + "loss": 1.7555, + "step": 16072 + }, + { + "epoch": 4.933394720687538, + "grad_norm": 0.3517596423625946, + "learning_rate": 5.351129330552125e-05, + "loss": 1.7102, + "step": 16073 + }, + { + "epoch": 4.933701657458563, + "grad_norm": 0.2660250663757324, + "learning_rate": 5.350633500005446e-05, + "loss": 1.7692, + "step": 16074 + }, + { + "epoch": 4.934008594229589, + "grad_norm": 0.20726454257965088, + "learning_rate": 5.350137665993601e-05, + "loss": 1.718, + "step": 16075 + }, + { + "epoch": 4.934315531000614, + "grad_norm": 0.28218522667884827, + "learning_rate": 5.3496418285214914e-05, + "loss": 1.8402, + "step": 16076 + }, + { + "epoch": 4.934622467771639, + "grad_norm": 0.2142515480518341, + "learning_rate": 5.349145987594015e-05, + "loss": 1.7571, + "step": 16077 + }, + { + "epoch": 4.934929404542665, + "grad_norm": 0.2777026891708374, + "learning_rate": 5.348650143216074e-05, + "loss": 1.7617, + "step": 16078 + }, + { + "epoch": 4.935236341313689, + "grad_norm": 0.24057620763778687, + "learning_rate": 5.348154295392567e-05, + "loss": 1.7149, + "step": 16079 + }, + { + "epoch": 4.935543278084714, + "grad_norm": 0.22220350801944733, + "learning_rate": 5.3476584441283964e-05, + "loss": 1.7402, + "step": 16080 + }, + { + "epoch": 4.93585021485574, + "grad_norm": 0.2451290488243103, + "learning_rate": 5.347162589428462e-05, + "loss": 1.7004, + "step": 16081 + }, + { + "epoch": 4.936157151626765, + "grad_norm": 0.25621771812438965, + "learning_rate": 5.3466667312976625e-05, + "loss": 1.7765, + "step": 16082 + }, + { + "epoch": 4.93646408839779, + "grad_norm": 0.217393159866333, + "learning_rate": 5.346170869740899e-05, + 
"loss": 1.7695, + "step": 16083 + }, + { + "epoch": 4.936771025168815, + "grad_norm": 0.21248537302017212, + "learning_rate": 5.345675004763071e-05, + "loss": 1.7277, + "step": 16084 + }, + { + "epoch": 4.93707796193984, + "grad_norm": 0.19431474804878235, + "learning_rate": 5.3451791363690805e-05, + "loss": 1.7352, + "step": 16085 + }, + { + "epoch": 4.9373848987108655, + "grad_norm": 0.20233909785747528, + "learning_rate": 5.344683264563829e-05, + "loss": 1.71, + "step": 16086 + }, + { + "epoch": 4.937691835481891, + "grad_norm": 0.2199622094631195, + "learning_rate": 5.344187389352214e-05, + "loss": 1.7443, + "step": 16087 + }, + { + "epoch": 4.937998772252916, + "grad_norm": 0.23495158553123474, + "learning_rate": 5.343691510739138e-05, + "loss": 1.7758, + "step": 16088 + }, + { + "epoch": 4.9383057090239415, + "grad_norm": 0.228348970413208, + "learning_rate": 5.3431956287295015e-05, + "loss": 1.7645, + "step": 16089 + }, + { + "epoch": 4.938612645794966, + "grad_norm": 0.2337537258863449, + "learning_rate": 5.342699743328203e-05, + "loss": 1.7353, + "step": 16090 + }, + { + "epoch": 4.938919582565991, + "grad_norm": 0.1899309754371643, + "learning_rate": 5.3422038545401454e-05, + "loss": 1.6907, + "step": 16091 + }, + { + "epoch": 4.939226519337017, + "grad_norm": 0.2479192316532135, + "learning_rate": 5.341707962370229e-05, + "loss": 1.7961, + "step": 16092 + }, + { + "epoch": 4.939533456108042, + "grad_norm": 0.2444314956665039, + "learning_rate": 5.341212066823355e-05, + "loss": 1.7768, + "step": 16093 + }, + { + "epoch": 4.939840392879067, + "grad_norm": 0.2123393714427948, + "learning_rate": 5.340716167904423e-05, + "loss": 1.7617, + "step": 16094 + }, + { + "epoch": 4.940147329650092, + "grad_norm": 0.20779116451740265, + "learning_rate": 5.340220265618334e-05, + "loss": 1.6951, + "step": 16095 + }, + { + "epoch": 4.940454266421117, + "grad_norm": 0.22189265489578247, + "learning_rate": 5.3397243599699884e-05, + "loss": 1.8368, + "step": 16096 + }, + { 
+ "epoch": 4.940761203192142, + "grad_norm": 0.22316497564315796, + "learning_rate": 5.3392284509642875e-05, + "loss": 1.7096, + "step": 16097 + }, + { + "epoch": 4.941068139963168, + "grad_norm": 0.20406664907932281, + "learning_rate": 5.3387325386061346e-05, + "loss": 1.7269, + "step": 16098 + }, + { + "epoch": 4.941375076734193, + "grad_norm": 0.263007789850235, + "learning_rate": 5.338236622900427e-05, + "loss": 1.7663, + "step": 16099 + }, + { + "epoch": 4.941682013505218, + "grad_norm": 0.24388311803340912, + "learning_rate": 5.3377407038520654e-05, + "loss": 1.7113, + "step": 16100 + }, + { + "epoch": 4.941988950276243, + "grad_norm": 0.21918313205242157, + "learning_rate": 5.3372447814659524e-05, + "loss": 1.775, + "step": 16101 + }, + { + "epoch": 4.942295887047268, + "grad_norm": 0.30842962861061096, + "learning_rate": 5.336748855746989e-05, + "loss": 1.8229, + "step": 16102 + }, + { + "epoch": 4.9426028238182935, + "grad_norm": 0.2875657379627228, + "learning_rate": 5.336252926700077e-05, + "loss": 1.7377, + "step": 16103 + }, + { + "epoch": 4.942909760589319, + "grad_norm": 0.23411425948143005, + "learning_rate": 5.3357569943301156e-05, + "loss": 1.754, + "step": 16104 + }, + { + "epoch": 4.943216697360343, + "grad_norm": 0.29758864641189575, + "learning_rate": 5.335261058642007e-05, + "loss": 1.7471, + "step": 16105 + }, + { + "epoch": 4.943523634131369, + "grad_norm": 0.31761085987091064, + "learning_rate": 5.3347651196406534e-05, + "loss": 1.7658, + "step": 16106 + }, + { + "epoch": 4.943830570902394, + "grad_norm": 0.2487023025751114, + "learning_rate": 5.334269177330952e-05, + "loss": 1.786, + "step": 16107 + }, + { + "epoch": 4.944137507673419, + "grad_norm": 0.23954913020133972, + "learning_rate": 5.333773231717808e-05, + "loss": 1.8486, + "step": 16108 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.24893096089363098, + "learning_rate": 5.3332772828061214e-05, + "loss": 1.7927, + "step": 16109 + }, + { + "epoch": 4.94475138121547, + 
"grad_norm": 0.28653839230537415, + "learning_rate": 5.332781330600795e-05, + "loss": 1.8331, + "step": 16110 + }, + { + "epoch": 4.945058317986494, + "grad_norm": 0.2597404718399048, + "learning_rate": 5.332285375106726e-05, + "loss": 1.7128, + "step": 16111 + }, + { + "epoch": 4.94536525475752, + "grad_norm": 0.23813198506832123, + "learning_rate": 5.3317894163288196e-05, + "loss": 1.7483, + "step": 16112 + }, + { + "epoch": 4.945672191528545, + "grad_norm": 0.2545793652534485, + "learning_rate": 5.331293454271974e-05, + "loss": 1.7987, + "step": 16113 + }, + { + "epoch": 4.94597912829957, + "grad_norm": 0.2453712821006775, + "learning_rate": 5.330797488941095e-05, + "loss": 1.7376, + "step": 16114 + }, + { + "epoch": 4.946286065070596, + "grad_norm": 0.20583751797676086, + "learning_rate": 5.33030152034108e-05, + "loss": 1.7038, + "step": 16115 + }, + { + "epoch": 4.94659300184162, + "grad_norm": 0.22557811439037323, + "learning_rate": 5.3298055484768313e-05, + "loss": 1.6999, + "step": 16116 + }, + { + "epoch": 4.9468999386126455, + "grad_norm": 0.23163801431655884, + "learning_rate": 5.329309573353252e-05, + "loss": 1.7575, + "step": 16117 + }, + { + "epoch": 4.947206875383671, + "grad_norm": 0.3560176491737366, + "learning_rate": 5.3288135949752394e-05, + "loss": 1.8494, + "step": 16118 + }, + { + "epoch": 4.947513812154696, + "grad_norm": 0.306379109621048, + "learning_rate": 5.328317613347701e-05, + "loss": 1.7229, + "step": 16119 + }, + { + "epoch": 4.9478207489257215, + "grad_norm": 0.24428823590278625, + "learning_rate": 5.3278216284755344e-05, + "loss": 1.7939, + "step": 16120 + }, + { + "epoch": 4.948127685696747, + "grad_norm": 0.22251521050930023, + "learning_rate": 5.327325640363643e-05, + "loss": 1.7624, + "step": 16121 + }, + { + "epoch": 4.948434622467771, + "grad_norm": 0.23310889303684235, + "learning_rate": 5.326829649016928e-05, + "loss": 1.7727, + "step": 16122 + }, + { + "epoch": 4.948741559238797, + "grad_norm": 0.22457881271839142, + 
"learning_rate": 5.326333654440291e-05, + "loss": 1.7602, + "step": 16123 + }, + { + "epoch": 4.949048496009822, + "grad_norm": 0.24032343924045563, + "learning_rate": 5.325837656638631e-05, + "loss": 1.7591, + "step": 16124 + }, + { + "epoch": 4.949355432780847, + "grad_norm": 0.25082892179489136, + "learning_rate": 5.3253416556168546e-05, + "loss": 1.7745, + "step": 16125 + }, + { + "epoch": 4.949662369551873, + "grad_norm": 0.22859038412570953, + "learning_rate": 5.3248456513798615e-05, + "loss": 1.7475, + "step": 16126 + }, + { + "epoch": 4.949969306322897, + "grad_norm": 0.27282553911209106, + "learning_rate": 5.3243496439325525e-05, + "loss": 1.7438, + "step": 16127 + }, + { + "epoch": 4.9502762430939224, + "grad_norm": 0.23622353374958038, + "learning_rate": 5.3238536332798303e-05, + "loss": 1.7625, + "step": 16128 + }, + { + "epoch": 4.950583179864948, + "grad_norm": 0.28060024976730347, + "learning_rate": 5.3233576194265975e-05, + "loss": 1.8028, + "step": 16129 + }, + { + "epoch": 4.950890116635973, + "grad_norm": 0.33281829953193665, + "learning_rate": 5.322861602377755e-05, + "loss": 1.7163, + "step": 16130 + }, + { + "epoch": 4.9511970534069984, + "grad_norm": 0.26457497477531433, + "learning_rate": 5.322365582138203e-05, + "loss": 1.7347, + "step": 16131 + }, + { + "epoch": 4.951503990178024, + "grad_norm": 0.21651674807071686, + "learning_rate": 5.3218695587128476e-05, + "loss": 1.7123, + "step": 16132 + }, + { + "epoch": 4.951810926949048, + "grad_norm": 0.2299882024526596, + "learning_rate": 5.3213735321065885e-05, + "loss": 1.775, + "step": 16133 + }, + { + "epoch": 4.952117863720074, + "grad_norm": 0.2252396047115326, + "learning_rate": 5.3208775023243265e-05, + "loss": 1.7598, + "step": 16134 + }, + { + "epoch": 4.952424800491099, + "grad_norm": 0.2263660430908203, + "learning_rate": 5.3203814693709655e-05, + "loss": 1.7519, + "step": 16135 + }, + { + "epoch": 4.952731737262124, + "grad_norm": 0.2425432950258255, + "learning_rate": 
5.3198854332514056e-05, + "loss": 1.7769, + "step": 16136 + }, + { + "epoch": 4.953038674033149, + "grad_norm": 0.22624996304512024, + "learning_rate": 5.319389393970553e-05, + "loss": 1.7686, + "step": 16137 + }, + { + "epoch": 4.953345610804174, + "grad_norm": 0.2240568846464157, + "learning_rate": 5.318893351533306e-05, + "loss": 1.7795, + "step": 16138 + }, + { + "epoch": 4.953652547575199, + "grad_norm": 0.21708132326602936, + "learning_rate": 5.318397305944568e-05, + "loss": 1.7348, + "step": 16139 + }, + { + "epoch": 4.953959484346225, + "grad_norm": 0.2263328731060028, + "learning_rate": 5.3179012572092415e-05, + "loss": 1.7645, + "step": 16140 + }, + { + "epoch": 4.95426642111725, + "grad_norm": 0.2541986107826233, + "learning_rate": 5.3174052053322274e-05, + "loss": 1.723, + "step": 16141 + }, + { + "epoch": 4.954573357888275, + "grad_norm": 0.25829461216926575, + "learning_rate": 5.316909150318429e-05, + "loss": 1.7469, + "step": 16142 + }, + { + "epoch": 4.9548802946593, + "grad_norm": 0.21251125633716583, + "learning_rate": 5.3164130921727494e-05, + "loss": 1.7699, + "step": 16143 + }, + { + "epoch": 4.955187231430325, + "grad_norm": 0.29195618629455566, + "learning_rate": 5.315917030900091e-05, + "loss": 1.7373, + "step": 16144 + }, + { + "epoch": 4.9554941682013505, + "grad_norm": 0.29457888007164, + "learning_rate": 5.315420966505355e-05, + "loss": 1.7202, + "step": 16145 + }, + { + "epoch": 4.955801104972376, + "grad_norm": 0.19679461419582367, + "learning_rate": 5.314924898993443e-05, + "loss": 1.75, + "step": 16146 + }, + { + "epoch": 4.956108041743401, + "grad_norm": 0.287955105304718, + "learning_rate": 5.314428828369259e-05, + "loss": 1.7385, + "step": 16147 + }, + { + "epoch": 4.956414978514426, + "grad_norm": 0.3081825375556946, + "learning_rate": 5.313932754637706e-05, + "loss": 1.7558, + "step": 16148 + }, + { + "epoch": 4.956721915285451, + "grad_norm": 0.25226521492004395, + "learning_rate": 5.3134366778036846e-05, + "loss": 1.8407, + 
"step": 16149 + }, + { + "epoch": 4.957028852056476, + "grad_norm": 0.43601852655410767, + "learning_rate": 5.3129405978720984e-05, + "loss": 1.7762, + "step": 16150 + }, + { + "epoch": 4.957335788827502, + "grad_norm": 0.3630274832248688, + "learning_rate": 5.31244451484785e-05, + "loss": 1.7802, + "step": 16151 + }, + { + "epoch": 4.957642725598527, + "grad_norm": 0.21337948739528656, + "learning_rate": 5.311948428735841e-05, + "loss": 1.7107, + "step": 16152 + }, + { + "epoch": 4.957949662369552, + "grad_norm": 0.38581085205078125, + "learning_rate": 5.311452339540974e-05, + "loss": 1.7583, + "step": 16153 + }, + { + "epoch": 4.958256599140577, + "grad_norm": 0.28447309136390686, + "learning_rate": 5.310956247268154e-05, + "loss": 1.6992, + "step": 16154 + }, + { + "epoch": 4.958563535911602, + "grad_norm": 0.24510730803012848, + "learning_rate": 5.310460151922283e-05, + "loss": 1.7059, + "step": 16155 + }, + { + "epoch": 4.958870472682627, + "grad_norm": 0.41670146584510803, + "learning_rate": 5.309964053508262e-05, + "loss": 1.7191, + "step": 16156 + }, + { + "epoch": 4.959177409453653, + "grad_norm": 0.3123849034309387, + "learning_rate": 5.309467952030993e-05, + "loss": 1.7161, + "step": 16157 + }, + { + "epoch": 4.959484346224678, + "grad_norm": 0.2275281697511673, + "learning_rate": 5.308971847495382e-05, + "loss": 1.722, + "step": 16158 + }, + { + "epoch": 4.9597912829957025, + "grad_norm": 0.40216436982154846, + "learning_rate": 5.308475739906329e-05, + "loss": 1.7477, + "step": 16159 + }, + { + "epoch": 4.960098219766728, + "grad_norm": 0.259981244802475, + "learning_rate": 5.307979629268739e-05, + "loss": 1.7384, + "step": 16160 + }, + { + "epoch": 4.960405156537753, + "grad_norm": 0.22969573736190796, + "learning_rate": 5.3074835155875134e-05, + "loss": 1.7328, + "step": 16161 + }, + { + "epoch": 4.9607120933087785, + "grad_norm": 0.2773746848106384, + "learning_rate": 5.3069873988675556e-05, + "loss": 1.7333, + "step": 16162 + }, + { + "epoch": 
4.961019030079804, + "grad_norm": 0.2764189541339874, + "learning_rate": 5.306491279113768e-05, + "loss": 1.7956, + "step": 16163 + }, + { + "epoch": 4.961325966850829, + "grad_norm": 0.3640958070755005, + "learning_rate": 5.305995156331054e-05, + "loss": 1.7464, + "step": 16164 + }, + { + "epoch": 4.961632903621854, + "grad_norm": 0.3573450446128845, + "learning_rate": 5.305499030524317e-05, + "loss": 1.75, + "step": 16165 + }, + { + "epoch": 4.961939840392879, + "grad_norm": 0.24313980340957642, + "learning_rate": 5.305002901698459e-05, + "loss": 1.7505, + "step": 16166 + }, + { + "epoch": 4.962246777163904, + "grad_norm": 0.3417615592479706, + "learning_rate": 5.304506769858384e-05, + "loss": 1.7387, + "step": 16167 + }, + { + "epoch": 4.96255371393493, + "grad_norm": 0.23209623992443085, + "learning_rate": 5.304010635008995e-05, + "loss": 1.7111, + "step": 16168 + }, + { + "epoch": 4.962860650705955, + "grad_norm": 0.2994776666164398, + "learning_rate": 5.3035144971551944e-05, + "loss": 1.75, + "step": 16169 + }, + { + "epoch": 4.963167587476979, + "grad_norm": 0.3147084712982178, + "learning_rate": 5.303018356301884e-05, + "loss": 1.7598, + "step": 16170 + }, + { + "epoch": 4.963474524248005, + "grad_norm": 0.20136526226997375, + "learning_rate": 5.30252221245397e-05, + "loss": 1.7217, + "step": 16171 + }, + { + "epoch": 4.96378146101903, + "grad_norm": 0.3308684229850769, + "learning_rate": 5.302026065616355e-05, + "loss": 1.7554, + "step": 16172 + }, + { + "epoch": 4.964088397790055, + "grad_norm": 0.22890877723693848, + "learning_rate": 5.30152991579394e-05, + "loss": 1.7598, + "step": 16173 + }, + { + "epoch": 4.964395334561081, + "grad_norm": 0.3036035895347595, + "learning_rate": 5.301033762991631e-05, + "loss": 1.758, + "step": 16174 + }, + { + "epoch": 4.964702271332106, + "grad_norm": 0.2983579933643341, + "learning_rate": 5.300537607214329e-05, + "loss": 1.8132, + "step": 16175 + }, + { + "epoch": 4.9650092081031305, + "grad_norm": 
0.21401815116405487, + "learning_rate": 5.300041448466937e-05, + "loss": 1.7179, + "step": 16176 + }, + { + "epoch": 4.965316144874156, + "grad_norm": 0.2939651608467102, + "learning_rate": 5.2995452867543606e-05, + "loss": 1.7928, + "step": 16177 + }, + { + "epoch": 4.965623081645181, + "grad_norm": 0.24803484976291656, + "learning_rate": 5.2990491220815034e-05, + "loss": 1.7366, + "step": 16178 + }, + { + "epoch": 4.9659300184162065, + "grad_norm": 0.1999569535255432, + "learning_rate": 5.2985529544532656e-05, + "loss": 1.6691, + "step": 16179 + }, + { + "epoch": 4.966236955187231, + "grad_norm": 0.22315269708633423, + "learning_rate": 5.298056783874553e-05, + "loss": 1.7693, + "step": 16180 + }, + { + "epoch": 4.966543891958256, + "grad_norm": 0.22688794136047363, + "learning_rate": 5.2975606103502694e-05, + "loss": 1.8401, + "step": 16181 + }, + { + "epoch": 4.966850828729282, + "grad_norm": 0.2592024505138397, + "learning_rate": 5.297064433885317e-05, + "loss": 1.8054, + "step": 16182 + }, + { + "epoch": 4.967157765500307, + "grad_norm": 0.2508920133113861, + "learning_rate": 5.2965682544846e-05, + "loss": 1.766, + "step": 16183 + }, + { + "epoch": 4.967464702271332, + "grad_norm": 0.22318799793720245, + "learning_rate": 5.296072072153022e-05, + "loss": 1.751, + "step": 16184 + }, + { + "epoch": 4.967771639042358, + "grad_norm": 0.2348448485136032, + "learning_rate": 5.2955758868954855e-05, + "loss": 1.7844, + "step": 16185 + }, + { + "epoch": 4.968078575813382, + "grad_norm": 0.23294343054294586, + "learning_rate": 5.295079698716895e-05, + "loss": 1.7685, + "step": 16186 + }, + { + "epoch": 4.968385512584407, + "grad_norm": 0.20854508876800537, + "learning_rate": 5.2945835076221526e-05, + "loss": 1.6914, + "step": 16187 + }, + { + "epoch": 4.968692449355433, + "grad_norm": 0.21952031552791595, + "learning_rate": 5.294087313616165e-05, + "loss": 1.7121, + "step": 16188 + }, + { + "epoch": 4.968999386126458, + "grad_norm": 0.24097788333892822, + 
"learning_rate": 5.2935911167038346e-05, + "loss": 1.7712, + "step": 16189 + }, + { + "epoch": 4.969306322897483, + "grad_norm": 0.24433603882789612, + "learning_rate": 5.293094916890063e-05, + "loss": 1.7608, + "step": 16190 + }, + { + "epoch": 4.969613259668508, + "grad_norm": 0.22209061682224274, + "learning_rate": 5.292598714179757e-05, + "loss": 1.7563, + "step": 16191 + }, + { + "epoch": 4.969920196439533, + "grad_norm": 0.24291595816612244, + "learning_rate": 5.29210250857782e-05, + "loss": 1.7765, + "step": 16192 + }, + { + "epoch": 4.9702271332105585, + "grad_norm": 0.3143673837184906, + "learning_rate": 5.291606300089151e-05, + "loss": 1.7945, + "step": 16193 + }, + { + "epoch": 4.970534069981584, + "grad_norm": 0.22693613171577454, + "learning_rate": 5.291110088718661e-05, + "loss": 1.7411, + "step": 16194 + }, + { + "epoch": 4.970841006752609, + "grad_norm": 0.2271365374326706, + "learning_rate": 5.2906138744712494e-05, + "loss": 1.7754, + "step": 16195 + }, + { + "epoch": 4.9711479435236345, + "grad_norm": 0.2428499162197113, + "learning_rate": 5.290117657351822e-05, + "loss": 1.8007, + "step": 16196 + }, + { + "epoch": 4.971454880294659, + "grad_norm": 0.21862711012363434, + "learning_rate": 5.289621437365281e-05, + "loss": 1.7484, + "step": 16197 + }, + { + "epoch": 4.971761817065684, + "grad_norm": 0.26744964718818665, + "learning_rate": 5.2891252145165315e-05, + "loss": 1.7759, + "step": 16198 + }, + { + "epoch": 4.97206875383671, + "grad_norm": 0.2608526647090912, + "learning_rate": 5.288628988810477e-05, + "loss": 1.8527, + "step": 16199 + }, + { + "epoch": 4.972375690607735, + "grad_norm": 0.2245805710554123, + "learning_rate": 5.2881327602520216e-05, + "loss": 1.7773, + "step": 16200 + }, + { + "epoch": 4.97268262737876, + "grad_norm": 0.22023041546344757, + "learning_rate": 5.2876365288460694e-05, + "loss": 1.7101, + "step": 16201 + }, + { + "epoch": 4.972989564149785, + "grad_norm": 0.22034525871276855, + "learning_rate": 
5.287140294597525e-05, + "loss": 1.7672, + "step": 16202 + }, + { + "epoch": 4.97329650092081, + "grad_norm": 0.23101158440113068, + "learning_rate": 5.286644057511292e-05, + "loss": 1.741, + "step": 16203 + }, + { + "epoch": 4.973603437691835, + "grad_norm": 0.23050430417060852, + "learning_rate": 5.286147817592273e-05, + "loss": 1.7727, + "step": 16204 + }, + { + "epoch": 4.973910374462861, + "grad_norm": 0.21803520619869232, + "learning_rate": 5.285651574845374e-05, + "loss": 1.7353, + "step": 16205 + }, + { + "epoch": 4.974217311233886, + "grad_norm": 0.22252169251441956, + "learning_rate": 5.2851553292754995e-05, + "loss": 1.7658, + "step": 16206 + }, + { + "epoch": 4.974524248004911, + "grad_norm": 0.22458864748477936, + "learning_rate": 5.284659080887552e-05, + "loss": 1.7157, + "step": 16207 + }, + { + "epoch": 4.974831184775936, + "grad_norm": 0.20769210159778595, + "learning_rate": 5.2841628296864376e-05, + "loss": 1.7731, + "step": 16208 + }, + { + "epoch": 4.975138121546961, + "grad_norm": 0.1952340304851532, + "learning_rate": 5.283666575677059e-05, + "loss": 1.6907, + "step": 16209 + }, + { + "epoch": 4.975445058317987, + "grad_norm": 0.21943804621696472, + "learning_rate": 5.28317031886432e-05, + "loss": 1.8007, + "step": 16210 + }, + { + "epoch": 4.975751995089012, + "grad_norm": 0.21987493336200714, + "learning_rate": 5.2826740592531276e-05, + "loss": 1.7205, + "step": 16211 + }, + { + "epoch": 4.976058931860036, + "grad_norm": 0.2076522558927536, + "learning_rate": 5.2821777968483845e-05, + "loss": 1.7063, + "step": 16212 + }, + { + "epoch": 4.976365868631062, + "grad_norm": 0.19126583635807037, + "learning_rate": 5.281681531654994e-05, + "loss": 1.7118, + "step": 16213 + }, + { + "epoch": 4.976672805402087, + "grad_norm": 0.22308050096035004, + "learning_rate": 5.2811852636778625e-05, + "loss": 1.7565, + "step": 16214 + }, + { + "epoch": 4.976979742173112, + "grad_norm": 0.23187528550624847, + "learning_rate": 5.280688992921893e-05, + "loss": 
1.8261, + "step": 16215 + }, + { + "epoch": 4.977286678944138, + "grad_norm": 0.21373791992664337, + "learning_rate": 5.28019271939199e-05, + "loss": 1.6974, + "step": 16216 + }, + { + "epoch": 4.977593615715163, + "grad_norm": 0.21647346019744873, + "learning_rate": 5.2796964430930585e-05, + "loss": 1.7967, + "step": 16217 + }, + { + "epoch": 4.9779005524861875, + "grad_norm": 0.2231660932302475, + "learning_rate": 5.279200164030002e-05, + "loss": 1.7495, + "step": 16218 + }, + { + "epoch": 4.978207489257213, + "grad_norm": 0.2810545563697815, + "learning_rate": 5.278703882207728e-05, + "loss": 1.875, + "step": 16219 + }, + { + "epoch": 4.978514426028238, + "grad_norm": 0.298984557390213, + "learning_rate": 5.2782075976311374e-05, + "loss": 1.7494, + "step": 16220 + }, + { + "epoch": 4.9788213627992635, + "grad_norm": 0.2530893385410309, + "learning_rate": 5.2777113103051365e-05, + "loss": 1.7594, + "step": 16221 + }, + { + "epoch": 4.979128299570289, + "grad_norm": 0.26165664196014404, + "learning_rate": 5.277215020234629e-05, + "loss": 1.7543, + "step": 16222 + }, + { + "epoch": 4.979435236341313, + "grad_norm": 0.25115957856178284, + "learning_rate": 5.276718727424521e-05, + "loss": 1.7925, + "step": 16223 + }, + { + "epoch": 4.979742173112339, + "grad_norm": 0.22134126722812653, + "learning_rate": 5.276222431879716e-05, + "loss": 1.8359, + "step": 16224 + }, + { + "epoch": 4.980049109883364, + "grad_norm": 0.24447613954544067, + "learning_rate": 5.275726133605119e-05, + "loss": 1.7693, + "step": 16225 + }, + { + "epoch": 4.980356046654389, + "grad_norm": 0.23025095462799072, + "learning_rate": 5.275229832605635e-05, + "loss": 1.7911, + "step": 16226 + }, + { + "epoch": 4.980662983425415, + "grad_norm": 0.23424232006072998, + "learning_rate": 5.2747335288861686e-05, + "loss": 1.7628, + "step": 16227 + }, + { + "epoch": 4.98096992019644, + "grad_norm": 0.24598535895347595, + "learning_rate": 5.2742372224516235e-05, + "loss": 1.7651, + "step": 16228 + }, + { + 
"epoch": 4.981276856967464, + "grad_norm": 0.262893944978714, + "learning_rate": 5.273740913306906e-05, + "loss": 1.7282, + "step": 16229 + }, + { + "epoch": 4.98158379373849, + "grad_norm": 0.21981783211231232, + "learning_rate": 5.2732446014569207e-05, + "loss": 1.7448, + "step": 16230 + }, + { + "epoch": 4.981890730509515, + "grad_norm": 0.24244973063468933, + "learning_rate": 5.272748286906573e-05, + "loss": 1.7216, + "step": 16231 + }, + { + "epoch": 4.98219766728054, + "grad_norm": 0.2365221232175827, + "learning_rate": 5.272251969660766e-05, + "loss": 1.7227, + "step": 16232 + }, + { + "epoch": 4.982504604051566, + "grad_norm": 0.2081129401922226, + "learning_rate": 5.271755649724405e-05, + "loss": 1.7184, + "step": 16233 + }, + { + "epoch": 4.98281154082259, + "grad_norm": 0.2256374955177307, + "learning_rate": 5.271259327102395e-05, + "loss": 1.7412, + "step": 16234 + }, + { + "epoch": 4.9831184775936155, + "grad_norm": 0.23727381229400635, + "learning_rate": 5.270763001799643e-05, + "loss": 1.8095, + "step": 16235 + }, + { + "epoch": 4.983425414364641, + "grad_norm": 0.21498435735702515, + "learning_rate": 5.2702666738210504e-05, + "loss": 1.744, + "step": 16236 + }, + { + "epoch": 4.983732351135666, + "grad_norm": 0.24772173166275024, + "learning_rate": 5.269770343171525e-05, + "loss": 1.741, + "step": 16237 + }, + { + "epoch": 4.9840392879066915, + "grad_norm": 0.2835623621940613, + "learning_rate": 5.269274009855971e-05, + "loss": 1.7765, + "step": 16238 + }, + { + "epoch": 4.984346224677717, + "grad_norm": 0.2570044696331024, + "learning_rate": 5.2687776738792926e-05, + "loss": 1.8206, + "step": 16239 + }, + { + "epoch": 4.984653161448741, + "grad_norm": 0.21549640595912933, + "learning_rate": 5.268281335246397e-05, + "loss": 1.7022, + "step": 16240 + }, + { + "epoch": 4.984960098219767, + "grad_norm": 0.23158684372901917, + "learning_rate": 5.267784993962187e-05, + "loss": 1.7882, + "step": 16241 + }, + { + "epoch": 4.985267034990792, + "grad_norm": 
0.22778423130512238, + "learning_rate": 5.26728865003157e-05, + "loss": 1.7358, + "step": 16242 + }, + { + "epoch": 4.985573971761817, + "grad_norm": 0.23197145760059357, + "learning_rate": 5.266792303459449e-05, + "loss": 1.7687, + "step": 16243 + }, + { + "epoch": 4.985880908532843, + "grad_norm": 0.19270172715187073, + "learning_rate": 5.26629595425073e-05, + "loss": 1.6999, + "step": 16244 + }, + { + "epoch": 4.986187845303867, + "grad_norm": 0.25262632966041565, + "learning_rate": 5.2657996024103175e-05, + "loss": 1.7536, + "step": 16245 + }, + { + "epoch": 4.986494782074892, + "grad_norm": 0.18620926141738892, + "learning_rate": 5.2653032479431185e-05, + "loss": 1.7033, + "step": 16246 + }, + { + "epoch": 4.986801718845918, + "grad_norm": 0.19537273049354553, + "learning_rate": 5.2648068908540374e-05, + "loss": 1.7457, + "step": 16247 + }, + { + "epoch": 4.987108655616943, + "grad_norm": 0.19447599351406097, + "learning_rate": 5.26431053114798e-05, + "loss": 1.7053, + "step": 16248 + }, + { + "epoch": 4.987415592387968, + "grad_norm": 0.20431137084960938, + "learning_rate": 5.263814168829852e-05, + "loss": 1.7695, + "step": 16249 + }, + { + "epoch": 4.987722529158994, + "grad_norm": 0.21123024821281433, + "learning_rate": 5.263317803904554e-05, + "loss": 1.7666, + "step": 16250 + }, + { + "epoch": 4.988029465930018, + "grad_norm": 0.21279335021972656, + "learning_rate": 5.262821436376998e-05, + "loss": 1.7231, + "step": 16251 + }, + { + "epoch": 4.9883364027010435, + "grad_norm": 0.22504910826683044, + "learning_rate": 5.262325066252085e-05, + "loss": 1.7657, + "step": 16252 + }, + { + "epoch": 4.988643339472069, + "grad_norm": 0.23505981266498566, + "learning_rate": 5.261828693534723e-05, + "loss": 1.7576, + "step": 16253 + }, + { + "epoch": 4.988950276243094, + "grad_norm": 0.21553601324558258, + "learning_rate": 5.261332318229817e-05, + "loss": 1.7782, + "step": 16254 + }, + { + "epoch": 4.989257213014119, + "grad_norm": 0.29189521074295044, + 
"learning_rate": 5.26083594034227e-05, + "loss": 1.7664, + "step": 16255 + }, + { + "epoch": 4.989564149785144, + "grad_norm": 0.38108906149864197, + "learning_rate": 5.26033955987699e-05, + "loss": 1.8573, + "step": 16256 + }, + { + "epoch": 4.989871086556169, + "grad_norm": 0.30329224467277527, + "learning_rate": 5.2598431768388824e-05, + "loss": 1.7584, + "step": 16257 + }, + { + "epoch": 4.990178023327195, + "grad_norm": 0.2437417358160019, + "learning_rate": 5.259346791232852e-05, + "loss": 1.7352, + "step": 16258 + }, + { + "epoch": 4.99048496009822, + "grad_norm": 0.3601737320423126, + "learning_rate": 5.258850403063804e-05, + "loss": 1.7206, + "step": 16259 + }, + { + "epoch": 4.990791896869245, + "grad_norm": 0.20259195566177368, + "learning_rate": 5.258354012336646e-05, + "loss": 1.7403, + "step": 16260 + }, + { + "epoch": 4.99109883364027, + "grad_norm": 0.38022148609161377, + "learning_rate": 5.257857619056281e-05, + "loss": 1.7783, + "step": 16261 + }, + { + "epoch": 4.991405770411295, + "grad_norm": 0.30131712555885315, + "learning_rate": 5.257361223227615e-05, + "loss": 1.7826, + "step": 16262 + }, + { + "epoch": 4.99171270718232, + "grad_norm": 0.24159663915634155, + "learning_rate": 5.2568648248555565e-05, + "loss": 1.7792, + "step": 16263 + }, + { + "epoch": 4.992019643953346, + "grad_norm": 0.4641213119029999, + "learning_rate": 5.2563684239450084e-05, + "loss": 1.7432, + "step": 16264 + }, + { + "epoch": 4.992326580724371, + "grad_norm": 0.3526865541934967, + "learning_rate": 5.255872020500877e-05, + "loss": 1.7736, + "step": 16265 + }, + { + "epoch": 4.9926335174953955, + "grad_norm": 0.2396051585674286, + "learning_rate": 5.255375614528071e-05, + "loss": 1.7505, + "step": 16266 + }, + { + "epoch": 4.992940454266421, + "grad_norm": 0.320987343788147, + "learning_rate": 5.25487920603149e-05, + "loss": 1.8229, + "step": 16267 + }, + { + "epoch": 4.993247391037446, + "grad_norm": 0.24689678847789764, + "learning_rate": 5.254382795016044e-05, + 
"loss": 1.7011, + "step": 16268 + }, + { + "epoch": 4.9935543278084715, + "grad_norm": 0.2407137155532837, + "learning_rate": 5.253886381486639e-05, + "loss": 1.741, + "step": 16269 + }, + { + "epoch": 4.993861264579497, + "grad_norm": 0.3677252531051636, + "learning_rate": 5.25338996544818e-05, + "loss": 1.7792, + "step": 16270 + }, + { + "epoch": 4.994168201350522, + "grad_norm": 0.25096553564071655, + "learning_rate": 5.252893546905573e-05, + "loss": 1.7523, + "step": 16271 + }, + { + "epoch": 4.994475138121547, + "grad_norm": 0.2966327965259552, + "learning_rate": 5.252397125863723e-05, + "loss": 1.7114, + "step": 16272 + }, + { + "epoch": 4.994782074892572, + "grad_norm": 0.36577650904655457, + "learning_rate": 5.2519007023275356e-05, + "loss": 1.7609, + "step": 16273 + }, + { + "epoch": 4.995089011663597, + "grad_norm": 0.2450687140226364, + "learning_rate": 5.25140427630192e-05, + "loss": 1.7452, + "step": 16274 + }, + { + "epoch": 4.995395948434623, + "grad_norm": 0.20782120525836945, + "learning_rate": 5.250907847791778e-05, + "loss": 1.7109, + "step": 16275 + }, + { + "epoch": 4.995702885205648, + "grad_norm": 0.2423330545425415, + "learning_rate": 5.25041141680202e-05, + "loss": 1.7234, + "step": 16276 + }, + { + "epoch": 4.996009821976672, + "grad_norm": 0.20855975151062012, + "learning_rate": 5.2499149833375484e-05, + "loss": 1.7734, + "step": 16277 + }, + { + "epoch": 4.996316758747698, + "grad_norm": 0.24400894343852997, + "learning_rate": 5.24941854740327e-05, + "loss": 1.7566, + "step": 16278 + }, + { + "epoch": 4.996623695518723, + "grad_norm": 0.4378018379211426, + "learning_rate": 5.2489221090040906e-05, + "loss": 1.7536, + "step": 16279 + }, + { + "epoch": 4.996930632289748, + "grad_norm": 0.20726722478866577, + "learning_rate": 5.248425668144918e-05, + "loss": 1.8008, + "step": 16280 + }, + { + "epoch": 4.997237569060774, + "grad_norm": 0.2506333589553833, + "learning_rate": 5.247929224830658e-05, + "loss": 1.7404, + "step": 16281 + }, + { + 
"epoch": 4.997544505831799, + "grad_norm": 0.24178004264831543, + "learning_rate": 5.247432779066216e-05, + "loss": 1.7517, + "step": 16282 + }, + { + "epoch": 4.9978514426028235, + "grad_norm": 0.2500220835208893, + "learning_rate": 5.246936330856499e-05, + "loss": 1.7705, + "step": 16283 + }, + { + "epoch": 4.998158379373849, + "grad_norm": 0.30043718218803406, + "learning_rate": 5.24643988020641e-05, + "loss": 1.8118, + "step": 16284 + }, + { + "epoch": 4.998465316144874, + "grad_norm": 0.284805566072464, + "learning_rate": 5.245943427120859e-05, + "loss": 1.7968, + "step": 16285 + }, + { + "epoch": 4.9987722529158995, + "grad_norm": 0.3652406632900238, + "learning_rate": 5.245446971604751e-05, + "loss": 1.7785, + "step": 16286 + }, + { + "epoch": 4.999079189686924, + "grad_norm": 0.24879656732082367, + "learning_rate": 5.244950513662992e-05, + "loss": 1.734, + "step": 16287 + }, + { + "epoch": 4.999386126457949, + "grad_norm": 0.2374224215745926, + "learning_rate": 5.244454053300488e-05, + "loss": 1.7394, + "step": 16288 + }, + { + "epoch": 4.999693063228975, + "grad_norm": 0.27090463042259216, + "learning_rate": 5.243957590522147e-05, + "loss": 1.7529, + "step": 16289 + }, + { + "epoch": 5.0, + "grad_norm": 0.23060791194438934, + "learning_rate": 5.243461125332873e-05, + "loss": 1.7599, + "step": 16290 + }, + { + "epoch": 5.000306936771025, + "grad_norm": 0.21159487962722778, + "learning_rate": 5.242964657737572e-05, + "loss": 1.747, + "step": 16291 + }, + { + "epoch": 5.000613873542051, + "grad_norm": 0.21556304395198822, + "learning_rate": 5.242468187741154e-05, + "loss": 1.7653, + "step": 16292 + }, + { + "epoch": 5.000920810313075, + "grad_norm": 0.2569669783115387, + "learning_rate": 5.241971715348524e-05, + "loss": 1.7284, + "step": 16293 + }, + { + "epoch": 5.0012277470841005, + "grad_norm": 0.2827381491661072, + "learning_rate": 5.241475240564586e-05, + "loss": 1.7765, + "step": 16294 + }, + { + "epoch": 5.001534683855126, + "grad_norm": 
0.22498267889022827, + "learning_rate": 5.240978763394249e-05, + "loss": 1.729, + "step": 16295 + }, + { + "epoch": 5.001841620626151, + "grad_norm": 0.23975814878940582, + "learning_rate": 5.240482283842418e-05, + "loss": 1.7968, + "step": 16296 + }, + { + "epoch": 5.0021485573971765, + "grad_norm": 0.20811420679092407, + "learning_rate": 5.239985801914e-05, + "loss": 1.6931, + "step": 16297 + }, + { + "epoch": 5.002455494168202, + "grad_norm": 0.22985060513019562, + "learning_rate": 5.2394893176139014e-05, + "loss": 1.7724, + "step": 16298 + }, + { + "epoch": 5.002762430939226, + "grad_norm": 0.22867995500564575, + "learning_rate": 5.2389928309470305e-05, + "loss": 1.7179, + "step": 16299 + }, + { + "epoch": 5.003069367710252, + "grad_norm": 0.2543974220752716, + "learning_rate": 5.238496341918293e-05, + "loss": 1.7859, + "step": 16300 + }, + { + "epoch": 5.003376304481277, + "grad_norm": 0.226583793759346, + "learning_rate": 5.237999850532592e-05, + "loss": 1.7567, + "step": 16301 + }, + { + "epoch": 5.003683241252302, + "grad_norm": 0.21744728088378906, + "learning_rate": 5.237503356794838e-05, + "loss": 1.7345, + "step": 16302 + }, + { + "epoch": 5.003990178023328, + "grad_norm": 0.25915467739105225, + "learning_rate": 5.2370068607099373e-05, + "loss": 1.7179, + "step": 16303 + }, + { + "epoch": 5.004297114794352, + "grad_norm": 0.20572461187839508, + "learning_rate": 5.236510362282796e-05, + "loss": 1.7211, + "step": 16304 + }, + { + "epoch": 5.004604051565377, + "grad_norm": 0.2821461856365204, + "learning_rate": 5.236013861518321e-05, + "loss": 1.7894, + "step": 16305 + }, + { + "epoch": 5.004910988336403, + "grad_norm": 0.22273759543895721, + "learning_rate": 5.235517358421417e-05, + "loss": 1.7919, + "step": 16306 + }, + { + "epoch": 5.005217925107428, + "grad_norm": 0.23875468969345093, + "learning_rate": 5.2350208529969935e-05, + "loss": 1.7558, + "step": 16307 + }, + { + "epoch": 5.005524861878453, + "grad_norm": 0.24673783779144287, + "learning_rate": 
5.234524345249955e-05, + "loss": 1.7705, + "step": 16308 + }, + { + "epoch": 5.005831798649478, + "grad_norm": 0.21992872655391693, + "learning_rate": 5.234027835185211e-05, + "loss": 1.7059, + "step": 16309 + }, + { + "epoch": 5.006138735420503, + "grad_norm": 0.19214966893196106, + "learning_rate": 5.233531322807667e-05, + "loss": 1.6647, + "step": 16310 + }, + { + "epoch": 5.0064456721915285, + "grad_norm": 0.18525120615959167, + "learning_rate": 5.233034808122228e-05, + "loss": 1.719, + "step": 16311 + }, + { + "epoch": 5.006752608962554, + "grad_norm": 0.25996243953704834, + "learning_rate": 5.232538291133804e-05, + "loss": 1.7227, + "step": 16312 + }, + { + "epoch": 5.007059545733579, + "grad_norm": 0.2163757085800171, + "learning_rate": 5.232041771847299e-05, + "loss": 1.6962, + "step": 16313 + }, + { + "epoch": 5.0073664825046045, + "grad_norm": 0.23484158515930176, + "learning_rate": 5.231545250267621e-05, + "loss": 1.7816, + "step": 16314 + }, + { + "epoch": 5.007673419275629, + "grad_norm": 0.2188636213541031, + "learning_rate": 5.2310487263996776e-05, + "loss": 1.7477, + "step": 16315 + }, + { + "epoch": 5.007980356046654, + "grad_norm": 0.1950213611125946, + "learning_rate": 5.230552200248377e-05, + "loss": 1.7165, + "step": 16316 + }, + { + "epoch": 5.00828729281768, + "grad_norm": 0.25340089201927185, + "learning_rate": 5.230055671818623e-05, + "loss": 1.7764, + "step": 16317 + }, + { + "epoch": 5.008594229588705, + "grad_norm": 0.23749271035194397, + "learning_rate": 5.2295591411153245e-05, + "loss": 1.7193, + "step": 16318 + }, + { + "epoch": 5.00890116635973, + "grad_norm": 0.2317294180393219, + "learning_rate": 5.229062608143387e-05, + "loss": 1.7607, + "step": 16319 + }, + { + "epoch": 5.009208103130755, + "grad_norm": 0.2751505672931671, + "learning_rate": 5.228566072907719e-05, + "loss": 1.7562, + "step": 16320 + }, + { + "epoch": 5.00951503990178, + "grad_norm": 0.29476025700569153, + "learning_rate": 5.2280695354132267e-05, + "loss": 1.687, 
+ "step": 16321 + }, + { + "epoch": 5.009821976672805, + "grad_norm": 0.20734120905399323, + "learning_rate": 5.227572995664819e-05, + "loss": 1.7608, + "step": 16322 + }, + { + "epoch": 5.010128913443831, + "grad_norm": 0.2537878155708313, + "learning_rate": 5.227076453667401e-05, + "loss": 1.7947, + "step": 16323 + }, + { + "epoch": 5.010435850214856, + "grad_norm": 0.23516076803207397, + "learning_rate": 5.2265799094258796e-05, + "loss": 1.7545, + "step": 16324 + }, + { + "epoch": 5.0107427869858805, + "grad_norm": 0.2581529915332794, + "learning_rate": 5.226083362945162e-05, + "loss": 1.7529, + "step": 16325 + }, + { + "epoch": 5.011049723756906, + "grad_norm": 0.2982035279273987, + "learning_rate": 5.225586814230158e-05, + "loss": 1.74, + "step": 16326 + }, + { + "epoch": 5.011356660527931, + "grad_norm": 0.2773981988430023, + "learning_rate": 5.225090263285772e-05, + "loss": 1.7562, + "step": 16327 + }, + { + "epoch": 5.0116635972989565, + "grad_norm": 0.19992689788341522, + "learning_rate": 5.2245937101169116e-05, + "loss": 1.6896, + "step": 16328 + }, + { + "epoch": 5.011970534069982, + "grad_norm": 0.2913428246974945, + "learning_rate": 5.224097154728486e-05, + "loss": 1.7574, + "step": 16329 + }, + { + "epoch": 5.012277470841007, + "grad_norm": 0.23173104226589203, + "learning_rate": 5.2236005971254e-05, + "loss": 1.6954, + "step": 16330 + }, + { + "epoch": 5.012584407612032, + "grad_norm": 0.2019525170326233, + "learning_rate": 5.2231040373125614e-05, + "loss": 1.7711, + "step": 16331 + }, + { + "epoch": 5.012891344383057, + "grad_norm": 0.29070746898651123, + "learning_rate": 5.222607475294878e-05, + "loss": 1.8201, + "step": 16332 + }, + { + "epoch": 5.013198281154082, + "grad_norm": 0.22005079686641693, + "learning_rate": 5.222110911077258e-05, + "loss": 1.7421, + "step": 16333 + }, + { + "epoch": 5.013505217925108, + "grad_norm": 0.24422192573547363, + "learning_rate": 5.2216143446646085e-05, + "loss": 1.7074, + "step": 16334 + }, + { + "epoch": 
5.013812154696133, + "grad_norm": 0.2417927384376526, + "learning_rate": 5.221117776061836e-05, + "loss": 1.7726, + "step": 16335 + }, + { + "epoch": 5.014119091467157, + "grad_norm": 0.245828777551651, + "learning_rate": 5.2206212052738454e-05, + "loss": 1.7932, + "step": 16336 + }, + { + "epoch": 5.014426028238183, + "grad_norm": 0.24054239690303802, + "learning_rate": 5.220124632305548e-05, + "loss": 1.727, + "step": 16337 + }, + { + "epoch": 5.014732965009208, + "grad_norm": 0.2572494149208069, + "learning_rate": 5.21962805716185e-05, + "loss": 1.7234, + "step": 16338 + }, + { + "epoch": 5.015039901780233, + "grad_norm": 0.33624622225761414, + "learning_rate": 5.2191314798476595e-05, + "loss": 1.7499, + "step": 16339 + }, + { + "epoch": 5.015346838551259, + "grad_norm": 0.22321413457393646, + "learning_rate": 5.218634900367883e-05, + "loss": 1.7155, + "step": 16340 + }, + { + "epoch": 5.015653775322283, + "grad_norm": 0.26709917187690735, + "learning_rate": 5.218138318727429e-05, + "loss": 1.8346, + "step": 16341 + }, + { + "epoch": 5.0159607120933085, + "grad_norm": 0.27600952982902527, + "learning_rate": 5.217641734931202e-05, + "loss": 1.789, + "step": 16342 + }, + { + "epoch": 5.016267648864334, + "grad_norm": 0.21392405033111572, + "learning_rate": 5.217145148984114e-05, + "loss": 1.7266, + "step": 16343 + }, + { + "epoch": 5.016574585635359, + "grad_norm": 0.3215450942516327, + "learning_rate": 5.2166485608910696e-05, + "loss": 1.7453, + "step": 16344 + }, + { + "epoch": 5.0168815224063845, + "grad_norm": 0.22328032553195953, + "learning_rate": 5.2161519706569776e-05, + "loss": 1.7209, + "step": 16345 + }, + { + "epoch": 5.01718845917741, + "grad_norm": 0.2438887059688568, + "learning_rate": 5.215655378286744e-05, + "loss": 1.7289, + "step": 16346 + }, + { + "epoch": 5.017495395948434, + "grad_norm": 0.30078747868537903, + "learning_rate": 5.2151587837852786e-05, + "loss": 1.7483, + "step": 16347 + }, + { + "epoch": 5.01780233271946, + "grad_norm": 
0.21723167598247528, + "learning_rate": 5.214662187157488e-05, + "loss": 1.7654, + "step": 16348 + }, + { + "epoch": 5.018109269490485, + "grad_norm": 0.26358669996261597, + "learning_rate": 5.2141655884082784e-05, + "loss": 1.7563, + "step": 16349 + }, + { + "epoch": 5.01841620626151, + "grad_norm": 0.24285505712032318, + "learning_rate": 5.2136689875425615e-05, + "loss": 1.7377, + "step": 16350 + }, + { + "epoch": 5.018723143032536, + "grad_norm": 0.2401108294725418, + "learning_rate": 5.2131723845652416e-05, + "loss": 1.7445, + "step": 16351 + }, + { + "epoch": 5.01903007980356, + "grad_norm": 0.3347793519496918, + "learning_rate": 5.212675779481226e-05, + "loss": 1.7872, + "step": 16352 + }, + { + "epoch": 5.019337016574585, + "grad_norm": 0.306728720664978, + "learning_rate": 5.212179172295424e-05, + "loss": 1.8051, + "step": 16353 + }, + { + "epoch": 5.019643953345611, + "grad_norm": 0.22297725081443787, + "learning_rate": 5.211682563012743e-05, + "loss": 1.7082, + "step": 16354 + }, + { + "epoch": 5.019950890116636, + "grad_norm": 0.24047277867794037, + "learning_rate": 5.211185951638091e-05, + "loss": 1.7024, + "step": 16355 + }, + { + "epoch": 5.020257826887661, + "grad_norm": 0.19570080935955048, + "learning_rate": 5.210689338176377e-05, + "loss": 1.6947, + "step": 16356 + }, + { + "epoch": 5.020564763658686, + "grad_norm": 0.2024889886379242, + "learning_rate": 5.2101927226325066e-05, + "loss": 1.7168, + "step": 16357 + }, + { + "epoch": 5.020871700429711, + "grad_norm": 0.23546278476715088, + "learning_rate": 5.209696105011388e-05, + "loss": 1.7697, + "step": 16358 + }, + { + "epoch": 5.0211786372007365, + "grad_norm": 0.21003498136997223, + "learning_rate": 5.209199485317928e-05, + "loss": 1.7198, + "step": 16359 + }, + { + "epoch": 5.021485573971762, + "grad_norm": 0.21375493705272675, + "learning_rate": 5.208702863557039e-05, + "loss": 1.7689, + "step": 16360 + }, + { + "epoch": 5.021792510742787, + "grad_norm": 0.21549762785434723, + 
"learning_rate": 5.2082062397336254e-05, + "loss": 1.6936, + "step": 16361 + }, + { + "epoch": 5.0220994475138125, + "grad_norm": 0.22633691132068634, + "learning_rate": 5.207709613852595e-05, + "loss": 1.7512, + "step": 16362 + }, + { + "epoch": 5.022406384284837, + "grad_norm": 0.21888238191604614, + "learning_rate": 5.2072129859188566e-05, + "loss": 1.7082, + "step": 16363 + }, + { + "epoch": 5.022713321055862, + "grad_norm": 0.2416619062423706, + "learning_rate": 5.206716355937318e-05, + "loss": 1.7938, + "step": 16364 + }, + { + "epoch": 5.023020257826888, + "grad_norm": 0.22451527416706085, + "learning_rate": 5.206219723912886e-05, + "loss": 1.7372, + "step": 16365 + }, + { + "epoch": 5.023327194597913, + "grad_norm": 0.19698494672775269, + "learning_rate": 5.2057230898504716e-05, + "loss": 1.7205, + "step": 16366 + }, + { + "epoch": 5.023634131368938, + "grad_norm": 0.2441127747297287, + "learning_rate": 5.205226453754982e-05, + "loss": 1.7625, + "step": 16367 + }, + { + "epoch": 5.023941068139963, + "grad_norm": 0.21940121054649353, + "learning_rate": 5.204729815631323e-05, + "loss": 1.7985, + "step": 16368 + }, + { + "epoch": 5.024248004910988, + "grad_norm": 0.21751399338245392, + "learning_rate": 5.204233175484403e-05, + "loss": 1.7759, + "step": 16369 + }, + { + "epoch": 5.024554941682013, + "grad_norm": 0.20261377096176147, + "learning_rate": 5.2037365333191315e-05, + "loss": 1.746, + "step": 16370 + }, + { + "epoch": 5.024861878453039, + "grad_norm": 0.2628774046897888, + "learning_rate": 5.2032398891404166e-05, + "loss": 1.8178, + "step": 16371 + }, + { + "epoch": 5.025168815224064, + "grad_norm": 0.20626378059387207, + "learning_rate": 5.2027432429531665e-05, + "loss": 1.7456, + "step": 16372 + }, + { + "epoch": 5.0254757519950894, + "grad_norm": 0.25548869371414185, + "learning_rate": 5.2022465947622876e-05, + "loss": 1.8098, + "step": 16373 + }, + { + "epoch": 5.025782688766114, + "grad_norm": 0.1978374719619751, + "learning_rate": 
5.20174994457269e-05, + "loss": 1.685, + "step": 16374 + }, + { + "epoch": 5.026089625537139, + "grad_norm": 0.2708980143070221, + "learning_rate": 5.201253292389282e-05, + "loss": 1.7464, + "step": 16375 + }, + { + "epoch": 5.026396562308165, + "grad_norm": 0.2730494737625122, + "learning_rate": 5.2007566382169706e-05, + "loss": 1.7391, + "step": 16376 + }, + { + "epoch": 5.02670349907919, + "grad_norm": 0.243557408452034, + "learning_rate": 5.2002599820606624e-05, + "loss": 1.7439, + "step": 16377 + }, + { + "epoch": 5.027010435850215, + "grad_norm": 0.2208259105682373, + "learning_rate": 5.19976332392527e-05, + "loss": 1.7612, + "step": 16378 + }, + { + "epoch": 5.02731737262124, + "grad_norm": 0.21288715302944183, + "learning_rate": 5.199266663815698e-05, + "loss": 1.7546, + "step": 16379 + }, + { + "epoch": 5.027624309392265, + "grad_norm": 0.2106054425239563, + "learning_rate": 5.198770001736857e-05, + "loss": 1.7281, + "step": 16380 + }, + { + "epoch": 5.02793124616329, + "grad_norm": 0.2247164249420166, + "learning_rate": 5.198273337693654e-05, + "loss": 1.8405, + "step": 16381 + }, + { + "epoch": 5.028238182934316, + "grad_norm": 0.21713724732398987, + "learning_rate": 5.197776671690998e-05, + "loss": 1.7333, + "step": 16382 + }, + { + "epoch": 5.028545119705341, + "grad_norm": 0.24063727259635925, + "learning_rate": 5.1972800037337956e-05, + "loss": 1.7608, + "step": 16383 + }, + { + "epoch": 5.0288520564763655, + "grad_norm": 0.22022177278995514, + "learning_rate": 5.196783333826959e-05, + "loss": 1.7045, + "step": 16384 + }, + { + "epoch": 5.029158993247391, + "grad_norm": 0.21348948776721954, + "learning_rate": 5.1962866619753927e-05, + "loss": 1.7516, + "step": 16385 + }, + { + "epoch": 5.029465930018416, + "grad_norm": 0.289315789937973, + "learning_rate": 5.195789988184007e-05, + "loss": 1.8555, + "step": 16386 + }, + { + "epoch": 5.0297728667894415, + "grad_norm": 0.30966848134994507, + "learning_rate": 5.19529331245771e-05, + "loss": 1.7245, + 
"step": 16387 + }, + { + "epoch": 5.030079803560467, + "grad_norm": 0.24625633656978607, + "learning_rate": 5.194796634801409e-05, + "loss": 1.7788, + "step": 16388 + }, + { + "epoch": 5.030386740331492, + "grad_norm": 0.25937986373901367, + "learning_rate": 5.1942999552200136e-05, + "loss": 1.7655, + "step": 16389 + }, + { + "epoch": 5.030693677102517, + "grad_norm": 0.3056741952896118, + "learning_rate": 5.1938032737184325e-05, + "loss": 1.7167, + "step": 16390 + }, + { + "epoch": 5.031000613873542, + "grad_norm": 0.29773563146591187, + "learning_rate": 5.1933065903015743e-05, + "loss": 1.7247, + "step": 16391 + }, + { + "epoch": 5.031307550644567, + "grad_norm": 0.26433971524238586, + "learning_rate": 5.192809904974347e-05, + "loss": 1.7779, + "step": 16392 + }, + { + "epoch": 5.031614487415593, + "grad_norm": 0.3308073580265045, + "learning_rate": 5.192313217741659e-05, + "loss": 1.7782, + "step": 16393 + }, + { + "epoch": 5.031921424186618, + "grad_norm": 0.2584165632724762, + "learning_rate": 5.1918165286084176e-05, + "loss": 1.7812, + "step": 16394 + }, + { + "epoch": 5.032228360957642, + "grad_norm": 0.31678953766822815, + "learning_rate": 5.1913198375795346e-05, + "loss": 1.7341, + "step": 16395 + }, + { + "epoch": 5.032535297728668, + "grad_norm": 0.3527325391769409, + "learning_rate": 5.190823144659916e-05, + "loss": 1.7844, + "step": 16396 + }, + { + "epoch": 5.032842234499693, + "grad_norm": 0.29233935475349426, + "learning_rate": 5.1903264498544724e-05, + "loss": 1.7993, + "step": 16397 + }, + { + "epoch": 5.033149171270718, + "grad_norm": 0.24549467861652374, + "learning_rate": 5.1898297531681106e-05, + "loss": 1.7294, + "step": 16398 + }, + { + "epoch": 5.033456108041744, + "grad_norm": 0.3446930944919586, + "learning_rate": 5.18933305460574e-05, + "loss": 1.6818, + "step": 16399 + }, + { + "epoch": 5.033763044812768, + "grad_norm": 0.2628229856491089, + "learning_rate": 5.188836354172268e-05, + "loss": 1.7867, + "step": 16400 + }, + { + "epoch": 
5.0340699815837935, + "grad_norm": 0.26548629999160767, + "learning_rate": 5.188339651872607e-05, + "loss": 1.7448, + "step": 16401 + }, + { + "epoch": 5.034376918354819, + "grad_norm": 0.29242032766342163, + "learning_rate": 5.187842947711662e-05, + "loss": 1.7103, + "step": 16402 + }, + { + "epoch": 5.034683855125844, + "grad_norm": 0.2515408992767334, + "learning_rate": 5.187346241694343e-05, + "loss": 1.7865, + "step": 16403 + }, + { + "epoch": 5.0349907918968695, + "grad_norm": 0.2253103256225586, + "learning_rate": 5.186849533825559e-05, + "loss": 1.6993, + "step": 16404 + }, + { + "epoch": 5.035297728667895, + "grad_norm": 0.2743360102176666, + "learning_rate": 5.1863528241102154e-05, + "loss": 1.7532, + "step": 16405 + }, + { + "epoch": 5.035604665438919, + "grad_norm": 0.22807851433753967, + "learning_rate": 5.185856112553227e-05, + "loss": 1.7873, + "step": 16406 + }, + { + "epoch": 5.035911602209945, + "grad_norm": 0.23719090223312378, + "learning_rate": 5.1853593991594985e-05, + "loss": 1.7555, + "step": 16407 + }, + { + "epoch": 5.03621853898097, + "grad_norm": 0.2964477241039276, + "learning_rate": 5.184862683933941e-05, + "loss": 1.7204, + "step": 16408 + }, + { + "epoch": 5.036525475751995, + "grad_norm": 0.23717865347862244, + "learning_rate": 5.18436596688146e-05, + "loss": 1.7239, + "step": 16409 + }, + { + "epoch": 5.036832412523021, + "grad_norm": 0.22650085389614105, + "learning_rate": 5.1838692480069686e-05, + "loss": 1.7148, + "step": 16410 + }, + { + "epoch": 5.037139349294045, + "grad_norm": 0.25606781244277954, + "learning_rate": 5.183372527315371e-05, + "loss": 1.7916, + "step": 16411 + }, + { + "epoch": 5.03744628606507, + "grad_norm": 0.22266390919685364, + "learning_rate": 5.182875804811581e-05, + "loss": 1.7481, + "step": 16412 + }, + { + "epoch": 5.037753222836096, + "grad_norm": 0.23481780290603638, + "learning_rate": 5.1823790805005045e-05, + "loss": 1.8014, + "step": 16413 + }, + { + "epoch": 5.038060159607121, + "grad_norm": 
0.2629338800907135, + "learning_rate": 5.1818823543870506e-05, + "loss": 1.81, + "step": 16414 + }, + { + "epoch": 5.038367096378146, + "grad_norm": 0.22891482710838318, + "learning_rate": 5.18138562647613e-05, + "loss": 1.757, + "step": 16415 + }, + { + "epoch": 5.038674033149171, + "grad_norm": 0.2666641175746918, + "learning_rate": 5.180888896772649e-05, + "loss": 1.7457, + "step": 16416 + }, + { + "epoch": 5.038980969920196, + "grad_norm": 0.37610310316085815, + "learning_rate": 5.180392165281517e-05, + "loss": 1.8214, + "step": 16417 + }, + { + "epoch": 5.0392879066912215, + "grad_norm": 0.2521277964115143, + "learning_rate": 5.1798954320076455e-05, + "loss": 1.7731, + "step": 16418 + }, + { + "epoch": 5.039594843462247, + "grad_norm": 0.25097090005874634, + "learning_rate": 5.1793986969559415e-05, + "loss": 1.8029, + "step": 16419 + }, + { + "epoch": 5.039901780233272, + "grad_norm": 0.2946726381778717, + "learning_rate": 5.178901960131315e-05, + "loss": 1.7483, + "step": 16420 + }, + { + "epoch": 5.0402087170042975, + "grad_norm": 0.24240419268608093, + "learning_rate": 5.1784052215386736e-05, + "loss": 1.731, + "step": 16421 + }, + { + "epoch": 5.040515653775322, + "grad_norm": 0.2403198480606079, + "learning_rate": 5.177908481182926e-05, + "loss": 1.722, + "step": 16422 + }, + { + "epoch": 5.040822590546347, + "grad_norm": 0.3451874554157257, + "learning_rate": 5.177411739068985e-05, + "loss": 1.7562, + "step": 16423 + }, + { + "epoch": 5.041129527317373, + "grad_norm": 0.3244951069355011, + "learning_rate": 5.176914995201756e-05, + "loss": 1.7321, + "step": 16424 + }, + { + "epoch": 5.041436464088398, + "grad_norm": 0.2346230000257492, + "learning_rate": 5.176418249586149e-05, + "loss": 1.7839, + "step": 16425 + }, + { + "epoch": 5.041743400859423, + "grad_norm": 0.357022225856781, + "learning_rate": 5.1759215022270744e-05, + "loss": 1.7776, + "step": 16426 + }, + { + "epoch": 5.042050337630448, + "grad_norm": 0.259007066488266, + "learning_rate": 
5.17542475312944e-05, + "loss": 1.7544, + "step": 16427 + }, + { + "epoch": 5.042357274401473, + "grad_norm": 0.2516533136367798, + "learning_rate": 5.174928002298154e-05, + "loss": 1.7269, + "step": 16428 + }, + { + "epoch": 5.042664211172498, + "grad_norm": 0.3393619954586029, + "learning_rate": 5.174431249738129e-05, + "loss": 1.7487, + "step": 16429 + }, + { + "epoch": 5.042971147943524, + "grad_norm": 0.2730594873428345, + "learning_rate": 5.1739344954542714e-05, + "loss": 1.7468, + "step": 16430 + }, + { + "epoch": 5.043278084714549, + "grad_norm": 0.21233965456485748, + "learning_rate": 5.1734377394514914e-05, + "loss": 1.783, + "step": 16431 + }, + { + "epoch": 5.043585021485574, + "grad_norm": 0.3460896909236908, + "learning_rate": 5.1729409817346974e-05, + "loss": 1.7497, + "step": 16432 + }, + { + "epoch": 5.043891958256599, + "grad_norm": 0.31918221712112427, + "learning_rate": 5.1724442223088e-05, + "loss": 1.7834, + "step": 16433 + }, + { + "epoch": 5.044198895027624, + "grad_norm": 0.23016802966594696, + "learning_rate": 5.171947461178706e-05, + "loss": 1.7348, + "step": 16434 + }, + { + "epoch": 5.0445058317986495, + "grad_norm": 0.35758304595947266, + "learning_rate": 5.171450698349329e-05, + "loss": 1.7734, + "step": 16435 + }, + { + "epoch": 5.044812768569675, + "grad_norm": 0.279725581407547, + "learning_rate": 5.170953933825574e-05, + "loss": 1.7283, + "step": 16436 + }, + { + "epoch": 5.0451197053407, + "grad_norm": 0.23965120315551758, + "learning_rate": 5.170457167612354e-05, + "loss": 1.7606, + "step": 16437 + }, + { + "epoch": 5.045426642111725, + "grad_norm": 0.28026309609413147, + "learning_rate": 5.169960399714574e-05, + "loss": 1.7872, + "step": 16438 + }, + { + "epoch": 5.04573357888275, + "grad_norm": 0.3262448012828827, + "learning_rate": 5.169463630137146e-05, + "loss": 1.8654, + "step": 16439 + }, + { + "epoch": 5.046040515653775, + "grad_norm": 0.4249584674835205, + "learning_rate": 5.168966858884979e-05, + "loss": 1.7244, + 
"step": 16440 + }, + { + "epoch": 5.046347452424801, + "grad_norm": 0.3385370969772339, + "learning_rate": 5.168470085962984e-05, + "loss": 1.7745, + "step": 16441 + }, + { + "epoch": 5.046654389195826, + "grad_norm": 0.2321811318397522, + "learning_rate": 5.1679733113760675e-05, + "loss": 1.8093, + "step": 16442 + }, + { + "epoch": 5.04696132596685, + "grad_norm": 0.3426755368709564, + "learning_rate": 5.167476535129141e-05, + "loss": 1.7752, + "step": 16443 + }, + { + "epoch": 5.047268262737876, + "grad_norm": 0.27672505378723145, + "learning_rate": 5.166979757227114e-05, + "loss": 1.7619, + "step": 16444 + }, + { + "epoch": 5.047575199508901, + "grad_norm": 0.4111184775829315, + "learning_rate": 5.1664829776748925e-05, + "loss": 1.7672, + "step": 16445 + }, + { + "epoch": 5.047882136279926, + "grad_norm": 0.40139874815940857, + "learning_rate": 5.1659861964773905e-05, + "loss": 1.7753, + "step": 16446 + }, + { + "epoch": 5.048189073050952, + "grad_norm": 0.28931725025177, + "learning_rate": 5.165489413639516e-05, + "loss": 1.7607, + "step": 16447 + }, + { + "epoch": 5.048496009821977, + "grad_norm": 0.297538161277771, + "learning_rate": 5.1649926291661775e-05, + "loss": 1.7661, + "step": 16448 + }, + { + "epoch": 5.0488029465930016, + "grad_norm": 0.4299027621746063, + "learning_rate": 5.1644958430622846e-05, + "loss": 1.6998, + "step": 16449 + }, + { + "epoch": 5.049109883364027, + "grad_norm": 0.2554767429828644, + "learning_rate": 5.163999055332749e-05, + "loss": 1.7716, + "step": 16450 + }, + { + "epoch": 5.049416820135052, + "grad_norm": 0.3561006486415863, + "learning_rate": 5.163502265982477e-05, + "loss": 1.7493, + "step": 16451 + }, + { + "epoch": 5.0497237569060776, + "grad_norm": 0.3839687407016754, + "learning_rate": 5.1630054750163806e-05, + "loss": 1.7314, + "step": 16452 + }, + { + "epoch": 5.050030693677103, + "grad_norm": 0.20022284984588623, + "learning_rate": 5.1625086824393684e-05, + "loss": 1.6992, + "step": 16453 + }, + { + "epoch": 
5.050337630448127, + "grad_norm": 0.36830398440361023, + "learning_rate": 5.162011888256349e-05, + "loss": 1.7339, + "step": 16454 + }, + { + "epoch": 5.050644567219153, + "grad_norm": 0.31947389245033264, + "learning_rate": 5.161515092472236e-05, + "loss": 1.7254, + "step": 16455 + }, + { + "epoch": 5.050951503990178, + "grad_norm": 0.2779252827167511, + "learning_rate": 5.161018295091933e-05, + "loss": 1.7941, + "step": 16456 + }, + { + "epoch": 5.051258440761203, + "grad_norm": 0.3796578347682953, + "learning_rate": 5.160521496120354e-05, + "loss": 1.7389, + "step": 16457 + }, + { + "epoch": 5.051565377532229, + "grad_norm": 0.23569442331790924, + "learning_rate": 5.1600246955624076e-05, + "loss": 1.7149, + "step": 16458 + }, + { + "epoch": 5.051872314303253, + "grad_norm": 0.27342507243156433, + "learning_rate": 5.159527893423004e-05, + "loss": 1.699, + "step": 16459 + }, + { + "epoch": 5.0521792510742785, + "grad_norm": 0.2877296209335327, + "learning_rate": 5.159031089707052e-05, + "loss": 1.7668, + "step": 16460 + }, + { + "epoch": 5.052486187845304, + "grad_norm": 0.21482446789741516, + "learning_rate": 5.1585342844194605e-05, + "loss": 1.7132, + "step": 16461 + }, + { + "epoch": 5.052793124616329, + "grad_norm": 0.23588669300079346, + "learning_rate": 5.158037477565142e-05, + "loss": 1.7267, + "step": 16462 + }, + { + "epoch": 5.0531000613873545, + "grad_norm": 0.20188623666763306, + "learning_rate": 5.157540669149003e-05, + "loss": 1.7486, + "step": 16463 + }, + { + "epoch": 5.05340699815838, + "grad_norm": 0.2012643963098526, + "learning_rate": 5.157043859175955e-05, + "loss": 1.718, + "step": 16464 + }, + { + "epoch": 5.053713934929404, + "grad_norm": 0.23133818805217743, + "learning_rate": 5.156547047650908e-05, + "loss": 1.7892, + "step": 16465 + }, + { + "epoch": 5.05402087170043, + "grad_norm": 0.2524542510509491, + "learning_rate": 5.156050234578771e-05, + "loss": 1.8034, + "step": 16466 + }, + { + "epoch": 5.054327808471455, + "grad_norm": 
0.20992529392242432, + "learning_rate": 5.155553419964454e-05, + "loss": 1.7158, + "step": 16467 + }, + { + "epoch": 5.05463474524248, + "grad_norm": 0.23815447092056274, + "learning_rate": 5.155056603812868e-05, + "loss": 1.7632, + "step": 16468 + }, + { + "epoch": 5.054941682013506, + "grad_norm": 0.3306051790714264, + "learning_rate": 5.1545597861289205e-05, + "loss": 1.7719, + "step": 16469 + }, + { + "epoch": 5.05524861878453, + "grad_norm": 0.287541925907135, + "learning_rate": 5.154062966917523e-05, + "loss": 1.7092, + "step": 16470 + }, + { + "epoch": 5.055555555555555, + "grad_norm": 0.28186658024787903, + "learning_rate": 5.153566146183586e-05, + "loss": 1.8548, + "step": 16471 + }, + { + "epoch": 5.055862492326581, + "grad_norm": 0.3511136472225189, + "learning_rate": 5.153069323932017e-05, + "loss": 1.8029, + "step": 16472 + }, + { + "epoch": 5.056169429097606, + "grad_norm": 0.32083824276924133, + "learning_rate": 5.152572500167728e-05, + "loss": 1.7321, + "step": 16473 + }, + { + "epoch": 5.056476365868631, + "grad_norm": 0.22571051120758057, + "learning_rate": 5.1520756748956265e-05, + "loss": 1.7218, + "step": 16474 + }, + { + "epoch": 5.056783302639656, + "grad_norm": 0.2902646064758301, + "learning_rate": 5.151578848120626e-05, + "loss": 1.7231, + "step": 16475 + }, + { + "epoch": 5.057090239410681, + "grad_norm": 0.20447610318660736, + "learning_rate": 5.1510820198476336e-05, + "loss": 1.6998, + "step": 16476 + }, + { + "epoch": 5.0573971761817065, + "grad_norm": 0.29436638951301575, + "learning_rate": 5.1505851900815606e-05, + "loss": 1.6793, + "step": 16477 + }, + { + "epoch": 5.057704112952732, + "grad_norm": 0.29718565940856934, + "learning_rate": 5.1500883588273164e-05, + "loss": 1.8322, + "step": 16478 + }, + { + "epoch": 5.058011049723757, + "grad_norm": 0.23530519008636475, + "learning_rate": 5.149591526089811e-05, + "loss": 1.7408, + "step": 16479 + }, + { + "epoch": 5.0583179864947825, + "grad_norm": 0.30735042691230774, + 
"learning_rate": 5.1490946918739536e-05, + "loss": 1.7454, + "step": 16480 + }, + { + "epoch": 5.058624923265807, + "grad_norm": 0.26151445508003235, + "learning_rate": 5.148597856184656e-05, + "loss": 1.7728, + "step": 16481 + }, + { + "epoch": 5.058931860036832, + "grad_norm": 0.2657756209373474, + "learning_rate": 5.1481010190268263e-05, + "loss": 1.7905, + "step": 16482 + }, + { + "epoch": 5.059238796807858, + "grad_norm": 0.25418251752853394, + "learning_rate": 5.147604180405376e-05, + "loss": 1.7676, + "step": 16483 + }, + { + "epoch": 5.059545733578883, + "grad_norm": 0.25486254692077637, + "learning_rate": 5.1471073403252154e-05, + "loss": 1.8347, + "step": 16484 + }, + { + "epoch": 5.059852670349908, + "grad_norm": 0.22693100571632385, + "learning_rate": 5.146610498791255e-05, + "loss": 1.7308, + "step": 16485 + }, + { + "epoch": 5.060159607120933, + "grad_norm": 0.22056837379932404, + "learning_rate": 5.146113655808401e-05, + "loss": 1.7158, + "step": 16486 + }, + { + "epoch": 5.060466543891958, + "grad_norm": 0.221246138215065, + "learning_rate": 5.1456168113815685e-05, + "loss": 1.6985, + "step": 16487 + }, + { + "epoch": 5.060773480662983, + "grad_norm": 0.2149408906698227, + "learning_rate": 5.145119965515664e-05, + "loss": 1.716, + "step": 16488 + }, + { + "epoch": 5.061080417434009, + "grad_norm": 0.23958513140678406, + "learning_rate": 5.144623118215599e-05, + "loss": 1.8092, + "step": 16489 + }, + { + "epoch": 5.061387354205034, + "grad_norm": 0.2870621085166931, + "learning_rate": 5.1441262694862836e-05, + "loss": 1.75, + "step": 16490 + }, + { + "epoch": 5.0616942909760585, + "grad_norm": 0.26755061745643616, + "learning_rate": 5.1436294193326276e-05, + "loss": 1.7848, + "step": 16491 + }, + { + "epoch": 5.062001227747084, + "grad_norm": 0.2434249073266983, + "learning_rate": 5.143132567759542e-05, + "loss": 1.7487, + "step": 16492 + }, + { + "epoch": 5.062308164518109, + "grad_norm": 0.3044668138027191, + "learning_rate": 5.142635714771936e-05, 
+ "loss": 1.741, + "step": 16493 + }, + { + "epoch": 5.0626151012891345, + "grad_norm": 0.2166958749294281, + "learning_rate": 5.142138860374721e-05, + "loss": 1.7232, + "step": 16494 + }, + { + "epoch": 5.06292203806016, + "grad_norm": 0.34558552503585815, + "learning_rate": 5.141642004572806e-05, + "loss": 1.7663, + "step": 16495 + }, + { + "epoch": 5.063228974831185, + "grad_norm": 0.330751895904541, + "learning_rate": 5.141145147371102e-05, + "loss": 1.6818, + "step": 16496 + }, + { + "epoch": 5.06353591160221, + "grad_norm": 0.21613973379135132, + "learning_rate": 5.140648288774518e-05, + "loss": 1.7914, + "step": 16497 + }, + { + "epoch": 5.063842848373235, + "grad_norm": 0.32759732007980347, + "learning_rate": 5.140151428787966e-05, + "loss": 1.7543, + "step": 16498 + }, + { + "epoch": 5.06414978514426, + "grad_norm": 0.3180293142795563, + "learning_rate": 5.1396545674163556e-05, + "loss": 1.8163, + "step": 16499 + }, + { + "epoch": 5.064456721915286, + "grad_norm": 0.19757944345474243, + "learning_rate": 5.1391577046645964e-05, + "loss": 1.71, + "step": 16500 + }, + { + "epoch": 5.064763658686311, + "grad_norm": 0.253366619348526, + "learning_rate": 5.1386608405376005e-05, + "loss": 1.7266, + "step": 16501 + }, + { + "epoch": 5.065070595457335, + "grad_norm": 0.24577608704566956, + "learning_rate": 5.1381639750402754e-05, + "loss": 1.7218, + "step": 16502 + }, + { + "epoch": 5.065377532228361, + "grad_norm": 0.22847014665603638, + "learning_rate": 5.137667108177533e-05, + "loss": 1.8025, + "step": 16503 + }, + { + "epoch": 5.065684468999386, + "grad_norm": 0.2089833766222, + "learning_rate": 5.137170239954284e-05, + "loss": 1.8032, + "step": 16504 + }, + { + "epoch": 5.065991405770411, + "grad_norm": 0.21528512239456177, + "learning_rate": 5.136673370375439e-05, + "loss": 1.7227, + "step": 16505 + }, + { + "epoch": 5.066298342541437, + "grad_norm": 0.2099117785692215, + "learning_rate": 5.1361764994459074e-05, + "loss": 1.7176, + "step": 16506 + }, + { + 
"epoch": 5.066605279312462, + "grad_norm": 0.2140430212020874, + "learning_rate": 5.135679627170599e-05, + "loss": 1.8195, + "step": 16507 + }, + { + "epoch": 5.0669122160834865, + "grad_norm": 0.20253533124923706, + "learning_rate": 5.135182753554424e-05, + "loss": 1.7284, + "step": 16508 + }, + { + "epoch": 5.067219152854512, + "grad_norm": 0.19945639371871948, + "learning_rate": 5.134685878602295e-05, + "loss": 1.6915, + "step": 16509 + }, + { + "epoch": 5.067526089625537, + "grad_norm": 0.20138494670391083, + "learning_rate": 5.1341890023191216e-05, + "loss": 1.7856, + "step": 16510 + }, + { + "epoch": 5.0678330263965625, + "grad_norm": 0.22124232351779938, + "learning_rate": 5.1336921247098136e-05, + "loss": 1.7674, + "step": 16511 + }, + { + "epoch": 5.068139963167588, + "grad_norm": 0.21564216911792755, + "learning_rate": 5.133195245779282e-05, + "loss": 1.6998, + "step": 16512 + }, + { + "epoch": 5.068446899938612, + "grad_norm": 0.21836799383163452, + "learning_rate": 5.1326983655324365e-05, + "loss": 1.7468, + "step": 16513 + }, + { + "epoch": 5.068753836709638, + "grad_norm": 0.2412201464176178, + "learning_rate": 5.132201483974187e-05, + "loss": 1.7433, + "step": 16514 + }, + { + "epoch": 5.069060773480663, + "grad_norm": 0.262054979801178, + "learning_rate": 5.131704601109446e-05, + "loss": 1.8315, + "step": 16515 + }, + { + "epoch": 5.069367710251688, + "grad_norm": 0.21573080122470856, + "learning_rate": 5.1312077169431225e-05, + "loss": 1.7668, + "step": 16516 + }, + { + "epoch": 5.069674647022714, + "grad_norm": 0.21407057344913483, + "learning_rate": 5.130710831480129e-05, + "loss": 1.7486, + "step": 16517 + }, + { + "epoch": 5.069981583793738, + "grad_norm": 0.2128407508134842, + "learning_rate": 5.130213944725373e-05, + "loss": 1.7618, + "step": 16518 + }, + { + "epoch": 5.070288520564763, + "grad_norm": 0.2034141719341278, + "learning_rate": 5.129717056683767e-05, + "loss": 1.726, + "step": 16519 + }, + { + "epoch": 5.070595457335789, + 
"grad_norm": 0.21474458277225494, + "learning_rate": 5.1292201673602205e-05, + "loss": 1.7883, + "step": 16520 + }, + { + "epoch": 5.070902394106814, + "grad_norm": 0.2102673202753067, + "learning_rate": 5.128723276759645e-05, + "loss": 1.7826, + "step": 16521 + }, + { + "epoch": 5.071209330877839, + "grad_norm": 0.21342496573925018, + "learning_rate": 5.1282263848869505e-05, + "loss": 1.7561, + "step": 16522 + }, + { + "epoch": 5.071516267648865, + "grad_norm": 0.21749620139598846, + "learning_rate": 5.1277294917470474e-05, + "loss": 1.7814, + "step": 16523 + }, + { + "epoch": 5.071823204419889, + "grad_norm": 0.20006774365901947, + "learning_rate": 5.1272325973448476e-05, + "loss": 1.6965, + "step": 16524 + }, + { + "epoch": 5.0721301411909145, + "grad_norm": 0.20878590643405914, + "learning_rate": 5.1267357016852593e-05, + "loss": 1.7426, + "step": 16525 + }, + { + "epoch": 5.07243707796194, + "grad_norm": 0.21824820339679718, + "learning_rate": 5.1262388047731946e-05, + "loss": 1.7704, + "step": 16526 + }, + { + "epoch": 5.072744014732965, + "grad_norm": 0.1992526650428772, + "learning_rate": 5.125741906613565e-05, + "loss": 1.7874, + "step": 16527 + }, + { + "epoch": 5.0730509515039905, + "grad_norm": 0.21028028428554535, + "learning_rate": 5.12524500721128e-05, + "loss": 1.7483, + "step": 16528 + }, + { + "epoch": 5.073357888275015, + "grad_norm": 0.21840833127498627, + "learning_rate": 5.12474810657125e-05, + "loss": 1.7763, + "step": 16529 + }, + { + "epoch": 5.07366482504604, + "grad_norm": 0.249269038438797, + "learning_rate": 5.124251204698387e-05, + "loss": 1.7451, + "step": 16530 + }, + { + "epoch": 5.073971761817066, + "grad_norm": 0.2176963835954666, + "learning_rate": 5.1237543015975986e-05, + "loss": 1.7079, + "step": 16531 + }, + { + "epoch": 5.074278698588091, + "grad_norm": 0.20284616947174072, + "learning_rate": 5.1232573972738e-05, + "loss": 1.7235, + "step": 16532 + }, + { + "epoch": 5.074585635359116, + "grad_norm": 0.20140530169010162, + 
"learning_rate": 5.1227604917318984e-05, + "loss": 1.7014, + "step": 16533 + }, + { + "epoch": 5.074892572130141, + "grad_norm": 0.2407023161649704, + "learning_rate": 5.1222635849768066e-05, + "loss": 1.7493, + "step": 16534 + }, + { + "epoch": 5.075199508901166, + "grad_norm": 0.2013770490884781, + "learning_rate": 5.121766677013433e-05, + "loss": 1.7601, + "step": 16535 + }, + { + "epoch": 5.0755064456721914, + "grad_norm": 0.23889221251010895, + "learning_rate": 5.1212697678466916e-05, + "loss": 1.7282, + "step": 16536 + }, + { + "epoch": 5.075813382443217, + "grad_norm": 0.2411198765039444, + "learning_rate": 5.120772857481489e-05, + "loss": 1.8138, + "step": 16537 + }, + { + "epoch": 5.076120319214242, + "grad_norm": 0.24521365761756897, + "learning_rate": 5.12027594592274e-05, + "loss": 1.7659, + "step": 16538 + }, + { + "epoch": 5.0764272559852675, + "grad_norm": 0.2841372787952423, + "learning_rate": 5.119779033175354e-05, + "loss": 1.7973, + "step": 16539 + }, + { + "epoch": 5.076734192756292, + "grad_norm": 0.21796928346157074, + "learning_rate": 5.1192821192442395e-05, + "loss": 1.6985, + "step": 16540 + }, + { + "epoch": 5.077041129527317, + "grad_norm": 0.2244848757982254, + "learning_rate": 5.118785204134311e-05, + "loss": 1.7413, + "step": 16541 + }, + { + "epoch": 5.077348066298343, + "grad_norm": 0.22581063210964203, + "learning_rate": 5.1182882878504766e-05, + "loss": 1.7706, + "step": 16542 + }, + { + "epoch": 5.077655003069368, + "grad_norm": 0.24478016793727875, + "learning_rate": 5.117791370397647e-05, + "loss": 1.7628, + "step": 16543 + }, + { + "epoch": 5.077961939840393, + "grad_norm": 0.31270188093185425, + "learning_rate": 5.117294451780734e-05, + "loss": 1.8254, + "step": 16544 + }, + { + "epoch": 5.078268876611418, + "grad_norm": 0.3547368049621582, + "learning_rate": 5.11679753200465e-05, + "loss": 1.781, + "step": 16545 + }, + { + "epoch": 5.078575813382443, + "grad_norm": 0.24920180439949036, + "learning_rate": 
5.116300611074304e-05, + "loss": 1.7748, + "step": 16546 + }, + { + "epoch": 5.078882750153468, + "grad_norm": 0.2368776649236679, + "learning_rate": 5.115803688994607e-05, + "loss": 1.7459, + "step": 16547 + }, + { + "epoch": 5.079189686924494, + "grad_norm": 0.28341975808143616, + "learning_rate": 5.115306765770471e-05, + "loss": 1.6694, + "step": 16548 + }, + { + "epoch": 5.079496623695519, + "grad_norm": 0.2521432936191559, + "learning_rate": 5.114809841406804e-05, + "loss": 1.7544, + "step": 16549 + }, + { + "epoch": 5.0798035604665435, + "grad_norm": 0.21199844777584076, + "learning_rate": 5.11431291590852e-05, + "loss": 1.7215, + "step": 16550 + }, + { + "epoch": 5.080110497237569, + "grad_norm": 0.25157347321510315, + "learning_rate": 5.113815989280528e-05, + "loss": 1.8021, + "step": 16551 + }, + { + "epoch": 5.080417434008594, + "grad_norm": 0.2284129559993744, + "learning_rate": 5.1133190615277414e-05, + "loss": 1.7125, + "step": 16552 + }, + { + "epoch": 5.0807243707796195, + "grad_norm": 0.2297726720571518, + "learning_rate": 5.11282213265507e-05, + "loss": 1.7602, + "step": 16553 + }, + { + "epoch": 5.081031307550645, + "grad_norm": 0.22392617166042328, + "learning_rate": 5.112325202667421e-05, + "loss": 1.7251, + "step": 16554 + }, + { + "epoch": 5.08133824432167, + "grad_norm": 0.22406147420406342, + "learning_rate": 5.11182827156971e-05, + "loss": 1.7232, + "step": 16555 + }, + { + "epoch": 5.081645181092695, + "grad_norm": 0.2547284960746765, + "learning_rate": 5.111331339366846e-05, + "loss": 1.7335, + "step": 16556 + }, + { + "epoch": 5.08195211786372, + "grad_norm": 0.216146782040596, + "learning_rate": 5.1108344060637415e-05, + "loss": 1.7469, + "step": 16557 + }, + { + "epoch": 5.082259054634745, + "grad_norm": 0.1926967352628708, + "learning_rate": 5.110337471665306e-05, + "loss": 1.7492, + "step": 16558 + }, + { + "epoch": 5.082565991405771, + "grad_norm": 0.30311331152915955, + "learning_rate": 5.109840536176451e-05, + "loss": 1.8129, + 
"step": 16559 + }, + { + "epoch": 5.082872928176796, + "grad_norm": 0.24273787438869476, + "learning_rate": 5.109343599602087e-05, + "loss": 1.7206, + "step": 16560 + }, + { + "epoch": 5.08317986494782, + "grad_norm": 0.22736592590808868, + "learning_rate": 5.1088466619471255e-05, + "loss": 1.732, + "step": 16561 + }, + { + "epoch": 5.083486801718846, + "grad_norm": 0.21457640826702118, + "learning_rate": 5.1083497232164777e-05, + "loss": 1.726, + "step": 16562 + }, + { + "epoch": 5.083793738489871, + "grad_norm": 0.20968590676784515, + "learning_rate": 5.107852783415055e-05, + "loss": 1.8095, + "step": 16563 + }, + { + "epoch": 5.084100675260896, + "grad_norm": 0.2846728265285492, + "learning_rate": 5.107355842547768e-05, + "loss": 1.7524, + "step": 16564 + }, + { + "epoch": 5.084407612031922, + "grad_norm": 0.21162885427474976, + "learning_rate": 5.106858900619526e-05, + "loss": 1.753, + "step": 16565 + }, + { + "epoch": 5.084714548802946, + "grad_norm": 0.24349012970924377, + "learning_rate": 5.106361957635242e-05, + "loss": 1.7003, + "step": 16566 + }, + { + "epoch": 5.0850214855739715, + "grad_norm": 0.24532537162303925, + "learning_rate": 5.105865013599828e-05, + "loss": 1.7818, + "step": 16567 + }, + { + "epoch": 5.085328422344997, + "grad_norm": 0.22788558900356293, + "learning_rate": 5.1053680685181926e-05, + "loss": 1.7291, + "step": 16568 + }, + { + "epoch": 5.085635359116022, + "grad_norm": 0.22402508556842804, + "learning_rate": 5.10487112239525e-05, + "loss": 1.8292, + "step": 16569 + }, + { + "epoch": 5.0859422958870475, + "grad_norm": 0.2396162748336792, + "learning_rate": 5.1043741752359085e-05, + "loss": 1.7441, + "step": 16570 + }, + { + "epoch": 5.086249232658073, + "grad_norm": 0.22364887595176697, + "learning_rate": 5.1038772270450796e-05, + "loss": 1.7356, + "step": 16571 + }, + { + "epoch": 5.086556169429097, + "grad_norm": 0.20385414361953735, + "learning_rate": 5.103380277827676e-05, + "loss": 1.774, + "step": 16572 + }, + { + "epoch": 
5.086863106200123, + "grad_norm": 0.2050715535879135, + "learning_rate": 5.102883327588608e-05, + "loss": 1.7217, + "step": 16573 + }, + { + "epoch": 5.087170042971148, + "grad_norm": 0.23750410974025726, + "learning_rate": 5.102386376332786e-05, + "loss": 1.7605, + "step": 16574 + }, + { + "epoch": 5.087476979742173, + "grad_norm": 0.24313338100910187, + "learning_rate": 5.101889424065122e-05, + "loss": 1.7498, + "step": 16575 + }, + { + "epoch": 5.087783916513199, + "grad_norm": 0.22145850956439972, + "learning_rate": 5.101392470790527e-05, + "loss": 1.7827, + "step": 16576 + }, + { + "epoch": 5.088090853284223, + "grad_norm": 0.23073779046535492, + "learning_rate": 5.100895516513912e-05, + "loss": 1.7722, + "step": 16577 + }, + { + "epoch": 5.088397790055248, + "grad_norm": 0.2112295925617218, + "learning_rate": 5.100398561240188e-05, + "loss": 1.7755, + "step": 16578 + }, + { + "epoch": 5.088704726826274, + "grad_norm": 0.23263800144195557, + "learning_rate": 5.0999016049742675e-05, + "loss": 1.7593, + "step": 16579 + }, + { + "epoch": 5.089011663597299, + "grad_norm": 0.23011381924152374, + "learning_rate": 5.09940464772106e-05, + "loss": 1.704, + "step": 16580 + }, + { + "epoch": 5.089318600368324, + "grad_norm": 0.1930779367685318, + "learning_rate": 5.0989076894854785e-05, + "loss": 1.7038, + "step": 16581 + }, + { + "epoch": 5.08962553713935, + "grad_norm": 0.2100505381822586, + "learning_rate": 5.098410730272433e-05, + "loss": 1.7671, + "step": 16582 + }, + { + "epoch": 5.089932473910374, + "grad_norm": 0.1919277459383011, + "learning_rate": 5.097913770086833e-05, + "loss": 1.651, + "step": 16583 + }, + { + "epoch": 5.0902394106813995, + "grad_norm": 0.23310615122318268, + "learning_rate": 5.097416808933594e-05, + "loss": 1.8294, + "step": 16584 + }, + { + "epoch": 5.090546347452425, + "grad_norm": 0.26191771030426025, + "learning_rate": 5.096919846817624e-05, + "loss": 1.7522, + "step": 16585 + }, + { + "epoch": 5.09085328422345, + "grad_norm": 
0.2508419156074524, + "learning_rate": 5.096422883743835e-05, + "loss": 1.8025, + "step": 16586 + }, + { + "epoch": 5.0911602209944755, + "grad_norm": 0.23192499577999115, + "learning_rate": 5.0959259197171414e-05, + "loss": 1.7885, + "step": 16587 + }, + { + "epoch": 5.0914671577655, + "grad_norm": 0.2164602279663086, + "learning_rate": 5.095428954742448e-05, + "loss": 1.7299, + "step": 16588 + }, + { + "epoch": 5.091774094536525, + "grad_norm": 0.21431668102741241, + "learning_rate": 5.094931988824671e-05, + "loss": 1.7122, + "step": 16589 + }, + { + "epoch": 5.092081031307551, + "grad_norm": 0.20563583076000214, + "learning_rate": 5.094435021968722e-05, + "loss": 1.7118, + "step": 16590 + }, + { + "epoch": 5.092387968078576, + "grad_norm": 0.20916326344013214, + "learning_rate": 5.093938054179509e-05, + "loss": 1.7639, + "step": 16591 + }, + { + "epoch": 5.092694904849601, + "grad_norm": 0.21197481453418732, + "learning_rate": 5.0934410854619454e-05, + "loss": 1.7357, + "step": 16592 + }, + { + "epoch": 5.093001841620626, + "grad_norm": 0.21085995435714722, + "learning_rate": 5.092944115820942e-05, + "loss": 1.6921, + "step": 16593 + }, + { + "epoch": 5.093308778391651, + "grad_norm": 0.2608145773410797, + "learning_rate": 5.09244714526141e-05, + "loss": 1.7541, + "step": 16594 + }, + { + "epoch": 5.093615715162676, + "grad_norm": 0.2138587087392807, + "learning_rate": 5.0919501737882624e-05, + "loss": 1.727, + "step": 16595 + }, + { + "epoch": 5.093922651933702, + "grad_norm": 0.230251282453537, + "learning_rate": 5.0914532014064084e-05, + "loss": 1.7828, + "step": 16596 + }, + { + "epoch": 5.094229588704727, + "grad_norm": 0.2162851244211197, + "learning_rate": 5.0909562281207614e-05, + "loss": 1.6905, + "step": 16597 + }, + { + "epoch": 5.094536525475752, + "grad_norm": 0.20637664198875427, + "learning_rate": 5.090459253936231e-05, + "loss": 1.7484, + "step": 16598 + }, + { + "epoch": 5.094843462246777, + "grad_norm": 0.19427815079689026, + "learning_rate": 
5.089962278857728e-05, + "loss": 1.7379, + "step": 16599 + }, + { + "epoch": 5.095150399017802, + "grad_norm": 0.1877593845129013, + "learning_rate": 5.089465302890165e-05, + "loss": 1.7017, + "step": 16600 + }, + { + "epoch": 5.0954573357888275, + "grad_norm": 0.19219037890434265, + "learning_rate": 5.0889683260384543e-05, + "loss": 1.7379, + "step": 16601 + }, + { + "epoch": 5.095764272559853, + "grad_norm": 0.19855685532093048, + "learning_rate": 5.088471348307507e-05, + "loss": 1.7171, + "step": 16602 + }, + { + "epoch": 5.096071209330878, + "grad_norm": 0.19119660556316376, + "learning_rate": 5.087974369702235e-05, + "loss": 1.6912, + "step": 16603 + }, + { + "epoch": 5.096378146101903, + "grad_norm": 0.2102670818567276, + "learning_rate": 5.0874773902275476e-05, + "loss": 1.6825, + "step": 16604 + }, + { + "epoch": 5.096685082872928, + "grad_norm": 0.2120765596628189, + "learning_rate": 5.0869804098883564e-05, + "loss": 1.7055, + "step": 16605 + }, + { + "epoch": 5.096992019643953, + "grad_norm": 0.25874772667884827, + "learning_rate": 5.0864834286895745e-05, + "loss": 1.7193, + "step": 16606 + }, + { + "epoch": 5.097298956414979, + "grad_norm": 0.20822012424468994, + "learning_rate": 5.085986446636113e-05, + "loss": 1.6748, + "step": 16607 + }, + { + "epoch": 5.097605893186004, + "grad_norm": 0.21364718675613403, + "learning_rate": 5.085489463732883e-05, + "loss": 1.7762, + "step": 16608 + }, + { + "epoch": 5.097912829957028, + "grad_norm": 0.21961788833141327, + "learning_rate": 5.084992479984796e-05, + "loss": 1.7243, + "step": 16609 + }, + { + "epoch": 5.098219766728054, + "grad_norm": 0.22056026756763458, + "learning_rate": 5.0844954953967624e-05, + "loss": 1.6983, + "step": 16610 + }, + { + "epoch": 5.098526703499079, + "grad_norm": 0.21347738802433014, + "learning_rate": 5.083998509973695e-05, + "loss": 1.7319, + "step": 16611 + }, + { + "epoch": 5.098833640270104, + "grad_norm": 0.23593664169311523, + "learning_rate": 5.083501523720506e-05, + "loss": 
1.7121, + "step": 16612 + }, + { + "epoch": 5.09914057704113, + "grad_norm": 0.2088623344898224, + "learning_rate": 5.0830045366421055e-05, + "loss": 1.72, + "step": 16613 + }, + { + "epoch": 5.099447513812155, + "grad_norm": 0.2293832004070282, + "learning_rate": 5.082507548743406e-05, + "loss": 1.7548, + "step": 16614 + }, + { + "epoch": 5.0997544505831796, + "grad_norm": 0.2509057819843292, + "learning_rate": 5.082010560029319e-05, + "loss": 1.7729, + "step": 16615 + }, + { + "epoch": 5.100061387354205, + "grad_norm": 0.1925390362739563, + "learning_rate": 5.081513570504755e-05, + "loss": 1.7109, + "step": 16616 + }, + { + "epoch": 5.10036832412523, + "grad_norm": 0.20876559615135193, + "learning_rate": 5.081016580174626e-05, + "loss": 1.7031, + "step": 16617 + }, + { + "epoch": 5.100675260896256, + "grad_norm": 0.2038683146238327, + "learning_rate": 5.080519589043842e-05, + "loss": 1.7489, + "step": 16618 + }, + { + "epoch": 5.100982197667281, + "grad_norm": 0.25018224120140076, + "learning_rate": 5.080022597117318e-05, + "loss": 1.7884, + "step": 16619 + }, + { + "epoch": 5.101289134438305, + "grad_norm": 0.24430342018604279, + "learning_rate": 5.079525604399965e-05, + "loss": 1.7558, + "step": 16620 + }, + { + "epoch": 5.101596071209331, + "grad_norm": 0.22151432931423187, + "learning_rate": 5.079028610896692e-05, + "loss": 1.7543, + "step": 16621 + }, + { + "epoch": 5.101903007980356, + "grad_norm": 0.2313055694103241, + "learning_rate": 5.0785316166124107e-05, + "loss": 1.7755, + "step": 16622 + }, + { + "epoch": 5.102209944751381, + "grad_norm": 0.27405816316604614, + "learning_rate": 5.0780346215520355e-05, + "loss": 1.7006, + "step": 16623 + }, + { + "epoch": 5.102516881522407, + "grad_norm": 0.2209920734167099, + "learning_rate": 5.077537625720476e-05, + "loss": 1.6877, + "step": 16624 + }, + { + "epoch": 5.102823818293431, + "grad_norm": 0.20993784070014954, + "learning_rate": 5.077040629122645e-05, + "loss": 1.7558, + "step": 16625 + }, + { + "epoch": 
5.1031307550644565, + "grad_norm": 0.25554344058036804, + "learning_rate": 5.076543631763453e-05, + "loss": 1.7142, + "step": 16626 + }, + { + "epoch": 5.103437691835482, + "grad_norm": 0.28980588912963867, + "learning_rate": 5.0760466336478116e-05, + "loss": 1.7632, + "step": 16627 + }, + { + "epoch": 5.103744628606507, + "grad_norm": 0.20144744217395782, + "learning_rate": 5.075549634780633e-05, + "loss": 1.7472, + "step": 16628 + }, + { + "epoch": 5.1040515653775325, + "grad_norm": 0.30335596203804016, + "learning_rate": 5.075052635166827e-05, + "loss": 1.7283, + "step": 16629 + }, + { + "epoch": 5.104358502148558, + "grad_norm": 0.3014097213745117, + "learning_rate": 5.074555634811309e-05, + "loss": 1.7273, + "step": 16630 + }, + { + "epoch": 5.104665438919582, + "grad_norm": 0.20123563706874847, + "learning_rate": 5.074058633718988e-05, + "loss": 1.7119, + "step": 16631 + }, + { + "epoch": 5.104972375690608, + "grad_norm": 0.3375137746334076, + "learning_rate": 5.073561631894776e-05, + "loss": 1.7594, + "step": 16632 + }, + { + "epoch": 5.105279312461633, + "grad_norm": 0.3471776247024536, + "learning_rate": 5.0730646293435846e-05, + "loss": 1.729, + "step": 16633 + }, + { + "epoch": 5.105586249232658, + "grad_norm": 0.26405471563339233, + "learning_rate": 5.072567626070327e-05, + "loss": 1.7472, + "step": 16634 + }, + { + "epoch": 5.105893186003684, + "grad_norm": 0.2339334636926651, + "learning_rate": 5.072070622079911e-05, + "loss": 1.7285, + "step": 16635 + }, + { + "epoch": 5.106200122774708, + "grad_norm": 0.26267752051353455, + "learning_rate": 5.0715736173772534e-05, + "loss": 1.7171, + "step": 16636 + }, + { + "epoch": 5.106507059545733, + "grad_norm": 0.22254765033721924, + "learning_rate": 5.0710766119672626e-05, + "loss": 1.7702, + "step": 16637 + }, + { + "epoch": 5.106813996316759, + "grad_norm": 0.2457888424396515, + "learning_rate": 5.070579605854852e-05, + "loss": 1.7987, + "step": 16638 + }, + { + "epoch": 5.107120933087784, + "grad_norm": 
0.24500930309295654, + "learning_rate": 5.070082599044931e-05, + "loss": 1.8103, + "step": 16639 + }, + { + "epoch": 5.107427869858809, + "grad_norm": 0.24446405470371246, + "learning_rate": 5.0695855915424116e-05, + "loss": 1.7058, + "step": 16640 + }, + { + "epoch": 5.107734806629834, + "grad_norm": 0.22352534532546997, + "learning_rate": 5.0690885833522086e-05, + "loss": 1.7503, + "step": 16641 + }, + { + "epoch": 5.108041743400859, + "grad_norm": 0.2308795005083084, + "learning_rate": 5.068591574479231e-05, + "loss": 1.8064, + "step": 16642 + }, + { + "epoch": 5.1083486801718845, + "grad_norm": 0.23804180324077606, + "learning_rate": 5.068094564928392e-05, + "loss": 1.7603, + "step": 16643 + }, + { + "epoch": 5.10865561694291, + "grad_norm": 0.1956508308649063, + "learning_rate": 5.0675975547046016e-05, + "loss": 1.7448, + "step": 16644 + }, + { + "epoch": 5.108962553713935, + "grad_norm": 0.24438725411891937, + "learning_rate": 5.067100543812773e-05, + "loss": 1.7706, + "step": 16645 + }, + { + "epoch": 5.1092694904849605, + "grad_norm": 0.26129621267318726, + "learning_rate": 5.066603532257817e-05, + "loss": 1.7321, + "step": 16646 + }, + { + "epoch": 5.109576427255985, + "grad_norm": 0.2024240493774414, + "learning_rate": 5.066106520044646e-05, + "loss": 1.7033, + "step": 16647 + }, + { + "epoch": 5.10988336402701, + "grad_norm": 0.2096802294254303, + "learning_rate": 5.0656095071781716e-05, + "loss": 1.716, + "step": 16648 + }, + { + "epoch": 5.110190300798036, + "grad_norm": 0.20643317699432373, + "learning_rate": 5.0651124936633054e-05, + "loss": 1.7473, + "step": 16649 + }, + { + "epoch": 5.110497237569061, + "grad_norm": 0.2268853783607483, + "learning_rate": 5.0646154795049604e-05, + "loss": 1.7844, + "step": 16650 + }, + { + "epoch": 5.110804174340086, + "grad_norm": 0.20215095579624176, + "learning_rate": 5.064118464708046e-05, + "loss": 1.7138, + "step": 16651 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.19411569833755493, + 
"learning_rate": 5.063621449277476e-05, + "loss": 1.7526, + "step": 16652 + }, + { + "epoch": 5.111418047882136, + "grad_norm": 0.20199783146381378, + "learning_rate": 5.063124433218161e-05, + "loss": 1.806, + "step": 16653 + }, + { + "epoch": 5.111724984653161, + "grad_norm": 0.23351836204528809, + "learning_rate": 5.0626274165350165e-05, + "loss": 1.7529, + "step": 16654 + }, + { + "epoch": 5.112031921424187, + "grad_norm": 0.21098989248275757, + "learning_rate": 5.062130399232948e-05, + "loss": 1.7647, + "step": 16655 + }, + { + "epoch": 5.112338858195212, + "grad_norm": 0.21959169209003448, + "learning_rate": 5.0616333813168714e-05, + "loss": 1.7462, + "step": 16656 + }, + { + "epoch": 5.112645794966237, + "grad_norm": 0.21173696219921112, + "learning_rate": 5.061136362791696e-05, + "loss": 1.7413, + "step": 16657 + }, + { + "epoch": 5.112952731737262, + "grad_norm": 0.22357577085494995, + "learning_rate": 5.0606393436623365e-05, + "loss": 1.7163, + "step": 16658 + }, + { + "epoch": 5.113259668508287, + "grad_norm": 0.24364936351776123, + "learning_rate": 5.060142323933704e-05, + "loss": 1.8139, + "step": 16659 + }, + { + "epoch": 5.1135666052793125, + "grad_norm": 0.21646073460578918, + "learning_rate": 5.05964530361071e-05, + "loss": 1.741, + "step": 16660 + }, + { + "epoch": 5.113873542050338, + "grad_norm": 0.24261775612831116, + "learning_rate": 5.059148282698265e-05, + "loss": 1.7162, + "step": 16661 + }, + { + "epoch": 5.114180478821363, + "grad_norm": 0.22883281111717224, + "learning_rate": 5.058651261201283e-05, + "loss": 1.7342, + "step": 16662 + }, + { + "epoch": 5.114487415592388, + "grad_norm": 0.2616727352142334, + "learning_rate": 5.058154239124674e-05, + "loss": 1.8054, + "step": 16663 + }, + { + "epoch": 5.114794352363413, + "grad_norm": 0.21293358504772186, + "learning_rate": 5.0576572164733505e-05, + "loss": 1.742, + "step": 16664 + }, + { + "epoch": 5.115101289134438, + "grad_norm": 0.20037685334682465, + "learning_rate": 
5.057160193252225e-05, + "loss": 1.7518, + "step": 16665 + }, + { + "epoch": 5.115408225905464, + "grad_norm": 0.19102689623832703, + "learning_rate": 5.056663169466209e-05, + "loss": 1.6892, + "step": 16666 + }, + { + "epoch": 5.115715162676489, + "grad_norm": 0.22261591255664825, + "learning_rate": 5.056166145120216e-05, + "loss": 1.7744, + "step": 16667 + }, + { + "epoch": 5.116022099447513, + "grad_norm": 0.23966702818870544, + "learning_rate": 5.055669120219154e-05, + "loss": 1.7786, + "step": 16668 + }, + { + "epoch": 5.116329036218539, + "grad_norm": 0.22008271515369415, + "learning_rate": 5.055172094767937e-05, + "loss": 1.7501, + "step": 16669 + }, + { + "epoch": 5.116635972989564, + "grad_norm": 0.21643415093421936, + "learning_rate": 5.054675068771478e-05, + "loss": 1.7548, + "step": 16670 + }, + { + "epoch": 5.116942909760589, + "grad_norm": 0.24661116302013397, + "learning_rate": 5.0541780422346894e-05, + "loss": 1.8117, + "step": 16671 + }, + { + "epoch": 5.117249846531615, + "grad_norm": 0.21393093466758728, + "learning_rate": 5.05368101516248e-05, + "loss": 1.7341, + "step": 16672 + }, + { + "epoch": 5.11755678330264, + "grad_norm": 0.30949896574020386, + "learning_rate": 5.053183987559763e-05, + "loss": 1.7703, + "step": 16673 + }, + { + "epoch": 5.1178637200736645, + "grad_norm": 0.22236786782741547, + "learning_rate": 5.052686959431451e-05, + "loss": 1.719, + "step": 16674 + }, + { + "epoch": 5.11817065684469, + "grad_norm": 0.26826921105384827, + "learning_rate": 5.052189930782455e-05, + "loss": 1.741, + "step": 16675 + }, + { + "epoch": 5.118477593615715, + "grad_norm": 0.2608947455883026, + "learning_rate": 5.051692901617688e-05, + "loss": 1.7062, + "step": 16676 + }, + { + "epoch": 5.1187845303867405, + "grad_norm": 0.20709002017974854, + "learning_rate": 5.051195871942063e-05, + "loss": 1.703, + "step": 16677 + }, + { + "epoch": 5.119091467157766, + "grad_norm": 0.18957734107971191, + "learning_rate": 5.0506988417604885e-05, + "loss": 1.762, 
+ "step": 16678 + }, + { + "epoch": 5.11939840392879, + "grad_norm": 0.21578781306743622, + "learning_rate": 5.050201811077879e-05, + "loss": 1.7167, + "step": 16679 + }, + { + "epoch": 5.119705340699816, + "grad_norm": 0.2253631353378296, + "learning_rate": 5.049704779899145e-05, + "loss": 1.7374, + "step": 16680 + }, + { + "epoch": 5.120012277470841, + "grad_norm": 0.1977664828300476, + "learning_rate": 5.049207748229199e-05, + "loss": 1.7399, + "step": 16681 + }, + { + "epoch": 5.120319214241866, + "grad_norm": 0.2964428663253784, + "learning_rate": 5.048710716072954e-05, + "loss": 1.8359, + "step": 16682 + }, + { + "epoch": 5.120626151012892, + "grad_norm": 0.24788637459278107, + "learning_rate": 5.0482136834353224e-05, + "loss": 1.7593, + "step": 16683 + }, + { + "epoch": 5.120933087783916, + "grad_norm": 0.21537743508815765, + "learning_rate": 5.0477166503212135e-05, + "loss": 1.7472, + "step": 16684 + }, + { + "epoch": 5.121240024554941, + "grad_norm": 0.2055196613073349, + "learning_rate": 5.047219616735541e-05, + "loss": 1.7106, + "step": 16685 + }, + { + "epoch": 5.121546961325967, + "grad_norm": 0.19770687818527222, + "learning_rate": 5.046722582683215e-05, + "loss": 1.6887, + "step": 16686 + }, + { + "epoch": 5.121853898096992, + "grad_norm": 0.20407389104366302, + "learning_rate": 5.046225548169151e-05, + "loss": 1.7412, + "step": 16687 + }, + { + "epoch": 5.122160834868017, + "grad_norm": 0.20153474807739258, + "learning_rate": 5.045728513198259e-05, + "loss": 1.7643, + "step": 16688 + }, + { + "epoch": 5.122467771639043, + "grad_norm": 0.18737752735614777, + "learning_rate": 5.045231477775452e-05, + "loss": 1.763, + "step": 16689 + }, + { + "epoch": 5.122774708410067, + "grad_norm": 0.19790658354759216, + "learning_rate": 5.0447344419056385e-05, + "loss": 1.7446, + "step": 16690 + }, + { + "epoch": 5.1230816451810925, + "grad_norm": 0.21496973931789398, + "learning_rate": 5.0442374055937336e-05, + "loss": 1.7756, + "step": 16691 + }, + { + "epoch": 
5.123388581952118, + "grad_norm": 0.19318655133247375, + "learning_rate": 5.043740368844649e-05, + "loss": 1.7687, + "step": 16692 + }, + { + "epoch": 5.123695518723143, + "grad_norm": 0.2237338423728943, + "learning_rate": 5.0432433316632976e-05, + "loss": 1.7258, + "step": 16693 + }, + { + "epoch": 5.1240024554941686, + "grad_norm": 0.2257162630558014, + "learning_rate": 5.042746294054589e-05, + "loss": 1.7462, + "step": 16694 + }, + { + "epoch": 5.124309392265193, + "grad_norm": 0.25666359066963196, + "learning_rate": 5.0422492560234366e-05, + "loss": 1.7318, + "step": 16695 + }, + { + "epoch": 5.124616329036218, + "grad_norm": 0.2615324556827545, + "learning_rate": 5.0417522175747536e-05, + "loss": 1.7533, + "step": 16696 + }, + { + "epoch": 5.124923265807244, + "grad_norm": 0.2372874766588211, + "learning_rate": 5.0412551787134475e-05, + "loss": 1.7361, + "step": 16697 + }, + { + "epoch": 5.125230202578269, + "grad_norm": 0.25976815819740295, + "learning_rate": 5.040758139444436e-05, + "loss": 1.7542, + "step": 16698 + }, + { + "epoch": 5.125537139349294, + "grad_norm": 0.36173003911972046, + "learning_rate": 5.040261099772629e-05, + "loss": 1.7421, + "step": 16699 + }, + { + "epoch": 5.12584407612032, + "grad_norm": 0.2767728269100189, + "learning_rate": 5.039764059702937e-05, + "loss": 1.7341, + "step": 16700 + }, + { + "epoch": 5.126151012891344, + "grad_norm": 0.20185241103172302, + "learning_rate": 5.039267019240275e-05, + "loss": 1.7068, + "step": 16701 + }, + { + "epoch": 5.1264579496623695, + "grad_norm": 0.26872581243515015, + "learning_rate": 5.0387699783895514e-05, + "loss": 1.7404, + "step": 16702 + }, + { + "epoch": 5.126764886433395, + "grad_norm": 0.2867858111858368, + "learning_rate": 5.038272937155682e-05, + "loss": 1.7702, + "step": 16703 + }, + { + "epoch": 5.12707182320442, + "grad_norm": 0.20939521491527557, + "learning_rate": 5.037775895543574e-05, + "loss": 1.7653, + "step": 16704 + }, + { + "epoch": 5.1273787599754455, + "grad_norm": 
0.2674047648906708, + "learning_rate": 5.037278853558146e-05, + "loss": 1.701, + "step": 16705 + }, + { + "epoch": 5.12768569674647, + "grad_norm": 0.20776906609535217, + "learning_rate": 5.036781811204304e-05, + "loss": 1.7476, + "step": 16706 + }, + { + "epoch": 5.127992633517495, + "grad_norm": 0.2695952355861664, + "learning_rate": 5.036284768486964e-05, + "loss": 1.7206, + "step": 16707 + }, + { + "epoch": 5.128299570288521, + "grad_norm": 0.30661383271217346, + "learning_rate": 5.0357877254110363e-05, + "loss": 1.72, + "step": 16708 + }, + { + "epoch": 5.128606507059546, + "grad_norm": 0.2527785003185272, + "learning_rate": 5.0352906819814316e-05, + "loss": 1.6936, + "step": 16709 + }, + { + "epoch": 5.128913443830571, + "grad_norm": 0.23000696301460266, + "learning_rate": 5.034793638203066e-05, + "loss": 1.7634, + "step": 16710 + }, + { + "epoch": 5.129220380601596, + "grad_norm": 0.33594760298728943, + "learning_rate": 5.0342965940808486e-05, + "loss": 1.6952, + "step": 16711 + }, + { + "epoch": 5.129527317372621, + "grad_norm": 0.22834168374538422, + "learning_rate": 5.033799549619692e-05, + "loss": 1.7537, + "step": 16712 + }, + { + "epoch": 5.129834254143646, + "grad_norm": 0.26585114002227783, + "learning_rate": 5.033302504824509e-05, + "loss": 1.7554, + "step": 16713 + }, + { + "epoch": 5.130141190914672, + "grad_norm": 0.25632211565971375, + "learning_rate": 5.032805459700211e-05, + "loss": 1.8141, + "step": 16714 + }, + { + "epoch": 5.130448127685697, + "grad_norm": 0.256523996591568, + "learning_rate": 5.0323084142517084e-05, + "loss": 1.777, + "step": 16715 + }, + { + "epoch": 5.1307550644567215, + "grad_norm": 0.31409457325935364, + "learning_rate": 5.0318113684839166e-05, + "loss": 1.7414, + "step": 16716 + }, + { + "epoch": 5.131062001227747, + "grad_norm": 0.21156816184520721, + "learning_rate": 5.0313143224017455e-05, + "loss": 1.7397, + "step": 16717 + }, + { + "epoch": 5.131368937998772, + "grad_norm": 0.23596547544002533, + "learning_rate": 
5.030817276010109e-05, + "loss": 1.752, + "step": 16718 + }, + { + "epoch": 5.1316758747697975, + "grad_norm": 0.2587638199329376, + "learning_rate": 5.0303202293139186e-05, + "loss": 1.7645, + "step": 16719 + }, + { + "epoch": 5.131982811540823, + "grad_norm": 0.2006666213274002, + "learning_rate": 5.029823182318084e-05, + "loss": 1.7009, + "step": 16720 + }, + { + "epoch": 5.132289748311848, + "grad_norm": 0.3075694739818573, + "learning_rate": 5.029326135027521e-05, + "loss": 1.749, + "step": 16721 + }, + { + "epoch": 5.132596685082873, + "grad_norm": 0.3116205334663391, + "learning_rate": 5.028829087447139e-05, + "loss": 1.7458, + "step": 16722 + }, + { + "epoch": 5.132903621853898, + "grad_norm": 0.17925913631916046, + "learning_rate": 5.028332039581851e-05, + "loss": 1.6502, + "step": 16723 + }, + { + "epoch": 5.133210558624923, + "grad_norm": 0.21779952943325043, + "learning_rate": 5.0278349914365694e-05, + "loss": 1.7656, + "step": 16724 + }, + { + "epoch": 5.133517495395949, + "grad_norm": 0.20085318386554718, + "learning_rate": 5.027337943016207e-05, + "loss": 1.7662, + "step": 16725 + }, + { + "epoch": 5.133824432166974, + "grad_norm": 0.19975553452968597, + "learning_rate": 5.026840894325673e-05, + "loss": 1.7392, + "step": 16726 + }, + { + "epoch": 5.134131368937998, + "grad_norm": 0.20610745251178741, + "learning_rate": 5.026343845369883e-05, + "loss": 1.7221, + "step": 16727 + }, + { + "epoch": 5.134438305709024, + "grad_norm": 0.21451768279075623, + "learning_rate": 5.025846796153747e-05, + "loss": 1.8381, + "step": 16728 + }, + { + "epoch": 5.134745242480049, + "grad_norm": 0.19518613815307617, + "learning_rate": 5.0253497466821786e-05, + "loss": 1.7483, + "step": 16729 + }, + { + "epoch": 5.135052179251074, + "grad_norm": 0.24284996092319489, + "learning_rate": 5.024852696960088e-05, + "loss": 1.7895, + "step": 16730 + }, + { + "epoch": 5.1353591160221, + "grad_norm": 0.23962461948394775, + "learning_rate": 5.0243556469923905e-05, + "loss": 
1.8468, + "step": 16731 + }, + { + "epoch": 5.135666052793125, + "grad_norm": 0.20455054938793182, + "learning_rate": 5.023858596783993e-05, + "loss": 1.6973, + "step": 16732 + }, + { + "epoch": 5.1359729895641495, + "grad_norm": 0.20629842579364777, + "learning_rate": 5.023361546339813e-05, + "loss": 1.7608, + "step": 16733 + }, + { + "epoch": 5.136279926335175, + "grad_norm": 0.19375818967819214, + "learning_rate": 5.0228644956647606e-05, + "loss": 1.7327, + "step": 16734 + }, + { + "epoch": 5.1365868631062, + "grad_norm": 0.20960548520088196, + "learning_rate": 5.022367444763748e-05, + "loss": 1.7227, + "step": 16735 + }, + { + "epoch": 5.1368937998772255, + "grad_norm": 0.24732786417007446, + "learning_rate": 5.021870393641687e-05, + "loss": 1.8144, + "step": 16736 + }, + { + "epoch": 5.137200736648251, + "grad_norm": 0.22190099954605103, + "learning_rate": 5.021373342303489e-05, + "loss": 1.705, + "step": 16737 + }, + { + "epoch": 5.137507673419275, + "grad_norm": 0.2091664969921112, + "learning_rate": 5.020876290754069e-05, + "loss": 1.7926, + "step": 16738 + }, + { + "epoch": 5.137814610190301, + "grad_norm": 0.22298938035964966, + "learning_rate": 5.020379238998335e-05, + "loss": 1.7782, + "step": 16739 + }, + { + "epoch": 5.138121546961326, + "grad_norm": 0.20843006670475006, + "learning_rate": 5.019882187041203e-05, + "loss": 1.7245, + "step": 16740 + }, + { + "epoch": 5.138428483732351, + "grad_norm": 0.23383544385433197, + "learning_rate": 5.019385134887583e-05, + "loss": 1.6834, + "step": 16741 + }, + { + "epoch": 5.138735420503377, + "grad_norm": 0.3015683889389038, + "learning_rate": 5.018888082542388e-05, + "loss": 1.7636, + "step": 16742 + }, + { + "epoch": 5.139042357274401, + "grad_norm": 0.2253810614347458, + "learning_rate": 5.0183910300105284e-05, + "loss": 1.7375, + "step": 16743 + }, + { + "epoch": 5.139349294045426, + "grad_norm": 0.2064623087644577, + "learning_rate": 5.01789397729692e-05, + "loss": 1.7683, + "step": 16744 + }, + { + 
"epoch": 5.139656230816452, + "grad_norm": 0.2106693685054779, + "learning_rate": 5.0173969244064724e-05, + "loss": 1.7432, + "step": 16745 + }, + { + "epoch": 5.139963167587477, + "grad_norm": 0.19944638013839722, + "learning_rate": 5.016899871344097e-05, + "loss": 1.701, + "step": 16746 + }, + { + "epoch": 5.140270104358502, + "grad_norm": 0.23210744559764862, + "learning_rate": 5.016402818114708e-05, + "loss": 1.8008, + "step": 16747 + }, + { + "epoch": 5.140577041129528, + "grad_norm": 0.26014089584350586, + "learning_rate": 5.015905764723217e-05, + "loss": 1.7131, + "step": 16748 + }, + { + "epoch": 5.140883977900552, + "grad_norm": 0.25526607036590576, + "learning_rate": 5.015408711174535e-05, + "loss": 1.7525, + "step": 16749 + }, + { + "epoch": 5.1411909146715775, + "grad_norm": 0.2092386782169342, + "learning_rate": 5.0149116574735756e-05, + "loss": 1.7502, + "step": 16750 + }, + { + "epoch": 5.141497851442603, + "grad_norm": 0.21560105681419373, + "learning_rate": 5.01441460362525e-05, + "loss": 1.7903, + "step": 16751 + }, + { + "epoch": 5.141804788213628, + "grad_norm": 0.23538467288017273, + "learning_rate": 5.013917549634471e-05, + "loss": 1.6995, + "step": 16752 + }, + { + "epoch": 5.1421117249846535, + "grad_norm": 0.26545262336730957, + "learning_rate": 5.0134204955061526e-05, + "loss": 1.7511, + "step": 16753 + }, + { + "epoch": 5.142418661755678, + "grad_norm": 0.23030948638916016, + "learning_rate": 5.012923441245203e-05, + "loss": 1.7271, + "step": 16754 + }, + { + "epoch": 5.142725598526703, + "grad_norm": 0.22395408153533936, + "learning_rate": 5.012426386856537e-05, + "loss": 1.7273, + "step": 16755 + }, + { + "epoch": 5.143032535297729, + "grad_norm": 0.21355997025966644, + "learning_rate": 5.011929332345066e-05, + "loss": 1.7347, + "step": 16756 + }, + { + "epoch": 5.143339472068754, + "grad_norm": 0.2355809509754181, + "learning_rate": 5.011432277715702e-05, + "loss": 1.8289, + "step": 16757 + }, + { + "epoch": 5.143646408839779, + 
"grad_norm": 0.24319802224636078, + "learning_rate": 5.0109352229733584e-05, + "loss": 1.7621, + "step": 16758 + }, + { + "epoch": 5.143953345610804, + "grad_norm": 0.2591453492641449, + "learning_rate": 5.010438168122946e-05, + "loss": 1.8043, + "step": 16759 + }, + { + "epoch": 5.144260282381829, + "grad_norm": 0.22595751285552979, + "learning_rate": 5.009941113169376e-05, + "loss": 1.8137, + "step": 16760 + }, + { + "epoch": 5.144567219152854, + "grad_norm": 0.220921128988266, + "learning_rate": 5.009444058117564e-05, + "loss": 1.7105, + "step": 16761 + }, + { + "epoch": 5.14487415592388, + "grad_norm": 0.25713789463043213, + "learning_rate": 5.0089470029724195e-05, + "loss": 1.8184, + "step": 16762 + }, + { + "epoch": 5.145181092694905, + "grad_norm": 0.19849328696727753, + "learning_rate": 5.008449947738856e-05, + "loss": 1.7331, + "step": 16763 + }, + { + "epoch": 5.14548802946593, + "grad_norm": 0.2073405385017395, + "learning_rate": 5.007952892421785e-05, + "loss": 1.7053, + "step": 16764 + }, + { + "epoch": 5.145794966236955, + "grad_norm": 0.22307951748371124, + "learning_rate": 5.007455837026119e-05, + "loss": 1.7724, + "step": 16765 + }, + { + "epoch": 5.14610190300798, + "grad_norm": 0.22160649299621582, + "learning_rate": 5.006958781556769e-05, + "loss": 1.7191, + "step": 16766 + }, + { + "epoch": 5.1464088397790055, + "grad_norm": 0.2202252298593521, + "learning_rate": 5.0064617260186487e-05, + "loss": 1.7339, + "step": 16767 + }, + { + "epoch": 5.146715776550031, + "grad_norm": 0.23693829774856567, + "learning_rate": 5.005964670416671e-05, + "loss": 1.7143, + "step": 16768 + }, + { + "epoch": 5.147022713321056, + "grad_norm": 0.22675764560699463, + "learning_rate": 5.005467614755746e-05, + "loss": 1.7913, + "step": 16769 + }, + { + "epoch": 5.147329650092081, + "grad_norm": 0.21288467943668365, + "learning_rate": 5.0049705590407866e-05, + "loss": 1.7581, + "step": 16770 + }, + { + "epoch": 5.147636586863106, + "grad_norm": 0.216839998960495, + 
"learning_rate": 5.0044735032767064e-05, + "loss": 1.7305, + "step": 16771 + }, + { + "epoch": 5.147943523634131, + "grad_norm": 0.2111063450574875, + "learning_rate": 5.003976447468416e-05, + "loss": 1.7444, + "step": 16772 + }, + { + "epoch": 5.148250460405157, + "grad_norm": 0.2536773085594177, + "learning_rate": 5.003479391620827e-05, + "loss": 1.6952, + "step": 16773 + }, + { + "epoch": 5.148557397176182, + "grad_norm": 0.23585477471351624, + "learning_rate": 5.002982335738854e-05, + "loss": 1.6921, + "step": 16774 + }, + { + "epoch": 5.148864333947207, + "grad_norm": 0.1927027702331543, + "learning_rate": 5.002485279827407e-05, + "loss": 1.7781, + "step": 16775 + }, + { + "epoch": 5.149171270718232, + "grad_norm": 0.22545355558395386, + "learning_rate": 5.001988223891399e-05, + "loss": 1.7582, + "step": 16776 + }, + { + "epoch": 5.149478207489257, + "grad_norm": 0.20837660133838654, + "learning_rate": 5.001491167935741e-05, + "loss": 1.7379, + "step": 16777 + }, + { + "epoch": 5.149785144260282, + "grad_norm": 0.20510734617710114, + "learning_rate": 5.000994111965348e-05, + "loss": 1.7568, + "step": 16778 + }, + { + "epoch": 5.150092081031308, + "grad_norm": 0.2629711329936981, + "learning_rate": 5.00049705598513e-05, + "loss": 1.7613, + "step": 16779 + }, + { + "epoch": 5.150399017802333, + "grad_norm": 0.2390555888414383, + "learning_rate": 5e-05, + "loss": 1.7099, + "step": 16780 + }, + { + "epoch": 5.150705954573358, + "grad_norm": 0.19643893837928772, + "learning_rate": 4.9995029440148715e-05, + "loss": 1.7012, + "step": 16781 + }, + { + "epoch": 5.151012891344383, + "grad_norm": 0.1881607472896576, + "learning_rate": 4.999005888034653e-05, + "loss": 1.705, + "step": 16782 + }, + { + "epoch": 5.151319828115408, + "grad_norm": 0.3219485282897949, + "learning_rate": 4.99850883206426e-05, + "loss": 1.8089, + "step": 16783 + }, + { + "epoch": 5.151626764886434, + "grad_norm": 0.22285562753677368, + "learning_rate": 4.998011776108602e-05, + "loss": 1.7343, + 
"step": 16784 + }, + { + "epoch": 5.151933701657459, + "grad_norm": 0.1981910616159439, + "learning_rate": 4.9975147201725955e-05, + "loss": 1.6939, + "step": 16785 + }, + { + "epoch": 5.152240638428483, + "grad_norm": 0.2338661551475525, + "learning_rate": 4.997017664261148e-05, + "loss": 1.6833, + "step": 16786 + }, + { + "epoch": 5.152547575199509, + "grad_norm": 0.2613268792629242, + "learning_rate": 4.996520608379175e-05, + "loss": 1.7251, + "step": 16787 + }, + { + "epoch": 5.152854511970534, + "grad_norm": 0.26063668727874756, + "learning_rate": 4.996023552531586e-05, + "loss": 1.8444, + "step": 16788 + }, + { + "epoch": 5.153161448741559, + "grad_norm": 0.2711321711540222, + "learning_rate": 4.9955264967232954e-05, + "loss": 1.7257, + "step": 16789 + }, + { + "epoch": 5.153468385512585, + "grad_norm": 0.30134227871894836, + "learning_rate": 4.995029440959213e-05, + "loss": 1.7599, + "step": 16790 + }, + { + "epoch": 5.153775322283609, + "grad_norm": 0.22983741760253906, + "learning_rate": 4.994532385244255e-05, + "loss": 1.7944, + "step": 16791 + }, + { + "epoch": 5.1540822590546345, + "grad_norm": 0.2992973327636719, + "learning_rate": 4.994035329583329e-05, + "loss": 1.7507, + "step": 16792 + }, + { + "epoch": 5.15438919582566, + "grad_norm": 0.2659669518470764, + "learning_rate": 4.993538273981352e-05, + "loss": 1.7246, + "step": 16793 + }, + { + "epoch": 5.154696132596685, + "grad_norm": 0.24235470592975616, + "learning_rate": 4.9930412184432315e-05, + "loss": 1.8378, + "step": 16794 + }, + { + "epoch": 5.1550030693677105, + "grad_norm": 0.30005061626434326, + "learning_rate": 4.992544162973882e-05, + "loss": 1.7526, + "step": 16795 + }, + { + "epoch": 5.155310006138736, + "grad_norm": 0.2183740884065628, + "learning_rate": 4.992047107578215e-05, + "loss": 1.7197, + "step": 16796 + }, + { + "epoch": 5.15561694290976, + "grad_norm": 0.35874706506729126, + "learning_rate": 4.991550052261145e-05, + "loss": 1.8196, + "step": 16797 + }, + { + "epoch": 
5.155923879680786, + "grad_norm": 0.42146921157836914, + "learning_rate": 4.991052997027583e-05, + "loss": 1.7165, + "step": 16798 + }, + { + "epoch": 5.156230816451811, + "grad_norm": 0.2738321125507355, + "learning_rate": 4.990555941882437e-05, + "loss": 1.7042, + "step": 16799 + }, + { + "epoch": 5.156537753222836, + "grad_norm": 0.26304566860198975, + "learning_rate": 4.990058886830625e-05, + "loss": 1.7551, + "step": 16800 + }, + { + "epoch": 5.156844689993862, + "grad_norm": 0.4301520586013794, + "learning_rate": 4.9895618318770556e-05, + "loss": 1.7219, + "step": 16801 + }, + { + "epoch": 5.157151626764886, + "grad_norm": 0.3316499590873718, + "learning_rate": 4.989064777026644e-05, + "loss": 1.8034, + "step": 16802 + }, + { + "epoch": 5.157458563535911, + "grad_norm": 0.30105581879615784, + "learning_rate": 4.9885677222842984e-05, + "loss": 1.7022, + "step": 16803 + }, + { + "epoch": 5.157765500306937, + "grad_norm": 0.3830905854701996, + "learning_rate": 4.988070667654937e-05, + "loss": 1.7898, + "step": 16804 + }, + { + "epoch": 5.158072437077962, + "grad_norm": 0.2204640656709671, + "learning_rate": 4.9875736131434644e-05, + "loss": 1.7081, + "step": 16805 + }, + { + "epoch": 5.158379373848987, + "grad_norm": 0.3620772063732147, + "learning_rate": 4.9870765587547976e-05, + "loss": 1.7345, + "step": 16806 + }, + { + "epoch": 5.158686310620013, + "grad_norm": 0.3268207907676697, + "learning_rate": 4.986579504493848e-05, + "loss": 1.7364, + "step": 16807 + }, + { + "epoch": 5.158993247391037, + "grad_norm": 0.2499808967113495, + "learning_rate": 4.986082450365529e-05, + "loss": 1.7836, + "step": 16808 + }, + { + "epoch": 5.1593001841620625, + "grad_norm": 0.3696226477622986, + "learning_rate": 4.98558539637475e-05, + "loss": 1.8094, + "step": 16809 + }, + { + "epoch": 5.159607120933088, + "grad_norm": 0.3239068388938904, + "learning_rate": 4.9850883425264256e-05, + "loss": 1.7448, + "step": 16810 + }, + { + "epoch": 5.159914057704113, + "grad_norm": 
0.19875772297382355, + "learning_rate": 4.9845912888254655e-05, + "loss": 1.6945, + "step": 16811 + }, + { + "epoch": 5.1602209944751385, + "grad_norm": 0.3952203691005707, + "learning_rate": 4.984094235276784e-05, + "loss": 1.8457, + "step": 16812 + }, + { + "epoch": 5.160527931246163, + "grad_norm": 0.3052334785461426, + "learning_rate": 4.9835971818852916e-05, + "loss": 1.7371, + "step": 16813 + }, + { + "epoch": 5.160834868017188, + "grad_norm": 0.2874486446380615, + "learning_rate": 4.983100128655904e-05, + "loss": 1.7194, + "step": 16814 + }, + { + "epoch": 5.161141804788214, + "grad_norm": 0.39117491245269775, + "learning_rate": 4.98260307559353e-05, + "loss": 1.7919, + "step": 16815 + }, + { + "epoch": 5.161448741559239, + "grad_norm": 0.2532150149345398, + "learning_rate": 4.982106022703081e-05, + "loss": 1.8103, + "step": 16816 + }, + { + "epoch": 5.161755678330264, + "grad_norm": 0.3545167148113251, + "learning_rate": 4.981608969989473e-05, + "loss": 1.8093, + "step": 16817 + }, + { + "epoch": 5.162062615101289, + "grad_norm": 0.397806316614151, + "learning_rate": 4.981111917457613e-05, + "loss": 1.7885, + "step": 16818 + }, + { + "epoch": 5.162369551872314, + "grad_norm": 0.2523536682128906, + "learning_rate": 4.980614865112419e-05, + "loss": 1.797, + "step": 16819 + }, + { + "epoch": 5.162676488643339, + "grad_norm": 0.3666839301586151, + "learning_rate": 4.980117812958798e-05, + "loss": 1.7859, + "step": 16820 + }, + { + "epoch": 5.162983425414365, + "grad_norm": 0.3392138183116913, + "learning_rate": 4.9796207610016664e-05, + "loss": 1.7717, + "step": 16821 + }, + { + "epoch": 5.16329036218539, + "grad_norm": 0.21040666103363037, + "learning_rate": 4.9791237092459325e-05, + "loss": 1.7447, + "step": 16822 + }, + { + "epoch": 5.163597298956415, + "grad_norm": 0.3140225112438202, + "learning_rate": 4.978626657696512e-05, + "loss": 1.7405, + "step": 16823 + }, + { + "epoch": 5.16390423572744, + "grad_norm": 0.23963581025600433, + "learning_rate": 
4.978129606358313e-05, + "loss": 1.7041, + "step": 16824 + }, + { + "epoch": 5.164211172498465, + "grad_norm": 0.32476937770843506, + "learning_rate": 4.977632555236253e-05, + "loss": 1.736, + "step": 16825 + }, + { + "epoch": 5.1645181092694905, + "grad_norm": 0.4362463653087616, + "learning_rate": 4.977135504335239e-05, + "loss": 1.7657, + "step": 16826 + }, + { + "epoch": 5.164825046040516, + "grad_norm": 0.26118260622024536, + "learning_rate": 4.976638453660188e-05, + "loss": 1.7339, + "step": 16827 + }, + { + "epoch": 5.165131982811541, + "grad_norm": 0.27284330129623413, + "learning_rate": 4.9761414032160065e-05, + "loss": 1.8086, + "step": 16828 + }, + { + "epoch": 5.165438919582566, + "grad_norm": 0.2942579388618469, + "learning_rate": 4.975644353007611e-05, + "loss": 1.7869, + "step": 16829 + }, + { + "epoch": 5.165745856353591, + "grad_norm": 0.23257993161678314, + "learning_rate": 4.975147303039912e-05, + "loss": 1.8048, + "step": 16830 + }, + { + "epoch": 5.166052793124616, + "grad_norm": 0.28638842701911926, + "learning_rate": 4.9746502533178225e-05, + "loss": 1.7744, + "step": 16831 + }, + { + "epoch": 5.166359729895642, + "grad_norm": 0.21571335196495056, + "learning_rate": 4.974153203846255e-05, + "loss": 1.7842, + "step": 16832 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.268883615732193, + "learning_rate": 4.9736561546301185e-05, + "loss": 1.7194, + "step": 16833 + }, + { + "epoch": 5.166973603437691, + "grad_norm": 0.22934168577194214, + "learning_rate": 4.9731591056743285e-05, + "loss": 1.757, + "step": 16834 + }, + { + "epoch": 5.167280540208717, + "grad_norm": 0.26321718096733093, + "learning_rate": 4.9726620569837946e-05, + "loss": 1.7675, + "step": 16835 + }, + { + "epoch": 5.167587476979742, + "grad_norm": 0.2893882393836975, + "learning_rate": 4.9721650085634325e-05, + "loss": 1.7134, + "step": 16836 + }, + { + "epoch": 5.167894413750767, + "grad_norm": 0.24130617082118988, + "learning_rate": 4.97166796041815e-05, + "loss": 
1.7119, + "step": 16837 + }, + { + "epoch": 5.168201350521793, + "grad_norm": 0.23614190518856049, + "learning_rate": 4.9711709125528635e-05, + "loss": 1.7556, + "step": 16838 + }, + { + "epoch": 5.168508287292818, + "grad_norm": 0.2031065821647644, + "learning_rate": 4.97067386497248e-05, + "loss": 1.7678, + "step": 16839 + }, + { + "epoch": 5.1688152240638425, + "grad_norm": 0.30695948004722595, + "learning_rate": 4.970176817681917e-05, + "loss": 1.7907, + "step": 16840 + }, + { + "epoch": 5.169122160834868, + "grad_norm": 0.31256723403930664, + "learning_rate": 4.969679770686082e-05, + "loss": 1.7448, + "step": 16841 + }, + { + "epoch": 5.169429097605893, + "grad_norm": 0.24183644354343414, + "learning_rate": 4.969182723989892e-05, + "loss": 1.7259, + "step": 16842 + }, + { + "epoch": 5.1697360343769185, + "grad_norm": 0.22440548241138458, + "learning_rate": 4.9686856775982536e-05, + "loss": 1.7949, + "step": 16843 + }, + { + "epoch": 5.170042971147944, + "grad_norm": 0.29006195068359375, + "learning_rate": 4.9681886315160846e-05, + "loss": 1.7128, + "step": 16844 + }, + { + "epoch": 5.170349907918968, + "grad_norm": 0.2189658135175705, + "learning_rate": 4.967691585748292e-05, + "loss": 1.7375, + "step": 16845 + }, + { + "epoch": 5.170656844689994, + "grad_norm": 0.289909690618515, + "learning_rate": 4.967194540299791e-05, + "loss": 1.779, + "step": 16846 + }, + { + "epoch": 5.170963781461019, + "grad_norm": 0.28279590606689453, + "learning_rate": 4.966697495175492e-05, + "loss": 1.7368, + "step": 16847 + }, + { + "epoch": 5.171270718232044, + "grad_norm": 0.2056259959936142, + "learning_rate": 4.966200450380309e-05, + "loss": 1.7548, + "step": 16848 + }, + { + "epoch": 5.17157765500307, + "grad_norm": 0.2607482969760895, + "learning_rate": 4.965703405919154e-05, + "loss": 1.7178, + "step": 16849 + }, + { + "epoch": 5.171884591774095, + "grad_norm": 0.26085609197616577, + "learning_rate": 4.965206361796935e-05, + "loss": 1.751, + "step": 16850 + }, + { + 
"epoch": 5.172191528545119, + "grad_norm": 0.17960335314273834, + "learning_rate": 4.964709318018569e-05, + "loss": 1.6932, + "step": 16851 + }, + { + "epoch": 5.172498465316145, + "grad_norm": 0.2617340385913849, + "learning_rate": 4.964212274588965e-05, + "loss": 1.7753, + "step": 16852 + }, + { + "epoch": 5.17280540208717, + "grad_norm": 0.2454555630683899, + "learning_rate": 4.9637152315130383e-05, + "loss": 1.7587, + "step": 16853 + }, + { + "epoch": 5.173112338858195, + "grad_norm": 0.19221605360507965, + "learning_rate": 4.963218188795696e-05, + "loss": 1.7337, + "step": 16854 + }, + { + "epoch": 5.173419275629221, + "grad_norm": 0.24314738810062408, + "learning_rate": 4.9627211464418565e-05, + "loss": 1.725, + "step": 16855 + }, + { + "epoch": 5.173726212400245, + "grad_norm": 0.2533986568450928, + "learning_rate": 4.962224104456426e-05, + "loss": 1.7502, + "step": 16856 + }, + { + "epoch": 5.1740331491712706, + "grad_norm": 0.21800079941749573, + "learning_rate": 4.9617270628443195e-05, + "loss": 1.7622, + "step": 16857 + }, + { + "epoch": 5.174340085942296, + "grad_norm": 0.22742362320423126, + "learning_rate": 4.96123002161045e-05, + "loss": 1.7078, + "step": 16858 + }, + { + "epoch": 5.174647022713321, + "grad_norm": 0.22729982435703278, + "learning_rate": 4.960732980759727e-05, + "loss": 1.8349, + "step": 16859 + }, + { + "epoch": 5.1749539594843466, + "grad_norm": 0.28869518637657166, + "learning_rate": 4.9602359402970625e-05, + "loss": 1.8932, + "step": 16860 + }, + { + "epoch": 5.175260896255371, + "grad_norm": 0.21931354701519012, + "learning_rate": 4.9597389002273725e-05, + "loss": 1.6989, + "step": 16861 + }, + { + "epoch": 5.175567833026396, + "grad_norm": 0.2130192667245865, + "learning_rate": 4.959241860555564e-05, + "loss": 1.752, + "step": 16862 + }, + { + "epoch": 5.175874769797422, + "grad_norm": 0.21272781491279602, + "learning_rate": 4.958744821286553e-05, + "loss": 1.7402, + "step": 16863 + }, + { + "epoch": 5.176181706568447, + 
"grad_norm": 0.20279285311698914, + "learning_rate": 4.958247782425248e-05, + "loss": 1.7103, + "step": 16864 + }, + { + "epoch": 5.176488643339472, + "grad_norm": 0.23561790585517883, + "learning_rate": 4.957750743976564e-05, + "loss": 1.7742, + "step": 16865 + }, + { + "epoch": 5.176795580110497, + "grad_norm": 0.27608510851860046, + "learning_rate": 4.957253705945413e-05, + "loss": 1.7505, + "step": 16866 + }, + { + "epoch": 5.177102516881522, + "grad_norm": 0.20624001324176788, + "learning_rate": 4.956756668336704e-05, + "loss": 1.7032, + "step": 16867 + }, + { + "epoch": 5.1774094536525475, + "grad_norm": 0.23743939399719238, + "learning_rate": 4.956259631155352e-05, + "loss": 1.7469, + "step": 16868 + }, + { + "epoch": 5.177716390423573, + "grad_norm": 0.27421119809150696, + "learning_rate": 4.9557625944062675e-05, + "loss": 1.7028, + "step": 16869 + }, + { + "epoch": 5.178023327194598, + "grad_norm": 0.23788046836853027, + "learning_rate": 4.955265558094363e-05, + "loss": 1.7468, + "step": 16870 + }, + { + "epoch": 5.1783302639656235, + "grad_norm": 0.24712958931922913, + "learning_rate": 4.95476852222455e-05, + "loss": 1.7348, + "step": 16871 + }, + { + "epoch": 5.178637200736648, + "grad_norm": 0.21558570861816406, + "learning_rate": 4.9542714868017424e-05, + "loss": 1.7599, + "step": 16872 + }, + { + "epoch": 5.178944137507673, + "grad_norm": 0.2561664283275604, + "learning_rate": 4.953774451830849e-05, + "loss": 1.7673, + "step": 16873 + }, + { + "epoch": 5.179251074278699, + "grad_norm": 0.19761815667152405, + "learning_rate": 4.953277417316786e-05, + "loss": 1.743, + "step": 16874 + }, + { + "epoch": 5.179558011049724, + "grad_norm": 0.24140769243240356, + "learning_rate": 4.95278038326446e-05, + "loss": 1.8229, + "step": 16875 + }, + { + "epoch": 5.179864947820749, + "grad_norm": 0.21686211228370667, + "learning_rate": 4.9522833496787876e-05, + "loss": 1.7914, + "step": 16876 + }, + { + "epoch": 5.180171884591774, + "grad_norm": 0.2537819743156433, + 
"learning_rate": 4.951786316564678e-05, + "loss": 1.7532, + "step": 16877 + }, + { + "epoch": 5.180478821362799, + "grad_norm": 0.24567632377147675, + "learning_rate": 4.951289283927046e-05, + "loss": 1.7528, + "step": 16878 + }, + { + "epoch": 5.180785758133824, + "grad_norm": 0.1958467960357666, + "learning_rate": 4.9507922517708e-05, + "loss": 1.6922, + "step": 16879 + }, + { + "epoch": 5.18109269490485, + "grad_norm": 0.2012091726064682, + "learning_rate": 4.950295220100857e-05, + "loss": 1.7509, + "step": 16880 + }, + { + "epoch": 5.181399631675875, + "grad_norm": 0.2416311800479889, + "learning_rate": 4.9497981889221226e-05, + "loss": 1.7341, + "step": 16881 + }, + { + "epoch": 5.1817065684469, + "grad_norm": 0.21407842636108398, + "learning_rate": 4.949301158239513e-05, + "loss": 1.7493, + "step": 16882 + }, + { + "epoch": 5.182013505217925, + "grad_norm": 0.2354930192232132, + "learning_rate": 4.94880412805794e-05, + "loss": 1.7726, + "step": 16883 + }, + { + "epoch": 5.18232044198895, + "grad_norm": 0.2168428748846054, + "learning_rate": 4.948307098382313e-05, + "loss": 1.77, + "step": 16884 + }, + { + "epoch": 5.1826273787599755, + "grad_norm": 0.19605880975723267, + "learning_rate": 4.947810069217547e-05, + "loss": 1.7292, + "step": 16885 + }, + { + "epoch": 5.182934315531001, + "grad_norm": 0.23066702485084534, + "learning_rate": 4.947313040568551e-05, + "loss": 1.7265, + "step": 16886 + }, + { + "epoch": 5.183241252302026, + "grad_norm": 0.20139534771442413, + "learning_rate": 4.9468160124402386e-05, + "loss": 1.7443, + "step": 16887 + }, + { + "epoch": 5.183548189073051, + "grad_norm": 0.25097572803497314, + "learning_rate": 4.946318984837521e-05, + "loss": 1.7537, + "step": 16888 + }, + { + "epoch": 5.183855125844076, + "grad_norm": 0.26215067505836487, + "learning_rate": 4.945821957765313e-05, + "loss": 1.8397, + "step": 16889 + }, + { + "epoch": 5.184162062615101, + "grad_norm": 0.22072140872478485, + "learning_rate": 4.9453249312285215e-05, + 
"loss": 1.7052, + "step": 16890 + }, + { + "epoch": 5.184468999386127, + "grad_norm": 0.20372305810451508, + "learning_rate": 4.944827905232064e-05, + "loss": 1.7228, + "step": 16891 + }, + { + "epoch": 5.184775936157152, + "grad_norm": 0.20383495092391968, + "learning_rate": 4.944330879780847e-05, + "loss": 1.7063, + "step": 16892 + }, + { + "epoch": 5.185082872928176, + "grad_norm": 0.1903693675994873, + "learning_rate": 4.943833854879786e-05, + "loss": 1.6435, + "step": 16893 + }, + { + "epoch": 5.185389809699202, + "grad_norm": 0.20357775688171387, + "learning_rate": 4.94333683053379e-05, + "loss": 1.7485, + "step": 16894 + }, + { + "epoch": 5.185696746470227, + "grad_norm": 0.24776104092597961, + "learning_rate": 4.942839806747775e-05, + "loss": 1.718, + "step": 16895 + }, + { + "epoch": 5.186003683241252, + "grad_norm": 0.2455051839351654, + "learning_rate": 4.942342783526649e-05, + "loss": 1.7124, + "step": 16896 + }, + { + "epoch": 5.186310620012278, + "grad_norm": 0.2102014273405075, + "learning_rate": 4.941845760875328e-05, + "loss": 1.7584, + "step": 16897 + }, + { + "epoch": 5.186617556783303, + "grad_norm": 0.2177651822566986, + "learning_rate": 4.941348738798718e-05, + "loss": 1.7019, + "step": 16898 + }, + { + "epoch": 5.1869244935543275, + "grad_norm": 0.21296697854995728, + "learning_rate": 4.9408517173017355e-05, + "loss": 1.7299, + "step": 16899 + }, + { + "epoch": 5.187231430325353, + "grad_norm": 0.23485495150089264, + "learning_rate": 4.940354696389292e-05, + "loss": 1.7271, + "step": 16900 + }, + { + "epoch": 5.187538367096378, + "grad_norm": 0.27287766337394714, + "learning_rate": 4.939857676066297e-05, + "loss": 1.7601, + "step": 16901 + }, + { + "epoch": 5.1878453038674035, + "grad_norm": 0.2060246467590332, + "learning_rate": 4.939360656337665e-05, + "loss": 1.7064, + "step": 16902 + }, + { + "epoch": 5.188152240638429, + "grad_norm": 0.25422418117523193, + "learning_rate": 4.938863637208305e-05, + "loss": 1.7423, + "step": 16903 + }, + { 
+ "epoch": 5.188459177409453, + "grad_norm": 0.2798483669757843, + "learning_rate": 4.9383666186831304e-05, + "loss": 1.7132, + "step": 16904 + }, + { + "epoch": 5.188766114180479, + "grad_norm": 0.23505693674087524, + "learning_rate": 4.9378696007670525e-05, + "loss": 1.7759, + "step": 16905 + }, + { + "epoch": 5.189073050951504, + "grad_norm": 0.23761989176273346, + "learning_rate": 4.937372583464987e-05, + "loss": 1.7076, + "step": 16906 + }, + { + "epoch": 5.189379987722529, + "grad_norm": 0.3005945086479187, + "learning_rate": 4.9368755667818385e-05, + "loss": 1.6957, + "step": 16907 + }, + { + "epoch": 5.189686924493555, + "grad_norm": 0.2502881586551666, + "learning_rate": 4.936378550722525e-05, + "loss": 1.7352, + "step": 16908 + }, + { + "epoch": 5.189993861264579, + "grad_norm": 0.24194179475307465, + "learning_rate": 4.9358815352919544e-05, + "loss": 1.738, + "step": 16909 + }, + { + "epoch": 5.190300798035604, + "grad_norm": 0.27478742599487305, + "learning_rate": 4.935384520495041e-05, + "loss": 1.7118, + "step": 16910 + }, + { + "epoch": 5.19060773480663, + "grad_norm": 0.22327560186386108, + "learning_rate": 4.9348875063366944e-05, + "loss": 1.7697, + "step": 16911 + }, + { + "epoch": 5.190914671577655, + "grad_norm": 0.21844418346881866, + "learning_rate": 4.9343904928218295e-05, + "loss": 1.7733, + "step": 16912 + }, + { + "epoch": 5.19122160834868, + "grad_norm": 0.25267866253852844, + "learning_rate": 4.933893479955354e-05, + "loss": 1.7313, + "step": 16913 + }, + { + "epoch": 5.191528545119706, + "grad_norm": 0.22045068442821503, + "learning_rate": 4.933396467742185e-05, + "loss": 1.7856, + "step": 16914 + }, + { + "epoch": 5.19183548189073, + "grad_norm": 0.22642305493354797, + "learning_rate": 4.932899456187229e-05, + "loss": 1.7326, + "step": 16915 + }, + { + "epoch": 5.1921424186617555, + "grad_norm": 0.20601733028888702, + "learning_rate": 4.9324024452953995e-05, + "loss": 1.7743, + "step": 16916 + }, + { + "epoch": 5.192449355432781, + 
"grad_norm": 0.25580671429634094, + "learning_rate": 4.931905435071611e-05, + "loss": 1.7705, + "step": 16917 + }, + { + "epoch": 5.192756292203806, + "grad_norm": 0.38173142075538635, + "learning_rate": 4.9314084255207706e-05, + "loss": 1.7504, + "step": 16918 + }, + { + "epoch": 5.1930632289748315, + "grad_norm": 0.2254420667886734, + "learning_rate": 4.930911416647794e-05, + "loss": 1.7344, + "step": 16919 + }, + { + "epoch": 5.193370165745856, + "grad_norm": 0.2354312688112259, + "learning_rate": 4.9304144084575896e-05, + "loss": 1.7607, + "step": 16920 + }, + { + "epoch": 5.193677102516881, + "grad_norm": 0.23879510164260864, + "learning_rate": 4.9299174009550716e-05, + "loss": 1.683, + "step": 16921 + }, + { + "epoch": 5.193984039287907, + "grad_norm": 0.228669211268425, + "learning_rate": 4.9294203941451494e-05, + "loss": 1.7776, + "step": 16922 + }, + { + "epoch": 5.194290976058932, + "grad_norm": 0.2266843616962433, + "learning_rate": 4.928923388032739e-05, + "loss": 1.7563, + "step": 16923 + }, + { + "epoch": 5.194597912829957, + "grad_norm": 0.2581404745578766, + "learning_rate": 4.928426382622747e-05, + "loss": 1.8112, + "step": 16924 + }, + { + "epoch": 5.194904849600983, + "grad_norm": 0.25179803371429443, + "learning_rate": 4.92792937792009e-05, + "loss": 1.7661, + "step": 16925 + }, + { + "epoch": 5.195211786372007, + "grad_norm": 0.23408514261245728, + "learning_rate": 4.9274323739296746e-05, + "loss": 1.7618, + "step": 16926 + }, + { + "epoch": 5.195518723143032, + "grad_norm": 0.23110872507095337, + "learning_rate": 4.926935370656416e-05, + "loss": 1.6945, + "step": 16927 + }, + { + "epoch": 5.195825659914058, + "grad_norm": 0.2863025665283203, + "learning_rate": 4.926438368105224e-05, + "loss": 1.8659, + "step": 16928 + }, + { + "epoch": 5.196132596685083, + "grad_norm": 0.2156454175710678, + "learning_rate": 4.925941366281013e-05, + "loss": 1.7281, + "step": 16929 + }, + { + "epoch": 5.196439533456108, + "grad_norm": 0.2338300198316574, + 
"learning_rate": 4.925444365188691e-05, + "loss": 1.7271, + "step": 16930 + }, + { + "epoch": 5.196746470227133, + "grad_norm": 0.21434102952480316, + "learning_rate": 4.924947364833173e-05, + "loss": 1.7342, + "step": 16931 + }, + { + "epoch": 5.197053406998158, + "grad_norm": 0.21619778871536255, + "learning_rate": 4.924450365219369e-05, + "loss": 1.7493, + "step": 16932 + }, + { + "epoch": 5.1973603437691835, + "grad_norm": 0.24532032012939453, + "learning_rate": 4.9239533663521896e-05, + "loss": 1.7707, + "step": 16933 + }, + { + "epoch": 5.197667280540209, + "grad_norm": 0.21795547008514404, + "learning_rate": 4.923456368236549e-05, + "loss": 1.7642, + "step": 16934 + }, + { + "epoch": 5.197974217311234, + "grad_norm": 0.2070101797580719, + "learning_rate": 4.922959370877356e-05, + "loss": 1.7377, + "step": 16935 + }, + { + "epoch": 5.198281154082259, + "grad_norm": 0.22546489536762238, + "learning_rate": 4.9224623742795256e-05, + "loss": 1.7766, + "step": 16936 + }, + { + "epoch": 5.198588090853284, + "grad_norm": 0.20723624527454376, + "learning_rate": 4.921965378447965e-05, + "loss": 1.7316, + "step": 16937 + }, + { + "epoch": 5.198895027624309, + "grad_norm": 0.21870547533035278, + "learning_rate": 4.9214683833875905e-05, + "loss": 1.7653, + "step": 16938 + }, + { + "epoch": 5.199201964395335, + "grad_norm": 0.19606490433216095, + "learning_rate": 4.920971389103309e-05, + "loss": 1.7181, + "step": 16939 + }, + { + "epoch": 5.19950890116636, + "grad_norm": 0.18372730910778046, + "learning_rate": 4.920474395600037e-05, + "loss": 1.7041, + "step": 16940 + }, + { + "epoch": 5.199815837937384, + "grad_norm": 0.22051765024662018, + "learning_rate": 4.919977402882682e-05, + "loss": 1.7172, + "step": 16941 + }, + { + "epoch": 5.20012277470841, + "grad_norm": 0.2135835587978363, + "learning_rate": 4.919480410956159e-05, + "loss": 1.6918, + "step": 16942 + }, + { + "epoch": 5.200429711479435, + "grad_norm": 0.19619768857955933, + "learning_rate": 
4.918983419825376e-05, + "loss": 1.7005, + "step": 16943 + }, + { + "epoch": 5.2007366482504604, + "grad_norm": 0.22726574540138245, + "learning_rate": 4.918486429495246e-05, + "loss": 1.6775, + "step": 16944 + }, + { + "epoch": 5.201043585021486, + "grad_norm": 0.21471361815929413, + "learning_rate": 4.9179894399706815e-05, + "loss": 1.7102, + "step": 16945 + }, + { + "epoch": 5.201350521792511, + "grad_norm": 0.20113740861415863, + "learning_rate": 4.917492451256595e-05, + "loss": 1.7548, + "step": 16946 + }, + { + "epoch": 5.201657458563536, + "grad_norm": 0.2337827831506729, + "learning_rate": 4.916995463357894e-05, + "loss": 1.818, + "step": 16947 + }, + { + "epoch": 5.201964395334561, + "grad_norm": 0.2649554908275604, + "learning_rate": 4.9164984762794955e-05, + "loss": 1.7784, + "step": 16948 + }, + { + "epoch": 5.202271332105586, + "grad_norm": 0.2297617793083191, + "learning_rate": 4.916001490026306e-05, + "loss": 1.7484, + "step": 16949 + }, + { + "epoch": 5.202578268876612, + "grad_norm": 0.20791979134082794, + "learning_rate": 4.915504504603238e-05, + "loss": 1.7164, + "step": 16950 + }, + { + "epoch": 5.202885205647637, + "grad_norm": 0.21769596636295319, + "learning_rate": 4.915007520015207e-05, + "loss": 1.7783, + "step": 16951 + }, + { + "epoch": 5.203192142418661, + "grad_norm": 0.21038469672203064, + "learning_rate": 4.914510536267118e-05, + "loss": 1.6863, + "step": 16952 + }, + { + "epoch": 5.203499079189687, + "grad_norm": 0.20725449919700623, + "learning_rate": 4.914013553363889e-05, + "loss": 1.6855, + "step": 16953 + }, + { + "epoch": 5.203806015960712, + "grad_norm": 0.23879854381084442, + "learning_rate": 4.9135165713104266e-05, + "loss": 1.6986, + "step": 16954 + }, + { + "epoch": 5.204112952731737, + "grad_norm": 0.20515915751457214, + "learning_rate": 4.913019590111645e-05, + "loss": 1.6912, + "step": 16955 + }, + { + "epoch": 5.204419889502763, + "grad_norm": 0.2252528965473175, + "learning_rate": 4.912522609772453e-05, + "loss": 
1.6974, + "step": 16956 + }, + { + "epoch": 5.204726826273788, + "grad_norm": 0.1946130096912384, + "learning_rate": 4.9120256302977665e-05, + "loss": 1.7009, + "step": 16957 + }, + { + "epoch": 5.2050337630448125, + "grad_norm": 0.21323645114898682, + "learning_rate": 4.9115286516924925e-05, + "loss": 1.7746, + "step": 16958 + }, + { + "epoch": 5.205340699815838, + "grad_norm": 0.20721712708473206, + "learning_rate": 4.911031673961546e-05, + "loss": 1.7103, + "step": 16959 + }, + { + "epoch": 5.205647636586863, + "grad_norm": 0.19630689918994904, + "learning_rate": 4.910534697109834e-05, + "loss": 1.7042, + "step": 16960 + }, + { + "epoch": 5.2059545733578885, + "grad_norm": 0.2036786526441574, + "learning_rate": 4.910037721142273e-05, + "loss": 1.7713, + "step": 16961 + }, + { + "epoch": 5.206261510128914, + "grad_norm": 0.20518352091312408, + "learning_rate": 4.9095407460637696e-05, + "loss": 1.7456, + "step": 16962 + }, + { + "epoch": 5.206568446899938, + "grad_norm": 0.199858620762825, + "learning_rate": 4.9090437718792404e-05, + "loss": 1.7598, + "step": 16963 + }, + { + "epoch": 5.206875383670964, + "grad_norm": 0.22860252857208252, + "learning_rate": 4.9085467985935914e-05, + "loss": 1.7947, + "step": 16964 + }, + { + "epoch": 5.207182320441989, + "grad_norm": 0.22179929912090302, + "learning_rate": 4.9080498262117395e-05, + "loss": 1.7537, + "step": 16965 + }, + { + "epoch": 5.207489257213014, + "grad_norm": 0.24737581610679626, + "learning_rate": 4.9075528547385906e-05, + "loss": 1.7932, + "step": 16966 + }, + { + "epoch": 5.20779619398404, + "grad_norm": 0.2653762400150299, + "learning_rate": 4.907055884179059e-05, + "loss": 1.7683, + "step": 16967 + }, + { + "epoch": 5.208103130755064, + "grad_norm": 0.2891876697540283, + "learning_rate": 4.9065589145380564e-05, + "loss": 1.7867, + "step": 16968 + }, + { + "epoch": 5.208410067526089, + "grad_norm": 0.23162086308002472, + "learning_rate": 4.906061945820492e-05, + "loss": 1.7981, + "step": 16969 + }, + { 
+ "epoch": 5.208717004297115, + "grad_norm": 0.2746187150478363, + "learning_rate": 4.9055649780312805e-05, + "loss": 1.7215, + "step": 16970 + }, + { + "epoch": 5.20902394106814, + "grad_norm": 0.3217853605747223, + "learning_rate": 4.905068011175329e-05, + "loss": 1.8027, + "step": 16971 + }, + { + "epoch": 5.209330877839165, + "grad_norm": 0.21517686545848846, + "learning_rate": 4.904571045257553e-05, + "loss": 1.7055, + "step": 16972 + }, + { + "epoch": 5.209637814610191, + "grad_norm": 0.23613709211349487, + "learning_rate": 4.90407408028286e-05, + "loss": 1.751, + "step": 16973 + }, + { + "epoch": 5.209944751381215, + "grad_norm": 0.35093945264816284, + "learning_rate": 4.903577116256165e-05, + "loss": 1.7749, + "step": 16974 + }, + { + "epoch": 5.2102516881522405, + "grad_norm": 0.3289217948913574, + "learning_rate": 4.903080153182376e-05, + "loss": 1.7722, + "step": 16975 + }, + { + "epoch": 5.210558624923266, + "grad_norm": 0.29387256503105164, + "learning_rate": 4.9025831910664074e-05, + "loss": 1.8121, + "step": 16976 + }, + { + "epoch": 5.210865561694291, + "grad_norm": 0.44418805837631226, + "learning_rate": 4.9020862299131664e-05, + "loss": 1.7744, + "step": 16977 + }, + { + "epoch": 5.2111724984653165, + "grad_norm": 0.39242252707481384, + "learning_rate": 4.901589269727568e-05, + "loss": 1.7183, + "step": 16978 + }, + { + "epoch": 5.211479435236341, + "grad_norm": 0.2028690129518509, + "learning_rate": 4.901092310514522e-05, + "loss": 1.7101, + "step": 16979 + }, + { + "epoch": 5.211786372007366, + "grad_norm": 0.4025843143463135, + "learning_rate": 4.900595352278941e-05, + "loss": 1.7545, + "step": 16980 + }, + { + "epoch": 5.212093308778392, + "grad_norm": 0.284568727016449, + "learning_rate": 4.900098395025733e-05, + "loss": 1.7758, + "step": 16981 + }, + { + "epoch": 5.212400245549417, + "grad_norm": 0.2527516484260559, + "learning_rate": 4.899601438759813e-05, + "loss": 1.695, + "step": 16982 + }, + { + "epoch": 5.212707182320442, + 
"grad_norm": 0.3063630759716034, + "learning_rate": 4.89910448348609e-05, + "loss": 1.714, + "step": 16983 + }, + { + "epoch": 5.213014119091467, + "grad_norm": 0.22754468023777008, + "learning_rate": 4.898607529209474e-05, + "loss": 1.8315, + "step": 16984 + }, + { + "epoch": 5.213321055862492, + "grad_norm": 0.29594969749450684, + "learning_rate": 4.89811057593488e-05, + "loss": 1.6669, + "step": 16985 + }, + { + "epoch": 5.213627992633517, + "grad_norm": 0.21486569941043854, + "learning_rate": 4.897613623667215e-05, + "loss": 1.7425, + "step": 16986 + }, + { + "epoch": 5.213934929404543, + "grad_norm": 0.30908775329589844, + "learning_rate": 4.897116672411395e-05, + "loss": 1.7915, + "step": 16987 + }, + { + "epoch": 5.214241866175568, + "grad_norm": 0.23515601456165314, + "learning_rate": 4.896619722172325e-05, + "loss": 1.7226, + "step": 16988 + }, + { + "epoch": 5.214548802946593, + "grad_norm": 0.2847287952899933, + "learning_rate": 4.8961227729549215e-05, + "loss": 1.7641, + "step": 16989 + }, + { + "epoch": 5.214855739717618, + "grad_norm": 0.2986287772655487, + "learning_rate": 4.895625824764092e-05, + "loss": 1.8025, + "step": 16990 + }, + { + "epoch": 5.215162676488643, + "grad_norm": 0.23454971611499786, + "learning_rate": 4.8951288776047514e-05, + "loss": 1.7057, + "step": 16991 + }, + { + "epoch": 5.2154696132596685, + "grad_norm": 0.2578633725643158, + "learning_rate": 4.894631931481807e-05, + "loss": 1.7267, + "step": 16992 + }, + { + "epoch": 5.215776550030694, + "grad_norm": 0.29975566267967224, + "learning_rate": 4.894134986400174e-05, + "loss": 1.7452, + "step": 16993 + }, + { + "epoch": 5.216083486801719, + "grad_norm": 0.22313638031482697, + "learning_rate": 4.893638042364758e-05, + "loss": 1.6917, + "step": 16994 + }, + { + "epoch": 5.216390423572744, + "grad_norm": 0.258297860622406, + "learning_rate": 4.893141099380475e-05, + "loss": 1.7816, + "step": 16995 + }, + { + "epoch": 5.216697360343769, + "grad_norm": 0.2656872272491455, + 
"learning_rate": 4.892644157452233e-05, + "loss": 1.7248, + "step": 16996 + }, + { + "epoch": 5.217004297114794, + "grad_norm": 0.20239698886871338, + "learning_rate": 4.8921472165849464e-05, + "loss": 1.7629, + "step": 16997 + }, + { + "epoch": 5.21731123388582, + "grad_norm": 0.2575492262840271, + "learning_rate": 4.891650276783523e-05, + "loss": 1.719, + "step": 16998 + }, + { + "epoch": 5.217618170656845, + "grad_norm": 0.27563637495040894, + "learning_rate": 4.8911533380528756e-05, + "loss": 1.718, + "step": 16999 + }, + { + "epoch": 5.21792510742787, + "grad_norm": 0.1969723105430603, + "learning_rate": 4.890656400397915e-05, + "loss": 1.7557, + "step": 17000 + }, + { + "epoch": 5.218232044198895, + "grad_norm": 0.24336831271648407, + "learning_rate": 4.89015946382355e-05, + "loss": 1.6861, + "step": 17001 + }, + { + "epoch": 5.21853898096992, + "grad_norm": 0.2804388403892517, + "learning_rate": 4.889662528334696e-05, + "loss": 1.7411, + "step": 17002 + }, + { + "epoch": 5.218845917740945, + "grad_norm": 0.21116352081298828, + "learning_rate": 4.8891655939362596e-05, + "loss": 1.7135, + "step": 17003 + }, + { + "epoch": 5.219152854511971, + "grad_norm": 0.21042904257774353, + "learning_rate": 4.8886686606331556e-05, + "loss": 1.7224, + "step": 17004 + }, + { + "epoch": 5.219459791282996, + "grad_norm": 0.22463755309581757, + "learning_rate": 4.888171728430291e-05, + "loss": 1.8272, + "step": 17005 + }, + { + "epoch": 5.2197667280540205, + "grad_norm": 0.25604158639907837, + "learning_rate": 4.8876747973325805e-05, + "loss": 1.674, + "step": 17006 + }, + { + "epoch": 5.220073664825046, + "grad_norm": 0.3108421564102173, + "learning_rate": 4.887177867344932e-05, + "loss": 1.761, + "step": 17007 + }, + { + "epoch": 5.220380601596071, + "grad_norm": 0.25135359168052673, + "learning_rate": 4.88668093847226e-05, + "loss": 1.7455, + "step": 17008 + }, + { + "epoch": 5.2206875383670965, + "grad_norm": 0.24508307874202728, + "learning_rate": 4.886184010719471e-05, + 
"loss": 1.7632, + "step": 17009 + }, + { + "epoch": 5.220994475138122, + "grad_norm": 0.26777148246765137, + "learning_rate": 4.8856870840914816e-05, + "loss": 1.7814, + "step": 17010 + }, + { + "epoch": 5.221301411909146, + "grad_norm": 0.22404739260673523, + "learning_rate": 4.8851901585931967e-05, + "loss": 1.7441, + "step": 17011 + }, + { + "epoch": 5.221608348680172, + "grad_norm": 0.2406606674194336, + "learning_rate": 4.884693234229531e-05, + "loss": 1.7789, + "step": 17012 + }, + { + "epoch": 5.221915285451197, + "grad_norm": 0.27320384979248047, + "learning_rate": 4.884196311005394e-05, + "loss": 1.8046, + "step": 17013 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.3393586277961731, + "learning_rate": 4.8836993889256965e-05, + "loss": 1.7155, + "step": 17014 + }, + { + "epoch": 5.222529158993248, + "grad_norm": 0.3069504499435425, + "learning_rate": 4.88320246799535e-05, + "loss": 1.6985, + "step": 17015 + }, + { + "epoch": 5.222836095764273, + "grad_norm": 0.22184616327285767, + "learning_rate": 4.8827055482192664e-05, + "loss": 1.7996, + "step": 17016 + }, + { + "epoch": 5.223143032535297, + "grad_norm": 0.2791864573955536, + "learning_rate": 4.8822086296023544e-05, + "loss": 1.7223, + "step": 17017 + }, + { + "epoch": 5.223449969306323, + "grad_norm": 0.259726345539093, + "learning_rate": 4.8817117121495245e-05, + "loss": 1.7481, + "step": 17018 + }, + { + "epoch": 5.223756906077348, + "grad_norm": 0.19968681037425995, + "learning_rate": 4.8812147958656916e-05, + "loss": 1.702, + "step": 17019 + }, + { + "epoch": 5.224063842848373, + "grad_norm": 0.20161856710910797, + "learning_rate": 4.8807178807557616e-05, + "loss": 1.6689, + "step": 17020 + }, + { + "epoch": 5.224370779619399, + "grad_norm": 0.2365240454673767, + "learning_rate": 4.880220966824649e-05, + "loss": 1.7742, + "step": 17021 + }, + { + "epoch": 5.224677716390423, + "grad_norm": 0.20116381347179413, + "learning_rate": 4.879724054077261e-05, + "loss": 1.7584, + "step": 17022 + }, 
+ { + "epoch": 5.2249846531614486, + "grad_norm": 0.22845037281513214, + "learning_rate": 4.879227142518511e-05, + "loss": 1.7794, + "step": 17023 + }, + { + "epoch": 5.225291589932474, + "grad_norm": 0.251724511384964, + "learning_rate": 4.87873023215331e-05, + "loss": 1.7722, + "step": 17024 + }, + { + "epoch": 5.225598526703499, + "grad_norm": 0.206145241856575, + "learning_rate": 4.878233322986568e-05, + "loss": 1.7452, + "step": 17025 + }, + { + "epoch": 5.225905463474525, + "grad_norm": 0.24065247178077698, + "learning_rate": 4.877736415023194e-05, + "loss": 1.8144, + "step": 17026 + }, + { + "epoch": 5.226212400245549, + "grad_norm": 0.2255484163761139, + "learning_rate": 4.877239508268103e-05, + "loss": 1.706, + "step": 17027 + }, + { + "epoch": 5.226519337016574, + "grad_norm": 0.21035850048065186, + "learning_rate": 4.8767426027262e-05, + "loss": 1.7167, + "step": 17028 + }, + { + "epoch": 5.2268262737876, + "grad_norm": 0.19618964195251465, + "learning_rate": 4.8762456984024025e-05, + "loss": 1.7063, + "step": 17029 + }, + { + "epoch": 5.227133210558625, + "grad_norm": 0.19595398008823395, + "learning_rate": 4.875748795301614e-05, + "loss": 1.7452, + "step": 17030 + }, + { + "epoch": 5.22744014732965, + "grad_norm": 0.22870996594429016, + "learning_rate": 4.8752518934287506e-05, + "loss": 1.8169, + "step": 17031 + }, + { + "epoch": 5.227747084100676, + "grad_norm": 0.24048443138599396, + "learning_rate": 4.87475499278872e-05, + "loss": 1.6988, + "step": 17032 + }, + { + "epoch": 5.2280540208717, + "grad_norm": 0.24177183210849762, + "learning_rate": 4.8742580933864356e-05, + "loss": 1.77, + "step": 17033 + }, + { + "epoch": 5.2283609576427255, + "grad_norm": 0.2023085057735443, + "learning_rate": 4.873761195226806e-05, + "loss": 1.7, + "step": 17034 + }, + { + "epoch": 5.228667894413751, + "grad_norm": 0.2614101767539978, + "learning_rate": 4.873264298314742e-05, + "loss": 1.767, + "step": 17035 + }, + { + "epoch": 5.228974831184776, + "grad_norm": 
0.19607602059841156, + "learning_rate": 4.872767402655154e-05, + "loss": 1.7391, + "step": 17036 + }, + { + "epoch": 5.2292817679558015, + "grad_norm": 0.2053994983434677, + "learning_rate": 4.872270508252953e-05, + "loss": 1.7155, + "step": 17037 + }, + { + "epoch": 5.229588704726826, + "grad_norm": 0.18256273865699768, + "learning_rate": 4.871773615113051e-05, + "loss": 1.6999, + "step": 17038 + }, + { + "epoch": 5.229895641497851, + "grad_norm": 0.21956393122673035, + "learning_rate": 4.871276723240356e-05, + "loss": 1.7946, + "step": 17039 + }, + { + "epoch": 5.230202578268877, + "grad_norm": 0.23779109120368958, + "learning_rate": 4.870779832639781e-05, + "loss": 1.8063, + "step": 17040 + }, + { + "epoch": 5.230509515039902, + "grad_norm": 0.21662941575050354, + "learning_rate": 4.8702829433162346e-05, + "loss": 1.7276, + "step": 17041 + }, + { + "epoch": 5.230816451810927, + "grad_norm": 0.21578755974769592, + "learning_rate": 4.869786055274628e-05, + "loss": 1.7577, + "step": 17042 + }, + { + "epoch": 5.231123388581952, + "grad_norm": 0.23229347169399261, + "learning_rate": 4.8692891685198715e-05, + "loss": 1.7884, + "step": 17043 + }, + { + "epoch": 5.231430325352977, + "grad_norm": 0.2302366942167282, + "learning_rate": 4.868792283056878e-05, + "loss": 1.7823, + "step": 17044 + }, + { + "epoch": 5.231737262124002, + "grad_norm": 0.2181033343076706, + "learning_rate": 4.868295398890554e-05, + "loss": 1.7027, + "step": 17045 + }, + { + "epoch": 5.232044198895028, + "grad_norm": 0.20863409340381622, + "learning_rate": 4.8677985160258135e-05, + "loss": 1.7247, + "step": 17046 + }, + { + "epoch": 5.232351135666053, + "grad_norm": 0.2242976278066635, + "learning_rate": 4.867301634467564e-05, + "loss": 1.7799, + "step": 17047 + }, + { + "epoch": 5.232658072437078, + "grad_norm": 0.19934964179992676, + "learning_rate": 4.866804754220719e-05, + "loss": 1.6973, + "step": 17048 + }, + { + "epoch": 5.232965009208103, + "grad_norm": 0.22056198120117188, + 
"learning_rate": 4.8663078752901855e-05, + "loss": 1.7677, + "step": 17049 + }, + { + "epoch": 5.233271945979128, + "grad_norm": 0.2303200513124466, + "learning_rate": 4.865810997680879e-05, + "loss": 1.7517, + "step": 17050 + }, + { + "epoch": 5.2335788827501535, + "grad_norm": 0.21193410456180573, + "learning_rate": 4.8653141213977066e-05, + "loss": 1.7478, + "step": 17051 + }, + { + "epoch": 5.233885819521179, + "grad_norm": 0.18498395383358002, + "learning_rate": 4.864817246445577e-05, + "loss": 1.6891, + "step": 17052 + }, + { + "epoch": 5.234192756292204, + "grad_norm": 0.22879233956336975, + "learning_rate": 4.8643203728294036e-05, + "loss": 1.7166, + "step": 17053 + }, + { + "epoch": 5.234499693063229, + "grad_norm": 0.2128525823354721, + "learning_rate": 4.8638235005540944e-05, + "loss": 1.7993, + "step": 17054 + }, + { + "epoch": 5.234806629834254, + "grad_norm": 0.21245025098323822, + "learning_rate": 4.8633266296245634e-05, + "loss": 1.7436, + "step": 17055 + }, + { + "epoch": 5.235113566605279, + "grad_norm": 0.20301629602909088, + "learning_rate": 4.8628297600457165e-05, + "loss": 1.7774, + "step": 17056 + }, + { + "epoch": 5.235420503376305, + "grad_norm": 0.23251961171627045, + "learning_rate": 4.8623328918224687e-05, + "loss": 1.7897, + "step": 17057 + }, + { + "epoch": 5.23572744014733, + "grad_norm": 0.2272956669330597, + "learning_rate": 4.861836024959726e-05, + "loss": 1.7668, + "step": 17058 + }, + { + "epoch": 5.236034376918354, + "grad_norm": 0.20540569722652435, + "learning_rate": 4.8613391594624013e-05, + "loss": 1.7549, + "step": 17059 + }, + { + "epoch": 5.23634131368938, + "grad_norm": 0.20306967198848724, + "learning_rate": 4.8608422953354034e-05, + "loss": 1.6993, + "step": 17060 + }, + { + "epoch": 5.236648250460405, + "grad_norm": 0.19415293633937836, + "learning_rate": 4.8603454325836455e-05, + "loss": 1.7313, + "step": 17061 + }, + { + "epoch": 5.23695518723143, + "grad_norm": 0.2058337777853012, + "learning_rate": 
4.859848571212034e-05, + "loss": 1.7994, + "step": 17062 + }, + { + "epoch": 5.237262124002456, + "grad_norm": 0.24489709734916687, + "learning_rate": 4.859351711225483e-05, + "loss": 1.7555, + "step": 17063 + }, + { + "epoch": 5.237569060773481, + "grad_norm": 0.22589795291423798, + "learning_rate": 4.858854852628899e-05, + "loss": 1.7136, + "step": 17064 + }, + { + "epoch": 5.2378759975445055, + "grad_norm": 0.21404492855072021, + "learning_rate": 4.858357995427195e-05, + "loss": 1.7598, + "step": 17065 + }, + { + "epoch": 5.238182934315531, + "grad_norm": 0.24936965107917786, + "learning_rate": 4.8578611396252786e-05, + "loss": 1.8027, + "step": 17066 + }, + { + "epoch": 5.238489871086556, + "grad_norm": 0.23391515016555786, + "learning_rate": 4.857364285228065e-05, + "loss": 1.7704, + "step": 17067 + }, + { + "epoch": 5.2387968078575815, + "grad_norm": 0.22633357346057892, + "learning_rate": 4.85686743224046e-05, + "loss": 1.7075, + "step": 17068 + }, + { + "epoch": 5.239103744628607, + "grad_norm": 0.221492201089859, + "learning_rate": 4.8563705806673736e-05, + "loss": 1.7755, + "step": 17069 + }, + { + "epoch": 5.239410681399631, + "grad_norm": 0.2381046712398529, + "learning_rate": 4.855873730513719e-05, + "loss": 1.7971, + "step": 17070 + }, + { + "epoch": 5.239717618170657, + "grad_norm": 0.21930988132953644, + "learning_rate": 4.855376881784402e-05, + "loss": 1.7295, + "step": 17071 + }, + { + "epoch": 5.240024554941682, + "grad_norm": 0.20897921919822693, + "learning_rate": 4.854880034484339e-05, + "loss": 1.7796, + "step": 17072 + }, + { + "epoch": 5.240331491712707, + "grad_norm": 0.26616254448890686, + "learning_rate": 4.8543831886184334e-05, + "loss": 1.7095, + "step": 17073 + }, + { + "epoch": 5.240638428483733, + "grad_norm": 0.19513870775699615, + "learning_rate": 4.853886344191601e-05, + "loss": 1.7181, + "step": 17074 + }, + { + "epoch": 5.240945365254758, + "grad_norm": 0.23476530611515045, + "learning_rate": 4.853389501208747e-05, + "loss": 
1.7928, + "step": 17075 + }, + { + "epoch": 5.241252302025782, + "grad_norm": 0.18197014927864075, + "learning_rate": 4.852892659674785e-05, + "loss": 1.6888, + "step": 17076 + }, + { + "epoch": 5.241559238796808, + "grad_norm": 0.20317208766937256, + "learning_rate": 4.852395819594623e-05, + "loss": 1.7828, + "step": 17077 + }, + { + "epoch": 5.241866175567833, + "grad_norm": 0.1953772008419037, + "learning_rate": 4.851898980973175e-05, + "loss": 1.7394, + "step": 17078 + }, + { + "epoch": 5.242173112338858, + "grad_norm": 0.19714407622814178, + "learning_rate": 4.851402143815345e-05, + "loss": 1.7261, + "step": 17079 + }, + { + "epoch": 5.242480049109884, + "grad_norm": 0.2196008861064911, + "learning_rate": 4.850905308126048e-05, + "loss": 1.7387, + "step": 17080 + }, + { + "epoch": 5.242786985880908, + "grad_norm": 0.2337818443775177, + "learning_rate": 4.85040847391019e-05, + "loss": 1.7448, + "step": 17081 + }, + { + "epoch": 5.2430939226519335, + "grad_norm": 0.20940040051937103, + "learning_rate": 4.849911641172685e-05, + "loss": 1.7354, + "step": 17082 + }, + { + "epoch": 5.243400859422959, + "grad_norm": 0.2242170125246048, + "learning_rate": 4.849414809918439e-05, + "loss": 1.7325, + "step": 17083 + }, + { + "epoch": 5.243707796193984, + "grad_norm": 0.2322687953710556, + "learning_rate": 4.8489179801523675e-05, + "loss": 1.7557, + "step": 17084 + }, + { + "epoch": 5.2440147329650095, + "grad_norm": 0.20303767919540405, + "learning_rate": 4.8484211518793764e-05, + "loss": 1.7063, + "step": 17085 + }, + { + "epoch": 5.244321669736034, + "grad_norm": 0.2446853369474411, + "learning_rate": 4.8479243251043746e-05, + "loss": 1.7587, + "step": 17086 + }, + { + "epoch": 5.244628606507059, + "grad_norm": 0.22901636362075806, + "learning_rate": 4.8474274998322735e-05, + "loss": 1.7992, + "step": 17087 + }, + { + "epoch": 5.244935543278085, + "grad_norm": 0.29676303267478943, + "learning_rate": 4.846930676067984e-05, + "loss": 1.7688, + "step": 17088 + }, + { + 
"epoch": 5.24524248004911, + "grad_norm": 0.24160240590572357, + "learning_rate": 4.846433853816416e-05, + "loss": 1.7367, + "step": 17089 + }, + { + "epoch": 5.245549416820135, + "grad_norm": 0.2097402662038803, + "learning_rate": 4.8459370330824774e-05, + "loss": 1.721, + "step": 17090 + }, + { + "epoch": 5.245856353591161, + "grad_norm": 0.26451143622398376, + "learning_rate": 4.8454402138710814e-05, + "loss": 1.7707, + "step": 17091 + }, + { + "epoch": 5.246163290362185, + "grad_norm": 0.30428358912467957, + "learning_rate": 4.844943396187133e-05, + "loss": 1.7232, + "step": 17092 + }, + { + "epoch": 5.24647022713321, + "grad_norm": 0.24332918226718903, + "learning_rate": 4.8444465800355466e-05, + "loss": 1.8215, + "step": 17093 + }, + { + "epoch": 5.246777163904236, + "grad_norm": 0.292703777551651, + "learning_rate": 4.843949765421229e-05, + "loss": 1.7199, + "step": 17094 + }, + { + "epoch": 5.247084100675261, + "grad_norm": 0.2458789199590683, + "learning_rate": 4.843452952349094e-05, + "loss": 1.7615, + "step": 17095 + }, + { + "epoch": 5.247391037446286, + "grad_norm": 0.22538037598133087, + "learning_rate": 4.842956140824045e-05, + "loss": 1.7279, + "step": 17096 + }, + { + "epoch": 5.247697974217311, + "grad_norm": 0.2959176003932953, + "learning_rate": 4.842459330850999e-05, + "loss": 1.767, + "step": 17097 + }, + { + "epoch": 5.248004910988336, + "grad_norm": 0.26158571243286133, + "learning_rate": 4.84196252243486e-05, + "loss": 1.7387, + "step": 17098 + }, + { + "epoch": 5.2483118477593615, + "grad_norm": 0.22855687141418457, + "learning_rate": 4.84146571558054e-05, + "loss": 1.7497, + "step": 17099 + }, + { + "epoch": 5.248618784530387, + "grad_norm": 0.22470593452453613, + "learning_rate": 4.840968910292949e-05, + "loss": 1.7705, + "step": 17100 + }, + { + "epoch": 5.248925721301412, + "grad_norm": 0.24680538475513458, + "learning_rate": 4.840472106576998e-05, + "loss": 1.7426, + "step": 17101 + }, + { + "epoch": 5.249232658072437, + "grad_norm": 
0.23919185996055603, + "learning_rate": 4.839975304437594e-05, + "loss": 1.78, + "step": 17102 + }, + { + "epoch": 5.249539594843462, + "grad_norm": 0.24717366695404053, + "learning_rate": 4.839478503879647e-05, + "loss": 1.7373, + "step": 17103 + }, + { + "epoch": 5.249846531614487, + "grad_norm": 0.20463785529136658, + "learning_rate": 4.838981704908068e-05, + "loss": 1.702, + "step": 17104 + }, + { + "epoch": 5.250153468385513, + "grad_norm": 0.19791419804096222, + "learning_rate": 4.838484907527766e-05, + "loss": 1.746, + "step": 17105 + }, + { + "epoch": 5.250460405156538, + "grad_norm": 0.26169353723526, + "learning_rate": 4.837988111743652e-05, + "loss": 1.7227, + "step": 17106 + }, + { + "epoch": 5.250767341927563, + "grad_norm": 0.23545648157596588, + "learning_rate": 4.837491317560633e-05, + "loss": 1.7104, + "step": 17107 + }, + { + "epoch": 5.251074278698588, + "grad_norm": 0.21569804847240448, + "learning_rate": 4.836994524983622e-05, + "loss": 1.7883, + "step": 17108 + }, + { + "epoch": 5.251381215469613, + "grad_norm": 0.2730300724506378, + "learning_rate": 4.836497734017524e-05, + "loss": 1.7105, + "step": 17109 + }, + { + "epoch": 5.2516881522406385, + "grad_norm": 0.2834697663784027, + "learning_rate": 4.836000944667253e-05, + "loss": 1.8041, + "step": 17110 + }, + { + "epoch": 5.251995089011664, + "grad_norm": 0.31536951661109924, + "learning_rate": 4.835504156937715e-05, + "loss": 1.7708, + "step": 17111 + }, + { + "epoch": 5.252302025782689, + "grad_norm": 0.3830285668373108, + "learning_rate": 4.835007370833824e-05, + "loss": 1.7464, + "step": 17112 + }, + { + "epoch": 5.252608962553714, + "grad_norm": 0.23248349130153656, + "learning_rate": 4.834510586360485e-05, + "loss": 1.7274, + "step": 17113 + }, + { + "epoch": 5.252915899324739, + "grad_norm": 0.4755091071128845, + "learning_rate": 4.834013803522611e-05, + "loss": 1.7853, + "step": 17114 + }, + { + "epoch": 5.253222836095764, + "grad_norm": 0.4267823398113251, + "learning_rate": 
4.8335170223251073e-05, + "loss": 1.7424, + "step": 17115 + }, + { + "epoch": 5.25352977286679, + "grad_norm": 0.17621731758117676, + "learning_rate": 4.8330202427728876e-05, + "loss": 1.7415, + "step": 17116 + }, + { + "epoch": 5.253836709637815, + "grad_norm": 0.37484630942344666, + "learning_rate": 4.832523464870859e-05, + "loss": 1.7357, + "step": 17117 + }, + { + "epoch": 5.25414364640884, + "grad_norm": 0.27773791551589966, + "learning_rate": 4.832026688623933e-05, + "loss": 1.717, + "step": 17118 + }, + { + "epoch": 5.254450583179865, + "grad_norm": 0.31190845370292664, + "learning_rate": 4.8315299140370183e-05, + "loss": 1.7226, + "step": 17119 + }, + { + "epoch": 5.25475751995089, + "grad_norm": 0.4321303367614746, + "learning_rate": 4.8310331411150215e-05, + "loss": 1.8003, + "step": 17120 + }, + { + "epoch": 5.255064456721915, + "grad_norm": 0.31622835993766785, + "learning_rate": 4.830536369862855e-05, + "loss": 1.8462, + "step": 17121 + }, + { + "epoch": 5.255371393492941, + "grad_norm": 0.2144850194454193, + "learning_rate": 4.830039600285427e-05, + "loss": 1.8153, + "step": 17122 + }, + { + "epoch": 5.255678330263966, + "grad_norm": 0.3107511103153229, + "learning_rate": 4.829542832387649e-05, + "loss": 1.7271, + "step": 17123 + }, + { + "epoch": 5.2559852670349905, + "grad_norm": 0.24607159197330475, + "learning_rate": 4.8290460661744265e-05, + "loss": 1.7946, + "step": 17124 + }, + { + "epoch": 5.256292203806016, + "grad_norm": 0.226362943649292, + "learning_rate": 4.828549301650673e-05, + "loss": 1.7338, + "step": 17125 + }, + { + "epoch": 5.256599140577041, + "grad_norm": 0.29993724822998047, + "learning_rate": 4.828052538821294e-05, + "loss": 1.8, + "step": 17126 + }, + { + "epoch": 5.2569060773480665, + "grad_norm": 0.25639984011650085, + "learning_rate": 4.8275557776912014e-05, + "loss": 1.8009, + "step": 17127 + }, + { + "epoch": 5.257213014119092, + "grad_norm": 0.2308105081319809, + "learning_rate": 4.8270590182653024e-05, + "loss": 1.7468, 
+ "step": 17128 + }, + { + "epoch": 5.257519950890116, + "grad_norm": 0.27337542176246643, + "learning_rate": 4.82656226054851e-05, + "loss": 1.7725, + "step": 17129 + }, + { + "epoch": 5.257826887661142, + "grad_norm": 0.24848094582557678, + "learning_rate": 4.826065504545729e-05, + "loss": 1.8084, + "step": 17130 + }, + { + "epoch": 5.258133824432167, + "grad_norm": 0.35026392340660095, + "learning_rate": 4.825568750261872e-05, + "loss": 1.7705, + "step": 17131 + }, + { + "epoch": 5.258440761203192, + "grad_norm": 0.3207968473434448, + "learning_rate": 4.825071997701846e-05, + "loss": 1.7329, + "step": 17132 + }, + { + "epoch": 5.258747697974218, + "grad_norm": 0.20949263870716095, + "learning_rate": 4.8245752468705614e-05, + "loss": 1.7658, + "step": 17133 + }, + { + "epoch": 5.259054634745242, + "grad_norm": 0.3158881366252899, + "learning_rate": 4.824078497772926e-05, + "loss": 1.7249, + "step": 17134 + }, + { + "epoch": 5.259361571516267, + "grad_norm": 0.2283414602279663, + "learning_rate": 4.823581750413852e-05, + "loss": 1.7177, + "step": 17135 + }, + { + "epoch": 5.259668508287293, + "grad_norm": 0.24753578007221222, + "learning_rate": 4.823085004798247e-05, + "loss": 1.7232, + "step": 17136 + }, + { + "epoch": 5.259975445058318, + "grad_norm": 0.20381587743759155, + "learning_rate": 4.822588260931017e-05, + "loss": 1.7049, + "step": 17137 + }, + { + "epoch": 5.260282381829343, + "grad_norm": 0.21220643818378448, + "learning_rate": 4.8220915188170746e-05, + "loss": 1.7221, + "step": 17138 + }, + { + "epoch": 5.260589318600369, + "grad_norm": 0.19324758648872375, + "learning_rate": 4.8215947784613276e-05, + "loss": 1.7168, + "step": 17139 + }, + { + "epoch": 5.260896255371393, + "grad_norm": 0.26500338315963745, + "learning_rate": 4.821098039868688e-05, + "loss": 1.7627, + "step": 17140 + }, + { + "epoch": 5.2612031921424185, + "grad_norm": 0.19597655534744263, + "learning_rate": 4.82060130304406e-05, + "loss": 1.7214, + "step": 17141 + }, + { + "epoch": 
5.261510128913444, + "grad_norm": 0.2105483114719391, + "learning_rate": 4.820104567992357e-05, + "loss": 1.6742, + "step": 17142 + }, + { + "epoch": 5.261817065684469, + "grad_norm": 0.20020028948783875, + "learning_rate": 4.8196078347184837e-05, + "loss": 1.7721, + "step": 17143 + }, + { + "epoch": 5.2621240024554945, + "grad_norm": 0.2313549965620041, + "learning_rate": 4.819111103227353e-05, + "loss": 1.7644, + "step": 17144 + }, + { + "epoch": 5.262430939226519, + "grad_norm": 0.31893789768218994, + "learning_rate": 4.818614373523871e-05, + "loss": 1.747, + "step": 17145 + }, + { + "epoch": 5.262737875997544, + "grad_norm": 0.2531197667121887, + "learning_rate": 4.8181176456129505e-05, + "loss": 1.7713, + "step": 17146 + }, + { + "epoch": 5.26304481276857, + "grad_norm": 0.2063976377248764, + "learning_rate": 4.817620919499496e-05, + "loss": 1.7254, + "step": 17147 + }, + { + "epoch": 5.263351749539595, + "grad_norm": 0.22220590710639954, + "learning_rate": 4.8171241951884204e-05, + "loss": 1.7345, + "step": 17148 + }, + { + "epoch": 5.26365868631062, + "grad_norm": 0.24240384995937347, + "learning_rate": 4.8166274726846286e-05, + "loss": 1.7302, + "step": 17149 + }, + { + "epoch": 5.263965623081646, + "grad_norm": 0.215829998254776, + "learning_rate": 4.8161307519930326e-05, + "loss": 1.7725, + "step": 17150 + }, + { + "epoch": 5.26427255985267, + "grad_norm": 0.2697906494140625, + "learning_rate": 4.815634033118541e-05, + "loss": 1.7156, + "step": 17151 + }, + { + "epoch": 5.264579496623695, + "grad_norm": 0.21649456024169922, + "learning_rate": 4.815137316066061e-05, + "loss": 1.745, + "step": 17152 + }, + { + "epoch": 5.264886433394721, + "grad_norm": 0.22773787379264832, + "learning_rate": 4.8146406008405033e-05, + "loss": 1.7592, + "step": 17153 + }, + { + "epoch": 5.265193370165746, + "grad_norm": 0.2920280396938324, + "learning_rate": 4.8141438874467745e-05, + "loss": 1.8301, + "step": 17154 + }, + { + "epoch": 5.265500306936771, + "grad_norm": 
0.23919162154197693, + "learning_rate": 4.813647175889785e-05, + "loss": 1.7687, + "step": 17155 + }, + { + "epoch": 5.265807243707796, + "grad_norm": 0.24617896974086761, + "learning_rate": 4.8131504661744425e-05, + "loss": 1.8279, + "step": 17156 + }, + { + "epoch": 5.266114180478821, + "grad_norm": 0.22756172716617584, + "learning_rate": 4.812653758305659e-05, + "loss": 1.7595, + "step": 17157 + }, + { + "epoch": 5.2664211172498465, + "grad_norm": 0.22939376533031464, + "learning_rate": 4.812157052288339e-05, + "loss": 1.7445, + "step": 17158 + }, + { + "epoch": 5.266728054020872, + "grad_norm": 0.21021319925785065, + "learning_rate": 4.811660348127395e-05, + "loss": 1.7875, + "step": 17159 + }, + { + "epoch": 5.267034990791897, + "grad_norm": 0.2271810919046402, + "learning_rate": 4.811163645827732e-05, + "loss": 1.74, + "step": 17160 + }, + { + "epoch": 5.267341927562922, + "grad_norm": 0.238374263048172, + "learning_rate": 4.81066694539426e-05, + "loss": 1.7717, + "step": 17161 + }, + { + "epoch": 5.267648864333947, + "grad_norm": 0.20655091106891632, + "learning_rate": 4.8101702468318885e-05, + "loss": 1.7447, + "step": 17162 + }, + { + "epoch": 5.267955801104972, + "grad_norm": 0.24652259051799774, + "learning_rate": 4.809673550145528e-05, + "loss": 1.7755, + "step": 17163 + }, + { + "epoch": 5.268262737875998, + "grad_norm": 0.20256781578063965, + "learning_rate": 4.809176855340083e-05, + "loss": 1.7689, + "step": 17164 + }, + { + "epoch": 5.268569674647023, + "grad_norm": 0.27023112773895264, + "learning_rate": 4.8086801624204665e-05, + "loss": 1.8364, + "step": 17165 + }, + { + "epoch": 5.268876611418047, + "grad_norm": 0.251638799905777, + "learning_rate": 4.808183471391582e-05, + "loss": 1.7924, + "step": 17166 + }, + { + "epoch": 5.269183548189073, + "grad_norm": 0.22897782921791077, + "learning_rate": 4.807686782258342e-05, + "loss": 1.7378, + "step": 17167 + }, + { + "epoch": 5.269490484960098, + "grad_norm": 0.19141456484794617, + "learning_rate": 
4.807190095025655e-05, + "loss": 1.6911, + "step": 17168 + }, + { + "epoch": 5.269797421731123, + "grad_norm": 0.19960568845272064, + "learning_rate": 4.806693409698427e-05, + "loss": 1.71, + "step": 17169 + }, + { + "epoch": 5.270104358502149, + "grad_norm": 0.23332087695598602, + "learning_rate": 4.8061967262815694e-05, + "loss": 1.7993, + "step": 17170 + }, + { + "epoch": 5.270411295273174, + "grad_norm": 0.24831432104110718, + "learning_rate": 4.8057000447799876e-05, + "loss": 1.7459, + "step": 17171 + }, + { + "epoch": 5.2707182320441985, + "grad_norm": 0.24735838174819946, + "learning_rate": 4.805203365198593e-05, + "loss": 1.7751, + "step": 17172 + }, + { + "epoch": 5.271025168815224, + "grad_norm": 0.32630103826522827, + "learning_rate": 4.804706687542291e-05, + "loss": 1.7885, + "step": 17173 + }, + { + "epoch": 5.271332105586249, + "grad_norm": 0.29055842757225037, + "learning_rate": 4.804210011815995e-05, + "loss": 1.6819, + "step": 17174 + }, + { + "epoch": 5.2716390423572745, + "grad_norm": 0.22968806326389313, + "learning_rate": 4.803713338024608e-05, + "loss": 1.8146, + "step": 17175 + }, + { + "epoch": 5.2719459791283, + "grad_norm": 0.23430144786834717, + "learning_rate": 4.8032166661730434e-05, + "loss": 1.7401, + "step": 17176 + }, + { + "epoch": 5.272252915899324, + "grad_norm": 0.26312723755836487, + "learning_rate": 4.802719996266204e-05, + "loss": 1.8319, + "step": 17177 + }, + { + "epoch": 5.27255985267035, + "grad_norm": 0.23715369403362274, + "learning_rate": 4.802223328309003e-05, + "loss": 1.8014, + "step": 17178 + }, + { + "epoch": 5.272866789441375, + "grad_norm": 0.23943877220153809, + "learning_rate": 4.801726662306347e-05, + "loss": 1.7181, + "step": 17179 + }, + { + "epoch": 5.2731737262124, + "grad_norm": 0.2366543412208557, + "learning_rate": 4.8012299982631435e-05, + "loss": 1.6685, + "step": 17180 + }, + { + "epoch": 5.273480662983426, + "grad_norm": 0.20688587427139282, + "learning_rate": 4.8007333361843016e-05, + "loss": 
1.7089, + "step": 17181 + }, + { + "epoch": 5.273787599754451, + "grad_norm": 0.2069951444864273, + "learning_rate": 4.8002366760747314e-05, + "loss": 1.7447, + "step": 17182 + }, + { + "epoch": 5.274094536525475, + "grad_norm": 0.26072344183921814, + "learning_rate": 4.7997400179393374e-05, + "loss": 1.7346, + "step": 17183 + }, + { + "epoch": 5.274401473296501, + "grad_norm": 0.2397938072681427, + "learning_rate": 4.799243361783031e-05, + "loss": 1.7556, + "step": 17184 + }, + { + "epoch": 5.274708410067526, + "grad_norm": 0.23606348037719727, + "learning_rate": 4.798746707610721e-05, + "loss": 1.732, + "step": 17185 + }, + { + "epoch": 5.2750153468385514, + "grad_norm": 0.21078252792358398, + "learning_rate": 4.798250055427311e-05, + "loss": 1.7571, + "step": 17186 + }, + { + "epoch": 5.275322283609577, + "grad_norm": 0.21331414580345154, + "learning_rate": 4.797753405237714e-05, + "loss": 1.732, + "step": 17187 + }, + { + "epoch": 5.275629220380601, + "grad_norm": 0.23700307309627533, + "learning_rate": 4.7972567570468354e-05, + "loss": 1.7354, + "step": 17188 + }, + { + "epoch": 5.275936157151627, + "grad_norm": 0.20519722998142242, + "learning_rate": 4.7967601108595845e-05, + "loss": 1.7435, + "step": 17189 + }, + { + "epoch": 5.276243093922652, + "grad_norm": 0.22358302772045135, + "learning_rate": 4.79626346668087e-05, + "loss": 1.7891, + "step": 17190 + }, + { + "epoch": 5.276550030693677, + "grad_norm": 0.2434413880109787, + "learning_rate": 4.795766824515598e-05, + "loss": 1.814, + "step": 17191 + }, + { + "epoch": 5.276856967464703, + "grad_norm": 0.2198423594236374, + "learning_rate": 4.795270184368678e-05, + "loss": 1.7212, + "step": 17192 + }, + { + "epoch": 5.277163904235728, + "grad_norm": 0.23587806522846222, + "learning_rate": 4.7947735462450205e-05, + "loss": 1.8337, + "step": 17193 + }, + { + "epoch": 5.277470841006752, + "grad_norm": 0.234666645526886, + "learning_rate": 4.794276910149528e-05, + "loss": 1.7548, + "step": 17194 + }, + { + 
"epoch": 5.277777777777778, + "grad_norm": 0.23363247513771057, + "learning_rate": 4.793780276087115e-05, + "loss": 1.7587, + "step": 17195 + }, + { + "epoch": 5.278084714548803, + "grad_norm": 0.23191119730472565, + "learning_rate": 4.793283644062683e-05, + "loss": 1.7691, + "step": 17196 + }, + { + "epoch": 5.278391651319828, + "grad_norm": 0.2363097071647644, + "learning_rate": 4.7927870140811445e-05, + "loss": 1.8139, + "step": 17197 + }, + { + "epoch": 5.278698588090854, + "grad_norm": 0.2852413058280945, + "learning_rate": 4.7922903861474056e-05, + "loss": 1.7905, + "step": 17198 + }, + { + "epoch": 5.279005524861878, + "grad_norm": 0.23633842170238495, + "learning_rate": 4.7917937602663764e-05, + "loss": 1.8014, + "step": 17199 + }, + { + "epoch": 5.2793124616329035, + "grad_norm": 0.27007919549942017, + "learning_rate": 4.791297136442961e-05, + "loss": 1.7242, + "step": 17200 + }, + { + "epoch": 5.279619398403929, + "grad_norm": 0.29482147097587585, + "learning_rate": 4.790800514682072e-05, + "loss": 1.7154, + "step": 17201 + }, + { + "epoch": 5.279926335174954, + "grad_norm": 0.27772340178489685, + "learning_rate": 4.790303894988614e-05, + "loss": 1.7771, + "step": 17202 + }, + { + "epoch": 5.2802332719459795, + "grad_norm": 0.21761848032474518, + "learning_rate": 4.789807277367495e-05, + "loss": 1.6983, + "step": 17203 + }, + { + "epoch": 5.280540208717004, + "grad_norm": 0.22621290385723114, + "learning_rate": 4.789310661823626e-05, + "loss": 1.7667, + "step": 17204 + }, + { + "epoch": 5.280847145488029, + "grad_norm": 0.2284683883190155, + "learning_rate": 4.7888140483619095e-05, + "loss": 1.7419, + "step": 17205 + }, + { + "epoch": 5.281154082259055, + "grad_norm": 0.20145639777183533, + "learning_rate": 4.788317436987259e-05, + "loss": 1.7068, + "step": 17206 + }, + { + "epoch": 5.28146101903008, + "grad_norm": 0.23146072030067444, + "learning_rate": 4.7878208277045775e-05, + "loss": 1.7195, + "step": 17207 + }, + { + "epoch": 5.281767955801105, + 
"grad_norm": 0.24014149606227875, + "learning_rate": 4.787324220518776e-05, + "loss": 1.8148, + "step": 17208 + }, + { + "epoch": 5.28207489257213, + "grad_norm": 0.21067874133586884, + "learning_rate": 4.7868276154347595e-05, + "loss": 1.7754, + "step": 17209 + }, + { + "epoch": 5.282381829343155, + "grad_norm": 0.2313496321439743, + "learning_rate": 4.786331012457441e-05, + "loss": 1.7693, + "step": 17210 + }, + { + "epoch": 5.28268876611418, + "grad_norm": 0.24190983176231384, + "learning_rate": 4.7858344115917214e-05, + "loss": 1.7342, + "step": 17211 + }, + { + "epoch": 5.282995702885206, + "grad_norm": 0.24541905522346497, + "learning_rate": 4.785337812842514e-05, + "loss": 1.7721, + "step": 17212 + }, + { + "epoch": 5.283302639656231, + "grad_norm": 0.21989032626152039, + "learning_rate": 4.784841216214722e-05, + "loss": 1.7522, + "step": 17213 + }, + { + "epoch": 5.283609576427256, + "grad_norm": 0.20637241005897522, + "learning_rate": 4.784344621713256e-05, + "loss": 1.7418, + "step": 17214 + }, + { + "epoch": 5.283916513198281, + "grad_norm": 0.22538220882415771, + "learning_rate": 4.783848029343023e-05, + "loss": 1.8287, + "step": 17215 + }, + { + "epoch": 5.284223449969306, + "grad_norm": 0.24478071928024292, + "learning_rate": 4.7833514391089315e-05, + "loss": 1.7419, + "step": 17216 + }, + { + "epoch": 5.2845303867403315, + "grad_norm": 0.22707650065422058, + "learning_rate": 4.782854851015886e-05, + "loss": 1.7831, + "step": 17217 + }, + { + "epoch": 5.284837323511357, + "grad_norm": 0.2843529284000397, + "learning_rate": 4.7823582650687984e-05, + "loss": 1.7704, + "step": 17218 + }, + { + "epoch": 5.285144260282382, + "grad_norm": 0.21647678315639496, + "learning_rate": 4.781861681272573e-05, + "loss": 1.7514, + "step": 17219 + }, + { + "epoch": 5.285451197053407, + "grad_norm": 0.2279205620288849, + "learning_rate": 4.781365099632117e-05, + "loss": 1.6803, + "step": 17220 + }, + { + "epoch": 5.285758133824432, + "grad_norm": 0.2287401556968689, + 
"learning_rate": 4.7808685201523417e-05, + "loss": 1.7278, + "step": 17221 + }, + { + "epoch": 5.286065070595457, + "grad_norm": 0.2103174477815628, + "learning_rate": 4.78037194283815e-05, + "loss": 1.7667, + "step": 17222 + }, + { + "epoch": 5.286372007366483, + "grad_norm": 0.24339279532432556, + "learning_rate": 4.7798753676944536e-05, + "loss": 1.7828, + "step": 17223 + }, + { + "epoch": 5.286678944137508, + "grad_norm": 0.2343035340309143, + "learning_rate": 4.779378794726156e-05, + "loss": 1.7277, + "step": 17224 + }, + { + "epoch": 5.286985880908533, + "grad_norm": 0.22456331551074982, + "learning_rate": 4.778882223938167e-05, + "loss": 1.756, + "step": 17225 + }, + { + "epoch": 5.287292817679558, + "grad_norm": 0.2211158126592636, + "learning_rate": 4.778385655335392e-05, + "loss": 1.7733, + "step": 17226 + }, + { + "epoch": 5.287599754450583, + "grad_norm": 0.2731948792934418, + "learning_rate": 4.777889088922743e-05, + "loss": 1.787, + "step": 17227 + }, + { + "epoch": 5.287906691221608, + "grad_norm": 0.19578024744987488, + "learning_rate": 4.7773925247051215e-05, + "loss": 1.7474, + "step": 17228 + }, + { + "epoch": 5.288213627992634, + "grad_norm": 0.277332067489624, + "learning_rate": 4.77689596268744e-05, + "loss": 1.7432, + "step": 17229 + }, + { + "epoch": 5.288520564763659, + "grad_norm": 0.2979765832424164, + "learning_rate": 4.7763994028746003e-05, + "loss": 1.8198, + "step": 17230 + }, + { + "epoch": 5.2888275015346835, + "grad_norm": 0.23176288604736328, + "learning_rate": 4.775902845271515e-05, + "loss": 1.7317, + "step": 17231 + }, + { + "epoch": 5.289134438305709, + "grad_norm": 0.35821911692619324, + "learning_rate": 4.7754062898830876e-05, + "loss": 1.7287, + "step": 17232 + }, + { + "epoch": 5.289441375076734, + "grad_norm": 0.2881525158882141, + "learning_rate": 4.7749097367142296e-05, + "loss": 1.7391, + "step": 17233 + }, + { + "epoch": 5.2897483118477595, + "grad_norm": 0.22021767497062683, + "learning_rate": 4.774413185769842e-05, 
+ "loss": 1.7462, + "step": 17234 + }, + { + "epoch": 5.290055248618785, + "grad_norm": 0.3286842703819275, + "learning_rate": 4.7739166370548385e-05, + "loss": 1.7749, + "step": 17235 + }, + { + "epoch": 5.290362185389809, + "grad_norm": 0.3298519253730774, + "learning_rate": 4.773420090574122e-05, + "loss": 1.7548, + "step": 17236 + }, + { + "epoch": 5.290669122160835, + "grad_norm": 0.20910575985908508, + "learning_rate": 4.7729235463326005e-05, + "loss": 1.7308, + "step": 17237 + }, + { + "epoch": 5.29097605893186, + "grad_norm": 0.3324633240699768, + "learning_rate": 4.7724270043351835e-05, + "loss": 1.7328, + "step": 17238 + }, + { + "epoch": 5.291282995702885, + "grad_norm": 0.21235628426074982, + "learning_rate": 4.771930464586774e-05, + "loss": 1.7186, + "step": 17239 + }, + { + "epoch": 5.291589932473911, + "grad_norm": 0.2971087694168091, + "learning_rate": 4.771433927092283e-05, + "loss": 1.7947, + "step": 17240 + }, + { + "epoch": 5.291896869244935, + "grad_norm": 0.3637695908546448, + "learning_rate": 4.770937391856614e-05, + "loss": 1.7753, + "step": 17241 + }, + { + "epoch": 5.29220380601596, + "grad_norm": 0.2503713369369507, + "learning_rate": 4.770440858884678e-05, + "loss": 1.684, + "step": 17242 + }, + { + "epoch": 5.292510742786986, + "grad_norm": 0.25510790944099426, + "learning_rate": 4.7699443281813774e-05, + "loss": 1.7517, + "step": 17243 + }, + { + "epoch": 5.292817679558011, + "grad_norm": 0.3189590871334076, + "learning_rate": 4.7694477997516244e-05, + "loss": 1.7488, + "step": 17244 + }, + { + "epoch": 5.293124616329036, + "grad_norm": 0.2807229161262512, + "learning_rate": 4.7689512736003215e-05, + "loss": 1.7962, + "step": 17245 + }, + { + "epoch": 5.293431553100062, + "grad_norm": 0.2166406810283661, + "learning_rate": 4.76845474973238e-05, + "loss": 1.7423, + "step": 17246 + }, + { + "epoch": 5.293738489871086, + "grad_norm": 0.29000815749168396, + "learning_rate": 4.767958228152702e-05, + "loss": 1.7508, + "step": 17247 + }, + { 
+ "epoch": 5.2940454266421115, + "grad_norm": 0.19301612675189972, + "learning_rate": 4.767461708866198e-05, + "loss": 1.7223, + "step": 17248 + }, + { + "epoch": 5.294352363413137, + "grad_norm": 0.2828899323940277, + "learning_rate": 4.766965191877772e-05, + "loss": 1.8139, + "step": 17249 + }, + { + "epoch": 5.294659300184162, + "grad_norm": 0.32610374689102173, + "learning_rate": 4.766468677192335e-05, + "loss": 1.7744, + "step": 17250 + }, + { + "epoch": 5.2949662369551875, + "grad_norm": 0.2175719439983368, + "learning_rate": 4.7659721648147895e-05, + "loss": 1.7345, + "step": 17251 + }, + { + "epoch": 5.295273173726212, + "grad_norm": 0.24777816236019135, + "learning_rate": 4.7654756547500457e-05, + "loss": 1.7382, + "step": 17252 + }, + { + "epoch": 5.295580110497237, + "grad_norm": 0.25927749276161194, + "learning_rate": 4.764979147003008e-05, + "loss": 1.7625, + "step": 17253 + }, + { + "epoch": 5.295887047268263, + "grad_norm": 0.2271798849105835, + "learning_rate": 4.7644826415785834e-05, + "loss": 1.6928, + "step": 17254 + }, + { + "epoch": 5.296193984039288, + "grad_norm": 0.30804958939552307, + "learning_rate": 4.763986138481682e-05, + "loss": 1.743, + "step": 17255 + }, + { + "epoch": 5.296500920810313, + "grad_norm": 0.2247130572795868, + "learning_rate": 4.763489637717205e-05, + "loss": 1.7593, + "step": 17256 + }, + { + "epoch": 5.296807857581339, + "grad_norm": 0.22203052043914795, + "learning_rate": 4.7629931392900645e-05, + "loss": 1.6923, + "step": 17257 + }, + { + "epoch": 5.297114794352363, + "grad_norm": 0.23044714331626892, + "learning_rate": 4.7624966432051624e-05, + "loss": 1.7676, + "step": 17258 + }, + { + "epoch": 5.297421731123388, + "grad_norm": 0.2824070155620575, + "learning_rate": 4.7620001494674096e-05, + "loss": 1.8272, + "step": 17259 + }, + { + "epoch": 5.297728667894414, + "grad_norm": 0.27077800035476685, + "learning_rate": 4.761503658081709e-05, + "loss": 1.8106, + "step": 17260 + }, + { + "epoch": 5.298035604665439, + 
"grad_norm": 0.2333833873271942, + "learning_rate": 4.7610071690529706e-05, + "loss": 1.6841, + "step": 17261 + }, + { + "epoch": 5.298342541436464, + "grad_norm": 0.2542032301425934, + "learning_rate": 4.760510682386098e-05, + "loss": 1.7656, + "step": 17262 + }, + { + "epoch": 5.298649478207489, + "grad_norm": 0.30680081248283386, + "learning_rate": 4.760014198086002e-05, + "loss": 1.7443, + "step": 17263 + }, + { + "epoch": 5.298956414978514, + "grad_norm": 0.21580225229263306, + "learning_rate": 4.759517716157583e-05, + "loss": 1.7907, + "step": 17264 + }, + { + "epoch": 5.2992633517495396, + "grad_norm": 0.2644323408603668, + "learning_rate": 4.7590212366057516e-05, + "loss": 1.6835, + "step": 17265 + }, + { + "epoch": 5.299570288520565, + "grad_norm": 0.23600110411643982, + "learning_rate": 4.758524759435414e-05, + "loss": 1.7481, + "step": 17266 + }, + { + "epoch": 5.29987722529159, + "grad_norm": 0.23825959861278534, + "learning_rate": 4.758028284651477e-05, + "loss": 1.7267, + "step": 17267 + }, + { + "epoch": 5.300184162062616, + "grad_norm": 0.2659476101398468, + "learning_rate": 4.757531812258845e-05, + "loss": 1.7303, + "step": 17268 + }, + { + "epoch": 5.30049109883364, + "grad_norm": 0.30770114064216614, + "learning_rate": 4.757035342262428e-05, + "loss": 1.7636, + "step": 17269 + }, + { + "epoch": 5.300798035604665, + "grad_norm": 0.27921241521835327, + "learning_rate": 4.756538874667129e-05, + "loss": 1.7736, + "step": 17270 + }, + { + "epoch": 5.301104972375691, + "grad_norm": 0.2518016993999481, + "learning_rate": 4.756042409477855e-05, + "loss": 1.7942, + "step": 17271 + }, + { + "epoch": 5.301411909146716, + "grad_norm": 0.2678029537200928, + "learning_rate": 4.755545946699514e-05, + "loss": 1.7179, + "step": 17272 + }, + { + "epoch": 5.301718845917741, + "grad_norm": 0.3082284927368164, + "learning_rate": 4.7550494863370094e-05, + "loss": 1.7282, + "step": 17273 + }, + { + "epoch": 5.302025782688766, + "grad_norm": 0.23269952833652496, + 
"learning_rate": 4.754553028395251e-05, + "loss": 1.755, + "step": 17274 + }, + { + "epoch": 5.302332719459791, + "grad_norm": 0.2273751199245453, + "learning_rate": 4.754056572879142e-05, + "loss": 1.7661, + "step": 17275 + }, + { + "epoch": 5.3026396562308165, + "grad_norm": 0.2175082415342331, + "learning_rate": 4.7535601197935915e-05, + "loss": 1.7034, + "step": 17276 + }, + { + "epoch": 5.302946593001842, + "grad_norm": 0.20551301538944244, + "learning_rate": 4.753063669143503e-05, + "loss": 1.7329, + "step": 17277 + }, + { + "epoch": 5.303253529772867, + "grad_norm": 0.2350638061761856, + "learning_rate": 4.752567220933785e-05, + "loss": 1.8361, + "step": 17278 + }, + { + "epoch": 5.303560466543892, + "grad_norm": 0.20268140733242035, + "learning_rate": 4.752070775169342e-05, + "loss": 1.6736, + "step": 17279 + }, + { + "epoch": 5.303867403314917, + "grad_norm": 0.1891544908285141, + "learning_rate": 4.7515743318550823e-05, + "loss": 1.7241, + "step": 17280 + }, + { + "epoch": 5.304174340085942, + "grad_norm": 0.22900860011577606, + "learning_rate": 4.751077890995909e-05, + "loss": 1.7321, + "step": 17281 + }, + { + "epoch": 5.304481276856968, + "grad_norm": 0.25827866792678833, + "learning_rate": 4.7505814525967304e-05, + "loss": 1.8021, + "step": 17282 + }, + { + "epoch": 5.304788213627993, + "grad_norm": 0.22459273040294647, + "learning_rate": 4.7500850166624514e-05, + "loss": 1.7845, + "step": 17283 + }, + { + "epoch": 5.305095150399017, + "grad_norm": 0.23737964034080505, + "learning_rate": 4.7495885831979816e-05, + "loss": 1.7274, + "step": 17284 + }, + { + "epoch": 5.305402087170043, + "grad_norm": 0.2267502397298813, + "learning_rate": 4.749092152208221e-05, + "loss": 1.7747, + "step": 17285 + }, + { + "epoch": 5.305709023941068, + "grad_norm": 0.31811007857322693, + "learning_rate": 4.748595723698081e-05, + "loss": 1.7852, + "step": 17286 + }, + { + "epoch": 5.306015960712093, + "grad_norm": 0.42865583300590515, + "learning_rate": 
4.7480992976724655e-05, + "loss": 1.7711, + "step": 17287 + }, + { + "epoch": 5.306322897483119, + "grad_norm": 0.3211027979850769, + "learning_rate": 4.747602874136278e-05, + "loss": 1.7813, + "step": 17288 + }, + { + "epoch": 5.306629834254144, + "grad_norm": 0.22552837431430817, + "learning_rate": 4.7471064530944295e-05, + "loss": 1.7407, + "step": 17289 + }, + { + "epoch": 5.3069367710251685, + "grad_norm": 0.3119906485080719, + "learning_rate": 4.746610034551821e-05, + "loss": 1.7255, + "step": 17290 + }, + { + "epoch": 5.307243707796194, + "grad_norm": 0.26405754685401917, + "learning_rate": 4.7461136185133623e-05, + "loss": 1.6945, + "step": 17291 + }, + { + "epoch": 5.307550644567219, + "grad_norm": 0.21759621798992157, + "learning_rate": 4.7456172049839566e-05, + "loss": 1.7319, + "step": 17292 + }, + { + "epoch": 5.3078575813382445, + "grad_norm": 0.26193925738334656, + "learning_rate": 4.745120793968511e-05, + "loss": 1.7508, + "step": 17293 + }, + { + "epoch": 5.30816451810927, + "grad_norm": 0.2549780011177063, + "learning_rate": 4.74462438547193e-05, + "loss": 1.7153, + "step": 17294 + }, + { + "epoch": 5.308471454880294, + "grad_norm": 0.21164020895957947, + "learning_rate": 4.7441279794991235e-05, + "loss": 1.7315, + "step": 17295 + }, + { + "epoch": 5.30877839165132, + "grad_norm": 0.20548345148563385, + "learning_rate": 4.7436315760549914e-05, + "loss": 1.68, + "step": 17296 + }, + { + "epoch": 5.309085328422345, + "grad_norm": 0.23997166752815247, + "learning_rate": 4.7431351751444446e-05, + "loss": 1.8528, + "step": 17297 + }, + { + "epoch": 5.30939226519337, + "grad_norm": 0.2639109194278717, + "learning_rate": 4.7426387767723845e-05, + "loss": 1.8041, + "step": 17298 + }, + { + "epoch": 5.309699201964396, + "grad_norm": 0.2285986840724945, + "learning_rate": 4.7421423809437196e-05, + "loss": 1.8188, + "step": 17299 + }, + { + "epoch": 5.310006138735421, + "grad_norm": 0.22183369100093842, + "learning_rate": 4.741645987663355e-05, + "loss": 
1.7581, + "step": 17300 + }, + { + "epoch": 5.310313075506445, + "grad_norm": 0.22716040909290314, + "learning_rate": 4.741149596936197e-05, + "loss": 1.7438, + "step": 17301 + }, + { + "epoch": 5.310620012277471, + "grad_norm": 0.24641327559947968, + "learning_rate": 4.740653208767148e-05, + "loss": 1.761, + "step": 17302 + }, + { + "epoch": 5.310926949048496, + "grad_norm": 0.28470689058303833, + "learning_rate": 4.7401568231611194e-05, + "loss": 1.7512, + "step": 17303 + }, + { + "epoch": 5.311233885819521, + "grad_norm": 0.23279942572116852, + "learning_rate": 4.739660440123012e-05, + "loss": 1.7797, + "step": 17304 + }, + { + "epoch": 5.311540822590547, + "grad_norm": 0.26397696137428284, + "learning_rate": 4.739164059657731e-05, + "loss": 1.748, + "step": 17305 + }, + { + "epoch": 5.311847759361571, + "grad_norm": 0.25072020292282104, + "learning_rate": 4.7386676817701856e-05, + "loss": 1.7571, + "step": 17306 + }, + { + "epoch": 5.3121546961325965, + "grad_norm": 0.20815810561180115, + "learning_rate": 4.7381713064652774e-05, + "loss": 1.7566, + "step": 17307 + }, + { + "epoch": 5.312461632903622, + "grad_norm": 0.23104289174079895, + "learning_rate": 4.7376749337479174e-05, + "loss": 1.7308, + "step": 17308 + }, + { + "epoch": 5.312768569674647, + "grad_norm": 0.21978867053985596, + "learning_rate": 4.737178563623004e-05, + "loss": 1.7997, + "step": 17309 + }, + { + "epoch": 5.3130755064456725, + "grad_norm": 0.34588614106178284, + "learning_rate": 4.736682196095447e-05, + "loss": 1.8414, + "step": 17310 + }, + { + "epoch": 5.313382443216697, + "grad_norm": 0.3475342094898224, + "learning_rate": 4.73618583117015e-05, + "loss": 1.7823, + "step": 17311 + }, + { + "epoch": 5.313689379987722, + "grad_norm": 0.1965305358171463, + "learning_rate": 4.7356894688520215e-05, + "loss": 1.7597, + "step": 17312 + }, + { + "epoch": 5.313996316758748, + "grad_norm": 0.3035048246383667, + "learning_rate": 4.7351931091459624e-05, + "loss": 1.6803, + "step": 17313 + }, + { + 
"epoch": 5.314303253529773, + "grad_norm": 0.27722910046577454, + "learning_rate": 4.7346967520568827e-05, + "loss": 1.7472, + "step": 17314 + }, + { + "epoch": 5.314610190300798, + "grad_norm": 0.21481415629386902, + "learning_rate": 4.734200397589682e-05, + "loss": 1.7319, + "step": 17315 + }, + { + "epoch": 5.314917127071823, + "grad_norm": 0.2570357918739319, + "learning_rate": 4.733704045749271e-05, + "loss": 1.7392, + "step": 17316 + }, + { + "epoch": 5.315224063842848, + "grad_norm": 0.2404400259256363, + "learning_rate": 4.733207696540551e-05, + "loss": 1.7231, + "step": 17317 + }, + { + "epoch": 5.315531000613873, + "grad_norm": 0.222911074757576, + "learning_rate": 4.732711349968432e-05, + "loss": 1.7584, + "step": 17318 + }, + { + "epoch": 5.315837937384899, + "grad_norm": 0.22908064723014832, + "learning_rate": 4.732215006037813e-05, + "loss": 1.7242, + "step": 17319 + }, + { + "epoch": 5.316144874155924, + "grad_norm": 0.2432398796081543, + "learning_rate": 4.7317186647536044e-05, + "loss": 1.7056, + "step": 17320 + }, + { + "epoch": 5.316451810926949, + "grad_norm": 0.1994420737028122, + "learning_rate": 4.7312223261207086e-05, + "loss": 1.6667, + "step": 17321 + }, + { + "epoch": 5.316758747697974, + "grad_norm": 0.22314350306987762, + "learning_rate": 4.73072599014403e-05, + "loss": 1.7945, + "step": 17322 + }, + { + "epoch": 5.317065684468999, + "grad_norm": 0.2309068888425827, + "learning_rate": 4.730229656828477e-05, + "loss": 1.7099, + "step": 17323 + }, + { + "epoch": 5.3173726212400245, + "grad_norm": 0.22388015687465668, + "learning_rate": 4.729733326178951e-05, + "loss": 1.7053, + "step": 17324 + }, + { + "epoch": 5.31767955801105, + "grad_norm": 0.20203040540218353, + "learning_rate": 4.72923699820036e-05, + "loss": 1.6992, + "step": 17325 + }, + { + "epoch": 5.317986494782075, + "grad_norm": 0.24416297674179077, + "learning_rate": 4.728740672897606e-05, + "loss": 1.7455, + "step": 17326 + }, + { + "epoch": 5.3182934315531, + "grad_norm": 
0.2501862049102783, + "learning_rate": 4.728244350275597e-05, + "loss": 1.7609, + "step": 17327 + }, + { + "epoch": 5.318600368324125, + "grad_norm": 0.21482665836811066, + "learning_rate": 4.727748030339235e-05, + "loss": 1.7614, + "step": 17328 + }, + { + "epoch": 5.31890730509515, + "grad_norm": 0.2241419404745102, + "learning_rate": 4.727251713093429e-05, + "loss": 1.736, + "step": 17329 + }, + { + "epoch": 5.319214241866176, + "grad_norm": 0.1757260262966156, + "learning_rate": 4.726755398543079e-05, + "loss": 1.6646, + "step": 17330 + }, + { + "epoch": 5.319521178637201, + "grad_norm": 0.18697243928909302, + "learning_rate": 4.726259086693095e-05, + "loss": 1.7512, + "step": 17331 + }, + { + "epoch": 5.319828115408226, + "grad_norm": 0.22584228217601776, + "learning_rate": 4.725762777548376e-05, + "loss": 1.7439, + "step": 17332 + }, + { + "epoch": 5.320135052179251, + "grad_norm": 0.18673470616340637, + "learning_rate": 4.725266471113832e-05, + "loss": 1.7007, + "step": 17333 + }, + { + "epoch": 5.320441988950276, + "grad_norm": 0.23030288517475128, + "learning_rate": 4.7247701673943656e-05, + "loss": 1.8021, + "step": 17334 + }, + { + "epoch": 5.320748925721301, + "grad_norm": 0.19333480298519135, + "learning_rate": 4.7242738663948813e-05, + "loss": 1.6659, + "step": 17335 + }, + { + "epoch": 5.321055862492327, + "grad_norm": 0.278097003698349, + "learning_rate": 4.723777568120284e-05, + "loss": 1.7302, + "step": 17336 + }, + { + "epoch": 5.321362799263352, + "grad_norm": 0.2146742343902588, + "learning_rate": 4.72328127257548e-05, + "loss": 1.7644, + "step": 17337 + }, + { + "epoch": 5.3216697360343765, + "grad_norm": 0.25582969188690186, + "learning_rate": 4.722784979765372e-05, + "loss": 1.7872, + "step": 17338 + }, + { + "epoch": 5.321976672805402, + "grad_norm": 0.20411577820777893, + "learning_rate": 4.722288689694864e-05, + "loss": 1.7167, + "step": 17339 + }, + { + "epoch": 5.322283609576427, + "grad_norm": 0.20894703269004822, + "learning_rate": 
4.7217924023688645e-05, + "loss": 1.7526, + "step": 17340 + }, + { + "epoch": 5.3225905463474525, + "grad_norm": 0.20197831094264984, + "learning_rate": 4.721296117792273e-05, + "loss": 1.711, + "step": 17341 + }, + { + "epoch": 5.322897483118478, + "grad_norm": 0.20490549504756927, + "learning_rate": 4.720799835969999e-05, + "loss": 1.7303, + "step": 17342 + }, + { + "epoch": 5.323204419889503, + "grad_norm": 0.20666229724884033, + "learning_rate": 4.720303556906943e-05, + "loss": 1.6738, + "step": 17343 + }, + { + "epoch": 5.323511356660528, + "grad_norm": 0.21899856626987457, + "learning_rate": 4.719807280608011e-05, + "loss": 1.7632, + "step": 17344 + }, + { + "epoch": 5.323818293431553, + "grad_norm": 0.2310410887002945, + "learning_rate": 4.719311007078108e-05, + "loss": 1.7568, + "step": 17345 + }, + { + "epoch": 5.324125230202578, + "grad_norm": 0.20057427883148193, + "learning_rate": 4.7188147363221394e-05, + "loss": 1.6716, + "step": 17346 + }, + { + "epoch": 5.324432166973604, + "grad_norm": 0.21361050009727478, + "learning_rate": 4.718318468345006e-05, + "loss": 1.7224, + "step": 17347 + }, + { + "epoch": 5.324739103744629, + "grad_norm": 0.28389376401901245, + "learning_rate": 4.7178222031516173e-05, + "loss": 1.8519, + "step": 17348 + }, + { + "epoch": 5.3250460405156534, + "grad_norm": 0.2094416618347168, + "learning_rate": 4.717325940746872e-05, + "loss": 1.7763, + "step": 17349 + }, + { + "epoch": 5.325352977286679, + "grad_norm": 0.2263312190771103, + "learning_rate": 4.716829681135681e-05, + "loss": 1.7961, + "step": 17350 + }, + { + "epoch": 5.325659914057704, + "grad_norm": 0.2685631811618805, + "learning_rate": 4.7163334243229417e-05, + "loss": 1.7763, + "step": 17351 + }, + { + "epoch": 5.3259668508287294, + "grad_norm": 0.2029418647289276, + "learning_rate": 4.7158371703135636e-05, + "loss": 1.7662, + "step": 17352 + }, + { + "epoch": 5.326273787599755, + "grad_norm": 0.3109094798564911, + "learning_rate": 4.715340919112447e-05, + "loss": 
1.7064, + "step": 17353 + }, + { + "epoch": 5.326580724370779, + "grad_norm": 0.24679912626743317, + "learning_rate": 4.714844670724502e-05, + "loss": 1.6903, + "step": 17354 + }, + { + "epoch": 5.326887661141805, + "grad_norm": 0.2004890739917755, + "learning_rate": 4.714348425154627e-05, + "loss": 1.7242, + "step": 17355 + }, + { + "epoch": 5.32719459791283, + "grad_norm": 0.27442196011543274, + "learning_rate": 4.7138521824077284e-05, + "loss": 1.826, + "step": 17356 + }, + { + "epoch": 5.327501534683855, + "grad_norm": 0.19933666288852692, + "learning_rate": 4.713355942488711e-05, + "loss": 1.748, + "step": 17357 + }, + { + "epoch": 5.327808471454881, + "grad_norm": 0.2306378185749054, + "learning_rate": 4.712859705402476e-05, + "loss": 1.7426, + "step": 17358 + }, + { + "epoch": 5.328115408225905, + "grad_norm": 0.22484014928340912, + "learning_rate": 4.7123634711539324e-05, + "loss": 1.7355, + "step": 17359 + }, + { + "epoch": 5.32842234499693, + "grad_norm": 0.2501749098300934, + "learning_rate": 4.711867239747979e-05, + "loss": 1.7502, + "step": 17360 + }, + { + "epoch": 5.328729281767956, + "grad_norm": 0.1940663903951645, + "learning_rate": 4.711371011189525e-05, + "loss": 1.7423, + "step": 17361 + }, + { + "epoch": 5.329036218538981, + "grad_norm": 0.28115448355674744, + "learning_rate": 4.71087478548347e-05, + "loss": 1.7134, + "step": 17362 + }, + { + "epoch": 5.329343155310006, + "grad_norm": 0.29717928171157837, + "learning_rate": 4.71037856263472e-05, + "loss": 1.8145, + "step": 17363 + }, + { + "epoch": 5.329650092081032, + "grad_norm": 0.24278375506401062, + "learning_rate": 4.709882342648179e-05, + "loss": 1.689, + "step": 17364 + }, + { + "epoch": 5.329957028852056, + "grad_norm": 0.26382890343666077, + "learning_rate": 4.709386125528751e-05, + "loss": 1.801, + "step": 17365 + }, + { + "epoch": 5.3302639656230815, + "grad_norm": 0.237087219953537, + "learning_rate": 4.708889911281339e-05, + "loss": 1.7019, + "step": 17366 + }, + { + "epoch": 
5.330570902394107, + "grad_norm": 0.21994253993034363, + "learning_rate": 4.7083936999108494e-05, + "loss": 1.707, + "step": 17367 + }, + { + "epoch": 5.330877839165132, + "grad_norm": 0.3028903901576996, + "learning_rate": 4.707897491422182e-05, + "loss": 1.7992, + "step": 17368 + }, + { + "epoch": 5.3311847759361575, + "grad_norm": 0.24991434812545776, + "learning_rate": 4.7074012858202435e-05, + "loss": 1.7894, + "step": 17369 + }, + { + "epoch": 5.331491712707182, + "grad_norm": 0.20631250739097595, + "learning_rate": 4.706905083109936e-05, + "loss": 1.6816, + "step": 17370 + }, + { + "epoch": 5.331798649478207, + "grad_norm": 0.23300573229789734, + "learning_rate": 4.7064088832961666e-05, + "loss": 1.7101, + "step": 17371 + }, + { + "epoch": 5.332105586249233, + "grad_norm": 0.22331316769123077, + "learning_rate": 4.705912686383837e-05, + "loss": 1.861, + "step": 17372 + }, + { + "epoch": 5.332412523020258, + "grad_norm": 0.204593226313591, + "learning_rate": 4.7054164923778485e-05, + "loss": 1.7062, + "step": 17373 + }, + { + "epoch": 5.332719459791283, + "grad_norm": 0.22207681834697723, + "learning_rate": 4.704920301283107e-05, + "loss": 1.7546, + "step": 17374 + }, + { + "epoch": 5.333026396562309, + "grad_norm": 0.2508530020713806, + "learning_rate": 4.7044241131045157e-05, + "loss": 1.7881, + "step": 17375 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.26084616780281067, + "learning_rate": 4.7039279278469804e-05, + "loss": 1.7292, + "step": 17376 + }, + { + "epoch": 5.333640270104358, + "grad_norm": 0.2122940719127655, + "learning_rate": 4.7034317455154006e-05, + "loss": 1.7493, + "step": 17377 + }, + { + "epoch": 5.333947206875384, + "grad_norm": 0.2627449333667755, + "learning_rate": 4.702935566114685e-05, + "loss": 1.759, + "step": 17378 + }, + { + "epoch": 5.334254143646409, + "grad_norm": 0.20637977123260498, + "learning_rate": 4.702439389649732e-05, + "loss": 1.8043, + "step": 17379 + }, + { + "epoch": 5.334561080417434, + "grad_norm": 
0.28783395886421204, + "learning_rate": 4.701943216125447e-05, + "loss": 1.7256, + "step": 17380 + }, + { + "epoch": 5.334868017188459, + "grad_norm": 0.21130618453025818, + "learning_rate": 4.701447045546734e-05, + "loss": 1.7161, + "step": 17381 + }, + { + "epoch": 5.335174953959484, + "grad_norm": 0.2793416678905487, + "learning_rate": 4.7009508779184984e-05, + "loss": 1.7659, + "step": 17382 + }, + { + "epoch": 5.3354818907305095, + "grad_norm": 0.3088020384311676, + "learning_rate": 4.700454713245639e-05, + "loss": 1.6877, + "step": 17383 + }, + { + "epoch": 5.335788827501535, + "grad_norm": 0.19697681069374084, + "learning_rate": 4.6999585515330646e-05, + "loss": 1.7111, + "step": 17384 + }, + { + "epoch": 5.33609576427256, + "grad_norm": 0.29234182834625244, + "learning_rate": 4.699462392785673e-05, + "loss": 1.7136, + "step": 17385 + }, + { + "epoch": 5.336402701043585, + "grad_norm": 0.2593611776828766, + "learning_rate": 4.698966237008371e-05, + "loss": 1.7531, + "step": 17386 + }, + { + "epoch": 5.33670963781461, + "grad_norm": 0.20024444162845612, + "learning_rate": 4.6984700842060604e-05, + "loss": 1.7035, + "step": 17387 + }, + { + "epoch": 5.337016574585635, + "grad_norm": 0.2929787039756775, + "learning_rate": 4.697973934383647e-05, + "loss": 1.7212, + "step": 17388 + }, + { + "epoch": 5.337323511356661, + "grad_norm": 0.2425665408372879, + "learning_rate": 4.697477787546032e-05, + "loss": 1.7191, + "step": 17389 + }, + { + "epoch": 5.337630448127686, + "grad_norm": 0.19175556302070618, + "learning_rate": 4.6969816436981176e-05, + "loss": 1.7291, + "step": 17390 + }, + { + "epoch": 5.337937384898711, + "grad_norm": 0.2602384686470032, + "learning_rate": 4.696485502844809e-05, + "loss": 1.7035, + "step": 17391 + }, + { + "epoch": 5.338244321669736, + "grad_norm": 0.19117408990859985, + "learning_rate": 4.695989364991006e-05, + "loss": 1.707, + "step": 17392 + }, + { + "epoch": 5.338551258440761, + "grad_norm": 0.31086108088493347, + "learning_rate": 
4.6954932301416174e-05, + "loss": 1.7397, + "step": 17393 + }, + { + "epoch": 5.338858195211786, + "grad_norm": 0.27402472496032715, + "learning_rate": 4.694997098301542e-05, + "loss": 1.7144, + "step": 17394 + }, + { + "epoch": 5.339165131982812, + "grad_norm": 0.20345155894756317, + "learning_rate": 4.694500969475685e-05, + "loss": 1.7492, + "step": 17395 + }, + { + "epoch": 5.339472068753837, + "grad_norm": 0.23786045610904694, + "learning_rate": 4.694004843668947e-05, + "loss": 1.7781, + "step": 17396 + }, + { + "epoch": 5.3397790055248615, + "grad_norm": 0.19747424125671387, + "learning_rate": 4.6935087208862335e-05, + "loss": 1.7353, + "step": 17397 + }, + { + "epoch": 5.340085942295887, + "grad_norm": 0.224543035030365, + "learning_rate": 4.693012601132445e-05, + "loss": 1.7229, + "step": 17398 + }, + { + "epoch": 5.340392879066912, + "grad_norm": 0.20840135216712952, + "learning_rate": 4.692516484412488e-05, + "loss": 1.7557, + "step": 17399 + }, + { + "epoch": 5.3406998158379375, + "grad_norm": 0.21019098162651062, + "learning_rate": 4.692020370731261e-05, + "loss": 1.7793, + "step": 17400 + }, + { + "epoch": 5.341006752608963, + "grad_norm": 0.20540091395378113, + "learning_rate": 4.691524260093672e-05, + "loss": 1.6925, + "step": 17401 + }, + { + "epoch": 5.341313689379987, + "grad_norm": 0.2414131462574005, + "learning_rate": 4.691028152504619e-05, + "loss": 1.7706, + "step": 17402 + }, + { + "epoch": 5.341620626151013, + "grad_norm": 0.19627155363559723, + "learning_rate": 4.6905320479690073e-05, + "loss": 1.6356, + "step": 17403 + }, + { + "epoch": 5.341927562922038, + "grad_norm": 0.20978952944278717, + "learning_rate": 4.690035946491741e-05, + "loss": 1.7487, + "step": 17404 + }, + { + "epoch": 5.342234499693063, + "grad_norm": 0.2524566054344177, + "learning_rate": 4.689539848077719e-05, + "loss": 1.7713, + "step": 17405 + }, + { + "epoch": 5.342541436464089, + "grad_norm": 0.1967654973268509, + "learning_rate": 4.689043752731847e-05, + "loss": 
1.7358, + "step": 17406 + }, + { + "epoch": 5.342848373235114, + "grad_norm": 0.2085377424955368, + "learning_rate": 4.688547660459026e-05, + "loss": 1.7104, + "step": 17407 + }, + { + "epoch": 5.343155310006138, + "grad_norm": 0.21294310688972473, + "learning_rate": 4.688051571264161e-05, + "loss": 1.7349, + "step": 17408 + }, + { + "epoch": 5.343462246777164, + "grad_norm": 0.23702891170978546, + "learning_rate": 4.6875554851521514e-05, + "loss": 1.8048, + "step": 17409 + }, + { + "epoch": 5.343769183548189, + "grad_norm": 0.2513964772224426, + "learning_rate": 4.687059402127904e-05, + "loss": 1.6669, + "step": 17410 + }, + { + "epoch": 5.344076120319214, + "grad_norm": 0.259540855884552, + "learning_rate": 4.6865633221963165e-05, + "loss": 1.7763, + "step": 17411 + }, + { + "epoch": 5.34438305709024, + "grad_norm": 0.28354617953300476, + "learning_rate": 4.6860672453622966e-05, + "loss": 1.7912, + "step": 17412 + }, + { + "epoch": 5.344689993861264, + "grad_norm": 0.2503860592842102, + "learning_rate": 4.685571171630742e-05, + "loss": 1.6817, + "step": 17413 + }, + { + "epoch": 5.3449969306322895, + "grad_norm": 0.2317555695772171, + "learning_rate": 4.685075101006558e-05, + "loss": 1.7652, + "step": 17414 + }, + { + "epoch": 5.345303867403315, + "grad_norm": 0.23333363234996796, + "learning_rate": 4.684579033494646e-05, + "loss": 1.722, + "step": 17415 + }, + { + "epoch": 5.34561080417434, + "grad_norm": 0.22507359087467194, + "learning_rate": 4.6840829690999104e-05, + "loss": 1.7522, + "step": 17416 + }, + { + "epoch": 5.3459177409453655, + "grad_norm": 0.2298288643360138, + "learning_rate": 4.6835869078272504e-05, + "loss": 1.7425, + "step": 17417 + }, + { + "epoch": 5.346224677716391, + "grad_norm": 0.2829224765300751, + "learning_rate": 4.683090849681572e-05, + "loss": 1.7798, + "step": 17418 + }, + { + "epoch": 5.346531614487415, + "grad_norm": 0.18153807520866394, + "learning_rate": 4.682594794667773e-05, + "loss": 1.6846, + "step": 17419 + }, + { + 
"epoch": 5.346838551258441, + "grad_norm": 0.24153028428554535, + "learning_rate": 4.6820987427907596e-05, + "loss": 1.7474, + "step": 17420 + }, + { + "epoch": 5.347145488029466, + "grad_norm": 0.2529772222042084, + "learning_rate": 4.681602694055434e-05, + "loss": 1.7465, + "step": 17421 + }, + { + "epoch": 5.347452424800491, + "grad_norm": 0.20414131879806519, + "learning_rate": 4.681106648466696e-05, + "loss": 1.7704, + "step": 17422 + }, + { + "epoch": 5.347759361571517, + "grad_norm": 0.27280452847480774, + "learning_rate": 4.68061060602945e-05, + "loss": 1.791, + "step": 17423 + }, + { + "epoch": 5.348066298342541, + "grad_norm": 0.20767468214035034, + "learning_rate": 4.680114566748595e-05, + "loss": 1.7744, + "step": 17424 + }, + { + "epoch": 5.348373235113566, + "grad_norm": 0.2661697566509247, + "learning_rate": 4.679618530629036e-05, + "loss": 1.7999, + "step": 17425 + }, + { + "epoch": 5.348680171884592, + "grad_norm": 0.23666872084140778, + "learning_rate": 4.679122497675674e-05, + "loss": 1.7204, + "step": 17426 + }, + { + "epoch": 5.348987108655617, + "grad_norm": 0.2688015401363373, + "learning_rate": 4.678626467893414e-05, + "loss": 1.7619, + "step": 17427 + }, + { + "epoch": 5.349294045426642, + "grad_norm": 0.23924420773983002, + "learning_rate": 4.678130441287153e-05, + "loss": 1.7754, + "step": 17428 + }, + { + "epoch": 5.349600982197667, + "grad_norm": 0.25724148750305176, + "learning_rate": 4.677634417861798e-05, + "loss": 1.761, + "step": 17429 + }, + { + "epoch": 5.349907918968692, + "grad_norm": 0.2633780241012573, + "learning_rate": 4.6771383976222464e-05, + "loss": 1.8705, + "step": 17430 + }, + { + "epoch": 5.350214855739718, + "grad_norm": 0.24774575233459473, + "learning_rate": 4.6766423805734036e-05, + "loss": 1.7127, + "step": 17431 + }, + { + "epoch": 5.350521792510743, + "grad_norm": 0.29887545108795166, + "learning_rate": 4.6761463667201695e-05, + "loss": 1.7651, + "step": 17432 + }, + { + "epoch": 5.350828729281768, + 
"grad_norm": 0.2231605499982834, + "learning_rate": 4.6756503560674486e-05, + "loss": 1.7636, + "step": 17433 + }, + { + "epoch": 5.351135666052793, + "grad_norm": 0.27977073192596436, + "learning_rate": 4.675154348620139e-05, + "loss": 1.7108, + "step": 17434 + }, + { + "epoch": 5.351442602823818, + "grad_norm": 0.26866039633750916, + "learning_rate": 4.674658344383146e-05, + "loss": 1.7593, + "step": 17435 + }, + { + "epoch": 5.351749539594843, + "grad_norm": 0.2154620885848999, + "learning_rate": 4.6741623433613685e-05, + "loss": 1.7536, + "step": 17436 + }, + { + "epoch": 5.352056476365869, + "grad_norm": 0.276656836271286, + "learning_rate": 4.673666345559711e-05, + "loss": 1.803, + "step": 17437 + }, + { + "epoch": 5.352363413136894, + "grad_norm": 0.22247640788555145, + "learning_rate": 4.6731703509830744e-05, + "loss": 1.7273, + "step": 17438 + }, + { + "epoch": 5.352670349907919, + "grad_norm": 0.2399090677499771, + "learning_rate": 4.6726743596363574e-05, + "loss": 1.7708, + "step": 17439 + }, + { + "epoch": 5.352977286678944, + "grad_norm": 0.2550101578235626, + "learning_rate": 4.6721783715244674e-05, + "loss": 1.7016, + "step": 17440 + }, + { + "epoch": 5.353284223449969, + "grad_norm": 0.19929546117782593, + "learning_rate": 4.6716823866523e-05, + "loss": 1.7417, + "step": 17441 + }, + { + "epoch": 5.3535911602209945, + "grad_norm": 0.2496672421693802, + "learning_rate": 4.671186405024761e-05, + "loss": 1.72, + "step": 17442 + }, + { + "epoch": 5.35389809699202, + "grad_norm": 0.19827665388584137, + "learning_rate": 4.67069042664675e-05, + "loss": 1.7515, + "step": 17443 + }, + { + "epoch": 5.354205033763045, + "grad_norm": 0.2528775930404663, + "learning_rate": 4.670194451523171e-05, + "loss": 1.7429, + "step": 17444 + }, + { + "epoch": 5.35451197053407, + "grad_norm": 0.19569729268550873, + "learning_rate": 4.6696984796589215e-05, + "loss": 1.7314, + "step": 17445 + }, + { + "epoch": 5.354818907305095, + "grad_norm": 0.21892370283603668, + 
"learning_rate": 4.669202511058908e-05, + "loss": 1.7331, + "step": 17446 + }, + { + "epoch": 5.35512584407612, + "grad_norm": 0.21609409153461456, + "learning_rate": 4.668706545728026e-05, + "loss": 1.7267, + "step": 17447 + }, + { + "epoch": 5.355432780847146, + "grad_norm": 0.2631370425224304, + "learning_rate": 4.668210583671182e-05, + "loss": 1.7513, + "step": 17448 + }, + { + "epoch": 5.355739717618171, + "grad_norm": 0.31327441334724426, + "learning_rate": 4.667714624893274e-05, + "loss": 1.7936, + "step": 17449 + }, + { + "epoch": 5.356046654389196, + "grad_norm": 0.21602430939674377, + "learning_rate": 4.667218669399207e-05, + "loss": 1.7387, + "step": 17450 + }, + { + "epoch": 5.356353591160221, + "grad_norm": 0.2895040214061737, + "learning_rate": 4.6667227171938784e-05, + "loss": 1.7293, + "step": 17451 + }, + { + "epoch": 5.356660527931246, + "grad_norm": 0.35150307416915894, + "learning_rate": 4.666226768282193e-05, + "loss": 1.8215, + "step": 17452 + }, + { + "epoch": 5.356967464702271, + "grad_norm": 0.19034281373023987, + "learning_rate": 4.665730822669048e-05, + "loss": 1.702, + "step": 17453 + }, + { + "epoch": 5.357274401473297, + "grad_norm": 0.25586241483688354, + "learning_rate": 4.6652348803593484e-05, + "loss": 1.7809, + "step": 17454 + }, + { + "epoch": 5.357581338244322, + "grad_norm": 0.23919305205345154, + "learning_rate": 4.6647389413579944e-05, + "loss": 1.7555, + "step": 17455 + }, + { + "epoch": 5.3578882750153465, + "grad_norm": 0.22707165777683258, + "learning_rate": 4.664243005669885e-05, + "loss": 1.7633, + "step": 17456 + }, + { + "epoch": 5.358195211786372, + "grad_norm": 0.20666839182376862, + "learning_rate": 4.663747073299925e-05, + "loss": 1.6522, + "step": 17457 + }, + { + "epoch": 5.358502148557397, + "grad_norm": 0.20557542145252228, + "learning_rate": 4.663251144253012e-05, + "loss": 1.73, + "step": 17458 + }, + { + "epoch": 5.3588090853284225, + "grad_norm": 0.22375571727752686, + "learning_rate": 
4.662755218534049e-05, + "loss": 1.7189, + "step": 17459 + }, + { + "epoch": 5.359116022099448, + "grad_norm": 0.261393278837204, + "learning_rate": 4.662259296147936e-05, + "loss": 1.6863, + "step": 17460 + }, + { + "epoch": 5.359422958870473, + "grad_norm": 0.2279379516839981, + "learning_rate": 4.6617633770995764e-05, + "loss": 1.7332, + "step": 17461 + }, + { + "epoch": 5.359729895641498, + "grad_norm": 0.2194606065750122, + "learning_rate": 4.6612674613938666e-05, + "loss": 1.7324, + "step": 17462 + }, + { + "epoch": 5.360036832412523, + "grad_norm": 0.27714410424232483, + "learning_rate": 4.660771549035713e-05, + "loss": 1.7386, + "step": 17463 + }, + { + "epoch": 5.360343769183548, + "grad_norm": 0.2118787169456482, + "learning_rate": 4.660275640030012e-05, + "loss": 1.7587, + "step": 17464 + }, + { + "epoch": 5.360650705954574, + "grad_norm": 0.2546979784965515, + "learning_rate": 4.6597797343816665e-05, + "loss": 1.7756, + "step": 17465 + }, + { + "epoch": 5.360957642725599, + "grad_norm": 0.194237619638443, + "learning_rate": 4.659283832095577e-05, + "loss": 1.7351, + "step": 17466 + }, + { + "epoch": 5.361264579496623, + "grad_norm": 0.23448583483695984, + "learning_rate": 4.658787933176646e-05, + "loss": 1.7051, + "step": 17467 + }, + { + "epoch": 5.361571516267649, + "grad_norm": 0.22796298563480377, + "learning_rate": 4.65829203762977e-05, + "loss": 1.7395, + "step": 17468 + }, + { + "epoch": 5.361878453038674, + "grad_norm": 0.22674904763698578, + "learning_rate": 4.657796145459855e-05, + "loss": 1.714, + "step": 17469 + }, + { + "epoch": 5.362185389809699, + "grad_norm": 0.2697311341762543, + "learning_rate": 4.657300256671797e-05, + "loss": 1.8271, + "step": 17470 + }, + { + "epoch": 5.362492326580725, + "grad_norm": 0.28040480613708496, + "learning_rate": 4.6568043712705004e-05, + "loss": 1.8192, + "step": 17471 + }, + { + "epoch": 5.362799263351749, + "grad_norm": 0.21100232005119324, + "learning_rate": 4.6563084892608644e-05, + "loss": 1.7285, + 
"step": 17472 + }, + { + "epoch": 5.3631062001227745, + "grad_norm": 0.23545897006988525, + "learning_rate": 4.655812610647787e-05, + "loss": 1.7302, + "step": 17473 + }, + { + "epoch": 5.3634131368938, + "grad_norm": 0.23278315365314484, + "learning_rate": 4.655316735436174e-05, + "loss": 1.7749, + "step": 17474 + }, + { + "epoch": 5.363720073664825, + "grad_norm": 0.333763986825943, + "learning_rate": 4.65482086363092e-05, + "loss": 1.7393, + "step": 17475 + }, + { + "epoch": 5.3640270104358505, + "grad_norm": 0.2743878662586212, + "learning_rate": 4.6543249952369306e-05, + "loss": 1.7274, + "step": 17476 + }, + { + "epoch": 5.364333947206875, + "grad_norm": 0.234402596950531, + "learning_rate": 4.6538291302591024e-05, + "loss": 1.7848, + "step": 17477 + }, + { + "epoch": 5.3646408839779, + "grad_norm": 0.29100897908210754, + "learning_rate": 4.65333326870234e-05, + "loss": 1.7698, + "step": 17478 + }, + { + "epoch": 5.364947820748926, + "grad_norm": 0.24178378283977509, + "learning_rate": 4.652837410571539e-05, + "loss": 1.8142, + "step": 17479 + }, + { + "epoch": 5.365254757519951, + "grad_norm": 0.4189155101776123, + "learning_rate": 4.652341555871605e-05, + "loss": 1.7435, + "step": 17480 + }, + { + "epoch": 5.365561694290976, + "grad_norm": 0.40106773376464844, + "learning_rate": 4.651845704607433e-05, + "loss": 1.837, + "step": 17481 + }, + { + "epoch": 5.365868631062002, + "grad_norm": 0.24127443134784698, + "learning_rate": 4.651349856783927e-05, + "loss": 1.7257, + "step": 17482 + }, + { + "epoch": 5.366175567833026, + "grad_norm": 0.412812739610672, + "learning_rate": 4.650854012405985e-05, + "loss": 1.762, + "step": 17483 + }, + { + "epoch": 5.366482504604051, + "grad_norm": 0.2636469602584839, + "learning_rate": 4.65035817147851e-05, + "loss": 1.7995, + "step": 17484 + }, + { + "epoch": 5.366789441375077, + "grad_norm": 0.282186895608902, + "learning_rate": 4.649862334006399e-05, + "loss": 1.75, + "step": 17485 + }, + { + "epoch": 5.367096378146102, + 
"grad_norm": 0.3280154764652252, + "learning_rate": 4.649366499994555e-05, + "loss": 1.7668, + "step": 17486 + }, + { + "epoch": 5.367403314917127, + "grad_norm": 0.24608035385608673, + "learning_rate": 4.648870669447875e-05, + "loss": 1.8332, + "step": 17487 + }, + { + "epoch": 5.367710251688152, + "grad_norm": 0.21927174925804138, + "learning_rate": 4.648374842371262e-05, + "loss": 1.7365, + "step": 17488 + }, + { + "epoch": 5.368017188459177, + "grad_norm": 0.2658425569534302, + "learning_rate": 4.6478790187696164e-05, + "loss": 1.841, + "step": 17489 + }, + { + "epoch": 5.3683241252302025, + "grad_norm": 0.2302858531475067, + "learning_rate": 4.647383198647834e-05, + "loss": 1.7882, + "step": 17490 + }, + { + "epoch": 5.368631062001228, + "grad_norm": 0.2562740743160248, + "learning_rate": 4.64688738201082e-05, + "loss": 1.7188, + "step": 17491 + }, + { + "epoch": 5.368937998772253, + "grad_norm": 0.28140220046043396, + "learning_rate": 4.646391568863469e-05, + "loss": 1.7482, + "step": 17492 + }, + { + "epoch": 5.3692449355432785, + "grad_norm": 0.21040008962154388, + "learning_rate": 4.6458957592106855e-05, + "loss": 1.7695, + "step": 17493 + }, + { + "epoch": 5.369551872314303, + "grad_norm": 0.25322291254997253, + "learning_rate": 4.645399953057367e-05, + "loss": 1.7127, + "step": 17494 + }, + { + "epoch": 5.369858809085328, + "grad_norm": 0.2239738404750824, + "learning_rate": 4.644904150408415e-05, + "loss": 1.7376, + "step": 17495 + }, + { + "epoch": 5.370165745856354, + "grad_norm": 0.21432901918888092, + "learning_rate": 4.644408351268727e-05, + "loss": 1.7156, + "step": 17496 + }, + { + "epoch": 5.370472682627379, + "grad_norm": 0.3057272732257843, + "learning_rate": 4.643912555643205e-05, + "loss": 1.7706, + "step": 17497 + }, + { + "epoch": 5.370779619398404, + "grad_norm": 0.2826928496360779, + "learning_rate": 4.643416763536748e-05, + "loss": 1.8298, + "step": 17498 + }, + { + "epoch": 5.371086556169429, + "grad_norm": 0.2395278513431549, + 
"learning_rate": 4.642920974954255e-05, + "loss": 1.7357, + "step": 17499 + }, + { + "epoch": 5.371393492940454, + "grad_norm": 0.21004743874073029, + "learning_rate": 4.642425189900626e-05, + "loss": 1.7263, + "step": 17500 + }, + { + "epoch": 5.371700429711479, + "grad_norm": 0.23981697857379913, + "learning_rate": 4.641929408380761e-05, + "loss": 1.7341, + "step": 17501 + }, + { + "epoch": 5.372007366482505, + "grad_norm": 0.1984727531671524, + "learning_rate": 4.641433630399559e-05, + "loss": 1.7133, + "step": 17502 + }, + { + "epoch": 5.37231430325353, + "grad_norm": 0.22153446078300476, + "learning_rate": 4.640937855961922e-05, + "loss": 1.8028, + "step": 17503 + }, + { + "epoch": 5.3726212400245545, + "grad_norm": 0.24257974326610565, + "learning_rate": 4.6404420850727455e-05, + "loss": 1.7842, + "step": 17504 + }, + { + "epoch": 5.37292817679558, + "grad_norm": 0.19444705545902252, + "learning_rate": 4.6399463177369316e-05, + "loss": 1.7296, + "step": 17505 + }, + { + "epoch": 5.373235113566605, + "grad_norm": 0.2068849354982376, + "learning_rate": 4.6394505539593806e-05, + "loss": 1.6949, + "step": 17506 + }, + { + "epoch": 5.3735420503376305, + "grad_norm": 0.21762309968471527, + "learning_rate": 4.638954793744989e-05, + "loss": 1.7556, + "step": 17507 + }, + { + "epoch": 5.373848987108656, + "grad_norm": 0.20791584253311157, + "learning_rate": 4.638459037098659e-05, + "loss": 1.7442, + "step": 17508 + }, + { + "epoch": 5.37415592387968, + "grad_norm": 0.27774497866630554, + "learning_rate": 4.6379632840252875e-05, + "loss": 1.7834, + "step": 17509 + }, + { + "epoch": 5.374462860650706, + "grad_norm": 0.24211421608924866, + "learning_rate": 4.637467534529775e-05, + "loss": 1.819, + "step": 17510 + }, + { + "epoch": 5.374769797421731, + "grad_norm": 0.24857789278030396, + "learning_rate": 4.636971788617022e-05, + "loss": 1.7483, + "step": 17511 + }, + { + "epoch": 5.375076734192756, + "grad_norm": 0.25142937898635864, + "learning_rate": 
4.636476046291925e-05, + "loss": 1.7405, + "step": 17512 + }, + { + "epoch": 5.375383670963782, + "grad_norm": 0.25860801339149475, + "learning_rate": 4.6359803075593846e-05, + "loss": 1.7821, + "step": 17513 + }, + { + "epoch": 5.375690607734807, + "grad_norm": 0.25223109126091003, + "learning_rate": 4.635484572424302e-05, + "loss": 1.738, + "step": 17514 + }, + { + "epoch": 5.3759975445058314, + "grad_norm": 0.22931768000125885, + "learning_rate": 4.634988840891573e-05, + "loss": 1.7717, + "step": 17515 + }, + { + "epoch": 5.376304481276857, + "grad_norm": 0.21371231973171234, + "learning_rate": 4.6344931129661e-05, + "loss": 1.7741, + "step": 17516 + }, + { + "epoch": 5.376611418047882, + "grad_norm": 0.2653632164001465, + "learning_rate": 4.633997388652778e-05, + "loss": 1.7548, + "step": 17517 + }, + { + "epoch": 5.3769183548189075, + "grad_norm": 0.2559951841831207, + "learning_rate": 4.6335016679565094e-05, + "loss": 1.7833, + "step": 17518 + }, + { + "epoch": 5.377225291589933, + "grad_norm": 0.22560031712055206, + "learning_rate": 4.6330059508821914e-05, + "loss": 1.6929, + "step": 17519 + }, + { + "epoch": 5.377532228360957, + "grad_norm": 0.3084852695465088, + "learning_rate": 4.6325102374347255e-05, + "loss": 1.8107, + "step": 17520 + }, + { + "epoch": 5.377839165131983, + "grad_norm": 0.3329267203807831, + "learning_rate": 4.632014527619007e-05, + "loss": 1.6791, + "step": 17521 + }, + { + "epoch": 5.378146101903008, + "grad_norm": 0.26274019479751587, + "learning_rate": 4.631518821439939e-05, + "loss": 1.7187, + "step": 17522 + }, + { + "epoch": 5.378453038674033, + "grad_norm": 0.3769492208957672, + "learning_rate": 4.6310231189024165e-05, + "loss": 1.8366, + "step": 17523 + }, + { + "epoch": 5.378759975445059, + "grad_norm": 0.2503921687602997, + "learning_rate": 4.6305274200113385e-05, + "loss": 1.7281, + "step": 17524 + }, + { + "epoch": 5.379066912216084, + "grad_norm": 0.26305708289146423, + "learning_rate": 4.6300317247716074e-05, + "loss": 
1.7231, + "step": 17525 + }, + { + "epoch": 5.379373848987108, + "grad_norm": 0.31899142265319824, + "learning_rate": 4.629536033188118e-05, + "loss": 1.8025, + "step": 17526 + }, + { + "epoch": 5.379680785758134, + "grad_norm": 0.21400104463100433, + "learning_rate": 4.629040345265772e-05, + "loss": 1.7481, + "step": 17527 + }, + { + "epoch": 5.379987722529159, + "grad_norm": 0.23147371411323547, + "learning_rate": 4.628544661009465e-05, + "loss": 1.7049, + "step": 17528 + }, + { + "epoch": 5.380294659300184, + "grad_norm": 0.21156759560108185, + "learning_rate": 4.628048980424099e-05, + "loss": 1.806, + "step": 17529 + }, + { + "epoch": 5.38060159607121, + "grad_norm": 0.22061556577682495, + "learning_rate": 4.6275533035145685e-05, + "loss": 1.7606, + "step": 17530 + }, + { + "epoch": 5.380908532842234, + "grad_norm": 0.23379987478256226, + "learning_rate": 4.6270576302857774e-05, + "loss": 1.7874, + "step": 17531 + }, + { + "epoch": 5.3812154696132595, + "grad_norm": 0.24738669395446777, + "learning_rate": 4.62656196074262e-05, + "loss": 1.7611, + "step": 17532 + }, + { + "epoch": 5.381522406384285, + "grad_norm": 0.19738905131816864, + "learning_rate": 4.6260662948899974e-05, + "loss": 1.7375, + "step": 17533 + }, + { + "epoch": 5.38182934315531, + "grad_norm": 0.2327810823917389, + "learning_rate": 4.6255706327328044e-05, + "loss": 1.7188, + "step": 17534 + }, + { + "epoch": 5.3821362799263355, + "grad_norm": 0.18944145739078522, + "learning_rate": 4.625074974275944e-05, + "loss": 1.6672, + "step": 17535 + }, + { + "epoch": 5.382443216697361, + "grad_norm": 0.20943734049797058, + "learning_rate": 4.624579319524311e-05, + "loss": 1.7238, + "step": 17536 + }, + { + "epoch": 5.382750153468385, + "grad_norm": 0.2060960829257965, + "learning_rate": 4.6240836684828074e-05, + "loss": 1.744, + "step": 17537 + }, + { + "epoch": 5.383057090239411, + "grad_norm": 0.19089816510677338, + "learning_rate": 4.6235880211563264e-05, + "loss": 1.6884, + "step": 17538 + }, + { + 
"epoch": 5.383364027010436, + "grad_norm": 0.22362665832042694, + "learning_rate": 4.623092377549772e-05, + "loss": 1.7076, + "step": 17539 + }, + { + "epoch": 5.383670963781461, + "grad_norm": 0.19429968297481537, + "learning_rate": 4.622596737668039e-05, + "loss": 1.7315, + "step": 17540 + }, + { + "epoch": 5.383977900552487, + "grad_norm": 0.20481903851032257, + "learning_rate": 4.622101101516024e-05, + "loss": 1.711, + "step": 17541 + }, + { + "epoch": 5.384284837323511, + "grad_norm": 0.19181163609027863, + "learning_rate": 4.6216054690986304e-05, + "loss": 1.6879, + "step": 17542 + }, + { + "epoch": 5.384591774094536, + "grad_norm": 0.23105846345424652, + "learning_rate": 4.6211098404207514e-05, + "loss": 1.7797, + "step": 17543 + }, + { + "epoch": 5.384898710865562, + "grad_norm": 0.2742008864879608, + "learning_rate": 4.6206142154872886e-05, + "loss": 1.7404, + "step": 17544 + }, + { + "epoch": 5.385205647636587, + "grad_norm": 0.2256750613451004, + "learning_rate": 4.6201185943031365e-05, + "loss": 1.7616, + "step": 17545 + }, + { + "epoch": 5.385512584407612, + "grad_norm": 0.23230868577957153, + "learning_rate": 4.6196229768731964e-05, + "loss": 1.7457, + "step": 17546 + }, + { + "epoch": 5.385819521178637, + "grad_norm": 0.2200126200914383, + "learning_rate": 4.6191273632023634e-05, + "loss": 1.7835, + "step": 17547 + }, + { + "epoch": 5.386126457949662, + "grad_norm": 0.21903863549232483, + "learning_rate": 4.6186317532955395e-05, + "loss": 1.7315, + "step": 17548 + }, + { + "epoch": 5.3864333947206875, + "grad_norm": 0.1915556788444519, + "learning_rate": 4.6181361471576186e-05, + "loss": 1.6786, + "step": 17549 + }, + { + "epoch": 5.386740331491713, + "grad_norm": 0.20177799463272095, + "learning_rate": 4.617640544793501e-05, + "loss": 1.7453, + "step": 17550 + }, + { + "epoch": 5.387047268262738, + "grad_norm": 0.2598256766796112, + "learning_rate": 4.617144946208083e-05, + "loss": 1.7931, + "step": 17551 + }, + { + "epoch": 5.387354205033763, + 
"grad_norm": 0.2357153594493866, + "learning_rate": 4.616649351406263e-05, + "loss": 1.7932, + "step": 17552 + }, + { + "epoch": 5.387661141804788, + "grad_norm": 0.2228964865207672, + "learning_rate": 4.616153760392938e-05, + "loss": 1.7725, + "step": 17553 + }, + { + "epoch": 5.387968078575813, + "grad_norm": 0.20811811089515686, + "learning_rate": 4.6156581731730085e-05, + "loss": 1.744, + "step": 17554 + }, + { + "epoch": 5.388275015346839, + "grad_norm": 0.20008429884910583, + "learning_rate": 4.615162589751369e-05, + "loss": 1.6973, + "step": 17555 + }, + { + "epoch": 5.388581952117864, + "grad_norm": 0.20487523078918457, + "learning_rate": 4.614667010132919e-05, + "loss": 1.7712, + "step": 17556 + }, + { + "epoch": 5.388888888888889, + "grad_norm": 0.21279677748680115, + "learning_rate": 4.6141714343225554e-05, + "loss": 1.7783, + "step": 17557 + }, + { + "epoch": 5.389195825659914, + "grad_norm": 0.28035736083984375, + "learning_rate": 4.613675862325174e-05, + "loss": 1.767, + "step": 17558 + }, + { + "epoch": 5.389502762430939, + "grad_norm": 0.27426794171333313, + "learning_rate": 4.613180294145677e-05, + "loss": 1.7909, + "step": 17559 + }, + { + "epoch": 5.389809699201964, + "grad_norm": 0.22420327365398407, + "learning_rate": 4.612684729788957e-05, + "loss": 1.6902, + "step": 17560 + }, + { + "epoch": 5.39011663597299, + "grad_norm": 0.19799382984638214, + "learning_rate": 4.612189169259915e-05, + "loss": 1.7276, + "step": 17561 + }, + { + "epoch": 5.390423572744015, + "grad_norm": 0.2508823573589325, + "learning_rate": 4.611693612563445e-05, + "loss": 1.7445, + "step": 17562 + }, + { + "epoch": 5.3907305095150395, + "grad_norm": 0.20835694670677185, + "learning_rate": 4.611198059704448e-05, + "loss": 1.696, + "step": 17563 + }, + { + "epoch": 5.391037446286065, + "grad_norm": 0.22136010229587555, + "learning_rate": 4.6107025106878176e-05, + "loss": 1.7701, + "step": 17564 + }, + { + "epoch": 5.39134438305709, + "grad_norm": 0.23835612833499908, + 
"learning_rate": 4.610206965518456e-05, + "loss": 1.7494, + "step": 17565 + }, + { + "epoch": 5.3916513198281155, + "grad_norm": 0.26142916083335876, + "learning_rate": 4.6097114242012554e-05, + "loss": 1.7616, + "step": 17566 + }, + { + "epoch": 5.391958256599141, + "grad_norm": 0.3366851806640625, + "learning_rate": 4.6092158867411175e-05, + "loss": 1.7409, + "step": 17567 + }, + { + "epoch": 5.392265193370166, + "grad_norm": 0.2592991292476654, + "learning_rate": 4.608720353142935e-05, + "loss": 1.7469, + "step": 17568 + }, + { + "epoch": 5.392572130141191, + "grad_norm": 0.25810322165489197, + "learning_rate": 4.608224823411608e-05, + "loss": 1.7345, + "step": 17569 + }, + { + "epoch": 5.392879066912216, + "grad_norm": 0.26776888966560364, + "learning_rate": 4.607729297552032e-05, + "loss": 1.7698, + "step": 17570 + }, + { + "epoch": 5.393186003683241, + "grad_norm": 0.21023939549922943, + "learning_rate": 4.607233775569107e-05, + "loss": 1.7681, + "step": 17571 + }, + { + "epoch": 5.393492940454267, + "grad_norm": 0.24452096223831177, + "learning_rate": 4.6067382574677265e-05, + "loss": 1.8154, + "step": 17572 + }, + { + "epoch": 5.393799877225292, + "grad_norm": 0.27084338665008545, + "learning_rate": 4.606242743252791e-05, + "loss": 1.7106, + "step": 17573 + }, + { + "epoch": 5.394106813996316, + "grad_norm": 0.24783825874328613, + "learning_rate": 4.605747232929195e-05, + "loss": 1.713, + "step": 17574 + }, + { + "epoch": 5.394413750767342, + "grad_norm": 0.2528151869773865, + "learning_rate": 4.6052517265018333e-05, + "loss": 1.8475, + "step": 17575 + }, + { + "epoch": 5.394720687538367, + "grad_norm": 0.24361065030097961, + "learning_rate": 4.604756223975609e-05, + "loss": 1.7414, + "step": 17576 + }, + { + "epoch": 5.395027624309392, + "grad_norm": 0.2751234769821167, + "learning_rate": 4.604260725355412e-05, + "loss": 1.7603, + "step": 17577 + }, + { + "epoch": 5.395334561080418, + "grad_norm": 0.23183637857437134, + "learning_rate": 
4.603765230646146e-05, + "loss": 1.7053, + "step": 17578 + }, + { + "epoch": 5.395641497851442, + "grad_norm": 0.27462145686149597, + "learning_rate": 4.6032697398527005e-05, + "loss": 1.746, + "step": 17579 + }, + { + "epoch": 5.3959484346224675, + "grad_norm": 0.3665321171283722, + "learning_rate": 4.602774252979978e-05, + "loss": 1.6883, + "step": 17580 + }, + { + "epoch": 5.396255371393493, + "grad_norm": 0.22438424825668335, + "learning_rate": 4.602278770032872e-05, + "loss": 1.7473, + "step": 17581 + }, + { + "epoch": 5.396562308164518, + "grad_norm": 0.38713687658309937, + "learning_rate": 4.601783291016282e-05, + "loss": 1.7993, + "step": 17582 + }, + { + "epoch": 5.3968692449355435, + "grad_norm": 0.3399868905544281, + "learning_rate": 4.6012878159351015e-05, + "loss": 1.7709, + "step": 17583 + }, + { + "epoch": 5.397176181706568, + "grad_norm": 0.21916119754314423, + "learning_rate": 4.60079234479423e-05, + "loss": 1.7351, + "step": 17584 + }, + { + "epoch": 5.397483118477593, + "grad_norm": 0.3796394467353821, + "learning_rate": 4.600296877598561e-05, + "loss": 1.7534, + "step": 17585 + }, + { + "epoch": 5.397790055248619, + "grad_norm": 0.27824562788009644, + "learning_rate": 4.599801414352993e-05, + "loss": 1.6962, + "step": 17586 + }, + { + "epoch": 5.398096992019644, + "grad_norm": 0.21037112176418304, + "learning_rate": 4.599305955062421e-05, + "loss": 1.7062, + "step": 17587 + }, + { + "epoch": 5.398403928790669, + "grad_norm": 0.3373035192489624, + "learning_rate": 4.598810499731745e-05, + "loss": 1.8263, + "step": 17588 + }, + { + "epoch": 5.398710865561695, + "grad_norm": 0.2560507357120514, + "learning_rate": 4.5983150483658564e-05, + "loss": 1.7232, + "step": 17589 + }, + { + "epoch": 5.399017802332719, + "grad_norm": 0.23010993003845215, + "learning_rate": 4.5978196009696564e-05, + "loss": 1.805, + "step": 17590 + }, + { + "epoch": 5.399324739103744, + "grad_norm": 0.32955634593963623, + "learning_rate": 4.597324157548037e-05, + "loss": 
1.7018, + "step": 17591 + }, + { + "epoch": 5.39963167587477, + "grad_norm": 0.2534363865852356, + "learning_rate": 4.5968287181058953e-05, + "loss": 1.6919, + "step": 17592 + }, + { + "epoch": 5.399938612645795, + "grad_norm": 0.23179130256175995, + "learning_rate": 4.5963332826481314e-05, + "loss": 1.7237, + "step": 17593 + }, + { + "epoch": 5.4002455494168204, + "grad_norm": 0.37712663412094116, + "learning_rate": 4.5958378511796365e-05, + "loss": 1.7694, + "step": 17594 + }, + { + "epoch": 5.400552486187845, + "grad_norm": 0.21228717267513275, + "learning_rate": 4.59534242370531e-05, + "loss": 1.7528, + "step": 17595 + }, + { + "epoch": 5.40085942295887, + "grad_norm": 0.2818812429904938, + "learning_rate": 4.5948470002300454e-05, + "loss": 1.8214, + "step": 17596 + }, + { + "epoch": 5.401166359729896, + "grad_norm": 0.24916675686836243, + "learning_rate": 4.5943515807587415e-05, + "loss": 1.7792, + "step": 17597 + }, + { + "epoch": 5.401473296500921, + "grad_norm": 0.2096913456916809, + "learning_rate": 4.593856165296291e-05, + "loss": 1.6983, + "step": 17598 + }, + { + "epoch": 5.401780233271946, + "grad_norm": 0.271124005317688, + "learning_rate": 4.593360753847595e-05, + "loss": 1.7534, + "step": 17599 + }, + { + "epoch": 5.402087170042972, + "grad_norm": 0.24798092246055603, + "learning_rate": 4.5928653464175435e-05, + "loss": 1.7783, + "step": 17600 + }, + { + "epoch": 5.402394106813996, + "grad_norm": 0.3531748056411743, + "learning_rate": 4.592369943011038e-05, + "loss": 1.7834, + "step": 17601 + }, + { + "epoch": 5.402701043585021, + "grad_norm": 0.29650232195854187, + "learning_rate": 4.591874543632969e-05, + "loss": 1.7186, + "step": 17602 + }, + { + "epoch": 5.403007980356047, + "grad_norm": 0.25578248500823975, + "learning_rate": 4.591379148288236e-05, + "loss": 1.7849, + "step": 17603 + }, + { + "epoch": 5.403314917127072, + "grad_norm": 0.3790532946586609, + "learning_rate": 4.590883756981733e-05, + "loss": 1.7192, + "step": 17604 + }, + { + 
"epoch": 5.403621853898097, + "grad_norm": 0.23684249818325043, + "learning_rate": 4.590388369718359e-05, + "loss": 1.7171, + "step": 17605 + }, + { + "epoch": 5.403928790669122, + "grad_norm": 0.267702579498291, + "learning_rate": 4.589892986503005e-05, + "loss": 1.7181, + "step": 17606 + }, + { + "epoch": 5.404235727440147, + "grad_norm": 0.29105648398399353, + "learning_rate": 4.5893976073405704e-05, + "loss": 1.7395, + "step": 17607 + }, + { + "epoch": 5.4045426642111725, + "grad_norm": 0.2266589254140854, + "learning_rate": 4.588902232235949e-05, + "loss": 1.7244, + "step": 17608 + }, + { + "epoch": 5.404849600982198, + "grad_norm": 0.24065524339675903, + "learning_rate": 4.588406861194035e-05, + "loss": 1.7398, + "step": 17609 + }, + { + "epoch": 5.405156537753223, + "grad_norm": 0.23166650533676147, + "learning_rate": 4.587911494219728e-05, + "loss": 1.7592, + "step": 17610 + }, + { + "epoch": 5.4054634745242485, + "grad_norm": 0.19882038235664368, + "learning_rate": 4.5874161313179186e-05, + "loss": 1.7087, + "step": 17611 + }, + { + "epoch": 5.405770411295273, + "grad_norm": 0.2688273787498474, + "learning_rate": 4.5869207724935076e-05, + "loss": 1.7791, + "step": 17612 + }, + { + "epoch": 5.406077348066298, + "grad_norm": 0.1970982402563095, + "learning_rate": 4.5864254177513855e-05, + "loss": 1.7079, + "step": 17613 + }, + { + "epoch": 5.406384284837324, + "grad_norm": 0.2531265318393707, + "learning_rate": 4.585930067096451e-05, + "loss": 1.716, + "step": 17614 + }, + { + "epoch": 5.406691221608349, + "grad_norm": 0.2610352337360382, + "learning_rate": 4.585434720533596e-05, + "loss": 1.7133, + "step": 17615 + }, + { + "epoch": 5.406998158379374, + "grad_norm": 0.2420870065689087, + "learning_rate": 4.5849393780677216e-05, + "loss": 1.7044, + "step": 17616 + }, + { + "epoch": 5.407305095150399, + "grad_norm": 0.24078647792339325, + "learning_rate": 4.584444039703717e-05, + "loss": 1.7486, + "step": 17617 + }, + { + "epoch": 5.407612031921424, + 
"grad_norm": 0.19324539601802826, + "learning_rate": 4.583948705446481e-05, + "loss": 1.7439, + "step": 17618 + }, + { + "epoch": 5.407918968692449, + "grad_norm": 0.2311750054359436, + "learning_rate": 4.5834533753009065e-05, + "loss": 1.7794, + "step": 17619 + }, + { + "epoch": 5.408225905463475, + "grad_norm": 0.2554466128349304, + "learning_rate": 4.5829580492718914e-05, + "loss": 1.7146, + "step": 17620 + }, + { + "epoch": 5.4085328422345, + "grad_norm": 0.2679688334465027, + "learning_rate": 4.582462727364328e-05, + "loss": 1.7677, + "step": 17621 + }, + { + "epoch": 5.4088397790055245, + "grad_norm": 0.19292913377285004, + "learning_rate": 4.5819674095831146e-05, + "loss": 1.7544, + "step": 17622 + }, + { + "epoch": 5.40914671577655, + "grad_norm": 0.2146623730659485, + "learning_rate": 4.5814720959331425e-05, + "loss": 1.7182, + "step": 17623 + }, + { + "epoch": 5.409453652547575, + "grad_norm": 0.23098216950893402, + "learning_rate": 4.5809767864193096e-05, + "loss": 1.6844, + "step": 17624 + }, + { + "epoch": 5.4097605893186005, + "grad_norm": 0.22482910752296448, + "learning_rate": 4.5804814810465096e-05, + "loss": 1.7921, + "step": 17625 + }, + { + "epoch": 5.410067526089626, + "grad_norm": 0.22098569571971893, + "learning_rate": 4.579986179819636e-05, + "loss": 1.7419, + "step": 17626 + }, + { + "epoch": 5.41037446286065, + "grad_norm": 0.2131706178188324, + "learning_rate": 4.579490882743588e-05, + "loss": 1.7587, + "step": 17627 + }, + { + "epoch": 5.410681399631676, + "grad_norm": 0.22448734939098358, + "learning_rate": 4.578995589823254e-05, + "loss": 1.6959, + "step": 17628 + }, + { + "epoch": 5.410988336402701, + "grad_norm": 0.22372964024543762, + "learning_rate": 4.578500301063536e-05, + "loss": 1.7462, + "step": 17629 + }, + { + "epoch": 5.411295273173726, + "grad_norm": 0.22140730917453766, + "learning_rate": 4.578005016469322e-05, + "loss": 1.8348, + "step": 17630 + }, + { + "epoch": 5.411602209944752, + "grad_norm": 0.21697622537612915, + 
"learning_rate": 4.577509736045511e-05, + "loss": 1.7634, + "step": 17631 + }, + { + "epoch": 5.411909146715777, + "grad_norm": 0.2044363021850586, + "learning_rate": 4.5770144597969954e-05, + "loss": 1.7095, + "step": 17632 + }, + { + "epoch": 5.412216083486801, + "grad_norm": 0.1910451501607895, + "learning_rate": 4.576519187728674e-05, + "loss": 1.7022, + "step": 17633 + }, + { + "epoch": 5.412523020257827, + "grad_norm": 0.21787554025650024, + "learning_rate": 4.576023919845434e-05, + "loss": 1.7206, + "step": 17634 + }, + { + "epoch": 5.412829957028852, + "grad_norm": 0.2363428920507431, + "learning_rate": 4.575528656152178e-05, + "loss": 1.8052, + "step": 17635 + }, + { + "epoch": 5.413136893799877, + "grad_norm": 0.22830195724964142, + "learning_rate": 4.575033396653793e-05, + "loss": 1.7432, + "step": 17636 + }, + { + "epoch": 5.413443830570903, + "grad_norm": 0.24867239594459534, + "learning_rate": 4.5745381413551794e-05, + "loss": 1.7011, + "step": 17637 + }, + { + "epoch": 5.413750767341927, + "grad_norm": 0.19329775869846344, + "learning_rate": 4.574042890261228e-05, + "loss": 1.7749, + "step": 17638 + }, + { + "epoch": 5.4140577041129525, + "grad_norm": 0.22917115688323975, + "learning_rate": 4.573547643376836e-05, + "loss": 1.7478, + "step": 17639 + }, + { + "epoch": 5.414364640883978, + "grad_norm": 0.23882724344730377, + "learning_rate": 4.573052400706894e-05, + "loss": 1.7396, + "step": 17640 + }, + { + "epoch": 5.414671577655003, + "grad_norm": 0.19127070903778076, + "learning_rate": 4.572557162256301e-05, + "loss": 1.6791, + "step": 17641 + }, + { + "epoch": 5.4149785144260285, + "grad_norm": 0.18385560810565948, + "learning_rate": 4.5720619280299475e-05, + "loss": 1.7288, + "step": 17642 + }, + { + "epoch": 5.415285451197054, + "grad_norm": 0.19845189154148102, + "learning_rate": 4.571566698032728e-05, + "loss": 1.7525, + "step": 17643 + }, + { + "epoch": 5.415592387968078, + "grad_norm": 0.18987210094928741, + "learning_rate": 
4.571071472269539e-05, + "loss": 1.7253, + "step": 17644 + }, + { + "epoch": 5.415899324739104, + "grad_norm": 0.18257199227809906, + "learning_rate": 4.570576250745271e-05, + "loss": 1.7051, + "step": 17645 + }, + { + "epoch": 5.416206261510129, + "grad_norm": 0.22803467512130737, + "learning_rate": 4.570081033464823e-05, + "loss": 1.7478, + "step": 17646 + }, + { + "epoch": 5.416513198281154, + "grad_norm": 0.18763841688632965, + "learning_rate": 4.569585820433084e-05, + "loss": 1.7316, + "step": 17647 + }, + { + "epoch": 5.41682013505218, + "grad_norm": 0.23974654078483582, + "learning_rate": 4.56909061165495e-05, + "loss": 1.7566, + "step": 17648 + }, + { + "epoch": 5.417127071823204, + "grad_norm": 0.24336253106594086, + "learning_rate": 4.568595407135315e-05, + "loss": 1.7468, + "step": 17649 + }, + { + "epoch": 5.417434008594229, + "grad_norm": 0.23891226947307587, + "learning_rate": 4.5681002068790755e-05, + "loss": 1.7201, + "step": 17650 + }, + { + "epoch": 5.417740945365255, + "grad_norm": 0.19209685921669006, + "learning_rate": 4.56760501089112e-05, + "loss": 1.713, + "step": 17651 + }, + { + "epoch": 5.41804788213628, + "grad_norm": 0.2407880276441574, + "learning_rate": 4.567109819176349e-05, + "loss": 1.7073, + "step": 17652 + }, + { + "epoch": 5.418354818907305, + "grad_norm": 0.2385055273771286, + "learning_rate": 4.5666146317396485e-05, + "loss": 1.7387, + "step": 17653 + }, + { + "epoch": 5.41866175567833, + "grad_norm": 0.22068475186824799, + "learning_rate": 4.566119448585918e-05, + "loss": 1.7116, + "step": 17654 + }, + { + "epoch": 5.418968692449355, + "grad_norm": 0.318375825881958, + "learning_rate": 4.5656242697200496e-05, + "loss": 1.7659, + "step": 17655 + }, + { + "epoch": 5.4192756292203805, + "grad_norm": 0.25311973690986633, + "learning_rate": 4.5651290951469366e-05, + "loss": 1.7814, + "step": 17656 + }, + { + "epoch": 5.419582565991406, + "grad_norm": 0.18701443076133728, + "learning_rate": 4.5646339248714735e-05, + "loss": 1.6993, 
+ "step": 17657 + }, + { + "epoch": 5.419889502762431, + "grad_norm": 0.2964496314525604, + "learning_rate": 4.5641387588985516e-05, + "loss": 1.8254, + "step": 17658 + }, + { + "epoch": 5.420196439533456, + "grad_norm": 0.19447220861911774, + "learning_rate": 4.563643597233067e-05, + "loss": 1.7208, + "step": 17659 + }, + { + "epoch": 5.420503376304481, + "grad_norm": 0.21666039526462555, + "learning_rate": 4.5631484398799105e-05, + "loss": 1.6695, + "step": 17660 + }, + { + "epoch": 5.420810313075506, + "grad_norm": 0.23104412853717804, + "learning_rate": 4.5626532868439796e-05, + "loss": 1.7449, + "step": 17661 + }, + { + "epoch": 5.421117249846532, + "grad_norm": 0.20463459193706512, + "learning_rate": 4.562158138130163e-05, + "loss": 1.6714, + "step": 17662 + }, + { + "epoch": 5.421424186617557, + "grad_norm": 0.21948079764842987, + "learning_rate": 4.561662993743359e-05, + "loss": 1.6957, + "step": 17663 + }, + { + "epoch": 5.421731123388582, + "grad_norm": 0.2672746777534485, + "learning_rate": 4.561167853688455e-05, + "loss": 1.7137, + "step": 17664 + }, + { + "epoch": 5.422038060159607, + "grad_norm": 0.2652325928211212, + "learning_rate": 4.5606727179703493e-05, + "loss": 1.7943, + "step": 17665 + }, + { + "epoch": 5.422344996930632, + "grad_norm": 0.17761313915252686, + "learning_rate": 4.560177586593933e-05, + "loss": 1.7072, + "step": 17666 + }, + { + "epoch": 5.422651933701657, + "grad_norm": 0.24759770929813385, + "learning_rate": 4.5596824595641e-05, + "loss": 1.7807, + "step": 17667 + }, + { + "epoch": 5.422958870472683, + "grad_norm": 0.22191929817199707, + "learning_rate": 4.5591873368857416e-05, + "loss": 1.7668, + "step": 17668 + }, + { + "epoch": 5.423265807243708, + "grad_norm": 0.21293842792510986, + "learning_rate": 4.5586922185637546e-05, + "loss": 1.7304, + "step": 17669 + }, + { + "epoch": 5.4235727440147325, + "grad_norm": 0.2646051049232483, + "learning_rate": 4.5581971046030277e-05, + "loss": 1.7258, + "step": 17670 + }, + { + 
"epoch": 5.423879680785758, + "grad_norm": 0.1894550621509552, + "learning_rate": 4.5577019950084574e-05, + "loss": 1.7066, + "step": 17671 + }, + { + "epoch": 5.424186617556783, + "grad_norm": 0.2533467710018158, + "learning_rate": 4.557206889784934e-05, + "loss": 1.7668, + "step": 17672 + }, + { + "epoch": 5.4244935543278086, + "grad_norm": 0.1972150355577469, + "learning_rate": 4.556711788937352e-05, + "loss": 1.7306, + "step": 17673 + }, + { + "epoch": 5.424800491098834, + "grad_norm": 0.2726735472679138, + "learning_rate": 4.5562166924706054e-05, + "loss": 1.7281, + "step": 17674 + }, + { + "epoch": 5.425107427869859, + "grad_norm": 0.2244454175233841, + "learning_rate": 4.555721600389584e-05, + "loss": 1.7461, + "step": 17675 + }, + { + "epoch": 5.425414364640884, + "grad_norm": 0.19486510753631592, + "learning_rate": 4.555226512699182e-05, + "loss": 1.7361, + "step": 17676 + }, + { + "epoch": 5.425721301411909, + "grad_norm": 0.18128283321857452, + "learning_rate": 4.554731429404293e-05, + "loss": 1.7637, + "step": 17677 + }, + { + "epoch": 5.426028238182934, + "grad_norm": 0.24709749221801758, + "learning_rate": 4.5542363505098084e-05, + "loss": 1.7928, + "step": 17678 + }, + { + "epoch": 5.42633517495396, + "grad_norm": 0.2236633151769638, + "learning_rate": 4.553741276020621e-05, + "loss": 1.8262, + "step": 17679 + }, + { + "epoch": 5.426642111724985, + "grad_norm": 0.2592087984085083, + "learning_rate": 4.553246205941626e-05, + "loss": 1.675, + "step": 17680 + }, + { + "epoch": 5.4269490484960095, + "grad_norm": 0.27751871943473816, + "learning_rate": 4.552751140277712e-05, + "loss": 1.7344, + "step": 17681 + }, + { + "epoch": 5.427255985267035, + "grad_norm": 0.23752287030220032, + "learning_rate": 4.5522560790337746e-05, + "loss": 1.7748, + "step": 17682 + }, + { + "epoch": 5.42756292203806, + "grad_norm": 0.3259925842285156, + "learning_rate": 4.5517610222147035e-05, + "loss": 1.7855, + "step": 17683 + }, + { + "epoch": 5.4278698588090855, + 
"grad_norm": 0.2579646706581116, + "learning_rate": 4.551265969825394e-05, + "loss": 1.7978, + "step": 17684 + }, + { + "epoch": 5.428176795580111, + "grad_norm": 0.3217744827270508, + "learning_rate": 4.550770921870735e-05, + "loss": 1.7793, + "step": 17685 + }, + { + "epoch": 5.428483732351136, + "grad_norm": 0.2930903434753418, + "learning_rate": 4.550275878355624e-05, + "loss": 1.7226, + "step": 17686 + }, + { + "epoch": 5.428790669122161, + "grad_norm": 0.1982879489660263, + "learning_rate": 4.549780839284948e-05, + "loss": 1.6841, + "step": 17687 + }, + { + "epoch": 5.429097605893186, + "grad_norm": 0.20843900740146637, + "learning_rate": 4.5492858046636046e-05, + "loss": 1.7201, + "step": 17688 + }, + { + "epoch": 5.429404542664211, + "grad_norm": 0.23116534948349, + "learning_rate": 4.5487907744964794e-05, + "loss": 1.7565, + "step": 17689 + }, + { + "epoch": 5.429711479435237, + "grad_norm": 0.19177772104740143, + "learning_rate": 4.548295748788471e-05, + "loss": 1.7479, + "step": 17690 + }, + { + "epoch": 5.430018416206262, + "grad_norm": 0.22261449694633484, + "learning_rate": 4.547800727544469e-05, + "loss": 1.7785, + "step": 17691 + }, + { + "epoch": 5.430325352977286, + "grad_norm": 0.20073406398296356, + "learning_rate": 4.547305710769363e-05, + "loss": 1.741, + "step": 17692 + }, + { + "epoch": 5.430632289748312, + "grad_norm": 0.21662208437919617, + "learning_rate": 4.546810698468049e-05, + "loss": 1.7269, + "step": 17693 + }, + { + "epoch": 5.430939226519337, + "grad_norm": 0.19540879130363464, + "learning_rate": 4.546315690645416e-05, + "loss": 1.7141, + "step": 17694 + }, + { + "epoch": 5.431246163290362, + "grad_norm": 0.20063656568527222, + "learning_rate": 4.545820687306358e-05, + "loss": 1.7244, + "step": 17695 + }, + { + "epoch": 5.431553100061388, + "grad_norm": 0.2172660082578659, + "learning_rate": 4.545325688455765e-05, + "loss": 1.7172, + "step": 17696 + }, + { + "epoch": 5.431860036832412, + "grad_norm": 0.2480388581752777, + 
"learning_rate": 4.5448306940985326e-05, + "loss": 1.6994, + "step": 17697 + }, + { + "epoch": 5.4321669736034375, + "grad_norm": 0.22499477863311768, + "learning_rate": 4.544335704239547e-05, + "loss": 1.7405, + "step": 17698 + }, + { + "epoch": 5.432473910374463, + "grad_norm": 0.20655590295791626, + "learning_rate": 4.5438407188837065e-05, + "loss": 1.6867, + "step": 17699 + }, + { + "epoch": 5.432780847145488, + "grad_norm": 0.2045906037092209, + "learning_rate": 4.543345738035896e-05, + "loss": 1.7752, + "step": 17700 + }, + { + "epoch": 5.4330877839165135, + "grad_norm": 0.2092052847146988, + "learning_rate": 4.542850761701013e-05, + "loss": 1.7389, + "step": 17701 + }, + { + "epoch": 5.433394720687538, + "grad_norm": 0.1943730264902115, + "learning_rate": 4.5423557898839446e-05, + "loss": 1.7276, + "step": 17702 + }, + { + "epoch": 5.433701657458563, + "grad_norm": 0.23487289249897003, + "learning_rate": 4.541860822589587e-05, + "loss": 1.8119, + "step": 17703 + }, + { + "epoch": 5.434008594229589, + "grad_norm": 0.204689159989357, + "learning_rate": 4.541365859822827e-05, + "loss": 1.7865, + "step": 17704 + }, + { + "epoch": 5.434315531000614, + "grad_norm": 0.20850931107997894, + "learning_rate": 4.5408709015885604e-05, + "loss": 1.7733, + "step": 17705 + }, + { + "epoch": 5.434622467771639, + "grad_norm": 0.18685877323150635, + "learning_rate": 4.540375947891675e-05, + "loss": 1.7526, + "step": 17706 + }, + { + "epoch": 5.434929404542665, + "grad_norm": 0.2009890079498291, + "learning_rate": 4.539880998737064e-05, + "loss": 1.6904, + "step": 17707 + }, + { + "epoch": 5.435236341313689, + "grad_norm": 0.16602718830108643, + "learning_rate": 4.5393860541296205e-05, + "loss": 1.689, + "step": 17708 + }, + { + "epoch": 5.435543278084714, + "grad_norm": 0.24318818747997284, + "learning_rate": 4.5388911140742315e-05, + "loss": 1.7993, + "step": 17709 + }, + { + "epoch": 5.43585021485574, + "grad_norm": 0.24094417691230774, + "learning_rate": 
4.538396178575793e-05, + "loss": 1.7235, + "step": 17710 + }, + { + "epoch": 5.436157151626765, + "grad_norm": 0.20361751317977905, + "learning_rate": 4.537901247639192e-05, + "loss": 1.7198, + "step": 17711 + }, + { + "epoch": 5.43646408839779, + "grad_norm": 0.2563718259334564, + "learning_rate": 4.537406321269323e-05, + "loss": 1.795, + "step": 17712 + }, + { + "epoch": 5.436771025168815, + "grad_norm": 0.29895591735839844, + "learning_rate": 4.536911399471075e-05, + "loss": 1.7515, + "step": 17713 + }, + { + "epoch": 5.43707796193984, + "grad_norm": 0.22535841166973114, + "learning_rate": 4.536416482249342e-05, + "loss": 1.6998, + "step": 17714 + }, + { + "epoch": 5.4373848987108655, + "grad_norm": 0.26025068759918213, + "learning_rate": 4.53592156960901e-05, + "loss": 1.7821, + "step": 17715 + }, + { + "epoch": 5.437691835481891, + "grad_norm": 0.3473168611526489, + "learning_rate": 4.535426661554975e-05, + "loss": 1.7035, + "step": 17716 + }, + { + "epoch": 5.437998772252916, + "grad_norm": 0.22207199037075043, + "learning_rate": 4.534931758092126e-05, + "loss": 1.7485, + "step": 17717 + }, + { + "epoch": 5.4383057090239415, + "grad_norm": 0.26839709281921387, + "learning_rate": 4.534436859225353e-05, + "loss": 1.7272, + "step": 17718 + }, + { + "epoch": 5.438612645794966, + "grad_norm": 0.37715891003608704, + "learning_rate": 4.5339419649595476e-05, + "loss": 1.7254, + "step": 17719 + }, + { + "epoch": 5.438919582565991, + "grad_norm": 0.21485768258571625, + "learning_rate": 4.533447075299603e-05, + "loss": 1.7349, + "step": 17720 + }, + { + "epoch": 5.439226519337017, + "grad_norm": 0.29502415657043457, + "learning_rate": 4.5329521902504055e-05, + "loss": 1.7325, + "step": 17721 + }, + { + "epoch": 5.439533456108042, + "grad_norm": 0.29448410868644714, + "learning_rate": 4.5324573098168505e-05, + "loss": 1.768, + "step": 17722 + }, + { + "epoch": 5.439840392879067, + "grad_norm": 0.1892058402299881, + "learning_rate": 4.5319624340038244e-05, + "loss": 
1.6866, + "step": 17723 + }, + { + "epoch": 5.440147329650092, + "grad_norm": 0.3365040123462677, + "learning_rate": 4.531467562816221e-05, + "loss": 1.7662, + "step": 17724 + }, + { + "epoch": 5.440454266421117, + "grad_norm": 0.2960789203643799, + "learning_rate": 4.53097269625893e-05, + "loss": 1.746, + "step": 17725 + }, + { + "epoch": 5.440761203192142, + "grad_norm": 0.21623700857162476, + "learning_rate": 4.530477834336841e-05, + "loss": 1.7619, + "step": 17726 + }, + { + "epoch": 5.441068139963168, + "grad_norm": 0.29010120034217834, + "learning_rate": 4.5299829770548456e-05, + "loss": 1.717, + "step": 17727 + }, + { + "epoch": 5.441375076734193, + "grad_norm": 0.18467605113983154, + "learning_rate": 4.529488124417833e-05, + "loss": 1.6938, + "step": 17728 + }, + { + "epoch": 5.4416820135052175, + "grad_norm": 0.2875411808490753, + "learning_rate": 4.528993276430695e-05, + "loss": 1.7633, + "step": 17729 + }, + { + "epoch": 5.441988950276243, + "grad_norm": 0.24252675473690033, + "learning_rate": 4.528498433098321e-05, + "loss": 1.6477, + "step": 17730 + }, + { + "epoch": 5.442295887047268, + "grad_norm": 0.18885886669158936, + "learning_rate": 4.5280035944256035e-05, + "loss": 1.7241, + "step": 17731 + }, + { + "epoch": 5.4426028238182935, + "grad_norm": 0.2594204246997833, + "learning_rate": 4.527508760417429e-05, + "loss": 1.6697, + "step": 17732 + }, + { + "epoch": 5.442909760589319, + "grad_norm": 0.23796287178993225, + "learning_rate": 4.527013931078692e-05, + "loss": 1.7035, + "step": 17733 + }, + { + "epoch": 5.443216697360343, + "grad_norm": 0.2591552436351776, + "learning_rate": 4.5265191064142787e-05, + "loss": 1.8014, + "step": 17734 + }, + { + "epoch": 5.443523634131369, + "grad_norm": 0.3316073417663574, + "learning_rate": 4.526024286429082e-05, + "loss": 1.752, + "step": 17735 + }, + { + "epoch": 5.443830570902394, + "grad_norm": 0.2409597635269165, + "learning_rate": 4.52552947112799e-05, + "loss": 1.7662, + "step": 17736 + }, + { + "epoch": 
5.444137507673419, + "grad_norm": 0.2896713614463806, + "learning_rate": 4.5250346605158964e-05, + "loss": 1.7168, + "step": 17737 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.30870527029037476, + "learning_rate": 4.524539854597686e-05, + "loss": 1.704, + "step": 17738 + }, + { + "epoch": 5.44475138121547, + "grad_norm": 0.2476932406425476, + "learning_rate": 4.524045053378254e-05, + "loss": 1.7649, + "step": 17739 + }, + { + "epoch": 5.445058317986494, + "grad_norm": 0.2937077283859253, + "learning_rate": 4.5235502568624855e-05, + "loss": 1.7028, + "step": 17740 + }, + { + "epoch": 5.44536525475752, + "grad_norm": 0.22881117463111877, + "learning_rate": 4.523055465055273e-05, + "loss": 1.7539, + "step": 17741 + }, + { + "epoch": 5.445672191528545, + "grad_norm": 0.2551842927932739, + "learning_rate": 4.522560677961508e-05, + "loss": 1.7601, + "step": 17742 + }, + { + "epoch": 5.44597912829957, + "grad_norm": 0.27533504366874695, + "learning_rate": 4.5220658955860754e-05, + "loss": 1.7695, + "step": 17743 + }, + { + "epoch": 5.446286065070596, + "grad_norm": 0.23387418687343597, + "learning_rate": 4.5215711179338706e-05, + "loss": 1.7218, + "step": 17744 + }, + { + "epoch": 5.44659300184162, + "grad_norm": 0.37932485342025757, + "learning_rate": 4.521076345009777e-05, + "loss": 1.7685, + "step": 17745 + }, + { + "epoch": 5.4468999386126455, + "grad_norm": 0.2668898105621338, + "learning_rate": 4.520581576818691e-05, + "loss": 1.7217, + "step": 17746 + }, + { + "epoch": 5.447206875383671, + "grad_norm": 0.2417856752872467, + "learning_rate": 4.520086813365496e-05, + "loss": 1.692, + "step": 17747 + }, + { + "epoch": 5.447513812154696, + "grad_norm": 0.3170008063316345, + "learning_rate": 4.519592054655086e-05, + "loss": 1.7565, + "step": 17748 + }, + { + "epoch": 5.4478207489257215, + "grad_norm": 0.20711660385131836, + "learning_rate": 4.519097300692348e-05, + "loss": 1.6708, + "step": 17749 + }, + { + "epoch": 5.448127685696747, + "grad_norm": 
0.2196272760629654, + "learning_rate": 4.5186025514821746e-05, + "loss": 1.7335, + "step": 17750 + }, + { + "epoch": 5.448434622467771, + "grad_norm": 0.27563074231147766, + "learning_rate": 4.5181078070294505e-05, + "loss": 1.7383, + "step": 17751 + }, + { + "epoch": 5.448741559238797, + "grad_norm": 0.185418501496315, + "learning_rate": 4.517613067339068e-05, + "loss": 1.6841, + "step": 17752 + }, + { + "epoch": 5.449048496009822, + "grad_norm": 0.26787856221199036, + "learning_rate": 4.517118332415915e-05, + "loss": 1.7733, + "step": 17753 + }, + { + "epoch": 5.449355432780847, + "grad_norm": 0.22114823758602142, + "learning_rate": 4.516623602264885e-05, + "loss": 1.7153, + "step": 17754 + }, + { + "epoch": 5.449662369551873, + "grad_norm": 0.23090483248233795, + "learning_rate": 4.51612887689086e-05, + "loss": 1.7063, + "step": 17755 + }, + { + "epoch": 5.449969306322897, + "grad_norm": 0.3227362632751465, + "learning_rate": 4.515634156298736e-05, + "loss": 1.7528, + "step": 17756 + }, + { + "epoch": 5.4502762430939224, + "grad_norm": 0.24202494323253632, + "learning_rate": 4.515139440493397e-05, + "loss": 1.8119, + "step": 17757 + }, + { + "epoch": 5.450583179864948, + "grad_norm": 0.3778383731842041, + "learning_rate": 4.5146447294797356e-05, + "loss": 1.7589, + "step": 17758 + }, + { + "epoch": 5.450890116635973, + "grad_norm": 0.3726772964000702, + "learning_rate": 4.51415002326264e-05, + "loss": 1.7095, + "step": 17759 + }, + { + "epoch": 5.4511970534069984, + "grad_norm": 0.2424323409795761, + "learning_rate": 4.5136553218469966e-05, + "loss": 1.7374, + "step": 17760 + }, + { + "epoch": 5.451503990178024, + "grad_norm": 0.4347550570964813, + "learning_rate": 4.513160625237699e-05, + "loss": 1.8339, + "step": 17761 + }, + { + "epoch": 5.451810926949048, + "grad_norm": 0.2556018829345703, + "learning_rate": 4.512665933439631e-05, + "loss": 1.7024, + "step": 17762 + }, + { + "epoch": 5.452117863720074, + "grad_norm": 0.36380240321159363, + "learning_rate": 
4.512171246457685e-05, + "loss": 1.7706, + "step": 17763 + }, + { + "epoch": 5.452424800491099, + "grad_norm": 0.42120790481567383, + "learning_rate": 4.5116765642967476e-05, + "loss": 1.7609, + "step": 17764 + }, + { + "epoch": 5.452731737262124, + "grad_norm": 0.20573028922080994, + "learning_rate": 4.51118188696171e-05, + "loss": 1.7521, + "step": 17765 + }, + { + "epoch": 5.45303867403315, + "grad_norm": 0.39001402258872986, + "learning_rate": 4.510687214457458e-05, + "loss": 1.7097, + "step": 17766 + }, + { + "epoch": 5.453345610804174, + "grad_norm": 0.2778739333152771, + "learning_rate": 4.510192546788884e-05, + "loss": 1.7677, + "step": 17767 + }, + { + "epoch": 5.453652547575199, + "grad_norm": 0.2500934600830078, + "learning_rate": 4.509697883960872e-05, + "loss": 1.7322, + "step": 17768 + }, + { + "epoch": 5.453959484346225, + "grad_norm": 0.23733557760715485, + "learning_rate": 4.509203225978314e-05, + "loss": 1.7426, + "step": 17769 + }, + { + "epoch": 5.45426642111725, + "grad_norm": 0.20033739507198334, + "learning_rate": 4.508708572846096e-05, + "loss": 1.7093, + "step": 17770 + }, + { + "epoch": 5.454573357888275, + "grad_norm": 0.202667698264122, + "learning_rate": 4.508213924569111e-05, + "loss": 1.6807, + "step": 17771 + }, + { + "epoch": 5.4548802946593, + "grad_norm": 0.1980566531419754, + "learning_rate": 4.507719281152241e-05, + "loss": 1.7102, + "step": 17772 + }, + { + "epoch": 5.455187231430325, + "grad_norm": 0.20612162351608276, + "learning_rate": 4.507224642600381e-05, + "loss": 1.7692, + "step": 17773 + }, + { + "epoch": 5.4554941682013505, + "grad_norm": 0.22859175503253937, + "learning_rate": 4.506730008918412e-05, + "loss": 1.7887, + "step": 17774 + }, + { + "epoch": 5.455801104972376, + "grad_norm": 0.19720709323883057, + "learning_rate": 4.5062353801112285e-05, + "loss": 1.7557, + "step": 17775 + }, + { + "epoch": 5.456108041743401, + "grad_norm": 0.23289217054843903, + "learning_rate": 4.505740756183717e-05, + "loss": 1.7023, + 
"step": 17776 + }, + { + "epoch": 5.456414978514426, + "grad_norm": 0.2120361477136612, + "learning_rate": 4.505246137140763e-05, + "loss": 1.7249, + "step": 17777 + }, + { + "epoch": 5.456721915285451, + "grad_norm": 0.2094341218471527, + "learning_rate": 4.504751522987259e-05, + "loss": 1.7586, + "step": 17778 + }, + { + "epoch": 5.457028852056476, + "grad_norm": 0.22361092269420624, + "learning_rate": 4.504256913728088e-05, + "loss": 1.737, + "step": 17779 + }, + { + "epoch": 5.457335788827502, + "grad_norm": 0.2100353240966797, + "learning_rate": 4.5037623093681424e-05, + "loss": 1.704, + "step": 17780 + }, + { + "epoch": 5.457642725598527, + "grad_norm": 0.20550231635570526, + "learning_rate": 4.503267709912308e-05, + "loss": 1.7732, + "step": 17781 + }, + { + "epoch": 5.457949662369552, + "grad_norm": 0.22843749821186066, + "learning_rate": 4.502773115365474e-05, + "loss": 1.6916, + "step": 17782 + }, + { + "epoch": 5.458256599140577, + "grad_norm": 0.2351907640695572, + "learning_rate": 4.502278525732526e-05, + "loss": 1.8043, + "step": 17783 + }, + { + "epoch": 5.458563535911602, + "grad_norm": 0.271028071641922, + "learning_rate": 4.501783941018355e-05, + "loss": 1.7665, + "step": 17784 + }, + { + "epoch": 5.458870472682627, + "grad_norm": 0.1974802166223526, + "learning_rate": 4.501289361227846e-05, + "loss": 1.718, + "step": 17785 + }, + { + "epoch": 5.459177409453653, + "grad_norm": 0.23726068437099457, + "learning_rate": 4.5007947863658884e-05, + "loss": 1.7507, + "step": 17786 + }, + { + "epoch": 5.459484346224678, + "grad_norm": 0.2112259715795517, + "learning_rate": 4.5003002164373684e-05, + "loss": 1.8116, + "step": 17787 + }, + { + "epoch": 5.4597912829957025, + "grad_norm": 0.2676105201244354, + "learning_rate": 4.4998056514471764e-05, + "loss": 1.7013, + "step": 17788 + }, + { + "epoch": 5.460098219766728, + "grad_norm": 0.2735576033592224, + "learning_rate": 4.4993110914001956e-05, + "loss": 1.7516, + "step": 17789 + }, + { + "epoch": 
5.460405156537753, + "grad_norm": 0.1925152987241745, + "learning_rate": 4.498816536301319e-05, + "loss": 1.7018, + "step": 17790 + }, + { + "epoch": 5.4607120933087785, + "grad_norm": 0.25037717819213867, + "learning_rate": 4.498321986155429e-05, + "loss": 1.7207, + "step": 17791 + }, + { + "epoch": 5.461019030079804, + "grad_norm": 0.20481008291244507, + "learning_rate": 4.497827440967415e-05, + "loss": 1.6988, + "step": 17792 + }, + { + "epoch": 5.461325966850829, + "grad_norm": 0.19434049725532532, + "learning_rate": 4.4973329007421673e-05, + "loss": 1.7363, + "step": 17793 + }, + { + "epoch": 5.461632903621854, + "grad_norm": 0.21797434985637665, + "learning_rate": 4.496838365484567e-05, + "loss": 1.7218, + "step": 17794 + }, + { + "epoch": 5.461939840392879, + "grad_norm": 0.18477453291416168, + "learning_rate": 4.496343835199508e-05, + "loss": 1.7204, + "step": 17795 + }, + { + "epoch": 5.462246777163904, + "grad_norm": 0.21657803654670715, + "learning_rate": 4.495849309891872e-05, + "loss": 1.7671, + "step": 17796 + }, + { + "epoch": 5.46255371393493, + "grad_norm": 0.21027342975139618, + "learning_rate": 4.495354789566549e-05, + "loss": 1.7424, + "step": 17797 + }, + { + "epoch": 5.462860650705955, + "grad_norm": 0.2016189992427826, + "learning_rate": 4.4948602742284256e-05, + "loss": 1.7706, + "step": 17798 + }, + { + "epoch": 5.463167587476979, + "grad_norm": 0.2155935913324356, + "learning_rate": 4.494365763882391e-05, + "loss": 1.7314, + "step": 17799 + }, + { + "epoch": 5.463474524248005, + "grad_norm": 0.22079701721668243, + "learning_rate": 4.493871258533328e-05, + "loss": 1.7938, + "step": 17800 + }, + { + "epoch": 5.46378146101903, + "grad_norm": 0.1907699704170227, + "learning_rate": 4.4933767581861283e-05, + "loss": 1.6958, + "step": 17801 + }, + { + "epoch": 5.464088397790055, + "grad_norm": 0.2784879207611084, + "learning_rate": 4.4928822628456735e-05, + "loss": 1.7285, + "step": 17802 + }, + { + "epoch": 5.464395334561081, + "grad_norm": 
0.29470255970954895, + "learning_rate": 4.492387772516855e-05, + "loss": 1.7363, + "step": 17803 + }, + { + "epoch": 5.464702271332105, + "grad_norm": 0.21387436985969543, + "learning_rate": 4.4918932872045575e-05, + "loss": 1.7414, + "step": 17804 + }, + { + "epoch": 5.4650092081031305, + "grad_norm": 0.3102552890777588, + "learning_rate": 4.49139880691367e-05, + "loss": 1.7359, + "step": 17805 + }, + { + "epoch": 5.465316144874156, + "grad_norm": 0.2312939465045929, + "learning_rate": 4.490904331649075e-05, + "loss": 1.7609, + "step": 17806 + }, + { + "epoch": 5.465623081645181, + "grad_norm": 0.323913037776947, + "learning_rate": 4.4904098614156645e-05, + "loss": 1.7693, + "step": 17807 + }, + { + "epoch": 5.4659300184162065, + "grad_norm": 0.2975599467754364, + "learning_rate": 4.48991539621832e-05, + "loss": 1.7506, + "step": 17808 + }, + { + "epoch": 5.466236955187231, + "grad_norm": 0.24702571332454681, + "learning_rate": 4.4894209360619316e-05, + "loss": 1.8258, + "step": 17809 + }, + { + "epoch": 5.466543891958256, + "grad_norm": 0.29016581177711487, + "learning_rate": 4.488926480951386e-05, + "loss": 1.7096, + "step": 17810 + }, + { + "epoch": 5.466850828729282, + "grad_norm": 0.2194555252790451, + "learning_rate": 4.488432030891566e-05, + "loss": 1.788, + "step": 17811 + }, + { + "epoch": 5.467157765500307, + "grad_norm": 0.2504041790962219, + "learning_rate": 4.487937585887363e-05, + "loss": 1.7672, + "step": 17812 + }, + { + "epoch": 5.467464702271332, + "grad_norm": 0.2362445741891861, + "learning_rate": 4.487443145943659e-05, + "loss": 1.7426, + "step": 17813 + }, + { + "epoch": 5.467771639042358, + "grad_norm": 0.20075896382331848, + "learning_rate": 4.486948711065343e-05, + "loss": 1.7406, + "step": 17814 + }, + { + "epoch": 5.468078575813382, + "grad_norm": 0.2219153791666031, + "learning_rate": 4.486454281257299e-05, + "loss": 1.683, + "step": 17815 + }, + { + "epoch": 5.468385512584407, + "grad_norm": 0.22551953792572021, + "learning_rate": 
4.4859598565244176e-05, + "loss": 1.7896, + "step": 17816 + }, + { + "epoch": 5.468692449355433, + "grad_norm": 0.2385476976633072, + "learning_rate": 4.48546543687158e-05, + "loss": 1.7799, + "step": 17817 + }, + { + "epoch": 5.468999386126458, + "grad_norm": 0.24263370037078857, + "learning_rate": 4.4849710223036764e-05, + "loss": 1.682, + "step": 17818 + }, + { + "epoch": 5.469306322897483, + "grad_norm": 0.24301160871982574, + "learning_rate": 4.484476612825589e-05, + "loss": 1.8121, + "step": 17819 + }, + { + "epoch": 5.469613259668508, + "grad_norm": 0.2516932487487793, + "learning_rate": 4.483982208442207e-05, + "loss": 1.7344, + "step": 17820 + }, + { + "epoch": 5.469920196439533, + "grad_norm": 0.24309395253658295, + "learning_rate": 4.4834878091584156e-05, + "loss": 1.7746, + "step": 17821 + }, + { + "epoch": 5.4702271332105585, + "grad_norm": 0.24711866676807404, + "learning_rate": 4.4829934149790996e-05, + "loss": 1.7887, + "step": 17822 + }, + { + "epoch": 5.470534069981584, + "grad_norm": 0.2923797369003296, + "learning_rate": 4.4824990259091445e-05, + "loss": 1.7017, + "step": 17823 + }, + { + "epoch": 5.470841006752609, + "grad_norm": 0.21658629179000854, + "learning_rate": 4.482004641953441e-05, + "loss": 1.725, + "step": 17824 + }, + { + "epoch": 5.4711479435236345, + "grad_norm": 0.233424574136734, + "learning_rate": 4.481510263116868e-05, + "loss": 1.74, + "step": 17825 + }, + { + "epoch": 5.471454880294659, + "grad_norm": 0.28997600078582764, + "learning_rate": 4.481015889404315e-05, + "loss": 1.8418, + "step": 17826 + }, + { + "epoch": 5.471761817065684, + "grad_norm": 0.2245558649301529, + "learning_rate": 4.480521520820669e-05, + "loss": 1.7519, + "step": 17827 + }, + { + "epoch": 5.47206875383671, + "grad_norm": 0.21008887887001038, + "learning_rate": 4.480027157370812e-05, + "loss": 1.6977, + "step": 17828 + }, + { + "epoch": 5.472375690607735, + "grad_norm": 0.1990261971950531, + "learning_rate": 4.479532799059633e-05, + "loss": 1.7004, + 
"step": 17829 + }, + { + "epoch": 5.47268262737876, + "grad_norm": 0.2354540079832077, + "learning_rate": 4.479038445892014e-05, + "loss": 1.7755, + "step": 17830 + }, + { + "epoch": 5.472989564149785, + "grad_norm": 0.21904973685741425, + "learning_rate": 4.478544097872843e-05, + "loss": 1.8328, + "step": 17831 + }, + { + "epoch": 5.47329650092081, + "grad_norm": 0.21188503503799438, + "learning_rate": 4.4780497550070055e-05, + "loss": 1.7105, + "step": 17832 + }, + { + "epoch": 5.473603437691835, + "grad_norm": 0.2196870595216751, + "learning_rate": 4.477555417299386e-05, + "loss": 1.7261, + "step": 17833 + }, + { + "epoch": 5.473910374462861, + "grad_norm": 0.24522331357002258, + "learning_rate": 4.477061084754869e-05, + "loss": 1.8101, + "step": 17834 + }, + { + "epoch": 5.474217311233886, + "grad_norm": 0.24073927104473114, + "learning_rate": 4.476566757378343e-05, + "loss": 1.8295, + "step": 17835 + }, + { + "epoch": 5.474524248004911, + "grad_norm": 0.3724605143070221, + "learning_rate": 4.476072435174689e-05, + "loss": 1.7785, + "step": 17836 + }, + { + "epoch": 5.474831184775936, + "grad_norm": 0.25552257895469666, + "learning_rate": 4.475578118148797e-05, + "loss": 1.6978, + "step": 17837 + }, + { + "epoch": 5.475138121546961, + "grad_norm": 0.22402255237102509, + "learning_rate": 4.475083806305546e-05, + "loss": 1.697, + "step": 17838 + }, + { + "epoch": 5.475445058317987, + "grad_norm": 0.25869324803352356, + "learning_rate": 4.474589499649826e-05, + "loss": 1.7026, + "step": 17839 + }, + { + "epoch": 5.475751995089012, + "grad_norm": 0.249742329120636, + "learning_rate": 4.47409519818652e-05, + "loss": 1.7738, + "step": 17840 + }, + { + "epoch": 5.476058931860037, + "grad_norm": 0.28722140192985535, + "learning_rate": 4.473600901920515e-05, + "loss": 1.7555, + "step": 17841 + }, + { + "epoch": 5.476365868631062, + "grad_norm": 0.250964879989624, + "learning_rate": 4.4731066108566926e-05, + "loss": 1.6951, + "step": 17842 + }, + { + "epoch": 
5.476672805402087, + "grad_norm": 0.20562006533145905, + "learning_rate": 4.472612324999942e-05, + "loss": 1.7109, + "step": 17843 + }, + { + "epoch": 5.476979742173112, + "grad_norm": 0.26964858174324036, + "learning_rate": 4.472118044355144e-05, + "loss": 1.7468, + "step": 17844 + }, + { + "epoch": 5.477286678944138, + "grad_norm": 0.25700438022613525, + "learning_rate": 4.471623768927184e-05, + "loss": 1.7046, + "step": 17845 + }, + { + "epoch": 5.477593615715163, + "grad_norm": 0.2152809500694275, + "learning_rate": 4.47112949872095e-05, + "loss": 1.7464, + "step": 17846 + }, + { + "epoch": 5.4779005524861875, + "grad_norm": 0.26429688930511475, + "learning_rate": 4.470635233741321e-05, + "loss": 1.7629, + "step": 17847 + }, + { + "epoch": 5.478207489257213, + "grad_norm": 0.18546637892723083, + "learning_rate": 4.470140973993188e-05, + "loss": 1.7143, + "step": 17848 + }, + { + "epoch": 5.478514426028238, + "grad_norm": 0.1927761435508728, + "learning_rate": 4.46964671948143e-05, + "loss": 1.6919, + "step": 17849 + }, + { + "epoch": 5.4788213627992635, + "grad_norm": 0.21581199765205383, + "learning_rate": 4.469152470210935e-05, + "loss": 1.7596, + "step": 17850 + }, + { + "epoch": 5.479128299570289, + "grad_norm": 0.20244133472442627, + "learning_rate": 4.468658226186586e-05, + "loss": 1.7372, + "step": 17851 + }, + { + "epoch": 5.479435236341313, + "grad_norm": 0.2467198520898819, + "learning_rate": 4.468163987413269e-05, + "loss": 1.7361, + "step": 17852 + }, + { + "epoch": 5.479742173112339, + "grad_norm": 0.22134411334991455, + "learning_rate": 4.467669753895866e-05, + "loss": 1.7276, + "step": 17853 + }, + { + "epoch": 5.480049109883364, + "grad_norm": 0.1953750103712082, + "learning_rate": 4.4671755256392636e-05, + "loss": 1.6931, + "step": 17854 + }, + { + "epoch": 5.480356046654389, + "grad_norm": 0.21492068469524384, + "learning_rate": 4.466681302648343e-05, + "loss": 1.7437, + "step": 17855 + }, + { + "epoch": 5.480662983425415, + "grad_norm": 
0.24377848207950592, + "learning_rate": 4.466187084927993e-05, + "loss": 1.7869, + "step": 17856 + }, + { + "epoch": 5.48096992019644, + "grad_norm": 0.23674219846725464, + "learning_rate": 4.465692872483093e-05, + "loss": 1.8142, + "step": 17857 + }, + { + "epoch": 5.481276856967464, + "grad_norm": 0.25036486983299255, + "learning_rate": 4.4651986653185304e-05, + "loss": 1.8075, + "step": 17858 + }, + { + "epoch": 5.48158379373849, + "grad_norm": 0.32649150490760803, + "learning_rate": 4.4647044634391867e-05, + "loss": 1.7177, + "step": 17859 + }, + { + "epoch": 5.481890730509515, + "grad_norm": 0.20300604403018951, + "learning_rate": 4.46421026684995e-05, + "loss": 1.6912, + "step": 17860 + }, + { + "epoch": 5.48219766728054, + "grad_norm": 0.24630679190158844, + "learning_rate": 4.4637160755557e-05, + "loss": 1.8312, + "step": 17861 + }, + { + "epoch": 5.482504604051566, + "grad_norm": 0.2263093739748001, + "learning_rate": 4.46322188956132e-05, + "loss": 1.7214, + "step": 17862 + }, + { + "epoch": 5.48281154082259, + "grad_norm": 0.22949177026748657, + "learning_rate": 4.462727708871699e-05, + "loss": 1.6882, + "step": 17863 + }, + { + "epoch": 5.4831184775936155, + "grad_norm": 0.23389381170272827, + "learning_rate": 4.4622335334917156e-05, + "loss": 1.7613, + "step": 17864 + }, + { + "epoch": 5.483425414364641, + "grad_norm": 0.2259683907032013, + "learning_rate": 4.461739363426257e-05, + "loss": 1.7021, + "step": 17865 + }, + { + "epoch": 5.483732351135666, + "grad_norm": 0.3213486969470978, + "learning_rate": 4.4612451986802036e-05, + "loss": 1.7469, + "step": 17866 + }, + { + "epoch": 5.4840392879066915, + "grad_norm": 0.3415670096874237, + "learning_rate": 4.4607510392584426e-05, + "loss": 1.7605, + "step": 17867 + }, + { + "epoch": 5.484346224677717, + "grad_norm": 0.2079494297504425, + "learning_rate": 4.460256885165855e-05, + "loss": 1.7832, + "step": 17868 + }, + { + "epoch": 5.484653161448741, + "grad_norm": 0.30334988236427307, + "learning_rate": 
4.459762736407327e-05, + "loss": 1.6825, + "step": 17869 + }, + { + "epoch": 5.484960098219767, + "grad_norm": 0.22320730984210968, + "learning_rate": 4.4592685929877374e-05, + "loss": 1.7452, + "step": 17870 + }, + { + "epoch": 5.485267034990792, + "grad_norm": 0.25325682759284973, + "learning_rate": 4.458774454911975e-05, + "loss": 1.7359, + "step": 17871 + }, + { + "epoch": 5.485573971761817, + "grad_norm": 0.305501788854599, + "learning_rate": 4.458280322184919e-05, + "loss": 1.7161, + "step": 17872 + }, + { + "epoch": 5.485880908532843, + "grad_norm": 0.19486182928085327, + "learning_rate": 4.457786194811455e-05, + "loss": 1.7097, + "step": 17873 + }, + { + "epoch": 5.486187845303867, + "grad_norm": 0.3306363821029663, + "learning_rate": 4.457292072796465e-05, + "loss": 1.7653, + "step": 17874 + }, + { + "epoch": 5.486494782074892, + "grad_norm": 0.25172874331474304, + "learning_rate": 4.456797956144835e-05, + "loss": 1.7289, + "step": 17875 + }, + { + "epoch": 5.486801718845918, + "grad_norm": 0.24508661031723022, + "learning_rate": 4.456303844861444e-05, + "loss": 1.7255, + "step": 17876 + }, + { + "epoch": 5.487108655616943, + "grad_norm": 0.3043360114097595, + "learning_rate": 4.455809738951178e-05, + "loss": 1.7852, + "step": 17877 + }, + { + "epoch": 5.487415592387968, + "grad_norm": 0.22181758284568787, + "learning_rate": 4.4553156384189186e-05, + "loss": 1.7887, + "step": 17878 + }, + { + "epoch": 5.487722529158993, + "grad_norm": 0.2174321413040161, + "learning_rate": 4.454821543269549e-05, + "loss": 1.7024, + "step": 17879 + }, + { + "epoch": 5.488029465930018, + "grad_norm": 0.19634750485420227, + "learning_rate": 4.4543274535079535e-05, + "loss": 1.7451, + "step": 17880 + }, + { + "epoch": 5.4883364027010435, + "grad_norm": 0.20481908321380615, + "learning_rate": 4.4538333691390125e-05, + "loss": 1.7068, + "step": 17881 + }, + { + "epoch": 5.488643339472069, + "grad_norm": 0.2025458663702011, + "learning_rate": 4.453339290167612e-05, + "loss": 
1.72, + "step": 17882 + }, + { + "epoch": 5.488950276243094, + "grad_norm": 0.21013019979000092, + "learning_rate": 4.452845216598632e-05, + "loss": 1.7113, + "step": 17883 + }, + { + "epoch": 5.4892572130141195, + "grad_norm": 0.2057499885559082, + "learning_rate": 4.452351148436956e-05, + "loss": 1.7007, + "step": 17884 + }, + { + "epoch": 5.489564149785144, + "grad_norm": 0.19957664608955383, + "learning_rate": 4.4518570856874666e-05, + "loss": 1.6999, + "step": 17885 + }, + { + "epoch": 5.489871086556169, + "grad_norm": 0.22609412670135498, + "learning_rate": 4.451363028355048e-05, + "loss": 1.8124, + "step": 17886 + }, + { + "epoch": 5.490178023327195, + "grad_norm": 0.27350863814353943, + "learning_rate": 4.4508689764445805e-05, + "loss": 1.8042, + "step": 17887 + }, + { + "epoch": 5.49048496009822, + "grad_norm": 0.23416854441165924, + "learning_rate": 4.450374929960949e-05, + "loss": 1.7607, + "step": 17888 + }, + { + "epoch": 5.490791896869245, + "grad_norm": 0.2891421318054199, + "learning_rate": 4.449880888909033e-05, + "loss": 1.7419, + "step": 17889 + }, + { + "epoch": 5.49109883364027, + "grad_norm": 0.2458745837211609, + "learning_rate": 4.449386853293717e-05, + "loss": 1.7234, + "step": 17890 + }, + { + "epoch": 5.491405770411295, + "grad_norm": 0.23390449583530426, + "learning_rate": 4.4488928231198826e-05, + "loss": 1.7482, + "step": 17891 + }, + { + "epoch": 5.49171270718232, + "grad_norm": 0.3509657084941864, + "learning_rate": 4.448398798392414e-05, + "loss": 1.7639, + "step": 17892 + }, + { + "epoch": 5.492019643953346, + "grad_norm": 0.2487955242395401, + "learning_rate": 4.4479047791161916e-05, + "loss": 1.7163, + "step": 17893 + }, + { + "epoch": 5.492326580724371, + "grad_norm": 0.22630274295806885, + "learning_rate": 4.4474107652960956e-05, + "loss": 1.7449, + "step": 17894 + }, + { + "epoch": 5.4926335174953955, + "grad_norm": 0.25909537076950073, + "learning_rate": 4.446916756937012e-05, + "loss": 1.7396, + "step": 17895 + }, + { + 
"epoch": 5.492940454266421, + "grad_norm": 0.29732683300971985, + "learning_rate": 4.446422754043819e-05, + "loss": 1.8109, + "step": 17896 + }, + { + "epoch": 5.493247391037446, + "grad_norm": 0.22436772286891937, + "learning_rate": 4.4459287566214035e-05, + "loss": 1.7657, + "step": 17897 + }, + { + "epoch": 5.4935543278084715, + "grad_norm": 0.24584892392158508, + "learning_rate": 4.445434764674643e-05, + "loss": 1.73, + "step": 17898 + }, + { + "epoch": 5.493861264579497, + "grad_norm": 0.27446454763412476, + "learning_rate": 4.444940778208423e-05, + "loss": 1.7428, + "step": 17899 + }, + { + "epoch": 5.494168201350522, + "grad_norm": 0.20442110300064087, + "learning_rate": 4.4444467972276215e-05, + "loss": 1.6911, + "step": 17900 + }, + { + "epoch": 5.494475138121547, + "grad_norm": 0.23089268803596497, + "learning_rate": 4.4439528217371236e-05, + "loss": 1.7192, + "step": 17901 + }, + { + "epoch": 5.494782074892572, + "grad_norm": 0.19402450323104858, + "learning_rate": 4.443458851741808e-05, + "loss": 1.7304, + "step": 17902 + }, + { + "epoch": 5.495089011663597, + "grad_norm": 0.2310219705104828, + "learning_rate": 4.442964887246561e-05, + "loss": 1.6963, + "step": 17903 + }, + { + "epoch": 5.495395948434623, + "grad_norm": 0.25573140382766724, + "learning_rate": 4.44247092825626e-05, + "loss": 1.7781, + "step": 17904 + }, + { + "epoch": 5.495702885205648, + "grad_norm": 0.20298753678798676, + "learning_rate": 4.4419769747757894e-05, + "loss": 1.763, + "step": 17905 + }, + { + "epoch": 5.496009821976672, + "grad_norm": 0.22243307530879974, + "learning_rate": 4.441483026810027e-05, + "loss": 1.7345, + "step": 17906 + }, + { + "epoch": 5.496316758747698, + "grad_norm": 0.19801411032676697, + "learning_rate": 4.4409890843638584e-05, + "loss": 1.7504, + "step": 17907 + }, + { + "epoch": 5.496623695518723, + "grad_norm": 0.2804374396800995, + "learning_rate": 4.440495147442162e-05, + "loss": 1.7985, + "step": 17908 + }, + { + "epoch": 5.496930632289748, + 
"grad_norm": 0.21824021637439728, + "learning_rate": 4.440001216049822e-05, + "loss": 1.6703, + "step": 17909 + }, + { + "epoch": 5.497237569060774, + "grad_norm": 0.23335935175418854, + "learning_rate": 4.439507290191719e-05, + "loss": 1.7426, + "step": 17910 + }, + { + "epoch": 5.497544505831799, + "grad_norm": 0.2093769609928131, + "learning_rate": 4.4390133698727315e-05, + "loss": 1.7178, + "step": 17911 + }, + { + "epoch": 5.4978514426028235, + "grad_norm": 0.18354324996471405, + "learning_rate": 4.438519455097743e-05, + "loss": 1.6849, + "step": 17912 + }, + { + "epoch": 5.498158379373849, + "grad_norm": 0.26826491951942444, + "learning_rate": 4.438025545871633e-05, + "loss": 1.7804, + "step": 17913 + }, + { + "epoch": 5.498465316144874, + "grad_norm": 0.29171738028526306, + "learning_rate": 4.437531642199288e-05, + "loss": 1.764, + "step": 17914 + }, + { + "epoch": 5.4987722529158995, + "grad_norm": 0.17870590090751648, + "learning_rate": 4.437037744085581e-05, + "loss": 1.6789, + "step": 17915 + }, + { + "epoch": 5.499079189686925, + "grad_norm": 0.25412192940711975, + "learning_rate": 4.4365438515354e-05, + "loss": 1.7536, + "step": 17916 + }, + { + "epoch": 5.499386126457949, + "grad_norm": 0.24465163052082062, + "learning_rate": 4.4360499645536203e-05, + "loss": 1.7582, + "step": 17917 + }, + { + "epoch": 5.499693063228975, + "grad_norm": 0.21248452365398407, + "learning_rate": 4.4355560831451264e-05, + "loss": 1.7209, + "step": 17918 + }, + { + "epoch": 5.5, + "grad_norm": 0.21018685400485992, + "learning_rate": 4.435062207314797e-05, + "loss": 1.7461, + "step": 17919 + }, + { + "epoch": 5.500306936771025, + "grad_norm": 0.1880551278591156, + "learning_rate": 4.434568337067517e-05, + "loss": 1.6818, + "step": 17920 + }, + { + "epoch": 5.500613873542051, + "grad_norm": 0.2224894016981125, + "learning_rate": 4.434074472408161e-05, + "loss": 1.8211, + "step": 17921 + }, + { + "epoch": 5.500920810313076, + "grad_norm": 0.19419749081134796, + 
"learning_rate": 4.433580613341615e-05, + "loss": 1.7625, + "step": 17922 + }, + { + "epoch": 5.5012277470841005, + "grad_norm": 0.2167430967092514, + "learning_rate": 4.433086759872756e-05, + "loss": 1.745, + "step": 17923 + }, + { + "epoch": 5.501534683855126, + "grad_norm": 0.1926383525133133, + "learning_rate": 4.4325929120064665e-05, + "loss": 1.7353, + "step": 17924 + }, + { + "epoch": 5.501841620626151, + "grad_norm": 0.22943224012851715, + "learning_rate": 4.432099069747625e-05, + "loss": 1.6903, + "step": 17925 + }, + { + "epoch": 5.5021485573971765, + "grad_norm": 0.18218693137168884, + "learning_rate": 4.431605233101116e-05, + "loss": 1.742, + "step": 17926 + }, + { + "epoch": 5.502455494168201, + "grad_norm": 0.2660788893699646, + "learning_rate": 4.431111402071817e-05, + "loss": 1.7208, + "step": 17927 + }, + { + "epoch": 5.502762430939226, + "grad_norm": 0.20015788078308105, + "learning_rate": 4.430617576664606e-05, + "loss": 1.721, + "step": 17928 + }, + { + "epoch": 5.503069367710252, + "grad_norm": 0.20011179149150848, + "learning_rate": 4.430123756884368e-05, + "loss": 1.7488, + "step": 17929 + }, + { + "epoch": 5.503376304481277, + "grad_norm": 0.22541452944278717, + "learning_rate": 4.429629942735979e-05, + "loss": 1.7997, + "step": 17930 + }, + { + "epoch": 5.503683241252302, + "grad_norm": 0.21067193150520325, + "learning_rate": 4.4291361342243236e-05, + "loss": 1.6652, + "step": 17931 + }, + { + "epoch": 5.503990178023328, + "grad_norm": 0.38401395082473755, + "learning_rate": 4.428642331354278e-05, + "loss": 1.815, + "step": 17932 + }, + { + "epoch": 5.504297114794352, + "grad_norm": 0.22600100934505463, + "learning_rate": 4.428148534130725e-05, + "loss": 1.7593, + "step": 17933 + }, + { + "epoch": 5.504604051565377, + "grad_norm": 0.21340666711330414, + "learning_rate": 4.427654742558542e-05, + "loss": 1.7447, + "step": 17934 + }, + { + "epoch": 5.504910988336403, + "grad_norm": 0.20676501095294952, + "learning_rate": 4.427160956642611e-05, 
+ "loss": 1.7174, + "step": 17935 + }, + { + "epoch": 5.505217925107428, + "grad_norm": 0.2374252825975418, + "learning_rate": 4.42666717638781e-05, + "loss": 1.703, + "step": 17936 + }, + { + "epoch": 5.505524861878453, + "grad_norm": 0.20975756645202637, + "learning_rate": 4.426173401799022e-05, + "loss": 1.7076, + "step": 17937 + }, + { + "epoch": 5.505831798649478, + "grad_norm": 0.23778517544269562, + "learning_rate": 4.4256796328811226e-05, + "loss": 1.7647, + "step": 17938 + }, + { + "epoch": 5.506138735420503, + "grad_norm": 0.2088557481765747, + "learning_rate": 4.425185869638996e-05, + "loss": 1.764, + "step": 17939 + }, + { + "epoch": 5.5064456721915285, + "grad_norm": 0.26953455805778503, + "learning_rate": 4.424692112077518e-05, + "loss": 1.7351, + "step": 17940 + }, + { + "epoch": 5.506752608962554, + "grad_norm": 0.2762589454650879, + "learning_rate": 4.42419836020157e-05, + "loss": 1.7051, + "step": 17941 + }, + { + "epoch": 5.507059545733579, + "grad_norm": 0.19611702859401703, + "learning_rate": 4.4237046140160306e-05, + "loss": 1.7445, + "step": 17942 + }, + { + "epoch": 5.5073664825046045, + "grad_norm": 0.2708270251750946, + "learning_rate": 4.4232108735257824e-05, + "loss": 1.7284, + "step": 17943 + }, + { + "epoch": 5.507673419275629, + "grad_norm": 0.24194146692752838, + "learning_rate": 4.422717138735701e-05, + "loss": 1.7302, + "step": 17944 + }, + { + "epoch": 5.507980356046654, + "grad_norm": 0.21558286249637604, + "learning_rate": 4.422223409650666e-05, + "loss": 1.7435, + "step": 17945 + }, + { + "epoch": 5.50828729281768, + "grad_norm": 0.1842707246541977, + "learning_rate": 4.4217296862755597e-05, + "loss": 1.6579, + "step": 17946 + }, + { + "epoch": 5.508594229588705, + "grad_norm": 0.20211941003799438, + "learning_rate": 4.4212359686152576e-05, + "loss": 1.8017, + "step": 17947 + }, + { + "epoch": 5.50890116635973, + "grad_norm": 0.23749016225337982, + "learning_rate": 4.420742256674644e-05, + "loss": 1.6721, + "step": 17948 + }, + 
{ + "epoch": 5.509208103130755, + "grad_norm": 0.2076852172613144, + "learning_rate": 4.420248550458592e-05, + "loss": 1.7102, + "step": 17949 + }, + { + "epoch": 5.50951503990178, + "grad_norm": 0.2599447965621948, + "learning_rate": 4.419754849971986e-05, + "loss": 1.7819, + "step": 17950 + }, + { + "epoch": 5.509821976672805, + "grad_norm": 0.2017187476158142, + "learning_rate": 4.4192611552197e-05, + "loss": 1.6812, + "step": 17951 + }, + { + "epoch": 5.510128913443831, + "grad_norm": 0.21972116827964783, + "learning_rate": 4.418767466206617e-05, + "loss": 1.7122, + "step": 17952 + }, + { + "epoch": 5.510435850214856, + "grad_norm": 0.21750569343566895, + "learning_rate": 4.418273782937613e-05, + "loss": 1.7285, + "step": 17953 + }, + { + "epoch": 5.510742786985881, + "grad_norm": 0.19349125027656555, + "learning_rate": 4.417780105417572e-05, + "loss": 1.7383, + "step": 17954 + }, + { + "epoch": 5.511049723756906, + "grad_norm": 0.2094268798828125, + "learning_rate": 4.417286433651366e-05, + "loss": 1.7107, + "step": 17955 + }, + { + "epoch": 5.511356660527931, + "grad_norm": 0.2684331238269806, + "learning_rate": 4.41679276764388e-05, + "loss": 1.7336, + "step": 17956 + }, + { + "epoch": 5.5116635972989565, + "grad_norm": 0.27616915106773376, + "learning_rate": 4.416299107399987e-05, + "loss": 1.7439, + "step": 17957 + }, + { + "epoch": 5.511970534069982, + "grad_norm": 0.23874540627002716, + "learning_rate": 4.415805452924569e-05, + "loss": 1.7979, + "step": 17958 + }, + { + "epoch": 5.512277470841006, + "grad_norm": 0.21870921552181244, + "learning_rate": 4.415311804222503e-05, + "loss": 1.6674, + "step": 17959 + }, + { + "epoch": 5.512584407612032, + "grad_norm": 0.23042429983615875, + "learning_rate": 4.414818161298671e-05, + "loss": 1.7588, + "step": 17960 + }, + { + "epoch": 5.512891344383057, + "grad_norm": 0.2957153916358948, + "learning_rate": 4.4143245241579486e-05, + "loss": 1.8412, + "step": 17961 + }, + { + "epoch": 5.513198281154082, + 
"grad_norm": 0.28292644023895264, + "learning_rate": 4.413830892805213e-05, + "loss": 1.7915, + "step": 17962 + }, + { + "epoch": 5.513505217925108, + "grad_norm": 0.26526281237602234, + "learning_rate": 4.413337267245344e-05, + "loss": 1.7199, + "step": 17963 + }, + { + "epoch": 5.513812154696133, + "grad_norm": 0.41243693232536316, + "learning_rate": 4.4128436474832204e-05, + "loss": 1.7419, + "step": 17964 + }, + { + "epoch": 5.514119091467157, + "grad_norm": 0.2747771739959717, + "learning_rate": 4.4123500335237214e-05, + "loss": 1.7449, + "step": 17965 + }, + { + "epoch": 5.514426028238183, + "grad_norm": 0.25944122672080994, + "learning_rate": 4.4118564253717216e-05, + "loss": 1.7667, + "step": 17966 + }, + { + "epoch": 5.514732965009208, + "grad_norm": 0.32558533549308777, + "learning_rate": 4.411362823032103e-05, + "loss": 1.7292, + "step": 17967 + }, + { + "epoch": 5.515039901780233, + "grad_norm": 0.20190958678722382, + "learning_rate": 4.4108692265097404e-05, + "loss": 1.7529, + "step": 17968 + }, + { + "epoch": 5.515346838551259, + "grad_norm": 0.35485807061195374, + "learning_rate": 4.410375635809514e-05, + "loss": 1.7335, + "step": 17969 + }, + { + "epoch": 5.515653775322283, + "grad_norm": 0.2670159935951233, + "learning_rate": 4.409882050936301e-05, + "loss": 1.6789, + "step": 17970 + }, + { + "epoch": 5.5159607120933085, + "grad_norm": 0.19106578826904297, + "learning_rate": 4.409388471894981e-05, + "loss": 1.708, + "step": 17971 + }, + { + "epoch": 5.516267648864334, + "grad_norm": 0.2707268297672272, + "learning_rate": 4.4088948986904286e-05, + "loss": 1.7917, + "step": 17972 + }, + { + "epoch": 5.516574585635359, + "grad_norm": 0.2329230159521103, + "learning_rate": 4.408401331327525e-05, + "loss": 1.7378, + "step": 17973 + }, + { + "epoch": 5.5168815224063845, + "grad_norm": 0.22164998948574066, + "learning_rate": 4.4079077698111436e-05, + "loss": 1.7287, + "step": 17974 + }, + { + "epoch": 5.51718845917741, + "grad_norm": 0.25895699858665466, 
+ "learning_rate": 4.4074142141461665e-05, + "loss": 1.7158, + "step": 17975 + }, + { + "epoch": 5.517495395948434, + "grad_norm": 0.2617860436439514, + "learning_rate": 4.4069206643374695e-05, + "loss": 1.7767, + "step": 17976 + }, + { + "epoch": 5.51780233271946, + "grad_norm": 0.20443588495254517, + "learning_rate": 4.40642712038993e-05, + "loss": 1.7371, + "step": 17977 + }, + { + "epoch": 5.518109269490485, + "grad_norm": 0.26251545548439026, + "learning_rate": 4.4059335823084266e-05, + "loss": 1.8154, + "step": 17978 + }, + { + "epoch": 5.51841620626151, + "grad_norm": 0.2315993458032608, + "learning_rate": 4.405440050097833e-05, + "loss": 1.7426, + "step": 17979 + }, + { + "epoch": 5.518723143032536, + "grad_norm": 0.19467706978321075, + "learning_rate": 4.404946523763031e-05, + "loss": 1.7418, + "step": 17980 + }, + { + "epoch": 5.51903007980356, + "grad_norm": 0.2387837916612625, + "learning_rate": 4.4044530033088946e-05, + "loss": 1.7648, + "step": 17981 + }, + { + "epoch": 5.519337016574585, + "grad_norm": 0.21097531914710999, + "learning_rate": 4.403959488740306e-05, + "loss": 1.7198, + "step": 17982 + }, + { + "epoch": 5.519643953345611, + "grad_norm": 0.22303247451782227, + "learning_rate": 4.403465980062136e-05, + "loss": 1.7679, + "step": 17983 + }, + { + "epoch": 5.519950890116636, + "grad_norm": 0.19705620408058167, + "learning_rate": 4.4029724772792666e-05, + "loss": 1.7747, + "step": 17984 + }, + { + "epoch": 5.520257826887661, + "grad_norm": 0.20864570140838623, + "learning_rate": 4.4024789803965715e-05, + "loss": 1.6797, + "step": 17985 + }, + { + "epoch": 5.520564763658687, + "grad_norm": 0.1917724758386612, + "learning_rate": 4.401985489418931e-05, + "loss": 1.7246, + "step": 17986 + }, + { + "epoch": 5.520871700429711, + "grad_norm": 0.25668975710868835, + "learning_rate": 4.401492004351219e-05, + "loss": 1.7245, + "step": 17987 + }, + { + "epoch": 5.5211786372007365, + "grad_norm": 0.22576093673706055, + "learning_rate": 
4.4009985251983146e-05, + "loss": 1.6766, + "step": 17988 + }, + { + "epoch": 5.521485573971762, + "grad_norm": 0.18614664673805237, + "learning_rate": 4.400505051965093e-05, + "loss": 1.7379, + "step": 17989 + }, + { + "epoch": 5.521792510742787, + "grad_norm": 0.21472783386707306, + "learning_rate": 4.4000115846564335e-05, + "loss": 1.7203, + "step": 17990 + }, + { + "epoch": 5.5220994475138125, + "grad_norm": 0.201142817735672, + "learning_rate": 4.39951812327721e-05, + "loss": 1.7049, + "step": 17991 + }, + { + "epoch": 5.522406384284837, + "grad_norm": 0.193614661693573, + "learning_rate": 4.3990246678323e-05, + "loss": 1.6938, + "step": 17992 + }, + { + "epoch": 5.522713321055862, + "grad_norm": 0.23343239724636078, + "learning_rate": 4.398531218326582e-05, + "loss": 1.744, + "step": 17993 + }, + { + "epoch": 5.523020257826888, + "grad_norm": 0.26271605491638184, + "learning_rate": 4.3980377747649305e-05, + "loss": 1.7458, + "step": 17994 + }, + { + "epoch": 5.523327194597913, + "grad_norm": 0.2048577219247818, + "learning_rate": 4.397544337152223e-05, + "loss": 1.763, + "step": 17995 + }, + { + "epoch": 5.523634131368938, + "grad_norm": 0.27748194336891174, + "learning_rate": 4.397050905493334e-05, + "loss": 1.7346, + "step": 17996 + }, + { + "epoch": 5.523941068139964, + "grad_norm": 0.3040253520011902, + "learning_rate": 4.3965574797931417e-05, + "loss": 1.7396, + "step": 17997 + }, + { + "epoch": 5.524248004910988, + "grad_norm": 0.3310317397117615, + "learning_rate": 4.396064060056523e-05, + "loss": 1.8094, + "step": 17998 + }, + { + "epoch": 5.524554941682013, + "grad_norm": 0.21845392882823944, + "learning_rate": 4.395570646288352e-05, + "loss": 1.7013, + "step": 17999 + }, + { + "epoch": 5.524861878453039, + "grad_norm": 0.319876492023468, + "learning_rate": 4.395077238493506e-05, + "loss": 1.7985, + "step": 18000 + }, + { + "epoch": 5.525168815224064, + "grad_norm": 0.28261950612068176, + "learning_rate": 4.394583836676863e-05, + "loss": 1.7979, + 
"step": 18001 + }, + { + "epoch": 5.525475751995089, + "grad_norm": 0.20874030888080597, + "learning_rate": 4.394090440843296e-05, + "loss": 1.7363, + "step": 18002 + }, + { + "epoch": 5.525782688766114, + "grad_norm": 0.28587406873703003, + "learning_rate": 4.393597050997684e-05, + "loss": 1.6787, + "step": 18003 + }, + { + "epoch": 5.526089625537139, + "grad_norm": 0.2719021439552307, + "learning_rate": 4.393103667144899e-05, + "loss": 1.7625, + "step": 18004 + }, + { + "epoch": 5.526396562308165, + "grad_norm": 0.22485414147377014, + "learning_rate": 4.392610289289821e-05, + "loss": 1.6847, + "step": 18005 + }, + { + "epoch": 5.52670349907919, + "grad_norm": 0.3500347435474396, + "learning_rate": 4.392116917437322e-05, + "loss": 1.7244, + "step": 18006 + }, + { + "epoch": 5.527010435850215, + "grad_norm": 0.26308783888816833, + "learning_rate": 4.3916235515922836e-05, + "loss": 1.7738, + "step": 18007 + }, + { + "epoch": 5.52731737262124, + "grad_norm": 0.27030646800994873, + "learning_rate": 4.391130191759574e-05, + "loss": 1.7149, + "step": 18008 + }, + { + "epoch": 5.527624309392265, + "grad_norm": 0.4137318730354309, + "learning_rate": 4.390636837944076e-05, + "loss": 1.7581, + "step": 18009 + }, + { + "epoch": 5.52793124616329, + "grad_norm": 0.2462068647146225, + "learning_rate": 4.390143490150659e-05, + "loss": 1.7767, + "step": 18010 + }, + { + "epoch": 5.528238182934316, + "grad_norm": 0.27424392104148865, + "learning_rate": 4.3896501483842036e-05, + "loss": 1.7701, + "step": 18011 + }, + { + "epoch": 5.528545119705341, + "grad_norm": 0.31268683075904846, + "learning_rate": 4.389156812649583e-05, + "loss": 1.7342, + "step": 18012 + }, + { + "epoch": 5.5288520564763655, + "grad_norm": 0.20428471267223358, + "learning_rate": 4.388663482951671e-05, + "loss": 1.7083, + "step": 18013 + }, + { + "epoch": 5.529158993247391, + "grad_norm": 0.322344034910202, + "learning_rate": 4.3881701592953475e-05, + "loss": 1.7423, + "step": 18014 + }, + { + "epoch": 
5.529465930018416, + "grad_norm": 0.2267894744873047, + "learning_rate": 4.387676841685483e-05, + "loss": 1.7309, + "step": 18015 + }, + { + "epoch": 5.5297728667894415, + "grad_norm": 0.23041954636573792, + "learning_rate": 4.387183530126955e-05, + "loss": 1.7352, + "step": 18016 + }, + { + "epoch": 5.530079803560467, + "grad_norm": 0.31139662861824036, + "learning_rate": 4.386690224624638e-05, + "loss": 1.7223, + "step": 18017 + }, + { + "epoch": 5.530386740331492, + "grad_norm": 0.20144063234329224, + "learning_rate": 4.38619692518341e-05, + "loss": 1.7607, + "step": 18018 + }, + { + "epoch": 5.530693677102517, + "grad_norm": 0.23812296986579895, + "learning_rate": 4.385703631808142e-05, + "loss": 1.7599, + "step": 18019 + }, + { + "epoch": 5.531000613873542, + "grad_norm": 0.2442231923341751, + "learning_rate": 4.385210344503712e-05, + "loss": 1.7094, + "step": 18020 + }, + { + "epoch": 5.531307550644567, + "grad_norm": 0.19497406482696533, + "learning_rate": 4.384717063274992e-05, + "loss": 1.7686, + "step": 18021 + }, + { + "epoch": 5.531614487415593, + "grad_norm": 0.29085835814476013, + "learning_rate": 4.38422378812686e-05, + "loss": 1.7454, + "step": 18022 + }, + { + "epoch": 5.531921424186618, + "grad_norm": 0.2701610028743744, + "learning_rate": 4.3837305190641876e-05, + "loss": 1.7376, + "step": 18023 + }, + { + "epoch": 5.532228360957642, + "grad_norm": 0.21232132613658905, + "learning_rate": 4.383237256091854e-05, + "loss": 1.7773, + "step": 18024 + }, + { + "epoch": 5.532535297728668, + "grad_norm": 0.24131610989570618, + "learning_rate": 4.382743999214729e-05, + "loss": 1.7899, + "step": 18025 + }, + { + "epoch": 5.532842234499693, + "grad_norm": 0.2752540409564972, + "learning_rate": 4.382250748437692e-05, + "loss": 1.7603, + "step": 18026 + }, + { + "epoch": 5.533149171270718, + "grad_norm": 0.2007865607738495, + "learning_rate": 4.381757503765613e-05, + "loss": 1.7553, + "step": 18027 + }, + { + "epoch": 5.533456108041744, + "grad_norm": 
0.23768723011016846, + "learning_rate": 4.38126426520337e-05, + "loss": 1.757, + "step": 18028 + }, + { + "epoch": 5.533763044812769, + "grad_norm": 0.22198502719402313, + "learning_rate": 4.3807710327558366e-05, + "loss": 1.7578, + "step": 18029 + }, + { + "epoch": 5.5340699815837935, + "grad_norm": 0.22432352602481842, + "learning_rate": 4.380277806427885e-05, + "loss": 1.75, + "step": 18030 + }, + { + "epoch": 5.534376918354819, + "grad_norm": 0.23029591143131256, + "learning_rate": 4.379784586224394e-05, + "loss": 1.7829, + "step": 18031 + }, + { + "epoch": 5.534683855125844, + "grad_norm": 0.23901896178722382, + "learning_rate": 4.379291372150232e-05, + "loss": 1.7461, + "step": 18032 + }, + { + "epoch": 5.5349907918968695, + "grad_norm": 0.20958681404590607, + "learning_rate": 4.378798164210278e-05, + "loss": 1.7224, + "step": 18033 + }, + { + "epoch": 5.535297728667894, + "grad_norm": 0.21619680523872375, + "learning_rate": 4.3783049624094036e-05, + "loss": 1.7605, + "step": 18034 + }, + { + "epoch": 5.535604665438919, + "grad_norm": 0.22988620400428772, + "learning_rate": 4.3778117667524867e-05, + "loss": 1.7668, + "step": 18035 + }, + { + "epoch": 5.535911602209945, + "grad_norm": 0.20107243955135345, + "learning_rate": 4.377318577244395e-05, + "loss": 1.7932, + "step": 18036 + }, + { + "epoch": 5.53621853898097, + "grad_norm": 0.25803956389427185, + "learning_rate": 4.376825393890009e-05, + "loss": 1.7409, + "step": 18037 + }, + { + "epoch": 5.536525475751995, + "grad_norm": 0.34292399883270264, + "learning_rate": 4.376332216694198e-05, + "loss": 1.8554, + "step": 18038 + }, + { + "epoch": 5.536832412523021, + "grad_norm": 0.23147790133953094, + "learning_rate": 4.375839045661839e-05, + "loss": 1.7918, + "step": 18039 + }, + { + "epoch": 5.537139349294045, + "grad_norm": 0.2387644350528717, + "learning_rate": 4.375345880797802e-05, + "loss": 1.7391, + "step": 18040 + }, + { + "epoch": 5.53744628606507, + "grad_norm": 0.21463727951049805, + 
"learning_rate": 4.374852722106966e-05, + "loss": 1.6812, + "step": 18041 + }, + { + "epoch": 5.537753222836096, + "grad_norm": 0.21994563937187195, + "learning_rate": 4.3743595695941994e-05, + "loss": 1.7727, + "step": 18042 + }, + { + "epoch": 5.538060159607121, + "grad_norm": 0.21102699637413025, + "learning_rate": 4.373866423264381e-05, + "loss": 1.7854, + "step": 18043 + }, + { + "epoch": 5.538367096378146, + "grad_norm": 0.21742786467075348, + "learning_rate": 4.3733732831223794e-05, + "loss": 1.7352, + "step": 18044 + }, + { + "epoch": 5.538674033149171, + "grad_norm": 0.20080791413784027, + "learning_rate": 4.372880149173071e-05, + "loss": 1.7264, + "step": 18045 + }, + { + "epoch": 5.538980969920196, + "grad_norm": 0.21027569472789764, + "learning_rate": 4.372387021421329e-05, + "loss": 1.766, + "step": 18046 + }, + { + "epoch": 5.5392879066912215, + "grad_norm": 0.22870683670043945, + "learning_rate": 4.371893899872025e-05, + "loss": 1.7746, + "step": 18047 + }, + { + "epoch": 5.539594843462247, + "grad_norm": 0.21248690783977509, + "learning_rate": 4.371400784530036e-05, + "loss": 1.7447, + "step": 18048 + }, + { + "epoch": 5.539901780233272, + "grad_norm": 0.23059454560279846, + "learning_rate": 4.37090767540023e-05, + "loss": 1.7827, + "step": 18049 + }, + { + "epoch": 5.5402087170042975, + "grad_norm": 0.2519036531448364, + "learning_rate": 4.370414572487485e-05, + "loss": 1.7984, + "step": 18050 + }, + { + "epoch": 5.540515653775322, + "grad_norm": 0.23621398210525513, + "learning_rate": 4.36992147579667e-05, + "loss": 1.7517, + "step": 18051 + }, + { + "epoch": 5.540822590546347, + "grad_norm": 0.24267609417438507, + "learning_rate": 4.3694283853326625e-05, + "loss": 1.8285, + "step": 18052 + }, + { + "epoch": 5.541129527317373, + "grad_norm": 0.23209960758686066, + "learning_rate": 4.368935301100332e-05, + "loss": 1.7765, + "step": 18053 + }, + { + "epoch": 5.541436464088398, + "grad_norm": 0.21277187764644623, + "learning_rate": 
4.368442223104555e-05, + "loss": 1.7182, + "step": 18054 + }, + { + "epoch": 5.541743400859423, + "grad_norm": 0.20821616053581238, + "learning_rate": 4.367949151350199e-05, + "loss": 1.6766, + "step": 18055 + }, + { + "epoch": 5.542050337630448, + "grad_norm": 0.23019999265670776, + "learning_rate": 4.3674560858421414e-05, + "loss": 1.7438, + "step": 18056 + }, + { + "epoch": 5.542357274401473, + "grad_norm": 0.21547134220600128, + "learning_rate": 4.366963026585253e-05, + "loss": 1.7003, + "step": 18057 + }, + { + "epoch": 5.542664211172498, + "grad_norm": 0.22454513609409332, + "learning_rate": 4.3664699735844084e-05, + "loss": 1.7072, + "step": 18058 + }, + { + "epoch": 5.542971147943524, + "grad_norm": 0.22228482365608215, + "learning_rate": 4.365976926844477e-05, + "loss": 1.7557, + "step": 18059 + }, + { + "epoch": 5.543278084714549, + "grad_norm": 0.25762560963630676, + "learning_rate": 4.365483886370335e-05, + "loss": 1.7751, + "step": 18060 + }, + { + "epoch": 5.543585021485574, + "grad_norm": 0.2086205631494522, + "learning_rate": 4.3649908521668516e-05, + "loss": 1.7399, + "step": 18061 + }, + { + "epoch": 5.543891958256599, + "grad_norm": 0.2759089767932892, + "learning_rate": 4.3644978242389014e-05, + "loss": 1.7503, + "step": 18062 + }, + { + "epoch": 5.544198895027624, + "grad_norm": 0.2235182225704193, + "learning_rate": 4.364004802591358e-05, + "loss": 1.7313, + "step": 18063 + }, + { + "epoch": 5.5445058317986495, + "grad_norm": 0.23074570298194885, + "learning_rate": 4.3635117872290885e-05, + "loss": 1.7649, + "step": 18064 + }, + { + "epoch": 5.544812768569675, + "grad_norm": 0.24929538369178772, + "learning_rate": 4.363018778156972e-05, + "loss": 1.732, + "step": 18065 + }, + { + "epoch": 5.5451197053407, + "grad_norm": 0.26422035694122314, + "learning_rate": 4.362525775379874e-05, + "loss": 1.7276, + "step": 18066 + }, + { + "epoch": 5.545426642111725, + "grad_norm": 0.3160388767719269, + "learning_rate": 4.362032778902672e-05, + "loss": 
1.7777, + "step": 18067 + }, + { + "epoch": 5.54573357888275, + "grad_norm": 0.20791196823120117, + "learning_rate": 4.3615397887302345e-05, + "loss": 1.7058, + "step": 18068 + }, + { + "epoch": 5.546040515653775, + "grad_norm": 0.31438156962394714, + "learning_rate": 4.361046804867437e-05, + "loss": 1.8102, + "step": 18069 + }, + { + "epoch": 5.546347452424801, + "grad_norm": 0.3008113205432892, + "learning_rate": 4.3605538273191475e-05, + "loss": 1.7297, + "step": 18070 + }, + { + "epoch": 5.546654389195826, + "grad_norm": 0.21147282421588898, + "learning_rate": 4.3600608560902425e-05, + "loss": 1.776, + "step": 18071 + }, + { + "epoch": 5.546961325966851, + "grad_norm": 0.25202393531799316, + "learning_rate": 4.3595678911855884e-05, + "loss": 1.7273, + "step": 18072 + }, + { + "epoch": 5.547268262737876, + "grad_norm": 0.18881210684776306, + "learning_rate": 4.3590749326100614e-05, + "loss": 1.7026, + "step": 18073 + }, + { + "epoch": 5.547575199508901, + "grad_norm": 0.25075671076774597, + "learning_rate": 4.3585819803685295e-05, + "loss": 1.7694, + "step": 18074 + }, + { + "epoch": 5.547882136279926, + "grad_norm": 0.2625887989997864, + "learning_rate": 4.358089034465869e-05, + "loss": 1.7338, + "step": 18075 + }, + { + "epoch": 5.548189073050952, + "grad_norm": 0.27278679609298706, + "learning_rate": 4.357596094906947e-05, + "loss": 1.7684, + "step": 18076 + }, + { + "epoch": 5.548496009821976, + "grad_norm": 0.283964604139328, + "learning_rate": 4.3571031616966396e-05, + "loss": 1.7539, + "step": 18077 + }, + { + "epoch": 5.5488029465930016, + "grad_norm": 0.2702009975910187, + "learning_rate": 4.3566102348398124e-05, + "loss": 1.8064, + "step": 18078 + }, + { + "epoch": 5.549109883364027, + "grad_norm": 0.449733167886734, + "learning_rate": 4.356117314341342e-05, + "loss": 1.7258, + "step": 18079 + }, + { + "epoch": 5.549416820135052, + "grad_norm": 0.3199995160102844, + "learning_rate": 4.3556244002060975e-05, + "loss": 1.7526, + "step": 18080 + }, + { + 
"epoch": 5.5497237569060776, + "grad_norm": 0.2803747355937958, + "learning_rate": 4.3551314924389494e-05, + "loss": 1.764, + "step": 18081 + }, + { + "epoch": 5.550030693677103, + "grad_norm": 0.28995978832244873, + "learning_rate": 4.3546385910447715e-05, + "loss": 1.7617, + "step": 18082 + }, + { + "epoch": 5.550337630448127, + "grad_norm": 0.24313311278820038, + "learning_rate": 4.354145696028431e-05, + "loss": 1.7515, + "step": 18083 + }, + { + "epoch": 5.550644567219153, + "grad_norm": 0.2668032944202423, + "learning_rate": 4.3536528073948025e-05, + "loss": 1.743, + "step": 18084 + }, + { + "epoch": 5.550951503990178, + "grad_norm": 0.22831310331821442, + "learning_rate": 4.353159925148755e-05, + "loss": 1.7971, + "step": 18085 + }, + { + "epoch": 5.551258440761203, + "grad_norm": 0.22047942876815796, + "learning_rate": 4.352667049295162e-05, + "loss": 1.6983, + "step": 18086 + }, + { + "epoch": 5.551565377532229, + "grad_norm": 0.22895069420337677, + "learning_rate": 4.35217417983889e-05, + "loss": 1.7866, + "step": 18087 + }, + { + "epoch": 5.551872314303253, + "grad_norm": 0.19946368038654327, + "learning_rate": 4.3516813167848156e-05, + "loss": 1.7129, + "step": 18088 + }, + { + "epoch": 5.5521792510742785, + "grad_norm": 0.21508903801441193, + "learning_rate": 4.351188460137804e-05, + "loss": 1.7154, + "step": 18089 + }, + { + "epoch": 5.552486187845304, + "grad_norm": 0.24813953042030334, + "learning_rate": 4.3506956099027294e-05, + "loss": 1.8326, + "step": 18090 + }, + { + "epoch": 5.552793124616329, + "grad_norm": 0.21306444704532623, + "learning_rate": 4.35020276608446e-05, + "loss": 1.7651, + "step": 18091 + }, + { + "epoch": 5.5531000613873545, + "grad_norm": 0.22041217982769012, + "learning_rate": 4.34970992868787e-05, + "loss": 1.6852, + "step": 18092 + }, + { + "epoch": 5.55340699815838, + "grad_norm": 0.21699896454811096, + "learning_rate": 4.349217097717826e-05, + "loss": 1.7524, + "step": 18093 + }, + { + "epoch": 5.553713934929404, + 
"grad_norm": 0.23086662590503693, + "learning_rate": 4.3487242731792015e-05, + "loss": 1.7441, + "step": 18094 + }, + { + "epoch": 5.55402087170043, + "grad_norm": 0.21898184716701508, + "learning_rate": 4.348231455076864e-05, + "loss": 1.7131, + "step": 18095 + }, + { + "epoch": 5.554327808471455, + "grad_norm": 0.17392560839653015, + "learning_rate": 4.3477386434156854e-05, + "loss": 1.7049, + "step": 18096 + }, + { + "epoch": 5.55463474524248, + "grad_norm": 0.1984172910451889, + "learning_rate": 4.3472458382005374e-05, + "loss": 1.7136, + "step": 18097 + }, + { + "epoch": 5.554941682013506, + "grad_norm": 0.19227837026119232, + "learning_rate": 4.3467530394362866e-05, + "loss": 1.7468, + "step": 18098 + }, + { + "epoch": 5.55524861878453, + "grad_norm": 0.2307087779045105, + "learning_rate": 4.346260247127807e-05, + "loss": 1.7004, + "step": 18099 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.21496252715587616, + "learning_rate": 4.345767461279965e-05, + "loss": 1.7508, + "step": 18100 + }, + { + "epoch": 5.555862492326581, + "grad_norm": 0.21119998395442963, + "learning_rate": 4.3452746818976333e-05, + "loss": 1.7965, + "step": 18101 + }, + { + "epoch": 5.556169429097606, + "grad_norm": 0.2416355311870575, + "learning_rate": 4.34478190898568e-05, + "loss": 1.7006, + "step": 18102 + }, + { + "epoch": 5.556476365868631, + "grad_norm": 0.2009642869234085, + "learning_rate": 4.344289142548978e-05, + "loss": 1.7567, + "step": 18103 + }, + { + "epoch": 5.556783302639657, + "grad_norm": 0.2387058436870575, + "learning_rate": 4.343796382592393e-05, + "loss": 1.7898, + "step": 18104 + }, + { + "epoch": 5.557090239410681, + "grad_norm": 0.19835951924324036, + "learning_rate": 4.343303629120798e-05, + "loss": 1.7888, + "step": 18105 + }, + { + "epoch": 5.5573971761817065, + "grad_norm": 0.23324637115001678, + "learning_rate": 4.3428108821390604e-05, + "loss": 1.7923, + "step": 18106 + }, + { + "epoch": 5.557704112952732, + "grad_norm": 0.22334477305412292, + 
"learning_rate": 4.342318141652052e-05, + "loss": 1.7234, + "step": 18107 + }, + { + "epoch": 5.558011049723757, + "grad_norm": 0.20220427215099335, + "learning_rate": 4.341825407664639e-05, + "loss": 1.7639, + "step": 18108 + }, + { + "epoch": 5.558317986494782, + "grad_norm": 0.23658546805381775, + "learning_rate": 4.3413326801816964e-05, + "loss": 1.7505, + "step": 18109 + }, + { + "epoch": 5.558624923265807, + "grad_norm": 0.21157726645469666, + "learning_rate": 4.3408399592080875e-05, + "loss": 1.7655, + "step": 18110 + }, + { + "epoch": 5.558931860036832, + "grad_norm": 0.2139829397201538, + "learning_rate": 4.340347244748687e-05, + "loss": 1.767, + "step": 18111 + }, + { + "epoch": 5.559238796807858, + "grad_norm": 0.17811299860477448, + "learning_rate": 4.339854536808359e-05, + "loss": 1.6629, + "step": 18112 + }, + { + "epoch": 5.559545733578883, + "grad_norm": 0.2005898356437683, + "learning_rate": 4.339361835391977e-05, + "loss": 1.7269, + "step": 18113 + }, + { + "epoch": 5.559852670349908, + "grad_norm": 0.21514086425304413, + "learning_rate": 4.338869140504409e-05, + "loss": 1.7806, + "step": 18114 + }, + { + "epoch": 5.560159607120933, + "grad_norm": 0.23163840174674988, + "learning_rate": 4.338376452150522e-05, + "loss": 1.7259, + "step": 18115 + }, + { + "epoch": 5.560466543891958, + "grad_norm": 0.23657509684562683, + "learning_rate": 4.337883770335189e-05, + "loss": 1.7778, + "step": 18116 + }, + { + "epoch": 5.560773480662983, + "grad_norm": 0.20135201513767242, + "learning_rate": 4.337391095063274e-05, + "loss": 1.7359, + "step": 18117 + }, + { + "epoch": 5.561080417434009, + "grad_norm": 0.22871774435043335, + "learning_rate": 4.33689842633965e-05, + "loss": 1.7658, + "step": 18118 + }, + { + "epoch": 5.561387354205034, + "grad_norm": 0.21755221486091614, + "learning_rate": 4.3364057641691835e-05, + "loss": 1.7408, + "step": 18119 + }, + { + "epoch": 5.5616942909760585, + "grad_norm": 0.215267151594162, + "learning_rate": 
4.335913108556746e-05, + "loss": 1.7175, + "step": 18120 + }, + { + "epoch": 5.562001227747084, + "grad_norm": 0.25724974274635315, + "learning_rate": 4.335420459507202e-05, + "loss": 1.7197, + "step": 18121 + }, + { + "epoch": 5.562308164518109, + "grad_norm": 0.25375521183013916, + "learning_rate": 4.3349278170254254e-05, + "loss": 1.7251, + "step": 18122 + }, + { + "epoch": 5.5626151012891345, + "grad_norm": 0.24768905341625214, + "learning_rate": 4.334435181116279e-05, + "loss": 1.7405, + "step": 18123 + }, + { + "epoch": 5.56292203806016, + "grad_norm": 0.21281081438064575, + "learning_rate": 4.333942551784636e-05, + "loss": 1.7131, + "step": 18124 + }, + { + "epoch": 5.563228974831185, + "grad_norm": 0.2129398137331009, + "learning_rate": 4.333449929035361e-05, + "loss": 1.7049, + "step": 18125 + }, + { + "epoch": 5.56353591160221, + "grad_norm": 0.24582397937774658, + "learning_rate": 4.332957312873328e-05, + "loss": 1.7205, + "step": 18126 + }, + { + "epoch": 5.563842848373235, + "grad_norm": 0.21282973885536194, + "learning_rate": 4.332464703303399e-05, + "loss": 1.7655, + "step": 18127 + }, + { + "epoch": 5.56414978514426, + "grad_norm": 0.2302251160144806, + "learning_rate": 4.331972100330447e-05, + "loss": 1.7597, + "step": 18128 + }, + { + "epoch": 5.564456721915286, + "grad_norm": 0.23453226685523987, + "learning_rate": 4.331479503959336e-05, + "loss": 1.7028, + "step": 18129 + }, + { + "epoch": 5.564763658686311, + "grad_norm": 0.19723562896251678, + "learning_rate": 4.330986914194938e-05, + "loss": 1.7101, + "step": 18130 + }, + { + "epoch": 5.565070595457335, + "grad_norm": 0.22021643817424774, + "learning_rate": 4.33049433104212e-05, + "loss": 1.7123, + "step": 18131 + }, + { + "epoch": 5.565377532228361, + "grad_norm": 0.25540977716445923, + "learning_rate": 4.3300017545057484e-05, + "loss": 1.7392, + "step": 18132 + }, + { + "epoch": 5.565684468999386, + "grad_norm": 0.23482176661491394, + "learning_rate": 4.329509184590693e-05, + "loss": 
1.7175, + "step": 18133 + }, + { + "epoch": 5.565991405770411, + "grad_norm": 0.19537311792373657, + "learning_rate": 4.329016621301819e-05, + "loss": 1.7583, + "step": 18134 + }, + { + "epoch": 5.566298342541437, + "grad_norm": 0.21828842163085938, + "learning_rate": 4.328524064643997e-05, + "loss": 1.7411, + "step": 18135 + }, + { + "epoch": 5.566605279312462, + "grad_norm": 0.24589122831821442, + "learning_rate": 4.328031514622093e-05, + "loss": 1.7769, + "step": 18136 + }, + { + "epoch": 5.5669122160834865, + "grad_norm": 0.20964545011520386, + "learning_rate": 4.327538971240978e-05, + "loss": 1.7743, + "step": 18137 + }, + { + "epoch": 5.567219152854512, + "grad_norm": 0.2210713028907776, + "learning_rate": 4.327046434505514e-05, + "loss": 1.7671, + "step": 18138 + }, + { + "epoch": 5.567526089625537, + "grad_norm": 0.21382687985897064, + "learning_rate": 4.3265539044205736e-05, + "loss": 1.793, + "step": 18139 + }, + { + "epoch": 5.5678330263965625, + "grad_norm": 0.23289678990840912, + "learning_rate": 4.326061380991021e-05, + "loss": 1.738, + "step": 18140 + }, + { + "epoch": 5.568139963167588, + "grad_norm": 0.23789258301258087, + "learning_rate": 4.325568864221725e-05, + "loss": 1.8315, + "step": 18141 + }, + { + "epoch": 5.568446899938612, + "grad_norm": 0.1925022453069687, + "learning_rate": 4.325076354117554e-05, + "loss": 1.6956, + "step": 18142 + }, + { + "epoch": 5.568753836709638, + "grad_norm": 0.22522561252117157, + "learning_rate": 4.324583850683373e-05, + "loss": 1.7957, + "step": 18143 + }, + { + "epoch": 5.569060773480663, + "grad_norm": 0.2787671387195587, + "learning_rate": 4.324091353924049e-05, + "loss": 1.7325, + "step": 18144 + }, + { + "epoch": 5.569367710251688, + "grad_norm": 0.2723194658756256, + "learning_rate": 4.3235988638444536e-05, + "loss": 1.7668, + "step": 18145 + }, + { + "epoch": 5.569674647022714, + "grad_norm": 0.2241704910993576, + "learning_rate": 4.3231063804494484e-05, + "loss": 1.7977, + "step": 18146 + }, + { + 
"epoch": 5.569981583793739, + "grad_norm": 0.2627747356891632, + "learning_rate": 4.322613903743903e-05, + "loss": 1.6775, + "step": 18147 + }, + { + "epoch": 5.570288520564763, + "grad_norm": 0.2644255757331848, + "learning_rate": 4.322121433732686e-05, + "loss": 1.7404, + "step": 18148 + }, + { + "epoch": 5.570595457335789, + "grad_norm": 0.2386743575334549, + "learning_rate": 4.321628970420659e-05, + "loss": 1.7386, + "step": 18149 + }, + { + "epoch": 5.570902394106814, + "grad_norm": 0.22444583475589752, + "learning_rate": 4.3211365138126945e-05, + "loss": 1.7482, + "step": 18150 + }, + { + "epoch": 5.571209330877839, + "grad_norm": 0.21770013868808746, + "learning_rate": 4.3206440639136554e-05, + "loss": 1.7322, + "step": 18151 + }, + { + "epoch": 5.571516267648864, + "grad_norm": 0.22356587648391724, + "learning_rate": 4.320151620728411e-05, + "loss": 1.751, + "step": 18152 + }, + { + "epoch": 5.571823204419889, + "grad_norm": 0.2040669322013855, + "learning_rate": 4.319659184261826e-05, + "loss": 1.712, + "step": 18153 + }, + { + "epoch": 5.5721301411909145, + "grad_norm": 0.20951713621616364, + "learning_rate": 4.319166754518768e-05, + "loss": 1.7308, + "step": 18154 + }, + { + "epoch": 5.57243707796194, + "grad_norm": 0.186195969581604, + "learning_rate": 4.3186743315041025e-05, + "loss": 1.7133, + "step": 18155 + }, + { + "epoch": 5.572744014732965, + "grad_norm": 0.2098865509033203, + "learning_rate": 4.318181915222698e-05, + "loss": 1.7645, + "step": 18156 + }, + { + "epoch": 5.5730509515039905, + "grad_norm": 0.20552097260951996, + "learning_rate": 4.317689505679418e-05, + "loss": 1.7156, + "step": 18157 + }, + { + "epoch": 5.573357888275015, + "grad_norm": 0.22506964206695557, + "learning_rate": 4.3171971028791314e-05, + "loss": 1.7192, + "step": 18158 + }, + { + "epoch": 5.57366482504604, + "grad_norm": 0.2296760082244873, + "learning_rate": 4.316704706826702e-05, + "loss": 1.7534, + "step": 18159 + }, + { + "epoch": 5.573971761817066, + "grad_norm": 
0.20140253007411957, + "learning_rate": 4.316212317526998e-05, + "loss": 1.6906, + "step": 18160 + }, + { + "epoch": 5.574278698588091, + "grad_norm": 0.23313316702842712, + "learning_rate": 4.315719934984884e-05, + "loss": 1.6929, + "step": 18161 + }, + { + "epoch": 5.574585635359116, + "grad_norm": 0.23398169875144958, + "learning_rate": 4.315227559205228e-05, + "loss": 1.7254, + "step": 18162 + }, + { + "epoch": 5.574892572130141, + "grad_norm": 0.20836731791496277, + "learning_rate": 4.314735190192894e-05, + "loss": 1.7335, + "step": 18163 + }, + { + "epoch": 5.575199508901166, + "grad_norm": 0.19899079203605652, + "learning_rate": 4.3142428279527485e-05, + "loss": 1.69, + "step": 18164 + }, + { + "epoch": 5.5755064456721914, + "grad_norm": 0.24623680114746094, + "learning_rate": 4.313750472489657e-05, + "loss": 1.7413, + "step": 18165 + }, + { + "epoch": 5.575813382443217, + "grad_norm": 0.2432616949081421, + "learning_rate": 4.313258123808484e-05, + "loss": 1.7426, + "step": 18166 + }, + { + "epoch": 5.576120319214242, + "grad_norm": 0.22773970663547516, + "learning_rate": 4.3127657819141006e-05, + "loss": 1.7986, + "step": 18167 + }, + { + "epoch": 5.5764272559852675, + "grad_norm": 0.19891540706157684, + "learning_rate": 4.312273446811366e-05, + "loss": 1.7007, + "step": 18168 + }, + { + "epoch": 5.576734192756292, + "grad_norm": 0.23402714729309082, + "learning_rate": 4.311781118505149e-05, + "loss": 1.7774, + "step": 18169 + }, + { + "epoch": 5.577041129527317, + "grad_norm": 0.2248220294713974, + "learning_rate": 4.3112887970003134e-05, + "loss": 1.7079, + "step": 18170 + }, + { + "epoch": 5.577348066298343, + "grad_norm": 0.20901209115982056, + "learning_rate": 4.310796482301726e-05, + "loss": 1.7336, + "step": 18171 + }, + { + "epoch": 5.577655003069368, + "grad_norm": 0.21872754395008087, + "learning_rate": 4.3103041744142516e-05, + "loss": 1.7742, + "step": 18172 + }, + { + "epoch": 5.577961939840393, + "grad_norm": 0.2567403018474579, + 
"learning_rate": 4.309811873342757e-05, + "loss": 1.7894, + "step": 18173 + }, + { + "epoch": 5.578268876611418, + "grad_norm": 0.219998300075531, + "learning_rate": 4.3093195790921035e-05, + "loss": 1.7283, + "step": 18174 + }, + { + "epoch": 5.578575813382443, + "grad_norm": 0.1944747269153595, + "learning_rate": 4.3088272916671614e-05, + "loss": 1.7129, + "step": 18175 + }, + { + "epoch": 5.578882750153468, + "grad_norm": 0.19492141902446747, + "learning_rate": 4.308335011072791e-05, + "loss": 1.7286, + "step": 18176 + }, + { + "epoch": 5.579189686924494, + "grad_norm": 0.22383002936840057, + "learning_rate": 4.3078427373138604e-05, + "loss": 1.733, + "step": 18177 + }, + { + "epoch": 5.579496623695519, + "grad_norm": 0.20238643884658813, + "learning_rate": 4.307350470395232e-05, + "loss": 1.7522, + "step": 18178 + }, + { + "epoch": 5.579803560466544, + "grad_norm": 0.21456125378608704, + "learning_rate": 4.3068582103217755e-05, + "loss": 1.7298, + "step": 18179 + }, + { + "epoch": 5.580110497237569, + "grad_norm": 0.28084230422973633, + "learning_rate": 4.3063659570983514e-05, + "loss": 1.7805, + "step": 18180 + }, + { + "epoch": 5.580417434008594, + "grad_norm": 0.21319706737995148, + "learning_rate": 4.305873710729824e-05, + "loss": 1.6801, + "step": 18181 + }, + { + "epoch": 5.5807243707796195, + "grad_norm": 0.2279660850763321, + "learning_rate": 4.30538147122106e-05, + "loss": 1.752, + "step": 18182 + }, + { + "epoch": 5.581031307550645, + "grad_norm": 0.1958594173192978, + "learning_rate": 4.304889238576922e-05, + "loss": 1.7487, + "step": 18183 + }, + { + "epoch": 5.581338244321669, + "grad_norm": 0.19484321773052216, + "learning_rate": 4.304397012802279e-05, + "loss": 1.7222, + "step": 18184 + }, + { + "epoch": 5.581645181092695, + "grad_norm": 0.19863305985927582, + "learning_rate": 4.3039047939019906e-05, + "loss": 1.7296, + "step": 18185 + }, + { + "epoch": 5.58195211786372, + "grad_norm": 0.18674087524414062, + "learning_rate": 
4.303412581880924e-05, + "loss": 1.6753, + "step": 18186 + }, + { + "epoch": 5.582259054634745, + "grad_norm": 0.22263208031654358, + "learning_rate": 4.302920376743941e-05, + "loss": 1.7431, + "step": 18187 + }, + { + "epoch": 5.582565991405771, + "grad_norm": 0.1926872879266739, + "learning_rate": 4.302428178495909e-05, + "loss": 1.7662, + "step": 18188 + }, + { + "epoch": 5.582872928176796, + "grad_norm": 0.23190459609031677, + "learning_rate": 4.301935987141689e-05, + "loss": 1.7271, + "step": 18189 + }, + { + "epoch": 5.58317986494782, + "grad_norm": 0.30057230591773987, + "learning_rate": 4.301443802686148e-05, + "loss": 1.7957, + "step": 18190 + }, + { + "epoch": 5.583486801718846, + "grad_norm": 0.2520695626735687, + "learning_rate": 4.3009516251341475e-05, + "loss": 1.7501, + "step": 18191 + }, + { + "epoch": 5.583793738489871, + "grad_norm": 0.19143317639827728, + "learning_rate": 4.300459454490555e-05, + "loss": 1.7091, + "step": 18192 + }, + { + "epoch": 5.584100675260896, + "grad_norm": 0.2064475119113922, + "learning_rate": 4.299967290760229e-05, + "loss": 1.6849, + "step": 18193 + }, + { + "epoch": 5.584407612031922, + "grad_norm": 0.3093598484992981, + "learning_rate": 4.299475133948039e-05, + "loss": 1.8479, + "step": 18194 + }, + { + "epoch": 5.584714548802946, + "grad_norm": 0.2875300943851471, + "learning_rate": 4.298982984058845e-05, + "loss": 1.7296, + "step": 18195 + }, + { + "epoch": 5.5850214855739715, + "grad_norm": 0.33194443583488464, + "learning_rate": 4.298490841097514e-05, + "loss": 1.7668, + "step": 18196 + }, + { + "epoch": 5.585328422344997, + "grad_norm": 0.20940829813480377, + "learning_rate": 4.297998705068908e-05, + "loss": 1.7316, + "step": 18197 + }, + { + "epoch": 5.585635359116022, + "grad_norm": 0.32381999492645264, + "learning_rate": 4.297506575977887e-05, + "loss": 1.7212, + "step": 18198 + }, + { + "epoch": 5.5859422958870475, + "grad_norm": 0.31585511565208435, + "learning_rate": 4.29701445382932e-05, + "loss": 1.7695, 
+ "step": 18199 + }, + { + "epoch": 5.586249232658073, + "grad_norm": 0.2272588014602661, + "learning_rate": 4.2965223386280664e-05, + "loss": 1.7105, + "step": 18200 + }, + { + "epoch": 5.586556169429097, + "grad_norm": 0.2949761152267456, + "learning_rate": 4.296030230378993e-05, + "loss": 1.803, + "step": 18201 + }, + { + "epoch": 5.586863106200123, + "grad_norm": 0.20512579381465912, + "learning_rate": 4.29553812908696e-05, + "loss": 1.759, + "step": 18202 + }, + { + "epoch": 5.587170042971148, + "grad_norm": 0.21143598854541779, + "learning_rate": 4.295046034756835e-05, + "loss": 1.7286, + "step": 18203 + }, + { + "epoch": 5.587476979742173, + "grad_norm": 0.22148001194000244, + "learning_rate": 4.294553947393476e-05, + "loss": 1.7258, + "step": 18204 + }, + { + "epoch": 5.587783916513199, + "grad_norm": 0.17245957255363464, + "learning_rate": 4.2940618670017484e-05, + "loss": 1.6863, + "step": 18205 + }, + { + "epoch": 5.588090853284223, + "grad_norm": 0.20260390639305115, + "learning_rate": 4.293569793586515e-05, + "loss": 1.6866, + "step": 18206 + }, + { + "epoch": 5.588397790055248, + "grad_norm": 0.20671936869621277, + "learning_rate": 4.293077727152641e-05, + "loss": 1.7849, + "step": 18207 + }, + { + "epoch": 5.588704726826274, + "grad_norm": 0.21415838599205017, + "learning_rate": 4.292585667704984e-05, + "loss": 1.7279, + "step": 18208 + }, + { + "epoch": 5.589011663597299, + "grad_norm": 0.18668091297149658, + "learning_rate": 4.2920936152484134e-05, + "loss": 1.7087, + "step": 18209 + }, + { + "epoch": 5.589318600368324, + "grad_norm": 0.2253870815038681, + "learning_rate": 4.291601569787786e-05, + "loss": 1.769, + "step": 18210 + }, + { + "epoch": 5.58962553713935, + "grad_norm": 0.22426939010620117, + "learning_rate": 4.291109531327968e-05, + "loss": 1.7382, + "step": 18211 + }, + { + "epoch": 5.589932473910374, + "grad_norm": 0.21552452445030212, + "learning_rate": 4.29061749987382e-05, + "loss": 1.7316, + "step": 18212 + }, + { + "epoch": 
5.5902394106813995, + "grad_norm": 0.2337147295475006, + "learning_rate": 4.290125475430209e-05, + "loss": 1.7836, + "step": 18213 + }, + { + "epoch": 5.590546347452425, + "grad_norm": 0.21780124306678772, + "learning_rate": 4.289633458001992e-05, + "loss": 1.6923, + "step": 18214 + }, + { + "epoch": 5.59085328422345, + "grad_norm": 0.20009608566761017, + "learning_rate": 4.289141447594033e-05, + "loss": 1.719, + "step": 18215 + }, + { + "epoch": 5.5911602209944755, + "grad_norm": 0.18165744841098785, + "learning_rate": 4.288649444211196e-05, + "loss": 1.6825, + "step": 18216 + }, + { + "epoch": 5.5914671577655, + "grad_norm": 0.2244826704263687, + "learning_rate": 4.288157447858341e-05, + "loss": 1.7323, + "step": 18217 + }, + { + "epoch": 5.591774094536525, + "grad_norm": 0.16875946521759033, + "learning_rate": 4.2876654585403325e-05, + "loss": 1.6787, + "step": 18218 + }, + { + "epoch": 5.592081031307551, + "grad_norm": 0.19244243204593658, + "learning_rate": 4.28717347626203e-05, + "loss": 1.7225, + "step": 18219 + }, + { + "epoch": 5.592387968078576, + "grad_norm": 0.21081633865833282, + "learning_rate": 4.286681501028299e-05, + "loss": 1.7063, + "step": 18220 + }, + { + "epoch": 5.592694904849601, + "grad_norm": 0.20926406979560852, + "learning_rate": 4.286189532843997e-05, + "loss": 1.7307, + "step": 18221 + }, + { + "epoch": 5.593001841620627, + "grad_norm": 0.20258775353431702, + "learning_rate": 4.28569757171399e-05, + "loss": 1.6917, + "step": 18222 + }, + { + "epoch": 5.593308778391651, + "grad_norm": 0.21956230700016022, + "learning_rate": 4.285205617643137e-05, + "loss": 1.7127, + "step": 18223 + }, + { + "epoch": 5.593615715162676, + "grad_norm": 0.2071436047554016, + "learning_rate": 4.284713670636303e-05, + "loss": 1.7487, + "step": 18224 + }, + { + "epoch": 5.593922651933702, + "grad_norm": 0.2002478390932083, + "learning_rate": 4.2842217306983464e-05, + "loss": 1.6544, + "step": 18225 + }, + { + "epoch": 5.594229588704727, + "grad_norm": 
0.20691382884979248, + "learning_rate": 4.283729797834132e-05, + "loss": 1.768, + "step": 18226 + }, + { + "epoch": 5.5945365254757515, + "grad_norm": 0.18423563241958618, + "learning_rate": 4.283237872048517e-05, + "loss": 1.7563, + "step": 18227 + }, + { + "epoch": 5.594843462246777, + "grad_norm": 0.23055453598499298, + "learning_rate": 4.2827459533463665e-05, + "loss": 1.8083, + "step": 18228 + }, + { + "epoch": 5.595150399017802, + "grad_norm": 0.20735648274421692, + "learning_rate": 4.2822540417325396e-05, + "loss": 1.7761, + "step": 18229 + }, + { + "epoch": 5.5954573357888275, + "grad_norm": 0.2919909656047821, + "learning_rate": 4.281762137211902e-05, + "loss": 1.7836, + "step": 18230 + }, + { + "epoch": 5.595764272559853, + "grad_norm": 0.22636881470680237, + "learning_rate": 4.2812702397893113e-05, + "loss": 1.7389, + "step": 18231 + }, + { + "epoch": 5.596071209330878, + "grad_norm": 0.23788630962371826, + "learning_rate": 4.280778349469627e-05, + "loss": 1.7536, + "step": 18232 + }, + { + "epoch": 5.596378146101903, + "grad_norm": 0.22089426219463348, + "learning_rate": 4.280286466257715e-05, + "loss": 1.7584, + "step": 18233 + }, + { + "epoch": 5.596685082872928, + "grad_norm": 0.20486171543598175, + "learning_rate": 4.279794590158431e-05, + "loss": 1.7182, + "step": 18234 + }, + { + "epoch": 5.596992019643953, + "grad_norm": 0.2343701422214508, + "learning_rate": 4.2793027211766425e-05, + "loss": 1.751, + "step": 18235 + }, + { + "epoch": 5.597298956414979, + "grad_norm": 0.21734023094177246, + "learning_rate": 4.2788108593172036e-05, + "loss": 1.7084, + "step": 18236 + }, + { + "epoch": 5.597605893186004, + "grad_norm": 0.20593903958797455, + "learning_rate": 4.278319004584982e-05, + "loss": 1.6805, + "step": 18237 + }, + { + "epoch": 5.597912829957028, + "grad_norm": 0.20877878367900848, + "learning_rate": 4.2778271569848324e-05, + "loss": 1.7011, + "step": 18238 + }, + { + "epoch": 5.598219766728054, + "grad_norm": 0.23915995657444, + 
"learning_rate": 4.277335316521619e-05, + "loss": 1.732, + "step": 18239 + }, + { + "epoch": 5.598526703499079, + "grad_norm": 0.24310529232025146, + "learning_rate": 4.2768434832002004e-05, + "loss": 1.7859, + "step": 18240 + }, + { + "epoch": 5.598833640270104, + "grad_norm": 0.23189407587051392, + "learning_rate": 4.27635165702544e-05, + "loss": 1.7237, + "step": 18241 + }, + { + "epoch": 5.59914057704113, + "grad_norm": 0.2708875834941864, + "learning_rate": 4.275859838002195e-05, + "loss": 1.7046, + "step": 18242 + }, + { + "epoch": 5.599447513812155, + "grad_norm": 0.23692840337753296, + "learning_rate": 4.27536802613533e-05, + "loss": 1.8556, + "step": 18243 + }, + { + "epoch": 5.5997544505831796, + "grad_norm": 0.28285983204841614, + "learning_rate": 4.274876221429701e-05, + "loss": 1.6734, + "step": 18244 + }, + { + "epoch": 5.600061387354205, + "grad_norm": 0.20602203905582428, + "learning_rate": 4.27438442389017e-05, + "loss": 1.7113, + "step": 18245 + }, + { + "epoch": 5.60036832412523, + "grad_norm": 0.19719314575195312, + "learning_rate": 4.273892633521598e-05, + "loss": 1.7229, + "step": 18246 + }, + { + "epoch": 5.600675260896256, + "grad_norm": 0.2396705001592636, + "learning_rate": 4.273400850328846e-05, + "loss": 1.6986, + "step": 18247 + }, + { + "epoch": 5.600982197667281, + "grad_norm": 0.1974172443151474, + "learning_rate": 4.2729090743167724e-05, + "loss": 1.7445, + "step": 18248 + }, + { + "epoch": 5.601289134438305, + "grad_norm": 0.2193709760904312, + "learning_rate": 4.272417305490235e-05, + "loss": 1.7657, + "step": 18249 + }, + { + "epoch": 5.601596071209331, + "grad_norm": 0.24138681590557098, + "learning_rate": 4.271925543854098e-05, + "loss": 1.7388, + "step": 18250 + }, + { + "epoch": 5.601903007980356, + "grad_norm": 0.19056223332881927, + "learning_rate": 4.271433789413219e-05, + "loss": 1.6897, + "step": 18251 + }, + { + "epoch": 5.602209944751381, + "grad_norm": 0.20533505082130432, + "learning_rate": 4.270942042172459e-05, + 
"loss": 1.7222, + "step": 18252 + }, + { + "epoch": 5.602516881522407, + "grad_norm": 0.20570224523544312, + "learning_rate": 4.270450302136675e-05, + "loss": 1.8089, + "step": 18253 + }, + { + "epoch": 5.602823818293432, + "grad_norm": 0.2822209298610687, + "learning_rate": 4.269958569310732e-05, + "loss": 1.7523, + "step": 18254 + }, + { + "epoch": 5.6031307550644565, + "grad_norm": 0.2994859218597412, + "learning_rate": 4.269466843699484e-05, + "loss": 1.7538, + "step": 18255 + }, + { + "epoch": 5.603437691835482, + "grad_norm": 0.24851159751415253, + "learning_rate": 4.2689751253077925e-05, + "loss": 1.8162, + "step": 18256 + }, + { + "epoch": 5.603744628606507, + "grad_norm": 0.20387138426303864, + "learning_rate": 4.268483414140517e-05, + "loss": 1.6803, + "step": 18257 + }, + { + "epoch": 5.6040515653775325, + "grad_norm": 0.21620385348796844, + "learning_rate": 4.2679917102025204e-05, + "loss": 1.7236, + "step": 18258 + }, + { + "epoch": 5.604358502148557, + "grad_norm": 0.1925734579563141, + "learning_rate": 4.267500013498655e-05, + "loss": 1.7295, + "step": 18259 + }, + { + "epoch": 5.604665438919582, + "grad_norm": 0.22216086089611053, + "learning_rate": 4.267008324033787e-05, + "loss": 1.6844, + "step": 18260 + }, + { + "epoch": 5.604972375690608, + "grad_norm": 0.20293502509593964, + "learning_rate": 4.26651664181277e-05, + "loss": 1.7065, + "step": 18261 + }, + { + "epoch": 5.605279312461633, + "grad_norm": 0.21269507706165314, + "learning_rate": 4.266024966840466e-05, + "loss": 1.7573, + "step": 18262 + }, + { + "epoch": 5.605586249232658, + "grad_norm": 0.23574227094650269, + "learning_rate": 4.2655332991217334e-05, + "loss": 1.7625, + "step": 18263 + }, + { + "epoch": 5.605893186003684, + "grad_norm": 0.1875103861093521, + "learning_rate": 4.265041638661433e-05, + "loss": 1.7266, + "step": 18264 + }, + { + "epoch": 5.606200122774708, + "grad_norm": 0.20348483324050903, + "learning_rate": 4.264549985464421e-05, + "loss": 1.731, + "step": 18265 + }, 
+ { + "epoch": 5.606507059545733, + "grad_norm": 0.2345927655696869, + "learning_rate": 4.264058339535556e-05, + "loss": 1.7809, + "step": 18266 + }, + { + "epoch": 5.606813996316759, + "grad_norm": 0.21142496168613434, + "learning_rate": 4.2635667008796985e-05, + "loss": 1.7362, + "step": 18267 + }, + { + "epoch": 5.607120933087784, + "grad_norm": 0.19670210778713226, + "learning_rate": 4.263075069501705e-05, + "loss": 1.7029, + "step": 18268 + }, + { + "epoch": 5.607427869858809, + "grad_norm": 0.20985090732574463, + "learning_rate": 4.262583445406439e-05, + "loss": 1.7478, + "step": 18269 + }, + { + "epoch": 5.607734806629834, + "grad_norm": 0.20972272753715515, + "learning_rate": 4.262091828598752e-05, + "loss": 1.7561, + "step": 18270 + }, + { + "epoch": 5.608041743400859, + "grad_norm": 0.20006676018238068, + "learning_rate": 4.261600219083509e-05, + "loss": 1.7584, + "step": 18271 + }, + { + "epoch": 5.6083486801718845, + "grad_norm": 0.21590086817741394, + "learning_rate": 4.2611086168655635e-05, + "loss": 1.7405, + "step": 18272 + }, + { + "epoch": 5.60865561694291, + "grad_norm": 0.19330906867980957, + "learning_rate": 4.260617021949776e-05, + "loss": 1.6797, + "step": 18273 + }, + { + "epoch": 5.608962553713935, + "grad_norm": 0.1955050528049469, + "learning_rate": 4.260125434341004e-05, + "loss": 1.7174, + "step": 18274 + }, + { + "epoch": 5.6092694904849605, + "grad_norm": 0.2117784321308136, + "learning_rate": 4.2596338540441086e-05, + "loss": 1.743, + "step": 18275 + }, + { + "epoch": 5.609576427255985, + "grad_norm": 0.21788950264453888, + "learning_rate": 4.2591422810639425e-05, + "loss": 1.7603, + "step": 18276 + }, + { + "epoch": 5.60988336402701, + "grad_norm": 0.2092670351266861, + "learning_rate": 4.258650715405369e-05, + "loss": 1.7379, + "step": 18277 + }, + { + "epoch": 5.610190300798036, + "grad_norm": 0.1941552758216858, + "learning_rate": 4.2581591570732414e-05, + "loss": 1.7547, + "step": 18278 + }, + { + "epoch": 5.610497237569061, + 
"grad_norm": 0.21306751668453217, + "learning_rate": 4.2576676060724215e-05, + "loss": 1.7284, + "step": 18279 + }, + { + "epoch": 5.610804174340086, + "grad_norm": 0.18618693947792053, + "learning_rate": 4.2571760624077635e-05, + "loss": 1.7268, + "step": 18280 + }, + { + "epoch": 5.611111111111111, + "grad_norm": 0.21530354022979736, + "learning_rate": 4.256684526084129e-05, + "loss": 1.7036, + "step": 18281 + }, + { + "epoch": 5.611418047882136, + "grad_norm": 0.23363792896270752, + "learning_rate": 4.256192997106375e-05, + "loss": 1.7797, + "step": 18282 + }, + { + "epoch": 5.611724984653161, + "grad_norm": 0.1786416620016098, + "learning_rate": 4.2557014754793544e-05, + "loss": 1.7008, + "step": 18283 + }, + { + "epoch": 5.612031921424187, + "grad_norm": 0.2042730301618576, + "learning_rate": 4.25520996120793e-05, + "loss": 1.7667, + "step": 18284 + }, + { + "epoch": 5.612338858195212, + "grad_norm": 0.2275264412164688, + "learning_rate": 4.2547184542969554e-05, + "loss": 1.8277, + "step": 18285 + }, + { + "epoch": 5.612645794966237, + "grad_norm": 0.21252553164958954, + "learning_rate": 4.2542269547512925e-05, + "loss": 1.7272, + "step": 18286 + }, + { + "epoch": 5.612952731737262, + "grad_norm": 0.20384398102760315, + "learning_rate": 4.2537354625757934e-05, + "loss": 1.6707, + "step": 18287 + }, + { + "epoch": 5.613259668508287, + "grad_norm": 0.19805553555488586, + "learning_rate": 4.253243977775321e-05, + "loss": 1.7443, + "step": 18288 + }, + { + "epoch": 5.6135666052793125, + "grad_norm": 0.20447707176208496, + "learning_rate": 4.2527525003547256e-05, + "loss": 1.7392, + "step": 18289 + }, + { + "epoch": 5.613873542050338, + "grad_norm": 0.21025662124156952, + "learning_rate": 4.25226103031887e-05, + "loss": 1.7856, + "step": 18290 + }, + { + "epoch": 5.614180478821363, + "grad_norm": 0.2131013125181198, + "learning_rate": 4.2517695676726085e-05, + "loss": 1.7521, + "step": 18291 + }, + { + "epoch": 5.614487415592388, + "grad_norm": 0.2511558532714844, 
+ "learning_rate": 4.2512781124208e-05, + "loss": 1.6873, + "step": 18292 + }, + { + "epoch": 5.614794352363413, + "grad_norm": 0.19668610394001007, + "learning_rate": 4.2507866645682984e-05, + "loss": 1.6808, + "step": 18293 + }, + { + "epoch": 5.615101289134438, + "grad_norm": 0.22313621640205383, + "learning_rate": 4.2502952241199637e-05, + "loss": 1.7794, + "step": 18294 + }, + { + "epoch": 5.615408225905464, + "grad_norm": 0.2053089439868927, + "learning_rate": 4.249803791080649e-05, + "loss": 1.7405, + "step": 18295 + }, + { + "epoch": 5.615715162676489, + "grad_norm": 0.2052931934595108, + "learning_rate": 4.249312365455215e-05, + "loss": 1.6698, + "step": 18296 + }, + { + "epoch": 5.616022099447514, + "grad_norm": 0.223783478140831, + "learning_rate": 4.248820947248515e-05, + "loss": 1.7696, + "step": 18297 + }, + { + "epoch": 5.616329036218539, + "grad_norm": 0.3424001932144165, + "learning_rate": 4.248329536465407e-05, + "loss": 1.7724, + "step": 18298 + }, + { + "epoch": 5.616635972989564, + "grad_norm": 0.25015103816986084, + "learning_rate": 4.247838133110749e-05, + "loss": 1.7188, + "step": 18299 + }, + { + "epoch": 5.616942909760589, + "grad_norm": 0.239765465259552, + "learning_rate": 4.247346737189392e-05, + "loss": 1.695, + "step": 18300 + }, + { + "epoch": 5.617249846531615, + "grad_norm": 0.42259401082992554, + "learning_rate": 4.246855348706197e-05, + "loss": 1.6882, + "step": 18301 + }, + { + "epoch": 5.617556783302639, + "grad_norm": 0.2985959053039551, + "learning_rate": 4.246363967666018e-05, + "loss": 1.7236, + "step": 18302 + }, + { + "epoch": 5.6178637200736645, + "grad_norm": 0.22437956929206848, + "learning_rate": 4.245872594073714e-05, + "loss": 1.7158, + "step": 18303 + }, + { + "epoch": 5.61817065684469, + "grad_norm": 0.3165835440158844, + "learning_rate": 4.245381227934138e-05, + "loss": 1.7543, + "step": 18304 + }, + { + "epoch": 5.618477593615715, + "grad_norm": 0.2565564513206482, + "learning_rate": 4.244889869252148e-05, + 
"loss": 1.7863, + "step": 18305 + }, + { + "epoch": 5.6187845303867405, + "grad_norm": 0.25741446018218994, + "learning_rate": 4.244398518032597e-05, + "loss": 1.721, + "step": 18306 + }, + { + "epoch": 5.619091467157766, + "grad_norm": 0.26492297649383545, + "learning_rate": 4.2439071742803435e-05, + "loss": 1.7697, + "step": 18307 + }, + { + "epoch": 5.61939840392879, + "grad_norm": 0.2086823433637619, + "learning_rate": 4.243415838000243e-05, + "loss": 1.7072, + "step": 18308 + }, + { + "epoch": 5.619705340699816, + "grad_norm": 0.26784422993659973, + "learning_rate": 4.24292450919715e-05, + "loss": 1.7826, + "step": 18309 + }, + { + "epoch": 5.620012277470841, + "grad_norm": 0.21774251759052277, + "learning_rate": 4.242433187875921e-05, + "loss": 1.7204, + "step": 18310 + }, + { + "epoch": 5.620319214241866, + "grad_norm": 0.29547446966171265, + "learning_rate": 4.241941874041412e-05, + "loss": 1.7303, + "step": 18311 + }, + { + "epoch": 5.620626151012892, + "grad_norm": 0.20278988778591156, + "learning_rate": 4.241450567698476e-05, + "loss": 1.692, + "step": 18312 + }, + { + "epoch": 5.620933087783916, + "grad_norm": 0.2084289938211441, + "learning_rate": 4.240959268851971e-05, + "loss": 1.7069, + "step": 18313 + }, + { + "epoch": 5.621240024554941, + "grad_norm": 0.19901904463768005, + "learning_rate": 4.240467977506752e-05, + "loss": 1.6798, + "step": 18314 + }, + { + "epoch": 5.621546961325967, + "grad_norm": 0.24629411101341248, + "learning_rate": 4.2399766936676735e-05, + "loss": 1.775, + "step": 18315 + }, + { + "epoch": 5.621853898096992, + "grad_norm": 0.2532403767108917, + "learning_rate": 4.239485417339591e-05, + "loss": 1.7669, + "step": 18316 + }, + { + "epoch": 5.622160834868017, + "grad_norm": 0.22495722770690918, + "learning_rate": 4.2389941485273576e-05, + "loss": 1.7772, + "step": 18317 + }, + { + "epoch": 5.622467771639043, + "grad_norm": 0.2789733111858368, + "learning_rate": 4.2385028872358316e-05, + "loss": 1.751, + "step": 18318 + }, + { 
+ "epoch": 5.622774708410067, + "grad_norm": 0.2266954481601715, + "learning_rate": 4.238011633469866e-05, + "loss": 1.7213, + "step": 18319 + }, + { + "epoch": 5.6230816451810925, + "grad_norm": 0.2163502722978592, + "learning_rate": 4.237520387234316e-05, + "loss": 1.7781, + "step": 18320 + }, + { + "epoch": 5.623388581952118, + "grad_norm": 0.25249144434928894, + "learning_rate": 4.237029148534036e-05, + "loss": 1.7293, + "step": 18321 + }, + { + "epoch": 5.623695518723143, + "grad_norm": 0.2320011854171753, + "learning_rate": 4.2365379173738826e-05, + "loss": 1.7909, + "step": 18322 + }, + { + "epoch": 5.6240024554941686, + "grad_norm": 0.22074681520462036, + "learning_rate": 4.2360466937587074e-05, + "loss": 1.743, + "step": 18323 + }, + { + "epoch": 5.624309392265193, + "grad_norm": 0.20864775776863098, + "learning_rate": 4.235555477693368e-05, + "loss": 1.726, + "step": 18324 + }, + { + "epoch": 5.624616329036218, + "grad_norm": 0.24547792971134186, + "learning_rate": 4.235064269182716e-05, + "loss": 1.7646, + "step": 18325 + }, + { + "epoch": 5.624923265807244, + "grad_norm": 0.29965806007385254, + "learning_rate": 4.234573068231607e-05, + "loss": 1.7789, + "step": 18326 + }, + { + "epoch": 5.625230202578269, + "grad_norm": 0.20844583213329315, + "learning_rate": 4.234081874844896e-05, + "loss": 1.7007, + "step": 18327 + }, + { + "epoch": 5.625537139349294, + "grad_norm": 0.2455398142337799, + "learning_rate": 4.2335906890274385e-05, + "loss": 1.7094, + "step": 18328 + }, + { + "epoch": 5.62584407612032, + "grad_norm": 0.17839518189430237, + "learning_rate": 4.233099510784085e-05, + "loss": 1.6849, + "step": 18329 + }, + { + "epoch": 5.626151012891344, + "grad_norm": 0.20219004154205322, + "learning_rate": 4.232608340119693e-05, + "loss": 1.716, + "step": 18330 + }, + { + "epoch": 5.6264579496623695, + "grad_norm": 0.23570619523525238, + "learning_rate": 4.232117177039114e-05, + "loss": 1.7622, + "step": 18331 + }, + { + "epoch": 5.626764886433395, + 
"grad_norm": 0.23534397780895233, + "learning_rate": 4.231626021547204e-05, + "loss": 1.7758, + "step": 18332 + }, + { + "epoch": 5.62707182320442, + "grad_norm": 0.2177352011203766, + "learning_rate": 4.231134873648817e-05, + "loss": 1.7102, + "step": 18333 + }, + { + "epoch": 5.627378759975445, + "grad_norm": 0.22886058688163757, + "learning_rate": 4.230643733348803e-05, + "loss": 1.7766, + "step": 18334 + }, + { + "epoch": 5.62768569674647, + "grad_norm": 0.20723696053028107, + "learning_rate": 4.2301526006520215e-05, + "loss": 1.7287, + "step": 18335 + }, + { + "epoch": 5.627992633517495, + "grad_norm": 0.18612104654312134, + "learning_rate": 4.229661475563321e-05, + "loss": 1.7255, + "step": 18336 + }, + { + "epoch": 5.628299570288521, + "grad_norm": 0.26456236839294434, + "learning_rate": 4.229170358087558e-05, + "loss": 1.7388, + "step": 18337 + }, + { + "epoch": 5.628606507059546, + "grad_norm": 0.25253555178642273, + "learning_rate": 4.2286792482295845e-05, + "loss": 1.7031, + "step": 18338 + }, + { + "epoch": 5.628913443830571, + "grad_norm": 0.23093348741531372, + "learning_rate": 4.228188145994257e-05, + "loss": 1.8032, + "step": 18339 + }, + { + "epoch": 5.629220380601596, + "grad_norm": 0.24142487347126007, + "learning_rate": 4.227697051386424e-05, + "loss": 1.6621, + "step": 18340 + }, + { + "epoch": 5.629527317372621, + "grad_norm": 0.2883392572402954, + "learning_rate": 4.227205964410944e-05, + "loss": 1.7125, + "step": 18341 + }, + { + "epoch": 5.629834254143646, + "grad_norm": 0.22670713067054749, + "learning_rate": 4.226714885072665e-05, + "loss": 1.7659, + "step": 18342 + }, + { + "epoch": 5.630141190914672, + "grad_norm": 0.2795337438583374, + "learning_rate": 4.226223813376444e-05, + "loss": 1.7559, + "step": 18343 + }, + { + "epoch": 5.630448127685697, + "grad_norm": 0.2513083219528198, + "learning_rate": 4.225732749327132e-05, + "loss": 1.6969, + "step": 18344 + }, + { + "epoch": 5.6307550644567215, + "grad_norm": 0.24588467180728912, + 
"learning_rate": 4.225241692929585e-05, + "loss": 1.7724, + "step": 18345 + }, + { + "epoch": 5.631062001227747, + "grad_norm": 0.41726353764533997, + "learning_rate": 4.224750644188651e-05, + "loss": 1.7308, + "step": 18346 + }, + { + "epoch": 5.631368937998772, + "grad_norm": 0.2512385845184326, + "learning_rate": 4.2242596031091886e-05, + "loss": 1.7068, + "step": 18347 + }, + { + "epoch": 5.6316758747697975, + "grad_norm": 0.3077464997768402, + "learning_rate": 4.223768569696044e-05, + "loss": 1.7383, + "step": 18348 + }, + { + "epoch": 5.631982811540823, + "grad_norm": 0.3460720479488373, + "learning_rate": 4.2232775439540756e-05, + "loss": 1.7317, + "step": 18349 + }, + { + "epoch": 5.632289748311848, + "grad_norm": 0.24827539920806885, + "learning_rate": 4.222786525888134e-05, + "loss": 1.6871, + "step": 18350 + }, + { + "epoch": 5.632596685082873, + "grad_norm": 0.24851584434509277, + "learning_rate": 4.22229551550307e-05, + "loss": 1.7058, + "step": 18351 + }, + { + "epoch": 5.632903621853898, + "grad_norm": 0.31132519245147705, + "learning_rate": 4.2218045128037396e-05, + "loss": 1.7523, + "step": 18352 + }, + { + "epoch": 5.633210558624923, + "grad_norm": 0.3104027807712555, + "learning_rate": 4.2213135177949906e-05, + "loss": 1.7669, + "step": 18353 + }, + { + "epoch": 5.633517495395949, + "grad_norm": 0.31351104378700256, + "learning_rate": 4.2208225304816795e-05, + "loss": 1.7031, + "step": 18354 + }, + { + "epoch": 5.633824432166974, + "grad_norm": 0.3217851221561432, + "learning_rate": 4.2203315508686555e-05, + "loss": 1.7694, + "step": 18355 + }, + { + "epoch": 5.634131368937998, + "grad_norm": 0.22287796437740326, + "learning_rate": 4.2198405789607745e-05, + "loss": 1.7742, + "step": 18356 + }, + { + "epoch": 5.634438305709024, + "grad_norm": 0.20288340747356415, + "learning_rate": 4.219349614762883e-05, + "loss": 1.7113, + "step": 18357 + }, + { + "epoch": 5.634745242480049, + "grad_norm": 0.19823449850082397, + "learning_rate": 
4.218858658279839e-05, + "loss": 1.7433, + "step": 18358 + }, + { + "epoch": 5.635052179251074, + "grad_norm": 0.2756347358226776, + "learning_rate": 4.2183677095164895e-05, + "loss": 1.8278, + "step": 18359 + }, + { + "epoch": 5.6353591160221, + "grad_norm": 0.2303706556558609, + "learning_rate": 4.2178767684776895e-05, + "loss": 1.6943, + "step": 18360 + }, + { + "epoch": 5.635666052793125, + "grad_norm": 0.25089216232299805, + "learning_rate": 4.217385835168288e-05, + "loss": 1.6562, + "step": 18361 + }, + { + "epoch": 5.6359729895641495, + "grad_norm": 0.3013486862182617, + "learning_rate": 4.216894909593141e-05, + "loss": 1.7323, + "step": 18362 + }, + { + "epoch": 5.636279926335175, + "grad_norm": 0.19471928477287292, + "learning_rate": 4.2164039917570956e-05, + "loss": 1.7301, + "step": 18363 + }, + { + "epoch": 5.6365868631062, + "grad_norm": 0.3257733881473541, + "learning_rate": 4.2159130816650075e-05, + "loss": 1.7522, + "step": 18364 + }, + { + "epoch": 5.6368937998772255, + "grad_norm": 0.3065868020057678, + "learning_rate": 4.215422179321723e-05, + "loss": 1.7077, + "step": 18365 + }, + { + "epoch": 5.637200736648251, + "grad_norm": 0.20643819868564606, + "learning_rate": 4.214931284732098e-05, + "loss": 1.8033, + "step": 18366 + }, + { + "epoch": 5.637507673419275, + "grad_norm": 0.23551981151103973, + "learning_rate": 4.2144403979009826e-05, + "loss": 1.7391, + "step": 18367 + }, + { + "epoch": 5.637814610190301, + "grad_norm": 0.20602314174175262, + "learning_rate": 4.2139495188332265e-05, + "loss": 1.7593, + "step": 18368 + }, + { + "epoch": 5.638121546961326, + "grad_norm": 0.27911239862442017, + "learning_rate": 4.2134586475336834e-05, + "loss": 1.7212, + "step": 18369 + }, + { + "epoch": 5.638428483732351, + "grad_norm": 0.2700496017932892, + "learning_rate": 4.212967784007201e-05, + "loss": 1.7755, + "step": 18370 + }, + { + "epoch": 5.638735420503377, + "grad_norm": 0.24988985061645508, + "learning_rate": 4.2124769282586334e-05, + "loss": 
1.7364, + "step": 18371 + }, + { + "epoch": 5.639042357274402, + "grad_norm": 0.20491284132003784, + "learning_rate": 4.211986080292829e-05, + "loss": 1.7477, + "step": 18372 + }, + { + "epoch": 5.639349294045426, + "grad_norm": 0.24953459203243256, + "learning_rate": 4.211495240114643e-05, + "loss": 1.7712, + "step": 18373 + }, + { + "epoch": 5.639656230816452, + "grad_norm": 0.2028491199016571, + "learning_rate": 4.2110044077289204e-05, + "loss": 1.701, + "step": 18374 + }, + { + "epoch": 5.639963167587477, + "grad_norm": 0.22320568561553955, + "learning_rate": 4.210513583140517e-05, + "loss": 1.7818, + "step": 18375 + }, + { + "epoch": 5.640270104358502, + "grad_norm": 0.22680947184562683, + "learning_rate": 4.210022766354278e-05, + "loss": 1.7631, + "step": 18376 + }, + { + "epoch": 5.640577041129527, + "grad_norm": 0.20724014937877655, + "learning_rate": 4.2095319573750596e-05, + "loss": 1.7757, + "step": 18377 + }, + { + "epoch": 5.640883977900552, + "grad_norm": 0.21785953640937805, + "learning_rate": 4.209041156207708e-05, + "loss": 1.7161, + "step": 18378 + }, + { + "epoch": 5.6411909146715775, + "grad_norm": 0.21751803159713745, + "learning_rate": 4.208550362857078e-05, + "loss": 1.7449, + "step": 18379 + }, + { + "epoch": 5.641497851442603, + "grad_norm": 0.1765962839126587, + "learning_rate": 4.208059577328014e-05, + "loss": 1.7191, + "step": 18380 + }, + { + "epoch": 5.641804788213628, + "grad_norm": 0.22720913589000702, + "learning_rate": 4.2075687996253724e-05, + "loss": 1.7037, + "step": 18381 + }, + { + "epoch": 5.6421117249846535, + "grad_norm": 0.23589655756950378, + "learning_rate": 4.2070780297539976e-05, + "loss": 1.8147, + "step": 18382 + }, + { + "epoch": 5.642418661755678, + "grad_norm": 0.21187056601047516, + "learning_rate": 4.2065872677187435e-05, + "loss": 1.7655, + "step": 18383 + }, + { + "epoch": 5.642725598526703, + "grad_norm": 0.24153946340084076, + "learning_rate": 4.2060965135244606e-05, + "loss": 1.7841, + "step": 18384 + }, + 
{ + "epoch": 5.643032535297729, + "grad_norm": 0.2059229612350464, + "learning_rate": 4.205605767175995e-05, + "loss": 1.6718, + "step": 18385 + }, + { + "epoch": 5.643339472068754, + "grad_norm": 0.20235973596572876, + "learning_rate": 4.205115028678201e-05, + "loss": 1.6931, + "step": 18386 + }, + { + "epoch": 5.643646408839779, + "grad_norm": 0.25149911642074585, + "learning_rate": 4.204624298035924e-05, + "loss": 1.7465, + "step": 18387 + }, + { + "epoch": 5.643953345610804, + "grad_norm": 0.2050812691450119, + "learning_rate": 4.204133575254017e-05, + "loss": 1.7147, + "step": 18388 + }, + { + "epoch": 5.644260282381829, + "grad_norm": 0.20906420052051544, + "learning_rate": 4.2036428603373274e-05, + "loss": 1.6762, + "step": 18389 + }, + { + "epoch": 5.644567219152854, + "grad_norm": 0.20150595903396606, + "learning_rate": 4.2031521532907075e-05, + "loss": 1.678, + "step": 18390 + }, + { + "epoch": 5.64487415592388, + "grad_norm": 0.2141568511724472, + "learning_rate": 4.202661454119004e-05, + "loss": 1.7274, + "step": 18391 + }, + { + "epoch": 5.645181092694905, + "grad_norm": 0.2641741931438446, + "learning_rate": 4.202170762827069e-05, + "loss": 1.7975, + "step": 18392 + }, + { + "epoch": 5.64548802946593, + "grad_norm": 0.22928468883037567, + "learning_rate": 4.201680079419747e-05, + "loss": 1.7687, + "step": 18393 + }, + { + "epoch": 5.645794966236955, + "grad_norm": 0.22713731229305267, + "learning_rate": 4.2011894039018925e-05, + "loss": 1.7475, + "step": 18394 + }, + { + "epoch": 5.64610190300798, + "grad_norm": 0.25602981448173523, + "learning_rate": 4.200698736278351e-05, + "loss": 1.7356, + "step": 18395 + }, + { + "epoch": 5.6464088397790055, + "grad_norm": 0.2619759738445282, + "learning_rate": 4.200208076553975e-05, + "loss": 1.7334, + "step": 18396 + }, + { + "epoch": 5.646715776550031, + "grad_norm": 0.24756783246994019, + "learning_rate": 4.19971742473361e-05, + "loss": 1.7253, + "step": 18397 + }, + { + "epoch": 5.647022713321056, + 
"grad_norm": 0.2068249136209488, + "learning_rate": 4.199226780822109e-05, + "loss": 1.7246, + "step": 18398 + }, + { + "epoch": 5.647329650092081, + "grad_norm": 0.23219087719917297, + "learning_rate": 4.1987361448243165e-05, + "loss": 1.7388, + "step": 18399 + }, + { + "epoch": 5.647636586863106, + "grad_norm": 0.2051403522491455, + "learning_rate": 4.198245516745082e-05, + "loss": 1.7775, + "step": 18400 + }, + { + "epoch": 5.647943523634131, + "grad_norm": 0.26408639550209045, + "learning_rate": 4.1977548965892575e-05, + "loss": 1.8069, + "step": 18401 + }, + { + "epoch": 5.648250460405157, + "grad_norm": 0.2104891538619995, + "learning_rate": 4.197264284361687e-05, + "loss": 1.7335, + "step": 18402 + }, + { + "epoch": 5.648557397176182, + "grad_norm": 0.23963849246501923, + "learning_rate": 4.196773680067224e-05, + "loss": 1.7254, + "step": 18403 + }, + { + "epoch": 5.648864333947207, + "grad_norm": 0.2770128846168518, + "learning_rate": 4.1962830837107117e-05, + "loss": 1.7848, + "step": 18404 + }, + { + "epoch": 5.649171270718232, + "grad_norm": 0.23342710733413696, + "learning_rate": 4.195792495297002e-05, + "loss": 1.7818, + "step": 18405 + }, + { + "epoch": 5.649478207489257, + "grad_norm": 0.23835061490535736, + "learning_rate": 4.195301914830941e-05, + "loss": 1.7453, + "step": 18406 + }, + { + "epoch": 5.649785144260282, + "grad_norm": 0.21896767616271973, + "learning_rate": 4.194811342317381e-05, + "loss": 1.7205, + "step": 18407 + }, + { + "epoch": 5.650092081031308, + "grad_norm": 0.20222818851470947, + "learning_rate": 4.1943207777611646e-05, + "loss": 1.6833, + "step": 18408 + }, + { + "epoch": 5.650399017802332, + "grad_norm": 0.2182089239358902, + "learning_rate": 4.193830221167146e-05, + "loss": 1.7296, + "step": 18409 + }, + { + "epoch": 5.650705954573358, + "grad_norm": 0.19981688261032104, + "learning_rate": 4.1933396725401655e-05, + "loss": 1.7327, + "step": 18410 + }, + { + "epoch": 5.651012891344383, + "grad_norm": 0.23925067484378815, + 
"learning_rate": 4.192849131885077e-05, + "loss": 1.7545, + "step": 18411 + }, + { + "epoch": 5.651319828115408, + "grad_norm": 0.21967993676662445, + "learning_rate": 4.192358599206725e-05, + "loss": 1.6973, + "step": 18412 + }, + { + "epoch": 5.651626764886434, + "grad_norm": 0.2273840606212616, + "learning_rate": 4.1918680745099614e-05, + "loss": 1.8229, + "step": 18413 + }, + { + "epoch": 5.651933701657459, + "grad_norm": 0.26950231194496155, + "learning_rate": 4.1913775577996286e-05, + "loss": 1.7666, + "step": 18414 + }, + { + "epoch": 5.652240638428484, + "grad_norm": 0.26608848571777344, + "learning_rate": 4.190887049080579e-05, + "loss": 1.8279, + "step": 18415 + }, + { + "epoch": 5.652547575199509, + "grad_norm": 0.20856785774230957, + "learning_rate": 4.190396548357658e-05, + "loss": 1.7224, + "step": 18416 + }, + { + "epoch": 5.652854511970534, + "grad_norm": 0.2894255816936493, + "learning_rate": 4.18990605563571e-05, + "loss": 1.7308, + "step": 18417 + }, + { + "epoch": 5.653161448741559, + "grad_norm": 0.2047591209411621, + "learning_rate": 4.189415570919588e-05, + "loss": 1.758, + "step": 18418 + }, + { + "epoch": 5.653468385512585, + "grad_norm": 0.37161269783973694, + "learning_rate": 4.1889250942141346e-05, + "loss": 1.7926, + "step": 18419 + }, + { + "epoch": 5.653775322283609, + "grad_norm": 0.37338340282440186, + "learning_rate": 4.1884346255242e-05, + "loss": 1.7491, + "step": 18420 + }, + { + "epoch": 5.6540822590546345, + "grad_norm": 0.24279838800430298, + "learning_rate": 4.187944164854629e-05, + "loss": 1.7103, + "step": 18421 + }, + { + "epoch": 5.65438919582566, + "grad_norm": 0.219639852643013, + "learning_rate": 4.18745371221027e-05, + "loss": 1.7824, + "step": 18422 + }, + { + "epoch": 5.654696132596685, + "grad_norm": 0.22248409688472748, + "learning_rate": 4.186963267595969e-05, + "loss": 1.8098, + "step": 18423 + }, + { + "epoch": 5.6550030693677105, + "grad_norm": 0.2115657478570938, + "learning_rate": 4.1864728310165755e-05, + 
"loss": 1.72, + "step": 18424 + }, + { + "epoch": 5.655310006138736, + "grad_norm": 0.19723005592823029, + "learning_rate": 4.1859824024769325e-05, + "loss": 1.6818, + "step": 18425 + }, + { + "epoch": 5.65561694290976, + "grad_norm": 0.1828317642211914, + "learning_rate": 4.185491981981891e-05, + "loss": 1.7243, + "step": 18426 + }, + { + "epoch": 5.655923879680786, + "grad_norm": 0.271781861782074, + "learning_rate": 4.185001569536292e-05, + "loss": 1.7688, + "step": 18427 + }, + { + "epoch": 5.656230816451811, + "grad_norm": 0.3140811324119568, + "learning_rate": 4.184511165144986e-05, + "loss": 1.7319, + "step": 18428 + }, + { + "epoch": 5.656537753222836, + "grad_norm": 0.20013047754764557, + "learning_rate": 4.184020768812818e-05, + "loss": 1.7104, + "step": 18429 + }, + { + "epoch": 5.656844689993862, + "grad_norm": 0.2615044414997101, + "learning_rate": 4.183530380544638e-05, + "loss": 1.7314, + "step": 18430 + }, + { + "epoch": 5.657151626764886, + "grad_norm": 0.2645856440067291, + "learning_rate": 4.183040000345287e-05, + "loss": 1.7431, + "step": 18431 + }, + { + "epoch": 5.657458563535911, + "grad_norm": 0.1916145384311676, + "learning_rate": 4.182549628219615e-05, + "loss": 1.7013, + "step": 18432 + }, + { + "epoch": 5.657765500306937, + "grad_norm": 0.2647114396095276, + "learning_rate": 4.182059264172466e-05, + "loss": 1.7278, + "step": 18433 + }, + { + "epoch": 5.658072437077962, + "grad_norm": 0.20201756060123444, + "learning_rate": 4.1815689082086854e-05, + "loss": 1.7065, + "step": 18434 + }, + { + "epoch": 5.658379373848987, + "grad_norm": 0.23892022669315338, + "learning_rate": 4.181078560333123e-05, + "loss": 1.7365, + "step": 18435 + }, + { + "epoch": 5.658686310620013, + "grad_norm": 0.3125975728034973, + "learning_rate": 4.18058822055062e-05, + "loss": 1.7152, + "step": 18436 + }, + { + "epoch": 5.658993247391037, + "grad_norm": 0.18924804031848907, + "learning_rate": 4.180097888866027e-05, + "loss": 1.7763, + "step": 18437 + }, + { + 
"epoch": 5.6593001841620625, + "grad_norm": 0.28476929664611816, + "learning_rate": 4.1796075652841845e-05, + "loss": 1.7517, + "step": 18438 + }, + { + "epoch": 5.659607120933088, + "grad_norm": 0.30616337060928345, + "learning_rate": 4.1791172498099416e-05, + "loss": 1.7446, + "step": 18439 + }, + { + "epoch": 5.659914057704113, + "grad_norm": 0.3219330608844757, + "learning_rate": 4.1786269424481426e-05, + "loss": 1.8374, + "step": 18440 + }, + { + "epoch": 5.6602209944751385, + "grad_norm": 0.34074151515960693, + "learning_rate": 4.1781366432036364e-05, + "loss": 1.7915, + "step": 18441 + }, + { + "epoch": 5.660527931246163, + "grad_norm": 0.2321610003709793, + "learning_rate": 4.177646352081263e-05, + "loss": 1.7361, + "step": 18442 + }, + { + "epoch": 5.660834868017188, + "grad_norm": 0.34283575415611267, + "learning_rate": 4.1771560690858716e-05, + "loss": 1.6859, + "step": 18443 + }, + { + "epoch": 5.661141804788214, + "grad_norm": 0.32274290919303894, + "learning_rate": 4.1766657942223055e-05, + "loss": 1.7376, + "step": 18444 + }, + { + "epoch": 5.661448741559239, + "grad_norm": 0.23960906267166138, + "learning_rate": 4.1761755274954105e-05, + "loss": 1.7198, + "step": 18445 + }, + { + "epoch": 5.661755678330264, + "grad_norm": 0.2622305154800415, + "learning_rate": 4.175685268910031e-05, + "loss": 1.6997, + "step": 18446 + }, + { + "epoch": 5.66206261510129, + "grad_norm": 0.19836951792240143, + "learning_rate": 4.1751950184710157e-05, + "loss": 1.6612, + "step": 18447 + }, + { + "epoch": 5.662369551872314, + "grad_norm": 0.29541507363319397, + "learning_rate": 4.174704776183204e-05, + "loss": 1.7606, + "step": 18448 + }, + { + "epoch": 5.662676488643339, + "grad_norm": 0.21632203459739685, + "learning_rate": 4.174214542051445e-05, + "loss": 1.7108, + "step": 18449 + }, + { + "epoch": 5.662983425414365, + "grad_norm": 0.2851164638996124, + "learning_rate": 4.173724316080582e-05, + "loss": 1.747, + "step": 18450 + }, + { + "epoch": 5.66329036218539, + 
"grad_norm": 0.30293309688568115, + "learning_rate": 4.173234098275458e-05, + "loss": 1.7549, + "step": 18451 + }, + { + "epoch": 5.6635972989564145, + "grad_norm": 0.2131963074207306, + "learning_rate": 4.172743888640921e-05, + "loss": 1.7804, + "step": 18452 + }, + { + "epoch": 5.66390423572744, + "grad_norm": 0.234910249710083, + "learning_rate": 4.172253687181812e-05, + "loss": 1.7149, + "step": 18453 + }, + { + "epoch": 5.664211172498465, + "grad_norm": 0.21238654851913452, + "learning_rate": 4.171763493902979e-05, + "loss": 1.7272, + "step": 18454 + }, + { + "epoch": 5.6645181092694905, + "grad_norm": 0.20571236312389374, + "learning_rate": 4.171273308809263e-05, + "loss": 1.713, + "step": 18455 + }, + { + "epoch": 5.664825046040516, + "grad_norm": 0.24867361783981323, + "learning_rate": 4.1707831319055104e-05, + "loss": 1.682, + "step": 18456 + }, + { + "epoch": 5.665131982811541, + "grad_norm": 0.20556440949440002, + "learning_rate": 4.170292963196564e-05, + "loss": 1.7126, + "step": 18457 + }, + { + "epoch": 5.665438919582566, + "grad_norm": 0.26431065797805786, + "learning_rate": 4.169802802687271e-05, + "loss": 1.8142, + "step": 18458 + }, + { + "epoch": 5.665745856353591, + "grad_norm": 0.26041486859321594, + "learning_rate": 4.169312650382471e-05, + "loss": 1.7206, + "step": 18459 + }, + { + "epoch": 5.666052793124616, + "grad_norm": 0.2190525084733963, + "learning_rate": 4.1688225062870126e-05, + "loss": 1.787, + "step": 18460 + }, + { + "epoch": 5.666359729895642, + "grad_norm": 0.24726425111293793, + "learning_rate": 4.1683323704057354e-05, + "loss": 1.7677, + "step": 18461 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.22206442058086395, + "learning_rate": 4.167842242743486e-05, + "loss": 1.73, + "step": 18462 + }, + { + "epoch": 5.666973603437691, + "grad_norm": 0.22501195967197418, + "learning_rate": 4.167352123305108e-05, + "loss": 1.7213, + "step": 18463 + }, + { + "epoch": 5.667280540208717, + "grad_norm": 0.26164770126342773, + 
"learning_rate": 4.166862012095443e-05, + "loss": 1.7839, + "step": 18464 + }, + { + "epoch": 5.667587476979742, + "grad_norm": 0.19480809569358826, + "learning_rate": 4.166371909119336e-05, + "loss": 1.7562, + "step": 18465 + }, + { + "epoch": 5.667894413750767, + "grad_norm": 0.26677292585372925, + "learning_rate": 4.165881814381632e-05, + "loss": 1.776, + "step": 18466 + }, + { + "epoch": 5.668201350521793, + "grad_norm": 0.22019581496715546, + "learning_rate": 4.165391727887172e-05, + "loss": 1.7575, + "step": 18467 + }, + { + "epoch": 5.668508287292818, + "grad_norm": 0.23851899802684784, + "learning_rate": 4.1649016496407986e-05, + "loss": 1.7346, + "step": 18468 + }, + { + "epoch": 5.6688152240638425, + "grad_norm": 0.3118130564689636, + "learning_rate": 4.1644115796473596e-05, + "loss": 1.7808, + "step": 18469 + }, + { + "epoch": 5.669122160834868, + "grad_norm": 0.22783879935741425, + "learning_rate": 4.163921517911692e-05, + "loss": 1.831, + "step": 18470 + }, + { + "epoch": 5.669429097605893, + "grad_norm": 0.2203773707151413, + "learning_rate": 4.163431464438645e-05, + "loss": 1.7034, + "step": 18471 + }, + { + "epoch": 5.6697360343769185, + "grad_norm": 0.21838103234767914, + "learning_rate": 4.162941419233056e-05, + "loss": 1.7553, + "step": 18472 + }, + { + "epoch": 5.670042971147944, + "grad_norm": 0.18453563749790192, + "learning_rate": 4.162451382299771e-05, + "loss": 1.7139, + "step": 18473 + }, + { + "epoch": 5.670349907918968, + "grad_norm": 0.25308313965797424, + "learning_rate": 4.161961353643633e-05, + "loss": 1.7291, + "step": 18474 + }, + { + "epoch": 5.670656844689994, + "grad_norm": 0.2528827488422394, + "learning_rate": 4.1614713332694845e-05, + "loss": 1.781, + "step": 18475 + }, + { + "epoch": 5.670963781461019, + "grad_norm": 0.24774135649204254, + "learning_rate": 4.160981321182166e-05, + "loss": 1.7808, + "step": 18476 + }, + { + "epoch": 5.671270718232044, + "grad_norm": 0.25225830078125, + "learning_rate": 4.160491317386524e-05, 
+ "loss": 1.739, + "step": 18477 + }, + { + "epoch": 5.67157765500307, + "grad_norm": 0.2095808982849121, + "learning_rate": 4.160001321887397e-05, + "loss": 1.7242, + "step": 18478 + }, + { + "epoch": 5.671884591774095, + "grad_norm": 0.23906216025352478, + "learning_rate": 4.159511334689631e-05, + "loss": 1.7071, + "step": 18479 + }, + { + "epoch": 5.672191528545119, + "grad_norm": 0.21851155161857605, + "learning_rate": 4.159021355798065e-05, + "loss": 1.7171, + "step": 18480 + }, + { + "epoch": 5.672498465316145, + "grad_norm": 0.2005140632390976, + "learning_rate": 4.158531385217544e-05, + "loss": 1.7483, + "step": 18481 + }, + { + "epoch": 5.67280540208717, + "grad_norm": 0.2230832278728485, + "learning_rate": 4.1580414229529074e-05, + "loss": 1.7386, + "step": 18482 + }, + { + "epoch": 5.673112338858195, + "grad_norm": 0.22402967512607574, + "learning_rate": 4.1575514690090014e-05, + "loss": 1.7989, + "step": 18483 + }, + { + "epoch": 5.67341927562922, + "grad_norm": 0.20350080728530884, + "learning_rate": 4.157061523390665e-05, + "loss": 1.6856, + "step": 18484 + }, + { + "epoch": 5.673726212400245, + "grad_norm": 0.2039422243833542, + "learning_rate": 4.15657158610274e-05, + "loss": 1.7262, + "step": 18485 + }, + { + "epoch": 5.6740331491712706, + "grad_norm": 0.20411522686481476, + "learning_rate": 4.156081657150069e-05, + "loss": 1.738, + "step": 18486 + }, + { + "epoch": 5.674340085942296, + "grad_norm": 0.2693086862564087, + "learning_rate": 4.155591736537493e-05, + "loss": 1.731, + "step": 18487 + }, + { + "epoch": 5.674647022713321, + "grad_norm": 0.20745019614696503, + "learning_rate": 4.1551018242698567e-05, + "loss": 1.7138, + "step": 18488 + }, + { + "epoch": 5.6749539594843466, + "grad_norm": 0.22033964097499847, + "learning_rate": 4.1546119203519964e-05, + "loss": 1.8144, + "step": 18489 + }, + { + "epoch": 5.675260896255372, + "grad_norm": 0.22859029471874237, + "learning_rate": 4.154122024788759e-05, + "loss": 1.6724, + "step": 18490 + }, + { 
+ "epoch": 5.675567833026396, + "grad_norm": 0.2226465791463852, + "learning_rate": 4.153632137584982e-05, + "loss": 1.731, + "step": 18491 + }, + { + "epoch": 5.675874769797422, + "grad_norm": 0.19657716155052185, + "learning_rate": 4.1531422587455086e-05, + "loss": 1.6937, + "step": 18492 + }, + { + "epoch": 5.676181706568447, + "grad_norm": 0.23167578876018524, + "learning_rate": 4.152652388275179e-05, + "loss": 1.7444, + "step": 18493 + }, + { + "epoch": 5.676488643339472, + "grad_norm": 0.24468563497066498, + "learning_rate": 4.1521625261788374e-05, + "loss": 1.7173, + "step": 18494 + }, + { + "epoch": 5.676795580110497, + "grad_norm": 0.27125802636146545, + "learning_rate": 4.1516726724613206e-05, + "loss": 1.7424, + "step": 18495 + }, + { + "epoch": 5.677102516881522, + "grad_norm": 0.23816901445388794, + "learning_rate": 4.151182827127473e-05, + "loss": 1.6911, + "step": 18496 + }, + { + "epoch": 5.6774094536525475, + "grad_norm": 0.26058733463287354, + "learning_rate": 4.150692990182133e-05, + "loss": 1.7142, + "step": 18497 + }, + { + "epoch": 5.677716390423573, + "grad_norm": 0.20207929611206055, + "learning_rate": 4.150203161630143e-05, + "loss": 1.7506, + "step": 18498 + }, + { + "epoch": 5.678023327194598, + "grad_norm": 0.259857714176178, + "learning_rate": 4.1497133414763435e-05, + "loss": 1.7181, + "step": 18499 + }, + { + "epoch": 5.6783302639656235, + "grad_norm": 0.2607496380805969, + "learning_rate": 4.149223529725577e-05, + "loss": 1.7829, + "step": 18500 + }, + { + "epoch": 5.678637200736648, + "grad_norm": 0.23265719413757324, + "learning_rate": 4.148733726382681e-05, + "loss": 1.7028, + "step": 18501 + }, + { + "epoch": 5.678944137507673, + "grad_norm": 0.26610276103019714, + "learning_rate": 4.1482439314524964e-05, + "loss": 1.8604, + "step": 18502 + }, + { + "epoch": 5.679251074278699, + "grad_norm": 0.24022582173347473, + "learning_rate": 4.147754144939865e-05, + "loss": 1.7142, + "step": 18503 + }, + { + "epoch": 5.679558011049724, + 
"grad_norm": 0.2849755585193634, + "learning_rate": 4.1472643668496255e-05, + "loss": 1.6956, + "step": 18504 + }, + { + "epoch": 5.679864947820749, + "grad_norm": 0.24330341815948486, + "learning_rate": 4.1467745971866216e-05, + "loss": 1.7617, + "step": 18505 + }, + { + "epoch": 5.680171884591774, + "grad_norm": 0.21072770655155182, + "learning_rate": 4.146284835955689e-05, + "loss": 1.6999, + "step": 18506 + }, + { + "epoch": 5.680478821362799, + "grad_norm": 0.1971336454153061, + "learning_rate": 4.145795083161673e-05, + "loss": 1.6756, + "step": 18507 + }, + { + "epoch": 5.680785758133824, + "grad_norm": 0.18576614558696747, + "learning_rate": 4.1453053388094073e-05, + "loss": 1.6885, + "step": 18508 + }, + { + "epoch": 5.68109269490485, + "grad_norm": 0.21335965394973755, + "learning_rate": 4.144815602903737e-05, + "loss": 1.7278, + "step": 18509 + }, + { + "epoch": 5.681399631675875, + "grad_norm": 0.21756233274936676, + "learning_rate": 4.1443258754494986e-05, + "loss": 1.7549, + "step": 18510 + }, + { + "epoch": 5.6817065684469, + "grad_norm": 0.2214142084121704, + "learning_rate": 4.143836156451536e-05, + "loss": 1.6654, + "step": 18511 + }, + { + "epoch": 5.682013505217925, + "grad_norm": 0.2230863869190216, + "learning_rate": 4.143346445914684e-05, + "loss": 1.7286, + "step": 18512 + }, + { + "epoch": 5.68232044198895, + "grad_norm": 0.2283746749162674, + "learning_rate": 4.142856743843787e-05, + "loss": 1.7652, + "step": 18513 + }, + { + "epoch": 5.6826273787599755, + "grad_norm": 0.20059749484062195, + "learning_rate": 4.142367050243679e-05, + "loss": 1.6854, + "step": 18514 + }, + { + "epoch": 5.682934315531001, + "grad_norm": 0.17887794971466064, + "learning_rate": 4.141877365119204e-05, + "loss": 1.6975, + "step": 18515 + }, + { + "epoch": 5.683241252302026, + "grad_norm": 0.21266087889671326, + "learning_rate": 4.141387688475199e-05, + "loss": 1.7361, + "step": 18516 + }, + { + "epoch": 5.683548189073051, + "grad_norm": 0.20075422525405884, + 
"learning_rate": 4.140898020316506e-05, + "loss": 1.7496, + "step": 18517 + }, + { + "epoch": 5.683855125844076, + "grad_norm": 0.21430443227291107, + "learning_rate": 4.140408360647963e-05, + "loss": 1.7481, + "step": 18518 + }, + { + "epoch": 5.684162062615101, + "grad_norm": 0.1951984018087387, + "learning_rate": 4.139918709474405e-05, + "loss": 1.713, + "step": 18519 + }, + { + "epoch": 5.684468999386127, + "grad_norm": 0.21636274456977844, + "learning_rate": 4.1394290668006764e-05, + "loss": 1.8169, + "step": 18520 + }, + { + "epoch": 5.684775936157152, + "grad_norm": 0.21003715693950653, + "learning_rate": 4.138939432631613e-05, + "loss": 1.7453, + "step": 18521 + }, + { + "epoch": 5.685082872928177, + "grad_norm": 0.23559699952602386, + "learning_rate": 4.138449806972057e-05, + "loss": 1.7534, + "step": 18522 + }, + { + "epoch": 5.685389809699202, + "grad_norm": 0.23322029411792755, + "learning_rate": 4.137960189826843e-05, + "loss": 1.7535, + "step": 18523 + }, + { + "epoch": 5.685696746470227, + "grad_norm": 0.1998462826013565, + "learning_rate": 4.137470581200813e-05, + "loss": 1.7025, + "step": 18524 + }, + { + "epoch": 5.686003683241252, + "grad_norm": 0.22321350872516632, + "learning_rate": 4.1369809810988025e-05, + "loss": 1.7666, + "step": 18525 + }, + { + "epoch": 5.686310620012278, + "grad_norm": 0.20851604640483856, + "learning_rate": 4.136491389525653e-05, + "loss": 1.6958, + "step": 18526 + }, + { + "epoch": 5.686617556783302, + "grad_norm": 0.21494868397712708, + "learning_rate": 4.136001806486201e-05, + "loss": 1.7703, + "step": 18527 + }, + { + "epoch": 5.6869244935543275, + "grad_norm": 0.19872798025608063, + "learning_rate": 4.135512231985287e-05, + "loss": 1.7451, + "step": 18528 + }, + { + "epoch": 5.687231430325353, + "grad_norm": 0.2424371987581253, + "learning_rate": 4.1350226660277456e-05, + "loss": 1.8153, + "step": 18529 + }, + { + "epoch": 5.687538367096378, + "grad_norm": 0.20388297736644745, + "learning_rate": 
4.1345331086184196e-05, + "loss": 1.6882, + "step": 18530 + }, + { + "epoch": 5.6878453038674035, + "grad_norm": 0.22662605345249176, + "learning_rate": 4.134043559762143e-05, + "loss": 1.7532, + "step": 18531 + }, + { + "epoch": 5.688152240638429, + "grad_norm": 0.2281452864408493, + "learning_rate": 4.133554019463756e-05, + "loss": 1.769, + "step": 18532 + }, + { + "epoch": 5.688459177409453, + "grad_norm": 0.2303505390882492, + "learning_rate": 4.1330644877280955e-05, + "loss": 1.7176, + "step": 18533 + }, + { + "epoch": 5.688766114180479, + "grad_norm": 0.24411743879318237, + "learning_rate": 4.132574964560001e-05, + "loss": 1.7557, + "step": 18534 + }, + { + "epoch": 5.689073050951504, + "grad_norm": 0.2674088776111603, + "learning_rate": 4.13208544996431e-05, + "loss": 1.6997, + "step": 18535 + }, + { + "epoch": 5.689379987722529, + "grad_norm": 0.22232958674430847, + "learning_rate": 4.1315959439458565e-05, + "loss": 1.7731, + "step": 18536 + }, + { + "epoch": 5.689686924493555, + "grad_norm": 0.23894453048706055, + "learning_rate": 4.131106446509483e-05, + "loss": 1.7454, + "step": 18537 + }, + { + "epoch": 5.689993861264579, + "grad_norm": 0.19710026681423187, + "learning_rate": 4.1306169576600226e-05, + "loss": 1.6872, + "step": 18538 + }, + { + "epoch": 5.690300798035604, + "grad_norm": 0.1879546344280243, + "learning_rate": 4.130127477402318e-05, + "loss": 1.6929, + "step": 18539 + }, + { + "epoch": 5.69060773480663, + "grad_norm": 0.1964653730392456, + "learning_rate": 4.129638005741201e-05, + "loss": 1.7778, + "step": 18540 + }, + { + "epoch": 5.690914671577655, + "grad_norm": 0.20161493122577667, + "learning_rate": 4.129148542681513e-05, + "loss": 1.7388, + "step": 18541 + }, + { + "epoch": 5.69122160834868, + "grad_norm": 0.26742830872535706, + "learning_rate": 4.1286590882280886e-05, + "loss": 1.7472, + "step": 18542 + }, + { + "epoch": 5.691528545119706, + "grad_norm": 0.2613312900066376, + "learning_rate": 4.128169642385766e-05, + "loss": 1.7656, 
+ "step": 18543 + }, + { + "epoch": 5.69183548189073, + "grad_norm": 0.17979474365711212, + "learning_rate": 4.127680205159381e-05, + "loss": 1.6992, + "step": 18544 + }, + { + "epoch": 5.6921424186617555, + "grad_norm": 0.23575037717819214, + "learning_rate": 4.1271907765537745e-05, + "loss": 1.7399, + "step": 18545 + }, + { + "epoch": 5.692449355432781, + "grad_norm": 0.19461458921432495, + "learning_rate": 4.126701356573777e-05, + "loss": 1.709, + "step": 18546 + }, + { + "epoch": 5.692756292203806, + "grad_norm": 0.19715365767478943, + "learning_rate": 4.1262119452242306e-05, + "loss": 1.7634, + "step": 18547 + }, + { + "epoch": 5.6930632289748315, + "grad_norm": 0.21454904973506927, + "learning_rate": 4.125722542509969e-05, + "loss": 1.7663, + "step": 18548 + }, + { + "epoch": 5.693370165745856, + "grad_norm": 0.19884896278381348, + "learning_rate": 4.12523314843583e-05, + "loss": 1.7618, + "step": 18549 + }, + { + "epoch": 5.693677102516881, + "grad_norm": 0.2080020159482956, + "learning_rate": 4.124743763006648e-05, + "loss": 1.7379, + "step": 18550 + }, + { + "epoch": 5.693984039287907, + "grad_norm": 0.18780875205993652, + "learning_rate": 4.124254386227264e-05, + "loss": 1.7036, + "step": 18551 + }, + { + "epoch": 5.694290976058932, + "grad_norm": 0.2114439308643341, + "learning_rate": 4.123765018102512e-05, + "loss": 1.6873, + "step": 18552 + }, + { + "epoch": 5.694597912829957, + "grad_norm": 0.1712789535522461, + "learning_rate": 4.123275658637225e-05, + "loss": 1.6772, + "step": 18553 + }, + { + "epoch": 5.694904849600983, + "grad_norm": 0.2435859888792038, + "learning_rate": 4.122786307836243e-05, + "loss": 1.7946, + "step": 18554 + }, + { + "epoch": 5.695211786372007, + "grad_norm": 0.20587889850139618, + "learning_rate": 4.122296965704399e-05, + "loss": 1.7459, + "step": 18555 + }, + { + "epoch": 5.695518723143032, + "grad_norm": 0.2183443009853363, + "learning_rate": 4.121807632246534e-05, + "loss": 1.7036, + "step": 18556 + }, + { + "epoch": 
5.695825659914058, + "grad_norm": 0.19276869297027588, + "learning_rate": 4.121318307467478e-05, + "loss": 1.7371, + "step": 18557 + }, + { + "epoch": 5.696132596685083, + "grad_norm": 0.19815512001514435, + "learning_rate": 4.120828991372072e-05, + "loss": 1.7038, + "step": 18558 + }, + { + "epoch": 5.696439533456108, + "grad_norm": 0.18509675562381744, + "learning_rate": 4.120339683965146e-05, + "loss": 1.6936, + "step": 18559 + }, + { + "epoch": 5.696746470227133, + "grad_norm": 0.2296193689107895, + "learning_rate": 4.1198503852515416e-05, + "loss": 1.7626, + "step": 18560 + }, + { + "epoch": 5.697053406998158, + "grad_norm": 0.2064799964427948, + "learning_rate": 4.11936109523609e-05, + "loss": 1.7387, + "step": 18561 + }, + { + "epoch": 5.6973603437691835, + "grad_norm": 0.20171360671520233, + "learning_rate": 4.1188718139236296e-05, + "loss": 1.7372, + "step": 18562 + }, + { + "epoch": 5.697667280540209, + "grad_norm": 0.19421936571598053, + "learning_rate": 4.118382541318993e-05, + "loss": 1.7187, + "step": 18563 + }, + { + "epoch": 5.697974217311234, + "grad_norm": 0.22517532110214233, + "learning_rate": 4.117893277427018e-05, + "loss": 1.7503, + "step": 18564 + }, + { + "epoch": 5.6982811540822595, + "grad_norm": 0.2293393909931183, + "learning_rate": 4.1174040222525366e-05, + "loss": 1.7174, + "step": 18565 + }, + { + "epoch": 5.698588090853284, + "grad_norm": 0.24003073573112488, + "learning_rate": 4.1169147758003876e-05, + "loss": 1.7829, + "step": 18566 + }, + { + "epoch": 5.698895027624309, + "grad_norm": 0.21476133167743683, + "learning_rate": 4.1164255380754034e-05, + "loss": 1.7906, + "step": 18567 + }, + { + "epoch": 5.699201964395335, + "grad_norm": 0.21347576379776, + "learning_rate": 4.115936309082422e-05, + "loss": 1.6986, + "step": 18568 + }, + { + "epoch": 5.69950890116636, + "grad_norm": 0.22650402784347534, + "learning_rate": 4.115447088826276e-05, + "loss": 1.7949, + "step": 18569 + }, + { + "epoch": 5.699815837937384, + "grad_norm": 
0.25815197825431824, + "learning_rate": 4.114957877311799e-05, + "loss": 1.7499, + "step": 18570 + }, + { + "epoch": 5.70012277470841, + "grad_norm": 0.22644442319869995, + "learning_rate": 4.1144686745438265e-05, + "loss": 1.7689, + "step": 18571 + }, + { + "epoch": 5.700429711479435, + "grad_norm": 0.241188645362854, + "learning_rate": 4.113979480527194e-05, + "loss": 1.7341, + "step": 18572 + }, + { + "epoch": 5.7007366482504604, + "grad_norm": 0.20984862744808197, + "learning_rate": 4.1134902952667365e-05, + "loss": 1.7091, + "step": 18573 + }, + { + "epoch": 5.701043585021486, + "grad_norm": 0.25150877237319946, + "learning_rate": 4.113001118767286e-05, + "loss": 1.723, + "step": 18574 + }, + { + "epoch": 5.701350521792511, + "grad_norm": 0.21693028509616852, + "learning_rate": 4.1125119510336804e-05, + "loss": 1.7483, + "step": 18575 + }, + { + "epoch": 5.701657458563536, + "grad_norm": 0.2620212733745575, + "learning_rate": 4.11202279207075e-05, + "loss": 1.8159, + "step": 18576 + }, + { + "epoch": 5.701964395334561, + "grad_norm": 0.18722239136695862, + "learning_rate": 4.111533641883332e-05, + "loss": 1.7197, + "step": 18577 + }, + { + "epoch": 5.702271332105586, + "grad_norm": 0.21321091055870056, + "learning_rate": 4.111044500476258e-05, + "loss": 1.7408, + "step": 18578 + }, + { + "epoch": 5.702578268876612, + "grad_norm": 0.24459265172481537, + "learning_rate": 4.110555367854365e-05, + "loss": 1.8304, + "step": 18579 + }, + { + "epoch": 5.702885205647637, + "grad_norm": 0.24987100064754486, + "learning_rate": 4.110066244022483e-05, + "loss": 1.7051, + "step": 18580 + }, + { + "epoch": 5.703192142418661, + "grad_norm": 0.19059090316295624, + "learning_rate": 4.1095771289854506e-05, + "loss": 1.7489, + "step": 18581 + }, + { + "epoch": 5.703499079189687, + "grad_norm": 0.23020480573177338, + "learning_rate": 4.1090880227480966e-05, + "loss": 1.7101, + "step": 18582 + }, + { + "epoch": 5.703806015960712, + "grad_norm": 0.18733634054660797, + 
"learning_rate": 4.108598925315258e-05, + "loss": 1.7116, + "step": 18583 + }, + { + "epoch": 5.704112952731737, + "grad_norm": 0.1959095001220703, + "learning_rate": 4.108109836691766e-05, + "loss": 1.7283, + "step": 18584 + }, + { + "epoch": 5.704419889502763, + "grad_norm": 0.22685091197490692, + "learning_rate": 4.107620756882457e-05, + "loss": 1.7588, + "step": 18585 + }, + { + "epoch": 5.704726826273788, + "grad_norm": 0.1998603790998459, + "learning_rate": 4.107131685892164e-05, + "loss": 1.7071, + "step": 18586 + }, + { + "epoch": 5.7050337630448125, + "grad_norm": 0.2018733024597168, + "learning_rate": 4.106642623725717e-05, + "loss": 1.6782, + "step": 18587 + }, + { + "epoch": 5.705340699815838, + "grad_norm": 0.21826615929603577, + "learning_rate": 4.106153570387951e-05, + "loss": 1.736, + "step": 18588 + }, + { + "epoch": 5.705647636586863, + "grad_norm": 0.20197603106498718, + "learning_rate": 4.105664525883699e-05, + "loss": 1.6921, + "step": 18589 + }, + { + "epoch": 5.7059545733578885, + "grad_norm": 0.20943905413150787, + "learning_rate": 4.105175490217796e-05, + "loss": 1.665, + "step": 18590 + }, + { + "epoch": 5.706261510128914, + "grad_norm": 0.202060267329216, + "learning_rate": 4.104686463395071e-05, + "loss": 1.714, + "step": 18591 + }, + { + "epoch": 5.706568446899938, + "grad_norm": 0.220698744058609, + "learning_rate": 4.1041974454203623e-05, + "loss": 1.8076, + "step": 18592 + }, + { + "epoch": 5.706875383670964, + "grad_norm": 0.21536946296691895, + "learning_rate": 4.103708436298497e-05, + "loss": 1.6801, + "step": 18593 + }, + { + "epoch": 5.707182320441989, + "grad_norm": 0.21442468464374542, + "learning_rate": 4.103219436034311e-05, + "loss": 1.6921, + "step": 18594 + }, + { + "epoch": 5.707489257213014, + "grad_norm": 0.2047559767961502, + "learning_rate": 4.1027304446326356e-05, + "loss": 1.7861, + "step": 18595 + }, + { + "epoch": 5.70779619398404, + "grad_norm": 0.20304669439792633, + "learning_rate": 4.102241462098305e-05, + 
"loss": 1.7751, + "step": 18596 + }, + { + "epoch": 5.708103130755065, + "grad_norm": 0.18702620267868042, + "learning_rate": 4.101752488436149e-05, + "loss": 1.6951, + "step": 18597 + }, + { + "epoch": 5.708410067526089, + "grad_norm": 0.1821923404932022, + "learning_rate": 4.1012635236510034e-05, + "loss": 1.711, + "step": 18598 + }, + { + "epoch": 5.708717004297115, + "grad_norm": 0.19422096014022827, + "learning_rate": 4.100774567747696e-05, + "loss": 1.7202, + "step": 18599 + }, + { + "epoch": 5.70902394106814, + "grad_norm": 0.20800530910491943, + "learning_rate": 4.100285620731063e-05, + "loss": 1.7403, + "step": 18600 + }, + { + "epoch": 5.709330877839165, + "grad_norm": 0.221746027469635, + "learning_rate": 4.099796682605934e-05, + "loss": 1.7769, + "step": 18601 + }, + { + "epoch": 5.70963781461019, + "grad_norm": 0.19284313917160034, + "learning_rate": 4.099307753377143e-05, + "loss": 1.692, + "step": 18602 + }, + { + "epoch": 5.709944751381215, + "grad_norm": 0.17635129392147064, + "learning_rate": 4.0988188330495216e-05, + "loss": 1.7212, + "step": 18603 + }, + { + "epoch": 5.7102516881522405, + "grad_norm": 0.17728061974048615, + "learning_rate": 4.098329921627898e-05, + "loss": 1.7217, + "step": 18604 + }, + { + "epoch": 5.710558624923266, + "grad_norm": 0.19998152554035187, + "learning_rate": 4.097841019117108e-05, + "loss": 1.7583, + "step": 18605 + }, + { + "epoch": 5.710865561694291, + "grad_norm": 0.18840095400810242, + "learning_rate": 4.09735212552198e-05, + "loss": 1.7353, + "step": 18606 + }, + { + "epoch": 5.7111724984653165, + "grad_norm": 0.2528367042541504, + "learning_rate": 4.09686324084735e-05, + "loss": 1.7576, + "step": 18607 + }, + { + "epoch": 5.711479435236341, + "grad_norm": 0.27240338921546936, + "learning_rate": 4.096374365098045e-05, + "loss": 1.7303, + "step": 18608 + }, + { + "epoch": 5.711786372007366, + "grad_norm": 0.20187151432037354, + "learning_rate": 4.0958854982789e-05, + "loss": 1.7599, + "step": 18609 + }, + { + 
"epoch": 5.712093308778392, + "grad_norm": 0.24890528619289398, + "learning_rate": 4.095396640394742e-05, + "loss": 1.7737, + "step": 18610 + }, + { + "epoch": 5.712400245549417, + "grad_norm": 0.21524454653263092, + "learning_rate": 4.094907791450406e-05, + "loss": 1.7704, + "step": 18611 + }, + { + "epoch": 5.712707182320442, + "grad_norm": 0.20070379972457886, + "learning_rate": 4.094418951450721e-05, + "loss": 1.7358, + "step": 18612 + }, + { + "epoch": 5.713014119091467, + "grad_norm": 0.2252196967601776, + "learning_rate": 4.09393012040052e-05, + "loss": 1.7262, + "step": 18613 + }, + { + "epoch": 5.713321055862492, + "grad_norm": 0.19511987268924713, + "learning_rate": 4.093441298304631e-05, + "loss": 1.7146, + "step": 18614 + }, + { + "epoch": 5.713627992633517, + "grad_norm": 0.2047072798013687, + "learning_rate": 4.092952485167888e-05, + "loss": 1.7864, + "step": 18615 + }, + { + "epoch": 5.713934929404543, + "grad_norm": 0.21794871985912323, + "learning_rate": 4.092463680995119e-05, + "loss": 1.7759, + "step": 18616 + }, + { + "epoch": 5.714241866175568, + "grad_norm": 0.23863841593265533, + "learning_rate": 4.0919748857911566e-05, + "loss": 1.7207, + "step": 18617 + }, + { + "epoch": 5.714548802946593, + "grad_norm": 0.19706958532333374, + "learning_rate": 4.09148609956083e-05, + "loss": 1.7247, + "step": 18618 + }, + { + "epoch": 5.714855739717618, + "grad_norm": 0.23663771152496338, + "learning_rate": 4.090997322308971e-05, + "loss": 1.7929, + "step": 18619 + }, + { + "epoch": 5.715162676488643, + "grad_norm": 0.23079079389572144, + "learning_rate": 4.09050855404041e-05, + "loss": 1.763, + "step": 18620 + }, + { + "epoch": 5.7154696132596685, + "grad_norm": 0.23883379995822906, + "learning_rate": 4.0900197947599736e-05, + "loss": 1.7995, + "step": 18621 + }, + { + "epoch": 5.715776550030694, + "grad_norm": 0.2125123143196106, + "learning_rate": 4.0895310444724974e-05, + "loss": 1.8045, + "step": 18622 + }, + { + "epoch": 5.716083486801719, + 
"grad_norm": 0.21062424778938293, + "learning_rate": 4.0890423031828076e-05, + "loss": 1.7348, + "step": 18623 + }, + { + "epoch": 5.716390423572744, + "grad_norm": 0.24079614877700806, + "learning_rate": 4.088553570895737e-05, + "loss": 1.7462, + "step": 18624 + }, + { + "epoch": 5.716697360343769, + "grad_norm": 0.2120666354894638, + "learning_rate": 4.088064847616113e-05, + "loss": 1.7235, + "step": 18625 + }, + { + "epoch": 5.717004297114794, + "grad_norm": 0.19663050770759583, + "learning_rate": 4.0875761333487685e-05, + "loss": 1.6743, + "step": 18626 + }, + { + "epoch": 5.71731123388582, + "grad_norm": 0.24010685086250305, + "learning_rate": 4.0870874280985295e-05, + "loss": 1.6742, + "step": 18627 + }, + { + "epoch": 5.717618170656845, + "grad_norm": 0.22140294313430786, + "learning_rate": 4.086598731870228e-05, + "loss": 1.7601, + "step": 18628 + }, + { + "epoch": 5.71792510742787, + "grad_norm": 0.2876693308353424, + "learning_rate": 4.086110044668694e-05, + "loss": 1.7601, + "step": 18629 + }, + { + "epoch": 5.718232044198895, + "grad_norm": 0.3103853464126587, + "learning_rate": 4.085621366498756e-05, + "loss": 1.6824, + "step": 18630 + }, + { + "epoch": 5.71853898096992, + "grad_norm": 0.18194396793842316, + "learning_rate": 4.0851326973652424e-05, + "loss": 1.6976, + "step": 18631 + }, + { + "epoch": 5.718845917740945, + "grad_norm": 0.28400903940200806, + "learning_rate": 4.0846440372729854e-05, + "loss": 1.7352, + "step": 18632 + }, + { + "epoch": 5.719152854511971, + "grad_norm": 0.23753583431243896, + "learning_rate": 4.084155386226811e-05, + "loss": 1.7418, + "step": 18633 + }, + { + "epoch": 5.719459791282996, + "grad_norm": 0.215620756149292, + "learning_rate": 4.0836667442315514e-05, + "loss": 1.7602, + "step": 18634 + }, + { + "epoch": 5.7197667280540205, + "grad_norm": 0.21057941019535065, + "learning_rate": 4.083178111292034e-05, + "loss": 1.6818, + "step": 18635 + }, + { + "epoch": 5.720073664825046, + "grad_norm": 0.2169445902109146, + 
"learning_rate": 4.0826894874130863e-05, + "loss": 1.7942, + "step": 18636 + }, + { + "epoch": 5.720380601596071, + "grad_norm": 0.2779453992843628, + "learning_rate": 4.082200872599541e-05, + "loss": 1.7432, + "step": 18637 + }, + { + "epoch": 5.7206875383670965, + "grad_norm": 0.22556698322296143, + "learning_rate": 4.0817122668562224e-05, + "loss": 1.7748, + "step": 18638 + }, + { + "epoch": 5.720994475138122, + "grad_norm": 0.2570365071296692, + "learning_rate": 4.081223670187962e-05, + "loss": 1.7314, + "step": 18639 + }, + { + "epoch": 5.721301411909147, + "grad_norm": 0.266176700592041, + "learning_rate": 4.080735082599588e-05, + "loss": 1.689, + "step": 18640 + }, + { + "epoch": 5.721608348680172, + "grad_norm": 0.20190037786960602, + "learning_rate": 4.080246504095929e-05, + "loss": 1.7467, + "step": 18641 + }, + { + "epoch": 5.721915285451197, + "grad_norm": 0.2498215138912201, + "learning_rate": 4.079757934681813e-05, + "loss": 1.7063, + "step": 18642 + }, + { + "epoch": 5.722222222222222, + "grad_norm": 0.25594204664230347, + "learning_rate": 4.0792693743620695e-05, + "loss": 1.7096, + "step": 18643 + }, + { + "epoch": 5.722529158993248, + "grad_norm": 0.22674626111984253, + "learning_rate": 4.0787808231415233e-05, + "loss": 1.715, + "step": 18644 + }, + { + "epoch": 5.722836095764272, + "grad_norm": 0.267140656709671, + "learning_rate": 4.078292281025007e-05, + "loss": 1.7747, + "step": 18645 + }, + { + "epoch": 5.723143032535297, + "grad_norm": 0.21161147952079773, + "learning_rate": 4.077803748017345e-05, + "loss": 1.7312, + "step": 18646 + }, + { + "epoch": 5.723449969306323, + "grad_norm": 0.2580260634422302, + "learning_rate": 4.077315224123368e-05, + "loss": 1.7246, + "step": 18647 + }, + { + "epoch": 5.723756906077348, + "grad_norm": 0.23766927421092987, + "learning_rate": 4.076826709347902e-05, + "loss": 1.7147, + "step": 18648 + }, + { + "epoch": 5.724063842848373, + "grad_norm": 0.22764286398887634, + "learning_rate": 4.076338203695776e-05, + 
"loss": 1.7034, + "step": 18649 + }, + { + "epoch": 5.724370779619399, + "grad_norm": 0.28205159306526184, + "learning_rate": 4.075849707171817e-05, + "loss": 1.7472, + "step": 18650 + }, + { + "epoch": 5.724677716390423, + "grad_norm": 0.2091183066368103, + "learning_rate": 4.075361219780854e-05, + "loss": 1.7693, + "step": 18651 + }, + { + "epoch": 5.7249846531614486, + "grad_norm": 0.29513829946517944, + "learning_rate": 4.074872741527713e-05, + "loss": 1.7286, + "step": 18652 + }, + { + "epoch": 5.725291589932474, + "grad_norm": 0.226357102394104, + "learning_rate": 4.07438427241722e-05, + "loss": 1.7658, + "step": 18653 + }, + { + "epoch": 5.725598526703499, + "grad_norm": 0.23732580244541168, + "learning_rate": 4.073895812454207e-05, + "loss": 1.7591, + "step": 18654 + }, + { + "epoch": 5.725905463474525, + "grad_norm": 0.2835488021373749, + "learning_rate": 4.0734073616434956e-05, + "loss": 1.757, + "step": 18655 + }, + { + "epoch": 5.726212400245549, + "grad_norm": 0.1986306756734848, + "learning_rate": 4.0729189199899186e-05, + "loss": 1.714, + "step": 18656 + }, + { + "epoch": 5.726519337016574, + "grad_norm": 0.25071820616722107, + "learning_rate": 4.072430487498298e-05, + "loss": 1.7334, + "step": 18657 + }, + { + "epoch": 5.7268262737876, + "grad_norm": 0.19989889860153198, + "learning_rate": 4.0719420641734634e-05, + "loss": 1.7472, + "step": 18658 + }, + { + "epoch": 5.727133210558625, + "grad_norm": 0.30006101727485657, + "learning_rate": 4.071453650020241e-05, + "loss": 1.7846, + "step": 18659 + }, + { + "epoch": 5.72744014732965, + "grad_norm": 0.19856922328472137, + "learning_rate": 4.070965245043459e-05, + "loss": 1.6965, + "step": 18660 + }, + { + "epoch": 5.727747084100676, + "grad_norm": 0.20139823853969574, + "learning_rate": 4.070476849247941e-05, + "loss": 1.7265, + "step": 18661 + }, + { + "epoch": 5.7280540208717, + "grad_norm": 0.21507953107357025, + "learning_rate": 4.0699884626385184e-05, + "loss": 1.762, + "step": 18662 + }, + { + 
"epoch": 5.7283609576427255, + "grad_norm": 0.1885843127965927, + "learning_rate": 4.069500085220013e-05, + "loss": 1.6721, + "step": 18663 + }, + { + "epoch": 5.728667894413751, + "grad_norm": 0.2076897919178009, + "learning_rate": 4.069011716997253e-05, + "loss": 1.7399, + "step": 18664 + }, + { + "epoch": 5.728974831184776, + "grad_norm": 0.21482045948505402, + "learning_rate": 4.068523357975065e-05, + "loss": 1.7105, + "step": 18665 + }, + { + "epoch": 5.7292817679558015, + "grad_norm": 0.20438800752162933, + "learning_rate": 4.0680350081582765e-05, + "loss": 1.7408, + "step": 18666 + }, + { + "epoch": 5.729588704726826, + "grad_norm": 0.2137845903635025, + "learning_rate": 4.0675466675517104e-05, + "loss": 1.7814, + "step": 18667 + }, + { + "epoch": 5.729895641497851, + "grad_norm": 0.23009657859802246, + "learning_rate": 4.067058336160197e-05, + "loss": 1.7311, + "step": 18668 + }, + { + "epoch": 5.730202578268877, + "grad_norm": 0.20602397620677948, + "learning_rate": 4.066570013988558e-05, + "loss": 1.741, + "step": 18669 + }, + { + "epoch": 5.730509515039902, + "grad_norm": 0.24884814023971558, + "learning_rate": 4.066081701041621e-05, + "loss": 1.7222, + "step": 18670 + }, + { + "epoch": 5.730816451810927, + "grad_norm": 0.17906342446804047, + "learning_rate": 4.065593397324214e-05, + "loss": 1.6879, + "step": 18671 + }, + { + "epoch": 5.731123388581953, + "grad_norm": 0.20345427095890045, + "learning_rate": 4.0651051028411586e-05, + "loss": 1.7713, + "step": 18672 + }, + { + "epoch": 5.731430325352977, + "grad_norm": 0.21115002036094666, + "learning_rate": 4.0646168175972846e-05, + "loss": 1.7666, + "step": 18673 + }, + { + "epoch": 5.731737262124002, + "grad_norm": 0.22189734876155853, + "learning_rate": 4.064128541597413e-05, + "loss": 1.6989, + "step": 18674 + }, + { + "epoch": 5.732044198895028, + "grad_norm": 0.24036027491092682, + "learning_rate": 4.063640274846373e-05, + "loss": 1.707, + "step": 18675 + }, + { + "epoch": 5.732351135666053, + 
"grad_norm": 0.23091022670269012, + "learning_rate": 4.063152017348988e-05, + "loss": 1.7072, + "step": 18676 + }, + { + "epoch": 5.7326580724370775, + "grad_norm": 0.3142668306827545, + "learning_rate": 4.062663769110085e-05, + "loss": 1.7641, + "step": 18677 + }, + { + "epoch": 5.732965009208103, + "grad_norm": 0.2634848356246948, + "learning_rate": 4.0621755301344875e-05, + "loss": 1.7007, + "step": 18678 + }, + { + "epoch": 5.733271945979128, + "grad_norm": 0.21296904981136322, + "learning_rate": 4.061687300427022e-05, + "loss": 1.7201, + "step": 18679 + }, + { + "epoch": 5.7335788827501535, + "grad_norm": 0.24943144619464874, + "learning_rate": 4.0611990799925104e-05, + "loss": 1.7186, + "step": 18680 + }, + { + "epoch": 5.733885819521179, + "grad_norm": 0.2574152946472168, + "learning_rate": 4.060710868835781e-05, + "loss": 1.8671, + "step": 18681 + }, + { + "epoch": 5.734192756292204, + "grad_norm": 0.26023826003074646, + "learning_rate": 4.0602226669616564e-05, + "loss": 1.7618, + "step": 18682 + }, + { + "epoch": 5.734499693063229, + "grad_norm": 0.21078336238861084, + "learning_rate": 4.0597344743749645e-05, + "loss": 1.7548, + "step": 18683 + }, + { + "epoch": 5.734806629834254, + "grad_norm": 0.2195056676864624, + "learning_rate": 4.059246291080525e-05, + "loss": 1.6843, + "step": 18684 + }, + { + "epoch": 5.735113566605279, + "grad_norm": 0.20719893276691437, + "learning_rate": 4.058758117083168e-05, + "loss": 1.692, + "step": 18685 + }, + { + "epoch": 5.735420503376305, + "grad_norm": 0.23012077808380127, + "learning_rate": 4.058269952387713e-05, + "loss": 1.7072, + "step": 18686 + }, + { + "epoch": 5.73572744014733, + "grad_norm": 0.18598411977291107, + "learning_rate": 4.057781796998986e-05, + "loss": 1.6983, + "step": 18687 + }, + { + "epoch": 5.736034376918354, + "grad_norm": 0.20211926102638245, + "learning_rate": 4.057293650921813e-05, + "loss": 1.6818, + "step": 18688 + }, + { + "epoch": 5.73634131368938, + "grad_norm": 0.1957080215215683, + 
"learning_rate": 4.056805514161015e-05, + "loss": 1.7154, + "step": 18689 + }, + { + "epoch": 5.736648250460405, + "grad_norm": 0.23581798374652863, + "learning_rate": 4.0563173867214196e-05, + "loss": 1.7724, + "step": 18690 + }, + { + "epoch": 5.73695518723143, + "grad_norm": 0.22706671059131622, + "learning_rate": 4.055829268607847e-05, + "loss": 1.7387, + "step": 18691 + }, + { + "epoch": 5.737262124002456, + "grad_norm": 0.20050427317619324, + "learning_rate": 4.055341159825124e-05, + "loss": 1.7585, + "step": 18692 + }, + { + "epoch": 5.737569060773481, + "grad_norm": 0.18666231632232666, + "learning_rate": 4.054853060378072e-05, + "loss": 1.6996, + "step": 18693 + }, + { + "epoch": 5.7378759975445055, + "grad_norm": 0.23018911480903625, + "learning_rate": 4.0543649702715186e-05, + "loss": 1.7167, + "step": 18694 + }, + { + "epoch": 5.738182934315531, + "grad_norm": 0.21207039058208466, + "learning_rate": 4.053876889510282e-05, + "loss": 1.7539, + "step": 18695 + }, + { + "epoch": 5.738489871086556, + "grad_norm": 0.22042523324489594, + "learning_rate": 4.0533888180991915e-05, + "loss": 1.8145, + "step": 18696 + }, + { + "epoch": 5.7387968078575815, + "grad_norm": 0.20705139636993408, + "learning_rate": 4.0529007560430646e-05, + "loss": 1.7612, + "step": 18697 + }, + { + "epoch": 5.739103744628607, + "grad_norm": 0.20673857629299164, + "learning_rate": 4.052412703346729e-05, + "loss": 1.7338, + "step": 18698 + }, + { + "epoch": 5.739410681399631, + "grad_norm": 0.20742641389369965, + "learning_rate": 4.051924660015005e-05, + "loss": 1.7497, + "step": 18699 + }, + { + "epoch": 5.739717618170657, + "grad_norm": 0.22352617979049683, + "learning_rate": 4.05143662605272e-05, + "loss": 1.7568, + "step": 18700 + }, + { + "epoch": 5.740024554941682, + "grad_norm": 0.20306691527366638, + "learning_rate": 4.050948601464692e-05, + "loss": 1.7416, + "step": 18701 + }, + { + "epoch": 5.740331491712707, + "grad_norm": 0.22972522675991058, + "learning_rate": 
4.050460586255748e-05, + "loss": 1.7907, + "step": 18702 + }, + { + "epoch": 5.740638428483733, + "grad_norm": 0.2056068629026413, + "learning_rate": 4.0499725804307084e-05, + "loss": 1.7584, + "step": 18703 + }, + { + "epoch": 5.740945365254758, + "grad_norm": 0.2150508463382721, + "learning_rate": 4.049484583994395e-05, + "loss": 1.7695, + "step": 18704 + }, + { + "epoch": 5.741252302025782, + "grad_norm": 0.20274797081947327, + "learning_rate": 4.048996596951634e-05, + "loss": 1.7398, + "step": 18705 + }, + { + "epoch": 5.741559238796808, + "grad_norm": 0.20521290600299835, + "learning_rate": 4.0485086193072444e-05, + "loss": 1.7529, + "step": 18706 + }, + { + "epoch": 5.741866175567833, + "grad_norm": 0.22344307601451874, + "learning_rate": 4.0480206510660527e-05, + "loss": 1.6729, + "step": 18707 + }, + { + "epoch": 5.742173112338858, + "grad_norm": 0.20007841289043427, + "learning_rate": 4.047532692232876e-05, + "loss": 1.7004, + "step": 18708 + }, + { + "epoch": 5.742480049109884, + "grad_norm": 0.2455853819847107, + "learning_rate": 4.047044742812541e-05, + "loss": 1.7324, + "step": 18709 + }, + { + "epoch": 5.742786985880908, + "grad_norm": 0.29901546239852905, + "learning_rate": 4.046556802809867e-05, + "loss": 1.7138, + "step": 18710 + }, + { + "epoch": 5.7430939226519335, + "grad_norm": 0.19636842608451843, + "learning_rate": 4.04606887222968e-05, + "loss": 1.7098, + "step": 18711 + }, + { + "epoch": 5.743400859422959, + "grad_norm": 0.24916070699691772, + "learning_rate": 4.045580951076797e-05, + "loss": 1.7073, + "step": 18712 + }, + { + "epoch": 5.743707796193984, + "grad_norm": 0.2122841477394104, + "learning_rate": 4.0450930393560453e-05, + "loss": 1.7608, + "step": 18713 + }, + { + "epoch": 5.7440147329650095, + "grad_norm": 0.25119176506996155, + "learning_rate": 4.044605137072241e-05, + "loss": 1.7528, + "step": 18714 + }, + { + "epoch": 5.744321669736035, + "grad_norm": 0.2128097116947174, + "learning_rate": 4.0441172442302104e-05, + "loss": 
1.6834, + "step": 18715 + }, + { + "epoch": 5.744628606507059, + "grad_norm": 0.1771443784236908, + "learning_rate": 4.043629360834772e-05, + "loss": 1.6699, + "step": 18716 + }, + { + "epoch": 5.744935543278085, + "grad_norm": 0.2360549122095108, + "learning_rate": 4.043141486890751e-05, + "loss": 1.7704, + "step": 18717 + }, + { + "epoch": 5.74524248004911, + "grad_norm": 0.22453519701957703, + "learning_rate": 4.0426536224029645e-05, + "loss": 1.7305, + "step": 18718 + }, + { + "epoch": 5.745549416820135, + "grad_norm": 0.2170165628194809, + "learning_rate": 4.042165767376238e-05, + "loss": 1.7859, + "step": 18719 + }, + { + "epoch": 5.74585635359116, + "grad_norm": 0.233921617269516, + "learning_rate": 4.0416779218153896e-05, + "loss": 1.7622, + "step": 18720 + }, + { + "epoch": 5.746163290362185, + "grad_norm": 0.2698482871055603, + "learning_rate": 4.041190085725242e-05, + "loss": 1.7419, + "step": 18721 + }, + { + "epoch": 5.74647022713321, + "grad_norm": 0.28437280654907227, + "learning_rate": 4.0407022591106165e-05, + "loss": 1.7242, + "step": 18722 + }, + { + "epoch": 5.746777163904236, + "grad_norm": 0.2087356448173523, + "learning_rate": 4.040214441976332e-05, + "loss": 1.747, + "step": 18723 + }, + { + "epoch": 5.747084100675261, + "grad_norm": 0.2028181403875351, + "learning_rate": 4.039726634327213e-05, + "loss": 1.7843, + "step": 18724 + }, + { + "epoch": 5.747391037446286, + "grad_norm": 0.18513897061347961, + "learning_rate": 4.039238836168076e-05, + "loss": 1.692, + "step": 18725 + }, + { + "epoch": 5.747697974217311, + "grad_norm": 0.2308989316225052, + "learning_rate": 4.038751047503745e-05, + "loss": 1.6625, + "step": 18726 + }, + { + "epoch": 5.748004910988336, + "grad_norm": 0.23922030627727509, + "learning_rate": 4.0382632683390386e-05, + "loss": 1.7407, + "step": 18727 + }, + { + "epoch": 5.7483118477593615, + "grad_norm": 0.17225340008735657, + "learning_rate": 4.0377754986787806e-05, + "loss": 1.6888, + "step": 18728 + }, + { + "epoch": 
5.748618784530387, + "grad_norm": 0.1898551732301712, + "learning_rate": 4.037287738527786e-05, + "loss": 1.6931, + "step": 18729 + }, + { + "epoch": 5.748925721301412, + "grad_norm": 0.22900012135505676, + "learning_rate": 4.036799987890881e-05, + "loss": 1.751, + "step": 18730 + }, + { + "epoch": 5.749232658072437, + "grad_norm": 0.21106193959712982, + "learning_rate": 4.0363122467728815e-05, + "loss": 1.6919, + "step": 18731 + }, + { + "epoch": 5.749539594843462, + "grad_norm": 0.19944290816783905, + "learning_rate": 4.03582451517861e-05, + "loss": 1.7232, + "step": 18732 + }, + { + "epoch": 5.749846531614487, + "grad_norm": 0.1833256036043167, + "learning_rate": 4.035336793112885e-05, + "loss": 1.7199, + "step": 18733 + }, + { + "epoch": 5.750153468385513, + "grad_norm": 0.2596902847290039, + "learning_rate": 4.0348490805805287e-05, + "loss": 1.7386, + "step": 18734 + }, + { + "epoch": 5.750460405156538, + "grad_norm": 0.23708637058734894, + "learning_rate": 4.034361377586357e-05, + "loss": 1.7697, + "step": 18735 + }, + { + "epoch": 5.750767341927563, + "grad_norm": 0.20476554334163666, + "learning_rate": 4.033873684135195e-05, + "loss": 1.7804, + "step": 18736 + }, + { + "epoch": 5.751074278698588, + "grad_norm": 0.2625868320465088, + "learning_rate": 4.033386000231858e-05, + "loss": 1.7046, + "step": 18737 + }, + { + "epoch": 5.751381215469613, + "grad_norm": 0.23011820018291473, + "learning_rate": 4.032898325881166e-05, + "loss": 1.7758, + "step": 18738 + }, + { + "epoch": 5.7516881522406385, + "grad_norm": 0.23972748219966888, + "learning_rate": 4.032410661087943e-05, + "loss": 1.7165, + "step": 18739 + }, + { + "epoch": 5.751995089011664, + "grad_norm": 0.2241208404302597, + "learning_rate": 4.031923005857001e-05, + "loss": 1.713, + "step": 18740 + }, + { + "epoch": 5.752302025782689, + "grad_norm": 0.22316952049732208, + "learning_rate": 4.0314353601931665e-05, + "loss": 1.7655, + "step": 18741 + }, + { + "epoch": 5.752608962553714, + "grad_norm": 
0.2177707403898239, + "learning_rate": 4.030947724101253e-05, + "loss": 1.7517, + "step": 18742 + }, + { + "epoch": 5.752915899324739, + "grad_norm": 0.21731823682785034, + "learning_rate": 4.030460097586083e-05, + "loss": 1.718, + "step": 18743 + }, + { + "epoch": 5.753222836095764, + "grad_norm": 0.1700165718793869, + "learning_rate": 4.0299724806524744e-05, + "loss": 1.6536, + "step": 18744 + }, + { + "epoch": 5.75352977286679, + "grad_norm": 0.21920062601566315, + "learning_rate": 4.029484873305247e-05, + "loss": 1.7298, + "step": 18745 + }, + { + "epoch": 5.753836709637815, + "grad_norm": 0.22648905217647552, + "learning_rate": 4.028997275549218e-05, + "loss": 1.7878, + "step": 18746 + }, + { + "epoch": 5.75414364640884, + "grad_norm": 0.19443005323410034, + "learning_rate": 4.028509687389208e-05, + "loss": 1.7582, + "step": 18747 + }, + { + "epoch": 5.754450583179865, + "grad_norm": 0.21973860263824463, + "learning_rate": 4.028022108830034e-05, + "loss": 1.8215, + "step": 18748 + }, + { + "epoch": 5.75475751995089, + "grad_norm": 0.2215481847524643, + "learning_rate": 4.0275345398765155e-05, + "loss": 1.7092, + "step": 18749 + }, + { + "epoch": 5.755064456721915, + "grad_norm": 0.18789733946323395, + "learning_rate": 4.0270469805334696e-05, + "loss": 1.7089, + "step": 18750 + }, + { + "epoch": 5.755371393492941, + "grad_norm": 0.2423657774925232, + "learning_rate": 4.0265594308057175e-05, + "loss": 1.7412, + "step": 18751 + }, + { + "epoch": 5.755678330263965, + "grad_norm": 0.22020475566387177, + "learning_rate": 4.026071890698074e-05, + "loss": 1.7644, + "step": 18752 + }, + { + "epoch": 5.7559852670349905, + "grad_norm": 0.31772032380104065, + "learning_rate": 4.025584360215361e-05, + "loss": 1.7326, + "step": 18753 + }, + { + "epoch": 5.756292203806016, + "grad_norm": 0.23786257207393646, + "learning_rate": 4.025096839362393e-05, + "loss": 1.7652, + "step": 18754 + }, + { + "epoch": 5.756599140577041, + "grad_norm": 0.24288083612918854, + "learning_rate": 
4.024609328143989e-05, + "loss": 1.6797, + "step": 18755 + }, + { + "epoch": 5.7569060773480665, + "grad_norm": 0.30519670248031616, + "learning_rate": 4.024121826564969e-05, + "loss": 1.7442, + "step": 18756 + }, + { + "epoch": 5.757213014119092, + "grad_norm": 0.218281090259552, + "learning_rate": 4.023634334630147e-05, + "loss": 1.7498, + "step": 18757 + }, + { + "epoch": 5.757519950890116, + "grad_norm": 0.215846985578537, + "learning_rate": 4.023146852344345e-05, + "loss": 1.7728, + "step": 18758 + }, + { + "epoch": 5.757826887661142, + "grad_norm": 0.2883944511413574, + "learning_rate": 4.022659379712376e-05, + "loss": 1.8098, + "step": 18759 + }, + { + "epoch": 5.758133824432167, + "grad_norm": 0.25141629576683044, + "learning_rate": 4.022171916739062e-05, + "loss": 1.6574, + "step": 18760 + }, + { + "epoch": 5.758440761203192, + "grad_norm": 0.22118757665157318, + "learning_rate": 4.021684463429216e-05, + "loss": 1.7542, + "step": 18761 + }, + { + "epoch": 5.758747697974218, + "grad_norm": 0.2437646985054016, + "learning_rate": 4.02119701978766e-05, + "loss": 1.7182, + "step": 18762 + }, + { + "epoch": 5.759054634745242, + "grad_norm": 0.24247203767299652, + "learning_rate": 4.020709585819206e-05, + "loss": 1.7134, + "step": 18763 + }, + { + "epoch": 5.759361571516267, + "grad_norm": 0.208528533577919, + "learning_rate": 4.020222161528677e-05, + "loss": 1.6966, + "step": 18764 + }, + { + "epoch": 5.759668508287293, + "grad_norm": 0.19645826518535614, + "learning_rate": 4.0197347469208843e-05, + "loss": 1.7261, + "step": 18765 + }, + { + "epoch": 5.759975445058318, + "grad_norm": 0.20066291093826294, + "learning_rate": 4.019247342000648e-05, + "loss": 1.7197, + "step": 18766 + }, + { + "epoch": 5.760282381829343, + "grad_norm": 0.25344669818878174, + "learning_rate": 4.0187599467727845e-05, + "loss": 1.7957, + "step": 18767 + }, + { + "epoch": 5.760589318600369, + "grad_norm": 0.1917620301246643, + "learning_rate": 4.018272561242111e-05, + "loss": 1.6868, + 
"step": 18768 + }, + { + "epoch": 5.760896255371393, + "grad_norm": 0.21996566653251648, + "learning_rate": 4.0177851854134424e-05, + "loss": 1.7128, + "step": 18769 + }, + { + "epoch": 5.7612031921424185, + "grad_norm": 0.23226283490657806, + "learning_rate": 4.017297819291598e-05, + "loss": 1.7079, + "step": 18770 + }, + { + "epoch": 5.761510128913444, + "grad_norm": 0.30606213212013245, + "learning_rate": 4.016810462881391e-05, + "loss": 1.8087, + "step": 18771 + }, + { + "epoch": 5.761817065684469, + "grad_norm": 0.2171698361635208, + "learning_rate": 4.016323116187639e-05, + "loss": 1.7377, + "step": 18772 + }, + { + "epoch": 5.7621240024554945, + "grad_norm": 0.24234412610530853, + "learning_rate": 4.01583577921516e-05, + "loss": 1.734, + "step": 18773 + }, + { + "epoch": 5.762430939226519, + "grad_norm": 0.2648961544036865, + "learning_rate": 4.015348451968767e-05, + "loss": 1.7423, + "step": 18774 + }, + { + "epoch": 5.762737875997544, + "grad_norm": 0.18316571414470673, + "learning_rate": 4.01486113445328e-05, + "loss": 1.6708, + "step": 18775 + }, + { + "epoch": 5.76304481276857, + "grad_norm": 0.241583451628685, + "learning_rate": 4.0143738266735104e-05, + "loss": 1.708, + "step": 18776 + }, + { + "epoch": 5.763351749539595, + "grad_norm": 0.2268480360507965, + "learning_rate": 4.0138865286342775e-05, + "loss": 1.7106, + "step": 18777 + }, + { + "epoch": 5.76365868631062, + "grad_norm": 0.2038748860359192, + "learning_rate": 4.0133992403403944e-05, + "loss": 1.7349, + "step": 18778 + }, + { + "epoch": 5.763965623081646, + "grad_norm": 0.24422483146190643, + "learning_rate": 4.0129119617966805e-05, + "loss": 1.659, + "step": 18779 + }, + { + "epoch": 5.76427255985267, + "grad_norm": 0.19925715029239655, + "learning_rate": 4.0124246930079476e-05, + "loss": 1.6983, + "step": 18780 + }, + { + "epoch": 5.764579496623695, + "grad_norm": 0.29671359062194824, + "learning_rate": 4.0119374339790136e-05, + "loss": 1.7188, + "step": 18781 + }, + { + "epoch": 
5.764886433394721, + "grad_norm": 0.2752140760421753, + "learning_rate": 4.011450184714692e-05, + "loss": 1.738, + "step": 18782 + }, + { + "epoch": 5.765193370165746, + "grad_norm": 0.2112676352262497, + "learning_rate": 4.0109629452198e-05, + "loss": 1.7529, + "step": 18783 + }, + { + "epoch": 5.765500306936771, + "grad_norm": 0.2091330885887146, + "learning_rate": 4.010475715499151e-05, + "loss": 1.6771, + "step": 18784 + }, + { + "epoch": 5.765807243707796, + "grad_norm": 0.26556238532066345, + "learning_rate": 4.009988495557562e-05, + "loss": 1.7721, + "step": 18785 + }, + { + "epoch": 5.766114180478821, + "grad_norm": 0.20728638768196106, + "learning_rate": 4.009501285399846e-05, + "loss": 1.6893, + "step": 18786 + }, + { + "epoch": 5.7664211172498465, + "grad_norm": 0.213730126619339, + "learning_rate": 4.00901408503082e-05, + "loss": 1.704, + "step": 18787 + }, + { + "epoch": 5.766728054020872, + "grad_norm": 0.21422363817691803, + "learning_rate": 4.0085268944552975e-05, + "loss": 1.7571, + "step": 18788 + }, + { + "epoch": 5.767034990791897, + "grad_norm": 0.20936815440654755, + "learning_rate": 4.0080397136780915e-05, + "loss": 1.7423, + "step": 18789 + }, + { + "epoch": 5.7673419275629225, + "grad_norm": 0.26223674416542053, + "learning_rate": 4.007552542704021e-05, + "loss": 1.7687, + "step": 18790 + }, + { + "epoch": 5.767648864333947, + "grad_norm": 0.3524645268917084, + "learning_rate": 4.0070653815378954e-05, + "loss": 1.7754, + "step": 18791 + }, + { + "epoch": 5.767955801104972, + "grad_norm": 0.20238324999809265, + "learning_rate": 4.006578230184534e-05, + "loss": 1.7043, + "step": 18792 + }, + { + "epoch": 5.768262737875998, + "grad_norm": 0.2739984393119812, + "learning_rate": 4.006091088648747e-05, + "loss": 1.7596, + "step": 18793 + }, + { + "epoch": 5.768569674647023, + "grad_norm": 0.29209306836128235, + "learning_rate": 4.0056039569353515e-05, + "loss": 1.6857, + "step": 18794 + }, + { + "epoch": 5.768876611418047, + "grad_norm": 
0.21838447451591492, + "learning_rate": 4.005116835049161e-05, + "loss": 1.7531, + "step": 18795 + }, + { + "epoch": 5.769183548189073, + "grad_norm": 0.21940091252326965, + "learning_rate": 4.0046297229949884e-05, + "loss": 1.7363, + "step": 18796 + }, + { + "epoch": 5.769490484960098, + "grad_norm": 0.22679758071899414, + "learning_rate": 4.004142620777647e-05, + "loss": 1.7586, + "step": 18797 + }, + { + "epoch": 5.769797421731123, + "grad_norm": 0.23782022297382355, + "learning_rate": 4.003655528401954e-05, + "loss": 1.7154, + "step": 18798 + }, + { + "epoch": 5.770104358502149, + "grad_norm": 0.20452092587947845, + "learning_rate": 4.0031684458727194e-05, + "loss": 1.7078, + "step": 18799 + }, + { + "epoch": 5.770411295273174, + "grad_norm": 0.22733618319034576, + "learning_rate": 4.0026813731947594e-05, + "loss": 1.6989, + "step": 18800 + }, + { + "epoch": 5.7707182320441985, + "grad_norm": 0.2322154939174652, + "learning_rate": 4.002194310372886e-05, + "loss": 1.7508, + "step": 18801 + }, + { + "epoch": 5.771025168815224, + "grad_norm": 0.24573352932929993, + "learning_rate": 4.001707257411914e-05, + "loss": 1.7245, + "step": 18802 + }, + { + "epoch": 5.771332105586249, + "grad_norm": 0.19692079722881317, + "learning_rate": 4.001220214316655e-05, + "loss": 1.7116, + "step": 18803 + }, + { + "epoch": 5.7716390423572745, + "grad_norm": 0.20525199174880981, + "learning_rate": 4.000733181091925e-05, + "loss": 1.7503, + "step": 18804 + }, + { + "epoch": 5.7719459791283, + "grad_norm": 0.2097626030445099, + "learning_rate": 4.0002461577425344e-05, + "loss": 1.8204, + "step": 18805 + }, + { + "epoch": 5.772252915899324, + "grad_norm": 0.23059608042240143, + "learning_rate": 3.9997591442732975e-05, + "loss": 1.7747, + "step": 18806 + }, + { + "epoch": 5.77255985267035, + "grad_norm": 0.22085745632648468, + "learning_rate": 3.9992721406890265e-05, + "loss": 1.7579, + "step": 18807 + }, + { + "epoch": 5.772866789441375, + "grad_norm": 0.21529869735240936, + 
"learning_rate": 3.9987851469945334e-05, + "loss": 1.711, + "step": 18808 + }, + { + "epoch": 5.7731737262124, + "grad_norm": 0.20563572645187378, + "learning_rate": 3.998298163194636e-05, + "loss": 1.761, + "step": 18809 + }, + { + "epoch": 5.773480662983426, + "grad_norm": 0.2081122100353241, + "learning_rate": 3.9978111892941394e-05, + "loss": 1.7112, + "step": 18810 + }, + { + "epoch": 5.773787599754451, + "grad_norm": 0.2373751550912857, + "learning_rate": 3.9973242252978635e-05, + "loss": 1.7726, + "step": 18811 + }, + { + "epoch": 5.774094536525475, + "grad_norm": 0.2742944359779358, + "learning_rate": 3.996837271210615e-05, + "loss": 1.7743, + "step": 18812 + }, + { + "epoch": 5.774401473296501, + "grad_norm": 0.20724992454051971, + "learning_rate": 3.996350327037208e-05, + "loss": 1.7052, + "step": 18813 + }, + { + "epoch": 5.774708410067526, + "grad_norm": 0.22324968874454498, + "learning_rate": 3.995863392782456e-05, + "loss": 1.7865, + "step": 18814 + }, + { + "epoch": 5.7750153468385514, + "grad_norm": 0.22314245998859406, + "learning_rate": 3.995376468451172e-05, + "loss": 1.7705, + "step": 18815 + }, + { + "epoch": 5.775322283609577, + "grad_norm": 0.20793841779232025, + "learning_rate": 3.994889554048165e-05, + "loss": 1.739, + "step": 18816 + }, + { + "epoch": 5.775629220380601, + "grad_norm": 0.20117145776748657, + "learning_rate": 3.994402649578249e-05, + "loss": 1.7256, + "step": 18817 + }, + { + "epoch": 5.775936157151627, + "grad_norm": 0.24406170845031738, + "learning_rate": 3.993915755046235e-05, + "loss": 1.8015, + "step": 18818 + }, + { + "epoch": 5.776243093922652, + "grad_norm": 0.20912545919418335, + "learning_rate": 3.993428870456935e-05, + "loss": 1.7038, + "step": 18819 + }, + { + "epoch": 5.776550030693677, + "grad_norm": 0.2587272822856903, + "learning_rate": 3.992941995815162e-05, + "loss": 1.7918, + "step": 18820 + }, + { + "epoch": 5.776856967464703, + "grad_norm": 0.2996658980846405, + "learning_rate": 3.9924551311257266e-05, + 
"loss": 1.7513, + "step": 18821 + }, + { + "epoch": 5.777163904235728, + "grad_norm": 0.24603547155857086, + "learning_rate": 3.991968276393441e-05, + "loss": 1.7329, + "step": 18822 + }, + { + "epoch": 5.777470841006752, + "grad_norm": 0.2321038693189621, + "learning_rate": 3.991481431623113e-05, + "loss": 1.7406, + "step": 18823 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.3397100269794464, + "learning_rate": 3.990994596819558e-05, + "loss": 1.8129, + "step": 18824 + }, + { + "epoch": 5.778084714548803, + "grad_norm": 0.2807735800743103, + "learning_rate": 3.990507771987584e-05, + "loss": 1.7579, + "step": 18825 + }, + { + "epoch": 5.778391651319828, + "grad_norm": 0.1952899694442749, + "learning_rate": 3.990020957132007e-05, + "loss": 1.7153, + "step": 18826 + }, + { + "epoch": 5.778698588090853, + "grad_norm": 0.28998714685440063, + "learning_rate": 3.989534152257632e-05, + "loss": 1.7844, + "step": 18827 + }, + { + "epoch": 5.779005524861878, + "grad_norm": 0.20929136872291565, + "learning_rate": 3.989047357369275e-05, + "loss": 1.7499, + "step": 18828 + }, + { + "epoch": 5.7793124616329035, + "grad_norm": 0.31144043803215027, + "learning_rate": 3.9885605724717436e-05, + "loss": 1.7745, + "step": 18829 + }, + { + "epoch": 5.779619398403929, + "grad_norm": 0.22598792612552643, + "learning_rate": 3.988073797569849e-05, + "loss": 1.7226, + "step": 18830 + }, + { + "epoch": 5.779926335174954, + "grad_norm": 0.1971752643585205, + "learning_rate": 3.987587032668402e-05, + "loss": 1.7033, + "step": 18831 + }, + { + "epoch": 5.7802332719459795, + "grad_norm": 0.221087247133255, + "learning_rate": 3.9871002777722156e-05, + "loss": 1.7281, + "step": 18832 + }, + { + "epoch": 5.780540208717004, + "grad_norm": 0.21678583323955536, + "learning_rate": 3.986613532886095e-05, + "loss": 1.7207, + "step": 18833 + }, + { + "epoch": 5.780847145488029, + "grad_norm": 0.2511122226715088, + "learning_rate": 3.9861267980148566e-05, + "loss": 1.7091, + "step": 18834 + }, + 
{ + "epoch": 5.781154082259055, + "grad_norm": 0.2883855104446411, + "learning_rate": 3.985640073163304e-05, + "loss": 1.7963, + "step": 18835 + }, + { + "epoch": 5.78146101903008, + "grad_norm": 0.21786242723464966, + "learning_rate": 3.985153358336253e-05, + "loss": 1.6883, + "step": 18836 + }, + { + "epoch": 5.781767955801105, + "grad_norm": 0.18529155850410461, + "learning_rate": 3.98466665353851e-05, + "loss": 1.7194, + "step": 18837 + }, + { + "epoch": 5.78207489257213, + "grad_norm": 0.20535743236541748, + "learning_rate": 3.984179958774888e-05, + "loss": 1.6943, + "step": 18838 + }, + { + "epoch": 5.782381829343155, + "grad_norm": 0.19377392530441284, + "learning_rate": 3.983693274050195e-05, + "loss": 1.6732, + "step": 18839 + }, + { + "epoch": 5.78268876611418, + "grad_norm": 0.22373615205287933, + "learning_rate": 3.983206599369239e-05, + "loss": 1.7668, + "step": 18840 + }, + { + "epoch": 5.782995702885206, + "grad_norm": 0.2132388800382614, + "learning_rate": 3.982719934736832e-05, + "loss": 1.7155, + "step": 18841 + }, + { + "epoch": 5.783302639656231, + "grad_norm": 0.24871744215488434, + "learning_rate": 3.982233280157782e-05, + "loss": 1.7232, + "step": 18842 + }, + { + "epoch": 5.783609576427256, + "grad_norm": 0.1861848086118698, + "learning_rate": 3.981746635636902e-05, + "loss": 1.707, + "step": 18843 + }, + { + "epoch": 5.783916513198281, + "grad_norm": 0.21882779896259308, + "learning_rate": 3.981260001178995e-05, + "loss": 1.7165, + "step": 18844 + }, + { + "epoch": 5.784223449969306, + "grad_norm": 0.22144648432731628, + "learning_rate": 3.980773376788877e-05, + "loss": 1.7799, + "step": 18845 + }, + { + "epoch": 5.7845303867403315, + "grad_norm": 0.210894376039505, + "learning_rate": 3.980286762471351e-05, + "loss": 1.7539, + "step": 18846 + }, + { + "epoch": 5.784837323511357, + "grad_norm": 0.20435640215873718, + "learning_rate": 3.9798001582312305e-05, + "loss": 1.6736, + "step": 18847 + }, + { + "epoch": 5.785144260282382, + 
"grad_norm": 0.18998762965202332, + "learning_rate": 3.979313564073322e-05, + "loss": 1.7045, + "step": 18848 + }, + { + "epoch": 5.785451197053407, + "grad_norm": 0.19869361817836761, + "learning_rate": 3.978826980002437e-05, + "loss": 1.7444, + "step": 18849 + }, + { + "epoch": 5.785758133824432, + "grad_norm": 0.2175174504518509, + "learning_rate": 3.97834040602338e-05, + "loss": 1.7565, + "step": 18850 + }, + { + "epoch": 5.786065070595457, + "grad_norm": 0.22726793587207794, + "learning_rate": 3.977853842140964e-05, + "loss": 1.713, + "step": 18851 + }, + { + "epoch": 5.786372007366483, + "grad_norm": 0.26518720388412476, + "learning_rate": 3.9773672883599934e-05, + "loss": 1.6892, + "step": 18852 + }, + { + "epoch": 5.786678944137508, + "grad_norm": 0.20721858739852905, + "learning_rate": 3.97688074468528e-05, + "loss": 1.724, + "step": 18853 + }, + { + "epoch": 5.786985880908533, + "grad_norm": 0.22739483416080475, + "learning_rate": 3.976394211121629e-05, + "loss": 1.762, + "step": 18854 + }, + { + "epoch": 5.787292817679558, + "grad_norm": 0.21918894350528717, + "learning_rate": 3.975907687673853e-05, + "loss": 1.6812, + "step": 18855 + }, + { + "epoch": 5.787599754450583, + "grad_norm": 0.20931273698806763, + "learning_rate": 3.9754211743467574e-05, + "loss": 1.6874, + "step": 18856 + }, + { + "epoch": 5.787906691221608, + "grad_norm": 0.2015041708946228, + "learning_rate": 3.974934671145148e-05, + "loss": 1.7248, + "step": 18857 + }, + { + "epoch": 5.788213627992634, + "grad_norm": 0.21632663905620575, + "learning_rate": 3.974448178073836e-05, + "loss": 1.7313, + "step": 18858 + }, + { + "epoch": 5.788520564763659, + "grad_norm": 0.18995213508605957, + "learning_rate": 3.973961695137627e-05, + "loss": 1.6761, + "step": 18859 + }, + { + "epoch": 5.7888275015346835, + "grad_norm": 0.18678395450115204, + "learning_rate": 3.973475222341333e-05, + "loss": 1.7082, + "step": 18860 + }, + { + "epoch": 5.789134438305709, + "grad_norm": 0.1889343559741974, + 
"learning_rate": 3.972988759689756e-05, + "loss": 1.7296, + "step": 18861 + }, + { + "epoch": 5.789441375076734, + "grad_norm": 0.20196790993213654, + "learning_rate": 3.9725023071877074e-05, + "loss": 1.6876, + "step": 18862 + }, + { + "epoch": 5.7897483118477595, + "grad_norm": 0.198349729180336, + "learning_rate": 3.972015864839992e-05, + "loss": 1.6826, + "step": 18863 + }, + { + "epoch": 5.790055248618785, + "grad_norm": 0.21323837339878082, + "learning_rate": 3.9715294326514185e-05, + "loss": 1.7444, + "step": 18864 + }, + { + "epoch": 5.79036218538981, + "grad_norm": 0.18581731617450714, + "learning_rate": 3.9710430106267934e-05, + "loss": 1.7731, + "step": 18865 + }, + { + "epoch": 5.790669122160835, + "grad_norm": 0.21925146877765656, + "learning_rate": 3.970556598770927e-05, + "loss": 1.7505, + "step": 18866 + }, + { + "epoch": 5.79097605893186, + "grad_norm": 0.20773115754127502, + "learning_rate": 3.970070197088621e-05, + "loss": 1.7408, + "step": 18867 + }, + { + "epoch": 5.791282995702885, + "grad_norm": 0.1805189698934555, + "learning_rate": 3.9695838055846865e-05, + "loss": 1.6871, + "step": 18868 + }, + { + "epoch": 5.791589932473911, + "grad_norm": 0.24685314297676086, + "learning_rate": 3.969097424263928e-05, + "loss": 1.7186, + "step": 18869 + }, + { + "epoch": 5.791896869244935, + "grad_norm": 0.18801769614219666, + "learning_rate": 3.9686110531311526e-05, + "loss": 1.7196, + "step": 18870 + }, + { + "epoch": 5.79220380601596, + "grad_norm": 0.22717779874801636, + "learning_rate": 3.968124692191168e-05, + "loss": 1.7309, + "step": 18871 + }, + { + "epoch": 5.792510742786986, + "grad_norm": 0.23058642446994781, + "learning_rate": 3.9676383414487806e-05, + "loss": 1.6993, + "step": 18872 + }, + { + "epoch": 5.792817679558011, + "grad_norm": 0.24307532608509064, + "learning_rate": 3.967152000908796e-05, + "loss": 1.6986, + "step": 18873 + }, + { + "epoch": 5.793124616329036, + "grad_norm": 0.3032459318637848, + "learning_rate": 
3.9666656705760195e-05, + "loss": 1.677, + "step": 18874 + }, + { + "epoch": 5.793431553100062, + "grad_norm": 0.22669538855552673, + "learning_rate": 3.966179350455259e-05, + "loss": 1.7361, + "step": 18875 + }, + { + "epoch": 5.793738489871086, + "grad_norm": 0.27729150652885437, + "learning_rate": 3.96569304055132e-05, + "loss": 1.746, + "step": 18876 + }, + { + "epoch": 5.7940454266421115, + "grad_norm": 0.3422098755836487, + "learning_rate": 3.96520674086901e-05, + "loss": 1.783, + "step": 18877 + }, + { + "epoch": 5.794352363413137, + "grad_norm": 0.2114052176475525, + "learning_rate": 3.964720451413131e-05, + "loss": 1.7127, + "step": 18878 + }, + { + "epoch": 5.794659300184162, + "grad_norm": 0.22928549349308014, + "learning_rate": 3.964234172188494e-05, + "loss": 1.6579, + "step": 18879 + }, + { + "epoch": 5.7949662369551875, + "grad_norm": 0.24813635647296906, + "learning_rate": 3.9637479031999e-05, + "loss": 1.728, + "step": 18880 + }, + { + "epoch": 5.795273173726212, + "grad_norm": 0.19779744744300842, + "learning_rate": 3.963261644452158e-05, + "loss": 1.7338, + "step": 18881 + }, + { + "epoch": 5.795580110497237, + "grad_norm": 0.2424263060092926, + "learning_rate": 3.96277539595007e-05, + "loss": 1.7762, + "step": 18882 + }, + { + "epoch": 5.795887047268263, + "grad_norm": 0.24621224403381348, + "learning_rate": 3.9622891576984456e-05, + "loss": 1.7746, + "step": 18883 + }, + { + "epoch": 5.796193984039288, + "grad_norm": 0.1973372846841812, + "learning_rate": 3.961802929702086e-05, + "loss": 1.7243, + "step": 18884 + }, + { + "epoch": 5.796500920810313, + "grad_norm": 0.22170570492744446, + "learning_rate": 3.961316711965801e-05, + "loss": 1.764, + "step": 18885 + }, + { + "epoch": 5.796807857581339, + "grad_norm": 0.22319282591342926, + "learning_rate": 3.9608305044943906e-05, + "loss": 1.6795, + "step": 18886 + }, + { + "epoch": 5.797114794352363, + "grad_norm": 0.20000022649765015, + "learning_rate": 3.9603443072926635e-05, + "loss": 1.7587, + 
"step": 18887 + }, + { + "epoch": 5.797421731123388, + "grad_norm": 0.25041815638542175, + "learning_rate": 3.959858120365424e-05, + "loss": 1.7631, + "step": 18888 + }, + { + "epoch": 5.797728667894414, + "grad_norm": 0.23383729159832, + "learning_rate": 3.959371943717474e-05, + "loss": 1.741, + "step": 18889 + }, + { + "epoch": 5.798035604665439, + "grad_norm": 0.18609663844108582, + "learning_rate": 3.958885777353623e-05, + "loss": 1.6981, + "step": 18890 + }, + { + "epoch": 5.798342541436464, + "grad_norm": 0.29523593187332153, + "learning_rate": 3.9583996212786706e-05, + "loss": 1.8018, + "step": 18891 + }, + { + "epoch": 5.798649478207489, + "grad_norm": 0.20356589555740356, + "learning_rate": 3.9579134754974244e-05, + "loss": 1.7157, + "step": 18892 + }, + { + "epoch": 5.798956414978514, + "grad_norm": 0.2901862561702728, + "learning_rate": 3.957427340014688e-05, + "loss": 1.7249, + "step": 18893 + }, + { + "epoch": 5.7992633517495396, + "grad_norm": 0.24768278002738953, + "learning_rate": 3.956941214835267e-05, + "loss": 1.6894, + "step": 18894 + }, + { + "epoch": 5.799570288520565, + "grad_norm": 0.2417999804019928, + "learning_rate": 3.956455099963962e-05, + "loss": 1.7203, + "step": 18895 + }, + { + "epoch": 5.79987722529159, + "grad_norm": 0.2889639437198639, + "learning_rate": 3.9559689954055814e-05, + "loss": 1.7531, + "step": 18896 + }, + { + "epoch": 5.800184162062616, + "grad_norm": 0.21204611659049988, + "learning_rate": 3.955482901164926e-05, + "loss": 1.7521, + "step": 18897 + }, + { + "epoch": 5.80049109883364, + "grad_norm": 0.2961438298225403, + "learning_rate": 3.954996817246801e-05, + "loss": 1.8102, + "step": 18898 + }, + { + "epoch": 5.800798035604665, + "grad_norm": 0.36562761664390564, + "learning_rate": 3.9545107436560084e-05, + "loss": 1.6722, + "step": 18899 + }, + { + "epoch": 5.801104972375691, + "grad_norm": 0.22423696517944336, + "learning_rate": 3.954024680397357e-05, + "loss": 1.7101, + "step": 18900 + }, + { + "epoch": 
5.801411909146716, + "grad_norm": 0.3122335970401764, + "learning_rate": 3.953538627475644e-05, + "loss": 1.7314, + "step": 18901 + }, + { + "epoch": 5.8017188459177405, + "grad_norm": 0.39004257321357727, + "learning_rate": 3.953052584895677e-05, + "loss": 1.762, + "step": 18902 + }, + { + "epoch": 5.802025782688766, + "grad_norm": 0.1827487200498581, + "learning_rate": 3.952566552662256e-05, + "loss": 1.6935, + "step": 18903 + }, + { + "epoch": 5.802332719459791, + "grad_norm": 0.3025164306163788, + "learning_rate": 3.952080530780188e-05, + "loss": 1.7448, + "step": 18904 + }, + { + "epoch": 5.8026396562308165, + "grad_norm": 0.2313300520181656, + "learning_rate": 3.9515945192542754e-05, + "loss": 1.7686, + "step": 18905 + }, + { + "epoch": 5.802946593001842, + "grad_norm": 0.3501042425632477, + "learning_rate": 3.9511085180893184e-05, + "loss": 1.775, + "step": 18906 + }, + { + "epoch": 5.803253529772867, + "grad_norm": 0.4111124873161316, + "learning_rate": 3.950622527290123e-05, + "loss": 1.7561, + "step": 18907 + }, + { + "epoch": 5.803560466543892, + "grad_norm": 0.20877736806869507, + "learning_rate": 3.950136546861489e-05, + "loss": 1.7356, + "step": 18908 + }, + { + "epoch": 5.803867403314917, + "grad_norm": 0.33404025435447693, + "learning_rate": 3.949650576808222e-05, + "loss": 1.7289, + "step": 18909 + }, + { + "epoch": 5.804174340085942, + "grad_norm": 0.2183927446603775, + "learning_rate": 3.9491646171351234e-05, + "loss": 1.7136, + "step": 18910 + }, + { + "epoch": 5.804481276856968, + "grad_norm": 0.27149543166160583, + "learning_rate": 3.948678667846997e-05, + "loss": 1.7516, + "step": 18911 + }, + { + "epoch": 5.804788213627993, + "grad_norm": 0.2369886338710785, + "learning_rate": 3.948192728948643e-05, + "loss": 1.6767, + "step": 18912 + }, + { + "epoch": 5.805095150399017, + "grad_norm": 0.20671069622039795, + "learning_rate": 3.947706800444867e-05, + "loss": 1.7831, + "step": 18913 + }, + { + "epoch": 5.805402087170043, + "grad_norm": 
0.23622260987758636, + "learning_rate": 3.9472208823404665e-05, + "loss": 1.7121, + "step": 18914 + }, + { + "epoch": 5.805709023941068, + "grad_norm": 0.21099595725536346, + "learning_rate": 3.946734974640247e-05, + "loss": 1.7137, + "step": 18915 + }, + { + "epoch": 5.806015960712093, + "grad_norm": 0.2205580472946167, + "learning_rate": 3.9462490773490094e-05, + "loss": 1.713, + "step": 18916 + }, + { + "epoch": 5.806322897483119, + "grad_norm": 0.20183326303958893, + "learning_rate": 3.9457631904715584e-05, + "loss": 1.7316, + "step": 18917 + }, + { + "epoch": 5.806629834254144, + "grad_norm": 0.27381497621536255, + "learning_rate": 3.9452773140126906e-05, + "loss": 1.7577, + "step": 18918 + }, + { + "epoch": 5.8069367710251685, + "grad_norm": 0.29962384700775146, + "learning_rate": 3.944791447977214e-05, + "loss": 1.7579, + "step": 18919 + }, + { + "epoch": 5.807243707796194, + "grad_norm": 0.22385326027870178, + "learning_rate": 3.944305592369923e-05, + "loss": 1.7795, + "step": 18920 + }, + { + "epoch": 5.807550644567219, + "grad_norm": 0.2954902648925781, + "learning_rate": 3.943819747195625e-05, + "loss": 1.6655, + "step": 18921 + }, + { + "epoch": 5.8078575813382445, + "grad_norm": 0.18947024643421173, + "learning_rate": 3.94333391245912e-05, + "loss": 1.6803, + "step": 18922 + }, + { + "epoch": 5.80816451810927, + "grad_norm": 0.26797959208488464, + "learning_rate": 3.942848088165206e-05, + "loss": 1.7671, + "step": 18923 + }, + { + "epoch": 5.808471454880294, + "grad_norm": 0.23453201353549957, + "learning_rate": 3.94236227431869e-05, + "loss": 1.7472, + "step": 18924 + }, + { + "epoch": 5.80877839165132, + "grad_norm": 0.24471673369407654, + "learning_rate": 3.941876470924367e-05, + "loss": 1.7482, + "step": 18925 + }, + { + "epoch": 5.809085328422345, + "grad_norm": 0.22249098122119904, + "learning_rate": 3.9413906779870426e-05, + "loss": 1.6794, + "step": 18926 + }, + { + "epoch": 5.80939226519337, + "grad_norm": 0.1985001564025879, + 
"learning_rate": 3.9409048955115144e-05, + "loss": 1.7278, + "step": 18927 + }, + { + "epoch": 5.809699201964396, + "grad_norm": 0.22482000291347504, + "learning_rate": 3.940419123502587e-05, + "loss": 1.7658, + "step": 18928 + }, + { + "epoch": 5.810006138735421, + "grad_norm": 0.18513578176498413, + "learning_rate": 3.939933361965057e-05, + "loss": 1.7154, + "step": 18929 + }, + { + "epoch": 5.810313075506445, + "grad_norm": 0.1984710991382599, + "learning_rate": 3.939447610903729e-05, + "loss": 1.7324, + "step": 18930 + }, + { + "epoch": 5.810620012277471, + "grad_norm": 0.26089081168174744, + "learning_rate": 3.938961870323399e-05, + "loss": 1.774, + "step": 18931 + }, + { + "epoch": 5.810926949048496, + "grad_norm": 0.2059585452079773, + "learning_rate": 3.9384761402288706e-05, + "loss": 1.7059, + "step": 18932 + }, + { + "epoch": 5.811233885819521, + "grad_norm": 0.1887979656457901, + "learning_rate": 3.937990420624942e-05, + "loss": 1.6829, + "step": 18933 + }, + { + "epoch": 5.811540822590547, + "grad_norm": 0.2589145600795746, + "learning_rate": 3.937504711516417e-05, + "loss": 1.7301, + "step": 18934 + }, + { + "epoch": 5.811847759361571, + "grad_norm": 0.209516704082489, + "learning_rate": 3.9370190129080907e-05, + "loss": 1.7716, + "step": 18935 + }, + { + "epoch": 5.8121546961325965, + "grad_norm": 0.3321632146835327, + "learning_rate": 3.936533324804768e-05, + "loss": 1.7754, + "step": 18936 + }, + { + "epoch": 5.812461632903622, + "grad_norm": 0.236944317817688, + "learning_rate": 3.9360476472112446e-05, + "loss": 1.7546, + "step": 18937 + }, + { + "epoch": 5.812768569674647, + "grad_norm": 0.29667431116104126, + "learning_rate": 3.9355619801323226e-05, + "loss": 1.7712, + "step": 18938 + }, + { + "epoch": 5.8130755064456725, + "grad_norm": 0.3071129620075226, + "learning_rate": 3.935076323572802e-05, + "loss": 1.7351, + "step": 18939 + }, + { + "epoch": 5.813382443216698, + "grad_norm": 0.22747032344341278, + "learning_rate": 3.934590677537479e-05, 
+ "loss": 1.7788, + "step": 18940 + }, + { + "epoch": 5.813689379987722, + "grad_norm": 0.2575854957103729, + "learning_rate": 3.934105042031158e-05, + "loss": 1.705, + "step": 18941 + }, + { + "epoch": 5.813996316758748, + "grad_norm": 0.2561504542827606, + "learning_rate": 3.9336194170586325e-05, + "loss": 1.7309, + "step": 18942 + }, + { + "epoch": 5.814303253529773, + "grad_norm": 0.21570482850074768, + "learning_rate": 3.933133802624707e-05, + "loss": 1.7408, + "step": 18943 + }, + { + "epoch": 5.814610190300798, + "grad_norm": 0.29227179288864136, + "learning_rate": 3.932648198734177e-05, + "loss": 1.7415, + "step": 18944 + }, + { + "epoch": 5.814917127071823, + "grad_norm": 0.17847758531570435, + "learning_rate": 3.9321626053918456e-05, + "loss": 1.7926, + "step": 18945 + }, + { + "epoch": 5.815224063842848, + "grad_norm": 0.24604015052318573, + "learning_rate": 3.931677022602507e-05, + "loss": 1.7519, + "step": 18946 + }, + { + "epoch": 5.815531000613873, + "grad_norm": 0.23843185603618622, + "learning_rate": 3.931191450370965e-05, + "loss": 1.7206, + "step": 18947 + }, + { + "epoch": 5.815837937384899, + "grad_norm": 0.23431400954723358, + "learning_rate": 3.9307058887020126e-05, + "loss": 1.7743, + "step": 18948 + }, + { + "epoch": 5.816144874155924, + "grad_norm": 0.23685097694396973, + "learning_rate": 3.9302203376004525e-05, + "loss": 1.7485, + "step": 18949 + }, + { + "epoch": 5.816451810926949, + "grad_norm": 0.2129819542169571, + "learning_rate": 3.929734797071082e-05, + "loss": 1.6897, + "step": 18950 + }, + { + "epoch": 5.816758747697974, + "grad_norm": 0.24736030399799347, + "learning_rate": 3.9292492671187e-05, + "loss": 1.7292, + "step": 18951 + }, + { + "epoch": 5.817065684468999, + "grad_norm": 0.28659793734550476, + "learning_rate": 3.9287637477481025e-05, + "loss": 1.6772, + "step": 18952 + }, + { + "epoch": 5.8173726212400245, + "grad_norm": 0.22304075956344604, + "learning_rate": 3.928278238964092e-05, + "loss": 1.7991, + "step": 18953 + 
}, + { + "epoch": 5.81767955801105, + "grad_norm": 0.25354304909706116, + "learning_rate": 3.927792740771462e-05, + "loss": 1.7407, + "step": 18954 + }, + { + "epoch": 5.817986494782075, + "grad_norm": 0.3014552593231201, + "learning_rate": 3.927307253175014e-05, + "loss": 1.7714, + "step": 18955 + }, + { + "epoch": 5.8182934315531, + "grad_norm": 0.20537856221199036, + "learning_rate": 3.926821776179545e-05, + "loss": 1.6992, + "step": 18956 + }, + { + "epoch": 5.818600368324125, + "grad_norm": 0.29656440019607544, + "learning_rate": 3.92633630978985e-05, + "loss": 1.7476, + "step": 18957 + }, + { + "epoch": 5.81890730509515, + "grad_norm": 0.20956869423389435, + "learning_rate": 3.925850854010732e-05, + "loss": 1.808, + "step": 18958 + }, + { + "epoch": 5.819214241866176, + "grad_norm": 0.29395633935928345, + "learning_rate": 3.925365408846983e-05, + "loss": 1.7787, + "step": 18959 + }, + { + "epoch": 5.819521178637201, + "grad_norm": 0.31101030111312866, + "learning_rate": 3.9248799743034025e-05, + "loss": 1.7685, + "step": 18960 + }, + { + "epoch": 5.819828115408226, + "grad_norm": 0.2109794020652771, + "learning_rate": 3.9243945503847894e-05, + "loss": 1.7307, + "step": 18961 + }, + { + "epoch": 5.820135052179251, + "grad_norm": 0.2503393292427063, + "learning_rate": 3.9239091370959405e-05, + "loss": 1.763, + "step": 18962 + }, + { + "epoch": 5.820441988950276, + "grad_norm": 0.21757015585899353, + "learning_rate": 3.92342373444165e-05, + "loss": 1.7862, + "step": 18963 + }, + { + "epoch": 5.820748925721301, + "grad_norm": 0.22108088433742523, + "learning_rate": 3.9229383424267197e-05, + "loss": 1.6845, + "step": 18964 + }, + { + "epoch": 5.821055862492327, + "grad_norm": 0.20059655606746674, + "learning_rate": 3.922452961055941e-05, + "loss": 1.7523, + "step": 18965 + }, + { + "epoch": 5.821362799263352, + "grad_norm": 0.22009585797786713, + "learning_rate": 3.921967590334117e-05, + "loss": 1.7802, + "step": 18966 + }, + { + "epoch": 5.8216697360343765, + 
"grad_norm": 0.22554142773151398, + "learning_rate": 3.9214822302660386e-05, + "loss": 1.7911, + "step": 18967 + }, + { + "epoch": 5.821976672805402, + "grad_norm": 0.23434770107269287, + "learning_rate": 3.920996880856506e-05, + "loss": 1.6755, + "step": 18968 + }, + { + "epoch": 5.822283609576427, + "grad_norm": 0.2162926346063614, + "learning_rate": 3.920511542110314e-05, + "loss": 1.7145, + "step": 18969 + }, + { + "epoch": 5.8225905463474525, + "grad_norm": 0.18654806911945343, + "learning_rate": 3.9200262140322616e-05, + "loss": 1.7076, + "step": 18970 + }, + { + "epoch": 5.822897483118478, + "grad_norm": 0.22357499599456787, + "learning_rate": 3.9195408966271404e-05, + "loss": 1.791, + "step": 18971 + }, + { + "epoch": 5.823204419889503, + "grad_norm": 0.21073313057422638, + "learning_rate": 3.919055589899752e-05, + "loss": 1.7976, + "step": 18972 + }, + { + "epoch": 5.823511356660528, + "grad_norm": 0.21481956541538239, + "learning_rate": 3.9185702938548886e-05, + "loss": 1.7468, + "step": 18973 + }, + { + "epoch": 5.823818293431553, + "grad_norm": 0.22051872313022614, + "learning_rate": 3.9180850084973464e-05, + "loss": 1.7201, + "step": 18974 + }, + { + "epoch": 5.824125230202578, + "grad_norm": 0.24410493671894073, + "learning_rate": 3.917599733831924e-05, + "loss": 1.7774, + "step": 18975 + }, + { + "epoch": 5.824432166973604, + "grad_norm": 0.19711458683013916, + "learning_rate": 3.917114469863414e-05, + "loss": 1.7907, + "step": 18976 + }, + { + "epoch": 5.824739103744628, + "grad_norm": 0.2045203000307083, + "learning_rate": 3.9166292165966155e-05, + "loss": 1.7105, + "step": 18977 + }, + { + "epoch": 5.8250460405156534, + "grad_norm": 0.21570880711078644, + "learning_rate": 3.9161439740363196e-05, + "loss": 1.7312, + "step": 18978 + }, + { + "epoch": 5.825352977286679, + "grad_norm": 0.21203923225402832, + "learning_rate": 3.915658742187325e-05, + "loss": 1.7869, + "step": 18979 + }, + { + "epoch": 5.825659914057704, + "grad_norm": 
0.26233312487602234, + "learning_rate": 3.915173521054426e-05, + "loss": 1.7453, + "step": 18980 + }, + { + "epoch": 5.8259668508287294, + "grad_norm": 0.23792949318885803, + "learning_rate": 3.91468831064242e-05, + "loss": 1.6886, + "step": 18981 + }, + { + "epoch": 5.826273787599755, + "grad_norm": 0.20325250923633575, + "learning_rate": 3.914203110956098e-05, + "loss": 1.7538, + "step": 18982 + }, + { + "epoch": 5.82658072437078, + "grad_norm": 0.28146329522132874, + "learning_rate": 3.9137179220002596e-05, + "loss": 1.7674, + "step": 18983 + }, + { + "epoch": 5.826887661141805, + "grad_norm": 0.2319503277540207, + "learning_rate": 3.9132327437796946e-05, + "loss": 1.7864, + "step": 18984 + }, + { + "epoch": 5.82719459791283, + "grad_norm": 0.22653794288635254, + "learning_rate": 3.9127475762992025e-05, + "loss": 1.7424, + "step": 18985 + }, + { + "epoch": 5.827501534683855, + "grad_norm": 0.26855236291885376, + "learning_rate": 3.912262419563574e-05, + "loss": 1.762, + "step": 18986 + }, + { + "epoch": 5.827808471454881, + "grad_norm": 0.18356221914291382, + "learning_rate": 3.9117772735776095e-05, + "loss": 1.7199, + "step": 18987 + }, + { + "epoch": 5.828115408225905, + "grad_norm": 0.2802455425262451, + "learning_rate": 3.911292138346096e-05, + "loss": 1.7142, + "step": 18988 + }, + { + "epoch": 5.82842234499693, + "grad_norm": 0.2638777494430542, + "learning_rate": 3.910807013873835e-05, + "loss": 1.6759, + "step": 18989 + }, + { + "epoch": 5.828729281767956, + "grad_norm": 0.18397162854671478, + "learning_rate": 3.910321900165615e-05, + "loss": 1.693, + "step": 18990 + }, + { + "epoch": 5.829036218538981, + "grad_norm": 0.20967607200145721, + "learning_rate": 3.909836797226233e-05, + "loss": 1.6908, + "step": 18991 + }, + { + "epoch": 5.829343155310006, + "grad_norm": 0.21123014390468597, + "learning_rate": 3.909351705060485e-05, + "loss": 1.7875, + "step": 18992 + }, + { + "epoch": 5.829650092081032, + "grad_norm": 0.1988777220249176, + "learning_rate": 
3.90886662367316e-05, + "loss": 1.7254, + "step": 18993 + }, + { + "epoch": 5.829957028852056, + "grad_norm": 0.17793473601341248, + "learning_rate": 3.9083815530690564e-05, + "loss": 1.7233, + "step": 18994 + }, + { + "epoch": 5.8302639656230815, + "grad_norm": 0.2289644330739975, + "learning_rate": 3.9078964932529645e-05, + "loss": 1.7739, + "step": 18995 + }, + { + "epoch": 5.830570902394107, + "grad_norm": 0.18145552277565002, + "learning_rate": 3.9074114442296804e-05, + "loss": 1.6989, + "step": 18996 + }, + { + "epoch": 5.830877839165132, + "grad_norm": 0.1941588670015335, + "learning_rate": 3.9069264060039956e-05, + "loss": 1.6981, + "step": 18997 + }, + { + "epoch": 5.8311847759361575, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.9064413785807075e-05, + "loss": 1.7163, + "step": 18998 + }, + { + "epoch": 5.831491712707182, + "grad_norm": 0.19494447112083435, + "learning_rate": 3.905956361964604e-05, + "loss": 1.7481, + "step": 18999 + }, + { + "epoch": 5.831798649478207, + "grad_norm": 0.2127624899148941, + "learning_rate": 3.9054713561604826e-05, + "loss": 1.7494, + "step": 19000 + }, + { + "epoch": 5.832105586249233, + "grad_norm": 0.20107653737068176, + "learning_rate": 3.9049863611731334e-05, + "loss": 1.7483, + "step": 19001 + }, + { + "epoch": 5.832412523020258, + "grad_norm": 0.22574639320373535, + "learning_rate": 3.904501377007352e-05, + "loss": 1.8184, + "step": 19002 + }, + { + "epoch": 5.832719459791283, + "grad_norm": 0.20027579367160797, + "learning_rate": 3.9040164036679285e-05, + "loss": 1.6995, + "step": 19003 + }, + { + "epoch": 5.833026396562309, + "grad_norm": 0.21599887311458588, + "learning_rate": 3.90353144115966e-05, + "loss": 1.7487, + "step": 19004 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.21122781932353973, + "learning_rate": 3.9030464894873334e-05, + "loss": 1.7332, + "step": 19005 + }, + { + "epoch": 5.833640270104358, + "grad_norm": 0.19006453454494476, + "learning_rate": 3.902561548655747e-05, + 
"loss": 1.688, + "step": 19006 + }, + { + "epoch": 5.833947206875384, + "grad_norm": 0.22979344427585602, + "learning_rate": 3.9020766186696895e-05, + "loss": 1.7495, + "step": 19007 + }, + { + "epoch": 5.834254143646409, + "grad_norm": 0.18405365943908691, + "learning_rate": 3.901591699533953e-05, + "loss": 1.7395, + "step": 19008 + }, + { + "epoch": 5.834561080417434, + "grad_norm": 0.26198676228523254, + "learning_rate": 3.901106791253334e-05, + "loss": 1.8286, + "step": 19009 + }, + { + "epoch": 5.834868017188459, + "grad_norm": 0.2535797357559204, + "learning_rate": 3.900621893832619e-05, + "loss": 1.757, + "step": 19010 + }, + { + "epoch": 5.835174953959484, + "grad_norm": 0.24599581956863403, + "learning_rate": 3.900137007276605e-05, + "loss": 1.7266, + "step": 19011 + }, + { + "epoch": 5.8354818907305095, + "grad_norm": 0.25688427686691284, + "learning_rate": 3.8996521315900805e-05, + "loss": 1.7255, + "step": 19012 + }, + { + "epoch": 5.835788827501535, + "grad_norm": 0.24668128788471222, + "learning_rate": 3.8991672667778385e-05, + "loss": 1.737, + "step": 19013 + }, + { + "epoch": 5.83609576427256, + "grad_norm": 0.28365740180015564, + "learning_rate": 3.8986824128446695e-05, + "loss": 1.7129, + "step": 19014 + }, + { + "epoch": 5.8364027010435855, + "grad_norm": 0.2543952465057373, + "learning_rate": 3.89819756979537e-05, + "loss": 1.7249, + "step": 19015 + }, + { + "epoch": 5.83670963781461, + "grad_norm": 0.2868666350841522, + "learning_rate": 3.8977127376347245e-05, + "loss": 1.6985, + "step": 19016 + }, + { + "epoch": 5.837016574585635, + "grad_norm": 0.3818367123603821, + "learning_rate": 3.897227916367531e-05, + "loss": 1.6954, + "step": 19017 + }, + { + "epoch": 5.837323511356661, + "grad_norm": 0.20922113955020905, + "learning_rate": 3.896743105998574e-05, + "loss": 1.7571, + "step": 19018 + }, + { + "epoch": 5.837630448127686, + "grad_norm": 0.3669843375682831, + "learning_rate": 3.89625830653265e-05, + "loss": 1.8041, + "step": 19019 + }, + { 
+ "epoch": 5.83793738489871, + "grad_norm": 0.2889872193336487, + "learning_rate": 3.895773517974548e-05, + "loss": 1.7775, + "step": 19020 + }, + { + "epoch": 5.838244321669736, + "grad_norm": 0.22619491815567017, + "learning_rate": 3.89528874032906e-05, + "loss": 1.7019, + "step": 19021 + }, + { + "epoch": 5.838551258440761, + "grad_norm": 0.4169046878814697, + "learning_rate": 3.894803973600976e-05, + "loss": 1.8282, + "step": 19022 + }, + { + "epoch": 5.838858195211786, + "grad_norm": 0.2567043900489807, + "learning_rate": 3.894319217795087e-05, + "loss": 1.733, + "step": 19023 + }, + { + "epoch": 5.839165131982812, + "grad_norm": 0.2435060739517212, + "learning_rate": 3.8938344729161834e-05, + "loss": 1.7208, + "step": 19024 + }, + { + "epoch": 5.839472068753837, + "grad_norm": 0.2941838204860687, + "learning_rate": 3.893349738969055e-05, + "loss": 1.7202, + "step": 19025 + }, + { + "epoch": 5.8397790055248615, + "grad_norm": 0.23542317748069763, + "learning_rate": 3.892865015958495e-05, + "loss": 1.7571, + "step": 19026 + }, + { + "epoch": 5.840085942295887, + "grad_norm": 0.3248259723186493, + "learning_rate": 3.8923803038892897e-05, + "loss": 1.7118, + "step": 19027 + }, + { + "epoch": 5.840392879066912, + "grad_norm": 0.24359026551246643, + "learning_rate": 3.891895602766234e-05, + "loss": 1.8126, + "step": 19028 + }, + { + "epoch": 5.8406998158379375, + "grad_norm": 0.3053695559501648, + "learning_rate": 3.8914109125941126e-05, + "loss": 1.6632, + "step": 19029 + }, + { + "epoch": 5.841006752608963, + "grad_norm": 0.3194943368434906, + "learning_rate": 3.8909262333777195e-05, + "loss": 1.8432, + "step": 19030 + }, + { + "epoch": 5.841313689379987, + "grad_norm": 0.23532693088054657, + "learning_rate": 3.8904415651218426e-05, + "loss": 1.716, + "step": 19031 + }, + { + "epoch": 5.841620626151013, + "grad_norm": 0.2941347062587738, + "learning_rate": 3.889956907831275e-05, + "loss": 1.7737, + "step": 19032 + }, + { + "epoch": 5.841927562922038, + 
"grad_norm": 0.2265428602695465, + "learning_rate": 3.889472261510801e-05, + "loss": 1.7111, + "step": 19033 + }, + { + "epoch": 5.842234499693063, + "grad_norm": 0.3023710548877716, + "learning_rate": 3.888987626165216e-05, + "loss": 1.7845, + "step": 19034 + }, + { + "epoch": 5.842541436464089, + "grad_norm": 0.2855348289012909, + "learning_rate": 3.8885030017993026e-05, + "loss": 1.8009, + "step": 19035 + }, + { + "epoch": 5.842848373235114, + "grad_norm": 0.23046357929706573, + "learning_rate": 3.888018388417857e-05, + "loss": 1.8225, + "step": 19036 + }, + { + "epoch": 5.843155310006138, + "grad_norm": 0.23732341825962067, + "learning_rate": 3.8875337860256634e-05, + "loss": 1.7542, + "step": 19037 + }, + { + "epoch": 5.843462246777164, + "grad_norm": 0.18987004458904266, + "learning_rate": 3.887049194627516e-05, + "loss": 1.7327, + "step": 19038 + }, + { + "epoch": 5.843769183548189, + "grad_norm": 0.21539908647537231, + "learning_rate": 3.8865646142281974e-05, + "loss": 1.715, + "step": 19039 + }, + { + "epoch": 5.844076120319214, + "grad_norm": 0.2991954982280731, + "learning_rate": 3.8860800448325024e-05, + "loss": 1.7728, + "step": 19040 + }, + { + "epoch": 5.84438305709024, + "grad_norm": 0.19066409766674042, + "learning_rate": 3.885595486445216e-05, + "loss": 1.7128, + "step": 19041 + }, + { + "epoch": 5.844689993861264, + "grad_norm": 0.21643762290477753, + "learning_rate": 3.885110939071128e-05, + "loss": 1.7584, + "step": 19042 + }, + { + "epoch": 5.8449969306322895, + "grad_norm": 0.20227304100990295, + "learning_rate": 3.884626402715029e-05, + "loss": 1.7053, + "step": 19043 + }, + { + "epoch": 5.845303867403315, + "grad_norm": 0.20429107546806335, + "learning_rate": 3.884141877381703e-05, + "loss": 1.761, + "step": 19044 + }, + { + "epoch": 5.84561080417434, + "grad_norm": 0.1873873621225357, + "learning_rate": 3.8836573630759435e-05, + "loss": 1.7251, + "step": 19045 + }, + { + "epoch": 5.8459177409453655, + "grad_norm": 0.18025323748588562, + 
"learning_rate": 3.883172859802534e-05, + "loss": 1.6696, + "step": 19046 + }, + { + "epoch": 5.846224677716391, + "grad_norm": 0.22011777758598328, + "learning_rate": 3.8826883675662664e-05, + "loss": 1.7148, + "step": 19047 + }, + { + "epoch": 5.846531614487415, + "grad_norm": 0.17827673256397247, + "learning_rate": 3.882203886371925e-05, + "loss": 1.69, + "step": 19048 + }, + { + "epoch": 5.846838551258441, + "grad_norm": 0.200766459107399, + "learning_rate": 3.881719416224303e-05, + "loss": 1.7773, + "step": 19049 + }, + { + "epoch": 5.847145488029466, + "grad_norm": 0.22770950198173523, + "learning_rate": 3.8812349571281834e-05, + "loss": 1.7156, + "step": 19050 + }, + { + "epoch": 5.847452424800491, + "grad_norm": 0.19483895599842072, + "learning_rate": 3.880750509088357e-05, + "loss": 1.7304, + "step": 19051 + }, + { + "epoch": 5.847759361571516, + "grad_norm": 0.1988774836063385, + "learning_rate": 3.8802660721096086e-05, + "loss": 1.7428, + "step": 19052 + }, + { + "epoch": 5.848066298342541, + "grad_norm": 0.19881510734558105, + "learning_rate": 3.879781646196727e-05, + "loss": 1.7268, + "step": 19053 + }, + { + "epoch": 5.848373235113566, + "grad_norm": 0.21257543563842773, + "learning_rate": 3.8792972313545e-05, + "loss": 1.7532, + "step": 19054 + }, + { + "epoch": 5.848680171884592, + "grad_norm": 0.21000613272190094, + "learning_rate": 3.878812827587716e-05, + "loss": 1.7782, + "step": 19055 + }, + { + "epoch": 5.848987108655617, + "grad_norm": 0.2136746346950531, + "learning_rate": 3.878328434901159e-05, + "loss": 1.6875, + "step": 19056 + }, + { + "epoch": 5.849294045426642, + "grad_norm": 0.20291505753993988, + "learning_rate": 3.8778440532996204e-05, + "loss": 1.74, + "step": 19057 + }, + { + "epoch": 5.849600982197668, + "grad_norm": 0.22568103671073914, + "learning_rate": 3.877359682787883e-05, + "loss": 1.7074, + "step": 19058 + }, + { + "epoch": 5.849907918968692, + "grad_norm": 0.24398963153362274, + "learning_rate": 3.876875323370734e-05, + 
"loss": 1.6825, + "step": 19059 + }, + { + "epoch": 5.850214855739718, + "grad_norm": 0.19684453308582306, + "learning_rate": 3.876390975052964e-05, + "loss": 1.7143, + "step": 19060 + }, + { + "epoch": 5.850521792510743, + "grad_norm": 0.2786783277988434, + "learning_rate": 3.8759066378393544e-05, + "loss": 1.8339, + "step": 19061 + }, + { + "epoch": 5.850828729281768, + "grad_norm": 0.1977633833885193, + "learning_rate": 3.875422311734697e-05, + "loss": 1.742, + "step": 19062 + }, + { + "epoch": 5.851135666052793, + "grad_norm": 0.260643869638443, + "learning_rate": 3.874937996743772e-05, + "loss": 1.7728, + "step": 19063 + }, + { + "epoch": 5.851442602823818, + "grad_norm": 0.20998433232307434, + "learning_rate": 3.874453692871372e-05, + "loss": 1.768, + "step": 19064 + }, + { + "epoch": 5.851749539594843, + "grad_norm": 0.2603224217891693, + "learning_rate": 3.873969400122278e-05, + "loss": 1.8015, + "step": 19065 + }, + { + "epoch": 5.852056476365869, + "grad_norm": 0.24428118765354156, + "learning_rate": 3.87348511850128e-05, + "loss": 1.8133, + "step": 19066 + }, + { + "epoch": 5.852363413136894, + "grad_norm": 0.19380085170269012, + "learning_rate": 3.873000848013161e-05, + "loss": 1.7331, + "step": 19067 + }, + { + "epoch": 5.852670349907919, + "grad_norm": 0.20088011026382446, + "learning_rate": 3.87251658866271e-05, + "loss": 1.7501, + "step": 19068 + }, + { + "epoch": 5.852977286678944, + "grad_norm": 0.21920672059059143, + "learning_rate": 3.8720323404547095e-05, + "loss": 1.6848, + "step": 19069 + }, + { + "epoch": 5.853284223449969, + "grad_norm": 0.21692565083503723, + "learning_rate": 3.871548103393947e-05, + "loss": 1.7132, + "step": 19070 + }, + { + "epoch": 5.8535911602209945, + "grad_norm": 0.19463133811950684, + "learning_rate": 3.871063877485207e-05, + "loss": 1.7263, + "step": 19071 + }, + { + "epoch": 5.85389809699202, + "grad_norm": 0.21563300490379333, + "learning_rate": 3.870579662733277e-05, + "loss": 1.7271, + "step": 19072 + }, + { + 
"epoch": 5.854205033763045, + "grad_norm": 0.19901902973651886, + "learning_rate": 3.870095459142939e-05, + "loss": 1.7153, + "step": 19073 + }, + { + "epoch": 5.85451197053407, + "grad_norm": 0.2053879052400589, + "learning_rate": 3.869611266718982e-05, + "loss": 1.7769, + "step": 19074 + }, + { + "epoch": 5.854818907305095, + "grad_norm": 0.18877504765987396, + "learning_rate": 3.869127085466188e-05, + "loss": 1.7427, + "step": 19075 + }, + { + "epoch": 5.85512584407612, + "grad_norm": 0.2000892460346222, + "learning_rate": 3.8686429153893414e-05, + "loss": 1.7245, + "step": 19076 + }, + { + "epoch": 5.855432780847146, + "grad_norm": 0.23791030049324036, + "learning_rate": 3.868158756493231e-05, + "loss": 1.7128, + "step": 19077 + }, + { + "epoch": 5.855739717618171, + "grad_norm": 0.20807631313800812, + "learning_rate": 3.8676746087826374e-05, + "loss": 1.7235, + "step": 19078 + }, + { + "epoch": 5.856046654389196, + "grad_norm": 0.2603290379047394, + "learning_rate": 3.867190472262349e-05, + "loss": 1.7272, + "step": 19079 + }, + { + "epoch": 5.856353591160221, + "grad_norm": 0.25234153866767883, + "learning_rate": 3.8667063469371456e-05, + "loss": 1.7818, + "step": 19080 + }, + { + "epoch": 5.856660527931246, + "grad_norm": 0.20621159672737122, + "learning_rate": 3.866222232811816e-05, + "loss": 1.7318, + "step": 19081 + }, + { + "epoch": 5.856967464702271, + "grad_norm": 0.19565562903881073, + "learning_rate": 3.865738129891141e-05, + "loss": 1.6364, + "step": 19082 + }, + { + "epoch": 5.857274401473297, + "grad_norm": 0.2090953141450882, + "learning_rate": 3.86525403817991e-05, + "loss": 1.7763, + "step": 19083 + }, + { + "epoch": 5.857581338244322, + "grad_norm": 0.21286322176456451, + "learning_rate": 3.864769957682901e-05, + "loss": 1.7652, + "step": 19084 + }, + { + "epoch": 5.8578882750153465, + "grad_norm": 0.20606130361557007, + "learning_rate": 3.864285888404902e-05, + "loss": 1.7267, + "step": 19085 + }, + { + "epoch": 5.858195211786372, + 
"grad_norm": 0.18837152421474457, + "learning_rate": 3.863801830350694e-05, + "loss": 1.7013, + "step": 19086 + }, + { + "epoch": 5.858502148557397, + "grad_norm": 0.19374001026153564, + "learning_rate": 3.8633177835250636e-05, + "loss": 1.7462, + "step": 19087 + }, + { + "epoch": 5.8588090853284225, + "grad_norm": 0.19090552628040314, + "learning_rate": 3.8628337479327914e-05, + "loss": 1.7321, + "step": 19088 + }, + { + "epoch": 5.859116022099448, + "grad_norm": 0.19487829506397247, + "learning_rate": 3.8623497235786656e-05, + "loss": 1.7323, + "step": 19089 + }, + { + "epoch": 5.859422958870473, + "grad_norm": 0.23836077749729156, + "learning_rate": 3.861865710467464e-05, + "loss": 1.7277, + "step": 19090 + }, + { + "epoch": 5.859729895641498, + "grad_norm": 0.22283829748630524, + "learning_rate": 3.861381708603974e-05, + "loss": 1.7521, + "step": 19091 + }, + { + "epoch": 5.860036832412523, + "grad_norm": 0.2094828337430954, + "learning_rate": 3.8608977179929774e-05, + "loss": 1.763, + "step": 19092 + }, + { + "epoch": 5.860343769183548, + "grad_norm": 0.30857667326927185, + "learning_rate": 3.860413738639256e-05, + "loss": 1.7112, + "step": 19093 + }, + { + "epoch": 5.860650705954574, + "grad_norm": 0.22634989023208618, + "learning_rate": 3.8599297705475954e-05, + "loss": 1.7076, + "step": 19094 + }, + { + "epoch": 5.860957642725598, + "grad_norm": 0.20488132536411285, + "learning_rate": 3.8594458137227757e-05, + "loss": 1.6821, + "step": 19095 + }, + { + "epoch": 5.861264579496623, + "grad_norm": 0.22760719060897827, + "learning_rate": 3.8589618681695826e-05, + "loss": 1.6981, + "step": 19096 + }, + { + "epoch": 5.861571516267649, + "grad_norm": 0.21168997883796692, + "learning_rate": 3.858477933892795e-05, + "loss": 1.7396, + "step": 19097 + }, + { + "epoch": 5.861878453038674, + "grad_norm": 0.24725143611431122, + "learning_rate": 3.8579940108971984e-05, + "loss": 1.791, + "step": 19098 + }, + { + "epoch": 5.862185389809699, + "grad_norm": 
0.2245369702577591, + "learning_rate": 3.857510099187573e-05, + "loss": 1.7643, + "step": 19099 + }, + { + "epoch": 5.862492326580725, + "grad_norm": 0.20065639913082123, + "learning_rate": 3.8570261987687056e-05, + "loss": 1.715, + "step": 19100 + }, + { + "epoch": 5.862799263351749, + "grad_norm": 0.1857454925775528, + "learning_rate": 3.856542309645373e-05, + "loss": 1.6833, + "step": 19101 + }, + { + "epoch": 5.8631062001227745, + "grad_norm": 0.18816804885864258, + "learning_rate": 3.856058431822361e-05, + "loss": 1.7049, + "step": 19102 + }, + { + "epoch": 5.8634131368938, + "grad_norm": 0.2861626148223877, + "learning_rate": 3.855574565304448e-05, + "loss": 1.8275, + "step": 19103 + }, + { + "epoch": 5.863720073664825, + "grad_norm": 0.19937226176261902, + "learning_rate": 3.8550907100964196e-05, + "loss": 1.7137, + "step": 19104 + }, + { + "epoch": 5.8640270104358505, + "grad_norm": 0.2040586620569229, + "learning_rate": 3.854606866203055e-05, + "loss": 1.725, + "step": 19105 + }, + { + "epoch": 5.864333947206875, + "grad_norm": 0.21082650125026703, + "learning_rate": 3.854123033629137e-05, + "loss": 1.7143, + "step": 19106 + }, + { + "epoch": 5.8646408839779, + "grad_norm": 0.1977517306804657, + "learning_rate": 3.853639212379446e-05, + "loss": 1.7482, + "step": 19107 + }, + { + "epoch": 5.864947820748926, + "grad_norm": 0.2272191196680069, + "learning_rate": 3.8531554024587655e-05, + "loss": 1.7678, + "step": 19108 + }, + { + "epoch": 5.865254757519951, + "grad_norm": 0.22765736281871796, + "learning_rate": 3.852671603871876e-05, + "loss": 1.7721, + "step": 19109 + }, + { + "epoch": 5.865561694290976, + "grad_norm": 0.20707197487354279, + "learning_rate": 3.852187816623556e-05, + "loss": 1.7509, + "step": 19110 + }, + { + "epoch": 5.865868631062002, + "grad_norm": 0.2699931561946869, + "learning_rate": 3.851704040718591e-05, + "loss": 1.6845, + "step": 19111 + }, + { + "epoch": 5.866175567833026, + "grad_norm": 0.24394196271896362, + "learning_rate": 
3.8512202761617575e-05, + "loss": 1.6895, + "step": 19112 + }, + { + "epoch": 5.866482504604051, + "grad_norm": 0.21921835839748383, + "learning_rate": 3.850736522957841e-05, + "loss": 1.7739, + "step": 19113 + }, + { + "epoch": 5.866789441375077, + "grad_norm": 0.2268306314945221, + "learning_rate": 3.8502527811116175e-05, + "loss": 1.7773, + "step": 19114 + }, + { + "epoch": 5.867096378146102, + "grad_norm": 0.2165728509426117, + "learning_rate": 3.84976905062787e-05, + "loss": 1.7567, + "step": 19115 + }, + { + "epoch": 5.867403314917127, + "grad_norm": 0.188106968998909, + "learning_rate": 3.8492853315113804e-05, + "loss": 1.7209, + "step": 19116 + }, + { + "epoch": 5.867710251688152, + "grad_norm": 0.20750530064105988, + "learning_rate": 3.848801623766927e-05, + "loss": 1.6999, + "step": 19117 + }, + { + "epoch": 5.868017188459177, + "grad_norm": 0.2475438266992569, + "learning_rate": 3.84831792739929e-05, + "loss": 1.7535, + "step": 19118 + }, + { + "epoch": 5.8683241252302025, + "grad_norm": 0.23291872441768646, + "learning_rate": 3.847834242413252e-05, + "loss": 1.7137, + "step": 19119 + }, + { + "epoch": 5.868631062001228, + "grad_norm": 0.18381048738956451, + "learning_rate": 3.847350568813589e-05, + "loss": 1.7657, + "step": 19120 + }, + { + "epoch": 5.868937998772253, + "grad_norm": 0.19330385327339172, + "learning_rate": 3.8468669066050845e-05, + "loss": 1.7109, + "step": 19121 + }, + { + "epoch": 5.8692449355432785, + "grad_norm": 0.22503000497817993, + "learning_rate": 3.846383255792517e-05, + "loss": 1.7668, + "step": 19122 + }, + { + "epoch": 5.869551872314303, + "grad_norm": 0.2147306352853775, + "learning_rate": 3.845899616380667e-05, + "loss": 1.74, + "step": 19123 + }, + { + "epoch": 5.869858809085328, + "grad_norm": 0.18493011593818665, + "learning_rate": 3.845415988374312e-05, + "loss": 1.7066, + "step": 19124 + }, + { + "epoch": 5.870165745856354, + "grad_norm": 0.28276753425598145, + "learning_rate": 3.844932371778235e-05, + "loss": 1.7925, 
+ "step": 19125 + }, + { + "epoch": 5.870472682627379, + "grad_norm": 0.23486676812171936, + "learning_rate": 3.844448766597212e-05, + "loss": 1.8216, + "step": 19126 + }, + { + "epoch": 5.870779619398404, + "grad_norm": 0.24370723962783813, + "learning_rate": 3.843965172836024e-05, + "loss": 1.709, + "step": 19127 + }, + { + "epoch": 5.871086556169429, + "grad_norm": 0.22540852427482605, + "learning_rate": 3.843481590499449e-05, + "loss": 1.7608, + "step": 19128 + }, + { + "epoch": 5.871393492940454, + "grad_norm": 0.20578467845916748, + "learning_rate": 3.8429980195922666e-05, + "loss": 1.7288, + "step": 19129 + }, + { + "epoch": 5.871700429711479, + "grad_norm": 0.265325129032135, + "learning_rate": 3.842514460119258e-05, + "loss": 1.7711, + "step": 19130 + }, + { + "epoch": 5.872007366482505, + "grad_norm": 0.20076121389865875, + "learning_rate": 3.842030912085197e-05, + "loss": 1.6764, + "step": 19131 + }, + { + "epoch": 5.87231430325353, + "grad_norm": 0.23941899836063385, + "learning_rate": 3.841547375494868e-05, + "loss": 1.8157, + "step": 19132 + }, + { + "epoch": 5.872621240024555, + "grad_norm": 0.23184041678905487, + "learning_rate": 3.841063850353044e-05, + "loss": 1.6948, + "step": 19133 + }, + { + "epoch": 5.87292817679558, + "grad_norm": 0.20299546420574188, + "learning_rate": 3.840580336664508e-05, + "loss": 1.7812, + "step": 19134 + }, + { + "epoch": 5.873235113566605, + "grad_norm": 0.24654673039913177, + "learning_rate": 3.840096834434036e-05, + "loss": 1.7999, + "step": 19135 + }, + { + "epoch": 5.8735420503376305, + "grad_norm": 0.21144285798072815, + "learning_rate": 3.8396133436664085e-05, + "loss": 1.7033, + "step": 19136 + }, + { + "epoch": 5.873848987108656, + "grad_norm": 0.22186708450317383, + "learning_rate": 3.8391298643663997e-05, + "loss": 1.7292, + "step": 19137 + }, + { + "epoch": 5.87415592387968, + "grad_norm": 0.21017275750637054, + "learning_rate": 3.838646396538793e-05, + "loss": 1.6989, + "step": 19138 + }, + { + "epoch": 
5.874462860650706, + "grad_norm": 0.19430704414844513, + "learning_rate": 3.83816294018836e-05, + "loss": 1.7446, + "step": 19139 + }, + { + "epoch": 5.874769797421731, + "grad_norm": 0.25048547983169556, + "learning_rate": 3.8376794953198836e-05, + "loss": 1.7358, + "step": 19140 + }, + { + "epoch": 5.875076734192756, + "grad_norm": 0.21869583427906036, + "learning_rate": 3.8371960619381406e-05, + "loss": 1.7017, + "step": 19141 + }, + { + "epoch": 5.875383670963782, + "grad_norm": 0.2053002119064331, + "learning_rate": 3.836712640047905e-05, + "loss": 1.7077, + "step": 19142 + }, + { + "epoch": 5.875690607734807, + "grad_norm": 0.2222425490617752, + "learning_rate": 3.83622922965396e-05, + "loss": 1.7259, + "step": 19143 + }, + { + "epoch": 5.8759975445058314, + "grad_norm": 0.20682495832443237, + "learning_rate": 3.8357458307610774e-05, + "loss": 1.7597, + "step": 19144 + }, + { + "epoch": 5.876304481276857, + "grad_norm": 0.2001802772283554, + "learning_rate": 3.835262443374038e-05, + "loss": 1.7546, + "step": 19145 + }, + { + "epoch": 5.876611418047882, + "grad_norm": 0.20499882102012634, + "learning_rate": 3.8347790674976166e-05, + "loss": 1.6741, + "step": 19146 + }, + { + "epoch": 5.8769183548189075, + "grad_norm": 0.17830348014831543, + "learning_rate": 3.834295703136593e-05, + "loss": 1.7067, + "step": 19147 + }, + { + "epoch": 5.877225291589933, + "grad_norm": 0.25055429339408875, + "learning_rate": 3.833812350295741e-05, + "loss": 1.753, + "step": 19148 + }, + { + "epoch": 5.877532228360957, + "grad_norm": 0.19037213921546936, + "learning_rate": 3.8333290089798415e-05, + "loss": 1.7336, + "step": 19149 + }, + { + "epoch": 5.877839165131983, + "grad_norm": 0.18041233718395233, + "learning_rate": 3.8328456791936656e-05, + "loss": 1.7172, + "step": 19150 + }, + { + "epoch": 5.878146101903008, + "grad_norm": 0.21531802415847778, + "learning_rate": 3.832362360941994e-05, + "loss": 1.7328, + "step": 19151 + }, + { + "epoch": 5.878453038674033, + "grad_norm": 
0.23101283609867096, + "learning_rate": 3.831879054229601e-05, + "loss": 1.7548, + "step": 19152 + }, + { + "epoch": 5.878759975445059, + "grad_norm": 0.19029635190963745, + "learning_rate": 3.831395759061266e-05, + "loss": 1.6852, + "step": 19153 + }, + { + "epoch": 5.879066912216084, + "grad_norm": 0.20305602252483368, + "learning_rate": 3.830912475441761e-05, + "loss": 1.6982, + "step": 19154 + }, + { + "epoch": 5.879373848987108, + "grad_norm": 0.19752593338489532, + "learning_rate": 3.830429203375866e-05, + "loss": 1.7726, + "step": 19155 + }, + { + "epoch": 5.879680785758134, + "grad_norm": 0.2109406590461731, + "learning_rate": 3.8299459428683526e-05, + "loss": 1.7629, + "step": 19156 + }, + { + "epoch": 5.879987722529159, + "grad_norm": 0.19448740780353546, + "learning_rate": 3.829462693924001e-05, + "loss": 1.6981, + "step": 19157 + }, + { + "epoch": 5.880294659300184, + "grad_norm": 0.19344154000282288, + "learning_rate": 3.828979456547586e-05, + "loss": 1.6822, + "step": 19158 + }, + { + "epoch": 5.88060159607121, + "grad_norm": 0.24466145038604736, + "learning_rate": 3.82849623074388e-05, + "loss": 1.7575, + "step": 19159 + }, + { + "epoch": 5.880908532842234, + "grad_norm": 0.20174476504325867, + "learning_rate": 3.828013016517663e-05, + "loss": 1.7267, + "step": 19160 + }, + { + "epoch": 5.8812154696132595, + "grad_norm": 0.23560820519924164, + "learning_rate": 3.827529813873706e-05, + "loss": 1.7125, + "step": 19161 + }, + { + "epoch": 5.881522406384285, + "grad_norm": 0.18118280172348022, + "learning_rate": 3.827046622816789e-05, + "loss": 1.7436, + "step": 19162 + }, + { + "epoch": 5.88182934315531, + "grad_norm": 0.27250152826309204, + "learning_rate": 3.8265634433516824e-05, + "loss": 1.7249, + "step": 19163 + }, + { + "epoch": 5.8821362799263355, + "grad_norm": 0.23510734736919403, + "learning_rate": 3.826080275483166e-05, + "loss": 1.7502, + "step": 19164 + }, + { + "epoch": 5.882443216697361, + "grad_norm": 0.22708909213542938, + 
"learning_rate": 3.82559711921601e-05, + "loss": 1.7478, + "step": 19165 + }, + { + "epoch": 5.882750153468385, + "grad_norm": 0.292584627866745, + "learning_rate": 3.825113974554995e-05, + "loss": 1.6757, + "step": 19166 + }, + { + "epoch": 5.883057090239411, + "grad_norm": 0.22186334431171417, + "learning_rate": 3.8246308415048884e-05, + "loss": 1.7061, + "step": 19167 + }, + { + "epoch": 5.883364027010436, + "grad_norm": 0.23995520174503326, + "learning_rate": 3.8241477200704714e-05, + "loss": 1.6962, + "step": 19168 + }, + { + "epoch": 5.883670963781461, + "grad_norm": 0.25545260310173035, + "learning_rate": 3.823664610256513e-05, + "loss": 1.7582, + "step": 19169 + }, + { + "epoch": 5.883977900552486, + "grad_norm": 0.2209167629480362, + "learning_rate": 3.823181512067794e-05, + "loss": 1.7212, + "step": 19170 + }, + { + "epoch": 5.884284837323511, + "grad_norm": 0.24626508355140686, + "learning_rate": 3.8226984255090824e-05, + "loss": 1.7356, + "step": 19171 + }, + { + "epoch": 5.884591774094536, + "grad_norm": 0.22982320189476013, + "learning_rate": 3.822215350585157e-05, + "loss": 1.7516, + "step": 19172 + }, + { + "epoch": 5.884898710865562, + "grad_norm": 0.19458627700805664, + "learning_rate": 3.8217322873007874e-05, + "loss": 1.7097, + "step": 19173 + }, + { + "epoch": 5.885205647636587, + "grad_norm": 0.2030913233757019, + "learning_rate": 3.8212492356607524e-05, + "loss": 1.7273, + "step": 19174 + }, + { + "epoch": 5.885512584407612, + "grad_norm": 0.20174767076969147, + "learning_rate": 3.820766195669823e-05, + "loss": 1.7167, + "step": 19175 + }, + { + "epoch": 5.885819521178637, + "grad_norm": 0.22572553157806396, + "learning_rate": 3.820283167332772e-05, + "loss": 1.8034, + "step": 19176 + }, + { + "epoch": 5.886126457949662, + "grad_norm": 0.24423041939735413, + "learning_rate": 3.819800150654376e-05, + "loss": 1.7188, + "step": 19177 + }, + { + "epoch": 5.8864333947206875, + "grad_norm": 0.20805509388446808, + "learning_rate": 
3.819317145639404e-05, + "loss": 1.7252, + "step": 19178 + }, + { + "epoch": 5.886740331491713, + "grad_norm": 0.2731400728225708, + "learning_rate": 3.8188341522926334e-05, + "loss": 1.7778, + "step": 19179 + }, + { + "epoch": 5.887047268262738, + "grad_norm": 0.2604491412639618, + "learning_rate": 3.818351170618835e-05, + "loss": 1.7524, + "step": 19180 + }, + { + "epoch": 5.887354205033763, + "grad_norm": 0.20043112337589264, + "learning_rate": 3.817868200622785e-05, + "loss": 1.7176, + "step": 19181 + }, + { + "epoch": 5.887661141804788, + "grad_norm": 0.2224988341331482, + "learning_rate": 3.817385242309253e-05, + "loss": 1.7267, + "step": 19182 + }, + { + "epoch": 5.887968078575813, + "grad_norm": 0.24603894352912903, + "learning_rate": 3.8169022956830135e-05, + "loss": 1.716, + "step": 19183 + }, + { + "epoch": 5.888275015346839, + "grad_norm": 0.19959969818592072, + "learning_rate": 3.816419360748839e-05, + "loss": 1.7461, + "step": 19184 + }, + { + "epoch": 5.888581952117864, + "grad_norm": 0.21907947957515717, + "learning_rate": 3.815936437511501e-05, + "loss": 1.6982, + "step": 19185 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.1920289248228073, + "learning_rate": 3.8154535259757735e-05, + "loss": 1.7213, + "step": 19186 + }, + { + "epoch": 5.889195825659914, + "grad_norm": 0.21930737793445587, + "learning_rate": 3.81497062614643e-05, + "loss": 1.7389, + "step": 19187 + }, + { + "epoch": 5.889502762430939, + "grad_norm": 0.1972137838602066, + "learning_rate": 3.814487738028239e-05, + "loss": 1.7317, + "step": 19188 + }, + { + "epoch": 5.889809699201964, + "grad_norm": 0.20000529289245605, + "learning_rate": 3.8140048616259785e-05, + "loss": 1.7148, + "step": 19189 + }, + { + "epoch": 5.89011663597299, + "grad_norm": 0.18828663229942322, + "learning_rate": 3.8135219969444135e-05, + "loss": 1.725, + "step": 19190 + }, + { + "epoch": 5.890423572744015, + "grad_norm": 0.2237224131822586, + "learning_rate": 3.8130391439883216e-05, + "loss": 1.7252, 
+ "step": 19191 + }, + { + "epoch": 5.8907305095150395, + "grad_norm": 0.19954712688922882, + "learning_rate": 3.812556302762473e-05, + "loss": 1.7071, + "step": 19192 + }, + { + "epoch": 5.891037446286065, + "grad_norm": 0.23509685695171356, + "learning_rate": 3.812073473271637e-05, + "loss": 1.7603, + "step": 19193 + }, + { + "epoch": 5.89134438305709, + "grad_norm": 0.28477707505226135, + "learning_rate": 3.81159065552059e-05, + "loss": 1.8193, + "step": 19194 + }, + { + "epoch": 5.8916513198281155, + "grad_norm": 0.1936045140028, + "learning_rate": 3.811107849514098e-05, + "loss": 1.7438, + "step": 19195 + }, + { + "epoch": 5.891958256599141, + "grad_norm": 0.288253515958786, + "learning_rate": 3.810625055256936e-05, + "loss": 1.8042, + "step": 19196 + }, + { + "epoch": 5.892265193370166, + "grad_norm": 0.19256485998630524, + "learning_rate": 3.810142272753873e-05, + "loss": 1.6997, + "step": 19197 + }, + { + "epoch": 5.892572130141191, + "grad_norm": 0.2823546826839447, + "learning_rate": 3.809659502009684e-05, + "loss": 1.7133, + "step": 19198 + }, + { + "epoch": 5.892879066912216, + "grad_norm": 0.25116851925849915, + "learning_rate": 3.809176743029136e-05, + "loss": 1.7402, + "step": 19199 + }, + { + "epoch": 5.893186003683241, + "grad_norm": 0.19840675592422485, + "learning_rate": 3.808693995817003e-05, + "loss": 1.7009, + "step": 19200 + }, + { + "epoch": 5.893492940454267, + "grad_norm": 0.2703700363636017, + "learning_rate": 3.808211260378051e-05, + "loss": 1.741, + "step": 19201 + }, + { + "epoch": 5.893799877225292, + "grad_norm": 0.25683698058128357, + "learning_rate": 3.807728536717056e-05, + "loss": 1.7431, + "step": 19202 + }, + { + "epoch": 5.894106813996316, + "grad_norm": 0.19033822417259216, + "learning_rate": 3.8072458248387855e-05, + "loss": 1.7423, + "step": 19203 + }, + { + "epoch": 5.894413750767342, + "grad_norm": 0.2771024703979492, + "learning_rate": 3.806763124748012e-05, + "loss": 1.7376, + "step": 19204 + }, + { + "epoch": 
5.894720687538367, + "grad_norm": 0.30265524983406067, + "learning_rate": 3.806280436449504e-05, + "loss": 1.7124, + "step": 19205 + }, + { + "epoch": 5.895027624309392, + "grad_norm": 0.21838776767253876, + "learning_rate": 3.805797759948033e-05, + "loss": 1.7319, + "step": 19206 + }, + { + "epoch": 5.895334561080418, + "grad_norm": 0.22244395315647125, + "learning_rate": 3.805315095248368e-05, + "loss": 1.7034, + "step": 19207 + }, + { + "epoch": 5.895641497851443, + "grad_norm": 0.20621941983699799, + "learning_rate": 3.8048324423552786e-05, + "loss": 1.7231, + "step": 19208 + }, + { + "epoch": 5.8959484346224675, + "grad_norm": 0.23735111951828003, + "learning_rate": 3.804349801273538e-05, + "loss": 1.7484, + "step": 19209 + }, + { + "epoch": 5.896255371393493, + "grad_norm": 0.33221447467803955, + "learning_rate": 3.803867172007911e-05, + "loss": 1.7782, + "step": 19210 + }, + { + "epoch": 5.896562308164518, + "grad_norm": 0.20859810709953308, + "learning_rate": 3.803384554563172e-05, + "loss": 1.688, + "step": 19211 + }, + { + "epoch": 5.8968692449355435, + "grad_norm": 0.25731268525123596, + "learning_rate": 3.8029019489440855e-05, + "loss": 1.7463, + "step": 19212 + }, + { + "epoch": 5.897176181706568, + "grad_norm": 0.26556700468063354, + "learning_rate": 3.802419355155425e-05, + "loss": 1.7251, + "step": 19213 + }, + { + "epoch": 5.897483118477593, + "grad_norm": 0.20397205650806427, + "learning_rate": 3.801936773201957e-05, + "loss": 1.6785, + "step": 19214 + }, + { + "epoch": 5.897790055248619, + "grad_norm": 0.2198234349489212, + "learning_rate": 3.8014542030884544e-05, + "loss": 1.7608, + "step": 19215 + }, + { + "epoch": 5.898096992019644, + "grad_norm": 0.22619546949863434, + "learning_rate": 3.800971644819681e-05, + "loss": 1.8034, + "step": 19216 + }, + { + "epoch": 5.898403928790669, + "grad_norm": 0.22074444591999054, + "learning_rate": 3.800489098400412e-05, + "loss": 1.777, + "step": 19217 + }, + { + "epoch": 5.898710865561695, + "grad_norm": 
0.2555946707725525, + "learning_rate": 3.80000656383541e-05, + "loss": 1.7578, + "step": 19218 + }, + { + "epoch": 5.899017802332719, + "grad_norm": 0.2130863517522812, + "learning_rate": 3.7995240411294474e-05, + "loss": 1.7312, + "step": 19219 + }, + { + "epoch": 5.899324739103744, + "grad_norm": 0.2574099898338318, + "learning_rate": 3.799041530287291e-05, + "loss": 1.7509, + "step": 19220 + }, + { + "epoch": 5.89963167587477, + "grad_norm": 0.2556573152542114, + "learning_rate": 3.798559031313712e-05, + "loss": 1.7624, + "step": 19221 + }, + { + "epoch": 5.899938612645795, + "grad_norm": 0.19909335672855377, + "learning_rate": 3.798076544213475e-05, + "loss": 1.7466, + "step": 19222 + }, + { + "epoch": 5.9002455494168204, + "grad_norm": 0.19832594692707062, + "learning_rate": 3.7975940689913526e-05, + "loss": 1.6896, + "step": 19223 + }, + { + "epoch": 5.900552486187845, + "grad_norm": 0.18473665416240692, + "learning_rate": 3.7971116056521076e-05, + "loss": 1.7167, + "step": 19224 + }, + { + "epoch": 5.90085942295887, + "grad_norm": 0.21106892824172974, + "learning_rate": 3.796629154200512e-05, + "loss": 1.8071, + "step": 19225 + }, + { + "epoch": 5.901166359729896, + "grad_norm": 0.20903728902339935, + "learning_rate": 3.796146714641333e-05, + "loss": 1.6946, + "step": 19226 + }, + { + "epoch": 5.901473296500921, + "grad_norm": 0.21518728137016296, + "learning_rate": 3.795664286979336e-05, + "loss": 1.6899, + "step": 19227 + }, + { + "epoch": 5.901780233271946, + "grad_norm": 0.1948135644197464, + "learning_rate": 3.7951818712192926e-05, + "loss": 1.7568, + "step": 19228 + }, + { + "epoch": 5.902087170042972, + "grad_norm": 0.2222091257572174, + "learning_rate": 3.7946994673659667e-05, + "loss": 1.8118, + "step": 19229 + }, + { + "epoch": 5.902394106813996, + "grad_norm": 0.2173513025045395, + "learning_rate": 3.794217075424127e-05, + "loss": 1.7194, + "step": 19230 + }, + { + "epoch": 5.902701043585021, + "grad_norm": 0.2026323676109314, + "learning_rate": 
3.79373469539854e-05, + "loss": 1.6944, + "step": 19231 + }, + { + "epoch": 5.903007980356047, + "grad_norm": 0.22178098559379578, + "learning_rate": 3.7932523272939765e-05, + "loss": 1.7328, + "step": 19232 + }, + { + "epoch": 5.903314917127072, + "grad_norm": 0.22846719622612, + "learning_rate": 3.792769971115198e-05, + "loss": 1.8065, + "step": 19233 + }, + { + "epoch": 5.903621853898097, + "grad_norm": 0.2086053490638733, + "learning_rate": 3.792287626866977e-05, + "loss": 1.7511, + "step": 19234 + }, + { + "epoch": 5.903928790669122, + "grad_norm": 0.22444705665111542, + "learning_rate": 3.791805294554075e-05, + "loss": 1.742, + "step": 19235 + }, + { + "epoch": 5.904235727440147, + "grad_norm": 0.24630236625671387, + "learning_rate": 3.7913229741812625e-05, + "loss": 1.7531, + "step": 19236 + }, + { + "epoch": 5.9045426642111725, + "grad_norm": 0.2618274986743927, + "learning_rate": 3.7908406657533036e-05, + "loss": 1.7387, + "step": 19237 + }, + { + "epoch": 5.904849600982198, + "grad_norm": 0.25871509313583374, + "learning_rate": 3.790358369274968e-05, + "loss": 1.7822, + "step": 19238 + }, + { + "epoch": 5.905156537753223, + "grad_norm": 0.22675062716007233, + "learning_rate": 3.789876084751018e-05, + "loss": 1.7788, + "step": 19239 + }, + { + "epoch": 5.9054634745242485, + "grad_norm": 0.26623663306236267, + "learning_rate": 3.789393812186224e-05, + "loss": 1.7092, + "step": 19240 + }, + { + "epoch": 5.905770411295273, + "grad_norm": 0.19448868930339813, + "learning_rate": 3.788911551585348e-05, + "loss": 1.7164, + "step": 19241 + }, + { + "epoch": 5.906077348066298, + "grad_norm": 0.22451938688755035, + "learning_rate": 3.788429302953158e-05, + "loss": 1.667, + "step": 19242 + }, + { + "epoch": 5.906384284837324, + "grad_norm": 0.2323608547449112, + "learning_rate": 3.7879470662944214e-05, + "loss": 1.7992, + "step": 19243 + }, + { + "epoch": 5.906691221608349, + "grad_norm": 0.2508258819580078, + "learning_rate": 3.7874648416139e-05, + "loss": 1.7681, + 
"step": 19244 + }, + { + "epoch": 5.906998158379373, + "grad_norm": 0.22333547472953796, + "learning_rate": 3.786982628916364e-05, + "loss": 1.7006, + "step": 19245 + }, + { + "epoch": 5.907305095150399, + "grad_norm": 0.19816327095031738, + "learning_rate": 3.786500428206575e-05, + "loss": 1.7458, + "step": 19246 + }, + { + "epoch": 5.907612031921424, + "grad_norm": 0.2047683447599411, + "learning_rate": 3.7860182394893006e-05, + "loss": 1.7385, + "step": 19247 + }, + { + "epoch": 5.907918968692449, + "grad_norm": 0.2124621719121933, + "learning_rate": 3.785536062769304e-05, + "loss": 1.7373, + "step": 19248 + }, + { + "epoch": 5.908225905463475, + "grad_norm": 0.200453981757164, + "learning_rate": 3.785053898051355e-05, + "loss": 1.7754, + "step": 19249 + }, + { + "epoch": 5.9085328422345, + "grad_norm": 0.19543224573135376, + "learning_rate": 3.784571745340212e-05, + "loss": 1.724, + "step": 19250 + }, + { + "epoch": 5.9088397790055245, + "grad_norm": 0.17079658806324005, + "learning_rate": 3.784089604640647e-05, + "loss": 1.6843, + "step": 19251 + }, + { + "epoch": 5.90914671577655, + "grad_norm": 0.22792236506938934, + "learning_rate": 3.783607475957418e-05, + "loss": 1.7442, + "step": 19252 + }, + { + "epoch": 5.909453652547575, + "grad_norm": 0.20699752867221832, + "learning_rate": 3.783125359295294e-05, + "loss": 1.7868, + "step": 19253 + }, + { + "epoch": 5.9097605893186005, + "grad_norm": 0.2156144678592682, + "learning_rate": 3.782643254659038e-05, + "loss": 1.7443, + "step": 19254 + }, + { + "epoch": 5.910067526089626, + "grad_norm": 0.2021300345659256, + "learning_rate": 3.782161162053417e-05, + "loss": 1.7749, + "step": 19255 + }, + { + "epoch": 5.91037446286065, + "grad_norm": 0.17613129317760468, + "learning_rate": 3.7816790814831905e-05, + "loss": 1.7001, + "step": 19256 + }, + { + "epoch": 5.910681399631676, + "grad_norm": 0.18911564350128174, + "learning_rate": 3.781197012953128e-05, + "loss": 1.6817, + "step": 19257 + }, + { + "epoch": 
5.910988336402701, + "grad_norm": 0.18920689821243286, + "learning_rate": 3.780714956467989e-05, + "loss": 1.7554, + "step": 19258 + }, + { + "epoch": 5.911295273173726, + "grad_norm": 0.22030571103096008, + "learning_rate": 3.7802329120325396e-05, + "loss": 1.7554, + "step": 19259 + }, + { + "epoch": 5.911602209944752, + "grad_norm": 0.21164962649345398, + "learning_rate": 3.779750879651545e-05, + "loss": 1.74, + "step": 19260 + }, + { + "epoch": 5.911909146715777, + "grad_norm": 0.2205103188753128, + "learning_rate": 3.779268859329766e-05, + "loss": 1.7424, + "step": 19261 + }, + { + "epoch": 5.912216083486801, + "grad_norm": 0.19262658059597015, + "learning_rate": 3.7787868510719685e-05, + "loss": 1.7157, + "step": 19262 + }, + { + "epoch": 5.912523020257827, + "grad_norm": 0.19583287835121155, + "learning_rate": 3.778304854882914e-05, + "loss": 1.7343, + "step": 19263 + }, + { + "epoch": 5.912829957028852, + "grad_norm": 0.18275529146194458, + "learning_rate": 3.777822870767368e-05, + "loss": 1.6938, + "step": 19264 + }, + { + "epoch": 5.913136893799877, + "grad_norm": 0.21268916130065918, + "learning_rate": 3.7773408987300914e-05, + "loss": 1.7546, + "step": 19265 + }, + { + "epoch": 5.913443830570903, + "grad_norm": 0.20878887176513672, + "learning_rate": 3.77685893877585e-05, + "loss": 1.8109, + "step": 19266 + }, + { + "epoch": 5.913750767341927, + "grad_norm": 0.2326175421476364, + "learning_rate": 3.776376990909404e-05, + "loss": 1.7248, + "step": 19267 + }, + { + "epoch": 5.9140577041129525, + "grad_norm": 0.28189611434936523, + "learning_rate": 3.7758950551355204e-05, + "loss": 1.7796, + "step": 19268 + }, + { + "epoch": 5.914364640883978, + "grad_norm": 0.1922682821750641, + "learning_rate": 3.775413131458957e-05, + "loss": 1.7096, + "step": 19269 + }, + { + "epoch": 5.914671577655003, + "grad_norm": 0.2839193642139435, + "learning_rate": 3.774931219884479e-05, + "loss": 1.7341, + "step": 19270 + }, + { + "epoch": 5.9149785144260285, + "grad_norm": 
0.2075256109237671, + "learning_rate": 3.7744493204168495e-05, + "loss": 1.7565, + "step": 19271 + }, + { + "epoch": 5.915285451197054, + "grad_norm": 0.2780497372150421, + "learning_rate": 3.7739674330608306e-05, + "loss": 1.7186, + "step": 19272 + }, + { + "epoch": 5.915592387968078, + "grad_norm": 0.26129212975502014, + "learning_rate": 3.773485557821182e-05, + "loss": 1.8468, + "step": 19273 + }, + { + "epoch": 5.915899324739104, + "grad_norm": 0.3299194276332855, + "learning_rate": 3.773003694702671e-05, + "loss": 1.7705, + "step": 19274 + }, + { + "epoch": 5.916206261510129, + "grad_norm": 0.3011106848716736, + "learning_rate": 3.772521843710054e-05, + "loss": 1.748, + "step": 19275 + }, + { + "epoch": 5.916513198281154, + "grad_norm": 0.21370603144168854, + "learning_rate": 3.7720400048480966e-05, + "loss": 1.7709, + "step": 19276 + }, + { + "epoch": 5.91682013505218, + "grad_norm": 0.29374879598617554, + "learning_rate": 3.771558178121561e-05, + "loss": 1.6948, + "step": 19277 + }, + { + "epoch": 5.917127071823204, + "grad_norm": 0.2545807659626007, + "learning_rate": 3.771076363535205e-05, + "loss": 1.7974, + "step": 19278 + }, + { + "epoch": 5.917434008594229, + "grad_norm": 0.24210263788700104, + "learning_rate": 3.7705945610937954e-05, + "loss": 1.7438, + "step": 19279 + }, + { + "epoch": 5.917740945365255, + "grad_norm": 0.26224827766418457, + "learning_rate": 3.770112770802088e-05, + "loss": 1.7294, + "step": 19280 + }, + { + "epoch": 5.91804788213628, + "grad_norm": 0.23358991742134094, + "learning_rate": 3.7696309926648486e-05, + "loss": 1.7973, + "step": 19281 + }, + { + "epoch": 5.918354818907305, + "grad_norm": 0.3466563820838928, + "learning_rate": 3.769149226686837e-05, + "loss": 1.784, + "step": 19282 + }, + { + "epoch": 5.918661755678331, + "grad_norm": 0.2416994869709015, + "learning_rate": 3.768667472872814e-05, + "loss": 1.6957, + "step": 19283 + }, + { + "epoch": 5.918968692449355, + "grad_norm": 0.2285085767507553, + "learning_rate": 
3.768185731227539e-05, + "loss": 1.71, + "step": 19284 + }, + { + "epoch": 5.9192756292203805, + "grad_norm": 0.2566430866718292, + "learning_rate": 3.7677040017557775e-05, + "loss": 1.792, + "step": 19285 + }, + { + "epoch": 5.919582565991406, + "grad_norm": 0.21566689014434814, + "learning_rate": 3.767222284462285e-05, + "loss": 1.8085, + "step": 19286 + }, + { + "epoch": 5.919889502762431, + "grad_norm": 0.24078889191150665, + "learning_rate": 3.7667405793518264e-05, + "loss": 1.7221, + "step": 19287 + }, + { + "epoch": 5.920196439533456, + "grad_norm": 0.22127531468868256, + "learning_rate": 3.7662588864291584e-05, + "loss": 1.7173, + "step": 19288 + }, + { + "epoch": 5.920503376304481, + "grad_norm": 0.18165946006774902, + "learning_rate": 3.765777205699045e-05, + "loss": 1.7518, + "step": 19289 + }, + { + "epoch": 5.920810313075506, + "grad_norm": 0.2569290101528168, + "learning_rate": 3.765295537166242e-05, + "loss": 1.7716, + "step": 19290 + }, + { + "epoch": 5.921117249846532, + "grad_norm": 0.19010202586650848, + "learning_rate": 3.764813880835515e-05, + "loss": 1.7146, + "step": 19291 + }, + { + "epoch": 5.921424186617557, + "grad_norm": 0.2882116436958313, + "learning_rate": 3.7643322367116195e-05, + "loss": 1.7677, + "step": 19292 + }, + { + "epoch": 5.921731123388582, + "grad_norm": 0.30711185932159424, + "learning_rate": 3.763850604799319e-05, + "loss": 1.7506, + "step": 19293 + }, + { + "epoch": 5.922038060159607, + "grad_norm": 0.19295164942741394, + "learning_rate": 3.76336898510337e-05, + "loss": 1.715, + "step": 19294 + }, + { + "epoch": 5.922344996930632, + "grad_norm": 0.24849168956279755, + "learning_rate": 3.762887377628533e-05, + "loss": 1.6807, + "step": 19295 + }, + { + "epoch": 5.922651933701657, + "grad_norm": 0.23573634028434753, + "learning_rate": 3.7624057823795696e-05, + "loss": 1.7363, + "step": 19296 + }, + { + "epoch": 5.922958870472683, + "grad_norm": 0.24384267628192902, + "learning_rate": 3.761924199361235e-05, + "loss": 
1.726, + "step": 19297 + }, + { + "epoch": 5.923265807243708, + "grad_norm": 0.2589210271835327, + "learning_rate": 3.761442628578294e-05, + "loss": 1.7771, + "step": 19298 + }, + { + "epoch": 5.9235727440147325, + "grad_norm": 0.23527951538562775, + "learning_rate": 3.760961070035501e-05, + "loss": 1.6561, + "step": 19299 + }, + { + "epoch": 5.923879680785758, + "grad_norm": 0.20286870002746582, + "learning_rate": 3.7604795237376175e-05, + "loss": 1.7464, + "step": 19300 + }, + { + "epoch": 5.924186617556783, + "grad_norm": 0.22705033421516418, + "learning_rate": 3.759997989689401e-05, + "loss": 1.7814, + "step": 19301 + }, + { + "epoch": 5.9244935543278086, + "grad_norm": 0.21780981123447418, + "learning_rate": 3.7595164678956135e-05, + "loss": 1.7601, + "step": 19302 + }, + { + "epoch": 5.924800491098834, + "grad_norm": 0.2030021697282791, + "learning_rate": 3.759034958361009e-05, + "loss": 1.7222, + "step": 19303 + }, + { + "epoch": 5.925107427869859, + "grad_norm": 0.22956500947475433, + "learning_rate": 3.758553461090351e-05, + "loss": 1.674, + "step": 19304 + }, + { + "epoch": 5.925414364640884, + "grad_norm": 0.2368287444114685, + "learning_rate": 3.758071976088392e-05, + "loss": 1.7483, + "step": 19305 + }, + { + "epoch": 5.925721301411909, + "grad_norm": 0.22852632403373718, + "learning_rate": 3.757590503359896e-05, + "loss": 1.7561, + "step": 19306 + }, + { + "epoch": 5.926028238182934, + "grad_norm": 0.21657361090183258, + "learning_rate": 3.757109042909617e-05, + "loss": 1.7814, + "step": 19307 + }, + { + "epoch": 5.92633517495396, + "grad_norm": 0.21996551752090454, + "learning_rate": 3.756627594742317e-05, + "loss": 1.732, + "step": 19308 + }, + { + "epoch": 5.926642111724985, + "grad_norm": 0.23319712281227112, + "learning_rate": 3.75614615886275e-05, + "loss": 1.6807, + "step": 19309 + }, + { + "epoch": 5.9269490484960095, + "grad_norm": 0.17926698923110962, + "learning_rate": 3.755664735275677e-05, + "loss": 1.6925, + "step": 19310 + }, + { + 
"epoch": 5.927255985267035, + "grad_norm": 0.18986931443214417, + "learning_rate": 3.755183323985855e-05, + "loss": 1.7002, + "step": 19311 + }, + { + "epoch": 5.92756292203806, + "grad_norm": 0.18753086030483246, + "learning_rate": 3.7547019249980385e-05, + "loss": 1.695, + "step": 19312 + }, + { + "epoch": 5.9278698588090855, + "grad_norm": 0.21354973316192627, + "learning_rate": 3.7542205383169904e-05, + "loss": 1.6629, + "step": 19313 + }, + { + "epoch": 5.928176795580111, + "grad_norm": 0.19713245332241058, + "learning_rate": 3.753739163947463e-05, + "loss": 1.707, + "step": 19314 + }, + { + "epoch": 5.928483732351136, + "grad_norm": 0.2122458517551422, + "learning_rate": 3.753257801894217e-05, + "loss": 1.7309, + "step": 19315 + }, + { + "epoch": 5.928790669122161, + "grad_norm": 0.20360666513442993, + "learning_rate": 3.7527764521620065e-05, + "loss": 1.6861, + "step": 19316 + }, + { + "epoch": 5.929097605893186, + "grad_norm": 0.2652932405471802, + "learning_rate": 3.752295114755592e-05, + "loss": 1.7662, + "step": 19317 + }, + { + "epoch": 5.929404542664211, + "grad_norm": 0.18292152881622314, + "learning_rate": 3.751813789679726e-05, + "loss": 1.6691, + "step": 19318 + }, + { + "epoch": 5.929711479435237, + "grad_norm": 0.25630465149879456, + "learning_rate": 3.75133247693917e-05, + "loss": 1.7647, + "step": 19319 + }, + { + "epoch": 5.930018416206261, + "grad_norm": 0.2463291883468628, + "learning_rate": 3.750851176538677e-05, + "loss": 1.7252, + "step": 19320 + }, + { + "epoch": 5.930325352977286, + "grad_norm": 0.19977931678295135, + "learning_rate": 3.750369888483007e-05, + "loss": 1.7694, + "step": 19321 + }, + { + "epoch": 5.930632289748312, + "grad_norm": 0.19523118436336517, + "learning_rate": 3.7498886127769116e-05, + "loss": 1.7095, + "step": 19322 + }, + { + "epoch": 5.930939226519337, + "grad_norm": 0.19273912906646729, + "learning_rate": 3.749407349425151e-05, + "loss": 1.7009, + "step": 19323 + }, + { + "epoch": 5.931246163290362, + 
"grad_norm": 0.2419402152299881, + "learning_rate": 3.748926098432479e-05, + "loss": 1.7167, + "step": 19324 + }, + { + "epoch": 5.931553100061388, + "grad_norm": 0.22429771721363068, + "learning_rate": 3.7484448598036534e-05, + "loss": 1.6957, + "step": 19325 + }, + { + "epoch": 5.931860036832412, + "grad_norm": 0.23211807012557983, + "learning_rate": 3.747963633543429e-05, + "loss": 1.767, + "step": 19326 + }, + { + "epoch": 5.9321669736034375, + "grad_norm": 0.23204533755779266, + "learning_rate": 3.7474824196565625e-05, + "loss": 1.7405, + "step": 19327 + }, + { + "epoch": 5.932473910374463, + "grad_norm": 0.24068887531757355, + "learning_rate": 3.747001218147809e-05, + "loss": 1.7539, + "step": 19328 + }, + { + "epoch": 5.932780847145488, + "grad_norm": 0.18140049278736115, + "learning_rate": 3.746520029021922e-05, + "loss": 1.6956, + "step": 19329 + }, + { + "epoch": 5.9330877839165135, + "grad_norm": 0.28421929478645325, + "learning_rate": 3.746038852283661e-05, + "loss": 1.8539, + "step": 19330 + }, + { + "epoch": 5.933394720687538, + "grad_norm": 0.21984805166721344, + "learning_rate": 3.745557687937777e-05, + "loss": 1.7469, + "step": 19331 + }, + { + "epoch": 5.933701657458563, + "grad_norm": 0.2500358819961548, + "learning_rate": 3.7450765359890294e-05, + "loss": 1.7184, + "step": 19332 + }, + { + "epoch": 5.934008594229589, + "grad_norm": 0.2608816623687744, + "learning_rate": 3.744595396442169e-05, + "loss": 1.6825, + "step": 19333 + }, + { + "epoch": 5.934315531000614, + "grad_norm": 0.20359274744987488, + "learning_rate": 3.7441142693019526e-05, + "loss": 1.7535, + "step": 19334 + }, + { + "epoch": 5.934622467771639, + "grad_norm": 0.24795760214328766, + "learning_rate": 3.743633154573135e-05, + "loss": 1.7829, + "step": 19335 + }, + { + "epoch": 5.934929404542665, + "grad_norm": 0.20762503147125244, + "learning_rate": 3.7431520522604736e-05, + "loss": 1.7657, + "step": 19336 + }, + { + "epoch": 5.935236341313689, + "grad_norm": 0.24349527060985565, 
+ "learning_rate": 3.7426709623687174e-05, + "loss": 1.7037, + "step": 19337 + }, + { + "epoch": 5.935543278084714, + "grad_norm": 0.2138780951499939, + "learning_rate": 3.742189884902626e-05, + "loss": 1.7302, + "step": 19338 + }, + { + "epoch": 5.93585021485574, + "grad_norm": 0.24776574969291687, + "learning_rate": 3.741708819866949e-05, + "loss": 1.7293, + "step": 19339 + }, + { + "epoch": 5.936157151626765, + "grad_norm": 0.297888845205307, + "learning_rate": 3.7412277672664444e-05, + "loss": 1.8341, + "step": 19340 + }, + { + "epoch": 5.93646408839779, + "grad_norm": 0.2811104953289032, + "learning_rate": 3.740746727105864e-05, + "loss": 1.7188, + "step": 19341 + }, + { + "epoch": 5.936771025168815, + "grad_norm": 0.37908127903938293, + "learning_rate": 3.740265699389964e-05, + "loss": 1.765, + "step": 19342 + }, + { + "epoch": 5.93707796193984, + "grad_norm": 0.24403691291809082, + "learning_rate": 3.739784684123495e-05, + "loss": 1.6897, + "step": 19343 + }, + { + "epoch": 5.9373848987108655, + "grad_norm": 0.2393181174993515, + "learning_rate": 3.7393036813112135e-05, + "loss": 1.6843, + "step": 19344 + }, + { + "epoch": 5.937691835481891, + "grad_norm": 0.2927580177783966, + "learning_rate": 3.738822690957872e-05, + "loss": 1.6946, + "step": 19345 + }, + { + "epoch": 5.937998772252916, + "grad_norm": 0.23423373699188232, + "learning_rate": 3.738341713068223e-05, + "loss": 1.7409, + "step": 19346 + }, + { + "epoch": 5.9383057090239415, + "grad_norm": 0.2544272840023041, + "learning_rate": 3.7378607476470216e-05, + "loss": 1.698, + "step": 19347 + }, + { + "epoch": 5.938612645794966, + "grad_norm": 0.2120404839515686, + "learning_rate": 3.737379794699019e-05, + "loss": 1.7412, + "step": 19348 + }, + { + "epoch": 5.938919582565991, + "grad_norm": 0.2076033353805542, + "learning_rate": 3.736898854228971e-05, + "loss": 1.752, + "step": 19349 + }, + { + "epoch": 5.939226519337017, + "grad_norm": 0.20122376084327698, + "learning_rate": 3.736417926241627e-05, + 
"loss": 1.6741, + "step": 19350 + }, + { + "epoch": 5.939533456108042, + "grad_norm": 0.1856858730316162, + "learning_rate": 3.735937010741742e-05, + "loss": 1.6959, + "step": 19351 + }, + { + "epoch": 5.939840392879067, + "grad_norm": 0.22192558646202087, + "learning_rate": 3.7354561077340684e-05, + "loss": 1.7597, + "step": 19352 + }, + { + "epoch": 5.940147329650092, + "grad_norm": 0.2653545141220093, + "learning_rate": 3.73497521722336e-05, + "loss": 1.7324, + "step": 19353 + }, + { + "epoch": 5.940454266421117, + "grad_norm": 0.1975676715373993, + "learning_rate": 3.734494339214366e-05, + "loss": 1.6852, + "step": 19354 + }, + { + "epoch": 5.940761203192142, + "grad_norm": 0.26949796080589294, + "learning_rate": 3.734013473711843e-05, + "loss": 1.7695, + "step": 19355 + }, + { + "epoch": 5.941068139963168, + "grad_norm": 0.2272176742553711, + "learning_rate": 3.733532620720539e-05, + "loss": 1.745, + "step": 19356 + }, + { + "epoch": 5.941375076734193, + "grad_norm": 0.25740066170692444, + "learning_rate": 3.733051780245208e-05, + "loss": 1.7701, + "step": 19357 + }, + { + "epoch": 5.941682013505218, + "grad_norm": 0.1910635381937027, + "learning_rate": 3.732570952290602e-05, + "loss": 1.7276, + "step": 19358 + }, + { + "epoch": 5.941988950276243, + "grad_norm": 0.24896447360515594, + "learning_rate": 3.732090136861474e-05, + "loss": 1.7717, + "step": 19359 + }, + { + "epoch": 5.942295887047268, + "grad_norm": 0.20696721971035004, + "learning_rate": 3.731609333962572e-05, + "loss": 1.7053, + "step": 19360 + }, + { + "epoch": 5.9426028238182935, + "grad_norm": 0.18822510540485382, + "learning_rate": 3.731128543598653e-05, + "loss": 1.6869, + "step": 19361 + }, + { + "epoch": 5.942909760589319, + "grad_norm": 0.20757299661636353, + "learning_rate": 3.730647765774464e-05, + "loss": 1.7214, + "step": 19362 + }, + { + "epoch": 5.943216697360343, + "grad_norm": 0.21238471567630768, + "learning_rate": 3.7301670004947574e-05, + "loss": 1.6953, + "step": 19363 + }, + { 
+ "epoch": 5.943523634131369, + "grad_norm": 0.19326119124889374, + "learning_rate": 3.729686247764286e-05, + "loss": 1.7224, + "step": 19364 + }, + { + "epoch": 5.943830570902394, + "grad_norm": 0.17631326615810394, + "learning_rate": 3.729205507587798e-05, + "loss": 1.6471, + "step": 19365 + }, + { + "epoch": 5.944137507673419, + "grad_norm": 0.1741493195295334, + "learning_rate": 3.728724779970048e-05, + "loss": 1.7169, + "step": 19366 + }, + { + "epoch": 5.944444444444445, + "grad_norm": 0.18203428387641907, + "learning_rate": 3.728244064915782e-05, + "loss": 1.7301, + "step": 19367 + }, + { + "epoch": 5.94475138121547, + "grad_norm": 0.2063162475824356, + "learning_rate": 3.727763362429756e-05, + "loss": 1.7274, + "step": 19368 + }, + { + "epoch": 5.945058317986494, + "grad_norm": 0.17239537835121155, + "learning_rate": 3.7272826725167164e-05, + "loss": 1.7194, + "step": 19369 + }, + { + "epoch": 5.94536525475752, + "grad_norm": 0.1910972148180008, + "learning_rate": 3.726801995181418e-05, + "loss": 1.7017, + "step": 19370 + }, + { + "epoch": 5.945672191528545, + "grad_norm": 0.18822111189365387, + "learning_rate": 3.726321330428606e-05, + "loss": 1.723, + "step": 19371 + }, + { + "epoch": 5.94597912829957, + "grad_norm": 0.19680333137512207, + "learning_rate": 3.725840678263035e-05, + "loss": 1.685, + "step": 19372 + }, + { + "epoch": 5.946286065070596, + "grad_norm": 0.19016215205192566, + "learning_rate": 3.725360038689451e-05, + "loss": 1.7148, + "step": 19373 + }, + { + "epoch": 5.94659300184162, + "grad_norm": 0.1992037147283554, + "learning_rate": 3.7248794117126075e-05, + "loss": 1.7278, + "step": 19374 + }, + { + "epoch": 5.9468999386126455, + "grad_norm": 0.1892910748720169, + "learning_rate": 3.724398797337252e-05, + "loss": 1.7093, + "step": 19375 + }, + { + "epoch": 5.947206875383671, + "grad_norm": 0.23379561305046082, + "learning_rate": 3.723918195568137e-05, + "loss": 1.768, + "step": 19376 + }, + { + "epoch": 5.947513812154696, + "grad_norm": 
0.1986081600189209, + "learning_rate": 3.7234376064100104e-05, + "loss": 1.719, + "step": 19377 + }, + { + "epoch": 5.9478207489257215, + "grad_norm": 0.20901642739772797, + "learning_rate": 3.7229570298676195e-05, + "loss": 1.7066, + "step": 19378 + }, + { + "epoch": 5.948127685696747, + "grad_norm": 0.2102847546339035, + "learning_rate": 3.722476465945718e-05, + "loss": 1.7354, + "step": 19379 + }, + { + "epoch": 5.948434622467771, + "grad_norm": 0.1857316792011261, + "learning_rate": 3.72199591464905e-05, + "loss": 1.7159, + "step": 19380 + }, + { + "epoch": 5.948741559238797, + "grad_norm": 0.3045661151409149, + "learning_rate": 3.721515375982371e-05, + "loss": 1.8782, + "step": 19381 + }, + { + "epoch": 5.949048496009822, + "grad_norm": 0.24114711582660675, + "learning_rate": 3.7210348499504236e-05, + "loss": 1.6819, + "step": 19382 + }, + { + "epoch": 5.949355432780847, + "grad_norm": 0.20186996459960938, + "learning_rate": 3.720554336557961e-05, + "loss": 1.8028, + "step": 19383 + }, + { + "epoch": 5.949662369551873, + "grad_norm": 0.25385335087776184, + "learning_rate": 3.7200738358097295e-05, + "loss": 1.7278, + "step": 19384 + }, + { + "epoch": 5.949969306322897, + "grad_norm": 0.23390468955039978, + "learning_rate": 3.719593347710478e-05, + "loss": 1.7775, + "step": 19385 + }, + { + "epoch": 5.9502762430939224, + "grad_norm": 0.22577936947345734, + "learning_rate": 3.719112872264956e-05, + "loss": 1.7567, + "step": 19386 + }, + { + "epoch": 5.950583179864948, + "grad_norm": 0.2540932297706604, + "learning_rate": 3.718632409477912e-05, + "loss": 1.6749, + "step": 19387 + }, + { + "epoch": 5.950890116635973, + "grad_norm": 0.1994820535182953, + "learning_rate": 3.718151959354093e-05, + "loss": 1.6809, + "step": 19388 + }, + { + "epoch": 5.9511970534069984, + "grad_norm": 0.27669432759284973, + "learning_rate": 3.717671521898249e-05, + "loss": 1.7633, + "step": 19389 + }, + { + "epoch": 5.951503990178024, + "grad_norm": 0.2533062994480133, + 
"learning_rate": 3.717191097115125e-05, + "loss": 1.7536, + "step": 19390 + }, + { + "epoch": 5.951810926949048, + "grad_norm": 0.22249148786067963, + "learning_rate": 3.716710685009471e-05, + "loss": 1.7325, + "step": 19391 + }, + { + "epoch": 5.952117863720074, + "grad_norm": 0.3085922598838806, + "learning_rate": 3.716230285586033e-05, + "loss": 1.7046, + "step": 19392 + }, + { + "epoch": 5.952424800491099, + "grad_norm": 0.2591574192047119, + "learning_rate": 3.715749898849562e-05, + "loss": 1.7165, + "step": 19393 + }, + { + "epoch": 5.952731737262124, + "grad_norm": 0.24586348235607147, + "learning_rate": 3.715269524804803e-05, + "loss": 1.749, + "step": 19394 + }, + { + "epoch": 5.953038674033149, + "grad_norm": 0.3424640893936157, + "learning_rate": 3.714789163456502e-05, + "loss": 1.7143, + "step": 19395 + }, + { + "epoch": 5.953345610804174, + "grad_norm": 0.24856910109519958, + "learning_rate": 3.714308814809408e-05, + "loss": 1.868, + "step": 19396 + }, + { + "epoch": 5.953652547575199, + "grad_norm": 0.2758113145828247, + "learning_rate": 3.7138284788682676e-05, + "loss": 1.6722, + "step": 19397 + }, + { + "epoch": 5.953959484346225, + "grad_norm": 0.25981786847114563, + "learning_rate": 3.71334815563783e-05, + "loss": 1.764, + "step": 19398 + }, + { + "epoch": 5.95426642111725, + "grad_norm": 0.27885568141937256, + "learning_rate": 3.7128678451228385e-05, + "loss": 1.7422, + "step": 19399 + }, + { + "epoch": 5.954573357888275, + "grad_norm": 0.2909421920776367, + "learning_rate": 3.712387547328042e-05, + "loss": 1.7862, + "step": 19400 + }, + { + "epoch": 5.9548802946593, + "grad_norm": 0.2288074642419815, + "learning_rate": 3.711907262258185e-05, + "loss": 1.7054, + "step": 19401 + }, + { + "epoch": 5.955187231430325, + "grad_norm": 0.2986883819103241, + "learning_rate": 3.711426989918017e-05, + "loss": 1.7555, + "step": 19402 + }, + { + "epoch": 5.9554941682013505, + "grad_norm": 0.23201194405555725, + "learning_rate": 3.710946730312281e-05, + 
"loss": 1.8186, + "step": 19403 + }, + { + "epoch": 5.955801104972376, + "grad_norm": 0.2609403431415558, + "learning_rate": 3.710466483445728e-05, + "loss": 1.7743, + "step": 19404 + }, + { + "epoch": 5.956108041743401, + "grad_norm": 0.31131741404533386, + "learning_rate": 3.709986249323098e-05, + "loss": 1.7938, + "step": 19405 + }, + { + "epoch": 5.956414978514426, + "grad_norm": 0.20544753968715668, + "learning_rate": 3.7095060279491424e-05, + "loss": 1.7278, + "step": 19406 + }, + { + "epoch": 5.956721915285451, + "grad_norm": 0.3063479959964752, + "learning_rate": 3.709025819328602e-05, + "loss": 1.7544, + "step": 19407 + }, + { + "epoch": 5.957028852056476, + "grad_norm": 0.34868693351745605, + "learning_rate": 3.708545623466227e-05, + "loss": 1.7536, + "step": 19408 + }, + { + "epoch": 5.957335788827502, + "grad_norm": 0.20847822725772858, + "learning_rate": 3.70806544036676e-05, + "loss": 1.7003, + "step": 19409 + }, + { + "epoch": 5.957642725598527, + "grad_norm": 0.3250095844268799, + "learning_rate": 3.707585270034949e-05, + "loss": 1.6815, + "step": 19410 + }, + { + "epoch": 5.957949662369552, + "grad_norm": 0.24854284524917603, + "learning_rate": 3.707105112475539e-05, + "loss": 1.7665, + "step": 19411 + }, + { + "epoch": 5.958256599140577, + "grad_norm": 0.2921455502510071, + "learning_rate": 3.706624967693271e-05, + "loss": 1.7039, + "step": 19412 + }, + { + "epoch": 5.958563535911602, + "grad_norm": 0.2659071385860443, + "learning_rate": 3.706144835692894e-05, + "loss": 1.7641, + "step": 19413 + }, + { + "epoch": 5.958870472682627, + "grad_norm": 0.30329519510269165, + "learning_rate": 3.7056647164791516e-05, + "loss": 1.7962, + "step": 19414 + }, + { + "epoch": 5.959177409453653, + "grad_norm": 0.4023756682872772, + "learning_rate": 3.7051846100567906e-05, + "loss": 1.7624, + "step": 19415 + }, + { + "epoch": 5.959484346224678, + "grad_norm": 0.24528828263282776, + "learning_rate": 3.704704516430553e-05, + "loss": 1.8156, + "step": 19416 + }, + { 
+ "epoch": 5.9597912829957025, + "grad_norm": 0.46833130717277527, + "learning_rate": 3.704224435605186e-05, + "loss": 1.798, + "step": 19417 + }, + { + "epoch": 5.960098219766728, + "grad_norm": 0.26952674984931946, + "learning_rate": 3.70374436758543e-05, + "loss": 1.743, + "step": 19418 + }, + { + "epoch": 5.960405156537753, + "grad_norm": 0.3126155734062195, + "learning_rate": 3.703264312376034e-05, + "loss": 1.8003, + "step": 19419 + }, + { + "epoch": 5.9607120933087785, + "grad_norm": 0.2833348512649536, + "learning_rate": 3.702784269981738e-05, + "loss": 1.7524, + "step": 19420 + }, + { + "epoch": 5.961019030079804, + "grad_norm": 0.25425654649734497, + "learning_rate": 3.7023042404072916e-05, + "loss": 1.7241, + "step": 19421 + }, + { + "epoch": 5.961325966850829, + "grad_norm": 0.29460933804512024, + "learning_rate": 3.701824223657433e-05, + "loss": 1.676, + "step": 19422 + }, + { + "epoch": 5.961632903621854, + "grad_norm": 0.21040670573711395, + "learning_rate": 3.7013442197369094e-05, + "loss": 1.71, + "step": 19423 + }, + { + "epoch": 5.961939840392879, + "grad_norm": 0.3200007379055023, + "learning_rate": 3.7008642286504624e-05, + "loss": 1.7108, + "step": 19424 + }, + { + "epoch": 5.962246777163904, + "grad_norm": 0.20397430658340454, + "learning_rate": 3.7003842504028366e-05, + "loss": 1.7472, + "step": 19425 + }, + { + "epoch": 5.96255371393493, + "grad_norm": 0.24811354279518127, + "learning_rate": 3.699904284998776e-05, + "loss": 1.7116, + "step": 19426 + }, + { + "epoch": 5.962860650705955, + "grad_norm": 0.20980580151081085, + "learning_rate": 3.699424332443023e-05, + "loss": 1.786, + "step": 19427 + }, + { + "epoch": 5.963167587476979, + "grad_norm": 0.1967400163412094, + "learning_rate": 3.698944392740322e-05, + "loss": 1.7141, + "step": 19428 + }, + { + "epoch": 5.963474524248005, + "grad_norm": 0.21907822787761688, + "learning_rate": 3.698464465895414e-05, + "loss": 1.6983, + "step": 19429 + }, + { + "epoch": 5.96378146101903, + 
"grad_norm": 0.19938960671424866, + "learning_rate": 3.697984551913043e-05, + "loss": 1.6811, + "step": 19430 + }, + { + "epoch": 5.964088397790055, + "grad_norm": 0.22280220687389374, + "learning_rate": 3.6975046507979506e-05, + "loss": 1.6838, + "step": 19431 + }, + { + "epoch": 5.964395334561081, + "grad_norm": 0.2530672550201416, + "learning_rate": 3.697024762554883e-05, + "loss": 1.8116, + "step": 19432 + }, + { + "epoch": 5.964702271332106, + "grad_norm": 0.21853135526180267, + "learning_rate": 3.696544887188579e-05, + "loss": 1.692, + "step": 19433 + }, + { + "epoch": 5.9650092081031305, + "grad_norm": 0.18738535046577454, + "learning_rate": 3.696065024703783e-05, + "loss": 1.6971, + "step": 19434 + }, + { + "epoch": 5.965316144874156, + "grad_norm": 0.21199190616607666, + "learning_rate": 3.695585175105236e-05, + "loss": 1.7526, + "step": 19435 + }, + { + "epoch": 5.965623081645181, + "grad_norm": 0.22184251248836517, + "learning_rate": 3.695105338397681e-05, + "loss": 1.8075, + "step": 19436 + }, + { + "epoch": 5.9659300184162065, + "grad_norm": 0.20191644132137299, + "learning_rate": 3.6946255145858605e-05, + "loss": 1.7427, + "step": 19437 + }, + { + "epoch": 5.966236955187231, + "grad_norm": 0.2113640457391739, + "learning_rate": 3.694145703674515e-05, + "loss": 1.7556, + "step": 19438 + }, + { + "epoch": 5.966543891958256, + "grad_norm": 0.21834735572338104, + "learning_rate": 3.693665905668387e-05, + "loss": 1.7673, + "step": 19439 + }, + { + "epoch": 5.966850828729282, + "grad_norm": 0.2260274887084961, + "learning_rate": 3.6931861205722197e-05, + "loss": 1.8168, + "step": 19440 + }, + { + "epoch": 5.967157765500307, + "grad_norm": 0.24090524017810822, + "learning_rate": 3.692706348390751e-05, + "loss": 1.821, + "step": 19441 + }, + { + "epoch": 5.967464702271332, + "grad_norm": 0.27469882369041443, + "learning_rate": 3.6922265891287256e-05, + "loss": 1.7114, + "step": 19442 + }, + { + "epoch": 5.967771639042358, + "grad_norm": 0.23479801416397095, + 
"learning_rate": 3.6917468427908833e-05, + "loss": 1.7334, + "step": 19443 + }, + { + "epoch": 5.968078575813382, + "grad_norm": 0.21109704673290253, + "learning_rate": 3.6912671093819663e-05, + "loss": 1.7047, + "step": 19444 + }, + { + "epoch": 5.968385512584407, + "grad_norm": 0.21141986548900604, + "learning_rate": 3.690787388906715e-05, + "loss": 1.6868, + "step": 19445 + }, + { + "epoch": 5.968692449355433, + "grad_norm": 0.21836397051811218, + "learning_rate": 3.690307681369868e-05, + "loss": 1.6923, + "step": 19446 + }, + { + "epoch": 5.968999386126458, + "grad_norm": 0.21733662486076355, + "learning_rate": 3.6898279867761695e-05, + "loss": 1.7699, + "step": 19447 + }, + { + "epoch": 5.969306322897483, + "grad_norm": 0.19220437109470367, + "learning_rate": 3.689348305130359e-05, + "loss": 1.7002, + "step": 19448 + }, + { + "epoch": 5.969613259668508, + "grad_norm": 0.22644726932048798, + "learning_rate": 3.688868636437176e-05, + "loss": 1.7024, + "step": 19449 + }, + { + "epoch": 5.969920196439533, + "grad_norm": 0.1832779198884964, + "learning_rate": 3.688388980701361e-05, + "loss": 1.699, + "step": 19450 + }, + { + "epoch": 5.9702271332105585, + "grad_norm": 0.20793284475803375, + "learning_rate": 3.687909337927658e-05, + "loss": 1.7557, + "step": 19451 + }, + { + "epoch": 5.970534069981584, + "grad_norm": 0.19485175609588623, + "learning_rate": 3.6874297081207995e-05, + "loss": 1.7641, + "step": 19452 + }, + { + "epoch": 5.970841006752609, + "grad_norm": 0.20980949699878693, + "learning_rate": 3.686950091285534e-05, + "loss": 1.7542, + "step": 19453 + }, + { + "epoch": 5.9711479435236345, + "grad_norm": 0.24902600049972534, + "learning_rate": 3.686470487426594e-05, + "loss": 1.7342, + "step": 19454 + }, + { + "epoch": 5.971454880294659, + "grad_norm": 0.20191124081611633, + "learning_rate": 3.685990896548724e-05, + "loss": 1.6844, + "step": 19455 + }, + { + "epoch": 5.971761817065684, + "grad_norm": 0.23217806220054626, + "learning_rate": 
3.685511318656662e-05, + "loss": 1.7054, + "step": 19456 + }, + { + "epoch": 5.97206875383671, + "grad_norm": 0.23383383452892303, + "learning_rate": 3.6850317537551484e-05, + "loss": 1.6903, + "step": 19457 + }, + { + "epoch": 5.972375690607735, + "grad_norm": 0.2147756665945053, + "learning_rate": 3.6845522018489196e-05, + "loss": 1.736, + "step": 19458 + }, + { + "epoch": 5.97268262737876, + "grad_norm": 0.23864400386810303, + "learning_rate": 3.68407266294272e-05, + "loss": 1.7483, + "step": 19459 + }, + { + "epoch": 5.972989564149785, + "grad_norm": 0.18702742457389832, + "learning_rate": 3.6835931370412836e-05, + "loss": 1.6874, + "step": 19460 + }, + { + "epoch": 5.97329650092081, + "grad_norm": 0.2167401760816574, + "learning_rate": 3.683113624149351e-05, + "loss": 1.652, + "step": 19461 + }, + { + "epoch": 5.973603437691835, + "grad_norm": 0.17105139791965485, + "learning_rate": 3.6826341242716636e-05, + "loss": 1.7029, + "step": 19462 + }, + { + "epoch": 5.973910374462861, + "grad_norm": 0.2189798206090927, + "learning_rate": 3.682154637412956e-05, + "loss": 1.7203, + "step": 19463 + }, + { + "epoch": 5.974217311233886, + "grad_norm": 0.17864444851875305, + "learning_rate": 3.68167516357797e-05, + "loss": 1.7176, + "step": 19464 + }, + { + "epoch": 5.974524248004911, + "grad_norm": 0.22356030344963074, + "learning_rate": 3.681195702771442e-05, + "loss": 1.7492, + "step": 19465 + }, + { + "epoch": 5.974831184775936, + "grad_norm": 0.19020728766918182, + "learning_rate": 3.68071625499811e-05, + "loss": 1.6925, + "step": 19466 + }, + { + "epoch": 5.975138121546961, + "grad_norm": 0.19092151522636414, + "learning_rate": 3.680236820262714e-05, + "loss": 1.7253, + "step": 19467 + }, + { + "epoch": 5.975445058317987, + "grad_norm": 0.20842085778713226, + "learning_rate": 3.6797573985699926e-05, + "loss": 1.7251, + "step": 19468 + }, + { + "epoch": 5.975751995089012, + "grad_norm": 0.2245844155550003, + "learning_rate": 3.6792779899246796e-05, + "loss": 1.7351, + 
"step": 19469 + }, + { + "epoch": 5.976058931860036, + "grad_norm": 0.18867328763008118, + "learning_rate": 3.678798594331519e-05, + "loss": 1.6646, + "step": 19470 + }, + { + "epoch": 5.976365868631062, + "grad_norm": 0.2892500162124634, + "learning_rate": 3.678319211795242e-05, + "loss": 1.7146, + "step": 19471 + }, + { + "epoch": 5.976672805402087, + "grad_norm": 0.22490514814853668, + "learning_rate": 3.677839842320591e-05, + "loss": 1.7147, + "step": 19472 + }, + { + "epoch": 5.976979742173112, + "grad_norm": 0.296724796295166, + "learning_rate": 3.677360485912301e-05, + "loss": 1.7714, + "step": 19473 + }, + { + "epoch": 5.977286678944138, + "grad_norm": 0.2784444987773895, + "learning_rate": 3.676881142575111e-05, + "loss": 1.7198, + "step": 19474 + }, + { + "epoch": 5.977593615715163, + "grad_norm": 0.20270293951034546, + "learning_rate": 3.676401812313755e-05, + "loss": 1.7336, + "step": 19475 + }, + { + "epoch": 5.9779005524861875, + "grad_norm": 0.23352907598018646, + "learning_rate": 3.6759224951329745e-05, + "loss": 1.7428, + "step": 19476 + }, + { + "epoch": 5.978207489257213, + "grad_norm": 0.1892426460981369, + "learning_rate": 3.675443191037502e-05, + "loss": 1.6636, + "step": 19477 + }, + { + "epoch": 5.978514426028238, + "grad_norm": 0.22216783463954926, + "learning_rate": 3.6749639000320766e-05, + "loss": 1.7446, + "step": 19478 + }, + { + "epoch": 5.9788213627992635, + "grad_norm": 0.19465389847755432, + "learning_rate": 3.6744846221214364e-05, + "loss": 1.7403, + "step": 19479 + }, + { + "epoch": 5.979128299570289, + "grad_norm": 0.1918177455663681, + "learning_rate": 3.674005357310314e-05, + "loss": 1.6974, + "step": 19480 + }, + { + "epoch": 5.979435236341313, + "grad_norm": 0.19065791368484497, + "learning_rate": 3.673526105603449e-05, + "loss": 1.7299, + "step": 19481 + }, + { + "epoch": 5.979742173112339, + "grad_norm": 0.24036844074726105, + "learning_rate": 3.673046867005575e-05, + "loss": 1.7441, + "step": 19482 + }, + { + "epoch": 
5.980049109883364, + "grad_norm": 0.22352568805217743, + "learning_rate": 3.6725676415214305e-05, + "loss": 1.7556, + "step": 19483 + }, + { + "epoch": 5.980356046654389, + "grad_norm": 0.2492935210466385, + "learning_rate": 3.67208842915575e-05, + "loss": 1.6833, + "step": 19484 + }, + { + "epoch": 5.980662983425415, + "grad_norm": 0.2554415762424469, + "learning_rate": 3.671609229913272e-05, + "loss": 1.7426, + "step": 19485 + }, + { + "epoch": 5.98096992019644, + "grad_norm": 0.24076475203037262, + "learning_rate": 3.671130043798728e-05, + "loss": 1.7362, + "step": 19486 + }, + { + "epoch": 5.981276856967464, + "grad_norm": 0.24297118186950684, + "learning_rate": 3.670650870816858e-05, + "loss": 1.7493, + "step": 19487 + }, + { + "epoch": 5.98158379373849, + "grad_norm": 0.19533030688762665, + "learning_rate": 3.6701717109723924e-05, + "loss": 1.7397, + "step": 19488 + }, + { + "epoch": 5.981890730509515, + "grad_norm": 0.24731193482875824, + "learning_rate": 3.669692564270071e-05, + "loss": 1.7483, + "step": 19489 + }, + { + "epoch": 5.98219766728054, + "grad_norm": 0.23274390399456024, + "learning_rate": 3.669213430714626e-05, + "loss": 1.7677, + "step": 19490 + }, + { + "epoch": 5.982504604051566, + "grad_norm": 0.180234894156456, + "learning_rate": 3.668734310310796e-05, + "loss": 1.7065, + "step": 19491 + }, + { + "epoch": 5.98281154082259, + "grad_norm": 0.19045281410217285, + "learning_rate": 3.6682552030633125e-05, + "loss": 1.7089, + "step": 19492 + }, + { + "epoch": 5.9831184775936155, + "grad_norm": 0.17261318862438202, + "learning_rate": 3.667776108976914e-05, + "loss": 1.7227, + "step": 19493 + }, + { + "epoch": 5.983425414364641, + "grad_norm": 0.2156316339969635, + "learning_rate": 3.667297028056329e-05, + "loss": 1.7025, + "step": 19494 + }, + { + "epoch": 5.983732351135666, + "grad_norm": 0.22288112342357635, + "learning_rate": 3.666817960306298e-05, + "loss": 1.7123, + "step": 19495 + }, + { + "epoch": 5.9840392879066915, + "grad_norm": 
0.21983082592487335, + "learning_rate": 3.6663389057315543e-05, + "loss": 1.7688, + "step": 19496 + }, + { + "epoch": 5.984346224677717, + "grad_norm": 0.1804746687412262, + "learning_rate": 3.665859864336829e-05, + "loss": 1.759, + "step": 19497 + }, + { + "epoch": 5.984653161448741, + "grad_norm": 0.22762230038642883, + "learning_rate": 3.6653808361268605e-05, + "loss": 1.8128, + "step": 19498 + }, + { + "epoch": 5.984960098219767, + "grad_norm": 0.21779340505599976, + "learning_rate": 3.664901821106379e-05, + "loss": 1.7316, + "step": 19499 + }, + { + "epoch": 5.985267034990792, + "grad_norm": 0.18899449706077576, + "learning_rate": 3.664422819280121e-05, + "loss": 1.7535, + "step": 19500 + }, + { + "epoch": 5.985573971761817, + "grad_norm": 0.22799427807331085, + "learning_rate": 3.663943830652819e-05, + "loss": 1.7626, + "step": 19501 + }, + { + "epoch": 5.985880908532843, + "grad_norm": 0.19936929643154144, + "learning_rate": 3.6634648552292086e-05, + "loss": 1.6887, + "step": 19502 + }, + { + "epoch": 5.986187845303867, + "grad_norm": 0.22482532262802124, + "learning_rate": 3.6629858930140206e-05, + "loss": 1.6867, + "step": 19503 + }, + { + "epoch": 5.986494782074892, + "grad_norm": 0.23543842136859894, + "learning_rate": 3.662506944011991e-05, + "loss": 1.7715, + "step": 19504 + }, + { + "epoch": 5.986801718845918, + "grad_norm": 0.230603888630867, + "learning_rate": 3.6620280082278495e-05, + "loss": 1.7514, + "step": 19505 + }, + { + "epoch": 5.987108655616943, + "grad_norm": 0.26767033338546753, + "learning_rate": 3.6615490856663334e-05, + "loss": 1.6862, + "step": 19506 + }, + { + "epoch": 5.987415592387968, + "grad_norm": 0.18282492458820343, + "learning_rate": 3.661070176332172e-05, + "loss": 1.6569, + "step": 19507 + }, + { + "epoch": 5.987722529158994, + "grad_norm": 0.255426824092865, + "learning_rate": 3.6605912802301016e-05, + "loss": 1.7623, + "step": 19508 + }, + { + "epoch": 5.988029465930018, + "grad_norm": 0.25026118755340576, + 
"learning_rate": 3.6601123973648524e-05, + "loss": 1.6907, + "step": 19509 + }, + { + "epoch": 5.9883364027010435, + "grad_norm": 0.19193407893180847, + "learning_rate": 3.659633527741159e-05, + "loss": 1.7647, + "step": 19510 + }, + { + "epoch": 5.988643339472069, + "grad_norm": 0.25562727451324463, + "learning_rate": 3.6591546713637506e-05, + "loss": 1.6806, + "step": 19511 + }, + { + "epoch": 5.988950276243094, + "grad_norm": 0.2296016663312912, + "learning_rate": 3.6586758282373624e-05, + "loss": 1.7747, + "step": 19512 + }, + { + "epoch": 5.989257213014119, + "grad_norm": 0.22875753045082092, + "learning_rate": 3.6581969983667275e-05, + "loss": 1.7847, + "step": 19513 + }, + { + "epoch": 5.989564149785144, + "grad_norm": 0.24469317495822906, + "learning_rate": 3.6577181817565736e-05, + "loss": 1.6784, + "step": 19514 + }, + { + "epoch": 5.989871086556169, + "grad_norm": 0.22855928540229797, + "learning_rate": 3.657239378411638e-05, + "loss": 1.788, + "step": 19515 + }, + { + "epoch": 5.990178023327195, + "grad_norm": 0.28745612502098083, + "learning_rate": 3.656760588336647e-05, + "loss": 1.6836, + "step": 19516 + }, + { + "epoch": 5.99048496009822, + "grad_norm": 0.18221193552017212, + "learning_rate": 3.656281811536337e-05, + "loss": 1.6687, + "step": 19517 + }, + { + "epoch": 5.990791896869245, + "grad_norm": 0.2556660771369934, + "learning_rate": 3.655803048015437e-05, + "loss": 1.7351, + "step": 19518 + }, + { + "epoch": 5.99109883364027, + "grad_norm": 0.18791422247886658, + "learning_rate": 3.6553242977786803e-05, + "loss": 1.6749, + "step": 19519 + }, + { + "epoch": 5.991405770411295, + "grad_norm": 0.28149592876434326, + "learning_rate": 3.654845560830796e-05, + "loss": 1.7333, + "step": 19520 + }, + { + "epoch": 5.99171270718232, + "grad_norm": 0.24631322920322418, + "learning_rate": 3.654366837176517e-05, + "loss": 1.7672, + "step": 19521 + }, + { + "epoch": 5.992019643953346, + "grad_norm": 0.22054782509803772, + "learning_rate": 
3.653888126820573e-05, + "loss": 1.7499, + "step": 19522 + }, + { + "epoch": 5.992326580724371, + "grad_norm": 0.23334862291812897, + "learning_rate": 3.653409429767696e-05, + "loss": 1.7133, + "step": 19523 + }, + { + "epoch": 5.9926335174953955, + "grad_norm": 0.19809292256832123, + "learning_rate": 3.6529307460226145e-05, + "loss": 1.6965, + "step": 19524 + }, + { + "epoch": 5.992940454266421, + "grad_norm": 0.23769772052764893, + "learning_rate": 3.652452075590064e-05, + "loss": 1.699, + "step": 19525 + }, + { + "epoch": 5.993247391037446, + "grad_norm": 0.19045031070709229, + "learning_rate": 3.6519734184747686e-05, + "loss": 1.7043, + "step": 19526 + }, + { + "epoch": 5.9935543278084715, + "grad_norm": 0.20795129239559174, + "learning_rate": 3.651494774681465e-05, + "loss": 1.7159, + "step": 19527 + }, + { + "epoch": 5.993861264579497, + "grad_norm": 0.1933370679616928, + "learning_rate": 3.651016144214878e-05, + "loss": 1.6999, + "step": 19528 + }, + { + "epoch": 5.994168201350522, + "grad_norm": 0.18360544741153717, + "learning_rate": 3.650537527079742e-05, + "loss": 1.7525, + "step": 19529 + }, + { + "epoch": 5.994475138121547, + "grad_norm": 0.21080785989761353, + "learning_rate": 3.650058923280786e-05, + "loss": 1.6832, + "step": 19530 + }, + { + "epoch": 5.994782074892572, + "grad_norm": 0.19701606035232544, + "learning_rate": 3.649580332822736e-05, + "loss": 1.7104, + "step": 19531 + }, + { + "epoch": 5.995089011663597, + "grad_norm": 0.24208703637123108, + "learning_rate": 3.6491017557103266e-05, + "loss": 1.726, + "step": 19532 + }, + { + "epoch": 5.995395948434623, + "grad_norm": 0.25981345772743225, + "learning_rate": 3.648623191948284e-05, + "loss": 1.7644, + "step": 19533 + }, + { + "epoch": 5.995702885205648, + "grad_norm": 0.24137455224990845, + "learning_rate": 3.64814464154134e-05, + "loss": 1.7354, + "step": 19534 + }, + { + "epoch": 5.996009821976672, + "grad_norm": 0.2140759378671646, + "learning_rate": 3.647666104494222e-05, + "loss": 
1.7244, + "step": 19535 + }, + { + "epoch": 5.996316758747698, + "grad_norm": 0.2801622748374939, + "learning_rate": 3.647187580811663e-05, + "loss": 1.6996, + "step": 19536 + }, + { + "epoch": 5.996623695518723, + "grad_norm": 0.21048817038536072, + "learning_rate": 3.6467090704983856e-05, + "loss": 1.7378, + "step": 19537 + }, + { + "epoch": 5.996930632289748, + "grad_norm": 0.2935819625854492, + "learning_rate": 3.6462305735591254e-05, + "loss": 1.7066, + "step": 19538 + }, + { + "epoch": 5.997237569060774, + "grad_norm": 0.22473880648612976, + "learning_rate": 3.645752089998606e-05, + "loss": 1.7539, + "step": 19539 + }, + { + "epoch": 5.997544505831799, + "grad_norm": 0.20606113970279694, + "learning_rate": 3.6452736198215585e-05, + "loss": 1.7338, + "step": 19540 + }, + { + "epoch": 5.9978514426028235, + "grad_norm": 0.2702842950820923, + "learning_rate": 3.6447951630327116e-05, + "loss": 1.7171, + "step": 19541 + }, + { + "epoch": 5.998158379373849, + "grad_norm": 0.19971637427806854, + "learning_rate": 3.6443167196367946e-05, + "loss": 1.7132, + "step": 19542 + }, + { + "epoch": 5.998465316144874, + "grad_norm": 0.2352653592824936, + "learning_rate": 3.643838289638531e-05, + "loss": 1.787, + "step": 19543 + }, + { + "epoch": 5.9987722529158995, + "grad_norm": 0.2324669510126114, + "learning_rate": 3.643359873042656e-05, + "loss": 1.7039, + "step": 19544 + }, + { + "epoch": 5.999079189686924, + "grad_norm": 0.1935029774904251, + "learning_rate": 3.6428814698538914e-05, + "loss": 1.6846, + "step": 19545 + }, + { + "epoch": 5.999386126457949, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.642403080076968e-05, + "loss": 1.7018, + "step": 19546 + }, + { + "epoch": 5.999693063228975, + "grad_norm": 0.19364693760871887, + "learning_rate": 3.6419247037166146e-05, + "loss": 1.6901, + "step": 19547 + }, + { + "epoch": 6.0, + "grad_norm": 0.23718556761741638, + "learning_rate": 3.641446340777556e-05, + "loss": 1.7743, + "step": 19548 + }, + { + "epoch": 
6.000306936771025, + "grad_norm": 0.23907634615898132, + "learning_rate": 3.640967991264521e-05, + "loss": 1.8225, + "step": 19549 + }, + { + "epoch": 6.000613873542051, + "grad_norm": 0.18895737826824188, + "learning_rate": 3.6404896551822365e-05, + "loss": 1.7004, + "step": 19550 + }, + { + "epoch": 6.000920810313075, + "grad_norm": 0.20192188024520874, + "learning_rate": 3.64001133253543e-05, + "loss": 1.7304, + "step": 19551 + }, + { + "epoch": 6.0012277470841005, + "grad_norm": 0.1961488425731659, + "learning_rate": 3.6395330233288285e-05, + "loss": 1.6839, + "step": 19552 + }, + { + "epoch": 6.001534683855126, + "grad_norm": 0.271635502576828, + "learning_rate": 3.639054727567161e-05, + "loss": 1.8182, + "step": 19553 + }, + { + "epoch": 6.001841620626151, + "grad_norm": 0.20838679373264313, + "learning_rate": 3.63857644525515e-05, + "loss": 1.7688, + "step": 19554 + }, + { + "epoch": 6.0021485573971765, + "grad_norm": 0.23661796748638153, + "learning_rate": 3.6380981763975266e-05, + "loss": 1.6785, + "step": 19555 + }, + { + "epoch": 6.002455494168202, + "grad_norm": 0.1728433072566986, + "learning_rate": 3.637619920999013e-05, + "loss": 1.6648, + "step": 19556 + }, + { + "epoch": 6.002762430939226, + "grad_norm": 0.2845853269100189, + "learning_rate": 3.6371416790643395e-05, + "loss": 1.7592, + "step": 19557 + }, + { + "epoch": 6.003069367710252, + "grad_norm": 0.3246566951274872, + "learning_rate": 3.636663450598229e-05, + "loss": 1.7045, + "step": 19558 + }, + { + "epoch": 6.003376304481277, + "grad_norm": 0.21857120096683502, + "learning_rate": 3.636185235605412e-05, + "loss": 1.756, + "step": 19559 + }, + { + "epoch": 6.003683241252302, + "grad_norm": 0.3583754599094391, + "learning_rate": 3.63570703409061e-05, + "loss": 1.6828, + "step": 19560 + }, + { + "epoch": 6.003990178023328, + "grad_norm": 0.25527241826057434, + "learning_rate": 3.635228846058552e-05, + "loss": 1.7611, + "step": 19561 + }, + { + "epoch": 6.004297114794352, + "grad_norm": 
0.29662930965423584, + "learning_rate": 3.6347506715139604e-05, + "loss": 1.747, + "step": 19562 + }, + { + "epoch": 6.004604051565377, + "grad_norm": 0.2588978707790375, + "learning_rate": 3.634272510461564e-05, + "loss": 1.7153, + "step": 19563 + }, + { + "epoch": 6.004910988336403, + "grad_norm": 0.23874366283416748, + "learning_rate": 3.633794362906089e-05, + "loss": 1.7285, + "step": 19564 + }, + { + "epoch": 6.005217925107428, + "grad_norm": 0.2898634374141693, + "learning_rate": 3.633316228852256e-05, + "loss": 1.7539, + "step": 19565 + }, + { + "epoch": 6.005524861878453, + "grad_norm": 0.2578127682209015, + "learning_rate": 3.6328381083047946e-05, + "loss": 1.7504, + "step": 19566 + }, + { + "epoch": 6.005831798649478, + "grad_norm": 0.3094595968723297, + "learning_rate": 3.632360001268427e-05, + "loss": 1.7076, + "step": 19567 + }, + { + "epoch": 6.006138735420503, + "grad_norm": 0.27825623750686646, + "learning_rate": 3.63188190774788e-05, + "loss": 1.7651, + "step": 19568 + }, + { + "epoch": 6.0064456721915285, + "grad_norm": 0.27732032537460327, + "learning_rate": 3.631403827747878e-05, + "loss": 1.7209, + "step": 19569 + }, + { + "epoch": 6.006752608962554, + "grad_norm": 0.36446672677993774, + "learning_rate": 3.6309257612731475e-05, + "loss": 1.7191, + "step": 19570 + }, + { + "epoch": 6.007059545733579, + "grad_norm": 0.19071432948112488, + "learning_rate": 3.6304477083284076e-05, + "loss": 1.6981, + "step": 19571 + }, + { + "epoch": 6.0073664825046045, + "grad_norm": 0.40523234009742737, + "learning_rate": 3.6299696689183895e-05, + "loss": 1.7259, + "step": 19572 + }, + { + "epoch": 6.007673419275629, + "grad_norm": 0.30279576778411865, + "learning_rate": 3.6294916430478116e-05, + "loss": 1.8017, + "step": 19573 + }, + { + "epoch": 6.007980356046654, + "grad_norm": 0.2944689989089966, + "learning_rate": 3.629013630721402e-05, + "loss": 1.7347, + "step": 19574 + }, + { + "epoch": 6.00828729281768, + "grad_norm": 0.3557213246822357, + 
"learning_rate": 3.6285356319438814e-05, + "loss": 1.7308, + "step": 19575 + }, + { + "epoch": 6.008594229588705, + "grad_norm": 0.19888661801815033, + "learning_rate": 3.628057646719978e-05, + "loss": 1.7571, + "step": 19576 + }, + { + "epoch": 6.00890116635973, + "grad_norm": 0.34002986550331116, + "learning_rate": 3.627579675054411e-05, + "loss": 1.7417, + "step": 19577 + }, + { + "epoch": 6.009208103130755, + "grad_norm": 0.2756921350955963, + "learning_rate": 3.627101716951908e-05, + "loss": 1.7351, + "step": 19578 + }, + { + "epoch": 6.00951503990178, + "grad_norm": 0.3520946502685547, + "learning_rate": 3.6266237724171885e-05, + "loss": 1.7056, + "step": 19579 + }, + { + "epoch": 6.009821976672805, + "grad_norm": 0.3673728406429291, + "learning_rate": 3.6261458414549786e-05, + "loss": 1.6388, + "step": 19580 + }, + { + "epoch": 6.010128913443831, + "grad_norm": 0.2247757613658905, + "learning_rate": 3.625667924070003e-05, + "loss": 1.7772, + "step": 19581 + }, + { + "epoch": 6.010435850214856, + "grad_norm": 0.4387452006340027, + "learning_rate": 3.6251900202669795e-05, + "loss": 1.7629, + "step": 19582 + }, + { + "epoch": 6.0107427869858805, + "grad_norm": 0.23595796525478363, + "learning_rate": 3.624712130050636e-05, + "loss": 1.8044, + "step": 19583 + }, + { + "epoch": 6.011049723756906, + "grad_norm": 0.31198835372924805, + "learning_rate": 3.624234253425691e-05, + "loss": 1.7623, + "step": 19584 + }, + { + "epoch": 6.011356660527931, + "grad_norm": 0.25283896923065186, + "learning_rate": 3.6237563903968705e-05, + "loss": 1.7771, + "step": 19585 + }, + { + "epoch": 6.0116635972989565, + "grad_norm": 0.2595483064651489, + "learning_rate": 3.6232785409688954e-05, + "loss": 1.7405, + "step": 19586 + }, + { + "epoch": 6.011970534069982, + "grad_norm": 0.302273690700531, + "learning_rate": 3.622800705146491e-05, + "loss": 1.7236, + "step": 19587 + }, + { + "epoch": 6.012277470841007, + "grad_norm": 0.20444928109645844, + "learning_rate": 
3.622322882934375e-05, + "loss": 1.6863, + "step": 19588 + }, + { + "epoch": 6.012584407612032, + "grad_norm": 0.2682531774044037, + "learning_rate": 3.621845074337273e-05, + "loss": 1.752, + "step": 19589 + }, + { + "epoch": 6.012891344383057, + "grad_norm": 0.25617173314094543, + "learning_rate": 3.621367279359905e-05, + "loss": 1.7496, + "step": 19590 + }, + { + "epoch": 6.013198281154082, + "grad_norm": 0.24514207243919373, + "learning_rate": 3.620889498006994e-05, + "loss": 1.6568, + "step": 19591 + }, + { + "epoch": 6.013505217925108, + "grad_norm": 0.2799128293991089, + "learning_rate": 3.6204117302832616e-05, + "loss": 1.7284, + "step": 19592 + }, + { + "epoch": 6.013812154696133, + "grad_norm": 0.2025543451309204, + "learning_rate": 3.619933976193428e-05, + "loss": 1.7172, + "step": 19593 + }, + { + "epoch": 6.014119091467157, + "grad_norm": 0.24697700142860413, + "learning_rate": 3.619456235742216e-05, + "loss": 1.7316, + "step": 19594 + }, + { + "epoch": 6.014426028238183, + "grad_norm": 0.2518150210380554, + "learning_rate": 3.618978508934348e-05, + "loss": 1.8183, + "step": 19595 + }, + { + "epoch": 6.014732965009208, + "grad_norm": 0.165326327085495, + "learning_rate": 3.618500795774542e-05, + "loss": 1.665, + "step": 19596 + }, + { + "epoch": 6.015039901780233, + "grad_norm": 0.19158180058002472, + "learning_rate": 3.6180230962675216e-05, + "loss": 1.7232, + "step": 19597 + }, + { + "epoch": 6.015346838551259, + "grad_norm": 0.19456413388252258, + "learning_rate": 3.6175454104180086e-05, + "loss": 1.7153, + "step": 19598 + }, + { + "epoch": 6.015653775322283, + "grad_norm": 0.233373761177063, + "learning_rate": 3.6170677382307195e-05, + "loss": 1.7914, + "step": 19599 + }, + { + "epoch": 6.0159607120933085, + "grad_norm": 0.18567882478237152, + "learning_rate": 3.6165900797103796e-05, + "loss": 1.6793, + "step": 19600 + }, + { + "epoch": 6.016267648864334, + "grad_norm": 0.2119273990392685, + "learning_rate": 3.616112434861706e-05, + "loss": 1.689, + 
"step": 19601 + }, + { + "epoch": 6.016574585635359, + "grad_norm": 0.1915217787027359, + "learning_rate": 3.61563480368942e-05, + "loss": 1.6835, + "step": 19602 + }, + { + "epoch": 6.0168815224063845, + "grad_norm": 0.24824760854244232, + "learning_rate": 3.615157186198244e-05, + "loss": 1.8411, + "step": 19603 + }, + { + "epoch": 6.01718845917741, + "grad_norm": 0.2198900282382965, + "learning_rate": 3.6146795823928955e-05, + "loss": 1.7311, + "step": 19604 + }, + { + "epoch": 6.017495395948434, + "grad_norm": 0.22993668913841248, + "learning_rate": 3.614201992278095e-05, + "loss": 1.7249, + "step": 19605 + }, + { + "epoch": 6.01780233271946, + "grad_norm": 0.20677974820137024, + "learning_rate": 3.613724415858564e-05, + "loss": 1.7137, + "step": 19606 + }, + { + "epoch": 6.018109269490485, + "grad_norm": 0.1844938099384308, + "learning_rate": 3.6132468531390184e-05, + "loss": 1.6512, + "step": 19607 + }, + { + "epoch": 6.01841620626151, + "grad_norm": 0.224154993891716, + "learning_rate": 3.6127693041241815e-05, + "loss": 1.7116, + "step": 19608 + }, + { + "epoch": 6.018723143032536, + "grad_norm": 0.17322199046611786, + "learning_rate": 3.612291768818772e-05, + "loss": 1.6743, + "step": 19609 + }, + { + "epoch": 6.01903007980356, + "grad_norm": 0.24451903998851776, + "learning_rate": 3.611814247227508e-05, + "loss": 1.8332, + "step": 19610 + }, + { + "epoch": 6.019337016574585, + "grad_norm": 0.1911642849445343, + "learning_rate": 3.611336739355109e-05, + "loss": 1.707, + "step": 19611 + }, + { + "epoch": 6.019643953345611, + "grad_norm": 0.20917518436908722, + "learning_rate": 3.6108592452062954e-05, + "loss": 1.7328, + "step": 19612 + }, + { + "epoch": 6.019950890116636, + "grad_norm": 0.2314450889825821, + "learning_rate": 3.610381764785784e-05, + "loss": 1.7575, + "step": 19613 + }, + { + "epoch": 6.020257826887661, + "grad_norm": 0.20701734721660614, + "learning_rate": 3.609904298098296e-05, + "loss": 1.6958, + "step": 19614 + }, + { + "epoch": 
6.020564763658686, + "grad_norm": 0.2494465857744217, + "learning_rate": 3.609426845148547e-05, + "loss": 1.706, + "step": 19615 + }, + { + "epoch": 6.020871700429711, + "grad_norm": 0.25842729210853577, + "learning_rate": 3.608949405941256e-05, + "loss": 1.7667, + "step": 19616 + }, + { + "epoch": 6.0211786372007365, + "grad_norm": 0.19831863045692444, + "learning_rate": 3.608471980481145e-05, + "loss": 1.7135, + "step": 19617 + }, + { + "epoch": 6.021485573971762, + "grad_norm": 0.21611735224723816, + "learning_rate": 3.607994568772927e-05, + "loss": 1.7416, + "step": 19618 + }, + { + "epoch": 6.021792510742787, + "grad_norm": 0.2356715202331543, + "learning_rate": 3.607517170821324e-05, + "loss": 1.7696, + "step": 19619 + }, + { + "epoch": 6.0220994475138125, + "grad_norm": 0.24737675487995148, + "learning_rate": 3.6070397866310514e-05, + "loss": 1.7189, + "step": 19620 + }, + { + "epoch": 6.022406384284837, + "grad_norm": 0.19260701537132263, + "learning_rate": 3.6065624162068284e-05, + "loss": 1.7292, + "step": 19621 + }, + { + "epoch": 6.022713321055862, + "grad_norm": 0.29366952180862427, + "learning_rate": 3.6060850595533716e-05, + "loss": 1.7875, + "step": 19622 + }, + { + "epoch": 6.023020257826888, + "grad_norm": 0.2038174718618393, + "learning_rate": 3.605607716675401e-05, + "loss": 1.6777, + "step": 19623 + }, + { + "epoch": 6.023327194597913, + "grad_norm": 0.28923583030700684, + "learning_rate": 3.605130387577631e-05, + "loss": 1.7175, + "step": 19624 + }, + { + "epoch": 6.023634131368938, + "grad_norm": 0.3004317283630371, + "learning_rate": 3.6046530722647816e-05, + "loss": 1.8059, + "step": 19625 + }, + { + "epoch": 6.023941068139963, + "grad_norm": 0.19832390546798706, + "learning_rate": 3.6041757707415666e-05, + "loss": 1.7197, + "step": 19626 + }, + { + "epoch": 6.024248004910988, + "grad_norm": 0.2782927453517914, + "learning_rate": 3.6036984830127054e-05, + "loss": 1.6563, + "step": 19627 + }, + { + "epoch": 6.024554941682013, + "grad_norm": 
0.20395785570144653, + "learning_rate": 3.603221209082913e-05, + "loss": 1.6972, + "step": 19628 + }, + { + "epoch": 6.024861878453039, + "grad_norm": 0.26302096247673035, + "learning_rate": 3.60274394895691e-05, + "loss": 1.7348, + "step": 19629 + }, + { + "epoch": 6.025168815224064, + "grad_norm": 0.26376327872276306, + "learning_rate": 3.6022667026394095e-05, + "loss": 1.7183, + "step": 19630 + }, + { + "epoch": 6.0254757519950894, + "grad_norm": 0.20590877532958984, + "learning_rate": 3.601789470135127e-05, + "loss": 1.7114, + "step": 19631 + }, + { + "epoch": 6.025782688766114, + "grad_norm": 0.2873607277870178, + "learning_rate": 3.6013122514487815e-05, + "loss": 1.7598, + "step": 19632 + }, + { + "epoch": 6.026089625537139, + "grad_norm": 0.24324963986873627, + "learning_rate": 3.600835046585087e-05, + "loss": 1.8844, + "step": 19633 + }, + { + "epoch": 6.026396562308165, + "grad_norm": 0.27910730242729187, + "learning_rate": 3.6003578555487624e-05, + "loss": 1.8598, + "step": 19634 + }, + { + "epoch": 6.02670349907919, + "grad_norm": 0.22766844928264618, + "learning_rate": 3.59988067834452e-05, + "loss": 1.7281, + "step": 19635 + }, + { + "epoch": 6.027010435850215, + "grad_norm": 0.2390190064907074, + "learning_rate": 3.5994035149770804e-05, + "loss": 1.7355, + "step": 19636 + }, + { + "epoch": 6.02731737262124, + "grad_norm": 0.23422548174858093, + "learning_rate": 3.598926365451153e-05, + "loss": 1.7226, + "step": 19637 + }, + { + "epoch": 6.027624309392265, + "grad_norm": 0.20240288972854614, + "learning_rate": 3.598449229771458e-05, + "loss": 1.7523, + "step": 19638 + }, + { + "epoch": 6.02793124616329, + "grad_norm": 0.26388832926750183, + "learning_rate": 3.597972107942708e-05, + "loss": 1.7003, + "step": 19639 + }, + { + "epoch": 6.028238182934316, + "grad_norm": 0.19814053177833557, + "learning_rate": 3.597494999969622e-05, + "loss": 1.7087, + "step": 19640 + }, + { + "epoch": 6.028545119705341, + "grad_norm": 0.2779136896133423, + "learning_rate": 
3.5970179058569095e-05, + "loss": 1.7581, + "step": 19641 + }, + { + "epoch": 6.0288520564763655, + "grad_norm": 0.220394566655159, + "learning_rate": 3.5965408256092905e-05, + "loss": 1.7236, + "step": 19642 + }, + { + "epoch": 6.029158993247391, + "grad_norm": 0.28568828105926514, + "learning_rate": 3.596063759231476e-05, + "loss": 1.7933, + "step": 19643 + }, + { + "epoch": 6.029465930018416, + "grad_norm": 0.19509564340114594, + "learning_rate": 3.595586706728183e-05, + "loss": 1.6803, + "step": 19644 + }, + { + "epoch": 6.0297728667894415, + "grad_norm": 0.30855104327201843, + "learning_rate": 3.595109668104124e-05, + "loss": 1.7345, + "step": 19645 + }, + { + "epoch": 6.030079803560467, + "grad_norm": 0.24195496737957, + "learning_rate": 3.5946326433640174e-05, + "loss": 1.7493, + "step": 19646 + }, + { + "epoch": 6.030386740331492, + "grad_norm": 0.28324684500694275, + "learning_rate": 3.5941556325125744e-05, + "loss": 1.7959, + "step": 19647 + }, + { + "epoch": 6.030693677102517, + "grad_norm": 0.25351646542549133, + "learning_rate": 3.593678635554508e-05, + "loss": 1.7298, + "step": 19648 + }, + { + "epoch": 6.031000613873542, + "grad_norm": 0.2608177959918976, + "learning_rate": 3.593201652494534e-05, + "loss": 1.7072, + "step": 19649 + }, + { + "epoch": 6.031307550644567, + "grad_norm": 0.3182333707809448, + "learning_rate": 3.592724683337365e-05, + "loss": 1.6976, + "step": 19650 + }, + { + "epoch": 6.031614487415593, + "grad_norm": 0.19296859204769135, + "learning_rate": 3.592247728087717e-05, + "loss": 1.6879, + "step": 19651 + }, + { + "epoch": 6.031921424186618, + "grad_norm": 0.3927764594554901, + "learning_rate": 3.591770786750301e-05, + "loss": 1.6824, + "step": 19652 + }, + { + "epoch": 6.032228360957642, + "grad_norm": 0.23609496653079987, + "learning_rate": 3.591293859329833e-05, + "loss": 1.7224, + "step": 19653 + }, + { + "epoch": 6.032535297728668, + "grad_norm": 0.40787333250045776, + "learning_rate": 3.590816945831023e-05, + "loss": 
1.7206, + "step": 19654 + }, + { + "epoch": 6.032842234499693, + "grad_norm": 0.31101885437965393, + "learning_rate": 3.590340046258586e-05, + "loss": 1.7446, + "step": 19655 + }, + { + "epoch": 6.033149171270718, + "grad_norm": 0.19401656091213226, + "learning_rate": 3.589863160617235e-05, + "loss": 1.6778, + "step": 19656 + }, + { + "epoch": 6.033456108041744, + "grad_norm": 0.3309115469455719, + "learning_rate": 3.589386288911684e-05, + "loss": 1.7196, + "step": 19657 + }, + { + "epoch": 6.033763044812768, + "grad_norm": 0.22281408309936523, + "learning_rate": 3.588909431146643e-05, + "loss": 1.7122, + "step": 19658 + }, + { + "epoch": 6.0340699815837935, + "grad_norm": 0.2903781831264496, + "learning_rate": 3.5884325873268275e-05, + "loss": 1.7428, + "step": 19659 + }, + { + "epoch": 6.034376918354819, + "grad_norm": 0.2529856562614441, + "learning_rate": 3.587955757456947e-05, + "loss": 1.7075, + "step": 19660 + }, + { + "epoch": 6.034683855125844, + "grad_norm": 0.2445102334022522, + "learning_rate": 3.587478941541716e-05, + "loss": 1.6631, + "step": 19661 + }, + { + "epoch": 6.0349907918968695, + "grad_norm": 0.31834688782691956, + "learning_rate": 3.5870021395858454e-05, + "loss": 1.7009, + "step": 19662 + }, + { + "epoch": 6.035297728667895, + "grad_norm": 0.20666317641735077, + "learning_rate": 3.5865253515940496e-05, + "loss": 1.7252, + "step": 19663 + }, + { + "epoch": 6.035604665438919, + "grad_norm": 0.3070019483566284, + "learning_rate": 3.586048577571039e-05, + "loss": 1.7139, + "step": 19664 + }, + { + "epoch": 6.035911602209945, + "grad_norm": 0.22463096678256989, + "learning_rate": 3.585571817521522e-05, + "loss": 1.7574, + "step": 19665 + }, + { + "epoch": 6.03621853898097, + "grad_norm": 0.25405722856521606, + "learning_rate": 3.585095071450216e-05, + "loss": 1.7135, + "step": 19666 + }, + { + "epoch": 6.036525475751995, + "grad_norm": 0.24543432891368866, + "learning_rate": 3.584618339361828e-05, + "loss": 1.7312, + "step": 19667 + }, + { + 
"epoch": 6.036832412523021, + "grad_norm": 0.2454189658164978, + "learning_rate": 3.584141621261073e-05, + "loss": 1.7905, + "step": 19668 + }, + { + "epoch": 6.037139349294045, + "grad_norm": 0.2163272649049759, + "learning_rate": 3.583664917152658e-05, + "loss": 1.7042, + "step": 19669 + }, + { + "epoch": 6.03744628606507, + "grad_norm": 0.2088690549135208, + "learning_rate": 3.5831882270412994e-05, + "loss": 1.7905, + "step": 19670 + }, + { + "epoch": 6.037753222836096, + "grad_norm": 0.26145869493484497, + "learning_rate": 3.5827115509317024e-05, + "loss": 1.7487, + "step": 19671 + }, + { + "epoch": 6.038060159607121, + "grad_norm": 0.20306496322155, + "learning_rate": 3.582234888828582e-05, + "loss": 1.7103, + "step": 19672 + }, + { + "epoch": 6.038367096378146, + "grad_norm": 0.2504192292690277, + "learning_rate": 3.5817582407366454e-05, + "loss": 1.7397, + "step": 19673 + }, + { + "epoch": 6.038674033149171, + "grad_norm": 0.22803208231925964, + "learning_rate": 3.5812816066606084e-05, + "loss": 1.7105, + "step": 19674 + }, + { + "epoch": 6.038980969920196, + "grad_norm": 0.24963071942329407, + "learning_rate": 3.580804986605176e-05, + "loss": 1.734, + "step": 19675 + }, + { + "epoch": 6.0392879066912215, + "grad_norm": 0.2468494027853012, + "learning_rate": 3.580328380575062e-05, + "loss": 1.6866, + "step": 19676 + }, + { + "epoch": 6.039594843462247, + "grad_norm": 0.17628586292266846, + "learning_rate": 3.579851788574973e-05, + "loss": 1.7106, + "step": 19677 + }, + { + "epoch": 6.039901780233272, + "grad_norm": 0.23965299129486084, + "learning_rate": 3.579375210609622e-05, + "loss": 1.7675, + "step": 19678 + }, + { + "epoch": 6.0402087170042975, + "grad_norm": 0.19638453423976898, + "learning_rate": 3.5788986466837175e-05, + "loss": 1.7242, + "step": 19679 + }, + { + "epoch": 6.040515653775322, + "grad_norm": 0.2602851092815399, + "learning_rate": 3.578422096801971e-05, + "loss": 1.7287, + "step": 19680 + }, + { + "epoch": 6.040822590546347, + 
"grad_norm": 0.25868186354637146, + "learning_rate": 3.577945560969091e-05, + "loss": 1.7604, + "step": 19681 + }, + { + "epoch": 6.041129527317373, + "grad_norm": 0.1996527463197708, + "learning_rate": 3.577469039189784e-05, + "loss": 1.7469, + "step": 19682 + }, + { + "epoch": 6.041436464088398, + "grad_norm": 0.29909980297088623, + "learning_rate": 3.576992531468763e-05, + "loss": 1.682, + "step": 19683 + }, + { + "epoch": 6.041743400859423, + "grad_norm": 0.20064286887645721, + "learning_rate": 3.576516037810734e-05, + "loss": 1.7125, + "step": 19684 + }, + { + "epoch": 6.042050337630448, + "grad_norm": 0.2134515345096588, + "learning_rate": 3.576039558220411e-05, + "loss": 1.7371, + "step": 19685 + }, + { + "epoch": 6.042357274401473, + "grad_norm": 0.20365437865257263, + "learning_rate": 3.575563092702497e-05, + "loss": 1.7446, + "step": 19686 + }, + { + "epoch": 6.042664211172498, + "grad_norm": 0.24526065587997437, + "learning_rate": 3.5750866412617054e-05, + "loss": 1.759, + "step": 19687 + }, + { + "epoch": 6.042971147943524, + "grad_norm": 0.24521295726299286, + "learning_rate": 3.5746102039027414e-05, + "loss": 1.7589, + "step": 19688 + }, + { + "epoch": 6.043278084714549, + "grad_norm": 0.2151515632867813, + "learning_rate": 3.5741337806303155e-05, + "loss": 1.761, + "step": 19689 + }, + { + "epoch": 6.043585021485574, + "grad_norm": 0.25733521580696106, + "learning_rate": 3.573657371449134e-05, + "loss": 1.7171, + "step": 19690 + }, + { + "epoch": 6.043891958256599, + "grad_norm": 0.18520839512348175, + "learning_rate": 3.5731809763639084e-05, + "loss": 1.6691, + "step": 19691 + }, + { + "epoch": 6.044198895027624, + "grad_norm": 0.24617944657802582, + "learning_rate": 3.572704595379342e-05, + "loss": 1.7869, + "step": 19692 + }, + { + "epoch": 6.0445058317986495, + "grad_norm": 0.20246629416942596, + "learning_rate": 3.5722282285001493e-05, + "loss": 1.7667, + "step": 19693 + }, + { + "epoch": 6.044812768569675, + "grad_norm": 0.21190209686756134, + 
"learning_rate": 3.5717518757310305e-05, + "loss": 1.6839, + "step": 19694 + }, + { + "epoch": 6.0451197053407, + "grad_norm": 0.19021087884902954, + "learning_rate": 3.571275537076699e-05, + "loss": 1.7023, + "step": 19695 + }, + { + "epoch": 6.045426642111725, + "grad_norm": 0.1793040931224823, + "learning_rate": 3.570799212541858e-05, + "loss": 1.7022, + "step": 19696 + }, + { + "epoch": 6.04573357888275, + "grad_norm": 0.19105301797389984, + "learning_rate": 3.570322902131219e-05, + "loss": 1.7151, + "step": 19697 + }, + { + "epoch": 6.046040515653775, + "grad_norm": 0.22083842754364014, + "learning_rate": 3.569846605849487e-05, + "loss": 1.7097, + "step": 19698 + }, + { + "epoch": 6.046347452424801, + "grad_norm": 0.2607622444629669, + "learning_rate": 3.569370323701368e-05, + "loss": 1.7508, + "step": 19699 + }, + { + "epoch": 6.046654389195826, + "grad_norm": 0.22349929809570312, + "learning_rate": 3.56889405569157e-05, + "loss": 1.7131, + "step": 19700 + }, + { + "epoch": 6.04696132596685, + "grad_norm": 0.19442661106586456, + "learning_rate": 3.5684178018247996e-05, + "loss": 1.7476, + "step": 19701 + }, + { + "epoch": 6.047268262737876, + "grad_norm": 0.2002776861190796, + "learning_rate": 3.5679415621057646e-05, + "loss": 1.7982, + "step": 19702 + }, + { + "epoch": 6.047575199508901, + "grad_norm": 0.21558646857738495, + "learning_rate": 3.567465336539169e-05, + "loss": 1.7231, + "step": 19703 + }, + { + "epoch": 6.047882136279926, + "grad_norm": 0.20468449592590332, + "learning_rate": 3.5669891251297224e-05, + "loss": 1.6426, + "step": 19704 + }, + { + "epoch": 6.048189073050952, + "grad_norm": 0.23098553717136383, + "learning_rate": 3.566512927882127e-05, + "loss": 1.7763, + "step": 19705 + }, + { + "epoch": 6.048496009821977, + "grad_norm": 0.22959274053573608, + "learning_rate": 3.566036744801092e-05, + "loss": 1.7663, + "step": 19706 + }, + { + "epoch": 6.0488029465930016, + "grad_norm": 0.18519435822963715, + "learning_rate": 
3.5655605758913215e-05, + "loss": 1.6995, + "step": 19707 + }, + { + "epoch": 6.049109883364027, + "grad_norm": 0.2529381513595581, + "learning_rate": 3.565084421157524e-05, + "loss": 1.754, + "step": 19708 + }, + { + "epoch": 6.049416820135052, + "grad_norm": 0.2208617776632309, + "learning_rate": 3.5646082806044015e-05, + "loss": 1.6939, + "step": 19709 + }, + { + "epoch": 6.0497237569060776, + "grad_norm": 0.18433862924575806, + "learning_rate": 3.564132154236663e-05, + "loss": 1.7145, + "step": 19710 + }, + { + "epoch": 6.050030693677103, + "grad_norm": 0.1963127702474594, + "learning_rate": 3.563656042059011e-05, + "loss": 1.7101, + "step": 19711 + }, + { + "epoch": 6.050337630448127, + "grad_norm": 0.19860461354255676, + "learning_rate": 3.5631799440761526e-05, + "loss": 1.7218, + "step": 19712 + }, + { + "epoch": 6.050644567219153, + "grad_norm": 0.19304174184799194, + "learning_rate": 3.5627038602927905e-05, + "loss": 1.7575, + "step": 19713 + }, + { + "epoch": 6.050951503990178, + "grad_norm": 0.20402809977531433, + "learning_rate": 3.5622277907136335e-05, + "loss": 1.7438, + "step": 19714 + }, + { + "epoch": 6.051258440761203, + "grad_norm": 0.20821911096572876, + "learning_rate": 3.5617517353433844e-05, + "loss": 1.7381, + "step": 19715 + }, + { + "epoch": 6.051565377532229, + "grad_norm": 0.24375931918621063, + "learning_rate": 3.561275694186745e-05, + "loss": 1.8377, + "step": 19716 + }, + { + "epoch": 6.051872314303253, + "grad_norm": 0.19745339453220367, + "learning_rate": 3.560799667248424e-05, + "loss": 1.6839, + "step": 19717 + }, + { + "epoch": 6.0521792510742785, + "grad_norm": 0.2039431631565094, + "learning_rate": 3.560323654533124e-05, + "loss": 1.692, + "step": 19718 + }, + { + "epoch": 6.052486187845304, + "grad_norm": 0.23229047656059265, + "learning_rate": 3.559847656045551e-05, + "loss": 1.7408, + "step": 19719 + }, + { + "epoch": 6.052793124616329, + "grad_norm": 0.20387259125709534, + "learning_rate": 3.559371671790404e-05, + "loss": 
1.7215, + "step": 19720 + }, + { + "epoch": 6.0531000613873545, + "grad_norm": 0.23960062861442566, + "learning_rate": 3.5588957017723944e-05, + "loss": 1.8048, + "step": 19721 + }, + { + "epoch": 6.05340699815838, + "grad_norm": 0.1979944109916687, + "learning_rate": 3.5584197459962196e-05, + "loss": 1.7307, + "step": 19722 + }, + { + "epoch": 6.053713934929404, + "grad_norm": 0.21914203464984894, + "learning_rate": 3.557943804466586e-05, + "loss": 1.6999, + "step": 19723 + }, + { + "epoch": 6.05402087170043, + "grad_norm": 0.22338175773620605, + "learning_rate": 3.557467877188197e-05, + "loss": 1.6977, + "step": 19724 + }, + { + "epoch": 6.054327808471455, + "grad_norm": 0.2692863643169403, + "learning_rate": 3.5569919641657576e-05, + "loss": 1.7664, + "step": 19725 + }, + { + "epoch": 6.05463474524248, + "grad_norm": 0.2882823944091797, + "learning_rate": 3.5565160654039675e-05, + "loss": 1.6943, + "step": 19726 + }, + { + "epoch": 6.054941682013506, + "grad_norm": 0.2114996612071991, + "learning_rate": 3.5560401809075336e-05, + "loss": 1.7426, + "step": 19727 + }, + { + "epoch": 6.05524861878453, + "grad_norm": 0.19616106152534485, + "learning_rate": 3.5555643106811546e-05, + "loss": 1.6616, + "step": 19728 + }, + { + "epoch": 6.055555555555555, + "grad_norm": 0.241346076130867, + "learning_rate": 3.555088454729537e-05, + "loss": 1.7423, + "step": 19729 + }, + { + "epoch": 6.055862492326581, + "grad_norm": 0.24495846033096313, + "learning_rate": 3.554612613057381e-05, + "loss": 1.7699, + "step": 19730 + }, + { + "epoch": 6.056169429097606, + "grad_norm": 0.233306422829628, + "learning_rate": 3.554136785669393e-05, + "loss": 1.7201, + "step": 19731 + }, + { + "epoch": 6.056476365868631, + "grad_norm": 0.23820927739143372, + "learning_rate": 3.553660972570272e-05, + "loss": 1.7694, + "step": 19732 + }, + { + "epoch": 6.056783302639656, + "grad_norm": 0.20664167404174805, + "learning_rate": 3.553185173764719e-05, + "loss": 1.7151, + "step": 19733 + }, + { + 
"epoch": 6.057090239410681, + "grad_norm": 0.22572578489780426, + "learning_rate": 3.5527093892574394e-05, + "loss": 1.7715, + "step": 19734 + }, + { + "epoch": 6.0573971761817065, + "grad_norm": 0.18554186820983887, + "learning_rate": 3.552233619053133e-05, + "loss": 1.7481, + "step": 19735 + }, + { + "epoch": 6.057704112952732, + "grad_norm": 0.2434636950492859, + "learning_rate": 3.551757863156504e-05, + "loss": 1.7992, + "step": 19736 + }, + { + "epoch": 6.058011049723757, + "grad_norm": 0.1949392408132553, + "learning_rate": 3.5512821215722514e-05, + "loss": 1.7439, + "step": 19737 + }, + { + "epoch": 6.0583179864947825, + "grad_norm": 0.2696731686592102, + "learning_rate": 3.55080639430508e-05, + "loss": 1.7092, + "step": 19738 + }, + { + "epoch": 6.058624923265807, + "grad_norm": 0.1963263303041458, + "learning_rate": 3.550330681359686e-05, + "loss": 1.6726, + "step": 19739 + }, + { + "epoch": 6.058931860036832, + "grad_norm": 0.20115122199058533, + "learning_rate": 3.549854982740776e-05, + "loss": 1.7459, + "step": 19740 + }, + { + "epoch": 6.059238796807858, + "grad_norm": 0.21378284692764282, + "learning_rate": 3.549379298453048e-05, + "loss": 1.7028, + "step": 19741 + }, + { + "epoch": 6.059545733578883, + "grad_norm": 0.21954336762428284, + "learning_rate": 3.5489036285012055e-05, + "loss": 1.7209, + "step": 19742 + }, + { + "epoch": 6.059852670349908, + "grad_norm": 0.20117704570293427, + "learning_rate": 3.548427972889946e-05, + "loss": 1.7273, + "step": 19743 + }, + { + "epoch": 6.060159607120933, + "grad_norm": 0.23786263167858124, + "learning_rate": 3.5479523316239745e-05, + "loss": 1.7519, + "step": 19744 + }, + { + "epoch": 6.060466543891958, + "grad_norm": 0.17704391479492188, + "learning_rate": 3.5474767047079864e-05, + "loss": 1.6644, + "step": 19745 + }, + { + "epoch": 6.060773480662983, + "grad_norm": 0.1883699744939804, + "learning_rate": 3.547001092146687e-05, + "loss": 1.6586, + "step": 19746 + }, + { + "epoch": 6.061080417434009, + 
"grad_norm": 0.19101519882678986, + "learning_rate": 3.546525493944773e-05, + "loss": 1.7575, + "step": 19747 + }, + { + "epoch": 6.061387354205034, + "grad_norm": 0.1924263834953308, + "learning_rate": 3.546049910106947e-05, + "loss": 1.743, + "step": 19748 + }, + { + "epoch": 6.0616942909760585, + "grad_norm": 0.1853020042181015, + "learning_rate": 3.5455743406379084e-05, + "loss": 1.7466, + "step": 19749 + }, + { + "epoch": 6.062001227747084, + "grad_norm": 0.21322499215602875, + "learning_rate": 3.545098785542355e-05, + "loss": 1.7625, + "step": 19750 + }, + { + "epoch": 6.062308164518109, + "grad_norm": 0.1567271500825882, + "learning_rate": 3.544623244824989e-05, + "loss": 1.6531, + "step": 19751 + }, + { + "epoch": 6.0626151012891345, + "grad_norm": 0.2125476449728012, + "learning_rate": 3.544147718490508e-05, + "loss": 1.7547, + "step": 19752 + }, + { + "epoch": 6.06292203806016, + "grad_norm": 0.19470059871673584, + "learning_rate": 3.543672206543615e-05, + "loss": 1.7327, + "step": 19753 + }, + { + "epoch": 6.063228974831185, + "grad_norm": 0.1690339744091034, + "learning_rate": 3.543196708989004e-05, + "loss": 1.6621, + "step": 19754 + }, + { + "epoch": 6.06353591160221, + "grad_norm": 0.17322230339050293, + "learning_rate": 3.54272122583138e-05, + "loss": 1.7018, + "step": 19755 + }, + { + "epoch": 6.063842848373235, + "grad_norm": 0.22174575924873352, + "learning_rate": 3.5422457570754365e-05, + "loss": 1.724, + "step": 19756 + }, + { + "epoch": 6.06414978514426, + "grad_norm": 0.20233364403247833, + "learning_rate": 3.541770302725875e-05, + "loss": 1.6518, + "step": 19757 + }, + { + "epoch": 6.064456721915286, + "grad_norm": 0.1585279405117035, + "learning_rate": 3.541294862787395e-05, + "loss": 1.6985, + "step": 19758 + }, + { + "epoch": 6.064763658686311, + "grad_norm": 0.2180105745792389, + "learning_rate": 3.540819437264694e-05, + "loss": 1.6728, + "step": 19759 + }, + { + "epoch": 6.065070595457335, + "grad_norm": 0.2295975238084793, + 
"learning_rate": 3.5403440261624696e-05, + "loss": 1.7566, + "step": 19760 + }, + { + "epoch": 6.065377532228361, + "grad_norm": 0.17460396885871887, + "learning_rate": 3.5398686294854234e-05, + "loss": 1.6977, + "step": 19761 + }, + { + "epoch": 6.065684468999386, + "grad_norm": 0.20828662812709808, + "learning_rate": 3.539393247238249e-05, + "loss": 1.7789, + "step": 19762 + }, + { + "epoch": 6.065991405770411, + "grad_norm": 0.2273385375738144, + "learning_rate": 3.5389178794256476e-05, + "loss": 1.7316, + "step": 19763 + }, + { + "epoch": 6.066298342541437, + "grad_norm": 0.2332257330417633, + "learning_rate": 3.538442526052316e-05, + "loss": 1.7355, + "step": 19764 + }, + { + "epoch": 6.066605279312462, + "grad_norm": 0.17953866720199585, + "learning_rate": 3.537967187122952e-05, + "loss": 1.7107, + "step": 19765 + }, + { + "epoch": 6.0669122160834865, + "grad_norm": 0.2334052473306656, + "learning_rate": 3.537491862642254e-05, + "loss": 1.7572, + "step": 19766 + }, + { + "epoch": 6.067219152854512, + "grad_norm": 0.2427968829870224, + "learning_rate": 3.5370165526149165e-05, + "loss": 1.7254, + "step": 19767 + }, + { + "epoch": 6.067526089625537, + "grad_norm": 0.2701692283153534, + "learning_rate": 3.53654125704564e-05, + "loss": 1.7525, + "step": 19768 + }, + { + "epoch": 6.0678330263965625, + "grad_norm": 0.3775569796562195, + "learning_rate": 3.536065975939121e-05, + "loss": 1.7516, + "step": 19769 + }, + { + "epoch": 6.068139963167588, + "grad_norm": 0.18971984088420868, + "learning_rate": 3.535590709300056e-05, + "loss": 1.6777, + "step": 19770 + }, + { + "epoch": 6.068446899938612, + "grad_norm": 0.2710094749927521, + "learning_rate": 3.535115457133141e-05, + "loss": 1.7612, + "step": 19771 + }, + { + "epoch": 6.068753836709638, + "grad_norm": 0.19414621591567993, + "learning_rate": 3.534640219443075e-05, + "loss": 1.6795, + "step": 19772 + }, + { + "epoch": 6.069060773480663, + "grad_norm": 0.2384893298149109, + "learning_rate": 3.534164996234552e-05, 
+ "loss": 1.7869, + "step": 19773 + }, + { + "epoch": 6.069367710251688, + "grad_norm": 0.2206166833639145, + "learning_rate": 3.533689787512271e-05, + "loss": 1.7332, + "step": 19774 + }, + { + "epoch": 6.069674647022714, + "grad_norm": 0.19740800559520721, + "learning_rate": 3.533214593280926e-05, + "loss": 1.6744, + "step": 19775 + }, + { + "epoch": 6.069981583793738, + "grad_norm": 0.2098212093114853, + "learning_rate": 3.532739413545214e-05, + "loss": 1.731, + "step": 19776 + }, + { + "epoch": 6.070288520564763, + "grad_norm": 0.2508943974971771, + "learning_rate": 3.5322642483098304e-05, + "loss": 1.7682, + "step": 19777 + }, + { + "epoch": 6.070595457335789, + "grad_norm": 0.22202368080615997, + "learning_rate": 3.531789097579474e-05, + "loss": 1.6965, + "step": 19778 + }, + { + "epoch": 6.070902394106814, + "grad_norm": 0.19276803731918335, + "learning_rate": 3.5313139613588355e-05, + "loss": 1.6855, + "step": 19779 + }, + { + "epoch": 6.071209330877839, + "grad_norm": 0.23910140991210938, + "learning_rate": 3.530838839652616e-05, + "loss": 1.8099, + "step": 19780 + }, + { + "epoch": 6.071516267648865, + "grad_norm": 0.19440437853336334, + "learning_rate": 3.530363732465506e-05, + "loss": 1.67, + "step": 19781 + }, + { + "epoch": 6.071823204419889, + "grad_norm": 0.1954154074192047, + "learning_rate": 3.529888639802204e-05, + "loss": 1.7154, + "step": 19782 + }, + { + "epoch": 6.0721301411909145, + "grad_norm": 0.20836392045021057, + "learning_rate": 3.529413561667405e-05, + "loss": 1.7451, + "step": 19783 + }, + { + "epoch": 6.07243707796194, + "grad_norm": 0.20521731674671173, + "learning_rate": 3.5289384980658016e-05, + "loss": 1.7008, + "step": 19784 + }, + { + "epoch": 6.072744014732965, + "grad_norm": 0.22885540127754211, + "learning_rate": 3.528463449002092e-05, + "loss": 1.7605, + "step": 19785 + }, + { + "epoch": 6.0730509515039905, + "grad_norm": 0.27740219235420227, + "learning_rate": 3.5279884144809664e-05, + "loss": 1.7816, + "step": 19786 + }, 
+ { + "epoch": 6.073357888275015, + "grad_norm": 0.24747557938098907, + "learning_rate": 3.527513394507124e-05, + "loss": 1.7207, + "step": 19787 + }, + { + "epoch": 6.07366482504604, + "grad_norm": 0.20127782225608826, + "learning_rate": 3.527038389085256e-05, + "loss": 1.702, + "step": 19788 + }, + { + "epoch": 6.073971761817066, + "grad_norm": 0.20683316886425018, + "learning_rate": 3.5265633982200595e-05, + "loss": 1.7022, + "step": 19789 + }, + { + "epoch": 6.074278698588091, + "grad_norm": 0.17829765379428864, + "learning_rate": 3.5260884219162256e-05, + "loss": 1.7099, + "step": 19790 + }, + { + "epoch": 6.074585635359116, + "grad_norm": 0.256964772939682, + "learning_rate": 3.525613460178452e-05, + "loss": 1.7226, + "step": 19791 + }, + { + "epoch": 6.074892572130141, + "grad_norm": 0.22840122878551483, + "learning_rate": 3.525138513011428e-05, + "loss": 1.7738, + "step": 19792 + }, + { + "epoch": 6.075199508901166, + "grad_norm": 0.18988655507564545, + "learning_rate": 3.52466358041985e-05, + "loss": 1.6775, + "step": 19793 + }, + { + "epoch": 6.0755064456721914, + "grad_norm": 0.21857139468193054, + "learning_rate": 3.524188662408411e-05, + "loss": 1.7596, + "step": 19794 + }, + { + "epoch": 6.075813382443217, + "grad_norm": 0.22910535335540771, + "learning_rate": 3.523713758981807e-05, + "loss": 1.7969, + "step": 19795 + }, + { + "epoch": 6.076120319214242, + "grad_norm": 0.20885716378688812, + "learning_rate": 3.523238870144726e-05, + "loss": 1.7407, + "step": 19796 + }, + { + "epoch": 6.0764272559852675, + "grad_norm": 0.2056209295988083, + "learning_rate": 3.5227639959018666e-05, + "loss": 1.759, + "step": 19797 + }, + { + "epoch": 6.076734192756292, + "grad_norm": 0.17485356330871582, + "learning_rate": 3.522289136257917e-05, + "loss": 1.6988, + "step": 19798 + }, + { + "epoch": 6.077041129527317, + "grad_norm": 0.2103404402732849, + "learning_rate": 3.521814291217573e-05, + "loss": 1.766, + "step": 19799 + }, + { + "epoch": 6.077348066298343, + 
"grad_norm": 0.21852105855941772, + "learning_rate": 3.521339460785528e-05, + "loss": 1.7435, + "step": 19800 + }, + { + "epoch": 6.077655003069368, + "grad_norm": 0.21578362584114075, + "learning_rate": 3.520864644966471e-05, + "loss": 1.7281, + "step": 19801 + }, + { + "epoch": 6.077961939840393, + "grad_norm": 0.20405036211013794, + "learning_rate": 3.520389843765099e-05, + "loss": 1.7367, + "step": 19802 + }, + { + "epoch": 6.078268876611418, + "grad_norm": 0.2578286826610565, + "learning_rate": 3.5199150571860996e-05, + "loss": 1.7625, + "step": 19803 + }, + { + "epoch": 6.078575813382443, + "grad_norm": 0.240324467420578, + "learning_rate": 3.519440285234168e-05, + "loss": 1.6979, + "step": 19804 + }, + { + "epoch": 6.078882750153468, + "grad_norm": 0.220765620470047, + "learning_rate": 3.5189655279139935e-05, + "loss": 1.7679, + "step": 19805 + }, + { + "epoch": 6.079189686924494, + "grad_norm": 0.2731996774673462, + "learning_rate": 3.518490785230273e-05, + "loss": 1.6723, + "step": 19806 + }, + { + "epoch": 6.079496623695519, + "grad_norm": 0.2593478262424469, + "learning_rate": 3.518016057187692e-05, + "loss": 1.7232, + "step": 19807 + }, + { + "epoch": 6.0798035604665435, + "grad_norm": 0.34642404317855835, + "learning_rate": 3.517541343790947e-05, + "loss": 1.8265, + "step": 19808 + }, + { + "epoch": 6.080110497237569, + "grad_norm": 0.3187299370765686, + "learning_rate": 3.5170666450447255e-05, + "loss": 1.6847, + "step": 19809 + }, + { + "epoch": 6.080417434008594, + "grad_norm": 0.20413202047348022, + "learning_rate": 3.5165919609537215e-05, + "loss": 1.6533, + "step": 19810 + }, + { + "epoch": 6.0807243707796195, + "grad_norm": 0.2753545343875885, + "learning_rate": 3.516117291522625e-05, + "loss": 1.7491, + "step": 19811 + }, + { + "epoch": 6.081031307550645, + "grad_norm": 0.20174793899059296, + "learning_rate": 3.515642636756128e-05, + "loss": 1.6902, + "step": 19812 + }, + { + "epoch": 6.08133824432167, + "grad_norm": 0.22567492723464966, + 
"learning_rate": 3.515167996658919e-05, + "loss": 1.7165, + "step": 19813 + }, + { + "epoch": 6.081645181092695, + "grad_norm": 0.2115732729434967, + "learning_rate": 3.514693371235692e-05, + "loss": 1.6888, + "step": 19814 + }, + { + "epoch": 6.08195211786372, + "grad_norm": 0.2141808122396469, + "learning_rate": 3.514218760491134e-05, + "loss": 1.7152, + "step": 19815 + }, + { + "epoch": 6.082259054634745, + "grad_norm": 0.19767558574676514, + "learning_rate": 3.513744164429938e-05, + "loss": 1.6926, + "step": 19816 + }, + { + "epoch": 6.082565991405771, + "grad_norm": 0.20220023393630981, + "learning_rate": 3.5132695830567944e-05, + "loss": 1.6727, + "step": 19817 + }, + { + "epoch": 6.082872928176796, + "grad_norm": 0.19589759409427643, + "learning_rate": 3.5127950163763896e-05, + "loss": 1.7545, + "step": 19818 + }, + { + "epoch": 6.08317986494782, + "grad_norm": 0.21303611993789673, + "learning_rate": 3.512320464393418e-05, + "loss": 1.753, + "step": 19819 + }, + { + "epoch": 6.083486801718846, + "grad_norm": 0.19438377022743225, + "learning_rate": 3.511845927112566e-05, + "loss": 1.7022, + "step": 19820 + }, + { + "epoch": 6.083793738489871, + "grad_norm": 0.21282976865768433, + "learning_rate": 3.511371404538526e-05, + "loss": 1.7099, + "step": 19821 + }, + { + "epoch": 6.084100675260896, + "grad_norm": 0.1874496042728424, + "learning_rate": 3.5108968966759846e-05, + "loss": 1.7033, + "step": 19822 + }, + { + "epoch": 6.084407612031922, + "grad_norm": 0.21199075877666473, + "learning_rate": 3.510422403529636e-05, + "loss": 1.7088, + "step": 19823 + }, + { + "epoch": 6.084714548802946, + "grad_norm": 0.21847110986709595, + "learning_rate": 3.5099479251041634e-05, + "loss": 1.7395, + "step": 19824 + }, + { + "epoch": 6.0850214855739715, + "grad_norm": 0.201395645737648, + "learning_rate": 3.509473461404261e-05, + "loss": 1.7522, + "step": 19825 + }, + { + "epoch": 6.085328422344997, + "grad_norm": 0.19637656211853027, + "learning_rate": 
3.5089990124346135e-05, + "loss": 1.6774, + "step": 19826 + }, + { + "epoch": 6.085635359116022, + "grad_norm": 0.25918442010879517, + "learning_rate": 3.5085245781999124e-05, + "loss": 1.7704, + "step": 19827 + }, + { + "epoch": 6.0859422958870475, + "grad_norm": 0.21271947026252747, + "learning_rate": 3.508050158704844e-05, + "loss": 1.6902, + "step": 19828 + }, + { + "epoch": 6.086249232658073, + "grad_norm": 0.2065698802471161, + "learning_rate": 3.5075757539541024e-05, + "loss": 1.7945, + "step": 19829 + }, + { + "epoch": 6.086556169429097, + "grad_norm": 0.20247824490070343, + "learning_rate": 3.5071013639523684e-05, + "loss": 1.7532, + "step": 19830 + }, + { + "epoch": 6.086863106200123, + "grad_norm": 0.19705431163311005, + "learning_rate": 3.506626988704336e-05, + "loss": 1.6353, + "step": 19831 + }, + { + "epoch": 6.087170042971148, + "grad_norm": 0.20158523321151733, + "learning_rate": 3.5061526282146886e-05, + "loss": 1.6596, + "step": 19832 + }, + { + "epoch": 6.087476979742173, + "grad_norm": 0.19492848217487335, + "learning_rate": 3.505678282488118e-05, + "loss": 1.7107, + "step": 19833 + }, + { + "epoch": 6.087783916513199, + "grad_norm": 0.2403736114501953, + "learning_rate": 3.505203951529312e-05, + "loss": 1.7456, + "step": 19834 + }, + { + "epoch": 6.088090853284223, + "grad_norm": 0.25649771094322205, + "learning_rate": 3.504729635342954e-05, + "loss": 1.7513, + "step": 19835 + }, + { + "epoch": 6.088397790055248, + "grad_norm": 0.20172113180160522, + "learning_rate": 3.504255333933736e-05, + "loss": 1.7737, + "step": 19836 + }, + { + "epoch": 6.088704726826274, + "grad_norm": 0.2715936303138733, + "learning_rate": 3.5037810473063414e-05, + "loss": 1.759, + "step": 19837 + }, + { + "epoch": 6.089011663597299, + "grad_norm": 0.23145076632499695, + "learning_rate": 3.503306775465461e-05, + "loss": 1.7811, + "step": 19838 + }, + { + "epoch": 6.089318600368324, + "grad_norm": 0.1953691691160202, + "learning_rate": 3.502832518415778e-05, + "loss": 
1.752, + "step": 19839 + }, + { + "epoch": 6.08962553713935, + "grad_norm": 0.1927584707736969, + "learning_rate": 3.502358276161986e-05, + "loss": 1.6865, + "step": 19840 + }, + { + "epoch": 6.089932473910374, + "grad_norm": 0.19294732809066772, + "learning_rate": 3.501884048708763e-05, + "loss": 1.6838, + "step": 19841 + }, + { + "epoch": 6.0902394106813995, + "grad_norm": 0.23351021111011505, + "learning_rate": 3.501409836060803e-05, + "loss": 1.8029, + "step": 19842 + }, + { + "epoch": 6.090546347452425, + "grad_norm": 0.21615718305110931, + "learning_rate": 3.5009356382227877e-05, + "loss": 1.7441, + "step": 19843 + }, + { + "epoch": 6.09085328422345, + "grad_norm": 0.19091549515724182, + "learning_rate": 3.500461455199405e-05, + "loss": 1.7056, + "step": 19844 + }, + { + "epoch": 6.0911602209944755, + "grad_norm": 0.21189090609550476, + "learning_rate": 3.499987286995341e-05, + "loss": 1.6853, + "step": 19845 + }, + { + "epoch": 6.0914671577655, + "grad_norm": 0.22545887529850006, + "learning_rate": 3.499513133615283e-05, + "loss": 1.7854, + "step": 19846 + }, + { + "epoch": 6.091774094536525, + "grad_norm": 0.21960650384426117, + "learning_rate": 3.4990389950639144e-05, + "loss": 1.7558, + "step": 19847 + }, + { + "epoch": 6.092081031307551, + "grad_norm": 0.20825782418251038, + "learning_rate": 3.4985648713459244e-05, + "loss": 1.7103, + "step": 19848 + }, + { + "epoch": 6.092387968078576, + "grad_norm": 0.20886415243148804, + "learning_rate": 3.498090762465993e-05, + "loss": 1.6897, + "step": 19849 + }, + { + "epoch": 6.092694904849601, + "grad_norm": 0.19306892156600952, + "learning_rate": 3.4976166684288115e-05, + "loss": 1.7506, + "step": 19850 + }, + { + "epoch": 6.093001841620626, + "grad_norm": 0.2178204357624054, + "learning_rate": 3.497142589239063e-05, + "loss": 1.6774, + "step": 19851 + }, + { + "epoch": 6.093308778391651, + "grad_norm": 0.1914307177066803, + "learning_rate": 3.4966685249014294e-05, + "loss": 1.7182, + "step": 19852 + }, + { + 
"epoch": 6.093615715162676, + "grad_norm": 0.22006092965602875, + "learning_rate": 3.496194475420602e-05, + "loss": 1.7209, + "step": 19853 + }, + { + "epoch": 6.093922651933702, + "grad_norm": 0.20621439814567566, + "learning_rate": 3.49572044080126e-05, + "loss": 1.7403, + "step": 19854 + }, + { + "epoch": 6.094229588704727, + "grad_norm": 0.24079272150993347, + "learning_rate": 3.495246421048091e-05, + "loss": 1.7619, + "step": 19855 + }, + { + "epoch": 6.094536525475752, + "grad_norm": 0.19073884189128876, + "learning_rate": 3.494772416165777e-05, + "loss": 1.6677, + "step": 19856 + }, + { + "epoch": 6.094843462246777, + "grad_norm": 0.18217229843139648, + "learning_rate": 3.494298426159007e-05, + "loss": 1.7162, + "step": 19857 + }, + { + "epoch": 6.095150399017802, + "grad_norm": 0.21901506185531616, + "learning_rate": 3.493824451032461e-05, + "loss": 1.7173, + "step": 19858 + }, + { + "epoch": 6.0954573357888275, + "grad_norm": 0.22156217694282532, + "learning_rate": 3.493350490790826e-05, + "loss": 1.8029, + "step": 19859 + }, + { + "epoch": 6.095764272559853, + "grad_norm": 0.1663675606250763, + "learning_rate": 3.4928765454387824e-05, + "loss": 1.7306, + "step": 19860 + }, + { + "epoch": 6.096071209330878, + "grad_norm": 0.19684657454490662, + "learning_rate": 3.4924026149810175e-05, + "loss": 1.6944, + "step": 19861 + }, + { + "epoch": 6.096378146101903, + "grad_norm": 0.19163468480110168, + "learning_rate": 3.4919286994222125e-05, + "loss": 1.7331, + "step": 19862 + }, + { + "epoch": 6.096685082872928, + "grad_norm": 0.20134083926677704, + "learning_rate": 3.491454798767054e-05, + "loss": 1.7365, + "step": 19863 + }, + { + "epoch": 6.096992019643953, + "grad_norm": 0.23877696692943573, + "learning_rate": 3.490980913020221e-05, + "loss": 1.753, + "step": 19864 + }, + { + "epoch": 6.097298956414979, + "grad_norm": 0.207699254155159, + "learning_rate": 3.490507042186402e-05, + "loss": 1.6835, + "step": 19865 + }, + { + "epoch": 6.097605893186004, + 
"grad_norm": 0.20608612895011902, + "learning_rate": 3.490033186270274e-05, + "loss": 1.7379, + "step": 19866 + }, + { + "epoch": 6.097912829957028, + "grad_norm": 0.25086313486099243, + "learning_rate": 3.489559345276524e-05, + "loss": 1.7692, + "step": 19867 + }, + { + "epoch": 6.098219766728054, + "grad_norm": 0.22025549411773682, + "learning_rate": 3.489085519209836e-05, + "loss": 1.6579, + "step": 19868 + }, + { + "epoch": 6.098526703499079, + "grad_norm": 0.23805730044841766, + "learning_rate": 3.4886117080748875e-05, + "loss": 1.7695, + "step": 19869 + }, + { + "epoch": 6.098833640270104, + "grad_norm": 0.23271869122982025, + "learning_rate": 3.4881379118763666e-05, + "loss": 1.7268, + "step": 19870 + }, + { + "epoch": 6.09914057704113, + "grad_norm": 0.21795618534088135, + "learning_rate": 3.4876641306189505e-05, + "loss": 1.6996, + "step": 19871 + }, + { + "epoch": 6.099447513812155, + "grad_norm": 0.22064761817455292, + "learning_rate": 3.487190364307326e-05, + "loss": 1.7032, + "step": 19872 + }, + { + "epoch": 6.0997544505831796, + "grad_norm": 0.23834183812141418, + "learning_rate": 3.4867166129461706e-05, + "loss": 1.6942, + "step": 19873 + }, + { + "epoch": 6.100061387354205, + "grad_norm": 0.21143686771392822, + "learning_rate": 3.486242876540171e-05, + "loss": 1.6904, + "step": 19874 + }, + { + "epoch": 6.10036832412523, + "grad_norm": 0.18099969625473022, + "learning_rate": 3.485769155094004e-05, + "loss": 1.6669, + "step": 19875 + }, + { + "epoch": 6.100675260896256, + "grad_norm": 0.25324884057044983, + "learning_rate": 3.4852954486123566e-05, + "loss": 1.7878, + "step": 19876 + }, + { + "epoch": 6.100982197667281, + "grad_norm": 0.2252139449119568, + "learning_rate": 3.4848217570999055e-05, + "loss": 1.7674, + "step": 19877 + }, + { + "epoch": 6.101289134438305, + "grad_norm": 0.19629882276058197, + "learning_rate": 3.4843480805613346e-05, + "loss": 1.6898, + "step": 19878 + }, + { + "epoch": 6.101596071209331, + "grad_norm": 
0.1858786642551422, + "learning_rate": 3.483874419001323e-05, + "loss": 1.6856, + "step": 19879 + }, + { + "epoch": 6.101903007980356, + "grad_norm": 0.1842946857213974, + "learning_rate": 3.483400772424555e-05, + "loss": 1.7229, + "step": 19880 + }, + { + "epoch": 6.102209944751381, + "grad_norm": 0.18981511890888214, + "learning_rate": 3.482927140835708e-05, + "loss": 1.75, + "step": 19881 + }, + { + "epoch": 6.102516881522407, + "grad_norm": 0.19914525747299194, + "learning_rate": 3.482453524239466e-05, + "loss": 1.7702, + "step": 19882 + }, + { + "epoch": 6.102823818293431, + "grad_norm": 0.1960345208644867, + "learning_rate": 3.481979922640507e-05, + "loss": 1.7189, + "step": 19883 + }, + { + "epoch": 6.1031307550644565, + "grad_norm": 0.20309221744537354, + "learning_rate": 3.48150633604351e-05, + "loss": 1.7888, + "step": 19884 + }, + { + "epoch": 6.103437691835482, + "grad_norm": 0.20090891420841217, + "learning_rate": 3.48103276445316e-05, + "loss": 1.8017, + "step": 19885 + }, + { + "epoch": 6.103744628606507, + "grad_norm": 0.22500385344028473, + "learning_rate": 3.480559207874133e-05, + "loss": 1.7061, + "step": 19886 + }, + { + "epoch": 6.1040515653775325, + "grad_norm": 0.22594885528087616, + "learning_rate": 3.480085666311113e-05, + "loss": 1.7659, + "step": 19887 + }, + { + "epoch": 6.104358502148558, + "grad_norm": 0.2769651710987091, + "learning_rate": 3.479612139768774e-05, + "loss": 1.7668, + "step": 19888 + }, + { + "epoch": 6.104665438919582, + "grad_norm": 0.24251700937747955, + "learning_rate": 3.4791386282518e-05, + "loss": 1.8068, + "step": 19889 + }, + { + "epoch": 6.104972375690608, + "grad_norm": 0.23325790464878082, + "learning_rate": 3.478665131764869e-05, + "loss": 1.7116, + "step": 19890 + }, + { + "epoch": 6.105279312461633, + "grad_norm": 0.19998812675476074, + "learning_rate": 3.478191650312663e-05, + "loss": 1.7116, + "step": 19891 + }, + { + "epoch": 6.105586249232658, + "grad_norm": 0.20933640003204346, + "learning_rate": 
3.4777181838998566e-05, + "loss": 1.7138, + "step": 19892 + }, + { + "epoch": 6.105893186003684, + "grad_norm": 0.24344035983085632, + "learning_rate": 3.477244732531134e-05, + "loss": 1.784, + "step": 19893 + }, + { + "epoch": 6.106200122774708, + "grad_norm": 0.2220575362443924, + "learning_rate": 3.4767712962111686e-05, + "loss": 1.7479, + "step": 19894 + }, + { + "epoch": 6.106507059545733, + "grad_norm": 0.2222832590341568, + "learning_rate": 3.476297874944644e-05, + "loss": 1.7278, + "step": 19895 + }, + { + "epoch": 6.106813996316759, + "grad_norm": 0.222265362739563, + "learning_rate": 3.4758244687362353e-05, + "loss": 1.7321, + "step": 19896 + }, + { + "epoch": 6.107120933087784, + "grad_norm": 0.2921304702758789, + "learning_rate": 3.475351077590625e-05, + "loss": 1.7848, + "step": 19897 + }, + { + "epoch": 6.107427869858809, + "grad_norm": 0.21015208959579468, + "learning_rate": 3.4748777015124856e-05, + "loss": 1.7987, + "step": 19898 + }, + { + "epoch": 6.107734806629834, + "grad_norm": 0.19510969519615173, + "learning_rate": 3.474404340506502e-05, + "loss": 1.7317, + "step": 19899 + }, + { + "epoch": 6.108041743400859, + "grad_norm": 0.21978609263896942, + "learning_rate": 3.473930994577348e-05, + "loss": 1.6943, + "step": 19900 + }, + { + "epoch": 6.1083486801718845, + "grad_norm": 0.1793510913848877, + "learning_rate": 3.4734576637297004e-05, + "loss": 1.6659, + "step": 19901 + }, + { + "epoch": 6.10865561694291, + "grad_norm": 0.2029319554567337, + "learning_rate": 3.4729843479682414e-05, + "loss": 1.7127, + "step": 19902 + }, + { + "epoch": 6.108962553713935, + "grad_norm": 0.2001914530992508, + "learning_rate": 3.472511047297644e-05, + "loss": 1.691, + "step": 19903 + }, + { + "epoch": 6.1092694904849605, + "grad_norm": 0.2194693237543106, + "learning_rate": 3.47203776172259e-05, + "loss": 1.7181, + "step": 19904 + }, + { + "epoch": 6.109576427255985, + "grad_norm": 0.1865277737379074, + "learning_rate": 3.4715644912477515e-05, + "loss": 1.6786, 
+ "step": 19905 + }, + { + "epoch": 6.10988336402701, + "grad_norm": 0.20574906468391418, + "learning_rate": 3.471091235877811e-05, + "loss": 1.7681, + "step": 19906 + }, + { + "epoch": 6.110190300798036, + "grad_norm": 0.21072493493556976, + "learning_rate": 3.470617995617441e-05, + "loss": 1.7494, + "step": 19907 + }, + { + "epoch": 6.110497237569061, + "grad_norm": 0.2411658763885498, + "learning_rate": 3.470144770471323e-05, + "loss": 1.7183, + "step": 19908 + }, + { + "epoch": 6.110804174340086, + "grad_norm": 0.19782759249210358, + "learning_rate": 3.4696715604441285e-05, + "loss": 1.6823, + "step": 19909 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.315026193857193, + "learning_rate": 3.469198365540539e-05, + "loss": 1.691, + "step": 19910 + }, + { + "epoch": 6.111418047882136, + "grad_norm": 0.19840773940086365, + "learning_rate": 3.468725185765226e-05, + "loss": 1.7413, + "step": 19911 + }, + { + "epoch": 6.111724984653161, + "grad_norm": 0.1813160926103592, + "learning_rate": 3.46825202112287e-05, + "loss": 1.7095, + "step": 19912 + }, + { + "epoch": 6.112031921424187, + "grad_norm": 0.21025459468364716, + "learning_rate": 3.467778871618145e-05, + "loss": 1.7783, + "step": 19913 + }, + { + "epoch": 6.112338858195212, + "grad_norm": 0.20088298618793488, + "learning_rate": 3.4673057372557265e-05, + "loss": 1.7671, + "step": 19914 + }, + { + "epoch": 6.112645794966237, + "grad_norm": 0.21919472515583038, + "learning_rate": 3.466832618040291e-05, + "loss": 1.7052, + "step": 19915 + }, + { + "epoch": 6.112952731737262, + "grad_norm": 0.19135436415672302, + "learning_rate": 3.466359513976516e-05, + "loss": 1.7862, + "step": 19916 + }, + { + "epoch": 6.113259668508287, + "grad_norm": 0.19943594932556152, + "learning_rate": 3.465886425069074e-05, + "loss": 1.6926, + "step": 19917 + }, + { + "epoch": 6.1135666052793125, + "grad_norm": 0.19390980899333954, + "learning_rate": 3.46541335132264e-05, + "loss": 1.761, + "step": 19918 + }, + { + "epoch": 
6.113873542050338, + "grad_norm": 0.22745995223522186, + "learning_rate": 3.4649402927418935e-05, + "loss": 1.7147, + "step": 19919 + }, + { + "epoch": 6.114180478821363, + "grad_norm": 0.17792920768260956, + "learning_rate": 3.4644672493315045e-05, + "loss": 1.6946, + "step": 19920 + }, + { + "epoch": 6.114487415592388, + "grad_norm": 0.2009986788034439, + "learning_rate": 3.463994221096152e-05, + "loss": 1.6977, + "step": 19921 + }, + { + "epoch": 6.114794352363413, + "grad_norm": 0.2448386251926422, + "learning_rate": 3.4635212080405066e-05, + "loss": 1.7169, + "step": 19922 + }, + { + "epoch": 6.115101289134438, + "grad_norm": 0.21506112813949585, + "learning_rate": 3.463048210169247e-05, + "loss": 1.6632, + "step": 19923 + }, + { + "epoch": 6.115408225905464, + "grad_norm": 0.1805233359336853, + "learning_rate": 3.462575227487045e-05, + "loss": 1.6742, + "step": 19924 + }, + { + "epoch": 6.115715162676489, + "grad_norm": 0.20023848116397858, + "learning_rate": 3.4621022599985766e-05, + "loss": 1.7106, + "step": 19925 + }, + { + "epoch": 6.116022099447513, + "grad_norm": 0.20388077199459076, + "learning_rate": 3.461629307708513e-05, + "loss": 1.7065, + "step": 19926 + }, + { + "epoch": 6.116329036218539, + "grad_norm": 0.23886005580425262, + "learning_rate": 3.461156370621533e-05, + "loss": 1.7177, + "step": 19927 + }, + { + "epoch": 6.116635972989564, + "grad_norm": 0.2054048627614975, + "learning_rate": 3.460683448742306e-05, + "loss": 1.6773, + "step": 19928 + }, + { + "epoch": 6.116942909760589, + "grad_norm": 0.1909634917974472, + "learning_rate": 3.460210542075508e-05, + "loss": 1.7562, + "step": 19929 + }, + { + "epoch": 6.117249846531615, + "grad_norm": 0.20221595466136932, + "learning_rate": 3.459737650625812e-05, + "loss": 1.7948, + "step": 19930 + }, + { + "epoch": 6.11755678330264, + "grad_norm": 0.25445356965065, + "learning_rate": 3.459264774397891e-05, + "loss": 1.7964, + "step": 19931 + }, + { + "epoch": 6.1178637200736645, + "grad_norm": 
0.2227735072374344, + "learning_rate": 3.4587919133964176e-05, + "loss": 1.7833, + "step": 19932 + }, + { + "epoch": 6.11817065684469, + "grad_norm": 0.20591853559017181, + "learning_rate": 3.458319067626068e-05, + "loss": 1.7535, + "step": 19933 + }, + { + "epoch": 6.118477593615715, + "grad_norm": 0.22087402641773224, + "learning_rate": 3.4578462370915115e-05, + "loss": 1.7228, + "step": 19934 + }, + { + "epoch": 6.1187845303867405, + "grad_norm": 0.234156996011734, + "learning_rate": 3.457373421797423e-05, + "loss": 1.7167, + "step": 19935 + }, + { + "epoch": 6.119091467157766, + "grad_norm": 0.209685817360878, + "learning_rate": 3.4569006217484746e-05, + "loss": 1.6633, + "step": 19936 + }, + { + "epoch": 6.11939840392879, + "grad_norm": 0.18499237298965454, + "learning_rate": 3.4564278369493366e-05, + "loss": 1.6769, + "step": 19937 + }, + { + "epoch": 6.119705340699816, + "grad_norm": 0.2600767910480499, + "learning_rate": 3.455955067404686e-05, + "loss": 1.7788, + "step": 19938 + }, + { + "epoch": 6.120012277470841, + "grad_norm": 0.21499377489089966, + "learning_rate": 3.455482313119191e-05, + "loss": 1.789, + "step": 19939 + }, + { + "epoch": 6.120319214241866, + "grad_norm": 0.19618432223796844, + "learning_rate": 3.455009574097527e-05, + "loss": 1.7162, + "step": 19940 + }, + { + "epoch": 6.120626151012892, + "grad_norm": 0.23219916224479675, + "learning_rate": 3.4545368503443616e-05, + "loss": 1.7871, + "step": 19941 + }, + { + "epoch": 6.120933087783916, + "grad_norm": 0.22315794229507446, + "learning_rate": 3.45406414186437e-05, + "loss": 1.6944, + "step": 19942 + }, + { + "epoch": 6.121240024554941, + "grad_norm": 0.22536693513393402, + "learning_rate": 3.453591448662221e-05, + "loss": 1.7727, + "step": 19943 + }, + { + "epoch": 6.121546961325967, + "grad_norm": 0.21811100840568542, + "learning_rate": 3.45311877074259e-05, + "loss": 1.7037, + "step": 19944 + }, + { + "epoch": 6.121853898096992, + "grad_norm": 0.1957094967365265, + "learning_rate": 
3.452646108110145e-05, + "loss": 1.7734, + "step": 19945 + }, + { + "epoch": 6.122160834868017, + "grad_norm": 0.185706228017807, + "learning_rate": 3.452173460769559e-05, + "loss": 1.6715, + "step": 19946 + }, + { + "epoch": 6.122467771639043, + "grad_norm": 0.21081562340259552, + "learning_rate": 3.4517008287255005e-05, + "loss": 1.7798, + "step": 19947 + }, + { + "epoch": 6.122774708410067, + "grad_norm": 0.24175535142421722, + "learning_rate": 3.451228211982642e-05, + "loss": 1.7111, + "step": 19948 + }, + { + "epoch": 6.1230816451810925, + "grad_norm": 0.244124636054039, + "learning_rate": 3.450755610545654e-05, + "loss": 1.7263, + "step": 19949 + }, + { + "epoch": 6.123388581952118, + "grad_norm": 0.21109984815120697, + "learning_rate": 3.45028302441921e-05, + "loss": 1.7556, + "step": 19950 + }, + { + "epoch": 6.123695518723143, + "grad_norm": 0.21721722185611725, + "learning_rate": 3.449810453607976e-05, + "loss": 1.7416, + "step": 19951 + }, + { + "epoch": 6.1240024554941686, + "grad_norm": 0.18695317208766937, + "learning_rate": 3.4493378981166216e-05, + "loss": 1.7128, + "step": 19952 + }, + { + "epoch": 6.124309392265193, + "grad_norm": 0.19175554811954498, + "learning_rate": 3.4488653579498206e-05, + "loss": 1.7014, + "step": 19953 + }, + { + "epoch": 6.124616329036218, + "grad_norm": 0.22297006845474243, + "learning_rate": 3.4483928331122405e-05, + "loss": 1.7231, + "step": 19954 + }, + { + "epoch": 6.124923265807244, + "grad_norm": 0.2407974898815155, + "learning_rate": 3.447920323608553e-05, + "loss": 1.7354, + "step": 19955 + }, + { + "epoch": 6.125230202578269, + "grad_norm": 0.19767232239246368, + "learning_rate": 3.447447829443425e-05, + "loss": 1.7487, + "step": 19956 + }, + { + "epoch": 6.125537139349294, + "grad_norm": 0.20033477246761322, + "learning_rate": 3.446975350621529e-05, + "loss": 1.7232, + "step": 19957 + }, + { + "epoch": 6.12584407612032, + "grad_norm": 0.20310243964195251, + "learning_rate": 3.446502887147532e-05, + "loss": 
1.6946, + "step": 19958 + }, + { + "epoch": 6.126151012891344, + "grad_norm": 0.2322724461555481, + "learning_rate": 3.446030439026104e-05, + "loss": 1.7071, + "step": 19959 + }, + { + "epoch": 6.1264579496623695, + "grad_norm": 0.24134255945682526, + "learning_rate": 3.445558006261914e-05, + "loss": 1.7259, + "step": 19960 + }, + { + "epoch": 6.126764886433395, + "grad_norm": 0.22821731865406036, + "learning_rate": 3.445085588859632e-05, + "loss": 1.7488, + "step": 19961 + }, + { + "epoch": 6.12707182320442, + "grad_norm": 0.258241206407547, + "learning_rate": 3.444613186823924e-05, + "loss": 1.7403, + "step": 19962 + }, + { + "epoch": 6.1273787599754455, + "grad_norm": 0.18758481740951538, + "learning_rate": 3.4441408001594625e-05, + "loss": 1.7079, + "step": 19963 + }, + { + "epoch": 6.12768569674647, + "grad_norm": 0.24032682180404663, + "learning_rate": 3.443668428870911e-05, + "loss": 1.7377, + "step": 19964 + }, + { + "epoch": 6.127992633517495, + "grad_norm": 0.24468545615673065, + "learning_rate": 3.4431960729629406e-05, + "loss": 1.7724, + "step": 19965 + }, + { + "epoch": 6.128299570288521, + "grad_norm": 0.23840154707431793, + "learning_rate": 3.4427237324402197e-05, + "loss": 1.7813, + "step": 19966 + }, + { + "epoch": 6.128606507059546, + "grad_norm": 0.2476109117269516, + "learning_rate": 3.4422514073074165e-05, + "loss": 1.7578, + "step": 19967 + }, + { + "epoch": 6.128913443830571, + "grad_norm": 0.2109041064977646, + "learning_rate": 3.4417790975691974e-05, + "loss": 1.6917, + "step": 19968 + }, + { + "epoch": 6.129220380601596, + "grad_norm": 0.21841584146022797, + "learning_rate": 3.4413068032302296e-05, + "loss": 1.7511, + "step": 19969 + }, + { + "epoch": 6.129527317372621, + "grad_norm": 0.2111930102109909, + "learning_rate": 3.440834524295182e-05, + "loss": 1.7194, + "step": 19970 + }, + { + "epoch": 6.129834254143646, + "grad_norm": 0.21868006885051727, + "learning_rate": 3.440362260768721e-05, + "loss": 1.7933, + "step": 19971 + }, + { + 
"epoch": 6.130141190914672, + "grad_norm": 0.19846780598163605, + "learning_rate": 3.439890012655516e-05, + "loss": 1.6985, + "step": 19972 + }, + { + "epoch": 6.130448127685697, + "grad_norm": 0.218460813164711, + "learning_rate": 3.439417779960231e-05, + "loss": 1.7205, + "step": 19973 + }, + { + "epoch": 6.1307550644567215, + "grad_norm": 0.22504402697086334, + "learning_rate": 3.438945562687535e-05, + "loss": 1.7437, + "step": 19974 + }, + { + "epoch": 6.131062001227747, + "grad_norm": 0.35414671897888184, + "learning_rate": 3.438473360842093e-05, + "loss": 1.7641, + "step": 19975 + }, + { + "epoch": 6.131368937998772, + "grad_norm": 0.21090710163116455, + "learning_rate": 3.4380011744285726e-05, + "loss": 1.6817, + "step": 19976 + }, + { + "epoch": 6.1316758747697975, + "grad_norm": 0.19118748605251312, + "learning_rate": 3.437529003451639e-05, + "loss": 1.694, + "step": 19977 + }, + { + "epoch": 6.131982811540823, + "grad_norm": 0.2341139018535614, + "learning_rate": 3.437056847915962e-05, + "loss": 1.781, + "step": 19978 + }, + { + "epoch": 6.132289748311848, + "grad_norm": 0.19120962917804718, + "learning_rate": 3.4365847078262033e-05, + "loss": 1.6974, + "step": 19979 + }, + { + "epoch": 6.132596685082873, + "grad_norm": 0.1998066008090973, + "learning_rate": 3.436112583187033e-05, + "loss": 1.6933, + "step": 19980 + }, + { + "epoch": 6.132903621853898, + "grad_norm": 0.19839663803577423, + "learning_rate": 3.4356404740031123e-05, + "loss": 1.6867, + "step": 19981 + }, + { + "epoch": 6.133210558624923, + "grad_norm": 0.19892877340316772, + "learning_rate": 3.4351683802791114e-05, + "loss": 1.7349, + "step": 19982 + }, + { + "epoch": 6.133517495395949, + "grad_norm": 0.23215502500534058, + "learning_rate": 3.434696302019692e-05, + "loss": 1.7411, + "step": 19983 + }, + { + "epoch": 6.133824432166974, + "grad_norm": 0.21246971189975739, + "learning_rate": 3.4342242392295225e-05, + "loss": 1.6918, + "step": 19984 + }, + { + "epoch": 6.134131368937998, + 
"grad_norm": 0.18585935235023499, + "learning_rate": 3.4337521919132675e-05, + "loss": 1.71, + "step": 19985 + }, + { + "epoch": 6.134438305709024, + "grad_norm": 0.24194715917110443, + "learning_rate": 3.4332801600755896e-05, + "loss": 1.7314, + "step": 19986 + }, + { + "epoch": 6.134745242480049, + "grad_norm": 0.19925665855407715, + "learning_rate": 3.432808143721156e-05, + "loss": 1.7425, + "step": 19987 + }, + { + "epoch": 6.135052179251074, + "grad_norm": 0.22253449261188507, + "learning_rate": 3.43233614285463e-05, + "loss": 1.702, + "step": 19988 + }, + { + "epoch": 6.1353591160221, + "grad_norm": 0.22180478274822235, + "learning_rate": 3.4318641574806796e-05, + "loss": 1.6659, + "step": 19989 + }, + { + "epoch": 6.135666052793125, + "grad_norm": 0.19818264245986938, + "learning_rate": 3.431392187603964e-05, + "loss": 1.8057, + "step": 19990 + }, + { + "epoch": 6.1359729895641495, + "grad_norm": 0.34630170464515686, + "learning_rate": 3.4309202332291526e-05, + "loss": 1.7233, + "step": 19991 + }, + { + "epoch": 6.136279926335175, + "grad_norm": 0.2633006274700165, + "learning_rate": 3.430448294360905e-05, + "loss": 1.7421, + "step": 19992 + }, + { + "epoch": 6.1365868631062, + "grad_norm": 0.1976388394832611, + "learning_rate": 3.429976371003888e-05, + "loss": 1.7474, + "step": 19993 + }, + { + "epoch": 6.1368937998772255, + "grad_norm": 0.2386583834886551, + "learning_rate": 3.429504463162764e-05, + "loss": 1.7026, + "step": 19994 + }, + { + "epoch": 6.137200736648251, + "grad_norm": 0.20853812992572784, + "learning_rate": 3.4290325708422e-05, + "loss": 1.7846, + "step": 19995 + }, + { + "epoch": 6.137507673419275, + "grad_norm": 0.24667194485664368, + "learning_rate": 3.428560694046854e-05, + "loss": 1.6446, + "step": 19996 + }, + { + "epoch": 6.137814610190301, + "grad_norm": 0.24396342039108276, + "learning_rate": 3.428088832781394e-05, + "loss": 1.7368, + "step": 19997 + }, + { + "epoch": 6.138121546961326, + "grad_norm": 0.1958172619342804, + 
"learning_rate": 3.4276169870504804e-05, + "loss": 1.7197, + "step": 19998 + }, + { + "epoch": 6.138428483732351, + "grad_norm": 0.21487464010715485, + "learning_rate": 3.427145156858778e-05, + "loss": 1.7318, + "step": 19999 + }, + { + "epoch": 6.138735420503377, + "grad_norm": 0.2152775675058365, + "learning_rate": 3.4266733422109476e-05, + "loss": 1.7924, + "step": 20000 + }, + { + "epoch": 6.139042357274401, + "grad_norm": 0.17151346802711487, + "learning_rate": 3.426201543111656e-05, + "loss": 1.6915, + "step": 20001 + }, + { + "epoch": 6.139349294045426, + "grad_norm": 0.22197338938713074, + "learning_rate": 3.425729759565563e-05, + "loss": 1.8028, + "step": 20002 + }, + { + "epoch": 6.139656230816452, + "grad_norm": 0.23111973702907562, + "learning_rate": 3.42525799157733e-05, + "loss": 1.7515, + "step": 20003 + }, + { + "epoch": 6.139963167587477, + "grad_norm": 0.2829805314540863, + "learning_rate": 3.42478623915162e-05, + "loss": 1.8379, + "step": 20004 + }, + { + "epoch": 6.140270104358502, + "grad_norm": 0.23467600345611572, + "learning_rate": 3.424314502293096e-05, + "loss": 1.7755, + "step": 20005 + }, + { + "epoch": 6.140577041129528, + "grad_norm": 0.2047930657863617, + "learning_rate": 3.42384278100642e-05, + "loss": 1.7198, + "step": 20006 + }, + { + "epoch": 6.140883977900552, + "grad_norm": 0.1893673986196518, + "learning_rate": 3.423371075296253e-05, + "loss": 1.7318, + "step": 20007 + }, + { + "epoch": 6.1411909146715775, + "grad_norm": 0.21514710783958435, + "learning_rate": 3.422899385167259e-05, + "loss": 1.7499, + "step": 20008 + }, + { + "epoch": 6.141497851442603, + "grad_norm": 0.20030297338962555, + "learning_rate": 3.422427710624095e-05, + "loss": 1.7109, + "step": 20009 + }, + { + "epoch": 6.141804788213628, + "grad_norm": 0.23581266403198242, + "learning_rate": 3.421956051671426e-05, + "loss": 1.7834, + "step": 20010 + }, + { + "epoch": 6.1421117249846535, + "grad_norm": 0.22492484748363495, + "learning_rate": 3.421484408313911e-05, 
+ "loss": 1.785, + "step": 20011 + }, + { + "epoch": 6.142418661755678, + "grad_norm": 0.34137019515037537, + "learning_rate": 3.421012780556215e-05, + "loss": 1.8101, + "step": 20012 + }, + { + "epoch": 6.142725598526703, + "grad_norm": 0.28489169478416443, + "learning_rate": 3.420541168402994e-05, + "loss": 1.7945, + "step": 20013 + }, + { + "epoch": 6.143032535297729, + "grad_norm": 0.259362131357193, + "learning_rate": 3.420069571858913e-05, + "loss": 1.7011, + "step": 20014 + }, + { + "epoch": 6.143339472068754, + "grad_norm": 0.3628309667110443, + "learning_rate": 3.419597990928628e-05, + "loss": 1.8273, + "step": 20015 + }, + { + "epoch": 6.143646408839779, + "grad_norm": 0.22306841611862183, + "learning_rate": 3.419126425616803e-05, + "loss": 1.7447, + "step": 20016 + }, + { + "epoch": 6.143953345610804, + "grad_norm": 0.36336812376976013, + "learning_rate": 3.4186548759280964e-05, + "loss": 1.7076, + "step": 20017 + }, + { + "epoch": 6.144260282381829, + "grad_norm": 0.23167413473129272, + "learning_rate": 3.418183341867172e-05, + "loss": 1.6924, + "step": 20018 + }, + { + "epoch": 6.144567219152854, + "grad_norm": 0.2541113495826721, + "learning_rate": 3.417711823438686e-05, + "loss": 1.755, + "step": 20019 + }, + { + "epoch": 6.14487415592388, + "grad_norm": 0.3733784854412079, + "learning_rate": 3.4172403206472975e-05, + "loss": 1.7087, + "step": 20020 + }, + { + "epoch": 6.145181092694905, + "grad_norm": 0.1940508335828781, + "learning_rate": 3.416768833497669e-05, + "loss": 1.717, + "step": 20021 + }, + { + "epoch": 6.14548802946593, + "grad_norm": 0.2707524001598358, + "learning_rate": 3.416297361994457e-05, + "loss": 1.7422, + "step": 20022 + }, + { + "epoch": 6.145794966236955, + "grad_norm": 0.25535452365875244, + "learning_rate": 3.415825906142326e-05, + "loss": 1.6915, + "step": 20023 + }, + { + "epoch": 6.14610190300798, + "grad_norm": 0.24094220995903015, + "learning_rate": 3.415354465945929e-05, + "loss": 1.7192, + "step": 20024 + }, + { + 
"epoch": 6.1464088397790055, + "grad_norm": 0.28329676389694214, + "learning_rate": 3.4148830414099306e-05, + "loss": 1.7272, + "step": 20025 + }, + { + "epoch": 6.146715776550031, + "grad_norm": 0.217180535197258, + "learning_rate": 3.414411632538984e-05, + "loss": 1.7195, + "step": 20026 + }, + { + "epoch": 6.147022713321056, + "grad_norm": 0.22693867981433868, + "learning_rate": 3.413940239337753e-05, + "loss": 1.6889, + "step": 20027 + }, + { + "epoch": 6.147329650092081, + "grad_norm": 0.30376315116882324, + "learning_rate": 3.413468861810892e-05, + "loss": 1.7741, + "step": 20028 + }, + { + "epoch": 6.147636586863106, + "grad_norm": 0.1928185671567917, + "learning_rate": 3.412997499963065e-05, + "loss": 1.6986, + "step": 20029 + }, + { + "epoch": 6.147943523634131, + "grad_norm": 0.260929137468338, + "learning_rate": 3.412526153798924e-05, + "loss": 1.7044, + "step": 20030 + }, + { + "epoch": 6.148250460405157, + "grad_norm": 0.23274847865104675, + "learning_rate": 3.4120548233231326e-05, + "loss": 1.7626, + "step": 20031 + }, + { + "epoch": 6.148557397176182, + "grad_norm": 0.2389308512210846, + "learning_rate": 3.411583508540344e-05, + "loss": 1.71, + "step": 20032 + }, + { + "epoch": 6.148864333947207, + "grad_norm": 0.2745562195777893, + "learning_rate": 3.411112209455219e-05, + "loss": 1.7144, + "step": 20033 + }, + { + "epoch": 6.149171270718232, + "grad_norm": 0.2369096428155899, + "learning_rate": 3.4106409260724135e-05, + "loss": 1.7879, + "step": 20034 + }, + { + "epoch": 6.149478207489257, + "grad_norm": 0.3103141486644745, + "learning_rate": 3.4101696583965874e-05, + "loss": 1.7862, + "step": 20035 + }, + { + "epoch": 6.149785144260282, + "grad_norm": 0.18625277280807495, + "learning_rate": 3.409698406432397e-05, + "loss": 1.7717, + "step": 20036 + }, + { + "epoch": 6.150092081031308, + "grad_norm": 0.2539508640766144, + "learning_rate": 3.409227170184497e-05, + "loss": 1.7023, + "step": 20037 + }, + { + "epoch": 6.150399017802333, + "grad_norm": 
0.2185351699590683, + "learning_rate": 3.4087559496575474e-05, + "loss": 1.7283, + "step": 20038 + }, + { + "epoch": 6.150705954573358, + "grad_norm": 0.21225227415561676, + "learning_rate": 3.408284744856204e-05, + "loss": 1.7055, + "step": 20039 + }, + { + "epoch": 6.151012891344383, + "grad_norm": 0.23623189330101013, + "learning_rate": 3.407813555785125e-05, + "loss": 1.6862, + "step": 20040 + }, + { + "epoch": 6.151319828115408, + "grad_norm": 0.19061312079429626, + "learning_rate": 3.4073423824489634e-05, + "loss": 1.7501, + "step": 20041 + }, + { + "epoch": 6.151626764886434, + "grad_norm": 0.22176402807235718, + "learning_rate": 3.4068712248523804e-05, + "loss": 1.7417, + "step": 20042 + }, + { + "epoch": 6.151933701657459, + "grad_norm": 0.20093770325183868, + "learning_rate": 3.406400083000028e-05, + "loss": 1.7283, + "step": 20043 + }, + { + "epoch": 6.152240638428483, + "grad_norm": 0.21968910098075867, + "learning_rate": 3.4059289568965635e-05, + "loss": 1.7187, + "step": 20044 + }, + { + "epoch": 6.152547575199509, + "grad_norm": 0.19038841128349304, + "learning_rate": 3.4054578465466435e-05, + "loss": 1.7131, + "step": 20045 + }, + { + "epoch": 6.152854511970534, + "grad_norm": 0.2239457368850708, + "learning_rate": 3.404986751954925e-05, + "loss": 1.7643, + "step": 20046 + }, + { + "epoch": 6.153161448741559, + "grad_norm": 0.2357017546892166, + "learning_rate": 3.404515673126061e-05, + "loss": 1.7196, + "step": 20047 + }, + { + "epoch": 6.153468385512585, + "grad_norm": 0.2633310556411743, + "learning_rate": 3.4040446100647104e-05, + "loss": 1.7613, + "step": 20048 + }, + { + "epoch": 6.153775322283609, + "grad_norm": 0.28470975160598755, + "learning_rate": 3.403573562775524e-05, + "loss": 1.7564, + "step": 20049 + }, + { + "epoch": 6.1540822590546345, + "grad_norm": 0.37435805797576904, + "learning_rate": 3.40310253126316e-05, + "loss": 1.8365, + "step": 20050 + }, + { + "epoch": 6.15438919582566, + "grad_norm": 0.1706259697675705, + 
"learning_rate": 3.402631515532272e-05, + "loss": 1.7373, + "step": 20051 + }, + { + "epoch": 6.154696132596685, + "grad_norm": 0.30885928869247437, + "learning_rate": 3.402160515587518e-05, + "loss": 1.7152, + "step": 20052 + }, + { + "epoch": 6.1550030693677105, + "grad_norm": 0.21448500454425812, + "learning_rate": 3.40168953143355e-05, + "loss": 1.7463, + "step": 20053 + }, + { + "epoch": 6.155310006138736, + "grad_norm": 0.23774586617946625, + "learning_rate": 3.4012185630750204e-05, + "loss": 1.7268, + "step": 20054 + }, + { + "epoch": 6.15561694290976, + "grad_norm": 0.1943385899066925, + "learning_rate": 3.400747610516588e-05, + "loss": 1.6578, + "step": 20055 + }, + { + "epoch": 6.155923879680786, + "grad_norm": 0.27488210797309875, + "learning_rate": 3.400276673762903e-05, + "loss": 1.8204, + "step": 20056 + }, + { + "epoch": 6.156230816451811, + "grad_norm": 0.1871461570262909, + "learning_rate": 3.3998057528186244e-05, + "loss": 1.6775, + "step": 20057 + }, + { + "epoch": 6.156537753222836, + "grad_norm": 0.23566775023937225, + "learning_rate": 3.399334847688401e-05, + "loss": 1.7089, + "step": 20058 + }, + { + "epoch": 6.156844689993862, + "grad_norm": 0.26842471957206726, + "learning_rate": 3.398863958376891e-05, + "loss": 1.7554, + "step": 20059 + }, + { + "epoch": 6.157151626764886, + "grad_norm": 0.19267809391021729, + "learning_rate": 3.3983930848887435e-05, + "loss": 1.6709, + "step": 20060 + }, + { + "epoch": 6.157458563535911, + "grad_norm": 0.21130084991455078, + "learning_rate": 3.3979222272286156e-05, + "loss": 1.7312, + "step": 20061 + }, + { + "epoch": 6.157765500306937, + "grad_norm": 0.2322172224521637, + "learning_rate": 3.397451385401158e-05, + "loss": 1.8069, + "step": 20062 + }, + { + "epoch": 6.158072437077962, + "grad_norm": 0.21852418780326843, + "learning_rate": 3.396980559411027e-05, + "loss": 1.715, + "step": 20063 + }, + { + "epoch": 6.158379373848987, + "grad_norm": 0.21385829150676727, + "learning_rate": 
3.3965097492628714e-05, + "loss": 1.6804, + "step": 20064 + }, + { + "epoch": 6.158686310620013, + "grad_norm": 0.21639080345630646, + "learning_rate": 3.3960389549613494e-05, + "loss": 1.655, + "step": 20065 + }, + { + "epoch": 6.158993247391037, + "grad_norm": 0.19219942390918732, + "learning_rate": 3.395568176511107e-05, + "loss": 1.7325, + "step": 20066 + }, + { + "epoch": 6.1593001841620625, + "grad_norm": 0.21853557229042053, + "learning_rate": 3.3950974139168024e-05, + "loss": 1.7204, + "step": 20067 + }, + { + "epoch": 6.159607120933088, + "grad_norm": 0.24144381284713745, + "learning_rate": 3.3946266671830854e-05, + "loss": 1.754, + "step": 20068 + }, + { + "epoch": 6.159914057704113, + "grad_norm": 0.2014230340719223, + "learning_rate": 3.394155936314609e-05, + "loss": 1.6905, + "step": 20069 + }, + { + "epoch": 6.1602209944751385, + "grad_norm": 0.26940762996673584, + "learning_rate": 3.393685221316025e-05, + "loss": 1.729, + "step": 20070 + }, + { + "epoch": 6.160527931246163, + "grad_norm": 0.1937808394432068, + "learning_rate": 3.3932145221919843e-05, + "loss": 1.7492, + "step": 20071 + }, + { + "epoch": 6.160834868017188, + "grad_norm": 0.2586243450641632, + "learning_rate": 3.39274383894714e-05, + "loss": 1.7706, + "step": 20072 + }, + { + "epoch": 6.161141804788214, + "grad_norm": 0.21995361149311066, + "learning_rate": 3.3922731715861416e-05, + "loss": 1.7716, + "step": 20073 + }, + { + "epoch": 6.161448741559239, + "grad_norm": 0.22915497422218323, + "learning_rate": 3.391802520113645e-05, + "loss": 1.716, + "step": 20074 + }, + { + "epoch": 6.161755678330264, + "grad_norm": 0.24317315220832825, + "learning_rate": 3.3913318845342956e-05, + "loss": 1.7392, + "step": 20075 + }, + { + "epoch": 6.162062615101289, + "grad_norm": 0.20439307391643524, + "learning_rate": 3.390861264852749e-05, + "loss": 1.7076, + "step": 20076 + }, + { + "epoch": 6.162369551872314, + "grad_norm": 0.2197176069021225, + "learning_rate": 3.3903906610736534e-05, + "loss": 
1.7334, + "step": 20077 + }, + { + "epoch": 6.162676488643339, + "grad_norm": 0.21651993691921234, + "learning_rate": 3.389920073201662e-05, + "loss": 1.7651, + "step": 20078 + }, + { + "epoch": 6.162983425414365, + "grad_norm": 0.1999540627002716, + "learning_rate": 3.389449501241424e-05, + "loss": 1.7031, + "step": 20079 + }, + { + "epoch": 6.16329036218539, + "grad_norm": 0.21965044736862183, + "learning_rate": 3.38897894519759e-05, + "loss": 1.7243, + "step": 20080 + }, + { + "epoch": 6.163597298956415, + "grad_norm": 0.20127563178539276, + "learning_rate": 3.388508405074808e-05, + "loss": 1.693, + "step": 20081 + }, + { + "epoch": 6.16390423572744, + "grad_norm": 0.2143397182226181, + "learning_rate": 3.3880378808777336e-05, + "loss": 1.7304, + "step": 20082 + }, + { + "epoch": 6.164211172498465, + "grad_norm": 0.23116083443164825, + "learning_rate": 3.387567372611012e-05, + "loss": 1.7558, + "step": 20083 + }, + { + "epoch": 6.1645181092694905, + "grad_norm": 0.25513985753059387, + "learning_rate": 3.3870968802792946e-05, + "loss": 1.7169, + "step": 20084 + }, + { + "epoch": 6.164825046040516, + "grad_norm": 0.20549121499061584, + "learning_rate": 3.386626403887232e-05, + "loss": 1.7147, + "step": 20085 + }, + { + "epoch": 6.165131982811541, + "grad_norm": 0.2850625514984131, + "learning_rate": 3.386155943439473e-05, + "loss": 1.7865, + "step": 20086 + }, + { + "epoch": 6.165438919582566, + "grad_norm": 0.2689895033836365, + "learning_rate": 3.3856854989406675e-05, + "loss": 1.7576, + "step": 20087 + }, + { + "epoch": 6.165745856353591, + "grad_norm": 0.21677634119987488, + "learning_rate": 3.385215070395462e-05, + "loss": 1.7186, + "step": 20088 + }, + { + "epoch": 6.166052793124616, + "grad_norm": 0.19525155425071716, + "learning_rate": 3.384744657808509e-05, + "loss": 1.6713, + "step": 20089 + }, + { + "epoch": 6.166359729895642, + "grad_norm": 0.23097296059131622, + "learning_rate": 3.3842742611844555e-05, + "loss": 1.6975, + "step": 20090 + }, + { + 
"epoch": 6.166666666666667, + "grad_norm": 0.22210827469825745, + "learning_rate": 3.3838038805279516e-05, + "loss": 1.733, + "step": 20091 + }, + { + "epoch": 6.166973603437691, + "grad_norm": 0.3336607813835144, + "learning_rate": 3.383333515843643e-05, + "loss": 1.7441, + "step": 20092 + }, + { + "epoch": 6.167280540208717, + "grad_norm": 0.25274014472961426, + "learning_rate": 3.382863167136183e-05, + "loss": 1.7235, + "step": 20093 + }, + { + "epoch": 6.167587476979742, + "grad_norm": 0.3228790760040283, + "learning_rate": 3.3823928344102144e-05, + "loss": 1.8096, + "step": 20094 + }, + { + "epoch": 6.167894413750767, + "grad_norm": 0.34542208909988403, + "learning_rate": 3.381922517670389e-05, + "loss": 1.7431, + "step": 20095 + }, + { + "epoch": 6.168201350521793, + "grad_norm": 0.1921117901802063, + "learning_rate": 3.381452216921355e-05, + "loss": 1.787, + "step": 20096 + }, + { + "epoch": 6.168508287292818, + "grad_norm": 0.29019802808761597, + "learning_rate": 3.380981932167757e-05, + "loss": 1.7122, + "step": 20097 + }, + { + "epoch": 6.1688152240638425, + "grad_norm": 0.17999929189682007, + "learning_rate": 3.380511663414244e-05, + "loss": 1.7153, + "step": 20098 + }, + { + "epoch": 6.169122160834868, + "grad_norm": 0.2641841471195221, + "learning_rate": 3.380041410665466e-05, + "loss": 1.7317, + "step": 20099 + }, + { + "epoch": 6.169429097605893, + "grad_norm": 0.25492918491363525, + "learning_rate": 3.379571173926067e-05, + "loss": 1.6975, + "step": 20100 + }, + { + "epoch": 6.1697360343769185, + "grad_norm": 0.2554764151573181, + "learning_rate": 3.379100953200697e-05, + "loss": 1.7539, + "step": 20101 + }, + { + "epoch": 6.170042971147944, + "grad_norm": 0.2339072823524475, + "learning_rate": 3.378630748493999e-05, + "loss": 1.6871, + "step": 20102 + }, + { + "epoch": 6.170349907918968, + "grad_norm": 0.19663162529468536, + "learning_rate": 3.3781605598106236e-05, + "loss": 1.7419, + "step": 20103 + }, + { + "epoch": 6.170656844689994, + 
"grad_norm": 0.2479846328496933, + "learning_rate": 3.3776903871552166e-05, + "loss": 1.7849, + "step": 20104 + }, + { + "epoch": 6.170963781461019, + "grad_norm": 0.18630735576152802, + "learning_rate": 3.377220230532423e-05, + "loss": 1.7412, + "step": 20105 + }, + { + "epoch": 6.171270718232044, + "grad_norm": 0.2211095094680786, + "learning_rate": 3.376750089946892e-05, + "loss": 1.7445, + "step": 20106 + }, + { + "epoch": 6.17157765500307, + "grad_norm": 0.20783299207687378, + "learning_rate": 3.3762799654032653e-05, + "loss": 1.7346, + "step": 20107 + }, + { + "epoch": 6.171884591774095, + "grad_norm": 0.18022862076759338, + "learning_rate": 3.3758098569061934e-05, + "loss": 1.7083, + "step": 20108 + }, + { + "epoch": 6.172191528545119, + "grad_norm": 0.23707088828086853, + "learning_rate": 3.375339764460319e-05, + "loss": 1.8542, + "step": 20109 + }, + { + "epoch": 6.172498465316145, + "grad_norm": 0.2289234846830368, + "learning_rate": 3.3748696880702913e-05, + "loss": 1.7564, + "step": 20110 + }, + { + "epoch": 6.17280540208717, + "grad_norm": 0.28396767377853394, + "learning_rate": 3.374399627740752e-05, + "loss": 1.7349, + "step": 20111 + }, + { + "epoch": 6.173112338858195, + "grad_norm": 0.20154817402362823, + "learning_rate": 3.373929583476351e-05, + "loss": 1.7356, + "step": 20112 + }, + { + "epoch": 6.173419275629221, + "grad_norm": 0.22590605914592743, + "learning_rate": 3.373459555281728e-05, + "loss": 1.7291, + "step": 20113 + }, + { + "epoch": 6.173726212400245, + "grad_norm": 0.2145034223794937, + "learning_rate": 3.372989543161532e-05, + "loss": 1.7544, + "step": 20114 + }, + { + "epoch": 6.1740331491712706, + "grad_norm": 0.26797109842300415, + "learning_rate": 3.372519547120407e-05, + "loss": 1.743, + "step": 20115 + }, + { + "epoch": 6.174340085942296, + "grad_norm": 0.2795363664627075, + "learning_rate": 3.372049567162999e-05, + "loss": 1.7278, + "step": 20116 + }, + { + "epoch": 6.174647022713321, + "grad_norm": 0.21436716616153717, + 
"learning_rate": 3.3715796032939494e-05, + "loss": 1.7306, + "step": 20117 + }, + { + "epoch": 6.1749539594843466, + "grad_norm": 0.2593919336795807, + "learning_rate": 3.3711096555179064e-05, + "loss": 1.7323, + "step": 20118 + }, + { + "epoch": 6.175260896255371, + "grad_norm": 0.19639115035533905, + "learning_rate": 3.3706397238395124e-05, + "loss": 1.7444, + "step": 20119 + }, + { + "epoch": 6.175567833026396, + "grad_norm": 0.23408278822898865, + "learning_rate": 3.370169808263409e-05, + "loss": 1.7461, + "step": 20120 + }, + { + "epoch": 6.175874769797422, + "grad_norm": 0.21200022101402283, + "learning_rate": 3.369699908794246e-05, + "loss": 1.7588, + "step": 20121 + }, + { + "epoch": 6.176181706568447, + "grad_norm": 0.17609953880310059, + "learning_rate": 3.369230025436662e-05, + "loss": 1.6608, + "step": 20122 + }, + { + "epoch": 6.176488643339472, + "grad_norm": 0.19895964860916138, + "learning_rate": 3.3687601581953046e-05, + "loss": 1.729, + "step": 20123 + }, + { + "epoch": 6.176795580110497, + "grad_norm": 0.22833310067653656, + "learning_rate": 3.368290307074814e-05, + "loss": 1.7148, + "step": 20124 + }, + { + "epoch": 6.177102516881522, + "grad_norm": 0.1847219169139862, + "learning_rate": 3.367820472079835e-05, + "loss": 1.6894, + "step": 20125 + }, + { + "epoch": 6.1774094536525475, + "grad_norm": 0.20269884169101715, + "learning_rate": 3.36735065321501e-05, + "loss": 1.794, + "step": 20126 + }, + { + "epoch": 6.177716390423573, + "grad_norm": 0.19277122616767883, + "learning_rate": 3.3668808504849845e-05, + "loss": 1.6936, + "step": 20127 + }, + { + "epoch": 6.178023327194598, + "grad_norm": 0.23804394900798798, + "learning_rate": 3.3664110638943985e-05, + "loss": 1.746, + "step": 20128 + }, + { + "epoch": 6.1783302639656235, + "grad_norm": 0.20946018397808075, + "learning_rate": 3.365941293447897e-05, + "loss": 1.6952, + "step": 20129 + }, + { + "epoch": 6.178637200736648, + "grad_norm": 0.21680596470832825, + "learning_rate": 
3.36547153915012e-05, + "loss": 1.7709, + "step": 20130 + }, + { + "epoch": 6.178944137507673, + "grad_norm": 0.22549709677696228, + "learning_rate": 3.365001801005712e-05, + "loss": 1.6814, + "step": 20131 + }, + { + "epoch": 6.179251074278699, + "grad_norm": 0.20660072565078735, + "learning_rate": 3.3645320790193136e-05, + "loss": 1.6992, + "step": 20132 + }, + { + "epoch": 6.179558011049724, + "grad_norm": 0.23697195947170258, + "learning_rate": 3.36406237319557e-05, + "loss": 1.7325, + "step": 20133 + }, + { + "epoch": 6.179864947820749, + "grad_norm": 0.20847748219966888, + "learning_rate": 3.363592683539118e-05, + "loss": 1.7066, + "step": 20134 + }, + { + "epoch": 6.180171884591774, + "grad_norm": 0.24317312240600586, + "learning_rate": 3.363123010054605e-05, + "loss": 1.7259, + "step": 20135 + }, + { + "epoch": 6.180478821362799, + "grad_norm": 0.22137925028800964, + "learning_rate": 3.3626533527466686e-05, + "loss": 1.7492, + "step": 20136 + }, + { + "epoch": 6.180785758133824, + "grad_norm": 0.23857460916042328, + "learning_rate": 3.362183711619951e-05, + "loss": 1.6671, + "step": 20137 + }, + { + "epoch": 6.18109269490485, + "grad_norm": 0.20017468929290771, + "learning_rate": 3.361714086679095e-05, + "loss": 1.7151, + "step": 20138 + }, + { + "epoch": 6.181399631675875, + "grad_norm": 0.21566617488861084, + "learning_rate": 3.361244477928739e-05, + "loss": 1.7659, + "step": 20139 + }, + { + "epoch": 6.1817065684469, + "grad_norm": 0.21695555746555328, + "learning_rate": 3.360774885373528e-05, + "loss": 1.7463, + "step": 20140 + }, + { + "epoch": 6.182013505217925, + "grad_norm": 0.19326116144657135, + "learning_rate": 3.360305309018098e-05, + "loss": 1.7182, + "step": 20141 + }, + { + "epoch": 6.18232044198895, + "grad_norm": 0.2135429084300995, + "learning_rate": 3.359835748867093e-05, + "loss": 1.8001, + "step": 20142 + }, + { + "epoch": 6.1826273787599755, + "grad_norm": 0.20097343623638153, + "learning_rate": 3.359366204925151e-05, + "loss": 1.7442, 
+ "step": 20143 + }, + { + "epoch": 6.182934315531001, + "grad_norm": 0.212847501039505, + "learning_rate": 3.358896677196916e-05, + "loss": 1.7418, + "step": 20144 + }, + { + "epoch": 6.183241252302026, + "grad_norm": 0.18414677679538727, + "learning_rate": 3.358427165687024e-05, + "loss": 1.6813, + "step": 20145 + }, + { + "epoch": 6.183548189073051, + "grad_norm": 0.23170427978038788, + "learning_rate": 3.357957670400119e-05, + "loss": 1.7722, + "step": 20146 + }, + { + "epoch": 6.183855125844076, + "grad_norm": 0.28952550888061523, + "learning_rate": 3.357488191340837e-05, + "loss": 1.7785, + "step": 20147 + }, + { + "epoch": 6.184162062615101, + "grad_norm": 0.2126605361700058, + "learning_rate": 3.35701872851382e-05, + "loss": 1.7064, + "step": 20148 + }, + { + "epoch": 6.184468999386127, + "grad_norm": 0.2376919537782669, + "learning_rate": 3.356549281923706e-05, + "loss": 1.7322, + "step": 20149 + }, + { + "epoch": 6.184775936157152, + "grad_norm": 0.24168729782104492, + "learning_rate": 3.3560798515751375e-05, + "loss": 1.7296, + "step": 20150 + }, + { + "epoch": 6.185082872928176, + "grad_norm": 0.19746467471122742, + "learning_rate": 3.355610437472749e-05, + "loss": 1.7816, + "step": 20151 + }, + { + "epoch": 6.185389809699202, + "grad_norm": 0.2399774193763733, + "learning_rate": 3.3551410396211844e-05, + "loss": 1.7309, + "step": 20152 + }, + { + "epoch": 6.185696746470227, + "grad_norm": 0.20560777187347412, + "learning_rate": 3.3546716580250785e-05, + "loss": 1.7134, + "step": 20153 + }, + { + "epoch": 6.186003683241252, + "grad_norm": 0.22640523314476013, + "learning_rate": 3.354202292689072e-05, + "loss": 1.7572, + "step": 20154 + }, + { + "epoch": 6.186310620012278, + "grad_norm": 0.20796974003314972, + "learning_rate": 3.353732943617803e-05, + "loss": 1.6897, + "step": 20155 + }, + { + "epoch": 6.186617556783303, + "grad_norm": 0.19902797043323517, + "learning_rate": 3.35326361081591e-05, + "loss": 1.6836, + "step": 20156 + }, + { + "epoch": 
6.1869244935543275, + "grad_norm": 0.30999818444252014, + "learning_rate": 3.352794294288032e-05, + "loss": 1.7704, + "step": 20157 + }, + { + "epoch": 6.187231430325353, + "grad_norm": 0.20634675025939941, + "learning_rate": 3.3523249940388045e-05, + "loss": 1.7599, + "step": 20158 + }, + { + "epoch": 6.187538367096378, + "grad_norm": 0.25650453567504883, + "learning_rate": 3.3518557100728674e-05, + "loss": 1.7441, + "step": 20159 + }, + { + "epoch": 6.1878453038674035, + "grad_norm": 0.2400079369544983, + "learning_rate": 3.351386442394858e-05, + "loss": 1.6836, + "step": 20160 + }, + { + "epoch": 6.188152240638429, + "grad_norm": 0.23734217882156372, + "learning_rate": 3.350917191009416e-05, + "loss": 1.7, + "step": 20161 + }, + { + "epoch": 6.188459177409453, + "grad_norm": 0.29579323530197144, + "learning_rate": 3.3504479559211755e-05, + "loss": 1.71, + "step": 20162 + }, + { + "epoch": 6.188766114180479, + "grad_norm": 0.18999184668064117, + "learning_rate": 3.349978737134776e-05, + "loss": 1.7396, + "step": 20163 + }, + { + "epoch": 6.189073050951504, + "grad_norm": 0.26760223507881165, + "learning_rate": 3.3495095346548525e-05, + "loss": 1.7846, + "step": 20164 + }, + { + "epoch": 6.189379987722529, + "grad_norm": 0.18416397273540497, + "learning_rate": 3.349040348486044e-05, + "loss": 1.6911, + "step": 20165 + }, + { + "epoch": 6.189686924493555, + "grad_norm": 0.23761679232120514, + "learning_rate": 3.348571178632986e-05, + "loss": 1.6776, + "step": 20166 + }, + { + "epoch": 6.189993861264579, + "grad_norm": 0.2056473195552826, + "learning_rate": 3.348102025100316e-05, + "loss": 1.697, + "step": 20167 + }, + { + "epoch": 6.190300798035604, + "grad_norm": 0.23916250467300415, + "learning_rate": 3.3476328878926685e-05, + "loss": 1.7943, + "step": 20168 + }, + { + "epoch": 6.19060773480663, + "grad_norm": 0.2205415964126587, + "learning_rate": 3.347163767014684e-05, + "loss": 1.8037, + "step": 20169 + }, + { + "epoch": 6.190914671577655, + "grad_norm": 
0.28907346725463867, + "learning_rate": 3.346694662470995e-05, + "loss": 1.6875, + "step": 20170 + }, + { + "epoch": 6.19122160834868, + "grad_norm": 0.2382480502128601, + "learning_rate": 3.3462255742662364e-05, + "loss": 1.7116, + "step": 20171 + }, + { + "epoch": 6.191528545119706, + "grad_norm": 0.25309205055236816, + "learning_rate": 3.3457565024050485e-05, + "loss": 1.7584, + "step": 20172 + }, + { + "epoch": 6.19183548189073, + "grad_norm": 0.3959091901779175, + "learning_rate": 3.3452874468920626e-05, + "loss": 1.7054, + "step": 20173 + }, + { + "epoch": 6.1921424186617555, + "grad_norm": 0.22697016596794128, + "learning_rate": 3.344818407731918e-05, + "loss": 1.7373, + "step": 20174 + }, + { + "epoch": 6.192449355432781, + "grad_norm": 0.298178493976593, + "learning_rate": 3.3443493849292465e-05, + "loss": 1.7192, + "step": 20175 + }, + { + "epoch": 6.192756292203806, + "grad_norm": 0.2742854058742523, + "learning_rate": 3.343880378488685e-05, + "loss": 1.7538, + "step": 20176 + }, + { + "epoch": 6.1930632289748315, + "grad_norm": 0.23367546498775482, + "learning_rate": 3.343411388414867e-05, + "loss": 1.694, + "step": 20177 + }, + { + "epoch": 6.193370165745856, + "grad_norm": 0.2932305932044983, + "learning_rate": 3.342942414712431e-05, + "loss": 1.7291, + "step": 20178 + }, + { + "epoch": 6.193677102516881, + "grad_norm": 0.24306413531303406, + "learning_rate": 3.342473457386007e-05, + "loss": 1.6959, + "step": 20179 + }, + { + "epoch": 6.193984039287907, + "grad_norm": 0.30828577280044556, + "learning_rate": 3.3420045164402344e-05, + "loss": 1.6848, + "step": 20180 + }, + { + "epoch": 6.194290976058932, + "grad_norm": 0.18766994774341583, + "learning_rate": 3.341535591879743e-05, + "loss": 1.7261, + "step": 20181 + }, + { + "epoch": 6.194597912829957, + "grad_norm": 0.300778329372406, + "learning_rate": 3.3410666837091696e-05, + "loss": 1.7539, + "step": 20182 + }, + { + "epoch": 6.194904849600983, + "grad_norm": 0.20148977637290955, + "learning_rate": 
3.340597791933147e-05, + "loss": 1.7496, + "step": 20183 + }, + { + "epoch": 6.195211786372007, + "grad_norm": 0.2746329605579376, + "learning_rate": 3.340128916556311e-05, + "loss": 1.6458, + "step": 20184 + }, + { + "epoch": 6.195518723143032, + "grad_norm": 0.2715265452861786, + "learning_rate": 3.339660057583292e-05, + "loss": 1.7799, + "step": 20185 + }, + { + "epoch": 6.195825659914058, + "grad_norm": 0.2145555317401886, + "learning_rate": 3.339191215018728e-05, + "loss": 1.6854, + "step": 20186 + }, + { + "epoch": 6.196132596685083, + "grad_norm": 0.3018960654735565, + "learning_rate": 3.338722388867248e-05, + "loss": 1.7569, + "step": 20187 + }, + { + "epoch": 6.196439533456108, + "grad_norm": 0.24876931309700012, + "learning_rate": 3.338253579133487e-05, + "loss": 1.7434, + "step": 20188 + }, + { + "epoch": 6.196746470227133, + "grad_norm": 0.3609273433685303, + "learning_rate": 3.337784785822079e-05, + "loss": 1.737, + "step": 20189 + }, + { + "epoch": 6.197053406998158, + "grad_norm": 0.21586830914020538, + "learning_rate": 3.337316008937655e-05, + "loss": 1.7553, + "step": 20190 + }, + { + "epoch": 6.1973603437691835, + "grad_norm": 0.23542988300323486, + "learning_rate": 3.3368472484848504e-05, + "loss": 1.7174, + "step": 20191 + }, + { + "epoch": 6.197667280540209, + "grad_norm": 0.19861294329166412, + "learning_rate": 3.336378504468294e-05, + "loss": 1.7268, + "step": 20192 + }, + { + "epoch": 6.197974217311234, + "grad_norm": 0.26865682005882263, + "learning_rate": 3.335909776892622e-05, + "loss": 1.7656, + "step": 20193 + }, + { + "epoch": 6.198281154082259, + "grad_norm": 0.343078076839447, + "learning_rate": 3.3354410657624624e-05, + "loss": 1.734, + "step": 20194 + }, + { + "epoch": 6.198588090853284, + "grad_norm": 0.21613667905330658, + "learning_rate": 3.334972371082453e-05, + "loss": 1.7777, + "step": 20195 + }, + { + "epoch": 6.198895027624309, + "grad_norm": 0.22268854081630707, + "learning_rate": 3.3345036928572207e-05, + "loss": 1.667, + 
"step": 20196 + }, + { + "epoch": 6.199201964395335, + "grad_norm": 0.22870087623596191, + "learning_rate": 3.3340350310914e-05, + "loss": 1.7532, + "step": 20197 + }, + { + "epoch": 6.19950890116636, + "grad_norm": 0.1969831883907318, + "learning_rate": 3.3335663857896205e-05, + "loss": 1.7821, + "step": 20198 + }, + { + "epoch": 6.199815837937384, + "grad_norm": 0.20414133369922638, + "learning_rate": 3.3330977569565154e-05, + "loss": 1.7449, + "step": 20199 + }, + { + "epoch": 6.20012277470841, + "grad_norm": 0.21947748959064484, + "learning_rate": 3.332629144596714e-05, + "loss": 1.6888, + "step": 20200 + }, + { + "epoch": 6.200429711479435, + "grad_norm": 0.20943035185337067, + "learning_rate": 3.332160548714851e-05, + "loss": 1.7278, + "step": 20201 + }, + { + "epoch": 6.2007366482504604, + "grad_norm": 0.22410117089748383, + "learning_rate": 3.331691969315553e-05, + "loss": 1.721, + "step": 20202 + }, + { + "epoch": 6.201043585021486, + "grad_norm": 0.21422281861305237, + "learning_rate": 3.3312234064034555e-05, + "loss": 1.7199, + "step": 20203 + }, + { + "epoch": 6.201350521792511, + "grad_norm": 0.21021418273448944, + "learning_rate": 3.330754859983184e-05, + "loss": 1.7972, + "step": 20204 + }, + { + "epoch": 6.201657458563536, + "grad_norm": 0.21155185997486115, + "learning_rate": 3.330286330059371e-05, + "loss": 1.7463, + "step": 20205 + }, + { + "epoch": 6.201964395334561, + "grad_norm": 0.20241162180900574, + "learning_rate": 3.329817816636649e-05, + "loss": 1.7804, + "step": 20206 + }, + { + "epoch": 6.202271332105586, + "grad_norm": 0.19882376492023468, + "learning_rate": 3.329349319719644e-05, + "loss": 1.7564, + "step": 20207 + }, + { + "epoch": 6.202578268876612, + "grad_norm": 0.20528686046600342, + "learning_rate": 3.328880839312991e-05, + "loss": 1.751, + "step": 20208 + }, + { + "epoch": 6.202885205647637, + "grad_norm": 0.2708488404750824, + "learning_rate": 3.328412375421315e-05, + "loss": 1.8008, + "step": 20209 + }, + { + "epoch": 
6.203192142418661, + "grad_norm": 0.1986229121685028, + "learning_rate": 3.3279439280492486e-05, + "loss": 1.6833, + "step": 20210 + }, + { + "epoch": 6.203499079189687, + "grad_norm": 0.2700355350971222, + "learning_rate": 3.3274754972014186e-05, + "loss": 1.8071, + "step": 20211 + }, + { + "epoch": 6.203806015960712, + "grad_norm": 0.23060421645641327, + "learning_rate": 3.327007082882458e-05, + "loss": 1.6856, + "step": 20212 + }, + { + "epoch": 6.204112952731737, + "grad_norm": 0.20798510313034058, + "learning_rate": 3.3265386850969926e-05, + "loss": 1.7421, + "step": 20213 + }, + { + "epoch": 6.204419889502763, + "grad_norm": 0.21828265488147736, + "learning_rate": 3.3260703038496556e-05, + "loss": 1.7212, + "step": 20214 + }, + { + "epoch": 6.204726826273788, + "grad_norm": 0.1965378224849701, + "learning_rate": 3.325601939145069e-05, + "loss": 1.6987, + "step": 20215 + }, + { + "epoch": 6.2050337630448125, + "grad_norm": 0.23897121846675873, + "learning_rate": 3.325133590987868e-05, + "loss": 1.7501, + "step": 20216 + }, + { + "epoch": 6.205340699815838, + "grad_norm": 0.18647781014442444, + "learning_rate": 3.324665259382676e-05, + "loss": 1.688, + "step": 20217 + }, + { + "epoch": 6.205647636586863, + "grad_norm": 0.19906121492385864, + "learning_rate": 3.324196944334127e-05, + "loss": 1.749, + "step": 20218 + }, + { + "epoch": 6.2059545733578885, + "grad_norm": 0.2061154991388321, + "learning_rate": 3.3237286458468444e-05, + "loss": 1.757, + "step": 20219 + }, + { + "epoch": 6.206261510128914, + "grad_norm": 0.19410182535648346, + "learning_rate": 3.323260363925459e-05, + "loss": 1.6826, + "step": 20220 + }, + { + "epoch": 6.206568446899938, + "grad_norm": 0.2017979919910431, + "learning_rate": 3.322792098574597e-05, + "loss": 1.7568, + "step": 20221 + }, + { + "epoch": 6.206875383670964, + "grad_norm": 0.19491736590862274, + "learning_rate": 3.322323849798885e-05, + "loss": 1.7082, + "step": 20222 + }, + { + "epoch": 6.207182320441989, + "grad_norm": 
0.19826333224773407, + "learning_rate": 3.321855617602954e-05, + "loss": 1.7654, + "step": 20223 + }, + { + "epoch": 6.207489257213014, + "grad_norm": 0.18185383081436157, + "learning_rate": 3.321387401991428e-05, + "loss": 1.6826, + "step": 20224 + }, + { + "epoch": 6.20779619398404, + "grad_norm": 0.22402678430080414, + "learning_rate": 3.320919202968937e-05, + "loss": 1.795, + "step": 20225 + }, + { + "epoch": 6.208103130755064, + "grad_norm": 0.201541468501091, + "learning_rate": 3.320451020540105e-05, + "loss": 1.6838, + "step": 20226 + }, + { + "epoch": 6.208410067526089, + "grad_norm": 0.25479504466056824, + "learning_rate": 3.3199828547095616e-05, + "loss": 1.7881, + "step": 20227 + }, + { + "epoch": 6.208717004297115, + "grad_norm": 0.2057993859052658, + "learning_rate": 3.31951470548193e-05, + "loss": 1.737, + "step": 20228 + }, + { + "epoch": 6.20902394106814, + "grad_norm": 0.183469757437706, + "learning_rate": 3.319046572861842e-05, + "loss": 1.6989, + "step": 20229 + }, + { + "epoch": 6.209330877839165, + "grad_norm": 0.21723738312721252, + "learning_rate": 3.318578456853919e-05, + "loss": 1.7537, + "step": 20230 + }, + { + "epoch": 6.209637814610191, + "grad_norm": 0.21919457614421844, + "learning_rate": 3.318110357462791e-05, + "loss": 1.7444, + "step": 20231 + }, + { + "epoch": 6.209944751381215, + "grad_norm": 0.17009909451007843, + "learning_rate": 3.317642274693081e-05, + "loss": 1.6885, + "step": 20232 + }, + { + "epoch": 6.2102516881522405, + "grad_norm": 0.19625195860862732, + "learning_rate": 3.317174208549416e-05, + "loss": 1.7255, + "step": 20233 + }, + { + "epoch": 6.210558624923266, + "grad_norm": 0.2131364941596985, + "learning_rate": 3.316706159036422e-05, + "loss": 1.7047, + "step": 20234 + }, + { + "epoch": 6.210865561694291, + "grad_norm": 0.18454425036907196, + "learning_rate": 3.316238126158725e-05, + "loss": 1.7536, + "step": 20235 + }, + { + "epoch": 6.2111724984653165, + "grad_norm": 0.2124820202589035, + "learning_rate": 
3.3157701099209485e-05, + "loss": 1.7456, + "step": 20236 + }, + { + "epoch": 6.211479435236341, + "grad_norm": 0.1929594725370407, + "learning_rate": 3.3153021103277206e-05, + "loss": 1.7118, + "step": 20237 + }, + { + "epoch": 6.211786372007366, + "grad_norm": 0.19876480102539062, + "learning_rate": 3.314834127383664e-05, + "loss": 1.6855, + "step": 20238 + }, + { + "epoch": 6.212093308778392, + "grad_norm": 0.18902665376663208, + "learning_rate": 3.314366161093403e-05, + "loss": 1.7052, + "step": 20239 + }, + { + "epoch": 6.212400245549417, + "grad_norm": 0.1859758198261261, + "learning_rate": 3.313898211461566e-05, + "loss": 1.7277, + "step": 20240 + }, + { + "epoch": 6.212707182320442, + "grad_norm": 0.2160472422838211, + "learning_rate": 3.313430278492773e-05, + "loss": 1.6787, + "step": 20241 + }, + { + "epoch": 6.213014119091467, + "grad_norm": 0.24482262134552002, + "learning_rate": 3.312962362191652e-05, + "loss": 1.7439, + "step": 20242 + }, + { + "epoch": 6.213321055862492, + "grad_norm": 0.2343531847000122, + "learning_rate": 3.312494462562824e-05, + "loss": 1.7981, + "step": 20243 + }, + { + "epoch": 6.213627992633517, + "grad_norm": 0.2385960817337036, + "learning_rate": 3.3120265796109163e-05, + "loss": 1.7144, + "step": 20244 + }, + { + "epoch": 6.213934929404543, + "grad_norm": 0.21878042817115784, + "learning_rate": 3.3115587133405503e-05, + "loss": 1.7057, + "step": 20245 + }, + { + "epoch": 6.214241866175568, + "grad_norm": 0.23426075279712677, + "learning_rate": 3.311090863756351e-05, + "loss": 1.7372, + "step": 20246 + }, + { + "epoch": 6.214548802946593, + "grad_norm": 0.2369524985551834, + "learning_rate": 3.310623030862942e-05, + "loss": 1.7502, + "step": 20247 + }, + { + "epoch": 6.214855739717618, + "grad_norm": 0.31635788083076477, + "learning_rate": 3.3101552146649474e-05, + "loss": 1.7616, + "step": 20248 + }, + { + "epoch": 6.215162676488643, + "grad_norm": 0.2312999814748764, + "learning_rate": 3.309687415166986e-05, + "loss": 
1.6991, + "step": 20249 + }, + { + "epoch": 6.2154696132596685, + "grad_norm": 0.23423358798027039, + "learning_rate": 3.309219632373688e-05, + "loss": 1.7737, + "step": 20250 + }, + { + "epoch": 6.215776550030694, + "grad_norm": 0.28763437271118164, + "learning_rate": 3.308751866289671e-05, + "loss": 1.7822, + "step": 20251 + }, + { + "epoch": 6.216083486801719, + "grad_norm": 0.20754525065422058, + "learning_rate": 3.30828411691956e-05, + "loss": 1.7427, + "step": 20252 + }, + { + "epoch": 6.216390423572744, + "grad_norm": 0.31858858466148376, + "learning_rate": 3.307816384267975e-05, + "loss": 1.7384, + "step": 20253 + }, + { + "epoch": 6.216697360343769, + "grad_norm": 0.21968062222003937, + "learning_rate": 3.307348668339543e-05, + "loss": 1.6896, + "step": 20254 + }, + { + "epoch": 6.217004297114794, + "grad_norm": 0.21643556654453278, + "learning_rate": 3.306880969138882e-05, + "loss": 1.7353, + "step": 20255 + }, + { + "epoch": 6.21731123388582, + "grad_norm": 0.22141097486019135, + "learning_rate": 3.306413286670616e-05, + "loss": 1.7254, + "step": 20256 + }, + { + "epoch": 6.217618170656845, + "grad_norm": 0.17666983604431152, + "learning_rate": 3.305945620939367e-05, + "loss": 1.7198, + "step": 20257 + }, + { + "epoch": 6.21792510742787, + "grad_norm": 0.25182467699050903, + "learning_rate": 3.3054779719497544e-05, + "loss": 1.7562, + "step": 20258 + }, + { + "epoch": 6.218232044198895, + "grad_norm": 0.23481281101703644, + "learning_rate": 3.305010339706404e-05, + "loss": 1.8293, + "step": 20259 + }, + { + "epoch": 6.21853898096992, + "grad_norm": 0.23981143534183502, + "learning_rate": 3.304542724213933e-05, + "loss": 1.7619, + "step": 20260 + }, + { + "epoch": 6.218845917740945, + "grad_norm": 0.2388351708650589, + "learning_rate": 3.3040751254769665e-05, + "loss": 1.7471, + "step": 20261 + }, + { + "epoch": 6.219152854511971, + "grad_norm": 0.2039698362350464, + "learning_rate": 3.3036075435001216e-05, + "loss": 1.6893, + "step": 20262 + }, + { + 
"epoch": 6.219459791282996, + "grad_norm": 0.218357652425766, + "learning_rate": 3.3031399782880224e-05, + "loss": 1.753, + "step": 20263 + }, + { + "epoch": 6.2197667280540205, + "grad_norm": 0.25466734170913696, + "learning_rate": 3.302672429845288e-05, + "loss": 1.7496, + "step": 20264 + }, + { + "epoch": 6.220073664825046, + "grad_norm": 0.1853330284357071, + "learning_rate": 3.302204898176541e-05, + "loss": 1.7779, + "step": 20265 + }, + { + "epoch": 6.220380601596071, + "grad_norm": 0.24044091999530792, + "learning_rate": 3.3017373832863976e-05, + "loss": 1.8226, + "step": 20266 + }, + { + "epoch": 6.2206875383670965, + "grad_norm": 0.2209070324897766, + "learning_rate": 3.3012698851794835e-05, + "loss": 1.7069, + "step": 20267 + }, + { + "epoch": 6.220994475138122, + "grad_norm": 0.2775282561779022, + "learning_rate": 3.3008024038604135e-05, + "loss": 1.7048, + "step": 20268 + }, + { + "epoch": 6.221301411909146, + "grad_norm": 0.22873717546463013, + "learning_rate": 3.3003349393338116e-05, + "loss": 1.7956, + "step": 20269 + }, + { + "epoch": 6.221608348680172, + "grad_norm": 0.27883464097976685, + "learning_rate": 3.2998674916042946e-05, + "loss": 1.6955, + "step": 20270 + }, + { + "epoch": 6.221915285451197, + "grad_norm": 0.2383071482181549, + "learning_rate": 3.2994000606764865e-05, + "loss": 1.7645, + "step": 20271 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.26280200481414795, + "learning_rate": 3.298932646555003e-05, + "loss": 1.7854, + "step": 20272 + }, + { + "epoch": 6.222529158993248, + "grad_norm": 0.2387673407793045, + "learning_rate": 3.2984652492444625e-05, + "loss": 1.679, + "step": 20273 + }, + { + "epoch": 6.222836095764273, + "grad_norm": 0.2136983871459961, + "learning_rate": 3.297997868749486e-05, + "loss": 1.7313, + "step": 20274 + }, + { + "epoch": 6.223143032535297, + "grad_norm": 0.2629627585411072, + "learning_rate": 3.297530505074692e-05, + "loss": 1.7452, + "step": 20275 + }, + { + "epoch": 6.223449969306323, + 
"grad_norm": 0.22018705308437347, + "learning_rate": 3.2970631582247e-05, + "loss": 1.7368, + "step": 20276 + }, + { + "epoch": 6.223756906077348, + "grad_norm": 0.19277356564998627, + "learning_rate": 3.296595828204128e-05, + "loss": 1.7084, + "step": 20277 + }, + { + "epoch": 6.224063842848373, + "grad_norm": 0.18806682527065277, + "learning_rate": 3.2961285150175944e-05, + "loss": 1.6576, + "step": 20278 + }, + { + "epoch": 6.224370779619399, + "grad_norm": 0.2019709348678589, + "learning_rate": 3.295661218669717e-05, + "loss": 1.7594, + "step": 20279 + }, + { + "epoch": 6.224677716390423, + "grad_norm": 0.19662119448184967, + "learning_rate": 3.295193939165114e-05, + "loss": 1.6946, + "step": 20280 + }, + { + "epoch": 6.2249846531614486, + "grad_norm": 0.1880662590265274, + "learning_rate": 3.294726676508404e-05, + "loss": 1.7232, + "step": 20281 + }, + { + "epoch": 6.225291589932474, + "grad_norm": 0.23242273926734924, + "learning_rate": 3.294259430704206e-05, + "loss": 1.7331, + "step": 20282 + }, + { + "epoch": 6.225598526703499, + "grad_norm": 0.19915202260017395, + "learning_rate": 3.293792201757134e-05, + "loss": 1.7844, + "step": 20283 + }, + { + "epoch": 6.225905463474525, + "grad_norm": 0.1845373958349228, + "learning_rate": 3.2933249896718097e-05, + "loss": 1.6803, + "step": 20284 + }, + { + "epoch": 6.226212400245549, + "grad_norm": 0.19340910017490387, + "learning_rate": 3.292857794452846e-05, + "loss": 1.6929, + "step": 20285 + }, + { + "epoch": 6.226519337016574, + "grad_norm": 0.21429216861724854, + "learning_rate": 3.292390616104863e-05, + "loss": 1.6833, + "step": 20286 + }, + { + "epoch": 6.2268262737876, + "grad_norm": 0.2267037034034729, + "learning_rate": 3.291923454632476e-05, + "loss": 1.7271, + "step": 20287 + }, + { + "epoch": 6.227133210558625, + "grad_norm": 0.23121988773345947, + "learning_rate": 3.2914563100403054e-05, + "loss": 1.8443, + "step": 20288 + }, + { + "epoch": 6.22744014732965, + "grad_norm": 0.20980899035930634, + 
"learning_rate": 3.290989182332964e-05, + "loss": 1.6907, + "step": 20289 + }, + { + "epoch": 6.227747084100676, + "grad_norm": 0.28162500262260437, + "learning_rate": 3.290522071515067e-05, + "loss": 1.7497, + "step": 20290 + }, + { + "epoch": 6.2280540208717, + "grad_norm": 0.2163640707731247, + "learning_rate": 3.290054977591234e-05, + "loss": 1.736, + "step": 20291 + }, + { + "epoch": 6.2283609576427255, + "grad_norm": 0.19144479930400848, + "learning_rate": 3.289587900566079e-05, + "loss": 1.7222, + "step": 20292 + }, + { + "epoch": 6.228667894413751, + "grad_norm": 0.24952897429466248, + "learning_rate": 3.2891208404442216e-05, + "loss": 1.7095, + "step": 20293 + }, + { + "epoch": 6.228974831184776, + "grad_norm": 0.19421981275081635, + "learning_rate": 3.288653797230272e-05, + "loss": 1.7231, + "step": 20294 + }, + { + "epoch": 6.2292817679558015, + "grad_norm": 0.22837944328784943, + "learning_rate": 3.288186770928851e-05, + "loss": 1.7404, + "step": 20295 + }, + { + "epoch": 6.229588704726826, + "grad_norm": 0.2292151004076004, + "learning_rate": 3.2877197615445685e-05, + "loss": 1.6999, + "step": 20296 + }, + { + "epoch": 6.229895641497851, + "grad_norm": 0.18376365303993225, + "learning_rate": 3.2872527690820456e-05, + "loss": 1.681, + "step": 20297 + }, + { + "epoch": 6.230202578268877, + "grad_norm": 0.21331918239593506, + "learning_rate": 3.286785793545893e-05, + "loss": 1.7362, + "step": 20298 + }, + { + "epoch": 6.230509515039902, + "grad_norm": 0.21247150003910065, + "learning_rate": 3.286318834940729e-05, + "loss": 1.7816, + "step": 20299 + }, + { + "epoch": 6.230816451810927, + "grad_norm": 0.19166043400764465, + "learning_rate": 3.285851893271165e-05, + "loss": 1.7209, + "step": 20300 + }, + { + "epoch": 6.231123388581952, + "grad_norm": 0.2139919251203537, + "learning_rate": 3.2853849685418195e-05, + "loss": 1.6946, + "step": 20301 + }, + { + "epoch": 6.231430325352977, + "grad_norm": 0.20296575129032135, + "learning_rate": 
3.284918060757303e-05, + "loss": 1.6829, + "step": 20302 + }, + { + "epoch": 6.231737262124002, + "grad_norm": 0.2465996891260147, + "learning_rate": 3.2844511699222314e-05, + "loss": 1.751, + "step": 20303 + }, + { + "epoch": 6.232044198895028, + "grad_norm": 0.23327109217643738, + "learning_rate": 3.283984296041219e-05, + "loss": 1.736, + "step": 20304 + }, + { + "epoch": 6.232351135666053, + "grad_norm": 0.24316997826099396, + "learning_rate": 3.2835174391188806e-05, + "loss": 1.7187, + "step": 20305 + }, + { + "epoch": 6.232658072437078, + "grad_norm": 0.25280308723449707, + "learning_rate": 3.2830505991598294e-05, + "loss": 1.7087, + "step": 20306 + }, + { + "epoch": 6.232965009208103, + "grad_norm": 0.19143202900886536, + "learning_rate": 3.282583776168676e-05, + "loss": 1.674, + "step": 20307 + }, + { + "epoch": 6.233271945979128, + "grad_norm": 0.2667979598045349, + "learning_rate": 3.282116970150038e-05, + "loss": 1.7978, + "step": 20308 + }, + { + "epoch": 6.2335788827501535, + "grad_norm": 0.18397411704063416, + "learning_rate": 3.281650181108526e-05, + "loss": 1.7669, + "step": 20309 + }, + { + "epoch": 6.233885819521179, + "grad_norm": 0.2842588722705841, + "learning_rate": 3.281183409048756e-05, + "loss": 1.8238, + "step": 20310 + }, + { + "epoch": 6.234192756292204, + "grad_norm": 0.20290467143058777, + "learning_rate": 3.280716653975336e-05, + "loss": 1.7317, + "step": 20311 + }, + { + "epoch": 6.234499693063229, + "grad_norm": 0.224524587392807, + "learning_rate": 3.280249915892885e-05, + "loss": 1.8166, + "step": 20312 + }, + { + "epoch": 6.234806629834254, + "grad_norm": 0.28204405307769775, + "learning_rate": 3.2797831948060096e-05, + "loss": 1.7435, + "step": 20313 + }, + { + "epoch": 6.235113566605279, + "grad_norm": 0.2101798951625824, + "learning_rate": 3.2793164907193264e-05, + "loss": 1.6747, + "step": 20314 + }, + { + "epoch": 6.235420503376305, + "grad_norm": 0.1961289346218109, + "learning_rate": 3.278849803637445e-05, + "loss": 1.7131, 
+ "step": 20315 + }, + { + "epoch": 6.23572744014733, + "grad_norm": 0.30541354417800903, + "learning_rate": 3.27838313356498e-05, + "loss": 1.8036, + "step": 20316 + }, + { + "epoch": 6.236034376918354, + "grad_norm": 0.21517200767993927, + "learning_rate": 3.277916480506541e-05, + "loss": 1.7684, + "step": 20317 + }, + { + "epoch": 6.23634131368938, + "grad_norm": 0.22871750593185425, + "learning_rate": 3.2774498444667426e-05, + "loss": 1.7545, + "step": 20318 + }, + { + "epoch": 6.236648250460405, + "grad_norm": 0.24596424400806427, + "learning_rate": 3.276983225450192e-05, + "loss": 1.6705, + "step": 20319 + }, + { + "epoch": 6.23695518723143, + "grad_norm": 0.19123119115829468, + "learning_rate": 3.2765166234615044e-05, + "loss": 1.7402, + "step": 20320 + }, + { + "epoch": 6.237262124002456, + "grad_norm": 0.25287121534347534, + "learning_rate": 3.276050038505288e-05, + "loss": 1.741, + "step": 20321 + }, + { + "epoch": 6.237569060773481, + "grad_norm": 0.19741536676883698, + "learning_rate": 3.275583470586158e-05, + "loss": 1.736, + "step": 20322 + }, + { + "epoch": 6.2378759975445055, + "grad_norm": 0.24529922008514404, + "learning_rate": 3.275116919708723e-05, + "loss": 1.6696, + "step": 20323 + }, + { + "epoch": 6.238182934315531, + "grad_norm": 0.25428420305252075, + "learning_rate": 3.274650385877591e-05, + "loss": 1.696, + "step": 20324 + }, + { + "epoch": 6.238489871086556, + "grad_norm": 0.19502994418144226, + "learning_rate": 3.274183869097377e-05, + "loss": 1.6976, + "step": 20325 + }, + { + "epoch": 6.2387968078575815, + "grad_norm": 0.23710335791110992, + "learning_rate": 3.273717369372688e-05, + "loss": 1.7395, + "step": 20326 + }, + { + "epoch": 6.239103744628607, + "grad_norm": 0.20904341340065002, + "learning_rate": 3.273250886708138e-05, + "loss": 1.7455, + "step": 20327 + }, + { + "epoch": 6.239410681399631, + "grad_norm": 0.2112383097410202, + "learning_rate": 3.272784421108332e-05, + "loss": 1.7401, + "step": 20328 + }, + { + "epoch": 
6.239717618170657, + "grad_norm": 0.2310914695262909, + "learning_rate": 3.272317972577886e-05, + "loss": 1.8049, + "step": 20329 + }, + { + "epoch": 6.240024554941682, + "grad_norm": 0.18222108483314514, + "learning_rate": 3.271851541121404e-05, + "loss": 1.7119, + "step": 20330 + }, + { + "epoch": 6.240331491712707, + "grad_norm": 0.18739092350006104, + "learning_rate": 3.2713851267434984e-05, + "loss": 1.744, + "step": 20331 + }, + { + "epoch": 6.240638428483733, + "grad_norm": 0.17722012102603912, + "learning_rate": 3.2709187294487775e-05, + "loss": 1.7054, + "step": 20332 + }, + { + "epoch": 6.240945365254758, + "grad_norm": 0.18650192022323608, + "learning_rate": 3.270452349241854e-05, + "loss": 1.7272, + "step": 20333 + }, + { + "epoch": 6.241252302025782, + "grad_norm": 0.2004886120557785, + "learning_rate": 3.269985986127331e-05, + "loss": 1.6777, + "step": 20334 + }, + { + "epoch": 6.241559238796808, + "grad_norm": 0.1855446845293045, + "learning_rate": 3.269519640109823e-05, + "loss": 1.6823, + "step": 20335 + }, + { + "epoch": 6.241866175567833, + "grad_norm": 0.1950632780790329, + "learning_rate": 3.269053311193934e-05, + "loss": 1.7052, + "step": 20336 + }, + { + "epoch": 6.242173112338858, + "grad_norm": 0.19386698305606842, + "learning_rate": 3.268586999384276e-05, + "loss": 1.7431, + "step": 20337 + }, + { + "epoch": 6.242480049109884, + "grad_norm": 0.2266446053981781, + "learning_rate": 3.268120704685454e-05, + "loss": 1.735, + "step": 20338 + }, + { + "epoch": 6.242786985880908, + "grad_norm": 0.24133828282356262, + "learning_rate": 3.2676544271020814e-05, + "loss": 1.7707, + "step": 20339 + }, + { + "epoch": 6.2430939226519335, + "grad_norm": 0.22397162020206451, + "learning_rate": 3.267188166638763e-05, + "loss": 1.6943, + "step": 20340 + }, + { + "epoch": 6.243400859422959, + "grad_norm": 0.1614205688238144, + "learning_rate": 3.266721923300104e-05, + "loss": 1.6801, + "step": 20341 + }, + { + "epoch": 6.243707796193984, + "grad_norm": 
0.22376522421836853, + "learning_rate": 3.2662556970907166e-05, + "loss": 1.6933, + "step": 20342 + }, + { + "epoch": 6.2440147329650095, + "grad_norm": 0.18614265322685242, + "learning_rate": 3.265789488015205e-05, + "loss": 1.7396, + "step": 20343 + }, + { + "epoch": 6.244321669736034, + "grad_norm": 0.2385358214378357, + "learning_rate": 3.265323296078181e-05, + "loss": 1.7782, + "step": 20344 + }, + { + "epoch": 6.244628606507059, + "grad_norm": 0.24316444993019104, + "learning_rate": 3.264857121284246e-05, + "loss": 1.7443, + "step": 20345 + }, + { + "epoch": 6.244935543278085, + "grad_norm": 0.184532031416893, + "learning_rate": 3.264390963638012e-05, + "loss": 1.7603, + "step": 20346 + }, + { + "epoch": 6.24524248004911, + "grad_norm": 0.2018461376428604, + "learning_rate": 3.2639248231440825e-05, + "loss": 1.7289, + "step": 20347 + }, + { + "epoch": 6.245549416820135, + "grad_norm": 0.23732338845729828, + "learning_rate": 3.263458699807066e-05, + "loss": 1.7924, + "step": 20348 + }, + { + "epoch": 6.245856353591161, + "grad_norm": 0.19645710289478302, + "learning_rate": 3.2629925936315674e-05, + "loss": 1.6855, + "step": 20349 + }, + { + "epoch": 6.246163290362185, + "grad_norm": 0.20730608701705933, + "learning_rate": 3.262526504622196e-05, + "loss": 1.7238, + "step": 20350 + }, + { + "epoch": 6.24647022713321, + "grad_norm": 0.21139587461948395, + "learning_rate": 3.2620604327835545e-05, + "loss": 1.7173, + "step": 20351 + }, + { + "epoch": 6.246777163904236, + "grad_norm": 0.22644877433776855, + "learning_rate": 3.261594378120252e-05, + "loss": 1.7976, + "step": 20352 + }, + { + "epoch": 6.247084100675261, + "grad_norm": 0.23719535768032074, + "learning_rate": 3.2611283406368906e-05, + "loss": 1.7549, + "step": 20353 + }, + { + "epoch": 6.247391037446286, + "grad_norm": 0.2046387791633606, + "learning_rate": 3.2606623203380807e-05, + "loss": 1.7343, + "step": 20354 + }, + { + "epoch": 6.247697974217311, + "grad_norm": 0.19325366616249084, + 
"learning_rate": 3.260196317228422e-05, + "loss": 1.7352, + "step": 20355 + }, + { + "epoch": 6.248004910988336, + "grad_norm": 0.2315458059310913, + "learning_rate": 3.259730331312526e-05, + "loss": 1.7838, + "step": 20356 + }, + { + "epoch": 6.2483118477593615, + "grad_norm": 0.24549536406993866, + "learning_rate": 3.2592643625949956e-05, + "loss": 1.7418, + "step": 20357 + }, + { + "epoch": 6.248618784530387, + "grad_norm": 0.2702246606349945, + "learning_rate": 3.258798411080432e-05, + "loss": 1.7651, + "step": 20358 + }, + { + "epoch": 6.248925721301412, + "grad_norm": 0.20515258610248566, + "learning_rate": 3.2583324767734444e-05, + "loss": 1.6866, + "step": 20359 + }, + { + "epoch": 6.249232658072437, + "grad_norm": 0.2696690261363983, + "learning_rate": 3.257866559678635e-05, + "loss": 1.7446, + "step": 20360 + }, + { + "epoch": 6.249539594843462, + "grad_norm": 0.19707174599170685, + "learning_rate": 3.2574006598006114e-05, + "loss": 1.6835, + "step": 20361 + }, + { + "epoch": 6.249846531614487, + "grad_norm": 0.23478952050209045, + "learning_rate": 3.256934777143974e-05, + "loss": 1.7344, + "step": 20362 + }, + { + "epoch": 6.250153468385513, + "grad_norm": 0.24214082956314087, + "learning_rate": 3.2564689117133306e-05, + "loss": 1.722, + "step": 20363 + }, + { + "epoch": 6.250460405156538, + "grad_norm": 0.18361221253871918, + "learning_rate": 3.256003063513281e-05, + "loss": 1.7336, + "step": 20364 + }, + { + "epoch": 6.250767341927563, + "grad_norm": 0.18548928201198578, + "learning_rate": 3.255537232548433e-05, + "loss": 1.6586, + "step": 20365 + }, + { + "epoch": 6.251074278698588, + "grad_norm": 0.2121812105178833, + "learning_rate": 3.2550714188233874e-05, + "loss": 1.7273, + "step": 20366 + }, + { + "epoch": 6.251381215469613, + "grad_norm": 0.2351878583431244, + "learning_rate": 3.25460562234275e-05, + "loss": 1.7101, + "step": 20367 + }, + { + "epoch": 6.2516881522406385, + "grad_norm": 0.20723144710063934, + "learning_rate": 
3.2541398431111216e-05, + "loss": 1.7042, + "step": 20368 + }, + { + "epoch": 6.251995089011664, + "grad_norm": 0.19093643128871918, + "learning_rate": 3.2536740811331084e-05, + "loss": 1.7585, + "step": 20369 + }, + { + "epoch": 6.252302025782689, + "grad_norm": 0.27191361784935, + "learning_rate": 3.2532083364133094e-05, + "loss": 1.7734, + "step": 20370 + }, + { + "epoch": 6.252608962553714, + "grad_norm": 0.21019349992275238, + "learning_rate": 3.2527426089563306e-05, + "loss": 1.7015, + "step": 20371 + }, + { + "epoch": 6.252915899324739, + "grad_norm": 0.2300454080104828, + "learning_rate": 3.2522768987667744e-05, + "loss": 1.7311, + "step": 20372 + }, + { + "epoch": 6.253222836095764, + "grad_norm": 0.24723999202251434, + "learning_rate": 3.25181120584924e-05, + "loss": 1.674, + "step": 20373 + }, + { + "epoch": 6.25352977286679, + "grad_norm": 0.20302192866802216, + "learning_rate": 3.251345530208335e-05, + "loss": 1.6999, + "step": 20374 + }, + { + "epoch": 6.253836709637815, + "grad_norm": 0.25393861532211304, + "learning_rate": 3.250879871848655e-05, + "loss": 1.6761, + "step": 20375 + }, + { + "epoch": 6.25414364640884, + "grad_norm": 0.1879536211490631, + "learning_rate": 3.2504142307748064e-05, + "loss": 1.7233, + "step": 20376 + }, + { + "epoch": 6.254450583179865, + "grad_norm": 0.22197771072387695, + "learning_rate": 3.24994860699139e-05, + "loss": 1.6994, + "step": 20377 + }, + { + "epoch": 6.25475751995089, + "grad_norm": 0.24946242570877075, + "learning_rate": 3.249483000503008e-05, + "loss": 1.8488, + "step": 20378 + }, + { + "epoch": 6.255064456721915, + "grad_norm": 0.25218987464904785, + "learning_rate": 3.2490174113142594e-05, + "loss": 1.7947, + "step": 20379 + }, + { + "epoch": 6.255371393492941, + "grad_norm": 0.23970970511436462, + "learning_rate": 3.248551839429749e-05, + "loss": 1.785, + "step": 20380 + }, + { + "epoch": 6.255678330263966, + "grad_norm": 0.243649423122406, + "learning_rate": 3.248086284854074e-05, + "loss": 1.8089, + 
"step": 20381 + }, + { + "epoch": 6.2559852670349905, + "grad_norm": 0.18813125789165497, + "learning_rate": 3.247620747591838e-05, + "loss": 1.6892, + "step": 20382 + }, + { + "epoch": 6.256292203806016, + "grad_norm": 0.2495514154434204, + "learning_rate": 3.2471552276476404e-05, + "loss": 1.7573, + "step": 20383 + }, + { + "epoch": 6.256599140577041, + "grad_norm": 0.200107604265213, + "learning_rate": 3.2466897250260835e-05, + "loss": 1.7292, + "step": 20384 + }, + { + "epoch": 6.2569060773480665, + "grad_norm": 0.25782206654548645, + "learning_rate": 3.246224239731765e-05, + "loss": 1.8533, + "step": 20385 + }, + { + "epoch": 6.257213014119092, + "grad_norm": 0.1966158151626587, + "learning_rate": 3.245758771769288e-05, + "loss": 1.648, + "step": 20386 + }, + { + "epoch": 6.257519950890116, + "grad_norm": 0.23248116672039032, + "learning_rate": 3.245293321143249e-05, + "loss": 1.7277, + "step": 20387 + }, + { + "epoch": 6.257826887661142, + "grad_norm": 0.26347780227661133, + "learning_rate": 3.244827887858251e-05, + "loss": 1.7429, + "step": 20388 + }, + { + "epoch": 6.258133824432167, + "grad_norm": 0.20794285833835602, + "learning_rate": 3.244362471918894e-05, + "loss": 1.7358, + "step": 20389 + }, + { + "epoch": 6.258440761203192, + "grad_norm": 0.200898677110672, + "learning_rate": 3.243897073329774e-05, + "loss": 1.6661, + "step": 20390 + }, + { + "epoch": 6.258747697974218, + "grad_norm": 0.20945283770561218, + "learning_rate": 3.2434316920954935e-05, + "loss": 1.7036, + "step": 20391 + }, + { + "epoch": 6.259054634745242, + "grad_norm": 0.3154161274433136, + "learning_rate": 3.242966328220649e-05, + "loss": 1.8174, + "step": 20392 + }, + { + "epoch": 6.259361571516267, + "grad_norm": 0.19321799278259277, + "learning_rate": 3.242500981709843e-05, + "loss": 1.6823, + "step": 20393 + }, + { + "epoch": 6.259668508287293, + "grad_norm": 0.22610130906105042, + "learning_rate": 3.2420356525676696e-05, + "loss": 1.6865, + "step": 20394 + }, + { + "epoch": 
6.259975445058318, + "grad_norm": 0.19190505146980286, + "learning_rate": 3.241570340798734e-05, + "loss": 1.6663, + "step": 20395 + }, + { + "epoch": 6.260282381829343, + "grad_norm": 0.21956418454647064, + "learning_rate": 3.2411050464076276e-05, + "loss": 1.7279, + "step": 20396 + }, + { + "epoch": 6.260589318600369, + "grad_norm": 0.2448553591966629, + "learning_rate": 3.240639769398956e-05, + "loss": 1.7438, + "step": 20397 + }, + { + "epoch": 6.260896255371393, + "grad_norm": 0.19194214046001434, + "learning_rate": 3.2401745097773096e-05, + "loss": 1.7429, + "step": 20398 + }, + { + "epoch": 6.2612031921424185, + "grad_norm": 0.2567521333694458, + "learning_rate": 3.239709267547291e-05, + "loss": 1.7051, + "step": 20399 + }, + { + "epoch": 6.261510128913444, + "grad_norm": 0.18335886299610138, + "learning_rate": 3.239244042713498e-05, + "loss": 1.6828, + "step": 20400 + }, + { + "epoch": 6.261817065684469, + "grad_norm": 0.20112362504005432, + "learning_rate": 3.238778835280527e-05, + "loss": 1.6887, + "step": 20401 + }, + { + "epoch": 6.2621240024554945, + "grad_norm": 0.17095179855823517, + "learning_rate": 3.238313645252975e-05, + "loss": 1.7202, + "step": 20402 + }, + { + "epoch": 6.262430939226519, + "grad_norm": 0.24681979417800903, + "learning_rate": 3.237848472635442e-05, + "loss": 1.7196, + "step": 20403 + }, + { + "epoch": 6.262737875997544, + "grad_norm": 0.2022300660610199, + "learning_rate": 3.237383317432522e-05, + "loss": 1.7265, + "step": 20404 + }, + { + "epoch": 6.26304481276857, + "grad_norm": 0.2900621294975281, + "learning_rate": 3.236918179648813e-05, + "loss": 1.7051, + "step": 20405 + }, + { + "epoch": 6.263351749539595, + "grad_norm": 0.37675586342811584, + "learning_rate": 3.2364530592889135e-05, + "loss": 1.7747, + "step": 20406 + }, + { + "epoch": 6.26365868631062, + "grad_norm": 0.19033703207969666, + "learning_rate": 3.235987956357416e-05, + "loss": 1.7529, + "step": 20407 + }, + { + "epoch": 6.263965623081646, + "grad_norm": 
0.2877013385295868, + "learning_rate": 3.235522870858922e-05, + "loss": 1.6942, + "step": 20408 + }, + { + "epoch": 6.26427255985267, + "grad_norm": 0.22717125713825226, + "learning_rate": 3.235057802798023e-05, + "loss": 1.7302, + "step": 20409 + }, + { + "epoch": 6.264579496623695, + "grad_norm": 0.2571920156478882, + "learning_rate": 3.2345927521793185e-05, + "loss": 1.6782, + "step": 20410 + }, + { + "epoch": 6.264886433394721, + "grad_norm": 0.43085625767707825, + "learning_rate": 3.234127719007403e-05, + "loss": 1.7946, + "step": 20411 + }, + { + "epoch": 6.265193370165746, + "grad_norm": 0.19355928897857666, + "learning_rate": 3.2336627032868726e-05, + "loss": 1.7288, + "step": 20412 + }, + { + "epoch": 6.265500306936771, + "grad_norm": 0.24871474504470825, + "learning_rate": 3.233197705022322e-05, + "loss": 1.6862, + "step": 20413 + }, + { + "epoch": 6.265807243707796, + "grad_norm": 0.26919320225715637, + "learning_rate": 3.232732724218348e-05, + "loss": 1.8061, + "step": 20414 + }, + { + "epoch": 6.266114180478821, + "grad_norm": 0.21714363992214203, + "learning_rate": 3.2322677608795436e-05, + "loss": 1.7036, + "step": 20415 + }, + { + "epoch": 6.2664211172498465, + "grad_norm": 0.24496719241142273, + "learning_rate": 3.231802815010506e-05, + "loss": 1.7334, + "step": 20416 + }, + { + "epoch": 6.266728054020872, + "grad_norm": 0.22501519322395325, + "learning_rate": 3.231337886615831e-05, + "loss": 1.7545, + "step": 20417 + }, + { + "epoch": 6.267034990791897, + "grad_norm": 0.2683655917644501, + "learning_rate": 3.23087297570011e-05, + "loss": 1.7235, + "step": 20418 + }, + { + "epoch": 6.267341927562922, + "grad_norm": 0.23341359198093414, + "learning_rate": 3.230408082267938e-05, + "loss": 1.7389, + "step": 20419 + }, + { + "epoch": 6.267648864333947, + "grad_norm": 0.2914128601551056, + "learning_rate": 3.229943206323913e-05, + "loss": 1.7223, + "step": 20420 + }, + { + "epoch": 6.267955801104972, + "grad_norm": 0.2072528451681137, + "learning_rate": 
3.229478347872625e-05, + "loss": 1.7422, + "step": 20421 + }, + { + "epoch": 6.268262737875998, + "grad_norm": 0.22678662836551666, + "learning_rate": 3.229013506918671e-05, + "loss": 1.6973, + "step": 20422 + }, + { + "epoch": 6.268569674647023, + "grad_norm": 0.1928883194923401, + "learning_rate": 3.228548683466643e-05, + "loss": 1.7235, + "step": 20423 + }, + { + "epoch": 6.268876611418047, + "grad_norm": 0.2402963638305664, + "learning_rate": 3.2280838775211345e-05, + "loss": 1.7587, + "step": 20424 + }, + { + "epoch": 6.269183548189073, + "grad_norm": 0.20416294038295746, + "learning_rate": 3.227619089086742e-05, + "loss": 1.7591, + "step": 20425 + }, + { + "epoch": 6.269490484960098, + "grad_norm": 0.20308947563171387, + "learning_rate": 3.227154318168053e-05, + "loss": 1.7264, + "step": 20426 + }, + { + "epoch": 6.269797421731123, + "grad_norm": 0.18733863532543182, + "learning_rate": 3.226689564769667e-05, + "loss": 1.6943, + "step": 20427 + }, + { + "epoch": 6.270104358502149, + "grad_norm": 0.183793842792511, + "learning_rate": 3.226224828896173e-05, + "loss": 1.7082, + "step": 20428 + }, + { + "epoch": 6.270411295273174, + "grad_norm": 0.20471547544002533, + "learning_rate": 3.225760110552165e-05, + "loss": 1.7352, + "step": 20429 + }, + { + "epoch": 6.2707182320441985, + "grad_norm": 0.23386713862419128, + "learning_rate": 3.225295409742234e-05, + "loss": 1.7666, + "step": 20430 + }, + { + "epoch": 6.271025168815224, + "grad_norm": 0.2024994194507599, + "learning_rate": 3.224830726470976e-05, + "loss": 1.6573, + "step": 20431 + }, + { + "epoch": 6.271332105586249, + "grad_norm": 0.2352776825428009, + "learning_rate": 3.2243660607429805e-05, + "loss": 1.7884, + "step": 20432 + }, + { + "epoch": 6.2716390423572745, + "grad_norm": 0.19755585491657257, + "learning_rate": 3.223901412562841e-05, + "loss": 1.6964, + "step": 20433 + }, + { + "epoch": 6.2719459791283, + "grad_norm": 0.25833839178085327, + "learning_rate": 3.223436781935148e-05, + "loss": 1.715, 
+ "step": 20434 + }, + { + "epoch": 6.272252915899324, + "grad_norm": 0.2110220193862915, + "learning_rate": 3.222972168864493e-05, + "loss": 1.7617, + "step": 20435 + }, + { + "epoch": 6.27255985267035, + "grad_norm": 0.23262515664100647, + "learning_rate": 3.2225075733554685e-05, + "loss": 1.7616, + "step": 20436 + }, + { + "epoch": 6.272866789441375, + "grad_norm": 0.1926576942205429, + "learning_rate": 3.222042995412669e-05, + "loss": 1.6956, + "step": 20437 + }, + { + "epoch": 6.2731737262124, + "grad_norm": 0.20662757754325867, + "learning_rate": 3.22157843504068e-05, + "loss": 1.703, + "step": 20438 + }, + { + "epoch": 6.273480662983426, + "grad_norm": 0.22137406468391418, + "learning_rate": 3.2211138922440975e-05, + "loss": 1.6961, + "step": 20439 + }, + { + "epoch": 6.273787599754451, + "grad_norm": 0.25777003169059753, + "learning_rate": 3.2206493670275086e-05, + "loss": 1.704, + "step": 20440 + }, + { + "epoch": 6.274094536525475, + "grad_norm": 0.20540094375610352, + "learning_rate": 3.2201848593955046e-05, + "loss": 1.6759, + "step": 20441 + }, + { + "epoch": 6.274401473296501, + "grad_norm": 0.2447255402803421, + "learning_rate": 3.21972036935268e-05, + "loss": 1.7379, + "step": 20442 + }, + { + "epoch": 6.274708410067526, + "grad_norm": 0.2017194777727127, + "learning_rate": 3.219255896903619e-05, + "loss": 1.6518, + "step": 20443 + }, + { + "epoch": 6.2750153468385514, + "grad_norm": 0.22742003202438354, + "learning_rate": 3.2187914420529174e-05, + "loss": 1.7568, + "step": 20444 + }, + { + "epoch": 6.275322283609577, + "grad_norm": 0.2065356969833374, + "learning_rate": 3.218327004805161e-05, + "loss": 1.643, + "step": 20445 + }, + { + "epoch": 6.275629220380601, + "grad_norm": 0.18083053827285767, + "learning_rate": 3.217862585164942e-05, + "loss": 1.77, + "step": 20446 + }, + { + "epoch": 6.275936157151627, + "grad_norm": 0.2175968736410141, + "learning_rate": 3.2173981831368484e-05, + "loss": 1.738, + "step": 20447 + }, + { + "epoch": 
6.276243093922652, + "grad_norm": 0.17635080218315125, + "learning_rate": 3.216933798725473e-05, + "loss": 1.7109, + "step": 20448 + }, + { + "epoch": 6.276550030693677, + "grad_norm": 0.22289423644542694, + "learning_rate": 3.216469431935401e-05, + "loss": 1.7853, + "step": 20449 + }, + { + "epoch": 6.276856967464703, + "grad_norm": 0.21214549243450165, + "learning_rate": 3.216005082771225e-05, + "loss": 1.8196, + "step": 20450 + }, + { + "epoch": 6.277163904235728, + "grad_norm": 0.21992212533950806, + "learning_rate": 3.215540751237531e-05, + "loss": 1.7445, + "step": 20451 + }, + { + "epoch": 6.277470841006752, + "grad_norm": 0.16256563365459442, + "learning_rate": 3.2150764373389096e-05, + "loss": 1.6582, + "step": 20452 + }, + { + "epoch": 6.277777777777778, + "grad_norm": 0.1885976791381836, + "learning_rate": 3.214612141079949e-05, + "loss": 1.7491, + "step": 20453 + }, + { + "epoch": 6.278084714548803, + "grad_norm": 0.24101774394512177, + "learning_rate": 3.2141478624652386e-05, + "loss": 1.7476, + "step": 20454 + }, + { + "epoch": 6.278391651319828, + "grad_norm": 0.23378998041152954, + "learning_rate": 3.213683601499364e-05, + "loss": 1.7575, + "step": 20455 + }, + { + "epoch": 6.278698588090854, + "grad_norm": 0.2032867670059204, + "learning_rate": 3.213219358186917e-05, + "loss": 1.6999, + "step": 20456 + }, + { + "epoch": 6.279005524861878, + "grad_norm": 0.21332181990146637, + "learning_rate": 3.2127551325324836e-05, + "loss": 1.6634, + "step": 20457 + }, + { + "epoch": 6.2793124616329035, + "grad_norm": 0.23767098784446716, + "learning_rate": 3.2122909245406494e-05, + "loss": 1.8023, + "step": 20458 + }, + { + "epoch": 6.279619398403929, + "grad_norm": 0.19987638294696808, + "learning_rate": 3.211826734216007e-05, + "loss": 1.6848, + "step": 20459 + }, + { + "epoch": 6.279926335174954, + "grad_norm": 0.22169579565525055, + "learning_rate": 3.2113625615631385e-05, + "loss": 1.7599, + "step": 20460 + }, + { + "epoch": 6.2802332719459795, + 
"grad_norm": 0.1768191009759903, + "learning_rate": 3.210898406586634e-05, + "loss": 1.6894, + "step": 20461 + }, + { + "epoch": 6.280540208717004, + "grad_norm": 0.1923041045665741, + "learning_rate": 3.21043426929108e-05, + "loss": 1.7379, + "step": 20462 + }, + { + "epoch": 6.280847145488029, + "grad_norm": 0.1836252212524414, + "learning_rate": 3.2099701496810644e-05, + "loss": 1.6748, + "step": 20463 + }, + { + "epoch": 6.281154082259055, + "grad_norm": 0.2203192561864853, + "learning_rate": 3.2095060477611705e-05, + "loss": 1.6969, + "step": 20464 + }, + { + "epoch": 6.28146101903008, + "grad_norm": 0.25511759519577026, + "learning_rate": 3.20904196353599e-05, + "loss": 1.7806, + "step": 20465 + }, + { + "epoch": 6.281767955801105, + "grad_norm": 0.19464822113513947, + "learning_rate": 3.208577897010106e-05, + "loss": 1.6784, + "step": 20466 + }, + { + "epoch": 6.28207489257213, + "grad_norm": 0.1949714869260788, + "learning_rate": 3.208113848188105e-05, + "loss": 1.713, + "step": 20467 + }, + { + "epoch": 6.282381829343155, + "grad_norm": 0.22094127535820007, + "learning_rate": 3.207649817074572e-05, + "loss": 1.7397, + "step": 20468 + }, + { + "epoch": 6.28268876611418, + "grad_norm": 0.22343899309635162, + "learning_rate": 3.2071858036740954e-05, + "loss": 1.717, + "step": 20469 + }, + { + "epoch": 6.282995702885206, + "grad_norm": 0.20854893326759338, + "learning_rate": 3.2067218079912584e-05, + "loss": 1.7255, + "step": 20470 + }, + { + "epoch": 6.283302639656231, + "grad_norm": 0.21306286752223969, + "learning_rate": 3.206257830030649e-05, + "loss": 1.7251, + "step": 20471 + }, + { + "epoch": 6.283609576427256, + "grad_norm": 0.24995777010917664, + "learning_rate": 3.20579386979685e-05, + "loss": 1.7892, + "step": 20472 + }, + { + "epoch": 6.283916513198281, + "grad_norm": 0.23720023036003113, + "learning_rate": 3.2053299272944486e-05, + "loss": 1.7843, + "step": 20473 + }, + { + "epoch": 6.284223449969306, + "grad_norm": 0.2042113095521927, + 
"learning_rate": 3.204866002528029e-05, + "loss": 1.7318, + "step": 20474 + }, + { + "epoch": 6.2845303867403315, + "grad_norm": 0.22996367514133453, + "learning_rate": 3.2044020955021735e-05, + "loss": 1.6875, + "step": 20475 + }, + { + "epoch": 6.284837323511357, + "grad_norm": 0.187749981880188, + "learning_rate": 3.203938206221471e-05, + "loss": 1.7297, + "step": 20476 + }, + { + "epoch": 6.285144260282382, + "grad_norm": 0.18279509246349335, + "learning_rate": 3.2034743346905025e-05, + "loss": 1.6858, + "step": 20477 + }, + { + "epoch": 6.285451197053407, + "grad_norm": 0.1871512532234192, + "learning_rate": 3.203010480913855e-05, + "loss": 1.7224, + "step": 20478 + }, + { + "epoch": 6.285758133824432, + "grad_norm": 0.17732922732830048, + "learning_rate": 3.202546644896109e-05, + "loss": 1.6872, + "step": 20479 + }, + { + "epoch": 6.286065070595457, + "grad_norm": 0.21146097779273987, + "learning_rate": 3.2020828266418527e-05, + "loss": 1.797, + "step": 20480 + }, + { + "epoch": 6.286372007366483, + "grad_norm": 0.18914340436458588, + "learning_rate": 3.201619026155666e-05, + "loss": 1.7149, + "step": 20481 + }, + { + "epoch": 6.286678944137508, + "grad_norm": 0.20919133722782135, + "learning_rate": 3.2011552434421364e-05, + "loss": 1.7803, + "step": 20482 + }, + { + "epoch": 6.286985880908533, + "grad_norm": 0.17882505059242249, + "learning_rate": 3.200691478505843e-05, + "loss": 1.757, + "step": 20483 + }, + { + "epoch": 6.287292817679558, + "grad_norm": 0.1850014477968216, + "learning_rate": 3.200227731351373e-05, + "loss": 1.7006, + "step": 20484 + }, + { + "epoch": 6.287599754450583, + "grad_norm": 0.19999323785305023, + "learning_rate": 3.1997640019833056e-05, + "loss": 1.702, + "step": 20485 + }, + { + "epoch": 6.287906691221608, + "grad_norm": 0.20464713871479034, + "learning_rate": 3.1993002904062255e-05, + "loss": 1.7272, + "step": 20486 + }, + { + "epoch": 6.288213627992634, + "grad_norm": 0.2105564922094345, + "learning_rate": 
3.1988365966247154e-05, + "loss": 1.7062, + "step": 20487 + }, + { + "epoch": 6.288520564763659, + "grad_norm": 0.26322871446609497, + "learning_rate": 3.198372920643359e-05, + "loss": 1.7309, + "step": 20488 + }, + { + "epoch": 6.2888275015346835, + "grad_norm": 0.22787201404571533, + "learning_rate": 3.197909262466736e-05, + "loss": 1.7797, + "step": 20489 + }, + { + "epoch": 6.289134438305709, + "grad_norm": 0.21409621834754944, + "learning_rate": 3.1974456220994314e-05, + "loss": 1.8211, + "step": 20490 + }, + { + "epoch": 6.289441375076734, + "grad_norm": 0.2241450846195221, + "learning_rate": 3.196981999546025e-05, + "loss": 1.7255, + "step": 20491 + }, + { + "epoch": 6.2897483118477595, + "grad_norm": 0.23141883313655853, + "learning_rate": 3.1965183948110985e-05, + "loss": 1.7695, + "step": 20492 + }, + { + "epoch": 6.290055248618785, + "grad_norm": 0.209358349442482, + "learning_rate": 3.196054807899236e-05, + "loss": 1.6808, + "step": 20493 + }, + { + "epoch": 6.290362185389809, + "grad_norm": 0.20730538666248322, + "learning_rate": 3.195591238815015e-05, + "loss": 1.6847, + "step": 20494 + }, + { + "epoch": 6.290669122160835, + "grad_norm": 0.2568998634815216, + "learning_rate": 3.195127687563021e-05, + "loss": 1.664, + "step": 20495 + }, + { + "epoch": 6.29097605893186, + "grad_norm": 0.238932803273201, + "learning_rate": 3.1946641541478316e-05, + "loss": 1.7166, + "step": 20496 + }, + { + "epoch": 6.291282995702885, + "grad_norm": 0.235393688082695, + "learning_rate": 3.19420063857403e-05, + "loss": 1.6572, + "step": 20497 + }, + { + "epoch": 6.291589932473911, + "grad_norm": 0.2888807952404022, + "learning_rate": 3.1937371408461944e-05, + "loss": 1.7484, + "step": 20498 + }, + { + "epoch": 6.291896869244935, + "grad_norm": 0.18588709831237793, + "learning_rate": 3.1932736609689096e-05, + "loss": 1.7027, + "step": 20499 + }, + { + "epoch": 6.29220380601596, + "grad_norm": 0.3065604865550995, + "learning_rate": 3.1928101989467514e-05, + "loss": 1.8051, 
+ "step": 20500 + }, + { + "epoch": 6.292510742786986, + "grad_norm": 0.2480497658252716, + "learning_rate": 3.192346754784304e-05, + "loss": 1.7749, + "step": 20501 + }, + { + "epoch": 6.292817679558011, + "grad_norm": 0.268686443567276, + "learning_rate": 3.1918833284861436e-05, + "loss": 1.7062, + "step": 20502 + }, + { + "epoch": 6.293124616329036, + "grad_norm": 0.337510883808136, + "learning_rate": 3.191419920056853e-05, + "loss": 1.745, + "step": 20503 + }, + { + "epoch": 6.293431553100062, + "grad_norm": 0.18532821536064148, + "learning_rate": 3.190956529501009e-05, + "loss": 1.7098, + "step": 20504 + }, + { + "epoch": 6.293738489871086, + "grad_norm": 0.27805468440055847, + "learning_rate": 3.1904931568231956e-05, + "loss": 1.7252, + "step": 20505 + }, + { + "epoch": 6.2940454266421115, + "grad_norm": 0.22137443721294403, + "learning_rate": 3.190029802027987e-05, + "loss": 1.7595, + "step": 20506 + }, + { + "epoch": 6.294352363413137, + "grad_norm": 0.23159445822238922, + "learning_rate": 3.189566465119968e-05, + "loss": 1.7503, + "step": 20507 + }, + { + "epoch": 6.294659300184162, + "grad_norm": 0.2089100182056427, + "learning_rate": 3.189103146103712e-05, + "loss": 1.7021, + "step": 20508 + }, + { + "epoch": 6.2949662369551875, + "grad_norm": 0.1985119879245758, + "learning_rate": 3.1886398449838e-05, + "loss": 1.7468, + "step": 20509 + }, + { + "epoch": 6.295273173726212, + "grad_norm": 0.18612028658390045, + "learning_rate": 3.188176561764812e-05, + "loss": 1.6657, + "step": 20510 + }, + { + "epoch": 6.295580110497237, + "grad_norm": 0.22453728318214417, + "learning_rate": 3.1877132964513226e-05, + "loss": 1.7223, + "step": 20511 + }, + { + "epoch": 6.295887047268263, + "grad_norm": 0.270304799079895, + "learning_rate": 3.187250049047916e-05, + "loss": 1.7548, + "step": 20512 + }, + { + "epoch": 6.296193984039288, + "grad_norm": 0.19762152433395386, + "learning_rate": 3.1867868195591643e-05, + "loss": 1.6945, + "step": 20513 + }, + { + "epoch": 
6.296500920810313, + "grad_norm": 0.25173795223236084, + "learning_rate": 3.1863236079896486e-05, + "loss": 1.7303, + "step": 20514 + }, + { + "epoch": 6.296807857581339, + "grad_norm": 0.2073308676481247, + "learning_rate": 3.185860414343945e-05, + "loss": 1.7327, + "step": 20515 + }, + { + "epoch": 6.297114794352363, + "grad_norm": 0.24174070358276367, + "learning_rate": 3.185397238626635e-05, + "loss": 1.7577, + "step": 20516 + }, + { + "epoch": 6.297421731123388, + "grad_norm": 0.1950366348028183, + "learning_rate": 3.1849340808422905e-05, + "loss": 1.7137, + "step": 20517 + }, + { + "epoch": 6.297728667894414, + "grad_norm": 0.23416653275489807, + "learning_rate": 3.1844709409954936e-05, + "loss": 1.7547, + "step": 20518 + }, + { + "epoch": 6.298035604665439, + "grad_norm": 0.1939592808485031, + "learning_rate": 3.184007819090817e-05, + "loss": 1.7215, + "step": 20519 + }, + { + "epoch": 6.298342541436464, + "grad_norm": 0.21807245910167694, + "learning_rate": 3.1835447151328405e-05, + "loss": 1.7021, + "step": 20520 + }, + { + "epoch": 6.298649478207489, + "grad_norm": 0.21653762459754944, + "learning_rate": 3.183081629126138e-05, + "loss": 1.7426, + "step": 20521 + }, + { + "epoch": 6.298956414978514, + "grad_norm": 0.20749153196811676, + "learning_rate": 3.18261856107529e-05, + "loss": 1.7302, + "step": 20522 + }, + { + "epoch": 6.2992633517495396, + "grad_norm": 0.23450545966625214, + "learning_rate": 3.182155510984869e-05, + "loss": 1.7414, + "step": 20523 + }, + { + "epoch": 6.299570288520565, + "grad_norm": 0.17081578075885773, + "learning_rate": 3.181692478859455e-05, + "loss": 1.7017, + "step": 20524 + }, + { + "epoch": 6.29987722529159, + "grad_norm": 0.20244698226451874, + "learning_rate": 3.18122946470362e-05, + "loss": 1.6765, + "step": 20525 + }, + { + "epoch": 6.300184162062616, + "grad_norm": 0.20153406262397766, + "learning_rate": 3.180766468521941e-05, + "loss": 1.7437, + "step": 20526 + }, + { + "epoch": 6.30049109883364, + "grad_norm": 
0.21135647594928741, + "learning_rate": 3.180303490318996e-05, + "loss": 1.7202, + "step": 20527 + }, + { + "epoch": 6.300798035604665, + "grad_norm": 0.20342735946178436, + "learning_rate": 3.1798405300993555e-05, + "loss": 1.7268, + "step": 20528 + }, + { + "epoch": 6.301104972375691, + "grad_norm": 0.21153734624385834, + "learning_rate": 3.1793775878676e-05, + "loss": 1.7455, + "step": 20529 + }, + { + "epoch": 6.301411909146716, + "grad_norm": 0.2197744995355606, + "learning_rate": 3.1789146636283015e-05, + "loss": 1.7876, + "step": 20530 + }, + { + "epoch": 6.301718845917741, + "grad_norm": 0.2236124575138092, + "learning_rate": 3.1784517573860356e-05, + "loss": 1.7454, + "step": 20531 + }, + { + "epoch": 6.302025782688766, + "grad_norm": 0.22071333229541779, + "learning_rate": 3.177988869145376e-05, + "loss": 1.7197, + "step": 20532 + }, + { + "epoch": 6.302332719459791, + "grad_norm": 0.20137591660022736, + "learning_rate": 3.177525998910901e-05, + "loss": 1.7153, + "step": 20533 + }, + { + "epoch": 6.3026396562308165, + "grad_norm": 0.18981720507144928, + "learning_rate": 3.17706314668718e-05, + "loss": 1.6948, + "step": 20534 + }, + { + "epoch": 6.302946593001842, + "grad_norm": 0.20803335309028625, + "learning_rate": 3.176600312478791e-05, + "loss": 1.7454, + "step": 20535 + }, + { + "epoch": 6.303253529772867, + "grad_norm": 0.2224191278219223, + "learning_rate": 3.176137496290305e-05, + "loss": 1.708, + "step": 20536 + }, + { + "epoch": 6.303560466543892, + "grad_norm": 0.21110501885414124, + "learning_rate": 3.175674698126298e-05, + "loss": 1.6976, + "step": 20537 + }, + { + "epoch": 6.303867403314917, + "grad_norm": 0.19902437925338745, + "learning_rate": 3.175211917991342e-05, + "loss": 1.7246, + "step": 20538 + }, + { + "epoch": 6.304174340085942, + "grad_norm": 0.1930927336215973, + "learning_rate": 3.174749155890013e-05, + "loss": 1.7849, + "step": 20539 + }, + { + "epoch": 6.304481276856968, + "grad_norm": 0.19350691139698029, + "learning_rate": 
3.174286411826881e-05, + "loss": 1.7441, + "step": 20540 + }, + { + "epoch": 6.304788213627993, + "grad_norm": 0.18532924354076385, + "learning_rate": 3.173823685806523e-05, + "loss": 1.6675, + "step": 20541 + }, + { + "epoch": 6.305095150399017, + "grad_norm": 0.18890263140201569, + "learning_rate": 3.173360977833508e-05, + "loss": 1.7889, + "step": 20542 + }, + { + "epoch": 6.305402087170043, + "grad_norm": 0.20418904721736908, + "learning_rate": 3.17289828791241e-05, + "loss": 1.8298, + "step": 20543 + }, + { + "epoch": 6.305709023941068, + "grad_norm": 0.2298857718706131, + "learning_rate": 3.172435616047804e-05, + "loss": 1.7889, + "step": 20544 + }, + { + "epoch": 6.306015960712093, + "grad_norm": 0.20661889016628265, + "learning_rate": 3.171972962244258e-05, + "loss": 1.74, + "step": 20545 + }, + { + "epoch": 6.306322897483119, + "grad_norm": 0.17712774872779846, + "learning_rate": 3.1715103265063496e-05, + "loss": 1.72, + "step": 20546 + }, + { + "epoch": 6.306629834254144, + "grad_norm": 0.16776354610919952, + "learning_rate": 3.1710477088386456e-05, + "loss": 1.6715, + "step": 20547 + }, + { + "epoch": 6.3069367710251685, + "grad_norm": 0.21919682621955872, + "learning_rate": 3.170585109245721e-05, + "loss": 1.7232, + "step": 20548 + }, + { + "epoch": 6.307243707796194, + "grad_norm": 0.2026829719543457, + "learning_rate": 3.170122527732144e-05, + "loss": 1.7551, + "step": 20549 + }, + { + "epoch": 6.307550644567219, + "grad_norm": 0.18783780932426453, + "learning_rate": 3.169659964302493e-05, + "loss": 1.7024, + "step": 20550 + }, + { + "epoch": 6.3078575813382445, + "grad_norm": 0.2058420479297638, + "learning_rate": 3.1691974189613316e-05, + "loss": 1.7006, + "step": 20551 + }, + { + "epoch": 6.30816451810927, + "grad_norm": 0.21351832151412964, + "learning_rate": 3.168734891713237e-05, + "loss": 1.7586, + "step": 20552 + }, + { + "epoch": 6.308471454880294, + "grad_norm": 0.19816654920578003, + "learning_rate": 3.168272382562776e-05, + "loss": 1.7532, 
+ "step": 20553 + }, + { + "epoch": 6.30877839165132, + "grad_norm": 0.18253186345100403, + "learning_rate": 3.16780989151452e-05, + "loss": 1.7413, + "step": 20554 + }, + { + "epoch": 6.309085328422345, + "grad_norm": 0.23097483813762665, + "learning_rate": 3.167347418573042e-05, + "loss": 1.7355, + "step": 20555 + }, + { + "epoch": 6.30939226519337, + "grad_norm": 0.1984725296497345, + "learning_rate": 3.166884963742911e-05, + "loss": 1.6754, + "step": 20556 + }, + { + "epoch": 6.309699201964396, + "grad_norm": 0.2385166734457016, + "learning_rate": 3.166422527028696e-05, + "loss": 1.7322, + "step": 20557 + }, + { + "epoch": 6.310006138735421, + "grad_norm": 0.23216524720191956, + "learning_rate": 3.165960108434971e-05, + "loss": 1.7426, + "step": 20558 + }, + { + "epoch": 6.310313075506445, + "grad_norm": 0.22017790377140045, + "learning_rate": 3.165497707966301e-05, + "loss": 1.6977, + "step": 20559 + }, + { + "epoch": 6.310620012277471, + "grad_norm": 0.2934584617614746, + "learning_rate": 3.165035325627257e-05, + "loss": 1.7252, + "step": 20560 + }, + { + "epoch": 6.310926949048496, + "grad_norm": 0.21830198168754578, + "learning_rate": 3.1645729614224126e-05, + "loss": 1.781, + "step": 20561 + }, + { + "epoch": 6.311233885819521, + "grad_norm": 0.3082836866378784, + "learning_rate": 3.1641106153563306e-05, + "loss": 1.8015, + "step": 20562 + }, + { + "epoch": 6.311540822590547, + "grad_norm": 0.22441358864307404, + "learning_rate": 3.163648287433586e-05, + "loss": 1.8058, + "step": 20563 + }, + { + "epoch": 6.311847759361571, + "grad_norm": 0.36623889207839966, + "learning_rate": 3.163185977658744e-05, + "loss": 1.7092, + "step": 20564 + }, + { + "epoch": 6.3121546961325965, + "grad_norm": 0.22231145203113556, + "learning_rate": 3.1627236860363755e-05, + "loss": 1.6432, + "step": 20565 + }, + { + "epoch": 6.312461632903622, + "grad_norm": 0.25871971249580383, + "learning_rate": 3.162261412571047e-05, + "loss": 1.7156, + "step": 20566 + }, + { + "epoch": 
6.312768569674647, + "grad_norm": 0.24574241042137146, + "learning_rate": 3.16179915726733e-05, + "loss": 1.7977, + "step": 20567 + }, + { + "epoch": 6.3130755064456725, + "grad_norm": 0.197379007935524, + "learning_rate": 3.1613369201297895e-05, + "loss": 1.6966, + "step": 20568 + }, + { + "epoch": 6.313382443216697, + "grad_norm": 0.2149469256401062, + "learning_rate": 3.1608747011629975e-05, + "loss": 1.7385, + "step": 20569 + }, + { + "epoch": 6.313689379987722, + "grad_norm": 0.21942345798015594, + "learning_rate": 3.1604125003715174e-05, + "loss": 1.7369, + "step": 20570 + }, + { + "epoch": 6.313996316758748, + "grad_norm": 0.20977036654949188, + "learning_rate": 3.1599503177599197e-05, + "loss": 1.7429, + "step": 20571 + }, + { + "epoch": 6.314303253529773, + "grad_norm": 0.20113405585289001, + "learning_rate": 3.159488153332772e-05, + "loss": 1.7163, + "step": 20572 + }, + { + "epoch": 6.314610190300798, + "grad_norm": 0.22031868994235992, + "learning_rate": 3.1590260070946414e-05, + "loss": 1.7085, + "step": 20573 + }, + { + "epoch": 6.314917127071823, + "grad_norm": 0.24137777090072632, + "learning_rate": 3.158563879050094e-05, + "loss": 1.7169, + "step": 20574 + }, + { + "epoch": 6.315224063842848, + "grad_norm": 0.20265905559062958, + "learning_rate": 3.1581017692036985e-05, + "loss": 1.7466, + "step": 20575 + }, + { + "epoch": 6.315531000613873, + "grad_norm": 0.2997782528400421, + "learning_rate": 3.1576396775600206e-05, + "loss": 1.7287, + "step": 20576 + }, + { + "epoch": 6.315837937384899, + "grad_norm": 0.19672340154647827, + "learning_rate": 3.157177604123628e-05, + "loss": 1.7121, + "step": 20577 + }, + { + "epoch": 6.316144874155924, + "grad_norm": 0.26618507504463196, + "learning_rate": 3.156715548899085e-05, + "loss": 1.6958, + "step": 20578 + }, + { + "epoch": 6.316451810926949, + "grad_norm": 0.18854503333568573, + "learning_rate": 3.156253511890959e-05, + "loss": 1.7751, + "step": 20579 + }, + { + "epoch": 6.316758747697974, + "grad_norm": 
0.2306061089038849, + "learning_rate": 3.155791493103819e-05, + "loss": 1.6853, + "step": 20580 + }, + { + "epoch": 6.317065684468999, + "grad_norm": 0.20650778710842133, + "learning_rate": 3.1553294925422254e-05, + "loss": 1.7021, + "step": 20581 + }, + { + "epoch": 6.3173726212400245, + "grad_norm": 0.19474658370018005, + "learning_rate": 3.1548675102107494e-05, + "loss": 1.7146, + "step": 20582 + }, + { + "epoch": 6.31767955801105, + "grad_norm": 0.2150747925043106, + "learning_rate": 3.154405546113952e-05, + "loss": 1.7473, + "step": 20583 + }, + { + "epoch": 6.317986494782075, + "grad_norm": 0.19304975867271423, + "learning_rate": 3.153943600256402e-05, + "loss": 1.7209, + "step": 20584 + }, + { + "epoch": 6.3182934315531, + "grad_norm": 0.22610948979854584, + "learning_rate": 3.153481672642662e-05, + "loss": 1.717, + "step": 20585 + }, + { + "epoch": 6.318600368324125, + "grad_norm": 0.18705105781555176, + "learning_rate": 3.1530197632773006e-05, + "loss": 1.7326, + "step": 20586 + }, + { + "epoch": 6.31890730509515, + "grad_norm": 0.25632867217063904, + "learning_rate": 3.152557872164878e-05, + "loss": 1.7391, + "step": 20587 + }, + { + "epoch": 6.319214241866176, + "grad_norm": 0.18723119795322418, + "learning_rate": 3.152095999309964e-05, + "loss": 1.7193, + "step": 20588 + }, + { + "epoch": 6.319521178637201, + "grad_norm": 0.1759091317653656, + "learning_rate": 3.1516341447171184e-05, + "loss": 1.7024, + "step": 20589 + }, + { + "epoch": 6.319828115408226, + "grad_norm": 0.1838626265525818, + "learning_rate": 3.1511723083909084e-05, + "loss": 1.7027, + "step": 20590 + }, + { + "epoch": 6.320135052179251, + "grad_norm": 0.2615656554698944, + "learning_rate": 3.1507104903358964e-05, + "loss": 1.7798, + "step": 20591 + }, + { + "epoch": 6.320441988950276, + "grad_norm": 0.18816477060317993, + "learning_rate": 3.150248690556649e-05, + "loss": 1.6778, + "step": 20592 + }, + { + "epoch": 6.320748925721301, + "grad_norm": 0.20011866092681885, + "learning_rate": 
3.149786909057728e-05, + "loss": 1.6653, + "step": 20593 + }, + { + "epoch": 6.321055862492327, + "grad_norm": 0.26681140065193176, + "learning_rate": 3.149325145843696e-05, + "loss": 1.7523, + "step": 20594 + }, + { + "epoch": 6.321362799263352, + "grad_norm": 0.2062411904335022, + "learning_rate": 3.1488634009191177e-05, + "loss": 1.7584, + "step": 20595 + }, + { + "epoch": 6.3216697360343765, + "grad_norm": 0.22355243563652039, + "learning_rate": 3.148401674288556e-05, + "loss": 1.7106, + "step": 20596 + }, + { + "epoch": 6.321976672805402, + "grad_norm": 0.20189255475997925, + "learning_rate": 3.147939965956576e-05, + "loss": 1.6775, + "step": 20597 + }, + { + "epoch": 6.322283609576427, + "grad_norm": 0.23753875494003296, + "learning_rate": 3.147478275927736e-05, + "loss": 1.7661, + "step": 20598 + }, + { + "epoch": 6.3225905463474525, + "grad_norm": 0.18658648431301117, + "learning_rate": 3.147016604206604e-05, + "loss": 1.7562, + "step": 20599 + }, + { + "epoch": 6.322897483118478, + "grad_norm": 0.2610020637512207, + "learning_rate": 3.146554950797738e-05, + "loss": 1.7217, + "step": 20600 + }, + { + "epoch": 6.323204419889503, + "grad_norm": 0.18329289555549622, + "learning_rate": 3.146093315705704e-05, + "loss": 1.7206, + "step": 20601 + }, + { + "epoch": 6.323511356660528, + "grad_norm": 0.2393725961446762, + "learning_rate": 3.1456316989350606e-05, + "loss": 1.7646, + "step": 20602 + }, + { + "epoch": 6.323818293431553, + "grad_norm": 0.23535947501659393, + "learning_rate": 3.1451701004903736e-05, + "loss": 1.7718, + "step": 20603 + }, + { + "epoch": 6.324125230202578, + "grad_norm": 0.23179253935813904, + "learning_rate": 3.1447085203762014e-05, + "loss": 1.7311, + "step": 20604 + }, + { + "epoch": 6.324432166973604, + "grad_norm": 0.24929681420326233, + "learning_rate": 3.144246958597109e-05, + "loss": 1.7728, + "step": 20605 + }, + { + "epoch": 6.324739103744629, + "grad_norm": 0.22520960867404938, + "learning_rate": 3.1437854151576526e-05, + "loss": 
1.749, + "step": 20606 + }, + { + "epoch": 6.3250460405156534, + "grad_norm": 0.3005391061306, + "learning_rate": 3.1433238900623997e-05, + "loss": 1.7725, + "step": 20607 + }, + { + "epoch": 6.325352977286679, + "grad_norm": 0.22625432908535004, + "learning_rate": 3.142862383315908e-05, + "loss": 1.7083, + "step": 20608 + }, + { + "epoch": 6.325659914057704, + "grad_norm": 0.28015029430389404, + "learning_rate": 3.142400894922737e-05, + "loss": 1.6862, + "step": 20609 + }, + { + "epoch": 6.3259668508287294, + "grad_norm": 0.2520587146282196, + "learning_rate": 3.141939424887451e-05, + "loss": 1.7059, + "step": 20610 + }, + { + "epoch": 6.326273787599755, + "grad_norm": 0.24668551981449127, + "learning_rate": 3.141477973214607e-05, + "loss": 1.6858, + "step": 20611 + }, + { + "epoch": 6.326580724370779, + "grad_norm": 0.2524704337120056, + "learning_rate": 3.1410165399087675e-05, + "loss": 1.6884, + "step": 20612 + }, + { + "epoch": 6.326887661141805, + "grad_norm": 0.18849264085292816, + "learning_rate": 3.1405551249744916e-05, + "loss": 1.6984, + "step": 20613 + }, + { + "epoch": 6.32719459791283, + "grad_norm": 0.2411552518606186, + "learning_rate": 3.140093728416342e-05, + "loss": 1.7455, + "step": 20614 + }, + { + "epoch": 6.327501534683855, + "grad_norm": 0.2268913835287094, + "learning_rate": 3.139632350238874e-05, + "loss": 1.7124, + "step": 20615 + }, + { + "epoch": 6.327808471454881, + "grad_norm": 0.3118770718574524, + "learning_rate": 3.1391709904466515e-05, + "loss": 1.7322, + "step": 20616 + }, + { + "epoch": 6.328115408225905, + "grad_norm": 0.25166428089141846, + "learning_rate": 3.1387096490442294e-05, + "loss": 1.7136, + "step": 20617 + }, + { + "epoch": 6.32842234499693, + "grad_norm": 0.2733297049999237, + "learning_rate": 3.138248326036172e-05, + "loss": 1.7939, + "step": 20618 + }, + { + "epoch": 6.328729281767956, + "grad_norm": 0.24583236873149872, + "learning_rate": 3.1377870214270334e-05, + "loss": 1.7105, + "step": 20619 + }, + { + 
"epoch": 6.329036218538981, + "grad_norm": 0.2533528506755829, + "learning_rate": 3.137325735221377e-05, + "loss": 1.7828, + "step": 20620 + }, + { + "epoch": 6.329343155310006, + "grad_norm": 0.27662715315818787, + "learning_rate": 3.136864467423758e-05, + "loss": 1.6969, + "step": 20621 + }, + { + "epoch": 6.329650092081032, + "grad_norm": 0.20107655227184296, + "learning_rate": 3.136403218038738e-05, + "loss": 1.6659, + "step": 20622 + }, + { + "epoch": 6.329957028852056, + "grad_norm": 0.21126115322113037, + "learning_rate": 3.135941987070872e-05, + "loss": 1.7372, + "step": 20623 + }, + { + "epoch": 6.3302639656230815, + "grad_norm": 0.1840609908103943, + "learning_rate": 3.1354807745247206e-05, + "loss": 1.7219, + "step": 20624 + }, + { + "epoch": 6.330570902394107, + "grad_norm": 0.23623648285865784, + "learning_rate": 3.135019580404842e-05, + "loss": 1.8059, + "step": 20625 + }, + { + "epoch": 6.330877839165132, + "grad_norm": 0.19853124022483826, + "learning_rate": 3.134558404715792e-05, + "loss": 1.7336, + "step": 20626 + }, + { + "epoch": 6.3311847759361575, + "grad_norm": 0.2261304259300232, + "learning_rate": 3.13409724746213e-05, + "loss": 1.7508, + "step": 20627 + }, + { + "epoch": 6.331491712707182, + "grad_norm": 0.1797952800989151, + "learning_rate": 3.1336361086484104e-05, + "loss": 1.6569, + "step": 20628 + }, + { + "epoch": 6.331798649478207, + "grad_norm": 0.21610359847545624, + "learning_rate": 3.133174988279195e-05, + "loss": 1.7093, + "step": 20629 + }, + { + "epoch": 6.332105586249233, + "grad_norm": 0.1818271279335022, + "learning_rate": 3.1327138863590365e-05, + "loss": 1.6951, + "step": 20630 + }, + { + "epoch": 6.332412523020258, + "grad_norm": 0.20425963401794434, + "learning_rate": 3.1322528028924956e-05, + "loss": 1.7399, + "step": 20631 + }, + { + "epoch": 6.332719459791283, + "grad_norm": 0.20357854664325714, + "learning_rate": 3.131791737884126e-05, + "loss": 1.693, + "step": 20632 + }, + { + "epoch": 6.333026396562309, + 
"grad_norm": 0.25307130813598633, + "learning_rate": 3.1313306913384874e-05, + "loss": 1.674, + "step": 20633 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.21596084535121918, + "learning_rate": 3.130869663260132e-05, + "loss": 1.7521, + "step": 20634 + }, + { + "epoch": 6.333640270104358, + "grad_norm": 0.24110902845859528, + "learning_rate": 3.1304086536536194e-05, + "loss": 1.6723, + "step": 20635 + }, + { + "epoch": 6.333947206875384, + "grad_norm": 0.21365956962108612, + "learning_rate": 3.129947662523503e-05, + "loss": 1.7702, + "step": 20636 + }, + { + "epoch": 6.334254143646409, + "grad_norm": 0.21873877942562103, + "learning_rate": 3.129486689874341e-05, + "loss": 1.7176, + "step": 20637 + }, + { + "epoch": 6.334561080417434, + "grad_norm": 0.2543679475784302, + "learning_rate": 3.129025735710687e-05, + "loss": 1.7733, + "step": 20638 + }, + { + "epoch": 6.334868017188459, + "grad_norm": 0.24591630697250366, + "learning_rate": 3.1285648000370996e-05, + "loss": 1.7212, + "step": 20639 + }, + { + "epoch": 6.335174953959484, + "grad_norm": 0.2453039139509201, + "learning_rate": 3.128103882858129e-05, + "loss": 1.7316, + "step": 20640 + }, + { + "epoch": 6.3354818907305095, + "grad_norm": 0.239897683262825, + "learning_rate": 3.127642984178334e-05, + "loss": 1.7495, + "step": 20641 + }, + { + "epoch": 6.335788827501535, + "grad_norm": 0.20719192922115326, + "learning_rate": 3.12718210400227e-05, + "loss": 1.7242, + "step": 20642 + }, + { + "epoch": 6.33609576427256, + "grad_norm": 0.1813955008983612, + "learning_rate": 3.126721242334487e-05, + "loss": 1.672, + "step": 20643 + }, + { + "epoch": 6.336402701043585, + "grad_norm": 0.20045650005340576, + "learning_rate": 3.126260399179546e-05, + "loss": 1.7854, + "step": 20644 + }, + { + "epoch": 6.33670963781461, + "grad_norm": 0.23010976612567902, + "learning_rate": 3.125799574541995e-05, + "loss": 1.7508, + "step": 20645 + }, + { + "epoch": 6.337016574585635, + "grad_norm": 0.1854519248008728, + 
"learning_rate": 3.1253387684263924e-05, + "loss": 1.7049, + "step": 20646 + }, + { + "epoch": 6.337323511356661, + "grad_norm": 0.2062511295080185, + "learning_rate": 3.1248779808372894e-05, + "loss": 1.6894, + "step": 20647 + }, + { + "epoch": 6.337630448127686, + "grad_norm": 0.19851341843605042, + "learning_rate": 3.124417211779244e-05, + "loss": 1.7332, + "step": 20648 + }, + { + "epoch": 6.337937384898711, + "grad_norm": 0.2099175751209259, + "learning_rate": 3.1239564612568054e-05, + "loss": 1.7577, + "step": 20649 + }, + { + "epoch": 6.338244321669736, + "grad_norm": 0.2152891904115677, + "learning_rate": 3.123495729274529e-05, + "loss": 1.7691, + "step": 20650 + }, + { + "epoch": 6.338551258440761, + "grad_norm": 0.19431835412979126, + "learning_rate": 3.123035015836967e-05, + "loss": 1.7035, + "step": 20651 + }, + { + "epoch": 6.338858195211786, + "grad_norm": 0.20863930881023407, + "learning_rate": 3.122574320948674e-05, + "loss": 1.7166, + "step": 20652 + }, + { + "epoch": 6.339165131982812, + "grad_norm": 0.17948369681835175, + "learning_rate": 3.122113644614201e-05, + "loss": 1.732, + "step": 20653 + }, + { + "epoch": 6.339472068753837, + "grad_norm": 0.2329161912202835, + "learning_rate": 3.121652986838103e-05, + "loss": 1.6934, + "step": 20654 + }, + { + "epoch": 6.3397790055248615, + "grad_norm": 0.23563681542873383, + "learning_rate": 3.12119234762493e-05, + "loss": 1.7329, + "step": 20655 + }, + { + "epoch": 6.340085942295887, + "grad_norm": 0.22654885053634644, + "learning_rate": 3.120731726979236e-05, + "loss": 1.767, + "step": 20656 + }, + { + "epoch": 6.340392879066912, + "grad_norm": 0.2507181465625763, + "learning_rate": 3.1202711249055715e-05, + "loss": 1.7071, + "step": 20657 + }, + { + "epoch": 6.3406998158379375, + "grad_norm": 0.20573864877223969, + "learning_rate": 3.1198105414084906e-05, + "loss": 1.7566, + "step": 20658 + }, + { + "epoch": 6.341006752608963, + "grad_norm": 0.23311644792556763, + "learning_rate": 
3.119349976492545e-05, + "loss": 1.6778, + "step": 20659 + }, + { + "epoch": 6.341313689379987, + "grad_norm": 0.18166053295135498, + "learning_rate": 3.118889430162283e-05, + "loss": 1.7109, + "step": 20660 + }, + { + "epoch": 6.341620626151013, + "grad_norm": 0.21054090559482574, + "learning_rate": 3.11842890242226e-05, + "loss": 1.7255, + "step": 20661 + }, + { + "epoch": 6.341927562922038, + "grad_norm": 0.19898973405361176, + "learning_rate": 3.1179683932770235e-05, + "loss": 1.7017, + "step": 20662 + }, + { + "epoch": 6.342234499693063, + "grad_norm": 0.17782434821128845, + "learning_rate": 3.117507902731127e-05, + "loss": 1.6858, + "step": 20663 + }, + { + "epoch": 6.342541436464089, + "grad_norm": 0.19286927580833435, + "learning_rate": 3.117047430789121e-05, + "loss": 1.707, + "step": 20664 + }, + { + "epoch": 6.342848373235114, + "grad_norm": 0.18578651547431946, + "learning_rate": 3.1165869774555565e-05, + "loss": 1.7331, + "step": 20665 + }, + { + "epoch": 6.343155310006138, + "grad_norm": 0.19728249311447144, + "learning_rate": 3.1161265427349826e-05, + "loss": 1.7165, + "step": 20666 + }, + { + "epoch": 6.343462246777164, + "grad_norm": 0.18240176141262054, + "learning_rate": 3.115666126631952e-05, + "loss": 1.7167, + "step": 20667 + }, + { + "epoch": 6.343769183548189, + "grad_norm": 0.1928495317697525, + "learning_rate": 3.115205729151011e-05, + "loss": 1.7431, + "step": 20668 + }, + { + "epoch": 6.344076120319214, + "grad_norm": 0.19459952414035797, + "learning_rate": 3.1147453502967125e-05, + "loss": 1.7294, + "step": 20669 + }, + { + "epoch": 6.34438305709024, + "grad_norm": 0.18829894065856934, + "learning_rate": 3.1142849900736046e-05, + "loss": 1.7512, + "step": 20670 + }, + { + "epoch": 6.344689993861264, + "grad_norm": 0.19678451120853424, + "learning_rate": 3.11382464848624e-05, + "loss": 1.673, + "step": 20671 + }, + { + "epoch": 6.3449969306322895, + "grad_norm": 0.22256550192832947, + "learning_rate": 3.1133643255391635e-05, + "loss": 
1.7044, + "step": 20672 + }, + { + "epoch": 6.345303867403315, + "grad_norm": 0.24741628766059875, + "learning_rate": 3.112904021236929e-05, + "loss": 1.7904, + "step": 20673 + }, + { + "epoch": 6.34561080417434, + "grad_norm": 0.20286159217357635, + "learning_rate": 3.11244373558408e-05, + "loss": 1.6976, + "step": 20674 + }, + { + "epoch": 6.3459177409453655, + "grad_norm": 0.2005387842655182, + "learning_rate": 3.11198346858517e-05, + "loss": 1.7083, + "step": 20675 + }, + { + "epoch": 6.346224677716391, + "grad_norm": 0.22312256693840027, + "learning_rate": 3.111523220244747e-05, + "loss": 1.7575, + "step": 20676 + }, + { + "epoch": 6.346531614487415, + "grad_norm": 0.2968841791152954, + "learning_rate": 3.111062990567356e-05, + "loss": 1.7813, + "step": 20677 + }, + { + "epoch": 6.346838551258441, + "grad_norm": 0.22900697588920593, + "learning_rate": 3.1106027795575496e-05, + "loss": 1.6818, + "step": 20678 + }, + { + "epoch": 6.347145488029466, + "grad_norm": 0.1912240833044052, + "learning_rate": 3.110142587219873e-05, + "loss": 1.7174, + "step": 20679 + }, + { + "epoch": 6.347452424800491, + "grad_norm": 0.20461280643939972, + "learning_rate": 3.1096824135588754e-05, + "loss": 1.6945, + "step": 20680 + }, + { + "epoch": 6.347759361571517, + "grad_norm": 0.19344913959503174, + "learning_rate": 3.109222258579103e-05, + "loss": 1.7064, + "step": 20681 + }, + { + "epoch": 6.348066298342541, + "grad_norm": 0.1833983063697815, + "learning_rate": 3.108762122285106e-05, + "loss": 1.702, + "step": 20682 + }, + { + "epoch": 6.348373235113566, + "grad_norm": 0.20344893634319305, + "learning_rate": 3.108302004681429e-05, + "loss": 1.7323, + "step": 20683 + }, + { + "epoch": 6.348680171884592, + "grad_norm": 0.18629617989063263, + "learning_rate": 3.107841905772622e-05, + "loss": 1.6841, + "step": 20684 + }, + { + "epoch": 6.348987108655617, + "grad_norm": 0.19279471039772034, + "learning_rate": 3.107381825563228e-05, + "loss": 1.7581, + "step": 20685 + }, + { + 
"epoch": 6.349294045426642, + "grad_norm": 0.21727058291435242, + "learning_rate": 3.106921764057798e-05, + "loss": 1.7231, + "step": 20686 + }, + { + "epoch": 6.349600982197667, + "grad_norm": 0.20952723920345306, + "learning_rate": 3.1064617212608747e-05, + "loss": 1.713, + "step": 20687 + }, + { + "epoch": 6.349907918968692, + "grad_norm": 0.2358582466840744, + "learning_rate": 3.10600169717701e-05, + "loss": 1.7291, + "step": 20688 + }, + { + "epoch": 6.350214855739718, + "grad_norm": 0.21846619248390198, + "learning_rate": 3.105541691810743e-05, + "loss": 1.7365, + "step": 20689 + }, + { + "epoch": 6.350521792510743, + "grad_norm": 0.22137843072414398, + "learning_rate": 3.1050817051666256e-05, + "loss": 1.7404, + "step": 20690 + }, + { + "epoch": 6.350828729281768, + "grad_norm": 0.2301674485206604, + "learning_rate": 3.1046217372492e-05, + "loss": 1.7422, + "step": 20691 + }, + { + "epoch": 6.351135666052793, + "grad_norm": 0.18955166637897491, + "learning_rate": 3.104161788063015e-05, + "loss": 1.7063, + "step": 20692 + }, + { + "epoch": 6.351442602823818, + "grad_norm": 0.21172095835208893, + "learning_rate": 3.103701857612614e-05, + "loss": 1.6856, + "step": 20693 + }, + { + "epoch": 6.351749539594843, + "grad_norm": 0.20921260118484497, + "learning_rate": 3.103241945902541e-05, + "loss": 1.7384, + "step": 20694 + }, + { + "epoch": 6.352056476365869, + "grad_norm": 0.21005603671073914, + "learning_rate": 3.102782052937345e-05, + "loss": 1.7118, + "step": 20695 + }, + { + "epoch": 6.352363413136894, + "grad_norm": 0.20888659358024597, + "learning_rate": 3.102322178721567e-05, + "loss": 1.7172, + "step": 20696 + }, + { + "epoch": 6.352670349907919, + "grad_norm": 0.194463849067688, + "learning_rate": 3.101862323259754e-05, + "loss": 1.6909, + "step": 20697 + }, + { + "epoch": 6.352977286678944, + "grad_norm": 0.20848685503005981, + "learning_rate": 3.1014024865564494e-05, + "loss": 1.7846, + "step": 20698 + }, + { + "epoch": 6.353284223449969, + 
"grad_norm": 0.18669761717319489, + "learning_rate": 3.100942668616201e-05, + "loss": 1.7542, + "step": 20699 + }, + { + "epoch": 6.3535911602209945, + "grad_norm": 0.23618464171886444, + "learning_rate": 3.100482869443547e-05, + "loss": 1.7292, + "step": 20700 + }, + { + "epoch": 6.35389809699202, + "grad_norm": 0.19389905035495758, + "learning_rate": 3.100023089043037e-05, + "loss": 1.6847, + "step": 20701 + }, + { + "epoch": 6.354205033763045, + "grad_norm": 0.20346343517303467, + "learning_rate": 3.09956332741921e-05, + "loss": 1.7096, + "step": 20702 + }, + { + "epoch": 6.35451197053407, + "grad_norm": 0.20825842022895813, + "learning_rate": 3.099103584576614e-05, + "loss": 1.6974, + "step": 20703 + }, + { + "epoch": 6.354818907305095, + "grad_norm": 0.2093508094549179, + "learning_rate": 3.0986438605197895e-05, + "loss": 1.6849, + "step": 20704 + }, + { + "epoch": 6.35512584407612, + "grad_norm": 0.2576633393764496, + "learning_rate": 3.098184155253282e-05, + "loss": 1.7974, + "step": 20705 + }, + { + "epoch": 6.355432780847146, + "grad_norm": 0.18197253346443176, + "learning_rate": 3.097724468781632e-05, + "loss": 1.6723, + "step": 20706 + }, + { + "epoch": 6.355739717618171, + "grad_norm": 0.24809512495994568, + "learning_rate": 3.0972648011093855e-05, + "loss": 1.7378, + "step": 20707 + }, + { + "epoch": 6.356046654389196, + "grad_norm": 0.2046923190355301, + "learning_rate": 3.0968051522410814e-05, + "loss": 1.7502, + "step": 20708 + }, + { + "epoch": 6.356353591160221, + "grad_norm": 0.20443019270896912, + "learning_rate": 3.096345522181265e-05, + "loss": 1.7179, + "step": 20709 + }, + { + "epoch": 6.356660527931246, + "grad_norm": 0.1906277984380722, + "learning_rate": 3.09588591093448e-05, + "loss": 1.7167, + "step": 20710 + }, + { + "epoch": 6.356967464702271, + "grad_norm": 0.20729197561740875, + "learning_rate": 3.095426318505263e-05, + "loss": 1.7193, + "step": 20711 + }, + { + "epoch": 6.357274401473297, + "grad_norm": 0.23446644842624664, + 
"learning_rate": 3.094966744898162e-05, + "loss": 1.7341, + "step": 20712 + }, + { + "epoch": 6.357581338244322, + "grad_norm": 0.18882590532302856, + "learning_rate": 3.094507190117715e-05, + "loss": 1.7001, + "step": 20713 + }, + { + "epoch": 6.3578882750153465, + "grad_norm": 0.27240705490112305, + "learning_rate": 3.094047654168465e-05, + "loss": 1.7641, + "step": 20714 + }, + { + "epoch": 6.358195211786372, + "grad_norm": 0.19616954028606415, + "learning_rate": 3.093588137054952e-05, + "loss": 1.751, + "step": 20715 + }, + { + "epoch": 6.358502148557397, + "grad_norm": 0.23402562737464905, + "learning_rate": 3.093128638781721e-05, + "loss": 1.7274, + "step": 20716 + }, + { + "epoch": 6.3588090853284225, + "grad_norm": 0.18189528584480286, + "learning_rate": 3.092669159353309e-05, + "loss": 1.7079, + "step": 20717 + }, + { + "epoch": 6.359116022099448, + "grad_norm": 0.21583771705627441, + "learning_rate": 3.092209698774259e-05, + "loss": 1.6811, + "step": 20718 + }, + { + "epoch": 6.359422958870473, + "grad_norm": 0.2477681040763855, + "learning_rate": 3.091750257049109e-05, + "loss": 1.6963, + "step": 20719 + }, + { + "epoch": 6.359729895641498, + "grad_norm": 0.2883109152317047, + "learning_rate": 3.091290834182403e-05, + "loss": 1.8349, + "step": 20720 + }, + { + "epoch": 6.360036832412523, + "grad_norm": 0.23407170176506042, + "learning_rate": 3.09083143017868e-05, + "loss": 1.7271, + "step": 20721 + }, + { + "epoch": 6.360343769183548, + "grad_norm": 0.2818833589553833, + "learning_rate": 3.090372045042479e-05, + "loss": 1.7852, + "step": 20722 + }, + { + "epoch": 6.360650705954574, + "grad_norm": 0.24415317177772522, + "learning_rate": 3.089912678778341e-05, + "loss": 1.6826, + "step": 20723 + }, + { + "epoch": 6.360957642725599, + "grad_norm": 0.26786303520202637, + "learning_rate": 3.0894533313908056e-05, + "loss": 1.7616, + "step": 20724 + }, + { + "epoch": 6.361264579496623, + "grad_norm": 0.3235633969306946, + "learning_rate": 3.088994002884411e-05, 
+ "loss": 1.7637, + "step": 20725 + }, + { + "epoch": 6.361571516267649, + "grad_norm": 0.18675416707992554, + "learning_rate": 3.0885346932637e-05, + "loss": 1.7037, + "step": 20726 + }, + { + "epoch": 6.361878453038674, + "grad_norm": 0.295802503824234, + "learning_rate": 3.0880754025332084e-05, + "loss": 1.7435, + "step": 20727 + }, + { + "epoch": 6.362185389809699, + "grad_norm": 0.18665561079978943, + "learning_rate": 3.0876161306974756e-05, + "loss": 1.684, + "step": 20728 + }, + { + "epoch": 6.362492326580725, + "grad_norm": 0.2530463635921478, + "learning_rate": 3.087156877761043e-05, + "loss": 1.7934, + "step": 20729 + }, + { + "epoch": 6.362799263351749, + "grad_norm": 0.17860126495361328, + "learning_rate": 3.086697643728445e-05, + "loss": 1.6977, + "step": 20730 + }, + { + "epoch": 6.3631062001227745, + "grad_norm": 0.20118845999240875, + "learning_rate": 3.086238428604223e-05, + "loss": 1.7241, + "step": 20731 + }, + { + "epoch": 6.3634131368938, + "grad_norm": 0.18811924755573273, + "learning_rate": 3.085779232392915e-05, + "loss": 1.6918, + "step": 20732 + }, + { + "epoch": 6.363720073664825, + "grad_norm": 0.1841908097267151, + "learning_rate": 3.085320055099058e-05, + "loss": 1.735, + "step": 20733 + }, + { + "epoch": 6.3640270104358505, + "grad_norm": 0.1956033855676651, + "learning_rate": 3.08486089672719e-05, + "loss": 1.7203, + "step": 20734 + }, + { + "epoch": 6.364333947206875, + "grad_norm": 0.19844500720500946, + "learning_rate": 3.084401757281851e-05, + "loss": 1.6767, + "step": 20735 + }, + { + "epoch": 6.3646408839779, + "grad_norm": 0.2018919438123703, + "learning_rate": 3.083942636767575e-05, + "loss": 1.6912, + "step": 20736 + }, + { + "epoch": 6.364947820748926, + "grad_norm": 0.18929271399974823, + "learning_rate": 3.083483535188901e-05, + "loss": 1.6838, + "step": 20737 + }, + { + "epoch": 6.365254757519951, + "grad_norm": 0.19833499193191528, + "learning_rate": 3.0830244525503674e-05, + "loss": 1.7139, + "step": 20738 + }, + { + 
"epoch": 6.365561694290976, + "grad_norm": 0.17029902338981628, + "learning_rate": 3.082565388856509e-05, + "loss": 1.6665, + "step": 20739 + }, + { + "epoch": 6.365868631062002, + "grad_norm": 0.19526802003383636, + "learning_rate": 3.082106344111861e-05, + "loss": 1.7021, + "step": 20740 + }, + { + "epoch": 6.366175567833026, + "grad_norm": 0.19061279296875, + "learning_rate": 3.081647318320966e-05, + "loss": 1.7134, + "step": 20741 + }, + { + "epoch": 6.366482504604051, + "grad_norm": 0.17782293260097504, + "learning_rate": 3.081188311488354e-05, + "loss": 1.741, + "step": 20742 + }, + { + "epoch": 6.366789441375077, + "grad_norm": 0.20002372562885284, + "learning_rate": 3.080729323618565e-05, + "loss": 1.6943, + "step": 20743 + }, + { + "epoch": 6.367096378146102, + "grad_norm": 0.22873486578464508, + "learning_rate": 3.080270354716134e-05, + "loss": 1.7223, + "step": 20744 + }, + { + "epoch": 6.367403314917127, + "grad_norm": 0.191136434674263, + "learning_rate": 3.079811404785595e-05, + "loss": 1.6774, + "step": 20745 + }, + { + "epoch": 6.367710251688152, + "grad_norm": 0.20446795225143433, + "learning_rate": 3.0793524738314874e-05, + "loss": 1.7443, + "step": 20746 + }, + { + "epoch": 6.368017188459177, + "grad_norm": 0.20668596029281616, + "learning_rate": 3.078893561858341e-05, + "loss": 1.7553, + "step": 20747 + }, + { + "epoch": 6.3683241252302025, + "grad_norm": 0.18445394933223724, + "learning_rate": 3.078434668870698e-05, + "loss": 1.7365, + "step": 20748 + }, + { + "epoch": 6.368631062001228, + "grad_norm": 0.1824318915605545, + "learning_rate": 3.077975794873088e-05, + "loss": 1.7248, + "step": 20749 + }, + { + "epoch": 6.368937998772253, + "grad_norm": 0.18452249467372894, + "learning_rate": 3.077516939870047e-05, + "loss": 1.7095, + "step": 20750 + }, + { + "epoch": 6.3692449355432785, + "grad_norm": 0.17254458367824554, + "learning_rate": 3.077058103866112e-05, + "loss": 1.6937, + "step": 20751 + }, + { + "epoch": 6.369551872314303, + 
"grad_norm": 0.2022976130247116, + "learning_rate": 3.0765992868658154e-05, + "loss": 1.7593, + "step": 20752 + }, + { + "epoch": 6.369858809085328, + "grad_norm": 0.19274397194385529, + "learning_rate": 3.076140488873691e-05, + "loss": 1.7288, + "step": 20753 + }, + { + "epoch": 6.370165745856354, + "grad_norm": 0.18847523629665375, + "learning_rate": 3.075681709894276e-05, + "loss": 1.7293, + "step": 20754 + }, + { + "epoch": 6.370472682627379, + "grad_norm": 0.21054589748382568, + "learning_rate": 3.075222949932101e-05, + "loss": 1.7688, + "step": 20755 + }, + { + "epoch": 6.370779619398404, + "grad_norm": 0.16934558749198914, + "learning_rate": 3.0747642089917005e-05, + "loss": 1.7092, + "step": 20756 + }, + { + "epoch": 6.371086556169429, + "grad_norm": 0.19154684245586395, + "learning_rate": 3.0743054870776075e-05, + "loss": 1.6827, + "step": 20757 + }, + { + "epoch": 6.371393492940454, + "grad_norm": 0.2622900605201721, + "learning_rate": 3.0738467841943594e-05, + "loss": 1.748, + "step": 20758 + }, + { + "epoch": 6.371700429711479, + "grad_norm": 0.1767888218164444, + "learning_rate": 3.073388100346484e-05, + "loss": 1.717, + "step": 20759 + }, + { + "epoch": 6.372007366482505, + "grad_norm": 0.21692602336406708, + "learning_rate": 3.072929435538518e-05, + "loss": 1.7543, + "step": 20760 + }, + { + "epoch": 6.37231430325353, + "grad_norm": 0.19853977859020233, + "learning_rate": 3.0724707897749926e-05, + "loss": 1.7599, + "step": 20761 + }, + { + "epoch": 6.3726212400245545, + "grad_norm": 0.1904703676700592, + "learning_rate": 3.0720121630604396e-05, + "loss": 1.7094, + "step": 20762 + }, + { + "epoch": 6.37292817679558, + "grad_norm": 0.1961483359336853, + "learning_rate": 3.071553555399395e-05, + "loss": 1.7363, + "step": 20763 + }, + { + "epoch": 6.373235113566605, + "grad_norm": 0.16419392824172974, + "learning_rate": 3.071094966796385e-05, + "loss": 1.7073, + "step": 20764 + }, + { + "epoch": 6.3735420503376305, + "grad_norm": 0.1784946471452713, + 
"learning_rate": 3.0706363972559476e-05, + "loss": 1.699, + "step": 20765 + }, + { + "epoch": 6.373848987108656, + "grad_norm": 0.19472888112068176, + "learning_rate": 3.070177846782611e-05, + "loss": 1.7541, + "step": 20766 + }, + { + "epoch": 6.37415592387968, + "grad_norm": 0.2355004847049713, + "learning_rate": 3.0697193153809076e-05, + "loss": 1.7389, + "step": 20767 + }, + { + "epoch": 6.374462860650706, + "grad_norm": 0.1956906020641327, + "learning_rate": 3.069260803055369e-05, + "loss": 1.7197, + "step": 20768 + }, + { + "epoch": 6.374769797421731, + "grad_norm": 0.21212655305862427, + "learning_rate": 3.068802309810529e-05, + "loss": 1.7291, + "step": 20769 + }, + { + "epoch": 6.375076734192756, + "grad_norm": 0.22920182347297668, + "learning_rate": 3.068343835650914e-05, + "loss": 1.7397, + "step": 20770 + }, + { + "epoch": 6.375383670963782, + "grad_norm": 0.2143404483795166, + "learning_rate": 3.0678853805810605e-05, + "loss": 1.76, + "step": 20771 + }, + { + "epoch": 6.375690607734807, + "grad_norm": 0.1848321557044983, + "learning_rate": 3.067426944605492e-05, + "loss": 1.7127, + "step": 20772 + }, + { + "epoch": 6.3759975445058314, + "grad_norm": 0.23339331150054932, + "learning_rate": 3.0669685277287465e-05, + "loss": 1.7828, + "step": 20773 + }, + { + "epoch": 6.376304481276857, + "grad_norm": 0.19590741395950317, + "learning_rate": 3.066510129955349e-05, + "loss": 1.7224, + "step": 20774 + }, + { + "epoch": 6.376611418047882, + "grad_norm": 0.19986604154109955, + "learning_rate": 3.066051751289833e-05, + "loss": 1.7412, + "step": 20775 + }, + { + "epoch": 6.3769183548189075, + "grad_norm": 0.18629087507724762, + "learning_rate": 3.0655933917367266e-05, + "loss": 1.695, + "step": 20776 + }, + { + "epoch": 6.377225291589933, + "grad_norm": 0.2248111218214035, + "learning_rate": 3.0651350513005605e-05, + "loss": 1.7685, + "step": 20777 + }, + { + "epoch": 6.377532228360957, + "grad_norm": 0.1803683638572693, + "learning_rate": 3.064676729985864e-05, 
+ "loss": 1.7206, + "step": 20778 + }, + { + "epoch": 6.377839165131983, + "grad_norm": 0.23836754262447357, + "learning_rate": 3.064218427797165e-05, + "loss": 1.7428, + "step": 20779 + }, + { + "epoch": 6.378146101903008, + "grad_norm": 0.22549279034137726, + "learning_rate": 3.063760144738996e-05, + "loss": 1.7314, + "step": 20780 + }, + { + "epoch": 6.378453038674033, + "grad_norm": 0.20714345574378967, + "learning_rate": 3.063301880815882e-05, + "loss": 1.7179, + "step": 20781 + }, + { + "epoch": 6.378759975445059, + "grad_norm": 0.17024052143096924, + "learning_rate": 3.0628436360323565e-05, + "loss": 1.6602, + "step": 20782 + }, + { + "epoch": 6.379066912216084, + "grad_norm": 0.20378601551055908, + "learning_rate": 3.062385410392943e-05, + "loss": 1.7708, + "step": 20783 + }, + { + "epoch": 6.379373848987108, + "grad_norm": 0.1885673850774765, + "learning_rate": 3.0619272039021734e-05, + "loss": 1.7034, + "step": 20784 + }, + { + "epoch": 6.379680785758134, + "grad_norm": 0.18746556341648102, + "learning_rate": 3.0614690165645746e-05, + "loss": 1.6946, + "step": 20785 + }, + { + "epoch": 6.379987722529159, + "grad_norm": 0.19569392502307892, + "learning_rate": 3.061010848384677e-05, + "loss": 1.7298, + "step": 20786 + }, + { + "epoch": 6.380294659300184, + "grad_norm": 0.21114139258861542, + "learning_rate": 3.0605526993670046e-05, + "loss": 1.795, + "step": 20787 + }, + { + "epoch": 6.38060159607121, + "grad_norm": 0.20940302312374115, + "learning_rate": 3.06009456951609e-05, + "loss": 1.6747, + "step": 20788 + }, + { + "epoch": 6.380908532842234, + "grad_norm": 0.21008993685245514, + "learning_rate": 3.059636458836455e-05, + "loss": 1.7219, + "step": 20789 + }, + { + "epoch": 6.3812154696132595, + "grad_norm": 0.17642457783222198, + "learning_rate": 3.0591783673326304e-05, + "loss": 1.6555, + "step": 20790 + }, + { + "epoch": 6.381522406384285, + "grad_norm": 0.2786177396774292, + "learning_rate": 3.058720295009143e-05, + "loss": 1.8463, + "step": 20791 + 
}, + { + "epoch": 6.38182934315531, + "grad_norm": 0.21209503710269928, + "learning_rate": 3.058262241870521e-05, + "loss": 1.6848, + "step": 20792 + }, + { + "epoch": 6.3821362799263355, + "grad_norm": 0.1880561262369156, + "learning_rate": 3.057804207921287e-05, + "loss": 1.7401, + "step": 20793 + }, + { + "epoch": 6.382443216697361, + "grad_norm": 0.22108516097068787, + "learning_rate": 3.0573461931659726e-05, + "loss": 1.7482, + "step": 20794 + }, + { + "epoch": 6.382750153468385, + "grad_norm": 0.2161533385515213, + "learning_rate": 3.0568881976091006e-05, + "loss": 1.7425, + "step": 20795 + }, + { + "epoch": 6.383057090239411, + "grad_norm": 0.22933612763881683, + "learning_rate": 3.0564302212551975e-05, + "loss": 1.7424, + "step": 20796 + }, + { + "epoch": 6.383364027010436, + "grad_norm": 0.19572989642620087, + "learning_rate": 3.0559722641087916e-05, + "loss": 1.6763, + "step": 20797 + }, + { + "epoch": 6.383670963781461, + "grad_norm": 0.2181084007024765, + "learning_rate": 3.0555143261744056e-05, + "loss": 1.7164, + "step": 20798 + }, + { + "epoch": 6.383977900552487, + "grad_norm": 0.1927991509437561, + "learning_rate": 3.055056407456569e-05, + "loss": 1.6833, + "step": 20799 + }, + { + "epoch": 6.384284837323511, + "grad_norm": 0.20569704473018646, + "learning_rate": 3.0545985079598025e-05, + "loss": 1.7716, + "step": 20800 + }, + { + "epoch": 6.384591774094536, + "grad_norm": 0.1856541931629181, + "learning_rate": 3.054140627688635e-05, + "loss": 1.6939, + "step": 20801 + }, + { + "epoch": 6.384898710865562, + "grad_norm": 0.2450970858335495, + "learning_rate": 3.05368276664759e-05, + "loss": 1.8197, + "step": 20802 + }, + { + "epoch": 6.385205647636587, + "grad_norm": 0.23325784504413605, + "learning_rate": 3.053224924841194e-05, + "loss": 1.7195, + "step": 20803 + }, + { + "epoch": 6.385512584407612, + "grad_norm": 0.19614358246326447, + "learning_rate": 3.052767102273968e-05, + "loss": 1.6966, + "step": 20804 + }, + { + "epoch": 6.385819521178637, 
+ "grad_norm": 0.20615628361701965, + "learning_rate": 3.0523092989504415e-05, + "loss": 1.7429, + "step": 20805 + }, + { + "epoch": 6.386126457949662, + "grad_norm": 0.18418943881988525, + "learning_rate": 3.0518515148751336e-05, + "loss": 1.7612, + "step": 20806 + }, + { + "epoch": 6.3864333947206875, + "grad_norm": 0.17176245152950287, + "learning_rate": 3.0513937500525725e-05, + "loss": 1.6918, + "step": 20807 + }, + { + "epoch": 6.386740331491713, + "grad_norm": 0.22239255905151367, + "learning_rate": 3.0509360044872787e-05, + "loss": 1.8072, + "step": 20808 + }, + { + "epoch": 6.387047268262738, + "grad_norm": 0.20312704145908356, + "learning_rate": 3.0504782781837798e-05, + "loss": 1.7348, + "step": 20809 + }, + { + "epoch": 6.387354205033763, + "grad_norm": 0.23198208212852478, + "learning_rate": 3.0500205711465958e-05, + "loss": 1.7516, + "step": 20810 + }, + { + "epoch": 6.387661141804788, + "grad_norm": 0.2244081050157547, + "learning_rate": 3.0495628833802526e-05, + "loss": 1.731, + "step": 20811 + }, + { + "epoch": 6.387968078575813, + "grad_norm": 0.18282169103622437, + "learning_rate": 3.0491052148892717e-05, + "loss": 1.6743, + "step": 20812 + }, + { + "epoch": 6.388275015346839, + "grad_norm": 0.19108405709266663, + "learning_rate": 3.0486475656781753e-05, + "loss": 1.7485, + "step": 20813 + }, + { + "epoch": 6.388581952117864, + "grad_norm": 0.20574834942817688, + "learning_rate": 3.0481899357514898e-05, + "loss": 1.6979, + "step": 20814 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 0.21263298392295837, + "learning_rate": 3.047732325113733e-05, + "loss": 1.687, + "step": 20815 + }, + { + "epoch": 6.389195825659914, + "grad_norm": 0.22646664083003998, + "learning_rate": 3.047274733769432e-05, + "loss": 1.7593, + "step": 20816 + }, + { + "epoch": 6.389502762430939, + "grad_norm": 0.1846906542778015, + "learning_rate": 3.046817161723104e-05, + "loss": 1.7271, + "step": 20817 + }, + { + "epoch": 6.389809699201964, + "grad_norm": 
0.1965247541666031, + "learning_rate": 3.0463596089792746e-05, + "loss": 1.7121, + "step": 20818 + }, + { + "epoch": 6.39011663597299, + "grad_norm": 0.255577951669693, + "learning_rate": 3.045902075542464e-05, + "loss": 1.7311, + "step": 20819 + }, + { + "epoch": 6.390423572744015, + "grad_norm": 0.1837676465511322, + "learning_rate": 3.0454445614171966e-05, + "loss": 1.7177, + "step": 20820 + }, + { + "epoch": 6.3907305095150395, + "grad_norm": 0.24845893681049347, + "learning_rate": 3.0449870666079895e-05, + "loss": 1.6902, + "step": 20821 + }, + { + "epoch": 6.391037446286065, + "grad_norm": 0.28572577238082886, + "learning_rate": 3.0445295911193678e-05, + "loss": 1.7942, + "step": 20822 + }, + { + "epoch": 6.39134438305709, + "grad_norm": 0.20460839569568634, + "learning_rate": 3.044072134955849e-05, + "loss": 1.6747, + "step": 20823 + }, + { + "epoch": 6.3916513198281155, + "grad_norm": 0.3547010123729706, + "learning_rate": 3.0436146981219565e-05, + "loss": 1.7359, + "step": 20824 + }, + { + "epoch": 6.391958256599141, + "grad_norm": 0.20490451157093048, + "learning_rate": 3.04315728062221e-05, + "loss": 1.6863, + "step": 20825 + }, + { + "epoch": 6.392265193370166, + "grad_norm": 0.25874415040016174, + "learning_rate": 3.0426998824611307e-05, + "loss": 1.6798, + "step": 20826 + }, + { + "epoch": 6.392572130141191, + "grad_norm": 0.27858632802963257, + "learning_rate": 3.0422425036432378e-05, + "loss": 1.6943, + "step": 20827 + }, + { + "epoch": 6.392879066912216, + "grad_norm": 0.20951922237873077, + "learning_rate": 3.041785144173054e-05, + "loss": 1.7025, + "step": 20828 + }, + { + "epoch": 6.393186003683241, + "grad_norm": 0.3158397674560547, + "learning_rate": 3.0413278040550952e-05, + "loss": 1.7193, + "step": 20829 + }, + { + "epoch": 6.393492940454267, + "grad_norm": 0.18556484580039978, + "learning_rate": 3.0408704832938824e-05, + "loss": 1.7017, + "step": 20830 + }, + { + "epoch": 6.393799877225292, + "grad_norm": 0.31651169061660767, + 
"learning_rate": 3.0404131818939376e-05, + "loss": 1.7716, + "step": 20831 + }, + { + "epoch": 6.394106813996316, + "grad_norm": 0.2850388288497925, + "learning_rate": 3.0399558998597765e-05, + "loss": 1.7144, + "step": 20832 + }, + { + "epoch": 6.394413750767342, + "grad_norm": 0.19256308674812317, + "learning_rate": 3.0394986371959223e-05, + "loss": 1.6603, + "step": 20833 + }, + { + "epoch": 6.394720687538367, + "grad_norm": 0.2654922604560852, + "learning_rate": 3.0390413939068896e-05, + "loss": 1.6825, + "step": 20834 + }, + { + "epoch": 6.395027624309392, + "grad_norm": 0.19514231383800507, + "learning_rate": 3.0385841699971997e-05, + "loss": 1.7226, + "step": 20835 + }, + { + "epoch": 6.395334561080418, + "grad_norm": 0.27765151858329773, + "learning_rate": 3.0381269654713702e-05, + "loss": 1.7599, + "step": 20836 + }, + { + "epoch": 6.395641497851442, + "grad_norm": 0.2056504338979721, + "learning_rate": 3.0376697803339215e-05, + "loss": 1.7237, + "step": 20837 + }, + { + "epoch": 6.3959484346224675, + "grad_norm": 0.22516649961471558, + "learning_rate": 3.0372126145893688e-05, + "loss": 1.7566, + "step": 20838 + }, + { + "epoch": 6.396255371393493, + "grad_norm": 0.17632099986076355, + "learning_rate": 3.0367554682422327e-05, + "loss": 1.7014, + "step": 20839 + }, + { + "epoch": 6.396562308164518, + "grad_norm": 0.21872831881046295, + "learning_rate": 3.036298341297028e-05, + "loss": 1.6935, + "step": 20840 + }, + { + "epoch": 6.3968692449355435, + "grad_norm": 0.22132672369480133, + "learning_rate": 3.0358412337582752e-05, + "loss": 1.6735, + "step": 20841 + }, + { + "epoch": 6.397176181706568, + "grad_norm": 0.17865684628486633, + "learning_rate": 3.0353841456304895e-05, + "loss": 1.7097, + "step": 20842 + }, + { + "epoch": 6.397483118477593, + "grad_norm": 0.2069701999425888, + "learning_rate": 3.0349270769181914e-05, + "loss": 1.7592, + "step": 20843 + }, + { + "epoch": 6.397790055248619, + "grad_norm": 0.19800925254821777, + "learning_rate": 
3.034470027625893e-05, + "loss": 1.6943, + "step": 20844 + }, + { + "epoch": 6.398096992019644, + "grad_norm": 0.24116787314414978, + "learning_rate": 3.0340129977581165e-05, + "loss": 1.7126, + "step": 20845 + }, + { + "epoch": 6.398403928790669, + "grad_norm": 0.1995212435722351, + "learning_rate": 3.033555987319375e-05, + "loss": 1.75, + "step": 20846 + }, + { + "epoch": 6.398710865561695, + "grad_norm": 0.23717111349105835, + "learning_rate": 3.0330989963141843e-05, + "loss": 1.7338, + "step": 20847 + }, + { + "epoch": 6.399017802332719, + "grad_norm": 0.18372474610805511, + "learning_rate": 3.0326420247470643e-05, + "loss": 1.7034, + "step": 20848 + }, + { + "epoch": 6.399324739103744, + "grad_norm": 0.25953924655914307, + "learning_rate": 3.0321850726225265e-05, + "loss": 1.731, + "step": 20849 + }, + { + "epoch": 6.39963167587477, + "grad_norm": 0.24846702814102173, + "learning_rate": 3.031728139945092e-05, + "loss": 1.7559, + "step": 20850 + }, + { + "epoch": 6.399938612645795, + "grad_norm": 0.20783887803554535, + "learning_rate": 3.0312712267192713e-05, + "loss": 1.7229, + "step": 20851 + }, + { + "epoch": 6.4002455494168204, + "grad_norm": 0.1904737949371338, + "learning_rate": 3.030814332949583e-05, + "loss": 1.6986, + "step": 20852 + }, + { + "epoch": 6.400552486187845, + "grad_norm": 0.2275397777557373, + "learning_rate": 3.030357458640541e-05, + "loss": 1.708, + "step": 20853 + }, + { + "epoch": 6.40085942295887, + "grad_norm": 0.20119737088680267, + "learning_rate": 3.0299006037966628e-05, + "loss": 1.7727, + "step": 20854 + }, + { + "epoch": 6.401166359729896, + "grad_norm": 0.17214249074459076, + "learning_rate": 3.0294437684224596e-05, + "loss": 1.6674, + "step": 20855 + }, + { + "epoch": 6.401473296500921, + "grad_norm": 0.21268978714942932, + "learning_rate": 3.02898695252245e-05, + "loss": 1.7182, + "step": 20856 + }, + { + "epoch": 6.401780233271946, + "grad_norm": 0.19911682605743408, + "learning_rate": 3.0285301561011448e-05, + "loss": 
1.6861, + "step": 20857 + }, + { + "epoch": 6.402087170042972, + "grad_norm": 0.194064199924469, + "learning_rate": 3.0280733791630613e-05, + "loss": 1.6768, + "step": 20858 + }, + { + "epoch": 6.402394106813996, + "grad_norm": 0.17554323375225067, + "learning_rate": 3.027616621712711e-05, + "loss": 1.6987, + "step": 20859 + }, + { + "epoch": 6.402701043585021, + "grad_norm": 0.205257385969162, + "learning_rate": 3.027159883754611e-05, + "loss": 1.7951, + "step": 20860 + }, + { + "epoch": 6.403007980356047, + "grad_norm": 0.1766849011182785, + "learning_rate": 3.0267031652932743e-05, + "loss": 1.7157, + "step": 20861 + }, + { + "epoch": 6.403314917127072, + "grad_norm": 0.17106789350509644, + "learning_rate": 3.0262464663332106e-05, + "loss": 1.685, + "step": 20862 + }, + { + "epoch": 6.403621853898097, + "grad_norm": 0.17380768060684204, + "learning_rate": 3.0257897868789377e-05, + "loss": 1.708, + "step": 20863 + }, + { + "epoch": 6.403928790669122, + "grad_norm": 0.15817396342754364, + "learning_rate": 3.0253331269349662e-05, + "loss": 1.6629, + "step": 20864 + }, + { + "epoch": 6.404235727440147, + "grad_norm": 0.18253934383392334, + "learning_rate": 3.0248764865058122e-05, + "loss": 1.6877, + "step": 20865 + }, + { + "epoch": 6.4045426642111725, + "grad_norm": 0.20645618438720703, + "learning_rate": 3.0244198655959843e-05, + "loss": 1.7238, + "step": 20866 + }, + { + "epoch": 6.404849600982198, + "grad_norm": 0.2216680645942688, + "learning_rate": 3.0239632642099992e-05, + "loss": 1.7721, + "step": 20867 + }, + { + "epoch": 6.405156537753223, + "grad_norm": 0.21479755640029907, + "learning_rate": 3.023506682352365e-05, + "loss": 1.6686, + "step": 20868 + }, + { + "epoch": 6.4054634745242485, + "grad_norm": 0.21274925768375397, + "learning_rate": 3.0230501200275974e-05, + "loss": 1.7245, + "step": 20869 + }, + { + "epoch": 6.405770411295273, + "grad_norm": 0.19894039630889893, + "learning_rate": 3.0225935772402064e-05, + "loss": 1.6734, + "step": 20870 + }, + { 
+ "epoch": 6.406077348066298, + "grad_norm": 0.24450170993804932, + "learning_rate": 3.022137053994707e-05, + "loss": 1.7103, + "step": 20871 + }, + { + "epoch": 6.406384284837324, + "grad_norm": 0.18289846181869507, + "learning_rate": 3.0216805502956057e-05, + "loss": 1.7866, + "step": 20872 + }, + { + "epoch": 6.406691221608349, + "grad_norm": 0.2884466350078583, + "learning_rate": 3.021224066147419e-05, + "loss": 1.7817, + "step": 20873 + }, + { + "epoch": 6.406998158379374, + "grad_norm": 0.21871373057365417, + "learning_rate": 3.0207676015546537e-05, + "loss": 1.6871, + "step": 20874 + }, + { + "epoch": 6.407305095150399, + "grad_norm": 0.239889994263649, + "learning_rate": 3.0203111565218244e-05, + "loss": 1.6412, + "step": 20875 + }, + { + "epoch": 6.407612031921424, + "grad_norm": 0.26960206031799316, + "learning_rate": 3.019854731053441e-05, + "loss": 1.7537, + "step": 20876 + }, + { + "epoch": 6.407918968692449, + "grad_norm": 0.32872483134269714, + "learning_rate": 3.019398325154013e-05, + "loss": 1.7718, + "step": 20877 + }, + { + "epoch": 6.408225905463475, + "grad_norm": 0.27766308188438416, + "learning_rate": 3.018941938828053e-05, + "loss": 1.7537, + "step": 20878 + }, + { + "epoch": 6.4085328422345, + "grad_norm": 0.1989286094903946, + "learning_rate": 3.0184855720800674e-05, + "loss": 1.7373, + "step": 20879 + }, + { + "epoch": 6.4088397790055245, + "grad_norm": 0.19748768210411072, + "learning_rate": 3.0180292249145703e-05, + "loss": 1.6821, + "step": 20880 + }, + { + "epoch": 6.40914671577655, + "grad_norm": 0.20632879436016083, + "learning_rate": 3.0175728973360694e-05, + "loss": 1.7641, + "step": 20881 + }, + { + "epoch": 6.409453652547575, + "grad_norm": 0.23808124661445618, + "learning_rate": 3.017116589349076e-05, + "loss": 1.7434, + "step": 20882 + }, + { + "epoch": 6.4097605893186005, + "grad_norm": 0.265514612197876, + "learning_rate": 3.0166603009580974e-05, + "loss": 1.7877, + "step": 20883 + }, + { + "epoch": 6.410067526089626, + 
"grad_norm": 0.21031250059604645, + "learning_rate": 3.0162040321676465e-05, + "loss": 1.738, + "step": 20884 + }, + { + "epoch": 6.41037446286065, + "grad_norm": 0.3011578619480133, + "learning_rate": 3.015747782982228e-05, + "loss": 1.7063, + "step": 20885 + }, + { + "epoch": 6.410681399631676, + "grad_norm": 0.28601503372192383, + "learning_rate": 3.015291553406353e-05, + "loss": 1.7021, + "step": 20886 + }, + { + "epoch": 6.410988336402701, + "grad_norm": 0.2433992624282837, + "learning_rate": 3.014835343444531e-05, + "loss": 1.6887, + "step": 20887 + }, + { + "epoch": 6.411295273173726, + "grad_norm": 0.3342660963535309, + "learning_rate": 3.014379153101269e-05, + "loss": 1.7798, + "step": 20888 + }, + { + "epoch": 6.411602209944752, + "grad_norm": 0.2390800267457962, + "learning_rate": 3.0139229823810757e-05, + "loss": 1.774, + "step": 20889 + }, + { + "epoch": 6.411909146715777, + "grad_norm": 0.2659217417240143, + "learning_rate": 3.0134668312884613e-05, + "loss": 1.7396, + "step": 20890 + }, + { + "epoch": 6.412216083486801, + "grad_norm": 0.22885620594024658, + "learning_rate": 3.0130106998279294e-05, + "loss": 1.7303, + "step": 20891 + }, + { + "epoch": 6.412523020257827, + "grad_norm": 0.20651856064796448, + "learning_rate": 3.0125545880039925e-05, + "loss": 1.7796, + "step": 20892 + }, + { + "epoch": 6.412829957028852, + "grad_norm": 0.26611828804016113, + "learning_rate": 3.0120984958211552e-05, + "loss": 1.7019, + "step": 20893 + }, + { + "epoch": 6.413136893799877, + "grad_norm": 0.2526776194572449, + "learning_rate": 3.0116424232839258e-05, + "loss": 1.7062, + "step": 20894 + }, + { + "epoch": 6.413443830570903, + "grad_norm": 0.2087634801864624, + "learning_rate": 3.0111863703968128e-05, + "loss": 1.7011, + "step": 20895 + }, + { + "epoch": 6.413750767341927, + "grad_norm": 0.20656780898571014, + "learning_rate": 3.0107303371643197e-05, + "loss": 1.7637, + "step": 20896 + }, + { + "epoch": 6.4140577041129525, + "grad_norm": 0.2083009034395218, + 
"learning_rate": 3.010274323590956e-05, + "loss": 1.7213, + "step": 20897 + }, + { + "epoch": 6.414364640883978, + "grad_norm": 0.22496090829372406, + "learning_rate": 3.0098183296812277e-05, + "loss": 1.7793, + "step": 20898 + }, + { + "epoch": 6.414671577655003, + "grad_norm": 0.2601132392883301, + "learning_rate": 3.0093623554396416e-05, + "loss": 1.8358, + "step": 20899 + }, + { + "epoch": 6.4149785144260285, + "grad_norm": 0.2364497184753418, + "learning_rate": 3.0089064008707026e-05, + "loss": 1.7299, + "step": 20900 + }, + { + "epoch": 6.415285451197054, + "grad_norm": 0.2011861503124237, + "learning_rate": 3.0084504659789186e-05, + "loss": 1.7521, + "step": 20901 + }, + { + "epoch": 6.415592387968078, + "grad_norm": 0.20605513453483582, + "learning_rate": 3.007994550768793e-05, + "loss": 1.7099, + "step": 20902 + }, + { + "epoch": 6.415899324739104, + "grad_norm": 0.20890796184539795, + "learning_rate": 3.0075386552448337e-05, + "loss": 1.7383, + "step": 20903 + }, + { + "epoch": 6.416206261510129, + "grad_norm": 0.20005083084106445, + "learning_rate": 3.0070827794115452e-05, + "loss": 1.6999, + "step": 20904 + }, + { + "epoch": 6.416513198281154, + "grad_norm": 0.20547670125961304, + "learning_rate": 3.006626923273433e-05, + "loss": 1.7424, + "step": 20905 + }, + { + "epoch": 6.41682013505218, + "grad_norm": 0.20799006521701813, + "learning_rate": 3.0061710868350003e-05, + "loss": 1.7266, + "step": 20906 + }, + { + "epoch": 6.417127071823204, + "grad_norm": 0.22234687209129333, + "learning_rate": 3.0057152701007563e-05, + "loss": 1.7755, + "step": 20907 + }, + { + "epoch": 6.417434008594229, + "grad_norm": 0.21947267651557922, + "learning_rate": 3.0052594730752005e-05, + "loss": 1.826, + "step": 20908 + }, + { + "epoch": 6.417740945365255, + "grad_norm": 0.2183268964290619, + "learning_rate": 3.0048036957628416e-05, + "loss": 1.7772, + "step": 20909 + }, + { + "epoch": 6.41804788213628, + "grad_norm": 0.1967134177684784, + "learning_rate": 
3.0043479381681805e-05, + "loss": 1.6833, + "step": 20910 + }, + { + "epoch": 6.418354818907305, + "grad_norm": 0.2016787827014923, + "learning_rate": 3.003892200295723e-05, + "loss": 1.773, + "step": 20911 + }, + { + "epoch": 6.41866175567833, + "grad_norm": 0.2192344218492508, + "learning_rate": 3.0034364821499745e-05, + "loss": 1.7124, + "step": 20912 + }, + { + "epoch": 6.418968692449355, + "grad_norm": 0.24924327433109283, + "learning_rate": 3.002980783735434e-05, + "loss": 1.6882, + "step": 20913 + }, + { + "epoch": 6.4192756292203805, + "grad_norm": 0.2221844494342804, + "learning_rate": 3.0025251050566106e-05, + "loss": 1.8028, + "step": 20914 + }, + { + "epoch": 6.419582565991406, + "grad_norm": 0.27141162753105164, + "learning_rate": 3.0020694461180033e-05, + "loss": 1.698, + "step": 20915 + }, + { + "epoch": 6.419889502762431, + "grad_norm": 0.18856655061244965, + "learning_rate": 3.001613806924117e-05, + "loss": 1.7112, + "step": 20916 + }, + { + "epoch": 6.420196439533456, + "grad_norm": 0.2226688265800476, + "learning_rate": 3.0011581874794537e-05, + "loss": 1.6967, + "step": 20917 + }, + { + "epoch": 6.420503376304481, + "grad_norm": 0.2070344239473343, + "learning_rate": 3.000702587788518e-05, + "loss": 1.742, + "step": 20918 + }, + { + "epoch": 6.420810313075506, + "grad_norm": 0.22616387903690338, + "learning_rate": 3.00024700785581e-05, + "loss": 1.6865, + "step": 20919 + }, + { + "epoch": 6.421117249846532, + "grad_norm": 0.19745604693889618, + "learning_rate": 2.9997914476858348e-05, + "loss": 1.7328, + "step": 20920 + }, + { + "epoch": 6.421424186617557, + "grad_norm": 0.20654593408107758, + "learning_rate": 2.9993359072830906e-05, + "loss": 1.7811, + "step": 20921 + }, + { + "epoch": 6.421731123388582, + "grad_norm": 0.19188611209392548, + "learning_rate": 2.9988803866520832e-05, + "loss": 1.6808, + "step": 20922 + }, + { + "epoch": 6.422038060159607, + "grad_norm": 0.19907493889331818, + "learning_rate": 2.9984248857973118e-05, + "loss": 
1.7326, + "step": 20923 + }, + { + "epoch": 6.422344996930632, + "grad_norm": 0.17484794557094574, + "learning_rate": 2.9979694047232804e-05, + "loss": 1.7166, + "step": 20924 + }, + { + "epoch": 6.422651933701657, + "grad_norm": 0.21412795782089233, + "learning_rate": 2.997513943434487e-05, + "loss": 1.7926, + "step": 20925 + }, + { + "epoch": 6.422958870472683, + "grad_norm": 0.17554008960723877, + "learning_rate": 2.9970585019354357e-05, + "loss": 1.6931, + "step": 20926 + }, + { + "epoch": 6.423265807243708, + "grad_norm": 0.16687868535518646, + "learning_rate": 2.9966030802306256e-05, + "loss": 1.6911, + "step": 20927 + }, + { + "epoch": 6.4235727440147325, + "grad_norm": 0.1802106350660324, + "learning_rate": 2.9961476783245578e-05, + "loss": 1.6921, + "step": 20928 + }, + { + "epoch": 6.423879680785758, + "grad_norm": 0.1968134343624115, + "learning_rate": 2.9956922962217347e-05, + "loss": 1.7035, + "step": 20929 + }, + { + "epoch": 6.424186617556783, + "grad_norm": 0.17703908681869507, + "learning_rate": 2.9952369339266538e-05, + "loss": 1.7122, + "step": 20930 + }, + { + "epoch": 6.4244935543278086, + "grad_norm": 0.22176744043827057, + "learning_rate": 2.9947815914438175e-05, + "loss": 1.7189, + "step": 20931 + }, + { + "epoch": 6.424800491098834, + "grad_norm": 0.19128306210041046, + "learning_rate": 2.9943262687777236e-05, + "loss": 1.7208, + "step": 20932 + }, + { + "epoch": 6.425107427869859, + "grad_norm": 0.2285725623369217, + "learning_rate": 2.9938709659328735e-05, + "loss": 1.7859, + "step": 20933 + }, + { + "epoch": 6.425414364640884, + "grad_norm": 0.1998651921749115, + "learning_rate": 2.9934156829137653e-05, + "loss": 1.6912, + "step": 20934 + }, + { + "epoch": 6.425721301411909, + "grad_norm": 0.1879023313522339, + "learning_rate": 2.9929604197249016e-05, + "loss": 1.7164, + "step": 20935 + }, + { + "epoch": 6.426028238182934, + "grad_norm": 0.2675700783729553, + "learning_rate": 2.992505176370778e-05, + "loss": 1.7475, + "step": 20936 + }, 
+ { + "epoch": 6.42633517495396, + "grad_norm": 0.22345949709415436, + "learning_rate": 2.992049952855896e-05, + "loss": 1.6867, + "step": 20937 + }, + { + "epoch": 6.426642111724985, + "grad_norm": 0.17801997065544128, + "learning_rate": 2.9915947491847517e-05, + "loss": 1.736, + "step": 20938 + }, + { + "epoch": 6.4269490484960095, + "grad_norm": 0.22132502496242523, + "learning_rate": 2.991139565361846e-05, + "loss": 1.7244, + "step": 20939 + }, + { + "epoch": 6.427255985267035, + "grad_norm": 0.1899508535861969, + "learning_rate": 2.9906844013916758e-05, + "loss": 1.6781, + "step": 20940 + }, + { + "epoch": 6.42756292203806, + "grad_norm": 0.21948131918907166, + "learning_rate": 2.9902292572787414e-05, + "loss": 1.6911, + "step": 20941 + }, + { + "epoch": 6.4278698588090855, + "grad_norm": 0.16277503967285156, + "learning_rate": 2.9897741330275387e-05, + "loss": 1.702, + "step": 20942 + }, + { + "epoch": 6.428176795580111, + "grad_norm": 0.22303056716918945, + "learning_rate": 2.989319028642567e-05, + "loss": 1.7573, + "step": 20943 + }, + { + "epoch": 6.428483732351136, + "grad_norm": 0.21077899634838104, + "learning_rate": 2.9888639441283217e-05, + "loss": 1.7903, + "step": 20944 + }, + { + "epoch": 6.428790669122161, + "grad_norm": 0.23918256163597107, + "learning_rate": 2.988408879489303e-05, + "loss": 1.7112, + "step": 20945 + }, + { + "epoch": 6.429097605893186, + "grad_norm": 0.22226610779762268, + "learning_rate": 2.9879538347300074e-05, + "loss": 1.7039, + "step": 20946 + }, + { + "epoch": 6.429404542664211, + "grad_norm": 0.18605270981788635, + "learning_rate": 2.987498809854929e-05, + "loss": 1.7102, + "step": 20947 + }, + { + "epoch": 6.429711479435237, + "grad_norm": 0.24812746047973633, + "learning_rate": 2.987043804868569e-05, + "loss": 1.7112, + "step": 20948 + }, + { + "epoch": 6.430018416206262, + "grad_norm": 0.1869048923254013, + "learning_rate": 2.9865888197754206e-05, + "loss": 1.6946, + "step": 20949 + }, + { + "epoch": 6.430325352977286, 
+ "grad_norm": 0.30707576870918274, + "learning_rate": 2.986133854579982e-05, + "loss": 1.7596, + "step": 20950 + }, + { + "epoch": 6.430632289748312, + "grad_norm": 0.20475640892982483, + "learning_rate": 2.985678909286748e-05, + "loss": 1.7162, + "step": 20951 + }, + { + "epoch": 6.430939226519337, + "grad_norm": 0.24273128807544708, + "learning_rate": 2.9852239839002182e-05, + "loss": 1.6803, + "step": 20952 + }, + { + "epoch": 6.431246163290362, + "grad_norm": 0.27484890818595886, + "learning_rate": 2.9847690784248834e-05, + "loss": 1.7948, + "step": 20953 + }, + { + "epoch": 6.431553100061388, + "grad_norm": 0.2204331010580063, + "learning_rate": 2.984314192865244e-05, + "loss": 1.769, + "step": 20954 + }, + { + "epoch": 6.431860036832412, + "grad_norm": 0.262463241815567, + "learning_rate": 2.9838593272257907e-05, + "loss": 1.7483, + "step": 20955 + }, + { + "epoch": 6.4321669736034375, + "grad_norm": 0.225942924618721, + "learning_rate": 2.983404481511023e-05, + "loss": 1.7228, + "step": 20956 + }, + { + "epoch": 6.432473910374463, + "grad_norm": 0.22381044924259186, + "learning_rate": 2.982949655725432e-05, + "loss": 1.7579, + "step": 20957 + }, + { + "epoch": 6.432780847145488, + "grad_norm": 0.1937711238861084, + "learning_rate": 2.982494849873518e-05, + "loss": 1.6833, + "step": 20958 + }, + { + "epoch": 6.4330877839165135, + "grad_norm": 0.2609664499759674, + "learning_rate": 2.9820400639597702e-05, + "loss": 1.7524, + "step": 20959 + }, + { + "epoch": 6.433394720687538, + "grad_norm": 0.2891463041305542, + "learning_rate": 2.981585297988686e-05, + "loss": 1.7672, + "step": 20960 + }, + { + "epoch": 6.433701657458563, + "grad_norm": 0.19604064524173737, + "learning_rate": 2.9811305519647582e-05, + "loss": 1.6684, + "step": 20961 + }, + { + "epoch": 6.434008594229589, + "grad_norm": 0.23522239923477173, + "learning_rate": 2.9806758258924822e-05, + "loss": 1.7461, + "step": 20962 + }, + { + "epoch": 6.434315531000614, + "grad_norm": 0.24907514452934265, + 
"learning_rate": 2.9802211197763525e-05, + "loss": 1.7702, + "step": 20963 + }, + { + "epoch": 6.434622467771639, + "grad_norm": 0.21963126957416534, + "learning_rate": 2.9797664336208592e-05, + "loss": 1.7263, + "step": 20964 + }, + { + "epoch": 6.434929404542665, + "grad_norm": 0.23124000430107117, + "learning_rate": 2.9793117674305004e-05, + "loss": 1.7362, + "step": 20965 + }, + { + "epoch": 6.435236341313689, + "grad_norm": 0.1917882263660431, + "learning_rate": 2.978857121209765e-05, + "loss": 1.7505, + "step": 20966 + }, + { + "epoch": 6.435543278084714, + "grad_norm": 0.24407804012298584, + "learning_rate": 2.9784024949631484e-05, + "loss": 1.7898, + "step": 20967 + }, + { + "epoch": 6.43585021485574, + "grad_norm": 0.210384339094162, + "learning_rate": 2.977947888695143e-05, + "loss": 1.7515, + "step": 20968 + }, + { + "epoch": 6.436157151626765, + "grad_norm": 0.20764803886413574, + "learning_rate": 2.9774933024102436e-05, + "loss": 1.7628, + "step": 20969 + }, + { + "epoch": 6.43646408839779, + "grad_norm": 0.21542097628116608, + "learning_rate": 2.9770387361129387e-05, + "loss": 1.7882, + "step": 20970 + }, + { + "epoch": 6.436771025168815, + "grad_norm": 0.1768570989370346, + "learning_rate": 2.976584189807725e-05, + "loss": 1.7471, + "step": 20971 + }, + { + "epoch": 6.43707796193984, + "grad_norm": 0.2398732751607895, + "learning_rate": 2.97612966349909e-05, + "loss": 1.6676, + "step": 20972 + }, + { + "epoch": 6.4373848987108655, + "grad_norm": 0.18291664123535156, + "learning_rate": 2.9756751571915286e-05, + "loss": 1.6791, + "step": 20973 + }, + { + "epoch": 6.437691835481891, + "grad_norm": 0.2769327759742737, + "learning_rate": 2.9752206708895314e-05, + "loss": 1.7675, + "step": 20974 + }, + { + "epoch": 6.437998772252916, + "grad_norm": 0.24859526753425598, + "learning_rate": 2.974766204597592e-05, + "loss": 1.7661, + "step": 20975 + }, + { + "epoch": 6.4383057090239415, + "grad_norm": 0.20495273172855377, + "learning_rate": 
2.9743117583201984e-05, + "loss": 1.6774, + "step": 20976 + }, + { + "epoch": 6.438612645794966, + "grad_norm": 0.24650859832763672, + "learning_rate": 2.9738573320618447e-05, + "loss": 1.759, + "step": 20977 + }, + { + "epoch": 6.438919582565991, + "grad_norm": 0.21430176496505737, + "learning_rate": 2.973402925827019e-05, + "loss": 1.7273, + "step": 20978 + }, + { + "epoch": 6.439226519337017, + "grad_norm": 0.22392596304416656, + "learning_rate": 2.972948539620214e-05, + "loss": 1.7506, + "step": 20979 + }, + { + "epoch": 6.439533456108042, + "grad_norm": 0.24393923580646515, + "learning_rate": 2.9724941734459205e-05, + "loss": 1.7815, + "step": 20980 + }, + { + "epoch": 6.439840392879067, + "grad_norm": 0.2873772084712982, + "learning_rate": 2.9720398273086264e-05, + "loss": 1.7863, + "step": 20981 + }, + { + "epoch": 6.440147329650092, + "grad_norm": 0.218470498919487, + "learning_rate": 2.9715855012128246e-05, + "loss": 1.7347, + "step": 20982 + }, + { + "epoch": 6.440454266421117, + "grad_norm": 0.24520666897296906, + "learning_rate": 2.971131195163003e-05, + "loss": 1.6892, + "step": 20983 + }, + { + "epoch": 6.440761203192142, + "grad_norm": 0.2255270928144455, + "learning_rate": 2.970676909163652e-05, + "loss": 1.7179, + "step": 20984 + }, + { + "epoch": 6.441068139963168, + "grad_norm": 0.25171026587486267, + "learning_rate": 2.9702226432192604e-05, + "loss": 1.7087, + "step": 20985 + }, + { + "epoch": 6.441375076734193, + "grad_norm": 0.27045872807502747, + "learning_rate": 2.9697683973343204e-05, + "loss": 1.732, + "step": 20986 + }, + { + "epoch": 6.4416820135052175, + "grad_norm": 0.25374144315719604, + "learning_rate": 2.9693141715133177e-05, + "loss": 1.7688, + "step": 20987 + }, + { + "epoch": 6.441988950276243, + "grad_norm": 0.22694779932498932, + "learning_rate": 2.9688599657607442e-05, + "loss": 1.7105, + "step": 20988 + }, + { + "epoch": 6.442295887047268, + "grad_norm": 0.23455791175365448, + "learning_rate": 2.9684057800810845e-05, + 
"loss": 1.8007, + "step": 20989 + }, + { + "epoch": 6.4426028238182935, + "grad_norm": 0.23054158687591553, + "learning_rate": 2.9679516144788312e-05, + "loss": 1.6787, + "step": 20990 + }, + { + "epoch": 6.442909760589319, + "grad_norm": 0.22110030055046082, + "learning_rate": 2.9674974689584696e-05, + "loss": 1.8048, + "step": 20991 + }, + { + "epoch": 6.443216697360343, + "grad_norm": 0.22141657769680023, + "learning_rate": 2.9670433435244915e-05, + "loss": 1.7691, + "step": 20992 + }, + { + "epoch": 6.443523634131369, + "grad_norm": 0.18511974811553955, + "learning_rate": 2.9665892381813807e-05, + "loss": 1.6825, + "step": 20993 + }, + { + "epoch": 6.443830570902394, + "grad_norm": 0.21904997527599335, + "learning_rate": 2.966135152933629e-05, + "loss": 1.7711, + "step": 20994 + }, + { + "epoch": 6.444137507673419, + "grad_norm": 0.19334301352500916, + "learning_rate": 2.9656810877857196e-05, + "loss": 1.687, + "step": 20995 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.1766969859600067, + "learning_rate": 2.9652270427421426e-05, + "loss": 1.7211, + "step": 20996 + }, + { + "epoch": 6.44475138121547, + "grad_norm": 0.1821468323469162, + "learning_rate": 2.9647730178073864e-05, + "loss": 1.7086, + "step": 20997 + }, + { + "epoch": 6.445058317986494, + "grad_norm": 0.20812760293483734, + "learning_rate": 2.9643190129859333e-05, + "loss": 1.6844, + "step": 20998 + }, + { + "epoch": 6.44536525475752, + "grad_norm": 0.259042352437973, + "learning_rate": 2.9638650282822754e-05, + "loss": 1.7971, + "step": 20999 + }, + { + "epoch": 6.445672191528545, + "grad_norm": 0.2134076952934265, + "learning_rate": 2.9634110637008948e-05, + "loss": 1.7061, + "step": 21000 + }, + { + "epoch": 6.44597912829957, + "grad_norm": 0.21120613813400269, + "learning_rate": 2.962957119246281e-05, + "loss": 1.6708, + "step": 21001 + }, + { + "epoch": 6.446286065070596, + "grad_norm": 0.18577797710895538, + "learning_rate": 2.9625031949229176e-05, + "loss": 1.719, + "step": 21002 + 
}, + { + "epoch": 6.44659300184162, + "grad_norm": 0.21755708754062653, + "learning_rate": 2.962049290735294e-05, + "loss": 1.7203, + "step": 21003 + }, + { + "epoch": 6.4468999386126455, + "grad_norm": 0.2161538451910019, + "learning_rate": 2.961595406687891e-05, + "loss": 1.7254, + "step": 21004 + }, + { + "epoch": 6.447206875383671, + "grad_norm": 0.19979329407215118, + "learning_rate": 2.9611415427851995e-05, + "loss": 1.7203, + "step": 21005 + }, + { + "epoch": 6.447513812154696, + "grad_norm": 0.2103399932384491, + "learning_rate": 2.9606876990317e-05, + "loss": 1.7291, + "step": 21006 + }, + { + "epoch": 6.4478207489257215, + "grad_norm": 0.19513745605945587, + "learning_rate": 2.9602338754318815e-05, + "loss": 1.7574, + "step": 21007 + }, + { + "epoch": 6.448127685696747, + "grad_norm": 0.19819851219654083, + "learning_rate": 2.9597800719902256e-05, + "loss": 1.6913, + "step": 21008 + }, + { + "epoch": 6.448434622467771, + "grad_norm": 0.1847768872976303, + "learning_rate": 2.9593262887112215e-05, + "loss": 1.6987, + "step": 21009 + }, + { + "epoch": 6.448741559238797, + "grad_norm": 0.22399301826953888, + "learning_rate": 2.9588725255993487e-05, + "loss": 1.8328, + "step": 21010 + }, + { + "epoch": 6.449048496009822, + "grad_norm": 0.20540264248847961, + "learning_rate": 2.958418782659097e-05, + "loss": 1.765, + "step": 21011 + }, + { + "epoch": 6.449355432780847, + "grad_norm": 0.183661550283432, + "learning_rate": 2.9579650598949442e-05, + "loss": 1.7128, + "step": 21012 + }, + { + "epoch": 6.449662369551873, + "grad_norm": 0.1972927302122116, + "learning_rate": 2.9575113573113788e-05, + "loss": 1.717, + "step": 21013 + }, + { + "epoch": 6.449969306322897, + "grad_norm": 0.20188379287719727, + "learning_rate": 2.9570576749128846e-05, + "loss": 1.7603, + "step": 21014 + }, + { + "epoch": 6.4502762430939224, + "grad_norm": 0.20789781212806702, + "learning_rate": 2.9566040127039418e-05, + "loss": 1.7142, + "step": 21015 + }, + { + "epoch": 
6.450583179864948, + "grad_norm": 0.19319608807563782, + "learning_rate": 2.956150370689038e-05, + "loss": 1.7524, + "step": 21016 + }, + { + "epoch": 6.450890116635973, + "grad_norm": 0.2153816968202591, + "learning_rate": 2.9556967488726516e-05, + "loss": 1.7325, + "step": 21017 + }, + { + "epoch": 6.4511970534069984, + "grad_norm": 0.19134823977947235, + "learning_rate": 2.9552431472592702e-05, + "loss": 1.7547, + "step": 21018 + }, + { + "epoch": 6.451503990178024, + "grad_norm": 0.21069955825805664, + "learning_rate": 2.9547895658533725e-05, + "loss": 1.7038, + "step": 21019 + }, + { + "epoch": 6.451810926949048, + "grad_norm": 0.20742546021938324, + "learning_rate": 2.9543360046594455e-05, + "loss": 1.7151, + "step": 21020 + }, + { + "epoch": 6.452117863720074, + "grad_norm": 0.16917672753334045, + "learning_rate": 2.9538824636819666e-05, + "loss": 1.6957, + "step": 21021 + }, + { + "epoch": 6.452424800491099, + "grad_norm": 0.21134577691555023, + "learning_rate": 2.953428942925423e-05, + "loss": 1.711, + "step": 21022 + }, + { + "epoch": 6.452731737262124, + "grad_norm": 0.19403810799121857, + "learning_rate": 2.9529754423942918e-05, + "loss": 1.734, + "step": 21023 + }, + { + "epoch": 6.45303867403315, + "grad_norm": 0.18534770607948303, + "learning_rate": 2.9525219620930582e-05, + "loss": 1.6857, + "step": 21024 + }, + { + "epoch": 6.453345610804174, + "grad_norm": 0.24268858134746552, + "learning_rate": 2.9520685020262016e-05, + "loss": 1.7316, + "step": 21025 + }, + { + "epoch": 6.453652547575199, + "grad_norm": 0.17590615153312683, + "learning_rate": 2.9516150621982063e-05, + "loss": 1.6608, + "step": 21026 + }, + { + "epoch": 6.453959484346225, + "grad_norm": 0.1949763298034668, + "learning_rate": 2.9511616426135504e-05, + "loss": 1.7955, + "step": 21027 + }, + { + "epoch": 6.45426642111725, + "grad_norm": 0.2424435019493103, + "learning_rate": 2.950708243276717e-05, + "loss": 1.7334, + "step": 21028 + }, + { + "epoch": 6.454573357888275, + 
"grad_norm": 0.22753369808197021, + "learning_rate": 2.950254864192184e-05, + "loss": 1.733, + "step": 21029 + }, + { + "epoch": 6.4548802946593, + "grad_norm": 0.1706271469593048, + "learning_rate": 2.949801505364435e-05, + "loss": 1.7424, + "step": 21030 + }, + { + "epoch": 6.455187231430325, + "grad_norm": 0.21614442765712738, + "learning_rate": 2.9493481667979506e-05, + "loss": 1.7813, + "step": 21031 + }, + { + "epoch": 6.4554941682013505, + "grad_norm": 0.1793162226676941, + "learning_rate": 2.9488948484972068e-05, + "loss": 1.7076, + "step": 21032 + }, + { + "epoch": 6.455801104972376, + "grad_norm": 0.19251759350299835, + "learning_rate": 2.9484415504666885e-05, + "loss": 1.7487, + "step": 21033 + }, + { + "epoch": 6.456108041743401, + "grad_norm": 0.1817556619644165, + "learning_rate": 2.947988272710871e-05, + "loss": 1.6958, + "step": 21034 + }, + { + "epoch": 6.456414978514426, + "grad_norm": 0.24368418753147125, + "learning_rate": 2.9475350152342378e-05, + "loss": 1.7867, + "step": 21035 + }, + { + "epoch": 6.456721915285451, + "grad_norm": 0.2362157702445984, + "learning_rate": 2.9470817780412653e-05, + "loss": 1.7241, + "step": 21036 + }, + { + "epoch": 6.457028852056476, + "grad_norm": 0.21049003303050995, + "learning_rate": 2.9466285611364358e-05, + "loss": 1.7146, + "step": 21037 + }, + { + "epoch": 6.457335788827502, + "grad_norm": 0.2516530454158783, + "learning_rate": 2.9461753645242246e-05, + "loss": 1.7349, + "step": 21038 + }, + { + "epoch": 6.457642725598527, + "grad_norm": 0.23165179789066315, + "learning_rate": 2.945722188209114e-05, + "loss": 1.7285, + "step": 21039 + }, + { + "epoch": 6.457949662369552, + "grad_norm": 0.27345010638237, + "learning_rate": 2.945269032195579e-05, + "loss": 1.7266, + "step": 21040 + }, + { + "epoch": 6.458256599140577, + "grad_norm": 0.16312900185585022, + "learning_rate": 2.9448158964881e-05, + "loss": 1.6781, + "step": 21041 + }, + { + "epoch": 6.458563535911602, + "grad_norm": 0.238658607006073, + 
"learning_rate": 2.9443627810911557e-05, + "loss": 1.6819, + "step": 21042 + }, + { + "epoch": 6.458870472682627, + "grad_norm": 0.19861388206481934, + "learning_rate": 2.943909686009223e-05, + "loss": 1.7397, + "step": 21043 + }, + { + "epoch": 6.459177409453653, + "grad_norm": 0.22675637900829315, + "learning_rate": 2.9434566112467793e-05, + "loss": 1.7231, + "step": 21044 + }, + { + "epoch": 6.459484346224678, + "grad_norm": 0.22638066112995148, + "learning_rate": 2.9430035568083043e-05, + "loss": 1.7466, + "step": 21045 + }, + { + "epoch": 6.4597912829957025, + "grad_norm": 0.2237064391374588, + "learning_rate": 2.942550522698272e-05, + "loss": 1.7373, + "step": 21046 + }, + { + "epoch": 6.460098219766728, + "grad_norm": 0.2613731324672699, + "learning_rate": 2.942097508921162e-05, + "loss": 1.7567, + "step": 21047 + }, + { + "epoch": 6.460405156537753, + "grad_norm": 0.21602070331573486, + "learning_rate": 2.941644515481452e-05, + "loss": 1.7512, + "step": 21048 + }, + { + "epoch": 6.4607120933087785, + "grad_norm": 0.30129116773605347, + "learning_rate": 2.941191542383615e-05, + "loss": 1.761, + "step": 21049 + }, + { + "epoch": 6.461019030079804, + "grad_norm": 0.2303919792175293, + "learning_rate": 2.940738589632132e-05, + "loss": 1.742, + "step": 21050 + }, + { + "epoch": 6.461325966850829, + "grad_norm": 0.2195158153772354, + "learning_rate": 2.940285657231475e-05, + "loss": 1.7169, + "step": 21051 + }, + { + "epoch": 6.461632903621854, + "grad_norm": 0.19029918313026428, + "learning_rate": 2.9398327451861242e-05, + "loss": 1.6721, + "step": 21052 + }, + { + "epoch": 6.461939840392879, + "grad_norm": 0.2006317377090454, + "learning_rate": 2.939379853500553e-05, + "loss": 1.7393, + "step": 21053 + }, + { + "epoch": 6.462246777163904, + "grad_norm": 0.222677081823349, + "learning_rate": 2.9389269821792377e-05, + "loss": 1.7858, + "step": 21054 + }, + { + "epoch": 6.46255371393493, + "grad_norm": 0.20772451162338257, + "learning_rate": 2.938474131226654e-05, 
+ "loss": 1.735, + "step": 21055 + }, + { + "epoch": 6.462860650705955, + "grad_norm": 0.21006503701210022, + "learning_rate": 2.9380213006472778e-05, + "loss": 1.7197, + "step": 21056 + }, + { + "epoch": 6.463167587476979, + "grad_norm": 0.23545250296592712, + "learning_rate": 2.9375684904455825e-05, + "loss": 1.8278, + "step": 21057 + }, + { + "epoch": 6.463474524248005, + "grad_norm": 0.24590329825878143, + "learning_rate": 2.937115700626045e-05, + "loss": 1.6411, + "step": 21058 + }, + { + "epoch": 6.46378146101903, + "grad_norm": 0.22359445691108704, + "learning_rate": 2.9366629311931393e-05, + "loss": 1.7901, + "step": 21059 + }, + { + "epoch": 6.464088397790055, + "grad_norm": 0.22807523608207703, + "learning_rate": 2.93621018215134e-05, + "loss": 1.7472, + "step": 21060 + }, + { + "epoch": 6.464395334561081, + "grad_norm": 0.24183115363121033, + "learning_rate": 2.93575745350512e-05, + "loss": 1.7553, + "step": 21061 + }, + { + "epoch": 6.464702271332105, + "grad_norm": 0.23809055984020233, + "learning_rate": 2.935304745258958e-05, + "loss": 1.7451, + "step": 21062 + }, + { + "epoch": 6.4650092081031305, + "grad_norm": 0.28455644845962524, + "learning_rate": 2.934852057417321e-05, + "loss": 1.8112, + "step": 21063 + }, + { + "epoch": 6.465316144874156, + "grad_norm": 0.22193321585655212, + "learning_rate": 2.9343993899846888e-05, + "loss": 1.747, + "step": 21064 + }, + { + "epoch": 6.465623081645181, + "grad_norm": 0.30524322390556335, + "learning_rate": 2.933946742965532e-05, + "loss": 1.7117, + "step": 21065 + }, + { + "epoch": 6.4659300184162065, + "grad_norm": 0.19748717546463013, + "learning_rate": 2.9334941163643233e-05, + "loss": 1.6899, + "step": 21066 + }, + { + "epoch": 6.466236955187231, + "grad_norm": 0.25551193952560425, + "learning_rate": 2.933041510185539e-05, + "loss": 1.7264, + "step": 21067 + }, + { + "epoch": 6.466543891958256, + "grad_norm": 0.20016206800937653, + "learning_rate": 2.932588924433648e-05, + "loss": 1.6613, + "step": 21068 
+ }, + { + "epoch": 6.466850828729282, + "grad_norm": 0.31049394607543945, + "learning_rate": 2.932136359113127e-05, + "loss": 1.6575, + "step": 21069 + }, + { + "epoch": 6.467157765500307, + "grad_norm": 0.29408347606658936, + "learning_rate": 2.9316838142284436e-05, + "loss": 1.72, + "step": 21070 + }, + { + "epoch": 6.467464702271332, + "grad_norm": 0.18981193006038666, + "learning_rate": 2.9312312897840748e-05, + "loss": 1.6799, + "step": 21071 + }, + { + "epoch": 6.467771639042358, + "grad_norm": 0.26828575134277344, + "learning_rate": 2.9307787857844905e-05, + "loss": 1.6983, + "step": 21072 + }, + { + "epoch": 6.468078575813382, + "grad_norm": 0.2605530321598053, + "learning_rate": 2.9303263022341642e-05, + "loss": 1.7973, + "step": 21073 + }, + { + "epoch": 6.468385512584407, + "grad_norm": 0.389957070350647, + "learning_rate": 2.9298738391375648e-05, + "loss": 1.7288, + "step": 21074 + }, + { + "epoch": 6.468692449355433, + "grad_norm": 0.20525416731834412, + "learning_rate": 2.9294213964991667e-05, + "loss": 1.7526, + "step": 21075 + }, + { + "epoch": 6.468999386126458, + "grad_norm": 0.3628186285495758, + "learning_rate": 2.9289689743234387e-05, + "loss": 1.7055, + "step": 21076 + }, + { + "epoch": 6.469306322897483, + "grad_norm": 0.21661829948425293, + "learning_rate": 2.9285165726148545e-05, + "loss": 1.7806, + "step": 21077 + }, + { + "epoch": 6.469613259668508, + "grad_norm": 0.3815501034259796, + "learning_rate": 2.9280641913778816e-05, + "loss": 1.7257, + "step": 21078 + }, + { + "epoch": 6.469920196439533, + "grad_norm": 0.19470983743667603, + "learning_rate": 2.9276118306169957e-05, + "loss": 1.7055, + "step": 21079 + }, + { + "epoch": 6.4702271332105585, + "grad_norm": 0.36236056685447693, + "learning_rate": 2.927159490336662e-05, + "loss": 1.6748, + "step": 21080 + }, + { + "epoch": 6.470534069981584, + "grad_norm": 0.201282799243927, + "learning_rate": 2.9267071705413552e-05, + "loss": 1.6987, + "step": 21081 + }, + { + "epoch": 
6.470841006752609, + "grad_norm": 0.3806697130203247, + "learning_rate": 2.9262548712355425e-05, + "loss": 1.7386, + "step": 21082 + }, + { + "epoch": 6.4711479435236345, + "grad_norm": 0.3023025691509247, + "learning_rate": 2.9258025924236933e-05, + "loss": 1.7183, + "step": 21083 + }, + { + "epoch": 6.471454880294659, + "grad_norm": 0.2648932635784149, + "learning_rate": 2.9253503341102806e-05, + "loss": 1.6755, + "step": 21084 + }, + { + "epoch": 6.471761817065684, + "grad_norm": 0.2647169828414917, + "learning_rate": 2.9248980962997707e-05, + "loss": 1.7326, + "step": 21085 + }, + { + "epoch": 6.47206875383671, + "grad_norm": 0.23535950481891632, + "learning_rate": 2.9244458789966355e-05, + "loss": 1.7541, + "step": 21086 + }, + { + "epoch": 6.472375690607735, + "grad_norm": 0.2551584541797638, + "learning_rate": 2.9239936822053403e-05, + "loss": 1.6907, + "step": 21087 + }, + { + "epoch": 6.47268262737876, + "grad_norm": 0.23313823342323303, + "learning_rate": 2.923541505930357e-05, + "loss": 1.705, + "step": 21088 + }, + { + "epoch": 6.472989564149785, + "grad_norm": 0.2368597686290741, + "learning_rate": 2.9230893501761534e-05, + "loss": 1.6666, + "step": 21089 + }, + { + "epoch": 6.47329650092081, + "grad_norm": 0.17861969769001007, + "learning_rate": 2.9226372149472003e-05, + "loss": 1.6927, + "step": 21090 + }, + { + "epoch": 6.473603437691835, + "grad_norm": 0.2212727665901184, + "learning_rate": 2.9221851002479616e-05, + "loss": 1.6972, + "step": 21091 + }, + { + "epoch": 6.473910374462861, + "grad_norm": 0.19382402300834656, + "learning_rate": 2.9217330060829096e-05, + "loss": 1.7602, + "step": 21092 + }, + { + "epoch": 6.474217311233886, + "grad_norm": 0.2762092053890228, + "learning_rate": 2.9212809324565076e-05, + "loss": 1.7642, + "step": 21093 + }, + { + "epoch": 6.474524248004911, + "grad_norm": 0.22068747878074646, + "learning_rate": 2.9208288793732274e-05, + "loss": 1.7477, + "step": 21094 + }, + { + "epoch": 6.474831184775936, + "grad_norm": 
0.19979839026927948, + "learning_rate": 2.9203768468375337e-05, + "loss": 1.7266, + "step": 21095 + }, + { + "epoch": 6.475138121546961, + "grad_norm": 0.23038682341575623, + "learning_rate": 2.9199248348538965e-05, + "loss": 1.7428, + "step": 21096 + }, + { + "epoch": 6.475445058317987, + "grad_norm": 0.16841283440589905, + "learning_rate": 2.91947284342678e-05, + "loss": 1.6788, + "step": 21097 + }, + { + "epoch": 6.475751995089012, + "grad_norm": 0.22812627255916595, + "learning_rate": 2.9190208725606528e-05, + "loss": 1.7513, + "step": 21098 + }, + { + "epoch": 6.476058931860037, + "grad_norm": 0.18409393727779388, + "learning_rate": 2.9185689222599832e-05, + "loss": 1.6834, + "step": 21099 + }, + { + "epoch": 6.476365868631062, + "grad_norm": 0.26226910948753357, + "learning_rate": 2.9181169925292313e-05, + "loss": 1.7375, + "step": 21100 + }, + { + "epoch": 6.476672805402087, + "grad_norm": 0.1915685385465622, + "learning_rate": 2.9176650833728697e-05, + "loss": 1.7521, + "step": 21101 + }, + { + "epoch": 6.476979742173112, + "grad_norm": 0.22342176735401154, + "learning_rate": 2.917213194795362e-05, + "loss": 1.8018, + "step": 21102 + }, + { + "epoch": 6.477286678944138, + "grad_norm": 0.18338742852210999, + "learning_rate": 2.9167613268011745e-05, + "loss": 1.6817, + "step": 21103 + }, + { + "epoch": 6.477593615715163, + "grad_norm": 0.23008635640144348, + "learning_rate": 2.9163094793947728e-05, + "loss": 1.7037, + "step": 21104 + }, + { + "epoch": 6.4779005524861875, + "grad_norm": 0.20954197645187378, + "learning_rate": 2.9158576525806215e-05, + "loss": 1.7565, + "step": 21105 + }, + { + "epoch": 6.478207489257213, + "grad_norm": 0.21065562963485718, + "learning_rate": 2.9154058463631874e-05, + "loss": 1.6899, + "step": 21106 + }, + { + "epoch": 6.478514426028238, + "grad_norm": 0.20217828452587128, + "learning_rate": 2.9149540607469335e-05, + "loss": 1.7055, + "step": 21107 + }, + { + "epoch": 6.4788213627992635, + "grad_norm": 0.19058823585510254, + 
"learning_rate": 2.9145022957363244e-05, + "loss": 1.6794, + "step": 21108 + }, + { + "epoch": 6.479128299570289, + "grad_norm": 0.2308664619922638, + "learning_rate": 2.9140505513358297e-05, + "loss": 1.7322, + "step": 21109 + }, + { + "epoch": 6.479435236341313, + "grad_norm": 0.18911845982074738, + "learning_rate": 2.9135988275499056e-05, + "loss": 1.7255, + "step": 21110 + }, + { + "epoch": 6.479742173112339, + "grad_norm": 0.21459296345710754, + "learning_rate": 2.9131471243830256e-05, + "loss": 1.6599, + "step": 21111 + }, + { + "epoch": 6.480049109883364, + "grad_norm": 0.20521530508995056, + "learning_rate": 2.912695441839644e-05, + "loss": 1.7564, + "step": 21112 + }, + { + "epoch": 6.480356046654389, + "grad_norm": 0.21924994885921478, + "learning_rate": 2.912243779924232e-05, + "loss": 1.6922, + "step": 21113 + }, + { + "epoch": 6.480662983425415, + "grad_norm": 0.18219491839408875, + "learning_rate": 2.911792138641253e-05, + "loss": 1.6907, + "step": 21114 + }, + { + "epoch": 6.48096992019644, + "grad_norm": 0.23122453689575195, + "learning_rate": 2.9113405179951626e-05, + "loss": 1.7665, + "step": 21115 + }, + { + "epoch": 6.481276856967464, + "grad_norm": 0.18411210179328918, + "learning_rate": 2.9108889179904348e-05, + "loss": 1.7216, + "step": 21116 + }, + { + "epoch": 6.48158379373849, + "grad_norm": 0.2251562923192978, + "learning_rate": 2.9104373386315225e-05, + "loss": 1.7605, + "step": 21117 + }, + { + "epoch": 6.481890730509515, + "grad_norm": 0.2252185344696045, + "learning_rate": 2.9099857799228957e-05, + "loss": 1.7345, + "step": 21118 + }, + { + "epoch": 6.48219766728054, + "grad_norm": 0.20799386501312256, + "learning_rate": 2.909534241869014e-05, + "loss": 1.7497, + "step": 21119 + }, + { + "epoch": 6.482504604051566, + "grad_norm": 0.2059052586555481, + "learning_rate": 2.90908272447434e-05, + "loss": 1.7444, + "step": 21120 + }, + { + "epoch": 6.48281154082259, + "grad_norm": 0.17851221561431885, + "learning_rate": 
2.9086312277433362e-05, + "loss": 1.7208, + "step": 21121 + }, + { + "epoch": 6.4831184775936155, + "grad_norm": 0.20561498403549194, + "learning_rate": 2.908179751680465e-05, + "loss": 1.731, + "step": 21122 + }, + { + "epoch": 6.483425414364641, + "grad_norm": 0.2386128008365631, + "learning_rate": 2.9077282962901868e-05, + "loss": 1.7493, + "step": 21123 + }, + { + "epoch": 6.483732351135666, + "grad_norm": 0.21024827659130096, + "learning_rate": 2.9072768615769642e-05, + "loss": 1.7353, + "step": 21124 + }, + { + "epoch": 6.4840392879066915, + "grad_norm": 0.23443256318569183, + "learning_rate": 2.9068254475452582e-05, + "loss": 1.7419, + "step": 21125 + }, + { + "epoch": 6.484346224677717, + "grad_norm": 0.1849295198917389, + "learning_rate": 2.90637405419953e-05, + "loss": 1.7239, + "step": 21126 + }, + { + "epoch": 6.484653161448741, + "grad_norm": 0.1967659890651703, + "learning_rate": 2.9059226815442385e-05, + "loss": 1.7163, + "step": 21127 + }, + { + "epoch": 6.484960098219767, + "grad_norm": 0.20395416021347046, + "learning_rate": 2.9054713295838505e-05, + "loss": 1.7108, + "step": 21128 + }, + { + "epoch": 6.485267034990792, + "grad_norm": 0.24162746965885162, + "learning_rate": 2.9050199983228184e-05, + "loss": 1.7666, + "step": 21129 + }, + { + "epoch": 6.485573971761817, + "grad_norm": 0.18104900419712067, + "learning_rate": 2.9045686877656086e-05, + "loss": 1.6863, + "step": 21130 + }, + { + "epoch": 6.485880908532843, + "grad_norm": 0.18469318747520447, + "learning_rate": 2.9041173979166813e-05, + "loss": 1.7344, + "step": 21131 + }, + { + "epoch": 6.486187845303867, + "grad_norm": 0.18488821387290955, + "learning_rate": 2.90366612878049e-05, + "loss": 1.694, + "step": 21132 + }, + { + "epoch": 6.486494782074892, + "grad_norm": 0.2030600905418396, + "learning_rate": 2.903214880361503e-05, + "loss": 1.7079, + "step": 21133 + }, + { + "epoch": 6.486801718845918, + "grad_norm": 0.2222873419523239, + "learning_rate": 2.902763652664171e-05, + "loss": 
1.7193, + "step": 21134 + }, + { + "epoch": 6.487108655616943, + "grad_norm": 0.1936846524477005, + "learning_rate": 2.9023124456929608e-05, + "loss": 1.7152, + "step": 21135 + }, + { + "epoch": 6.487415592387968, + "grad_norm": 0.25259360671043396, + "learning_rate": 2.9018612594523274e-05, + "loss": 1.776, + "step": 21136 + }, + { + "epoch": 6.487722529158993, + "grad_norm": 0.22994543612003326, + "learning_rate": 2.9014100939467316e-05, + "loss": 1.7437, + "step": 21137 + }, + { + "epoch": 6.488029465930018, + "grad_norm": 0.2646990716457367, + "learning_rate": 2.900958949180631e-05, + "loss": 1.7535, + "step": 21138 + }, + { + "epoch": 6.4883364027010435, + "grad_norm": 0.22973869740962982, + "learning_rate": 2.9005078251584843e-05, + "loss": 1.6772, + "step": 21139 + }, + { + "epoch": 6.488643339472069, + "grad_norm": 0.21261750161647797, + "learning_rate": 2.9000567218847497e-05, + "loss": 1.6899, + "step": 21140 + }, + { + "epoch": 6.488950276243094, + "grad_norm": 0.24828271567821503, + "learning_rate": 2.8996056393638858e-05, + "loss": 1.7994, + "step": 21141 + }, + { + "epoch": 6.4892572130141195, + "grad_norm": 0.18308857083320618, + "learning_rate": 2.8991545776003497e-05, + "loss": 1.7847, + "step": 21142 + }, + { + "epoch": 6.489564149785144, + "grad_norm": 0.22744092345237732, + "learning_rate": 2.8987035365985994e-05, + "loss": 1.7789, + "step": 21143 + }, + { + "epoch": 6.489871086556169, + "grad_norm": 0.18573936820030212, + "learning_rate": 2.8982525163630903e-05, + "loss": 1.6649, + "step": 21144 + }, + { + "epoch": 6.490178023327195, + "grad_norm": 0.26056674122810364, + "learning_rate": 2.8978015168982863e-05, + "loss": 1.68, + "step": 21145 + }, + { + "epoch": 6.49048496009822, + "grad_norm": 0.1912553906440735, + "learning_rate": 2.897350538208635e-05, + "loss": 1.7011, + "step": 21146 + }, + { + "epoch": 6.490791896869245, + "grad_norm": 0.25937187671661377, + "learning_rate": 2.896899580298603e-05, + "loss": 1.7409, + "step": 21147 + }, + 
{ + "epoch": 6.49109883364027, + "grad_norm": 0.22148750722408295, + "learning_rate": 2.8964486431726397e-05, + "loss": 1.6921, + "step": 21148 + }, + { + "epoch": 6.491405770411295, + "grad_norm": 0.23678559064865112, + "learning_rate": 2.8959977268352012e-05, + "loss": 1.6833, + "step": 21149 + }, + { + "epoch": 6.49171270718232, + "grad_norm": 0.2942093312740326, + "learning_rate": 2.8955468312907506e-05, + "loss": 1.7119, + "step": 21150 + }, + { + "epoch": 6.492019643953346, + "grad_norm": 0.18726128339767456, + "learning_rate": 2.8950959565437365e-05, + "loss": 1.7067, + "step": 21151 + }, + { + "epoch": 6.492326580724371, + "grad_norm": 0.23851951956748962, + "learning_rate": 2.894645102598621e-05, + "loss": 1.73, + "step": 21152 + }, + { + "epoch": 6.4926335174953955, + "grad_norm": 0.18054445087909698, + "learning_rate": 2.8941942694598533e-05, + "loss": 1.7243, + "step": 21153 + }, + { + "epoch": 6.492940454266421, + "grad_norm": 0.21889349818229675, + "learning_rate": 2.8937434571318934e-05, + "loss": 1.7789, + "step": 21154 + }, + { + "epoch": 6.493247391037446, + "grad_norm": 0.18788981437683105, + "learning_rate": 2.893292665619195e-05, + "loss": 1.7496, + "step": 21155 + }, + { + "epoch": 6.4935543278084715, + "grad_norm": 0.1964103877544403, + "learning_rate": 2.8928418949262138e-05, + "loss": 1.6732, + "step": 21156 + }, + { + "epoch": 6.493861264579497, + "grad_norm": 0.21939502656459808, + "learning_rate": 2.8923911450574043e-05, + "loss": 1.7149, + "step": 21157 + }, + { + "epoch": 6.494168201350522, + "grad_norm": 0.16927817463874817, + "learning_rate": 2.8919404160172203e-05, + "loss": 1.7093, + "step": 21158 + }, + { + "epoch": 6.494475138121547, + "grad_norm": 0.19907668232917786, + "learning_rate": 2.8914897078101166e-05, + "loss": 1.718, + "step": 21159 + }, + { + "epoch": 6.494782074892572, + "grad_norm": 0.18071576952934265, + "learning_rate": 2.891039020440548e-05, + "loss": 1.7241, + "step": 21160 + }, + { + "epoch": 6.495089011663597, 
+ "grad_norm": 0.17780692875385284, + "learning_rate": 2.890588353912965e-05, + "loss": 1.7013, + "step": 21161 + }, + { + "epoch": 6.495395948434623, + "grad_norm": 0.20762500166893005, + "learning_rate": 2.8901377082318292e-05, + "loss": 1.8149, + "step": 21162 + }, + { + "epoch": 6.495702885205648, + "grad_norm": 0.21616768836975098, + "learning_rate": 2.889687083401585e-05, + "loss": 1.7467, + "step": 21163 + }, + { + "epoch": 6.496009821976672, + "grad_norm": 0.20075570046901703, + "learning_rate": 2.8892364794266935e-05, + "loss": 1.6643, + "step": 21164 + }, + { + "epoch": 6.496316758747698, + "grad_norm": 0.18893925845623016, + "learning_rate": 2.8887858963116028e-05, + "loss": 1.7362, + "step": 21165 + }, + { + "epoch": 6.496623695518723, + "grad_norm": 0.20031611621379852, + "learning_rate": 2.888335334060765e-05, + "loss": 1.6902, + "step": 21166 + }, + { + "epoch": 6.496930632289748, + "grad_norm": 0.2959407866001129, + "learning_rate": 2.887884792678639e-05, + "loss": 1.7874, + "step": 21167 + }, + { + "epoch": 6.497237569060774, + "grad_norm": 0.17434875667095184, + "learning_rate": 2.8874342721696697e-05, + "loss": 1.7353, + "step": 21168 + }, + { + "epoch": 6.497544505831799, + "grad_norm": 0.19451481103897095, + "learning_rate": 2.8869837725383163e-05, + "loss": 1.6942, + "step": 21169 + }, + { + "epoch": 6.4978514426028235, + "grad_norm": 0.17984920740127563, + "learning_rate": 2.886533293789025e-05, + "loss": 1.7461, + "step": 21170 + }, + { + "epoch": 6.498158379373849, + "grad_norm": 0.18166208267211914, + "learning_rate": 2.8860828359262516e-05, + "loss": 1.7202, + "step": 21171 + }, + { + "epoch": 6.498465316144874, + "grad_norm": 0.1849331557750702, + "learning_rate": 2.8856323989544472e-05, + "loss": 1.6862, + "step": 21172 + }, + { + "epoch": 6.4987722529158995, + "grad_norm": 0.17846204340457916, + "learning_rate": 2.8851819828780623e-05, + "loss": 1.7446, + "step": 21173 + }, + { + "epoch": 6.499079189686925, + "grad_norm": 
0.1963818222284317, + "learning_rate": 2.8847315877015486e-05, + "loss": 1.7366, + "step": 21174 + }, + { + "epoch": 6.499386126457949, + "grad_norm": 0.1917402446269989, + "learning_rate": 2.8842812134293574e-05, + "loss": 1.7362, + "step": 21175 + }, + { + "epoch": 6.499693063228975, + "grad_norm": 0.16559138894081116, + "learning_rate": 2.883830860065939e-05, + "loss": 1.6735, + "step": 21176 + }, + { + "epoch": 6.5, + "grad_norm": 0.1820032149553299, + "learning_rate": 2.8833805276157442e-05, + "loss": 1.7107, + "step": 21177 + }, + { + "epoch": 6.500306936771025, + "grad_norm": 0.23760980367660522, + "learning_rate": 2.882930216083222e-05, + "loss": 1.7024, + "step": 21178 + }, + { + "epoch": 6.500613873542051, + "grad_norm": 0.22314296662807465, + "learning_rate": 2.8824799254728285e-05, + "loss": 1.714, + "step": 21179 + }, + { + "epoch": 6.500920810313076, + "grad_norm": 0.21919335424900055, + "learning_rate": 2.8820296557890046e-05, + "loss": 1.7625, + "step": 21180 + }, + { + "epoch": 6.5012277470841005, + "grad_norm": 0.21632128953933716, + "learning_rate": 2.88157940703621e-05, + "loss": 1.6589, + "step": 21181 + }, + { + "epoch": 6.501534683855126, + "grad_norm": 0.17998506128787994, + "learning_rate": 2.8811291792188867e-05, + "loss": 1.7528, + "step": 21182 + }, + { + "epoch": 6.501841620626151, + "grad_norm": 0.19783075153827667, + "learning_rate": 2.880678972341485e-05, + "loss": 1.6908, + "step": 21183 + }, + { + "epoch": 6.5021485573971765, + "grad_norm": 0.20510388910770416, + "learning_rate": 2.88022878640846e-05, + "loss": 1.7342, + "step": 21184 + }, + { + "epoch": 6.502455494168201, + "grad_norm": 0.24218666553497314, + "learning_rate": 2.879778621424253e-05, + "loss": 1.8, + "step": 21185 + }, + { + "epoch": 6.502762430939226, + "grad_norm": 0.1901179403066635, + "learning_rate": 2.8793284773933195e-05, + "loss": 1.699, + "step": 21186 + }, + { + "epoch": 6.503069367710252, + "grad_norm": 0.2652232348918915, + "learning_rate": 
2.8788783543201007e-05, + "loss": 1.8394, + "step": 21187 + }, + { + "epoch": 6.503376304481277, + "grad_norm": 0.17701558768749237, + "learning_rate": 2.878428252209052e-05, + "loss": 1.6674, + "step": 21188 + }, + { + "epoch": 6.503683241252302, + "grad_norm": 0.17464707791805267, + "learning_rate": 2.8779781710646185e-05, + "loss": 1.6894, + "step": 21189 + }, + { + "epoch": 6.503990178023328, + "grad_norm": 0.19469478726387024, + "learning_rate": 2.877528110891249e-05, + "loss": 1.7487, + "step": 21190 + }, + { + "epoch": 6.504297114794352, + "grad_norm": 0.21656417846679688, + "learning_rate": 2.87707807169339e-05, + "loss": 1.641, + "step": 21191 + }, + { + "epoch": 6.504604051565377, + "grad_norm": 0.20374895632266998, + "learning_rate": 2.8766280534754896e-05, + "loss": 1.6692, + "step": 21192 + }, + { + "epoch": 6.504910988336403, + "grad_norm": 0.26638445258140564, + "learning_rate": 2.876178056241996e-05, + "loss": 1.7415, + "step": 21193 + }, + { + "epoch": 6.505217925107428, + "grad_norm": 0.1852893978357315, + "learning_rate": 2.8757280799973557e-05, + "loss": 1.6981, + "step": 21194 + }, + { + "epoch": 6.505524861878453, + "grad_norm": 0.20518383383750916, + "learning_rate": 2.875278124746013e-05, + "loss": 1.781, + "step": 21195 + }, + { + "epoch": 6.505831798649478, + "grad_norm": 0.19968904554843903, + "learning_rate": 2.874828190492422e-05, + "loss": 1.6813, + "step": 21196 + }, + { + "epoch": 6.506138735420503, + "grad_norm": 0.19164247810840607, + "learning_rate": 2.87437827724102e-05, + "loss": 1.6833, + "step": 21197 + }, + { + "epoch": 6.5064456721915285, + "grad_norm": 0.19305361807346344, + "learning_rate": 2.873928384996262e-05, + "loss": 1.7164, + "step": 21198 + }, + { + "epoch": 6.506752608962554, + "grad_norm": 0.1853758841753006, + "learning_rate": 2.873478513762587e-05, + "loss": 1.7481, + "step": 21199 + }, + { + "epoch": 6.507059545733579, + "grad_norm": 0.20187529921531677, + "learning_rate": 2.8730286635444425e-05, + "loss": 
1.7666, + "step": 21200 + }, + { + "epoch": 6.5073664825046045, + "grad_norm": 0.19769401848316193, + "learning_rate": 2.872578834346279e-05, + "loss": 1.798, + "step": 21201 + }, + { + "epoch": 6.507673419275629, + "grad_norm": 0.1936112940311432, + "learning_rate": 2.8721290261725342e-05, + "loss": 1.6992, + "step": 21202 + }, + { + "epoch": 6.507980356046654, + "grad_norm": 0.17090481519699097, + "learning_rate": 2.871679239027662e-05, + "loss": 1.6802, + "step": 21203 + }, + { + "epoch": 6.50828729281768, + "grad_norm": 0.19443605840206146, + "learning_rate": 2.8712294729160987e-05, + "loss": 1.736, + "step": 21204 + }, + { + "epoch": 6.508594229588705, + "grad_norm": 0.19216817617416382, + "learning_rate": 2.8707797278422954e-05, + "loss": 1.7109, + "step": 21205 + }, + { + "epoch": 6.50890116635973, + "grad_norm": 0.19900040328502655, + "learning_rate": 2.8703300038106952e-05, + "loss": 1.7158, + "step": 21206 + }, + { + "epoch": 6.509208103130755, + "grad_norm": 0.17810803651809692, + "learning_rate": 2.8698803008257425e-05, + "loss": 1.6886, + "step": 21207 + }, + { + "epoch": 6.50951503990178, + "grad_norm": 0.1890508532524109, + "learning_rate": 2.8694306188918807e-05, + "loss": 1.7447, + "step": 21208 + }, + { + "epoch": 6.509821976672805, + "grad_norm": 0.17456012964248657, + "learning_rate": 2.868980958013554e-05, + "loss": 1.7094, + "step": 21209 + }, + { + "epoch": 6.510128913443831, + "grad_norm": 0.17089629173278809, + "learning_rate": 2.8685313181952066e-05, + "loss": 1.6827, + "step": 21210 + }, + { + "epoch": 6.510435850214856, + "grad_norm": 0.22681273519992828, + "learning_rate": 2.8680816994412823e-05, + "loss": 1.7374, + "step": 21211 + }, + { + "epoch": 6.510742786985881, + "grad_norm": 0.20642207562923431, + "learning_rate": 2.8676321017562225e-05, + "loss": 1.7609, + "step": 21212 + }, + { + "epoch": 6.511049723756906, + "grad_norm": 0.2360219657421112, + "learning_rate": 2.867182525144475e-05, + "loss": 1.7577, + "step": 21213 + }, + { + 
"epoch": 6.511356660527931, + "grad_norm": 0.19686923921108246, + "learning_rate": 2.8667329696104766e-05, + "loss": 1.7459, + "step": 21214 + }, + { + "epoch": 6.5116635972989565, + "grad_norm": 0.21280834078788757, + "learning_rate": 2.8662834351586777e-05, + "loss": 1.7837, + "step": 21215 + }, + { + "epoch": 6.511970534069982, + "grad_norm": 0.19297273457050323, + "learning_rate": 2.8658339217935136e-05, + "loss": 1.734, + "step": 21216 + }, + { + "epoch": 6.512277470841006, + "grad_norm": 0.1937931329011917, + "learning_rate": 2.8653844295194283e-05, + "loss": 1.6631, + "step": 21217 + }, + { + "epoch": 6.512584407612032, + "grad_norm": 0.2061077207326889, + "learning_rate": 2.8649349583408692e-05, + "loss": 1.7324, + "step": 21218 + }, + { + "epoch": 6.512891344383057, + "grad_norm": 0.19711358845233917, + "learning_rate": 2.8644855082622695e-05, + "loss": 1.7024, + "step": 21219 + }, + { + "epoch": 6.513198281154082, + "grad_norm": 0.17352496087551117, + "learning_rate": 2.8640360792880804e-05, + "loss": 1.7261, + "step": 21220 + }, + { + "epoch": 6.513505217925108, + "grad_norm": 0.181448295712471, + "learning_rate": 2.8635866714227344e-05, + "loss": 1.7147, + "step": 21221 + }, + { + "epoch": 6.513812154696133, + "grad_norm": 0.1827932894229889, + "learning_rate": 2.8631372846706787e-05, + "loss": 1.7338, + "step": 21222 + }, + { + "epoch": 6.514119091467157, + "grad_norm": 0.20659075677394867, + "learning_rate": 2.862687919036353e-05, + "loss": 1.6611, + "step": 21223 + }, + { + "epoch": 6.514426028238183, + "grad_norm": 0.19185996055603027, + "learning_rate": 2.8622385745241987e-05, + "loss": 1.7834, + "step": 21224 + }, + { + "epoch": 6.514732965009208, + "grad_norm": 0.19825506210327148, + "learning_rate": 2.8617892511386558e-05, + "loss": 1.7608, + "step": 21225 + }, + { + "epoch": 6.515039901780233, + "grad_norm": 0.16927020251750946, + "learning_rate": 2.861339948884164e-05, + "loss": 1.6651, + "step": 21226 + }, + { + "epoch": 6.515346838551259, + 
"grad_norm": 0.19211016595363617, + "learning_rate": 2.8608906677651646e-05, + "loss": 1.6673, + "step": 21227 + }, + { + "epoch": 6.515653775322283, + "grad_norm": 0.20192545652389526, + "learning_rate": 2.8604414077860974e-05, + "loss": 1.7301, + "step": 21228 + }, + { + "epoch": 6.5159607120933085, + "grad_norm": 0.2075425237417221, + "learning_rate": 2.8599921689514002e-05, + "loss": 1.783, + "step": 21229 + }, + { + "epoch": 6.516267648864334, + "grad_norm": 0.21261392533779144, + "learning_rate": 2.8595429512655192e-05, + "loss": 1.7277, + "step": 21230 + }, + { + "epoch": 6.516574585635359, + "grad_norm": 0.21201452612876892, + "learning_rate": 2.8590937547328844e-05, + "loss": 1.6582, + "step": 21231 + }, + { + "epoch": 6.5168815224063845, + "grad_norm": 0.2071799635887146, + "learning_rate": 2.858644579357944e-05, + "loss": 1.7559, + "step": 21232 + }, + { + "epoch": 6.51718845917741, + "grad_norm": 0.20225903391838074, + "learning_rate": 2.858195425145132e-05, + "loss": 1.7507, + "step": 21233 + }, + { + "epoch": 6.517495395948434, + "grad_norm": 0.2738147974014282, + "learning_rate": 2.8577462920988852e-05, + "loss": 1.7073, + "step": 21234 + }, + { + "epoch": 6.51780233271946, + "grad_norm": 0.17878220975399017, + "learning_rate": 2.8572971802236498e-05, + "loss": 1.6598, + "step": 21235 + }, + { + "epoch": 6.518109269490485, + "grad_norm": 0.21365594863891602, + "learning_rate": 2.8568480895238552e-05, + "loss": 1.7404, + "step": 21236 + }, + { + "epoch": 6.51841620626151, + "grad_norm": 0.18392804265022278, + "learning_rate": 2.856399020003948e-05, + "loss": 1.706, + "step": 21237 + }, + { + "epoch": 6.518723143032536, + "grad_norm": 0.16268405318260193, + "learning_rate": 2.855949971668358e-05, + "loss": 1.6725, + "step": 21238 + }, + { + "epoch": 6.51903007980356, + "grad_norm": 0.19590096175670624, + "learning_rate": 2.855500944521529e-05, + "loss": 1.7269, + "step": 21239 + }, + { + "epoch": 6.519337016574585, + "grad_norm": 0.19443263113498688, + 
"learning_rate": 2.8550519385678965e-05, + "loss": 1.686, + "step": 21240 + }, + { + "epoch": 6.519643953345611, + "grad_norm": 0.2112705111503601, + "learning_rate": 2.8546029538118985e-05, + "loss": 1.6904, + "step": 21241 + }, + { + "epoch": 6.519950890116636, + "grad_norm": 0.21015888452529907, + "learning_rate": 2.8541539902579712e-05, + "loss": 1.6972, + "step": 21242 + }, + { + "epoch": 6.520257826887661, + "grad_norm": 0.2853320837020874, + "learning_rate": 2.853705047910552e-05, + "loss": 1.7415, + "step": 21243 + }, + { + "epoch": 6.520564763658687, + "grad_norm": 0.20927128195762634, + "learning_rate": 2.853256126774077e-05, + "loss": 1.6955, + "step": 21244 + }, + { + "epoch": 6.520871700429711, + "grad_norm": 0.27824920415878296, + "learning_rate": 2.8528072268529836e-05, + "loss": 1.7666, + "step": 21245 + }, + { + "epoch": 6.5211786372007365, + "grad_norm": 0.21164646744728088, + "learning_rate": 2.8523583481517057e-05, + "loss": 1.75, + "step": 21246 + }, + { + "epoch": 6.521485573971762, + "grad_norm": 0.249397411942482, + "learning_rate": 2.851909490674686e-05, + "loss": 1.6767, + "step": 21247 + }, + { + "epoch": 6.521792510742787, + "grad_norm": 0.2311551868915558, + "learning_rate": 2.8514606544263507e-05, + "loss": 1.8071, + "step": 21248 + }, + { + "epoch": 6.5220994475138125, + "grad_norm": 0.21878042817115784, + "learning_rate": 2.8510118394111453e-05, + "loss": 1.6881, + "step": 21249 + }, + { + "epoch": 6.522406384284837, + "grad_norm": 0.2095690816640854, + "learning_rate": 2.8505630456334974e-05, + "loss": 1.6526, + "step": 21250 + }, + { + "epoch": 6.522713321055862, + "grad_norm": 0.2303982526063919, + "learning_rate": 2.850114273097844e-05, + "loss": 1.7256, + "step": 21251 + }, + { + "epoch": 6.523020257826888, + "grad_norm": 0.22640225291252136, + "learning_rate": 2.8496655218086255e-05, + "loss": 1.7797, + "step": 21252 + }, + { + "epoch": 6.523327194597913, + "grad_norm": 0.24268805980682373, + "learning_rate": 
2.8492167917702683e-05, + "loss": 1.7673, + "step": 21253 + }, + { + "epoch": 6.523634131368938, + "grad_norm": 0.1988469958305359, + "learning_rate": 2.8487680829872158e-05, + "loss": 1.7126, + "step": 21254 + }, + { + "epoch": 6.523941068139964, + "grad_norm": 0.18385496735572815, + "learning_rate": 2.8483193954638942e-05, + "loss": 1.7113, + "step": 21255 + }, + { + "epoch": 6.524248004910988, + "grad_norm": 0.21865327656269073, + "learning_rate": 2.847870729204743e-05, + "loss": 1.6686, + "step": 21256 + }, + { + "epoch": 6.524554941682013, + "grad_norm": 0.16982951760292053, + "learning_rate": 2.8474220842141946e-05, + "loss": 1.6865, + "step": 21257 + }, + { + "epoch": 6.524861878453039, + "grad_norm": 0.23028478026390076, + "learning_rate": 2.8469734604966834e-05, + "loss": 1.7647, + "step": 21258 + }, + { + "epoch": 6.525168815224064, + "grad_norm": 0.1805485039949417, + "learning_rate": 2.8465248580566415e-05, + "loss": 1.7524, + "step": 21259 + }, + { + "epoch": 6.525475751995089, + "grad_norm": 0.18652063608169556, + "learning_rate": 2.8460762768985037e-05, + "loss": 1.7028, + "step": 21260 + }, + { + "epoch": 6.525782688766114, + "grad_norm": 0.22772997617721558, + "learning_rate": 2.845627717026703e-05, + "loss": 1.7866, + "step": 21261 + }, + { + "epoch": 6.526089625537139, + "grad_norm": 0.19889821112155914, + "learning_rate": 2.8451791784456718e-05, + "loss": 1.7076, + "step": 21262 + }, + { + "epoch": 6.526396562308165, + "grad_norm": 0.24747174978256226, + "learning_rate": 2.8447306611598402e-05, + "loss": 1.7615, + "step": 21263 + }, + { + "epoch": 6.52670349907919, + "grad_norm": 0.1988009363412857, + "learning_rate": 2.8442821651736473e-05, + "loss": 1.7853, + "step": 21264 + }, + { + "epoch": 6.527010435850215, + "grad_norm": 0.250032901763916, + "learning_rate": 2.8438336904915185e-05, + "loss": 1.6906, + "step": 21265 + }, + { + "epoch": 6.52731737262124, + "grad_norm": 0.15398284792900085, + "learning_rate": 2.8433852371178925e-05, + 
"loss": 1.6437, + "step": 21266 + }, + { + "epoch": 6.527624309392265, + "grad_norm": 0.33137503266334534, + "learning_rate": 2.8429368050571958e-05, + "loss": 1.8213, + "step": 21267 + }, + { + "epoch": 6.52793124616329, + "grad_norm": 0.23827852308750153, + "learning_rate": 2.8424883943138593e-05, + "loss": 1.7148, + "step": 21268 + }, + { + "epoch": 6.528238182934316, + "grad_norm": 0.21171489357948303, + "learning_rate": 2.8420400048923217e-05, + "loss": 1.7729, + "step": 21269 + }, + { + "epoch": 6.528545119705341, + "grad_norm": 0.21698513627052307, + "learning_rate": 2.8415916367970053e-05, + "loss": 1.7267, + "step": 21270 + }, + { + "epoch": 6.5288520564763655, + "grad_norm": 0.2217913120985031, + "learning_rate": 2.8411432900323498e-05, + "loss": 1.7259, + "step": 21271 + }, + { + "epoch": 6.529158993247391, + "grad_norm": 0.25518202781677246, + "learning_rate": 2.8406949646027768e-05, + "loss": 1.7754, + "step": 21272 + }, + { + "epoch": 6.529465930018416, + "grad_norm": 0.22206325829029083, + "learning_rate": 2.8402466605127247e-05, + "loss": 1.755, + "step": 21273 + }, + { + "epoch": 6.5297728667894415, + "grad_norm": 0.26918017864227295, + "learning_rate": 2.8397983777666206e-05, + "loss": 1.783, + "step": 21274 + }, + { + "epoch": 6.530079803560467, + "grad_norm": 0.19280646741390228, + "learning_rate": 2.8393501163688952e-05, + "loss": 1.6942, + "step": 21275 + }, + { + "epoch": 6.530386740331492, + "grad_norm": 0.24567140638828278, + "learning_rate": 2.8389018763239784e-05, + "loss": 1.7316, + "step": 21276 + }, + { + "epoch": 6.530693677102517, + "grad_norm": 0.21791695058345795, + "learning_rate": 2.8384536576362997e-05, + "loss": 1.7627, + "step": 21277 + }, + { + "epoch": 6.531000613873542, + "grad_norm": 0.2441660761833191, + "learning_rate": 2.8380054603102885e-05, + "loss": 1.7112, + "step": 21278 + }, + { + "epoch": 6.531307550644567, + "grad_norm": 0.1768653243780136, + "learning_rate": 2.837557284350375e-05, + "loss": 1.6906, + "step": 
21279 + }, + { + "epoch": 6.531614487415593, + "grad_norm": 0.21037769317626953, + "learning_rate": 2.8371091297609877e-05, + "loss": 1.7197, + "step": 21280 + }, + { + "epoch": 6.531921424186618, + "grad_norm": 0.23989829421043396, + "learning_rate": 2.8366609965465563e-05, + "loss": 1.7693, + "step": 21281 + }, + { + "epoch": 6.532228360957642, + "grad_norm": 0.18302181363105774, + "learning_rate": 2.836212884711506e-05, + "loss": 1.6643, + "step": 21282 + }, + { + "epoch": 6.532535297728668, + "grad_norm": 0.2068471908569336, + "learning_rate": 2.835764794260273e-05, + "loss": 1.7431, + "step": 21283 + }, + { + "epoch": 6.532842234499693, + "grad_norm": 0.18803778290748596, + "learning_rate": 2.8353167251972777e-05, + "loss": 1.7506, + "step": 21284 + }, + { + "epoch": 6.533149171270718, + "grad_norm": 0.20789632201194763, + "learning_rate": 2.8348686775269507e-05, + "loss": 1.7174, + "step": 21285 + }, + { + "epoch": 6.533456108041744, + "grad_norm": 0.18927012383937836, + "learning_rate": 2.834420651253723e-05, + "loss": 1.6723, + "step": 21286 + }, + { + "epoch": 6.533763044812769, + "grad_norm": 0.22616887092590332, + "learning_rate": 2.8339726463820172e-05, + "loss": 1.7045, + "step": 21287 + }, + { + "epoch": 6.5340699815837935, + "grad_norm": 0.23880253732204437, + "learning_rate": 2.8335246629162658e-05, + "loss": 1.7255, + "step": 21288 + }, + { + "epoch": 6.534376918354819, + "grad_norm": 0.24279431998729706, + "learning_rate": 2.8330767008608904e-05, + "loss": 1.7548, + "step": 21289 + }, + { + "epoch": 6.534683855125844, + "grad_norm": 0.20542044937610626, + "learning_rate": 2.832628760220323e-05, + "loss": 1.6851, + "step": 21290 + }, + { + "epoch": 6.5349907918968695, + "grad_norm": 0.19426794350147247, + "learning_rate": 2.832180840998988e-05, + "loss": 1.7528, + "step": 21291 + }, + { + "epoch": 6.535297728667894, + "grad_norm": 0.2744491398334503, + "learning_rate": 2.8317329432013136e-05, + "loss": 1.7821, + "step": 21292 + }, + { + "epoch": 
6.535604665438919, + "grad_norm": 0.2692170739173889, + "learning_rate": 2.8312850668317243e-05, + "loss": 1.6626, + "step": 21293 + }, + { + "epoch": 6.535911602209945, + "grad_norm": 0.24998809397220612, + "learning_rate": 2.830837211894647e-05, + "loss": 1.7031, + "step": 21294 + }, + { + "epoch": 6.53621853898097, + "grad_norm": 0.22888946533203125, + "learning_rate": 2.830389378394508e-05, + "loss": 1.7706, + "step": 21295 + }, + { + "epoch": 6.536525475751995, + "grad_norm": 0.21685005724430084, + "learning_rate": 2.8299415663357332e-05, + "loss": 1.681, + "step": 21296 + }, + { + "epoch": 6.536832412523021, + "grad_norm": 0.23309725522994995, + "learning_rate": 2.8294937757227475e-05, + "loss": 1.7781, + "step": 21297 + }, + { + "epoch": 6.537139349294045, + "grad_norm": 0.26712173223495483, + "learning_rate": 2.829046006559976e-05, + "loss": 1.6966, + "step": 21298 + }, + { + "epoch": 6.53744628606507, + "grad_norm": 0.1836499124765396, + "learning_rate": 2.8285982588518428e-05, + "loss": 1.7192, + "step": 21299 + }, + { + "epoch": 6.537753222836096, + "grad_norm": 0.24073021113872528, + "learning_rate": 2.828150532602778e-05, + "loss": 1.6997, + "step": 21300 + }, + { + "epoch": 6.538060159607121, + "grad_norm": 0.16308051347732544, + "learning_rate": 2.8277028278172014e-05, + "loss": 1.6901, + "step": 21301 + }, + { + "epoch": 6.538367096378146, + "grad_norm": 0.2330634444952011, + "learning_rate": 2.8272551444995376e-05, + "loss": 1.7426, + "step": 21302 + }, + { + "epoch": 6.538674033149171, + "grad_norm": 0.18600425124168396, + "learning_rate": 2.8268074826542123e-05, + "loss": 1.6906, + "step": 21303 + }, + { + "epoch": 6.538980969920196, + "grad_norm": 0.24717238545417786, + "learning_rate": 2.8263598422856475e-05, + "loss": 1.6962, + "step": 21304 + }, + { + "epoch": 6.5392879066912215, + "grad_norm": 0.1907368302345276, + "learning_rate": 2.8259122233982727e-05, + "loss": 1.7083, + "step": 21305 + }, + { + "epoch": 6.539594843462247, + "grad_norm": 
0.22698798775672913, + "learning_rate": 2.8254646259965035e-05, + "loss": 1.7377, + "step": 21306 + }, + { + "epoch": 6.539901780233272, + "grad_norm": 0.19169457256793976, + "learning_rate": 2.8250170500847696e-05, + "loss": 1.7416, + "step": 21307 + }, + { + "epoch": 6.5402087170042975, + "grad_norm": 0.18730394542217255, + "learning_rate": 2.8245694956674918e-05, + "loss": 1.7273, + "step": 21308 + }, + { + "epoch": 6.540515653775322, + "grad_norm": 0.19813422858715057, + "learning_rate": 2.8241219627490927e-05, + "loss": 1.7638, + "step": 21309 + }, + { + "epoch": 6.540822590546347, + "grad_norm": 0.20460368692874908, + "learning_rate": 2.8236744513339965e-05, + "loss": 1.7266, + "step": 21310 + }, + { + "epoch": 6.541129527317373, + "grad_norm": 0.20448380708694458, + "learning_rate": 2.823226961426625e-05, + "loss": 1.7335, + "step": 21311 + }, + { + "epoch": 6.541436464088398, + "grad_norm": 0.21458712220191956, + "learning_rate": 2.8227794930314e-05, + "loss": 1.7274, + "step": 21312 + }, + { + "epoch": 6.541743400859423, + "grad_norm": 0.1964675635099411, + "learning_rate": 2.8223320461527442e-05, + "loss": 1.7514, + "step": 21313 + }, + { + "epoch": 6.542050337630448, + "grad_norm": 0.18982458114624023, + "learning_rate": 2.82188462079508e-05, + "loss": 1.6858, + "step": 21314 + }, + { + "epoch": 6.542357274401473, + "grad_norm": 0.21377761662006378, + "learning_rate": 2.8214372169628277e-05, + "loss": 1.727, + "step": 21315 + }, + { + "epoch": 6.542664211172498, + "grad_norm": 0.19484922289848328, + "learning_rate": 2.8209898346604087e-05, + "loss": 1.7646, + "step": 21316 + }, + { + "epoch": 6.542971147943524, + "grad_norm": 0.20614980161190033, + "learning_rate": 2.8205424738922488e-05, + "loss": 1.6705, + "step": 21317 + }, + { + "epoch": 6.543278084714549, + "grad_norm": 0.1888885796070099, + "learning_rate": 2.8200951346627636e-05, + "loss": 1.7854, + "step": 21318 + }, + { + "epoch": 6.543585021485574, + "grad_norm": 0.20957863330841064, + 
"learning_rate": 2.8196478169763763e-05, + "loss": 1.6971, + "step": 21319 + }, + { + "epoch": 6.543891958256599, + "grad_norm": 0.20744509994983673, + "learning_rate": 2.8192005208375073e-05, + "loss": 1.7408, + "step": 21320 + }, + { + "epoch": 6.544198895027624, + "grad_norm": 0.20038767158985138, + "learning_rate": 2.818753246250574e-05, + "loss": 1.7355, + "step": 21321 + }, + { + "epoch": 6.5445058317986495, + "grad_norm": 0.18535862863063812, + "learning_rate": 2.818305993220004e-05, + "loss": 1.7229, + "step": 21322 + }, + { + "epoch": 6.544812768569675, + "grad_norm": 0.2191225290298462, + "learning_rate": 2.8178587617502095e-05, + "loss": 1.7364, + "step": 21323 + }, + { + "epoch": 6.5451197053407, + "grad_norm": 0.2055424451828003, + "learning_rate": 2.8174115518456175e-05, + "loss": 1.7488, + "step": 21324 + }, + { + "epoch": 6.545426642111725, + "grad_norm": 0.22267968952655792, + "learning_rate": 2.8169643635106398e-05, + "loss": 1.6936, + "step": 21325 + }, + { + "epoch": 6.54573357888275, + "grad_norm": 0.20295512676239014, + "learning_rate": 2.8165171967497018e-05, + "loss": 1.7651, + "step": 21326 + }, + { + "epoch": 6.546040515653775, + "grad_norm": 0.25859618186950684, + "learning_rate": 2.81607005156722e-05, + "loss": 1.7264, + "step": 21327 + }, + { + "epoch": 6.546347452424801, + "grad_norm": 0.22232379019260406, + "learning_rate": 2.8156229279676143e-05, + "loss": 1.7282, + "step": 21328 + }, + { + "epoch": 6.546654389195826, + "grad_norm": 0.2548457682132721, + "learning_rate": 2.8151758259553035e-05, + "loss": 1.7137, + "step": 21329 + }, + { + "epoch": 6.546961325966851, + "grad_norm": 0.22040672600269318, + "learning_rate": 2.8147287455347055e-05, + "loss": 1.7553, + "step": 21330 + }, + { + "epoch": 6.547268262737876, + "grad_norm": 0.19622360169887543, + "learning_rate": 2.8142816867102388e-05, + "loss": 1.6502, + "step": 21331 + }, + { + "epoch": 6.547575199508901, + "grad_norm": 0.20849336683750153, + "learning_rate": 
2.813834649486322e-05, + "loss": 1.6824, + "step": 21332 + }, + { + "epoch": 6.547882136279926, + "grad_norm": 0.18474788963794708, + "learning_rate": 2.8133876338673703e-05, + "loss": 1.7136, + "step": 21333 + }, + { + "epoch": 6.548189073050952, + "grad_norm": 0.2421834021806717, + "learning_rate": 2.8129406398578074e-05, + "loss": 1.7841, + "step": 21334 + }, + { + "epoch": 6.548496009821976, + "grad_norm": 0.18089748919010162, + "learning_rate": 2.812493667462045e-05, + "loss": 1.6918, + "step": 21335 + }, + { + "epoch": 6.5488029465930016, + "grad_norm": 0.18575069308280945, + "learning_rate": 2.8120467166845022e-05, + "loss": 1.7098, + "step": 21336 + }, + { + "epoch": 6.549109883364027, + "grad_norm": 0.20840388536453247, + "learning_rate": 2.811599787529596e-05, + "loss": 1.7405, + "step": 21337 + }, + { + "epoch": 6.549416820135052, + "grad_norm": 0.19018858671188354, + "learning_rate": 2.811152880001742e-05, + "loss": 1.7098, + "step": 21338 + }, + { + "epoch": 6.5497237569060776, + "grad_norm": 0.22326117753982544, + "learning_rate": 2.8107059941053627e-05, + "loss": 1.7452, + "step": 21339 + }, + { + "epoch": 6.550030693677103, + "grad_norm": 0.26071304082870483, + "learning_rate": 2.8102591298448643e-05, + "loss": 1.7685, + "step": 21340 + }, + { + "epoch": 6.550337630448127, + "grad_norm": 0.2253575623035431, + "learning_rate": 2.8098122872246734e-05, + "loss": 1.8025, + "step": 21341 + }, + { + "epoch": 6.550644567219153, + "grad_norm": 0.2503850758075714, + "learning_rate": 2.8093654662491975e-05, + "loss": 1.7453, + "step": 21342 + }, + { + "epoch": 6.550951503990178, + "grad_norm": 0.18953700363636017, + "learning_rate": 2.808918666922858e-05, + "loss": 1.7549, + "step": 21343 + }, + { + "epoch": 6.551258440761203, + "grad_norm": 0.21360619366168976, + "learning_rate": 2.8084718892500685e-05, + "loss": 1.7363, + "step": 21344 + }, + { + "epoch": 6.551565377532229, + "grad_norm": 0.24622702598571777, + "learning_rate": 2.8080251332352437e-05, + 
"loss": 1.7325, + "step": 21345 + }, + { + "epoch": 6.551872314303253, + "grad_norm": 0.20079167187213898, + "learning_rate": 2.8075783988827997e-05, + "loss": 1.7478, + "step": 21346 + }, + { + "epoch": 6.5521792510742785, + "grad_norm": 0.2337643951177597, + "learning_rate": 2.807131686197151e-05, + "loss": 1.6683, + "step": 21347 + }, + { + "epoch": 6.552486187845304, + "grad_norm": 0.20815308392047882, + "learning_rate": 2.8066849951827123e-05, + "loss": 1.7436, + "step": 21348 + }, + { + "epoch": 6.552793124616329, + "grad_norm": 0.2450367957353592, + "learning_rate": 2.8062383258438972e-05, + "loss": 1.7464, + "step": 21349 + }, + { + "epoch": 6.5531000613873545, + "grad_norm": 0.232087641954422, + "learning_rate": 2.8057916781851222e-05, + "loss": 1.7378, + "step": 21350 + }, + { + "epoch": 6.55340699815838, + "grad_norm": 0.2254600077867508, + "learning_rate": 2.8053450522107993e-05, + "loss": 1.7299, + "step": 21351 + }, + { + "epoch": 6.553713934929404, + "grad_norm": 0.23282572627067566, + "learning_rate": 2.8048984479253425e-05, + "loss": 1.7512, + "step": 21352 + }, + { + "epoch": 6.55402087170043, + "grad_norm": 0.21826763451099396, + "learning_rate": 2.8044518653331665e-05, + "loss": 1.706, + "step": 21353 + }, + { + "epoch": 6.554327808471455, + "grad_norm": 0.20807425677776337, + "learning_rate": 2.804005304438683e-05, + "loss": 1.7013, + "step": 21354 + }, + { + "epoch": 6.55463474524248, + "grad_norm": 0.21791879832744598, + "learning_rate": 2.8035587652463046e-05, + "loss": 1.7312, + "step": 21355 + }, + { + "epoch": 6.554941682013506, + "grad_norm": 0.23205329477787018, + "learning_rate": 2.8031122477604505e-05, + "loss": 1.7166, + "step": 21356 + }, + { + "epoch": 6.55524861878453, + "grad_norm": 0.1910320371389389, + "learning_rate": 2.802665751985525e-05, + "loss": 1.694, + "step": 21357 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.24150735139846802, + "learning_rate": 2.8022192779259472e-05, + "loss": 1.7934, + "step": 21358 + }, 
+ { + "epoch": 6.555862492326581, + "grad_norm": 0.18308573961257935, + "learning_rate": 2.801772825586123e-05, + "loss": 1.6851, + "step": 21359 + }, + { + "epoch": 6.556169429097606, + "grad_norm": 0.28410083055496216, + "learning_rate": 2.8013263949704705e-05, + "loss": 1.7687, + "step": 21360 + }, + { + "epoch": 6.556476365868631, + "grad_norm": 0.21073146164417267, + "learning_rate": 2.8008799860833996e-05, + "loss": 1.711, + "step": 21361 + }, + { + "epoch": 6.556783302639657, + "grad_norm": 0.22758159041404724, + "learning_rate": 2.8004335989293213e-05, + "loss": 1.7495, + "step": 21362 + }, + { + "epoch": 6.557090239410681, + "grad_norm": 0.2112412452697754, + "learning_rate": 2.799987233512647e-05, + "loss": 1.7125, + "step": 21363 + }, + { + "epoch": 6.5573971761817065, + "grad_norm": 0.1804153323173523, + "learning_rate": 2.7995408898377884e-05, + "loss": 1.689, + "step": 21364 + }, + { + "epoch": 6.557704112952732, + "grad_norm": 0.17632657289505005, + "learning_rate": 2.7990945679091572e-05, + "loss": 1.6868, + "step": 21365 + }, + { + "epoch": 6.558011049723757, + "grad_norm": 0.17942996323108673, + "learning_rate": 2.7986482677311632e-05, + "loss": 1.7082, + "step": 21366 + }, + { + "epoch": 6.558317986494782, + "grad_norm": 0.278486967086792, + "learning_rate": 2.7982019893082167e-05, + "loss": 1.7909, + "step": 21367 + }, + { + "epoch": 6.558624923265807, + "grad_norm": 0.208990678191185, + "learning_rate": 2.797755732644729e-05, + "loss": 1.7643, + "step": 21368 + }, + { + "epoch": 6.558931860036832, + "grad_norm": 0.20375309884548187, + "learning_rate": 2.7973094977451096e-05, + "loss": 1.6957, + "step": 21369 + }, + { + "epoch": 6.559238796807858, + "grad_norm": 0.24685338139533997, + "learning_rate": 2.7968632846137694e-05, + "loss": 1.7574, + "step": 21370 + }, + { + "epoch": 6.559545733578883, + "grad_norm": 0.2237502634525299, + "learning_rate": 2.796417093255117e-05, + "loss": 1.7422, + "step": 21371 + }, + { + "epoch": 6.559852670349908, + 
"grad_norm": 0.22731846570968628, + "learning_rate": 2.795970923673561e-05, + "loss": 1.7594, + "step": 21372 + }, + { + "epoch": 6.560159607120933, + "grad_norm": 0.2518742084503174, + "learning_rate": 2.7955247758735158e-05, + "loss": 1.6817, + "step": 21373 + }, + { + "epoch": 6.560466543891958, + "grad_norm": 0.21982096135616302, + "learning_rate": 2.7950786498593827e-05, + "loss": 1.7289, + "step": 21374 + }, + { + "epoch": 6.560773480662983, + "grad_norm": 0.19061018526554108, + "learning_rate": 2.7946325456355787e-05, + "loss": 1.6809, + "step": 21375 + }, + { + "epoch": 6.561080417434009, + "grad_norm": 0.2023245394229889, + "learning_rate": 2.794186463206505e-05, + "loss": 1.7053, + "step": 21376 + }, + { + "epoch": 6.561387354205034, + "grad_norm": 0.18003186583518982, + "learning_rate": 2.7937404025765752e-05, + "loss": 1.6447, + "step": 21377 + }, + { + "epoch": 6.5616942909760585, + "grad_norm": 0.19133709371089935, + "learning_rate": 2.7932943637501956e-05, + "loss": 1.7677, + "step": 21378 + }, + { + "epoch": 6.562001227747084, + "grad_norm": 0.18476714193820953, + "learning_rate": 2.7928483467317746e-05, + "loss": 1.685, + "step": 21379 + }, + { + "epoch": 6.562308164518109, + "grad_norm": 0.2065780758857727, + "learning_rate": 2.79240235152572e-05, + "loss": 1.6827, + "step": 21380 + }, + { + "epoch": 6.5626151012891345, + "grad_norm": 0.1885409951210022, + "learning_rate": 2.79195637813644e-05, + "loss": 1.6819, + "step": 21381 + }, + { + "epoch": 6.56292203806016, + "grad_norm": 0.18055391311645508, + "learning_rate": 2.79151042656834e-05, + "loss": 1.7007, + "step": 21382 + }, + { + "epoch": 6.563228974831185, + "grad_norm": 0.25148439407348633, + "learning_rate": 2.7910644968258294e-05, + "loss": 1.7723, + "step": 21383 + }, + { + "epoch": 6.56353591160221, + "grad_norm": 0.2308066487312317, + "learning_rate": 2.7906185889133134e-05, + "loss": 1.7525, + "step": 21384 + }, + { + "epoch": 6.563842848373235, + "grad_norm": 0.19580784440040588, + 
"learning_rate": 2.7901727028351997e-05, + "loss": 1.7197, + "step": 21385 + }, + { + "epoch": 6.56414978514426, + "grad_norm": 0.19686979055404663, + "learning_rate": 2.7897268385958952e-05, + "loss": 1.6873, + "step": 21386 + }, + { + "epoch": 6.564456721915286, + "grad_norm": 0.2657351493835449, + "learning_rate": 2.7892809961998045e-05, + "loss": 1.7005, + "step": 21387 + }, + { + "epoch": 6.564763658686311, + "grad_norm": 0.20131130516529083, + "learning_rate": 2.7888351756513353e-05, + "loss": 1.7211, + "step": 21388 + }, + { + "epoch": 6.565070595457335, + "grad_norm": 0.2524282932281494, + "learning_rate": 2.7883893769548908e-05, + "loss": 1.7038, + "step": 21389 + }, + { + "epoch": 6.565377532228361, + "grad_norm": 0.1601654291152954, + "learning_rate": 2.787943600114883e-05, + "loss": 1.691, + "step": 21390 + }, + { + "epoch": 6.565684468999386, + "grad_norm": 0.25074124336242676, + "learning_rate": 2.787497845135709e-05, + "loss": 1.688, + "step": 21391 + }, + { + "epoch": 6.565991405770411, + "grad_norm": 0.19491349160671234, + "learning_rate": 2.787052112021782e-05, + "loss": 1.7108, + "step": 21392 + }, + { + "epoch": 6.566298342541437, + "grad_norm": 0.23931637406349182, + "learning_rate": 2.786606400777499e-05, + "loss": 1.7315, + "step": 21393 + }, + { + "epoch": 6.566605279312462, + "grad_norm": 0.1643616110086441, + "learning_rate": 2.786160711407271e-05, + "loss": 1.6745, + "step": 21394 + }, + { + "epoch": 6.5669122160834865, + "grad_norm": 0.17805394530296326, + "learning_rate": 2.7857150439155e-05, + "loss": 1.6817, + "step": 21395 + }, + { + "epoch": 6.567219152854512, + "grad_norm": 0.20370139181613922, + "learning_rate": 2.7852693983065913e-05, + "loss": 1.7173, + "step": 21396 + }, + { + "epoch": 6.567526089625537, + "grad_norm": 0.1620296984910965, + "learning_rate": 2.784823774584948e-05, + "loss": 1.7135, + "step": 21397 + }, + { + "epoch": 6.5678330263965625, + "grad_norm": 0.19116036593914032, + "learning_rate": 
2.7843781727549752e-05, + "loss": 1.6815, + "step": 21398 + }, + { + "epoch": 6.568139963167588, + "grad_norm": 0.20118895173072815, + "learning_rate": 2.7839325928210757e-05, + "loss": 1.7336, + "step": 21399 + }, + { + "epoch": 6.568446899938612, + "grad_norm": 0.198282390832901, + "learning_rate": 2.7834870347876528e-05, + "loss": 1.7379, + "step": 21400 + }, + { + "epoch": 6.568753836709638, + "grad_norm": 0.19203920662403107, + "learning_rate": 2.7830414986591104e-05, + "loss": 1.6913, + "step": 21401 + }, + { + "epoch": 6.569060773480663, + "grad_norm": 0.24601610004901886, + "learning_rate": 2.7825959844398507e-05, + "loss": 1.7842, + "step": 21402 + }, + { + "epoch": 6.569367710251688, + "grad_norm": 0.19069935381412506, + "learning_rate": 2.7821504921342777e-05, + "loss": 1.706, + "step": 21403 + }, + { + "epoch": 6.569674647022714, + "grad_norm": 0.20221085846424103, + "learning_rate": 2.7817050217467945e-05, + "loss": 1.7223, + "step": 21404 + }, + { + "epoch": 6.569981583793739, + "grad_norm": 0.2129664123058319, + "learning_rate": 2.781259573281801e-05, + "loss": 1.7429, + "step": 21405 + }, + { + "epoch": 6.570288520564763, + "grad_norm": 0.20684000849723816, + "learning_rate": 2.7808141467436993e-05, + "loss": 1.7349, + "step": 21406 + }, + { + "epoch": 6.570595457335789, + "grad_norm": 0.2153804898262024, + "learning_rate": 2.7803687421368968e-05, + "loss": 1.7245, + "step": 21407 + }, + { + "epoch": 6.570902394106814, + "grad_norm": 0.245448499917984, + "learning_rate": 2.7799233594657875e-05, + "loss": 1.7102, + "step": 21408 + }, + { + "epoch": 6.571209330877839, + "grad_norm": 0.18146783113479614, + "learning_rate": 2.7794779987347807e-05, + "loss": 1.6777, + "step": 21409 + }, + { + "epoch": 6.571516267648864, + "grad_norm": 0.21388854086399078, + "learning_rate": 2.7790326599482698e-05, + "loss": 1.7263, + "step": 21410 + }, + { + "epoch": 6.571823204419889, + "grad_norm": 0.2242165058851242, + "learning_rate": 2.7785873431106625e-05, + 
"loss": 1.7624, + "step": 21411 + }, + { + "epoch": 6.5721301411909145, + "grad_norm": 0.23132537305355072, + "learning_rate": 2.7781420482263565e-05, + "loss": 1.7013, + "step": 21412 + }, + { + "epoch": 6.57243707796194, + "grad_norm": 0.21074987947940826, + "learning_rate": 2.777696775299753e-05, + "loss": 1.7111, + "step": 21413 + }, + { + "epoch": 6.572744014732965, + "grad_norm": 0.2933674156665802, + "learning_rate": 2.7772515243352525e-05, + "loss": 1.7515, + "step": 21414 + }, + { + "epoch": 6.5730509515039905, + "grad_norm": 0.2100256085395813, + "learning_rate": 2.7768062953372552e-05, + "loss": 1.7425, + "step": 21415 + }, + { + "epoch": 6.573357888275015, + "grad_norm": 0.21765680611133575, + "learning_rate": 2.776361088310161e-05, + "loss": 1.7064, + "step": 21416 + }, + { + "epoch": 6.57366482504604, + "grad_norm": 0.205422043800354, + "learning_rate": 2.7759159032583702e-05, + "loss": 1.7458, + "step": 21417 + }, + { + "epoch": 6.573971761817066, + "grad_norm": 0.2009960114955902, + "learning_rate": 2.775470740186282e-05, + "loss": 1.7111, + "step": 21418 + }, + { + "epoch": 6.574278698588091, + "grad_norm": 0.18974804878234863, + "learning_rate": 2.7750255990982955e-05, + "loss": 1.7385, + "step": 21419 + }, + { + "epoch": 6.574585635359116, + "grad_norm": 0.1784054934978485, + "learning_rate": 2.7745804799988106e-05, + "loss": 1.7129, + "step": 21420 + }, + { + "epoch": 6.574892572130141, + "grad_norm": 0.2047782689332962, + "learning_rate": 2.7741353828922258e-05, + "loss": 1.6972, + "step": 21421 + }, + { + "epoch": 6.575199508901166, + "grad_norm": 0.18886682391166687, + "learning_rate": 2.773690307782939e-05, + "loss": 1.6564, + "step": 21422 + }, + { + "epoch": 6.5755064456721914, + "grad_norm": 0.2088952213525772, + "learning_rate": 2.7732452546753484e-05, + "loss": 1.7309, + "step": 21423 + }, + { + "epoch": 6.575813382443217, + "grad_norm": 0.20526883006095886, + "learning_rate": 2.7728002235738565e-05, + "loss": 1.6811, + "step": 21424 + 
}, + { + "epoch": 6.576120319214242, + "grad_norm": 0.19648446142673492, + "learning_rate": 2.7723552144828545e-05, + "loss": 1.7237, + "step": 21425 + }, + { + "epoch": 6.5764272559852675, + "grad_norm": 0.22405673563480377, + "learning_rate": 2.7719102274067484e-05, + "loss": 1.7454, + "step": 21426 + }, + { + "epoch": 6.576734192756292, + "grad_norm": 0.24119171500205994, + "learning_rate": 2.7714652623499265e-05, + "loss": 1.7106, + "step": 21427 + }, + { + "epoch": 6.577041129527317, + "grad_norm": 0.2127196192741394, + "learning_rate": 2.771020319316794e-05, + "loss": 1.7895, + "step": 21428 + }, + { + "epoch": 6.577348066298343, + "grad_norm": 0.23805706202983856, + "learning_rate": 2.7705753983117443e-05, + "loss": 1.739, + "step": 21429 + }, + { + "epoch": 6.577655003069368, + "grad_norm": 0.24212954938411713, + "learning_rate": 2.7701304993391753e-05, + "loss": 1.683, + "step": 21430 + }, + { + "epoch": 6.577961939840393, + "grad_norm": 0.1946132481098175, + "learning_rate": 2.769685622403484e-05, + "loss": 1.6953, + "step": 21431 + }, + { + "epoch": 6.578268876611418, + "grad_norm": 0.2465951144695282, + "learning_rate": 2.769240767509067e-05, + "loss": 1.6594, + "step": 21432 + }, + { + "epoch": 6.578575813382443, + "grad_norm": 0.17029622197151184, + "learning_rate": 2.76879593466032e-05, + "loss": 1.6977, + "step": 21433 + }, + { + "epoch": 6.578882750153468, + "grad_norm": 0.23793117702007294, + "learning_rate": 2.7683511238616388e-05, + "loss": 1.6709, + "step": 21434 + }, + { + "epoch": 6.579189686924494, + "grad_norm": 0.20149341225624084, + "learning_rate": 2.76790633511742e-05, + "loss": 1.8074, + "step": 21435 + }, + { + "epoch": 6.579496623695519, + "grad_norm": 0.25029948353767395, + "learning_rate": 2.7674615684320593e-05, + "loss": 1.6649, + "step": 21436 + }, + { + "epoch": 6.579803560466544, + "grad_norm": 0.22212490439414978, + "learning_rate": 2.7670168238099515e-05, + "loss": 1.7322, + "step": 21437 + }, + { + "epoch": 
6.580110497237569, + "grad_norm": 0.26087918877601624, + "learning_rate": 2.7665721012554925e-05, + "loss": 1.7285, + "step": 21438 + }, + { + "epoch": 6.580417434008594, + "grad_norm": 0.19286726415157318, + "learning_rate": 2.7661274007730776e-05, + "loss": 1.6912, + "step": 21439 + }, + { + "epoch": 6.5807243707796195, + "grad_norm": 0.23935118317604065, + "learning_rate": 2.7656827223670982e-05, + "loss": 1.6929, + "step": 21440 + }, + { + "epoch": 6.581031307550645, + "grad_norm": 0.2263423204421997, + "learning_rate": 2.7652380660419563e-05, + "loss": 1.6786, + "step": 21441 + }, + { + "epoch": 6.581338244321669, + "grad_norm": 0.19788038730621338, + "learning_rate": 2.7647934318020373e-05, + "loss": 1.7906, + "step": 21442 + }, + { + "epoch": 6.581645181092695, + "grad_norm": 0.25891759991645813, + "learning_rate": 2.7643488196517435e-05, + "loss": 1.7691, + "step": 21443 + }, + { + "epoch": 6.58195211786372, + "grad_norm": 0.25175485014915466, + "learning_rate": 2.7639042295954615e-05, + "loss": 1.7329, + "step": 21444 + }, + { + "epoch": 6.582259054634745, + "grad_norm": 0.1860336810350418, + "learning_rate": 2.7634596616375908e-05, + "loss": 1.7348, + "step": 21445 + }, + { + "epoch": 6.582565991405771, + "grad_norm": 0.2704271972179413, + "learning_rate": 2.7630151157825218e-05, + "loss": 1.7199, + "step": 21446 + }, + { + "epoch": 6.582872928176796, + "grad_norm": 0.16306720674037933, + "learning_rate": 2.762570592034649e-05, + "loss": 1.7174, + "step": 21447 + }, + { + "epoch": 6.58317986494782, + "grad_norm": 0.2585636079311371, + "learning_rate": 2.7621260903983648e-05, + "loss": 1.7392, + "step": 21448 + }, + { + "epoch": 6.583486801718846, + "grad_norm": 0.2086072564125061, + "learning_rate": 2.7616816108780623e-05, + "loss": 1.7417, + "step": 21449 + }, + { + "epoch": 6.583793738489871, + "grad_norm": 0.1747613251209259, + "learning_rate": 2.7612371534781343e-05, + "loss": 1.6607, + "step": 21450 + }, + { + "epoch": 6.584100675260896, + 
"grad_norm": 0.21026404201984406, + "learning_rate": 2.7607927182029726e-05, + "loss": 1.7725, + "step": 21451 + }, + { + "epoch": 6.584407612031922, + "grad_norm": 0.17881789803504944, + "learning_rate": 2.76034830505697e-05, + "loss": 1.7502, + "step": 21452 + }, + { + "epoch": 6.584714548802946, + "grad_norm": 0.2503713369369507, + "learning_rate": 2.7599039140445182e-05, + "loss": 1.798, + "step": 21453 + }, + { + "epoch": 6.5850214855739715, + "grad_norm": 0.22163939476013184, + "learning_rate": 2.7594595451700083e-05, + "loss": 1.725, + "step": 21454 + }, + { + "epoch": 6.585328422344997, + "grad_norm": 0.2154664546251297, + "learning_rate": 2.759015198437833e-05, + "loss": 1.7917, + "step": 21455 + }, + { + "epoch": 6.585635359116022, + "grad_norm": 0.1814090609550476, + "learning_rate": 2.7585708738523823e-05, + "loss": 1.6562, + "step": 21456 + }, + { + "epoch": 6.5859422958870475, + "grad_norm": 0.18815121054649353, + "learning_rate": 2.758126571418049e-05, + "loss": 1.6833, + "step": 21457 + }, + { + "epoch": 6.586249232658073, + "grad_norm": 0.19383473694324493, + "learning_rate": 2.757682291139222e-05, + "loss": 1.6987, + "step": 21458 + }, + { + "epoch": 6.586556169429097, + "grad_norm": 0.19574831426143646, + "learning_rate": 2.7572380330202912e-05, + "loss": 1.7231, + "step": 21459 + }, + { + "epoch": 6.586863106200123, + "grad_norm": 0.17509032785892487, + "learning_rate": 2.7567937970656527e-05, + "loss": 1.6452, + "step": 21460 + }, + { + "epoch": 6.587170042971148, + "grad_norm": 0.19439785182476044, + "learning_rate": 2.7563495832796886e-05, + "loss": 1.7168, + "step": 21461 + }, + { + "epoch": 6.587476979742173, + "grad_norm": 0.17384520173072815, + "learning_rate": 2.7559053916667953e-05, + "loss": 1.7128, + "step": 21462 + }, + { + "epoch": 6.587783916513199, + "grad_norm": 0.18308506906032562, + "learning_rate": 2.7554612222313597e-05, + "loss": 1.7184, + "step": 21463 + }, + { + "epoch": 6.588090853284223, + "grad_norm": 
0.20052805542945862, + "learning_rate": 2.7550170749777726e-05, + "loss": 1.7239, + "step": 21464 + }, + { + "epoch": 6.588397790055248, + "grad_norm": 0.21892015635967255, + "learning_rate": 2.7545729499104215e-05, + "loss": 1.7297, + "step": 21465 + }, + { + "epoch": 6.588704726826274, + "grad_norm": 0.19819483160972595, + "learning_rate": 2.7541288470336973e-05, + "loss": 1.7303, + "step": 21466 + }, + { + "epoch": 6.589011663597299, + "grad_norm": 0.24296818673610687, + "learning_rate": 2.7536847663519884e-05, + "loss": 1.8525, + "step": 21467 + }, + { + "epoch": 6.589318600368324, + "grad_norm": 0.1971593201160431, + "learning_rate": 2.753240707869683e-05, + "loss": 1.7396, + "step": 21468 + }, + { + "epoch": 6.58962553713935, + "grad_norm": 0.24418935179710388, + "learning_rate": 2.7527966715911696e-05, + "loss": 1.7414, + "step": 21469 + }, + { + "epoch": 6.589932473910374, + "grad_norm": 0.2193990796804428, + "learning_rate": 2.7523526575208368e-05, + "loss": 1.7243, + "step": 21470 + }, + { + "epoch": 6.5902394106813995, + "grad_norm": 0.23612114787101746, + "learning_rate": 2.7519086656630722e-05, + "loss": 1.7072, + "step": 21471 + }, + { + "epoch": 6.590546347452425, + "grad_norm": 0.22282655537128448, + "learning_rate": 2.751464696022264e-05, + "loss": 1.7423, + "step": 21472 + }, + { + "epoch": 6.59085328422345, + "grad_norm": 0.21411976218223572, + "learning_rate": 2.7510207486027995e-05, + "loss": 1.7397, + "step": 21473 + }, + { + "epoch": 6.5911602209944755, + "grad_norm": 0.2244768589735031, + "learning_rate": 2.7505768234090663e-05, + "loss": 1.6964, + "step": 21474 + }, + { + "epoch": 6.5914671577655, + "grad_norm": 0.2250032275915146, + "learning_rate": 2.7501329204454512e-05, + "loss": 1.7307, + "step": 21475 + }, + { + "epoch": 6.591774094536525, + "grad_norm": 0.2643435299396515, + "learning_rate": 2.7496890397163395e-05, + "loss": 1.7298, + "step": 21476 + }, + { + "epoch": 6.592081031307551, + "grad_norm": 0.2204463928937912, + 
"learning_rate": 2.7492451812261232e-05, + "loss": 1.723, + "step": 21477 + }, + { + "epoch": 6.592387968078576, + "grad_norm": 0.2278377115726471, + "learning_rate": 2.7488013449791816e-05, + "loss": 1.7597, + "step": 21478 + }, + { + "epoch": 6.592694904849601, + "grad_norm": 0.18430690467357635, + "learning_rate": 2.7483575309799086e-05, + "loss": 1.6314, + "step": 21479 + }, + { + "epoch": 6.593001841620627, + "grad_norm": 0.26019781827926636, + "learning_rate": 2.7479137392326827e-05, + "loss": 1.7362, + "step": 21480 + }, + { + "epoch": 6.593308778391651, + "grad_norm": 0.2103995382785797, + "learning_rate": 2.7474699697418936e-05, + "loss": 1.7137, + "step": 21481 + }, + { + "epoch": 6.593615715162676, + "grad_norm": 0.220427006483078, + "learning_rate": 2.747026222511928e-05, + "loss": 1.7323, + "step": 21482 + }, + { + "epoch": 6.593922651933702, + "grad_norm": 0.21523109078407288, + "learning_rate": 2.7465824975471693e-05, + "loss": 1.7572, + "step": 21483 + }, + { + "epoch": 6.594229588704727, + "grad_norm": 0.21639512479305267, + "learning_rate": 2.7461387948520033e-05, + "loss": 1.7275, + "step": 21484 + }, + { + "epoch": 6.5945365254757515, + "grad_norm": 0.2043544203042984, + "learning_rate": 2.7456951144308147e-05, + "loss": 1.7454, + "step": 21485 + }, + { + "epoch": 6.594843462246777, + "grad_norm": 0.17847217619419098, + "learning_rate": 2.7452514562879882e-05, + "loss": 1.7356, + "step": 21486 + }, + { + "epoch": 6.595150399017802, + "grad_norm": 0.20756758749485016, + "learning_rate": 2.744807820427908e-05, + "loss": 1.7557, + "step": 21487 + }, + { + "epoch": 6.5954573357888275, + "grad_norm": 0.23579071462154388, + "learning_rate": 2.744364206854959e-05, + "loss": 1.7855, + "step": 21488 + }, + { + "epoch": 6.595764272559853, + "grad_norm": 0.1947307586669922, + "learning_rate": 2.7439206155735254e-05, + "loss": 1.7105, + "step": 21489 + }, + { + "epoch": 6.596071209330878, + "grad_norm": 0.1900642365217209, + "learning_rate": 
2.74347704658799e-05, + "loss": 1.6692, + "step": 21490 + }, + { + "epoch": 6.596378146101903, + "grad_norm": 0.16756244003772736, + "learning_rate": 2.7430334999027375e-05, + "loss": 1.7175, + "step": 21491 + }, + { + "epoch": 6.596685082872928, + "grad_norm": 0.18581146001815796, + "learning_rate": 2.7425899755221506e-05, + "loss": 1.72, + "step": 21492 + }, + { + "epoch": 6.596992019643953, + "grad_norm": 0.2384853959083557, + "learning_rate": 2.7421464734506107e-05, + "loss": 1.718, + "step": 21493 + }, + { + "epoch": 6.597298956414979, + "grad_norm": 0.16853606700897217, + "learning_rate": 2.7417029936925065e-05, + "loss": 1.6819, + "step": 21494 + }, + { + "epoch": 6.597605893186004, + "grad_norm": 0.2273230254650116, + "learning_rate": 2.741259536252213e-05, + "loss": 1.7158, + "step": 21495 + }, + { + "epoch": 6.597912829957028, + "grad_norm": 0.2291530966758728, + "learning_rate": 2.7408161011341205e-05, + "loss": 1.7804, + "step": 21496 + }, + { + "epoch": 6.598219766728054, + "grad_norm": 0.17676831781864166, + "learning_rate": 2.740372688342604e-05, + "loss": 1.6693, + "step": 21497 + }, + { + "epoch": 6.598526703499079, + "grad_norm": 0.2386767417192459, + "learning_rate": 2.7399292978820508e-05, + "loss": 1.6932, + "step": 21498 + }, + { + "epoch": 6.598833640270104, + "grad_norm": 0.21329782903194427, + "learning_rate": 2.739485929756841e-05, + "loss": 1.7811, + "step": 21499 + }, + { + "epoch": 6.59914057704113, + "grad_norm": 0.19382116198539734, + "learning_rate": 2.7390425839713556e-05, + "loss": 1.7152, + "step": 21500 + }, + { + "epoch": 6.599447513812155, + "grad_norm": 0.1819920688867569, + "learning_rate": 2.738599260529977e-05, + "loss": 1.6571, + "step": 21501 + }, + { + "epoch": 6.5997544505831796, + "grad_norm": 0.19947806000709534, + "learning_rate": 2.738155959437086e-05, + "loss": 1.7138, + "step": 21502 + }, + { + "epoch": 6.600061387354205, + "grad_norm": 0.1851014792919159, + "learning_rate": 2.7377126806970634e-05, + "loss": 
1.7109, + "step": 21503 + }, + { + "epoch": 6.60036832412523, + "grad_norm": 0.20365974307060242, + "learning_rate": 2.7372694243142905e-05, + "loss": 1.7145, + "step": 21504 + }, + { + "epoch": 6.600675260896256, + "grad_norm": 0.2070893943309784, + "learning_rate": 2.736826190293147e-05, + "loss": 1.7172, + "step": 21505 + }, + { + "epoch": 6.600982197667281, + "grad_norm": 0.19077777862548828, + "learning_rate": 2.7363829786380136e-05, + "loss": 1.7059, + "step": 21506 + }, + { + "epoch": 6.601289134438305, + "grad_norm": 0.21168744564056396, + "learning_rate": 2.73593978935327e-05, + "loss": 1.7483, + "step": 21507 + }, + { + "epoch": 6.601596071209331, + "grad_norm": 0.20746631920337677, + "learning_rate": 2.7354966224432965e-05, + "loss": 1.7165, + "step": 21508 + }, + { + "epoch": 6.601903007980356, + "grad_norm": 0.19440631568431854, + "learning_rate": 2.7350534779124732e-05, + "loss": 1.694, + "step": 21509 + }, + { + "epoch": 6.602209944751381, + "grad_norm": 0.20699405670166016, + "learning_rate": 2.7346103557651765e-05, + "loss": 1.7077, + "step": 21510 + }, + { + "epoch": 6.602516881522407, + "grad_norm": 0.19856512546539307, + "learning_rate": 2.7341672560057917e-05, + "loss": 1.77, + "step": 21511 + }, + { + "epoch": 6.602823818293432, + "grad_norm": 0.23978421092033386, + "learning_rate": 2.7337241786386915e-05, + "loss": 1.7531, + "step": 21512 + }, + { + "epoch": 6.6031307550644565, + "grad_norm": 0.1834867000579834, + "learning_rate": 2.73328112366826e-05, + "loss": 1.751, + "step": 21513 + }, + { + "epoch": 6.603437691835482, + "grad_norm": 0.2154606282711029, + "learning_rate": 2.7328380910988694e-05, + "loss": 1.737, + "step": 21514 + }, + { + "epoch": 6.603744628606507, + "grad_norm": 0.20554645359516144, + "learning_rate": 2.7323950809349035e-05, + "loss": 1.7629, + "step": 21515 + }, + { + "epoch": 6.6040515653775325, + "grad_norm": 0.20497548580169678, + "learning_rate": 2.7319520931807386e-05, + "loss": 1.7001, + "step": 21516 + }, + { + 
"epoch": 6.604358502148557, + "grad_norm": 0.18628253042697906, + "learning_rate": 2.7315091278407523e-05, + "loss": 1.7477, + "step": 21517 + }, + { + "epoch": 6.604665438919582, + "grad_norm": 0.20788705348968506, + "learning_rate": 2.731066184919323e-05, + "loss": 1.7185, + "step": 21518 + }, + { + "epoch": 6.604972375690608, + "grad_norm": 0.17834967374801636, + "learning_rate": 2.730623264420827e-05, + "loss": 1.67, + "step": 21519 + }, + { + "epoch": 6.605279312461633, + "grad_norm": 0.2183784693479538, + "learning_rate": 2.7301803663496417e-05, + "loss": 1.6983, + "step": 21520 + }, + { + "epoch": 6.605586249232658, + "grad_norm": 0.1735544204711914, + "learning_rate": 2.7297374907101447e-05, + "loss": 1.7352, + "step": 21521 + }, + { + "epoch": 6.605893186003684, + "grad_norm": 0.2504538893699646, + "learning_rate": 2.729294637506713e-05, + "loss": 1.7332, + "step": 21522 + }, + { + "epoch": 6.606200122774708, + "grad_norm": 0.1801074892282486, + "learning_rate": 2.728851806743722e-05, + "loss": 1.7251, + "step": 21523 + }, + { + "epoch": 6.606507059545733, + "grad_norm": 0.25701379776000977, + "learning_rate": 2.728408998425549e-05, + "loss": 1.732, + "step": 21524 + }, + { + "epoch": 6.606813996316759, + "grad_norm": 0.1801779717206955, + "learning_rate": 2.7279662125565697e-05, + "loss": 1.6793, + "step": 21525 + }, + { + "epoch": 6.607120933087784, + "grad_norm": 0.21244947612285614, + "learning_rate": 2.7275234491411595e-05, + "loss": 1.7493, + "step": 21526 + }, + { + "epoch": 6.607427869858809, + "grad_norm": 0.20944559574127197, + "learning_rate": 2.7270807081836924e-05, + "loss": 1.722, + "step": 21527 + }, + { + "epoch": 6.607734806629834, + "grad_norm": 0.2526783049106598, + "learning_rate": 2.7266379896885508e-05, + "loss": 1.7628, + "step": 21528 + }, + { + "epoch": 6.608041743400859, + "grad_norm": 0.19788937270641327, + "learning_rate": 2.7261952936601002e-05, + "loss": 1.6538, + "step": 21529 + }, + { + "epoch": 6.6083486801718845, + 
"grad_norm": 0.2623229920864105, + "learning_rate": 2.725752620102725e-05, + "loss": 1.7694, + "step": 21530 + }, + { + "epoch": 6.60865561694291, + "grad_norm": 0.21503256261348724, + "learning_rate": 2.7253099690207913e-05, + "loss": 1.7553, + "step": 21531 + }, + { + "epoch": 6.608962553713935, + "grad_norm": 0.2114928811788559, + "learning_rate": 2.724867340418679e-05, + "loss": 1.7067, + "step": 21532 + }, + { + "epoch": 6.6092694904849605, + "grad_norm": 0.17945198714733124, + "learning_rate": 2.7244247343007623e-05, + "loss": 1.7419, + "step": 21533 + }, + { + "epoch": 6.609576427255985, + "grad_norm": 0.19239214062690735, + "learning_rate": 2.7239821506714137e-05, + "loss": 1.7644, + "step": 21534 + }, + { + "epoch": 6.60988336402701, + "grad_norm": 0.22906997799873352, + "learning_rate": 2.7235395895350068e-05, + "loss": 1.8063, + "step": 21535 + }, + { + "epoch": 6.610190300798036, + "grad_norm": 0.1965717375278473, + "learning_rate": 2.7230970508959162e-05, + "loss": 1.7841, + "step": 21536 + }, + { + "epoch": 6.610497237569061, + "grad_norm": 0.19944418966770172, + "learning_rate": 2.7226545347585158e-05, + "loss": 1.7382, + "step": 21537 + }, + { + "epoch": 6.610804174340086, + "grad_norm": 0.17155805230140686, + "learning_rate": 2.722212041127178e-05, + "loss": 1.6621, + "step": 21538 + }, + { + "epoch": 6.611111111111111, + "grad_norm": 0.20459938049316406, + "learning_rate": 2.721769570006275e-05, + "loss": 1.7481, + "step": 21539 + }, + { + "epoch": 6.611418047882136, + "grad_norm": 0.1991354376077652, + "learning_rate": 2.7213271214001813e-05, + "loss": 1.7874, + "step": 21540 + }, + { + "epoch": 6.611724984653161, + "grad_norm": 0.25073128938674927, + "learning_rate": 2.7208846953132682e-05, + "loss": 1.7921, + "step": 21541 + }, + { + "epoch": 6.612031921424187, + "grad_norm": 0.24456258118152618, + "learning_rate": 2.7204422917499085e-05, + "loss": 1.7564, + "step": 21542 + }, + { + "epoch": 6.612338858195212, + "grad_norm": 
0.18416531383991241, + "learning_rate": 2.7199999107144736e-05, + "loss": 1.7247, + "step": 21543 + }, + { + "epoch": 6.612645794966237, + "grad_norm": 0.18439221382141113, + "learning_rate": 2.7195575522113347e-05, + "loss": 1.6607, + "step": 21544 + }, + { + "epoch": 6.612952731737262, + "grad_norm": 0.20334671437740326, + "learning_rate": 2.7191152162448685e-05, + "loss": 1.7487, + "step": 21545 + }, + { + "epoch": 6.613259668508287, + "grad_norm": 0.17871633172035217, + "learning_rate": 2.718672902819438e-05, + "loss": 1.7355, + "step": 21546 + }, + { + "epoch": 6.6135666052793125, + "grad_norm": 0.23006688058376312, + "learning_rate": 2.718230611939424e-05, + "loss": 1.6489, + "step": 21547 + }, + { + "epoch": 6.613873542050338, + "grad_norm": 0.19141538441181183, + "learning_rate": 2.7177883436091877e-05, + "loss": 1.6793, + "step": 21548 + }, + { + "epoch": 6.614180478821363, + "grad_norm": 0.20549756288528442, + "learning_rate": 2.7173460978331068e-05, + "loss": 1.8331, + "step": 21549 + }, + { + "epoch": 6.614487415592388, + "grad_norm": 0.19106455147266388, + "learning_rate": 2.7169038746155495e-05, + "loss": 1.7295, + "step": 21550 + }, + { + "epoch": 6.614794352363413, + "grad_norm": 0.20190143585205078, + "learning_rate": 2.7164616739608866e-05, + "loss": 1.7032, + "step": 21551 + }, + { + "epoch": 6.615101289134438, + "grad_norm": 0.1969708949327469, + "learning_rate": 2.716019495873488e-05, + "loss": 1.6935, + "step": 21552 + }, + { + "epoch": 6.615408225905464, + "grad_norm": 0.23748311400413513, + "learning_rate": 2.7155773403577235e-05, + "loss": 1.7942, + "step": 21553 + }, + { + "epoch": 6.615715162676489, + "grad_norm": 0.29168081283569336, + "learning_rate": 2.715135207417962e-05, + "loss": 1.7121, + "step": 21554 + }, + { + "epoch": 6.616022099447514, + "grad_norm": 0.2428344041109085, + "learning_rate": 2.7146930970585738e-05, + "loss": 1.7287, + "step": 21555 + }, + { + "epoch": 6.616329036218539, + "grad_norm": 0.2520657479763031, + 
"learning_rate": 2.714251009283928e-05, + "loss": 1.8462, + "step": 21556 + }, + { + "epoch": 6.616635972989564, + "grad_norm": 0.2426053285598755, + "learning_rate": 2.713808944098394e-05, + "loss": 1.7094, + "step": 21557 + }, + { + "epoch": 6.616942909760589, + "grad_norm": 0.17593255639076233, + "learning_rate": 2.713366901506339e-05, + "loss": 1.6891, + "step": 21558 + }, + { + "epoch": 6.617249846531615, + "grad_norm": 0.20620940625667572, + "learning_rate": 2.7129248815121332e-05, + "loss": 1.7277, + "step": 21559 + }, + { + "epoch": 6.617556783302639, + "grad_norm": 0.21467719972133636, + "learning_rate": 2.7124828841201445e-05, + "loss": 1.7543, + "step": 21560 + }, + { + "epoch": 6.6178637200736645, + "grad_norm": 0.21372607350349426, + "learning_rate": 2.7120409093347378e-05, + "loss": 1.7207, + "step": 21561 + }, + { + "epoch": 6.61817065684469, + "grad_norm": 0.2123684585094452, + "learning_rate": 2.7115989571602884e-05, + "loss": 1.71, + "step": 21562 + }, + { + "epoch": 6.618477593615715, + "grad_norm": 0.19155478477478027, + "learning_rate": 2.711157027601155e-05, + "loss": 1.7182, + "step": 21563 + }, + { + "epoch": 6.6187845303867405, + "grad_norm": 0.23053184151649475, + "learning_rate": 2.7107151206617136e-05, + "loss": 1.7147, + "step": 21564 + }, + { + "epoch": 6.619091467157766, + "grad_norm": 0.1635691374540329, + "learning_rate": 2.7102732363463235e-05, + "loss": 1.6913, + "step": 21565 + }, + { + "epoch": 6.61939840392879, + "grad_norm": 0.19415298104286194, + "learning_rate": 2.709831374659357e-05, + "loss": 1.6813, + "step": 21566 + }, + { + "epoch": 6.619705340699816, + "grad_norm": 0.19547943770885468, + "learning_rate": 2.709389535605179e-05, + "loss": 1.6988, + "step": 21567 + }, + { + "epoch": 6.620012277470841, + "grad_norm": 0.1921805888414383, + "learning_rate": 2.7089477191881564e-05, + "loss": 1.6931, + "step": 21568 + }, + { + "epoch": 6.620319214241866, + "grad_norm": 0.18463274836540222, + "learning_rate": 
2.7085059254126554e-05, + "loss": 1.7168, + "step": 21569 + }, + { + "epoch": 6.620626151012892, + "grad_norm": 0.2078532725572586, + "learning_rate": 2.7080641542830414e-05, + "loss": 1.7248, + "step": 21570 + }, + { + "epoch": 6.620933087783916, + "grad_norm": 0.18778283894062042, + "learning_rate": 2.7076224058036813e-05, + "loss": 1.6745, + "step": 21571 + }, + { + "epoch": 6.621240024554941, + "grad_norm": 0.26190707087516785, + "learning_rate": 2.70718067997894e-05, + "loss": 1.7317, + "step": 21572 + }, + { + "epoch": 6.621546961325967, + "grad_norm": 0.20449557900428772, + "learning_rate": 2.7067389768131836e-05, + "loss": 1.7167, + "step": 21573 + }, + { + "epoch": 6.621853898096992, + "grad_norm": 0.22722119092941284, + "learning_rate": 2.706297296310776e-05, + "loss": 1.7262, + "step": 21574 + }, + { + "epoch": 6.622160834868017, + "grad_norm": 0.24897173047065735, + "learning_rate": 2.7058556384760825e-05, + "loss": 1.7273, + "step": 21575 + }, + { + "epoch": 6.622467771639043, + "grad_norm": 0.19774340093135834, + "learning_rate": 2.705414003313469e-05, + "loss": 1.6765, + "step": 21576 + }, + { + "epoch": 6.622774708410067, + "grad_norm": 0.2661767303943634, + "learning_rate": 2.7049723908272995e-05, + "loss": 1.7046, + "step": 21577 + }, + { + "epoch": 6.6230816451810925, + "grad_norm": 0.2013266384601593, + "learning_rate": 2.7045308010219356e-05, + "loss": 1.7156, + "step": 21578 + }, + { + "epoch": 6.623388581952118, + "grad_norm": 0.22952915728092194, + "learning_rate": 2.7040892339017475e-05, + "loss": 1.7601, + "step": 21579 + }, + { + "epoch": 6.623695518723143, + "grad_norm": 0.18262411653995514, + "learning_rate": 2.7036476894710916e-05, + "loss": 1.7334, + "step": 21580 + }, + { + "epoch": 6.6240024554941686, + "grad_norm": 0.18907666206359863, + "learning_rate": 2.703206167734339e-05, + "loss": 1.7196, + "step": 21581 + }, + { + "epoch": 6.624309392265193, + "grad_norm": 0.2192571759223938, + "learning_rate": 2.7027646686958453e-05, + 
"loss": 1.7046, + "step": 21582 + }, + { + "epoch": 6.624616329036218, + "grad_norm": 0.165769562125206, + "learning_rate": 2.70232319235998e-05, + "loss": 1.7028, + "step": 21583 + }, + { + "epoch": 6.624923265807244, + "grad_norm": 0.19245828688144684, + "learning_rate": 2.701881738731103e-05, + "loss": 1.7153, + "step": 21584 + }, + { + "epoch": 6.625230202578269, + "grad_norm": 0.17638756334781647, + "learning_rate": 2.7014403078135776e-05, + "loss": 1.7071, + "step": 21585 + }, + { + "epoch": 6.625537139349294, + "grad_norm": 0.17205210030078888, + "learning_rate": 2.700998899611767e-05, + "loss": 1.6706, + "step": 21586 + }, + { + "epoch": 6.62584407612032, + "grad_norm": 0.24107681214809418, + "learning_rate": 2.700557514130032e-05, + "loss": 1.8013, + "step": 21587 + }, + { + "epoch": 6.626151012891344, + "grad_norm": 0.1839917004108429, + "learning_rate": 2.7001161513727358e-05, + "loss": 1.7381, + "step": 21588 + }, + { + "epoch": 6.6264579496623695, + "grad_norm": 0.24043352901935577, + "learning_rate": 2.6996748113442394e-05, + "loss": 1.7523, + "step": 21589 + }, + { + "epoch": 6.626764886433395, + "grad_norm": 0.23488068580627441, + "learning_rate": 2.6992334940489056e-05, + "loss": 1.7587, + "step": 21590 + }, + { + "epoch": 6.62707182320442, + "grad_norm": 0.18784530460834503, + "learning_rate": 2.698792199491094e-05, + "loss": 1.7053, + "step": 21591 + }, + { + "epoch": 6.627378759975445, + "grad_norm": 0.2758429944515228, + "learning_rate": 2.6983509276751673e-05, + "loss": 1.6927, + "step": 21592 + }, + { + "epoch": 6.62768569674647, + "grad_norm": 0.2731272280216217, + "learning_rate": 2.697909678605486e-05, + "loss": 1.7351, + "step": 21593 + }, + { + "epoch": 6.627992633517495, + "grad_norm": 0.24450576305389404, + "learning_rate": 2.6974684522864098e-05, + "loss": 1.7126, + "step": 21594 + }, + { + "epoch": 6.628299570288521, + "grad_norm": 0.21820391714572906, + "learning_rate": 2.6970272487222982e-05, + "loss": 1.7075, + "step": 21595 + }, 
+ { + "epoch": 6.628606507059546, + "grad_norm": 0.23647959530353546, + "learning_rate": 2.696586067917517e-05, + "loss": 1.7369, + "step": 21596 + }, + { + "epoch": 6.628913443830571, + "grad_norm": 0.2665121555328369, + "learning_rate": 2.696144909876419e-05, + "loss": 1.7575, + "step": 21597 + }, + { + "epoch": 6.629220380601596, + "grad_norm": 0.19871680438518524, + "learning_rate": 2.695703774603371e-05, + "loss": 1.7334, + "step": 21598 + }, + { + "epoch": 6.629527317372621, + "grad_norm": 0.2363109588623047, + "learning_rate": 2.6952626621027245e-05, + "loss": 1.6878, + "step": 21599 + }, + { + "epoch": 6.629834254143646, + "grad_norm": 0.21958591043949127, + "learning_rate": 2.694821572378845e-05, + "loss": 1.6828, + "step": 21600 + }, + { + "epoch": 6.630141190914672, + "grad_norm": 0.20437858998775482, + "learning_rate": 2.6943805054360906e-05, + "loss": 1.7138, + "step": 21601 + }, + { + "epoch": 6.630448127685697, + "grad_norm": 0.27741923928260803, + "learning_rate": 2.6939394612788193e-05, + "loss": 1.7506, + "step": 21602 + }, + { + "epoch": 6.6307550644567215, + "grad_norm": 0.1885133981704712, + "learning_rate": 2.6934984399113917e-05, + "loss": 1.7669, + "step": 21603 + }, + { + "epoch": 6.631062001227747, + "grad_norm": 0.19453810155391693, + "learning_rate": 2.6930574413381604e-05, + "loss": 1.6837, + "step": 21604 + }, + { + "epoch": 6.631368937998772, + "grad_norm": 0.1685735285282135, + "learning_rate": 2.6926164655634894e-05, + "loss": 1.7045, + "step": 21605 + }, + { + "epoch": 6.6316758747697975, + "grad_norm": 0.2507462203502655, + "learning_rate": 2.6921755125917347e-05, + "loss": 1.7754, + "step": 21606 + }, + { + "epoch": 6.631982811540823, + "grad_norm": 0.1725471317768097, + "learning_rate": 2.691734582427255e-05, + "loss": 1.7219, + "step": 21607 + }, + { + "epoch": 6.632289748311848, + "grad_norm": 0.2633528709411621, + "learning_rate": 2.6912936750744068e-05, + "loss": 1.7362, + "step": 21608 + }, + { + "epoch": 6.632596685082873, 
+ "grad_norm": 0.1808360069990158, + "learning_rate": 2.6908527905375474e-05, + "loss": 1.7338, + "step": 21609 + }, + { + "epoch": 6.632903621853898, + "grad_norm": 0.16186563670635223, + "learning_rate": 2.6904119288210344e-05, + "loss": 1.6752, + "step": 21610 + }, + { + "epoch": 6.633210558624923, + "grad_norm": 0.1954091340303421, + "learning_rate": 2.689971089929224e-05, + "loss": 1.714, + "step": 21611 + }, + { + "epoch": 6.633517495395949, + "grad_norm": 0.18954069912433624, + "learning_rate": 2.689530273866474e-05, + "loss": 1.7869, + "step": 21612 + }, + { + "epoch": 6.633824432166974, + "grad_norm": 0.182058185338974, + "learning_rate": 2.6890894806371392e-05, + "loss": 1.7708, + "step": 21613 + }, + { + "epoch": 6.634131368937998, + "grad_norm": 0.17313501238822937, + "learning_rate": 2.6886487102455755e-05, + "loss": 1.7064, + "step": 21614 + }, + { + "epoch": 6.634438305709024, + "grad_norm": 0.1732148379087448, + "learning_rate": 2.688207962696143e-05, + "loss": 1.7378, + "step": 21615 + }, + { + "epoch": 6.634745242480049, + "grad_norm": 0.17057274281978607, + "learning_rate": 2.687767237993191e-05, + "loss": 1.671, + "step": 21616 + }, + { + "epoch": 6.635052179251074, + "grad_norm": 0.17723220586776733, + "learning_rate": 2.6873265361410805e-05, + "loss": 1.7179, + "step": 21617 + }, + { + "epoch": 6.6353591160221, + "grad_norm": 0.18634437024593353, + "learning_rate": 2.6868858571441645e-05, + "loss": 1.7355, + "step": 21618 + }, + { + "epoch": 6.635666052793125, + "grad_norm": 0.205010786652565, + "learning_rate": 2.6864452010067985e-05, + "loss": 1.7399, + "step": 21619 + }, + { + "epoch": 6.6359729895641495, + "grad_norm": 0.2071879357099533, + "learning_rate": 2.6860045677333383e-05, + "loss": 1.7199, + "step": 21620 + }, + { + "epoch": 6.636279926335175, + "grad_norm": 0.17309685051441193, + "learning_rate": 2.685563957328134e-05, + "loss": 1.6595, + "step": 21621 + }, + { + "epoch": 6.6365868631062, + "grad_norm": 0.3505750000476837, + 
"learning_rate": 2.685123369795545e-05, + "loss": 1.7601, + "step": 21622 + }, + { + "epoch": 6.6368937998772255, + "grad_norm": 0.19184419512748718, + "learning_rate": 2.684682805139923e-05, + "loss": 1.7225, + "step": 21623 + }, + { + "epoch": 6.637200736648251, + "grad_norm": 0.20142409205436707, + "learning_rate": 2.6842422633656233e-05, + "loss": 1.7201, + "step": 21624 + }, + { + "epoch": 6.637507673419275, + "grad_norm": 0.18348537385463715, + "learning_rate": 2.6838017444769993e-05, + "loss": 1.6983, + "step": 21625 + }, + { + "epoch": 6.637814610190301, + "grad_norm": 0.19275228679180145, + "learning_rate": 2.6833612484784033e-05, + "loss": 1.7028, + "step": 21626 + }, + { + "epoch": 6.638121546961326, + "grad_norm": 0.21269574761390686, + "learning_rate": 2.682920775374189e-05, + "loss": 1.7888, + "step": 21627 + }, + { + "epoch": 6.638428483732351, + "grad_norm": 0.17470422387123108, + "learning_rate": 2.68248032516871e-05, + "loss": 1.7147, + "step": 21628 + }, + { + "epoch": 6.638735420503377, + "grad_norm": 0.15697288513183594, + "learning_rate": 2.6820398978663185e-05, + "loss": 1.6544, + "step": 21629 + }, + { + "epoch": 6.639042357274402, + "grad_norm": 0.18636487424373627, + "learning_rate": 2.6815994934713677e-05, + "loss": 1.721, + "step": 21630 + }, + { + "epoch": 6.639349294045426, + "grad_norm": 0.18091215193271637, + "learning_rate": 2.681159111988208e-05, + "loss": 1.6973, + "step": 21631 + }, + { + "epoch": 6.639656230816452, + "grad_norm": 0.21360217034816742, + "learning_rate": 2.6807187534211965e-05, + "loss": 1.7379, + "step": 21632 + }, + { + "epoch": 6.639963167587477, + "grad_norm": 0.20027592778205872, + "learning_rate": 2.6802784177746777e-05, + "loss": 1.7207, + "step": 21633 + }, + { + "epoch": 6.640270104358502, + "grad_norm": 0.21839644014835358, + "learning_rate": 2.679838105053011e-05, + "loss": 1.715, + "step": 21634 + }, + { + "epoch": 6.640577041129527, + "grad_norm": 0.19237302243709564, + "learning_rate": 
2.6793978152605404e-05, + "loss": 1.7415, + "step": 21635 + }, + { + "epoch": 6.640883977900552, + "grad_norm": 0.1979883313179016, + "learning_rate": 2.678957548401623e-05, + "loss": 1.7005, + "step": 21636 + }, + { + "epoch": 6.6411909146715775, + "grad_norm": 0.21867144107818604, + "learning_rate": 2.678517304480609e-05, + "loss": 1.8008, + "step": 21637 + }, + { + "epoch": 6.641497851442603, + "grad_norm": 0.17232954502105713, + "learning_rate": 2.6780770835018433e-05, + "loss": 1.6867, + "step": 21638 + }, + { + "epoch": 6.641804788213628, + "grad_norm": 0.21535196900367737, + "learning_rate": 2.6776368854696853e-05, + "loss": 1.7545, + "step": 21639 + }, + { + "epoch": 6.6421117249846535, + "grad_norm": 0.18891240656375885, + "learning_rate": 2.6771967103884766e-05, + "loss": 1.7164, + "step": 21640 + }, + { + "epoch": 6.642418661755678, + "grad_norm": 0.2558320462703705, + "learning_rate": 2.6767565582625743e-05, + "loss": 1.8125, + "step": 21641 + }, + { + "epoch": 6.642725598526703, + "grad_norm": 0.20400027930736542, + "learning_rate": 2.6763164290963244e-05, + "loss": 1.7335, + "step": 21642 + }, + { + "epoch": 6.643032535297729, + "grad_norm": 0.21388766169548035, + "learning_rate": 2.6758763228940775e-05, + "loss": 1.7788, + "step": 21643 + }, + { + "epoch": 6.643339472068754, + "grad_norm": 0.20607435703277588, + "learning_rate": 2.6754362396601834e-05, + "loss": 1.7481, + "step": 21644 + }, + { + "epoch": 6.643646408839779, + "grad_norm": 0.1608831286430359, + "learning_rate": 2.6749961793989907e-05, + "loss": 1.6577, + "step": 21645 + }, + { + "epoch": 6.643953345610804, + "grad_norm": 0.19074808061122894, + "learning_rate": 2.6745561421148485e-05, + "loss": 1.7335, + "step": 21646 + }, + { + "epoch": 6.644260282381829, + "grad_norm": 0.16517756879329681, + "learning_rate": 2.6741161278121053e-05, + "loss": 1.6663, + "step": 21647 + }, + { + "epoch": 6.644567219152854, + "grad_norm": 0.18976998329162598, + "learning_rate": 2.673676136495108e-05, + 
"loss": 1.7231, + "step": 21648 + }, + { + "epoch": 6.64487415592388, + "grad_norm": 0.20694875717163086, + "learning_rate": 2.6732361681682106e-05, + "loss": 1.7469, + "step": 21649 + }, + { + "epoch": 6.645181092694905, + "grad_norm": 0.1994311809539795, + "learning_rate": 2.6727962228357533e-05, + "loss": 1.6864, + "step": 21650 + }, + { + "epoch": 6.64548802946593, + "grad_norm": 0.18886511027812958, + "learning_rate": 2.672356300502091e-05, + "loss": 1.6874, + "step": 21651 + }, + { + "epoch": 6.645794966236955, + "grad_norm": 0.2152819186449051, + "learning_rate": 2.6719164011715653e-05, + "loss": 1.7327, + "step": 21652 + }, + { + "epoch": 6.64610190300798, + "grad_norm": 0.20525617897510529, + "learning_rate": 2.6714765248485275e-05, + "loss": 1.7409, + "step": 21653 + }, + { + "epoch": 6.6464088397790055, + "grad_norm": 0.21892790496349335, + "learning_rate": 2.6710366715373254e-05, + "loss": 1.7281, + "step": 21654 + }, + { + "epoch": 6.646715776550031, + "grad_norm": 0.20156462490558624, + "learning_rate": 2.6705968412423e-05, + "loss": 1.7211, + "step": 21655 + }, + { + "epoch": 6.647022713321056, + "grad_norm": 0.19993625581264496, + "learning_rate": 2.670157033967806e-05, + "loss": 1.8058, + "step": 21656 + }, + { + "epoch": 6.647329650092081, + "grad_norm": 0.1970909684896469, + "learning_rate": 2.669717249718182e-05, + "loss": 1.7707, + "step": 21657 + }, + { + "epoch": 6.647636586863106, + "grad_norm": 0.19287796318531036, + "learning_rate": 2.6692774884977796e-05, + "loss": 1.688, + "step": 21658 + }, + { + "epoch": 6.647943523634131, + "grad_norm": 0.17658226191997528, + "learning_rate": 2.668837750310943e-05, + "loss": 1.6936, + "step": 21659 + }, + { + "epoch": 6.648250460405157, + "grad_norm": 0.20234479010105133, + "learning_rate": 2.6683980351620184e-05, + "loss": 1.7069, + "step": 21660 + }, + { + "epoch": 6.648557397176182, + "grad_norm": 0.1957871913909912, + "learning_rate": 2.6679583430553513e-05, + "loss": 1.736, + "step": 21661 + }, + 
{ + "epoch": 6.648864333947207, + "grad_norm": 0.20084553956985474, + "learning_rate": 2.667518673995286e-05, + "loss": 1.7262, + "step": 21662 + }, + { + "epoch": 6.649171270718232, + "grad_norm": 0.18749211728572845, + "learning_rate": 2.667079027986169e-05, + "loss": 1.7127, + "step": 21663 + }, + { + "epoch": 6.649478207489257, + "grad_norm": 0.1747027188539505, + "learning_rate": 2.666639405032344e-05, + "loss": 1.6922, + "step": 21664 + }, + { + "epoch": 6.649785144260282, + "grad_norm": 0.3119397759437561, + "learning_rate": 2.666199805138154e-05, + "loss": 1.7373, + "step": 21665 + }, + { + "epoch": 6.650092081031308, + "grad_norm": 0.25986436009407043, + "learning_rate": 2.6657602283079498e-05, + "loss": 1.7521, + "step": 21666 + }, + { + "epoch": 6.650399017802332, + "grad_norm": 0.20535705983638763, + "learning_rate": 2.6653206745460663e-05, + "loss": 1.7144, + "step": 21667 + }, + { + "epoch": 6.650705954573358, + "grad_norm": 0.20804347097873688, + "learning_rate": 2.6648811438568566e-05, + "loss": 1.7186, + "step": 21668 + }, + { + "epoch": 6.651012891344383, + "grad_norm": 0.20753289759159088, + "learning_rate": 2.6644416362446566e-05, + "loss": 1.7098, + "step": 21669 + }, + { + "epoch": 6.651319828115408, + "grad_norm": 0.18725311756134033, + "learning_rate": 2.6640021517138148e-05, + "loss": 1.7331, + "step": 21670 + }, + { + "epoch": 6.651626764886434, + "grad_norm": 0.1907210648059845, + "learning_rate": 2.663562690268675e-05, + "loss": 1.6677, + "step": 21671 + }, + { + "epoch": 6.651933701657459, + "grad_norm": 0.19124922156333923, + "learning_rate": 2.6631232519135747e-05, + "loss": 1.7337, + "step": 21672 + }, + { + "epoch": 6.652240638428484, + "grad_norm": 0.21045447885990143, + "learning_rate": 2.6626838366528633e-05, + "loss": 1.7028, + "step": 21673 + }, + { + "epoch": 6.652547575199509, + "grad_norm": 0.1891855001449585, + "learning_rate": 2.6622444444908767e-05, + "loss": 1.7247, + "step": 21674 + }, + { + "epoch": 6.652854511970534, 
+ "grad_norm": 0.2236541211605072, + "learning_rate": 2.6618050754319623e-05, + "loss": 1.6986, + "step": 21675 + }, + { + "epoch": 6.653161448741559, + "grad_norm": 0.19088539481163025, + "learning_rate": 2.6613657294804604e-05, + "loss": 1.7118, + "step": 21676 + }, + { + "epoch": 6.653468385512585, + "grad_norm": 0.26210764050483704, + "learning_rate": 2.660926406640714e-05, + "loss": 1.7542, + "step": 21677 + }, + { + "epoch": 6.653775322283609, + "grad_norm": 0.2564029097557068, + "learning_rate": 2.6604871069170632e-05, + "loss": 1.7395, + "step": 21678 + }, + { + "epoch": 6.6540822590546345, + "grad_norm": 0.22974301874637604, + "learning_rate": 2.6600478303138503e-05, + "loss": 1.6905, + "step": 21679 + }, + { + "epoch": 6.65438919582566, + "grad_norm": 0.299772173166275, + "learning_rate": 2.659608576835416e-05, + "loss": 1.7875, + "step": 21680 + }, + { + "epoch": 6.654696132596685, + "grad_norm": 0.26459556818008423, + "learning_rate": 2.6591693464861018e-05, + "loss": 1.7185, + "step": 21681 + }, + { + "epoch": 6.6550030693677105, + "grad_norm": 0.24505311250686646, + "learning_rate": 2.6587301392702457e-05, + "loss": 1.7105, + "step": 21682 + }, + { + "epoch": 6.655310006138736, + "grad_norm": 0.1626308262348175, + "learning_rate": 2.6582909551921953e-05, + "loss": 1.6668, + "step": 21683 + }, + { + "epoch": 6.65561694290976, + "grad_norm": 0.20354291796684265, + "learning_rate": 2.6578517942562813e-05, + "loss": 1.7437, + "step": 21684 + }, + { + "epoch": 6.655923879680786, + "grad_norm": 0.18618443608283997, + "learning_rate": 2.6574126564668532e-05, + "loss": 1.6757, + "step": 21685 + }, + { + "epoch": 6.656230816451811, + "grad_norm": 0.1863735467195511, + "learning_rate": 2.656973541828242e-05, + "loss": 1.6549, + "step": 21686 + }, + { + "epoch": 6.656537753222836, + "grad_norm": 0.2118620127439499, + "learning_rate": 2.6565344503447935e-05, + "loss": 1.6927, + "step": 21687 + }, + { + "epoch": 6.656844689993862, + "grad_norm": 
0.24023136496543884, + "learning_rate": 2.6560953820208478e-05, + "loss": 1.6969, + "step": 21688 + }, + { + "epoch": 6.657151626764886, + "grad_norm": 0.21124204993247986, + "learning_rate": 2.6556563368607368e-05, + "loss": 1.6662, + "step": 21689 + }, + { + "epoch": 6.657458563535911, + "grad_norm": 0.16295355558395386, + "learning_rate": 2.6552173148688075e-05, + "loss": 1.7203, + "step": 21690 + }, + { + "epoch": 6.657765500306937, + "grad_norm": 0.18650858104228973, + "learning_rate": 2.6547783160493916e-05, + "loss": 1.7177, + "step": 21691 + }, + { + "epoch": 6.658072437077962, + "grad_norm": 0.20509213209152222, + "learning_rate": 2.6543393404068328e-05, + "loss": 1.723, + "step": 21692 + }, + { + "epoch": 6.658379373848987, + "grad_norm": 0.20985513925552368, + "learning_rate": 2.6539003879454678e-05, + "loss": 1.6679, + "step": 21693 + }, + { + "epoch": 6.658686310620013, + "grad_norm": 0.19907233119010925, + "learning_rate": 2.6534614586696338e-05, + "loss": 1.7028, + "step": 21694 + }, + { + "epoch": 6.658993247391037, + "grad_norm": 0.21793772280216217, + "learning_rate": 2.6530225525836692e-05, + "loss": 1.7706, + "step": 21695 + }, + { + "epoch": 6.6593001841620625, + "grad_norm": 0.24162191152572632, + "learning_rate": 2.6525836696919117e-05, + "loss": 1.806, + "step": 21696 + }, + { + "epoch": 6.659607120933088, + "grad_norm": 0.1735360324382782, + "learning_rate": 2.652144809998698e-05, + "loss": 1.7047, + "step": 21697 + }, + { + "epoch": 6.659914057704113, + "grad_norm": 0.18471799790859222, + "learning_rate": 2.651705973508365e-05, + "loss": 1.7306, + "step": 21698 + }, + { + "epoch": 6.6602209944751385, + "grad_norm": 0.17422814667224884, + "learning_rate": 2.6512671602252482e-05, + "loss": 1.6666, + "step": 21699 + }, + { + "epoch": 6.660527931246163, + "grad_norm": 0.19209833443164825, + "learning_rate": 2.6508283701536897e-05, + "loss": 1.6966, + "step": 21700 + }, + { + "epoch": 6.660834868017188, + "grad_norm": 0.1902640461921692, + 
"learning_rate": 2.650389603298019e-05, + "loss": 1.7887, + "step": 21701 + }, + { + "epoch": 6.661141804788214, + "grad_norm": 0.18551218509674072, + "learning_rate": 2.6499508596625787e-05, + "loss": 1.6851, + "step": 21702 + }, + { + "epoch": 6.661448741559239, + "grad_norm": 0.2165011614561081, + "learning_rate": 2.6495121392516976e-05, + "loss": 1.7465, + "step": 21703 + }, + { + "epoch": 6.661755678330264, + "grad_norm": 0.22871245443820953, + "learning_rate": 2.6490734420697172e-05, + "loss": 1.7487, + "step": 21704 + }, + { + "epoch": 6.66206261510129, + "grad_norm": 0.21275551617145538, + "learning_rate": 2.6486347681209723e-05, + "loss": 1.7782, + "step": 21705 + }, + { + "epoch": 6.662369551872314, + "grad_norm": 0.2926945984363556, + "learning_rate": 2.6481961174097937e-05, + "loss": 1.7413, + "step": 21706 + }, + { + "epoch": 6.662676488643339, + "grad_norm": 0.17143094539642334, + "learning_rate": 2.6477574899405233e-05, + "loss": 1.6639, + "step": 21707 + }, + { + "epoch": 6.662983425414365, + "grad_norm": 0.22194001078605652, + "learning_rate": 2.647318885717488e-05, + "loss": 1.7035, + "step": 21708 + }, + { + "epoch": 6.66329036218539, + "grad_norm": 0.18232671916484833, + "learning_rate": 2.6468803047450286e-05, + "loss": 1.6977, + "step": 21709 + }, + { + "epoch": 6.6635972989564145, + "grad_norm": 0.2626599371433258, + "learning_rate": 2.6464417470274773e-05, + "loss": 1.7422, + "step": 21710 + }, + { + "epoch": 6.66390423572744, + "grad_norm": 0.2034282237291336, + "learning_rate": 2.6460032125691668e-05, + "loss": 1.7531, + "step": 21711 + }, + { + "epoch": 6.664211172498465, + "grad_norm": 0.2308860868215561, + "learning_rate": 2.645564701374434e-05, + "loss": 1.7271, + "step": 21712 + }, + { + "epoch": 6.6645181092694905, + "grad_norm": 0.2163545936346054, + "learning_rate": 2.64512621344761e-05, + "loss": 1.7632, + "step": 21713 + }, + { + "epoch": 6.664825046040516, + "grad_norm": 0.2566233277320862, + "learning_rate": 
2.644687748793029e-05, + "loss": 1.7573, + "step": 21714 + }, + { + "epoch": 6.665131982811541, + "grad_norm": 0.21093623340129852, + "learning_rate": 2.6442493074150244e-05, + "loss": 1.6703, + "step": 21715 + }, + { + "epoch": 6.665438919582566, + "grad_norm": 0.2083086222410202, + "learning_rate": 2.643810889317927e-05, + "loss": 1.6672, + "step": 21716 + }, + { + "epoch": 6.665745856353591, + "grad_norm": 0.20711155235767365, + "learning_rate": 2.643372494506075e-05, + "loss": 1.7276, + "step": 21717 + }, + { + "epoch": 6.666052793124616, + "grad_norm": 0.18977457284927368, + "learning_rate": 2.6429341229837935e-05, + "loss": 1.7207, + "step": 21718 + }, + { + "epoch": 6.666359729895642, + "grad_norm": 0.28336507081985474, + "learning_rate": 2.6424957747554224e-05, + "loss": 1.7473, + "step": 21719 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.1761232167482376, + "learning_rate": 2.642057449825286e-05, + "loss": 1.7172, + "step": 21720 + }, + { + "epoch": 6.666973603437691, + "grad_norm": 0.21672405302524567, + "learning_rate": 2.6416191481977215e-05, + "loss": 1.6561, + "step": 21721 + }, + { + "epoch": 6.667280540208717, + "grad_norm": 0.226834237575531, + "learning_rate": 2.6411808698770613e-05, + "loss": 1.7315, + "step": 21722 + }, + { + "epoch": 6.667587476979742, + "grad_norm": 0.22553586959838867, + "learning_rate": 2.6407426148676307e-05, + "loss": 1.7301, + "step": 21723 + }, + { + "epoch": 6.667894413750767, + "grad_norm": 0.1913517564535141, + "learning_rate": 2.6403043831737672e-05, + "loss": 1.6739, + "step": 21724 + }, + { + "epoch": 6.668201350521793, + "grad_norm": 0.24560052156448364, + "learning_rate": 2.6398661747997955e-05, + "loss": 1.7347, + "step": 21725 + }, + { + "epoch": 6.668508287292818, + "grad_norm": 0.27361172437667847, + "learning_rate": 2.6394279897500517e-05, + "loss": 1.7713, + "step": 21726 + }, + { + "epoch": 6.6688152240638425, + "grad_norm": 0.21486583352088928, + "learning_rate": 2.6389898280288638e-05, + 
"loss": 1.7504, + "step": 21727 + }, + { + "epoch": 6.669122160834868, + "grad_norm": 0.19056405127048492, + "learning_rate": 2.6385516896405627e-05, + "loss": 1.7457, + "step": 21728 + }, + { + "epoch": 6.669429097605893, + "grad_norm": 0.19316376745700836, + "learning_rate": 2.638113574589478e-05, + "loss": 1.6969, + "step": 21729 + }, + { + "epoch": 6.6697360343769185, + "grad_norm": 0.21700869500637054, + "learning_rate": 2.637675482879939e-05, + "loss": 1.7055, + "step": 21730 + }, + { + "epoch": 6.670042971147944, + "grad_norm": 0.19720883667469025, + "learning_rate": 2.637237414516275e-05, + "loss": 1.7029, + "step": 21731 + }, + { + "epoch": 6.670349907918968, + "grad_norm": 0.16528408229351044, + "learning_rate": 2.6367993695028158e-05, + "loss": 1.6915, + "step": 21732 + }, + { + "epoch": 6.670656844689994, + "grad_norm": 0.19576294720172882, + "learning_rate": 2.636361347843889e-05, + "loss": 1.7034, + "step": 21733 + }, + { + "epoch": 6.670963781461019, + "grad_norm": 0.16859273612499237, + "learning_rate": 2.6359233495438285e-05, + "loss": 1.7114, + "step": 21734 + }, + { + "epoch": 6.671270718232044, + "grad_norm": 0.20480163395404816, + "learning_rate": 2.6354853746069553e-05, + "loss": 1.7304, + "step": 21735 + }, + { + "epoch": 6.67157765500307, + "grad_norm": 0.19104263186454773, + "learning_rate": 2.6350474230376048e-05, + "loss": 1.7026, + "step": 21736 + }, + { + "epoch": 6.671884591774095, + "grad_norm": 0.18243174254894257, + "learning_rate": 2.634609494840098e-05, + "loss": 1.6769, + "step": 21737 + }, + { + "epoch": 6.672191528545119, + "grad_norm": 0.20766063034534454, + "learning_rate": 2.634171590018769e-05, + "loss": 1.7436, + "step": 21738 + }, + { + "epoch": 6.672498465316145, + "grad_norm": 0.22035297751426697, + "learning_rate": 2.6337337085779444e-05, + "loss": 1.8211, + "step": 21739 + }, + { + "epoch": 6.67280540208717, + "grad_norm": 0.18965984880924225, + "learning_rate": 2.6332958505219475e-05, + "loss": 1.7067, + "step": 
21740 + }, + { + "epoch": 6.673112338858195, + "grad_norm": 0.21209993958473206, + "learning_rate": 2.632858015855111e-05, + "loss": 1.7743, + "step": 21741 + }, + { + "epoch": 6.67341927562922, + "grad_norm": 0.18409015238285065, + "learning_rate": 2.6324202045817547e-05, + "loss": 1.7494, + "step": 21742 + }, + { + "epoch": 6.673726212400245, + "grad_norm": 0.23252969980239868, + "learning_rate": 2.6319824167062125e-05, + "loss": 1.7459, + "step": 21743 + }, + { + "epoch": 6.6740331491712706, + "grad_norm": 0.16296416521072388, + "learning_rate": 2.631544652232808e-05, + "loss": 1.648, + "step": 21744 + }, + { + "epoch": 6.674340085942296, + "grad_norm": 0.2458602488040924, + "learning_rate": 2.631106911165867e-05, + "loss": 1.6847, + "step": 21745 + }, + { + "epoch": 6.674647022713321, + "grad_norm": 0.21203550696372986, + "learning_rate": 2.6306691935097162e-05, + "loss": 1.713, + "step": 21746 + }, + { + "epoch": 6.6749539594843466, + "grad_norm": 0.19969885051250458, + "learning_rate": 2.6302314992686804e-05, + "loss": 1.7445, + "step": 21747 + }, + { + "epoch": 6.675260896255372, + "grad_norm": 0.21001017093658447, + "learning_rate": 2.629793828447087e-05, + "loss": 1.703, + "step": 21748 + }, + { + "epoch": 6.675567833026396, + "grad_norm": 0.18607214093208313, + "learning_rate": 2.6293561810492595e-05, + "loss": 1.6765, + "step": 21749 + }, + { + "epoch": 6.675874769797422, + "grad_norm": 0.21806176006793976, + "learning_rate": 2.6289185570795223e-05, + "loss": 1.7099, + "step": 21750 + }, + { + "epoch": 6.676181706568447, + "grad_norm": 0.1861930787563324, + "learning_rate": 2.6284809565422052e-05, + "loss": 1.6978, + "step": 21751 + }, + { + "epoch": 6.676488643339472, + "grad_norm": 0.18779867887496948, + "learning_rate": 2.6280433794416254e-05, + "loss": 1.7132, + "step": 21752 + }, + { + "epoch": 6.676795580110497, + "grad_norm": 0.18255293369293213, + "learning_rate": 2.627605825782115e-05, + "loss": 1.7045, + "step": 21753 + }, + { + "epoch": 
6.677102516881522, + "grad_norm": 0.22258871793746948, + "learning_rate": 2.6271682955679904e-05, + "loss": 1.7159, + "step": 21754 + }, + { + "epoch": 6.6774094536525475, + "grad_norm": 0.17425768077373505, + "learning_rate": 2.626730788803582e-05, + "loss": 1.6571, + "step": 21755 + }, + { + "epoch": 6.677716390423573, + "grad_norm": 0.1921091377735138, + "learning_rate": 2.6262933054932122e-05, + "loss": 1.8178, + "step": 21756 + }, + { + "epoch": 6.678023327194598, + "grad_norm": 0.16262951493263245, + "learning_rate": 2.6258558456411996e-05, + "loss": 1.6586, + "step": 21757 + }, + { + "epoch": 6.6783302639656235, + "grad_norm": 0.1853780597448349, + "learning_rate": 2.6254184092518752e-05, + "loss": 1.7116, + "step": 21758 + }, + { + "epoch": 6.678637200736648, + "grad_norm": 0.17973974347114563, + "learning_rate": 2.6249809963295536e-05, + "loss": 1.7317, + "step": 21759 + }, + { + "epoch": 6.678944137507673, + "grad_norm": 0.21258050203323364, + "learning_rate": 2.6245436068785634e-05, + "loss": 1.7852, + "step": 21760 + }, + { + "epoch": 6.679251074278699, + "grad_norm": 0.18741287291049957, + "learning_rate": 2.6241062409032262e-05, + "loss": 1.7071, + "step": 21761 + }, + { + "epoch": 6.679558011049724, + "grad_norm": 0.20436155796051025, + "learning_rate": 2.623668898407864e-05, + "loss": 1.7683, + "step": 21762 + }, + { + "epoch": 6.679864947820749, + "grad_norm": 0.18840116262435913, + "learning_rate": 2.6232315793967977e-05, + "loss": 1.7335, + "step": 21763 + }, + { + "epoch": 6.680171884591774, + "grad_norm": 0.1968357264995575, + "learning_rate": 2.62279428387435e-05, + "loss": 1.6848, + "step": 21764 + }, + { + "epoch": 6.680478821362799, + "grad_norm": 0.1774388998746872, + "learning_rate": 2.622357011844844e-05, + "loss": 1.6943, + "step": 21765 + }, + { + "epoch": 6.680785758133824, + "grad_norm": 0.2424328327178955, + "learning_rate": 2.621919763312598e-05, + "loss": 1.7479, + "step": 21766 + }, + { + "epoch": 6.68109269490485, + "grad_norm": 
0.21220771968364716, + "learning_rate": 2.6214825382819353e-05, + "loss": 1.7384, + "step": 21767 + }, + { + "epoch": 6.681399631675875, + "grad_norm": 0.23322279751300812, + "learning_rate": 2.6210453367571764e-05, + "loss": 1.6625, + "step": 21768 + }, + { + "epoch": 6.6817065684469, + "grad_norm": 0.1726260483264923, + "learning_rate": 2.620608158742639e-05, + "loss": 1.7055, + "step": 21769 + }, + { + "epoch": 6.682013505217925, + "grad_norm": 0.25436410307884216, + "learning_rate": 2.6201710042426512e-05, + "loss": 1.7449, + "step": 21770 + }, + { + "epoch": 6.68232044198895, + "grad_norm": 0.20275171101093292, + "learning_rate": 2.619733873261524e-05, + "loss": 1.7575, + "step": 21771 + }, + { + "epoch": 6.6826273787599755, + "grad_norm": 0.24221903085708618, + "learning_rate": 2.6192967658035846e-05, + "loss": 1.7312, + "step": 21772 + }, + { + "epoch": 6.682934315531001, + "grad_norm": 0.30804362893104553, + "learning_rate": 2.6188596818731507e-05, + "loss": 1.7669, + "step": 21773 + }, + { + "epoch": 6.683241252302026, + "grad_norm": 0.1818273365497589, + "learning_rate": 2.6184226214745377e-05, + "loss": 1.7102, + "step": 21774 + }, + { + "epoch": 6.683548189073051, + "grad_norm": 0.28026455640792847, + "learning_rate": 2.6179855846120727e-05, + "loss": 1.7313, + "step": 21775 + }, + { + "epoch": 6.683855125844076, + "grad_norm": 0.26503586769104004, + "learning_rate": 2.6175485712900655e-05, + "loss": 1.7622, + "step": 21776 + }, + { + "epoch": 6.684162062615101, + "grad_norm": 0.19122248888015747, + "learning_rate": 2.6171115815128423e-05, + "loss": 1.7347, + "step": 21777 + }, + { + "epoch": 6.684468999386127, + "grad_norm": 0.18789063394069672, + "learning_rate": 2.6166746152847187e-05, + "loss": 1.7158, + "step": 21778 + }, + { + "epoch": 6.684775936157152, + "grad_norm": 0.17315362393856049, + "learning_rate": 2.6162376726100135e-05, + "loss": 1.6561, + "step": 21779 + }, + { + "epoch": 6.685082872928177, + "grad_norm": 0.20659680664539337, + 
"learning_rate": 2.615800753493045e-05, + "loss": 1.7063, + "step": 21780 + }, + { + "epoch": 6.685389809699202, + "grad_norm": 0.2051183432340622, + "learning_rate": 2.6153638579381307e-05, + "loss": 1.7213, + "step": 21781 + }, + { + "epoch": 6.685696746470227, + "grad_norm": 0.23349207639694214, + "learning_rate": 2.6149269859495884e-05, + "loss": 1.7453, + "step": 21782 + }, + { + "epoch": 6.686003683241252, + "grad_norm": 0.1979275941848755, + "learning_rate": 2.6144901375317355e-05, + "loss": 1.7482, + "step": 21783 + }, + { + "epoch": 6.686310620012278, + "grad_norm": 0.2742067873477936, + "learning_rate": 2.61405331268889e-05, + "loss": 1.7114, + "step": 21784 + }, + { + "epoch": 6.686617556783302, + "grad_norm": 0.18656300008296967, + "learning_rate": 2.6136165114253675e-05, + "loss": 1.7114, + "step": 21785 + }, + { + "epoch": 6.6869244935543275, + "grad_norm": 0.19345268607139587, + "learning_rate": 2.6131797337454834e-05, + "loss": 1.6818, + "step": 21786 + }, + { + "epoch": 6.687231430325353, + "grad_norm": 0.2194962054491043, + "learning_rate": 2.6127429796535597e-05, + "loss": 1.7519, + "step": 21787 + }, + { + "epoch": 6.687538367096378, + "grad_norm": 0.21714645624160767, + "learning_rate": 2.6123062491539054e-05, + "loss": 1.7334, + "step": 21788 + }, + { + "epoch": 6.6878453038674035, + "grad_norm": 0.1684521585702896, + "learning_rate": 2.6118695422508444e-05, + "loss": 1.6843, + "step": 21789 + }, + { + "epoch": 6.688152240638429, + "grad_norm": 0.16155442595481873, + "learning_rate": 2.6114328589486865e-05, + "loss": 1.6541, + "step": 21790 + }, + { + "epoch": 6.688459177409453, + "grad_norm": 0.18483634293079376, + "learning_rate": 2.6109961992517462e-05, + "loss": 1.688, + "step": 21791 + }, + { + "epoch": 6.688766114180479, + "grad_norm": 0.23146624863147736, + "learning_rate": 2.6105595631643466e-05, + "loss": 1.8006, + "step": 21792 + }, + { + "epoch": 6.689073050951504, + "grad_norm": 0.1852748543024063, + "learning_rate": 
2.6101229506907937e-05, + "loss": 1.6624, + "step": 21793 + }, + { + "epoch": 6.689379987722529, + "grad_norm": 0.23809482157230377, + "learning_rate": 2.6096863618354105e-05, + "loss": 1.7313, + "step": 21794 + }, + { + "epoch": 6.689686924493555, + "grad_norm": 0.17145361006259918, + "learning_rate": 2.609249796602503e-05, + "loss": 1.6966, + "step": 21795 + }, + { + "epoch": 6.689993861264579, + "grad_norm": 0.1842796355485916, + "learning_rate": 2.6088132549963933e-05, + "loss": 1.6871, + "step": 21796 + }, + { + "epoch": 6.690300798035604, + "grad_norm": 0.1810201108455658, + "learning_rate": 2.608376737021392e-05, + "loss": 1.7509, + "step": 21797 + }, + { + "epoch": 6.69060773480663, + "grad_norm": 0.20428195595741272, + "learning_rate": 2.607940242681814e-05, + "loss": 1.7102, + "step": 21798 + }, + { + "epoch": 6.690914671577655, + "grad_norm": 0.1659073680639267, + "learning_rate": 2.6075037719819716e-05, + "loss": 1.7053, + "step": 21799 + }, + { + "epoch": 6.69122160834868, + "grad_norm": 0.19351087510585785, + "learning_rate": 2.60706732492618e-05, + "loss": 1.6847, + "step": 21800 + }, + { + "epoch": 6.691528545119706, + "grad_norm": 0.1734616905450821, + "learning_rate": 2.6066309015187517e-05, + "loss": 1.6989, + "step": 21801 + }, + { + "epoch": 6.69183548189073, + "grad_norm": 0.1863887459039688, + "learning_rate": 2.6061945017639995e-05, + "loss": 1.665, + "step": 21802 + }, + { + "epoch": 6.6921424186617555, + "grad_norm": 0.20225204527378082, + "learning_rate": 2.6057581256662344e-05, + "loss": 1.718, + "step": 21803 + }, + { + "epoch": 6.692449355432781, + "grad_norm": 0.22148309648036957, + "learning_rate": 2.605321773229774e-05, + "loss": 1.7801, + "step": 21804 + }, + { + "epoch": 6.692756292203806, + "grad_norm": 0.1870507448911667, + "learning_rate": 2.6048854444589242e-05, + "loss": 1.6613, + "step": 21805 + }, + { + "epoch": 6.6930632289748315, + "grad_norm": 0.18597224354743958, + "learning_rate": 2.604449139358004e-05, + "loss": 
1.7284, + "step": 21806 + }, + { + "epoch": 6.693370165745856, + "grad_norm": 0.2082163542509079, + "learning_rate": 2.6040128579313193e-05, + "loss": 1.7456, + "step": 21807 + }, + { + "epoch": 6.693677102516881, + "grad_norm": 0.22506757080554962, + "learning_rate": 2.603576600183183e-05, + "loss": 1.7369, + "step": 21808 + }, + { + "epoch": 6.693984039287907, + "grad_norm": 0.20707464218139648, + "learning_rate": 2.60314036611791e-05, + "loss": 1.7176, + "step": 21809 + }, + { + "epoch": 6.694290976058932, + "grad_norm": 0.2306852787733078, + "learning_rate": 2.6027041557398053e-05, + "loss": 1.7582, + "step": 21810 + }, + { + "epoch": 6.694597912829957, + "grad_norm": 0.23120234906673431, + "learning_rate": 2.602267969053187e-05, + "loss": 1.7169, + "step": 21811 + }, + { + "epoch": 6.694904849600983, + "grad_norm": 0.24841509759426117, + "learning_rate": 2.6018318060623582e-05, + "loss": 1.7636, + "step": 21812 + }, + { + "epoch": 6.695211786372007, + "grad_norm": 0.22443681955337524, + "learning_rate": 2.601395666771635e-05, + "loss": 1.7465, + "step": 21813 + }, + { + "epoch": 6.695518723143032, + "grad_norm": 0.2905699908733368, + "learning_rate": 2.6009595511853257e-05, + "loss": 1.779, + "step": 21814 + }, + { + "epoch": 6.695825659914058, + "grad_norm": 0.18677717447280884, + "learning_rate": 2.60052345930774e-05, + "loss": 1.711, + "step": 21815 + }, + { + "epoch": 6.696132596685083, + "grad_norm": 0.2150946855545044, + "learning_rate": 2.6000873911431883e-05, + "loss": 1.7254, + "step": 21816 + }, + { + "epoch": 6.696439533456108, + "grad_norm": 0.20066408812999725, + "learning_rate": 2.5996513466959794e-05, + "loss": 1.7198, + "step": 21817 + }, + { + "epoch": 6.696746470227133, + "grad_norm": 0.23815886676311493, + "learning_rate": 2.5992153259704228e-05, + "loss": 1.749, + "step": 21818 + }, + { + "epoch": 6.697053406998158, + "grad_norm": 0.2067428082227707, + "learning_rate": 2.5987793289708273e-05, + "loss": 1.736, + "step": 21819 + }, + { + 
"epoch": 6.6973603437691835, + "grad_norm": 0.2126816362142563, + "learning_rate": 2.5983433557015e-05, + "loss": 1.6804, + "step": 21820 + }, + { + "epoch": 6.697667280540209, + "grad_norm": 0.2003033310174942, + "learning_rate": 2.597907406166756e-05, + "loss": 1.7303, + "step": 21821 + }, + { + "epoch": 6.697974217311234, + "grad_norm": 0.238821879029274, + "learning_rate": 2.5974714803708946e-05, + "loss": 1.7399, + "step": 21822 + }, + { + "epoch": 6.6982811540822595, + "grad_norm": 0.21327996253967285, + "learning_rate": 2.597035578318231e-05, + "loss": 1.766, + "step": 21823 + }, + { + "epoch": 6.698588090853284, + "grad_norm": 0.19689476490020752, + "learning_rate": 2.5965997000130694e-05, + "loss": 1.7621, + "step": 21824 + }, + { + "epoch": 6.698895027624309, + "grad_norm": 0.18349261581897736, + "learning_rate": 2.5961638454597158e-05, + "loss": 1.6339, + "step": 21825 + }, + { + "epoch": 6.699201964395335, + "grad_norm": 0.21475930511951447, + "learning_rate": 2.595728014662484e-05, + "loss": 1.6973, + "step": 21826 + }, + { + "epoch": 6.69950890116636, + "grad_norm": 0.2711705267429352, + "learning_rate": 2.5952922076256737e-05, + "loss": 1.7801, + "step": 21827 + }, + { + "epoch": 6.699815837937384, + "grad_norm": 0.2601792514324188, + "learning_rate": 2.5948564243535988e-05, + "loss": 1.7508, + "step": 21828 + }, + { + "epoch": 6.70012277470841, + "grad_norm": 0.206949844956398, + "learning_rate": 2.5944206648505586e-05, + "loss": 1.7853, + "step": 21829 + }, + { + "epoch": 6.700429711479435, + "grad_norm": 0.25003641843795776, + "learning_rate": 2.5939849291208653e-05, + "loss": 1.766, + "step": 21830 + }, + { + "epoch": 6.7007366482504604, + "grad_norm": 0.25864318013191223, + "learning_rate": 2.593549217168823e-05, + "loss": 1.7778, + "step": 21831 + }, + { + "epoch": 6.701043585021486, + "grad_norm": 0.20212729275226593, + "learning_rate": 2.593113528998738e-05, + "loss": 1.7249, + "step": 21832 + }, + { + "epoch": 6.701350521792511, + 
"grad_norm": 0.2518431842327118, + "learning_rate": 2.5926778646149154e-05, + "loss": 1.7466, + "step": 21833 + }, + { + "epoch": 6.701657458563536, + "grad_norm": 0.24284590780735016, + "learning_rate": 2.5922422240216614e-05, + "loss": 1.8309, + "step": 21834 + }, + { + "epoch": 6.701964395334561, + "grad_norm": 0.21829955279827118, + "learning_rate": 2.5918066072232817e-05, + "loss": 1.7458, + "step": 21835 + }, + { + "epoch": 6.702271332105586, + "grad_norm": 0.2842165231704712, + "learning_rate": 2.5913710142240792e-05, + "loss": 1.7379, + "step": 21836 + }, + { + "epoch": 6.702578268876612, + "grad_norm": 0.19648514688014984, + "learning_rate": 2.590935445028359e-05, + "loss": 1.7141, + "step": 21837 + }, + { + "epoch": 6.702885205647637, + "grad_norm": 0.24336646497249603, + "learning_rate": 2.5904998996404305e-05, + "loss": 1.6719, + "step": 21838 + }, + { + "epoch": 6.703192142418661, + "grad_norm": 0.17288628220558167, + "learning_rate": 2.5900643780645905e-05, + "loss": 1.6982, + "step": 21839 + }, + { + "epoch": 6.703499079189687, + "grad_norm": 0.24906334280967712, + "learning_rate": 2.5896288803051505e-05, + "loss": 1.6873, + "step": 21840 + }, + { + "epoch": 6.703806015960712, + "grad_norm": 0.2177029550075531, + "learning_rate": 2.5891934063664085e-05, + "loss": 1.6884, + "step": 21841 + }, + { + "epoch": 6.704112952731737, + "grad_norm": 0.20478956401348114, + "learning_rate": 2.5887579562526688e-05, + "loss": 1.7342, + "step": 21842 + }, + { + "epoch": 6.704419889502763, + "grad_norm": 0.26212164759635925, + "learning_rate": 2.58832252996824e-05, + "loss": 1.7304, + "step": 21843 + }, + { + "epoch": 6.704726826273788, + "grad_norm": 0.2049340009689331, + "learning_rate": 2.587887127517418e-05, + "loss": 1.7472, + "step": 21844 + }, + { + "epoch": 6.7050337630448125, + "grad_norm": 0.2453075796365738, + "learning_rate": 2.587451748904512e-05, + "loss": 1.7443, + "step": 21845 + }, + { + "epoch": 6.705340699815838, + "grad_norm": 
0.19545187056064606, + "learning_rate": 2.5870163941338188e-05, + "loss": 1.7328, + "step": 21846 + }, + { + "epoch": 6.705647636586863, + "grad_norm": 0.24424482882022858, + "learning_rate": 2.5865810632096456e-05, + "loss": 1.6876, + "step": 21847 + }, + { + "epoch": 6.7059545733578885, + "grad_norm": 0.2150830626487732, + "learning_rate": 2.5861457561362922e-05, + "loss": 1.7272, + "step": 21848 + }, + { + "epoch": 6.706261510128914, + "grad_norm": 0.2632520794868469, + "learning_rate": 2.5857104729180626e-05, + "loss": 1.7542, + "step": 21849 + }, + { + "epoch": 6.706568446899938, + "grad_norm": 0.21789421141147614, + "learning_rate": 2.5852752135592563e-05, + "loss": 1.6856, + "step": 21850 + }, + { + "epoch": 6.706875383670964, + "grad_norm": 0.2227005511522293, + "learning_rate": 2.5848399780641758e-05, + "loss": 1.7473, + "step": 21851 + }, + { + "epoch": 6.707182320441989, + "grad_norm": 0.23424866795539856, + "learning_rate": 2.5844047664371218e-05, + "loss": 1.7016, + "step": 21852 + }, + { + "epoch": 6.707489257213014, + "grad_norm": 0.2125028669834137, + "learning_rate": 2.5839695786823964e-05, + "loss": 1.8296, + "step": 21853 + }, + { + "epoch": 6.70779619398404, + "grad_norm": 0.2533423900604248, + "learning_rate": 2.5835344148042972e-05, + "loss": 1.7237, + "step": 21854 + }, + { + "epoch": 6.708103130755065, + "grad_norm": 0.1951744705438614, + "learning_rate": 2.583099274807132e-05, + "loss": 1.6685, + "step": 21855 + }, + { + "epoch": 6.708410067526089, + "grad_norm": 0.2564519941806793, + "learning_rate": 2.5826641586951938e-05, + "loss": 1.7542, + "step": 21856 + }, + { + "epoch": 6.708717004297115, + "grad_norm": 0.2586502134799957, + "learning_rate": 2.5822290664727856e-05, + "loss": 1.7477, + "step": 21857 + }, + { + "epoch": 6.70902394106814, + "grad_norm": 0.30357107520103455, + "learning_rate": 2.5817939981442062e-05, + "loss": 1.7454, + "step": 21858 + }, + { + "epoch": 6.709330877839165, + "grad_norm": 0.20547500252723694, + 
"learning_rate": 2.5813589537137544e-05, + "loss": 1.7517, + "step": 21859 + }, + { + "epoch": 6.70963781461019, + "grad_norm": 0.2961783707141876, + "learning_rate": 2.5809239331857348e-05, + "loss": 1.698, + "step": 21860 + }, + { + "epoch": 6.709944751381215, + "grad_norm": 0.2062019556760788, + "learning_rate": 2.580488936564439e-05, + "loss": 1.7358, + "step": 21861 + }, + { + "epoch": 6.7102516881522405, + "grad_norm": 0.22287480533123016, + "learning_rate": 2.580053963854173e-05, + "loss": 1.7099, + "step": 21862 + }, + { + "epoch": 6.710558624923266, + "grad_norm": 0.1853112131357193, + "learning_rate": 2.579619015059229e-05, + "loss": 1.7493, + "step": 21863 + }, + { + "epoch": 6.710865561694291, + "grad_norm": 0.24855247139930725, + "learning_rate": 2.5791840901839105e-05, + "loss": 1.7248, + "step": 21864 + }, + { + "epoch": 6.7111724984653165, + "grad_norm": 0.18156948685646057, + "learning_rate": 2.5787491892325126e-05, + "loss": 1.6744, + "step": 21865 + }, + { + "epoch": 6.711479435236341, + "grad_norm": 0.3272082209587097, + "learning_rate": 2.5783143122093357e-05, + "loss": 1.7546, + "step": 21866 + }, + { + "epoch": 6.711786372007366, + "grad_norm": 0.2875421643257141, + "learning_rate": 2.577879459118675e-05, + "loss": 1.6477, + "step": 21867 + }, + { + "epoch": 6.712093308778392, + "grad_norm": 0.19682031869888306, + "learning_rate": 2.5774446299648297e-05, + "loss": 1.7455, + "step": 21868 + }, + { + "epoch": 6.712400245549417, + "grad_norm": 0.32829195261001587, + "learning_rate": 2.5770098247520968e-05, + "loss": 1.7817, + "step": 21869 + }, + { + "epoch": 6.712707182320442, + "grad_norm": 0.26227760314941406, + "learning_rate": 2.5765750434847724e-05, + "loss": 1.763, + "step": 21870 + }, + { + "epoch": 6.713014119091467, + "grad_norm": 0.2902637720108032, + "learning_rate": 2.576140286167152e-05, + "loss": 1.7432, + "step": 21871 + }, + { + "epoch": 6.713321055862492, + "grad_norm": 0.2290763407945633, + "learning_rate": 
2.5757055528035377e-05, + "loss": 1.7149, + "step": 21872 + }, + { + "epoch": 6.713627992633517, + "grad_norm": 0.3445907533168793, + "learning_rate": 2.575270843398221e-05, + "loss": 1.7874, + "step": 21873 + }, + { + "epoch": 6.713934929404543, + "grad_norm": 0.1841191053390503, + "learning_rate": 2.574836157955498e-05, + "loss": 1.6954, + "step": 21874 + }, + { + "epoch": 6.714241866175568, + "grad_norm": 0.24168385565280914, + "learning_rate": 2.5744014964796657e-05, + "loss": 1.7153, + "step": 21875 + }, + { + "epoch": 6.714548802946593, + "grad_norm": 0.17855188250541687, + "learning_rate": 2.5739668589750175e-05, + "loss": 1.7329, + "step": 21876 + }, + { + "epoch": 6.714855739717618, + "grad_norm": 0.189789280295372, + "learning_rate": 2.5735322454458554e-05, + "loss": 1.6854, + "step": 21877 + }, + { + "epoch": 6.715162676488643, + "grad_norm": 0.1792519986629486, + "learning_rate": 2.5730976558964647e-05, + "loss": 1.7483, + "step": 21878 + }, + { + "epoch": 6.7154696132596685, + "grad_norm": 0.24460360407829285, + "learning_rate": 2.5726630903311504e-05, + "loss": 1.8337, + "step": 21879 + }, + { + "epoch": 6.715776550030694, + "grad_norm": 0.21612058579921722, + "learning_rate": 2.572228548754198e-05, + "loss": 1.7293, + "step": 21880 + }, + { + "epoch": 6.716083486801719, + "grad_norm": 0.22057892382144928, + "learning_rate": 2.5717940311699078e-05, + "loss": 1.7269, + "step": 21881 + }, + { + "epoch": 6.716390423572744, + "grad_norm": 0.19635777175426483, + "learning_rate": 2.571359537582572e-05, + "loss": 1.6744, + "step": 21882 + }, + { + "epoch": 6.716697360343769, + "grad_norm": 0.20406895875930786, + "learning_rate": 2.570925067996485e-05, + "loss": 1.6866, + "step": 21883 + }, + { + "epoch": 6.717004297114794, + "grad_norm": 0.1942419856786728, + "learning_rate": 2.5704906224159407e-05, + "loss": 1.724, + "step": 21884 + }, + { + "epoch": 6.71731123388582, + "grad_norm": 0.20423445105552673, + "learning_rate": 2.570056200845231e-05, + "loss": 
1.6709, + "step": 21885 + }, + { + "epoch": 6.717618170656845, + "grad_norm": 0.27171632647514343, + "learning_rate": 2.569621803288651e-05, + "loss": 1.7532, + "step": 21886 + }, + { + "epoch": 6.71792510742787, + "grad_norm": 0.22753871977329254, + "learning_rate": 2.5691874297504926e-05, + "loss": 1.7534, + "step": 21887 + }, + { + "epoch": 6.718232044198895, + "grad_norm": 0.1907290369272232, + "learning_rate": 2.5687530802350468e-05, + "loss": 1.6696, + "step": 21888 + }, + { + "epoch": 6.71853898096992, + "grad_norm": 0.2226637750864029, + "learning_rate": 2.568318754746612e-05, + "loss": 1.7194, + "step": 21889 + }, + { + "epoch": 6.718845917740945, + "grad_norm": 0.20878726243972778, + "learning_rate": 2.5678844532894742e-05, + "loss": 1.6878, + "step": 21890 + }, + { + "epoch": 6.719152854511971, + "grad_norm": 0.18087267875671387, + "learning_rate": 2.567450175867928e-05, + "loss": 1.7432, + "step": 21891 + }, + { + "epoch": 6.719459791282996, + "grad_norm": 0.19818328320980072, + "learning_rate": 2.567015922486265e-05, + "loss": 1.6959, + "step": 21892 + }, + { + "epoch": 6.7197667280540205, + "grad_norm": 0.19593466818332672, + "learning_rate": 2.566581693148775e-05, + "loss": 1.7357, + "step": 21893 + }, + { + "epoch": 6.720073664825046, + "grad_norm": 0.24518795311450958, + "learning_rate": 2.5661474878597546e-05, + "loss": 1.7948, + "step": 21894 + }, + { + "epoch": 6.720380601596071, + "grad_norm": 0.18471074104309082, + "learning_rate": 2.5657133066234872e-05, + "loss": 1.6983, + "step": 21895 + }, + { + "epoch": 6.7206875383670965, + "grad_norm": 0.20073382556438446, + "learning_rate": 2.5652791494442718e-05, + "loss": 1.7241, + "step": 21896 + }, + { + "epoch": 6.720994475138122, + "grad_norm": 0.21688152849674225, + "learning_rate": 2.5648450163263903e-05, + "loss": 1.7073, + "step": 21897 + }, + { + "epoch": 6.721301411909147, + "grad_norm": 0.17722688615322113, + "learning_rate": 2.5644109072741406e-05, + "loss": 1.7047, + "step": 21898 + }, + 
{ + "epoch": 6.721608348680172, + "grad_norm": 0.2060708999633789, + "learning_rate": 2.5639768222918093e-05, + "loss": 1.7246, + "step": 21899 + }, + { + "epoch": 6.721915285451197, + "grad_norm": 0.26590242981910706, + "learning_rate": 2.563542761383687e-05, + "loss": 1.8141, + "step": 21900 + }, + { + "epoch": 6.722222222222222, + "grad_norm": 0.22498780488967896, + "learning_rate": 2.5631087245540632e-05, + "loss": 1.7211, + "step": 21901 + }, + { + "epoch": 6.722529158993248, + "grad_norm": 0.20546968281269073, + "learning_rate": 2.562674711807227e-05, + "loss": 1.8001, + "step": 21902 + }, + { + "epoch": 6.722836095764272, + "grad_norm": 0.19668535888195038, + "learning_rate": 2.5622407231474683e-05, + "loss": 1.7443, + "step": 21903 + }, + { + "epoch": 6.723143032535297, + "grad_norm": 0.18932129442691803, + "learning_rate": 2.5618067585790752e-05, + "loss": 1.7307, + "step": 21904 + }, + { + "epoch": 6.723449969306323, + "grad_norm": 0.19501622021198273, + "learning_rate": 2.561372818106335e-05, + "loss": 1.7016, + "step": 21905 + }, + { + "epoch": 6.723756906077348, + "grad_norm": 0.21313562989234924, + "learning_rate": 2.5609389017335416e-05, + "loss": 1.8012, + "step": 21906 + }, + { + "epoch": 6.724063842848373, + "grad_norm": 0.174738347530365, + "learning_rate": 2.560505009464978e-05, + "loss": 1.6824, + "step": 21907 + }, + { + "epoch": 6.724370779619399, + "grad_norm": 0.20349650084972382, + "learning_rate": 2.560071141304934e-05, + "loss": 1.7813, + "step": 21908 + }, + { + "epoch": 6.724677716390423, + "grad_norm": 0.21878227591514587, + "learning_rate": 2.5596372972576967e-05, + "loss": 1.8166, + "step": 21909 + }, + { + "epoch": 6.7249846531614486, + "grad_norm": 0.2082633078098297, + "learning_rate": 2.559203477327552e-05, + "loss": 1.7197, + "step": 21910 + }, + { + "epoch": 6.725291589932474, + "grad_norm": 0.17738287150859833, + "learning_rate": 2.558769681518792e-05, + "loss": 1.7093, + "step": 21911 + }, + { + "epoch": 6.725598526703499, + 
"grad_norm": 0.1930074542760849, + "learning_rate": 2.5583359098356986e-05, + "loss": 1.7702, + "step": 21912 + }, + { + "epoch": 6.725905463474525, + "grad_norm": 0.17668531835079193, + "learning_rate": 2.5579021622825638e-05, + "loss": 1.7466, + "step": 21913 + }, + { + "epoch": 6.726212400245549, + "grad_norm": 0.1737186163663864, + "learning_rate": 2.5574684388636677e-05, + "loss": 1.6876, + "step": 21914 + }, + { + "epoch": 6.726519337016574, + "grad_norm": 0.18352502584457397, + "learning_rate": 2.5570347395833018e-05, + "loss": 1.6745, + "step": 21915 + }, + { + "epoch": 6.7268262737876, + "grad_norm": 0.19047673046588898, + "learning_rate": 2.5566010644457506e-05, + "loss": 1.7465, + "step": 21916 + }, + { + "epoch": 6.727133210558625, + "grad_norm": 0.1762397438287735, + "learning_rate": 2.5561674134553005e-05, + "loss": 1.6767, + "step": 21917 + }, + { + "epoch": 6.72744014732965, + "grad_norm": 0.22884784638881683, + "learning_rate": 2.5557337866162358e-05, + "loss": 1.7054, + "step": 21918 + }, + { + "epoch": 6.727747084100676, + "grad_norm": 0.17476098239421844, + "learning_rate": 2.5553001839328417e-05, + "loss": 1.721, + "step": 21919 + }, + { + "epoch": 6.7280540208717, + "grad_norm": 0.1827213317155838, + "learning_rate": 2.554866605409405e-05, + "loss": 1.78, + "step": 21920 + }, + { + "epoch": 6.7283609576427255, + "grad_norm": 0.21709343791007996, + "learning_rate": 2.554433051050209e-05, + "loss": 1.8064, + "step": 21921 + }, + { + "epoch": 6.728667894413751, + "grad_norm": 0.1972692310810089, + "learning_rate": 2.5539995208595398e-05, + "loss": 1.7231, + "step": 21922 + }, + { + "epoch": 6.728974831184776, + "grad_norm": 0.19464808702468872, + "learning_rate": 2.5535660148416802e-05, + "loss": 1.7931, + "step": 21923 + }, + { + "epoch": 6.7292817679558015, + "grad_norm": 0.19610099494457245, + "learning_rate": 2.5531325330009158e-05, + "loss": 1.7467, + "step": 21924 + }, + { + "epoch": 6.729588704726826, + "grad_norm": 0.21104763448238373, + 
"learning_rate": 2.5526990753415292e-05, + "loss": 1.7543, + "step": 21925 + }, + { + "epoch": 6.729895641497851, + "grad_norm": 0.1881588101387024, + "learning_rate": 2.5522656418678047e-05, + "loss": 1.7666, + "step": 21926 + }, + { + "epoch": 6.730202578268877, + "grad_norm": 0.2163291722536087, + "learning_rate": 2.551832232584025e-05, + "loss": 1.7321, + "step": 21927 + }, + { + "epoch": 6.730509515039902, + "grad_norm": 0.19252021610736847, + "learning_rate": 2.551398847494477e-05, + "loss": 1.7287, + "step": 21928 + }, + { + "epoch": 6.730816451810927, + "grad_norm": 0.22602233290672302, + "learning_rate": 2.550965486603437e-05, + "loss": 1.767, + "step": 21929 + }, + { + "epoch": 6.731123388581953, + "grad_norm": 0.21509617567062378, + "learning_rate": 2.5505321499151957e-05, + "loss": 1.7637, + "step": 21930 + }, + { + "epoch": 6.731430325352977, + "grad_norm": 0.24291658401489258, + "learning_rate": 2.5500988374340274e-05, + "loss": 1.7312, + "step": 21931 + }, + { + "epoch": 6.731737262124002, + "grad_norm": 0.26562216877937317, + "learning_rate": 2.5496655491642195e-05, + "loss": 1.7763, + "step": 21932 + }, + { + "epoch": 6.732044198895028, + "grad_norm": 0.19785790145397186, + "learning_rate": 2.5492322851100535e-05, + "loss": 1.6979, + "step": 21933 + }, + { + "epoch": 6.732351135666053, + "grad_norm": 0.20044486224651337, + "learning_rate": 2.5487990452758104e-05, + "loss": 1.7359, + "step": 21934 + }, + { + "epoch": 6.7326580724370775, + "grad_norm": 0.20468659698963165, + "learning_rate": 2.548365829665772e-05, + "loss": 1.6996, + "step": 21935 + }, + { + "epoch": 6.732965009208103, + "grad_norm": 0.16516120731830597, + "learning_rate": 2.5479326382842195e-05, + "loss": 1.717, + "step": 21936 + }, + { + "epoch": 6.733271945979128, + "grad_norm": 0.22404411435127258, + "learning_rate": 2.547499471135433e-05, + "loss": 1.7261, + "step": 21937 + }, + { + "epoch": 6.7335788827501535, + "grad_norm": 0.21485663950443268, + "learning_rate": 
2.547066328223695e-05, + "loss": 1.7463, + "step": 21938 + }, + { + "epoch": 6.733885819521179, + "grad_norm": 0.330018550157547, + "learning_rate": 2.5466332095532853e-05, + "loss": 1.854, + "step": 21939 + }, + { + "epoch": 6.734192756292204, + "grad_norm": 0.25225213170051575, + "learning_rate": 2.5462001151284842e-05, + "loss": 1.722, + "step": 21940 + }, + { + "epoch": 6.734499693063229, + "grad_norm": 0.2422008365392685, + "learning_rate": 2.5457670449535713e-05, + "loss": 1.6996, + "step": 21941 + }, + { + "epoch": 6.734806629834254, + "grad_norm": 0.2421465814113617, + "learning_rate": 2.5453339990328275e-05, + "loss": 1.7014, + "step": 21942 + }, + { + "epoch": 6.735113566605279, + "grad_norm": 0.2520611882209778, + "learning_rate": 2.5449009773705313e-05, + "loss": 1.7149, + "step": 21943 + }, + { + "epoch": 6.735420503376305, + "grad_norm": 0.24940338730812073, + "learning_rate": 2.5444679799709626e-05, + "loss": 1.7423, + "step": 21944 + }, + { + "epoch": 6.73572744014733, + "grad_norm": 0.2328663021326065, + "learning_rate": 2.544035006838401e-05, + "loss": 1.6893, + "step": 21945 + }, + { + "epoch": 6.736034376918354, + "grad_norm": 0.2190757393836975, + "learning_rate": 2.5436020579771226e-05, + "loss": 1.7375, + "step": 21946 + }, + { + "epoch": 6.73634131368938, + "grad_norm": 0.2204900085926056, + "learning_rate": 2.543169133391413e-05, + "loss": 1.6971, + "step": 21947 + }, + { + "epoch": 6.736648250460405, + "grad_norm": 0.29192328453063965, + "learning_rate": 2.5427362330855415e-05, + "loss": 1.7633, + "step": 21948 + }, + { + "epoch": 6.73695518723143, + "grad_norm": 0.19859355688095093, + "learning_rate": 2.542303357063793e-05, + "loss": 1.7515, + "step": 21949 + }, + { + "epoch": 6.737262124002456, + "grad_norm": 0.23010417819023132, + "learning_rate": 2.5418705053304425e-05, + "loss": 1.7282, + "step": 21950 + }, + { + "epoch": 6.737569060773481, + "grad_norm": 0.2168324589729309, + "learning_rate": 2.5414376778897698e-05, + "loss": 1.7347, 
+ "step": 21951 + }, + { + "epoch": 6.7378759975445055, + "grad_norm": 0.2190646231174469, + "learning_rate": 2.54100487474605e-05, + "loss": 1.7893, + "step": 21952 + }, + { + "epoch": 6.738182934315531, + "grad_norm": 0.23925794661045074, + "learning_rate": 2.5405720959035617e-05, + "loss": 1.7825, + "step": 21953 + }, + { + "epoch": 6.738489871086556, + "grad_norm": 0.17987917363643646, + "learning_rate": 2.5401393413665807e-05, + "loss": 1.724, + "step": 21954 + }, + { + "epoch": 6.7387968078575815, + "grad_norm": 0.2300983965396881, + "learning_rate": 2.5397066111393853e-05, + "loss": 1.7023, + "step": 21955 + }, + { + "epoch": 6.739103744628607, + "grad_norm": 0.2128167450428009, + "learning_rate": 2.539273905226251e-05, + "loss": 1.7218, + "step": 21956 + }, + { + "epoch": 6.739410681399631, + "grad_norm": 0.19105537235736847, + "learning_rate": 2.538841223631454e-05, + "loss": 1.7781, + "step": 21957 + }, + { + "epoch": 6.739717618170657, + "grad_norm": 0.22985289990901947, + "learning_rate": 2.5384085663592704e-05, + "loss": 1.7362, + "step": 21958 + }, + { + "epoch": 6.740024554941682, + "grad_norm": 0.18608705699443817, + "learning_rate": 2.5379759334139768e-05, + "loss": 1.7174, + "step": 21959 + }, + { + "epoch": 6.740331491712707, + "grad_norm": 0.2659450173377991, + "learning_rate": 2.5375433247998482e-05, + "loss": 1.8118, + "step": 21960 + }, + { + "epoch": 6.740638428483733, + "grad_norm": 0.1904401034116745, + "learning_rate": 2.537110740521159e-05, + "loss": 1.6789, + "step": 21961 + }, + { + "epoch": 6.740945365254758, + "grad_norm": 0.1826045662164688, + "learning_rate": 2.5366781805821847e-05, + "loss": 1.6906, + "step": 21962 + }, + { + "epoch": 6.741252302025782, + "grad_norm": 0.1919000893831253, + "learning_rate": 2.5362456449871995e-05, + "loss": 1.7412, + "step": 21963 + }, + { + "epoch": 6.741559238796808, + "grad_norm": 0.1921864151954651, + "learning_rate": 2.5358131337404822e-05, + "loss": 1.7023, + "step": 21964 + }, + { + "epoch": 
6.741866175567833, + "grad_norm": 0.1628783494234085, + "learning_rate": 2.5353806468463004e-05, + "loss": 1.6842, + "step": 21965 + }, + { + "epoch": 6.742173112338858, + "grad_norm": 0.19764694571495056, + "learning_rate": 2.534948184308935e-05, + "loss": 1.7238, + "step": 21966 + }, + { + "epoch": 6.742480049109884, + "grad_norm": 0.1845860630273819, + "learning_rate": 2.534515746132653e-05, + "loss": 1.728, + "step": 21967 + }, + { + "epoch": 6.742786985880908, + "grad_norm": 0.20269328355789185, + "learning_rate": 2.5340833323217327e-05, + "loss": 1.7541, + "step": 21968 + }, + { + "epoch": 6.7430939226519335, + "grad_norm": 0.16586242616176605, + "learning_rate": 2.5336509428804468e-05, + "loss": 1.7025, + "step": 21969 + }, + { + "epoch": 6.743400859422959, + "grad_norm": 0.1693086177110672, + "learning_rate": 2.533218577813068e-05, + "loss": 1.6975, + "step": 21970 + }, + { + "epoch": 6.743707796193984, + "grad_norm": 0.2206759750843048, + "learning_rate": 2.5327862371238686e-05, + "loss": 1.764, + "step": 21971 + }, + { + "epoch": 6.7440147329650095, + "grad_norm": 0.1915574073791504, + "learning_rate": 2.532353920817122e-05, + "loss": 1.7576, + "step": 21972 + }, + { + "epoch": 6.744321669736035, + "grad_norm": 0.1741783618927002, + "learning_rate": 2.5319216288971003e-05, + "loss": 1.7394, + "step": 21973 + }, + { + "epoch": 6.744628606507059, + "grad_norm": 0.21624934673309326, + "learning_rate": 2.5314893613680755e-05, + "loss": 1.7358, + "step": 21974 + }, + { + "epoch": 6.744935543278085, + "grad_norm": 0.2350481003522873, + "learning_rate": 2.5310571182343197e-05, + "loss": 1.7801, + "step": 21975 + }, + { + "epoch": 6.74524248004911, + "grad_norm": 0.18618559837341309, + "learning_rate": 2.5306248995001048e-05, + "loss": 1.7012, + "step": 21976 + }, + { + "epoch": 6.745549416820135, + "grad_norm": 0.18479639291763306, + "learning_rate": 2.5301927051697016e-05, + "loss": 1.7238, + "step": 21977 + }, + { + "epoch": 6.74585635359116, + "grad_norm": 
0.19978758692741394, + "learning_rate": 2.5297605352473818e-05, + "loss": 1.6636, + "step": 21978 + }, + { + "epoch": 6.746163290362185, + "grad_norm": 0.23122164607048035, + "learning_rate": 2.529328389737416e-05, + "loss": 1.7455, + "step": 21979 + }, + { + "epoch": 6.74647022713321, + "grad_norm": 0.20423240959644318, + "learning_rate": 2.5288962686440732e-05, + "loss": 1.7516, + "step": 21980 + }, + { + "epoch": 6.746777163904236, + "grad_norm": 0.18271920084953308, + "learning_rate": 2.52846417197163e-05, + "loss": 1.762, + "step": 21981 + }, + { + "epoch": 6.747084100675261, + "grad_norm": 0.19280247390270233, + "learning_rate": 2.528032099724349e-05, + "loss": 1.7298, + "step": 21982 + }, + { + "epoch": 6.747391037446286, + "grad_norm": 0.20908337831497192, + "learning_rate": 2.527600051906507e-05, + "loss": 1.7323, + "step": 21983 + }, + { + "epoch": 6.747697974217311, + "grad_norm": 0.18399856984615326, + "learning_rate": 2.5271680285223663e-05, + "loss": 1.6795, + "step": 21984 + }, + { + "epoch": 6.748004910988336, + "grad_norm": 0.2273191213607788, + "learning_rate": 2.5267360295762033e-05, + "loss": 1.6811, + "step": 21985 + }, + { + "epoch": 6.7483118477593615, + "grad_norm": 0.1844841092824936, + "learning_rate": 2.526304055072284e-05, + "loss": 1.7404, + "step": 21986 + }, + { + "epoch": 6.748618784530387, + "grad_norm": 0.25975871086120605, + "learning_rate": 2.5258721050148775e-05, + "loss": 1.6994, + "step": 21987 + }, + { + "epoch": 6.748925721301412, + "grad_norm": 0.1664818376302719, + "learning_rate": 2.5254401794082532e-05, + "loss": 1.6722, + "step": 21988 + }, + { + "epoch": 6.749232658072437, + "grad_norm": 0.2597639560699463, + "learning_rate": 2.5250082782566796e-05, + "loss": 1.7654, + "step": 21989 + }, + { + "epoch": 6.749539594843462, + "grad_norm": 0.19326356053352356, + "learning_rate": 2.5245764015644248e-05, + "loss": 1.668, + "step": 21990 + }, + { + "epoch": 6.749846531614487, + "grad_norm": 0.22924599051475525, + 
"learning_rate": 2.5241445493357574e-05, + "loss": 1.7522, + "step": 21991 + }, + { + "epoch": 6.750153468385513, + "grad_norm": 0.24588358402252197, + "learning_rate": 2.523712721574944e-05, + "loss": 1.7396, + "step": 21992 + }, + { + "epoch": 6.750460405156538, + "grad_norm": 0.1988971084356308, + "learning_rate": 2.5232809182862526e-05, + "loss": 1.7338, + "step": 21993 + }, + { + "epoch": 6.750767341927563, + "grad_norm": 0.18566425144672394, + "learning_rate": 2.5228491394739518e-05, + "loss": 1.7135, + "step": 21994 + }, + { + "epoch": 6.751074278698588, + "grad_norm": 0.22216622531414032, + "learning_rate": 2.5224173851423073e-05, + "loss": 1.744, + "step": 21995 + }, + { + "epoch": 6.751381215469613, + "grad_norm": 0.18695887923240662, + "learning_rate": 2.5219856552955863e-05, + "loss": 1.7324, + "step": 21996 + }, + { + "epoch": 6.7516881522406385, + "grad_norm": 0.1866987645626068, + "learning_rate": 2.5215539499380535e-05, + "loss": 1.6855, + "step": 21997 + }, + { + "epoch": 6.751995089011664, + "grad_norm": 0.1743573248386383, + "learning_rate": 2.521122269073981e-05, + "loss": 1.6833, + "step": 21998 + }, + { + "epoch": 6.752302025782689, + "grad_norm": 0.2173541784286499, + "learning_rate": 2.5206906127076274e-05, + "loss": 1.7434, + "step": 21999 + }, + { + "epoch": 6.752608962553714, + "grad_norm": 0.17558147013187408, + "learning_rate": 2.5202589808432665e-05, + "loss": 1.6884, + "step": 22000 + }, + { + "epoch": 6.752915899324739, + "grad_norm": 0.16630353033542633, + "learning_rate": 2.5198273734851553e-05, + "loss": 1.7005, + "step": 22001 + }, + { + "epoch": 6.753222836095764, + "grad_norm": 0.1834949105978012, + "learning_rate": 2.519395790637566e-05, + "loss": 1.7123, + "step": 22002 + }, + { + "epoch": 6.75352977286679, + "grad_norm": 0.1806751936674118, + "learning_rate": 2.5189642323047614e-05, + "loss": 1.7305, + "step": 22003 + }, + { + "epoch": 6.753836709637815, + "grad_norm": 0.2350265085697174, + "learning_rate": 
2.5185326984910062e-05, + "loss": 1.772, + "step": 22004 + }, + { + "epoch": 6.75414364640884, + "grad_norm": 0.18105818331241608, + "learning_rate": 2.518101189200566e-05, + "loss": 1.7487, + "step": 22005 + }, + { + "epoch": 6.754450583179865, + "grad_norm": 0.17640845477581024, + "learning_rate": 2.517669704437704e-05, + "loss": 1.7178, + "step": 22006 + }, + { + "epoch": 6.75475751995089, + "grad_norm": 0.21648885309696198, + "learning_rate": 2.5172382442066845e-05, + "loss": 1.7144, + "step": 22007 + }, + { + "epoch": 6.755064456721915, + "grad_norm": 0.2042703926563263, + "learning_rate": 2.5168068085117724e-05, + "loss": 1.7476, + "step": 22008 + }, + { + "epoch": 6.755371393492941, + "grad_norm": 0.24397306144237518, + "learning_rate": 2.5163753973572306e-05, + "loss": 1.7033, + "step": 22009 + }, + { + "epoch": 6.755678330263965, + "grad_norm": 0.2030377835035324, + "learning_rate": 2.5159440107473232e-05, + "loss": 1.7353, + "step": 22010 + }, + { + "epoch": 6.7559852670349905, + "grad_norm": 0.2493598908185959, + "learning_rate": 2.5155126486863127e-05, + "loss": 1.7346, + "step": 22011 + }, + { + "epoch": 6.756292203806016, + "grad_norm": 0.17272062599658966, + "learning_rate": 2.5150813111784627e-05, + "loss": 1.7095, + "step": 22012 + }, + { + "epoch": 6.756599140577041, + "grad_norm": 0.2417706698179245, + "learning_rate": 2.514649998228036e-05, + "loss": 1.7631, + "step": 22013 + }, + { + "epoch": 6.7569060773480665, + "grad_norm": 0.17753612995147705, + "learning_rate": 2.5142187098392915e-05, + "loss": 1.697, + "step": 22014 + }, + { + "epoch": 6.757213014119092, + "grad_norm": 0.2246367186307907, + "learning_rate": 2.5137874460164995e-05, + "loss": 1.7216, + "step": 22015 + }, + { + "epoch": 6.757519950890116, + "grad_norm": 0.24141135811805725, + "learning_rate": 2.5133562067639134e-05, + "loss": 1.7368, + "step": 22016 + }, + { + "epoch": 6.757826887661142, + "grad_norm": 0.21253570914268494, + "learning_rate": 2.5129249920858022e-05, + "loss": 
1.7029, + "step": 22017 + }, + { + "epoch": 6.758133824432167, + "grad_norm": 0.21176676452159882, + "learning_rate": 2.5124938019864198e-05, + "loss": 1.7472, + "step": 22018 + }, + { + "epoch": 6.758440761203192, + "grad_norm": 0.1990927904844284, + "learning_rate": 2.5120626364700338e-05, + "loss": 1.6686, + "step": 22019 + }, + { + "epoch": 6.758747697974218, + "grad_norm": 0.1736145317554474, + "learning_rate": 2.5116314955409038e-05, + "loss": 1.6984, + "step": 22020 + }, + { + "epoch": 6.759054634745242, + "grad_norm": 0.2618037462234497, + "learning_rate": 2.511200379203289e-05, + "loss": 1.7374, + "step": 22021 + }, + { + "epoch": 6.759361571516267, + "grad_norm": 0.25363266468048096, + "learning_rate": 2.5107692874614507e-05, + "loss": 1.7001, + "step": 22022 + }, + { + "epoch": 6.759668508287293, + "grad_norm": 0.20287153124809265, + "learning_rate": 2.51033822031965e-05, + "loss": 1.7704, + "step": 22023 + }, + { + "epoch": 6.759975445058318, + "grad_norm": 0.2401949167251587, + "learning_rate": 2.509907177782146e-05, + "loss": 1.7157, + "step": 22024 + }, + { + "epoch": 6.760282381829343, + "grad_norm": 0.177081897854805, + "learning_rate": 2.5094761598531985e-05, + "loss": 1.7572, + "step": 22025 + }, + { + "epoch": 6.760589318600369, + "grad_norm": 0.2641974687576294, + "learning_rate": 2.5090451665370674e-05, + "loss": 1.725, + "step": 22026 + }, + { + "epoch": 6.760896255371393, + "grad_norm": 0.20262297987937927, + "learning_rate": 2.5086141978380116e-05, + "loss": 1.6591, + "step": 22027 + }, + { + "epoch": 6.7612031921424185, + "grad_norm": 0.19107301533222198, + "learning_rate": 2.5081832537602913e-05, + "loss": 1.6914, + "step": 22028 + }, + { + "epoch": 6.761510128913444, + "grad_norm": 0.28122687339782715, + "learning_rate": 2.5077523343081643e-05, + "loss": 1.7759, + "step": 22029 + }, + { + "epoch": 6.761817065684469, + "grad_norm": 0.16575101017951965, + "learning_rate": 2.5073214394858897e-05, + "loss": 1.6994, + "step": 22030 + }, + { + 
"epoch": 6.7621240024554945, + "grad_norm": 0.26933449506759644, + "learning_rate": 2.506890569297723e-05, + "loss": 1.7565, + "step": 22031 + }, + { + "epoch": 6.762430939226519, + "grad_norm": 0.2452966868877411, + "learning_rate": 2.5064597237479292e-05, + "loss": 1.7442, + "step": 22032 + }, + { + "epoch": 6.762737875997544, + "grad_norm": 0.20781855285167694, + "learning_rate": 2.5060289028407585e-05, + "loss": 1.714, + "step": 22033 + }, + { + "epoch": 6.76304481276857, + "grad_norm": 0.1997823268175125, + "learning_rate": 2.5055981065804756e-05, + "loss": 1.7318, + "step": 22034 + }, + { + "epoch": 6.763351749539595, + "grad_norm": 0.2080194652080536, + "learning_rate": 2.50516733497133e-05, + "loss": 1.7466, + "step": 22035 + }, + { + "epoch": 6.76365868631062, + "grad_norm": 0.17558889091014862, + "learning_rate": 2.504736588017585e-05, + "loss": 1.7049, + "step": 22036 + }, + { + "epoch": 6.763965623081646, + "grad_norm": 0.1999572217464447, + "learning_rate": 2.5043058657234957e-05, + "loss": 1.7121, + "step": 22037 + }, + { + "epoch": 6.76427255985267, + "grad_norm": 0.16219176352024078, + "learning_rate": 2.5038751680933185e-05, + "loss": 1.698, + "step": 22038 + }, + { + "epoch": 6.764579496623695, + "grad_norm": 0.17965151369571686, + "learning_rate": 2.50344449513131e-05, + "loss": 1.7021, + "step": 22039 + }, + { + "epoch": 6.764886433394721, + "grad_norm": 0.18831093609333038, + "learning_rate": 2.5030138468417263e-05, + "loss": 1.7049, + "step": 22040 + }, + { + "epoch": 6.765193370165746, + "grad_norm": 0.20622828602790833, + "learning_rate": 2.5025832232288236e-05, + "loss": 1.7834, + "step": 22041 + }, + { + "epoch": 6.765500306936771, + "grad_norm": 0.22746746242046356, + "learning_rate": 2.5021526242968574e-05, + "loss": 1.7426, + "step": 22042 + }, + { + "epoch": 6.765807243707796, + "grad_norm": 0.2048977166414261, + "learning_rate": 2.5017220500500828e-05, + "loss": 1.7192, + "step": 22043 + }, + { + "epoch": 6.766114180478821, + 
"grad_norm": 0.19647538661956787, + "learning_rate": 2.5012915004927546e-05, + "loss": 1.6738, + "step": 22044 + }, + { + "epoch": 6.7664211172498465, + "grad_norm": 0.2133142054080963, + "learning_rate": 2.5008609756291284e-05, + "loss": 1.7482, + "step": 22045 + }, + { + "epoch": 6.766728054020872, + "grad_norm": 0.23578259348869324, + "learning_rate": 2.500430475463459e-05, + "loss": 1.696, + "step": 22046 + }, + { + "epoch": 6.767034990791897, + "grad_norm": 0.24862529337406158, + "learning_rate": 2.500000000000001e-05, + "loss": 1.7508, + "step": 22047 + }, + { + "epoch": 6.7673419275629225, + "grad_norm": 0.22704963386058807, + "learning_rate": 2.4995695492430066e-05, + "loss": 1.7739, + "step": 22048 + }, + { + "epoch": 6.767648864333947, + "grad_norm": 0.20216481387615204, + "learning_rate": 2.4991391231967347e-05, + "loss": 1.7406, + "step": 22049 + }, + { + "epoch": 6.767955801104972, + "grad_norm": 0.18778519332408905, + "learning_rate": 2.498708721865432e-05, + "loss": 1.683, + "step": 22050 + }, + { + "epoch": 6.768262737875998, + "grad_norm": 0.21680599451065063, + "learning_rate": 2.4982783452533597e-05, + "loss": 1.7652, + "step": 22051 + }, + { + "epoch": 6.768569674647023, + "grad_norm": 0.16952121257781982, + "learning_rate": 2.4978479933647637e-05, + "loss": 1.6551, + "step": 22052 + }, + { + "epoch": 6.768876611418047, + "grad_norm": 0.1979489028453827, + "learning_rate": 2.4974176662039017e-05, + "loss": 1.7399, + "step": 22053 + }, + { + "epoch": 6.769183548189073, + "grad_norm": 0.18934862315654755, + "learning_rate": 2.496987363775025e-05, + "loss": 1.7228, + "step": 22054 + }, + { + "epoch": 6.769490484960098, + "grad_norm": 0.17551462352275848, + "learning_rate": 2.496557086082387e-05, + "loss": 1.6725, + "step": 22055 + }, + { + "epoch": 6.769797421731123, + "grad_norm": 0.23561003804206848, + "learning_rate": 2.496126833130239e-05, + "loss": 1.7606, + "step": 22056 + }, + { + "epoch": 6.770104358502149, + "grad_norm": 
0.19105803966522217, + "learning_rate": 2.4956966049228324e-05, + "loss": 1.6975, + "step": 22057 + }, + { + "epoch": 6.770411295273174, + "grad_norm": 0.28581124544143677, + "learning_rate": 2.4952664014644204e-05, + "loss": 1.7408, + "step": 22058 + }, + { + "epoch": 6.7707182320441985, + "grad_norm": 0.20723536610603333, + "learning_rate": 2.494836222759254e-05, + "loss": 1.752, + "step": 22059 + }, + { + "epoch": 6.771025168815224, + "grad_norm": 0.2089354693889618, + "learning_rate": 2.4944060688115846e-05, + "loss": 1.6662, + "step": 22060 + }, + { + "epoch": 6.771332105586249, + "grad_norm": 0.2299557626247406, + "learning_rate": 2.4939759396256625e-05, + "loss": 1.7978, + "step": 22061 + }, + { + "epoch": 6.7716390423572745, + "grad_norm": 0.17900820076465607, + "learning_rate": 2.493545835205739e-05, + "loss": 1.6876, + "step": 22062 + }, + { + "epoch": 6.7719459791283, + "grad_norm": 0.21412713825702667, + "learning_rate": 2.4931157555560648e-05, + "loss": 1.7347, + "step": 22063 + }, + { + "epoch": 6.772252915899324, + "grad_norm": 0.24448172748088837, + "learning_rate": 2.49268570068089e-05, + "loss": 1.7611, + "step": 22064 + }, + { + "epoch": 6.77255985267035, + "grad_norm": 0.20153972506523132, + "learning_rate": 2.4922556705844624e-05, + "loss": 1.7347, + "step": 22065 + }, + { + "epoch": 6.772866789441375, + "grad_norm": 0.2142268568277359, + "learning_rate": 2.4918256652710387e-05, + "loss": 1.7548, + "step": 22066 + }, + { + "epoch": 6.7731737262124, + "grad_norm": 0.19735601544380188, + "learning_rate": 2.4913956847448595e-05, + "loss": 1.7138, + "step": 22067 + }, + { + "epoch": 6.773480662983426, + "grad_norm": 0.1847008913755417, + "learning_rate": 2.4909657290101824e-05, + "loss": 1.6812, + "step": 22068 + }, + { + "epoch": 6.773787599754451, + "grad_norm": 0.18406464159488678, + "learning_rate": 2.4905357980712486e-05, + "loss": 1.6992, + "step": 22069 + }, + { + "epoch": 6.774094536525475, + "grad_norm": 0.19595865905284882, + 
"learning_rate": 2.490105891932313e-05, + "loss": 1.7118, + "step": 22070 + }, + { + "epoch": 6.774401473296501, + "grad_norm": 0.1929878294467926, + "learning_rate": 2.4896760105976218e-05, + "loss": 1.7187, + "step": 22071 + }, + { + "epoch": 6.774708410067526, + "grad_norm": 0.23972687125205994, + "learning_rate": 2.4892461540714242e-05, + "loss": 1.7293, + "step": 22072 + }, + { + "epoch": 6.7750153468385514, + "grad_norm": 0.18744204938411713, + "learning_rate": 2.4888163223579675e-05, + "loss": 1.7102, + "step": 22073 + }, + { + "epoch": 6.775322283609577, + "grad_norm": 0.20168112218379974, + "learning_rate": 2.4883865154614994e-05, + "loss": 1.7655, + "step": 22074 + }, + { + "epoch": 6.775629220380601, + "grad_norm": 0.22825658321380615, + "learning_rate": 2.487956733386268e-05, + "loss": 1.7251, + "step": 22075 + }, + { + "epoch": 6.775936157151627, + "grad_norm": 0.19441691040992737, + "learning_rate": 2.4875269761365205e-05, + "loss": 1.7657, + "step": 22076 + }, + { + "epoch": 6.776243093922652, + "grad_norm": 0.22861605882644653, + "learning_rate": 2.487097243716504e-05, + "loss": 1.7132, + "step": 22077 + }, + { + "epoch": 6.776550030693677, + "grad_norm": 0.19157674908638, + "learning_rate": 2.486667536130466e-05, + "loss": 1.7448, + "step": 22078 + }, + { + "epoch": 6.776856967464703, + "grad_norm": 0.2203369438648224, + "learning_rate": 2.486237853382652e-05, + "loss": 1.7535, + "step": 22079 + }, + { + "epoch": 6.777163904235728, + "grad_norm": 0.16477027535438538, + "learning_rate": 2.4858081954773088e-05, + "loss": 1.706, + "step": 22080 + }, + { + "epoch": 6.777470841006752, + "grad_norm": 0.16536933183670044, + "learning_rate": 2.4853785624186827e-05, + "loss": 1.6725, + "step": 22081 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.18266050517559052, + "learning_rate": 2.4849489542110176e-05, + "loss": 1.6799, + "step": 22082 + }, + { + "epoch": 6.778084714548803, + "grad_norm": 0.21422190964221954, + "learning_rate": 
2.4845193708585647e-05, + "loss": 1.7275, + "step": 22083 + }, + { + "epoch": 6.778391651319828, + "grad_norm": 0.19356754422187805, + "learning_rate": 2.4840898123655622e-05, + "loss": 1.7172, + "step": 22084 + }, + { + "epoch": 6.778698588090853, + "grad_norm": 0.21090209484100342, + "learning_rate": 2.4836602787362628e-05, + "loss": 1.6581, + "step": 22085 + }, + { + "epoch": 6.779005524861878, + "grad_norm": 0.20072491466999054, + "learning_rate": 2.483230769974903e-05, + "loss": 1.7398, + "step": 22086 + }, + { + "epoch": 6.7793124616329035, + "grad_norm": 0.20642702281475067, + "learning_rate": 2.482801286085734e-05, + "loss": 1.7505, + "step": 22087 + }, + { + "epoch": 6.779619398403929, + "grad_norm": 0.20322991907596588, + "learning_rate": 2.4823718270729985e-05, + "loss": 1.6693, + "step": 22088 + }, + { + "epoch": 6.779926335174954, + "grad_norm": 0.17060843110084534, + "learning_rate": 2.4819423929409396e-05, + "loss": 1.6746, + "step": 22089 + }, + { + "epoch": 6.7802332719459795, + "grad_norm": 0.20697785913944244, + "learning_rate": 2.4815129836938024e-05, + "loss": 1.7413, + "step": 22090 + }, + { + "epoch": 6.780540208717004, + "grad_norm": 0.19845673441886902, + "learning_rate": 2.48108359933583e-05, + "loss": 1.694, + "step": 22091 + }, + { + "epoch": 6.780847145488029, + "grad_norm": 0.24547794461250305, + "learning_rate": 2.4806542398712657e-05, + "loss": 1.7316, + "step": 22092 + }, + { + "epoch": 6.781154082259055, + "grad_norm": 0.15587118268013, + "learning_rate": 2.4802249053043526e-05, + "loss": 1.667, + "step": 22093 + }, + { + "epoch": 6.78146101903008, + "grad_norm": 0.22754593193531036, + "learning_rate": 2.4797955956393336e-05, + "loss": 1.7504, + "step": 22094 + }, + { + "epoch": 6.781767955801105, + "grad_norm": 0.201420396566391, + "learning_rate": 2.4793663108804528e-05, + "loss": 1.749, + "step": 22095 + }, + { + "epoch": 6.78207489257213, + "grad_norm": 0.1952153891324997, + "learning_rate": 2.4789370510319504e-05, + "loss": 
1.7306, + "step": 22096 + }, + { + "epoch": 6.782381829343155, + "grad_norm": 0.16750730574131012, + "learning_rate": 2.4785078160980703e-05, + "loss": 1.6775, + "step": 22097 + }, + { + "epoch": 6.78268876611418, + "grad_norm": 0.19943620264530182, + "learning_rate": 2.4780786060830535e-05, + "loss": 1.7233, + "step": 22098 + }, + { + "epoch": 6.782995702885206, + "grad_norm": 0.21302999556064606, + "learning_rate": 2.4776494209911423e-05, + "loss": 1.798, + "step": 22099 + }, + { + "epoch": 6.783302639656231, + "grad_norm": 0.22949734330177307, + "learning_rate": 2.4772202608265776e-05, + "loss": 1.7678, + "step": 22100 + }, + { + "epoch": 6.783609576427256, + "grad_norm": 0.20945954322814941, + "learning_rate": 2.4767911255935993e-05, + "loss": 1.701, + "step": 22101 + }, + { + "epoch": 6.783916513198281, + "grad_norm": 0.189425989985466, + "learning_rate": 2.476362015296454e-05, + "loss": 1.7152, + "step": 22102 + }, + { + "epoch": 6.784223449969306, + "grad_norm": 0.18826924264431, + "learning_rate": 2.4759329299393747e-05, + "loss": 1.7004, + "step": 22103 + }, + { + "epoch": 6.7845303867403315, + "grad_norm": 0.20359934866428375, + "learning_rate": 2.475503869526607e-05, + "loss": 1.705, + "step": 22104 + }, + { + "epoch": 6.784837323511357, + "grad_norm": 0.22381560504436493, + "learning_rate": 2.4750748340623896e-05, + "loss": 1.7345, + "step": 22105 + }, + { + "epoch": 6.785144260282382, + "grad_norm": 0.1750476062297821, + "learning_rate": 2.474645823550963e-05, + "loss": 1.7084, + "step": 22106 + }, + { + "epoch": 6.785451197053407, + "grad_norm": 0.17943856120109558, + "learning_rate": 2.4742168379965662e-05, + "loss": 1.7417, + "step": 22107 + }, + { + "epoch": 6.785758133824432, + "grad_norm": 0.21809861063957214, + "learning_rate": 2.4737878774034397e-05, + "loss": 1.7197, + "step": 22108 + }, + { + "epoch": 6.786065070595457, + "grad_norm": 0.19761307537555695, + "learning_rate": 2.473358941775821e-05, + "loss": 1.6763, + "step": 22109 + }, + { + 
"epoch": 6.786372007366483, + "grad_norm": 0.19513878226280212, + "learning_rate": 2.472930031117951e-05, + "loss": 1.6859, + "step": 22110 + }, + { + "epoch": 6.786678944137508, + "grad_norm": 0.21796870231628418, + "learning_rate": 2.4725011454340675e-05, + "loss": 1.6957, + "step": 22111 + }, + { + "epoch": 6.786985880908533, + "grad_norm": 0.1885530948638916, + "learning_rate": 2.4720722847284088e-05, + "loss": 1.731, + "step": 22112 + }, + { + "epoch": 6.787292817679558, + "grad_norm": 0.2108110785484314, + "learning_rate": 2.4716434490052137e-05, + "loss": 1.7985, + "step": 22113 + }, + { + "epoch": 6.787599754450583, + "grad_norm": 0.23425176739692688, + "learning_rate": 2.4712146382687194e-05, + "loss": 1.7177, + "step": 22114 + }, + { + "epoch": 6.787906691221608, + "grad_norm": 0.17368707060813904, + "learning_rate": 2.4707858525231652e-05, + "loss": 1.7158, + "step": 22115 + }, + { + "epoch": 6.788213627992634, + "grad_norm": 0.22731448709964752, + "learning_rate": 2.470357091772787e-05, + "loss": 1.7037, + "step": 22116 + }, + { + "epoch": 6.788520564763659, + "grad_norm": 0.19142407178878784, + "learning_rate": 2.469928356021823e-05, + "loss": 1.7283, + "step": 22117 + }, + { + "epoch": 6.7888275015346835, + "grad_norm": 0.17515631020069122, + "learning_rate": 2.4694996452745072e-05, + "loss": 1.6812, + "step": 22118 + }, + { + "epoch": 6.789134438305709, + "grad_norm": 0.17932391166687012, + "learning_rate": 2.4690709595350838e-05, + "loss": 1.6832, + "step": 22119 + }, + { + "epoch": 6.789441375076734, + "grad_norm": 0.21177144348621368, + "learning_rate": 2.4686422988077802e-05, + "loss": 1.7443, + "step": 22120 + }, + { + "epoch": 6.7897483118477595, + "grad_norm": 0.17952793836593628, + "learning_rate": 2.4682136630968412e-05, + "loss": 1.6794, + "step": 22121 + }, + { + "epoch": 6.790055248618785, + "grad_norm": 0.18464395403862, + "learning_rate": 2.467785052406495e-05, + "loss": 1.6316, + "step": 22122 + }, + { + "epoch": 6.79036218538981, + 
"grad_norm": 0.1936565786600113, + "learning_rate": 2.4673564667409828e-05, + "loss": 1.6935, + "step": 22123 + }, + { + "epoch": 6.790669122160835, + "grad_norm": 0.21169735491275787, + "learning_rate": 2.4669279061045387e-05, + "loss": 1.7232, + "step": 22124 + }, + { + "epoch": 6.79097605893186, + "grad_norm": 0.199925035238266, + "learning_rate": 2.466499370501397e-05, + "loss": 1.8242, + "step": 22125 + }, + { + "epoch": 6.791282995702885, + "grad_norm": 0.19049705564975739, + "learning_rate": 2.4660708599357963e-05, + "loss": 1.7342, + "step": 22126 + }, + { + "epoch": 6.791589932473911, + "grad_norm": 0.16483616828918457, + "learning_rate": 2.465642374411964e-05, + "loss": 1.7144, + "step": 22127 + }, + { + "epoch": 6.791896869244935, + "grad_norm": 0.17355477809906006, + "learning_rate": 2.4652139139341413e-05, + "loss": 1.6715, + "step": 22128 + }, + { + "epoch": 6.79220380601596, + "grad_norm": 0.17448700964450836, + "learning_rate": 2.4647854785065605e-05, + "loss": 1.6669, + "step": 22129 + }, + { + "epoch": 6.792510742786986, + "grad_norm": 0.19858810305595398, + "learning_rate": 2.4643570681334553e-05, + "loss": 1.6781, + "step": 22130 + }, + { + "epoch": 6.792817679558011, + "grad_norm": 0.17350561916828156, + "learning_rate": 2.46392868281906e-05, + "loss": 1.7005, + "step": 22131 + }, + { + "epoch": 6.793124616329036, + "grad_norm": 0.17494787275791168, + "learning_rate": 2.4635003225676078e-05, + "loss": 1.7204, + "step": 22132 + }, + { + "epoch": 6.793431553100062, + "grad_norm": 0.1988590806722641, + "learning_rate": 2.463071987383332e-05, + "loss": 1.7314, + "step": 22133 + }, + { + "epoch": 6.793738489871086, + "grad_norm": 0.18046239018440247, + "learning_rate": 2.4626436772704658e-05, + "loss": 1.706, + "step": 22134 + }, + { + "epoch": 6.7940454266421115, + "grad_norm": 0.21060462296009064, + "learning_rate": 2.4622153922332402e-05, + "loss": 1.6967, + "step": 22135 + }, + { + "epoch": 6.794352363413137, + "grad_norm": 0.22328679263591766, 
+ "learning_rate": 2.4617871322758934e-05, + "loss": 1.7502, + "step": 22136 + }, + { + "epoch": 6.794659300184162, + "grad_norm": 0.18324224650859833, + "learning_rate": 2.46135889740265e-05, + "loss": 1.7183, + "step": 22137 + }, + { + "epoch": 6.7949662369551875, + "grad_norm": 0.2381133884191513, + "learning_rate": 2.4609306876177496e-05, + "loss": 1.739, + "step": 22138 + }, + { + "epoch": 6.795273173726212, + "grad_norm": 0.21471738815307617, + "learning_rate": 2.4605025029254164e-05, + "loss": 1.7466, + "step": 22139 + }, + { + "epoch": 6.795580110497237, + "grad_norm": 0.209581658244133, + "learning_rate": 2.4600743433298885e-05, + "loss": 1.7495, + "step": 22140 + }, + { + "epoch": 6.795887047268263, + "grad_norm": 0.1806897670030594, + "learning_rate": 2.459646208835394e-05, + "loss": 1.7137, + "step": 22141 + }, + { + "epoch": 6.796193984039288, + "grad_norm": 0.19036264717578888, + "learning_rate": 2.4592180994461644e-05, + "loss": 1.6993, + "step": 22142 + }, + { + "epoch": 6.796500920810313, + "grad_norm": 0.17937630414962769, + "learning_rate": 2.4587900151664335e-05, + "loss": 1.7102, + "step": 22143 + }, + { + "epoch": 6.796807857581339, + "grad_norm": 0.19278483092784882, + "learning_rate": 2.4583619560004244e-05, + "loss": 1.7058, + "step": 22144 + }, + { + "epoch": 6.797114794352363, + "grad_norm": 0.19507993757724762, + "learning_rate": 2.4579339219523744e-05, + "loss": 1.7137, + "step": 22145 + }, + { + "epoch": 6.797421731123388, + "grad_norm": 0.20417597889900208, + "learning_rate": 2.4575059130265115e-05, + "loss": 1.7156, + "step": 22146 + }, + { + "epoch": 6.797728667894414, + "grad_norm": 0.1898338943719864, + "learning_rate": 2.4570779292270658e-05, + "loss": 1.7501, + "step": 22147 + }, + { + "epoch": 6.798035604665439, + "grad_norm": 0.18777382373809814, + "learning_rate": 2.4566499705582656e-05, + "loss": 1.7192, + "step": 22148 + }, + { + "epoch": 6.798342541436464, + "grad_norm": 0.19526423513889313, + "learning_rate": 
2.4562220370243415e-05, + "loss": 1.6637, + "step": 22149 + }, + { + "epoch": 6.798649478207489, + "grad_norm": 0.23661594092845917, + "learning_rate": 2.455794128629522e-05, + "loss": 1.7557, + "step": 22150 + }, + { + "epoch": 6.798956414978514, + "grad_norm": 0.27043846249580383, + "learning_rate": 2.4553662453780362e-05, + "loss": 1.7712, + "step": 22151 + }, + { + "epoch": 6.7992633517495396, + "grad_norm": 0.17968088388442993, + "learning_rate": 2.454938387274111e-05, + "loss": 1.6721, + "step": 22152 + }, + { + "epoch": 6.799570288520565, + "grad_norm": 0.21456219255924225, + "learning_rate": 2.45451055432198e-05, + "loss": 1.7249, + "step": 22153 + }, + { + "epoch": 6.79987722529159, + "grad_norm": 0.22433941066265106, + "learning_rate": 2.4540827465258638e-05, + "loss": 1.7319, + "step": 22154 + }, + { + "epoch": 6.800184162062616, + "grad_norm": 0.2808871567249298, + "learning_rate": 2.4536549638899976e-05, + "loss": 1.7802, + "step": 22155 + }, + { + "epoch": 6.80049109883364, + "grad_norm": 0.28654494881629944, + "learning_rate": 2.4532272064186018e-05, + "loss": 1.7431, + "step": 22156 + }, + { + "epoch": 6.800798035604665, + "grad_norm": 0.19476976990699768, + "learning_rate": 2.45279947411591e-05, + "loss": 1.6792, + "step": 22157 + }, + { + "epoch": 6.801104972375691, + "grad_norm": 0.25114744901657104, + "learning_rate": 2.452371766986146e-05, + "loss": 1.7458, + "step": 22158 + }, + { + "epoch": 6.801411909146716, + "grad_norm": 0.18099439144134521, + "learning_rate": 2.451944085033538e-05, + "loss": 1.6952, + "step": 22159 + }, + { + "epoch": 6.8017188459177405, + "grad_norm": 0.21425777673721313, + "learning_rate": 2.4515164282623138e-05, + "loss": 1.7593, + "step": 22160 + }, + { + "epoch": 6.802025782688766, + "grad_norm": 0.19833709299564362, + "learning_rate": 2.4510887966766937e-05, + "loss": 1.6643, + "step": 22161 + }, + { + "epoch": 6.802332719459791, + "grad_norm": 0.20073090493679047, + "learning_rate": 2.45066119028091e-05, + "loss": 
1.7112, + "step": 22162 + }, + { + "epoch": 6.8026396562308165, + "grad_norm": 0.18599852919578552, + "learning_rate": 2.4502336090791872e-05, + "loss": 1.7121, + "step": 22163 + }, + { + "epoch": 6.802946593001842, + "grad_norm": 0.22036875784397125, + "learning_rate": 2.4498060530757498e-05, + "loss": 1.7944, + "step": 22164 + }, + { + "epoch": 6.803253529772867, + "grad_norm": 0.19521577656269073, + "learning_rate": 2.4493785222748243e-05, + "loss": 1.7463, + "step": 22165 + }, + { + "epoch": 6.803560466543892, + "grad_norm": 0.22010843455791473, + "learning_rate": 2.448951016680635e-05, + "loss": 1.6951, + "step": 22166 + }, + { + "epoch": 6.803867403314917, + "grad_norm": 0.20490090548992157, + "learning_rate": 2.448523536297407e-05, + "loss": 1.7723, + "step": 22167 + }, + { + "epoch": 6.804174340085942, + "grad_norm": 0.2298613339662552, + "learning_rate": 2.4480960811293648e-05, + "loss": 1.7644, + "step": 22168 + }, + { + "epoch": 6.804481276856968, + "grad_norm": 0.18560375273227692, + "learning_rate": 2.4476686511807306e-05, + "loss": 1.686, + "step": 22169 + }, + { + "epoch": 6.804788213627993, + "grad_norm": 0.24295780062675476, + "learning_rate": 2.4472412464557347e-05, + "loss": 1.7561, + "step": 22170 + }, + { + "epoch": 6.805095150399017, + "grad_norm": 0.1962144672870636, + "learning_rate": 2.4468138669585932e-05, + "loss": 1.7438, + "step": 22171 + }, + { + "epoch": 6.805402087170043, + "grad_norm": 0.21924439072608948, + "learning_rate": 2.4463865126935377e-05, + "loss": 1.7488, + "step": 22172 + }, + { + "epoch": 6.805709023941068, + "grad_norm": 0.1777856945991516, + "learning_rate": 2.4459591836647833e-05, + "loss": 1.6664, + "step": 22173 + }, + { + "epoch": 6.806015960712093, + "grad_norm": 0.24367454648017883, + "learning_rate": 2.4455318798765593e-05, + "loss": 1.7441, + "step": 22174 + }, + { + "epoch": 6.806322897483119, + "grad_norm": 0.2269427478313446, + "learning_rate": 2.4451046013330865e-05, + "loss": 1.7809, + "step": 22175 + }, 
+ { + "epoch": 6.806629834254144, + "grad_norm": 0.21986174583435059, + "learning_rate": 2.444677348038587e-05, + "loss": 1.7453, + "step": 22176 + }, + { + "epoch": 6.8069367710251685, + "grad_norm": 0.1773367077112198, + "learning_rate": 2.4442501199972862e-05, + "loss": 1.6927, + "step": 22177 + }, + { + "epoch": 6.807243707796194, + "grad_norm": 0.20545031130313873, + "learning_rate": 2.4438229172133997e-05, + "loss": 1.7782, + "step": 22178 + }, + { + "epoch": 6.807550644567219, + "grad_norm": 0.1997014880180359, + "learning_rate": 2.443395739691155e-05, + "loss": 1.7295, + "step": 22179 + }, + { + "epoch": 6.8078575813382445, + "grad_norm": 0.19634006917476654, + "learning_rate": 2.4429685874347723e-05, + "loss": 1.7017, + "step": 22180 + }, + { + "epoch": 6.80816451810927, + "grad_norm": 0.2007836550474167, + "learning_rate": 2.442541460448473e-05, + "loss": 1.7252, + "step": 22181 + }, + { + "epoch": 6.808471454880294, + "grad_norm": 0.22204343974590302, + "learning_rate": 2.4421143587364775e-05, + "loss": 1.7526, + "step": 22182 + }, + { + "epoch": 6.80877839165132, + "grad_norm": 0.1906677633523941, + "learning_rate": 2.4416872823030073e-05, + "loss": 1.7121, + "step": 22183 + }, + { + "epoch": 6.809085328422345, + "grad_norm": 0.17165397107601166, + "learning_rate": 2.441260231152283e-05, + "loss": 1.6942, + "step": 22184 + }, + { + "epoch": 6.80939226519337, + "grad_norm": 0.17022575438022614, + "learning_rate": 2.4408332052885246e-05, + "loss": 1.6973, + "step": 22185 + }, + { + "epoch": 6.809699201964396, + "grad_norm": 0.16693587601184845, + "learning_rate": 2.4404062047159503e-05, + "loss": 1.6996, + "step": 22186 + }, + { + "epoch": 6.810006138735421, + "grad_norm": 0.2251187264919281, + "learning_rate": 2.4399792294387864e-05, + "loss": 1.778, + "step": 22187 + }, + { + "epoch": 6.810313075506445, + "grad_norm": 0.20622244477272034, + "learning_rate": 2.439552279461244e-05, + "loss": 1.7273, + "step": 22188 + }, + { + "epoch": 6.810620012277471, + 
"grad_norm": 0.19736994802951813, + "learning_rate": 2.439125354787551e-05, + "loss": 1.7096, + "step": 22189 + }, + { + "epoch": 6.810926949048496, + "grad_norm": 0.22955237329006195, + "learning_rate": 2.4386984554219182e-05, + "loss": 1.7859, + "step": 22190 + }, + { + "epoch": 6.811233885819521, + "grad_norm": 0.2283364087343216, + "learning_rate": 2.43827158136857e-05, + "loss": 1.6999, + "step": 22191 + }, + { + "epoch": 6.811540822590547, + "grad_norm": 0.18393704295158386, + "learning_rate": 2.4378447326317243e-05, + "loss": 1.654, + "step": 22192 + }, + { + "epoch": 6.811847759361571, + "grad_norm": 0.2031537890434265, + "learning_rate": 2.4374179092155986e-05, + "loss": 1.7353, + "step": 22193 + }, + { + "epoch": 6.8121546961325965, + "grad_norm": 0.1849071979522705, + "learning_rate": 2.4369911111244125e-05, + "loss": 1.7157, + "step": 22194 + }, + { + "epoch": 6.812461632903622, + "grad_norm": 0.20584192872047424, + "learning_rate": 2.4365643383623787e-05, + "loss": 1.7529, + "step": 22195 + }, + { + "epoch": 6.812768569674647, + "grad_norm": 0.24152903258800507, + "learning_rate": 2.436137590933721e-05, + "loss": 1.7662, + "step": 22196 + }, + { + "epoch": 6.8130755064456725, + "grad_norm": 0.26625362038612366, + "learning_rate": 2.4357108688426532e-05, + "loss": 1.7624, + "step": 22197 + }, + { + "epoch": 6.813382443216698, + "grad_norm": 0.27122190594673157, + "learning_rate": 2.435284172093395e-05, + "loss": 1.747, + "step": 22198 + }, + { + "epoch": 6.813689379987722, + "grad_norm": 0.18996810913085938, + "learning_rate": 2.434857500690161e-05, + "loss": 1.7377, + "step": 22199 + }, + { + "epoch": 6.813996316758748, + "grad_norm": 0.22355122864246368, + "learning_rate": 2.4344308546371686e-05, + "loss": 1.6865, + "step": 22200 + }, + { + "epoch": 6.814303253529773, + "grad_norm": 0.18468965590000153, + "learning_rate": 2.4340042339386348e-05, + "loss": 1.7091, + "step": 22201 + }, + { + "epoch": 6.814610190300798, + "grad_norm": 
0.25356602668762207, + "learning_rate": 2.4335776385987747e-05, + "loss": 1.7482, + "step": 22202 + }, + { + "epoch": 6.814917127071823, + "grad_norm": 0.22462932765483856, + "learning_rate": 2.433151068621803e-05, + "loss": 1.6985, + "step": 22203 + }, + { + "epoch": 6.815224063842848, + "grad_norm": 0.2540687024593353, + "learning_rate": 2.43272452401194e-05, + "loss": 1.7878, + "step": 22204 + }, + { + "epoch": 6.815531000613873, + "grad_norm": 0.267811119556427, + "learning_rate": 2.432298004773395e-05, + "loss": 1.7862, + "step": 22205 + }, + { + "epoch": 6.815837937384899, + "grad_norm": 0.23089277744293213, + "learning_rate": 2.4318715109103894e-05, + "loss": 1.6892, + "step": 22206 + }, + { + "epoch": 6.816144874155924, + "grad_norm": 0.22740885615348816, + "learning_rate": 2.431445042427131e-05, + "loss": 1.6934, + "step": 22207 + }, + { + "epoch": 6.816451810926949, + "grad_norm": 0.18555034697055817, + "learning_rate": 2.4310185993278405e-05, + "loss": 1.6747, + "step": 22208 + }, + { + "epoch": 6.816758747697974, + "grad_norm": 0.23693101108074188, + "learning_rate": 2.430592181616729e-05, + "loss": 1.7212, + "step": 22209 + }, + { + "epoch": 6.817065684468999, + "grad_norm": 0.20551325380802155, + "learning_rate": 2.4301657892980128e-05, + "loss": 1.711, + "step": 22210 + }, + { + "epoch": 6.8173726212400245, + "grad_norm": 0.20047837495803833, + "learning_rate": 2.4297394223759056e-05, + "loss": 1.729, + "step": 22211 + }, + { + "epoch": 6.81767955801105, + "grad_norm": 0.22111602127552032, + "learning_rate": 2.4293130808546167e-05, + "loss": 1.706, + "step": 22212 + }, + { + "epoch": 6.817986494782075, + "grad_norm": 0.18199655413627625, + "learning_rate": 2.428886764738364e-05, + "loss": 1.7082, + "step": 22213 + }, + { + "epoch": 6.8182934315531, + "grad_norm": 0.18591821193695068, + "learning_rate": 2.4284604740313595e-05, + "loss": 1.6957, + "step": 22214 + }, + { + "epoch": 6.818600368324125, + "grad_norm": 0.19427789747714996, + 
"learning_rate": 2.4280342087378154e-05, + "loss": 1.7396, + "step": 22215 + }, + { + "epoch": 6.81890730509515, + "grad_norm": 0.233908548951149, + "learning_rate": 2.427607968861945e-05, + "loss": 1.741, + "step": 22216 + }, + { + "epoch": 6.819214241866176, + "grad_norm": 0.168926402926445, + "learning_rate": 2.4271817544079606e-05, + "loss": 1.7023, + "step": 22217 + }, + { + "epoch": 6.819521178637201, + "grad_norm": 0.34345322847366333, + "learning_rate": 2.426755565380074e-05, + "loss": 1.7201, + "step": 22218 + }, + { + "epoch": 6.819828115408226, + "grad_norm": 0.21531274914741516, + "learning_rate": 2.4263294017824974e-05, + "loss": 1.725, + "step": 22219 + }, + { + "epoch": 6.820135052179251, + "grad_norm": 0.25251755118370056, + "learning_rate": 2.4259032636194395e-05, + "loss": 1.6764, + "step": 22220 + }, + { + "epoch": 6.820441988950276, + "grad_norm": 0.246616929769516, + "learning_rate": 2.4254771508951186e-05, + "loss": 1.7971, + "step": 22221 + }, + { + "epoch": 6.820748925721301, + "grad_norm": 0.20998120307922363, + "learning_rate": 2.4250510636137375e-05, + "loss": 1.723, + "step": 22222 + }, + { + "epoch": 6.821055862492327, + "grad_norm": 0.28388240933418274, + "learning_rate": 2.4246250017795148e-05, + "loss": 1.7508, + "step": 22223 + }, + { + "epoch": 6.821362799263352, + "grad_norm": 0.18146218359470367, + "learning_rate": 2.4241989653966535e-05, + "loss": 1.7254, + "step": 22224 + }, + { + "epoch": 6.8216697360343765, + "grad_norm": 0.2384043037891388, + "learning_rate": 2.4237729544693694e-05, + "loss": 1.7624, + "step": 22225 + }, + { + "epoch": 6.821976672805402, + "grad_norm": 0.21908332407474518, + "learning_rate": 2.4233469690018714e-05, + "loss": 1.7595, + "step": 22226 + }, + { + "epoch": 6.822283609576427, + "grad_norm": 0.20963989198207855, + "learning_rate": 2.422921008998369e-05, + "loss": 1.6679, + "step": 22227 + }, + { + "epoch": 6.8225905463474525, + "grad_norm": 0.21045777201652527, + "learning_rate": 
2.4224950744630732e-05, + "loss": 1.657, + "step": 22228 + }, + { + "epoch": 6.822897483118478, + "grad_norm": 0.21567417681217194, + "learning_rate": 2.4220691654001883e-05, + "loss": 1.7788, + "step": 22229 + }, + { + "epoch": 6.823204419889503, + "grad_norm": 0.2908889055252075, + "learning_rate": 2.4216432818139283e-05, + "loss": 1.7633, + "step": 22230 + }, + { + "epoch": 6.823511356660528, + "grad_norm": 0.22683843970298767, + "learning_rate": 2.4212174237085007e-05, + "loss": 1.7974, + "step": 22231 + }, + { + "epoch": 6.823818293431553, + "grad_norm": 0.25254085659980774, + "learning_rate": 2.420791591088114e-05, + "loss": 1.6871, + "step": 22232 + }, + { + "epoch": 6.824125230202578, + "grad_norm": 0.1804734766483307, + "learning_rate": 2.420365783956977e-05, + "loss": 1.7331, + "step": 22233 + }, + { + "epoch": 6.824432166973604, + "grad_norm": 0.21634186804294586, + "learning_rate": 2.419940002319297e-05, + "loss": 1.6641, + "step": 22234 + }, + { + "epoch": 6.824739103744628, + "grad_norm": 0.1941644847393036, + "learning_rate": 2.4195142461792818e-05, + "loss": 1.7198, + "step": 22235 + }, + { + "epoch": 6.8250460405156534, + "grad_norm": 0.20209947228431702, + "learning_rate": 2.4190885155411398e-05, + "loss": 1.7137, + "step": 22236 + }, + { + "epoch": 6.825352977286679, + "grad_norm": 0.17161925137043, + "learning_rate": 2.4186628104090757e-05, + "loss": 1.7059, + "step": 22237 + }, + { + "epoch": 6.825659914057704, + "grad_norm": 0.19352135062217712, + "learning_rate": 2.4182371307873025e-05, + "loss": 1.6699, + "step": 22238 + }, + { + "epoch": 6.8259668508287294, + "grad_norm": 0.20384716987609863, + "learning_rate": 2.417811476680019e-05, + "loss": 1.7167, + "step": 22239 + }, + { + "epoch": 6.826273787599755, + "grad_norm": 0.22764970362186432, + "learning_rate": 2.4173858480914402e-05, + "loss": 1.7085, + "step": 22240 + }, + { + "epoch": 6.82658072437078, + "grad_norm": 0.1988842487335205, + "learning_rate": 2.4169602450257645e-05, + "loss": 
1.7458, + "step": 22241 + }, + { + "epoch": 6.826887661141805, + "grad_norm": 0.20511481165885925, + "learning_rate": 2.416534667487203e-05, + "loss": 1.7597, + "step": 22242 + }, + { + "epoch": 6.82719459791283, + "grad_norm": 0.20906902849674225, + "learning_rate": 2.4161091154799608e-05, + "loss": 1.7418, + "step": 22243 + }, + { + "epoch": 6.827501534683855, + "grad_norm": 0.22555884718894958, + "learning_rate": 2.4156835890082426e-05, + "loss": 1.8198, + "step": 22244 + }, + { + "epoch": 6.827808471454881, + "grad_norm": 0.25855058431625366, + "learning_rate": 2.4152580880762553e-05, + "loss": 1.7588, + "step": 22245 + }, + { + "epoch": 6.828115408225905, + "grad_norm": 0.16975226998329163, + "learning_rate": 2.4148326126881993e-05, + "loss": 1.6897, + "step": 22246 + }, + { + "epoch": 6.82842234499693, + "grad_norm": 0.2336781919002533, + "learning_rate": 2.414407162848284e-05, + "loss": 1.7412, + "step": 22247 + }, + { + "epoch": 6.828729281767956, + "grad_norm": 0.1660032868385315, + "learning_rate": 2.4139817385607126e-05, + "loss": 1.6221, + "step": 22248 + }, + { + "epoch": 6.829036218538981, + "grad_norm": 0.22926606237888336, + "learning_rate": 2.41355633982969e-05, + "loss": 1.7201, + "step": 22249 + }, + { + "epoch": 6.829343155310006, + "grad_norm": 0.1759374737739563, + "learning_rate": 2.4131309666594193e-05, + "loss": 1.6842, + "step": 22250 + }, + { + "epoch": 6.829650092081032, + "grad_norm": 0.23005764186382294, + "learning_rate": 2.4127056190541042e-05, + "loss": 1.7327, + "step": 22251 + }, + { + "epoch": 6.829957028852056, + "grad_norm": 0.2216579169034958, + "learning_rate": 2.412280297017949e-05, + "loss": 1.7856, + "step": 22252 + }, + { + "epoch": 6.8302639656230815, + "grad_norm": 0.22133000195026398, + "learning_rate": 2.4118550005551565e-05, + "loss": 1.7711, + "step": 22253 + }, + { + "epoch": 6.830570902394107, + "grad_norm": 0.21860742568969727, + "learning_rate": 2.41142972966993e-05, + "loss": 1.7276, + "step": 22254 + }, + { + 
"epoch": 6.830877839165132, + "grad_norm": 0.2484082579612732, + "learning_rate": 2.4110044843664726e-05, + "loss": 1.7038, + "step": 22255 + }, + { + "epoch": 6.8311847759361575, + "grad_norm": 0.22288921475410461, + "learning_rate": 2.410579264648984e-05, + "loss": 1.7149, + "step": 22256 + }, + { + "epoch": 6.831491712707182, + "grad_norm": 0.23635484278202057, + "learning_rate": 2.4101540705216724e-05, + "loss": 1.7296, + "step": 22257 + }, + { + "epoch": 6.831798649478207, + "grad_norm": 0.24334096908569336, + "learning_rate": 2.4097289019887324e-05, + "loss": 1.7458, + "step": 22258 + }, + { + "epoch": 6.832105586249233, + "grad_norm": 0.23019789159297943, + "learning_rate": 2.4093037590543716e-05, + "loss": 1.7296, + "step": 22259 + }, + { + "epoch": 6.832412523020258, + "grad_norm": 0.23739024996757507, + "learning_rate": 2.4088786417227895e-05, + "loss": 1.7844, + "step": 22260 + }, + { + "epoch": 6.832719459791283, + "grad_norm": 0.1969252973794937, + "learning_rate": 2.4084535499981873e-05, + "loss": 1.6692, + "step": 22261 + }, + { + "epoch": 6.833026396562309, + "grad_norm": 0.20111167430877686, + "learning_rate": 2.4080284838847682e-05, + "loss": 1.7813, + "step": 22262 + }, + { + "epoch": 6.833333333333333, + "grad_norm": 0.26112934947013855, + "learning_rate": 2.4076034433867268e-05, + "loss": 1.6852, + "step": 22263 + }, + { + "epoch": 6.833640270104358, + "grad_norm": 0.24244411289691925, + "learning_rate": 2.40717842850827e-05, + "loss": 1.7054, + "step": 22264 + }, + { + "epoch": 6.833947206875384, + "grad_norm": 0.22703053057193756, + "learning_rate": 2.406753439253595e-05, + "loss": 1.7655, + "step": 22265 + }, + { + "epoch": 6.834254143646409, + "grad_norm": 0.23935651779174805, + "learning_rate": 2.4063284756269027e-05, + "loss": 1.7462, + "step": 22266 + }, + { + "epoch": 6.834561080417434, + "grad_norm": 0.2169155478477478, + "learning_rate": 2.4059035376323928e-05, + "loss": 1.7059, + "step": 22267 + }, + { + "epoch": 6.834868017188459, + 
"grad_norm": 0.2045663446187973, + "learning_rate": 2.4054786252742645e-05, + "loss": 1.7166, + "step": 22268 + }, + { + "epoch": 6.835174953959484, + "grad_norm": 0.22796253859996796, + "learning_rate": 2.4050537385567172e-05, + "loss": 1.7361, + "step": 22269 + }, + { + "epoch": 6.8354818907305095, + "grad_norm": 0.20807915925979614, + "learning_rate": 2.4046288774839497e-05, + "loss": 1.7007, + "step": 22270 + }, + { + "epoch": 6.835788827501535, + "grad_norm": 0.22157903015613556, + "learning_rate": 2.4042040420601607e-05, + "loss": 1.7409, + "step": 22271 + }, + { + "epoch": 6.83609576427256, + "grad_norm": 0.21494148671627045, + "learning_rate": 2.4037792322895492e-05, + "loss": 1.7975, + "step": 22272 + }, + { + "epoch": 6.8364027010435855, + "grad_norm": 0.2275875061750412, + "learning_rate": 2.403354448176311e-05, + "loss": 1.6759, + "step": 22273 + }, + { + "epoch": 6.83670963781461, + "grad_norm": 0.21105073392391205, + "learning_rate": 2.4029296897246496e-05, + "loss": 1.7229, + "step": 22274 + }, + { + "epoch": 6.837016574585635, + "grad_norm": 0.21957579255104065, + "learning_rate": 2.4025049569387553e-05, + "loss": 1.737, + "step": 22275 + }, + { + "epoch": 6.837323511356661, + "grad_norm": 0.2291470617055893, + "learning_rate": 2.4020802498228335e-05, + "loss": 1.6731, + "step": 22276 + }, + { + "epoch": 6.837630448127686, + "grad_norm": 0.18196065723896027, + "learning_rate": 2.401655568381074e-05, + "loss": 1.6823, + "step": 22277 + }, + { + "epoch": 6.83793738489871, + "grad_norm": 0.20915214717388153, + "learning_rate": 2.401230912617678e-05, + "loss": 1.7038, + "step": 22278 + }, + { + "epoch": 6.838244321669736, + "grad_norm": 0.2060854732990265, + "learning_rate": 2.4008062825368437e-05, + "loss": 1.7514, + "step": 22279 + }, + { + "epoch": 6.838551258440761, + "grad_norm": 0.20858527719974518, + "learning_rate": 2.400381678142762e-05, + "loss": 1.7494, + "step": 22280 + }, + { + "epoch": 6.838858195211786, + "grad_norm": 0.19124718010425568, 
+ "learning_rate": 2.3999570994396352e-05, + "loss": 1.7641, + "step": 22281 + }, + { + "epoch": 6.839165131982812, + "grad_norm": 0.28222304582595825, + "learning_rate": 2.3995325464316525e-05, + "loss": 1.7204, + "step": 22282 + }, + { + "epoch": 6.839472068753837, + "grad_norm": 0.20047026872634888, + "learning_rate": 2.399108019123016e-05, + "loss": 1.7261, + "step": 22283 + }, + { + "epoch": 6.8397790055248615, + "grad_norm": 0.2758225202560425, + "learning_rate": 2.3986835175179178e-05, + "loss": 1.6903, + "step": 22284 + }, + { + "epoch": 6.840085942295887, + "grad_norm": 0.2719727158546448, + "learning_rate": 2.3982590416205535e-05, + "loss": 1.8716, + "step": 22285 + }, + { + "epoch": 6.840392879066912, + "grad_norm": 0.3524060845375061, + "learning_rate": 2.3978345914351193e-05, + "loss": 1.7778, + "step": 22286 + }, + { + "epoch": 6.8406998158379375, + "grad_norm": 0.2711596190929413, + "learning_rate": 2.397410166965808e-05, + "loss": 1.7111, + "step": 22287 + }, + { + "epoch": 6.841006752608963, + "grad_norm": 0.2818336486816406, + "learning_rate": 2.396985768216815e-05, + "loss": 1.7292, + "step": 22288 + }, + { + "epoch": 6.841313689379987, + "grad_norm": 0.19677700102329254, + "learning_rate": 2.3965613951923343e-05, + "loss": 1.6975, + "step": 22289 + }, + { + "epoch": 6.841620626151013, + "grad_norm": 0.300997257232666, + "learning_rate": 2.3961370478965583e-05, + "loss": 1.7014, + "step": 22290 + }, + { + "epoch": 6.841927562922038, + "grad_norm": 0.23549453914165497, + "learning_rate": 2.395712726333686e-05, + "loss": 1.7052, + "step": 22291 + }, + { + "epoch": 6.842234499693063, + "grad_norm": 0.29898303747177124, + "learning_rate": 2.3952884305079026e-05, + "loss": 1.7828, + "step": 22292 + }, + { + "epoch": 6.842541436464089, + "grad_norm": 0.26108843088150024, + "learning_rate": 2.3948641604234096e-05, + "loss": 1.7023, + "step": 22293 + }, + { + "epoch": 6.842848373235114, + "grad_norm": 0.18781059980392456, + "learning_rate": 
2.394439916084392e-05, + "loss": 1.6808, + "step": 22294 + }, + { + "epoch": 6.843155310006138, + "grad_norm": 0.22659730911254883, + "learning_rate": 2.3940156974950485e-05, + "loss": 1.7224, + "step": 22295 + }, + { + "epoch": 6.843462246777164, + "grad_norm": 0.17422057688236237, + "learning_rate": 2.3935915046595713e-05, + "loss": 1.668, + "step": 22296 + }, + { + "epoch": 6.843769183548189, + "grad_norm": 0.2008846402168274, + "learning_rate": 2.393167337582146e-05, + "loss": 1.7283, + "step": 22297 + }, + { + "epoch": 6.844076120319214, + "grad_norm": 0.20376072824001312, + "learning_rate": 2.392743196266973e-05, + "loss": 1.74, + "step": 22298 + }, + { + "epoch": 6.84438305709024, + "grad_norm": 0.16353756189346313, + "learning_rate": 2.3923190807182372e-05, + "loss": 1.717, + "step": 22299 + }, + { + "epoch": 6.844689993861264, + "grad_norm": 0.18436652421951294, + "learning_rate": 2.3918949909401335e-05, + "loss": 1.7257, + "step": 22300 + }, + { + "epoch": 6.8449969306322895, + "grad_norm": 0.2038460522890091, + "learning_rate": 2.3914709269368523e-05, + "loss": 1.7254, + "step": 22301 + }, + { + "epoch": 6.845303867403315, + "grad_norm": 0.17111587524414062, + "learning_rate": 2.3910468887125842e-05, + "loss": 1.6993, + "step": 22302 + }, + { + "epoch": 6.84561080417434, + "grad_norm": 0.20049406588077545, + "learning_rate": 2.3906228762715207e-05, + "loss": 1.7099, + "step": 22303 + }, + { + "epoch": 6.8459177409453655, + "grad_norm": 0.2168554663658142, + "learning_rate": 2.39019888961785e-05, + "loss": 1.725, + "step": 22304 + }, + { + "epoch": 6.846224677716391, + "grad_norm": 0.2228514850139618, + "learning_rate": 2.3897749287557647e-05, + "loss": 1.7348, + "step": 22305 + }, + { + "epoch": 6.846531614487415, + "grad_norm": 0.17166151106357574, + "learning_rate": 2.3893509936894532e-05, + "loss": 1.7451, + "step": 22306 + }, + { + "epoch": 6.846838551258441, + "grad_norm": 0.24896936118602753, + "learning_rate": 2.3889270844231026e-05, + "loss": 
1.7397, + "step": 22307 + }, + { + "epoch": 6.847145488029466, + "grad_norm": 0.1984332948923111, + "learning_rate": 2.3885032009609098e-05, + "loss": 1.7167, + "step": 22308 + }, + { + "epoch": 6.847452424800491, + "grad_norm": 0.20763449370861053, + "learning_rate": 2.388079343307055e-05, + "loss": 1.7154, + "step": 22309 + }, + { + "epoch": 6.847759361571516, + "grad_norm": 0.21818630397319794, + "learning_rate": 2.3876555114657346e-05, + "loss": 1.7364, + "step": 22310 + }, + { + "epoch": 6.848066298342541, + "grad_norm": 0.21220166981220245, + "learning_rate": 2.3872317054411298e-05, + "loss": 1.74, + "step": 22311 + }, + { + "epoch": 6.848373235113566, + "grad_norm": 0.17486892640590668, + "learning_rate": 2.3868079252374343e-05, + "loss": 1.68, + "step": 22312 + }, + { + "epoch": 6.848680171884592, + "grad_norm": 0.20809298753738403, + "learning_rate": 2.386384170858837e-05, + "loss": 1.8102, + "step": 22313 + }, + { + "epoch": 6.848987108655617, + "grad_norm": 0.19927671551704407, + "learning_rate": 2.385960442309519e-05, + "loss": 1.7742, + "step": 22314 + }, + { + "epoch": 6.849294045426642, + "grad_norm": 0.18705040216445923, + "learning_rate": 2.3855367395936757e-05, + "loss": 1.689, + "step": 22315 + }, + { + "epoch": 6.849600982197668, + "grad_norm": 0.22023466229438782, + "learning_rate": 2.385113062715487e-05, + "loss": 1.7819, + "step": 22316 + }, + { + "epoch": 6.849907918968692, + "grad_norm": 0.24443435668945312, + "learning_rate": 2.384689411679146e-05, + "loss": 1.6533, + "step": 22317 + }, + { + "epoch": 6.850214855739718, + "grad_norm": 0.20103834569454193, + "learning_rate": 2.3842657864888368e-05, + "loss": 1.7274, + "step": 22318 + }, + { + "epoch": 6.850521792510743, + "grad_norm": 0.2265254408121109, + "learning_rate": 2.3838421871487465e-05, + "loss": 1.7874, + "step": 22319 + }, + { + "epoch": 6.850828729281768, + "grad_norm": 0.2775460183620453, + "learning_rate": 2.383418613663061e-05, + "loss": 1.8038, + "step": 22320 + }, + { + 
"epoch": 6.851135666052793, + "grad_norm": 0.2001011073589325, + "learning_rate": 2.3829950660359663e-05, + "loss": 1.7135, + "step": 22321 + }, + { + "epoch": 6.851442602823818, + "grad_norm": 0.21427330374717712, + "learning_rate": 2.382571544271648e-05, + "loss": 1.7155, + "step": 22322 + }, + { + "epoch": 6.851749539594843, + "grad_norm": 0.18420884013175964, + "learning_rate": 2.382148048374292e-05, + "loss": 1.7178, + "step": 22323 + }, + { + "epoch": 6.852056476365869, + "grad_norm": 0.19436471164226532, + "learning_rate": 2.3817245783480813e-05, + "loss": 1.7396, + "step": 22324 + }, + { + "epoch": 6.852363413136894, + "grad_norm": 0.23191674053668976, + "learning_rate": 2.381301134197207e-05, + "loss": 1.7102, + "step": 22325 + }, + { + "epoch": 6.852670349907919, + "grad_norm": 0.20381706953048706, + "learning_rate": 2.3808777159258462e-05, + "loss": 1.7671, + "step": 22326 + }, + { + "epoch": 6.852977286678944, + "grad_norm": 0.20202197134494781, + "learning_rate": 2.3804543235381897e-05, + "loss": 1.6774, + "step": 22327 + }, + { + "epoch": 6.853284223449969, + "grad_norm": 0.23496322333812714, + "learning_rate": 2.380030957038416e-05, + "loss": 1.7745, + "step": 22328 + }, + { + "epoch": 6.8535911602209945, + "grad_norm": 0.22473813593387604, + "learning_rate": 2.379607616430714e-05, + "loss": 1.7319, + "step": 22329 + }, + { + "epoch": 6.85389809699202, + "grad_norm": 0.2149224430322647, + "learning_rate": 2.3791843017192667e-05, + "loss": 1.77, + "step": 22330 + }, + { + "epoch": 6.854205033763045, + "grad_norm": 0.21146108210086823, + "learning_rate": 2.378761012908253e-05, + "loss": 1.762, + "step": 22331 + }, + { + "epoch": 6.85451197053407, + "grad_norm": 0.2031458169221878, + "learning_rate": 2.3783377500018626e-05, + "loss": 1.7007, + "step": 22332 + }, + { + "epoch": 6.854818907305095, + "grad_norm": 0.19763319194316864, + "learning_rate": 2.377914513004272e-05, + "loss": 1.6899, + "step": 22333 + }, + { + "epoch": 6.85512584407612, + 
"grad_norm": 0.17337046563625336, + "learning_rate": 2.3774913019196688e-05, + "loss": 1.683, + "step": 22334 + }, + { + "epoch": 6.855432780847146, + "grad_norm": 0.1850815862417221, + "learning_rate": 2.3770681167522328e-05, + "loss": 1.7284, + "step": 22335 + }, + { + "epoch": 6.855739717618171, + "grad_norm": 0.19693362712860107, + "learning_rate": 2.3766449575061477e-05, + "loss": 1.7694, + "step": 22336 + }, + { + "epoch": 6.856046654389196, + "grad_norm": 0.1981547325849533, + "learning_rate": 2.376221824185595e-05, + "loss": 1.736, + "step": 22337 + }, + { + "epoch": 6.856353591160221, + "grad_norm": 0.17638558149337769, + "learning_rate": 2.375798716794756e-05, + "loss": 1.6979, + "step": 22338 + }, + { + "epoch": 6.856660527931246, + "grad_norm": 0.20189990103244781, + "learning_rate": 2.3753756353378116e-05, + "loss": 1.7876, + "step": 22339 + }, + { + "epoch": 6.856967464702271, + "grad_norm": 0.1880224347114563, + "learning_rate": 2.3749525798189438e-05, + "loss": 1.7134, + "step": 22340 + }, + { + "epoch": 6.857274401473297, + "grad_norm": 0.2464265078306198, + "learning_rate": 2.3745295502423316e-05, + "loss": 1.7782, + "step": 22341 + }, + { + "epoch": 6.857581338244322, + "grad_norm": 0.19218963384628296, + "learning_rate": 2.3741065466121604e-05, + "loss": 1.7027, + "step": 22342 + }, + { + "epoch": 6.8578882750153465, + "grad_norm": 0.27446448802948, + "learning_rate": 2.3736835689326043e-05, + "loss": 1.772, + "step": 22343 + }, + { + "epoch": 6.858195211786372, + "grad_norm": 0.19315828382968903, + "learning_rate": 2.3732606172078497e-05, + "loss": 1.6855, + "step": 22344 + }, + { + "epoch": 6.858502148557397, + "grad_norm": 0.2668892741203308, + "learning_rate": 2.372837691442072e-05, + "loss": 1.7703, + "step": 22345 + }, + { + "epoch": 6.8588090853284225, + "grad_norm": 0.23552054166793823, + "learning_rate": 2.3724147916394497e-05, + "loss": 1.7184, + "step": 22346 + }, + { + "epoch": 6.859116022099448, + "grad_norm": 0.3194984793663025, + 
"learning_rate": 2.3719919178041682e-05, + "loss": 1.7531, + "step": 22347 + }, + { + "epoch": 6.859422958870473, + "grad_norm": 0.19298717379570007, + "learning_rate": 2.371569069940399e-05, + "loss": 1.7064, + "step": 22348 + }, + { + "epoch": 6.859729895641498, + "grad_norm": 0.2990693151950836, + "learning_rate": 2.3711462480523293e-05, + "loss": 1.7434, + "step": 22349 + }, + { + "epoch": 6.860036832412523, + "grad_norm": 0.1976640820503235, + "learning_rate": 2.370723452144129e-05, + "loss": 1.6881, + "step": 22350 + }, + { + "epoch": 6.860343769183548, + "grad_norm": 0.24306917190551758, + "learning_rate": 2.3703006822199825e-05, + "loss": 1.7791, + "step": 22351 + }, + { + "epoch": 6.860650705954574, + "grad_norm": 0.20065687596797943, + "learning_rate": 2.3698779382840657e-05, + "loss": 1.7162, + "step": 22352 + }, + { + "epoch": 6.860957642725598, + "grad_norm": 0.21599936485290527, + "learning_rate": 2.3694552203405574e-05, + "loss": 1.7702, + "step": 22353 + }, + { + "epoch": 6.861264579496623, + "grad_norm": 0.16836890578269958, + "learning_rate": 2.3690325283936338e-05, + "loss": 1.6676, + "step": 22354 + }, + { + "epoch": 6.861571516267649, + "grad_norm": 0.1756831407546997, + "learning_rate": 2.368609862447473e-05, + "loss": 1.6934, + "step": 22355 + }, + { + "epoch": 6.861878453038674, + "grad_norm": 0.18676789104938507, + "learning_rate": 2.3681872225062517e-05, + "loss": 1.6879, + "step": 22356 + }, + { + "epoch": 6.862185389809699, + "grad_norm": 0.18018634617328644, + "learning_rate": 2.3677646085741473e-05, + "loss": 1.7143, + "step": 22357 + }, + { + "epoch": 6.862492326580725, + "grad_norm": 0.1789008378982544, + "learning_rate": 2.3673420206553332e-05, + "loss": 1.6914, + "step": 22358 + }, + { + "epoch": 6.862799263351749, + "grad_norm": 0.1869693398475647, + "learning_rate": 2.366919458753993e-05, + "loss": 1.7431, + "step": 22359 + }, + { + "epoch": 6.8631062001227745, + "grad_norm": 0.1958019733428955, + "learning_rate": 
2.3664969228742934e-05, + "loss": 1.7132, + "step": 22360 + }, + { + "epoch": 6.8634131368938, + "grad_norm": 0.199384868144989, + "learning_rate": 2.366074413020419e-05, + "loss": 1.7095, + "step": 22361 + }, + { + "epoch": 6.863720073664825, + "grad_norm": 0.2125246673822403, + "learning_rate": 2.365651929196539e-05, + "loss": 1.7125, + "step": 22362 + }, + { + "epoch": 6.8640270104358505, + "grad_norm": 0.1574707180261612, + "learning_rate": 2.3652294714068284e-05, + "loss": 1.6386, + "step": 22363 + }, + { + "epoch": 6.864333947206875, + "grad_norm": 0.30648529529571533, + "learning_rate": 2.364807039655469e-05, + "loss": 1.7665, + "step": 22364 + }, + { + "epoch": 6.8646408839779, + "grad_norm": 0.19746489822864532, + "learning_rate": 2.364384633946627e-05, + "loss": 1.6736, + "step": 22365 + }, + { + "epoch": 6.864947820748926, + "grad_norm": 0.25084391236305237, + "learning_rate": 2.3639622542844842e-05, + "loss": 1.7346, + "step": 22366 + }, + { + "epoch": 6.865254757519951, + "grad_norm": 0.1884133219718933, + "learning_rate": 2.3635399006732077e-05, + "loss": 1.6868, + "step": 22367 + }, + { + "epoch": 6.865561694290976, + "grad_norm": 0.21225856244564056, + "learning_rate": 2.3631175731169774e-05, + "loss": 1.7438, + "step": 22368 + }, + { + "epoch": 6.865868631062002, + "grad_norm": 0.1863771378993988, + "learning_rate": 2.3626952716199647e-05, + "loss": 1.7677, + "step": 22369 + }, + { + "epoch": 6.866175567833026, + "grad_norm": 0.1839088648557663, + "learning_rate": 2.362272996186343e-05, + "loss": 1.6902, + "step": 22370 + }, + { + "epoch": 6.866482504604051, + "grad_norm": 0.18304915726184845, + "learning_rate": 2.3618507468202856e-05, + "loss": 1.7142, + "step": 22371 + }, + { + "epoch": 6.866789441375077, + "grad_norm": 0.21228280663490295, + "learning_rate": 2.3614285235259655e-05, + "loss": 1.8277, + "step": 22372 + }, + { + "epoch": 6.867096378146102, + "grad_norm": 0.19515320658683777, + "learning_rate": 2.361006326307555e-05, + "loss": 
1.7029, + "step": 22373 + }, + { + "epoch": 6.867403314917127, + "grad_norm": 0.16277433931827545, + "learning_rate": 2.360584155169227e-05, + "loss": 1.672, + "step": 22374 + }, + { + "epoch": 6.867710251688152, + "grad_norm": 0.2180202454328537, + "learning_rate": 2.360162010115151e-05, + "loss": 1.7516, + "step": 22375 + }, + { + "epoch": 6.868017188459177, + "grad_norm": 0.17940378189086914, + "learning_rate": 2.3597398911495055e-05, + "loss": 1.6782, + "step": 22376 + }, + { + "epoch": 6.8683241252302025, + "grad_norm": 0.20751933753490448, + "learning_rate": 2.3593177982764543e-05, + "loss": 1.7954, + "step": 22377 + }, + { + "epoch": 6.868631062001228, + "grad_norm": 0.23098444938659668, + "learning_rate": 2.3588957315001758e-05, + "loss": 1.7472, + "step": 22378 + }, + { + "epoch": 6.868937998772253, + "grad_norm": 0.2351236343383789, + "learning_rate": 2.358473690824836e-05, + "loss": 1.7959, + "step": 22379 + }, + { + "epoch": 6.8692449355432785, + "grad_norm": 0.1890626847743988, + "learning_rate": 2.3580516762546055e-05, + "loss": 1.7015, + "step": 22380 + }, + { + "epoch": 6.869551872314303, + "grad_norm": 0.21120475232601166, + "learning_rate": 2.3576296877936604e-05, + "loss": 1.7998, + "step": 22381 + }, + { + "epoch": 6.869858809085328, + "grad_norm": 0.18141280114650726, + "learning_rate": 2.3572077254461638e-05, + "loss": 1.6973, + "step": 22382 + }, + { + "epoch": 6.870165745856354, + "grad_norm": 0.19084444642066956, + "learning_rate": 2.356785789216293e-05, + "loss": 1.6853, + "step": 22383 + }, + { + "epoch": 6.870472682627379, + "grad_norm": 0.18046700954437256, + "learning_rate": 2.356363879108211e-05, + "loss": 1.7476, + "step": 22384 + }, + { + "epoch": 6.870779619398404, + "grad_norm": 0.19875061511993408, + "learning_rate": 2.3559419951260926e-05, + "loss": 1.7223, + "step": 22385 + }, + { + "epoch": 6.871086556169429, + "grad_norm": 0.2377827763557434, + "learning_rate": 2.3555201372741047e-05, + "loss": 1.7976, + "step": 22386 + }, + 
{ + "epoch": 6.871393492940454, + "grad_norm": 0.17645993828773499, + "learning_rate": 2.3550983055564168e-05, + "loss": 1.6726, + "step": 22387 + }, + { + "epoch": 6.871700429711479, + "grad_norm": 0.19499735534191132, + "learning_rate": 2.3546764999771976e-05, + "loss": 1.67, + "step": 22388 + }, + { + "epoch": 6.872007366482505, + "grad_norm": 0.22010546922683716, + "learning_rate": 2.3542547205406163e-05, + "loss": 1.8461, + "step": 22389 + }, + { + "epoch": 6.87231430325353, + "grad_norm": 0.2101692259311676, + "learning_rate": 2.3538329672508396e-05, + "loss": 1.6922, + "step": 22390 + }, + { + "epoch": 6.872621240024555, + "grad_norm": 0.1926269382238388, + "learning_rate": 2.3534112401120372e-05, + "loss": 1.6934, + "step": 22391 + }, + { + "epoch": 6.87292817679558, + "grad_norm": 0.20662687718868256, + "learning_rate": 2.3529895391283742e-05, + "loss": 1.7284, + "step": 22392 + }, + { + "epoch": 6.873235113566605, + "grad_norm": 0.2392960786819458, + "learning_rate": 2.3525678643040235e-05, + "loss": 1.7207, + "step": 22393 + }, + { + "epoch": 6.8735420503376305, + "grad_norm": 0.2067870795726776, + "learning_rate": 2.3521462156431452e-05, + "loss": 1.7269, + "step": 22394 + }, + { + "epoch": 6.873848987108656, + "grad_norm": 0.2544265687465668, + "learning_rate": 2.351724593149914e-05, + "loss": 1.7358, + "step": 22395 + }, + { + "epoch": 6.87415592387968, + "grad_norm": 0.2243366837501526, + "learning_rate": 2.3513029968284907e-05, + "loss": 1.7625, + "step": 22396 + }, + { + "epoch": 6.874462860650706, + "grad_norm": 0.23003467917442322, + "learning_rate": 2.3508814266830414e-05, + "loss": 1.6943, + "step": 22397 + }, + { + "epoch": 6.874769797421731, + "grad_norm": 0.19257886707782745, + "learning_rate": 2.3504598827177383e-05, + "loss": 1.7393, + "step": 22398 + }, + { + "epoch": 6.875076734192756, + "grad_norm": 0.23782171308994293, + "learning_rate": 2.3500383649367404e-05, + "loss": 1.7758, + "step": 22399 + }, + { + "epoch": 6.875383670963782, + 
"grad_norm": 0.18137066066265106, + "learning_rate": 2.3496168733442197e-05, + "loss": 1.7083, + "step": 22400 + }, + { + "epoch": 6.875690607734807, + "grad_norm": 0.21970662474632263, + "learning_rate": 2.3491954079443344e-05, + "loss": 1.7552, + "step": 22401 + }, + { + "epoch": 6.8759975445058314, + "grad_norm": 0.2032134085893631, + "learning_rate": 2.3487739687412562e-05, + "loss": 1.7653, + "step": 22402 + }, + { + "epoch": 6.876304481276857, + "grad_norm": 0.22016118466854095, + "learning_rate": 2.348352555739148e-05, + "loss": 1.7277, + "step": 22403 + }, + { + "epoch": 6.876611418047882, + "grad_norm": 0.2250203788280487, + "learning_rate": 2.3479311689421736e-05, + "loss": 1.7451, + "step": 22404 + }, + { + "epoch": 6.8769183548189075, + "grad_norm": 0.19726359844207764, + "learning_rate": 2.3475098083544977e-05, + "loss": 1.728, + "step": 22405 + }, + { + "epoch": 6.877225291589933, + "grad_norm": 0.21295994520187378, + "learning_rate": 2.3470884739802844e-05, + "loss": 1.7438, + "step": 22406 + }, + { + "epoch": 6.877532228360957, + "grad_norm": 0.19653508067131042, + "learning_rate": 2.346667165823698e-05, + "loss": 1.7189, + "step": 22407 + }, + { + "epoch": 6.877839165131983, + "grad_norm": 0.21406517922878265, + "learning_rate": 2.3462458838889016e-05, + "loss": 1.7475, + "step": 22408 + }, + { + "epoch": 6.878146101903008, + "grad_norm": 0.20569753646850586, + "learning_rate": 2.3458246281800595e-05, + "loss": 1.7262, + "step": 22409 + }, + { + "epoch": 6.878453038674033, + "grad_norm": 0.19365517795085907, + "learning_rate": 2.3454033987013334e-05, + "loss": 1.6938, + "step": 22410 + }, + { + "epoch": 6.878759975445059, + "grad_norm": 0.20935405790805817, + "learning_rate": 2.344982195456885e-05, + "loss": 1.724, + "step": 22411 + }, + { + "epoch": 6.879066912216084, + "grad_norm": 0.2104228436946869, + "learning_rate": 2.3445610184508826e-05, + "loss": 1.7474, + "step": 22412 + }, + { + "epoch": 6.879373848987108, + "grad_norm": 
0.19795742630958557, + "learning_rate": 2.3441398676874826e-05, + "loss": 1.7572, + "step": 22413 + }, + { + "epoch": 6.879680785758134, + "grad_norm": 0.20640577375888824, + "learning_rate": 2.3437187431708472e-05, + "loss": 1.7258, + "step": 22414 + }, + { + "epoch": 6.879987722529159, + "grad_norm": 0.2092565894126892, + "learning_rate": 2.3432976449051442e-05, + "loss": 1.7437, + "step": 22415 + }, + { + "epoch": 6.880294659300184, + "grad_norm": 0.2083825170993805, + "learning_rate": 2.3428765728945275e-05, + "loss": 1.7127, + "step": 22416 + }, + { + "epoch": 6.88060159607121, + "grad_norm": 0.20619866251945496, + "learning_rate": 2.3424555271431647e-05, + "loss": 1.7729, + "step": 22417 + }, + { + "epoch": 6.880908532842234, + "grad_norm": 0.22689959406852722, + "learning_rate": 2.3420345076552107e-05, + "loss": 1.7142, + "step": 22418 + }, + { + "epoch": 6.8812154696132595, + "grad_norm": 0.16664449870586395, + "learning_rate": 2.3416135144348316e-05, + "loss": 1.6857, + "step": 22419 + }, + { + "epoch": 6.881522406384285, + "grad_norm": 0.1895827353000641, + "learning_rate": 2.3411925474861856e-05, + "loss": 1.7075, + "step": 22420 + }, + { + "epoch": 6.88182934315531, + "grad_norm": 0.2058400958776474, + "learning_rate": 2.3407716068134334e-05, + "loss": 1.7623, + "step": 22421 + }, + { + "epoch": 6.8821362799263355, + "grad_norm": 0.18390826880931854, + "learning_rate": 2.3403506924207346e-05, + "loss": 1.6686, + "step": 22422 + }, + { + "epoch": 6.882443216697361, + "grad_norm": 0.1742098331451416, + "learning_rate": 2.3399298043122497e-05, + "loss": 1.6846, + "step": 22423 + }, + { + "epoch": 6.882750153468385, + "grad_norm": 0.18958622217178345, + "learning_rate": 2.3395089424921368e-05, + "loss": 1.7603, + "step": 22424 + }, + { + "epoch": 6.883057090239411, + "grad_norm": 0.21827174723148346, + "learning_rate": 2.3390881069645564e-05, + "loss": 1.6706, + "step": 22425 + }, + { + "epoch": 6.883364027010436, + "grad_norm": 0.17859303951263428, + 
"learning_rate": 2.338667297733667e-05, + "loss": 1.7612, + "step": 22426 + }, + { + "epoch": 6.883670963781461, + "grad_norm": 0.22383756935596466, + "learning_rate": 2.338246514803627e-05, + "loss": 1.7507, + "step": 22427 + }, + { + "epoch": 6.883977900552486, + "grad_norm": 0.20317313075065613, + "learning_rate": 2.3378257581785934e-05, + "loss": 1.6912, + "step": 22428 + }, + { + "epoch": 6.884284837323511, + "grad_norm": 0.20238614082336426, + "learning_rate": 2.3374050278627297e-05, + "loss": 1.7336, + "step": 22429 + }, + { + "epoch": 6.884591774094536, + "grad_norm": 0.2134159654378891, + "learning_rate": 2.336984323860188e-05, + "loss": 1.7252, + "step": 22430 + }, + { + "epoch": 6.884898710865562, + "grad_norm": 0.17153076827526093, + "learning_rate": 2.3365636461751277e-05, + "loss": 1.6769, + "step": 22431 + }, + { + "epoch": 6.885205647636587, + "grad_norm": 0.19001254439353943, + "learning_rate": 2.3361429948117075e-05, + "loss": 1.7812, + "step": 22432 + }, + { + "epoch": 6.885512584407612, + "grad_norm": 0.2074522078037262, + "learning_rate": 2.335722369774081e-05, + "loss": 1.7433, + "step": 22433 + }, + { + "epoch": 6.885819521178637, + "grad_norm": 0.22863705456256866, + "learning_rate": 2.3353017710664117e-05, + "loss": 1.7476, + "step": 22434 + }, + { + "epoch": 6.886126457949662, + "grad_norm": 0.19350804388523102, + "learning_rate": 2.334881198692848e-05, + "loss": 1.7071, + "step": 22435 + }, + { + "epoch": 6.8864333947206875, + "grad_norm": 0.22915633022785187, + "learning_rate": 2.3344606526575524e-05, + "loss": 1.7283, + "step": 22436 + }, + { + "epoch": 6.886740331491713, + "grad_norm": 0.21576058864593506, + "learning_rate": 2.3340401329646795e-05, + "loss": 1.7062, + "step": 22437 + }, + { + "epoch": 6.887047268262738, + "grad_norm": 0.17844067513942719, + "learning_rate": 2.333619639618384e-05, + "loss": 1.6994, + "step": 22438 + }, + { + "epoch": 6.887354205033763, + "grad_norm": 0.21019738912582397, + "learning_rate": 
2.333199172622822e-05, + "loss": 1.6654, + "step": 22439 + }, + { + "epoch": 6.887661141804788, + "grad_norm": 0.1901654452085495, + "learning_rate": 2.3327787319821486e-05, + "loss": 1.7847, + "step": 22440 + }, + { + "epoch": 6.887968078575813, + "grad_norm": 0.21838930249214172, + "learning_rate": 2.3323583177005198e-05, + "loss": 1.6517, + "step": 22441 + }, + { + "epoch": 6.888275015346839, + "grad_norm": 0.16078172624111176, + "learning_rate": 2.3319379297820892e-05, + "loss": 1.7052, + "step": 22442 + }, + { + "epoch": 6.888581952117864, + "grad_norm": 0.19161897897720337, + "learning_rate": 2.331517568231012e-05, + "loss": 1.675, + "step": 22443 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 0.1874416172504425, + "learning_rate": 2.331097233051442e-05, + "loss": 1.7025, + "step": 22444 + }, + { + "epoch": 6.889195825659914, + "grad_norm": 0.1817546933889389, + "learning_rate": 2.3306769242475318e-05, + "loss": 1.7103, + "step": 22445 + }, + { + "epoch": 6.889502762430939, + "grad_norm": 0.18423372507095337, + "learning_rate": 2.3302566418234406e-05, + "loss": 1.6883, + "step": 22446 + }, + { + "epoch": 6.889809699201964, + "grad_norm": 0.1712140440940857, + "learning_rate": 2.3298363857833162e-05, + "loss": 1.7076, + "step": 22447 + }, + { + "epoch": 6.89011663597299, + "grad_norm": 0.15992864966392517, + "learning_rate": 2.3294161561313133e-05, + "loss": 1.6514, + "step": 22448 + }, + { + "epoch": 6.890423572744015, + "grad_norm": 0.24126072227954865, + "learning_rate": 2.3289959528715855e-05, + "loss": 1.7385, + "step": 22449 + }, + { + "epoch": 6.8907305095150395, + "grad_norm": 0.18130798637866974, + "learning_rate": 2.3285757760082832e-05, + "loss": 1.691, + "step": 22450 + }, + { + "epoch": 6.891037446286065, + "grad_norm": 0.20070049166679382, + "learning_rate": 2.3281556255455644e-05, + "loss": 1.7166, + "step": 22451 + }, + { + "epoch": 6.89134438305709, + "grad_norm": 0.20706996321678162, + "learning_rate": 2.327735501487574e-05, + "loss": 
1.6763, + "step": 22452 + }, + { + "epoch": 6.8916513198281155, + "grad_norm": 0.22404810786247253, + "learning_rate": 2.327315403838472e-05, + "loss": 1.761, + "step": 22453 + }, + { + "epoch": 6.891958256599141, + "grad_norm": 0.21240194141864777, + "learning_rate": 2.3268953326024013e-05, + "loss": 1.7038, + "step": 22454 + }, + { + "epoch": 6.892265193370166, + "grad_norm": 0.24251966178417206, + "learning_rate": 2.32647528778352e-05, + "loss": 1.7829, + "step": 22455 + }, + { + "epoch": 6.892572130141191, + "grad_norm": 0.21213467419147491, + "learning_rate": 2.3260552693859765e-05, + "loss": 1.7433, + "step": 22456 + }, + { + "epoch": 6.892879066912216, + "grad_norm": 0.18008530139923096, + "learning_rate": 2.325635277413922e-05, + "loss": 1.7238, + "step": 22457 + }, + { + "epoch": 6.893186003683241, + "grad_norm": 0.18252789974212646, + "learning_rate": 2.325215311871508e-05, + "loss": 1.7143, + "step": 22458 + }, + { + "epoch": 6.893492940454267, + "grad_norm": 0.17830567061901093, + "learning_rate": 2.3247953727628833e-05, + "loss": 1.687, + "step": 22459 + }, + { + "epoch": 6.893799877225292, + "grad_norm": 0.19980686902999878, + "learning_rate": 2.3243754600921992e-05, + "loss": 1.7096, + "step": 22460 + }, + { + "epoch": 6.894106813996316, + "grad_norm": 0.1713438183069229, + "learning_rate": 2.3239555738636044e-05, + "loss": 1.6791, + "step": 22461 + }, + { + "epoch": 6.894413750767342, + "grad_norm": 0.17678281664848328, + "learning_rate": 2.3235357140812475e-05, + "loss": 1.6689, + "step": 22462 + }, + { + "epoch": 6.894720687538367, + "grad_norm": 0.20409992337226868, + "learning_rate": 2.3231158807492837e-05, + "loss": 1.7746, + "step": 22463 + }, + { + "epoch": 6.895027624309392, + "grad_norm": 0.19227825105190277, + "learning_rate": 2.3226960738718552e-05, + "loss": 1.7101, + "step": 22464 + }, + { + "epoch": 6.895334561080418, + "grad_norm": 0.24029433727264404, + "learning_rate": 2.3222762934531132e-05, + "loss": 1.7842, + "step": 22465 + }, + 
{ + "epoch": 6.895641497851443, + "grad_norm": 0.21887856721878052, + "learning_rate": 2.321856539497207e-05, + "loss": 1.7032, + "step": 22466 + }, + { + "epoch": 6.8959484346224675, + "grad_norm": 0.17346082627773285, + "learning_rate": 2.321436812008282e-05, + "loss": 1.683, + "step": 22467 + }, + { + "epoch": 6.896255371393493, + "grad_norm": 0.18920177221298218, + "learning_rate": 2.3210171109904914e-05, + "loss": 1.7057, + "step": 22468 + }, + { + "epoch": 6.896562308164518, + "grad_norm": 0.21199388802051544, + "learning_rate": 2.320597436447977e-05, + "loss": 1.7534, + "step": 22469 + }, + { + "epoch": 6.8968692449355435, + "grad_norm": 0.1867530792951584, + "learning_rate": 2.320177788384893e-05, + "loss": 1.7185, + "step": 22470 + }, + { + "epoch": 6.897176181706568, + "grad_norm": 0.21009495854377747, + "learning_rate": 2.3197581668053785e-05, + "loss": 1.7379, + "step": 22471 + }, + { + "epoch": 6.897483118477593, + "grad_norm": 0.20078743994235992, + "learning_rate": 2.3193385717135874e-05, + "loss": 1.7226, + "step": 22472 + }, + { + "epoch": 6.897790055248619, + "grad_norm": 0.2135045975446701, + "learning_rate": 2.318919003113663e-05, + "loss": 1.7531, + "step": 22473 + }, + { + "epoch": 6.898096992019644, + "grad_norm": 0.18811136484146118, + "learning_rate": 2.3184994610097526e-05, + "loss": 1.6542, + "step": 22474 + }, + { + "epoch": 6.898403928790669, + "grad_norm": 0.2323937565088272, + "learning_rate": 2.3180799454060025e-05, + "loss": 1.7369, + "step": 22475 + }, + { + "epoch": 6.898710865561695, + "grad_norm": 0.19270992279052734, + "learning_rate": 2.317660456306558e-05, + "loss": 1.6818, + "step": 22476 + }, + { + "epoch": 6.899017802332719, + "grad_norm": 0.18951043486595154, + "learning_rate": 2.3172409937155654e-05, + "loss": 1.7183, + "step": 22477 + }, + { + "epoch": 6.899324739103744, + "grad_norm": 0.1758934110403061, + "learning_rate": 2.3168215576371694e-05, + "loss": 1.6826, + "step": 22478 + }, + { + "epoch": 6.89963167587477, + 
"grad_norm": 0.2048143893480301, + "learning_rate": 2.3164021480755133e-05, + "loss": 1.7769, + "step": 22479 + }, + { + "epoch": 6.899938612645795, + "grad_norm": 0.20538486540317535, + "learning_rate": 2.315982765034748e-05, + "loss": 1.7035, + "step": 22480 + }, + { + "epoch": 6.9002455494168204, + "grad_norm": 0.18417708575725555, + "learning_rate": 2.3155634085190124e-05, + "loss": 1.7533, + "step": 22481 + }, + { + "epoch": 6.900552486187845, + "grad_norm": 0.1978628784418106, + "learning_rate": 2.315144078532453e-05, + "loss": 1.691, + "step": 22482 + }, + { + "epoch": 6.90085942295887, + "grad_norm": 0.17665794491767883, + "learning_rate": 2.3147247750792128e-05, + "loss": 1.7018, + "step": 22483 + }, + { + "epoch": 6.901166359729896, + "grad_norm": 0.20218273997306824, + "learning_rate": 2.314305498163435e-05, + "loss": 1.7277, + "step": 22484 + }, + { + "epoch": 6.901473296500921, + "grad_norm": 0.18791642785072327, + "learning_rate": 2.3138862477892674e-05, + "loss": 1.7247, + "step": 22485 + }, + { + "epoch": 6.901780233271946, + "grad_norm": 0.1945842206478119, + "learning_rate": 2.313467023960847e-05, + "loss": 1.6648, + "step": 22486 + }, + { + "epoch": 6.902087170042972, + "grad_norm": 0.1871321201324463, + "learning_rate": 2.3130478266823237e-05, + "loss": 1.6978, + "step": 22487 + }, + { + "epoch": 6.902394106813996, + "grad_norm": 0.20094287395477295, + "learning_rate": 2.312628655957833e-05, + "loss": 1.7763, + "step": 22488 + }, + { + "epoch": 6.902701043585021, + "grad_norm": 0.1804366111755371, + "learning_rate": 2.3122095117915226e-05, + "loss": 1.689, + "step": 22489 + }, + { + "epoch": 6.903007980356047, + "grad_norm": 0.1846652776002884, + "learning_rate": 2.311790394187534e-05, + "loss": 1.7088, + "step": 22490 + }, + { + "epoch": 6.903314917127072, + "grad_norm": 0.18339675664901733, + "learning_rate": 2.311371303150008e-05, + "loss": 1.6974, + "step": 22491 + }, + { + "epoch": 6.903621853898097, + "grad_norm": 0.21333162486553192, + 
"learning_rate": 2.3109522386830863e-05, + "loss": 1.7614, + "step": 22492 + }, + { + "epoch": 6.903928790669122, + "grad_norm": 0.19845318794250488, + "learning_rate": 2.3105332007909104e-05, + "loss": 1.6895, + "step": 22493 + }, + { + "epoch": 6.904235727440147, + "grad_norm": 0.21082347631454468, + "learning_rate": 2.3101141894776224e-05, + "loss": 1.7397, + "step": 22494 + }, + { + "epoch": 6.9045426642111725, + "grad_norm": 0.16360893845558167, + "learning_rate": 2.3096952047473623e-05, + "loss": 1.6716, + "step": 22495 + }, + { + "epoch": 6.904849600982198, + "grad_norm": 0.2287478744983673, + "learning_rate": 2.3092762466042687e-05, + "loss": 1.7673, + "step": 22496 + }, + { + "epoch": 6.905156537753223, + "grad_norm": 0.17231078445911407, + "learning_rate": 2.308857315052489e-05, + "loss": 1.6744, + "step": 22497 + }, + { + "epoch": 6.9054634745242485, + "grad_norm": 0.2887173295021057, + "learning_rate": 2.3084384100961565e-05, + "loss": 1.7358, + "step": 22498 + }, + { + "epoch": 6.905770411295273, + "grad_norm": 0.1977192759513855, + "learning_rate": 2.3080195317394127e-05, + "loss": 1.7514, + "step": 22499 + }, + { + "epoch": 6.906077348066298, + "grad_norm": 0.24933035671710968, + "learning_rate": 2.307600679986398e-05, + "loss": 1.6845, + "step": 22500 + }, + { + "epoch": 6.906384284837324, + "grad_norm": 0.17288708686828613, + "learning_rate": 2.30718185484125e-05, + "loss": 1.7211, + "step": 22501 + }, + { + "epoch": 6.906691221608349, + "grad_norm": 0.22192007303237915, + "learning_rate": 2.306763056308112e-05, + "loss": 1.6924, + "step": 22502 + }, + { + "epoch": 6.906998158379373, + "grad_norm": 0.20500123500823975, + "learning_rate": 2.3063442843911172e-05, + "loss": 1.7412, + "step": 22503 + }, + { + "epoch": 6.907305095150399, + "grad_norm": 0.30658698081970215, + "learning_rate": 2.30592553909441e-05, + "loss": 1.7965, + "step": 22504 + }, + { + "epoch": 6.907612031921424, + "grad_norm": 0.177829772233963, + "learning_rate": 
2.3055068204221224e-05, + "loss": 1.6914, + "step": 22505 + }, + { + "epoch": 6.907918968692449, + "grad_norm": 0.20281876623630524, + "learning_rate": 2.3050881283783977e-05, + "loss": 1.6946, + "step": 22506 + }, + { + "epoch": 6.908225905463475, + "grad_norm": 0.16111700236797333, + "learning_rate": 2.3046694629673716e-05, + "loss": 1.7004, + "step": 22507 + }, + { + "epoch": 6.9085328422345, + "grad_norm": 0.1911575049161911, + "learning_rate": 2.3042508241931814e-05, + "loss": 1.7013, + "step": 22508 + }, + { + "epoch": 6.9088397790055245, + "grad_norm": 0.17862342298030853, + "learning_rate": 2.303832212059965e-05, + "loss": 1.7053, + "step": 22509 + }, + { + "epoch": 6.90914671577655, + "grad_norm": 0.2268948256969452, + "learning_rate": 2.303413626571858e-05, + "loss": 1.7241, + "step": 22510 + }, + { + "epoch": 6.909453652547575, + "grad_norm": 0.1997457593679428, + "learning_rate": 2.3029950677329992e-05, + "loss": 1.6927, + "step": 22511 + }, + { + "epoch": 6.9097605893186005, + "grad_norm": 0.22120819985866547, + "learning_rate": 2.3025765355475232e-05, + "loss": 1.7447, + "step": 22512 + }, + { + "epoch": 6.910067526089626, + "grad_norm": 0.22097964584827423, + "learning_rate": 2.302158030019565e-05, + "loss": 1.7399, + "step": 22513 + }, + { + "epoch": 6.91037446286065, + "grad_norm": 0.2171044498682022, + "learning_rate": 2.3017395511532664e-05, + "loss": 1.7252, + "step": 22514 + }, + { + "epoch": 6.910681399631676, + "grad_norm": 0.1987348347902298, + "learning_rate": 2.301321098952757e-05, + "loss": 1.7071, + "step": 22515 + }, + { + "epoch": 6.910988336402701, + "grad_norm": 0.2131081372499466, + "learning_rate": 2.3009026734221746e-05, + "loss": 1.7314, + "step": 22516 + }, + { + "epoch": 6.911295273173726, + "grad_norm": 0.18867900967597961, + "learning_rate": 2.3004842745656536e-05, + "loss": 1.7431, + "step": 22517 + }, + { + "epoch": 6.911602209944752, + "grad_norm": 0.22853058576583862, + "learning_rate": 2.3000659023873277e-05, + "loss": 
1.7234, + "step": 22518 + }, + { + "epoch": 6.911909146715777, + "grad_norm": 0.23441165685653687, + "learning_rate": 2.2996475568913366e-05, + "loss": 1.7535, + "step": 22519 + }, + { + "epoch": 6.912216083486801, + "grad_norm": 0.2376382052898407, + "learning_rate": 2.299229238081807e-05, + "loss": 1.7582, + "step": 22520 + }, + { + "epoch": 6.912523020257827, + "grad_norm": 0.2571510076522827, + "learning_rate": 2.2988109459628814e-05, + "loss": 1.722, + "step": 22521 + }, + { + "epoch": 6.912829957028852, + "grad_norm": 0.19782103598117828, + "learning_rate": 2.298392680538685e-05, + "loss": 1.7052, + "step": 22522 + }, + { + "epoch": 6.913136893799877, + "grad_norm": 0.24070625007152557, + "learning_rate": 2.297974441813358e-05, + "loss": 1.7306, + "step": 22523 + }, + { + "epoch": 6.913443830570903, + "grad_norm": 0.1783500611782074, + "learning_rate": 2.2975562297910307e-05, + "loss": 1.7077, + "step": 22524 + }, + { + "epoch": 6.913750767341927, + "grad_norm": 0.19469089806079865, + "learning_rate": 2.2971380444758373e-05, + "loss": 1.7275, + "step": 22525 + }, + { + "epoch": 6.9140577041129525, + "grad_norm": 0.21449480950832367, + "learning_rate": 2.2967198858719092e-05, + "loss": 1.7682, + "step": 22526 + }, + { + "epoch": 6.914364640883978, + "grad_norm": 0.21686261892318726, + "learning_rate": 2.2963017539833803e-05, + "loss": 1.6794, + "step": 22527 + }, + { + "epoch": 6.914671577655003, + "grad_norm": 0.2061273604631424, + "learning_rate": 2.2958836488143813e-05, + "loss": 1.7612, + "step": 22528 + }, + { + "epoch": 6.9149785144260285, + "grad_norm": 0.2708517611026764, + "learning_rate": 2.295465570369046e-05, + "loss": 1.7291, + "step": 22529 + }, + { + "epoch": 6.915285451197054, + "grad_norm": 0.17011860013008118, + "learning_rate": 2.295047518651503e-05, + "loss": 1.6541, + "step": 22530 + }, + { + "epoch": 6.915592387968078, + "grad_norm": 0.255305677652359, + "learning_rate": 2.294629493665889e-05, + "loss": 1.7063, + "step": 22531 + }, + { + 
"epoch": 6.915899324739104, + "grad_norm": 0.20172207057476044, + "learning_rate": 2.2942114954163306e-05, + "loss": 1.6678, + "step": 22532 + }, + { + "epoch": 6.916206261510129, + "grad_norm": 0.23726679384708405, + "learning_rate": 2.2937935239069603e-05, + "loss": 1.6762, + "step": 22533 + }, + { + "epoch": 6.916513198281154, + "grad_norm": 0.17716684937477112, + "learning_rate": 2.2933755791419082e-05, + "loss": 1.7302, + "step": 22534 + }, + { + "epoch": 6.91682013505218, + "grad_norm": 0.2513270974159241, + "learning_rate": 2.2929576611253035e-05, + "loss": 1.7371, + "step": 22535 + }, + { + "epoch": 6.917127071823204, + "grad_norm": 0.21994394063949585, + "learning_rate": 2.292539769861281e-05, + "loss": 1.7007, + "step": 22536 + }, + { + "epoch": 6.917434008594229, + "grad_norm": 0.2095540314912796, + "learning_rate": 2.292121905353964e-05, + "loss": 1.71, + "step": 22537 + }, + { + "epoch": 6.917740945365255, + "grad_norm": 0.24400855600833893, + "learning_rate": 2.2917040676074892e-05, + "loss": 1.7859, + "step": 22538 + }, + { + "epoch": 6.91804788213628, + "grad_norm": 0.23217935860157013, + "learning_rate": 2.2912862566259785e-05, + "loss": 1.8218, + "step": 22539 + }, + { + "epoch": 6.918354818907305, + "grad_norm": 0.23555497825145721, + "learning_rate": 2.2908684724135666e-05, + "loss": 1.7145, + "step": 22540 + }, + { + "epoch": 6.918661755678331, + "grad_norm": 0.17844347655773163, + "learning_rate": 2.2904507149743804e-05, + "loss": 1.6767, + "step": 22541 + }, + { + "epoch": 6.918968692449355, + "grad_norm": 0.20810428261756897, + "learning_rate": 2.290032984312548e-05, + "loss": 1.7359, + "step": 22542 + }, + { + "epoch": 6.9192756292203805, + "grad_norm": 0.20082542300224304, + "learning_rate": 2.289615280432198e-05, + "loss": 1.7623, + "step": 22543 + }, + { + "epoch": 6.919582565991406, + "grad_norm": 0.2005007117986679, + "learning_rate": 2.2891976033374584e-05, + "loss": 1.745, + "step": 22544 + }, + { + "epoch": 6.919889502762431, + 
"grad_norm": 0.18054969608783722, + "learning_rate": 2.2887799530324572e-05, + "loss": 1.6959, + "step": 22545 + }, + { + "epoch": 6.920196439533456, + "grad_norm": 0.18410442769527435, + "learning_rate": 2.2883623295213214e-05, + "loss": 1.7052, + "step": 22546 + }, + { + "epoch": 6.920503376304481, + "grad_norm": 0.17380426824092865, + "learning_rate": 2.2879447328081765e-05, + "loss": 1.6735, + "step": 22547 + }, + { + "epoch": 6.920810313075506, + "grad_norm": 0.19082246720790863, + "learning_rate": 2.2875271628971557e-05, + "loss": 1.7192, + "step": 22548 + }, + { + "epoch": 6.921117249846532, + "grad_norm": 0.17682792246341705, + "learning_rate": 2.2871096197923784e-05, + "loss": 1.649, + "step": 22549 + }, + { + "epoch": 6.921424186617557, + "grad_norm": 0.19127340614795685, + "learning_rate": 2.286692103497975e-05, + "loss": 1.7366, + "step": 22550 + }, + { + "epoch": 6.921731123388582, + "grad_norm": 0.1636040210723877, + "learning_rate": 2.2862746140180696e-05, + "loss": 1.6749, + "step": 22551 + }, + { + "epoch": 6.922038060159607, + "grad_norm": 0.2121013104915619, + "learning_rate": 2.285857151356788e-05, + "loss": 1.7342, + "step": 22552 + }, + { + "epoch": 6.922344996930632, + "grad_norm": 0.19183295965194702, + "learning_rate": 2.28543971551826e-05, + "loss": 1.7506, + "step": 22553 + }, + { + "epoch": 6.922651933701657, + "grad_norm": 0.23838891088962555, + "learning_rate": 2.285022306506604e-05, + "loss": 1.6875, + "step": 22554 + }, + { + "epoch": 6.922958870472683, + "grad_norm": 0.17147624492645264, + "learning_rate": 2.2846049243259526e-05, + "loss": 1.7074, + "step": 22555 + }, + { + "epoch": 6.923265807243708, + "grad_norm": 0.2254270762205124, + "learning_rate": 2.2841875689804236e-05, + "loss": 1.7589, + "step": 22556 + }, + { + "epoch": 6.9235727440147325, + "grad_norm": 0.249015673995018, + "learning_rate": 2.2837702404741462e-05, + "loss": 1.7708, + "step": 22557 + }, + { + "epoch": 6.923879680785758, + "grad_norm": 0.19401927292346954, 
+ "learning_rate": 2.283352938811244e-05, + "loss": 1.696, + "step": 22558 + }, + { + "epoch": 6.924186617556783, + "grad_norm": 0.21134993433952332, + "learning_rate": 2.2829356639958398e-05, + "loss": 1.7136, + "step": 22559 + }, + { + "epoch": 6.9244935543278086, + "grad_norm": 0.17600105702877045, + "learning_rate": 2.2825184160320578e-05, + "loss": 1.679, + "step": 22560 + }, + { + "epoch": 6.924800491098834, + "grad_norm": 0.2426912486553192, + "learning_rate": 2.282101194924022e-05, + "loss": 1.7011, + "step": 22561 + }, + { + "epoch": 6.925107427869859, + "grad_norm": 0.20040342211723328, + "learning_rate": 2.281684000675855e-05, + "loss": 1.6844, + "step": 22562 + }, + { + "epoch": 6.925414364640884, + "grad_norm": 0.23790770769119263, + "learning_rate": 2.2812668332916798e-05, + "loss": 1.7318, + "step": 22563 + }, + { + "epoch": 6.925721301411909, + "grad_norm": 0.21387948095798492, + "learning_rate": 2.2808496927756196e-05, + "loss": 1.6903, + "step": 22564 + }, + { + "epoch": 6.926028238182934, + "grad_norm": 0.20471405982971191, + "learning_rate": 2.280432579131796e-05, + "loss": 1.7231, + "step": 22565 + }, + { + "epoch": 6.92633517495396, + "grad_norm": 0.1953156590461731, + "learning_rate": 2.280015492364332e-05, + "loss": 1.7322, + "step": 22566 + }, + { + "epoch": 6.926642111724985, + "grad_norm": 0.3107415437698364, + "learning_rate": 2.279598432477349e-05, + "loss": 1.7833, + "step": 22567 + }, + { + "epoch": 6.9269490484960095, + "grad_norm": 0.2114095836877823, + "learning_rate": 2.279181399474969e-05, + "loss": 1.6923, + "step": 22568 + }, + { + "epoch": 6.927255985267035, + "grad_norm": 0.21373972296714783, + "learning_rate": 2.2787643933613107e-05, + "loss": 1.6897, + "step": 22569 + }, + { + "epoch": 6.92756292203806, + "grad_norm": 0.17955096065998077, + "learning_rate": 2.278347414140502e-05, + "loss": 1.7443, + "step": 22570 + }, + { + "epoch": 6.9278698588090855, + "grad_norm": 0.19275230169296265, + "learning_rate": 
2.2779304618166554e-05, + "loss": 1.7109, + "step": 22571 + }, + { + "epoch": 6.928176795580111, + "grad_norm": 0.16774436831474304, + "learning_rate": 2.277513536393899e-05, + "loss": 1.7059, + "step": 22572 + }, + { + "epoch": 6.928483732351136, + "grad_norm": 0.25093573331832886, + "learning_rate": 2.2770966378763457e-05, + "loss": 1.7501, + "step": 22573 + }, + { + "epoch": 6.928790669122161, + "grad_norm": 0.24859540164470673, + "learning_rate": 2.2766797662681216e-05, + "loss": 1.7315, + "step": 22574 + }, + { + "epoch": 6.929097605893186, + "grad_norm": 0.1736115962266922, + "learning_rate": 2.2762629215733438e-05, + "loss": 1.7422, + "step": 22575 + }, + { + "epoch": 6.929404542664211, + "grad_norm": 0.23705001175403595, + "learning_rate": 2.2758461037961326e-05, + "loss": 1.7818, + "step": 22576 + }, + { + "epoch": 6.929711479435237, + "grad_norm": 0.21123656630516052, + "learning_rate": 2.2754293129406073e-05, + "loss": 1.7652, + "step": 22577 + }, + { + "epoch": 6.930018416206261, + "grad_norm": 0.2195751667022705, + "learning_rate": 2.2750125490108858e-05, + "loss": 1.7103, + "step": 22578 + }, + { + "epoch": 6.930325352977286, + "grad_norm": 0.17324887216091156, + "learning_rate": 2.274595812011088e-05, + "loss": 1.7386, + "step": 22579 + }, + { + "epoch": 6.930632289748312, + "grad_norm": 0.3175726532936096, + "learning_rate": 2.2741791019453313e-05, + "loss": 1.7608, + "step": 22580 + }, + { + "epoch": 6.930939226519337, + "grad_norm": 0.26266980171203613, + "learning_rate": 2.273762418817734e-05, + "loss": 1.691, + "step": 22581 + }, + { + "epoch": 6.931246163290362, + "grad_norm": 0.21905983984470367, + "learning_rate": 2.273345762632415e-05, + "loss": 1.6886, + "step": 22582 + }, + { + "epoch": 6.931553100061388, + "grad_norm": 0.2201247364282608, + "learning_rate": 2.2729291333934914e-05, + "loss": 1.7313, + "step": 22583 + }, + { + "epoch": 6.931860036832412, + "grad_norm": 0.2844204306602478, + "learning_rate": 2.2725125311050805e-05, + "loss": 
1.6918, + "step": 22584 + }, + { + "epoch": 6.9321669736034375, + "grad_norm": 0.22451715171337128, + "learning_rate": 2.272095955771299e-05, + "loss": 1.699, + "step": 22585 + }, + { + "epoch": 6.932473910374463, + "grad_norm": 0.27357545495033264, + "learning_rate": 2.2716794073962645e-05, + "loss": 1.7709, + "step": 22586 + }, + { + "epoch": 6.932780847145488, + "grad_norm": 0.2605188190937042, + "learning_rate": 2.271262885984093e-05, + "loss": 1.7812, + "step": 22587 + }, + { + "epoch": 6.9330877839165135, + "grad_norm": 0.1866278201341629, + "learning_rate": 2.270846391538899e-05, + "loss": 1.7204, + "step": 22588 + }, + { + "epoch": 6.933394720687538, + "grad_norm": 0.24624690413475037, + "learning_rate": 2.2704299240648043e-05, + "loss": 1.7345, + "step": 22589 + }, + { + "epoch": 6.933701657458563, + "grad_norm": 0.18003861606121063, + "learning_rate": 2.2700134835659175e-05, + "loss": 1.73, + "step": 22590 + }, + { + "epoch": 6.934008594229589, + "grad_norm": 0.2330949604511261, + "learning_rate": 2.269597070046359e-05, + "loss": 1.7614, + "step": 22591 + }, + { + "epoch": 6.934315531000614, + "grad_norm": 0.18806515634059906, + "learning_rate": 2.269180683510243e-05, + "loss": 1.7364, + "step": 22592 + }, + { + "epoch": 6.934622467771639, + "grad_norm": 0.23998546600341797, + "learning_rate": 2.268764323961684e-05, + "loss": 1.6858, + "step": 22593 + }, + { + "epoch": 6.934929404542665, + "grad_norm": 0.1707296371459961, + "learning_rate": 2.268347991404797e-05, + "loss": 1.6703, + "step": 22594 + }, + { + "epoch": 6.935236341313689, + "grad_norm": 0.19724871218204498, + "learning_rate": 2.267931685843696e-05, + "loss": 1.7338, + "step": 22595 + }, + { + "epoch": 6.935543278084714, + "grad_norm": 0.20384611189365387, + "learning_rate": 2.2675154072824955e-05, + "loss": 1.7224, + "step": 22596 + }, + { + "epoch": 6.93585021485574, + "grad_norm": 0.18632391095161438, + "learning_rate": 2.2670991557253092e-05, + "loss": 1.7006, + "step": 22597 + }, + { + 
"epoch": 6.936157151626765, + "grad_norm": 0.22928105294704437, + "learning_rate": 2.2666829311762505e-05, + "loss": 1.7462, + "step": 22598 + }, + { + "epoch": 6.93646408839779, + "grad_norm": 0.1905689388513565, + "learning_rate": 2.266266733639434e-05, + "loss": 1.7071, + "step": 22599 + }, + { + "epoch": 6.936771025168815, + "grad_norm": 0.2051437795162201, + "learning_rate": 2.2658505631189708e-05, + "loss": 1.6872, + "step": 22600 + }, + { + "epoch": 6.93707796193984, + "grad_norm": 0.178196981549263, + "learning_rate": 2.265434419618976e-05, + "loss": 1.7044, + "step": 22601 + }, + { + "epoch": 6.9373848987108655, + "grad_norm": 0.21399027109146118, + "learning_rate": 2.26501830314356e-05, + "loss": 1.7529, + "step": 22602 + }, + { + "epoch": 6.937691835481891, + "grad_norm": 0.21747443079948425, + "learning_rate": 2.264602213696837e-05, + "loss": 1.7662, + "step": 22603 + }, + { + "epoch": 6.937998772252916, + "grad_norm": 0.1939898282289505, + "learning_rate": 2.2641861512829177e-05, + "loss": 1.7194, + "step": 22604 + }, + { + "epoch": 6.9383057090239415, + "grad_norm": 0.2183499038219452, + "learning_rate": 2.2637701159059128e-05, + "loss": 1.6659, + "step": 22605 + }, + { + "epoch": 6.938612645794966, + "grad_norm": 0.21971984207630157, + "learning_rate": 2.2633541075699387e-05, + "loss": 1.7729, + "step": 22606 + }, + { + "epoch": 6.938919582565991, + "grad_norm": 0.2611743211746216, + "learning_rate": 2.2629381262790998e-05, + "loss": 1.8, + "step": 22607 + }, + { + "epoch": 6.939226519337017, + "grad_norm": 0.22962158918380737, + "learning_rate": 2.2625221720375144e-05, + "loss": 1.7244, + "step": 22608 + }, + { + "epoch": 6.939533456108042, + "grad_norm": 0.20961032807826996, + "learning_rate": 2.2621062448492858e-05, + "loss": 1.7107, + "step": 22609 + }, + { + "epoch": 6.939840392879067, + "grad_norm": 0.2370155155658722, + "learning_rate": 2.2616903447185293e-05, + "loss": 1.7185, + "step": 22610 + }, + { + "epoch": 6.940147329650092, + 
"grad_norm": 0.19033893942832947, + "learning_rate": 2.2612744716493544e-05, + "loss": 1.7034, + "step": 22611 + }, + { + "epoch": 6.940454266421117, + "grad_norm": 0.22657649219036102, + "learning_rate": 2.2608586256458704e-05, + "loss": 1.6987, + "step": 22612 + }, + { + "epoch": 6.940761203192142, + "grad_norm": 0.17767953872680664, + "learning_rate": 2.2604428067121862e-05, + "loss": 1.6934, + "step": 22613 + }, + { + "epoch": 6.941068139963168, + "grad_norm": 0.209768146276474, + "learning_rate": 2.2600270148524123e-05, + "loss": 1.7148, + "step": 22614 + }, + { + "epoch": 6.941375076734193, + "grad_norm": 0.21234147250652313, + "learning_rate": 2.2596112500706574e-05, + "loss": 1.7147, + "step": 22615 + }, + { + "epoch": 6.941682013505218, + "grad_norm": 0.17608872056007385, + "learning_rate": 2.2591955123710307e-05, + "loss": 1.6873, + "step": 22616 + }, + { + "epoch": 6.941988950276243, + "grad_norm": 0.1743561178445816, + "learning_rate": 2.25877980175764e-05, + "loss": 1.7273, + "step": 22617 + }, + { + "epoch": 6.942295887047268, + "grad_norm": 0.22064091265201569, + "learning_rate": 2.258364118234594e-05, + "loss": 1.7785, + "step": 22618 + }, + { + "epoch": 6.9426028238182935, + "grad_norm": 0.20353585481643677, + "learning_rate": 2.2579484618060005e-05, + "loss": 1.7518, + "step": 22619 + }, + { + "epoch": 6.942909760589319, + "grad_norm": 0.23978710174560547, + "learning_rate": 2.2575328324759676e-05, + "loss": 1.7576, + "step": 22620 + }, + { + "epoch": 6.943216697360343, + "grad_norm": 0.24991966784000397, + "learning_rate": 2.257117230248603e-05, + "loss": 1.7383, + "step": 22621 + }, + { + "epoch": 6.943523634131369, + "grad_norm": 0.20734381675720215, + "learning_rate": 2.256701655128011e-05, + "loss": 1.7063, + "step": 22622 + }, + { + "epoch": 6.943830570902394, + "grad_norm": 0.20097215473651886, + "learning_rate": 2.2562861071183057e-05, + "loss": 1.7647, + "step": 22623 + }, + { + "epoch": 6.944137507673419, + "grad_norm": 
0.20144836604595184, + "learning_rate": 2.2558705862235852e-05, + "loss": 1.7165, + "step": 22624 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.20394138991832733, + "learning_rate": 2.255455092447964e-05, + "loss": 1.7048, + "step": 22625 + }, + { + "epoch": 6.94475138121547, + "grad_norm": 0.21430160105228424, + "learning_rate": 2.2550396257955396e-05, + "loss": 1.7233, + "step": 22626 + }, + { + "epoch": 6.945058317986494, + "grad_norm": 0.19071494042873383, + "learning_rate": 2.254624186270425e-05, + "loss": 1.7407, + "step": 22627 + }, + { + "epoch": 6.94536525475752, + "grad_norm": 0.19658641517162323, + "learning_rate": 2.2542087738767232e-05, + "loss": 1.6371, + "step": 22628 + }, + { + "epoch": 6.945672191528545, + "grad_norm": 0.19009098410606384, + "learning_rate": 2.25379338861854e-05, + "loss": 1.7515, + "step": 22629 + }, + { + "epoch": 6.94597912829957, + "grad_norm": 0.21250933408737183, + "learning_rate": 2.2533780304999796e-05, + "loss": 1.7308, + "step": 22630 + }, + { + "epoch": 6.946286065070596, + "grad_norm": 0.22148491442203522, + "learning_rate": 2.2529626995251475e-05, + "loss": 1.705, + "step": 22631 + }, + { + "epoch": 6.94659300184162, + "grad_norm": 0.190248504281044, + "learning_rate": 2.252547395698148e-05, + "loss": 1.7507, + "step": 22632 + }, + { + "epoch": 6.9468999386126455, + "grad_norm": 0.20005743205547333, + "learning_rate": 2.2521321190230855e-05, + "loss": 1.7622, + "step": 22633 + }, + { + "epoch": 6.947206875383671, + "grad_norm": 0.24233438074588776, + "learning_rate": 2.251716869504064e-05, + "loss": 1.7119, + "step": 22634 + }, + { + "epoch": 6.947513812154696, + "grad_norm": 0.20823299884796143, + "learning_rate": 2.2513016471451874e-05, + "loss": 1.69, + "step": 22635 + }, + { + "epoch": 6.9478207489257215, + "grad_norm": 0.21486341953277588, + "learning_rate": 2.250886451950559e-05, + "loss": 1.6528, + "step": 22636 + }, + { + "epoch": 6.948127685696747, + "grad_norm": 0.22201848030090332, + 
"learning_rate": 2.2504712839242813e-05, + "loss": 1.7454, + "step": 22637 + }, + { + "epoch": 6.948434622467771, + "grad_norm": 0.25179341435432434, + "learning_rate": 2.2500561430704588e-05, + "loss": 1.7226, + "step": 22638 + }, + { + "epoch": 6.948741559238797, + "grad_norm": 0.2510581910610199, + "learning_rate": 2.2496410293931913e-05, + "loss": 1.7048, + "step": 22639 + }, + { + "epoch": 6.949048496009822, + "grad_norm": 0.2406487911939621, + "learning_rate": 2.2492259428965866e-05, + "loss": 1.6751, + "step": 22640 + }, + { + "epoch": 6.949355432780847, + "grad_norm": 0.2555276155471802, + "learning_rate": 2.24881088358474e-05, + "loss": 1.7369, + "step": 22641 + }, + { + "epoch": 6.949662369551873, + "grad_norm": 0.19703364372253418, + "learning_rate": 2.2483958514617597e-05, + "loss": 1.7196, + "step": 22642 + }, + { + "epoch": 6.949969306322897, + "grad_norm": 0.18491938710212708, + "learning_rate": 2.2479808465317414e-05, + "loss": 1.6923, + "step": 22643 + }, + { + "epoch": 6.9502762430939224, + "grad_norm": 0.21588458120822906, + "learning_rate": 2.247565868798791e-05, + "loss": 1.6797, + "step": 22644 + }, + { + "epoch": 6.950583179864948, + "grad_norm": 0.18480601906776428, + "learning_rate": 2.247150918267008e-05, + "loss": 1.6672, + "step": 22645 + }, + { + "epoch": 6.950890116635973, + "grad_norm": 0.261846125125885, + "learning_rate": 2.246735994940493e-05, + "loss": 1.7594, + "step": 22646 + }, + { + "epoch": 6.9511970534069984, + "grad_norm": 0.24510261416435242, + "learning_rate": 2.2463210988233468e-05, + "loss": 1.7712, + "step": 22647 + }, + { + "epoch": 6.951503990178024, + "grad_norm": 0.25896379351615906, + "learning_rate": 2.24590622991967e-05, + "loss": 1.6811, + "step": 22648 + }, + { + "epoch": 6.951810926949048, + "grad_norm": 0.26284709572792053, + "learning_rate": 2.245491388233561e-05, + "loss": 1.7269, + "step": 22649 + }, + { + "epoch": 6.952117863720074, + "grad_norm": 0.1613062471151352, + "learning_rate": 
2.245076573769121e-05, + "loss": 1.6162, + "step": 22650 + }, + { + "epoch": 6.952424800491099, + "grad_norm": 0.203482523560524, + "learning_rate": 2.244661786530449e-05, + "loss": 1.7124, + "step": 22651 + }, + { + "epoch": 6.952731737262124, + "grad_norm": 0.18294258415699005, + "learning_rate": 2.2442470265216446e-05, + "loss": 1.7101, + "step": 22652 + }, + { + "epoch": 6.953038674033149, + "grad_norm": 0.1841319352388382, + "learning_rate": 2.2438322937468058e-05, + "loss": 1.723, + "step": 22653 + }, + { + "epoch": 6.953345610804174, + "grad_norm": 0.1600010097026825, + "learning_rate": 2.2434175882100322e-05, + "loss": 1.6867, + "step": 22654 + }, + { + "epoch": 6.953652547575199, + "grad_norm": 0.16904005408287048, + "learning_rate": 2.243002909915421e-05, + "loss": 1.6993, + "step": 22655 + }, + { + "epoch": 6.953959484346225, + "grad_norm": 0.20069406926631927, + "learning_rate": 2.2425882588670692e-05, + "loss": 1.6995, + "step": 22656 + }, + { + "epoch": 6.95426642111725, + "grad_norm": 0.170061394572258, + "learning_rate": 2.2421736350690808e-05, + "loss": 1.7217, + "step": 22657 + }, + { + "epoch": 6.954573357888275, + "grad_norm": 0.20549608767032623, + "learning_rate": 2.241759038525545e-05, + "loss": 1.7229, + "step": 22658 + }, + { + "epoch": 6.9548802946593, + "grad_norm": 0.20916205644607544, + "learning_rate": 2.241344469240566e-05, + "loss": 1.7499, + "step": 22659 + }, + { + "epoch": 6.955187231430325, + "grad_norm": 0.156641885638237, + "learning_rate": 2.2409299272182348e-05, + "loss": 1.6827, + "step": 22660 + }, + { + "epoch": 6.9554941682013505, + "grad_norm": 0.17876049876213074, + "learning_rate": 2.240515412462653e-05, + "loss": 1.6745, + "step": 22661 + }, + { + "epoch": 6.955801104972376, + "grad_norm": 0.17265759408473969, + "learning_rate": 2.2401009249779153e-05, + "loss": 1.7687, + "step": 22662 + }, + { + "epoch": 6.956108041743401, + "grad_norm": 0.18822525441646576, + "learning_rate": 2.2396864647681175e-05, + "loss": 
1.6974, + "step": 22663 + }, + { + "epoch": 6.956414978514426, + "grad_norm": 0.18686626851558685, + "learning_rate": 2.2392720318373567e-05, + "loss": 1.7522, + "step": 22664 + }, + { + "epoch": 6.956721915285451, + "grad_norm": 0.1668211668729782, + "learning_rate": 2.238857626189727e-05, + "loss": 1.7198, + "step": 22665 + }, + { + "epoch": 6.957028852056476, + "grad_norm": 0.23307017982006073, + "learning_rate": 2.238443247829325e-05, + "loss": 1.7377, + "step": 22666 + }, + { + "epoch": 6.957335788827502, + "grad_norm": 0.1771896481513977, + "learning_rate": 2.2380288967602453e-05, + "loss": 1.7626, + "step": 22667 + }, + { + "epoch": 6.957642725598527, + "grad_norm": 0.185984805226326, + "learning_rate": 2.237614572986583e-05, + "loss": 1.7328, + "step": 22668 + }, + { + "epoch": 6.957949662369552, + "grad_norm": 0.3076271414756775, + "learning_rate": 2.2372002765124327e-05, + "loss": 1.7081, + "step": 22669 + }, + { + "epoch": 6.958256599140577, + "grad_norm": 0.17874667048454285, + "learning_rate": 2.2367860073418885e-05, + "loss": 1.6752, + "step": 22670 + }, + { + "epoch": 6.958563535911602, + "grad_norm": 0.2044304609298706, + "learning_rate": 2.2363717654790445e-05, + "loss": 1.7325, + "step": 22671 + }, + { + "epoch": 6.958870472682627, + "grad_norm": 0.19335824251174927, + "learning_rate": 2.2359575509279945e-05, + "loss": 1.7192, + "step": 22672 + }, + { + "epoch": 6.959177409453653, + "grad_norm": 0.19514116644859314, + "learning_rate": 2.23554336369283e-05, + "loss": 1.7186, + "step": 22673 + }, + { + "epoch": 6.959484346224678, + "grad_norm": 0.2779110372066498, + "learning_rate": 2.23512920377765e-05, + "loss": 1.7391, + "step": 22674 + }, + { + "epoch": 6.9597912829957025, + "grad_norm": 0.17390480637550354, + "learning_rate": 2.2347150711865406e-05, + "loss": 1.6538, + "step": 22675 + }, + { + "epoch": 6.960098219766728, + "grad_norm": 0.1640262007713318, + "learning_rate": 2.234300965923601e-05, + "loss": 1.6534, + "step": 22676 + }, + { + 
"epoch": 6.960405156537753, + "grad_norm": 0.17519034445285797, + "learning_rate": 2.2338868879929165e-05, + "loss": 1.6931, + "step": 22677 + }, + { + "epoch": 6.9607120933087785, + "grad_norm": 0.16885873675346375, + "learning_rate": 2.2334728373985847e-05, + "loss": 1.7204, + "step": 22678 + }, + { + "epoch": 6.961019030079804, + "grad_norm": 0.16997110843658447, + "learning_rate": 2.2330588141446963e-05, + "loss": 1.7063, + "step": 22679 + }, + { + "epoch": 6.961325966850829, + "grad_norm": 0.17793773114681244, + "learning_rate": 2.2326448182353422e-05, + "loss": 1.7382, + "step": 22680 + }, + { + "epoch": 6.961632903621854, + "grad_norm": 0.1809101551771164, + "learning_rate": 2.2322308496746134e-05, + "loss": 1.6874, + "step": 22681 + }, + { + "epoch": 6.961939840392879, + "grad_norm": 0.19095295667648315, + "learning_rate": 2.2318169084666023e-05, + "loss": 1.7122, + "step": 22682 + }, + { + "epoch": 6.962246777163904, + "grad_norm": 0.19206218421459198, + "learning_rate": 2.2314029946153992e-05, + "loss": 1.6733, + "step": 22683 + }, + { + "epoch": 6.96255371393493, + "grad_norm": 0.21243152022361755, + "learning_rate": 2.2309891081250938e-05, + "loss": 1.7026, + "step": 22684 + }, + { + "epoch": 6.962860650705955, + "grad_norm": 0.17602933943271637, + "learning_rate": 2.2305752489997777e-05, + "loss": 1.7073, + "step": 22685 + }, + { + "epoch": 6.963167587476979, + "grad_norm": 0.21810807287693024, + "learning_rate": 2.2301614172435398e-05, + "loss": 1.7323, + "step": 22686 + }, + { + "epoch": 6.963474524248005, + "grad_norm": 0.20711791515350342, + "learning_rate": 2.2297476128604706e-05, + "loss": 1.7228, + "step": 22687 + }, + { + "epoch": 6.96378146101903, + "grad_norm": 0.20376695692539215, + "learning_rate": 2.2293338358546583e-05, + "loss": 1.715, + "step": 22688 + }, + { + "epoch": 6.964088397790055, + "grad_norm": 0.20096196234226227, + "learning_rate": 2.228920086230194e-05, + "loss": 1.7239, + "step": 22689 + }, + { + "epoch": 6.964395334561081, 
+ "grad_norm": 0.24215486645698547, + "learning_rate": 2.228506363991163e-05, + "loss": 1.7879, + "step": 22690 + }, + { + "epoch": 6.964702271332106, + "grad_norm": 0.1917567104101181, + "learning_rate": 2.2280926691416603e-05, + "loss": 1.6903, + "step": 22691 + }, + { + "epoch": 6.9650092081031305, + "grad_norm": 0.19827421009540558, + "learning_rate": 2.2276790016857673e-05, + "loss": 1.7654, + "step": 22692 + }, + { + "epoch": 6.965316144874156, + "grad_norm": 0.20852476358413696, + "learning_rate": 2.2272653616275784e-05, + "loss": 1.7452, + "step": 22693 + }, + { + "epoch": 6.965623081645181, + "grad_norm": 0.21223776042461395, + "learning_rate": 2.2268517489711755e-05, + "loss": 1.6973, + "step": 22694 + }, + { + "epoch": 6.9659300184162065, + "grad_norm": 0.1903543621301651, + "learning_rate": 2.22643816372065e-05, + "loss": 1.7398, + "step": 22695 + }, + { + "epoch": 6.966236955187231, + "grad_norm": 0.21726597845554352, + "learning_rate": 2.2260246058800888e-05, + "loss": 1.7813, + "step": 22696 + }, + { + "epoch": 6.966543891958256, + "grad_norm": 0.1710241734981537, + "learning_rate": 2.225611075453578e-05, + "loss": 1.6647, + "step": 22697 + }, + { + "epoch": 6.966850828729282, + "grad_norm": 0.199532151222229, + "learning_rate": 2.2251975724452045e-05, + "loss": 1.7503, + "step": 22698 + }, + { + "epoch": 6.967157765500307, + "grad_norm": 0.18966728448867798, + "learning_rate": 2.224784096859055e-05, + "loss": 1.8113, + "step": 22699 + }, + { + "epoch": 6.967464702271332, + "grad_norm": 0.1977413445711136, + "learning_rate": 2.2243706486992162e-05, + "loss": 1.7036, + "step": 22700 + }, + { + "epoch": 6.967771639042358, + "grad_norm": 0.1794840395450592, + "learning_rate": 2.223957227969773e-05, + "loss": 1.714, + "step": 22701 + }, + { + "epoch": 6.968078575813382, + "grad_norm": 0.1811632663011551, + "learning_rate": 2.2235438346748117e-05, + "loss": 1.6845, + "step": 22702 + }, + { + "epoch": 6.968385512584407, + "grad_norm": 0.17478540539741516, 
+ "learning_rate": 2.2231304688184172e-05, + "loss": 1.7078, + "step": 22703 + }, + { + "epoch": 6.968692449355433, + "grad_norm": 0.22631226480007172, + "learning_rate": 2.2227171304046756e-05, + "loss": 1.7576, + "step": 22704 + }, + { + "epoch": 6.968999386126458, + "grad_norm": 0.20498304069042206, + "learning_rate": 2.2223038194376712e-05, + "loss": 1.7342, + "step": 22705 + }, + { + "epoch": 6.969306322897483, + "grad_norm": 0.18556833267211914, + "learning_rate": 2.221890535921488e-05, + "loss": 1.6583, + "step": 22706 + }, + { + "epoch": 6.969613259668508, + "grad_norm": 0.19878216087818146, + "learning_rate": 2.221477279860209e-05, + "loss": 1.7536, + "step": 22707 + }, + { + "epoch": 6.969920196439533, + "grad_norm": 0.20304621756076813, + "learning_rate": 2.221064051257924e-05, + "loss": 1.7263, + "step": 22708 + }, + { + "epoch": 6.9702271332105585, + "grad_norm": 0.18725872039794922, + "learning_rate": 2.220650850118709e-05, + "loss": 1.7174, + "step": 22709 + }, + { + "epoch": 6.970534069981584, + "grad_norm": 0.28994759917259216, + "learning_rate": 2.2202376764466554e-05, + "loss": 1.7401, + "step": 22710 + }, + { + "epoch": 6.970841006752609, + "grad_norm": 0.19320951402187347, + "learning_rate": 2.2198245302458383e-05, + "loss": 1.7204, + "step": 22711 + }, + { + "epoch": 6.9711479435236345, + "grad_norm": 0.24737104773521423, + "learning_rate": 2.2194114115203464e-05, + "loss": 1.7418, + "step": 22712 + }, + { + "epoch": 6.971454880294659, + "grad_norm": 0.18811406195163727, + "learning_rate": 2.218998320274261e-05, + "loss": 1.6999, + "step": 22713 + }, + { + "epoch": 6.971761817065684, + "grad_norm": 0.20729362964630127, + "learning_rate": 2.2185852565116638e-05, + "loss": 1.6833, + "step": 22714 + }, + { + "epoch": 6.97206875383671, + "grad_norm": 0.1862284392118454, + "learning_rate": 2.2181722202366378e-05, + "loss": 1.7232, + "step": 22715 + }, + { + "epoch": 6.972375690607735, + "grad_norm": 0.24128347635269165, + "learning_rate": 
2.217759211453264e-05, + "loss": 1.7081, + "step": 22716 + }, + { + "epoch": 6.97268262737876, + "grad_norm": 0.2007059007883072, + "learning_rate": 2.217346230165625e-05, + "loss": 1.7383, + "step": 22717 + }, + { + "epoch": 6.972989564149785, + "grad_norm": 0.2177598625421524, + "learning_rate": 2.216933276377801e-05, + "loss": 1.7494, + "step": 22718 + }, + { + "epoch": 6.97329650092081, + "grad_norm": 0.20965704321861267, + "learning_rate": 2.2165203500938735e-05, + "loss": 1.7326, + "step": 22719 + }, + { + "epoch": 6.973603437691835, + "grad_norm": 0.17255879938602448, + "learning_rate": 2.2161074513179237e-05, + "loss": 1.6713, + "step": 22720 + }, + { + "epoch": 6.973910374462861, + "grad_norm": 0.21480637788772583, + "learning_rate": 2.215694580054032e-05, + "loss": 1.7248, + "step": 22721 + }, + { + "epoch": 6.974217311233886, + "grad_norm": 0.15835267305374146, + "learning_rate": 2.215281736306278e-05, + "loss": 1.7086, + "step": 22722 + }, + { + "epoch": 6.974524248004911, + "grad_norm": 0.20524290204048157, + "learning_rate": 2.2148689200787415e-05, + "loss": 1.7472, + "step": 22723 + }, + { + "epoch": 6.974831184775936, + "grad_norm": 0.16152524948120117, + "learning_rate": 2.214456131375502e-05, + "loss": 1.6373, + "step": 22724 + }, + { + "epoch": 6.975138121546961, + "grad_norm": 0.1995699107646942, + "learning_rate": 2.2140433702006425e-05, + "loss": 1.6949, + "step": 22725 + }, + { + "epoch": 6.975445058317987, + "grad_norm": 0.19927829504013062, + "learning_rate": 2.213630636558236e-05, + "loss": 1.7875, + "step": 22726 + }, + { + "epoch": 6.975751995089012, + "grad_norm": 0.19159351289272308, + "learning_rate": 2.213217930452368e-05, + "loss": 1.7067, + "step": 22727 + }, + { + "epoch": 6.976058931860036, + "grad_norm": 0.21832366287708282, + "learning_rate": 2.2128052518871107e-05, + "loss": 1.6952, + "step": 22728 + }, + { + "epoch": 6.976365868631062, + "grad_norm": 0.2433125376701355, + "learning_rate": 2.212392600866547e-05, + "loss": 
1.7503, + "step": 22729 + }, + { + "epoch": 6.976672805402087, + "grad_norm": 0.25504401326179504, + "learning_rate": 2.2119799773947535e-05, + "loss": 1.7289, + "step": 22730 + }, + { + "epoch": 6.976979742173112, + "grad_norm": 0.20463863015174866, + "learning_rate": 2.211567381475808e-05, + "loss": 1.7442, + "step": 22731 + }, + { + "epoch": 6.977286678944138, + "grad_norm": 0.21862375736236572, + "learning_rate": 2.2111548131137883e-05, + "loss": 1.7266, + "step": 22732 + }, + { + "epoch": 6.977593615715163, + "grad_norm": 0.2124018520116806, + "learning_rate": 2.210742272312771e-05, + "loss": 1.7555, + "step": 22733 + }, + { + "epoch": 6.9779005524861875, + "grad_norm": 0.2911135256290436, + "learning_rate": 2.2103297590768334e-05, + "loss": 1.711, + "step": 22734 + }, + { + "epoch": 6.978207489257213, + "grad_norm": 0.2172393649816513, + "learning_rate": 2.2099172734100525e-05, + "loss": 1.7054, + "step": 22735 + }, + { + "epoch": 6.978514426028238, + "grad_norm": 0.28964513540267944, + "learning_rate": 2.2095048153165043e-05, + "loss": 1.7231, + "step": 22736 + }, + { + "epoch": 6.9788213627992635, + "grad_norm": 0.2557905316352844, + "learning_rate": 2.209092384800265e-05, + "loss": 1.7219, + "step": 22737 + }, + { + "epoch": 6.979128299570289, + "grad_norm": 0.23358628153800964, + "learning_rate": 2.2086799818654102e-05, + "loss": 1.7627, + "step": 22738 + }, + { + "epoch": 6.979435236341313, + "grad_norm": 0.18856312334537506, + "learning_rate": 2.2082676065160163e-05, + "loss": 1.6577, + "step": 22739 + }, + { + "epoch": 6.979742173112339, + "grad_norm": 0.18412479758262634, + "learning_rate": 2.207855258756158e-05, + "loss": 1.6661, + "step": 22740 + }, + { + "epoch": 6.980049109883364, + "grad_norm": 0.20592401921749115, + "learning_rate": 2.207442938589911e-05, + "loss": 1.6737, + "step": 22741 + }, + { + "epoch": 6.980356046654389, + "grad_norm": 0.2015630006790161, + "learning_rate": 2.2070306460213493e-05, + "loss": 1.73, + "step": 22742 + }, + { + 
"epoch": 6.980662983425415, + "grad_norm": 0.23446126282215118, + "learning_rate": 2.2066183810545454e-05, + "loss": 1.7391, + "step": 22743 + }, + { + "epoch": 6.98096992019644, + "grad_norm": 0.1810954511165619, + "learning_rate": 2.2062061436935803e-05, + "loss": 1.689, + "step": 22744 + }, + { + "epoch": 6.981276856967464, + "grad_norm": 0.25031471252441406, + "learning_rate": 2.20579393394252e-05, + "loss": 1.8161, + "step": 22745 + }, + { + "epoch": 6.98158379373849, + "grad_norm": 0.183212012052536, + "learning_rate": 2.2053817518054433e-05, + "loss": 1.6494, + "step": 22746 + }, + { + "epoch": 6.981890730509515, + "grad_norm": 0.2115766555070877, + "learning_rate": 2.204969597286422e-05, + "loss": 1.6912, + "step": 22747 + }, + { + "epoch": 6.98219766728054, + "grad_norm": 0.19966226816177368, + "learning_rate": 2.2045574703895296e-05, + "loss": 1.7002, + "step": 22748 + }, + { + "epoch": 6.982504604051566, + "grad_norm": 0.20601172745227814, + "learning_rate": 2.2041453711188385e-05, + "loss": 1.7839, + "step": 22749 + }, + { + "epoch": 6.98281154082259, + "grad_norm": 0.2174808531999588, + "learning_rate": 2.2037332994784222e-05, + "loss": 1.7169, + "step": 22750 + }, + { + "epoch": 6.9831184775936155, + "grad_norm": 0.1921808421611786, + "learning_rate": 2.2033212554723514e-05, + "loss": 1.6754, + "step": 22751 + }, + { + "epoch": 6.983425414364641, + "grad_norm": 0.1977350264787674, + "learning_rate": 2.2029092391046997e-05, + "loss": 1.7408, + "step": 22752 + }, + { + "epoch": 6.983732351135666, + "grad_norm": 0.18366695940494537, + "learning_rate": 2.2024972503795383e-05, + "loss": 1.6818, + "step": 22753 + }, + { + "epoch": 6.9840392879066915, + "grad_norm": 0.18127809464931488, + "learning_rate": 2.2020852893009387e-05, + "loss": 1.7392, + "step": 22754 + }, + { + "epoch": 6.984346224677717, + "grad_norm": 0.1973503679037094, + "learning_rate": 2.2016733558729718e-05, + "loss": 1.7416, + "step": 22755 + }, + { + "epoch": 6.984653161448741, + 
"grad_norm": 0.1971634328365326, + "learning_rate": 2.2012614500997096e-05, + "loss": 1.7545, + "step": 22756 + }, + { + "epoch": 6.984960098219767, + "grad_norm": 0.17244087159633636, + "learning_rate": 2.2008495719852218e-05, + "loss": 1.7348, + "step": 22757 + }, + { + "epoch": 6.985267034990792, + "grad_norm": 0.19024424254894257, + "learning_rate": 2.200437721533579e-05, + "loss": 1.6647, + "step": 22758 + }, + { + "epoch": 6.985573971761817, + "grad_norm": 0.18455122411251068, + "learning_rate": 2.200025898748852e-05, + "loss": 1.7528, + "step": 22759 + }, + { + "epoch": 6.985880908532843, + "grad_norm": 0.24437187612056732, + "learning_rate": 2.199614103635108e-05, + "loss": 1.7101, + "step": 22760 + }, + { + "epoch": 6.986187845303867, + "grad_norm": 0.18844331800937653, + "learning_rate": 2.1992023361964224e-05, + "loss": 1.6864, + "step": 22761 + }, + { + "epoch": 6.986494782074892, + "grad_norm": 0.18768003582954407, + "learning_rate": 2.1987905964368576e-05, + "loss": 1.6482, + "step": 22762 + }, + { + "epoch": 6.986801718845918, + "grad_norm": 0.19491778314113617, + "learning_rate": 2.1983788843604898e-05, + "loss": 1.7106, + "step": 22763 + }, + { + "epoch": 6.987108655616943, + "grad_norm": 0.23565757274627686, + "learning_rate": 2.1979671999713797e-05, + "loss": 1.7362, + "step": 22764 + }, + { + "epoch": 6.987415592387968, + "grad_norm": 0.2097240835428238, + "learning_rate": 2.1975555432736018e-05, + "loss": 1.7305, + "step": 22765 + }, + { + "epoch": 6.987722529158994, + "grad_norm": 0.2171555608510971, + "learning_rate": 2.197143914271223e-05, + "loss": 1.7213, + "step": 22766 + }, + { + "epoch": 6.988029465930018, + "grad_norm": 0.1993926763534546, + "learning_rate": 2.196732312968311e-05, + "loss": 1.6901, + "step": 22767 + }, + { + "epoch": 6.9883364027010435, + "grad_norm": 0.2345978319644928, + "learning_rate": 2.1963207393689346e-05, + "loss": 1.7456, + "step": 22768 + }, + { + "epoch": 6.988643339472069, + "grad_norm": 
0.20831161737442017, + "learning_rate": 2.1959091934771564e-05, + "loss": 1.764, + "step": 22769 + }, + { + "epoch": 6.988950276243094, + "grad_norm": 0.24944809079170227, + "learning_rate": 2.195497675297049e-05, + "loss": 1.7398, + "step": 22770 + }, + { + "epoch": 6.989257213014119, + "grad_norm": 0.25463199615478516, + "learning_rate": 2.1950861848326777e-05, + "loss": 1.7002, + "step": 22771 + }, + { + "epoch": 6.989564149785144, + "grad_norm": 0.2298898696899414, + "learning_rate": 2.194674722088108e-05, + "loss": 1.755, + "step": 22772 + }, + { + "epoch": 6.989871086556169, + "grad_norm": 0.21839721500873566, + "learning_rate": 2.194263287067408e-05, + "loss": 1.6667, + "step": 22773 + }, + { + "epoch": 6.990178023327195, + "grad_norm": 0.2197437435388565, + "learning_rate": 2.1938518797746417e-05, + "loss": 1.6774, + "step": 22774 + }, + { + "epoch": 6.99048496009822, + "grad_norm": 0.23588024079799652, + "learning_rate": 2.1934405002138763e-05, + "loss": 1.6916, + "step": 22775 + }, + { + "epoch": 6.990791896869245, + "grad_norm": 0.20632316172122955, + "learning_rate": 2.1930291483891767e-05, + "loss": 1.6682, + "step": 22776 + }, + { + "epoch": 6.99109883364027, + "grad_norm": 0.22786293923854828, + "learning_rate": 2.192617824304607e-05, + "loss": 1.7138, + "step": 22777 + }, + { + "epoch": 6.991405770411295, + "grad_norm": 0.3235599994659424, + "learning_rate": 2.1922065279642363e-05, + "loss": 1.7545, + "step": 22778 + }, + { + "epoch": 6.99171270718232, + "grad_norm": 0.1919393390417099, + "learning_rate": 2.191795259372123e-05, + "loss": 1.7422, + "step": 22779 + }, + { + "epoch": 6.992019643953346, + "grad_norm": 0.16472585499286652, + "learning_rate": 2.1913840185323385e-05, + "loss": 1.6824, + "step": 22780 + }, + { + "epoch": 6.992326580724371, + "grad_norm": 0.21422579884529114, + "learning_rate": 2.1909728054489397e-05, + "loss": 1.696, + "step": 22781 + }, + { + "epoch": 6.9926335174953955, + "grad_norm": 0.18965782225131989, + 
"learning_rate": 2.190561620125996e-05, + "loss": 1.7026, + "step": 22782 + }, + { + "epoch": 6.992940454266421, + "grad_norm": 0.184856116771698, + "learning_rate": 2.190150462567569e-05, + "loss": 1.7202, + "step": 22783 + }, + { + "epoch": 6.993247391037446, + "grad_norm": 0.18382076919078827, + "learning_rate": 2.1897393327777223e-05, + "loss": 1.7525, + "step": 22784 + }, + { + "epoch": 6.9935543278084715, + "grad_norm": 0.17239750921726227, + "learning_rate": 2.1893282307605202e-05, + "loss": 1.7297, + "step": 22785 + }, + { + "epoch": 6.993861264579497, + "grad_norm": 0.18522322177886963, + "learning_rate": 2.18891715652002e-05, + "loss": 1.6952, + "step": 22786 + }, + { + "epoch": 6.994168201350522, + "grad_norm": 0.1946135014295578, + "learning_rate": 2.18850611006029e-05, + "loss": 1.6879, + "step": 22787 + }, + { + "epoch": 6.994475138121547, + "grad_norm": 0.2028069645166397, + "learning_rate": 2.188095091385391e-05, + "loss": 1.7412, + "step": 22788 + }, + { + "epoch": 6.994782074892572, + "grad_norm": 0.18794523179531097, + "learning_rate": 2.1876841004993838e-05, + "loss": 1.6936, + "step": 22789 + }, + { + "epoch": 6.995089011663597, + "grad_norm": 0.1912194788455963, + "learning_rate": 2.187273137406331e-05, + "loss": 1.7051, + "step": 22790 + }, + { + "epoch": 6.995395948434623, + "grad_norm": 0.1528688222169876, + "learning_rate": 2.1868622021102934e-05, + "loss": 1.6816, + "step": 22791 + }, + { + "epoch": 6.995702885205648, + "grad_norm": 0.2108357548713684, + "learning_rate": 2.1864512946153325e-05, + "loss": 1.7018, + "step": 22792 + }, + { + "epoch": 6.996009821976672, + "grad_norm": 0.16667310893535614, + "learning_rate": 2.1860404149255092e-05, + "loss": 1.7235, + "step": 22793 + }, + { + "epoch": 6.996316758747698, + "grad_norm": 0.16995872557163239, + "learning_rate": 2.185629563044882e-05, + "loss": 1.7086, + "step": 22794 + }, + { + "epoch": 6.996623695518723, + "grad_norm": 0.1962304711341858, + "learning_rate": 
2.1852187389775165e-05, + "loss": 1.7523, + "step": 22795 + }, + { + "epoch": 6.996930632289748, + "grad_norm": 0.17774102091789246, + "learning_rate": 2.1848079427274655e-05, + "loss": 1.6649, + "step": 22796 + }, + { + "epoch": 6.997237569060774, + "grad_norm": 0.18844567239284515, + "learning_rate": 2.184397174298796e-05, + "loss": 1.7281, + "step": 22797 + }, + { + "epoch": 6.997544505831799, + "grad_norm": 0.15324150025844574, + "learning_rate": 2.1839864336955607e-05, + "loss": 1.6496, + "step": 22798 + }, + { + "epoch": 6.9978514426028235, + "grad_norm": 0.25148099660873413, + "learning_rate": 2.1835757209218233e-05, + "loss": 1.7889, + "step": 22799 + }, + { + "epoch": 6.998158379373849, + "grad_norm": 0.22258763015270233, + "learning_rate": 2.1831650359816414e-05, + "loss": 1.7303, + "step": 22800 + }, + { + "epoch": 6.998465316144874, + "grad_norm": 0.21465472877025604, + "learning_rate": 2.182754378879074e-05, + "loss": 1.733, + "step": 22801 + }, + { + "epoch": 6.9987722529158995, + "grad_norm": 0.1894017904996872, + "learning_rate": 2.182343749618181e-05, + "loss": 1.7104, + "step": 22802 + }, + { + "epoch": 6.999079189686924, + "grad_norm": 0.19616369903087616, + "learning_rate": 2.181933148203014e-05, + "loss": 1.7015, + "step": 22803 + }, + { + "epoch": 6.999386126457949, + "grad_norm": 0.1720295250415802, + "learning_rate": 2.181522574637638e-05, + "loss": 1.6609, + "step": 22804 + }, + { + "epoch": 6.999693063228975, + "grad_norm": 0.2508579194545746, + "learning_rate": 2.1811120289261077e-05, + "loss": 1.7485, + "step": 22805 + }, + { + "epoch": 7.0, + "grad_norm": 0.1701229363679886, + "learning_rate": 2.1807015110724805e-05, + "loss": 1.6822, + "step": 22806 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": 
true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.492793947216046e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-22806/training_args.bin b/checkpoint-22806/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db7ad91da5423a229826113feb3e9db3ef40c31 --- /dev/null +++ b/checkpoint-22806/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682b697e933b6e2693e5f9af9a0654effab1ca392c8500bf8af0eb089116a263 +size 7288 diff --git a/checkpoint-22806/zero_to_fp32.py b/checkpoint-22806/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-22806/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-26064/config.json b/checkpoint-26064/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-26064/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-26064/generation_config.json b/checkpoint-26064/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-26064/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-26064/latest b/checkpoint-26064/latest new file mode 100644 index 0000000000000000000000000000000000000000..2f546d103e44ea0372c167c049820e71d6dddb8f --- /dev/null +++ b/checkpoint-26064/latest @@ -0,0 +1 @@ +global_step26064 \ No newline at end of file diff --git a/checkpoint-26064/model-00001-of-00003.safetensors b/checkpoint-26064/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..775b4ea204b9f17629d28518a4df310710090e79 --- /dev/null +++ b/checkpoint-26064/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc13277c222c87861f2ea8b14becf43c5849384ed1f3ccd72f49df7e50ef1f76 +size 4955415870 diff --git a/checkpoint-26064/model-00002-of-00003.safetensors b/checkpoint-26064/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-26064/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d 
+size 4971563008 diff --git a/checkpoint-26064/model-00003-of-00003.safetensors b/checkpoint-26064/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce3b6a049037aa76eadad3f7d156b72f95fefb57 --- /dev/null +++ b/checkpoint-26064/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00e488974289bf150d71dca673e46224f57ac55c3f3b5fb8aca9eccd7cf457cc +size 4180840856 diff --git a/checkpoint-26064/model.safetensors.index.json b/checkpoint-26064/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-26064/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-26064/rng_state_0.pth b/checkpoint-26064/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..2599732a63dac223778e47a53537fcea2e64af01 --- /dev/null +++ b/checkpoint-26064/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06d7c427fab83fb999a0dffdf46eea379a2975f690de0a38251c154928430be +size 15984 diff --git a/checkpoint-26064/rng_state_1.pth b/checkpoint-26064/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1d97f2b291dbbf0cbc0e17ebfe1db9a22820e4ad --- /dev/null +++ b/checkpoint-26064/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49929b6e3fa0340ac3c1814ed10878645c8e6d5b26a32d96179cfe109238d8d3 +size 15984 diff --git a/checkpoint-26064/rng_state_10.pth b/checkpoint-26064/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..5db3c6a4541dce0a4d258167435cc3cc218bd10e --- /dev/null +++ b/checkpoint-26064/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7c1c997ab73d86c655e072d3b909e8ecfa5060e41335e2cd2c98735f6ca633b9 +size 15997 diff --git a/checkpoint-26064/rng_state_11.pth b/checkpoint-26064/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..a22e1f5820104964cdc1d5078cd45bebefaf1a60 --- /dev/null +++ b/checkpoint-26064/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cbf234b28612b7eb055d25a3613630c92b0ce3fc1c505adcd019ef161c1f9cc +size 15997 diff --git a/checkpoint-26064/rng_state_12.pth b/checkpoint-26064/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2711addfcaea263d70fd5657216abcb8f008a03 --- /dev/null +++ b/checkpoint-26064/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd13f213811abaa51bc2ca94ee21d2553dc1232fb45c406f1e99307916ad000 +size 15997 diff --git a/checkpoint-26064/rng_state_13.pth b/checkpoint-26064/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..734a8a8aaeb9658d1d2379c0aecbf1db8e1af28c --- /dev/null +++ b/checkpoint-26064/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a4441b2c345e6cf0451d6d49cadad7f7d6973aa6a35ea983f146e150f135a1 +size 15997 diff --git a/checkpoint-26064/rng_state_14.pth b/checkpoint-26064/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..f359ad60d897d2c8500a1c8f2eb037ac33ad2297 --- /dev/null +++ b/checkpoint-26064/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e3e8010e2d57f05ae2bce9f1fca2d620b7c6f0178b06b40a5e62a5ba0b6b50 +size 15997 diff --git a/checkpoint-26064/rng_state_15.pth b/checkpoint-26064/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..306e5cf71cbd8732b871a2a75a3c48f8692e85f2 --- /dev/null +++ b/checkpoint-26064/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:52979440e07899d48d4a2bfea8eab71b844a02549c52c57d36c8648969e84b34 +size 15997 diff --git a/checkpoint-26064/rng_state_16.pth b/checkpoint-26064/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..574e25963b545deb2fc7ceedc7088ec68df68925 --- /dev/null +++ b/checkpoint-26064/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e34371fddbad6ae1972402bdec880e4ba2cdf74cd3b7da790aa03c26f5b07ba4 +size 15997 diff --git a/checkpoint-26064/rng_state_17.pth b/checkpoint-26064/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9c25b70d72976dede64824b755effebd54451d5 --- /dev/null +++ b/checkpoint-26064/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b453bc691495813c39eb05ac78bd9db9775f263b2003242692e69138c762967 +size 15997 diff --git a/checkpoint-26064/rng_state_18.pth b/checkpoint-26064/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2660ef5095d3d03dd72db528585205cbdc99511 --- /dev/null +++ b/checkpoint-26064/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:250216d67ecbde7266d416197b371baa647c24b4848ab122c6dd0c7b21d30b3f +size 15997 diff --git a/checkpoint-26064/rng_state_19.pth b/checkpoint-26064/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..6715885eb29f835889d4b50f9eb00756dbf1f84d --- /dev/null +++ b/checkpoint-26064/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28c00fffc4582cd143aa946c2c353515fbf42670b04f08d9dcc82c67aafe4445 +size 15997 diff --git a/checkpoint-26064/rng_state_2.pth b/checkpoint-26064/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..93cbac2943b998fe55d2759f6ea7c8848df794b4 --- /dev/null +++ b/checkpoint-26064/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1b8fbafd68698fe34f9ab51e431d16f159428f39ca7e126aa46b9af979ec58f2 +size 15984 diff --git a/checkpoint-26064/rng_state_20.pth b/checkpoint-26064/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..b2a082c0ec42a494ba76c72ae638d29bc77b81ae --- /dev/null +++ b/checkpoint-26064/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b0f872151cd6f6546463055822e66e03bad720167f71700f86b9058687f644 +size 15997 diff --git a/checkpoint-26064/rng_state_21.pth b/checkpoint-26064/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b0f83483e2a8adaca47a646cbd839208c37e171 --- /dev/null +++ b/checkpoint-26064/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:817dc4ac12f65ebe5a3e99daf65fb24e0acd7d53a0eb361edda5c3e9b5a3944c +size 15997 diff --git a/checkpoint-26064/rng_state_22.pth b/checkpoint-26064/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..54e2bc990fa8b895fd0b19c936de0f0f8e5afda2 --- /dev/null +++ b/checkpoint-26064/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7f7fd8bc692ad83a8714717dc5c10daade5bf23334dfe127693dad24266ba18 +size 15997 diff --git a/checkpoint-26064/rng_state_23.pth b/checkpoint-26064/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..7944897799008455dcba30abd3b599e2331a8a80 --- /dev/null +++ b/checkpoint-26064/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54484be30305ec471b5ec5c74fc8137aff7d85ae4d6d642bced5e4d897b45187 +size 15997 diff --git a/checkpoint-26064/rng_state_24.pth b/checkpoint-26064/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..b21acfd0eca5c6cbe7c48d5f6e6eaa3ee9ca785b --- /dev/null +++ b/checkpoint-26064/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6f2fc53b093b4dc3f8626f644d4a4d241ffb40023c3d6d991d40d1711bea1d81 +size 15997 diff --git a/checkpoint-26064/rng_state_25.pth b/checkpoint-26064/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..106b4ff49b9eb1251a266a593f4f57cfde89b5c6 --- /dev/null +++ b/checkpoint-26064/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:118d0677ed653d41704eadd456ebbae2d9e26bc926b373f0409c29a896a7810f +size 15997 diff --git a/checkpoint-26064/rng_state_26.pth b/checkpoint-26064/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..4fdbabc30b074f1b4bfae7e67ffad38d9bf6a50c --- /dev/null +++ b/checkpoint-26064/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c322cd8f6ee3dfc46ef86c3751d482985a0695cac21d988caaeb81cba86e9f +size 15997 diff --git a/checkpoint-26064/rng_state_27.pth b/checkpoint-26064/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fff6c6932e504764a24846898e5f87f07453cd6 --- /dev/null +++ b/checkpoint-26064/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c81ca7e9f5dd23ec7b9e98c00a75ea026f212e66ddf582f6f44b53997e00f4b5 +size 15997 diff --git a/checkpoint-26064/rng_state_28.pth b/checkpoint-26064/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..5df79c3b6e47f6d54f7ee6509e0380ac85ea5654 --- /dev/null +++ b/checkpoint-26064/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80c0bb771879d0412aacec31fe74f0eae5d58aa8ee9d4d6652a603f0e2479652 +size 15997 diff --git a/checkpoint-26064/rng_state_29.pth b/checkpoint-26064/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..dad1d4921f82db4dfe12f20811bd019b83a0d14e --- /dev/null +++ b/checkpoint-26064/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:57dc3d52bd56c4768f03f2b95267af5cdaa4ed5eacdde842383de7381d7cff2a +size 15997 diff --git a/checkpoint-26064/rng_state_3.pth b/checkpoint-26064/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7752384dda78a24360f822e337a872391515a1f0 --- /dev/null +++ b/checkpoint-26064/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb7f31633a3e71124553686d408903708b4222da629a79f6d976dae68c193ef7 +size 15984 diff --git a/checkpoint-26064/rng_state_30.pth b/checkpoint-26064/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f8b5d9564206b89536e8520acd83b7b51393126 --- /dev/null +++ b/checkpoint-26064/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76b6e0973150fbeb56b61db29062a7fd89af6fe94da5604782774760b8f9c784 +size 15997 diff --git a/checkpoint-26064/rng_state_31.pth b/checkpoint-26064/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca3c5009cef9e4bbc049b532200067363a5add4b --- /dev/null +++ b/checkpoint-26064/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75ee9dab85039bc8300c467909a9f40981b58b6420b4e91cfbee53e021a4310f +size 15997 diff --git a/checkpoint-26064/rng_state_32.pth b/checkpoint-26064/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2e86fe4d469acf708d9f75c2cac9e730d2d421a --- /dev/null +++ b/checkpoint-26064/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:230fd58271ebf3344b3d76bd8cb47ce33bb035b009bcf4e695b2af262df8e79b +size 15997 diff --git a/checkpoint-26064/rng_state_33.pth b/checkpoint-26064/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a3e4ec9652e4aace2dd9497cad7d2758fd59320 --- /dev/null +++ b/checkpoint-26064/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f71b25b1cac357b2a86e8b41d009180041a85f17a5e276bb319c911d98548048 +size 15997 diff --git a/checkpoint-26064/rng_state_34.pth b/checkpoint-26064/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ec80c8854d8d98aba38c5376904f1a83cd50a6f --- /dev/null +++ b/checkpoint-26064/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:999e2be19a8ec273eb84f0036571605301216d3a57de99c968e202c502b19e08 +size 15997 diff --git a/checkpoint-26064/rng_state_35.pth b/checkpoint-26064/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..e2402966722503a3685f6cee57d7bcb4edc104ad --- /dev/null +++ b/checkpoint-26064/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a5cc34e53b6d3a63b05bd9bf8fbcd5a6dc0e69a95e9fc41511dfb4e17b3cb5 +size 15997 diff --git a/checkpoint-26064/rng_state_36.pth b/checkpoint-26064/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a78ab292c9b37d2dc36d0af4a33f61dea8ac023 --- /dev/null +++ b/checkpoint-26064/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9c3b4b5a0c71fddeda828820974e8d245440ea28daae85f0cecaa1815ada28d +size 15997 diff --git a/checkpoint-26064/rng_state_37.pth b/checkpoint-26064/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..a90c5d837bf92e6f895913f13888a50b9c60e535 --- /dev/null +++ b/checkpoint-26064/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0055d2f2000f8e6cf3b11d59626bb34f024fe8196e661309a0e4c501d0faf64d +size 15997 diff --git a/checkpoint-26064/rng_state_38.pth b/checkpoint-26064/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..f30746191f519840d935150870d3b92d7b0ad09b --- /dev/null +++ b/checkpoint-26064/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c60f51f1ef9fecc6b7c6462a30531ac026f7fa0c02f54b3257d163d8a906d578 +size 15997 diff --git a/checkpoint-26064/rng_state_39.pth b/checkpoint-26064/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..0dbad64d63f2a3a7e2b2e585068f58f4484012e6 --- /dev/null +++ b/checkpoint-26064/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f71d2ca814722d701599072991db76f62d468e1e81ac462b2f8309c1a2b5cca +size 15997 diff --git a/checkpoint-26064/rng_state_4.pth b/checkpoint-26064/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7988600d81b67bd5990670b265c814f48a32a49b --- /dev/null +++ b/checkpoint-26064/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48598ff4a3bc55abce2b7c275630c96279ba113a8871e3f51c25423ef097cc3e +size 15984 diff --git a/checkpoint-26064/rng_state_40.pth b/checkpoint-26064/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd9ee5a3b06b74936eb9fada25f1789ee4331ca0 --- /dev/null +++ b/checkpoint-26064/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7095d91c0ebf666207414954c6b32614e5e69a8a662086a728c8b25f728a008d +size 15997 diff --git a/checkpoint-26064/rng_state_41.pth b/checkpoint-26064/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..c5210a02738b1358773b1a20db2d42b51ca38328 --- /dev/null +++ b/checkpoint-26064/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d62369a0352a97d407f79d77700f2244b95c55f6e5310319f997002f51609b86 +size 15997 diff --git a/checkpoint-26064/rng_state_42.pth b/checkpoint-26064/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa2137fb4628331d6828423050f0269ba32bbdf9 --- /dev/null +++ b/checkpoint-26064/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:220491f0d53145f851713398d17052c9907cb08e477b085a62730d9dfc3d1c80 +size 15997 diff --git a/checkpoint-26064/rng_state_43.pth b/checkpoint-26064/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..b2cac60a7ed3a024d2a568cc06951aa48c282cf9 --- /dev/null +++ b/checkpoint-26064/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04decd0f83b0f7b86d1de588144eb25b935a26a656320c84e4814ca7213b6490 +size 15997 diff --git a/checkpoint-26064/rng_state_44.pth b/checkpoint-26064/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..71f1e6f1240937135c8b98d102bc7bf26e60f14a --- /dev/null +++ b/checkpoint-26064/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8fb48f6325db3393970d3e9c9242d6687e7e2c9ca1fb8ed4b16cd1592cba324 +size 15997 diff --git a/checkpoint-26064/rng_state_45.pth b/checkpoint-26064/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..ad81b7b9013284920e7c5d0a1d55764cdf4e6589 --- /dev/null +++ b/checkpoint-26064/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:708b91121e966d17eabdb686ed4433e78387985eac5d9266ff921ae7781c5b55 +size 15997 diff --git a/checkpoint-26064/rng_state_46.pth b/checkpoint-26064/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5405aaed038fab58ac448b39f5c4951a73fbe79 --- /dev/null +++ b/checkpoint-26064/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:566475055462c29d2d66fbe7cb369ea525be8572f1073f9fc570781bcde74d33 +size 15997 diff --git a/checkpoint-26064/rng_state_47.pth b/checkpoint-26064/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..78506b36fcac83c1310baf93bd5103c280410788 --- /dev/null +++ b/checkpoint-26064/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:24018a4bff3300764402b70c07b7a2a952e6d2f2a468007af8cfcbd40bc72563 +size 15997 diff --git a/checkpoint-26064/rng_state_48.pth b/checkpoint-26064/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..47f63f7e4110f215309dd1f348fcb402e39bd3f4 --- /dev/null +++ b/checkpoint-26064/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d12c5b2b786d6244e4b3ff2d46f4d8d6fe6ec49f50d774f03f7934724545bf8b +size 15997 diff --git a/checkpoint-26064/rng_state_49.pth b/checkpoint-26064/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ae2ec39733b09a1ab94c0bbe449eaf367ab03a1 --- /dev/null +++ b/checkpoint-26064/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:904698f9452a8944a7a8454e390d158fe0f206c4dd242d53b7acd3d5cb3b46c9 +size 15997 diff --git a/checkpoint-26064/rng_state_5.pth b/checkpoint-26064/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1d9d48efeb5ceeb12c18f13b0846b747c055e702 --- /dev/null +++ b/checkpoint-26064/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67fd46dfda9a8c8c34597517daf6a36c21010b6bff148a288a46407324ae8fb2 +size 15984 diff --git a/checkpoint-26064/rng_state_50.pth b/checkpoint-26064/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..904cbd7764fd9b2cbf52677c7335899e05693aff --- /dev/null +++ b/checkpoint-26064/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e018b095777bcb27f04fbe036266456eb7fc496fda884574c49508bf6d5025 +size 15997 diff --git a/checkpoint-26064/rng_state_51.pth b/checkpoint-26064/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..545aeecedbc18d89a1eb140b6a302466fa37a2ba --- /dev/null +++ b/checkpoint-26064/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9aeccddb3bcceaa48121ac32978e7a083fdc29ae4f30ad8477aa415391054792 +size 15997 diff --git a/checkpoint-26064/rng_state_52.pth b/checkpoint-26064/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..8982338faae904a072313e0f105e1663f632d3c8 --- /dev/null +++ b/checkpoint-26064/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:324caf68eef5ceb6651d11a05a182013534994f1b69fe5a38bb997ea2f085b8a +size 15997 diff --git a/checkpoint-26064/rng_state_53.pth b/checkpoint-26064/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..bde0625bd554b06e96825a1d8d293dcec1bb4713 --- /dev/null +++ b/checkpoint-26064/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35e5a002402cff4d2dc323bccd69927b7e36d66b2601273caa3b8244b539c17b +size 15997 diff --git a/checkpoint-26064/rng_state_54.pth b/checkpoint-26064/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..766c1db13af5f39e20781a5941892ba4eaafc57f --- /dev/null +++ b/checkpoint-26064/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dc92907982f5a27c0dc61faee5ff6cf2e4367351d2832db7c934c79724347f3 +size 15997 diff --git a/checkpoint-26064/rng_state_55.pth b/checkpoint-26064/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..6206142571ac0a1e526e50bc50ae3d8f6354606d --- /dev/null +++ b/checkpoint-26064/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f320e4dc4077184e6b2c45c81e8e46d27e380d4275750d1123a372e165ece71b +size 15997 diff --git a/checkpoint-26064/rng_state_56.pth b/checkpoint-26064/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..183d861ed029ee1df57162812c4767b4617ace71 --- /dev/null +++ b/checkpoint-26064/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4139efc51206dbe8b436f01d55475259d712320257645ae91f882a64a41aef4e +size 15997 diff --git a/checkpoint-26064/rng_state_57.pth b/checkpoint-26064/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..651b0b46b17c0f681f4fc63c326ac5c13cb3df72 --- /dev/null +++ b/checkpoint-26064/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:422330fddb5b3f44b27e287676e7d15d942ed57dbfc01d07335a9fae6212a744 +size 15997 diff --git a/checkpoint-26064/rng_state_58.pth b/checkpoint-26064/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..68cdfdd3ffc2302459d99b62acb6d68110a70d01 --- /dev/null +++ b/checkpoint-26064/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5dfd7d8b3a77ee26cd27b7fdbfab8913ab1c58cb0118fd2c7d28bdab297a972 +size 15997 diff --git a/checkpoint-26064/rng_state_59.pth b/checkpoint-26064/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..3885e8d9ad1345a79267e543da1cf832a2958964 --- /dev/null +++ b/checkpoint-26064/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b88cb3bed4433f37cbce8e73aeb1ea2a989553fab84bb0223474dd5a3c0076c5 +size 15997 diff --git a/checkpoint-26064/rng_state_6.pth b/checkpoint-26064/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2733601fe2c09475c2a7dae043b090f0504eb28 --- /dev/null +++ b/checkpoint-26064/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d6d09540e53861846f0493062333e43e4e530a592aa21ea6267b74aa0086b4b +size 15984 diff --git a/checkpoint-26064/rng_state_60.pth b/checkpoint-26064/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..3519aef42184520a473ada414a20e1ad6db4e21f --- /dev/null +++ b/checkpoint-26064/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:abae5b6b8c89ad8f58f2a56d61c4517ff1e54b796bbac22817d91f7780f8fcee +size 15997 diff --git a/checkpoint-26064/rng_state_61.pth b/checkpoint-26064/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3efe1a54322828c52e8f67cd8a63c5fbfe1f4b6 --- /dev/null +++ b/checkpoint-26064/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:324e9517e3aa6d22cadf84f215bdf7a9266366723666671bb6583ecb1b866ee1 +size 15997 diff --git a/checkpoint-26064/rng_state_62.pth b/checkpoint-26064/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ad488c5ced7f05f764acc413f067fdab0bacb70 --- /dev/null +++ b/checkpoint-26064/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b97eb00487fe1cebcd15c5b9e5d206f1aa98bbd34bdb4d800cdb060c718ae55 +size 15997 diff --git a/checkpoint-26064/rng_state_63.pth b/checkpoint-26064/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..777b444c88a046356f68aacbda39ce07dc5c4c3a --- /dev/null +++ b/checkpoint-26064/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27d78b06ea00181f90ead04af4ecaa59de072b39d5fc4f814d248c0984807472 +size 15997 diff --git a/checkpoint-26064/rng_state_7.pth b/checkpoint-26064/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e5674058ea37c0ee1690fdd5535d08f436427a6 --- /dev/null +++ b/checkpoint-26064/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c1a730d4711dcd6362c7cd8065a3bfbd8beb0991e1cadf670df3834419fca1 +size 15984 diff --git a/checkpoint-26064/rng_state_8.pth b/checkpoint-26064/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..5102fa8cd38186fb6daee3288bc8bef66d1149b1 --- /dev/null +++ b/checkpoint-26064/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:288d1ba49cef8e1e58fb160208397e379fb3eadde7bac027ddc691a42b0654a2 +size 15984 diff --git a/checkpoint-26064/rng_state_9.pth b/checkpoint-26064/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..446c4c739dae602e620f012ca811a37b89b88dfe --- /dev/null +++ b/checkpoint-26064/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c49fbfb872e5b4d272d7014341417014c947e2f0fd80fefefd219425a5bb6279 +size 15984 diff --git a/checkpoint-26064/scheduler.pt b/checkpoint-26064/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1ea86fad3cea00073713e8716ff90e7a7ab8983 --- /dev/null +++ b/checkpoint-26064/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fb8ea900a278a6270cebf4e576e8bb950ef31fc12d5f8b19060075ce3150e4a +size 1064 diff --git a/checkpoint-26064/special_tokens_map.json b/checkpoint-26064/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-26064/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-26064/tokenizer.json 
b/checkpoint-26064/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-26064/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-26064/tokenizer_config.json b/checkpoint-26064/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-26064/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": 
"<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": 
"<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": 
"<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128257": { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-26064/trainer_state.json b/checkpoint-26064/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b130e974fea06b77bce715d2651e6b0f40a67099 --- /dev/null +++ b/checkpoint-26064/trainer_state.json @@ -0,0 +1,182482 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": 
null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 26064, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 
0.003990178023327195, + "grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 
1.2865383625030518, + "learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 
3.885480572597138e-06, + "loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + 
}, + { + "epoch": 0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 
0.4801484942436218, + "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 
7.975460122699386e-06, + "loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 
92 + }, + { + "epoch": 0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + 
"grad_norm": 0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 
0.5069209933280945, + "learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 
1.3394683026584867e-05, + "loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + 
"step": 145 + }, + { + "epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 
0.04880294659300184, + "grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 
1.3880311250686646, + "learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + }, + { + "epoch": 3.0003069367710253, + "grad_norm": 0.43365660309791565, + "learning_rate": 8.20720621201063e-05, + "loss": 1.8074, + "step": 9775 + }, + { + "epoch": 3.0006138735420502, + "grad_norm": 0.461374968290329, + "learning_rate": 8.206824868646064e-05, + "loss": 1.9089, + "step": 9776 + }, + { + "epoch": 3.0009208103130756, + "grad_norm": 0.3747929632663727, + "learning_rate": 8.206443493589776e-05, + "loss": 1.8358, + "step": 9777 + }, + { + "epoch": 3.001227747084101, + "grad_norm": 0.28436774015426636, + "learning_rate": 8.206062086845532e-05, + "loss": 1.8527, + "step": 9778 + }, + { + "epoch": 3.001534683855126, + "grad_norm": 0.33642131090164185, + "learning_rate": 
8.205680648417106e-05, + "loss": 1.8142, + "step": 9779 + }, + { + "epoch": 3.001841620626151, + "grad_norm": 0.4283481240272522, + "learning_rate": 8.205299178308263e-05, + "loss": 1.9006, + "step": 9780 + }, + { + "epoch": 3.002148557397176, + "grad_norm": 0.34405630826950073, + "learning_rate": 8.204917676522777e-05, + "loss": 1.7988, + "step": 9781 + }, + { + "epoch": 3.0024554941682013, + "grad_norm": 0.3161070942878723, + "learning_rate": 8.204536143064414e-05, + "loss": 1.8271, + "step": 9782 + }, + { + "epoch": 3.0027624309392267, + "grad_norm": 0.42518749833106995, + "learning_rate": 8.204154577936946e-05, + "loss": 1.864, + "step": 9783 + }, + { + "epoch": 3.0030693677102516, + "grad_norm": 0.3760852813720703, + "learning_rate": 8.203772981144146e-05, + "loss": 1.8543, + "step": 9784 + }, + { + "epoch": 3.003376304481277, + "grad_norm": 0.32794755697250366, + "learning_rate": 8.203391352689784e-05, + "loss": 1.8776, + "step": 9785 + }, + { + "epoch": 3.0036832412523022, + "grad_norm": 0.3053889274597168, + "learning_rate": 8.20300969257763e-05, + "loss": 1.8064, + "step": 9786 + }, + { + "epoch": 3.003990178023327, + "grad_norm": 0.40283143520355225, + "learning_rate": 8.202628000811456e-05, + "loss": 1.8083, + "step": 9787 + }, + { + "epoch": 3.0042971147943525, + "grad_norm": 0.49270665645599365, + "learning_rate": 8.202246277395038e-05, + "loss": 1.802, + "step": 9788 + }, + { + "epoch": 3.0046040515653774, + "grad_norm": 0.4373023211956024, + "learning_rate": 8.201864522332143e-05, + "loss": 1.8429, + "step": 9789 + }, + { + "epoch": 3.0049109883364027, + "grad_norm": 0.3136310875415802, + "learning_rate": 8.201482735626547e-05, + "loss": 1.8224, + "step": 9790 + }, + { + "epoch": 3.005217925107428, + "grad_norm": 0.3306807279586792, + "learning_rate": 8.201100917282023e-05, + "loss": 1.8463, + "step": 9791 + }, + { + "epoch": 3.005524861878453, + "grad_norm": 0.45082196593284607, + "learning_rate": 8.200719067302342e-05, + "loss": 1.7587, + "step": 
9792 + }, + { + "epoch": 3.0058317986494782, + "grad_norm": 0.49246448278427124, + "learning_rate": 8.20033718569128e-05, + "loss": 1.8245, + "step": 9793 + }, + { + "epoch": 3.0061387354205036, + "grad_norm": 0.3040246367454529, + "learning_rate": 8.199955272452609e-05, + "loss": 1.8309, + "step": 9794 + }, + { + "epoch": 3.0064456721915285, + "grad_norm": 0.3909318149089813, + "learning_rate": 8.199573327590105e-05, + "loss": 1.8187, + "step": 9795 + }, + { + "epoch": 3.006752608962554, + "grad_norm": 0.5753183960914612, + "learning_rate": 8.199191351107543e-05, + "loss": 1.826, + "step": 9796 + }, + { + "epoch": 3.0070595457335787, + "grad_norm": 0.48908689618110657, + "learning_rate": 8.198809343008695e-05, + "loss": 1.8475, + "step": 9797 + }, + { + "epoch": 3.007366482504604, + "grad_norm": 0.31570208072662354, + "learning_rate": 8.198427303297341e-05, + "loss": 1.8046, + "step": 9798 + }, + { + "epoch": 3.0076734192756294, + "grad_norm": 0.39205440878868103, + "learning_rate": 8.198045231977251e-05, + "loss": 1.8413, + "step": 9799 + }, + { + "epoch": 3.0079803560466543, + "grad_norm": 0.5117597579956055, + "learning_rate": 8.197663129052204e-05, + "loss": 1.8184, + "step": 9800 + }, + { + "epoch": 3.0082872928176796, + "grad_norm": 0.3623514175415039, + "learning_rate": 8.197280994525978e-05, + "loss": 1.8292, + "step": 9801 + }, + { + "epoch": 3.008594229588705, + "grad_norm": 0.2826726734638214, + "learning_rate": 8.196898828402344e-05, + "loss": 1.8216, + "step": 9802 + }, + { + "epoch": 3.00890116635973, + "grad_norm": 0.38658398389816284, + "learning_rate": 8.196516630685085e-05, + "loss": 1.867, + "step": 9803 + }, + { + "epoch": 3.009208103130755, + "grad_norm": 0.3371698260307312, + "learning_rate": 8.196134401377973e-05, + "loss": 1.8077, + "step": 9804 + }, + { + "epoch": 3.00951503990178, + "grad_norm": 0.24108785390853882, + "learning_rate": 8.195752140484789e-05, + "loss": 1.7858, + "step": 9805 + }, + { + "epoch": 3.0098219766728054, + 
"grad_norm": 0.34410104155540466, + "learning_rate": 8.195369848009309e-05, + "loss": 1.801, + "step": 9806 + }, + { + "epoch": 3.0101289134438307, + "grad_norm": 0.3412116467952728, + "learning_rate": 8.194987523955311e-05, + "loss": 1.7905, + "step": 9807 + }, + { + "epoch": 3.0104358502148556, + "grad_norm": 0.2473030537366867, + "learning_rate": 8.194605168326573e-05, + "loss": 1.7765, + "step": 9808 + }, + { + "epoch": 3.010742786985881, + "grad_norm": 0.28590065240859985, + "learning_rate": 8.194222781126875e-05, + "loss": 1.7897, + "step": 9809 + }, + { + "epoch": 3.0110497237569063, + "grad_norm": 0.2994272708892822, + "learning_rate": 8.193840362359994e-05, + "loss": 1.7976, + "step": 9810 + }, + { + "epoch": 3.011356660527931, + "grad_norm": 0.2971307635307312, + "learning_rate": 8.193457912029713e-05, + "loss": 1.829, + "step": 9811 + }, + { + "epoch": 3.0116635972989565, + "grad_norm": 0.25149810314178467, + "learning_rate": 8.193075430139809e-05, + "loss": 1.7709, + "step": 9812 + }, + { + "epoch": 3.0119705340699814, + "grad_norm": 0.2561332583427429, + "learning_rate": 8.19269291669406e-05, + "loss": 1.7689, + "step": 9813 + }, + { + "epoch": 3.0122774708410067, + "grad_norm": 0.2658882141113281, + "learning_rate": 8.192310371696249e-05, + "loss": 1.8497, + "step": 9814 + }, + { + "epoch": 3.012584407612032, + "grad_norm": 0.2873780429363251, + "learning_rate": 8.191927795150156e-05, + "loss": 1.8217, + "step": 9815 + }, + { + "epoch": 3.012891344383057, + "grad_norm": 0.2181183248758316, + "learning_rate": 8.191545187059562e-05, + "loss": 1.7261, + "step": 9816 + }, + { + "epoch": 3.0131982811540823, + "grad_norm": 0.2414858490228653, + "learning_rate": 8.191162547428248e-05, + "loss": 1.8035, + "step": 9817 + }, + { + "epoch": 3.0135052179251076, + "grad_norm": 0.2799840271472931, + "learning_rate": 8.190779876259995e-05, + "loss": 1.8279, + "step": 9818 + }, + { + "epoch": 3.0138121546961325, + "grad_norm": 0.2669760584831238, + "learning_rate": 
8.190397173558584e-05, + "loss": 1.8155, + "step": 9819 + }, + { + "epoch": 3.014119091467158, + "grad_norm": 0.28857991099357605, + "learning_rate": 8.1900144393278e-05, + "loss": 1.8479, + "step": 9820 + }, + { + "epoch": 3.0144260282381827, + "grad_norm": 0.30534693598747253, + "learning_rate": 8.189631673571422e-05, + "loss": 1.8609, + "step": 9821 + }, + { + "epoch": 3.014732965009208, + "grad_norm": 0.3238218128681183, + "learning_rate": 8.189248876293236e-05, + "loss": 1.9292, + "step": 9822 + }, + { + "epoch": 3.0150399017802334, + "grad_norm": 0.3000536561012268, + "learning_rate": 8.188866047497022e-05, + "loss": 1.8214, + "step": 9823 + }, + { + "epoch": 3.0153468385512583, + "grad_norm": 0.2960065007209778, + "learning_rate": 8.188483187186565e-05, + "loss": 1.8316, + "step": 9824 + }, + { + "epoch": 3.0156537753222836, + "grad_norm": 0.28609779477119446, + "learning_rate": 8.188100295365648e-05, + "loss": 1.8002, + "step": 9825 + }, + { + "epoch": 3.015960712093309, + "grad_norm": 0.31390634179115295, + "learning_rate": 8.187717372038057e-05, + "loss": 1.8134, + "step": 9826 + }, + { + "epoch": 3.016267648864334, + "grad_norm": 0.28550946712493896, + "learning_rate": 8.187334417207573e-05, + "loss": 1.8359, + "step": 9827 + }, + { + "epoch": 3.016574585635359, + "grad_norm": 0.3085210621356964, + "learning_rate": 8.186951430877982e-05, + "loss": 1.813, + "step": 9828 + }, + { + "epoch": 3.016881522406384, + "grad_norm": 0.3043847978115082, + "learning_rate": 8.18656841305307e-05, + "loss": 1.8222, + "step": 9829 + }, + { + "epoch": 3.0171884591774094, + "grad_norm": 0.32524731755256653, + "learning_rate": 8.18618536373662e-05, + "loss": 1.8258, + "step": 9830 + }, + { + "epoch": 3.0174953959484347, + "grad_norm": 0.2690991461277008, + "learning_rate": 8.18580228293242e-05, + "loss": 1.8492, + "step": 9831 + }, + { + "epoch": 3.0178023327194596, + "grad_norm": 0.34936225414276123, + "learning_rate": 8.185419170644253e-05, + "loss": 1.8363, + "step": 
9832 + }, + { + "epoch": 3.018109269490485, + "grad_norm": 0.3274296820163727, + "learning_rate": 8.185036026875908e-05, + "loss": 1.7789, + "step": 9833 + }, + { + "epoch": 3.0184162062615103, + "grad_norm": 0.2729836106300354, + "learning_rate": 8.184652851631169e-05, + "loss": 1.8264, + "step": 9834 + }, + { + "epoch": 3.018723143032535, + "grad_norm": 0.28682780265808105, + "learning_rate": 8.184269644913826e-05, + "loss": 1.8399, + "step": 9835 + }, + { + "epoch": 3.0190300798035605, + "grad_norm": 0.3224826455116272, + "learning_rate": 8.183886406727662e-05, + "loss": 1.8338, + "step": 9836 + }, + { + "epoch": 3.0193370165745854, + "grad_norm": 0.30945318937301636, + "learning_rate": 8.183503137076467e-05, + "loss": 1.8248, + "step": 9837 + }, + { + "epoch": 3.0196439533456108, + "grad_norm": 0.27580398321151733, + "learning_rate": 8.183119835964029e-05, + "loss": 1.8096, + "step": 9838 + }, + { + "epoch": 3.019950890116636, + "grad_norm": 0.28927183151245117, + "learning_rate": 8.182736503394132e-05, + "loss": 1.825, + "step": 9839 + }, + { + "epoch": 3.020257826887661, + "grad_norm": 0.253000408411026, + "learning_rate": 8.182353139370571e-05, + "loss": 1.7678, + "step": 9840 + }, + { + "epoch": 3.0205647636586863, + "grad_norm": 0.2882022559642792, + "learning_rate": 8.18196974389713e-05, + "loss": 1.8895, + "step": 9841 + }, + { + "epoch": 3.0208717004297116, + "grad_norm": 0.26864609122276306, + "learning_rate": 8.1815863169776e-05, + "loss": 1.7674, + "step": 9842 + }, + { + "epoch": 3.0211786372007365, + "grad_norm": 0.27344849705696106, + "learning_rate": 8.181202858615769e-05, + "loss": 1.8146, + "step": 9843 + }, + { + "epoch": 3.021485573971762, + "grad_norm": 0.31659772992134094, + "learning_rate": 8.180819368815425e-05, + "loss": 1.8485, + "step": 9844 + }, + { + "epoch": 3.021792510742787, + "grad_norm": 0.3163176476955414, + "learning_rate": 8.18043584758036e-05, + "loss": 1.8994, + "step": 9845 + }, + { + "epoch": 3.022099447513812, + 
"grad_norm": 0.2583829462528229, + "learning_rate": 8.180052294914365e-05, + "loss": 1.764, + "step": 9846 + }, + { + "epoch": 3.0224063842848374, + "grad_norm": 0.3006649315357208, + "learning_rate": 8.179668710821227e-05, + "loss": 1.9232, + "step": 9847 + }, + { + "epoch": 3.0227133210558623, + "grad_norm": 0.35702988505363464, + "learning_rate": 8.179285095304741e-05, + "loss": 1.8403, + "step": 9848 + }, + { + "epoch": 3.0230202578268877, + "grad_norm": 0.29699379205703735, + "learning_rate": 8.178901448368697e-05, + "loss": 1.8412, + "step": 9849 + }, + { + "epoch": 3.023327194597913, + "grad_norm": 0.3022700548171997, + "learning_rate": 8.178517770016885e-05, + "loss": 1.8197, + "step": 9850 + }, + { + "epoch": 3.023634131368938, + "grad_norm": 0.2943836748600006, + "learning_rate": 8.178134060253097e-05, + "loss": 1.8127, + "step": 9851 + }, + { + "epoch": 3.023941068139963, + "grad_norm": 0.31290489435195923, + "learning_rate": 8.177750319081126e-05, + "loss": 1.821, + "step": 9852 + }, + { + "epoch": 3.0242480049109886, + "grad_norm": 0.30308374762535095, + "learning_rate": 8.177366546504763e-05, + "loss": 1.8522, + "step": 9853 + }, + { + "epoch": 3.0245549416820134, + "grad_norm": 0.301559716463089, + "learning_rate": 8.176982742527802e-05, + "loss": 1.8758, + "step": 9854 + }, + { + "epoch": 3.0248618784530388, + "grad_norm": 0.33314836025238037, + "learning_rate": 8.176598907154034e-05, + "loss": 1.8178, + "step": 9855 + }, + { + "epoch": 3.0251688152240637, + "grad_norm": 0.3567935526371002, + "learning_rate": 8.176215040387255e-05, + "loss": 1.7847, + "step": 9856 + }, + { + "epoch": 3.025475751995089, + "grad_norm": 0.27716195583343506, + "learning_rate": 8.175831142231258e-05, + "loss": 1.772, + "step": 9857 + }, + { + "epoch": 3.0257826887661143, + "grad_norm": 0.24568212032318115, + "learning_rate": 8.175447212689836e-05, + "loss": 1.8171, + "step": 9858 + }, + { + "epoch": 3.0260896255371392, + "grad_norm": 0.25368261337280273, + 
"learning_rate": 8.175063251766784e-05, + "loss": 1.852, + "step": 9859 + }, + { + "epoch": 3.0263965623081646, + "grad_norm": 0.2509497404098511, + "learning_rate": 8.174679259465894e-05, + "loss": 1.7737, + "step": 9860 + }, + { + "epoch": 3.02670349907919, + "grad_norm": 0.3539343774318695, + "learning_rate": 8.174295235790963e-05, + "loss": 1.8663, + "step": 9861 + }, + { + "epoch": 3.027010435850215, + "grad_norm": 0.36450034379959106, + "learning_rate": 8.173911180745788e-05, + "loss": 1.8179, + "step": 9862 + }, + { + "epoch": 3.02731737262124, + "grad_norm": 0.3550017178058624, + "learning_rate": 8.173527094334162e-05, + "loss": 1.8256, + "step": 9863 + }, + { + "epoch": 3.027624309392265, + "grad_norm": 0.33518701791763306, + "learning_rate": 8.17314297655988e-05, + "loss": 1.7842, + "step": 9864 + }, + { + "epoch": 3.0279312461632903, + "grad_norm": 0.2522886097431183, + "learning_rate": 8.172758827426739e-05, + "loss": 1.7688, + "step": 9865 + }, + { + "epoch": 3.0282381829343157, + "grad_norm": 0.26222914457321167, + "learning_rate": 8.172374646938536e-05, + "loss": 1.8517, + "step": 9866 + }, + { + "epoch": 3.0285451197053406, + "grad_norm": 0.3355788588523865, + "learning_rate": 8.171990435099068e-05, + "loss": 1.9002, + "step": 9867 + }, + { + "epoch": 3.028852056476366, + "grad_norm": 0.32907500863075256, + "learning_rate": 8.171606191912131e-05, + "loss": 1.7801, + "step": 9868 + }, + { + "epoch": 3.0291589932473912, + "grad_norm": 0.29234179854393005, + "learning_rate": 8.171221917381523e-05, + "loss": 1.8055, + "step": 9869 + }, + { + "epoch": 3.029465930018416, + "grad_norm": 0.26374876499176025, + "learning_rate": 8.170837611511041e-05, + "loss": 1.781, + "step": 9870 + }, + { + "epoch": 3.0297728667894415, + "grad_norm": 0.311282217502594, + "learning_rate": 8.170453274304483e-05, + "loss": 1.839, + "step": 9871 + }, + { + "epoch": 3.0300798035604664, + "grad_norm": 0.24225831031799316, + "learning_rate": 8.170068905765648e-05, + "loss": 
1.804, + "step": 9872 + }, + { + "epoch": 3.0303867403314917, + "grad_norm": 0.29383334517478943, + "learning_rate": 8.169684505898335e-05, + "loss": 1.7817, + "step": 9873 + }, + { + "epoch": 3.030693677102517, + "grad_norm": 0.2607928514480591, + "learning_rate": 8.169300074706339e-05, + "loss": 1.8379, + "step": 9874 + }, + { + "epoch": 3.031000613873542, + "grad_norm": 0.283028244972229, + "learning_rate": 8.168915612193464e-05, + "loss": 1.7797, + "step": 9875 + }, + { + "epoch": 3.0313075506445673, + "grad_norm": 0.27675309777259827, + "learning_rate": 8.168531118363508e-05, + "loss": 1.8355, + "step": 9876 + }, + { + "epoch": 3.0316144874155926, + "grad_norm": 0.2598227262496948, + "learning_rate": 8.16814659322027e-05, + "loss": 1.7898, + "step": 9877 + }, + { + "epoch": 3.0319214241866175, + "grad_norm": 0.24715003371238708, + "learning_rate": 8.16776203676755e-05, + "loss": 1.7791, + "step": 9878 + }, + { + "epoch": 3.032228360957643, + "grad_norm": 0.2749374210834503, + "learning_rate": 8.167377449009149e-05, + "loss": 1.8303, + "step": 9879 + }, + { + "epoch": 3.0325352977286677, + "grad_norm": 0.26150834560394287, + "learning_rate": 8.166992829948868e-05, + "loss": 1.8462, + "step": 9880 + }, + { + "epoch": 3.032842234499693, + "grad_norm": 0.3044755160808563, + "learning_rate": 8.166608179590506e-05, + "loss": 1.806, + "step": 9881 + }, + { + "epoch": 3.0331491712707184, + "grad_norm": 0.2949555516242981, + "learning_rate": 8.166223497937868e-05, + "loss": 1.8785, + "step": 9882 + }, + { + "epoch": 3.0334561080417433, + "grad_norm": 0.33206698298454285, + "learning_rate": 8.165838784994752e-05, + "loss": 1.8476, + "step": 9883 + }, + { + "epoch": 3.0337630448127686, + "grad_norm": 0.2720400094985962, + "learning_rate": 8.165454040764962e-05, + "loss": 1.843, + "step": 9884 + }, + { + "epoch": 3.034069981583794, + "grad_norm": 0.29340869188308716, + "learning_rate": 8.1650692652523e-05, + "loss": 1.7761, + "step": 9885 + }, + { + "epoch": 
3.034376918354819, + "grad_norm": 0.35155293345451355, + "learning_rate": 8.16468445846057e-05, + "loss": 1.8887, + "step": 9886 + }, + { + "epoch": 3.034683855125844, + "grad_norm": 0.2688990831375122, + "learning_rate": 8.164299620393571e-05, + "loss": 1.8001, + "step": 9887 + }, + { + "epoch": 3.034990791896869, + "grad_norm": 0.2921253442764282, + "learning_rate": 8.16391475105511e-05, + "loss": 1.7951, + "step": 9888 + }, + { + "epoch": 3.0352977286678944, + "grad_norm": 0.28100699186325073, + "learning_rate": 8.163529850448988e-05, + "loss": 1.8041, + "step": 9889 + }, + { + "epoch": 3.0356046654389197, + "grad_norm": 0.3155081868171692, + "learning_rate": 8.16314491857901e-05, + "loss": 1.8026, + "step": 9890 + }, + { + "epoch": 3.0359116022099446, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.16275995544898e-05, + "loss": 1.8502, + "step": 9891 + }, + { + "epoch": 3.03621853898097, + "grad_norm": 0.2732076644897461, + "learning_rate": 8.162374961062704e-05, + "loss": 1.8424, + "step": 9892 + }, + { + "epoch": 3.0365254757519953, + "grad_norm": 0.2943679690361023, + "learning_rate": 8.161989935423984e-05, + "loss": 1.7635, + "step": 9893 + }, + { + "epoch": 3.03683241252302, + "grad_norm": 0.28894683718681335, + "learning_rate": 8.161604878536626e-05, + "loss": 1.78, + "step": 9894 + }, + { + "epoch": 3.0371393492940455, + "grad_norm": 0.2718082666397095, + "learning_rate": 8.161219790404435e-05, + "loss": 1.7664, + "step": 9895 + }, + { + "epoch": 3.0374462860650704, + "grad_norm": 0.29092124104499817, + "learning_rate": 8.160834671031216e-05, + "loss": 1.8621, + "step": 9896 + }, + { + "epoch": 3.0377532228360957, + "grad_norm": 0.284665584564209, + "learning_rate": 8.160449520420779e-05, + "loss": 1.8607, + "step": 9897 + }, + { + "epoch": 3.038060159607121, + "grad_norm": 0.23676982522010803, + "learning_rate": 8.160064338576925e-05, + "loss": 1.7137, + "step": 9898 + }, + { + "epoch": 3.038367096378146, + "grad_norm": 0.2666932940483093, + 
"learning_rate": 8.159679125503466e-05, + "loss": 1.8038, + "step": 9899 + }, + { + "epoch": 3.0386740331491713, + "grad_norm": 0.36214375495910645, + "learning_rate": 8.159293881204204e-05, + "loss": 1.8902, + "step": 9900 + }, + { + "epoch": 3.0389809699201966, + "grad_norm": 0.30301332473754883, + "learning_rate": 8.158908605682948e-05, + "loss": 1.8456, + "step": 9901 + }, + { + "epoch": 3.0392879066912215, + "grad_norm": 0.32190418243408203, + "learning_rate": 8.158523298943506e-05, + "loss": 1.8246, + "step": 9902 + }, + { + "epoch": 3.039594843462247, + "grad_norm": 0.2938043475151062, + "learning_rate": 8.158137960989685e-05, + "loss": 1.8324, + "step": 9903 + }, + { + "epoch": 3.0399017802332717, + "grad_norm": 0.29493969678878784, + "learning_rate": 8.157752591825294e-05, + "loss": 1.8458, + "step": 9904 + }, + { + "epoch": 3.040208717004297, + "grad_norm": 0.2681889832019806, + "learning_rate": 8.157367191454141e-05, + "loss": 1.889, + "step": 9905 + }, + { + "epoch": 3.0405156537753224, + "grad_norm": 0.3111969232559204, + "learning_rate": 8.156981759880035e-05, + "loss": 1.8966, + "step": 9906 + }, + { + "epoch": 3.0408225905463473, + "grad_norm": 0.345262736082077, + "learning_rate": 8.156596297106784e-05, + "loss": 1.8174, + "step": 9907 + }, + { + "epoch": 3.0411295273173726, + "grad_norm": 0.30156534910202026, + "learning_rate": 8.156210803138199e-05, + "loss": 1.766, + "step": 9908 + }, + { + "epoch": 3.041436464088398, + "grad_norm": 0.28691565990448, + "learning_rate": 8.15582527797809e-05, + "loss": 1.8436, + "step": 9909 + }, + { + "epoch": 3.041743400859423, + "grad_norm": 0.33418282866477966, + "learning_rate": 8.155439721630264e-05, + "loss": 1.8939, + "step": 9910 + }, + { + "epoch": 3.042050337630448, + "grad_norm": 0.25496938824653625, + "learning_rate": 8.155054134098535e-05, + "loss": 1.8368, + "step": 9911 + }, + { + "epoch": 3.042357274401473, + "grad_norm": 0.3806788921356201, + "learning_rate": 8.154668515386711e-05, + "loss": 
1.8635, + "step": 9912 + }, + { + "epoch": 3.0426642111724984, + "grad_norm": 0.42668119072914124, + "learning_rate": 8.154282865498603e-05, + "loss": 1.76, + "step": 9913 + }, + { + "epoch": 3.0429711479435237, + "grad_norm": 0.35945314168930054, + "learning_rate": 8.153897184438024e-05, + "loss": 1.8275, + "step": 9914 + }, + { + "epoch": 3.0432780847145486, + "grad_norm": 0.3225449323654175, + "learning_rate": 8.153511472208784e-05, + "loss": 1.7901, + "step": 9915 + }, + { + "epoch": 3.043585021485574, + "grad_norm": 0.2905425727367401, + "learning_rate": 8.153125728814694e-05, + "loss": 1.8021, + "step": 9916 + }, + { + "epoch": 3.0438919582565993, + "grad_norm": 0.3315529525279999, + "learning_rate": 8.15273995425957e-05, + "loss": 1.8003, + "step": 9917 + }, + { + "epoch": 3.044198895027624, + "grad_norm": 0.30256444215774536, + "learning_rate": 8.152354148547221e-05, + "loss": 1.8243, + "step": 9918 + }, + { + "epoch": 3.0445058317986495, + "grad_norm": 0.2563035190105438, + "learning_rate": 8.15196831168146e-05, + "loss": 1.7877, + "step": 9919 + }, + { + "epoch": 3.044812768569675, + "grad_norm": 0.25705814361572266, + "learning_rate": 8.151582443666101e-05, + "loss": 1.813, + "step": 9920 + }, + { + "epoch": 3.0451197053406998, + "grad_norm": 0.3649071455001831, + "learning_rate": 8.151196544504957e-05, + "loss": 1.8114, + "step": 9921 + }, + { + "epoch": 3.045426642111725, + "grad_norm": 0.4076193571090698, + "learning_rate": 8.150810614201841e-05, + "loss": 1.7869, + "step": 9922 + }, + { + "epoch": 3.04573357888275, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.150424652760569e-05, + "loss": 1.7878, + "step": 9923 + }, + { + "epoch": 3.0460405156537753, + "grad_norm": 0.2243243157863617, + "learning_rate": 8.150038660184955e-05, + "loss": 1.8224, + "step": 9924 + }, + { + "epoch": 3.0463474524248007, + "grad_norm": 0.3295031487941742, + "learning_rate": 8.149652636478811e-05, + "loss": 1.8685, + "step": 9925 + }, + { + "epoch": 
3.0466543891958255, + "grad_norm": 0.2973531186580658, + "learning_rate": 8.149266581645954e-05, + "loss": 1.8082, + "step": 9926 + }, + { + "epoch": 3.046961325966851, + "grad_norm": 0.25648918747901917, + "learning_rate": 8.148880495690199e-05, + "loss": 1.8089, + "step": 9927 + }, + { + "epoch": 3.047268262737876, + "grad_norm": 0.2845752537250519, + "learning_rate": 8.148494378615361e-05, + "loss": 1.8726, + "step": 9928 + }, + { + "epoch": 3.047575199508901, + "grad_norm": 0.2917105555534363, + "learning_rate": 8.148108230425255e-05, + "loss": 1.8035, + "step": 9929 + }, + { + "epoch": 3.0478821362799264, + "grad_norm": 0.2775834798812866, + "learning_rate": 8.1477220511237e-05, + "loss": 1.8545, + "step": 9930 + }, + { + "epoch": 3.0481890730509513, + "grad_norm": 0.3522767424583435, + "learning_rate": 8.14733584071451e-05, + "loss": 1.8261, + "step": 9931 + }, + { + "epoch": 3.0484960098219767, + "grad_norm": 0.3759000599384308, + "learning_rate": 8.146949599201503e-05, + "loss": 1.8405, + "step": 9932 + }, + { + "epoch": 3.048802946593002, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.146563326588496e-05, + "loss": 1.7762, + "step": 9933 + }, + { + "epoch": 3.049109883364027, + "grad_norm": 0.263810932636261, + "learning_rate": 8.146177022879304e-05, + "loss": 1.7546, + "step": 9934 + }, + { + "epoch": 3.049416820135052, + "grad_norm": 0.24064256250858307, + "learning_rate": 8.14579068807775e-05, + "loss": 1.7903, + "step": 9935 + }, + { + "epoch": 3.0497237569060776, + "grad_norm": 0.3144194781780243, + "learning_rate": 8.145404322187645e-05, + "loss": 1.8011, + "step": 9936 + }, + { + "epoch": 3.0500306936771024, + "grad_norm": 0.3362879455089569, + "learning_rate": 8.145017925212812e-05, + "loss": 1.8224, + "step": 9937 + }, + { + "epoch": 3.050337630448128, + "grad_norm": 0.33979395031929016, + "learning_rate": 8.144631497157071e-05, + "loss": 1.8415, + "step": 9938 + }, + { + "epoch": 3.0506445672191527, + "grad_norm": 0.33391237258911133, + 
"learning_rate": 8.144245038024235e-05, + "loss": 1.7983, + "step": 9939 + }, + { + "epoch": 3.050951503990178, + "grad_norm": 0.34034964442253113, + "learning_rate": 8.143858547818128e-05, + "loss": 1.8635, + "step": 9940 + }, + { + "epoch": 3.0512584407612033, + "grad_norm": 0.3472529947757721, + "learning_rate": 8.143472026542569e-05, + "loss": 1.8067, + "step": 9941 + }, + { + "epoch": 3.0515653775322282, + "grad_norm": 0.3369109630584717, + "learning_rate": 8.143085474201376e-05, + "loss": 1.7933, + "step": 9942 + }, + { + "epoch": 3.0518723143032536, + "grad_norm": 0.3055182993412018, + "learning_rate": 8.14269889079837e-05, + "loss": 1.7358, + "step": 9943 + }, + { + "epoch": 3.052179251074279, + "grad_norm": 0.26729708909988403, + "learning_rate": 8.142312276337372e-05, + "loss": 1.8315, + "step": 9944 + }, + { + "epoch": 3.052486187845304, + "grad_norm": 0.3626720607280731, + "learning_rate": 8.141925630822203e-05, + "loss": 1.7593, + "step": 9945 + }, + { + "epoch": 3.052793124616329, + "grad_norm": 0.3673512637615204, + "learning_rate": 8.141538954256683e-05, + "loss": 1.8414, + "step": 9946 + }, + { + "epoch": 3.053100061387354, + "grad_norm": 0.30554768443107605, + "learning_rate": 8.141152246644632e-05, + "loss": 1.7504, + "step": 9947 + }, + { + "epoch": 3.0534069981583793, + "grad_norm": 0.41163405776023865, + "learning_rate": 8.140765507989875e-05, + "loss": 1.8794, + "step": 9948 + }, + { + "epoch": 3.0537139349294047, + "grad_norm": 0.592751145362854, + "learning_rate": 8.140378738296233e-05, + "loss": 1.8538, + "step": 9949 + }, + { + "epoch": 3.0540208717004296, + "grad_norm": 0.483828604221344, + "learning_rate": 8.139991937567527e-05, + "loss": 1.7952, + "step": 9950 + }, + { + "epoch": 3.054327808471455, + "grad_norm": 0.26665306091308594, + "learning_rate": 8.13960510580758e-05, + "loss": 1.8268, + "step": 9951 + }, + { + "epoch": 3.0546347452424802, + "grad_norm": 0.42917072772979736, + "learning_rate": 8.139218243020215e-05, + "loss": 
1.843, + "step": 9952 + }, + { + "epoch": 3.054941682013505, + "grad_norm": 0.47911396622657776, + "learning_rate": 8.138831349209256e-05, + "loss": 1.8223, + "step": 9953 + }, + { + "epoch": 3.0552486187845305, + "grad_norm": 0.4540431797504425, + "learning_rate": 8.138444424378524e-05, + "loss": 1.9198, + "step": 9954 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.29719051718711853, + "learning_rate": 8.138057468531845e-05, + "loss": 1.7873, + "step": 9955 + }, + { + "epoch": 3.0558624923265807, + "grad_norm": 0.35133618116378784, + "learning_rate": 8.137670481673045e-05, + "loss": 1.8459, + "step": 9956 + }, + { + "epoch": 3.056169429097606, + "grad_norm": 0.42896488308906555, + "learning_rate": 8.137283463805945e-05, + "loss": 1.7814, + "step": 9957 + }, + { + "epoch": 3.056476365868631, + "grad_norm": 0.38993972539901733, + "learning_rate": 8.136896414934372e-05, + "loss": 1.7636, + "step": 9958 + }, + { + "epoch": 3.0567833026396563, + "grad_norm": 0.31362372636795044, + "learning_rate": 8.13650933506215e-05, + "loss": 1.8021, + "step": 9959 + }, + { + "epoch": 3.0570902394106816, + "grad_norm": 0.27980196475982666, + "learning_rate": 8.136122224193103e-05, + "loss": 1.8445, + "step": 9960 + }, + { + "epoch": 3.0573971761817065, + "grad_norm": 0.2721461057662964, + "learning_rate": 8.135735082331059e-05, + "loss": 1.7614, + "step": 9961 + }, + { + "epoch": 3.057704112952732, + "grad_norm": 0.25157424807548523, + "learning_rate": 8.135347909479843e-05, + "loss": 1.7598, + "step": 9962 + }, + { + "epoch": 3.0580110497237567, + "grad_norm": 0.25798025727272034, + "learning_rate": 8.13496070564328e-05, + "loss": 1.7823, + "step": 9963 + }, + { + "epoch": 3.058317986494782, + "grad_norm": 0.30775198340415955, + "learning_rate": 8.134573470825199e-05, + "loss": 1.7755, + "step": 9964 + }, + { + "epoch": 3.0586249232658074, + "grad_norm": 0.28916797041893005, + "learning_rate": 8.134186205029426e-05, + "loss": 1.8189, + "step": 9965 + }, + { + "epoch": 
3.0589318600368323, + "grad_norm": 0.2829149067401886, + "learning_rate": 8.133798908259787e-05, + "loss": 1.8546, + "step": 9966 + }, + { + "epoch": 3.0592387968078576, + "grad_norm": 0.2884117662906647, + "learning_rate": 8.13341158052011e-05, + "loss": 1.7705, + "step": 9967 + }, + { + "epoch": 3.059545733578883, + "grad_norm": 0.28311973810195923, + "learning_rate": 8.133024221814225e-05, + "loss": 1.8147, + "step": 9968 + }, + { + "epoch": 3.059852670349908, + "grad_norm": 0.25405213236808777, + "learning_rate": 8.132636832145957e-05, + "loss": 1.7813, + "step": 9969 + }, + { + "epoch": 3.060159607120933, + "grad_norm": 0.3082229793071747, + "learning_rate": 8.132249411519137e-05, + "loss": 1.8536, + "step": 9970 + }, + { + "epoch": 3.060466543891958, + "grad_norm": 0.29918181896209717, + "learning_rate": 8.13186195993759e-05, + "loss": 1.8181, + "step": 9971 + }, + { + "epoch": 3.0607734806629834, + "grad_norm": 0.3025238811969757, + "learning_rate": 8.13147447740515e-05, + "loss": 1.7785, + "step": 9972 + }, + { + "epoch": 3.0610804174340087, + "grad_norm": 0.2798222303390503, + "learning_rate": 8.131086963925643e-05, + "loss": 1.7873, + "step": 9973 + }, + { + "epoch": 3.0613873542050336, + "grad_norm": 0.32636210322380066, + "learning_rate": 8.130699419502898e-05, + "loss": 1.882, + "step": 9974 + }, + { + "epoch": 3.061694290976059, + "grad_norm": 0.27722054719924927, + "learning_rate": 8.130311844140748e-05, + "loss": 1.7788, + "step": 9975 + }, + { + "epoch": 3.0620012277470843, + "grad_norm": 0.289156436920166, + "learning_rate": 8.129924237843023e-05, + "loss": 1.8591, + "step": 9976 + }, + { + "epoch": 3.062308164518109, + "grad_norm": 0.2839665412902832, + "learning_rate": 8.12953660061355e-05, + "loss": 1.8255, + "step": 9977 + }, + { + "epoch": 3.0626151012891345, + "grad_norm": 0.2650148272514343, + "learning_rate": 8.129148932456161e-05, + "loss": 1.8353, + "step": 9978 + }, + { + "epoch": 3.06292203806016, + "grad_norm": 0.2884560227394104, + 
"learning_rate": 8.128761233374691e-05, + "loss": 1.8099, + "step": 9979 + }, + { + "epoch": 3.0632289748311847, + "grad_norm": 0.2610029876232147, + "learning_rate": 8.128373503372967e-05, + "loss": 1.8173, + "step": 9980 + }, + { + "epoch": 3.06353591160221, + "grad_norm": 0.32512393593788147, + "learning_rate": 8.127985742454822e-05, + "loss": 1.8619, + "step": 9981 + }, + { + "epoch": 3.063842848373235, + "grad_norm": 0.3382968604564667, + "learning_rate": 8.127597950624091e-05, + "loss": 1.831, + "step": 9982 + }, + { + "epoch": 3.0641497851442603, + "grad_norm": 0.33773133158683777, + "learning_rate": 8.127210127884602e-05, + "loss": 1.8194, + "step": 9983 + }, + { + "epoch": 3.0644567219152856, + "grad_norm": 0.31642746925354004, + "learning_rate": 8.126822274240188e-05, + "loss": 1.8782, + "step": 9984 + }, + { + "epoch": 3.0647636586863105, + "grad_norm": 0.2476506233215332, + "learning_rate": 8.126434389694686e-05, + "loss": 1.7866, + "step": 9985 + }, + { + "epoch": 3.065070595457336, + "grad_norm": 0.27296319603919983, + "learning_rate": 8.126046474251927e-05, + "loss": 1.8276, + "step": 9986 + }, + { + "epoch": 3.0653775322283607, + "grad_norm": 0.353865385055542, + "learning_rate": 8.125658527915744e-05, + "loss": 1.9525, + "step": 9987 + }, + { + "epoch": 3.065684468999386, + "grad_norm": 0.370256632566452, + "learning_rate": 8.12527055068997e-05, + "loss": 1.8514, + "step": 9988 + }, + { + "epoch": 3.0659914057704114, + "grad_norm": 0.30738842487335205, + "learning_rate": 8.124882542578442e-05, + "loss": 1.8125, + "step": 9989 + }, + { + "epoch": 3.0662983425414363, + "grad_norm": 0.3151233494281769, + "learning_rate": 8.124494503584995e-05, + "loss": 1.8165, + "step": 9990 + }, + { + "epoch": 3.0666052793124616, + "grad_norm": 0.29071590304374695, + "learning_rate": 8.124106433713458e-05, + "loss": 1.7617, + "step": 9991 + }, + { + "epoch": 3.066912216083487, + "grad_norm": 0.2898697853088379, + "learning_rate": 8.123718332967672e-05, + "loss": 
1.7779, + "step": 9992 + }, + { + "epoch": 3.067219152854512, + "grad_norm": 0.26601701974868774, + "learning_rate": 8.123330201351471e-05, + "loss": 1.8307, + "step": 9993 + }, + { + "epoch": 3.067526089625537, + "grad_norm": 0.2622119188308716, + "learning_rate": 8.12294203886869e-05, + "loss": 1.7958, + "step": 9994 + }, + { + "epoch": 3.0678330263965625, + "grad_norm": 0.29709386825561523, + "learning_rate": 8.122553845523166e-05, + "loss": 1.7799, + "step": 9995 + }, + { + "epoch": 3.0681399631675874, + "grad_norm": 0.31267789006233215, + "learning_rate": 8.122165621318733e-05, + "loss": 1.8149, + "step": 9996 + }, + { + "epoch": 3.0684468999386127, + "grad_norm": 0.3076523244380951, + "learning_rate": 8.121777366259232e-05, + "loss": 1.7701, + "step": 9997 + }, + { + "epoch": 3.0687538367096376, + "grad_norm": 0.30096009373664856, + "learning_rate": 8.121389080348496e-05, + "loss": 1.8323, + "step": 9998 + }, + { + "epoch": 3.069060773480663, + "grad_norm": 0.25739142298698425, + "learning_rate": 8.121000763590363e-05, + "loss": 1.8105, + "step": 9999 + }, + { + "epoch": 3.0693677102516883, + "grad_norm": 0.2780844271183014, + "learning_rate": 8.120612415988671e-05, + "loss": 1.8502, + "step": 10000 + }, + { + "epoch": 3.069674647022713, + "grad_norm": 0.3316378593444824, + "learning_rate": 8.120224037547259e-05, + "loss": 1.8244, + "step": 10001 + }, + { + "epoch": 3.0699815837937385, + "grad_norm": 0.261129766702652, + "learning_rate": 8.119835628269964e-05, + "loss": 1.7769, + "step": 10002 + }, + { + "epoch": 3.070288520564764, + "grad_norm": 0.29213985800743103, + "learning_rate": 8.119447188160625e-05, + "loss": 1.7717, + "step": 10003 + }, + { + "epoch": 3.0705954573357888, + "grad_norm": 0.38545623421669006, + "learning_rate": 8.11905871722308e-05, + "loss": 1.8433, + "step": 10004 + }, + { + "epoch": 3.070902394106814, + "grad_norm": 0.3617223799228668, + "learning_rate": 8.118670215461168e-05, + "loss": 1.8172, + "step": 10005 + }, + { + "epoch": 
3.071209330877839, + "grad_norm": 0.3241543769836426, + "learning_rate": 8.11828168287873e-05, + "loss": 1.8325, + "step": 10006 + }, + { + "epoch": 3.0715162676488643, + "grad_norm": 0.3538578152656555, + "learning_rate": 8.117893119479605e-05, + "loss": 1.8188, + "step": 10007 + }, + { + "epoch": 3.0718232044198897, + "grad_norm": 0.3861970603466034, + "learning_rate": 8.117504525267632e-05, + "loss": 1.8518, + "step": 10008 + }, + { + "epoch": 3.0721301411909145, + "grad_norm": 0.35433146357536316, + "learning_rate": 8.117115900246652e-05, + "loss": 1.8601, + "step": 10009 + }, + { + "epoch": 3.07243707796194, + "grad_norm": 0.29796987771987915, + "learning_rate": 8.116727244420507e-05, + "loss": 1.7934, + "step": 10010 + }, + { + "epoch": 3.072744014732965, + "grad_norm": 0.3091779947280884, + "learning_rate": 8.116338557793035e-05, + "loss": 1.8111, + "step": 10011 + }, + { + "epoch": 3.07305095150399, + "grad_norm": 0.2741319537162781, + "learning_rate": 8.11594984036808e-05, + "loss": 1.8079, + "step": 10012 + }, + { + "epoch": 3.0733578882750154, + "grad_norm": 0.28905320167541504, + "learning_rate": 8.115561092149482e-05, + "loss": 1.8475, + "step": 10013 + }, + { + "epoch": 3.0736648250460403, + "grad_norm": 0.2897081673145294, + "learning_rate": 8.115172313141081e-05, + "loss": 1.838, + "step": 10014 + }, + { + "epoch": 3.0739717618170657, + "grad_norm": 0.2620783746242523, + "learning_rate": 8.114783503346725e-05, + "loss": 1.8024, + "step": 10015 + }, + { + "epoch": 3.074278698588091, + "grad_norm": 0.26478636264801025, + "learning_rate": 8.11439466277025e-05, + "loss": 1.8137, + "step": 10016 + }, + { + "epoch": 3.074585635359116, + "grad_norm": 0.2796174883842468, + "learning_rate": 8.114005791415502e-05, + "loss": 1.7976, + "step": 10017 + }, + { + "epoch": 3.074892572130141, + "grad_norm": 0.26813286542892456, + "learning_rate": 8.113616889286325e-05, + "loss": 1.7945, + "step": 10018 + }, + { + "epoch": 3.0751995089011666, + "grad_norm": 
0.2443828582763672, + "learning_rate": 8.11322795638656e-05, + "loss": 1.7829, + "step": 10019 + }, + { + "epoch": 3.0755064456721914, + "grad_norm": 0.2981395423412323, + "learning_rate": 8.112838992720053e-05, + "loss": 1.7928, + "step": 10020 + }, + { + "epoch": 3.075813382443217, + "grad_norm": 0.25605037808418274, + "learning_rate": 8.112449998290644e-05, + "loss": 1.8129, + "step": 10021 + }, + { + "epoch": 3.0761203192142417, + "grad_norm": 0.31180307269096375, + "learning_rate": 8.112060973102181e-05, + "loss": 1.7393, + "step": 10022 + }, + { + "epoch": 3.076427255985267, + "grad_norm": 0.3230421543121338, + "learning_rate": 8.111671917158508e-05, + "loss": 1.818, + "step": 10023 + }, + { + "epoch": 3.0767341927562923, + "grad_norm": 0.3158549964427948, + "learning_rate": 8.111282830463468e-05, + "loss": 1.7582, + "step": 10024 + }, + { + "epoch": 3.0770411295273172, + "grad_norm": 0.24524325132369995, + "learning_rate": 8.110893713020908e-05, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 3.0773480662983426, + "grad_norm": 0.2793932259082794, + "learning_rate": 8.110504564834675e-05, + "loss": 1.8551, + "step": 10026 + }, + { + "epoch": 3.077655003069368, + "grad_norm": 0.29629403352737427, + "learning_rate": 8.110115385908612e-05, + "loss": 1.8019, + "step": 10027 + }, + { + "epoch": 3.077961939840393, + "grad_norm": 0.3138490915298462, + "learning_rate": 8.109726176246564e-05, + "loss": 1.8436, + "step": 10028 + }, + { + "epoch": 3.078268876611418, + "grad_norm": 0.29802024364471436, + "learning_rate": 8.10933693585238e-05, + "loss": 1.8158, + "step": 10029 + }, + { + "epoch": 3.078575813382443, + "grad_norm": 0.30785220861434937, + "learning_rate": 8.108947664729907e-05, + "loss": 1.8674, + "step": 10030 + }, + { + "epoch": 3.0788827501534684, + "grad_norm": 0.277662992477417, + "learning_rate": 8.10855836288299e-05, + "loss": 1.8253, + "step": 10031 + }, + { + "epoch": 3.0791896869244937, + "grad_norm": 0.27399590611457825, + "learning_rate": 
8.108169030315477e-05, + "loss": 1.8587, + "step": 10032 + }, + { + "epoch": 3.0794966236955186, + "grad_norm": 0.28398239612579346, + "learning_rate": 8.107779667031217e-05, + "loss": 1.8326, + "step": 10033 + }, + { + "epoch": 3.079803560466544, + "grad_norm": 0.2882741093635559, + "learning_rate": 8.107390273034057e-05, + "loss": 1.785, + "step": 10034 + }, + { + "epoch": 3.0801104972375692, + "grad_norm": 0.271043598651886, + "learning_rate": 8.107000848327843e-05, + "loss": 1.765, + "step": 10035 + }, + { + "epoch": 3.080417434008594, + "grad_norm": 0.2589638829231262, + "learning_rate": 8.106611392916427e-05, + "loss": 1.8136, + "step": 10036 + }, + { + "epoch": 3.0807243707796195, + "grad_norm": 0.3068227469921112, + "learning_rate": 8.106221906803656e-05, + "loss": 1.8034, + "step": 10037 + }, + { + "epoch": 3.0810313075506444, + "grad_norm": 0.2714168131351471, + "learning_rate": 8.105832389993379e-05, + "loss": 1.8007, + "step": 10038 + }, + { + "epoch": 3.0813382443216697, + "grad_norm": 0.2747504711151123, + "learning_rate": 8.105442842489447e-05, + "loss": 1.8135, + "step": 10039 + }, + { + "epoch": 3.081645181092695, + "grad_norm": 0.2719285488128662, + "learning_rate": 8.105053264295708e-05, + "loss": 1.7629, + "step": 10040 + }, + { + "epoch": 3.08195211786372, + "grad_norm": 0.3119582235813141, + "learning_rate": 8.104663655416014e-05, + "loss": 1.7887, + "step": 10041 + }, + { + "epoch": 3.0822590546347453, + "grad_norm": 0.35965192317962646, + "learning_rate": 8.104274015854212e-05, + "loss": 1.8484, + "step": 10042 + }, + { + "epoch": 3.0825659914057706, + "grad_norm": 0.3045980632305145, + "learning_rate": 8.103884345614157e-05, + "loss": 1.8625, + "step": 10043 + }, + { + "epoch": 3.0828729281767955, + "grad_norm": 0.2925138473510742, + "learning_rate": 8.103494644699696e-05, + "loss": 1.9306, + "step": 10044 + }, + { + "epoch": 3.083179864947821, + "grad_norm": 0.2894277274608612, + "learning_rate": 8.103104913114681e-05, + "loss": 1.7796, + 
"step": 10045 + }, + { + "epoch": 3.0834868017188457, + "grad_norm": 0.2776826322078705, + "learning_rate": 8.102715150862967e-05, + "loss": 1.8169, + "step": 10046 + }, + { + "epoch": 3.083793738489871, + "grad_norm": 0.3315230906009674, + "learning_rate": 8.102325357948402e-05, + "loss": 1.8139, + "step": 10047 + }, + { + "epoch": 3.0841006752608964, + "grad_norm": 0.2906761169433594, + "learning_rate": 8.10193553437484e-05, + "loss": 1.8162, + "step": 10048 + }, + { + "epoch": 3.0844076120319213, + "grad_norm": 0.32681339979171753, + "learning_rate": 8.101545680146132e-05, + "loss": 1.8245, + "step": 10049 + }, + { + "epoch": 3.0847145488029466, + "grad_norm": 0.32525795698165894, + "learning_rate": 8.101155795266131e-05, + "loss": 1.8605, + "step": 10050 + }, + { + "epoch": 3.085021485573972, + "grad_norm": 0.31705379486083984, + "learning_rate": 8.100765879738692e-05, + "loss": 1.8214, + "step": 10051 + }, + { + "epoch": 3.085328422344997, + "grad_norm": 0.27772918343544006, + "learning_rate": 8.100375933567668e-05, + "loss": 1.7822, + "step": 10052 + }, + { + "epoch": 3.085635359116022, + "grad_norm": 0.2877809405326843, + "learning_rate": 8.09998595675691e-05, + "loss": 1.7935, + "step": 10053 + }, + { + "epoch": 3.0859422958870475, + "grad_norm": 0.29759806394577026, + "learning_rate": 8.099595949310276e-05, + "loss": 1.8041, + "step": 10054 + }, + { + "epoch": 3.0862492326580724, + "grad_norm": 0.2715320289134979, + "learning_rate": 8.099205911231617e-05, + "loss": 1.7923, + "step": 10055 + }, + { + "epoch": 3.0865561694290977, + "grad_norm": 0.33566340804100037, + "learning_rate": 8.098815842524789e-05, + "loss": 1.7953, + "step": 10056 + }, + { + "epoch": 3.0868631062001226, + "grad_norm": 0.3360871970653534, + "learning_rate": 8.098425743193645e-05, + "loss": 1.8275, + "step": 10057 + }, + { + "epoch": 3.087170042971148, + "grad_norm": 0.2797739803791046, + "learning_rate": 8.098035613242043e-05, + "loss": 1.7597, + "step": 10058 + }, + { + "epoch": 
3.0874769797421733, + "grad_norm": 0.25500187277793884, + "learning_rate": 8.097645452673837e-05, + "loss": 1.8059, + "step": 10059 + }, + { + "epoch": 3.087783916513198, + "grad_norm": 0.28042587637901306, + "learning_rate": 8.097255261492884e-05, + "loss": 1.7954, + "step": 10060 + }, + { + "epoch": 3.0880908532842235, + "grad_norm": 0.3616262376308441, + "learning_rate": 8.096865039703038e-05, + "loss": 1.8605, + "step": 10061 + }, + { + "epoch": 3.0883977900552484, + "grad_norm": 0.3453714847564697, + "learning_rate": 8.096474787308157e-05, + "loss": 1.7643, + "step": 10062 + }, + { + "epoch": 3.0887047268262737, + "grad_norm": 0.3192278742790222, + "learning_rate": 8.096084504312098e-05, + "loss": 1.8415, + "step": 10063 + }, + { + "epoch": 3.089011663597299, + "grad_norm": 0.2714482545852661, + "learning_rate": 8.095694190718715e-05, + "loss": 1.8204, + "step": 10064 + }, + { + "epoch": 3.089318600368324, + "grad_norm": 0.26562005281448364, + "learning_rate": 8.09530384653187e-05, + "loss": 1.7322, + "step": 10065 + }, + { + "epoch": 3.0896255371393493, + "grad_norm": 0.33727800846099854, + "learning_rate": 8.094913471755417e-05, + "loss": 1.8221, + "step": 10066 + }, + { + "epoch": 3.0899324739103746, + "grad_norm": 0.3561044931411743, + "learning_rate": 8.094523066393215e-05, + "loss": 1.8879, + "step": 10067 + }, + { + "epoch": 3.0902394106813995, + "grad_norm": 0.2568742334842682, + "learning_rate": 8.094132630449122e-05, + "loss": 1.8178, + "step": 10068 + }, + { + "epoch": 3.090546347452425, + "grad_norm": 0.4025525450706482, + "learning_rate": 8.093742163926998e-05, + "loss": 1.8186, + "step": 10069 + }, + { + "epoch": 3.09085328422345, + "grad_norm": 0.43863433599472046, + "learning_rate": 8.0933516668307e-05, + "loss": 1.8371, + "step": 10070 + }, + { + "epoch": 3.091160220994475, + "grad_norm": 0.34873950481414795, + "learning_rate": 8.092961139164087e-05, + "loss": 1.8083, + "step": 10071 + }, + { + "epoch": 3.0914671577655004, + "grad_norm": 
0.31433534622192383, + "learning_rate": 8.092570580931021e-05, + "loss": 1.8154, + "step": 10072 + }, + { + "epoch": 3.0917740945365253, + "grad_norm": 0.25523966550827026, + "learning_rate": 8.092179992135358e-05, + "loss": 1.8158, + "step": 10073 + }, + { + "epoch": 3.0920810313075506, + "grad_norm": 0.348469078540802, + "learning_rate": 8.09178937278096e-05, + "loss": 1.8358, + "step": 10074 + }, + { + "epoch": 3.092387968078576, + "grad_norm": 0.33455297350883484, + "learning_rate": 8.091398722871688e-05, + "loss": 1.7779, + "step": 10075 + }, + { + "epoch": 3.092694904849601, + "grad_norm": 0.36544880270957947, + "learning_rate": 8.091008042411403e-05, + "loss": 1.9186, + "step": 10076 + }, + { + "epoch": 3.093001841620626, + "grad_norm": 0.29165831208229065, + "learning_rate": 8.090617331403965e-05, + "loss": 1.8964, + "step": 10077 + }, + { + "epoch": 3.0933087783916515, + "grad_norm": 0.31011059880256653, + "learning_rate": 8.090226589853234e-05, + "loss": 1.8453, + "step": 10078 + }, + { + "epoch": 3.0936157151626764, + "grad_norm": 0.2835703492164612, + "learning_rate": 8.089835817763071e-05, + "loss": 1.7718, + "step": 10079 + }, + { + "epoch": 3.0939226519337018, + "grad_norm": 0.2910583019256592, + "learning_rate": 8.08944501513734e-05, + "loss": 1.7881, + "step": 10080 + }, + { + "epoch": 3.0942295887047266, + "grad_norm": 0.391303688287735, + "learning_rate": 8.089054181979905e-05, + "loss": 1.7915, + "step": 10081 + }, + { + "epoch": 3.094536525475752, + "grad_norm": 0.4119330048561096, + "learning_rate": 8.088663318294623e-05, + "loss": 1.7975, + "step": 10082 + }, + { + "epoch": 3.0948434622467773, + "grad_norm": 0.2980102002620697, + "learning_rate": 8.088272424085361e-05, + "loss": 1.805, + "step": 10083 + }, + { + "epoch": 3.095150399017802, + "grad_norm": 0.3089980483055115, + "learning_rate": 8.087881499355983e-05, + "loss": 1.8265, + "step": 10084 + }, + { + "epoch": 3.0954573357888275, + "grad_norm": 0.3851003348827362, + "learning_rate": 
8.087490544110348e-05, + "loss": 1.8174, + "step": 10085 + }, + { + "epoch": 3.095764272559853, + "grad_norm": 0.42357420921325684, + "learning_rate": 8.08709955835232e-05, + "loss": 1.8083, + "step": 10086 + }, + { + "epoch": 3.0960712093308778, + "grad_norm": 0.291777640581131, + "learning_rate": 8.086708542085768e-05, + "loss": 1.7713, + "step": 10087 + }, + { + "epoch": 3.096378146101903, + "grad_norm": 0.2563805878162384, + "learning_rate": 8.086317495314552e-05, + "loss": 1.7691, + "step": 10088 + }, + { + "epoch": 3.096685082872928, + "grad_norm": 0.3418877422809601, + "learning_rate": 8.085926418042536e-05, + "loss": 1.8547, + "step": 10089 + }, + { + "epoch": 3.0969920196439533, + "grad_norm": 0.3859385550022125, + "learning_rate": 8.085535310273589e-05, + "loss": 1.8226, + "step": 10090 + }, + { + "epoch": 3.0972989564149787, + "grad_norm": 0.3427267372608185, + "learning_rate": 8.085144172011571e-05, + "loss": 1.837, + "step": 10091 + }, + { + "epoch": 3.0976058931860035, + "grad_norm": 0.29290953278541565, + "learning_rate": 8.084753003260352e-05, + "loss": 1.8392, + "step": 10092 + }, + { + "epoch": 3.097912829957029, + "grad_norm": 0.33282020688056946, + "learning_rate": 8.084361804023795e-05, + "loss": 1.8351, + "step": 10093 + }, + { + "epoch": 3.098219766728054, + "grad_norm": 0.3802134394645691, + "learning_rate": 8.083970574305768e-05, + "loss": 1.7467, + "step": 10094 + }, + { + "epoch": 3.098526703499079, + "grad_norm": 0.3142111897468567, + "learning_rate": 8.083579314110135e-05, + "loss": 1.7966, + "step": 10095 + }, + { + "epoch": 3.0988336402701044, + "grad_norm": 0.2956278324127197, + "learning_rate": 8.083188023440765e-05, + "loss": 1.8724, + "step": 10096 + }, + { + "epoch": 3.0991405770411293, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.082796702301522e-05, + "loss": 1.8448, + "step": 10097 + }, + { + "epoch": 3.0994475138121547, + "grad_norm": 0.29358017444610596, + "learning_rate": 8.082405350696276e-05, + "loss": 1.8679, 
+ "step": 10098 + }, + { + "epoch": 3.09975445058318, + "grad_norm": 0.36439722776412964, + "learning_rate": 8.082013968628893e-05, + "loss": 1.8801, + "step": 10099 + }, + { + "epoch": 3.100061387354205, + "grad_norm": 0.3565322458744049, + "learning_rate": 8.081622556103244e-05, + "loss": 1.794, + "step": 10100 + }, + { + "epoch": 3.1003683241252302, + "grad_norm": 0.2841760814189911, + "learning_rate": 8.081231113123191e-05, + "loss": 1.7593, + "step": 10101 + }, + { + "epoch": 3.1006752608962556, + "grad_norm": 0.28589630126953125, + "learning_rate": 8.080839639692608e-05, + "loss": 1.864, + "step": 10102 + }, + { + "epoch": 3.1009821976672804, + "grad_norm": 0.3595057427883148, + "learning_rate": 8.080448135815362e-05, + "loss": 1.8067, + "step": 10103 + }, + { + "epoch": 3.101289134438306, + "grad_norm": 0.3909708261489868, + "learning_rate": 8.080056601495322e-05, + "loss": 1.8601, + "step": 10104 + }, + { + "epoch": 3.1015960712093307, + "grad_norm": 0.35180148482322693, + "learning_rate": 8.079665036736358e-05, + "loss": 1.8328, + "step": 10105 + }, + { + "epoch": 3.101903007980356, + "grad_norm": 0.3065175712108612, + "learning_rate": 8.079273441542338e-05, + "loss": 1.8449, + "step": 10106 + }, + { + "epoch": 3.1022099447513813, + "grad_norm": 0.31358617544174194, + "learning_rate": 8.078881815917134e-05, + "loss": 1.8325, + "step": 10107 + }, + { + "epoch": 3.1025168815224062, + "grad_norm": 0.4737118184566498, + "learning_rate": 8.078490159864614e-05, + "loss": 1.8232, + "step": 10108 + }, + { + "epoch": 3.1028238182934316, + "grad_norm": 0.435148686170578, + "learning_rate": 8.078098473388651e-05, + "loss": 1.8227, + "step": 10109 + }, + { + "epoch": 3.103130755064457, + "grad_norm": 0.3080987334251404, + "learning_rate": 8.077706756493115e-05, + "loss": 1.8072, + "step": 10110 + }, + { + "epoch": 3.103437691835482, + "grad_norm": 0.3225170075893402, + "learning_rate": 8.077315009181876e-05, + "loss": 1.7716, + "step": 10111 + }, + { + "epoch": 
3.103744628606507, + "grad_norm": 0.46642443537712097, + "learning_rate": 8.076923231458808e-05, + "loss": 1.8295, + "step": 10112 + }, + { + "epoch": 3.104051565377532, + "grad_norm": 0.42561766505241394, + "learning_rate": 8.07653142332778e-05, + "loss": 1.8553, + "step": 10113 + }, + { + "epoch": 3.1043585021485574, + "grad_norm": 0.27187541127204895, + "learning_rate": 8.076139584792664e-05, + "loss": 1.7937, + "step": 10114 + }, + { + "epoch": 3.1046654389195827, + "grad_norm": 0.27822238206863403, + "learning_rate": 8.075747715857335e-05, + "loss": 1.8151, + "step": 10115 + }, + { + "epoch": 3.1049723756906076, + "grad_norm": 0.40106478333473206, + "learning_rate": 8.075355816525665e-05, + "loss": 1.8637, + "step": 10116 + }, + { + "epoch": 3.105279312461633, + "grad_norm": 0.33455124497413635, + "learning_rate": 8.074963886801525e-05, + "loss": 1.8543, + "step": 10117 + }, + { + "epoch": 3.1055862492326582, + "grad_norm": 0.32246437668800354, + "learning_rate": 8.07457192668879e-05, + "loss": 1.7907, + "step": 10118 + }, + { + "epoch": 3.105893186003683, + "grad_norm": 0.45360109210014343, + "learning_rate": 8.074179936191332e-05, + "loss": 1.7404, + "step": 10119 + }, + { + "epoch": 3.1062001227747085, + "grad_norm": 0.445916086435318, + "learning_rate": 8.07378791531303e-05, + "loss": 1.778, + "step": 10120 + }, + { + "epoch": 3.1065070595457334, + "grad_norm": 0.28561538457870483, + "learning_rate": 8.073395864057751e-05, + "loss": 1.8723, + "step": 10121 + }, + { + "epoch": 3.1068139963167587, + "grad_norm": 0.3258218467235565, + "learning_rate": 8.073003782429373e-05, + "loss": 1.8106, + "step": 10122 + }, + { + "epoch": 3.107120933087784, + "grad_norm": 0.5459560751914978, + "learning_rate": 8.07261167043177e-05, + "loss": 1.8022, + "step": 10123 + }, + { + "epoch": 3.107427869858809, + "grad_norm": 0.4828549921512604, + "learning_rate": 8.072219528068819e-05, + "loss": 1.7556, + "step": 10124 + }, + { + "epoch": 3.1077348066298343, + "grad_norm": 
0.24075324833393097, + "learning_rate": 8.071827355344393e-05, + "loss": 1.7901, + "step": 10125 + }, + { + "epoch": 3.1080417434008596, + "grad_norm": 0.44677188992500305, + "learning_rate": 8.071435152262367e-05, + "loss": 1.7858, + "step": 10126 + }, + { + "epoch": 3.1083486801718845, + "grad_norm": 0.49862590432167053, + "learning_rate": 8.071042918826622e-05, + "loss": 1.805, + "step": 10127 + }, + { + "epoch": 3.10865561694291, + "grad_norm": 0.30883491039276123, + "learning_rate": 8.07065065504103e-05, + "loss": 1.7693, + "step": 10128 + }, + { + "epoch": 3.108962553713935, + "grad_norm": 0.29583030939102173, + "learning_rate": 8.070258360909467e-05, + "loss": 1.8141, + "step": 10129 + }, + { + "epoch": 3.10926949048496, + "grad_norm": 0.3595346510410309, + "learning_rate": 8.069866036435812e-05, + "loss": 1.8286, + "step": 10130 + }, + { + "epoch": 3.1095764272559854, + "grad_norm": 0.3215504288673401, + "learning_rate": 8.069473681623942e-05, + "loss": 1.8557, + "step": 10131 + }, + { + "epoch": 3.1098833640270103, + "grad_norm": 0.29734939336776733, + "learning_rate": 8.069081296477734e-05, + "loss": 1.7996, + "step": 10132 + }, + { + "epoch": 3.1101903007980356, + "grad_norm": 0.33546003699302673, + "learning_rate": 8.068688881001065e-05, + "loss": 1.8307, + "step": 10133 + }, + { + "epoch": 3.110497237569061, + "grad_norm": 0.3886832296848297, + "learning_rate": 8.068296435197814e-05, + "loss": 1.751, + "step": 10134 + }, + { + "epoch": 3.110804174340086, + "grad_norm": 0.34505394101142883, + "learning_rate": 8.06790395907186e-05, + "loss": 1.7543, + "step": 10135 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.27018141746520996, + "learning_rate": 8.06751145262708e-05, + "loss": 1.8109, + "step": 10136 + }, + { + "epoch": 3.1114180478821365, + "grad_norm": 0.3367149531841278, + "learning_rate": 8.067118915867355e-05, + "loss": 1.8025, + "step": 10137 + }, + { + "epoch": 3.1117249846531614, + "grad_norm": 0.40811091661453247, + "learning_rate": 
8.066726348796562e-05, + "loss": 1.7327, + "step": 10138 + }, + { + "epoch": 3.1120319214241867, + "grad_norm": 0.3511471152305603, + "learning_rate": 8.066333751418583e-05, + "loss": 1.8711, + "step": 10139 + }, + { + "epoch": 3.1123388581952116, + "grad_norm": 0.3112446367740631, + "learning_rate": 8.065941123737295e-05, + "loss": 1.8621, + "step": 10140 + }, + { + "epoch": 3.112645794966237, + "grad_norm": 0.3424238860607147, + "learning_rate": 8.065548465756581e-05, + "loss": 1.8383, + "step": 10141 + }, + { + "epoch": 3.1129527317372623, + "grad_norm": 0.380013108253479, + "learning_rate": 8.06515577748032e-05, + "loss": 1.8121, + "step": 10142 + }, + { + "epoch": 3.113259668508287, + "grad_norm": 0.2650558650493622, + "learning_rate": 8.064763058912393e-05, + "loss": 1.866, + "step": 10143 + }, + { + "epoch": 3.1135666052793125, + "grad_norm": 0.30580762028694153, + "learning_rate": 8.06437031005668e-05, + "loss": 1.7769, + "step": 10144 + }, + { + "epoch": 3.113873542050338, + "grad_norm": 0.29927194118499756, + "learning_rate": 8.063977530917066e-05, + "loss": 1.7897, + "step": 10145 + }, + { + "epoch": 3.1141804788213627, + "grad_norm": 0.24322012066841125, + "learning_rate": 8.063584721497429e-05, + "loss": 1.7968, + "step": 10146 + }, + { + "epoch": 3.114487415592388, + "grad_norm": 0.3082945644855499, + "learning_rate": 8.063191881801651e-05, + "loss": 1.8456, + "step": 10147 + }, + { + "epoch": 3.114794352363413, + "grad_norm": 0.3247329890727997, + "learning_rate": 8.062799011833617e-05, + "loss": 1.7436, + "step": 10148 + }, + { + "epoch": 3.1151012891344383, + "grad_norm": 0.27591946721076965, + "learning_rate": 8.062406111597207e-05, + "loss": 1.7976, + "step": 10149 + }, + { + "epoch": 3.1154082259054636, + "grad_norm": 0.2752058804035187, + "learning_rate": 8.062013181096306e-05, + "loss": 1.7814, + "step": 10150 + }, + { + "epoch": 3.1157151626764885, + "grad_norm": 0.3207196891307831, + "learning_rate": 8.061620220334795e-05, + "loss": 1.7767, 
+ "step": 10151 + }, + { + "epoch": 3.116022099447514, + "grad_norm": 0.2895309627056122, + "learning_rate": 8.061227229316559e-05, + "loss": 1.8588, + "step": 10152 + }, + { + "epoch": 3.116329036218539, + "grad_norm": 0.333843469619751, + "learning_rate": 8.060834208045481e-05, + "loss": 1.7871, + "step": 10153 + }, + { + "epoch": 3.116635972989564, + "grad_norm": 0.43877774477005005, + "learning_rate": 8.060441156525445e-05, + "loss": 1.8165, + "step": 10154 + }, + { + "epoch": 3.1169429097605894, + "grad_norm": 0.35700589418411255, + "learning_rate": 8.060048074760337e-05, + "loss": 1.777, + "step": 10155 + }, + { + "epoch": 3.1172498465316143, + "grad_norm": 0.26124534010887146, + "learning_rate": 8.059654962754039e-05, + "loss": 1.8343, + "step": 10156 + }, + { + "epoch": 3.1175567833026396, + "grad_norm": 0.331444650888443, + "learning_rate": 8.059261820510438e-05, + "loss": 1.9437, + "step": 10157 + }, + { + "epoch": 3.117863720073665, + "grad_norm": 0.31657731533050537, + "learning_rate": 8.058868648033419e-05, + "loss": 1.7621, + "step": 10158 + }, + { + "epoch": 3.11817065684469, + "grad_norm": 0.2785957455635071, + "learning_rate": 8.058475445326867e-05, + "loss": 1.9049, + "step": 10159 + }, + { + "epoch": 3.118477593615715, + "grad_norm": 0.2605743408203125, + "learning_rate": 8.058082212394667e-05, + "loss": 1.7895, + "step": 10160 + }, + { + "epoch": 3.1187845303867405, + "grad_norm": 0.2981378138065338, + "learning_rate": 8.057688949240707e-05, + "loss": 1.8373, + "step": 10161 + }, + { + "epoch": 3.1190914671577654, + "grad_norm": 0.2944273054599762, + "learning_rate": 8.057295655868873e-05, + "loss": 1.8373, + "step": 10162 + }, + { + "epoch": 3.1193984039287908, + "grad_norm": 0.2696721851825714, + "learning_rate": 8.056902332283052e-05, + "loss": 1.8023, + "step": 10163 + }, + { + "epoch": 3.1197053406998156, + "grad_norm": 0.27659857273101807, + "learning_rate": 8.056508978487128e-05, + "loss": 1.8453, + "step": 10164 + }, + { + "epoch": 
3.120012277470841, + "grad_norm": 0.2982441186904907, + "learning_rate": 8.056115594484992e-05, + "loss": 1.9072, + "step": 10165 + }, + { + "epoch": 3.1203192142418663, + "grad_norm": 0.3136404752731323, + "learning_rate": 8.055722180280531e-05, + "loss": 1.8585, + "step": 10166 + }, + { + "epoch": 3.120626151012891, + "grad_norm": 0.2979940176010132, + "learning_rate": 8.055328735877631e-05, + "loss": 1.8699, + "step": 10167 + }, + { + "epoch": 3.1209330877839165, + "grad_norm": 0.2585618793964386, + "learning_rate": 8.054935261280184e-05, + "loss": 1.8323, + "step": 10168 + }, + { + "epoch": 3.121240024554942, + "grad_norm": 0.28734859824180603, + "learning_rate": 8.054541756492075e-05, + "loss": 1.8694, + "step": 10169 + }, + { + "epoch": 3.1215469613259668, + "grad_norm": 0.30582788586616516, + "learning_rate": 8.054148221517193e-05, + "loss": 1.856, + "step": 10170 + }, + { + "epoch": 3.121853898096992, + "grad_norm": 0.3128255009651184, + "learning_rate": 8.053754656359429e-05, + "loss": 1.8329, + "step": 10171 + }, + { + "epoch": 3.122160834868017, + "grad_norm": 0.2845318615436554, + "learning_rate": 8.053361061022671e-05, + "loss": 1.8111, + "step": 10172 + }, + { + "epoch": 3.1224677716390423, + "grad_norm": 0.2994609773159027, + "learning_rate": 8.05296743551081e-05, + "loss": 1.8157, + "step": 10173 + }, + { + "epoch": 3.1227747084100677, + "grad_norm": 0.26397961378097534, + "learning_rate": 8.052573779827737e-05, + "loss": 1.8572, + "step": 10174 + }, + { + "epoch": 3.1230816451810925, + "grad_norm": 0.2911500334739685, + "learning_rate": 8.052180093977339e-05, + "loss": 1.8312, + "step": 10175 + }, + { + "epoch": 3.123388581952118, + "grad_norm": 0.33455008268356323, + "learning_rate": 8.051786377963509e-05, + "loss": 1.8748, + "step": 10176 + }, + { + "epoch": 3.123695518723143, + "grad_norm": 0.3127586841583252, + "learning_rate": 8.051392631790135e-05, + "loss": 1.8224, + "step": 10177 + }, + { + "epoch": 3.124002455494168, + "grad_norm": 
0.2910686433315277, + "learning_rate": 8.050998855461113e-05, + "loss": 1.8557, + "step": 10178 + }, + { + "epoch": 3.1243093922651934, + "grad_norm": 0.2849208414554596, + "learning_rate": 8.050605048980333e-05, + "loss": 1.82, + "step": 10179 + }, + { + "epoch": 3.1246163290362183, + "grad_norm": 0.35189691185951233, + "learning_rate": 8.050211212351683e-05, + "loss": 1.7884, + "step": 10180 + }, + { + "epoch": 3.1249232658072437, + "grad_norm": 0.3641110360622406, + "learning_rate": 8.04981734557906e-05, + "loss": 1.7984, + "step": 10181 + }, + { + "epoch": 3.125230202578269, + "grad_norm": 0.3111717700958252, + "learning_rate": 8.049423448666353e-05, + "loss": 1.8134, + "step": 10182 + }, + { + "epoch": 3.125537139349294, + "grad_norm": 0.2608453631401062, + "learning_rate": 8.049029521617457e-05, + "loss": 1.765, + "step": 10183 + }, + { + "epoch": 3.1258440761203192, + "grad_norm": 0.28779423236846924, + "learning_rate": 8.048635564436265e-05, + "loss": 1.8355, + "step": 10184 + }, + { + "epoch": 3.1261510128913446, + "grad_norm": 0.38227665424346924, + "learning_rate": 8.048241577126668e-05, + "loss": 1.8487, + "step": 10185 + }, + { + "epoch": 3.1264579496623695, + "grad_norm": 0.3603171706199646, + "learning_rate": 8.047847559692562e-05, + "loss": 1.8035, + "step": 10186 + }, + { + "epoch": 3.126764886433395, + "grad_norm": 0.21950066089630127, + "learning_rate": 8.04745351213784e-05, + "loss": 1.7399, + "step": 10187 + }, + { + "epoch": 3.12707182320442, + "grad_norm": 0.2796075642108917, + "learning_rate": 8.047059434466395e-05, + "loss": 1.8229, + "step": 10188 + }, + { + "epoch": 3.127378759975445, + "grad_norm": 0.3382907807826996, + "learning_rate": 8.046665326682125e-05, + "loss": 1.7713, + "step": 10189 + }, + { + "epoch": 3.1276856967464703, + "grad_norm": 0.36472463607788086, + "learning_rate": 8.04627118878892e-05, + "loss": 1.8129, + "step": 10190 + }, + { + "epoch": 3.1279926335174952, + "grad_norm": 0.2971884310245514, + "learning_rate": 
8.045877020790679e-05, + "loss": 1.7894, + "step": 10191 + }, + { + "epoch": 3.1282995702885206, + "grad_norm": 0.2292303442955017, + "learning_rate": 8.045482822691297e-05, + "loss": 1.7637, + "step": 10192 + }, + { + "epoch": 3.128606507059546, + "grad_norm": 0.300750732421875, + "learning_rate": 8.045088594494668e-05, + "loss": 1.7678, + "step": 10193 + }, + { + "epoch": 3.128913443830571, + "grad_norm": 0.3121531009674072, + "learning_rate": 8.044694336204688e-05, + "loss": 1.8651, + "step": 10194 + }, + { + "epoch": 3.129220380601596, + "grad_norm": 0.2456093430519104, + "learning_rate": 8.044300047825254e-05, + "loss": 1.7769, + "step": 10195 + }, + { + "epoch": 3.129527317372621, + "grad_norm": 0.25085800886154175, + "learning_rate": 8.043905729360264e-05, + "loss": 1.7723, + "step": 10196 + }, + { + "epoch": 3.1298342541436464, + "grad_norm": 0.2505287826061249, + "learning_rate": 8.043511380813612e-05, + "loss": 1.7943, + "step": 10197 + }, + { + "epoch": 3.1301411909146717, + "grad_norm": 0.27144530415534973, + "learning_rate": 8.043117002189198e-05, + "loss": 1.8119, + "step": 10198 + }, + { + "epoch": 3.1304481276856966, + "grad_norm": 0.2702989876270294, + "learning_rate": 8.042722593490916e-05, + "loss": 1.8517, + "step": 10199 + }, + { + "epoch": 3.130755064456722, + "grad_norm": 0.2585136890411377, + "learning_rate": 8.042328154722667e-05, + "loss": 1.8382, + "step": 10200 + }, + { + "epoch": 3.1310620012277472, + "grad_norm": 0.26306065917015076, + "learning_rate": 8.041933685888348e-05, + "loss": 1.8211, + "step": 10201 + }, + { + "epoch": 3.131368937998772, + "grad_norm": 0.2208927720785141, + "learning_rate": 8.041539186991858e-05, + "loss": 1.7765, + "step": 10202 + }, + { + "epoch": 3.1316758747697975, + "grad_norm": 0.2756440043449402, + "learning_rate": 8.041144658037095e-05, + "loss": 1.898, + "step": 10203 + }, + { + "epoch": 3.131982811540823, + "grad_norm": 0.29718101024627686, + "learning_rate": 8.040750099027958e-05, + "loss": 1.8226, 
+ "step": 10204 + }, + { + "epoch": 3.1322897483118477, + "grad_norm": 0.3166738748550415, + "learning_rate": 8.040355509968345e-05, + "loss": 1.8129, + "step": 10205 + }, + { + "epoch": 3.132596685082873, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.039960890862158e-05, + "loss": 1.8915, + "step": 10206 + }, + { + "epoch": 3.132903621853898, + "grad_norm": 0.3015006184577942, + "learning_rate": 8.039566241713297e-05, + "loss": 1.8389, + "step": 10207 + }, + { + "epoch": 3.1332105586249233, + "grad_norm": 0.35226619243621826, + "learning_rate": 8.039171562525659e-05, + "loss": 1.7287, + "step": 10208 + }, + { + "epoch": 3.1335174953959486, + "grad_norm": 0.4290136694908142, + "learning_rate": 8.038776853303146e-05, + "loss": 1.8768, + "step": 10209 + }, + { + "epoch": 3.1338244321669735, + "grad_norm": 0.2828960418701172, + "learning_rate": 8.03838211404966e-05, + "loss": 1.7552, + "step": 10210 + }, + { + "epoch": 3.134131368937999, + "grad_norm": 0.3781953752040863, + "learning_rate": 8.0379873447691e-05, + "loss": 1.7812, + "step": 10211 + }, + { + "epoch": 3.1344383057090237, + "grad_norm": 0.4282926023006439, + "learning_rate": 8.037592545465371e-05, + "loss": 1.84, + "step": 10212 + }, + { + "epoch": 3.134745242480049, + "grad_norm": 0.2622411251068115, + "learning_rate": 8.03719771614237e-05, + "loss": 1.8114, + "step": 10213 + }, + { + "epoch": 3.1350521792510744, + "grad_norm": 0.34881457686424255, + "learning_rate": 8.036802856804001e-05, + "loss": 1.7694, + "step": 10214 + }, + { + "epoch": 3.1353591160220993, + "grad_norm": 0.40797632932662964, + "learning_rate": 8.036407967454167e-05, + "loss": 1.7595, + "step": 10215 + }, + { + "epoch": 3.1356660527931246, + "grad_norm": 0.24902814626693726, + "learning_rate": 8.036013048096769e-05, + "loss": 1.8068, + "step": 10216 + }, + { + "epoch": 3.13597298956415, + "grad_norm": 0.3682909607887268, + "learning_rate": 8.035618098735711e-05, + "loss": 1.8519, + "step": 10217 + }, + { + "epoch": 
3.136279926335175, + "grad_norm": 0.6111233234405518, + "learning_rate": 8.035223119374895e-05, + "loss": 1.9254, + "step": 10218 + }, + { + "epoch": 3.1365868631062, + "grad_norm": 0.4793062210083008, + "learning_rate": 8.034828110018227e-05, + "loss": 1.786, + "step": 10219 + }, + { + "epoch": 3.1368937998772255, + "grad_norm": 0.3074932396411896, + "learning_rate": 8.034433070669607e-05, + "loss": 1.8495, + "step": 10220 + }, + { + "epoch": 3.1372007366482504, + "grad_norm": 0.4366479218006134, + "learning_rate": 8.034038001332942e-05, + "loss": 1.8501, + "step": 10221 + }, + { + "epoch": 3.1375076734192757, + "grad_norm": 0.4660070538520813, + "learning_rate": 8.033642902012135e-05, + "loss": 1.8317, + "step": 10222 + }, + { + "epoch": 3.1378146101903006, + "grad_norm": 0.3452899158000946, + "learning_rate": 8.03324777271109e-05, + "loss": 1.8702, + "step": 10223 + }, + { + "epoch": 3.138121546961326, + "grad_norm": 0.3658824563026428, + "learning_rate": 8.032852613433713e-05, + "loss": 1.8754, + "step": 10224 + }, + { + "epoch": 3.1384284837323513, + "grad_norm": 0.3777768909931183, + "learning_rate": 8.03245742418391e-05, + "loss": 1.8613, + "step": 10225 + }, + { + "epoch": 3.138735420503376, + "grad_norm": 0.3873192071914673, + "learning_rate": 8.032062204965582e-05, + "loss": 1.8438, + "step": 10226 + }, + { + "epoch": 3.1390423572744015, + "grad_norm": 0.30686715245246887, + "learning_rate": 8.031666955782641e-05, + "loss": 1.811, + "step": 10227 + }, + { + "epoch": 3.139349294045427, + "grad_norm": 0.2738516330718994, + "learning_rate": 8.03127167663899e-05, + "loss": 1.757, + "step": 10228 + }, + { + "epoch": 3.1396562308164517, + "grad_norm": 0.3093133270740509, + "learning_rate": 8.030876367538536e-05, + "loss": 1.8181, + "step": 10229 + }, + { + "epoch": 3.139963167587477, + "grad_norm": 0.3247159719467163, + "learning_rate": 8.030481028485185e-05, + "loss": 1.7798, + "step": 10230 + }, + { + "epoch": 3.140270104358502, + "grad_norm": 
0.2855088412761688, + "learning_rate": 8.030085659482845e-05, + "loss": 1.825, + "step": 10231 + }, + { + "epoch": 3.1405770411295273, + "grad_norm": 0.2818242907524109, + "learning_rate": 8.02969026053542e-05, + "loss": 1.7737, + "step": 10232 + }, + { + "epoch": 3.1408839779005526, + "grad_norm": 0.27074751257896423, + "learning_rate": 8.029294831646822e-05, + "loss": 1.8306, + "step": 10233 + }, + { + "epoch": 3.1411909146715775, + "grad_norm": 0.29740920662879944, + "learning_rate": 8.028899372820954e-05, + "loss": 1.8157, + "step": 10234 + }, + { + "epoch": 3.141497851442603, + "grad_norm": 0.30743202567100525, + "learning_rate": 8.028503884061731e-05, + "loss": 1.7626, + "step": 10235 + }, + { + "epoch": 3.141804788213628, + "grad_norm": 0.27812567353248596, + "learning_rate": 8.028108365373058e-05, + "loss": 1.7604, + "step": 10236 + }, + { + "epoch": 3.142111724984653, + "grad_norm": 0.26212629675865173, + "learning_rate": 8.027712816758839e-05, + "loss": 1.8161, + "step": 10237 + }, + { + "epoch": 3.1424186617556784, + "grad_norm": 0.3611658811569214, + "learning_rate": 8.02731723822299e-05, + "loss": 1.8283, + "step": 10238 + }, + { + "epoch": 3.1427255985267033, + "grad_norm": 0.31705498695373535, + "learning_rate": 8.026921629769418e-05, + "loss": 1.7986, + "step": 10239 + }, + { + "epoch": 3.1430325352977286, + "grad_norm": 0.25905972719192505, + "learning_rate": 8.026525991402032e-05, + "loss": 1.7926, + "step": 10240 + }, + { + "epoch": 3.143339472068754, + "grad_norm": 0.42376595735549927, + "learning_rate": 8.026130323124741e-05, + "loss": 1.8275, + "step": 10241 + }, + { + "epoch": 3.143646408839779, + "grad_norm": 0.415556401014328, + "learning_rate": 8.025734624941458e-05, + "loss": 1.7938, + "step": 10242 + }, + { + "epoch": 3.143953345610804, + "grad_norm": 0.3558904528617859, + "learning_rate": 8.025338896856091e-05, + "loss": 1.836, + "step": 10243 + }, + { + "epoch": 3.1442602823818295, + "grad_norm": 0.3091062307357788, + "learning_rate": 
8.024943138872553e-05, + "loss": 1.8285, + "step": 10244 + }, + { + "epoch": 3.1445672191528544, + "grad_norm": 0.2620905041694641, + "learning_rate": 8.024547350994753e-05, + "loss": 1.7115, + "step": 10245 + }, + { + "epoch": 3.1448741559238798, + "grad_norm": 0.25716835260391235, + "learning_rate": 8.024151533226604e-05, + "loss": 1.7702, + "step": 10246 + }, + { + "epoch": 3.1451810926949046, + "grad_norm": 0.250844269990921, + "learning_rate": 8.023755685572017e-05, + "loss": 1.7617, + "step": 10247 + }, + { + "epoch": 3.14548802946593, + "grad_norm": 0.23898956179618835, + "learning_rate": 8.023359808034903e-05, + "loss": 1.7872, + "step": 10248 + }, + { + "epoch": 3.1457949662369553, + "grad_norm": 0.2335387021303177, + "learning_rate": 8.022963900619176e-05, + "loss": 1.7656, + "step": 10249 + }, + { + "epoch": 3.14610190300798, + "grad_norm": 0.21822704374790192, + "learning_rate": 8.022567963328749e-05, + "loss": 1.7706, + "step": 10250 + }, + { + "epoch": 3.1464088397790055, + "grad_norm": 0.2627898156642914, + "learning_rate": 8.022171996167531e-05, + "loss": 1.8559, + "step": 10251 + }, + { + "epoch": 3.146715776550031, + "grad_norm": 0.2530064582824707, + "learning_rate": 8.021775999139441e-05, + "loss": 1.788, + "step": 10252 + }, + { + "epoch": 3.1470227133210558, + "grad_norm": 0.2293635457754135, + "learning_rate": 8.021379972248387e-05, + "loss": 1.8129, + "step": 10253 + }, + { + "epoch": 3.147329650092081, + "grad_norm": 0.27753588557243347, + "learning_rate": 8.020983915498286e-05, + "loss": 1.7957, + "step": 10254 + }, + { + "epoch": 3.147636586863106, + "grad_norm": 0.24507668614387512, + "learning_rate": 8.020587828893051e-05, + "loss": 1.7969, + "step": 10255 + }, + { + "epoch": 3.1479435236341313, + "grad_norm": 0.24818891286849976, + "learning_rate": 8.020191712436598e-05, + "loss": 1.8412, + "step": 10256 + }, + { + "epoch": 3.1482504604051567, + "grad_norm": 0.2463149130344391, + "learning_rate": 8.01979556613284e-05, + "loss": 1.8097, 
+ "step": 10257 + }, + { + "epoch": 3.1485573971761815, + "grad_norm": 0.26742151379585266, + "learning_rate": 8.019399389985692e-05, + "loss": 1.8487, + "step": 10258 + }, + { + "epoch": 3.148864333947207, + "grad_norm": 0.3078254461288452, + "learning_rate": 8.01900318399907e-05, + "loss": 1.8189, + "step": 10259 + }, + { + "epoch": 3.149171270718232, + "grad_norm": 0.3819321393966675, + "learning_rate": 8.018606948176887e-05, + "loss": 1.8019, + "step": 10260 + }, + { + "epoch": 3.149478207489257, + "grad_norm": 0.3932126462459564, + "learning_rate": 8.018210682523061e-05, + "loss": 1.787, + "step": 10261 + }, + { + "epoch": 3.1497851442602824, + "grad_norm": 0.2696186900138855, + "learning_rate": 8.017814387041511e-05, + "loss": 1.8345, + "step": 10262 + }, + { + "epoch": 3.150092081031308, + "grad_norm": 0.32631832361221313, + "learning_rate": 8.017418061736149e-05, + "loss": 1.7724, + "step": 10263 + }, + { + "epoch": 3.1503990178023327, + "grad_norm": 0.36187833547592163, + "learning_rate": 8.017021706610893e-05, + "loss": 1.7829, + "step": 10264 + }, + { + "epoch": 3.150705954573358, + "grad_norm": 0.29678142070770264, + "learning_rate": 8.01662532166966e-05, + "loss": 1.7896, + "step": 10265 + }, + { + "epoch": 3.151012891344383, + "grad_norm": 0.2997078001499176, + "learning_rate": 8.016228906916368e-05, + "loss": 1.8401, + "step": 10266 + }, + { + "epoch": 3.1513198281154082, + "grad_norm": 0.4688792824745178, + "learning_rate": 8.015832462354933e-05, + "loss": 1.8263, + "step": 10267 + }, + { + "epoch": 3.1516267648864336, + "grad_norm": 0.42710503935813904, + "learning_rate": 8.015435987989275e-05, + "loss": 1.8233, + "step": 10268 + }, + { + "epoch": 3.1519337016574585, + "grad_norm": 0.2490987628698349, + "learning_rate": 8.01503948382331e-05, + "loss": 1.7792, + "step": 10269 + }, + { + "epoch": 3.152240638428484, + "grad_norm": 0.400836706161499, + "learning_rate": 8.014642949860957e-05, + "loss": 1.8113, + "step": 10270 + }, + { + "epoch": 
3.1525475751995087, + "grad_norm": 0.47995972633361816, + "learning_rate": 8.014246386106138e-05, + "loss": 1.8754, + "step": 10271 + }, + { + "epoch": 3.152854511970534, + "grad_norm": 0.39069879055023193, + "learning_rate": 8.013849792562769e-05, + "loss": 1.8541, + "step": 10272 + }, + { + "epoch": 3.1531614487415593, + "grad_norm": 0.27174463868141174, + "learning_rate": 8.013453169234768e-05, + "loss": 1.8018, + "step": 10273 + }, + { + "epoch": 3.1534683855125842, + "grad_norm": 0.37808045744895935, + "learning_rate": 8.013056516126058e-05, + "loss": 1.8346, + "step": 10274 + }, + { + "epoch": 3.1537753222836096, + "grad_norm": 0.43864908814430237, + "learning_rate": 8.012659833240557e-05, + "loss": 1.7626, + "step": 10275 + }, + { + "epoch": 3.154082259054635, + "grad_norm": 0.3592168688774109, + "learning_rate": 8.012263120582187e-05, + "loss": 1.8261, + "step": 10276 + }, + { + "epoch": 3.15438919582566, + "grad_norm": 0.3056562542915344, + "learning_rate": 8.011866378154866e-05, + "loss": 1.903, + "step": 10277 + }, + { + "epoch": 3.154696132596685, + "grad_norm": 0.2898549735546112, + "learning_rate": 8.011469605962517e-05, + "loss": 1.7781, + "step": 10278 + }, + { + "epoch": 3.1550030693677105, + "grad_norm": 0.3498871624469757, + "learning_rate": 8.011072804009059e-05, + "loss": 1.7571, + "step": 10279 + }, + { + "epoch": 3.1553100061387354, + "grad_norm": 0.3330932557582855, + "learning_rate": 8.010675972298416e-05, + "loss": 1.8298, + "step": 10280 + }, + { + "epoch": 3.1556169429097607, + "grad_norm": 0.2540839910507202, + "learning_rate": 8.010279110834507e-05, + "loss": 1.8327, + "step": 10281 + }, + { + "epoch": 3.1559238796807856, + "grad_norm": 0.3557111322879791, + "learning_rate": 8.009882219621257e-05, + "loss": 1.7611, + "step": 10282 + }, + { + "epoch": 3.156230816451811, + "grad_norm": 0.28293952345848083, + "learning_rate": 8.009485298662584e-05, + "loss": 1.7761, + "step": 10283 + }, + { + "epoch": 3.1565377532228363, + "grad_norm": 
0.27089303731918335, + "learning_rate": 8.009088347962416e-05, + "loss": 1.8081, + "step": 10284 + }, + { + "epoch": 3.156844689993861, + "grad_norm": 0.2689332664012909, + "learning_rate": 8.008691367524673e-05, + "loss": 1.7458, + "step": 10285 + }, + { + "epoch": 3.1571516267648865, + "grad_norm": 0.2495841234922409, + "learning_rate": 8.008294357353278e-05, + "loss": 1.8307, + "step": 10286 + }, + { + "epoch": 3.1574585635359114, + "grad_norm": 0.29242852330207825, + "learning_rate": 8.007897317452156e-05, + "loss": 1.9216, + "step": 10287 + }, + { + "epoch": 3.1577655003069367, + "grad_norm": 0.26574134826660156, + "learning_rate": 8.007500247825229e-05, + "loss": 1.8392, + "step": 10288 + }, + { + "epoch": 3.158072437077962, + "grad_norm": 0.2503872811794281, + "learning_rate": 8.00710314847642e-05, + "loss": 1.7742, + "step": 10289 + }, + { + "epoch": 3.158379373848987, + "grad_norm": 0.25614771246910095, + "learning_rate": 8.006706019409658e-05, + "loss": 1.828, + "step": 10290 + }, + { + "epoch": 3.1586863106200123, + "grad_norm": 0.259369820356369, + "learning_rate": 8.006308860628863e-05, + "loss": 1.8328, + "step": 10291 + }, + { + "epoch": 3.1589932473910376, + "grad_norm": 0.28183647990226746, + "learning_rate": 8.005911672137962e-05, + "loss": 1.8269, + "step": 10292 + }, + { + "epoch": 3.1593001841620625, + "grad_norm": 0.2926514446735382, + "learning_rate": 8.005514453940881e-05, + "loss": 1.8334, + "step": 10293 + }, + { + "epoch": 3.159607120933088, + "grad_norm": 0.34313449263572693, + "learning_rate": 8.005117206041543e-05, + "loss": 1.7866, + "step": 10294 + }, + { + "epoch": 3.159914057704113, + "grad_norm": 0.30971628427505493, + "learning_rate": 8.004719928443875e-05, + "loss": 1.7827, + "step": 10295 + }, + { + "epoch": 3.160220994475138, + "grad_norm": 0.23955371975898743, + "learning_rate": 8.004322621151807e-05, + "loss": 1.7619, + "step": 10296 + }, + { + "epoch": 3.1605279312461634, + "grad_norm": 0.31311795115470886, + 
"learning_rate": 8.003925284169261e-05, + "loss": 1.8247, + "step": 10297 + }, + { + "epoch": 3.1608348680171883, + "grad_norm": 0.3408358097076416, + "learning_rate": 8.003527917500163e-05, + "loss": 1.8146, + "step": 10298 + }, + { + "epoch": 3.1611418047882136, + "grad_norm": 0.3030858337879181, + "learning_rate": 8.003130521148442e-05, + "loss": 1.857, + "step": 10299 + }, + { + "epoch": 3.161448741559239, + "grad_norm": 0.25168511271476746, + "learning_rate": 8.002733095118025e-05, + "loss": 1.8404, + "step": 10300 + }, + { + "epoch": 3.161755678330264, + "grad_norm": 0.2956216335296631, + "learning_rate": 8.002335639412839e-05, + "loss": 1.7352, + "step": 10301 + }, + { + "epoch": 3.162062615101289, + "grad_norm": 0.27791857719421387, + "learning_rate": 8.001938154036814e-05, + "loss": 1.7797, + "step": 10302 + }, + { + "epoch": 3.1623695518723145, + "grad_norm": 0.3106420040130615, + "learning_rate": 8.001540638993876e-05, + "loss": 1.8434, + "step": 10303 + }, + { + "epoch": 3.1626764886433394, + "grad_norm": 0.2940445840358734, + "learning_rate": 8.001143094287954e-05, + "loss": 1.8459, + "step": 10304 + }, + { + "epoch": 3.1629834254143647, + "grad_norm": 0.3857429325580597, + "learning_rate": 8.000745519922977e-05, + "loss": 1.7853, + "step": 10305 + }, + { + "epoch": 3.1632903621853896, + "grad_norm": 0.3585071861743927, + "learning_rate": 8.000347915902874e-05, + "loss": 1.8905, + "step": 10306 + }, + { + "epoch": 3.163597298956415, + "grad_norm": 0.320003867149353, + "learning_rate": 7.999950282231574e-05, + "loss": 1.8397, + "step": 10307 + }, + { + "epoch": 3.1639042357274403, + "grad_norm": 0.24986252188682556, + "learning_rate": 7.999552618913009e-05, + "loss": 1.7916, + "step": 10308 + }, + { + "epoch": 3.164211172498465, + "grad_norm": 0.33077237010002136, + "learning_rate": 7.999154925951104e-05, + "loss": 1.8334, + "step": 10309 + }, + { + "epoch": 3.1645181092694905, + "grad_norm": 0.35700327157974243, + "learning_rate": 
7.998757203349794e-05, + "loss": 1.7773, + "step": 10310 + }, + { + "epoch": 3.164825046040516, + "grad_norm": 0.3095493018627167, + "learning_rate": 7.998359451113007e-05, + "loss": 1.8156, + "step": 10311 + }, + { + "epoch": 3.1651319828115407, + "grad_norm": 0.3004748225212097, + "learning_rate": 7.997961669244673e-05, + "loss": 1.7862, + "step": 10312 + }, + { + "epoch": 3.165438919582566, + "grad_norm": 0.39382806420326233, + "learning_rate": 7.99756385774873e-05, + "loss": 1.764, + "step": 10313 + }, + { + "epoch": 3.165745856353591, + "grad_norm": 0.3109463155269623, + "learning_rate": 7.997166016629099e-05, + "loss": 1.8006, + "step": 10314 + }, + { + "epoch": 3.1660527931246163, + "grad_norm": 0.2896469235420227, + "learning_rate": 7.996768145889717e-05, + "loss": 1.8373, + "step": 10315 + }, + { + "epoch": 3.1663597298956416, + "grad_norm": 0.35024940967559814, + "learning_rate": 7.996370245534517e-05, + "loss": 1.797, + "step": 10316 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.3228827714920044, + "learning_rate": 7.995972315567431e-05, + "loss": 1.7757, + "step": 10317 + }, + { + "epoch": 3.166973603437692, + "grad_norm": 0.27102410793304443, + "learning_rate": 7.995574355992388e-05, + "loss": 1.7786, + "step": 10318 + }, + { + "epoch": 3.167280540208717, + "grad_norm": 0.2556116580963135, + "learning_rate": 7.995176366813325e-05, + "loss": 1.7621, + "step": 10319 + }, + { + "epoch": 3.167587476979742, + "grad_norm": 0.28279444575309753, + "learning_rate": 7.994778348034173e-05, + "loss": 1.7954, + "step": 10320 + }, + { + "epoch": 3.1678944137507674, + "grad_norm": 0.31778639554977417, + "learning_rate": 7.994380299658867e-05, + "loss": 1.7657, + "step": 10321 + }, + { + "epoch": 3.1682013505217923, + "grad_norm": 0.27935469150543213, + "learning_rate": 7.993982221691339e-05, + "loss": 1.7502, + "step": 10322 + }, + { + "epoch": 3.1685082872928176, + "grad_norm": 0.29012617468833923, + "learning_rate": 7.993584114135524e-05, + "loss": 
1.8497, + "step": 10323 + }, + { + "epoch": 3.168815224063843, + "grad_norm": 0.2674056887626648, + "learning_rate": 7.993185976995356e-05, + "loss": 1.7875, + "step": 10324 + }, + { + "epoch": 3.169122160834868, + "grad_norm": 0.2667328417301178, + "learning_rate": 7.992787810274771e-05, + "loss": 1.771, + "step": 10325 + }, + { + "epoch": 3.169429097605893, + "grad_norm": 0.25807151198387146, + "learning_rate": 7.992389613977702e-05, + "loss": 1.7638, + "step": 10326 + }, + { + "epoch": 3.1697360343769185, + "grad_norm": 0.2572930157184601, + "learning_rate": 7.991991388108084e-05, + "loss": 1.8218, + "step": 10327 + }, + { + "epoch": 3.1700429711479434, + "grad_norm": 0.3955067992210388, + "learning_rate": 7.991593132669855e-05, + "loss": 1.8458, + "step": 10328 + }, + { + "epoch": 3.1703499079189688, + "grad_norm": 0.2813466489315033, + "learning_rate": 7.991194847666948e-05, + "loss": 1.8042, + "step": 10329 + }, + { + "epoch": 3.1706568446899936, + "grad_norm": 0.2645012140274048, + "learning_rate": 7.990796533103302e-05, + "loss": 1.8241, + "step": 10330 + }, + { + "epoch": 3.170963781461019, + "grad_norm": 0.28462091088294983, + "learning_rate": 7.99039818898285e-05, + "loss": 1.8853, + "step": 10331 + }, + { + "epoch": 3.1712707182320443, + "grad_norm": 0.2727372944355011, + "learning_rate": 7.98999981530953e-05, + "loss": 1.7564, + "step": 10332 + }, + { + "epoch": 3.171577655003069, + "grad_norm": 0.2658170759677887, + "learning_rate": 7.989601412087281e-05, + "loss": 1.8344, + "step": 10333 + }, + { + "epoch": 3.1718845917740945, + "grad_norm": 0.29713502526283264, + "learning_rate": 7.989202979320039e-05, + "loss": 1.8721, + "step": 10334 + }, + { + "epoch": 3.17219152854512, + "grad_norm": 0.26609495282173157, + "learning_rate": 7.98880451701174e-05, + "loss": 1.7991, + "step": 10335 + }, + { + "epoch": 3.1724984653161448, + "grad_norm": 0.29779741168022156, + "learning_rate": 7.988406025166322e-05, + "loss": 1.8182, + "step": 10336 + }, + { + 
"epoch": 3.17280540208717, + "grad_norm": 0.2771340012550354, + "learning_rate": 7.988007503787724e-05, + "loss": 1.8034, + "step": 10337 + }, + { + "epoch": 3.1731123388581954, + "grad_norm": 0.30510422587394714, + "learning_rate": 7.987608952879886e-05, + "loss": 1.8477, + "step": 10338 + }, + { + "epoch": 3.1734192756292203, + "grad_norm": 0.3097476363182068, + "learning_rate": 7.987210372446745e-05, + "loss": 1.7572, + "step": 10339 + }, + { + "epoch": 3.1737262124002457, + "grad_norm": 0.2553942799568176, + "learning_rate": 7.986811762492239e-05, + "loss": 1.7837, + "step": 10340 + }, + { + "epoch": 3.1740331491712706, + "grad_norm": 0.26546719670295715, + "learning_rate": 7.986413123020312e-05, + "loss": 1.7893, + "step": 10341 + }, + { + "epoch": 3.174340085942296, + "grad_norm": 0.37721553444862366, + "learning_rate": 7.986014454034895e-05, + "loss": 1.8475, + "step": 10342 + }, + { + "epoch": 3.174647022713321, + "grad_norm": 0.3215494453907013, + "learning_rate": 7.985615755539937e-05, + "loss": 1.7806, + "step": 10343 + }, + { + "epoch": 3.174953959484346, + "grad_norm": 0.2662442922592163, + "learning_rate": 7.985217027539373e-05, + "loss": 1.8116, + "step": 10344 + }, + { + "epoch": 3.1752608962553714, + "grad_norm": 0.23334236443042755, + "learning_rate": 7.984818270037145e-05, + "loss": 1.7929, + "step": 10345 + }, + { + "epoch": 3.1755678330263963, + "grad_norm": 0.2873367667198181, + "learning_rate": 7.98441948303719e-05, + "loss": 1.7808, + "step": 10346 + }, + { + "epoch": 3.1758747697974217, + "grad_norm": 0.3623826801776886, + "learning_rate": 7.984020666543458e-05, + "loss": 1.8817, + "step": 10347 + }, + { + "epoch": 3.176181706568447, + "grad_norm": 0.3060589134693146, + "learning_rate": 7.983621820559881e-05, + "loss": 1.796, + "step": 10348 + }, + { + "epoch": 3.176488643339472, + "grad_norm": 0.2396882325410843, + "learning_rate": 7.983222945090407e-05, + "loss": 1.7455, + "step": 10349 + }, + { + "epoch": 3.1767955801104972, + 
"grad_norm": 0.24811476469039917, + "learning_rate": 7.982824040138974e-05, + "loss": 1.7907, + "step": 10350 + }, + { + "epoch": 3.1771025168815226, + "grad_norm": 0.32749706506729126, + "learning_rate": 7.982425105709524e-05, + "loss": 1.8553, + "step": 10351 + }, + { + "epoch": 3.1774094536525475, + "grad_norm": 0.3648095726966858, + "learning_rate": 7.982026141806003e-05, + "loss": 1.8387, + "step": 10352 + }, + { + "epoch": 3.177716390423573, + "grad_norm": 0.2749348282814026, + "learning_rate": 7.981627148432352e-05, + "loss": 1.7676, + "step": 10353 + }, + { + "epoch": 3.178023327194598, + "grad_norm": 0.2735142409801483, + "learning_rate": 7.981228125592513e-05, + "loss": 1.822, + "step": 10354 + }, + { + "epoch": 3.178330263965623, + "grad_norm": 0.28759655356407166, + "learning_rate": 7.98082907329043e-05, + "loss": 1.8113, + "step": 10355 + }, + { + "epoch": 3.1786372007366483, + "grad_norm": 0.33661654591560364, + "learning_rate": 7.980429991530048e-05, + "loss": 1.8036, + "step": 10356 + }, + { + "epoch": 3.1789441375076732, + "grad_norm": 0.2634892761707306, + "learning_rate": 7.98003088031531e-05, + "loss": 1.8323, + "step": 10357 + }, + { + "epoch": 3.1792510742786986, + "grad_norm": 0.25864094495773315, + "learning_rate": 7.979631739650158e-05, + "loss": 1.8199, + "step": 10358 + }, + { + "epoch": 3.179558011049724, + "grad_norm": 0.27368444204330444, + "learning_rate": 7.979232569538541e-05, + "loss": 1.7673, + "step": 10359 + }, + { + "epoch": 3.179864947820749, + "grad_norm": 0.2506616413593292, + "learning_rate": 7.9788333699844e-05, + "loss": 1.7912, + "step": 10360 + }, + { + "epoch": 3.180171884591774, + "grad_norm": 0.2539178133010864, + "learning_rate": 7.978434140991684e-05, + "loss": 1.7934, + "step": 10361 + }, + { + "epoch": 3.1804788213627995, + "grad_norm": 0.2605626881122589, + "learning_rate": 7.978034882564334e-05, + "loss": 1.8031, + "step": 10362 + }, + { + "epoch": 3.1807857581338244, + "grad_norm": 0.2610207796096802, + 
"learning_rate": 7.977635594706299e-05, + "loss": 1.8664, + "step": 10363 + }, + { + "epoch": 3.1810926949048497, + "grad_norm": 0.26164132356643677, + "learning_rate": 7.977236277421523e-05, + "loss": 1.7758, + "step": 10364 + }, + { + "epoch": 3.1813996316758746, + "grad_norm": 0.3122340142726898, + "learning_rate": 7.976836930713953e-05, + "loss": 1.9033, + "step": 10365 + }, + { + "epoch": 3.1817065684469, + "grad_norm": 0.3317202031612396, + "learning_rate": 7.976437554587537e-05, + "loss": 1.7899, + "step": 10366 + }, + { + "epoch": 3.1820135052179253, + "grad_norm": 0.28612568974494934, + "learning_rate": 7.97603814904622e-05, + "loss": 1.8145, + "step": 10367 + }, + { + "epoch": 3.18232044198895, + "grad_norm": 0.349917471408844, + "learning_rate": 7.975638714093949e-05, + "loss": 1.877, + "step": 10368 + }, + { + "epoch": 3.1826273787599755, + "grad_norm": 0.3737771809101105, + "learning_rate": 7.975239249734672e-05, + "loss": 1.8204, + "step": 10369 + }, + { + "epoch": 3.182934315531001, + "grad_norm": 0.3688446879386902, + "learning_rate": 7.974839755972339e-05, + "loss": 1.8487, + "step": 10370 + }, + { + "epoch": 3.1832412523020257, + "grad_norm": 0.2934897541999817, + "learning_rate": 7.974440232810894e-05, + "loss": 1.8243, + "step": 10371 + }, + { + "epoch": 3.183548189073051, + "grad_norm": 0.2596173882484436, + "learning_rate": 7.974040680254287e-05, + "loss": 1.7887, + "step": 10372 + }, + { + "epoch": 3.183855125844076, + "grad_norm": 0.35686594247817993, + "learning_rate": 7.973641098306468e-05, + "loss": 1.8653, + "step": 10373 + }, + { + "epoch": 3.1841620626151013, + "grad_norm": 0.3187713921070099, + "learning_rate": 7.973241486971383e-05, + "loss": 1.8767, + "step": 10374 + }, + { + "epoch": 3.1844689993861266, + "grad_norm": 0.2596273124217987, + "learning_rate": 7.972841846252985e-05, + "loss": 1.8028, + "step": 10375 + }, + { + "epoch": 3.1847759361571515, + "grad_norm": 0.2637474834918976, + "learning_rate": 7.972442176155221e-05, + 
"loss": 1.802, + "step": 10376 + }, + { + "epoch": 3.185082872928177, + "grad_norm": 0.2641126215457916, + "learning_rate": 7.97204247668204e-05, + "loss": 1.7931, + "step": 10377 + }, + { + "epoch": 3.185389809699202, + "grad_norm": 0.25594159960746765, + "learning_rate": 7.971642747837393e-05, + "loss": 1.818, + "step": 10378 + }, + { + "epoch": 3.185696746470227, + "grad_norm": 0.26567938923835754, + "learning_rate": 7.971242989625233e-05, + "loss": 1.8174, + "step": 10379 + }, + { + "epoch": 3.1860036832412524, + "grad_norm": 0.29580214619636536, + "learning_rate": 7.970843202049508e-05, + "loss": 1.869, + "step": 10380 + }, + { + "epoch": 3.1863106200122773, + "grad_norm": 0.2657530605792999, + "learning_rate": 7.970443385114168e-05, + "loss": 1.8352, + "step": 10381 + }, + { + "epoch": 3.1866175567833026, + "grad_norm": 0.2468358278274536, + "learning_rate": 7.970043538823165e-05, + "loss": 1.7851, + "step": 10382 + }, + { + "epoch": 3.186924493554328, + "grad_norm": 0.26464715600013733, + "learning_rate": 7.969643663180451e-05, + "loss": 1.8208, + "step": 10383 + }, + { + "epoch": 3.187231430325353, + "grad_norm": 0.26035723090171814, + "learning_rate": 7.969243758189979e-05, + "loss": 1.8089, + "step": 10384 + }, + { + "epoch": 3.187538367096378, + "grad_norm": 0.2644619941711426, + "learning_rate": 7.968843823855699e-05, + "loss": 1.8379, + "step": 10385 + }, + { + "epoch": 3.1878453038674035, + "grad_norm": 0.25576624274253845, + "learning_rate": 7.968443860181565e-05, + "loss": 1.7932, + "step": 10386 + }, + { + "epoch": 3.1881522406384284, + "grad_norm": 0.24276074767112732, + "learning_rate": 7.968043867171528e-05, + "loss": 1.8037, + "step": 10387 + }, + { + "epoch": 3.1884591774094537, + "grad_norm": 0.27156540751457214, + "learning_rate": 7.967643844829543e-05, + "loss": 1.7998, + "step": 10388 + }, + { + "epoch": 3.1887661141804786, + "grad_norm": 0.2555428743362427, + "learning_rate": 7.96724379315956e-05, + "loss": 1.7612, + "step": 10389 + }, + 
{ + "epoch": 3.189073050951504, + "grad_norm": 0.3358438014984131, + "learning_rate": 7.966843712165537e-05, + "loss": 1.8543, + "step": 10390 + }, + { + "epoch": 3.1893799877225293, + "grad_norm": 0.2799586355686188, + "learning_rate": 7.966443601851424e-05, + "loss": 1.819, + "step": 10391 + }, + { + "epoch": 3.189686924493554, + "grad_norm": 0.2364189177751541, + "learning_rate": 7.966043462221178e-05, + "loss": 1.8537, + "step": 10392 + }, + { + "epoch": 3.1899938612645795, + "grad_norm": 0.23849403858184814, + "learning_rate": 7.96564329327875e-05, + "loss": 1.8125, + "step": 10393 + }, + { + "epoch": 3.190300798035605, + "grad_norm": 0.2371583878993988, + "learning_rate": 7.965243095028098e-05, + "loss": 1.7352, + "step": 10394 + }, + { + "epoch": 3.1906077348066297, + "grad_norm": 0.2584737539291382, + "learning_rate": 7.964842867473176e-05, + "loss": 1.8801, + "step": 10395 + }, + { + "epoch": 3.190914671577655, + "grad_norm": 0.27768051624298096, + "learning_rate": 7.964442610617939e-05, + "loss": 1.8221, + "step": 10396 + }, + { + "epoch": 3.1912216083486804, + "grad_norm": 0.2680891752243042, + "learning_rate": 7.964042324466341e-05, + "loss": 1.8371, + "step": 10397 + }, + { + "epoch": 3.1915285451197053, + "grad_norm": 0.25301921367645264, + "learning_rate": 7.963642009022343e-05, + "loss": 1.7972, + "step": 10398 + }, + { + "epoch": 3.1918354818907306, + "grad_norm": 0.2589731216430664, + "learning_rate": 7.963241664289896e-05, + "loss": 1.8145, + "step": 10399 + }, + { + "epoch": 3.1921424186617555, + "grad_norm": 0.2611297369003296, + "learning_rate": 7.962841290272956e-05, + "loss": 1.8736, + "step": 10400 + }, + { + "epoch": 3.192449355432781, + "grad_norm": 0.2812272906303406, + "learning_rate": 7.962440886975483e-05, + "loss": 1.8116, + "step": 10401 + }, + { + "epoch": 3.192756292203806, + "grad_norm": 0.3261657655239105, + "learning_rate": 7.962040454401434e-05, + "loss": 1.7935, + "step": 10402 + }, + { + "epoch": 3.193063228974831, + 
"grad_norm": 0.3355373442173004, + "learning_rate": 7.961639992554764e-05, + "loss": 1.7957, + "step": 10403 + }, + { + "epoch": 3.1933701657458564, + "grad_norm": 0.2811843156814575, + "learning_rate": 7.961239501439432e-05, + "loss": 1.797, + "step": 10404 + }, + { + "epoch": 3.1936771025168813, + "grad_norm": 0.24933238327503204, + "learning_rate": 7.960838981059395e-05, + "loss": 1.7594, + "step": 10405 + }, + { + "epoch": 3.1939840392879066, + "grad_norm": 0.29110121726989746, + "learning_rate": 7.960438431418613e-05, + "loss": 1.8268, + "step": 10406 + }, + { + "epoch": 3.194290976058932, + "grad_norm": 0.3702283799648285, + "learning_rate": 7.960037852521043e-05, + "loss": 1.7629, + "step": 10407 + }, + { + "epoch": 3.194597912829957, + "grad_norm": 0.33275437355041504, + "learning_rate": 7.959637244370644e-05, + "loss": 1.8507, + "step": 10408 + }, + { + "epoch": 3.194904849600982, + "grad_norm": 0.2691981792449951, + "learning_rate": 7.959236606971375e-05, + "loss": 1.8084, + "step": 10409 + }, + { + "epoch": 3.1952117863720075, + "grad_norm": 0.30108413100242615, + "learning_rate": 7.958835940327194e-05, + "loss": 1.8525, + "step": 10410 + }, + { + "epoch": 3.1955187231430324, + "grad_norm": 0.32112306356430054, + "learning_rate": 7.958435244442064e-05, + "loss": 1.7431, + "step": 10411 + }, + { + "epoch": 3.1958256599140578, + "grad_norm": 0.2795291543006897, + "learning_rate": 7.958034519319942e-05, + "loss": 1.7985, + "step": 10412 + }, + { + "epoch": 3.196132596685083, + "grad_norm": 0.2485792338848114, + "learning_rate": 7.957633764964788e-05, + "loss": 1.7363, + "step": 10413 + }, + { + "epoch": 3.196439533456108, + "grad_norm": 0.3552432358264923, + "learning_rate": 7.957232981380565e-05, + "loss": 1.8174, + "step": 10414 + }, + { + "epoch": 3.1967464702271333, + "grad_norm": 0.3829655051231384, + "learning_rate": 7.956832168571234e-05, + "loss": 1.9249, + "step": 10415 + }, + { + "epoch": 3.197053406998158, + "grad_norm": 0.2498074769973755, + 
"learning_rate": 7.956431326540752e-05, + "loss": 1.8104, + "step": 10416 + }, + { + "epoch": 3.1973603437691835, + "grad_norm": 0.24596504867076874, + "learning_rate": 7.956030455293082e-05, + "loss": 1.8007, + "step": 10417 + }, + { + "epoch": 3.197667280540209, + "grad_norm": 0.2795363664627075, + "learning_rate": 7.95562955483219e-05, + "loss": 1.775, + "step": 10418 + }, + { + "epoch": 3.1979742173112338, + "grad_norm": 0.3581138253211975, + "learning_rate": 7.95522862516203e-05, + "loss": 1.8567, + "step": 10419 + }, + { + "epoch": 3.198281154082259, + "grad_norm": 0.36102500557899475, + "learning_rate": 7.95482766628657e-05, + "loss": 1.8509, + "step": 10420 + }, + { + "epoch": 3.198588090853284, + "grad_norm": 0.4717029929161072, + "learning_rate": 7.954426678209774e-05, + "loss": 1.8218, + "step": 10421 + }, + { + "epoch": 3.1988950276243093, + "grad_norm": 0.3211984932422638, + "learning_rate": 7.9540256609356e-05, + "loss": 1.8696, + "step": 10422 + }, + { + "epoch": 3.1992019643953347, + "grad_norm": 0.30094626545906067, + "learning_rate": 7.953624614468011e-05, + "loss": 1.8714, + "step": 10423 + }, + { + "epoch": 3.1995089011663596, + "grad_norm": 0.267578125, + "learning_rate": 7.953223538810976e-05, + "loss": 1.7903, + "step": 10424 + }, + { + "epoch": 3.199815837937385, + "grad_norm": 0.35577845573425293, + "learning_rate": 7.952822433968453e-05, + "loss": 1.7808, + "step": 10425 + }, + { + "epoch": 3.2001227747084102, + "grad_norm": 0.4117741882801056, + "learning_rate": 7.952421299944408e-05, + "loss": 1.7856, + "step": 10426 + }, + { + "epoch": 3.200429711479435, + "grad_norm": 0.35202035307884216, + "learning_rate": 7.952020136742806e-05, + "loss": 1.8112, + "step": 10427 + }, + { + "epoch": 3.2007366482504604, + "grad_norm": 0.26514917612075806, + "learning_rate": 7.951618944367611e-05, + "loss": 1.828, + "step": 10428 + }, + { + "epoch": 3.201043585021486, + "grad_norm": 0.29219159483909607, + "learning_rate": 7.951217722822786e-05, + "loss": 
1.9366, + "step": 10429 + }, + { + "epoch": 3.2013505217925107, + "grad_norm": 0.2929961383342743, + "learning_rate": 7.950816472112298e-05, + "loss": 1.8006, + "step": 10430 + }, + { + "epoch": 3.201657458563536, + "grad_norm": 0.28339722752571106, + "learning_rate": 7.950415192240114e-05, + "loss": 1.7411, + "step": 10431 + }, + { + "epoch": 3.201964395334561, + "grad_norm": 0.258884996175766, + "learning_rate": 7.950013883210196e-05, + "loss": 1.8153, + "step": 10432 + }, + { + "epoch": 3.2022713321055862, + "grad_norm": 0.3065929114818573, + "learning_rate": 7.949612545026512e-05, + "loss": 1.7918, + "step": 10433 + }, + { + "epoch": 3.2025782688766116, + "grad_norm": 0.289874404668808, + "learning_rate": 7.949211177693029e-05, + "loss": 1.7975, + "step": 10434 + }, + { + "epoch": 3.2028852056476365, + "grad_norm": 0.27025631070137024, + "learning_rate": 7.948809781213711e-05, + "loss": 1.8129, + "step": 10435 + }, + { + "epoch": 3.203192142418662, + "grad_norm": 0.2501074969768524, + "learning_rate": 7.948408355592528e-05, + "loss": 1.7653, + "step": 10436 + }, + { + "epoch": 3.203499079189687, + "grad_norm": 0.30402958393096924, + "learning_rate": 7.948006900833445e-05, + "loss": 1.8311, + "step": 10437 + }, + { + "epoch": 3.203806015960712, + "grad_norm": 0.28783223032951355, + "learning_rate": 7.94760541694043e-05, + "loss": 1.82, + "step": 10438 + }, + { + "epoch": 3.2041129527317374, + "grad_norm": 0.30428317189216614, + "learning_rate": 7.947203903917451e-05, + "loss": 1.8673, + "step": 10439 + }, + { + "epoch": 3.2044198895027622, + "grad_norm": 0.2860367000102997, + "learning_rate": 7.946802361768473e-05, + "loss": 1.824, + "step": 10440 + }, + { + "epoch": 3.2047268262737876, + "grad_norm": 0.2995273172855377, + "learning_rate": 7.946400790497469e-05, + "loss": 1.7342, + "step": 10441 + }, + { + "epoch": 3.205033763044813, + "grad_norm": 0.4374088943004608, + "learning_rate": 7.945999190108407e-05, + "loss": 1.8522, + "step": 10442 + }, + { + "epoch": 
3.205340699815838, + "grad_norm": 0.37659478187561035, + "learning_rate": 7.945597560605252e-05, + "loss": 1.7518, + "step": 10443 + }, + { + "epoch": 3.205647636586863, + "grad_norm": 0.24257932603359222, + "learning_rate": 7.945195901991975e-05, + "loss": 1.7892, + "step": 10444 + }, + { + "epoch": 3.2059545733578885, + "grad_norm": 0.3682694435119629, + "learning_rate": 7.944794214272546e-05, + "loss": 1.7757, + "step": 10445 + }, + { + "epoch": 3.2062615101289134, + "grad_norm": 0.434692919254303, + "learning_rate": 7.944392497450936e-05, + "loss": 1.8207, + "step": 10446 + }, + { + "epoch": 3.2065684468999387, + "grad_norm": 0.3982211947441101, + "learning_rate": 7.943990751531113e-05, + "loss": 1.8303, + "step": 10447 + }, + { + "epoch": 3.2068753836709636, + "grad_norm": 0.2877334654331207, + "learning_rate": 7.943588976517049e-05, + "loss": 1.8495, + "step": 10448 + }, + { + "epoch": 3.207182320441989, + "grad_norm": 0.34589654207229614, + "learning_rate": 7.943187172412712e-05, + "loss": 1.7773, + "step": 10449 + }, + { + "epoch": 3.2074892572130143, + "grad_norm": 0.4727517366409302, + "learning_rate": 7.942785339222074e-05, + "loss": 1.8702, + "step": 10450 + }, + { + "epoch": 3.207796193984039, + "grad_norm": 0.4019354581832886, + "learning_rate": 7.942383476949107e-05, + "loss": 1.8095, + "step": 10451 + }, + { + "epoch": 3.2081031307550645, + "grad_norm": 0.2726243734359741, + "learning_rate": 7.941981585597782e-05, + "loss": 1.7273, + "step": 10452 + }, + { + "epoch": 3.20841006752609, + "grad_norm": 0.2944760024547577, + "learning_rate": 7.941579665172072e-05, + "loss": 1.7507, + "step": 10453 + }, + { + "epoch": 3.2087170042971147, + "grad_norm": 0.3530777096748352, + "learning_rate": 7.941177715675945e-05, + "loss": 1.8434, + "step": 10454 + }, + { + "epoch": 3.20902394106814, + "grad_norm": 0.28612539172172546, + "learning_rate": 7.940775737113378e-05, + "loss": 1.8094, + "step": 10455 + }, + { + "epoch": 3.209330877839165, + "grad_norm": 
0.27006468176841736, + "learning_rate": 7.94037372948834e-05, + "loss": 1.7854, + "step": 10456 + }, + { + "epoch": 3.2096378146101903, + "grad_norm": 0.3027147054672241, + "learning_rate": 7.939971692804806e-05, + "loss": 1.7596, + "step": 10457 + }, + { + "epoch": 3.2099447513812156, + "grad_norm": 0.31999528408050537, + "learning_rate": 7.939569627066749e-05, + "loss": 1.8836, + "step": 10458 + }, + { + "epoch": 3.2102516881522405, + "grad_norm": 0.267600417137146, + "learning_rate": 7.939167532278142e-05, + "loss": 1.8508, + "step": 10459 + }, + { + "epoch": 3.210558624923266, + "grad_norm": 0.3171706795692444, + "learning_rate": 7.938765408442958e-05, + "loss": 1.7507, + "step": 10460 + }, + { + "epoch": 3.210865561694291, + "grad_norm": 0.2955280840396881, + "learning_rate": 7.938363255565171e-05, + "loss": 1.733, + "step": 10461 + }, + { + "epoch": 3.211172498465316, + "grad_norm": 0.3427969217300415, + "learning_rate": 7.937961073648759e-05, + "loss": 1.9208, + "step": 10462 + }, + { + "epoch": 3.2114794352363414, + "grad_norm": 0.28788647055625916, + "learning_rate": 7.937558862697692e-05, + "loss": 1.7723, + "step": 10463 + }, + { + "epoch": 3.2117863720073663, + "grad_norm": 0.26093682646751404, + "learning_rate": 7.937156622715945e-05, + "loss": 1.803, + "step": 10464 + }, + { + "epoch": 3.2120933087783916, + "grad_norm": 0.2791301906108856, + "learning_rate": 7.936754353707497e-05, + "loss": 1.7601, + "step": 10465 + }, + { + "epoch": 3.212400245549417, + "grad_norm": 0.3039831519126892, + "learning_rate": 7.93635205567632e-05, + "loss": 1.7864, + "step": 10466 + }, + { + "epoch": 3.212707182320442, + "grad_norm": 0.28498128056526184, + "learning_rate": 7.935949728626392e-05, + "loss": 1.7745, + "step": 10467 + }, + { + "epoch": 3.213014119091467, + "grad_norm": 0.2908780872821808, + "learning_rate": 7.935547372561687e-05, + "loss": 1.8281, + "step": 10468 + }, + { + "epoch": 3.2133210558624925, + "grad_norm": 0.26148509979248047, + "learning_rate": 
7.935144987486183e-05, + "loss": 1.8545, + "step": 10469 + }, + { + "epoch": 3.2136279926335174, + "grad_norm": 0.2853962481021881, + "learning_rate": 7.934742573403856e-05, + "loss": 1.7765, + "step": 10470 + }, + { + "epoch": 3.2139349294045427, + "grad_norm": 0.26497501134872437, + "learning_rate": 7.934340130318681e-05, + "loss": 1.7472, + "step": 10471 + }, + { + "epoch": 3.214241866175568, + "grad_norm": 0.2806912660598755, + "learning_rate": 7.933937658234638e-05, + "loss": 1.7879, + "step": 10472 + }, + { + "epoch": 3.214548802946593, + "grad_norm": 0.2699974477291107, + "learning_rate": 7.933535157155705e-05, + "loss": 1.7539, + "step": 10473 + }, + { + "epoch": 3.2148557397176183, + "grad_norm": 0.22714731097221375, + "learning_rate": 7.933132627085856e-05, + "loss": 1.7861, + "step": 10474 + }, + { + "epoch": 3.215162676488643, + "grad_norm": 0.291340708732605, + "learning_rate": 7.932730068029072e-05, + "loss": 1.8381, + "step": 10475 + }, + { + "epoch": 3.2154696132596685, + "grad_norm": 0.3257324695587158, + "learning_rate": 7.93232747998933e-05, + "loss": 1.8293, + "step": 10476 + }, + { + "epoch": 3.215776550030694, + "grad_norm": 0.3518911600112915, + "learning_rate": 7.93192486297061e-05, + "loss": 1.853, + "step": 10477 + }, + { + "epoch": 3.2160834868017187, + "grad_norm": 0.27663540840148926, + "learning_rate": 7.93152221697689e-05, + "loss": 1.7831, + "step": 10478 + }, + { + "epoch": 3.216390423572744, + "grad_norm": 0.3153248429298401, + "learning_rate": 7.931119542012149e-05, + "loss": 1.7443, + "step": 10479 + }, + { + "epoch": 3.216697360343769, + "grad_norm": 0.2919597029685974, + "learning_rate": 7.930716838080368e-05, + "loss": 1.8108, + "step": 10480 + }, + { + "epoch": 3.2170042971147943, + "grad_norm": 0.26892516016960144, + "learning_rate": 7.930314105185524e-05, + "loss": 1.7791, + "step": 10481 + }, + { + "epoch": 3.2173112338858196, + "grad_norm": 0.2486005276441574, + "learning_rate": 7.929911343331599e-05, + "loss": 1.8184, + 
"step": 10482 + }, + { + "epoch": 3.2176181706568445, + "grad_norm": 0.260728120803833, + "learning_rate": 7.929508552522571e-05, + "loss": 1.7933, + "step": 10483 + }, + { + "epoch": 3.21792510742787, + "grad_norm": 0.3081948757171631, + "learning_rate": 7.929105732762425e-05, + "loss": 1.7732, + "step": 10484 + }, + { + "epoch": 3.218232044198895, + "grad_norm": 0.3807671368122101, + "learning_rate": 7.928702884055138e-05, + "loss": 1.7652, + "step": 10485 + }, + { + "epoch": 3.21853898096992, + "grad_norm": 0.31637755036354065, + "learning_rate": 7.928300006404692e-05, + "loss": 1.7605, + "step": 10486 + }, + { + "epoch": 3.2188459177409454, + "grad_norm": 0.2812853455543518, + "learning_rate": 7.927897099815071e-05, + "loss": 1.7925, + "step": 10487 + }, + { + "epoch": 3.2191528545119708, + "grad_norm": 0.3472350239753723, + "learning_rate": 7.927494164290253e-05, + "loss": 1.8252, + "step": 10488 + }, + { + "epoch": 3.2194597912829956, + "grad_norm": 0.4202714264392853, + "learning_rate": 7.927091199834222e-05, + "loss": 1.7993, + "step": 10489 + }, + { + "epoch": 3.219766728054021, + "grad_norm": 0.44552353024482727, + "learning_rate": 7.92668820645096e-05, + "loss": 1.8609, + "step": 10490 + }, + { + "epoch": 3.220073664825046, + "grad_norm": 0.38964664936065674, + "learning_rate": 7.926285184144451e-05, + "loss": 1.864, + "step": 10491 + }, + { + "epoch": 3.220380601596071, + "grad_norm": 0.2978462278842926, + "learning_rate": 7.925882132918676e-05, + "loss": 1.7892, + "step": 10492 + }, + { + "epoch": 3.2206875383670965, + "grad_norm": 0.2520316243171692, + "learning_rate": 7.925479052777619e-05, + "loss": 1.7702, + "step": 10493 + }, + { + "epoch": 3.2209944751381214, + "grad_norm": 0.28151068091392517, + "learning_rate": 7.925075943725263e-05, + "loss": 1.7613, + "step": 10494 + }, + { + "epoch": 3.2213014119091468, + "grad_norm": 0.3346099555492401, + "learning_rate": 7.924672805765592e-05, + "loss": 1.894, + "step": 10495 + }, + { + "epoch": 
3.2216083486801717, + "grad_norm": 0.2981362044811249, + "learning_rate": 7.924269638902591e-05, + "loss": 1.8157, + "step": 10496 + }, + { + "epoch": 3.221915285451197, + "grad_norm": 0.2561499774456024, + "learning_rate": 7.923866443140242e-05, + "loss": 1.8259, + "step": 10497 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.26480481028556824, + "learning_rate": 7.923463218482532e-05, + "loss": 1.7856, + "step": 10498 + }, + { + "epoch": 3.222529158993247, + "grad_norm": 0.24103692173957825, + "learning_rate": 7.923059964933446e-05, + "loss": 1.7765, + "step": 10499 + }, + { + "epoch": 3.2228360957642725, + "grad_norm": 0.2399173080921173, + "learning_rate": 7.922656682496967e-05, + "loss": 1.8216, + "step": 10500 + }, + { + "epoch": 3.223143032535298, + "grad_norm": 0.24530018866062164, + "learning_rate": 7.922253371177082e-05, + "loss": 1.8155, + "step": 10501 + }, + { + "epoch": 3.2234499693063228, + "grad_norm": 0.23298653960227966, + "learning_rate": 7.921850030977775e-05, + "loss": 1.7843, + "step": 10502 + }, + { + "epoch": 3.223756906077348, + "grad_norm": 0.3053973317146301, + "learning_rate": 7.921446661903035e-05, + "loss": 1.8113, + "step": 10503 + }, + { + "epoch": 3.2240638428483734, + "grad_norm": 0.261336088180542, + "learning_rate": 7.921043263956847e-05, + "loss": 1.8073, + "step": 10504 + }, + { + "epoch": 3.2243707796193983, + "grad_norm": 0.24877268075942993, + "learning_rate": 7.920639837143195e-05, + "loss": 1.8344, + "step": 10505 + }, + { + "epoch": 3.2246777163904237, + "grad_norm": 0.26784422993659973, + "learning_rate": 7.920236381466071e-05, + "loss": 1.7757, + "step": 10506 + }, + { + "epoch": 3.2249846531614486, + "grad_norm": 0.2672121226787567, + "learning_rate": 7.919832896929458e-05, + "loss": 1.8384, + "step": 10507 + }, + { + "epoch": 3.225291589932474, + "grad_norm": 0.27254921197891235, + "learning_rate": 7.919429383537346e-05, + "loss": 1.8056, + "step": 10508 + }, + { + "epoch": 3.2255985267034992, + "grad_norm": 
0.24467822909355164, + "learning_rate": 7.91902584129372e-05, + "loss": 1.8109, + "step": 10509 + }, + { + "epoch": 3.225905463474524, + "grad_norm": 0.25966358184814453, + "learning_rate": 7.918622270202571e-05, + "loss": 1.82, + "step": 10510 + }, + { + "epoch": 3.2262124002455494, + "grad_norm": 0.28601330518722534, + "learning_rate": 7.918218670267886e-05, + "loss": 1.7266, + "step": 10511 + }, + { + "epoch": 3.226519337016575, + "grad_norm": 0.4017516076564789, + "learning_rate": 7.917815041493653e-05, + "loss": 1.8408, + "step": 10512 + }, + { + "epoch": 3.2268262737875997, + "grad_norm": 0.3995787501335144, + "learning_rate": 7.917411383883862e-05, + "loss": 1.8441, + "step": 10513 + }, + { + "epoch": 3.227133210558625, + "grad_norm": 0.26997458934783936, + "learning_rate": 7.917007697442502e-05, + "loss": 1.8078, + "step": 10514 + }, + { + "epoch": 3.22744014732965, + "grad_norm": 0.34353014826774597, + "learning_rate": 7.916603982173562e-05, + "loss": 1.7523, + "step": 10515 + }, + { + "epoch": 3.2277470841006752, + "grad_norm": 0.39522337913513184, + "learning_rate": 7.916200238081032e-05, + "loss": 1.7532, + "step": 10516 + }, + { + "epoch": 3.2280540208717006, + "grad_norm": 0.4176923334598541, + "learning_rate": 7.915796465168903e-05, + "loss": 1.8895, + "step": 10517 + }, + { + "epoch": 3.2283609576427255, + "grad_norm": 0.30232906341552734, + "learning_rate": 7.915392663441164e-05, + "loss": 1.8223, + "step": 10518 + }, + { + "epoch": 3.228667894413751, + "grad_norm": 0.230951726436615, + "learning_rate": 7.914988832901805e-05, + "loss": 1.7265, + "step": 10519 + }, + { + "epoch": 3.228974831184776, + "grad_norm": 0.26381877064704895, + "learning_rate": 7.914584973554819e-05, + "loss": 1.7858, + "step": 10520 + }, + { + "epoch": 3.229281767955801, + "grad_norm": 0.2500905394554138, + "learning_rate": 7.914181085404194e-05, + "loss": 1.7606, + "step": 10521 + }, + { + "epoch": 3.2295887047268264, + "grad_norm": 0.2585415840148926, + "learning_rate": 
7.913777168453925e-05, + "loss": 1.787, + "step": 10522 + }, + { + "epoch": 3.2298956414978512, + "grad_norm": 0.24236604571342468, + "learning_rate": 7.913373222708001e-05, + "loss": 1.7623, + "step": 10523 + }, + { + "epoch": 3.2302025782688766, + "grad_norm": 0.3113093078136444, + "learning_rate": 7.912969248170416e-05, + "loss": 1.7736, + "step": 10524 + }, + { + "epoch": 3.230509515039902, + "grad_norm": 0.3341342806816101, + "learning_rate": 7.912565244845163e-05, + "loss": 1.8583, + "step": 10525 + }, + { + "epoch": 3.230816451810927, + "grad_norm": 0.2644478678703308, + "learning_rate": 7.912161212736231e-05, + "loss": 1.7891, + "step": 10526 + }, + { + "epoch": 3.231123388581952, + "grad_norm": 0.22916561365127563, + "learning_rate": 7.911757151847616e-05, + "loss": 1.7642, + "step": 10527 + }, + { + "epoch": 3.2314303253529775, + "grad_norm": 0.24204877018928528, + "learning_rate": 7.911353062183309e-05, + "loss": 1.8522, + "step": 10528 + }, + { + "epoch": 3.2317372621240024, + "grad_norm": 0.25339365005493164, + "learning_rate": 7.910948943747307e-05, + "loss": 1.8391, + "step": 10529 + }, + { + "epoch": 3.2320441988950277, + "grad_norm": 0.2652709186077118, + "learning_rate": 7.9105447965436e-05, + "loss": 1.7735, + "step": 10530 + }, + { + "epoch": 3.2323511356660526, + "grad_norm": 0.2711019217967987, + "learning_rate": 7.910140620576183e-05, + "loss": 1.8491, + "step": 10531 + }, + { + "epoch": 3.232658072437078, + "grad_norm": 0.2598389685153961, + "learning_rate": 7.909736415849052e-05, + "loss": 1.8417, + "step": 10532 + }, + { + "epoch": 3.2329650092081033, + "grad_norm": 0.278037428855896, + "learning_rate": 7.9093321823662e-05, + "loss": 1.8774, + "step": 10533 + }, + { + "epoch": 3.233271945979128, + "grad_norm": 0.32015568017959595, + "learning_rate": 7.90892792013162e-05, + "loss": 1.8873, + "step": 10534 + }, + { + "epoch": 3.2335788827501535, + "grad_norm": 0.3098098635673523, + "learning_rate": 7.908523629149312e-05, + "loss": 1.8141, + 
"step": 10535 + }, + { + "epoch": 3.233885819521179, + "grad_norm": 0.3127266764640808, + "learning_rate": 7.908119309423267e-05, + "loss": 1.8587, + "step": 10536 + }, + { + "epoch": 3.2341927562922037, + "grad_norm": 0.3085545301437378, + "learning_rate": 7.907714960957483e-05, + "loss": 1.8544, + "step": 10537 + }, + { + "epoch": 3.234499693063229, + "grad_norm": 0.3051004409790039, + "learning_rate": 7.907310583755956e-05, + "loss": 1.8144, + "step": 10538 + }, + { + "epoch": 3.234806629834254, + "grad_norm": 0.3458186686038971, + "learning_rate": 7.906906177822682e-05, + "loss": 1.8388, + "step": 10539 + }, + { + "epoch": 3.2351135666052793, + "grad_norm": 0.37064439058303833, + "learning_rate": 7.906501743161656e-05, + "loss": 1.7574, + "step": 10540 + }, + { + "epoch": 3.2354205033763046, + "grad_norm": 0.3382316827774048, + "learning_rate": 7.906097279776876e-05, + "loss": 1.8785, + "step": 10541 + }, + { + "epoch": 3.2357274401473295, + "grad_norm": 0.254802942276001, + "learning_rate": 7.905692787672341e-05, + "loss": 1.8276, + "step": 10542 + }, + { + "epoch": 3.236034376918355, + "grad_norm": 0.3362341523170471, + "learning_rate": 7.905288266852047e-05, + "loss": 1.8057, + "step": 10543 + }, + { + "epoch": 3.23634131368938, + "grad_norm": 0.38821661472320557, + "learning_rate": 7.904883717319988e-05, + "loss": 1.7841, + "step": 10544 + }, + { + "epoch": 3.236648250460405, + "grad_norm": 0.33889076113700867, + "learning_rate": 7.90447913908017e-05, + "loss": 1.7892, + "step": 10545 + }, + { + "epoch": 3.2369551872314304, + "grad_norm": 0.2741014361381531, + "learning_rate": 7.904074532136585e-05, + "loss": 1.7611, + "step": 10546 + }, + { + "epoch": 3.2372621240024557, + "grad_norm": 0.28950995206832886, + "learning_rate": 7.903669896493233e-05, + "loss": 1.7963, + "step": 10547 + }, + { + "epoch": 3.2375690607734806, + "grad_norm": 0.30647143721580505, + "learning_rate": 7.903265232154113e-05, + "loss": 1.7522, + "step": 10548 + }, + { + "epoch": 
3.237875997544506, + "grad_norm": 0.30428263545036316, + "learning_rate": 7.902860539123225e-05, + "loss": 1.7383, + "step": 10549 + }, + { + "epoch": 3.238182934315531, + "grad_norm": 0.2357146292924881, + "learning_rate": 7.902455817404569e-05, + "loss": 1.7243, + "step": 10550 + }, + { + "epoch": 3.238489871086556, + "grad_norm": 0.3125104606151581, + "learning_rate": 7.90205106700214e-05, + "loss": 1.8542, + "step": 10551 + }, + { + "epoch": 3.2387968078575815, + "grad_norm": 0.25797244906425476, + "learning_rate": 7.901646287919944e-05, + "loss": 1.8374, + "step": 10552 + }, + { + "epoch": 3.2391037446286064, + "grad_norm": 0.3127591907978058, + "learning_rate": 7.901241480161978e-05, + "loss": 1.9457, + "step": 10553 + }, + { + "epoch": 3.2394106813996317, + "grad_norm": 0.2971835434436798, + "learning_rate": 7.900836643732243e-05, + "loss": 1.7933, + "step": 10554 + }, + { + "epoch": 3.2397176181706566, + "grad_norm": 0.28931814432144165, + "learning_rate": 7.90043177863474e-05, + "loss": 1.8201, + "step": 10555 + }, + { + "epoch": 3.240024554941682, + "grad_norm": 0.3348724842071533, + "learning_rate": 7.90002688487347e-05, + "loss": 1.8718, + "step": 10556 + }, + { + "epoch": 3.2403314917127073, + "grad_norm": 0.28566426038742065, + "learning_rate": 7.899621962452436e-05, + "loss": 1.805, + "step": 10557 + }, + { + "epoch": 3.240638428483732, + "grad_norm": 0.27074119448661804, + "learning_rate": 7.899217011375637e-05, + "loss": 1.842, + "step": 10558 + }, + { + "epoch": 3.2409453652547575, + "grad_norm": 0.27014291286468506, + "learning_rate": 7.898812031647076e-05, + "loss": 1.8156, + "step": 10559 + }, + { + "epoch": 3.241252302025783, + "grad_norm": 0.28087863326072693, + "learning_rate": 7.898407023270756e-05, + "loss": 1.8399, + "step": 10560 + }, + { + "epoch": 3.2415592387968077, + "grad_norm": 0.2641037404537201, + "learning_rate": 7.898001986250679e-05, + "loss": 1.7977, + "step": 10561 + }, + { + "epoch": 3.241866175567833, + "grad_norm": 
0.2843858301639557, + "learning_rate": 7.897596920590848e-05, + "loss": 1.834, + "step": 10562 + }, + { + "epoch": 3.2421731123388584, + "grad_norm": 0.2724611163139343, + "learning_rate": 7.897191826295266e-05, + "loss": 1.7547, + "step": 10563 + }, + { + "epoch": 3.2424800491098833, + "grad_norm": 0.2583858370780945, + "learning_rate": 7.896786703367935e-05, + "loss": 1.7658, + "step": 10564 + }, + { + "epoch": 3.2427869858809086, + "grad_norm": 0.2666650712490082, + "learning_rate": 7.896381551812861e-05, + "loss": 1.8017, + "step": 10565 + }, + { + "epoch": 3.2430939226519335, + "grad_norm": 0.23269347846508026, + "learning_rate": 7.895976371634047e-05, + "loss": 1.8267, + "step": 10566 + }, + { + "epoch": 3.243400859422959, + "grad_norm": 0.27865225076675415, + "learning_rate": 7.895571162835496e-05, + "loss": 1.8093, + "step": 10567 + }, + { + "epoch": 3.243707796193984, + "grad_norm": 0.29445022344589233, + "learning_rate": 7.895165925421216e-05, + "loss": 1.7999, + "step": 10568 + }, + { + "epoch": 3.244014732965009, + "grad_norm": 0.32135528326034546, + "learning_rate": 7.894760659395206e-05, + "loss": 1.8405, + "step": 10569 + }, + { + "epoch": 3.2443216697360344, + "grad_norm": 0.3409091532230377, + "learning_rate": 7.894355364761477e-05, + "loss": 1.7861, + "step": 10570 + }, + { + "epoch": 3.2446286065070598, + "grad_norm": 0.3379025459289551, + "learning_rate": 7.893950041524032e-05, + "loss": 1.8495, + "step": 10571 + }, + { + "epoch": 3.2449355432780846, + "grad_norm": 0.2843063473701477, + "learning_rate": 7.893544689686874e-05, + "loss": 1.7888, + "step": 10572 + }, + { + "epoch": 3.24524248004911, + "grad_norm": 0.2914074957370758, + "learning_rate": 7.893139309254013e-05, + "loss": 1.7866, + "step": 10573 + }, + { + "epoch": 3.245549416820135, + "grad_norm": 0.39855021238327026, + "learning_rate": 7.892733900229454e-05, + "loss": 1.7865, + "step": 10574 + }, + { + "epoch": 3.24585635359116, + "grad_norm": 0.4232102632522583, + "learning_rate": 
7.892328462617203e-05, + "loss": 1.8443, + "step": 10575 + }, + { + "epoch": 3.2461632903621855, + "grad_norm": 0.390794962644577, + "learning_rate": 7.891922996421267e-05, + "loss": 1.8735, + "step": 10576 + }, + { + "epoch": 3.2464702271332104, + "grad_norm": 0.3051595687866211, + "learning_rate": 7.891517501645653e-05, + "loss": 1.8654, + "step": 10577 + }, + { + "epoch": 3.2467771639042358, + "grad_norm": 0.25363096594810486, + "learning_rate": 7.891111978294367e-05, + "loss": 1.7602, + "step": 10578 + }, + { + "epoch": 3.247084100675261, + "grad_norm": 0.29785794019699097, + "learning_rate": 7.890706426371419e-05, + "loss": 1.8242, + "step": 10579 + }, + { + "epoch": 3.247391037446286, + "grad_norm": 0.346162885427475, + "learning_rate": 7.890300845880816e-05, + "loss": 1.8551, + "step": 10580 + }, + { + "epoch": 3.2476979742173113, + "grad_norm": 0.33906155824661255, + "learning_rate": 7.889895236826566e-05, + "loss": 1.765, + "step": 10581 + }, + { + "epoch": 3.248004910988336, + "grad_norm": 0.26083165407180786, + "learning_rate": 7.889489599212676e-05, + "loss": 1.8246, + "step": 10582 + }, + { + "epoch": 3.2483118477593615, + "grad_norm": 0.3042019009590149, + "learning_rate": 7.889083933043157e-05, + "loss": 1.9017, + "step": 10583 + }, + { + "epoch": 3.248618784530387, + "grad_norm": 0.34833577275276184, + "learning_rate": 7.888678238322018e-05, + "loss": 1.7863, + "step": 10584 + }, + { + "epoch": 3.2489257213014118, + "grad_norm": 0.34436655044555664, + "learning_rate": 7.888272515053267e-05, + "loss": 1.7937, + "step": 10585 + }, + { + "epoch": 3.249232658072437, + "grad_norm": 0.2550172507762909, + "learning_rate": 7.887866763240914e-05, + "loss": 1.7615, + "step": 10586 + }, + { + "epoch": 3.2495395948434624, + "grad_norm": 0.3334405720233917, + "learning_rate": 7.88746098288897e-05, + "loss": 1.7465, + "step": 10587 + }, + { + "epoch": 3.2498465316144873, + "grad_norm": 0.4668157696723938, + "learning_rate": 7.887055174001443e-05, + "loss": 
1.7836, + "step": 10588 + }, + { + "epoch": 3.2501534683855127, + "grad_norm": 0.524680495262146, + "learning_rate": 7.886649336582344e-05, + "loss": 1.844, + "step": 10589 + }, + { + "epoch": 3.250460405156538, + "grad_norm": 0.36859074234962463, + "learning_rate": 7.886243470635685e-05, + "loss": 1.8072, + "step": 10590 + }, + { + "epoch": 3.250767341927563, + "grad_norm": 0.32370296120643616, + "learning_rate": 7.885837576165478e-05, + "loss": 1.802, + "step": 10591 + }, + { + "epoch": 3.2510742786985882, + "grad_norm": 0.3506374955177307, + "learning_rate": 7.88543165317573e-05, + "loss": 1.7965, + "step": 10592 + }, + { + "epoch": 3.251381215469613, + "grad_norm": 0.39058688282966614, + "learning_rate": 7.885025701670457e-05, + "loss": 1.7987, + "step": 10593 + }, + { + "epoch": 3.2516881522406385, + "grad_norm": 0.3042154014110565, + "learning_rate": 7.884619721653669e-05, + "loss": 1.8345, + "step": 10594 + }, + { + "epoch": 3.251995089011664, + "grad_norm": 0.2249498963356018, + "learning_rate": 7.884213713129378e-05, + "loss": 1.7796, + "step": 10595 + }, + { + "epoch": 3.2523020257826887, + "grad_norm": 0.2701997458934784, + "learning_rate": 7.883807676101595e-05, + "loss": 1.8027, + "step": 10596 + }, + { + "epoch": 3.252608962553714, + "grad_norm": 0.2574785053730011, + "learning_rate": 7.883401610574336e-05, + "loss": 1.7878, + "step": 10597 + }, + { + "epoch": 3.252915899324739, + "grad_norm": 0.24964739382266998, + "learning_rate": 7.882995516551613e-05, + "loss": 1.7612, + "step": 10598 + }, + { + "epoch": 3.2532228360957642, + "grad_norm": 0.2519865930080414, + "learning_rate": 7.882589394037437e-05, + "loss": 1.7583, + "step": 10599 + }, + { + "epoch": 3.2535297728667896, + "grad_norm": 0.23174463212490082, + "learning_rate": 7.882183243035823e-05, + "loss": 1.7607, + "step": 10600 + }, + { + "epoch": 3.2538367096378145, + "grad_norm": 0.28103554248809814, + "learning_rate": 7.881777063550786e-05, + "loss": 1.904, + "step": 10601 + }, + { + 
"epoch": 3.25414364640884, + "grad_norm": 0.265677809715271, + "learning_rate": 7.881370855586339e-05, + "loss": 1.8169, + "step": 10602 + }, + { + "epoch": 3.254450583179865, + "grad_norm": 0.2539603114128113, + "learning_rate": 7.880964619146493e-05, + "loss": 1.8439, + "step": 10603 + }, + { + "epoch": 3.25475751995089, + "grad_norm": 0.2741886377334595, + "learning_rate": 7.88055835423527e-05, + "loss": 1.8737, + "step": 10604 + }, + { + "epoch": 3.2550644567219154, + "grad_norm": 0.27548348903656006, + "learning_rate": 7.88015206085668e-05, + "loss": 1.8385, + "step": 10605 + }, + { + "epoch": 3.2553713934929407, + "grad_norm": 0.2958502769470215, + "learning_rate": 7.879745739014739e-05, + "loss": 1.8603, + "step": 10606 + }, + { + "epoch": 3.2556783302639656, + "grad_norm": 0.2728644907474518, + "learning_rate": 7.879339388713462e-05, + "loss": 1.8, + "step": 10607 + }, + { + "epoch": 3.255985267034991, + "grad_norm": 0.28718289732933044, + "learning_rate": 7.878933009956866e-05, + "loss": 1.7803, + "step": 10608 + }, + { + "epoch": 3.256292203806016, + "grad_norm": 0.2989691197872162, + "learning_rate": 7.878526602748967e-05, + "loss": 1.8155, + "step": 10609 + }, + { + "epoch": 3.256599140577041, + "grad_norm": 0.24515527486801147, + "learning_rate": 7.87812016709378e-05, + "loss": 1.7623, + "step": 10610 + }, + { + "epoch": 3.2569060773480665, + "grad_norm": 0.29946041107177734, + "learning_rate": 7.877713702995324e-05, + "loss": 1.8097, + "step": 10611 + }, + { + "epoch": 3.2572130141190914, + "grad_norm": 0.2854483723640442, + "learning_rate": 7.877307210457613e-05, + "loss": 1.8088, + "step": 10612 + }, + { + "epoch": 3.2575199508901167, + "grad_norm": 0.27812930941581726, + "learning_rate": 7.876900689484668e-05, + "loss": 1.8151, + "step": 10613 + }, + { + "epoch": 3.2578268876611416, + "grad_norm": 0.2658015787601471, + "learning_rate": 7.876494140080503e-05, + "loss": 1.8314, + "step": 10614 + }, + { + "epoch": 3.258133824432167, + "grad_norm": 
0.28935661911964417, + "learning_rate": 7.876087562249137e-05, + "loss": 1.7948, + "step": 10615 + }, + { + "epoch": 3.2584407612031923, + "grad_norm": 0.27497121691703796, + "learning_rate": 7.875680955994587e-05, + "loss": 1.7964, + "step": 10616 + }, + { + "epoch": 3.258747697974217, + "grad_norm": 0.3313405513763428, + "learning_rate": 7.875274321320873e-05, + "loss": 1.8143, + "step": 10617 + }, + { + "epoch": 3.2590546347452425, + "grad_norm": 0.3217218816280365, + "learning_rate": 7.874867658232013e-05, + "loss": 1.7749, + "step": 10618 + }, + { + "epoch": 3.259361571516268, + "grad_norm": 0.25105544924736023, + "learning_rate": 7.874460966732025e-05, + "loss": 1.7834, + "step": 10619 + }, + { + "epoch": 3.2596685082872927, + "grad_norm": 0.2931382358074188, + "learning_rate": 7.874054246824931e-05, + "loss": 1.8252, + "step": 10620 + }, + { + "epoch": 3.259975445058318, + "grad_norm": 0.2803363502025604, + "learning_rate": 7.873647498514747e-05, + "loss": 1.7527, + "step": 10621 + }, + { + "epoch": 3.2602823818293434, + "grad_norm": 0.29857927560806274, + "learning_rate": 7.873240721805492e-05, + "loss": 1.8085, + "step": 10622 + }, + { + "epoch": 3.2605893186003683, + "grad_norm": 0.24864110350608826, + "learning_rate": 7.872833916701192e-05, + "loss": 1.7509, + "step": 10623 + }, + { + "epoch": 3.2608962553713936, + "grad_norm": 0.24105949699878693, + "learning_rate": 7.872427083205862e-05, + "loss": 1.7871, + "step": 10624 + }, + { + "epoch": 3.2612031921424185, + "grad_norm": 0.2429245114326477, + "learning_rate": 7.872020221323523e-05, + "loss": 1.777, + "step": 10625 + }, + { + "epoch": 3.261510128913444, + "grad_norm": 0.234287828207016, + "learning_rate": 7.871613331058197e-05, + "loss": 1.8001, + "step": 10626 + }, + { + "epoch": 3.261817065684469, + "grad_norm": 0.3463406264781952, + "learning_rate": 7.871206412413905e-05, + "loss": 1.8925, + "step": 10627 + }, + { + "epoch": 3.262124002455494, + "grad_norm": 0.26798921823501587, + 
"learning_rate": 7.87079946539467e-05, + "loss": 1.7963, + "step": 10628 + }, + { + "epoch": 3.2624309392265194, + "grad_norm": 0.28603312373161316, + "learning_rate": 7.87039249000451e-05, + "loss": 1.8308, + "step": 10629 + }, + { + "epoch": 3.2627378759975443, + "grad_norm": 0.2717527747154236, + "learning_rate": 7.86998548624745e-05, + "loss": 1.8246, + "step": 10630 + }, + { + "epoch": 3.2630448127685696, + "grad_norm": 0.32215580344200134, + "learning_rate": 7.86957845412751e-05, + "loss": 1.7278, + "step": 10631 + }, + { + "epoch": 3.263351749539595, + "grad_norm": 0.3578735589981079, + "learning_rate": 7.869171393648717e-05, + "loss": 1.7288, + "step": 10632 + }, + { + "epoch": 3.26365868631062, + "grad_norm": 0.3120707869529724, + "learning_rate": 7.868764304815089e-05, + "loss": 1.7971, + "step": 10633 + }, + { + "epoch": 3.263965623081645, + "grad_norm": 0.27419236302375793, + "learning_rate": 7.86835718763065e-05, + "loss": 1.8529, + "step": 10634 + }, + { + "epoch": 3.2642725598526705, + "grad_norm": 0.3200531601905823, + "learning_rate": 7.867950042099423e-05, + "loss": 1.7892, + "step": 10635 + }, + { + "epoch": 3.2645794966236954, + "grad_norm": 0.325706422328949, + "learning_rate": 7.867542868225435e-05, + "loss": 1.8236, + "step": 10636 + }, + { + "epoch": 3.2648864333947207, + "grad_norm": 0.2950136065483093, + "learning_rate": 7.867135666012707e-05, + "loss": 1.8163, + "step": 10637 + }, + { + "epoch": 3.265193370165746, + "grad_norm": 0.2772117257118225, + "learning_rate": 7.866728435465263e-05, + "loss": 1.8373, + "step": 10638 + }, + { + "epoch": 3.265500306936771, + "grad_norm": 0.2887401580810547, + "learning_rate": 7.866321176587129e-05, + "loss": 1.7756, + "step": 10639 + }, + { + "epoch": 3.2658072437077963, + "grad_norm": 0.3474489152431488, + "learning_rate": 7.865913889382329e-05, + "loss": 1.7539, + "step": 10640 + }, + { + "epoch": 3.266114180478821, + "grad_norm": 0.3433493971824646, + "learning_rate": 7.865506573854888e-05, + 
"loss": 1.7987, + "step": 10641 + }, + { + "epoch": 3.2664211172498465, + "grad_norm": 0.3075394630432129, + "learning_rate": 7.865099230008832e-05, + "loss": 1.7907, + "step": 10642 + }, + { + "epoch": 3.266728054020872, + "grad_norm": 0.24817697703838348, + "learning_rate": 7.864691857848187e-05, + "loss": 1.7941, + "step": 10643 + }, + { + "epoch": 3.2670349907918967, + "grad_norm": 0.290147602558136, + "learning_rate": 7.864284457376976e-05, + "loss": 1.9125, + "step": 10644 + }, + { + "epoch": 3.267341927562922, + "grad_norm": 0.253684937953949, + "learning_rate": 7.863877028599229e-05, + "loss": 1.8084, + "step": 10645 + }, + { + "epoch": 3.267648864333947, + "grad_norm": 0.26349252462387085, + "learning_rate": 7.863469571518969e-05, + "loss": 1.7548, + "step": 10646 + }, + { + "epoch": 3.2679558011049723, + "grad_norm": 0.30568864941596985, + "learning_rate": 7.863062086140224e-05, + "loss": 1.8551, + "step": 10647 + }, + { + "epoch": 3.2682627378759976, + "grad_norm": 0.2866690456867218, + "learning_rate": 7.862654572467024e-05, + "loss": 1.8145, + "step": 10648 + }, + { + "epoch": 3.2685696746470225, + "grad_norm": 0.32022854685783386, + "learning_rate": 7.862247030503391e-05, + "loss": 1.896, + "step": 10649 + }, + { + "epoch": 3.268876611418048, + "grad_norm": 0.25260284543037415, + "learning_rate": 7.861839460253356e-05, + "loss": 1.814, + "step": 10650 + }, + { + "epoch": 3.269183548189073, + "grad_norm": 0.26776066422462463, + "learning_rate": 7.861431861720947e-05, + "loss": 1.7755, + "step": 10651 + }, + { + "epoch": 3.269490484960098, + "grad_norm": 0.26514193415641785, + "learning_rate": 7.861024234910191e-05, + "loss": 1.7606, + "step": 10652 + }, + { + "epoch": 3.2697974217311234, + "grad_norm": 0.27213940024375916, + "learning_rate": 7.860616579825116e-05, + "loss": 1.8074, + "step": 10653 + }, + { + "epoch": 3.2701043585021488, + "grad_norm": 0.29192888736724854, + "learning_rate": 7.860208896469752e-05, + "loss": 1.8436, + "step": 10654 + }, 
+ { + "epoch": 3.2704112952731736, + "grad_norm": 0.3772370219230652, + "learning_rate": 7.859801184848127e-05, + "loss": 1.8096, + "step": 10655 + }, + { + "epoch": 3.270718232044199, + "grad_norm": 0.4574970006942749, + "learning_rate": 7.859393444964269e-05, + "loss": 1.7612, + "step": 10656 + }, + { + "epoch": 3.271025168815224, + "grad_norm": 0.4614393413066864, + "learning_rate": 7.858985676822211e-05, + "loss": 1.8529, + "step": 10657 + }, + { + "epoch": 3.271332105586249, + "grad_norm": 0.33567267656326294, + "learning_rate": 7.85857788042598e-05, + "loss": 1.8391, + "step": 10658 + }, + { + "epoch": 3.2716390423572745, + "grad_norm": 0.2564064860343933, + "learning_rate": 7.858170055779609e-05, + "loss": 1.7621, + "step": 10659 + }, + { + "epoch": 3.2719459791282994, + "grad_norm": 0.26769882440567017, + "learning_rate": 7.857762202887122e-05, + "loss": 1.8145, + "step": 10660 + }, + { + "epoch": 3.2722529158993248, + "grad_norm": 0.262008935213089, + "learning_rate": 7.857354321752558e-05, + "loss": 1.7513, + "step": 10661 + }, + { + "epoch": 3.27255985267035, + "grad_norm": 0.26494377851486206, + "learning_rate": 7.856946412379942e-05, + "loss": 1.8071, + "step": 10662 + }, + { + "epoch": 3.272866789441375, + "grad_norm": 0.25613999366760254, + "learning_rate": 7.856538474773307e-05, + "loss": 1.8775, + "step": 10663 + }, + { + "epoch": 3.2731737262124003, + "grad_norm": 0.24789929389953613, + "learning_rate": 7.856130508936684e-05, + "loss": 1.8055, + "step": 10664 + }, + { + "epoch": 3.2734806629834257, + "grad_norm": 0.29111939668655396, + "learning_rate": 7.855722514874107e-05, + "loss": 1.8114, + "step": 10665 + }, + { + "epoch": 3.2737875997544506, + "grad_norm": 0.30511030554771423, + "learning_rate": 7.855314492589605e-05, + "loss": 1.8131, + "step": 10666 + }, + { + "epoch": 3.274094536525476, + "grad_norm": 0.2545989453792572, + "learning_rate": 7.854906442087212e-05, + "loss": 1.7933, + "step": 10667 + }, + { + "epoch": 3.2744014732965008, + 
"grad_norm": 0.26684823632240295, + "learning_rate": 7.85449836337096e-05, + "loss": 1.7604, + "step": 10668 + }, + { + "epoch": 3.274708410067526, + "grad_norm": 0.5097808837890625, + "learning_rate": 7.854090256444881e-05, + "loss": 1.777, + "step": 10669 + }, + { + "epoch": 3.2750153468385514, + "grad_norm": 0.27828142046928406, + "learning_rate": 7.853682121313011e-05, + "loss": 1.7885, + "step": 10670 + }, + { + "epoch": 3.2753222836095763, + "grad_norm": 0.2925552725791931, + "learning_rate": 7.853273957979381e-05, + "loss": 1.7962, + "step": 10671 + }, + { + "epoch": 3.2756292203806017, + "grad_norm": 0.284574955701828, + "learning_rate": 7.852865766448025e-05, + "loss": 1.8645, + "step": 10672 + }, + { + "epoch": 3.2759361571516266, + "grad_norm": 0.23407664895057678, + "learning_rate": 7.85245754672298e-05, + "loss": 1.7106, + "step": 10673 + }, + { + "epoch": 3.276243093922652, + "grad_norm": 0.2555919885635376, + "learning_rate": 7.852049298808274e-05, + "loss": 1.8237, + "step": 10674 + }, + { + "epoch": 3.2765500306936772, + "grad_norm": 0.26703694462776184, + "learning_rate": 7.851641022707947e-05, + "loss": 1.7844, + "step": 10675 + }, + { + "epoch": 3.276856967464702, + "grad_norm": 0.24889135360717773, + "learning_rate": 7.851232718426033e-05, + "loss": 1.7783, + "step": 10676 + }, + { + "epoch": 3.2771639042357275, + "grad_norm": 0.25770726799964905, + "learning_rate": 7.850824385966564e-05, + "loss": 1.8007, + "step": 10677 + }, + { + "epoch": 3.277470841006753, + "grad_norm": 0.31806984543800354, + "learning_rate": 7.850416025333578e-05, + "loss": 1.8623, + "step": 10678 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.2906930148601532, + "learning_rate": 7.850007636531111e-05, + "loss": 1.8315, + "step": 10679 + }, + { + "epoch": 3.278084714548803, + "grad_norm": 0.2802525460720062, + "learning_rate": 7.849599219563197e-05, + "loss": 1.8488, + "step": 10680 + }, + { + "epoch": 3.2783916513198283, + "grad_norm": 0.26150405406951904, + 
"learning_rate": 7.849190774433874e-05, + "loss": 1.7967, + "step": 10681 + }, + { + "epoch": 3.2786985880908532, + "grad_norm": 0.25863370299339294, + "learning_rate": 7.848782301147178e-05, + "loss": 1.864, + "step": 10682 + }, + { + "epoch": 3.2790055248618786, + "grad_norm": 0.25381043553352356, + "learning_rate": 7.848373799707145e-05, + "loss": 1.8239, + "step": 10683 + }, + { + "epoch": 3.2793124616329035, + "grad_norm": 0.2583387792110443, + "learning_rate": 7.847965270117814e-05, + "loss": 1.8449, + "step": 10684 + }, + { + "epoch": 3.279619398403929, + "grad_norm": 0.30759841203689575, + "learning_rate": 7.84755671238322e-05, + "loss": 1.7992, + "step": 10685 + }, + { + "epoch": 3.279926335174954, + "grad_norm": 0.4316023588180542, + "learning_rate": 7.847148126507402e-05, + "loss": 1.7912, + "step": 10686 + }, + { + "epoch": 3.280233271945979, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.846739512494396e-05, + "loss": 1.8831, + "step": 10687 + }, + { + "epoch": 3.2805402087170044, + "grad_norm": 0.318934828042984, + "learning_rate": 7.846330870348244e-05, + "loss": 1.8411, + "step": 10688 + }, + { + "epoch": 3.2808471454880292, + "grad_norm": 0.27755632996559143, + "learning_rate": 7.84592220007298e-05, + "loss": 1.8763, + "step": 10689 + }, + { + "epoch": 3.2811540822590546, + "grad_norm": 0.33544883131980896, + "learning_rate": 7.845513501672646e-05, + "loss": 1.731, + "step": 10690 + }, + { + "epoch": 3.28146101903008, + "grad_norm": 0.28299057483673096, + "learning_rate": 7.845104775151278e-05, + "loss": 1.813, + "step": 10691 + }, + { + "epoch": 3.281767955801105, + "grad_norm": 0.2761382460594177, + "learning_rate": 7.844696020512918e-05, + "loss": 1.8018, + "step": 10692 + }, + { + "epoch": 3.28207489257213, + "grad_norm": 0.2919033169746399, + "learning_rate": 7.844287237761605e-05, + "loss": 1.793, + "step": 10693 + }, + { + "epoch": 3.2823818293431555, + "grad_norm": 0.32922014594078064, + "learning_rate": 7.843878426901378e-05, + 
"loss": 1.8186, + "step": 10694 + }, + { + "epoch": 3.2826887661141804, + "grad_norm": 0.2818562090396881, + "learning_rate": 7.843469587936279e-05, + "loss": 1.7794, + "step": 10695 + }, + { + "epoch": 3.2829957028852057, + "grad_norm": 0.26414254307746887, + "learning_rate": 7.843060720870345e-05, + "loss": 1.7854, + "step": 10696 + }, + { + "epoch": 3.283302639656231, + "grad_norm": 0.28345760703086853, + "learning_rate": 7.842651825707618e-05, + "loss": 1.7659, + "step": 10697 + }, + { + "epoch": 3.283609576427256, + "grad_norm": 0.3522340655326843, + "learning_rate": 7.842242902452141e-05, + "loss": 1.8427, + "step": 10698 + }, + { + "epoch": 3.2839165131982813, + "grad_norm": 0.2861590087413788, + "learning_rate": 7.841833951107954e-05, + "loss": 1.7539, + "step": 10699 + }, + { + "epoch": 3.284223449969306, + "grad_norm": 0.2596624493598938, + "learning_rate": 7.841424971679099e-05, + "loss": 1.8407, + "step": 10700 + }, + { + "epoch": 3.2845303867403315, + "grad_norm": 0.2847718298435211, + "learning_rate": 7.841015964169616e-05, + "loss": 1.8085, + "step": 10701 + }, + { + "epoch": 3.284837323511357, + "grad_norm": 0.29566115140914917, + "learning_rate": 7.840606928583547e-05, + "loss": 1.7873, + "step": 10702 + }, + { + "epoch": 3.2851442602823817, + "grad_norm": 0.2752111256122589, + "learning_rate": 7.840197864924936e-05, + "loss": 1.8186, + "step": 10703 + }, + { + "epoch": 3.285451197053407, + "grad_norm": 0.2907958924770355, + "learning_rate": 7.839788773197826e-05, + "loss": 1.8081, + "step": 10704 + }, + { + "epoch": 3.285758133824432, + "grad_norm": 0.25808724761009216, + "learning_rate": 7.839379653406258e-05, + "loss": 1.7635, + "step": 10705 + }, + { + "epoch": 3.2860650705954573, + "grad_norm": 0.2732730507850647, + "learning_rate": 7.838970505554277e-05, + "loss": 1.8061, + "step": 10706 + }, + { + "epoch": 3.2863720073664826, + "grad_norm": 0.23820067942142487, + "learning_rate": 7.838561329645923e-05, + "loss": 1.8091, + "step": 10707 + }, 
+ { + "epoch": 3.2866789441375075, + "grad_norm": 0.24179396033287048, + "learning_rate": 7.838152125685245e-05, + "loss": 1.7513, + "step": 10708 + }, + { + "epoch": 3.286985880908533, + "grad_norm": 0.2627546787261963, + "learning_rate": 7.837742893676283e-05, + "loss": 1.8741, + "step": 10709 + }, + { + "epoch": 3.287292817679558, + "grad_norm": 0.2827817499637604, + "learning_rate": 7.837333633623083e-05, + "loss": 1.8387, + "step": 10710 + }, + { + "epoch": 3.287599754450583, + "grad_norm": 0.2666749060153961, + "learning_rate": 7.836924345529688e-05, + "loss": 1.8319, + "step": 10711 + }, + { + "epoch": 3.2879066912216084, + "grad_norm": 0.3403390944004059, + "learning_rate": 7.836515029400145e-05, + "loss": 1.7827, + "step": 10712 + }, + { + "epoch": 3.2882136279926337, + "grad_norm": 0.30646705627441406, + "learning_rate": 7.836105685238497e-05, + "loss": 1.8612, + "step": 10713 + }, + { + "epoch": 3.2885205647636586, + "grad_norm": 0.2580253481864929, + "learning_rate": 7.83569631304879e-05, + "loss": 1.7332, + "step": 10714 + }, + { + "epoch": 3.288827501534684, + "grad_norm": 0.23734542727470398, + "learning_rate": 7.835286912835071e-05, + "loss": 1.7899, + "step": 10715 + }, + { + "epoch": 3.289134438305709, + "grad_norm": 0.2457810491323471, + "learning_rate": 7.834877484601384e-05, + "loss": 1.8059, + "step": 10716 + }, + { + "epoch": 3.289441375076734, + "grad_norm": 0.2558443248271942, + "learning_rate": 7.834468028351778e-05, + "loss": 1.8689, + "step": 10717 + }, + { + "epoch": 3.2897483118477595, + "grad_norm": 0.26596710085868835, + "learning_rate": 7.834058544090298e-05, + "loss": 1.816, + "step": 10718 + }, + { + "epoch": 3.2900552486187844, + "grad_norm": 0.25424903631210327, + "learning_rate": 7.833649031820987e-05, + "loss": 1.7907, + "step": 10719 + }, + { + "epoch": 3.2903621853898097, + "grad_norm": 0.23873139917850494, + "learning_rate": 7.833239491547896e-05, + "loss": 1.7666, + "step": 10720 + }, + { + "epoch": 3.2906691221608346, + 
"grad_norm": 0.23292972147464752, + "learning_rate": 7.832829923275073e-05, + "loss": 1.7674, + "step": 10721 + }, + { + "epoch": 3.29097605893186, + "grad_norm": 0.30133312940597534, + "learning_rate": 7.832420327006566e-05, + "loss": 1.8229, + "step": 10722 + }, + { + "epoch": 3.2912829957028853, + "grad_norm": 0.2882522642612457, + "learning_rate": 7.83201070274642e-05, + "loss": 1.7855, + "step": 10723 + }, + { + "epoch": 3.29158993247391, + "grad_norm": 0.2578088045120239, + "learning_rate": 7.831601050498683e-05, + "loss": 1.7276, + "step": 10724 + }, + { + "epoch": 3.2918968692449355, + "grad_norm": 0.29511600732803345, + "learning_rate": 7.831191370267406e-05, + "loss": 1.8085, + "step": 10725 + }, + { + "epoch": 3.292203806015961, + "grad_norm": 0.29557499289512634, + "learning_rate": 7.830781662056634e-05, + "loss": 1.815, + "step": 10726 + }, + { + "epoch": 3.2925107427869857, + "grad_norm": 0.32722121477127075, + "learning_rate": 7.830371925870422e-05, + "loss": 1.7889, + "step": 10727 + }, + { + "epoch": 3.292817679558011, + "grad_norm": 0.3124488592147827, + "learning_rate": 7.829962161712814e-05, + "loss": 1.8063, + "step": 10728 + }, + { + "epoch": 3.2931246163290364, + "grad_norm": 0.311334490776062, + "learning_rate": 7.829552369587861e-05, + "loss": 1.8852, + "step": 10729 + }, + { + "epoch": 3.2934315531000613, + "grad_norm": 0.28010860085487366, + "learning_rate": 7.829142549499613e-05, + "loss": 1.8274, + "step": 10730 + }, + { + "epoch": 3.2937384898710866, + "grad_norm": 0.3453529477119446, + "learning_rate": 7.828732701452119e-05, + "loss": 1.8618, + "step": 10731 + }, + { + "epoch": 3.2940454266421115, + "grad_norm": 0.2946802079677582, + "learning_rate": 7.828322825449432e-05, + "loss": 1.7123, + "step": 10732 + }, + { + "epoch": 3.294352363413137, + "grad_norm": 0.2467648684978485, + "learning_rate": 7.827912921495601e-05, + "loss": 1.7786, + "step": 10733 + }, + { + "epoch": 3.294659300184162, + "grad_norm": 0.2957034707069397, + 
"learning_rate": 7.827502989594677e-05, + "loss": 1.7817, + "step": 10734 + }, + { + "epoch": 3.294966236955187, + "grad_norm": 0.300905704498291, + "learning_rate": 7.827093029750713e-05, + "loss": 1.7582, + "step": 10735 + }, + { + "epoch": 3.2952731737262124, + "grad_norm": 0.28935131430625916, + "learning_rate": 7.826683041967757e-05, + "loss": 1.7766, + "step": 10736 + }, + { + "epoch": 3.2955801104972378, + "grad_norm": 0.26046010851860046, + "learning_rate": 7.826273026249861e-05, + "loss": 1.8152, + "step": 10737 + }, + { + "epoch": 3.2958870472682626, + "grad_norm": 0.24247924983501434, + "learning_rate": 7.82586298260108e-05, + "loss": 1.8679, + "step": 10738 + }, + { + "epoch": 3.296193984039288, + "grad_norm": 0.25977620482444763, + "learning_rate": 7.825452911025466e-05, + "loss": 1.8108, + "step": 10739 + }, + { + "epoch": 3.2965009208103133, + "grad_norm": 0.2732592821121216, + "learning_rate": 7.825042811527068e-05, + "loss": 1.7355, + "step": 10740 + }, + { + "epoch": 3.296807857581338, + "grad_norm": 0.38407859206199646, + "learning_rate": 7.824632684109941e-05, + "loss": 1.8418, + "step": 10741 + }, + { + "epoch": 3.2971147943523635, + "grad_norm": 0.4239252805709839, + "learning_rate": 7.82422252877814e-05, + "loss": 1.7655, + "step": 10742 + }, + { + "epoch": 3.2974217311233884, + "grad_norm": 0.3810526132583618, + "learning_rate": 7.823812345535716e-05, + "loss": 1.8804, + "step": 10743 + }, + { + "epoch": 3.2977286678944138, + "grad_norm": 0.29939520359039307, + "learning_rate": 7.823402134386722e-05, + "loss": 1.8207, + "step": 10744 + }, + { + "epoch": 3.298035604665439, + "grad_norm": 0.4053972065448761, + "learning_rate": 7.822991895335215e-05, + "loss": 1.7901, + "step": 10745 + }, + { + "epoch": 3.298342541436464, + "grad_norm": 0.4975005090236664, + "learning_rate": 7.822581628385247e-05, + "loss": 1.8344, + "step": 10746 + }, + { + "epoch": 3.2986494782074893, + "grad_norm": 0.4100436270236969, + "learning_rate": 
7.822171333540874e-05, + "loss": 1.7891, + "step": 10747 + }, + { + "epoch": 3.298956414978514, + "grad_norm": 0.2817644476890564, + "learning_rate": 7.821761010806147e-05, + "loss": 1.7895, + "step": 10748 + }, + { + "epoch": 3.2992633517495396, + "grad_norm": 0.332660973072052, + "learning_rate": 7.821350660185125e-05, + "loss": 1.7281, + "step": 10749 + }, + { + "epoch": 3.299570288520565, + "grad_norm": 0.42652732133865356, + "learning_rate": 7.820940281681863e-05, + "loss": 1.7855, + "step": 10750 + }, + { + "epoch": 3.2998772252915898, + "grad_norm": 0.35700714588165283, + "learning_rate": 7.820529875300415e-05, + "loss": 1.8722, + "step": 10751 + }, + { + "epoch": 3.300184162062615, + "grad_norm": 0.25305211544036865, + "learning_rate": 7.820119441044838e-05, + "loss": 1.7696, + "step": 10752 + }, + { + "epoch": 3.3004910988336404, + "grad_norm": 0.280205637216568, + "learning_rate": 7.819708978919188e-05, + "loss": 1.756, + "step": 10753 + }, + { + "epoch": 3.3007980356046653, + "grad_norm": 0.4176226854324341, + "learning_rate": 7.819298488927521e-05, + "loss": 1.7731, + "step": 10754 + }, + { + "epoch": 3.3011049723756907, + "grad_norm": 0.4264865517616272, + "learning_rate": 7.818887971073894e-05, + "loss": 1.7851, + "step": 10755 + }, + { + "epoch": 3.301411909146716, + "grad_norm": 0.2901221215724945, + "learning_rate": 7.818477425362363e-05, + "loss": 1.7356, + "step": 10756 + }, + { + "epoch": 3.301718845917741, + "grad_norm": 0.29583361744880676, + "learning_rate": 7.818066851796986e-05, + "loss": 1.8269, + "step": 10757 + }, + { + "epoch": 3.3020257826887662, + "grad_norm": 0.38592997193336487, + "learning_rate": 7.817656250381821e-05, + "loss": 1.7515, + "step": 10758 + }, + { + "epoch": 3.302332719459791, + "grad_norm": 0.29301533102989197, + "learning_rate": 7.817245621120927e-05, + "loss": 1.7955, + "step": 10759 + }, + { + "epoch": 3.3026396562308165, + "grad_norm": 0.2770880162715912, + "learning_rate": 7.816834964018359e-05, + "loss": 
1.7899, + "step": 10760 + }, + { + "epoch": 3.302946593001842, + "grad_norm": 0.32566413283348083, + "learning_rate": 7.816424279078176e-05, + "loss": 1.74, + "step": 10761 + }, + { + "epoch": 3.3032535297728667, + "grad_norm": 0.3077750504016876, + "learning_rate": 7.81601356630444e-05, + "loss": 1.8123, + "step": 10762 + }, + { + "epoch": 3.303560466543892, + "grad_norm": 0.2826370298862457, + "learning_rate": 7.815602825701206e-05, + "loss": 1.865, + "step": 10763 + }, + { + "epoch": 3.303867403314917, + "grad_norm": 0.31700822710990906, + "learning_rate": 7.815192057272534e-05, + "loss": 1.8021, + "step": 10764 + }, + { + "epoch": 3.3041743400859422, + "grad_norm": 0.33182790875434875, + "learning_rate": 7.814781261022486e-05, + "loss": 1.818, + "step": 10765 + }, + { + "epoch": 3.3044812768569676, + "grad_norm": 0.2720039486885071, + "learning_rate": 7.814370436955118e-05, + "loss": 1.8369, + "step": 10766 + }, + { + "epoch": 3.3047882136279925, + "grad_norm": 0.28134068846702576, + "learning_rate": 7.813959585074493e-05, + "loss": 1.8391, + "step": 10767 + }, + { + "epoch": 3.305095150399018, + "grad_norm": 0.25748828053474426, + "learning_rate": 7.813548705384667e-05, + "loss": 1.7987, + "step": 10768 + }, + { + "epoch": 3.305402087170043, + "grad_norm": 0.26187625527381897, + "learning_rate": 7.813137797889708e-05, + "loss": 1.7645, + "step": 10769 + }, + { + "epoch": 3.305709023941068, + "grad_norm": 0.297262579202652, + "learning_rate": 7.812726862593671e-05, + "loss": 1.771, + "step": 10770 + }, + { + "epoch": 3.3060159607120934, + "grad_norm": 0.2987872064113617, + "learning_rate": 7.812315899500618e-05, + "loss": 1.8115, + "step": 10771 + }, + { + "epoch": 3.3063228974831187, + "grad_norm": 0.31963878870010376, + "learning_rate": 7.81190490861461e-05, + "loss": 1.7685, + "step": 10772 + }, + { + "epoch": 3.3066298342541436, + "grad_norm": 0.27007177472114563, + "learning_rate": 7.81149388993971e-05, + "loss": 1.8272, + "step": 10773 + }, + { + "epoch": 
3.306936771025169, + "grad_norm": 0.26818498969078064, + "learning_rate": 7.811082843479981e-05, + "loss": 1.7894, + "step": 10774 + }, + { + "epoch": 3.307243707796194, + "grad_norm": 0.28857091069221497, + "learning_rate": 7.810671769239483e-05, + "loss": 1.8769, + "step": 10775 + }, + { + "epoch": 3.307550644567219, + "grad_norm": 0.26983144879341125, + "learning_rate": 7.810260667222277e-05, + "loss": 1.796, + "step": 10776 + }, + { + "epoch": 3.3078575813382445, + "grad_norm": 0.2566467225551605, + "learning_rate": 7.809849537432432e-05, + "loss": 1.848, + "step": 10777 + }, + { + "epoch": 3.3081645181092694, + "grad_norm": 0.25607848167419434, + "learning_rate": 7.809438379874005e-05, + "loss": 1.8072, + "step": 10778 + }, + { + "epoch": 3.3084714548802947, + "grad_norm": 0.29158470034599304, + "learning_rate": 7.809027194551059e-05, + "loss": 1.7772, + "step": 10779 + }, + { + "epoch": 3.3087783916513196, + "grad_norm": 0.360897421836853, + "learning_rate": 7.808615981467664e-05, + "loss": 1.8404, + "step": 10780 + }, + { + "epoch": 3.309085328422345, + "grad_norm": 0.31121253967285156, + "learning_rate": 7.808204740627877e-05, + "loss": 1.8137, + "step": 10781 + }, + { + "epoch": 3.3093922651933703, + "grad_norm": 0.2846451699733734, + "learning_rate": 7.807793472035765e-05, + "loss": 1.8367, + "step": 10782 + }, + { + "epoch": 3.309699201964395, + "grad_norm": 0.2711004316806793, + "learning_rate": 7.807382175695393e-05, + "loss": 1.7728, + "step": 10783 + }, + { + "epoch": 3.3100061387354205, + "grad_norm": 0.2693859338760376, + "learning_rate": 7.806970851610824e-05, + "loss": 1.7026, + "step": 10784 + }, + { + "epoch": 3.310313075506446, + "grad_norm": 0.3050517439842224, + "learning_rate": 7.806559499786125e-05, + "loss": 1.8041, + "step": 10785 + }, + { + "epoch": 3.3106200122774707, + "grad_norm": 0.27304747700691223, + "learning_rate": 7.80614812022536e-05, + "loss": 1.8182, + "step": 10786 + }, + { + "epoch": 3.310926949048496, + "grad_norm": 
0.28378555178642273, + "learning_rate": 7.805736712932594e-05, + "loss": 1.8519, + "step": 10787 + }, + { + "epoch": 3.3112338858195214, + "grad_norm": 0.30620133876800537, + "learning_rate": 7.805325277911892e-05, + "loss": 1.8594, + "step": 10788 + }, + { + "epoch": 3.3115408225905463, + "grad_norm": 0.2580169141292572, + "learning_rate": 7.804913815167325e-05, + "loss": 1.7897, + "step": 10789 + }, + { + "epoch": 3.3118477593615716, + "grad_norm": 0.28937023878097534, + "learning_rate": 7.804502324702951e-05, + "loss": 1.8362, + "step": 10790 + }, + { + "epoch": 3.3121546961325965, + "grad_norm": 0.28032705187797546, + "learning_rate": 7.804090806522844e-05, + "loss": 1.8168, + "step": 10791 + }, + { + "epoch": 3.312461632903622, + "grad_norm": 0.33712559938430786, + "learning_rate": 7.803679260631069e-05, + "loss": 1.7489, + "step": 10792 + }, + { + "epoch": 3.312768569674647, + "grad_norm": 0.40536820888519287, + "learning_rate": 7.80326768703169e-05, + "loss": 1.8413, + "step": 10793 + }, + { + "epoch": 3.313075506445672, + "grad_norm": 0.34967559576034546, + "learning_rate": 7.802856085728778e-05, + "loss": 1.8076, + "step": 10794 + }, + { + "epoch": 3.3133824432166974, + "grad_norm": 0.2429870367050171, + "learning_rate": 7.8024444567264e-05, + "loss": 1.8002, + "step": 10795 + }, + { + "epoch": 3.3136893799877223, + "grad_norm": 0.40956684947013855, + "learning_rate": 7.802032800028621e-05, + "loss": 1.8151, + "step": 10796 + }, + { + "epoch": 3.3139963167587476, + "grad_norm": 0.4908781945705414, + "learning_rate": 7.801621115639512e-05, + "loss": 1.8124, + "step": 10797 + }, + { + "epoch": 3.314303253529773, + "grad_norm": 0.3922197222709656, + "learning_rate": 7.801209403563143e-05, + "loss": 1.7911, + "step": 10798 + }, + { + "epoch": 3.314610190300798, + "grad_norm": 0.29467105865478516, + "learning_rate": 7.800797663803578e-05, + "loss": 1.8472, + "step": 10799 + }, + { + "epoch": 3.314917127071823, + "grad_norm": 0.384974867105484, + 
"learning_rate": 7.800385896364891e-05, + "loss": 1.8139, + "step": 10800 + }, + { + "epoch": 3.3152240638428485, + "grad_norm": 0.4605129063129425, + "learning_rate": 7.79997410125115e-05, + "loss": 1.7982, + "step": 10801 + }, + { + "epoch": 3.3155310006138734, + "grad_norm": 0.2982464134693146, + "learning_rate": 7.799562278466423e-05, + "loss": 1.8496, + "step": 10802 + }, + { + "epoch": 3.3158379373848987, + "grad_norm": 0.3101392984390259, + "learning_rate": 7.79915042801478e-05, + "loss": 1.8172, + "step": 10803 + }, + { + "epoch": 3.316144874155924, + "grad_norm": 0.3651282489299774, + "learning_rate": 7.798738549900292e-05, + "loss": 1.7497, + "step": 10804 + }, + { + "epoch": 3.316451810926949, + "grad_norm": 0.28504419326782227, + "learning_rate": 7.79832664412703e-05, + "loss": 1.8027, + "step": 10805 + }, + { + "epoch": 3.3167587476979743, + "grad_norm": 0.28333309292793274, + "learning_rate": 7.797914710699063e-05, + "loss": 1.8121, + "step": 10806 + }, + { + "epoch": 3.317065684468999, + "grad_norm": 0.37549784779548645, + "learning_rate": 7.797502749620462e-05, + "loss": 1.817, + "step": 10807 + }, + { + "epoch": 3.3173726212400245, + "grad_norm": 0.3864210844039917, + "learning_rate": 7.797090760895301e-05, + "loss": 1.852, + "step": 10808 + }, + { + "epoch": 3.31767955801105, + "grad_norm": 0.2422102987766266, + "learning_rate": 7.79667874452765e-05, + "loss": 1.7523, + "step": 10809 + }, + { + "epoch": 3.3179864947820747, + "grad_norm": 0.307892382144928, + "learning_rate": 7.79626670052158e-05, + "loss": 1.7436, + "step": 10810 + }, + { + "epoch": 3.3182934315531, + "grad_norm": 0.29607462882995605, + "learning_rate": 7.795854628881162e-05, + "loss": 1.768, + "step": 10811 + }, + { + "epoch": 3.3186003683241254, + "grad_norm": 0.23334427177906036, + "learning_rate": 7.795442529610471e-05, + "loss": 1.7687, + "step": 10812 + }, + { + "epoch": 3.3189073050951503, + "grad_norm": 0.26257455348968506, + "learning_rate": 7.795030402713578e-05, + 
"loss": 1.8266, + "step": 10813 + }, + { + "epoch": 3.3192142418661756, + "grad_norm": 0.3252788782119751, + "learning_rate": 7.794618248194556e-05, + "loss": 1.8645, + "step": 10814 + }, + { + "epoch": 3.319521178637201, + "grad_norm": 0.3807232975959778, + "learning_rate": 7.79420606605748e-05, + "loss": 1.8154, + "step": 10815 + }, + { + "epoch": 3.319828115408226, + "grad_norm": 0.3395625948905945, + "learning_rate": 7.793793856306422e-05, + "loss": 1.8002, + "step": 10816 + }, + { + "epoch": 3.320135052179251, + "grad_norm": 0.2896415889263153, + "learning_rate": 7.793381618945455e-05, + "loss": 1.8077, + "step": 10817 + }, + { + "epoch": 3.320441988950276, + "grad_norm": 0.27733489871025085, + "learning_rate": 7.792969353978652e-05, + "loss": 1.7976, + "step": 10818 + }, + { + "epoch": 3.3207489257213014, + "grad_norm": 0.36985141038894653, + "learning_rate": 7.79255706141009e-05, + "loss": 1.8724, + "step": 10819 + }, + { + "epoch": 3.3210558624923268, + "grad_norm": 0.37886983156204224, + "learning_rate": 7.792144741243843e-05, + "loss": 1.8249, + "step": 10820 + }, + { + "epoch": 3.3213627992633517, + "grad_norm": 0.3030721843242645, + "learning_rate": 7.791732393483986e-05, + "loss": 1.7975, + "step": 10821 + }, + { + "epoch": 3.321669736034377, + "grad_norm": 0.2637709081172943, + "learning_rate": 7.791320018134592e-05, + "loss": 1.7205, + "step": 10822 + }, + { + "epoch": 3.321976672805402, + "grad_norm": 0.35307520627975464, + "learning_rate": 7.790907615199736e-05, + "loss": 1.8786, + "step": 10823 + }, + { + "epoch": 3.322283609576427, + "grad_norm": 0.3333272635936737, + "learning_rate": 7.790495184683497e-05, + "loss": 1.7715, + "step": 10824 + }, + { + "epoch": 3.3225905463474525, + "grad_norm": 0.2597469091415405, + "learning_rate": 7.790082726589948e-05, + "loss": 1.8379, + "step": 10825 + }, + { + "epoch": 3.3228974831184774, + "grad_norm": 0.34176257252693176, + "learning_rate": 7.789670240923168e-05, + "loss": 1.8305, + "step": 10826 + }, + { 
+ "epoch": 3.3232044198895028, + "grad_norm": 0.37954533100128174, + "learning_rate": 7.789257727687229e-05, + "loss": 1.7728, + "step": 10827 + }, + { + "epoch": 3.323511356660528, + "grad_norm": 0.2840248644351959, + "learning_rate": 7.788845186886212e-05, + "loss": 1.8059, + "step": 10828 + }, + { + "epoch": 3.323818293431553, + "grad_norm": 0.3650275766849518, + "learning_rate": 7.788432618524193e-05, + "loss": 1.8127, + "step": 10829 + }, + { + "epoch": 3.3241252302025783, + "grad_norm": 0.4869692623615265, + "learning_rate": 7.788020022605247e-05, + "loss": 1.833, + "step": 10830 + }, + { + "epoch": 3.3244321669736037, + "grad_norm": 0.3419482707977295, + "learning_rate": 7.787607399133453e-05, + "loss": 1.7812, + "step": 10831 + }, + { + "epoch": 3.3247391037446286, + "grad_norm": 0.27625617384910583, + "learning_rate": 7.787194748112889e-05, + "loss": 1.8513, + "step": 10832 + }, + { + "epoch": 3.325046040515654, + "grad_norm": 0.4287806749343872, + "learning_rate": 7.786782069547633e-05, + "loss": 1.836, + "step": 10833 + }, + { + "epoch": 3.325352977286679, + "grad_norm": 0.4345545172691345, + "learning_rate": 7.786369363441763e-05, + "loss": 1.8027, + "step": 10834 + }, + { + "epoch": 3.325659914057704, + "grad_norm": 0.32976534962654114, + "learning_rate": 7.78595662979936e-05, + "loss": 1.7987, + "step": 10835 + }, + { + "epoch": 3.3259668508287294, + "grad_norm": 0.2677469849586487, + "learning_rate": 7.785543868624498e-05, + "loss": 1.8312, + "step": 10836 + }, + { + "epoch": 3.3262737875997543, + "grad_norm": 0.2547740638256073, + "learning_rate": 7.785131079921259e-05, + "loss": 1.7844, + "step": 10837 + }, + { + "epoch": 3.3265807243707797, + "grad_norm": 0.26755592226982117, + "learning_rate": 7.784718263693725e-05, + "loss": 1.8263, + "step": 10838 + }, + { + "epoch": 3.3268876611418046, + "grad_norm": 0.23884403705596924, + "learning_rate": 7.784305419945969e-05, + "loss": 1.7862, + "step": 10839 + }, + { + "epoch": 3.32719459791283, + 
"grad_norm": 0.2896903157234192, + "learning_rate": 7.783892548682077e-05, + "loss": 1.9138, + "step": 10840 + }, + { + "epoch": 3.3275015346838552, + "grad_norm": 0.3201359510421753, + "learning_rate": 7.783479649906127e-05, + "loss": 1.8382, + "step": 10841 + }, + { + "epoch": 3.32780847145488, + "grad_norm": 0.39285311102867126, + "learning_rate": 7.7830667236222e-05, + "loss": 1.7763, + "step": 10842 + }, + { + "epoch": 3.3281154082259055, + "grad_norm": 0.435007244348526, + "learning_rate": 7.782653769834376e-05, + "loss": 1.8415, + "step": 10843 + }, + { + "epoch": 3.328422344996931, + "grad_norm": 0.34605318307876587, + "learning_rate": 7.782240788546736e-05, + "loss": 1.757, + "step": 10844 + }, + { + "epoch": 3.3287292817679557, + "grad_norm": 0.26830604672431946, + "learning_rate": 7.781827779763362e-05, + "loss": 1.7779, + "step": 10845 + }, + { + "epoch": 3.329036218538981, + "grad_norm": 0.41851529479026794, + "learning_rate": 7.781414743488336e-05, + "loss": 1.8609, + "step": 10846 + }, + { + "epoch": 3.3293431553100064, + "grad_norm": 0.5058079361915588, + "learning_rate": 7.78100167972574e-05, + "loss": 1.8146, + "step": 10847 + }, + { + "epoch": 3.3296500920810312, + "grad_norm": 0.34394967555999756, + "learning_rate": 7.780588588479654e-05, + "loss": 1.8079, + "step": 10848 + }, + { + "epoch": 3.3299570288520566, + "grad_norm": 0.3033885061740875, + "learning_rate": 7.780175469754161e-05, + "loss": 1.8223, + "step": 10849 + }, + { + "epoch": 3.3302639656230815, + "grad_norm": 0.4431045651435852, + "learning_rate": 7.779762323553347e-05, + "loss": 1.8841, + "step": 10850 + }, + { + "epoch": 3.330570902394107, + "grad_norm": 0.3451448976993561, + "learning_rate": 7.77934914988129e-05, + "loss": 1.8092, + "step": 10851 + }, + { + "epoch": 3.330877839165132, + "grad_norm": 0.26580891013145447, + "learning_rate": 7.778935948742077e-05, + "loss": 1.8244, + "step": 10852 + }, + { + "epoch": 3.331184775936157, + "grad_norm": 0.32079070806503296, + 
"learning_rate": 7.778522720139792e-05, + "loss": 1.7816, + "step": 10853 + }, + { + "epoch": 3.3314917127071824, + "grad_norm": 0.35789042711257935, + "learning_rate": 7.778109464078514e-05, + "loss": 1.8211, + "step": 10854 + }, + { + "epoch": 3.3317986494782073, + "grad_norm": 0.2808612585067749, + "learning_rate": 7.77769618056233e-05, + "loss": 1.8387, + "step": 10855 + }, + { + "epoch": 3.3321055862492326, + "grad_norm": 0.24760548770427704, + "learning_rate": 7.777282869595326e-05, + "loss": 1.7795, + "step": 10856 + }, + { + "epoch": 3.332412523020258, + "grad_norm": 0.2840912640094757, + "learning_rate": 7.776869531181583e-05, + "loss": 1.7492, + "step": 10857 + }, + { + "epoch": 3.332719459791283, + "grad_norm": 0.2881413698196411, + "learning_rate": 7.77645616532519e-05, + "loss": 1.8157, + "step": 10858 + }, + { + "epoch": 3.333026396562308, + "grad_norm": 0.2508779764175415, + "learning_rate": 7.776042772030228e-05, + "loss": 1.8196, + "step": 10859 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3307822048664093, + "learning_rate": 7.775629351300785e-05, + "loss": 1.8195, + "step": 10860 + }, + { + "epoch": 3.3336402701043584, + "grad_norm": 0.34392043948173523, + "learning_rate": 7.775215903140946e-05, + "loss": 1.7775, + "step": 10861 + }, + { + "epoch": 3.3339472068753837, + "grad_norm": 0.2594252824783325, + "learning_rate": 7.774802427554796e-05, + "loss": 1.7687, + "step": 10862 + }, + { + "epoch": 3.334254143646409, + "grad_norm": 0.3109053075313568, + "learning_rate": 7.774388924546423e-05, + "loss": 1.7908, + "step": 10863 + }, + { + "epoch": 3.334561080417434, + "grad_norm": 0.4801923930644989, + "learning_rate": 7.773975394119913e-05, + "loss": 1.8316, + "step": 10864 + }, + { + "epoch": 3.3348680171884593, + "grad_norm": 0.4754973351955414, + "learning_rate": 7.77356183627935e-05, + "loss": 1.8015, + "step": 10865 + }, + { + "epoch": 3.335174953959484, + "grad_norm": 0.29624658823013306, + "learning_rate": 7.773148251028825e-05, + 
"loss": 1.8179, + "step": 10866 + }, + { + "epoch": 3.3354818907305095, + "grad_norm": 0.32207581400871277, + "learning_rate": 7.772734638372423e-05, + "loss": 1.799, + "step": 10867 + }, + { + "epoch": 3.335788827501535, + "grad_norm": 0.5227517485618591, + "learning_rate": 7.772320998314233e-05, + "loss": 1.8452, + "step": 10868 + }, + { + "epoch": 3.3360957642725597, + "grad_norm": 0.4081100523471832, + "learning_rate": 7.771907330858341e-05, + "loss": 1.8182, + "step": 10869 + }, + { + "epoch": 3.336402701043585, + "grad_norm": 0.23786653578281403, + "learning_rate": 7.771493636008838e-05, + "loss": 1.7392, + "step": 10870 + }, + { + "epoch": 3.33670963781461, + "grad_norm": 0.37913820147514343, + "learning_rate": 7.771079913769807e-05, + "loss": 1.7559, + "step": 10871 + }, + { + "epoch": 3.3370165745856353, + "grad_norm": 0.4939163625240326, + "learning_rate": 7.770666164145344e-05, + "loss": 1.8076, + "step": 10872 + }, + { + "epoch": 3.3373235113566606, + "grad_norm": 0.3322528302669525, + "learning_rate": 7.770252387139532e-05, + "loss": 1.8045, + "step": 10873 + }, + { + "epoch": 3.337630448127686, + "grad_norm": 0.3685782849788666, + "learning_rate": 7.769838582756461e-05, + "loss": 1.7703, + "step": 10874 + }, + { + "epoch": 3.337937384898711, + "grad_norm": 0.5564271807670593, + "learning_rate": 7.769424751000224e-05, + "loss": 1.7697, + "step": 10875 + }, + { + "epoch": 3.338244321669736, + "grad_norm": 0.38610726594924927, + "learning_rate": 7.769010891874906e-05, + "loss": 1.7944, + "step": 10876 + }, + { + "epoch": 3.338551258440761, + "grad_norm": 0.23838558793067932, + "learning_rate": 7.768597005384602e-05, + "loss": 1.765, + "step": 10877 + }, + { + "epoch": 3.3388581952117864, + "grad_norm": 0.4334571063518524, + "learning_rate": 7.768183091533399e-05, + "loss": 1.7854, + "step": 10878 + }, + { + "epoch": 3.3391651319828117, + "grad_norm": 0.44844719767570496, + "learning_rate": 7.767769150325386e-05, + "loss": 1.7955, + "step": 10879 + }, + { 
+ "epoch": 3.3394720687538366, + "grad_norm": 0.26543378829956055, + "learning_rate": 7.767355181764659e-05, + "loss": 1.8311, + "step": 10880 + }, + { + "epoch": 3.339779005524862, + "grad_norm": 0.39401358366012573, + "learning_rate": 7.766941185855304e-05, + "loss": 1.8264, + "step": 10881 + }, + { + "epoch": 3.340085942295887, + "grad_norm": 0.5476824045181274, + "learning_rate": 7.766527162601416e-05, + "loss": 1.8051, + "step": 10882 + }, + { + "epoch": 3.340392879066912, + "grad_norm": 0.4021138548851013, + "learning_rate": 7.766113112007084e-05, + "loss": 1.7941, + "step": 10883 + }, + { + "epoch": 3.3406998158379375, + "grad_norm": 0.3262040317058563, + "learning_rate": 7.765699034076402e-05, + "loss": 1.8317, + "step": 10884 + }, + { + "epoch": 3.3410067526089624, + "grad_norm": 0.5461146831512451, + "learning_rate": 7.765284928813459e-05, + "loss": 1.833, + "step": 10885 + }, + { + "epoch": 3.3413136893799877, + "grad_norm": 0.5067405700683594, + "learning_rate": 7.764870796222351e-05, + "loss": 1.7862, + "step": 10886 + }, + { + "epoch": 3.341620626151013, + "grad_norm": 0.2731069028377533, + "learning_rate": 7.76445663630717e-05, + "loss": 1.8173, + "step": 10887 + }, + { + "epoch": 3.341927562922038, + "grad_norm": 0.48928195238113403, + "learning_rate": 7.764042449072008e-05, + "loss": 1.7992, + "step": 10888 + }, + { + "epoch": 3.3422344996930633, + "grad_norm": 0.5338504910469055, + "learning_rate": 7.763628234520958e-05, + "loss": 1.7891, + "step": 10889 + }, + { + "epoch": 3.3425414364640886, + "grad_norm": 0.3136523365974426, + "learning_rate": 7.763213992658114e-05, + "loss": 1.8623, + "step": 10890 + }, + { + "epoch": 3.3428483732351135, + "grad_norm": 0.36551395058631897, + "learning_rate": 7.762799723487568e-05, + "loss": 1.8474, + "step": 10891 + }, + { + "epoch": 3.343155310006139, + "grad_norm": 0.35772353410720825, + "learning_rate": 7.762385427013419e-05, + "loss": 1.8625, + "step": 10892 + }, + { + "epoch": 3.3434622467771637, + 
"grad_norm": 0.29944708943367004, + "learning_rate": 7.761971103239755e-05, + "loss": 1.8181, + "step": 10893 + }, + { + "epoch": 3.343769183548189, + "grad_norm": 0.3395330309867859, + "learning_rate": 7.761556752170676e-05, + "loss": 1.7943, + "step": 10894 + }, + { + "epoch": 3.3440761203192144, + "grad_norm": 0.3624265193939209, + "learning_rate": 7.761142373810274e-05, + "loss": 1.8234, + "step": 10895 + }, + { + "epoch": 3.3443830570902393, + "grad_norm": 0.25409621000289917, + "learning_rate": 7.760727968162644e-05, + "loss": 1.7532, + "step": 10896 + }, + { + "epoch": 3.3446899938612646, + "grad_norm": 0.321437805891037, + "learning_rate": 7.760313535231883e-05, + "loss": 1.8808, + "step": 10897 + }, + { + "epoch": 3.3449969306322895, + "grad_norm": 0.2919142544269562, + "learning_rate": 7.759899075022086e-05, + "loss": 1.7677, + "step": 10898 + }, + { + "epoch": 3.345303867403315, + "grad_norm": 0.26515716314315796, + "learning_rate": 7.759484587537346e-05, + "loss": 1.8118, + "step": 10899 + }, + { + "epoch": 3.34561080417434, + "grad_norm": 0.2963240146636963, + "learning_rate": 7.759070072781764e-05, + "loss": 1.8329, + "step": 10900 + }, + { + "epoch": 3.345917740945365, + "grad_norm": 0.3186480700969696, + "learning_rate": 7.758655530759435e-05, + "loss": 1.8013, + "step": 10901 + }, + { + "epoch": 3.3462246777163904, + "grad_norm": 0.256145715713501, + "learning_rate": 7.758240961474454e-05, + "loss": 1.7865, + "step": 10902 + }, + { + "epoch": 3.3465316144874158, + "grad_norm": 0.28951629996299744, + "learning_rate": 7.757826364930921e-05, + "loss": 1.8091, + "step": 10903 + }, + { + "epoch": 3.3468385512584407, + "grad_norm": 0.2692483365535736, + "learning_rate": 7.75741174113293e-05, + "loss": 1.8308, + "step": 10904 + }, + { + "epoch": 3.347145488029466, + "grad_norm": 0.27615389227867126, + "learning_rate": 7.75699709008458e-05, + "loss": 1.7888, + "step": 10905 + }, + { + "epoch": 3.3474524248004913, + "grad_norm": 0.2819034457206726, + 
"learning_rate": 7.75658241178997e-05, + "loss": 1.7624, + "step": 10906 + }, + { + "epoch": 3.347759361571516, + "grad_norm": 0.2627592086791992, + "learning_rate": 7.756167706253196e-05, + "loss": 1.7696, + "step": 10907 + }, + { + "epoch": 3.3480662983425415, + "grad_norm": 0.3528621196746826, + "learning_rate": 7.755752973478356e-05, + "loss": 1.7725, + "step": 10908 + }, + { + "epoch": 3.3483732351135664, + "grad_norm": 0.35949698090553284, + "learning_rate": 7.755338213469552e-05, + "loss": 1.8163, + "step": 10909 + }, + { + "epoch": 3.3486801718845918, + "grad_norm": 0.25142577290534973, + "learning_rate": 7.75492342623088e-05, + "loss": 1.7879, + "step": 10910 + }, + { + "epoch": 3.348987108655617, + "grad_norm": 0.25766023993492126, + "learning_rate": 7.75450861176644e-05, + "loss": 1.8143, + "step": 10911 + }, + { + "epoch": 3.349294045426642, + "grad_norm": 0.2736956477165222, + "learning_rate": 7.754093770080331e-05, + "loss": 1.8907, + "step": 10912 + }, + { + "epoch": 3.3496009821976673, + "grad_norm": 0.23700755834579468, + "learning_rate": 7.753678901176654e-05, + "loss": 1.813, + "step": 10913 + }, + { + "epoch": 3.349907918968692, + "grad_norm": 0.245509073138237, + "learning_rate": 7.753264005059507e-05, + "loss": 1.8019, + "step": 10914 + }, + { + "epoch": 3.3502148557397176, + "grad_norm": 0.232910618185997, + "learning_rate": 7.752849081732993e-05, + "loss": 1.784, + "step": 10915 + }, + { + "epoch": 3.350521792510743, + "grad_norm": 0.22989360988140106, + "learning_rate": 7.75243413120121e-05, + "loss": 1.7597, + "step": 10916 + }, + { + "epoch": 3.350828729281768, + "grad_norm": 0.2093925178050995, + "learning_rate": 7.752019153468258e-05, + "loss": 1.7698, + "step": 10917 + }, + { + "epoch": 3.351135666052793, + "grad_norm": 0.25539630651474, + "learning_rate": 7.751604148538241e-05, + "loss": 1.8287, + "step": 10918 + }, + { + "epoch": 3.3514426028238185, + "grad_norm": 0.2731820046901703, + "learning_rate": 7.75118911641526e-05, + "loss": 
1.8862, + "step": 10919 + }, + { + "epoch": 3.3517495395948433, + "grad_norm": 0.2464541345834732, + "learning_rate": 7.750774057103416e-05, + "loss": 1.8165, + "step": 10920 + }, + { + "epoch": 3.3520564763658687, + "grad_norm": 0.26380276679992676, + "learning_rate": 7.75035897060681e-05, + "loss": 1.78, + "step": 10921 + }, + { + "epoch": 3.352363413136894, + "grad_norm": 0.3080748915672302, + "learning_rate": 7.749943856929542e-05, + "loss": 1.7925, + "step": 10922 + }, + { + "epoch": 3.352670349907919, + "grad_norm": 0.317754864692688, + "learning_rate": 7.74952871607572e-05, + "loss": 1.8248, + "step": 10923 + }, + { + "epoch": 3.3529772866789442, + "grad_norm": 0.2525196373462677, + "learning_rate": 7.749113548049442e-05, + "loss": 1.762, + "step": 10924 + }, + { + "epoch": 3.353284223449969, + "grad_norm": 0.3149549961090088, + "learning_rate": 7.748698352854814e-05, + "loss": 1.8289, + "step": 10925 + }, + { + "epoch": 3.3535911602209945, + "grad_norm": 0.35744383931159973, + "learning_rate": 7.748283130495937e-05, + "loss": 1.8132, + "step": 10926 + }, + { + "epoch": 3.35389809699202, + "grad_norm": 0.28599128127098083, + "learning_rate": 7.747867880976916e-05, + "loss": 1.7351, + "step": 10927 + }, + { + "epoch": 3.3542050337630447, + "grad_norm": 0.24428869783878326, + "learning_rate": 7.747452604301852e-05, + "loss": 1.794, + "step": 10928 + }, + { + "epoch": 3.35451197053407, + "grad_norm": 0.29067808389663696, + "learning_rate": 7.747037300474854e-05, + "loss": 1.8181, + "step": 10929 + }, + { + "epoch": 3.354818907305095, + "grad_norm": 0.32417505979537964, + "learning_rate": 7.746621969500021e-05, + "loss": 1.8338, + "step": 10930 + }, + { + "epoch": 3.3551258440761202, + "grad_norm": 0.29536551237106323, + "learning_rate": 7.746206611381462e-05, + "loss": 1.8732, + "step": 10931 + }, + { + "epoch": 3.3554327808471456, + "grad_norm": 0.3169345259666443, + "learning_rate": 7.745791226123278e-05, + "loss": 1.876, + "step": 10932 + }, + { + "epoch": 
3.3557397176181705, + "grad_norm": 0.2680271565914154, + "learning_rate": 7.745375813729576e-05, + "loss": 1.7347, + "step": 10933 + }, + { + "epoch": 3.356046654389196, + "grad_norm": 0.28339266777038574, + "learning_rate": 7.74496037420446e-05, + "loss": 1.8507, + "step": 10934 + }, + { + "epoch": 3.356353591160221, + "grad_norm": 0.2567409574985504, + "learning_rate": 7.744544907552038e-05, + "loss": 1.8244, + "step": 10935 + }, + { + "epoch": 3.356660527931246, + "grad_norm": 0.266063928604126, + "learning_rate": 7.744129413776416e-05, + "loss": 1.7864, + "step": 10936 + }, + { + "epoch": 3.3569674647022714, + "grad_norm": 0.2490999698638916, + "learning_rate": 7.743713892881696e-05, + "loss": 1.7637, + "step": 10937 + }, + { + "epoch": 3.3572744014732967, + "grad_norm": 0.25857025384902954, + "learning_rate": 7.743298344871988e-05, + "loss": 1.8101, + "step": 10938 + }, + { + "epoch": 3.3575813382443216, + "grad_norm": 0.2549006938934326, + "learning_rate": 7.742882769751398e-05, + "loss": 1.7782, + "step": 10939 + }, + { + "epoch": 3.357888275015347, + "grad_norm": 0.23915350437164307, + "learning_rate": 7.742467167524035e-05, + "loss": 1.7822, + "step": 10940 + }, + { + "epoch": 3.358195211786372, + "grad_norm": 0.25501590967178345, + "learning_rate": 7.742051538194e-05, + "loss": 1.798, + "step": 10941 + }, + { + "epoch": 3.358502148557397, + "grad_norm": 0.29332005977630615, + "learning_rate": 7.741635881765408e-05, + "loss": 1.8334, + "step": 10942 + }, + { + "epoch": 3.3588090853284225, + "grad_norm": 0.28878241777420044, + "learning_rate": 7.741220198242362e-05, + "loss": 1.8266, + "step": 10943 + }, + { + "epoch": 3.3591160220994474, + "grad_norm": 0.3068650960922241, + "learning_rate": 7.740804487628971e-05, + "loss": 1.8562, + "step": 10944 + }, + { + "epoch": 3.3594229588704727, + "grad_norm": 0.2522405683994293, + "learning_rate": 7.740388749929343e-05, + "loss": 1.8001, + "step": 10945 + }, + { + "epoch": 3.359729895641498, + "grad_norm": 
0.3073521554470062, + "learning_rate": 7.739972985147588e-05, + "loss": 1.7454, + "step": 10946 + }, + { + "epoch": 3.360036832412523, + "grad_norm": 0.3018052577972412, + "learning_rate": 7.739557193287815e-05, + "loss": 1.7888, + "step": 10947 + }, + { + "epoch": 3.3603437691835483, + "grad_norm": 0.2738604247570038, + "learning_rate": 7.73914137435413e-05, + "loss": 1.7208, + "step": 10948 + }, + { + "epoch": 3.3606507059545736, + "grad_norm": 0.37699586153030396, + "learning_rate": 7.738725528350646e-05, + "loss": 1.8175, + "step": 10949 + }, + { + "epoch": 3.3609576427255985, + "grad_norm": 0.3479778468608856, + "learning_rate": 7.738309655281471e-05, + "loss": 1.818, + "step": 10950 + }, + { + "epoch": 3.361264579496624, + "grad_norm": 0.24871166050434113, + "learning_rate": 7.737893755150715e-05, + "loss": 1.7046, + "step": 10951 + }, + { + "epoch": 3.3615715162676487, + "grad_norm": 0.45015642046928406, + "learning_rate": 7.737477827962488e-05, + "loss": 1.8517, + "step": 10952 + }, + { + "epoch": 3.361878453038674, + "grad_norm": 0.4149077534675598, + "learning_rate": 7.7370618737209e-05, + "loss": 1.7403, + "step": 10953 + }, + { + "epoch": 3.3621853898096994, + "grad_norm": 0.2556059658527374, + "learning_rate": 7.736645892430064e-05, + "loss": 1.8167, + "step": 10954 + }, + { + "epoch": 3.3624923265807243, + "grad_norm": 0.3153657615184784, + "learning_rate": 7.736229884094088e-05, + "loss": 1.8471, + "step": 10955 + }, + { + "epoch": 3.3627992633517496, + "grad_norm": 0.27943772077560425, + "learning_rate": 7.735813848717084e-05, + "loss": 1.7742, + "step": 10956 + }, + { + "epoch": 3.3631062001227745, + "grad_norm": 0.28270283341407776, + "learning_rate": 7.735397786303164e-05, + "loss": 1.8418, + "step": 10957 + }, + { + "epoch": 3.3634131368938, + "grad_norm": 0.3596261441707611, + "learning_rate": 7.734981696856442e-05, + "loss": 1.8213, + "step": 10958 + }, + { + "epoch": 3.363720073664825, + "grad_norm": 0.3678492307662964, + "learning_rate": 
7.734565580381026e-05, + "loss": 1.806, + "step": 10959 + }, + { + "epoch": 3.36402701043585, + "grad_norm": 0.27758681774139404, + "learning_rate": 7.734149436881031e-05, + "loss": 1.7832, + "step": 10960 + }, + { + "epoch": 3.3643339472068754, + "grad_norm": 0.2821379005908966, + "learning_rate": 7.733733266360568e-05, + "loss": 1.8888, + "step": 10961 + }, + { + "epoch": 3.3646408839779007, + "grad_norm": 0.33676958084106445, + "learning_rate": 7.733317068823751e-05, + "loss": 1.902, + "step": 10962 + }, + { + "epoch": 3.3649478207489256, + "grad_norm": 0.3116114139556885, + "learning_rate": 7.732900844274691e-05, + "loss": 1.8228, + "step": 10963 + }, + { + "epoch": 3.365254757519951, + "grad_norm": 0.3286324143409729, + "learning_rate": 7.732484592717506e-05, + "loss": 1.8707, + "step": 10964 + }, + { + "epoch": 3.3655616942909763, + "grad_norm": 0.2732192873954773, + "learning_rate": 7.732068314156304e-05, + "loss": 1.773, + "step": 10965 + }, + { + "epoch": 3.365868631062001, + "grad_norm": 0.26663896441459656, + "learning_rate": 7.731652008595204e-05, + "loss": 1.7837, + "step": 10966 + }, + { + "epoch": 3.3661755678330265, + "grad_norm": 0.27447745203971863, + "learning_rate": 7.731235676038317e-05, + "loss": 1.9103, + "step": 10967 + }, + { + "epoch": 3.3664825046040514, + "grad_norm": 0.30832916498184204, + "learning_rate": 7.730819316489757e-05, + "loss": 1.7552, + "step": 10968 + }, + { + "epoch": 3.3667894413750767, + "grad_norm": 0.29657161235809326, + "learning_rate": 7.73040292995364e-05, + "loss": 1.7654, + "step": 10969 + }, + { + "epoch": 3.367096378146102, + "grad_norm": 0.30434274673461914, + "learning_rate": 7.729986516434082e-05, + "loss": 1.8646, + "step": 10970 + }, + { + "epoch": 3.367403314917127, + "grad_norm": 0.25926661491394043, + "learning_rate": 7.729570075935198e-05, + "loss": 1.7555, + "step": 10971 + }, + { + "epoch": 3.3677102516881523, + "grad_norm": 0.2775980532169342, + "learning_rate": 7.729153608461102e-05, + "loss": 
1.8427, + "step": 10972 + }, + { + "epoch": 3.368017188459177, + "grad_norm": 0.23915666341781616, + "learning_rate": 7.72873711401591e-05, + "loss": 1.7902, + "step": 10973 + }, + { + "epoch": 3.3683241252302025, + "grad_norm": 0.2603691518306732, + "learning_rate": 7.728320592603737e-05, + "loss": 1.8587, + "step": 10974 + }, + { + "epoch": 3.368631062001228, + "grad_norm": 0.2579508125782013, + "learning_rate": 7.727904044228703e-05, + "loss": 1.7617, + "step": 10975 + }, + { + "epoch": 3.3689379987722528, + "grad_norm": 0.3384297788143158, + "learning_rate": 7.72748746889492e-05, + "loss": 1.8499, + "step": 10976 + }, + { + "epoch": 3.369244935543278, + "grad_norm": 0.36756646633148193, + "learning_rate": 7.727070866606509e-05, + "loss": 1.808, + "step": 10977 + }, + { + "epoch": 3.3695518723143034, + "grad_norm": 0.3212372958660126, + "learning_rate": 7.726654237367587e-05, + "loss": 1.8245, + "step": 10978 + }, + { + "epoch": 3.3698588090853283, + "grad_norm": 0.23782415688037872, + "learning_rate": 7.726237581182267e-05, + "loss": 1.7629, + "step": 10979 + }, + { + "epoch": 3.3701657458563536, + "grad_norm": 0.2782919108867645, + "learning_rate": 7.725820898054669e-05, + "loss": 1.8, + "step": 10980 + }, + { + "epoch": 3.370472682627379, + "grad_norm": 0.2973455488681793, + "learning_rate": 7.725404187988914e-05, + "loss": 1.7949, + "step": 10981 + }, + { + "epoch": 3.370779619398404, + "grad_norm": 0.2875392735004425, + "learning_rate": 7.724987450989114e-05, + "loss": 1.8019, + "step": 10982 + }, + { + "epoch": 3.371086556169429, + "grad_norm": 0.26133236289024353, + "learning_rate": 7.724570687059394e-05, + "loss": 1.7984, + "step": 10983 + }, + { + "epoch": 3.371393492940454, + "grad_norm": 0.2760173976421356, + "learning_rate": 7.724153896203867e-05, + "loss": 1.8082, + "step": 10984 + }, + { + "epoch": 3.3717004297114794, + "grad_norm": 0.26373061537742615, + "learning_rate": 7.723737078426656e-05, + "loss": 1.8408, + "step": 10985 + }, + { + "epoch": 
3.3720073664825048, + "grad_norm": 0.29425618052482605, + "learning_rate": 7.723320233731879e-05, + "loss": 1.7992, + "step": 10986 + }, + { + "epoch": 3.3723143032535297, + "grad_norm": 0.29822099208831787, + "learning_rate": 7.722903362123655e-05, + "loss": 1.8204, + "step": 10987 + }, + { + "epoch": 3.372621240024555, + "grad_norm": 0.25945618748664856, + "learning_rate": 7.722486463606104e-05, + "loss": 1.7376, + "step": 10988 + }, + { + "epoch": 3.37292817679558, + "grad_norm": 0.26367196440696716, + "learning_rate": 7.722069538183345e-05, + "loss": 1.814, + "step": 10989 + }, + { + "epoch": 3.373235113566605, + "grad_norm": 0.25015249848365784, + "learning_rate": 7.7216525858595e-05, + "loss": 1.8199, + "step": 10990 + }, + { + "epoch": 3.3735420503376305, + "grad_norm": 0.3035781681537628, + "learning_rate": 7.72123560663869e-05, + "loss": 1.739, + "step": 10991 + }, + { + "epoch": 3.3738489871086554, + "grad_norm": 0.2847912013530731, + "learning_rate": 7.720818600525033e-05, + "loss": 1.8754, + "step": 10992 + }, + { + "epoch": 3.3741559238796808, + "grad_norm": 0.2533976435661316, + "learning_rate": 7.720401567522653e-05, + "loss": 1.7616, + "step": 10993 + }, + { + "epoch": 3.374462860650706, + "grad_norm": 0.250828355550766, + "learning_rate": 7.719984507635669e-05, + "loss": 1.7973, + "step": 10994 + }, + { + "epoch": 3.374769797421731, + "grad_norm": 0.3019898235797882, + "learning_rate": 7.719567420868206e-05, + "loss": 1.7563, + "step": 10995 + }, + { + "epoch": 3.3750767341927563, + "grad_norm": 0.2703310549259186, + "learning_rate": 7.719150307224382e-05, + "loss": 1.8183, + "step": 10996 + }, + { + "epoch": 3.3753836709637817, + "grad_norm": 0.2434745579957962, + "learning_rate": 7.718733166708321e-05, + "loss": 1.7913, + "step": 10997 + }, + { + "epoch": 3.3756906077348066, + "grad_norm": 0.28036773204803467, + "learning_rate": 7.718315999324146e-05, + "loss": 1.7884, + "step": 10998 + }, + { + "epoch": 3.375997544505832, + "grad_norm": 
0.25123077630996704, + "learning_rate": 7.717898805075978e-05, + "loss": 1.7394, + "step": 10999 + }, + { + "epoch": 3.376304481276857, + "grad_norm": 0.2313947230577469, + "learning_rate": 7.717481583967943e-05, + "loss": 1.7537, + "step": 11000 + }, + { + "epoch": 3.376611418047882, + "grad_norm": 0.27152860164642334, + "learning_rate": 7.71706433600416e-05, + "loss": 1.8596, + "step": 11001 + }, + { + "epoch": 3.3769183548189075, + "grad_norm": 0.32866382598876953, + "learning_rate": 7.716647061188757e-05, + "loss": 1.9007, + "step": 11002 + }, + { + "epoch": 3.3772252915899323, + "grad_norm": 0.2842368185520172, + "learning_rate": 7.716229759525854e-05, + "loss": 1.7781, + "step": 11003 + }, + { + "epoch": 3.3775322283609577, + "grad_norm": 0.30411216616630554, + "learning_rate": 7.715812431019576e-05, + "loss": 1.7403, + "step": 11004 + }, + { + "epoch": 3.3778391651319826, + "grad_norm": 0.31848132610321045, + "learning_rate": 7.71539507567405e-05, + "loss": 1.817, + "step": 11005 + }, + { + "epoch": 3.378146101903008, + "grad_norm": 0.24206148087978363, + "learning_rate": 7.714977693493397e-05, + "loss": 1.7796, + "step": 11006 + }, + { + "epoch": 3.3784530386740332, + "grad_norm": 0.2982998490333557, + "learning_rate": 7.714560284481742e-05, + "loss": 1.7883, + "step": 11007 + }, + { + "epoch": 3.378759975445058, + "grad_norm": 0.24857483804225922, + "learning_rate": 7.714142848643213e-05, + "loss": 1.7447, + "step": 11008 + }, + { + "epoch": 3.3790669122160835, + "grad_norm": 0.2509039044380188, + "learning_rate": 7.713725385981932e-05, + "loss": 1.8362, + "step": 11009 + }, + { + "epoch": 3.379373848987109, + "grad_norm": 0.2759779095649719, + "learning_rate": 7.713307896502027e-05, + "loss": 1.8655, + "step": 11010 + }, + { + "epoch": 3.3796807857581337, + "grad_norm": 0.264776349067688, + "learning_rate": 7.712890380207623e-05, + "loss": 1.8221, + "step": 11011 + }, + { + "epoch": 3.379987722529159, + "grad_norm": 0.2771971821784973, + "learning_rate": 
7.712472837102846e-05, + "loss": 1.6992, + "step": 11012 + }, + { + "epoch": 3.3802946593001844, + "grad_norm": 0.2749316096305847, + "learning_rate": 7.712055267191822e-05, + "loss": 1.8128, + "step": 11013 + }, + { + "epoch": 3.3806015960712092, + "grad_norm": 0.256656289100647, + "learning_rate": 7.71163767047868e-05, + "loss": 1.8382, + "step": 11014 + }, + { + "epoch": 3.3809085328422346, + "grad_norm": 0.27646976709365845, + "learning_rate": 7.711220046967545e-05, + "loss": 1.8321, + "step": 11015 + }, + { + "epoch": 3.3812154696132595, + "grad_norm": 0.3083149194717407, + "learning_rate": 7.710802396662542e-05, + "loss": 1.904, + "step": 11016 + }, + { + "epoch": 3.381522406384285, + "grad_norm": 0.2750856280326843, + "learning_rate": 7.710384719567803e-05, + "loss": 1.7596, + "step": 11017 + }, + { + "epoch": 3.38182934315531, + "grad_norm": 0.3029455244541168, + "learning_rate": 7.709967015687452e-05, + "loss": 1.8542, + "step": 11018 + }, + { + "epoch": 3.382136279926335, + "grad_norm": 0.3144093453884125, + "learning_rate": 7.709549285025622e-05, + "loss": 1.7489, + "step": 11019 + }, + { + "epoch": 3.3824432166973604, + "grad_norm": 0.2675442099571228, + "learning_rate": 7.709131527586433e-05, + "loss": 1.7324, + "step": 11020 + }, + { + "epoch": 3.3827501534683857, + "grad_norm": 0.2906095087528229, + "learning_rate": 7.708713743374021e-05, + "loss": 1.7848, + "step": 11021 + }, + { + "epoch": 3.3830570902394106, + "grad_norm": 0.25141623616218567, + "learning_rate": 7.708295932392513e-05, + "loss": 1.7423, + "step": 11022 + }, + { + "epoch": 3.383364027010436, + "grad_norm": 0.25832003355026245, + "learning_rate": 7.707878094646037e-05, + "loss": 1.7792, + "step": 11023 + }, + { + "epoch": 3.3836709637814613, + "grad_norm": 0.23710070550441742, + "learning_rate": 7.70746023013872e-05, + "loss": 1.7916, + "step": 11024 + }, + { + "epoch": 3.383977900552486, + "grad_norm": 0.286735862493515, + "learning_rate": 7.707042338874697e-05, + "loss": 1.8272, + 
"step": 11025 + }, + { + "epoch": 3.3842848373235115, + "grad_norm": 0.2536577582359314, + "learning_rate": 7.706624420858094e-05, + "loss": 1.7839, + "step": 11026 + }, + { + "epoch": 3.3845917740945364, + "grad_norm": 0.5564702749252319, + "learning_rate": 7.706206476093043e-05, + "loss": 1.7832, + "step": 11027 + }, + { + "epoch": 3.3848987108655617, + "grad_norm": 0.34694772958755493, + "learning_rate": 7.705788504583671e-05, + "loss": 1.8668, + "step": 11028 + }, + { + "epoch": 3.385205647636587, + "grad_norm": 0.30388176441192627, + "learning_rate": 7.705370506334113e-05, + "loss": 1.8244, + "step": 11029 + }, + { + "epoch": 3.385512584407612, + "grad_norm": 0.2998919188976288, + "learning_rate": 7.704952481348497e-05, + "loss": 1.7927, + "step": 11030 + }, + { + "epoch": 3.3858195211786373, + "grad_norm": 0.2714936435222626, + "learning_rate": 7.704534429630955e-05, + "loss": 1.8757, + "step": 11031 + }, + { + "epoch": 3.386126457949662, + "grad_norm": 0.26670241355895996, + "learning_rate": 7.704116351185619e-05, + "loss": 1.8146, + "step": 11032 + }, + { + "epoch": 3.3864333947206875, + "grad_norm": 0.2500552833080292, + "learning_rate": 7.703698246016621e-05, + "loss": 1.7984, + "step": 11033 + }, + { + "epoch": 3.386740331491713, + "grad_norm": 0.2494918406009674, + "learning_rate": 7.703280114128091e-05, + "loss": 1.7433, + "step": 11034 + }, + { + "epoch": 3.3870472682627377, + "grad_norm": 0.25658491253852844, + "learning_rate": 7.702861955524163e-05, + "loss": 1.8487, + "step": 11035 + }, + { + "epoch": 3.387354205033763, + "grad_norm": 0.2871410548686981, + "learning_rate": 7.702443770208969e-05, + "loss": 1.7919, + "step": 11036 + }, + { + "epoch": 3.3876611418047884, + "grad_norm": 0.3347938060760498, + "learning_rate": 7.702025558186643e-05, + "loss": 1.8091, + "step": 11037 + }, + { + "epoch": 3.3879680785758133, + "grad_norm": 0.39016643166542053, + "learning_rate": 7.701607319461315e-05, + "loss": 1.7816, + "step": 11038 + }, + { + "epoch": 
3.3882750153468386, + "grad_norm": 0.3423028290271759, + "learning_rate": 7.701189054037121e-05, + "loss": 1.8454, + "step": 11039 + }, + { + "epoch": 3.388581952117864, + "grad_norm": 0.27592089772224426, + "learning_rate": 7.700770761918192e-05, + "loss": 1.8431, + "step": 11040 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.46047264337539673, + "learning_rate": 7.700352443108665e-05, + "loss": 1.8412, + "step": 11041 + }, + { + "epoch": 3.389195825659914, + "grad_norm": 0.49226754903793335, + "learning_rate": 7.699934097612673e-05, + "loss": 1.8212, + "step": 11042 + }, + { + "epoch": 3.389502762430939, + "grad_norm": 0.3958778381347656, + "learning_rate": 7.699515725434348e-05, + "loss": 1.747, + "step": 11043 + }, + { + "epoch": 3.3898096992019644, + "grad_norm": 0.26097169518470764, + "learning_rate": 7.699097326577827e-05, + "loss": 1.7631, + "step": 11044 + }, + { + "epoch": 3.3901166359729897, + "grad_norm": 0.2922612130641937, + "learning_rate": 7.698678901047245e-05, + "loss": 1.7891, + "step": 11045 + }, + { + "epoch": 3.3904235727440146, + "grad_norm": 0.4195055365562439, + "learning_rate": 7.698260448846734e-05, + "loss": 1.7765, + "step": 11046 + }, + { + "epoch": 3.39073050951504, + "grad_norm": 0.4572988450527191, + "learning_rate": 7.697841969980434e-05, + "loss": 1.8085, + "step": 11047 + }, + { + "epoch": 3.391037446286065, + "grad_norm": 0.38819587230682373, + "learning_rate": 7.697423464452478e-05, + "loss": 1.8854, + "step": 11048 + }, + { + "epoch": 3.39134438305709, + "grad_norm": 0.27421653270721436, + "learning_rate": 7.697004932267003e-05, + "loss": 1.8327, + "step": 11049 + }, + { + "epoch": 3.3916513198281155, + "grad_norm": 0.33559146523475647, + "learning_rate": 7.696586373428142e-05, + "loss": 1.8109, + "step": 11050 + }, + { + "epoch": 3.3919582565991404, + "grad_norm": 0.39438655972480774, + "learning_rate": 7.696167787940037e-05, + "loss": 1.7909, + "step": 11051 + }, + { + "epoch": 3.3922651933701657, + "grad_norm": 
0.3425842523574829, + "learning_rate": 7.695749175806819e-05, + "loss": 1.8571, + "step": 11052 + }, + { + "epoch": 3.392572130141191, + "grad_norm": 0.2860080301761627, + "learning_rate": 7.695330537032628e-05, + "loss": 1.8546, + "step": 11053 + }, + { + "epoch": 3.392879066912216, + "grad_norm": 0.35894665122032166, + "learning_rate": 7.694911871621601e-05, + "loss": 1.7895, + "step": 11054 + }, + { + "epoch": 3.3931860036832413, + "grad_norm": 0.351193904876709, + "learning_rate": 7.694493179577879e-05, + "loss": 1.7453, + "step": 11055 + }, + { + "epoch": 3.3934929404542666, + "grad_norm": 0.24812865257263184, + "learning_rate": 7.694074460905592e-05, + "loss": 1.8131, + "step": 11056 + }, + { + "epoch": 3.3937998772252915, + "grad_norm": 0.38620972633361816, + "learning_rate": 7.693655715608883e-05, + "loss": 1.8346, + "step": 11057 + }, + { + "epoch": 3.394106813996317, + "grad_norm": 0.5005692839622498, + "learning_rate": 7.69323694369189e-05, + "loss": 1.9031, + "step": 11058 + }, + { + "epoch": 3.3944137507673418, + "grad_norm": 0.4321887791156769, + "learning_rate": 7.692818145158751e-05, + "loss": 1.8783, + "step": 11059 + }, + { + "epoch": 3.394720687538367, + "grad_norm": 0.269307017326355, + "learning_rate": 7.692399320013603e-05, + "loss": 1.8075, + "step": 11060 + }, + { + "epoch": 3.3950276243093924, + "grad_norm": 0.2945556342601776, + "learning_rate": 7.69198046826059e-05, + "loss": 1.8366, + "step": 11061 + }, + { + "epoch": 3.3953345610804173, + "grad_norm": 0.30531853437423706, + "learning_rate": 7.691561589903847e-05, + "loss": 1.7665, + "step": 11062 + }, + { + "epoch": 3.3956414978514426, + "grad_norm": 0.25105199217796326, + "learning_rate": 7.691142684947513e-05, + "loss": 1.782, + "step": 11063 + }, + { + "epoch": 3.3959484346224675, + "grad_norm": 0.3373202085494995, + "learning_rate": 7.69072375339573e-05, + "loss": 1.8148, + "step": 11064 + }, + { + "epoch": 3.396255371393493, + "grad_norm": 0.34207093715667725, + "learning_rate": 
7.690304795252638e-05, + "loss": 1.8287, + "step": 11065 + }, + { + "epoch": 3.396562308164518, + "grad_norm": 0.26281681656837463, + "learning_rate": 7.68988581052238e-05, + "loss": 1.8551, + "step": 11066 + }, + { + "epoch": 3.396869244935543, + "grad_norm": 0.3091152608394623, + "learning_rate": 7.689466799209091e-05, + "loss": 1.7689, + "step": 11067 + }, + { + "epoch": 3.3971761817065684, + "grad_norm": 0.37421298027038574, + "learning_rate": 7.689047761316914e-05, + "loss": 1.7908, + "step": 11068 + }, + { + "epoch": 3.3974831184775938, + "grad_norm": 0.3745511770248413, + "learning_rate": 7.688628696849993e-05, + "loss": 1.8408, + "step": 11069 + }, + { + "epoch": 3.3977900552486187, + "grad_norm": 0.3003663122653961, + "learning_rate": 7.688209605812467e-05, + "loss": 1.9109, + "step": 11070 + }, + { + "epoch": 3.398096992019644, + "grad_norm": 0.3437681496143341, + "learning_rate": 7.687790488208478e-05, + "loss": 1.811, + "step": 11071 + }, + { + "epoch": 3.3984039287906693, + "grad_norm": 0.3480641841888428, + "learning_rate": 7.687371344042168e-05, + "loss": 1.8114, + "step": 11072 + }, + { + "epoch": 3.398710865561694, + "grad_norm": 0.24670913815498352, + "learning_rate": 7.686952173317679e-05, + "loss": 1.7959, + "step": 11073 + }, + { + "epoch": 3.3990178023327196, + "grad_norm": 0.2939499020576477, + "learning_rate": 7.686532976039154e-05, + "loss": 1.7518, + "step": 11074 + }, + { + "epoch": 3.3993247391037444, + "grad_norm": 0.3332279622554779, + "learning_rate": 7.686113752210736e-05, + "loss": 1.843, + "step": 11075 + }, + { + "epoch": 3.3996316758747698, + "grad_norm": 0.22967280447483063, + "learning_rate": 7.685694501836566e-05, + "loss": 1.7408, + "step": 11076 + }, + { + "epoch": 3.399938612645795, + "grad_norm": 0.3443470001220703, + "learning_rate": 7.685275224920789e-05, + "loss": 1.8004, + "step": 11077 + }, + { + "epoch": 3.40024554941682, + "grad_norm": 0.3725457489490509, + "learning_rate": 7.684855921467548e-05, + "loss": 1.833, + 
"step": 11078 + }, + { + "epoch": 3.4005524861878453, + "grad_norm": 0.3178638219833374, + "learning_rate": 7.68443659148099e-05, + "loss": 1.8055, + "step": 11079 + }, + { + "epoch": 3.4008594229588702, + "grad_norm": 0.2609167695045471, + "learning_rate": 7.684017234965254e-05, + "loss": 1.7881, + "step": 11080 + }, + { + "epoch": 3.4011663597298956, + "grad_norm": 0.26975762844085693, + "learning_rate": 7.683597851924486e-05, + "loss": 1.8424, + "step": 11081 + }, + { + "epoch": 3.401473296500921, + "grad_norm": 0.266661673784256, + "learning_rate": 7.683178442362832e-05, + "loss": 1.7785, + "step": 11082 + }, + { + "epoch": 3.401780233271946, + "grad_norm": 0.27915671467781067, + "learning_rate": 7.682759006284436e-05, + "loss": 1.8241, + "step": 11083 + }, + { + "epoch": 3.402087170042971, + "grad_norm": 0.25167274475097656, + "learning_rate": 7.682339543693444e-05, + "loss": 1.7637, + "step": 11084 + }, + { + "epoch": 3.4023941068139965, + "grad_norm": 0.2439529299736023, + "learning_rate": 7.681920054593999e-05, + "loss": 1.7796, + "step": 11085 + }, + { + "epoch": 3.4027010435850213, + "grad_norm": 0.26224252581596375, + "learning_rate": 7.681500538990249e-05, + "loss": 1.8018, + "step": 11086 + }, + { + "epoch": 3.4030079803560467, + "grad_norm": 0.25093868374824524, + "learning_rate": 7.681080996886336e-05, + "loss": 1.7664, + "step": 11087 + }, + { + "epoch": 3.403314917127072, + "grad_norm": 0.26393210887908936, + "learning_rate": 7.680661428286413e-05, + "loss": 1.8389, + "step": 11088 + }, + { + "epoch": 3.403621853898097, + "grad_norm": 0.24750283360481262, + "learning_rate": 7.680241833194622e-05, + "loss": 1.8358, + "step": 11089 + }, + { + "epoch": 3.4039287906691222, + "grad_norm": 0.21568982303142548, + "learning_rate": 7.67982221161511e-05, + "loss": 1.7874, + "step": 11090 + }, + { + "epoch": 3.404235727440147, + "grad_norm": 0.24407126009464264, + "learning_rate": 7.679402563552023e-05, + "loss": 1.7753, + "step": 11091 + }, + { + "epoch": 
3.4045426642111725, + "grad_norm": 0.23288260400295258, + "learning_rate": 7.67898288900951e-05, + "loss": 1.8046, + "step": 11092 + }, + { + "epoch": 3.404849600982198, + "grad_norm": 0.2548544108867645, + "learning_rate": 7.678563187991718e-05, + "loss": 1.8778, + "step": 11093 + }, + { + "epoch": 3.4051565377532227, + "grad_norm": 0.24008090794086456, + "learning_rate": 7.678143460502796e-05, + "loss": 1.7912, + "step": 11094 + }, + { + "epoch": 3.405463474524248, + "grad_norm": 0.26085031032562256, + "learning_rate": 7.677723706546889e-05, + "loss": 1.849, + "step": 11095 + }, + { + "epoch": 3.4057704112952734, + "grad_norm": 0.2830932140350342, + "learning_rate": 7.677303926128147e-05, + "loss": 1.8265, + "step": 11096 + }, + { + "epoch": 3.4060773480662982, + "grad_norm": 0.27593597769737244, + "learning_rate": 7.676884119250718e-05, + "loss": 1.8555, + "step": 11097 + }, + { + "epoch": 3.4063842848373236, + "grad_norm": 0.2403372824192047, + "learning_rate": 7.676464285918751e-05, + "loss": 1.7243, + "step": 11098 + }, + { + "epoch": 3.406691221608349, + "grad_norm": 0.28830090165138245, + "learning_rate": 7.676044426136397e-05, + "loss": 1.8108, + "step": 11099 + }, + { + "epoch": 3.406998158379374, + "grad_norm": 0.2918153405189514, + "learning_rate": 7.675624539907802e-05, + "loss": 1.7875, + "step": 11100 + }, + { + "epoch": 3.407305095150399, + "grad_norm": 0.2609013020992279, + "learning_rate": 7.675204627237117e-05, + "loss": 1.778, + "step": 11101 + }, + { + "epoch": 3.407612031921424, + "grad_norm": 0.2714763283729553, + "learning_rate": 7.674784688128494e-05, + "loss": 1.8472, + "step": 11102 + }, + { + "epoch": 3.4079189686924494, + "grad_norm": 0.25857117772102356, + "learning_rate": 7.674364722586078e-05, + "loss": 1.7495, + "step": 11103 + }, + { + "epoch": 3.4082259054634747, + "grad_norm": 0.25485143065452576, + "learning_rate": 7.673944730614023e-05, + "loss": 1.7817, + "step": 11104 + }, + { + "epoch": 3.4085328422344996, + "grad_norm": 
0.2735857665538788, + "learning_rate": 7.67352471221648e-05, + "loss": 1.7522, + "step": 11105 + }, + { + "epoch": 3.408839779005525, + "grad_norm": 0.25079572200775146, + "learning_rate": 7.6731046673976e-05, + "loss": 1.765, + "step": 11106 + }, + { + "epoch": 3.40914671577655, + "grad_norm": 0.3080148696899414, + "learning_rate": 7.672684596161532e-05, + "loss": 1.8305, + "step": 11107 + }, + { + "epoch": 3.409453652547575, + "grad_norm": 0.23771968483924866, + "learning_rate": 7.672264498512427e-05, + "loss": 1.7837, + "step": 11108 + }, + { + "epoch": 3.4097605893186005, + "grad_norm": 0.29941999912261963, + "learning_rate": 7.671844374454437e-05, + "loss": 1.8013, + "step": 11109 + }, + { + "epoch": 3.4100675260896254, + "grad_norm": 0.27871644496917725, + "learning_rate": 7.671424223991717e-05, + "loss": 1.8598, + "step": 11110 + }, + { + "epoch": 3.4103744628606507, + "grad_norm": 0.2751443684101105, + "learning_rate": 7.671004047128416e-05, + "loss": 1.8341, + "step": 11111 + }, + { + "epoch": 3.410681399631676, + "grad_norm": 0.27227312326431274, + "learning_rate": 7.670583843868688e-05, + "loss": 1.81, + "step": 11112 + }, + { + "epoch": 3.410988336402701, + "grad_norm": 0.29617756605148315, + "learning_rate": 7.670163614216685e-05, + "loss": 1.8795, + "step": 11113 + }, + { + "epoch": 3.4112952731737263, + "grad_norm": 0.268920361995697, + "learning_rate": 7.669743358176563e-05, + "loss": 1.7659, + "step": 11114 + }, + { + "epoch": 3.4116022099447516, + "grad_norm": 0.2875109314918518, + "learning_rate": 7.669323075752467e-05, + "loss": 1.8263, + "step": 11115 + }, + { + "epoch": 3.4119091467157765, + "grad_norm": 0.34703585505485535, + "learning_rate": 7.668902766948558e-05, + "loss": 1.7622, + "step": 11116 + }, + { + "epoch": 3.412216083486802, + "grad_norm": 0.3090265393257141, + "learning_rate": 7.668482431768989e-05, + "loss": 1.7381, + "step": 11117 + }, + { + "epoch": 3.4125230202578267, + "grad_norm": 0.2619737684726715, + "learning_rate": 
7.668062070217911e-05, + "loss": 1.8004, + "step": 11118 + }, + { + "epoch": 3.412829957028852, + "grad_norm": 0.289815217256546, + "learning_rate": 7.667641682299482e-05, + "loss": 1.7946, + "step": 11119 + }, + { + "epoch": 3.4131368937998774, + "grad_norm": 0.28732073307037354, + "learning_rate": 7.667221268017852e-05, + "loss": 1.8746, + "step": 11120 + }, + { + "epoch": 3.4134438305709023, + "grad_norm": 0.23232576251029968, + "learning_rate": 7.666800827377178e-05, + "loss": 1.7403, + "step": 11121 + }, + { + "epoch": 3.4137507673419276, + "grad_norm": 0.22903507947921753, + "learning_rate": 7.666380360381616e-05, + "loss": 1.7785, + "step": 11122 + }, + { + "epoch": 3.4140577041129525, + "grad_norm": 0.25023025274276733, + "learning_rate": 7.665959867035321e-05, + "loss": 1.7881, + "step": 11123 + }, + { + "epoch": 3.414364640883978, + "grad_norm": 0.2199166864156723, + "learning_rate": 7.665539347342449e-05, + "loss": 1.7522, + "step": 11124 + }, + { + "epoch": 3.414671577655003, + "grad_norm": 0.2539862394332886, + "learning_rate": 7.665118801307152e-05, + "loss": 1.7964, + "step": 11125 + }, + { + "epoch": 3.414978514426028, + "grad_norm": 0.22670161724090576, + "learning_rate": 7.664698228933591e-05, + "loss": 1.7071, + "step": 11126 + }, + { + "epoch": 3.4152854511970534, + "grad_norm": 0.24827396869659424, + "learning_rate": 7.664277630225919e-05, + "loss": 1.7897, + "step": 11127 + }, + { + "epoch": 3.4155923879680787, + "grad_norm": 0.29391366243362427, + "learning_rate": 7.663857005188296e-05, + "loss": 1.7967, + "step": 11128 + }, + { + "epoch": 3.4158993247391036, + "grad_norm": 0.3201812505722046, + "learning_rate": 7.663436353824874e-05, + "loss": 1.7681, + "step": 11129 + }, + { + "epoch": 3.416206261510129, + "grad_norm": 0.2274552583694458, + "learning_rate": 7.663015676139814e-05, + "loss": 1.7535, + "step": 11130 + }, + { + "epoch": 3.4165131982811543, + "grad_norm": 0.3955044150352478, + "learning_rate": 7.662594972137273e-05, + "loss": 
1.8175, + "step": 11131 + }, + { + "epoch": 3.416820135052179, + "grad_norm": 0.46493569016456604, + "learning_rate": 7.662174241821406e-05, + "loss": 1.7806, + "step": 11132 + }, + { + "epoch": 3.4171270718232045, + "grad_norm": 0.37731611728668213, + "learning_rate": 7.661753485196375e-05, + "loss": 1.7555, + "step": 11133 + }, + { + "epoch": 3.4174340085942294, + "grad_norm": 0.23983556032180786, + "learning_rate": 7.661332702266334e-05, + "loss": 1.7662, + "step": 11134 + }, + { + "epoch": 3.4177409453652547, + "grad_norm": 0.34964314103126526, + "learning_rate": 7.660911893035445e-05, + "loss": 1.7786, + "step": 11135 + }, + { + "epoch": 3.41804788213628, + "grad_norm": 0.44820764660835266, + "learning_rate": 7.660491057507864e-05, + "loss": 1.778, + "step": 11136 + }, + { + "epoch": 3.418354818907305, + "grad_norm": 0.32936233282089233, + "learning_rate": 7.660070195687752e-05, + "loss": 1.8181, + "step": 11137 + }, + { + "epoch": 3.4186617556783303, + "grad_norm": 0.2874850332736969, + "learning_rate": 7.659649307579266e-05, + "loss": 1.8733, + "step": 11138 + }, + { + "epoch": 3.418968692449355, + "grad_norm": 0.46269866824150085, + "learning_rate": 7.659228393186566e-05, + "loss": 1.8566, + "step": 11139 + }, + { + "epoch": 3.4192756292203805, + "grad_norm": 0.5873839855194092, + "learning_rate": 7.658807452513816e-05, + "loss": 1.8317, + "step": 11140 + }, + { + "epoch": 3.419582565991406, + "grad_norm": 0.43150341510772705, + "learning_rate": 7.65838648556517e-05, + "loss": 1.7702, + "step": 11141 + }, + { + "epoch": 3.4198895027624308, + "grad_norm": 0.2803891599178314, + "learning_rate": 7.65796549234479e-05, + "loss": 1.8043, + "step": 11142 + }, + { + "epoch": 3.420196439533456, + "grad_norm": 0.37295013666152954, + "learning_rate": 7.657544472856838e-05, + "loss": 1.7923, + "step": 11143 + }, + { + "epoch": 3.4205033763044814, + "grad_norm": 0.3922573924064636, + "learning_rate": 7.657123427105473e-05, + "loss": 1.8231, + "step": 11144 + }, + { + 
"epoch": 3.4208103130755063, + "grad_norm": 0.27254152297973633, + "learning_rate": 7.656702355094859e-05, + "loss": 1.8168, + "step": 11145 + }, + { + "epoch": 3.4211172498465316, + "grad_norm": 0.28005337715148926, + "learning_rate": 7.656281256829152e-05, + "loss": 1.8047, + "step": 11146 + }, + { + "epoch": 3.421424186617557, + "grad_norm": 0.4369073808193207, + "learning_rate": 7.655860132312519e-05, + "loss": 1.7243, + "step": 11147 + }, + { + "epoch": 3.421731123388582, + "grad_norm": 0.4127553701400757, + "learning_rate": 7.655438981549119e-05, + "loss": 1.8148, + "step": 11148 + }, + { + "epoch": 3.422038060159607, + "grad_norm": 0.3131798207759857, + "learning_rate": 7.655017804543114e-05, + "loss": 1.789, + "step": 11149 + }, + { + "epoch": 3.422344996930632, + "grad_norm": 0.2947194576263428, + "learning_rate": 7.654596601298666e-05, + "loss": 1.8221, + "step": 11150 + }, + { + "epoch": 3.4226519337016574, + "grad_norm": 0.3072497546672821, + "learning_rate": 7.654175371819941e-05, + "loss": 1.7747, + "step": 11151 + }, + { + "epoch": 3.4229588704726828, + "grad_norm": 0.29408320784568787, + "learning_rate": 7.653754116111099e-05, + "loss": 1.9009, + "step": 11152 + }, + { + "epoch": 3.4232658072437077, + "grad_norm": 0.2629215717315674, + "learning_rate": 7.653332834176303e-05, + "loss": 1.7354, + "step": 11153 + }, + { + "epoch": 3.423572744014733, + "grad_norm": 0.2850257456302643, + "learning_rate": 7.652911526019716e-05, + "loss": 1.8422, + "step": 11154 + }, + { + "epoch": 3.423879680785758, + "grad_norm": 0.29787111282348633, + "learning_rate": 7.652490191645503e-05, + "loss": 1.8122, + "step": 11155 + }, + { + "epoch": 3.424186617556783, + "grad_norm": 0.2670947015285492, + "learning_rate": 7.652068831057826e-05, + "loss": 1.7734, + "step": 11156 + }, + { + "epoch": 3.4244935543278086, + "grad_norm": 0.26415133476257324, + "learning_rate": 7.651647444260853e-05, + "loss": 1.7661, + "step": 11157 + }, + { + "epoch": 3.424800491098834, + 
"grad_norm": 0.2614886164665222, + "learning_rate": 7.651226031258745e-05, + "loss": 1.6918, + "step": 11158 + }, + { + "epoch": 3.425107427869859, + "grad_norm": 0.28485649824142456, + "learning_rate": 7.650804592055667e-05, + "loss": 1.7771, + "step": 11159 + }, + { + "epoch": 3.425414364640884, + "grad_norm": 0.26080289483070374, + "learning_rate": 7.650383126655784e-05, + "loss": 1.7637, + "step": 11160 + }, + { + "epoch": 3.425721301411909, + "grad_norm": 0.2503695487976074, + "learning_rate": 7.649961635063261e-05, + "loss": 1.7864, + "step": 11161 + }, + { + "epoch": 3.4260282381829343, + "grad_norm": 0.3165570795536041, + "learning_rate": 7.649540117282263e-05, + "loss": 1.8107, + "step": 11162 + }, + { + "epoch": 3.4263351749539597, + "grad_norm": 0.28411731123924255, + "learning_rate": 7.649118573316959e-05, + "loss": 1.7557, + "step": 11163 + }, + { + "epoch": 3.4266421117249846, + "grad_norm": 0.24469570815563202, + "learning_rate": 7.648697003171512e-05, + "loss": 1.7597, + "step": 11164 + }, + { + "epoch": 3.42694904849601, + "grad_norm": 0.31968292593955994, + "learning_rate": 7.648275406850087e-05, + "loss": 1.7796, + "step": 11165 + }, + { + "epoch": 3.427255985267035, + "grad_norm": 0.24520765244960785, + "learning_rate": 7.647853784356856e-05, + "loss": 1.7931, + "step": 11166 + }, + { + "epoch": 3.42756292203806, + "grad_norm": 0.23946821689605713, + "learning_rate": 7.647432135695977e-05, + "loss": 1.7143, + "step": 11167 + }, + { + "epoch": 3.4278698588090855, + "grad_norm": 0.321455180644989, + "learning_rate": 7.647010460871624e-05, + "loss": 1.8682, + "step": 11168 + }, + { + "epoch": 3.4281767955801103, + "grad_norm": 0.2803197503089905, + "learning_rate": 7.646588759887964e-05, + "loss": 1.8, + "step": 11169 + }, + { + "epoch": 3.4284837323511357, + "grad_norm": 0.2597559988498688, + "learning_rate": 7.64616703274916e-05, + "loss": 1.8027, + "step": 11170 + }, + { + "epoch": 3.428790669122161, + "grad_norm": 0.25055503845214844, + 
"learning_rate": 7.645745279459384e-05, + "loss": 1.7659, + "step": 11171 + }, + { + "epoch": 3.429097605893186, + "grad_norm": 0.34582629799842834, + "learning_rate": 7.645323500022803e-05, + "loss": 1.7868, + "step": 11172 + }, + { + "epoch": 3.4294045426642112, + "grad_norm": 0.32845041155815125, + "learning_rate": 7.644901694443584e-05, + "loss": 1.8247, + "step": 11173 + }, + { + "epoch": 3.4297114794352366, + "grad_norm": 0.2570398449897766, + "learning_rate": 7.644479862725896e-05, + "loss": 1.7802, + "step": 11174 + }, + { + "epoch": 3.4300184162062615, + "grad_norm": 0.23117294907569885, + "learning_rate": 7.644058004873908e-05, + "loss": 1.7575, + "step": 11175 + }, + { + "epoch": 3.430325352977287, + "grad_norm": 0.2417830377817154, + "learning_rate": 7.64363612089179e-05, + "loss": 1.7954, + "step": 11176 + }, + { + "epoch": 3.4306322897483117, + "grad_norm": 0.249378964304924, + "learning_rate": 7.643214210783708e-05, + "loss": 1.8161, + "step": 11177 + }, + { + "epoch": 3.430939226519337, + "grad_norm": 0.24494746327400208, + "learning_rate": 7.642792274553836e-05, + "loss": 1.825, + "step": 11178 + }, + { + "epoch": 3.4312461632903624, + "grad_norm": 0.2663760185241699, + "learning_rate": 7.642370312206342e-05, + "loss": 1.7589, + "step": 11179 + }, + { + "epoch": 3.4315531000613873, + "grad_norm": 0.2819322645664215, + "learning_rate": 7.641948323745395e-05, + "loss": 1.8097, + "step": 11180 + }, + { + "epoch": 3.4318600368324126, + "grad_norm": 0.26917630434036255, + "learning_rate": 7.641526309175166e-05, + "loss": 1.7934, + "step": 11181 + }, + { + "epoch": 3.4321669736034375, + "grad_norm": 0.31618112325668335, + "learning_rate": 7.641104268499826e-05, + "loss": 1.8522, + "step": 11182 + }, + { + "epoch": 3.432473910374463, + "grad_norm": 0.29209139943122864, + "learning_rate": 7.640682201723546e-05, + "loss": 1.7499, + "step": 11183 + }, + { + "epoch": 3.432780847145488, + "grad_norm": 0.24831914901733398, + "learning_rate": 
7.640260108850496e-05, + "loss": 1.7897, + "step": 11184 + }, + { + "epoch": 3.433087783916513, + "grad_norm": 0.2459818720817566, + "learning_rate": 7.639837989884849e-05, + "loss": 1.7604, + "step": 11185 + }, + { + "epoch": 3.4333947206875384, + "grad_norm": 0.27157485485076904, + "learning_rate": 7.639415844830774e-05, + "loss": 1.7776, + "step": 11186 + }, + { + "epoch": 3.4337016574585637, + "grad_norm": 0.3021515905857086, + "learning_rate": 7.638993673692445e-05, + "loss": 1.7771, + "step": 11187 + }, + { + "epoch": 3.4340085942295886, + "grad_norm": 0.2591722309589386, + "learning_rate": 7.638571476474036e-05, + "loss": 1.8333, + "step": 11188 + }, + { + "epoch": 3.434315531000614, + "grad_norm": 0.2255258709192276, + "learning_rate": 7.638149253179717e-05, + "loss": 1.7647, + "step": 11189 + }, + { + "epoch": 3.4346224677716393, + "grad_norm": 0.2585793733596802, + "learning_rate": 7.637727003813658e-05, + "loss": 1.786, + "step": 11190 + }, + { + "epoch": 3.434929404542664, + "grad_norm": 0.23649543523788452, + "learning_rate": 7.637304728380036e-05, + "loss": 1.822, + "step": 11191 + }, + { + "epoch": 3.4352363413136895, + "grad_norm": 0.2610832452774048, + "learning_rate": 7.636882426883023e-05, + "loss": 1.7925, + "step": 11192 + }, + { + "epoch": 3.4355432780847144, + "grad_norm": 0.26230642199516296, + "learning_rate": 7.636460099326793e-05, + "loss": 1.8169, + "step": 11193 + }, + { + "epoch": 3.4358502148557397, + "grad_norm": 0.2800561189651489, + "learning_rate": 7.636037745715518e-05, + "loss": 1.845, + "step": 11194 + }, + { + "epoch": 3.436157151626765, + "grad_norm": 0.27790409326553345, + "learning_rate": 7.635615366053372e-05, + "loss": 1.8141, + "step": 11195 + }, + { + "epoch": 3.43646408839779, + "grad_norm": 0.2894865870475769, + "learning_rate": 7.635192960344533e-05, + "loss": 1.7916, + "step": 11196 + }, + { + "epoch": 3.4367710251688153, + "grad_norm": 0.22310738265514374, + "learning_rate": 7.634770528593171e-05, + "loss": 1.79, + 
"step": 11197 + }, + { + "epoch": 3.43707796193984, + "grad_norm": 0.2837755084037781, + "learning_rate": 7.634348070803463e-05, + "loss": 1.8763, + "step": 11198 + }, + { + "epoch": 3.4373848987108655, + "grad_norm": 0.32488104701042175, + "learning_rate": 7.633925586979583e-05, + "loss": 1.8331, + "step": 11199 + }, + { + "epoch": 3.437691835481891, + "grad_norm": 0.2708779573440552, + "learning_rate": 7.633503077125706e-05, + "loss": 1.761, + "step": 11200 + }, + { + "epoch": 3.4379987722529157, + "grad_norm": 0.23929642140865326, + "learning_rate": 7.633080541246008e-05, + "loss": 1.8217, + "step": 11201 + }, + { + "epoch": 3.438305709023941, + "grad_norm": 0.3213331997394562, + "learning_rate": 7.632657979344667e-05, + "loss": 1.8375, + "step": 11202 + }, + { + "epoch": 3.4386126457949664, + "grad_norm": 0.38420629501342773, + "learning_rate": 7.632235391425854e-05, + "loss": 1.765, + "step": 11203 + }, + { + "epoch": 3.4389195825659913, + "grad_norm": 0.40466073155403137, + "learning_rate": 7.631812777493749e-05, + "loss": 1.8262, + "step": 11204 + }, + { + "epoch": 3.4392265193370166, + "grad_norm": 0.35904639959335327, + "learning_rate": 7.631390137552527e-05, + "loss": 1.894, + "step": 11205 + }, + { + "epoch": 3.439533456108042, + "grad_norm": 0.28880515694618225, + "learning_rate": 7.630967471606368e-05, + "loss": 1.87, + "step": 11206 + }, + { + "epoch": 3.439840392879067, + "grad_norm": 0.2878882884979248, + "learning_rate": 7.630544779659444e-05, + "loss": 1.7841, + "step": 11207 + }, + { + "epoch": 3.440147329650092, + "grad_norm": 0.36002418398857117, + "learning_rate": 7.630122061715935e-05, + "loss": 1.7318, + "step": 11208 + }, + { + "epoch": 3.440454266421117, + "grad_norm": 0.3304644227027893, + "learning_rate": 7.629699317780019e-05, + "loss": 1.8581, + "step": 11209 + }, + { + "epoch": 3.4407612031921424, + "grad_norm": 0.23396331071853638, + "learning_rate": 7.629276547855872e-05, + "loss": 1.7897, + "step": 11210 + }, + { + "epoch": 
3.4410681399631677, + "grad_norm": 0.34914183616638184, + "learning_rate": 7.628853751947674e-05, + "loss": 1.8531, + "step": 11211 + }, + { + "epoch": 3.4413750767341926, + "grad_norm": 0.3700502812862396, + "learning_rate": 7.6284309300596e-05, + "loss": 1.7884, + "step": 11212 + }, + { + "epoch": 3.441682013505218, + "grad_norm": 0.24606801569461823, + "learning_rate": 7.628008082195835e-05, + "loss": 1.7292, + "step": 11213 + }, + { + "epoch": 3.441988950276243, + "grad_norm": 0.26344993710517883, + "learning_rate": 7.627585208360551e-05, + "loss": 1.7832, + "step": 11214 + }, + { + "epoch": 3.442295887047268, + "grad_norm": 0.4034743010997772, + "learning_rate": 7.62716230855793e-05, + "loss": 1.8164, + "step": 11215 + }, + { + "epoch": 3.4426028238182935, + "grad_norm": 0.4508039355278015, + "learning_rate": 7.626739382792152e-05, + "loss": 1.7855, + "step": 11216 + }, + { + "epoch": 3.4429097605893184, + "grad_norm": 0.2963111400604248, + "learning_rate": 7.626316431067395e-05, + "loss": 1.7995, + "step": 11217 + }, + { + "epoch": 3.4432166973603437, + "grad_norm": 0.35248515009880066, + "learning_rate": 7.625893453387841e-05, + "loss": 1.8761, + "step": 11218 + }, + { + "epoch": 3.443523634131369, + "grad_norm": 0.4032224416732788, + "learning_rate": 7.625470449757668e-05, + "loss": 1.7746, + "step": 11219 + }, + { + "epoch": 3.443830570902394, + "grad_norm": 0.3505195081233978, + "learning_rate": 7.625047420181057e-05, + "loss": 1.851, + "step": 11220 + }, + { + "epoch": 3.4441375076734193, + "grad_norm": 0.288968563079834, + "learning_rate": 7.62462436466219e-05, + "loss": 1.8055, + "step": 11221 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.43141910433769226, + "learning_rate": 7.624201283205246e-05, + "loss": 1.816, + "step": 11222 + }, + { + "epoch": 3.4447513812154695, + "grad_norm": 0.46902137994766235, + "learning_rate": 7.623778175814407e-05, + "loss": 1.8478, + "step": 11223 + }, + { + "epoch": 3.445058317986495, + "grad_norm": 
0.3333328366279602, + "learning_rate": 7.623355042493854e-05, + "loss": 1.7949, + "step": 11224 + }, + { + "epoch": 3.4453652547575198, + "grad_norm": 0.2625340521335602, + "learning_rate": 7.622931883247768e-05, + "loss": 1.745, + "step": 11225 + }, + { + "epoch": 3.445672191528545, + "grad_norm": 0.4565848410129547, + "learning_rate": 7.622508698080333e-05, + "loss": 1.796, + "step": 11226 + }, + { + "epoch": 3.4459791282995704, + "grad_norm": 0.4676518738269806, + "learning_rate": 7.622085486995729e-05, + "loss": 1.8115, + "step": 11227 + }, + { + "epoch": 3.4462860650705953, + "grad_norm": 0.3828938603401184, + "learning_rate": 7.62166224999814e-05, + "loss": 1.8758, + "step": 11228 + }, + { + "epoch": 3.4465930018416207, + "grad_norm": 0.2786383628845215, + "learning_rate": 7.621238987091747e-05, + "loss": 1.7616, + "step": 11229 + }, + { + "epoch": 3.446899938612646, + "grad_norm": 0.4442835748195648, + "learning_rate": 7.620815698280734e-05, + "loss": 1.8342, + "step": 11230 + }, + { + "epoch": 3.447206875383671, + "grad_norm": 0.45760586857795715, + "learning_rate": 7.620392383569286e-05, + "loss": 1.8159, + "step": 11231 + }, + { + "epoch": 3.447513812154696, + "grad_norm": 0.2567009925842285, + "learning_rate": 7.619969042961583e-05, + "loss": 1.774, + "step": 11232 + }, + { + "epoch": 3.4478207489257215, + "grad_norm": 0.3720102310180664, + "learning_rate": 7.619545676461812e-05, + "loss": 1.8366, + "step": 11233 + }, + { + "epoch": 3.4481276856967464, + "grad_norm": 0.36436137557029724, + "learning_rate": 7.619122284074154e-05, + "loss": 1.832, + "step": 11234 + }, + { + "epoch": 3.4484346224677718, + "grad_norm": 0.310310959815979, + "learning_rate": 7.618698865802795e-05, + "loss": 1.9023, + "step": 11235 + }, + { + "epoch": 3.4487415592387967, + "grad_norm": 0.2693026661872864, + "learning_rate": 7.618275421651916e-05, + "loss": 1.7696, + "step": 11236 + }, + { + "epoch": 3.449048496009822, + "grad_norm": 0.2942425608634949, + "learning_rate": 
7.61785195162571e-05, + "loss": 1.822, + "step": 11237 + }, + { + "epoch": 3.4493554327808473, + "grad_norm": 0.22454749047756195, + "learning_rate": 7.617428455728353e-05, + "loss": 1.7011, + "step": 11238 + }, + { + "epoch": 3.449662369551872, + "grad_norm": 0.23345038294792175, + "learning_rate": 7.617004933964035e-05, + "loss": 1.7563, + "step": 11239 + }, + { + "epoch": 3.4499693063228976, + "grad_norm": 0.24990662932395935, + "learning_rate": 7.616581386336941e-05, + "loss": 1.8031, + "step": 11240 + }, + { + "epoch": 3.4502762430939224, + "grad_norm": 0.2919348478317261, + "learning_rate": 7.616157812851254e-05, + "loss": 1.7355, + "step": 11241 + }, + { + "epoch": 3.450583179864948, + "grad_norm": 0.2926909327507019, + "learning_rate": 7.615734213511165e-05, + "loss": 1.8341, + "step": 11242 + }, + { + "epoch": 3.450890116635973, + "grad_norm": 0.24316683411598206, + "learning_rate": 7.615310588320855e-05, + "loss": 1.8154, + "step": 11243 + }, + { + "epoch": 3.451197053406998, + "grad_norm": 0.23154498636722565, + "learning_rate": 7.614886937284513e-05, + "loss": 1.7904, + "step": 11244 + }, + { + "epoch": 3.4515039901780233, + "grad_norm": 0.25973939895629883, + "learning_rate": 7.614463260406327e-05, + "loss": 1.7598, + "step": 11245 + }, + { + "epoch": 3.4518109269490487, + "grad_norm": 0.22110119462013245, + "learning_rate": 7.614039557690482e-05, + "loss": 1.7903, + "step": 11246 + }, + { + "epoch": 3.4521178637200736, + "grad_norm": 0.26184993982315063, + "learning_rate": 7.613615829141165e-05, + "loss": 1.748, + "step": 11247 + }, + { + "epoch": 3.452424800491099, + "grad_norm": 0.26128727197647095, + "learning_rate": 7.613192074762565e-05, + "loss": 1.7786, + "step": 11248 + }, + { + "epoch": 3.4527317372621242, + "grad_norm": 0.23230813443660736, + "learning_rate": 7.612768294558871e-05, + "loss": 1.8114, + "step": 11249 + }, + { + "epoch": 3.453038674033149, + "grad_norm": 0.2686540186405182, + "learning_rate": 7.612344488534268e-05, + "loss": 
1.7311, + "step": 11250 + }, + { + "epoch": 3.4533456108041745, + "grad_norm": 0.25553348660469055, + "learning_rate": 7.611920656692946e-05, + "loss": 1.8468, + "step": 11251 + }, + { + "epoch": 3.4536525475751993, + "grad_norm": 0.2639308273792267, + "learning_rate": 7.611496799039092e-05, + "loss": 1.8292, + "step": 11252 + }, + { + "epoch": 3.4539594843462247, + "grad_norm": 0.2468358874320984, + "learning_rate": 7.611072915576895e-05, + "loss": 1.8173, + "step": 11253 + }, + { + "epoch": 3.45426642111725, + "grad_norm": 0.27236035466194153, + "learning_rate": 7.610649006310549e-05, + "loss": 1.8082, + "step": 11254 + }, + { + "epoch": 3.454573357888275, + "grad_norm": 0.2277914434671402, + "learning_rate": 7.610225071244237e-05, + "loss": 1.7483, + "step": 11255 + }, + { + "epoch": 3.4548802946593002, + "grad_norm": 0.2292868196964264, + "learning_rate": 7.60980111038215e-05, + "loss": 1.7716, + "step": 11256 + }, + { + "epoch": 3.455187231430325, + "grad_norm": 0.22116152942180634, + "learning_rate": 7.60937712372848e-05, + "loss": 1.773, + "step": 11257 + }, + { + "epoch": 3.4554941682013505, + "grad_norm": 0.23238304257392883, + "learning_rate": 7.608953111287416e-05, + "loss": 1.7602, + "step": 11258 + }, + { + "epoch": 3.455801104972376, + "grad_norm": 0.2810615003108978, + "learning_rate": 7.608529073063149e-05, + "loss": 1.8781, + "step": 11259 + }, + { + "epoch": 3.4561080417434007, + "grad_norm": 0.2516821324825287, + "learning_rate": 7.608105009059867e-05, + "loss": 1.835, + "step": 11260 + }, + { + "epoch": 3.456414978514426, + "grad_norm": 0.25698330998420715, + "learning_rate": 7.607680919281763e-05, + "loss": 1.7859, + "step": 11261 + }, + { + "epoch": 3.4567219152854514, + "grad_norm": 0.2597602903842926, + "learning_rate": 7.60725680373303e-05, + "loss": 1.8287, + "step": 11262 + }, + { + "epoch": 3.4570288520564763, + "grad_norm": 0.2564091980457306, + "learning_rate": 7.606832662417855e-05, + "loss": 1.8003, + "step": 11263 + }, + { + 
"epoch": 3.4573357888275016, + "grad_norm": 0.2872684597969055, + "learning_rate": 7.606408495340432e-05, + "loss": 1.8242, + "step": 11264 + }, + { + "epoch": 3.457642725598527, + "grad_norm": 0.27513590455055237, + "learning_rate": 7.605984302504952e-05, + "loss": 1.8605, + "step": 11265 + }, + { + "epoch": 3.457949662369552, + "grad_norm": 0.27768459916114807, + "learning_rate": 7.605560083915609e-05, + "loss": 1.7948, + "step": 11266 + }, + { + "epoch": 3.458256599140577, + "grad_norm": 0.23911382257938385, + "learning_rate": 7.605135839576593e-05, + "loss": 1.7575, + "step": 11267 + }, + { + "epoch": 3.458563535911602, + "grad_norm": 0.26773568987846375, + "learning_rate": 7.604711569492098e-05, + "loss": 1.752, + "step": 11268 + }, + { + "epoch": 3.4588704726826274, + "grad_norm": 0.30079394578933716, + "learning_rate": 7.604287273666316e-05, + "loss": 1.8022, + "step": 11269 + }, + { + "epoch": 3.4591774094536527, + "grad_norm": 0.27393853664398193, + "learning_rate": 7.603862952103441e-05, + "loss": 1.8054, + "step": 11270 + }, + { + "epoch": 3.4594843462246776, + "grad_norm": 0.2794870436191559, + "learning_rate": 7.603438604807667e-05, + "loss": 1.808, + "step": 11271 + }, + { + "epoch": 3.459791282995703, + "grad_norm": 0.26482146978378296, + "learning_rate": 7.603014231783185e-05, + "loss": 1.8696, + "step": 11272 + }, + { + "epoch": 3.460098219766728, + "grad_norm": 0.2755354344844818, + "learning_rate": 7.602589833034192e-05, + "loss": 1.8412, + "step": 11273 + }, + { + "epoch": 3.460405156537753, + "grad_norm": 0.2666642367839813, + "learning_rate": 7.602165408564883e-05, + "loss": 1.8333, + "step": 11274 + }, + { + "epoch": 3.4607120933087785, + "grad_norm": 0.26958519220352173, + "learning_rate": 7.601740958379448e-05, + "loss": 1.7943, + "step": 11275 + }, + { + "epoch": 3.4610190300798034, + "grad_norm": 0.2915789783000946, + "learning_rate": 7.601316482482084e-05, + "loss": 1.7519, + "step": 11276 + }, + { + "epoch": 3.4613259668508287, + 
"grad_norm": 0.2456950694322586, + "learning_rate": 7.600891980876985e-05, + "loss": 1.8064, + "step": 11277 + }, + { + "epoch": 3.461632903621854, + "grad_norm": 0.2517867088317871, + "learning_rate": 7.600467453568348e-05, + "loss": 1.7766, + "step": 11278 + }, + { + "epoch": 3.461939840392879, + "grad_norm": 0.24567969143390656, + "learning_rate": 7.600042900560368e-05, + "loss": 1.7331, + "step": 11279 + }, + { + "epoch": 3.4622467771639043, + "grad_norm": 0.23986820876598358, + "learning_rate": 7.599618321857239e-05, + "loss": 1.7477, + "step": 11280 + }, + { + "epoch": 3.4625537139349296, + "grad_norm": 0.2555375397205353, + "learning_rate": 7.599193717463158e-05, + "loss": 1.8154, + "step": 11281 + }, + { + "epoch": 3.4628606507059545, + "grad_norm": 0.2522781193256378, + "learning_rate": 7.598769087382323e-05, + "loss": 1.7821, + "step": 11282 + }, + { + "epoch": 3.46316758747698, + "grad_norm": 0.25631004571914673, + "learning_rate": 7.598344431618926e-05, + "loss": 1.8043, + "step": 11283 + }, + { + "epoch": 3.4634745242480047, + "grad_norm": 0.2611328661441803, + "learning_rate": 7.597919750177168e-05, + "loss": 1.8036, + "step": 11284 + }, + { + "epoch": 3.46378146101903, + "grad_norm": 0.255670428276062, + "learning_rate": 7.597495043061244e-05, + "loss": 1.7375, + "step": 11285 + }, + { + "epoch": 3.4640883977900554, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.597070310275353e-05, + "loss": 1.7496, + "step": 11286 + }, + { + "epoch": 3.4643953345610803, + "grad_norm": 0.2643752992153168, + "learning_rate": 7.596645551823688e-05, + "loss": 1.8444, + "step": 11287 + }, + { + "epoch": 3.4647022713321056, + "grad_norm": 0.2564511299133301, + "learning_rate": 7.596220767710452e-05, + "loss": 1.7557, + "step": 11288 + }, + { + "epoch": 3.4650092081031305, + "grad_norm": 0.2510208487510681, + "learning_rate": 7.59579595793984e-05, + "loss": 1.7234, + "step": 11289 + }, + { + "epoch": 3.465316144874156, + "grad_norm": 0.2765158712863922, + 
"learning_rate": 7.595371122516051e-05, + "loss": 1.8215, + "step": 11290 + }, + { + "epoch": 3.465623081645181, + "grad_norm": 0.28233039379119873, + "learning_rate": 7.594946261443286e-05, + "loss": 1.7752, + "step": 11291 + }, + { + "epoch": 3.465930018416206, + "grad_norm": 0.26971468329429626, + "learning_rate": 7.594521374725735e-05, + "loss": 1.7924, + "step": 11292 + }, + { + "epoch": 3.4662369551872314, + "grad_norm": 0.29425930976867676, + "learning_rate": 7.594096462367608e-05, + "loss": 1.8144, + "step": 11293 + }, + { + "epoch": 3.4665438919582567, + "grad_norm": 0.233150452375412, + "learning_rate": 7.593671524373098e-05, + "loss": 1.7741, + "step": 11294 + }, + { + "epoch": 3.4668508287292816, + "grad_norm": 0.2947762608528137, + "learning_rate": 7.593246560746406e-05, + "loss": 1.8031, + "step": 11295 + }, + { + "epoch": 3.467157765500307, + "grad_norm": 0.250552773475647, + "learning_rate": 7.59282157149173e-05, + "loss": 1.7501, + "step": 11296 + }, + { + "epoch": 3.4674647022713323, + "grad_norm": 0.26091331243515015, + "learning_rate": 7.592396556613274e-05, + "loss": 1.836, + "step": 11297 + }, + { + "epoch": 3.467771639042357, + "grad_norm": 0.28625619411468506, + "learning_rate": 7.591971516115233e-05, + "loss": 1.7555, + "step": 11298 + }, + { + "epoch": 3.4680785758133825, + "grad_norm": 0.2723398804664612, + "learning_rate": 7.591546450001811e-05, + "loss": 1.825, + "step": 11299 + }, + { + "epoch": 3.4683855125844074, + "grad_norm": 0.24289946258068085, + "learning_rate": 7.591121358277211e-05, + "loss": 1.7441, + "step": 11300 + }, + { + "epoch": 3.4686924493554327, + "grad_norm": 0.2706952691078186, + "learning_rate": 7.590696240945629e-05, + "loss": 1.8651, + "step": 11301 + }, + { + "epoch": 3.468999386126458, + "grad_norm": 0.24632862210273743, + "learning_rate": 7.590271098011268e-05, + "loss": 1.8229, + "step": 11302 + }, + { + "epoch": 3.469306322897483, + "grad_norm": 0.29275211691856384, + "learning_rate": 7.58984592947833e-05, 
+ "loss": 1.7591, + "step": 11303 + }, + { + "epoch": 3.4696132596685083, + "grad_norm": 0.29228144884109497, + "learning_rate": 7.589420735351016e-05, + "loss": 1.8395, + "step": 11304 + }, + { + "epoch": 3.4699201964395336, + "grad_norm": 0.28339114785194397, + "learning_rate": 7.588995515633528e-05, + "loss": 1.8543, + "step": 11305 + }, + { + "epoch": 3.4702271332105585, + "grad_norm": 0.2834693193435669, + "learning_rate": 7.588570270330071e-05, + "loss": 1.826, + "step": 11306 + }, + { + "epoch": 3.470534069981584, + "grad_norm": 0.26130759716033936, + "learning_rate": 7.588144999444844e-05, + "loss": 1.7887, + "step": 11307 + }, + { + "epoch": 3.470841006752609, + "grad_norm": 0.29554685950279236, + "learning_rate": 7.587719702982052e-05, + "loss": 1.819, + "step": 11308 + }, + { + "epoch": 3.471147943523634, + "grad_norm": 0.2687968611717224, + "learning_rate": 7.587294380945898e-05, + "loss": 1.7354, + "step": 11309 + }, + { + "epoch": 3.4714548802946594, + "grad_norm": 0.28795287013053894, + "learning_rate": 7.586869033340582e-05, + "loss": 1.8267, + "step": 11310 + }, + { + "epoch": 3.4717618170656843, + "grad_norm": 0.33244553208351135, + "learning_rate": 7.58644366017031e-05, + "loss": 1.86, + "step": 11311 + }, + { + "epoch": 3.4720687538367097, + "grad_norm": 0.2878025472164154, + "learning_rate": 7.586018261439288e-05, + "loss": 1.7587, + "step": 11312 + }, + { + "epoch": 3.472375690607735, + "grad_norm": 0.26856711506843567, + "learning_rate": 7.585592837151716e-05, + "loss": 1.7351, + "step": 11313 + }, + { + "epoch": 3.47268262737876, + "grad_norm": 0.2554367780685425, + "learning_rate": 7.585167387311802e-05, + "loss": 1.7664, + "step": 11314 + }, + { + "epoch": 3.472989564149785, + "grad_norm": 0.3193204700946808, + "learning_rate": 7.584741911923748e-05, + "loss": 1.7487, + "step": 11315 + }, + { + "epoch": 3.47329650092081, + "grad_norm": 0.3227958679199219, + "learning_rate": 7.584316410991759e-05, + "loss": 1.8107, + "step": 11316 + }, + { 
+ "epoch": 3.4736034376918354, + "grad_norm": 0.33891916275024414, + "learning_rate": 7.58389088452004e-05, + "loss": 1.8466, + "step": 11317 + }, + { + "epoch": 3.4739103744628608, + "grad_norm": 0.27050724625587463, + "learning_rate": 7.583465332512797e-05, + "loss": 1.7877, + "step": 11318 + }, + { + "epoch": 3.4742173112338857, + "grad_norm": 0.2935837209224701, + "learning_rate": 7.583039754974235e-05, + "loss": 1.7932, + "step": 11319 + }, + { + "epoch": 3.474524248004911, + "grad_norm": 0.27780550718307495, + "learning_rate": 7.582614151908561e-05, + "loss": 1.8374, + "step": 11320 + }, + { + "epoch": 3.4748311847759363, + "grad_norm": 0.2579033076763153, + "learning_rate": 7.58218852331998e-05, + "loss": 1.7305, + "step": 11321 + }, + { + "epoch": 3.4751381215469612, + "grad_norm": 0.2531716227531433, + "learning_rate": 7.581762869212699e-05, + "loss": 1.8136, + "step": 11322 + }, + { + "epoch": 3.4754450583179866, + "grad_norm": 0.25504544377326965, + "learning_rate": 7.581337189590924e-05, + "loss": 1.787, + "step": 11323 + }, + { + "epoch": 3.475751995089012, + "grad_norm": 0.23659855127334595, + "learning_rate": 7.580911484458861e-05, + "loss": 1.77, + "step": 11324 + }, + { + "epoch": 3.476058931860037, + "grad_norm": 0.22556856274604797, + "learning_rate": 7.580485753820721e-05, + "loss": 1.7808, + "step": 11325 + }, + { + "epoch": 3.476365868631062, + "grad_norm": 0.2860291600227356, + "learning_rate": 7.580059997680705e-05, + "loss": 1.8224, + "step": 11326 + }, + { + "epoch": 3.476672805402087, + "grad_norm": 0.3134596645832062, + "learning_rate": 7.579634216043023e-05, + "loss": 1.8278, + "step": 11327 + }, + { + "epoch": 3.4769797421731123, + "grad_norm": 0.2883087992668152, + "learning_rate": 7.579208408911887e-05, + "loss": 1.7917, + "step": 11328 + }, + { + "epoch": 3.4772866789441377, + "grad_norm": 0.2743333578109741, + "learning_rate": 7.578782576291501e-05, + "loss": 1.8228, + "step": 11329 + }, + { + "epoch": 3.4775936157151626, + 
"grad_norm": 0.25026053190231323, + "learning_rate": 7.578356718186073e-05, + "loss": 1.7717, + "step": 11330 + }, + { + "epoch": 3.477900552486188, + "grad_norm": 0.246905118227005, + "learning_rate": 7.577930834599813e-05, + "loss": 1.7979, + "step": 11331 + }, + { + "epoch": 3.478207489257213, + "grad_norm": 0.24709418416023254, + "learning_rate": 7.577504925536929e-05, + "loss": 1.8111, + "step": 11332 + }, + { + "epoch": 3.478514426028238, + "grad_norm": 0.25685814023017883, + "learning_rate": 7.577078991001632e-05, + "loss": 1.8255, + "step": 11333 + }, + { + "epoch": 3.4788213627992635, + "grad_norm": 0.23937836289405823, + "learning_rate": 7.576653030998129e-05, + "loss": 1.7254, + "step": 11334 + }, + { + "epoch": 3.4791282995702884, + "grad_norm": 0.22638650238513947, + "learning_rate": 7.57622704553063e-05, + "loss": 1.7847, + "step": 11335 + }, + { + "epoch": 3.4794352363413137, + "grad_norm": 0.26083993911743164, + "learning_rate": 7.575801034603347e-05, + "loss": 1.7947, + "step": 11336 + }, + { + "epoch": 3.479742173112339, + "grad_norm": 0.2715466022491455, + "learning_rate": 7.575374998220488e-05, + "loss": 1.848, + "step": 11337 + }, + { + "epoch": 3.480049109883364, + "grad_norm": 0.25554224848747253, + "learning_rate": 7.574948936386262e-05, + "loss": 1.7811, + "step": 11338 + }, + { + "epoch": 3.4803560466543892, + "grad_norm": 0.2689397931098938, + "learning_rate": 7.574522849104882e-05, + "loss": 1.82, + "step": 11339 + }, + { + "epoch": 3.4806629834254146, + "grad_norm": 0.25027474761009216, + "learning_rate": 7.57409673638056e-05, + "loss": 1.775, + "step": 11340 + }, + { + "epoch": 3.4809699201964395, + "grad_norm": 0.2545457184314728, + "learning_rate": 7.573670598217504e-05, + "loss": 1.8056, + "step": 11341 + }, + { + "epoch": 3.481276856967465, + "grad_norm": 0.28404027223587036, + "learning_rate": 7.573244434619928e-05, + "loss": 1.8372, + "step": 11342 + }, + { + "epoch": 3.4815837937384897, + "grad_norm": 0.28046950697898865, + 
"learning_rate": 7.572818245592041e-05, + "loss": 1.7851, + "step": 11343 + }, + { + "epoch": 3.481890730509515, + "grad_norm": 0.23005759716033936, + "learning_rate": 7.572392031138056e-05, + "loss": 1.7059, + "step": 11344 + }, + { + "epoch": 3.4821976672805404, + "grad_norm": 0.2931719124317169, + "learning_rate": 7.571965791262185e-05, + "loss": 1.84, + "step": 11345 + }, + { + "epoch": 3.4825046040515653, + "grad_norm": 0.4399266242980957, + "learning_rate": 7.571539525968642e-05, + "loss": 1.7465, + "step": 11346 + }, + { + "epoch": 3.4828115408225906, + "grad_norm": 0.48957565426826477, + "learning_rate": 7.571113235261638e-05, + "loss": 1.8494, + "step": 11347 + }, + { + "epoch": 3.4831184775936155, + "grad_norm": 0.37828895449638367, + "learning_rate": 7.570686919145385e-05, + "loss": 1.7598, + "step": 11348 + }, + { + "epoch": 3.483425414364641, + "grad_norm": 0.22943973541259766, + "learning_rate": 7.570260577624098e-05, + "loss": 1.7443, + "step": 11349 + }, + { + "epoch": 3.483732351135666, + "grad_norm": 0.3245384991168976, + "learning_rate": 7.569834210701987e-05, + "loss": 1.7232, + "step": 11350 + }, + { + "epoch": 3.484039287906691, + "grad_norm": 0.4419693648815155, + "learning_rate": 7.569407818383271e-05, + "loss": 1.841, + "step": 11351 + }, + { + "epoch": 3.4843462246777164, + "grad_norm": 0.4061864912509918, + "learning_rate": 7.568981400672159e-05, + "loss": 1.8274, + "step": 11352 + }, + { + "epoch": 3.4846531614487417, + "grad_norm": 0.2609417736530304, + "learning_rate": 7.56855495757287e-05, + "loss": 1.8631, + "step": 11353 + }, + { + "epoch": 3.4849600982197666, + "grad_norm": 0.28758567571640015, + "learning_rate": 7.568128489089612e-05, + "loss": 1.8169, + "step": 11354 + }, + { + "epoch": 3.485267034990792, + "grad_norm": 0.40643060207366943, + "learning_rate": 7.567701995226606e-05, + "loss": 1.809, + "step": 11355 + }, + { + "epoch": 3.4855739717618173, + "grad_norm": 0.37649446725845337, + "learning_rate": 7.56727547598806e-05, 
+ "loss": 1.7661, + "step": 11356 + }, + { + "epoch": 3.485880908532842, + "grad_norm": 0.22863779962062836, + "learning_rate": 7.566848931378197e-05, + "loss": 1.808, + "step": 11357 + }, + { + "epoch": 3.4861878453038675, + "grad_norm": 0.4487019181251526, + "learning_rate": 7.566422361401226e-05, + "loss": 1.7627, + "step": 11358 + }, + { + "epoch": 3.4864947820748924, + "grad_norm": 0.4583640694618225, + "learning_rate": 7.565995766061367e-05, + "loss": 1.8186, + "step": 11359 + }, + { + "epoch": 3.4868017188459177, + "grad_norm": 0.27231526374816895, + "learning_rate": 7.565569145362833e-05, + "loss": 1.8465, + "step": 11360 + }, + { + "epoch": 3.487108655616943, + "grad_norm": 0.3877887725830078, + "learning_rate": 7.565142499309841e-05, + "loss": 1.7668, + "step": 11361 + }, + { + "epoch": 3.487415592387968, + "grad_norm": 0.5511242747306824, + "learning_rate": 7.564715827906606e-05, + "loss": 1.8417, + "step": 11362 + }, + { + "epoch": 3.4877225291589933, + "grad_norm": 0.5112231373786926, + "learning_rate": 7.564289131157348e-05, + "loss": 1.8038, + "step": 11363 + }, + { + "epoch": 3.488029465930018, + "grad_norm": 0.279502809047699, + "learning_rate": 7.56386240906628e-05, + "loss": 1.7545, + "step": 11364 + }, + { + "epoch": 3.4883364027010435, + "grad_norm": 0.30080464482307434, + "learning_rate": 7.563435661637623e-05, + "loss": 1.8136, + "step": 11365 + }, + { + "epoch": 3.488643339472069, + "grad_norm": 0.4424717128276825, + "learning_rate": 7.563008888875591e-05, + "loss": 1.7542, + "step": 11366 + }, + { + "epoch": 3.4889502762430937, + "grad_norm": 0.42144715785980225, + "learning_rate": 7.562582090784403e-05, + "loss": 1.8245, + "step": 11367 + }, + { + "epoch": 3.489257213014119, + "grad_norm": 0.2533668875694275, + "learning_rate": 7.562155267368277e-05, + "loss": 1.8654, + "step": 11368 + }, + { + "epoch": 3.4895641497851444, + "grad_norm": 0.3327534794807434, + "learning_rate": 7.56172841863143e-05, + "loss": 1.7882, + "step": 11369 + }, + { 
+ "epoch": 3.4898710865561693, + "grad_norm": 0.44001486897468567, + "learning_rate": 7.561301544578081e-05, + "loss": 1.8397, + "step": 11370 + }, + { + "epoch": 3.4901780233271946, + "grad_norm": 0.2779090106487274, + "learning_rate": 7.56087464521245e-05, + "loss": 1.7398, + "step": 11371 + }, + { + "epoch": 3.49048496009822, + "grad_norm": 0.3018067479133606, + "learning_rate": 7.560447720538755e-05, + "loss": 1.8076, + "step": 11372 + }, + { + "epoch": 3.490791896869245, + "grad_norm": 0.4370935261249542, + "learning_rate": 7.560020770561216e-05, + "loss": 1.8057, + "step": 11373 + }, + { + "epoch": 3.49109883364027, + "grad_norm": 0.2936978042125702, + "learning_rate": 7.559593795284047e-05, + "loss": 1.7726, + "step": 11374 + }, + { + "epoch": 3.491405770411295, + "grad_norm": 0.28825095295906067, + "learning_rate": 7.559166794711476e-05, + "loss": 1.8039, + "step": 11375 + }, + { + "epoch": 3.4917127071823204, + "grad_norm": 0.39334073662757874, + "learning_rate": 7.55873976884772e-05, + "loss": 1.8388, + "step": 11376 + }, + { + "epoch": 3.4920196439533457, + "grad_norm": 0.33880460262298584, + "learning_rate": 7.558312717696995e-05, + "loss": 1.7791, + "step": 11377 + }, + { + "epoch": 3.4923265807243706, + "grad_norm": 0.4433762729167938, + "learning_rate": 7.557885641263524e-05, + "loss": 1.7786, + "step": 11378 + }, + { + "epoch": 3.492633517495396, + "grad_norm": 0.4710264205932617, + "learning_rate": 7.557458539551527e-05, + "loss": 1.7193, + "step": 11379 + }, + { + "epoch": 3.4929404542664213, + "grad_norm": 0.27514326572418213, + "learning_rate": 7.557031412565228e-05, + "loss": 1.823, + "step": 11380 + }, + { + "epoch": 3.493247391037446, + "grad_norm": 0.4681413471698761, + "learning_rate": 7.556604260308846e-05, + "loss": 1.7598, + "step": 11381 + }, + { + "epoch": 3.4935543278084715, + "grad_norm": 0.5032503604888916, + "learning_rate": 7.556177082786602e-05, + "loss": 1.741, + "step": 11382 + }, + { + "epoch": 3.493861264579497, + 
"grad_norm": 0.2677086889743805, + "learning_rate": 7.555749880002716e-05, + "loss": 1.8528, + "step": 11383 + }, + { + "epoch": 3.4941682013505218, + "grad_norm": 0.43870940804481506, + "learning_rate": 7.555322651961414e-05, + "loss": 1.7632, + "step": 11384 + }, + { + "epoch": 3.494475138121547, + "grad_norm": 0.5403209924697876, + "learning_rate": 7.554895398666914e-05, + "loss": 1.8181, + "step": 11385 + }, + { + "epoch": 3.494782074892572, + "grad_norm": 0.2714318335056305, + "learning_rate": 7.554468120123441e-05, + "loss": 1.8151, + "step": 11386 + }, + { + "epoch": 3.4950890116635973, + "grad_norm": 0.49661698937416077, + "learning_rate": 7.554040816335217e-05, + "loss": 1.8116, + "step": 11387 + }, + { + "epoch": 3.4953959484346226, + "grad_norm": 0.49954715371131897, + "learning_rate": 7.553613487306465e-05, + "loss": 1.8841, + "step": 11388 + }, + { + "epoch": 3.4957028852056475, + "grad_norm": 0.28189441561698914, + "learning_rate": 7.553186133041406e-05, + "loss": 1.7834, + "step": 11389 + }, + { + "epoch": 3.496009821976673, + "grad_norm": 0.36029115319252014, + "learning_rate": 7.552758753544267e-05, + "loss": 1.7796, + "step": 11390 + }, + { + "epoch": 3.4963167587476978, + "grad_norm": 0.45023465156555176, + "learning_rate": 7.552331348819268e-05, + "loss": 1.8773, + "step": 11391 + }, + { + "epoch": 3.496623695518723, + "grad_norm": 0.3235788643360138, + "learning_rate": 7.551903918870636e-05, + "loss": 1.7984, + "step": 11392 + }, + { + "epoch": 3.4969306322897484, + "grad_norm": 0.25656190514564514, + "learning_rate": 7.551476463702596e-05, + "loss": 1.8403, + "step": 11393 + }, + { + "epoch": 3.4972375690607733, + "grad_norm": 0.2866458594799042, + "learning_rate": 7.551048983319366e-05, + "loss": 1.7428, + "step": 11394 + }, + { + "epoch": 3.4975445058317987, + "grad_norm": 0.2713877856731415, + "learning_rate": 7.550621477725177e-05, + "loss": 1.8508, + "step": 11395 + }, + { + "epoch": 3.497851442602824, + "grad_norm": 0.27978867292404175, 
+ "learning_rate": 7.55019394692425e-05, + "loss": 1.8049, + "step": 11396 + }, + { + "epoch": 3.498158379373849, + "grad_norm": 0.3275020122528076, + "learning_rate": 7.549766390920814e-05, + "loss": 1.8553, + "step": 11397 + }, + { + "epoch": 3.498465316144874, + "grad_norm": 0.29947492480278015, + "learning_rate": 7.54933880971909e-05, + "loss": 1.7614, + "step": 11398 + }, + { + "epoch": 3.4987722529158995, + "grad_norm": 0.25790849328041077, + "learning_rate": 7.548911203323308e-05, + "loss": 1.8223, + "step": 11399 + }, + { + "epoch": 3.4990791896869244, + "grad_norm": 0.3145451545715332, + "learning_rate": 7.54848357173769e-05, + "loss": 1.7642, + "step": 11400 + }, + { + "epoch": 3.4993861264579498, + "grad_norm": 0.29052913188934326, + "learning_rate": 7.548055914966463e-05, + "loss": 1.7728, + "step": 11401 + }, + { + "epoch": 3.4996930632289747, + "grad_norm": 0.2741037905216217, + "learning_rate": 7.547628233013854e-05, + "loss": 1.7382, + "step": 11402 + }, + { + "epoch": 3.5, + "grad_norm": 0.2562723755836487, + "learning_rate": 7.54720052588409e-05, + "loss": 1.7455, + "step": 11403 + }, + { + "epoch": 3.5003069367710253, + "grad_norm": 0.27649983763694763, + "learning_rate": 7.546772793581398e-05, + "loss": 1.7194, + "step": 11404 + }, + { + "epoch": 3.5006138735420502, + "grad_norm": 0.27290579676628113, + "learning_rate": 7.546345036110004e-05, + "loss": 1.87, + "step": 11405 + }, + { + "epoch": 3.5009208103130756, + "grad_norm": 0.33585605025291443, + "learning_rate": 7.545917253474136e-05, + "loss": 1.7703, + "step": 11406 + }, + { + "epoch": 3.5012277470841005, + "grad_norm": 0.2592691481113434, + "learning_rate": 7.545489445678022e-05, + "loss": 1.7657, + "step": 11407 + }, + { + "epoch": 3.501534683855126, + "grad_norm": 0.3081367015838623, + "learning_rate": 7.545061612725888e-05, + "loss": 1.8067, + "step": 11408 + }, + { + "epoch": 3.501841620626151, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.544633754621965e-05, + "loss": 
1.8009, + "step": 11409 + }, + { + "epoch": 3.5021485573971765, + "grad_norm": 0.28232479095458984, + "learning_rate": 7.54420587137048e-05, + "loss": 1.8124, + "step": 11410 + }, + { + "epoch": 3.5024554941682013, + "grad_norm": 0.24079222977161407, + "learning_rate": 7.54377796297566e-05, + "loss": 1.789, + "step": 11411 + }, + { + "epoch": 3.5027624309392267, + "grad_norm": 0.27347204089164734, + "learning_rate": 7.543350029441737e-05, + "loss": 1.7704, + "step": 11412 + }, + { + "epoch": 3.5030693677102516, + "grad_norm": 0.25545811653137207, + "learning_rate": 7.542922070772935e-05, + "loss": 1.7871, + "step": 11413 + }, + { + "epoch": 3.503376304481277, + "grad_norm": 0.2507263123989105, + "learning_rate": 7.54249408697349e-05, + "loss": 1.8424, + "step": 11414 + }, + { + "epoch": 3.5036832412523022, + "grad_norm": 0.2776084244251251, + "learning_rate": 7.542066078047627e-05, + "loss": 1.8246, + "step": 11415 + }, + { + "epoch": 3.503990178023327, + "grad_norm": 0.32833749055862427, + "learning_rate": 7.541638043999577e-05, + "loss": 1.7785, + "step": 11416 + }, + { + "epoch": 3.5042971147943525, + "grad_norm": 0.258486270904541, + "learning_rate": 7.541209984833571e-05, + "loss": 1.7543, + "step": 11417 + }, + { + "epoch": 3.5046040515653774, + "grad_norm": 0.25825178623199463, + "learning_rate": 7.540781900553837e-05, + "loss": 1.7939, + "step": 11418 + }, + { + "epoch": 3.5049109883364027, + "grad_norm": 0.26980888843536377, + "learning_rate": 7.540353791164606e-05, + "loss": 1.7777, + "step": 11419 + }, + { + "epoch": 3.505217925107428, + "grad_norm": 0.24103333055973053, + "learning_rate": 7.539925656670111e-05, + "loss": 1.7565, + "step": 11420 + }, + { + "epoch": 3.505524861878453, + "grad_norm": 0.25192007422447205, + "learning_rate": 7.539497497074584e-05, + "loss": 1.7696, + "step": 11421 + }, + { + "epoch": 3.5058317986494782, + "grad_norm": 0.218489870429039, + "learning_rate": 7.539069312382252e-05, + "loss": 1.761, + "step": 11422 + }, + { + 
"epoch": 3.506138735420503, + "grad_norm": 0.27533552050590515, + "learning_rate": 7.53864110259735e-05, + "loss": 1.7374, + "step": 11423 + }, + { + "epoch": 3.5064456721915285, + "grad_norm": 0.2603490650653839, + "learning_rate": 7.538212867724108e-05, + "loss": 1.8342, + "step": 11424 + }, + { + "epoch": 3.506752608962554, + "grad_norm": 0.27340635657310486, + "learning_rate": 7.537784607766758e-05, + "loss": 1.8099, + "step": 11425 + }, + { + "epoch": 3.507059545733579, + "grad_norm": 0.25342679023742676, + "learning_rate": 7.537356322729537e-05, + "loss": 1.7949, + "step": 11426 + }, + { + "epoch": 3.507366482504604, + "grad_norm": 0.292819082736969, + "learning_rate": 7.536928012616669e-05, + "loss": 1.9049, + "step": 11427 + }, + { + "epoch": 3.5076734192756294, + "grad_norm": 0.28256532549858093, + "learning_rate": 7.536499677432393e-05, + "loss": 1.8464, + "step": 11428 + }, + { + "epoch": 3.5079803560466543, + "grad_norm": 0.2672989070415497, + "learning_rate": 7.536071317180942e-05, + "loss": 1.8301, + "step": 11429 + }, + { + "epoch": 3.5082872928176796, + "grad_norm": 0.2525518238544464, + "learning_rate": 7.535642931866546e-05, + "loss": 1.8054, + "step": 11430 + }, + { + "epoch": 3.508594229588705, + "grad_norm": 0.2622447609901428, + "learning_rate": 7.535214521493442e-05, + "loss": 1.8293, + "step": 11431 + }, + { + "epoch": 3.50890116635973, + "grad_norm": 0.27057385444641113, + "learning_rate": 7.534786086065859e-05, + "loss": 1.7426, + "step": 11432 + }, + { + "epoch": 3.509208103130755, + "grad_norm": 0.27363866567611694, + "learning_rate": 7.534357625588038e-05, + "loss": 1.7138, + "step": 11433 + }, + { + "epoch": 3.50951503990178, + "grad_norm": 0.3029060363769531, + "learning_rate": 7.533929140064207e-05, + "loss": 1.864, + "step": 11434 + }, + { + "epoch": 3.5098219766728054, + "grad_norm": 0.3144821524620056, + "learning_rate": 7.533500629498604e-05, + "loss": 1.7846, + "step": 11435 + }, + { + "epoch": 3.5101289134438307, + "grad_norm": 
0.44535213708877563, + "learning_rate": 7.533072093895461e-05, + "loss": 1.799, + "step": 11436 + }, + { + "epoch": 3.5104358502148556, + "grad_norm": 0.25344160199165344, + "learning_rate": 7.532643533259017e-05, + "loss": 1.7391, + "step": 11437 + }, + { + "epoch": 3.510742786985881, + "grad_norm": 0.286026269197464, + "learning_rate": 7.532214947593506e-05, + "loss": 1.8436, + "step": 11438 + }, + { + "epoch": 3.511049723756906, + "grad_norm": 0.3317352533340454, + "learning_rate": 7.53178633690316e-05, + "loss": 1.8507, + "step": 11439 + }, + { + "epoch": 3.511356660527931, + "grad_norm": 0.2547265589237213, + "learning_rate": 7.53135770119222e-05, + "loss": 1.7483, + "step": 11440 + }, + { + "epoch": 3.5116635972989565, + "grad_norm": 0.24281835556030273, + "learning_rate": 7.530929040464917e-05, + "loss": 1.759, + "step": 11441 + }, + { + "epoch": 3.511970534069982, + "grad_norm": 0.2935381829738617, + "learning_rate": 7.530500354725491e-05, + "loss": 1.8235, + "step": 11442 + }, + { + "epoch": 3.5122774708410067, + "grad_norm": 0.26642969250679016, + "learning_rate": 7.53007164397818e-05, + "loss": 1.8324, + "step": 11443 + }, + { + "epoch": 3.512584407612032, + "grad_norm": 0.24830882251262665, + "learning_rate": 7.529642908227215e-05, + "loss": 1.8132, + "step": 11444 + }, + { + "epoch": 3.512891344383057, + "grad_norm": 0.3100191056728363, + "learning_rate": 7.529214147476838e-05, + "loss": 1.8453, + "step": 11445 + }, + { + "epoch": 3.5131982811540823, + "grad_norm": 0.27948811650276184, + "learning_rate": 7.528785361731282e-05, + "loss": 1.7792, + "step": 11446 + }, + { + "epoch": 3.5135052179251076, + "grad_norm": 0.26978832483291626, + "learning_rate": 7.528356550994787e-05, + "loss": 1.7857, + "step": 11447 + }, + { + "epoch": 3.5138121546961325, + "grad_norm": 0.30527836084365845, + "learning_rate": 7.527927715271592e-05, + "loss": 1.807, + "step": 11448 + }, + { + "epoch": 3.514119091467158, + "grad_norm": 0.2915664315223694, + "learning_rate": 
7.527498854565934e-05, + "loss": 1.8414, + "step": 11449 + }, + { + "epoch": 3.5144260282381827, + "grad_norm": 0.2854034900665283, + "learning_rate": 7.52706996888205e-05, + "loss": 1.793, + "step": 11450 + }, + { + "epoch": 3.514732965009208, + "grad_norm": 0.30281978845596313, + "learning_rate": 7.52664105822418e-05, + "loss": 1.7896, + "step": 11451 + }, + { + "epoch": 3.5150399017802334, + "grad_norm": 0.3317166566848755, + "learning_rate": 7.526212122596561e-05, + "loss": 1.7776, + "step": 11452 + }, + { + "epoch": 3.5153468385512583, + "grad_norm": 0.3400021195411682, + "learning_rate": 7.525783162003434e-05, + "loss": 1.8411, + "step": 11453 + }, + { + "epoch": 3.5156537753222836, + "grad_norm": 0.25169485807418823, + "learning_rate": 7.525354176449037e-05, + "loss": 1.7871, + "step": 11454 + }, + { + "epoch": 3.5159607120933085, + "grad_norm": 0.3442455530166626, + "learning_rate": 7.52492516593761e-05, + "loss": 1.7644, + "step": 11455 + }, + { + "epoch": 3.516267648864334, + "grad_norm": 0.35644033551216125, + "learning_rate": 7.524496130473394e-05, + "loss": 1.801, + "step": 11456 + }, + { + "epoch": 3.516574585635359, + "grad_norm": 0.3180185854434967, + "learning_rate": 7.524067070060625e-05, + "loss": 1.7897, + "step": 11457 + }, + { + "epoch": 3.5168815224063845, + "grad_norm": 0.2417978048324585, + "learning_rate": 7.523637984703548e-05, + "loss": 1.8527, + "step": 11458 + }, + { + "epoch": 3.5171884591774094, + "grad_norm": 0.29661375284194946, + "learning_rate": 7.5232088744064e-05, + "loss": 1.8276, + "step": 11459 + }, + { + "epoch": 3.5174953959484347, + "grad_norm": 0.2467545121908188, + "learning_rate": 7.522779739173424e-05, + "loss": 1.7819, + "step": 11460 + }, + { + "epoch": 3.5178023327194596, + "grad_norm": 0.26177898049354553, + "learning_rate": 7.522350579008859e-05, + "loss": 1.8017, + "step": 11461 + }, + { + "epoch": 3.518109269490485, + "grad_norm": 0.28740498423576355, + "learning_rate": 7.521921393916948e-05, + "loss": 1.7863, 
+ "step": 11462 + }, + { + "epoch": 3.5184162062615103, + "grad_norm": 0.28685200214385986, + "learning_rate": 7.521492183901932e-05, + "loss": 1.8069, + "step": 11463 + }, + { + "epoch": 3.518723143032535, + "grad_norm": 0.24174338579177856, + "learning_rate": 7.521062948968051e-05, + "loss": 1.7523, + "step": 11464 + }, + { + "epoch": 3.5190300798035605, + "grad_norm": 0.23273243010044098, + "learning_rate": 7.520633689119548e-05, + "loss": 1.7827, + "step": 11465 + }, + { + "epoch": 3.5193370165745854, + "grad_norm": 0.22708217799663544, + "learning_rate": 7.520204404360667e-05, + "loss": 1.7377, + "step": 11466 + }, + { + "epoch": 3.5196439533456108, + "grad_norm": 0.24725353717803955, + "learning_rate": 7.519775094695649e-05, + "loss": 1.7828, + "step": 11467 + }, + { + "epoch": 3.519950890116636, + "grad_norm": 0.23046265542507172, + "learning_rate": 7.519345760128736e-05, + "loss": 1.7427, + "step": 11468 + }, + { + "epoch": 3.520257826887661, + "grad_norm": 0.2618728280067444, + "learning_rate": 7.518916400664171e-05, + "loss": 1.8133, + "step": 11469 + }, + { + "epoch": 3.5205647636586863, + "grad_norm": 0.23232363164424896, + "learning_rate": 7.5184870163062e-05, + "loss": 1.7468, + "step": 11470 + }, + { + "epoch": 3.520871700429711, + "grad_norm": 0.21993626654148102, + "learning_rate": 7.51805760705906e-05, + "loss": 1.7565, + "step": 11471 + }, + { + "epoch": 3.5211786372007365, + "grad_norm": 0.23563124239444733, + "learning_rate": 7.517628172927001e-05, + "loss": 1.7795, + "step": 11472 + }, + { + "epoch": 3.521485573971762, + "grad_norm": 0.24502862989902496, + "learning_rate": 7.517198713914266e-05, + "loss": 1.813, + "step": 11473 + }, + { + "epoch": 3.521792510742787, + "grad_norm": 0.24745969474315643, + "learning_rate": 7.516769230025097e-05, + "loss": 1.7601, + "step": 11474 + }, + { + "epoch": 3.522099447513812, + "grad_norm": 0.27686986327171326, + "learning_rate": 7.516339721263739e-05, + "loss": 1.8121, + "step": 11475 + }, + { + "epoch": 
3.5224063842848374, + "grad_norm": 0.3110332787036896, + "learning_rate": 7.515910187634439e-05, + "loss": 1.7978, + "step": 11476 + }, + { + "epoch": 3.5227133210558623, + "grad_norm": 0.3394792377948761, + "learning_rate": 7.515480629141436e-05, + "loss": 1.8427, + "step": 11477 + }, + { + "epoch": 3.5230202578268877, + "grad_norm": 0.2802537679672241, + "learning_rate": 7.515051045788984e-05, + "loss": 1.7343, + "step": 11478 + }, + { + "epoch": 3.523327194597913, + "grad_norm": 0.23687711358070374, + "learning_rate": 7.514621437581319e-05, + "loss": 1.7786, + "step": 11479 + }, + { + "epoch": 3.523634131368938, + "grad_norm": 0.31114310026168823, + "learning_rate": 7.514191804522693e-05, + "loss": 1.8137, + "step": 11480 + }, + { + "epoch": 3.523941068139963, + "grad_norm": 0.3257891833782196, + "learning_rate": 7.513762146617351e-05, + "loss": 1.8015, + "step": 11481 + }, + { + "epoch": 3.524248004910988, + "grad_norm": 0.24353443086147308, + "learning_rate": 7.513332463869536e-05, + "loss": 1.7485, + "step": 11482 + }, + { + "epoch": 3.5245549416820134, + "grad_norm": 0.29861485958099365, + "learning_rate": 7.512902756283498e-05, + "loss": 1.7993, + "step": 11483 + }, + { + "epoch": 3.5248618784530388, + "grad_norm": 0.40380924940109253, + "learning_rate": 7.51247302386348e-05, + "loss": 1.7664, + "step": 11484 + }, + { + "epoch": 3.525168815224064, + "grad_norm": 0.3365862965583801, + "learning_rate": 7.512043266613733e-05, + "loss": 1.7512, + "step": 11485 + }, + { + "epoch": 3.525475751995089, + "grad_norm": 0.2502824068069458, + "learning_rate": 7.511613484538502e-05, + "loss": 1.8414, + "step": 11486 + }, + { + "epoch": 3.5257826887661143, + "grad_norm": 0.2598603069782257, + "learning_rate": 7.511183677642034e-05, + "loss": 1.7358, + "step": 11487 + }, + { + "epoch": 3.5260896255371392, + "grad_norm": 0.30246880650520325, + "learning_rate": 7.510753845928576e-05, + "loss": 1.791, + "step": 11488 + }, + { + "epoch": 3.5263965623081646, + "grad_norm": 
0.25170832872390747, + "learning_rate": 7.510323989402378e-05, + "loss": 1.7498, + "step": 11489 + }, + { + "epoch": 3.52670349907919, + "grad_norm": 0.2925282418727875, + "learning_rate": 7.509894108067688e-05, + "loss": 1.8413, + "step": 11490 + }, + { + "epoch": 3.527010435850215, + "grad_norm": 0.2643601596355438, + "learning_rate": 7.509464201928752e-05, + "loss": 1.8052, + "step": 11491 + }, + { + "epoch": 3.52731737262124, + "grad_norm": 0.2938917279243469, + "learning_rate": 7.50903427098982e-05, + "loss": 1.7308, + "step": 11492 + }, + { + "epoch": 3.527624309392265, + "grad_norm": 0.2978343367576599, + "learning_rate": 7.508604315255142e-05, + "loss": 1.8147, + "step": 11493 + }, + { + "epoch": 3.5279312461632903, + "grad_norm": 0.2507816255092621, + "learning_rate": 7.508174334728963e-05, + "loss": 1.774, + "step": 11494 + }, + { + "epoch": 3.5282381829343157, + "grad_norm": 0.32971861958503723, + "learning_rate": 7.507744329415538e-05, + "loss": 1.7634, + "step": 11495 + }, + { + "epoch": 3.5285451197053406, + "grad_norm": 0.3149639964103699, + "learning_rate": 7.507314299319113e-05, + "loss": 1.8032, + "step": 11496 + }, + { + "epoch": 3.528852056476366, + "grad_norm": 0.2721364498138428, + "learning_rate": 7.506884244443937e-05, + "loss": 1.7702, + "step": 11497 + }, + { + "epoch": 3.529158993247391, + "grad_norm": 0.29375985264778137, + "learning_rate": 7.506454164794263e-05, + "loss": 1.8673, + "step": 11498 + }, + { + "epoch": 3.529465930018416, + "grad_norm": 0.379944384098053, + "learning_rate": 7.50602406037434e-05, + "loss": 1.883, + "step": 11499 + }, + { + "epoch": 3.5297728667894415, + "grad_norm": 0.4041840136051178, + "learning_rate": 7.505593931188417e-05, + "loss": 1.7998, + "step": 11500 + }, + { + "epoch": 3.530079803560467, + "grad_norm": 0.30013784766197205, + "learning_rate": 7.505163777240747e-05, + "loss": 1.775, + "step": 11501 + }, + { + "epoch": 3.5303867403314917, + "grad_norm": 0.25161153078079224, + "learning_rate": 
7.50473359853558e-05, + "loss": 1.8609, + "step": 11502 + }, + { + "epoch": 3.530693677102517, + "grad_norm": 0.2803831100463867, + "learning_rate": 7.504303395077168e-05, + "loss": 1.8397, + "step": 11503 + }, + { + "epoch": 3.531000613873542, + "grad_norm": 0.26678118109703064, + "learning_rate": 7.503873166869762e-05, + "loss": 1.7877, + "step": 11504 + }, + { + "epoch": 3.5313075506445673, + "grad_norm": 0.24280449748039246, + "learning_rate": 7.503442913917613e-05, + "loss": 1.7891, + "step": 11505 + }, + { + "epoch": 3.5316144874155926, + "grad_norm": 0.26461485028266907, + "learning_rate": 7.503012636224976e-05, + "loss": 1.7993, + "step": 11506 + }, + { + "epoch": 3.5319214241866175, + "grad_norm": 0.27001824975013733, + "learning_rate": 7.502582333796098e-05, + "loss": 1.7719, + "step": 11507 + }, + { + "epoch": 3.532228360957643, + "grad_norm": 0.27585846185684204, + "learning_rate": 7.502152006635237e-05, + "loss": 1.7412, + "step": 11508 + }, + { + "epoch": 3.5325352977286677, + "grad_norm": 0.24896648526191711, + "learning_rate": 7.501721654746643e-05, + "loss": 1.7459, + "step": 11509 + }, + { + "epoch": 3.532842234499693, + "grad_norm": 0.2308502197265625, + "learning_rate": 7.501291278134569e-05, + "loss": 1.7717, + "step": 11510 + }, + { + "epoch": 3.5331491712707184, + "grad_norm": 0.3026069104671478, + "learning_rate": 7.500860876803267e-05, + "loss": 1.8578, + "step": 11511 + }, + { + "epoch": 3.5334561080417433, + "grad_norm": 0.30242082476615906, + "learning_rate": 7.500430450756995e-05, + "loss": 1.7793, + "step": 11512 + }, + { + "epoch": 3.5337630448127686, + "grad_norm": 0.2583339214324951, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8388, + "step": 11513 + }, + { + "epoch": 3.5340699815837935, + "grad_norm": 0.29673871397972107, + "learning_rate": 7.499569524536542e-05, + "loss": 1.7749, + "step": 11514 + }, + { + "epoch": 3.534376918354819, + "grad_norm": 0.35199788212776184, + "learning_rate": 7.499139024370874e-05, + "loss": 
1.7863, + "step": 11515 + }, + { + "epoch": 3.534683855125844, + "grad_norm": 0.25776436924934387, + "learning_rate": 7.498708499507247e-05, + "loss": 1.7568, + "step": 11516 + }, + { + "epoch": 3.5349907918968695, + "grad_norm": 0.26081520318984985, + "learning_rate": 7.498277949949919e-05, + "loss": 1.807, + "step": 11517 + }, + { + "epoch": 3.5352977286678944, + "grad_norm": 0.29247912764549255, + "learning_rate": 7.497847375703145e-05, + "loss": 1.7568, + "step": 11518 + }, + { + "epoch": 3.5356046654389197, + "grad_norm": 0.20964498817920685, + "learning_rate": 7.497416776771178e-05, + "loss": 1.7601, + "step": 11519 + }, + { + "epoch": 3.5359116022099446, + "grad_norm": 0.28739818930625916, + "learning_rate": 7.496986153158273e-05, + "loss": 1.7915, + "step": 11520 + }, + { + "epoch": 3.53621853898097, + "grad_norm": 0.3109932839870453, + "learning_rate": 7.496555504868691e-05, + "loss": 1.8046, + "step": 11521 + }, + { + "epoch": 3.5365254757519953, + "grad_norm": 0.259284108877182, + "learning_rate": 7.496124831906681e-05, + "loss": 1.7595, + "step": 11522 + }, + { + "epoch": 3.53683241252302, + "grad_norm": 0.265909343957901, + "learning_rate": 7.495694134276504e-05, + "loss": 1.8249, + "step": 11523 + }, + { + "epoch": 3.5371393492940455, + "grad_norm": 0.2478799819946289, + "learning_rate": 7.495263411982415e-05, + "loss": 1.8531, + "step": 11524 + }, + { + "epoch": 3.5374462860650704, + "grad_norm": 0.2636432945728302, + "learning_rate": 7.494832665028671e-05, + "loss": 1.8114, + "step": 11525 + }, + { + "epoch": 3.5377532228360957, + "grad_norm": 0.25323864817619324, + "learning_rate": 7.494401893419527e-05, + "loss": 1.8271, + "step": 11526 + }, + { + "epoch": 3.538060159607121, + "grad_norm": 0.2352467179298401, + "learning_rate": 7.493971097159241e-05, + "loss": 1.7524, + "step": 11527 + }, + { + "epoch": 3.538367096378146, + "grad_norm": 0.2788623869419098, + "learning_rate": 7.493540276252072e-05, + "loss": 1.8238, + "step": 11528 + }, + { + 
"epoch": 3.5386740331491713, + "grad_norm": 0.3506326377391815, + "learning_rate": 7.493109430702277e-05, + "loss": 1.8525, + "step": 11529 + }, + { + "epoch": 3.538980969920196, + "grad_norm": 0.3685263395309448, + "learning_rate": 7.492678560514113e-05, + "loss": 1.8497, + "step": 11530 + }, + { + "epoch": 3.5392879066912215, + "grad_norm": 0.32200056314468384, + "learning_rate": 7.492247665691837e-05, + "loss": 1.7587, + "step": 11531 + }, + { + "epoch": 3.539594843462247, + "grad_norm": 0.2800062894821167, + "learning_rate": 7.49181674623971e-05, + "loss": 1.8188, + "step": 11532 + }, + { + "epoch": 3.539901780233272, + "grad_norm": 0.24137580394744873, + "learning_rate": 7.491385802161989e-05, + "loss": 1.7947, + "step": 11533 + }, + { + "epoch": 3.540208717004297, + "grad_norm": 0.21900027990341187, + "learning_rate": 7.490954833462933e-05, + "loss": 1.7722, + "step": 11534 + }, + { + "epoch": 3.5405156537753224, + "grad_norm": 0.25009945034980774, + "learning_rate": 7.490523840146803e-05, + "loss": 1.8173, + "step": 11535 + }, + { + "epoch": 3.5408225905463473, + "grad_norm": 0.2778431475162506, + "learning_rate": 7.490092822217855e-05, + "loss": 1.8368, + "step": 11536 + }, + { + "epoch": 3.5411295273173726, + "grad_norm": 0.2845982611179352, + "learning_rate": 7.48966177968035e-05, + "loss": 1.7539, + "step": 11537 + }, + { + "epoch": 3.541436464088398, + "grad_norm": 0.27480921149253845, + "learning_rate": 7.48923071253855e-05, + "loss": 1.8494, + "step": 11538 + }, + { + "epoch": 3.541743400859423, + "grad_norm": 0.2722087502479553, + "learning_rate": 7.488799620796711e-05, + "loss": 1.8422, + "step": 11539 + }, + { + "epoch": 3.542050337630448, + "grad_norm": 0.2984340190887451, + "learning_rate": 7.488368504459097e-05, + "loss": 1.8042, + "step": 11540 + }, + { + "epoch": 3.542357274401473, + "grad_norm": 0.2405850738286972, + "learning_rate": 7.487937363529966e-05, + "loss": 1.749, + "step": 11541 + }, + { + "epoch": 3.5426642111724984, + "grad_norm": 
0.24816973507404327, + "learning_rate": 7.487506198013579e-05, + "loss": 1.8671, + "step": 11542 + }, + { + "epoch": 3.5429711479435237, + "grad_norm": 0.2796473503112793, + "learning_rate": 7.487075007914199e-05, + "loss": 1.8023, + "step": 11543 + }, + { + "epoch": 3.5432780847145486, + "grad_norm": 0.2600162625312805, + "learning_rate": 7.486643793236086e-05, + "loss": 1.7997, + "step": 11544 + }, + { + "epoch": 3.543585021485574, + "grad_norm": 0.2746226489543915, + "learning_rate": 7.486212553983503e-05, + "loss": 1.7773, + "step": 11545 + }, + { + "epoch": 3.5438919582565993, + "grad_norm": 0.24142079055309296, + "learning_rate": 7.485781290160708e-05, + "loss": 1.791, + "step": 11546 + }, + { + "epoch": 3.544198895027624, + "grad_norm": 0.2472934126853943, + "learning_rate": 7.485350001771966e-05, + "loss": 1.8183, + "step": 11547 + }, + { + "epoch": 3.5445058317986495, + "grad_norm": 0.26891404390335083, + "learning_rate": 7.48491868882154e-05, + "loss": 1.7421, + "step": 11548 + }, + { + "epoch": 3.544812768569675, + "grad_norm": 0.24820464849472046, + "learning_rate": 7.48448735131369e-05, + "loss": 1.7372, + "step": 11549 + }, + { + "epoch": 3.5451197053406998, + "grad_norm": 0.2456594705581665, + "learning_rate": 7.484055989252679e-05, + "loss": 1.7883, + "step": 11550 + }, + { + "epoch": 3.545426642111725, + "grad_norm": 0.32420551776885986, + "learning_rate": 7.48362460264277e-05, + "loss": 1.8363, + "step": 11551 + }, + { + "epoch": 3.54573357888275, + "grad_norm": 0.3187662661075592, + "learning_rate": 7.483193191488229e-05, + "loss": 1.7957, + "step": 11552 + }, + { + "epoch": 3.5460405156537753, + "grad_norm": 0.2845410108566284, + "learning_rate": 7.482761755793316e-05, + "loss": 1.8288, + "step": 11553 + }, + { + "epoch": 3.5463474524248007, + "grad_norm": 0.2816021740436554, + "learning_rate": 7.482330295562298e-05, + "loss": 1.7562, + "step": 11554 + }, + { + "epoch": 3.5466543891958255, + "grad_norm": 0.28938058018684387, + "learning_rate": 
7.481898810799435e-05, + "loss": 1.8139, + "step": 11555 + }, + { + "epoch": 3.546961325966851, + "grad_norm": 0.3305707573890686, + "learning_rate": 7.481467301508995e-05, + "loss": 1.8956, + "step": 11556 + }, + { + "epoch": 3.5472682627378758, + "grad_norm": 0.3890376091003418, + "learning_rate": 7.48103576769524e-05, + "loss": 1.8552, + "step": 11557 + }, + { + "epoch": 3.547575199508901, + "grad_norm": 0.3900652825832367, + "learning_rate": 7.480604209362434e-05, + "loss": 1.7748, + "step": 11558 + }, + { + "epoch": 3.5478821362799264, + "grad_norm": 0.3297326862812042, + "learning_rate": 7.480172626514845e-05, + "loss": 1.8201, + "step": 11559 + }, + { + "epoch": 3.5481890730509518, + "grad_norm": 0.28797218203544617, + "learning_rate": 7.479741019156737e-05, + "loss": 1.7652, + "step": 11560 + }, + { + "epoch": 3.5484960098219767, + "grad_norm": 0.2764691114425659, + "learning_rate": 7.479309387292373e-05, + "loss": 1.7534, + "step": 11561 + }, + { + "epoch": 3.548802946593002, + "grad_norm": 0.25067585706710815, + "learning_rate": 7.47887773092602e-05, + "loss": 1.7849, + "step": 11562 + }, + { + "epoch": 3.549109883364027, + "grad_norm": 0.29966798424720764, + "learning_rate": 7.478446050061947e-05, + "loss": 1.8299, + "step": 11563 + }, + { + "epoch": 3.549416820135052, + "grad_norm": 0.24068406224250793, + "learning_rate": 7.478014344704416e-05, + "loss": 1.8366, + "step": 11564 + }, + { + "epoch": 3.5497237569060776, + "grad_norm": 0.2559303641319275, + "learning_rate": 7.477582614857695e-05, + "loss": 1.7665, + "step": 11565 + }, + { + "epoch": 3.5500306936771024, + "grad_norm": 0.24617858231067657, + "learning_rate": 7.47715086052605e-05, + "loss": 1.8334, + "step": 11566 + }, + { + "epoch": 3.550337630448128, + "grad_norm": 0.2433501034975052, + "learning_rate": 7.476719081713749e-05, + "loss": 1.7963, + "step": 11567 + }, + { + "epoch": 3.5506445672191527, + "grad_norm": 0.2583518326282501, + "learning_rate": 7.476287278425057e-05, + "loss": 1.8311, 
+ "step": 11568 + }, + { + "epoch": 3.550951503990178, + "grad_norm": 0.3232485055923462, + "learning_rate": 7.475855450664244e-05, + "loss": 1.9162, + "step": 11569 + }, + { + "epoch": 3.5512584407612033, + "grad_norm": 0.28247153759002686, + "learning_rate": 7.475423598435576e-05, + "loss": 1.8027, + "step": 11570 + }, + { + "epoch": 3.5515653775322282, + "grad_norm": 0.27201834321022034, + "learning_rate": 7.47499172174332e-05, + "loss": 1.7822, + "step": 11571 + }, + { + "epoch": 3.5518723143032536, + "grad_norm": 0.2408471554517746, + "learning_rate": 7.474559820591748e-05, + "loss": 1.7735, + "step": 11572 + }, + { + "epoch": 3.5521792510742785, + "grad_norm": 0.24187393486499786, + "learning_rate": 7.474127894985124e-05, + "loss": 1.7931, + "step": 11573 + }, + { + "epoch": 3.552486187845304, + "grad_norm": 0.2759699523448944, + "learning_rate": 7.473695944927717e-05, + "loss": 1.8407, + "step": 11574 + }, + { + "epoch": 3.552793124616329, + "grad_norm": 0.2503111958503723, + "learning_rate": 7.473263970423797e-05, + "loss": 1.7613, + "step": 11575 + }, + { + "epoch": 3.5531000613873545, + "grad_norm": 0.24795177578926086, + "learning_rate": 7.472831971477633e-05, + "loss": 1.8221, + "step": 11576 + }, + { + "epoch": 3.5534069981583793, + "grad_norm": 0.23190177977085114, + "learning_rate": 7.472399948093494e-05, + "loss": 1.7541, + "step": 11577 + }, + { + "epoch": 3.5537139349294047, + "grad_norm": 0.24650825560092926, + "learning_rate": 7.471967900275653e-05, + "loss": 1.8002, + "step": 11578 + }, + { + "epoch": 3.5540208717004296, + "grad_norm": 0.256598562002182, + "learning_rate": 7.471535828028372e-05, + "loss": 1.7052, + "step": 11579 + }, + { + "epoch": 3.554327808471455, + "grad_norm": 0.2715381681919098, + "learning_rate": 7.471103731355926e-05, + "loss": 1.7701, + "step": 11580 + }, + { + "epoch": 3.5546347452424802, + "grad_norm": 0.29806044697761536, + "learning_rate": 7.470671610262586e-05, + "loss": 1.7614, + "step": 11581 + }, + { + "epoch": 
3.554941682013505, + "grad_norm": 0.26364314556121826, + "learning_rate": 7.470239464752621e-05, + "loss": 1.7957, + "step": 11582 + }, + { + "epoch": 3.5552486187845305, + "grad_norm": 0.29270800948143005, + "learning_rate": 7.4698072948303e-05, + "loss": 1.8263, + "step": 11583 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.25941839814186096, + "learning_rate": 7.469375100499898e-05, + "loss": 1.8517, + "step": 11584 + }, + { + "epoch": 3.5558624923265807, + "grad_norm": 0.29509237408638, + "learning_rate": 7.468942881765681e-05, + "loss": 1.8643, + "step": 11585 + }, + { + "epoch": 3.556169429097606, + "grad_norm": 0.23090367019176483, + "learning_rate": 7.468510638631926e-05, + "loss": 1.7239, + "step": 11586 + }, + { + "epoch": 3.556476365868631, + "grad_norm": 0.2696724236011505, + "learning_rate": 7.468078371102901e-05, + "loss": 1.848, + "step": 11587 + }, + { + "epoch": 3.5567833026396563, + "grad_norm": 0.2691192626953125, + "learning_rate": 7.46764607918288e-05, + "loss": 1.8194, + "step": 11588 + }, + { + "epoch": 3.557090239410681, + "grad_norm": 0.26616501808166504, + "learning_rate": 7.467213762876131e-05, + "loss": 1.8382, + "step": 11589 + }, + { + "epoch": 3.5573971761817065, + "grad_norm": 0.30629831552505493, + "learning_rate": 7.466781422186933e-05, + "loss": 1.8417, + "step": 11590 + }, + { + "epoch": 3.557704112952732, + "grad_norm": 0.27212417125701904, + "learning_rate": 7.466349057119552e-05, + "loss": 1.7612, + "step": 11591 + }, + { + "epoch": 3.558011049723757, + "grad_norm": 0.2872084379196167, + "learning_rate": 7.465916667678266e-05, + "loss": 1.7998, + "step": 11592 + }, + { + "epoch": 3.558317986494782, + "grad_norm": 0.3017117977142334, + "learning_rate": 7.465484253867348e-05, + "loss": 1.7996, + "step": 11593 + }, + { + "epoch": 3.5586249232658074, + "grad_norm": 0.2707957327365875, + "learning_rate": 7.465051815691066e-05, + "loss": 1.7678, + "step": 11594 + }, + { + "epoch": 3.5589318600368323, + "grad_norm": 
0.28932711482048035, + "learning_rate": 7.464619353153702e-05, + "loss": 1.8576, + "step": 11595 + }, + { + "epoch": 3.5592387968078576, + "grad_norm": 0.2585125267505646, + "learning_rate": 7.464186866259519e-05, + "loss": 1.8678, + "step": 11596 + }, + { + "epoch": 3.559545733578883, + "grad_norm": 0.24386851489543915, + "learning_rate": 7.4637543550128e-05, + "loss": 1.7778, + "step": 11597 + }, + { + "epoch": 3.559852670349908, + "grad_norm": 0.2375860959291458, + "learning_rate": 7.463321819417817e-05, + "loss": 1.8096, + "step": 11598 + }, + { + "epoch": 3.560159607120933, + "grad_norm": 0.2341299206018448, + "learning_rate": 7.462889259478842e-05, + "loss": 1.7191, + "step": 11599 + }, + { + "epoch": 3.560466543891958, + "grad_norm": 0.2510595917701721, + "learning_rate": 7.462456675200154e-05, + "loss": 1.7763, + "step": 11600 + }, + { + "epoch": 3.5607734806629834, + "grad_norm": 0.2554674744606018, + "learning_rate": 7.462024066586025e-05, + "loss": 1.7578, + "step": 11601 + }, + { + "epoch": 3.5610804174340087, + "grad_norm": 0.25040730834007263, + "learning_rate": 7.46159143364073e-05, + "loss": 1.8194, + "step": 11602 + }, + { + "epoch": 3.5613873542050336, + "grad_norm": 0.24294932186603546, + "learning_rate": 7.461158776368547e-05, + "loss": 1.8063, + "step": 11603 + }, + { + "epoch": 3.561694290976059, + "grad_norm": 0.2388325333595276, + "learning_rate": 7.46072609477375e-05, + "loss": 1.7942, + "step": 11604 + }, + { + "epoch": 3.562001227747084, + "grad_norm": 0.2569502890110016, + "learning_rate": 7.460293388860615e-05, + "loss": 1.7824, + "step": 11605 + }, + { + "epoch": 3.562308164518109, + "grad_norm": 0.24004346132278442, + "learning_rate": 7.45986065863342e-05, + "loss": 1.8676, + "step": 11606 + }, + { + "epoch": 3.5626151012891345, + "grad_norm": 0.25446319580078125, + "learning_rate": 7.45942790409644e-05, + "loss": 1.7726, + "step": 11607 + }, + { + "epoch": 3.56292203806016, + "grad_norm": 0.26257482171058655, + "learning_rate": 
7.458995125253951e-05, + "loss": 1.779, + "step": 11608 + }, + { + "epoch": 3.5632289748311847, + "grad_norm": 0.27703070640563965, + "learning_rate": 7.458562322110231e-05, + "loss": 1.8247, + "step": 11609 + }, + { + "epoch": 3.56353591160221, + "grad_norm": 0.25478535890579224, + "learning_rate": 7.458129494669556e-05, + "loss": 1.7794, + "step": 11610 + }, + { + "epoch": 3.563842848373235, + "grad_norm": 0.26173365116119385, + "learning_rate": 7.457696642936207e-05, + "loss": 1.758, + "step": 11611 + }, + { + "epoch": 3.5641497851442603, + "grad_norm": 0.25077274441719055, + "learning_rate": 7.45726376691446e-05, + "loss": 1.8234, + "step": 11612 + }, + { + "epoch": 3.5644567219152856, + "grad_norm": 0.2591109275817871, + "learning_rate": 7.456830866608589e-05, + "loss": 1.7723, + "step": 11613 + }, + { + "epoch": 3.5647636586863105, + "grad_norm": 0.2653447091579437, + "learning_rate": 7.456397942022877e-05, + "loss": 1.7839, + "step": 11614 + }, + { + "epoch": 3.565070595457336, + "grad_norm": 0.3203454911708832, + "learning_rate": 7.455964993161601e-05, + "loss": 1.8548, + "step": 11615 + }, + { + "epoch": 3.5653775322283607, + "grad_norm": 0.3041793704032898, + "learning_rate": 7.455532020029039e-05, + "loss": 1.7925, + "step": 11616 + }, + { + "epoch": 3.565684468999386, + "grad_norm": 0.26066139340400696, + "learning_rate": 7.45509902262947e-05, + "loss": 1.7905, + "step": 11617 + }, + { + "epoch": 3.5659914057704114, + "grad_norm": 0.2483314871788025, + "learning_rate": 7.454666000967174e-05, + "loss": 1.7658, + "step": 11618 + }, + { + "epoch": 3.5662983425414367, + "grad_norm": 0.24285900592803955, + "learning_rate": 7.45423295504643e-05, + "loss": 1.7575, + "step": 11619 + }, + { + "epoch": 3.5666052793124616, + "grad_norm": 0.27231669425964355, + "learning_rate": 7.453799884871517e-05, + "loss": 1.8389, + "step": 11620 + }, + { + "epoch": 3.566912216083487, + "grad_norm": 0.24324406683444977, + "learning_rate": 7.453366790446717e-05, + "loss": 
1.7775, + "step": 11621 + }, + { + "epoch": 3.567219152854512, + "grad_norm": 0.2724440097808838, + "learning_rate": 7.452933671776305e-05, + "loss": 1.8135, + "step": 11622 + }, + { + "epoch": 3.567526089625537, + "grad_norm": 0.22207655012607574, + "learning_rate": 7.452500528864568e-05, + "loss": 1.722, + "step": 11623 + }, + { + "epoch": 3.5678330263965625, + "grad_norm": 0.25650298595428467, + "learning_rate": 7.452067361715782e-05, + "loss": 1.7813, + "step": 11624 + }, + { + "epoch": 3.5681399631675874, + "grad_norm": 0.2582200765609741, + "learning_rate": 7.45163417033423e-05, + "loss": 1.8253, + "step": 11625 + }, + { + "epoch": 3.5684468999386127, + "grad_norm": 0.29545384645462036, + "learning_rate": 7.451200954724188e-05, + "loss": 1.8108, + "step": 11626 + }, + { + "epoch": 3.5687538367096376, + "grad_norm": 0.30457428097724915, + "learning_rate": 7.450767714889946e-05, + "loss": 1.8257, + "step": 11627 + }, + { + "epoch": 3.569060773480663, + "grad_norm": 0.2955166697502136, + "learning_rate": 7.450334450835781e-05, + "loss": 1.8172, + "step": 11628 + }, + { + "epoch": 3.5693677102516883, + "grad_norm": 0.2793857753276825, + "learning_rate": 7.449901162565974e-05, + "loss": 1.8493, + "step": 11629 + }, + { + "epoch": 3.569674647022713, + "grad_norm": 0.27154335379600525, + "learning_rate": 7.449467850084808e-05, + "loss": 1.8306, + "step": 11630 + }, + { + "epoch": 3.5699815837937385, + "grad_norm": 0.22336189448833466, + "learning_rate": 7.449034513396564e-05, + "loss": 1.7435, + "step": 11631 + }, + { + "epoch": 3.5702885205647634, + "grad_norm": 0.22799183428287506, + "learning_rate": 7.448601152505526e-05, + "loss": 1.7818, + "step": 11632 + }, + { + "epoch": 3.5705954573357888, + "grad_norm": 0.26670658588409424, + "learning_rate": 7.448167767415976e-05, + "loss": 1.7777, + "step": 11633 + }, + { + "epoch": 3.570902394106814, + "grad_norm": 0.2848666310310364, + "learning_rate": 7.447734358132196e-05, + "loss": 1.7572, + "step": 11634 + }, + { + 
"epoch": 3.5712093308778394, + "grad_norm": 0.26843544840812683, + "learning_rate": 7.447300924658473e-05, + "loss": 1.7642, + "step": 11635 + }, + { + "epoch": 3.5715162676488643, + "grad_norm": 0.24666404724121094, + "learning_rate": 7.446867466999087e-05, + "loss": 1.7533, + "step": 11636 + }, + { + "epoch": 3.5718232044198897, + "grad_norm": 0.31111210584640503, + "learning_rate": 7.44643398515832e-05, + "loss": 1.7875, + "step": 11637 + }, + { + "epoch": 3.5721301411909145, + "grad_norm": 0.3157108724117279, + "learning_rate": 7.446000479140462e-05, + "loss": 1.7879, + "step": 11638 + }, + { + "epoch": 3.57243707796194, + "grad_norm": 0.2935558259487152, + "learning_rate": 7.445566948949792e-05, + "loss": 1.7819, + "step": 11639 + }, + { + "epoch": 3.572744014732965, + "grad_norm": 0.2265472710132599, + "learning_rate": 7.445133394590597e-05, + "loss": 1.7518, + "step": 11640 + }, + { + "epoch": 3.57305095150399, + "grad_norm": 0.2564176023006439, + "learning_rate": 7.444699816067159e-05, + "loss": 1.7281, + "step": 11641 + }, + { + "epoch": 3.5733578882750154, + "grad_norm": 0.27933555841445923, + "learning_rate": 7.444266213383766e-05, + "loss": 1.7852, + "step": 11642 + }, + { + "epoch": 3.5736648250460403, + "grad_norm": 0.29105356335639954, + "learning_rate": 7.4438325865447e-05, + "loss": 1.8056, + "step": 11643 + }, + { + "epoch": 3.5739717618170657, + "grad_norm": 0.27665549516677856, + "learning_rate": 7.443398935554249e-05, + "loss": 1.7249, + "step": 11644 + }, + { + "epoch": 3.574278698588091, + "grad_norm": 0.21899232268333435, + "learning_rate": 7.442965260416698e-05, + "loss": 1.7689, + "step": 11645 + }, + { + "epoch": 3.574585635359116, + "grad_norm": 0.3250672221183777, + "learning_rate": 7.442531561136333e-05, + "loss": 1.8058, + "step": 11646 + }, + { + "epoch": 3.574892572130141, + "grad_norm": 0.42442524433135986, + "learning_rate": 7.442097837717438e-05, + "loss": 1.7887, + "step": 11647 + }, + { + "epoch": 3.575199508901166, + 
"grad_norm": 0.33108964562416077, + "learning_rate": 7.441664090164302e-05, + "loss": 1.7628, + "step": 11648 + }, + { + "epoch": 3.5755064456721914, + "grad_norm": 0.23050357401371002, + "learning_rate": 7.44123031848121e-05, + "loss": 1.8121, + "step": 11649 + }, + { + "epoch": 3.575813382443217, + "grad_norm": 0.29251593351364136, + "learning_rate": 7.440796522672448e-05, + "loss": 1.8051, + "step": 11650 + }, + { + "epoch": 3.576120319214242, + "grad_norm": 0.3764750063419342, + "learning_rate": 7.440362702742305e-05, + "loss": 1.9002, + "step": 11651 + }, + { + "epoch": 3.576427255985267, + "grad_norm": 0.3751949071884155, + "learning_rate": 7.439928858695069e-05, + "loss": 1.821, + "step": 11652 + }, + { + "epoch": 3.5767341927562923, + "grad_norm": 0.268476665019989, + "learning_rate": 7.439494990535024e-05, + "loss": 1.8241, + "step": 11653 + }, + { + "epoch": 3.5770411295273172, + "grad_norm": 0.3072795271873474, + "learning_rate": 7.439061098266459e-05, + "loss": 1.8169, + "step": 11654 + }, + { + "epoch": 3.5773480662983426, + "grad_norm": 0.4948901832103729, + "learning_rate": 7.438627181893664e-05, + "loss": 1.7706, + "step": 11655 + }, + { + "epoch": 3.577655003069368, + "grad_norm": 0.5892601013183594, + "learning_rate": 7.438193241420926e-05, + "loss": 1.7631, + "step": 11656 + }, + { + "epoch": 3.577961939840393, + "grad_norm": 0.4599401652812958, + "learning_rate": 7.437759276852533e-05, + "loss": 1.7471, + "step": 11657 + }, + { + "epoch": 3.578268876611418, + "grad_norm": 0.2545170783996582, + "learning_rate": 7.437325288192773e-05, + "loss": 1.7945, + "step": 11658 + }, + { + "epoch": 3.578575813382443, + "grad_norm": 0.3136496841907501, + "learning_rate": 7.436891275445938e-05, + "loss": 1.828, + "step": 11659 + }, + { + "epoch": 3.5788827501534684, + "grad_norm": 0.3631688058376312, + "learning_rate": 7.436457238616313e-05, + "loss": 1.8302, + "step": 11660 + }, + { + "epoch": 3.5791896869244937, + "grad_norm": 0.3097386658191681, + 
"learning_rate": 7.436023177708192e-05, + "loss": 1.8397, + "step": 11661 + }, + { + "epoch": 3.5794966236955186, + "grad_norm": 0.20948798954486847, + "learning_rate": 7.43558909272586e-05, + "loss": 1.7844, + "step": 11662 + }, + { + "epoch": 3.579803560466544, + "grad_norm": 0.24327392876148224, + "learning_rate": 7.43515498367361e-05, + "loss": 1.7827, + "step": 11663 + }, + { + "epoch": 3.580110497237569, + "grad_norm": 0.25268325209617615, + "learning_rate": 7.434720850555731e-05, + "loss": 1.8224, + "step": 11664 + }, + { + "epoch": 3.580417434008594, + "grad_norm": 0.24883607029914856, + "learning_rate": 7.434286693376513e-05, + "loss": 1.8189, + "step": 11665 + }, + { + "epoch": 3.5807243707796195, + "grad_norm": 0.2942518889904022, + "learning_rate": 7.433852512140248e-05, + "loss": 1.8325, + "step": 11666 + }, + { + "epoch": 3.581031307550645, + "grad_norm": 0.3556186556816101, + "learning_rate": 7.433418306851225e-05, + "loss": 1.7511, + "step": 11667 + }, + { + "epoch": 3.5813382443216697, + "grad_norm": 0.421220600605011, + "learning_rate": 7.432984077513738e-05, + "loss": 1.8081, + "step": 11668 + }, + { + "epoch": 3.581645181092695, + "grad_norm": 0.3338243067264557, + "learning_rate": 7.432549824132074e-05, + "loss": 1.8274, + "step": 11669 + }, + { + "epoch": 3.58195211786372, + "grad_norm": 0.25091543793678284, + "learning_rate": 7.432115546710528e-05, + "loss": 1.7637, + "step": 11670 + }, + { + "epoch": 3.5822590546347453, + "grad_norm": 0.29870370030403137, + "learning_rate": 7.431681245253389e-05, + "loss": 1.8036, + "step": 11671 + }, + { + "epoch": 3.5825659914057706, + "grad_norm": 0.2682137191295624, + "learning_rate": 7.431246919764953e-05, + "loss": 1.8252, + "step": 11672 + }, + { + "epoch": 3.5828729281767955, + "grad_norm": 0.28790801763534546, + "learning_rate": 7.430812570249508e-05, + "loss": 1.7713, + "step": 11673 + }, + { + "epoch": 3.583179864947821, + "grad_norm": 0.26357609033584595, + "learning_rate": 7.43037819671135e-05, 
+ "loss": 1.8388, + "step": 11674 + }, + { + "epoch": 3.5834868017188457, + "grad_norm": 0.2505483031272888, + "learning_rate": 7.42994379915477e-05, + "loss": 1.7722, + "step": 11675 + }, + { + "epoch": 3.583793738489871, + "grad_norm": 0.2535844147205353, + "learning_rate": 7.42950937758406e-05, + "loss": 1.756, + "step": 11676 + }, + { + "epoch": 3.5841006752608964, + "grad_norm": 0.23045027256011963, + "learning_rate": 7.429074932003515e-05, + "loss": 1.791, + "step": 11677 + }, + { + "epoch": 3.5844076120319213, + "grad_norm": 0.22525762021541595, + "learning_rate": 7.428640462417428e-05, + "loss": 1.7234, + "step": 11678 + }, + { + "epoch": 3.5847145488029466, + "grad_norm": 0.2402270883321762, + "learning_rate": 7.428205968830094e-05, + "loss": 1.845, + "step": 11679 + }, + { + "epoch": 3.5850214855739715, + "grad_norm": 0.24909646809101105, + "learning_rate": 7.427771451245802e-05, + "loss": 1.8537, + "step": 11680 + }, + { + "epoch": 3.585328422344997, + "grad_norm": 0.25813063979148865, + "learning_rate": 7.427336909668853e-05, + "loss": 1.7353, + "step": 11681 + }, + { + "epoch": 3.585635359116022, + "grad_norm": 0.26073768734931946, + "learning_rate": 7.426902344103534e-05, + "loss": 1.8142, + "step": 11682 + }, + { + "epoch": 3.5859422958870475, + "grad_norm": 0.2498280256986618, + "learning_rate": 7.426467754554147e-05, + "loss": 1.7996, + "step": 11683 + }, + { + "epoch": 3.5862492326580724, + "grad_norm": 0.3131188154220581, + "learning_rate": 7.426033141024981e-05, + "loss": 1.7793, + "step": 11684 + }, + { + "epoch": 3.5865561694290977, + "grad_norm": 0.24118199944496155, + "learning_rate": 7.425598503520337e-05, + "loss": 1.8249, + "step": 11685 + }, + { + "epoch": 3.5868631062001226, + "grad_norm": 0.2791197597980499, + "learning_rate": 7.425163842044504e-05, + "loss": 1.7966, + "step": 11686 + }, + { + "epoch": 3.587170042971148, + "grad_norm": 0.2298576384782791, + "learning_rate": 7.424729156601781e-05, + "loss": 1.7224, + "step": 11687 + }, 
+ { + "epoch": 3.5874769797421733, + "grad_norm": 0.23113438487052917, + "learning_rate": 7.424294447196462e-05, + "loss": 1.7641, + "step": 11688 + }, + { + "epoch": 3.587783916513198, + "grad_norm": 0.3064495027065277, + "learning_rate": 7.423859713832847e-05, + "loss": 1.8688, + "step": 11689 + }, + { + "epoch": 3.5880908532842235, + "grad_norm": 0.22847676277160645, + "learning_rate": 7.423424956515228e-05, + "loss": 1.7513, + "step": 11690 + }, + { + "epoch": 3.5883977900552484, + "grad_norm": 0.2797350585460663, + "learning_rate": 7.422990175247905e-05, + "loss": 1.8268, + "step": 11691 + }, + { + "epoch": 3.5887047268262737, + "grad_norm": 0.2753821313381195, + "learning_rate": 7.422555370035171e-05, + "loss": 1.7313, + "step": 11692 + }, + { + "epoch": 3.589011663597299, + "grad_norm": 0.2981179654598236, + "learning_rate": 7.422120540881326e-05, + "loss": 1.8455, + "step": 11693 + }, + { + "epoch": 3.5893186003683244, + "grad_norm": 0.33028867840766907, + "learning_rate": 7.421685687790667e-05, + "loss": 1.8397, + "step": 11694 + }, + { + "epoch": 3.5896255371393493, + "grad_norm": 0.409173846244812, + "learning_rate": 7.421250810767487e-05, + "loss": 1.8088, + "step": 11695 + }, + { + "epoch": 3.5899324739103746, + "grad_norm": 0.4118194878101349, + "learning_rate": 7.42081590981609e-05, + "loss": 1.7719, + "step": 11696 + }, + { + "epoch": 3.5902394106813995, + "grad_norm": 0.34716179966926575, + "learning_rate": 7.420380984940773e-05, + "loss": 1.8063, + "step": 11697 + }, + { + "epoch": 3.590546347452425, + "grad_norm": 0.27763083577156067, + "learning_rate": 7.419946036145829e-05, + "loss": 1.7777, + "step": 11698 + }, + { + "epoch": 3.59085328422345, + "grad_norm": 0.3175280690193176, + "learning_rate": 7.419511063435562e-05, + "loss": 1.697, + "step": 11699 + }, + { + "epoch": 3.591160220994475, + "grad_norm": 0.3151503801345825, + "learning_rate": 7.419076066814268e-05, + "loss": 1.8067, + "step": 11700 + }, + { + "epoch": 3.5914671577655004, + 
"grad_norm": 0.26914867758750916, + "learning_rate": 7.418641046286245e-05, + "loss": 1.7797, + "step": 11701 + }, + { + "epoch": 3.5917740945365253, + "grad_norm": 0.27231964468955994, + "learning_rate": 7.418206001855797e-05, + "loss": 1.7931, + "step": 11702 + }, + { + "epoch": 3.5920810313075506, + "grad_norm": 0.3352177143096924, + "learning_rate": 7.417770933527217e-05, + "loss": 1.9187, + "step": 11703 + }, + { + "epoch": 3.592387968078576, + "grad_norm": 0.3510081470012665, + "learning_rate": 7.417335841304808e-05, + "loss": 1.7889, + "step": 11704 + }, + { + "epoch": 3.592694904849601, + "grad_norm": 0.24949313700199127, + "learning_rate": 7.41690072519287e-05, + "loss": 1.7683, + "step": 11705 + }, + { + "epoch": 3.593001841620626, + "grad_norm": 0.28442221879959106, + "learning_rate": 7.416465585195702e-05, + "loss": 1.7889, + "step": 11706 + }, + { + "epoch": 3.593308778391651, + "grad_norm": 0.3355824649333954, + "learning_rate": 7.416030421317605e-05, + "loss": 1.7637, + "step": 11707 + }, + { + "epoch": 3.5936157151626764, + "grad_norm": 0.33569446206092834, + "learning_rate": 7.415595233562878e-05, + "loss": 1.919, + "step": 11708 + }, + { + "epoch": 3.5939226519337018, + "grad_norm": 0.2488354742527008, + "learning_rate": 7.415160021935825e-05, + "loss": 1.8424, + "step": 11709 + }, + { + "epoch": 3.594229588704727, + "grad_norm": 0.2701130509376526, + "learning_rate": 7.414724786440746e-05, + "loss": 1.7586, + "step": 11710 + }, + { + "epoch": 3.594536525475752, + "grad_norm": 0.26289790868759155, + "learning_rate": 7.414289527081939e-05, + "loss": 1.7975, + "step": 11711 + }, + { + "epoch": 3.5948434622467773, + "grad_norm": 0.25382301211357117, + "learning_rate": 7.413854243863707e-05, + "loss": 1.7393, + "step": 11712 + }, + { + "epoch": 3.595150399017802, + "grad_norm": 0.28282979130744934, + "learning_rate": 7.413418936790357e-05, + "loss": 1.8048, + "step": 11713 + }, + { + "epoch": 3.5954573357888275, + "grad_norm": 0.28001347184181213, + 
"learning_rate": 7.412983605866183e-05, + "loss": 1.7864, + "step": 11714 + }, + { + "epoch": 3.595764272559853, + "grad_norm": 0.26107707619667053, + "learning_rate": 7.412548251095491e-05, + "loss": 1.8016, + "step": 11715 + }, + { + "epoch": 3.5960712093308778, + "grad_norm": 0.2518761456012726, + "learning_rate": 7.412112872482583e-05, + "loss": 1.7565, + "step": 11716 + }, + { + "epoch": 3.596378146101903, + "grad_norm": 0.25911152362823486, + "learning_rate": 7.411677470031762e-05, + "loss": 1.8333, + "step": 11717 + }, + { + "epoch": 3.596685082872928, + "grad_norm": 0.3411506414413452, + "learning_rate": 7.41124204374733e-05, + "loss": 1.8027, + "step": 11718 + }, + { + "epoch": 3.5969920196439533, + "grad_norm": 0.28535547852516174, + "learning_rate": 7.410806593633593e-05, + "loss": 1.7596, + "step": 11719 + }, + { + "epoch": 3.5972989564149787, + "grad_norm": 0.24665530025959015, + "learning_rate": 7.410371119694852e-05, + "loss": 1.7777, + "step": 11720 + }, + { + "epoch": 3.5976058931860035, + "grad_norm": 0.29162275791168213, + "learning_rate": 7.40993562193541e-05, + "loss": 1.795, + "step": 11721 + }, + { + "epoch": 3.597912829957029, + "grad_norm": 0.2712220549583435, + "learning_rate": 7.409500100359573e-05, + "loss": 1.824, + "step": 11722 + }, + { + "epoch": 3.5982197667280538, + "grad_norm": 0.239755779504776, + "learning_rate": 7.40906455497164e-05, + "loss": 1.7534, + "step": 11723 + }, + { + "epoch": 3.598526703499079, + "grad_norm": 0.26056957244873047, + "learning_rate": 7.408628985775922e-05, + "loss": 1.757, + "step": 11724 + }, + { + "epoch": 3.5988336402701044, + "grad_norm": 0.3230258822441101, + "learning_rate": 7.40819339277672e-05, + "loss": 1.8684, + "step": 11725 + }, + { + "epoch": 3.5991405770411298, + "grad_norm": 0.26070696115493774, + "learning_rate": 7.407757775978339e-05, + "loss": 1.7868, + "step": 11726 + }, + { + "epoch": 3.5994475138121547, + "grad_norm": 0.24940893054008484, + "learning_rate": 7.407322135385085e-05, + 
"loss": 1.8391, + "step": 11727 + }, + { + "epoch": 3.59975445058318, + "grad_norm": 0.2717723250389099, + "learning_rate": 7.406886471001263e-05, + "loss": 1.7567, + "step": 11728 + }, + { + "epoch": 3.600061387354205, + "grad_norm": 0.2328445315361023, + "learning_rate": 7.406450782831177e-05, + "loss": 1.7761, + "step": 11729 + }, + { + "epoch": 3.6003683241252302, + "grad_norm": 0.2740287184715271, + "learning_rate": 7.406015070879136e-05, + "loss": 1.8599, + "step": 11730 + }, + { + "epoch": 3.6006752608962556, + "grad_norm": 0.2930558919906616, + "learning_rate": 7.405579335149441e-05, + "loss": 1.852, + "step": 11731 + }, + { + "epoch": 3.6009821976672804, + "grad_norm": 0.30175161361694336, + "learning_rate": 7.405143575646403e-05, + "loss": 1.8861, + "step": 11732 + }, + { + "epoch": 3.601289134438306, + "grad_norm": 0.2617531418800354, + "learning_rate": 7.404707792374328e-05, + "loss": 1.7598, + "step": 11733 + }, + { + "epoch": 3.6015960712093307, + "grad_norm": 0.25384122133255005, + "learning_rate": 7.404271985337517e-05, + "loss": 1.7634, + "step": 11734 + }, + { + "epoch": 3.601903007980356, + "grad_norm": 0.31706711649894714, + "learning_rate": 7.403836154540284e-05, + "loss": 1.8125, + "step": 11735 + }, + { + "epoch": 3.6022099447513813, + "grad_norm": 0.299662709236145, + "learning_rate": 7.403400299986932e-05, + "loss": 1.748, + "step": 11736 + }, + { + "epoch": 3.6025168815224062, + "grad_norm": 0.23828944563865662, + "learning_rate": 7.40296442168177e-05, + "loss": 1.7473, + "step": 11737 + }, + { + "epoch": 3.6028238182934316, + "grad_norm": 0.22611604630947113, + "learning_rate": 7.402528519629106e-05, + "loss": 1.7519, + "step": 11738 + }, + { + "epoch": 3.6031307550644565, + "grad_norm": 0.28498536348342896, + "learning_rate": 7.402092593833246e-05, + "loss": 1.7792, + "step": 11739 + }, + { + "epoch": 3.603437691835482, + "grad_norm": 0.2404283881187439, + "learning_rate": 7.4016566442985e-05, + "loss": 1.7434, + "step": 11740 + }, + { + 
"epoch": 3.603744628606507, + "grad_norm": 0.2291589230298996, + "learning_rate": 7.401220671029173e-05, + "loss": 1.7623, + "step": 11741 + }, + { + "epoch": 3.6040515653775325, + "grad_norm": 0.23962698876857758, + "learning_rate": 7.400784674029578e-05, + "loss": 1.7232, + "step": 11742 + }, + { + "epoch": 3.6043585021485574, + "grad_norm": 0.3015185594558716, + "learning_rate": 7.400348653304022e-05, + "loss": 1.7808, + "step": 11743 + }, + { + "epoch": 3.6046654389195827, + "grad_norm": 0.30623099207878113, + "learning_rate": 7.399912608856813e-05, + "loss": 1.8518, + "step": 11744 + }, + { + "epoch": 3.6049723756906076, + "grad_norm": 0.2698235511779785, + "learning_rate": 7.39947654069226e-05, + "loss": 1.7829, + "step": 11745 + }, + { + "epoch": 3.605279312461633, + "grad_norm": 0.2195274829864502, + "learning_rate": 7.399040448814674e-05, + "loss": 1.7709, + "step": 11746 + }, + { + "epoch": 3.6055862492326582, + "grad_norm": 0.22962357103824615, + "learning_rate": 7.398604333228366e-05, + "loss": 1.7482, + "step": 11747 + }, + { + "epoch": 3.605893186003683, + "grad_norm": 0.2403932511806488, + "learning_rate": 7.398168193937642e-05, + "loss": 1.8063, + "step": 11748 + }, + { + "epoch": 3.6062001227747085, + "grad_norm": 0.23542718589305878, + "learning_rate": 7.397732030946816e-05, + "loss": 1.7599, + "step": 11749 + }, + { + "epoch": 3.6065070595457334, + "grad_norm": 0.2462490350008011, + "learning_rate": 7.397295844260195e-05, + "loss": 1.8183, + "step": 11750 + }, + { + "epoch": 3.6068139963167587, + "grad_norm": 0.21428349614143372, + "learning_rate": 7.396859633882091e-05, + "loss": 1.6944, + "step": 11751 + }, + { + "epoch": 3.607120933087784, + "grad_norm": 0.21240907907485962, + "learning_rate": 7.396423399816817e-05, + "loss": 1.7795, + "step": 11752 + }, + { + "epoch": 3.607427869858809, + "grad_norm": 0.23413677513599396, + "learning_rate": 7.395987142068682e-05, + "loss": 1.8015, + "step": 11753 + }, + { + "epoch": 3.6077348066298343, + 
"grad_norm": 0.26724907755851746, + "learning_rate": 7.395550860641998e-05, + "loss": 1.8174, + "step": 11754 + }, + { + "epoch": 3.608041743400859, + "grad_norm": 0.22077679634094238, + "learning_rate": 7.395114555541077e-05, + "loss": 1.7929, + "step": 11755 + }, + { + "epoch": 3.6083486801718845, + "grad_norm": 0.2475263774394989, + "learning_rate": 7.394678226770228e-05, + "loss": 1.7744, + "step": 11756 + }, + { + "epoch": 3.60865561694291, + "grad_norm": 0.22579342126846313, + "learning_rate": 7.394241874333764e-05, + "loss": 1.79, + "step": 11757 + }, + { + "epoch": 3.608962553713935, + "grad_norm": 0.26798152923583984, + "learning_rate": 7.393805498236001e-05, + "loss": 1.8087, + "step": 11758 + }, + { + "epoch": 3.60926949048496, + "grad_norm": 0.2755621373653412, + "learning_rate": 7.393369098481248e-05, + "loss": 1.7834, + "step": 11759 + }, + { + "epoch": 3.6095764272559854, + "grad_norm": 0.2741812467575073, + "learning_rate": 7.39293267507382e-05, + "loss": 1.7948, + "step": 11760 + }, + { + "epoch": 3.6098833640270103, + "grad_norm": 0.2378924936056137, + "learning_rate": 7.392496228018028e-05, + "loss": 1.8317, + "step": 11761 + }, + { + "epoch": 3.6101903007980356, + "grad_norm": 0.2628132700920105, + "learning_rate": 7.392059757318187e-05, + "loss": 1.8123, + "step": 11762 + }, + { + "epoch": 3.610497237569061, + "grad_norm": 0.2613002359867096, + "learning_rate": 7.391623262978607e-05, + "loss": 1.795, + "step": 11763 + }, + { + "epoch": 3.610804174340086, + "grad_norm": 0.27272161841392517, + "learning_rate": 7.391186745003608e-05, + "loss": 1.7808, + "step": 11764 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.21366162598133087, + "learning_rate": 7.390750203397497e-05, + "loss": 1.77, + "step": 11765 + }, + { + "epoch": 3.611418047882136, + "grad_norm": 0.25559261441230774, + "learning_rate": 7.390313638164593e-05, + "loss": 1.8442, + "step": 11766 + }, + { + "epoch": 3.6117249846531614, + "grad_norm": 0.23794838786125183, + 
"learning_rate": 7.389877049309207e-05, + "loss": 1.8237, + "step": 11767 + }, + { + "epoch": 3.6120319214241867, + "grad_norm": 0.2690154016017914, + "learning_rate": 7.389440436835656e-05, + "loss": 1.8194, + "step": 11768 + }, + { + "epoch": 3.612338858195212, + "grad_norm": 0.26148009300231934, + "learning_rate": 7.389003800748254e-05, + "loss": 1.7862, + "step": 11769 + }, + { + "epoch": 3.612645794966237, + "grad_norm": 0.26414936780929565, + "learning_rate": 7.388567141051315e-05, + "loss": 1.7815, + "step": 11770 + }, + { + "epoch": 3.6129527317372623, + "grad_norm": 0.24473857879638672, + "learning_rate": 7.388130457749157e-05, + "loss": 1.801, + "step": 11771 + }, + { + "epoch": 3.613259668508287, + "grad_norm": 0.24356001615524292, + "learning_rate": 7.387693750846094e-05, + "loss": 1.8031, + "step": 11772 + }, + { + "epoch": 3.6135666052793125, + "grad_norm": 0.26716411113739014, + "learning_rate": 7.387257020346441e-05, + "loss": 1.7999, + "step": 11773 + }, + { + "epoch": 3.613873542050338, + "grad_norm": 0.2730760872364044, + "learning_rate": 7.386820266254516e-05, + "loss": 1.8079, + "step": 11774 + }, + { + "epoch": 3.6141804788213627, + "grad_norm": 0.2570728361606598, + "learning_rate": 7.386383488574635e-05, + "loss": 1.7374, + "step": 11775 + }, + { + "epoch": 3.614487415592388, + "grad_norm": 0.24992883205413818, + "learning_rate": 7.385946687311112e-05, + "loss": 1.8432, + "step": 11776 + }, + { + "epoch": 3.614794352363413, + "grad_norm": 0.28632259368896484, + "learning_rate": 7.385509862468266e-05, + "loss": 1.8014, + "step": 11777 + }, + { + "epoch": 3.6151012891344383, + "grad_norm": 0.257303923368454, + "learning_rate": 7.385073014050412e-05, + "loss": 1.8166, + "step": 11778 + }, + { + "epoch": 3.6154082259054636, + "grad_norm": 0.2791872024536133, + "learning_rate": 7.38463614206187e-05, + "loss": 1.7865, + "step": 11779 + }, + { + "epoch": 3.6157151626764885, + "grad_norm": 0.25708603858947754, + "learning_rate": 
7.384199246506956e-05, + "loss": 1.807, + "step": 11780 + }, + { + "epoch": 3.616022099447514, + "grad_norm": 0.28693172335624695, + "learning_rate": 7.383762327389988e-05, + "loss": 1.8049, + "step": 11781 + }, + { + "epoch": 3.6163290362185387, + "grad_norm": 0.2731167674064636, + "learning_rate": 7.383325384715283e-05, + "loss": 1.8937, + "step": 11782 + }, + { + "epoch": 3.616635972989564, + "grad_norm": 0.26151663064956665, + "learning_rate": 7.38288841848716e-05, + "loss": 1.8288, + "step": 11783 + }, + { + "epoch": 3.6169429097605894, + "grad_norm": 0.2732257843017578, + "learning_rate": 7.382451428709936e-05, + "loss": 1.7668, + "step": 11784 + }, + { + "epoch": 3.6172498465316147, + "grad_norm": 0.2747575640678406, + "learning_rate": 7.38201441538793e-05, + "loss": 1.7991, + "step": 11785 + }, + { + "epoch": 3.6175567833026396, + "grad_norm": 0.2884783446788788, + "learning_rate": 7.381577378525462e-05, + "loss": 1.7798, + "step": 11786 + }, + { + "epoch": 3.617863720073665, + "grad_norm": 0.2716344892978668, + "learning_rate": 7.381140318126851e-05, + "loss": 1.7923, + "step": 11787 + }, + { + "epoch": 3.61817065684469, + "grad_norm": 0.3007747232913971, + "learning_rate": 7.380703234196416e-05, + "loss": 1.8397, + "step": 11788 + }, + { + "epoch": 3.618477593615715, + "grad_norm": 0.39218056201934814, + "learning_rate": 7.380266126738476e-05, + "loss": 1.8517, + "step": 11789 + }, + { + "epoch": 3.6187845303867405, + "grad_norm": 0.43425866961479187, + "learning_rate": 7.379828995757351e-05, + "loss": 1.7518, + "step": 11790 + }, + { + "epoch": 3.6190914671577654, + "grad_norm": 0.34399518370628357, + "learning_rate": 7.37939184125736e-05, + "loss": 1.7607, + "step": 11791 + }, + { + "epoch": 3.6193984039287908, + "grad_norm": 0.23124302923679352, + "learning_rate": 7.378954663242825e-05, + "loss": 1.7898, + "step": 11792 + }, + { + "epoch": 3.6197053406998156, + "grad_norm": 0.32839757204055786, + "learning_rate": 7.378517461718066e-05, + "loss": 
1.7472, + "step": 11793 + }, + { + "epoch": 3.620012277470841, + "grad_norm": 0.38583460450172424, + "learning_rate": 7.378080236687403e-05, + "loss": 1.7947, + "step": 11794 + }, + { + "epoch": 3.6203192142418663, + "grad_norm": 0.4622896909713745, + "learning_rate": 7.377642988155157e-05, + "loss": 1.9023, + "step": 11795 + }, + { + "epoch": 3.620626151012891, + "grad_norm": 0.3783189058303833, + "learning_rate": 7.37720571612565e-05, + "loss": 1.7813, + "step": 11796 + }, + { + "epoch": 3.6209330877839165, + "grad_norm": 0.3468814790248871, + "learning_rate": 7.376768420603204e-05, + "loss": 1.7509, + "step": 11797 + }, + { + "epoch": 3.6212400245549414, + "grad_norm": 0.2602507174015045, + "learning_rate": 7.376331101592138e-05, + "loss": 1.8158, + "step": 11798 + }, + { + "epoch": 3.6215469613259668, + "grad_norm": 0.28337883949279785, + "learning_rate": 7.375893759096775e-05, + "loss": 1.7755, + "step": 11799 + }, + { + "epoch": 3.621853898096992, + "grad_norm": 0.3644609749317169, + "learning_rate": 7.375456393121437e-05, + "loss": 1.8193, + "step": 11800 + }, + { + "epoch": 3.6221608348680174, + "grad_norm": 0.338211327791214, + "learning_rate": 7.375019003670448e-05, + "loss": 1.821, + "step": 11801 + }, + { + "epoch": 3.6224677716390423, + "grad_norm": 0.23850654065608978, + "learning_rate": 7.374581590748129e-05, + "loss": 1.7317, + "step": 11802 + }, + { + "epoch": 3.6227747084100677, + "grad_norm": 0.3496716618537903, + "learning_rate": 7.374144154358801e-05, + "loss": 1.8361, + "step": 11803 + }, + { + "epoch": 3.6230816451810925, + "grad_norm": 0.5585216283798218, + "learning_rate": 7.37370669450679e-05, + "loss": 1.7667, + "step": 11804 + }, + { + "epoch": 3.623388581952118, + "grad_norm": 0.4578089714050293, + "learning_rate": 7.373269211196418e-05, + "loss": 1.8051, + "step": 11805 + }, + { + "epoch": 3.623695518723143, + "grad_norm": 0.28195759654045105, + "learning_rate": 7.37283170443201e-05, + "loss": 1.7823, + "step": 11806 + }, + { + 
"epoch": 3.624002455494168, + "grad_norm": 0.4066108465194702, + "learning_rate": 7.372394174217887e-05, + "loss": 1.7819, + "step": 11807 + }, + { + "epoch": 3.6243093922651934, + "grad_norm": 0.5368703007698059, + "learning_rate": 7.371956620558375e-05, + "loss": 1.8121, + "step": 11808 + }, + { + "epoch": 3.6246163290362183, + "grad_norm": 0.36627063155174255, + "learning_rate": 7.371519043457795e-05, + "loss": 1.7944, + "step": 11809 + }, + { + "epoch": 3.6249232658072437, + "grad_norm": 0.3100780248641968, + "learning_rate": 7.371081442920476e-05, + "loss": 1.783, + "step": 11810 + }, + { + "epoch": 3.625230202578269, + "grad_norm": 0.3277178704738617, + "learning_rate": 7.370643818950741e-05, + "loss": 1.8105, + "step": 11811 + }, + { + "epoch": 3.625537139349294, + "grad_norm": 0.3887772560119629, + "learning_rate": 7.370206171552914e-05, + "loss": 1.8136, + "step": 11812 + }, + { + "epoch": 3.6258440761203192, + "grad_norm": 0.2770824134349823, + "learning_rate": 7.36976850073132e-05, + "loss": 1.7852, + "step": 11813 + }, + { + "epoch": 3.626151012891344, + "grad_norm": 0.26357728242874146, + "learning_rate": 7.369330806490284e-05, + "loss": 1.7621, + "step": 11814 + }, + { + "epoch": 3.6264579496623695, + "grad_norm": 0.3387344181537628, + "learning_rate": 7.368893088834135e-05, + "loss": 1.7785, + "step": 11815 + }, + { + "epoch": 3.626764886433395, + "grad_norm": 0.35155174136161804, + "learning_rate": 7.368455347767193e-05, + "loss": 1.8081, + "step": 11816 + }, + { + "epoch": 3.62707182320442, + "grad_norm": 0.2855289876461029, + "learning_rate": 7.368017583293788e-05, + "loss": 1.8245, + "step": 11817 + }, + { + "epoch": 3.627378759975445, + "grad_norm": 0.28462162613868713, + "learning_rate": 7.367579795418245e-05, + "loss": 1.8066, + "step": 11818 + }, + { + "epoch": 3.6276856967464703, + "grad_norm": 0.40696555376052856, + "learning_rate": 7.367141984144891e-05, + "loss": 1.8897, + "step": 11819 + }, + { + "epoch": 3.6279926335174952, + 
"grad_norm": 0.472782701253891, + "learning_rate": 7.366704149478054e-05, + "loss": 1.8071, + "step": 11820 + }, + { + "epoch": 3.6282995702885206, + "grad_norm": 0.27022916078567505, + "learning_rate": 7.366266291422057e-05, + "loss": 1.8574, + "step": 11821 + }, + { + "epoch": 3.628606507059546, + "grad_norm": 0.4207148253917694, + "learning_rate": 7.365828409981231e-05, + "loss": 1.7759, + "step": 11822 + }, + { + "epoch": 3.628913443830571, + "grad_norm": 0.42866072058677673, + "learning_rate": 7.365390505159902e-05, + "loss": 1.7366, + "step": 11823 + }, + { + "epoch": 3.629220380601596, + "grad_norm": 0.28288859128952026, + "learning_rate": 7.364952576962398e-05, + "loss": 1.8591, + "step": 11824 + }, + { + "epoch": 3.629527317372621, + "grad_norm": 0.30544906854629517, + "learning_rate": 7.364514625393045e-05, + "loss": 1.7965, + "step": 11825 + }, + { + "epoch": 3.6298342541436464, + "grad_norm": 0.3251616954803467, + "learning_rate": 7.364076650456173e-05, + "loss": 1.8197, + "step": 11826 + }, + { + "epoch": 3.6301411909146717, + "grad_norm": 0.3133888840675354, + "learning_rate": 7.363638652156109e-05, + "loss": 1.7978, + "step": 11827 + }, + { + "epoch": 3.630448127685697, + "grad_norm": 0.29004594683647156, + "learning_rate": 7.363200630497185e-05, + "loss": 1.8035, + "step": 11828 + }, + { + "epoch": 3.630755064456722, + "grad_norm": 0.2781279683113098, + "learning_rate": 7.362762585483725e-05, + "loss": 1.8462, + "step": 11829 + }, + { + "epoch": 3.6310620012277472, + "grad_norm": 0.29003822803497314, + "learning_rate": 7.362324517120063e-05, + "loss": 1.7952, + "step": 11830 + }, + { + "epoch": 3.631368937998772, + "grad_norm": 0.2510940134525299, + "learning_rate": 7.361886425410524e-05, + "loss": 1.7645, + "step": 11831 + }, + { + "epoch": 3.6316758747697975, + "grad_norm": 0.23798540234565735, + "learning_rate": 7.361448310359438e-05, + "loss": 1.7329, + "step": 11832 + }, + { + "epoch": 3.631982811540823, + "grad_norm": 0.2711278796195984, + 
"learning_rate": 7.361010171971137e-05, + "loss": 1.8245, + "step": 11833 + }, + { + "epoch": 3.6322897483118477, + "grad_norm": 0.2895669639110565, + "learning_rate": 7.360572010249949e-05, + "loss": 1.7668, + "step": 11834 + }, + { + "epoch": 3.632596685082873, + "grad_norm": 0.2216273844242096, + "learning_rate": 7.360133825200205e-05, + "loss": 1.8164, + "step": 11835 + }, + { + "epoch": 3.632903621853898, + "grad_norm": 0.3075082302093506, + "learning_rate": 7.359695616826236e-05, + "loss": 1.8159, + "step": 11836 + }, + { + "epoch": 3.6332105586249233, + "grad_norm": 0.3208801746368408, + "learning_rate": 7.35925738513237e-05, + "loss": 1.8385, + "step": 11837 + }, + { + "epoch": 3.6335174953959486, + "grad_norm": 0.272517591714859, + "learning_rate": 7.35881913012294e-05, + "loss": 1.7653, + "step": 11838 + }, + { + "epoch": 3.6338244321669735, + "grad_norm": 0.23105360567569733, + "learning_rate": 7.358380851802277e-05, + "loss": 1.7697, + "step": 11839 + }, + { + "epoch": 3.634131368937999, + "grad_norm": 0.2643153667449951, + "learning_rate": 7.357942550174714e-05, + "loss": 1.7885, + "step": 11840 + }, + { + "epoch": 3.6344383057090237, + "grad_norm": 0.22643202543258667, + "learning_rate": 7.357504225244579e-05, + "loss": 1.746, + "step": 11841 + }, + { + "epoch": 3.634745242480049, + "grad_norm": 0.27782970666885376, + "learning_rate": 7.357065877016207e-05, + "loss": 1.794, + "step": 11842 + }, + { + "epoch": 3.6350521792510744, + "grad_norm": 0.3035561740398407, + "learning_rate": 7.356627505493925e-05, + "loss": 1.7892, + "step": 11843 + }, + { + "epoch": 3.6353591160220997, + "grad_norm": 0.31859731674194336, + "learning_rate": 7.356189110682072e-05, + "loss": 1.7636, + "step": 11844 + }, + { + "epoch": 3.6356660527931246, + "grad_norm": 0.2960890233516693, + "learning_rate": 7.355750692584977e-05, + "loss": 1.8294, + "step": 11845 + }, + { + "epoch": 3.63597298956415, + "grad_norm": 0.2544194459915161, + "learning_rate": 7.355312251206972e-05, + 
"loss": 1.7603, + "step": 11846 + }, + { + "epoch": 3.636279926335175, + "grad_norm": 0.27864789962768555, + "learning_rate": 7.354873786552391e-05, + "loss": 1.7917, + "step": 11847 + }, + { + "epoch": 3.6365868631062, + "grad_norm": 0.32552552223205566, + "learning_rate": 7.354435298625568e-05, + "loss": 1.7769, + "step": 11848 + }, + { + "epoch": 3.6368937998772255, + "grad_norm": 0.25094640254974365, + "learning_rate": 7.353996787430833e-05, + "loss": 1.8371, + "step": 11849 + }, + { + "epoch": 3.6372007366482504, + "grad_norm": 0.26656433939933777, + "learning_rate": 7.353558252972524e-05, + "loss": 1.7686, + "step": 11850 + }, + { + "epoch": 3.6375076734192757, + "grad_norm": 0.3023635745048523, + "learning_rate": 7.353119695254973e-05, + "loss": 1.7892, + "step": 11851 + }, + { + "epoch": 3.6378146101903006, + "grad_norm": 0.2822463810443878, + "learning_rate": 7.352681114282514e-05, + "loss": 1.8221, + "step": 11852 + }, + { + "epoch": 3.638121546961326, + "grad_norm": 0.31159496307373047, + "learning_rate": 7.35224251005948e-05, + "loss": 1.803, + "step": 11853 + }, + { + "epoch": 3.6384284837323513, + "grad_norm": 0.3133087158203125, + "learning_rate": 7.351803882590207e-05, + "loss": 1.744, + "step": 11854 + }, + { + "epoch": 3.638735420503376, + "grad_norm": 0.3050002455711365, + "learning_rate": 7.351365231879029e-05, + "loss": 1.7522, + "step": 11855 + }, + { + "epoch": 3.6390423572744015, + "grad_norm": 0.2729037404060364, + "learning_rate": 7.350926557930283e-05, + "loss": 1.7629, + "step": 11856 + }, + { + "epoch": 3.6393492940454264, + "grad_norm": 0.3181995153427124, + "learning_rate": 7.350487860748303e-05, + "loss": 1.7603, + "step": 11857 + }, + { + "epoch": 3.6396562308164517, + "grad_norm": 0.352651447057724, + "learning_rate": 7.350049140337423e-05, + "loss": 1.8177, + "step": 11858 + }, + { + "epoch": 3.639963167587477, + "grad_norm": 0.22935177385807037, + "learning_rate": 7.349610396701981e-05, + "loss": 1.7421, + "step": 11859 + }, + { 
+ "epoch": 3.6402701043585024, + "grad_norm": 0.26442599296569824, + "learning_rate": 7.349171629846312e-05, + "loss": 1.8026, + "step": 11860 + }, + { + "epoch": 3.6405770411295273, + "grad_norm": 0.25357648730278015, + "learning_rate": 7.348732839774751e-05, + "loss": 1.788, + "step": 11861 + }, + { + "epoch": 3.6408839779005526, + "grad_norm": 0.26959577202796936, + "learning_rate": 7.348294026491635e-05, + "loss": 1.884, + "step": 11862 + }, + { + "epoch": 3.6411909146715775, + "grad_norm": 0.2243001013994217, + "learning_rate": 7.347855190001304e-05, + "loss": 1.7765, + "step": 11863 + }, + { + "epoch": 3.641497851442603, + "grad_norm": 0.2480708807706833, + "learning_rate": 7.34741633030809e-05, + "loss": 1.7597, + "step": 11864 + }, + { + "epoch": 3.641804788213628, + "grad_norm": 0.22512994706630707, + "learning_rate": 7.346977447416332e-05, + "loss": 1.7647, + "step": 11865 + }, + { + "epoch": 3.642111724984653, + "grad_norm": 0.24961981177330017, + "learning_rate": 7.346538541330368e-05, + "loss": 1.8178, + "step": 11866 + }, + { + "epoch": 3.6424186617556784, + "grad_norm": 0.320896714925766, + "learning_rate": 7.346099612054533e-05, + "loss": 1.85, + "step": 11867 + }, + { + "epoch": 3.6427255985267033, + "grad_norm": 0.3420880436897278, + "learning_rate": 7.345660659593167e-05, + "loss": 1.8661, + "step": 11868 + }, + { + "epoch": 3.6430325352977286, + "grad_norm": 0.2675844132900238, + "learning_rate": 7.34522168395061e-05, + "loss": 1.8177, + "step": 11869 + }, + { + "epoch": 3.643339472068754, + "grad_norm": 0.23993943631649017, + "learning_rate": 7.344782685131195e-05, + "loss": 1.7365, + "step": 11870 + }, + { + "epoch": 3.643646408839779, + "grad_norm": 0.21805813908576965, + "learning_rate": 7.344343663139264e-05, + "loss": 1.7813, + "step": 11871 + }, + { + "epoch": 3.643953345610804, + "grad_norm": 0.24334421753883362, + "learning_rate": 7.343904617979154e-05, + "loss": 1.7763, + "step": 11872 + }, + { + "epoch": 3.644260282381829, + 
"grad_norm": 0.22768431901931763, + "learning_rate": 7.343465549655206e-05, + "loss": 1.7817, + "step": 11873 + }, + { + "epoch": 3.6445672191528544, + "grad_norm": 0.23828962445259094, + "learning_rate": 7.343026458171757e-05, + "loss": 1.8391, + "step": 11874 + }, + { + "epoch": 3.6448741559238798, + "grad_norm": 0.24838197231292725, + "learning_rate": 7.342587343533149e-05, + "loss": 1.759, + "step": 11875 + }, + { + "epoch": 3.645181092694905, + "grad_norm": 0.22732019424438477, + "learning_rate": 7.342148205743718e-05, + "loss": 1.7348, + "step": 11876 + }, + { + "epoch": 3.64548802946593, + "grad_norm": 0.25106775760650635, + "learning_rate": 7.341709044807807e-05, + "loss": 1.8121, + "step": 11877 + }, + { + "epoch": 3.6457949662369553, + "grad_norm": 0.28532838821411133, + "learning_rate": 7.341269860729753e-05, + "loss": 1.7147, + "step": 11878 + }, + { + "epoch": 3.64610190300798, + "grad_norm": 0.3041890859603882, + "learning_rate": 7.340830653513899e-05, + "loss": 1.7666, + "step": 11879 + }, + { + "epoch": 3.6464088397790055, + "grad_norm": 0.3142147958278656, + "learning_rate": 7.340391423164585e-05, + "loss": 1.8707, + "step": 11880 + }, + { + "epoch": 3.646715776550031, + "grad_norm": 0.28531381487846375, + "learning_rate": 7.339952169686151e-05, + "loss": 1.7961, + "step": 11881 + }, + { + "epoch": 3.6470227133210558, + "grad_norm": 0.33779671788215637, + "learning_rate": 7.339512893082938e-05, + "loss": 1.7428, + "step": 11882 + }, + { + "epoch": 3.647329650092081, + "grad_norm": 0.29611849784851074, + "learning_rate": 7.339073593359287e-05, + "loss": 1.8803, + "step": 11883 + }, + { + "epoch": 3.647636586863106, + "grad_norm": 0.31248557567596436, + "learning_rate": 7.33863427051954e-05, + "loss": 1.7868, + "step": 11884 + }, + { + "epoch": 3.6479435236341313, + "grad_norm": 0.42829564213752747, + "learning_rate": 7.338194924568039e-05, + "loss": 1.8558, + "step": 11885 + }, + { + "epoch": 3.6482504604051567, + "grad_norm": 0.431023508310318, + 
"learning_rate": 7.337755555509126e-05, + "loss": 1.7565, + "step": 11886 + }, + { + "epoch": 3.6485573971761815, + "grad_norm": 0.2917975187301636, + "learning_rate": 7.33731616334714e-05, + "loss": 1.8067, + "step": 11887 + }, + { + "epoch": 3.648864333947207, + "grad_norm": 0.3072175085544586, + "learning_rate": 7.336876748086427e-05, + "loss": 1.782, + "step": 11888 + }, + { + "epoch": 3.6491712707182318, + "grad_norm": 0.33658862113952637, + "learning_rate": 7.336437309731327e-05, + "loss": 1.8007, + "step": 11889 + }, + { + "epoch": 3.649478207489257, + "grad_norm": 0.23774033784866333, + "learning_rate": 7.335997848286185e-05, + "loss": 1.7606, + "step": 11890 + }, + { + "epoch": 3.6497851442602824, + "grad_norm": 0.3373236358165741, + "learning_rate": 7.335558363755344e-05, + "loss": 1.7335, + "step": 11891 + }, + { + "epoch": 3.650092081031308, + "grad_norm": 0.3906517028808594, + "learning_rate": 7.335118856143145e-05, + "loss": 1.7974, + "step": 11892 + }, + { + "epoch": 3.6503990178023327, + "grad_norm": 0.37715303897857666, + "learning_rate": 7.334679325453934e-05, + "loss": 1.8875, + "step": 11893 + }, + { + "epoch": 3.650705954573358, + "grad_norm": 0.278540700674057, + "learning_rate": 7.334239771692053e-05, + "loss": 1.8165, + "step": 11894 + }, + { + "epoch": 3.651012891344383, + "grad_norm": 0.24434895813465118, + "learning_rate": 7.333800194861845e-05, + "loss": 1.7756, + "step": 11895 + }, + { + "epoch": 3.6513198281154082, + "grad_norm": 0.25057271122932434, + "learning_rate": 7.333360594967658e-05, + "loss": 1.7932, + "step": 11896 + }, + { + "epoch": 3.6516267648864336, + "grad_norm": 0.3277342617511749, + "learning_rate": 7.332920972013833e-05, + "loss": 1.7781, + "step": 11897 + }, + { + "epoch": 3.6519337016574585, + "grad_norm": 0.2754829525947571, + "learning_rate": 7.332481326004715e-05, + "loss": 1.7916, + "step": 11898 + }, + { + "epoch": 3.652240638428484, + "grad_norm": 0.24490588903427124, + "learning_rate": 7.332041656944651e-05, 
+ "loss": 1.7904, + "step": 11899 + }, + { + "epoch": 3.6525475751995087, + "grad_norm": 0.3176959455013275, + "learning_rate": 7.331601964837982e-05, + "loss": 1.7379, + "step": 11900 + }, + { + "epoch": 3.652854511970534, + "grad_norm": 0.3435784876346588, + "learning_rate": 7.331162249689057e-05, + "loss": 1.7635, + "step": 11901 + }, + { + "epoch": 3.6531614487415593, + "grad_norm": 0.335697740316391, + "learning_rate": 7.330722511502221e-05, + "loss": 1.7903, + "step": 11902 + }, + { + "epoch": 3.6534683855125847, + "grad_norm": 0.2748894691467285, + "learning_rate": 7.330282750281819e-05, + "loss": 1.8259, + "step": 11903 + }, + { + "epoch": 3.6537753222836096, + "grad_norm": 0.36754751205444336, + "learning_rate": 7.329842966032197e-05, + "loss": 1.7728, + "step": 11904 + }, + { + "epoch": 3.654082259054635, + "grad_norm": 0.4355713129043579, + "learning_rate": 7.3294031587577e-05, + "loss": 1.7447, + "step": 11905 + }, + { + "epoch": 3.65438919582566, + "grad_norm": 0.3967476487159729, + "learning_rate": 7.328963328462677e-05, + "loss": 1.8299, + "step": 11906 + }, + { + "epoch": 3.654696132596685, + "grad_norm": 0.23805755376815796, + "learning_rate": 7.328523475151472e-05, + "loss": 1.7631, + "step": 11907 + }, + { + "epoch": 3.6550030693677105, + "grad_norm": 0.40350377559661865, + "learning_rate": 7.328083598828435e-05, + "loss": 1.8693, + "step": 11908 + }, + { + "epoch": 3.6553100061387354, + "grad_norm": 0.4743673801422119, + "learning_rate": 7.32764369949791e-05, + "loss": 1.7887, + "step": 11909 + }, + { + "epoch": 3.6556169429097607, + "grad_norm": 0.33830127120018005, + "learning_rate": 7.327203777164246e-05, + "loss": 1.7527, + "step": 11910 + }, + { + "epoch": 3.6559238796807856, + "grad_norm": 0.2465003877878189, + "learning_rate": 7.326763831831791e-05, + "loss": 1.7898, + "step": 11911 + }, + { + "epoch": 3.656230816451811, + "grad_norm": 0.31647852063179016, + "learning_rate": 7.326323863504892e-05, + "loss": 1.8056, + "step": 11912 + }, + 
{ + "epoch": 3.6565377532228363, + "grad_norm": 0.31436124444007874, + "learning_rate": 7.325883872187896e-05, + "loss": 1.7972, + "step": 11913 + }, + { + "epoch": 3.656844689993861, + "grad_norm": 0.260405957698822, + "learning_rate": 7.325443857885153e-05, + "loss": 1.8109, + "step": 11914 + }, + { + "epoch": 3.6571516267648865, + "grad_norm": 0.29312583804130554, + "learning_rate": 7.325003820601011e-05, + "loss": 1.8947, + "step": 11915 + }, + { + "epoch": 3.6574585635359114, + "grad_norm": 0.2641582190990448, + "learning_rate": 7.324563760339819e-05, + "loss": 1.7737, + "step": 11916 + }, + { + "epoch": 3.6577655003069367, + "grad_norm": 0.2338121086359024, + "learning_rate": 7.324123677105923e-05, + "loss": 1.7462, + "step": 11917 + }, + { + "epoch": 3.658072437077962, + "grad_norm": 0.27877378463745117, + "learning_rate": 7.323683570903676e-05, + "loss": 1.8371, + "step": 11918 + }, + { + "epoch": 3.6583793738489874, + "grad_norm": 0.24238766729831696, + "learning_rate": 7.323243441737427e-05, + "loss": 1.7304, + "step": 11919 + }, + { + "epoch": 3.6586863106200123, + "grad_norm": 0.2349759042263031, + "learning_rate": 7.322803289611525e-05, + "loss": 1.7422, + "step": 11920 + }, + { + "epoch": 3.6589932473910376, + "grad_norm": 0.2254217565059662, + "learning_rate": 7.322363114530318e-05, + "loss": 1.7296, + "step": 11921 + }, + { + "epoch": 3.6593001841620625, + "grad_norm": 0.24533270299434662, + "learning_rate": 7.321922916498158e-05, + "loss": 1.7834, + "step": 11922 + }, + { + "epoch": 3.659607120933088, + "grad_norm": 0.24993161857128143, + "learning_rate": 7.321482695519393e-05, + "loss": 1.8502, + "step": 11923 + }, + { + "epoch": 3.659914057704113, + "grad_norm": 0.2540178894996643, + "learning_rate": 7.321042451598378e-05, + "loss": 1.8372, + "step": 11924 + }, + { + "epoch": 3.660220994475138, + "grad_norm": 0.2241390198469162, + "learning_rate": 7.32060218473946e-05, + "loss": 1.7619, + "step": 11925 + }, + { + "epoch": 3.6605279312461634, + 
"grad_norm": 0.2137840837240219, + "learning_rate": 7.32016189494699e-05, + "loss": 1.751, + "step": 11926 + }, + { + "epoch": 3.6608348680171883, + "grad_norm": 0.2596585154533386, + "learning_rate": 7.319721582225323e-05, + "loss": 1.7773, + "step": 11927 + }, + { + "epoch": 3.6611418047882136, + "grad_norm": 0.24898354709148407, + "learning_rate": 7.319281246578806e-05, + "loss": 1.7347, + "step": 11928 + }, + { + "epoch": 3.661448741559239, + "grad_norm": 0.26553863286972046, + "learning_rate": 7.31884088801179e-05, + "loss": 1.7812, + "step": 11929 + }, + { + "epoch": 3.661755678330264, + "grad_norm": 0.2494438737630844, + "learning_rate": 7.318400506528633e-05, + "loss": 1.7554, + "step": 11930 + }, + { + "epoch": 3.662062615101289, + "grad_norm": 0.2794995903968811, + "learning_rate": 7.317960102133682e-05, + "loss": 1.7495, + "step": 11931 + }, + { + "epoch": 3.662369551872314, + "grad_norm": 0.2843860983848572, + "learning_rate": 7.317519674831293e-05, + "loss": 1.7734, + "step": 11932 + }, + { + "epoch": 3.6626764886433394, + "grad_norm": 0.28261128067970276, + "learning_rate": 7.317079224625813e-05, + "loss": 1.7794, + "step": 11933 + }, + { + "epoch": 3.6629834254143647, + "grad_norm": 0.2552426755428314, + "learning_rate": 7.316638751521599e-05, + "loss": 1.8397, + "step": 11934 + }, + { + "epoch": 3.66329036218539, + "grad_norm": 0.4140608608722687, + "learning_rate": 7.316198255523002e-05, + "loss": 1.848, + "step": 11935 + }, + { + "epoch": 3.663597298956415, + "grad_norm": 0.3709854483604431, + "learning_rate": 7.315757736634377e-05, + "loss": 1.8489, + "step": 11936 + }, + { + "epoch": 3.6639042357274403, + "grad_norm": 0.23637300729751587, + "learning_rate": 7.315317194860078e-05, + "loss": 1.7549, + "step": 11937 + }, + { + "epoch": 3.664211172498465, + "grad_norm": 0.32884421944618225, + "learning_rate": 7.314876630204456e-05, + "loss": 1.8061, + "step": 11938 + }, + { + "epoch": 3.6645181092694905, + "grad_norm": 0.33354130387306213, + 
"learning_rate": 7.314436042671867e-05, + "loss": 1.8346, + "step": 11939 + }, + { + "epoch": 3.664825046040516, + "grad_norm": 0.25776317715644836, + "learning_rate": 7.313995432266663e-05, + "loss": 1.8598, + "step": 11940 + }, + { + "epoch": 3.6651319828115407, + "grad_norm": 0.2910402715206146, + "learning_rate": 7.313554798993202e-05, + "loss": 1.7613, + "step": 11941 + }, + { + "epoch": 3.665438919582566, + "grad_norm": 0.3487538695335388, + "learning_rate": 7.313114142855836e-05, + "loss": 1.8105, + "step": 11942 + }, + { + "epoch": 3.665745856353591, + "grad_norm": 0.27271291613578796, + "learning_rate": 7.312673463858918e-05, + "loss": 1.8107, + "step": 11943 + }, + { + "epoch": 3.6660527931246163, + "grad_norm": 0.2613036632537842, + "learning_rate": 7.312232762006809e-05, + "loss": 1.7871, + "step": 11944 + }, + { + "epoch": 3.6663597298956416, + "grad_norm": 0.30594903230667114, + "learning_rate": 7.311792037303859e-05, + "loss": 1.8043, + "step": 11945 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.3960847854614258, + "learning_rate": 7.311351289754425e-05, + "loss": 1.8434, + "step": 11946 + }, + { + "epoch": 3.666973603437692, + "grad_norm": 0.33369311690330505, + "learning_rate": 7.310910519362861e-05, + "loss": 1.7496, + "step": 11947 + }, + { + "epoch": 3.6672805402087167, + "grad_norm": 0.29852384328842163, + "learning_rate": 7.310469726133528e-05, + "loss": 1.858, + "step": 11948 + }, + { + "epoch": 3.667587476979742, + "grad_norm": 0.2610527276992798, + "learning_rate": 7.310028910070777e-05, + "loss": 1.7642, + "step": 11949 + }, + { + "epoch": 3.6678944137507674, + "grad_norm": 0.3606704771518707, + "learning_rate": 7.309588071178967e-05, + "loss": 1.845, + "step": 11950 + }, + { + "epoch": 3.6682013505217927, + "grad_norm": 0.3157273828983307, + "learning_rate": 7.309147209462454e-05, + "loss": 1.7864, + "step": 11951 + }, + { + "epoch": 3.6685082872928176, + "grad_norm": 0.23907925188541412, + "learning_rate": 
7.308706324925594e-05, + "loss": 1.8363, + "step": 11952 + }, + { + "epoch": 3.668815224063843, + "grad_norm": 0.3365088999271393, + "learning_rate": 7.308265417572747e-05, + "loss": 1.8755, + "step": 11953 + }, + { + "epoch": 3.669122160834868, + "grad_norm": 0.29404979944229126, + "learning_rate": 7.307824487408266e-05, + "loss": 1.8128, + "step": 11954 + }, + { + "epoch": 3.669429097605893, + "grad_norm": 0.2689574658870697, + "learning_rate": 7.307383534436511e-05, + "loss": 1.8072, + "step": 11955 + }, + { + "epoch": 3.6697360343769185, + "grad_norm": 0.28394198417663574, + "learning_rate": 7.306942558661841e-05, + "loss": 1.7919, + "step": 11956 + }, + { + "epoch": 3.6700429711479434, + "grad_norm": 0.2594783902168274, + "learning_rate": 7.306501560088612e-05, + "loss": 1.7467, + "step": 11957 + }, + { + "epoch": 3.6703499079189688, + "grad_norm": 0.24765191972255707, + "learning_rate": 7.30606053872118e-05, + "loss": 1.7876, + "step": 11958 + }, + { + "epoch": 3.6706568446899936, + "grad_norm": 0.22157172858715057, + "learning_rate": 7.305619494563909e-05, + "loss": 1.7802, + "step": 11959 + }, + { + "epoch": 3.670963781461019, + "grad_norm": 0.270151287317276, + "learning_rate": 7.305178427621155e-05, + "loss": 1.7723, + "step": 11960 + }, + { + "epoch": 3.6712707182320443, + "grad_norm": 0.3163939118385315, + "learning_rate": 7.304737337897277e-05, + "loss": 1.8488, + "step": 11961 + }, + { + "epoch": 3.671577655003069, + "grad_norm": 0.2605706453323364, + "learning_rate": 7.304296225396632e-05, + "loss": 1.7442, + "step": 11962 + }, + { + "epoch": 3.6718845917740945, + "grad_norm": 0.31179291009902954, + "learning_rate": 7.303855090123582e-05, + "loss": 1.831, + "step": 11963 + }, + { + "epoch": 3.6721915285451194, + "grad_norm": 0.33365359902381897, + "learning_rate": 7.303413932082483e-05, + "loss": 1.8376, + "step": 11964 + }, + { + "epoch": 3.6724984653161448, + "grad_norm": 0.2952130138874054, + "learning_rate": 7.302972751277701e-05, + "loss": 
1.7733, + "step": 11965 + }, + { + "epoch": 3.67280540208717, + "grad_norm": 0.24270877242088318, + "learning_rate": 7.302531547713592e-05, + "loss": 1.8367, + "step": 11966 + }, + { + "epoch": 3.6731123388581954, + "grad_norm": 0.34315919876098633, + "learning_rate": 7.302090321394517e-05, + "loss": 1.7901, + "step": 11967 + }, + { + "epoch": 3.6734192756292203, + "grad_norm": 0.33511418104171753, + "learning_rate": 7.301649072324834e-05, + "loss": 1.7929, + "step": 11968 + }, + { + "epoch": 3.6737262124002457, + "grad_norm": 0.22397933900356293, + "learning_rate": 7.301207800508907e-05, + "loss": 1.7533, + "step": 11969 + }, + { + "epoch": 3.6740331491712706, + "grad_norm": 0.2882738411426544, + "learning_rate": 7.300766505951095e-05, + "loss": 1.8071, + "step": 11970 + }, + { + "epoch": 3.674340085942296, + "grad_norm": 0.242112398147583, + "learning_rate": 7.300325188655761e-05, + "loss": 1.7739, + "step": 11971 + }, + { + "epoch": 3.674647022713321, + "grad_norm": 0.27754491567611694, + "learning_rate": 7.299883848627265e-05, + "loss": 1.8295, + "step": 11972 + }, + { + "epoch": 3.674953959484346, + "grad_norm": 0.2787899076938629, + "learning_rate": 7.29944248586997e-05, + "loss": 1.7682, + "step": 11973 + }, + { + "epoch": 3.6752608962553714, + "grad_norm": 0.24448934197425842, + "learning_rate": 7.299001100388234e-05, + "loss": 1.7826, + "step": 11974 + }, + { + "epoch": 3.6755678330263963, + "grad_norm": 0.37869495153427124, + "learning_rate": 7.298559692186421e-05, + "loss": 1.8582, + "step": 11975 + }, + { + "epoch": 3.6758747697974217, + "grad_norm": 0.3299996256828308, + "learning_rate": 7.298118261268897e-05, + "loss": 1.7716, + "step": 11976 + }, + { + "epoch": 3.676181706568447, + "grad_norm": 0.278891384601593, + "learning_rate": 7.29767680764002e-05, + "loss": 1.879, + "step": 11977 + }, + { + "epoch": 3.6764886433394723, + "grad_norm": 0.29326459765434265, + "learning_rate": 7.297235331304155e-05, + "loss": 1.804, + "step": 11978 + }, + { + 
"epoch": 3.6767955801104972, + "grad_norm": 0.2697092592716217, + "learning_rate": 7.296793832265663e-05, + "loss": 1.7842, + "step": 11979 + }, + { + "epoch": 3.6771025168815226, + "grad_norm": 0.3045118749141693, + "learning_rate": 7.296352310528909e-05, + "loss": 1.7959, + "step": 11980 + }, + { + "epoch": 3.6774094536525475, + "grad_norm": 0.278647780418396, + "learning_rate": 7.295910766098252e-05, + "loss": 1.7907, + "step": 11981 + }, + { + "epoch": 3.677716390423573, + "grad_norm": 0.2370275855064392, + "learning_rate": 7.295469198978063e-05, + "loss": 1.757, + "step": 11982 + }, + { + "epoch": 3.678023327194598, + "grad_norm": 0.3061021566390991, + "learning_rate": 7.295027609172702e-05, + "loss": 1.7927, + "step": 11983 + }, + { + "epoch": 3.678330263965623, + "grad_norm": 0.2844544053077698, + "learning_rate": 7.294585996686532e-05, + "loss": 1.7705, + "step": 11984 + }, + { + "epoch": 3.6786372007366483, + "grad_norm": 0.31121113896369934, + "learning_rate": 7.29414436152392e-05, + "loss": 1.783, + "step": 11985 + }, + { + "epoch": 3.6789441375076732, + "grad_norm": 0.2566785514354706, + "learning_rate": 7.293702703689225e-05, + "loss": 1.7781, + "step": 11986 + }, + { + "epoch": 3.6792510742786986, + "grad_norm": 0.22176961600780487, + "learning_rate": 7.293261023186818e-05, + "loss": 1.7302, + "step": 11987 + }, + { + "epoch": 3.679558011049724, + "grad_norm": 0.21547441184520721, + "learning_rate": 7.292819320021062e-05, + "loss": 1.7666, + "step": 11988 + }, + { + "epoch": 3.679864947820749, + "grad_norm": 0.26309674978256226, + "learning_rate": 7.29237759419632e-05, + "loss": 1.7817, + "step": 11989 + }, + { + "epoch": 3.680171884591774, + "grad_norm": 0.2558063864707947, + "learning_rate": 7.29193584571696e-05, + "loss": 1.8257, + "step": 11990 + }, + { + "epoch": 3.680478821362799, + "grad_norm": 0.24516844749450684, + "learning_rate": 7.291494074587347e-05, + "loss": 1.7803, + "step": 11991 + }, + { + "epoch": 3.6807857581338244, + "grad_norm": 
0.22891047596931458, + "learning_rate": 7.291052280811843e-05, + "loss": 1.7977, + "step": 11992 + }, + { + "epoch": 3.6810926949048497, + "grad_norm": 0.2776026129722595, + "learning_rate": 7.290610464394822e-05, + "loss": 1.8486, + "step": 11993 + }, + { + "epoch": 3.681399631675875, + "grad_norm": 0.31472426652908325, + "learning_rate": 7.290168625340644e-05, + "loss": 1.7841, + "step": 11994 + }, + { + "epoch": 3.6817065684469, + "grad_norm": 0.3459274470806122, + "learning_rate": 7.289726763653677e-05, + "loss": 1.7458, + "step": 11995 + }, + { + "epoch": 3.6820135052179253, + "grad_norm": 0.23645849525928497, + "learning_rate": 7.289284879338289e-05, + "loss": 1.781, + "step": 11996 + }, + { + "epoch": 3.68232044198895, + "grad_norm": 0.3257114291191101, + "learning_rate": 7.288842972398845e-05, + "loss": 1.8269, + "step": 11997 + }, + { + "epoch": 3.6826273787599755, + "grad_norm": 0.5450126528739929, + "learning_rate": 7.288401042839713e-05, + "loss": 1.8342, + "step": 11998 + }, + { + "epoch": 3.682934315531001, + "grad_norm": 0.5080512762069702, + "learning_rate": 7.287959090665262e-05, + "loss": 1.8097, + "step": 11999 + }, + { + "epoch": 3.6832412523020257, + "grad_norm": 0.3005252480506897, + "learning_rate": 7.287517115879858e-05, + "loss": 1.8271, + "step": 12000 + }, + { + "epoch": 3.683548189073051, + "grad_norm": 0.2760924994945526, + "learning_rate": 7.287075118487869e-05, + "loss": 1.8267, + "step": 12001 + }, + { + "epoch": 3.683855125844076, + "grad_norm": 0.3475865423679352, + "learning_rate": 7.286633098493663e-05, + "loss": 1.785, + "step": 12002 + }, + { + "epoch": 3.6841620626151013, + "grad_norm": 0.2905690670013428, + "learning_rate": 7.286191055901608e-05, + "loss": 1.8283, + "step": 12003 + }, + { + "epoch": 3.6844689993861266, + "grad_norm": 0.23666246235370636, + "learning_rate": 7.285748990716072e-05, + "loss": 1.7665, + "step": 12004 + }, + { + "epoch": 3.6847759361571515, + "grad_norm": 0.32329514622688293, + "learning_rate": 
7.285306902941427e-05, + "loss": 1.7267, + "step": 12005 + }, + { + "epoch": 3.685082872928177, + "grad_norm": 0.32345879077911377, + "learning_rate": 7.28486479258204e-05, + "loss": 1.7529, + "step": 12006 + }, + { + "epoch": 3.6853898096992017, + "grad_norm": 0.2727855443954468, + "learning_rate": 7.284422659642279e-05, + "loss": 1.8279, + "step": 12007 + }, + { + "epoch": 3.685696746470227, + "grad_norm": 0.37847277522087097, + "learning_rate": 7.283980504126513e-05, + "loss": 1.7809, + "step": 12008 + }, + { + "epoch": 3.6860036832412524, + "grad_norm": 0.44694215059280396, + "learning_rate": 7.283538326039113e-05, + "loss": 1.8184, + "step": 12009 + }, + { + "epoch": 3.6863106200122777, + "grad_norm": 0.2868261933326721, + "learning_rate": 7.28309612538445e-05, + "loss": 1.7461, + "step": 12010 + }, + { + "epoch": 3.6866175567833026, + "grad_norm": 0.2601351737976074, + "learning_rate": 7.282653902166894e-05, + "loss": 1.8011, + "step": 12011 + }, + { + "epoch": 3.686924493554328, + "grad_norm": 0.328185498714447, + "learning_rate": 7.282211656390813e-05, + "loss": 1.7934, + "step": 12012 + }, + { + "epoch": 3.687231430325353, + "grad_norm": 0.2712559103965759, + "learning_rate": 7.281769388060578e-05, + "loss": 1.7566, + "step": 12013 + }, + { + "epoch": 3.687538367096378, + "grad_norm": 0.2725805938243866, + "learning_rate": 7.281327097180562e-05, + "loss": 1.8024, + "step": 12014 + }, + { + "epoch": 3.6878453038674035, + "grad_norm": 0.37282630801200867, + "learning_rate": 7.280884783755133e-05, + "loss": 1.7624, + "step": 12015 + }, + { + "epoch": 3.6881522406384284, + "grad_norm": 0.36519256234169006, + "learning_rate": 7.280442447788664e-05, + "loss": 1.8691, + "step": 12016 + }, + { + "epoch": 3.6884591774094537, + "grad_norm": 0.21699345111846924, + "learning_rate": 7.280000089285528e-05, + "loss": 1.7308, + "step": 12017 + }, + { + "epoch": 3.6887661141804786, + "grad_norm": 0.3159945011138916, + "learning_rate": 7.279557708250094e-05, + "loss": 
1.8144, + "step": 12018 + }, + { + "epoch": 3.689073050951504, + "grad_norm": 0.2927449643611908, + "learning_rate": 7.279115304686735e-05, + "loss": 1.7746, + "step": 12019 + }, + { + "epoch": 3.6893799877225293, + "grad_norm": 0.279208242893219, + "learning_rate": 7.278672878599819e-05, + "loss": 1.7678, + "step": 12020 + }, + { + "epoch": 3.689686924493554, + "grad_norm": 0.40005648136138916, + "learning_rate": 7.278230429993725e-05, + "loss": 1.7876, + "step": 12021 + }, + { + "epoch": 3.6899938612645795, + "grad_norm": 0.3444392681121826, + "learning_rate": 7.277787958872824e-05, + "loss": 1.7591, + "step": 12022 + }, + { + "epoch": 3.6903007980356044, + "grad_norm": 0.21841467916965485, + "learning_rate": 7.277345465241485e-05, + "loss": 1.785, + "step": 12023 + }, + { + "epoch": 3.6906077348066297, + "grad_norm": 0.32463181018829346, + "learning_rate": 7.276902949104084e-05, + "loss": 1.8164, + "step": 12024 + }, + { + "epoch": 3.690914671577655, + "grad_norm": 0.36221247911453247, + "learning_rate": 7.276460410464994e-05, + "loss": 1.7529, + "step": 12025 + }, + { + "epoch": 3.6912216083486804, + "grad_norm": 0.24451927840709686, + "learning_rate": 7.276017849328588e-05, + "loss": 1.8031, + "step": 12026 + }, + { + "epoch": 3.6915285451197053, + "grad_norm": 0.3055694103240967, + "learning_rate": 7.275575265699239e-05, + "loss": 1.8158, + "step": 12027 + }, + { + "epoch": 3.6918354818907306, + "grad_norm": 0.4315083622932434, + "learning_rate": 7.27513265958132e-05, + "loss": 1.8322, + "step": 12028 + }, + { + "epoch": 3.6921424186617555, + "grad_norm": 0.3391095697879791, + "learning_rate": 7.274690030979209e-05, + "loss": 1.8214, + "step": 12029 + }, + { + "epoch": 3.692449355432781, + "grad_norm": 0.22714883089065552, + "learning_rate": 7.274247379897277e-05, + "loss": 1.7312, + "step": 12030 + }, + { + "epoch": 3.692756292203806, + "grad_norm": 0.24982765316963196, + "learning_rate": 7.273804706339899e-05, + "loss": 1.738, + "step": 12031 + }, + { + 
"epoch": 3.693063228974831, + "grad_norm": 0.32509860396385193, + "learning_rate": 7.273362010311451e-05, + "loss": 1.7773, + "step": 12032 + }, + { + "epoch": 3.6933701657458564, + "grad_norm": 0.2643086612224579, + "learning_rate": 7.272919291816307e-05, + "loss": 1.7545, + "step": 12033 + }, + { + "epoch": 3.6936771025168813, + "grad_norm": 0.2568800747394562, + "learning_rate": 7.272476550858842e-05, + "loss": 1.8055, + "step": 12034 + }, + { + "epoch": 3.6939840392879066, + "grad_norm": 0.27418240904808044, + "learning_rate": 7.272033787443433e-05, + "loss": 1.7769, + "step": 12035 + }, + { + "epoch": 3.694290976058932, + "grad_norm": 0.2459677755832672, + "learning_rate": 7.271591001574453e-05, + "loss": 1.7971, + "step": 12036 + }, + { + "epoch": 3.694597912829957, + "grad_norm": 0.22349393367767334, + "learning_rate": 7.27114819325628e-05, + "loss": 1.7791, + "step": 12037 + }, + { + "epoch": 3.694904849600982, + "grad_norm": 0.25321197509765625, + "learning_rate": 7.270705362493288e-05, + "loss": 1.7475, + "step": 12038 + }, + { + "epoch": 3.695211786372007, + "grad_norm": 0.2585916519165039, + "learning_rate": 7.270262509289855e-05, + "loss": 1.7801, + "step": 12039 + }, + { + "epoch": 3.6955187231430324, + "grad_norm": 0.2673574686050415, + "learning_rate": 7.269819633650359e-05, + "loss": 1.7578, + "step": 12040 + }, + { + "epoch": 3.6958256599140578, + "grad_norm": 0.2509469985961914, + "learning_rate": 7.269376735579175e-05, + "loss": 1.7994, + "step": 12041 + }, + { + "epoch": 3.696132596685083, + "grad_norm": 0.28527703881263733, + "learning_rate": 7.268933815080679e-05, + "loss": 1.7752, + "step": 12042 + }, + { + "epoch": 3.696439533456108, + "grad_norm": 0.22716578841209412, + "learning_rate": 7.268490872159248e-05, + "loss": 1.7186, + "step": 12043 + }, + { + "epoch": 3.6967464702271333, + "grad_norm": 0.24888403713703156, + "learning_rate": 7.268047906819262e-05, + "loss": 1.7882, + "step": 12044 + }, + { + "epoch": 3.697053406998158, + 
"grad_norm": 0.28976112604141235, + "learning_rate": 7.267604919065096e-05, + "loss": 1.7655, + "step": 12045 + }, + { + "epoch": 3.6973603437691835, + "grad_norm": 0.24668502807617188, + "learning_rate": 7.267161908901131e-05, + "loss": 1.8051, + "step": 12046 + }, + { + "epoch": 3.697667280540209, + "grad_norm": 0.2464776188135147, + "learning_rate": 7.266718876331742e-05, + "loss": 1.809, + "step": 12047 + }, + { + "epoch": 3.6979742173112338, + "grad_norm": 0.27648577094078064, + "learning_rate": 7.266275821361309e-05, + "loss": 1.7869, + "step": 12048 + }, + { + "epoch": 3.698281154082259, + "grad_norm": 0.26427242159843445, + "learning_rate": 7.26583274399421e-05, + "loss": 1.7681, + "step": 12049 + }, + { + "epoch": 3.698588090853284, + "grad_norm": 0.24595285952091217, + "learning_rate": 7.265389644234823e-05, + "loss": 1.7209, + "step": 12050 + }, + { + "epoch": 3.6988950276243093, + "grad_norm": 0.32514405250549316, + "learning_rate": 7.26494652208753e-05, + "loss": 1.8702, + "step": 12051 + }, + { + "epoch": 3.6992019643953347, + "grad_norm": 0.24512936174869537, + "learning_rate": 7.264503377556705e-05, + "loss": 1.784, + "step": 12052 + }, + { + "epoch": 3.69950890116636, + "grad_norm": 0.28698310256004333, + "learning_rate": 7.264060210646733e-05, + "loss": 1.905, + "step": 12053 + }, + { + "epoch": 3.699815837937385, + "grad_norm": 0.2995007336139679, + "learning_rate": 7.263617021361989e-05, + "loss": 1.7822, + "step": 12054 + }, + { + "epoch": 3.7001227747084102, + "grad_norm": 0.25869423151016235, + "learning_rate": 7.263173809706855e-05, + "loss": 1.7988, + "step": 12055 + }, + { + "epoch": 3.700429711479435, + "grad_norm": 0.350918710231781, + "learning_rate": 7.262730575685711e-05, + "loss": 1.9504, + "step": 12056 + }, + { + "epoch": 3.7007366482504604, + "grad_norm": 0.3407665491104126, + "learning_rate": 7.262287319302937e-05, + "loss": 1.8506, + "step": 12057 + }, + { + "epoch": 3.701043585021486, + "grad_norm": 0.3039441704750061, + 
"learning_rate": 7.261844040562915e-05, + "loss": 1.7841, + "step": 12058 + }, + { + "epoch": 3.7013505217925107, + "grad_norm": 0.23483428359031677, + "learning_rate": 7.261400739470023e-05, + "loss": 1.7899, + "step": 12059 + }, + { + "epoch": 3.701657458563536, + "grad_norm": 0.30779507756233215, + "learning_rate": 7.260957416028645e-05, + "loss": 1.8131, + "step": 12060 + }, + { + "epoch": 3.701964395334561, + "grad_norm": 0.29901376366615295, + "learning_rate": 7.26051407024316e-05, + "loss": 1.7861, + "step": 12061 + }, + { + "epoch": 3.7022713321055862, + "grad_norm": 0.30058762431144714, + "learning_rate": 7.260070702117949e-05, + "loss": 1.7485, + "step": 12062 + }, + { + "epoch": 3.7025782688766116, + "grad_norm": 0.24523651599884033, + "learning_rate": 7.259627311657396e-05, + "loss": 1.772, + "step": 12063 + }, + { + "epoch": 3.7028852056476365, + "grad_norm": 0.24375474452972412, + "learning_rate": 7.259183898865882e-05, + "loss": 1.7848, + "step": 12064 + }, + { + "epoch": 3.703192142418662, + "grad_norm": 0.2562403380870819, + "learning_rate": 7.258740463747788e-05, + "loss": 1.7447, + "step": 12065 + }, + { + "epoch": 3.7034990791896867, + "grad_norm": 0.265229195356369, + "learning_rate": 7.258297006307496e-05, + "loss": 1.8111, + "step": 12066 + }, + { + "epoch": 3.703806015960712, + "grad_norm": 0.2836552858352661, + "learning_rate": 7.25785352654939e-05, + "loss": 1.7952, + "step": 12067 + }, + { + "epoch": 3.7041129527317374, + "grad_norm": 0.3269572854042053, + "learning_rate": 7.257410024477852e-05, + "loss": 1.8604, + "step": 12068 + }, + { + "epoch": 3.7044198895027627, + "grad_norm": 0.2391490638256073, + "learning_rate": 7.256966500097264e-05, + "loss": 1.7417, + "step": 12069 + }, + { + "epoch": 3.7047268262737876, + "grad_norm": 0.2610675096511841, + "learning_rate": 7.256522953412011e-05, + "loss": 1.7712, + "step": 12070 + }, + { + "epoch": 3.705033763044813, + "grad_norm": 0.24954774975776672, + "learning_rate": 
7.256079384426477e-05, + "loss": 1.7506, + "step": 12071 + }, + { + "epoch": 3.705340699815838, + "grad_norm": 0.2603892385959625, + "learning_rate": 7.255635793145042e-05, + "loss": 1.8105, + "step": 12072 + }, + { + "epoch": 3.705647636586863, + "grad_norm": 0.32728591561317444, + "learning_rate": 7.255192179572092e-05, + "loss": 1.8448, + "step": 12073 + }, + { + "epoch": 3.7059545733578885, + "grad_norm": 0.4559340178966522, + "learning_rate": 7.254748543712013e-05, + "loss": 1.7232, + "step": 12074 + }, + { + "epoch": 3.7062615101289134, + "grad_norm": 0.36526206135749817, + "learning_rate": 7.254304885569186e-05, + "loss": 1.7874, + "step": 12075 + }, + { + "epoch": 3.7065684468999387, + "grad_norm": 0.21606837213039398, + "learning_rate": 7.253861205147998e-05, + "loss": 1.7266, + "step": 12076 + }, + { + "epoch": 3.7068753836709636, + "grad_norm": 0.3629585802555084, + "learning_rate": 7.253417502452831e-05, + "loss": 1.7722, + "step": 12077 + }, + { + "epoch": 3.707182320441989, + "grad_norm": 0.4224923551082611, + "learning_rate": 7.252973777488072e-05, + "loss": 1.7369, + "step": 12078 + }, + { + "epoch": 3.7074892572130143, + "grad_norm": 0.32245784997940063, + "learning_rate": 7.252530030258106e-05, + "loss": 1.7836, + "step": 12079 + }, + { + "epoch": 3.707796193984039, + "grad_norm": 0.29909494519233704, + "learning_rate": 7.252086260767317e-05, + "loss": 1.8718, + "step": 12080 + }, + { + "epoch": 3.7081031307550645, + "grad_norm": 0.21995799243450165, + "learning_rate": 7.251642469020093e-05, + "loss": 1.7103, + "step": 12081 + }, + { + "epoch": 3.7084100675260894, + "grad_norm": 0.2737572193145752, + "learning_rate": 7.251198655020818e-05, + "loss": 1.7787, + "step": 12082 + }, + { + "epoch": 3.7087170042971147, + "grad_norm": 0.22417058050632477, + "learning_rate": 7.250754818773879e-05, + "loss": 1.7782, + "step": 12083 + }, + { + "epoch": 3.70902394106814, + "grad_norm": 0.3350662887096405, + "learning_rate": 7.25031096028366e-05, + "loss": 
1.8193, + "step": 12084 + }, + { + "epoch": 3.7093308778391654, + "grad_norm": 0.3199101686477661, + "learning_rate": 7.24986707955455e-05, + "loss": 1.831, + "step": 12085 + }, + { + "epoch": 3.7096378146101903, + "grad_norm": 0.2513977289199829, + "learning_rate": 7.249423176590936e-05, + "loss": 1.8288, + "step": 12086 + }, + { + "epoch": 3.7099447513812156, + "grad_norm": 0.30411866307258606, + "learning_rate": 7.248979251397203e-05, + "loss": 1.7837, + "step": 12087 + }, + { + "epoch": 3.7102516881522405, + "grad_norm": 0.30755332112312317, + "learning_rate": 7.248535303977738e-05, + "loss": 1.8016, + "step": 12088 + }, + { + "epoch": 3.710558624923266, + "grad_norm": 0.25746986269950867, + "learning_rate": 7.248091334336929e-05, + "loss": 1.8014, + "step": 12089 + }, + { + "epoch": 3.710865561694291, + "grad_norm": 0.3327447772026062, + "learning_rate": 7.247647342479164e-05, + "loss": 1.752, + "step": 12090 + }, + { + "epoch": 3.711172498465316, + "grad_norm": 0.3101816475391388, + "learning_rate": 7.247203328408832e-05, + "loss": 1.7867, + "step": 12091 + }, + { + "epoch": 3.7114794352363414, + "grad_norm": 0.2168906182050705, + "learning_rate": 7.246759292130318e-05, + "loss": 1.7452, + "step": 12092 + }, + { + "epoch": 3.7117863720073663, + "grad_norm": 0.34260258078575134, + "learning_rate": 7.246315233648013e-05, + "loss": 1.8156, + "step": 12093 + }, + { + "epoch": 3.7120933087783916, + "grad_norm": 0.2730714976787567, + "learning_rate": 7.245871152966303e-05, + "loss": 1.7429, + "step": 12094 + }, + { + "epoch": 3.712400245549417, + "grad_norm": 0.2560936212539673, + "learning_rate": 7.245427050089578e-05, + "loss": 1.7969, + "step": 12095 + }, + { + "epoch": 3.712707182320442, + "grad_norm": 0.27510303258895874, + "learning_rate": 7.244982925022228e-05, + "loss": 1.7981, + "step": 12096 + }, + { + "epoch": 3.713014119091467, + "grad_norm": 0.29171642661094666, + "learning_rate": 7.24453877776864e-05, + "loss": 1.7913, + "step": 12097 + }, + { + 
"epoch": 3.713321055862492, + "grad_norm": 0.26431843638420105, + "learning_rate": 7.244094608333206e-05, + "loss": 1.8262, + "step": 12098 + }, + { + "epoch": 3.7136279926335174, + "grad_norm": 0.30747905373573303, + "learning_rate": 7.243650416720311e-05, + "loss": 1.7951, + "step": 12099 + }, + { + "epoch": 3.7139349294045427, + "grad_norm": 0.346443772315979, + "learning_rate": 7.24320620293435e-05, + "loss": 1.7677, + "step": 12100 + }, + { + "epoch": 3.714241866175568, + "grad_norm": 0.2910652458667755, + "learning_rate": 7.242761966979709e-05, + "loss": 1.7887, + "step": 12101 + }, + { + "epoch": 3.714548802946593, + "grad_norm": 0.22342006862163544, + "learning_rate": 7.24231770886078e-05, + "loss": 1.7678, + "step": 12102 + }, + { + "epoch": 3.7148557397176183, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.241873428581954e-05, + "loss": 1.7436, + "step": 12103 + }, + { + "epoch": 3.715162676488643, + "grad_norm": 0.23542635142803192, + "learning_rate": 7.24142912614762e-05, + "loss": 1.7942, + "step": 12104 + }, + { + "epoch": 3.7154696132596685, + "grad_norm": 0.22476384043693542, + "learning_rate": 7.240984801562169e-05, + "loss": 1.8235, + "step": 12105 + }, + { + "epoch": 3.715776550030694, + "grad_norm": 0.25123465061187744, + "learning_rate": 7.240540454829992e-05, + "loss": 1.8112, + "step": 12106 + }, + { + "epoch": 3.7160834868017187, + "grad_norm": 0.27230000495910645, + "learning_rate": 7.240096085955483e-05, + "loss": 1.8312, + "step": 12107 + }, + { + "epoch": 3.716390423572744, + "grad_norm": 0.2722976803779602, + "learning_rate": 7.239651694943031e-05, + "loss": 1.8368, + "step": 12108 + }, + { + "epoch": 3.716697360343769, + "grad_norm": 0.264138400554657, + "learning_rate": 7.239207281797028e-05, + "loss": 1.8206, + "step": 12109 + }, + { + "epoch": 3.7170042971147943, + "grad_norm": 0.28813931345939636, + "learning_rate": 7.238762846521866e-05, + "loss": 1.7391, + "step": 12110 + }, + { + "epoch": 3.7173112338858196, + 
"grad_norm": 0.2319631576538086, + "learning_rate": 7.238318389121939e-05, + "loss": 1.7574, + "step": 12111 + }, + { + "epoch": 3.717618170656845, + "grad_norm": 0.2507809102535248, + "learning_rate": 7.237873909601635e-05, + "loss": 1.7359, + "step": 12112 + }, + { + "epoch": 3.71792510742787, + "grad_norm": 0.2717304825782776, + "learning_rate": 7.237429407965351e-05, + "loss": 1.774, + "step": 12113 + }, + { + "epoch": 3.718232044198895, + "grad_norm": 0.2619280517101288, + "learning_rate": 7.236984884217478e-05, + "loss": 1.8083, + "step": 12114 + }, + { + "epoch": 3.71853898096992, + "grad_norm": 0.22268806397914886, + "learning_rate": 7.23654033836241e-05, + "loss": 1.7436, + "step": 12115 + }, + { + "epoch": 3.7188459177409454, + "grad_norm": 0.2341407984495163, + "learning_rate": 7.236095770404539e-05, + "loss": 1.7807, + "step": 12116 + }, + { + "epoch": 3.7191528545119708, + "grad_norm": 0.23519712686538696, + "learning_rate": 7.235651180348258e-05, + "loss": 1.8051, + "step": 12117 + }, + { + "epoch": 3.7194597912829956, + "grad_norm": 0.2391074150800705, + "learning_rate": 7.235206568197963e-05, + "loss": 1.8377, + "step": 12118 + }, + { + "epoch": 3.719766728054021, + "grad_norm": 0.26821592450141907, + "learning_rate": 7.234761933958045e-05, + "loss": 1.8586, + "step": 12119 + }, + { + "epoch": 3.720073664825046, + "grad_norm": 0.24971134960651398, + "learning_rate": 7.234317277632902e-05, + "loss": 1.8404, + "step": 12120 + }, + { + "epoch": 3.720380601596071, + "grad_norm": 0.20817919075489044, + "learning_rate": 7.233872599226926e-05, + "loss": 1.7204, + "step": 12121 + }, + { + "epoch": 3.7206875383670965, + "grad_norm": 0.29301291704177856, + "learning_rate": 7.233427898744509e-05, + "loss": 1.8528, + "step": 12122 + }, + { + "epoch": 3.7209944751381214, + "grad_norm": 0.22214651107788086, + "learning_rate": 7.23298317619005e-05, + "loss": 1.748, + "step": 12123 + }, + { + "epoch": 3.7213014119091468, + "grad_norm": 0.2511044442653656, + 
"learning_rate": 7.232538431567941e-05, + "loss": 1.8146, + "step": 12124 + }, + { + "epoch": 3.7216083486801717, + "grad_norm": 0.26976367831230164, + "learning_rate": 7.232093664882581e-05, + "loss": 1.8483, + "step": 12125 + }, + { + "epoch": 3.721915285451197, + "grad_norm": 0.2538089156150818, + "learning_rate": 7.231648876138361e-05, + "loss": 1.8097, + "step": 12126 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.2353016883134842, + "learning_rate": 7.231204065339682e-05, + "loss": 1.737, + "step": 12127 + }, + { + "epoch": 3.7225291589932477, + "grad_norm": 0.3205147981643677, + "learning_rate": 7.230759232490935e-05, + "loss": 1.8116, + "step": 12128 + }, + { + "epoch": 3.7228360957642725, + "grad_norm": 0.39056599140167236, + "learning_rate": 7.230314377596516e-05, + "loss": 1.7785, + "step": 12129 + }, + { + "epoch": 3.723143032535298, + "grad_norm": 0.3846863806247711, + "learning_rate": 7.229869500660825e-05, + "loss": 1.738, + "step": 12130 + }, + { + "epoch": 3.7234499693063228, + "grad_norm": 0.24412120878696442, + "learning_rate": 7.229424601688256e-05, + "loss": 1.7351, + "step": 12131 + }, + { + "epoch": 3.723756906077348, + "grad_norm": 0.2978009581565857, + "learning_rate": 7.228979680683206e-05, + "loss": 1.8037, + "step": 12132 + }, + { + "epoch": 3.7240638428483734, + "grad_norm": 0.33787262439727783, + "learning_rate": 7.228534737650074e-05, + "loss": 1.8421, + "step": 12133 + }, + { + "epoch": 3.7243707796193983, + "grad_norm": 0.2536921203136444, + "learning_rate": 7.228089772593254e-05, + "loss": 1.7472, + "step": 12134 + }, + { + "epoch": 3.7246777163904237, + "grad_norm": 0.24103601276874542, + "learning_rate": 7.227644785517144e-05, + "loss": 1.8011, + "step": 12135 + }, + { + "epoch": 3.7249846531614486, + "grad_norm": 0.3653033375740051, + "learning_rate": 7.227199776426146e-05, + "loss": 1.8018, + "step": 12136 + }, + { + "epoch": 3.725291589932474, + "grad_norm": 0.35728752613067627, + "learning_rate": 
7.226754745324652e-05, + "loss": 1.7684, + "step": 12137 + }, + { + "epoch": 3.7255985267034992, + "grad_norm": 0.262018620967865, + "learning_rate": 7.226309692217063e-05, + "loss": 1.8124, + "step": 12138 + }, + { + "epoch": 3.725905463474524, + "grad_norm": 0.3467118442058563, + "learning_rate": 7.225864617107776e-05, + "loss": 1.8761, + "step": 12139 + }, + { + "epoch": 3.7262124002455494, + "grad_norm": 0.4365626871585846, + "learning_rate": 7.22541952000119e-05, + "loss": 1.7159, + "step": 12140 + }, + { + "epoch": 3.7265193370165743, + "grad_norm": 0.2819811999797821, + "learning_rate": 7.224974400901705e-05, + "loss": 1.8051, + "step": 12141 + }, + { + "epoch": 3.7268262737875997, + "grad_norm": 0.39062437415122986, + "learning_rate": 7.224529259813719e-05, + "loss": 1.8517, + "step": 12142 + }, + { + "epoch": 3.727133210558625, + "grad_norm": 0.4383927285671234, + "learning_rate": 7.22408409674163e-05, + "loss": 1.8295, + "step": 12143 + }, + { + "epoch": 3.7274401473296503, + "grad_norm": 0.3043094575405121, + "learning_rate": 7.223638911689839e-05, + "loss": 1.7653, + "step": 12144 + }, + { + "epoch": 3.7277470841006752, + "grad_norm": 0.25198984146118164, + "learning_rate": 7.223193704662746e-05, + "loss": 1.7561, + "step": 12145 + }, + { + "epoch": 3.7280540208717006, + "grad_norm": 0.353565514087677, + "learning_rate": 7.222748475664749e-05, + "loss": 1.8077, + "step": 12146 + }, + { + "epoch": 3.7283609576427255, + "grad_norm": 0.39757224917411804, + "learning_rate": 7.222303224700248e-05, + "loss": 1.7622, + "step": 12147 + }, + { + "epoch": 3.728667894413751, + "grad_norm": 0.35595703125, + "learning_rate": 7.221857951773644e-05, + "loss": 1.8436, + "step": 12148 + }, + { + "epoch": 3.728974831184776, + "grad_norm": 0.2469715029001236, + "learning_rate": 7.221412656889338e-05, + "loss": 1.8531, + "step": 12149 + }, + { + "epoch": 3.729281767955801, + "grad_norm": 0.35324424505233765, + "learning_rate": 7.22096734005173e-05, + "loss": 1.7361, + 
"step": 12150 + }, + { + "epoch": 3.7295887047268264, + "grad_norm": 0.3783365488052368, + "learning_rate": 7.220522001265223e-05, + "loss": 1.7459, + "step": 12151 + }, + { + "epoch": 3.7298956414978512, + "grad_norm": 0.27526360750198364, + "learning_rate": 7.220076640534212e-05, + "loss": 1.8867, + "step": 12152 + }, + { + "epoch": 3.7302025782688766, + "grad_norm": 0.30863118171691895, + "learning_rate": 7.219631257863105e-05, + "loss": 1.7363, + "step": 12153 + }, + { + "epoch": 3.730509515039902, + "grad_norm": 0.38505107164382935, + "learning_rate": 7.219185853256301e-05, + "loss": 1.764, + "step": 12154 + }, + { + "epoch": 3.730816451810927, + "grad_norm": 0.2925978899002075, + "learning_rate": 7.218740426718202e-05, + "loss": 1.7693, + "step": 12155 + }, + { + "epoch": 3.731123388581952, + "grad_norm": 0.24510078132152557, + "learning_rate": 7.218294978253209e-05, + "loss": 1.8089, + "step": 12156 + }, + { + "epoch": 3.731430325352977, + "grad_norm": 0.33029109239578247, + "learning_rate": 7.217849507865724e-05, + "loss": 1.6885, + "step": 12157 + }, + { + "epoch": 3.7317372621240024, + "grad_norm": 0.333970308303833, + "learning_rate": 7.217404015560149e-05, + "loss": 1.8132, + "step": 12158 + }, + { + "epoch": 3.7320441988950277, + "grad_norm": 0.2467660754919052, + "learning_rate": 7.216958501340891e-05, + "loss": 1.8021, + "step": 12159 + }, + { + "epoch": 3.732351135666053, + "grad_norm": 0.2701449990272522, + "learning_rate": 7.216512965212348e-05, + "loss": 1.7006, + "step": 12160 + }, + { + "epoch": 3.732658072437078, + "grad_norm": 0.2784138023853302, + "learning_rate": 7.216067407178926e-05, + "loss": 1.7616, + "step": 12161 + }, + { + "epoch": 3.7329650092081033, + "grad_norm": 0.2082870900630951, + "learning_rate": 7.215621827245026e-05, + "loss": 1.7391, + "step": 12162 + }, + { + "epoch": 3.733271945979128, + "grad_norm": 0.2477869987487793, + "learning_rate": 7.215176225415053e-05, + "loss": 1.7761, + "step": 12163 + }, + { + "epoch": 
3.7335788827501535, + "grad_norm": 0.28395572304725647, + "learning_rate": 7.21473060169341e-05, + "loss": 1.8181, + "step": 12164 + }, + { + "epoch": 3.733885819521179, + "grad_norm": 0.20430058240890503, + "learning_rate": 7.2142849560845e-05, + "loss": 1.7035, + "step": 12165 + }, + { + "epoch": 3.7341927562922037, + "grad_norm": 0.30061420798301697, + "learning_rate": 7.21383928859273e-05, + "loss": 1.7703, + "step": 12166 + }, + { + "epoch": 3.734499693063229, + "grad_norm": 0.33865803480148315, + "learning_rate": 7.2133935992225e-05, + "loss": 1.8204, + "step": 12167 + }, + { + "epoch": 3.734806629834254, + "grad_norm": 0.29172980785369873, + "learning_rate": 7.212947887978221e-05, + "loss": 1.739, + "step": 12168 + }, + { + "epoch": 3.7351135666052793, + "grad_norm": 0.2799396812915802, + "learning_rate": 7.212502154864291e-05, + "loss": 1.8503, + "step": 12169 + }, + { + "epoch": 3.7354205033763046, + "grad_norm": 0.2945539355278015, + "learning_rate": 7.212056399885118e-05, + "loss": 1.7523, + "step": 12170 + }, + { + "epoch": 3.7357274401473295, + "grad_norm": 0.2395290732383728, + "learning_rate": 7.211610623045108e-05, + "loss": 1.7728, + "step": 12171 + }, + { + "epoch": 3.736034376918355, + "grad_norm": 0.24369286000728607, + "learning_rate": 7.211164824348667e-05, + "loss": 1.7725, + "step": 12172 + }, + { + "epoch": 3.7363413136893797, + "grad_norm": 0.3272435963153839, + "learning_rate": 7.210719003800197e-05, + "loss": 1.8531, + "step": 12173 + }, + { + "epoch": 3.736648250460405, + "grad_norm": 0.23954182863235474, + "learning_rate": 7.210273161404107e-05, + "loss": 1.7807, + "step": 12174 + }, + { + "epoch": 3.7369551872314304, + "grad_norm": 0.24547603726387024, + "learning_rate": 7.209827297164801e-05, + "loss": 1.8481, + "step": 12175 + }, + { + "epoch": 3.7372621240024557, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.209381411086687e-05, + "loss": 1.7496, + "step": 12176 + }, + { + "epoch": 3.7375690607734806, + "grad_norm": 
0.22948235273361206, + "learning_rate": 7.208935503174172e-05, + "loss": 1.7681, + "step": 12177 + }, + { + "epoch": 3.737875997544506, + "grad_norm": 0.2697654664516449, + "learning_rate": 7.20848957343166e-05, + "loss": 1.789, + "step": 12178 + }, + { + "epoch": 3.738182934315531, + "grad_norm": 0.235344797372818, + "learning_rate": 7.208043621863562e-05, + "loss": 1.8309, + "step": 12179 + }, + { + "epoch": 3.738489871086556, + "grad_norm": 0.2688879072666168, + "learning_rate": 7.20759764847428e-05, + "loss": 1.7898, + "step": 12180 + }, + { + "epoch": 3.7387968078575815, + "grad_norm": 0.26818978786468506, + "learning_rate": 7.207151653268226e-05, + "loss": 1.7882, + "step": 12181 + }, + { + "epoch": 3.7391037446286064, + "grad_norm": 0.2612875998020172, + "learning_rate": 7.206705636249804e-05, + "loss": 1.7352, + "step": 12182 + }, + { + "epoch": 3.7394106813996317, + "grad_norm": 0.22547565400600433, + "learning_rate": 7.206259597423425e-05, + "loss": 1.733, + "step": 12183 + }, + { + "epoch": 3.7397176181706566, + "grad_norm": 0.24645474553108215, + "learning_rate": 7.205813536793495e-05, + "loss": 1.8064, + "step": 12184 + }, + { + "epoch": 3.740024554941682, + "grad_norm": 0.25879329442977905, + "learning_rate": 7.205367454364424e-05, + "loss": 1.8134, + "step": 12185 + }, + { + "epoch": 3.7403314917127073, + "grad_norm": 0.22420097887516022, + "learning_rate": 7.204921350140617e-05, + "loss": 1.7819, + "step": 12186 + }, + { + "epoch": 3.7406384284837326, + "grad_norm": 0.2569858431816101, + "learning_rate": 7.204475224126487e-05, + "loss": 1.784, + "step": 12187 + }, + { + "epoch": 3.7409453652547575, + "grad_norm": 0.23769912123680115, + "learning_rate": 7.20402907632644e-05, + "loss": 1.7853, + "step": 12188 + }, + { + "epoch": 3.741252302025783, + "grad_norm": 0.26935988664627075, + "learning_rate": 7.203582906744885e-05, + "loss": 1.806, + "step": 12189 + }, + { + "epoch": 3.7415592387968077, + "grad_norm": 0.2544274628162384, + "learning_rate": 
7.203136715386233e-05, + "loss": 1.7988, + "step": 12190 + }, + { + "epoch": 3.741866175567833, + "grad_norm": 0.22665882110595703, + "learning_rate": 7.202690502254892e-05, + "loss": 1.7798, + "step": 12191 + }, + { + "epoch": 3.7421731123388584, + "grad_norm": 0.24512888491153717, + "learning_rate": 7.202244267355273e-05, + "loss": 1.816, + "step": 12192 + }, + { + "epoch": 3.7424800491098833, + "grad_norm": 0.2408553808927536, + "learning_rate": 7.201798010691785e-05, + "loss": 1.7417, + "step": 12193 + }, + { + "epoch": 3.7427869858809086, + "grad_norm": 0.23142600059509277, + "learning_rate": 7.201351732268838e-05, + "loss": 1.7771, + "step": 12194 + }, + { + "epoch": 3.7430939226519335, + "grad_norm": 0.245071142911911, + "learning_rate": 7.200905432090844e-05, + "loss": 1.7556, + "step": 12195 + }, + { + "epoch": 3.743400859422959, + "grad_norm": 0.2623934745788574, + "learning_rate": 7.200459110162211e-05, + "loss": 1.8042, + "step": 12196 + }, + { + "epoch": 3.743707796193984, + "grad_norm": 0.2531217038631439, + "learning_rate": 7.200012766487353e-05, + "loss": 1.7709, + "step": 12197 + }, + { + "epoch": 3.744014732965009, + "grad_norm": 0.23839864134788513, + "learning_rate": 7.19956640107068e-05, + "loss": 1.8202, + "step": 12198 + }, + { + "epoch": 3.7443216697360344, + "grad_norm": 0.2342260777950287, + "learning_rate": 7.1991200139166e-05, + "loss": 1.827, + "step": 12199 + }, + { + "epoch": 3.7446286065070593, + "grad_norm": 0.25511276721954346, + "learning_rate": 7.198673605029528e-05, + "loss": 1.7766, + "step": 12200 + }, + { + "epoch": 3.7449355432780846, + "grad_norm": 0.27601274847984314, + "learning_rate": 7.198227174413876e-05, + "loss": 1.7716, + "step": 12201 + }, + { + "epoch": 3.74524248004911, + "grad_norm": 0.3027385175228119, + "learning_rate": 7.197780722074056e-05, + "loss": 1.8007, + "step": 12202 + }, + { + "epoch": 3.7455494168201353, + "grad_norm": 0.31242382526397705, + "learning_rate": 7.197334248014477e-05, + "loss": 1.8089, 
+ "step": 12203 + }, + { + "epoch": 3.74585635359116, + "grad_norm": 0.3673859238624573, + "learning_rate": 7.196887752239551e-05, + "loss": 1.8017, + "step": 12204 + }, + { + "epoch": 3.7461632903621855, + "grad_norm": 0.3152726888656616, + "learning_rate": 7.196441234753695e-05, + "loss": 1.7108, + "step": 12205 + }, + { + "epoch": 3.7464702271332104, + "grad_norm": 0.2606927156448364, + "learning_rate": 7.195994695561319e-05, + "loss": 1.8066, + "step": 12206 + }, + { + "epoch": 3.7467771639042358, + "grad_norm": 0.37624871730804443, + "learning_rate": 7.195548134666836e-05, + "loss": 1.725, + "step": 12207 + }, + { + "epoch": 3.747084100675261, + "grad_norm": 0.4138187766075134, + "learning_rate": 7.195101552074658e-05, + "loss": 1.7838, + "step": 12208 + }, + { + "epoch": 3.747391037446286, + "grad_norm": 0.3668459951877594, + "learning_rate": 7.194654947789204e-05, + "loss": 1.7575, + "step": 12209 + }, + { + "epoch": 3.7476979742173113, + "grad_norm": 0.27947792410850525, + "learning_rate": 7.19420832181488e-05, + "loss": 1.792, + "step": 12210 + }, + { + "epoch": 3.748004910988336, + "grad_norm": 0.2507692873477936, + "learning_rate": 7.193761674156103e-05, + "loss": 1.7752, + "step": 12211 + }, + { + "epoch": 3.7483118477593615, + "grad_norm": 0.3209949731826782, + "learning_rate": 7.193315004817289e-05, + "loss": 1.8491, + "step": 12212 + }, + { + "epoch": 3.748618784530387, + "grad_norm": 0.32883042097091675, + "learning_rate": 7.192868313802849e-05, + "loss": 1.8135, + "step": 12213 + }, + { + "epoch": 3.7489257213014118, + "grad_norm": 0.2450616955757141, + "learning_rate": 7.192421601117201e-05, + "loss": 1.7722, + "step": 12214 + }, + { + "epoch": 3.749232658072437, + "grad_norm": 0.2545110285282135, + "learning_rate": 7.191974866764757e-05, + "loss": 1.7866, + "step": 12215 + }, + { + "epoch": 3.749539594843462, + "grad_norm": 0.264017790555954, + "learning_rate": 7.191528110749932e-05, + "loss": 1.778, + "step": 12216 + }, + { + "epoch": 
3.7498465316144873, + "grad_norm": 0.3156309425830841, + "learning_rate": 7.191081333077142e-05, + "loss": 1.7917, + "step": 12217 + }, + { + "epoch": 3.7501534683855127, + "grad_norm": 0.3578774631023407, + "learning_rate": 7.190634533750802e-05, + "loss": 1.8468, + "step": 12218 + }, + { + "epoch": 3.750460405156538, + "grad_norm": 0.30735981464385986, + "learning_rate": 7.19018771277533e-05, + "loss": 1.7502, + "step": 12219 + }, + { + "epoch": 3.750767341927563, + "grad_norm": 0.22870220243930817, + "learning_rate": 7.189740870155135e-05, + "loss": 1.7686, + "step": 12220 + }, + { + "epoch": 3.7510742786985882, + "grad_norm": 0.30297720432281494, + "learning_rate": 7.18929400589464e-05, + "loss": 1.826, + "step": 12221 + }, + { + "epoch": 3.751381215469613, + "grad_norm": 0.2735389173030853, + "learning_rate": 7.188847119998257e-05, + "loss": 1.8142, + "step": 12222 + }, + { + "epoch": 3.7516881522406385, + "grad_norm": 0.2823885679244995, + "learning_rate": 7.188400212470405e-05, + "loss": 1.8028, + "step": 12223 + }, + { + "epoch": 3.751995089011664, + "grad_norm": 0.4184139370918274, + "learning_rate": 7.187953283315499e-05, + "loss": 1.8467, + "step": 12224 + }, + { + "epoch": 3.7523020257826887, + "grad_norm": 0.3559226095676422, + "learning_rate": 7.187506332537957e-05, + "loss": 1.7416, + "step": 12225 + }, + { + "epoch": 3.752608962553714, + "grad_norm": 0.26055800914764404, + "learning_rate": 7.187059360142194e-05, + "loss": 1.8309, + "step": 12226 + }, + { + "epoch": 3.752915899324739, + "grad_norm": 0.28032660484313965, + "learning_rate": 7.186612366132629e-05, + "loss": 1.7926, + "step": 12227 + }, + { + "epoch": 3.7532228360957642, + "grad_norm": 0.26229965686798096, + "learning_rate": 7.18616535051368e-05, + "loss": 1.7368, + "step": 12228 + }, + { + "epoch": 3.7535297728667896, + "grad_norm": 0.2779417634010315, + "learning_rate": 7.185718313289763e-05, + "loss": 1.8418, + "step": 12229 + }, + { + "epoch": 3.7538367096378145, + "grad_norm": 
0.26164770126342773, + "learning_rate": 7.185271254465295e-05, + "loss": 1.7511, + "step": 12230 + }, + { + "epoch": 3.75414364640884, + "grad_norm": 0.30725157260894775, + "learning_rate": 7.184824174044698e-05, + "loss": 1.7661, + "step": 12231 + }, + { + "epoch": 3.7544505831798647, + "grad_norm": 0.33111417293548584, + "learning_rate": 7.184377072032386e-05, + "loss": 1.7341, + "step": 12232 + }, + { + "epoch": 3.75475751995089, + "grad_norm": 0.23978343605995178, + "learning_rate": 7.183929948432779e-05, + "loss": 1.7151, + "step": 12233 + }, + { + "epoch": 3.7550644567219154, + "grad_norm": 0.3057664632797241, + "learning_rate": 7.183482803250299e-05, + "loss": 1.8446, + "step": 12234 + }, + { + "epoch": 3.7553713934929407, + "grad_norm": 0.2629055678844452, + "learning_rate": 7.18303563648936e-05, + "loss": 1.7415, + "step": 12235 + }, + { + "epoch": 3.7556783302639656, + "grad_norm": 0.22703498601913452, + "learning_rate": 7.182588448154386e-05, + "loss": 1.8188, + "step": 12236 + }, + { + "epoch": 3.755985267034991, + "grad_norm": 0.3014034032821655, + "learning_rate": 7.182141238249792e-05, + "loss": 1.8634, + "step": 12237 + }, + { + "epoch": 3.756292203806016, + "grad_norm": 0.28859084844589233, + "learning_rate": 7.181694006779998e-05, + "loss": 1.7509, + "step": 12238 + }, + { + "epoch": 3.756599140577041, + "grad_norm": 0.293720543384552, + "learning_rate": 7.181246753749426e-05, + "loss": 1.777, + "step": 12239 + }, + { + "epoch": 3.7569060773480665, + "grad_norm": 0.2374580055475235, + "learning_rate": 7.180799479162496e-05, + "loss": 1.7492, + "step": 12240 + }, + { + "epoch": 3.7572130141190914, + "grad_norm": 0.30106452107429504, + "learning_rate": 7.180352183023627e-05, + "loss": 1.7538, + "step": 12241 + }, + { + "epoch": 3.7575199508901167, + "grad_norm": 0.3504682183265686, + "learning_rate": 7.179904865337238e-05, + "loss": 1.7477, + "step": 12242 + }, + { + "epoch": 3.7578268876611416, + "grad_norm": 0.2901679575443268, + "learning_rate": 
7.179457526107754e-05, + "loss": 1.9412, + "step": 12243 + }, + { + "epoch": 3.758133824432167, + "grad_norm": 0.37690606713294983, + "learning_rate": 7.179010165339591e-05, + "loss": 1.8222, + "step": 12244 + }, + { + "epoch": 3.7584407612031923, + "grad_norm": 0.45126965641975403, + "learning_rate": 7.178562783037172e-05, + "loss": 1.8563, + "step": 12245 + }, + { + "epoch": 3.758747697974217, + "grad_norm": 0.2747548818588257, + "learning_rate": 7.178115379204921e-05, + "loss": 1.7179, + "step": 12246 + }, + { + "epoch": 3.7590546347452425, + "grad_norm": 0.43243977427482605, + "learning_rate": 7.177667953847257e-05, + "loss": 1.8157, + "step": 12247 + }, + { + "epoch": 3.7593615715162674, + "grad_norm": 0.529448390007019, + "learning_rate": 7.177220506968602e-05, + "loss": 1.8113, + "step": 12248 + }, + { + "epoch": 3.7596685082872927, + "grad_norm": 0.3099314868450165, + "learning_rate": 7.176773038573377e-05, + "loss": 1.7833, + "step": 12249 + }, + { + "epoch": 3.759975445058318, + "grad_norm": 0.3111872375011444, + "learning_rate": 7.176325548666004e-05, + "loss": 1.7965, + "step": 12250 + }, + { + "epoch": 3.7602823818293434, + "grad_norm": 0.38437551259994507, + "learning_rate": 7.175878037250907e-05, + "loss": 1.7822, + "step": 12251 + }, + { + "epoch": 3.7605893186003683, + "grad_norm": 0.33643704652786255, + "learning_rate": 7.175430504332509e-05, + "loss": 1.7839, + "step": 12252 + }, + { + "epoch": 3.7608962553713936, + "grad_norm": 0.24705304205417633, + "learning_rate": 7.174982949915232e-05, + "loss": 1.8302, + "step": 12253 + }, + { + "epoch": 3.7612031921424185, + "grad_norm": 0.3615458309650421, + "learning_rate": 7.174535374003497e-05, + "loss": 1.7963, + "step": 12254 + }, + { + "epoch": 3.761510128913444, + "grad_norm": 0.36486589908599854, + "learning_rate": 7.17408777660173e-05, + "loss": 1.7933, + "step": 12255 + }, + { + "epoch": 3.761817065684469, + "grad_norm": 0.2566867172718048, + "learning_rate": 7.173640157714352e-05, + "loss": 
1.7254, + "step": 12256 + }, + { + "epoch": 3.762124002455494, + "grad_norm": 0.2602523863315582, + "learning_rate": 7.17319251734579e-05, + "loss": 1.7357, + "step": 12257 + }, + { + "epoch": 3.7624309392265194, + "grad_norm": 0.3626105785369873, + "learning_rate": 7.172744855500464e-05, + "loss": 1.7971, + "step": 12258 + }, + { + "epoch": 3.7627378759975443, + "grad_norm": 0.36327603459358215, + "learning_rate": 7.172297172182802e-05, + "loss": 1.7819, + "step": 12259 + }, + { + "epoch": 3.7630448127685696, + "grad_norm": 0.25935736298561096, + "learning_rate": 7.171849467397224e-05, + "loss": 1.8112, + "step": 12260 + }, + { + "epoch": 3.763351749539595, + "grad_norm": 0.2779700756072998, + "learning_rate": 7.171401741148156e-05, + "loss": 1.786, + "step": 12261 + }, + { + "epoch": 3.7636586863106203, + "grad_norm": 0.3089013695716858, + "learning_rate": 7.170953993440025e-05, + "loss": 1.7808, + "step": 12262 + }, + { + "epoch": 3.763965623081645, + "grad_norm": 0.2562308609485626, + "learning_rate": 7.170506224277253e-05, + "loss": 1.8207, + "step": 12263 + }, + { + "epoch": 3.7642725598526705, + "grad_norm": 0.2907634973526001, + "learning_rate": 7.170058433664268e-05, + "loss": 1.7638, + "step": 12264 + }, + { + "epoch": 3.7645794966236954, + "grad_norm": 0.30341312289237976, + "learning_rate": 7.169610621605493e-05, + "loss": 1.7827, + "step": 12265 + }, + { + "epoch": 3.7648864333947207, + "grad_norm": 0.27091866731643677, + "learning_rate": 7.169162788105353e-05, + "loss": 1.786, + "step": 12266 + }, + { + "epoch": 3.765193370165746, + "grad_norm": 0.234042689204216, + "learning_rate": 7.168714933168277e-05, + "loss": 1.7638, + "step": 12267 + }, + { + "epoch": 3.765500306936771, + "grad_norm": 0.2477465271949768, + "learning_rate": 7.168267056798686e-05, + "loss": 1.7275, + "step": 12268 + }, + { + "epoch": 3.7658072437077963, + "grad_norm": 0.25578543543815613, + "learning_rate": 7.167819159001012e-05, + "loss": 1.7831, + "step": 12269 + }, + { + 
"epoch": 3.766114180478821, + "grad_norm": 0.26629674434661865, + "learning_rate": 7.167371239779678e-05, + "loss": 1.7866, + "step": 12270 + }, + { + "epoch": 3.7664211172498465, + "grad_norm": 0.31350967288017273, + "learning_rate": 7.16692329913911e-05, + "loss": 1.7755, + "step": 12271 + }, + { + "epoch": 3.766728054020872, + "grad_norm": 0.2670116126537323, + "learning_rate": 7.166475337083735e-05, + "loss": 1.7524, + "step": 12272 + }, + { + "epoch": 3.7670349907918967, + "grad_norm": 0.26503682136535645, + "learning_rate": 7.166027353617983e-05, + "loss": 1.7867, + "step": 12273 + }, + { + "epoch": 3.767341927562922, + "grad_norm": 0.3674192428588867, + "learning_rate": 7.165579348746278e-05, + "loss": 1.7604, + "step": 12274 + }, + { + "epoch": 3.767648864333947, + "grad_norm": 0.4120824337005615, + "learning_rate": 7.16513132247305e-05, + "loss": 1.7905, + "step": 12275 + }, + { + "epoch": 3.7679558011049723, + "grad_norm": 0.29074826836586, + "learning_rate": 7.164683274802723e-05, + "loss": 1.7539, + "step": 12276 + }, + { + "epoch": 3.7682627378759976, + "grad_norm": 0.22223204374313354, + "learning_rate": 7.164235205739729e-05, + "loss": 1.755, + "step": 12277 + }, + { + "epoch": 3.768569674647023, + "grad_norm": 0.23997461795806885, + "learning_rate": 7.163787115288494e-05, + "loss": 1.8024, + "step": 12278 + }, + { + "epoch": 3.768876611418048, + "grad_norm": 0.2556418776512146, + "learning_rate": 7.163339003453445e-05, + "loss": 1.7717, + "step": 12279 + }, + { + "epoch": 3.769183548189073, + "grad_norm": 0.3107141852378845, + "learning_rate": 7.162890870239013e-05, + "loss": 1.8257, + "step": 12280 + }, + { + "epoch": 3.769490484960098, + "grad_norm": 0.35293644666671753, + "learning_rate": 7.162442715649627e-05, + "loss": 1.7855, + "step": 12281 + }, + { + "epoch": 3.7697974217311234, + "grad_norm": 0.25989311933517456, + "learning_rate": 7.161994539689713e-05, + "loss": 1.7816, + "step": 12282 + }, + { + "epoch": 3.7701043585021488, + 
"grad_norm": 0.25615137815475464, + "learning_rate": 7.161546342363701e-05, + "loss": 1.7738, + "step": 12283 + }, + { + "epoch": 3.7704112952731736, + "grad_norm": 0.29345229268074036, + "learning_rate": 7.161098123676023e-05, + "loss": 1.8496, + "step": 12284 + }, + { + "epoch": 3.770718232044199, + "grad_norm": 0.2975969612598419, + "learning_rate": 7.160649883631105e-05, + "loss": 1.7342, + "step": 12285 + }, + { + "epoch": 3.771025168815224, + "grad_norm": 0.28458064794540405, + "learning_rate": 7.16020162223338e-05, + "loss": 1.8253, + "step": 12286 + }, + { + "epoch": 3.771332105586249, + "grad_norm": 0.2798703908920288, + "learning_rate": 7.159753339487276e-05, + "loss": 1.746, + "step": 12287 + }, + { + "epoch": 3.7716390423572745, + "grad_norm": 0.380044549703598, + "learning_rate": 7.159305035397223e-05, + "loss": 1.769, + "step": 12288 + }, + { + "epoch": 3.7719459791282994, + "grad_norm": 0.28760263323783875, + "learning_rate": 7.158856709967654e-05, + "loss": 1.7466, + "step": 12289 + }, + { + "epoch": 3.7722529158993248, + "grad_norm": 0.23314130306243896, + "learning_rate": 7.158408363202996e-05, + "loss": 1.7545, + "step": 12290 + }, + { + "epoch": 3.7725598526703497, + "grad_norm": 0.2864209711551666, + "learning_rate": 7.15795999510768e-05, + "loss": 1.7549, + "step": 12291 + }, + { + "epoch": 3.772866789441375, + "grad_norm": 0.2605510354042053, + "learning_rate": 7.15751160568614e-05, + "loss": 1.7684, + "step": 12292 + }, + { + "epoch": 3.7731737262124003, + "grad_norm": 0.2475409358739853, + "learning_rate": 7.157063194942806e-05, + "loss": 1.7841, + "step": 12293 + }, + { + "epoch": 3.7734806629834257, + "grad_norm": 0.22479289770126343, + "learning_rate": 7.15661476288211e-05, + "loss": 1.7592, + "step": 12294 + }, + { + "epoch": 3.7737875997544506, + "grad_norm": 0.22076937556266785, + "learning_rate": 7.156166309508482e-05, + "loss": 1.7853, + "step": 12295 + }, + { + "epoch": 3.774094536525476, + "grad_norm": 0.26082465052604675, + 
"learning_rate": 7.155717834826353e-05, + "loss": 1.7828, + "step": 12296 + }, + { + "epoch": 3.7744014732965008, + "grad_norm": 0.24771755933761597, + "learning_rate": 7.15526933884016e-05, + "loss": 1.758, + "step": 12297 + }, + { + "epoch": 3.774708410067526, + "grad_norm": 0.23806311190128326, + "learning_rate": 7.15482082155433e-05, + "loss": 1.7237, + "step": 12298 + }, + { + "epoch": 3.7750153468385514, + "grad_norm": 0.24822844564914703, + "learning_rate": 7.154372282973299e-05, + "loss": 1.7828, + "step": 12299 + }, + { + "epoch": 3.7753222836095763, + "grad_norm": 0.24423740804195404, + "learning_rate": 7.153923723101496e-05, + "loss": 1.8014, + "step": 12300 + }, + { + "epoch": 3.7756292203806017, + "grad_norm": 0.24966634809970856, + "learning_rate": 7.15347514194336e-05, + "loss": 1.8005, + "step": 12301 + }, + { + "epoch": 3.7759361571516266, + "grad_norm": 0.2549348473548889, + "learning_rate": 7.153026539503317e-05, + "loss": 1.8473, + "step": 12302 + }, + { + "epoch": 3.776243093922652, + "grad_norm": 0.23709465563297272, + "learning_rate": 7.152577915785807e-05, + "loss": 1.8031, + "step": 12303 + }, + { + "epoch": 3.7765500306936772, + "grad_norm": 0.28554168343544006, + "learning_rate": 7.152129270795258e-05, + "loss": 1.7836, + "step": 12304 + }, + { + "epoch": 3.776856967464702, + "grad_norm": 0.2568756639957428, + "learning_rate": 7.151680604536107e-05, + "loss": 1.7345, + "step": 12305 + }, + { + "epoch": 3.7771639042357275, + "grad_norm": 0.23883797228336334, + "learning_rate": 7.151231917012787e-05, + "loss": 1.7342, + "step": 12306 + }, + { + "epoch": 3.7774708410067523, + "grad_norm": 0.24026677012443542, + "learning_rate": 7.150783208229732e-05, + "loss": 1.8156, + "step": 12307 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.25756222009658813, + "learning_rate": 7.150334478191376e-05, + "loss": 1.8204, + "step": 12308 + }, + { + "epoch": 3.778084714548803, + "grad_norm": 0.24917428195476532, + "learning_rate": 
7.149885726902156e-05, + "loss": 1.7867, + "step": 12309 + }, + { + "epoch": 3.7783916513198283, + "grad_norm": 0.26269277930259705, + "learning_rate": 7.149436954366504e-05, + "loss": 1.8233, + "step": 12310 + }, + { + "epoch": 3.7786985880908532, + "grad_norm": 0.2502293586730957, + "learning_rate": 7.148988160588857e-05, + "loss": 1.8329, + "step": 12311 + }, + { + "epoch": 3.7790055248618786, + "grad_norm": 0.24845796823501587, + "learning_rate": 7.14853934557365e-05, + "loss": 1.7936, + "step": 12312 + }, + { + "epoch": 3.7793124616329035, + "grad_norm": 0.2453537881374359, + "learning_rate": 7.148090509325315e-05, + "loss": 1.8149, + "step": 12313 + }, + { + "epoch": 3.779619398403929, + "grad_norm": 0.2336922138929367, + "learning_rate": 7.147641651848293e-05, + "loss": 1.7826, + "step": 12314 + }, + { + "epoch": 3.779926335174954, + "grad_norm": 0.25542667508125305, + "learning_rate": 7.147192773147017e-05, + "loss": 1.801, + "step": 12315 + }, + { + "epoch": 3.780233271945979, + "grad_norm": 0.2301866114139557, + "learning_rate": 7.146743873225923e-05, + "loss": 1.7302, + "step": 12316 + }, + { + "epoch": 3.7805402087170044, + "grad_norm": 0.25821468234062195, + "learning_rate": 7.14629495208945e-05, + "loss": 1.7704, + "step": 12317 + }, + { + "epoch": 3.7808471454880292, + "grad_norm": 0.22537970542907715, + "learning_rate": 7.145846009742029e-05, + "loss": 1.7281, + "step": 12318 + }, + { + "epoch": 3.7811540822590546, + "grad_norm": 0.2565869688987732, + "learning_rate": 7.145397046188102e-05, + "loss": 1.8077, + "step": 12319 + }, + { + "epoch": 3.78146101903008, + "grad_norm": 0.2588396966457367, + "learning_rate": 7.144948061432105e-05, + "loss": 1.7438, + "step": 12320 + }, + { + "epoch": 3.781767955801105, + "grad_norm": 0.2538135349750519, + "learning_rate": 7.144499055478472e-05, + "loss": 1.8253, + "step": 12321 + }, + { + "epoch": 3.78207489257213, + "grad_norm": 0.2272680401802063, + "learning_rate": 7.144050028331644e-05, + "loss": 1.7408, + 
"step": 12322 + }, + { + "epoch": 3.782381829343155, + "grad_norm": 0.25010406970977783, + "learning_rate": 7.143600979996055e-05, + "loss": 1.8219, + "step": 12323 + }, + { + "epoch": 3.7826887661141804, + "grad_norm": 0.2560291290283203, + "learning_rate": 7.143151910476144e-05, + "loss": 1.7734, + "step": 12324 + }, + { + "epoch": 3.7829957028852057, + "grad_norm": 0.24927431344985962, + "learning_rate": 7.142702819776352e-05, + "loss": 1.7682, + "step": 12325 + }, + { + "epoch": 3.783302639656231, + "grad_norm": 0.2501368224620819, + "learning_rate": 7.142253707901114e-05, + "loss": 1.818, + "step": 12326 + }, + { + "epoch": 3.783609576427256, + "grad_norm": 0.3132917284965515, + "learning_rate": 7.141804574854871e-05, + "loss": 1.7793, + "step": 12327 + }, + { + "epoch": 3.7839165131982813, + "grad_norm": 0.24229925870895386, + "learning_rate": 7.141355420642057e-05, + "loss": 1.7585, + "step": 12328 + }, + { + "epoch": 3.784223449969306, + "grad_norm": 0.22612906992435455, + "learning_rate": 7.140906245267116e-05, + "loss": 1.7374, + "step": 12329 + }, + { + "epoch": 3.7845303867403315, + "grad_norm": 0.26354333758354187, + "learning_rate": 7.140457048734482e-05, + "loss": 1.7751, + "step": 12330 + }, + { + "epoch": 3.784837323511357, + "grad_norm": 0.21500451862812042, + "learning_rate": 7.140007831048599e-05, + "loss": 1.7827, + "step": 12331 + }, + { + "epoch": 3.7851442602823817, + "grad_norm": 0.2826332151889801, + "learning_rate": 7.139558592213904e-05, + "loss": 1.7522, + "step": 12332 + }, + { + "epoch": 3.785451197053407, + "grad_norm": 0.3217725455760956, + "learning_rate": 7.139109332234837e-05, + "loss": 1.8758, + "step": 12333 + }, + { + "epoch": 3.785758133824432, + "grad_norm": 0.26934614777565, + "learning_rate": 7.138660051115837e-05, + "loss": 1.8322, + "step": 12334 + }, + { + "epoch": 3.7860650705954573, + "grad_norm": 0.2653827667236328, + "learning_rate": 7.138210748861346e-05, + "loss": 1.7651, + "step": 12335 + }, + { + "epoch": 
3.7863720073664826, + "grad_norm": 0.30470311641693115, + "learning_rate": 7.137761425475802e-05, + "loss": 1.855, + "step": 12336 + }, + { + "epoch": 3.786678944137508, + "grad_norm": 0.2558726370334625, + "learning_rate": 7.137312080963647e-05, + "loss": 1.7174, + "step": 12337 + }, + { + "epoch": 3.786985880908533, + "grad_norm": 0.24025602638721466, + "learning_rate": 7.136862715329322e-05, + "loss": 1.7565, + "step": 12338 + }, + { + "epoch": 3.787292817679558, + "grad_norm": 0.34205392003059387, + "learning_rate": 7.136413328577267e-05, + "loss": 1.8116, + "step": 12339 + }, + { + "epoch": 3.787599754450583, + "grad_norm": 0.4069152772426605, + "learning_rate": 7.135963920711923e-05, + "loss": 1.7662, + "step": 12340 + }, + { + "epoch": 3.7879066912216084, + "grad_norm": 0.3915627598762512, + "learning_rate": 7.13551449173773e-05, + "loss": 1.81, + "step": 12341 + }, + { + "epoch": 3.7882136279926337, + "grad_norm": 0.27136507630348206, + "learning_rate": 7.135065041659134e-05, + "loss": 1.7845, + "step": 12342 + }, + { + "epoch": 3.7885205647636586, + "grad_norm": 0.2924078106880188, + "learning_rate": 7.134615570480572e-05, + "loss": 1.8606, + "step": 12343 + }, + { + "epoch": 3.788827501534684, + "grad_norm": 0.35581526160240173, + "learning_rate": 7.134166078206488e-05, + "loss": 1.7785, + "step": 12344 + }, + { + "epoch": 3.789134438305709, + "grad_norm": 0.3003756105899811, + "learning_rate": 7.133716564841324e-05, + "loss": 1.7321, + "step": 12345 + }, + { + "epoch": 3.789441375076734, + "grad_norm": 0.2586000859737396, + "learning_rate": 7.133267030389524e-05, + "loss": 1.7889, + "step": 12346 + }, + { + "epoch": 3.7897483118477595, + "grad_norm": 0.28053075075149536, + "learning_rate": 7.132817474855527e-05, + "loss": 1.8216, + "step": 12347 + }, + { + "epoch": 3.7900552486187844, + "grad_norm": 0.3064870834350586, + "learning_rate": 7.132367898243777e-05, + "loss": 1.7528, + "step": 12348 + }, + { + "epoch": 3.7903621853898097, + "grad_norm": 
0.3045158386230469, + "learning_rate": 7.131918300558719e-05, + "loss": 1.8251, + "step": 12349 + }, + { + "epoch": 3.7906691221608346, + "grad_norm": 0.2438485324382782, + "learning_rate": 7.131468681804794e-05, + "loss": 1.7505, + "step": 12350 + }, + { + "epoch": 3.79097605893186, + "grad_norm": 0.24239958822727203, + "learning_rate": 7.131019041986447e-05, + "loss": 1.7544, + "step": 12351 + }, + { + "epoch": 3.7912829957028853, + "grad_norm": 0.24632441997528076, + "learning_rate": 7.130569381108121e-05, + "loss": 1.7485, + "step": 12352 + }, + { + "epoch": 3.7915899324739106, + "grad_norm": 0.22553624212741852, + "learning_rate": 7.13011969917426e-05, + "loss": 1.803, + "step": 12353 + }, + { + "epoch": 3.7918968692449355, + "grad_norm": 0.2164420485496521, + "learning_rate": 7.129669996189306e-05, + "loss": 1.7307, + "step": 12354 + }, + { + "epoch": 3.792203806015961, + "grad_norm": 0.25104281306266785, + "learning_rate": 7.129220272157705e-05, + "loss": 1.8154, + "step": 12355 + }, + { + "epoch": 3.7925107427869857, + "grad_norm": 0.25533202290534973, + "learning_rate": 7.128770527083903e-05, + "loss": 1.8046, + "step": 12356 + }, + { + "epoch": 3.792817679558011, + "grad_norm": 0.24428130686283112, + "learning_rate": 7.128320760972341e-05, + "loss": 1.7984, + "step": 12357 + }, + { + "epoch": 3.7931246163290364, + "grad_norm": 0.2366408109664917, + "learning_rate": 7.127870973827467e-05, + "loss": 1.7781, + "step": 12358 + }, + { + "epoch": 3.7934315531000613, + "grad_norm": 0.2558888792991638, + "learning_rate": 7.127421165653722e-05, + "loss": 1.7858, + "step": 12359 + }, + { + "epoch": 3.7937384898710866, + "grad_norm": 0.25825443863868713, + "learning_rate": 7.126971336455558e-05, + "loss": 1.8292, + "step": 12360 + }, + { + "epoch": 3.7940454266421115, + "grad_norm": 0.2554624080657959, + "learning_rate": 7.126521486237415e-05, + "loss": 1.822, + "step": 12361 + }, + { + "epoch": 3.794352363413137, + "grad_norm": 0.3030763268470764, + 
"learning_rate": 7.126071615003742e-05, + "loss": 1.8261, + "step": 12362 + }, + { + "epoch": 3.794659300184162, + "grad_norm": 0.3047907054424286, + "learning_rate": 7.125621722758981e-05, + "loss": 1.8419, + "step": 12363 + }, + { + "epoch": 3.794966236955187, + "grad_norm": 0.27782654762268066, + "learning_rate": 7.12517180950758e-05, + "loss": 1.7959, + "step": 12364 + }, + { + "epoch": 3.7952731737262124, + "grad_norm": 0.24526572227478027, + "learning_rate": 7.124721875253986e-05, + "loss": 1.7313, + "step": 12365 + }, + { + "epoch": 3.7955801104972373, + "grad_norm": 0.23718179762363434, + "learning_rate": 7.124271920002646e-05, + "loss": 1.7479, + "step": 12366 + }, + { + "epoch": 3.7958870472682626, + "grad_norm": 0.2880019247531891, + "learning_rate": 7.123821943758004e-05, + "loss": 1.7792, + "step": 12367 + }, + { + "epoch": 3.796193984039288, + "grad_norm": 0.28923723101615906, + "learning_rate": 7.123371946524511e-05, + "loss": 1.7474, + "step": 12368 + }, + { + "epoch": 3.7965009208103133, + "grad_norm": 0.2281525880098343, + "learning_rate": 7.122921928306612e-05, + "loss": 1.8106, + "step": 12369 + }, + { + "epoch": 3.796807857581338, + "grad_norm": 0.34825438261032104, + "learning_rate": 7.122471889108752e-05, + "loss": 1.8076, + "step": 12370 + }, + { + "epoch": 3.7971147943523635, + "grad_norm": 0.41145995259284973, + "learning_rate": 7.122021828935382e-05, + "loss": 1.7692, + "step": 12371 + }, + { + "epoch": 3.7974217311233884, + "grad_norm": 0.31711262464523315, + "learning_rate": 7.12157174779095e-05, + "loss": 1.8101, + "step": 12372 + }, + { + "epoch": 3.7977286678944138, + "grad_norm": 0.3044308125972748, + "learning_rate": 7.1211216456799e-05, + "loss": 1.8238, + "step": 12373 + }, + { + "epoch": 3.798035604665439, + "grad_norm": 0.3750055134296417, + "learning_rate": 7.120671522606683e-05, + "loss": 1.7323, + "step": 12374 + }, + { + "epoch": 3.798342541436464, + "grad_norm": 0.38852599263191223, + "learning_rate": 
7.120221378575749e-05, + "loss": 1.8402, + "step": 12375 + }, + { + "epoch": 3.7986494782074893, + "grad_norm": 0.3430371582508087, + "learning_rate": 7.119771213591541e-05, + "loss": 1.8369, + "step": 12376 + }, + { + "epoch": 3.798956414978514, + "grad_norm": 0.4787428677082062, + "learning_rate": 7.119321027658515e-05, + "loss": 1.7977, + "step": 12377 + }, + { + "epoch": 3.7992633517495396, + "grad_norm": 0.4263977110385895, + "learning_rate": 7.118870820781114e-05, + "loss": 1.8208, + "step": 12378 + }, + { + "epoch": 3.799570288520565, + "grad_norm": 0.28649669885635376, + "learning_rate": 7.118420592963793e-05, + "loss": 1.773, + "step": 12379 + }, + { + "epoch": 3.7998772252915898, + "grad_norm": 0.26070261001586914, + "learning_rate": 7.117970344210996e-05, + "loss": 1.6866, + "step": 12380 + }, + { + "epoch": 3.800184162062615, + "grad_norm": 0.30127593874931335, + "learning_rate": 7.117520074527173e-05, + "loss": 1.7208, + "step": 12381 + }, + { + "epoch": 3.80049109883364, + "grad_norm": 0.23639258742332458, + "learning_rate": 7.117069783916777e-05, + "loss": 1.7504, + "step": 12382 + }, + { + "epoch": 3.8007980356046653, + "grad_norm": 0.2852858901023865, + "learning_rate": 7.116619472384256e-05, + "loss": 1.7954, + "step": 12383 + }, + { + "epoch": 3.8011049723756907, + "grad_norm": 0.2673225998878479, + "learning_rate": 7.116169139934063e-05, + "loss": 1.7562, + "step": 12384 + }, + { + "epoch": 3.801411909146716, + "grad_norm": 0.21615394949913025, + "learning_rate": 7.115718786570644e-05, + "loss": 1.7126, + "step": 12385 + }, + { + "epoch": 3.801718845917741, + "grad_norm": 0.2165435254573822, + "learning_rate": 7.115268412298453e-05, + "loss": 1.7171, + "step": 12386 + }, + { + "epoch": 3.8020257826887662, + "grad_norm": 0.280564546585083, + "learning_rate": 7.114818017121939e-05, + "loss": 1.7711, + "step": 12387 + }, + { + "epoch": 3.802332719459791, + "grad_norm": 0.3023521304130554, + "learning_rate": 7.114367601045555e-05, + "loss": 1.7538, 
+ "step": 12388 + }, + { + "epoch": 3.8026396562308165, + "grad_norm": 0.27252480387687683, + "learning_rate": 7.11391716407375e-05, + "loss": 1.7604, + "step": 12389 + }, + { + "epoch": 3.802946593001842, + "grad_norm": 0.2122909128665924, + "learning_rate": 7.113466706210976e-05, + "loss": 1.716, + "step": 12390 + }, + { + "epoch": 3.8032535297728667, + "grad_norm": 0.30141574144363403, + "learning_rate": 7.113016227461686e-05, + "loss": 1.7636, + "step": 12391 + }, + { + "epoch": 3.803560466543892, + "grad_norm": 0.33359697461128235, + "learning_rate": 7.112565727830331e-05, + "loss": 1.7805, + "step": 12392 + }, + { + "epoch": 3.803867403314917, + "grad_norm": 0.3161376714706421, + "learning_rate": 7.112115207321364e-05, + "loss": 1.7974, + "step": 12393 + }, + { + "epoch": 3.8041743400859422, + "grad_norm": 0.29028698801994324, + "learning_rate": 7.111664665939235e-05, + "loss": 1.83, + "step": 12394 + }, + { + "epoch": 3.8044812768569676, + "grad_norm": 0.38829556107521057, + "learning_rate": 7.1112141036884e-05, + "loss": 1.8684, + "step": 12395 + }, + { + "epoch": 3.804788213627993, + "grad_norm": 0.4118283987045288, + "learning_rate": 7.110763520573309e-05, + "loss": 1.7812, + "step": 12396 + }, + { + "epoch": 3.805095150399018, + "grad_norm": 0.3907717168331146, + "learning_rate": 7.110312916598416e-05, + "loss": 1.7789, + "step": 12397 + }, + { + "epoch": 3.805402087170043, + "grad_norm": 0.2768644690513611, + "learning_rate": 7.109862291768173e-05, + "loss": 1.8575, + "step": 12398 + }, + { + "epoch": 3.805709023941068, + "grad_norm": 0.3234006464481354, + "learning_rate": 7.109411646087035e-05, + "loss": 1.7485, + "step": 12399 + }, + { + "epoch": 3.8060159607120934, + "grad_norm": 0.415475994348526, + "learning_rate": 7.108960979559454e-05, + "loss": 1.7363, + "step": 12400 + }, + { + "epoch": 3.8063228974831187, + "grad_norm": 0.38654613494873047, + "learning_rate": 7.108510292189884e-05, + "loss": 1.7907, + "step": 12401 + }, + { + "epoch": 
3.8066298342541436, + "grad_norm": 0.2541481852531433, + "learning_rate": 7.10805958398278e-05, + "loss": 1.8458, + "step": 12402 + }, + { + "epoch": 3.806936771025169, + "grad_norm": 0.32562851905822754, + "learning_rate": 7.107608854942597e-05, + "loss": 1.7989, + "step": 12403 + }, + { + "epoch": 3.807243707796194, + "grad_norm": 0.3628395199775696, + "learning_rate": 7.107158105073786e-05, + "loss": 1.8044, + "step": 12404 + }, + { + "epoch": 3.807550644567219, + "grad_norm": 0.3363969027996063, + "learning_rate": 7.106707334380805e-05, + "loss": 1.8078, + "step": 12405 + }, + { + "epoch": 3.8078575813382445, + "grad_norm": 0.2853989601135254, + "learning_rate": 7.106256542868108e-05, + "loss": 1.7913, + "step": 12406 + }, + { + "epoch": 3.8081645181092694, + "grad_norm": 0.33455806970596313, + "learning_rate": 7.105805730540148e-05, + "loss": 1.7252, + "step": 12407 + }, + { + "epoch": 3.8084714548802947, + "grad_norm": 0.28103405237197876, + "learning_rate": 7.105354897401382e-05, + "loss": 1.6942, + "step": 12408 + }, + { + "epoch": 3.8087783916513196, + "grad_norm": 0.23230718076229095, + "learning_rate": 7.104904043456264e-05, + "loss": 1.7723, + "step": 12409 + }, + { + "epoch": 3.809085328422345, + "grad_norm": 0.2883053421974182, + "learning_rate": 7.104453168709251e-05, + "loss": 1.8015, + "step": 12410 + }, + { + "epoch": 3.8093922651933703, + "grad_norm": 0.28462252020835876, + "learning_rate": 7.104002273164798e-05, + "loss": 1.791, + "step": 12411 + }, + { + "epoch": 3.8096992019643956, + "grad_norm": 0.3004699647426605, + "learning_rate": 7.103551356827363e-05, + "loss": 1.8401, + "step": 12412 + }, + { + "epoch": 3.8100061387354205, + "grad_norm": 0.2546156048774719, + "learning_rate": 7.1031004197014e-05, + "loss": 1.7645, + "step": 12413 + }, + { + "epoch": 3.810313075506446, + "grad_norm": 0.24532915651798248, + "learning_rate": 7.102649461791364e-05, + "loss": 1.8, + "step": 12414 + }, + { + "epoch": 3.8106200122774707, + "grad_norm": 
0.2432405799627304, + "learning_rate": 7.102198483101716e-05, + "loss": 1.7957, + "step": 12415 + }, + { + "epoch": 3.810926949048496, + "grad_norm": 0.24405215680599213, + "learning_rate": 7.101747483636908e-05, + "loss": 1.79, + "step": 12416 + }, + { + "epoch": 3.8112338858195214, + "grad_norm": 0.29519838094711304, + "learning_rate": 7.101296463401401e-05, + "loss": 1.8087, + "step": 12417 + }, + { + "epoch": 3.8115408225905463, + "grad_norm": 0.28205612301826477, + "learning_rate": 7.100845422399652e-05, + "loss": 1.7897, + "step": 12418 + }, + { + "epoch": 3.8118477593615716, + "grad_norm": 0.25014567375183105, + "learning_rate": 7.100394360636115e-05, + "loss": 1.7574, + "step": 12419 + }, + { + "epoch": 3.8121546961325965, + "grad_norm": 0.3133499026298523, + "learning_rate": 7.099943278115251e-05, + "loss": 1.7957, + "step": 12420 + }, + { + "epoch": 3.812461632903622, + "grad_norm": 0.3706473708152771, + "learning_rate": 7.099492174841516e-05, + "loss": 1.8519, + "step": 12421 + }, + { + "epoch": 3.812768569674647, + "grad_norm": 0.30085715651512146, + "learning_rate": 7.09904105081937e-05, + "loss": 1.778, + "step": 12422 + }, + { + "epoch": 3.813075506445672, + "grad_norm": 0.23897981643676758, + "learning_rate": 7.09858990605327e-05, + "loss": 1.7289, + "step": 12423 + }, + { + "epoch": 3.8133824432166974, + "grad_norm": 0.30046290159225464, + "learning_rate": 7.098138740547673e-05, + "loss": 1.8838, + "step": 12424 + }, + { + "epoch": 3.8136893799877223, + "grad_norm": 0.32126328349113464, + "learning_rate": 7.097687554307041e-05, + "loss": 1.7916, + "step": 12425 + }, + { + "epoch": 3.8139963167587476, + "grad_norm": 0.2922256886959076, + "learning_rate": 7.097236347335829e-05, + "loss": 1.8305, + "step": 12426 + }, + { + "epoch": 3.814303253529773, + "grad_norm": 0.2772706151008606, + "learning_rate": 7.0967851196385e-05, + "loss": 1.7694, + "step": 12427 + }, + { + "epoch": 3.8146101903007983, + "grad_norm": 0.25763455033302307, + "learning_rate": 
7.096333871219511e-05, + "loss": 1.8716, + "step": 12428 + }, + { + "epoch": 3.814917127071823, + "grad_norm": 0.2631739377975464, + "learning_rate": 7.095882602083322e-05, + "loss": 1.7771, + "step": 12429 + }, + { + "epoch": 3.8152240638428485, + "grad_norm": 0.29229632019996643, + "learning_rate": 7.095431312234392e-05, + "loss": 1.7865, + "step": 12430 + }, + { + "epoch": 3.8155310006138734, + "grad_norm": 0.2672729790210724, + "learning_rate": 7.094980001677181e-05, + "loss": 1.7848, + "step": 12431 + }, + { + "epoch": 3.8158379373848987, + "grad_norm": 0.2388373166322708, + "learning_rate": 7.094528670416152e-05, + "loss": 1.75, + "step": 12432 + }, + { + "epoch": 3.816144874155924, + "grad_norm": 0.2385305017232895, + "learning_rate": 7.094077318455762e-05, + "loss": 1.748, + "step": 12433 + }, + { + "epoch": 3.816451810926949, + "grad_norm": 0.25421401858329773, + "learning_rate": 7.093625945800471e-05, + "loss": 1.779, + "step": 12434 + }, + { + "epoch": 3.8167587476979743, + "grad_norm": 0.2785158157348633, + "learning_rate": 7.093174552454743e-05, + "loss": 1.8295, + "step": 12435 + }, + { + "epoch": 3.817065684468999, + "grad_norm": 0.2907472252845764, + "learning_rate": 7.092723138423036e-05, + "loss": 1.8216, + "step": 12436 + }, + { + "epoch": 3.8173726212400245, + "grad_norm": 0.253955215215683, + "learning_rate": 7.092271703709814e-05, + "loss": 1.8394, + "step": 12437 + }, + { + "epoch": 3.81767955801105, + "grad_norm": 0.32139912247657776, + "learning_rate": 7.091820248319537e-05, + "loss": 1.8634, + "step": 12438 + }, + { + "epoch": 3.8179864947820747, + "grad_norm": 0.25890466570854187, + "learning_rate": 7.091368772256664e-05, + "loss": 1.7336, + "step": 12439 + }, + { + "epoch": 3.8182934315531, + "grad_norm": 0.2823775112628937, + "learning_rate": 7.090917275525661e-05, + "loss": 1.7927, + "step": 12440 + }, + { + "epoch": 3.818600368324125, + "grad_norm": 0.28739333152770996, + "learning_rate": 7.090465758130988e-05, + "loss": 1.7807, + 
"step": 12441 + }, + { + "epoch": 3.8189073050951503, + "grad_norm": 0.36823949217796326, + "learning_rate": 7.090014220077106e-05, + "loss": 1.7288, + "step": 12442 + }, + { + "epoch": 3.8192142418661756, + "grad_norm": 0.3061312735080719, + "learning_rate": 7.089562661368479e-05, + "loss": 1.8039, + "step": 12443 + }, + { + "epoch": 3.819521178637201, + "grad_norm": 0.25867924094200134, + "learning_rate": 7.089111082009569e-05, + "loss": 1.7678, + "step": 12444 + }, + { + "epoch": 3.819828115408226, + "grad_norm": 0.26834985613822937, + "learning_rate": 7.088659482004837e-05, + "loss": 1.7592, + "step": 12445 + }, + { + "epoch": 3.820135052179251, + "grad_norm": 0.25608211755752563, + "learning_rate": 7.08820786135875e-05, + "loss": 1.7622, + "step": 12446 + }, + { + "epoch": 3.820441988950276, + "grad_norm": 0.2512456774711609, + "learning_rate": 7.087756220075769e-05, + "loss": 1.7648, + "step": 12447 + }, + { + "epoch": 3.8207489257213014, + "grad_norm": 0.2434878647327423, + "learning_rate": 7.087304558160355e-05, + "loss": 1.7435, + "step": 12448 + }, + { + "epoch": 3.8210558624923268, + "grad_norm": 0.26456570625305176, + "learning_rate": 7.086852875616978e-05, + "loss": 1.7342, + "step": 12449 + }, + { + "epoch": 3.8213627992633517, + "grad_norm": 0.2958984971046448, + "learning_rate": 7.086401172450095e-05, + "loss": 1.8532, + "step": 12450 + }, + { + "epoch": 3.821669736034377, + "grad_norm": 0.25939157605171204, + "learning_rate": 7.085949448664172e-05, + "loss": 1.7746, + "step": 12451 + }, + { + "epoch": 3.821976672805402, + "grad_norm": 0.2210223525762558, + "learning_rate": 7.085497704263675e-05, + "loss": 1.7745, + "step": 12452 + }, + { + "epoch": 3.822283609576427, + "grad_norm": 0.2409319430589676, + "learning_rate": 7.085045939253068e-05, + "loss": 1.7981, + "step": 12453 + }, + { + "epoch": 3.8225905463474525, + "grad_norm": 0.26331812143325806, + "learning_rate": 7.084594153636815e-05, + "loss": 1.8163, + "step": 12454 + }, + { + "epoch": 
3.8228974831184774, + "grad_norm": 0.2613828480243683, + "learning_rate": 7.08414234741938e-05, + "loss": 1.8362, + "step": 12455 + }, + { + "epoch": 3.8232044198895028, + "grad_norm": 0.3139529228210449, + "learning_rate": 7.083690520605228e-05, + "loss": 1.8247, + "step": 12456 + }, + { + "epoch": 3.8235113566605277, + "grad_norm": 0.2958570718765259, + "learning_rate": 7.083238673198826e-05, + "loss": 1.8011, + "step": 12457 + }, + { + "epoch": 3.823818293431553, + "grad_norm": 0.2517626881599426, + "learning_rate": 7.082786805204639e-05, + "loss": 1.7353, + "step": 12458 + }, + { + "epoch": 3.8241252302025783, + "grad_norm": 0.2443888783454895, + "learning_rate": 7.082334916627132e-05, + "loss": 1.7916, + "step": 12459 + }, + { + "epoch": 3.8244321669736037, + "grad_norm": 0.283514142036438, + "learning_rate": 7.08188300747077e-05, + "loss": 1.8048, + "step": 12460 + }, + { + "epoch": 3.8247391037446286, + "grad_norm": 0.24775351583957672, + "learning_rate": 7.08143107774002e-05, + "loss": 1.8145, + "step": 12461 + }, + { + "epoch": 3.825046040515654, + "grad_norm": 0.27904003858566284, + "learning_rate": 7.080979127439347e-05, + "loss": 1.8003, + "step": 12462 + }, + { + "epoch": 3.825352977286679, + "grad_norm": 0.24997512996196747, + "learning_rate": 7.08052715657322e-05, + "loss": 1.7962, + "step": 12463 + }, + { + "epoch": 3.825659914057704, + "grad_norm": 0.25874343514442444, + "learning_rate": 7.080075165146104e-05, + "loss": 1.7861, + "step": 12464 + }, + { + "epoch": 3.8259668508287294, + "grad_norm": 0.2964434027671814, + "learning_rate": 7.079623153162467e-05, + "loss": 1.7618, + "step": 12465 + }, + { + "epoch": 3.8262737875997543, + "grad_norm": 0.26403337717056274, + "learning_rate": 7.079171120626774e-05, + "loss": 1.8016, + "step": 12466 + }, + { + "epoch": 3.8265807243707797, + "grad_norm": 0.28369295597076416, + "learning_rate": 7.078719067543494e-05, + "loss": 1.7517, + "step": 12467 + }, + { + "epoch": 3.8268876611418046, + "grad_norm": 
0.254312127828598, + "learning_rate": 7.078266993917093e-05, + "loss": 1.8085, + "step": 12468 + }, + { + "epoch": 3.82719459791283, + "grad_norm": 0.24992622435092926, + "learning_rate": 7.077814899752038e-05, + "loss": 1.7657, + "step": 12469 + }, + { + "epoch": 3.8275015346838552, + "grad_norm": 0.26485762000083923, + "learning_rate": 7.077362785052802e-05, + "loss": 1.7303, + "step": 12470 + }, + { + "epoch": 3.8278084714548806, + "grad_norm": 0.29864901304244995, + "learning_rate": 7.076910649823846e-05, + "loss": 1.7734, + "step": 12471 + }, + { + "epoch": 3.8281154082259055, + "grad_norm": 0.2973599433898926, + "learning_rate": 7.076458494069644e-05, + "loss": 1.8055, + "step": 12472 + }, + { + "epoch": 3.828422344996931, + "grad_norm": 0.2150362730026245, + "learning_rate": 7.07600631779466e-05, + "loss": 1.7377, + "step": 12473 + }, + { + "epoch": 3.8287292817679557, + "grad_norm": 0.26443010568618774, + "learning_rate": 7.075554121003367e-05, + "loss": 1.837, + "step": 12474 + }, + { + "epoch": 3.829036218538981, + "grad_norm": 0.27365007996559143, + "learning_rate": 7.075101903700231e-05, + "loss": 1.7784, + "step": 12475 + }, + { + "epoch": 3.8293431553100064, + "grad_norm": 0.22037263214588165, + "learning_rate": 7.074649665889721e-05, + "loss": 1.8182, + "step": 12476 + }, + { + "epoch": 3.8296500920810312, + "grad_norm": 0.29614946246147156, + "learning_rate": 7.074197407576308e-05, + "loss": 1.7993, + "step": 12477 + }, + { + "epoch": 3.8299570288520566, + "grad_norm": 0.25135520100593567, + "learning_rate": 7.07374512876446e-05, + "loss": 1.8211, + "step": 12478 + }, + { + "epoch": 3.8302639656230815, + "grad_norm": 0.2711503207683563, + "learning_rate": 7.073292829458645e-05, + "loss": 1.8274, + "step": 12479 + }, + { + "epoch": 3.830570902394107, + "grad_norm": 0.38659265637397766, + "learning_rate": 7.072840509663338e-05, + "loss": 1.796, + "step": 12480 + }, + { + "epoch": 3.830877839165132, + "grad_norm": 0.39382728934288025, + 
"learning_rate": 7.072388169383005e-05, + "loss": 1.8439, + "step": 12481 + }, + { + "epoch": 3.831184775936157, + "grad_norm": 0.27570033073425293, + "learning_rate": 7.071935808622118e-05, + "loss": 1.8155, + "step": 12482 + }, + { + "epoch": 3.8314917127071824, + "grad_norm": 0.29054465889930725, + "learning_rate": 7.071483427385147e-05, + "loss": 1.754, + "step": 12483 + }, + { + "epoch": 3.8317986494782073, + "grad_norm": 0.4138031303882599, + "learning_rate": 7.071031025676562e-05, + "loss": 1.7686, + "step": 12484 + }, + { + "epoch": 3.8321055862492326, + "grad_norm": 0.3447251617908478, + "learning_rate": 7.070578603500833e-05, + "loss": 1.8135, + "step": 12485 + }, + { + "epoch": 3.832412523020258, + "grad_norm": 0.265115886926651, + "learning_rate": 7.070126160862436e-05, + "loss": 1.803, + "step": 12486 + }, + { + "epoch": 3.8327194597912833, + "grad_norm": 0.4288817346096039, + "learning_rate": 7.069673697765837e-05, + "loss": 1.7814, + "step": 12487 + }, + { + "epoch": 3.833026396562308, + "grad_norm": 0.4890103340148926, + "learning_rate": 7.06922121421551e-05, + "loss": 1.8318, + "step": 12488 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.3676142990589142, + "learning_rate": 7.068768710215928e-05, + "loss": 1.7792, + "step": 12489 + }, + { + "epoch": 3.8336402701043584, + "grad_norm": 0.23254090547561646, + "learning_rate": 7.068316185771557e-05, + "loss": 1.7154, + "step": 12490 + }, + { + "epoch": 3.8339472068753837, + "grad_norm": 0.35014036297798157, + "learning_rate": 7.067863640886876e-05, + "loss": 1.7031, + "step": 12491 + }, + { + "epoch": 3.834254143646409, + "grad_norm": 0.32155317068099976, + "learning_rate": 7.067411075566353e-05, + "loss": 1.7692, + "step": 12492 + }, + { + "epoch": 3.834561080417434, + "grad_norm": 0.260772705078125, + "learning_rate": 7.066958489814463e-05, + "loss": 1.7488, + "step": 12493 + }, + { + "epoch": 3.8348680171884593, + "grad_norm": 0.2624910771846771, + "learning_rate": 7.066505883635678e-05, + 
"loss": 1.7436, + "step": 12494 + }, + { + "epoch": 3.835174953959484, + "grad_norm": 0.2782299220561981, + "learning_rate": 7.066053257034471e-05, + "loss": 1.8219, + "step": 12495 + }, + { + "epoch": 3.8354818907305095, + "grad_norm": 0.2749497890472412, + "learning_rate": 7.065600610015312e-05, + "loss": 1.8068, + "step": 12496 + }, + { + "epoch": 3.835788827501535, + "grad_norm": 0.2730359733104706, + "learning_rate": 7.06514794258268e-05, + "loss": 1.7588, + "step": 12497 + }, + { + "epoch": 3.8360957642725597, + "grad_norm": 0.3606291711330414, + "learning_rate": 7.064695254741044e-05, + "loss": 1.8509, + "step": 12498 + }, + { + "epoch": 3.836402701043585, + "grad_norm": 0.23282989859580994, + "learning_rate": 7.064242546494879e-05, + "loss": 1.7444, + "step": 12499 + }, + { + "epoch": 3.83670963781461, + "grad_norm": 0.2554507255554199, + "learning_rate": 7.06378981784866e-05, + "loss": 1.7486, + "step": 12500 + }, + { + "epoch": 3.8370165745856353, + "grad_norm": 0.2916143834590912, + "learning_rate": 7.06333706880686e-05, + "loss": 1.8035, + "step": 12501 + }, + { + "epoch": 3.8373235113566606, + "grad_norm": 0.23719090223312378, + "learning_rate": 7.062884299373955e-05, + "loss": 1.7896, + "step": 12502 + }, + { + "epoch": 3.837630448127686, + "grad_norm": 0.2596152126789093, + "learning_rate": 7.062431509554417e-05, + "loss": 1.7944, + "step": 12503 + }, + { + "epoch": 3.837937384898711, + "grad_norm": 0.29140764474868774, + "learning_rate": 7.061978699352723e-05, + "loss": 1.7988, + "step": 12504 + }, + { + "epoch": 3.838244321669736, + "grad_norm": 0.3421068489551544, + "learning_rate": 7.061525868773347e-05, + "loss": 1.751, + "step": 12505 + }, + { + "epoch": 3.838551258440761, + "grad_norm": 0.2705349624156952, + "learning_rate": 7.061073017820764e-05, + "loss": 1.7578, + "step": 12506 + }, + { + "epoch": 3.8388581952117864, + "grad_norm": 0.2403286248445511, + "learning_rate": 7.060620146499448e-05, + "loss": 1.8422, + "step": 12507 + }, + { + 
"epoch": 3.8391651319828117, + "grad_norm": 0.3860442042350769, + "learning_rate": 7.060167254813876e-05, + "loss": 1.8168, + "step": 12508 + }, + { + "epoch": 3.8394720687538366, + "grad_norm": 0.4729512631893158, + "learning_rate": 7.059714342768526e-05, + "loss": 1.7786, + "step": 12509 + }, + { + "epoch": 3.839779005524862, + "grad_norm": 0.3522968888282776, + "learning_rate": 7.059261410367871e-05, + "loss": 1.8749, + "step": 12510 + }, + { + "epoch": 3.840085942295887, + "grad_norm": 0.28071436285972595, + "learning_rate": 7.058808457616386e-05, + "loss": 1.7959, + "step": 12511 + }, + { + "epoch": 3.840392879066912, + "grad_norm": 0.4356439411640167, + "learning_rate": 7.05835548451855e-05, + "loss": 1.8045, + "step": 12512 + }, + { + "epoch": 3.8406998158379375, + "grad_norm": 0.4051562249660492, + "learning_rate": 7.057902491078839e-05, + "loss": 1.7909, + "step": 12513 + }, + { + "epoch": 3.8410067526089624, + "grad_norm": 0.2817205488681793, + "learning_rate": 7.057449477301728e-05, + "loss": 1.8736, + "step": 12514 + }, + { + "epoch": 3.8413136893799877, + "grad_norm": 0.33369559049606323, + "learning_rate": 7.056996443191697e-05, + "loss": 1.7799, + "step": 12515 + }, + { + "epoch": 3.8416206261510126, + "grad_norm": 0.369954913854599, + "learning_rate": 7.056543388753221e-05, + "loss": 1.795, + "step": 12516 + }, + { + "epoch": 3.841927562922038, + "grad_norm": 0.289474755525589, + "learning_rate": 7.056090313990778e-05, + "loss": 1.786, + "step": 12517 + }, + { + "epoch": 3.8422344996930633, + "grad_norm": 0.2431849092245102, + "learning_rate": 7.055637218908845e-05, + "loss": 1.7363, + "step": 12518 + }, + { + "epoch": 3.8425414364640886, + "grad_norm": 0.3736060857772827, + "learning_rate": 7.0551841035119e-05, + "loss": 1.8234, + "step": 12519 + }, + { + "epoch": 3.8428483732351135, + "grad_norm": 0.34008854627609253, + "learning_rate": 7.054730967804422e-05, + "loss": 1.8001, + "step": 12520 + }, + { + "epoch": 3.843155310006139, + "grad_norm": 
0.24852876365184784, + "learning_rate": 7.054277811790887e-05, + "loss": 1.8298, + "step": 12521 + }, + { + "epoch": 3.8434622467771637, + "grad_norm": 0.3491046726703644, + "learning_rate": 7.053824635475777e-05, + "loss": 1.7336, + "step": 12522 + }, + { + "epoch": 3.843769183548189, + "grad_norm": 0.38757824897766113, + "learning_rate": 7.053371438863566e-05, + "loss": 1.8241, + "step": 12523 + }, + { + "epoch": 3.8440761203192144, + "grad_norm": 0.2607647180557251, + "learning_rate": 7.052918221958735e-05, + "loss": 1.7813, + "step": 12524 + }, + { + "epoch": 3.8443830570902393, + "grad_norm": 0.25634410977363586, + "learning_rate": 7.052464984765764e-05, + "loss": 1.7836, + "step": 12525 + }, + { + "epoch": 3.8446899938612646, + "grad_norm": 0.3113503158092499, + "learning_rate": 7.052011727289129e-05, + "loss": 1.8477, + "step": 12526 + }, + { + "epoch": 3.8449969306322895, + "grad_norm": 0.2852596044540405, + "learning_rate": 7.051558449533313e-05, + "loss": 1.7607, + "step": 12527 + }, + { + "epoch": 3.845303867403315, + "grad_norm": 0.24841541051864624, + "learning_rate": 7.051105151502795e-05, + "loss": 1.8109, + "step": 12528 + }, + { + "epoch": 3.84561080417434, + "grad_norm": 0.2231549620628357, + "learning_rate": 7.050651833202053e-05, + "loss": 1.7245, + "step": 12529 + }, + { + "epoch": 3.845917740945365, + "grad_norm": 0.21975892782211304, + "learning_rate": 7.050198494635566e-05, + "loss": 1.7512, + "step": 12530 + }, + { + "epoch": 3.8462246777163904, + "grad_norm": 0.2546280324459076, + "learning_rate": 7.049745135807816e-05, + "loss": 1.8003, + "step": 12531 + }, + { + "epoch": 3.8465316144874153, + "grad_norm": 0.21507929265499115, + "learning_rate": 7.049291756723284e-05, + "loss": 1.7616, + "step": 12532 + }, + { + "epoch": 3.8468385512584407, + "grad_norm": 0.24927987158298492, + "learning_rate": 7.04883835738645e-05, + "loss": 1.7519, + "step": 12533 + }, + { + "epoch": 3.847145488029466, + "grad_norm": 0.24988602101802826, + 
"learning_rate": 7.048384937801793e-05, + "loss": 1.7966, + "step": 12534 + }, + { + "epoch": 3.8474524248004913, + "grad_norm": 0.24039845168590546, + "learning_rate": 7.047931497973798e-05, + "loss": 1.7834, + "step": 12535 + }, + { + "epoch": 3.847759361571516, + "grad_norm": 0.22826696932315826, + "learning_rate": 7.047478037906943e-05, + "loss": 1.7334, + "step": 12536 + }, + { + "epoch": 3.8480662983425415, + "grad_norm": 0.22260744869709015, + "learning_rate": 7.047024557605708e-05, + "loss": 1.787, + "step": 12537 + }, + { + "epoch": 3.8483732351135664, + "grad_norm": 0.2457917332649231, + "learning_rate": 7.046571057074578e-05, + "loss": 1.7865, + "step": 12538 + }, + { + "epoch": 3.8486801718845918, + "grad_norm": 0.23952928185462952, + "learning_rate": 7.046117536318035e-05, + "loss": 1.7764, + "step": 12539 + }, + { + "epoch": 3.848987108655617, + "grad_norm": 0.22186748683452606, + "learning_rate": 7.045663995340557e-05, + "loss": 1.7917, + "step": 12540 + }, + { + "epoch": 3.849294045426642, + "grad_norm": 0.24234962463378906, + "learning_rate": 7.045210434146629e-05, + "loss": 1.7697, + "step": 12541 + }, + { + "epoch": 3.8496009821976673, + "grad_norm": 0.2510770857334137, + "learning_rate": 7.044756852740732e-05, + "loss": 1.8012, + "step": 12542 + }, + { + "epoch": 3.849907918968692, + "grad_norm": 0.24910703301429749, + "learning_rate": 7.044303251127349e-05, + "loss": 1.831, + "step": 12543 + }, + { + "epoch": 3.8502148557397176, + "grad_norm": 0.3159966468811035, + "learning_rate": 7.043849629310964e-05, + "loss": 1.8029, + "step": 12544 + }, + { + "epoch": 3.850521792510743, + "grad_norm": 0.3155403733253479, + "learning_rate": 7.04339598729606e-05, + "loss": 1.7429, + "step": 12545 + }, + { + "epoch": 3.8508287292817682, + "grad_norm": 0.3037515878677368, + "learning_rate": 7.042942325087117e-05, + "loss": 1.8186, + "step": 12546 + }, + { + "epoch": 3.851135666052793, + "grad_norm": 0.2319766730070114, + "learning_rate": 
7.042488642688621e-05, + "loss": 1.7853, + "step": 12547 + }, + { + "epoch": 3.8514426028238185, + "grad_norm": 0.23911969363689423, + "learning_rate": 7.042034940105055e-05, + "loss": 1.8314, + "step": 12548 + }, + { + "epoch": 3.8517495395948433, + "grad_norm": 0.2541846036911011, + "learning_rate": 7.041581217340905e-05, + "loss": 1.8289, + "step": 12549 + }, + { + "epoch": 3.8520564763658687, + "grad_norm": 0.22234943509101868, + "learning_rate": 7.04112747440065e-05, + "loss": 1.7847, + "step": 12550 + }, + { + "epoch": 3.852363413136894, + "grad_norm": 0.2747870981693268, + "learning_rate": 7.04067371128878e-05, + "loss": 1.7875, + "step": 12551 + }, + { + "epoch": 3.852670349907919, + "grad_norm": 0.28589147329330444, + "learning_rate": 7.040219928009775e-05, + "loss": 1.7289, + "step": 12552 + }, + { + "epoch": 3.8529772866789442, + "grad_norm": 0.21180351078510284, + "learning_rate": 7.039766124568119e-05, + "loss": 1.7611, + "step": 12553 + }, + { + "epoch": 3.853284223449969, + "grad_norm": 0.27751782536506653, + "learning_rate": 7.0393123009683e-05, + "loss": 1.7481, + "step": 12554 + }, + { + "epoch": 3.8535911602209945, + "grad_norm": 0.32883307337760925, + "learning_rate": 7.038858457214802e-05, + "loss": 1.7271, + "step": 12555 + }, + { + "epoch": 3.85389809699202, + "grad_norm": 0.30965641140937805, + "learning_rate": 7.03840459331211e-05, + "loss": 1.81, + "step": 12556 + }, + { + "epoch": 3.8542050337630447, + "grad_norm": 0.25184348225593567, + "learning_rate": 7.037950709264709e-05, + "loss": 1.7642, + "step": 12557 + }, + { + "epoch": 3.85451197053407, + "grad_norm": 0.2376822829246521, + "learning_rate": 7.037496805077084e-05, + "loss": 1.7774, + "step": 12558 + }, + { + "epoch": 3.854818907305095, + "grad_norm": 0.2395993024110794, + "learning_rate": 7.03704288075372e-05, + "loss": 1.8397, + "step": 12559 + }, + { + "epoch": 3.8551258440761202, + "grad_norm": 0.26460394263267517, + "learning_rate": 7.036588936299107e-05, + "loss": 1.7472, + 
"step": 12560 + }, + { + "epoch": 3.8554327808471456, + "grad_norm": 0.34742459654808044, + "learning_rate": 7.036134971717725e-05, + "loss": 1.8003, + "step": 12561 + }, + { + "epoch": 3.855739717618171, + "grad_norm": 0.2829316556453705, + "learning_rate": 7.035680987014068e-05, + "loss": 1.7765, + "step": 12562 + }, + { + "epoch": 3.856046654389196, + "grad_norm": 0.3087223172187805, + "learning_rate": 7.035226982192615e-05, + "loss": 1.8462, + "step": 12563 + }, + { + "epoch": 3.856353591160221, + "grad_norm": 0.2806380093097687, + "learning_rate": 7.034772957257858e-05, + "loss": 1.7704, + "step": 12564 + }, + { + "epoch": 3.856660527931246, + "grad_norm": 0.25598087906837463, + "learning_rate": 7.03431891221428e-05, + "loss": 1.7843, + "step": 12565 + }, + { + "epoch": 3.8569674647022714, + "grad_norm": 0.30833700299263, + "learning_rate": 7.033864847066373e-05, + "loss": 1.8404, + "step": 12566 + }, + { + "epoch": 3.8572744014732967, + "grad_norm": 0.29562532901763916, + "learning_rate": 7.03341076181862e-05, + "loss": 1.8044, + "step": 12567 + }, + { + "epoch": 3.8575813382443216, + "grad_norm": 0.2901719808578491, + "learning_rate": 7.03295665647551e-05, + "loss": 1.7789, + "step": 12568 + }, + { + "epoch": 3.857888275015347, + "grad_norm": 0.25453686714172363, + "learning_rate": 7.03250253104153e-05, + "loss": 1.6792, + "step": 12569 + }, + { + "epoch": 3.858195211786372, + "grad_norm": 0.26009416580200195, + "learning_rate": 7.03204838552117e-05, + "loss": 1.7835, + "step": 12570 + }, + { + "epoch": 3.858502148557397, + "grad_norm": 0.28074127435684204, + "learning_rate": 7.031594219918916e-05, + "loss": 1.7932, + "step": 12571 + }, + { + "epoch": 3.8588090853284225, + "grad_norm": 0.3341725170612335, + "learning_rate": 7.031140034239258e-05, + "loss": 1.7439, + "step": 12572 + }, + { + "epoch": 3.8591160220994474, + "grad_norm": 0.28142449259757996, + "learning_rate": 7.030685828486684e-05, + "loss": 1.8263, + "step": 12573 + }, + { + "epoch": 
3.8594229588704727, + "grad_norm": 0.2571438252925873, + "learning_rate": 7.030231602665681e-05, + "loss": 1.7628, + "step": 12574 + }, + { + "epoch": 3.8597298956414976, + "grad_norm": 0.3079041838645935, + "learning_rate": 7.029777356780741e-05, + "loss": 1.7879, + "step": 12575 + }, + { + "epoch": 3.860036832412523, + "grad_norm": 0.2605433464050293, + "learning_rate": 7.029323090836349e-05, + "loss": 1.7841, + "step": 12576 + }, + { + "epoch": 3.8603437691835483, + "grad_norm": 0.24069640040397644, + "learning_rate": 7.028868804836999e-05, + "loss": 1.7939, + "step": 12577 + }, + { + "epoch": 3.8606507059545736, + "grad_norm": 0.26801639795303345, + "learning_rate": 7.028414498787177e-05, + "loss": 1.8082, + "step": 12578 + }, + { + "epoch": 3.8609576427255985, + "grad_norm": 0.28828585147857666, + "learning_rate": 7.027960172691375e-05, + "loss": 1.8094, + "step": 12579 + }, + { + "epoch": 3.861264579496624, + "grad_norm": 0.22927051782608032, + "learning_rate": 7.027505826554082e-05, + "loss": 1.7758, + "step": 12580 + }, + { + "epoch": 3.8615715162676487, + "grad_norm": 0.25755998492240906, + "learning_rate": 7.027051460379788e-05, + "loss": 1.8429, + "step": 12581 + }, + { + "epoch": 3.861878453038674, + "grad_norm": 0.23636581003665924, + "learning_rate": 7.026597074172982e-05, + "loss": 1.7662, + "step": 12582 + }, + { + "epoch": 3.8621853898096994, + "grad_norm": 0.22599349915981293, + "learning_rate": 7.026142667938156e-05, + "loss": 1.7199, + "step": 12583 + }, + { + "epoch": 3.8624923265807243, + "grad_norm": 0.2504875659942627, + "learning_rate": 7.025688241679802e-05, + "loss": 1.8473, + "step": 12584 + }, + { + "epoch": 3.8627992633517496, + "grad_norm": 0.3012976348400116, + "learning_rate": 7.025233795402408e-05, + "loss": 1.8715, + "step": 12585 + }, + { + "epoch": 3.8631062001227745, + "grad_norm": 0.31703677773475647, + "learning_rate": 7.024779329110469e-05, + "loss": 1.8143, + "step": 12586 + }, + { + "epoch": 3.8634131368938, + "grad_norm": 
0.27287593483924866, + "learning_rate": 7.024324842808472e-05, + "loss": 1.7227, + "step": 12587 + }, + { + "epoch": 3.863720073664825, + "grad_norm": 0.24663801491260529, + "learning_rate": 7.02387033650091e-05, + "loss": 1.7529, + "step": 12588 + }, + { + "epoch": 3.86402701043585, + "grad_norm": 0.26127147674560547, + "learning_rate": 7.023415810192277e-05, + "loss": 1.7629, + "step": 12589 + }, + { + "epoch": 3.8643339472068754, + "grad_norm": 0.3457142114639282, + "learning_rate": 7.022961263887062e-05, + "loss": 1.8212, + "step": 12590 + }, + { + "epoch": 3.8646408839779003, + "grad_norm": 0.3296070694923401, + "learning_rate": 7.022506697589759e-05, + "loss": 1.7907, + "step": 12591 + }, + { + "epoch": 3.8649478207489256, + "grad_norm": 0.29474303126335144, + "learning_rate": 7.022052111304858e-05, + "loss": 1.7866, + "step": 12592 + }, + { + "epoch": 3.865254757519951, + "grad_norm": 0.2535403072834015, + "learning_rate": 7.021597505036852e-05, + "loss": 1.7607, + "step": 12593 + }, + { + "epoch": 3.8655616942909763, + "grad_norm": 0.26691222190856934, + "learning_rate": 7.021142878790237e-05, + "loss": 1.8063, + "step": 12594 + }, + { + "epoch": 3.865868631062001, + "grad_norm": 0.2784755229949951, + "learning_rate": 7.020688232569502e-05, + "loss": 1.8065, + "step": 12595 + }, + { + "epoch": 3.8661755678330265, + "grad_norm": 0.23714317381381989, + "learning_rate": 7.020233566379142e-05, + "loss": 1.8317, + "step": 12596 + }, + { + "epoch": 3.8664825046040514, + "grad_norm": 0.25010553002357483, + "learning_rate": 7.019778880223649e-05, + "loss": 1.8493, + "step": 12597 + }, + { + "epoch": 3.8667894413750767, + "grad_norm": 0.2798489034175873, + "learning_rate": 7.01932417410752e-05, + "loss": 1.8134, + "step": 12598 + }, + { + "epoch": 3.867096378146102, + "grad_norm": 0.26199260354042053, + "learning_rate": 7.018869448035243e-05, + "loss": 1.6931, + "step": 12599 + }, + { + "epoch": 3.867403314917127, + "grad_norm": 0.24582891166210175, + 
"learning_rate": 7.018414702011314e-05, + "loss": 1.8076, + "step": 12600 + }, + { + "epoch": 3.8677102516881523, + "grad_norm": 0.25493237376213074, + "learning_rate": 7.01795993604023e-05, + "loss": 1.7851, + "step": 12601 + }, + { + "epoch": 3.868017188459177, + "grad_norm": 0.2607674300670624, + "learning_rate": 7.017505150126483e-05, + "loss": 1.7285, + "step": 12602 + }, + { + "epoch": 3.8683241252302025, + "grad_norm": 0.23629581928253174, + "learning_rate": 7.017050344274568e-05, + "loss": 1.8254, + "step": 12603 + }, + { + "epoch": 3.868631062001228, + "grad_norm": 0.3129318058490753, + "learning_rate": 7.016595518488979e-05, + "loss": 1.7914, + "step": 12604 + }, + { + "epoch": 3.8689379987722528, + "grad_norm": 0.3178271949291229, + "learning_rate": 7.01614067277421e-05, + "loss": 1.8139, + "step": 12605 + }, + { + "epoch": 3.869244935543278, + "grad_norm": 0.3230711817741394, + "learning_rate": 7.015685807134757e-05, + "loss": 1.8203, + "step": 12606 + }, + { + "epoch": 3.869551872314303, + "grad_norm": 0.26339825987815857, + "learning_rate": 7.015230921575118e-05, + "loss": 1.8022, + "step": 12607 + }, + { + "epoch": 3.8698588090853283, + "grad_norm": 0.25337356328964233, + "learning_rate": 7.014776016099785e-05, + "loss": 1.7779, + "step": 12608 + }, + { + "epoch": 3.8701657458563536, + "grad_norm": 0.2506195306777954, + "learning_rate": 7.014321090713253e-05, + "loss": 1.7858, + "step": 12609 + }, + { + "epoch": 3.870472682627379, + "grad_norm": 0.26249951124191284, + "learning_rate": 7.013866145420021e-05, + "loss": 1.8051, + "step": 12610 + }, + { + "epoch": 3.870779619398404, + "grad_norm": 0.25666534900665283, + "learning_rate": 7.013411180224581e-05, + "loss": 1.7945, + "step": 12611 + }, + { + "epoch": 3.871086556169429, + "grad_norm": 0.23901648819446564, + "learning_rate": 7.012956195131433e-05, + "loss": 1.7844, + "step": 12612 + }, + { + "epoch": 3.871393492940454, + "grad_norm": 0.26814451813697815, + "learning_rate": 
7.012501190145071e-05, + "loss": 1.7713, + "step": 12613 + }, + { + "epoch": 3.8717004297114794, + "grad_norm": 0.28377315402030945, + "learning_rate": 7.012046165269995e-05, + "loss": 1.7866, + "step": 12614 + }, + { + "epoch": 3.8720073664825048, + "grad_norm": 0.2751680612564087, + "learning_rate": 7.011591120510699e-05, + "loss": 1.7215, + "step": 12615 + }, + { + "epoch": 3.8723143032535297, + "grad_norm": 0.21988113224506378, + "learning_rate": 7.011136055871679e-05, + "loss": 1.8009, + "step": 12616 + }, + { + "epoch": 3.872621240024555, + "grad_norm": 0.26462143659591675, + "learning_rate": 7.010680971357434e-05, + "loss": 1.7618, + "step": 12617 + }, + { + "epoch": 3.87292817679558, + "grad_norm": 0.29054632782936096, + "learning_rate": 7.010225866972462e-05, + "loss": 1.7549, + "step": 12618 + }, + { + "epoch": 3.873235113566605, + "grad_norm": 0.31341224908828735, + "learning_rate": 7.00977074272126e-05, + "loss": 1.8827, + "step": 12619 + }, + { + "epoch": 3.8735420503376305, + "grad_norm": 0.24252115190029144, + "learning_rate": 7.009315598608324e-05, + "loss": 1.7544, + "step": 12620 + }, + { + "epoch": 3.873848987108656, + "grad_norm": 0.30036893486976624, + "learning_rate": 7.008860434638154e-05, + "loss": 1.7465, + "step": 12621 + }, + { + "epoch": 3.8741559238796808, + "grad_norm": 0.3217438757419586, + "learning_rate": 7.00840525081525e-05, + "loss": 1.72, + "step": 12622 + }, + { + "epoch": 3.874462860650706, + "grad_norm": 0.22507290542125702, + "learning_rate": 7.007950047144105e-05, + "loss": 1.7177, + "step": 12623 + }, + { + "epoch": 3.874769797421731, + "grad_norm": 0.3014441728591919, + "learning_rate": 7.007494823629224e-05, + "loss": 1.7502, + "step": 12624 + }, + { + "epoch": 3.8750767341927563, + "grad_norm": 0.3836904466152191, + "learning_rate": 7.0070395802751e-05, + "loss": 1.7971, + "step": 12625 + }, + { + "epoch": 3.8753836709637817, + "grad_norm": 0.33565691113471985, + "learning_rate": 7.006584317086235e-05, + "loss": 1.7439, 
+ "step": 12626 + }, + { + "epoch": 3.8756906077348066, + "grad_norm": 0.2292134314775467, + "learning_rate": 7.006129034067128e-05, + "loss": 1.7998, + "step": 12627 + }, + { + "epoch": 3.875997544505832, + "grad_norm": 0.26385873556137085, + "learning_rate": 7.005673731222277e-05, + "loss": 1.7914, + "step": 12628 + }, + { + "epoch": 3.876304481276857, + "grad_norm": 0.2854950428009033, + "learning_rate": 7.005218408556184e-05, + "loss": 1.7761, + "step": 12629 + }, + { + "epoch": 3.876611418047882, + "grad_norm": 0.34260645508766174, + "learning_rate": 7.004763066073348e-05, + "loss": 1.8015, + "step": 12630 + }, + { + "epoch": 3.8769183548189075, + "grad_norm": 0.3223683834075928, + "learning_rate": 7.004307703778267e-05, + "loss": 1.7453, + "step": 12631 + }, + { + "epoch": 3.8772252915899323, + "grad_norm": 0.24715089797973633, + "learning_rate": 7.003852321675442e-05, + "loss": 1.7813, + "step": 12632 + }, + { + "epoch": 3.8775322283609577, + "grad_norm": 0.22822390496730804, + "learning_rate": 7.003396919769377e-05, + "loss": 1.7982, + "step": 12633 + }, + { + "epoch": 3.8778391651319826, + "grad_norm": 0.24125081300735474, + "learning_rate": 7.002941498064565e-05, + "loss": 1.8606, + "step": 12634 + }, + { + "epoch": 3.878146101903008, + "grad_norm": 0.23512506484985352, + "learning_rate": 7.002486056565513e-05, + "loss": 1.7469, + "step": 12635 + }, + { + "epoch": 3.8784530386740332, + "grad_norm": 0.2908322215080261, + "learning_rate": 7.00203059527672e-05, + "loss": 1.796, + "step": 12636 + }, + { + "epoch": 3.8787599754450586, + "grad_norm": 0.22931252419948578, + "learning_rate": 7.001575114202689e-05, + "loss": 1.7482, + "step": 12637 + }, + { + "epoch": 3.8790669122160835, + "grad_norm": 0.22574284672737122, + "learning_rate": 7.001119613347917e-05, + "loss": 1.7698, + "step": 12638 + }, + { + "epoch": 3.879373848987109, + "grad_norm": 0.23129726946353912, + "learning_rate": 7.000664092716909e-05, + "loss": 1.776, + "step": 12639 + }, + { + "epoch": 
3.8796807857581337, + "grad_norm": 0.2763366401195526, + "learning_rate": 7.000208552314165e-05, + "loss": 1.7814, + "step": 12640 + }, + { + "epoch": 3.879987722529159, + "grad_norm": 0.29870158433914185, + "learning_rate": 6.99975299214419e-05, + "loss": 1.7467, + "step": 12641 + }, + { + "epoch": 3.8802946593001844, + "grad_norm": 0.33574381470680237, + "learning_rate": 6.999297412211484e-05, + "loss": 1.8159, + "step": 12642 + }, + { + "epoch": 3.8806015960712092, + "grad_norm": 0.30309897661209106, + "learning_rate": 6.998841812520547e-05, + "loss": 1.8454, + "step": 12643 + }, + { + "epoch": 3.8809085328422346, + "grad_norm": 0.27399247884750366, + "learning_rate": 6.998386193075886e-05, + "loss": 1.7956, + "step": 12644 + }, + { + "epoch": 3.8812154696132595, + "grad_norm": 0.28649580478668213, + "learning_rate": 6.997930553881998e-05, + "loss": 1.8308, + "step": 12645 + }, + { + "epoch": 3.881522406384285, + "grad_norm": 0.2716052532196045, + "learning_rate": 6.997474894943392e-05, + "loss": 1.7698, + "step": 12646 + }, + { + "epoch": 3.88182934315531, + "grad_norm": 0.21380536258220673, + "learning_rate": 6.997019216264567e-05, + "loss": 1.7028, + "step": 12647 + }, + { + "epoch": 3.882136279926335, + "grad_norm": 0.25262731313705444, + "learning_rate": 6.996563517850028e-05, + "loss": 1.8236, + "step": 12648 + }, + { + "epoch": 3.8824432166973604, + "grad_norm": 0.21150052547454834, + "learning_rate": 6.996107799704277e-05, + "loss": 1.7437, + "step": 12649 + }, + { + "epoch": 3.8827501534683853, + "grad_norm": 0.2614554464817047, + "learning_rate": 6.995652061831821e-05, + "loss": 1.7575, + "step": 12650 + }, + { + "epoch": 3.8830570902394106, + "grad_norm": 0.214684396982193, + "learning_rate": 6.995196304237159e-05, + "loss": 1.8195, + "step": 12651 + }, + { + "epoch": 3.883364027010436, + "grad_norm": 0.2226872444152832, + "learning_rate": 6.994740526924798e-05, + "loss": 1.7556, + "step": 12652 + }, + { + "epoch": 3.8836709637814613, + "grad_norm": 
0.22270764410495758, + "learning_rate": 6.994284729899246e-05, + "loss": 1.7536, + "step": 12653 + }, + { + "epoch": 3.883977900552486, + "grad_norm": 0.20683564245700836, + "learning_rate": 6.993828913165e-05, + "loss": 1.7728, + "step": 12654 + }, + { + "epoch": 3.8842848373235115, + "grad_norm": 0.23667018115520477, + "learning_rate": 6.993373076726568e-05, + "loss": 1.7819, + "step": 12655 + }, + { + "epoch": 3.8845917740945364, + "grad_norm": 0.2265234887599945, + "learning_rate": 6.992917220588455e-05, + "loss": 1.7502, + "step": 12656 + }, + { + "epoch": 3.8848987108655617, + "grad_norm": 0.24490754306316376, + "learning_rate": 6.992461344755168e-05, + "loss": 1.7513, + "step": 12657 + }, + { + "epoch": 3.885205647636587, + "grad_norm": 0.23001348972320557, + "learning_rate": 6.992005449231208e-05, + "loss": 1.733, + "step": 12658 + }, + { + "epoch": 3.885512584407612, + "grad_norm": 0.25424695014953613, + "learning_rate": 6.991549534021084e-05, + "loss": 1.7621, + "step": 12659 + }, + { + "epoch": 3.8858195211786373, + "grad_norm": 0.25552862882614136, + "learning_rate": 6.991093599129299e-05, + "loss": 1.7974, + "step": 12660 + }, + { + "epoch": 3.886126457949662, + "grad_norm": 0.26876959204673767, + "learning_rate": 6.99063764456036e-05, + "loss": 1.7924, + "step": 12661 + }, + { + "epoch": 3.8864333947206875, + "grad_norm": 0.2754429578781128, + "learning_rate": 6.990181670318772e-05, + "loss": 1.7981, + "step": 12662 + }, + { + "epoch": 3.886740331491713, + "grad_norm": 0.281818687915802, + "learning_rate": 6.989725676409044e-05, + "loss": 1.7328, + "step": 12663 + }, + { + "epoch": 3.8870472682627377, + "grad_norm": 0.21676552295684814, + "learning_rate": 6.989269662835681e-05, + "loss": 1.7376, + "step": 12664 + }, + { + "epoch": 3.887354205033763, + "grad_norm": 0.276115745306015, + "learning_rate": 6.98881362960319e-05, + "loss": 1.7784, + "step": 12665 + }, + { + "epoch": 3.887661141804788, + "grad_norm": 0.2806364893913269, + "learning_rate": 
6.988357576716075e-05, + "loss": 1.8078, + "step": 12666 + }, + { + "epoch": 3.8879680785758133, + "grad_norm": 0.27620184421539307, + "learning_rate": 6.987901504178845e-05, + "loss": 1.8115, + "step": 12667 + }, + { + "epoch": 3.8882750153468386, + "grad_norm": 0.23845402896404266, + "learning_rate": 6.987445411996009e-05, + "loss": 1.7485, + "step": 12668 + }, + { + "epoch": 3.888581952117864, + "grad_norm": 0.25063586235046387, + "learning_rate": 6.986989300172071e-05, + "loss": 1.7663, + "step": 12669 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2417975515127182, + "learning_rate": 6.98653316871154e-05, + "loss": 1.7562, + "step": 12670 + }, + { + "epoch": 3.889195825659914, + "grad_norm": 0.24952733516693115, + "learning_rate": 6.986077017618923e-05, + "loss": 1.8063, + "step": 12671 + }, + { + "epoch": 3.889502762430939, + "grad_norm": 0.25847554206848145, + "learning_rate": 6.985620846898732e-05, + "loss": 1.7722, + "step": 12672 + }, + { + "epoch": 3.8898096992019644, + "grad_norm": 0.23762650787830353, + "learning_rate": 6.985164656555471e-05, + "loss": 1.8368, + "step": 12673 + }, + { + "epoch": 3.8901166359729897, + "grad_norm": 0.25346314907073975, + "learning_rate": 6.984708446593648e-05, + "loss": 1.7957, + "step": 12674 + }, + { + "epoch": 3.8904235727440146, + "grad_norm": 0.2466745674610138, + "learning_rate": 6.984252217017774e-05, + "loss": 1.8286, + "step": 12675 + }, + { + "epoch": 3.89073050951504, + "grad_norm": 0.25413215160369873, + "learning_rate": 6.983795967832356e-05, + "loss": 1.7711, + "step": 12676 + }, + { + "epoch": 3.891037446286065, + "grad_norm": 0.2315925806760788, + "learning_rate": 6.983339699041903e-05, + "loss": 1.7546, + "step": 12677 + }, + { + "epoch": 3.89134438305709, + "grad_norm": 0.26473405957221985, + "learning_rate": 6.982883410650925e-05, + "loss": 1.7563, + "step": 12678 + }, + { + "epoch": 3.8916513198281155, + "grad_norm": 0.24176491796970367, + "learning_rate": 6.982427102663932e-05, + "loss": 
1.7734, + "step": 12679 + }, + { + "epoch": 3.891958256599141, + "grad_norm": 0.25444844365119934, + "learning_rate": 6.98197077508543e-05, + "loss": 1.803, + "step": 12680 + }, + { + "epoch": 3.8922651933701657, + "grad_norm": 0.25234144926071167, + "learning_rate": 6.981514427919933e-05, + "loss": 1.8099, + "step": 12681 + }, + { + "epoch": 3.892572130141191, + "grad_norm": 0.2571142315864563, + "learning_rate": 6.98105806117195e-05, + "loss": 1.8618, + "step": 12682 + }, + { + "epoch": 3.892879066912216, + "grad_norm": 0.21235275268554688, + "learning_rate": 6.980601674845988e-05, + "loss": 1.7121, + "step": 12683 + }, + { + "epoch": 3.8931860036832413, + "grad_norm": 0.27078527212142944, + "learning_rate": 6.98014526894656e-05, + "loss": 1.8103, + "step": 12684 + }, + { + "epoch": 3.8934929404542666, + "grad_norm": 0.3198096454143524, + "learning_rate": 6.979688843478176e-05, + "loss": 1.7529, + "step": 12685 + }, + { + "epoch": 3.8937998772252915, + "grad_norm": 0.3170493245124817, + "learning_rate": 6.979232398445345e-05, + "loss": 1.7629, + "step": 12686 + }, + { + "epoch": 3.894106813996317, + "grad_norm": 0.2495265007019043, + "learning_rate": 6.978775933852582e-05, + "loss": 1.7407, + "step": 12687 + }, + { + "epoch": 3.8944137507673418, + "grad_norm": 0.24570141732692719, + "learning_rate": 6.978319449704395e-05, + "loss": 1.7688, + "step": 12688 + }, + { + "epoch": 3.894720687538367, + "grad_norm": 0.23956388235092163, + "learning_rate": 6.977862946005295e-05, + "loss": 1.7115, + "step": 12689 + }, + { + "epoch": 3.8950276243093924, + "grad_norm": 0.21548940241336823, + "learning_rate": 6.977406422759793e-05, + "loss": 1.7611, + "step": 12690 + }, + { + "epoch": 3.8953345610804173, + "grad_norm": 0.25797295570373535, + "learning_rate": 6.976949879972403e-05, + "loss": 1.7688, + "step": 12691 + }, + { + "epoch": 3.8956414978514426, + "grad_norm": 0.28257784247398376, + "learning_rate": 6.976493317647636e-05, + "loss": 1.7517, + "step": 12692 + }, + { + 
"epoch": 3.8959484346224675, + "grad_norm": 0.23828580975532532, + "learning_rate": 6.976036735790004e-05, + "loss": 1.7877, + "step": 12693 + }, + { + "epoch": 3.896255371393493, + "grad_norm": 0.22915001213550568, + "learning_rate": 6.975580134404017e-05, + "loss": 1.7741, + "step": 12694 + }, + { + "epoch": 3.896562308164518, + "grad_norm": 0.22975030541419983, + "learning_rate": 6.97512351349419e-05, + "loss": 1.772, + "step": 12695 + }, + { + "epoch": 3.8968692449355435, + "grad_norm": 0.29515185952186584, + "learning_rate": 6.974666873065034e-05, + "loss": 1.8001, + "step": 12696 + }, + { + "epoch": 3.8971761817065684, + "grad_norm": 0.26904794573783875, + "learning_rate": 6.974210213121064e-05, + "loss": 1.7069, + "step": 12697 + }, + { + "epoch": 3.8974831184775938, + "grad_norm": 0.2549479603767395, + "learning_rate": 6.97375353366679e-05, + "loss": 1.7419, + "step": 12698 + }, + { + "epoch": 3.8977900552486187, + "grad_norm": 0.23750101029872894, + "learning_rate": 6.973296834706729e-05, + "loss": 1.7815, + "step": 12699 + }, + { + "epoch": 3.898096992019644, + "grad_norm": 0.23529762029647827, + "learning_rate": 6.972840116245389e-05, + "loss": 1.8139, + "step": 12700 + }, + { + "epoch": 3.8984039287906693, + "grad_norm": 0.3212098777294159, + "learning_rate": 6.97238337828729e-05, + "loss": 1.7507, + "step": 12701 + }, + { + "epoch": 3.898710865561694, + "grad_norm": 0.3167687952518463, + "learning_rate": 6.971926620836941e-05, + "loss": 1.8062, + "step": 12702 + }, + { + "epoch": 3.8990178023327196, + "grad_norm": 0.31298309564590454, + "learning_rate": 6.971469843898855e-05, + "loss": 1.8127, + "step": 12703 + }, + { + "epoch": 3.8993247391037444, + "grad_norm": 0.2537378668785095, + "learning_rate": 6.971013047477551e-05, + "loss": 1.7675, + "step": 12704 + }, + { + "epoch": 3.8996316758747698, + "grad_norm": 0.24292805790901184, + "learning_rate": 6.97055623157754e-05, + "loss": 1.8004, + "step": 12705 + }, + { + "epoch": 3.899938612645795, + 
"grad_norm": 0.2929537296295166, + "learning_rate": 6.970099396203338e-05, + "loss": 1.7963, + "step": 12706 + }, + { + "epoch": 3.90024554941682, + "grad_norm": 0.30531612038612366, + "learning_rate": 6.969642541359459e-05, + "loss": 1.7347, + "step": 12707 + }, + { + "epoch": 3.9005524861878453, + "grad_norm": 0.3138202726840973, + "learning_rate": 6.969185667050417e-05, + "loss": 1.7987, + "step": 12708 + }, + { + "epoch": 3.9008594229588702, + "grad_norm": 0.2366247922182083, + "learning_rate": 6.96872877328073e-05, + "loss": 1.7671, + "step": 12709 + }, + { + "epoch": 3.9011663597298956, + "grad_norm": 0.26251721382141113, + "learning_rate": 6.96827186005491e-05, + "loss": 1.7657, + "step": 12710 + }, + { + "epoch": 3.901473296500921, + "grad_norm": 0.32497119903564453, + "learning_rate": 6.967814927377474e-05, + "loss": 1.7873, + "step": 12711 + }, + { + "epoch": 3.9017802332719462, + "grad_norm": 0.3290228843688965, + "learning_rate": 6.967357975252939e-05, + "loss": 1.8076, + "step": 12712 + }, + { + "epoch": 3.902087170042971, + "grad_norm": 0.2737300992012024, + "learning_rate": 6.966901003685817e-05, + "loss": 1.7405, + "step": 12713 + }, + { + "epoch": 3.9023941068139965, + "grad_norm": 0.25465309619903564, + "learning_rate": 6.966444012680626e-05, + "loss": 1.8063, + "step": 12714 + }, + { + "epoch": 3.9027010435850213, + "grad_norm": 0.2397255003452301, + "learning_rate": 6.965987002241885e-05, + "loss": 1.8079, + "step": 12715 + }, + { + "epoch": 3.9030079803560467, + "grad_norm": 0.23115718364715576, + "learning_rate": 6.965529972374108e-05, + "loss": 1.8032, + "step": 12716 + }, + { + "epoch": 3.903314917127072, + "grad_norm": 0.2536461055278778, + "learning_rate": 6.96507292308181e-05, + "loss": 1.7477, + "step": 12717 + }, + { + "epoch": 3.903621853898097, + "grad_norm": 0.27151185274124146, + "learning_rate": 6.96461585436951e-05, + "loss": 1.75, + "step": 12718 + }, + { + "epoch": 3.9039287906691222, + "grad_norm": 0.26894113421440125, + 
"learning_rate": 6.964158766241726e-05, + "loss": 1.7816, + "step": 12719 + }, + { + "epoch": 3.904235727440147, + "grad_norm": 0.23541375994682312, + "learning_rate": 6.963701658702972e-05, + "loss": 1.7991, + "step": 12720 + }, + { + "epoch": 3.9045426642111725, + "grad_norm": 0.22142915427684784, + "learning_rate": 6.96324453175777e-05, + "loss": 1.7245, + "step": 12721 + }, + { + "epoch": 3.904849600982198, + "grad_norm": 0.32864269614219666, + "learning_rate": 6.962787385410632e-05, + "loss": 1.7631, + "step": 12722 + }, + { + "epoch": 3.9051565377532227, + "grad_norm": 0.23657776415348053, + "learning_rate": 6.96233021966608e-05, + "loss": 1.8081, + "step": 12723 + }, + { + "epoch": 3.905463474524248, + "grad_norm": 0.24790632724761963, + "learning_rate": 6.961873034528629e-05, + "loss": 1.7193, + "step": 12724 + }, + { + "epoch": 3.905770411295273, + "grad_norm": 0.2517886459827423, + "learning_rate": 6.961415830002801e-05, + "loss": 1.7785, + "step": 12725 + }, + { + "epoch": 3.9060773480662982, + "grad_norm": 0.2340923547744751, + "learning_rate": 6.960958606093113e-05, + "loss": 1.7632, + "step": 12726 + }, + { + "epoch": 3.9063842848373236, + "grad_norm": 0.23260441422462463, + "learning_rate": 6.960501362804079e-05, + "loss": 1.7865, + "step": 12727 + }, + { + "epoch": 3.906691221608349, + "grad_norm": 0.22616329789161682, + "learning_rate": 6.960044100140224e-05, + "loss": 1.7851, + "step": 12728 + }, + { + "epoch": 3.906998158379374, + "grad_norm": 0.2849951982498169, + "learning_rate": 6.959586818106064e-05, + "loss": 1.8618, + "step": 12729 + }, + { + "epoch": 3.907305095150399, + "grad_norm": 0.3279374837875366, + "learning_rate": 6.95912951670612e-05, + "loss": 1.8563, + "step": 12730 + }, + { + "epoch": 3.907612031921424, + "grad_norm": 0.24359555542469025, + "learning_rate": 6.958672195944906e-05, + "loss": 1.7604, + "step": 12731 + }, + { + "epoch": 3.9079189686924494, + "grad_norm": 0.30881935358047485, + "learning_rate": 
6.958214855826947e-05, + "loss": 1.8463, + "step": 12732 + }, + { + "epoch": 3.9082259054634747, + "grad_norm": 0.25361543893814087, + "learning_rate": 6.957757496356763e-05, + "loss": 1.7831, + "step": 12733 + }, + { + "epoch": 3.9085328422344996, + "grad_norm": 0.26763513684272766, + "learning_rate": 6.957300117538869e-05, + "loss": 1.8383, + "step": 12734 + }, + { + "epoch": 3.908839779005525, + "grad_norm": 0.2238057255744934, + "learning_rate": 6.95684271937779e-05, + "loss": 1.7702, + "step": 12735 + }, + { + "epoch": 3.90914671577655, + "grad_norm": 0.22110232710838318, + "learning_rate": 6.956385301878045e-05, + "loss": 1.7931, + "step": 12736 + }, + { + "epoch": 3.909453652547575, + "grad_norm": 0.23765070736408234, + "learning_rate": 6.955927865044152e-05, + "loss": 1.7212, + "step": 12737 + }, + { + "epoch": 3.9097605893186005, + "grad_norm": 0.22324508428573608, + "learning_rate": 6.955470408880633e-05, + "loss": 1.7161, + "step": 12738 + }, + { + "epoch": 3.9100675260896254, + "grad_norm": 0.22485347092151642, + "learning_rate": 6.955012933392012e-05, + "loss": 1.7374, + "step": 12739 + }, + { + "epoch": 3.9103744628606507, + "grad_norm": 0.28046715259552, + "learning_rate": 6.954555438582806e-05, + "loss": 1.9264, + "step": 12740 + }, + { + "epoch": 3.9106813996316756, + "grad_norm": 0.26391276717185974, + "learning_rate": 6.954097924457536e-05, + "loss": 1.7343, + "step": 12741 + }, + { + "epoch": 3.910988336402701, + "grad_norm": 0.29596614837646484, + "learning_rate": 6.953640391020726e-05, + "loss": 1.8111, + "step": 12742 + }, + { + "epoch": 3.9112952731737263, + "grad_norm": 0.2709808051586151, + "learning_rate": 6.953182838276896e-05, + "loss": 1.7776, + "step": 12743 + }, + { + "epoch": 3.9116022099447516, + "grad_norm": 0.2585100531578064, + "learning_rate": 6.952725266230571e-05, + "loss": 1.7774, + "step": 12744 + }, + { + "epoch": 3.9119091467157765, + "grad_norm": 0.26490530371665955, + "learning_rate": 6.952267674886268e-05, + "loss": 
1.78, + "step": 12745 + }, + { + "epoch": 3.912216083486802, + "grad_norm": 0.23654767870903015, + "learning_rate": 6.951810064248512e-05, + "loss": 1.8263, + "step": 12746 + }, + { + "epoch": 3.9125230202578267, + "grad_norm": 0.2495296597480774, + "learning_rate": 6.951352434321826e-05, + "loss": 1.787, + "step": 12747 + }, + { + "epoch": 3.912829957028852, + "grad_norm": 0.24038313329219818, + "learning_rate": 6.950894785110728e-05, + "loss": 1.774, + "step": 12748 + }, + { + "epoch": 3.9131368937998774, + "grad_norm": 0.23738732933998108, + "learning_rate": 6.950437116619749e-05, + "loss": 1.7401, + "step": 12749 + }, + { + "epoch": 3.9134438305709023, + "grad_norm": 0.28192025423049927, + "learning_rate": 6.949979428853405e-05, + "loss": 1.8416, + "step": 12750 + }, + { + "epoch": 3.9137507673419276, + "grad_norm": 0.30579057335853577, + "learning_rate": 6.949521721816221e-05, + "loss": 1.7404, + "step": 12751 + }, + { + "epoch": 3.9140577041129525, + "grad_norm": 0.23972894251346588, + "learning_rate": 6.949063995512721e-05, + "loss": 1.7543, + "step": 12752 + }, + { + "epoch": 3.914364640883978, + "grad_norm": 0.2837793231010437, + "learning_rate": 6.94860624994743e-05, + "loss": 1.7779, + "step": 12753 + }, + { + "epoch": 3.914671577655003, + "grad_norm": 0.3344916105270386, + "learning_rate": 6.948148485124868e-05, + "loss": 1.7803, + "step": 12754 + }, + { + "epoch": 3.9149785144260285, + "grad_norm": 0.24271291494369507, + "learning_rate": 6.94769070104956e-05, + "loss": 1.7362, + "step": 12755 + }, + { + "epoch": 3.9152854511970534, + "grad_norm": 0.25299304723739624, + "learning_rate": 6.947232897726031e-05, + "loss": 1.7685, + "step": 12756 + }, + { + "epoch": 3.9155923879680787, + "grad_norm": 0.24766205251216888, + "learning_rate": 6.946775075158807e-05, + "loss": 1.829, + "step": 12757 + }, + { + "epoch": 3.9158993247391036, + "grad_norm": 0.2508428692817688, + "learning_rate": 6.94631723335241e-05, + "loss": 1.809, + "step": 12758 + }, + { + 
"epoch": 3.916206261510129, + "grad_norm": 0.2172096222639084, + "learning_rate": 6.945859372311365e-05, + "loss": 1.7376, + "step": 12759 + }, + { + "epoch": 3.9165131982811543, + "grad_norm": 0.28976425528526306, + "learning_rate": 6.945401492040198e-05, + "loss": 1.8229, + "step": 12760 + }, + { + "epoch": 3.916820135052179, + "grad_norm": 0.3528063893318176, + "learning_rate": 6.944943592543432e-05, + "loss": 1.7559, + "step": 12761 + }, + { + "epoch": 3.9171270718232045, + "grad_norm": 0.46312370896339417, + "learning_rate": 6.944485673825595e-05, + "loss": 1.7664, + "step": 12762 + }, + { + "epoch": 3.9174340085942294, + "grad_norm": 0.4466164708137512, + "learning_rate": 6.94402773589121e-05, + "loss": 1.7833, + "step": 12763 + }, + { + "epoch": 3.9177409453652547, + "grad_norm": 0.2637740969657898, + "learning_rate": 6.943569778744804e-05, + "loss": 1.818, + "step": 12764 + }, + { + "epoch": 3.91804788213628, + "grad_norm": 0.37515267729759216, + "learning_rate": 6.943111802390901e-05, + "loss": 1.7898, + "step": 12765 + }, + { + "epoch": 3.918354818907305, + "grad_norm": 0.45146289467811584, + "learning_rate": 6.942653806834029e-05, + "loss": 1.7797, + "step": 12766 + }, + { + "epoch": 3.9186617556783303, + "grad_norm": 0.2809859812259674, + "learning_rate": 6.942195792078712e-05, + "loss": 1.7836, + "step": 12767 + }, + { + "epoch": 3.918968692449355, + "grad_norm": 0.3606306314468384, + "learning_rate": 6.94173775812948e-05, + "loss": 1.7657, + "step": 12768 + }, + { + "epoch": 3.9192756292203805, + "grad_norm": 0.49528738856315613, + "learning_rate": 6.941279704990857e-05, + "loss": 1.7628, + "step": 12769 + }, + { + "epoch": 3.919582565991406, + "grad_norm": 0.3484322428703308, + "learning_rate": 6.940821632667371e-05, + "loss": 1.7939, + "step": 12770 + }, + { + "epoch": 3.919889502762431, + "grad_norm": 0.2479606419801712, + "learning_rate": 6.940363541163546e-05, + "loss": 1.813, + "step": 12771 + }, + { + "epoch": 3.920196439533456, + "grad_norm": 
0.3491765558719635, + "learning_rate": 6.939905430483911e-05, + "loss": 1.7338, + "step": 12772 + }, + { + "epoch": 3.9205033763044814, + "grad_norm": 0.291810005903244, + "learning_rate": 6.939447300632995e-05, + "loss": 1.7445, + "step": 12773 + }, + { + "epoch": 3.9208103130755063, + "grad_norm": 0.2467527985572815, + "learning_rate": 6.938989151615324e-05, + "loss": 1.8462, + "step": 12774 + }, + { + "epoch": 3.9211172498465316, + "grad_norm": 0.35656824707984924, + "learning_rate": 6.938530983435426e-05, + "loss": 1.7751, + "step": 12775 + }, + { + "epoch": 3.921424186617557, + "grad_norm": 0.31269776821136475, + "learning_rate": 6.938072796097828e-05, + "loss": 1.7714, + "step": 12776 + }, + { + "epoch": 3.921731123388582, + "grad_norm": 0.2082831859588623, + "learning_rate": 6.937614589607058e-05, + "loss": 1.7263, + "step": 12777 + }, + { + "epoch": 3.922038060159607, + "grad_norm": 0.27583765983581543, + "learning_rate": 6.937156363967646e-05, + "loss": 1.6822, + "step": 12778 + }, + { + "epoch": 3.922344996930632, + "grad_norm": 0.32773876190185547, + "learning_rate": 6.93669811918412e-05, + "loss": 1.7792, + "step": 12779 + }, + { + "epoch": 3.9226519337016574, + "grad_norm": 0.2583121657371521, + "learning_rate": 6.936239855261007e-05, + "loss": 1.7812, + "step": 12780 + }, + { + "epoch": 3.9229588704726828, + "grad_norm": 0.245570570230484, + "learning_rate": 6.935781572202836e-05, + "loss": 1.7252, + "step": 12781 + }, + { + "epoch": 3.9232658072437077, + "grad_norm": 0.2379419505596161, + "learning_rate": 6.935323270014138e-05, + "loss": 1.7485, + "step": 12782 + }, + { + "epoch": 3.923572744014733, + "grad_norm": 0.2239784598350525, + "learning_rate": 6.934864948699439e-05, + "loss": 1.7444, + "step": 12783 + }, + { + "epoch": 3.923879680785758, + "grad_norm": 0.2366618812084198, + "learning_rate": 6.934406608263274e-05, + "loss": 1.777, + "step": 12784 + }, + { + "epoch": 3.924186617556783, + "grad_norm": 0.22583791613578796, + "learning_rate": 
6.933948248710169e-05, + "loss": 1.7291, + "step": 12785 + }, + { + "epoch": 3.9244935543278086, + "grad_norm": 0.24141047894954681, + "learning_rate": 6.933489870044651e-05, + "loss": 1.7748, + "step": 12786 + }, + { + "epoch": 3.924800491098834, + "grad_norm": 0.2389962524175644, + "learning_rate": 6.933031472271255e-05, + "loss": 1.7957, + "step": 12787 + }, + { + "epoch": 3.925107427869859, + "grad_norm": 0.25230300426483154, + "learning_rate": 6.932573055394509e-05, + "loss": 1.7621, + "step": 12788 + }, + { + "epoch": 3.925414364640884, + "grad_norm": 0.23894043266773224, + "learning_rate": 6.932114619418941e-05, + "loss": 1.7285, + "step": 12789 + }, + { + "epoch": 3.925721301411909, + "grad_norm": 0.2650291919708252, + "learning_rate": 6.931656164349086e-05, + "loss": 1.7613, + "step": 12790 + }, + { + "epoch": 3.9260282381829343, + "grad_norm": 0.20616789162158966, + "learning_rate": 6.931197690189472e-05, + "loss": 1.7505, + "step": 12791 + }, + { + "epoch": 3.9263351749539597, + "grad_norm": 0.23915675282478333, + "learning_rate": 6.930739196944633e-05, + "loss": 1.7477, + "step": 12792 + }, + { + "epoch": 3.9266421117249846, + "grad_norm": 0.2522687613964081, + "learning_rate": 6.930280684619094e-05, + "loss": 1.8, + "step": 12793 + }, + { + "epoch": 3.92694904849601, + "grad_norm": 0.264167845249176, + "learning_rate": 6.929822153217391e-05, + "loss": 1.7516, + "step": 12794 + }, + { + "epoch": 3.927255985267035, + "grad_norm": 0.21358054876327515, + "learning_rate": 6.929363602744054e-05, + "loss": 1.7207, + "step": 12795 + }, + { + "epoch": 3.92756292203806, + "grad_norm": 0.25632721185684204, + "learning_rate": 6.928905033203617e-05, + "loss": 1.7446, + "step": 12796 + }, + { + "epoch": 3.9278698588090855, + "grad_norm": 0.2717185318470001, + "learning_rate": 6.928446444600608e-05, + "loss": 1.8555, + "step": 12797 + }, + { + "epoch": 3.9281767955801103, + "grad_norm": 0.2871767282485962, + "learning_rate": 6.927987836939561e-05, + "loss": 1.7861, + 
"step": 12798 + }, + { + "epoch": 3.9284837323511357, + "grad_norm": 0.282507061958313, + "learning_rate": 6.927529210225009e-05, + "loss": 1.7683, + "step": 12799 + }, + { + "epoch": 3.9287906691221606, + "grad_norm": 0.24870644509792328, + "learning_rate": 6.927070564461482e-05, + "loss": 1.7355, + "step": 12800 + }, + { + "epoch": 3.929097605893186, + "grad_norm": 0.2093631625175476, + "learning_rate": 6.926611899653516e-05, + "loss": 1.7691, + "step": 12801 + }, + { + "epoch": 3.9294045426642112, + "grad_norm": 0.34258076548576355, + "learning_rate": 6.926153215805642e-05, + "loss": 1.8398, + "step": 12802 + }, + { + "epoch": 3.9297114794352366, + "grad_norm": 0.39179500937461853, + "learning_rate": 6.925694512922391e-05, + "loss": 1.8229, + "step": 12803 + }, + { + "epoch": 3.9300184162062615, + "grad_norm": 0.36814743280410767, + "learning_rate": 6.9252357910083e-05, + "loss": 1.7759, + "step": 12804 + }, + { + "epoch": 3.930325352977287, + "grad_norm": 0.2659403085708618, + "learning_rate": 6.924777050067902e-05, + "loss": 1.7553, + "step": 12805 + }, + { + "epoch": 3.9306322897483117, + "grad_norm": 0.20617491006851196, + "learning_rate": 6.924318290105724e-05, + "loss": 1.7398, + "step": 12806 + }, + { + "epoch": 3.930939226519337, + "grad_norm": 0.23730522394180298, + "learning_rate": 6.923859511126309e-05, + "loss": 1.699, + "step": 12807 + }, + { + "epoch": 3.9312461632903624, + "grad_norm": 0.24865423142910004, + "learning_rate": 6.923400713134184e-05, + "loss": 1.7801, + "step": 12808 + }, + { + "epoch": 3.9315531000613873, + "grad_norm": 0.2495356798171997, + "learning_rate": 6.92294189613389e-05, + "loss": 1.803, + "step": 12809 + }, + { + "epoch": 3.9318600368324126, + "grad_norm": 0.24223244190216064, + "learning_rate": 6.922483060129955e-05, + "loss": 1.751, + "step": 12810 + }, + { + "epoch": 3.9321669736034375, + "grad_norm": 0.2541450262069702, + "learning_rate": 6.922024205126913e-05, + "loss": 1.7721, + "step": 12811 + }, + { + "epoch": 
3.932473910374463, + "grad_norm": 0.24528831243515015, + "learning_rate": 6.921565331129304e-05, + "loss": 1.792, + "step": 12812 + }, + { + "epoch": 3.932780847145488, + "grad_norm": 0.22789500653743744, + "learning_rate": 6.921106438141659e-05, + "loss": 1.8455, + "step": 12813 + }, + { + "epoch": 3.933087783916513, + "grad_norm": 0.26267170906066895, + "learning_rate": 6.920647526168515e-05, + "loss": 1.7254, + "step": 12814 + }, + { + "epoch": 3.9333947206875384, + "grad_norm": 0.23044808208942413, + "learning_rate": 6.920188595214406e-05, + "loss": 1.7217, + "step": 12815 + }, + { + "epoch": 3.9337016574585633, + "grad_norm": 0.2304011732339859, + "learning_rate": 6.919729645283867e-05, + "loss": 1.8121, + "step": 12816 + }, + { + "epoch": 3.9340085942295886, + "grad_norm": 0.21516792476177216, + "learning_rate": 6.919270676381435e-05, + "loss": 1.7305, + "step": 12817 + }, + { + "epoch": 3.934315531000614, + "grad_norm": 0.24698840081691742, + "learning_rate": 6.918811688511646e-05, + "loss": 1.7967, + "step": 12818 + }, + { + "epoch": 3.9346224677716393, + "grad_norm": 0.23132537305355072, + "learning_rate": 6.918352681679035e-05, + "loss": 1.7439, + "step": 12819 + }, + { + "epoch": 3.934929404542664, + "grad_norm": 0.2597793936729431, + "learning_rate": 6.917893655888139e-05, + "loss": 1.7882, + "step": 12820 + }, + { + "epoch": 3.9352363413136895, + "grad_norm": 0.23946607112884521, + "learning_rate": 6.917434611143493e-05, + "loss": 1.7991, + "step": 12821 + }, + { + "epoch": 3.9355432780847144, + "grad_norm": 0.25808244943618774, + "learning_rate": 6.916975547449634e-05, + "loss": 1.845, + "step": 12822 + }, + { + "epoch": 3.9358502148557397, + "grad_norm": 0.26082557439804077, + "learning_rate": 6.9165164648111e-05, + "loss": 1.7562, + "step": 12823 + }, + { + "epoch": 3.936157151626765, + "grad_norm": 0.24810053408145905, + "learning_rate": 6.916057363232425e-05, + "loss": 1.778, + "step": 12824 + }, + { + "epoch": 3.93646408839779, + "grad_norm": 
0.24168157577514648, + "learning_rate": 6.91559824271815e-05, + "loss": 1.7628, + "step": 12825 + }, + { + "epoch": 3.9367710251688153, + "grad_norm": 0.23800434172153473, + "learning_rate": 6.91513910327281e-05, + "loss": 1.8063, + "step": 12826 + }, + { + "epoch": 3.93707796193984, + "grad_norm": 0.23055073618888855, + "learning_rate": 6.914679944900944e-05, + "loss": 1.749, + "step": 12827 + }, + { + "epoch": 3.9373848987108655, + "grad_norm": 0.22455987334251404, + "learning_rate": 6.914220767607088e-05, + "loss": 1.7471, + "step": 12828 + }, + { + "epoch": 3.937691835481891, + "grad_norm": 0.21808378398418427, + "learning_rate": 6.913761571395778e-05, + "loss": 1.7503, + "step": 12829 + }, + { + "epoch": 3.937998772252916, + "grad_norm": 0.23136213421821594, + "learning_rate": 6.913302356271556e-05, + "loss": 1.752, + "step": 12830 + }, + { + "epoch": 3.938305709023941, + "grad_norm": 0.29579970240592957, + "learning_rate": 6.912843122238959e-05, + "loss": 1.8028, + "step": 12831 + }, + { + "epoch": 3.9386126457949664, + "grad_norm": 0.28578072786331177, + "learning_rate": 6.912383869302526e-05, + "loss": 1.8183, + "step": 12832 + }, + { + "epoch": 3.9389195825659913, + "grad_norm": 0.2616737186908722, + "learning_rate": 6.911924597466793e-05, + "loss": 1.8366, + "step": 12833 + }, + { + "epoch": 3.9392265193370166, + "grad_norm": 0.29275768995285034, + "learning_rate": 6.911465306736302e-05, + "loss": 1.731, + "step": 12834 + }, + { + "epoch": 3.939533456108042, + "grad_norm": 0.3300873041152954, + "learning_rate": 6.91100599711559e-05, + "loss": 1.8713, + "step": 12835 + }, + { + "epoch": 3.939840392879067, + "grad_norm": 0.2744643986225128, + "learning_rate": 6.910546668609195e-05, + "loss": 1.8479, + "step": 12836 + }, + { + "epoch": 3.940147329650092, + "grad_norm": 0.25248417258262634, + "learning_rate": 6.91008732122166e-05, + "loss": 1.7962, + "step": 12837 + }, + { + "epoch": 3.940454266421117, + "grad_norm": 0.3068546652793884, + "learning_rate": 
6.909627954957521e-05, + "loss": 1.759, + "step": 12838 + }, + { + "epoch": 3.9407612031921424, + "grad_norm": 0.3273559808731079, + "learning_rate": 6.909168569821321e-05, + "loss": 1.814, + "step": 12839 + }, + { + "epoch": 3.9410681399631677, + "grad_norm": 0.31192758679389954, + "learning_rate": 6.908709165817597e-05, + "loss": 1.7906, + "step": 12840 + }, + { + "epoch": 3.9413750767341926, + "grad_norm": 0.24487090110778809, + "learning_rate": 6.90824974295089e-05, + "loss": 1.8238, + "step": 12841 + }, + { + "epoch": 3.941682013505218, + "grad_norm": 0.24863721430301666, + "learning_rate": 6.907790301225743e-05, + "loss": 1.7651, + "step": 12842 + }, + { + "epoch": 3.941988950276243, + "grad_norm": 0.26555630564689636, + "learning_rate": 6.907330840646693e-05, + "loss": 1.8268, + "step": 12843 + }, + { + "epoch": 3.942295887047268, + "grad_norm": 0.2439817190170288, + "learning_rate": 6.906871361218281e-05, + "loss": 1.7291, + "step": 12844 + }, + { + "epoch": 3.9426028238182935, + "grad_norm": 0.2410304993391037, + "learning_rate": 6.906411862945048e-05, + "loss": 1.712, + "step": 12845 + }, + { + "epoch": 3.942909760589319, + "grad_norm": 0.28575149178504944, + "learning_rate": 6.905952345831537e-05, + "loss": 1.7269, + "step": 12846 + }, + { + "epoch": 3.9432166973603437, + "grad_norm": 0.3055815100669861, + "learning_rate": 6.905492809882286e-05, + "loss": 1.7234, + "step": 12847 + }, + { + "epoch": 3.943523634131369, + "grad_norm": 0.2762533724308014, + "learning_rate": 6.905033255101839e-05, + "loss": 1.7768, + "step": 12848 + }, + { + "epoch": 3.943830570902394, + "grad_norm": 0.22819125652313232, + "learning_rate": 6.904573681494738e-05, + "loss": 1.7416, + "step": 12849 + }, + { + "epoch": 3.9441375076734193, + "grad_norm": 0.21664194762706757, + "learning_rate": 6.904114089065523e-05, + "loss": 1.7506, + "step": 12850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.21935151517391205, + "learning_rate": 6.903654477818735e-05, + "loss": 
1.7522, + "step": 12851 + }, + { + "epoch": 3.9447513812154695, + "grad_norm": 0.2204175442457199, + "learning_rate": 6.903194847758918e-05, + "loss": 1.7753, + "step": 12852 + }, + { + "epoch": 3.945058317986495, + "grad_norm": 0.23130151629447937, + "learning_rate": 6.902735198890615e-05, + "loss": 1.7743, + "step": 12853 + }, + { + "epoch": 3.9453652547575198, + "grad_norm": 0.2548399567604065, + "learning_rate": 6.902275531218368e-05, + "loss": 1.8373, + "step": 12854 + }, + { + "epoch": 3.945672191528545, + "grad_norm": 0.2905479371547699, + "learning_rate": 6.901815844746718e-05, + "loss": 1.8336, + "step": 12855 + }, + { + "epoch": 3.9459791282995704, + "grad_norm": 0.2698945105075836, + "learning_rate": 6.90135613948021e-05, + "loss": 1.7498, + "step": 12856 + }, + { + "epoch": 3.9462860650705953, + "grad_norm": 0.24966828525066376, + "learning_rate": 6.900896415423387e-05, + "loss": 1.7664, + "step": 12857 + }, + { + "epoch": 3.9465930018416207, + "grad_norm": 0.23272784054279327, + "learning_rate": 6.90043667258079e-05, + "loss": 1.7742, + "step": 12858 + }, + { + "epoch": 3.9468999386126455, + "grad_norm": 0.2277698516845703, + "learning_rate": 6.899976910956965e-05, + "loss": 1.7465, + "step": 12859 + }, + { + "epoch": 3.947206875383671, + "grad_norm": 0.2376442402601242, + "learning_rate": 6.899517130556454e-05, + "loss": 1.7995, + "step": 12860 + }, + { + "epoch": 3.947513812154696, + "grad_norm": 0.25591593980789185, + "learning_rate": 6.899057331383802e-05, + "loss": 1.8017, + "step": 12861 + }, + { + "epoch": 3.9478207489257215, + "grad_norm": 0.2715262472629547, + "learning_rate": 6.898597513443551e-05, + "loss": 1.7967, + "step": 12862 + }, + { + "epoch": 3.9481276856967464, + "grad_norm": 0.20916256308555603, + "learning_rate": 6.898137676740246e-05, + "loss": 1.7711, + "step": 12863 + }, + { + "epoch": 3.9484346224677718, + "grad_norm": 0.2570229768753052, + "learning_rate": 6.897677821278435e-05, + "loss": 1.833, + "step": 12864 + }, + { + 
"epoch": 3.9487415592387967, + "grad_norm": 0.26343438029289246, + "learning_rate": 6.897217947062657e-05, + "loss": 1.7625, + "step": 12865 + }, + { + "epoch": 3.949048496009822, + "grad_norm": 0.23407024145126343, + "learning_rate": 6.896758054097459e-05, + "loss": 1.7211, + "step": 12866 + }, + { + "epoch": 3.9493554327808473, + "grad_norm": 0.2554715573787689, + "learning_rate": 6.896298142387387e-05, + "loss": 1.8548, + "step": 12867 + }, + { + "epoch": 3.949662369551872, + "grad_norm": 0.24143370985984802, + "learning_rate": 6.895838211936986e-05, + "loss": 1.7635, + "step": 12868 + }, + { + "epoch": 3.9499693063228976, + "grad_norm": 0.24634715914726257, + "learning_rate": 6.8953782627508e-05, + "loss": 1.8012, + "step": 12869 + }, + { + "epoch": 3.9502762430939224, + "grad_norm": 0.22740426659584045, + "learning_rate": 6.894918294833375e-05, + "loss": 1.7294, + "step": 12870 + }, + { + "epoch": 3.950583179864948, + "grad_norm": 0.2651631832122803, + "learning_rate": 6.894458308189257e-05, + "loss": 1.8289, + "step": 12871 + }, + { + "epoch": 3.950890116635973, + "grad_norm": 0.28693267703056335, + "learning_rate": 6.893998302822991e-05, + "loss": 1.8462, + "step": 12872 + }, + { + "epoch": 3.951197053406998, + "grad_norm": 0.26584213972091675, + "learning_rate": 6.893538278739125e-05, + "loss": 1.7621, + "step": 12873 + }, + { + "epoch": 3.9515039901780233, + "grad_norm": 0.29970669746398926, + "learning_rate": 6.893078235942203e-05, + "loss": 1.7659, + "step": 12874 + }, + { + "epoch": 3.9518109269490482, + "grad_norm": 0.2271152138710022, + "learning_rate": 6.892618174436771e-05, + "loss": 1.7151, + "step": 12875 + }, + { + "epoch": 3.9521178637200736, + "grad_norm": 0.24783682823181152, + "learning_rate": 6.892158094227379e-05, + "loss": 1.761, + "step": 12876 + }, + { + "epoch": 3.952424800491099, + "grad_norm": 0.2371140718460083, + "learning_rate": 6.891697995318573e-05, + "loss": 1.7557, + "step": 12877 + }, + { + "epoch": 3.9527317372621242, + 
"grad_norm": 0.29708394408226013, + "learning_rate": 6.891237877714896e-05, + "loss": 1.8629, + "step": 12878 + }, + { + "epoch": 3.953038674033149, + "grad_norm": 0.2724219262599945, + "learning_rate": 6.890777741420899e-05, + "loss": 1.7378, + "step": 12879 + }, + { + "epoch": 3.9533456108041745, + "grad_norm": 0.2227276861667633, + "learning_rate": 6.890317586441126e-05, + "loss": 1.6989, + "step": 12880 + }, + { + "epoch": 3.9536525475751993, + "grad_norm": 0.2546161413192749, + "learning_rate": 6.889857412780128e-05, + "loss": 1.8688, + "step": 12881 + }, + { + "epoch": 3.9539594843462247, + "grad_norm": 0.24882884323596954, + "learning_rate": 6.889397220442452e-05, + "loss": 1.8137, + "step": 12882 + }, + { + "epoch": 3.95426642111725, + "grad_norm": 0.2549113929271698, + "learning_rate": 6.888937009432644e-05, + "loss": 1.8366, + "step": 12883 + }, + { + "epoch": 3.954573357888275, + "grad_norm": 0.30032673478126526, + "learning_rate": 6.888476779755255e-05, + "loss": 1.8267, + "step": 12884 + }, + { + "epoch": 3.9548802946593002, + "grad_norm": 0.2887294292449951, + "learning_rate": 6.888016531414832e-05, + "loss": 1.8295, + "step": 12885 + }, + { + "epoch": 3.955187231430325, + "grad_norm": 0.2947406470775604, + "learning_rate": 6.88755626441592e-05, + "loss": 1.7713, + "step": 12886 + }, + { + "epoch": 3.9554941682013505, + "grad_norm": 0.2967108190059662, + "learning_rate": 6.887095978763072e-05, + "loss": 1.7636, + "step": 12887 + }, + { + "epoch": 3.955801104972376, + "grad_norm": 0.2495311200618744, + "learning_rate": 6.886635674460836e-05, + "loss": 1.8148, + "step": 12888 + }, + { + "epoch": 3.9561080417434007, + "grad_norm": 0.23367099463939667, + "learning_rate": 6.88617535151376e-05, + "loss": 1.7353, + "step": 12889 + }, + { + "epoch": 3.956414978514426, + "grad_norm": 0.36790570616722107, + "learning_rate": 6.885715009926395e-05, + "loss": 1.7853, + "step": 12890 + }, + { + "epoch": 3.9567219152854514, + "grad_norm": 0.5013020038604736, + 
"learning_rate": 6.885254649703287e-05, + "loss": 1.7923, + "step": 12891 + }, + { + "epoch": 3.9570288520564763, + "grad_norm": 0.4446276128292084, + "learning_rate": 6.884794270848988e-05, + "loss": 1.7504, + "step": 12892 + }, + { + "epoch": 3.9573357888275016, + "grad_norm": 0.2478526383638382, + "learning_rate": 6.88433387336805e-05, + "loss": 1.7629, + "step": 12893 + }, + { + "epoch": 3.957642725598527, + "grad_norm": 0.30111798644065857, + "learning_rate": 6.883873457265019e-05, + "loss": 1.8291, + "step": 12894 + }, + { + "epoch": 3.957949662369552, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.883413022544445e-05, + "loss": 1.7919, + "step": 12895 + }, + { + "epoch": 3.958256599140577, + "grad_norm": 0.2895318269729614, + "learning_rate": 6.882952569210881e-05, + "loss": 1.7467, + "step": 12896 + }, + { + "epoch": 3.958563535911602, + "grad_norm": 0.30391454696655273, + "learning_rate": 6.882492097268873e-05, + "loss": 1.8145, + "step": 12897 + }, + { + "epoch": 3.9588704726826274, + "grad_norm": 0.5033623576164246, + "learning_rate": 6.882031606722977e-05, + "loss": 1.8231, + "step": 12898 + }, + { + "epoch": 3.9591774094536527, + "grad_norm": 0.5351777672767639, + "learning_rate": 6.881571097577742e-05, + "loss": 1.807, + "step": 12899 + }, + { + "epoch": 3.9594843462246776, + "grad_norm": 0.35540491342544556, + "learning_rate": 6.881110569837719e-05, + "loss": 1.7626, + "step": 12900 + }, + { + "epoch": 3.959791282995703, + "grad_norm": 0.22447600960731506, + "learning_rate": 6.880650023507457e-05, + "loss": 1.7392, + "step": 12901 + }, + { + "epoch": 3.960098219766728, + "grad_norm": 0.44619202613830566, + "learning_rate": 6.88018945859151e-05, + "loss": 1.8138, + "step": 12902 + }, + { + "epoch": 3.960405156537753, + "grad_norm": 0.41381633281707764, + "learning_rate": 6.879728875094428e-05, + "loss": 1.7676, + "step": 12903 + }, + { + "epoch": 3.9607120933087785, + "grad_norm": 0.2601528465747833, + "learning_rate": 6.879268273020764e-05, 
+ "loss": 1.8406, + "step": 12904 + }, + { + "epoch": 3.961019030079804, + "grad_norm": 0.3309035003185272, + "learning_rate": 6.878807652375071e-05, + "loss": 1.7673, + "step": 12905 + }, + { + "epoch": 3.9613259668508287, + "grad_norm": 0.5281669497489929, + "learning_rate": 6.878347013161899e-05, + "loss": 1.7686, + "step": 12906 + }, + { + "epoch": 3.961632903621854, + "grad_norm": 0.5397645831108093, + "learning_rate": 6.8778863553858e-05, + "loss": 1.8575, + "step": 12907 + }, + { + "epoch": 3.961939840392879, + "grad_norm": 0.329485684633255, + "learning_rate": 6.877425679051327e-05, + "loss": 1.8185, + "step": 12908 + }, + { + "epoch": 3.9622467771639043, + "grad_norm": 0.3012789487838745, + "learning_rate": 6.876964984163034e-05, + "loss": 1.7962, + "step": 12909 + }, + { + "epoch": 3.9625537139349296, + "grad_norm": 0.5596817135810852, + "learning_rate": 6.876504270725472e-05, + "loss": 1.7972, + "step": 12910 + }, + { + "epoch": 3.9628606507059545, + "grad_norm": 0.5374729633331299, + "learning_rate": 6.876043538743197e-05, + "loss": 1.7863, + "step": 12911 + }, + { + "epoch": 3.96316758747698, + "grad_norm": 0.24617290496826172, + "learning_rate": 6.875582788220757e-05, + "loss": 1.7555, + "step": 12912 + }, + { + "epoch": 3.9634745242480047, + "grad_norm": 0.3493972420692444, + "learning_rate": 6.875122019162712e-05, + "loss": 1.8595, + "step": 12913 + }, + { + "epoch": 3.96378146101903, + "grad_norm": 0.4293089807033539, + "learning_rate": 6.874661231573609e-05, + "loss": 1.7647, + "step": 12914 + }, + { + "epoch": 3.9640883977900554, + "grad_norm": 0.30602574348449707, + "learning_rate": 6.874200425458006e-05, + "loss": 1.7122, + "step": 12915 + }, + { + "epoch": 3.9643953345610803, + "grad_norm": 0.22776013612747192, + "learning_rate": 6.873739600820457e-05, + "loss": 1.7136, + "step": 12916 + }, + { + "epoch": 3.9647022713321056, + "grad_norm": 0.3727327585220337, + "learning_rate": 6.873278757665513e-05, + "loss": 1.8314, + "step": 12917 + }, + { 
+ "epoch": 3.9650092081031305, + "grad_norm": 0.35110536217689514, + "learning_rate": 6.872817895997733e-05, + "loss": 1.7506, + "step": 12918 + }, + { + "epoch": 3.965316144874156, + "grad_norm": 0.275560587644577, + "learning_rate": 6.872357015821666e-05, + "loss": 1.7865, + "step": 12919 + }, + { + "epoch": 3.965623081645181, + "grad_norm": 0.2686980366706848, + "learning_rate": 6.871896117141873e-05, + "loss": 1.8431, + "step": 12920 + }, + { + "epoch": 3.9659300184162065, + "grad_norm": 0.3299664556980133, + "learning_rate": 6.871435199962901e-05, + "loss": 1.7988, + "step": 12921 + }, + { + "epoch": 3.9662369551872314, + "grad_norm": 0.2833637297153473, + "learning_rate": 6.870974264289313e-05, + "loss": 1.6993, + "step": 12922 + }, + { + "epoch": 3.9665438919582567, + "grad_norm": 0.25062620639801025, + "learning_rate": 6.870513310125659e-05, + "loss": 1.7814, + "step": 12923 + }, + { + "epoch": 3.9668508287292816, + "grad_norm": 0.26609909534454346, + "learning_rate": 6.870052337476498e-05, + "loss": 1.7871, + "step": 12924 + }, + { + "epoch": 3.967157765500307, + "grad_norm": 0.22760890424251556, + "learning_rate": 6.869591346346382e-05, + "loss": 1.7941, + "step": 12925 + }, + { + "epoch": 3.9674647022713323, + "grad_norm": 0.2845582067966461, + "learning_rate": 6.869130336739869e-05, + "loss": 1.8215, + "step": 12926 + }, + { + "epoch": 3.967771639042357, + "grad_norm": 0.254948228597641, + "learning_rate": 6.868669308661514e-05, + "loss": 1.7515, + "step": 12927 + }, + { + "epoch": 3.9680785758133825, + "grad_norm": 0.2372167855501175, + "learning_rate": 6.868208262115875e-05, + "loss": 1.7524, + "step": 12928 + }, + { + "epoch": 3.9683855125844074, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.867747197107506e-05, + "loss": 1.8139, + "step": 12929 + }, + { + "epoch": 3.9686924493554327, + "grad_norm": 0.2617839276790619, + "learning_rate": 6.867286113640965e-05, + "loss": 1.7388, + "step": 12930 + }, + { + "epoch": 3.968999386126458, + 
"grad_norm": 0.22749558091163635, + "learning_rate": 6.866825011720807e-05, + "loss": 1.7421, + "step": 12931 + }, + { + "epoch": 3.969306322897483, + "grad_norm": 0.27737462520599365, + "learning_rate": 6.86636389135159e-05, + "loss": 1.7977, + "step": 12932 + }, + { + "epoch": 3.9696132596685083, + "grad_norm": 0.3331063985824585, + "learning_rate": 6.865902752537871e-05, + "loss": 1.7925, + "step": 12933 + }, + { + "epoch": 3.969920196439533, + "grad_norm": 0.24229519069194794, + "learning_rate": 6.86544159528421e-05, + "loss": 1.7782, + "step": 12934 + }, + { + "epoch": 3.9702271332105585, + "grad_norm": 0.29494860768318176, + "learning_rate": 6.86498041959516e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 3.970534069981584, + "grad_norm": 0.26064008474349976, + "learning_rate": 6.86451922547528e-05, + "loss": 1.7161, + "step": 12936 + }, + { + "epoch": 3.970841006752609, + "grad_norm": 0.2656785547733307, + "learning_rate": 6.864058012929129e-05, + "loss": 1.8154, + "step": 12937 + }, + { + "epoch": 3.971147943523634, + "grad_norm": 0.21170997619628906, + "learning_rate": 6.863596781961263e-05, + "loss": 1.7614, + "step": 12938 + }, + { + "epoch": 3.9714548802946594, + "grad_norm": 0.21709072589874268, + "learning_rate": 6.863135532576241e-05, + "loss": 1.7896, + "step": 12939 + }, + { + "epoch": 3.9717618170656843, + "grad_norm": 0.2361367791891098, + "learning_rate": 6.862674264778623e-05, + "loss": 1.7775, + "step": 12940 + }, + { + "epoch": 3.9720687538367097, + "grad_norm": 0.22042550146579742, + "learning_rate": 6.862212978572967e-05, + "loss": 1.7781, + "step": 12941 + }, + { + "epoch": 3.972375690607735, + "grad_norm": 0.2535422146320343, + "learning_rate": 6.86175167396383e-05, + "loss": 1.7665, + "step": 12942 + }, + { + "epoch": 3.97268262737876, + "grad_norm": 0.23741906881332397, + "learning_rate": 6.861290350955771e-05, + "loss": 1.7829, + "step": 12943 + }, + { + "epoch": 3.972989564149785, + "grad_norm": 0.23789910972118378, + 
"learning_rate": 6.860829009553351e-05, + "loss": 1.7745, + "step": 12944 + }, + { + "epoch": 3.97329650092081, + "grad_norm": 0.26867765188217163, + "learning_rate": 6.860367649761127e-05, + "loss": 1.7239, + "step": 12945 + }, + { + "epoch": 3.9736034376918354, + "grad_norm": 0.3211663067340851, + "learning_rate": 6.85990627158366e-05, + "loss": 1.7976, + "step": 12946 + }, + { + "epoch": 3.9739103744628608, + "grad_norm": 0.26177310943603516, + "learning_rate": 6.85944487502551e-05, + "loss": 1.7446, + "step": 12947 + }, + { + "epoch": 3.9742173112338857, + "grad_norm": 0.23622745275497437, + "learning_rate": 6.858983460091234e-05, + "loss": 1.7824, + "step": 12948 + }, + { + "epoch": 3.974524248004911, + "grad_norm": 0.24372988939285278, + "learning_rate": 6.858522026785395e-05, + "loss": 1.8014, + "step": 12949 + }, + { + "epoch": 3.974831184775936, + "grad_norm": 0.2566998600959778, + "learning_rate": 6.85806057511255e-05, + "loss": 1.742, + "step": 12950 + }, + { + "epoch": 3.9751381215469612, + "grad_norm": 0.24418365955352783, + "learning_rate": 6.857599105077264e-05, + "loss": 1.7331, + "step": 12951 + }, + { + "epoch": 3.9754450583179866, + "grad_norm": 0.2260327935218811, + "learning_rate": 6.857137616684094e-05, + "loss": 1.7173, + "step": 12952 + }, + { + "epoch": 3.975751995089012, + "grad_norm": 0.277044415473938, + "learning_rate": 6.856676109937602e-05, + "loss": 1.7255, + "step": 12953 + }, + { + "epoch": 3.976058931860037, + "grad_norm": 0.228300079703331, + "learning_rate": 6.856214584842348e-05, + "loss": 1.7796, + "step": 12954 + }, + { + "epoch": 3.976365868631062, + "grad_norm": 0.2246638983488083, + "learning_rate": 6.855753041402893e-05, + "loss": 1.7458, + "step": 12955 + }, + { + "epoch": 3.976672805402087, + "grad_norm": 0.22235621511936188, + "learning_rate": 6.855291479623799e-05, + "loss": 1.7585, + "step": 12956 + }, + { + "epoch": 3.9769797421731123, + "grad_norm": 0.23710694909095764, + "learning_rate": 6.854829899509627e-05, + 
"loss": 1.767, + "step": 12957 + }, + { + "epoch": 3.9772866789441377, + "grad_norm": 0.2527346611022949, + "learning_rate": 6.854368301064939e-05, + "loss": 1.828, + "step": 12958 + }, + { + "epoch": 3.9775936157151626, + "grad_norm": 0.25032514333724976, + "learning_rate": 6.853906684294298e-05, + "loss": 1.8533, + "step": 12959 + }, + { + "epoch": 3.977900552486188, + "grad_norm": 0.2346320003271103, + "learning_rate": 6.853445049202262e-05, + "loss": 1.8046, + "step": 12960 + }, + { + "epoch": 3.978207489257213, + "grad_norm": 0.22576460242271423, + "learning_rate": 6.852983395793398e-05, + "loss": 1.7502, + "step": 12961 + }, + { + "epoch": 3.978514426028238, + "grad_norm": 0.2230147123336792, + "learning_rate": 6.852521724072266e-05, + "loss": 1.7362, + "step": 12962 + }, + { + "epoch": 3.9788213627992635, + "grad_norm": 0.2339705526828766, + "learning_rate": 6.852060034043425e-05, + "loss": 1.763, + "step": 12963 + }, + { + "epoch": 3.979128299570289, + "grad_norm": 0.24511271715164185, + "learning_rate": 6.851598325711446e-05, + "loss": 1.7988, + "step": 12964 + }, + { + "epoch": 3.9794352363413137, + "grad_norm": 0.2927285134792328, + "learning_rate": 6.851136599080885e-05, + "loss": 1.8346, + "step": 12965 + }, + { + "epoch": 3.979742173112339, + "grad_norm": 0.2593212425708771, + "learning_rate": 6.850674854156305e-05, + "loss": 1.7368, + "step": 12966 + }, + { + "epoch": 3.980049109883364, + "grad_norm": 0.3013291656970978, + "learning_rate": 6.850213090942275e-05, + "loss": 1.7911, + "step": 12967 + }, + { + "epoch": 3.9803560466543892, + "grad_norm": 0.3420047163963318, + "learning_rate": 6.849751309443352e-05, + "loss": 1.7899, + "step": 12968 + }, + { + "epoch": 3.9806629834254146, + "grad_norm": 0.2901746928691864, + "learning_rate": 6.849289509664105e-05, + "loss": 1.8244, + "step": 12969 + }, + { + "epoch": 3.9809699201964395, + "grad_norm": 0.2389298677444458, + "learning_rate": 6.848827691609093e-05, + "loss": 1.7116, + "step": 12970 + }, + { + 
"epoch": 3.981276856967465, + "grad_norm": 0.3153960704803467, + "learning_rate": 6.848365855282882e-05, + "loss": 1.7665, + "step": 12971 + }, + { + "epoch": 3.9815837937384897, + "grad_norm": 0.3162175118923187, + "learning_rate": 6.847904000690036e-05, + "loss": 1.7722, + "step": 12972 + }, + { + "epoch": 3.981890730509515, + "grad_norm": 0.27458643913269043, + "learning_rate": 6.847442127835122e-05, + "loss": 1.8095, + "step": 12973 + }, + { + "epoch": 3.9821976672805404, + "grad_norm": 0.22330710291862488, + "learning_rate": 6.846980236722699e-05, + "loss": 1.7179, + "step": 12974 + }, + { + "epoch": 3.9825046040515653, + "grad_norm": 0.2940923869609833, + "learning_rate": 6.846518327357339e-05, + "loss": 1.7363, + "step": 12975 + }, + { + "epoch": 3.9828115408225906, + "grad_norm": 0.26479849219322205, + "learning_rate": 6.846056399743599e-05, + "loss": 1.7788, + "step": 12976 + }, + { + "epoch": 3.9831184775936155, + "grad_norm": 0.24145057797431946, + "learning_rate": 6.845594453886048e-05, + "loss": 1.7825, + "step": 12977 + }, + { + "epoch": 3.983425414364641, + "grad_norm": 0.2795869708061218, + "learning_rate": 6.845132489789252e-05, + "loss": 1.7705, + "step": 12978 + }, + { + "epoch": 3.983732351135666, + "grad_norm": 0.3117202818393707, + "learning_rate": 6.844670507457776e-05, + "loss": 1.8183, + "step": 12979 + }, + { + "epoch": 3.9840392879066915, + "grad_norm": 0.2666899263858795, + "learning_rate": 6.844208506896184e-05, + "loss": 1.7434, + "step": 12980 + }, + { + "epoch": 3.9843462246777164, + "grad_norm": 0.24682332575321198, + "learning_rate": 6.843746488109042e-05, + "loss": 1.751, + "step": 12981 + }, + { + "epoch": 3.9846531614487417, + "grad_norm": 0.2558208703994751, + "learning_rate": 6.843284451100916e-05, + "loss": 1.7983, + "step": 12982 + }, + { + "epoch": 3.9849600982197666, + "grad_norm": 0.4236481189727783, + "learning_rate": 6.842822395876374e-05, + "loss": 1.8584, + "step": 12983 + }, + { + "epoch": 3.985267034990792, + 
"grad_norm": 0.4931485950946808, + "learning_rate": 6.84236032243998e-05, + "loss": 1.7617, + "step": 12984 + }, + { + "epoch": 3.9855739717618173, + "grad_norm": 0.37793654203414917, + "learning_rate": 6.841898230796302e-05, + "loss": 1.7411, + "step": 12985 + }, + { + "epoch": 3.985880908532842, + "grad_norm": 0.2093842774629593, + "learning_rate": 6.841436120949906e-05, + "loss": 1.772, + "step": 12986 + }, + { + "epoch": 3.9861878453038675, + "grad_norm": 0.4065552055835724, + "learning_rate": 6.840973992905359e-05, + "loss": 1.7675, + "step": 12987 + }, + { + "epoch": 3.9864947820748924, + "grad_norm": 0.5334183573722839, + "learning_rate": 6.840511846667228e-05, + "loss": 1.7872, + "step": 12988 + }, + { + "epoch": 3.9868017188459177, + "grad_norm": 0.378974974155426, + "learning_rate": 6.84004968224008e-05, + "loss": 1.8288, + "step": 12989 + }, + { + "epoch": 3.987108655616943, + "grad_norm": 0.22518309950828552, + "learning_rate": 6.839587499628483e-05, + "loss": 1.7715, + "step": 12990 + }, + { + "epoch": 3.987415592387968, + "grad_norm": 0.4270850718021393, + "learning_rate": 6.839125298837003e-05, + "loss": 1.7797, + "step": 12991 + }, + { + "epoch": 3.9877225291589933, + "grad_norm": 0.4629896879196167, + "learning_rate": 6.838663079870211e-05, + "loss": 1.7936, + "step": 12992 + }, + { + "epoch": 3.988029465930018, + "grad_norm": 0.29273948073387146, + "learning_rate": 6.838200842732672e-05, + "loss": 1.8264, + "step": 12993 + }, + { + "epoch": 3.9883364027010435, + "grad_norm": 0.31575852632522583, + "learning_rate": 6.837738587428954e-05, + "loss": 1.8043, + "step": 12994 + }, + { + "epoch": 3.988643339472069, + "grad_norm": 0.40602433681488037, + "learning_rate": 6.837276313963627e-05, + "loss": 1.7409, + "step": 12995 + }, + { + "epoch": 3.988950276243094, + "grad_norm": 0.23413142561912537, + "learning_rate": 6.836814022341259e-05, + "loss": 1.8585, + "step": 12996 + }, + { + "epoch": 3.989257213014119, + "grad_norm": 0.3518814444541931, + 
"learning_rate": 6.836351712566416e-05, + "loss": 1.7768, + "step": 12997 + }, + { + "epoch": 3.9895641497851444, + "grad_norm": 0.3811505436897278, + "learning_rate": 6.83588938464367e-05, + "loss": 1.7738, + "step": 12998 + }, + { + "epoch": 3.9898710865561693, + "grad_norm": 0.2516780197620392, + "learning_rate": 6.835427038577589e-05, + "loss": 1.7351, + "step": 12999 + }, + { + "epoch": 3.9901780233271946, + "grad_norm": 0.23704510927200317, + "learning_rate": 6.834964674372744e-05, + "loss": 1.7907, + "step": 13000 + }, + { + "epoch": 3.99048496009822, + "grad_norm": 0.2890201807022095, + "learning_rate": 6.8345022920337e-05, + "loss": 1.9546, + "step": 13001 + }, + { + "epoch": 3.990791896869245, + "grad_norm": 0.2678101360797882, + "learning_rate": 6.834039891565031e-05, + "loss": 1.7338, + "step": 13002 + }, + { + "epoch": 3.99109883364027, + "grad_norm": 0.31726256012916565, + "learning_rate": 6.833577472971304e-05, + "loss": 1.8464, + "step": 13003 + }, + { + "epoch": 3.991405770411295, + "grad_norm": 0.28112682700157166, + "learning_rate": 6.83311503625709e-05, + "loss": 1.7427, + "step": 13004 + }, + { + "epoch": 3.9917127071823204, + "grad_norm": 0.2651563584804535, + "learning_rate": 6.832652581426958e-05, + "loss": 1.8117, + "step": 13005 + }, + { + "epoch": 3.9920196439533457, + "grad_norm": 0.3095388114452362, + "learning_rate": 6.83219010848548e-05, + "loss": 1.8286, + "step": 13006 + }, + { + "epoch": 3.9923265807243706, + "grad_norm": 0.24704942107200623, + "learning_rate": 6.831727617437225e-05, + "loss": 1.77, + "step": 13007 + }, + { + "epoch": 3.992633517495396, + "grad_norm": 0.24868519604206085, + "learning_rate": 6.831265108286764e-05, + "loss": 1.8129, + "step": 13008 + }, + { + "epoch": 3.992940454266421, + "grad_norm": 0.26511049270629883, + "learning_rate": 6.830802581038669e-05, + "loss": 1.7539, + "step": 13009 + }, + { + "epoch": 3.993247391037446, + "grad_norm": 0.2823421061038971, + "learning_rate": 6.830340035697508e-05, + 
"loss": 1.8068, + "step": 13010 + }, + { + "epoch": 3.9935543278084715, + "grad_norm": 0.28526121377944946, + "learning_rate": 6.829877472267856e-05, + "loss": 1.764, + "step": 13011 + }, + { + "epoch": 3.993861264579497, + "grad_norm": 0.2576456069946289, + "learning_rate": 6.829414890754281e-05, + "loss": 1.728, + "step": 13012 + }, + { + "epoch": 3.9941682013505218, + "grad_norm": 0.27154842019081116, + "learning_rate": 6.828952291161356e-05, + "loss": 1.797, + "step": 13013 + }, + { + "epoch": 3.994475138121547, + "grad_norm": 0.3129710555076599, + "learning_rate": 6.828489673493652e-05, + "loss": 1.769, + "step": 13014 + }, + { + "epoch": 3.994782074892572, + "grad_norm": 0.40118902921676636, + "learning_rate": 6.828027037755742e-05, + "loss": 1.8029, + "step": 13015 + }, + { + "epoch": 3.9950890116635973, + "grad_norm": 0.33228442072868347, + "learning_rate": 6.827564383952197e-05, + "loss": 1.7295, + "step": 13016 + }, + { + "epoch": 3.9953959484346226, + "grad_norm": 0.218771830201149, + "learning_rate": 6.827101712087591e-05, + "loss": 1.7693, + "step": 13017 + }, + { + "epoch": 3.9957028852056475, + "grad_norm": 0.31354373693466187, + "learning_rate": 6.826639022166492e-05, + "loss": 1.743, + "step": 13018 + }, + { + "epoch": 3.996009821976673, + "grad_norm": 0.3584701418876648, + "learning_rate": 6.826176314193478e-05, + "loss": 1.7597, + "step": 13019 + }, + { + "epoch": 3.9963167587476978, + "grad_norm": 0.2692064344882965, + "learning_rate": 6.82571358817312e-05, + "loss": 1.7871, + "step": 13020 + }, + { + "epoch": 3.996623695518723, + "grad_norm": 0.3064020276069641, + "learning_rate": 6.825250844109987e-05, + "loss": 1.7858, + "step": 13021 + }, + { + "epoch": 3.9969306322897484, + "grad_norm": 0.29913413524627686, + "learning_rate": 6.824788082008657e-05, + "loss": 1.7773, + "step": 13022 + }, + { + "epoch": 3.9972375690607733, + "grad_norm": 0.2682165801525116, + "learning_rate": 6.824325301873703e-05, + "loss": 1.8321, + "step": 13023 + }, + { + 
"epoch": 3.9975445058317987, + "grad_norm": 0.3274376690387726, + "learning_rate": 6.823862503709694e-05, + "loss": 1.8514, + "step": 13024 + }, + { + "epoch": 3.9978514426028235, + "grad_norm": 0.29828041791915894, + "learning_rate": 6.823399687521211e-05, + "loss": 1.7923, + "step": 13025 + }, + { + "epoch": 3.998158379373849, + "grad_norm": 0.22339288890361786, + "learning_rate": 6.82293685331282e-05, + "loss": 1.756, + "step": 13026 + }, + { + "epoch": 3.998465316144874, + "grad_norm": 0.2254658192396164, + "learning_rate": 6.8224740010891e-05, + "loss": 1.7392, + "step": 13027 + }, + { + "epoch": 3.9987722529158995, + "grad_norm": 0.24932752549648285, + "learning_rate": 6.822011130854624e-05, + "loss": 1.7538, + "step": 13028 + }, + { + "epoch": 3.9990791896869244, + "grad_norm": 0.21429690718650818, + "learning_rate": 6.821548242613966e-05, + "loss": 1.7746, + "step": 13029 + }, + { + "epoch": 3.9993861264579498, + "grad_norm": 0.25503116846084595, + "learning_rate": 6.8210853363717e-05, + "loss": 1.814, + "step": 13030 + }, + { + "epoch": 3.9996930632289747, + "grad_norm": 0.23168155550956726, + "learning_rate": 6.820622412132402e-05, + "loss": 1.769, + "step": 13031 + }, + { + "epoch": 4.0, + "grad_norm": 0.2252223789691925, + "learning_rate": 6.820159469900645e-05, + "loss": 1.7782, + "step": 13032 + }, + { + "epoch": 4.000306936771025, + "grad_norm": 0.1996588408946991, + "learning_rate": 6.819696509681007e-05, + "loss": 1.6839, + "step": 13033 + }, + { + "epoch": 4.000613873542051, + "grad_norm": 0.22297053039073944, + "learning_rate": 6.81923353147806e-05, + "loss": 1.7767, + "step": 13034 + }, + { + "epoch": 4.000920810313075, + "grad_norm": 0.25867611169815063, + "learning_rate": 6.818770535296381e-05, + "loss": 1.8623, + "step": 13035 + }, + { + "epoch": 4.0012277470841005, + "grad_norm": 0.2173648178577423, + "learning_rate": 6.818307521140547e-05, + "loss": 1.8034, + "step": 13036 + }, + { + "epoch": 4.001534683855126, + "grad_norm": 
0.23634609580039978, + "learning_rate": 6.81784448901513e-05, + "loss": 1.7503, + "step": 13037 + }, + { + "epoch": 4.001841620626151, + "grad_norm": 0.2626810073852539, + "learning_rate": 6.81738143892471e-05, + "loss": 1.8116, + "step": 13038 + }, + { + "epoch": 4.0021485573971765, + "grad_norm": 0.27888983488082886, + "learning_rate": 6.816918370873861e-05, + "loss": 1.8032, + "step": 13039 + }, + { + "epoch": 4.002455494168202, + "grad_norm": 0.275038480758667, + "learning_rate": 6.816455284867162e-05, + "loss": 1.7445, + "step": 13040 + }, + { + "epoch": 4.002762430939226, + "grad_norm": 0.3475828170776367, + "learning_rate": 6.815992180909184e-05, + "loss": 1.7404, + "step": 13041 + }, + { + "epoch": 4.003069367710252, + "grad_norm": 0.27314287424087524, + "learning_rate": 6.815529059004507e-05, + "loss": 1.8333, + "step": 13042 + }, + { + "epoch": 4.003376304481277, + "grad_norm": 0.34846973419189453, + "learning_rate": 6.815065919157709e-05, + "loss": 1.7921, + "step": 13043 + }, + { + "epoch": 4.003683241252302, + "grad_norm": 0.4191788136959076, + "learning_rate": 6.814602761373365e-05, + "loss": 1.8018, + "step": 13044 + }, + { + "epoch": 4.003990178023328, + "grad_norm": 0.2655608057975769, + "learning_rate": 6.814139585656055e-05, + "loss": 1.7638, + "step": 13045 + }, + { + "epoch": 4.004297114794352, + "grad_norm": 0.25938618183135986, + "learning_rate": 6.813676392010353e-05, + "loss": 1.794, + "step": 13046 + }, + { + "epoch": 4.004604051565377, + "grad_norm": 0.3464813828468323, + "learning_rate": 6.813213180440837e-05, + "loss": 1.8662, + "step": 13047 + }, + { + "epoch": 4.004910988336403, + "grad_norm": 0.30185338854789734, + "learning_rate": 6.812749950952087e-05, + "loss": 1.8029, + "step": 13048 + }, + { + "epoch": 4.005217925107428, + "grad_norm": 0.23291908204555511, + "learning_rate": 6.812286703548678e-05, + "loss": 1.7365, + "step": 13049 + }, + { + "epoch": 4.005524861878453, + "grad_norm": 0.3542841374874115, + "learning_rate": 
6.811823438235189e-05, + "loss": 1.8674, + "step": 13050 + }, + { + "epoch": 4.005831798649478, + "grad_norm": 0.2914685606956482, + "learning_rate": 6.811360155016202e-05, + "loss": 1.8306, + "step": 13051 + }, + { + "epoch": 4.006138735420503, + "grad_norm": 0.24888737499713898, + "learning_rate": 6.810896853896289e-05, + "loss": 1.7767, + "step": 13052 + }, + { + "epoch": 4.0064456721915285, + "grad_norm": 0.2977537512779236, + "learning_rate": 6.810433534880033e-05, + "loss": 1.8227, + "step": 13053 + }, + { + "epoch": 4.006752608962554, + "grad_norm": 0.3367510735988617, + "learning_rate": 6.809970197972013e-05, + "loss": 1.734, + "step": 13054 + }, + { + "epoch": 4.007059545733579, + "grad_norm": 0.28098800778388977, + "learning_rate": 6.809506843176806e-05, + "loss": 1.7032, + "step": 13055 + }, + { + "epoch": 4.0073664825046045, + "grad_norm": 0.24016784131526947, + "learning_rate": 6.809043470498991e-05, + "loss": 1.7863, + "step": 13056 + }, + { + "epoch": 4.007673419275629, + "grad_norm": 0.2883957624435425, + "learning_rate": 6.808580079943148e-05, + "loss": 1.7342, + "step": 13057 + }, + { + "epoch": 4.007980356046654, + "grad_norm": 0.3069116473197937, + "learning_rate": 6.808116671513856e-05, + "loss": 1.8544, + "step": 13058 + }, + { + "epoch": 4.00828729281768, + "grad_norm": 0.24113236367702484, + "learning_rate": 6.807653245215697e-05, + "loss": 1.7692, + "step": 13059 + }, + { + "epoch": 4.008594229588705, + "grad_norm": 0.2651619017124176, + "learning_rate": 6.807189801053249e-05, + "loss": 1.8096, + "step": 13060 + }, + { + "epoch": 4.00890116635973, + "grad_norm": 0.2636481523513794, + "learning_rate": 6.806726339031092e-05, + "loss": 1.8062, + "step": 13061 + }, + { + "epoch": 4.009208103130755, + "grad_norm": 0.22691169381141663, + "learning_rate": 6.806262859153807e-05, + "loss": 1.7001, + "step": 13062 + }, + { + "epoch": 4.00951503990178, + "grad_norm": 0.23288170993328094, + "learning_rate": 6.805799361425972e-05, + "loss": 1.7508, + 
"step": 13063 + }, + { + "epoch": 4.009821976672805, + "grad_norm": 0.243272602558136, + "learning_rate": 6.80533584585217e-05, + "loss": 1.7797, + "step": 13064 + }, + { + "epoch": 4.010128913443831, + "grad_norm": 0.24594646692276, + "learning_rate": 6.80487231243698e-05, + "loss": 1.7894, + "step": 13065 + }, + { + "epoch": 4.010435850214856, + "grad_norm": 0.21726086735725403, + "learning_rate": 6.804408761184986e-05, + "loss": 1.7472, + "step": 13066 + }, + { + "epoch": 4.0107427869858805, + "grad_norm": 0.2262321561574936, + "learning_rate": 6.803945192100767e-05, + "loss": 1.7563, + "step": 13067 + }, + { + "epoch": 4.011049723756906, + "grad_norm": 0.2449522763490677, + "learning_rate": 6.803481605188903e-05, + "loss": 1.7282, + "step": 13068 + }, + { + "epoch": 4.011356660527931, + "grad_norm": 0.2281760573387146, + "learning_rate": 6.803018000453975e-05, + "loss": 1.8191, + "step": 13069 + }, + { + "epoch": 4.0116635972989565, + "grad_norm": 0.3039850890636444, + "learning_rate": 6.80255437790057e-05, + "loss": 1.8258, + "step": 13070 + }, + { + "epoch": 4.011970534069982, + "grad_norm": 0.3978467881679535, + "learning_rate": 6.802090737533264e-05, + "loss": 1.7338, + "step": 13071 + }, + { + "epoch": 4.012277470841007, + "grad_norm": 0.29175812005996704, + "learning_rate": 6.801627079356641e-05, + "loss": 1.7754, + "step": 13072 + }, + { + "epoch": 4.012584407612032, + "grad_norm": 0.24228449165821075, + "learning_rate": 6.801163403375285e-05, + "loss": 1.7624, + "step": 13073 + }, + { + "epoch": 4.012891344383057, + "grad_norm": 0.34527531266212463, + "learning_rate": 6.800699709593776e-05, + "loss": 1.87, + "step": 13074 + }, + { + "epoch": 4.013198281154082, + "grad_norm": 0.1995161920785904, + "learning_rate": 6.800235998016696e-05, + "loss": 1.7253, + "step": 13075 + }, + { + "epoch": 4.013505217925108, + "grad_norm": 0.3509151339530945, + "learning_rate": 6.799772268648628e-05, + "loss": 1.8013, + "step": 13076 + }, + { + "epoch": 
4.013812154696133, + "grad_norm": 0.38569679856300354, + "learning_rate": 6.799308521494156e-05, + "loss": 1.7761, + "step": 13077 + }, + { + "epoch": 4.014119091467157, + "grad_norm": 0.2636256814002991, + "learning_rate": 6.798844756557865e-05, + "loss": 1.8101, + "step": 13078 + }, + { + "epoch": 4.014426028238183, + "grad_norm": 0.2570696473121643, + "learning_rate": 6.798380973844335e-05, + "loss": 1.7561, + "step": 13079 + }, + { + "epoch": 4.014732965009208, + "grad_norm": 0.38540002703666687, + "learning_rate": 6.797917173358148e-05, + "loss": 1.7893, + "step": 13080 + }, + { + "epoch": 4.015039901780233, + "grad_norm": 0.2974525988101959, + "learning_rate": 6.79745335510389e-05, + "loss": 1.8331, + "step": 13081 + }, + { + "epoch": 4.015346838551259, + "grad_norm": 0.2563362419605255, + "learning_rate": 6.796989519086146e-05, + "loss": 1.7784, + "step": 13082 + }, + { + "epoch": 4.015653775322283, + "grad_norm": 0.37037795782089233, + "learning_rate": 6.7965256653095e-05, + "loss": 1.7947, + "step": 13083 + }, + { + "epoch": 4.0159607120933085, + "grad_norm": 0.4145336449146271, + "learning_rate": 6.796061793778531e-05, + "loss": 1.7633, + "step": 13084 + }, + { + "epoch": 4.016267648864334, + "grad_norm": 0.32278406620025635, + "learning_rate": 6.795597904497828e-05, + "loss": 1.7827, + "step": 13085 + }, + { + "epoch": 4.016574585635359, + "grad_norm": 0.26466837525367737, + "learning_rate": 6.795133997471974e-05, + "loss": 1.7441, + "step": 13086 + }, + { + "epoch": 4.0168815224063845, + "grad_norm": 0.3212043344974518, + "learning_rate": 6.794670072705553e-05, + "loss": 1.7602, + "step": 13087 + }, + { + "epoch": 4.01718845917741, + "grad_norm": 0.3054736852645874, + "learning_rate": 6.79420613020315e-05, + "loss": 1.7417, + "step": 13088 + }, + { + "epoch": 4.017495395948434, + "grad_norm": 0.22281476855278015, + "learning_rate": 6.793742169969351e-05, + "loss": 1.7675, + "step": 13089 + }, + { + "epoch": 4.01780233271946, + "grad_norm": 
0.32630839943885803, + "learning_rate": 6.793278192008742e-05, + "loss": 1.8409, + "step": 13090 + }, + { + "epoch": 4.018109269490485, + "grad_norm": 0.2658778429031372, + "learning_rate": 6.792814196325905e-05, + "loss": 1.7718, + "step": 13091 + }, + { + "epoch": 4.01841620626151, + "grad_norm": 0.24016901850700378, + "learning_rate": 6.792350182925429e-05, + "loss": 1.8393, + "step": 13092 + }, + { + "epoch": 4.018723143032536, + "grad_norm": 0.2882223427295685, + "learning_rate": 6.791886151811897e-05, + "loss": 1.7497, + "step": 13093 + }, + { + "epoch": 4.01903007980356, + "grad_norm": 0.24340751767158508, + "learning_rate": 6.791422102989895e-05, + "loss": 1.72, + "step": 13094 + }, + { + "epoch": 4.019337016574585, + "grad_norm": 0.235665962100029, + "learning_rate": 6.79095803646401e-05, + "loss": 1.7269, + "step": 13095 + }, + { + "epoch": 4.019643953345611, + "grad_norm": 0.32772955298423767, + "learning_rate": 6.79049395223883e-05, + "loss": 1.7916, + "step": 13096 + }, + { + "epoch": 4.019950890116636, + "grad_norm": 0.3189625144004822, + "learning_rate": 6.790029850318938e-05, + "loss": 1.7571, + "step": 13097 + }, + { + "epoch": 4.020257826887661, + "grad_norm": 0.2211185097694397, + "learning_rate": 6.789565730708921e-05, + "loss": 1.793, + "step": 13098 + }, + { + "epoch": 4.020564763658686, + "grad_norm": 0.2840392291545868, + "learning_rate": 6.789101593413367e-05, + "loss": 1.7434, + "step": 13099 + }, + { + "epoch": 4.020871700429711, + "grad_norm": 0.27857357263565063, + "learning_rate": 6.788637438436863e-05, + "loss": 1.742, + "step": 13100 + }, + { + "epoch": 4.0211786372007365, + "grad_norm": 0.314628005027771, + "learning_rate": 6.788173265783996e-05, + "loss": 1.7881, + "step": 13101 + }, + { + "epoch": 4.021485573971762, + "grad_norm": 0.2994774580001831, + "learning_rate": 6.787709075459352e-05, + "loss": 1.7741, + "step": 13102 + }, + { + "epoch": 4.021792510742787, + "grad_norm": 0.3256312310695648, + "learning_rate": 
6.787244867467519e-05, + "loss": 1.7758, + "step": 13103 + }, + { + "epoch": 4.0220994475138125, + "grad_norm": 0.2332412451505661, + "learning_rate": 6.786780641813083e-05, + "loss": 1.7654, + "step": 13104 + }, + { + "epoch": 4.022406384284837, + "grad_norm": 0.23226258158683777, + "learning_rate": 6.786316398500636e-05, + "loss": 1.7605, + "step": 13105 + }, + { + "epoch": 4.022713321055862, + "grad_norm": 0.24631965160369873, + "learning_rate": 6.785852137534763e-05, + "loss": 1.7469, + "step": 13106 + }, + { + "epoch": 4.023020257826888, + "grad_norm": 0.1969226449728012, + "learning_rate": 6.785387858920051e-05, + "loss": 1.8151, + "step": 13107 + }, + { + "epoch": 4.023327194597913, + "grad_norm": 0.22769485414028168, + "learning_rate": 6.784923562661091e-05, + "loss": 1.7024, + "step": 13108 + }, + { + "epoch": 4.023634131368938, + "grad_norm": 0.2174670249223709, + "learning_rate": 6.78445924876247e-05, + "loss": 1.8094, + "step": 13109 + }, + { + "epoch": 4.023941068139963, + "grad_norm": 0.2606858015060425, + "learning_rate": 6.783994917228775e-05, + "loss": 1.8043, + "step": 13110 + }, + { + "epoch": 4.024248004910988, + "grad_norm": 0.24721349775791168, + "learning_rate": 6.783530568064599e-05, + "loss": 1.842, + "step": 13111 + }, + { + "epoch": 4.024554941682013, + "grad_norm": 0.2353603094816208, + "learning_rate": 6.783066201274529e-05, + "loss": 1.76, + "step": 13112 + }, + { + "epoch": 4.024861878453039, + "grad_norm": 0.22285830974578857, + "learning_rate": 6.782601816863153e-05, + "loss": 1.8014, + "step": 13113 + }, + { + "epoch": 4.025168815224064, + "grad_norm": 0.2482440173625946, + "learning_rate": 6.782137414835061e-05, + "loss": 1.7552, + "step": 13114 + }, + { + "epoch": 4.0254757519950894, + "grad_norm": 0.19926191866397858, + "learning_rate": 6.781672995194842e-05, + "loss": 1.7549, + "step": 13115 + }, + { + "epoch": 4.025782688766114, + "grad_norm": 0.2342877984046936, + "learning_rate": 6.781208557947086e-05, + "loss": 1.8622, + 
"step": 13116 + }, + { + "epoch": 4.026089625537139, + "grad_norm": 0.24096547067165375, + "learning_rate": 6.780744103096382e-05, + "loss": 1.7795, + "step": 13117 + }, + { + "epoch": 4.026396562308165, + "grad_norm": 0.23714657127857208, + "learning_rate": 6.780279630647322e-05, + "loss": 1.799, + "step": 13118 + }, + { + "epoch": 4.02670349907919, + "grad_norm": 0.28252026438713074, + "learning_rate": 6.779815140604496e-05, + "loss": 1.7573, + "step": 13119 + }, + { + "epoch": 4.027010435850215, + "grad_norm": 0.28028404712677, + "learning_rate": 6.779350632972493e-05, + "loss": 1.8103, + "step": 13120 + }, + { + "epoch": 4.02731737262124, + "grad_norm": 0.21088312566280365, + "learning_rate": 6.778886107755904e-05, + "loss": 1.7169, + "step": 13121 + }, + { + "epoch": 4.027624309392265, + "grad_norm": 0.22282038629055023, + "learning_rate": 6.77842156495932e-05, + "loss": 1.7206, + "step": 13122 + }, + { + "epoch": 4.02793124616329, + "grad_norm": 0.3281327784061432, + "learning_rate": 6.777957004587331e-05, + "loss": 1.8664, + "step": 13123 + }, + { + "epoch": 4.028238182934316, + "grad_norm": 0.29496827721595764, + "learning_rate": 6.77749242664453e-05, + "loss": 1.7532, + "step": 13124 + }, + { + "epoch": 4.028545119705341, + "grad_norm": 0.25299328565597534, + "learning_rate": 6.777027831135508e-05, + "loss": 1.7836, + "step": 13125 + }, + { + "epoch": 4.0288520564763655, + "grad_norm": 0.3000280559062958, + "learning_rate": 6.776563218064854e-05, + "loss": 1.8079, + "step": 13126 + }, + { + "epoch": 4.029158993247391, + "grad_norm": 0.3613673448562622, + "learning_rate": 6.77609858743716e-05, + "loss": 1.7931, + "step": 13127 + }, + { + "epoch": 4.029465930018416, + "grad_norm": 0.25613468885421753, + "learning_rate": 6.77563393925702e-05, + "loss": 1.7522, + "step": 13128 + }, + { + "epoch": 4.0297728667894415, + "grad_norm": 0.24391578137874603, + "learning_rate": 6.775169273529026e-05, + "loss": 1.818, + "step": 13129 + }, + { + "epoch": 
4.030079803560467, + "grad_norm": 0.2806173264980316, + "learning_rate": 6.774704590257768e-05, + "loss": 1.7349, + "step": 13130 + }, + { + "epoch": 4.030386740331492, + "grad_norm": 0.22214172780513763, + "learning_rate": 6.774239889447838e-05, + "loss": 1.759, + "step": 13131 + }, + { + "epoch": 4.030693677102517, + "grad_norm": 0.27285513281822205, + "learning_rate": 6.773775171103828e-05, + "loss": 1.742, + "step": 13132 + }, + { + "epoch": 4.031000613873542, + "grad_norm": 0.22302402555942535, + "learning_rate": 6.773310435230334e-05, + "loss": 1.7277, + "step": 13133 + }, + { + "epoch": 4.031307550644567, + "grad_norm": 0.2350187450647354, + "learning_rate": 6.772845681831947e-05, + "loss": 1.8648, + "step": 13134 + }, + { + "epoch": 4.031614487415593, + "grad_norm": 0.2665547728538513, + "learning_rate": 6.772380910913261e-05, + "loss": 1.776, + "step": 13135 + }, + { + "epoch": 4.031921424186618, + "grad_norm": 0.30652403831481934, + "learning_rate": 6.771916122478867e-05, + "loss": 1.7884, + "step": 13136 + }, + { + "epoch": 4.032228360957642, + "grad_norm": 0.29372814297676086, + "learning_rate": 6.771451316533359e-05, + "loss": 1.8203, + "step": 13137 + }, + { + "epoch": 4.032535297728668, + "grad_norm": 0.2244873046875, + "learning_rate": 6.770986493081329e-05, + "loss": 1.7869, + "step": 13138 + }, + { + "epoch": 4.032842234499693, + "grad_norm": 0.25075265765190125, + "learning_rate": 6.770521652127375e-05, + "loss": 1.772, + "step": 13139 + }, + { + "epoch": 4.033149171270718, + "grad_norm": 0.28118211030960083, + "learning_rate": 6.770056793676087e-05, + "loss": 1.7922, + "step": 13140 + }, + { + "epoch": 4.033456108041744, + "grad_norm": 0.25199100375175476, + "learning_rate": 6.769591917732062e-05, + "loss": 1.7526, + "step": 13141 + }, + { + "epoch": 4.033763044812768, + "grad_norm": 0.2920379638671875, + "learning_rate": 6.769127024299892e-05, + "loss": 1.8365, + "step": 13142 + }, + { + "epoch": 4.0340699815837935, + "grad_norm": 
0.23018018901348114, + "learning_rate": 6.768662113384171e-05, + "loss": 1.7411, + "step": 13143 + }, + { + "epoch": 4.034376918354819, + "grad_norm": 0.23253841698169708, + "learning_rate": 6.768197184989494e-05, + "loss": 1.7921, + "step": 13144 + }, + { + "epoch": 4.034683855125844, + "grad_norm": 0.22618864476680756, + "learning_rate": 6.767732239120456e-05, + "loss": 1.7421, + "step": 13145 + }, + { + "epoch": 4.0349907918968695, + "grad_norm": 0.24552187323570251, + "learning_rate": 6.767267275781655e-05, + "loss": 1.7299, + "step": 13146 + }, + { + "epoch": 4.035297728667895, + "grad_norm": 0.22562766075134277, + "learning_rate": 6.76680229497768e-05, + "loss": 1.766, + "step": 13147 + }, + { + "epoch": 4.035604665438919, + "grad_norm": 0.28718629479408264, + "learning_rate": 6.76633729671313e-05, + "loss": 1.7366, + "step": 13148 + }, + { + "epoch": 4.035911602209945, + "grad_norm": 0.38769885897636414, + "learning_rate": 6.765872280992598e-05, + "loss": 1.8244, + "step": 13149 + }, + { + "epoch": 4.03621853898097, + "grad_norm": 0.4232725501060486, + "learning_rate": 6.765407247820683e-05, + "loss": 1.8244, + "step": 13150 + }, + { + "epoch": 4.036525475751995, + "grad_norm": 0.2771088778972626, + "learning_rate": 6.764942197201977e-05, + "loss": 1.7863, + "step": 13151 + }, + { + "epoch": 4.036832412523021, + "grad_norm": 0.2917862832546234, + "learning_rate": 6.76447712914108e-05, + "loss": 1.791, + "step": 13152 + }, + { + "epoch": 4.037139349294045, + "grad_norm": 0.37355467677116394, + "learning_rate": 6.764012043642584e-05, + "loss": 1.74, + "step": 13153 + }, + { + "epoch": 4.03744628606507, + "grad_norm": 0.35664018988609314, + "learning_rate": 6.763546940711089e-05, + "loss": 1.7734, + "step": 13154 + }, + { + "epoch": 4.037753222836096, + "grad_norm": 0.2335754930973053, + "learning_rate": 6.763081820351188e-05, + "loss": 1.7765, + "step": 13155 + }, + { + "epoch": 4.038060159607121, + "grad_norm": 0.2825562357902527, + "learning_rate": 
6.762616682567478e-05, + "loss": 1.7867, + "step": 13156 + }, + { + "epoch": 4.038367096378146, + "grad_norm": 0.3103202283382416, + "learning_rate": 6.762151527364559e-05, + "loss": 1.7331, + "step": 13157 + }, + { + "epoch": 4.038674033149171, + "grad_norm": 0.2897353172302246, + "learning_rate": 6.761686354747025e-05, + "loss": 1.7638, + "step": 13158 + }, + { + "epoch": 4.038980969920196, + "grad_norm": 0.21260851621627808, + "learning_rate": 6.761221164719474e-05, + "loss": 1.7302, + "step": 13159 + }, + { + "epoch": 4.0392879066912215, + "grad_norm": 0.2878021001815796, + "learning_rate": 6.760755957286503e-05, + "loss": 1.7368, + "step": 13160 + }, + { + "epoch": 4.039594843462247, + "grad_norm": 0.2785978317260742, + "learning_rate": 6.76029073245271e-05, + "loss": 1.7258, + "step": 13161 + }, + { + "epoch": 4.039901780233272, + "grad_norm": 0.1963953971862793, + "learning_rate": 6.759825490222692e-05, + "loss": 1.755, + "step": 13162 + }, + { + "epoch": 4.0402087170042975, + "grad_norm": 0.26776790618896484, + "learning_rate": 6.759360230601047e-05, + "loss": 1.7676, + "step": 13163 + }, + { + "epoch": 4.040515653775322, + "grad_norm": 0.2751332223415375, + "learning_rate": 6.758894953592373e-05, + "loss": 1.7313, + "step": 13164 + }, + { + "epoch": 4.040822590546347, + "grad_norm": 0.2339213341474533, + "learning_rate": 6.758429659201269e-05, + "loss": 1.714, + "step": 13165 + }, + { + "epoch": 4.041129527317373, + "grad_norm": 0.2624664008617401, + "learning_rate": 6.75796434743233e-05, + "loss": 1.8296, + "step": 13166 + }, + { + "epoch": 4.041436464088398, + "grad_norm": 0.40156883001327515, + "learning_rate": 6.757499018290159e-05, + "loss": 1.8228, + "step": 13167 + }, + { + "epoch": 4.041743400859423, + "grad_norm": 0.32976576685905457, + "learning_rate": 6.757033671779352e-05, + "loss": 1.7403, + "step": 13168 + }, + { + "epoch": 4.042050337630448, + "grad_norm": 0.2343887835741043, + "learning_rate": 6.756568307904508e-05, + "loss": 1.7837, + 
"step": 13169 + }, + { + "epoch": 4.042357274401473, + "grad_norm": 0.36174145340919495, + "learning_rate": 6.756102926670227e-05, + "loss": 1.7291, + "step": 13170 + }, + { + "epoch": 4.042664211172498, + "grad_norm": 0.3324793577194214, + "learning_rate": 6.755637528081108e-05, + "loss": 1.7414, + "step": 13171 + }, + { + "epoch": 4.042971147943524, + "grad_norm": 0.21945348381996155, + "learning_rate": 6.75517211214175e-05, + "loss": 1.7762, + "step": 13172 + }, + { + "epoch": 4.043278084714549, + "grad_norm": 0.31069812178611755, + "learning_rate": 6.75470667885675e-05, + "loss": 1.7666, + "step": 13173 + }, + { + "epoch": 4.043585021485574, + "grad_norm": 0.3931153118610382, + "learning_rate": 6.754241228230713e-05, + "loss": 1.7871, + "step": 13174 + }, + { + "epoch": 4.043891958256599, + "grad_norm": 0.25559595227241516, + "learning_rate": 6.753775760268234e-05, + "loss": 1.7916, + "step": 13175 + }, + { + "epoch": 4.044198895027624, + "grad_norm": 0.3686937391757965, + "learning_rate": 6.753310274973917e-05, + "loss": 1.7642, + "step": 13176 + }, + { + "epoch": 4.0445058317986495, + "grad_norm": 0.4793247580528259, + "learning_rate": 6.75284477235236e-05, + "loss": 1.739, + "step": 13177 + }, + { + "epoch": 4.044812768569675, + "grad_norm": 0.36179354786872864, + "learning_rate": 6.752379252408164e-05, + "loss": 1.7993, + "step": 13178 + }, + { + "epoch": 4.0451197053407, + "grad_norm": 0.22559234499931335, + "learning_rate": 6.751913715145926e-05, + "loss": 1.7401, + "step": 13179 + }, + { + "epoch": 4.045426642111725, + "grad_norm": 0.29058873653411865, + "learning_rate": 6.751448160570253e-05, + "loss": 1.8089, + "step": 13180 + }, + { + "epoch": 4.04573357888275, + "grad_norm": 0.3069808781147003, + "learning_rate": 6.750982588685742e-05, + "loss": 1.7587, + "step": 13181 + }, + { + "epoch": 4.046040515653775, + "grad_norm": 0.2292155921459198, + "learning_rate": 6.750516999496994e-05, + "loss": 1.7429, + "step": 13182 + }, + { + "epoch": 
4.046347452424801, + "grad_norm": 0.2520677149295807, + "learning_rate": 6.750051393008612e-05, + "loss": 1.7842, + "step": 13183 + }, + { + "epoch": 4.046654389195826, + "grad_norm": 0.32546502351760864, + "learning_rate": 6.749585769225194e-05, + "loss": 1.8057, + "step": 13184 + }, + { + "epoch": 4.04696132596685, + "grad_norm": 0.27634644508361816, + "learning_rate": 6.749120128151346e-05, + "loss": 1.7708, + "step": 13185 + }, + { + "epoch": 4.047268262737876, + "grad_norm": 0.2546750009059906, + "learning_rate": 6.748654469791668e-05, + "loss": 1.8744, + "step": 13186 + }, + { + "epoch": 4.047575199508901, + "grad_norm": 0.43873605132102966, + "learning_rate": 6.748188794150761e-05, + "loss": 1.8573, + "step": 13187 + }, + { + "epoch": 4.047882136279926, + "grad_norm": 0.45526960492134094, + "learning_rate": 6.747723101233227e-05, + "loss": 1.7761, + "step": 13188 + }, + { + "epoch": 4.048189073050952, + "grad_norm": 0.24995557963848114, + "learning_rate": 6.74725739104367e-05, + "loss": 1.7679, + "step": 13189 + }, + { + "epoch": 4.048496009821977, + "grad_norm": 0.3203068971633911, + "learning_rate": 6.74679166358669e-05, + "loss": 1.7772, + "step": 13190 + }, + { + "epoch": 4.0488029465930016, + "grad_norm": 0.37020671367645264, + "learning_rate": 6.746325918866893e-05, + "loss": 1.8002, + "step": 13191 + }, + { + "epoch": 4.049109883364027, + "grad_norm": 0.2543959319591522, + "learning_rate": 6.745860156888878e-05, + "loss": 1.8057, + "step": 13192 + }, + { + "epoch": 4.049416820135052, + "grad_norm": 0.2566509246826172, + "learning_rate": 6.74539437765725e-05, + "loss": 1.7853, + "step": 13193 + }, + { + "epoch": 4.0497237569060776, + "grad_norm": 0.2545804977416992, + "learning_rate": 6.744928581176612e-05, + "loss": 1.8136, + "step": 13194 + }, + { + "epoch": 4.050030693677103, + "grad_norm": 0.24307197332382202, + "learning_rate": 6.744462767451568e-05, + "loss": 1.7919, + "step": 13195 + }, + { + "epoch": 4.050337630448127, + "grad_norm": 
0.24427616596221924, + "learning_rate": 6.743996936486719e-05, + "loss": 1.8037, + "step": 13196 + }, + { + "epoch": 4.050644567219153, + "grad_norm": 0.2154439389705658, + "learning_rate": 6.743531088286673e-05, + "loss": 1.7088, + "step": 13197 + }, + { + "epoch": 4.050951503990178, + "grad_norm": 0.22251558303833008, + "learning_rate": 6.743065222856027e-05, + "loss": 1.7512, + "step": 13198 + }, + { + "epoch": 4.051258440761203, + "grad_norm": 0.2373272329568863, + "learning_rate": 6.74259934019939e-05, + "loss": 1.8056, + "step": 13199 + }, + { + "epoch": 4.051565377532229, + "grad_norm": 0.23308727145195007, + "learning_rate": 6.742133440321366e-05, + "loss": 1.731, + "step": 13200 + }, + { + "epoch": 4.051872314303253, + "grad_norm": 0.2438805252313614, + "learning_rate": 6.741667523226557e-05, + "loss": 1.7938, + "step": 13201 + }, + { + "epoch": 4.0521792510742785, + "grad_norm": 0.22354702651500702, + "learning_rate": 6.741201588919569e-05, + "loss": 1.762, + "step": 13202 + }, + { + "epoch": 4.052486187845304, + "grad_norm": 0.2505488097667694, + "learning_rate": 6.740735637405006e-05, + "loss": 1.7627, + "step": 13203 + }, + { + "epoch": 4.052793124616329, + "grad_norm": 0.21378709375858307, + "learning_rate": 6.740269668687474e-05, + "loss": 1.7598, + "step": 13204 + }, + { + "epoch": 4.0531000613873545, + "grad_norm": 0.24863660335540771, + "learning_rate": 6.739803682771577e-05, + "loss": 1.7665, + "step": 13205 + }, + { + "epoch": 4.05340699815838, + "grad_norm": 0.3041808605194092, + "learning_rate": 6.739337679661921e-05, + "loss": 1.7909, + "step": 13206 + }, + { + "epoch": 4.053713934929404, + "grad_norm": 0.2745797634124756, + "learning_rate": 6.738871659363109e-05, + "loss": 1.7547, + "step": 13207 + }, + { + "epoch": 4.05402087170043, + "grad_norm": 0.2610073387622833, + "learning_rate": 6.738405621879748e-05, + "loss": 1.7723, + "step": 13208 + }, + { + "epoch": 4.054327808471455, + "grad_norm": 0.22728075087070465, + "learning_rate": 
6.737939567216446e-05, + "loss": 1.7865, + "step": 13209 + }, + { + "epoch": 4.05463474524248, + "grad_norm": 0.2877669930458069, + "learning_rate": 6.737473495377804e-05, + "loss": 1.8352, + "step": 13210 + }, + { + "epoch": 4.054941682013506, + "grad_norm": 0.35316282510757446, + "learning_rate": 6.737007406368432e-05, + "loss": 1.8202, + "step": 13211 + }, + { + "epoch": 4.05524861878453, + "grad_norm": 0.34625691175460815, + "learning_rate": 6.736541300192936e-05, + "loss": 1.8456, + "step": 13212 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.2432134598493576, + "learning_rate": 6.736075176855917e-05, + "loss": 1.8237, + "step": 13213 + }, + { + "epoch": 4.055862492326581, + "grad_norm": 0.27446529269218445, + "learning_rate": 6.735609036361989e-05, + "loss": 1.71, + "step": 13214 + }, + { + "epoch": 4.056169429097606, + "grad_norm": 0.2870408892631531, + "learning_rate": 6.735142878715754e-05, + "loss": 1.7473, + "step": 13215 + }, + { + "epoch": 4.056476365868631, + "grad_norm": 0.22249078750610352, + "learning_rate": 6.734676703921822e-05, + "loss": 1.7462, + "step": 13216 + }, + { + "epoch": 4.056783302639656, + "grad_norm": 0.25519105792045593, + "learning_rate": 6.734210511984796e-05, + "loss": 1.7022, + "step": 13217 + }, + { + "epoch": 4.057090239410681, + "grad_norm": 0.3366561830043793, + "learning_rate": 6.733744302909285e-05, + "loss": 1.787, + "step": 13218 + }, + { + "epoch": 4.0573971761817065, + "grad_norm": 0.2443208247423172, + "learning_rate": 6.733278076699897e-05, + "loss": 1.8048, + "step": 13219 + }, + { + "epoch": 4.057704112952732, + "grad_norm": 0.2893153131008148, + "learning_rate": 6.73281183336124e-05, + "loss": 1.7805, + "step": 13220 + }, + { + "epoch": 4.058011049723757, + "grad_norm": 0.3178043067455292, + "learning_rate": 6.73234557289792e-05, + "loss": 1.8264, + "step": 13221 + }, + { + "epoch": 4.0583179864947825, + "grad_norm": 0.27355703711509705, + "learning_rate": 6.731879295314546e-05, + "loss": 1.8427, + 
"step": 13222 + }, + { + "epoch": 4.058624923265807, + "grad_norm": 0.32180166244506836, + "learning_rate": 6.731413000615726e-05, + "loss": 1.7332, + "step": 13223 + }, + { + "epoch": 4.058931860036832, + "grad_norm": 0.3736574351787567, + "learning_rate": 6.730946688806067e-05, + "loss": 1.7447, + "step": 13224 + }, + { + "epoch": 4.059238796807858, + "grad_norm": 0.2526068687438965, + "learning_rate": 6.73048035989018e-05, + "loss": 1.8104, + "step": 13225 + }, + { + "epoch": 4.059545733578883, + "grad_norm": 0.29076167941093445, + "learning_rate": 6.73001401387267e-05, + "loss": 1.7977, + "step": 13226 + }, + { + "epoch": 4.059852670349908, + "grad_norm": 0.37963762879371643, + "learning_rate": 6.729547650758148e-05, + "loss": 1.8336, + "step": 13227 + }, + { + "epoch": 4.060159607120933, + "grad_norm": 0.31584078073501587, + "learning_rate": 6.729081270551222e-05, + "loss": 1.7843, + "step": 13228 + }, + { + "epoch": 4.060466543891958, + "grad_norm": 0.22793468832969666, + "learning_rate": 6.728614873256502e-05, + "loss": 1.7444, + "step": 13229 + }, + { + "epoch": 4.060773480662983, + "grad_norm": 0.3114435076713562, + "learning_rate": 6.728148458878596e-05, + "loss": 1.8012, + "step": 13230 + }, + { + "epoch": 4.061080417434009, + "grad_norm": 0.29843854904174805, + "learning_rate": 6.727682027422116e-05, + "loss": 1.8014, + "step": 13231 + }, + { + "epoch": 4.061387354205034, + "grad_norm": 0.22745616734027863, + "learning_rate": 6.727215578891668e-05, + "loss": 1.7303, + "step": 13232 + }, + { + "epoch": 4.0616942909760585, + "grad_norm": 0.2701241970062256, + "learning_rate": 6.726749113291864e-05, + "loss": 1.7665, + "step": 13233 + }, + { + "epoch": 4.062001227747084, + "grad_norm": 0.29304635524749756, + "learning_rate": 6.726282630627313e-05, + "loss": 1.875, + "step": 13234 + }, + { + "epoch": 4.062308164518109, + "grad_norm": 0.21467708051204681, + "learning_rate": 6.725816130902625e-05, + "loss": 1.7442, + "step": 13235 + }, + { + "epoch": 
4.0626151012891345, + "grad_norm": 0.23517470061779022, + "learning_rate": 6.72534961412241e-05, + "loss": 1.7154, + "step": 13236 + }, + { + "epoch": 4.06292203806016, + "grad_norm": 0.21483808755874634, + "learning_rate": 6.724883080291278e-05, + "loss": 1.7162, + "step": 13237 + }, + { + "epoch": 4.063228974831185, + "grad_norm": 0.2274744212627411, + "learning_rate": 6.724416529413843e-05, + "loss": 1.8066, + "step": 13238 + }, + { + "epoch": 4.06353591160221, + "grad_norm": 0.24682378768920898, + "learning_rate": 6.723949961494712e-05, + "loss": 1.7905, + "step": 13239 + }, + { + "epoch": 4.063842848373235, + "grad_norm": 0.2516227066516876, + "learning_rate": 6.723483376538498e-05, + "loss": 1.7693, + "step": 13240 + }, + { + "epoch": 4.06414978514426, + "grad_norm": 0.22076398134231567, + "learning_rate": 6.723016774549808e-05, + "loss": 1.7357, + "step": 13241 + }, + { + "epoch": 4.064456721915286, + "grad_norm": 0.20741026103496552, + "learning_rate": 6.722550155533258e-05, + "loss": 1.8082, + "step": 13242 + }, + { + "epoch": 4.064763658686311, + "grad_norm": 0.2074010819196701, + "learning_rate": 6.722083519493458e-05, + "loss": 1.71, + "step": 13243 + }, + { + "epoch": 4.065070595457335, + "grad_norm": 0.2661527991294861, + "learning_rate": 6.72161686643502e-05, + "loss": 1.7448, + "step": 13244 + }, + { + "epoch": 4.065377532228361, + "grad_norm": 0.2877216935157776, + "learning_rate": 6.721150196362555e-05, + "loss": 1.7574, + "step": 13245 + }, + { + "epoch": 4.065684468999386, + "grad_norm": 0.2520955801010132, + "learning_rate": 6.720683509280675e-05, + "loss": 1.7717, + "step": 13246 + }, + { + "epoch": 4.065991405770411, + "grad_norm": 0.2219560444355011, + "learning_rate": 6.72021680519399e-05, + "loss": 1.7355, + "step": 13247 + }, + { + "epoch": 4.066298342541437, + "grad_norm": 0.24671706557273865, + "learning_rate": 6.719750084107117e-05, + "loss": 1.8204, + "step": 13248 + }, + { + "epoch": 4.066605279312462, + "grad_norm": 
0.24512135982513428, + "learning_rate": 6.719283346024664e-05, + "loss": 1.826, + "step": 13249 + }, + { + "epoch": 4.0669122160834865, + "grad_norm": 0.24370841681957245, + "learning_rate": 6.718816590951247e-05, + "loss": 1.8322, + "step": 13250 + }, + { + "epoch": 4.067219152854512, + "grad_norm": 0.2312363088130951, + "learning_rate": 6.718349818891475e-05, + "loss": 1.7621, + "step": 13251 + }, + { + "epoch": 4.067526089625537, + "grad_norm": 0.2500494420528412, + "learning_rate": 6.717883029849965e-05, + "loss": 1.829, + "step": 13252 + }, + { + "epoch": 4.0678330263965625, + "grad_norm": 0.29882633686065674, + "learning_rate": 6.717416223831324e-05, + "loss": 1.799, + "step": 13253 + }, + { + "epoch": 4.068139963167588, + "grad_norm": 0.21962928771972656, + "learning_rate": 6.716949400840172e-05, + "loss": 1.7714, + "step": 13254 + }, + { + "epoch": 4.068446899938612, + "grad_norm": 0.25544899702072144, + "learning_rate": 6.716482560881121e-05, + "loss": 1.7911, + "step": 13255 + }, + { + "epoch": 4.068753836709638, + "grad_norm": 0.24865686893463135, + "learning_rate": 6.716015703958781e-05, + "loss": 1.7107, + "step": 13256 + }, + { + "epoch": 4.069060773480663, + "grad_norm": 0.22669239342212677, + "learning_rate": 6.715548830077769e-05, + "loss": 1.8503, + "step": 13257 + }, + { + "epoch": 4.069367710251688, + "grad_norm": 0.2973819077014923, + "learning_rate": 6.715081939242698e-05, + "loss": 1.7859, + "step": 13258 + }, + { + "epoch": 4.069674647022714, + "grad_norm": 0.3178746700286865, + "learning_rate": 6.714615031458181e-05, + "loss": 1.7705, + "step": 13259 + }, + { + "epoch": 4.069981583793738, + "grad_norm": 0.20452535152435303, + "learning_rate": 6.714148106728835e-05, + "loss": 1.7386, + "step": 13260 + }, + { + "epoch": 4.070288520564763, + "grad_norm": 0.30288320779800415, + "learning_rate": 6.713681165059271e-05, + "loss": 1.7823, + "step": 13261 + }, + { + "epoch": 4.070595457335789, + "grad_norm": 0.30014416575431824, + "learning_rate": 
6.713214206454107e-05, + "loss": 1.7626, + "step": 13262 + }, + { + "epoch": 4.070902394106814, + "grad_norm": 0.25144243240356445, + "learning_rate": 6.712747230917956e-05, + "loss": 1.8359, + "step": 13263 + }, + { + "epoch": 4.071209330877839, + "grad_norm": 0.308148592710495, + "learning_rate": 6.712280238455432e-05, + "loss": 1.7226, + "step": 13264 + }, + { + "epoch": 4.071516267648865, + "grad_norm": 0.2704198658466339, + "learning_rate": 6.711813229071151e-05, + "loss": 1.7982, + "step": 13265 + }, + { + "epoch": 4.071823204419889, + "grad_norm": 0.3928656280040741, + "learning_rate": 6.711346202769729e-05, + "loss": 1.7987, + "step": 13266 + }, + { + "epoch": 4.0721301411909145, + "grad_norm": 0.3603350520133972, + "learning_rate": 6.71087915955578e-05, + "loss": 1.7963, + "step": 13267 + }, + { + "epoch": 4.07243707796194, + "grad_norm": 0.2673214077949524, + "learning_rate": 6.710412099433921e-05, + "loss": 1.8011, + "step": 13268 + }, + { + "epoch": 4.072744014732965, + "grad_norm": 0.2523653209209442, + "learning_rate": 6.709945022408768e-05, + "loss": 1.755, + "step": 13269 + }, + { + "epoch": 4.0730509515039905, + "grad_norm": 0.3818903863430023, + "learning_rate": 6.709477928484934e-05, + "loss": 1.7968, + "step": 13270 + }, + { + "epoch": 4.073357888275015, + "grad_norm": 0.31509929895401, + "learning_rate": 6.709010817667039e-05, + "loss": 1.744, + "step": 13271 + }, + { + "epoch": 4.07366482504604, + "grad_norm": 0.21875518560409546, + "learning_rate": 6.708543689959697e-05, + "loss": 1.7511, + "step": 13272 + }, + { + "epoch": 4.073971761817066, + "grad_norm": 0.25381338596343994, + "learning_rate": 6.708076545367523e-05, + "loss": 1.7523, + "step": 13273 + }, + { + "epoch": 4.074278698588091, + "grad_norm": 0.24193842709064484, + "learning_rate": 6.707609383895137e-05, + "loss": 1.7713, + "step": 13274 + }, + { + "epoch": 4.074585635359116, + "grad_norm": 0.21972359716892242, + "learning_rate": 6.707142205547154e-05, + "loss": 1.7329, + "step": 
13275 + }, + { + "epoch": 4.074892572130141, + "grad_norm": 0.22188499569892883, + "learning_rate": 6.706675010328192e-05, + "loss": 1.7507, + "step": 13276 + }, + { + "epoch": 4.075199508901166, + "grad_norm": 0.23344436287879944, + "learning_rate": 6.706207798242865e-05, + "loss": 1.771, + "step": 13277 + }, + { + "epoch": 4.0755064456721914, + "grad_norm": 0.3008805513381958, + "learning_rate": 6.705740569295795e-05, + "loss": 1.775, + "step": 13278 + }, + { + "epoch": 4.075813382443217, + "grad_norm": 0.31407982110977173, + "learning_rate": 6.705273323491595e-05, + "loss": 1.7625, + "step": 13279 + }, + { + "epoch": 4.076120319214242, + "grad_norm": 0.2430381178855896, + "learning_rate": 6.704806060834886e-05, + "loss": 1.7706, + "step": 13280 + }, + { + "epoch": 4.0764272559852675, + "grad_norm": 0.23250171542167664, + "learning_rate": 6.704338781330284e-05, + "loss": 1.7977, + "step": 13281 + }, + { + "epoch": 4.076734192756292, + "grad_norm": 0.22073723375797272, + "learning_rate": 6.703871484982407e-05, + "loss": 1.7686, + "step": 13282 + }, + { + "epoch": 4.077041129527317, + "grad_norm": 0.24987035989761353, + "learning_rate": 6.703404171795874e-05, + "loss": 1.736, + "step": 13283 + }, + { + "epoch": 4.077348066298343, + "grad_norm": 0.2697623670101166, + "learning_rate": 6.702936841775301e-05, + "loss": 1.8367, + "step": 13284 + }, + { + "epoch": 4.077655003069368, + "grad_norm": 0.21592749655246735, + "learning_rate": 6.702469494925309e-05, + "loss": 1.7467, + "step": 13285 + }, + { + "epoch": 4.077961939840393, + "grad_norm": 0.2612052261829376, + "learning_rate": 6.702002131250515e-05, + "loss": 1.7689, + "step": 13286 + }, + { + "epoch": 4.078268876611418, + "grad_norm": 0.3004797697067261, + "learning_rate": 6.701534750755539e-05, + "loss": 1.7586, + "step": 13287 + }, + { + "epoch": 4.078575813382443, + "grad_norm": 0.24615366756916046, + "learning_rate": 6.701067353444998e-05, + "loss": 1.7636, + "step": 13288 + }, + { + "epoch": 
4.078882750153468, + "grad_norm": 0.23401159048080444, + "learning_rate": 6.700599939323515e-05, + "loss": 1.8015, + "step": 13289 + }, + { + "epoch": 4.079189686924494, + "grad_norm": 0.24546295404434204, + "learning_rate": 6.700132508395705e-05, + "loss": 1.7606, + "step": 13290 + }, + { + "epoch": 4.079496623695519, + "grad_norm": 0.24664412438869476, + "learning_rate": 6.69966506066619e-05, + "loss": 1.7994, + "step": 13291 + }, + { + "epoch": 4.0798035604665435, + "grad_norm": 0.2780163288116455, + "learning_rate": 6.699197596139587e-05, + "loss": 1.7972, + "step": 13292 + }, + { + "epoch": 4.080110497237569, + "grad_norm": 0.2554188668727875, + "learning_rate": 6.698730114820517e-05, + "loss": 1.7928, + "step": 13293 + }, + { + "epoch": 4.080417434008594, + "grad_norm": 0.2471141666173935, + "learning_rate": 6.698262616713602e-05, + "loss": 1.7948, + "step": 13294 + }, + { + "epoch": 4.0807243707796195, + "grad_norm": 0.2556581199169159, + "learning_rate": 6.697795101823461e-05, + "loss": 1.7942, + "step": 13295 + }, + { + "epoch": 4.081031307550645, + "grad_norm": 0.24462421238422394, + "learning_rate": 6.697327570154712e-05, + "loss": 1.7336, + "step": 13296 + }, + { + "epoch": 4.08133824432167, + "grad_norm": 0.22378689050674438, + "learning_rate": 6.696860021711978e-05, + "loss": 1.7703, + "step": 13297 + }, + { + "epoch": 4.081645181092695, + "grad_norm": 0.23949933052062988, + "learning_rate": 6.69639245649988e-05, + "loss": 1.7651, + "step": 13298 + }, + { + "epoch": 4.08195211786372, + "grad_norm": 0.27751216292381287, + "learning_rate": 6.695924874523035e-05, + "loss": 1.7866, + "step": 13299 + }, + { + "epoch": 4.082259054634745, + "grad_norm": 0.22700226306915283, + "learning_rate": 6.695457275786068e-05, + "loss": 1.79, + "step": 13300 + }, + { + "epoch": 4.082565991405771, + "grad_norm": 0.2138090431690216, + "learning_rate": 6.694989660293598e-05, + "loss": 1.7882, + "step": 13301 + }, + { + "epoch": 4.082872928176796, + "grad_norm": 
0.2963469326496124, + "learning_rate": 6.694522028050246e-05, + "loss": 1.8779, + "step": 13302 + }, + { + "epoch": 4.08317986494782, + "grad_norm": 0.31833669543266296, + "learning_rate": 6.694054379060634e-05, + "loss": 1.7923, + "step": 13303 + }, + { + "epoch": 4.083486801718846, + "grad_norm": 0.27751585841178894, + "learning_rate": 6.693586713329385e-05, + "loss": 1.7557, + "step": 13304 + }, + { + "epoch": 4.083793738489871, + "grad_norm": 0.23790816962718964, + "learning_rate": 6.69311903086112e-05, + "loss": 1.7587, + "step": 13305 + }, + { + "epoch": 4.084100675260896, + "grad_norm": 0.24153777956962585, + "learning_rate": 6.692651331660458e-05, + "loss": 1.7573, + "step": 13306 + }, + { + "epoch": 4.084407612031922, + "grad_norm": 0.26607179641723633, + "learning_rate": 6.692183615732025e-05, + "loss": 1.7823, + "step": 13307 + }, + { + "epoch": 4.084714548802946, + "grad_norm": 0.26670268177986145, + "learning_rate": 6.691715883080442e-05, + "loss": 1.784, + "step": 13308 + }, + { + "epoch": 4.0850214855739715, + "grad_norm": 0.25980666279792786, + "learning_rate": 6.69124813371033e-05, + "loss": 1.797, + "step": 13309 + }, + { + "epoch": 4.085328422344997, + "grad_norm": 0.2805597484111786, + "learning_rate": 6.690780367626314e-05, + "loss": 1.8298, + "step": 13310 + }, + { + "epoch": 4.085635359116022, + "grad_norm": 0.27198413014411926, + "learning_rate": 6.690312584833012e-05, + "loss": 1.8104, + "step": 13311 + }, + { + "epoch": 4.0859422958870475, + "grad_norm": 0.2619116008281708, + "learning_rate": 6.689844785335054e-05, + "loss": 1.771, + "step": 13312 + }, + { + "epoch": 4.086249232658073, + "grad_norm": 0.22647863626480103, + "learning_rate": 6.689376969137057e-05, + "loss": 1.8114, + "step": 13313 + }, + { + "epoch": 4.086556169429097, + "grad_norm": 1.469475507736206, + "learning_rate": 6.68890913624365e-05, + "loss": 1.8796, + "step": 13314 + }, + { + "epoch": 4.086863106200123, + "grad_norm": 0.4577515423297882, + "learning_rate": 
6.68844128665945e-05, + "loss": 1.716, + "step": 13315 + }, + { + "epoch": 4.087170042971148, + "grad_norm": 0.5830543637275696, + "learning_rate": 6.687973420389085e-05, + "loss": 1.7692, + "step": 13316 + }, + { + "epoch": 4.087476979742173, + "grad_norm": 0.4404197037220001, + "learning_rate": 6.687505537437178e-05, + "loss": 1.7909, + "step": 13317 + }, + { + "epoch": 4.087783916513199, + "grad_norm": 0.31379908323287964, + "learning_rate": 6.68703763780835e-05, + "loss": 1.7957, + "step": 13318 + }, + { + "epoch": 4.088090853284223, + "grad_norm": 0.49588730931282043, + "learning_rate": 6.686569721507229e-05, + "loss": 1.7126, + "step": 13319 + }, + { + "epoch": 4.088397790055248, + "grad_norm": 0.3690234124660492, + "learning_rate": 6.686101788538437e-05, + "loss": 1.8233, + "step": 13320 + }, + { + "epoch": 4.088704726826274, + "grad_norm": 0.337310254573822, + "learning_rate": 6.685633838906598e-05, + "loss": 1.6886, + "step": 13321 + }, + { + "epoch": 4.089011663597299, + "grad_norm": 0.5164821147918701, + "learning_rate": 6.685165872616337e-05, + "loss": 1.7967, + "step": 13322 + }, + { + "epoch": 4.089318600368324, + "grad_norm": 0.36501309275627136, + "learning_rate": 6.68469788967228e-05, + "loss": 1.755, + "step": 13323 + }, + { + "epoch": 4.08962553713935, + "grad_norm": 0.35017216205596924, + "learning_rate": 6.684229890079052e-05, + "loss": 1.7595, + "step": 13324 + }, + { + "epoch": 4.089932473910374, + "grad_norm": 0.5622650980949402, + "learning_rate": 6.683761873841277e-05, + "loss": 1.7841, + "step": 13325 + }, + { + "epoch": 4.0902394106813995, + "grad_norm": 0.47010260820388794, + "learning_rate": 6.683293840963578e-05, + "loss": 1.7537, + "step": 13326 + }, + { + "epoch": 4.090546347452425, + "grad_norm": 0.25515374541282654, + "learning_rate": 6.682825791450584e-05, + "loss": 1.7692, + "step": 13327 + }, + { + "epoch": 4.09085328422345, + "grad_norm": 0.5063003897666931, + "learning_rate": 6.682357725306919e-05, + "loss": 1.7454, + "step": 
13328 + }, + { + "epoch": 4.0911602209944755, + "grad_norm": 0.4197622835636139, + "learning_rate": 6.681889642537209e-05, + "loss": 1.7792, + "step": 13329 + }, + { + "epoch": 4.0914671577655, + "grad_norm": 0.24038295447826385, + "learning_rate": 6.68142154314608e-05, + "loss": 1.7631, + "step": 13330 + }, + { + "epoch": 4.091774094536525, + "grad_norm": 0.42108532786369324, + "learning_rate": 6.680953427138159e-05, + "loss": 1.7784, + "step": 13331 + }, + { + "epoch": 4.092081031307551, + "grad_norm": 0.33729633688926697, + "learning_rate": 6.68048529451807e-05, + "loss": 1.8057, + "step": 13332 + }, + { + "epoch": 4.092387968078576, + "grad_norm": 0.31847241520881653, + "learning_rate": 6.68001714529044e-05, + "loss": 1.7375, + "step": 13333 + }, + { + "epoch": 4.092694904849601, + "grad_norm": 0.45276644825935364, + "learning_rate": 6.679548979459896e-05, + "loss": 1.7507, + "step": 13334 + }, + { + "epoch": 4.093001841620626, + "grad_norm": 0.3781665861606598, + "learning_rate": 6.679080797031065e-05, + "loss": 1.7718, + "step": 13335 + }, + { + "epoch": 4.093308778391651, + "grad_norm": 0.25868359208106995, + "learning_rate": 6.678612598008573e-05, + "loss": 1.8105, + "step": 13336 + }, + { + "epoch": 4.093615715162676, + "grad_norm": 0.32834702730178833, + "learning_rate": 6.678144382397048e-05, + "loss": 1.7883, + "step": 13337 + }, + { + "epoch": 4.093922651933702, + "grad_norm": 0.2830568253993988, + "learning_rate": 6.677676150201116e-05, + "loss": 1.7994, + "step": 13338 + }, + { + "epoch": 4.094229588704727, + "grad_norm": 0.219541534781456, + "learning_rate": 6.677207901425405e-05, + "loss": 1.7344, + "step": 13339 + }, + { + "epoch": 4.094536525475752, + "grad_norm": 0.2557326555252075, + "learning_rate": 6.676739636074542e-05, + "loss": 1.7734, + "step": 13340 + }, + { + "epoch": 4.094843462246777, + "grad_norm": 0.2741365432739258, + "learning_rate": 6.676271354153156e-05, + "loss": 1.7912, + "step": 13341 + }, + { + "epoch": 4.095150399017802, + 
"grad_norm": 0.31258970499038696, + "learning_rate": 6.675803055665874e-05, + "loss": 1.7798, + "step": 13342 + }, + { + "epoch": 4.0954573357888275, + "grad_norm": 0.30181947350502014, + "learning_rate": 6.675334740617322e-05, + "loss": 1.7746, + "step": 13343 + }, + { + "epoch": 4.095764272559853, + "grad_norm": 0.3000102937221527, + "learning_rate": 6.674866409012133e-05, + "loss": 1.7842, + "step": 13344 + }, + { + "epoch": 4.096071209330878, + "grad_norm": 0.22871005535125732, + "learning_rate": 6.674398060854931e-05, + "loss": 1.7473, + "step": 13345 + }, + { + "epoch": 4.096378146101903, + "grad_norm": 0.2700810432434082, + "learning_rate": 6.673929696150346e-05, + "loss": 1.7862, + "step": 13346 + }, + { + "epoch": 4.096685082872928, + "grad_norm": 0.27537551522254944, + "learning_rate": 6.673461314903007e-05, + "loss": 1.7843, + "step": 13347 + }, + { + "epoch": 4.096992019643953, + "grad_norm": 0.23700574040412903, + "learning_rate": 6.672992917117542e-05, + "loss": 1.765, + "step": 13348 + }, + { + "epoch": 4.097298956414979, + "grad_norm": 0.23331589996814728, + "learning_rate": 6.672524502798583e-05, + "loss": 1.7894, + "step": 13349 + }, + { + "epoch": 4.097605893186004, + "grad_norm": 0.28591978549957275, + "learning_rate": 6.672056071950753e-05, + "loss": 1.7736, + "step": 13350 + }, + { + "epoch": 4.097912829957028, + "grad_norm": 0.3000452518463135, + "learning_rate": 6.671587624578685e-05, + "loss": 1.7635, + "step": 13351 + }, + { + "epoch": 4.098219766728054, + "grad_norm": 0.21877998113632202, + "learning_rate": 6.67111916068701e-05, + "loss": 1.7225, + "step": 13352 + }, + { + "epoch": 4.098526703499079, + "grad_norm": 0.2598817050457001, + "learning_rate": 6.670650680280358e-05, + "loss": 1.6874, + "step": 13353 + }, + { + "epoch": 4.098833640270104, + "grad_norm": 0.3063203692436218, + "learning_rate": 6.670182183363353e-05, + "loss": 1.7821, + "step": 13354 + }, + { + "epoch": 4.09914057704113, + "grad_norm": 0.2328508347272873, + 
"learning_rate": 6.66971366994063e-05, + "loss": 1.788, + "step": 13355 + }, + { + "epoch": 4.099447513812155, + "grad_norm": 0.33936765789985657, + "learning_rate": 6.669245140016817e-05, + "loss": 1.8159, + "step": 13356 + }, + { + "epoch": 4.0997544505831796, + "grad_norm": 0.27464553713798523, + "learning_rate": 6.668776593596546e-05, + "loss": 1.7371, + "step": 13357 + }, + { + "epoch": 4.100061387354205, + "grad_norm": 0.24255812168121338, + "learning_rate": 6.668308030684447e-05, + "loss": 1.7993, + "step": 13358 + }, + { + "epoch": 4.10036832412523, + "grad_norm": 0.27203628420829773, + "learning_rate": 6.667839451285149e-05, + "loss": 1.8253, + "step": 13359 + }, + { + "epoch": 4.100675260896256, + "grad_norm": 0.2503862679004669, + "learning_rate": 6.667370855403286e-05, + "loss": 1.7927, + "step": 13360 + }, + { + "epoch": 4.100982197667281, + "grad_norm": 0.2616904377937317, + "learning_rate": 6.666902243043486e-05, + "loss": 1.8226, + "step": 13361 + }, + { + "epoch": 4.101289134438305, + "grad_norm": 0.26707521080970764, + "learning_rate": 6.666433614210379e-05, + "loss": 1.8485, + "step": 13362 + }, + { + "epoch": 4.101596071209331, + "grad_norm": 0.2427528202533722, + "learning_rate": 6.6659649689086e-05, + "loss": 1.7387, + "step": 13363 + }, + { + "epoch": 4.101903007980356, + "grad_norm": 0.2319549173116684, + "learning_rate": 6.66549630714278e-05, + "loss": 1.7396, + "step": 13364 + }, + { + "epoch": 4.102209944751381, + "grad_norm": 0.2248002141714096, + "learning_rate": 6.665027628917548e-05, + "loss": 1.7817, + "step": 13365 + }, + { + "epoch": 4.102516881522407, + "grad_norm": 0.21929535269737244, + "learning_rate": 6.664558934237538e-05, + "loss": 1.7478, + "step": 13366 + }, + { + "epoch": 4.102823818293431, + "grad_norm": 0.21144583821296692, + "learning_rate": 6.66409022310738e-05, + "loss": 1.7602, + "step": 13367 + }, + { + "epoch": 4.1031307550644565, + "grad_norm": 0.21984660625457764, + "learning_rate": 6.663621495531707e-05, + 
"loss": 1.7541, + "step": 13368 + }, + { + "epoch": 4.103437691835482, + "grad_norm": 0.2075357735157013, + "learning_rate": 6.663152751515152e-05, + "loss": 1.7362, + "step": 13369 + }, + { + "epoch": 4.103744628606507, + "grad_norm": 0.23316961526870728, + "learning_rate": 6.662683991062347e-05, + "loss": 1.8273, + "step": 13370 + }, + { + "epoch": 4.1040515653775325, + "grad_norm": 0.23142337799072266, + "learning_rate": 6.662215214177922e-05, + "loss": 1.7543, + "step": 13371 + }, + { + "epoch": 4.104358502148558, + "grad_norm": 0.24335260689258575, + "learning_rate": 6.661746420866515e-05, + "loss": 1.8328, + "step": 13372 + }, + { + "epoch": 4.104665438919582, + "grad_norm": 0.2440192997455597, + "learning_rate": 6.661277611132753e-05, + "loss": 1.8114, + "step": 13373 + }, + { + "epoch": 4.104972375690608, + "grad_norm": 0.252808541059494, + "learning_rate": 6.660808784981273e-05, + "loss": 1.8556, + "step": 13374 + }, + { + "epoch": 4.105279312461633, + "grad_norm": 0.24564477801322937, + "learning_rate": 6.660339942416708e-05, + "loss": 1.8231, + "step": 13375 + }, + { + "epoch": 4.105586249232658, + "grad_norm": 0.2371874898672104, + "learning_rate": 6.65987108344369e-05, + "loss": 1.7763, + "step": 13376 + }, + { + "epoch": 4.105893186003684, + "grad_norm": 0.22882802784442902, + "learning_rate": 6.659402208066854e-05, + "loss": 1.7388, + "step": 13377 + }, + { + "epoch": 4.106200122774708, + "grad_norm": 0.24857540428638458, + "learning_rate": 6.658933316290832e-05, + "loss": 1.7735, + "step": 13378 + }, + { + "epoch": 4.106507059545733, + "grad_norm": 0.22574029862880707, + "learning_rate": 6.658464408120257e-05, + "loss": 1.7403, + "step": 13379 + }, + { + "epoch": 4.106813996316759, + "grad_norm": 0.24944272637367249, + "learning_rate": 6.657995483559767e-05, + "loss": 1.7827, + "step": 13380 + }, + { + "epoch": 4.107120933087784, + "grad_norm": 0.27386224269866943, + "learning_rate": 6.657526542613992e-05, + "loss": 1.7673, + "step": 13381 + }, + { 
+ "epoch": 4.107427869858809, + "grad_norm": 0.29222097992897034, + "learning_rate": 6.65705758528757e-05, + "loss": 1.7958, + "step": 13382 + }, + { + "epoch": 4.107734806629834, + "grad_norm": 0.2471150904893875, + "learning_rate": 6.656588611585133e-05, + "loss": 1.7706, + "step": 13383 + }, + { + "epoch": 4.108041743400859, + "grad_norm": 0.289316862821579, + "learning_rate": 6.656119621511317e-05, + "loss": 1.7828, + "step": 13384 + }, + { + "epoch": 4.1083486801718845, + "grad_norm": 0.36710497736930847, + "learning_rate": 6.655650615070756e-05, + "loss": 1.712, + "step": 13385 + }, + { + "epoch": 4.10865561694291, + "grad_norm": 0.2999880611896515, + "learning_rate": 6.655181592268084e-05, + "loss": 1.7711, + "step": 13386 + }, + { + "epoch": 4.108962553713935, + "grad_norm": 0.332011342048645, + "learning_rate": 6.654712553107939e-05, + "loss": 1.907, + "step": 13387 + }, + { + "epoch": 4.1092694904849605, + "grad_norm": 0.43125995993614197, + "learning_rate": 6.654243497594953e-05, + "loss": 1.7819, + "step": 13388 + }, + { + "epoch": 4.109576427255985, + "grad_norm": 0.33719149231910706, + "learning_rate": 6.653774425733765e-05, + "loss": 1.797, + "step": 13389 + }, + { + "epoch": 4.10988336402701, + "grad_norm": 0.23091599345207214, + "learning_rate": 6.653305337529006e-05, + "loss": 1.7384, + "step": 13390 + }, + { + "epoch": 4.110190300798036, + "grad_norm": 0.4283982515335083, + "learning_rate": 6.652836232985317e-05, + "loss": 1.8284, + "step": 13391 + }, + { + "epoch": 4.110497237569061, + "grad_norm": 0.43575870990753174, + "learning_rate": 6.652367112107332e-05, + "loss": 1.7235, + "step": 13392 + }, + { + "epoch": 4.110804174340086, + "grad_norm": 0.246877059340477, + "learning_rate": 6.651897974899685e-05, + "loss": 1.7174, + "step": 13393 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.36063629388809204, + "learning_rate": 6.651428821367015e-05, + "loss": 1.8064, + "step": 13394 + }, + { + "epoch": 4.111418047882136, + "grad_norm": 
0.4454420804977417, + "learning_rate": 6.650959651513957e-05, + "loss": 1.7575, + "step": 13395 + }, + { + "epoch": 4.111724984653161, + "grad_norm": 0.2788856327533722, + "learning_rate": 6.650490465345149e-05, + "loss": 1.7696, + "step": 13396 + }, + { + "epoch": 4.112031921424187, + "grad_norm": 0.40281879901885986, + "learning_rate": 6.650021262865225e-05, + "loss": 1.8368, + "step": 13397 + }, + { + "epoch": 4.112338858195212, + "grad_norm": 0.5151103138923645, + "learning_rate": 6.649552044078825e-05, + "loss": 1.8224, + "step": 13398 + }, + { + "epoch": 4.112645794966237, + "grad_norm": 0.29390639066696167, + "learning_rate": 6.649082808990586e-05, + "loss": 1.7846, + "step": 13399 + }, + { + "epoch": 4.112952731737262, + "grad_norm": 0.3061942458152771, + "learning_rate": 6.648613557605142e-05, + "loss": 1.7954, + "step": 13400 + }, + { + "epoch": 4.113259668508287, + "grad_norm": 0.47628748416900635, + "learning_rate": 6.648144289927132e-05, + "loss": 1.7782, + "step": 13401 + }, + { + "epoch": 4.1135666052793125, + "grad_norm": 0.4299588203430176, + "learning_rate": 6.647675005961197e-05, + "loss": 1.7459, + "step": 13402 + }, + { + "epoch": 4.113873542050338, + "grad_norm": 0.24556589126586914, + "learning_rate": 6.64720570571197e-05, + "loss": 1.753, + "step": 13403 + }, + { + "epoch": 4.114180478821363, + "grad_norm": 0.29620522260665894, + "learning_rate": 6.646736389184092e-05, + "loss": 1.773, + "step": 13404 + }, + { + "epoch": 4.114487415592388, + "grad_norm": 0.37710070610046387, + "learning_rate": 6.646267056382199e-05, + "loss": 1.8389, + "step": 13405 + }, + { + "epoch": 4.114794352363413, + "grad_norm": 0.2562984824180603, + "learning_rate": 6.64579770731093e-05, + "loss": 1.7905, + "step": 13406 + }, + { + "epoch": 4.115101289134438, + "grad_norm": 0.3999946713447571, + "learning_rate": 6.645328341974924e-05, + "loss": 1.7734, + "step": 13407 + }, + { + "epoch": 4.115408225905464, + "grad_norm": 0.36087217926979065, + "learning_rate": 
6.644858960378817e-05, + "loss": 1.801, + "step": 13408 + }, + { + "epoch": 4.115715162676489, + "grad_norm": 0.2520254850387573, + "learning_rate": 6.644389562527251e-05, + "loss": 1.7394, + "step": 13409 + }, + { + "epoch": 4.116022099447513, + "grad_norm": 0.4321835935115814, + "learning_rate": 6.643920148424864e-05, + "loss": 1.8091, + "step": 13410 + }, + { + "epoch": 4.116329036218539, + "grad_norm": 0.40900173783302307, + "learning_rate": 6.643450718076294e-05, + "loss": 1.8198, + "step": 13411 + }, + { + "epoch": 4.116635972989564, + "grad_norm": 0.23693956434726715, + "learning_rate": 6.642981271486182e-05, + "loss": 1.6807, + "step": 13412 + }, + { + "epoch": 4.116942909760589, + "grad_norm": 0.33526891469955444, + "learning_rate": 6.642511808659164e-05, + "loss": 1.8673, + "step": 13413 + }, + { + "epoch": 4.117249846531615, + "grad_norm": 0.4037325382232666, + "learning_rate": 6.642042329599883e-05, + "loss": 1.743, + "step": 13414 + }, + { + "epoch": 4.11755678330264, + "grad_norm": 0.25629740953445435, + "learning_rate": 6.641572834312975e-05, + "loss": 1.6904, + "step": 13415 + }, + { + "epoch": 4.1178637200736645, + "grad_norm": 0.29203253984451294, + "learning_rate": 6.641103322803087e-05, + "loss": 1.7811, + "step": 13416 + }, + { + "epoch": 4.11817065684469, + "grad_norm": 0.423926442861557, + "learning_rate": 6.64063379507485e-05, + "loss": 1.7341, + "step": 13417 + }, + { + "epoch": 4.118477593615715, + "grad_norm": 0.29561251401901245, + "learning_rate": 6.64016425113291e-05, + "loss": 1.7915, + "step": 13418 + }, + { + "epoch": 4.1187845303867405, + "grad_norm": 0.2536832094192505, + "learning_rate": 6.639694690981903e-05, + "loss": 1.7628, + "step": 13419 + }, + { + "epoch": 4.119091467157766, + "grad_norm": 0.2931392192840576, + "learning_rate": 6.639225114626475e-05, + "loss": 1.7877, + "step": 13420 + }, + { + "epoch": 4.11939840392879, + "grad_norm": 0.2219499796628952, + "learning_rate": 6.638755522071263e-05, + "loss": 1.7183, + 
"step": 13421 + }, + { + "epoch": 4.119705340699816, + "grad_norm": 0.2951931953430176, + "learning_rate": 6.638285913320908e-05, + "loss": 1.7983, + "step": 13422 + }, + { + "epoch": 4.120012277470841, + "grad_norm": 0.3495960533618927, + "learning_rate": 6.63781628838005e-05, + "loss": 1.7531, + "step": 13423 + }, + { + "epoch": 4.120319214241866, + "grad_norm": 0.2389262616634369, + "learning_rate": 6.637346647253333e-05, + "loss": 1.7454, + "step": 13424 + }, + { + "epoch": 4.120626151012892, + "grad_norm": 0.28729167580604553, + "learning_rate": 6.636876989945395e-05, + "loss": 1.8105, + "step": 13425 + }, + { + "epoch": 4.120933087783916, + "grad_norm": 0.2620082199573517, + "learning_rate": 6.636407316460882e-05, + "loss": 1.7948, + "step": 13426 + }, + { + "epoch": 4.121240024554941, + "grad_norm": 0.2694189250469208, + "learning_rate": 6.635937626804432e-05, + "loss": 1.809, + "step": 13427 + }, + { + "epoch": 4.121546961325967, + "grad_norm": 0.2660866379737854, + "learning_rate": 6.635467920980687e-05, + "loss": 1.7431, + "step": 13428 + }, + { + "epoch": 4.121853898096992, + "grad_norm": 0.2579907774925232, + "learning_rate": 6.634998198994289e-05, + "loss": 1.7941, + "step": 13429 + }, + { + "epoch": 4.122160834868017, + "grad_norm": 0.28349989652633667, + "learning_rate": 6.634528460849881e-05, + "loss": 1.8142, + "step": 13430 + }, + { + "epoch": 4.122467771639043, + "grad_norm": 0.28716522455215454, + "learning_rate": 6.634058706552104e-05, + "loss": 1.7496, + "step": 13431 + }, + { + "epoch": 4.122774708410067, + "grad_norm": 0.23228077590465546, + "learning_rate": 6.633588936105601e-05, + "loss": 1.7399, + "step": 13432 + }, + { + "epoch": 4.1230816451810925, + "grad_norm": 0.3649841248989105, + "learning_rate": 6.633119149515017e-05, + "loss": 1.7696, + "step": 13433 + }, + { + "epoch": 4.123388581952118, + "grad_norm": 0.2757830321788788, + "learning_rate": 6.632649346784992e-05, + "loss": 1.8329, + "step": 13434 + }, + { + "epoch": 
4.123695518723143, + "grad_norm": 0.28163692355155945, + "learning_rate": 6.632179527920167e-05, + "loss": 1.7761, + "step": 13435 + }, + { + "epoch": 4.1240024554941686, + "grad_norm": 0.3453187048435211, + "learning_rate": 6.631709692925188e-05, + "loss": 1.7843, + "step": 13436 + }, + { + "epoch": 4.124309392265193, + "grad_norm": 0.2792697250843048, + "learning_rate": 6.631239841804698e-05, + "loss": 1.7889, + "step": 13437 + }, + { + "epoch": 4.124616329036218, + "grad_norm": 0.21881693601608276, + "learning_rate": 6.630769974563339e-05, + "loss": 1.8015, + "step": 13438 + }, + { + "epoch": 4.124923265807244, + "grad_norm": 0.4464910328388214, + "learning_rate": 6.630300091205756e-05, + "loss": 1.7851, + "step": 13439 + }, + { + "epoch": 4.125230202578269, + "grad_norm": 0.40191107988357544, + "learning_rate": 6.629830191736591e-05, + "loss": 1.8608, + "step": 13440 + }, + { + "epoch": 4.125537139349294, + "grad_norm": 0.2809060513973236, + "learning_rate": 6.62936027616049e-05, + "loss": 1.7374, + "step": 13441 + }, + { + "epoch": 4.12584407612032, + "grad_norm": 0.24980643391609192, + "learning_rate": 6.628890344482095e-05, + "loss": 1.8152, + "step": 13442 + }, + { + "epoch": 4.126151012891344, + "grad_norm": 0.24538342654705048, + "learning_rate": 6.62842039670605e-05, + "loss": 1.7687, + "step": 13443 + }, + { + "epoch": 4.1264579496623695, + "grad_norm": 0.24684634804725647, + "learning_rate": 6.627950432837002e-05, + "loss": 1.787, + "step": 13444 + }, + { + "epoch": 4.126764886433395, + "grad_norm": 0.22724607586860657, + "learning_rate": 6.627480452879593e-05, + "loss": 1.7871, + "step": 13445 + }, + { + "epoch": 4.12707182320442, + "grad_norm": 0.24724406003952026, + "learning_rate": 6.627010456838469e-05, + "loss": 1.7524, + "step": 13446 + }, + { + "epoch": 4.1273787599754455, + "grad_norm": 0.24219536781311035, + "learning_rate": 6.626540444718274e-05, + "loss": 1.7754, + "step": 13447 + }, + { + "epoch": 4.12768569674647, + "grad_norm": 
0.24857915937900543, + "learning_rate": 6.626070416523652e-05, + "loss": 1.7839, + "step": 13448 + }, + { + "epoch": 4.127992633517495, + "grad_norm": 0.2639105021953583, + "learning_rate": 6.625600372259248e-05, + "loss": 1.7546, + "step": 13449 + }, + { + "epoch": 4.128299570288521, + "grad_norm": 0.23598137497901917, + "learning_rate": 6.62513031192971e-05, + "loss": 1.7957, + "step": 13450 + }, + { + "epoch": 4.128606507059546, + "grad_norm": 0.3038909137248993, + "learning_rate": 6.624660235539682e-05, + "loss": 1.8117, + "step": 13451 + }, + { + "epoch": 4.128913443830571, + "grad_norm": 0.27671241760253906, + "learning_rate": 6.624190143093809e-05, + "loss": 1.729, + "step": 13452 + }, + { + "epoch": 4.129220380601596, + "grad_norm": 0.24638360738754272, + "learning_rate": 6.623720034596735e-05, + "loss": 1.7414, + "step": 13453 + }, + { + "epoch": 4.129527317372621, + "grad_norm": 0.24073924124240875, + "learning_rate": 6.623249910053111e-05, + "loss": 1.8046, + "step": 13454 + }, + { + "epoch": 4.129834254143646, + "grad_norm": 0.29734376072883606, + "learning_rate": 6.622779769467578e-05, + "loss": 1.8336, + "step": 13455 + }, + { + "epoch": 4.130141190914672, + "grad_norm": 0.23182810842990875, + "learning_rate": 6.622309612844785e-05, + "loss": 1.7742, + "step": 13456 + }, + { + "epoch": 4.130448127685697, + "grad_norm": 0.2179390788078308, + "learning_rate": 6.621839440189378e-05, + "loss": 1.7656, + "step": 13457 + }, + { + "epoch": 4.1307550644567215, + "grad_norm": 0.21389013528823853, + "learning_rate": 6.621369251506002e-05, + "loss": 1.7504, + "step": 13458 + }, + { + "epoch": 4.131062001227747, + "grad_norm": 0.22306203842163086, + "learning_rate": 6.620899046799305e-05, + "loss": 1.7573, + "step": 13459 + }, + { + "epoch": 4.131368937998772, + "grad_norm": 0.2699708938598633, + "learning_rate": 6.620428826073934e-05, + "loss": 1.7419, + "step": 13460 + }, + { + "epoch": 4.1316758747697975, + "grad_norm": 0.34087565541267395, + "learning_rate": 
6.619958589334534e-05, + "loss": 1.7545, + "step": 13461 + }, + { + "epoch": 4.131982811540823, + "grad_norm": 0.2934977412223816, + "learning_rate": 6.619488336585755e-05, + "loss": 1.7611, + "step": 13462 + }, + { + "epoch": 4.132289748311848, + "grad_norm": 0.22545567154884338, + "learning_rate": 6.619018067832243e-05, + "loss": 1.7562, + "step": 13463 + }, + { + "epoch": 4.132596685082873, + "grad_norm": 0.23334743082523346, + "learning_rate": 6.618547783078647e-05, + "loss": 1.7784, + "step": 13464 + }, + { + "epoch": 4.132903621853898, + "grad_norm": 0.22466403245925903, + "learning_rate": 6.618077482329612e-05, + "loss": 1.7277, + "step": 13465 + }, + { + "epoch": 4.133210558624923, + "grad_norm": 0.23504197597503662, + "learning_rate": 6.617607165589785e-05, + "loss": 1.7983, + "step": 13466 + }, + { + "epoch": 4.133517495395949, + "grad_norm": 0.2500833570957184, + "learning_rate": 6.617136832863819e-05, + "loss": 1.7826, + "step": 13467 + }, + { + "epoch": 4.133824432166974, + "grad_norm": 0.22398658096790314, + "learning_rate": 6.616666484156357e-05, + "loss": 1.7281, + "step": 13468 + }, + { + "epoch": 4.134131368937998, + "grad_norm": 0.2537873089313507, + "learning_rate": 6.616196119472052e-05, + "loss": 1.7598, + "step": 13469 + }, + { + "epoch": 4.134438305709024, + "grad_norm": 0.26881173253059387, + "learning_rate": 6.615725738815546e-05, + "loss": 1.8161, + "step": 13470 + }, + { + "epoch": 4.134745242480049, + "grad_norm": 0.3311346471309662, + "learning_rate": 6.615255342191492e-05, + "loss": 1.7954, + "step": 13471 + }, + { + "epoch": 4.135052179251074, + "grad_norm": 0.2562953233718872, + "learning_rate": 6.614784929604539e-05, + "loss": 1.7284, + "step": 13472 + }, + { + "epoch": 4.1353591160221, + "grad_norm": 0.2563154101371765, + "learning_rate": 6.614314501059334e-05, + "loss": 1.7995, + "step": 13473 + }, + { + "epoch": 4.135666052793125, + "grad_norm": 0.24861161410808563, + "learning_rate": 6.613844056560527e-05, + "loss": 1.7589, + 
"step": 13474 + }, + { + "epoch": 4.1359729895641495, + "grad_norm": 0.23815487325191498, + "learning_rate": 6.613373596112769e-05, + "loss": 1.6906, + "step": 13475 + }, + { + "epoch": 4.136279926335175, + "grad_norm": 0.25394049286842346, + "learning_rate": 6.612903119720705e-05, + "loss": 1.781, + "step": 13476 + }, + { + "epoch": 4.1365868631062, + "grad_norm": 0.24501466751098633, + "learning_rate": 6.612432627388988e-05, + "loss": 1.797, + "step": 13477 + }, + { + "epoch": 4.1368937998772255, + "grad_norm": 0.24909707903862, + "learning_rate": 6.611962119122267e-05, + "loss": 1.7643, + "step": 13478 + }, + { + "epoch": 4.137200736648251, + "grad_norm": 0.24954476952552795, + "learning_rate": 6.611491594925192e-05, + "loss": 1.8219, + "step": 13479 + }, + { + "epoch": 4.137507673419275, + "grad_norm": 0.30572372674942017, + "learning_rate": 6.611021054802411e-05, + "loss": 1.8039, + "step": 13480 + }, + { + "epoch": 4.137814610190301, + "grad_norm": 0.27466365694999695, + "learning_rate": 6.610550498758577e-05, + "loss": 1.6945, + "step": 13481 + }, + { + "epoch": 4.138121546961326, + "grad_norm": 0.2614271640777588, + "learning_rate": 6.610079926798339e-05, + "loss": 1.8648, + "step": 13482 + }, + { + "epoch": 4.138428483732351, + "grad_norm": 0.23645827174186707, + "learning_rate": 6.609609338926346e-05, + "loss": 1.7424, + "step": 13483 + }, + { + "epoch": 4.138735420503377, + "grad_norm": 0.24473626911640167, + "learning_rate": 6.609138735147253e-05, + "loss": 1.8036, + "step": 13484 + }, + { + "epoch": 4.139042357274401, + "grad_norm": 0.2472417950630188, + "learning_rate": 6.608668115465706e-05, + "loss": 1.794, + "step": 13485 + }, + { + "epoch": 4.139349294045426, + "grad_norm": 0.25330284237861633, + "learning_rate": 6.608197479886358e-05, + "loss": 1.8052, + "step": 13486 + }, + { + "epoch": 4.139656230816452, + "grad_norm": 0.24279309809207916, + "learning_rate": 6.60772682841386e-05, + "loss": 1.7375, + "step": 13487 + }, + { + "epoch": 
4.139963167587477, + "grad_norm": 0.22319461405277252, + "learning_rate": 6.607256161052862e-05, + "loss": 1.7696, + "step": 13488 + }, + { + "epoch": 4.140270104358502, + "grad_norm": 0.25261563062667847, + "learning_rate": 6.606785477808017e-05, + "loss": 1.7646, + "step": 13489 + }, + { + "epoch": 4.140577041129528, + "grad_norm": 0.3127744793891907, + "learning_rate": 6.606314778683977e-05, + "loss": 1.7899, + "step": 13490 + }, + { + "epoch": 4.140883977900552, + "grad_norm": 0.3550816774368286, + "learning_rate": 6.605844063685392e-05, + "loss": 1.7971, + "step": 13491 + }, + { + "epoch": 4.1411909146715775, + "grad_norm": 0.20977813005447388, + "learning_rate": 6.605373332816916e-05, + "loss": 1.7416, + "step": 13492 + }, + { + "epoch": 4.141497851442603, + "grad_norm": 0.26593849062919617, + "learning_rate": 6.6049025860832e-05, + "loss": 1.7586, + "step": 13493 + }, + { + "epoch": 4.141804788213628, + "grad_norm": 0.2452937364578247, + "learning_rate": 6.604431823488893e-05, + "loss": 1.757, + "step": 13494 + }, + { + "epoch": 4.1421117249846535, + "grad_norm": 0.21029168367385864, + "learning_rate": 6.603961045038652e-05, + "loss": 1.7665, + "step": 13495 + }, + { + "epoch": 4.142418661755678, + "grad_norm": 0.2396312952041626, + "learning_rate": 6.603490250737128e-05, + "loss": 1.7609, + "step": 13496 + }, + { + "epoch": 4.142725598526703, + "grad_norm": 0.23266808688640594, + "learning_rate": 6.603019440588975e-05, + "loss": 1.7893, + "step": 13497 + }, + { + "epoch": 4.143032535297729, + "grad_norm": 0.25235217809677124, + "learning_rate": 6.602548614598842e-05, + "loss": 1.7465, + "step": 13498 + }, + { + "epoch": 4.143339472068754, + "grad_norm": 0.22944024205207825, + "learning_rate": 6.602077772771386e-05, + "loss": 1.7052, + "step": 13499 + }, + { + "epoch": 4.143646408839779, + "grad_norm": 0.2116660475730896, + "learning_rate": 6.601606915111257e-05, + "loss": 1.7042, + "step": 13500 + }, + { + "epoch": 4.143953345610804, + "grad_norm": 
0.21777184307575226, + "learning_rate": 6.601136041623111e-05, + "loss": 1.7938, + "step": 13501 + }, + { + "epoch": 4.144260282381829, + "grad_norm": 0.23663075268268585, + "learning_rate": 6.600665152311601e-05, + "loss": 1.7475, + "step": 13502 + }, + { + "epoch": 4.144567219152854, + "grad_norm": 0.20644642412662506, + "learning_rate": 6.600194247181377e-05, + "loss": 1.7992, + "step": 13503 + }, + { + "epoch": 4.14487415592388, + "grad_norm": 0.21479010581970215, + "learning_rate": 6.599723326237098e-05, + "loss": 1.7877, + "step": 13504 + }, + { + "epoch": 4.145181092694905, + "grad_norm": 0.2266562283039093, + "learning_rate": 6.599252389483413e-05, + "loss": 1.8097, + "step": 13505 + }, + { + "epoch": 4.14548802946593, + "grad_norm": 0.2053738683462143, + "learning_rate": 6.59878143692498e-05, + "loss": 1.6878, + "step": 13506 + }, + { + "epoch": 4.145794966236955, + "grad_norm": 0.19583995640277863, + "learning_rate": 6.598310468566452e-05, + "loss": 1.7547, + "step": 13507 + }, + { + "epoch": 4.14610190300798, + "grad_norm": 0.23421542346477509, + "learning_rate": 6.597839484412484e-05, + "loss": 1.7926, + "step": 13508 + }, + { + "epoch": 4.1464088397790055, + "grad_norm": 0.24575260281562805, + "learning_rate": 6.597368484467728e-05, + "loss": 1.7311, + "step": 13509 + }, + { + "epoch": 4.146715776550031, + "grad_norm": 0.27519574761390686, + "learning_rate": 6.596897468736842e-05, + "loss": 1.7858, + "step": 13510 + }, + { + "epoch": 4.147022713321056, + "grad_norm": 0.26434022188186646, + "learning_rate": 6.596426437224477e-05, + "loss": 1.7387, + "step": 13511 + }, + { + "epoch": 4.147329650092081, + "grad_norm": 0.2192772775888443, + "learning_rate": 6.595955389935291e-05, + "loss": 1.7565, + "step": 13512 + }, + { + "epoch": 4.147636586863106, + "grad_norm": 0.21047350764274597, + "learning_rate": 6.595484326873938e-05, + "loss": 1.7234, + "step": 13513 + }, + { + "epoch": 4.147943523634131, + "grad_norm": 0.22838951647281647, + "learning_rate": 
6.595013248045075e-05, + "loss": 1.8205, + "step": 13514 + }, + { + "epoch": 4.148250460405157, + "grad_norm": 0.3467923402786255, + "learning_rate": 6.594542153453356e-05, + "loss": 1.7973, + "step": 13515 + }, + { + "epoch": 4.148557397176182, + "grad_norm": 0.241237074136734, + "learning_rate": 6.594071043103438e-05, + "loss": 1.7764, + "step": 13516 + }, + { + "epoch": 4.148864333947207, + "grad_norm": 0.22543516755104065, + "learning_rate": 6.593599916999973e-05, + "loss": 1.7528, + "step": 13517 + }, + { + "epoch": 4.149171270718232, + "grad_norm": 0.24590276181697845, + "learning_rate": 6.593128775147623e-05, + "loss": 1.7422, + "step": 13518 + }, + { + "epoch": 4.149478207489257, + "grad_norm": 0.2434391975402832, + "learning_rate": 6.592657617551038e-05, + "loss": 1.7523, + "step": 13519 + }, + { + "epoch": 4.149785144260282, + "grad_norm": 0.23169009387493134, + "learning_rate": 6.592186444214877e-05, + "loss": 1.8158, + "step": 13520 + }, + { + "epoch": 4.150092081031308, + "grad_norm": 0.2217840999364853, + "learning_rate": 6.591715255143798e-05, + "loss": 1.7487, + "step": 13521 + }, + { + "epoch": 4.150399017802333, + "grad_norm": 0.2405092418193817, + "learning_rate": 6.591244050342454e-05, + "loss": 1.7726, + "step": 13522 + }, + { + "epoch": 4.150705954573358, + "grad_norm": 0.29432612657546997, + "learning_rate": 6.590772829815504e-05, + "loss": 1.7841, + "step": 13523 + }, + { + "epoch": 4.151012891344383, + "grad_norm": 0.2708737850189209, + "learning_rate": 6.590301593567605e-05, + "loss": 1.8551, + "step": 13524 + }, + { + "epoch": 4.151319828115408, + "grad_norm": 0.26643216609954834, + "learning_rate": 6.589830341603413e-05, + "loss": 1.7697, + "step": 13525 + }, + { + "epoch": 4.151626764886434, + "grad_norm": 0.3672652840614319, + "learning_rate": 6.589359073927587e-05, + "loss": 1.8292, + "step": 13526 + }, + { + "epoch": 4.151933701657459, + "grad_norm": 0.2413325160741806, + "learning_rate": 6.588887790544782e-05, + "loss": 1.7514, + 
"step": 13527 + }, + { + "epoch": 4.152240638428483, + "grad_norm": 0.3248155117034912, + "learning_rate": 6.588416491459657e-05, + "loss": 1.7437, + "step": 13528 + }, + { + "epoch": 4.152547575199509, + "grad_norm": 0.40951836109161377, + "learning_rate": 6.587945176676869e-05, + "loss": 1.7779, + "step": 13529 + }, + { + "epoch": 4.152854511970534, + "grad_norm": 0.23874351382255554, + "learning_rate": 6.587473846201075e-05, + "loss": 1.8343, + "step": 13530 + }, + { + "epoch": 4.153161448741559, + "grad_norm": 0.4535207450389862, + "learning_rate": 6.587002500036936e-05, + "loss": 1.8301, + "step": 13531 + }, + { + "epoch": 4.153468385512585, + "grad_norm": 0.458003968000412, + "learning_rate": 6.586531138189108e-05, + "loss": 1.7053, + "step": 13532 + }, + { + "epoch": 4.153775322283609, + "grad_norm": 0.24350887537002563, + "learning_rate": 6.586059760662248e-05, + "loss": 1.7642, + "step": 13533 + }, + { + "epoch": 4.1540822590546345, + "grad_norm": 0.46951553225517273, + "learning_rate": 6.585588367461017e-05, + "loss": 1.7345, + "step": 13534 + }, + { + "epoch": 4.15438919582566, + "grad_norm": 0.5524527430534363, + "learning_rate": 6.585116958590072e-05, + "loss": 1.7677, + "step": 13535 + }, + { + "epoch": 4.154696132596685, + "grad_norm": 0.2887112498283386, + "learning_rate": 6.584645534054072e-05, + "loss": 1.7704, + "step": 13536 + }, + { + "epoch": 4.1550030693677105, + "grad_norm": 0.36243724822998047, + "learning_rate": 6.584174093857675e-05, + "loss": 1.8133, + "step": 13537 + }, + { + "epoch": 4.155310006138736, + "grad_norm": 0.3869550824165344, + "learning_rate": 6.583702638005543e-05, + "loss": 1.7253, + "step": 13538 + }, + { + "epoch": 4.15561694290976, + "grad_norm": 0.25859662890434265, + "learning_rate": 6.583231166502333e-05, + "loss": 1.7683, + "step": 13539 + }, + { + "epoch": 4.155923879680786, + "grad_norm": 0.3011144995689392, + "learning_rate": 6.582759679352704e-05, + "loss": 1.7139, + "step": 13540 + }, + { + "epoch": 
4.156230816451811, + "grad_norm": 0.38033372163772583, + "learning_rate": 6.582288176561316e-05, + "loss": 1.8182, + "step": 13541 + }, + { + "epoch": 4.156537753222836, + "grad_norm": 0.2224060595035553, + "learning_rate": 6.581816658132829e-05, + "loss": 1.7527, + "step": 13542 + }, + { + "epoch": 4.156844689993862, + "grad_norm": 0.4147234261035919, + "learning_rate": 6.581345124071903e-05, + "loss": 1.7339, + "step": 13543 + }, + { + "epoch": 4.157151626764886, + "grad_norm": 0.45334625244140625, + "learning_rate": 6.580873574383198e-05, + "loss": 1.8166, + "step": 13544 + }, + { + "epoch": 4.157458563535911, + "grad_norm": 0.3050530254840851, + "learning_rate": 6.580402009071372e-05, + "loss": 1.7967, + "step": 13545 + }, + { + "epoch": 4.157765500306937, + "grad_norm": 0.25901293754577637, + "learning_rate": 6.579930428141088e-05, + "loss": 1.7806, + "step": 13546 + }, + { + "epoch": 4.158072437077962, + "grad_norm": 0.3142934739589691, + "learning_rate": 6.579458831597006e-05, + "loss": 1.7724, + "step": 13547 + }, + { + "epoch": 4.158379373848987, + "grad_norm": 0.23943179845809937, + "learning_rate": 6.578987219443787e-05, + "loss": 1.7515, + "step": 13548 + }, + { + "epoch": 4.158686310620013, + "grad_norm": 0.2838635742664337, + "learning_rate": 6.578515591686089e-05, + "loss": 1.7707, + "step": 13549 + }, + { + "epoch": 4.158993247391037, + "grad_norm": 0.3064457178115845, + "learning_rate": 6.578043948328575e-05, + "loss": 1.7839, + "step": 13550 + }, + { + "epoch": 4.1593001841620625, + "grad_norm": 0.2311718463897705, + "learning_rate": 6.577572289375907e-05, + "loss": 1.8298, + "step": 13551 + }, + { + "epoch": 4.159607120933088, + "grad_norm": 0.35726481676101685, + "learning_rate": 6.577100614832743e-05, + "loss": 1.811, + "step": 13552 + }, + { + "epoch": 4.159914057704113, + "grad_norm": 0.3176140785217285, + "learning_rate": 6.576628924703749e-05, + "loss": 1.732, + "step": 13553 + }, + { + "epoch": 4.1602209944751385, + "grad_norm": 
0.2325647473335266, + "learning_rate": 6.576157218993582e-05, + "loss": 1.827, + "step": 13554 + }, + { + "epoch": 4.160527931246163, + "grad_norm": 0.32260453701019287, + "learning_rate": 6.575685497706905e-05, + "loss": 1.8218, + "step": 13555 + }, + { + "epoch": 4.160834868017188, + "grad_norm": 0.2638537287712097, + "learning_rate": 6.575213760848382e-05, + "loss": 1.7091, + "step": 13556 + }, + { + "epoch": 4.161141804788214, + "grad_norm": 0.2501799762248993, + "learning_rate": 6.574742008422671e-05, + "loss": 1.7707, + "step": 13557 + }, + { + "epoch": 4.161448741559239, + "grad_norm": 0.3212645649909973, + "learning_rate": 6.574270240434439e-05, + "loss": 1.7541, + "step": 13558 + }, + { + "epoch": 4.161755678330264, + "grad_norm": 0.25915586948394775, + "learning_rate": 6.573798456888345e-05, + "loss": 1.7597, + "step": 13559 + }, + { + "epoch": 4.162062615101289, + "grad_norm": 0.2538192868232727, + "learning_rate": 6.573326657789052e-05, + "loss": 1.8507, + "step": 13560 + }, + { + "epoch": 4.162369551872314, + "grad_norm": 0.2542131543159485, + "learning_rate": 6.572854843141223e-05, + "loss": 1.782, + "step": 13561 + }, + { + "epoch": 4.162676488643339, + "grad_norm": 0.26163414120674133, + "learning_rate": 6.572383012949521e-05, + "loss": 1.8482, + "step": 13562 + }, + { + "epoch": 4.162983425414365, + "grad_norm": 0.2566238343715668, + "learning_rate": 6.571911167218608e-05, + "loss": 1.7284, + "step": 13563 + }, + { + "epoch": 4.16329036218539, + "grad_norm": 0.28413113951683044, + "learning_rate": 6.571439305953147e-05, + "loss": 1.7473, + "step": 13564 + }, + { + "epoch": 4.163597298956415, + "grad_norm": 0.20399242639541626, + "learning_rate": 6.570967429157802e-05, + "loss": 1.6942, + "step": 13565 + }, + { + "epoch": 4.16390423572744, + "grad_norm": 0.256104439496994, + "learning_rate": 6.570495536837235e-05, + "loss": 1.7346, + "step": 13566 + }, + { + "epoch": 4.164211172498465, + "grad_norm": 0.350909560918808, + "learning_rate": 
6.570023628996112e-05, + "loss": 1.8284, + "step": 13567 + }, + { + "epoch": 4.1645181092694905, + "grad_norm": 0.23500367999076843, + "learning_rate": 6.569551705639096e-05, + "loss": 1.7504, + "step": 13568 + }, + { + "epoch": 4.164825046040516, + "grad_norm": 0.26683783531188965, + "learning_rate": 6.569079766770849e-05, + "loss": 1.7293, + "step": 13569 + }, + { + "epoch": 4.165131982811541, + "grad_norm": 0.3145855963230133, + "learning_rate": 6.568607812396037e-05, + "loss": 1.8171, + "step": 13570 + }, + { + "epoch": 4.165438919582566, + "grad_norm": 0.2354860156774521, + "learning_rate": 6.568135842519324e-05, + "loss": 1.7555, + "step": 13571 + }, + { + "epoch": 4.165745856353591, + "grad_norm": 0.2893243730068207, + "learning_rate": 6.56766385714537e-05, + "loss": 1.7636, + "step": 13572 + }, + { + "epoch": 4.166052793124616, + "grad_norm": 0.20707663893699646, + "learning_rate": 6.567191856278846e-05, + "loss": 1.7239, + "step": 13573 + }, + { + "epoch": 4.166359729895642, + "grad_norm": 0.34200331568717957, + "learning_rate": 6.566719839924412e-05, + "loss": 1.7848, + "step": 13574 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.23326615989208221, + "learning_rate": 6.566247808086734e-05, + "loss": 1.7447, + "step": 13575 + }, + { + "epoch": 4.166973603437691, + "grad_norm": 0.22375629842281342, + "learning_rate": 6.565775760770479e-05, + "loss": 1.7429, + "step": 13576 + }, + { + "epoch": 4.167280540208717, + "grad_norm": 0.2412862777709961, + "learning_rate": 6.565303697980308e-05, + "loss": 1.7671, + "step": 13577 + }, + { + "epoch": 4.167587476979742, + "grad_norm": 0.2482215315103531, + "learning_rate": 6.56483161972089e-05, + "loss": 1.812, + "step": 13578 + }, + { + "epoch": 4.167894413750767, + "grad_norm": 0.2252974659204483, + "learning_rate": 6.564359525996889e-05, + "loss": 1.8173, + "step": 13579 + }, + { + "epoch": 4.168201350521793, + "grad_norm": 0.23497292399406433, + "learning_rate": 6.563887416812969e-05, + "loss": 1.7945, + 
"step": 13580 + }, + { + "epoch": 4.168508287292818, + "grad_norm": 0.24911245703697205, + "learning_rate": 6.563415292173796e-05, + "loss": 1.7516, + "step": 13581 + }, + { + "epoch": 4.1688152240638425, + "grad_norm": 0.20920930802822113, + "learning_rate": 6.562943152084039e-05, + "loss": 1.765, + "step": 13582 + }, + { + "epoch": 4.169122160834868, + "grad_norm": 0.26001816987991333, + "learning_rate": 6.562470996548361e-05, + "loss": 1.7504, + "step": 13583 + }, + { + "epoch": 4.169429097605893, + "grad_norm": 0.2504529058933258, + "learning_rate": 6.561998825571429e-05, + "loss": 1.7689, + "step": 13584 + }, + { + "epoch": 4.1697360343769185, + "grad_norm": 0.2210187464952469, + "learning_rate": 6.561526639157908e-05, + "loss": 1.752, + "step": 13585 + }, + { + "epoch": 4.170042971147944, + "grad_norm": 0.26323240995407104, + "learning_rate": 6.561054437312467e-05, + "loss": 1.8104, + "step": 13586 + }, + { + "epoch": 4.170349907918968, + "grad_norm": 0.20436744391918182, + "learning_rate": 6.560582220039771e-05, + "loss": 1.7281, + "step": 13587 + }, + { + "epoch": 4.170656844689994, + "grad_norm": 0.2053878903388977, + "learning_rate": 6.560109987344487e-05, + "loss": 1.7192, + "step": 13588 + }, + { + "epoch": 4.170963781461019, + "grad_norm": 0.2416568547487259, + "learning_rate": 6.559637739231281e-05, + "loss": 1.7679, + "step": 13589 + }, + { + "epoch": 4.171270718232044, + "grad_norm": 0.23847989737987518, + "learning_rate": 6.55916547570482e-05, + "loss": 1.7182, + "step": 13590 + }, + { + "epoch": 4.17157765500307, + "grad_norm": 0.2057785540819168, + "learning_rate": 6.558693196769772e-05, + "loss": 1.816, + "step": 13591 + }, + { + "epoch": 4.171884591774095, + "grad_norm": 0.2270805537700653, + "learning_rate": 6.558220902430804e-05, + "loss": 1.7091, + "step": 13592 + }, + { + "epoch": 4.172191528545119, + "grad_norm": 0.22143644094467163, + "learning_rate": 6.557748592692585e-05, + "loss": 1.7446, + "step": 13593 + }, + { + "epoch": 
4.172498465316145, + "grad_norm": 0.2032770961523056, + "learning_rate": 6.557276267559781e-05, + "loss": 1.7501, + "step": 13594 + }, + { + "epoch": 4.17280540208717, + "grad_norm": 0.20851244032382965, + "learning_rate": 6.55680392703706e-05, + "loss": 1.8283, + "step": 13595 + }, + { + "epoch": 4.173112338858195, + "grad_norm": 0.2603934109210968, + "learning_rate": 6.55633157112909e-05, + "loss": 1.8523, + "step": 13596 + }, + { + "epoch": 4.173419275629221, + "grad_norm": 0.2232515811920166, + "learning_rate": 6.55585919984054e-05, + "loss": 1.7803, + "step": 13597 + }, + { + "epoch": 4.173726212400245, + "grad_norm": 0.2541115880012512, + "learning_rate": 6.555386813176075e-05, + "loss": 1.7407, + "step": 13598 + }, + { + "epoch": 4.1740331491712706, + "grad_norm": 0.3044603765010834, + "learning_rate": 6.55491441114037e-05, + "loss": 1.8257, + "step": 13599 + }, + { + "epoch": 4.174340085942296, + "grad_norm": 0.29227301478385925, + "learning_rate": 6.554441993738086e-05, + "loss": 1.7998, + "step": 13600 + }, + { + "epoch": 4.174647022713321, + "grad_norm": 0.25166594982147217, + "learning_rate": 6.553969560973896e-05, + "loss": 1.8258, + "step": 13601 + }, + { + "epoch": 4.1749539594843466, + "grad_norm": 0.22973991930484772, + "learning_rate": 6.55349711285247e-05, + "loss": 1.7871, + "step": 13602 + }, + { + "epoch": 4.175260896255371, + "grad_norm": 0.2615009844303131, + "learning_rate": 6.553024649378473e-05, + "loss": 1.7572, + "step": 13603 + }, + { + "epoch": 4.175567833026396, + "grad_norm": 0.24145473539829254, + "learning_rate": 6.552552170556576e-05, + "loss": 1.7546, + "step": 13604 + }, + { + "epoch": 4.175874769797422, + "grad_norm": 0.21989156305789948, + "learning_rate": 6.55207967639145e-05, + "loss": 1.6939, + "step": 13605 + }, + { + "epoch": 4.176181706568447, + "grad_norm": 0.206025168299675, + "learning_rate": 6.551607166887761e-05, + "loss": 1.7531, + "step": 13606 + }, + { + "epoch": 4.176488643339472, + "grad_norm": 
0.2175903469324112, + "learning_rate": 6.551134642050181e-05, + "loss": 1.7631, + "step": 13607 + }, + { + "epoch": 4.176795580110497, + "grad_norm": 0.23259282112121582, + "learning_rate": 6.550662101883379e-05, + "loss": 1.7773, + "step": 13608 + }, + { + "epoch": 4.177102516881522, + "grad_norm": 0.23955227434635162, + "learning_rate": 6.550189546392025e-05, + "loss": 1.7321, + "step": 13609 + }, + { + "epoch": 4.1774094536525475, + "grad_norm": 0.23614998161792755, + "learning_rate": 6.549716975580792e-05, + "loss": 1.7855, + "step": 13610 + }, + { + "epoch": 4.177716390423573, + "grad_norm": 0.2274426817893982, + "learning_rate": 6.549244389454345e-05, + "loss": 1.7778, + "step": 13611 + }, + { + "epoch": 4.178023327194598, + "grad_norm": 0.2204308807849884, + "learning_rate": 6.548771788017358e-05, + "loss": 1.7175, + "step": 13612 + }, + { + "epoch": 4.1783302639656235, + "grad_norm": 0.2283930778503418, + "learning_rate": 6.548299171274501e-05, + "loss": 1.8081, + "step": 13613 + }, + { + "epoch": 4.178637200736648, + "grad_norm": 0.25433486700057983, + "learning_rate": 6.547826539230442e-05, + "loss": 1.8009, + "step": 13614 + }, + { + "epoch": 4.178944137507673, + "grad_norm": 0.24452579021453857, + "learning_rate": 6.547353891889856e-05, + "loss": 1.7244, + "step": 13615 + }, + { + "epoch": 4.179251074278699, + "grad_norm": 0.20611275732517242, + "learning_rate": 6.546881229257411e-05, + "loss": 1.7566, + "step": 13616 + }, + { + "epoch": 4.179558011049724, + "grad_norm": 0.24557232856750488, + "learning_rate": 6.546408551337779e-05, + "loss": 1.7638, + "step": 13617 + }, + { + "epoch": 4.179864947820749, + "grad_norm": 0.2158801257610321, + "learning_rate": 6.545935858135631e-05, + "loss": 1.7659, + "step": 13618 + }, + { + "epoch": 4.180171884591774, + "grad_norm": 0.23800688982009888, + "learning_rate": 6.54546314965564e-05, + "loss": 1.7468, + "step": 13619 + }, + { + "epoch": 4.180478821362799, + "grad_norm": 0.2504122853279114, + "learning_rate": 
6.544990425902476e-05, + "loss": 1.7682, + "step": 13620 + }, + { + "epoch": 4.180785758133824, + "grad_norm": 0.21556814014911652, + "learning_rate": 6.54451768688081e-05, + "loss": 1.772, + "step": 13621 + }, + { + "epoch": 4.18109269490485, + "grad_norm": 0.23404552042484283, + "learning_rate": 6.544044932595315e-05, + "loss": 1.7844, + "step": 13622 + }, + { + "epoch": 4.181399631675875, + "grad_norm": 0.22129055857658386, + "learning_rate": 6.543572163050664e-05, + "loss": 1.7725, + "step": 13623 + }, + { + "epoch": 4.1817065684469, + "grad_norm": 0.2533521354198456, + "learning_rate": 6.543099378251528e-05, + "loss": 1.7908, + "step": 13624 + }, + { + "epoch": 4.182013505217925, + "grad_norm": 0.2905815541744232, + "learning_rate": 6.542626578202579e-05, + "loss": 1.7913, + "step": 13625 + }, + { + "epoch": 4.18232044198895, + "grad_norm": 0.3330783247947693, + "learning_rate": 6.54215376290849e-05, + "loss": 1.8374, + "step": 13626 + }, + { + "epoch": 4.1826273787599755, + "grad_norm": 0.29268717765808105, + "learning_rate": 6.541680932373933e-05, + "loss": 1.8714, + "step": 13627 + }, + { + "epoch": 4.182934315531001, + "grad_norm": 0.2820781171321869, + "learning_rate": 6.541208086603584e-05, + "loss": 1.8089, + "step": 13628 + }, + { + "epoch": 4.183241252302026, + "grad_norm": 0.3062323033809662, + "learning_rate": 6.54073522560211e-05, + "loss": 1.7307, + "step": 13629 + }, + { + "epoch": 4.183548189073051, + "grad_norm": 0.3010510504245758, + "learning_rate": 6.54026234937419e-05, + "loss": 1.7523, + "step": 13630 + }, + { + "epoch": 4.183855125844076, + "grad_norm": 0.21932095289230347, + "learning_rate": 6.539789457924493e-05, + "loss": 1.737, + "step": 13631 + }, + { + "epoch": 4.184162062615101, + "grad_norm": 0.2710212469100952, + "learning_rate": 6.539316551257695e-05, + "loss": 1.7228, + "step": 13632 + }, + { + "epoch": 4.184468999386127, + "grad_norm": 0.2885816991329193, + "learning_rate": 6.538843629378469e-05, + "loss": 1.8734, + "step": 
13633 + }, + { + "epoch": 4.184775936157152, + "grad_norm": 0.2621026635169983, + "learning_rate": 6.538370692291487e-05, + "loss": 1.7884, + "step": 13634 + }, + { + "epoch": 4.185082872928176, + "grad_norm": 0.30503126978874207, + "learning_rate": 6.537897740001426e-05, + "loss": 1.7833, + "step": 13635 + }, + { + "epoch": 4.185389809699202, + "grad_norm": 0.29491373896598816, + "learning_rate": 6.537424772512955e-05, + "loss": 1.7894, + "step": 13636 + }, + { + "epoch": 4.185696746470227, + "grad_norm": 0.24423296749591827, + "learning_rate": 6.536951789830754e-05, + "loss": 1.7409, + "step": 13637 + }, + { + "epoch": 4.186003683241252, + "grad_norm": 0.2184748351573944, + "learning_rate": 6.536478791959495e-05, + "loss": 1.747, + "step": 13638 + }, + { + "epoch": 4.186310620012278, + "grad_norm": 0.2348455935716629, + "learning_rate": 6.53600577890385e-05, + "loss": 1.7422, + "step": 13639 + }, + { + "epoch": 4.186617556783303, + "grad_norm": 0.2554566264152527, + "learning_rate": 6.535532750668497e-05, + "loss": 1.7623, + "step": 13640 + }, + { + "epoch": 4.1869244935543275, + "grad_norm": 0.26424553990364075, + "learning_rate": 6.535059707258109e-05, + "loss": 1.8408, + "step": 13641 + }, + { + "epoch": 4.187231430325353, + "grad_norm": 0.35363274812698364, + "learning_rate": 6.534586648677361e-05, + "loss": 1.7435, + "step": 13642 + }, + { + "epoch": 4.187538367096378, + "grad_norm": 0.3225265443325043, + "learning_rate": 6.534113574930926e-05, + "loss": 1.7181, + "step": 13643 + }, + { + "epoch": 4.1878453038674035, + "grad_norm": 0.23529650270938873, + "learning_rate": 6.533640486023485e-05, + "loss": 1.7712, + "step": 13644 + }, + { + "epoch": 4.188152240638429, + "grad_norm": 0.3490132987499237, + "learning_rate": 6.53316738195971e-05, + "loss": 1.7329, + "step": 13645 + }, + { + "epoch": 4.188459177409453, + "grad_norm": 0.3759285509586334, + "learning_rate": 6.532694262744274e-05, + "loss": 1.802, + "step": 13646 + }, + { + "epoch": 4.188766114180479, 
+ "grad_norm": 0.27383577823638916, + "learning_rate": 6.532221128381858e-05, + "loss": 1.801, + "step": 13647 + }, + { + "epoch": 4.189073050951504, + "grad_norm": 0.23240652680397034, + "learning_rate": 6.531747978877132e-05, + "loss": 1.8415, + "step": 13648 + }, + { + "epoch": 4.189379987722529, + "grad_norm": 0.3302704989910126, + "learning_rate": 6.531274814234773e-05, + "loss": 1.7765, + "step": 13649 + }, + { + "epoch": 4.189686924493555, + "grad_norm": 0.3209368586540222, + "learning_rate": 6.530801634459463e-05, + "loss": 1.6935, + "step": 13650 + }, + { + "epoch": 4.189993861264579, + "grad_norm": 0.26643648743629456, + "learning_rate": 6.530328439555872e-05, + "loss": 1.8159, + "step": 13651 + }, + { + "epoch": 4.190300798035604, + "grad_norm": 0.22594431042671204, + "learning_rate": 6.529855229528679e-05, + "loss": 1.7764, + "step": 13652 + }, + { + "epoch": 4.19060773480663, + "grad_norm": 0.3288109302520752, + "learning_rate": 6.529382004382561e-05, + "loss": 1.7963, + "step": 13653 + }, + { + "epoch": 4.190914671577655, + "grad_norm": 0.3067106604576111, + "learning_rate": 6.528908764122191e-05, + "loss": 1.7564, + "step": 13654 + }, + { + "epoch": 4.19122160834868, + "grad_norm": 0.23437078297138214, + "learning_rate": 6.528435508752249e-05, + "loss": 1.759, + "step": 13655 + }, + { + "epoch": 4.191528545119706, + "grad_norm": 0.30662333965301514, + "learning_rate": 6.527962238277413e-05, + "loss": 1.7549, + "step": 13656 + }, + { + "epoch": 4.19183548189073, + "grad_norm": 0.3545009195804596, + "learning_rate": 6.527488952702356e-05, + "loss": 1.7761, + "step": 13657 + }, + { + "epoch": 4.1921424186617555, + "grad_norm": 0.2509438991546631, + "learning_rate": 6.52701565203176e-05, + "loss": 1.7162, + "step": 13658 + }, + { + "epoch": 4.192449355432781, + "grad_norm": 0.24423806369304657, + "learning_rate": 6.5265423362703e-05, + "loss": 1.735, + "step": 13659 + }, + { + "epoch": 4.192756292203806, + "grad_norm": 0.37365156412124634, + 
"learning_rate": 6.526069005422654e-05, + "loss": 1.7697, + "step": 13660 + }, + { + "epoch": 4.1930632289748315, + "grad_norm": 0.4025731682777405, + "learning_rate": 6.525595659493499e-05, + "loss": 1.7931, + "step": 13661 + }, + { + "epoch": 4.193370165745856, + "grad_norm": 0.31360915303230286, + "learning_rate": 6.525122298487514e-05, + "loss": 1.8014, + "step": 13662 + }, + { + "epoch": 4.193677102516881, + "grad_norm": 0.2480524778366089, + "learning_rate": 6.524648922409376e-05, + "loss": 1.7753, + "step": 13663 + }, + { + "epoch": 4.193984039287907, + "grad_norm": 0.33740919828414917, + "learning_rate": 6.524175531263765e-05, + "loss": 1.7296, + "step": 13664 + }, + { + "epoch": 4.194290976058932, + "grad_norm": 0.26871639490127563, + "learning_rate": 6.523702125055358e-05, + "loss": 1.7113, + "step": 13665 + }, + { + "epoch": 4.194597912829957, + "grad_norm": 0.2687455415725708, + "learning_rate": 6.52322870378883e-05, + "loss": 1.7645, + "step": 13666 + }, + { + "epoch": 4.194904849600983, + "grad_norm": 0.4207400679588318, + "learning_rate": 6.522755267468868e-05, + "loss": 1.7758, + "step": 13667 + }, + { + "epoch": 4.195211786372007, + "grad_norm": 0.36043494939804077, + "learning_rate": 6.522281816100142e-05, + "loss": 1.7433, + "step": 13668 + }, + { + "epoch": 4.195518723143032, + "grad_norm": 0.2515890598297119, + "learning_rate": 6.52180834968734e-05, + "loss": 1.7646, + "step": 13669 + }, + { + "epoch": 4.195825659914058, + "grad_norm": 0.2871458828449249, + "learning_rate": 6.521334868235132e-05, + "loss": 1.8147, + "step": 13670 + }, + { + "epoch": 4.196132596685083, + "grad_norm": 0.28454354405403137, + "learning_rate": 6.5208613717482e-05, + "loss": 1.8576, + "step": 13671 + }, + { + "epoch": 4.196439533456108, + "grad_norm": 0.2520541548728943, + "learning_rate": 6.520387860231227e-05, + "loss": 1.7513, + "step": 13672 + }, + { + "epoch": 4.196746470227133, + "grad_norm": 0.22782307863235474, + "learning_rate": 6.51991433368889e-05, + 
"loss": 1.7737, + "step": 13673 + }, + { + "epoch": 4.197053406998158, + "grad_norm": 0.2451259195804596, + "learning_rate": 6.519440792125869e-05, + "loss": 1.7483, + "step": 13674 + }, + { + "epoch": 4.1973603437691835, + "grad_norm": 0.21915963292121887, + "learning_rate": 6.518967235546841e-05, + "loss": 1.718, + "step": 13675 + }, + { + "epoch": 4.197667280540209, + "grad_norm": 0.23005805909633636, + "learning_rate": 6.51849366395649e-05, + "loss": 1.7786, + "step": 13676 + }, + { + "epoch": 4.197974217311234, + "grad_norm": 0.25039517879486084, + "learning_rate": 6.518020077359494e-05, + "loss": 1.7785, + "step": 13677 + }, + { + "epoch": 4.198281154082259, + "grad_norm": 0.26631081104278564, + "learning_rate": 6.517546475760535e-05, + "loss": 1.7921, + "step": 13678 + }, + { + "epoch": 4.198588090853284, + "grad_norm": 0.2220793515443802, + "learning_rate": 6.517072859164292e-05, + "loss": 1.7696, + "step": 13679 + }, + { + "epoch": 4.198895027624309, + "grad_norm": 0.24681030213832855, + "learning_rate": 6.516599227575446e-05, + "loss": 1.7702, + "step": 13680 + }, + { + "epoch": 4.199201964395335, + "grad_norm": 0.2421828955411911, + "learning_rate": 6.516125580998678e-05, + "loss": 1.8058, + "step": 13681 + }, + { + "epoch": 4.19950890116636, + "grad_norm": 0.2170087695121765, + "learning_rate": 6.515651919438667e-05, + "loss": 1.7271, + "step": 13682 + }, + { + "epoch": 4.199815837937384, + "grad_norm": 0.23383566737174988, + "learning_rate": 6.515178242900096e-05, + "loss": 1.7515, + "step": 13683 + }, + { + "epoch": 4.20012277470841, + "grad_norm": 0.2522997558116913, + "learning_rate": 6.514704551387645e-05, + "loss": 1.7619, + "step": 13684 + }, + { + "epoch": 4.200429711479435, + "grad_norm": 0.20973703265190125, + "learning_rate": 6.514230844905995e-05, + "loss": 1.7326, + "step": 13685 + }, + { + "epoch": 4.2007366482504604, + "grad_norm": 0.2308073341846466, + "learning_rate": 6.513757123459832e-05, + "loss": 1.811, + "step": 13686 + }, + { + 
"epoch": 4.201043585021486, + "grad_norm": 0.21751229465007782, + "learning_rate": 6.51328338705383e-05, + "loss": 1.7795, + "step": 13687 + }, + { + "epoch": 4.201350521792511, + "grad_norm": 0.2357407957315445, + "learning_rate": 6.512809635692675e-05, + "loss": 1.8069, + "step": 13688 + }, + { + "epoch": 4.201657458563536, + "grad_norm": 0.32245033979415894, + "learning_rate": 6.51233586938105e-05, + "loss": 1.8179, + "step": 13689 + }, + { + "epoch": 4.201964395334561, + "grad_norm": 0.22740167379379272, + "learning_rate": 6.511862088123635e-05, + "loss": 1.7482, + "step": 13690 + }, + { + "epoch": 4.202271332105586, + "grad_norm": 0.26880496740341187, + "learning_rate": 6.511388291925114e-05, + "loss": 1.7919, + "step": 13691 + }, + { + "epoch": 4.202578268876612, + "grad_norm": 0.2261822521686554, + "learning_rate": 6.510914480790166e-05, + "loss": 1.7543, + "step": 13692 + }, + { + "epoch": 4.202885205647637, + "grad_norm": 0.2635782063007355, + "learning_rate": 6.510440654723477e-05, + "loss": 1.7874, + "step": 13693 + }, + { + "epoch": 4.203192142418661, + "grad_norm": 0.2505982518196106, + "learning_rate": 6.509966813729726e-05, + "loss": 1.8016, + "step": 13694 + }, + { + "epoch": 4.203499079189687, + "grad_norm": 0.23177236318588257, + "learning_rate": 6.5094929578136e-05, + "loss": 1.7582, + "step": 13695 + }, + { + "epoch": 4.203806015960712, + "grad_norm": 0.2315056324005127, + "learning_rate": 6.509019086979779e-05, + "loss": 1.7418, + "step": 13696 + }, + { + "epoch": 4.204112952731737, + "grad_norm": 0.25565484166145325, + "learning_rate": 6.508545201232947e-05, + "loss": 1.7476, + "step": 13697 + }, + { + "epoch": 4.204419889502763, + "grad_norm": 0.29210081696510315, + "learning_rate": 6.508071300577787e-05, + "loss": 1.8397, + "step": 13698 + }, + { + "epoch": 4.204726826273788, + "grad_norm": 0.2830582559108734, + "learning_rate": 6.507597385018984e-05, + "loss": 1.834, + "step": 13699 + }, + { + "epoch": 4.2050337630448125, + "grad_norm": 
0.23013398051261902, + "learning_rate": 6.507123454561217e-05, + "loss": 1.7593, + "step": 13700 + }, + { + "epoch": 4.205340699815838, + "grad_norm": 0.21970276534557343, + "learning_rate": 6.506649509209174e-05, + "loss": 1.754, + "step": 13701 + }, + { + "epoch": 4.205647636586863, + "grad_norm": 0.32052233815193176, + "learning_rate": 6.50617554896754e-05, + "loss": 1.7531, + "step": 13702 + }, + { + "epoch": 4.2059545733578885, + "grad_norm": 0.2597332000732422, + "learning_rate": 6.505701573840995e-05, + "loss": 1.7836, + "step": 13703 + }, + { + "epoch": 4.206261510128914, + "grad_norm": 0.22070355713367462, + "learning_rate": 6.505227583834224e-05, + "loss": 1.7225, + "step": 13704 + }, + { + "epoch": 4.206568446899938, + "grad_norm": 0.27219358086586, + "learning_rate": 6.50475357895191e-05, + "loss": 1.8215, + "step": 13705 + }, + { + "epoch": 4.206875383670964, + "grad_norm": 0.32541659474372864, + "learning_rate": 6.504279559198741e-05, + "loss": 1.7786, + "step": 13706 + }, + { + "epoch": 4.207182320441989, + "grad_norm": 0.25871729850769043, + "learning_rate": 6.5038055245794e-05, + "loss": 1.7621, + "step": 13707 + }, + { + "epoch": 4.207489257213014, + "grad_norm": 0.2190464735031128, + "learning_rate": 6.50333147509857e-05, + "loss": 1.7612, + "step": 13708 + }, + { + "epoch": 4.20779619398404, + "grad_norm": 0.19565832614898682, + "learning_rate": 6.50285741076094e-05, + "loss": 1.7581, + "step": 13709 + }, + { + "epoch": 4.208103130755064, + "grad_norm": 0.1889251321554184, + "learning_rate": 6.50238333157119e-05, + "loss": 1.7611, + "step": 13710 + }, + { + "epoch": 4.208410067526089, + "grad_norm": 0.2013053596019745, + "learning_rate": 6.501909237534008e-05, + "loss": 1.7393, + "step": 13711 + }, + { + "epoch": 4.208717004297115, + "grad_norm": 0.1899433434009552, + "learning_rate": 6.501435128654077e-05, + "loss": 1.7122, + "step": 13712 + }, + { + "epoch": 4.20902394106814, + "grad_norm": 0.19337882101535797, + "learning_rate": 
6.500961004936085e-05, + "loss": 1.7538, + "step": 13713 + }, + { + "epoch": 4.209330877839165, + "grad_norm": 0.20419920980930328, + "learning_rate": 6.500486866384718e-05, + "loss": 1.728, + "step": 13714 + }, + { + "epoch": 4.209637814610191, + "grad_norm": 0.20615679025650024, + "learning_rate": 6.50001271300466e-05, + "loss": 1.7843, + "step": 13715 + }, + { + "epoch": 4.209944751381215, + "grad_norm": 0.22178977727890015, + "learning_rate": 6.499538544800596e-05, + "loss": 1.7751, + "step": 13716 + }, + { + "epoch": 4.2102516881522405, + "grad_norm": 0.23703891038894653, + "learning_rate": 6.499064361777214e-05, + "loss": 1.7304, + "step": 13717 + }, + { + "epoch": 4.210558624923266, + "grad_norm": 0.2785723805427551, + "learning_rate": 6.498590163939198e-05, + "loss": 1.802, + "step": 13718 + }, + { + "epoch": 4.210865561694291, + "grad_norm": 0.23277060687541962, + "learning_rate": 6.498115951291237e-05, + "loss": 1.7316, + "step": 13719 + }, + { + "epoch": 4.2111724984653165, + "grad_norm": 0.22289474308490753, + "learning_rate": 6.497641723838017e-05, + "loss": 1.8469, + "step": 13720 + }, + { + "epoch": 4.211479435236341, + "grad_norm": 0.2715846002101898, + "learning_rate": 6.497167481584221e-05, + "loss": 1.7919, + "step": 13721 + }, + { + "epoch": 4.211786372007366, + "grad_norm": 0.29262226819992065, + "learning_rate": 6.49669322453454e-05, + "loss": 1.8379, + "step": 13722 + }, + { + "epoch": 4.212093308778392, + "grad_norm": 0.29136186838150024, + "learning_rate": 6.49621895269366e-05, + "loss": 1.789, + "step": 13723 + }, + { + "epoch": 4.212400245549417, + "grad_norm": 0.25110194087028503, + "learning_rate": 6.495744666066266e-05, + "loss": 1.7574, + "step": 13724 + }, + { + "epoch": 4.212707182320442, + "grad_norm": 0.2301366776227951, + "learning_rate": 6.495270364657048e-05, + "loss": 1.7637, + "step": 13725 + }, + { + "epoch": 4.213014119091467, + "grad_norm": 0.2556478977203369, + "learning_rate": 6.49479604847069e-05, + "loss": 1.7975, + 
"step": 13726 + }, + { + "epoch": 4.213321055862492, + "grad_norm": 0.2645667493343353, + "learning_rate": 6.494321717511884e-05, + "loss": 1.7594, + "step": 13727 + }, + { + "epoch": 4.213627992633517, + "grad_norm": 0.23664188385009766, + "learning_rate": 6.493847371785312e-05, + "loss": 1.7963, + "step": 13728 + }, + { + "epoch": 4.213934929404543, + "grad_norm": 0.2947930693626404, + "learning_rate": 6.493373011295665e-05, + "loss": 1.7477, + "step": 13729 + }, + { + "epoch": 4.214241866175568, + "grad_norm": 0.34598737955093384, + "learning_rate": 6.492898636047631e-05, + "loss": 1.7014, + "step": 13730 + }, + { + "epoch": 4.214548802946593, + "grad_norm": 0.24406935274600983, + "learning_rate": 6.4924242460459e-05, + "loss": 1.7436, + "step": 13731 + }, + { + "epoch": 4.214855739717618, + "grad_norm": 0.27176225185394287, + "learning_rate": 6.491949841295156e-05, + "loss": 1.8429, + "step": 13732 + }, + { + "epoch": 4.215162676488643, + "grad_norm": 0.2506968080997467, + "learning_rate": 6.491475421800089e-05, + "loss": 1.7519, + "step": 13733 + }, + { + "epoch": 4.2154696132596685, + "grad_norm": 0.2240980863571167, + "learning_rate": 6.491000987565387e-05, + "loss": 1.7595, + "step": 13734 + }, + { + "epoch": 4.215776550030694, + "grad_norm": 0.23201732337474823, + "learning_rate": 6.490526538595741e-05, + "loss": 1.7466, + "step": 13735 + }, + { + "epoch": 4.216083486801719, + "grad_norm": 0.24624750018119812, + "learning_rate": 6.490052074895836e-05, + "loss": 1.7364, + "step": 13736 + }, + { + "epoch": 4.216390423572744, + "grad_norm": 0.22936980426311493, + "learning_rate": 6.489577596470366e-05, + "loss": 1.7095, + "step": 13737 + }, + { + "epoch": 4.216697360343769, + "grad_norm": 0.2106638103723526, + "learning_rate": 6.489103103324016e-05, + "loss": 1.7387, + "step": 13738 + }, + { + "epoch": 4.217004297114794, + "grad_norm": 0.2936140298843384, + "learning_rate": 6.488628595461477e-05, + "loss": 1.9129, + "step": 13739 + }, + { + "epoch": 
4.21731123388582, + "grad_norm": 0.21871696412563324, + "learning_rate": 6.488154072887435e-05, + "loss": 1.7489, + "step": 13740 + }, + { + "epoch": 4.217618170656845, + "grad_norm": 0.25941070914268494, + "learning_rate": 6.487679535606583e-05, + "loss": 1.7788, + "step": 13741 + }, + { + "epoch": 4.21792510742787, + "grad_norm": 0.2540862560272217, + "learning_rate": 6.487204983623612e-05, + "loss": 1.8074, + "step": 13742 + }, + { + "epoch": 4.218232044198895, + "grad_norm": 0.25180327892303467, + "learning_rate": 6.486730416943207e-05, + "loss": 1.7503, + "step": 13743 + }, + { + "epoch": 4.21853898096992, + "grad_norm": 0.26625585556030273, + "learning_rate": 6.486255835570063e-05, + "loss": 1.8149, + "step": 13744 + }, + { + "epoch": 4.218845917740945, + "grad_norm": 0.3023914396762848, + "learning_rate": 6.485781239508867e-05, + "loss": 1.8599, + "step": 13745 + }, + { + "epoch": 4.219152854511971, + "grad_norm": 0.2683780789375305, + "learning_rate": 6.48530662876431e-05, + "loss": 1.7911, + "step": 13746 + }, + { + "epoch": 4.219459791282996, + "grad_norm": 0.20747442543506622, + "learning_rate": 6.484832003341081e-05, + "loss": 1.7343, + "step": 13747 + }, + { + "epoch": 4.2197667280540205, + "grad_norm": 0.29284465312957764, + "learning_rate": 6.484357363243873e-05, + "loss": 1.7917, + "step": 13748 + }, + { + "epoch": 4.220073664825046, + "grad_norm": 0.24303840100765228, + "learning_rate": 6.483882708477376e-05, + "loss": 1.7921, + "step": 13749 + }, + { + "epoch": 4.220380601596071, + "grad_norm": 0.26253026723861694, + "learning_rate": 6.48340803904628e-05, + "loss": 1.7971, + "step": 13750 + }, + { + "epoch": 4.2206875383670965, + "grad_norm": 0.23888511955738068, + "learning_rate": 6.482933354955275e-05, + "loss": 1.7967, + "step": 13751 + }, + { + "epoch": 4.220994475138122, + "grad_norm": 0.24966883659362793, + "learning_rate": 6.482458656209054e-05, + "loss": 1.7924, + "step": 13752 + }, + { + "epoch": 4.221301411909146, + "grad_norm": 
0.26556864380836487, + "learning_rate": 6.481983942812309e-05, + "loss": 1.8608, + "step": 13753 + }, + { + "epoch": 4.221608348680172, + "grad_norm": 0.29064711928367615, + "learning_rate": 6.48150921476973e-05, + "loss": 1.7785, + "step": 13754 + }, + { + "epoch": 4.221915285451197, + "grad_norm": 0.30876123905181885, + "learning_rate": 6.481034472086008e-05, + "loss": 1.8287, + "step": 13755 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.2622467875480652, + "learning_rate": 6.480559714765835e-05, + "loss": 1.8336, + "step": 13756 + }, + { + "epoch": 4.222529158993248, + "grad_norm": 0.2502644956111908, + "learning_rate": 6.480084942813902e-05, + "loss": 1.7803, + "step": 13757 + }, + { + "epoch": 4.222836095764273, + "grad_norm": 0.2879922688007355, + "learning_rate": 6.479610156234903e-05, + "loss": 1.7544, + "step": 13758 + }, + { + "epoch": 4.223143032535297, + "grad_norm": 0.2831384241580963, + "learning_rate": 6.47913535503353e-05, + "loss": 1.887, + "step": 13759 + }, + { + "epoch": 4.223449969306323, + "grad_norm": 0.3221064805984497, + "learning_rate": 6.478660539214474e-05, + "loss": 1.7455, + "step": 13760 + }, + { + "epoch": 4.223756906077348, + "grad_norm": 0.4231930673122406, + "learning_rate": 6.478185708782427e-05, + "loss": 1.8209, + "step": 13761 + }, + { + "epoch": 4.224063842848373, + "grad_norm": 0.34327802062034607, + "learning_rate": 6.477710863742083e-05, + "loss": 1.7754, + "step": 13762 + }, + { + "epoch": 4.224370779619399, + "grad_norm": 0.21713349223136902, + "learning_rate": 6.477236004098135e-05, + "loss": 1.7576, + "step": 13763 + }, + { + "epoch": 4.224677716390423, + "grad_norm": 0.3262602388858795, + "learning_rate": 6.476761129855275e-05, + "loss": 1.7772, + "step": 13764 + }, + { + "epoch": 4.2249846531614486, + "grad_norm": 0.3231413662433624, + "learning_rate": 6.476286241018195e-05, + "loss": 1.7821, + "step": 13765 + }, + { + "epoch": 4.225291589932474, + "grad_norm": 0.2440098226070404, + "learning_rate": 
6.475811337591588e-05, + "loss": 1.7684, + "step": 13766 + }, + { + "epoch": 4.225598526703499, + "grad_norm": 0.329949289560318, + "learning_rate": 6.475336419580151e-05, + "loss": 1.8564, + "step": 13767 + }, + { + "epoch": 4.225905463474525, + "grad_norm": 0.3567483425140381, + "learning_rate": 6.474861486988574e-05, + "loss": 1.7625, + "step": 13768 + }, + { + "epoch": 4.226212400245549, + "grad_norm": 0.25257283449172974, + "learning_rate": 6.47438653982155e-05, + "loss": 1.823, + "step": 13769 + }, + { + "epoch": 4.226519337016574, + "grad_norm": 0.31542617082595825, + "learning_rate": 6.473911578083776e-05, + "loss": 1.7817, + "step": 13770 + }, + { + "epoch": 4.2268262737876, + "grad_norm": 0.29670149087905884, + "learning_rate": 6.473436601779944e-05, + "loss": 1.7493, + "step": 13771 + }, + { + "epoch": 4.227133210558625, + "grad_norm": 0.2635453939437866, + "learning_rate": 6.472961610914745e-05, + "loss": 1.792, + "step": 13772 + }, + { + "epoch": 4.22744014732965, + "grad_norm": 0.25017979741096497, + "learning_rate": 6.472486605492878e-05, + "loss": 1.7183, + "step": 13773 + }, + { + "epoch": 4.227747084100676, + "grad_norm": 0.3766646087169647, + "learning_rate": 6.472011585519034e-05, + "loss": 1.8039, + "step": 13774 + }, + { + "epoch": 4.2280540208717, + "grad_norm": 0.29860204458236694, + "learning_rate": 6.47153655099791e-05, + "loss": 1.8016, + "step": 13775 + }, + { + "epoch": 4.2283609576427255, + "grad_norm": 0.2540898323059082, + "learning_rate": 6.4710615019342e-05, + "loss": 1.8481, + "step": 13776 + }, + { + "epoch": 4.228667894413751, + "grad_norm": 0.3677786886692047, + "learning_rate": 6.470586438332597e-05, + "loss": 1.7663, + "step": 13777 + }, + { + "epoch": 4.228974831184776, + "grad_norm": 0.35693466663360596, + "learning_rate": 6.470111360197797e-05, + "loss": 1.7733, + "step": 13778 + }, + { + "epoch": 4.2292817679558015, + "grad_norm": 0.23747926950454712, + "learning_rate": 6.469636267534496e-05, + "loss": 1.7938, + "step": 
13779 + }, + { + "epoch": 4.229588704726826, + "grad_norm": 0.32890695333480835, + "learning_rate": 6.469161160347386e-05, + "loss": 1.7233, + "step": 13780 + }, + { + "epoch": 4.229895641497851, + "grad_norm": 0.3437706530094147, + "learning_rate": 6.468686038641164e-05, + "loss": 1.7716, + "step": 13781 + }, + { + "epoch": 4.230202578268877, + "grad_norm": 0.23452162742614746, + "learning_rate": 6.468210902420527e-05, + "loss": 1.764, + "step": 13782 + }, + { + "epoch": 4.230509515039902, + "grad_norm": 0.3205265402793884, + "learning_rate": 6.46773575169017e-05, + "loss": 1.7464, + "step": 13783 + }, + { + "epoch": 4.230816451810927, + "grad_norm": 0.4234732985496521, + "learning_rate": 6.467260586454787e-05, + "loss": 1.7786, + "step": 13784 + }, + { + "epoch": 4.231123388581952, + "grad_norm": 0.2484128773212433, + "learning_rate": 6.466785406719076e-05, + "loss": 1.8125, + "step": 13785 + }, + { + "epoch": 4.231430325352977, + "grad_norm": 0.3696556091308594, + "learning_rate": 6.46631021248773e-05, + "loss": 1.7974, + "step": 13786 + }, + { + "epoch": 4.231737262124002, + "grad_norm": 0.4251437485218048, + "learning_rate": 6.465835003765449e-05, + "loss": 1.7486, + "step": 13787 + }, + { + "epoch": 4.232044198895028, + "grad_norm": 0.2507621943950653, + "learning_rate": 6.465359780556927e-05, + "loss": 1.829, + "step": 13788 + }, + { + "epoch": 4.232351135666053, + "grad_norm": 0.2911818325519562, + "learning_rate": 6.464884542866861e-05, + "loss": 1.7401, + "step": 13789 + }, + { + "epoch": 4.232658072437078, + "grad_norm": 0.35354506969451904, + "learning_rate": 6.464409290699946e-05, + "loss": 1.7848, + "step": 13790 + }, + { + "epoch": 4.232965009208103, + "grad_norm": 0.2659081518650055, + "learning_rate": 6.46393402406088e-05, + "loss": 1.7408, + "step": 13791 + }, + { + "epoch": 4.233271945979128, + "grad_norm": 0.22676481306552887, + "learning_rate": 6.46345874295436e-05, + "loss": 1.7542, + "step": 13792 + }, + { + "epoch": 4.2335788827501535, + 
"grad_norm": 0.2549789845943451, + "learning_rate": 6.462983447385085e-05, + "loss": 1.8095, + "step": 13793 + }, + { + "epoch": 4.233885819521179, + "grad_norm": 0.2157238870859146, + "learning_rate": 6.462508137357748e-05, + "loss": 1.7529, + "step": 13794 + }, + { + "epoch": 4.234192756292204, + "grad_norm": 0.2494724988937378, + "learning_rate": 6.46203281287705e-05, + "loss": 1.7839, + "step": 13795 + }, + { + "epoch": 4.234499693063229, + "grad_norm": 0.29560065269470215, + "learning_rate": 6.461557473947685e-05, + "loss": 1.7239, + "step": 13796 + }, + { + "epoch": 4.234806629834254, + "grad_norm": 0.23693916201591492, + "learning_rate": 6.461082120574354e-05, + "loss": 1.8074, + "step": 13797 + }, + { + "epoch": 4.235113566605279, + "grad_norm": 0.2538869082927704, + "learning_rate": 6.460606752761752e-05, + "loss": 1.8319, + "step": 13798 + }, + { + "epoch": 4.235420503376305, + "grad_norm": 0.3186401426792145, + "learning_rate": 6.460131370514578e-05, + "loss": 1.7877, + "step": 13799 + }, + { + "epoch": 4.23572744014733, + "grad_norm": 0.2473619133234024, + "learning_rate": 6.45965597383753e-05, + "loss": 1.8323, + "step": 13800 + }, + { + "epoch": 4.236034376918354, + "grad_norm": 0.32806503772735596, + "learning_rate": 6.459180562735307e-05, + "loss": 1.744, + "step": 13801 + }, + { + "epoch": 4.23634131368938, + "grad_norm": 0.3975784480571747, + "learning_rate": 6.458705137212606e-05, + "loss": 1.7216, + "step": 13802 + }, + { + "epoch": 4.236648250460405, + "grad_norm": 0.2946135997772217, + "learning_rate": 6.458229697274125e-05, + "loss": 1.8781, + "step": 13803 + }, + { + "epoch": 4.23695518723143, + "grad_norm": 0.25109192728996277, + "learning_rate": 6.457754242924565e-05, + "loss": 1.7458, + "step": 13804 + }, + { + "epoch": 4.237262124002456, + "grad_norm": 0.2763883173465729, + "learning_rate": 6.457278774168623e-05, + "loss": 1.7612, + "step": 13805 + }, + { + "epoch": 4.237569060773481, + "grad_norm": 0.22427856922149658, + 
"learning_rate": 6.456803291010996e-05, + "loss": 1.8049, + "step": 13806 + }, + { + "epoch": 4.2378759975445055, + "grad_norm": 0.28295788168907166, + "learning_rate": 6.456327793456387e-05, + "loss": 1.7608, + "step": 13807 + }, + { + "epoch": 4.238182934315531, + "grad_norm": 0.27857527136802673, + "learning_rate": 6.455852281509493e-05, + "loss": 1.7281, + "step": 13808 + }, + { + "epoch": 4.238489871086556, + "grad_norm": 0.24014849960803986, + "learning_rate": 6.455376755175012e-05, + "loss": 1.7247, + "step": 13809 + }, + { + "epoch": 4.2387968078575815, + "grad_norm": 0.25149038434028625, + "learning_rate": 6.454901214457646e-05, + "loss": 1.8575, + "step": 13810 + }, + { + "epoch": 4.239103744628607, + "grad_norm": 0.32072681188583374, + "learning_rate": 6.454425659362093e-05, + "loss": 1.7421, + "step": 13811 + }, + { + "epoch": 4.239410681399631, + "grad_norm": 0.28418242931365967, + "learning_rate": 6.453950089893054e-05, + "loss": 1.7031, + "step": 13812 + }, + { + "epoch": 4.239717618170657, + "grad_norm": 0.23725132644176483, + "learning_rate": 6.453474506055228e-05, + "loss": 1.7901, + "step": 13813 + }, + { + "epoch": 4.240024554941682, + "grad_norm": 0.3056317865848541, + "learning_rate": 6.452998907853315e-05, + "loss": 1.7414, + "step": 13814 + }, + { + "epoch": 4.240331491712707, + "grad_norm": 0.3111891448497772, + "learning_rate": 6.452523295292013e-05, + "loss": 1.7532, + "step": 13815 + }, + { + "epoch": 4.240638428483733, + "grad_norm": 0.2126779705286026, + "learning_rate": 6.452047668376027e-05, + "loss": 1.6779, + "step": 13816 + }, + { + "epoch": 4.240945365254758, + "grad_norm": 0.26660779118537903, + "learning_rate": 6.451572027110054e-05, + "loss": 1.7162, + "step": 13817 + }, + { + "epoch": 4.241252302025782, + "grad_norm": 0.25901922583580017, + "learning_rate": 6.451096371498794e-05, + "loss": 1.7784, + "step": 13818 + }, + { + "epoch": 4.241559238796808, + "grad_norm": 0.24091807007789612, + "learning_rate": 
6.450620701546953e-05, + "loss": 1.7928, + "step": 13819 + }, + { + "epoch": 4.241866175567833, + "grad_norm": 0.25097009539604187, + "learning_rate": 6.450145017259225e-05, + "loss": 1.761, + "step": 13820 + }, + { + "epoch": 4.242173112338858, + "grad_norm": 0.22978942096233368, + "learning_rate": 6.449669318640315e-05, + "loss": 1.7891, + "step": 13821 + }, + { + "epoch": 4.242480049109884, + "grad_norm": 0.27255937457084656, + "learning_rate": 6.449193605694923e-05, + "loss": 1.7964, + "step": 13822 + }, + { + "epoch": 4.242786985880908, + "grad_norm": 0.2210773378610611, + "learning_rate": 6.44871787842775e-05, + "loss": 1.7628, + "step": 13823 + }, + { + "epoch": 4.2430939226519335, + "grad_norm": 0.25784751772880554, + "learning_rate": 6.448242136843497e-05, + "loss": 1.7596, + "step": 13824 + }, + { + "epoch": 4.243400859422959, + "grad_norm": 0.23475486040115356, + "learning_rate": 6.447766380946868e-05, + "loss": 1.8174, + "step": 13825 + }, + { + "epoch": 4.243707796193984, + "grad_norm": 0.2567705512046814, + "learning_rate": 6.447290610742561e-05, + "loss": 1.737, + "step": 13826 + }, + { + "epoch": 4.2440147329650095, + "grad_norm": 0.23973144590854645, + "learning_rate": 6.446814826235281e-05, + "loss": 1.7881, + "step": 13827 + }, + { + "epoch": 4.244321669736034, + "grad_norm": 0.25584739446640015, + "learning_rate": 6.446339027429729e-05, + "loss": 1.7673, + "step": 13828 + }, + { + "epoch": 4.244628606507059, + "grad_norm": 0.2653748393058777, + "learning_rate": 6.445863214330608e-05, + "loss": 1.7443, + "step": 13829 + }, + { + "epoch": 4.244935543278085, + "grad_norm": 0.2492038607597351, + "learning_rate": 6.445387386942619e-05, + "loss": 1.7223, + "step": 13830 + }, + { + "epoch": 4.24524248004911, + "grad_norm": 0.2282228320837021, + "learning_rate": 6.444911545270464e-05, + "loss": 1.7577, + "step": 13831 + }, + { + "epoch": 4.245549416820135, + "grad_norm": 0.2411092072725296, + "learning_rate": 6.444435689318845e-05, + "loss": 1.7324, + 
"step": 13832 + }, + { + "epoch": 4.245856353591161, + "grad_norm": 0.21557089686393738, + "learning_rate": 6.443959819092468e-05, + "loss": 1.7355, + "step": 13833 + }, + { + "epoch": 4.246163290362185, + "grad_norm": 0.2500394880771637, + "learning_rate": 6.443483934596033e-05, + "loss": 1.775, + "step": 13834 + }, + { + "epoch": 4.24647022713321, + "grad_norm": 0.24135248363018036, + "learning_rate": 6.443008035834244e-05, + "loss": 1.7885, + "step": 13835 + }, + { + "epoch": 4.246777163904236, + "grad_norm": 0.22860904037952423, + "learning_rate": 6.442532122811803e-05, + "loss": 1.7891, + "step": 13836 + }, + { + "epoch": 4.247084100675261, + "grad_norm": 0.2277665138244629, + "learning_rate": 6.442056195533415e-05, + "loss": 1.7583, + "step": 13837 + }, + { + "epoch": 4.247391037446286, + "grad_norm": 0.22822454571723938, + "learning_rate": 6.441580254003782e-05, + "loss": 1.7777, + "step": 13838 + }, + { + "epoch": 4.247697974217311, + "grad_norm": 0.24274896085262299, + "learning_rate": 6.441104298227608e-05, + "loss": 1.7537, + "step": 13839 + }, + { + "epoch": 4.248004910988336, + "grad_norm": 0.25080999732017517, + "learning_rate": 6.440628328209598e-05, + "loss": 1.7537, + "step": 13840 + }, + { + "epoch": 4.2483118477593615, + "grad_norm": 0.22409579157829285, + "learning_rate": 6.440152343954453e-05, + "loss": 1.7652, + "step": 13841 + }, + { + "epoch": 4.248618784530387, + "grad_norm": 0.24028798937797546, + "learning_rate": 6.439676345466877e-05, + "loss": 1.7512, + "step": 13842 + }, + { + "epoch": 4.248925721301412, + "grad_norm": 0.28739503026008606, + "learning_rate": 6.439200332751576e-05, + "loss": 1.8034, + "step": 13843 + }, + { + "epoch": 4.249232658072437, + "grad_norm": 0.2244807928800583, + "learning_rate": 6.438724305813255e-05, + "loss": 1.7243, + "step": 13844 + }, + { + "epoch": 4.249539594843462, + "grad_norm": 0.24478118121623993, + "learning_rate": 6.438248264656618e-05, + "loss": 1.7754, + "step": 13845 + }, + { + "epoch": 
4.249846531614487, + "grad_norm": 0.25554370880126953, + "learning_rate": 6.437772209286368e-05, + "loss": 1.7845, + "step": 13846 + }, + { + "epoch": 4.250153468385513, + "grad_norm": 0.24478472769260406, + "learning_rate": 6.43729613970721e-05, + "loss": 1.7954, + "step": 13847 + }, + { + "epoch": 4.250460405156538, + "grad_norm": 0.22287282347679138, + "learning_rate": 6.436820055923849e-05, + "loss": 1.7379, + "step": 13848 + }, + { + "epoch": 4.250767341927563, + "grad_norm": 0.2810569703578949, + "learning_rate": 6.43634395794099e-05, + "loss": 1.8492, + "step": 13849 + }, + { + "epoch": 4.251074278698588, + "grad_norm": 0.2544163465499878, + "learning_rate": 6.435867845763337e-05, + "loss": 1.7846, + "step": 13850 + }, + { + "epoch": 4.251381215469613, + "grad_norm": 0.27879175543785095, + "learning_rate": 6.435391719395598e-05, + "loss": 1.767, + "step": 13851 + }, + { + "epoch": 4.2516881522406385, + "grad_norm": 0.2876715362071991, + "learning_rate": 6.434915578842477e-05, + "loss": 1.8048, + "step": 13852 + }, + { + "epoch": 4.251995089011664, + "grad_norm": 0.27844297885894775, + "learning_rate": 6.434439424108678e-05, + "loss": 1.7472, + "step": 13853 + }, + { + "epoch": 4.252302025782689, + "grad_norm": 0.2417020946741104, + "learning_rate": 6.43396325519891e-05, + "loss": 1.8481, + "step": 13854 + }, + { + "epoch": 4.252608962553714, + "grad_norm": 0.23828522861003876, + "learning_rate": 6.433487072117874e-05, + "loss": 1.7536, + "step": 13855 + }, + { + "epoch": 4.252915899324739, + "grad_norm": 0.22304333746433258, + "learning_rate": 6.43301087487028e-05, + "loss": 1.741, + "step": 13856 + }, + { + "epoch": 4.253222836095764, + "grad_norm": 0.27089163661003113, + "learning_rate": 6.432534663460832e-05, + "loss": 1.7974, + "step": 13857 + }, + { + "epoch": 4.25352977286679, + "grad_norm": 0.2439592182636261, + "learning_rate": 6.432058437894237e-05, + "loss": 1.7713, + "step": 13858 + }, + { + "epoch": 4.253836709637815, + "grad_norm": 
0.2368553727865219, + "learning_rate": 6.431582198175203e-05, + "loss": 1.6915, + "step": 13859 + }, + { + "epoch": 4.25414364640884, + "grad_norm": 0.25248441100120544, + "learning_rate": 6.431105944308431e-05, + "loss": 1.7286, + "step": 13860 + }, + { + "epoch": 4.254450583179865, + "grad_norm": 0.20928484201431274, + "learning_rate": 6.430629676298634e-05, + "loss": 1.79, + "step": 13861 + }, + { + "epoch": 4.25475751995089, + "grad_norm": 0.25262540578842163, + "learning_rate": 6.430153394150514e-05, + "loss": 1.7443, + "step": 13862 + }, + { + "epoch": 4.255064456721915, + "grad_norm": 0.27508237957954407, + "learning_rate": 6.429677097868783e-05, + "loss": 1.8207, + "step": 13863 + }, + { + "epoch": 4.255371393492941, + "grad_norm": 0.28129303455352783, + "learning_rate": 6.429200787458141e-05, + "loss": 1.7589, + "step": 13864 + }, + { + "epoch": 4.255678330263966, + "grad_norm": 0.3205658495426178, + "learning_rate": 6.428724462923302e-05, + "loss": 1.8037, + "step": 13865 + }, + { + "epoch": 4.2559852670349905, + "grad_norm": 0.24048078060150146, + "learning_rate": 6.428248124268969e-05, + "loss": 1.7303, + "step": 13866 + }, + { + "epoch": 4.256292203806016, + "grad_norm": 0.24742475152015686, + "learning_rate": 6.427771771499852e-05, + "loss": 1.7753, + "step": 13867 + }, + { + "epoch": 4.256599140577041, + "grad_norm": 0.3082354962825775, + "learning_rate": 6.427295404620656e-05, + "loss": 1.7275, + "step": 13868 + }, + { + "epoch": 4.2569060773480665, + "grad_norm": 0.23319822549819946, + "learning_rate": 6.426819023636093e-05, + "loss": 1.7562, + "step": 13869 + }, + { + "epoch": 4.257213014119092, + "grad_norm": 0.2611405551433563, + "learning_rate": 6.426342628550866e-05, + "loss": 1.7417, + "step": 13870 + }, + { + "epoch": 4.257519950890116, + "grad_norm": 0.2577543258666992, + "learning_rate": 6.425866219369686e-05, + "loss": 1.6906, + "step": 13871 + }, + { + "epoch": 4.257826887661142, + "grad_norm": 0.31353357434272766, + "learning_rate": 
6.42538979609726e-05, + "loss": 1.7155, + "step": 13872 + }, + { + "epoch": 4.258133824432167, + "grad_norm": 0.23280073702335358, + "learning_rate": 6.424913358738296e-05, + "loss": 1.7576, + "step": 13873 + }, + { + "epoch": 4.258440761203192, + "grad_norm": 0.24087542295455933, + "learning_rate": 6.424436907297504e-05, + "loss": 1.7622, + "step": 13874 + }, + { + "epoch": 4.258747697974218, + "grad_norm": 0.3146509826183319, + "learning_rate": 6.42396044177959e-05, + "loss": 1.769, + "step": 13875 + }, + { + "epoch": 4.259054634745242, + "grad_norm": 0.2645811438560486, + "learning_rate": 6.423483962189268e-05, + "loss": 1.7713, + "step": 13876 + }, + { + "epoch": 4.259361571516267, + "grad_norm": 0.2166455090045929, + "learning_rate": 6.423007468531238e-05, + "loss": 1.7705, + "step": 13877 + }, + { + "epoch": 4.259668508287293, + "grad_norm": 0.29142528772354126, + "learning_rate": 6.422530960810217e-05, + "loss": 1.7725, + "step": 13878 + }, + { + "epoch": 4.259975445058318, + "grad_norm": 0.28777652978897095, + "learning_rate": 6.422054439030911e-05, + "loss": 1.7853, + "step": 13879 + }, + { + "epoch": 4.260282381829343, + "grad_norm": 0.2285117357969284, + "learning_rate": 6.42157790319803e-05, + "loss": 1.7034, + "step": 13880 + }, + { + "epoch": 4.260589318600369, + "grad_norm": 0.32407644391059875, + "learning_rate": 6.421101353316282e-05, + "loss": 1.7858, + "step": 13881 + }, + { + "epoch": 4.260896255371393, + "grad_norm": 0.4803469777107239, + "learning_rate": 6.420624789390378e-05, + "loss": 1.7337, + "step": 13882 + }, + { + "epoch": 4.2612031921424185, + "grad_norm": 0.4245823919773102, + "learning_rate": 6.420148211425027e-05, + "loss": 1.8024, + "step": 13883 + }, + { + "epoch": 4.261510128913444, + "grad_norm": 0.22298674285411835, + "learning_rate": 6.419671619424938e-05, + "loss": 1.7129, + "step": 13884 + }, + { + "epoch": 4.261817065684469, + "grad_norm": 0.46955862641334534, + "learning_rate": 6.419195013394824e-05, + "loss": 1.7151, + 
"step": 13885 + }, + { + "epoch": 4.2621240024554945, + "grad_norm": 0.4809224009513855, + "learning_rate": 6.418718393339392e-05, + "loss": 1.7697, + "step": 13886 + }, + { + "epoch": 4.262430939226519, + "grad_norm": 0.2741130292415619, + "learning_rate": 6.418241759263353e-05, + "loss": 1.8133, + "step": 13887 + }, + { + "epoch": 4.262737875997544, + "grad_norm": 0.3673117756843567, + "learning_rate": 6.417765111171419e-05, + "loss": 1.7424, + "step": 13888 + }, + { + "epoch": 4.26304481276857, + "grad_norm": 0.4609327018260956, + "learning_rate": 6.417288449068299e-05, + "loss": 1.741, + "step": 13889 + }, + { + "epoch": 4.263351749539595, + "grad_norm": 0.2929460406303406, + "learning_rate": 6.416811772958702e-05, + "loss": 1.8385, + "step": 13890 + }, + { + "epoch": 4.26365868631062, + "grad_norm": 0.2727305293083191, + "learning_rate": 6.416335082847342e-05, + "loss": 1.794, + "step": 13891 + }, + { + "epoch": 4.263965623081646, + "grad_norm": 0.26089411973953247, + "learning_rate": 6.41585837873893e-05, + "loss": 1.7907, + "step": 13892 + }, + { + "epoch": 4.26427255985267, + "grad_norm": 0.24655573070049286, + "learning_rate": 6.415381660638174e-05, + "loss": 1.7481, + "step": 13893 + }, + { + "epoch": 4.264579496623695, + "grad_norm": 0.4186919629573822, + "learning_rate": 6.414904928549787e-05, + "loss": 1.8021, + "step": 13894 + }, + { + "epoch": 4.264886433394721, + "grad_norm": 0.38188236951828003, + "learning_rate": 6.414428182478478e-05, + "loss": 1.75, + "step": 13895 + }, + { + "epoch": 4.265193370165746, + "grad_norm": 0.23686440289020538, + "learning_rate": 6.413951422428963e-05, + "loss": 1.7882, + "step": 13896 + }, + { + "epoch": 4.265500306936771, + "grad_norm": 0.35963737964630127, + "learning_rate": 6.413474648405952e-05, + "loss": 1.7427, + "step": 13897 + }, + { + "epoch": 4.265807243707796, + "grad_norm": 0.38558289408683777, + "learning_rate": 6.412997860414155e-05, + "loss": 1.7622, + "step": 13898 + }, + { + "epoch": 
4.266114180478821, + "grad_norm": 0.2311459481716156, + "learning_rate": 6.412521058458285e-05, + "loss": 1.7894, + "step": 13899 + }, + { + "epoch": 4.2664211172498465, + "grad_norm": 0.2647818624973297, + "learning_rate": 6.412044242543054e-05, + "loss": 1.7399, + "step": 13900 + }, + { + "epoch": 4.266728054020872, + "grad_norm": 0.3174133002758026, + "learning_rate": 6.411567412673174e-05, + "loss": 1.7552, + "step": 13901 + }, + { + "epoch": 4.267034990791897, + "grad_norm": 0.25207316875457764, + "learning_rate": 6.411090568853358e-05, + "loss": 1.7876, + "step": 13902 + }, + { + "epoch": 4.267341927562922, + "grad_norm": 0.24549202620983124, + "learning_rate": 6.410613711088317e-05, + "loss": 1.8554, + "step": 13903 + }, + { + "epoch": 4.267648864333947, + "grad_norm": 0.26293641328811646, + "learning_rate": 6.410136839382765e-05, + "loss": 1.8553, + "step": 13904 + }, + { + "epoch": 4.267955801104972, + "grad_norm": 0.20258362591266632, + "learning_rate": 6.409659953741416e-05, + "loss": 1.7205, + "step": 13905 + }, + { + "epoch": 4.268262737875998, + "grad_norm": 0.24885907769203186, + "learning_rate": 6.409183054168979e-05, + "loss": 1.7718, + "step": 13906 + }, + { + "epoch": 4.268569674647023, + "grad_norm": 0.22737209498882294, + "learning_rate": 6.408706140670169e-05, + "loss": 1.7228, + "step": 13907 + }, + { + "epoch": 4.268876611418047, + "grad_norm": 0.2201235145330429, + "learning_rate": 6.4082292132497e-05, + "loss": 1.7451, + "step": 13908 + }, + { + "epoch": 4.269183548189073, + "grad_norm": 0.24108454585075378, + "learning_rate": 6.407752271912285e-05, + "loss": 1.7531, + "step": 13909 + }, + { + "epoch": 4.269490484960098, + "grad_norm": 0.21723641455173492, + "learning_rate": 6.407275316662636e-05, + "loss": 1.7139, + "step": 13910 + }, + { + "epoch": 4.269797421731123, + "grad_norm": 0.22557848691940308, + "learning_rate": 6.406798347505469e-05, + "loss": 1.7633, + "step": 13911 + }, + { + "epoch": 4.270104358502149, + "grad_norm": 
0.24664700031280518, + "learning_rate": 6.406321364445494e-05, + "loss": 1.7854, + "step": 13912 + }, + { + "epoch": 4.270411295273174, + "grad_norm": 0.2599056661128998, + "learning_rate": 6.405844367487428e-05, + "loss": 1.7662, + "step": 13913 + }, + { + "epoch": 4.2707182320441985, + "grad_norm": 0.2378663718700409, + "learning_rate": 6.405367356635982e-05, + "loss": 1.7477, + "step": 13914 + }, + { + "epoch": 4.271025168815224, + "grad_norm": 0.27158626914024353, + "learning_rate": 6.404890331895876e-05, + "loss": 1.7426, + "step": 13915 + }, + { + "epoch": 4.271332105586249, + "grad_norm": 0.28585317730903625, + "learning_rate": 6.404413293271818e-05, + "loss": 1.7492, + "step": 13916 + }, + { + "epoch": 4.2716390423572745, + "grad_norm": 0.2321750968694687, + "learning_rate": 6.403936240768526e-05, + "loss": 1.8594, + "step": 13917 + }, + { + "epoch": 4.2719459791283, + "grad_norm": 0.25824111700057983, + "learning_rate": 6.40345917439071e-05, + "loss": 1.7622, + "step": 13918 + }, + { + "epoch": 4.272252915899324, + "grad_norm": 0.24641194939613342, + "learning_rate": 6.40298209414309e-05, + "loss": 1.7519, + "step": 13919 + }, + { + "epoch": 4.27255985267035, + "grad_norm": 0.2132398933172226, + "learning_rate": 6.40250500003038e-05, + "loss": 1.7339, + "step": 13920 + }, + { + "epoch": 4.272866789441375, + "grad_norm": 0.22630736231803894, + "learning_rate": 6.402027892057292e-05, + "loss": 1.7396, + "step": 13921 + }, + { + "epoch": 4.2731737262124, + "grad_norm": 0.295163631439209, + "learning_rate": 6.401550770228543e-05, + "loss": 1.8063, + "step": 13922 + }, + { + "epoch": 4.273480662983426, + "grad_norm": 0.2722746729850769, + "learning_rate": 6.401073634548848e-05, + "loss": 1.7775, + "step": 13923 + }, + { + "epoch": 4.273787599754451, + "grad_norm": 0.23201976716518402, + "learning_rate": 6.400596485022922e-05, + "loss": 1.7755, + "step": 13924 + }, + { + "epoch": 4.274094536525475, + "grad_norm": 0.23880761861801147, + "learning_rate": 
6.40011932165548e-05, + "loss": 1.778, + "step": 13925 + }, + { + "epoch": 4.274401473296501, + "grad_norm": 0.22305625677108765, + "learning_rate": 6.399642144451239e-05, + "loss": 1.761, + "step": 13926 + }, + { + "epoch": 4.274708410067526, + "grad_norm": 0.21874886751174927, + "learning_rate": 6.399164953414914e-05, + "loss": 1.7148, + "step": 13927 + }, + { + "epoch": 4.2750153468385514, + "grad_norm": 0.2003604918718338, + "learning_rate": 6.398687748551221e-05, + "loss": 1.8049, + "step": 13928 + }, + { + "epoch": 4.275322283609577, + "grad_norm": 0.2443511188030243, + "learning_rate": 6.398210529864875e-05, + "loss": 1.782, + "step": 13929 + }, + { + "epoch": 4.275629220380601, + "grad_norm": 0.2297198623418808, + "learning_rate": 6.397733297360594e-05, + "loss": 1.7682, + "step": 13930 + }, + { + "epoch": 4.275936157151627, + "grad_norm": 0.23474562168121338, + "learning_rate": 6.39725605104309e-05, + "loss": 1.7809, + "step": 13931 + }, + { + "epoch": 4.276243093922652, + "grad_norm": 0.25908544659614563, + "learning_rate": 6.396778790917087e-05, + "loss": 1.7343, + "step": 13932 + }, + { + "epoch": 4.276550030693677, + "grad_norm": 0.2440379112958908, + "learning_rate": 6.396301516987295e-05, + "loss": 1.786, + "step": 13933 + }, + { + "epoch": 4.276856967464703, + "grad_norm": 0.26185858249664307, + "learning_rate": 6.395824229258435e-05, + "loss": 1.7863, + "step": 13934 + }, + { + "epoch": 4.277163904235728, + "grad_norm": 0.24470919370651245, + "learning_rate": 6.39534692773522e-05, + "loss": 1.7774, + "step": 13935 + }, + { + "epoch": 4.277470841006752, + "grad_norm": 0.2612632215023041, + "learning_rate": 6.39486961242237e-05, + "loss": 1.7536, + "step": 13936 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.26870301365852356, + "learning_rate": 6.3943922833246e-05, + "loss": 1.8177, + "step": 13937 + }, + { + "epoch": 4.278084714548803, + "grad_norm": 0.24445784091949463, + "learning_rate": 6.393914940446628e-05, + "loss": 1.7539, + "step": 
13938 + }, + { + "epoch": 4.278391651319828, + "grad_norm": 0.2622319757938385, + "learning_rate": 6.393437583793174e-05, + "loss": 1.8252, + "step": 13939 + }, + { + "epoch": 4.278698588090854, + "grad_norm": 0.2586652636528015, + "learning_rate": 6.39296021336895e-05, + "loss": 1.7975, + "step": 13940 + }, + { + "epoch": 4.279005524861878, + "grad_norm": 0.19488228857517242, + "learning_rate": 6.392482829178678e-05, + "loss": 1.7678, + "step": 13941 + }, + { + "epoch": 4.2793124616329035, + "grad_norm": 0.23956604301929474, + "learning_rate": 6.392005431227074e-05, + "loss": 1.7444, + "step": 13942 + }, + { + "epoch": 4.279619398403929, + "grad_norm": 0.24195842444896698, + "learning_rate": 6.391528019518857e-05, + "loss": 1.8116, + "step": 13943 + }, + { + "epoch": 4.279926335174954, + "grad_norm": 0.21479523181915283, + "learning_rate": 6.391050594058746e-05, + "loss": 1.7351, + "step": 13944 + }, + { + "epoch": 4.2802332719459795, + "grad_norm": 0.2309941202402115, + "learning_rate": 6.390573154851456e-05, + "loss": 1.8245, + "step": 13945 + }, + { + "epoch": 4.280540208717004, + "grad_norm": 0.2375536412000656, + "learning_rate": 6.390095701901706e-05, + "loss": 1.7921, + "step": 13946 + }, + { + "epoch": 4.280847145488029, + "grad_norm": 0.25518664717674255, + "learning_rate": 6.389618235214216e-05, + "loss": 1.7549, + "step": 13947 + }, + { + "epoch": 4.281154082259055, + "grad_norm": 0.2579016089439392, + "learning_rate": 6.389140754793705e-05, + "loss": 1.7637, + "step": 13948 + }, + { + "epoch": 4.28146101903008, + "grad_norm": 0.25350916385650635, + "learning_rate": 6.388663260644892e-05, + "loss": 1.746, + "step": 13949 + }, + { + "epoch": 4.281767955801105, + "grad_norm": 0.2994026839733124, + "learning_rate": 6.388185752772493e-05, + "loss": 1.8196, + "step": 13950 + }, + { + "epoch": 4.28207489257213, + "grad_norm": 0.29938533902168274, + "learning_rate": 6.387708231181229e-05, + "loss": 1.7187, + "step": 13951 + }, + { + "epoch": 4.282381829343155, 
+ "grad_norm": 0.23865137994289398, + "learning_rate": 6.387230695875819e-05, + "loss": 1.7317, + "step": 13952 + }, + { + "epoch": 4.28268876611418, + "grad_norm": 0.23812857270240784, + "learning_rate": 6.386753146860982e-05, + "loss": 1.7536, + "step": 13953 + }, + { + "epoch": 4.282995702885206, + "grad_norm": 0.3395650088787079, + "learning_rate": 6.386275584141438e-05, + "loss": 1.7932, + "step": 13954 + }, + { + "epoch": 4.283302639656231, + "grad_norm": 0.38207507133483887, + "learning_rate": 6.385798007721906e-05, + "loss": 1.8196, + "step": 13955 + }, + { + "epoch": 4.283609576427256, + "grad_norm": 0.32960978150367737, + "learning_rate": 6.385320417607107e-05, + "loss": 1.7898, + "step": 13956 + }, + { + "epoch": 4.283916513198281, + "grad_norm": 0.22978928685188293, + "learning_rate": 6.384842813801757e-05, + "loss": 1.7835, + "step": 13957 + }, + { + "epoch": 4.284223449969306, + "grad_norm": 0.24607588350772858, + "learning_rate": 6.38436519631058e-05, + "loss": 1.7829, + "step": 13958 + }, + { + "epoch": 4.2845303867403315, + "grad_norm": 0.2770270109176636, + "learning_rate": 6.383887565138295e-05, + "loss": 1.7294, + "step": 13959 + }, + { + "epoch": 4.284837323511357, + "grad_norm": 0.27644863724708557, + "learning_rate": 6.383409920289622e-05, + "loss": 1.829, + "step": 13960 + }, + { + "epoch": 4.285144260282382, + "grad_norm": 0.3870919942855835, + "learning_rate": 6.382932261769282e-05, + "loss": 1.8146, + "step": 13961 + }, + { + "epoch": 4.285451197053407, + "grad_norm": 0.3562348186969757, + "learning_rate": 6.382454589581994e-05, + "loss": 1.8225, + "step": 13962 + }, + { + "epoch": 4.285758133824432, + "grad_norm": 0.28444886207580566, + "learning_rate": 6.38197690373248e-05, + "loss": 1.7734, + "step": 13963 + }, + { + "epoch": 4.286065070595457, + "grad_norm": 0.27935758233070374, + "learning_rate": 6.381499204225459e-05, + "loss": 1.7402, + "step": 13964 + }, + { + "epoch": 4.286372007366483, + "grad_norm": 0.34188997745513916, + 
"learning_rate": 6.381021491065653e-05, + "loss": 1.7661, + "step": 13965 + }, + { + "epoch": 4.286678944137508, + "grad_norm": 0.28648918867111206, + "learning_rate": 6.380543764257785e-05, + "loss": 1.8312, + "step": 13966 + }, + { + "epoch": 4.286985880908533, + "grad_norm": 0.2733290493488312, + "learning_rate": 6.380066023806572e-05, + "loss": 1.7505, + "step": 13967 + }, + { + "epoch": 4.287292817679558, + "grad_norm": 0.3344273865222931, + "learning_rate": 6.37958826971674e-05, + "loss": 1.8392, + "step": 13968 + }, + { + "epoch": 4.287599754450583, + "grad_norm": 0.2655799090862274, + "learning_rate": 6.379110501993006e-05, + "loss": 1.7575, + "step": 13969 + }, + { + "epoch": 4.287906691221608, + "grad_norm": 0.2569151818752289, + "learning_rate": 6.378632720640095e-05, + "loss": 1.6619, + "step": 13970 + }, + { + "epoch": 4.288213627992634, + "grad_norm": 0.2477198988199234, + "learning_rate": 6.378154925662727e-05, + "loss": 1.7532, + "step": 13971 + }, + { + "epoch": 4.288520564763659, + "grad_norm": 0.2867630422115326, + "learning_rate": 6.377677117065624e-05, + "loss": 1.7725, + "step": 13972 + }, + { + "epoch": 4.2888275015346835, + "grad_norm": 0.28316137194633484, + "learning_rate": 6.37719929485351e-05, + "loss": 1.7628, + "step": 13973 + }, + { + "epoch": 4.289134438305709, + "grad_norm": 0.2934304475784302, + "learning_rate": 6.376721459031106e-05, + "loss": 1.7346, + "step": 13974 + }, + { + "epoch": 4.289441375076734, + "grad_norm": 0.22847147285938263, + "learning_rate": 6.376243609603129e-05, + "loss": 1.7409, + "step": 13975 + }, + { + "epoch": 4.2897483118477595, + "grad_norm": 0.360441118478775, + "learning_rate": 6.375765746574311e-05, + "loss": 1.808, + "step": 13976 + }, + { + "epoch": 4.290055248618785, + "grad_norm": 0.2750907242298126, + "learning_rate": 6.375287869949367e-05, + "loss": 1.8046, + "step": 13977 + }, + { + "epoch": 4.290362185389809, + "grad_norm": 0.26193201541900635, + "learning_rate": 6.374809979733022e-05, + 
"loss": 1.7097, + "step": 13978 + }, + { + "epoch": 4.290669122160835, + "grad_norm": 0.3282175064086914, + "learning_rate": 6.37433207593e-05, + "loss": 1.7924, + "step": 13979 + }, + { + "epoch": 4.29097605893186, + "grad_norm": 0.2845167815685272, + "learning_rate": 6.373854158545021e-05, + "loss": 1.7663, + "step": 13980 + }, + { + "epoch": 4.291282995702885, + "grad_norm": 0.21816621720790863, + "learning_rate": 6.37337622758281e-05, + "loss": 1.7368, + "step": 13981 + }, + { + "epoch": 4.291589932473911, + "grad_norm": 0.264272540807724, + "learning_rate": 6.372898283048094e-05, + "loss": 1.7377, + "step": 13982 + }, + { + "epoch": 4.291896869244935, + "grad_norm": 0.2182006686925888, + "learning_rate": 6.37242032494559e-05, + "loss": 1.8107, + "step": 13983 + }, + { + "epoch": 4.29220380601596, + "grad_norm": 0.26856422424316406, + "learning_rate": 6.371942353280023e-05, + "loss": 1.7708, + "step": 13984 + }, + { + "epoch": 4.292510742786986, + "grad_norm": 0.3025323748588562, + "learning_rate": 6.37146436805612e-05, + "loss": 1.7768, + "step": 13985 + }, + { + "epoch": 4.292817679558011, + "grad_norm": 0.2949144244194031, + "learning_rate": 6.3709863692786e-05, + "loss": 1.7848, + "step": 13986 + }, + { + "epoch": 4.293124616329036, + "grad_norm": 0.20670418441295624, + "learning_rate": 6.370508356952188e-05, + "loss": 1.7367, + "step": 13987 + }, + { + "epoch": 4.293431553100062, + "grad_norm": 0.2453860342502594, + "learning_rate": 6.370030331081611e-05, + "loss": 1.7246, + "step": 13988 + }, + { + "epoch": 4.293738489871086, + "grad_norm": 0.3413507044315338, + "learning_rate": 6.369552291671592e-05, + "loss": 1.7829, + "step": 13989 + }, + { + "epoch": 4.2940454266421115, + "grad_norm": 0.28352782130241394, + "learning_rate": 6.369074238726856e-05, + "loss": 1.7755, + "step": 13990 + }, + { + "epoch": 4.294352363413137, + "grad_norm": 0.21408751606941223, + "learning_rate": 6.368596172252124e-05, + "loss": 1.7292, + "step": 13991 + }, + { + "epoch": 
4.294659300184162, + "grad_norm": 0.28372085094451904, + "learning_rate": 6.36811809225212e-05, + "loss": 1.8197, + "step": 13992 + }, + { + "epoch": 4.2949662369551875, + "grad_norm": 0.2400829792022705, + "learning_rate": 6.367639998731573e-05, + "loss": 1.7559, + "step": 13993 + }, + { + "epoch": 4.295273173726212, + "grad_norm": 0.22853593528270721, + "learning_rate": 6.367161891695207e-05, + "loss": 1.8116, + "step": 13994 + }, + { + "epoch": 4.295580110497237, + "grad_norm": 0.22098208963871002, + "learning_rate": 6.366683771147745e-05, + "loss": 1.7269, + "step": 13995 + }, + { + "epoch": 4.295887047268263, + "grad_norm": 0.22293934226036072, + "learning_rate": 6.366205637093914e-05, + "loss": 1.7944, + "step": 13996 + }, + { + "epoch": 4.296193984039288, + "grad_norm": 0.26120004057884216, + "learning_rate": 6.365727489538437e-05, + "loss": 1.7581, + "step": 13997 + }, + { + "epoch": 4.296500920810313, + "grad_norm": 0.2568937838077545, + "learning_rate": 6.365249328486041e-05, + "loss": 1.7356, + "step": 13998 + }, + { + "epoch": 4.296807857581339, + "grad_norm": 0.2419043630361557, + "learning_rate": 6.364771153941449e-05, + "loss": 1.8127, + "step": 13999 + }, + { + "epoch": 4.297114794352363, + "grad_norm": 0.2521972060203552, + "learning_rate": 6.364292965909391e-05, + "loss": 1.7445, + "step": 14000 + }, + { + "epoch": 4.297421731123388, + "grad_norm": 0.3269292414188385, + "learning_rate": 6.363814764394589e-05, + "loss": 1.7835, + "step": 14001 + }, + { + "epoch": 4.297728667894414, + "grad_norm": 0.258405864238739, + "learning_rate": 6.36333654940177e-05, + "loss": 1.7407, + "step": 14002 + }, + { + "epoch": 4.298035604665439, + "grad_norm": 0.21527236700057983, + "learning_rate": 6.362858320935662e-05, + "loss": 1.7729, + "step": 14003 + }, + { + "epoch": 4.298342541436464, + "grad_norm": 0.25343602895736694, + "learning_rate": 6.362380079000988e-05, + "loss": 1.8087, + "step": 14004 + }, + { + "epoch": 4.298649478207489, + "grad_norm": 
0.26110637187957764, + "learning_rate": 6.361901823602474e-05, + "loss": 1.813, + "step": 14005 + }, + { + "epoch": 4.298956414978514, + "grad_norm": 0.26749926805496216, + "learning_rate": 6.361423554744851e-05, + "loss": 1.8193, + "step": 14006 + }, + { + "epoch": 4.2992633517495396, + "grad_norm": 0.22357676923274994, + "learning_rate": 6.360945272432841e-05, + "loss": 1.7498, + "step": 14007 + }, + { + "epoch": 4.299570288520565, + "grad_norm": 0.2367832362651825, + "learning_rate": 6.360466976671172e-05, + "loss": 1.7843, + "step": 14008 + }, + { + "epoch": 4.29987722529159, + "grad_norm": 0.23594366014003754, + "learning_rate": 6.35998866746457e-05, + "loss": 1.7442, + "step": 14009 + }, + { + "epoch": 4.300184162062616, + "grad_norm": 0.2660543918609619, + "learning_rate": 6.359510344817765e-05, + "loss": 1.7557, + "step": 14010 + }, + { + "epoch": 4.30049109883364, + "grad_norm": 0.191593199968338, + "learning_rate": 6.359032008735481e-05, + "loss": 1.7988, + "step": 14011 + }, + { + "epoch": 4.300798035604665, + "grad_norm": 0.2755490243434906, + "learning_rate": 6.358553659222447e-05, + "loss": 1.7551, + "step": 14012 + }, + { + "epoch": 4.301104972375691, + "grad_norm": 0.2900530993938446, + "learning_rate": 6.358075296283387e-05, + "loss": 1.7523, + "step": 14013 + }, + { + "epoch": 4.301411909146716, + "grad_norm": 0.22242774069309235, + "learning_rate": 6.357596919923033e-05, + "loss": 1.7626, + "step": 14014 + }, + { + "epoch": 4.301718845917741, + "grad_norm": 0.26636210083961487, + "learning_rate": 6.357118530146108e-05, + "loss": 1.7855, + "step": 14015 + }, + { + "epoch": 4.302025782688766, + "grad_norm": 0.3055269718170166, + "learning_rate": 6.356640126957344e-05, + "loss": 1.7528, + "step": 14016 + }, + { + "epoch": 4.302332719459791, + "grad_norm": 0.29695719480514526, + "learning_rate": 6.356161710361468e-05, + "loss": 1.7482, + "step": 14017 + }, + { + "epoch": 4.3026396562308165, + "grad_norm": 0.2369711697101593, + "learning_rate": 
6.355683280363207e-05, + "loss": 1.7635, + "step": 14018 + }, + { + "epoch": 4.302946593001842, + "grad_norm": 0.26681363582611084, + "learning_rate": 6.35520483696729e-05, + "loss": 1.8814, + "step": 14019 + }, + { + "epoch": 4.303253529772867, + "grad_norm": 0.2623308598995209, + "learning_rate": 6.354726380178442e-05, + "loss": 1.8645, + "step": 14020 + }, + { + "epoch": 4.303560466543892, + "grad_norm": 0.23326413333415985, + "learning_rate": 6.354247910001394e-05, + "loss": 1.8093, + "step": 14021 + }, + { + "epoch": 4.303867403314917, + "grad_norm": 0.3037295639514923, + "learning_rate": 6.353769426440875e-05, + "loss": 1.8556, + "step": 14022 + }, + { + "epoch": 4.304174340085942, + "grad_norm": 0.23624882102012634, + "learning_rate": 6.353290929501616e-05, + "loss": 1.803, + "step": 14023 + }, + { + "epoch": 4.304481276856968, + "grad_norm": 0.22106927633285522, + "learning_rate": 6.35281241918834e-05, + "loss": 1.7133, + "step": 14024 + }, + { + "epoch": 4.304788213627993, + "grad_norm": 0.2374040186405182, + "learning_rate": 6.352333895505778e-05, + "loss": 1.8127, + "step": 14025 + }, + { + "epoch": 4.305095150399017, + "grad_norm": 0.2782450318336487, + "learning_rate": 6.35185535845866e-05, + "loss": 1.8613, + "step": 14026 + }, + { + "epoch": 4.305402087170043, + "grad_norm": 0.2527763843536377, + "learning_rate": 6.351376808051717e-05, + "loss": 1.7533, + "step": 14027 + }, + { + "epoch": 4.305709023941068, + "grad_norm": 0.2462318390607834, + "learning_rate": 6.350898244289675e-05, + "loss": 1.8075, + "step": 14028 + }, + { + "epoch": 4.306015960712093, + "grad_norm": 0.2646189332008362, + "learning_rate": 6.350419667177265e-05, + "loss": 1.8261, + "step": 14029 + }, + { + "epoch": 4.306322897483119, + "grad_norm": 0.24918611347675323, + "learning_rate": 6.349941076719218e-05, + "loss": 1.7542, + "step": 14030 + }, + { + "epoch": 4.306629834254144, + "grad_norm": 0.22440841794013977, + "learning_rate": 6.349462472920259e-05, + "loss": 1.7897, + 
"step": 14031 + }, + { + "epoch": 4.3069367710251685, + "grad_norm": 0.28614330291748047, + "learning_rate": 6.348983855785121e-05, + "loss": 1.88, + "step": 14032 + }, + { + "epoch": 4.307243707796194, + "grad_norm": 0.25015848875045776, + "learning_rate": 6.348505225318535e-05, + "loss": 1.8008, + "step": 14033 + }, + { + "epoch": 4.307550644567219, + "grad_norm": 0.2468707263469696, + "learning_rate": 6.34802658152523e-05, + "loss": 1.8025, + "step": 14034 + }, + { + "epoch": 4.3078575813382445, + "grad_norm": 0.30504748225212097, + "learning_rate": 6.347547924409937e-05, + "loss": 1.8765, + "step": 14035 + }, + { + "epoch": 4.30816451810927, + "grad_norm": 0.35419392585754395, + "learning_rate": 6.347069253977385e-05, + "loss": 1.7807, + "step": 14036 + }, + { + "epoch": 4.308471454880294, + "grad_norm": 0.33683931827545166, + "learning_rate": 6.346590570232305e-05, + "loss": 1.7244, + "step": 14037 + }, + { + "epoch": 4.30877839165132, + "grad_norm": 0.3339467942714691, + "learning_rate": 6.346111873179427e-05, + "loss": 1.7642, + "step": 14038 + }, + { + "epoch": 4.309085328422345, + "grad_norm": 0.2369392216205597, + "learning_rate": 6.345633162823484e-05, + "loss": 1.7127, + "step": 14039 + }, + { + "epoch": 4.30939226519337, + "grad_norm": 0.26469686627388, + "learning_rate": 6.345154439169206e-05, + "loss": 1.7235, + "step": 14040 + }, + { + "epoch": 4.309699201964396, + "grad_norm": 0.2737344205379486, + "learning_rate": 6.344675702221321e-05, + "loss": 1.783, + "step": 14041 + }, + { + "epoch": 4.310006138735421, + "grad_norm": 0.2381773442029953, + "learning_rate": 6.344196951984565e-05, + "loss": 1.7172, + "step": 14042 + }, + { + "epoch": 4.310313075506445, + "grad_norm": 0.28199076652526855, + "learning_rate": 6.343718188463663e-05, + "loss": 1.8315, + "step": 14043 + }, + { + "epoch": 4.310620012277471, + "grad_norm": 0.24378590285778046, + "learning_rate": 6.343239411663353e-05, + "loss": 1.7828, + "step": 14044 + }, + { + "epoch": 
4.310926949048496, + "grad_norm": 0.26343944668769836, + "learning_rate": 6.342760621588365e-05, + "loss": 1.7679, + "step": 14045 + }, + { + "epoch": 4.311233885819521, + "grad_norm": 0.23703521490097046, + "learning_rate": 6.342281818243427e-05, + "loss": 1.7885, + "step": 14046 + }, + { + "epoch": 4.311540822590547, + "grad_norm": 0.2230173498392105, + "learning_rate": 6.341803001633276e-05, + "loss": 1.767, + "step": 14047 + }, + { + "epoch": 4.311847759361571, + "grad_norm": 0.249002143740654, + "learning_rate": 6.34132417176264e-05, + "loss": 1.8032, + "step": 14048 + }, + { + "epoch": 4.3121546961325965, + "grad_norm": 0.2383791208267212, + "learning_rate": 6.34084532863625e-05, + "loss": 1.7558, + "step": 14049 + }, + { + "epoch": 4.312461632903622, + "grad_norm": 0.2783047556877136, + "learning_rate": 6.340366472258843e-05, + "loss": 1.8389, + "step": 14050 + }, + { + "epoch": 4.312768569674647, + "grad_norm": 0.2654891312122345, + "learning_rate": 6.339887602635148e-05, + "loss": 1.7989, + "step": 14051 + }, + { + "epoch": 4.3130755064456725, + "grad_norm": 0.2638411521911621, + "learning_rate": 6.3394087197699e-05, + "loss": 1.8707, + "step": 14052 + }, + { + "epoch": 4.313382443216697, + "grad_norm": 0.3026179075241089, + "learning_rate": 6.338929823667829e-05, + "loss": 1.7892, + "step": 14053 + }, + { + "epoch": 4.313689379987722, + "grad_norm": 0.27496880292892456, + "learning_rate": 6.338450914333668e-05, + "loss": 1.7398, + "step": 14054 + }, + { + "epoch": 4.313996316758748, + "grad_norm": 0.2601073086261749, + "learning_rate": 6.337971991772151e-05, + "loss": 1.7646, + "step": 14055 + }, + { + "epoch": 4.314303253529773, + "grad_norm": 0.2061719298362732, + "learning_rate": 6.337493055988011e-05, + "loss": 1.7372, + "step": 14056 + }, + { + "epoch": 4.314610190300798, + "grad_norm": 0.23722340166568756, + "learning_rate": 6.337014106985981e-05, + "loss": 1.7457, + "step": 14057 + }, + { + "epoch": 4.314917127071823, + "grad_norm": 
0.2729428708553314, + "learning_rate": 6.336535144770793e-05, + "loss": 1.8423, + "step": 14058 + }, + { + "epoch": 4.315224063842848, + "grad_norm": 0.23520450294017792, + "learning_rate": 6.336056169347182e-05, + "loss": 1.8124, + "step": 14059 + }, + { + "epoch": 4.315531000613873, + "grad_norm": 0.25142738223075867, + "learning_rate": 6.33557718071988e-05, + "loss": 1.7285, + "step": 14060 + }, + { + "epoch": 4.315837937384899, + "grad_norm": 0.24833035469055176, + "learning_rate": 6.335098178893621e-05, + "loss": 1.766, + "step": 14061 + }, + { + "epoch": 4.316144874155924, + "grad_norm": 0.2406177669763565, + "learning_rate": 6.334619163873141e-05, + "loss": 1.8824, + "step": 14062 + }, + { + "epoch": 4.316451810926949, + "grad_norm": 0.23077574372291565, + "learning_rate": 6.334140135663172e-05, + "loss": 1.7589, + "step": 14063 + }, + { + "epoch": 4.316758747697974, + "grad_norm": 0.20476560294628143, + "learning_rate": 6.333661094268448e-05, + "loss": 1.7331, + "step": 14064 + }, + { + "epoch": 4.317065684468999, + "grad_norm": 0.207991823554039, + "learning_rate": 6.333182039693704e-05, + "loss": 1.6876, + "step": 14065 + }, + { + "epoch": 4.3173726212400245, + "grad_norm": 0.20813052356243134, + "learning_rate": 6.332702971943671e-05, + "loss": 1.775, + "step": 14066 + }, + { + "epoch": 4.31767955801105, + "grad_norm": 0.2470991462469101, + "learning_rate": 6.332223891023087e-05, + "loss": 1.7673, + "step": 14067 + }, + { + "epoch": 4.317986494782075, + "grad_norm": 0.23855723440647125, + "learning_rate": 6.331744796936687e-05, + "loss": 1.7842, + "step": 14068 + }, + { + "epoch": 4.3182934315531, + "grad_norm": 0.21852652728557587, + "learning_rate": 6.331265689689204e-05, + "loss": 1.7727, + "step": 14069 + }, + { + "epoch": 4.318600368324125, + "grad_norm": 0.284496545791626, + "learning_rate": 6.330786569285374e-05, + "loss": 1.8248, + "step": 14070 + }, + { + "epoch": 4.31890730509515, + "grad_norm": 0.21709981560707092, + "learning_rate": 
6.33030743572993e-05, + "loss": 1.7547, + "step": 14071 + }, + { + "epoch": 4.319214241866176, + "grad_norm": 0.24209457635879517, + "learning_rate": 6.329828289027608e-05, + "loss": 1.7695, + "step": 14072 + }, + { + "epoch": 4.319521178637201, + "grad_norm": 0.24869373440742493, + "learning_rate": 6.329349129183144e-05, + "loss": 1.8204, + "step": 14073 + }, + { + "epoch": 4.319828115408226, + "grad_norm": 0.21702703833580017, + "learning_rate": 6.328869956201274e-05, + "loss": 1.779, + "step": 14074 + }, + { + "epoch": 4.320135052179251, + "grad_norm": 0.22993850708007812, + "learning_rate": 6.328390770086731e-05, + "loss": 1.7935, + "step": 14075 + }, + { + "epoch": 4.320441988950276, + "grad_norm": 0.23491734266281128, + "learning_rate": 6.327911570844252e-05, + "loss": 1.7261, + "step": 14076 + }, + { + "epoch": 4.320748925721301, + "grad_norm": 0.2479303777217865, + "learning_rate": 6.327432358478571e-05, + "loss": 1.7683, + "step": 14077 + }, + { + "epoch": 4.321055862492327, + "grad_norm": 0.24261580407619476, + "learning_rate": 6.326953132994427e-05, + "loss": 1.7147, + "step": 14078 + }, + { + "epoch": 4.321362799263352, + "grad_norm": 0.24627646803855896, + "learning_rate": 6.326473894396553e-05, + "loss": 1.7976, + "step": 14079 + }, + { + "epoch": 4.3216697360343765, + "grad_norm": 0.269149512052536, + "learning_rate": 6.325994642689688e-05, + "loss": 1.7247, + "step": 14080 + }, + { + "epoch": 4.321976672805402, + "grad_norm": 0.4162158966064453, + "learning_rate": 6.325515377878566e-05, + "loss": 1.7485, + "step": 14081 + }, + { + "epoch": 4.322283609576427, + "grad_norm": 0.366459459066391, + "learning_rate": 6.325036099967925e-05, + "loss": 1.7286, + "step": 14082 + }, + { + "epoch": 4.3225905463474525, + "grad_norm": 0.2465270757675171, + "learning_rate": 6.324556808962499e-05, + "loss": 1.8097, + "step": 14083 + }, + { + "epoch": 4.322897483118478, + "grad_norm": 0.2911076843738556, + "learning_rate": 6.324077504867026e-05, + "loss": 1.7979, + 
"step": 14084 + }, + { + "epoch": 4.323204419889503, + "grad_norm": 0.33455169200897217, + "learning_rate": 6.323598187686245e-05, + "loss": 1.7988, + "step": 14085 + }, + { + "epoch": 4.323511356660528, + "grad_norm": 0.25020337104797363, + "learning_rate": 6.32311885742489e-05, + "loss": 1.7184, + "step": 14086 + }, + { + "epoch": 4.323818293431553, + "grad_norm": 0.23941513895988464, + "learning_rate": 6.322639514087699e-05, + "loss": 1.7672, + "step": 14087 + }, + { + "epoch": 4.324125230202578, + "grad_norm": 0.35258981585502625, + "learning_rate": 6.32216015767941e-05, + "loss": 1.7571, + "step": 14088 + }, + { + "epoch": 4.324432166973604, + "grad_norm": 0.2854993939399719, + "learning_rate": 6.321680788204758e-05, + "loss": 1.8096, + "step": 14089 + }, + { + "epoch": 4.324739103744629, + "grad_norm": 0.24422863125801086, + "learning_rate": 6.321201405668482e-05, + "loss": 1.778, + "step": 14090 + }, + { + "epoch": 4.3250460405156534, + "grad_norm": 0.36629122495651245, + "learning_rate": 6.320722010075321e-05, + "loss": 1.716, + "step": 14091 + }, + { + "epoch": 4.325352977286679, + "grad_norm": 0.37115517258644104, + "learning_rate": 6.32024260143001e-05, + "loss": 1.77, + "step": 14092 + }, + { + "epoch": 4.325659914057704, + "grad_norm": 0.21540327370166779, + "learning_rate": 6.319763179737288e-05, + "loss": 1.7529, + "step": 14093 + }, + { + "epoch": 4.3259668508287294, + "grad_norm": 0.2573898732662201, + "learning_rate": 6.319283745001892e-05, + "loss": 1.8101, + "step": 14094 + }, + { + "epoch": 4.326273787599755, + "grad_norm": 0.29481247067451477, + "learning_rate": 6.31880429722856e-05, + "loss": 1.7459, + "step": 14095 + }, + { + "epoch": 4.326580724370779, + "grad_norm": 0.23474647104740143, + "learning_rate": 6.318324836422031e-05, + "loss": 1.786, + "step": 14096 + }, + { + "epoch": 4.326887661141805, + "grad_norm": 0.2884673476219177, + "learning_rate": 6.317845362587045e-05, + "loss": 1.8123, + "step": 14097 + }, + { + "epoch": 
4.32719459791283, + "grad_norm": 0.39008447527885437, + "learning_rate": 6.317365875728338e-05, + "loss": 1.7729, + "step": 14098 + }, + { + "epoch": 4.327501534683855, + "grad_norm": 0.30568063259124756, + "learning_rate": 6.316886375850651e-05, + "loss": 1.7088, + "step": 14099 + }, + { + "epoch": 4.327808471454881, + "grad_norm": 0.2538018524646759, + "learning_rate": 6.316406862958718e-05, + "loss": 1.8028, + "step": 14100 + }, + { + "epoch": 4.328115408225905, + "grad_norm": 0.3815068006515503, + "learning_rate": 6.315927337057281e-05, + "loss": 1.7143, + "step": 14101 + }, + { + "epoch": 4.32842234499693, + "grad_norm": 0.3813243508338928, + "learning_rate": 6.31544779815108e-05, + "loss": 1.7072, + "step": 14102 + }, + { + "epoch": 4.328729281767956, + "grad_norm": 0.22438868880271912, + "learning_rate": 6.314968246244852e-05, + "loss": 1.7445, + "step": 14103 + }, + { + "epoch": 4.329036218538981, + "grad_norm": 0.3818886876106262, + "learning_rate": 6.314488681343337e-05, + "loss": 1.8292, + "step": 14104 + }, + { + "epoch": 4.329343155310006, + "grad_norm": 0.4376567006111145, + "learning_rate": 6.314009103451277e-05, + "loss": 1.8224, + "step": 14105 + }, + { + "epoch": 4.329650092081032, + "grad_norm": 0.2741515636444092, + "learning_rate": 6.313529512573406e-05, + "loss": 1.8078, + "step": 14106 + }, + { + "epoch": 4.329957028852056, + "grad_norm": 0.264343798160553, + "learning_rate": 6.313049908714467e-05, + "loss": 1.7314, + "step": 14107 + }, + { + "epoch": 4.3302639656230815, + "grad_norm": 0.3601943552494049, + "learning_rate": 6.312570291879201e-05, + "loss": 1.7351, + "step": 14108 + }, + { + "epoch": 4.330570902394107, + "grad_norm": 0.2931751012802124, + "learning_rate": 6.312090662072345e-05, + "loss": 1.8117, + "step": 14109 + }, + { + "epoch": 4.330877839165132, + "grad_norm": 0.27670225501060486, + "learning_rate": 6.31161101929864e-05, + "loss": 1.7707, + "step": 14110 + }, + { + "epoch": 4.3311847759361575, + "grad_norm": 
0.33669596910476685, + "learning_rate": 6.311131363562825e-05, + "loss": 1.7337, + "step": 14111 + }, + { + "epoch": 4.331491712707182, + "grad_norm": 0.232634037733078, + "learning_rate": 6.310651694869643e-05, + "loss": 1.7372, + "step": 14112 + }, + { + "epoch": 4.331798649478207, + "grad_norm": 0.28611311316490173, + "learning_rate": 6.310172013223832e-05, + "loss": 1.6977, + "step": 14113 + }, + { + "epoch": 4.332105586249233, + "grad_norm": 0.30207201838493347, + "learning_rate": 6.309692318630132e-05, + "loss": 1.7765, + "step": 14114 + }, + { + "epoch": 4.332412523020258, + "grad_norm": 0.20757484436035156, + "learning_rate": 6.309212611093287e-05, + "loss": 1.697, + "step": 14115 + }, + { + "epoch": 4.332719459791283, + "grad_norm": 0.31472963094711304, + "learning_rate": 6.308732890618034e-05, + "loss": 1.7757, + "step": 14116 + }, + { + "epoch": 4.333026396562309, + "grad_norm": 0.37042325735092163, + "learning_rate": 6.308253157209117e-05, + "loss": 1.7745, + "step": 14117 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.25001442432403564, + "learning_rate": 6.307773410871275e-05, + "loss": 1.7461, + "step": 14118 + }, + { + "epoch": 4.333640270104358, + "grad_norm": 0.2691943347454071, + "learning_rate": 6.307293651609248e-05, + "loss": 1.7539, + "step": 14119 + }, + { + "epoch": 4.333947206875384, + "grad_norm": 0.30845868587493896, + "learning_rate": 6.306813879427782e-05, + "loss": 1.7559, + "step": 14120 + }, + { + "epoch": 4.334254143646409, + "grad_norm": 0.2244730293750763, + "learning_rate": 6.306334094331613e-05, + "loss": 1.7609, + "step": 14121 + }, + { + "epoch": 4.334561080417434, + "grad_norm": 0.32132062315940857, + "learning_rate": 6.305854296325485e-05, + "loss": 1.7837, + "step": 14122 + }, + { + "epoch": 4.334868017188459, + "grad_norm": 0.3762948513031006, + "learning_rate": 6.30537448541414e-05, + "loss": 1.7631, + "step": 14123 + }, + { + "epoch": 4.335174953959484, + "grad_norm": 0.24174273014068604, + "learning_rate": 
6.30489466160232e-05, + "loss": 1.7532, + "step": 14124 + }, + { + "epoch": 4.3354818907305095, + "grad_norm": 0.23468497395515442, + "learning_rate": 6.304414824894765e-05, + "loss": 1.7731, + "step": 14125 + }, + { + "epoch": 4.335788827501535, + "grad_norm": 0.29086077213287354, + "learning_rate": 6.303934975296218e-05, + "loss": 1.7668, + "step": 14126 + }, + { + "epoch": 4.33609576427256, + "grad_norm": 0.2889879643917084, + "learning_rate": 6.303455112811422e-05, + "loss": 1.8188, + "step": 14127 + }, + { + "epoch": 4.336402701043585, + "grad_norm": 0.2335619181394577, + "learning_rate": 6.302975237445119e-05, + "loss": 1.7944, + "step": 14128 + }, + { + "epoch": 4.33670963781461, + "grad_norm": 0.29027310013771057, + "learning_rate": 6.302495349202051e-05, + "loss": 1.7771, + "step": 14129 + }, + { + "epoch": 4.337016574585635, + "grad_norm": 0.31961241364479065, + "learning_rate": 6.302015448086959e-05, + "loss": 1.8187, + "step": 14130 + }, + { + "epoch": 4.337323511356661, + "grad_norm": 0.26015788316726685, + "learning_rate": 6.301535534104587e-05, + "loss": 1.7819, + "step": 14131 + }, + { + "epoch": 4.337630448127686, + "grad_norm": 0.2440631091594696, + "learning_rate": 6.30105560725968e-05, + "loss": 1.7127, + "step": 14132 + }, + { + "epoch": 4.337937384898711, + "grad_norm": 0.304441899061203, + "learning_rate": 6.300575667556979e-05, + "loss": 1.7619, + "step": 14133 + }, + { + "epoch": 4.338244321669736, + "grad_norm": 0.3085228204727173, + "learning_rate": 6.300095715001226e-05, + "loss": 1.8287, + "step": 14134 + }, + { + "epoch": 4.338551258440761, + "grad_norm": 0.2863372564315796, + "learning_rate": 6.299615749597165e-05, + "loss": 1.8068, + "step": 14135 + }, + { + "epoch": 4.338858195211786, + "grad_norm": 0.25255265831947327, + "learning_rate": 6.299135771349537e-05, + "loss": 1.7506, + "step": 14136 + }, + { + "epoch": 4.339165131982812, + "grad_norm": 0.30224961042404175, + "learning_rate": 6.298655780263092e-05, + "loss": 1.7292, + 
"step": 14137 + }, + { + "epoch": 4.339472068753837, + "grad_norm": 0.24222104251384735, + "learning_rate": 6.298175776342567e-05, + "loss": 1.7616, + "step": 14138 + }, + { + "epoch": 4.3397790055248615, + "grad_norm": 0.3236368000507355, + "learning_rate": 6.29769575959271e-05, + "loss": 1.787, + "step": 14139 + }, + { + "epoch": 4.340085942295887, + "grad_norm": 0.26049408316612244, + "learning_rate": 6.297215730018261e-05, + "loss": 1.7108, + "step": 14140 + }, + { + "epoch": 4.340392879066912, + "grad_norm": 0.22833532094955444, + "learning_rate": 6.296735687623967e-05, + "loss": 1.7661, + "step": 14141 + }, + { + "epoch": 4.3406998158379375, + "grad_norm": 0.28397905826568604, + "learning_rate": 6.296255632414571e-05, + "loss": 1.7163, + "step": 14142 + }, + { + "epoch": 4.341006752608963, + "grad_norm": 0.3072611093521118, + "learning_rate": 6.295775564394817e-05, + "loss": 1.857, + "step": 14143 + }, + { + "epoch": 4.341313689379987, + "grad_norm": 0.22901058197021484, + "learning_rate": 6.295295483569448e-05, + "loss": 1.7325, + "step": 14144 + }, + { + "epoch": 4.341620626151013, + "grad_norm": 0.27433091402053833, + "learning_rate": 6.294815389943212e-05, + "loss": 1.8229, + "step": 14145 + }, + { + "epoch": 4.341927562922038, + "grad_norm": 0.2635616958141327, + "learning_rate": 6.29433528352085e-05, + "loss": 1.7585, + "step": 14146 + }, + { + "epoch": 4.342234499693063, + "grad_norm": 0.29129260778427124, + "learning_rate": 6.293855164307108e-05, + "loss": 1.8294, + "step": 14147 + }, + { + "epoch": 4.342541436464089, + "grad_norm": 0.3429001569747925, + "learning_rate": 6.293375032306731e-05, + "loss": 1.7725, + "step": 14148 + }, + { + "epoch": 4.342848373235114, + "grad_norm": 0.22407259047031403, + "learning_rate": 6.292894887524464e-05, + "loss": 1.7018, + "step": 14149 + }, + { + "epoch": 4.343155310006138, + "grad_norm": 0.3319321274757385, + "learning_rate": 6.292414729965053e-05, + "loss": 1.8472, + "step": 14150 + }, + { + "epoch": 
4.343462246777164, + "grad_norm": 0.42744341492652893, + "learning_rate": 6.291934559633241e-05, + "loss": 1.8118, + "step": 14151 + }, + { + "epoch": 4.343769183548189, + "grad_norm": 0.24572840332984924, + "learning_rate": 6.291454376533774e-05, + "loss": 1.7184, + "step": 14152 + }, + { + "epoch": 4.344076120319214, + "grad_norm": 0.2485980987548828, + "learning_rate": 6.290974180671397e-05, + "loss": 1.7649, + "step": 14153 + }, + { + "epoch": 4.34438305709024, + "grad_norm": 0.3911706209182739, + "learning_rate": 6.29049397205086e-05, + "loss": 1.8105, + "step": 14154 + }, + { + "epoch": 4.344689993861264, + "grad_norm": 0.3008342981338501, + "learning_rate": 6.290013750676902e-05, + "loss": 1.7671, + "step": 14155 + }, + { + "epoch": 4.3449969306322895, + "grad_norm": 0.2072051614522934, + "learning_rate": 6.289533516554274e-05, + "loss": 1.7406, + "step": 14156 + }, + { + "epoch": 4.345303867403315, + "grad_norm": 0.3047312796115875, + "learning_rate": 6.289053269687719e-05, + "loss": 1.8133, + "step": 14157 + }, + { + "epoch": 4.34561080417434, + "grad_norm": 0.28260552883148193, + "learning_rate": 6.288573010081984e-05, + "loss": 1.7253, + "step": 14158 + }, + { + "epoch": 4.3459177409453655, + "grad_norm": 0.2474137246608734, + "learning_rate": 6.288092737741815e-05, + "loss": 1.822, + "step": 14159 + }, + { + "epoch": 4.346224677716391, + "grad_norm": 0.23717878758907318, + "learning_rate": 6.287612452671961e-05, + "loss": 1.7826, + "step": 14160 + }, + { + "epoch": 4.346531614487415, + "grad_norm": 0.2646107077598572, + "learning_rate": 6.287132154877163e-05, + "loss": 1.8118, + "step": 14161 + }, + { + "epoch": 4.346838551258441, + "grad_norm": 0.22026480734348297, + "learning_rate": 6.286651844362172e-05, + "loss": 1.7767, + "step": 14162 + }, + { + "epoch": 4.347145488029466, + "grad_norm": 0.2692350447177887, + "learning_rate": 6.286171521131733e-05, + "loss": 1.8718, + "step": 14163 + }, + { + "epoch": 4.347452424800491, + "grad_norm": 
0.2749998867511749, + "learning_rate": 6.285691185190592e-05, + "loss": 1.7689, + "step": 14164 + }, + { + "epoch": 4.347759361571517, + "grad_norm": 0.24552448093891144, + "learning_rate": 6.2852108365435e-05, + "loss": 1.8049, + "step": 14165 + }, + { + "epoch": 4.348066298342541, + "grad_norm": 0.20530807971954346, + "learning_rate": 6.2847304751952e-05, + "loss": 1.7606, + "step": 14166 + }, + { + "epoch": 4.348373235113566, + "grad_norm": 0.23396088182926178, + "learning_rate": 6.28425010115044e-05, + "loss": 1.7482, + "step": 14167 + }, + { + "epoch": 4.348680171884592, + "grad_norm": 0.20512452721595764, + "learning_rate": 6.283769714413968e-05, + "loss": 1.6976, + "step": 14168 + }, + { + "epoch": 4.348987108655617, + "grad_norm": 0.20287172496318817, + "learning_rate": 6.283289314990531e-05, + "loss": 1.7439, + "step": 14169 + }, + { + "epoch": 4.349294045426642, + "grad_norm": 0.2193746268749237, + "learning_rate": 6.282808902884876e-05, + "loss": 1.763, + "step": 14170 + }, + { + "epoch": 4.349600982197667, + "grad_norm": 0.20415273308753967, + "learning_rate": 6.282328478101753e-05, + "loss": 1.7025, + "step": 14171 + }, + { + "epoch": 4.349907918968692, + "grad_norm": 0.19286803901195526, + "learning_rate": 6.281848040645907e-05, + "loss": 1.7529, + "step": 14172 + }, + { + "epoch": 4.350214855739718, + "grad_norm": 0.20908218622207642, + "learning_rate": 6.281367590522088e-05, + "loss": 1.7896, + "step": 14173 + }, + { + "epoch": 4.350521792510743, + "grad_norm": 0.2599989175796509, + "learning_rate": 6.280887127735045e-05, + "loss": 1.764, + "step": 14174 + }, + { + "epoch": 4.350828729281768, + "grad_norm": 0.23955710232257843, + "learning_rate": 6.280406652289523e-05, + "loss": 1.7321, + "step": 14175 + }, + { + "epoch": 4.351135666052793, + "grad_norm": 0.2311990112066269, + "learning_rate": 6.279926164190272e-05, + "loss": 1.7338, + "step": 14176 + }, + { + "epoch": 4.351442602823818, + "grad_norm": 0.2599658966064453, + "learning_rate": 
6.27944566344204e-05, + "loss": 1.7444, + "step": 14177 + }, + { + "epoch": 4.351749539594843, + "grad_norm": 0.23079386353492737, + "learning_rate": 6.278965150049579e-05, + "loss": 1.7011, + "step": 14178 + }, + { + "epoch": 4.352056476365869, + "grad_norm": 0.24844171106815338, + "learning_rate": 6.278484624017631e-05, + "loss": 1.7298, + "step": 14179 + }, + { + "epoch": 4.352363413136894, + "grad_norm": 0.24839860200881958, + "learning_rate": 6.27800408535095e-05, + "loss": 1.7717, + "step": 14180 + }, + { + "epoch": 4.352670349907919, + "grad_norm": 0.2652966380119324, + "learning_rate": 6.277523534054284e-05, + "loss": 1.7759, + "step": 14181 + }, + { + "epoch": 4.352977286678944, + "grad_norm": 0.2787603735923767, + "learning_rate": 6.277042970132381e-05, + "loss": 1.8981, + "step": 14182 + }, + { + "epoch": 4.353284223449969, + "grad_norm": 0.2535475194454193, + "learning_rate": 6.276562393589991e-05, + "loss": 1.7538, + "step": 14183 + }, + { + "epoch": 4.3535911602209945, + "grad_norm": 0.3210967183113098, + "learning_rate": 6.276081804431863e-05, + "loss": 1.7087, + "step": 14184 + }, + { + "epoch": 4.35389809699202, + "grad_norm": 0.29936519265174866, + "learning_rate": 6.275601202662749e-05, + "loss": 1.7647, + "step": 14185 + }, + { + "epoch": 4.354205033763045, + "grad_norm": 0.21980762481689453, + "learning_rate": 6.275120588287394e-05, + "loss": 1.7759, + "step": 14186 + }, + { + "epoch": 4.35451197053407, + "grad_norm": 0.26833051443099976, + "learning_rate": 6.274639961310549e-05, + "loss": 1.7648, + "step": 14187 + }, + { + "epoch": 4.354818907305095, + "grad_norm": 0.27998095750808716, + "learning_rate": 6.274159321736966e-05, + "loss": 1.746, + "step": 14188 + }, + { + "epoch": 4.35512584407612, + "grad_norm": 0.21354494988918304, + "learning_rate": 6.273678669571395e-05, + "loss": 1.7417, + "step": 14189 + }, + { + "epoch": 4.355432780847146, + "grad_norm": 0.2295297235250473, + "learning_rate": 6.273198004818583e-05, + "loss": 1.7805, + 
"step": 14190 + }, + { + "epoch": 4.355739717618171, + "grad_norm": 0.2416422963142395, + "learning_rate": 6.272717327483283e-05, + "loss": 1.73, + "step": 14191 + }, + { + "epoch": 4.356046654389196, + "grad_norm": 0.2685304880142212, + "learning_rate": 6.272236637570244e-05, + "loss": 1.7936, + "step": 14192 + }, + { + "epoch": 4.356353591160221, + "grad_norm": 0.32481294870376587, + "learning_rate": 6.271755935084218e-05, + "loss": 1.7192, + "step": 14193 + }, + { + "epoch": 4.356660527931246, + "grad_norm": 0.2428581267595291, + "learning_rate": 6.271275220029954e-05, + "loss": 1.7428, + "step": 14194 + }, + { + "epoch": 4.356967464702271, + "grad_norm": 0.2266654521226883, + "learning_rate": 6.270794492412203e-05, + "loss": 1.7266, + "step": 14195 + }, + { + "epoch": 4.357274401473297, + "grad_norm": 0.25062093138694763, + "learning_rate": 6.270313752235716e-05, + "loss": 1.7476, + "step": 14196 + }, + { + "epoch": 4.357581338244322, + "grad_norm": 0.24085770547389984, + "learning_rate": 6.269832999505244e-05, + "loss": 1.7981, + "step": 14197 + }, + { + "epoch": 4.3578882750153465, + "grad_norm": 0.27035796642303467, + "learning_rate": 6.269352234225536e-05, + "loss": 1.8867, + "step": 14198 + }, + { + "epoch": 4.358195211786372, + "grad_norm": 0.22464458644390106, + "learning_rate": 6.268871456401348e-05, + "loss": 1.7514, + "step": 14199 + }, + { + "epoch": 4.358502148557397, + "grad_norm": 0.22485734522342682, + "learning_rate": 6.268390666037427e-05, + "loss": 1.7558, + "step": 14200 + }, + { + "epoch": 4.3588090853284225, + "grad_norm": 0.2052135169506073, + "learning_rate": 6.267909863138527e-05, + "loss": 1.7453, + "step": 14201 + }, + { + "epoch": 4.359116022099448, + "grad_norm": 0.2130763679742813, + "learning_rate": 6.267429047709397e-05, + "loss": 1.7712, + "step": 14202 + }, + { + "epoch": 4.359422958870473, + "grad_norm": 0.23146997392177582, + "learning_rate": 6.266948219754793e-05, + "loss": 1.6978, + "step": 14203 + }, + { + "epoch": 
4.359729895641498, + "grad_norm": 0.21657225489616394, + "learning_rate": 6.266467379279463e-05, + "loss": 1.7641, + "step": 14204 + }, + { + "epoch": 4.360036832412523, + "grad_norm": 0.2598700523376465, + "learning_rate": 6.265986526288158e-05, + "loss": 1.7956, + "step": 14205 + }, + { + "epoch": 4.360343769183548, + "grad_norm": 0.23497453331947327, + "learning_rate": 6.265505660785633e-05, + "loss": 1.7835, + "step": 14206 + }, + { + "epoch": 4.360650705954574, + "grad_norm": 0.2491760104894638, + "learning_rate": 6.265024782776641e-05, + "loss": 1.8454, + "step": 14207 + }, + { + "epoch": 4.360957642725599, + "grad_norm": 0.224884033203125, + "learning_rate": 6.264543892265932e-05, + "loss": 1.8383, + "step": 14208 + }, + { + "epoch": 4.361264579496623, + "grad_norm": 0.24057646095752716, + "learning_rate": 6.264062989258259e-05, + "loss": 1.7437, + "step": 14209 + }, + { + "epoch": 4.361571516267649, + "grad_norm": 0.24661841988563538, + "learning_rate": 6.263582073758374e-05, + "loss": 1.8151, + "step": 14210 + }, + { + "epoch": 4.361878453038674, + "grad_norm": 0.24618980288505554, + "learning_rate": 6.263101145771031e-05, + "loss": 1.7955, + "step": 14211 + }, + { + "epoch": 4.362185389809699, + "grad_norm": 0.2615448236465454, + "learning_rate": 6.262620205300981e-05, + "loss": 1.7819, + "step": 14212 + }, + { + "epoch": 4.362492326580725, + "grad_norm": 0.3528309464454651, + "learning_rate": 6.26213925235298e-05, + "loss": 1.7723, + "step": 14213 + }, + { + "epoch": 4.362799263351749, + "grad_norm": 0.3099561035633087, + "learning_rate": 6.261658286931779e-05, + "loss": 1.7361, + "step": 14214 + }, + { + "epoch": 4.3631062001227745, + "grad_norm": 0.23693235218524933, + "learning_rate": 6.26117730904213e-05, + "loss": 1.8117, + "step": 14215 + }, + { + "epoch": 4.3634131368938, + "grad_norm": 0.4164150655269623, + "learning_rate": 6.260696318688786e-05, + "loss": 1.7908, + "step": 14216 + }, + { + "epoch": 4.363720073664825, + "grad_norm": 
0.39376336336135864, + "learning_rate": 6.260215315876506e-05, + "loss": 1.7832, + "step": 14217 + }, + { + "epoch": 4.3640270104358505, + "grad_norm": 0.24071799218654633, + "learning_rate": 6.259734300610037e-05, + "loss": 1.7569, + "step": 14218 + }, + { + "epoch": 4.364333947206875, + "grad_norm": 0.4305122494697571, + "learning_rate": 6.259253272894136e-05, + "loss": 1.7974, + "step": 14219 + }, + { + "epoch": 4.3646408839779, + "grad_norm": 0.3023197054862976, + "learning_rate": 6.258772232733556e-05, + "loss": 1.7589, + "step": 14220 + }, + { + "epoch": 4.364947820748926, + "grad_norm": 0.23253366351127625, + "learning_rate": 6.258291180133052e-05, + "loss": 1.7138, + "step": 14221 + }, + { + "epoch": 4.365254757519951, + "grad_norm": 0.41141277551651, + "learning_rate": 6.257810115097376e-05, + "loss": 1.7608, + "step": 14222 + }, + { + "epoch": 4.365561694290976, + "grad_norm": 0.3308235704898834, + "learning_rate": 6.257329037631284e-05, + "loss": 1.8006, + "step": 14223 + }, + { + "epoch": 4.365868631062002, + "grad_norm": 0.2635105848312378, + "learning_rate": 6.256847947739528e-05, + "loss": 1.7275, + "step": 14224 + }, + { + "epoch": 4.366175567833026, + "grad_norm": 0.45886602997779846, + "learning_rate": 6.256366845426864e-05, + "loss": 1.7701, + "step": 14225 + }, + { + "epoch": 4.366482504604051, + "grad_norm": 0.48503565788269043, + "learning_rate": 6.255885730698049e-05, + "loss": 1.7409, + "step": 14226 + }, + { + "epoch": 4.366789441375077, + "grad_norm": 0.26727184653282166, + "learning_rate": 6.255404603557833e-05, + "loss": 1.7288, + "step": 14227 + }, + { + "epoch": 4.367096378146102, + "grad_norm": 0.3343912363052368, + "learning_rate": 6.254923464010974e-05, + "loss": 1.764, + "step": 14228 + }, + { + "epoch": 4.367403314917127, + "grad_norm": 0.40050622820854187, + "learning_rate": 6.254442312062224e-05, + "loss": 1.7653, + "step": 14229 + }, + { + "epoch": 4.367710251688152, + "grad_norm": 0.23941144347190857, + "learning_rate": 
6.253961147716341e-05, + "loss": 1.6886, + "step": 14230 + }, + { + "epoch": 4.368017188459177, + "grad_norm": 0.25737255811691284, + "learning_rate": 6.253479970978079e-05, + "loss": 1.8047, + "step": 14231 + }, + { + "epoch": 4.3683241252302025, + "grad_norm": 0.28780993819236755, + "learning_rate": 6.252998781852192e-05, + "loss": 1.7453, + "step": 14232 + }, + { + "epoch": 4.368631062001228, + "grad_norm": 0.2362327128648758, + "learning_rate": 6.252517580343438e-05, + "loss": 1.7963, + "step": 14233 + }, + { + "epoch": 4.368937998772253, + "grad_norm": 0.263013631105423, + "learning_rate": 6.252036366456571e-05, + "loss": 1.7837, + "step": 14234 + }, + { + "epoch": 4.3692449355432785, + "grad_norm": 0.27674412727355957, + "learning_rate": 6.251555140196347e-05, + "loss": 1.767, + "step": 14235 + }, + { + "epoch": 4.369551872314303, + "grad_norm": 0.2360621690750122, + "learning_rate": 6.251073901567522e-05, + "loss": 1.7806, + "step": 14236 + }, + { + "epoch": 4.369858809085328, + "grad_norm": 0.2568018138408661, + "learning_rate": 6.25059265057485e-05, + "loss": 1.7672, + "step": 14237 + }, + { + "epoch": 4.370165745856354, + "grad_norm": 0.2512381374835968, + "learning_rate": 6.25011138722309e-05, + "loss": 1.7506, + "step": 14238 + }, + { + "epoch": 4.370472682627379, + "grad_norm": 0.21587291359901428, + "learning_rate": 6.249630111516994e-05, + "loss": 1.7336, + "step": 14239 + }, + { + "epoch": 4.370779619398404, + "grad_norm": 0.21791933476924896, + "learning_rate": 6.249148823461323e-05, + "loss": 1.7588, + "step": 14240 + }, + { + "epoch": 4.371086556169429, + "grad_norm": 0.23061512410640717, + "learning_rate": 6.248667523060831e-05, + "loss": 1.742, + "step": 14241 + }, + { + "epoch": 4.371393492940454, + "grad_norm": 0.2007007598876953, + "learning_rate": 6.248186210320274e-05, + "loss": 1.7227, + "step": 14242 + }, + { + "epoch": 4.371700429711479, + "grad_norm": 0.2564350366592407, + "learning_rate": 6.247704885244411e-05, + "loss": 1.7529, + 
"step": 14243 + }, + { + "epoch": 4.372007366482505, + "grad_norm": 0.21880537271499634, + "learning_rate": 6.247223547837995e-05, + "loss": 1.7828, + "step": 14244 + }, + { + "epoch": 4.37231430325353, + "grad_norm": 0.26154282689094543, + "learning_rate": 6.246742198105785e-05, + "loss": 1.7895, + "step": 14245 + }, + { + "epoch": 4.3726212400245545, + "grad_norm": 0.2652645707130432, + "learning_rate": 6.24626083605254e-05, + "loss": 1.8038, + "step": 14246 + }, + { + "epoch": 4.37292817679558, + "grad_norm": 0.21463751792907715, + "learning_rate": 6.245779461683013e-05, + "loss": 1.7139, + "step": 14247 + }, + { + "epoch": 4.373235113566605, + "grad_norm": 0.21285851299762726, + "learning_rate": 6.245298075001961e-05, + "loss": 1.7686, + "step": 14248 + }, + { + "epoch": 4.3735420503376305, + "grad_norm": 0.258602499961853, + "learning_rate": 6.244816676014149e-05, + "loss": 1.8518, + "step": 14249 + }, + { + "epoch": 4.373848987108656, + "grad_norm": 0.25747501850128174, + "learning_rate": 6.244335264724323e-05, + "loss": 1.8019, + "step": 14250 + }, + { + "epoch": 4.37415592387968, + "grad_norm": 0.24678784608840942, + "learning_rate": 6.243853841137251e-05, + "loss": 1.7846, + "step": 14251 + }, + { + "epoch": 4.374462860650706, + "grad_norm": 0.31382107734680176, + "learning_rate": 6.243372405257685e-05, + "loss": 1.8389, + "step": 14252 + }, + { + "epoch": 4.374769797421731, + "grad_norm": 0.30522868037223816, + "learning_rate": 6.242890957090383e-05, + "loss": 1.8057, + "step": 14253 + }, + { + "epoch": 4.375076734192756, + "grad_norm": 0.2449347972869873, + "learning_rate": 6.242409496640106e-05, + "loss": 1.7144, + "step": 14254 + }, + { + "epoch": 4.375383670963782, + "grad_norm": 0.3193594217300415, + "learning_rate": 6.241928023911609e-05, + "loss": 1.7404, + "step": 14255 + }, + { + "epoch": 4.375690607734807, + "grad_norm": 0.23948179185390472, + "learning_rate": 6.241446538909651e-05, + "loss": 1.7338, + "step": 14256 + }, + { + "epoch": 
4.3759975445058314, + "grad_norm": 0.35325706005096436, + "learning_rate": 6.240965041638991e-05, + "loss": 1.7673, + "step": 14257 + }, + { + "epoch": 4.376304481276857, + "grad_norm": 0.38753262162208557, + "learning_rate": 6.240483532104387e-05, + "loss": 1.769, + "step": 14258 + }, + { + "epoch": 4.376611418047882, + "grad_norm": 0.2749052941799164, + "learning_rate": 6.2400020103106e-05, + "loss": 1.8086, + "step": 14259 + }, + { + "epoch": 4.3769183548189075, + "grad_norm": 0.2553126811981201, + "learning_rate": 6.239520476262384e-05, + "loss": 1.7733, + "step": 14260 + }, + { + "epoch": 4.377225291589933, + "grad_norm": 0.2854517698287964, + "learning_rate": 6.2390389299645e-05, + "loss": 1.7926, + "step": 14261 + }, + { + "epoch": 4.377532228360957, + "grad_norm": 0.24617259204387665, + "learning_rate": 6.238557371421708e-05, + "loss": 1.7297, + "step": 14262 + }, + { + "epoch": 4.377839165131983, + "grad_norm": 0.2555331289768219, + "learning_rate": 6.238075800638765e-05, + "loss": 1.7566, + "step": 14263 + }, + { + "epoch": 4.378146101903008, + "grad_norm": 0.31666773557662964, + "learning_rate": 6.237594217620432e-05, + "loss": 1.8003, + "step": 14264 + }, + { + "epoch": 4.378453038674033, + "grad_norm": 0.24166476726531982, + "learning_rate": 6.237112622371468e-05, + "loss": 1.7425, + "step": 14265 + }, + { + "epoch": 4.378759975445059, + "grad_norm": 0.21237102150917053, + "learning_rate": 6.236631014896633e-05, + "loss": 1.73, + "step": 14266 + }, + { + "epoch": 4.379066912216084, + "grad_norm": 0.2739151120185852, + "learning_rate": 6.236149395200683e-05, + "loss": 1.7113, + "step": 14267 + }, + { + "epoch": 4.379373848987108, + "grad_norm": 0.23700746893882751, + "learning_rate": 6.23566776328838e-05, + "loss": 1.7256, + "step": 14268 + }, + { + "epoch": 4.379680785758134, + "grad_norm": 0.22366748750209808, + "learning_rate": 6.235186119164485e-05, + "loss": 1.7981, + "step": 14269 + }, + { + "epoch": 4.379987722529159, + "grad_norm": 
0.28440114855766296, + "learning_rate": 6.234704462833758e-05, + "loss": 1.8087, + "step": 14270 + }, + { + "epoch": 4.380294659300184, + "grad_norm": 0.2706616520881653, + "learning_rate": 6.234222794300957e-05, + "loss": 1.7502, + "step": 14271 + }, + { + "epoch": 4.38060159607121, + "grad_norm": 0.21666266024112701, + "learning_rate": 6.233741113570843e-05, + "loss": 1.7639, + "step": 14272 + }, + { + "epoch": 4.380908532842234, + "grad_norm": 0.26790255308151245, + "learning_rate": 6.233259420648175e-05, + "loss": 1.796, + "step": 14273 + }, + { + "epoch": 4.3812154696132595, + "grad_norm": 0.22233673930168152, + "learning_rate": 6.232777715537715e-05, + "loss": 1.7661, + "step": 14274 + }, + { + "epoch": 4.381522406384285, + "grad_norm": 0.3277546763420105, + "learning_rate": 6.232295998244223e-05, + "loss": 1.7932, + "step": 14275 + }, + { + "epoch": 4.38182934315531, + "grad_norm": 0.2907596826553345, + "learning_rate": 6.231814268772463e-05, + "loss": 1.7103, + "step": 14276 + }, + { + "epoch": 4.3821362799263355, + "grad_norm": 0.2318384349346161, + "learning_rate": 6.231332527127188e-05, + "loss": 1.7351, + "step": 14277 + }, + { + "epoch": 4.382443216697361, + "grad_norm": 0.32904061675071716, + "learning_rate": 6.230850773313163e-05, + "loss": 1.7967, + "step": 14278 + }, + { + "epoch": 4.382750153468385, + "grad_norm": 0.2455490082502365, + "learning_rate": 6.230369007335153e-05, + "loss": 1.7474, + "step": 14279 + }, + { + "epoch": 4.383057090239411, + "grad_norm": 0.23648180067539215, + "learning_rate": 6.229887229197913e-05, + "loss": 1.7106, + "step": 14280 + }, + { + "epoch": 4.383364027010436, + "grad_norm": 0.29552599787712097, + "learning_rate": 6.229405438906207e-05, + "loss": 1.7765, + "step": 14281 + }, + { + "epoch": 4.383670963781461, + "grad_norm": 0.2094641923904419, + "learning_rate": 6.228923636464796e-05, + "loss": 1.7105, + "step": 14282 + }, + { + "epoch": 4.383977900552487, + "grad_norm": 0.24632154405117035, + "learning_rate": 
6.228441821878441e-05, + "loss": 1.7913, + "step": 14283 + }, + { + "epoch": 4.384284837323511, + "grad_norm": 0.28114691376686096, + "learning_rate": 6.227959995151904e-05, + "loss": 1.7456, + "step": 14284 + }, + { + "epoch": 4.384591774094536, + "grad_norm": 0.24226875603199005, + "learning_rate": 6.227478156289946e-05, + "loss": 1.797, + "step": 14285 + }, + { + "epoch": 4.384898710865562, + "grad_norm": 0.2526854872703552, + "learning_rate": 6.22699630529733e-05, + "loss": 1.7155, + "step": 14286 + }, + { + "epoch": 4.385205647636587, + "grad_norm": 0.312916100025177, + "learning_rate": 6.226514442178818e-05, + "loss": 1.7808, + "step": 14287 + }, + { + "epoch": 4.385512584407612, + "grad_norm": 0.23087100684642792, + "learning_rate": 6.22603256693917e-05, + "loss": 1.7543, + "step": 14288 + }, + { + "epoch": 4.385819521178637, + "grad_norm": 0.3042476177215576, + "learning_rate": 6.22555067958315e-05, + "loss": 1.747, + "step": 14289 + }, + { + "epoch": 4.386126457949662, + "grad_norm": 0.2604007422924042, + "learning_rate": 6.225068780115522e-05, + "loss": 1.7262, + "step": 14290 + }, + { + "epoch": 4.3864333947206875, + "grad_norm": 0.2200118750333786, + "learning_rate": 6.224586868541044e-05, + "loss": 1.75, + "step": 14291 + }, + { + "epoch": 4.386740331491713, + "grad_norm": 0.3452017307281494, + "learning_rate": 6.224104944864481e-05, + "loss": 1.7598, + "step": 14292 + }, + { + "epoch": 4.387047268262738, + "grad_norm": 0.3169453740119934, + "learning_rate": 6.223623009090597e-05, + "loss": 1.7939, + "step": 14293 + }, + { + "epoch": 4.387354205033763, + "grad_norm": 0.23640502989292145, + "learning_rate": 6.223141061224151e-05, + "loss": 1.8005, + "step": 14294 + }, + { + "epoch": 4.387661141804788, + "grad_norm": 0.26212456822395325, + "learning_rate": 6.22265910126991e-05, + "loss": 1.7951, + "step": 14295 + }, + { + "epoch": 4.387968078575813, + "grad_norm": 0.2687644362449646, + "learning_rate": 6.222177129232634e-05, + "loss": 1.7674, + "step": 
14296 + }, + { + "epoch": 4.388275015346839, + "grad_norm": 0.2553202211856842, + "learning_rate": 6.221695145117086e-05, + "loss": 1.8142, + "step": 14297 + }, + { + "epoch": 4.388581952117864, + "grad_norm": 0.3317619264125824, + "learning_rate": 6.221213148928034e-05, + "loss": 1.7884, + "step": 14298 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.3059331476688385, + "learning_rate": 6.220731140670235e-05, + "loss": 1.7377, + "step": 14299 + }, + { + "epoch": 4.389195825659914, + "grad_norm": 0.21544015407562256, + "learning_rate": 6.220249120348457e-05, + "loss": 1.6818, + "step": 14300 + }, + { + "epoch": 4.389502762430939, + "grad_norm": 0.3112640380859375, + "learning_rate": 6.219767087967461e-05, + "loss": 1.72, + "step": 14301 + }, + { + "epoch": 4.389809699201964, + "grad_norm": 0.2572654187679291, + "learning_rate": 6.219285043532011e-05, + "loss": 1.793, + "step": 14302 + }, + { + "epoch": 4.39011663597299, + "grad_norm": 0.2621476948261261, + "learning_rate": 6.218802987046874e-05, + "loss": 1.8301, + "step": 14303 + }, + { + "epoch": 4.390423572744015, + "grad_norm": 0.2592658996582031, + "learning_rate": 6.218320918516809e-05, + "loss": 1.7219, + "step": 14304 + }, + { + "epoch": 4.3907305095150395, + "grad_norm": 0.25503265857696533, + "learning_rate": 6.217838837946584e-05, + "loss": 1.8149, + "step": 14305 + }, + { + "epoch": 4.391037446286065, + "grad_norm": 0.21944166719913483, + "learning_rate": 6.217356745340962e-05, + "loss": 1.7174, + "step": 14306 + }, + { + "epoch": 4.39134438305709, + "grad_norm": 0.2937396466732025, + "learning_rate": 6.216874640704707e-05, + "loss": 1.8562, + "step": 14307 + }, + { + "epoch": 4.3916513198281155, + "grad_norm": 0.22520211338996887, + "learning_rate": 6.216392524042581e-05, + "loss": 1.7701, + "step": 14308 + }, + { + "epoch": 4.391958256599141, + "grad_norm": 0.24397830665111542, + "learning_rate": 6.215910395359355e-05, + "loss": 1.7794, + "step": 14309 + }, + { + "epoch": 4.392265193370166, + 
"grad_norm": 0.2867623567581177, + "learning_rate": 6.215428254659788e-05, + "loss": 1.7275, + "step": 14310 + }, + { + "epoch": 4.392572130141191, + "grad_norm": 0.2632426917552948, + "learning_rate": 6.214946101948648e-05, + "loss": 1.7919, + "step": 14311 + }, + { + "epoch": 4.392879066912216, + "grad_norm": 0.23146092891693115, + "learning_rate": 6.214463937230696e-05, + "loss": 1.744, + "step": 14312 + }, + { + "epoch": 4.393186003683241, + "grad_norm": 0.21877676248550415, + "learning_rate": 6.213981760510701e-05, + "loss": 1.7577, + "step": 14313 + }, + { + "epoch": 4.393492940454267, + "grad_norm": 0.2320399284362793, + "learning_rate": 6.213499571793426e-05, + "loss": 1.7864, + "step": 14314 + }, + { + "epoch": 4.393799877225292, + "grad_norm": 0.2951548993587494, + "learning_rate": 6.213017371083638e-05, + "loss": 1.8257, + "step": 14315 + }, + { + "epoch": 4.394106813996316, + "grad_norm": 0.26062941551208496, + "learning_rate": 6.212535158386102e-05, + "loss": 1.7448, + "step": 14316 + }, + { + "epoch": 4.394413750767342, + "grad_norm": 0.24760986864566803, + "learning_rate": 6.21205293370558e-05, + "loss": 1.7902, + "step": 14317 + }, + { + "epoch": 4.394720687538367, + "grad_norm": 0.2686399221420288, + "learning_rate": 6.211570697046844e-05, + "loss": 1.8209, + "step": 14318 + }, + { + "epoch": 4.395027624309392, + "grad_norm": 0.2599134147167206, + "learning_rate": 6.211088448414653e-05, + "loss": 1.8231, + "step": 14319 + }, + { + "epoch": 4.395334561080418, + "grad_norm": 0.254044771194458, + "learning_rate": 6.210606187813778e-05, + "loss": 1.806, + "step": 14320 + }, + { + "epoch": 4.395641497851442, + "grad_norm": 0.262229323387146, + "learning_rate": 6.210123915248982e-05, + "loss": 1.7857, + "step": 14321 + }, + { + "epoch": 4.3959484346224675, + "grad_norm": 0.2849259078502655, + "learning_rate": 6.209641630725033e-05, + "loss": 1.8005, + "step": 14322 + }, + { + "epoch": 4.396255371393493, + "grad_norm": 0.35480254888534546, + 
"learning_rate": 6.209159334246697e-05, + "loss": 1.8189, + "step": 14323 + }, + { + "epoch": 4.396562308164518, + "grad_norm": 0.2599184215068817, + "learning_rate": 6.20867702581874e-05, + "loss": 1.7384, + "step": 14324 + }, + { + "epoch": 4.3968692449355435, + "grad_norm": 0.23994222283363342, + "learning_rate": 6.208194705445926e-05, + "loss": 1.7566, + "step": 14325 + }, + { + "epoch": 4.397176181706568, + "grad_norm": 0.24361753463745117, + "learning_rate": 6.207712373133024e-05, + "loss": 1.6965, + "step": 14326 + }, + { + "epoch": 4.397483118477593, + "grad_norm": 0.23925161361694336, + "learning_rate": 6.207230028884803e-05, + "loss": 1.7596, + "step": 14327 + }, + { + "epoch": 4.397790055248619, + "grad_norm": 0.24365897476673126, + "learning_rate": 6.206747672706025e-05, + "loss": 1.7951, + "step": 14328 + }, + { + "epoch": 4.398096992019644, + "grad_norm": 0.25245413184165955, + "learning_rate": 6.206265304601461e-05, + "loss": 1.8086, + "step": 14329 + }, + { + "epoch": 4.398403928790669, + "grad_norm": 0.24272513389587402, + "learning_rate": 6.205782924575874e-05, + "loss": 1.8148, + "step": 14330 + }, + { + "epoch": 4.398710865561695, + "grad_norm": 0.21299590170383453, + "learning_rate": 6.205300532634036e-05, + "loss": 1.7666, + "step": 14331 + }, + { + "epoch": 4.399017802332719, + "grad_norm": 0.23543189465999603, + "learning_rate": 6.20481812878071e-05, + "loss": 1.7629, + "step": 14332 + }, + { + "epoch": 4.399324739103744, + "grad_norm": 0.2284495085477829, + "learning_rate": 6.204335713020665e-05, + "loss": 1.768, + "step": 14333 + }, + { + "epoch": 4.39963167587477, + "grad_norm": 0.23158542811870575, + "learning_rate": 6.20385328535867e-05, + "loss": 1.7761, + "step": 14334 + }, + { + "epoch": 4.399938612645795, + "grad_norm": 0.2378150224685669, + "learning_rate": 6.20337084579949e-05, + "loss": 1.8483, + "step": 14335 + }, + { + "epoch": 4.4002455494168204, + "grad_norm": 0.2407436966896057, + "learning_rate": 6.202888394347892e-05, + 
"loss": 1.7364, + "step": 14336 + }, + { + "epoch": 4.400552486187845, + "grad_norm": 0.256259560585022, + "learning_rate": 6.202405931008649e-05, + "loss": 1.7376, + "step": 14337 + }, + { + "epoch": 4.40085942295887, + "grad_norm": 0.29293057322502136, + "learning_rate": 6.201923455786524e-05, + "loss": 1.7493, + "step": 14338 + }, + { + "epoch": 4.401166359729896, + "grad_norm": 0.24025334417819977, + "learning_rate": 6.201440968686288e-05, + "loss": 1.7522, + "step": 14339 + }, + { + "epoch": 4.401473296500921, + "grad_norm": 0.3215656280517578, + "learning_rate": 6.200958469712708e-05, + "loss": 1.7748, + "step": 14340 + }, + { + "epoch": 4.401780233271946, + "grad_norm": 0.43553170561790466, + "learning_rate": 6.200475958870553e-05, + "loss": 1.771, + "step": 14341 + }, + { + "epoch": 4.402087170042972, + "grad_norm": 0.3112131953239441, + "learning_rate": 6.19999343616459e-05, + "loss": 1.7655, + "step": 14342 + }, + { + "epoch": 4.402394106813996, + "grad_norm": 0.25197842717170715, + "learning_rate": 6.199510901599589e-05, + "loss": 1.7214, + "step": 14343 + }, + { + "epoch": 4.402701043585021, + "grad_norm": 0.33227142691612244, + "learning_rate": 6.19902835518032e-05, + "loss": 1.7332, + "step": 14344 + }, + { + "epoch": 4.403007980356047, + "grad_norm": 0.27962982654571533, + "learning_rate": 6.198545796911548e-05, + "loss": 1.6943, + "step": 14345 + }, + { + "epoch": 4.403314917127072, + "grad_norm": 0.24374182522296906, + "learning_rate": 6.198063226798044e-05, + "loss": 1.7222, + "step": 14346 + }, + { + "epoch": 4.403621853898097, + "grad_norm": 0.3101944625377655, + "learning_rate": 6.197580644844576e-05, + "loss": 1.7113, + "step": 14347 + }, + { + "epoch": 4.403928790669122, + "grad_norm": 0.25919321179389954, + "learning_rate": 6.197098051055916e-05, + "loss": 1.71, + "step": 14348 + }, + { + "epoch": 4.404235727440147, + "grad_norm": 0.23140330612659454, + "learning_rate": 6.19661544543683e-05, + "loss": 1.7472, + "step": 14349 + }, + { + 
"epoch": 4.4045426642111725, + "grad_norm": 0.3274286687374115, + "learning_rate": 6.19613282799209e-05, + "loss": 1.7093, + "step": 14350 + }, + { + "epoch": 4.404849600982198, + "grad_norm": 0.3187442123889923, + "learning_rate": 6.195650198726464e-05, + "loss": 1.7488, + "step": 14351 + }, + { + "epoch": 4.405156537753223, + "grad_norm": 0.20547433197498322, + "learning_rate": 6.195167557644722e-05, + "loss": 1.7295, + "step": 14352 + }, + { + "epoch": 4.4054634745242485, + "grad_norm": 0.2623414993286133, + "learning_rate": 6.194684904751633e-05, + "loss": 1.8258, + "step": 14353 + }, + { + "epoch": 4.405770411295273, + "grad_norm": 0.2468457818031311, + "learning_rate": 6.194202240051967e-05, + "loss": 1.6957, + "step": 14354 + }, + { + "epoch": 4.406077348066298, + "grad_norm": 0.2082364559173584, + "learning_rate": 6.193719563550496e-05, + "loss": 1.7596, + "step": 14355 + }, + { + "epoch": 4.406384284837324, + "grad_norm": 0.27072983980178833, + "learning_rate": 6.193236875251988e-05, + "loss": 1.7341, + "step": 14356 + }, + { + "epoch": 4.406691221608349, + "grad_norm": 0.2630362808704376, + "learning_rate": 6.192754175161215e-05, + "loss": 1.7664, + "step": 14357 + }, + { + "epoch": 4.406998158379374, + "grad_norm": 0.25400006771087646, + "learning_rate": 6.192271463282944e-05, + "loss": 1.7582, + "step": 14358 + }, + { + "epoch": 4.407305095150399, + "grad_norm": 0.22256311774253845, + "learning_rate": 6.191788739621949e-05, + "loss": 1.7389, + "step": 14359 + }, + { + "epoch": 4.407612031921424, + "grad_norm": 0.2160387486219406, + "learning_rate": 6.191306004182999e-05, + "loss": 1.7051, + "step": 14360 + }, + { + "epoch": 4.407918968692449, + "grad_norm": 0.20665684342384338, + "learning_rate": 6.190823256970865e-05, + "loss": 1.7606, + "step": 14361 + }, + { + "epoch": 4.408225905463475, + "grad_norm": 0.2173188328742981, + "learning_rate": 6.190340497990318e-05, + "loss": 1.7944, + "step": 14362 + }, + { + "epoch": 4.4085328422345, + "grad_norm": 
0.189287930727005, + "learning_rate": 6.189857727246127e-05, + "loss": 1.7283, + "step": 14363 + }, + { + "epoch": 4.4088397790055245, + "grad_norm": 0.2531645596027374, + "learning_rate": 6.189374944743065e-05, + "loss": 1.7554, + "step": 14364 + }, + { + "epoch": 4.40914671577655, + "grad_norm": 0.25439125299453735, + "learning_rate": 6.188892150485903e-05, + "loss": 1.8032, + "step": 14365 + }, + { + "epoch": 4.409453652547575, + "grad_norm": 0.20938685536384583, + "learning_rate": 6.188409344479412e-05, + "loss": 1.7385, + "step": 14366 + }, + { + "epoch": 4.4097605893186005, + "grad_norm": 0.20471477508544922, + "learning_rate": 6.187926526728364e-05, + "loss": 1.7487, + "step": 14367 + }, + { + "epoch": 4.410067526089626, + "grad_norm": 0.2381851226091385, + "learning_rate": 6.187443697237529e-05, + "loss": 1.7443, + "step": 14368 + }, + { + "epoch": 4.41037446286065, + "grad_norm": 0.21584098041057587, + "learning_rate": 6.18696085601168e-05, + "loss": 1.7818, + "step": 14369 + }, + { + "epoch": 4.410681399631676, + "grad_norm": 0.2575368583202362, + "learning_rate": 6.186478003055587e-05, + "loss": 1.8204, + "step": 14370 + }, + { + "epoch": 4.410988336402701, + "grad_norm": 0.21133238077163696, + "learning_rate": 6.185995138374024e-05, + "loss": 1.7274, + "step": 14371 + }, + { + "epoch": 4.411295273173726, + "grad_norm": 0.24918322265148163, + "learning_rate": 6.18551226197176e-05, + "loss": 1.8021, + "step": 14372 + }, + { + "epoch": 4.411602209944752, + "grad_norm": 0.2253655642271042, + "learning_rate": 6.185029373853572e-05, + "loss": 1.7308, + "step": 14373 + }, + { + "epoch": 4.411909146715777, + "grad_norm": 0.20098713040351868, + "learning_rate": 6.184546474024226e-05, + "loss": 1.7549, + "step": 14374 + }, + { + "epoch": 4.412216083486801, + "grad_norm": 0.25612789392471313, + "learning_rate": 6.1840635624885e-05, + "loss": 1.8305, + "step": 14375 + }, + { + "epoch": 4.412523020257827, + "grad_norm": 0.24287539720535278, + "learning_rate": 
6.183580639251164e-05, + "loss": 1.7339, + "step": 14376 + }, + { + "epoch": 4.412829957028852, + "grad_norm": 0.2304944545030594, + "learning_rate": 6.183097704316988e-05, + "loss": 1.7023, + "step": 14377 + }, + { + "epoch": 4.413136893799877, + "grad_norm": 0.21911773085594177, + "learning_rate": 6.18261475769075e-05, + "loss": 1.7305, + "step": 14378 + }, + { + "epoch": 4.413443830570903, + "grad_norm": 0.24207864701747894, + "learning_rate": 6.182131799377217e-05, + "loss": 1.7318, + "step": 14379 + }, + { + "epoch": 4.413750767341927, + "grad_norm": 0.2551634609699249, + "learning_rate": 6.181648829381165e-05, + "loss": 1.8101, + "step": 14380 + }, + { + "epoch": 4.4140577041129525, + "grad_norm": 0.4114011526107788, + "learning_rate": 6.181165847707368e-05, + "loss": 1.772, + "step": 14381 + }, + { + "epoch": 4.414364640883978, + "grad_norm": 0.4592796862125397, + "learning_rate": 6.180682854360598e-05, + "loss": 1.7359, + "step": 14382 + }, + { + "epoch": 4.414671577655003, + "grad_norm": 0.2599259614944458, + "learning_rate": 6.180199849345627e-05, + "loss": 1.7028, + "step": 14383 + }, + { + "epoch": 4.4149785144260285, + "grad_norm": 0.3489506244659424, + "learning_rate": 6.17971683266723e-05, + "loss": 1.8252, + "step": 14384 + }, + { + "epoch": 4.415285451197054, + "grad_norm": 0.44563809037208557, + "learning_rate": 6.179233804330179e-05, + "loss": 1.6894, + "step": 14385 + }, + { + "epoch": 4.415592387968078, + "grad_norm": 0.2596888542175293, + "learning_rate": 6.17875076433925e-05, + "loss": 1.8141, + "step": 14386 + }, + { + "epoch": 4.415899324739104, + "grad_norm": 0.3560626804828644, + "learning_rate": 6.178267712699213e-05, + "loss": 1.7764, + "step": 14387 + }, + { + "epoch": 4.416206261510129, + "grad_norm": 0.3746717572212219, + "learning_rate": 6.177784649414843e-05, + "loss": 1.7528, + "step": 14388 + }, + { + "epoch": 4.416513198281154, + "grad_norm": 0.23248885571956635, + "learning_rate": 6.177301574490918e-05, + "loss": 1.7148, + 
"step": 14389 + }, + { + "epoch": 4.41682013505218, + "grad_norm": 0.26936978101730347, + "learning_rate": 6.176818487932208e-05, + "loss": 1.7199, + "step": 14390 + }, + { + "epoch": 4.417127071823204, + "grad_norm": 0.3102504014968872, + "learning_rate": 6.176335389743486e-05, + "loss": 1.6886, + "step": 14391 + }, + { + "epoch": 4.417434008594229, + "grad_norm": 0.24406832456588745, + "learning_rate": 6.175852279929531e-05, + "loss": 1.7766, + "step": 14392 + }, + { + "epoch": 4.417740945365255, + "grad_norm": 0.271158903837204, + "learning_rate": 6.175369158495112e-05, + "loss": 1.8099, + "step": 14393 + }, + { + "epoch": 4.41804788213628, + "grad_norm": 0.343667209148407, + "learning_rate": 6.174886025445008e-05, + "loss": 1.779, + "step": 14394 + }, + { + "epoch": 4.418354818907305, + "grad_norm": 0.37423139810562134, + "learning_rate": 6.17440288078399e-05, + "loss": 1.7796, + "step": 14395 + }, + { + "epoch": 4.41866175567833, + "grad_norm": 0.3152335286140442, + "learning_rate": 6.173919724516836e-05, + "loss": 1.7388, + "step": 14396 + }, + { + "epoch": 4.418968692449355, + "grad_norm": 0.21467824280261993, + "learning_rate": 6.173436556648319e-05, + "loss": 1.7689, + "step": 14397 + }, + { + "epoch": 4.4192756292203805, + "grad_norm": 0.2861369848251343, + "learning_rate": 6.172953377183213e-05, + "loss": 1.819, + "step": 14398 + }, + { + "epoch": 4.419582565991406, + "grad_norm": 0.34777504205703735, + "learning_rate": 6.172470186126295e-05, + "loss": 1.7444, + "step": 14399 + }, + { + "epoch": 4.419889502762431, + "grad_norm": 0.2728833854198456, + "learning_rate": 6.171986983482339e-05, + "loss": 1.7637, + "step": 14400 + }, + { + "epoch": 4.420196439533456, + "grad_norm": 0.2593914270401001, + "learning_rate": 6.17150376925612e-05, + "loss": 1.8196, + "step": 14401 + }, + { + "epoch": 4.420503376304481, + "grad_norm": 0.29425305128097534, + "learning_rate": 6.171020543452416e-05, + "loss": 1.7511, + "step": 14402 + }, + { + "epoch": 
4.420810313075506, + "grad_norm": 0.2587110102176666, + "learning_rate": 6.170537306076e-05, + "loss": 1.8085, + "step": 14403 + }, + { + "epoch": 4.421117249846532, + "grad_norm": 0.22442933917045593, + "learning_rate": 6.170054057131648e-05, + "loss": 1.8023, + "step": 14404 + }, + { + "epoch": 4.421424186617557, + "grad_norm": 0.23302629590034485, + "learning_rate": 6.169570796624136e-05, + "loss": 1.7995, + "step": 14405 + }, + { + "epoch": 4.421731123388582, + "grad_norm": 0.2295885682106018, + "learning_rate": 6.169087524558239e-05, + "loss": 1.7948, + "step": 14406 + }, + { + "epoch": 4.422038060159607, + "grad_norm": 0.2161262482404709, + "learning_rate": 6.168604240938735e-05, + "loss": 1.7159, + "step": 14407 + }, + { + "epoch": 4.422344996930632, + "grad_norm": 0.20746205747127533, + "learning_rate": 6.1681209457704e-05, + "loss": 1.7703, + "step": 14408 + }, + { + "epoch": 4.422651933701657, + "grad_norm": 0.25677376985549927, + "learning_rate": 6.167637639058006e-05, + "loss": 1.7819, + "step": 14409 + }, + { + "epoch": 4.422958870472683, + "grad_norm": 0.226568341255188, + "learning_rate": 6.167154320806336e-05, + "loss": 1.7661, + "step": 14410 + }, + { + "epoch": 4.423265807243708, + "grad_norm": 0.22997824847698212, + "learning_rate": 6.166670991020162e-05, + "loss": 1.7364, + "step": 14411 + }, + { + "epoch": 4.4235727440147325, + "grad_norm": 0.2528770864009857, + "learning_rate": 6.166187649704261e-05, + "loss": 1.8505, + "step": 14412 + }, + { + "epoch": 4.423879680785758, + "grad_norm": 0.27278614044189453, + "learning_rate": 6.165704296863409e-05, + "loss": 1.7855, + "step": 14413 + }, + { + "epoch": 4.424186617556783, + "grad_norm": 0.23086364567279816, + "learning_rate": 6.165220932502385e-05, + "loss": 1.7489, + "step": 14414 + }, + { + "epoch": 4.4244935543278086, + "grad_norm": 0.2570587396621704, + "learning_rate": 6.164737556625965e-05, + "loss": 1.8008, + "step": 14415 + }, + { + "epoch": 4.424800491098834, + "grad_norm": 
0.2637264132499695, + "learning_rate": 6.164254169238923e-05, + "loss": 1.7563, + "step": 14416 + }, + { + "epoch": 4.425107427869859, + "grad_norm": 0.23046623170375824, + "learning_rate": 6.163770770346043e-05, + "loss": 1.7433, + "step": 14417 + }, + { + "epoch": 4.425414364640884, + "grad_norm": 0.2531467080116272, + "learning_rate": 6.163287359952095e-05, + "loss": 1.8122, + "step": 14418 + }, + { + "epoch": 4.425721301411909, + "grad_norm": 0.26507216691970825, + "learning_rate": 6.162803938061861e-05, + "loss": 1.7019, + "step": 14419 + }, + { + "epoch": 4.426028238182934, + "grad_norm": 0.229641854763031, + "learning_rate": 6.162320504680117e-05, + "loss": 1.7518, + "step": 14420 + }, + { + "epoch": 4.42633517495396, + "grad_norm": 0.22777152061462402, + "learning_rate": 6.161837059811641e-05, + "loss": 1.8094, + "step": 14421 + }, + { + "epoch": 4.426642111724985, + "grad_norm": 0.22121338546276093, + "learning_rate": 6.161353603461209e-05, + "loss": 1.7204, + "step": 14422 + }, + { + "epoch": 4.4269490484960095, + "grad_norm": 0.21914128959178925, + "learning_rate": 6.1608701356336e-05, + "loss": 1.7554, + "step": 14423 + }, + { + "epoch": 4.427255985267035, + "grad_norm": 0.22649390995502472, + "learning_rate": 6.160386656333593e-05, + "loss": 1.8058, + "step": 14424 + }, + { + "epoch": 4.42756292203806, + "grad_norm": 0.24529023468494415, + "learning_rate": 6.159903165565964e-05, + "loss": 1.7302, + "step": 14425 + }, + { + "epoch": 4.4278698588090855, + "grad_norm": 0.2726481854915619, + "learning_rate": 6.159419663335492e-05, + "loss": 1.825, + "step": 14426 + }, + { + "epoch": 4.428176795580111, + "grad_norm": 0.2772440016269684, + "learning_rate": 6.158936149646957e-05, + "loss": 1.7322, + "step": 14427 + }, + { + "epoch": 4.428483732351136, + "grad_norm": 0.29778853058815, + "learning_rate": 6.158452624505135e-05, + "loss": 1.7421, + "step": 14428 + }, + { + "epoch": 4.428790669122161, + "grad_norm": 0.21327480673789978, + "learning_rate": 
6.157969087914804e-05, + "loss": 1.7269, + "step": 14429 + }, + { + "epoch": 4.429097605893186, + "grad_norm": 0.2718868851661682, + "learning_rate": 6.157485539880744e-05, + "loss": 1.7817, + "step": 14430 + }, + { + "epoch": 4.429404542664211, + "grad_norm": 0.32242509722709656, + "learning_rate": 6.157001980407735e-05, + "loss": 1.7115, + "step": 14431 + }, + { + "epoch": 4.429711479435237, + "grad_norm": 0.2931978106498718, + "learning_rate": 6.156518409500553e-05, + "loss": 1.7822, + "step": 14432 + }, + { + "epoch": 4.430018416206262, + "grad_norm": 0.229528546333313, + "learning_rate": 6.156034827163977e-05, + "loss": 1.7623, + "step": 14433 + }, + { + "epoch": 4.430325352977286, + "grad_norm": 0.28702354431152344, + "learning_rate": 6.15555123340279e-05, + "loss": 1.8101, + "step": 14434 + }, + { + "epoch": 4.430632289748312, + "grad_norm": 0.27162131667137146, + "learning_rate": 6.155067628221766e-05, + "loss": 1.7525, + "step": 14435 + }, + { + "epoch": 4.430939226519337, + "grad_norm": 0.24290388822555542, + "learning_rate": 6.154584011625688e-05, + "loss": 1.8701, + "step": 14436 + }, + { + "epoch": 4.431246163290362, + "grad_norm": 0.3055405020713806, + "learning_rate": 6.154100383619334e-05, + "loss": 1.8659, + "step": 14437 + }, + { + "epoch": 4.431553100061388, + "grad_norm": 0.24528950452804565, + "learning_rate": 6.153616744207483e-05, + "loss": 1.8493, + "step": 14438 + }, + { + "epoch": 4.431860036832412, + "grad_norm": 0.2611897587776184, + "learning_rate": 6.153133093394917e-05, + "loss": 1.7905, + "step": 14439 + }, + { + "epoch": 4.4321669736034375, + "grad_norm": 0.2172730267047882, + "learning_rate": 6.15264943118641e-05, + "loss": 1.7087, + "step": 14440 + }, + { + "epoch": 4.432473910374463, + "grad_norm": 0.2320949286222458, + "learning_rate": 6.152165757586749e-05, + "loss": 1.7473, + "step": 14441 + }, + { + "epoch": 4.432780847145488, + "grad_norm": 0.2602086365222931, + "learning_rate": 6.15168207260071e-05, + "loss": 1.7365, + 
"step": 14442 + }, + { + "epoch": 4.4330877839165135, + "grad_norm": 0.25193190574645996, + "learning_rate": 6.151198376233074e-05, + "loss": 1.8205, + "step": 14443 + }, + { + "epoch": 4.433394720687538, + "grad_norm": 0.2894204556941986, + "learning_rate": 6.150714668488621e-05, + "loss": 1.7759, + "step": 14444 + }, + { + "epoch": 4.433701657458563, + "grad_norm": 0.24150310456752777, + "learning_rate": 6.150230949372131e-05, + "loss": 1.8415, + "step": 14445 + }, + { + "epoch": 4.434008594229589, + "grad_norm": 0.23475918173789978, + "learning_rate": 6.149747218888384e-05, + "loss": 1.7487, + "step": 14446 + }, + { + "epoch": 4.434315531000614, + "grad_norm": 0.29425546526908875, + "learning_rate": 6.149263477042162e-05, + "loss": 1.7538, + "step": 14447 + }, + { + "epoch": 4.434622467771639, + "grad_norm": 0.26241615414619446, + "learning_rate": 6.148779723838244e-05, + "loss": 1.7564, + "step": 14448 + }, + { + "epoch": 4.434929404542665, + "grad_norm": 0.23195287585258484, + "learning_rate": 6.148295959281411e-05, + "loss": 1.837, + "step": 14449 + }, + { + "epoch": 4.435236341313689, + "grad_norm": 0.34972792863845825, + "learning_rate": 6.147812183376445e-05, + "loss": 1.7632, + "step": 14450 + }, + { + "epoch": 4.435543278084714, + "grad_norm": 0.3536125719547272, + "learning_rate": 6.147328396128126e-05, + "loss": 1.8372, + "step": 14451 + }, + { + "epoch": 4.43585021485574, + "grad_norm": 0.2086079865694046, + "learning_rate": 6.146844597541235e-05, + "loss": 1.7014, + "step": 14452 + }, + { + "epoch": 4.436157151626765, + "grad_norm": 0.25547802448272705, + "learning_rate": 6.146360787620554e-05, + "loss": 1.7544, + "step": 14453 + }, + { + "epoch": 4.43646408839779, + "grad_norm": 0.26176998019218445, + "learning_rate": 6.145876966370864e-05, + "loss": 1.7617, + "step": 14454 + }, + { + "epoch": 4.436771025168815, + "grad_norm": 0.2672959566116333, + "learning_rate": 6.145393133796946e-05, + "loss": 1.8178, + "step": 14455 + }, + { + "epoch": 
4.43707796193984, + "grad_norm": 0.23373909294605255, + "learning_rate": 6.144909289903582e-05, + "loss": 1.7295, + "step": 14456 + }, + { + "epoch": 4.4373848987108655, + "grad_norm": 0.2369835078716278, + "learning_rate": 6.144425434695551e-05, + "loss": 1.8097, + "step": 14457 + }, + { + "epoch": 4.437691835481891, + "grad_norm": 0.25528979301452637, + "learning_rate": 6.14394156817764e-05, + "loss": 1.7523, + "step": 14458 + }, + { + "epoch": 4.437998772252916, + "grad_norm": 0.2541787624359131, + "learning_rate": 6.143457690354626e-05, + "loss": 1.7606, + "step": 14459 + }, + { + "epoch": 4.4383057090239415, + "grad_norm": 0.2032637745141983, + "learning_rate": 6.142973801231295e-05, + "loss": 1.7967, + "step": 14460 + }, + { + "epoch": 4.438612645794966, + "grad_norm": 0.2413996160030365, + "learning_rate": 6.142489900812426e-05, + "loss": 1.7688, + "step": 14461 + }, + { + "epoch": 4.438919582565991, + "grad_norm": 0.43451038002967834, + "learning_rate": 6.142005989102803e-05, + "loss": 1.8269, + "step": 14462 + }, + { + "epoch": 4.439226519337017, + "grad_norm": 0.23981481790542603, + "learning_rate": 6.141522066107206e-05, + "loss": 1.7628, + "step": 14463 + }, + { + "epoch": 4.439533456108042, + "grad_norm": 0.25396493077278137, + "learning_rate": 6.14103813183042e-05, + "loss": 1.7913, + "step": 14464 + }, + { + "epoch": 4.439840392879067, + "grad_norm": 0.2567536532878876, + "learning_rate": 6.140554186277225e-05, + "loss": 1.7612, + "step": 14465 + }, + { + "epoch": 4.440147329650092, + "grad_norm": 0.2201337069272995, + "learning_rate": 6.140070229452406e-05, + "loss": 1.7541, + "step": 14466 + }, + { + "epoch": 4.440454266421117, + "grad_norm": 0.24202953279018402, + "learning_rate": 6.139586261360746e-05, + "loss": 1.777, + "step": 14467 + }, + { + "epoch": 4.440761203192142, + "grad_norm": 0.23891687393188477, + "learning_rate": 6.139102282007024e-05, + "loss": 1.7509, + "step": 14468 + }, + { + "epoch": 4.441068139963168, + "grad_norm": 
0.21132555603981018, + "learning_rate": 6.138618291396026e-05, + "loss": 1.7362, + "step": 14469 + }, + { + "epoch": 4.441375076734193, + "grad_norm": 0.2731861472129822, + "learning_rate": 6.138134289532536e-05, + "loss": 1.8063, + "step": 14470 + }, + { + "epoch": 4.4416820135052175, + "grad_norm": 0.29503315687179565, + "learning_rate": 6.137650276421336e-05, + "loss": 1.7193, + "step": 14471 + }, + { + "epoch": 4.441988950276243, + "grad_norm": 0.2778526544570923, + "learning_rate": 6.137166252067208e-05, + "loss": 1.7507, + "step": 14472 + }, + { + "epoch": 4.442295887047268, + "grad_norm": 0.2907710075378418, + "learning_rate": 6.136682216474938e-05, + "loss": 1.7939, + "step": 14473 + }, + { + "epoch": 4.4426028238182935, + "grad_norm": 0.4133768379688263, + "learning_rate": 6.136198169649306e-05, + "loss": 1.8012, + "step": 14474 + }, + { + "epoch": 4.442909760589319, + "grad_norm": 0.2505052983760834, + "learning_rate": 6.135714111595099e-05, + "loss": 1.8426, + "step": 14475 + }, + { + "epoch": 4.443216697360343, + "grad_norm": 0.3884379267692566, + "learning_rate": 6.135230042317099e-05, + "loss": 1.7383, + "step": 14476 + }, + { + "epoch": 4.443523634131369, + "grad_norm": 0.42902377247810364, + "learning_rate": 6.134745961820091e-05, + "loss": 1.732, + "step": 14477 + }, + { + "epoch": 4.443830570902394, + "grad_norm": 0.21782708168029785, + "learning_rate": 6.134261870108858e-05, + "loss": 1.7369, + "step": 14478 + }, + { + "epoch": 4.444137507673419, + "grad_norm": 0.4160648286342621, + "learning_rate": 6.133777767188186e-05, + "loss": 1.8083, + "step": 14479 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.5057216882705688, + "learning_rate": 6.133293653062856e-05, + "loss": 1.8971, + "step": 14480 + }, + { + "epoch": 4.44475138121547, + "grad_norm": 0.2189750075340271, + "learning_rate": 6.132809527737654e-05, + "loss": 1.7508, + "step": 14481 + }, + { + "epoch": 4.445058317986494, + "grad_norm": 0.4415782392024994, + "learning_rate": 
6.132325391217364e-05, + "loss": 1.8548, + "step": 14482 + }, + { + "epoch": 4.44536525475752, + "grad_norm": 0.3907296359539032, + "learning_rate": 6.13184124350677e-05, + "loss": 1.7879, + "step": 14483 + }, + { + "epoch": 4.445672191528545, + "grad_norm": 0.24117955565452576, + "learning_rate": 6.131357084610659e-05, + "loss": 1.7227, + "step": 14484 + }, + { + "epoch": 4.44597912829957, + "grad_norm": 0.3083679974079132, + "learning_rate": 6.130872914533815e-05, + "loss": 1.7505, + "step": 14485 + }, + { + "epoch": 4.446286065070596, + "grad_norm": 0.27730658650398254, + "learning_rate": 6.13038873328102e-05, + "loss": 1.7485, + "step": 14486 + }, + { + "epoch": 4.44659300184162, + "grad_norm": 0.28548410534858704, + "learning_rate": 6.12990454085706e-05, + "loss": 1.8145, + "step": 14487 + }, + { + "epoch": 4.4468999386126455, + "grad_norm": 0.24743106961250305, + "learning_rate": 6.129420337266724e-05, + "loss": 1.7131, + "step": 14488 + }, + { + "epoch": 4.447206875383671, + "grad_norm": 0.2899693250656128, + "learning_rate": 6.128936122514794e-05, + "loss": 1.8567, + "step": 14489 + }, + { + "epoch": 4.447513812154696, + "grad_norm": 0.259916752576828, + "learning_rate": 6.128451896606053e-05, + "loss": 1.7563, + "step": 14490 + }, + { + "epoch": 4.4478207489257215, + "grad_norm": 0.21112586557865143, + "learning_rate": 6.12796765954529e-05, + "loss": 1.6975, + "step": 14491 + }, + { + "epoch": 4.448127685696747, + "grad_norm": 0.2890239953994751, + "learning_rate": 6.12748341133729e-05, + "loss": 1.7904, + "step": 14492 + }, + { + "epoch": 4.448434622467771, + "grad_norm": 0.23394012451171875, + "learning_rate": 6.126999151986839e-05, + "loss": 1.7559, + "step": 14493 + }, + { + "epoch": 4.448741559238797, + "grad_norm": 0.3492949903011322, + "learning_rate": 6.12651488149872e-05, + "loss": 1.7734, + "step": 14494 + }, + { + "epoch": 4.449048496009822, + "grad_norm": 0.48309218883514404, + "learning_rate": 6.126030599877723e-05, + "loss": 1.7798, + "step": 
14495 + }, + { + "epoch": 4.449355432780847, + "grad_norm": 0.341146320104599, + "learning_rate": 6.12554630712863e-05, + "loss": 1.7921, + "step": 14496 + }, + { + "epoch": 4.449662369551873, + "grad_norm": 0.223160982131958, + "learning_rate": 6.125062003256229e-05, + "loss": 1.7784, + "step": 14497 + }, + { + "epoch": 4.449969306322897, + "grad_norm": 0.32664811611175537, + "learning_rate": 6.124577688265306e-05, + "loss": 1.7353, + "step": 14498 + }, + { + "epoch": 4.4502762430939224, + "grad_norm": 0.215936541557312, + "learning_rate": 6.124093362160646e-05, + "loss": 1.68, + "step": 14499 + }, + { + "epoch": 4.450583179864948, + "grad_norm": 0.26081225275993347, + "learning_rate": 6.123609024947038e-05, + "loss": 1.7107, + "step": 14500 + }, + { + "epoch": 4.450890116635973, + "grad_norm": 0.3124069571495056, + "learning_rate": 6.123124676629267e-05, + "loss": 1.7338, + "step": 14501 + }, + { + "epoch": 4.4511970534069984, + "grad_norm": 0.23125620186328888, + "learning_rate": 6.122640317212118e-05, + "loss": 1.7842, + "step": 14502 + }, + { + "epoch": 4.451503990178024, + "grad_norm": 0.27065595984458923, + "learning_rate": 6.122155946700381e-05, + "loss": 1.7284, + "step": 14503 + }, + { + "epoch": 4.451810926949048, + "grad_norm": 0.4677436053752899, + "learning_rate": 6.121671565098841e-05, + "loss": 1.8156, + "step": 14504 + }, + { + "epoch": 4.452117863720074, + "grad_norm": 0.36325082182884216, + "learning_rate": 6.121187172412285e-05, + "loss": 1.7875, + "step": 14505 + }, + { + "epoch": 4.452424800491099, + "grad_norm": 0.23409567773342133, + "learning_rate": 6.1207027686455e-05, + "loss": 1.7421, + "step": 14506 + }, + { + "epoch": 4.452731737262124, + "grad_norm": 0.36919257044792175, + "learning_rate": 6.120218353803273e-05, + "loss": 1.7545, + "step": 14507 + }, + { + "epoch": 4.45303867403315, + "grad_norm": 0.318452388048172, + "learning_rate": 6.119733927890393e-05, + "loss": 1.7179, + "step": 14508 + }, + { + "epoch": 4.453345610804174, + 
"grad_norm": 0.21279768645763397, + "learning_rate": 6.119249490911643e-05, + "loss": 1.7534, + "step": 14509 + }, + { + "epoch": 4.453652547575199, + "grad_norm": 0.30565473437309265, + "learning_rate": 6.118765042871816e-05, + "loss": 1.7962, + "step": 14510 + }, + { + "epoch": 4.453959484346225, + "grad_norm": 0.2608480453491211, + "learning_rate": 6.118280583775697e-05, + "loss": 1.7336, + "step": 14511 + }, + { + "epoch": 4.45426642111725, + "grad_norm": 0.22978845238685608, + "learning_rate": 6.117796113628075e-05, + "loss": 1.8244, + "step": 14512 + }, + { + "epoch": 4.454573357888275, + "grad_norm": 0.26357781887054443, + "learning_rate": 6.117311632433735e-05, + "loss": 1.7425, + "step": 14513 + }, + { + "epoch": 4.4548802946593, + "grad_norm": 0.22127102315425873, + "learning_rate": 6.116827140197467e-05, + "loss": 1.7679, + "step": 14514 + }, + { + "epoch": 4.455187231430325, + "grad_norm": 0.2876584231853485, + "learning_rate": 6.116342636924058e-05, + "loss": 1.8104, + "step": 14515 + }, + { + "epoch": 4.4554941682013505, + "grad_norm": 0.28290677070617676, + "learning_rate": 6.115858122618297e-05, + "loss": 1.7485, + "step": 14516 + }, + { + "epoch": 4.455801104972376, + "grad_norm": 0.21914640069007874, + "learning_rate": 6.115373597284974e-05, + "loss": 1.7736, + "step": 14517 + }, + { + "epoch": 4.456108041743401, + "grad_norm": 0.2603909969329834, + "learning_rate": 6.114889060928873e-05, + "loss": 1.7446, + "step": 14518 + }, + { + "epoch": 4.456414978514426, + "grad_norm": 0.2157236635684967, + "learning_rate": 6.114404513554784e-05, + "loss": 1.7594, + "step": 14519 + }, + { + "epoch": 4.456721915285451, + "grad_norm": 0.27622368931770325, + "learning_rate": 6.113919955167499e-05, + "loss": 1.8154, + "step": 14520 + }, + { + "epoch": 4.457028852056476, + "grad_norm": 0.27298516035079956, + "learning_rate": 6.113435385771803e-05, + "loss": 1.7458, + "step": 14521 + }, + { + "epoch": 4.457335788827502, + "grad_norm": 0.22220586240291595, + 
"learning_rate": 6.112950805372485e-05, + "loss": 1.7102, + "step": 14522 + }, + { + "epoch": 4.457642725598527, + "grad_norm": 0.19480876624584198, + "learning_rate": 6.112466213974336e-05, + "loss": 1.7696, + "step": 14523 + }, + { + "epoch": 4.457949662369552, + "grad_norm": 0.24261653423309326, + "learning_rate": 6.111981611582144e-05, + "loss": 1.8193, + "step": 14524 + }, + { + "epoch": 4.458256599140577, + "grad_norm": 0.2502967417240143, + "learning_rate": 6.111496998200697e-05, + "loss": 1.7701, + "step": 14525 + }, + { + "epoch": 4.458563535911602, + "grad_norm": 0.25764599442481995, + "learning_rate": 6.111012373834786e-05, + "loss": 1.8055, + "step": 14526 + }, + { + "epoch": 4.458870472682627, + "grad_norm": 0.24085427820682526, + "learning_rate": 6.110527738489198e-05, + "loss": 1.7592, + "step": 14527 + }, + { + "epoch": 4.459177409453653, + "grad_norm": 0.2469809502363205, + "learning_rate": 6.110043092168727e-05, + "loss": 1.6977, + "step": 14528 + }, + { + "epoch": 4.459484346224678, + "grad_norm": 0.21888838708400726, + "learning_rate": 6.109558434878159e-05, + "loss": 1.777, + "step": 14529 + }, + { + "epoch": 4.4597912829957025, + "grad_norm": 0.2094014585018158, + "learning_rate": 6.109073766622281e-05, + "loss": 1.7041, + "step": 14530 + }, + { + "epoch": 4.460098219766728, + "grad_norm": 0.23801055550575256, + "learning_rate": 6.108589087405888e-05, + "loss": 1.8392, + "step": 14531 + }, + { + "epoch": 4.460405156537753, + "grad_norm": 0.2164965718984604, + "learning_rate": 6.108104397233769e-05, + "loss": 1.7643, + "step": 14532 + }, + { + "epoch": 4.4607120933087785, + "grad_norm": 0.21322336792945862, + "learning_rate": 6.107619696110712e-05, + "loss": 1.7063, + "step": 14533 + }, + { + "epoch": 4.461019030079804, + "grad_norm": 0.29019200801849365, + "learning_rate": 6.107134984041507e-05, + "loss": 1.8254, + "step": 14534 + }, + { + "epoch": 4.461325966850829, + "grad_norm": 0.2765025496482849, + "learning_rate": 6.106650261030947e-05, 
+ "loss": 1.7609, + "step": 14535 + }, + { + "epoch": 4.461632903621854, + "grad_norm": 0.20879749953746796, + "learning_rate": 6.106165527083818e-05, + "loss": 1.7387, + "step": 14536 + }, + { + "epoch": 4.461939840392879, + "grad_norm": 0.22295843064785004, + "learning_rate": 6.105680782204913e-05, + "loss": 1.7691, + "step": 14537 + }, + { + "epoch": 4.462246777163904, + "grad_norm": 0.23502351343631744, + "learning_rate": 6.105196026399025e-05, + "loss": 1.7335, + "step": 14538 + }, + { + "epoch": 4.46255371393493, + "grad_norm": 0.22143007814884186, + "learning_rate": 6.104711259670941e-05, + "loss": 1.7338, + "step": 14539 + }, + { + "epoch": 4.462860650705955, + "grad_norm": 0.22361041605472565, + "learning_rate": 6.104226482025453e-05, + "loss": 1.7033, + "step": 14540 + }, + { + "epoch": 4.463167587476979, + "grad_norm": 0.27104905247688293, + "learning_rate": 6.10374169346735e-05, + "loss": 1.7926, + "step": 14541 + }, + { + "epoch": 4.463474524248005, + "grad_norm": 0.23564264178276062, + "learning_rate": 6.103256894001427e-05, + "loss": 1.7522, + "step": 14542 + }, + { + "epoch": 4.46378146101903, + "grad_norm": 0.2585970163345337, + "learning_rate": 6.102772083632471e-05, + "loss": 1.7755, + "step": 14543 + }, + { + "epoch": 4.464088397790055, + "grad_norm": 0.358634889125824, + "learning_rate": 6.102287262365276e-05, + "loss": 1.8092, + "step": 14544 + }, + { + "epoch": 4.464395334561081, + "grad_norm": 0.2862946689128876, + "learning_rate": 6.1018024302046314e-05, + "loss": 1.7051, + "step": 14545 + }, + { + "epoch": 4.464702271332105, + "grad_norm": 0.21907158195972443, + "learning_rate": 6.101317587155331e-05, + "loss": 1.7882, + "step": 14546 + }, + { + "epoch": 4.4650092081031305, + "grad_norm": 0.24268488585948944, + "learning_rate": 6.100832733222164e-05, + "loss": 1.7756, + "step": 14547 + }, + { + "epoch": 4.465316144874156, + "grad_norm": 0.2350744605064392, + "learning_rate": 6.1003478684099214e-05, + "loss": 1.7483, + "step": 14548 + }, + 
{ + "epoch": 4.465623081645181, + "grad_norm": 0.22902250289916992, + "learning_rate": 6.099862992723397e-05, + "loss": 1.7687, + "step": 14549 + }, + { + "epoch": 4.4659300184162065, + "grad_norm": 0.23590944707393646, + "learning_rate": 6.099378106167382e-05, + "loss": 1.8481, + "step": 14550 + }, + { + "epoch": 4.466236955187231, + "grad_norm": 0.23644296824932098, + "learning_rate": 6.098893208746668e-05, + "loss": 1.7422, + "step": 14551 + }, + { + "epoch": 4.466543891958256, + "grad_norm": 0.23782360553741455, + "learning_rate": 6.0984083004660475e-05, + "loss": 1.7852, + "step": 14552 + }, + { + "epoch": 4.466850828729282, + "grad_norm": 0.2546575665473938, + "learning_rate": 6.097923381330313e-05, + "loss": 1.8483, + "step": 14553 + }, + { + "epoch": 4.467157765500307, + "grad_norm": 0.2555409371852875, + "learning_rate": 6.097438451344254e-05, + "loss": 1.7887, + "step": 14554 + }, + { + "epoch": 4.467464702271332, + "grad_norm": 0.28074198961257935, + "learning_rate": 6.0969535105126664e-05, + "loss": 1.7521, + "step": 14555 + }, + { + "epoch": 4.467771639042358, + "grad_norm": 0.22622554004192352, + "learning_rate": 6.096468558840341e-05, + "loss": 1.8088, + "step": 14556 + }, + { + "epoch": 4.468078575813382, + "grad_norm": 0.302749902009964, + "learning_rate": 6.095983596332071e-05, + "loss": 1.8192, + "step": 14557 + }, + { + "epoch": 4.468385512584407, + "grad_norm": 0.27925750613212585, + "learning_rate": 6.0954986229926494e-05, + "loss": 1.8453, + "step": 14558 + }, + { + "epoch": 4.468692449355433, + "grad_norm": 0.2246330976486206, + "learning_rate": 6.095013638826868e-05, + "loss": 1.744, + "step": 14559 + }, + { + "epoch": 4.468999386126458, + "grad_norm": 0.26677101850509644, + "learning_rate": 6.094528643839518e-05, + "loss": 1.708, + "step": 14560 + }, + { + "epoch": 4.469306322897483, + "grad_norm": 0.23684042692184448, + "learning_rate": 6.094043638035396e-05, + "loss": 1.713, + "step": 14561 + }, + { + "epoch": 4.469613259668508, + 
"grad_norm": 0.2470075935125351, + "learning_rate": 6.093558621419294e-05, + "loss": 1.8096, + "step": 14562 + }, + { + "epoch": 4.469920196439533, + "grad_norm": 0.2775517702102661, + "learning_rate": 6.093073593996005e-05, + "loss": 1.697, + "step": 14563 + }, + { + "epoch": 4.4702271332105585, + "grad_norm": 0.21053175628185272, + "learning_rate": 6.092588555770322e-05, + "loss": 1.6894, + "step": 14564 + }, + { + "epoch": 4.470534069981584, + "grad_norm": 0.2555869221687317, + "learning_rate": 6.0921035067470366e-05, + "loss": 1.7051, + "step": 14565 + }, + { + "epoch": 4.470841006752609, + "grad_norm": 0.34468984603881836, + "learning_rate": 6.0916184469309454e-05, + "loss": 1.7317, + "step": 14566 + }, + { + "epoch": 4.4711479435236345, + "grad_norm": 0.2517752945423126, + "learning_rate": 6.0911333763268407e-05, + "loss": 1.7524, + "step": 14567 + }, + { + "epoch": 4.471454880294659, + "grad_norm": 0.2749727666378021, + "learning_rate": 6.090648294939517e-05, + "loss": 1.7045, + "step": 14568 + }, + { + "epoch": 4.471761817065684, + "grad_norm": 0.36250773072242737, + "learning_rate": 6.0901632027737673e-05, + "loss": 1.7196, + "step": 14569 + }, + { + "epoch": 4.47206875383671, + "grad_norm": 0.2317698448896408, + "learning_rate": 6.089678099834386e-05, + "loss": 1.7318, + "step": 14570 + }, + { + "epoch": 4.472375690607735, + "grad_norm": 0.2863345444202423, + "learning_rate": 6.089192986126166e-05, + "loss": 1.7798, + "step": 14571 + }, + { + "epoch": 4.47268262737876, + "grad_norm": 0.3493366241455078, + "learning_rate": 6.088707861653904e-05, + "loss": 1.7749, + "step": 14572 + }, + { + "epoch": 4.472989564149785, + "grad_norm": 0.25718605518341064, + "learning_rate": 6.0882227264223924e-05, + "loss": 1.7683, + "step": 14573 + }, + { + "epoch": 4.47329650092081, + "grad_norm": 0.2320062816143036, + "learning_rate": 6.087737580436426e-05, + "loss": 1.8296, + "step": 14574 + }, + { + "epoch": 4.473603437691835, + "grad_norm": 0.29071560502052307, + 
"learning_rate": 6.087252423700799e-05, + "loss": 1.7428, + "step": 14575 + }, + { + "epoch": 4.473910374462861, + "grad_norm": 0.24233707785606384, + "learning_rate": 6.086767256220306e-05, + "loss": 1.7332, + "step": 14576 + }, + { + "epoch": 4.474217311233886, + "grad_norm": 0.228043332695961, + "learning_rate": 6.086282077999742e-05, + "loss": 1.7697, + "step": 14577 + }, + { + "epoch": 4.474524248004911, + "grad_norm": 0.29154402017593384, + "learning_rate": 6.085796889043902e-05, + "loss": 1.8043, + "step": 14578 + }, + { + "epoch": 4.474831184775936, + "grad_norm": 0.30543211102485657, + "learning_rate": 6.0853116893575814e-05, + "loss": 1.7665, + "step": 14579 + }, + { + "epoch": 4.475138121546961, + "grad_norm": 0.22792959213256836, + "learning_rate": 6.0848264789455754e-05, + "loss": 1.729, + "step": 14580 + }, + { + "epoch": 4.475445058317987, + "grad_norm": 0.2615707218647003, + "learning_rate": 6.084341257812677e-05, + "loss": 1.7438, + "step": 14581 + }, + { + "epoch": 4.475751995089012, + "grad_norm": 0.23342981934547424, + "learning_rate": 6.083856025963681e-05, + "loss": 1.7158, + "step": 14582 + }, + { + "epoch": 4.476058931860037, + "grad_norm": 0.22279240190982819, + "learning_rate": 6.083370783403387e-05, + "loss": 1.7413, + "step": 14583 + }, + { + "epoch": 4.476365868631062, + "grad_norm": 0.28867462277412415, + "learning_rate": 6.082885530136587e-05, + "loss": 1.7932, + "step": 14584 + }, + { + "epoch": 4.476672805402087, + "grad_norm": 0.2947152256965637, + "learning_rate": 6.082400266168078e-05, + "loss": 1.8986, + "step": 14585 + }, + { + "epoch": 4.476979742173112, + "grad_norm": 0.2948935627937317, + "learning_rate": 6.0819149915026555e-05, + "loss": 1.9134, + "step": 14586 + }, + { + "epoch": 4.477286678944138, + "grad_norm": 0.4436163902282715, + "learning_rate": 6.081429706145114e-05, + "loss": 1.7616, + "step": 14587 + }, + { + "epoch": 4.477593615715163, + "grad_norm": 0.4879693388938904, + "learning_rate": 6.080944410100249e-05, + 
"loss": 1.8155, + "step": 14588 + }, + { + "epoch": 4.4779005524861875, + "grad_norm": 0.29742667078971863, + "learning_rate": 6.08045910337286e-05, + "loss": 1.7428, + "step": 14589 + }, + { + "epoch": 4.478207489257213, + "grad_norm": 0.2994751036167145, + "learning_rate": 6.0799737859677395e-05, + "loss": 1.7764, + "step": 14590 + }, + { + "epoch": 4.478514426028238, + "grad_norm": 0.46379905939102173, + "learning_rate": 6.079488457889686e-05, + "loss": 1.7289, + "step": 14591 + }, + { + "epoch": 4.4788213627992635, + "grad_norm": 0.3511717617511749, + "learning_rate": 6.0790031191434946e-05, + "loss": 1.7658, + "step": 14592 + }, + { + "epoch": 4.479128299570289, + "grad_norm": 0.22678083181381226, + "learning_rate": 6.0785177697339626e-05, + "loss": 1.7973, + "step": 14593 + }, + { + "epoch": 4.479435236341313, + "grad_norm": 0.31201767921447754, + "learning_rate": 6.0780324096658837e-05, + "loss": 1.7542, + "step": 14594 + }, + { + "epoch": 4.479742173112339, + "grad_norm": 0.23759113252162933, + "learning_rate": 6.077547038944058e-05, + "loss": 1.7191, + "step": 14595 + }, + { + "epoch": 4.480049109883364, + "grad_norm": 0.25801756978034973, + "learning_rate": 6.077061657573282e-05, + "loss": 1.8229, + "step": 14596 + }, + { + "epoch": 4.480356046654389, + "grad_norm": 0.3435722887516022, + "learning_rate": 6.0765762655583514e-05, + "loss": 1.7633, + "step": 14597 + }, + { + "epoch": 4.480662983425415, + "grad_norm": 0.2710443437099457, + "learning_rate": 6.076090862904063e-05, + "loss": 1.8126, + "step": 14598 + }, + { + "epoch": 4.48096992019644, + "grad_norm": 0.25750285387039185, + "learning_rate": 6.075605449615212e-05, + "loss": 1.7382, + "step": 14599 + }, + { + "epoch": 4.481276856967464, + "grad_norm": 0.3638051152229309, + "learning_rate": 6.075120025696598e-05, + "loss": 1.8191, + "step": 14600 + }, + { + "epoch": 4.48158379373849, + "grad_norm": 0.24185293912887573, + "learning_rate": 6.074634591153019e-05, + "loss": 1.7637, + "step": 14601 + }, 
+ { + "epoch": 4.481890730509515, + "grad_norm": 0.317283570766449, + "learning_rate": 6.0741491459892707e-05, + "loss": 1.7805, + "step": 14602 + }, + { + "epoch": 4.48219766728054, + "grad_norm": 0.33884385228157043, + "learning_rate": 6.073663690210151e-05, + "loss": 1.7719, + "step": 14603 + }, + { + "epoch": 4.482504604051566, + "grad_norm": 0.2554258704185486, + "learning_rate": 6.073178223820457e-05, + "loss": 1.836, + "step": 14604 + }, + { + "epoch": 4.48281154082259, + "grad_norm": 0.3363535702228546, + "learning_rate": 6.072692746824987e-05, + "loss": 1.8249, + "step": 14605 + }, + { + "epoch": 4.4831184775936155, + "grad_norm": 0.36090195178985596, + "learning_rate": 6.072207259228537e-05, + "loss": 1.733, + "step": 14606 + }, + { + "epoch": 4.483425414364641, + "grad_norm": 0.21928483247756958, + "learning_rate": 6.071721761035909e-05, + "loss": 1.7413, + "step": 14607 + }, + { + "epoch": 4.483732351135666, + "grad_norm": 0.4256608486175537, + "learning_rate": 6.071236252251897e-05, + "loss": 1.7585, + "step": 14608 + }, + { + "epoch": 4.4840392879066915, + "grad_norm": 0.41980308294296265, + "learning_rate": 6.0707507328813007e-05, + "loss": 1.7584, + "step": 14609 + }, + { + "epoch": 4.484346224677717, + "grad_norm": 0.200295090675354, + "learning_rate": 6.0702652029289186e-05, + "loss": 1.7492, + "step": 14610 + }, + { + "epoch": 4.484653161448741, + "grad_norm": 0.41847771406173706, + "learning_rate": 6.069779662399549e-05, + "loss": 1.8101, + "step": 14611 + }, + { + "epoch": 4.484960098219767, + "grad_norm": 0.4846353530883789, + "learning_rate": 6.069294111297987e-05, + "loss": 1.8227, + "step": 14612 + }, + { + "epoch": 4.485267034990792, + "grad_norm": 0.23216098546981812, + "learning_rate": 6.068808549629036e-05, + "loss": 1.6811, + "step": 14613 + }, + { + "epoch": 4.485573971761817, + "grad_norm": 0.34903186559677124, + "learning_rate": 6.0683229773974934e-05, + "loss": 1.6858, + "step": 14614 + }, + { + "epoch": 4.485880908532843, + 
"grad_norm": 0.4349122941493988, + "learning_rate": 6.0678373946081556e-05, + "loss": 1.7704, + "step": 14615 + }, + { + "epoch": 4.486187845303867, + "grad_norm": 0.25738775730133057, + "learning_rate": 6.067351801265824e-05, + "loss": 1.7487, + "step": 14616 + }, + { + "epoch": 4.486494782074892, + "grad_norm": 0.3052736818790436, + "learning_rate": 6.0668661973752936e-05, + "loss": 1.7528, + "step": 14617 + }, + { + "epoch": 4.486801718845918, + "grad_norm": 0.3400498628616333, + "learning_rate": 6.066380582941368e-05, + "loss": 1.7414, + "step": 14618 + }, + { + "epoch": 4.487108655616943, + "grad_norm": 0.28251948952674866, + "learning_rate": 6.065894957968845e-05, + "loss": 1.8078, + "step": 14619 + }, + { + "epoch": 4.487415592387968, + "grad_norm": 0.26907965540885925, + "learning_rate": 6.0654093224625216e-05, + "loss": 1.8143, + "step": 14620 + }, + { + "epoch": 4.487722529158993, + "grad_norm": 0.2821955978870392, + "learning_rate": 6.064923676427201e-05, + "loss": 1.7163, + "step": 14621 + }, + { + "epoch": 4.488029465930018, + "grad_norm": 0.2223028987646103, + "learning_rate": 6.0644380198676786e-05, + "loss": 1.704, + "step": 14622 + }, + { + "epoch": 4.4883364027010435, + "grad_norm": 0.25243067741394043, + "learning_rate": 6.063952352788755e-05, + "loss": 1.7236, + "step": 14623 + }, + { + "epoch": 4.488643339472069, + "grad_norm": 0.30026015639305115, + "learning_rate": 6.063466675195233e-05, + "loss": 1.7575, + "step": 14624 + }, + { + "epoch": 4.488950276243094, + "grad_norm": 0.2055491805076599, + "learning_rate": 6.0629809870919085e-05, + "loss": 1.7294, + "step": 14625 + }, + { + "epoch": 4.4892572130141195, + "grad_norm": 0.2507593035697937, + "learning_rate": 6.0624952884835836e-05, + "loss": 1.762, + "step": 14626 + }, + { + "epoch": 4.489564149785144, + "grad_norm": 0.21385909616947174, + "learning_rate": 6.0620095793750576e-05, + "loss": 1.7396, + "step": 14627 + }, + { + "epoch": 4.489871086556169, + "grad_norm": 0.21926651895046234, + 
"learning_rate": 6.06152385977113e-05, + "loss": 1.7863, + "step": 14628 + }, + { + "epoch": 4.490178023327195, + "grad_norm": 0.21950845420360565, + "learning_rate": 6.0610381296766016e-05, + "loss": 1.7576, + "step": 14629 + }, + { + "epoch": 4.49048496009822, + "grad_norm": 0.2030971795320511, + "learning_rate": 6.0605523890962736e-05, + "loss": 1.7069, + "step": 14630 + }, + { + "epoch": 4.490791896869245, + "grad_norm": 0.23991432785987854, + "learning_rate": 6.0600666380349436e-05, + "loss": 1.7598, + "step": 14631 + }, + { + "epoch": 4.49109883364027, + "grad_norm": 0.23766861855983734, + "learning_rate": 6.059580876497415e-05, + "loss": 1.7687, + "step": 14632 + }, + { + "epoch": 4.491405770411295, + "grad_norm": 0.2361454963684082, + "learning_rate": 6.059095104488487e-05, + "loss": 1.7883, + "step": 14633 + }, + { + "epoch": 4.49171270718232, + "grad_norm": 0.3128328323364258, + "learning_rate": 6.058609322012958e-05, + "loss": 1.8087, + "step": 14634 + }, + { + "epoch": 4.492019643953346, + "grad_norm": 0.2958957850933075, + "learning_rate": 6.0581235290756335e-05, + "loss": 1.782, + "step": 14635 + }, + { + "epoch": 4.492326580724371, + "grad_norm": 0.2197243571281433, + "learning_rate": 6.057637725681312e-05, + "loss": 1.7408, + "step": 14636 + }, + { + "epoch": 4.4926335174953955, + "grad_norm": 0.22227831184864044, + "learning_rate": 6.0571519118347944e-05, + "loss": 1.734, + "step": 14637 + }, + { + "epoch": 4.492940454266421, + "grad_norm": 0.2784527540206909, + "learning_rate": 6.056666087540882e-05, + "loss": 1.8017, + "step": 14638 + }, + { + "epoch": 4.493247391037446, + "grad_norm": 0.21929821372032166, + "learning_rate": 6.056180252804377e-05, + "loss": 1.7271, + "step": 14639 + }, + { + "epoch": 4.4935543278084715, + "grad_norm": 0.2156134843826294, + "learning_rate": 6.055694407630077e-05, + "loss": 1.8082, + "step": 14640 + }, + { + "epoch": 4.493861264579497, + "grad_norm": 0.22672387957572937, + "learning_rate": 6.0552085520227875e-05, + 
"loss": 1.7506, + "step": 14641 + }, + { + "epoch": 4.494168201350522, + "grad_norm": 0.228785440325737, + "learning_rate": 6.0547226859873086e-05, + "loss": 1.7023, + "step": 14642 + }, + { + "epoch": 4.494475138121547, + "grad_norm": 0.19483685493469238, + "learning_rate": 6.054236809528443e-05, + "loss": 1.6879, + "step": 14643 + }, + { + "epoch": 4.494782074892572, + "grad_norm": 0.24911309778690338, + "learning_rate": 6.0537509226509904e-05, + "loss": 1.7856, + "step": 14644 + }, + { + "epoch": 4.495089011663597, + "grad_norm": 0.24811938405036926, + "learning_rate": 6.053265025359753e-05, + "loss": 1.7581, + "step": 14645 + }, + { + "epoch": 4.495395948434623, + "grad_norm": 0.2487260401248932, + "learning_rate": 6.052779117659534e-05, + "loss": 1.7536, + "step": 14646 + }, + { + "epoch": 4.495702885205648, + "grad_norm": 0.2594854235649109, + "learning_rate": 6.052293199555136e-05, + "loss": 1.7822, + "step": 14647 + }, + { + "epoch": 4.496009821976672, + "grad_norm": 0.22837325930595398, + "learning_rate": 6.051807271051359e-05, + "loss": 1.7542, + "step": 14648 + }, + { + "epoch": 4.496316758747698, + "grad_norm": 0.23106649518013, + "learning_rate": 6.051321332153005e-05, + "loss": 1.7758, + "step": 14649 + }, + { + "epoch": 4.496623695518723, + "grad_norm": 0.29424673318862915, + "learning_rate": 6.050835382864878e-05, + "loss": 1.8335, + "step": 14650 + }, + { + "epoch": 4.496930632289748, + "grad_norm": 0.28297343850135803, + "learning_rate": 6.050349423191779e-05, + "loss": 1.7711, + "step": 14651 + }, + { + "epoch": 4.497237569060774, + "grad_norm": 0.2001795768737793, + "learning_rate": 6.049863453138511e-05, + "loss": 1.7008, + "step": 14652 + }, + { + "epoch": 4.497544505831799, + "grad_norm": 0.35177022218704224, + "learning_rate": 6.04937747270988e-05, + "loss": 1.7763, + "step": 14653 + }, + { + "epoch": 4.4978514426028235, + "grad_norm": 0.28870898485183716, + "learning_rate": 6.0488914819106835e-05, + "loss": 1.7373, + "step": 14654 + }, + { 
+ "epoch": 4.498158379373849, + "grad_norm": 0.23962664604187012, + "learning_rate": 6.048405480745727e-05, + "loss": 1.7278, + "step": 14655 + }, + { + "epoch": 4.498465316144874, + "grad_norm": 0.324505478143692, + "learning_rate": 6.047919469219813e-05, + "loss": 1.7674, + "step": 14656 + }, + { + "epoch": 4.4987722529158995, + "grad_norm": 0.38313817977905273, + "learning_rate": 6.047433447337744e-05, + "loss": 1.789, + "step": 14657 + }, + { + "epoch": 4.499079189686925, + "grad_norm": 0.2101358324289322, + "learning_rate": 6.046947415104324e-05, + "loss": 1.7331, + "step": 14658 + }, + { + "epoch": 4.499386126457949, + "grad_norm": 0.3388524353504181, + "learning_rate": 6.046461372524357e-05, + "loss": 1.8467, + "step": 14659 + }, + { + "epoch": 4.499693063228975, + "grad_norm": 0.3360123634338379, + "learning_rate": 6.045975319602645e-05, + "loss": 1.8427, + "step": 14660 + }, + { + "epoch": 4.5, + "grad_norm": 0.27596545219421387, + "learning_rate": 6.0454892563439914e-05, + "loss": 1.7768, + "step": 14661 + }, + { + "epoch": 4.500306936771025, + "grad_norm": 0.2580861747264862, + "learning_rate": 6.0450031827532e-05, + "loss": 1.763, + "step": 14662 + }, + { + "epoch": 4.500613873542051, + "grad_norm": 0.3521091938018799, + "learning_rate": 6.044517098835074e-05, + "loss": 1.7118, + "step": 14663 + }, + { + "epoch": 4.500920810313076, + "grad_norm": 0.29412439465522766, + "learning_rate": 6.0440310045944204e-05, + "loss": 1.7252, + "step": 14664 + }, + { + "epoch": 4.5012277470841005, + "grad_norm": 0.23845252394676208, + "learning_rate": 6.043544900036039e-05, + "loss": 1.7622, + "step": 14665 + }, + { + "epoch": 4.501534683855126, + "grad_norm": 0.22957031428813934, + "learning_rate": 6.043058785164736e-05, + "loss": 1.7527, + "step": 14666 + }, + { + "epoch": 4.501841620626151, + "grad_norm": 0.2564462721347809, + "learning_rate": 6.042572659985314e-05, + "loss": 1.801, + "step": 14667 + }, + { + "epoch": 4.5021485573971765, + "grad_norm": 
0.22588051855564117, + "learning_rate": 6.042086524502576e-05, + "loss": 1.7387, + "step": 14668 + }, + { + "epoch": 4.502455494168201, + "grad_norm": 0.2609740197658539, + "learning_rate": 6.0416003787213306e-05, + "loss": 1.7615, + "step": 14669 + }, + { + "epoch": 4.502762430939226, + "grad_norm": 0.2535521984100342, + "learning_rate": 6.041114222646379e-05, + "loss": 1.7398, + "step": 14670 + }, + { + "epoch": 4.503069367710252, + "grad_norm": 0.2512127757072449, + "learning_rate": 6.040628056282527e-05, + "loss": 1.7679, + "step": 14671 + }, + { + "epoch": 4.503376304481277, + "grad_norm": 0.2438639998435974, + "learning_rate": 6.0401418796345774e-05, + "loss": 1.7, + "step": 14672 + }, + { + "epoch": 4.503683241252302, + "grad_norm": 0.23428042232990265, + "learning_rate": 6.0396556927073376e-05, + "loss": 1.7748, + "step": 14673 + }, + { + "epoch": 4.503990178023328, + "grad_norm": 0.22894345223903656, + "learning_rate": 6.03916949550561e-05, + "loss": 1.7881, + "step": 14674 + }, + { + "epoch": 4.504297114794352, + "grad_norm": 0.24813716113567352, + "learning_rate": 6.0386832880342006e-05, + "loss": 1.7676, + "step": 14675 + }, + { + "epoch": 4.504604051565377, + "grad_norm": 0.23448842763900757, + "learning_rate": 6.038197070297914e-05, + "loss": 1.7828, + "step": 14676 + }, + { + "epoch": 4.504910988336403, + "grad_norm": 0.25302332639694214, + "learning_rate": 6.037710842301556e-05, + "loss": 1.8061, + "step": 14677 + }, + { + "epoch": 4.505217925107428, + "grad_norm": 0.2411813735961914, + "learning_rate": 6.0372246040499305e-05, + "loss": 1.6901, + "step": 14678 + }, + { + "epoch": 4.505524861878453, + "grad_norm": 0.3154819905757904, + "learning_rate": 6.036738355547844e-05, + "loss": 1.7472, + "step": 14679 + }, + { + "epoch": 4.505831798649478, + "grad_norm": 0.2935639023780823, + "learning_rate": 6.0362520968001014e-05, + "loss": 1.7508, + "step": 14680 + }, + { + "epoch": 4.506138735420503, + "grad_norm": 0.27064070105552673, + "learning_rate": 
6.035765827811508e-05, + "loss": 1.8133, + "step": 14681 + }, + { + "epoch": 4.5064456721915285, + "grad_norm": 0.23748525977134705, + "learning_rate": 6.03527954858687e-05, + "loss": 1.7742, + "step": 14682 + }, + { + "epoch": 4.506752608962554, + "grad_norm": 0.216410830616951, + "learning_rate": 6.034793259130992e-05, + "loss": 1.7448, + "step": 14683 + }, + { + "epoch": 4.507059545733579, + "grad_norm": 0.23339977860450745, + "learning_rate": 6.034306959448681e-05, + "loss": 1.7437, + "step": 14684 + }, + { + "epoch": 4.5073664825046045, + "grad_norm": 0.23951120674610138, + "learning_rate": 6.0338206495447414e-05, + "loss": 1.7535, + "step": 14685 + }, + { + "epoch": 4.507673419275629, + "grad_norm": 0.22137518227100372, + "learning_rate": 6.0333343294239816e-05, + "loss": 1.7537, + "step": 14686 + }, + { + "epoch": 4.507980356046654, + "grad_norm": 0.2550075054168701, + "learning_rate": 6.032847999091206e-05, + "loss": 1.8069, + "step": 14687 + }, + { + "epoch": 4.50828729281768, + "grad_norm": 0.2166420966386795, + "learning_rate": 6.032361658551221e-05, + "loss": 1.7746, + "step": 14688 + }, + { + "epoch": 4.508594229588705, + "grad_norm": 0.21926096081733704, + "learning_rate": 6.031875307808833e-05, + "loss": 1.7848, + "step": 14689 + }, + { + "epoch": 4.50890116635973, + "grad_norm": 0.27769652009010315, + "learning_rate": 6.031388946868848e-05, + "loss": 1.7563, + "step": 14690 + }, + { + "epoch": 4.509208103130755, + "grad_norm": 0.23417410254478455, + "learning_rate": 6.030902575736074e-05, + "loss": 1.7475, + "step": 14691 + }, + { + "epoch": 4.50951503990178, + "grad_norm": 0.25454118847846985, + "learning_rate": 6.030416194415314e-05, + "loss": 1.7416, + "step": 14692 + }, + { + "epoch": 4.509821976672805, + "grad_norm": 0.3118220567703247, + "learning_rate": 6.029929802911379e-05, + "loss": 1.8001, + "step": 14693 + }, + { + "epoch": 4.510128913443831, + "grad_norm": 0.2338017225265503, + "learning_rate": 6.029443401229075e-05, + "loss": 1.7243, + 
"step": 14694 + }, + { + "epoch": 4.510435850214856, + "grad_norm": 0.2490454763174057, + "learning_rate": 6.028956989373207e-05, + "loss": 1.7866, + "step": 14695 + }, + { + "epoch": 4.510742786985881, + "grad_norm": 0.2579275369644165, + "learning_rate": 6.028470567348582e-05, + "loss": 1.7594, + "step": 14696 + }, + { + "epoch": 4.511049723756906, + "grad_norm": 0.23982174694538116, + "learning_rate": 6.0279841351600094e-05, + "loss": 1.7444, + "step": 14697 + }, + { + "epoch": 4.511356660527931, + "grad_norm": 0.2160159945487976, + "learning_rate": 6.027497692812295e-05, + "loss": 1.7002, + "step": 14698 + }, + { + "epoch": 4.5116635972989565, + "grad_norm": 0.24604511260986328, + "learning_rate": 6.0270112403102455e-05, + "loss": 1.7654, + "step": 14699 + }, + { + "epoch": 4.511970534069982, + "grad_norm": 0.21978263556957245, + "learning_rate": 6.026524777658669e-05, + "loss": 1.7278, + "step": 14700 + }, + { + "epoch": 4.512277470841006, + "grad_norm": 0.2814212441444397, + "learning_rate": 6.026038304862373e-05, + "loss": 1.7743, + "step": 14701 + }, + { + "epoch": 4.512584407612032, + "grad_norm": 0.23798944056034088, + "learning_rate": 6.025551821926165e-05, + "loss": 1.7348, + "step": 14702 + }, + { + "epoch": 4.512891344383057, + "grad_norm": 0.22415988147258759, + "learning_rate": 6.025065328854853e-05, + "loss": 1.7973, + "step": 14703 + }, + { + "epoch": 4.513198281154082, + "grad_norm": 0.34614792466163635, + "learning_rate": 6.0245788256532445e-05, + "loss": 1.7263, + "step": 14704 + }, + { + "epoch": 4.513505217925108, + "grad_norm": 0.333918958902359, + "learning_rate": 6.0240923123261485e-05, + "loss": 1.7305, + "step": 14705 + }, + { + "epoch": 4.513812154696133, + "grad_norm": 0.22231793403625488, + "learning_rate": 6.02360578887837e-05, + "loss": 1.806, + "step": 14706 + }, + { + "epoch": 4.514119091467157, + "grad_norm": 0.23323194682598114, + "learning_rate": 6.023119255314721e-05, + "loss": 1.7076, + "step": 14707 + }, + { + "epoch": 
4.514426028238183, + "grad_norm": 0.26695477962493896, + "learning_rate": 6.022632711640007e-05, + "loss": 1.775, + "step": 14708 + }, + { + "epoch": 4.514732965009208, + "grad_norm": 0.21446476876735687, + "learning_rate": 6.0221461578590364e-05, + "loss": 1.7524, + "step": 14709 + }, + { + "epoch": 4.515039901780233, + "grad_norm": 0.2677358090877533, + "learning_rate": 6.0216595939766204e-05, + "loss": 1.7513, + "step": 14710 + }, + { + "epoch": 4.515346838551259, + "grad_norm": 0.28648239374160767, + "learning_rate": 6.021173019997565e-05, + "loss": 1.7249, + "step": 14711 + }, + { + "epoch": 4.515653775322283, + "grad_norm": 0.2178548276424408, + "learning_rate": 6.020686435926678e-05, + "loss": 1.7502, + "step": 14712 + }, + { + "epoch": 4.5159607120933085, + "grad_norm": 0.3391740024089813, + "learning_rate": 6.02019984176877e-05, + "loss": 1.6828, + "step": 14713 + }, + { + "epoch": 4.516267648864334, + "grad_norm": 0.25222229957580566, + "learning_rate": 6.01971323752865e-05, + "loss": 1.6982, + "step": 14714 + }, + { + "epoch": 4.516574585635359, + "grad_norm": 0.28776636719703674, + "learning_rate": 6.019226623211125e-05, + "loss": 1.8595, + "step": 14715 + }, + { + "epoch": 4.5168815224063845, + "grad_norm": 0.3240084648132324, + "learning_rate": 6.018739998821006e-05, + "loss": 1.7461, + "step": 14716 + }, + { + "epoch": 4.51718845917741, + "grad_norm": 0.26735052466392517, + "learning_rate": 6.0182533643631015e-05, + "loss": 1.7955, + "step": 14717 + }, + { + "epoch": 4.517495395948434, + "grad_norm": 0.24573692679405212, + "learning_rate": 6.017766719842219e-05, + "loss": 1.7441, + "step": 14718 + }, + { + "epoch": 4.51780233271946, + "grad_norm": 0.27401313185691833, + "learning_rate": 6.01728006526317e-05, + "loss": 1.7399, + "step": 14719 + }, + { + "epoch": 4.518109269490485, + "grad_norm": 0.23578806221485138, + "learning_rate": 6.016793400630763e-05, + "loss": 1.7936, + "step": 14720 + }, + { + "epoch": 4.51841620626151, + "grad_norm": 
0.27763426303863525, + "learning_rate": 6.0163067259498074e-05, + "loss": 1.7263, + "step": 14721 + }, + { + "epoch": 4.518723143032536, + "grad_norm": 0.27102044224739075, + "learning_rate": 6.015820041225113e-05, + "loss": 1.7085, + "step": 14722 + }, + { + "epoch": 4.51903007980356, + "grad_norm": 0.2046152651309967, + "learning_rate": 6.01533334646149e-05, + "loss": 1.7602, + "step": 14723 + }, + { + "epoch": 4.519337016574585, + "grad_norm": 0.2645253837108612, + "learning_rate": 6.0148466416637484e-05, + "loss": 1.7729, + "step": 14724 + }, + { + "epoch": 4.519643953345611, + "grad_norm": 0.27467650175094604, + "learning_rate": 6.014359926836697e-05, + "loss": 1.7834, + "step": 14725 + }, + { + "epoch": 4.519950890116636, + "grad_norm": 0.30357635021209717, + "learning_rate": 6.013873201985145e-05, + "loss": 1.8685, + "step": 14726 + }, + { + "epoch": 4.520257826887661, + "grad_norm": 0.22923336923122406, + "learning_rate": 6.013386467113905e-05, + "loss": 1.7531, + "step": 14727 + }, + { + "epoch": 4.520564763658687, + "grad_norm": 0.2792156934738159, + "learning_rate": 6.012899722227786e-05, + "loss": 1.7927, + "step": 14728 + }, + { + "epoch": 4.520871700429711, + "grad_norm": 0.286161869764328, + "learning_rate": 6.012412967331598e-05, + "loss": 1.77, + "step": 14729 + }, + { + "epoch": 4.5211786372007365, + "grad_norm": 0.23964659869670868, + "learning_rate": 6.011926202430151e-05, + "loss": 1.7873, + "step": 14730 + }, + { + "epoch": 4.521485573971762, + "grad_norm": 0.2250162959098816, + "learning_rate": 6.011439427528258e-05, + "loss": 1.741, + "step": 14731 + }, + { + "epoch": 4.521792510742787, + "grad_norm": 0.2797175347805023, + "learning_rate": 6.010952642630726e-05, + "loss": 1.7482, + "step": 14732 + }, + { + "epoch": 4.5220994475138125, + "grad_norm": 0.22159560024738312, + "learning_rate": 6.010465847742368e-05, + "loss": 1.7591, + "step": 14733 + }, + { + "epoch": 4.522406384284837, + "grad_norm": 0.26638463139533997, + "learning_rate": 
6.009979042867995e-05, + "loss": 1.8564, + "step": 14734 + }, + { + "epoch": 4.522713321055862, + "grad_norm": 0.2972821891307831, + "learning_rate": 6.009492228012416e-05, + "loss": 1.7569, + "step": 14735 + }, + { + "epoch": 4.523020257826888, + "grad_norm": 0.28108885884284973, + "learning_rate": 6.0090054031804444e-05, + "loss": 1.7256, + "step": 14736 + }, + { + "epoch": 4.523327194597913, + "grad_norm": 0.22359851002693176, + "learning_rate": 6.008518568376888e-05, + "loss": 1.7342, + "step": 14737 + }, + { + "epoch": 4.523634131368938, + "grad_norm": 0.2620728015899658, + "learning_rate": 6.008031723606562e-05, + "loss": 1.7703, + "step": 14738 + }, + { + "epoch": 4.523941068139964, + "grad_norm": 0.2641485333442688, + "learning_rate": 6.007544868874274e-05, + "loss": 1.6944, + "step": 14739 + }, + { + "epoch": 4.524248004910988, + "grad_norm": 0.24957752227783203, + "learning_rate": 6.007058004184839e-05, + "loss": 1.7746, + "step": 14740 + }, + { + "epoch": 4.524554941682013, + "grad_norm": 0.29830998182296753, + "learning_rate": 6.006571129543065e-05, + "loss": 1.7718, + "step": 14741 + }, + { + "epoch": 4.524861878453039, + "grad_norm": 0.32740798592567444, + "learning_rate": 6.006084244953766e-05, + "loss": 1.8194, + "step": 14742 + }, + { + "epoch": 4.525168815224064, + "grad_norm": 0.2614956796169281, + "learning_rate": 6.005597350421751e-05, + "loss": 1.7078, + "step": 14743 + }, + { + "epoch": 4.525475751995089, + "grad_norm": 0.23940515518188477, + "learning_rate": 6.005110445951836e-05, + "loss": 1.7488, + "step": 14744 + }, + { + "epoch": 4.525782688766114, + "grad_norm": 0.25485914945602417, + "learning_rate": 6.004623531548829e-05, + "loss": 1.7705, + "step": 14745 + }, + { + "epoch": 4.526089625537139, + "grad_norm": 0.213532954454422, + "learning_rate": 6.0041366072175445e-05, + "loss": 1.7501, + "step": 14746 + }, + { + "epoch": 4.526396562308165, + "grad_norm": 0.2420104295015335, + "learning_rate": 6.003649672962792e-05, + "loss": 1.717, + 
"step": 14747 + }, + { + "epoch": 4.52670349907919, + "grad_norm": 0.26179102063179016, + "learning_rate": 6.0031627287893865e-05, + "loss": 1.7665, + "step": 14748 + }, + { + "epoch": 4.527010435850215, + "grad_norm": 0.22032082080841064, + "learning_rate": 6.002675774702139e-05, + "loss": 1.7555, + "step": 14749 + }, + { + "epoch": 4.52731737262124, + "grad_norm": 0.23915240168571472, + "learning_rate": 6.002188810705861e-05, + "loss": 1.8219, + "step": 14750 + }, + { + "epoch": 4.527624309392265, + "grad_norm": 0.2275150567293167, + "learning_rate": 6.0017018368053665e-05, + "loss": 1.7418, + "step": 14751 + }, + { + "epoch": 4.52793124616329, + "grad_norm": 0.2349669486284256, + "learning_rate": 6.001214853005467e-05, + "loss": 1.7814, + "step": 14752 + }, + { + "epoch": 4.528238182934316, + "grad_norm": 0.29985731840133667, + "learning_rate": 6.000727859310975e-05, + "loss": 1.7109, + "step": 14753 + }, + { + "epoch": 4.528545119705341, + "grad_norm": 0.27282044291496277, + "learning_rate": 6.0002408557267044e-05, + "loss": 1.7806, + "step": 14754 + }, + { + "epoch": 4.5288520564763655, + "grad_norm": 0.20906320214271545, + "learning_rate": 5.9997538422574675e-05, + "loss": 1.7221, + "step": 14755 + }, + { + "epoch": 4.529158993247391, + "grad_norm": 0.24553455412387848, + "learning_rate": 5.999266818908076e-05, + "loss": 1.793, + "step": 14756 + }, + { + "epoch": 4.529465930018416, + "grad_norm": 0.29730647802352905, + "learning_rate": 5.998779785683345e-05, + "loss": 1.7597, + "step": 14757 + }, + { + "epoch": 4.5297728667894415, + "grad_norm": 0.28297582268714905, + "learning_rate": 5.998292742588087e-05, + "loss": 1.7459, + "step": 14758 + }, + { + "epoch": 4.530079803560467, + "grad_norm": 0.21853844821453094, + "learning_rate": 5.997805689627115e-05, + "loss": 1.7234, + "step": 14759 + }, + { + "epoch": 4.530386740331492, + "grad_norm": 0.2997361421585083, + "learning_rate": 5.997318626805242e-05, + "loss": 1.7294, + "step": 14760 + }, + { + "epoch": 
4.530693677102517, + "grad_norm": 0.3298671543598175, + "learning_rate": 5.9968315541272804e-05, + "loss": 1.7837, + "step": 14761 + }, + { + "epoch": 4.531000613873542, + "grad_norm": 0.22812490165233612, + "learning_rate": 5.996344471598047e-05, + "loss": 1.7509, + "step": 14762 + }, + { + "epoch": 4.531307550644567, + "grad_norm": 0.3179669678211212, + "learning_rate": 5.995857379222354e-05, + "loss": 1.8354, + "step": 14763 + }, + { + "epoch": 4.531614487415593, + "grad_norm": 0.3072827458381653, + "learning_rate": 5.9953702770050135e-05, + "loss": 1.8051, + "step": 14764 + }, + { + "epoch": 4.531921424186618, + "grad_norm": 0.19386722147464752, + "learning_rate": 5.994883164950841e-05, + "loss": 1.7093, + "step": 14765 + }, + { + "epoch": 4.532228360957642, + "grad_norm": 0.2380950152873993, + "learning_rate": 5.99439604306465e-05, + "loss": 1.7547, + "step": 14766 + }, + { + "epoch": 4.532535297728668, + "grad_norm": 0.32604947686195374, + "learning_rate": 5.993908911351254e-05, + "loss": 1.8708, + "step": 14767 + }, + { + "epoch": 4.532842234499693, + "grad_norm": 0.2436954528093338, + "learning_rate": 5.993421769815468e-05, + "loss": 1.7272, + "step": 14768 + }, + { + "epoch": 4.533149171270718, + "grad_norm": 0.2470337301492691, + "learning_rate": 5.992934618462105e-05, + "loss": 1.7242, + "step": 14769 + }, + { + "epoch": 4.533456108041744, + "grad_norm": 0.25720325112342834, + "learning_rate": 5.992447457295981e-05, + "loss": 1.7219, + "step": 14770 + }, + { + "epoch": 4.533763044812769, + "grad_norm": 0.2518918812274933, + "learning_rate": 5.991960286321909e-05, + "loss": 1.7916, + "step": 14771 + }, + { + "epoch": 4.5340699815837935, + "grad_norm": 0.2561487853527069, + "learning_rate": 5.9914731055447037e-05, + "loss": 1.7695, + "step": 14772 + }, + { + "epoch": 4.534376918354819, + "grad_norm": 0.25361356139183044, + "learning_rate": 5.9909859149691804e-05, + "loss": 1.7464, + "step": 14773 + }, + { + "epoch": 4.534683855125844, + "grad_norm": 
0.22827522456645966, + "learning_rate": 5.9904987146001545e-05, + "loss": 1.7288, + "step": 14774 + }, + { + "epoch": 4.5349907918968695, + "grad_norm": 0.2417261302471161, + "learning_rate": 5.9900115044424385e-05, + "loss": 1.7311, + "step": 14775 + }, + { + "epoch": 4.535297728667894, + "grad_norm": 0.20756755769252777, + "learning_rate": 5.9895242845008495e-05, + "loss": 1.7799, + "step": 14776 + }, + { + "epoch": 4.535604665438919, + "grad_norm": 0.21999207139015198, + "learning_rate": 5.989037054780201e-05, + "loss": 1.7782, + "step": 14777 + }, + { + "epoch": 4.535911602209945, + "grad_norm": 0.22863444685935974, + "learning_rate": 5.988549815285308e-05, + "loss": 1.7869, + "step": 14778 + }, + { + "epoch": 4.53621853898097, + "grad_norm": 0.23033374547958374, + "learning_rate": 5.988062566020987e-05, + "loss": 1.7328, + "step": 14779 + }, + { + "epoch": 4.536525475751995, + "grad_norm": 0.21903404593467712, + "learning_rate": 5.987575306992053e-05, + "loss": 1.7689, + "step": 14780 + }, + { + "epoch": 4.536832412523021, + "grad_norm": 0.2433948963880539, + "learning_rate": 5.98708803820332e-05, + "loss": 1.7647, + "step": 14781 + }, + { + "epoch": 4.537139349294045, + "grad_norm": 0.2564239799976349, + "learning_rate": 5.986600759659606e-05, + "loss": 1.7958, + "step": 14782 + }, + { + "epoch": 4.53744628606507, + "grad_norm": 0.24009190499782562, + "learning_rate": 5.9861134713657244e-05, + "loss": 1.7511, + "step": 14783 + }, + { + "epoch": 4.537753222836096, + "grad_norm": 0.2578975558280945, + "learning_rate": 5.985626173326491e-05, + "loss": 1.8285, + "step": 14784 + }, + { + "epoch": 4.538060159607121, + "grad_norm": 0.24334335327148438, + "learning_rate": 5.9851388655467225e-05, + "loss": 1.7391, + "step": 14785 + }, + { + "epoch": 4.538367096378146, + "grad_norm": 0.26446983218193054, + "learning_rate": 5.9846515480312335e-05, + "loss": 1.8232, + "step": 14786 + }, + { + "epoch": 4.538674033149171, + "grad_norm": 0.3125670850276947, + 
"learning_rate": 5.9841642207848415e-05, + "loss": 1.7202, + "step": 14787 + }, + { + "epoch": 4.538980969920196, + "grad_norm": 0.2524511218070984, + "learning_rate": 5.983676883812361e-05, + "loss": 1.7653, + "step": 14788 + }, + { + "epoch": 4.5392879066912215, + "grad_norm": 0.3693946897983551, + "learning_rate": 5.98318953711861e-05, + "loss": 1.7457, + "step": 14789 + }, + { + "epoch": 4.539594843462247, + "grad_norm": 0.32625386118888855, + "learning_rate": 5.9827021807084026e-05, + "loss": 1.784, + "step": 14790 + }, + { + "epoch": 4.539901780233272, + "grad_norm": 0.24243168532848358, + "learning_rate": 5.9822148145865574e-05, + "loss": 1.7651, + "step": 14791 + }, + { + "epoch": 4.5402087170042975, + "grad_norm": 0.2950129210948944, + "learning_rate": 5.9817274387578895e-05, + "loss": 1.7316, + "step": 14792 + }, + { + "epoch": 4.540515653775322, + "grad_norm": 0.29455235600471497, + "learning_rate": 5.981240053227216e-05, + "loss": 1.7504, + "step": 14793 + }, + { + "epoch": 4.540822590546347, + "grad_norm": 0.23161925375461578, + "learning_rate": 5.980752657999352e-05, + "loss": 1.7663, + "step": 14794 + }, + { + "epoch": 4.541129527317373, + "grad_norm": 0.2725144922733307, + "learning_rate": 5.980265253079116e-05, + "loss": 1.765, + "step": 14795 + }, + { + "epoch": 4.541436464088398, + "grad_norm": 0.30911222100257874, + "learning_rate": 5.979777838471324e-05, + "loss": 1.7888, + "step": 14796 + }, + { + "epoch": 4.541743400859423, + "grad_norm": 0.2818063497543335, + "learning_rate": 5.979290414180794e-05, + "loss": 1.8047, + "step": 14797 + }, + { + "epoch": 4.542050337630448, + "grad_norm": 0.23335030674934387, + "learning_rate": 5.978802980212341e-05, + "loss": 1.8205, + "step": 14798 + }, + { + "epoch": 4.542357274401473, + "grad_norm": 0.24228201806545258, + "learning_rate": 5.9783155365707855e-05, + "loss": 1.7774, + "step": 14799 + }, + { + "epoch": 4.542664211172498, + "grad_norm": 0.2410847544670105, + "learning_rate": 5.97782808326094e-05, 
+ "loss": 1.6959, + "step": 14800 + }, + { + "epoch": 4.542971147943524, + "grad_norm": 0.24812567234039307, + "learning_rate": 5.9773406202876245e-05, + "loss": 1.8158, + "step": 14801 + }, + { + "epoch": 4.543278084714549, + "grad_norm": 0.2606147229671478, + "learning_rate": 5.9768531476556566e-05, + "loss": 1.7478, + "step": 14802 + }, + { + "epoch": 4.543585021485574, + "grad_norm": 0.24853013455867767, + "learning_rate": 5.976365665369854e-05, + "loss": 1.8158, + "step": 14803 + }, + { + "epoch": 4.543891958256599, + "grad_norm": 0.2320917695760727, + "learning_rate": 5.9758781734350334e-05, + "loss": 1.7812, + "step": 14804 + }, + { + "epoch": 4.544198895027624, + "grad_norm": 0.3460223376750946, + "learning_rate": 5.9753906718560127e-05, + "loss": 1.7562, + "step": 14805 + }, + { + "epoch": 4.5445058317986495, + "grad_norm": 0.2941136658191681, + "learning_rate": 5.9749031606376086e-05, + "loss": 1.7562, + "step": 14806 + }, + { + "epoch": 4.544812768569675, + "grad_norm": 0.2371312975883484, + "learning_rate": 5.9744156397846404e-05, + "loss": 1.7793, + "step": 14807 + }, + { + "epoch": 4.5451197053407, + "grad_norm": 0.2885094881057739, + "learning_rate": 5.973928109301926e-05, + "loss": 1.7564, + "step": 14808 + }, + { + "epoch": 4.545426642111725, + "grad_norm": 0.2369023859500885, + "learning_rate": 5.973440569194284e-05, + "loss": 1.7862, + "step": 14809 + }, + { + "epoch": 4.54573357888275, + "grad_norm": 0.26628994941711426, + "learning_rate": 5.972953019466531e-05, + "loss": 1.7828, + "step": 14810 + }, + { + "epoch": 4.546040515653775, + "grad_norm": 0.3091031610965729, + "learning_rate": 5.9724654601234864e-05, + "loss": 1.7623, + "step": 14811 + }, + { + "epoch": 4.546347452424801, + "grad_norm": 0.24652205407619476, + "learning_rate": 5.971977891169966e-05, + "loss": 1.6982, + "step": 14812 + }, + { + "epoch": 4.546654389195826, + "grad_norm": 0.21779046952724457, + "learning_rate": 5.971490312610793e-05, + "loss": 1.7363, + "step": 14813 + }, 
+ { + "epoch": 4.546961325966851, + "grad_norm": 0.24130751192569733, + "learning_rate": 5.971002724450783e-05, + "loss": 1.7014, + "step": 14814 + }, + { + "epoch": 4.547268262737876, + "grad_norm": 0.21868734061717987, + "learning_rate": 5.9705151266947534e-05, + "loss": 1.7872, + "step": 14815 + }, + { + "epoch": 4.547575199508901, + "grad_norm": 0.257376492023468, + "learning_rate": 5.9700275193475275e-05, + "loss": 1.75, + "step": 14816 + }, + { + "epoch": 4.547882136279926, + "grad_norm": 0.3182791769504547, + "learning_rate": 5.9695399024139174e-05, + "loss": 1.7965, + "step": 14817 + }, + { + "epoch": 4.548189073050952, + "grad_norm": 0.25553280115127563, + "learning_rate": 5.969052275898748e-05, + "loss": 1.8394, + "step": 14818 + }, + { + "epoch": 4.548496009821976, + "grad_norm": 0.2810833752155304, + "learning_rate": 5.9685646398068354e-05, + "loss": 1.704, + "step": 14819 + }, + { + "epoch": 4.5488029465930016, + "grad_norm": 0.21320512890815735, + "learning_rate": 5.9680769941429993e-05, + "loss": 1.7248, + "step": 14820 + }, + { + "epoch": 4.549109883364027, + "grad_norm": 0.3159593939781189, + "learning_rate": 5.96758933891206e-05, + "loss": 1.7885, + "step": 14821 + }, + { + "epoch": 4.549416820135052, + "grad_norm": 0.21894599497318268, + "learning_rate": 5.967101674118834e-05, + "loss": 1.7388, + "step": 14822 + }, + { + "epoch": 4.5497237569060776, + "grad_norm": 0.24804852902889252, + "learning_rate": 5.9666139997681424e-05, + "loss": 1.7631, + "step": 14823 + }, + { + "epoch": 4.550030693677103, + "grad_norm": 0.2678423523902893, + "learning_rate": 5.966126315864806e-05, + "loss": 1.7631, + "step": 14824 + }, + { + "epoch": 4.550337630448127, + "grad_norm": 0.229649156332016, + "learning_rate": 5.9656386224136426e-05, + "loss": 1.7292, + "step": 14825 + }, + { + "epoch": 4.550644567219153, + "grad_norm": 0.25248458981513977, + "learning_rate": 5.965150919419473e-05, + "loss": 1.8, + "step": 14826 + }, + { + "epoch": 4.550951503990178, + 
"grad_norm": 0.2583169937133789, + "learning_rate": 5.964663206887116e-05, + "loss": 1.7641, + "step": 14827 + }, + { + "epoch": 4.551258440761203, + "grad_norm": 0.21465209126472473, + "learning_rate": 5.964175484821392e-05, + "loss": 1.7475, + "step": 14828 + }, + { + "epoch": 4.551565377532229, + "grad_norm": 0.28028783202171326, + "learning_rate": 5.963687753227118e-05, + "loss": 1.7649, + "step": 14829 + }, + { + "epoch": 4.551872314303253, + "grad_norm": 0.30248284339904785, + "learning_rate": 5.9632000121091194e-05, + "loss": 1.6969, + "step": 14830 + }, + { + "epoch": 4.5521792510742785, + "grad_norm": 0.24335962533950806, + "learning_rate": 5.962712261472213e-05, + "loss": 1.7295, + "step": 14831 + }, + { + "epoch": 4.552486187845304, + "grad_norm": 0.21014504134655, + "learning_rate": 5.9622245013212206e-05, + "loss": 1.7508, + "step": 14832 + }, + { + "epoch": 4.552793124616329, + "grad_norm": 0.24892041087150574, + "learning_rate": 5.961736731660963e-05, + "loss": 1.7317, + "step": 14833 + }, + { + "epoch": 4.5531000613873545, + "grad_norm": 0.2159881740808487, + "learning_rate": 5.9612489524962556e-05, + "loss": 1.7114, + "step": 14834 + }, + { + "epoch": 4.55340699815838, + "grad_norm": 0.2952292263507843, + "learning_rate": 5.960761163831925e-05, + "loss": 1.8226, + "step": 14835 + }, + { + "epoch": 4.553713934929404, + "grad_norm": 0.3019000291824341, + "learning_rate": 5.9602733656727895e-05, + "loss": 1.7391, + "step": 14836 + }, + { + "epoch": 4.55402087170043, + "grad_norm": 0.2273966521024704, + "learning_rate": 5.9597855580236696e-05, + "loss": 1.7718, + "step": 14837 + }, + { + "epoch": 4.554327808471455, + "grad_norm": 0.2462005764245987, + "learning_rate": 5.959297740889386e-05, + "loss": 1.8428, + "step": 14838 + }, + { + "epoch": 4.55463474524248, + "grad_norm": 0.2773323059082031, + "learning_rate": 5.95880991427476e-05, + "loss": 1.6878, + "step": 14839 + }, + { + "epoch": 4.554941682013506, + "grad_norm": 0.26519861817359924, + 
"learning_rate": 5.958322078184611e-05, + "loss": 1.737, + "step": 14840 + }, + { + "epoch": 4.55524861878453, + "grad_norm": 0.20157647132873535, + "learning_rate": 5.9578342326237626e-05, + "loss": 1.7164, + "step": 14841 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.21715669333934784, + "learning_rate": 5.957346377597035e-05, + "loss": 1.705, + "step": 14842 + }, + { + "epoch": 4.555862492326581, + "grad_norm": 0.3056442439556122, + "learning_rate": 5.95685851310925e-05, + "loss": 1.7672, + "step": 14843 + }, + { + "epoch": 4.556169429097606, + "grad_norm": 0.24832262098789215, + "learning_rate": 5.956370639165228e-05, + "loss": 1.7305, + "step": 14844 + }, + { + "epoch": 4.556476365868631, + "grad_norm": 0.25814661383628845, + "learning_rate": 5.955882755769791e-05, + "loss": 1.7562, + "step": 14845 + }, + { + "epoch": 4.556783302639657, + "grad_norm": 0.38242629170417786, + "learning_rate": 5.95539486292776e-05, + "loss": 1.7077, + "step": 14846 + }, + { + "epoch": 4.557090239410681, + "grad_norm": 0.2901807427406311, + "learning_rate": 5.954906960643956e-05, + "loss": 1.7233, + "step": 14847 + }, + { + "epoch": 4.5573971761817065, + "grad_norm": 0.22636106610298157, + "learning_rate": 5.954419048923202e-05, + "loss": 1.777, + "step": 14848 + }, + { + "epoch": 4.557704112952732, + "grad_norm": 0.32392850518226624, + "learning_rate": 5.953931127770321e-05, + "loss": 1.7477, + "step": 14849 + }, + { + "epoch": 4.558011049723757, + "grad_norm": 0.3403460681438446, + "learning_rate": 5.953443197190134e-05, + "loss": 1.7712, + "step": 14850 + }, + { + "epoch": 4.558317986494782, + "grad_norm": 0.22923234105110168, + "learning_rate": 5.95295525718746e-05, + "loss": 1.8154, + "step": 14851 + }, + { + "epoch": 4.558624923265807, + "grad_norm": 0.25152841210365295, + "learning_rate": 5.952467307767124e-05, + "loss": 1.7091, + "step": 14852 + }, + { + "epoch": 4.558931860036832, + "grad_norm": 0.27743563055992126, + "learning_rate": 5.951979348933949e-05, + 
"loss": 1.7621, + "step": 14853 + }, + { + "epoch": 4.559238796807858, + "grad_norm": 0.25809308886528015, + "learning_rate": 5.951491380692756e-05, + "loss": 1.7669, + "step": 14854 + }, + { + "epoch": 4.559545733578883, + "grad_norm": 0.24863946437835693, + "learning_rate": 5.9510034030483676e-05, + "loss": 1.7354, + "step": 14855 + }, + { + "epoch": 4.559852670349908, + "grad_norm": 0.2896040380001068, + "learning_rate": 5.9505154160056066e-05, + "loss": 1.7878, + "step": 14856 + }, + { + "epoch": 4.560159607120933, + "grad_norm": 0.23814482986927032, + "learning_rate": 5.950027419569294e-05, + "loss": 1.7781, + "step": 14857 + }, + { + "epoch": 4.560466543891958, + "grad_norm": 0.2531175911426544, + "learning_rate": 5.949539413744253e-05, + "loss": 1.762, + "step": 14858 + }, + { + "epoch": 4.560773480662983, + "grad_norm": 0.2541767656803131, + "learning_rate": 5.949051398535308e-05, + "loss": 1.7722, + "step": 14859 + }, + { + "epoch": 4.561080417434009, + "grad_norm": 0.25216221809387207, + "learning_rate": 5.948563373947281e-05, + "loss": 1.754, + "step": 14860 + }, + { + "epoch": 4.561387354205034, + "grad_norm": 0.24421775341033936, + "learning_rate": 5.948075339984994e-05, + "loss": 1.7976, + "step": 14861 + }, + { + "epoch": 4.5616942909760585, + "grad_norm": 0.24435418844223022, + "learning_rate": 5.947587296653272e-05, + "loss": 1.79, + "step": 14862 + }, + { + "epoch": 4.562001227747084, + "grad_norm": 0.24471627175807953, + "learning_rate": 5.947099243956936e-05, + "loss": 1.755, + "step": 14863 + }, + { + "epoch": 4.562308164518109, + "grad_norm": 0.2762158215045929, + "learning_rate": 5.9466111819008096e-05, + "loss": 1.7695, + "step": 14864 + }, + { + "epoch": 4.5626151012891345, + "grad_norm": 0.23841319978237152, + "learning_rate": 5.9461231104897174e-05, + "loss": 1.7302, + "step": 14865 + }, + { + "epoch": 4.56292203806016, + "grad_norm": 0.260231077671051, + "learning_rate": 5.9456350297284826e-05, + "loss": 1.7917, + "step": 14866 + }, + { 
+ "epoch": 4.563228974831185, + "grad_norm": 0.2752247452735901, + "learning_rate": 5.945146939621929e-05, + "loss": 1.7953, + "step": 14867 + }, + { + "epoch": 4.56353591160221, + "grad_norm": 0.28760650753974915, + "learning_rate": 5.944658840174878e-05, + "loss": 1.8582, + "step": 14868 + }, + { + "epoch": 4.563842848373235, + "grad_norm": 0.24311676621437073, + "learning_rate": 5.944170731392153e-05, + "loss": 1.8006, + "step": 14869 + }, + { + "epoch": 4.56414978514426, + "grad_norm": 0.2692974805831909, + "learning_rate": 5.943682613278583e-05, + "loss": 1.6984, + "step": 14870 + }, + { + "epoch": 4.564456721915286, + "grad_norm": 0.2784348726272583, + "learning_rate": 5.943194485838985e-05, + "loss": 1.8082, + "step": 14871 + }, + { + "epoch": 4.564763658686311, + "grad_norm": 0.2557264268398285, + "learning_rate": 5.9427063490781885e-05, + "loss": 1.7715, + "step": 14872 + }, + { + "epoch": 4.565070595457335, + "grad_norm": 0.3738742470741272, + "learning_rate": 5.942218203001015e-05, + "loss": 1.7549, + "step": 14873 + }, + { + "epoch": 4.565377532228361, + "grad_norm": 0.2424495816230774, + "learning_rate": 5.941730047612288e-05, + "loss": 1.7388, + "step": 14874 + }, + { + "epoch": 4.565684468999386, + "grad_norm": 0.27020737528800964, + "learning_rate": 5.941241882916833e-05, + "loss": 1.752, + "step": 14875 + }, + { + "epoch": 4.565991405770411, + "grad_norm": 0.3763764798641205, + "learning_rate": 5.940753708919474e-05, + "loss": 1.7918, + "step": 14876 + }, + { + "epoch": 4.566298342541437, + "grad_norm": 0.26782163977622986, + "learning_rate": 5.940265525625036e-05, + "loss": 1.7244, + "step": 14877 + }, + { + "epoch": 4.566605279312462, + "grad_norm": 0.24978911876678467, + "learning_rate": 5.9397773330383434e-05, + "loss": 1.7706, + "step": 14878 + }, + { + "epoch": 4.5669122160834865, + "grad_norm": 0.32905304431915283, + "learning_rate": 5.93928913116422e-05, + "loss": 1.7381, + "step": 14879 + }, + { + "epoch": 4.567219152854512, + "grad_norm": 
0.2196444720029831, + "learning_rate": 5.93880092000749e-05, + "loss": 1.7605, + "step": 14880 + }, + { + "epoch": 4.567526089625537, + "grad_norm": 0.3156622350215912, + "learning_rate": 5.9383126995729786e-05, + "loss": 1.9181, + "step": 14881 + }, + { + "epoch": 4.5678330263965625, + "grad_norm": 0.2895203232765198, + "learning_rate": 5.937824469865513e-05, + "loss": 1.7967, + "step": 14882 + }, + { + "epoch": 4.568139963167588, + "grad_norm": 0.24854810535907745, + "learning_rate": 5.937336230889916e-05, + "loss": 1.7332, + "step": 14883 + }, + { + "epoch": 4.568446899938612, + "grad_norm": 0.3417081832885742, + "learning_rate": 5.936847982651013e-05, + "loss": 1.7525, + "step": 14884 + }, + { + "epoch": 4.568753836709638, + "grad_norm": 0.2874949276447296, + "learning_rate": 5.936359725153629e-05, + "loss": 1.7659, + "step": 14885 + }, + { + "epoch": 4.569060773480663, + "grad_norm": 0.25031307339668274, + "learning_rate": 5.935871458402588e-05, + "loss": 1.8061, + "step": 14886 + }, + { + "epoch": 4.569367710251688, + "grad_norm": 0.27047309279441833, + "learning_rate": 5.935383182402717e-05, + "loss": 1.7318, + "step": 14887 + }, + { + "epoch": 4.569674647022714, + "grad_norm": 0.2642819881439209, + "learning_rate": 5.9348948971588425e-05, + "loss": 1.849, + "step": 14888 + }, + { + "epoch": 4.569981583793739, + "grad_norm": 0.2452307790517807, + "learning_rate": 5.9344066026757886e-05, + "loss": 1.7491, + "step": 14889 + }, + { + "epoch": 4.570288520564763, + "grad_norm": 0.24055036902427673, + "learning_rate": 5.9339182989583795e-05, + "loss": 1.7573, + "step": 14890 + }, + { + "epoch": 4.570595457335789, + "grad_norm": 0.23036183416843414, + "learning_rate": 5.933429986011444e-05, + "loss": 1.7841, + "step": 14891 + }, + { + "epoch": 4.570902394106814, + "grad_norm": 0.27987608313560486, + "learning_rate": 5.932941663839805e-05, + "loss": 1.7835, + "step": 14892 + }, + { + "epoch": 4.571209330877839, + "grad_norm": 0.31747013330459595, + "learning_rate": 
5.93245333244829e-05, + "loss": 1.7905, + "step": 14893 + }, + { + "epoch": 4.571516267648864, + "grad_norm": 0.24841344356536865, + "learning_rate": 5.931964991841725e-05, + "loss": 1.8003, + "step": 14894 + }, + { + "epoch": 4.571823204419889, + "grad_norm": 0.2416950911283493, + "learning_rate": 5.9314766420249356e-05, + "loss": 1.7787, + "step": 14895 + }, + { + "epoch": 4.5721301411909145, + "grad_norm": 0.2322494238615036, + "learning_rate": 5.930988283002748e-05, + "loss": 1.8153, + "step": 14896 + }, + { + "epoch": 4.57243707796194, + "grad_norm": 0.22629016637802124, + "learning_rate": 5.930499914779989e-05, + "loss": 1.6743, + "step": 14897 + }, + { + "epoch": 4.572744014732965, + "grad_norm": 0.21481508016586304, + "learning_rate": 5.930011537361483e-05, + "loss": 1.7301, + "step": 14898 + }, + { + "epoch": 4.5730509515039905, + "grad_norm": 0.1993340700864792, + "learning_rate": 5.9295231507520586e-05, + "loss": 1.6796, + "step": 14899 + }, + { + "epoch": 4.573357888275015, + "grad_norm": 0.21681822836399078, + "learning_rate": 5.929034754956543e-05, + "loss": 1.7333, + "step": 14900 + }, + { + "epoch": 4.57366482504604, + "grad_norm": 0.23105305433273315, + "learning_rate": 5.928546349979761e-05, + "loss": 1.8207, + "step": 14901 + }, + { + "epoch": 4.573971761817066, + "grad_norm": 0.24656468629837036, + "learning_rate": 5.9280579358265384e-05, + "loss": 1.7805, + "step": 14902 + }, + { + "epoch": 4.574278698588091, + "grad_norm": 0.28564780950546265, + "learning_rate": 5.927569512501704e-05, + "loss": 1.7224, + "step": 14903 + }, + { + "epoch": 4.574585635359116, + "grad_norm": 0.26030251383781433, + "learning_rate": 5.927081080010084e-05, + "loss": 1.7417, + "step": 14904 + }, + { + "epoch": 4.574892572130141, + "grad_norm": 0.21427087485790253, + "learning_rate": 5.926592638356505e-05, + "loss": 1.7239, + "step": 14905 + }, + { + "epoch": 4.575199508901166, + "grad_norm": 0.2351662665605545, + "learning_rate": 5.9261041875457956e-05, + "loss": 
1.7711, + "step": 14906 + }, + { + "epoch": 4.5755064456721914, + "grad_norm": 0.27335020899772644, + "learning_rate": 5.925615727582781e-05, + "loss": 1.7496, + "step": 14907 + }, + { + "epoch": 4.575813382443217, + "grad_norm": 0.27849945425987244, + "learning_rate": 5.925127258472289e-05, + "loss": 1.7576, + "step": 14908 + }, + { + "epoch": 4.576120319214242, + "grad_norm": 0.27859339118003845, + "learning_rate": 5.924638780219147e-05, + "loss": 1.8076, + "step": 14909 + }, + { + "epoch": 4.5764272559852675, + "grad_norm": 0.24664369225502014, + "learning_rate": 5.9241502928281836e-05, + "loss": 1.7657, + "step": 14910 + }, + { + "epoch": 4.576734192756292, + "grad_norm": 0.29881149530410767, + "learning_rate": 5.923661796304224e-05, + "loss": 1.7611, + "step": 14911 + }, + { + "epoch": 4.577041129527317, + "grad_norm": 0.2672356367111206, + "learning_rate": 5.9231732906520984e-05, + "loss": 1.7605, + "step": 14912 + }, + { + "epoch": 4.577348066298343, + "grad_norm": 0.24282832443714142, + "learning_rate": 5.9226847758766336e-05, + "loss": 1.7037, + "step": 14913 + }, + { + "epoch": 4.577655003069368, + "grad_norm": 0.3822915852069855, + "learning_rate": 5.922196251982656e-05, + "loss": 1.7609, + "step": 14914 + }, + { + "epoch": 4.577961939840393, + "grad_norm": 0.30721214413642883, + "learning_rate": 5.921707718974994e-05, + "loss": 1.7398, + "step": 14915 + }, + { + "epoch": 4.578268876611418, + "grad_norm": 0.235477477312088, + "learning_rate": 5.921219176858477e-05, + "loss": 1.6869, + "step": 14916 + }, + { + "epoch": 4.578575813382443, + "grad_norm": 0.3752216100692749, + "learning_rate": 5.920730625637934e-05, + "loss": 1.7296, + "step": 14917 + }, + { + "epoch": 4.578882750153468, + "grad_norm": 0.36901310086250305, + "learning_rate": 5.920242065318189e-05, + "loss": 1.7405, + "step": 14918 + }, + { + "epoch": 4.579189686924494, + "grad_norm": 0.2308608740568161, + "learning_rate": 5.9197534959040725e-05, + "loss": 1.7953, + "step": 14919 + }, + { + 
"epoch": 4.579496623695519, + "grad_norm": 0.3286738991737366, + "learning_rate": 5.919264917400412e-05, + "loss": 1.7669, + "step": 14920 + }, + { + "epoch": 4.579803560466544, + "grad_norm": 0.3944021165370941, + "learning_rate": 5.918776329812039e-05, + "loss": 1.7165, + "step": 14921 + }, + { + "epoch": 4.580110497237569, + "grad_norm": 0.22054845094680786, + "learning_rate": 5.9182877331437795e-05, + "loss": 1.7739, + "step": 14922 + }, + { + "epoch": 4.580417434008594, + "grad_norm": 0.3467540740966797, + "learning_rate": 5.9177991274004605e-05, + "loss": 1.7713, + "step": 14923 + }, + { + "epoch": 4.5807243707796195, + "grad_norm": 0.4313695728778839, + "learning_rate": 5.917310512586914e-05, + "loss": 1.7654, + "step": 14924 + }, + { + "epoch": 4.581031307550645, + "grad_norm": 0.2723502814769745, + "learning_rate": 5.9168218887079685e-05, + "loss": 1.7314, + "step": 14925 + }, + { + "epoch": 4.581338244321669, + "grad_norm": 0.2641250789165497, + "learning_rate": 5.9163332557684504e-05, + "loss": 1.7303, + "step": 14926 + }, + { + "epoch": 4.581645181092695, + "grad_norm": 0.3780760169029236, + "learning_rate": 5.915844613773189e-05, + "loss": 1.7748, + "step": 14927 + }, + { + "epoch": 4.58195211786372, + "grad_norm": 0.23379632830619812, + "learning_rate": 5.915355962727015e-05, + "loss": 1.7482, + "step": 14928 + }, + { + "epoch": 4.582259054634745, + "grad_norm": 0.35227084159851074, + "learning_rate": 5.914867302634758e-05, + "loss": 1.8198, + "step": 14929 + }, + { + "epoch": 4.582565991405771, + "grad_norm": 0.34348124265670776, + "learning_rate": 5.914378633501245e-05, + "loss": 1.8364, + "step": 14930 + }, + { + "epoch": 4.582872928176796, + "grad_norm": 0.2446804940700531, + "learning_rate": 5.9138899553313066e-05, + "loss": 1.7779, + "step": 14931 + }, + { + "epoch": 4.58317986494782, + "grad_norm": 0.23893557488918304, + "learning_rate": 5.913401268129772e-05, + "loss": 1.7582, + "step": 14932 + }, + { + "epoch": 4.583486801718846, + 
"grad_norm": 0.3046814203262329, + "learning_rate": 5.912912571901471e-05, + "loss": 1.6871, + "step": 14933 + }, + { + "epoch": 4.583793738489871, + "grad_norm": 0.2232733964920044, + "learning_rate": 5.912423866651233e-05, + "loss": 1.7269, + "step": 14934 + }, + { + "epoch": 4.584100675260896, + "grad_norm": 0.18664126098155975, + "learning_rate": 5.911935152383888e-05, + "loss": 1.7155, + "step": 14935 + }, + { + "epoch": 4.584407612031922, + "grad_norm": 0.2573263347148895, + "learning_rate": 5.911446429104265e-05, + "loss": 1.7901, + "step": 14936 + }, + { + "epoch": 4.584714548802946, + "grad_norm": 0.2382393181324005, + "learning_rate": 5.910957696817194e-05, + "loss": 1.7407, + "step": 14937 + }, + { + "epoch": 4.5850214855739715, + "grad_norm": 0.28363972902297974, + "learning_rate": 5.910468955527504e-05, + "loss": 1.7971, + "step": 14938 + }, + { + "epoch": 4.585328422344997, + "grad_norm": 0.3173120617866516, + "learning_rate": 5.909980205240027e-05, + "loss": 1.744, + "step": 14939 + }, + { + "epoch": 4.585635359116022, + "grad_norm": 0.2281302511692047, + "learning_rate": 5.909491445959592e-05, + "loss": 1.6976, + "step": 14940 + }, + { + "epoch": 4.5859422958870475, + "grad_norm": 0.24962912499904633, + "learning_rate": 5.9090026776910304e-05, + "loss": 1.7979, + "step": 14941 + }, + { + "epoch": 4.586249232658073, + "grad_norm": 0.22330854833126068, + "learning_rate": 5.908513900439171e-05, + "loss": 1.7854, + "step": 14942 + }, + { + "epoch": 4.586556169429097, + "grad_norm": 0.20861582458019257, + "learning_rate": 5.908025114208845e-05, + "loss": 1.7133, + "step": 14943 + }, + { + "epoch": 4.586863106200123, + "grad_norm": 0.21838510036468506, + "learning_rate": 5.90753631900488e-05, + "loss": 1.6919, + "step": 14944 + }, + { + "epoch": 4.587170042971148, + "grad_norm": 0.252798467874527, + "learning_rate": 5.907047514832112e-05, + "loss": 1.838, + "step": 14945 + }, + { + "epoch": 4.587476979742173, + "grad_norm": 0.326893150806427, + 
"learning_rate": 5.906558701695369e-05, + "loss": 1.7303, + "step": 14946 + }, + { + "epoch": 4.587783916513199, + "grad_norm": 0.36489585041999817, + "learning_rate": 5.9060698795994804e-05, + "loss": 1.7631, + "step": 14947 + }, + { + "epoch": 4.588090853284223, + "grad_norm": 0.27491649985313416, + "learning_rate": 5.905581048549279e-05, + "loss": 1.7773, + "step": 14948 + }, + { + "epoch": 4.588397790055248, + "grad_norm": 0.2334890067577362, + "learning_rate": 5.905092208549595e-05, + "loss": 1.7254, + "step": 14949 + }, + { + "epoch": 4.588704726826274, + "grad_norm": 0.24383895099163055, + "learning_rate": 5.904603359605257e-05, + "loss": 1.7496, + "step": 14950 + }, + { + "epoch": 4.589011663597299, + "grad_norm": 0.2144637256860733, + "learning_rate": 5.904114501721102e-05, + "loss": 1.7028, + "step": 14951 + }, + { + "epoch": 4.589318600368324, + "grad_norm": 0.19675977528095245, + "learning_rate": 5.9036256349019555e-05, + "loss": 1.7548, + "step": 14952 + }, + { + "epoch": 4.58962553713935, + "grad_norm": 0.23712843656539917, + "learning_rate": 5.903136759152652e-05, + "loss": 1.7722, + "step": 14953 + }, + { + "epoch": 4.589932473910374, + "grad_norm": 0.20307733118534088, + "learning_rate": 5.902647874478021e-05, + "loss": 1.7177, + "step": 14954 + }, + { + "epoch": 4.5902394106813995, + "grad_norm": 0.21767669916152954, + "learning_rate": 5.9021589808828936e-05, + "loss": 1.7963, + "step": 14955 + }, + { + "epoch": 4.590546347452425, + "grad_norm": 0.2056351602077484, + "learning_rate": 5.9016700783721036e-05, + "loss": 1.7439, + "step": 14956 + }, + { + "epoch": 4.59085328422345, + "grad_norm": 0.20480911433696747, + "learning_rate": 5.90118116695048e-05, + "loss": 1.7122, + "step": 14957 + }, + { + "epoch": 4.5911602209944755, + "grad_norm": 0.24091731011867523, + "learning_rate": 5.900692246622858e-05, + "loss": 1.7862, + "step": 14958 + }, + { + "epoch": 4.5914671577655, + "grad_norm": 0.20246434211730957, + "learning_rate": 
5.900203317394066e-05, + "loss": 1.6895, + "step": 14959 + }, + { + "epoch": 4.591774094536525, + "grad_norm": 0.23771630227565765, + "learning_rate": 5.899714379268938e-05, + "loss": 1.7794, + "step": 14960 + }, + { + "epoch": 4.592081031307551, + "grad_norm": 0.2638718783855438, + "learning_rate": 5.899225432252303e-05, + "loss": 1.8059, + "step": 14961 + }, + { + "epoch": 4.592387968078576, + "grad_norm": 0.24251408874988556, + "learning_rate": 5.898736476348997e-05, + "loss": 1.8063, + "step": 14962 + }, + { + "epoch": 4.592694904849601, + "grad_norm": 0.2487735152244568, + "learning_rate": 5.8982475115638515e-05, + "loss": 1.7615, + "step": 14963 + }, + { + "epoch": 4.593001841620627, + "grad_norm": 0.23507241904735565, + "learning_rate": 5.897758537901696e-05, + "loss": 1.7496, + "step": 14964 + }, + { + "epoch": 4.593308778391651, + "grad_norm": 0.22354768216609955, + "learning_rate": 5.897269555367365e-05, + "loss": 1.7293, + "step": 14965 + }, + { + "epoch": 4.593615715162676, + "grad_norm": 0.2711353003978729, + "learning_rate": 5.89678056396569e-05, + "loss": 1.8127, + "step": 14966 + }, + { + "epoch": 4.593922651933702, + "grad_norm": 0.30061110854148865, + "learning_rate": 5.8962915637015036e-05, + "loss": 1.7653, + "step": 14967 + }, + { + "epoch": 4.594229588704727, + "grad_norm": 0.24577318131923676, + "learning_rate": 5.895802554579639e-05, + "loss": 1.7888, + "step": 14968 + }, + { + "epoch": 4.5945365254757515, + "grad_norm": 0.25568944215774536, + "learning_rate": 5.895313536604929e-05, + "loss": 1.7912, + "step": 14969 + }, + { + "epoch": 4.594843462246777, + "grad_norm": 0.2710168957710266, + "learning_rate": 5.894824509782206e-05, + "loss": 1.7681, + "step": 14970 + }, + { + "epoch": 4.595150399017802, + "grad_norm": 0.24056777358055115, + "learning_rate": 5.894335474116303e-05, + "loss": 1.7729, + "step": 14971 + }, + { + "epoch": 4.5954573357888275, + "grad_norm": 0.21956710517406464, + "learning_rate": 5.89384642961205e-05, + "loss": 
1.7576, + "step": 14972 + }, + { + "epoch": 4.595764272559853, + "grad_norm": 0.27499106526374817, + "learning_rate": 5.893357376274284e-05, + "loss": 1.7909, + "step": 14973 + }, + { + "epoch": 4.596071209330878, + "grad_norm": 0.28581273555755615, + "learning_rate": 5.8928683141078376e-05, + "loss": 1.7592, + "step": 14974 + }, + { + "epoch": 4.596378146101903, + "grad_norm": 0.23218442499637604, + "learning_rate": 5.892379243117543e-05, + "loss": 1.7142, + "step": 14975 + }, + { + "epoch": 4.596685082872928, + "grad_norm": 0.34015771746635437, + "learning_rate": 5.891890163308234e-05, + "loss": 1.7457, + "step": 14976 + }, + { + "epoch": 4.596992019643953, + "grad_norm": 0.2630012333393097, + "learning_rate": 5.8914010746847435e-05, + "loss": 1.7612, + "step": 14977 + }, + { + "epoch": 4.597298956414979, + "grad_norm": 0.2265843003988266, + "learning_rate": 5.890911977251904e-05, + "loss": 1.7272, + "step": 14978 + }, + { + "epoch": 4.597605893186004, + "grad_norm": 0.22325244545936584, + "learning_rate": 5.8904228710145505e-05, + "loss": 1.7447, + "step": 14979 + }, + { + "epoch": 4.597912829957028, + "grad_norm": 0.23512716591358185, + "learning_rate": 5.889933755977517e-05, + "loss": 1.7123, + "step": 14980 + }, + { + "epoch": 4.598219766728054, + "grad_norm": 0.22534869611263275, + "learning_rate": 5.8894446321456365e-05, + "loss": 1.785, + "step": 14981 + }, + { + "epoch": 4.598526703499079, + "grad_norm": 0.2447836697101593, + "learning_rate": 5.888955499523743e-05, + "loss": 1.7154, + "step": 14982 + }, + { + "epoch": 4.598833640270104, + "grad_norm": 0.2451140582561493, + "learning_rate": 5.88846635811667e-05, + "loss": 1.7494, + "step": 14983 + }, + { + "epoch": 4.59914057704113, + "grad_norm": 0.2253585308790207, + "learning_rate": 5.8879772079292504e-05, + "loss": 1.7591, + "step": 14984 + }, + { + "epoch": 4.599447513812155, + "grad_norm": 0.21714572608470917, + "learning_rate": 5.887488048966322e-05, + "loss": 1.7314, + "step": 14985 + }, + { + 
"epoch": 4.5997544505831796, + "grad_norm": 0.24897411465644836, + "learning_rate": 5.8869988812327145e-05, + "loss": 1.776, + "step": 14986 + }, + { + "epoch": 4.600061387354205, + "grad_norm": 0.22575093805789948, + "learning_rate": 5.8865097047332653e-05, + "loss": 1.7168, + "step": 14987 + }, + { + "epoch": 4.60036832412523, + "grad_norm": 0.22857412695884705, + "learning_rate": 5.886020519472808e-05, + "loss": 1.8262, + "step": 14988 + }, + { + "epoch": 4.600675260896256, + "grad_norm": 0.22741298377513885, + "learning_rate": 5.885531325456174e-05, + "loss": 1.6732, + "step": 14989 + }, + { + "epoch": 4.600982197667281, + "grad_norm": 0.2229645550251007, + "learning_rate": 5.885042122688202e-05, + "loss": 1.7384, + "step": 14990 + }, + { + "epoch": 4.601289134438305, + "grad_norm": 0.22609494626522064, + "learning_rate": 5.884552911173726e-05, + "loss": 1.714, + "step": 14991 + }, + { + "epoch": 4.601596071209331, + "grad_norm": 0.2629149854183197, + "learning_rate": 5.884063690917578e-05, + "loss": 1.8133, + "step": 14992 + }, + { + "epoch": 4.601903007980356, + "grad_norm": 0.220725417137146, + "learning_rate": 5.883574461924597e-05, + "loss": 1.6898, + "step": 14993 + }, + { + "epoch": 4.602209944751381, + "grad_norm": 0.207612082362175, + "learning_rate": 5.8830852241996135e-05, + "loss": 1.7302, + "step": 14994 + }, + { + "epoch": 4.602516881522407, + "grad_norm": 0.22418084740638733, + "learning_rate": 5.8825959777474625e-05, + "loss": 1.763, + "step": 14995 + }, + { + "epoch": 4.602823818293432, + "grad_norm": 0.30606865882873535, + "learning_rate": 5.882106722572983e-05, + "loss": 1.7657, + "step": 14996 + }, + { + "epoch": 4.6031307550644565, + "grad_norm": 0.2947966456413269, + "learning_rate": 5.881617458681008e-05, + "loss": 1.7796, + "step": 14997 + }, + { + "epoch": 4.603437691835482, + "grad_norm": 0.23430216312408447, + "learning_rate": 5.881128186076372e-05, + "loss": 1.78, + "step": 14998 + }, + { + "epoch": 4.603744628606507, + "grad_norm": 
0.28081849217414856, + "learning_rate": 5.880638904763911e-05, + "loss": 1.6791, + "step": 14999 + }, + { + "epoch": 4.6040515653775325, + "grad_norm": 0.25459226965904236, + "learning_rate": 5.88014961474846e-05, + "loss": 1.8064, + "step": 15000 + }, + { + "epoch": 4.604358502148557, + "grad_norm": 0.2358713001012802, + "learning_rate": 5.879660316034854e-05, + "loss": 1.763, + "step": 15001 + }, + { + "epoch": 4.604665438919582, + "grad_norm": 0.32954758405685425, + "learning_rate": 5.879171008627931e-05, + "loss": 1.7462, + "step": 15002 + }, + { + "epoch": 4.604972375690608, + "grad_norm": 0.2588615417480469, + "learning_rate": 5.878681692532523e-05, + "loss": 1.7771, + "step": 15003 + }, + { + "epoch": 4.605279312461633, + "grad_norm": 0.21216195821762085, + "learning_rate": 5.878192367753468e-05, + "loss": 1.7128, + "step": 15004 + }, + { + "epoch": 4.605586249232658, + "grad_norm": 0.26849040389060974, + "learning_rate": 5.8777030342956016e-05, + "loss": 1.7048, + "step": 15005 + }, + { + "epoch": 4.605893186003684, + "grad_norm": 0.22343295812606812, + "learning_rate": 5.877213692163759e-05, + "loss": 1.7695, + "step": 15006 + }, + { + "epoch": 4.606200122774708, + "grad_norm": 0.2794288694858551, + "learning_rate": 5.876724341362776e-05, + "loss": 1.7856, + "step": 15007 + }, + { + "epoch": 4.606507059545733, + "grad_norm": 0.3525427579879761, + "learning_rate": 5.8762349818974905e-05, + "loss": 1.7807, + "step": 15008 + }, + { + "epoch": 4.606813996316759, + "grad_norm": 0.25886499881744385, + "learning_rate": 5.875745613772736e-05, + "loss": 1.7818, + "step": 15009 + }, + { + "epoch": 4.607120933087784, + "grad_norm": 0.24822987616062164, + "learning_rate": 5.8752562369933515e-05, + "loss": 1.7369, + "step": 15010 + }, + { + "epoch": 4.607427869858809, + "grad_norm": 0.26067355275154114, + "learning_rate": 5.874766851564171e-05, + "loss": 1.7056, + "step": 15011 + }, + { + "epoch": 4.607734806629834, + "grad_norm": 0.2869747579097748, + "learning_rate": 
5.874277457490033e-05, + "loss": 1.7284, + "step": 15012 + }, + { + "epoch": 4.608041743400859, + "grad_norm": 0.23153580725193024, + "learning_rate": 5.87378805477577e-05, + "loss": 1.7331, + "step": 15013 + }, + { + "epoch": 4.6083486801718845, + "grad_norm": 0.29307299852371216, + "learning_rate": 5.873298643426223e-05, + "loss": 1.7376, + "step": 15014 + }, + { + "epoch": 4.60865561694291, + "grad_norm": 0.25638771057128906, + "learning_rate": 5.872809223446227e-05, + "loss": 1.7585, + "step": 15015 + }, + { + "epoch": 4.608962553713935, + "grad_norm": 0.2272702306509018, + "learning_rate": 5.872319794840618e-05, + "loss": 1.7482, + "step": 15016 + }, + { + "epoch": 4.6092694904849605, + "grad_norm": 0.2579486072063446, + "learning_rate": 5.8718303576142356e-05, + "loss": 1.778, + "step": 15017 + }, + { + "epoch": 4.609576427255985, + "grad_norm": 0.2216452956199646, + "learning_rate": 5.871340911771912e-05, + "loss": 1.7517, + "step": 15018 + }, + { + "epoch": 4.60988336402701, + "grad_norm": 0.22628961503505707, + "learning_rate": 5.870851457318488e-05, + "loss": 1.7579, + "step": 15019 + }, + { + "epoch": 4.610190300798036, + "grad_norm": 0.31018149852752686, + "learning_rate": 5.8703619942588e-05, + "loss": 1.7911, + "step": 15020 + }, + { + "epoch": 4.610497237569061, + "grad_norm": 0.2618122100830078, + "learning_rate": 5.869872522597683e-05, + "loss": 1.8121, + "step": 15021 + }, + { + "epoch": 4.610804174340086, + "grad_norm": 0.26085740327835083, + "learning_rate": 5.869383042339978e-05, + "loss": 1.7952, + "step": 15022 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.25237780809402466, + "learning_rate": 5.86889355349052e-05, + "loss": 1.7575, + "step": 15023 + }, + { + "epoch": 4.611418047882136, + "grad_norm": 0.27550897002220154, + "learning_rate": 5.868404056054144e-05, + "loss": 1.7816, + "step": 15024 + }, + { + "epoch": 4.611724984653161, + "grad_norm": 0.2458692342042923, + "learning_rate": 5.8679145500356926e-05, + "loss": 1.7783, + 
"step": 15025 + }, + { + "epoch": 4.612031921424187, + "grad_norm": 0.25606176257133484, + "learning_rate": 5.867425035439999e-05, + "loss": 1.7863, + "step": 15026 + }, + { + "epoch": 4.612338858195212, + "grad_norm": 0.3206995725631714, + "learning_rate": 5.866935512271905e-05, + "loss": 1.7468, + "step": 15027 + }, + { + "epoch": 4.612645794966237, + "grad_norm": 0.2754824459552765, + "learning_rate": 5.866445980536245e-05, + "loss": 1.793, + "step": 15028 + }, + { + "epoch": 4.612952731737262, + "grad_norm": 0.25168612599372864, + "learning_rate": 5.865956440237859e-05, + "loss": 1.7252, + "step": 15029 + }, + { + "epoch": 4.613259668508287, + "grad_norm": 0.3226735293865204, + "learning_rate": 5.8654668913815815e-05, + "loss": 1.7291, + "step": 15030 + }, + { + "epoch": 4.6135666052793125, + "grad_norm": 0.2580295503139496, + "learning_rate": 5.864977333972255e-05, + "loss": 1.7622, + "step": 15031 + }, + { + "epoch": 4.613873542050338, + "grad_norm": 0.21486075222492218, + "learning_rate": 5.864487768014715e-05, + "loss": 1.7662, + "step": 15032 + }, + { + "epoch": 4.614180478821363, + "grad_norm": 0.2331690639257431, + "learning_rate": 5.8639981935137996e-05, + "loss": 1.7389, + "step": 15033 + }, + { + "epoch": 4.614487415592388, + "grad_norm": 0.2573511302471161, + "learning_rate": 5.863508610474348e-05, + "loss": 1.7699, + "step": 15034 + }, + { + "epoch": 4.614794352363413, + "grad_norm": 0.2260694056749344, + "learning_rate": 5.863019018901199e-05, + "loss": 1.7784, + "step": 15035 + }, + { + "epoch": 4.615101289134438, + "grad_norm": 0.2283065915107727, + "learning_rate": 5.8625294187991895e-05, + "loss": 1.7061, + "step": 15036 + }, + { + "epoch": 4.615408225905464, + "grad_norm": 0.24772310256958008, + "learning_rate": 5.862039810173159e-05, + "loss": 1.7568, + "step": 15037 + }, + { + "epoch": 4.615715162676489, + "grad_norm": 0.2515513002872467, + "learning_rate": 5.861550193027945e-05, + "loss": 1.7445, + "step": 15038 + }, + { + "epoch": 
4.616022099447514, + "grad_norm": 0.26472151279449463, + "learning_rate": 5.8610605673683885e-05, + "loss": 1.7735, + "step": 15039 + }, + { + "epoch": 4.616329036218539, + "grad_norm": 0.24053528904914856, + "learning_rate": 5.8605709331993254e-05, + "loss": 1.8009, + "step": 15040 + }, + { + "epoch": 4.616635972989564, + "grad_norm": 0.25125381350517273, + "learning_rate": 5.860081290525596e-05, + "loss": 1.7712, + "step": 15041 + }, + { + "epoch": 4.616942909760589, + "grad_norm": 0.23056018352508545, + "learning_rate": 5.85959163935204e-05, + "loss": 1.7684, + "step": 15042 + }, + { + "epoch": 4.617249846531615, + "grad_norm": 0.2533007562160492, + "learning_rate": 5.859101979683494e-05, + "loss": 1.7793, + "step": 15043 + }, + { + "epoch": 4.617556783302639, + "grad_norm": 0.21007375419139862, + "learning_rate": 5.8586123115248e-05, + "loss": 1.7484, + "step": 15044 + }, + { + "epoch": 4.6178637200736645, + "grad_norm": 0.21329566836357117, + "learning_rate": 5.858122634880797e-05, + "loss": 1.7763, + "step": 15045 + }, + { + "epoch": 4.61817065684469, + "grad_norm": 0.2362898588180542, + "learning_rate": 5.857632949756322e-05, + "loss": 1.7484, + "step": 15046 + }, + { + "epoch": 4.618477593615715, + "grad_norm": 0.2168794423341751, + "learning_rate": 5.857143256156214e-05, + "loss": 1.7752, + "step": 15047 + }, + { + "epoch": 4.6187845303867405, + "grad_norm": 0.24761471152305603, + "learning_rate": 5.856653554085316e-05, + "loss": 1.7793, + "step": 15048 + }, + { + "epoch": 4.619091467157766, + "grad_norm": 0.23202158510684967, + "learning_rate": 5.856163843548466e-05, + "loss": 1.6862, + "step": 15049 + }, + { + "epoch": 4.61939840392879, + "grad_norm": 0.23868000507354736, + "learning_rate": 5.855674124550501e-05, + "loss": 1.8075, + "step": 15050 + }, + { + "epoch": 4.619705340699816, + "grad_norm": 0.3063114583492279, + "learning_rate": 5.855184397096265e-05, + "loss": 1.8051, + "step": 15051 + }, + { + "epoch": 4.620012277470841, + "grad_norm": 
0.22672493755817413, + "learning_rate": 5.854694661190594e-05, + "loss": 1.7478, + "step": 15052 + }, + { + "epoch": 4.620319214241866, + "grad_norm": 0.3403559923171997, + "learning_rate": 5.8542049168383296e-05, + "loss": 1.765, + "step": 15053 + }, + { + "epoch": 4.620626151012892, + "grad_norm": 0.33852189779281616, + "learning_rate": 5.853715164044312e-05, + "loss": 1.7602, + "step": 15054 + }, + { + "epoch": 4.620933087783916, + "grad_norm": 0.25166940689086914, + "learning_rate": 5.85322540281338e-05, + "loss": 1.7584, + "step": 15055 + }, + { + "epoch": 4.621240024554941, + "grad_norm": 0.3417987823486328, + "learning_rate": 5.8527356331503757e-05, + "loss": 1.8491, + "step": 15056 + }, + { + "epoch": 4.621546961325967, + "grad_norm": 0.3286994397640228, + "learning_rate": 5.852245855060138e-05, + "loss": 1.7146, + "step": 15057 + }, + { + "epoch": 4.621853898096992, + "grad_norm": 0.24394257366657257, + "learning_rate": 5.851756068547505e-05, + "loss": 1.8762, + "step": 15058 + }, + { + "epoch": 4.622160834868017, + "grad_norm": 0.34945347905158997, + "learning_rate": 5.851266273617321e-05, + "loss": 1.8086, + "step": 15059 + }, + { + "epoch": 4.622467771639043, + "grad_norm": 0.30189210176467896, + "learning_rate": 5.850776470274425e-05, + "loss": 1.7366, + "step": 15060 + }, + { + "epoch": 4.622774708410067, + "grad_norm": 0.24050579965114594, + "learning_rate": 5.850286658523657e-05, + "loss": 1.7599, + "step": 15061 + }, + { + "epoch": 4.6230816451810925, + "grad_norm": 0.33650726079940796, + "learning_rate": 5.849796838369857e-05, + "loss": 1.7343, + "step": 15062 + }, + { + "epoch": 4.623388581952118, + "grad_norm": 0.2855902910232544, + "learning_rate": 5.849307009817868e-05, + "loss": 1.7325, + "step": 15063 + }, + { + "epoch": 4.623695518723143, + "grad_norm": 0.2562592923641205, + "learning_rate": 5.8488171728725275e-05, + "loss": 1.7772, + "step": 15064 + }, + { + "epoch": 4.6240024554941686, + "grad_norm": 0.23494984209537506, + 
"learning_rate": 5.84832732753868e-05, + "loss": 1.7263, + "step": 15065 + }, + { + "epoch": 4.624309392265193, + "grad_norm": 0.23248226940631866, + "learning_rate": 5.847837473821164e-05, + "loss": 1.7441, + "step": 15066 + }, + { + "epoch": 4.624616329036218, + "grad_norm": 0.2291254848241806, + "learning_rate": 5.847347611724821e-05, + "loss": 1.7742, + "step": 15067 + }, + { + "epoch": 4.624923265807244, + "grad_norm": 0.28305280208587646, + "learning_rate": 5.8468577412544925e-05, + "loss": 1.8224, + "step": 15068 + }, + { + "epoch": 4.625230202578269, + "grad_norm": 0.25531691312789917, + "learning_rate": 5.84636786241502e-05, + "loss": 1.7458, + "step": 15069 + }, + { + "epoch": 4.625537139349294, + "grad_norm": 0.2363462746143341, + "learning_rate": 5.845877975211242e-05, + "loss": 1.7977, + "step": 15070 + }, + { + "epoch": 4.62584407612032, + "grad_norm": 0.2707001864910126, + "learning_rate": 5.845388079648004e-05, + "loss": 1.774, + "step": 15071 + }, + { + "epoch": 4.626151012891344, + "grad_norm": 0.22281844913959503, + "learning_rate": 5.844898175730146e-05, + "loss": 1.7888, + "step": 15072 + }, + { + "epoch": 4.6264579496623695, + "grad_norm": 0.24809995293617249, + "learning_rate": 5.8444082634625086e-05, + "loss": 1.7895, + "step": 15073 + }, + { + "epoch": 4.626764886433395, + "grad_norm": 0.2842096984386444, + "learning_rate": 5.843918342849933e-05, + "loss": 1.7323, + "step": 15074 + }, + { + "epoch": 4.62707182320442, + "grad_norm": 0.21343614161014557, + "learning_rate": 5.843428413897261e-05, + "loss": 1.7298, + "step": 15075 + }, + { + "epoch": 4.627378759975445, + "grad_norm": 0.2420526146888733, + "learning_rate": 5.842938476609336e-05, + "loss": 1.778, + "step": 15076 + }, + { + "epoch": 4.62768569674647, + "grad_norm": 0.22202003002166748, + "learning_rate": 5.842448530990999e-05, + "loss": 1.779, + "step": 15077 + }, + { + "epoch": 4.627992633517495, + "grad_norm": 0.26784011721611023, + "learning_rate": 5.841958577047092e-05, + 
"loss": 1.799, + "step": 15078 + }, + { + "epoch": 4.628299570288521, + "grad_norm": 0.3230212926864624, + "learning_rate": 5.841468614782457e-05, + "loss": 1.7789, + "step": 15079 + }, + { + "epoch": 4.628606507059546, + "grad_norm": 0.24062715470790863, + "learning_rate": 5.840978644201935e-05, + "loss": 1.7697, + "step": 15080 + }, + { + "epoch": 4.628913443830571, + "grad_norm": 0.2882130444049835, + "learning_rate": 5.84048866531037e-05, + "loss": 1.7946, + "step": 15081 + }, + { + "epoch": 4.629220380601596, + "grad_norm": 0.3145603537559509, + "learning_rate": 5.839998678112602e-05, + "loss": 1.7116, + "step": 15082 + }, + { + "epoch": 4.629527317372621, + "grad_norm": 0.270997017621994, + "learning_rate": 5.839508682613477e-05, + "loss": 1.8281, + "step": 15083 + }, + { + "epoch": 4.629834254143646, + "grad_norm": 0.27299395203590393, + "learning_rate": 5.839018678817834e-05, + "loss": 1.8233, + "step": 15084 + }, + { + "epoch": 4.630141190914672, + "grad_norm": 0.2684478461742401, + "learning_rate": 5.838528666730517e-05, + "loss": 1.8111, + "step": 15085 + }, + { + "epoch": 4.630448127685697, + "grad_norm": 0.2365201860666275, + "learning_rate": 5.838038646356367e-05, + "loss": 1.7475, + "step": 15086 + }, + { + "epoch": 4.6307550644567215, + "grad_norm": 0.2661258280277252, + "learning_rate": 5.8375486177002305e-05, + "loss": 1.748, + "step": 15087 + }, + { + "epoch": 4.631062001227747, + "grad_norm": 0.2865012586116791, + "learning_rate": 5.8370585807669455e-05, + "loss": 1.7525, + "step": 15088 + }, + { + "epoch": 4.631368937998772, + "grad_norm": 0.2445172518491745, + "learning_rate": 5.836568535561358e-05, + "loss": 1.7278, + "step": 15089 + }, + { + "epoch": 4.6316758747697975, + "grad_norm": 0.28192558884620667, + "learning_rate": 5.8360784820883083e-05, + "loss": 1.7371, + "step": 15090 + }, + { + "epoch": 4.631982811540823, + "grad_norm": 0.38927358388900757, + "learning_rate": 5.835588420352642e-05, + "loss": 1.8088, + "step": 15091 + }, + { + 
"epoch": 4.632289748311848, + "grad_norm": 0.3409229516983032, + "learning_rate": 5.8350983503592025e-05, + "loss": 1.8011, + "step": 15092 + }, + { + "epoch": 4.632596685082873, + "grad_norm": 0.2464994341135025, + "learning_rate": 5.8346082721128294e-05, + "loss": 1.8354, + "step": 15093 + }, + { + "epoch": 4.632903621853898, + "grad_norm": 0.38765814900398254, + "learning_rate": 5.834118185618369e-05, + "loss": 1.7811, + "step": 15094 + }, + { + "epoch": 4.633210558624923, + "grad_norm": 0.42435070872306824, + "learning_rate": 5.833628090880664e-05, + "loss": 1.7855, + "step": 15095 + }, + { + "epoch": 4.633517495395949, + "grad_norm": 0.244876891374588, + "learning_rate": 5.833137987904558e-05, + "loss": 1.7494, + "step": 15096 + }, + { + "epoch": 4.633824432166974, + "grad_norm": 0.30353477597236633, + "learning_rate": 5.8326478766948934e-05, + "loss": 1.7772, + "step": 15097 + }, + { + "epoch": 4.634131368937998, + "grad_norm": 0.38839244842529297, + "learning_rate": 5.8321577572565146e-05, + "loss": 1.7689, + "step": 15098 + }, + { + "epoch": 4.634438305709024, + "grad_norm": 0.357129842042923, + "learning_rate": 5.8316676295942644e-05, + "loss": 1.7777, + "step": 15099 + }, + { + "epoch": 4.634745242480049, + "grad_norm": 0.23458799719810486, + "learning_rate": 5.831177493712988e-05, + "loss": 1.7544, + "step": 15100 + }, + { + "epoch": 4.635052179251074, + "grad_norm": 0.23751308023929596, + "learning_rate": 5.830687349617529e-05, + "loss": 1.7491, + "step": 15101 + }, + { + "epoch": 4.6353591160221, + "grad_norm": 0.31978943943977356, + "learning_rate": 5.83019719731273e-05, + "loss": 1.7439, + "step": 15102 + }, + { + "epoch": 4.635666052793125, + "grad_norm": 0.2751142084598541, + "learning_rate": 5.829707036803438e-05, + "loss": 1.8598, + "step": 15103 + }, + { + "epoch": 4.6359729895641495, + "grad_norm": 0.23670406639575958, + "learning_rate": 5.8292168680944914e-05, + "loss": 1.7629, + "step": 15104 + }, + { + "epoch": 4.636279926335175, + 
"grad_norm": 0.2447349727153778, + "learning_rate": 5.828726691190739e-05, + "loss": 1.7606, + "step": 15105 + }, + { + "epoch": 4.6365868631062, + "grad_norm": 0.2739902436733246, + "learning_rate": 5.828236506097023e-05, + "loss": 1.707, + "step": 15106 + }, + { + "epoch": 4.6368937998772255, + "grad_norm": 0.2050863653421402, + "learning_rate": 5.82774631281819e-05, + "loss": 1.7235, + "step": 15107 + }, + { + "epoch": 4.637200736648251, + "grad_norm": 0.3005560338497162, + "learning_rate": 5.827256111359082e-05, + "loss": 1.7785, + "step": 15108 + }, + { + "epoch": 4.637507673419275, + "grad_norm": 0.27168264985084534, + "learning_rate": 5.8267659017245434e-05, + "loss": 1.7844, + "step": 15109 + }, + { + "epoch": 4.637814610190301, + "grad_norm": 0.2965840995311737, + "learning_rate": 5.82627568391942e-05, + "loss": 1.7631, + "step": 15110 + }, + { + "epoch": 4.638121546961326, + "grad_norm": 0.3114408552646637, + "learning_rate": 5.825785457948556e-05, + "loss": 1.77, + "step": 15111 + }, + { + "epoch": 4.638428483732351, + "grad_norm": 0.2638910114765167, + "learning_rate": 5.825295223816796e-05, + "loss": 1.9183, + "step": 15112 + }, + { + "epoch": 4.638735420503377, + "grad_norm": 0.3293665051460266, + "learning_rate": 5.824804981528986e-05, + "loss": 1.6779, + "step": 15113 + }, + { + "epoch": 4.639042357274402, + "grad_norm": 0.28586456179618835, + "learning_rate": 5.824314731089968e-05, + "loss": 1.7905, + "step": 15114 + }, + { + "epoch": 4.639349294045426, + "grad_norm": 0.2254554182291031, + "learning_rate": 5.8238244725045906e-05, + "loss": 1.7602, + "step": 15115 + }, + { + "epoch": 4.639656230816452, + "grad_norm": 0.2770406901836395, + "learning_rate": 5.823334205777695e-05, + "loss": 1.7789, + "step": 15116 + }, + { + "epoch": 4.639963167587477, + "grad_norm": 0.2867025136947632, + "learning_rate": 5.822843930914129e-05, + "loss": 1.7408, + "step": 15117 + }, + { + "epoch": 4.640270104358502, + "grad_norm": 0.23486989736557007, + 
"learning_rate": 5.822353647918737e-05, + "loss": 1.7489, + "step": 15118 + }, + { + "epoch": 4.640577041129527, + "grad_norm": 0.2274324595928192, + "learning_rate": 5.821863356796367e-05, + "loss": 1.768, + "step": 15119 + }, + { + "epoch": 4.640883977900552, + "grad_norm": 0.25032591819763184, + "learning_rate": 5.821373057551858e-05, + "loss": 1.7602, + "step": 15120 + }, + { + "epoch": 4.6411909146715775, + "grad_norm": 0.22332963347434998, + "learning_rate": 5.820882750190059e-05, + "loss": 1.756, + "step": 15121 + }, + { + "epoch": 4.641497851442603, + "grad_norm": 0.24975591897964478, + "learning_rate": 5.820392434715817e-05, + "loss": 1.6963, + "step": 15122 + }, + { + "epoch": 4.641804788213628, + "grad_norm": 0.27892687916755676, + "learning_rate": 5.819902111133976e-05, + "loss": 1.8295, + "step": 15123 + }, + { + "epoch": 4.6421117249846535, + "grad_norm": 0.23914897441864014, + "learning_rate": 5.819411779449381e-05, + "loss": 1.7636, + "step": 15124 + }, + { + "epoch": 4.642418661755678, + "grad_norm": 0.2349565476179123, + "learning_rate": 5.818921439666879e-05, + "loss": 1.7823, + "step": 15125 + }, + { + "epoch": 4.642725598526703, + "grad_norm": 0.2075800597667694, + "learning_rate": 5.818431091791315e-05, + "loss": 1.7282, + "step": 15126 + }, + { + "epoch": 4.643032535297729, + "grad_norm": 0.19781073927879333, + "learning_rate": 5.817940735827535e-05, + "loss": 1.7598, + "step": 15127 + }, + { + "epoch": 4.643339472068754, + "grad_norm": 0.21997439861297607, + "learning_rate": 5.8174503717803866e-05, + "loss": 1.766, + "step": 15128 + }, + { + "epoch": 4.643646408839779, + "grad_norm": 0.23971444368362427, + "learning_rate": 5.816959999654713e-05, + "loss": 1.7824, + "step": 15129 + }, + { + "epoch": 4.643953345610804, + "grad_norm": 0.23357853293418884, + "learning_rate": 5.816469619455363e-05, + "loss": 1.7353, + "step": 15130 + }, + { + "epoch": 4.644260282381829, + "grad_norm": 0.22030897438526154, + "learning_rate": 5.815979231187181e-05, 
+ "loss": 1.7413, + "step": 15131 + }, + { + "epoch": 4.644567219152854, + "grad_norm": 0.2322571873664856, + "learning_rate": 5.815488834855014e-05, + "loss": 1.7305, + "step": 15132 + }, + { + "epoch": 4.64487415592388, + "grad_norm": 0.25256821513175964, + "learning_rate": 5.814998430463709e-05, + "loss": 1.7533, + "step": 15133 + }, + { + "epoch": 4.645181092694905, + "grad_norm": 0.248504638671875, + "learning_rate": 5.81450801801811e-05, + "loss": 1.7345, + "step": 15134 + }, + { + "epoch": 4.64548802946593, + "grad_norm": 0.22850964963436127, + "learning_rate": 5.8140175975230673e-05, + "loss": 1.8308, + "step": 15135 + }, + { + "epoch": 4.645794966236955, + "grad_norm": 0.3517951965332031, + "learning_rate": 5.813527168983426e-05, + "loss": 1.811, + "step": 15136 + }, + { + "epoch": 4.64610190300798, + "grad_norm": 0.32132068276405334, + "learning_rate": 5.813036732404031e-05, + "loss": 1.7584, + "step": 15137 + }, + { + "epoch": 4.6464088397790055, + "grad_norm": 0.2349396049976349, + "learning_rate": 5.812546287789731e-05, + "loss": 1.7762, + "step": 15138 + }, + { + "epoch": 4.646715776550031, + "grad_norm": 0.23519493639469147, + "learning_rate": 5.812055835145372e-05, + "loss": 1.7428, + "step": 15139 + }, + { + "epoch": 4.647022713321056, + "grad_norm": 0.29277852177619934, + "learning_rate": 5.8115653744758016e-05, + "loss": 1.7599, + "step": 15140 + }, + { + "epoch": 4.647329650092081, + "grad_norm": 0.2347593754529953, + "learning_rate": 5.811074905785867e-05, + "loss": 1.7401, + "step": 15141 + }, + { + "epoch": 4.647636586863106, + "grad_norm": 0.23080264031887054, + "learning_rate": 5.8105844290804147e-05, + "loss": 1.7705, + "step": 15142 + }, + { + "epoch": 4.647943523634131, + "grad_norm": 0.24686801433563232, + "learning_rate": 5.810093944364291e-05, + "loss": 1.7409, + "step": 15143 + }, + { + "epoch": 4.648250460405157, + "grad_norm": 0.24098120629787445, + "learning_rate": 5.809603451642344e-05, + "loss": 1.7893, + "step": 15144 + }, + { 
+ "epoch": 4.648557397176182, + "grad_norm": 0.23020638525485992, + "learning_rate": 5.809112950919422e-05, + "loss": 1.7589, + "step": 15145 + }, + { + "epoch": 4.648864333947207, + "grad_norm": 0.3036736249923706, + "learning_rate": 5.808622442200371e-05, + "loss": 1.7964, + "step": 15146 + }, + { + "epoch": 4.649171270718232, + "grad_norm": 0.2965635657310486, + "learning_rate": 5.808131925490039e-05, + "loss": 1.7986, + "step": 15147 + }, + { + "epoch": 4.649478207489257, + "grad_norm": 0.22241640090942383, + "learning_rate": 5.8076414007932745e-05, + "loss": 1.749, + "step": 15148 + }, + { + "epoch": 4.649785144260282, + "grad_norm": 0.20304246246814728, + "learning_rate": 5.8071508681149246e-05, + "loss": 1.7374, + "step": 15149 + }, + { + "epoch": 4.650092081031308, + "grad_norm": 0.19534410536289215, + "learning_rate": 5.806660327459834e-05, + "loss": 1.7087, + "step": 15150 + }, + { + "epoch": 4.650399017802332, + "grad_norm": 0.2151753008365631, + "learning_rate": 5.806169778832856e-05, + "loss": 1.7409, + "step": 15151 + }, + { + "epoch": 4.650705954573358, + "grad_norm": 0.2180301696062088, + "learning_rate": 5.805679222238836e-05, + "loss": 1.7522, + "step": 15152 + }, + { + "epoch": 4.651012891344383, + "grad_norm": 0.19917607307434082, + "learning_rate": 5.8051886576826205e-05, + "loss": 1.768, + "step": 15153 + }, + { + "epoch": 4.651319828115408, + "grad_norm": 0.2312052994966507, + "learning_rate": 5.804698085169059e-05, + "loss": 1.7799, + "step": 15154 + }, + { + "epoch": 4.651626764886434, + "grad_norm": 0.21541514992713928, + "learning_rate": 5.804207504702999e-05, + "loss": 1.7595, + "step": 15155 + }, + { + "epoch": 4.651933701657459, + "grad_norm": 0.2029450386762619, + "learning_rate": 5.803716916289289e-05, + "loss": 1.7727, + "step": 15156 + }, + { + "epoch": 4.652240638428484, + "grad_norm": 0.21796850860118866, + "learning_rate": 5.8032263199327787e-05, + "loss": 1.7445, + "step": 15157 + }, + { + "epoch": 4.652547575199509, + 
"grad_norm": 0.20309078693389893, + "learning_rate": 5.802735715638314e-05, + "loss": 1.6971, + "step": 15158 + }, + { + "epoch": 4.652854511970534, + "grad_norm": 0.21270112693309784, + "learning_rate": 5.802245103410745e-05, + "loss": 1.7162, + "step": 15159 + }, + { + "epoch": 4.653161448741559, + "grad_norm": 0.25357750058174133, + "learning_rate": 5.8017544832549184e-05, + "loss": 1.7534, + "step": 15160 + }, + { + "epoch": 4.653468385512585, + "grad_norm": 0.24015015363693237, + "learning_rate": 5.8012638551756847e-05, + "loss": 1.7639, + "step": 15161 + }, + { + "epoch": 4.653775322283609, + "grad_norm": 0.20507018268108368, + "learning_rate": 5.800773219177893e-05, + "loss": 1.7293, + "step": 15162 + }, + { + "epoch": 4.6540822590546345, + "grad_norm": 0.23399868607521057, + "learning_rate": 5.800282575266389e-05, + "loss": 1.8286, + "step": 15163 + }, + { + "epoch": 4.65438919582566, + "grad_norm": 0.27126726508140564, + "learning_rate": 5.799791923446025e-05, + "loss": 1.8028, + "step": 15164 + }, + { + "epoch": 4.654696132596685, + "grad_norm": 0.23644569516181946, + "learning_rate": 5.7993012637216494e-05, + "loss": 1.7138, + "step": 15165 + }, + { + "epoch": 4.6550030693677105, + "grad_norm": 0.21557916700839996, + "learning_rate": 5.7988105960981086e-05, + "loss": 1.7703, + "step": 15166 + }, + { + "epoch": 4.655310006138736, + "grad_norm": 0.22030150890350342, + "learning_rate": 5.798319920580254e-05, + "loss": 1.7282, + "step": 15167 + }, + { + "epoch": 4.65561694290976, + "grad_norm": 0.2092939168214798, + "learning_rate": 5.7978292371729325e-05, + "loss": 1.7853, + "step": 15168 + }, + { + "epoch": 4.655923879680786, + "grad_norm": 0.21643707156181335, + "learning_rate": 5.797338545880997e-05, + "loss": 1.7582, + "step": 15169 + }, + { + "epoch": 4.656230816451811, + "grad_norm": 0.3064669668674469, + "learning_rate": 5.796847846709294e-05, + "loss": 1.8139, + "step": 15170 + }, + { + "epoch": 4.656537753222836, + "grad_norm": 0.3060479760169983, 
+ "learning_rate": 5.796357139662674e-05, + "loss": 1.7356, + "step": 15171 + }, + { + "epoch": 4.656844689993862, + "grad_norm": 0.23546656966209412, + "learning_rate": 5.7958664247459835e-05, + "loss": 1.7937, + "step": 15172 + }, + { + "epoch": 4.657151626764886, + "grad_norm": 0.2890888750553131, + "learning_rate": 5.795375701964077e-05, + "loss": 1.7305, + "step": 15173 + }, + { + "epoch": 4.657458563535911, + "grad_norm": 0.27948084473609924, + "learning_rate": 5.794884971321801e-05, + "loss": 1.7428, + "step": 15174 + }, + { + "epoch": 4.657765500306937, + "grad_norm": 0.2354089468717575, + "learning_rate": 5.794394232824007e-05, + "loss": 1.7622, + "step": 15175 + }, + { + "epoch": 4.658072437077962, + "grad_norm": 0.3271159827709198, + "learning_rate": 5.793903486475541e-05, + "loss": 1.7826, + "step": 15176 + }, + { + "epoch": 4.658379373848987, + "grad_norm": 0.3561338782310486, + "learning_rate": 5.793412732281257e-05, + "loss": 1.7698, + "step": 15177 + }, + { + "epoch": 4.658686310620013, + "grad_norm": 0.2913050949573517, + "learning_rate": 5.7929219702460035e-05, + "loss": 1.8156, + "step": 15178 + }, + { + "epoch": 4.658993247391037, + "grad_norm": 0.2345089465379715, + "learning_rate": 5.7924312003746294e-05, + "loss": 1.7859, + "step": 15179 + }, + { + "epoch": 4.6593001841620625, + "grad_norm": 0.3018132150173187, + "learning_rate": 5.7919404226719865e-05, + "loss": 1.7622, + "step": 15180 + }, + { + "epoch": 4.659607120933088, + "grad_norm": 0.29134172201156616, + "learning_rate": 5.791449637142924e-05, + "loss": 1.7287, + "step": 15181 + }, + { + "epoch": 4.659914057704113, + "grad_norm": 0.24126321077346802, + "learning_rate": 5.7909588437922924e-05, + "loss": 1.7969, + "step": 15182 + }, + { + "epoch": 4.6602209944751385, + "grad_norm": 0.27053284645080566, + "learning_rate": 5.7904680426249415e-05, + "loss": 1.7399, + "step": 15183 + }, + { + "epoch": 4.660527931246163, + "grad_norm": 0.2636512219905853, + "learning_rate": 
5.789977233645722e-05, + "loss": 1.7615, + "step": 15184 + }, + { + "epoch": 4.660834868017188, + "grad_norm": 0.2263207584619522, + "learning_rate": 5.789486416859484e-05, + "loss": 1.7668, + "step": 15185 + }, + { + "epoch": 4.661141804788214, + "grad_norm": 0.25387826561927795, + "learning_rate": 5.78899559227108e-05, + "loss": 1.7594, + "step": 15186 + }, + { + "epoch": 4.661448741559239, + "grad_norm": 0.2268977165222168, + "learning_rate": 5.7885047598853596e-05, + "loss": 1.75, + "step": 15187 + }, + { + "epoch": 4.661755678330264, + "grad_norm": 0.29093095660209656, + "learning_rate": 5.788013919707172e-05, + "loss": 1.7291, + "step": 15188 + }, + { + "epoch": 4.66206261510129, + "grad_norm": 0.26578736305236816, + "learning_rate": 5.7875230717413684e-05, + "loss": 1.7276, + "step": 15189 + }, + { + "epoch": 4.662369551872314, + "grad_norm": 0.2548983097076416, + "learning_rate": 5.7870322159928e-05, + "loss": 1.755, + "step": 15190 + }, + { + "epoch": 4.662676488643339, + "grad_norm": 0.2246701419353485, + "learning_rate": 5.7865413524663184e-05, + "loss": 1.751, + "step": 15191 + }, + { + "epoch": 4.662983425414365, + "grad_norm": 0.3069002032279968, + "learning_rate": 5.7860504811667747e-05, + "loss": 1.7522, + "step": 15192 + }, + { + "epoch": 4.66329036218539, + "grad_norm": 0.3081241250038147, + "learning_rate": 5.7855596020990186e-05, + "loss": 1.7152, + "step": 15193 + }, + { + "epoch": 4.6635972989564145, + "grad_norm": 0.29006731510162354, + "learning_rate": 5.7850687152679026e-05, + "loss": 1.8471, + "step": 15194 + }, + { + "epoch": 4.66390423572744, + "grad_norm": 0.24131664633750916, + "learning_rate": 5.7845778206782786e-05, + "loss": 1.763, + "step": 15195 + }, + { + "epoch": 4.664211172498465, + "grad_norm": 0.21808001399040222, + "learning_rate": 5.784086918334994e-05, + "loss": 1.6989, + "step": 15196 + }, + { + "epoch": 4.6645181092694905, + "grad_norm": 0.2413240373134613, + "learning_rate": 5.783596008242904e-05, + "loss": 1.7869, + 
"step": 15197 + }, + { + "epoch": 4.664825046040516, + "grad_norm": 0.23310934007167816, + "learning_rate": 5.7831050904068594e-05, + "loss": 1.8017, + "step": 15198 + }, + { + "epoch": 4.665131982811541, + "grad_norm": 0.2577926814556122, + "learning_rate": 5.7826141648317125e-05, + "loss": 1.6938, + "step": 15199 + }, + { + "epoch": 4.665438919582566, + "grad_norm": 0.22523443400859833, + "learning_rate": 5.782123231522312e-05, + "loss": 1.8104, + "step": 15200 + }, + { + "epoch": 4.665745856353591, + "grad_norm": 0.23603026568889618, + "learning_rate": 5.781632290483512e-05, + "loss": 1.7484, + "step": 15201 + }, + { + "epoch": 4.666052793124616, + "grad_norm": 0.23195989429950714, + "learning_rate": 5.781141341720162e-05, + "loss": 1.7786, + "step": 15202 + }, + { + "epoch": 4.666359729895642, + "grad_norm": 0.21838274598121643, + "learning_rate": 5.780650385237118e-05, + "loss": 1.7509, + "step": 15203 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.26656514406204224, + "learning_rate": 5.780159421039229e-05, + "loss": 1.7875, + "step": 15204 + }, + { + "epoch": 4.666973603437691, + "grad_norm": 0.2293243706226349, + "learning_rate": 5.7796684491313456e-05, + "loss": 1.7518, + "step": 15205 + }, + { + "epoch": 4.667280540208717, + "grad_norm": 0.24190817773342133, + "learning_rate": 5.779177469518323e-05, + "loss": 1.7593, + "step": 15206 + }, + { + "epoch": 4.667587476979742, + "grad_norm": 0.31113871932029724, + "learning_rate": 5.77868648220501e-05, + "loss": 1.7911, + "step": 15207 + }, + { + "epoch": 4.667894413750767, + "grad_norm": 0.2875262498855591, + "learning_rate": 5.778195487196263e-05, + "loss": 1.7871, + "step": 15208 + }, + { + "epoch": 4.668201350521793, + "grad_norm": 0.2172149419784546, + "learning_rate": 5.777704484496931e-05, + "loss": 1.7592, + "step": 15209 + }, + { + "epoch": 4.668508287292818, + "grad_norm": 0.3282458186149597, + "learning_rate": 5.7772134741118675e-05, + "loss": 1.7687, + "step": 15210 + }, + { + "epoch": 
4.6688152240638425, + "grad_norm": 0.36963000893592834, + "learning_rate": 5.7767224560459255e-05, + "loss": 1.812, + "step": 15211 + }, + { + "epoch": 4.669122160834868, + "grad_norm": 0.22387740015983582, + "learning_rate": 5.776231430303957e-05, + "loss": 1.7449, + "step": 15212 + }, + { + "epoch": 4.669429097605893, + "grad_norm": 0.21468734741210938, + "learning_rate": 5.775740396890813e-05, + "loss": 1.716, + "step": 15213 + }, + { + "epoch": 4.6697360343769185, + "grad_norm": 0.2478475719690323, + "learning_rate": 5.7752493558113486e-05, + "loss": 1.7182, + "step": 15214 + }, + { + "epoch": 4.670042971147944, + "grad_norm": 0.20924845337867737, + "learning_rate": 5.774758307070416e-05, + "loss": 1.784, + "step": 15215 + }, + { + "epoch": 4.670349907918968, + "grad_norm": 0.2933209538459778, + "learning_rate": 5.774267250672868e-05, + "loss": 1.8375, + "step": 15216 + }, + { + "epoch": 4.670656844689994, + "grad_norm": 0.2744538486003876, + "learning_rate": 5.7737761866235565e-05, + "loss": 1.7019, + "step": 15217 + }, + { + "epoch": 4.670963781461019, + "grad_norm": 0.20991720259189606, + "learning_rate": 5.773285114927336e-05, + "loss": 1.7189, + "step": 15218 + }, + { + "epoch": 4.671270718232044, + "grad_norm": 0.2873254716396332, + "learning_rate": 5.772794035589057e-05, + "loss": 1.7492, + "step": 15219 + }, + { + "epoch": 4.67157765500307, + "grad_norm": 0.2781519591808319, + "learning_rate": 5.772302948613576e-05, + "loss": 1.7342, + "step": 15220 + }, + { + "epoch": 4.671884591774095, + "grad_norm": 0.23288768529891968, + "learning_rate": 5.7718118540057455e-05, + "loss": 1.7245, + "step": 15221 + }, + { + "epoch": 4.672191528545119, + "grad_norm": 0.40817564725875854, + "learning_rate": 5.771320751770417e-05, + "loss": 1.7659, + "step": 15222 + }, + { + "epoch": 4.672498465316145, + "grad_norm": 0.45521771907806396, + "learning_rate": 5.770829641912444e-05, + "loss": 1.7875, + "step": 15223 + }, + { + "epoch": 4.67280540208717, + "grad_norm": 
0.22353248298168182, + "learning_rate": 5.77033852443668e-05, + "loss": 1.7098, + "step": 15224 + }, + { + "epoch": 4.673112338858195, + "grad_norm": 0.4066791534423828, + "learning_rate": 5.769847399347981e-05, + "loss": 1.7277, + "step": 15225 + }, + { + "epoch": 4.67341927562922, + "grad_norm": 0.4299545884132385, + "learning_rate": 5.769356266651198e-05, + "loss": 1.7777, + "step": 15226 + }, + { + "epoch": 4.673726212400245, + "grad_norm": 0.21037638187408447, + "learning_rate": 5.768865126351186e-05, + "loss": 1.7263, + "step": 15227 + }, + { + "epoch": 4.6740331491712706, + "grad_norm": 0.3390437066555023, + "learning_rate": 5.768373978452798e-05, + "loss": 1.7457, + "step": 15228 + }, + { + "epoch": 4.674340085942296, + "grad_norm": 0.40003323554992676, + "learning_rate": 5.767882822960887e-05, + "loss": 1.8137, + "step": 15229 + }, + { + "epoch": 4.674647022713321, + "grad_norm": 0.2212848961353302, + "learning_rate": 5.767391659880308e-05, + "loss": 1.7131, + "step": 15230 + }, + { + "epoch": 4.6749539594843466, + "grad_norm": 0.30634984374046326, + "learning_rate": 5.766900489215915e-05, + "loss": 1.7775, + "step": 15231 + }, + { + "epoch": 4.675260896255372, + "grad_norm": 0.31412798166275024, + "learning_rate": 5.766409310972563e-05, + "loss": 1.7383, + "step": 15232 + }, + { + "epoch": 4.675567833026396, + "grad_norm": 0.21125225722789764, + "learning_rate": 5.7659181251551045e-05, + "loss": 1.8046, + "step": 15233 + }, + { + "epoch": 4.675874769797422, + "grad_norm": 0.3234494924545288, + "learning_rate": 5.765426931768394e-05, + "loss": 1.7838, + "step": 15234 + }, + { + "epoch": 4.676181706568447, + "grad_norm": 0.2668779194355011, + "learning_rate": 5.764935730817286e-05, + "loss": 1.7464, + "step": 15235 + }, + { + "epoch": 4.676488643339472, + "grad_norm": 0.22423583269119263, + "learning_rate": 5.764444522306633e-05, + "loss": 1.7165, + "step": 15236 + }, + { + "epoch": 4.676795580110497, + "grad_norm": 0.29066675901412964, + "learning_rate": 
5.7639533062412945e-05, + "loss": 1.75, + "step": 15237 + }, + { + "epoch": 4.677102516881522, + "grad_norm": 0.2963598370552063, + "learning_rate": 5.76346208262612e-05, + "loss": 1.8168, + "step": 15238 + }, + { + "epoch": 4.6774094536525475, + "grad_norm": 0.21484358608722687, + "learning_rate": 5.7629708514659655e-05, + "loss": 1.71, + "step": 15239 + }, + { + "epoch": 4.677716390423573, + "grad_norm": 0.20657925307750702, + "learning_rate": 5.762479612765686e-05, + "loss": 1.7239, + "step": 15240 + }, + { + "epoch": 4.678023327194598, + "grad_norm": 0.21336235105991364, + "learning_rate": 5.761988366530136e-05, + "loss": 1.7952, + "step": 15241 + }, + { + "epoch": 4.6783302639656235, + "grad_norm": 0.24156586825847626, + "learning_rate": 5.7614971127641696e-05, + "loss": 1.7709, + "step": 15242 + }, + { + "epoch": 4.678637200736648, + "grad_norm": 0.2633824944496155, + "learning_rate": 5.761005851472643e-05, + "loss": 1.7404, + "step": 15243 + }, + { + "epoch": 4.678944137507673, + "grad_norm": 0.23302829265594482, + "learning_rate": 5.760514582660411e-05, + "loss": 1.7006, + "step": 15244 + }, + { + "epoch": 4.679251074278699, + "grad_norm": 0.22404874861240387, + "learning_rate": 5.7600233063323283e-05, + "loss": 1.7731, + "step": 15245 + }, + { + "epoch": 4.679558011049724, + "grad_norm": 0.23217839002609253, + "learning_rate": 5.7595320224932495e-05, + "loss": 1.7452, + "step": 15246 + }, + { + "epoch": 4.679864947820749, + "grad_norm": 0.23131491243839264, + "learning_rate": 5.7590407311480296e-05, + "loss": 1.7547, + "step": 15247 + }, + { + "epoch": 4.680171884591774, + "grad_norm": 0.21907350420951843, + "learning_rate": 5.7585494323015245e-05, + "loss": 1.7556, + "step": 15248 + }, + { + "epoch": 4.680478821362799, + "grad_norm": 0.22416768968105316, + "learning_rate": 5.7580581259585895e-05, + "loss": 1.7783, + "step": 15249 + }, + { + "epoch": 4.680785758133824, + "grad_norm": 0.20203055441379547, + "learning_rate": 5.75756681212408e-05, + "loss": 
1.7285, + "step": 15250 + }, + { + "epoch": 4.68109269490485, + "grad_norm": 0.27838602662086487, + "learning_rate": 5.75707549080285e-05, + "loss": 1.7489, + "step": 15251 + }, + { + "epoch": 4.681399631675875, + "grad_norm": 0.2415023297071457, + "learning_rate": 5.7565841619997586e-05, + "loss": 1.7453, + "step": 15252 + }, + { + "epoch": 4.6817065684469, + "grad_norm": 0.22986920177936554, + "learning_rate": 5.756092825719658e-05, + "loss": 1.7315, + "step": 15253 + }, + { + "epoch": 4.682013505217925, + "grad_norm": 0.2427850216627121, + "learning_rate": 5.755601481967404e-05, + "loss": 1.772, + "step": 15254 + }, + { + "epoch": 4.68232044198895, + "grad_norm": 0.24556589126586914, + "learning_rate": 5.755110130747854e-05, + "loss": 1.7475, + "step": 15255 + }, + { + "epoch": 4.6826273787599755, + "grad_norm": 0.25252529978752136, + "learning_rate": 5.754618772065864e-05, + "loss": 1.7152, + "step": 15256 + }, + { + "epoch": 4.682934315531001, + "grad_norm": 0.24599005281925201, + "learning_rate": 5.754127405926287e-05, + "loss": 1.7911, + "step": 15257 + }, + { + "epoch": 4.683241252302026, + "grad_norm": 0.18961480259895325, + "learning_rate": 5.7536360323339836e-05, + "loss": 1.681, + "step": 15258 + }, + { + "epoch": 4.683548189073051, + "grad_norm": 0.24372327327728271, + "learning_rate": 5.7531446512938035e-05, + "loss": 1.7771, + "step": 15259 + }, + { + "epoch": 4.683855125844076, + "grad_norm": 0.23239269852638245, + "learning_rate": 5.752653262810609e-05, + "loss": 1.7502, + "step": 15260 + }, + { + "epoch": 4.684162062615101, + "grad_norm": 0.25076135993003845, + "learning_rate": 5.752161866889254e-05, + "loss": 1.7974, + "step": 15261 + }, + { + "epoch": 4.684468999386127, + "grad_norm": 0.2703748941421509, + "learning_rate": 5.7516704635345945e-05, + "loss": 1.7245, + "step": 15262 + }, + { + "epoch": 4.684775936157152, + "grad_norm": 0.19247616827487946, + "learning_rate": 5.751179052751487e-05, + "loss": 1.7105, + "step": 15263 + }, + { + 
"epoch": 4.685082872928177, + "grad_norm": 0.23166817426681519, + "learning_rate": 5.750687634544787e-05, + "loss": 1.8026, + "step": 15264 + }, + { + "epoch": 4.685389809699202, + "grad_norm": 0.22434166073799133, + "learning_rate": 5.7501962089193507e-05, + "loss": 1.7779, + "step": 15265 + }, + { + "epoch": 4.685696746470227, + "grad_norm": 0.190699502825737, + "learning_rate": 5.749704775880037e-05, + "loss": 1.726, + "step": 15266 + }, + { + "epoch": 4.686003683241252, + "grad_norm": 0.22995290160179138, + "learning_rate": 5.749213335431702e-05, + "loss": 1.7495, + "step": 15267 + }, + { + "epoch": 4.686310620012278, + "grad_norm": 0.2712057828903198, + "learning_rate": 5.7487218875792016e-05, + "loss": 1.7862, + "step": 15268 + }, + { + "epoch": 4.686617556783302, + "grad_norm": 0.2524562180042267, + "learning_rate": 5.7482304323273913e-05, + "loss": 1.7092, + "step": 15269 + }, + { + "epoch": 4.6869244935543275, + "grad_norm": 0.23810559511184692, + "learning_rate": 5.747738969681131e-05, + "loss": 1.8049, + "step": 15270 + }, + { + "epoch": 4.687231430325353, + "grad_norm": 0.25521910190582275, + "learning_rate": 5.747247499645275e-05, + "loss": 1.8124, + "step": 15271 + }, + { + "epoch": 4.687538367096378, + "grad_norm": 0.27797845005989075, + "learning_rate": 5.746756022224682e-05, + "loss": 1.7694, + "step": 15272 + }, + { + "epoch": 4.6878453038674035, + "grad_norm": 0.23849260807037354, + "learning_rate": 5.746264537424208e-05, + "loss": 1.7771, + "step": 15273 + }, + { + "epoch": 4.688152240638429, + "grad_norm": 0.24368882179260254, + "learning_rate": 5.74577304524871e-05, + "loss": 1.8143, + "step": 15274 + }, + { + "epoch": 4.688459177409453, + "grad_norm": 0.2712198793888092, + "learning_rate": 5.745281545703045e-05, + "loss": 1.7683, + "step": 15275 + }, + { + "epoch": 4.688766114180479, + "grad_norm": 0.30913081765174866, + "learning_rate": 5.7447900387920716e-05, + "loss": 1.7111, + "step": 15276 + }, + { + "epoch": 4.689073050951504, + 
"grad_norm": 0.22123363614082336, + "learning_rate": 5.744298524520646e-05, + "loss": 1.7466, + "step": 15277 + }, + { + "epoch": 4.689379987722529, + "grad_norm": 0.32836318016052246, + "learning_rate": 5.743807002893628e-05, + "loss": 1.8083, + "step": 15278 + }, + { + "epoch": 4.689686924493555, + "grad_norm": 0.33319979906082153, + "learning_rate": 5.743315473915871e-05, + "loss": 1.7122, + "step": 15279 + }, + { + "epoch": 4.689993861264579, + "grad_norm": 0.252163290977478, + "learning_rate": 5.742823937592236e-05, + "loss": 1.7599, + "step": 15280 + }, + { + "epoch": 4.690300798035604, + "grad_norm": 0.23248571157455444, + "learning_rate": 5.7423323939275797e-05, + "loss": 1.7791, + "step": 15281 + }, + { + "epoch": 4.69060773480663, + "grad_norm": 0.27024057507514954, + "learning_rate": 5.741840842926759e-05, + "loss": 1.7608, + "step": 15282 + }, + { + "epoch": 4.690914671577655, + "grad_norm": 0.21888256072998047, + "learning_rate": 5.7413492845946326e-05, + "loss": 1.7407, + "step": 15283 + }, + { + "epoch": 4.69122160834868, + "grad_norm": 0.2574782073497772, + "learning_rate": 5.740857718936058e-05, + "loss": 1.707, + "step": 15284 + }, + { + "epoch": 4.691528545119706, + "grad_norm": 0.2541569769382477, + "learning_rate": 5.740366145955893e-05, + "loss": 1.7301, + "step": 15285 + }, + { + "epoch": 4.69183548189073, + "grad_norm": 0.23484647274017334, + "learning_rate": 5.7398745656589955e-05, + "loss": 1.772, + "step": 15286 + }, + { + "epoch": 4.6921424186617555, + "grad_norm": 0.2827093005180359, + "learning_rate": 5.739382978050225e-05, + "loss": 1.7745, + "step": 15287 + }, + { + "epoch": 4.692449355432781, + "grad_norm": 0.300387978553772, + "learning_rate": 5.738891383134437e-05, + "loss": 1.7966, + "step": 15288 + }, + { + "epoch": 4.692756292203806, + "grad_norm": 0.2414523959159851, + "learning_rate": 5.7383997809164926e-05, + "loss": 1.7355, + "step": 15289 + }, + { + "epoch": 4.6930632289748315, + "grad_norm": 0.21221841871738434, + 
"learning_rate": 5.737908171401248e-05, + "loss": 1.7935, + "step": 15290 + }, + { + "epoch": 4.693370165745856, + "grad_norm": 0.23488084971904755, + "learning_rate": 5.737416554593563e-05, + "loss": 1.7447, + "step": 15291 + }, + { + "epoch": 4.693677102516881, + "grad_norm": 0.26176631450653076, + "learning_rate": 5.7369249304982954e-05, + "loss": 1.769, + "step": 15292 + }, + { + "epoch": 4.693984039287907, + "grad_norm": 0.23060615360736847, + "learning_rate": 5.736433299120303e-05, + "loss": 1.7344, + "step": 15293 + }, + { + "epoch": 4.694290976058932, + "grad_norm": 0.2536846399307251, + "learning_rate": 5.7359416604644456e-05, + "loss": 1.7862, + "step": 15294 + }, + { + "epoch": 4.694597912829957, + "grad_norm": 0.23221342265605927, + "learning_rate": 5.735450014535581e-05, + "loss": 1.743, + "step": 15295 + }, + { + "epoch": 4.694904849600983, + "grad_norm": 0.25320062041282654, + "learning_rate": 5.734958361338568e-05, + "loss": 1.8001, + "step": 15296 + }, + { + "epoch": 4.695211786372007, + "grad_norm": 0.23132461309432983, + "learning_rate": 5.734466700878267e-05, + "loss": 1.7676, + "step": 15297 + }, + { + "epoch": 4.695518723143032, + "grad_norm": 0.2222728580236435, + "learning_rate": 5.7339750331595346e-05, + "loss": 1.7267, + "step": 15298 + }, + { + "epoch": 4.695825659914058, + "grad_norm": 0.2505118250846863, + "learning_rate": 5.733483358187231e-05, + "loss": 1.7467, + "step": 15299 + }, + { + "epoch": 4.696132596685083, + "grad_norm": 0.23609887063503265, + "learning_rate": 5.732991675966214e-05, + "loss": 1.7319, + "step": 15300 + }, + { + "epoch": 4.696439533456108, + "grad_norm": 0.2939738631248474, + "learning_rate": 5.732499986501345e-05, + "loss": 1.8676, + "step": 15301 + }, + { + "epoch": 4.696746470227133, + "grad_norm": 0.29868564009666443, + "learning_rate": 5.7320082897974814e-05, + "loss": 1.7541, + "step": 15302 + }, + { + "epoch": 4.697053406998158, + "grad_norm": 0.2366383820772171, + "learning_rate": 5.731516585859482e-05, 
+ "loss": 1.7531, + "step": 15303 + }, + { + "epoch": 4.6973603437691835, + "grad_norm": 0.2721317410469055, + "learning_rate": 5.731024874692208e-05, + "loss": 1.7444, + "step": 15304 + }, + { + "epoch": 4.697667280540209, + "grad_norm": 0.24925900995731354, + "learning_rate": 5.730533156300517e-05, + "loss": 1.7716, + "step": 15305 + }, + { + "epoch": 4.697974217311234, + "grad_norm": 0.23012754321098328, + "learning_rate": 5.7300414306892704e-05, + "loss": 1.7211, + "step": 15306 + }, + { + "epoch": 4.6982811540822595, + "grad_norm": 0.21274085342884064, + "learning_rate": 5.7295496978633254e-05, + "loss": 1.7853, + "step": 15307 + }, + { + "epoch": 4.698588090853284, + "grad_norm": 0.21799001097679138, + "learning_rate": 5.729057957827544e-05, + "loss": 1.7505, + "step": 15308 + }, + { + "epoch": 4.698895027624309, + "grad_norm": 0.22365793585777283, + "learning_rate": 5.728566210586783e-05, + "loss": 1.7934, + "step": 15309 + }, + { + "epoch": 4.699201964395335, + "grad_norm": 0.23325085639953613, + "learning_rate": 5.728074456145903e-05, + "loss": 1.7354, + "step": 15310 + }, + { + "epoch": 4.69950890116636, + "grad_norm": 0.2175164669752121, + "learning_rate": 5.7275826945097654e-05, + "loss": 1.7541, + "step": 15311 + }, + { + "epoch": 4.699815837937384, + "grad_norm": 0.24657388031482697, + "learning_rate": 5.727090925683231e-05, + "loss": 1.814, + "step": 15312 + }, + { + "epoch": 4.70012277470841, + "grad_norm": 0.2437550574541092, + "learning_rate": 5.726599149671156e-05, + "loss": 1.7234, + "step": 15313 + }, + { + "epoch": 4.700429711479435, + "grad_norm": 0.21053487062454224, + "learning_rate": 5.726107366478402e-05, + "loss": 1.7788, + "step": 15314 + }, + { + "epoch": 4.7007366482504604, + "grad_norm": 0.2007097452878952, + "learning_rate": 5.725615576109831e-05, + "loss": 1.7453, + "step": 15315 + }, + { + "epoch": 4.701043585021486, + "grad_norm": 0.19331564009189606, + "learning_rate": 5.725123778570299e-05, + "loss": 1.7142, + "step": 15316 + 
}, + { + "epoch": 4.701350521792511, + "grad_norm": 0.24291567504405975, + "learning_rate": 5.7246319738646706e-05, + "loss": 1.8081, + "step": 15317 + }, + { + "epoch": 4.701657458563536, + "grad_norm": 0.21423695981502533, + "learning_rate": 5.724140161997804e-05, + "loss": 1.7021, + "step": 15318 + }, + { + "epoch": 4.701964395334561, + "grad_norm": 0.20857618749141693, + "learning_rate": 5.72364834297456e-05, + "loss": 1.7447, + "step": 15319 + }, + { + "epoch": 4.702271332105586, + "grad_norm": 0.2547401487827301, + "learning_rate": 5.7231565167998e-05, + "loss": 1.7505, + "step": 15320 + }, + { + "epoch": 4.702578268876612, + "grad_norm": 0.2729472219944, + "learning_rate": 5.7226646834783825e-05, + "loss": 1.7974, + "step": 15321 + }, + { + "epoch": 4.702885205647637, + "grad_norm": 0.23258371651172638, + "learning_rate": 5.722172843015169e-05, + "loss": 1.7562, + "step": 15322 + }, + { + "epoch": 4.703192142418661, + "grad_norm": 0.23399893939495087, + "learning_rate": 5.72168099541502e-05, + "loss": 1.7674, + "step": 15323 + }, + { + "epoch": 4.703499079189687, + "grad_norm": 0.2678206264972687, + "learning_rate": 5.721189140682797e-05, + "loss": 1.7331, + "step": 15324 + }, + { + "epoch": 4.703806015960712, + "grad_norm": 0.19472146034240723, + "learning_rate": 5.7206972788233593e-05, + "loss": 1.7003, + "step": 15325 + }, + { + "epoch": 4.704112952731737, + "grad_norm": 0.2199394404888153, + "learning_rate": 5.72020540984157e-05, + "loss": 1.7072, + "step": 15326 + }, + { + "epoch": 4.704419889502763, + "grad_norm": 0.219175323843956, + "learning_rate": 5.719713533742287e-05, + "loss": 1.7591, + "step": 15327 + }, + { + "epoch": 4.704726826273788, + "grad_norm": 0.21127547323703766, + "learning_rate": 5.719221650530374e-05, + "loss": 1.8059, + "step": 15328 + }, + { + "epoch": 4.7050337630448125, + "grad_norm": 0.22189834713935852, + "learning_rate": 5.7187297602106905e-05, + "loss": 1.7529, + "step": 15329 + }, + { + "epoch": 4.705340699815838, + 
"grad_norm": 0.19945195317268372, + "learning_rate": 5.7182378627881e-05, + "loss": 1.7133, + "step": 15330 + }, + { + "epoch": 4.705647636586863, + "grad_norm": 0.2177499681711197, + "learning_rate": 5.7177459582674595e-05, + "loss": 1.7451, + "step": 15331 + }, + { + "epoch": 4.7059545733578885, + "grad_norm": 0.19489440321922302, + "learning_rate": 5.717254046653635e-05, + "loss": 1.7499, + "step": 15332 + }, + { + "epoch": 4.706261510128914, + "grad_norm": 0.21366968750953674, + "learning_rate": 5.716762127951485e-05, + "loss": 1.7683, + "step": 15333 + }, + { + "epoch": 4.706568446899938, + "grad_norm": 0.2894177734851837, + "learning_rate": 5.71627020216587e-05, + "loss": 1.8235, + "step": 15334 + }, + { + "epoch": 4.706875383670964, + "grad_norm": 0.22175677120685577, + "learning_rate": 5.7157782693016534e-05, + "loss": 1.7421, + "step": 15335 + }, + { + "epoch": 4.707182320441989, + "grad_norm": 0.23653541505336761, + "learning_rate": 5.715286329363698e-05, + "loss": 1.6937, + "step": 15336 + }, + { + "epoch": 4.707489257213014, + "grad_norm": 0.3015746772289276, + "learning_rate": 5.714794382356863e-05, + "loss": 1.7159, + "step": 15337 + }, + { + "epoch": 4.70779619398404, + "grad_norm": 0.24045881628990173, + "learning_rate": 5.714302428286011e-05, + "loss": 1.7263, + "step": 15338 + }, + { + "epoch": 4.708103130755065, + "grad_norm": 0.19836920499801636, + "learning_rate": 5.7138104671560035e-05, + "loss": 1.7604, + "step": 15339 + }, + { + "epoch": 4.708410067526089, + "grad_norm": 0.2430238276720047, + "learning_rate": 5.7133184989717036e-05, + "loss": 1.7147, + "step": 15340 + }, + { + "epoch": 4.708717004297115, + "grad_norm": 0.19388417899608612, + "learning_rate": 5.712826523737971e-05, + "loss": 1.7153, + "step": 15341 + }, + { + "epoch": 4.70902394106814, + "grad_norm": 0.19648151099681854, + "learning_rate": 5.7123345414596694e-05, + "loss": 1.7373, + "step": 15342 + }, + { + "epoch": 4.709330877839165, + "grad_norm": 0.20326325297355652, + 
"learning_rate": 5.711842552141661e-05, + "loss": 1.7012, + "step": 15343 + }, + { + "epoch": 4.70963781461019, + "grad_norm": 0.20798304677009583, + "learning_rate": 5.711350555788806e-05, + "loss": 1.7134, + "step": 15344 + }, + { + "epoch": 4.709944751381215, + "grad_norm": 0.29318806529045105, + "learning_rate": 5.7108585524059674e-05, + "loss": 1.7661, + "step": 15345 + }, + { + "epoch": 4.7102516881522405, + "grad_norm": 0.273318350315094, + "learning_rate": 5.710366541998009e-05, + "loss": 1.7329, + "step": 15346 + }, + { + "epoch": 4.710558624923266, + "grad_norm": 0.2306031584739685, + "learning_rate": 5.7098745245697925e-05, + "loss": 1.8152, + "step": 15347 + }, + { + "epoch": 4.710865561694291, + "grad_norm": 0.27630630135536194, + "learning_rate": 5.709382500126179e-05, + "loss": 1.7955, + "step": 15348 + }, + { + "epoch": 4.7111724984653165, + "grad_norm": 0.2366025298833847, + "learning_rate": 5.7088904686720326e-05, + "loss": 1.7943, + "step": 15349 + }, + { + "epoch": 4.711479435236341, + "grad_norm": 0.24196656048297882, + "learning_rate": 5.708398430212215e-05, + "loss": 1.698, + "step": 15350 + }, + { + "epoch": 4.711786372007366, + "grad_norm": 0.2770058512687683, + "learning_rate": 5.707906384751588e-05, + "loss": 1.7618, + "step": 15351 + }, + { + "epoch": 4.712093308778392, + "grad_norm": 0.20432323217391968, + "learning_rate": 5.7074143322950157e-05, + "loss": 1.7422, + "step": 15352 + }, + { + "epoch": 4.712400245549417, + "grad_norm": 0.25543150305747986, + "learning_rate": 5.70692227284736e-05, + "loss": 1.7744, + "step": 15353 + }, + { + "epoch": 4.712707182320442, + "grad_norm": 0.24315913021564484, + "learning_rate": 5.7064302064134855e-05, + "loss": 1.7127, + "step": 15354 + }, + { + "epoch": 4.713014119091467, + "grad_norm": 0.23636099696159363, + "learning_rate": 5.705938132998252e-05, + "loss": 1.7725, + "step": 15355 + }, + { + "epoch": 4.713321055862492, + "grad_norm": 0.26809820532798767, + "learning_rate": 
5.705446052606526e-05, + "loss": 1.8338, + "step": 15356 + }, + { + "epoch": 4.713627992633517, + "grad_norm": 0.24969002604484558, + "learning_rate": 5.704953965243167e-05, + "loss": 1.8225, + "step": 15357 + }, + { + "epoch": 4.713934929404543, + "grad_norm": 0.23189692199230194, + "learning_rate": 5.70446187091304e-05, + "loss": 1.7901, + "step": 15358 + }, + { + "epoch": 4.714241866175568, + "grad_norm": 0.22373750805854797, + "learning_rate": 5.703969769621008e-05, + "loss": 1.6919, + "step": 15359 + }, + { + "epoch": 4.714548802946593, + "grad_norm": 0.23963531851768494, + "learning_rate": 5.703477661371934e-05, + "loss": 1.7806, + "step": 15360 + }, + { + "epoch": 4.714855739717618, + "grad_norm": 0.20365150272846222, + "learning_rate": 5.702985546170683e-05, + "loss": 1.7207, + "step": 15361 + }, + { + "epoch": 4.715162676488643, + "grad_norm": 0.245658278465271, + "learning_rate": 5.702493424022114e-05, + "loss": 1.7589, + "step": 15362 + }, + { + "epoch": 4.7154696132596685, + "grad_norm": 0.22633756697177887, + "learning_rate": 5.702001294931094e-05, + "loss": 1.7893, + "step": 15363 + }, + { + "epoch": 4.715776550030694, + "grad_norm": 0.21587726473808289, + "learning_rate": 5.701509158902487e-05, + "loss": 1.8095, + "step": 15364 + }, + { + "epoch": 4.716083486801719, + "grad_norm": 0.22553963959217072, + "learning_rate": 5.701017015941155e-05, + "loss": 1.7419, + "step": 15365 + }, + { + "epoch": 4.716390423572744, + "grad_norm": 0.2276087999343872, + "learning_rate": 5.700524866051962e-05, + "loss": 1.7052, + "step": 15366 + }, + { + "epoch": 4.716697360343769, + "grad_norm": 0.22236761450767517, + "learning_rate": 5.700032709239771e-05, + "loss": 1.8612, + "step": 15367 + }, + { + "epoch": 4.717004297114794, + "grad_norm": 0.22816185653209686, + "learning_rate": 5.6995405455094465e-05, + "loss": 1.78, + "step": 15368 + }, + { + "epoch": 4.71731123388582, + "grad_norm": 0.21597479283809662, + "learning_rate": 5.6990483748658516e-05, + "loss": 1.8276, 
+ "step": 15369 + }, + { + "epoch": 4.717618170656845, + "grad_norm": 0.22209586203098297, + "learning_rate": 5.6985561973138533e-05, + "loss": 1.74, + "step": 15370 + }, + { + "epoch": 4.71792510742787, + "grad_norm": 0.24249997735023499, + "learning_rate": 5.6980640128583116e-05, + "loss": 1.8035, + "step": 15371 + }, + { + "epoch": 4.718232044198895, + "grad_norm": 0.23326106369495392, + "learning_rate": 5.6975718215040943e-05, + "loss": 1.7969, + "step": 15372 + }, + { + "epoch": 4.71853898096992, + "grad_norm": 0.215044766664505, + "learning_rate": 5.6970796232560596e-05, + "loss": 1.7345, + "step": 15373 + }, + { + "epoch": 4.718845917740945, + "grad_norm": 0.20231883227825165, + "learning_rate": 5.696587418119078e-05, + "loss": 1.7231, + "step": 15374 + }, + { + "epoch": 4.719152854511971, + "grad_norm": 0.2136038839817047, + "learning_rate": 5.696095206098011e-05, + "loss": 1.7421, + "step": 15375 + }, + { + "epoch": 4.719459791282996, + "grad_norm": 0.2662335932254791, + "learning_rate": 5.6956029871977235e-05, + "loss": 1.7518, + "step": 15376 + }, + { + "epoch": 4.7197667280540205, + "grad_norm": 0.25649648904800415, + "learning_rate": 5.6951107614230783e-05, + "loss": 1.8314, + "step": 15377 + }, + { + "epoch": 4.720073664825046, + "grad_norm": 0.21995560824871063, + "learning_rate": 5.6946185287789425e-05, + "loss": 1.7511, + "step": 15378 + }, + { + "epoch": 4.720380601596071, + "grad_norm": 0.3388935923576355, + "learning_rate": 5.694126289270177e-05, + "loss": 1.7975, + "step": 15379 + }, + { + "epoch": 4.7206875383670965, + "grad_norm": 0.32886409759521484, + "learning_rate": 5.693634042901651e-05, + "loss": 1.7153, + "step": 15380 + }, + { + "epoch": 4.720994475138122, + "grad_norm": 0.21727977693080902, + "learning_rate": 5.693141789678226e-05, + "loss": 1.7095, + "step": 15381 + }, + { + "epoch": 4.721301411909147, + "grad_norm": 0.2680833041667938, + "learning_rate": 5.6926495296047675e-05, + "loss": 1.696, + "step": 15382 + }, + { + "epoch": 
4.721608348680172, + "grad_norm": 0.2645499110221863, + "learning_rate": 5.692157262686141e-05, + "loss": 1.6889, + "step": 15383 + }, + { + "epoch": 4.721915285451197, + "grad_norm": 0.20362348854541779, + "learning_rate": 5.69166498892721e-05, + "loss": 1.7303, + "step": 15384 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.24259062111377716, + "learning_rate": 5.691172708332839e-05, + "loss": 1.7684, + "step": 15385 + }, + { + "epoch": 4.722529158993248, + "grad_norm": 0.24204276502132416, + "learning_rate": 5.690680420907897e-05, + "loss": 1.7728, + "step": 15386 + }, + { + "epoch": 4.722836095764272, + "grad_norm": 0.3038320243358612, + "learning_rate": 5.690188126657244e-05, + "loss": 1.7573, + "step": 15387 + }, + { + "epoch": 4.723143032535297, + "grad_norm": 0.24619868397712708, + "learning_rate": 5.689695825585749e-05, + "loss": 1.754, + "step": 15388 + }, + { + "epoch": 4.723449969306323, + "grad_norm": 0.19441325962543488, + "learning_rate": 5.689203517698276e-05, + "loss": 1.726, + "step": 15389 + }, + { + "epoch": 4.723756906077348, + "grad_norm": 0.2874276340007782, + "learning_rate": 5.688711202999688e-05, + "loss": 1.7704, + "step": 15390 + }, + { + "epoch": 4.724063842848373, + "grad_norm": 0.24488390982151031, + "learning_rate": 5.6882188814948535e-05, + "loss": 1.7477, + "step": 15391 + }, + { + "epoch": 4.724370779619399, + "grad_norm": 0.22674018144607544, + "learning_rate": 5.687726553188636e-05, + "loss": 1.7287, + "step": 15392 + }, + { + "epoch": 4.724677716390423, + "grad_norm": 0.2653258442878723, + "learning_rate": 5.687234218085902e-05, + "loss": 1.7415, + "step": 15393 + }, + { + "epoch": 4.7249846531614486, + "grad_norm": 0.20345374941825867, + "learning_rate": 5.686741876191516e-05, + "loss": 1.764, + "step": 15394 + }, + { + "epoch": 4.725291589932474, + "grad_norm": 0.23193977773189545, + "learning_rate": 5.686249527510345e-05, + "loss": 1.7557, + "step": 15395 + }, + { + "epoch": 4.725598526703499, + "grad_norm": 
0.26426708698272705, + "learning_rate": 5.685757172047253e-05, + "loss": 1.7708, + "step": 15396 + }, + { + "epoch": 4.725905463474525, + "grad_norm": 0.21377156674861908, + "learning_rate": 5.685264809807107e-05, + "loss": 1.6921, + "step": 15397 + }, + { + "epoch": 4.726212400245549, + "grad_norm": 0.21628457307815552, + "learning_rate": 5.684772440794773e-05, + "loss": 1.72, + "step": 15398 + }, + { + "epoch": 4.726519337016574, + "grad_norm": 0.19200581312179565, + "learning_rate": 5.684280065015116e-05, + "loss": 1.7311, + "step": 15399 + }, + { + "epoch": 4.7268262737876, + "grad_norm": 0.22227540612220764, + "learning_rate": 5.683787682473003e-05, + "loss": 1.7451, + "step": 15400 + }, + { + "epoch": 4.727133210558625, + "grad_norm": 0.18053604662418365, + "learning_rate": 5.683295293173299e-05, + "loss": 1.6816, + "step": 15401 + }, + { + "epoch": 4.72744014732965, + "grad_norm": 0.19827169179916382, + "learning_rate": 5.682802897120869e-05, + "loss": 1.7315, + "step": 15402 + }, + { + "epoch": 4.727747084100676, + "grad_norm": 0.2768021821975708, + "learning_rate": 5.682310494320582e-05, + "loss": 1.7714, + "step": 15403 + }, + { + "epoch": 4.7280540208717, + "grad_norm": 0.2613474428653717, + "learning_rate": 5.6818180847773027e-05, + "loss": 1.7332, + "step": 15404 + }, + { + "epoch": 4.7283609576427255, + "grad_norm": 0.21546787023544312, + "learning_rate": 5.681325668495898e-05, + "loss": 1.771, + "step": 15405 + }, + { + "epoch": 4.728667894413751, + "grad_norm": 0.24442137777805328, + "learning_rate": 5.680833245481234e-05, + "loss": 1.7296, + "step": 15406 + }, + { + "epoch": 4.728974831184776, + "grad_norm": 0.2622109055519104, + "learning_rate": 5.680340815738175e-05, + "loss": 1.7778, + "step": 15407 + }, + { + "epoch": 4.7292817679558015, + "grad_norm": 0.22379513084888458, + "learning_rate": 5.6798483792715904e-05, + "loss": 1.7953, + "step": 15408 + }, + { + "epoch": 4.729588704726826, + "grad_norm": 0.21901065111160278, + "learning_rate": 
5.679355936086346e-05, + "loss": 1.7287, + "step": 15409 + }, + { + "epoch": 4.729895641497851, + "grad_norm": 0.3023792505264282, + "learning_rate": 5.6788634861873066e-05, + "loss": 1.7851, + "step": 15410 + }, + { + "epoch": 4.730202578268877, + "grad_norm": 0.23882482945919037, + "learning_rate": 5.678371029579342e-05, + "loss": 1.7621, + "step": 15411 + }, + { + "epoch": 4.730509515039902, + "grad_norm": 0.2661043703556061, + "learning_rate": 5.6778785662673175e-05, + "loss": 1.7453, + "step": 15412 + }, + { + "epoch": 4.730816451810927, + "grad_norm": 0.330208957195282, + "learning_rate": 5.677386096256099e-05, + "loss": 1.761, + "step": 15413 + }, + { + "epoch": 4.731123388581953, + "grad_norm": 0.2686570882797241, + "learning_rate": 5.676893619550552e-05, + "loss": 1.7539, + "step": 15414 + }, + { + "epoch": 4.731430325352977, + "grad_norm": 0.24308046698570251, + "learning_rate": 5.676401136155548e-05, + "loss": 1.7345, + "step": 15415 + }, + { + "epoch": 4.731737262124002, + "grad_norm": 0.4137137830257416, + "learning_rate": 5.67590864607595e-05, + "loss": 1.7688, + "step": 15416 + }, + { + "epoch": 4.732044198895028, + "grad_norm": 0.32161539793014526, + "learning_rate": 5.675416149316628e-05, + "loss": 1.7881, + "step": 15417 + }, + { + "epoch": 4.732351135666053, + "grad_norm": 0.2336999475955963, + "learning_rate": 5.674923645882447e-05, + "loss": 1.755, + "step": 15418 + }, + { + "epoch": 4.7326580724370775, + "grad_norm": 0.32781684398651123, + "learning_rate": 5.6744311357782754e-05, + "loss": 1.8062, + "step": 15419 + }, + { + "epoch": 4.732965009208103, + "grad_norm": 0.2475704401731491, + "learning_rate": 5.6739386190089795e-05, + "loss": 1.725, + "step": 15420 + }, + { + "epoch": 4.733271945979128, + "grad_norm": 0.26295650005340576, + "learning_rate": 5.673446095579427e-05, + "loss": 1.7673, + "step": 15421 + }, + { + "epoch": 4.7335788827501535, + "grad_norm": 0.3454873859882355, + "learning_rate": 5.6729535654944864e-05, + "loss": 1.7523, + 
"step": 15422 + }, + { + "epoch": 4.733885819521179, + "grad_norm": 0.2306666374206543, + "learning_rate": 5.672461028759024e-05, + "loss": 1.7085, + "step": 15423 + }, + { + "epoch": 4.734192756292204, + "grad_norm": 0.30825871229171753, + "learning_rate": 5.671968485377908e-05, + "loss": 1.7642, + "step": 15424 + }, + { + "epoch": 4.734499693063229, + "grad_norm": 0.42611342668533325, + "learning_rate": 5.6714759353560045e-05, + "loss": 1.7832, + "step": 15425 + }, + { + "epoch": 4.734806629834254, + "grad_norm": 0.29502514004707336, + "learning_rate": 5.670983378698182e-05, + "loss": 1.8153, + "step": 15426 + }, + { + "epoch": 4.735113566605279, + "grad_norm": 0.28416305780410767, + "learning_rate": 5.6704908154093096e-05, + "loss": 1.756, + "step": 15427 + }, + { + "epoch": 4.735420503376305, + "grad_norm": 0.43111103773117065, + "learning_rate": 5.6699982454942534e-05, + "loss": 1.7797, + "step": 15428 + }, + { + "epoch": 4.73572744014733, + "grad_norm": 0.27667397260665894, + "learning_rate": 5.669505668957882e-05, + "loss": 1.7316, + "step": 15429 + }, + { + "epoch": 4.736034376918354, + "grad_norm": 0.3045295774936676, + "learning_rate": 5.669013085805063e-05, + "loss": 1.7591, + "step": 15430 + }, + { + "epoch": 4.73634131368938, + "grad_norm": 0.4494635760784149, + "learning_rate": 5.6685204960406635e-05, + "loss": 1.8295, + "step": 15431 + }, + { + "epoch": 4.736648250460405, + "grad_norm": 0.2951449453830719, + "learning_rate": 5.6680278996695544e-05, + "loss": 1.7857, + "step": 15432 + }, + { + "epoch": 4.73695518723143, + "grad_norm": 0.2714167535305023, + "learning_rate": 5.6675352966966014e-05, + "loss": 1.816, + "step": 15433 + }, + { + "epoch": 4.737262124002456, + "grad_norm": 0.32701000571250916, + "learning_rate": 5.667042687126673e-05, + "loss": 1.7637, + "step": 15434 + }, + { + "epoch": 4.737569060773481, + "grad_norm": 0.2466556429862976, + "learning_rate": 5.666550070964638e-05, + "loss": 1.7805, + "step": 15435 + }, + { + "epoch": 
4.7378759975445055, + "grad_norm": 0.3283855617046356, + "learning_rate": 5.666057448215365e-05, + "loss": 1.786, + "step": 15436 + }, + { + "epoch": 4.738182934315531, + "grad_norm": 0.35860660672187805, + "learning_rate": 5.6655648188837205e-05, + "loss": 1.8309, + "step": 15437 + }, + { + "epoch": 4.738489871086556, + "grad_norm": 0.22293898463249207, + "learning_rate": 5.665072182974576e-05, + "loss": 1.7317, + "step": 15438 + }, + { + "epoch": 4.7387968078575815, + "grad_norm": 0.3155089020729065, + "learning_rate": 5.664579540492798e-05, + "loss": 1.7202, + "step": 15439 + }, + { + "epoch": 4.739103744628607, + "grad_norm": 0.28723904490470886, + "learning_rate": 5.6640868914432566e-05, + "loss": 1.7788, + "step": 15440 + }, + { + "epoch": 4.739410681399631, + "grad_norm": 0.2461984008550644, + "learning_rate": 5.6635942358308183e-05, + "loss": 1.8504, + "step": 15441 + }, + { + "epoch": 4.739717618170657, + "grad_norm": 0.2503122091293335, + "learning_rate": 5.663101573660351e-05, + "loss": 1.7375, + "step": 15442 + }, + { + "epoch": 4.740024554941682, + "grad_norm": 0.24925372004508972, + "learning_rate": 5.662608904936727e-05, + "loss": 1.7152, + "step": 15443 + }, + { + "epoch": 4.740331491712707, + "grad_norm": 0.2734573483467102, + "learning_rate": 5.662116229664813e-05, + "loss": 1.7476, + "step": 15444 + }, + { + "epoch": 4.740638428483733, + "grad_norm": 0.38122060894966125, + "learning_rate": 5.661623547849479e-05, + "loss": 1.7682, + "step": 15445 + }, + { + "epoch": 4.740945365254758, + "grad_norm": 0.3786417245864868, + "learning_rate": 5.661130859495593e-05, + "loss": 1.7446, + "step": 15446 + }, + { + "epoch": 4.741252302025782, + "grad_norm": 0.22618255019187927, + "learning_rate": 5.6606381646080244e-05, + "loss": 1.7427, + "step": 15447 + }, + { + "epoch": 4.741559238796808, + "grad_norm": 0.3000899851322174, + "learning_rate": 5.6601454631916405e-05, + "loss": 1.7087, + "step": 15448 + }, + { + "epoch": 4.741866175567833, + "grad_norm": 
0.36542513966560364, + "learning_rate": 5.659652755251315e-05, + "loss": 1.7985, + "step": 15449 + }, + { + "epoch": 4.742173112338858, + "grad_norm": 0.23550496995449066, + "learning_rate": 5.659160040791912e-05, + "loss": 1.8163, + "step": 15450 + }, + { + "epoch": 4.742480049109884, + "grad_norm": 0.25615251064300537, + "learning_rate": 5.658667319818305e-05, + "loss": 1.7372, + "step": 15451 + }, + { + "epoch": 4.742786985880908, + "grad_norm": 0.28744083642959595, + "learning_rate": 5.6581745923353615e-05, + "loss": 1.7193, + "step": 15452 + }, + { + "epoch": 4.7430939226519335, + "grad_norm": 0.2500229775905609, + "learning_rate": 5.65768185834795e-05, + "loss": 1.7263, + "step": 15453 + }, + { + "epoch": 4.743400859422959, + "grad_norm": 0.21520425379276276, + "learning_rate": 5.6571891178609394e-05, + "loss": 1.7337, + "step": 15454 + }, + { + "epoch": 4.743707796193984, + "grad_norm": 0.212506502866745, + "learning_rate": 5.656696370879202e-05, + "loss": 1.7672, + "step": 15455 + }, + { + "epoch": 4.7440147329650095, + "grad_norm": 0.21143417060375214, + "learning_rate": 5.656203617407607e-05, + "loss": 1.7189, + "step": 15456 + }, + { + "epoch": 4.744321669736035, + "grad_norm": 0.18320922553539276, + "learning_rate": 5.6557108574510243e-05, + "loss": 1.7521, + "step": 15457 + }, + { + "epoch": 4.744628606507059, + "grad_norm": 0.19202999770641327, + "learning_rate": 5.655218091014321e-05, + "loss": 1.6756, + "step": 15458 + }, + { + "epoch": 4.744935543278085, + "grad_norm": 0.2152331918478012, + "learning_rate": 5.654725318102367e-05, + "loss": 1.7653, + "step": 15459 + }, + { + "epoch": 4.74524248004911, + "grad_norm": 0.24565903842449188, + "learning_rate": 5.6542325387200354e-05, + "loss": 1.7654, + "step": 15460 + }, + { + "epoch": 4.745549416820135, + "grad_norm": 0.2504819333553314, + "learning_rate": 5.653739752872195e-05, + "loss": 1.7073, + "step": 15461 + }, + { + "epoch": 4.74585635359116, + "grad_norm": 0.19258706271648407, + 
"learning_rate": 5.653246960563714e-05, + "loss": 1.7106, + "step": 15462 + }, + { + "epoch": 4.746163290362185, + "grad_norm": 0.22961968183517456, + "learning_rate": 5.652754161799465e-05, + "loss": 1.7868, + "step": 15463 + }, + { + "epoch": 4.74647022713321, + "grad_norm": 0.2763231098651886, + "learning_rate": 5.652261356584315e-05, + "loss": 1.7714, + "step": 15464 + }, + { + "epoch": 4.746777163904236, + "grad_norm": 0.23866096138954163, + "learning_rate": 5.651768544923136e-05, + "loss": 1.7537, + "step": 15465 + }, + { + "epoch": 4.747084100675261, + "grad_norm": 0.21851976215839386, + "learning_rate": 5.6512757268207997e-05, + "loss": 1.8109, + "step": 15466 + }, + { + "epoch": 4.747391037446286, + "grad_norm": 0.22249393165111542, + "learning_rate": 5.6507829022821745e-05, + "loss": 1.7357, + "step": 15467 + }, + { + "epoch": 4.747697974217311, + "grad_norm": 0.20202289521694183, + "learning_rate": 5.650290071312131e-05, + "loss": 1.7867, + "step": 15468 + }, + { + "epoch": 4.748004910988336, + "grad_norm": 0.20618727803230286, + "learning_rate": 5.649797233915539e-05, + "loss": 1.6904, + "step": 15469 + }, + { + "epoch": 4.7483118477593615, + "grad_norm": 0.25609052181243896, + "learning_rate": 5.649304390097272e-05, + "loss": 1.7287, + "step": 15470 + }, + { + "epoch": 4.748618784530387, + "grad_norm": 0.22966544330120087, + "learning_rate": 5.648811539862195e-05, + "loss": 1.7384, + "step": 15471 + }, + { + "epoch": 4.748925721301412, + "grad_norm": 0.24070143699645996, + "learning_rate": 5.6483186832151856e-05, + "loss": 1.7625, + "step": 15472 + }, + { + "epoch": 4.749232658072437, + "grad_norm": 0.22642426192760468, + "learning_rate": 5.647825820161109e-05, + "loss": 1.7291, + "step": 15473 + }, + { + "epoch": 4.749539594843462, + "grad_norm": 0.23255646228790283, + "learning_rate": 5.64733295070484e-05, + "loss": 1.8076, + "step": 15474 + }, + { + "epoch": 4.749846531614487, + "grad_norm": 0.20902042090892792, + "learning_rate": 
5.646840074851246e-05, + "loss": 1.6627, + "step": 15475 + }, + { + "epoch": 4.750153468385513, + "grad_norm": 0.21608836948871613, + "learning_rate": 5.646347192605198e-05, + "loss": 1.7458, + "step": 15476 + }, + { + "epoch": 4.750460405156538, + "grad_norm": 0.22368495166301727, + "learning_rate": 5.6458543039715694e-05, + "loss": 1.7601, + "step": 15477 + }, + { + "epoch": 4.750767341927563, + "grad_norm": 0.30586308240890503, + "learning_rate": 5.645361408955231e-05, + "loss": 1.8389, + "step": 15478 + }, + { + "epoch": 4.751074278698588, + "grad_norm": 0.25122150778770447, + "learning_rate": 5.644868507561052e-05, + "loss": 1.7509, + "step": 15479 + }, + { + "epoch": 4.751381215469613, + "grad_norm": 0.28435763716697693, + "learning_rate": 5.644375599793904e-05, + "loss": 1.7723, + "step": 15480 + }, + { + "epoch": 4.7516881522406385, + "grad_norm": 0.3111409842967987, + "learning_rate": 5.643882685658659e-05, + "loss": 1.7973, + "step": 15481 + }, + { + "epoch": 4.751995089011664, + "grad_norm": 0.3108380138874054, + "learning_rate": 5.6433897651601874e-05, + "loss": 1.8126, + "step": 15482 + }, + { + "epoch": 4.752302025782689, + "grad_norm": 0.25894731283187866, + "learning_rate": 5.642896838303362e-05, + "loss": 1.7849, + "step": 15483 + }, + { + "epoch": 4.752608962553714, + "grad_norm": 0.39321839809417725, + "learning_rate": 5.642403905093052e-05, + "loss": 1.7583, + "step": 15484 + }, + { + "epoch": 4.752915899324739, + "grad_norm": 0.3206121027469635, + "learning_rate": 5.6419109655341315e-05, + "loss": 1.8061, + "step": 15485 + }, + { + "epoch": 4.753222836095764, + "grad_norm": 0.2817624807357788, + "learning_rate": 5.64141801963147e-05, + "loss": 1.8252, + "step": 15486 + }, + { + "epoch": 4.75352977286679, + "grad_norm": 0.3344736397266388, + "learning_rate": 5.6409250673899405e-05, + "loss": 1.6975, + "step": 15487 + }, + { + "epoch": 4.753836709637815, + "grad_norm": 0.21873882412910461, + "learning_rate": 5.640432108814413e-05, + "loss": 
1.7126, + "step": 15488 + }, + { + "epoch": 4.75414364640884, + "grad_norm": 0.3317199945449829, + "learning_rate": 5.639939143909758e-05, + "loss": 1.7826, + "step": 15489 + }, + { + "epoch": 4.754450583179865, + "grad_norm": 0.34901630878448486, + "learning_rate": 5.639446172680854e-05, + "loss": 1.7411, + "step": 15490 + }, + { + "epoch": 4.75475751995089, + "grad_norm": 0.24015867710113525, + "learning_rate": 5.6389531951325645e-05, + "loss": 1.7514, + "step": 15491 + }, + { + "epoch": 4.755064456721915, + "grad_norm": 0.28364554047584534, + "learning_rate": 5.6384602112697674e-05, + "loss": 1.7569, + "step": 15492 + }, + { + "epoch": 4.755371393492941, + "grad_norm": 0.3561246693134308, + "learning_rate": 5.637967221097329e-05, + "loss": 1.7212, + "step": 15493 + }, + { + "epoch": 4.755678330263965, + "grad_norm": 0.3383684456348419, + "learning_rate": 5.637474224620126e-05, + "loss": 1.6866, + "step": 15494 + }, + { + "epoch": 4.7559852670349905, + "grad_norm": 0.2399235963821411, + "learning_rate": 5.63698122184303e-05, + "loss": 1.7609, + "step": 15495 + }, + { + "epoch": 4.756292203806016, + "grad_norm": 0.38559645414352417, + "learning_rate": 5.636488212770912e-05, + "loss": 1.7509, + "step": 15496 + }, + { + "epoch": 4.756599140577041, + "grad_norm": 0.365005224943161, + "learning_rate": 5.635995197408645e-05, + "loss": 1.7894, + "step": 15497 + }, + { + "epoch": 4.7569060773480665, + "grad_norm": 0.21254757046699524, + "learning_rate": 5.635502175761099e-05, + "loss": 1.6969, + "step": 15498 + }, + { + "epoch": 4.757213014119092, + "grad_norm": 0.42865821719169617, + "learning_rate": 5.635009147833149e-05, + "loss": 1.7989, + "step": 15499 + }, + { + "epoch": 4.757519950890116, + "grad_norm": 0.35717228055000305, + "learning_rate": 5.634516113629665e-05, + "loss": 1.7338, + "step": 15500 + }, + { + "epoch": 4.757826887661142, + "grad_norm": 0.21582463383674622, + "learning_rate": 5.634023073155523e-05, + "loss": 1.7429, + "step": 15501 + }, + { + 
"epoch": 4.758133824432167, + "grad_norm": 0.3376842141151428, + "learning_rate": 5.633530026415592e-05, + "loss": 1.7703, + "step": 15502 + }, + { + "epoch": 4.758440761203192, + "grad_norm": 0.2760981023311615, + "learning_rate": 5.633036973414747e-05, + "loss": 1.7389, + "step": 15503 + }, + { + "epoch": 4.758747697974218, + "grad_norm": 0.3808997571468353, + "learning_rate": 5.63254391415786e-05, + "loss": 1.7513, + "step": 15504 + }, + { + "epoch": 4.759054634745242, + "grad_norm": 0.5152496695518494, + "learning_rate": 5.6320508486498014e-05, + "loss": 1.7376, + "step": 15505 + }, + { + "epoch": 4.759361571516267, + "grad_norm": 0.33983346819877625, + "learning_rate": 5.6315577768954464e-05, + "loss": 1.7209, + "step": 15506 + }, + { + "epoch": 4.759668508287293, + "grad_norm": 0.27064043283462524, + "learning_rate": 5.631064698899669e-05, + "loss": 1.7808, + "step": 15507 + }, + { + "epoch": 4.759975445058318, + "grad_norm": 0.3659237027168274, + "learning_rate": 5.630571614667339e-05, + "loss": 1.7706, + "step": 15508 + }, + { + "epoch": 4.760282381829343, + "grad_norm": 0.246379554271698, + "learning_rate": 5.63007852420333e-05, + "loss": 1.7425, + "step": 15509 + }, + { + "epoch": 4.760589318600369, + "grad_norm": 0.2683795392513275, + "learning_rate": 5.629585427512518e-05, + "loss": 1.7332, + "step": 15510 + }, + { + "epoch": 4.760896255371393, + "grad_norm": 0.32626205682754517, + "learning_rate": 5.6290923245997704e-05, + "loss": 1.786, + "step": 15511 + }, + { + "epoch": 4.7612031921424185, + "grad_norm": 0.23723098635673523, + "learning_rate": 5.6285992154699666e-05, + "loss": 1.7305, + "step": 15512 + }, + { + "epoch": 4.761510128913444, + "grad_norm": 0.26316091418266296, + "learning_rate": 5.628106100127976e-05, + "loss": 1.7804, + "step": 15513 + }, + { + "epoch": 4.761817065684469, + "grad_norm": 0.24376356601715088, + "learning_rate": 5.6276129785786726e-05, + "loss": 1.738, + "step": 15514 + }, + { + "epoch": 4.7621240024554945, + 
"grad_norm": 0.27778422832489014, + "learning_rate": 5.627119850826931e-05, + "loss": 1.7444, + "step": 15515 + }, + { + "epoch": 4.762430939226519, + "grad_norm": 0.3134306073188782, + "learning_rate": 5.6266267168776224e-05, + "loss": 1.7696, + "step": 15516 + }, + { + "epoch": 4.762737875997544, + "grad_norm": 0.2354283481836319, + "learning_rate": 5.6261335767356195e-05, + "loss": 1.799, + "step": 15517 + }, + { + "epoch": 4.76304481276857, + "grad_norm": 0.26902756094932556, + "learning_rate": 5.6256404304058e-05, + "loss": 1.7091, + "step": 15518 + }, + { + "epoch": 4.763351749539595, + "grad_norm": 0.2760716676712036, + "learning_rate": 5.6251472778930345e-05, + "loss": 1.742, + "step": 15519 + }, + { + "epoch": 4.76365868631062, + "grad_norm": 0.2138829231262207, + "learning_rate": 5.624654119202197e-05, + "loss": 1.7093, + "step": 15520 + }, + { + "epoch": 4.763965623081646, + "grad_norm": 0.31404614448547363, + "learning_rate": 5.624160954338162e-05, + "loss": 1.7467, + "step": 15521 + }, + { + "epoch": 4.76427255985267, + "grad_norm": 0.24810083210468292, + "learning_rate": 5.623667783305803e-05, + "loss": 1.745, + "step": 15522 + }, + { + "epoch": 4.764579496623695, + "grad_norm": 0.23674242198467255, + "learning_rate": 5.6231746061099913e-05, + "loss": 1.7662, + "step": 15523 + }, + { + "epoch": 4.764886433394721, + "grad_norm": 0.264230877161026, + "learning_rate": 5.622681422755606e-05, + "loss": 1.7627, + "step": 15524 + }, + { + "epoch": 4.765193370165746, + "grad_norm": 0.2982041537761688, + "learning_rate": 5.6221882332475165e-05, + "loss": 1.7558, + "step": 15525 + }, + { + "epoch": 4.765500306936771, + "grad_norm": 0.29215967655181885, + "learning_rate": 5.6216950375905975e-05, + "loss": 1.7981, + "step": 15526 + }, + { + "epoch": 4.765807243707796, + "grad_norm": 0.20014487206935883, + "learning_rate": 5.6212018357897244e-05, + "loss": 1.7113, + "step": 15527 + }, + { + "epoch": 4.766114180478821, + "grad_norm": 0.22359825670719147, + 
"learning_rate": 5.620708627849769e-05, + "loss": 1.7356, + "step": 15528 + }, + { + "epoch": 4.7664211172498465, + "grad_norm": 0.2254783809185028, + "learning_rate": 5.620215413775609e-05, + "loss": 1.7397, + "step": 15529 + }, + { + "epoch": 4.766728054020872, + "grad_norm": 0.2827560305595398, + "learning_rate": 5.619722193572117e-05, + "loss": 1.732, + "step": 15530 + }, + { + "epoch": 4.767034990791897, + "grad_norm": 0.22591307759284973, + "learning_rate": 5.619228967244165e-05, + "loss": 1.7713, + "step": 15531 + }, + { + "epoch": 4.7673419275629225, + "grad_norm": 0.25872737169265747, + "learning_rate": 5.618735734796632e-05, + "loss": 1.7291, + "step": 15532 + }, + { + "epoch": 4.767648864333947, + "grad_norm": 0.24515275657176971, + "learning_rate": 5.6182424962343884e-05, + "loss": 1.8079, + "step": 15533 + }, + { + "epoch": 4.767955801104972, + "grad_norm": 0.2456643134355545, + "learning_rate": 5.617749251562309e-05, + "loss": 1.7082, + "step": 15534 + }, + { + "epoch": 4.768262737875998, + "grad_norm": 0.21684220433235168, + "learning_rate": 5.6172560007852716e-05, + "loss": 1.7563, + "step": 15535 + }, + { + "epoch": 4.768569674647023, + "grad_norm": 0.2141445428133011, + "learning_rate": 5.616762743908147e-05, + "loss": 1.7115, + "step": 15536 + }, + { + "epoch": 4.768876611418047, + "grad_norm": 0.22502638399600983, + "learning_rate": 5.616269480935812e-05, + "loss": 1.723, + "step": 15537 + }, + { + "epoch": 4.769183548189073, + "grad_norm": 0.23387989401817322, + "learning_rate": 5.6157762118731416e-05, + "loss": 1.7775, + "step": 15538 + }, + { + "epoch": 4.769490484960098, + "grad_norm": 0.19615057110786438, + "learning_rate": 5.6152829367250096e-05, + "loss": 1.7696, + "step": 15539 + }, + { + "epoch": 4.769797421731123, + "grad_norm": 0.2408154010772705, + "learning_rate": 5.614789655496289e-05, + "loss": 1.7758, + "step": 15540 + }, + { + "epoch": 4.770104358502149, + "grad_norm": 0.20994634926319122, + "learning_rate": 
5.614296368191859e-05, + "loss": 1.6935, + "step": 15541 + }, + { + "epoch": 4.770411295273174, + "grad_norm": 0.24135129153728485, + "learning_rate": 5.613803074816591e-05, + "loss": 1.7644, + "step": 15542 + }, + { + "epoch": 4.7707182320441985, + "grad_norm": 0.2380143105983734, + "learning_rate": 5.6133097753753625e-05, + "loss": 1.741, + "step": 15543 + }, + { + "epoch": 4.771025168815224, + "grad_norm": 0.30300623178482056, + "learning_rate": 5.6128164698730465e-05, + "loss": 1.7935, + "step": 15544 + }, + { + "epoch": 4.771332105586249, + "grad_norm": 0.2620760500431061, + "learning_rate": 5.612323158314519e-05, + "loss": 1.7436, + "step": 15545 + }, + { + "epoch": 4.7716390423572745, + "grad_norm": 0.3791491389274597, + "learning_rate": 5.6118298407046544e-05, + "loss": 1.7503, + "step": 15546 + }, + { + "epoch": 4.7719459791283, + "grad_norm": 0.3830909729003906, + "learning_rate": 5.61133651704833e-05, + "loss": 1.7651, + "step": 15547 + }, + { + "epoch": 4.772252915899324, + "grad_norm": 0.26680612564086914, + "learning_rate": 5.610843187350419e-05, + "loss": 1.8075, + "step": 15548 + }, + { + "epoch": 4.77255985267035, + "grad_norm": 0.38018953800201416, + "learning_rate": 5.610349851615798e-05, + "loss": 1.8301, + "step": 15549 + }, + { + "epoch": 4.772866789441375, + "grad_norm": 0.4514484107494354, + "learning_rate": 5.6098565098493414e-05, + "loss": 1.7709, + "step": 15550 + }, + { + "epoch": 4.7731737262124, + "grad_norm": 0.28267863392829895, + "learning_rate": 5.6093631620559254e-05, + "loss": 1.8087, + "step": 15551 + }, + { + "epoch": 4.773480662983426, + "grad_norm": 0.22541162371635437, + "learning_rate": 5.6088698082404256e-05, + "loss": 1.7457, + "step": 15552 + }, + { + "epoch": 4.773787599754451, + "grad_norm": 0.3012544512748718, + "learning_rate": 5.608376448407718e-05, + "loss": 1.7454, + "step": 15553 + }, + { + "epoch": 4.774094536525475, + "grad_norm": 0.2460169941186905, + "learning_rate": 5.607883082562677e-05, + "loss": 1.8237, + 
"step": 15554 + }, + { + "epoch": 4.774401473296501, + "grad_norm": 0.2918507158756256, + "learning_rate": 5.6073897107101804e-05, + "loss": 1.7416, + "step": 15555 + }, + { + "epoch": 4.774708410067526, + "grad_norm": 0.3104710280895233, + "learning_rate": 5.6068963328551016e-05, + "loss": 1.8162, + "step": 15556 + }, + { + "epoch": 4.7750153468385514, + "grad_norm": 0.2576459050178528, + "learning_rate": 5.606402949002317e-05, + "loss": 1.7732, + "step": 15557 + }, + { + "epoch": 4.775322283609577, + "grad_norm": 0.2373739629983902, + "learning_rate": 5.605909559156706e-05, + "loss": 1.7812, + "step": 15558 + }, + { + "epoch": 4.775629220380601, + "grad_norm": 0.30436694622039795, + "learning_rate": 5.6054161633231385e-05, + "loss": 1.7606, + "step": 15559 + }, + { + "epoch": 4.775936157151627, + "grad_norm": 0.3058558702468872, + "learning_rate": 5.604922761506495e-05, + "loss": 1.8384, + "step": 15560 + }, + { + "epoch": 4.776243093922652, + "grad_norm": 0.26421624422073364, + "learning_rate": 5.6044293537116496e-05, + "loss": 1.8041, + "step": 15561 + }, + { + "epoch": 4.776550030693677, + "grad_norm": 0.4945085346698761, + "learning_rate": 5.603935939943479e-05, + "loss": 1.7522, + "step": 15562 + }, + { + "epoch": 4.776856967464703, + "grad_norm": 0.41049134731292725, + "learning_rate": 5.6034425202068595e-05, + "loss": 1.7471, + "step": 15563 + }, + { + "epoch": 4.777163904235728, + "grad_norm": 0.22972853481769562, + "learning_rate": 5.602949094506668e-05, + "loss": 1.7041, + "step": 15564 + }, + { + "epoch": 4.777470841006752, + "grad_norm": 0.37373700737953186, + "learning_rate": 5.6024556628477785e-05, + "loss": 1.7811, + "step": 15565 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.3603375554084778, + "learning_rate": 5.6019622252350714e-05, + "loss": 1.8396, + "step": 15566 + }, + { + "epoch": 4.778084714548803, + "grad_norm": 0.2085956335067749, + "learning_rate": 5.601468781673419e-05, + "loss": 1.7453, + "step": 15567 + }, + { + "epoch": 
4.778391651319828, + "grad_norm": 0.28871405124664307, + "learning_rate": 5.6009753321677e-05, + "loss": 1.7135, + "step": 15568 + }, + { + "epoch": 4.778698588090853, + "grad_norm": 0.2378411591053009, + "learning_rate": 5.600481876722791e-05, + "loss": 1.77, + "step": 15569 + }, + { + "epoch": 4.779005524861878, + "grad_norm": 0.2902696430683136, + "learning_rate": 5.599988415343567e-05, + "loss": 1.7416, + "step": 15570 + }, + { + "epoch": 4.7793124616329035, + "grad_norm": 0.36155447363853455, + "learning_rate": 5.5994949480349066e-05, + "loss": 1.7095, + "step": 15571 + }, + { + "epoch": 4.779619398403929, + "grad_norm": 0.24867403507232666, + "learning_rate": 5.599001474801686e-05, + "loss": 1.8063, + "step": 15572 + }, + { + "epoch": 4.779926335174954, + "grad_norm": 0.24853186309337616, + "learning_rate": 5.5985079956487815e-05, + "loss": 1.7537, + "step": 15573 + }, + { + "epoch": 4.7802332719459795, + "grad_norm": 0.31984636187553406, + "learning_rate": 5.598014510581071e-05, + "loss": 1.7888, + "step": 15574 + }, + { + "epoch": 4.780540208717004, + "grad_norm": 0.23907123506069183, + "learning_rate": 5.597521019603429e-05, + "loss": 1.7157, + "step": 15575 + }, + { + "epoch": 4.780847145488029, + "grad_norm": 0.25759413838386536, + "learning_rate": 5.597027522720736e-05, + "loss": 1.7579, + "step": 15576 + }, + { + "epoch": 4.781154082259055, + "grad_norm": 0.34123921394348145, + "learning_rate": 5.5965340199378654e-05, + "loss": 1.838, + "step": 15577 + }, + { + "epoch": 4.78146101903008, + "grad_norm": 0.2769980728626251, + "learning_rate": 5.596040511259697e-05, + "loss": 1.7889, + "step": 15578 + }, + { + "epoch": 4.781767955801105, + "grad_norm": 0.21936915814876556, + "learning_rate": 5.5955469966911066e-05, + "loss": 1.7434, + "step": 15579 + }, + { + "epoch": 4.78207489257213, + "grad_norm": 0.27583181858062744, + "learning_rate": 5.59505347623697e-05, + "loss": 1.7229, + "step": 15580 + }, + { + "epoch": 4.782381829343155, + "grad_norm": 
0.24246171116828918, + "learning_rate": 5.594559949902168e-05, + "loss": 1.7368, + "step": 15581 + }, + { + "epoch": 4.78268876611418, + "grad_norm": 0.22705630958080292, + "learning_rate": 5.594066417691576e-05, + "loss": 1.7261, + "step": 15582 + }, + { + "epoch": 4.782995702885206, + "grad_norm": 0.23308728635311127, + "learning_rate": 5.593572879610072e-05, + "loss": 1.7451, + "step": 15583 + }, + { + "epoch": 4.783302639656231, + "grad_norm": 0.21654267609119415, + "learning_rate": 5.5930793356625324e-05, + "loss": 1.7133, + "step": 15584 + }, + { + "epoch": 4.783609576427256, + "grad_norm": 0.22884133458137512, + "learning_rate": 5.5925857858538347e-05, + "loss": 1.6899, + "step": 15585 + }, + { + "epoch": 4.783916513198281, + "grad_norm": 0.2396838665008545, + "learning_rate": 5.5920922301888555e-05, + "loss": 1.7837, + "step": 15586 + }, + { + "epoch": 4.784223449969306, + "grad_norm": 0.22941450774669647, + "learning_rate": 5.5915986686724765e-05, + "loss": 1.7443, + "step": 15587 + }, + { + "epoch": 4.7845303867403315, + "grad_norm": 0.23992502689361572, + "learning_rate": 5.591105101309572e-05, + "loss": 1.8054, + "step": 15588 + }, + { + "epoch": 4.784837323511357, + "grad_norm": 0.2540588974952698, + "learning_rate": 5.59061152810502e-05, + "loss": 1.855, + "step": 15589 + }, + { + "epoch": 4.785144260282382, + "grad_norm": 0.22691720724105835, + "learning_rate": 5.590117949063699e-05, + "loss": 1.7441, + "step": 15590 + }, + { + "epoch": 4.785451197053407, + "grad_norm": 0.23691289126873016, + "learning_rate": 5.5896243641904864e-05, + "loss": 1.8156, + "step": 15591 + }, + { + "epoch": 4.785758133824432, + "grad_norm": 0.2749332785606384, + "learning_rate": 5.589130773490261e-05, + "loss": 1.8157, + "step": 15592 + }, + { + "epoch": 4.786065070595457, + "grad_norm": 0.2435624748468399, + "learning_rate": 5.588637176967899e-05, + "loss": 1.7473, + "step": 15593 + }, + { + "epoch": 4.786372007366483, + "grad_norm": 0.22931383550167084, + 
"learning_rate": 5.5881435746282795e-05, + "loss": 1.7652, + "step": 15594 + }, + { + "epoch": 4.786678944137508, + "grad_norm": 0.23916593194007874, + "learning_rate": 5.587649966476282e-05, + "loss": 1.7415, + "step": 15595 + }, + { + "epoch": 4.786985880908533, + "grad_norm": 0.23483172059059143, + "learning_rate": 5.5871563525167814e-05, + "loss": 1.7308, + "step": 15596 + }, + { + "epoch": 4.787292817679558, + "grad_norm": 0.24850021302700043, + "learning_rate": 5.586662732754656e-05, + "loss": 1.8294, + "step": 15597 + }, + { + "epoch": 4.787599754450583, + "grad_norm": 0.2439260333776474, + "learning_rate": 5.586169107194788e-05, + "loss": 1.7599, + "step": 15598 + }, + { + "epoch": 4.787906691221608, + "grad_norm": 0.22379007935523987, + "learning_rate": 5.585675475842054e-05, + "loss": 1.7278, + "step": 15599 + }, + { + "epoch": 4.788213627992634, + "grad_norm": 0.2633908689022064, + "learning_rate": 5.58518183870133e-05, + "loss": 1.7318, + "step": 15600 + }, + { + "epoch": 4.788520564763659, + "grad_norm": 0.20992474257946014, + "learning_rate": 5.584688195777497e-05, + "loss": 1.7003, + "step": 15601 + }, + { + "epoch": 4.7888275015346835, + "grad_norm": 0.2460084706544876, + "learning_rate": 5.584194547075432e-05, + "loss": 1.78, + "step": 15602 + }, + { + "epoch": 4.789134438305709, + "grad_norm": 0.23955418169498444, + "learning_rate": 5.583700892600013e-05, + "loss": 1.7953, + "step": 15603 + }, + { + "epoch": 4.789441375076734, + "grad_norm": 0.2495713233947754, + "learning_rate": 5.583207232356121e-05, + "loss": 1.7874, + "step": 15604 + }, + { + "epoch": 4.7897483118477595, + "grad_norm": 0.22878028452396393, + "learning_rate": 5.5827135663486344e-05, + "loss": 1.7961, + "step": 15605 + }, + { + "epoch": 4.790055248618785, + "grad_norm": 0.2299363762140274, + "learning_rate": 5.582219894582429e-05, + "loss": 1.7497, + "step": 15606 + }, + { + "epoch": 4.79036218538981, + "grad_norm": 0.22896108031272888, + "learning_rate": 5.5817262170623865e-05, 
+ "loss": 1.7543, + "step": 15607 + }, + { + "epoch": 4.790669122160835, + "grad_norm": 0.2150495946407318, + "learning_rate": 5.581232533793383e-05, + "loss": 1.8034, + "step": 15608 + }, + { + "epoch": 4.79097605893186, + "grad_norm": 0.21317999064922333, + "learning_rate": 5.580738844780301e-05, + "loss": 1.7482, + "step": 15609 + }, + { + "epoch": 4.791282995702885, + "grad_norm": 0.21904391050338745, + "learning_rate": 5.580245150028016e-05, + "loss": 1.7647, + "step": 15610 + }, + { + "epoch": 4.791589932473911, + "grad_norm": 0.2026481032371521, + "learning_rate": 5.5797514495414095e-05, + "loss": 1.6997, + "step": 15611 + }, + { + "epoch": 4.791896869244935, + "grad_norm": 0.22508487105369568, + "learning_rate": 5.579257743325359e-05, + "loss": 1.8258, + "step": 15612 + }, + { + "epoch": 4.79220380601596, + "grad_norm": 0.2801211178302765, + "learning_rate": 5.5787640313847435e-05, + "loss": 1.6991, + "step": 15613 + }, + { + "epoch": 4.792510742786986, + "grad_norm": 0.2696724236011505, + "learning_rate": 5.578270313724442e-05, + "loss": 1.7339, + "step": 15614 + }, + { + "epoch": 4.792817679558011, + "grad_norm": 0.2909143269062042, + "learning_rate": 5.577776590349334e-05, + "loss": 1.8481, + "step": 15615 + }, + { + "epoch": 4.793124616329036, + "grad_norm": 0.21682757139205933, + "learning_rate": 5.5772828612643005e-05, + "loss": 1.759, + "step": 15616 + }, + { + "epoch": 4.793431553100062, + "grad_norm": 0.23074059188365936, + "learning_rate": 5.576789126474219e-05, + "loss": 1.7652, + "step": 15617 + }, + { + "epoch": 4.793738489871086, + "grad_norm": 0.24018999934196472, + "learning_rate": 5.576295385983969e-05, + "loss": 1.7986, + "step": 15618 + }, + { + "epoch": 4.7940454266421115, + "grad_norm": 0.23987948894500732, + "learning_rate": 5.575801639798431e-05, + "loss": 1.779, + "step": 15619 + }, + { + "epoch": 4.794352363413137, + "grad_norm": 0.2138533890247345, + "learning_rate": 5.575307887922482e-05, + "loss": 1.7097, + "step": 15620 + }, + { 
+ "epoch": 4.794659300184162, + "grad_norm": 0.1995106190443039, + "learning_rate": 5.5748141303610044e-05, + "loss": 1.6924, + "step": 15621 + }, + { + "epoch": 4.7949662369551875, + "grad_norm": 0.23547641932964325, + "learning_rate": 5.574320367118877e-05, + "loss": 1.8492, + "step": 15622 + }, + { + "epoch": 4.795273173726212, + "grad_norm": 0.22931239008903503, + "learning_rate": 5.5738265982009794e-05, + "loss": 1.8054, + "step": 15623 + }, + { + "epoch": 4.795580110497237, + "grad_norm": 0.19957222044467926, + "learning_rate": 5.573332823612191e-05, + "loss": 1.7464, + "step": 15624 + }, + { + "epoch": 4.795887047268263, + "grad_norm": 0.1990327090024948, + "learning_rate": 5.5728390433573905e-05, + "loss": 1.7438, + "step": 15625 + }, + { + "epoch": 4.796193984039288, + "grad_norm": 0.22276802361011505, + "learning_rate": 5.572345257441459e-05, + "loss": 1.7674, + "step": 15626 + }, + { + "epoch": 4.796500920810313, + "grad_norm": 0.2109617441892624, + "learning_rate": 5.571851465869277e-05, + "loss": 1.7577, + "step": 15627 + }, + { + "epoch": 4.796807857581339, + "grad_norm": 0.22917217016220093, + "learning_rate": 5.5713576686457234e-05, + "loss": 1.7478, + "step": 15628 + }, + { + "epoch": 4.797114794352363, + "grad_norm": 0.21016938984394073, + "learning_rate": 5.570863865775678e-05, + "loss": 1.8078, + "step": 15629 + }, + { + "epoch": 4.797421731123388, + "grad_norm": 0.22478216886520386, + "learning_rate": 5.5703700572640215e-05, + "loss": 1.7621, + "step": 15630 + }, + { + "epoch": 4.797728667894414, + "grad_norm": 0.26899904012680054, + "learning_rate": 5.569876243115634e-05, + "loss": 1.8065, + "step": 15631 + }, + { + "epoch": 4.798035604665439, + "grad_norm": 0.23187808692455292, + "learning_rate": 5.569382423335394e-05, + "loss": 1.7337, + "step": 15632 + }, + { + "epoch": 4.798342541436464, + "grad_norm": 0.2264855057001114, + "learning_rate": 5.568888597928185e-05, + "loss": 1.7879, + "step": 15633 + }, + { + "epoch": 4.798649478207489, + 
"grad_norm": 0.244137242436409, + "learning_rate": 5.568394766898886e-05, + "loss": 1.8307, + "step": 15634 + }, + { + "epoch": 4.798956414978514, + "grad_norm": 0.2400583177804947, + "learning_rate": 5.5679009302523744e-05, + "loss": 1.76, + "step": 15635 + }, + { + "epoch": 4.7992633517495396, + "grad_norm": 0.2324059158563614, + "learning_rate": 5.5674070879935347e-05, + "loss": 1.7594, + "step": 15636 + }, + { + "epoch": 4.799570288520565, + "grad_norm": 0.21753786504268646, + "learning_rate": 5.566913240127244e-05, + "loss": 1.7568, + "step": 15637 + }, + { + "epoch": 4.79987722529159, + "grad_norm": 0.21557624638080597, + "learning_rate": 5.566419386658386e-05, + "loss": 1.7733, + "step": 15638 + }, + { + "epoch": 4.800184162062616, + "grad_norm": 0.22795113921165466, + "learning_rate": 5.565925527591839e-05, + "loss": 1.7624, + "step": 15639 + }, + { + "epoch": 4.80049109883364, + "grad_norm": 0.23035180568695068, + "learning_rate": 5.565431662932484e-05, + "loss": 1.7436, + "step": 15640 + }, + { + "epoch": 4.800798035604665, + "grad_norm": 0.2569425404071808, + "learning_rate": 5.564937792685203e-05, + "loss": 1.7027, + "step": 15641 + }, + { + "epoch": 4.801104972375691, + "grad_norm": 0.20544980466365814, + "learning_rate": 5.564443916854875e-05, + "loss": 1.7125, + "step": 15642 + }, + { + "epoch": 4.801411909146716, + "grad_norm": 0.25040850043296814, + "learning_rate": 5.5639500354463815e-05, + "loss": 1.7646, + "step": 15643 + }, + { + "epoch": 4.8017188459177405, + "grad_norm": 0.1991344839334488, + "learning_rate": 5.563456148464602e-05, + "loss": 1.7206, + "step": 15644 + }, + { + "epoch": 4.802025782688766, + "grad_norm": 0.236537903547287, + "learning_rate": 5.56296225591442e-05, + "loss": 1.7288, + "step": 15645 + }, + { + "epoch": 4.802332719459791, + "grad_norm": 0.253619521856308, + "learning_rate": 5.562468357800714e-05, + "loss": 1.7347, + "step": 15646 + }, + { + "epoch": 4.8026396562308165, + "grad_norm": 0.22038741409778595, + 
"learning_rate": 5.561974454128367e-05, + "loss": 1.7854, + "step": 15647 + }, + { + "epoch": 4.802946593001842, + "grad_norm": 0.24848157167434692, + "learning_rate": 5.5614805449022576e-05, + "loss": 1.6904, + "step": 15648 + }, + { + "epoch": 4.803253529772867, + "grad_norm": 0.28735271096229553, + "learning_rate": 5.56098663012727e-05, + "loss": 1.7476, + "step": 15649 + }, + { + "epoch": 4.803560466543892, + "grad_norm": 0.2658432722091675, + "learning_rate": 5.5604927098082825e-05, + "loss": 1.7314, + "step": 15650 + }, + { + "epoch": 4.803867403314917, + "grad_norm": 0.20409154891967773, + "learning_rate": 5.559998783950179e-05, + "loss": 1.7698, + "step": 15651 + }, + { + "epoch": 4.804174340085942, + "grad_norm": 0.21932728588581085, + "learning_rate": 5.5595048525578384e-05, + "loss": 1.7808, + "step": 15652 + }, + { + "epoch": 4.804481276856968, + "grad_norm": 0.2549879848957062, + "learning_rate": 5.559010915636143e-05, + "loss": 1.8294, + "step": 15653 + }, + { + "epoch": 4.804788213627993, + "grad_norm": 0.2002289742231369, + "learning_rate": 5.5585169731899736e-05, + "loss": 1.732, + "step": 15654 + }, + { + "epoch": 4.805095150399017, + "grad_norm": 0.19988931715488434, + "learning_rate": 5.558023025224212e-05, + "loss": 1.7482, + "step": 15655 + }, + { + "epoch": 4.805402087170043, + "grad_norm": 0.21265259385108948, + "learning_rate": 5.55752907174374e-05, + "loss": 1.8003, + "step": 15656 + }, + { + "epoch": 4.805709023941068, + "grad_norm": 0.22365640103816986, + "learning_rate": 5.5570351127534395e-05, + "loss": 1.7536, + "step": 15657 + }, + { + "epoch": 4.806015960712093, + "grad_norm": 0.25516408681869507, + "learning_rate": 5.556541148258192e-05, + "loss": 1.7648, + "step": 15658 + }, + { + "epoch": 4.806322897483119, + "grad_norm": 0.24870765209197998, + "learning_rate": 5.5560471782628775e-05, + "loss": 1.7793, + "step": 15659 + }, + { + "epoch": 4.806629834254144, + "grad_norm": 0.22119416296482086, + "learning_rate": 
5.555553202772379e-05, + "loss": 1.7464, + "step": 15660 + }, + { + "epoch": 4.8069367710251685, + "grad_norm": 0.2781904637813568, + "learning_rate": 5.555059221791579e-05, + "loss": 1.7537, + "step": 15661 + }, + { + "epoch": 4.807243707796194, + "grad_norm": 0.2433774471282959, + "learning_rate": 5.5545652353253574e-05, + "loss": 1.74, + "step": 15662 + }, + { + "epoch": 4.807550644567219, + "grad_norm": 0.19932180643081665, + "learning_rate": 5.554071243378598e-05, + "loss": 1.75, + "step": 15663 + }, + { + "epoch": 4.8078575813382445, + "grad_norm": 0.2428865283727646, + "learning_rate": 5.553577245956182e-05, + "loss": 1.7198, + "step": 15664 + }, + { + "epoch": 4.80816451810927, + "grad_norm": 0.2914198338985443, + "learning_rate": 5.553083243062991e-05, + "loss": 1.7544, + "step": 15665 + }, + { + "epoch": 4.808471454880294, + "grad_norm": 0.2274291068315506, + "learning_rate": 5.5525892347039056e-05, + "loss": 1.8213, + "step": 15666 + }, + { + "epoch": 4.80877839165132, + "grad_norm": 0.23662471771240234, + "learning_rate": 5.552095220883811e-05, + "loss": 1.8025, + "step": 15667 + }, + { + "epoch": 4.809085328422345, + "grad_norm": 0.23062555491924286, + "learning_rate": 5.551601201607587e-05, + "loss": 1.7109, + "step": 15668 + }, + { + "epoch": 4.80939226519337, + "grad_norm": 0.19986943900585175, + "learning_rate": 5.551107176880117e-05, + "loss": 1.7442, + "step": 15669 + }, + { + "epoch": 4.809699201964396, + "grad_norm": 0.2545560300350189, + "learning_rate": 5.5506131467062836e-05, + "loss": 1.7609, + "step": 15670 + }, + { + "epoch": 4.810006138735421, + "grad_norm": 0.253296434879303, + "learning_rate": 5.550119111090968e-05, + "loss": 1.7307, + "step": 15671 + }, + { + "epoch": 4.810313075506445, + "grad_norm": 0.19617940485477448, + "learning_rate": 5.549625070039052e-05, + "loss": 1.7507, + "step": 15672 + }, + { + "epoch": 4.810620012277471, + "grad_norm": 0.2525297999382019, + "learning_rate": 5.5491310235554193e-05, + "loss": 1.8021, + 
"step": 15673 + }, + { + "epoch": 4.810926949048496, + "grad_norm": 0.20537389814853668, + "learning_rate": 5.548636971644953e-05, + "loss": 1.7432, + "step": 15674 + }, + { + "epoch": 4.811233885819521, + "grad_norm": 0.19924211502075195, + "learning_rate": 5.548142914312533e-05, + "loss": 1.7741, + "step": 15675 + }, + { + "epoch": 4.811540822590547, + "grad_norm": 0.21121448278427124, + "learning_rate": 5.547648851563046e-05, + "loss": 1.7198, + "step": 15676 + }, + { + "epoch": 4.811847759361571, + "grad_norm": 0.23504914343357086, + "learning_rate": 5.547154783401369e-05, + "loss": 1.7173, + "step": 15677 + }, + { + "epoch": 4.8121546961325965, + "grad_norm": 0.2362392097711563, + "learning_rate": 5.54666070983239e-05, + "loss": 1.7752, + "step": 15678 + }, + { + "epoch": 4.812461632903622, + "grad_norm": 0.2524966895580292, + "learning_rate": 5.5461666308609886e-05, + "loss": 1.7943, + "step": 15679 + }, + { + "epoch": 4.812768569674647, + "grad_norm": 0.2250952422618866, + "learning_rate": 5.5456725464920476e-05, + "loss": 1.7606, + "step": 15680 + }, + { + "epoch": 4.8130755064456725, + "grad_norm": 0.21753156185150146, + "learning_rate": 5.5451784567304524e-05, + "loss": 1.7846, + "step": 15681 + }, + { + "epoch": 4.813382443216698, + "grad_norm": 0.220795676112175, + "learning_rate": 5.5446843615810825e-05, + "loss": 1.7422, + "step": 15682 + }, + { + "epoch": 4.813689379987722, + "grad_norm": 0.23597733676433563, + "learning_rate": 5.544190261048823e-05, + "loss": 1.7818, + "step": 15683 + }, + { + "epoch": 4.813996316758748, + "grad_norm": 0.2625976502895355, + "learning_rate": 5.543696155138557e-05, + "loss": 1.7796, + "step": 15684 + }, + { + "epoch": 4.814303253529773, + "grad_norm": 0.20515871047973633, + "learning_rate": 5.5432020438551656e-05, + "loss": 1.7096, + "step": 15685 + }, + { + "epoch": 4.814610190300798, + "grad_norm": 0.19353924691677094, + "learning_rate": 5.542707927203536e-05, + "loss": 1.7541, + "step": 15686 + }, + { + "epoch": 
4.814917127071823, + "grad_norm": 0.21998172998428345, + "learning_rate": 5.5422138051885454e-05, + "loss": 1.7696, + "step": 15687 + }, + { + "epoch": 4.815224063842848, + "grad_norm": 0.27576857805252075, + "learning_rate": 5.5417196778150816e-05, + "loss": 1.7491, + "step": 15688 + }, + { + "epoch": 4.815531000613873, + "grad_norm": 0.28202036023139954, + "learning_rate": 5.5412255450880254e-05, + "loss": 1.8615, + "step": 15689 + }, + { + "epoch": 4.815837937384899, + "grad_norm": 0.29632845520973206, + "learning_rate": 5.540731407012263e-05, + "loss": 1.7698, + "step": 15690 + }, + { + "epoch": 4.816144874155924, + "grad_norm": 0.35393890738487244, + "learning_rate": 5.540237263592675e-05, + "loss": 1.7924, + "step": 15691 + }, + { + "epoch": 4.816451810926949, + "grad_norm": 0.23756493628025055, + "learning_rate": 5.5397431148341447e-05, + "loss": 1.8301, + "step": 15692 + }, + { + "epoch": 4.816758747697974, + "grad_norm": 0.310153603553772, + "learning_rate": 5.53924896074156e-05, + "loss": 1.8162, + "step": 15693 + }, + { + "epoch": 4.817065684468999, + "grad_norm": 0.3355565369129181, + "learning_rate": 5.538754801319797e-05, + "loss": 1.7738, + "step": 15694 + }, + { + "epoch": 4.8173726212400245, + "grad_norm": 0.2360079288482666, + "learning_rate": 5.5382606365737446e-05, + "loss": 1.6883, + "step": 15695 + }, + { + "epoch": 4.81767955801105, + "grad_norm": 0.2932819724082947, + "learning_rate": 5.537766466508286e-05, + "loss": 1.8045, + "step": 15696 + }, + { + "epoch": 4.817986494782075, + "grad_norm": 0.31298181414604187, + "learning_rate": 5.537272291128304e-05, + "loss": 1.7516, + "step": 15697 + }, + { + "epoch": 4.8182934315531, + "grad_norm": 0.22871924936771393, + "learning_rate": 5.5367781104386806e-05, + "loss": 1.7386, + "step": 15698 + }, + { + "epoch": 4.818600368324125, + "grad_norm": 0.27097782492637634, + "learning_rate": 5.5362839244443034e-05, + "loss": 1.733, + "step": 15699 + }, + { + "epoch": 4.81890730509515, + "grad_norm": 
0.23296736180782318, + "learning_rate": 5.535789733150052e-05, + "loss": 1.7735, + "step": 15700 + }, + { + "epoch": 4.819214241866176, + "grad_norm": 0.22650237381458282, + "learning_rate": 5.5352955365608125e-05, + "loss": 1.7443, + "step": 15701 + }, + { + "epoch": 4.819521178637201, + "grad_norm": 0.25525161623954773, + "learning_rate": 5.534801334681471e-05, + "loss": 1.7379, + "step": 15702 + }, + { + "epoch": 4.819828115408226, + "grad_norm": 0.2249457836151123, + "learning_rate": 5.534307127516908e-05, + "loss": 1.7393, + "step": 15703 + }, + { + "epoch": 4.820135052179251, + "grad_norm": 0.1995566338300705, + "learning_rate": 5.5338129150720084e-05, + "loss": 1.7411, + "step": 15704 + }, + { + "epoch": 4.820441988950276, + "grad_norm": 0.250851035118103, + "learning_rate": 5.533318697351657e-05, + "loss": 1.7801, + "step": 15705 + }, + { + "epoch": 4.820748925721301, + "grad_norm": 0.3175830543041229, + "learning_rate": 5.532824474360737e-05, + "loss": 1.7553, + "step": 15706 + }, + { + "epoch": 4.821055862492327, + "grad_norm": 0.22842039167881012, + "learning_rate": 5.532330246104134e-05, + "loss": 1.7489, + "step": 15707 + }, + { + "epoch": 4.821362799263352, + "grad_norm": 0.21125485002994537, + "learning_rate": 5.531836012586732e-05, + "loss": 1.7543, + "step": 15708 + }, + { + "epoch": 4.8216697360343765, + "grad_norm": 0.33028700947761536, + "learning_rate": 5.531341773813414e-05, + "loss": 1.8237, + "step": 15709 + }, + { + "epoch": 4.821976672805402, + "grad_norm": 0.324564129114151, + "learning_rate": 5.530847529789067e-05, + "loss": 1.7288, + "step": 15710 + }, + { + "epoch": 4.822283609576427, + "grad_norm": 0.3299528956413269, + "learning_rate": 5.530353280518571e-05, + "loss": 1.7536, + "step": 15711 + }, + { + "epoch": 4.8225905463474525, + "grad_norm": 0.3535030782222748, + "learning_rate": 5.5298590260068136e-05, + "loss": 1.7941, + "step": 15712 + }, + { + "epoch": 4.822897483118478, + "grad_norm": 0.2627669870853424, + "learning_rate": 
5.5293647662586804e-05, + "loss": 1.7638, + "step": 15713 + }, + { + "epoch": 4.823204419889503, + "grad_norm": 0.25569450855255127, + "learning_rate": 5.5288705012790535e-05, + "loss": 1.7396, + "step": 15714 + }, + { + "epoch": 4.823511356660528, + "grad_norm": 0.26099520921707153, + "learning_rate": 5.528376231072817e-05, + "loss": 1.7415, + "step": 15715 + }, + { + "epoch": 4.823818293431553, + "grad_norm": 0.31833693385124207, + "learning_rate": 5.527881955644858e-05, + "loss": 1.7683, + "step": 15716 + }, + { + "epoch": 4.824125230202578, + "grad_norm": 0.2753448188304901, + "learning_rate": 5.5273876750000594e-05, + "loss": 1.6653, + "step": 15717 + }, + { + "epoch": 4.824432166973604, + "grad_norm": 0.23816895484924316, + "learning_rate": 5.526893389143307e-05, + "loss": 1.7575, + "step": 15718 + }, + { + "epoch": 4.824739103744628, + "grad_norm": 0.25376051664352417, + "learning_rate": 5.5263990980794856e-05, + "loss": 1.755, + "step": 15719 + }, + { + "epoch": 4.8250460405156534, + "grad_norm": 0.2483726590871811, + "learning_rate": 5.52590480181348e-05, + "loss": 1.7566, + "step": 15720 + }, + { + "epoch": 4.825352977286679, + "grad_norm": 0.2073517143726349, + "learning_rate": 5.5254105003501746e-05, + "loss": 1.7069, + "step": 15721 + }, + { + "epoch": 4.825659914057704, + "grad_norm": 0.3166659474372864, + "learning_rate": 5.524916193694455e-05, + "loss": 1.7012, + "step": 15722 + }, + { + "epoch": 4.8259668508287294, + "grad_norm": 0.24518641829490662, + "learning_rate": 5.524421881851205e-05, + "loss": 1.7027, + "step": 15723 + }, + { + "epoch": 4.826273787599755, + "grad_norm": 0.23137906193733215, + "learning_rate": 5.523927564825311e-05, + "loss": 1.746, + "step": 15724 + }, + { + "epoch": 4.82658072437078, + "grad_norm": 0.27937051653862, + "learning_rate": 5.5234332426216586e-05, + "loss": 1.7064, + "step": 15725 + }, + { + "epoch": 4.826887661141805, + "grad_norm": 0.26408496499061584, + "learning_rate": 5.522938915245131e-05, + "loss": 
1.6598, + "step": 15726 + }, + { + "epoch": 4.82719459791283, + "grad_norm": 0.22269997000694275, + "learning_rate": 5.5224445827006164e-05, + "loss": 1.7166, + "step": 15727 + }, + { + "epoch": 4.827501534683855, + "grad_norm": 0.22687453031539917, + "learning_rate": 5.5219502449929964e-05, + "loss": 1.7156, + "step": 15728 + }, + { + "epoch": 4.827808471454881, + "grad_norm": 0.26355600357055664, + "learning_rate": 5.5214559021271585e-05, + "loss": 1.8016, + "step": 15729 + }, + { + "epoch": 4.828115408225905, + "grad_norm": 0.30103012919425964, + "learning_rate": 5.520961554107987e-05, + "loss": 1.7856, + "step": 15730 + }, + { + "epoch": 4.82842234499693, + "grad_norm": 0.22604018449783325, + "learning_rate": 5.520467200940369e-05, + "loss": 1.813, + "step": 15731 + }, + { + "epoch": 4.828729281767956, + "grad_norm": 0.25435203313827515, + "learning_rate": 5.51997284262919e-05, + "loss": 1.7511, + "step": 15732 + }, + { + "epoch": 4.829036218538981, + "grad_norm": 0.2740691304206848, + "learning_rate": 5.519478479179333e-05, + "loss": 1.7326, + "step": 15733 + }, + { + "epoch": 4.829343155310006, + "grad_norm": 0.19710861146450043, + "learning_rate": 5.5189841105956866e-05, + "loss": 1.7581, + "step": 15734 + }, + { + "epoch": 4.829650092081032, + "grad_norm": 0.2315293401479721, + "learning_rate": 5.518489736883132e-05, + "loss": 1.6796, + "step": 15735 + }, + { + "epoch": 4.829957028852056, + "grad_norm": 0.2465476542711258, + "learning_rate": 5.51799535804656e-05, + "loss": 1.7276, + "step": 15736 + }, + { + "epoch": 4.8302639656230815, + "grad_norm": 0.20438486337661743, + "learning_rate": 5.5175009740908546e-05, + "loss": 1.7188, + "step": 15737 + }, + { + "epoch": 4.830570902394107, + "grad_norm": 0.24328351020812988, + "learning_rate": 5.5170065850209016e-05, + "loss": 1.7165, + "step": 15738 + }, + { + "epoch": 4.830877839165132, + "grad_norm": 0.22486837208271027, + "learning_rate": 5.516512190841586e-05, + "loss": 1.7369, + "step": 15739 + }, + { + 
"epoch": 4.8311847759361575, + "grad_norm": 0.2065822333097458, + "learning_rate": 5.5160177915577934e-05, + "loss": 1.7125, + "step": 15740 + }, + { + "epoch": 4.831491712707182, + "grad_norm": 0.21223095059394836, + "learning_rate": 5.5155233871744104e-05, + "loss": 1.7319, + "step": 15741 + }, + { + "epoch": 4.831798649478207, + "grad_norm": 0.25712934136390686, + "learning_rate": 5.515028977696325e-05, + "loss": 1.7847, + "step": 15742 + }, + { + "epoch": 4.832105586249233, + "grad_norm": 0.21289978921413422, + "learning_rate": 5.5145345631284215e-05, + "loss": 1.7629, + "step": 15743 + }, + { + "epoch": 4.832412523020258, + "grad_norm": 0.22347134351730347, + "learning_rate": 5.514040143475585e-05, + "loss": 1.7491, + "step": 15744 + }, + { + "epoch": 4.832719459791283, + "grad_norm": 0.20660510659217834, + "learning_rate": 5.513545718742702e-05, + "loss": 1.7377, + "step": 15745 + }, + { + "epoch": 4.833026396562309, + "grad_norm": 0.21612273156642914, + "learning_rate": 5.513051288934658e-05, + "loss": 1.7973, + "step": 15746 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.22515933215618134, + "learning_rate": 5.512556854056342e-05, + "loss": 1.7774, + "step": 15747 + }, + { + "epoch": 4.833640270104358, + "grad_norm": 0.21075554192066193, + "learning_rate": 5.512062414112639e-05, + "loss": 1.7741, + "step": 15748 + }, + { + "epoch": 4.833947206875384, + "grad_norm": 0.2203720659017563, + "learning_rate": 5.511567969108436e-05, + "loss": 1.7902, + "step": 15749 + }, + { + "epoch": 4.834254143646409, + "grad_norm": 0.20247167348861694, + "learning_rate": 5.511073519048616e-05, + "loss": 1.7084, + "step": 15750 + }, + { + "epoch": 4.834561080417434, + "grad_norm": 0.247711181640625, + "learning_rate": 5.5105790639380695e-05, + "loss": 1.8465, + "step": 15751 + }, + { + "epoch": 4.834868017188459, + "grad_norm": 0.22866854071617126, + "learning_rate": 5.51008460378168e-05, + "loss": 1.7252, + "step": 15752 + }, + { + "epoch": 4.835174953959484, + 
"grad_norm": 0.2335643470287323, + "learning_rate": 5.5095901385843374e-05, + "loss": 1.703, + "step": 15753 + }, + { + "epoch": 4.8354818907305095, + "grad_norm": 0.20874348282814026, + "learning_rate": 5.509095668350926e-05, + "loss": 1.7114, + "step": 15754 + }, + { + "epoch": 4.835788827501535, + "grad_norm": 0.19156917929649353, + "learning_rate": 5.5086011930863314e-05, + "loss": 1.6975, + "step": 15755 + }, + { + "epoch": 4.83609576427256, + "grad_norm": 0.23480524122714996, + "learning_rate": 5.508106712795443e-05, + "loss": 1.8291, + "step": 15756 + }, + { + "epoch": 4.8364027010435855, + "grad_norm": 0.20430417358875275, + "learning_rate": 5.5076122274831454e-05, + "loss": 1.7605, + "step": 15757 + }, + { + "epoch": 4.83670963781461, + "grad_norm": 0.26790598034858704, + "learning_rate": 5.5071177371543256e-05, + "loss": 1.7541, + "step": 15758 + }, + { + "epoch": 4.837016574585635, + "grad_norm": 0.3339289724826813, + "learning_rate": 5.506623241813873e-05, + "loss": 1.7566, + "step": 15759 + }, + { + "epoch": 4.837323511356661, + "grad_norm": 0.30528193712234497, + "learning_rate": 5.5061287414666726e-05, + "loss": 1.7371, + "step": 15760 + }, + { + "epoch": 4.837630448127686, + "grad_norm": 0.21059657633304596, + "learning_rate": 5.5056342361176114e-05, + "loss": 1.7599, + "step": 15761 + }, + { + "epoch": 4.83793738489871, + "grad_norm": 0.27918973565101624, + "learning_rate": 5.5051397257715756e-05, + "loss": 1.7485, + "step": 15762 + }, + { + "epoch": 4.838244321669736, + "grad_norm": 0.23147793114185333, + "learning_rate": 5.5046452104334514e-05, + "loss": 1.7121, + "step": 15763 + }, + { + "epoch": 4.838551258440761, + "grad_norm": 0.22028742730617523, + "learning_rate": 5.5041506901081294e-05, + "loss": 1.803, + "step": 15764 + }, + { + "epoch": 4.838858195211786, + "grad_norm": 0.22840891778469086, + "learning_rate": 5.5036561648004946e-05, + "loss": 1.7555, + "step": 15765 + }, + { + "epoch": 4.839165131982812, + "grad_norm": 
0.2610893249511719, + "learning_rate": 5.503161634515433e-05, + "loss": 1.7873, + "step": 15766 + }, + { + "epoch": 4.839472068753837, + "grad_norm": 0.2530003786087036, + "learning_rate": 5.502667099257836e-05, + "loss": 1.7604, + "step": 15767 + }, + { + "epoch": 4.8397790055248615, + "grad_norm": 0.20120400190353394, + "learning_rate": 5.5021725590325854e-05, + "loss": 1.7476, + "step": 15768 + }, + { + "epoch": 4.840085942295887, + "grad_norm": 0.2189723700284958, + "learning_rate": 5.501678013844571e-05, + "loss": 1.7174, + "step": 15769 + }, + { + "epoch": 4.840392879066912, + "grad_norm": 0.2511899173259735, + "learning_rate": 5.501183463698683e-05, + "loss": 1.7589, + "step": 15770 + }, + { + "epoch": 4.8406998158379375, + "grad_norm": 0.24899333715438843, + "learning_rate": 5.5006889085998035e-05, + "loss": 1.7253, + "step": 15771 + }, + { + "epoch": 4.841006752608963, + "grad_norm": 0.21223559975624084, + "learning_rate": 5.5001943485528254e-05, + "loss": 1.6949, + "step": 15772 + }, + { + "epoch": 4.841313689379987, + "grad_norm": 0.21394596993923187, + "learning_rate": 5.499699783562632e-05, + "loss": 1.7827, + "step": 15773 + }, + { + "epoch": 4.841620626151013, + "grad_norm": 0.2379613220691681, + "learning_rate": 5.4992052136341134e-05, + "loss": 1.7968, + "step": 15774 + }, + { + "epoch": 4.841927562922038, + "grad_norm": 0.23748385906219482, + "learning_rate": 5.498710638772154e-05, + "loss": 1.797, + "step": 15775 + }, + { + "epoch": 4.842234499693063, + "grad_norm": 0.2502206265926361, + "learning_rate": 5.498216058981646e-05, + "loss": 1.7292, + "step": 15776 + }, + { + "epoch": 4.842541436464089, + "grad_norm": 0.23613516986370087, + "learning_rate": 5.497721474267475e-05, + "loss": 1.7353, + "step": 15777 + }, + { + "epoch": 4.842848373235114, + "grad_norm": 0.25274696946144104, + "learning_rate": 5.497226884634527e-05, + "loss": 1.7782, + "step": 15778 + }, + { + "epoch": 4.843155310006138, + "grad_norm": 0.19574183225631714, + 
"learning_rate": 5.496732290087694e-05, + "loss": 1.6926, + "step": 15779 + }, + { + "epoch": 4.843462246777164, + "grad_norm": 0.21040405333042145, + "learning_rate": 5.496237690631858e-05, + "loss": 1.7235, + "step": 15780 + }, + { + "epoch": 4.843769183548189, + "grad_norm": 0.22499679028987885, + "learning_rate": 5.495743086271913e-05, + "loss": 1.7889, + "step": 15781 + }, + { + "epoch": 4.844076120319214, + "grad_norm": 0.24623246490955353, + "learning_rate": 5.4952484770127433e-05, + "loss": 1.7357, + "step": 15782 + }, + { + "epoch": 4.84438305709024, + "grad_norm": 0.21706275641918182, + "learning_rate": 5.494753862859238e-05, + "loss": 1.7349, + "step": 15783 + }, + { + "epoch": 4.844689993861264, + "grad_norm": 0.20705166459083557, + "learning_rate": 5.4942592438162855e-05, + "loss": 1.7047, + "step": 15784 + }, + { + "epoch": 4.8449969306322895, + "grad_norm": 0.21216751635074615, + "learning_rate": 5.493764619888773e-05, + "loss": 1.7335, + "step": 15785 + }, + { + "epoch": 4.845303867403315, + "grad_norm": 0.2945895195007324, + "learning_rate": 5.493269991081588e-05, + "loss": 1.838, + "step": 15786 + }, + { + "epoch": 4.84561080417434, + "grad_norm": 0.22013652324676514, + "learning_rate": 5.492775357399621e-05, + "loss": 1.7541, + "step": 15787 + }, + { + "epoch": 4.8459177409453655, + "grad_norm": 0.25428512692451477, + "learning_rate": 5.4922807188477585e-05, + "loss": 1.7405, + "step": 15788 + }, + { + "epoch": 4.846224677716391, + "grad_norm": 0.23189012706279755, + "learning_rate": 5.49178607543089e-05, + "loss": 1.8075, + "step": 15789 + }, + { + "epoch": 4.846531614487415, + "grad_norm": 0.21637389063835144, + "learning_rate": 5.491291427153904e-05, + "loss": 1.7229, + "step": 15790 + }, + { + "epoch": 4.846838551258441, + "grad_norm": 0.20628009736537933, + "learning_rate": 5.490796774021687e-05, + "loss": 1.7605, + "step": 15791 + }, + { + "epoch": 4.847145488029466, + "grad_norm": 0.20845308899879456, + "learning_rate": 
5.4903021160391276e-05, + "loss": 1.7864, + "step": 15792 + }, + { + "epoch": 4.847452424800491, + "grad_norm": 0.20367322862148285, + "learning_rate": 5.4898074532111164e-05, + "loss": 1.733, + "step": 15793 + }, + { + "epoch": 4.847759361571516, + "grad_norm": 0.2066505253314972, + "learning_rate": 5.489312785542543e-05, + "loss": 1.7113, + "step": 15794 + }, + { + "epoch": 4.848066298342541, + "grad_norm": 0.23874987661838531, + "learning_rate": 5.488818113038292e-05, + "loss": 1.7735, + "step": 15795 + }, + { + "epoch": 4.848373235113566, + "grad_norm": 0.26583850383758545, + "learning_rate": 5.488323435703254e-05, + "loss": 1.8019, + "step": 15796 + }, + { + "epoch": 4.848680171884592, + "grad_norm": 0.25207552313804626, + "learning_rate": 5.487828753542317e-05, + "loss": 1.7491, + "step": 15797 + }, + { + "epoch": 4.848987108655617, + "grad_norm": 0.23065905272960663, + "learning_rate": 5.48733406656037e-05, + "loss": 1.7451, + "step": 15798 + }, + { + "epoch": 4.849294045426642, + "grad_norm": 0.26914483308792114, + "learning_rate": 5.486839374762304e-05, + "loss": 1.7553, + "step": 15799 + }, + { + "epoch": 4.849600982197668, + "grad_norm": 0.2509605884552002, + "learning_rate": 5.4863446781530046e-05, + "loss": 1.7124, + "step": 15800 + }, + { + "epoch": 4.849907918968692, + "grad_norm": 0.2618432343006134, + "learning_rate": 5.485849976737362e-05, + "loss": 1.7368, + "step": 15801 + }, + { + "epoch": 4.850214855739718, + "grad_norm": 0.46875160932540894, + "learning_rate": 5.485355270520266e-05, + "loss": 1.7883, + "step": 15802 + }, + { + "epoch": 4.850521792510743, + "grad_norm": 0.37585484981536865, + "learning_rate": 5.4848605595066025e-05, + "loss": 1.7894, + "step": 15803 + }, + { + "epoch": 4.850828729281768, + "grad_norm": 0.2244408279657364, + "learning_rate": 5.4843658437012646e-05, + "loss": 1.7394, + "step": 15804 + }, + { + "epoch": 4.851135666052793, + "grad_norm": 0.4061773419380188, + "learning_rate": 5.48387112310914e-05, + "loss": 
1.7703, + "step": 15805 + }, + { + "epoch": 4.851442602823818, + "grad_norm": 0.35925009846687317, + "learning_rate": 5.483376397735117e-05, + "loss": 1.7798, + "step": 15806 + }, + { + "epoch": 4.851749539594843, + "grad_norm": 0.23050184547901154, + "learning_rate": 5.482881667584084e-05, + "loss": 1.7984, + "step": 15807 + }, + { + "epoch": 4.852056476365869, + "grad_norm": 0.37308645248413086, + "learning_rate": 5.4823869326609335e-05, + "loss": 1.6747, + "step": 15808 + }, + { + "epoch": 4.852363413136894, + "grad_norm": 0.29826754331588745, + "learning_rate": 5.481892192970551e-05, + "loss": 1.7432, + "step": 15809 + }, + { + "epoch": 4.852670349907919, + "grad_norm": 0.23652370274066925, + "learning_rate": 5.4813974485178266e-05, + "loss": 1.7557, + "step": 15810 + }, + { + "epoch": 4.852977286678944, + "grad_norm": 0.40549808740615845, + "learning_rate": 5.4809026993076526e-05, + "loss": 1.7317, + "step": 15811 + }, + { + "epoch": 4.853284223449969, + "grad_norm": 0.3367961347103119, + "learning_rate": 5.4804079453449156e-05, + "loss": 1.7648, + "step": 15812 + }, + { + "epoch": 4.8535911602209945, + "grad_norm": 0.21629661321640015, + "learning_rate": 5.4799131866345055e-05, + "loss": 1.7986, + "step": 15813 + }, + { + "epoch": 4.85389809699202, + "grad_norm": 0.26381492614746094, + "learning_rate": 5.4794184231813105e-05, + "loss": 1.7401, + "step": 15814 + }, + { + "epoch": 4.854205033763045, + "grad_norm": 0.22319363057613373, + "learning_rate": 5.478923654990223e-05, + "loss": 1.7773, + "step": 15815 + }, + { + "epoch": 4.85451197053407, + "grad_norm": 0.2547159492969513, + "learning_rate": 5.4784288820661326e-05, + "loss": 1.8194, + "step": 15816 + }, + { + "epoch": 4.854818907305095, + "grad_norm": 0.29574522376060486, + "learning_rate": 5.477934104413925e-05, + "loss": 1.7351, + "step": 15817 + }, + { + "epoch": 4.85512584407612, + "grad_norm": 0.17389361560344696, + "learning_rate": 5.4774393220384945e-05, + "loss": 1.6957, + "step": 15818 + }, + { 
+ "epoch": 4.855432780847146, + "grad_norm": 0.23746751248836517, + "learning_rate": 5.476944534944728e-05, + "loss": 1.7713, + "step": 15819 + }, + { + "epoch": 4.855739717618171, + "grad_norm": 0.182356595993042, + "learning_rate": 5.476449743137516e-05, + "loss": 1.7144, + "step": 15820 + }, + { + "epoch": 4.856046654389196, + "grad_norm": 0.23716382682323456, + "learning_rate": 5.4759549466217475e-05, + "loss": 1.7451, + "step": 15821 + }, + { + "epoch": 4.856353591160221, + "grad_norm": 0.316806823015213, + "learning_rate": 5.475460145402313e-05, + "loss": 1.7823, + "step": 15822 + }, + { + "epoch": 4.856660527931246, + "grad_norm": 0.2333129197359085, + "learning_rate": 5.474965339484105e-05, + "loss": 1.7788, + "step": 15823 + }, + { + "epoch": 4.856967464702271, + "grad_norm": 0.21180212497711182, + "learning_rate": 5.47447052887201e-05, + "loss": 1.7513, + "step": 15824 + }, + { + "epoch": 4.857274401473297, + "grad_norm": 0.22641299664974213, + "learning_rate": 5.473975713570919e-05, + "loss": 1.7514, + "step": 15825 + }, + { + "epoch": 4.857581338244322, + "grad_norm": 0.3179668188095093, + "learning_rate": 5.473480893585723e-05, + "loss": 1.7939, + "step": 15826 + }, + { + "epoch": 4.8578882750153465, + "grad_norm": 0.27463147044181824, + "learning_rate": 5.472986068921309e-05, + "loss": 1.7487, + "step": 15827 + }, + { + "epoch": 4.858195211786372, + "grad_norm": 0.18621626496315002, + "learning_rate": 5.472491239582572e-05, + "loss": 1.7155, + "step": 15828 + }, + { + "epoch": 4.858502148557397, + "grad_norm": 0.2437327802181244, + "learning_rate": 5.471996405574399e-05, + "loss": 1.7586, + "step": 15829 + }, + { + "epoch": 4.8588090853284225, + "grad_norm": 0.26658934354782104, + "learning_rate": 5.47150156690168e-05, + "loss": 1.7331, + "step": 15830 + }, + { + "epoch": 4.859116022099448, + "grad_norm": 0.2257174700498581, + "learning_rate": 5.471006723569308e-05, + "loss": 1.7556, + "step": 15831 + }, + { + "epoch": 4.859422958870473, + 
"grad_norm": 0.25434550642967224, + "learning_rate": 5.470511875582168e-05, + "loss": 1.7196, + "step": 15832 + }, + { + "epoch": 4.859729895641498, + "grad_norm": 0.2251453697681427, + "learning_rate": 5.470017022945156e-05, + "loss": 1.7174, + "step": 15833 + }, + { + "epoch": 4.860036832412523, + "grad_norm": 0.2757972180843353, + "learning_rate": 5.469522165663161e-05, + "loss": 1.7701, + "step": 15834 + }, + { + "epoch": 4.860343769183548, + "grad_norm": 0.2771994173526764, + "learning_rate": 5.469027303741072e-05, + "loss": 1.8085, + "step": 15835 + }, + { + "epoch": 4.860650705954574, + "grad_norm": 0.23825454711914062, + "learning_rate": 5.468532437183781e-05, + "loss": 1.733, + "step": 15836 + }, + { + "epoch": 4.860957642725598, + "grad_norm": 0.18100066483020782, + "learning_rate": 5.468037565996177e-05, + "loss": 1.7012, + "step": 15837 + }, + { + "epoch": 4.861264579496623, + "grad_norm": 0.22552812099456787, + "learning_rate": 5.4675426901831506e-05, + "loss": 1.728, + "step": 15838 + }, + { + "epoch": 4.861571516267649, + "grad_norm": 0.2505643665790558, + "learning_rate": 5.467047809749595e-05, + "loss": 1.7219, + "step": 15839 + }, + { + "epoch": 4.861878453038674, + "grad_norm": 0.25920796394348145, + "learning_rate": 5.4665529247003975e-05, + "loss": 1.7945, + "step": 15840 + }, + { + "epoch": 4.862185389809699, + "grad_norm": 0.23549394309520721, + "learning_rate": 5.466058035040452e-05, + "loss": 1.7904, + "step": 15841 + }, + { + "epoch": 4.862492326580725, + "grad_norm": 0.26510992646217346, + "learning_rate": 5.465563140774648e-05, + "loss": 1.8051, + "step": 15842 + }, + { + "epoch": 4.862799263351749, + "grad_norm": 0.19175390899181366, + "learning_rate": 5.465068241907876e-05, + "loss": 1.6799, + "step": 15843 + }, + { + "epoch": 4.8631062001227745, + "grad_norm": 0.2588976323604584, + "learning_rate": 5.464573338445025e-05, + "loss": 1.7394, + "step": 15844 + }, + { + "epoch": 4.8634131368938, + "grad_norm": 0.28729483485221863, + 
"learning_rate": 5.464078430390991e-05, + "loss": 1.797, + "step": 15845 + }, + { + "epoch": 4.863720073664825, + "grad_norm": 0.21302445232868195, + "learning_rate": 5.463583517750661e-05, + "loss": 1.7303, + "step": 15846 + }, + { + "epoch": 4.8640270104358505, + "grad_norm": 0.2407636195421219, + "learning_rate": 5.463088600528926e-05, + "loss": 1.7175, + "step": 15847 + }, + { + "epoch": 4.864333947206875, + "grad_norm": 0.25653502345085144, + "learning_rate": 5.4625936787306784e-05, + "loss": 1.6996, + "step": 15848 + }, + { + "epoch": 4.8646408839779, + "grad_norm": 0.2100832760334015, + "learning_rate": 5.462098752360809e-05, + "loss": 1.7416, + "step": 15849 + }, + { + "epoch": 4.864947820748926, + "grad_norm": 0.2785186469554901, + "learning_rate": 5.461603821424208e-05, + "loss": 1.74, + "step": 15850 + }, + { + "epoch": 4.865254757519951, + "grad_norm": 0.2896614968776703, + "learning_rate": 5.4611088859257696e-05, + "loss": 1.7436, + "step": 15851 + }, + { + "epoch": 4.865561694290976, + "grad_norm": 0.18890418112277985, + "learning_rate": 5.460613945870382e-05, + "loss": 1.7093, + "step": 15852 + }, + { + "epoch": 4.865868631062002, + "grad_norm": 0.27681079506874084, + "learning_rate": 5.4601190012629364e-05, + "loss": 1.8772, + "step": 15853 + }, + { + "epoch": 4.866175567833026, + "grad_norm": 0.24658115208148956, + "learning_rate": 5.4596240521083265e-05, + "loss": 1.776, + "step": 15854 + }, + { + "epoch": 4.866482504604051, + "grad_norm": 0.21958144009113312, + "learning_rate": 5.459129098411441e-05, + "loss": 1.7503, + "step": 15855 + }, + { + "epoch": 4.866789441375077, + "grad_norm": 0.2778300642967224, + "learning_rate": 5.458634140177174e-05, + "loss": 1.8194, + "step": 15856 + }, + { + "epoch": 4.867096378146102, + "grad_norm": 0.28673580288887024, + "learning_rate": 5.458139177410414e-05, + "loss": 1.8033, + "step": 15857 + }, + { + "epoch": 4.867403314917127, + "grad_norm": 0.24472850561141968, + "learning_rate": 5.457644210116055e-05, + 
"loss": 1.7304, + "step": 15858 + }, + { + "epoch": 4.867710251688152, + "grad_norm": 0.24581189453601837, + "learning_rate": 5.4571492382989886e-05, + "loss": 1.7443, + "step": 15859 + }, + { + "epoch": 4.868017188459177, + "grad_norm": 0.22296221554279327, + "learning_rate": 5.4566542619641045e-05, + "loss": 1.7201, + "step": 15860 + }, + { + "epoch": 4.8683241252302025, + "grad_norm": 0.2378673404455185, + "learning_rate": 5.456159281116295e-05, + "loss": 1.7893, + "step": 15861 + }, + { + "epoch": 4.868631062001228, + "grad_norm": 0.3320823907852173, + "learning_rate": 5.4556642957604534e-05, + "loss": 1.7944, + "step": 15862 + }, + { + "epoch": 4.868937998772253, + "grad_norm": 0.3303453326225281, + "learning_rate": 5.45516930590147e-05, + "loss": 1.7267, + "step": 15863 + }, + { + "epoch": 4.8692449355432785, + "grad_norm": 0.223227858543396, + "learning_rate": 5.454674311544235e-05, + "loss": 1.7477, + "step": 15864 + }, + { + "epoch": 4.869551872314303, + "grad_norm": 0.3012549579143524, + "learning_rate": 5.454179312693643e-05, + "loss": 1.731, + "step": 15865 + }, + { + "epoch": 4.869858809085328, + "grad_norm": 0.3780311942100525, + "learning_rate": 5.453684309354585e-05, + "loss": 1.7296, + "step": 15866 + }, + { + "epoch": 4.870165745856354, + "grad_norm": 0.2753889262676239, + "learning_rate": 5.4531893015319526e-05, + "loss": 1.8024, + "step": 15867 + }, + { + "epoch": 4.870472682627379, + "grad_norm": 0.2270934134721756, + "learning_rate": 5.452694289230639e-05, + "loss": 1.7095, + "step": 15868 + }, + { + "epoch": 4.870779619398404, + "grad_norm": 0.2621576488018036, + "learning_rate": 5.452199272455534e-05, + "loss": 1.75, + "step": 15869 + }, + { + "epoch": 4.871086556169429, + "grad_norm": 0.22175776958465576, + "learning_rate": 5.45170425121153e-05, + "loss": 1.7658, + "step": 15870 + }, + { + "epoch": 4.871393492940454, + "grad_norm": 0.2038736790418625, + "learning_rate": 5.451209225503521e-05, + "loss": 1.6916, + "step": 15871 + }, + { + 
"epoch": 4.871700429711479, + "grad_norm": 0.2493467777967453, + "learning_rate": 5.450714195336397e-05, + "loss": 1.7408, + "step": 15872 + }, + { + "epoch": 4.872007366482505, + "grad_norm": 0.1966754049062729, + "learning_rate": 5.450219160715052e-05, + "loss": 1.7379, + "step": 15873 + }, + { + "epoch": 4.87231430325353, + "grad_norm": 0.23193517327308655, + "learning_rate": 5.4497241216443775e-05, + "loss": 1.7736, + "step": 15874 + }, + { + "epoch": 4.872621240024555, + "grad_norm": 0.2164391279220581, + "learning_rate": 5.4492290781292646e-05, + "loss": 1.7618, + "step": 15875 + }, + { + "epoch": 4.87292817679558, + "grad_norm": 0.286460816860199, + "learning_rate": 5.448734030174607e-05, + "loss": 1.7745, + "step": 15876 + }, + { + "epoch": 4.873235113566605, + "grad_norm": 0.3454538881778717, + "learning_rate": 5.448238977785298e-05, + "loss": 1.7605, + "step": 15877 + }, + { + "epoch": 4.8735420503376305, + "grad_norm": 0.26775062084198, + "learning_rate": 5.447743920966227e-05, + "loss": 1.7263, + "step": 15878 + }, + { + "epoch": 4.873848987108656, + "grad_norm": 0.2644907832145691, + "learning_rate": 5.447248859722289e-05, + "loss": 1.8489, + "step": 15879 + }, + { + "epoch": 4.87415592387968, + "grad_norm": 0.21646654605865479, + "learning_rate": 5.446753794058376e-05, + "loss": 1.7605, + "step": 15880 + }, + { + "epoch": 4.874462860650706, + "grad_norm": 0.23431318998336792, + "learning_rate": 5.446258723979381e-05, + "loss": 1.7209, + "step": 15881 + }, + { + "epoch": 4.874769797421731, + "grad_norm": 0.24665607511997223, + "learning_rate": 5.4457636494901934e-05, + "loss": 1.813, + "step": 15882 + }, + { + "epoch": 4.875076734192756, + "grad_norm": 0.26269975304603577, + "learning_rate": 5.445268570595708e-05, + "loss": 1.8255, + "step": 15883 + }, + { + "epoch": 4.875383670963782, + "grad_norm": 0.2722402811050415, + "learning_rate": 5.444773487300819e-05, + "loss": 1.7795, + "step": 15884 + }, + { + "epoch": 4.875690607734807, + "grad_norm": 
0.3235624134540558, + "learning_rate": 5.444278399610417e-05, + "loss": 1.7804, + "step": 15885 + }, + { + "epoch": 4.8759975445058314, + "grad_norm": 0.2647583782672882, + "learning_rate": 5.4437833075293964e-05, + "loss": 1.7359, + "step": 15886 + }, + { + "epoch": 4.876304481276857, + "grad_norm": 0.272370845079422, + "learning_rate": 5.443288211062649e-05, + "loss": 1.7605, + "step": 15887 + }, + { + "epoch": 4.876611418047882, + "grad_norm": 0.3147594630718231, + "learning_rate": 5.4427931102150675e-05, + "loss": 1.7118, + "step": 15888 + }, + { + "epoch": 4.8769183548189075, + "grad_norm": 0.22751441597938538, + "learning_rate": 5.442298004991544e-05, + "loss": 1.723, + "step": 15889 + }, + { + "epoch": 4.877225291589933, + "grad_norm": 0.2121521681547165, + "learning_rate": 5.441802895396972e-05, + "loss": 1.7485, + "step": 15890 + }, + { + "epoch": 4.877532228360957, + "grad_norm": 0.25370222330093384, + "learning_rate": 5.4413077814362466e-05, + "loss": 1.8064, + "step": 15891 + }, + { + "epoch": 4.877839165131983, + "grad_norm": 0.19492633640766144, + "learning_rate": 5.440812663114259e-05, + "loss": 1.6773, + "step": 15892 + }, + { + "epoch": 4.878146101903008, + "grad_norm": 0.2101750522851944, + "learning_rate": 5.440317540435901e-05, + "loss": 1.7215, + "step": 15893 + }, + { + "epoch": 4.878453038674033, + "grad_norm": 0.21150651574134827, + "learning_rate": 5.439822413406068e-05, + "loss": 1.7875, + "step": 15894 + }, + { + "epoch": 4.878759975445059, + "grad_norm": 0.21008379757404327, + "learning_rate": 5.439327282029651e-05, + "loss": 1.7108, + "step": 15895 + }, + { + "epoch": 4.879066912216084, + "grad_norm": 0.22885502874851227, + "learning_rate": 5.4388321463115453e-05, + "loss": 1.7899, + "step": 15896 + }, + { + "epoch": 4.879373848987108, + "grad_norm": 0.24868059158325195, + "learning_rate": 5.4383370062566444e-05, + "loss": 1.7368, + "step": 15897 + }, + { + "epoch": 4.879680785758134, + "grad_norm": 0.27225378155708313, + 
"learning_rate": 5.437841861869838e-05, + "loss": 1.7623, + "step": 15898 + }, + { + "epoch": 4.879987722529159, + "grad_norm": 0.23353120684623718, + "learning_rate": 5.437346713156023e-05, + "loss": 1.7908, + "step": 15899 + }, + { + "epoch": 4.880294659300184, + "grad_norm": 0.19032470881938934, + "learning_rate": 5.436851560120091e-05, + "loss": 1.7511, + "step": 15900 + }, + { + "epoch": 4.88060159607121, + "grad_norm": 0.23714862763881683, + "learning_rate": 5.4363564027669345e-05, + "loss": 1.7197, + "step": 15901 + }, + { + "epoch": 4.880908532842234, + "grad_norm": 0.24897022545337677, + "learning_rate": 5.4358612411014495e-05, + "loss": 1.7822, + "step": 15902 + }, + { + "epoch": 4.8812154696132595, + "grad_norm": 0.21433588862419128, + "learning_rate": 5.435366075128528e-05, + "loss": 1.7928, + "step": 15903 + }, + { + "epoch": 4.881522406384285, + "grad_norm": 0.30019649863243103, + "learning_rate": 5.4348709048530646e-05, + "loss": 1.8067, + "step": 15904 + }, + { + "epoch": 4.88182934315531, + "grad_norm": 0.20227669179439545, + "learning_rate": 5.4343757302799515e-05, + "loss": 1.7254, + "step": 15905 + }, + { + "epoch": 4.8821362799263355, + "grad_norm": 0.23447728157043457, + "learning_rate": 5.4338805514140836e-05, + "loss": 1.7314, + "step": 15906 + }, + { + "epoch": 4.882443216697361, + "grad_norm": 0.29545050859451294, + "learning_rate": 5.4333853682603506e-05, + "loss": 1.7659, + "step": 15907 + }, + { + "epoch": 4.882750153468385, + "grad_norm": 0.245390385389328, + "learning_rate": 5.432890180823652e-05, + "loss": 1.7264, + "step": 15908 + }, + { + "epoch": 4.883057090239411, + "grad_norm": 0.209987074136734, + "learning_rate": 5.432394989108879e-05, + "loss": 1.7174, + "step": 15909 + }, + { + "epoch": 4.883364027010436, + "grad_norm": 0.2402341365814209, + "learning_rate": 5.431899793120925e-05, + "loss": 1.7512, + "step": 15910 + }, + { + "epoch": 4.883670963781461, + "grad_norm": 0.26227688789367676, + "learning_rate": 
5.431404592864684e-05, + "loss": 1.7697, + "step": 15911 + }, + { + "epoch": 4.883977900552486, + "grad_norm": 0.2556503117084503, + "learning_rate": 5.4309093883450504e-05, + "loss": 1.8191, + "step": 15912 + }, + { + "epoch": 4.884284837323511, + "grad_norm": 0.24766884744167328, + "learning_rate": 5.4304141795669174e-05, + "loss": 1.7574, + "step": 15913 + }, + { + "epoch": 4.884591774094536, + "grad_norm": 0.19925951957702637, + "learning_rate": 5.429918966535179e-05, + "loss": 1.7249, + "step": 15914 + }, + { + "epoch": 4.884898710865562, + "grad_norm": 0.1899442970752716, + "learning_rate": 5.4294237492547294e-05, + "loss": 1.7446, + "step": 15915 + }, + { + "epoch": 4.885205647636587, + "grad_norm": 0.25900956988334656, + "learning_rate": 5.4289285277304636e-05, + "loss": 1.725, + "step": 15916 + }, + { + "epoch": 4.885512584407612, + "grad_norm": 0.2537781000137329, + "learning_rate": 5.428433301967274e-05, + "loss": 1.7861, + "step": 15917 + }, + { + "epoch": 4.885819521178637, + "grad_norm": 0.26432034373283386, + "learning_rate": 5.427938071970054e-05, + "loss": 1.7538, + "step": 15918 + }, + { + "epoch": 4.886126457949662, + "grad_norm": 0.22722363471984863, + "learning_rate": 5.4274428377437e-05, + "loss": 1.7631, + "step": 15919 + }, + { + "epoch": 4.8864333947206875, + "grad_norm": 0.24846172332763672, + "learning_rate": 5.426947599293106e-05, + "loss": 1.7833, + "step": 15920 + }, + { + "epoch": 4.886740331491713, + "grad_norm": 0.24821995198726654, + "learning_rate": 5.426452356623165e-05, + "loss": 1.7638, + "step": 15921 + }, + { + "epoch": 4.887047268262738, + "grad_norm": 0.2796781063079834, + "learning_rate": 5.425957109738773e-05, + "loss": 1.6982, + "step": 15922 + }, + { + "epoch": 4.887354205033763, + "grad_norm": 0.2875385284423828, + "learning_rate": 5.425461858644821e-05, + "loss": 1.7172, + "step": 15923 + }, + { + "epoch": 4.887661141804788, + "grad_norm": 0.21614491939544678, + "learning_rate": 5.424966603346207e-05, + "loss": 
1.7521, + "step": 15924 + }, + { + "epoch": 4.887968078575813, + "grad_norm": 0.22944390773773193, + "learning_rate": 5.4244713438478235e-05, + "loss": 1.772, + "step": 15925 + }, + { + "epoch": 4.888275015346839, + "grad_norm": 0.21566039323806763, + "learning_rate": 5.423976080154566e-05, + "loss": 1.734, + "step": 15926 + }, + { + "epoch": 4.888581952117864, + "grad_norm": 0.4253925383090973, + "learning_rate": 5.4234808122713275e-05, + "loss": 1.8017, + "step": 15927 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.239146426320076, + "learning_rate": 5.422985540203004e-05, + "loss": 1.7229, + "step": 15928 + }, + { + "epoch": 4.889195825659914, + "grad_norm": 0.2344054877758026, + "learning_rate": 5.42249026395449e-05, + "loss": 1.7111, + "step": 15929 + }, + { + "epoch": 4.889502762430939, + "grad_norm": 0.21717922389507294, + "learning_rate": 5.421994983530679e-05, + "loss": 1.7427, + "step": 15930 + }, + { + "epoch": 4.889809699201964, + "grad_norm": 0.26895472407341003, + "learning_rate": 5.421499698936466e-05, + "loss": 1.8402, + "step": 15931 + }, + { + "epoch": 4.89011663597299, + "grad_norm": 0.25761866569519043, + "learning_rate": 5.421004410176746e-05, + "loss": 1.7822, + "step": 15932 + }, + { + "epoch": 4.890423572744015, + "grad_norm": 0.24465128779411316, + "learning_rate": 5.420509117256415e-05, + "loss": 1.8074, + "step": 15933 + }, + { + "epoch": 4.8907305095150395, + "grad_norm": 0.2527398467063904, + "learning_rate": 5.4200138201803655e-05, + "loss": 1.7522, + "step": 15934 + }, + { + "epoch": 4.891037446286065, + "grad_norm": 0.23118112981319427, + "learning_rate": 5.4195185189534916e-05, + "loss": 1.7394, + "step": 15935 + }, + { + "epoch": 4.89134438305709, + "grad_norm": 0.2054537534713745, + "learning_rate": 5.419023213580691e-05, + "loss": 1.7096, + "step": 15936 + }, + { + "epoch": 4.8916513198281155, + "grad_norm": 0.2929638922214508, + "learning_rate": 5.418527904066858e-05, + "loss": 1.8733, + "step": 15937 + }, + { + 
"epoch": 4.891958256599141, + "grad_norm": 0.2957170009613037, + "learning_rate": 5.418032590416886e-05, + "loss": 1.7201, + "step": 15938 + }, + { + "epoch": 4.892265193370166, + "grad_norm": 0.2520081698894501, + "learning_rate": 5.417537272635672e-05, + "loss": 1.7034, + "step": 15939 + }, + { + "epoch": 4.892572130141191, + "grad_norm": 0.25217053294181824, + "learning_rate": 5.41704195072811e-05, + "loss": 1.8538, + "step": 15940 + }, + { + "epoch": 4.892879066912216, + "grad_norm": 0.23605379462242126, + "learning_rate": 5.416546624699093e-05, + "loss": 1.724, + "step": 15941 + }, + { + "epoch": 4.893186003683241, + "grad_norm": 0.321750283241272, + "learning_rate": 5.416051294553519e-05, + "loss": 1.806, + "step": 15942 + }, + { + "epoch": 4.893492940454267, + "grad_norm": 0.23800241947174072, + "learning_rate": 5.415555960296284e-05, + "loss": 1.7578, + "step": 15943 + }, + { + "epoch": 4.893799877225292, + "grad_norm": 0.3423094153404236, + "learning_rate": 5.4150606219322796e-05, + "loss": 1.7324, + "step": 15944 + }, + { + "epoch": 4.894106813996316, + "grad_norm": 0.453074187040329, + "learning_rate": 5.414565279466404e-05, + "loss": 1.7268, + "step": 15945 + }, + { + "epoch": 4.894413750767342, + "grad_norm": 0.21972697973251343, + "learning_rate": 5.4140699329035504e-05, + "loss": 1.6547, + "step": 15946 + }, + { + "epoch": 4.894720687538367, + "grad_norm": 0.32876282930374146, + "learning_rate": 5.413574582248616e-05, + "loss": 1.7527, + "step": 15947 + }, + { + "epoch": 4.895027624309392, + "grad_norm": 0.34035229682922363, + "learning_rate": 5.413079227506494e-05, + "loss": 1.7636, + "step": 15948 + }, + { + "epoch": 4.895334561080418, + "grad_norm": 0.2410411536693573, + "learning_rate": 5.412583868682082e-05, + "loss": 1.8114, + "step": 15949 + }, + { + "epoch": 4.895641497851443, + "grad_norm": 0.2787366211414337, + "learning_rate": 5.412088505780274e-05, + "loss": 1.7393, + "step": 15950 + }, + { + "epoch": 4.8959484346224675, + "grad_norm": 
0.23288428783416748, + "learning_rate": 5.411593138805966e-05, + "loss": 1.7413, + "step": 15951 + }, + { + "epoch": 4.896255371393493, + "grad_norm": 0.26302778720855713, + "learning_rate": 5.411097767764053e-05, + "loss": 1.7372, + "step": 15952 + }, + { + "epoch": 4.896562308164518, + "grad_norm": 0.31638020277023315, + "learning_rate": 5.410602392659431e-05, + "loss": 1.8114, + "step": 15953 + }, + { + "epoch": 4.8968692449355435, + "grad_norm": 0.23361825942993164, + "learning_rate": 5.410107013496996e-05, + "loss": 1.7592, + "step": 15954 + }, + { + "epoch": 4.897176181706568, + "grad_norm": 0.19887785613536835, + "learning_rate": 5.409611630281642e-05, + "loss": 1.7509, + "step": 15955 + }, + { + "epoch": 4.897483118477593, + "grad_norm": 0.22396783530712128, + "learning_rate": 5.409116243018266e-05, + "loss": 1.6841, + "step": 15956 + }, + { + "epoch": 4.897790055248619, + "grad_norm": 0.20397686958312988, + "learning_rate": 5.4086208517117645e-05, + "loss": 1.7427, + "step": 15957 + }, + { + "epoch": 4.898096992019644, + "grad_norm": 0.20848311483860016, + "learning_rate": 5.4081254563670314e-05, + "loss": 1.713, + "step": 15958 + }, + { + "epoch": 4.898403928790669, + "grad_norm": 0.2739275395870209, + "learning_rate": 5.407630056988964e-05, + "loss": 1.7673, + "step": 15959 + }, + { + "epoch": 4.898710865561695, + "grad_norm": 0.21485929191112518, + "learning_rate": 5.407134653582456e-05, + "loss": 1.7347, + "step": 15960 + }, + { + "epoch": 4.899017802332719, + "grad_norm": 0.26980286836624146, + "learning_rate": 5.406639246152406e-05, + "loss": 1.7158, + "step": 15961 + }, + { + "epoch": 4.899324739103744, + "grad_norm": 0.22327515482902527, + "learning_rate": 5.4061438347037084e-05, + "loss": 1.7387, + "step": 15962 + }, + { + "epoch": 4.89963167587477, + "grad_norm": 0.2542823553085327, + "learning_rate": 5.4056484192412603e-05, + "loss": 1.7826, + "step": 15963 + }, + { + "epoch": 4.899938612645795, + "grad_norm": 0.3248840868473053, + 
"learning_rate": 5.405152999769956e-05, + "loss": 1.7878, + "step": 15964 + }, + { + "epoch": 4.9002455494168204, + "grad_norm": 0.21210803091526031, + "learning_rate": 5.404657576294691e-05, + "loss": 1.7378, + "step": 15965 + }, + { + "epoch": 4.900552486187845, + "grad_norm": 0.25679782032966614, + "learning_rate": 5.404162148820365e-05, + "loss": 1.7493, + "step": 15966 + }, + { + "epoch": 4.90085942295887, + "grad_norm": 0.36698678135871887, + "learning_rate": 5.4036667173518704e-05, + "loss": 1.7662, + "step": 15967 + }, + { + "epoch": 4.901166359729896, + "grad_norm": 0.3396874964237213, + "learning_rate": 5.403171281894105e-05, + "loss": 1.7618, + "step": 15968 + }, + { + "epoch": 4.901473296500921, + "grad_norm": 0.2792030870914459, + "learning_rate": 5.402675842451964e-05, + "loss": 1.7858, + "step": 15969 + }, + { + "epoch": 4.901780233271946, + "grad_norm": 0.24499626457691193, + "learning_rate": 5.4021803990303454e-05, + "loss": 1.7503, + "step": 15970 + }, + { + "epoch": 4.902087170042972, + "grad_norm": 0.29185110330581665, + "learning_rate": 5.401684951634144e-05, + "loss": 1.7536, + "step": 15971 + }, + { + "epoch": 4.902394106813996, + "grad_norm": 0.2480020374059677, + "learning_rate": 5.401189500268256e-05, + "loss": 1.7877, + "step": 15972 + }, + { + "epoch": 4.902701043585021, + "grad_norm": 0.3302663564682007, + "learning_rate": 5.400694044937579e-05, + "loss": 1.8693, + "step": 15973 + }, + { + "epoch": 4.903007980356047, + "grad_norm": 0.2500915825366974, + "learning_rate": 5.400198585647008e-05, + "loss": 1.7489, + "step": 15974 + }, + { + "epoch": 4.903314917127072, + "grad_norm": 0.25079864263534546, + "learning_rate": 5.399703122401441e-05, + "loss": 1.7965, + "step": 15975 + }, + { + "epoch": 4.903621853898097, + "grad_norm": 0.2643207907676697, + "learning_rate": 5.399207655205771e-05, + "loss": 1.7696, + "step": 15976 + }, + { + "epoch": 4.903928790669122, + "grad_norm": 0.23719522356987, + "learning_rate": 5.398712184064899e-05, + 
"loss": 1.7608, + "step": 15977 + }, + { + "epoch": 4.904235727440147, + "grad_norm": 0.25226888060569763, + "learning_rate": 5.3982167089837184e-05, + "loss": 1.8055, + "step": 15978 + }, + { + "epoch": 4.9045426642111725, + "grad_norm": 0.21601852774620056, + "learning_rate": 5.39772122996713e-05, + "loss": 1.7553, + "step": 15979 + }, + { + "epoch": 4.904849600982198, + "grad_norm": 0.20275430381298065, + "learning_rate": 5.397225747020023e-05, + "loss": 1.7221, + "step": 15980 + }, + { + "epoch": 4.905156537753223, + "grad_norm": 0.24815937876701355, + "learning_rate": 5.3967302601473e-05, + "loss": 1.8098, + "step": 15981 + }, + { + "epoch": 4.9054634745242485, + "grad_norm": 0.2193612903356552, + "learning_rate": 5.3962347693538575e-05, + "loss": 1.7116, + "step": 15982 + }, + { + "epoch": 4.905770411295273, + "grad_norm": 0.21409118175506592, + "learning_rate": 5.395739274644589e-05, + "loss": 1.7503, + "step": 15983 + }, + { + "epoch": 4.906077348066298, + "grad_norm": 0.20907564461231232, + "learning_rate": 5.3952437760243935e-05, + "loss": 1.7518, + "step": 15984 + }, + { + "epoch": 4.906384284837324, + "grad_norm": 0.21193571388721466, + "learning_rate": 5.394748273498168e-05, + "loss": 1.6905, + "step": 15985 + }, + { + "epoch": 4.906691221608349, + "grad_norm": 0.19729891419410706, + "learning_rate": 5.394252767070808e-05, + "loss": 1.7398, + "step": 15986 + }, + { + "epoch": 4.906998158379373, + "grad_norm": 0.2654789686203003, + "learning_rate": 5.393757256747211e-05, + "loss": 1.7931, + "step": 15987 + }, + { + "epoch": 4.907305095150399, + "grad_norm": 0.2627345025539398, + "learning_rate": 5.3932617425322726e-05, + "loss": 1.8174, + "step": 15988 + }, + { + "epoch": 4.907612031921424, + "grad_norm": 0.27162298560142517, + "learning_rate": 5.392766224430894e-05, + "loss": 1.8015, + "step": 15989 + }, + { + "epoch": 4.907918968692449, + "grad_norm": 0.24248667061328888, + "learning_rate": 5.3922707024479676e-05, + "loss": 1.7457, + "step": 15990 + 
}, + { + "epoch": 4.908225905463475, + "grad_norm": 0.24715331196784973, + "learning_rate": 5.391775176588393e-05, + "loss": 1.7724, + "step": 15991 + }, + { + "epoch": 4.9085328422345, + "grad_norm": 0.26335644721984863, + "learning_rate": 5.3912796468570656e-05, + "loss": 1.7183, + "step": 15992 + }, + { + "epoch": 4.9088397790055245, + "grad_norm": 0.23459944128990173, + "learning_rate": 5.3907841132588843e-05, + "loss": 1.7245, + "step": 15993 + }, + { + "epoch": 4.90914671577655, + "grad_norm": 0.21779637038707733, + "learning_rate": 5.3902885757987444e-05, + "loss": 1.7485, + "step": 15994 + }, + { + "epoch": 4.909453652547575, + "grad_norm": 0.227664977312088, + "learning_rate": 5.389793034481545e-05, + "loss": 1.7418, + "step": 15995 + }, + { + "epoch": 4.9097605893186005, + "grad_norm": 0.26230278611183167, + "learning_rate": 5.389297489312183e-05, + "loss": 1.7619, + "step": 15996 + }, + { + "epoch": 4.910067526089626, + "grad_norm": 0.22563579678535461, + "learning_rate": 5.388801940295555e-05, + "loss": 1.7168, + "step": 15997 + }, + { + "epoch": 4.91037446286065, + "grad_norm": 0.24829435348510742, + "learning_rate": 5.388306387436556e-05, + "loss": 1.7422, + "step": 15998 + }, + { + "epoch": 4.910681399631676, + "grad_norm": 0.24395976960659027, + "learning_rate": 5.387810830740088e-05, + "loss": 1.7783, + "step": 15999 + }, + { + "epoch": 4.910988336402701, + "grad_norm": 0.2189297378063202, + "learning_rate": 5.387315270211044e-05, + "loss": 1.7885, + "step": 16000 + }, + { + "epoch": 4.911295273173726, + "grad_norm": 0.21750971674919128, + "learning_rate": 5.386819705854324e-05, + "loss": 1.7659, + "step": 16001 + }, + { + "epoch": 4.911602209944752, + "grad_norm": 0.21907657384872437, + "learning_rate": 5.386324137674826e-05, + "loss": 1.789, + "step": 16002 + }, + { + "epoch": 4.911909146715777, + "grad_norm": 0.18778781592845917, + "learning_rate": 5.3858285656774465e-05, + "loss": 1.7151, + "step": 16003 + }, + { + "epoch": 4.912216083486801, + 
"grad_norm": 0.24217712879180908, + "learning_rate": 5.385332989867082e-05, + "loss": 1.8108, + "step": 16004 + }, + { + "epoch": 4.912523020257827, + "grad_norm": 0.27637016773223877, + "learning_rate": 5.384837410248632e-05, + "loss": 1.8368, + "step": 16005 + }, + { + "epoch": 4.912829957028852, + "grad_norm": 0.22366084158420563, + "learning_rate": 5.3843418268269926e-05, + "loss": 1.7351, + "step": 16006 + }, + { + "epoch": 4.913136893799877, + "grad_norm": 0.2742357552051544, + "learning_rate": 5.383846239607062e-05, + "loss": 1.7599, + "step": 16007 + }, + { + "epoch": 4.913443830570903, + "grad_norm": 0.2288598269224167, + "learning_rate": 5.383350648593738e-05, + "loss": 1.7056, + "step": 16008 + }, + { + "epoch": 4.913750767341927, + "grad_norm": 0.23319020867347717, + "learning_rate": 5.382855053791919e-05, + "loss": 1.7356, + "step": 16009 + }, + { + "epoch": 4.9140577041129525, + "grad_norm": 0.2232198268175125, + "learning_rate": 5.382359455206499e-05, + "loss": 1.7375, + "step": 16010 + }, + { + "epoch": 4.914364640883978, + "grad_norm": 0.24420048296451569, + "learning_rate": 5.381863852842381e-05, + "loss": 1.8287, + "step": 16011 + }, + { + "epoch": 4.914671577655003, + "grad_norm": 0.22653080523014069, + "learning_rate": 5.381368246704461e-05, + "loss": 1.7137, + "step": 16012 + }, + { + "epoch": 4.9149785144260285, + "grad_norm": 0.20439405739307404, + "learning_rate": 5.380872636797637e-05, + "loss": 1.7688, + "step": 16013 + }, + { + "epoch": 4.915285451197054, + "grad_norm": 0.2602155804634094, + "learning_rate": 5.380377023126806e-05, + "loss": 1.7875, + "step": 16014 + }, + { + "epoch": 4.915592387968078, + "grad_norm": 0.2757892608642578, + "learning_rate": 5.3798814056968647e-05, + "loss": 1.7446, + "step": 16015 + }, + { + "epoch": 4.915899324739104, + "grad_norm": 0.25938209891319275, + "learning_rate": 5.379385784512714e-05, + "loss": 1.6997, + "step": 16016 + }, + { + "epoch": 4.916206261510129, + "grad_norm": 0.2056962549686432, + 
"learning_rate": 5.37889015957925e-05, + "loss": 1.6961, + "step": 16017 + }, + { + "epoch": 4.916513198281154, + "grad_norm": 0.24388402700424194, + "learning_rate": 5.3783945309013714e-05, + "loss": 1.712, + "step": 16018 + }, + { + "epoch": 4.91682013505218, + "grad_norm": 0.2381993532180786, + "learning_rate": 5.3778988984839775e-05, + "loss": 1.7444, + "step": 16019 + }, + { + "epoch": 4.917127071823204, + "grad_norm": 0.20201562345027924, + "learning_rate": 5.377403262331964e-05, + "loss": 1.7254, + "step": 16020 + }, + { + "epoch": 4.917434008594229, + "grad_norm": 0.24019409716129303, + "learning_rate": 5.376907622450229e-05, + "loss": 1.684, + "step": 16021 + }, + { + "epoch": 4.917740945365255, + "grad_norm": 0.2441694289445877, + "learning_rate": 5.376411978843674e-05, + "loss": 1.7334, + "step": 16022 + }, + { + "epoch": 4.91804788213628, + "grad_norm": 0.23866300284862518, + "learning_rate": 5.3759163315171945e-05, + "loss": 1.7258, + "step": 16023 + }, + { + "epoch": 4.918354818907305, + "grad_norm": 0.28068670630455017, + "learning_rate": 5.375420680475689e-05, + "loss": 1.8049, + "step": 16024 + }, + { + "epoch": 4.918661755678331, + "grad_norm": 0.2956274151802063, + "learning_rate": 5.3749250257240566e-05, + "loss": 1.8544, + "step": 16025 + }, + { + "epoch": 4.918968692449355, + "grad_norm": 0.1971627175807953, + "learning_rate": 5.374429367267196e-05, + "loss": 1.7314, + "step": 16026 + }, + { + "epoch": 4.9192756292203805, + "grad_norm": 0.28565749526023865, + "learning_rate": 5.373933705110004e-05, + "loss": 1.7587, + "step": 16027 + }, + { + "epoch": 4.919582565991406, + "grad_norm": 0.3087369501590729, + "learning_rate": 5.37343803925738e-05, + "loss": 1.7708, + "step": 16028 + }, + { + "epoch": 4.919889502762431, + "grad_norm": 0.22460010647773743, + "learning_rate": 5.372942369714223e-05, + "loss": 1.7401, + "step": 16029 + }, + { + "epoch": 4.920196439533456, + "grad_norm": 0.29492735862731934, + "learning_rate": 5.3724466964854326e-05, + 
"loss": 1.7033, + "step": 16030 + }, + { + "epoch": 4.920503376304481, + "grad_norm": 0.24452674388885498, + "learning_rate": 5.371951019575904e-05, + "loss": 1.7688, + "step": 16031 + }, + { + "epoch": 4.920810313075506, + "grad_norm": 0.24686957895755768, + "learning_rate": 5.3714553389905366e-05, + "loss": 1.7463, + "step": 16032 + }, + { + "epoch": 4.921117249846532, + "grad_norm": 0.23661597073078156, + "learning_rate": 5.37095965473423e-05, + "loss": 1.7256, + "step": 16033 + }, + { + "epoch": 4.921424186617557, + "grad_norm": 0.22861288487911224, + "learning_rate": 5.370463966811884e-05, + "loss": 1.7722, + "step": 16034 + }, + { + "epoch": 4.921731123388582, + "grad_norm": 0.2453136146068573, + "learning_rate": 5.3699682752283944e-05, + "loss": 1.7343, + "step": 16035 + }, + { + "epoch": 4.922038060159607, + "grad_norm": 0.25267064571380615, + "learning_rate": 5.369472579988663e-05, + "loss": 1.7817, + "step": 16036 + }, + { + "epoch": 4.922344996930632, + "grad_norm": 0.25301575660705566, + "learning_rate": 5.368976881097586e-05, + "loss": 1.8146, + "step": 16037 + }, + { + "epoch": 4.922651933701657, + "grad_norm": 0.23579831421375275, + "learning_rate": 5.368481178560062e-05, + "loss": 1.8089, + "step": 16038 + }, + { + "epoch": 4.922958870472683, + "grad_norm": 0.2181949019432068, + "learning_rate": 5.367985472380993e-05, + "loss": 1.7689, + "step": 16039 + }, + { + "epoch": 4.923265807243708, + "grad_norm": 0.24622827768325806, + "learning_rate": 5.367489762565276e-05, + "loss": 1.791, + "step": 16040 + }, + { + "epoch": 4.9235727440147325, + "grad_norm": 0.2545134723186493, + "learning_rate": 5.3669940491178084e-05, + "loss": 1.738, + "step": 16041 + }, + { + "epoch": 4.923879680785758, + "grad_norm": 0.258139431476593, + "learning_rate": 5.366498332043491e-05, + "loss": 1.8303, + "step": 16042 + }, + { + "epoch": 4.924186617556783, + "grad_norm": 0.23804105818271637, + "learning_rate": 5.366002611347223e-05, + "loss": 1.751, + "step": 16043 + }, + { 
+ "epoch": 4.9244935543278086, + "grad_norm": 0.2354477345943451, + "learning_rate": 5.365506887033901e-05, + "loss": 1.7911, + "step": 16044 + }, + { + "epoch": 4.924800491098834, + "grad_norm": 0.22212550044059753, + "learning_rate": 5.3650111591084276e-05, + "loss": 1.7439, + "step": 16045 + }, + { + "epoch": 4.925107427869859, + "grad_norm": 0.23621168732643127, + "learning_rate": 5.3645154275756984e-05, + "loss": 1.7339, + "step": 16046 + }, + { + "epoch": 4.925414364640884, + "grad_norm": 0.2163209468126297, + "learning_rate": 5.364019692440616e-05, + "loss": 1.7247, + "step": 16047 + }, + { + "epoch": 4.925721301411909, + "grad_norm": 0.21352291107177734, + "learning_rate": 5.3635239537080774e-05, + "loss": 1.7431, + "step": 16048 + }, + { + "epoch": 4.926028238182934, + "grad_norm": 0.3170754909515381, + "learning_rate": 5.36302821138298e-05, + "loss": 1.8075, + "step": 16049 + }, + { + "epoch": 4.92633517495396, + "grad_norm": 0.27073633670806885, + "learning_rate": 5.362532465470226e-05, + "loss": 1.7209, + "step": 16050 + }, + { + "epoch": 4.926642111724985, + "grad_norm": 0.2677803039550781, + "learning_rate": 5.362036715974714e-05, + "loss": 1.7454, + "step": 16051 + }, + { + "epoch": 4.9269490484960095, + "grad_norm": 0.3555704355239868, + "learning_rate": 5.3615409629013436e-05, + "loss": 1.7737, + "step": 16052 + }, + { + "epoch": 4.927255985267035, + "grad_norm": 0.2819947302341461, + "learning_rate": 5.3610452062550124e-05, + "loss": 1.7588, + "step": 16053 + }, + { + "epoch": 4.92756292203806, + "grad_norm": 0.26638996601104736, + "learning_rate": 5.360549446040621e-05, + "loss": 1.8078, + "step": 16054 + }, + { + "epoch": 4.9278698588090855, + "grad_norm": 0.37828773260116577, + "learning_rate": 5.360053682263069e-05, + "loss": 1.7527, + "step": 16055 + }, + { + "epoch": 4.928176795580111, + "grad_norm": 0.35836395621299744, + "learning_rate": 5.359557914927254e-05, + "loss": 1.7199, + "step": 16056 + }, + { + "epoch": 4.928483732351136, + 
"grad_norm": 0.2720802128314972, + "learning_rate": 5.359062144038078e-05, + "loss": 1.7598, + "step": 16057 + }, + { + "epoch": 4.928790669122161, + "grad_norm": 0.36662939190864563, + "learning_rate": 5.358566369600441e-05, + "loss": 1.7199, + "step": 16058 + }, + { + "epoch": 4.929097605893186, + "grad_norm": 0.42243221402168274, + "learning_rate": 5.3580705916192395e-05, + "loss": 1.7584, + "step": 16059 + }, + { + "epoch": 4.929404542664211, + "grad_norm": 0.21667765080928802, + "learning_rate": 5.357574810099375e-05, + "loss": 1.7608, + "step": 16060 + }, + { + "epoch": 4.929711479435237, + "grad_norm": 0.48101645708084106, + "learning_rate": 5.3570790250457456e-05, + "loss": 1.8157, + "step": 16061 + }, + { + "epoch": 4.930018416206261, + "grad_norm": 0.5289245843887329, + "learning_rate": 5.356583236463253e-05, + "loss": 1.7173, + "step": 16062 + }, + { + "epoch": 4.930325352977286, + "grad_norm": 0.21454930305480957, + "learning_rate": 5.356087444356795e-05, + "loss": 1.7399, + "step": 16063 + }, + { + "epoch": 4.930632289748312, + "grad_norm": 0.5648324489593506, + "learning_rate": 5.355591648731274e-05, + "loss": 1.7814, + "step": 16064 + }, + { + "epoch": 4.930939226519337, + "grad_norm": 0.5669483542442322, + "learning_rate": 5.355095849591587e-05, + "loss": 1.7769, + "step": 16065 + }, + { + "epoch": 4.931246163290362, + "grad_norm": 0.33108505606651306, + "learning_rate": 5.354600046942635e-05, + "loss": 1.7704, + "step": 16066 + }, + { + "epoch": 4.931553100061388, + "grad_norm": 0.31149306893348694, + "learning_rate": 5.3541042407893164e-05, + "loss": 1.7631, + "step": 16067 + }, + { + "epoch": 4.931860036832412, + "grad_norm": 0.30377596616744995, + "learning_rate": 5.353608431136532e-05, + "loss": 1.7888, + "step": 16068 + }, + { + "epoch": 4.9321669736034375, + "grad_norm": 0.25041452050209045, + "learning_rate": 5.3531126179891825e-05, + "loss": 1.7507, + "step": 16069 + }, + { + "epoch": 4.932473910374463, + "grad_norm": 0.33900725841522217, + 
"learning_rate": 5.352616801352167e-05, + "loss": 1.7365, + "step": 16070 + }, + { + "epoch": 4.932780847145488, + "grad_norm": 0.23939846456050873, + "learning_rate": 5.352120981230386e-05, + "loss": 1.7934, + "step": 16071 + }, + { + "epoch": 4.9330877839165135, + "grad_norm": 0.2419881969690323, + "learning_rate": 5.351625157628739e-05, + "loss": 1.7555, + "step": 16072 + }, + { + "epoch": 4.933394720687538, + "grad_norm": 0.3517596423625946, + "learning_rate": 5.351129330552125e-05, + "loss": 1.7102, + "step": 16073 + }, + { + "epoch": 4.933701657458563, + "grad_norm": 0.2660250663757324, + "learning_rate": 5.350633500005446e-05, + "loss": 1.7692, + "step": 16074 + }, + { + "epoch": 4.934008594229589, + "grad_norm": 0.20726454257965088, + "learning_rate": 5.350137665993601e-05, + "loss": 1.718, + "step": 16075 + }, + { + "epoch": 4.934315531000614, + "grad_norm": 0.28218522667884827, + "learning_rate": 5.3496418285214914e-05, + "loss": 1.8402, + "step": 16076 + }, + { + "epoch": 4.934622467771639, + "grad_norm": 0.2142515480518341, + "learning_rate": 5.349145987594015e-05, + "loss": 1.7571, + "step": 16077 + }, + { + "epoch": 4.934929404542665, + "grad_norm": 0.2777026891708374, + "learning_rate": 5.348650143216074e-05, + "loss": 1.7617, + "step": 16078 + }, + { + "epoch": 4.935236341313689, + "grad_norm": 0.24057620763778687, + "learning_rate": 5.348154295392567e-05, + "loss": 1.7149, + "step": 16079 + }, + { + "epoch": 4.935543278084714, + "grad_norm": 0.22220350801944733, + "learning_rate": 5.3476584441283964e-05, + "loss": 1.7402, + "step": 16080 + }, + { + "epoch": 4.93585021485574, + "grad_norm": 0.2451290488243103, + "learning_rate": 5.347162589428462e-05, + "loss": 1.7004, + "step": 16081 + }, + { + "epoch": 4.936157151626765, + "grad_norm": 0.25621771812438965, + "learning_rate": 5.3466667312976625e-05, + "loss": 1.7765, + "step": 16082 + }, + { + "epoch": 4.93646408839779, + "grad_norm": 0.217393159866333, + "learning_rate": 5.346170869740899e-05, + 
"loss": 1.7695, + "step": 16083 + }, + { + "epoch": 4.936771025168815, + "grad_norm": 0.21248537302017212, + "learning_rate": 5.345675004763071e-05, + "loss": 1.7277, + "step": 16084 + }, + { + "epoch": 4.93707796193984, + "grad_norm": 0.19431474804878235, + "learning_rate": 5.3451791363690805e-05, + "loss": 1.7352, + "step": 16085 + }, + { + "epoch": 4.9373848987108655, + "grad_norm": 0.20233909785747528, + "learning_rate": 5.344683264563829e-05, + "loss": 1.71, + "step": 16086 + }, + { + "epoch": 4.937691835481891, + "grad_norm": 0.2199622094631195, + "learning_rate": 5.344187389352214e-05, + "loss": 1.7443, + "step": 16087 + }, + { + "epoch": 4.937998772252916, + "grad_norm": 0.23495158553123474, + "learning_rate": 5.343691510739138e-05, + "loss": 1.7758, + "step": 16088 + }, + { + "epoch": 4.9383057090239415, + "grad_norm": 0.228348970413208, + "learning_rate": 5.3431956287295015e-05, + "loss": 1.7645, + "step": 16089 + }, + { + "epoch": 4.938612645794966, + "grad_norm": 0.2337537258863449, + "learning_rate": 5.342699743328203e-05, + "loss": 1.7353, + "step": 16090 + }, + { + "epoch": 4.938919582565991, + "grad_norm": 0.1899309754371643, + "learning_rate": 5.3422038545401454e-05, + "loss": 1.6907, + "step": 16091 + }, + { + "epoch": 4.939226519337017, + "grad_norm": 0.2479192316532135, + "learning_rate": 5.341707962370229e-05, + "loss": 1.7961, + "step": 16092 + }, + { + "epoch": 4.939533456108042, + "grad_norm": 0.2444314956665039, + "learning_rate": 5.341212066823355e-05, + "loss": 1.7768, + "step": 16093 + }, + { + "epoch": 4.939840392879067, + "grad_norm": 0.2123393714427948, + "learning_rate": 5.340716167904423e-05, + "loss": 1.7617, + "step": 16094 + }, + { + "epoch": 4.940147329650092, + "grad_norm": 0.20779116451740265, + "learning_rate": 5.340220265618334e-05, + "loss": 1.6951, + "step": 16095 + }, + { + "epoch": 4.940454266421117, + "grad_norm": 0.22189265489578247, + "learning_rate": 5.3397243599699884e-05, + "loss": 1.8368, + "step": 16096 + }, + { 
+ "epoch": 4.940761203192142, + "grad_norm": 0.22316497564315796, + "learning_rate": 5.3392284509642875e-05, + "loss": 1.7096, + "step": 16097 + }, + { + "epoch": 4.941068139963168, + "grad_norm": 0.20406664907932281, + "learning_rate": 5.3387325386061346e-05, + "loss": 1.7269, + "step": 16098 + }, + { + "epoch": 4.941375076734193, + "grad_norm": 0.263007789850235, + "learning_rate": 5.338236622900427e-05, + "loss": 1.7663, + "step": 16099 + }, + { + "epoch": 4.941682013505218, + "grad_norm": 0.24388311803340912, + "learning_rate": 5.3377407038520654e-05, + "loss": 1.7113, + "step": 16100 + }, + { + "epoch": 4.941988950276243, + "grad_norm": 0.21918313205242157, + "learning_rate": 5.3372447814659524e-05, + "loss": 1.775, + "step": 16101 + }, + { + "epoch": 4.942295887047268, + "grad_norm": 0.30842962861061096, + "learning_rate": 5.336748855746989e-05, + "loss": 1.8229, + "step": 16102 + }, + { + "epoch": 4.9426028238182935, + "grad_norm": 0.2875657379627228, + "learning_rate": 5.336252926700077e-05, + "loss": 1.7377, + "step": 16103 + }, + { + "epoch": 4.942909760589319, + "grad_norm": 0.23411425948143005, + "learning_rate": 5.3357569943301156e-05, + "loss": 1.754, + "step": 16104 + }, + { + "epoch": 4.943216697360343, + "grad_norm": 0.29758864641189575, + "learning_rate": 5.335261058642007e-05, + "loss": 1.7471, + "step": 16105 + }, + { + "epoch": 4.943523634131369, + "grad_norm": 0.31761085987091064, + "learning_rate": 5.3347651196406534e-05, + "loss": 1.7658, + "step": 16106 + }, + { + "epoch": 4.943830570902394, + "grad_norm": 0.2487023025751114, + "learning_rate": 5.334269177330952e-05, + "loss": 1.786, + "step": 16107 + }, + { + "epoch": 4.944137507673419, + "grad_norm": 0.23954913020133972, + "learning_rate": 5.333773231717808e-05, + "loss": 1.8486, + "step": 16108 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.24893096089363098, + "learning_rate": 5.3332772828061214e-05, + "loss": 1.7927, + "step": 16109 + }, + { + "epoch": 4.94475138121547, + 
"grad_norm": 0.28653839230537415, + "learning_rate": 5.332781330600795e-05, + "loss": 1.8331, + "step": 16110 + }, + { + "epoch": 4.945058317986494, + "grad_norm": 0.2597404718399048, + "learning_rate": 5.332285375106726e-05, + "loss": 1.7128, + "step": 16111 + }, + { + "epoch": 4.94536525475752, + "grad_norm": 0.23813198506832123, + "learning_rate": 5.3317894163288196e-05, + "loss": 1.7483, + "step": 16112 + }, + { + "epoch": 4.945672191528545, + "grad_norm": 0.2545793652534485, + "learning_rate": 5.331293454271974e-05, + "loss": 1.7987, + "step": 16113 + }, + { + "epoch": 4.94597912829957, + "grad_norm": 0.2453712821006775, + "learning_rate": 5.330797488941095e-05, + "loss": 1.7376, + "step": 16114 + }, + { + "epoch": 4.946286065070596, + "grad_norm": 0.20583751797676086, + "learning_rate": 5.33030152034108e-05, + "loss": 1.7038, + "step": 16115 + }, + { + "epoch": 4.94659300184162, + "grad_norm": 0.22557811439037323, + "learning_rate": 5.3298055484768313e-05, + "loss": 1.6999, + "step": 16116 + }, + { + "epoch": 4.9468999386126455, + "grad_norm": 0.23163801431655884, + "learning_rate": 5.329309573353252e-05, + "loss": 1.7575, + "step": 16117 + }, + { + "epoch": 4.947206875383671, + "grad_norm": 0.3560176491737366, + "learning_rate": 5.3288135949752394e-05, + "loss": 1.8494, + "step": 16118 + }, + { + "epoch": 4.947513812154696, + "grad_norm": 0.306379109621048, + "learning_rate": 5.328317613347701e-05, + "loss": 1.7229, + "step": 16119 + }, + { + "epoch": 4.9478207489257215, + "grad_norm": 0.24428823590278625, + "learning_rate": 5.3278216284755344e-05, + "loss": 1.7939, + "step": 16120 + }, + { + "epoch": 4.948127685696747, + "grad_norm": 0.22251521050930023, + "learning_rate": 5.327325640363643e-05, + "loss": 1.7624, + "step": 16121 + }, + { + "epoch": 4.948434622467771, + "grad_norm": 0.23310889303684235, + "learning_rate": 5.326829649016928e-05, + "loss": 1.7727, + "step": 16122 + }, + { + "epoch": 4.948741559238797, + "grad_norm": 0.22457881271839142, + 
"learning_rate": 5.326333654440291e-05, + "loss": 1.7602, + "step": 16123 + }, + { + "epoch": 4.949048496009822, + "grad_norm": 0.24032343924045563, + "learning_rate": 5.325837656638631e-05, + "loss": 1.7591, + "step": 16124 + }, + { + "epoch": 4.949355432780847, + "grad_norm": 0.25082892179489136, + "learning_rate": 5.3253416556168546e-05, + "loss": 1.7745, + "step": 16125 + }, + { + "epoch": 4.949662369551873, + "grad_norm": 0.22859038412570953, + "learning_rate": 5.3248456513798615e-05, + "loss": 1.7475, + "step": 16126 + }, + { + "epoch": 4.949969306322897, + "grad_norm": 0.27282553911209106, + "learning_rate": 5.3243496439325525e-05, + "loss": 1.7438, + "step": 16127 + }, + { + "epoch": 4.9502762430939224, + "grad_norm": 0.23622353374958038, + "learning_rate": 5.3238536332798303e-05, + "loss": 1.7625, + "step": 16128 + }, + { + "epoch": 4.950583179864948, + "grad_norm": 0.28060024976730347, + "learning_rate": 5.3233576194265975e-05, + "loss": 1.8028, + "step": 16129 + }, + { + "epoch": 4.950890116635973, + "grad_norm": 0.33281829953193665, + "learning_rate": 5.322861602377755e-05, + "loss": 1.7163, + "step": 16130 + }, + { + "epoch": 4.9511970534069984, + "grad_norm": 0.26457497477531433, + "learning_rate": 5.322365582138203e-05, + "loss": 1.7347, + "step": 16131 + }, + { + "epoch": 4.951503990178024, + "grad_norm": 0.21651674807071686, + "learning_rate": 5.3218695587128476e-05, + "loss": 1.7123, + "step": 16132 + }, + { + "epoch": 4.951810926949048, + "grad_norm": 0.2299882024526596, + "learning_rate": 5.3213735321065885e-05, + "loss": 1.775, + "step": 16133 + }, + { + "epoch": 4.952117863720074, + "grad_norm": 0.2252396047115326, + "learning_rate": 5.3208775023243265e-05, + "loss": 1.7598, + "step": 16134 + }, + { + "epoch": 4.952424800491099, + "grad_norm": 0.2263660430908203, + "learning_rate": 5.3203814693709655e-05, + "loss": 1.7519, + "step": 16135 + }, + { + "epoch": 4.952731737262124, + "grad_norm": 0.2425432950258255, + "learning_rate": 
5.3198854332514056e-05, + "loss": 1.7769, + "step": 16136 + }, + { + "epoch": 4.953038674033149, + "grad_norm": 0.22624996304512024, + "learning_rate": 5.319389393970553e-05, + "loss": 1.7686, + "step": 16137 + }, + { + "epoch": 4.953345610804174, + "grad_norm": 0.2240568846464157, + "learning_rate": 5.318893351533306e-05, + "loss": 1.7795, + "step": 16138 + }, + { + "epoch": 4.953652547575199, + "grad_norm": 0.21708132326602936, + "learning_rate": 5.318397305944568e-05, + "loss": 1.7348, + "step": 16139 + }, + { + "epoch": 4.953959484346225, + "grad_norm": 0.2263328731060028, + "learning_rate": 5.3179012572092415e-05, + "loss": 1.7645, + "step": 16140 + }, + { + "epoch": 4.95426642111725, + "grad_norm": 0.2541986107826233, + "learning_rate": 5.3174052053322274e-05, + "loss": 1.723, + "step": 16141 + }, + { + "epoch": 4.954573357888275, + "grad_norm": 0.25829461216926575, + "learning_rate": 5.316909150318429e-05, + "loss": 1.7469, + "step": 16142 + }, + { + "epoch": 4.9548802946593, + "grad_norm": 0.21251125633716583, + "learning_rate": 5.3164130921727494e-05, + "loss": 1.7699, + "step": 16143 + }, + { + "epoch": 4.955187231430325, + "grad_norm": 0.29195618629455566, + "learning_rate": 5.315917030900091e-05, + "loss": 1.7373, + "step": 16144 + }, + { + "epoch": 4.9554941682013505, + "grad_norm": 0.29457888007164, + "learning_rate": 5.315420966505355e-05, + "loss": 1.7202, + "step": 16145 + }, + { + "epoch": 4.955801104972376, + "grad_norm": 0.19679461419582367, + "learning_rate": 5.314924898993443e-05, + "loss": 1.75, + "step": 16146 + }, + { + "epoch": 4.956108041743401, + "grad_norm": 0.287955105304718, + "learning_rate": 5.314428828369259e-05, + "loss": 1.7385, + "step": 16147 + }, + { + "epoch": 4.956414978514426, + "grad_norm": 0.3081825375556946, + "learning_rate": 5.313932754637706e-05, + "loss": 1.7558, + "step": 16148 + }, + { + "epoch": 4.956721915285451, + "grad_norm": 0.25226521492004395, + "learning_rate": 5.3134366778036846e-05, + "loss": 1.8407, + 
"step": 16149 + }, + { + "epoch": 4.957028852056476, + "grad_norm": 0.43601852655410767, + "learning_rate": 5.3129405978720984e-05, + "loss": 1.7762, + "step": 16150 + }, + { + "epoch": 4.957335788827502, + "grad_norm": 0.3630274832248688, + "learning_rate": 5.31244451484785e-05, + "loss": 1.7802, + "step": 16151 + }, + { + "epoch": 4.957642725598527, + "grad_norm": 0.21337948739528656, + "learning_rate": 5.311948428735841e-05, + "loss": 1.7107, + "step": 16152 + }, + { + "epoch": 4.957949662369552, + "grad_norm": 0.38581085205078125, + "learning_rate": 5.311452339540974e-05, + "loss": 1.7583, + "step": 16153 + }, + { + "epoch": 4.958256599140577, + "grad_norm": 0.28447309136390686, + "learning_rate": 5.310956247268154e-05, + "loss": 1.6992, + "step": 16154 + }, + { + "epoch": 4.958563535911602, + "grad_norm": 0.24510730803012848, + "learning_rate": 5.310460151922283e-05, + "loss": 1.7059, + "step": 16155 + }, + { + "epoch": 4.958870472682627, + "grad_norm": 0.41670146584510803, + "learning_rate": 5.309964053508262e-05, + "loss": 1.7191, + "step": 16156 + }, + { + "epoch": 4.959177409453653, + "grad_norm": 0.3123849034309387, + "learning_rate": 5.309467952030993e-05, + "loss": 1.7161, + "step": 16157 + }, + { + "epoch": 4.959484346224678, + "grad_norm": 0.2275281697511673, + "learning_rate": 5.308971847495382e-05, + "loss": 1.722, + "step": 16158 + }, + { + "epoch": 4.9597912829957025, + "grad_norm": 0.40216436982154846, + "learning_rate": 5.308475739906329e-05, + "loss": 1.7477, + "step": 16159 + }, + { + "epoch": 4.960098219766728, + "grad_norm": 0.259981244802475, + "learning_rate": 5.307979629268739e-05, + "loss": 1.7384, + "step": 16160 + }, + { + "epoch": 4.960405156537753, + "grad_norm": 0.22969573736190796, + "learning_rate": 5.3074835155875134e-05, + "loss": 1.7328, + "step": 16161 + }, + { + "epoch": 4.9607120933087785, + "grad_norm": 0.2773746848106384, + "learning_rate": 5.3069873988675556e-05, + "loss": 1.7333, + "step": 16162 + }, + { + "epoch": 
4.961019030079804, + "grad_norm": 0.2764189541339874, + "learning_rate": 5.306491279113768e-05, + "loss": 1.7956, + "step": 16163 + }, + { + "epoch": 4.961325966850829, + "grad_norm": 0.3640958070755005, + "learning_rate": 5.305995156331054e-05, + "loss": 1.7464, + "step": 16164 + }, + { + "epoch": 4.961632903621854, + "grad_norm": 0.3573450446128845, + "learning_rate": 5.305499030524317e-05, + "loss": 1.75, + "step": 16165 + }, + { + "epoch": 4.961939840392879, + "grad_norm": 0.24313980340957642, + "learning_rate": 5.305002901698459e-05, + "loss": 1.7505, + "step": 16166 + }, + { + "epoch": 4.962246777163904, + "grad_norm": 0.3417615592479706, + "learning_rate": 5.304506769858384e-05, + "loss": 1.7387, + "step": 16167 + }, + { + "epoch": 4.96255371393493, + "grad_norm": 0.23209623992443085, + "learning_rate": 5.304010635008995e-05, + "loss": 1.7111, + "step": 16168 + }, + { + "epoch": 4.962860650705955, + "grad_norm": 0.2994776666164398, + "learning_rate": 5.3035144971551944e-05, + "loss": 1.75, + "step": 16169 + }, + { + "epoch": 4.963167587476979, + "grad_norm": 0.3147084712982178, + "learning_rate": 5.303018356301884e-05, + "loss": 1.7598, + "step": 16170 + }, + { + "epoch": 4.963474524248005, + "grad_norm": 0.20136526226997375, + "learning_rate": 5.30252221245397e-05, + "loss": 1.7217, + "step": 16171 + }, + { + "epoch": 4.96378146101903, + "grad_norm": 0.3308684229850769, + "learning_rate": 5.302026065616355e-05, + "loss": 1.7554, + "step": 16172 + }, + { + "epoch": 4.964088397790055, + "grad_norm": 0.22890877723693848, + "learning_rate": 5.30152991579394e-05, + "loss": 1.7598, + "step": 16173 + }, + { + "epoch": 4.964395334561081, + "grad_norm": 0.3036035895347595, + "learning_rate": 5.301033762991631e-05, + "loss": 1.758, + "step": 16174 + }, + { + "epoch": 4.964702271332106, + "grad_norm": 0.2983579933643341, + "learning_rate": 5.300537607214329e-05, + "loss": 1.8132, + "step": 16175 + }, + { + "epoch": 4.9650092081031305, + "grad_norm": 
0.21401815116405487, + "learning_rate": 5.300041448466937e-05, + "loss": 1.7179, + "step": 16176 + }, + { + "epoch": 4.965316144874156, + "grad_norm": 0.2939651608467102, + "learning_rate": 5.2995452867543606e-05, + "loss": 1.7928, + "step": 16177 + }, + { + "epoch": 4.965623081645181, + "grad_norm": 0.24803484976291656, + "learning_rate": 5.2990491220815034e-05, + "loss": 1.7366, + "step": 16178 + }, + { + "epoch": 4.9659300184162065, + "grad_norm": 0.1999569535255432, + "learning_rate": 5.2985529544532656e-05, + "loss": 1.6691, + "step": 16179 + }, + { + "epoch": 4.966236955187231, + "grad_norm": 0.22315269708633423, + "learning_rate": 5.298056783874553e-05, + "loss": 1.7693, + "step": 16180 + }, + { + "epoch": 4.966543891958256, + "grad_norm": 0.22688794136047363, + "learning_rate": 5.2975606103502694e-05, + "loss": 1.8401, + "step": 16181 + }, + { + "epoch": 4.966850828729282, + "grad_norm": 0.2592024505138397, + "learning_rate": 5.297064433885317e-05, + "loss": 1.8054, + "step": 16182 + }, + { + "epoch": 4.967157765500307, + "grad_norm": 0.2508920133113861, + "learning_rate": 5.2965682544846e-05, + "loss": 1.766, + "step": 16183 + }, + { + "epoch": 4.967464702271332, + "grad_norm": 0.22318799793720245, + "learning_rate": 5.296072072153022e-05, + "loss": 1.751, + "step": 16184 + }, + { + "epoch": 4.967771639042358, + "grad_norm": 0.2348448485136032, + "learning_rate": 5.2955758868954855e-05, + "loss": 1.7844, + "step": 16185 + }, + { + "epoch": 4.968078575813382, + "grad_norm": 0.23294343054294586, + "learning_rate": 5.295079698716895e-05, + "loss": 1.7685, + "step": 16186 + }, + { + "epoch": 4.968385512584407, + "grad_norm": 0.20854508876800537, + "learning_rate": 5.2945835076221526e-05, + "loss": 1.6914, + "step": 16187 + }, + { + "epoch": 4.968692449355433, + "grad_norm": 0.21952031552791595, + "learning_rate": 5.294087313616165e-05, + "loss": 1.7121, + "step": 16188 + }, + { + "epoch": 4.968999386126458, + "grad_norm": 0.24097788333892822, + 
"learning_rate": 5.2935911167038346e-05, + "loss": 1.7712, + "step": 16189 + }, + { + "epoch": 4.969306322897483, + "grad_norm": 0.24433603882789612, + "learning_rate": 5.293094916890063e-05, + "loss": 1.7608, + "step": 16190 + }, + { + "epoch": 4.969613259668508, + "grad_norm": 0.22209061682224274, + "learning_rate": 5.292598714179757e-05, + "loss": 1.7563, + "step": 16191 + }, + { + "epoch": 4.969920196439533, + "grad_norm": 0.24291595816612244, + "learning_rate": 5.29210250857782e-05, + "loss": 1.7765, + "step": 16192 + }, + { + "epoch": 4.9702271332105585, + "grad_norm": 0.3143673837184906, + "learning_rate": 5.291606300089151e-05, + "loss": 1.7945, + "step": 16193 + }, + { + "epoch": 4.970534069981584, + "grad_norm": 0.22693613171577454, + "learning_rate": 5.291110088718661e-05, + "loss": 1.7411, + "step": 16194 + }, + { + "epoch": 4.970841006752609, + "grad_norm": 0.2271365374326706, + "learning_rate": 5.2906138744712494e-05, + "loss": 1.7754, + "step": 16195 + }, + { + "epoch": 4.9711479435236345, + "grad_norm": 0.2428499162197113, + "learning_rate": 5.290117657351822e-05, + "loss": 1.8007, + "step": 16196 + }, + { + "epoch": 4.971454880294659, + "grad_norm": 0.21862711012363434, + "learning_rate": 5.289621437365281e-05, + "loss": 1.7484, + "step": 16197 + }, + { + "epoch": 4.971761817065684, + "grad_norm": 0.26744964718818665, + "learning_rate": 5.2891252145165315e-05, + "loss": 1.7759, + "step": 16198 + }, + { + "epoch": 4.97206875383671, + "grad_norm": 0.2608526647090912, + "learning_rate": 5.288628988810477e-05, + "loss": 1.8527, + "step": 16199 + }, + { + "epoch": 4.972375690607735, + "grad_norm": 0.2245805710554123, + "learning_rate": 5.2881327602520216e-05, + "loss": 1.7773, + "step": 16200 + }, + { + "epoch": 4.97268262737876, + "grad_norm": 0.22023041546344757, + "learning_rate": 5.2876365288460694e-05, + "loss": 1.7101, + "step": 16201 + }, + { + "epoch": 4.972989564149785, + "grad_norm": 0.22034525871276855, + "learning_rate": 
5.287140294597525e-05, + "loss": 1.7672, + "step": 16202 + }, + { + "epoch": 4.97329650092081, + "grad_norm": 0.23101158440113068, + "learning_rate": 5.286644057511292e-05, + "loss": 1.741, + "step": 16203 + }, + { + "epoch": 4.973603437691835, + "grad_norm": 0.23050430417060852, + "learning_rate": 5.286147817592273e-05, + "loss": 1.7727, + "step": 16204 + }, + { + "epoch": 4.973910374462861, + "grad_norm": 0.21803520619869232, + "learning_rate": 5.285651574845374e-05, + "loss": 1.7353, + "step": 16205 + }, + { + "epoch": 4.974217311233886, + "grad_norm": 0.22252169251441956, + "learning_rate": 5.2851553292754995e-05, + "loss": 1.7658, + "step": 16206 + }, + { + "epoch": 4.974524248004911, + "grad_norm": 0.22458864748477936, + "learning_rate": 5.284659080887552e-05, + "loss": 1.7157, + "step": 16207 + }, + { + "epoch": 4.974831184775936, + "grad_norm": 0.20769210159778595, + "learning_rate": 5.2841628296864376e-05, + "loss": 1.7731, + "step": 16208 + }, + { + "epoch": 4.975138121546961, + "grad_norm": 0.1952340304851532, + "learning_rate": 5.283666575677059e-05, + "loss": 1.6907, + "step": 16209 + }, + { + "epoch": 4.975445058317987, + "grad_norm": 0.21943804621696472, + "learning_rate": 5.28317031886432e-05, + "loss": 1.8007, + "step": 16210 + }, + { + "epoch": 4.975751995089012, + "grad_norm": 0.21987493336200714, + "learning_rate": 5.2826740592531276e-05, + "loss": 1.7205, + "step": 16211 + }, + { + "epoch": 4.976058931860036, + "grad_norm": 0.2076522558927536, + "learning_rate": 5.2821777968483845e-05, + "loss": 1.7063, + "step": 16212 + }, + { + "epoch": 4.976365868631062, + "grad_norm": 0.19126583635807037, + "learning_rate": 5.281681531654994e-05, + "loss": 1.7118, + "step": 16213 + }, + { + "epoch": 4.976672805402087, + "grad_norm": 0.22308050096035004, + "learning_rate": 5.2811852636778625e-05, + "loss": 1.7565, + "step": 16214 + }, + { + "epoch": 4.976979742173112, + "grad_norm": 0.23187528550624847, + "learning_rate": 5.280688992921893e-05, + "loss": 
1.8261, + "step": 16215 + }, + { + "epoch": 4.977286678944138, + "grad_norm": 0.21373791992664337, + "learning_rate": 5.28019271939199e-05, + "loss": 1.6974, + "step": 16216 + }, + { + "epoch": 4.977593615715163, + "grad_norm": 0.21647346019744873, + "learning_rate": 5.2796964430930585e-05, + "loss": 1.7967, + "step": 16217 + }, + { + "epoch": 4.9779005524861875, + "grad_norm": 0.2231660932302475, + "learning_rate": 5.279200164030002e-05, + "loss": 1.7495, + "step": 16218 + }, + { + "epoch": 4.978207489257213, + "grad_norm": 0.2810545563697815, + "learning_rate": 5.278703882207728e-05, + "loss": 1.875, + "step": 16219 + }, + { + "epoch": 4.978514426028238, + "grad_norm": 0.298984557390213, + "learning_rate": 5.2782075976311374e-05, + "loss": 1.7494, + "step": 16220 + }, + { + "epoch": 4.9788213627992635, + "grad_norm": 0.2530893385410309, + "learning_rate": 5.2777113103051365e-05, + "loss": 1.7594, + "step": 16221 + }, + { + "epoch": 4.979128299570289, + "grad_norm": 0.26165664196014404, + "learning_rate": 5.277215020234629e-05, + "loss": 1.7543, + "step": 16222 + }, + { + "epoch": 4.979435236341313, + "grad_norm": 0.25115957856178284, + "learning_rate": 5.276718727424521e-05, + "loss": 1.7925, + "step": 16223 + }, + { + "epoch": 4.979742173112339, + "grad_norm": 0.22134126722812653, + "learning_rate": 5.276222431879716e-05, + "loss": 1.8359, + "step": 16224 + }, + { + "epoch": 4.980049109883364, + "grad_norm": 0.24447613954544067, + "learning_rate": 5.275726133605119e-05, + "loss": 1.7693, + "step": 16225 + }, + { + "epoch": 4.980356046654389, + "grad_norm": 0.23025095462799072, + "learning_rate": 5.275229832605635e-05, + "loss": 1.7911, + "step": 16226 + }, + { + "epoch": 4.980662983425415, + "grad_norm": 0.23424232006072998, + "learning_rate": 5.2747335288861686e-05, + "loss": 1.7628, + "step": 16227 + }, + { + "epoch": 4.98096992019644, + "grad_norm": 0.24598535895347595, + "learning_rate": 5.2742372224516235e-05, + "loss": 1.7651, + "step": 16228 + }, + { + 
"epoch": 4.981276856967464, + "grad_norm": 0.262893944978714, + "learning_rate": 5.273740913306906e-05, + "loss": 1.7282, + "step": 16229 + }, + { + "epoch": 4.98158379373849, + "grad_norm": 0.21981783211231232, + "learning_rate": 5.2732446014569207e-05, + "loss": 1.7448, + "step": 16230 + }, + { + "epoch": 4.981890730509515, + "grad_norm": 0.24244973063468933, + "learning_rate": 5.272748286906573e-05, + "loss": 1.7216, + "step": 16231 + }, + { + "epoch": 4.98219766728054, + "grad_norm": 0.2365221232175827, + "learning_rate": 5.272251969660766e-05, + "loss": 1.7227, + "step": 16232 + }, + { + "epoch": 4.982504604051566, + "grad_norm": 0.2081129401922226, + "learning_rate": 5.271755649724405e-05, + "loss": 1.7184, + "step": 16233 + }, + { + "epoch": 4.98281154082259, + "grad_norm": 0.2256374955177307, + "learning_rate": 5.271259327102395e-05, + "loss": 1.7412, + "step": 16234 + }, + { + "epoch": 4.9831184775936155, + "grad_norm": 0.23727381229400635, + "learning_rate": 5.270763001799643e-05, + "loss": 1.8095, + "step": 16235 + }, + { + "epoch": 4.983425414364641, + "grad_norm": 0.21498435735702515, + "learning_rate": 5.2702666738210504e-05, + "loss": 1.744, + "step": 16236 + }, + { + "epoch": 4.983732351135666, + "grad_norm": 0.24772173166275024, + "learning_rate": 5.269770343171525e-05, + "loss": 1.741, + "step": 16237 + }, + { + "epoch": 4.9840392879066915, + "grad_norm": 0.2835623621940613, + "learning_rate": 5.269274009855971e-05, + "loss": 1.7765, + "step": 16238 + }, + { + "epoch": 4.984346224677717, + "grad_norm": 0.2570044696331024, + "learning_rate": 5.2687776738792926e-05, + "loss": 1.8206, + "step": 16239 + }, + { + "epoch": 4.984653161448741, + "grad_norm": 0.21549640595912933, + "learning_rate": 5.268281335246397e-05, + "loss": 1.7022, + "step": 16240 + }, + { + "epoch": 4.984960098219767, + "grad_norm": 0.23158684372901917, + "learning_rate": 5.267784993962187e-05, + "loss": 1.7882, + "step": 16241 + }, + { + "epoch": 4.985267034990792, + "grad_norm": 
0.22778423130512238, + "learning_rate": 5.26728865003157e-05, + "loss": 1.7358, + "step": 16242 + }, + { + "epoch": 4.985573971761817, + "grad_norm": 0.23197145760059357, + "learning_rate": 5.266792303459449e-05, + "loss": 1.7687, + "step": 16243 + }, + { + "epoch": 4.985880908532843, + "grad_norm": 0.19270172715187073, + "learning_rate": 5.26629595425073e-05, + "loss": 1.6999, + "step": 16244 + }, + { + "epoch": 4.986187845303867, + "grad_norm": 0.25262632966041565, + "learning_rate": 5.2657996024103175e-05, + "loss": 1.7536, + "step": 16245 + }, + { + "epoch": 4.986494782074892, + "grad_norm": 0.18620926141738892, + "learning_rate": 5.2653032479431185e-05, + "loss": 1.7033, + "step": 16246 + }, + { + "epoch": 4.986801718845918, + "grad_norm": 0.19537273049354553, + "learning_rate": 5.2648068908540374e-05, + "loss": 1.7457, + "step": 16247 + }, + { + "epoch": 4.987108655616943, + "grad_norm": 0.19447599351406097, + "learning_rate": 5.26431053114798e-05, + "loss": 1.7053, + "step": 16248 + }, + { + "epoch": 4.987415592387968, + "grad_norm": 0.20431137084960938, + "learning_rate": 5.263814168829852e-05, + "loss": 1.7695, + "step": 16249 + }, + { + "epoch": 4.987722529158994, + "grad_norm": 0.21123024821281433, + "learning_rate": 5.263317803904554e-05, + "loss": 1.7666, + "step": 16250 + }, + { + "epoch": 4.988029465930018, + "grad_norm": 0.21279335021972656, + "learning_rate": 5.262821436376998e-05, + "loss": 1.7231, + "step": 16251 + }, + { + "epoch": 4.9883364027010435, + "grad_norm": 0.22504910826683044, + "learning_rate": 5.262325066252085e-05, + "loss": 1.7657, + "step": 16252 + }, + { + "epoch": 4.988643339472069, + "grad_norm": 0.23505981266498566, + "learning_rate": 5.261828693534723e-05, + "loss": 1.7576, + "step": 16253 + }, + { + "epoch": 4.988950276243094, + "grad_norm": 0.21553601324558258, + "learning_rate": 5.261332318229817e-05, + "loss": 1.7782, + "step": 16254 + }, + { + "epoch": 4.989257213014119, + "grad_norm": 0.29189521074295044, + 
"learning_rate": 5.26083594034227e-05, + "loss": 1.7664, + "step": 16255 + }, + { + "epoch": 4.989564149785144, + "grad_norm": 0.38108906149864197, + "learning_rate": 5.26033955987699e-05, + "loss": 1.8573, + "step": 16256 + }, + { + "epoch": 4.989871086556169, + "grad_norm": 0.30329224467277527, + "learning_rate": 5.2598431768388824e-05, + "loss": 1.7584, + "step": 16257 + }, + { + "epoch": 4.990178023327195, + "grad_norm": 0.2437417358160019, + "learning_rate": 5.259346791232852e-05, + "loss": 1.7352, + "step": 16258 + }, + { + "epoch": 4.99048496009822, + "grad_norm": 0.3601737320423126, + "learning_rate": 5.258850403063804e-05, + "loss": 1.7206, + "step": 16259 + }, + { + "epoch": 4.990791896869245, + "grad_norm": 0.20259195566177368, + "learning_rate": 5.258354012336646e-05, + "loss": 1.7403, + "step": 16260 + }, + { + "epoch": 4.99109883364027, + "grad_norm": 0.38022148609161377, + "learning_rate": 5.257857619056281e-05, + "loss": 1.7783, + "step": 16261 + }, + { + "epoch": 4.991405770411295, + "grad_norm": 0.30131712555885315, + "learning_rate": 5.257361223227615e-05, + "loss": 1.7826, + "step": 16262 + }, + { + "epoch": 4.99171270718232, + "grad_norm": 0.24159663915634155, + "learning_rate": 5.2568648248555565e-05, + "loss": 1.7792, + "step": 16263 + }, + { + "epoch": 4.992019643953346, + "grad_norm": 0.4641213119029999, + "learning_rate": 5.2563684239450084e-05, + "loss": 1.7432, + "step": 16264 + }, + { + "epoch": 4.992326580724371, + "grad_norm": 0.3526865541934967, + "learning_rate": 5.255872020500877e-05, + "loss": 1.7736, + "step": 16265 + }, + { + "epoch": 4.9926335174953955, + "grad_norm": 0.2396051585674286, + "learning_rate": 5.255375614528071e-05, + "loss": 1.7505, + "step": 16266 + }, + { + "epoch": 4.992940454266421, + "grad_norm": 0.320987343788147, + "learning_rate": 5.25487920603149e-05, + "loss": 1.8229, + "step": 16267 + }, + { + "epoch": 4.993247391037446, + "grad_norm": 0.24689678847789764, + "learning_rate": 5.254382795016044e-05, + 
"loss": 1.7011, + "step": 16268 + }, + { + "epoch": 4.9935543278084715, + "grad_norm": 0.2407137155532837, + "learning_rate": 5.253886381486639e-05, + "loss": 1.741, + "step": 16269 + }, + { + "epoch": 4.993861264579497, + "grad_norm": 0.3677252531051636, + "learning_rate": 5.25338996544818e-05, + "loss": 1.7792, + "step": 16270 + }, + { + "epoch": 4.994168201350522, + "grad_norm": 0.25096553564071655, + "learning_rate": 5.252893546905573e-05, + "loss": 1.7523, + "step": 16271 + }, + { + "epoch": 4.994475138121547, + "grad_norm": 0.2966327965259552, + "learning_rate": 5.252397125863723e-05, + "loss": 1.7114, + "step": 16272 + }, + { + "epoch": 4.994782074892572, + "grad_norm": 0.36577650904655457, + "learning_rate": 5.2519007023275356e-05, + "loss": 1.7609, + "step": 16273 + }, + { + "epoch": 4.995089011663597, + "grad_norm": 0.2450687140226364, + "learning_rate": 5.25140427630192e-05, + "loss": 1.7452, + "step": 16274 + }, + { + "epoch": 4.995395948434623, + "grad_norm": 0.20782120525836945, + "learning_rate": 5.250907847791778e-05, + "loss": 1.7109, + "step": 16275 + }, + { + "epoch": 4.995702885205648, + "grad_norm": 0.2423330545425415, + "learning_rate": 5.25041141680202e-05, + "loss": 1.7234, + "step": 16276 + }, + { + "epoch": 4.996009821976672, + "grad_norm": 0.20855975151062012, + "learning_rate": 5.2499149833375484e-05, + "loss": 1.7734, + "step": 16277 + }, + { + "epoch": 4.996316758747698, + "grad_norm": 0.24400894343852997, + "learning_rate": 5.24941854740327e-05, + "loss": 1.7566, + "step": 16278 + }, + { + "epoch": 4.996623695518723, + "grad_norm": 0.4378018379211426, + "learning_rate": 5.2489221090040906e-05, + "loss": 1.7536, + "step": 16279 + }, + { + "epoch": 4.996930632289748, + "grad_norm": 0.20726722478866577, + "learning_rate": 5.248425668144918e-05, + "loss": 1.8008, + "step": 16280 + }, + { + "epoch": 4.997237569060774, + "grad_norm": 0.2506333589553833, + "learning_rate": 5.247929224830658e-05, + "loss": 1.7404, + "step": 16281 + }, + { + 
"epoch": 4.997544505831799, + "grad_norm": 0.24178004264831543, + "learning_rate": 5.247432779066216e-05, + "loss": 1.7517, + "step": 16282 + }, + { + "epoch": 4.9978514426028235, + "grad_norm": 0.2500220835208893, + "learning_rate": 5.246936330856499e-05, + "loss": 1.7705, + "step": 16283 + }, + { + "epoch": 4.998158379373849, + "grad_norm": 0.30043718218803406, + "learning_rate": 5.24643988020641e-05, + "loss": 1.8118, + "step": 16284 + }, + { + "epoch": 4.998465316144874, + "grad_norm": 0.284805566072464, + "learning_rate": 5.245943427120859e-05, + "loss": 1.7968, + "step": 16285 + }, + { + "epoch": 4.9987722529158995, + "grad_norm": 0.3652406632900238, + "learning_rate": 5.245446971604751e-05, + "loss": 1.7785, + "step": 16286 + }, + { + "epoch": 4.999079189686924, + "grad_norm": 0.24879656732082367, + "learning_rate": 5.244950513662992e-05, + "loss": 1.734, + "step": 16287 + }, + { + "epoch": 4.999386126457949, + "grad_norm": 0.2374224215745926, + "learning_rate": 5.244454053300488e-05, + "loss": 1.7394, + "step": 16288 + }, + { + "epoch": 4.999693063228975, + "grad_norm": 0.27090463042259216, + "learning_rate": 5.243957590522147e-05, + "loss": 1.7529, + "step": 16289 + }, + { + "epoch": 5.0, + "grad_norm": 0.23060791194438934, + "learning_rate": 5.243461125332873e-05, + "loss": 1.7599, + "step": 16290 + }, + { + "epoch": 5.000306936771025, + "grad_norm": 0.21159487962722778, + "learning_rate": 5.242964657737572e-05, + "loss": 1.747, + "step": 16291 + }, + { + "epoch": 5.000613873542051, + "grad_norm": 0.21556304395198822, + "learning_rate": 5.242468187741154e-05, + "loss": 1.7653, + "step": 16292 + }, + { + "epoch": 5.000920810313075, + "grad_norm": 0.2569669783115387, + "learning_rate": 5.241971715348524e-05, + "loss": 1.7284, + "step": 16293 + }, + { + "epoch": 5.0012277470841005, + "grad_norm": 0.2827381491661072, + "learning_rate": 5.241475240564586e-05, + "loss": 1.7765, + "step": 16294 + }, + { + "epoch": 5.001534683855126, + "grad_norm": 
0.22498267889022827, + "learning_rate": 5.240978763394249e-05, + "loss": 1.729, + "step": 16295 + }, + { + "epoch": 5.001841620626151, + "grad_norm": 0.23975814878940582, + "learning_rate": 5.240482283842418e-05, + "loss": 1.7968, + "step": 16296 + }, + { + "epoch": 5.0021485573971765, + "grad_norm": 0.20811420679092407, + "learning_rate": 5.239985801914e-05, + "loss": 1.6931, + "step": 16297 + }, + { + "epoch": 5.002455494168202, + "grad_norm": 0.22985060513019562, + "learning_rate": 5.2394893176139014e-05, + "loss": 1.7724, + "step": 16298 + }, + { + "epoch": 5.002762430939226, + "grad_norm": 0.22867995500564575, + "learning_rate": 5.2389928309470305e-05, + "loss": 1.7179, + "step": 16299 + }, + { + "epoch": 5.003069367710252, + "grad_norm": 0.2543974220752716, + "learning_rate": 5.238496341918293e-05, + "loss": 1.7859, + "step": 16300 + }, + { + "epoch": 5.003376304481277, + "grad_norm": 0.226583793759346, + "learning_rate": 5.237999850532592e-05, + "loss": 1.7567, + "step": 16301 + }, + { + "epoch": 5.003683241252302, + "grad_norm": 0.21744728088378906, + "learning_rate": 5.237503356794838e-05, + "loss": 1.7345, + "step": 16302 + }, + { + "epoch": 5.003990178023328, + "grad_norm": 0.25915467739105225, + "learning_rate": 5.2370068607099373e-05, + "loss": 1.7179, + "step": 16303 + }, + { + "epoch": 5.004297114794352, + "grad_norm": 0.20572461187839508, + "learning_rate": 5.236510362282796e-05, + "loss": 1.7211, + "step": 16304 + }, + { + "epoch": 5.004604051565377, + "grad_norm": 0.2821461856365204, + "learning_rate": 5.236013861518321e-05, + "loss": 1.7894, + "step": 16305 + }, + { + "epoch": 5.004910988336403, + "grad_norm": 0.22273759543895721, + "learning_rate": 5.235517358421417e-05, + "loss": 1.7919, + "step": 16306 + }, + { + "epoch": 5.005217925107428, + "grad_norm": 0.23875468969345093, + "learning_rate": 5.2350208529969935e-05, + "loss": 1.7558, + "step": 16307 + }, + { + "epoch": 5.005524861878453, + "grad_norm": 0.24673783779144287, + "learning_rate": 
5.234524345249955e-05, + "loss": 1.7705, + "step": 16308 + }, + { + "epoch": 5.005831798649478, + "grad_norm": 0.21992872655391693, + "learning_rate": 5.234027835185211e-05, + "loss": 1.7059, + "step": 16309 + }, + { + "epoch": 5.006138735420503, + "grad_norm": 0.19214966893196106, + "learning_rate": 5.233531322807667e-05, + "loss": 1.6647, + "step": 16310 + }, + { + "epoch": 5.0064456721915285, + "grad_norm": 0.18525120615959167, + "learning_rate": 5.233034808122228e-05, + "loss": 1.719, + "step": 16311 + }, + { + "epoch": 5.006752608962554, + "grad_norm": 0.25996243953704834, + "learning_rate": 5.232538291133804e-05, + "loss": 1.7227, + "step": 16312 + }, + { + "epoch": 5.007059545733579, + "grad_norm": 0.2163757085800171, + "learning_rate": 5.232041771847299e-05, + "loss": 1.6962, + "step": 16313 + }, + { + "epoch": 5.0073664825046045, + "grad_norm": 0.23484158515930176, + "learning_rate": 5.231545250267621e-05, + "loss": 1.7816, + "step": 16314 + }, + { + "epoch": 5.007673419275629, + "grad_norm": 0.2188636213541031, + "learning_rate": 5.2310487263996776e-05, + "loss": 1.7477, + "step": 16315 + }, + { + "epoch": 5.007980356046654, + "grad_norm": 0.1950213611125946, + "learning_rate": 5.230552200248377e-05, + "loss": 1.7165, + "step": 16316 + }, + { + "epoch": 5.00828729281768, + "grad_norm": 0.25340089201927185, + "learning_rate": 5.230055671818623e-05, + "loss": 1.7764, + "step": 16317 + }, + { + "epoch": 5.008594229588705, + "grad_norm": 0.23749271035194397, + "learning_rate": 5.2295591411153245e-05, + "loss": 1.7193, + "step": 16318 + }, + { + "epoch": 5.00890116635973, + "grad_norm": 0.2317294180393219, + "learning_rate": 5.229062608143387e-05, + "loss": 1.7607, + "step": 16319 + }, + { + "epoch": 5.009208103130755, + "grad_norm": 0.2751505672931671, + "learning_rate": 5.228566072907719e-05, + "loss": 1.7562, + "step": 16320 + }, + { + "epoch": 5.00951503990178, + "grad_norm": 0.29476025700569153, + "learning_rate": 5.2280695354132267e-05, + "loss": 1.687, 
+ "step": 16321 + }, + { + "epoch": 5.009821976672805, + "grad_norm": 0.20734120905399323, + "learning_rate": 5.227572995664819e-05, + "loss": 1.7608, + "step": 16322 + }, + { + "epoch": 5.010128913443831, + "grad_norm": 0.2537878155708313, + "learning_rate": 5.227076453667401e-05, + "loss": 1.7947, + "step": 16323 + }, + { + "epoch": 5.010435850214856, + "grad_norm": 0.23516076803207397, + "learning_rate": 5.2265799094258796e-05, + "loss": 1.7545, + "step": 16324 + }, + { + "epoch": 5.0107427869858805, + "grad_norm": 0.2581529915332794, + "learning_rate": 5.226083362945162e-05, + "loss": 1.7529, + "step": 16325 + }, + { + "epoch": 5.011049723756906, + "grad_norm": 0.2982035279273987, + "learning_rate": 5.225586814230158e-05, + "loss": 1.74, + "step": 16326 + }, + { + "epoch": 5.011356660527931, + "grad_norm": 0.2773981988430023, + "learning_rate": 5.225090263285772e-05, + "loss": 1.7562, + "step": 16327 + }, + { + "epoch": 5.0116635972989565, + "grad_norm": 0.19992689788341522, + "learning_rate": 5.2245937101169116e-05, + "loss": 1.6896, + "step": 16328 + }, + { + "epoch": 5.011970534069982, + "grad_norm": 0.2913428246974945, + "learning_rate": 5.224097154728486e-05, + "loss": 1.7574, + "step": 16329 + }, + { + "epoch": 5.012277470841007, + "grad_norm": 0.23173104226589203, + "learning_rate": 5.2236005971254e-05, + "loss": 1.6954, + "step": 16330 + }, + { + "epoch": 5.012584407612032, + "grad_norm": 0.2019525170326233, + "learning_rate": 5.2231040373125614e-05, + "loss": 1.7711, + "step": 16331 + }, + { + "epoch": 5.012891344383057, + "grad_norm": 0.29070746898651123, + "learning_rate": 5.222607475294878e-05, + "loss": 1.8201, + "step": 16332 + }, + { + "epoch": 5.013198281154082, + "grad_norm": 0.22005079686641693, + "learning_rate": 5.222110911077258e-05, + "loss": 1.7421, + "step": 16333 + }, + { + "epoch": 5.013505217925108, + "grad_norm": 0.24422192573547363, + "learning_rate": 5.2216143446646085e-05, + "loss": 1.7074, + "step": 16334 + }, + { + "epoch": 
5.013812154696133, + "grad_norm": 0.2417927384376526, + "learning_rate": 5.221117776061836e-05, + "loss": 1.7726, + "step": 16335 + }, + { + "epoch": 5.014119091467157, + "grad_norm": 0.245828777551651, + "learning_rate": 5.2206212052738454e-05, + "loss": 1.7932, + "step": 16336 + }, + { + "epoch": 5.014426028238183, + "grad_norm": 0.24054239690303802, + "learning_rate": 5.220124632305548e-05, + "loss": 1.727, + "step": 16337 + }, + { + "epoch": 5.014732965009208, + "grad_norm": 0.2572494149208069, + "learning_rate": 5.21962805716185e-05, + "loss": 1.7234, + "step": 16338 + }, + { + "epoch": 5.015039901780233, + "grad_norm": 0.33624622225761414, + "learning_rate": 5.2191314798476595e-05, + "loss": 1.7499, + "step": 16339 + }, + { + "epoch": 5.015346838551259, + "grad_norm": 0.22321413457393646, + "learning_rate": 5.218634900367883e-05, + "loss": 1.7155, + "step": 16340 + }, + { + "epoch": 5.015653775322283, + "grad_norm": 0.26709917187690735, + "learning_rate": 5.218138318727429e-05, + "loss": 1.8346, + "step": 16341 + }, + { + "epoch": 5.0159607120933085, + "grad_norm": 0.27600952982902527, + "learning_rate": 5.217641734931202e-05, + "loss": 1.789, + "step": 16342 + }, + { + "epoch": 5.016267648864334, + "grad_norm": 0.21392405033111572, + "learning_rate": 5.217145148984114e-05, + "loss": 1.7266, + "step": 16343 + }, + { + "epoch": 5.016574585635359, + "grad_norm": 0.3215450942516327, + "learning_rate": 5.2166485608910696e-05, + "loss": 1.7453, + "step": 16344 + }, + { + "epoch": 5.0168815224063845, + "grad_norm": 0.22328032553195953, + "learning_rate": 5.2161519706569776e-05, + "loss": 1.7209, + "step": 16345 + }, + { + "epoch": 5.01718845917741, + "grad_norm": 0.2438887059688568, + "learning_rate": 5.215655378286744e-05, + "loss": 1.7289, + "step": 16346 + }, + { + "epoch": 5.017495395948434, + "grad_norm": 0.30078747868537903, + "learning_rate": 5.2151587837852786e-05, + "loss": 1.7483, + "step": 16347 + }, + { + "epoch": 5.01780233271946, + "grad_norm": 
0.21723167598247528, + "learning_rate": 5.214662187157488e-05, + "loss": 1.7654, + "step": 16348 + }, + { + "epoch": 5.018109269490485, + "grad_norm": 0.26358669996261597, + "learning_rate": 5.2141655884082784e-05, + "loss": 1.7563, + "step": 16349 + }, + { + "epoch": 5.01841620626151, + "grad_norm": 0.24285505712032318, + "learning_rate": 5.2136689875425615e-05, + "loss": 1.7377, + "step": 16350 + }, + { + "epoch": 5.018723143032536, + "grad_norm": 0.2401108294725418, + "learning_rate": 5.2131723845652416e-05, + "loss": 1.7445, + "step": 16351 + }, + { + "epoch": 5.01903007980356, + "grad_norm": 0.3347793519496918, + "learning_rate": 5.212675779481226e-05, + "loss": 1.7872, + "step": 16352 + }, + { + "epoch": 5.019337016574585, + "grad_norm": 0.306728720664978, + "learning_rate": 5.212179172295424e-05, + "loss": 1.8051, + "step": 16353 + }, + { + "epoch": 5.019643953345611, + "grad_norm": 0.22297725081443787, + "learning_rate": 5.211682563012743e-05, + "loss": 1.7082, + "step": 16354 + }, + { + "epoch": 5.019950890116636, + "grad_norm": 0.24047277867794037, + "learning_rate": 5.211185951638091e-05, + "loss": 1.7024, + "step": 16355 + }, + { + "epoch": 5.020257826887661, + "grad_norm": 0.19570080935955048, + "learning_rate": 5.210689338176377e-05, + "loss": 1.6947, + "step": 16356 + }, + { + "epoch": 5.020564763658686, + "grad_norm": 0.2024889886379242, + "learning_rate": 5.2101927226325066e-05, + "loss": 1.7168, + "step": 16357 + }, + { + "epoch": 5.020871700429711, + "grad_norm": 0.23546278476715088, + "learning_rate": 5.209696105011388e-05, + "loss": 1.7697, + "step": 16358 + }, + { + "epoch": 5.0211786372007365, + "grad_norm": 0.21003498136997223, + "learning_rate": 5.209199485317928e-05, + "loss": 1.7198, + "step": 16359 + }, + { + "epoch": 5.021485573971762, + "grad_norm": 0.21375493705272675, + "learning_rate": 5.208702863557039e-05, + "loss": 1.7689, + "step": 16360 + }, + { + "epoch": 5.021792510742787, + "grad_norm": 0.21549762785434723, + 
"learning_rate": 5.2082062397336254e-05, + "loss": 1.6936, + "step": 16361 + }, + { + "epoch": 5.0220994475138125, + "grad_norm": 0.22633691132068634, + "learning_rate": 5.207709613852595e-05, + "loss": 1.7512, + "step": 16362 + }, + { + "epoch": 5.022406384284837, + "grad_norm": 0.21888238191604614, + "learning_rate": 5.2072129859188566e-05, + "loss": 1.7082, + "step": 16363 + }, + { + "epoch": 5.022713321055862, + "grad_norm": 0.2416619062423706, + "learning_rate": 5.206716355937318e-05, + "loss": 1.7938, + "step": 16364 + }, + { + "epoch": 5.023020257826888, + "grad_norm": 0.22451527416706085, + "learning_rate": 5.206219723912886e-05, + "loss": 1.7372, + "step": 16365 + }, + { + "epoch": 5.023327194597913, + "grad_norm": 0.19698494672775269, + "learning_rate": 5.2057230898504716e-05, + "loss": 1.7205, + "step": 16366 + }, + { + "epoch": 5.023634131368938, + "grad_norm": 0.2441127747297287, + "learning_rate": 5.205226453754982e-05, + "loss": 1.7625, + "step": 16367 + }, + { + "epoch": 5.023941068139963, + "grad_norm": 0.21940121054649353, + "learning_rate": 5.204729815631323e-05, + "loss": 1.7985, + "step": 16368 + }, + { + "epoch": 5.024248004910988, + "grad_norm": 0.21751399338245392, + "learning_rate": 5.204233175484403e-05, + "loss": 1.7759, + "step": 16369 + }, + { + "epoch": 5.024554941682013, + "grad_norm": 0.20261377096176147, + "learning_rate": 5.2037365333191315e-05, + "loss": 1.746, + "step": 16370 + }, + { + "epoch": 5.024861878453039, + "grad_norm": 0.2628774046897888, + "learning_rate": 5.2032398891404166e-05, + "loss": 1.8178, + "step": 16371 + }, + { + "epoch": 5.025168815224064, + "grad_norm": 0.20626378059387207, + "learning_rate": 5.2027432429531665e-05, + "loss": 1.7456, + "step": 16372 + }, + { + "epoch": 5.0254757519950894, + "grad_norm": 0.25548869371414185, + "learning_rate": 5.2022465947622876e-05, + "loss": 1.8098, + "step": 16373 + }, + { + "epoch": 5.025782688766114, + "grad_norm": 0.1978374719619751, + "learning_rate": 
5.20174994457269e-05, + "loss": 1.685, + "step": 16374 + }, + { + "epoch": 5.026089625537139, + "grad_norm": 0.2708980143070221, + "learning_rate": 5.201253292389282e-05, + "loss": 1.7464, + "step": 16375 + }, + { + "epoch": 5.026396562308165, + "grad_norm": 0.2730494737625122, + "learning_rate": 5.2007566382169706e-05, + "loss": 1.7391, + "step": 16376 + }, + { + "epoch": 5.02670349907919, + "grad_norm": 0.243557408452034, + "learning_rate": 5.2002599820606624e-05, + "loss": 1.7439, + "step": 16377 + }, + { + "epoch": 5.027010435850215, + "grad_norm": 0.2208259105682373, + "learning_rate": 5.19976332392527e-05, + "loss": 1.7612, + "step": 16378 + }, + { + "epoch": 5.02731737262124, + "grad_norm": 0.21288715302944183, + "learning_rate": 5.199266663815698e-05, + "loss": 1.7546, + "step": 16379 + }, + { + "epoch": 5.027624309392265, + "grad_norm": 0.2106054425239563, + "learning_rate": 5.198770001736857e-05, + "loss": 1.7281, + "step": 16380 + }, + { + "epoch": 5.02793124616329, + "grad_norm": 0.2247164249420166, + "learning_rate": 5.198273337693654e-05, + "loss": 1.8405, + "step": 16381 + }, + { + "epoch": 5.028238182934316, + "grad_norm": 0.21713724732398987, + "learning_rate": 5.197776671690998e-05, + "loss": 1.7333, + "step": 16382 + }, + { + "epoch": 5.028545119705341, + "grad_norm": 0.24063727259635925, + "learning_rate": 5.1972800037337956e-05, + "loss": 1.7608, + "step": 16383 + }, + { + "epoch": 5.0288520564763655, + "grad_norm": 0.22022177278995514, + "learning_rate": 5.196783333826959e-05, + "loss": 1.7045, + "step": 16384 + }, + { + "epoch": 5.029158993247391, + "grad_norm": 0.21348948776721954, + "learning_rate": 5.1962866619753927e-05, + "loss": 1.7516, + "step": 16385 + }, + { + "epoch": 5.029465930018416, + "grad_norm": 0.289315789937973, + "learning_rate": 5.195789988184007e-05, + "loss": 1.8555, + "step": 16386 + }, + { + "epoch": 5.0297728667894415, + "grad_norm": 0.30966848134994507, + "learning_rate": 5.19529331245771e-05, + "loss": 1.7245, + 
"step": 16387 + }, + { + "epoch": 5.030079803560467, + "grad_norm": 0.24625633656978607, + "learning_rate": 5.194796634801409e-05, + "loss": 1.7788, + "step": 16388 + }, + { + "epoch": 5.030386740331492, + "grad_norm": 0.25937986373901367, + "learning_rate": 5.1942999552200136e-05, + "loss": 1.7655, + "step": 16389 + }, + { + "epoch": 5.030693677102517, + "grad_norm": 0.3056741952896118, + "learning_rate": 5.1938032737184325e-05, + "loss": 1.7167, + "step": 16390 + }, + { + "epoch": 5.031000613873542, + "grad_norm": 0.29773563146591187, + "learning_rate": 5.1933065903015743e-05, + "loss": 1.7247, + "step": 16391 + }, + { + "epoch": 5.031307550644567, + "grad_norm": 0.26433971524238586, + "learning_rate": 5.192809904974347e-05, + "loss": 1.7779, + "step": 16392 + }, + { + "epoch": 5.031614487415593, + "grad_norm": 0.3308073580265045, + "learning_rate": 5.192313217741659e-05, + "loss": 1.7782, + "step": 16393 + }, + { + "epoch": 5.031921424186618, + "grad_norm": 0.2584165632724762, + "learning_rate": 5.1918165286084176e-05, + "loss": 1.7812, + "step": 16394 + }, + { + "epoch": 5.032228360957642, + "grad_norm": 0.31678953766822815, + "learning_rate": 5.1913198375795346e-05, + "loss": 1.7341, + "step": 16395 + }, + { + "epoch": 5.032535297728668, + "grad_norm": 0.3527325391769409, + "learning_rate": 5.190823144659916e-05, + "loss": 1.7844, + "step": 16396 + }, + { + "epoch": 5.032842234499693, + "grad_norm": 0.29233935475349426, + "learning_rate": 5.1903264498544724e-05, + "loss": 1.7993, + "step": 16397 + }, + { + "epoch": 5.033149171270718, + "grad_norm": 0.24549467861652374, + "learning_rate": 5.1898297531681106e-05, + "loss": 1.7294, + "step": 16398 + }, + { + "epoch": 5.033456108041744, + "grad_norm": 0.3446930944919586, + "learning_rate": 5.18933305460574e-05, + "loss": 1.6818, + "step": 16399 + }, + { + "epoch": 5.033763044812768, + "grad_norm": 0.2628229856491089, + "learning_rate": 5.188836354172268e-05, + "loss": 1.7867, + "step": 16400 + }, + { + "epoch": 
5.0340699815837935, + "grad_norm": 0.26548629999160767, + "learning_rate": 5.188339651872607e-05, + "loss": 1.7448, + "step": 16401 + }, + { + "epoch": 5.034376918354819, + "grad_norm": 0.29242032766342163, + "learning_rate": 5.187842947711662e-05, + "loss": 1.7103, + "step": 16402 + }, + { + "epoch": 5.034683855125844, + "grad_norm": 0.2515408992767334, + "learning_rate": 5.187346241694343e-05, + "loss": 1.7865, + "step": 16403 + }, + { + "epoch": 5.0349907918968695, + "grad_norm": 0.2253103256225586, + "learning_rate": 5.186849533825559e-05, + "loss": 1.6993, + "step": 16404 + }, + { + "epoch": 5.035297728667895, + "grad_norm": 0.2743360102176666, + "learning_rate": 5.1863528241102154e-05, + "loss": 1.7532, + "step": 16405 + }, + { + "epoch": 5.035604665438919, + "grad_norm": 0.22807851433753967, + "learning_rate": 5.185856112553227e-05, + "loss": 1.7873, + "step": 16406 + }, + { + "epoch": 5.035911602209945, + "grad_norm": 0.23719090223312378, + "learning_rate": 5.1853593991594985e-05, + "loss": 1.7555, + "step": 16407 + }, + { + "epoch": 5.03621853898097, + "grad_norm": 0.2964477241039276, + "learning_rate": 5.184862683933941e-05, + "loss": 1.7204, + "step": 16408 + }, + { + "epoch": 5.036525475751995, + "grad_norm": 0.23717865347862244, + "learning_rate": 5.18436596688146e-05, + "loss": 1.7239, + "step": 16409 + }, + { + "epoch": 5.036832412523021, + "grad_norm": 0.22650085389614105, + "learning_rate": 5.1838692480069686e-05, + "loss": 1.7148, + "step": 16410 + }, + { + "epoch": 5.037139349294045, + "grad_norm": 0.25606781244277954, + "learning_rate": 5.183372527315371e-05, + "loss": 1.7916, + "step": 16411 + }, + { + "epoch": 5.03744628606507, + "grad_norm": 0.22266390919685364, + "learning_rate": 5.182875804811581e-05, + "loss": 1.7481, + "step": 16412 + }, + { + "epoch": 5.037753222836096, + "grad_norm": 0.23481780290603638, + "learning_rate": 5.1823790805005045e-05, + "loss": 1.8014, + "step": 16413 + }, + { + "epoch": 5.038060159607121, + "grad_norm": 
0.2629338800907135, + "learning_rate": 5.1818823543870506e-05, + "loss": 1.81, + "step": 16414 + }, + { + "epoch": 5.038367096378146, + "grad_norm": 0.22891482710838318, + "learning_rate": 5.18138562647613e-05, + "loss": 1.757, + "step": 16415 + }, + { + "epoch": 5.038674033149171, + "grad_norm": 0.2666641175746918, + "learning_rate": 5.180888896772649e-05, + "loss": 1.7457, + "step": 16416 + }, + { + "epoch": 5.038980969920196, + "grad_norm": 0.37610310316085815, + "learning_rate": 5.180392165281517e-05, + "loss": 1.8214, + "step": 16417 + }, + { + "epoch": 5.0392879066912215, + "grad_norm": 0.2521277964115143, + "learning_rate": 5.1798954320076455e-05, + "loss": 1.7731, + "step": 16418 + }, + { + "epoch": 5.039594843462247, + "grad_norm": 0.25097090005874634, + "learning_rate": 5.1793986969559415e-05, + "loss": 1.8029, + "step": 16419 + }, + { + "epoch": 5.039901780233272, + "grad_norm": 0.2946726381778717, + "learning_rate": 5.178901960131315e-05, + "loss": 1.7483, + "step": 16420 + }, + { + "epoch": 5.0402087170042975, + "grad_norm": 0.24240419268608093, + "learning_rate": 5.1784052215386736e-05, + "loss": 1.731, + "step": 16421 + }, + { + "epoch": 5.040515653775322, + "grad_norm": 0.2403198480606079, + "learning_rate": 5.177908481182926e-05, + "loss": 1.722, + "step": 16422 + }, + { + "epoch": 5.040822590546347, + "grad_norm": 0.3451874554157257, + "learning_rate": 5.177411739068985e-05, + "loss": 1.7562, + "step": 16423 + }, + { + "epoch": 5.041129527317373, + "grad_norm": 0.3244951069355011, + "learning_rate": 5.176914995201756e-05, + "loss": 1.7321, + "step": 16424 + }, + { + "epoch": 5.041436464088398, + "grad_norm": 0.2346230000257492, + "learning_rate": 5.176418249586149e-05, + "loss": 1.7839, + "step": 16425 + }, + { + "epoch": 5.041743400859423, + "grad_norm": 0.357022225856781, + "learning_rate": 5.1759215022270744e-05, + "loss": 1.7776, + "step": 16426 + }, + { + "epoch": 5.042050337630448, + "grad_norm": 0.259007066488266, + "learning_rate": 
5.17542475312944e-05, + "loss": 1.7544, + "step": 16427 + }, + { + "epoch": 5.042357274401473, + "grad_norm": 0.2516533136367798, + "learning_rate": 5.174928002298154e-05, + "loss": 1.7269, + "step": 16428 + }, + { + "epoch": 5.042664211172498, + "grad_norm": 0.3393619954586029, + "learning_rate": 5.174431249738129e-05, + "loss": 1.7487, + "step": 16429 + }, + { + "epoch": 5.042971147943524, + "grad_norm": 0.2730594873428345, + "learning_rate": 5.1739344954542714e-05, + "loss": 1.7468, + "step": 16430 + }, + { + "epoch": 5.043278084714549, + "grad_norm": 0.21233965456485748, + "learning_rate": 5.1734377394514914e-05, + "loss": 1.783, + "step": 16431 + }, + { + "epoch": 5.043585021485574, + "grad_norm": 0.3460896909236908, + "learning_rate": 5.1729409817346974e-05, + "loss": 1.7497, + "step": 16432 + }, + { + "epoch": 5.043891958256599, + "grad_norm": 0.31918221712112427, + "learning_rate": 5.1724442223088e-05, + "loss": 1.7834, + "step": 16433 + }, + { + "epoch": 5.044198895027624, + "grad_norm": 0.23016802966594696, + "learning_rate": 5.171947461178706e-05, + "loss": 1.7348, + "step": 16434 + }, + { + "epoch": 5.0445058317986495, + "grad_norm": 0.35758304595947266, + "learning_rate": 5.171450698349329e-05, + "loss": 1.7734, + "step": 16435 + }, + { + "epoch": 5.044812768569675, + "grad_norm": 0.279725581407547, + "learning_rate": 5.170953933825574e-05, + "loss": 1.7283, + "step": 16436 + }, + { + "epoch": 5.0451197053407, + "grad_norm": 0.23965120315551758, + "learning_rate": 5.170457167612354e-05, + "loss": 1.7606, + "step": 16437 + }, + { + "epoch": 5.045426642111725, + "grad_norm": 0.28026309609413147, + "learning_rate": 5.169960399714574e-05, + "loss": 1.7872, + "step": 16438 + }, + { + "epoch": 5.04573357888275, + "grad_norm": 0.3262448012828827, + "learning_rate": 5.169463630137146e-05, + "loss": 1.8654, + "step": 16439 + }, + { + "epoch": 5.046040515653775, + "grad_norm": 0.4249584674835205, + "learning_rate": 5.168966858884979e-05, + "loss": 1.7244, + 
"step": 16440 + }, + { + "epoch": 5.046347452424801, + "grad_norm": 0.3385370969772339, + "learning_rate": 5.168470085962984e-05, + "loss": 1.7745, + "step": 16441 + }, + { + "epoch": 5.046654389195826, + "grad_norm": 0.2321811318397522, + "learning_rate": 5.1679733113760675e-05, + "loss": 1.8093, + "step": 16442 + }, + { + "epoch": 5.04696132596685, + "grad_norm": 0.3426755368709564, + "learning_rate": 5.167476535129141e-05, + "loss": 1.7752, + "step": 16443 + }, + { + "epoch": 5.047268262737876, + "grad_norm": 0.27672505378723145, + "learning_rate": 5.166979757227114e-05, + "loss": 1.7619, + "step": 16444 + }, + { + "epoch": 5.047575199508901, + "grad_norm": 0.4111184775829315, + "learning_rate": 5.1664829776748925e-05, + "loss": 1.7672, + "step": 16445 + }, + { + "epoch": 5.047882136279926, + "grad_norm": 0.40139874815940857, + "learning_rate": 5.1659861964773905e-05, + "loss": 1.7753, + "step": 16446 + }, + { + "epoch": 5.048189073050952, + "grad_norm": 0.28931725025177, + "learning_rate": 5.165489413639516e-05, + "loss": 1.7607, + "step": 16447 + }, + { + "epoch": 5.048496009821977, + "grad_norm": 0.297538161277771, + "learning_rate": 5.1649926291661775e-05, + "loss": 1.7661, + "step": 16448 + }, + { + "epoch": 5.0488029465930016, + "grad_norm": 0.4299027621746063, + "learning_rate": 5.1644958430622846e-05, + "loss": 1.6998, + "step": 16449 + }, + { + "epoch": 5.049109883364027, + "grad_norm": 0.2554767429828644, + "learning_rate": 5.163999055332749e-05, + "loss": 1.7716, + "step": 16450 + }, + { + "epoch": 5.049416820135052, + "grad_norm": 0.3561006486415863, + "learning_rate": 5.163502265982477e-05, + "loss": 1.7493, + "step": 16451 + }, + { + "epoch": 5.0497237569060776, + "grad_norm": 0.3839687407016754, + "learning_rate": 5.1630054750163806e-05, + "loss": 1.7314, + "step": 16452 + }, + { + "epoch": 5.050030693677103, + "grad_norm": 0.20022284984588623, + "learning_rate": 5.1625086824393684e-05, + "loss": 1.6992, + "step": 16453 + }, + { + "epoch": 
5.050337630448127, + "grad_norm": 0.36830398440361023, + "learning_rate": 5.162011888256349e-05, + "loss": 1.7339, + "step": 16454 + }, + { + "epoch": 5.050644567219153, + "grad_norm": 0.31947389245033264, + "learning_rate": 5.161515092472236e-05, + "loss": 1.7254, + "step": 16455 + }, + { + "epoch": 5.050951503990178, + "grad_norm": 0.2779252827167511, + "learning_rate": 5.161018295091933e-05, + "loss": 1.7941, + "step": 16456 + }, + { + "epoch": 5.051258440761203, + "grad_norm": 0.3796578347682953, + "learning_rate": 5.160521496120354e-05, + "loss": 1.7389, + "step": 16457 + }, + { + "epoch": 5.051565377532229, + "grad_norm": 0.23569442331790924, + "learning_rate": 5.1600246955624076e-05, + "loss": 1.7149, + "step": 16458 + }, + { + "epoch": 5.051872314303253, + "grad_norm": 0.27342507243156433, + "learning_rate": 5.159527893423004e-05, + "loss": 1.699, + "step": 16459 + }, + { + "epoch": 5.0521792510742785, + "grad_norm": 0.2877296209335327, + "learning_rate": 5.159031089707052e-05, + "loss": 1.7668, + "step": 16460 + }, + { + "epoch": 5.052486187845304, + "grad_norm": 0.21482446789741516, + "learning_rate": 5.1585342844194605e-05, + "loss": 1.7132, + "step": 16461 + }, + { + "epoch": 5.052793124616329, + "grad_norm": 0.23588669300079346, + "learning_rate": 5.158037477565142e-05, + "loss": 1.7267, + "step": 16462 + }, + { + "epoch": 5.0531000613873545, + "grad_norm": 0.20188623666763306, + "learning_rate": 5.157540669149003e-05, + "loss": 1.7486, + "step": 16463 + }, + { + "epoch": 5.05340699815838, + "grad_norm": 0.2012643963098526, + "learning_rate": 5.157043859175955e-05, + "loss": 1.718, + "step": 16464 + }, + { + "epoch": 5.053713934929404, + "grad_norm": 0.23133818805217743, + "learning_rate": 5.156547047650908e-05, + "loss": 1.7892, + "step": 16465 + }, + { + "epoch": 5.05402087170043, + "grad_norm": 0.2524542510509491, + "learning_rate": 5.156050234578771e-05, + "loss": 1.8034, + "step": 16466 + }, + { + "epoch": 5.054327808471455, + "grad_norm": 
0.20992529392242432, + "learning_rate": 5.155553419964454e-05, + "loss": 1.7158, + "step": 16467 + }, + { + "epoch": 5.05463474524248, + "grad_norm": 0.23815447092056274, + "learning_rate": 5.155056603812868e-05, + "loss": 1.7632, + "step": 16468 + }, + { + "epoch": 5.054941682013506, + "grad_norm": 0.3306051790714264, + "learning_rate": 5.1545597861289205e-05, + "loss": 1.7719, + "step": 16469 + }, + { + "epoch": 5.05524861878453, + "grad_norm": 0.287541925907135, + "learning_rate": 5.154062966917523e-05, + "loss": 1.7092, + "step": 16470 + }, + { + "epoch": 5.055555555555555, + "grad_norm": 0.28186658024787903, + "learning_rate": 5.153566146183586e-05, + "loss": 1.8548, + "step": 16471 + }, + { + "epoch": 5.055862492326581, + "grad_norm": 0.3511136472225189, + "learning_rate": 5.153069323932017e-05, + "loss": 1.8029, + "step": 16472 + }, + { + "epoch": 5.056169429097606, + "grad_norm": 0.32083824276924133, + "learning_rate": 5.152572500167728e-05, + "loss": 1.7321, + "step": 16473 + }, + { + "epoch": 5.056476365868631, + "grad_norm": 0.22571051120758057, + "learning_rate": 5.1520756748956265e-05, + "loss": 1.7218, + "step": 16474 + }, + { + "epoch": 5.056783302639656, + "grad_norm": 0.2902646064758301, + "learning_rate": 5.151578848120626e-05, + "loss": 1.7231, + "step": 16475 + }, + { + "epoch": 5.057090239410681, + "grad_norm": 0.20447610318660736, + "learning_rate": 5.1510820198476336e-05, + "loss": 1.6998, + "step": 16476 + }, + { + "epoch": 5.0573971761817065, + "grad_norm": 0.29436638951301575, + "learning_rate": 5.1505851900815606e-05, + "loss": 1.6793, + "step": 16477 + }, + { + "epoch": 5.057704112952732, + "grad_norm": 0.29718565940856934, + "learning_rate": 5.1500883588273164e-05, + "loss": 1.8322, + "step": 16478 + }, + { + "epoch": 5.058011049723757, + "grad_norm": 0.23530519008636475, + "learning_rate": 5.149591526089811e-05, + "loss": 1.7408, + "step": 16479 + }, + { + "epoch": 5.0583179864947825, + "grad_norm": 0.30735042691230774, + 
"learning_rate": 5.1490946918739536e-05, + "loss": 1.7454, + "step": 16480 + }, + { + "epoch": 5.058624923265807, + "grad_norm": 0.26151445508003235, + "learning_rate": 5.148597856184656e-05, + "loss": 1.7728, + "step": 16481 + }, + { + "epoch": 5.058931860036832, + "grad_norm": 0.2657756209373474, + "learning_rate": 5.1481010190268263e-05, + "loss": 1.7905, + "step": 16482 + }, + { + "epoch": 5.059238796807858, + "grad_norm": 0.25418251752853394, + "learning_rate": 5.147604180405376e-05, + "loss": 1.7676, + "step": 16483 + }, + { + "epoch": 5.059545733578883, + "grad_norm": 0.25486254692077637, + "learning_rate": 5.1471073403252154e-05, + "loss": 1.8347, + "step": 16484 + }, + { + "epoch": 5.059852670349908, + "grad_norm": 0.22693100571632385, + "learning_rate": 5.146610498791255e-05, + "loss": 1.7308, + "step": 16485 + }, + { + "epoch": 5.060159607120933, + "grad_norm": 0.22056837379932404, + "learning_rate": 5.146113655808401e-05, + "loss": 1.7158, + "step": 16486 + }, + { + "epoch": 5.060466543891958, + "grad_norm": 0.221246138215065, + "learning_rate": 5.1456168113815685e-05, + "loss": 1.6985, + "step": 16487 + }, + { + "epoch": 5.060773480662983, + "grad_norm": 0.2149408906698227, + "learning_rate": 5.145119965515664e-05, + "loss": 1.716, + "step": 16488 + }, + { + "epoch": 5.061080417434009, + "grad_norm": 0.23958513140678406, + "learning_rate": 5.144623118215599e-05, + "loss": 1.8092, + "step": 16489 + }, + { + "epoch": 5.061387354205034, + "grad_norm": 0.2870621085166931, + "learning_rate": 5.1441262694862836e-05, + "loss": 1.75, + "step": 16490 + }, + { + "epoch": 5.0616942909760585, + "grad_norm": 0.26755061745643616, + "learning_rate": 5.1436294193326276e-05, + "loss": 1.7848, + "step": 16491 + }, + { + "epoch": 5.062001227747084, + "grad_norm": 0.2434249073266983, + "learning_rate": 5.143132567759542e-05, + "loss": 1.7487, + "step": 16492 + }, + { + "epoch": 5.062308164518109, + "grad_norm": 0.3044668138027191, + "learning_rate": 5.142635714771936e-05, 
+ "loss": 1.741, + "step": 16493 + }, + { + "epoch": 5.0626151012891345, + "grad_norm": 0.2166958749294281, + "learning_rate": 5.142138860374721e-05, + "loss": 1.7232, + "step": 16494 + }, + { + "epoch": 5.06292203806016, + "grad_norm": 0.34558552503585815, + "learning_rate": 5.141642004572806e-05, + "loss": 1.7663, + "step": 16495 + }, + { + "epoch": 5.063228974831185, + "grad_norm": 0.330751895904541, + "learning_rate": 5.141145147371102e-05, + "loss": 1.6818, + "step": 16496 + }, + { + "epoch": 5.06353591160221, + "grad_norm": 0.21613973379135132, + "learning_rate": 5.140648288774518e-05, + "loss": 1.7914, + "step": 16497 + }, + { + "epoch": 5.063842848373235, + "grad_norm": 0.32759732007980347, + "learning_rate": 5.140151428787966e-05, + "loss": 1.7543, + "step": 16498 + }, + { + "epoch": 5.06414978514426, + "grad_norm": 0.3180293142795563, + "learning_rate": 5.1396545674163556e-05, + "loss": 1.8163, + "step": 16499 + }, + { + "epoch": 5.064456721915286, + "grad_norm": 0.19757944345474243, + "learning_rate": 5.1391577046645964e-05, + "loss": 1.71, + "step": 16500 + }, + { + "epoch": 5.064763658686311, + "grad_norm": 0.253366619348526, + "learning_rate": 5.1386608405376005e-05, + "loss": 1.7266, + "step": 16501 + }, + { + "epoch": 5.065070595457335, + "grad_norm": 0.24577608704566956, + "learning_rate": 5.1381639750402754e-05, + "loss": 1.7218, + "step": 16502 + }, + { + "epoch": 5.065377532228361, + "grad_norm": 0.22847014665603638, + "learning_rate": 5.137667108177533e-05, + "loss": 1.8025, + "step": 16503 + }, + { + "epoch": 5.065684468999386, + "grad_norm": 0.2089833766222, + "learning_rate": 5.137170239954284e-05, + "loss": 1.8032, + "step": 16504 + }, + { + "epoch": 5.065991405770411, + "grad_norm": 0.21528512239456177, + "learning_rate": 5.136673370375439e-05, + "loss": 1.7227, + "step": 16505 + }, + { + "epoch": 5.066298342541437, + "grad_norm": 0.2099117785692215, + "learning_rate": 5.1361764994459074e-05, + "loss": 1.7176, + "step": 16506 + }, + { + 
"epoch": 5.066605279312462, + "grad_norm": 0.2140430212020874, + "learning_rate": 5.135679627170599e-05, + "loss": 1.8195, + "step": 16507 + }, + { + "epoch": 5.0669122160834865, + "grad_norm": 0.20253533124923706, + "learning_rate": 5.135182753554424e-05, + "loss": 1.7284, + "step": 16508 + }, + { + "epoch": 5.067219152854512, + "grad_norm": 0.19945639371871948, + "learning_rate": 5.134685878602295e-05, + "loss": 1.6915, + "step": 16509 + }, + { + "epoch": 5.067526089625537, + "grad_norm": 0.20138494670391083, + "learning_rate": 5.1341890023191216e-05, + "loss": 1.7856, + "step": 16510 + }, + { + "epoch": 5.0678330263965625, + "grad_norm": 0.22124232351779938, + "learning_rate": 5.1336921247098136e-05, + "loss": 1.7674, + "step": 16511 + }, + { + "epoch": 5.068139963167588, + "grad_norm": 0.21564216911792755, + "learning_rate": 5.133195245779282e-05, + "loss": 1.6998, + "step": 16512 + }, + { + "epoch": 5.068446899938612, + "grad_norm": 0.21836799383163452, + "learning_rate": 5.1326983655324365e-05, + "loss": 1.7468, + "step": 16513 + }, + { + "epoch": 5.068753836709638, + "grad_norm": 0.2412201464176178, + "learning_rate": 5.132201483974187e-05, + "loss": 1.7433, + "step": 16514 + }, + { + "epoch": 5.069060773480663, + "grad_norm": 0.262054979801178, + "learning_rate": 5.131704601109446e-05, + "loss": 1.8315, + "step": 16515 + }, + { + "epoch": 5.069367710251688, + "grad_norm": 0.21573080122470856, + "learning_rate": 5.1312077169431225e-05, + "loss": 1.7668, + "step": 16516 + }, + { + "epoch": 5.069674647022714, + "grad_norm": 0.21407057344913483, + "learning_rate": 5.130710831480129e-05, + "loss": 1.7486, + "step": 16517 + }, + { + "epoch": 5.069981583793738, + "grad_norm": 0.2128407508134842, + "learning_rate": 5.130213944725373e-05, + "loss": 1.7618, + "step": 16518 + }, + { + "epoch": 5.070288520564763, + "grad_norm": 0.2034141719341278, + "learning_rate": 5.129717056683767e-05, + "loss": 1.726, + "step": 16519 + }, + { + "epoch": 5.070595457335789, + 
"grad_norm": 0.21474458277225494, + "learning_rate": 5.1292201673602205e-05, + "loss": 1.7883, + "step": 16520 + }, + { + "epoch": 5.070902394106814, + "grad_norm": 0.2102673202753067, + "learning_rate": 5.128723276759645e-05, + "loss": 1.7826, + "step": 16521 + }, + { + "epoch": 5.071209330877839, + "grad_norm": 0.21342496573925018, + "learning_rate": 5.1282263848869505e-05, + "loss": 1.7561, + "step": 16522 + }, + { + "epoch": 5.071516267648865, + "grad_norm": 0.21749620139598846, + "learning_rate": 5.1277294917470474e-05, + "loss": 1.7814, + "step": 16523 + }, + { + "epoch": 5.071823204419889, + "grad_norm": 0.20006774365901947, + "learning_rate": 5.1272325973448476e-05, + "loss": 1.6965, + "step": 16524 + }, + { + "epoch": 5.0721301411909145, + "grad_norm": 0.20878590643405914, + "learning_rate": 5.1267357016852593e-05, + "loss": 1.7426, + "step": 16525 + }, + { + "epoch": 5.07243707796194, + "grad_norm": 0.21824820339679718, + "learning_rate": 5.1262388047731946e-05, + "loss": 1.7704, + "step": 16526 + }, + { + "epoch": 5.072744014732965, + "grad_norm": 0.1992526650428772, + "learning_rate": 5.125741906613565e-05, + "loss": 1.7874, + "step": 16527 + }, + { + "epoch": 5.0730509515039905, + "grad_norm": 0.21028028428554535, + "learning_rate": 5.12524500721128e-05, + "loss": 1.7483, + "step": 16528 + }, + { + "epoch": 5.073357888275015, + "grad_norm": 0.21840833127498627, + "learning_rate": 5.12474810657125e-05, + "loss": 1.7763, + "step": 16529 + }, + { + "epoch": 5.07366482504604, + "grad_norm": 0.249269038438797, + "learning_rate": 5.124251204698387e-05, + "loss": 1.7451, + "step": 16530 + }, + { + "epoch": 5.073971761817066, + "grad_norm": 0.2176963835954666, + "learning_rate": 5.1237543015975986e-05, + "loss": 1.7079, + "step": 16531 + }, + { + "epoch": 5.074278698588091, + "grad_norm": 0.20284616947174072, + "learning_rate": 5.1232573972738e-05, + "loss": 1.7235, + "step": 16532 + }, + { + "epoch": 5.074585635359116, + "grad_norm": 0.20140530169010162, + 
"learning_rate": 5.1227604917318984e-05, + "loss": 1.7014, + "step": 16533 + }, + { + "epoch": 5.074892572130141, + "grad_norm": 0.2407023161649704, + "learning_rate": 5.1222635849768066e-05, + "loss": 1.7493, + "step": 16534 + }, + { + "epoch": 5.075199508901166, + "grad_norm": 0.2013770490884781, + "learning_rate": 5.121766677013433e-05, + "loss": 1.7601, + "step": 16535 + }, + { + "epoch": 5.0755064456721914, + "grad_norm": 0.23889221251010895, + "learning_rate": 5.1212697678466916e-05, + "loss": 1.7282, + "step": 16536 + }, + { + "epoch": 5.075813382443217, + "grad_norm": 0.2411198765039444, + "learning_rate": 5.120772857481489e-05, + "loss": 1.8138, + "step": 16537 + }, + { + "epoch": 5.076120319214242, + "grad_norm": 0.24521365761756897, + "learning_rate": 5.12027594592274e-05, + "loss": 1.7659, + "step": 16538 + }, + { + "epoch": 5.0764272559852675, + "grad_norm": 0.2841372787952423, + "learning_rate": 5.119779033175354e-05, + "loss": 1.7973, + "step": 16539 + }, + { + "epoch": 5.076734192756292, + "grad_norm": 0.21796928346157074, + "learning_rate": 5.1192821192442395e-05, + "loss": 1.6985, + "step": 16540 + }, + { + "epoch": 5.077041129527317, + "grad_norm": 0.2244848757982254, + "learning_rate": 5.118785204134311e-05, + "loss": 1.7413, + "step": 16541 + }, + { + "epoch": 5.077348066298343, + "grad_norm": 0.22581063210964203, + "learning_rate": 5.1182882878504766e-05, + "loss": 1.7706, + "step": 16542 + }, + { + "epoch": 5.077655003069368, + "grad_norm": 0.24478016793727875, + "learning_rate": 5.117791370397647e-05, + "loss": 1.7628, + "step": 16543 + }, + { + "epoch": 5.077961939840393, + "grad_norm": 0.31270188093185425, + "learning_rate": 5.117294451780734e-05, + "loss": 1.8254, + "step": 16544 + }, + { + "epoch": 5.078268876611418, + "grad_norm": 0.3547368049621582, + "learning_rate": 5.11679753200465e-05, + "loss": 1.781, + "step": 16545 + }, + { + "epoch": 5.078575813382443, + "grad_norm": 0.24920180439949036, + "learning_rate": 
5.116300611074304e-05, + "loss": 1.7748, + "step": 16546 + }, + { + "epoch": 5.078882750153468, + "grad_norm": 0.2368776649236679, + "learning_rate": 5.115803688994607e-05, + "loss": 1.7459, + "step": 16547 + }, + { + "epoch": 5.079189686924494, + "grad_norm": 0.28341975808143616, + "learning_rate": 5.115306765770471e-05, + "loss": 1.6694, + "step": 16548 + }, + { + "epoch": 5.079496623695519, + "grad_norm": 0.2521432936191559, + "learning_rate": 5.114809841406804e-05, + "loss": 1.7544, + "step": 16549 + }, + { + "epoch": 5.0798035604665435, + "grad_norm": 0.21199844777584076, + "learning_rate": 5.11431291590852e-05, + "loss": 1.7215, + "step": 16550 + }, + { + "epoch": 5.080110497237569, + "grad_norm": 0.25157347321510315, + "learning_rate": 5.113815989280528e-05, + "loss": 1.8021, + "step": 16551 + }, + { + "epoch": 5.080417434008594, + "grad_norm": 0.2284129559993744, + "learning_rate": 5.1133190615277414e-05, + "loss": 1.7125, + "step": 16552 + }, + { + "epoch": 5.0807243707796195, + "grad_norm": 0.2297726720571518, + "learning_rate": 5.11282213265507e-05, + "loss": 1.7602, + "step": 16553 + }, + { + "epoch": 5.081031307550645, + "grad_norm": 0.22392617166042328, + "learning_rate": 5.112325202667421e-05, + "loss": 1.7251, + "step": 16554 + }, + { + "epoch": 5.08133824432167, + "grad_norm": 0.22406147420406342, + "learning_rate": 5.11182827156971e-05, + "loss": 1.7232, + "step": 16555 + }, + { + "epoch": 5.081645181092695, + "grad_norm": 0.2547284960746765, + "learning_rate": 5.111331339366846e-05, + "loss": 1.7335, + "step": 16556 + }, + { + "epoch": 5.08195211786372, + "grad_norm": 0.216146782040596, + "learning_rate": 5.1108344060637415e-05, + "loss": 1.7469, + "step": 16557 + }, + { + "epoch": 5.082259054634745, + "grad_norm": 0.1926967352628708, + "learning_rate": 5.110337471665306e-05, + "loss": 1.7492, + "step": 16558 + }, + { + "epoch": 5.082565991405771, + "grad_norm": 0.30311331152915955, + "learning_rate": 5.109840536176451e-05, + "loss": 1.8129, + 
"step": 16559 + }, + { + "epoch": 5.082872928176796, + "grad_norm": 0.24273787438869476, + "learning_rate": 5.109343599602087e-05, + "loss": 1.7206, + "step": 16560 + }, + { + "epoch": 5.08317986494782, + "grad_norm": 0.22736592590808868, + "learning_rate": 5.1088466619471255e-05, + "loss": 1.732, + "step": 16561 + }, + { + "epoch": 5.083486801718846, + "grad_norm": 0.21457640826702118, + "learning_rate": 5.1083497232164777e-05, + "loss": 1.726, + "step": 16562 + }, + { + "epoch": 5.083793738489871, + "grad_norm": 0.20968590676784515, + "learning_rate": 5.107852783415055e-05, + "loss": 1.8095, + "step": 16563 + }, + { + "epoch": 5.084100675260896, + "grad_norm": 0.2846728265285492, + "learning_rate": 5.107355842547768e-05, + "loss": 1.7524, + "step": 16564 + }, + { + "epoch": 5.084407612031922, + "grad_norm": 0.21162885427474976, + "learning_rate": 5.106858900619526e-05, + "loss": 1.753, + "step": 16565 + }, + { + "epoch": 5.084714548802946, + "grad_norm": 0.24349012970924377, + "learning_rate": 5.106361957635242e-05, + "loss": 1.7003, + "step": 16566 + }, + { + "epoch": 5.0850214855739715, + "grad_norm": 0.24532537162303925, + "learning_rate": 5.105865013599828e-05, + "loss": 1.7818, + "step": 16567 + }, + { + "epoch": 5.085328422344997, + "grad_norm": 0.22788558900356293, + "learning_rate": 5.1053680685181926e-05, + "loss": 1.7291, + "step": 16568 + }, + { + "epoch": 5.085635359116022, + "grad_norm": 0.22402508556842804, + "learning_rate": 5.10487112239525e-05, + "loss": 1.8292, + "step": 16569 + }, + { + "epoch": 5.0859422958870475, + "grad_norm": 0.2396162748336792, + "learning_rate": 5.1043741752359085e-05, + "loss": 1.7441, + "step": 16570 + }, + { + "epoch": 5.086249232658073, + "grad_norm": 0.22364887595176697, + "learning_rate": 5.1038772270450796e-05, + "loss": 1.7356, + "step": 16571 + }, + { + "epoch": 5.086556169429097, + "grad_norm": 0.20385414361953735, + "learning_rate": 5.103380277827676e-05, + "loss": 1.774, + "step": 16572 + }, + { + "epoch": 
5.086863106200123, + "grad_norm": 0.2050715535879135, + "learning_rate": 5.102883327588608e-05, + "loss": 1.7217, + "step": 16573 + }, + { + "epoch": 5.087170042971148, + "grad_norm": 0.23750410974025726, + "learning_rate": 5.102386376332786e-05, + "loss": 1.7605, + "step": 16574 + }, + { + "epoch": 5.087476979742173, + "grad_norm": 0.24313338100910187, + "learning_rate": 5.101889424065122e-05, + "loss": 1.7498, + "step": 16575 + }, + { + "epoch": 5.087783916513199, + "grad_norm": 0.22145850956439972, + "learning_rate": 5.101392470790527e-05, + "loss": 1.7827, + "step": 16576 + }, + { + "epoch": 5.088090853284223, + "grad_norm": 0.23073779046535492, + "learning_rate": 5.100895516513912e-05, + "loss": 1.7722, + "step": 16577 + }, + { + "epoch": 5.088397790055248, + "grad_norm": 0.2112295925617218, + "learning_rate": 5.100398561240188e-05, + "loss": 1.7755, + "step": 16578 + }, + { + "epoch": 5.088704726826274, + "grad_norm": 0.23263800144195557, + "learning_rate": 5.0999016049742675e-05, + "loss": 1.7593, + "step": 16579 + }, + { + "epoch": 5.089011663597299, + "grad_norm": 0.23011381924152374, + "learning_rate": 5.09940464772106e-05, + "loss": 1.704, + "step": 16580 + }, + { + "epoch": 5.089318600368324, + "grad_norm": 0.1930779367685318, + "learning_rate": 5.0989076894854785e-05, + "loss": 1.7038, + "step": 16581 + }, + { + "epoch": 5.08962553713935, + "grad_norm": 0.2100505381822586, + "learning_rate": 5.098410730272433e-05, + "loss": 1.7671, + "step": 16582 + }, + { + "epoch": 5.089932473910374, + "grad_norm": 0.1919277459383011, + "learning_rate": 5.097913770086833e-05, + "loss": 1.651, + "step": 16583 + }, + { + "epoch": 5.0902394106813995, + "grad_norm": 0.23310615122318268, + "learning_rate": 5.097416808933594e-05, + "loss": 1.8294, + "step": 16584 + }, + { + "epoch": 5.090546347452425, + "grad_norm": 0.26191771030426025, + "learning_rate": 5.096919846817624e-05, + "loss": 1.7522, + "step": 16585 + }, + { + "epoch": 5.09085328422345, + "grad_norm": 
0.2508419156074524, + "learning_rate": 5.096422883743835e-05, + "loss": 1.8025, + "step": 16586 + }, + { + "epoch": 5.0911602209944755, + "grad_norm": 0.23192499577999115, + "learning_rate": 5.0959259197171414e-05, + "loss": 1.7885, + "step": 16587 + }, + { + "epoch": 5.0914671577655, + "grad_norm": 0.2164602279663086, + "learning_rate": 5.095428954742448e-05, + "loss": 1.7299, + "step": 16588 + }, + { + "epoch": 5.091774094536525, + "grad_norm": 0.21431668102741241, + "learning_rate": 5.094931988824671e-05, + "loss": 1.7122, + "step": 16589 + }, + { + "epoch": 5.092081031307551, + "grad_norm": 0.20563583076000214, + "learning_rate": 5.094435021968722e-05, + "loss": 1.7118, + "step": 16590 + }, + { + "epoch": 5.092387968078576, + "grad_norm": 0.20916326344013214, + "learning_rate": 5.093938054179509e-05, + "loss": 1.7639, + "step": 16591 + }, + { + "epoch": 5.092694904849601, + "grad_norm": 0.21197481453418732, + "learning_rate": 5.0934410854619454e-05, + "loss": 1.7357, + "step": 16592 + }, + { + "epoch": 5.093001841620626, + "grad_norm": 0.21085995435714722, + "learning_rate": 5.092944115820942e-05, + "loss": 1.6921, + "step": 16593 + }, + { + "epoch": 5.093308778391651, + "grad_norm": 0.2608145773410797, + "learning_rate": 5.09244714526141e-05, + "loss": 1.7541, + "step": 16594 + }, + { + "epoch": 5.093615715162676, + "grad_norm": 0.2138587087392807, + "learning_rate": 5.0919501737882624e-05, + "loss": 1.727, + "step": 16595 + }, + { + "epoch": 5.093922651933702, + "grad_norm": 0.230251282453537, + "learning_rate": 5.0914532014064084e-05, + "loss": 1.7828, + "step": 16596 + }, + { + "epoch": 5.094229588704727, + "grad_norm": 0.2162851244211197, + "learning_rate": 5.0909562281207614e-05, + "loss": 1.6905, + "step": 16597 + }, + { + "epoch": 5.094536525475752, + "grad_norm": 0.20637664198875427, + "learning_rate": 5.090459253936231e-05, + "loss": 1.7484, + "step": 16598 + }, + { + "epoch": 5.094843462246777, + "grad_norm": 0.19427815079689026, + "learning_rate": 
5.089962278857728e-05, + "loss": 1.7379, + "step": 16599 + }, + { + "epoch": 5.095150399017802, + "grad_norm": 0.1877593845129013, + "learning_rate": 5.089465302890165e-05, + "loss": 1.7017, + "step": 16600 + }, + { + "epoch": 5.0954573357888275, + "grad_norm": 0.19219037890434265, + "learning_rate": 5.0889683260384543e-05, + "loss": 1.7379, + "step": 16601 + }, + { + "epoch": 5.095764272559853, + "grad_norm": 0.19855685532093048, + "learning_rate": 5.088471348307507e-05, + "loss": 1.7171, + "step": 16602 + }, + { + "epoch": 5.096071209330878, + "grad_norm": 0.19119660556316376, + "learning_rate": 5.087974369702235e-05, + "loss": 1.6912, + "step": 16603 + }, + { + "epoch": 5.096378146101903, + "grad_norm": 0.2102670818567276, + "learning_rate": 5.0874773902275476e-05, + "loss": 1.6825, + "step": 16604 + }, + { + "epoch": 5.096685082872928, + "grad_norm": 0.2120765596628189, + "learning_rate": 5.0869804098883564e-05, + "loss": 1.7055, + "step": 16605 + }, + { + "epoch": 5.096992019643953, + "grad_norm": 0.25874772667884827, + "learning_rate": 5.0864834286895745e-05, + "loss": 1.7193, + "step": 16606 + }, + { + "epoch": 5.097298956414979, + "grad_norm": 0.20822012424468994, + "learning_rate": 5.085986446636113e-05, + "loss": 1.6748, + "step": 16607 + }, + { + "epoch": 5.097605893186004, + "grad_norm": 0.21364718675613403, + "learning_rate": 5.085489463732883e-05, + "loss": 1.7762, + "step": 16608 + }, + { + "epoch": 5.097912829957028, + "grad_norm": 0.21961788833141327, + "learning_rate": 5.084992479984796e-05, + "loss": 1.7243, + "step": 16609 + }, + { + "epoch": 5.098219766728054, + "grad_norm": 0.22056026756763458, + "learning_rate": 5.0844954953967624e-05, + "loss": 1.6983, + "step": 16610 + }, + { + "epoch": 5.098526703499079, + "grad_norm": 0.21347738802433014, + "learning_rate": 5.083998509973695e-05, + "loss": 1.7319, + "step": 16611 + }, + { + "epoch": 5.098833640270104, + "grad_norm": 0.23593664169311523, + "learning_rate": 5.083501523720506e-05, + "loss": 
1.7121, + "step": 16612 + }, + { + "epoch": 5.09914057704113, + "grad_norm": 0.2088623344898224, + "learning_rate": 5.0830045366421055e-05, + "loss": 1.72, + "step": 16613 + }, + { + "epoch": 5.099447513812155, + "grad_norm": 0.2293832004070282, + "learning_rate": 5.082507548743406e-05, + "loss": 1.7548, + "step": 16614 + }, + { + "epoch": 5.0997544505831796, + "grad_norm": 0.2509057819843292, + "learning_rate": 5.082010560029319e-05, + "loss": 1.7729, + "step": 16615 + }, + { + "epoch": 5.100061387354205, + "grad_norm": 0.1925390362739563, + "learning_rate": 5.081513570504755e-05, + "loss": 1.7109, + "step": 16616 + }, + { + "epoch": 5.10036832412523, + "grad_norm": 0.20876559615135193, + "learning_rate": 5.081016580174626e-05, + "loss": 1.7031, + "step": 16617 + }, + { + "epoch": 5.100675260896256, + "grad_norm": 0.2038683146238327, + "learning_rate": 5.080519589043842e-05, + "loss": 1.7489, + "step": 16618 + }, + { + "epoch": 5.100982197667281, + "grad_norm": 0.25018224120140076, + "learning_rate": 5.080022597117318e-05, + "loss": 1.7884, + "step": 16619 + }, + { + "epoch": 5.101289134438305, + "grad_norm": 0.24430342018604279, + "learning_rate": 5.079525604399965e-05, + "loss": 1.7558, + "step": 16620 + }, + { + "epoch": 5.101596071209331, + "grad_norm": 0.22151432931423187, + "learning_rate": 5.079028610896692e-05, + "loss": 1.7543, + "step": 16621 + }, + { + "epoch": 5.101903007980356, + "grad_norm": 0.2313055694103241, + "learning_rate": 5.0785316166124107e-05, + "loss": 1.7755, + "step": 16622 + }, + { + "epoch": 5.102209944751381, + "grad_norm": 0.27405816316604614, + "learning_rate": 5.0780346215520355e-05, + "loss": 1.7006, + "step": 16623 + }, + { + "epoch": 5.102516881522407, + "grad_norm": 0.2209920734167099, + "learning_rate": 5.077537625720476e-05, + "loss": 1.6877, + "step": 16624 + }, + { + "epoch": 5.102823818293431, + "grad_norm": 0.20993784070014954, + "learning_rate": 5.077040629122645e-05, + "loss": 1.7558, + "step": 16625 + }, + { + "epoch": 
5.1031307550644565, + "grad_norm": 0.25554344058036804, + "learning_rate": 5.076543631763453e-05, + "loss": 1.7142, + "step": 16626 + }, + { + "epoch": 5.103437691835482, + "grad_norm": 0.28980588912963867, + "learning_rate": 5.0760466336478116e-05, + "loss": 1.7632, + "step": 16627 + }, + { + "epoch": 5.103744628606507, + "grad_norm": 0.20144744217395782, + "learning_rate": 5.075549634780633e-05, + "loss": 1.7472, + "step": 16628 + }, + { + "epoch": 5.1040515653775325, + "grad_norm": 0.30335596203804016, + "learning_rate": 5.075052635166827e-05, + "loss": 1.7283, + "step": 16629 + }, + { + "epoch": 5.104358502148558, + "grad_norm": 0.3014097213745117, + "learning_rate": 5.074555634811309e-05, + "loss": 1.7273, + "step": 16630 + }, + { + "epoch": 5.104665438919582, + "grad_norm": 0.20123563706874847, + "learning_rate": 5.074058633718988e-05, + "loss": 1.7119, + "step": 16631 + }, + { + "epoch": 5.104972375690608, + "grad_norm": 0.3375137746334076, + "learning_rate": 5.073561631894776e-05, + "loss": 1.7594, + "step": 16632 + }, + { + "epoch": 5.105279312461633, + "grad_norm": 0.3471776247024536, + "learning_rate": 5.0730646293435846e-05, + "loss": 1.729, + "step": 16633 + }, + { + "epoch": 5.105586249232658, + "grad_norm": 0.26405471563339233, + "learning_rate": 5.072567626070327e-05, + "loss": 1.7472, + "step": 16634 + }, + { + "epoch": 5.105893186003684, + "grad_norm": 0.2339334636926651, + "learning_rate": 5.072070622079911e-05, + "loss": 1.7285, + "step": 16635 + }, + { + "epoch": 5.106200122774708, + "grad_norm": 0.26267752051353455, + "learning_rate": 5.0715736173772534e-05, + "loss": 1.7171, + "step": 16636 + }, + { + "epoch": 5.106507059545733, + "grad_norm": 0.22254765033721924, + "learning_rate": 5.0710766119672626e-05, + "loss": 1.7702, + "step": 16637 + }, + { + "epoch": 5.106813996316759, + "grad_norm": 0.2457888424396515, + "learning_rate": 5.070579605854852e-05, + "loss": 1.7987, + "step": 16638 + }, + { + "epoch": 5.107120933087784, + "grad_norm": 
0.24500930309295654, + "learning_rate": 5.070082599044931e-05, + "loss": 1.8103, + "step": 16639 + }, + { + "epoch": 5.107427869858809, + "grad_norm": 0.24446405470371246, + "learning_rate": 5.0695855915424116e-05, + "loss": 1.7058, + "step": 16640 + }, + { + "epoch": 5.107734806629834, + "grad_norm": 0.22352534532546997, + "learning_rate": 5.0690885833522086e-05, + "loss": 1.7503, + "step": 16641 + }, + { + "epoch": 5.108041743400859, + "grad_norm": 0.2308795005083084, + "learning_rate": 5.068591574479231e-05, + "loss": 1.8064, + "step": 16642 + }, + { + "epoch": 5.1083486801718845, + "grad_norm": 0.23804180324077606, + "learning_rate": 5.068094564928392e-05, + "loss": 1.7603, + "step": 16643 + }, + { + "epoch": 5.10865561694291, + "grad_norm": 0.1956508308649063, + "learning_rate": 5.0675975547046016e-05, + "loss": 1.7448, + "step": 16644 + }, + { + "epoch": 5.108962553713935, + "grad_norm": 0.24438725411891937, + "learning_rate": 5.067100543812773e-05, + "loss": 1.7706, + "step": 16645 + }, + { + "epoch": 5.1092694904849605, + "grad_norm": 0.26129621267318726, + "learning_rate": 5.066603532257817e-05, + "loss": 1.7321, + "step": 16646 + }, + { + "epoch": 5.109576427255985, + "grad_norm": 0.2024240493774414, + "learning_rate": 5.066106520044646e-05, + "loss": 1.7033, + "step": 16647 + }, + { + "epoch": 5.10988336402701, + "grad_norm": 0.2096802294254303, + "learning_rate": 5.0656095071781716e-05, + "loss": 1.716, + "step": 16648 + }, + { + "epoch": 5.110190300798036, + "grad_norm": 0.20643317699432373, + "learning_rate": 5.0651124936633054e-05, + "loss": 1.7473, + "step": 16649 + }, + { + "epoch": 5.110497237569061, + "grad_norm": 0.2268853783607483, + "learning_rate": 5.0646154795049604e-05, + "loss": 1.7844, + "step": 16650 + }, + { + "epoch": 5.110804174340086, + "grad_norm": 0.20215095579624176, + "learning_rate": 5.064118464708046e-05, + "loss": 1.7138, + "step": 16651 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.19411569833755493, + 
"learning_rate": 5.063621449277476e-05, + "loss": 1.7526, + "step": 16652 + }, + { + "epoch": 5.111418047882136, + "grad_norm": 0.20199783146381378, + "learning_rate": 5.063124433218161e-05, + "loss": 1.806, + "step": 16653 + }, + { + "epoch": 5.111724984653161, + "grad_norm": 0.23351836204528809, + "learning_rate": 5.0626274165350165e-05, + "loss": 1.7529, + "step": 16654 + }, + { + "epoch": 5.112031921424187, + "grad_norm": 0.21098989248275757, + "learning_rate": 5.062130399232948e-05, + "loss": 1.7647, + "step": 16655 + }, + { + "epoch": 5.112338858195212, + "grad_norm": 0.21959169209003448, + "learning_rate": 5.0616333813168714e-05, + "loss": 1.7462, + "step": 16656 + }, + { + "epoch": 5.112645794966237, + "grad_norm": 0.21173696219921112, + "learning_rate": 5.061136362791696e-05, + "loss": 1.7413, + "step": 16657 + }, + { + "epoch": 5.112952731737262, + "grad_norm": 0.22357577085494995, + "learning_rate": 5.0606393436623365e-05, + "loss": 1.7163, + "step": 16658 + }, + { + "epoch": 5.113259668508287, + "grad_norm": 0.24364936351776123, + "learning_rate": 5.060142323933704e-05, + "loss": 1.8139, + "step": 16659 + }, + { + "epoch": 5.1135666052793125, + "grad_norm": 0.21646073460578918, + "learning_rate": 5.05964530361071e-05, + "loss": 1.741, + "step": 16660 + }, + { + "epoch": 5.113873542050338, + "grad_norm": 0.24261775612831116, + "learning_rate": 5.059148282698265e-05, + "loss": 1.7162, + "step": 16661 + }, + { + "epoch": 5.114180478821363, + "grad_norm": 0.22883281111717224, + "learning_rate": 5.058651261201283e-05, + "loss": 1.7342, + "step": 16662 + }, + { + "epoch": 5.114487415592388, + "grad_norm": 0.2616727352142334, + "learning_rate": 5.058154239124674e-05, + "loss": 1.8054, + "step": 16663 + }, + { + "epoch": 5.114794352363413, + "grad_norm": 0.21293358504772186, + "learning_rate": 5.0576572164733505e-05, + "loss": 1.742, + "step": 16664 + }, + { + "epoch": 5.115101289134438, + "grad_norm": 0.20037685334682465, + "learning_rate": 
5.057160193252225e-05, + "loss": 1.7518, + "step": 16665 + }, + { + "epoch": 5.115408225905464, + "grad_norm": 0.19102689623832703, + "learning_rate": 5.056663169466209e-05, + "loss": 1.6892, + "step": 16666 + }, + { + "epoch": 5.115715162676489, + "grad_norm": 0.22261591255664825, + "learning_rate": 5.056166145120216e-05, + "loss": 1.7744, + "step": 16667 + }, + { + "epoch": 5.116022099447513, + "grad_norm": 0.23966702818870544, + "learning_rate": 5.055669120219154e-05, + "loss": 1.7786, + "step": 16668 + }, + { + "epoch": 5.116329036218539, + "grad_norm": 0.22008271515369415, + "learning_rate": 5.055172094767937e-05, + "loss": 1.7501, + "step": 16669 + }, + { + "epoch": 5.116635972989564, + "grad_norm": 0.21643415093421936, + "learning_rate": 5.054675068771478e-05, + "loss": 1.7548, + "step": 16670 + }, + { + "epoch": 5.116942909760589, + "grad_norm": 0.24661116302013397, + "learning_rate": 5.0541780422346894e-05, + "loss": 1.8117, + "step": 16671 + }, + { + "epoch": 5.117249846531615, + "grad_norm": 0.21393093466758728, + "learning_rate": 5.05368101516248e-05, + "loss": 1.7341, + "step": 16672 + }, + { + "epoch": 5.11755678330264, + "grad_norm": 0.30949896574020386, + "learning_rate": 5.053183987559763e-05, + "loss": 1.7703, + "step": 16673 + }, + { + "epoch": 5.1178637200736645, + "grad_norm": 0.22236786782741547, + "learning_rate": 5.052686959431451e-05, + "loss": 1.719, + "step": 16674 + }, + { + "epoch": 5.11817065684469, + "grad_norm": 0.26826921105384827, + "learning_rate": 5.052189930782455e-05, + "loss": 1.741, + "step": 16675 + }, + { + "epoch": 5.118477593615715, + "grad_norm": 0.2608947455883026, + "learning_rate": 5.051692901617688e-05, + "loss": 1.7062, + "step": 16676 + }, + { + "epoch": 5.1187845303867405, + "grad_norm": 0.20709002017974854, + "learning_rate": 5.051195871942063e-05, + "loss": 1.703, + "step": 16677 + }, + { + "epoch": 5.119091467157766, + "grad_norm": 0.18957734107971191, + "learning_rate": 5.0506988417604885e-05, + "loss": 1.762, 
+ "step": 16678 + }, + { + "epoch": 5.11939840392879, + "grad_norm": 0.21578781306743622, + "learning_rate": 5.050201811077879e-05, + "loss": 1.7167, + "step": 16679 + }, + { + "epoch": 5.119705340699816, + "grad_norm": 0.2253631353378296, + "learning_rate": 5.049704779899145e-05, + "loss": 1.7374, + "step": 16680 + }, + { + "epoch": 5.120012277470841, + "grad_norm": 0.1977664828300476, + "learning_rate": 5.049207748229199e-05, + "loss": 1.7399, + "step": 16681 + }, + { + "epoch": 5.120319214241866, + "grad_norm": 0.2964428663253784, + "learning_rate": 5.048710716072954e-05, + "loss": 1.8359, + "step": 16682 + }, + { + "epoch": 5.120626151012892, + "grad_norm": 0.24788637459278107, + "learning_rate": 5.0482136834353224e-05, + "loss": 1.7593, + "step": 16683 + }, + { + "epoch": 5.120933087783916, + "grad_norm": 0.21537743508815765, + "learning_rate": 5.0477166503212135e-05, + "loss": 1.7472, + "step": 16684 + }, + { + "epoch": 5.121240024554941, + "grad_norm": 0.2055196613073349, + "learning_rate": 5.047219616735541e-05, + "loss": 1.7106, + "step": 16685 + }, + { + "epoch": 5.121546961325967, + "grad_norm": 0.19770687818527222, + "learning_rate": 5.046722582683215e-05, + "loss": 1.6887, + "step": 16686 + }, + { + "epoch": 5.121853898096992, + "grad_norm": 0.20407389104366302, + "learning_rate": 5.046225548169151e-05, + "loss": 1.7412, + "step": 16687 + }, + { + "epoch": 5.122160834868017, + "grad_norm": 0.20153474807739258, + "learning_rate": 5.045728513198259e-05, + "loss": 1.7643, + "step": 16688 + }, + { + "epoch": 5.122467771639043, + "grad_norm": 0.18737752735614777, + "learning_rate": 5.045231477775452e-05, + "loss": 1.763, + "step": 16689 + }, + { + "epoch": 5.122774708410067, + "grad_norm": 0.19790658354759216, + "learning_rate": 5.0447344419056385e-05, + "loss": 1.7446, + "step": 16690 + }, + { + "epoch": 5.1230816451810925, + "grad_norm": 0.21496973931789398, + "learning_rate": 5.0442374055937336e-05, + "loss": 1.7756, + "step": 16691 + }, + { + "epoch": 
5.123388581952118, + "grad_norm": 0.19318655133247375, + "learning_rate": 5.043740368844649e-05, + "loss": 1.7687, + "step": 16692 + }, + { + "epoch": 5.123695518723143, + "grad_norm": 0.2237338423728943, + "learning_rate": 5.0432433316632976e-05, + "loss": 1.7258, + "step": 16693 + }, + { + "epoch": 5.1240024554941686, + "grad_norm": 0.2257162630558014, + "learning_rate": 5.042746294054589e-05, + "loss": 1.7462, + "step": 16694 + }, + { + "epoch": 5.124309392265193, + "grad_norm": 0.25666359066963196, + "learning_rate": 5.0422492560234366e-05, + "loss": 1.7318, + "step": 16695 + }, + { + "epoch": 5.124616329036218, + "grad_norm": 0.2615324556827545, + "learning_rate": 5.0417522175747536e-05, + "loss": 1.7533, + "step": 16696 + }, + { + "epoch": 5.124923265807244, + "grad_norm": 0.2372874766588211, + "learning_rate": 5.0412551787134475e-05, + "loss": 1.7361, + "step": 16697 + }, + { + "epoch": 5.125230202578269, + "grad_norm": 0.25976815819740295, + "learning_rate": 5.040758139444436e-05, + "loss": 1.7542, + "step": 16698 + }, + { + "epoch": 5.125537139349294, + "grad_norm": 0.36173003911972046, + "learning_rate": 5.040261099772629e-05, + "loss": 1.7421, + "step": 16699 + }, + { + "epoch": 5.12584407612032, + "grad_norm": 0.2767728269100189, + "learning_rate": 5.039764059702937e-05, + "loss": 1.7341, + "step": 16700 + }, + { + "epoch": 5.126151012891344, + "grad_norm": 0.20185241103172302, + "learning_rate": 5.039267019240275e-05, + "loss": 1.7068, + "step": 16701 + }, + { + "epoch": 5.1264579496623695, + "grad_norm": 0.26872581243515015, + "learning_rate": 5.0387699783895514e-05, + "loss": 1.7404, + "step": 16702 + }, + { + "epoch": 5.126764886433395, + "grad_norm": 0.2867858111858368, + "learning_rate": 5.038272937155682e-05, + "loss": 1.7702, + "step": 16703 + }, + { + "epoch": 5.12707182320442, + "grad_norm": 0.20939521491527557, + "learning_rate": 5.037775895543574e-05, + "loss": 1.7653, + "step": 16704 + }, + { + "epoch": 5.1273787599754455, + "grad_norm": 
0.2674047648906708, + "learning_rate": 5.037278853558146e-05, + "loss": 1.701, + "step": 16705 + }, + { + "epoch": 5.12768569674647, + "grad_norm": 0.20776906609535217, + "learning_rate": 5.036781811204304e-05, + "loss": 1.7476, + "step": 16706 + }, + { + "epoch": 5.127992633517495, + "grad_norm": 0.2695952355861664, + "learning_rate": 5.036284768486964e-05, + "loss": 1.7206, + "step": 16707 + }, + { + "epoch": 5.128299570288521, + "grad_norm": 0.30661383271217346, + "learning_rate": 5.0357877254110363e-05, + "loss": 1.72, + "step": 16708 + }, + { + "epoch": 5.128606507059546, + "grad_norm": 0.2527785003185272, + "learning_rate": 5.0352906819814316e-05, + "loss": 1.6936, + "step": 16709 + }, + { + "epoch": 5.128913443830571, + "grad_norm": 0.23000696301460266, + "learning_rate": 5.034793638203066e-05, + "loss": 1.7634, + "step": 16710 + }, + { + "epoch": 5.129220380601596, + "grad_norm": 0.33594760298728943, + "learning_rate": 5.0342965940808486e-05, + "loss": 1.6952, + "step": 16711 + }, + { + "epoch": 5.129527317372621, + "grad_norm": 0.22834168374538422, + "learning_rate": 5.033799549619692e-05, + "loss": 1.7537, + "step": 16712 + }, + { + "epoch": 5.129834254143646, + "grad_norm": 0.26585114002227783, + "learning_rate": 5.033302504824509e-05, + "loss": 1.7554, + "step": 16713 + }, + { + "epoch": 5.130141190914672, + "grad_norm": 0.25632211565971375, + "learning_rate": 5.032805459700211e-05, + "loss": 1.8141, + "step": 16714 + }, + { + "epoch": 5.130448127685697, + "grad_norm": 0.256523996591568, + "learning_rate": 5.0323084142517084e-05, + "loss": 1.777, + "step": 16715 + }, + { + "epoch": 5.1307550644567215, + "grad_norm": 0.31409457325935364, + "learning_rate": 5.0318113684839166e-05, + "loss": 1.7414, + "step": 16716 + }, + { + "epoch": 5.131062001227747, + "grad_norm": 0.21156816184520721, + "learning_rate": 5.0313143224017455e-05, + "loss": 1.7397, + "step": 16717 + }, + { + "epoch": 5.131368937998772, + "grad_norm": 0.23596547544002533, + "learning_rate": 
5.030817276010109e-05, + "loss": 1.752, + "step": 16718 + }, + { + "epoch": 5.1316758747697975, + "grad_norm": 0.2587638199329376, + "learning_rate": 5.0303202293139186e-05, + "loss": 1.7645, + "step": 16719 + }, + { + "epoch": 5.131982811540823, + "grad_norm": 0.2006666213274002, + "learning_rate": 5.029823182318084e-05, + "loss": 1.7009, + "step": 16720 + }, + { + "epoch": 5.132289748311848, + "grad_norm": 0.3075694739818573, + "learning_rate": 5.029326135027521e-05, + "loss": 1.749, + "step": 16721 + }, + { + "epoch": 5.132596685082873, + "grad_norm": 0.3116205334663391, + "learning_rate": 5.028829087447139e-05, + "loss": 1.7458, + "step": 16722 + }, + { + "epoch": 5.132903621853898, + "grad_norm": 0.17925913631916046, + "learning_rate": 5.028332039581851e-05, + "loss": 1.6502, + "step": 16723 + }, + { + "epoch": 5.133210558624923, + "grad_norm": 0.21779952943325043, + "learning_rate": 5.0278349914365694e-05, + "loss": 1.7656, + "step": 16724 + }, + { + "epoch": 5.133517495395949, + "grad_norm": 0.20085318386554718, + "learning_rate": 5.027337943016207e-05, + "loss": 1.7662, + "step": 16725 + }, + { + "epoch": 5.133824432166974, + "grad_norm": 0.19975553452968597, + "learning_rate": 5.026840894325673e-05, + "loss": 1.7392, + "step": 16726 + }, + { + "epoch": 5.134131368937998, + "grad_norm": 0.20610745251178741, + "learning_rate": 5.026343845369883e-05, + "loss": 1.7221, + "step": 16727 + }, + { + "epoch": 5.134438305709024, + "grad_norm": 0.21451768279075623, + "learning_rate": 5.025846796153747e-05, + "loss": 1.8381, + "step": 16728 + }, + { + "epoch": 5.134745242480049, + "grad_norm": 0.19518613815307617, + "learning_rate": 5.0253497466821786e-05, + "loss": 1.7483, + "step": 16729 + }, + { + "epoch": 5.135052179251074, + "grad_norm": 0.24284996092319489, + "learning_rate": 5.024852696960088e-05, + "loss": 1.7895, + "step": 16730 + }, + { + "epoch": 5.1353591160221, + "grad_norm": 0.23962461948394775, + "learning_rate": 5.0243556469923905e-05, + "loss": 
1.8468, + "step": 16731 + }, + { + "epoch": 5.135666052793125, + "grad_norm": 0.20455054938793182, + "learning_rate": 5.023858596783993e-05, + "loss": 1.6973, + "step": 16732 + }, + { + "epoch": 5.1359729895641495, + "grad_norm": 0.20629842579364777, + "learning_rate": 5.023361546339813e-05, + "loss": 1.7608, + "step": 16733 + }, + { + "epoch": 5.136279926335175, + "grad_norm": 0.19375818967819214, + "learning_rate": 5.0228644956647606e-05, + "loss": 1.7327, + "step": 16734 + }, + { + "epoch": 5.1365868631062, + "grad_norm": 0.20960548520088196, + "learning_rate": 5.022367444763748e-05, + "loss": 1.7227, + "step": 16735 + }, + { + "epoch": 5.1368937998772255, + "grad_norm": 0.24732786417007446, + "learning_rate": 5.021870393641687e-05, + "loss": 1.8144, + "step": 16736 + }, + { + "epoch": 5.137200736648251, + "grad_norm": 0.22190099954605103, + "learning_rate": 5.021373342303489e-05, + "loss": 1.705, + "step": 16737 + }, + { + "epoch": 5.137507673419275, + "grad_norm": 0.2091664969921112, + "learning_rate": 5.020876290754069e-05, + "loss": 1.7926, + "step": 16738 + }, + { + "epoch": 5.137814610190301, + "grad_norm": 0.22298938035964966, + "learning_rate": 5.020379238998335e-05, + "loss": 1.7782, + "step": 16739 + }, + { + "epoch": 5.138121546961326, + "grad_norm": 0.20843006670475006, + "learning_rate": 5.019882187041203e-05, + "loss": 1.7245, + "step": 16740 + }, + { + "epoch": 5.138428483732351, + "grad_norm": 0.23383544385433197, + "learning_rate": 5.019385134887583e-05, + "loss": 1.6834, + "step": 16741 + }, + { + "epoch": 5.138735420503377, + "grad_norm": 0.3015683889389038, + "learning_rate": 5.018888082542388e-05, + "loss": 1.7636, + "step": 16742 + }, + { + "epoch": 5.139042357274401, + "grad_norm": 0.2253810614347458, + "learning_rate": 5.0183910300105284e-05, + "loss": 1.7375, + "step": 16743 + }, + { + "epoch": 5.139349294045426, + "grad_norm": 0.2064623087644577, + "learning_rate": 5.01789397729692e-05, + "loss": 1.7683, + "step": 16744 + }, + { + 
"epoch": 5.139656230816452, + "grad_norm": 0.2106693685054779, + "learning_rate": 5.0173969244064724e-05, + "loss": 1.7432, + "step": 16745 + }, + { + "epoch": 5.139963167587477, + "grad_norm": 0.19944638013839722, + "learning_rate": 5.016899871344097e-05, + "loss": 1.701, + "step": 16746 + }, + { + "epoch": 5.140270104358502, + "grad_norm": 0.23210744559764862, + "learning_rate": 5.016402818114708e-05, + "loss": 1.8008, + "step": 16747 + }, + { + "epoch": 5.140577041129528, + "grad_norm": 0.26014089584350586, + "learning_rate": 5.015905764723217e-05, + "loss": 1.7131, + "step": 16748 + }, + { + "epoch": 5.140883977900552, + "grad_norm": 0.25526607036590576, + "learning_rate": 5.015408711174535e-05, + "loss": 1.7525, + "step": 16749 + }, + { + "epoch": 5.1411909146715775, + "grad_norm": 0.2092386782169342, + "learning_rate": 5.0149116574735756e-05, + "loss": 1.7502, + "step": 16750 + }, + { + "epoch": 5.141497851442603, + "grad_norm": 0.21560105681419373, + "learning_rate": 5.01441460362525e-05, + "loss": 1.7903, + "step": 16751 + }, + { + "epoch": 5.141804788213628, + "grad_norm": 0.23538467288017273, + "learning_rate": 5.013917549634471e-05, + "loss": 1.6995, + "step": 16752 + }, + { + "epoch": 5.1421117249846535, + "grad_norm": 0.26545262336730957, + "learning_rate": 5.0134204955061526e-05, + "loss": 1.7511, + "step": 16753 + }, + { + "epoch": 5.142418661755678, + "grad_norm": 0.23030948638916016, + "learning_rate": 5.012923441245203e-05, + "loss": 1.7271, + "step": 16754 + }, + { + "epoch": 5.142725598526703, + "grad_norm": 0.22395408153533936, + "learning_rate": 5.012426386856537e-05, + "loss": 1.7273, + "step": 16755 + }, + { + "epoch": 5.143032535297729, + "grad_norm": 0.21355997025966644, + "learning_rate": 5.011929332345066e-05, + "loss": 1.7347, + "step": 16756 + }, + { + "epoch": 5.143339472068754, + "grad_norm": 0.2355809509754181, + "learning_rate": 5.011432277715702e-05, + "loss": 1.8289, + "step": 16757 + }, + { + "epoch": 5.143646408839779, + 
"grad_norm": 0.24319802224636078, + "learning_rate": 5.0109352229733584e-05, + "loss": 1.7621, + "step": 16758 + }, + { + "epoch": 5.143953345610804, + "grad_norm": 0.2591453492641449, + "learning_rate": 5.010438168122946e-05, + "loss": 1.8043, + "step": 16759 + }, + { + "epoch": 5.144260282381829, + "grad_norm": 0.22595751285552979, + "learning_rate": 5.009941113169376e-05, + "loss": 1.8137, + "step": 16760 + }, + { + "epoch": 5.144567219152854, + "grad_norm": 0.220921128988266, + "learning_rate": 5.009444058117564e-05, + "loss": 1.7105, + "step": 16761 + }, + { + "epoch": 5.14487415592388, + "grad_norm": 0.25713789463043213, + "learning_rate": 5.0089470029724195e-05, + "loss": 1.8184, + "step": 16762 + }, + { + "epoch": 5.145181092694905, + "grad_norm": 0.19849328696727753, + "learning_rate": 5.008449947738856e-05, + "loss": 1.7331, + "step": 16763 + }, + { + "epoch": 5.14548802946593, + "grad_norm": 0.2073405385017395, + "learning_rate": 5.007952892421785e-05, + "loss": 1.7053, + "step": 16764 + }, + { + "epoch": 5.145794966236955, + "grad_norm": 0.22307951748371124, + "learning_rate": 5.007455837026119e-05, + "loss": 1.7724, + "step": 16765 + }, + { + "epoch": 5.14610190300798, + "grad_norm": 0.22160649299621582, + "learning_rate": 5.006958781556769e-05, + "loss": 1.7191, + "step": 16766 + }, + { + "epoch": 5.1464088397790055, + "grad_norm": 0.2202252298593521, + "learning_rate": 5.0064617260186487e-05, + "loss": 1.7339, + "step": 16767 + }, + { + "epoch": 5.146715776550031, + "grad_norm": 0.23693829774856567, + "learning_rate": 5.005964670416671e-05, + "loss": 1.7143, + "step": 16768 + }, + { + "epoch": 5.147022713321056, + "grad_norm": 0.22675764560699463, + "learning_rate": 5.005467614755746e-05, + "loss": 1.7913, + "step": 16769 + }, + { + "epoch": 5.147329650092081, + "grad_norm": 0.21288467943668365, + "learning_rate": 5.0049705590407866e-05, + "loss": 1.7581, + "step": 16770 + }, + { + "epoch": 5.147636586863106, + "grad_norm": 0.216839998960495, + 
"learning_rate": 5.0044735032767064e-05, + "loss": 1.7305, + "step": 16771 + }, + { + "epoch": 5.147943523634131, + "grad_norm": 0.2111063450574875, + "learning_rate": 5.003976447468416e-05, + "loss": 1.7444, + "step": 16772 + }, + { + "epoch": 5.148250460405157, + "grad_norm": 0.2536773085594177, + "learning_rate": 5.003479391620827e-05, + "loss": 1.6952, + "step": 16773 + }, + { + "epoch": 5.148557397176182, + "grad_norm": 0.23585477471351624, + "learning_rate": 5.002982335738854e-05, + "loss": 1.6921, + "step": 16774 + }, + { + "epoch": 5.148864333947207, + "grad_norm": 0.1927027702331543, + "learning_rate": 5.002485279827407e-05, + "loss": 1.7781, + "step": 16775 + }, + { + "epoch": 5.149171270718232, + "grad_norm": 0.22545355558395386, + "learning_rate": 5.001988223891399e-05, + "loss": 1.7582, + "step": 16776 + }, + { + "epoch": 5.149478207489257, + "grad_norm": 0.20837660133838654, + "learning_rate": 5.001491167935741e-05, + "loss": 1.7379, + "step": 16777 + }, + { + "epoch": 5.149785144260282, + "grad_norm": 0.20510734617710114, + "learning_rate": 5.000994111965348e-05, + "loss": 1.7568, + "step": 16778 + }, + { + "epoch": 5.150092081031308, + "grad_norm": 0.2629711329936981, + "learning_rate": 5.00049705598513e-05, + "loss": 1.7613, + "step": 16779 + }, + { + "epoch": 5.150399017802333, + "grad_norm": 0.2390555888414383, + "learning_rate": 5e-05, + "loss": 1.7099, + "step": 16780 + }, + { + "epoch": 5.150705954573358, + "grad_norm": 0.19643893837928772, + "learning_rate": 4.9995029440148715e-05, + "loss": 1.7012, + "step": 16781 + }, + { + "epoch": 5.151012891344383, + "grad_norm": 0.1881607472896576, + "learning_rate": 4.999005888034653e-05, + "loss": 1.705, + "step": 16782 + }, + { + "epoch": 5.151319828115408, + "grad_norm": 0.3219485282897949, + "learning_rate": 4.99850883206426e-05, + "loss": 1.8089, + "step": 16783 + }, + { + "epoch": 5.151626764886434, + "grad_norm": 0.22285562753677368, + "learning_rate": 4.998011776108602e-05, + "loss": 1.7343, + 
"step": 16784 + }, + { + "epoch": 5.151933701657459, + "grad_norm": 0.1981910616159439, + "learning_rate": 4.9975147201725955e-05, + "loss": 1.6939, + "step": 16785 + }, + { + "epoch": 5.152240638428483, + "grad_norm": 0.2338661551475525, + "learning_rate": 4.997017664261148e-05, + "loss": 1.6833, + "step": 16786 + }, + { + "epoch": 5.152547575199509, + "grad_norm": 0.2613268792629242, + "learning_rate": 4.996520608379175e-05, + "loss": 1.7251, + "step": 16787 + }, + { + "epoch": 5.152854511970534, + "grad_norm": 0.26063668727874756, + "learning_rate": 4.996023552531586e-05, + "loss": 1.8444, + "step": 16788 + }, + { + "epoch": 5.153161448741559, + "grad_norm": 0.2711321711540222, + "learning_rate": 4.9955264967232954e-05, + "loss": 1.7257, + "step": 16789 + }, + { + "epoch": 5.153468385512585, + "grad_norm": 0.30134227871894836, + "learning_rate": 4.995029440959213e-05, + "loss": 1.7599, + "step": 16790 + }, + { + "epoch": 5.153775322283609, + "grad_norm": 0.22983741760253906, + "learning_rate": 4.994532385244255e-05, + "loss": 1.7944, + "step": 16791 + }, + { + "epoch": 5.1540822590546345, + "grad_norm": 0.2992973327636719, + "learning_rate": 4.994035329583329e-05, + "loss": 1.7507, + "step": 16792 + }, + { + "epoch": 5.15438919582566, + "grad_norm": 0.2659669518470764, + "learning_rate": 4.993538273981352e-05, + "loss": 1.7246, + "step": 16793 + }, + { + "epoch": 5.154696132596685, + "grad_norm": 0.24235470592975616, + "learning_rate": 4.9930412184432315e-05, + "loss": 1.8378, + "step": 16794 + }, + { + "epoch": 5.1550030693677105, + "grad_norm": 0.30005061626434326, + "learning_rate": 4.992544162973882e-05, + "loss": 1.7526, + "step": 16795 + }, + { + "epoch": 5.155310006138736, + "grad_norm": 0.2183740884065628, + "learning_rate": 4.992047107578215e-05, + "loss": 1.7197, + "step": 16796 + }, + { + "epoch": 5.15561694290976, + "grad_norm": 0.35874706506729126, + "learning_rate": 4.991550052261145e-05, + "loss": 1.8196, + "step": 16797 + }, + { + "epoch": 
5.155923879680786, + "grad_norm": 0.42146921157836914, + "learning_rate": 4.991052997027583e-05, + "loss": 1.7165, + "step": 16798 + }, + { + "epoch": 5.156230816451811, + "grad_norm": 0.2738321125507355, + "learning_rate": 4.990555941882437e-05, + "loss": 1.7042, + "step": 16799 + }, + { + "epoch": 5.156537753222836, + "grad_norm": 0.26304566860198975, + "learning_rate": 4.990058886830625e-05, + "loss": 1.7551, + "step": 16800 + }, + { + "epoch": 5.156844689993862, + "grad_norm": 0.4301520586013794, + "learning_rate": 4.9895618318770556e-05, + "loss": 1.7219, + "step": 16801 + }, + { + "epoch": 5.157151626764886, + "grad_norm": 0.3316499590873718, + "learning_rate": 4.989064777026644e-05, + "loss": 1.8034, + "step": 16802 + }, + { + "epoch": 5.157458563535911, + "grad_norm": 0.30105581879615784, + "learning_rate": 4.9885677222842984e-05, + "loss": 1.7022, + "step": 16803 + }, + { + "epoch": 5.157765500306937, + "grad_norm": 0.3830905854701996, + "learning_rate": 4.988070667654937e-05, + "loss": 1.7898, + "step": 16804 + }, + { + "epoch": 5.158072437077962, + "grad_norm": 0.2204640656709671, + "learning_rate": 4.9875736131434644e-05, + "loss": 1.7081, + "step": 16805 + }, + { + "epoch": 5.158379373848987, + "grad_norm": 0.3620772063732147, + "learning_rate": 4.9870765587547976e-05, + "loss": 1.7345, + "step": 16806 + }, + { + "epoch": 5.158686310620013, + "grad_norm": 0.3268207907676697, + "learning_rate": 4.986579504493848e-05, + "loss": 1.7364, + "step": 16807 + }, + { + "epoch": 5.158993247391037, + "grad_norm": 0.2499808967113495, + "learning_rate": 4.986082450365529e-05, + "loss": 1.7836, + "step": 16808 + }, + { + "epoch": 5.1593001841620625, + "grad_norm": 0.3696226477622986, + "learning_rate": 4.98558539637475e-05, + "loss": 1.8094, + "step": 16809 + }, + { + "epoch": 5.159607120933088, + "grad_norm": 0.3239068388938904, + "learning_rate": 4.9850883425264256e-05, + "loss": 1.7448, + "step": 16810 + }, + { + "epoch": 5.159914057704113, + "grad_norm": 
0.19875772297382355, + "learning_rate": 4.9845912888254655e-05, + "loss": 1.6945, + "step": 16811 + }, + { + "epoch": 5.1602209944751385, + "grad_norm": 0.3952203691005707, + "learning_rate": 4.984094235276784e-05, + "loss": 1.8457, + "step": 16812 + }, + { + "epoch": 5.160527931246163, + "grad_norm": 0.3052334785461426, + "learning_rate": 4.9835971818852916e-05, + "loss": 1.7371, + "step": 16813 + }, + { + "epoch": 5.160834868017188, + "grad_norm": 0.2874486446380615, + "learning_rate": 4.983100128655904e-05, + "loss": 1.7194, + "step": 16814 + }, + { + "epoch": 5.161141804788214, + "grad_norm": 0.39117491245269775, + "learning_rate": 4.98260307559353e-05, + "loss": 1.7919, + "step": 16815 + }, + { + "epoch": 5.161448741559239, + "grad_norm": 0.2532150149345398, + "learning_rate": 4.982106022703081e-05, + "loss": 1.8103, + "step": 16816 + }, + { + "epoch": 5.161755678330264, + "grad_norm": 0.3545167148113251, + "learning_rate": 4.981608969989473e-05, + "loss": 1.8093, + "step": 16817 + }, + { + "epoch": 5.162062615101289, + "grad_norm": 0.397806316614151, + "learning_rate": 4.981111917457613e-05, + "loss": 1.7885, + "step": 16818 + }, + { + "epoch": 5.162369551872314, + "grad_norm": 0.2523536682128906, + "learning_rate": 4.980614865112419e-05, + "loss": 1.797, + "step": 16819 + }, + { + "epoch": 5.162676488643339, + "grad_norm": 0.3666839301586151, + "learning_rate": 4.980117812958798e-05, + "loss": 1.7859, + "step": 16820 + }, + { + "epoch": 5.162983425414365, + "grad_norm": 0.3392138183116913, + "learning_rate": 4.9796207610016664e-05, + "loss": 1.7717, + "step": 16821 + }, + { + "epoch": 5.16329036218539, + "grad_norm": 0.21040666103363037, + "learning_rate": 4.9791237092459325e-05, + "loss": 1.7447, + "step": 16822 + }, + { + "epoch": 5.163597298956415, + "grad_norm": 0.3140225112438202, + "learning_rate": 4.978626657696512e-05, + "loss": 1.7405, + "step": 16823 + }, + { + "epoch": 5.16390423572744, + "grad_norm": 0.23963581025600433, + "learning_rate": 
4.978129606358313e-05, + "loss": 1.7041, + "step": 16824 + }, + { + "epoch": 5.164211172498465, + "grad_norm": 0.32476937770843506, + "learning_rate": 4.977632555236253e-05, + "loss": 1.736, + "step": 16825 + }, + { + "epoch": 5.1645181092694905, + "grad_norm": 0.4362463653087616, + "learning_rate": 4.977135504335239e-05, + "loss": 1.7657, + "step": 16826 + }, + { + "epoch": 5.164825046040516, + "grad_norm": 0.26118260622024536, + "learning_rate": 4.976638453660188e-05, + "loss": 1.7339, + "step": 16827 + }, + { + "epoch": 5.165131982811541, + "grad_norm": 0.27284330129623413, + "learning_rate": 4.9761414032160065e-05, + "loss": 1.8086, + "step": 16828 + }, + { + "epoch": 5.165438919582566, + "grad_norm": 0.2942579388618469, + "learning_rate": 4.975644353007611e-05, + "loss": 1.7869, + "step": 16829 + }, + { + "epoch": 5.165745856353591, + "grad_norm": 0.23257993161678314, + "learning_rate": 4.975147303039912e-05, + "loss": 1.8048, + "step": 16830 + }, + { + "epoch": 5.166052793124616, + "grad_norm": 0.28638842701911926, + "learning_rate": 4.9746502533178225e-05, + "loss": 1.7744, + "step": 16831 + }, + { + "epoch": 5.166359729895642, + "grad_norm": 0.21571335196495056, + "learning_rate": 4.974153203846255e-05, + "loss": 1.7842, + "step": 16832 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.268883615732193, + "learning_rate": 4.9736561546301185e-05, + "loss": 1.7194, + "step": 16833 + }, + { + "epoch": 5.166973603437691, + "grad_norm": 0.22934168577194214, + "learning_rate": 4.9731591056743285e-05, + "loss": 1.757, + "step": 16834 + }, + { + "epoch": 5.167280540208717, + "grad_norm": 0.26321718096733093, + "learning_rate": 4.9726620569837946e-05, + "loss": 1.7675, + "step": 16835 + }, + { + "epoch": 5.167587476979742, + "grad_norm": 0.2893882393836975, + "learning_rate": 4.9721650085634325e-05, + "loss": 1.7134, + "step": 16836 + }, + { + "epoch": 5.167894413750767, + "grad_norm": 0.24130617082118988, + "learning_rate": 4.97166796041815e-05, + "loss": 
1.7119, + "step": 16837 + }, + { + "epoch": 5.168201350521793, + "grad_norm": 0.23614190518856049, + "learning_rate": 4.9711709125528635e-05, + "loss": 1.7556, + "step": 16838 + }, + { + "epoch": 5.168508287292818, + "grad_norm": 0.2031065821647644, + "learning_rate": 4.97067386497248e-05, + "loss": 1.7678, + "step": 16839 + }, + { + "epoch": 5.1688152240638425, + "grad_norm": 0.30695948004722595, + "learning_rate": 4.970176817681917e-05, + "loss": 1.7907, + "step": 16840 + }, + { + "epoch": 5.169122160834868, + "grad_norm": 0.31256723403930664, + "learning_rate": 4.969679770686082e-05, + "loss": 1.7448, + "step": 16841 + }, + { + "epoch": 5.169429097605893, + "grad_norm": 0.24183644354343414, + "learning_rate": 4.969182723989892e-05, + "loss": 1.7259, + "step": 16842 + }, + { + "epoch": 5.1697360343769185, + "grad_norm": 0.22440548241138458, + "learning_rate": 4.9686856775982536e-05, + "loss": 1.7949, + "step": 16843 + }, + { + "epoch": 5.170042971147944, + "grad_norm": 0.29006195068359375, + "learning_rate": 4.9681886315160846e-05, + "loss": 1.7128, + "step": 16844 + }, + { + "epoch": 5.170349907918968, + "grad_norm": 0.2189658135175705, + "learning_rate": 4.967691585748292e-05, + "loss": 1.7375, + "step": 16845 + }, + { + "epoch": 5.170656844689994, + "grad_norm": 0.289909690618515, + "learning_rate": 4.967194540299791e-05, + "loss": 1.779, + "step": 16846 + }, + { + "epoch": 5.170963781461019, + "grad_norm": 0.28279590606689453, + "learning_rate": 4.966697495175492e-05, + "loss": 1.7368, + "step": 16847 + }, + { + "epoch": 5.171270718232044, + "grad_norm": 0.2056259959936142, + "learning_rate": 4.966200450380309e-05, + "loss": 1.7548, + "step": 16848 + }, + { + "epoch": 5.17157765500307, + "grad_norm": 0.2607482969760895, + "learning_rate": 4.965703405919154e-05, + "loss": 1.7178, + "step": 16849 + }, + { + "epoch": 5.171884591774095, + "grad_norm": 0.26085609197616577, + "learning_rate": 4.965206361796935e-05, + "loss": 1.751, + "step": 16850 + }, + { + 
"epoch": 5.172191528545119, + "grad_norm": 0.17960335314273834, + "learning_rate": 4.964709318018569e-05, + "loss": 1.6932, + "step": 16851 + }, + { + "epoch": 5.172498465316145, + "grad_norm": 0.2617340385913849, + "learning_rate": 4.964212274588965e-05, + "loss": 1.7753, + "step": 16852 + }, + { + "epoch": 5.17280540208717, + "grad_norm": 0.2454555630683899, + "learning_rate": 4.9637152315130383e-05, + "loss": 1.7587, + "step": 16853 + }, + { + "epoch": 5.173112338858195, + "grad_norm": 0.19221605360507965, + "learning_rate": 4.963218188795696e-05, + "loss": 1.7337, + "step": 16854 + }, + { + "epoch": 5.173419275629221, + "grad_norm": 0.24314738810062408, + "learning_rate": 4.9627211464418565e-05, + "loss": 1.725, + "step": 16855 + }, + { + "epoch": 5.173726212400245, + "grad_norm": 0.2533986568450928, + "learning_rate": 4.962224104456426e-05, + "loss": 1.7502, + "step": 16856 + }, + { + "epoch": 5.1740331491712706, + "grad_norm": 0.21800079941749573, + "learning_rate": 4.9617270628443195e-05, + "loss": 1.7622, + "step": 16857 + }, + { + "epoch": 5.174340085942296, + "grad_norm": 0.22742362320423126, + "learning_rate": 4.96123002161045e-05, + "loss": 1.7078, + "step": 16858 + }, + { + "epoch": 5.174647022713321, + "grad_norm": 0.22729982435703278, + "learning_rate": 4.960732980759727e-05, + "loss": 1.8349, + "step": 16859 + }, + { + "epoch": 5.1749539594843466, + "grad_norm": 0.28869518637657166, + "learning_rate": 4.9602359402970625e-05, + "loss": 1.8932, + "step": 16860 + }, + { + "epoch": 5.175260896255371, + "grad_norm": 0.21931354701519012, + "learning_rate": 4.9597389002273725e-05, + "loss": 1.6989, + "step": 16861 + }, + { + "epoch": 5.175567833026396, + "grad_norm": 0.2130192667245865, + "learning_rate": 4.959241860555564e-05, + "loss": 1.752, + "step": 16862 + }, + { + "epoch": 5.175874769797422, + "grad_norm": 0.21272781491279602, + "learning_rate": 4.958744821286553e-05, + "loss": 1.7402, + "step": 16863 + }, + { + "epoch": 5.176181706568447, + 
"grad_norm": 0.20279285311698914, + "learning_rate": 4.958247782425248e-05, + "loss": 1.7103, + "step": 16864 + }, + { + "epoch": 5.176488643339472, + "grad_norm": 0.23561790585517883, + "learning_rate": 4.957750743976564e-05, + "loss": 1.7742, + "step": 16865 + }, + { + "epoch": 5.176795580110497, + "grad_norm": 0.27608510851860046, + "learning_rate": 4.957253705945413e-05, + "loss": 1.7505, + "step": 16866 + }, + { + "epoch": 5.177102516881522, + "grad_norm": 0.20624001324176788, + "learning_rate": 4.956756668336704e-05, + "loss": 1.7032, + "step": 16867 + }, + { + "epoch": 5.1774094536525475, + "grad_norm": 0.23743939399719238, + "learning_rate": 4.956259631155352e-05, + "loss": 1.7469, + "step": 16868 + }, + { + "epoch": 5.177716390423573, + "grad_norm": 0.27421119809150696, + "learning_rate": 4.9557625944062675e-05, + "loss": 1.7028, + "step": 16869 + }, + { + "epoch": 5.178023327194598, + "grad_norm": 0.23788046836853027, + "learning_rate": 4.955265558094363e-05, + "loss": 1.7468, + "step": 16870 + }, + { + "epoch": 5.1783302639656235, + "grad_norm": 0.24712958931922913, + "learning_rate": 4.95476852222455e-05, + "loss": 1.7348, + "step": 16871 + }, + { + "epoch": 5.178637200736648, + "grad_norm": 0.21558570861816406, + "learning_rate": 4.9542714868017424e-05, + "loss": 1.7599, + "step": 16872 + }, + { + "epoch": 5.178944137507673, + "grad_norm": 0.2561664283275604, + "learning_rate": 4.953774451830849e-05, + "loss": 1.7673, + "step": 16873 + }, + { + "epoch": 5.179251074278699, + "grad_norm": 0.19761815667152405, + "learning_rate": 4.953277417316786e-05, + "loss": 1.743, + "step": 16874 + }, + { + "epoch": 5.179558011049724, + "grad_norm": 0.24140769243240356, + "learning_rate": 4.95278038326446e-05, + "loss": 1.8229, + "step": 16875 + }, + { + "epoch": 5.179864947820749, + "grad_norm": 0.21686211228370667, + "learning_rate": 4.9522833496787876e-05, + "loss": 1.7914, + "step": 16876 + }, + { + "epoch": 5.180171884591774, + "grad_norm": 0.2537819743156433, + 
"learning_rate": 4.951786316564678e-05, + "loss": 1.7532, + "step": 16877 + }, + { + "epoch": 5.180478821362799, + "grad_norm": 0.24567632377147675, + "learning_rate": 4.951289283927046e-05, + "loss": 1.7528, + "step": 16878 + }, + { + "epoch": 5.180785758133824, + "grad_norm": 0.1958467960357666, + "learning_rate": 4.9507922517708e-05, + "loss": 1.6922, + "step": 16879 + }, + { + "epoch": 5.18109269490485, + "grad_norm": 0.2012091726064682, + "learning_rate": 4.950295220100857e-05, + "loss": 1.7509, + "step": 16880 + }, + { + "epoch": 5.181399631675875, + "grad_norm": 0.2416311800479889, + "learning_rate": 4.9497981889221226e-05, + "loss": 1.7341, + "step": 16881 + }, + { + "epoch": 5.1817065684469, + "grad_norm": 0.21407842636108398, + "learning_rate": 4.949301158239513e-05, + "loss": 1.7493, + "step": 16882 + }, + { + "epoch": 5.182013505217925, + "grad_norm": 0.2354930192232132, + "learning_rate": 4.94880412805794e-05, + "loss": 1.7726, + "step": 16883 + }, + { + "epoch": 5.18232044198895, + "grad_norm": 0.2168428748846054, + "learning_rate": 4.948307098382313e-05, + "loss": 1.77, + "step": 16884 + }, + { + "epoch": 5.1826273787599755, + "grad_norm": 0.19605880975723267, + "learning_rate": 4.947810069217547e-05, + "loss": 1.7292, + "step": 16885 + }, + { + "epoch": 5.182934315531001, + "grad_norm": 0.23066702485084534, + "learning_rate": 4.947313040568551e-05, + "loss": 1.7265, + "step": 16886 + }, + { + "epoch": 5.183241252302026, + "grad_norm": 0.20139534771442413, + "learning_rate": 4.9468160124402386e-05, + "loss": 1.7443, + "step": 16887 + }, + { + "epoch": 5.183548189073051, + "grad_norm": 0.25097572803497314, + "learning_rate": 4.946318984837521e-05, + "loss": 1.7537, + "step": 16888 + }, + { + "epoch": 5.183855125844076, + "grad_norm": 0.26215067505836487, + "learning_rate": 4.945821957765313e-05, + "loss": 1.8397, + "step": 16889 + }, + { + "epoch": 5.184162062615101, + "grad_norm": 0.22072140872478485, + "learning_rate": 4.9453249312285215e-05, + 
"loss": 1.7052, + "step": 16890 + }, + { + "epoch": 5.184468999386127, + "grad_norm": 0.20372305810451508, + "learning_rate": 4.944827905232064e-05, + "loss": 1.7228, + "step": 16891 + }, + { + "epoch": 5.184775936157152, + "grad_norm": 0.20383495092391968, + "learning_rate": 4.944330879780847e-05, + "loss": 1.7063, + "step": 16892 + }, + { + "epoch": 5.185082872928176, + "grad_norm": 0.1903693675994873, + "learning_rate": 4.943833854879786e-05, + "loss": 1.6435, + "step": 16893 + }, + { + "epoch": 5.185389809699202, + "grad_norm": 0.20357775688171387, + "learning_rate": 4.94333683053379e-05, + "loss": 1.7485, + "step": 16894 + }, + { + "epoch": 5.185696746470227, + "grad_norm": 0.24776104092597961, + "learning_rate": 4.942839806747775e-05, + "loss": 1.718, + "step": 16895 + }, + { + "epoch": 5.186003683241252, + "grad_norm": 0.2455051839351654, + "learning_rate": 4.942342783526649e-05, + "loss": 1.7124, + "step": 16896 + }, + { + "epoch": 5.186310620012278, + "grad_norm": 0.2102014273405075, + "learning_rate": 4.941845760875328e-05, + "loss": 1.7584, + "step": 16897 + }, + { + "epoch": 5.186617556783303, + "grad_norm": 0.2177651822566986, + "learning_rate": 4.941348738798718e-05, + "loss": 1.7019, + "step": 16898 + }, + { + "epoch": 5.1869244935543275, + "grad_norm": 0.21296697854995728, + "learning_rate": 4.9408517173017355e-05, + "loss": 1.7299, + "step": 16899 + }, + { + "epoch": 5.187231430325353, + "grad_norm": 0.23485495150089264, + "learning_rate": 4.940354696389292e-05, + "loss": 1.7271, + "step": 16900 + }, + { + "epoch": 5.187538367096378, + "grad_norm": 0.27287766337394714, + "learning_rate": 4.939857676066297e-05, + "loss": 1.7601, + "step": 16901 + }, + { + "epoch": 5.1878453038674035, + "grad_norm": 0.2060246467590332, + "learning_rate": 4.939360656337665e-05, + "loss": 1.7064, + "step": 16902 + }, + { + "epoch": 5.188152240638429, + "grad_norm": 0.25422418117523193, + "learning_rate": 4.938863637208305e-05, + "loss": 1.7423, + "step": 16903 + }, + { 
+ "epoch": 5.188459177409453, + "grad_norm": 0.2798483669757843, + "learning_rate": 4.9383666186831304e-05, + "loss": 1.7132, + "step": 16904 + }, + { + "epoch": 5.188766114180479, + "grad_norm": 0.23505693674087524, + "learning_rate": 4.9378696007670525e-05, + "loss": 1.7759, + "step": 16905 + }, + { + "epoch": 5.189073050951504, + "grad_norm": 0.23761989176273346, + "learning_rate": 4.937372583464987e-05, + "loss": 1.7076, + "step": 16906 + }, + { + "epoch": 5.189379987722529, + "grad_norm": 0.3005945086479187, + "learning_rate": 4.9368755667818385e-05, + "loss": 1.6957, + "step": 16907 + }, + { + "epoch": 5.189686924493555, + "grad_norm": 0.2502881586551666, + "learning_rate": 4.936378550722525e-05, + "loss": 1.7352, + "step": 16908 + }, + { + "epoch": 5.189993861264579, + "grad_norm": 0.24194179475307465, + "learning_rate": 4.9358815352919544e-05, + "loss": 1.738, + "step": 16909 + }, + { + "epoch": 5.190300798035604, + "grad_norm": 0.27478742599487305, + "learning_rate": 4.935384520495041e-05, + "loss": 1.7118, + "step": 16910 + }, + { + "epoch": 5.19060773480663, + "grad_norm": 0.22327560186386108, + "learning_rate": 4.9348875063366944e-05, + "loss": 1.7697, + "step": 16911 + }, + { + "epoch": 5.190914671577655, + "grad_norm": 0.21844418346881866, + "learning_rate": 4.9343904928218295e-05, + "loss": 1.7733, + "step": 16912 + }, + { + "epoch": 5.19122160834868, + "grad_norm": 0.25267866253852844, + "learning_rate": 4.933893479955354e-05, + "loss": 1.7313, + "step": 16913 + }, + { + "epoch": 5.191528545119706, + "grad_norm": 0.22045068442821503, + "learning_rate": 4.933396467742185e-05, + "loss": 1.7856, + "step": 16914 + }, + { + "epoch": 5.19183548189073, + "grad_norm": 0.22642305493354797, + "learning_rate": 4.932899456187229e-05, + "loss": 1.7326, + "step": 16915 + }, + { + "epoch": 5.1921424186617555, + "grad_norm": 0.20601733028888702, + "learning_rate": 4.9324024452953995e-05, + "loss": 1.7743, + "step": 16916 + }, + { + "epoch": 5.192449355432781, + 
"grad_norm": 0.25580671429634094, + "learning_rate": 4.931905435071611e-05, + "loss": 1.7705, + "step": 16917 + }, + { + "epoch": 5.192756292203806, + "grad_norm": 0.38173142075538635, + "learning_rate": 4.9314084255207706e-05, + "loss": 1.7504, + "step": 16918 + }, + { + "epoch": 5.1930632289748315, + "grad_norm": 0.2254420667886734, + "learning_rate": 4.930911416647794e-05, + "loss": 1.7344, + "step": 16919 + }, + { + "epoch": 5.193370165745856, + "grad_norm": 0.2354312688112259, + "learning_rate": 4.9304144084575896e-05, + "loss": 1.7607, + "step": 16920 + }, + { + "epoch": 5.193677102516881, + "grad_norm": 0.23879510164260864, + "learning_rate": 4.9299174009550716e-05, + "loss": 1.683, + "step": 16921 + }, + { + "epoch": 5.193984039287907, + "grad_norm": 0.228669211268425, + "learning_rate": 4.9294203941451494e-05, + "loss": 1.7776, + "step": 16922 + }, + { + "epoch": 5.194290976058932, + "grad_norm": 0.2266843616962433, + "learning_rate": 4.928923388032739e-05, + "loss": 1.7563, + "step": 16923 + }, + { + "epoch": 5.194597912829957, + "grad_norm": 0.2581404745578766, + "learning_rate": 4.928426382622747e-05, + "loss": 1.8112, + "step": 16924 + }, + { + "epoch": 5.194904849600983, + "grad_norm": 0.25179803371429443, + "learning_rate": 4.92792937792009e-05, + "loss": 1.7661, + "step": 16925 + }, + { + "epoch": 5.195211786372007, + "grad_norm": 0.23408514261245728, + "learning_rate": 4.9274323739296746e-05, + "loss": 1.7618, + "step": 16926 + }, + { + "epoch": 5.195518723143032, + "grad_norm": 0.23110872507095337, + "learning_rate": 4.926935370656416e-05, + "loss": 1.6945, + "step": 16927 + }, + { + "epoch": 5.195825659914058, + "grad_norm": 0.2863025665283203, + "learning_rate": 4.926438368105224e-05, + "loss": 1.8659, + "step": 16928 + }, + { + "epoch": 5.196132596685083, + "grad_norm": 0.2156454175710678, + "learning_rate": 4.925941366281013e-05, + "loss": 1.7281, + "step": 16929 + }, + { + "epoch": 5.196439533456108, + "grad_norm": 0.2338300198316574, + 
"learning_rate": 4.925444365188691e-05, + "loss": 1.7271, + "step": 16930 + }, + { + "epoch": 5.196746470227133, + "grad_norm": 0.21434102952480316, + "learning_rate": 4.924947364833173e-05, + "loss": 1.7342, + "step": 16931 + }, + { + "epoch": 5.197053406998158, + "grad_norm": 0.21619778871536255, + "learning_rate": 4.924450365219369e-05, + "loss": 1.7493, + "step": 16932 + }, + { + "epoch": 5.1973603437691835, + "grad_norm": 0.24532032012939453, + "learning_rate": 4.9239533663521896e-05, + "loss": 1.7707, + "step": 16933 + }, + { + "epoch": 5.197667280540209, + "grad_norm": 0.21795547008514404, + "learning_rate": 4.923456368236549e-05, + "loss": 1.7642, + "step": 16934 + }, + { + "epoch": 5.197974217311234, + "grad_norm": 0.2070101797580719, + "learning_rate": 4.922959370877356e-05, + "loss": 1.7377, + "step": 16935 + }, + { + "epoch": 5.198281154082259, + "grad_norm": 0.22546489536762238, + "learning_rate": 4.9224623742795256e-05, + "loss": 1.7766, + "step": 16936 + }, + { + "epoch": 5.198588090853284, + "grad_norm": 0.20723624527454376, + "learning_rate": 4.921965378447965e-05, + "loss": 1.7316, + "step": 16937 + }, + { + "epoch": 5.198895027624309, + "grad_norm": 0.21870547533035278, + "learning_rate": 4.9214683833875905e-05, + "loss": 1.7653, + "step": 16938 + }, + { + "epoch": 5.199201964395335, + "grad_norm": 0.19606490433216095, + "learning_rate": 4.920971389103309e-05, + "loss": 1.7181, + "step": 16939 + }, + { + "epoch": 5.19950890116636, + "grad_norm": 0.18372730910778046, + "learning_rate": 4.920474395600037e-05, + "loss": 1.7041, + "step": 16940 + }, + { + "epoch": 5.199815837937384, + "grad_norm": 0.22051765024662018, + "learning_rate": 4.919977402882682e-05, + "loss": 1.7172, + "step": 16941 + }, + { + "epoch": 5.20012277470841, + "grad_norm": 0.2135835587978363, + "learning_rate": 4.919480410956159e-05, + "loss": 1.6918, + "step": 16942 + }, + { + "epoch": 5.200429711479435, + "grad_norm": 0.19619768857955933, + "learning_rate": 
4.918983419825376e-05, + "loss": 1.7005, + "step": 16943 + }, + { + "epoch": 5.2007366482504604, + "grad_norm": 0.22726574540138245, + "learning_rate": 4.918486429495246e-05, + "loss": 1.6775, + "step": 16944 + }, + { + "epoch": 5.201043585021486, + "grad_norm": 0.21471361815929413, + "learning_rate": 4.9179894399706815e-05, + "loss": 1.7102, + "step": 16945 + }, + { + "epoch": 5.201350521792511, + "grad_norm": 0.20113740861415863, + "learning_rate": 4.917492451256595e-05, + "loss": 1.7548, + "step": 16946 + }, + { + "epoch": 5.201657458563536, + "grad_norm": 0.2337827831506729, + "learning_rate": 4.916995463357894e-05, + "loss": 1.818, + "step": 16947 + }, + { + "epoch": 5.201964395334561, + "grad_norm": 0.2649554908275604, + "learning_rate": 4.9164984762794955e-05, + "loss": 1.7784, + "step": 16948 + }, + { + "epoch": 5.202271332105586, + "grad_norm": 0.2297617793083191, + "learning_rate": 4.916001490026306e-05, + "loss": 1.7484, + "step": 16949 + }, + { + "epoch": 5.202578268876612, + "grad_norm": 0.20791979134082794, + "learning_rate": 4.915504504603238e-05, + "loss": 1.7164, + "step": 16950 + }, + { + "epoch": 5.202885205647637, + "grad_norm": 0.21769596636295319, + "learning_rate": 4.915007520015207e-05, + "loss": 1.7783, + "step": 16951 + }, + { + "epoch": 5.203192142418661, + "grad_norm": 0.21038469672203064, + "learning_rate": 4.914510536267118e-05, + "loss": 1.6863, + "step": 16952 + }, + { + "epoch": 5.203499079189687, + "grad_norm": 0.20725449919700623, + "learning_rate": 4.914013553363889e-05, + "loss": 1.6855, + "step": 16953 + }, + { + "epoch": 5.203806015960712, + "grad_norm": 0.23879854381084442, + "learning_rate": 4.9135165713104266e-05, + "loss": 1.6986, + "step": 16954 + }, + { + "epoch": 5.204112952731737, + "grad_norm": 0.20515915751457214, + "learning_rate": 4.913019590111645e-05, + "loss": 1.6912, + "step": 16955 + }, + { + "epoch": 5.204419889502763, + "grad_norm": 0.2252528965473175, + "learning_rate": 4.912522609772453e-05, + "loss": 
1.6974, + "step": 16956 + }, + { + "epoch": 5.204726826273788, + "grad_norm": 0.1946130096912384, + "learning_rate": 4.9120256302977665e-05, + "loss": 1.7009, + "step": 16957 + }, + { + "epoch": 5.2050337630448125, + "grad_norm": 0.21323645114898682, + "learning_rate": 4.9115286516924925e-05, + "loss": 1.7746, + "step": 16958 + }, + { + "epoch": 5.205340699815838, + "grad_norm": 0.20721712708473206, + "learning_rate": 4.911031673961546e-05, + "loss": 1.7103, + "step": 16959 + }, + { + "epoch": 5.205647636586863, + "grad_norm": 0.19630689918994904, + "learning_rate": 4.910534697109834e-05, + "loss": 1.7042, + "step": 16960 + }, + { + "epoch": 5.2059545733578885, + "grad_norm": 0.2036786526441574, + "learning_rate": 4.910037721142273e-05, + "loss": 1.7713, + "step": 16961 + }, + { + "epoch": 5.206261510128914, + "grad_norm": 0.20518352091312408, + "learning_rate": 4.9095407460637696e-05, + "loss": 1.7456, + "step": 16962 + }, + { + "epoch": 5.206568446899938, + "grad_norm": 0.199858620762825, + "learning_rate": 4.9090437718792404e-05, + "loss": 1.7598, + "step": 16963 + }, + { + "epoch": 5.206875383670964, + "grad_norm": 0.22860252857208252, + "learning_rate": 4.9085467985935914e-05, + "loss": 1.7947, + "step": 16964 + }, + { + "epoch": 5.207182320441989, + "grad_norm": 0.22179929912090302, + "learning_rate": 4.9080498262117395e-05, + "loss": 1.7537, + "step": 16965 + }, + { + "epoch": 5.207489257213014, + "grad_norm": 0.24737581610679626, + "learning_rate": 4.9075528547385906e-05, + "loss": 1.7932, + "step": 16966 + }, + { + "epoch": 5.20779619398404, + "grad_norm": 0.2653762400150299, + "learning_rate": 4.907055884179059e-05, + "loss": 1.7683, + "step": 16967 + }, + { + "epoch": 5.208103130755064, + "grad_norm": 0.2891876697540283, + "learning_rate": 4.9065589145380564e-05, + "loss": 1.7867, + "step": 16968 + }, + { + "epoch": 5.208410067526089, + "grad_norm": 0.23162086308002472, + "learning_rate": 4.906061945820492e-05, + "loss": 1.7981, + "step": 16969 + }, + { 
+ "epoch": 5.208717004297115, + "grad_norm": 0.2746187150478363, + "learning_rate": 4.9055649780312805e-05, + "loss": 1.7215, + "step": 16970 + }, + { + "epoch": 5.20902394106814, + "grad_norm": 0.3217853605747223, + "learning_rate": 4.905068011175329e-05, + "loss": 1.8027, + "step": 16971 + }, + { + "epoch": 5.209330877839165, + "grad_norm": 0.21517686545848846, + "learning_rate": 4.904571045257553e-05, + "loss": 1.7055, + "step": 16972 + }, + { + "epoch": 5.209637814610191, + "grad_norm": 0.23613709211349487, + "learning_rate": 4.90407408028286e-05, + "loss": 1.751, + "step": 16973 + }, + { + "epoch": 5.209944751381215, + "grad_norm": 0.35093945264816284, + "learning_rate": 4.903577116256165e-05, + "loss": 1.7749, + "step": 16974 + }, + { + "epoch": 5.2102516881522405, + "grad_norm": 0.3289217948913574, + "learning_rate": 4.903080153182376e-05, + "loss": 1.7722, + "step": 16975 + }, + { + "epoch": 5.210558624923266, + "grad_norm": 0.29387256503105164, + "learning_rate": 4.9025831910664074e-05, + "loss": 1.8121, + "step": 16976 + }, + { + "epoch": 5.210865561694291, + "grad_norm": 0.44418805837631226, + "learning_rate": 4.9020862299131664e-05, + "loss": 1.7744, + "step": 16977 + }, + { + "epoch": 5.2111724984653165, + "grad_norm": 0.39242252707481384, + "learning_rate": 4.901589269727568e-05, + "loss": 1.7183, + "step": 16978 + }, + { + "epoch": 5.211479435236341, + "grad_norm": 0.2028690129518509, + "learning_rate": 4.901092310514522e-05, + "loss": 1.7101, + "step": 16979 + }, + { + "epoch": 5.211786372007366, + "grad_norm": 0.4025843143463135, + "learning_rate": 4.900595352278941e-05, + "loss": 1.7545, + "step": 16980 + }, + { + "epoch": 5.212093308778392, + "grad_norm": 0.284568727016449, + "learning_rate": 4.900098395025733e-05, + "loss": 1.7758, + "step": 16981 + }, + { + "epoch": 5.212400245549417, + "grad_norm": 0.2527516484260559, + "learning_rate": 4.899601438759813e-05, + "loss": 1.695, + "step": 16982 + }, + { + "epoch": 5.212707182320442, + 
"grad_norm": 0.3063630759716034, + "learning_rate": 4.89910448348609e-05, + "loss": 1.714, + "step": 16983 + }, + { + "epoch": 5.213014119091467, + "grad_norm": 0.22754468023777008, + "learning_rate": 4.898607529209474e-05, + "loss": 1.8315, + "step": 16984 + }, + { + "epoch": 5.213321055862492, + "grad_norm": 0.29594969749450684, + "learning_rate": 4.89811057593488e-05, + "loss": 1.6669, + "step": 16985 + }, + { + "epoch": 5.213627992633517, + "grad_norm": 0.21486569941043854, + "learning_rate": 4.897613623667215e-05, + "loss": 1.7425, + "step": 16986 + }, + { + "epoch": 5.213934929404543, + "grad_norm": 0.30908775329589844, + "learning_rate": 4.897116672411395e-05, + "loss": 1.7915, + "step": 16987 + }, + { + "epoch": 5.214241866175568, + "grad_norm": 0.23515601456165314, + "learning_rate": 4.896619722172325e-05, + "loss": 1.7226, + "step": 16988 + }, + { + "epoch": 5.214548802946593, + "grad_norm": 0.2847287952899933, + "learning_rate": 4.8961227729549215e-05, + "loss": 1.7641, + "step": 16989 + }, + { + "epoch": 5.214855739717618, + "grad_norm": 0.2986287772655487, + "learning_rate": 4.895625824764092e-05, + "loss": 1.8025, + "step": 16990 + }, + { + "epoch": 5.215162676488643, + "grad_norm": 0.23454971611499786, + "learning_rate": 4.8951288776047514e-05, + "loss": 1.7057, + "step": 16991 + }, + { + "epoch": 5.2154696132596685, + "grad_norm": 0.2578633725643158, + "learning_rate": 4.894631931481807e-05, + "loss": 1.7267, + "step": 16992 + }, + { + "epoch": 5.215776550030694, + "grad_norm": 0.29975566267967224, + "learning_rate": 4.894134986400174e-05, + "loss": 1.7452, + "step": 16993 + }, + { + "epoch": 5.216083486801719, + "grad_norm": 0.22313638031482697, + "learning_rate": 4.893638042364758e-05, + "loss": 1.6917, + "step": 16994 + }, + { + "epoch": 5.216390423572744, + "grad_norm": 0.258297860622406, + "learning_rate": 4.893141099380475e-05, + "loss": 1.7816, + "step": 16995 + }, + { + "epoch": 5.216697360343769, + "grad_norm": 0.2656872272491455, + 
"learning_rate": 4.892644157452233e-05, + "loss": 1.7248, + "step": 16996 + }, + { + "epoch": 5.217004297114794, + "grad_norm": 0.20239698886871338, + "learning_rate": 4.8921472165849464e-05, + "loss": 1.7629, + "step": 16997 + }, + { + "epoch": 5.21731123388582, + "grad_norm": 0.2575492262840271, + "learning_rate": 4.891650276783523e-05, + "loss": 1.719, + "step": 16998 + }, + { + "epoch": 5.217618170656845, + "grad_norm": 0.27563637495040894, + "learning_rate": 4.8911533380528756e-05, + "loss": 1.718, + "step": 16999 + }, + { + "epoch": 5.21792510742787, + "grad_norm": 0.1969723105430603, + "learning_rate": 4.890656400397915e-05, + "loss": 1.7557, + "step": 17000 + }, + { + "epoch": 5.218232044198895, + "grad_norm": 0.24336831271648407, + "learning_rate": 4.89015946382355e-05, + "loss": 1.6861, + "step": 17001 + }, + { + "epoch": 5.21853898096992, + "grad_norm": 0.2804388403892517, + "learning_rate": 4.889662528334696e-05, + "loss": 1.7411, + "step": 17002 + }, + { + "epoch": 5.218845917740945, + "grad_norm": 0.21116352081298828, + "learning_rate": 4.8891655939362596e-05, + "loss": 1.7135, + "step": 17003 + }, + { + "epoch": 5.219152854511971, + "grad_norm": 0.21042904257774353, + "learning_rate": 4.8886686606331556e-05, + "loss": 1.7224, + "step": 17004 + }, + { + "epoch": 5.219459791282996, + "grad_norm": 0.22463755309581757, + "learning_rate": 4.888171728430291e-05, + "loss": 1.8272, + "step": 17005 + }, + { + "epoch": 5.2197667280540205, + "grad_norm": 0.25604158639907837, + "learning_rate": 4.8876747973325805e-05, + "loss": 1.674, + "step": 17006 + }, + { + "epoch": 5.220073664825046, + "grad_norm": 0.3108421564102173, + "learning_rate": 4.887177867344932e-05, + "loss": 1.761, + "step": 17007 + }, + { + "epoch": 5.220380601596071, + "grad_norm": 0.25135359168052673, + "learning_rate": 4.88668093847226e-05, + "loss": 1.7455, + "step": 17008 + }, + { + "epoch": 5.2206875383670965, + "grad_norm": 0.24508307874202728, + "learning_rate": 4.886184010719471e-05, + 
"loss": 1.7632, + "step": 17009 + }, + { + "epoch": 5.220994475138122, + "grad_norm": 0.26777148246765137, + "learning_rate": 4.8856870840914816e-05, + "loss": 1.7814, + "step": 17010 + }, + { + "epoch": 5.221301411909146, + "grad_norm": 0.22404739260673523, + "learning_rate": 4.8851901585931967e-05, + "loss": 1.7441, + "step": 17011 + }, + { + "epoch": 5.221608348680172, + "grad_norm": 0.2406606674194336, + "learning_rate": 4.884693234229531e-05, + "loss": 1.7789, + "step": 17012 + }, + { + "epoch": 5.221915285451197, + "grad_norm": 0.27320384979248047, + "learning_rate": 4.884196311005394e-05, + "loss": 1.8046, + "step": 17013 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.3393586277961731, + "learning_rate": 4.8836993889256965e-05, + "loss": 1.7155, + "step": 17014 + }, + { + "epoch": 5.222529158993248, + "grad_norm": 0.3069504499435425, + "learning_rate": 4.88320246799535e-05, + "loss": 1.6985, + "step": 17015 + }, + { + "epoch": 5.222836095764273, + "grad_norm": 0.22184616327285767, + "learning_rate": 4.8827055482192664e-05, + "loss": 1.7996, + "step": 17016 + }, + { + "epoch": 5.223143032535297, + "grad_norm": 0.2791864573955536, + "learning_rate": 4.8822086296023544e-05, + "loss": 1.7223, + "step": 17017 + }, + { + "epoch": 5.223449969306323, + "grad_norm": 0.259726345539093, + "learning_rate": 4.8817117121495245e-05, + "loss": 1.7481, + "step": 17018 + }, + { + "epoch": 5.223756906077348, + "grad_norm": 0.19968681037425995, + "learning_rate": 4.8812147958656916e-05, + "loss": 1.702, + "step": 17019 + }, + { + "epoch": 5.224063842848373, + "grad_norm": 0.20161856710910797, + "learning_rate": 4.8807178807557616e-05, + "loss": 1.6689, + "step": 17020 + }, + { + "epoch": 5.224370779619399, + "grad_norm": 0.2365240454673767, + "learning_rate": 4.880220966824649e-05, + "loss": 1.7742, + "step": 17021 + }, + { + "epoch": 5.224677716390423, + "grad_norm": 0.20116381347179413, + "learning_rate": 4.879724054077261e-05, + "loss": 1.7584, + "step": 17022 + }, 
+ { + "epoch": 5.2249846531614486, + "grad_norm": 0.22845037281513214, + "learning_rate": 4.879227142518511e-05, + "loss": 1.7794, + "step": 17023 + }, + { + "epoch": 5.225291589932474, + "grad_norm": 0.251724511384964, + "learning_rate": 4.87873023215331e-05, + "loss": 1.7722, + "step": 17024 + }, + { + "epoch": 5.225598526703499, + "grad_norm": 0.206145241856575, + "learning_rate": 4.878233322986568e-05, + "loss": 1.7452, + "step": 17025 + }, + { + "epoch": 5.225905463474525, + "grad_norm": 0.24065247178077698, + "learning_rate": 4.877736415023194e-05, + "loss": 1.8144, + "step": 17026 + }, + { + "epoch": 5.226212400245549, + "grad_norm": 0.2255484163761139, + "learning_rate": 4.877239508268103e-05, + "loss": 1.706, + "step": 17027 + }, + { + "epoch": 5.226519337016574, + "grad_norm": 0.21035850048065186, + "learning_rate": 4.8767426027262e-05, + "loss": 1.7167, + "step": 17028 + }, + { + "epoch": 5.2268262737876, + "grad_norm": 0.19618964195251465, + "learning_rate": 4.8762456984024025e-05, + "loss": 1.7063, + "step": 17029 + }, + { + "epoch": 5.227133210558625, + "grad_norm": 0.19595398008823395, + "learning_rate": 4.875748795301614e-05, + "loss": 1.7452, + "step": 17030 + }, + { + "epoch": 5.22744014732965, + "grad_norm": 0.22870996594429016, + "learning_rate": 4.8752518934287506e-05, + "loss": 1.8169, + "step": 17031 + }, + { + "epoch": 5.227747084100676, + "grad_norm": 0.24048443138599396, + "learning_rate": 4.87475499278872e-05, + "loss": 1.6988, + "step": 17032 + }, + { + "epoch": 5.2280540208717, + "grad_norm": 0.24177183210849762, + "learning_rate": 4.8742580933864356e-05, + "loss": 1.77, + "step": 17033 + }, + { + "epoch": 5.2283609576427255, + "grad_norm": 0.2023085057735443, + "learning_rate": 4.873761195226806e-05, + "loss": 1.7, + "step": 17034 + }, + { + "epoch": 5.228667894413751, + "grad_norm": 0.2614101767539978, + "learning_rate": 4.873264298314742e-05, + "loss": 1.767, + "step": 17035 + }, + { + "epoch": 5.228974831184776, + "grad_norm": 
0.19607602059841156, + "learning_rate": 4.872767402655154e-05, + "loss": 1.7391, + "step": 17036 + }, + { + "epoch": 5.2292817679558015, + "grad_norm": 0.2053994983434677, + "learning_rate": 4.872270508252953e-05, + "loss": 1.7155, + "step": 17037 + }, + { + "epoch": 5.229588704726826, + "grad_norm": 0.18256273865699768, + "learning_rate": 4.871773615113051e-05, + "loss": 1.6999, + "step": 17038 + }, + { + "epoch": 5.229895641497851, + "grad_norm": 0.21956393122673035, + "learning_rate": 4.871276723240356e-05, + "loss": 1.7946, + "step": 17039 + }, + { + "epoch": 5.230202578268877, + "grad_norm": 0.23779109120368958, + "learning_rate": 4.870779832639781e-05, + "loss": 1.8063, + "step": 17040 + }, + { + "epoch": 5.230509515039902, + "grad_norm": 0.21662941575050354, + "learning_rate": 4.8702829433162346e-05, + "loss": 1.7276, + "step": 17041 + }, + { + "epoch": 5.230816451810927, + "grad_norm": 0.21578755974769592, + "learning_rate": 4.869786055274628e-05, + "loss": 1.7577, + "step": 17042 + }, + { + "epoch": 5.231123388581952, + "grad_norm": 0.23229347169399261, + "learning_rate": 4.8692891685198715e-05, + "loss": 1.7884, + "step": 17043 + }, + { + "epoch": 5.231430325352977, + "grad_norm": 0.2302366942167282, + "learning_rate": 4.868792283056878e-05, + "loss": 1.7823, + "step": 17044 + }, + { + "epoch": 5.231737262124002, + "grad_norm": 0.2181033343076706, + "learning_rate": 4.868295398890554e-05, + "loss": 1.7027, + "step": 17045 + }, + { + "epoch": 5.232044198895028, + "grad_norm": 0.20863409340381622, + "learning_rate": 4.8677985160258135e-05, + "loss": 1.7247, + "step": 17046 + }, + { + "epoch": 5.232351135666053, + "grad_norm": 0.2242976278066635, + "learning_rate": 4.867301634467564e-05, + "loss": 1.7799, + "step": 17047 + }, + { + "epoch": 5.232658072437078, + "grad_norm": 0.19934964179992676, + "learning_rate": 4.866804754220719e-05, + "loss": 1.6973, + "step": 17048 + }, + { + "epoch": 5.232965009208103, + "grad_norm": 0.22056198120117188, + 
"learning_rate": 4.8663078752901855e-05, + "loss": 1.7677, + "step": 17049 + }, + { + "epoch": 5.233271945979128, + "grad_norm": 0.2303200513124466, + "learning_rate": 4.865810997680879e-05, + "loss": 1.7517, + "step": 17050 + }, + { + "epoch": 5.2335788827501535, + "grad_norm": 0.21193410456180573, + "learning_rate": 4.8653141213977066e-05, + "loss": 1.7478, + "step": 17051 + }, + { + "epoch": 5.233885819521179, + "grad_norm": 0.18498395383358002, + "learning_rate": 4.864817246445577e-05, + "loss": 1.6891, + "step": 17052 + }, + { + "epoch": 5.234192756292204, + "grad_norm": 0.22879233956336975, + "learning_rate": 4.8643203728294036e-05, + "loss": 1.7166, + "step": 17053 + }, + { + "epoch": 5.234499693063229, + "grad_norm": 0.2128525823354721, + "learning_rate": 4.8638235005540944e-05, + "loss": 1.7993, + "step": 17054 + }, + { + "epoch": 5.234806629834254, + "grad_norm": 0.21245025098323822, + "learning_rate": 4.8633266296245634e-05, + "loss": 1.7436, + "step": 17055 + }, + { + "epoch": 5.235113566605279, + "grad_norm": 0.20301629602909088, + "learning_rate": 4.8628297600457165e-05, + "loss": 1.7774, + "step": 17056 + }, + { + "epoch": 5.235420503376305, + "grad_norm": 0.23251961171627045, + "learning_rate": 4.8623328918224687e-05, + "loss": 1.7897, + "step": 17057 + }, + { + "epoch": 5.23572744014733, + "grad_norm": 0.2272956669330597, + "learning_rate": 4.861836024959726e-05, + "loss": 1.7668, + "step": 17058 + }, + { + "epoch": 5.236034376918354, + "grad_norm": 0.20540569722652435, + "learning_rate": 4.8613391594624013e-05, + "loss": 1.7549, + "step": 17059 + }, + { + "epoch": 5.23634131368938, + "grad_norm": 0.20306967198848724, + "learning_rate": 4.8608422953354034e-05, + "loss": 1.6993, + "step": 17060 + }, + { + "epoch": 5.236648250460405, + "grad_norm": 0.19415293633937836, + "learning_rate": 4.8603454325836455e-05, + "loss": 1.7313, + "step": 17061 + }, + { + "epoch": 5.23695518723143, + "grad_norm": 0.2058337777853012, + "learning_rate": 
4.859848571212034e-05, + "loss": 1.7994, + "step": 17062 + }, + { + "epoch": 5.237262124002456, + "grad_norm": 0.24489709734916687, + "learning_rate": 4.859351711225483e-05, + "loss": 1.7555, + "step": 17063 + }, + { + "epoch": 5.237569060773481, + "grad_norm": 0.22589795291423798, + "learning_rate": 4.858854852628899e-05, + "loss": 1.7136, + "step": 17064 + }, + { + "epoch": 5.2378759975445055, + "grad_norm": 0.21404492855072021, + "learning_rate": 4.858357995427195e-05, + "loss": 1.7598, + "step": 17065 + }, + { + "epoch": 5.238182934315531, + "grad_norm": 0.24936965107917786, + "learning_rate": 4.8578611396252786e-05, + "loss": 1.8027, + "step": 17066 + }, + { + "epoch": 5.238489871086556, + "grad_norm": 0.23391515016555786, + "learning_rate": 4.857364285228065e-05, + "loss": 1.7704, + "step": 17067 + }, + { + "epoch": 5.2387968078575815, + "grad_norm": 0.22633357346057892, + "learning_rate": 4.85686743224046e-05, + "loss": 1.7075, + "step": 17068 + }, + { + "epoch": 5.239103744628607, + "grad_norm": 0.221492201089859, + "learning_rate": 4.8563705806673736e-05, + "loss": 1.7755, + "step": 17069 + }, + { + "epoch": 5.239410681399631, + "grad_norm": 0.2381046712398529, + "learning_rate": 4.855873730513719e-05, + "loss": 1.7971, + "step": 17070 + }, + { + "epoch": 5.239717618170657, + "grad_norm": 0.21930988132953644, + "learning_rate": 4.855376881784402e-05, + "loss": 1.7295, + "step": 17071 + }, + { + "epoch": 5.240024554941682, + "grad_norm": 0.20897921919822693, + "learning_rate": 4.854880034484339e-05, + "loss": 1.7796, + "step": 17072 + }, + { + "epoch": 5.240331491712707, + "grad_norm": 0.26616254448890686, + "learning_rate": 4.8543831886184334e-05, + "loss": 1.7095, + "step": 17073 + }, + { + "epoch": 5.240638428483733, + "grad_norm": 0.19513870775699615, + "learning_rate": 4.853886344191601e-05, + "loss": 1.7181, + "step": 17074 + }, + { + "epoch": 5.240945365254758, + "grad_norm": 0.23476530611515045, + "learning_rate": 4.853389501208747e-05, + "loss": 
1.7928, + "step": 17075 + }, + { + "epoch": 5.241252302025782, + "grad_norm": 0.18197014927864075, + "learning_rate": 4.852892659674785e-05, + "loss": 1.6888, + "step": 17076 + }, + { + "epoch": 5.241559238796808, + "grad_norm": 0.20317208766937256, + "learning_rate": 4.852395819594623e-05, + "loss": 1.7828, + "step": 17077 + }, + { + "epoch": 5.241866175567833, + "grad_norm": 0.1953772008419037, + "learning_rate": 4.851898980973175e-05, + "loss": 1.7394, + "step": 17078 + }, + { + "epoch": 5.242173112338858, + "grad_norm": 0.19714407622814178, + "learning_rate": 4.851402143815345e-05, + "loss": 1.7261, + "step": 17079 + }, + { + "epoch": 5.242480049109884, + "grad_norm": 0.2196008861064911, + "learning_rate": 4.850905308126048e-05, + "loss": 1.7387, + "step": 17080 + }, + { + "epoch": 5.242786985880908, + "grad_norm": 0.2337818443775177, + "learning_rate": 4.85040847391019e-05, + "loss": 1.7448, + "step": 17081 + }, + { + "epoch": 5.2430939226519335, + "grad_norm": 0.20940040051937103, + "learning_rate": 4.849911641172685e-05, + "loss": 1.7354, + "step": 17082 + }, + { + "epoch": 5.243400859422959, + "grad_norm": 0.2242170125246048, + "learning_rate": 4.849414809918439e-05, + "loss": 1.7325, + "step": 17083 + }, + { + "epoch": 5.243707796193984, + "grad_norm": 0.2322687953710556, + "learning_rate": 4.8489179801523675e-05, + "loss": 1.7557, + "step": 17084 + }, + { + "epoch": 5.2440147329650095, + "grad_norm": 0.20303767919540405, + "learning_rate": 4.8484211518793764e-05, + "loss": 1.7063, + "step": 17085 + }, + { + "epoch": 5.244321669736034, + "grad_norm": 0.2446853369474411, + "learning_rate": 4.8479243251043746e-05, + "loss": 1.7587, + "step": 17086 + }, + { + "epoch": 5.244628606507059, + "grad_norm": 0.22901636362075806, + "learning_rate": 4.8474274998322735e-05, + "loss": 1.7992, + "step": 17087 + }, + { + "epoch": 5.244935543278085, + "grad_norm": 0.29676303267478943, + "learning_rate": 4.846930676067984e-05, + "loss": 1.7688, + "step": 17088 + }, + { + 
"epoch": 5.24524248004911, + "grad_norm": 0.24160240590572357, + "learning_rate": 4.846433853816416e-05, + "loss": 1.7367, + "step": 17089 + }, + { + "epoch": 5.245549416820135, + "grad_norm": 0.2097402662038803, + "learning_rate": 4.8459370330824774e-05, + "loss": 1.721, + "step": 17090 + }, + { + "epoch": 5.245856353591161, + "grad_norm": 0.26451143622398376, + "learning_rate": 4.8454402138710814e-05, + "loss": 1.7707, + "step": 17091 + }, + { + "epoch": 5.246163290362185, + "grad_norm": 0.30428358912467957, + "learning_rate": 4.844943396187133e-05, + "loss": 1.7232, + "step": 17092 + }, + { + "epoch": 5.24647022713321, + "grad_norm": 0.24332918226718903, + "learning_rate": 4.8444465800355466e-05, + "loss": 1.8215, + "step": 17093 + }, + { + "epoch": 5.246777163904236, + "grad_norm": 0.292703777551651, + "learning_rate": 4.843949765421229e-05, + "loss": 1.7199, + "step": 17094 + }, + { + "epoch": 5.247084100675261, + "grad_norm": 0.2458789199590683, + "learning_rate": 4.843452952349094e-05, + "loss": 1.7615, + "step": 17095 + }, + { + "epoch": 5.247391037446286, + "grad_norm": 0.22538037598133087, + "learning_rate": 4.842956140824045e-05, + "loss": 1.7279, + "step": 17096 + }, + { + "epoch": 5.247697974217311, + "grad_norm": 0.2959176003932953, + "learning_rate": 4.842459330850999e-05, + "loss": 1.767, + "step": 17097 + }, + { + "epoch": 5.248004910988336, + "grad_norm": 0.26158571243286133, + "learning_rate": 4.84196252243486e-05, + "loss": 1.7387, + "step": 17098 + }, + { + "epoch": 5.2483118477593615, + "grad_norm": 0.22855687141418457, + "learning_rate": 4.84146571558054e-05, + "loss": 1.7497, + "step": 17099 + }, + { + "epoch": 5.248618784530387, + "grad_norm": 0.22470593452453613, + "learning_rate": 4.840968910292949e-05, + "loss": 1.7705, + "step": 17100 + }, + { + "epoch": 5.248925721301412, + "grad_norm": 0.24680538475513458, + "learning_rate": 4.840472106576998e-05, + "loss": 1.7426, + "step": 17101 + }, + { + "epoch": 5.249232658072437, + "grad_norm": 
0.23919185996055603, + "learning_rate": 4.839975304437594e-05, + "loss": 1.78, + "step": 17102 + }, + { + "epoch": 5.249539594843462, + "grad_norm": 0.24717366695404053, + "learning_rate": 4.839478503879647e-05, + "loss": 1.7373, + "step": 17103 + }, + { + "epoch": 5.249846531614487, + "grad_norm": 0.20463785529136658, + "learning_rate": 4.838981704908068e-05, + "loss": 1.702, + "step": 17104 + }, + { + "epoch": 5.250153468385513, + "grad_norm": 0.19791419804096222, + "learning_rate": 4.838484907527766e-05, + "loss": 1.746, + "step": 17105 + }, + { + "epoch": 5.250460405156538, + "grad_norm": 0.26169353723526, + "learning_rate": 4.837988111743652e-05, + "loss": 1.7227, + "step": 17106 + }, + { + "epoch": 5.250767341927563, + "grad_norm": 0.23545648157596588, + "learning_rate": 4.837491317560633e-05, + "loss": 1.7104, + "step": 17107 + }, + { + "epoch": 5.251074278698588, + "grad_norm": 0.21569804847240448, + "learning_rate": 4.836994524983622e-05, + "loss": 1.7883, + "step": 17108 + }, + { + "epoch": 5.251381215469613, + "grad_norm": 0.2730300724506378, + "learning_rate": 4.836497734017524e-05, + "loss": 1.7105, + "step": 17109 + }, + { + "epoch": 5.2516881522406385, + "grad_norm": 0.2834697663784027, + "learning_rate": 4.836000944667253e-05, + "loss": 1.8041, + "step": 17110 + }, + { + "epoch": 5.251995089011664, + "grad_norm": 0.31536951661109924, + "learning_rate": 4.835504156937715e-05, + "loss": 1.7708, + "step": 17111 + }, + { + "epoch": 5.252302025782689, + "grad_norm": 0.3830285668373108, + "learning_rate": 4.835007370833824e-05, + "loss": 1.7464, + "step": 17112 + }, + { + "epoch": 5.252608962553714, + "grad_norm": 0.23248349130153656, + "learning_rate": 4.834510586360485e-05, + "loss": 1.7274, + "step": 17113 + }, + { + "epoch": 5.252915899324739, + "grad_norm": 0.4755091071128845, + "learning_rate": 4.834013803522611e-05, + "loss": 1.7853, + "step": 17114 + }, + { + "epoch": 5.253222836095764, + "grad_norm": 0.4267823398113251, + "learning_rate": 
4.8335170223251073e-05, + "loss": 1.7424, + "step": 17115 + }, + { + "epoch": 5.25352977286679, + "grad_norm": 0.17621731758117676, + "learning_rate": 4.8330202427728876e-05, + "loss": 1.7415, + "step": 17116 + }, + { + "epoch": 5.253836709637815, + "grad_norm": 0.37484630942344666, + "learning_rate": 4.832523464870859e-05, + "loss": 1.7357, + "step": 17117 + }, + { + "epoch": 5.25414364640884, + "grad_norm": 0.27773791551589966, + "learning_rate": 4.832026688623933e-05, + "loss": 1.717, + "step": 17118 + }, + { + "epoch": 5.254450583179865, + "grad_norm": 0.31190845370292664, + "learning_rate": 4.8315299140370183e-05, + "loss": 1.7226, + "step": 17119 + }, + { + "epoch": 5.25475751995089, + "grad_norm": 0.4321303367614746, + "learning_rate": 4.8310331411150215e-05, + "loss": 1.8003, + "step": 17120 + }, + { + "epoch": 5.255064456721915, + "grad_norm": 0.31622835993766785, + "learning_rate": 4.830536369862855e-05, + "loss": 1.8462, + "step": 17121 + }, + { + "epoch": 5.255371393492941, + "grad_norm": 0.2144850194454193, + "learning_rate": 4.830039600285427e-05, + "loss": 1.8153, + "step": 17122 + }, + { + "epoch": 5.255678330263966, + "grad_norm": 0.3107511103153229, + "learning_rate": 4.829542832387649e-05, + "loss": 1.7271, + "step": 17123 + }, + { + "epoch": 5.2559852670349905, + "grad_norm": 0.24607159197330475, + "learning_rate": 4.8290460661744265e-05, + "loss": 1.7946, + "step": 17124 + }, + { + "epoch": 5.256292203806016, + "grad_norm": 0.226362943649292, + "learning_rate": 4.828549301650673e-05, + "loss": 1.7338, + "step": 17125 + }, + { + "epoch": 5.256599140577041, + "grad_norm": 0.29993724822998047, + "learning_rate": 4.828052538821294e-05, + "loss": 1.8, + "step": 17126 + }, + { + "epoch": 5.2569060773480665, + "grad_norm": 0.25639984011650085, + "learning_rate": 4.8275557776912014e-05, + "loss": 1.8009, + "step": 17127 + }, + { + "epoch": 5.257213014119092, + "grad_norm": 0.2308105081319809, + "learning_rate": 4.8270590182653024e-05, + "loss": 1.7468, 
+ "step": 17128 + }, + { + "epoch": 5.257519950890116, + "grad_norm": 0.27337542176246643, + "learning_rate": 4.82656226054851e-05, + "loss": 1.7725, + "step": 17129 + }, + { + "epoch": 5.257826887661142, + "grad_norm": 0.24848094582557678, + "learning_rate": 4.826065504545729e-05, + "loss": 1.8084, + "step": 17130 + }, + { + "epoch": 5.258133824432167, + "grad_norm": 0.35026392340660095, + "learning_rate": 4.825568750261872e-05, + "loss": 1.7705, + "step": 17131 + }, + { + "epoch": 5.258440761203192, + "grad_norm": 0.3207968473434448, + "learning_rate": 4.825071997701846e-05, + "loss": 1.7329, + "step": 17132 + }, + { + "epoch": 5.258747697974218, + "grad_norm": 0.20949263870716095, + "learning_rate": 4.8245752468705614e-05, + "loss": 1.7658, + "step": 17133 + }, + { + "epoch": 5.259054634745242, + "grad_norm": 0.3158881366252899, + "learning_rate": 4.824078497772926e-05, + "loss": 1.7249, + "step": 17134 + }, + { + "epoch": 5.259361571516267, + "grad_norm": 0.2283414602279663, + "learning_rate": 4.823581750413852e-05, + "loss": 1.7177, + "step": 17135 + }, + { + "epoch": 5.259668508287293, + "grad_norm": 0.24753578007221222, + "learning_rate": 4.823085004798247e-05, + "loss": 1.7232, + "step": 17136 + }, + { + "epoch": 5.259975445058318, + "grad_norm": 0.20381587743759155, + "learning_rate": 4.822588260931017e-05, + "loss": 1.7049, + "step": 17137 + }, + { + "epoch": 5.260282381829343, + "grad_norm": 0.21220643818378448, + "learning_rate": 4.8220915188170746e-05, + "loss": 1.7221, + "step": 17138 + }, + { + "epoch": 5.260589318600369, + "grad_norm": 0.19324758648872375, + "learning_rate": 4.8215947784613276e-05, + "loss": 1.7168, + "step": 17139 + }, + { + "epoch": 5.260896255371393, + "grad_norm": 0.26500338315963745, + "learning_rate": 4.821098039868688e-05, + "loss": 1.7627, + "step": 17140 + }, + { + "epoch": 5.2612031921424185, + "grad_norm": 0.19597655534744263, + "learning_rate": 4.82060130304406e-05, + "loss": 1.7214, + "step": 17141 + }, + { + "epoch": 
5.261510128913444, + "grad_norm": 0.2105483114719391, + "learning_rate": 4.820104567992357e-05, + "loss": 1.6742, + "step": 17142 + }, + { + "epoch": 5.261817065684469, + "grad_norm": 0.20020028948783875, + "learning_rate": 4.8196078347184837e-05, + "loss": 1.7721, + "step": 17143 + }, + { + "epoch": 5.2621240024554945, + "grad_norm": 0.2313549965620041, + "learning_rate": 4.819111103227353e-05, + "loss": 1.7644, + "step": 17144 + }, + { + "epoch": 5.262430939226519, + "grad_norm": 0.31893789768218994, + "learning_rate": 4.818614373523871e-05, + "loss": 1.747, + "step": 17145 + }, + { + "epoch": 5.262737875997544, + "grad_norm": 0.2531197667121887, + "learning_rate": 4.8181176456129505e-05, + "loss": 1.7713, + "step": 17146 + }, + { + "epoch": 5.26304481276857, + "grad_norm": 0.2063976377248764, + "learning_rate": 4.817620919499496e-05, + "loss": 1.7254, + "step": 17147 + }, + { + "epoch": 5.263351749539595, + "grad_norm": 0.22220590710639954, + "learning_rate": 4.8171241951884204e-05, + "loss": 1.7345, + "step": 17148 + }, + { + "epoch": 5.26365868631062, + "grad_norm": 0.24240384995937347, + "learning_rate": 4.8166274726846286e-05, + "loss": 1.7302, + "step": 17149 + }, + { + "epoch": 5.263965623081646, + "grad_norm": 0.215829998254776, + "learning_rate": 4.8161307519930326e-05, + "loss": 1.7725, + "step": 17150 + }, + { + "epoch": 5.26427255985267, + "grad_norm": 0.2697906494140625, + "learning_rate": 4.815634033118541e-05, + "loss": 1.7156, + "step": 17151 + }, + { + "epoch": 5.264579496623695, + "grad_norm": 0.21649456024169922, + "learning_rate": 4.815137316066061e-05, + "loss": 1.745, + "step": 17152 + }, + { + "epoch": 5.264886433394721, + "grad_norm": 0.22773787379264832, + "learning_rate": 4.8146406008405033e-05, + "loss": 1.7592, + "step": 17153 + }, + { + "epoch": 5.265193370165746, + "grad_norm": 0.2920280396938324, + "learning_rate": 4.8141438874467745e-05, + "loss": 1.8301, + "step": 17154 + }, + { + "epoch": 5.265500306936771, + "grad_norm": 
0.23919162154197693, + "learning_rate": 4.813647175889785e-05, + "loss": 1.7687, + "step": 17155 + }, + { + "epoch": 5.265807243707796, + "grad_norm": 0.24617896974086761, + "learning_rate": 4.8131504661744425e-05, + "loss": 1.8279, + "step": 17156 + }, + { + "epoch": 5.266114180478821, + "grad_norm": 0.22756172716617584, + "learning_rate": 4.812653758305659e-05, + "loss": 1.7595, + "step": 17157 + }, + { + "epoch": 5.2664211172498465, + "grad_norm": 0.22939376533031464, + "learning_rate": 4.812157052288339e-05, + "loss": 1.7445, + "step": 17158 + }, + { + "epoch": 5.266728054020872, + "grad_norm": 0.21021319925785065, + "learning_rate": 4.811660348127395e-05, + "loss": 1.7875, + "step": 17159 + }, + { + "epoch": 5.267034990791897, + "grad_norm": 0.2271810919046402, + "learning_rate": 4.811163645827732e-05, + "loss": 1.74, + "step": 17160 + }, + { + "epoch": 5.267341927562922, + "grad_norm": 0.238374263048172, + "learning_rate": 4.81066694539426e-05, + "loss": 1.7717, + "step": 17161 + }, + { + "epoch": 5.267648864333947, + "grad_norm": 0.20655091106891632, + "learning_rate": 4.8101702468318885e-05, + "loss": 1.7447, + "step": 17162 + }, + { + "epoch": 5.267955801104972, + "grad_norm": 0.24652259051799774, + "learning_rate": 4.809673550145528e-05, + "loss": 1.7755, + "step": 17163 + }, + { + "epoch": 5.268262737875998, + "grad_norm": 0.20256781578063965, + "learning_rate": 4.809176855340083e-05, + "loss": 1.7689, + "step": 17164 + }, + { + "epoch": 5.268569674647023, + "grad_norm": 0.27023112773895264, + "learning_rate": 4.8086801624204665e-05, + "loss": 1.8364, + "step": 17165 + }, + { + "epoch": 5.268876611418047, + "grad_norm": 0.251638799905777, + "learning_rate": 4.808183471391582e-05, + "loss": 1.7924, + "step": 17166 + }, + { + "epoch": 5.269183548189073, + "grad_norm": 0.22897782921791077, + "learning_rate": 4.807686782258342e-05, + "loss": 1.7378, + "step": 17167 + }, + { + "epoch": 5.269490484960098, + "grad_norm": 0.19141456484794617, + "learning_rate": 
4.807190095025655e-05, + "loss": 1.6911, + "step": 17168 + }, + { + "epoch": 5.269797421731123, + "grad_norm": 0.19960568845272064, + "learning_rate": 4.806693409698427e-05, + "loss": 1.71, + "step": 17169 + }, + { + "epoch": 5.270104358502149, + "grad_norm": 0.23332087695598602, + "learning_rate": 4.8061967262815694e-05, + "loss": 1.7993, + "step": 17170 + }, + { + "epoch": 5.270411295273174, + "grad_norm": 0.24831432104110718, + "learning_rate": 4.8057000447799876e-05, + "loss": 1.7459, + "step": 17171 + }, + { + "epoch": 5.2707182320441985, + "grad_norm": 0.24735838174819946, + "learning_rate": 4.805203365198593e-05, + "loss": 1.7751, + "step": 17172 + }, + { + "epoch": 5.271025168815224, + "grad_norm": 0.32630103826522827, + "learning_rate": 4.804706687542291e-05, + "loss": 1.7885, + "step": 17173 + }, + { + "epoch": 5.271332105586249, + "grad_norm": 0.29055842757225037, + "learning_rate": 4.804210011815995e-05, + "loss": 1.6819, + "step": 17174 + }, + { + "epoch": 5.2716390423572745, + "grad_norm": 0.22968806326389313, + "learning_rate": 4.803713338024608e-05, + "loss": 1.8146, + "step": 17175 + }, + { + "epoch": 5.2719459791283, + "grad_norm": 0.23430144786834717, + "learning_rate": 4.8032166661730434e-05, + "loss": 1.7401, + "step": 17176 + }, + { + "epoch": 5.272252915899324, + "grad_norm": 0.26312723755836487, + "learning_rate": 4.802719996266204e-05, + "loss": 1.8319, + "step": 17177 + }, + { + "epoch": 5.27255985267035, + "grad_norm": 0.23715369403362274, + "learning_rate": 4.802223328309003e-05, + "loss": 1.8014, + "step": 17178 + }, + { + "epoch": 5.272866789441375, + "grad_norm": 0.23943877220153809, + "learning_rate": 4.801726662306347e-05, + "loss": 1.7181, + "step": 17179 + }, + { + "epoch": 5.2731737262124, + "grad_norm": 0.2366543412208557, + "learning_rate": 4.8012299982631435e-05, + "loss": 1.6685, + "step": 17180 + }, + { + "epoch": 5.273480662983426, + "grad_norm": 0.20688587427139282, + "learning_rate": 4.8007333361843016e-05, + "loss": 
1.7089, + "step": 17181 + }, + { + "epoch": 5.273787599754451, + "grad_norm": 0.2069951444864273, + "learning_rate": 4.8002366760747314e-05, + "loss": 1.7447, + "step": 17182 + }, + { + "epoch": 5.274094536525475, + "grad_norm": 0.26072344183921814, + "learning_rate": 4.7997400179393374e-05, + "loss": 1.7346, + "step": 17183 + }, + { + "epoch": 5.274401473296501, + "grad_norm": 0.2397938072681427, + "learning_rate": 4.799243361783031e-05, + "loss": 1.7556, + "step": 17184 + }, + { + "epoch": 5.274708410067526, + "grad_norm": 0.23606348037719727, + "learning_rate": 4.798746707610721e-05, + "loss": 1.732, + "step": 17185 + }, + { + "epoch": 5.2750153468385514, + "grad_norm": 0.21078252792358398, + "learning_rate": 4.798250055427311e-05, + "loss": 1.7571, + "step": 17186 + }, + { + "epoch": 5.275322283609577, + "grad_norm": 0.21331414580345154, + "learning_rate": 4.797753405237714e-05, + "loss": 1.732, + "step": 17187 + }, + { + "epoch": 5.275629220380601, + "grad_norm": 0.23700307309627533, + "learning_rate": 4.7972567570468354e-05, + "loss": 1.7354, + "step": 17188 + }, + { + "epoch": 5.275936157151627, + "grad_norm": 0.20519722998142242, + "learning_rate": 4.7967601108595845e-05, + "loss": 1.7435, + "step": 17189 + }, + { + "epoch": 5.276243093922652, + "grad_norm": 0.22358302772045135, + "learning_rate": 4.79626346668087e-05, + "loss": 1.7891, + "step": 17190 + }, + { + "epoch": 5.276550030693677, + "grad_norm": 0.2434413880109787, + "learning_rate": 4.795766824515598e-05, + "loss": 1.814, + "step": 17191 + }, + { + "epoch": 5.276856967464703, + "grad_norm": 0.2198423594236374, + "learning_rate": 4.795270184368678e-05, + "loss": 1.7212, + "step": 17192 + }, + { + "epoch": 5.277163904235728, + "grad_norm": 0.23587806522846222, + "learning_rate": 4.7947735462450205e-05, + "loss": 1.8337, + "step": 17193 + }, + { + "epoch": 5.277470841006752, + "grad_norm": 0.234666645526886, + "learning_rate": 4.794276910149528e-05, + "loss": 1.7548, + "step": 17194 + }, + { + 
"epoch": 5.277777777777778, + "grad_norm": 0.23363247513771057, + "learning_rate": 4.793780276087115e-05, + "loss": 1.7587, + "step": 17195 + }, + { + "epoch": 5.278084714548803, + "grad_norm": 0.23191119730472565, + "learning_rate": 4.793283644062683e-05, + "loss": 1.7691, + "step": 17196 + }, + { + "epoch": 5.278391651319828, + "grad_norm": 0.2363097071647644, + "learning_rate": 4.7927870140811445e-05, + "loss": 1.8139, + "step": 17197 + }, + { + "epoch": 5.278698588090854, + "grad_norm": 0.2852413058280945, + "learning_rate": 4.7922903861474056e-05, + "loss": 1.7905, + "step": 17198 + }, + { + "epoch": 5.279005524861878, + "grad_norm": 0.23633842170238495, + "learning_rate": 4.7917937602663764e-05, + "loss": 1.8014, + "step": 17199 + }, + { + "epoch": 5.2793124616329035, + "grad_norm": 0.27007919549942017, + "learning_rate": 4.791297136442961e-05, + "loss": 1.7242, + "step": 17200 + }, + { + "epoch": 5.279619398403929, + "grad_norm": 0.29482147097587585, + "learning_rate": 4.790800514682072e-05, + "loss": 1.7154, + "step": 17201 + }, + { + "epoch": 5.279926335174954, + "grad_norm": 0.27772340178489685, + "learning_rate": 4.790303894988614e-05, + "loss": 1.7771, + "step": 17202 + }, + { + "epoch": 5.2802332719459795, + "grad_norm": 0.21761848032474518, + "learning_rate": 4.789807277367495e-05, + "loss": 1.6983, + "step": 17203 + }, + { + "epoch": 5.280540208717004, + "grad_norm": 0.22621290385723114, + "learning_rate": 4.789310661823626e-05, + "loss": 1.7667, + "step": 17204 + }, + { + "epoch": 5.280847145488029, + "grad_norm": 0.2284683883190155, + "learning_rate": 4.7888140483619095e-05, + "loss": 1.7419, + "step": 17205 + }, + { + "epoch": 5.281154082259055, + "grad_norm": 0.20145639777183533, + "learning_rate": 4.788317436987259e-05, + "loss": 1.7068, + "step": 17206 + }, + { + "epoch": 5.28146101903008, + "grad_norm": 0.23146072030067444, + "learning_rate": 4.7878208277045775e-05, + "loss": 1.7195, + "step": 17207 + }, + { + "epoch": 5.281767955801105, + 
"grad_norm": 0.24014149606227875, + "learning_rate": 4.787324220518776e-05, + "loss": 1.8148, + "step": 17208 + }, + { + "epoch": 5.28207489257213, + "grad_norm": 0.21067874133586884, + "learning_rate": 4.7868276154347595e-05, + "loss": 1.7754, + "step": 17209 + }, + { + "epoch": 5.282381829343155, + "grad_norm": 0.2313496321439743, + "learning_rate": 4.786331012457441e-05, + "loss": 1.7693, + "step": 17210 + }, + { + "epoch": 5.28268876611418, + "grad_norm": 0.24190983176231384, + "learning_rate": 4.7858344115917214e-05, + "loss": 1.7342, + "step": 17211 + }, + { + "epoch": 5.282995702885206, + "grad_norm": 0.24541905522346497, + "learning_rate": 4.785337812842514e-05, + "loss": 1.7721, + "step": 17212 + }, + { + "epoch": 5.283302639656231, + "grad_norm": 0.21989032626152039, + "learning_rate": 4.784841216214722e-05, + "loss": 1.7522, + "step": 17213 + }, + { + "epoch": 5.283609576427256, + "grad_norm": 0.20637241005897522, + "learning_rate": 4.784344621713256e-05, + "loss": 1.7418, + "step": 17214 + }, + { + "epoch": 5.283916513198281, + "grad_norm": 0.22538220882415771, + "learning_rate": 4.783848029343023e-05, + "loss": 1.8287, + "step": 17215 + }, + { + "epoch": 5.284223449969306, + "grad_norm": 0.24478071928024292, + "learning_rate": 4.7833514391089315e-05, + "loss": 1.7419, + "step": 17216 + }, + { + "epoch": 5.2845303867403315, + "grad_norm": 0.22707650065422058, + "learning_rate": 4.782854851015886e-05, + "loss": 1.7831, + "step": 17217 + }, + { + "epoch": 5.284837323511357, + "grad_norm": 0.2843529284000397, + "learning_rate": 4.7823582650687984e-05, + "loss": 1.7704, + "step": 17218 + }, + { + "epoch": 5.285144260282382, + "grad_norm": 0.21647678315639496, + "learning_rate": 4.781861681272573e-05, + "loss": 1.7514, + "step": 17219 + }, + { + "epoch": 5.285451197053407, + "grad_norm": 0.2279205620288849, + "learning_rate": 4.781365099632117e-05, + "loss": 1.6803, + "step": 17220 + }, + { + "epoch": 5.285758133824432, + "grad_norm": 0.2287401556968689, + 
"learning_rate": 4.7808685201523417e-05, + "loss": 1.7278, + "step": 17221 + }, + { + "epoch": 5.286065070595457, + "grad_norm": 0.2103174477815628, + "learning_rate": 4.78037194283815e-05, + "loss": 1.7667, + "step": 17222 + }, + { + "epoch": 5.286372007366483, + "grad_norm": 0.24339279532432556, + "learning_rate": 4.7798753676944536e-05, + "loss": 1.7828, + "step": 17223 + }, + { + "epoch": 5.286678944137508, + "grad_norm": 0.2343035340309143, + "learning_rate": 4.779378794726156e-05, + "loss": 1.7277, + "step": 17224 + }, + { + "epoch": 5.286985880908533, + "grad_norm": 0.22456331551074982, + "learning_rate": 4.778882223938167e-05, + "loss": 1.756, + "step": 17225 + }, + { + "epoch": 5.287292817679558, + "grad_norm": 0.2211158126592636, + "learning_rate": 4.778385655335392e-05, + "loss": 1.7733, + "step": 17226 + }, + { + "epoch": 5.287599754450583, + "grad_norm": 0.2731948792934418, + "learning_rate": 4.777889088922743e-05, + "loss": 1.787, + "step": 17227 + }, + { + "epoch": 5.287906691221608, + "grad_norm": 0.19578024744987488, + "learning_rate": 4.7773925247051215e-05, + "loss": 1.7474, + "step": 17228 + }, + { + "epoch": 5.288213627992634, + "grad_norm": 0.277332067489624, + "learning_rate": 4.77689596268744e-05, + "loss": 1.7432, + "step": 17229 + }, + { + "epoch": 5.288520564763659, + "grad_norm": 0.2979765832424164, + "learning_rate": 4.7763994028746003e-05, + "loss": 1.8198, + "step": 17230 + }, + { + "epoch": 5.2888275015346835, + "grad_norm": 0.23176288604736328, + "learning_rate": 4.775902845271515e-05, + "loss": 1.7317, + "step": 17231 + }, + { + "epoch": 5.289134438305709, + "grad_norm": 0.35821911692619324, + "learning_rate": 4.7754062898830876e-05, + "loss": 1.7287, + "step": 17232 + }, + { + "epoch": 5.289441375076734, + "grad_norm": 0.2881525158882141, + "learning_rate": 4.7749097367142296e-05, + "loss": 1.7391, + "step": 17233 + }, + { + "epoch": 5.2897483118477595, + "grad_norm": 0.22021767497062683, + "learning_rate": 4.774413185769842e-05, 
+ "loss": 1.7462, + "step": 17234 + }, + { + "epoch": 5.290055248618785, + "grad_norm": 0.3286842703819275, + "learning_rate": 4.7739166370548385e-05, + "loss": 1.7749, + "step": 17235 + }, + { + "epoch": 5.290362185389809, + "grad_norm": 0.3298519253730774, + "learning_rate": 4.773420090574122e-05, + "loss": 1.7548, + "step": 17236 + }, + { + "epoch": 5.290669122160835, + "grad_norm": 0.20910575985908508, + "learning_rate": 4.7729235463326005e-05, + "loss": 1.7308, + "step": 17237 + }, + { + "epoch": 5.29097605893186, + "grad_norm": 0.3324633240699768, + "learning_rate": 4.7724270043351835e-05, + "loss": 1.7328, + "step": 17238 + }, + { + "epoch": 5.291282995702885, + "grad_norm": 0.21235628426074982, + "learning_rate": 4.771930464586774e-05, + "loss": 1.7186, + "step": 17239 + }, + { + "epoch": 5.291589932473911, + "grad_norm": 0.2971087694168091, + "learning_rate": 4.771433927092283e-05, + "loss": 1.7947, + "step": 17240 + }, + { + "epoch": 5.291896869244935, + "grad_norm": 0.3637695908546448, + "learning_rate": 4.770937391856614e-05, + "loss": 1.7753, + "step": 17241 + }, + { + "epoch": 5.29220380601596, + "grad_norm": 0.2503713369369507, + "learning_rate": 4.770440858884678e-05, + "loss": 1.684, + "step": 17242 + }, + { + "epoch": 5.292510742786986, + "grad_norm": 0.25510790944099426, + "learning_rate": 4.7699443281813774e-05, + "loss": 1.7517, + "step": 17243 + }, + { + "epoch": 5.292817679558011, + "grad_norm": 0.3189590871334076, + "learning_rate": 4.7694477997516244e-05, + "loss": 1.7488, + "step": 17244 + }, + { + "epoch": 5.293124616329036, + "grad_norm": 0.2807229161262512, + "learning_rate": 4.7689512736003215e-05, + "loss": 1.7962, + "step": 17245 + }, + { + "epoch": 5.293431553100062, + "grad_norm": 0.2166406810283661, + "learning_rate": 4.76845474973238e-05, + "loss": 1.7423, + "step": 17246 + }, + { + "epoch": 5.293738489871086, + "grad_norm": 0.29000815749168396, + "learning_rate": 4.767958228152702e-05, + "loss": 1.7508, + "step": 17247 + }, + { 
+ "epoch": 5.2940454266421115, + "grad_norm": 0.19301612675189972, + "learning_rate": 4.767461708866198e-05, + "loss": 1.7223, + "step": 17248 + }, + { + "epoch": 5.294352363413137, + "grad_norm": 0.2828899323940277, + "learning_rate": 4.766965191877772e-05, + "loss": 1.8139, + "step": 17249 + }, + { + "epoch": 5.294659300184162, + "grad_norm": 0.32610374689102173, + "learning_rate": 4.766468677192335e-05, + "loss": 1.7744, + "step": 17250 + }, + { + "epoch": 5.2949662369551875, + "grad_norm": 0.2175719439983368, + "learning_rate": 4.7659721648147895e-05, + "loss": 1.7345, + "step": 17251 + }, + { + "epoch": 5.295273173726212, + "grad_norm": 0.24777816236019135, + "learning_rate": 4.7654756547500457e-05, + "loss": 1.7382, + "step": 17252 + }, + { + "epoch": 5.295580110497237, + "grad_norm": 0.25927749276161194, + "learning_rate": 4.764979147003008e-05, + "loss": 1.7625, + "step": 17253 + }, + { + "epoch": 5.295887047268263, + "grad_norm": 0.2271798849105835, + "learning_rate": 4.7644826415785834e-05, + "loss": 1.6928, + "step": 17254 + }, + { + "epoch": 5.296193984039288, + "grad_norm": 0.30804958939552307, + "learning_rate": 4.763986138481682e-05, + "loss": 1.743, + "step": 17255 + }, + { + "epoch": 5.296500920810313, + "grad_norm": 0.2247130572795868, + "learning_rate": 4.763489637717205e-05, + "loss": 1.7593, + "step": 17256 + }, + { + "epoch": 5.296807857581339, + "grad_norm": 0.22203052043914795, + "learning_rate": 4.7629931392900645e-05, + "loss": 1.6923, + "step": 17257 + }, + { + "epoch": 5.297114794352363, + "grad_norm": 0.23044714331626892, + "learning_rate": 4.7624966432051624e-05, + "loss": 1.7676, + "step": 17258 + }, + { + "epoch": 5.297421731123388, + "grad_norm": 0.2824070155620575, + "learning_rate": 4.7620001494674096e-05, + "loss": 1.8272, + "step": 17259 + }, + { + "epoch": 5.297728667894414, + "grad_norm": 0.27077800035476685, + "learning_rate": 4.761503658081709e-05, + "loss": 1.8106, + "step": 17260 + }, + { + "epoch": 5.298035604665439, + 
"grad_norm": 0.2333833873271942, + "learning_rate": 4.7610071690529706e-05, + "loss": 1.6841, + "step": 17261 + }, + { + "epoch": 5.298342541436464, + "grad_norm": 0.2542032301425934, + "learning_rate": 4.760510682386098e-05, + "loss": 1.7656, + "step": 17262 + }, + { + "epoch": 5.298649478207489, + "grad_norm": 0.30680081248283386, + "learning_rate": 4.760014198086002e-05, + "loss": 1.7443, + "step": 17263 + }, + { + "epoch": 5.298956414978514, + "grad_norm": 0.21580225229263306, + "learning_rate": 4.759517716157583e-05, + "loss": 1.7907, + "step": 17264 + }, + { + "epoch": 5.2992633517495396, + "grad_norm": 0.2644323408603668, + "learning_rate": 4.7590212366057516e-05, + "loss": 1.6835, + "step": 17265 + }, + { + "epoch": 5.299570288520565, + "grad_norm": 0.23600110411643982, + "learning_rate": 4.758524759435414e-05, + "loss": 1.7481, + "step": 17266 + }, + { + "epoch": 5.29987722529159, + "grad_norm": 0.23825959861278534, + "learning_rate": 4.758028284651477e-05, + "loss": 1.7267, + "step": 17267 + }, + { + "epoch": 5.300184162062616, + "grad_norm": 0.2659476101398468, + "learning_rate": 4.757531812258845e-05, + "loss": 1.7303, + "step": 17268 + }, + { + "epoch": 5.30049109883364, + "grad_norm": 0.30770114064216614, + "learning_rate": 4.757035342262428e-05, + "loss": 1.7636, + "step": 17269 + }, + { + "epoch": 5.300798035604665, + "grad_norm": 0.27921241521835327, + "learning_rate": 4.756538874667129e-05, + "loss": 1.7736, + "step": 17270 + }, + { + "epoch": 5.301104972375691, + "grad_norm": 0.2518016993999481, + "learning_rate": 4.756042409477855e-05, + "loss": 1.7942, + "step": 17271 + }, + { + "epoch": 5.301411909146716, + "grad_norm": 0.2678029537200928, + "learning_rate": 4.755545946699514e-05, + "loss": 1.7179, + "step": 17272 + }, + { + "epoch": 5.301718845917741, + "grad_norm": 0.3082284927368164, + "learning_rate": 4.7550494863370094e-05, + "loss": 1.7282, + "step": 17273 + }, + { + "epoch": 5.302025782688766, + "grad_norm": 0.23269952833652496, + 
"learning_rate": 4.754553028395251e-05, + "loss": 1.755, + "step": 17274 + }, + { + "epoch": 5.302332719459791, + "grad_norm": 0.2273751199245453, + "learning_rate": 4.754056572879142e-05, + "loss": 1.7661, + "step": 17275 + }, + { + "epoch": 5.3026396562308165, + "grad_norm": 0.2175082415342331, + "learning_rate": 4.7535601197935915e-05, + "loss": 1.7034, + "step": 17276 + }, + { + "epoch": 5.302946593001842, + "grad_norm": 0.20551301538944244, + "learning_rate": 4.753063669143503e-05, + "loss": 1.7329, + "step": 17277 + }, + { + "epoch": 5.303253529772867, + "grad_norm": 0.2350638061761856, + "learning_rate": 4.752567220933785e-05, + "loss": 1.8361, + "step": 17278 + }, + { + "epoch": 5.303560466543892, + "grad_norm": 0.20268140733242035, + "learning_rate": 4.752070775169342e-05, + "loss": 1.6736, + "step": 17279 + }, + { + "epoch": 5.303867403314917, + "grad_norm": 0.1891544908285141, + "learning_rate": 4.7515743318550823e-05, + "loss": 1.7241, + "step": 17280 + }, + { + "epoch": 5.304174340085942, + "grad_norm": 0.22900860011577606, + "learning_rate": 4.751077890995909e-05, + "loss": 1.7321, + "step": 17281 + }, + { + "epoch": 5.304481276856968, + "grad_norm": 0.25827866792678833, + "learning_rate": 4.7505814525967304e-05, + "loss": 1.8021, + "step": 17282 + }, + { + "epoch": 5.304788213627993, + "grad_norm": 0.22459273040294647, + "learning_rate": 4.7500850166624514e-05, + "loss": 1.7845, + "step": 17283 + }, + { + "epoch": 5.305095150399017, + "grad_norm": 0.23737964034080505, + "learning_rate": 4.7495885831979816e-05, + "loss": 1.7274, + "step": 17284 + }, + { + "epoch": 5.305402087170043, + "grad_norm": 0.2267502397298813, + "learning_rate": 4.749092152208221e-05, + "loss": 1.7747, + "step": 17285 + }, + { + "epoch": 5.305709023941068, + "grad_norm": 0.31811007857322693, + "learning_rate": 4.748595723698081e-05, + "loss": 1.7852, + "step": 17286 + }, + { + "epoch": 5.306015960712093, + "grad_norm": 0.42865583300590515, + "learning_rate": 
4.7480992976724655e-05, + "loss": 1.7711, + "step": 17287 + }, + { + "epoch": 5.306322897483119, + "grad_norm": 0.3211027979850769, + "learning_rate": 4.747602874136278e-05, + "loss": 1.7813, + "step": 17288 + }, + { + "epoch": 5.306629834254144, + "grad_norm": 0.22552837431430817, + "learning_rate": 4.7471064530944295e-05, + "loss": 1.7407, + "step": 17289 + }, + { + "epoch": 5.3069367710251685, + "grad_norm": 0.3119906485080719, + "learning_rate": 4.746610034551821e-05, + "loss": 1.7255, + "step": 17290 + }, + { + "epoch": 5.307243707796194, + "grad_norm": 0.26405754685401917, + "learning_rate": 4.7461136185133623e-05, + "loss": 1.6945, + "step": 17291 + }, + { + "epoch": 5.307550644567219, + "grad_norm": 0.21759621798992157, + "learning_rate": 4.7456172049839566e-05, + "loss": 1.7319, + "step": 17292 + }, + { + "epoch": 5.3078575813382445, + "grad_norm": 0.26193925738334656, + "learning_rate": 4.745120793968511e-05, + "loss": 1.7508, + "step": 17293 + }, + { + "epoch": 5.30816451810927, + "grad_norm": 0.2549780011177063, + "learning_rate": 4.74462438547193e-05, + "loss": 1.7153, + "step": 17294 + }, + { + "epoch": 5.308471454880294, + "grad_norm": 0.21164020895957947, + "learning_rate": 4.7441279794991235e-05, + "loss": 1.7315, + "step": 17295 + }, + { + "epoch": 5.30877839165132, + "grad_norm": 0.20548345148563385, + "learning_rate": 4.7436315760549914e-05, + "loss": 1.68, + "step": 17296 + }, + { + "epoch": 5.309085328422345, + "grad_norm": 0.23997166752815247, + "learning_rate": 4.7431351751444446e-05, + "loss": 1.8528, + "step": 17297 + }, + { + "epoch": 5.30939226519337, + "grad_norm": 0.2639109194278717, + "learning_rate": 4.7426387767723845e-05, + "loss": 1.8041, + "step": 17298 + }, + { + "epoch": 5.309699201964396, + "grad_norm": 0.2285986840724945, + "learning_rate": 4.7421423809437196e-05, + "loss": 1.8188, + "step": 17299 + }, + { + "epoch": 5.310006138735421, + "grad_norm": 0.22183369100093842, + "learning_rate": 4.741645987663355e-05, + "loss": 
1.7581, + "step": 17300 + }, + { + "epoch": 5.310313075506445, + "grad_norm": 0.22716040909290314, + "learning_rate": 4.741149596936197e-05, + "loss": 1.7438, + "step": 17301 + }, + { + "epoch": 5.310620012277471, + "grad_norm": 0.24641327559947968, + "learning_rate": 4.740653208767148e-05, + "loss": 1.761, + "step": 17302 + }, + { + "epoch": 5.310926949048496, + "grad_norm": 0.28470689058303833, + "learning_rate": 4.7401568231611194e-05, + "loss": 1.7512, + "step": 17303 + }, + { + "epoch": 5.311233885819521, + "grad_norm": 0.23279942572116852, + "learning_rate": 4.739660440123012e-05, + "loss": 1.7797, + "step": 17304 + }, + { + "epoch": 5.311540822590547, + "grad_norm": 0.26397696137428284, + "learning_rate": 4.739164059657731e-05, + "loss": 1.748, + "step": 17305 + }, + { + "epoch": 5.311847759361571, + "grad_norm": 0.25072020292282104, + "learning_rate": 4.7386676817701856e-05, + "loss": 1.7571, + "step": 17306 + }, + { + "epoch": 5.3121546961325965, + "grad_norm": 0.20815810561180115, + "learning_rate": 4.7381713064652774e-05, + "loss": 1.7566, + "step": 17307 + }, + { + "epoch": 5.312461632903622, + "grad_norm": 0.23104289174079895, + "learning_rate": 4.7376749337479174e-05, + "loss": 1.7308, + "step": 17308 + }, + { + "epoch": 5.312768569674647, + "grad_norm": 0.21978867053985596, + "learning_rate": 4.737178563623004e-05, + "loss": 1.7997, + "step": 17309 + }, + { + "epoch": 5.3130755064456725, + "grad_norm": 0.34588614106178284, + "learning_rate": 4.736682196095447e-05, + "loss": 1.8414, + "step": 17310 + }, + { + "epoch": 5.313382443216697, + "grad_norm": 0.3475342094898224, + "learning_rate": 4.73618583117015e-05, + "loss": 1.7823, + "step": 17311 + }, + { + "epoch": 5.313689379987722, + "grad_norm": 0.1965305358171463, + "learning_rate": 4.7356894688520215e-05, + "loss": 1.7597, + "step": 17312 + }, + { + "epoch": 5.313996316758748, + "grad_norm": 0.3035048246383667, + "learning_rate": 4.7351931091459624e-05, + "loss": 1.6803, + "step": 17313 + }, + { + 
"epoch": 5.314303253529773, + "grad_norm": 0.27722910046577454, + "learning_rate": 4.7346967520568827e-05, + "loss": 1.7472, + "step": 17314 + }, + { + "epoch": 5.314610190300798, + "grad_norm": 0.21481415629386902, + "learning_rate": 4.734200397589682e-05, + "loss": 1.7319, + "step": 17315 + }, + { + "epoch": 5.314917127071823, + "grad_norm": 0.2570357918739319, + "learning_rate": 4.733704045749271e-05, + "loss": 1.7392, + "step": 17316 + }, + { + "epoch": 5.315224063842848, + "grad_norm": 0.2404400259256363, + "learning_rate": 4.733207696540551e-05, + "loss": 1.7231, + "step": 17317 + }, + { + "epoch": 5.315531000613873, + "grad_norm": 0.222911074757576, + "learning_rate": 4.732711349968432e-05, + "loss": 1.7584, + "step": 17318 + }, + { + "epoch": 5.315837937384899, + "grad_norm": 0.22908064723014832, + "learning_rate": 4.732215006037813e-05, + "loss": 1.7242, + "step": 17319 + }, + { + "epoch": 5.316144874155924, + "grad_norm": 0.2432398796081543, + "learning_rate": 4.7317186647536044e-05, + "loss": 1.7056, + "step": 17320 + }, + { + "epoch": 5.316451810926949, + "grad_norm": 0.1994420737028122, + "learning_rate": 4.7312223261207086e-05, + "loss": 1.6667, + "step": 17321 + }, + { + "epoch": 5.316758747697974, + "grad_norm": 0.22314350306987762, + "learning_rate": 4.73072599014403e-05, + "loss": 1.7945, + "step": 17322 + }, + { + "epoch": 5.317065684468999, + "grad_norm": 0.2309068888425827, + "learning_rate": 4.730229656828477e-05, + "loss": 1.7099, + "step": 17323 + }, + { + "epoch": 5.3173726212400245, + "grad_norm": 0.22388015687465668, + "learning_rate": 4.729733326178951e-05, + "loss": 1.7053, + "step": 17324 + }, + { + "epoch": 5.31767955801105, + "grad_norm": 0.20203040540218353, + "learning_rate": 4.72923699820036e-05, + "loss": 1.6992, + "step": 17325 + }, + { + "epoch": 5.317986494782075, + "grad_norm": 0.24416297674179077, + "learning_rate": 4.728740672897606e-05, + "loss": 1.7455, + "step": 17326 + }, + { + "epoch": 5.3182934315531, + "grad_norm": 
0.2501862049102783, + "learning_rate": 4.728244350275597e-05, + "loss": 1.7609, + "step": 17327 + }, + { + "epoch": 5.318600368324125, + "grad_norm": 0.21482665836811066, + "learning_rate": 4.727748030339235e-05, + "loss": 1.7614, + "step": 17328 + }, + { + "epoch": 5.31890730509515, + "grad_norm": 0.2241419404745102, + "learning_rate": 4.727251713093429e-05, + "loss": 1.736, + "step": 17329 + }, + { + "epoch": 5.319214241866176, + "grad_norm": 0.1757260262966156, + "learning_rate": 4.726755398543079e-05, + "loss": 1.6646, + "step": 17330 + }, + { + "epoch": 5.319521178637201, + "grad_norm": 0.18697243928909302, + "learning_rate": 4.726259086693095e-05, + "loss": 1.7512, + "step": 17331 + }, + { + "epoch": 5.319828115408226, + "grad_norm": 0.22584228217601776, + "learning_rate": 4.725762777548376e-05, + "loss": 1.7439, + "step": 17332 + }, + { + "epoch": 5.320135052179251, + "grad_norm": 0.18673470616340637, + "learning_rate": 4.725266471113832e-05, + "loss": 1.7007, + "step": 17333 + }, + { + "epoch": 5.320441988950276, + "grad_norm": 0.23030288517475128, + "learning_rate": 4.7247701673943656e-05, + "loss": 1.8021, + "step": 17334 + }, + { + "epoch": 5.320748925721301, + "grad_norm": 0.19333480298519135, + "learning_rate": 4.7242738663948813e-05, + "loss": 1.6659, + "step": 17335 + }, + { + "epoch": 5.321055862492327, + "grad_norm": 0.278097003698349, + "learning_rate": 4.723777568120284e-05, + "loss": 1.7302, + "step": 17336 + }, + { + "epoch": 5.321362799263352, + "grad_norm": 0.2146742343902588, + "learning_rate": 4.72328127257548e-05, + "loss": 1.7644, + "step": 17337 + }, + { + "epoch": 5.3216697360343765, + "grad_norm": 0.25582969188690186, + "learning_rate": 4.722784979765372e-05, + "loss": 1.7872, + "step": 17338 + }, + { + "epoch": 5.321976672805402, + "grad_norm": 0.20411577820777893, + "learning_rate": 4.722288689694864e-05, + "loss": 1.7167, + "step": 17339 + }, + { + "epoch": 5.322283609576427, + "grad_norm": 0.20894703269004822, + "learning_rate": 
4.7217924023688645e-05, + "loss": 1.7526, + "step": 17340 + }, + { + "epoch": 5.3225905463474525, + "grad_norm": 0.20197831094264984, + "learning_rate": 4.721296117792273e-05, + "loss": 1.711, + "step": 17341 + }, + { + "epoch": 5.322897483118478, + "grad_norm": 0.20490549504756927, + "learning_rate": 4.720799835969999e-05, + "loss": 1.7303, + "step": 17342 + }, + { + "epoch": 5.323204419889503, + "grad_norm": 0.20666229724884033, + "learning_rate": 4.720303556906943e-05, + "loss": 1.6738, + "step": 17343 + }, + { + "epoch": 5.323511356660528, + "grad_norm": 0.21899856626987457, + "learning_rate": 4.719807280608011e-05, + "loss": 1.7632, + "step": 17344 + }, + { + "epoch": 5.323818293431553, + "grad_norm": 0.2310410887002945, + "learning_rate": 4.719311007078108e-05, + "loss": 1.7568, + "step": 17345 + }, + { + "epoch": 5.324125230202578, + "grad_norm": 0.20057427883148193, + "learning_rate": 4.7188147363221394e-05, + "loss": 1.6716, + "step": 17346 + }, + { + "epoch": 5.324432166973604, + "grad_norm": 0.21361050009727478, + "learning_rate": 4.718318468345006e-05, + "loss": 1.7224, + "step": 17347 + }, + { + "epoch": 5.324739103744629, + "grad_norm": 0.28389376401901245, + "learning_rate": 4.7178222031516173e-05, + "loss": 1.8519, + "step": 17348 + }, + { + "epoch": 5.3250460405156534, + "grad_norm": 0.2094416618347168, + "learning_rate": 4.717325940746872e-05, + "loss": 1.7763, + "step": 17349 + }, + { + "epoch": 5.325352977286679, + "grad_norm": 0.2263312190771103, + "learning_rate": 4.716829681135681e-05, + "loss": 1.7961, + "step": 17350 + }, + { + "epoch": 5.325659914057704, + "grad_norm": 0.2685631811618805, + "learning_rate": 4.7163334243229417e-05, + "loss": 1.7763, + "step": 17351 + }, + { + "epoch": 5.3259668508287294, + "grad_norm": 0.2029418647289276, + "learning_rate": 4.7158371703135636e-05, + "loss": 1.7662, + "step": 17352 + }, + { + "epoch": 5.326273787599755, + "grad_norm": 0.3109094798564911, + "learning_rate": 4.715340919112447e-05, + "loss": 
1.7064, + "step": 17353 + }, + { + "epoch": 5.326580724370779, + "grad_norm": 0.24679912626743317, + "learning_rate": 4.714844670724502e-05, + "loss": 1.6903, + "step": 17354 + }, + { + "epoch": 5.326887661141805, + "grad_norm": 0.2004890739917755, + "learning_rate": 4.714348425154627e-05, + "loss": 1.7242, + "step": 17355 + }, + { + "epoch": 5.32719459791283, + "grad_norm": 0.27442196011543274, + "learning_rate": 4.7138521824077284e-05, + "loss": 1.826, + "step": 17356 + }, + { + "epoch": 5.327501534683855, + "grad_norm": 0.19933666288852692, + "learning_rate": 4.713355942488711e-05, + "loss": 1.748, + "step": 17357 + }, + { + "epoch": 5.327808471454881, + "grad_norm": 0.2306378185749054, + "learning_rate": 4.712859705402476e-05, + "loss": 1.7426, + "step": 17358 + }, + { + "epoch": 5.328115408225905, + "grad_norm": 0.22484014928340912, + "learning_rate": 4.7123634711539324e-05, + "loss": 1.7355, + "step": 17359 + }, + { + "epoch": 5.32842234499693, + "grad_norm": 0.2501749098300934, + "learning_rate": 4.711867239747979e-05, + "loss": 1.7502, + "step": 17360 + }, + { + "epoch": 5.328729281767956, + "grad_norm": 0.1940663903951645, + "learning_rate": 4.711371011189525e-05, + "loss": 1.7423, + "step": 17361 + }, + { + "epoch": 5.329036218538981, + "grad_norm": 0.28115448355674744, + "learning_rate": 4.71087478548347e-05, + "loss": 1.7134, + "step": 17362 + }, + { + "epoch": 5.329343155310006, + "grad_norm": 0.29717928171157837, + "learning_rate": 4.71037856263472e-05, + "loss": 1.8145, + "step": 17363 + }, + { + "epoch": 5.329650092081032, + "grad_norm": 0.24278375506401062, + "learning_rate": 4.709882342648179e-05, + "loss": 1.689, + "step": 17364 + }, + { + "epoch": 5.329957028852056, + "grad_norm": 0.26382890343666077, + "learning_rate": 4.709386125528751e-05, + "loss": 1.801, + "step": 17365 + }, + { + "epoch": 5.3302639656230815, + "grad_norm": 0.237087219953537, + "learning_rate": 4.708889911281339e-05, + "loss": 1.7019, + "step": 17366 + }, + { + "epoch": 
5.330570902394107, + "grad_norm": 0.21994253993034363, + "learning_rate": 4.7083936999108494e-05, + "loss": 1.707, + "step": 17367 + }, + { + "epoch": 5.330877839165132, + "grad_norm": 0.3028903901576996, + "learning_rate": 4.707897491422182e-05, + "loss": 1.7992, + "step": 17368 + }, + { + "epoch": 5.3311847759361575, + "grad_norm": 0.24991434812545776, + "learning_rate": 4.7074012858202435e-05, + "loss": 1.7894, + "step": 17369 + }, + { + "epoch": 5.331491712707182, + "grad_norm": 0.20631250739097595, + "learning_rate": 4.706905083109936e-05, + "loss": 1.6816, + "step": 17370 + }, + { + "epoch": 5.331798649478207, + "grad_norm": 0.23300573229789734, + "learning_rate": 4.7064088832961666e-05, + "loss": 1.7101, + "step": 17371 + }, + { + "epoch": 5.332105586249233, + "grad_norm": 0.22331316769123077, + "learning_rate": 4.705912686383837e-05, + "loss": 1.861, + "step": 17372 + }, + { + "epoch": 5.332412523020258, + "grad_norm": 0.204593226313591, + "learning_rate": 4.7054164923778485e-05, + "loss": 1.7062, + "step": 17373 + }, + { + "epoch": 5.332719459791283, + "grad_norm": 0.22207681834697723, + "learning_rate": 4.704920301283107e-05, + "loss": 1.7546, + "step": 17374 + }, + { + "epoch": 5.333026396562309, + "grad_norm": 0.2508530020713806, + "learning_rate": 4.7044241131045157e-05, + "loss": 1.7881, + "step": 17375 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.26084616780281067, + "learning_rate": 4.7039279278469804e-05, + "loss": 1.7292, + "step": 17376 + }, + { + "epoch": 5.333640270104358, + "grad_norm": 0.2122940719127655, + "learning_rate": 4.7034317455154006e-05, + "loss": 1.7493, + "step": 17377 + }, + { + "epoch": 5.333947206875384, + "grad_norm": 0.2627449333667755, + "learning_rate": 4.702935566114685e-05, + "loss": 1.759, + "step": 17378 + }, + { + "epoch": 5.334254143646409, + "grad_norm": 0.20637977123260498, + "learning_rate": 4.702439389649732e-05, + "loss": 1.8043, + "step": 17379 + }, + { + "epoch": 5.334561080417434, + "grad_norm": 
0.28783395886421204, + "learning_rate": 4.701943216125447e-05, + "loss": 1.7256, + "step": 17380 + }, + { + "epoch": 5.334868017188459, + "grad_norm": 0.21130618453025818, + "learning_rate": 4.701447045546734e-05, + "loss": 1.7161, + "step": 17381 + }, + { + "epoch": 5.335174953959484, + "grad_norm": 0.2793416678905487, + "learning_rate": 4.7009508779184984e-05, + "loss": 1.7659, + "step": 17382 + }, + { + "epoch": 5.3354818907305095, + "grad_norm": 0.3088020384311676, + "learning_rate": 4.700454713245639e-05, + "loss": 1.6877, + "step": 17383 + }, + { + "epoch": 5.335788827501535, + "grad_norm": 0.19697681069374084, + "learning_rate": 4.6999585515330646e-05, + "loss": 1.7111, + "step": 17384 + }, + { + "epoch": 5.33609576427256, + "grad_norm": 0.29234182834625244, + "learning_rate": 4.699462392785673e-05, + "loss": 1.7136, + "step": 17385 + }, + { + "epoch": 5.336402701043585, + "grad_norm": 0.2593611776828766, + "learning_rate": 4.698966237008371e-05, + "loss": 1.7531, + "step": 17386 + }, + { + "epoch": 5.33670963781461, + "grad_norm": 0.20024444162845612, + "learning_rate": 4.6984700842060604e-05, + "loss": 1.7035, + "step": 17387 + }, + { + "epoch": 5.337016574585635, + "grad_norm": 0.2929787039756775, + "learning_rate": 4.697973934383647e-05, + "loss": 1.7212, + "step": 17388 + }, + { + "epoch": 5.337323511356661, + "grad_norm": 0.2425665408372879, + "learning_rate": 4.697477787546032e-05, + "loss": 1.7191, + "step": 17389 + }, + { + "epoch": 5.337630448127686, + "grad_norm": 0.19175556302070618, + "learning_rate": 4.6969816436981176e-05, + "loss": 1.7291, + "step": 17390 + }, + { + "epoch": 5.337937384898711, + "grad_norm": 0.2602384686470032, + "learning_rate": 4.696485502844809e-05, + "loss": 1.7035, + "step": 17391 + }, + { + "epoch": 5.338244321669736, + "grad_norm": 0.19117408990859985, + "learning_rate": 4.695989364991006e-05, + "loss": 1.707, + "step": 17392 + }, + { + "epoch": 5.338551258440761, + "grad_norm": 0.31086108088493347, + "learning_rate": 
4.6954932301416174e-05, + "loss": 1.7397, + "step": 17393 + }, + { + "epoch": 5.338858195211786, + "grad_norm": 0.27402472496032715, + "learning_rate": 4.694997098301542e-05, + "loss": 1.7144, + "step": 17394 + }, + { + "epoch": 5.339165131982812, + "grad_norm": 0.20345155894756317, + "learning_rate": 4.694500969475685e-05, + "loss": 1.7492, + "step": 17395 + }, + { + "epoch": 5.339472068753837, + "grad_norm": 0.23786045610904694, + "learning_rate": 4.694004843668947e-05, + "loss": 1.7781, + "step": 17396 + }, + { + "epoch": 5.3397790055248615, + "grad_norm": 0.19747424125671387, + "learning_rate": 4.6935087208862335e-05, + "loss": 1.7353, + "step": 17397 + }, + { + "epoch": 5.340085942295887, + "grad_norm": 0.224543035030365, + "learning_rate": 4.693012601132445e-05, + "loss": 1.7229, + "step": 17398 + }, + { + "epoch": 5.340392879066912, + "grad_norm": 0.20840135216712952, + "learning_rate": 4.692516484412488e-05, + "loss": 1.7557, + "step": 17399 + }, + { + "epoch": 5.3406998158379375, + "grad_norm": 0.21019098162651062, + "learning_rate": 4.692020370731261e-05, + "loss": 1.7793, + "step": 17400 + }, + { + "epoch": 5.341006752608963, + "grad_norm": 0.20540091395378113, + "learning_rate": 4.691524260093672e-05, + "loss": 1.6925, + "step": 17401 + }, + { + "epoch": 5.341313689379987, + "grad_norm": 0.2414131462574005, + "learning_rate": 4.691028152504619e-05, + "loss": 1.7706, + "step": 17402 + }, + { + "epoch": 5.341620626151013, + "grad_norm": 0.19627155363559723, + "learning_rate": 4.6905320479690073e-05, + "loss": 1.6356, + "step": 17403 + }, + { + "epoch": 5.341927562922038, + "grad_norm": 0.20978952944278717, + "learning_rate": 4.690035946491741e-05, + "loss": 1.7487, + "step": 17404 + }, + { + "epoch": 5.342234499693063, + "grad_norm": 0.2524566054344177, + "learning_rate": 4.689539848077719e-05, + "loss": 1.7713, + "step": 17405 + }, + { + "epoch": 5.342541436464089, + "grad_norm": 0.1967654973268509, + "learning_rate": 4.689043752731847e-05, + "loss": 
1.7358, + "step": 17406 + }, + { + "epoch": 5.342848373235114, + "grad_norm": 0.2085377424955368, + "learning_rate": 4.688547660459026e-05, + "loss": 1.7104, + "step": 17407 + }, + { + "epoch": 5.343155310006138, + "grad_norm": 0.21294310688972473, + "learning_rate": 4.688051571264161e-05, + "loss": 1.7349, + "step": 17408 + }, + { + "epoch": 5.343462246777164, + "grad_norm": 0.23702891170978546, + "learning_rate": 4.6875554851521514e-05, + "loss": 1.8048, + "step": 17409 + }, + { + "epoch": 5.343769183548189, + "grad_norm": 0.2513964772224426, + "learning_rate": 4.687059402127904e-05, + "loss": 1.6669, + "step": 17410 + }, + { + "epoch": 5.344076120319214, + "grad_norm": 0.259540855884552, + "learning_rate": 4.6865633221963165e-05, + "loss": 1.7763, + "step": 17411 + }, + { + "epoch": 5.34438305709024, + "grad_norm": 0.28354617953300476, + "learning_rate": 4.6860672453622966e-05, + "loss": 1.7912, + "step": 17412 + }, + { + "epoch": 5.344689993861264, + "grad_norm": 0.2503860592842102, + "learning_rate": 4.685571171630742e-05, + "loss": 1.6817, + "step": 17413 + }, + { + "epoch": 5.3449969306322895, + "grad_norm": 0.2317555695772171, + "learning_rate": 4.685075101006558e-05, + "loss": 1.7652, + "step": 17414 + }, + { + "epoch": 5.345303867403315, + "grad_norm": 0.23333363234996796, + "learning_rate": 4.684579033494646e-05, + "loss": 1.722, + "step": 17415 + }, + { + "epoch": 5.34561080417434, + "grad_norm": 0.22507359087467194, + "learning_rate": 4.6840829690999104e-05, + "loss": 1.7522, + "step": 17416 + }, + { + "epoch": 5.3459177409453655, + "grad_norm": 0.2298288643360138, + "learning_rate": 4.6835869078272504e-05, + "loss": 1.7425, + "step": 17417 + }, + { + "epoch": 5.346224677716391, + "grad_norm": 0.2829224765300751, + "learning_rate": 4.683090849681572e-05, + "loss": 1.7798, + "step": 17418 + }, + { + "epoch": 5.346531614487415, + "grad_norm": 0.18153807520866394, + "learning_rate": 4.682594794667773e-05, + "loss": 1.6846, + "step": 17419 + }, + { + 
"epoch": 5.346838551258441, + "grad_norm": 0.24153028428554535, + "learning_rate": 4.6820987427907596e-05, + "loss": 1.7474, + "step": 17420 + }, + { + "epoch": 5.347145488029466, + "grad_norm": 0.2529772222042084, + "learning_rate": 4.681602694055434e-05, + "loss": 1.7465, + "step": 17421 + }, + { + "epoch": 5.347452424800491, + "grad_norm": 0.20414131879806519, + "learning_rate": 4.681106648466696e-05, + "loss": 1.7704, + "step": 17422 + }, + { + "epoch": 5.347759361571517, + "grad_norm": 0.27280452847480774, + "learning_rate": 4.68061060602945e-05, + "loss": 1.791, + "step": 17423 + }, + { + "epoch": 5.348066298342541, + "grad_norm": 0.20767468214035034, + "learning_rate": 4.680114566748595e-05, + "loss": 1.7744, + "step": 17424 + }, + { + "epoch": 5.348373235113566, + "grad_norm": 0.2661697566509247, + "learning_rate": 4.679618530629036e-05, + "loss": 1.7999, + "step": 17425 + }, + { + "epoch": 5.348680171884592, + "grad_norm": 0.23666872084140778, + "learning_rate": 4.679122497675674e-05, + "loss": 1.7204, + "step": 17426 + }, + { + "epoch": 5.348987108655617, + "grad_norm": 0.2688015401363373, + "learning_rate": 4.678626467893414e-05, + "loss": 1.7619, + "step": 17427 + }, + { + "epoch": 5.349294045426642, + "grad_norm": 0.23924420773983002, + "learning_rate": 4.678130441287153e-05, + "loss": 1.7754, + "step": 17428 + }, + { + "epoch": 5.349600982197667, + "grad_norm": 0.25724148750305176, + "learning_rate": 4.677634417861798e-05, + "loss": 1.761, + "step": 17429 + }, + { + "epoch": 5.349907918968692, + "grad_norm": 0.2633780241012573, + "learning_rate": 4.6771383976222464e-05, + "loss": 1.8705, + "step": 17430 + }, + { + "epoch": 5.350214855739718, + "grad_norm": 0.24774575233459473, + "learning_rate": 4.6766423805734036e-05, + "loss": 1.7127, + "step": 17431 + }, + { + "epoch": 5.350521792510743, + "grad_norm": 0.29887545108795166, + "learning_rate": 4.6761463667201695e-05, + "loss": 1.7651, + "step": 17432 + }, + { + "epoch": 5.350828729281768, + 
"grad_norm": 0.2231605499982834, + "learning_rate": 4.6756503560674486e-05, + "loss": 1.7636, + "step": 17433 + }, + { + "epoch": 5.351135666052793, + "grad_norm": 0.27977073192596436, + "learning_rate": 4.675154348620139e-05, + "loss": 1.7108, + "step": 17434 + }, + { + "epoch": 5.351442602823818, + "grad_norm": 0.26866039633750916, + "learning_rate": 4.674658344383146e-05, + "loss": 1.7593, + "step": 17435 + }, + { + "epoch": 5.351749539594843, + "grad_norm": 0.2154620885848999, + "learning_rate": 4.6741623433613685e-05, + "loss": 1.7536, + "step": 17436 + }, + { + "epoch": 5.352056476365869, + "grad_norm": 0.276656836271286, + "learning_rate": 4.673666345559711e-05, + "loss": 1.803, + "step": 17437 + }, + { + "epoch": 5.352363413136894, + "grad_norm": 0.22247640788555145, + "learning_rate": 4.6731703509830744e-05, + "loss": 1.7273, + "step": 17438 + }, + { + "epoch": 5.352670349907919, + "grad_norm": 0.2399090677499771, + "learning_rate": 4.6726743596363574e-05, + "loss": 1.7708, + "step": 17439 + }, + { + "epoch": 5.352977286678944, + "grad_norm": 0.2550101578235626, + "learning_rate": 4.6721783715244674e-05, + "loss": 1.7016, + "step": 17440 + }, + { + "epoch": 5.353284223449969, + "grad_norm": 0.19929546117782593, + "learning_rate": 4.6716823866523e-05, + "loss": 1.7417, + "step": 17441 + }, + { + "epoch": 5.3535911602209945, + "grad_norm": 0.2496672421693802, + "learning_rate": 4.671186405024761e-05, + "loss": 1.72, + "step": 17442 + }, + { + "epoch": 5.35389809699202, + "grad_norm": 0.19827665388584137, + "learning_rate": 4.67069042664675e-05, + "loss": 1.7515, + "step": 17443 + }, + { + "epoch": 5.354205033763045, + "grad_norm": 0.2528775930404663, + "learning_rate": 4.670194451523171e-05, + "loss": 1.7429, + "step": 17444 + }, + { + "epoch": 5.35451197053407, + "grad_norm": 0.19569729268550873, + "learning_rate": 4.6696984796589215e-05, + "loss": 1.7314, + "step": 17445 + }, + { + "epoch": 5.354818907305095, + "grad_norm": 0.21892370283603668, + 
"learning_rate": 4.669202511058908e-05, + "loss": 1.7331, + "step": 17446 + }, + { + "epoch": 5.35512584407612, + "grad_norm": 0.21609409153461456, + "learning_rate": 4.668706545728026e-05, + "loss": 1.7267, + "step": 17447 + }, + { + "epoch": 5.355432780847146, + "grad_norm": 0.2631370425224304, + "learning_rate": 4.668210583671182e-05, + "loss": 1.7513, + "step": 17448 + }, + { + "epoch": 5.355739717618171, + "grad_norm": 0.31327441334724426, + "learning_rate": 4.667714624893274e-05, + "loss": 1.7936, + "step": 17449 + }, + { + "epoch": 5.356046654389196, + "grad_norm": 0.21602430939674377, + "learning_rate": 4.667218669399207e-05, + "loss": 1.7387, + "step": 17450 + }, + { + "epoch": 5.356353591160221, + "grad_norm": 0.2895040214061737, + "learning_rate": 4.6667227171938784e-05, + "loss": 1.7293, + "step": 17451 + }, + { + "epoch": 5.356660527931246, + "grad_norm": 0.35150307416915894, + "learning_rate": 4.666226768282193e-05, + "loss": 1.8215, + "step": 17452 + }, + { + "epoch": 5.356967464702271, + "grad_norm": 0.19034281373023987, + "learning_rate": 4.665730822669048e-05, + "loss": 1.702, + "step": 17453 + }, + { + "epoch": 5.357274401473297, + "grad_norm": 0.25586241483688354, + "learning_rate": 4.6652348803593484e-05, + "loss": 1.7809, + "step": 17454 + }, + { + "epoch": 5.357581338244322, + "grad_norm": 0.23919305205345154, + "learning_rate": 4.6647389413579944e-05, + "loss": 1.7555, + "step": 17455 + }, + { + "epoch": 5.3578882750153465, + "grad_norm": 0.22707165777683258, + "learning_rate": 4.664243005669885e-05, + "loss": 1.7633, + "step": 17456 + }, + { + "epoch": 5.358195211786372, + "grad_norm": 0.20666839182376862, + "learning_rate": 4.663747073299925e-05, + "loss": 1.6522, + "step": 17457 + }, + { + "epoch": 5.358502148557397, + "grad_norm": 0.20557542145252228, + "learning_rate": 4.663251144253012e-05, + "loss": 1.73, + "step": 17458 + }, + { + "epoch": 5.3588090853284225, + "grad_norm": 0.22375571727752686, + "learning_rate": 
4.662755218534049e-05, + "loss": 1.7189, + "step": 17459 + }, + { + "epoch": 5.359116022099448, + "grad_norm": 0.261393278837204, + "learning_rate": 4.662259296147936e-05, + "loss": 1.6863, + "step": 17460 + }, + { + "epoch": 5.359422958870473, + "grad_norm": 0.2279379516839981, + "learning_rate": 4.6617633770995764e-05, + "loss": 1.7332, + "step": 17461 + }, + { + "epoch": 5.359729895641498, + "grad_norm": 0.2194606065750122, + "learning_rate": 4.6612674613938666e-05, + "loss": 1.7324, + "step": 17462 + }, + { + "epoch": 5.360036832412523, + "grad_norm": 0.27714410424232483, + "learning_rate": 4.660771549035713e-05, + "loss": 1.7386, + "step": 17463 + }, + { + "epoch": 5.360343769183548, + "grad_norm": 0.2118787169456482, + "learning_rate": 4.660275640030012e-05, + "loss": 1.7587, + "step": 17464 + }, + { + "epoch": 5.360650705954574, + "grad_norm": 0.2546979784965515, + "learning_rate": 4.6597797343816665e-05, + "loss": 1.7756, + "step": 17465 + }, + { + "epoch": 5.360957642725599, + "grad_norm": 0.194237619638443, + "learning_rate": 4.659283832095577e-05, + "loss": 1.7351, + "step": 17466 + }, + { + "epoch": 5.361264579496623, + "grad_norm": 0.23448583483695984, + "learning_rate": 4.658787933176646e-05, + "loss": 1.7051, + "step": 17467 + }, + { + "epoch": 5.361571516267649, + "grad_norm": 0.22796298563480377, + "learning_rate": 4.65829203762977e-05, + "loss": 1.7395, + "step": 17468 + }, + { + "epoch": 5.361878453038674, + "grad_norm": 0.22674904763698578, + "learning_rate": 4.657796145459855e-05, + "loss": 1.714, + "step": 17469 + }, + { + "epoch": 5.362185389809699, + "grad_norm": 0.2697311341762543, + "learning_rate": 4.657300256671797e-05, + "loss": 1.8271, + "step": 17470 + }, + { + "epoch": 5.362492326580725, + "grad_norm": 0.28040480613708496, + "learning_rate": 4.6568043712705004e-05, + "loss": 1.8192, + "step": 17471 + }, + { + "epoch": 5.362799263351749, + "grad_norm": 0.21100232005119324, + "learning_rate": 4.6563084892608644e-05, + "loss": 1.7285, + 
"step": 17472 + }, + { + "epoch": 5.3631062001227745, + "grad_norm": 0.23545897006988525, + "learning_rate": 4.655812610647787e-05, + "loss": 1.7302, + "step": 17473 + }, + { + "epoch": 5.3634131368938, + "grad_norm": 0.23278315365314484, + "learning_rate": 4.655316735436174e-05, + "loss": 1.7749, + "step": 17474 + }, + { + "epoch": 5.363720073664825, + "grad_norm": 0.333763986825943, + "learning_rate": 4.65482086363092e-05, + "loss": 1.7393, + "step": 17475 + }, + { + "epoch": 5.3640270104358505, + "grad_norm": 0.2743878662586212, + "learning_rate": 4.6543249952369306e-05, + "loss": 1.7274, + "step": 17476 + }, + { + "epoch": 5.364333947206875, + "grad_norm": 0.234402596950531, + "learning_rate": 4.6538291302591024e-05, + "loss": 1.7848, + "step": 17477 + }, + { + "epoch": 5.3646408839779, + "grad_norm": 0.29100897908210754, + "learning_rate": 4.65333326870234e-05, + "loss": 1.7698, + "step": 17478 + }, + { + "epoch": 5.364947820748926, + "grad_norm": 0.24178378283977509, + "learning_rate": 4.652837410571539e-05, + "loss": 1.8142, + "step": 17479 + }, + { + "epoch": 5.365254757519951, + "grad_norm": 0.4189155101776123, + "learning_rate": 4.652341555871605e-05, + "loss": 1.7435, + "step": 17480 + }, + { + "epoch": 5.365561694290976, + "grad_norm": 0.40106773376464844, + "learning_rate": 4.651845704607433e-05, + "loss": 1.837, + "step": 17481 + }, + { + "epoch": 5.365868631062002, + "grad_norm": 0.24127443134784698, + "learning_rate": 4.651349856783927e-05, + "loss": 1.7257, + "step": 17482 + }, + { + "epoch": 5.366175567833026, + "grad_norm": 0.412812739610672, + "learning_rate": 4.650854012405985e-05, + "loss": 1.762, + "step": 17483 + }, + { + "epoch": 5.366482504604051, + "grad_norm": 0.2636469602584839, + "learning_rate": 4.65035817147851e-05, + "loss": 1.7995, + "step": 17484 + }, + { + "epoch": 5.366789441375077, + "grad_norm": 0.282186895608902, + "learning_rate": 4.649862334006399e-05, + "loss": 1.75, + "step": 17485 + }, + { + "epoch": 5.367096378146102, + 
"grad_norm": 0.3280154764652252, + "learning_rate": 4.649366499994555e-05, + "loss": 1.7668, + "step": 17486 + }, + { + "epoch": 5.367403314917127, + "grad_norm": 0.24608035385608673, + "learning_rate": 4.648870669447875e-05, + "loss": 1.8332, + "step": 17487 + }, + { + "epoch": 5.367710251688152, + "grad_norm": 0.21927174925804138, + "learning_rate": 4.648374842371262e-05, + "loss": 1.7365, + "step": 17488 + }, + { + "epoch": 5.368017188459177, + "grad_norm": 0.2658425569534302, + "learning_rate": 4.6478790187696164e-05, + "loss": 1.841, + "step": 17489 + }, + { + "epoch": 5.3683241252302025, + "grad_norm": 0.2302858531475067, + "learning_rate": 4.647383198647834e-05, + "loss": 1.7882, + "step": 17490 + }, + { + "epoch": 5.368631062001228, + "grad_norm": 0.2562740743160248, + "learning_rate": 4.64688738201082e-05, + "loss": 1.7188, + "step": 17491 + }, + { + "epoch": 5.368937998772253, + "grad_norm": 0.28140220046043396, + "learning_rate": 4.646391568863469e-05, + "loss": 1.7482, + "step": 17492 + }, + { + "epoch": 5.3692449355432785, + "grad_norm": 0.21040008962154388, + "learning_rate": 4.6458957592106855e-05, + "loss": 1.7695, + "step": 17493 + }, + { + "epoch": 5.369551872314303, + "grad_norm": 0.25322291254997253, + "learning_rate": 4.645399953057367e-05, + "loss": 1.7127, + "step": 17494 + }, + { + "epoch": 5.369858809085328, + "grad_norm": 0.2239738404750824, + "learning_rate": 4.644904150408415e-05, + "loss": 1.7376, + "step": 17495 + }, + { + "epoch": 5.370165745856354, + "grad_norm": 0.21432901918888092, + "learning_rate": 4.644408351268727e-05, + "loss": 1.7156, + "step": 17496 + }, + { + "epoch": 5.370472682627379, + "grad_norm": 0.3057272732257843, + "learning_rate": 4.643912555643205e-05, + "loss": 1.7706, + "step": 17497 + }, + { + "epoch": 5.370779619398404, + "grad_norm": 0.2826928496360779, + "learning_rate": 4.643416763536748e-05, + "loss": 1.8298, + "step": 17498 + }, + { + "epoch": 5.371086556169429, + "grad_norm": 0.2395278513431549, + 
"learning_rate": 4.642920974954255e-05, + "loss": 1.7357, + "step": 17499 + }, + { + "epoch": 5.371393492940454, + "grad_norm": 0.21004743874073029, + "learning_rate": 4.642425189900626e-05, + "loss": 1.7263, + "step": 17500 + }, + { + "epoch": 5.371700429711479, + "grad_norm": 0.23981697857379913, + "learning_rate": 4.641929408380761e-05, + "loss": 1.7341, + "step": 17501 + }, + { + "epoch": 5.372007366482505, + "grad_norm": 0.1984727531671524, + "learning_rate": 4.641433630399559e-05, + "loss": 1.7133, + "step": 17502 + }, + { + "epoch": 5.37231430325353, + "grad_norm": 0.22153446078300476, + "learning_rate": 4.640937855961922e-05, + "loss": 1.8028, + "step": 17503 + }, + { + "epoch": 5.3726212400245545, + "grad_norm": 0.24257974326610565, + "learning_rate": 4.6404420850727455e-05, + "loss": 1.7842, + "step": 17504 + }, + { + "epoch": 5.37292817679558, + "grad_norm": 0.19444705545902252, + "learning_rate": 4.6399463177369316e-05, + "loss": 1.7296, + "step": 17505 + }, + { + "epoch": 5.373235113566605, + "grad_norm": 0.2068849354982376, + "learning_rate": 4.6394505539593806e-05, + "loss": 1.6949, + "step": 17506 + }, + { + "epoch": 5.3735420503376305, + "grad_norm": 0.21762309968471527, + "learning_rate": 4.638954793744989e-05, + "loss": 1.7556, + "step": 17507 + }, + { + "epoch": 5.373848987108656, + "grad_norm": 0.20791584253311157, + "learning_rate": 4.638459037098659e-05, + "loss": 1.7442, + "step": 17508 + }, + { + "epoch": 5.37415592387968, + "grad_norm": 0.27774497866630554, + "learning_rate": 4.6379632840252875e-05, + "loss": 1.7834, + "step": 17509 + }, + { + "epoch": 5.374462860650706, + "grad_norm": 0.24211421608924866, + "learning_rate": 4.637467534529775e-05, + "loss": 1.819, + "step": 17510 + }, + { + "epoch": 5.374769797421731, + "grad_norm": 0.24857789278030396, + "learning_rate": 4.636971788617022e-05, + "loss": 1.7483, + "step": 17511 + }, + { + "epoch": 5.375076734192756, + "grad_norm": 0.25142937898635864, + "learning_rate": 
4.636476046291925e-05, + "loss": 1.7405, + "step": 17512 + }, + { + "epoch": 5.375383670963782, + "grad_norm": 0.25860801339149475, + "learning_rate": 4.6359803075593846e-05, + "loss": 1.7821, + "step": 17513 + }, + { + "epoch": 5.375690607734807, + "grad_norm": 0.25223109126091003, + "learning_rate": 4.635484572424302e-05, + "loss": 1.738, + "step": 17514 + }, + { + "epoch": 5.3759975445058314, + "grad_norm": 0.22931768000125885, + "learning_rate": 4.634988840891573e-05, + "loss": 1.7717, + "step": 17515 + }, + { + "epoch": 5.376304481276857, + "grad_norm": 0.21371231973171234, + "learning_rate": 4.6344931129661e-05, + "loss": 1.7741, + "step": 17516 + }, + { + "epoch": 5.376611418047882, + "grad_norm": 0.2653632164001465, + "learning_rate": 4.633997388652778e-05, + "loss": 1.7548, + "step": 17517 + }, + { + "epoch": 5.3769183548189075, + "grad_norm": 0.2559951841831207, + "learning_rate": 4.6335016679565094e-05, + "loss": 1.7833, + "step": 17518 + }, + { + "epoch": 5.377225291589933, + "grad_norm": 0.22560031712055206, + "learning_rate": 4.6330059508821914e-05, + "loss": 1.6929, + "step": 17519 + }, + { + "epoch": 5.377532228360957, + "grad_norm": 0.3084852695465088, + "learning_rate": 4.6325102374347255e-05, + "loss": 1.8107, + "step": 17520 + }, + { + "epoch": 5.377839165131983, + "grad_norm": 0.3329267203807831, + "learning_rate": 4.632014527619007e-05, + "loss": 1.6791, + "step": 17521 + }, + { + "epoch": 5.378146101903008, + "grad_norm": 0.26274019479751587, + "learning_rate": 4.631518821439939e-05, + "loss": 1.7187, + "step": 17522 + }, + { + "epoch": 5.378453038674033, + "grad_norm": 0.3769492208957672, + "learning_rate": 4.6310231189024165e-05, + "loss": 1.8366, + "step": 17523 + }, + { + "epoch": 5.378759975445059, + "grad_norm": 0.2503921687602997, + "learning_rate": 4.6305274200113385e-05, + "loss": 1.7281, + "step": 17524 + }, + { + "epoch": 5.379066912216084, + "grad_norm": 0.26305708289146423, + "learning_rate": 4.6300317247716074e-05, + "loss": 
1.7231, + "step": 17525 + }, + { + "epoch": 5.379373848987108, + "grad_norm": 0.31899142265319824, + "learning_rate": 4.629536033188118e-05, + "loss": 1.8025, + "step": 17526 + }, + { + "epoch": 5.379680785758134, + "grad_norm": 0.21400104463100433, + "learning_rate": 4.629040345265772e-05, + "loss": 1.7481, + "step": 17527 + }, + { + "epoch": 5.379987722529159, + "grad_norm": 0.23147371411323547, + "learning_rate": 4.628544661009465e-05, + "loss": 1.7049, + "step": 17528 + }, + { + "epoch": 5.380294659300184, + "grad_norm": 0.21156759560108185, + "learning_rate": 4.628048980424099e-05, + "loss": 1.806, + "step": 17529 + }, + { + "epoch": 5.38060159607121, + "grad_norm": 0.22061556577682495, + "learning_rate": 4.6275533035145685e-05, + "loss": 1.7606, + "step": 17530 + }, + { + "epoch": 5.380908532842234, + "grad_norm": 0.23379987478256226, + "learning_rate": 4.6270576302857774e-05, + "loss": 1.7874, + "step": 17531 + }, + { + "epoch": 5.3812154696132595, + "grad_norm": 0.24738669395446777, + "learning_rate": 4.62656196074262e-05, + "loss": 1.7611, + "step": 17532 + }, + { + "epoch": 5.381522406384285, + "grad_norm": 0.19738905131816864, + "learning_rate": 4.6260662948899974e-05, + "loss": 1.7375, + "step": 17533 + }, + { + "epoch": 5.38182934315531, + "grad_norm": 0.2327810823917389, + "learning_rate": 4.6255706327328044e-05, + "loss": 1.7188, + "step": 17534 + }, + { + "epoch": 5.3821362799263355, + "grad_norm": 0.18944145739078522, + "learning_rate": 4.625074974275944e-05, + "loss": 1.6672, + "step": 17535 + }, + { + "epoch": 5.382443216697361, + "grad_norm": 0.20943734049797058, + "learning_rate": 4.624579319524311e-05, + "loss": 1.7238, + "step": 17536 + }, + { + "epoch": 5.382750153468385, + "grad_norm": 0.2060960829257965, + "learning_rate": 4.6240836684828074e-05, + "loss": 1.744, + "step": 17537 + }, + { + "epoch": 5.383057090239411, + "grad_norm": 0.19089816510677338, + "learning_rate": 4.6235880211563264e-05, + "loss": 1.6884, + "step": 17538 + }, + { + 
"epoch": 5.383364027010436, + "grad_norm": 0.22362665832042694, + "learning_rate": 4.623092377549772e-05, + "loss": 1.7076, + "step": 17539 + }, + { + "epoch": 5.383670963781461, + "grad_norm": 0.19429968297481537, + "learning_rate": 4.622596737668039e-05, + "loss": 1.7315, + "step": 17540 + }, + { + "epoch": 5.383977900552487, + "grad_norm": 0.20481903851032257, + "learning_rate": 4.622101101516024e-05, + "loss": 1.711, + "step": 17541 + }, + { + "epoch": 5.384284837323511, + "grad_norm": 0.19181163609027863, + "learning_rate": 4.6216054690986304e-05, + "loss": 1.6879, + "step": 17542 + }, + { + "epoch": 5.384591774094536, + "grad_norm": 0.23105846345424652, + "learning_rate": 4.6211098404207514e-05, + "loss": 1.7797, + "step": 17543 + }, + { + "epoch": 5.384898710865562, + "grad_norm": 0.2742008864879608, + "learning_rate": 4.6206142154872886e-05, + "loss": 1.7404, + "step": 17544 + }, + { + "epoch": 5.385205647636587, + "grad_norm": 0.2256750613451004, + "learning_rate": 4.6201185943031365e-05, + "loss": 1.7616, + "step": 17545 + }, + { + "epoch": 5.385512584407612, + "grad_norm": 0.23230868577957153, + "learning_rate": 4.6196229768731964e-05, + "loss": 1.7457, + "step": 17546 + }, + { + "epoch": 5.385819521178637, + "grad_norm": 0.2200126200914383, + "learning_rate": 4.6191273632023634e-05, + "loss": 1.7835, + "step": 17547 + }, + { + "epoch": 5.386126457949662, + "grad_norm": 0.21903863549232483, + "learning_rate": 4.6186317532955395e-05, + "loss": 1.7315, + "step": 17548 + }, + { + "epoch": 5.3864333947206875, + "grad_norm": 0.1915556788444519, + "learning_rate": 4.6181361471576186e-05, + "loss": 1.6786, + "step": 17549 + }, + { + "epoch": 5.386740331491713, + "grad_norm": 0.20177799463272095, + "learning_rate": 4.617640544793501e-05, + "loss": 1.7453, + "step": 17550 + }, + { + "epoch": 5.387047268262738, + "grad_norm": 0.2598256766796112, + "learning_rate": 4.617144946208083e-05, + "loss": 1.7931, + "step": 17551 + }, + { + "epoch": 5.387354205033763, + 
"grad_norm": 0.2357153594493866, + "learning_rate": 4.616649351406263e-05, + "loss": 1.7932, + "step": 17552 + }, + { + "epoch": 5.387661141804788, + "grad_norm": 0.2228964865207672, + "learning_rate": 4.616153760392938e-05, + "loss": 1.7725, + "step": 17553 + }, + { + "epoch": 5.387968078575813, + "grad_norm": 0.20811811089515686, + "learning_rate": 4.6156581731730085e-05, + "loss": 1.744, + "step": 17554 + }, + { + "epoch": 5.388275015346839, + "grad_norm": 0.20008429884910583, + "learning_rate": 4.615162589751369e-05, + "loss": 1.6973, + "step": 17555 + }, + { + "epoch": 5.388581952117864, + "grad_norm": 0.20487523078918457, + "learning_rate": 4.614667010132919e-05, + "loss": 1.7712, + "step": 17556 + }, + { + "epoch": 5.388888888888889, + "grad_norm": 0.21279677748680115, + "learning_rate": 4.6141714343225554e-05, + "loss": 1.7783, + "step": 17557 + }, + { + "epoch": 5.389195825659914, + "grad_norm": 0.28035736083984375, + "learning_rate": 4.613675862325174e-05, + "loss": 1.767, + "step": 17558 + }, + { + "epoch": 5.389502762430939, + "grad_norm": 0.27426794171333313, + "learning_rate": 4.613180294145677e-05, + "loss": 1.7909, + "step": 17559 + }, + { + "epoch": 5.389809699201964, + "grad_norm": 0.22420327365398407, + "learning_rate": 4.612684729788957e-05, + "loss": 1.6902, + "step": 17560 + }, + { + "epoch": 5.39011663597299, + "grad_norm": 0.19799382984638214, + "learning_rate": 4.612189169259915e-05, + "loss": 1.7276, + "step": 17561 + }, + { + "epoch": 5.390423572744015, + "grad_norm": 0.2508823573589325, + "learning_rate": 4.611693612563445e-05, + "loss": 1.7445, + "step": 17562 + }, + { + "epoch": 5.3907305095150395, + "grad_norm": 0.20835694670677185, + "learning_rate": 4.611198059704448e-05, + "loss": 1.696, + "step": 17563 + }, + { + "epoch": 5.391037446286065, + "grad_norm": 0.22136010229587555, + "learning_rate": 4.6107025106878176e-05, + "loss": 1.7701, + "step": 17564 + }, + { + "epoch": 5.39134438305709, + "grad_norm": 0.23835612833499908, + 
"learning_rate": 4.610206965518456e-05, + "loss": 1.7494, + "step": 17565 + }, + { + "epoch": 5.3916513198281155, + "grad_norm": 0.26142916083335876, + "learning_rate": 4.6097114242012554e-05, + "loss": 1.7616, + "step": 17566 + }, + { + "epoch": 5.391958256599141, + "grad_norm": 0.3366851806640625, + "learning_rate": 4.6092158867411175e-05, + "loss": 1.7409, + "step": 17567 + }, + { + "epoch": 5.392265193370166, + "grad_norm": 0.2592991292476654, + "learning_rate": 4.608720353142935e-05, + "loss": 1.7469, + "step": 17568 + }, + { + "epoch": 5.392572130141191, + "grad_norm": 0.25810322165489197, + "learning_rate": 4.608224823411608e-05, + "loss": 1.7345, + "step": 17569 + }, + { + "epoch": 5.392879066912216, + "grad_norm": 0.26776888966560364, + "learning_rate": 4.607729297552032e-05, + "loss": 1.7698, + "step": 17570 + }, + { + "epoch": 5.393186003683241, + "grad_norm": 0.21023939549922943, + "learning_rate": 4.607233775569107e-05, + "loss": 1.7681, + "step": 17571 + }, + { + "epoch": 5.393492940454267, + "grad_norm": 0.24452096223831177, + "learning_rate": 4.6067382574677265e-05, + "loss": 1.8154, + "step": 17572 + }, + { + "epoch": 5.393799877225292, + "grad_norm": 0.27084338665008545, + "learning_rate": 4.606242743252791e-05, + "loss": 1.7106, + "step": 17573 + }, + { + "epoch": 5.394106813996316, + "grad_norm": 0.24783825874328613, + "learning_rate": 4.605747232929195e-05, + "loss": 1.713, + "step": 17574 + }, + { + "epoch": 5.394413750767342, + "grad_norm": 0.2528151869773865, + "learning_rate": 4.6052517265018333e-05, + "loss": 1.8475, + "step": 17575 + }, + { + "epoch": 5.394720687538367, + "grad_norm": 0.24361065030097961, + "learning_rate": 4.604756223975609e-05, + "loss": 1.7414, + "step": 17576 + }, + { + "epoch": 5.395027624309392, + "grad_norm": 0.2751234769821167, + "learning_rate": 4.604260725355412e-05, + "loss": 1.7603, + "step": 17577 + }, + { + "epoch": 5.395334561080418, + "grad_norm": 0.23183637857437134, + "learning_rate": 
4.603765230646146e-05, + "loss": 1.7053, + "step": 17578 + }, + { + "epoch": 5.395641497851442, + "grad_norm": 0.27462145686149597, + "learning_rate": 4.6032697398527005e-05, + "loss": 1.746, + "step": 17579 + }, + { + "epoch": 5.3959484346224675, + "grad_norm": 0.3665321171283722, + "learning_rate": 4.602774252979978e-05, + "loss": 1.6883, + "step": 17580 + }, + { + "epoch": 5.396255371393493, + "grad_norm": 0.22438424825668335, + "learning_rate": 4.602278770032872e-05, + "loss": 1.7473, + "step": 17581 + }, + { + "epoch": 5.396562308164518, + "grad_norm": 0.38713687658309937, + "learning_rate": 4.601783291016282e-05, + "loss": 1.7993, + "step": 17582 + }, + { + "epoch": 5.3968692449355435, + "grad_norm": 0.3399868905544281, + "learning_rate": 4.6012878159351015e-05, + "loss": 1.7709, + "step": 17583 + }, + { + "epoch": 5.397176181706568, + "grad_norm": 0.21916119754314423, + "learning_rate": 4.60079234479423e-05, + "loss": 1.7351, + "step": 17584 + }, + { + "epoch": 5.397483118477593, + "grad_norm": 0.3796394467353821, + "learning_rate": 4.600296877598561e-05, + "loss": 1.7534, + "step": 17585 + }, + { + "epoch": 5.397790055248619, + "grad_norm": 0.27824562788009644, + "learning_rate": 4.599801414352993e-05, + "loss": 1.6962, + "step": 17586 + }, + { + "epoch": 5.398096992019644, + "grad_norm": 0.21037112176418304, + "learning_rate": 4.599305955062421e-05, + "loss": 1.7062, + "step": 17587 + }, + { + "epoch": 5.398403928790669, + "grad_norm": 0.3373035192489624, + "learning_rate": 4.598810499731745e-05, + "loss": 1.8263, + "step": 17588 + }, + { + "epoch": 5.398710865561695, + "grad_norm": 0.2560507357120514, + "learning_rate": 4.5983150483658564e-05, + "loss": 1.7232, + "step": 17589 + }, + { + "epoch": 5.399017802332719, + "grad_norm": 0.23010993003845215, + "learning_rate": 4.5978196009696564e-05, + "loss": 1.805, + "step": 17590 + }, + { + "epoch": 5.399324739103744, + "grad_norm": 0.32955634593963623, + "learning_rate": 4.597324157548037e-05, + "loss": 
1.7018, + "step": 17591 + }, + { + "epoch": 5.39963167587477, + "grad_norm": 0.2534363865852356, + "learning_rate": 4.5968287181058953e-05, + "loss": 1.6919, + "step": 17592 + }, + { + "epoch": 5.399938612645795, + "grad_norm": 0.23179130256175995, + "learning_rate": 4.5963332826481314e-05, + "loss": 1.7237, + "step": 17593 + }, + { + "epoch": 5.4002455494168204, + "grad_norm": 0.37712663412094116, + "learning_rate": 4.5958378511796365e-05, + "loss": 1.7694, + "step": 17594 + }, + { + "epoch": 5.400552486187845, + "grad_norm": 0.21228717267513275, + "learning_rate": 4.59534242370531e-05, + "loss": 1.7528, + "step": 17595 + }, + { + "epoch": 5.40085942295887, + "grad_norm": 0.2818812429904938, + "learning_rate": 4.5948470002300454e-05, + "loss": 1.8214, + "step": 17596 + }, + { + "epoch": 5.401166359729896, + "grad_norm": 0.24916675686836243, + "learning_rate": 4.5943515807587415e-05, + "loss": 1.7792, + "step": 17597 + }, + { + "epoch": 5.401473296500921, + "grad_norm": 0.2096913456916809, + "learning_rate": 4.593856165296291e-05, + "loss": 1.6983, + "step": 17598 + }, + { + "epoch": 5.401780233271946, + "grad_norm": 0.271124005317688, + "learning_rate": 4.593360753847595e-05, + "loss": 1.7534, + "step": 17599 + }, + { + "epoch": 5.402087170042972, + "grad_norm": 0.24798092246055603, + "learning_rate": 4.5928653464175435e-05, + "loss": 1.7783, + "step": 17600 + }, + { + "epoch": 5.402394106813996, + "grad_norm": 0.3531748056411743, + "learning_rate": 4.592369943011038e-05, + "loss": 1.7834, + "step": 17601 + }, + { + "epoch": 5.402701043585021, + "grad_norm": 0.29650232195854187, + "learning_rate": 4.591874543632969e-05, + "loss": 1.7186, + "step": 17602 + }, + { + "epoch": 5.403007980356047, + "grad_norm": 0.25578248500823975, + "learning_rate": 4.591379148288236e-05, + "loss": 1.7849, + "step": 17603 + }, + { + "epoch": 5.403314917127072, + "grad_norm": 0.3790532946586609, + "learning_rate": 4.590883756981733e-05, + "loss": 1.7192, + "step": 17604 + }, + { + 
"epoch": 5.403621853898097, + "grad_norm": 0.23684249818325043, + "learning_rate": 4.590388369718359e-05, + "loss": 1.7171, + "step": 17605 + }, + { + "epoch": 5.403928790669122, + "grad_norm": 0.267702579498291, + "learning_rate": 4.589892986503005e-05, + "loss": 1.7181, + "step": 17606 + }, + { + "epoch": 5.404235727440147, + "grad_norm": 0.29105648398399353, + "learning_rate": 4.5893976073405704e-05, + "loss": 1.7395, + "step": 17607 + }, + { + "epoch": 5.4045426642111725, + "grad_norm": 0.2266589254140854, + "learning_rate": 4.588902232235949e-05, + "loss": 1.7244, + "step": 17608 + }, + { + "epoch": 5.404849600982198, + "grad_norm": 0.24065524339675903, + "learning_rate": 4.588406861194035e-05, + "loss": 1.7398, + "step": 17609 + }, + { + "epoch": 5.405156537753223, + "grad_norm": 0.23166650533676147, + "learning_rate": 4.587911494219728e-05, + "loss": 1.7592, + "step": 17610 + }, + { + "epoch": 5.4054634745242485, + "grad_norm": 0.19882038235664368, + "learning_rate": 4.5874161313179186e-05, + "loss": 1.7087, + "step": 17611 + }, + { + "epoch": 5.405770411295273, + "grad_norm": 0.2688273787498474, + "learning_rate": 4.5869207724935076e-05, + "loss": 1.7791, + "step": 17612 + }, + { + "epoch": 5.406077348066298, + "grad_norm": 0.1970982402563095, + "learning_rate": 4.5864254177513855e-05, + "loss": 1.7079, + "step": 17613 + }, + { + "epoch": 5.406384284837324, + "grad_norm": 0.2531265318393707, + "learning_rate": 4.585930067096451e-05, + "loss": 1.716, + "step": 17614 + }, + { + "epoch": 5.406691221608349, + "grad_norm": 0.2610352337360382, + "learning_rate": 4.585434720533596e-05, + "loss": 1.7133, + "step": 17615 + }, + { + "epoch": 5.406998158379374, + "grad_norm": 0.2420870065689087, + "learning_rate": 4.5849393780677216e-05, + "loss": 1.7044, + "step": 17616 + }, + { + "epoch": 5.407305095150399, + "grad_norm": 0.24078647792339325, + "learning_rate": 4.584444039703717e-05, + "loss": 1.7486, + "step": 17617 + }, + { + "epoch": 5.407612031921424, + 
"grad_norm": 0.19324539601802826, + "learning_rate": 4.583948705446481e-05, + "loss": 1.7439, + "step": 17618 + }, + { + "epoch": 5.407918968692449, + "grad_norm": 0.2311750054359436, + "learning_rate": 4.5834533753009065e-05, + "loss": 1.7794, + "step": 17619 + }, + { + "epoch": 5.408225905463475, + "grad_norm": 0.2554466128349304, + "learning_rate": 4.5829580492718914e-05, + "loss": 1.7146, + "step": 17620 + }, + { + "epoch": 5.4085328422345, + "grad_norm": 0.2679688334465027, + "learning_rate": 4.582462727364328e-05, + "loss": 1.7677, + "step": 17621 + }, + { + "epoch": 5.4088397790055245, + "grad_norm": 0.19292913377285004, + "learning_rate": 4.5819674095831146e-05, + "loss": 1.7544, + "step": 17622 + }, + { + "epoch": 5.40914671577655, + "grad_norm": 0.2146623730659485, + "learning_rate": 4.5814720959331425e-05, + "loss": 1.7182, + "step": 17623 + }, + { + "epoch": 5.409453652547575, + "grad_norm": 0.23098216950893402, + "learning_rate": 4.5809767864193096e-05, + "loss": 1.6844, + "step": 17624 + }, + { + "epoch": 5.4097605893186005, + "grad_norm": 0.22482910752296448, + "learning_rate": 4.5804814810465096e-05, + "loss": 1.7921, + "step": 17625 + }, + { + "epoch": 5.410067526089626, + "grad_norm": 0.22098569571971893, + "learning_rate": 4.579986179819636e-05, + "loss": 1.7419, + "step": 17626 + }, + { + "epoch": 5.41037446286065, + "grad_norm": 0.2131706178188324, + "learning_rate": 4.579490882743588e-05, + "loss": 1.7587, + "step": 17627 + }, + { + "epoch": 5.410681399631676, + "grad_norm": 0.22448734939098358, + "learning_rate": 4.578995589823254e-05, + "loss": 1.6959, + "step": 17628 + }, + { + "epoch": 5.410988336402701, + "grad_norm": 0.22372964024543762, + "learning_rate": 4.578500301063536e-05, + "loss": 1.7462, + "step": 17629 + }, + { + "epoch": 5.411295273173726, + "grad_norm": 0.22140730917453766, + "learning_rate": 4.578005016469322e-05, + "loss": 1.8348, + "step": 17630 + }, + { + "epoch": 5.411602209944752, + "grad_norm": 0.21697622537612915, + 
"learning_rate": 4.577509736045511e-05, + "loss": 1.7634, + "step": 17631 + }, + { + "epoch": 5.411909146715777, + "grad_norm": 0.2044363021850586, + "learning_rate": 4.5770144597969954e-05, + "loss": 1.7095, + "step": 17632 + }, + { + "epoch": 5.412216083486801, + "grad_norm": 0.1910451501607895, + "learning_rate": 4.576519187728674e-05, + "loss": 1.7022, + "step": 17633 + }, + { + "epoch": 5.412523020257827, + "grad_norm": 0.21787554025650024, + "learning_rate": 4.576023919845434e-05, + "loss": 1.7206, + "step": 17634 + }, + { + "epoch": 5.412829957028852, + "grad_norm": 0.2363428920507431, + "learning_rate": 4.575528656152178e-05, + "loss": 1.8052, + "step": 17635 + }, + { + "epoch": 5.413136893799877, + "grad_norm": 0.22830195724964142, + "learning_rate": 4.575033396653793e-05, + "loss": 1.7432, + "step": 17636 + }, + { + "epoch": 5.413443830570903, + "grad_norm": 0.24867239594459534, + "learning_rate": 4.5745381413551794e-05, + "loss": 1.7011, + "step": 17637 + }, + { + "epoch": 5.413750767341927, + "grad_norm": 0.19329775869846344, + "learning_rate": 4.574042890261228e-05, + "loss": 1.7749, + "step": 17638 + }, + { + "epoch": 5.4140577041129525, + "grad_norm": 0.22917115688323975, + "learning_rate": 4.573547643376836e-05, + "loss": 1.7478, + "step": 17639 + }, + { + "epoch": 5.414364640883978, + "grad_norm": 0.23882724344730377, + "learning_rate": 4.573052400706894e-05, + "loss": 1.7396, + "step": 17640 + }, + { + "epoch": 5.414671577655003, + "grad_norm": 0.19127070903778076, + "learning_rate": 4.572557162256301e-05, + "loss": 1.6791, + "step": 17641 + }, + { + "epoch": 5.4149785144260285, + "grad_norm": 0.18385560810565948, + "learning_rate": 4.5720619280299475e-05, + "loss": 1.7288, + "step": 17642 + }, + { + "epoch": 5.415285451197054, + "grad_norm": 0.19845189154148102, + "learning_rate": 4.571566698032728e-05, + "loss": 1.7525, + "step": 17643 + }, + { + "epoch": 5.415592387968078, + "grad_norm": 0.18987210094928741, + "learning_rate": 
4.571071472269539e-05, + "loss": 1.7253, + "step": 17644 + }, + { + "epoch": 5.415899324739104, + "grad_norm": 0.18257199227809906, + "learning_rate": 4.570576250745271e-05, + "loss": 1.7051, + "step": 17645 + }, + { + "epoch": 5.416206261510129, + "grad_norm": 0.22803467512130737, + "learning_rate": 4.570081033464823e-05, + "loss": 1.7478, + "step": 17646 + }, + { + "epoch": 5.416513198281154, + "grad_norm": 0.18763841688632965, + "learning_rate": 4.569585820433084e-05, + "loss": 1.7316, + "step": 17647 + }, + { + "epoch": 5.41682013505218, + "grad_norm": 0.23974654078483582, + "learning_rate": 4.56909061165495e-05, + "loss": 1.7566, + "step": 17648 + }, + { + "epoch": 5.417127071823204, + "grad_norm": 0.24336253106594086, + "learning_rate": 4.568595407135315e-05, + "loss": 1.7468, + "step": 17649 + }, + { + "epoch": 5.417434008594229, + "grad_norm": 0.23891226947307587, + "learning_rate": 4.5681002068790755e-05, + "loss": 1.7201, + "step": 17650 + }, + { + "epoch": 5.417740945365255, + "grad_norm": 0.19209685921669006, + "learning_rate": 4.56760501089112e-05, + "loss": 1.713, + "step": 17651 + }, + { + "epoch": 5.41804788213628, + "grad_norm": 0.2407880276441574, + "learning_rate": 4.567109819176349e-05, + "loss": 1.7073, + "step": 17652 + }, + { + "epoch": 5.418354818907305, + "grad_norm": 0.2385055273771286, + "learning_rate": 4.5666146317396485e-05, + "loss": 1.7387, + "step": 17653 + }, + { + "epoch": 5.41866175567833, + "grad_norm": 0.22068475186824799, + "learning_rate": 4.566119448585918e-05, + "loss": 1.7116, + "step": 17654 + }, + { + "epoch": 5.418968692449355, + "grad_norm": 0.318375825881958, + "learning_rate": 4.5656242697200496e-05, + "loss": 1.7659, + "step": 17655 + }, + { + "epoch": 5.4192756292203805, + "grad_norm": 0.25311973690986633, + "learning_rate": 4.5651290951469366e-05, + "loss": 1.7814, + "step": 17656 + }, + { + "epoch": 5.419582565991406, + "grad_norm": 0.18701443076133728, + "learning_rate": 4.5646339248714735e-05, + "loss": 1.6993, 
+ "step": 17657 + }, + { + "epoch": 5.419889502762431, + "grad_norm": 0.2964496314525604, + "learning_rate": 4.5641387588985516e-05, + "loss": 1.8254, + "step": 17658 + }, + { + "epoch": 5.420196439533456, + "grad_norm": 0.19447220861911774, + "learning_rate": 4.563643597233067e-05, + "loss": 1.7208, + "step": 17659 + }, + { + "epoch": 5.420503376304481, + "grad_norm": 0.21666039526462555, + "learning_rate": 4.5631484398799105e-05, + "loss": 1.6695, + "step": 17660 + }, + { + "epoch": 5.420810313075506, + "grad_norm": 0.23104412853717804, + "learning_rate": 4.5626532868439796e-05, + "loss": 1.7449, + "step": 17661 + }, + { + "epoch": 5.421117249846532, + "grad_norm": 0.20463459193706512, + "learning_rate": 4.562158138130163e-05, + "loss": 1.6714, + "step": 17662 + }, + { + "epoch": 5.421424186617557, + "grad_norm": 0.21948079764842987, + "learning_rate": 4.561662993743359e-05, + "loss": 1.6957, + "step": 17663 + }, + { + "epoch": 5.421731123388582, + "grad_norm": 0.2672746777534485, + "learning_rate": 4.561167853688455e-05, + "loss": 1.7137, + "step": 17664 + }, + { + "epoch": 5.422038060159607, + "grad_norm": 0.2652325928211212, + "learning_rate": 4.5606727179703493e-05, + "loss": 1.7943, + "step": 17665 + }, + { + "epoch": 5.422344996930632, + "grad_norm": 0.17761313915252686, + "learning_rate": 4.560177586593933e-05, + "loss": 1.7072, + "step": 17666 + }, + { + "epoch": 5.422651933701657, + "grad_norm": 0.24759770929813385, + "learning_rate": 4.5596824595641e-05, + "loss": 1.7807, + "step": 17667 + }, + { + "epoch": 5.422958870472683, + "grad_norm": 0.22191929817199707, + "learning_rate": 4.5591873368857416e-05, + "loss": 1.7668, + "step": 17668 + }, + { + "epoch": 5.423265807243708, + "grad_norm": 0.21293842792510986, + "learning_rate": 4.5586922185637546e-05, + "loss": 1.7304, + "step": 17669 + }, + { + "epoch": 5.4235727440147325, + "grad_norm": 0.2646051049232483, + "learning_rate": 4.5581971046030277e-05, + "loss": 1.7258, + "step": 17670 + }, + { + 
"epoch": 5.423879680785758, + "grad_norm": 0.1894550621509552, + "learning_rate": 4.5577019950084574e-05, + "loss": 1.7066, + "step": 17671 + }, + { + "epoch": 5.424186617556783, + "grad_norm": 0.2533467710018158, + "learning_rate": 4.557206889784934e-05, + "loss": 1.7668, + "step": 17672 + }, + { + "epoch": 5.4244935543278086, + "grad_norm": 0.1972150355577469, + "learning_rate": 4.556711788937352e-05, + "loss": 1.7306, + "step": 17673 + }, + { + "epoch": 5.424800491098834, + "grad_norm": 0.2726735472679138, + "learning_rate": 4.5562166924706054e-05, + "loss": 1.7281, + "step": 17674 + }, + { + "epoch": 5.425107427869859, + "grad_norm": 0.2244454175233841, + "learning_rate": 4.555721600389584e-05, + "loss": 1.7461, + "step": 17675 + }, + { + "epoch": 5.425414364640884, + "grad_norm": 0.19486510753631592, + "learning_rate": 4.555226512699182e-05, + "loss": 1.7361, + "step": 17676 + }, + { + "epoch": 5.425721301411909, + "grad_norm": 0.18128283321857452, + "learning_rate": 4.554731429404293e-05, + "loss": 1.7637, + "step": 17677 + }, + { + "epoch": 5.426028238182934, + "grad_norm": 0.24709749221801758, + "learning_rate": 4.5542363505098084e-05, + "loss": 1.7928, + "step": 17678 + }, + { + "epoch": 5.42633517495396, + "grad_norm": 0.2236633151769638, + "learning_rate": 4.553741276020621e-05, + "loss": 1.8262, + "step": 17679 + }, + { + "epoch": 5.426642111724985, + "grad_norm": 0.2592087984085083, + "learning_rate": 4.553246205941626e-05, + "loss": 1.675, + "step": 17680 + }, + { + "epoch": 5.4269490484960095, + "grad_norm": 0.27751871943473816, + "learning_rate": 4.552751140277712e-05, + "loss": 1.7344, + "step": 17681 + }, + { + "epoch": 5.427255985267035, + "grad_norm": 0.23752287030220032, + "learning_rate": 4.5522560790337746e-05, + "loss": 1.7748, + "step": 17682 + }, + { + "epoch": 5.42756292203806, + "grad_norm": 0.3259925842285156, + "learning_rate": 4.5517610222147035e-05, + "loss": 1.7855, + "step": 17683 + }, + { + "epoch": 5.4278698588090855, + 
"grad_norm": 0.2579646706581116, + "learning_rate": 4.551265969825394e-05, + "loss": 1.7978, + "step": 17684 + }, + { + "epoch": 5.428176795580111, + "grad_norm": 0.3217744827270508, + "learning_rate": 4.550770921870735e-05, + "loss": 1.7793, + "step": 17685 + }, + { + "epoch": 5.428483732351136, + "grad_norm": 0.2930903434753418, + "learning_rate": 4.550275878355624e-05, + "loss": 1.7226, + "step": 17686 + }, + { + "epoch": 5.428790669122161, + "grad_norm": 0.1982879489660263, + "learning_rate": 4.549780839284948e-05, + "loss": 1.6841, + "step": 17687 + }, + { + "epoch": 5.429097605893186, + "grad_norm": 0.20843900740146637, + "learning_rate": 4.5492858046636046e-05, + "loss": 1.7201, + "step": 17688 + }, + { + "epoch": 5.429404542664211, + "grad_norm": 0.23116534948349, + "learning_rate": 4.5487907744964794e-05, + "loss": 1.7565, + "step": 17689 + }, + { + "epoch": 5.429711479435237, + "grad_norm": 0.19177772104740143, + "learning_rate": 4.548295748788471e-05, + "loss": 1.7479, + "step": 17690 + }, + { + "epoch": 5.430018416206262, + "grad_norm": 0.22261449694633484, + "learning_rate": 4.547800727544469e-05, + "loss": 1.7785, + "step": 17691 + }, + { + "epoch": 5.430325352977286, + "grad_norm": 0.20073406398296356, + "learning_rate": 4.547305710769363e-05, + "loss": 1.741, + "step": 17692 + }, + { + "epoch": 5.430632289748312, + "grad_norm": 0.21662208437919617, + "learning_rate": 4.546810698468049e-05, + "loss": 1.7269, + "step": 17693 + }, + { + "epoch": 5.430939226519337, + "grad_norm": 0.19540879130363464, + "learning_rate": 4.546315690645416e-05, + "loss": 1.7141, + "step": 17694 + }, + { + "epoch": 5.431246163290362, + "grad_norm": 0.20063656568527222, + "learning_rate": 4.545820687306358e-05, + "loss": 1.7244, + "step": 17695 + }, + { + "epoch": 5.431553100061388, + "grad_norm": 0.2172660082578659, + "learning_rate": 4.545325688455765e-05, + "loss": 1.7172, + "step": 17696 + }, + { + "epoch": 5.431860036832412, + "grad_norm": 0.2480388581752777, + 
"learning_rate": 4.5448306940985326e-05, + "loss": 1.6994, + "step": 17697 + }, + { + "epoch": 5.4321669736034375, + "grad_norm": 0.22499477863311768, + "learning_rate": 4.544335704239547e-05, + "loss": 1.7405, + "step": 17698 + }, + { + "epoch": 5.432473910374463, + "grad_norm": 0.20655590295791626, + "learning_rate": 4.5438407188837065e-05, + "loss": 1.6867, + "step": 17699 + }, + { + "epoch": 5.432780847145488, + "grad_norm": 0.2045906037092209, + "learning_rate": 4.543345738035896e-05, + "loss": 1.7752, + "step": 17700 + }, + { + "epoch": 5.4330877839165135, + "grad_norm": 0.2092052847146988, + "learning_rate": 4.542850761701013e-05, + "loss": 1.7389, + "step": 17701 + }, + { + "epoch": 5.433394720687538, + "grad_norm": 0.1943730264902115, + "learning_rate": 4.5423557898839446e-05, + "loss": 1.7276, + "step": 17702 + }, + { + "epoch": 5.433701657458563, + "grad_norm": 0.23487289249897003, + "learning_rate": 4.541860822589587e-05, + "loss": 1.8119, + "step": 17703 + }, + { + "epoch": 5.434008594229589, + "grad_norm": 0.204689159989357, + "learning_rate": 4.541365859822827e-05, + "loss": 1.7865, + "step": 17704 + }, + { + "epoch": 5.434315531000614, + "grad_norm": 0.20850931107997894, + "learning_rate": 4.5408709015885604e-05, + "loss": 1.7733, + "step": 17705 + }, + { + "epoch": 5.434622467771639, + "grad_norm": 0.18685877323150635, + "learning_rate": 4.540375947891675e-05, + "loss": 1.7526, + "step": 17706 + }, + { + "epoch": 5.434929404542665, + "grad_norm": 0.2009890079498291, + "learning_rate": 4.539880998737064e-05, + "loss": 1.6904, + "step": 17707 + }, + { + "epoch": 5.435236341313689, + "grad_norm": 0.16602718830108643, + "learning_rate": 4.5393860541296205e-05, + "loss": 1.689, + "step": 17708 + }, + { + "epoch": 5.435543278084714, + "grad_norm": 0.24318818747997284, + "learning_rate": 4.5388911140742315e-05, + "loss": 1.7993, + "step": 17709 + }, + { + "epoch": 5.43585021485574, + "grad_norm": 0.24094417691230774, + "learning_rate": 
4.538396178575793e-05, + "loss": 1.7235, + "step": 17710 + }, + { + "epoch": 5.436157151626765, + "grad_norm": 0.20361751317977905, + "learning_rate": 4.537901247639192e-05, + "loss": 1.7198, + "step": 17711 + }, + { + "epoch": 5.43646408839779, + "grad_norm": 0.2563718259334564, + "learning_rate": 4.537406321269323e-05, + "loss": 1.795, + "step": 17712 + }, + { + "epoch": 5.436771025168815, + "grad_norm": 0.29895591735839844, + "learning_rate": 4.536911399471075e-05, + "loss": 1.7515, + "step": 17713 + }, + { + "epoch": 5.43707796193984, + "grad_norm": 0.22535841166973114, + "learning_rate": 4.536416482249342e-05, + "loss": 1.6998, + "step": 17714 + }, + { + "epoch": 5.4373848987108655, + "grad_norm": 0.26025068759918213, + "learning_rate": 4.53592156960901e-05, + "loss": 1.7821, + "step": 17715 + }, + { + "epoch": 5.437691835481891, + "grad_norm": 0.3473168611526489, + "learning_rate": 4.535426661554975e-05, + "loss": 1.7035, + "step": 17716 + }, + { + "epoch": 5.437998772252916, + "grad_norm": 0.22207199037075043, + "learning_rate": 4.534931758092126e-05, + "loss": 1.7485, + "step": 17717 + }, + { + "epoch": 5.4383057090239415, + "grad_norm": 0.26839709281921387, + "learning_rate": 4.534436859225353e-05, + "loss": 1.7272, + "step": 17718 + }, + { + "epoch": 5.438612645794966, + "grad_norm": 0.37715891003608704, + "learning_rate": 4.5339419649595476e-05, + "loss": 1.7254, + "step": 17719 + }, + { + "epoch": 5.438919582565991, + "grad_norm": 0.21485768258571625, + "learning_rate": 4.533447075299603e-05, + "loss": 1.7349, + "step": 17720 + }, + { + "epoch": 5.439226519337017, + "grad_norm": 0.29502415657043457, + "learning_rate": 4.5329521902504055e-05, + "loss": 1.7325, + "step": 17721 + }, + { + "epoch": 5.439533456108042, + "grad_norm": 0.29448410868644714, + "learning_rate": 4.5324573098168505e-05, + "loss": 1.768, + "step": 17722 + }, + { + "epoch": 5.439840392879067, + "grad_norm": 0.1892058402299881, + "learning_rate": 4.5319624340038244e-05, + "loss": 
1.6866, + "step": 17723 + }, + { + "epoch": 5.440147329650092, + "grad_norm": 0.3365040123462677, + "learning_rate": 4.531467562816221e-05, + "loss": 1.7662, + "step": 17724 + }, + { + "epoch": 5.440454266421117, + "grad_norm": 0.2960789203643799, + "learning_rate": 4.53097269625893e-05, + "loss": 1.746, + "step": 17725 + }, + { + "epoch": 5.440761203192142, + "grad_norm": 0.21623700857162476, + "learning_rate": 4.530477834336841e-05, + "loss": 1.7619, + "step": 17726 + }, + { + "epoch": 5.441068139963168, + "grad_norm": 0.29010120034217834, + "learning_rate": 4.5299829770548456e-05, + "loss": 1.717, + "step": 17727 + }, + { + "epoch": 5.441375076734193, + "grad_norm": 0.18467605113983154, + "learning_rate": 4.529488124417833e-05, + "loss": 1.6938, + "step": 17728 + }, + { + "epoch": 5.4416820135052175, + "grad_norm": 0.2875411808490753, + "learning_rate": 4.528993276430695e-05, + "loss": 1.7633, + "step": 17729 + }, + { + "epoch": 5.441988950276243, + "grad_norm": 0.24252675473690033, + "learning_rate": 4.528498433098321e-05, + "loss": 1.6477, + "step": 17730 + }, + { + "epoch": 5.442295887047268, + "grad_norm": 0.18885886669158936, + "learning_rate": 4.5280035944256035e-05, + "loss": 1.7241, + "step": 17731 + }, + { + "epoch": 5.4426028238182935, + "grad_norm": 0.2594204246997833, + "learning_rate": 4.527508760417429e-05, + "loss": 1.6697, + "step": 17732 + }, + { + "epoch": 5.442909760589319, + "grad_norm": 0.23796287178993225, + "learning_rate": 4.527013931078692e-05, + "loss": 1.7035, + "step": 17733 + }, + { + "epoch": 5.443216697360343, + "grad_norm": 0.2591552436351776, + "learning_rate": 4.5265191064142787e-05, + "loss": 1.8014, + "step": 17734 + }, + { + "epoch": 5.443523634131369, + "grad_norm": 0.3316073417663574, + "learning_rate": 4.526024286429082e-05, + "loss": 1.752, + "step": 17735 + }, + { + "epoch": 5.443830570902394, + "grad_norm": 0.2409597635269165, + "learning_rate": 4.52552947112799e-05, + "loss": 1.7662, + "step": 17736 + }, + { + "epoch": 
5.444137507673419, + "grad_norm": 0.2896713614463806, + "learning_rate": 4.5250346605158964e-05, + "loss": 1.7168, + "step": 17737 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.30870527029037476, + "learning_rate": 4.524539854597686e-05, + "loss": 1.704, + "step": 17738 + }, + { + "epoch": 5.44475138121547, + "grad_norm": 0.2476932406425476, + "learning_rate": 4.524045053378254e-05, + "loss": 1.7649, + "step": 17739 + }, + { + "epoch": 5.445058317986494, + "grad_norm": 0.2937077283859253, + "learning_rate": 4.5235502568624855e-05, + "loss": 1.7028, + "step": 17740 + }, + { + "epoch": 5.44536525475752, + "grad_norm": 0.22881117463111877, + "learning_rate": 4.523055465055273e-05, + "loss": 1.7539, + "step": 17741 + }, + { + "epoch": 5.445672191528545, + "grad_norm": 0.2551842927932739, + "learning_rate": 4.522560677961508e-05, + "loss": 1.7601, + "step": 17742 + }, + { + "epoch": 5.44597912829957, + "grad_norm": 0.27533504366874695, + "learning_rate": 4.5220658955860754e-05, + "loss": 1.7695, + "step": 17743 + }, + { + "epoch": 5.446286065070596, + "grad_norm": 0.23387418687343597, + "learning_rate": 4.5215711179338706e-05, + "loss": 1.7218, + "step": 17744 + }, + { + "epoch": 5.44659300184162, + "grad_norm": 0.37932485342025757, + "learning_rate": 4.521076345009777e-05, + "loss": 1.7685, + "step": 17745 + }, + { + "epoch": 5.4468999386126455, + "grad_norm": 0.2668898105621338, + "learning_rate": 4.520581576818691e-05, + "loss": 1.7217, + "step": 17746 + }, + { + "epoch": 5.447206875383671, + "grad_norm": 0.2417856752872467, + "learning_rate": 4.520086813365496e-05, + "loss": 1.692, + "step": 17747 + }, + { + "epoch": 5.447513812154696, + "grad_norm": 0.3170008063316345, + "learning_rate": 4.519592054655086e-05, + "loss": 1.7565, + "step": 17748 + }, + { + "epoch": 5.4478207489257215, + "grad_norm": 0.20711660385131836, + "learning_rate": 4.519097300692348e-05, + "loss": 1.6708, + "step": 17749 + }, + { + "epoch": 5.448127685696747, + "grad_norm": 
0.2196272760629654, + "learning_rate": 4.5186025514821746e-05, + "loss": 1.7335, + "step": 17750 + }, + { + "epoch": 5.448434622467771, + "grad_norm": 0.27563074231147766, + "learning_rate": 4.5181078070294505e-05, + "loss": 1.7383, + "step": 17751 + }, + { + "epoch": 5.448741559238797, + "grad_norm": 0.185418501496315, + "learning_rate": 4.517613067339068e-05, + "loss": 1.6841, + "step": 17752 + }, + { + "epoch": 5.449048496009822, + "grad_norm": 0.26787856221199036, + "learning_rate": 4.517118332415915e-05, + "loss": 1.7733, + "step": 17753 + }, + { + "epoch": 5.449355432780847, + "grad_norm": 0.22114823758602142, + "learning_rate": 4.516623602264885e-05, + "loss": 1.7153, + "step": 17754 + }, + { + "epoch": 5.449662369551873, + "grad_norm": 0.23090483248233795, + "learning_rate": 4.51612887689086e-05, + "loss": 1.7063, + "step": 17755 + }, + { + "epoch": 5.449969306322897, + "grad_norm": 0.3227362632751465, + "learning_rate": 4.515634156298736e-05, + "loss": 1.7528, + "step": 17756 + }, + { + "epoch": 5.4502762430939224, + "grad_norm": 0.24202494323253632, + "learning_rate": 4.515139440493397e-05, + "loss": 1.8119, + "step": 17757 + }, + { + "epoch": 5.450583179864948, + "grad_norm": 0.3778383731842041, + "learning_rate": 4.5146447294797356e-05, + "loss": 1.7589, + "step": 17758 + }, + { + "epoch": 5.450890116635973, + "grad_norm": 0.3726772964000702, + "learning_rate": 4.51415002326264e-05, + "loss": 1.7095, + "step": 17759 + }, + { + "epoch": 5.4511970534069984, + "grad_norm": 0.2424323409795761, + "learning_rate": 4.5136553218469966e-05, + "loss": 1.7374, + "step": 17760 + }, + { + "epoch": 5.451503990178024, + "grad_norm": 0.4347550570964813, + "learning_rate": 4.513160625237699e-05, + "loss": 1.8339, + "step": 17761 + }, + { + "epoch": 5.451810926949048, + "grad_norm": 0.2556018829345703, + "learning_rate": 4.512665933439631e-05, + "loss": 1.7024, + "step": 17762 + }, + { + "epoch": 5.452117863720074, + "grad_norm": 0.36380240321159363, + "learning_rate": 
4.512171246457685e-05, + "loss": 1.7706, + "step": 17763 + }, + { + "epoch": 5.452424800491099, + "grad_norm": 0.42120790481567383, + "learning_rate": 4.5116765642967476e-05, + "loss": 1.7609, + "step": 17764 + }, + { + "epoch": 5.452731737262124, + "grad_norm": 0.20573028922080994, + "learning_rate": 4.51118188696171e-05, + "loss": 1.7521, + "step": 17765 + }, + { + "epoch": 5.45303867403315, + "grad_norm": 0.39001402258872986, + "learning_rate": 4.510687214457458e-05, + "loss": 1.7097, + "step": 17766 + }, + { + "epoch": 5.453345610804174, + "grad_norm": 0.2778739333152771, + "learning_rate": 4.510192546788884e-05, + "loss": 1.7677, + "step": 17767 + }, + { + "epoch": 5.453652547575199, + "grad_norm": 0.2500934600830078, + "learning_rate": 4.509697883960872e-05, + "loss": 1.7322, + "step": 17768 + }, + { + "epoch": 5.453959484346225, + "grad_norm": 0.23733557760715485, + "learning_rate": 4.509203225978314e-05, + "loss": 1.7426, + "step": 17769 + }, + { + "epoch": 5.45426642111725, + "grad_norm": 0.20033739507198334, + "learning_rate": 4.508708572846096e-05, + "loss": 1.7093, + "step": 17770 + }, + { + "epoch": 5.454573357888275, + "grad_norm": 0.202667698264122, + "learning_rate": 4.508213924569111e-05, + "loss": 1.6807, + "step": 17771 + }, + { + "epoch": 5.4548802946593, + "grad_norm": 0.1980566531419754, + "learning_rate": 4.507719281152241e-05, + "loss": 1.7102, + "step": 17772 + }, + { + "epoch": 5.455187231430325, + "grad_norm": 0.20612162351608276, + "learning_rate": 4.507224642600381e-05, + "loss": 1.7692, + "step": 17773 + }, + { + "epoch": 5.4554941682013505, + "grad_norm": 0.22859175503253937, + "learning_rate": 4.506730008918412e-05, + "loss": 1.7887, + "step": 17774 + }, + { + "epoch": 5.455801104972376, + "grad_norm": 0.19720709323883057, + "learning_rate": 4.5062353801112285e-05, + "loss": 1.7557, + "step": 17775 + }, + { + "epoch": 5.456108041743401, + "grad_norm": 0.23289217054843903, + "learning_rate": 4.505740756183717e-05, + "loss": 1.7023, + 
"step": 17776 + }, + { + "epoch": 5.456414978514426, + "grad_norm": 0.2120361477136612, + "learning_rate": 4.505246137140763e-05, + "loss": 1.7249, + "step": 17777 + }, + { + "epoch": 5.456721915285451, + "grad_norm": 0.2094341218471527, + "learning_rate": 4.504751522987259e-05, + "loss": 1.7586, + "step": 17778 + }, + { + "epoch": 5.457028852056476, + "grad_norm": 0.22361092269420624, + "learning_rate": 4.504256913728088e-05, + "loss": 1.737, + "step": 17779 + }, + { + "epoch": 5.457335788827502, + "grad_norm": 0.2100353240966797, + "learning_rate": 4.5037623093681424e-05, + "loss": 1.704, + "step": 17780 + }, + { + "epoch": 5.457642725598527, + "grad_norm": 0.20550231635570526, + "learning_rate": 4.503267709912308e-05, + "loss": 1.7732, + "step": 17781 + }, + { + "epoch": 5.457949662369552, + "grad_norm": 0.22843749821186066, + "learning_rate": 4.502773115365474e-05, + "loss": 1.6916, + "step": 17782 + }, + { + "epoch": 5.458256599140577, + "grad_norm": 0.2351907640695572, + "learning_rate": 4.502278525732526e-05, + "loss": 1.8043, + "step": 17783 + }, + { + "epoch": 5.458563535911602, + "grad_norm": 0.271028071641922, + "learning_rate": 4.501783941018355e-05, + "loss": 1.7665, + "step": 17784 + }, + { + "epoch": 5.458870472682627, + "grad_norm": 0.1974802166223526, + "learning_rate": 4.501289361227846e-05, + "loss": 1.718, + "step": 17785 + }, + { + "epoch": 5.459177409453653, + "grad_norm": 0.23726068437099457, + "learning_rate": 4.5007947863658884e-05, + "loss": 1.7507, + "step": 17786 + }, + { + "epoch": 5.459484346224678, + "grad_norm": 0.2112259715795517, + "learning_rate": 4.5003002164373684e-05, + "loss": 1.8116, + "step": 17787 + }, + { + "epoch": 5.4597912829957025, + "grad_norm": 0.2676105201244354, + "learning_rate": 4.4998056514471764e-05, + "loss": 1.7013, + "step": 17788 + }, + { + "epoch": 5.460098219766728, + "grad_norm": 0.2735576033592224, + "learning_rate": 4.4993110914001956e-05, + "loss": 1.7516, + "step": 17789 + }, + { + "epoch": 
5.460405156537753, + "grad_norm": 0.1925152987241745, + "learning_rate": 4.498816536301319e-05, + "loss": 1.7018, + "step": 17790 + }, + { + "epoch": 5.4607120933087785, + "grad_norm": 0.25037717819213867, + "learning_rate": 4.498321986155429e-05, + "loss": 1.7207, + "step": 17791 + }, + { + "epoch": 5.461019030079804, + "grad_norm": 0.20481008291244507, + "learning_rate": 4.497827440967415e-05, + "loss": 1.6988, + "step": 17792 + }, + { + "epoch": 5.461325966850829, + "grad_norm": 0.19434049725532532, + "learning_rate": 4.4973329007421673e-05, + "loss": 1.7363, + "step": 17793 + }, + { + "epoch": 5.461632903621854, + "grad_norm": 0.21797434985637665, + "learning_rate": 4.496838365484567e-05, + "loss": 1.7218, + "step": 17794 + }, + { + "epoch": 5.461939840392879, + "grad_norm": 0.18477453291416168, + "learning_rate": 4.496343835199508e-05, + "loss": 1.7204, + "step": 17795 + }, + { + "epoch": 5.462246777163904, + "grad_norm": 0.21657803654670715, + "learning_rate": 4.495849309891872e-05, + "loss": 1.7671, + "step": 17796 + }, + { + "epoch": 5.46255371393493, + "grad_norm": 0.21027342975139618, + "learning_rate": 4.495354789566549e-05, + "loss": 1.7424, + "step": 17797 + }, + { + "epoch": 5.462860650705955, + "grad_norm": 0.2016189992427826, + "learning_rate": 4.4948602742284256e-05, + "loss": 1.7706, + "step": 17798 + }, + { + "epoch": 5.463167587476979, + "grad_norm": 0.2155935913324356, + "learning_rate": 4.494365763882391e-05, + "loss": 1.7314, + "step": 17799 + }, + { + "epoch": 5.463474524248005, + "grad_norm": 0.22079701721668243, + "learning_rate": 4.493871258533328e-05, + "loss": 1.7938, + "step": 17800 + }, + { + "epoch": 5.46378146101903, + "grad_norm": 0.1907699704170227, + "learning_rate": 4.4933767581861283e-05, + "loss": 1.6958, + "step": 17801 + }, + { + "epoch": 5.464088397790055, + "grad_norm": 0.2784879207611084, + "learning_rate": 4.4928822628456735e-05, + "loss": 1.7285, + "step": 17802 + }, + { + "epoch": 5.464395334561081, + "grad_norm": 
0.29470255970954895, + "learning_rate": 4.492387772516855e-05, + "loss": 1.7363, + "step": 17803 + }, + { + "epoch": 5.464702271332105, + "grad_norm": 0.21387436985969543, + "learning_rate": 4.4918932872045575e-05, + "loss": 1.7414, + "step": 17804 + }, + { + "epoch": 5.4650092081031305, + "grad_norm": 0.3102552890777588, + "learning_rate": 4.49139880691367e-05, + "loss": 1.7359, + "step": 17805 + }, + { + "epoch": 5.465316144874156, + "grad_norm": 0.2312939465045929, + "learning_rate": 4.490904331649075e-05, + "loss": 1.7609, + "step": 17806 + }, + { + "epoch": 5.465623081645181, + "grad_norm": 0.323913037776947, + "learning_rate": 4.4904098614156645e-05, + "loss": 1.7693, + "step": 17807 + }, + { + "epoch": 5.4659300184162065, + "grad_norm": 0.2975599467754364, + "learning_rate": 4.48991539621832e-05, + "loss": 1.7506, + "step": 17808 + }, + { + "epoch": 5.466236955187231, + "grad_norm": 0.24702571332454681, + "learning_rate": 4.4894209360619316e-05, + "loss": 1.8258, + "step": 17809 + }, + { + "epoch": 5.466543891958256, + "grad_norm": 0.29016581177711487, + "learning_rate": 4.488926480951386e-05, + "loss": 1.7096, + "step": 17810 + }, + { + "epoch": 5.466850828729282, + "grad_norm": 0.2194555252790451, + "learning_rate": 4.488432030891566e-05, + "loss": 1.788, + "step": 17811 + }, + { + "epoch": 5.467157765500307, + "grad_norm": 0.2504041790962219, + "learning_rate": 4.487937585887363e-05, + "loss": 1.7672, + "step": 17812 + }, + { + "epoch": 5.467464702271332, + "grad_norm": 0.2362445741891861, + "learning_rate": 4.487443145943659e-05, + "loss": 1.7426, + "step": 17813 + }, + { + "epoch": 5.467771639042358, + "grad_norm": 0.20075896382331848, + "learning_rate": 4.486948711065343e-05, + "loss": 1.7406, + "step": 17814 + }, + { + "epoch": 5.468078575813382, + "grad_norm": 0.2219153791666031, + "learning_rate": 4.486454281257299e-05, + "loss": 1.683, + "step": 17815 + }, + { + "epoch": 5.468385512584407, + "grad_norm": 0.22551953792572021, + "learning_rate": 
4.4859598565244176e-05, + "loss": 1.7896, + "step": 17816 + }, + { + "epoch": 5.468692449355433, + "grad_norm": 0.2385476976633072, + "learning_rate": 4.48546543687158e-05, + "loss": 1.7799, + "step": 17817 + }, + { + "epoch": 5.468999386126458, + "grad_norm": 0.24263370037078857, + "learning_rate": 4.4849710223036764e-05, + "loss": 1.682, + "step": 17818 + }, + { + "epoch": 5.469306322897483, + "grad_norm": 0.24301160871982574, + "learning_rate": 4.484476612825589e-05, + "loss": 1.8121, + "step": 17819 + }, + { + "epoch": 5.469613259668508, + "grad_norm": 0.2516932487487793, + "learning_rate": 4.483982208442207e-05, + "loss": 1.7344, + "step": 17820 + }, + { + "epoch": 5.469920196439533, + "grad_norm": 0.24309395253658295, + "learning_rate": 4.4834878091584156e-05, + "loss": 1.7746, + "step": 17821 + }, + { + "epoch": 5.4702271332105585, + "grad_norm": 0.24711866676807404, + "learning_rate": 4.4829934149790996e-05, + "loss": 1.7887, + "step": 17822 + }, + { + "epoch": 5.470534069981584, + "grad_norm": 0.2923797369003296, + "learning_rate": 4.4824990259091445e-05, + "loss": 1.7017, + "step": 17823 + }, + { + "epoch": 5.470841006752609, + "grad_norm": 0.21658629179000854, + "learning_rate": 4.482004641953441e-05, + "loss": 1.725, + "step": 17824 + }, + { + "epoch": 5.4711479435236345, + "grad_norm": 0.233424574136734, + "learning_rate": 4.481510263116868e-05, + "loss": 1.74, + "step": 17825 + }, + { + "epoch": 5.471454880294659, + "grad_norm": 0.28997600078582764, + "learning_rate": 4.481015889404315e-05, + "loss": 1.8418, + "step": 17826 + }, + { + "epoch": 5.471761817065684, + "grad_norm": 0.2245558649301529, + "learning_rate": 4.480521520820669e-05, + "loss": 1.7519, + "step": 17827 + }, + { + "epoch": 5.47206875383671, + "grad_norm": 0.21008887887001038, + "learning_rate": 4.480027157370812e-05, + "loss": 1.6977, + "step": 17828 + }, + { + "epoch": 5.472375690607735, + "grad_norm": 0.1990261971950531, + "learning_rate": 4.479532799059633e-05, + "loss": 1.7004, + 
"step": 17829 + }, + { + "epoch": 5.47268262737876, + "grad_norm": 0.2354540079832077, + "learning_rate": 4.479038445892014e-05, + "loss": 1.7755, + "step": 17830 + }, + { + "epoch": 5.472989564149785, + "grad_norm": 0.21904973685741425, + "learning_rate": 4.478544097872843e-05, + "loss": 1.8328, + "step": 17831 + }, + { + "epoch": 5.47329650092081, + "grad_norm": 0.21188503503799438, + "learning_rate": 4.4780497550070055e-05, + "loss": 1.7105, + "step": 17832 + }, + { + "epoch": 5.473603437691835, + "grad_norm": 0.2196870595216751, + "learning_rate": 4.477555417299386e-05, + "loss": 1.7261, + "step": 17833 + }, + { + "epoch": 5.473910374462861, + "grad_norm": 0.24522331357002258, + "learning_rate": 4.477061084754869e-05, + "loss": 1.8101, + "step": 17834 + }, + { + "epoch": 5.474217311233886, + "grad_norm": 0.24073927104473114, + "learning_rate": 4.476566757378343e-05, + "loss": 1.8295, + "step": 17835 + }, + { + "epoch": 5.474524248004911, + "grad_norm": 0.3724605143070221, + "learning_rate": 4.476072435174689e-05, + "loss": 1.7785, + "step": 17836 + }, + { + "epoch": 5.474831184775936, + "grad_norm": 0.25552257895469666, + "learning_rate": 4.475578118148797e-05, + "loss": 1.6978, + "step": 17837 + }, + { + "epoch": 5.475138121546961, + "grad_norm": 0.22402255237102509, + "learning_rate": 4.475083806305546e-05, + "loss": 1.697, + "step": 17838 + }, + { + "epoch": 5.475445058317987, + "grad_norm": 0.25869324803352356, + "learning_rate": 4.474589499649826e-05, + "loss": 1.7026, + "step": 17839 + }, + { + "epoch": 5.475751995089012, + "grad_norm": 0.249742329120636, + "learning_rate": 4.47409519818652e-05, + "loss": 1.7738, + "step": 17840 + }, + { + "epoch": 5.476058931860037, + "grad_norm": 0.28722140192985535, + "learning_rate": 4.473600901920515e-05, + "loss": 1.7555, + "step": 17841 + }, + { + "epoch": 5.476365868631062, + "grad_norm": 0.250964879989624, + "learning_rate": 4.4731066108566926e-05, + "loss": 1.6951, + "step": 17842 + }, + { + "epoch": 
5.476672805402087, + "grad_norm": 0.20562006533145905, + "learning_rate": 4.472612324999942e-05, + "loss": 1.7109, + "step": 17843 + }, + { + "epoch": 5.476979742173112, + "grad_norm": 0.26964858174324036, + "learning_rate": 4.472118044355144e-05, + "loss": 1.7468, + "step": 17844 + }, + { + "epoch": 5.477286678944138, + "grad_norm": 0.25700438022613525, + "learning_rate": 4.471623768927184e-05, + "loss": 1.7046, + "step": 17845 + }, + { + "epoch": 5.477593615715163, + "grad_norm": 0.2152809500694275, + "learning_rate": 4.47112949872095e-05, + "loss": 1.7464, + "step": 17846 + }, + { + "epoch": 5.4779005524861875, + "grad_norm": 0.26429688930511475, + "learning_rate": 4.470635233741321e-05, + "loss": 1.7629, + "step": 17847 + }, + { + "epoch": 5.478207489257213, + "grad_norm": 0.18546637892723083, + "learning_rate": 4.470140973993188e-05, + "loss": 1.7143, + "step": 17848 + }, + { + "epoch": 5.478514426028238, + "grad_norm": 0.1927761435508728, + "learning_rate": 4.46964671948143e-05, + "loss": 1.6919, + "step": 17849 + }, + { + "epoch": 5.4788213627992635, + "grad_norm": 0.21581199765205383, + "learning_rate": 4.469152470210935e-05, + "loss": 1.7596, + "step": 17850 + }, + { + "epoch": 5.479128299570289, + "grad_norm": 0.20244133472442627, + "learning_rate": 4.468658226186586e-05, + "loss": 1.7372, + "step": 17851 + }, + { + "epoch": 5.479435236341313, + "grad_norm": 0.2467198520898819, + "learning_rate": 4.468163987413269e-05, + "loss": 1.7361, + "step": 17852 + }, + { + "epoch": 5.479742173112339, + "grad_norm": 0.22134411334991455, + "learning_rate": 4.467669753895866e-05, + "loss": 1.7276, + "step": 17853 + }, + { + "epoch": 5.480049109883364, + "grad_norm": 0.1953750103712082, + "learning_rate": 4.4671755256392636e-05, + "loss": 1.6931, + "step": 17854 + }, + { + "epoch": 5.480356046654389, + "grad_norm": 0.21492068469524384, + "learning_rate": 4.466681302648343e-05, + "loss": 1.7437, + "step": 17855 + }, + { + "epoch": 5.480662983425415, + "grad_norm": 
0.24377848207950592, + "learning_rate": 4.466187084927993e-05, + "loss": 1.7869, + "step": 17856 + }, + { + "epoch": 5.48096992019644, + "grad_norm": 0.23674219846725464, + "learning_rate": 4.465692872483093e-05, + "loss": 1.8142, + "step": 17857 + }, + { + "epoch": 5.481276856967464, + "grad_norm": 0.25036486983299255, + "learning_rate": 4.4651986653185304e-05, + "loss": 1.8075, + "step": 17858 + }, + { + "epoch": 5.48158379373849, + "grad_norm": 0.32649150490760803, + "learning_rate": 4.4647044634391867e-05, + "loss": 1.7177, + "step": 17859 + }, + { + "epoch": 5.481890730509515, + "grad_norm": 0.20300604403018951, + "learning_rate": 4.46421026684995e-05, + "loss": 1.6912, + "step": 17860 + }, + { + "epoch": 5.48219766728054, + "grad_norm": 0.24630679190158844, + "learning_rate": 4.4637160755557e-05, + "loss": 1.8312, + "step": 17861 + }, + { + "epoch": 5.482504604051566, + "grad_norm": 0.2263093739748001, + "learning_rate": 4.46322188956132e-05, + "loss": 1.7214, + "step": 17862 + }, + { + "epoch": 5.48281154082259, + "grad_norm": 0.22949177026748657, + "learning_rate": 4.462727708871699e-05, + "loss": 1.6882, + "step": 17863 + }, + { + "epoch": 5.4831184775936155, + "grad_norm": 0.23389381170272827, + "learning_rate": 4.4622335334917156e-05, + "loss": 1.7613, + "step": 17864 + }, + { + "epoch": 5.483425414364641, + "grad_norm": 0.2259683907032013, + "learning_rate": 4.461739363426257e-05, + "loss": 1.7021, + "step": 17865 + }, + { + "epoch": 5.483732351135666, + "grad_norm": 0.3213486969470978, + "learning_rate": 4.4612451986802036e-05, + "loss": 1.7469, + "step": 17866 + }, + { + "epoch": 5.4840392879066915, + "grad_norm": 0.3415670096874237, + "learning_rate": 4.4607510392584426e-05, + "loss": 1.7605, + "step": 17867 + }, + { + "epoch": 5.484346224677717, + "grad_norm": 0.2079494297504425, + "learning_rate": 4.460256885165855e-05, + "loss": 1.7832, + "step": 17868 + }, + { + "epoch": 5.484653161448741, + "grad_norm": 0.30334988236427307, + "learning_rate": 
4.459762736407327e-05, + "loss": 1.6825, + "step": 17869 + }, + { + "epoch": 5.484960098219767, + "grad_norm": 0.22320730984210968, + "learning_rate": 4.4592685929877374e-05, + "loss": 1.7452, + "step": 17870 + }, + { + "epoch": 5.485267034990792, + "grad_norm": 0.25325682759284973, + "learning_rate": 4.458774454911975e-05, + "loss": 1.7359, + "step": 17871 + }, + { + "epoch": 5.485573971761817, + "grad_norm": 0.305501788854599, + "learning_rate": 4.458280322184919e-05, + "loss": 1.7161, + "step": 17872 + }, + { + "epoch": 5.485880908532843, + "grad_norm": 0.19486182928085327, + "learning_rate": 4.457786194811455e-05, + "loss": 1.7097, + "step": 17873 + }, + { + "epoch": 5.486187845303867, + "grad_norm": 0.3306363821029663, + "learning_rate": 4.457292072796465e-05, + "loss": 1.7653, + "step": 17874 + }, + { + "epoch": 5.486494782074892, + "grad_norm": 0.25172874331474304, + "learning_rate": 4.456797956144835e-05, + "loss": 1.7289, + "step": 17875 + }, + { + "epoch": 5.486801718845918, + "grad_norm": 0.24508661031723022, + "learning_rate": 4.456303844861444e-05, + "loss": 1.7255, + "step": 17876 + }, + { + "epoch": 5.487108655616943, + "grad_norm": 0.3043360114097595, + "learning_rate": 4.455809738951178e-05, + "loss": 1.7852, + "step": 17877 + }, + { + "epoch": 5.487415592387968, + "grad_norm": 0.22181758284568787, + "learning_rate": 4.4553156384189186e-05, + "loss": 1.7887, + "step": 17878 + }, + { + "epoch": 5.487722529158993, + "grad_norm": 0.2174321413040161, + "learning_rate": 4.454821543269549e-05, + "loss": 1.7024, + "step": 17879 + }, + { + "epoch": 5.488029465930018, + "grad_norm": 0.19634750485420227, + "learning_rate": 4.4543274535079535e-05, + "loss": 1.7451, + "step": 17880 + }, + { + "epoch": 5.4883364027010435, + "grad_norm": 0.20481908321380615, + "learning_rate": 4.4538333691390125e-05, + "loss": 1.7068, + "step": 17881 + }, + { + "epoch": 5.488643339472069, + "grad_norm": 0.2025458663702011, + "learning_rate": 4.453339290167612e-05, + "loss": 
1.72, + "step": 17882 + }, + { + "epoch": 5.488950276243094, + "grad_norm": 0.21013019979000092, + "learning_rate": 4.452845216598632e-05, + "loss": 1.7113, + "step": 17883 + }, + { + "epoch": 5.4892572130141195, + "grad_norm": 0.2057499885559082, + "learning_rate": 4.452351148436956e-05, + "loss": 1.7007, + "step": 17884 + }, + { + "epoch": 5.489564149785144, + "grad_norm": 0.19957664608955383, + "learning_rate": 4.4518570856874666e-05, + "loss": 1.6999, + "step": 17885 + }, + { + "epoch": 5.489871086556169, + "grad_norm": 0.22609412670135498, + "learning_rate": 4.451363028355048e-05, + "loss": 1.8124, + "step": 17886 + }, + { + "epoch": 5.490178023327195, + "grad_norm": 0.27350863814353943, + "learning_rate": 4.4508689764445805e-05, + "loss": 1.8042, + "step": 17887 + }, + { + "epoch": 5.49048496009822, + "grad_norm": 0.23416854441165924, + "learning_rate": 4.450374929960949e-05, + "loss": 1.7607, + "step": 17888 + }, + { + "epoch": 5.490791896869245, + "grad_norm": 0.2891421318054199, + "learning_rate": 4.449880888909033e-05, + "loss": 1.7419, + "step": 17889 + }, + { + "epoch": 5.49109883364027, + "grad_norm": 0.2458745837211609, + "learning_rate": 4.449386853293717e-05, + "loss": 1.7234, + "step": 17890 + }, + { + "epoch": 5.491405770411295, + "grad_norm": 0.23390449583530426, + "learning_rate": 4.4488928231198826e-05, + "loss": 1.7482, + "step": 17891 + }, + { + "epoch": 5.49171270718232, + "grad_norm": 0.3509657084941864, + "learning_rate": 4.448398798392414e-05, + "loss": 1.7639, + "step": 17892 + }, + { + "epoch": 5.492019643953346, + "grad_norm": 0.2487955242395401, + "learning_rate": 4.4479047791161916e-05, + "loss": 1.7163, + "step": 17893 + }, + { + "epoch": 5.492326580724371, + "grad_norm": 0.22630274295806885, + "learning_rate": 4.4474107652960956e-05, + "loss": 1.7449, + "step": 17894 + }, + { + "epoch": 5.4926335174953955, + "grad_norm": 0.25909537076950073, + "learning_rate": 4.446916756937012e-05, + "loss": 1.7396, + "step": 17895 + }, + { + 
"epoch": 5.492940454266421, + "grad_norm": 0.29732683300971985, + "learning_rate": 4.446422754043819e-05, + "loss": 1.8109, + "step": 17896 + }, + { + "epoch": 5.493247391037446, + "grad_norm": 0.22436772286891937, + "learning_rate": 4.4459287566214035e-05, + "loss": 1.7657, + "step": 17897 + }, + { + "epoch": 5.4935543278084715, + "grad_norm": 0.24584892392158508, + "learning_rate": 4.445434764674643e-05, + "loss": 1.73, + "step": 17898 + }, + { + "epoch": 5.493861264579497, + "grad_norm": 0.27446454763412476, + "learning_rate": 4.444940778208423e-05, + "loss": 1.7428, + "step": 17899 + }, + { + "epoch": 5.494168201350522, + "grad_norm": 0.20442110300064087, + "learning_rate": 4.4444467972276215e-05, + "loss": 1.6911, + "step": 17900 + }, + { + "epoch": 5.494475138121547, + "grad_norm": 0.23089268803596497, + "learning_rate": 4.4439528217371236e-05, + "loss": 1.7192, + "step": 17901 + }, + { + "epoch": 5.494782074892572, + "grad_norm": 0.19402450323104858, + "learning_rate": 4.443458851741808e-05, + "loss": 1.7304, + "step": 17902 + }, + { + "epoch": 5.495089011663597, + "grad_norm": 0.2310219705104828, + "learning_rate": 4.442964887246561e-05, + "loss": 1.6963, + "step": 17903 + }, + { + "epoch": 5.495395948434623, + "grad_norm": 0.25573140382766724, + "learning_rate": 4.44247092825626e-05, + "loss": 1.7781, + "step": 17904 + }, + { + "epoch": 5.495702885205648, + "grad_norm": 0.20298753678798676, + "learning_rate": 4.4419769747757894e-05, + "loss": 1.763, + "step": 17905 + }, + { + "epoch": 5.496009821976672, + "grad_norm": 0.22243307530879974, + "learning_rate": 4.441483026810027e-05, + "loss": 1.7345, + "step": 17906 + }, + { + "epoch": 5.496316758747698, + "grad_norm": 0.19801411032676697, + "learning_rate": 4.4409890843638584e-05, + "loss": 1.7504, + "step": 17907 + }, + { + "epoch": 5.496623695518723, + "grad_norm": 0.2804374396800995, + "learning_rate": 4.440495147442162e-05, + "loss": 1.7985, + "step": 17908 + }, + { + "epoch": 5.496930632289748, + 
"grad_norm": 0.21824021637439728, + "learning_rate": 4.440001216049822e-05, + "loss": 1.6703, + "step": 17909 + }, + { + "epoch": 5.497237569060774, + "grad_norm": 0.23335935175418854, + "learning_rate": 4.439507290191719e-05, + "loss": 1.7426, + "step": 17910 + }, + { + "epoch": 5.497544505831799, + "grad_norm": 0.2093769609928131, + "learning_rate": 4.4390133698727315e-05, + "loss": 1.7178, + "step": 17911 + }, + { + "epoch": 5.4978514426028235, + "grad_norm": 0.18354324996471405, + "learning_rate": 4.438519455097743e-05, + "loss": 1.6849, + "step": 17912 + }, + { + "epoch": 5.498158379373849, + "grad_norm": 0.26826491951942444, + "learning_rate": 4.438025545871633e-05, + "loss": 1.7804, + "step": 17913 + }, + { + "epoch": 5.498465316144874, + "grad_norm": 0.29171738028526306, + "learning_rate": 4.437531642199288e-05, + "loss": 1.764, + "step": 17914 + }, + { + "epoch": 5.4987722529158995, + "grad_norm": 0.17870590090751648, + "learning_rate": 4.437037744085581e-05, + "loss": 1.6789, + "step": 17915 + }, + { + "epoch": 5.499079189686925, + "grad_norm": 0.25412192940711975, + "learning_rate": 4.4365438515354e-05, + "loss": 1.7536, + "step": 17916 + }, + { + "epoch": 5.499386126457949, + "grad_norm": 0.24465163052082062, + "learning_rate": 4.4360499645536203e-05, + "loss": 1.7582, + "step": 17917 + }, + { + "epoch": 5.499693063228975, + "grad_norm": 0.21248452365398407, + "learning_rate": 4.4355560831451264e-05, + "loss": 1.7209, + "step": 17918 + }, + { + "epoch": 5.5, + "grad_norm": 0.21018685400485992, + "learning_rate": 4.435062207314797e-05, + "loss": 1.7461, + "step": 17919 + }, + { + "epoch": 5.500306936771025, + "grad_norm": 0.1880551278591156, + "learning_rate": 4.434568337067517e-05, + "loss": 1.6818, + "step": 17920 + }, + { + "epoch": 5.500613873542051, + "grad_norm": 0.2224894016981125, + "learning_rate": 4.434074472408161e-05, + "loss": 1.8211, + "step": 17921 + }, + { + "epoch": 5.500920810313076, + "grad_norm": 0.19419749081134796, + 
"learning_rate": 4.433580613341615e-05, + "loss": 1.7625, + "step": 17922 + }, + { + "epoch": 5.5012277470841005, + "grad_norm": 0.2167430967092514, + "learning_rate": 4.433086759872756e-05, + "loss": 1.745, + "step": 17923 + }, + { + "epoch": 5.501534683855126, + "grad_norm": 0.1926383525133133, + "learning_rate": 4.4325929120064665e-05, + "loss": 1.7353, + "step": 17924 + }, + { + "epoch": 5.501841620626151, + "grad_norm": 0.22943224012851715, + "learning_rate": 4.432099069747625e-05, + "loss": 1.6903, + "step": 17925 + }, + { + "epoch": 5.5021485573971765, + "grad_norm": 0.18218693137168884, + "learning_rate": 4.431605233101116e-05, + "loss": 1.742, + "step": 17926 + }, + { + "epoch": 5.502455494168201, + "grad_norm": 0.2660788893699646, + "learning_rate": 4.431111402071817e-05, + "loss": 1.7208, + "step": 17927 + }, + { + "epoch": 5.502762430939226, + "grad_norm": 0.20015788078308105, + "learning_rate": 4.430617576664606e-05, + "loss": 1.721, + "step": 17928 + }, + { + "epoch": 5.503069367710252, + "grad_norm": 0.20011179149150848, + "learning_rate": 4.430123756884368e-05, + "loss": 1.7488, + "step": 17929 + }, + { + "epoch": 5.503376304481277, + "grad_norm": 0.22541452944278717, + "learning_rate": 4.429629942735979e-05, + "loss": 1.7997, + "step": 17930 + }, + { + "epoch": 5.503683241252302, + "grad_norm": 0.21067193150520325, + "learning_rate": 4.4291361342243236e-05, + "loss": 1.6652, + "step": 17931 + }, + { + "epoch": 5.503990178023328, + "grad_norm": 0.38401395082473755, + "learning_rate": 4.428642331354278e-05, + "loss": 1.815, + "step": 17932 + }, + { + "epoch": 5.504297114794352, + "grad_norm": 0.22600100934505463, + "learning_rate": 4.428148534130725e-05, + "loss": 1.7593, + "step": 17933 + }, + { + "epoch": 5.504604051565377, + "grad_norm": 0.21340666711330414, + "learning_rate": 4.427654742558542e-05, + "loss": 1.7447, + "step": 17934 + }, + { + "epoch": 5.504910988336403, + "grad_norm": 0.20676501095294952, + "learning_rate": 4.427160956642611e-05, 
+ "loss": 1.7174, + "step": 17935 + }, + { + "epoch": 5.505217925107428, + "grad_norm": 0.2374252825975418, + "learning_rate": 4.42666717638781e-05, + "loss": 1.703, + "step": 17936 + }, + { + "epoch": 5.505524861878453, + "grad_norm": 0.20975756645202637, + "learning_rate": 4.426173401799022e-05, + "loss": 1.7076, + "step": 17937 + }, + { + "epoch": 5.505831798649478, + "grad_norm": 0.23778517544269562, + "learning_rate": 4.4256796328811226e-05, + "loss": 1.7647, + "step": 17938 + }, + { + "epoch": 5.506138735420503, + "grad_norm": 0.2088557481765747, + "learning_rate": 4.425185869638996e-05, + "loss": 1.764, + "step": 17939 + }, + { + "epoch": 5.5064456721915285, + "grad_norm": 0.26953455805778503, + "learning_rate": 4.424692112077518e-05, + "loss": 1.7351, + "step": 17940 + }, + { + "epoch": 5.506752608962554, + "grad_norm": 0.2762589454650879, + "learning_rate": 4.42419836020157e-05, + "loss": 1.7051, + "step": 17941 + }, + { + "epoch": 5.507059545733579, + "grad_norm": 0.19611702859401703, + "learning_rate": 4.4237046140160306e-05, + "loss": 1.7445, + "step": 17942 + }, + { + "epoch": 5.5073664825046045, + "grad_norm": 0.2708270251750946, + "learning_rate": 4.4232108735257824e-05, + "loss": 1.7284, + "step": 17943 + }, + { + "epoch": 5.507673419275629, + "grad_norm": 0.24194146692752838, + "learning_rate": 4.422717138735701e-05, + "loss": 1.7302, + "step": 17944 + }, + { + "epoch": 5.507980356046654, + "grad_norm": 0.21558286249637604, + "learning_rate": 4.422223409650666e-05, + "loss": 1.7435, + "step": 17945 + }, + { + "epoch": 5.50828729281768, + "grad_norm": 0.1842707246541977, + "learning_rate": 4.4217296862755597e-05, + "loss": 1.6579, + "step": 17946 + }, + { + "epoch": 5.508594229588705, + "grad_norm": 0.20211941003799438, + "learning_rate": 4.4212359686152576e-05, + "loss": 1.8017, + "step": 17947 + }, + { + "epoch": 5.50890116635973, + "grad_norm": 0.23749016225337982, + "learning_rate": 4.420742256674644e-05, + "loss": 1.6721, + "step": 17948 + }, + 
{ + "epoch": 5.509208103130755, + "grad_norm": 0.2076852172613144, + "learning_rate": 4.420248550458592e-05, + "loss": 1.7102, + "step": 17949 + }, + { + "epoch": 5.50951503990178, + "grad_norm": 0.2599447965621948, + "learning_rate": 4.419754849971986e-05, + "loss": 1.7819, + "step": 17950 + }, + { + "epoch": 5.509821976672805, + "grad_norm": 0.2017187476158142, + "learning_rate": 4.4192611552197e-05, + "loss": 1.6812, + "step": 17951 + }, + { + "epoch": 5.510128913443831, + "grad_norm": 0.21972116827964783, + "learning_rate": 4.418767466206617e-05, + "loss": 1.7122, + "step": 17952 + }, + { + "epoch": 5.510435850214856, + "grad_norm": 0.21750569343566895, + "learning_rate": 4.418273782937613e-05, + "loss": 1.7285, + "step": 17953 + }, + { + "epoch": 5.510742786985881, + "grad_norm": 0.19349125027656555, + "learning_rate": 4.417780105417572e-05, + "loss": 1.7383, + "step": 17954 + }, + { + "epoch": 5.511049723756906, + "grad_norm": 0.2094268798828125, + "learning_rate": 4.417286433651366e-05, + "loss": 1.7107, + "step": 17955 + }, + { + "epoch": 5.511356660527931, + "grad_norm": 0.2684331238269806, + "learning_rate": 4.41679276764388e-05, + "loss": 1.7336, + "step": 17956 + }, + { + "epoch": 5.5116635972989565, + "grad_norm": 0.27616915106773376, + "learning_rate": 4.416299107399987e-05, + "loss": 1.7439, + "step": 17957 + }, + { + "epoch": 5.511970534069982, + "grad_norm": 0.23874540627002716, + "learning_rate": 4.415805452924569e-05, + "loss": 1.7979, + "step": 17958 + }, + { + "epoch": 5.512277470841006, + "grad_norm": 0.21870921552181244, + "learning_rate": 4.415311804222503e-05, + "loss": 1.6674, + "step": 17959 + }, + { + "epoch": 5.512584407612032, + "grad_norm": 0.23042429983615875, + "learning_rate": 4.414818161298671e-05, + "loss": 1.7588, + "step": 17960 + }, + { + "epoch": 5.512891344383057, + "grad_norm": 0.2957153916358948, + "learning_rate": 4.4143245241579486e-05, + "loss": 1.8412, + "step": 17961 + }, + { + "epoch": 5.513198281154082, + 
"grad_norm": 0.28292644023895264, + "learning_rate": 4.413830892805213e-05, + "loss": 1.7915, + "step": 17962 + }, + { + "epoch": 5.513505217925108, + "grad_norm": 0.26526281237602234, + "learning_rate": 4.413337267245344e-05, + "loss": 1.7199, + "step": 17963 + }, + { + "epoch": 5.513812154696133, + "grad_norm": 0.41243693232536316, + "learning_rate": 4.4128436474832204e-05, + "loss": 1.7419, + "step": 17964 + }, + { + "epoch": 5.514119091467157, + "grad_norm": 0.2747771739959717, + "learning_rate": 4.4123500335237214e-05, + "loss": 1.7449, + "step": 17965 + }, + { + "epoch": 5.514426028238183, + "grad_norm": 0.25944122672080994, + "learning_rate": 4.4118564253717216e-05, + "loss": 1.7667, + "step": 17966 + }, + { + "epoch": 5.514732965009208, + "grad_norm": 0.32558533549308777, + "learning_rate": 4.411362823032103e-05, + "loss": 1.7292, + "step": 17967 + }, + { + "epoch": 5.515039901780233, + "grad_norm": 0.20190958678722382, + "learning_rate": 4.4108692265097404e-05, + "loss": 1.7529, + "step": 17968 + }, + { + "epoch": 5.515346838551259, + "grad_norm": 0.35485807061195374, + "learning_rate": 4.410375635809514e-05, + "loss": 1.7335, + "step": 17969 + }, + { + "epoch": 5.515653775322283, + "grad_norm": 0.2670159935951233, + "learning_rate": 4.409882050936301e-05, + "loss": 1.6789, + "step": 17970 + }, + { + "epoch": 5.5159607120933085, + "grad_norm": 0.19106578826904297, + "learning_rate": 4.409388471894981e-05, + "loss": 1.708, + "step": 17971 + }, + { + "epoch": 5.516267648864334, + "grad_norm": 0.2707268297672272, + "learning_rate": 4.4088948986904286e-05, + "loss": 1.7917, + "step": 17972 + }, + { + "epoch": 5.516574585635359, + "grad_norm": 0.2329230159521103, + "learning_rate": 4.408401331327525e-05, + "loss": 1.7378, + "step": 17973 + }, + { + "epoch": 5.5168815224063845, + "grad_norm": 0.22164998948574066, + "learning_rate": 4.4079077698111436e-05, + "loss": 1.7287, + "step": 17974 + }, + { + "epoch": 5.51718845917741, + "grad_norm": 0.25895699858665466, 
+ "learning_rate": 4.4074142141461665e-05, + "loss": 1.7158, + "step": 17975 + }, + { + "epoch": 5.517495395948434, + "grad_norm": 0.2617860436439514, + "learning_rate": 4.4069206643374695e-05, + "loss": 1.7767, + "step": 17976 + }, + { + "epoch": 5.51780233271946, + "grad_norm": 0.20443588495254517, + "learning_rate": 4.40642712038993e-05, + "loss": 1.7371, + "step": 17977 + }, + { + "epoch": 5.518109269490485, + "grad_norm": 0.26251545548439026, + "learning_rate": 4.4059335823084266e-05, + "loss": 1.8154, + "step": 17978 + }, + { + "epoch": 5.51841620626151, + "grad_norm": 0.2315993458032608, + "learning_rate": 4.405440050097833e-05, + "loss": 1.7426, + "step": 17979 + }, + { + "epoch": 5.518723143032536, + "grad_norm": 0.19467706978321075, + "learning_rate": 4.404946523763031e-05, + "loss": 1.7418, + "step": 17980 + }, + { + "epoch": 5.51903007980356, + "grad_norm": 0.2387837916612625, + "learning_rate": 4.4044530033088946e-05, + "loss": 1.7648, + "step": 17981 + }, + { + "epoch": 5.519337016574585, + "grad_norm": 0.21097531914710999, + "learning_rate": 4.403959488740306e-05, + "loss": 1.7198, + "step": 17982 + }, + { + "epoch": 5.519643953345611, + "grad_norm": 0.22303247451782227, + "learning_rate": 4.403465980062136e-05, + "loss": 1.7679, + "step": 17983 + }, + { + "epoch": 5.519950890116636, + "grad_norm": 0.19705620408058167, + "learning_rate": 4.4029724772792666e-05, + "loss": 1.7747, + "step": 17984 + }, + { + "epoch": 5.520257826887661, + "grad_norm": 0.20864570140838623, + "learning_rate": 4.4024789803965715e-05, + "loss": 1.6797, + "step": 17985 + }, + { + "epoch": 5.520564763658687, + "grad_norm": 0.1917724758386612, + "learning_rate": 4.401985489418931e-05, + "loss": 1.7246, + "step": 17986 + }, + { + "epoch": 5.520871700429711, + "grad_norm": 0.25668975710868835, + "learning_rate": 4.401492004351219e-05, + "loss": 1.7245, + "step": 17987 + }, + { + "epoch": 5.5211786372007365, + "grad_norm": 0.22576093673706055, + "learning_rate": 
4.4009985251983146e-05, + "loss": 1.6766, + "step": 17988 + }, + { + "epoch": 5.521485573971762, + "grad_norm": 0.18614664673805237, + "learning_rate": 4.400505051965093e-05, + "loss": 1.7379, + "step": 17989 + }, + { + "epoch": 5.521792510742787, + "grad_norm": 0.21472783386707306, + "learning_rate": 4.4000115846564335e-05, + "loss": 1.7203, + "step": 17990 + }, + { + "epoch": 5.5220994475138125, + "grad_norm": 0.201142817735672, + "learning_rate": 4.39951812327721e-05, + "loss": 1.7049, + "step": 17991 + }, + { + "epoch": 5.522406384284837, + "grad_norm": 0.193614661693573, + "learning_rate": 4.3990246678323e-05, + "loss": 1.6938, + "step": 17992 + }, + { + "epoch": 5.522713321055862, + "grad_norm": 0.23343239724636078, + "learning_rate": 4.398531218326582e-05, + "loss": 1.744, + "step": 17993 + }, + { + "epoch": 5.523020257826888, + "grad_norm": 0.26271605491638184, + "learning_rate": 4.3980377747649305e-05, + "loss": 1.7458, + "step": 17994 + }, + { + "epoch": 5.523327194597913, + "grad_norm": 0.2048577219247818, + "learning_rate": 4.397544337152223e-05, + "loss": 1.763, + "step": 17995 + }, + { + "epoch": 5.523634131368938, + "grad_norm": 0.27748194336891174, + "learning_rate": 4.397050905493334e-05, + "loss": 1.7346, + "step": 17996 + }, + { + "epoch": 5.523941068139964, + "grad_norm": 0.3040253520011902, + "learning_rate": 4.3965574797931417e-05, + "loss": 1.7396, + "step": 17997 + }, + { + "epoch": 5.524248004910988, + "grad_norm": 0.3310317397117615, + "learning_rate": 4.396064060056523e-05, + "loss": 1.8094, + "step": 17998 + }, + { + "epoch": 5.524554941682013, + "grad_norm": 0.21845392882823944, + "learning_rate": 4.395570646288352e-05, + "loss": 1.7013, + "step": 17999 + }, + { + "epoch": 5.524861878453039, + "grad_norm": 0.319876492023468, + "learning_rate": 4.395077238493506e-05, + "loss": 1.7985, + "step": 18000 + }, + { + "epoch": 5.525168815224064, + "grad_norm": 0.28261950612068176, + "learning_rate": 4.394583836676863e-05, + "loss": 1.7979, + 
"step": 18001 + }, + { + "epoch": 5.525475751995089, + "grad_norm": 0.20874030888080597, + "learning_rate": 4.394090440843296e-05, + "loss": 1.7363, + "step": 18002 + }, + { + "epoch": 5.525782688766114, + "grad_norm": 0.28587406873703003, + "learning_rate": 4.393597050997684e-05, + "loss": 1.6787, + "step": 18003 + }, + { + "epoch": 5.526089625537139, + "grad_norm": 0.2719021439552307, + "learning_rate": 4.393103667144899e-05, + "loss": 1.7625, + "step": 18004 + }, + { + "epoch": 5.526396562308165, + "grad_norm": 0.22485414147377014, + "learning_rate": 4.392610289289821e-05, + "loss": 1.6847, + "step": 18005 + }, + { + "epoch": 5.52670349907919, + "grad_norm": 0.3500347435474396, + "learning_rate": 4.392116917437322e-05, + "loss": 1.7244, + "step": 18006 + }, + { + "epoch": 5.527010435850215, + "grad_norm": 0.26308783888816833, + "learning_rate": 4.3916235515922836e-05, + "loss": 1.7738, + "step": 18007 + }, + { + "epoch": 5.52731737262124, + "grad_norm": 0.27030646800994873, + "learning_rate": 4.391130191759574e-05, + "loss": 1.7149, + "step": 18008 + }, + { + "epoch": 5.527624309392265, + "grad_norm": 0.4137318730354309, + "learning_rate": 4.390636837944076e-05, + "loss": 1.7581, + "step": 18009 + }, + { + "epoch": 5.52793124616329, + "grad_norm": 0.2462068647146225, + "learning_rate": 4.390143490150659e-05, + "loss": 1.7767, + "step": 18010 + }, + { + "epoch": 5.528238182934316, + "grad_norm": 0.27424392104148865, + "learning_rate": 4.3896501483842036e-05, + "loss": 1.7701, + "step": 18011 + }, + { + "epoch": 5.528545119705341, + "grad_norm": 0.31268683075904846, + "learning_rate": 4.389156812649583e-05, + "loss": 1.7342, + "step": 18012 + }, + { + "epoch": 5.5288520564763655, + "grad_norm": 0.20428471267223358, + "learning_rate": 4.388663482951671e-05, + "loss": 1.7083, + "step": 18013 + }, + { + "epoch": 5.529158993247391, + "grad_norm": 0.322344034910202, + "learning_rate": 4.3881701592953475e-05, + "loss": 1.7423, + "step": 18014 + }, + { + "epoch": 
5.529465930018416, + "grad_norm": 0.2267894744873047, + "learning_rate": 4.387676841685483e-05, + "loss": 1.7309, + "step": 18015 + }, + { + "epoch": 5.5297728667894415, + "grad_norm": 0.23041954636573792, + "learning_rate": 4.387183530126955e-05, + "loss": 1.7352, + "step": 18016 + }, + { + "epoch": 5.530079803560467, + "grad_norm": 0.31139662861824036, + "learning_rate": 4.386690224624638e-05, + "loss": 1.7223, + "step": 18017 + }, + { + "epoch": 5.530386740331492, + "grad_norm": 0.20144063234329224, + "learning_rate": 4.38619692518341e-05, + "loss": 1.7607, + "step": 18018 + }, + { + "epoch": 5.530693677102517, + "grad_norm": 0.23812296986579895, + "learning_rate": 4.385703631808142e-05, + "loss": 1.7599, + "step": 18019 + }, + { + "epoch": 5.531000613873542, + "grad_norm": 0.2442231923341751, + "learning_rate": 4.385210344503712e-05, + "loss": 1.7094, + "step": 18020 + }, + { + "epoch": 5.531307550644567, + "grad_norm": 0.19497406482696533, + "learning_rate": 4.384717063274992e-05, + "loss": 1.7686, + "step": 18021 + }, + { + "epoch": 5.531614487415593, + "grad_norm": 0.29085835814476013, + "learning_rate": 4.38422378812686e-05, + "loss": 1.7454, + "step": 18022 + }, + { + "epoch": 5.531921424186618, + "grad_norm": 0.2701610028743744, + "learning_rate": 4.3837305190641876e-05, + "loss": 1.7376, + "step": 18023 + }, + { + "epoch": 5.532228360957642, + "grad_norm": 0.21232132613658905, + "learning_rate": 4.383237256091854e-05, + "loss": 1.7773, + "step": 18024 + }, + { + "epoch": 5.532535297728668, + "grad_norm": 0.24131610989570618, + "learning_rate": 4.382743999214729e-05, + "loss": 1.7899, + "step": 18025 + }, + { + "epoch": 5.532842234499693, + "grad_norm": 0.2752540409564972, + "learning_rate": 4.382250748437692e-05, + "loss": 1.7603, + "step": 18026 + }, + { + "epoch": 5.533149171270718, + "grad_norm": 0.2007865607738495, + "learning_rate": 4.381757503765613e-05, + "loss": 1.7553, + "step": 18027 + }, + { + "epoch": 5.533456108041744, + "grad_norm": 
0.23768723011016846, + "learning_rate": 4.38126426520337e-05, + "loss": 1.757, + "step": 18028 + }, + { + "epoch": 5.533763044812769, + "grad_norm": 0.22198502719402313, + "learning_rate": 4.3807710327558366e-05, + "loss": 1.7578, + "step": 18029 + }, + { + "epoch": 5.5340699815837935, + "grad_norm": 0.22432352602481842, + "learning_rate": 4.380277806427885e-05, + "loss": 1.75, + "step": 18030 + }, + { + "epoch": 5.534376918354819, + "grad_norm": 0.23029591143131256, + "learning_rate": 4.379784586224394e-05, + "loss": 1.7829, + "step": 18031 + }, + { + "epoch": 5.534683855125844, + "grad_norm": 0.23901896178722382, + "learning_rate": 4.379291372150232e-05, + "loss": 1.7461, + "step": 18032 + }, + { + "epoch": 5.5349907918968695, + "grad_norm": 0.20958681404590607, + "learning_rate": 4.378798164210278e-05, + "loss": 1.7224, + "step": 18033 + }, + { + "epoch": 5.535297728667894, + "grad_norm": 0.21619680523872375, + "learning_rate": 4.3783049624094036e-05, + "loss": 1.7605, + "step": 18034 + }, + { + "epoch": 5.535604665438919, + "grad_norm": 0.22988620400428772, + "learning_rate": 4.3778117667524867e-05, + "loss": 1.7668, + "step": 18035 + }, + { + "epoch": 5.535911602209945, + "grad_norm": 0.20107243955135345, + "learning_rate": 4.377318577244395e-05, + "loss": 1.7932, + "step": 18036 + }, + { + "epoch": 5.53621853898097, + "grad_norm": 0.25803956389427185, + "learning_rate": 4.376825393890009e-05, + "loss": 1.7409, + "step": 18037 + }, + { + "epoch": 5.536525475751995, + "grad_norm": 0.34292399883270264, + "learning_rate": 4.376332216694198e-05, + "loss": 1.8554, + "step": 18038 + }, + { + "epoch": 5.536832412523021, + "grad_norm": 0.23147790133953094, + "learning_rate": 4.375839045661839e-05, + "loss": 1.7918, + "step": 18039 + }, + { + "epoch": 5.537139349294045, + "grad_norm": 0.2387644350528717, + "learning_rate": 4.375345880797802e-05, + "loss": 1.7391, + "step": 18040 + }, + { + "epoch": 5.53744628606507, + "grad_norm": 0.21463727951049805, + 
"learning_rate": 4.374852722106966e-05, + "loss": 1.6812, + "step": 18041 + }, + { + "epoch": 5.537753222836096, + "grad_norm": 0.21994563937187195, + "learning_rate": 4.3743595695941994e-05, + "loss": 1.7727, + "step": 18042 + }, + { + "epoch": 5.538060159607121, + "grad_norm": 0.21102699637413025, + "learning_rate": 4.373866423264381e-05, + "loss": 1.7854, + "step": 18043 + }, + { + "epoch": 5.538367096378146, + "grad_norm": 0.21742786467075348, + "learning_rate": 4.3733732831223794e-05, + "loss": 1.7352, + "step": 18044 + }, + { + "epoch": 5.538674033149171, + "grad_norm": 0.20080791413784027, + "learning_rate": 4.372880149173071e-05, + "loss": 1.7264, + "step": 18045 + }, + { + "epoch": 5.538980969920196, + "grad_norm": 0.21027569472789764, + "learning_rate": 4.372387021421329e-05, + "loss": 1.766, + "step": 18046 + }, + { + "epoch": 5.5392879066912215, + "grad_norm": 0.22870683670043945, + "learning_rate": 4.371893899872025e-05, + "loss": 1.7746, + "step": 18047 + }, + { + "epoch": 5.539594843462247, + "grad_norm": 0.21248690783977509, + "learning_rate": 4.371400784530036e-05, + "loss": 1.7447, + "step": 18048 + }, + { + "epoch": 5.539901780233272, + "grad_norm": 0.23059454560279846, + "learning_rate": 4.37090767540023e-05, + "loss": 1.7827, + "step": 18049 + }, + { + "epoch": 5.5402087170042975, + "grad_norm": 0.2519036531448364, + "learning_rate": 4.370414572487485e-05, + "loss": 1.7984, + "step": 18050 + }, + { + "epoch": 5.540515653775322, + "grad_norm": 0.23621398210525513, + "learning_rate": 4.36992147579667e-05, + "loss": 1.7517, + "step": 18051 + }, + { + "epoch": 5.540822590546347, + "grad_norm": 0.24267609417438507, + "learning_rate": 4.3694283853326625e-05, + "loss": 1.8285, + "step": 18052 + }, + { + "epoch": 5.541129527317373, + "grad_norm": 0.23209960758686066, + "learning_rate": 4.368935301100332e-05, + "loss": 1.7765, + "step": 18053 + }, + { + "epoch": 5.541436464088398, + "grad_norm": 0.21277187764644623, + "learning_rate": 
4.368442223104555e-05, + "loss": 1.7182, + "step": 18054 + }, + { + "epoch": 5.541743400859423, + "grad_norm": 0.20821616053581238, + "learning_rate": 4.367949151350199e-05, + "loss": 1.6766, + "step": 18055 + }, + { + "epoch": 5.542050337630448, + "grad_norm": 0.23019999265670776, + "learning_rate": 4.3674560858421414e-05, + "loss": 1.7438, + "step": 18056 + }, + { + "epoch": 5.542357274401473, + "grad_norm": 0.21547134220600128, + "learning_rate": 4.366963026585253e-05, + "loss": 1.7003, + "step": 18057 + }, + { + "epoch": 5.542664211172498, + "grad_norm": 0.22454513609409332, + "learning_rate": 4.3664699735844084e-05, + "loss": 1.7072, + "step": 18058 + }, + { + "epoch": 5.542971147943524, + "grad_norm": 0.22228482365608215, + "learning_rate": 4.365976926844477e-05, + "loss": 1.7557, + "step": 18059 + }, + { + "epoch": 5.543278084714549, + "grad_norm": 0.25762560963630676, + "learning_rate": 4.365483886370335e-05, + "loss": 1.7751, + "step": 18060 + }, + { + "epoch": 5.543585021485574, + "grad_norm": 0.2086205631494522, + "learning_rate": 4.3649908521668516e-05, + "loss": 1.7399, + "step": 18061 + }, + { + "epoch": 5.543891958256599, + "grad_norm": 0.2759089767932892, + "learning_rate": 4.3644978242389014e-05, + "loss": 1.7503, + "step": 18062 + }, + { + "epoch": 5.544198895027624, + "grad_norm": 0.2235182225704193, + "learning_rate": 4.364004802591358e-05, + "loss": 1.7313, + "step": 18063 + }, + { + "epoch": 5.5445058317986495, + "grad_norm": 0.23074570298194885, + "learning_rate": 4.3635117872290885e-05, + "loss": 1.7649, + "step": 18064 + }, + { + "epoch": 5.544812768569675, + "grad_norm": 0.24929538369178772, + "learning_rate": 4.363018778156972e-05, + "loss": 1.732, + "step": 18065 + }, + { + "epoch": 5.5451197053407, + "grad_norm": 0.26422035694122314, + "learning_rate": 4.362525775379874e-05, + "loss": 1.7276, + "step": 18066 + }, + { + "epoch": 5.545426642111725, + "grad_norm": 0.3160388767719269, + "learning_rate": 4.362032778902672e-05, + "loss": 
1.7777, + "step": 18067 + }, + { + "epoch": 5.54573357888275, + "grad_norm": 0.20791196823120117, + "learning_rate": 4.3615397887302345e-05, + "loss": 1.7058, + "step": 18068 + }, + { + "epoch": 5.546040515653775, + "grad_norm": 0.31438156962394714, + "learning_rate": 4.361046804867437e-05, + "loss": 1.8102, + "step": 18069 + }, + { + "epoch": 5.546347452424801, + "grad_norm": 0.3008113205432892, + "learning_rate": 4.3605538273191475e-05, + "loss": 1.7297, + "step": 18070 + }, + { + "epoch": 5.546654389195826, + "grad_norm": 0.21147282421588898, + "learning_rate": 4.3600608560902425e-05, + "loss": 1.776, + "step": 18071 + }, + { + "epoch": 5.546961325966851, + "grad_norm": 0.25202393531799316, + "learning_rate": 4.3595678911855884e-05, + "loss": 1.7273, + "step": 18072 + }, + { + "epoch": 5.547268262737876, + "grad_norm": 0.18881210684776306, + "learning_rate": 4.3590749326100614e-05, + "loss": 1.7026, + "step": 18073 + }, + { + "epoch": 5.547575199508901, + "grad_norm": 0.25075671076774597, + "learning_rate": 4.3585819803685295e-05, + "loss": 1.7694, + "step": 18074 + }, + { + "epoch": 5.547882136279926, + "grad_norm": 0.2625887989997864, + "learning_rate": 4.358089034465869e-05, + "loss": 1.7338, + "step": 18075 + }, + { + "epoch": 5.548189073050952, + "grad_norm": 0.27278679609298706, + "learning_rate": 4.357596094906947e-05, + "loss": 1.7684, + "step": 18076 + }, + { + "epoch": 5.548496009821976, + "grad_norm": 0.283964604139328, + "learning_rate": 4.3571031616966396e-05, + "loss": 1.7539, + "step": 18077 + }, + { + "epoch": 5.5488029465930016, + "grad_norm": 0.2702009975910187, + "learning_rate": 4.3566102348398124e-05, + "loss": 1.8064, + "step": 18078 + }, + { + "epoch": 5.549109883364027, + "grad_norm": 0.449733167886734, + "learning_rate": 4.356117314341342e-05, + "loss": 1.7258, + "step": 18079 + }, + { + "epoch": 5.549416820135052, + "grad_norm": 0.3199995160102844, + "learning_rate": 4.3556244002060975e-05, + "loss": 1.7526, + "step": 18080 + }, + { + 
"epoch": 5.5497237569060776, + "grad_norm": 0.2803747355937958, + "learning_rate": 4.3551314924389494e-05, + "loss": 1.764, + "step": 18081 + }, + { + "epoch": 5.550030693677103, + "grad_norm": 0.28995978832244873, + "learning_rate": 4.3546385910447715e-05, + "loss": 1.7617, + "step": 18082 + }, + { + "epoch": 5.550337630448127, + "grad_norm": 0.24313311278820038, + "learning_rate": 4.354145696028431e-05, + "loss": 1.7515, + "step": 18083 + }, + { + "epoch": 5.550644567219153, + "grad_norm": 0.2668032944202423, + "learning_rate": 4.3536528073948025e-05, + "loss": 1.743, + "step": 18084 + }, + { + "epoch": 5.550951503990178, + "grad_norm": 0.22831310331821442, + "learning_rate": 4.353159925148755e-05, + "loss": 1.7971, + "step": 18085 + }, + { + "epoch": 5.551258440761203, + "grad_norm": 0.22047942876815796, + "learning_rate": 4.352667049295162e-05, + "loss": 1.6983, + "step": 18086 + }, + { + "epoch": 5.551565377532229, + "grad_norm": 0.22895069420337677, + "learning_rate": 4.35217417983889e-05, + "loss": 1.7866, + "step": 18087 + }, + { + "epoch": 5.551872314303253, + "grad_norm": 0.19946368038654327, + "learning_rate": 4.3516813167848156e-05, + "loss": 1.7129, + "step": 18088 + }, + { + "epoch": 5.5521792510742785, + "grad_norm": 0.21508903801441193, + "learning_rate": 4.351188460137804e-05, + "loss": 1.7154, + "step": 18089 + }, + { + "epoch": 5.552486187845304, + "grad_norm": 0.24813953042030334, + "learning_rate": 4.3506956099027294e-05, + "loss": 1.8326, + "step": 18090 + }, + { + "epoch": 5.552793124616329, + "grad_norm": 0.21306444704532623, + "learning_rate": 4.35020276608446e-05, + "loss": 1.7651, + "step": 18091 + }, + { + "epoch": 5.5531000613873545, + "grad_norm": 0.22041217982769012, + "learning_rate": 4.34970992868787e-05, + "loss": 1.6852, + "step": 18092 + }, + { + "epoch": 5.55340699815838, + "grad_norm": 0.21699896454811096, + "learning_rate": 4.349217097717826e-05, + "loss": 1.7524, + "step": 18093 + }, + { + "epoch": 5.553713934929404, + 
"grad_norm": 0.23086662590503693, + "learning_rate": 4.3487242731792015e-05, + "loss": 1.7441, + "step": 18094 + }, + { + "epoch": 5.55402087170043, + "grad_norm": 0.21898184716701508, + "learning_rate": 4.348231455076864e-05, + "loss": 1.7131, + "step": 18095 + }, + { + "epoch": 5.554327808471455, + "grad_norm": 0.17392560839653015, + "learning_rate": 4.3477386434156854e-05, + "loss": 1.7049, + "step": 18096 + }, + { + "epoch": 5.55463474524248, + "grad_norm": 0.1984172910451889, + "learning_rate": 4.3472458382005374e-05, + "loss": 1.7136, + "step": 18097 + }, + { + "epoch": 5.554941682013506, + "grad_norm": 0.19227837026119232, + "learning_rate": 4.3467530394362866e-05, + "loss": 1.7468, + "step": 18098 + }, + { + "epoch": 5.55524861878453, + "grad_norm": 0.2307087779045105, + "learning_rate": 4.346260247127807e-05, + "loss": 1.7004, + "step": 18099 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.21496252715587616, + "learning_rate": 4.345767461279965e-05, + "loss": 1.7508, + "step": 18100 + }, + { + "epoch": 5.555862492326581, + "grad_norm": 0.21119998395442963, + "learning_rate": 4.3452746818976333e-05, + "loss": 1.7965, + "step": 18101 + }, + { + "epoch": 5.556169429097606, + "grad_norm": 0.2416355311870575, + "learning_rate": 4.34478190898568e-05, + "loss": 1.7006, + "step": 18102 + }, + { + "epoch": 5.556476365868631, + "grad_norm": 0.2009642869234085, + "learning_rate": 4.344289142548978e-05, + "loss": 1.7567, + "step": 18103 + }, + { + "epoch": 5.556783302639657, + "grad_norm": 0.2387058436870575, + "learning_rate": 4.343796382592393e-05, + "loss": 1.7898, + "step": 18104 + }, + { + "epoch": 5.557090239410681, + "grad_norm": 0.19835951924324036, + "learning_rate": 4.343303629120798e-05, + "loss": 1.7888, + "step": 18105 + }, + { + "epoch": 5.5573971761817065, + "grad_norm": 0.23324637115001678, + "learning_rate": 4.3428108821390604e-05, + "loss": 1.7923, + "step": 18106 + }, + { + "epoch": 5.557704112952732, + "grad_norm": 0.22334477305412292, + 
"learning_rate": 4.342318141652052e-05, + "loss": 1.7234, + "step": 18107 + }, + { + "epoch": 5.558011049723757, + "grad_norm": 0.20220427215099335, + "learning_rate": 4.341825407664639e-05, + "loss": 1.7639, + "step": 18108 + }, + { + "epoch": 5.558317986494782, + "grad_norm": 0.23658546805381775, + "learning_rate": 4.3413326801816964e-05, + "loss": 1.7505, + "step": 18109 + }, + { + "epoch": 5.558624923265807, + "grad_norm": 0.21157726645469666, + "learning_rate": 4.3408399592080875e-05, + "loss": 1.7655, + "step": 18110 + }, + { + "epoch": 5.558931860036832, + "grad_norm": 0.2139829397201538, + "learning_rate": 4.340347244748687e-05, + "loss": 1.767, + "step": 18111 + }, + { + "epoch": 5.559238796807858, + "grad_norm": 0.17811299860477448, + "learning_rate": 4.339854536808359e-05, + "loss": 1.6629, + "step": 18112 + }, + { + "epoch": 5.559545733578883, + "grad_norm": 0.2005898356437683, + "learning_rate": 4.339361835391977e-05, + "loss": 1.7269, + "step": 18113 + }, + { + "epoch": 5.559852670349908, + "grad_norm": 0.21514086425304413, + "learning_rate": 4.338869140504409e-05, + "loss": 1.7806, + "step": 18114 + }, + { + "epoch": 5.560159607120933, + "grad_norm": 0.23163840174674988, + "learning_rate": 4.338376452150522e-05, + "loss": 1.7259, + "step": 18115 + }, + { + "epoch": 5.560466543891958, + "grad_norm": 0.23657509684562683, + "learning_rate": 4.337883770335189e-05, + "loss": 1.7778, + "step": 18116 + }, + { + "epoch": 5.560773480662983, + "grad_norm": 0.20135201513767242, + "learning_rate": 4.337391095063274e-05, + "loss": 1.7359, + "step": 18117 + }, + { + "epoch": 5.561080417434009, + "grad_norm": 0.22871774435043335, + "learning_rate": 4.33689842633965e-05, + "loss": 1.7658, + "step": 18118 + }, + { + "epoch": 5.561387354205034, + "grad_norm": 0.21755221486091614, + "learning_rate": 4.3364057641691835e-05, + "loss": 1.7408, + "step": 18119 + }, + { + "epoch": 5.5616942909760585, + "grad_norm": 0.215267151594162, + "learning_rate": 
4.335913108556746e-05, + "loss": 1.7175, + "step": 18120 + }, + { + "epoch": 5.562001227747084, + "grad_norm": 0.25724974274635315, + "learning_rate": 4.335420459507202e-05, + "loss": 1.7197, + "step": 18121 + }, + { + "epoch": 5.562308164518109, + "grad_norm": 0.25375521183013916, + "learning_rate": 4.3349278170254254e-05, + "loss": 1.7251, + "step": 18122 + }, + { + "epoch": 5.5626151012891345, + "grad_norm": 0.24768905341625214, + "learning_rate": 4.334435181116279e-05, + "loss": 1.7405, + "step": 18123 + }, + { + "epoch": 5.56292203806016, + "grad_norm": 0.21281081438064575, + "learning_rate": 4.333942551784636e-05, + "loss": 1.7131, + "step": 18124 + }, + { + "epoch": 5.563228974831185, + "grad_norm": 0.2129398137331009, + "learning_rate": 4.333449929035361e-05, + "loss": 1.7049, + "step": 18125 + }, + { + "epoch": 5.56353591160221, + "grad_norm": 0.24582397937774658, + "learning_rate": 4.332957312873328e-05, + "loss": 1.7205, + "step": 18126 + }, + { + "epoch": 5.563842848373235, + "grad_norm": 0.21282973885536194, + "learning_rate": 4.332464703303399e-05, + "loss": 1.7655, + "step": 18127 + }, + { + "epoch": 5.56414978514426, + "grad_norm": 0.2302251160144806, + "learning_rate": 4.331972100330447e-05, + "loss": 1.7597, + "step": 18128 + }, + { + "epoch": 5.564456721915286, + "grad_norm": 0.23453226685523987, + "learning_rate": 4.331479503959336e-05, + "loss": 1.7028, + "step": 18129 + }, + { + "epoch": 5.564763658686311, + "grad_norm": 0.19723562896251678, + "learning_rate": 4.330986914194938e-05, + "loss": 1.7101, + "step": 18130 + }, + { + "epoch": 5.565070595457335, + "grad_norm": 0.22021643817424774, + "learning_rate": 4.33049433104212e-05, + "loss": 1.7123, + "step": 18131 + }, + { + "epoch": 5.565377532228361, + "grad_norm": 0.25540977716445923, + "learning_rate": 4.3300017545057484e-05, + "loss": 1.7392, + "step": 18132 + }, + { + "epoch": 5.565684468999386, + "grad_norm": 0.23482176661491394, + "learning_rate": 4.329509184590693e-05, + "loss": 
1.7175, + "step": 18133 + }, + { + "epoch": 5.565991405770411, + "grad_norm": 0.19537311792373657, + "learning_rate": 4.329016621301819e-05, + "loss": 1.7583, + "step": 18134 + }, + { + "epoch": 5.566298342541437, + "grad_norm": 0.21828842163085938, + "learning_rate": 4.328524064643997e-05, + "loss": 1.7411, + "step": 18135 + }, + { + "epoch": 5.566605279312462, + "grad_norm": 0.24589122831821442, + "learning_rate": 4.328031514622093e-05, + "loss": 1.7769, + "step": 18136 + }, + { + "epoch": 5.5669122160834865, + "grad_norm": 0.20964545011520386, + "learning_rate": 4.327538971240978e-05, + "loss": 1.7743, + "step": 18137 + }, + { + "epoch": 5.567219152854512, + "grad_norm": 0.2210713028907776, + "learning_rate": 4.327046434505514e-05, + "loss": 1.7671, + "step": 18138 + }, + { + "epoch": 5.567526089625537, + "grad_norm": 0.21382687985897064, + "learning_rate": 4.3265539044205736e-05, + "loss": 1.793, + "step": 18139 + }, + { + "epoch": 5.5678330263965625, + "grad_norm": 0.23289678990840912, + "learning_rate": 4.326061380991021e-05, + "loss": 1.738, + "step": 18140 + }, + { + "epoch": 5.568139963167588, + "grad_norm": 0.23789258301258087, + "learning_rate": 4.325568864221725e-05, + "loss": 1.8315, + "step": 18141 + }, + { + "epoch": 5.568446899938612, + "grad_norm": 0.1925022453069687, + "learning_rate": 4.325076354117554e-05, + "loss": 1.6956, + "step": 18142 + }, + { + "epoch": 5.568753836709638, + "grad_norm": 0.22522561252117157, + "learning_rate": 4.324583850683373e-05, + "loss": 1.7957, + "step": 18143 + }, + { + "epoch": 5.569060773480663, + "grad_norm": 0.2787671387195587, + "learning_rate": 4.324091353924049e-05, + "loss": 1.7325, + "step": 18144 + }, + { + "epoch": 5.569367710251688, + "grad_norm": 0.2723194658756256, + "learning_rate": 4.3235988638444536e-05, + "loss": 1.7668, + "step": 18145 + }, + { + "epoch": 5.569674647022714, + "grad_norm": 0.2241704910993576, + "learning_rate": 4.3231063804494484e-05, + "loss": 1.7977, + "step": 18146 + }, + { + 
"epoch": 5.569981583793739, + "grad_norm": 0.2627747356891632, + "learning_rate": 4.322613903743903e-05, + "loss": 1.6775, + "step": 18147 + }, + { + "epoch": 5.570288520564763, + "grad_norm": 0.2644255757331848, + "learning_rate": 4.322121433732686e-05, + "loss": 1.7404, + "step": 18148 + }, + { + "epoch": 5.570595457335789, + "grad_norm": 0.2386743575334549, + "learning_rate": 4.321628970420659e-05, + "loss": 1.7386, + "step": 18149 + }, + { + "epoch": 5.570902394106814, + "grad_norm": 0.22444583475589752, + "learning_rate": 4.3211365138126945e-05, + "loss": 1.7482, + "step": 18150 + }, + { + "epoch": 5.571209330877839, + "grad_norm": 0.21770013868808746, + "learning_rate": 4.3206440639136554e-05, + "loss": 1.7322, + "step": 18151 + }, + { + "epoch": 5.571516267648864, + "grad_norm": 0.22356587648391724, + "learning_rate": 4.320151620728411e-05, + "loss": 1.751, + "step": 18152 + }, + { + "epoch": 5.571823204419889, + "grad_norm": 0.2040669322013855, + "learning_rate": 4.319659184261826e-05, + "loss": 1.712, + "step": 18153 + }, + { + "epoch": 5.5721301411909145, + "grad_norm": 0.20951713621616364, + "learning_rate": 4.319166754518768e-05, + "loss": 1.7308, + "step": 18154 + }, + { + "epoch": 5.57243707796194, + "grad_norm": 0.186195969581604, + "learning_rate": 4.3186743315041025e-05, + "loss": 1.7133, + "step": 18155 + }, + { + "epoch": 5.572744014732965, + "grad_norm": 0.2098865509033203, + "learning_rate": 4.318181915222698e-05, + "loss": 1.7645, + "step": 18156 + }, + { + "epoch": 5.5730509515039905, + "grad_norm": 0.20552097260951996, + "learning_rate": 4.317689505679418e-05, + "loss": 1.7156, + "step": 18157 + }, + { + "epoch": 5.573357888275015, + "grad_norm": 0.22506964206695557, + "learning_rate": 4.3171971028791314e-05, + "loss": 1.7192, + "step": 18158 + }, + { + "epoch": 5.57366482504604, + "grad_norm": 0.2296760082244873, + "learning_rate": 4.316704706826702e-05, + "loss": 1.7534, + "step": 18159 + }, + { + "epoch": 5.573971761817066, + "grad_norm": 
0.20140253007411957, + "learning_rate": 4.316212317526998e-05, + "loss": 1.6906, + "step": 18160 + }, + { + "epoch": 5.574278698588091, + "grad_norm": 0.23313316702842712, + "learning_rate": 4.315719934984884e-05, + "loss": 1.6929, + "step": 18161 + }, + { + "epoch": 5.574585635359116, + "grad_norm": 0.23398169875144958, + "learning_rate": 4.315227559205228e-05, + "loss": 1.7254, + "step": 18162 + }, + { + "epoch": 5.574892572130141, + "grad_norm": 0.20836731791496277, + "learning_rate": 4.314735190192894e-05, + "loss": 1.7335, + "step": 18163 + }, + { + "epoch": 5.575199508901166, + "grad_norm": 0.19899079203605652, + "learning_rate": 4.3142428279527485e-05, + "loss": 1.69, + "step": 18164 + }, + { + "epoch": 5.5755064456721914, + "grad_norm": 0.24623680114746094, + "learning_rate": 4.313750472489657e-05, + "loss": 1.7413, + "step": 18165 + }, + { + "epoch": 5.575813382443217, + "grad_norm": 0.2432616949081421, + "learning_rate": 4.313258123808484e-05, + "loss": 1.7426, + "step": 18166 + }, + { + "epoch": 5.576120319214242, + "grad_norm": 0.22773970663547516, + "learning_rate": 4.3127657819141006e-05, + "loss": 1.7986, + "step": 18167 + }, + { + "epoch": 5.5764272559852675, + "grad_norm": 0.19891540706157684, + "learning_rate": 4.312273446811366e-05, + "loss": 1.7007, + "step": 18168 + }, + { + "epoch": 5.576734192756292, + "grad_norm": 0.23402714729309082, + "learning_rate": 4.311781118505149e-05, + "loss": 1.7774, + "step": 18169 + }, + { + "epoch": 5.577041129527317, + "grad_norm": 0.2248220294713974, + "learning_rate": 4.3112887970003134e-05, + "loss": 1.7079, + "step": 18170 + }, + { + "epoch": 5.577348066298343, + "grad_norm": 0.20901209115982056, + "learning_rate": 4.310796482301726e-05, + "loss": 1.7336, + "step": 18171 + }, + { + "epoch": 5.577655003069368, + "grad_norm": 0.21872754395008087, + "learning_rate": 4.3103041744142516e-05, + "loss": 1.7742, + "step": 18172 + }, + { + "epoch": 5.577961939840393, + "grad_norm": 0.2567403018474579, + 
"learning_rate": 4.309811873342757e-05, + "loss": 1.7894, + "step": 18173 + }, + { + "epoch": 5.578268876611418, + "grad_norm": 0.219998300075531, + "learning_rate": 4.3093195790921035e-05, + "loss": 1.7283, + "step": 18174 + }, + { + "epoch": 5.578575813382443, + "grad_norm": 0.1944747269153595, + "learning_rate": 4.3088272916671614e-05, + "loss": 1.7129, + "step": 18175 + }, + { + "epoch": 5.578882750153468, + "grad_norm": 0.19492141902446747, + "learning_rate": 4.308335011072791e-05, + "loss": 1.7286, + "step": 18176 + }, + { + "epoch": 5.579189686924494, + "grad_norm": 0.22383002936840057, + "learning_rate": 4.3078427373138604e-05, + "loss": 1.733, + "step": 18177 + }, + { + "epoch": 5.579496623695519, + "grad_norm": 0.20238643884658813, + "learning_rate": 4.307350470395232e-05, + "loss": 1.7522, + "step": 18178 + }, + { + "epoch": 5.579803560466544, + "grad_norm": 0.21456125378608704, + "learning_rate": 4.3068582103217755e-05, + "loss": 1.7298, + "step": 18179 + }, + { + "epoch": 5.580110497237569, + "grad_norm": 0.28084230422973633, + "learning_rate": 4.3063659570983514e-05, + "loss": 1.7805, + "step": 18180 + }, + { + "epoch": 5.580417434008594, + "grad_norm": 0.21319706737995148, + "learning_rate": 4.305873710729824e-05, + "loss": 1.6801, + "step": 18181 + }, + { + "epoch": 5.5807243707796195, + "grad_norm": 0.2279660850763321, + "learning_rate": 4.30538147122106e-05, + "loss": 1.752, + "step": 18182 + }, + { + "epoch": 5.581031307550645, + "grad_norm": 0.1958594173192978, + "learning_rate": 4.304889238576922e-05, + "loss": 1.7487, + "step": 18183 + }, + { + "epoch": 5.581338244321669, + "grad_norm": 0.19484321773052216, + "learning_rate": 4.304397012802279e-05, + "loss": 1.7222, + "step": 18184 + }, + { + "epoch": 5.581645181092695, + "grad_norm": 0.19863305985927582, + "learning_rate": 4.3039047939019906e-05, + "loss": 1.7296, + "step": 18185 + }, + { + "epoch": 5.58195211786372, + "grad_norm": 0.18674087524414062, + "learning_rate": 
4.303412581880924e-05, + "loss": 1.6753, + "step": 18186 + }, + { + "epoch": 5.582259054634745, + "grad_norm": 0.22263208031654358, + "learning_rate": 4.302920376743941e-05, + "loss": 1.7431, + "step": 18187 + }, + { + "epoch": 5.582565991405771, + "grad_norm": 0.1926872879266739, + "learning_rate": 4.302428178495909e-05, + "loss": 1.7662, + "step": 18188 + }, + { + "epoch": 5.582872928176796, + "grad_norm": 0.23190459609031677, + "learning_rate": 4.301935987141689e-05, + "loss": 1.7271, + "step": 18189 + }, + { + "epoch": 5.58317986494782, + "grad_norm": 0.30057230591773987, + "learning_rate": 4.301443802686148e-05, + "loss": 1.7957, + "step": 18190 + }, + { + "epoch": 5.583486801718846, + "grad_norm": 0.2520695626735687, + "learning_rate": 4.3009516251341475e-05, + "loss": 1.7501, + "step": 18191 + }, + { + "epoch": 5.583793738489871, + "grad_norm": 0.19143317639827728, + "learning_rate": 4.300459454490555e-05, + "loss": 1.7091, + "step": 18192 + }, + { + "epoch": 5.584100675260896, + "grad_norm": 0.2064475119113922, + "learning_rate": 4.299967290760229e-05, + "loss": 1.6849, + "step": 18193 + }, + { + "epoch": 5.584407612031922, + "grad_norm": 0.3093598484992981, + "learning_rate": 4.299475133948039e-05, + "loss": 1.8479, + "step": 18194 + }, + { + "epoch": 5.584714548802946, + "grad_norm": 0.2875300943851471, + "learning_rate": 4.298982984058845e-05, + "loss": 1.7296, + "step": 18195 + }, + { + "epoch": 5.5850214855739715, + "grad_norm": 0.33194443583488464, + "learning_rate": 4.298490841097514e-05, + "loss": 1.7668, + "step": 18196 + }, + { + "epoch": 5.585328422344997, + "grad_norm": 0.20940829813480377, + "learning_rate": 4.297998705068908e-05, + "loss": 1.7316, + "step": 18197 + }, + { + "epoch": 5.585635359116022, + "grad_norm": 0.32381999492645264, + "learning_rate": 4.297506575977887e-05, + "loss": 1.7212, + "step": 18198 + }, + { + "epoch": 5.5859422958870475, + "grad_norm": 0.31585511565208435, + "learning_rate": 4.29701445382932e-05, + "loss": 1.7695, 
+ "step": 18199 + }, + { + "epoch": 5.586249232658073, + "grad_norm": 0.2272588014602661, + "learning_rate": 4.2965223386280664e-05, + "loss": 1.7105, + "step": 18200 + }, + { + "epoch": 5.586556169429097, + "grad_norm": 0.2949761152267456, + "learning_rate": 4.296030230378993e-05, + "loss": 1.803, + "step": 18201 + }, + { + "epoch": 5.586863106200123, + "grad_norm": 0.20512579381465912, + "learning_rate": 4.29553812908696e-05, + "loss": 1.759, + "step": 18202 + }, + { + "epoch": 5.587170042971148, + "grad_norm": 0.21143598854541779, + "learning_rate": 4.295046034756835e-05, + "loss": 1.7286, + "step": 18203 + }, + { + "epoch": 5.587476979742173, + "grad_norm": 0.22148001194000244, + "learning_rate": 4.294553947393476e-05, + "loss": 1.7258, + "step": 18204 + }, + { + "epoch": 5.587783916513199, + "grad_norm": 0.17245957255363464, + "learning_rate": 4.2940618670017484e-05, + "loss": 1.6863, + "step": 18205 + }, + { + "epoch": 5.588090853284223, + "grad_norm": 0.20260390639305115, + "learning_rate": 4.293569793586515e-05, + "loss": 1.6866, + "step": 18206 + }, + { + "epoch": 5.588397790055248, + "grad_norm": 0.20671936869621277, + "learning_rate": 4.293077727152641e-05, + "loss": 1.7849, + "step": 18207 + }, + { + "epoch": 5.588704726826274, + "grad_norm": 0.21415838599205017, + "learning_rate": 4.292585667704984e-05, + "loss": 1.7279, + "step": 18208 + }, + { + "epoch": 5.589011663597299, + "grad_norm": 0.18668091297149658, + "learning_rate": 4.2920936152484134e-05, + "loss": 1.7087, + "step": 18209 + }, + { + "epoch": 5.589318600368324, + "grad_norm": 0.2253870815038681, + "learning_rate": 4.291601569787786e-05, + "loss": 1.769, + "step": 18210 + }, + { + "epoch": 5.58962553713935, + "grad_norm": 0.22426939010620117, + "learning_rate": 4.291109531327968e-05, + "loss": 1.7382, + "step": 18211 + }, + { + "epoch": 5.589932473910374, + "grad_norm": 0.21552452445030212, + "learning_rate": 4.29061749987382e-05, + "loss": 1.7316, + "step": 18212 + }, + { + "epoch": 
5.5902394106813995, + "grad_norm": 0.2337147295475006, + "learning_rate": 4.290125475430209e-05, + "loss": 1.7836, + "step": 18213 + }, + { + "epoch": 5.590546347452425, + "grad_norm": 0.21780124306678772, + "learning_rate": 4.289633458001992e-05, + "loss": 1.6923, + "step": 18214 + }, + { + "epoch": 5.59085328422345, + "grad_norm": 0.20009608566761017, + "learning_rate": 4.289141447594033e-05, + "loss": 1.719, + "step": 18215 + }, + { + "epoch": 5.5911602209944755, + "grad_norm": 0.18165744841098785, + "learning_rate": 4.288649444211196e-05, + "loss": 1.6825, + "step": 18216 + }, + { + "epoch": 5.5914671577655, + "grad_norm": 0.2244826704263687, + "learning_rate": 4.288157447858341e-05, + "loss": 1.7323, + "step": 18217 + }, + { + "epoch": 5.591774094536525, + "grad_norm": 0.16875946521759033, + "learning_rate": 4.2876654585403325e-05, + "loss": 1.6787, + "step": 18218 + }, + { + "epoch": 5.592081031307551, + "grad_norm": 0.19244243204593658, + "learning_rate": 4.28717347626203e-05, + "loss": 1.7225, + "step": 18219 + }, + { + "epoch": 5.592387968078576, + "grad_norm": 0.21081633865833282, + "learning_rate": 4.286681501028299e-05, + "loss": 1.7063, + "step": 18220 + }, + { + "epoch": 5.592694904849601, + "grad_norm": 0.20926406979560852, + "learning_rate": 4.286189532843997e-05, + "loss": 1.7307, + "step": 18221 + }, + { + "epoch": 5.593001841620627, + "grad_norm": 0.20258775353431702, + "learning_rate": 4.28569757171399e-05, + "loss": 1.6917, + "step": 18222 + }, + { + "epoch": 5.593308778391651, + "grad_norm": 0.21956230700016022, + "learning_rate": 4.285205617643137e-05, + "loss": 1.7127, + "step": 18223 + }, + { + "epoch": 5.593615715162676, + "grad_norm": 0.2071436047554016, + "learning_rate": 4.284713670636303e-05, + "loss": 1.7487, + "step": 18224 + }, + { + "epoch": 5.593922651933702, + "grad_norm": 0.2002478390932083, + "learning_rate": 4.2842217306983464e-05, + "loss": 1.6544, + "step": 18225 + }, + { + "epoch": 5.594229588704727, + "grad_norm": 
0.20691382884979248, + "learning_rate": 4.283729797834132e-05, + "loss": 1.768, + "step": 18226 + }, + { + "epoch": 5.5945365254757515, + "grad_norm": 0.18423563241958618, + "learning_rate": 4.283237872048517e-05, + "loss": 1.7563, + "step": 18227 + }, + { + "epoch": 5.594843462246777, + "grad_norm": 0.23055453598499298, + "learning_rate": 4.2827459533463665e-05, + "loss": 1.8083, + "step": 18228 + }, + { + "epoch": 5.595150399017802, + "grad_norm": 0.20735648274421692, + "learning_rate": 4.2822540417325396e-05, + "loss": 1.7761, + "step": 18229 + }, + { + "epoch": 5.5954573357888275, + "grad_norm": 0.2919909656047821, + "learning_rate": 4.281762137211902e-05, + "loss": 1.7836, + "step": 18230 + }, + { + "epoch": 5.595764272559853, + "grad_norm": 0.22636881470680237, + "learning_rate": 4.2812702397893113e-05, + "loss": 1.7389, + "step": 18231 + }, + { + "epoch": 5.596071209330878, + "grad_norm": 0.23788630962371826, + "learning_rate": 4.280778349469627e-05, + "loss": 1.7536, + "step": 18232 + }, + { + "epoch": 5.596378146101903, + "grad_norm": 0.22089426219463348, + "learning_rate": 4.280286466257715e-05, + "loss": 1.7584, + "step": 18233 + }, + { + "epoch": 5.596685082872928, + "grad_norm": 0.20486171543598175, + "learning_rate": 4.279794590158431e-05, + "loss": 1.7182, + "step": 18234 + }, + { + "epoch": 5.596992019643953, + "grad_norm": 0.2343701422214508, + "learning_rate": 4.2793027211766425e-05, + "loss": 1.751, + "step": 18235 + }, + { + "epoch": 5.597298956414979, + "grad_norm": 0.21734023094177246, + "learning_rate": 4.2788108593172036e-05, + "loss": 1.7084, + "step": 18236 + }, + { + "epoch": 5.597605893186004, + "grad_norm": 0.20593903958797455, + "learning_rate": 4.278319004584982e-05, + "loss": 1.6805, + "step": 18237 + }, + { + "epoch": 5.597912829957028, + "grad_norm": 0.20877878367900848, + "learning_rate": 4.2778271569848324e-05, + "loss": 1.7011, + "step": 18238 + }, + { + "epoch": 5.598219766728054, + "grad_norm": 0.23915995657444, + 
"learning_rate": 4.277335316521619e-05, + "loss": 1.732, + "step": 18239 + }, + { + "epoch": 5.598526703499079, + "grad_norm": 0.24310529232025146, + "learning_rate": 4.2768434832002004e-05, + "loss": 1.7859, + "step": 18240 + }, + { + "epoch": 5.598833640270104, + "grad_norm": 0.23189407587051392, + "learning_rate": 4.27635165702544e-05, + "loss": 1.7237, + "step": 18241 + }, + { + "epoch": 5.59914057704113, + "grad_norm": 0.2708875834941864, + "learning_rate": 4.275859838002195e-05, + "loss": 1.7046, + "step": 18242 + }, + { + "epoch": 5.599447513812155, + "grad_norm": 0.23692840337753296, + "learning_rate": 4.27536802613533e-05, + "loss": 1.8556, + "step": 18243 + }, + { + "epoch": 5.5997544505831796, + "grad_norm": 0.28285983204841614, + "learning_rate": 4.274876221429701e-05, + "loss": 1.6734, + "step": 18244 + }, + { + "epoch": 5.600061387354205, + "grad_norm": 0.20602203905582428, + "learning_rate": 4.27438442389017e-05, + "loss": 1.7113, + "step": 18245 + }, + { + "epoch": 5.60036832412523, + "grad_norm": 0.19719314575195312, + "learning_rate": 4.273892633521598e-05, + "loss": 1.7229, + "step": 18246 + }, + { + "epoch": 5.600675260896256, + "grad_norm": 0.2396705001592636, + "learning_rate": 4.273400850328846e-05, + "loss": 1.6986, + "step": 18247 + }, + { + "epoch": 5.600982197667281, + "grad_norm": 0.1974172443151474, + "learning_rate": 4.2729090743167724e-05, + "loss": 1.7445, + "step": 18248 + }, + { + "epoch": 5.601289134438305, + "grad_norm": 0.2193709760904312, + "learning_rate": 4.272417305490235e-05, + "loss": 1.7657, + "step": 18249 + }, + { + "epoch": 5.601596071209331, + "grad_norm": 0.24138681590557098, + "learning_rate": 4.271925543854098e-05, + "loss": 1.7388, + "step": 18250 + }, + { + "epoch": 5.601903007980356, + "grad_norm": 0.19056223332881927, + "learning_rate": 4.271433789413219e-05, + "loss": 1.6897, + "step": 18251 + }, + { + "epoch": 5.602209944751381, + "grad_norm": 0.20533505082130432, + "learning_rate": 4.270942042172459e-05, + 
"loss": 1.7222, + "step": 18252 + }, + { + "epoch": 5.602516881522407, + "grad_norm": 0.20570224523544312, + "learning_rate": 4.270450302136675e-05, + "loss": 1.8089, + "step": 18253 + }, + { + "epoch": 5.602823818293432, + "grad_norm": 0.2822209298610687, + "learning_rate": 4.269958569310732e-05, + "loss": 1.7523, + "step": 18254 + }, + { + "epoch": 5.6031307550644565, + "grad_norm": 0.2994859218597412, + "learning_rate": 4.269466843699484e-05, + "loss": 1.7538, + "step": 18255 + }, + { + "epoch": 5.603437691835482, + "grad_norm": 0.24851159751415253, + "learning_rate": 4.2689751253077925e-05, + "loss": 1.8162, + "step": 18256 + }, + { + "epoch": 5.603744628606507, + "grad_norm": 0.20387138426303864, + "learning_rate": 4.268483414140517e-05, + "loss": 1.6803, + "step": 18257 + }, + { + "epoch": 5.6040515653775325, + "grad_norm": 0.21620385348796844, + "learning_rate": 4.2679917102025204e-05, + "loss": 1.7236, + "step": 18258 + }, + { + "epoch": 5.604358502148557, + "grad_norm": 0.1925734579563141, + "learning_rate": 4.267500013498655e-05, + "loss": 1.7295, + "step": 18259 + }, + { + "epoch": 5.604665438919582, + "grad_norm": 0.22216086089611053, + "learning_rate": 4.267008324033787e-05, + "loss": 1.6844, + "step": 18260 + }, + { + "epoch": 5.604972375690608, + "grad_norm": 0.20293502509593964, + "learning_rate": 4.26651664181277e-05, + "loss": 1.7065, + "step": 18261 + }, + { + "epoch": 5.605279312461633, + "grad_norm": 0.21269507706165314, + "learning_rate": 4.266024966840466e-05, + "loss": 1.7573, + "step": 18262 + }, + { + "epoch": 5.605586249232658, + "grad_norm": 0.23574227094650269, + "learning_rate": 4.2655332991217334e-05, + "loss": 1.7625, + "step": 18263 + }, + { + "epoch": 5.605893186003684, + "grad_norm": 0.1875103861093521, + "learning_rate": 4.265041638661433e-05, + "loss": 1.7266, + "step": 18264 + }, + { + "epoch": 5.606200122774708, + "grad_norm": 0.20348483324050903, + "learning_rate": 4.264549985464421e-05, + "loss": 1.731, + "step": 18265 + }, 
+ { + "epoch": 5.606507059545733, + "grad_norm": 0.2345927655696869, + "learning_rate": 4.264058339535556e-05, + "loss": 1.7809, + "step": 18266 + }, + { + "epoch": 5.606813996316759, + "grad_norm": 0.21142496168613434, + "learning_rate": 4.2635667008796985e-05, + "loss": 1.7362, + "step": 18267 + }, + { + "epoch": 5.607120933087784, + "grad_norm": 0.19670210778713226, + "learning_rate": 4.263075069501705e-05, + "loss": 1.7029, + "step": 18268 + }, + { + "epoch": 5.607427869858809, + "grad_norm": 0.20985090732574463, + "learning_rate": 4.262583445406439e-05, + "loss": 1.7478, + "step": 18269 + }, + { + "epoch": 5.607734806629834, + "grad_norm": 0.20972272753715515, + "learning_rate": 4.262091828598752e-05, + "loss": 1.7561, + "step": 18270 + }, + { + "epoch": 5.608041743400859, + "grad_norm": 0.20006676018238068, + "learning_rate": 4.261600219083509e-05, + "loss": 1.7584, + "step": 18271 + }, + { + "epoch": 5.6083486801718845, + "grad_norm": 0.21590086817741394, + "learning_rate": 4.2611086168655635e-05, + "loss": 1.7405, + "step": 18272 + }, + { + "epoch": 5.60865561694291, + "grad_norm": 0.19330906867980957, + "learning_rate": 4.260617021949776e-05, + "loss": 1.6797, + "step": 18273 + }, + { + "epoch": 5.608962553713935, + "grad_norm": 0.1955050528049469, + "learning_rate": 4.260125434341004e-05, + "loss": 1.7174, + "step": 18274 + }, + { + "epoch": 5.6092694904849605, + "grad_norm": 0.2117784321308136, + "learning_rate": 4.2596338540441086e-05, + "loss": 1.743, + "step": 18275 + }, + { + "epoch": 5.609576427255985, + "grad_norm": 0.21788950264453888, + "learning_rate": 4.2591422810639425e-05, + "loss": 1.7603, + "step": 18276 + }, + { + "epoch": 5.60988336402701, + "grad_norm": 0.2092670351266861, + "learning_rate": 4.258650715405369e-05, + "loss": 1.7379, + "step": 18277 + }, + { + "epoch": 5.610190300798036, + "grad_norm": 0.1941552758216858, + "learning_rate": 4.2581591570732414e-05, + "loss": 1.7547, + "step": 18278 + }, + { + "epoch": 5.610497237569061, + 
"grad_norm": 0.21306751668453217, + "learning_rate": 4.2576676060724215e-05, + "loss": 1.7284, + "step": 18279 + }, + { + "epoch": 5.610804174340086, + "grad_norm": 0.18618693947792053, + "learning_rate": 4.2571760624077635e-05, + "loss": 1.7268, + "step": 18280 + }, + { + "epoch": 5.611111111111111, + "grad_norm": 0.21530354022979736, + "learning_rate": 4.256684526084129e-05, + "loss": 1.7036, + "step": 18281 + }, + { + "epoch": 5.611418047882136, + "grad_norm": 0.23363792896270752, + "learning_rate": 4.256192997106375e-05, + "loss": 1.7797, + "step": 18282 + }, + { + "epoch": 5.611724984653161, + "grad_norm": 0.1786416620016098, + "learning_rate": 4.2557014754793544e-05, + "loss": 1.7008, + "step": 18283 + }, + { + "epoch": 5.612031921424187, + "grad_norm": 0.2042730301618576, + "learning_rate": 4.25520996120793e-05, + "loss": 1.7667, + "step": 18284 + }, + { + "epoch": 5.612338858195212, + "grad_norm": 0.2275264412164688, + "learning_rate": 4.2547184542969554e-05, + "loss": 1.8277, + "step": 18285 + }, + { + "epoch": 5.612645794966237, + "grad_norm": 0.21252553164958954, + "learning_rate": 4.2542269547512925e-05, + "loss": 1.7272, + "step": 18286 + }, + { + "epoch": 5.612952731737262, + "grad_norm": 0.20384398102760315, + "learning_rate": 4.2537354625757934e-05, + "loss": 1.6707, + "step": 18287 + }, + { + "epoch": 5.613259668508287, + "grad_norm": 0.19805553555488586, + "learning_rate": 4.253243977775321e-05, + "loss": 1.7443, + "step": 18288 + }, + { + "epoch": 5.6135666052793125, + "grad_norm": 0.20447707176208496, + "learning_rate": 4.2527525003547256e-05, + "loss": 1.7392, + "step": 18289 + }, + { + "epoch": 5.613873542050338, + "grad_norm": 0.21025662124156952, + "learning_rate": 4.25226103031887e-05, + "loss": 1.7856, + "step": 18290 + }, + { + "epoch": 5.614180478821363, + "grad_norm": 0.2131013125181198, + "learning_rate": 4.2517695676726085e-05, + "loss": 1.7521, + "step": 18291 + }, + { + "epoch": 5.614487415592388, + "grad_norm": 0.2511558532714844, 
+ "learning_rate": 4.2512781124208e-05, + "loss": 1.6873, + "step": 18292 + }, + { + "epoch": 5.614794352363413, + "grad_norm": 0.19668610394001007, + "learning_rate": 4.2507866645682984e-05, + "loss": 1.6808, + "step": 18293 + }, + { + "epoch": 5.615101289134438, + "grad_norm": 0.22313621640205383, + "learning_rate": 4.2502952241199637e-05, + "loss": 1.7794, + "step": 18294 + }, + { + "epoch": 5.615408225905464, + "grad_norm": 0.2053089439868927, + "learning_rate": 4.249803791080649e-05, + "loss": 1.7405, + "step": 18295 + }, + { + "epoch": 5.615715162676489, + "grad_norm": 0.2052931934595108, + "learning_rate": 4.249312365455215e-05, + "loss": 1.6698, + "step": 18296 + }, + { + "epoch": 5.616022099447514, + "grad_norm": 0.223783478140831, + "learning_rate": 4.248820947248515e-05, + "loss": 1.7696, + "step": 18297 + }, + { + "epoch": 5.616329036218539, + "grad_norm": 0.3424001932144165, + "learning_rate": 4.248329536465407e-05, + "loss": 1.7724, + "step": 18298 + }, + { + "epoch": 5.616635972989564, + "grad_norm": 0.25015103816986084, + "learning_rate": 4.247838133110749e-05, + "loss": 1.7188, + "step": 18299 + }, + { + "epoch": 5.616942909760589, + "grad_norm": 0.239765465259552, + "learning_rate": 4.247346737189392e-05, + "loss": 1.695, + "step": 18300 + }, + { + "epoch": 5.617249846531615, + "grad_norm": 0.42259401082992554, + "learning_rate": 4.246855348706197e-05, + "loss": 1.6882, + "step": 18301 + }, + { + "epoch": 5.617556783302639, + "grad_norm": 0.2985959053039551, + "learning_rate": 4.246363967666018e-05, + "loss": 1.7236, + "step": 18302 + }, + { + "epoch": 5.6178637200736645, + "grad_norm": 0.22437956929206848, + "learning_rate": 4.245872594073714e-05, + "loss": 1.7158, + "step": 18303 + }, + { + "epoch": 5.61817065684469, + "grad_norm": 0.3165835440158844, + "learning_rate": 4.245381227934138e-05, + "loss": 1.7543, + "step": 18304 + }, + { + "epoch": 5.618477593615715, + "grad_norm": 0.2565564513206482, + "learning_rate": 4.244889869252148e-05, + 
"loss": 1.7863, + "step": 18305 + }, + { + "epoch": 5.6187845303867405, + "grad_norm": 0.25741446018218994, + "learning_rate": 4.244398518032597e-05, + "loss": 1.721, + "step": 18306 + }, + { + "epoch": 5.619091467157766, + "grad_norm": 0.26492297649383545, + "learning_rate": 4.2439071742803435e-05, + "loss": 1.7697, + "step": 18307 + }, + { + "epoch": 5.61939840392879, + "grad_norm": 0.2086823433637619, + "learning_rate": 4.243415838000243e-05, + "loss": 1.7072, + "step": 18308 + }, + { + "epoch": 5.619705340699816, + "grad_norm": 0.26784422993659973, + "learning_rate": 4.24292450919715e-05, + "loss": 1.7826, + "step": 18309 + }, + { + "epoch": 5.620012277470841, + "grad_norm": 0.21774251759052277, + "learning_rate": 4.242433187875921e-05, + "loss": 1.7204, + "step": 18310 + }, + { + "epoch": 5.620319214241866, + "grad_norm": 0.29547446966171265, + "learning_rate": 4.241941874041412e-05, + "loss": 1.7303, + "step": 18311 + }, + { + "epoch": 5.620626151012892, + "grad_norm": 0.20278988778591156, + "learning_rate": 4.241450567698476e-05, + "loss": 1.692, + "step": 18312 + }, + { + "epoch": 5.620933087783916, + "grad_norm": 0.2084289938211441, + "learning_rate": 4.240959268851971e-05, + "loss": 1.7069, + "step": 18313 + }, + { + "epoch": 5.621240024554941, + "grad_norm": 0.19901904463768005, + "learning_rate": 4.240467977506752e-05, + "loss": 1.6798, + "step": 18314 + }, + { + "epoch": 5.621546961325967, + "grad_norm": 0.24629411101341248, + "learning_rate": 4.2399766936676735e-05, + "loss": 1.775, + "step": 18315 + }, + { + "epoch": 5.621853898096992, + "grad_norm": 0.2532403767108917, + "learning_rate": 4.239485417339591e-05, + "loss": 1.7669, + "step": 18316 + }, + { + "epoch": 5.622160834868017, + "grad_norm": 0.22495722770690918, + "learning_rate": 4.2389941485273576e-05, + "loss": 1.7772, + "step": 18317 + }, + { + "epoch": 5.622467771639043, + "grad_norm": 0.2789733111858368, + "learning_rate": 4.2385028872358316e-05, + "loss": 1.751, + "step": 18318 + }, + { 
+ "epoch": 5.622774708410067, + "grad_norm": 0.2266954481601715, + "learning_rate": 4.238011633469866e-05, + "loss": 1.7213, + "step": 18319 + }, + { + "epoch": 5.6230816451810925, + "grad_norm": 0.2163502722978592, + "learning_rate": 4.237520387234316e-05, + "loss": 1.7781, + "step": 18320 + }, + { + "epoch": 5.623388581952118, + "grad_norm": 0.25249144434928894, + "learning_rate": 4.237029148534036e-05, + "loss": 1.7293, + "step": 18321 + }, + { + "epoch": 5.623695518723143, + "grad_norm": 0.2320011854171753, + "learning_rate": 4.2365379173738826e-05, + "loss": 1.7909, + "step": 18322 + }, + { + "epoch": 5.6240024554941686, + "grad_norm": 0.22074681520462036, + "learning_rate": 4.2360466937587074e-05, + "loss": 1.743, + "step": 18323 + }, + { + "epoch": 5.624309392265193, + "grad_norm": 0.20864775776863098, + "learning_rate": 4.235555477693368e-05, + "loss": 1.726, + "step": 18324 + }, + { + "epoch": 5.624616329036218, + "grad_norm": 0.24547792971134186, + "learning_rate": 4.235064269182716e-05, + "loss": 1.7646, + "step": 18325 + }, + { + "epoch": 5.624923265807244, + "grad_norm": 0.29965806007385254, + "learning_rate": 4.234573068231607e-05, + "loss": 1.7789, + "step": 18326 + }, + { + "epoch": 5.625230202578269, + "grad_norm": 0.20844583213329315, + "learning_rate": 4.234081874844896e-05, + "loss": 1.7007, + "step": 18327 + }, + { + "epoch": 5.625537139349294, + "grad_norm": 0.2455398142337799, + "learning_rate": 4.2335906890274385e-05, + "loss": 1.7094, + "step": 18328 + }, + { + "epoch": 5.62584407612032, + "grad_norm": 0.17839518189430237, + "learning_rate": 4.233099510784085e-05, + "loss": 1.6849, + "step": 18329 + }, + { + "epoch": 5.626151012891344, + "grad_norm": 0.20219004154205322, + "learning_rate": 4.232608340119693e-05, + "loss": 1.716, + "step": 18330 + }, + { + "epoch": 5.6264579496623695, + "grad_norm": 0.23570619523525238, + "learning_rate": 4.232117177039114e-05, + "loss": 1.7622, + "step": 18331 + }, + { + "epoch": 5.626764886433395, + 
"grad_norm": 0.23534397780895233, + "learning_rate": 4.231626021547204e-05, + "loss": 1.7758, + "step": 18332 + }, + { + "epoch": 5.62707182320442, + "grad_norm": 0.2177352011203766, + "learning_rate": 4.231134873648817e-05, + "loss": 1.7102, + "step": 18333 + }, + { + "epoch": 5.627378759975445, + "grad_norm": 0.22886058688163757, + "learning_rate": 4.230643733348803e-05, + "loss": 1.7766, + "step": 18334 + }, + { + "epoch": 5.62768569674647, + "grad_norm": 0.20723696053028107, + "learning_rate": 4.2301526006520215e-05, + "loss": 1.7287, + "step": 18335 + }, + { + "epoch": 5.627992633517495, + "grad_norm": 0.18612104654312134, + "learning_rate": 4.229661475563321e-05, + "loss": 1.7255, + "step": 18336 + }, + { + "epoch": 5.628299570288521, + "grad_norm": 0.26456236839294434, + "learning_rate": 4.229170358087558e-05, + "loss": 1.7388, + "step": 18337 + }, + { + "epoch": 5.628606507059546, + "grad_norm": 0.25253555178642273, + "learning_rate": 4.2286792482295845e-05, + "loss": 1.7031, + "step": 18338 + }, + { + "epoch": 5.628913443830571, + "grad_norm": 0.23093348741531372, + "learning_rate": 4.228188145994257e-05, + "loss": 1.8032, + "step": 18339 + }, + { + "epoch": 5.629220380601596, + "grad_norm": 0.24142487347126007, + "learning_rate": 4.227697051386424e-05, + "loss": 1.6621, + "step": 18340 + }, + { + "epoch": 5.629527317372621, + "grad_norm": 0.2883392572402954, + "learning_rate": 4.227205964410944e-05, + "loss": 1.7125, + "step": 18341 + }, + { + "epoch": 5.629834254143646, + "grad_norm": 0.22670713067054749, + "learning_rate": 4.226714885072665e-05, + "loss": 1.7659, + "step": 18342 + }, + { + "epoch": 5.630141190914672, + "grad_norm": 0.2795337438583374, + "learning_rate": 4.226223813376444e-05, + "loss": 1.7559, + "step": 18343 + }, + { + "epoch": 5.630448127685697, + "grad_norm": 0.2513083219528198, + "learning_rate": 4.225732749327132e-05, + "loss": 1.6969, + "step": 18344 + }, + { + "epoch": 5.6307550644567215, + "grad_norm": 0.24588467180728912, + 
"learning_rate": 4.225241692929585e-05, + "loss": 1.7724, + "step": 18345 + }, + { + "epoch": 5.631062001227747, + "grad_norm": 0.41726353764533997, + "learning_rate": 4.224750644188651e-05, + "loss": 1.7308, + "step": 18346 + }, + { + "epoch": 5.631368937998772, + "grad_norm": 0.2512385845184326, + "learning_rate": 4.2242596031091886e-05, + "loss": 1.7068, + "step": 18347 + }, + { + "epoch": 5.6316758747697975, + "grad_norm": 0.3077464997768402, + "learning_rate": 4.223768569696044e-05, + "loss": 1.7383, + "step": 18348 + }, + { + "epoch": 5.631982811540823, + "grad_norm": 0.3460720479488373, + "learning_rate": 4.2232775439540756e-05, + "loss": 1.7317, + "step": 18349 + }, + { + "epoch": 5.632289748311848, + "grad_norm": 0.24827539920806885, + "learning_rate": 4.222786525888134e-05, + "loss": 1.6871, + "step": 18350 + }, + { + "epoch": 5.632596685082873, + "grad_norm": 0.24851584434509277, + "learning_rate": 4.22229551550307e-05, + "loss": 1.7058, + "step": 18351 + }, + { + "epoch": 5.632903621853898, + "grad_norm": 0.31132519245147705, + "learning_rate": 4.2218045128037396e-05, + "loss": 1.7523, + "step": 18352 + }, + { + "epoch": 5.633210558624923, + "grad_norm": 0.3104027807712555, + "learning_rate": 4.2213135177949906e-05, + "loss": 1.7669, + "step": 18353 + }, + { + "epoch": 5.633517495395949, + "grad_norm": 0.31351104378700256, + "learning_rate": 4.2208225304816795e-05, + "loss": 1.7031, + "step": 18354 + }, + { + "epoch": 5.633824432166974, + "grad_norm": 0.3217851221561432, + "learning_rate": 4.2203315508686555e-05, + "loss": 1.7694, + "step": 18355 + }, + { + "epoch": 5.634131368937998, + "grad_norm": 0.22287796437740326, + "learning_rate": 4.2198405789607745e-05, + "loss": 1.7742, + "step": 18356 + }, + { + "epoch": 5.634438305709024, + "grad_norm": 0.20288340747356415, + "learning_rate": 4.219349614762883e-05, + "loss": 1.7113, + "step": 18357 + }, + { + "epoch": 5.634745242480049, + "grad_norm": 0.19823449850082397, + "learning_rate": 
4.218858658279839e-05, + "loss": 1.7433, + "step": 18358 + }, + { + "epoch": 5.635052179251074, + "grad_norm": 0.2756347358226776, + "learning_rate": 4.2183677095164895e-05, + "loss": 1.8278, + "step": 18359 + }, + { + "epoch": 5.6353591160221, + "grad_norm": 0.2303706556558609, + "learning_rate": 4.2178767684776895e-05, + "loss": 1.6943, + "step": 18360 + }, + { + "epoch": 5.635666052793125, + "grad_norm": 0.25089216232299805, + "learning_rate": 4.217385835168288e-05, + "loss": 1.6562, + "step": 18361 + }, + { + "epoch": 5.6359729895641495, + "grad_norm": 0.3013486862182617, + "learning_rate": 4.216894909593141e-05, + "loss": 1.7323, + "step": 18362 + }, + { + "epoch": 5.636279926335175, + "grad_norm": 0.19471928477287292, + "learning_rate": 4.2164039917570956e-05, + "loss": 1.7301, + "step": 18363 + }, + { + "epoch": 5.6365868631062, + "grad_norm": 0.3257733881473541, + "learning_rate": 4.2159130816650075e-05, + "loss": 1.7522, + "step": 18364 + }, + { + "epoch": 5.6368937998772255, + "grad_norm": 0.3065868020057678, + "learning_rate": 4.215422179321723e-05, + "loss": 1.7077, + "step": 18365 + }, + { + "epoch": 5.637200736648251, + "grad_norm": 0.20643819868564606, + "learning_rate": 4.214931284732098e-05, + "loss": 1.8033, + "step": 18366 + }, + { + "epoch": 5.637507673419275, + "grad_norm": 0.23551981151103973, + "learning_rate": 4.2144403979009826e-05, + "loss": 1.7391, + "step": 18367 + }, + { + "epoch": 5.637814610190301, + "grad_norm": 0.20602314174175262, + "learning_rate": 4.2139495188332265e-05, + "loss": 1.7593, + "step": 18368 + }, + { + "epoch": 5.638121546961326, + "grad_norm": 0.27911239862442017, + "learning_rate": 4.2134586475336834e-05, + "loss": 1.7212, + "step": 18369 + }, + { + "epoch": 5.638428483732351, + "grad_norm": 0.2700496017932892, + "learning_rate": 4.212967784007201e-05, + "loss": 1.7755, + "step": 18370 + }, + { + "epoch": 5.638735420503377, + "grad_norm": 0.24988985061645508, + "learning_rate": 4.2124769282586334e-05, + "loss": 
1.7364, + "step": 18371 + }, + { + "epoch": 5.639042357274402, + "grad_norm": 0.20491284132003784, + "learning_rate": 4.211986080292829e-05, + "loss": 1.7477, + "step": 18372 + }, + { + "epoch": 5.639349294045426, + "grad_norm": 0.24953459203243256, + "learning_rate": 4.211495240114643e-05, + "loss": 1.7712, + "step": 18373 + }, + { + "epoch": 5.639656230816452, + "grad_norm": 0.2028491199016571, + "learning_rate": 4.2110044077289204e-05, + "loss": 1.701, + "step": 18374 + }, + { + "epoch": 5.639963167587477, + "grad_norm": 0.22320568561553955, + "learning_rate": 4.210513583140517e-05, + "loss": 1.7818, + "step": 18375 + }, + { + "epoch": 5.640270104358502, + "grad_norm": 0.22680947184562683, + "learning_rate": 4.210022766354278e-05, + "loss": 1.7631, + "step": 18376 + }, + { + "epoch": 5.640577041129527, + "grad_norm": 0.20724014937877655, + "learning_rate": 4.2095319573750596e-05, + "loss": 1.7757, + "step": 18377 + }, + { + "epoch": 5.640883977900552, + "grad_norm": 0.21785953640937805, + "learning_rate": 4.209041156207708e-05, + "loss": 1.7161, + "step": 18378 + }, + { + "epoch": 5.6411909146715775, + "grad_norm": 0.21751803159713745, + "learning_rate": 4.208550362857078e-05, + "loss": 1.7449, + "step": 18379 + }, + { + "epoch": 5.641497851442603, + "grad_norm": 0.1765962839126587, + "learning_rate": 4.208059577328014e-05, + "loss": 1.7191, + "step": 18380 + }, + { + "epoch": 5.641804788213628, + "grad_norm": 0.22720913589000702, + "learning_rate": 4.2075687996253724e-05, + "loss": 1.7037, + "step": 18381 + }, + { + "epoch": 5.6421117249846535, + "grad_norm": 0.23589655756950378, + "learning_rate": 4.2070780297539976e-05, + "loss": 1.8147, + "step": 18382 + }, + { + "epoch": 5.642418661755678, + "grad_norm": 0.21187056601047516, + "learning_rate": 4.2065872677187435e-05, + "loss": 1.7655, + "step": 18383 + }, + { + "epoch": 5.642725598526703, + "grad_norm": 0.24153946340084076, + "learning_rate": 4.2060965135244606e-05, + "loss": 1.7841, + "step": 18384 + }, + 
{ + "epoch": 5.643032535297729, + "grad_norm": 0.2059229612350464, + "learning_rate": 4.205605767175995e-05, + "loss": 1.6718, + "step": 18385 + }, + { + "epoch": 5.643339472068754, + "grad_norm": 0.20235973596572876, + "learning_rate": 4.205115028678201e-05, + "loss": 1.6931, + "step": 18386 + }, + { + "epoch": 5.643646408839779, + "grad_norm": 0.25149911642074585, + "learning_rate": 4.204624298035924e-05, + "loss": 1.7465, + "step": 18387 + }, + { + "epoch": 5.643953345610804, + "grad_norm": 0.2050812691450119, + "learning_rate": 4.204133575254017e-05, + "loss": 1.7147, + "step": 18388 + }, + { + "epoch": 5.644260282381829, + "grad_norm": 0.20906420052051544, + "learning_rate": 4.2036428603373274e-05, + "loss": 1.6762, + "step": 18389 + }, + { + "epoch": 5.644567219152854, + "grad_norm": 0.20150595903396606, + "learning_rate": 4.2031521532907075e-05, + "loss": 1.678, + "step": 18390 + }, + { + "epoch": 5.64487415592388, + "grad_norm": 0.2141568511724472, + "learning_rate": 4.202661454119004e-05, + "loss": 1.7274, + "step": 18391 + }, + { + "epoch": 5.645181092694905, + "grad_norm": 0.2641741931438446, + "learning_rate": 4.202170762827069e-05, + "loss": 1.7975, + "step": 18392 + }, + { + "epoch": 5.64548802946593, + "grad_norm": 0.22928468883037567, + "learning_rate": 4.201680079419747e-05, + "loss": 1.7687, + "step": 18393 + }, + { + "epoch": 5.645794966236955, + "grad_norm": 0.22713731229305267, + "learning_rate": 4.2011894039018925e-05, + "loss": 1.7475, + "step": 18394 + }, + { + "epoch": 5.64610190300798, + "grad_norm": 0.25602981448173523, + "learning_rate": 4.200698736278351e-05, + "loss": 1.7356, + "step": 18395 + }, + { + "epoch": 5.6464088397790055, + "grad_norm": 0.2619759738445282, + "learning_rate": 4.200208076553975e-05, + "loss": 1.7334, + "step": 18396 + }, + { + "epoch": 5.646715776550031, + "grad_norm": 0.24756783246994019, + "learning_rate": 4.19971742473361e-05, + "loss": 1.7253, + "step": 18397 + }, + { + "epoch": 5.647022713321056, + 
"grad_norm": 0.2068249136209488, + "learning_rate": 4.199226780822109e-05, + "loss": 1.7246, + "step": 18398 + }, + { + "epoch": 5.647329650092081, + "grad_norm": 0.23219087719917297, + "learning_rate": 4.1987361448243165e-05, + "loss": 1.7388, + "step": 18399 + }, + { + "epoch": 5.647636586863106, + "grad_norm": 0.2051403522491455, + "learning_rate": 4.198245516745082e-05, + "loss": 1.7775, + "step": 18400 + }, + { + "epoch": 5.647943523634131, + "grad_norm": 0.26408639550209045, + "learning_rate": 4.1977548965892575e-05, + "loss": 1.8069, + "step": 18401 + }, + { + "epoch": 5.648250460405157, + "grad_norm": 0.2104891538619995, + "learning_rate": 4.197264284361687e-05, + "loss": 1.7335, + "step": 18402 + }, + { + "epoch": 5.648557397176182, + "grad_norm": 0.23963849246501923, + "learning_rate": 4.196773680067224e-05, + "loss": 1.7254, + "step": 18403 + }, + { + "epoch": 5.648864333947207, + "grad_norm": 0.2770128846168518, + "learning_rate": 4.1962830837107117e-05, + "loss": 1.7848, + "step": 18404 + }, + { + "epoch": 5.649171270718232, + "grad_norm": 0.23342710733413696, + "learning_rate": 4.195792495297002e-05, + "loss": 1.7818, + "step": 18405 + }, + { + "epoch": 5.649478207489257, + "grad_norm": 0.23835061490535736, + "learning_rate": 4.195301914830941e-05, + "loss": 1.7453, + "step": 18406 + }, + { + "epoch": 5.649785144260282, + "grad_norm": 0.21896767616271973, + "learning_rate": 4.194811342317381e-05, + "loss": 1.7205, + "step": 18407 + }, + { + "epoch": 5.650092081031308, + "grad_norm": 0.20222818851470947, + "learning_rate": 4.1943207777611646e-05, + "loss": 1.6833, + "step": 18408 + }, + { + "epoch": 5.650399017802332, + "grad_norm": 0.2182089239358902, + "learning_rate": 4.193830221167146e-05, + "loss": 1.7296, + "step": 18409 + }, + { + "epoch": 5.650705954573358, + "grad_norm": 0.19981688261032104, + "learning_rate": 4.1933396725401655e-05, + "loss": 1.7327, + "step": 18410 + }, + { + "epoch": 5.651012891344383, + "grad_norm": 0.23925067484378815, + 
"learning_rate": 4.192849131885077e-05, + "loss": 1.7545, + "step": 18411 + }, + { + "epoch": 5.651319828115408, + "grad_norm": 0.21967993676662445, + "learning_rate": 4.192358599206725e-05, + "loss": 1.6973, + "step": 18412 + }, + { + "epoch": 5.651626764886434, + "grad_norm": 0.2273840606212616, + "learning_rate": 4.1918680745099614e-05, + "loss": 1.8229, + "step": 18413 + }, + { + "epoch": 5.651933701657459, + "grad_norm": 0.26950231194496155, + "learning_rate": 4.1913775577996286e-05, + "loss": 1.7666, + "step": 18414 + }, + { + "epoch": 5.652240638428484, + "grad_norm": 0.26608848571777344, + "learning_rate": 4.190887049080579e-05, + "loss": 1.8279, + "step": 18415 + }, + { + "epoch": 5.652547575199509, + "grad_norm": 0.20856785774230957, + "learning_rate": 4.190396548357658e-05, + "loss": 1.7224, + "step": 18416 + }, + { + "epoch": 5.652854511970534, + "grad_norm": 0.2894255816936493, + "learning_rate": 4.18990605563571e-05, + "loss": 1.7308, + "step": 18417 + }, + { + "epoch": 5.653161448741559, + "grad_norm": 0.2047591209411621, + "learning_rate": 4.189415570919588e-05, + "loss": 1.758, + "step": 18418 + }, + { + "epoch": 5.653468385512585, + "grad_norm": 0.37161269783973694, + "learning_rate": 4.1889250942141346e-05, + "loss": 1.7926, + "step": 18419 + }, + { + "epoch": 5.653775322283609, + "grad_norm": 0.37338340282440186, + "learning_rate": 4.1884346255242e-05, + "loss": 1.7491, + "step": 18420 + }, + { + "epoch": 5.6540822590546345, + "grad_norm": 0.24279838800430298, + "learning_rate": 4.187944164854629e-05, + "loss": 1.7103, + "step": 18421 + }, + { + "epoch": 5.65438919582566, + "grad_norm": 0.219639852643013, + "learning_rate": 4.18745371221027e-05, + "loss": 1.7824, + "step": 18422 + }, + { + "epoch": 5.654696132596685, + "grad_norm": 0.22248409688472748, + "learning_rate": 4.186963267595969e-05, + "loss": 1.8098, + "step": 18423 + }, + { + "epoch": 5.6550030693677105, + "grad_norm": 0.2115657478570938, + "learning_rate": 4.1864728310165755e-05, + 
"loss": 1.72, + "step": 18424 + }, + { + "epoch": 5.655310006138736, + "grad_norm": 0.19723005592823029, + "learning_rate": 4.1859824024769325e-05, + "loss": 1.6818, + "step": 18425 + }, + { + "epoch": 5.65561694290976, + "grad_norm": 0.1828317642211914, + "learning_rate": 4.185491981981891e-05, + "loss": 1.7243, + "step": 18426 + }, + { + "epoch": 5.655923879680786, + "grad_norm": 0.271781861782074, + "learning_rate": 4.185001569536292e-05, + "loss": 1.7688, + "step": 18427 + }, + { + "epoch": 5.656230816451811, + "grad_norm": 0.3140811324119568, + "learning_rate": 4.184511165144986e-05, + "loss": 1.7319, + "step": 18428 + }, + { + "epoch": 5.656537753222836, + "grad_norm": 0.20013047754764557, + "learning_rate": 4.184020768812818e-05, + "loss": 1.7104, + "step": 18429 + }, + { + "epoch": 5.656844689993862, + "grad_norm": 0.2615044414997101, + "learning_rate": 4.183530380544638e-05, + "loss": 1.7314, + "step": 18430 + }, + { + "epoch": 5.657151626764886, + "grad_norm": 0.2645856440067291, + "learning_rate": 4.183040000345287e-05, + "loss": 1.7431, + "step": 18431 + }, + { + "epoch": 5.657458563535911, + "grad_norm": 0.1916145384311676, + "learning_rate": 4.182549628219615e-05, + "loss": 1.7013, + "step": 18432 + }, + { + "epoch": 5.657765500306937, + "grad_norm": 0.2647114396095276, + "learning_rate": 4.182059264172466e-05, + "loss": 1.7278, + "step": 18433 + }, + { + "epoch": 5.658072437077962, + "grad_norm": 0.20201756060123444, + "learning_rate": 4.1815689082086854e-05, + "loss": 1.7065, + "step": 18434 + }, + { + "epoch": 5.658379373848987, + "grad_norm": 0.23892022669315338, + "learning_rate": 4.181078560333123e-05, + "loss": 1.7365, + "step": 18435 + }, + { + "epoch": 5.658686310620013, + "grad_norm": 0.3125975728034973, + "learning_rate": 4.18058822055062e-05, + "loss": 1.7152, + "step": 18436 + }, + { + "epoch": 5.658993247391037, + "grad_norm": 0.18924804031848907, + "learning_rate": 4.180097888866027e-05, + "loss": 1.7763, + "step": 18437 + }, + { + 
"epoch": 5.6593001841620625, + "grad_norm": 0.28476929664611816, + "learning_rate": 4.1796075652841845e-05, + "loss": 1.7517, + "step": 18438 + }, + { + "epoch": 5.659607120933088, + "grad_norm": 0.30616337060928345, + "learning_rate": 4.1791172498099416e-05, + "loss": 1.7446, + "step": 18439 + }, + { + "epoch": 5.659914057704113, + "grad_norm": 0.3219330608844757, + "learning_rate": 4.1786269424481426e-05, + "loss": 1.8374, + "step": 18440 + }, + { + "epoch": 5.6602209944751385, + "grad_norm": 0.34074151515960693, + "learning_rate": 4.1781366432036364e-05, + "loss": 1.7915, + "step": 18441 + }, + { + "epoch": 5.660527931246163, + "grad_norm": 0.2321610003709793, + "learning_rate": 4.177646352081263e-05, + "loss": 1.7361, + "step": 18442 + }, + { + "epoch": 5.660834868017188, + "grad_norm": 0.34283575415611267, + "learning_rate": 4.1771560690858716e-05, + "loss": 1.6859, + "step": 18443 + }, + { + "epoch": 5.661141804788214, + "grad_norm": 0.32274290919303894, + "learning_rate": 4.1766657942223055e-05, + "loss": 1.7376, + "step": 18444 + }, + { + "epoch": 5.661448741559239, + "grad_norm": 0.23960906267166138, + "learning_rate": 4.1761755274954105e-05, + "loss": 1.7198, + "step": 18445 + }, + { + "epoch": 5.661755678330264, + "grad_norm": 0.2622305154800415, + "learning_rate": 4.175685268910031e-05, + "loss": 1.6997, + "step": 18446 + }, + { + "epoch": 5.66206261510129, + "grad_norm": 0.19836951792240143, + "learning_rate": 4.1751950184710157e-05, + "loss": 1.6612, + "step": 18447 + }, + { + "epoch": 5.662369551872314, + "grad_norm": 0.29541507363319397, + "learning_rate": 4.174704776183204e-05, + "loss": 1.7606, + "step": 18448 + }, + { + "epoch": 5.662676488643339, + "grad_norm": 0.21632203459739685, + "learning_rate": 4.174214542051445e-05, + "loss": 1.7108, + "step": 18449 + }, + { + "epoch": 5.662983425414365, + "grad_norm": 0.2851164638996124, + "learning_rate": 4.173724316080582e-05, + "loss": 1.747, + "step": 18450 + }, + { + "epoch": 5.66329036218539, + 
"grad_norm": 0.30293309688568115, + "learning_rate": 4.173234098275458e-05, + "loss": 1.7549, + "step": 18451 + }, + { + "epoch": 5.6635972989564145, + "grad_norm": 0.2131963074207306, + "learning_rate": 4.172743888640921e-05, + "loss": 1.7804, + "step": 18452 + }, + { + "epoch": 5.66390423572744, + "grad_norm": 0.234910249710083, + "learning_rate": 4.172253687181812e-05, + "loss": 1.7149, + "step": 18453 + }, + { + "epoch": 5.664211172498465, + "grad_norm": 0.21238654851913452, + "learning_rate": 4.171763493902979e-05, + "loss": 1.7272, + "step": 18454 + }, + { + "epoch": 5.6645181092694905, + "grad_norm": 0.20571236312389374, + "learning_rate": 4.171273308809263e-05, + "loss": 1.713, + "step": 18455 + }, + { + "epoch": 5.664825046040516, + "grad_norm": 0.24867361783981323, + "learning_rate": 4.1707831319055104e-05, + "loss": 1.682, + "step": 18456 + }, + { + "epoch": 5.665131982811541, + "grad_norm": 0.20556440949440002, + "learning_rate": 4.170292963196564e-05, + "loss": 1.7126, + "step": 18457 + }, + { + "epoch": 5.665438919582566, + "grad_norm": 0.26431065797805786, + "learning_rate": 4.169802802687271e-05, + "loss": 1.8142, + "step": 18458 + }, + { + "epoch": 5.665745856353591, + "grad_norm": 0.26041486859321594, + "learning_rate": 4.169312650382471e-05, + "loss": 1.7206, + "step": 18459 + }, + { + "epoch": 5.666052793124616, + "grad_norm": 0.2190525084733963, + "learning_rate": 4.1688225062870126e-05, + "loss": 1.787, + "step": 18460 + }, + { + "epoch": 5.666359729895642, + "grad_norm": 0.24726425111293793, + "learning_rate": 4.1683323704057354e-05, + "loss": 1.7677, + "step": 18461 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.22206442058086395, + "learning_rate": 4.167842242743486e-05, + "loss": 1.73, + "step": 18462 + }, + { + "epoch": 5.666973603437691, + "grad_norm": 0.22501195967197418, + "learning_rate": 4.167352123305108e-05, + "loss": 1.7213, + "step": 18463 + }, + { + "epoch": 5.667280540208717, + "grad_norm": 0.26164770126342773, + 
"learning_rate": 4.166862012095443e-05, + "loss": 1.7839, + "step": 18464 + }, + { + "epoch": 5.667587476979742, + "grad_norm": 0.19480809569358826, + "learning_rate": 4.166371909119336e-05, + "loss": 1.7562, + "step": 18465 + }, + { + "epoch": 5.667894413750767, + "grad_norm": 0.26677292585372925, + "learning_rate": 4.165881814381632e-05, + "loss": 1.776, + "step": 18466 + }, + { + "epoch": 5.668201350521793, + "grad_norm": 0.22019581496715546, + "learning_rate": 4.165391727887172e-05, + "loss": 1.7575, + "step": 18467 + }, + { + "epoch": 5.668508287292818, + "grad_norm": 0.23851899802684784, + "learning_rate": 4.1649016496407986e-05, + "loss": 1.7346, + "step": 18468 + }, + { + "epoch": 5.6688152240638425, + "grad_norm": 0.3118130564689636, + "learning_rate": 4.1644115796473596e-05, + "loss": 1.7808, + "step": 18469 + }, + { + "epoch": 5.669122160834868, + "grad_norm": 0.22783879935741425, + "learning_rate": 4.163921517911692e-05, + "loss": 1.831, + "step": 18470 + }, + { + "epoch": 5.669429097605893, + "grad_norm": 0.2203773707151413, + "learning_rate": 4.163431464438645e-05, + "loss": 1.7034, + "step": 18471 + }, + { + "epoch": 5.6697360343769185, + "grad_norm": 0.21838103234767914, + "learning_rate": 4.162941419233056e-05, + "loss": 1.7553, + "step": 18472 + }, + { + "epoch": 5.670042971147944, + "grad_norm": 0.18453563749790192, + "learning_rate": 4.162451382299771e-05, + "loss": 1.7139, + "step": 18473 + }, + { + "epoch": 5.670349907918968, + "grad_norm": 0.25308313965797424, + "learning_rate": 4.161961353643633e-05, + "loss": 1.7291, + "step": 18474 + }, + { + "epoch": 5.670656844689994, + "grad_norm": 0.2528827488422394, + "learning_rate": 4.1614713332694845e-05, + "loss": 1.781, + "step": 18475 + }, + { + "epoch": 5.670963781461019, + "grad_norm": 0.24774135649204254, + "learning_rate": 4.160981321182166e-05, + "loss": 1.7808, + "step": 18476 + }, + { + "epoch": 5.671270718232044, + "grad_norm": 0.25225830078125, + "learning_rate": 4.160491317386524e-05, 
+ "loss": 1.739, + "step": 18477 + }, + { + "epoch": 5.67157765500307, + "grad_norm": 0.2095808982849121, + "learning_rate": 4.160001321887397e-05, + "loss": 1.7242, + "step": 18478 + }, + { + "epoch": 5.671884591774095, + "grad_norm": 0.23906216025352478, + "learning_rate": 4.159511334689631e-05, + "loss": 1.7071, + "step": 18479 + }, + { + "epoch": 5.672191528545119, + "grad_norm": 0.21851155161857605, + "learning_rate": 4.159021355798065e-05, + "loss": 1.7171, + "step": 18480 + }, + { + "epoch": 5.672498465316145, + "grad_norm": 0.2005140632390976, + "learning_rate": 4.158531385217544e-05, + "loss": 1.7483, + "step": 18481 + }, + { + "epoch": 5.67280540208717, + "grad_norm": 0.2230832278728485, + "learning_rate": 4.1580414229529074e-05, + "loss": 1.7386, + "step": 18482 + }, + { + "epoch": 5.673112338858195, + "grad_norm": 0.22402967512607574, + "learning_rate": 4.1575514690090014e-05, + "loss": 1.7989, + "step": 18483 + }, + { + "epoch": 5.67341927562922, + "grad_norm": 0.20350080728530884, + "learning_rate": 4.157061523390665e-05, + "loss": 1.6856, + "step": 18484 + }, + { + "epoch": 5.673726212400245, + "grad_norm": 0.2039422243833542, + "learning_rate": 4.15657158610274e-05, + "loss": 1.7262, + "step": 18485 + }, + { + "epoch": 5.6740331491712706, + "grad_norm": 0.20411522686481476, + "learning_rate": 4.156081657150069e-05, + "loss": 1.738, + "step": 18486 + }, + { + "epoch": 5.674340085942296, + "grad_norm": 0.2693086862564087, + "learning_rate": 4.155591736537493e-05, + "loss": 1.731, + "step": 18487 + }, + { + "epoch": 5.674647022713321, + "grad_norm": 0.20745019614696503, + "learning_rate": 4.1551018242698567e-05, + "loss": 1.7138, + "step": 18488 + }, + { + "epoch": 5.6749539594843466, + "grad_norm": 0.22033964097499847, + "learning_rate": 4.1546119203519964e-05, + "loss": 1.8144, + "step": 18489 + }, + { + "epoch": 5.675260896255372, + "grad_norm": 0.22859029471874237, + "learning_rate": 4.154122024788759e-05, + "loss": 1.6724, + "step": 18490 + }, + { 
+ "epoch": 5.675567833026396, + "grad_norm": 0.2226465791463852, + "learning_rate": 4.153632137584982e-05, + "loss": 1.731, + "step": 18491 + }, + { + "epoch": 5.675874769797422, + "grad_norm": 0.19657716155052185, + "learning_rate": 4.1531422587455086e-05, + "loss": 1.6937, + "step": 18492 + }, + { + "epoch": 5.676181706568447, + "grad_norm": 0.23167578876018524, + "learning_rate": 4.152652388275179e-05, + "loss": 1.7444, + "step": 18493 + }, + { + "epoch": 5.676488643339472, + "grad_norm": 0.24468563497066498, + "learning_rate": 4.1521625261788374e-05, + "loss": 1.7173, + "step": 18494 + }, + { + "epoch": 5.676795580110497, + "grad_norm": 0.27125802636146545, + "learning_rate": 4.1516726724613206e-05, + "loss": 1.7424, + "step": 18495 + }, + { + "epoch": 5.677102516881522, + "grad_norm": 0.23816901445388794, + "learning_rate": 4.151182827127473e-05, + "loss": 1.6911, + "step": 18496 + }, + { + "epoch": 5.6774094536525475, + "grad_norm": 0.26058733463287354, + "learning_rate": 4.150692990182133e-05, + "loss": 1.7142, + "step": 18497 + }, + { + "epoch": 5.677716390423573, + "grad_norm": 0.20207929611206055, + "learning_rate": 4.150203161630143e-05, + "loss": 1.7506, + "step": 18498 + }, + { + "epoch": 5.678023327194598, + "grad_norm": 0.259857714176178, + "learning_rate": 4.1497133414763435e-05, + "loss": 1.7181, + "step": 18499 + }, + { + "epoch": 5.6783302639656235, + "grad_norm": 0.2607496380805969, + "learning_rate": 4.149223529725577e-05, + "loss": 1.7829, + "step": 18500 + }, + { + "epoch": 5.678637200736648, + "grad_norm": 0.23265719413757324, + "learning_rate": 4.148733726382681e-05, + "loss": 1.7028, + "step": 18501 + }, + { + "epoch": 5.678944137507673, + "grad_norm": 0.26610276103019714, + "learning_rate": 4.1482439314524964e-05, + "loss": 1.8604, + "step": 18502 + }, + { + "epoch": 5.679251074278699, + "grad_norm": 0.24022582173347473, + "learning_rate": 4.147754144939865e-05, + "loss": 1.7142, + "step": 18503 + }, + { + "epoch": 5.679558011049724, + 
"grad_norm": 0.2849755585193634, + "learning_rate": 4.1472643668496255e-05, + "loss": 1.6956, + "step": 18504 + }, + { + "epoch": 5.679864947820749, + "grad_norm": 0.24330341815948486, + "learning_rate": 4.1467745971866216e-05, + "loss": 1.7617, + "step": 18505 + }, + { + "epoch": 5.680171884591774, + "grad_norm": 0.21072770655155182, + "learning_rate": 4.146284835955689e-05, + "loss": 1.6999, + "step": 18506 + }, + { + "epoch": 5.680478821362799, + "grad_norm": 0.1971336454153061, + "learning_rate": 4.145795083161673e-05, + "loss": 1.6756, + "step": 18507 + }, + { + "epoch": 5.680785758133824, + "grad_norm": 0.18576614558696747, + "learning_rate": 4.1453053388094073e-05, + "loss": 1.6885, + "step": 18508 + }, + { + "epoch": 5.68109269490485, + "grad_norm": 0.21335965394973755, + "learning_rate": 4.144815602903737e-05, + "loss": 1.7278, + "step": 18509 + }, + { + "epoch": 5.681399631675875, + "grad_norm": 0.21756233274936676, + "learning_rate": 4.1443258754494986e-05, + "loss": 1.7549, + "step": 18510 + }, + { + "epoch": 5.6817065684469, + "grad_norm": 0.2214142084121704, + "learning_rate": 4.143836156451536e-05, + "loss": 1.6654, + "step": 18511 + }, + { + "epoch": 5.682013505217925, + "grad_norm": 0.2230863869190216, + "learning_rate": 4.143346445914684e-05, + "loss": 1.7286, + "step": 18512 + }, + { + "epoch": 5.68232044198895, + "grad_norm": 0.2283746749162674, + "learning_rate": 4.142856743843787e-05, + "loss": 1.7652, + "step": 18513 + }, + { + "epoch": 5.6826273787599755, + "grad_norm": 0.20059749484062195, + "learning_rate": 4.142367050243679e-05, + "loss": 1.6854, + "step": 18514 + }, + { + "epoch": 5.682934315531001, + "grad_norm": 0.17887794971466064, + "learning_rate": 4.141877365119204e-05, + "loss": 1.6975, + "step": 18515 + }, + { + "epoch": 5.683241252302026, + "grad_norm": 0.21266087889671326, + "learning_rate": 4.141387688475199e-05, + "loss": 1.7361, + "step": 18516 + }, + { + "epoch": 5.683548189073051, + "grad_norm": 0.20075422525405884, + 
"learning_rate": 4.140898020316506e-05, + "loss": 1.7496, + "step": 18517 + }, + { + "epoch": 5.683855125844076, + "grad_norm": 0.21430443227291107, + "learning_rate": 4.140408360647963e-05, + "loss": 1.7481, + "step": 18518 + }, + { + "epoch": 5.684162062615101, + "grad_norm": 0.1951984018087387, + "learning_rate": 4.139918709474405e-05, + "loss": 1.713, + "step": 18519 + }, + { + "epoch": 5.684468999386127, + "grad_norm": 0.21636274456977844, + "learning_rate": 4.1394290668006764e-05, + "loss": 1.8169, + "step": 18520 + }, + { + "epoch": 5.684775936157152, + "grad_norm": 0.21003715693950653, + "learning_rate": 4.138939432631613e-05, + "loss": 1.7453, + "step": 18521 + }, + { + "epoch": 5.685082872928177, + "grad_norm": 0.23559699952602386, + "learning_rate": 4.138449806972057e-05, + "loss": 1.7534, + "step": 18522 + }, + { + "epoch": 5.685389809699202, + "grad_norm": 0.23322029411792755, + "learning_rate": 4.137960189826843e-05, + "loss": 1.7535, + "step": 18523 + }, + { + "epoch": 5.685696746470227, + "grad_norm": 0.1998462826013565, + "learning_rate": 4.137470581200813e-05, + "loss": 1.7025, + "step": 18524 + }, + { + "epoch": 5.686003683241252, + "grad_norm": 0.22321350872516632, + "learning_rate": 4.1369809810988025e-05, + "loss": 1.7666, + "step": 18525 + }, + { + "epoch": 5.686310620012278, + "grad_norm": 0.20851604640483856, + "learning_rate": 4.136491389525653e-05, + "loss": 1.6958, + "step": 18526 + }, + { + "epoch": 5.686617556783302, + "grad_norm": 0.21494868397712708, + "learning_rate": 4.136001806486201e-05, + "loss": 1.7703, + "step": 18527 + }, + { + "epoch": 5.6869244935543275, + "grad_norm": 0.19872798025608063, + "learning_rate": 4.135512231985287e-05, + "loss": 1.7451, + "step": 18528 + }, + { + "epoch": 5.687231430325353, + "grad_norm": 0.2424371987581253, + "learning_rate": 4.1350226660277456e-05, + "loss": 1.8153, + "step": 18529 + }, + { + "epoch": 5.687538367096378, + "grad_norm": 0.20388297736644745, + "learning_rate": 
4.1345331086184196e-05, + "loss": 1.6882, + "step": 18530 + }, + { + "epoch": 5.6878453038674035, + "grad_norm": 0.22662605345249176, + "learning_rate": 4.134043559762143e-05, + "loss": 1.7532, + "step": 18531 + }, + { + "epoch": 5.688152240638429, + "grad_norm": 0.2281452864408493, + "learning_rate": 4.133554019463756e-05, + "loss": 1.769, + "step": 18532 + }, + { + "epoch": 5.688459177409453, + "grad_norm": 0.2303505390882492, + "learning_rate": 4.1330644877280955e-05, + "loss": 1.7176, + "step": 18533 + }, + { + "epoch": 5.688766114180479, + "grad_norm": 0.24411743879318237, + "learning_rate": 4.132574964560001e-05, + "loss": 1.7557, + "step": 18534 + }, + { + "epoch": 5.689073050951504, + "grad_norm": 0.2674088776111603, + "learning_rate": 4.13208544996431e-05, + "loss": 1.6997, + "step": 18535 + }, + { + "epoch": 5.689379987722529, + "grad_norm": 0.22232958674430847, + "learning_rate": 4.1315959439458565e-05, + "loss": 1.7731, + "step": 18536 + }, + { + "epoch": 5.689686924493555, + "grad_norm": 0.23894453048706055, + "learning_rate": 4.131106446509483e-05, + "loss": 1.7454, + "step": 18537 + }, + { + "epoch": 5.689993861264579, + "grad_norm": 0.19710026681423187, + "learning_rate": 4.1306169576600226e-05, + "loss": 1.6872, + "step": 18538 + }, + { + "epoch": 5.690300798035604, + "grad_norm": 0.1879546344280243, + "learning_rate": 4.130127477402318e-05, + "loss": 1.6929, + "step": 18539 + }, + { + "epoch": 5.69060773480663, + "grad_norm": 0.1964653730392456, + "learning_rate": 4.129638005741201e-05, + "loss": 1.7778, + "step": 18540 + }, + { + "epoch": 5.690914671577655, + "grad_norm": 0.20161493122577667, + "learning_rate": 4.129148542681513e-05, + "loss": 1.7388, + "step": 18541 + }, + { + "epoch": 5.69122160834868, + "grad_norm": 0.26742830872535706, + "learning_rate": 4.1286590882280886e-05, + "loss": 1.7472, + "step": 18542 + }, + { + "epoch": 5.691528545119706, + "grad_norm": 0.2613312900066376, + "learning_rate": 4.128169642385766e-05, + "loss": 1.7656, 
+ "step": 18543 + }, + { + "epoch": 5.69183548189073, + "grad_norm": 0.17979474365711212, + "learning_rate": 4.127680205159381e-05, + "loss": 1.6992, + "step": 18544 + }, + { + "epoch": 5.6921424186617555, + "grad_norm": 0.23575037717819214, + "learning_rate": 4.1271907765537745e-05, + "loss": 1.7399, + "step": 18545 + }, + { + "epoch": 5.692449355432781, + "grad_norm": 0.19461458921432495, + "learning_rate": 4.126701356573777e-05, + "loss": 1.709, + "step": 18546 + }, + { + "epoch": 5.692756292203806, + "grad_norm": 0.19715365767478943, + "learning_rate": 4.1262119452242306e-05, + "loss": 1.7634, + "step": 18547 + }, + { + "epoch": 5.6930632289748315, + "grad_norm": 0.21454904973506927, + "learning_rate": 4.125722542509969e-05, + "loss": 1.7663, + "step": 18548 + }, + { + "epoch": 5.693370165745856, + "grad_norm": 0.19884896278381348, + "learning_rate": 4.12523314843583e-05, + "loss": 1.7618, + "step": 18549 + }, + { + "epoch": 5.693677102516881, + "grad_norm": 0.2080020159482956, + "learning_rate": 4.124743763006648e-05, + "loss": 1.7379, + "step": 18550 + }, + { + "epoch": 5.693984039287907, + "grad_norm": 0.18780875205993652, + "learning_rate": 4.124254386227264e-05, + "loss": 1.7036, + "step": 18551 + }, + { + "epoch": 5.694290976058932, + "grad_norm": 0.2114439308643341, + "learning_rate": 4.123765018102512e-05, + "loss": 1.6873, + "step": 18552 + }, + { + "epoch": 5.694597912829957, + "grad_norm": 0.1712789535522461, + "learning_rate": 4.123275658637225e-05, + "loss": 1.6772, + "step": 18553 + }, + { + "epoch": 5.694904849600983, + "grad_norm": 0.2435859888792038, + "learning_rate": 4.122786307836243e-05, + "loss": 1.7946, + "step": 18554 + }, + { + "epoch": 5.695211786372007, + "grad_norm": 0.20587889850139618, + "learning_rate": 4.122296965704399e-05, + "loss": 1.7459, + "step": 18555 + }, + { + "epoch": 5.695518723143032, + "grad_norm": 0.2183443009853363, + "learning_rate": 4.121807632246534e-05, + "loss": 1.7036, + "step": 18556 + }, + { + "epoch": 
5.695825659914058, + "grad_norm": 0.19276869297027588, + "learning_rate": 4.121318307467478e-05, + "loss": 1.7371, + "step": 18557 + }, + { + "epoch": 5.696132596685083, + "grad_norm": 0.19815512001514435, + "learning_rate": 4.120828991372072e-05, + "loss": 1.7038, + "step": 18558 + }, + { + "epoch": 5.696439533456108, + "grad_norm": 0.18509675562381744, + "learning_rate": 4.120339683965146e-05, + "loss": 1.6936, + "step": 18559 + }, + { + "epoch": 5.696746470227133, + "grad_norm": 0.2296193689107895, + "learning_rate": 4.1198503852515416e-05, + "loss": 1.7626, + "step": 18560 + }, + { + "epoch": 5.697053406998158, + "grad_norm": 0.2064799964427948, + "learning_rate": 4.11936109523609e-05, + "loss": 1.7387, + "step": 18561 + }, + { + "epoch": 5.6973603437691835, + "grad_norm": 0.20171360671520233, + "learning_rate": 4.1188718139236296e-05, + "loss": 1.7372, + "step": 18562 + }, + { + "epoch": 5.697667280540209, + "grad_norm": 0.19421936571598053, + "learning_rate": 4.118382541318993e-05, + "loss": 1.7187, + "step": 18563 + }, + { + "epoch": 5.697974217311234, + "grad_norm": 0.22517532110214233, + "learning_rate": 4.117893277427018e-05, + "loss": 1.7503, + "step": 18564 + }, + { + "epoch": 5.6982811540822595, + "grad_norm": 0.2293393909931183, + "learning_rate": 4.1174040222525366e-05, + "loss": 1.7174, + "step": 18565 + }, + { + "epoch": 5.698588090853284, + "grad_norm": 0.24003073573112488, + "learning_rate": 4.1169147758003876e-05, + "loss": 1.7829, + "step": 18566 + }, + { + "epoch": 5.698895027624309, + "grad_norm": 0.21476133167743683, + "learning_rate": 4.1164255380754034e-05, + "loss": 1.7906, + "step": 18567 + }, + { + "epoch": 5.699201964395335, + "grad_norm": 0.21347576379776, + "learning_rate": 4.115936309082422e-05, + "loss": 1.6986, + "step": 18568 + }, + { + "epoch": 5.69950890116636, + "grad_norm": 0.22650402784347534, + "learning_rate": 4.115447088826276e-05, + "loss": 1.7949, + "step": 18569 + }, + { + "epoch": 5.699815837937384, + "grad_norm": 
0.25815197825431824, + "learning_rate": 4.114957877311799e-05, + "loss": 1.7499, + "step": 18570 + }, + { + "epoch": 5.70012277470841, + "grad_norm": 0.22644442319869995, + "learning_rate": 4.1144686745438265e-05, + "loss": 1.7689, + "step": 18571 + }, + { + "epoch": 5.700429711479435, + "grad_norm": 0.241188645362854, + "learning_rate": 4.113979480527194e-05, + "loss": 1.7341, + "step": 18572 + }, + { + "epoch": 5.7007366482504604, + "grad_norm": 0.20984862744808197, + "learning_rate": 4.1134902952667365e-05, + "loss": 1.7091, + "step": 18573 + }, + { + "epoch": 5.701043585021486, + "grad_norm": 0.25150877237319946, + "learning_rate": 4.113001118767286e-05, + "loss": 1.723, + "step": 18574 + }, + { + "epoch": 5.701350521792511, + "grad_norm": 0.21693028509616852, + "learning_rate": 4.1125119510336804e-05, + "loss": 1.7483, + "step": 18575 + }, + { + "epoch": 5.701657458563536, + "grad_norm": 0.2620212733745575, + "learning_rate": 4.11202279207075e-05, + "loss": 1.8159, + "step": 18576 + }, + { + "epoch": 5.701964395334561, + "grad_norm": 0.18722239136695862, + "learning_rate": 4.111533641883332e-05, + "loss": 1.7197, + "step": 18577 + }, + { + "epoch": 5.702271332105586, + "grad_norm": 0.21321091055870056, + "learning_rate": 4.111044500476258e-05, + "loss": 1.7408, + "step": 18578 + }, + { + "epoch": 5.702578268876612, + "grad_norm": 0.24459265172481537, + "learning_rate": 4.110555367854365e-05, + "loss": 1.8304, + "step": 18579 + }, + { + "epoch": 5.702885205647637, + "grad_norm": 0.24987100064754486, + "learning_rate": 4.110066244022483e-05, + "loss": 1.7051, + "step": 18580 + }, + { + "epoch": 5.703192142418661, + "grad_norm": 0.19059090316295624, + "learning_rate": 4.1095771289854506e-05, + "loss": 1.7489, + "step": 18581 + }, + { + "epoch": 5.703499079189687, + "grad_norm": 0.23020480573177338, + "learning_rate": 4.1090880227480966e-05, + "loss": 1.7101, + "step": 18582 + }, + { + "epoch": 5.703806015960712, + "grad_norm": 0.18733634054660797, + 
"learning_rate": 4.108598925315258e-05, + "loss": 1.7116, + "step": 18583 + }, + { + "epoch": 5.704112952731737, + "grad_norm": 0.1959095001220703, + "learning_rate": 4.108109836691766e-05, + "loss": 1.7283, + "step": 18584 + }, + { + "epoch": 5.704419889502763, + "grad_norm": 0.22685091197490692, + "learning_rate": 4.107620756882457e-05, + "loss": 1.7588, + "step": 18585 + }, + { + "epoch": 5.704726826273788, + "grad_norm": 0.1998603790998459, + "learning_rate": 4.107131685892164e-05, + "loss": 1.7071, + "step": 18586 + }, + { + "epoch": 5.7050337630448125, + "grad_norm": 0.2018733024597168, + "learning_rate": 4.106642623725717e-05, + "loss": 1.6782, + "step": 18587 + }, + { + "epoch": 5.705340699815838, + "grad_norm": 0.21826615929603577, + "learning_rate": 4.106153570387951e-05, + "loss": 1.736, + "step": 18588 + }, + { + "epoch": 5.705647636586863, + "grad_norm": 0.20197603106498718, + "learning_rate": 4.105664525883699e-05, + "loss": 1.6921, + "step": 18589 + }, + { + "epoch": 5.7059545733578885, + "grad_norm": 0.20943905413150787, + "learning_rate": 4.105175490217796e-05, + "loss": 1.665, + "step": 18590 + }, + { + "epoch": 5.706261510128914, + "grad_norm": 0.202060267329216, + "learning_rate": 4.104686463395071e-05, + "loss": 1.714, + "step": 18591 + }, + { + "epoch": 5.706568446899938, + "grad_norm": 0.220698744058609, + "learning_rate": 4.1041974454203623e-05, + "loss": 1.8076, + "step": 18592 + }, + { + "epoch": 5.706875383670964, + "grad_norm": 0.21536946296691895, + "learning_rate": 4.103708436298497e-05, + "loss": 1.6801, + "step": 18593 + }, + { + "epoch": 5.707182320441989, + "grad_norm": 0.21442468464374542, + "learning_rate": 4.103219436034311e-05, + "loss": 1.6921, + "step": 18594 + }, + { + "epoch": 5.707489257213014, + "grad_norm": 0.2047559767961502, + "learning_rate": 4.1027304446326356e-05, + "loss": 1.7861, + "step": 18595 + }, + { + "epoch": 5.70779619398404, + "grad_norm": 0.20304669439792633, + "learning_rate": 4.102241462098305e-05, + 
"loss": 1.7751, + "step": 18596 + }, + { + "epoch": 5.708103130755065, + "grad_norm": 0.18702620267868042, + "learning_rate": 4.101752488436149e-05, + "loss": 1.6951, + "step": 18597 + }, + { + "epoch": 5.708410067526089, + "grad_norm": 0.1821923404932022, + "learning_rate": 4.1012635236510034e-05, + "loss": 1.711, + "step": 18598 + }, + { + "epoch": 5.708717004297115, + "grad_norm": 0.19422096014022827, + "learning_rate": 4.100774567747696e-05, + "loss": 1.7202, + "step": 18599 + }, + { + "epoch": 5.70902394106814, + "grad_norm": 0.20800530910491943, + "learning_rate": 4.100285620731063e-05, + "loss": 1.7403, + "step": 18600 + }, + { + "epoch": 5.709330877839165, + "grad_norm": 0.221746027469635, + "learning_rate": 4.099796682605934e-05, + "loss": 1.7769, + "step": 18601 + }, + { + "epoch": 5.70963781461019, + "grad_norm": 0.19284313917160034, + "learning_rate": 4.099307753377143e-05, + "loss": 1.692, + "step": 18602 + }, + { + "epoch": 5.709944751381215, + "grad_norm": 0.17635129392147064, + "learning_rate": 4.0988188330495216e-05, + "loss": 1.7212, + "step": 18603 + }, + { + "epoch": 5.7102516881522405, + "grad_norm": 0.17728061974048615, + "learning_rate": 4.098329921627898e-05, + "loss": 1.7217, + "step": 18604 + }, + { + "epoch": 5.710558624923266, + "grad_norm": 0.19998152554035187, + "learning_rate": 4.097841019117108e-05, + "loss": 1.7583, + "step": 18605 + }, + { + "epoch": 5.710865561694291, + "grad_norm": 0.18840095400810242, + "learning_rate": 4.09735212552198e-05, + "loss": 1.7353, + "step": 18606 + }, + { + "epoch": 5.7111724984653165, + "grad_norm": 0.2528367042541504, + "learning_rate": 4.09686324084735e-05, + "loss": 1.7576, + "step": 18607 + }, + { + "epoch": 5.711479435236341, + "grad_norm": 0.27240338921546936, + "learning_rate": 4.096374365098045e-05, + "loss": 1.7303, + "step": 18608 + }, + { + "epoch": 5.711786372007366, + "grad_norm": 0.20187151432037354, + "learning_rate": 4.0958854982789e-05, + "loss": 1.7599, + "step": 18609 + }, + { + 
"epoch": 5.712093308778392, + "grad_norm": 0.24890528619289398, + "learning_rate": 4.095396640394742e-05, + "loss": 1.7737, + "step": 18610 + }, + { + "epoch": 5.712400245549417, + "grad_norm": 0.21524454653263092, + "learning_rate": 4.094907791450406e-05, + "loss": 1.7704, + "step": 18611 + }, + { + "epoch": 5.712707182320442, + "grad_norm": 0.20070379972457886, + "learning_rate": 4.094418951450721e-05, + "loss": 1.7358, + "step": 18612 + }, + { + "epoch": 5.713014119091467, + "grad_norm": 0.2252196967601776, + "learning_rate": 4.09393012040052e-05, + "loss": 1.7262, + "step": 18613 + }, + { + "epoch": 5.713321055862492, + "grad_norm": 0.19511987268924713, + "learning_rate": 4.093441298304631e-05, + "loss": 1.7146, + "step": 18614 + }, + { + "epoch": 5.713627992633517, + "grad_norm": 0.2047072798013687, + "learning_rate": 4.092952485167888e-05, + "loss": 1.7864, + "step": 18615 + }, + { + "epoch": 5.713934929404543, + "grad_norm": 0.21794871985912323, + "learning_rate": 4.092463680995119e-05, + "loss": 1.7759, + "step": 18616 + }, + { + "epoch": 5.714241866175568, + "grad_norm": 0.23863841593265533, + "learning_rate": 4.0919748857911566e-05, + "loss": 1.7207, + "step": 18617 + }, + { + "epoch": 5.714548802946593, + "grad_norm": 0.19706958532333374, + "learning_rate": 4.09148609956083e-05, + "loss": 1.7247, + "step": 18618 + }, + { + "epoch": 5.714855739717618, + "grad_norm": 0.23663771152496338, + "learning_rate": 4.090997322308971e-05, + "loss": 1.7929, + "step": 18619 + }, + { + "epoch": 5.715162676488643, + "grad_norm": 0.23079079389572144, + "learning_rate": 4.09050855404041e-05, + "loss": 1.763, + "step": 18620 + }, + { + "epoch": 5.7154696132596685, + "grad_norm": 0.23883379995822906, + "learning_rate": 4.0900197947599736e-05, + "loss": 1.7995, + "step": 18621 + }, + { + "epoch": 5.715776550030694, + "grad_norm": 0.2125123143196106, + "learning_rate": 4.0895310444724974e-05, + "loss": 1.8045, + "step": 18622 + }, + { + "epoch": 5.716083486801719, + 
"grad_norm": 0.21062424778938293, + "learning_rate": 4.0890423031828076e-05, + "loss": 1.7348, + "step": 18623 + }, + { + "epoch": 5.716390423572744, + "grad_norm": 0.24079614877700806, + "learning_rate": 4.088553570895737e-05, + "loss": 1.7462, + "step": 18624 + }, + { + "epoch": 5.716697360343769, + "grad_norm": 0.2120666354894638, + "learning_rate": 4.088064847616113e-05, + "loss": 1.7235, + "step": 18625 + }, + { + "epoch": 5.717004297114794, + "grad_norm": 0.19663050770759583, + "learning_rate": 4.0875761333487685e-05, + "loss": 1.6743, + "step": 18626 + }, + { + "epoch": 5.71731123388582, + "grad_norm": 0.24010685086250305, + "learning_rate": 4.0870874280985295e-05, + "loss": 1.6742, + "step": 18627 + }, + { + "epoch": 5.717618170656845, + "grad_norm": 0.22140294313430786, + "learning_rate": 4.086598731870228e-05, + "loss": 1.7601, + "step": 18628 + }, + { + "epoch": 5.71792510742787, + "grad_norm": 0.2876693308353424, + "learning_rate": 4.086110044668694e-05, + "loss": 1.7601, + "step": 18629 + }, + { + "epoch": 5.718232044198895, + "grad_norm": 0.3103853464126587, + "learning_rate": 4.085621366498756e-05, + "loss": 1.6824, + "step": 18630 + }, + { + "epoch": 5.71853898096992, + "grad_norm": 0.18194396793842316, + "learning_rate": 4.0851326973652424e-05, + "loss": 1.6976, + "step": 18631 + }, + { + "epoch": 5.718845917740945, + "grad_norm": 0.28400903940200806, + "learning_rate": 4.0846440372729854e-05, + "loss": 1.7352, + "step": 18632 + }, + { + "epoch": 5.719152854511971, + "grad_norm": 0.23753583431243896, + "learning_rate": 4.084155386226811e-05, + "loss": 1.7418, + "step": 18633 + }, + { + "epoch": 5.719459791282996, + "grad_norm": 0.215620756149292, + "learning_rate": 4.0836667442315514e-05, + "loss": 1.7602, + "step": 18634 + }, + { + "epoch": 5.7197667280540205, + "grad_norm": 0.21057941019535065, + "learning_rate": 4.083178111292034e-05, + "loss": 1.6818, + "step": 18635 + }, + { + "epoch": 5.720073664825046, + "grad_norm": 0.2169445902109146, + 
"learning_rate": 4.0826894874130863e-05, + "loss": 1.7942, + "step": 18636 + }, + { + "epoch": 5.720380601596071, + "grad_norm": 0.2779453992843628, + "learning_rate": 4.082200872599541e-05, + "loss": 1.7432, + "step": 18637 + }, + { + "epoch": 5.7206875383670965, + "grad_norm": 0.22556698322296143, + "learning_rate": 4.0817122668562224e-05, + "loss": 1.7748, + "step": 18638 + }, + { + "epoch": 5.720994475138122, + "grad_norm": 0.2570365071296692, + "learning_rate": 4.081223670187962e-05, + "loss": 1.7314, + "step": 18639 + }, + { + "epoch": 5.721301411909147, + "grad_norm": 0.266176700592041, + "learning_rate": 4.080735082599588e-05, + "loss": 1.689, + "step": 18640 + }, + { + "epoch": 5.721608348680172, + "grad_norm": 0.20190037786960602, + "learning_rate": 4.080246504095929e-05, + "loss": 1.7467, + "step": 18641 + }, + { + "epoch": 5.721915285451197, + "grad_norm": 0.2498215138912201, + "learning_rate": 4.079757934681813e-05, + "loss": 1.7063, + "step": 18642 + }, + { + "epoch": 5.722222222222222, + "grad_norm": 0.25594204664230347, + "learning_rate": 4.0792693743620695e-05, + "loss": 1.7096, + "step": 18643 + }, + { + "epoch": 5.722529158993248, + "grad_norm": 0.22674626111984253, + "learning_rate": 4.0787808231415233e-05, + "loss": 1.715, + "step": 18644 + }, + { + "epoch": 5.722836095764272, + "grad_norm": 0.267140656709671, + "learning_rate": 4.078292281025007e-05, + "loss": 1.7747, + "step": 18645 + }, + { + "epoch": 5.723143032535297, + "grad_norm": 0.21161147952079773, + "learning_rate": 4.077803748017345e-05, + "loss": 1.7312, + "step": 18646 + }, + { + "epoch": 5.723449969306323, + "grad_norm": 0.2580260634422302, + "learning_rate": 4.077315224123368e-05, + "loss": 1.7246, + "step": 18647 + }, + { + "epoch": 5.723756906077348, + "grad_norm": 0.23766927421092987, + "learning_rate": 4.076826709347902e-05, + "loss": 1.7147, + "step": 18648 + }, + { + "epoch": 5.724063842848373, + "grad_norm": 0.22764286398887634, + "learning_rate": 4.076338203695776e-05, + 
"loss": 1.7034, + "step": 18649 + }, + { + "epoch": 5.724370779619399, + "grad_norm": 0.28205159306526184, + "learning_rate": 4.075849707171817e-05, + "loss": 1.7472, + "step": 18650 + }, + { + "epoch": 5.724677716390423, + "grad_norm": 0.2091183066368103, + "learning_rate": 4.075361219780854e-05, + "loss": 1.7693, + "step": 18651 + }, + { + "epoch": 5.7249846531614486, + "grad_norm": 0.29513829946517944, + "learning_rate": 4.074872741527713e-05, + "loss": 1.7286, + "step": 18652 + }, + { + "epoch": 5.725291589932474, + "grad_norm": 0.226357102394104, + "learning_rate": 4.07438427241722e-05, + "loss": 1.7658, + "step": 18653 + }, + { + "epoch": 5.725598526703499, + "grad_norm": 0.23732580244541168, + "learning_rate": 4.073895812454207e-05, + "loss": 1.7591, + "step": 18654 + }, + { + "epoch": 5.725905463474525, + "grad_norm": 0.2835488021373749, + "learning_rate": 4.0734073616434956e-05, + "loss": 1.757, + "step": 18655 + }, + { + "epoch": 5.726212400245549, + "grad_norm": 0.1986306756734848, + "learning_rate": 4.0729189199899186e-05, + "loss": 1.714, + "step": 18656 + }, + { + "epoch": 5.726519337016574, + "grad_norm": 0.25071820616722107, + "learning_rate": 4.072430487498298e-05, + "loss": 1.7334, + "step": 18657 + }, + { + "epoch": 5.7268262737876, + "grad_norm": 0.19989889860153198, + "learning_rate": 4.0719420641734634e-05, + "loss": 1.7472, + "step": 18658 + }, + { + "epoch": 5.727133210558625, + "grad_norm": 0.30006101727485657, + "learning_rate": 4.071453650020241e-05, + "loss": 1.7846, + "step": 18659 + }, + { + "epoch": 5.72744014732965, + "grad_norm": 0.19856922328472137, + "learning_rate": 4.070965245043459e-05, + "loss": 1.6965, + "step": 18660 + }, + { + "epoch": 5.727747084100676, + "grad_norm": 0.20139823853969574, + "learning_rate": 4.070476849247941e-05, + "loss": 1.7265, + "step": 18661 + }, + { + "epoch": 5.7280540208717, + "grad_norm": 0.21507953107357025, + "learning_rate": 4.0699884626385184e-05, + "loss": 1.762, + "step": 18662 + }, + { + 
"epoch": 5.7283609576427255, + "grad_norm": 0.1885843127965927, + "learning_rate": 4.069500085220013e-05, + "loss": 1.6721, + "step": 18663 + }, + { + "epoch": 5.728667894413751, + "grad_norm": 0.2076897919178009, + "learning_rate": 4.069011716997253e-05, + "loss": 1.7399, + "step": 18664 + }, + { + "epoch": 5.728974831184776, + "grad_norm": 0.21482045948505402, + "learning_rate": 4.068523357975065e-05, + "loss": 1.7105, + "step": 18665 + }, + { + "epoch": 5.7292817679558015, + "grad_norm": 0.20438800752162933, + "learning_rate": 4.0680350081582765e-05, + "loss": 1.7408, + "step": 18666 + }, + { + "epoch": 5.729588704726826, + "grad_norm": 0.2137845903635025, + "learning_rate": 4.0675466675517104e-05, + "loss": 1.7814, + "step": 18667 + }, + { + "epoch": 5.729895641497851, + "grad_norm": 0.23009657859802246, + "learning_rate": 4.067058336160197e-05, + "loss": 1.7311, + "step": 18668 + }, + { + "epoch": 5.730202578268877, + "grad_norm": 0.20602397620677948, + "learning_rate": 4.066570013988558e-05, + "loss": 1.741, + "step": 18669 + }, + { + "epoch": 5.730509515039902, + "grad_norm": 0.24884814023971558, + "learning_rate": 4.066081701041621e-05, + "loss": 1.7222, + "step": 18670 + }, + { + "epoch": 5.730816451810927, + "grad_norm": 0.17906342446804047, + "learning_rate": 4.065593397324214e-05, + "loss": 1.6879, + "step": 18671 + }, + { + "epoch": 5.731123388581953, + "grad_norm": 0.20345427095890045, + "learning_rate": 4.0651051028411586e-05, + "loss": 1.7713, + "step": 18672 + }, + { + "epoch": 5.731430325352977, + "grad_norm": 0.21115002036094666, + "learning_rate": 4.0646168175972846e-05, + "loss": 1.7666, + "step": 18673 + }, + { + "epoch": 5.731737262124002, + "grad_norm": 0.22189734876155853, + "learning_rate": 4.064128541597413e-05, + "loss": 1.6989, + "step": 18674 + }, + { + "epoch": 5.732044198895028, + "grad_norm": 0.24036027491092682, + "learning_rate": 4.063640274846373e-05, + "loss": 1.707, + "step": 18675 + }, + { + "epoch": 5.732351135666053, + 
"grad_norm": 0.23091022670269012, + "learning_rate": 4.063152017348988e-05, + "loss": 1.7072, + "step": 18676 + }, + { + "epoch": 5.7326580724370775, + "grad_norm": 0.3142668306827545, + "learning_rate": 4.062663769110085e-05, + "loss": 1.7641, + "step": 18677 + }, + { + "epoch": 5.732965009208103, + "grad_norm": 0.2634848356246948, + "learning_rate": 4.0621755301344875e-05, + "loss": 1.7007, + "step": 18678 + }, + { + "epoch": 5.733271945979128, + "grad_norm": 0.21296904981136322, + "learning_rate": 4.061687300427022e-05, + "loss": 1.7201, + "step": 18679 + }, + { + "epoch": 5.7335788827501535, + "grad_norm": 0.24943144619464874, + "learning_rate": 4.0611990799925104e-05, + "loss": 1.7186, + "step": 18680 + }, + { + "epoch": 5.733885819521179, + "grad_norm": 0.2574152946472168, + "learning_rate": 4.060710868835781e-05, + "loss": 1.8671, + "step": 18681 + }, + { + "epoch": 5.734192756292204, + "grad_norm": 0.26023826003074646, + "learning_rate": 4.0602226669616564e-05, + "loss": 1.7618, + "step": 18682 + }, + { + "epoch": 5.734499693063229, + "grad_norm": 0.21078336238861084, + "learning_rate": 4.0597344743749645e-05, + "loss": 1.7548, + "step": 18683 + }, + { + "epoch": 5.734806629834254, + "grad_norm": 0.2195056676864624, + "learning_rate": 4.059246291080525e-05, + "loss": 1.6843, + "step": 18684 + }, + { + "epoch": 5.735113566605279, + "grad_norm": 0.20719893276691437, + "learning_rate": 4.058758117083168e-05, + "loss": 1.692, + "step": 18685 + }, + { + "epoch": 5.735420503376305, + "grad_norm": 0.23012077808380127, + "learning_rate": 4.058269952387713e-05, + "loss": 1.7072, + "step": 18686 + }, + { + "epoch": 5.73572744014733, + "grad_norm": 0.18598411977291107, + "learning_rate": 4.057781796998986e-05, + "loss": 1.6983, + "step": 18687 + }, + { + "epoch": 5.736034376918354, + "grad_norm": 0.20211926102638245, + "learning_rate": 4.057293650921813e-05, + "loss": 1.6818, + "step": 18688 + }, + { + "epoch": 5.73634131368938, + "grad_norm": 0.1957080215215683, + 
"learning_rate": 4.056805514161015e-05, + "loss": 1.7154, + "step": 18689 + }, + { + "epoch": 5.736648250460405, + "grad_norm": 0.23581798374652863, + "learning_rate": 4.0563173867214196e-05, + "loss": 1.7724, + "step": 18690 + }, + { + "epoch": 5.73695518723143, + "grad_norm": 0.22706671059131622, + "learning_rate": 4.055829268607847e-05, + "loss": 1.7387, + "step": 18691 + }, + { + "epoch": 5.737262124002456, + "grad_norm": 0.20050427317619324, + "learning_rate": 4.055341159825124e-05, + "loss": 1.7585, + "step": 18692 + }, + { + "epoch": 5.737569060773481, + "grad_norm": 0.18666231632232666, + "learning_rate": 4.054853060378072e-05, + "loss": 1.6996, + "step": 18693 + }, + { + "epoch": 5.7378759975445055, + "grad_norm": 0.23018911480903625, + "learning_rate": 4.0543649702715186e-05, + "loss": 1.7167, + "step": 18694 + }, + { + "epoch": 5.738182934315531, + "grad_norm": 0.21207039058208466, + "learning_rate": 4.053876889510282e-05, + "loss": 1.7539, + "step": 18695 + }, + { + "epoch": 5.738489871086556, + "grad_norm": 0.22042523324489594, + "learning_rate": 4.0533888180991915e-05, + "loss": 1.8145, + "step": 18696 + }, + { + "epoch": 5.7387968078575815, + "grad_norm": 0.20705139636993408, + "learning_rate": 4.0529007560430646e-05, + "loss": 1.7612, + "step": 18697 + }, + { + "epoch": 5.739103744628607, + "grad_norm": 0.20673857629299164, + "learning_rate": 4.052412703346729e-05, + "loss": 1.7338, + "step": 18698 + }, + { + "epoch": 5.739410681399631, + "grad_norm": 0.20742641389369965, + "learning_rate": 4.051924660015005e-05, + "loss": 1.7497, + "step": 18699 + }, + { + "epoch": 5.739717618170657, + "grad_norm": 0.22352617979049683, + "learning_rate": 4.05143662605272e-05, + "loss": 1.7568, + "step": 18700 + }, + { + "epoch": 5.740024554941682, + "grad_norm": 0.20306691527366638, + "learning_rate": 4.050948601464692e-05, + "loss": 1.7416, + "step": 18701 + }, + { + "epoch": 5.740331491712707, + "grad_norm": 0.22972522675991058, + "learning_rate": 
4.050460586255748e-05, + "loss": 1.7907, + "step": 18702 + }, + { + "epoch": 5.740638428483733, + "grad_norm": 0.2056068629026413, + "learning_rate": 4.0499725804307084e-05, + "loss": 1.7584, + "step": 18703 + }, + { + "epoch": 5.740945365254758, + "grad_norm": 0.2150508463382721, + "learning_rate": 4.049484583994395e-05, + "loss": 1.7695, + "step": 18704 + }, + { + "epoch": 5.741252302025782, + "grad_norm": 0.20274797081947327, + "learning_rate": 4.048996596951634e-05, + "loss": 1.7398, + "step": 18705 + }, + { + "epoch": 5.741559238796808, + "grad_norm": 0.20521290600299835, + "learning_rate": 4.0485086193072444e-05, + "loss": 1.7529, + "step": 18706 + }, + { + "epoch": 5.741866175567833, + "grad_norm": 0.22344307601451874, + "learning_rate": 4.0480206510660527e-05, + "loss": 1.6729, + "step": 18707 + }, + { + "epoch": 5.742173112338858, + "grad_norm": 0.20007841289043427, + "learning_rate": 4.047532692232876e-05, + "loss": 1.7004, + "step": 18708 + }, + { + "epoch": 5.742480049109884, + "grad_norm": 0.2455853819847107, + "learning_rate": 4.047044742812541e-05, + "loss": 1.7324, + "step": 18709 + }, + { + "epoch": 5.742786985880908, + "grad_norm": 0.29901546239852905, + "learning_rate": 4.046556802809867e-05, + "loss": 1.7138, + "step": 18710 + }, + { + "epoch": 5.7430939226519335, + "grad_norm": 0.19636842608451843, + "learning_rate": 4.04606887222968e-05, + "loss": 1.7098, + "step": 18711 + }, + { + "epoch": 5.743400859422959, + "grad_norm": 0.24916070699691772, + "learning_rate": 4.045580951076797e-05, + "loss": 1.7073, + "step": 18712 + }, + { + "epoch": 5.743707796193984, + "grad_norm": 0.2122841477394104, + "learning_rate": 4.0450930393560453e-05, + "loss": 1.7608, + "step": 18713 + }, + { + "epoch": 5.7440147329650095, + "grad_norm": 0.25119176506996155, + "learning_rate": 4.044605137072241e-05, + "loss": 1.7528, + "step": 18714 + }, + { + "epoch": 5.744321669736035, + "grad_norm": 0.2128097116947174, + "learning_rate": 4.0441172442302104e-05, + "loss": 
1.6834, + "step": 18715 + }, + { + "epoch": 5.744628606507059, + "grad_norm": 0.1771443784236908, + "learning_rate": 4.043629360834772e-05, + "loss": 1.6699, + "step": 18716 + }, + { + "epoch": 5.744935543278085, + "grad_norm": 0.2360549122095108, + "learning_rate": 4.043141486890751e-05, + "loss": 1.7704, + "step": 18717 + }, + { + "epoch": 5.74524248004911, + "grad_norm": 0.22453519701957703, + "learning_rate": 4.0426536224029645e-05, + "loss": 1.7305, + "step": 18718 + }, + { + "epoch": 5.745549416820135, + "grad_norm": 0.2170165628194809, + "learning_rate": 4.042165767376238e-05, + "loss": 1.7859, + "step": 18719 + }, + { + "epoch": 5.74585635359116, + "grad_norm": 0.233921617269516, + "learning_rate": 4.0416779218153896e-05, + "loss": 1.7622, + "step": 18720 + }, + { + "epoch": 5.746163290362185, + "grad_norm": 0.2698482871055603, + "learning_rate": 4.041190085725242e-05, + "loss": 1.7419, + "step": 18721 + }, + { + "epoch": 5.74647022713321, + "grad_norm": 0.28437280654907227, + "learning_rate": 4.0407022591106165e-05, + "loss": 1.7242, + "step": 18722 + }, + { + "epoch": 5.746777163904236, + "grad_norm": 0.2087356448173523, + "learning_rate": 4.040214441976332e-05, + "loss": 1.747, + "step": 18723 + }, + { + "epoch": 5.747084100675261, + "grad_norm": 0.2028181403875351, + "learning_rate": 4.039726634327213e-05, + "loss": 1.7843, + "step": 18724 + }, + { + "epoch": 5.747391037446286, + "grad_norm": 0.18513897061347961, + "learning_rate": 4.039238836168076e-05, + "loss": 1.692, + "step": 18725 + }, + { + "epoch": 5.747697974217311, + "grad_norm": 0.2308989316225052, + "learning_rate": 4.038751047503745e-05, + "loss": 1.6625, + "step": 18726 + }, + { + "epoch": 5.748004910988336, + "grad_norm": 0.23922030627727509, + "learning_rate": 4.0382632683390386e-05, + "loss": 1.7407, + "step": 18727 + }, + { + "epoch": 5.7483118477593615, + "grad_norm": 0.17225340008735657, + "learning_rate": 4.0377754986787806e-05, + "loss": 1.6888, + "step": 18728 + }, + { + "epoch": 
5.748618784530387, + "grad_norm": 0.1898551732301712, + "learning_rate": 4.037287738527786e-05, + "loss": 1.6931, + "step": 18729 + }, + { + "epoch": 5.748925721301412, + "grad_norm": 0.22900012135505676, + "learning_rate": 4.036799987890881e-05, + "loss": 1.751, + "step": 18730 + }, + { + "epoch": 5.749232658072437, + "grad_norm": 0.21106193959712982, + "learning_rate": 4.0363122467728815e-05, + "loss": 1.6919, + "step": 18731 + }, + { + "epoch": 5.749539594843462, + "grad_norm": 0.19944290816783905, + "learning_rate": 4.03582451517861e-05, + "loss": 1.7232, + "step": 18732 + }, + { + "epoch": 5.749846531614487, + "grad_norm": 0.1833256036043167, + "learning_rate": 4.035336793112885e-05, + "loss": 1.7199, + "step": 18733 + }, + { + "epoch": 5.750153468385513, + "grad_norm": 0.2596902847290039, + "learning_rate": 4.0348490805805287e-05, + "loss": 1.7386, + "step": 18734 + }, + { + "epoch": 5.750460405156538, + "grad_norm": 0.23708637058734894, + "learning_rate": 4.034361377586357e-05, + "loss": 1.7697, + "step": 18735 + }, + { + "epoch": 5.750767341927563, + "grad_norm": 0.20476554334163666, + "learning_rate": 4.033873684135195e-05, + "loss": 1.7804, + "step": 18736 + }, + { + "epoch": 5.751074278698588, + "grad_norm": 0.2625868320465088, + "learning_rate": 4.033386000231858e-05, + "loss": 1.7046, + "step": 18737 + }, + { + "epoch": 5.751381215469613, + "grad_norm": 0.23011820018291473, + "learning_rate": 4.032898325881166e-05, + "loss": 1.7758, + "step": 18738 + }, + { + "epoch": 5.7516881522406385, + "grad_norm": 0.23972748219966888, + "learning_rate": 4.032410661087943e-05, + "loss": 1.7165, + "step": 18739 + }, + { + "epoch": 5.751995089011664, + "grad_norm": 0.2241208404302597, + "learning_rate": 4.031923005857001e-05, + "loss": 1.713, + "step": 18740 + }, + { + "epoch": 5.752302025782689, + "grad_norm": 0.22316952049732208, + "learning_rate": 4.0314353601931665e-05, + "loss": 1.7655, + "step": 18741 + }, + { + "epoch": 5.752608962553714, + "grad_norm": 
0.2177707403898239, + "learning_rate": 4.030947724101253e-05, + "loss": 1.7517, + "step": 18742 + }, + { + "epoch": 5.752915899324739, + "grad_norm": 0.21731823682785034, + "learning_rate": 4.030460097586083e-05, + "loss": 1.718, + "step": 18743 + }, + { + "epoch": 5.753222836095764, + "grad_norm": 0.1700165718793869, + "learning_rate": 4.0299724806524744e-05, + "loss": 1.6536, + "step": 18744 + }, + { + "epoch": 5.75352977286679, + "grad_norm": 0.21920062601566315, + "learning_rate": 4.029484873305247e-05, + "loss": 1.7298, + "step": 18745 + }, + { + "epoch": 5.753836709637815, + "grad_norm": 0.22648905217647552, + "learning_rate": 4.028997275549218e-05, + "loss": 1.7878, + "step": 18746 + }, + { + "epoch": 5.75414364640884, + "grad_norm": 0.19443005323410034, + "learning_rate": 4.028509687389208e-05, + "loss": 1.7582, + "step": 18747 + }, + { + "epoch": 5.754450583179865, + "grad_norm": 0.21973860263824463, + "learning_rate": 4.028022108830034e-05, + "loss": 1.8215, + "step": 18748 + }, + { + "epoch": 5.75475751995089, + "grad_norm": 0.2215481847524643, + "learning_rate": 4.0275345398765155e-05, + "loss": 1.7092, + "step": 18749 + }, + { + "epoch": 5.755064456721915, + "grad_norm": 0.18789733946323395, + "learning_rate": 4.0270469805334696e-05, + "loss": 1.7089, + "step": 18750 + }, + { + "epoch": 5.755371393492941, + "grad_norm": 0.2423657774925232, + "learning_rate": 4.0265594308057175e-05, + "loss": 1.7412, + "step": 18751 + }, + { + "epoch": 5.755678330263965, + "grad_norm": 0.22020475566387177, + "learning_rate": 4.026071890698074e-05, + "loss": 1.7644, + "step": 18752 + }, + { + "epoch": 5.7559852670349905, + "grad_norm": 0.31772032380104065, + "learning_rate": 4.025584360215361e-05, + "loss": 1.7326, + "step": 18753 + }, + { + "epoch": 5.756292203806016, + "grad_norm": 0.23786257207393646, + "learning_rate": 4.025096839362393e-05, + "loss": 1.7652, + "step": 18754 + }, + { + "epoch": 5.756599140577041, + "grad_norm": 0.24288083612918854, + "learning_rate": 
4.024609328143989e-05, + "loss": 1.6797, + "step": 18755 + }, + { + "epoch": 5.7569060773480665, + "grad_norm": 0.30519670248031616, + "learning_rate": 4.024121826564969e-05, + "loss": 1.7442, + "step": 18756 + }, + { + "epoch": 5.757213014119092, + "grad_norm": 0.218281090259552, + "learning_rate": 4.023634334630147e-05, + "loss": 1.7498, + "step": 18757 + }, + { + "epoch": 5.757519950890116, + "grad_norm": 0.215846985578537, + "learning_rate": 4.023146852344345e-05, + "loss": 1.7728, + "step": 18758 + }, + { + "epoch": 5.757826887661142, + "grad_norm": 0.2883944511413574, + "learning_rate": 4.022659379712376e-05, + "loss": 1.8098, + "step": 18759 + }, + { + "epoch": 5.758133824432167, + "grad_norm": 0.25141629576683044, + "learning_rate": 4.022171916739062e-05, + "loss": 1.6574, + "step": 18760 + }, + { + "epoch": 5.758440761203192, + "grad_norm": 0.22118757665157318, + "learning_rate": 4.021684463429216e-05, + "loss": 1.7542, + "step": 18761 + }, + { + "epoch": 5.758747697974218, + "grad_norm": 0.2437646985054016, + "learning_rate": 4.02119701978766e-05, + "loss": 1.7182, + "step": 18762 + }, + { + "epoch": 5.759054634745242, + "grad_norm": 0.24247203767299652, + "learning_rate": 4.020709585819206e-05, + "loss": 1.7134, + "step": 18763 + }, + { + "epoch": 5.759361571516267, + "grad_norm": 0.208528533577919, + "learning_rate": 4.020222161528677e-05, + "loss": 1.6966, + "step": 18764 + }, + { + "epoch": 5.759668508287293, + "grad_norm": 0.19645826518535614, + "learning_rate": 4.0197347469208843e-05, + "loss": 1.7261, + "step": 18765 + }, + { + "epoch": 5.759975445058318, + "grad_norm": 0.20066291093826294, + "learning_rate": 4.019247342000648e-05, + "loss": 1.7197, + "step": 18766 + }, + { + "epoch": 5.760282381829343, + "grad_norm": 0.25344669818878174, + "learning_rate": 4.0187599467727845e-05, + "loss": 1.7957, + "step": 18767 + }, + { + "epoch": 5.760589318600369, + "grad_norm": 0.1917620301246643, + "learning_rate": 4.018272561242111e-05, + "loss": 1.6868, + 
"step": 18768 + }, + { + "epoch": 5.760896255371393, + "grad_norm": 0.21996566653251648, + "learning_rate": 4.0177851854134424e-05, + "loss": 1.7128, + "step": 18769 + }, + { + "epoch": 5.7612031921424185, + "grad_norm": 0.23226283490657806, + "learning_rate": 4.017297819291598e-05, + "loss": 1.7079, + "step": 18770 + }, + { + "epoch": 5.761510128913444, + "grad_norm": 0.30606213212013245, + "learning_rate": 4.016810462881391e-05, + "loss": 1.8087, + "step": 18771 + }, + { + "epoch": 5.761817065684469, + "grad_norm": 0.2171698361635208, + "learning_rate": 4.016323116187639e-05, + "loss": 1.7377, + "step": 18772 + }, + { + "epoch": 5.7621240024554945, + "grad_norm": 0.24234412610530853, + "learning_rate": 4.01583577921516e-05, + "loss": 1.734, + "step": 18773 + }, + { + "epoch": 5.762430939226519, + "grad_norm": 0.2648961544036865, + "learning_rate": 4.015348451968767e-05, + "loss": 1.7423, + "step": 18774 + }, + { + "epoch": 5.762737875997544, + "grad_norm": 0.18316571414470673, + "learning_rate": 4.01486113445328e-05, + "loss": 1.6708, + "step": 18775 + }, + { + "epoch": 5.76304481276857, + "grad_norm": 0.241583451628685, + "learning_rate": 4.0143738266735104e-05, + "loss": 1.708, + "step": 18776 + }, + { + "epoch": 5.763351749539595, + "grad_norm": 0.2268480360507965, + "learning_rate": 4.0138865286342775e-05, + "loss": 1.7106, + "step": 18777 + }, + { + "epoch": 5.76365868631062, + "grad_norm": 0.2038748860359192, + "learning_rate": 4.0133992403403944e-05, + "loss": 1.7349, + "step": 18778 + }, + { + "epoch": 5.763965623081646, + "grad_norm": 0.24422483146190643, + "learning_rate": 4.0129119617966805e-05, + "loss": 1.659, + "step": 18779 + }, + { + "epoch": 5.76427255985267, + "grad_norm": 0.19925715029239655, + "learning_rate": 4.0124246930079476e-05, + "loss": 1.6983, + "step": 18780 + }, + { + "epoch": 5.764579496623695, + "grad_norm": 0.29671359062194824, + "learning_rate": 4.0119374339790136e-05, + "loss": 1.7188, + "step": 18781 + }, + { + "epoch": 
5.764886433394721, + "grad_norm": 0.2752140760421753, + "learning_rate": 4.011450184714692e-05, + "loss": 1.738, + "step": 18782 + }, + { + "epoch": 5.765193370165746, + "grad_norm": 0.2112676352262497, + "learning_rate": 4.0109629452198e-05, + "loss": 1.7529, + "step": 18783 + }, + { + "epoch": 5.765500306936771, + "grad_norm": 0.2091330885887146, + "learning_rate": 4.010475715499151e-05, + "loss": 1.6771, + "step": 18784 + }, + { + "epoch": 5.765807243707796, + "grad_norm": 0.26556238532066345, + "learning_rate": 4.009988495557562e-05, + "loss": 1.7721, + "step": 18785 + }, + { + "epoch": 5.766114180478821, + "grad_norm": 0.20728638768196106, + "learning_rate": 4.009501285399846e-05, + "loss": 1.6893, + "step": 18786 + }, + { + "epoch": 5.7664211172498465, + "grad_norm": 0.213730126619339, + "learning_rate": 4.00901408503082e-05, + "loss": 1.704, + "step": 18787 + }, + { + "epoch": 5.766728054020872, + "grad_norm": 0.21422363817691803, + "learning_rate": 4.0085268944552975e-05, + "loss": 1.7571, + "step": 18788 + }, + { + "epoch": 5.767034990791897, + "grad_norm": 0.20936815440654755, + "learning_rate": 4.0080397136780915e-05, + "loss": 1.7423, + "step": 18789 + }, + { + "epoch": 5.7673419275629225, + "grad_norm": 0.26223674416542053, + "learning_rate": 4.007552542704021e-05, + "loss": 1.7687, + "step": 18790 + }, + { + "epoch": 5.767648864333947, + "grad_norm": 0.3524645268917084, + "learning_rate": 4.0070653815378954e-05, + "loss": 1.7754, + "step": 18791 + }, + { + "epoch": 5.767955801104972, + "grad_norm": 0.20238324999809265, + "learning_rate": 4.006578230184534e-05, + "loss": 1.7043, + "step": 18792 + }, + { + "epoch": 5.768262737875998, + "grad_norm": 0.2739984393119812, + "learning_rate": 4.006091088648747e-05, + "loss": 1.7596, + "step": 18793 + }, + { + "epoch": 5.768569674647023, + "grad_norm": 0.29209306836128235, + "learning_rate": 4.0056039569353515e-05, + "loss": 1.6857, + "step": 18794 + }, + { + "epoch": 5.768876611418047, + "grad_norm": 
0.21838447451591492, + "learning_rate": 4.005116835049161e-05, + "loss": 1.7531, + "step": 18795 + }, + { + "epoch": 5.769183548189073, + "grad_norm": 0.21940091252326965, + "learning_rate": 4.0046297229949884e-05, + "loss": 1.7363, + "step": 18796 + }, + { + "epoch": 5.769490484960098, + "grad_norm": 0.22679758071899414, + "learning_rate": 4.004142620777647e-05, + "loss": 1.7586, + "step": 18797 + }, + { + "epoch": 5.769797421731123, + "grad_norm": 0.23782022297382355, + "learning_rate": 4.003655528401954e-05, + "loss": 1.7154, + "step": 18798 + }, + { + "epoch": 5.770104358502149, + "grad_norm": 0.20452092587947845, + "learning_rate": 4.0031684458727194e-05, + "loss": 1.7078, + "step": 18799 + }, + { + "epoch": 5.770411295273174, + "grad_norm": 0.22733618319034576, + "learning_rate": 4.0026813731947594e-05, + "loss": 1.6989, + "step": 18800 + }, + { + "epoch": 5.7707182320441985, + "grad_norm": 0.2322154939174652, + "learning_rate": 4.002194310372886e-05, + "loss": 1.7508, + "step": 18801 + }, + { + "epoch": 5.771025168815224, + "grad_norm": 0.24573352932929993, + "learning_rate": 4.001707257411914e-05, + "loss": 1.7245, + "step": 18802 + }, + { + "epoch": 5.771332105586249, + "grad_norm": 0.19692079722881317, + "learning_rate": 4.001220214316655e-05, + "loss": 1.7116, + "step": 18803 + }, + { + "epoch": 5.7716390423572745, + "grad_norm": 0.20525199174880981, + "learning_rate": 4.000733181091925e-05, + "loss": 1.7503, + "step": 18804 + }, + { + "epoch": 5.7719459791283, + "grad_norm": 0.2097626030445099, + "learning_rate": 4.0002461577425344e-05, + "loss": 1.8204, + "step": 18805 + }, + { + "epoch": 5.772252915899324, + "grad_norm": 0.23059608042240143, + "learning_rate": 3.9997591442732975e-05, + "loss": 1.7747, + "step": 18806 + }, + { + "epoch": 5.77255985267035, + "grad_norm": 0.22085745632648468, + "learning_rate": 3.9992721406890265e-05, + "loss": 1.7579, + "step": 18807 + }, + { + "epoch": 5.772866789441375, + "grad_norm": 0.21529869735240936, + 
"learning_rate": 3.9987851469945334e-05, + "loss": 1.711, + "step": 18808 + }, + { + "epoch": 5.7731737262124, + "grad_norm": 0.20563572645187378, + "learning_rate": 3.998298163194636e-05, + "loss": 1.761, + "step": 18809 + }, + { + "epoch": 5.773480662983426, + "grad_norm": 0.2081122100353241, + "learning_rate": 3.9978111892941394e-05, + "loss": 1.7112, + "step": 18810 + }, + { + "epoch": 5.773787599754451, + "grad_norm": 0.2373751550912857, + "learning_rate": 3.9973242252978635e-05, + "loss": 1.7726, + "step": 18811 + }, + { + "epoch": 5.774094536525475, + "grad_norm": 0.2742944359779358, + "learning_rate": 3.996837271210615e-05, + "loss": 1.7743, + "step": 18812 + }, + { + "epoch": 5.774401473296501, + "grad_norm": 0.20724992454051971, + "learning_rate": 3.996350327037208e-05, + "loss": 1.7052, + "step": 18813 + }, + { + "epoch": 5.774708410067526, + "grad_norm": 0.22324968874454498, + "learning_rate": 3.995863392782456e-05, + "loss": 1.7865, + "step": 18814 + }, + { + "epoch": 5.7750153468385514, + "grad_norm": 0.22314245998859406, + "learning_rate": 3.995376468451172e-05, + "loss": 1.7705, + "step": 18815 + }, + { + "epoch": 5.775322283609577, + "grad_norm": 0.20793841779232025, + "learning_rate": 3.994889554048165e-05, + "loss": 1.739, + "step": 18816 + }, + { + "epoch": 5.775629220380601, + "grad_norm": 0.20117145776748657, + "learning_rate": 3.994402649578249e-05, + "loss": 1.7256, + "step": 18817 + }, + { + "epoch": 5.775936157151627, + "grad_norm": 0.24406170845031738, + "learning_rate": 3.993915755046235e-05, + "loss": 1.8015, + "step": 18818 + }, + { + "epoch": 5.776243093922652, + "grad_norm": 0.20912545919418335, + "learning_rate": 3.993428870456935e-05, + "loss": 1.7038, + "step": 18819 + }, + { + "epoch": 5.776550030693677, + "grad_norm": 0.2587272822856903, + "learning_rate": 3.992941995815162e-05, + "loss": 1.7918, + "step": 18820 + }, + { + "epoch": 5.776856967464703, + "grad_norm": 0.2996658980846405, + "learning_rate": 3.9924551311257266e-05, + 
"loss": 1.7513, + "step": 18821 + }, + { + "epoch": 5.777163904235728, + "grad_norm": 0.24603547155857086, + "learning_rate": 3.991968276393441e-05, + "loss": 1.7329, + "step": 18822 + }, + { + "epoch": 5.777470841006752, + "grad_norm": 0.2321038693189621, + "learning_rate": 3.991481431623113e-05, + "loss": 1.7406, + "step": 18823 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.3397100269794464, + "learning_rate": 3.990994596819558e-05, + "loss": 1.8129, + "step": 18824 + }, + { + "epoch": 5.778084714548803, + "grad_norm": 0.2807735800743103, + "learning_rate": 3.990507771987584e-05, + "loss": 1.7579, + "step": 18825 + }, + { + "epoch": 5.778391651319828, + "grad_norm": 0.1952899694442749, + "learning_rate": 3.990020957132007e-05, + "loss": 1.7153, + "step": 18826 + }, + { + "epoch": 5.778698588090853, + "grad_norm": 0.28998714685440063, + "learning_rate": 3.989534152257632e-05, + "loss": 1.7844, + "step": 18827 + }, + { + "epoch": 5.779005524861878, + "grad_norm": 0.20929136872291565, + "learning_rate": 3.989047357369275e-05, + "loss": 1.7499, + "step": 18828 + }, + { + "epoch": 5.7793124616329035, + "grad_norm": 0.31144043803215027, + "learning_rate": 3.9885605724717436e-05, + "loss": 1.7745, + "step": 18829 + }, + { + "epoch": 5.779619398403929, + "grad_norm": 0.22598792612552643, + "learning_rate": 3.988073797569849e-05, + "loss": 1.7226, + "step": 18830 + }, + { + "epoch": 5.779926335174954, + "grad_norm": 0.1971752643585205, + "learning_rate": 3.987587032668402e-05, + "loss": 1.7033, + "step": 18831 + }, + { + "epoch": 5.7802332719459795, + "grad_norm": 0.221087247133255, + "learning_rate": 3.9871002777722156e-05, + "loss": 1.7281, + "step": 18832 + }, + { + "epoch": 5.780540208717004, + "grad_norm": 0.21678583323955536, + "learning_rate": 3.986613532886095e-05, + "loss": 1.7207, + "step": 18833 + }, + { + "epoch": 5.780847145488029, + "grad_norm": 0.2511122226715088, + "learning_rate": 3.9861267980148566e-05, + "loss": 1.7091, + "step": 18834 + }, + 
{ + "epoch": 5.781154082259055, + "grad_norm": 0.2883855104446411, + "learning_rate": 3.985640073163304e-05, + "loss": 1.7963, + "step": 18835 + }, + { + "epoch": 5.78146101903008, + "grad_norm": 0.21786242723464966, + "learning_rate": 3.985153358336253e-05, + "loss": 1.6883, + "step": 18836 + }, + { + "epoch": 5.781767955801105, + "grad_norm": 0.18529155850410461, + "learning_rate": 3.98466665353851e-05, + "loss": 1.7194, + "step": 18837 + }, + { + "epoch": 5.78207489257213, + "grad_norm": 0.20535743236541748, + "learning_rate": 3.984179958774888e-05, + "loss": 1.6943, + "step": 18838 + }, + { + "epoch": 5.782381829343155, + "grad_norm": 0.19377392530441284, + "learning_rate": 3.983693274050195e-05, + "loss": 1.6732, + "step": 18839 + }, + { + "epoch": 5.78268876611418, + "grad_norm": 0.22373615205287933, + "learning_rate": 3.983206599369239e-05, + "loss": 1.7668, + "step": 18840 + }, + { + "epoch": 5.782995702885206, + "grad_norm": 0.2132388800382614, + "learning_rate": 3.982719934736832e-05, + "loss": 1.7155, + "step": 18841 + }, + { + "epoch": 5.783302639656231, + "grad_norm": 0.24871744215488434, + "learning_rate": 3.982233280157782e-05, + "loss": 1.7232, + "step": 18842 + }, + { + "epoch": 5.783609576427256, + "grad_norm": 0.1861848086118698, + "learning_rate": 3.981746635636902e-05, + "loss": 1.707, + "step": 18843 + }, + { + "epoch": 5.783916513198281, + "grad_norm": 0.21882779896259308, + "learning_rate": 3.981260001178995e-05, + "loss": 1.7165, + "step": 18844 + }, + { + "epoch": 5.784223449969306, + "grad_norm": 0.22144648432731628, + "learning_rate": 3.980773376788877e-05, + "loss": 1.7799, + "step": 18845 + }, + { + "epoch": 5.7845303867403315, + "grad_norm": 0.210894376039505, + "learning_rate": 3.980286762471351e-05, + "loss": 1.7539, + "step": 18846 + }, + { + "epoch": 5.784837323511357, + "grad_norm": 0.20435640215873718, + "learning_rate": 3.9798001582312305e-05, + "loss": 1.6736, + "step": 18847 + }, + { + "epoch": 5.785144260282382, + 
"grad_norm": 0.18998762965202332, + "learning_rate": 3.979313564073322e-05, + "loss": 1.7045, + "step": 18848 + }, + { + "epoch": 5.785451197053407, + "grad_norm": 0.19869361817836761, + "learning_rate": 3.978826980002437e-05, + "loss": 1.7444, + "step": 18849 + }, + { + "epoch": 5.785758133824432, + "grad_norm": 0.2175174504518509, + "learning_rate": 3.97834040602338e-05, + "loss": 1.7565, + "step": 18850 + }, + { + "epoch": 5.786065070595457, + "grad_norm": 0.22726793587207794, + "learning_rate": 3.977853842140964e-05, + "loss": 1.713, + "step": 18851 + }, + { + "epoch": 5.786372007366483, + "grad_norm": 0.26518720388412476, + "learning_rate": 3.9773672883599934e-05, + "loss": 1.6892, + "step": 18852 + }, + { + "epoch": 5.786678944137508, + "grad_norm": 0.20721858739852905, + "learning_rate": 3.97688074468528e-05, + "loss": 1.724, + "step": 18853 + }, + { + "epoch": 5.786985880908533, + "grad_norm": 0.22739483416080475, + "learning_rate": 3.976394211121629e-05, + "loss": 1.762, + "step": 18854 + }, + { + "epoch": 5.787292817679558, + "grad_norm": 0.21918894350528717, + "learning_rate": 3.975907687673853e-05, + "loss": 1.6812, + "step": 18855 + }, + { + "epoch": 5.787599754450583, + "grad_norm": 0.20931273698806763, + "learning_rate": 3.9754211743467574e-05, + "loss": 1.6874, + "step": 18856 + }, + { + "epoch": 5.787906691221608, + "grad_norm": 0.2015041708946228, + "learning_rate": 3.974934671145148e-05, + "loss": 1.7248, + "step": 18857 + }, + { + "epoch": 5.788213627992634, + "grad_norm": 0.21632663905620575, + "learning_rate": 3.974448178073836e-05, + "loss": 1.7313, + "step": 18858 + }, + { + "epoch": 5.788520564763659, + "grad_norm": 0.18995213508605957, + "learning_rate": 3.973961695137627e-05, + "loss": 1.6761, + "step": 18859 + }, + { + "epoch": 5.7888275015346835, + "grad_norm": 0.18678395450115204, + "learning_rate": 3.973475222341333e-05, + "loss": 1.7082, + "step": 18860 + }, + { + "epoch": 5.789134438305709, + "grad_norm": 0.1889343559741974, + 
"learning_rate": 3.972988759689756e-05, + "loss": 1.7296, + "step": 18861 + }, + { + "epoch": 5.789441375076734, + "grad_norm": 0.20196790993213654, + "learning_rate": 3.9725023071877074e-05, + "loss": 1.6876, + "step": 18862 + }, + { + "epoch": 5.7897483118477595, + "grad_norm": 0.198349729180336, + "learning_rate": 3.972015864839992e-05, + "loss": 1.6826, + "step": 18863 + }, + { + "epoch": 5.790055248618785, + "grad_norm": 0.21323837339878082, + "learning_rate": 3.9715294326514185e-05, + "loss": 1.7444, + "step": 18864 + }, + { + "epoch": 5.79036218538981, + "grad_norm": 0.18581731617450714, + "learning_rate": 3.9710430106267934e-05, + "loss": 1.7731, + "step": 18865 + }, + { + "epoch": 5.790669122160835, + "grad_norm": 0.21925146877765656, + "learning_rate": 3.970556598770927e-05, + "loss": 1.7505, + "step": 18866 + }, + { + "epoch": 5.79097605893186, + "grad_norm": 0.20773115754127502, + "learning_rate": 3.970070197088621e-05, + "loss": 1.7408, + "step": 18867 + }, + { + "epoch": 5.791282995702885, + "grad_norm": 0.1805189698934555, + "learning_rate": 3.9695838055846865e-05, + "loss": 1.6871, + "step": 18868 + }, + { + "epoch": 5.791589932473911, + "grad_norm": 0.24685314297676086, + "learning_rate": 3.969097424263928e-05, + "loss": 1.7186, + "step": 18869 + }, + { + "epoch": 5.791896869244935, + "grad_norm": 0.18801769614219666, + "learning_rate": 3.9686110531311526e-05, + "loss": 1.7196, + "step": 18870 + }, + { + "epoch": 5.79220380601596, + "grad_norm": 0.22717779874801636, + "learning_rate": 3.968124692191168e-05, + "loss": 1.7309, + "step": 18871 + }, + { + "epoch": 5.792510742786986, + "grad_norm": 0.23058642446994781, + "learning_rate": 3.9676383414487806e-05, + "loss": 1.6993, + "step": 18872 + }, + { + "epoch": 5.792817679558011, + "grad_norm": 0.24307532608509064, + "learning_rate": 3.967152000908796e-05, + "loss": 1.6986, + "step": 18873 + }, + { + "epoch": 5.793124616329036, + "grad_norm": 0.3032459318637848, + "learning_rate": 
3.9666656705760195e-05, + "loss": 1.677, + "step": 18874 + }, + { + "epoch": 5.793431553100062, + "grad_norm": 0.22669538855552673, + "learning_rate": 3.966179350455259e-05, + "loss": 1.7361, + "step": 18875 + }, + { + "epoch": 5.793738489871086, + "grad_norm": 0.27729150652885437, + "learning_rate": 3.96569304055132e-05, + "loss": 1.746, + "step": 18876 + }, + { + "epoch": 5.7940454266421115, + "grad_norm": 0.3422098755836487, + "learning_rate": 3.96520674086901e-05, + "loss": 1.783, + "step": 18877 + }, + { + "epoch": 5.794352363413137, + "grad_norm": 0.2114052176475525, + "learning_rate": 3.964720451413131e-05, + "loss": 1.7127, + "step": 18878 + }, + { + "epoch": 5.794659300184162, + "grad_norm": 0.22928549349308014, + "learning_rate": 3.964234172188494e-05, + "loss": 1.6579, + "step": 18879 + }, + { + "epoch": 5.7949662369551875, + "grad_norm": 0.24813635647296906, + "learning_rate": 3.9637479031999e-05, + "loss": 1.728, + "step": 18880 + }, + { + "epoch": 5.795273173726212, + "grad_norm": 0.19779744744300842, + "learning_rate": 3.963261644452158e-05, + "loss": 1.7338, + "step": 18881 + }, + { + "epoch": 5.795580110497237, + "grad_norm": 0.2424263060092926, + "learning_rate": 3.96277539595007e-05, + "loss": 1.7762, + "step": 18882 + }, + { + "epoch": 5.795887047268263, + "grad_norm": 0.24621224403381348, + "learning_rate": 3.9622891576984456e-05, + "loss": 1.7746, + "step": 18883 + }, + { + "epoch": 5.796193984039288, + "grad_norm": 0.1973372846841812, + "learning_rate": 3.961802929702086e-05, + "loss": 1.7243, + "step": 18884 + }, + { + "epoch": 5.796500920810313, + "grad_norm": 0.22170570492744446, + "learning_rate": 3.961316711965801e-05, + "loss": 1.764, + "step": 18885 + }, + { + "epoch": 5.796807857581339, + "grad_norm": 0.22319282591342926, + "learning_rate": 3.9608305044943906e-05, + "loss": 1.6795, + "step": 18886 + }, + { + "epoch": 5.797114794352363, + "grad_norm": 0.20000022649765015, + "learning_rate": 3.9603443072926635e-05, + "loss": 1.7587, + 
"step": 18887 + }, + { + "epoch": 5.797421731123388, + "grad_norm": 0.25041815638542175, + "learning_rate": 3.959858120365424e-05, + "loss": 1.7631, + "step": 18888 + }, + { + "epoch": 5.797728667894414, + "grad_norm": 0.23383729159832, + "learning_rate": 3.959371943717474e-05, + "loss": 1.741, + "step": 18889 + }, + { + "epoch": 5.798035604665439, + "grad_norm": 0.18609663844108582, + "learning_rate": 3.958885777353623e-05, + "loss": 1.6981, + "step": 18890 + }, + { + "epoch": 5.798342541436464, + "grad_norm": 0.29523593187332153, + "learning_rate": 3.9583996212786706e-05, + "loss": 1.8018, + "step": 18891 + }, + { + "epoch": 5.798649478207489, + "grad_norm": 0.20356589555740356, + "learning_rate": 3.9579134754974244e-05, + "loss": 1.7157, + "step": 18892 + }, + { + "epoch": 5.798956414978514, + "grad_norm": 0.2901862561702728, + "learning_rate": 3.957427340014688e-05, + "loss": 1.7249, + "step": 18893 + }, + { + "epoch": 5.7992633517495396, + "grad_norm": 0.24768278002738953, + "learning_rate": 3.956941214835267e-05, + "loss": 1.6894, + "step": 18894 + }, + { + "epoch": 5.799570288520565, + "grad_norm": 0.2417999804019928, + "learning_rate": 3.956455099963962e-05, + "loss": 1.7203, + "step": 18895 + }, + { + "epoch": 5.79987722529159, + "grad_norm": 0.2889639437198639, + "learning_rate": 3.9559689954055814e-05, + "loss": 1.7531, + "step": 18896 + }, + { + "epoch": 5.800184162062616, + "grad_norm": 0.21204611659049988, + "learning_rate": 3.955482901164926e-05, + "loss": 1.7521, + "step": 18897 + }, + { + "epoch": 5.80049109883364, + "grad_norm": 0.2961438298225403, + "learning_rate": 3.954996817246801e-05, + "loss": 1.8102, + "step": 18898 + }, + { + "epoch": 5.800798035604665, + "grad_norm": 0.36562761664390564, + "learning_rate": 3.9545107436560084e-05, + "loss": 1.6722, + "step": 18899 + }, + { + "epoch": 5.801104972375691, + "grad_norm": 0.22423696517944336, + "learning_rate": 3.954024680397357e-05, + "loss": 1.7101, + "step": 18900 + }, + { + "epoch": 
5.801411909146716, + "grad_norm": 0.3122335970401764, + "learning_rate": 3.953538627475644e-05, + "loss": 1.7314, + "step": 18901 + }, + { + "epoch": 5.8017188459177405, + "grad_norm": 0.39004257321357727, + "learning_rate": 3.953052584895677e-05, + "loss": 1.762, + "step": 18902 + }, + { + "epoch": 5.802025782688766, + "grad_norm": 0.1827487200498581, + "learning_rate": 3.952566552662256e-05, + "loss": 1.6935, + "step": 18903 + }, + { + "epoch": 5.802332719459791, + "grad_norm": 0.3025164306163788, + "learning_rate": 3.952080530780188e-05, + "loss": 1.7448, + "step": 18904 + }, + { + "epoch": 5.8026396562308165, + "grad_norm": 0.2313300520181656, + "learning_rate": 3.9515945192542754e-05, + "loss": 1.7686, + "step": 18905 + }, + { + "epoch": 5.802946593001842, + "grad_norm": 0.3501042425632477, + "learning_rate": 3.9511085180893184e-05, + "loss": 1.775, + "step": 18906 + }, + { + "epoch": 5.803253529772867, + "grad_norm": 0.4111124873161316, + "learning_rate": 3.950622527290123e-05, + "loss": 1.7561, + "step": 18907 + }, + { + "epoch": 5.803560466543892, + "grad_norm": 0.20877736806869507, + "learning_rate": 3.950136546861489e-05, + "loss": 1.7356, + "step": 18908 + }, + { + "epoch": 5.803867403314917, + "grad_norm": 0.33404025435447693, + "learning_rate": 3.949650576808222e-05, + "loss": 1.7289, + "step": 18909 + }, + { + "epoch": 5.804174340085942, + "grad_norm": 0.2183927446603775, + "learning_rate": 3.9491646171351234e-05, + "loss": 1.7136, + "step": 18910 + }, + { + "epoch": 5.804481276856968, + "grad_norm": 0.27149543166160583, + "learning_rate": 3.948678667846997e-05, + "loss": 1.7516, + "step": 18911 + }, + { + "epoch": 5.804788213627993, + "grad_norm": 0.2369886338710785, + "learning_rate": 3.948192728948643e-05, + "loss": 1.6767, + "step": 18912 + }, + { + "epoch": 5.805095150399017, + "grad_norm": 0.20671069622039795, + "learning_rate": 3.947706800444867e-05, + "loss": 1.7831, + "step": 18913 + }, + { + "epoch": 5.805402087170043, + "grad_norm": 
0.23622260987758636, + "learning_rate": 3.9472208823404665e-05, + "loss": 1.7121, + "step": 18914 + }, + { + "epoch": 5.805709023941068, + "grad_norm": 0.21099595725536346, + "learning_rate": 3.946734974640247e-05, + "loss": 1.7137, + "step": 18915 + }, + { + "epoch": 5.806015960712093, + "grad_norm": 0.2205580472946167, + "learning_rate": 3.9462490773490094e-05, + "loss": 1.713, + "step": 18916 + }, + { + "epoch": 5.806322897483119, + "grad_norm": 0.20183326303958893, + "learning_rate": 3.9457631904715584e-05, + "loss": 1.7316, + "step": 18917 + }, + { + "epoch": 5.806629834254144, + "grad_norm": 0.27381497621536255, + "learning_rate": 3.9452773140126906e-05, + "loss": 1.7577, + "step": 18918 + }, + { + "epoch": 5.8069367710251685, + "grad_norm": 0.29962384700775146, + "learning_rate": 3.944791447977214e-05, + "loss": 1.7579, + "step": 18919 + }, + { + "epoch": 5.807243707796194, + "grad_norm": 0.22385326027870178, + "learning_rate": 3.944305592369923e-05, + "loss": 1.7795, + "step": 18920 + }, + { + "epoch": 5.807550644567219, + "grad_norm": 0.2954902648925781, + "learning_rate": 3.943819747195625e-05, + "loss": 1.6655, + "step": 18921 + }, + { + "epoch": 5.8078575813382445, + "grad_norm": 0.18947024643421173, + "learning_rate": 3.94333391245912e-05, + "loss": 1.6803, + "step": 18922 + }, + { + "epoch": 5.80816451810927, + "grad_norm": 0.26797959208488464, + "learning_rate": 3.942848088165206e-05, + "loss": 1.7671, + "step": 18923 + }, + { + "epoch": 5.808471454880294, + "grad_norm": 0.23453201353549957, + "learning_rate": 3.94236227431869e-05, + "loss": 1.7472, + "step": 18924 + }, + { + "epoch": 5.80877839165132, + "grad_norm": 0.24471673369407654, + "learning_rate": 3.941876470924367e-05, + "loss": 1.7482, + "step": 18925 + }, + { + "epoch": 5.809085328422345, + "grad_norm": 0.22249098122119904, + "learning_rate": 3.9413906779870426e-05, + "loss": 1.6794, + "step": 18926 + }, + { + "epoch": 5.80939226519337, + "grad_norm": 0.1985001564025879, + 
"learning_rate": 3.9409048955115144e-05, + "loss": 1.7278, + "step": 18927 + }, + { + "epoch": 5.809699201964396, + "grad_norm": 0.22482000291347504, + "learning_rate": 3.940419123502587e-05, + "loss": 1.7658, + "step": 18928 + }, + { + "epoch": 5.810006138735421, + "grad_norm": 0.18513578176498413, + "learning_rate": 3.939933361965057e-05, + "loss": 1.7154, + "step": 18929 + }, + { + "epoch": 5.810313075506445, + "grad_norm": 0.1984710991382599, + "learning_rate": 3.939447610903729e-05, + "loss": 1.7324, + "step": 18930 + }, + { + "epoch": 5.810620012277471, + "grad_norm": 0.26089081168174744, + "learning_rate": 3.938961870323399e-05, + "loss": 1.774, + "step": 18931 + }, + { + "epoch": 5.810926949048496, + "grad_norm": 0.2059585452079773, + "learning_rate": 3.9384761402288706e-05, + "loss": 1.7059, + "step": 18932 + }, + { + "epoch": 5.811233885819521, + "grad_norm": 0.1887979656457901, + "learning_rate": 3.937990420624942e-05, + "loss": 1.6829, + "step": 18933 + }, + { + "epoch": 5.811540822590547, + "grad_norm": 0.2589145600795746, + "learning_rate": 3.937504711516417e-05, + "loss": 1.7301, + "step": 18934 + }, + { + "epoch": 5.811847759361571, + "grad_norm": 0.209516704082489, + "learning_rate": 3.9370190129080907e-05, + "loss": 1.7716, + "step": 18935 + }, + { + "epoch": 5.8121546961325965, + "grad_norm": 0.3321632146835327, + "learning_rate": 3.936533324804768e-05, + "loss": 1.7754, + "step": 18936 + }, + { + "epoch": 5.812461632903622, + "grad_norm": 0.236944317817688, + "learning_rate": 3.9360476472112446e-05, + "loss": 1.7546, + "step": 18937 + }, + { + "epoch": 5.812768569674647, + "grad_norm": 0.29667431116104126, + "learning_rate": 3.9355619801323226e-05, + "loss": 1.7712, + "step": 18938 + }, + { + "epoch": 5.8130755064456725, + "grad_norm": 0.3071129620075226, + "learning_rate": 3.935076323572802e-05, + "loss": 1.7351, + "step": 18939 + }, + { + "epoch": 5.813382443216698, + "grad_norm": 0.22747032344341278, + "learning_rate": 3.934590677537479e-05, 
+ "loss": 1.7788, + "step": 18940 + }, + { + "epoch": 5.813689379987722, + "grad_norm": 0.2575854957103729, + "learning_rate": 3.934105042031158e-05, + "loss": 1.705, + "step": 18941 + }, + { + "epoch": 5.813996316758748, + "grad_norm": 0.2561504542827606, + "learning_rate": 3.9336194170586325e-05, + "loss": 1.7309, + "step": 18942 + }, + { + "epoch": 5.814303253529773, + "grad_norm": 0.21570482850074768, + "learning_rate": 3.933133802624707e-05, + "loss": 1.7408, + "step": 18943 + }, + { + "epoch": 5.814610190300798, + "grad_norm": 0.29227179288864136, + "learning_rate": 3.932648198734177e-05, + "loss": 1.7415, + "step": 18944 + }, + { + "epoch": 5.814917127071823, + "grad_norm": 0.17847758531570435, + "learning_rate": 3.9321626053918456e-05, + "loss": 1.7926, + "step": 18945 + }, + { + "epoch": 5.815224063842848, + "grad_norm": 0.24604015052318573, + "learning_rate": 3.931677022602507e-05, + "loss": 1.7519, + "step": 18946 + }, + { + "epoch": 5.815531000613873, + "grad_norm": 0.23843185603618622, + "learning_rate": 3.931191450370965e-05, + "loss": 1.7206, + "step": 18947 + }, + { + "epoch": 5.815837937384899, + "grad_norm": 0.23431400954723358, + "learning_rate": 3.9307058887020126e-05, + "loss": 1.7743, + "step": 18948 + }, + { + "epoch": 5.816144874155924, + "grad_norm": 0.23685097694396973, + "learning_rate": 3.9302203376004525e-05, + "loss": 1.7485, + "step": 18949 + }, + { + "epoch": 5.816451810926949, + "grad_norm": 0.2129819542169571, + "learning_rate": 3.929734797071082e-05, + "loss": 1.6897, + "step": 18950 + }, + { + "epoch": 5.816758747697974, + "grad_norm": 0.24736030399799347, + "learning_rate": 3.9292492671187e-05, + "loss": 1.7292, + "step": 18951 + }, + { + "epoch": 5.817065684468999, + "grad_norm": 0.28659793734550476, + "learning_rate": 3.9287637477481025e-05, + "loss": 1.6772, + "step": 18952 + }, + { + "epoch": 5.8173726212400245, + "grad_norm": 0.22304075956344604, + "learning_rate": 3.928278238964092e-05, + "loss": 1.7991, + "step": 18953 + 
}, + { + "epoch": 5.81767955801105, + "grad_norm": 0.25354304909706116, + "learning_rate": 3.927792740771462e-05, + "loss": 1.7407, + "step": 18954 + }, + { + "epoch": 5.817986494782075, + "grad_norm": 0.3014552593231201, + "learning_rate": 3.927307253175014e-05, + "loss": 1.7714, + "step": 18955 + }, + { + "epoch": 5.8182934315531, + "grad_norm": 0.20537856221199036, + "learning_rate": 3.926821776179545e-05, + "loss": 1.6992, + "step": 18956 + }, + { + "epoch": 5.818600368324125, + "grad_norm": 0.29656440019607544, + "learning_rate": 3.92633630978985e-05, + "loss": 1.7476, + "step": 18957 + }, + { + "epoch": 5.81890730509515, + "grad_norm": 0.20956869423389435, + "learning_rate": 3.925850854010732e-05, + "loss": 1.808, + "step": 18958 + }, + { + "epoch": 5.819214241866176, + "grad_norm": 0.29395633935928345, + "learning_rate": 3.925365408846983e-05, + "loss": 1.7787, + "step": 18959 + }, + { + "epoch": 5.819521178637201, + "grad_norm": 0.31101030111312866, + "learning_rate": 3.9248799743034025e-05, + "loss": 1.7685, + "step": 18960 + }, + { + "epoch": 5.819828115408226, + "grad_norm": 0.2109794020652771, + "learning_rate": 3.9243945503847894e-05, + "loss": 1.7307, + "step": 18961 + }, + { + "epoch": 5.820135052179251, + "grad_norm": 0.2503393292427063, + "learning_rate": 3.9239091370959405e-05, + "loss": 1.763, + "step": 18962 + }, + { + "epoch": 5.820441988950276, + "grad_norm": 0.21757015585899353, + "learning_rate": 3.92342373444165e-05, + "loss": 1.7862, + "step": 18963 + }, + { + "epoch": 5.820748925721301, + "grad_norm": 0.22108088433742523, + "learning_rate": 3.9229383424267197e-05, + "loss": 1.6845, + "step": 18964 + }, + { + "epoch": 5.821055862492327, + "grad_norm": 0.20059655606746674, + "learning_rate": 3.922452961055941e-05, + "loss": 1.7523, + "step": 18965 + }, + { + "epoch": 5.821362799263352, + "grad_norm": 0.22009585797786713, + "learning_rate": 3.921967590334117e-05, + "loss": 1.7802, + "step": 18966 + }, + { + "epoch": 5.8216697360343765, + 
"grad_norm": 0.22554142773151398, + "learning_rate": 3.9214822302660386e-05, + "loss": 1.7911, + "step": 18967 + }, + { + "epoch": 5.821976672805402, + "grad_norm": 0.23434770107269287, + "learning_rate": 3.920996880856506e-05, + "loss": 1.6755, + "step": 18968 + }, + { + "epoch": 5.822283609576427, + "grad_norm": 0.2162926346063614, + "learning_rate": 3.920511542110314e-05, + "loss": 1.7145, + "step": 18969 + }, + { + "epoch": 5.8225905463474525, + "grad_norm": 0.18654806911945343, + "learning_rate": 3.9200262140322616e-05, + "loss": 1.7076, + "step": 18970 + }, + { + "epoch": 5.822897483118478, + "grad_norm": 0.22357499599456787, + "learning_rate": 3.9195408966271404e-05, + "loss": 1.791, + "step": 18971 + }, + { + "epoch": 5.823204419889503, + "grad_norm": 0.21073313057422638, + "learning_rate": 3.919055589899752e-05, + "loss": 1.7976, + "step": 18972 + }, + { + "epoch": 5.823511356660528, + "grad_norm": 0.21481956541538239, + "learning_rate": 3.9185702938548886e-05, + "loss": 1.7468, + "step": 18973 + }, + { + "epoch": 5.823818293431553, + "grad_norm": 0.22051872313022614, + "learning_rate": 3.9180850084973464e-05, + "loss": 1.7201, + "step": 18974 + }, + { + "epoch": 5.824125230202578, + "grad_norm": 0.24410493671894073, + "learning_rate": 3.917599733831924e-05, + "loss": 1.7774, + "step": 18975 + }, + { + "epoch": 5.824432166973604, + "grad_norm": 0.19711458683013916, + "learning_rate": 3.917114469863414e-05, + "loss": 1.7907, + "step": 18976 + }, + { + "epoch": 5.824739103744628, + "grad_norm": 0.2045203000307083, + "learning_rate": 3.9166292165966155e-05, + "loss": 1.7105, + "step": 18977 + }, + { + "epoch": 5.8250460405156534, + "grad_norm": 0.21570880711078644, + "learning_rate": 3.9161439740363196e-05, + "loss": 1.7312, + "step": 18978 + }, + { + "epoch": 5.825352977286679, + "grad_norm": 0.21203923225402832, + "learning_rate": 3.915658742187325e-05, + "loss": 1.7869, + "step": 18979 + }, + { + "epoch": 5.825659914057704, + "grad_norm": 
0.26233312487602234, + "learning_rate": 3.915173521054426e-05, + "loss": 1.7453, + "step": 18980 + }, + { + "epoch": 5.8259668508287294, + "grad_norm": 0.23792949318885803, + "learning_rate": 3.91468831064242e-05, + "loss": 1.6886, + "step": 18981 + }, + { + "epoch": 5.826273787599755, + "grad_norm": 0.20325250923633575, + "learning_rate": 3.914203110956098e-05, + "loss": 1.7538, + "step": 18982 + }, + { + "epoch": 5.82658072437078, + "grad_norm": 0.28146329522132874, + "learning_rate": 3.9137179220002596e-05, + "loss": 1.7674, + "step": 18983 + }, + { + "epoch": 5.826887661141805, + "grad_norm": 0.2319503277540207, + "learning_rate": 3.9132327437796946e-05, + "loss": 1.7864, + "step": 18984 + }, + { + "epoch": 5.82719459791283, + "grad_norm": 0.22653794288635254, + "learning_rate": 3.9127475762992025e-05, + "loss": 1.7424, + "step": 18985 + }, + { + "epoch": 5.827501534683855, + "grad_norm": 0.26855236291885376, + "learning_rate": 3.912262419563574e-05, + "loss": 1.762, + "step": 18986 + }, + { + "epoch": 5.827808471454881, + "grad_norm": 0.18356221914291382, + "learning_rate": 3.9117772735776095e-05, + "loss": 1.7199, + "step": 18987 + }, + { + "epoch": 5.828115408225905, + "grad_norm": 0.2802455425262451, + "learning_rate": 3.911292138346096e-05, + "loss": 1.7142, + "step": 18988 + }, + { + "epoch": 5.82842234499693, + "grad_norm": 0.2638777494430542, + "learning_rate": 3.910807013873835e-05, + "loss": 1.6759, + "step": 18989 + }, + { + "epoch": 5.828729281767956, + "grad_norm": 0.18397162854671478, + "learning_rate": 3.910321900165615e-05, + "loss": 1.693, + "step": 18990 + }, + { + "epoch": 5.829036218538981, + "grad_norm": 0.20967607200145721, + "learning_rate": 3.909836797226233e-05, + "loss": 1.6908, + "step": 18991 + }, + { + "epoch": 5.829343155310006, + "grad_norm": 0.21123014390468597, + "learning_rate": 3.909351705060485e-05, + "loss": 1.7875, + "step": 18992 + }, + { + "epoch": 5.829650092081032, + "grad_norm": 0.1988777220249176, + "learning_rate": 
3.90886662367316e-05, + "loss": 1.7254, + "step": 18993 + }, + { + "epoch": 5.829957028852056, + "grad_norm": 0.17793473601341248, + "learning_rate": 3.9083815530690564e-05, + "loss": 1.7233, + "step": 18994 + }, + { + "epoch": 5.8302639656230815, + "grad_norm": 0.2289644330739975, + "learning_rate": 3.9078964932529645e-05, + "loss": 1.7739, + "step": 18995 + }, + { + "epoch": 5.830570902394107, + "grad_norm": 0.18145552277565002, + "learning_rate": 3.9074114442296804e-05, + "loss": 1.6989, + "step": 18996 + }, + { + "epoch": 5.830877839165132, + "grad_norm": 0.1941588670015335, + "learning_rate": 3.9069264060039956e-05, + "loss": 1.6981, + "step": 18997 + }, + { + "epoch": 5.8311847759361575, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.9064413785807075e-05, + "loss": 1.7163, + "step": 18998 + }, + { + "epoch": 5.831491712707182, + "grad_norm": 0.19494447112083435, + "learning_rate": 3.905956361964604e-05, + "loss": 1.7481, + "step": 18999 + }, + { + "epoch": 5.831798649478207, + "grad_norm": 0.2127624899148941, + "learning_rate": 3.9054713561604826e-05, + "loss": 1.7494, + "step": 19000 + }, + { + "epoch": 5.832105586249233, + "grad_norm": 0.20107653737068176, + "learning_rate": 3.9049863611731334e-05, + "loss": 1.7483, + "step": 19001 + }, + { + "epoch": 5.832412523020258, + "grad_norm": 0.22574639320373535, + "learning_rate": 3.904501377007352e-05, + "loss": 1.8184, + "step": 19002 + }, + { + "epoch": 5.832719459791283, + "grad_norm": 0.20027579367160797, + "learning_rate": 3.9040164036679285e-05, + "loss": 1.6995, + "step": 19003 + }, + { + "epoch": 5.833026396562309, + "grad_norm": 0.21599887311458588, + "learning_rate": 3.90353144115966e-05, + "loss": 1.7487, + "step": 19004 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.21122781932353973, + "learning_rate": 3.9030464894873334e-05, + "loss": 1.7332, + "step": 19005 + }, + { + "epoch": 5.833640270104358, + "grad_norm": 0.19006453454494476, + "learning_rate": 3.902561548655747e-05, + 
"loss": 1.688, + "step": 19006 + }, + { + "epoch": 5.833947206875384, + "grad_norm": 0.22979344427585602, + "learning_rate": 3.9020766186696895e-05, + "loss": 1.7495, + "step": 19007 + }, + { + "epoch": 5.834254143646409, + "grad_norm": 0.18405365943908691, + "learning_rate": 3.901591699533953e-05, + "loss": 1.7395, + "step": 19008 + }, + { + "epoch": 5.834561080417434, + "grad_norm": 0.26198676228523254, + "learning_rate": 3.901106791253334e-05, + "loss": 1.8286, + "step": 19009 + }, + { + "epoch": 5.834868017188459, + "grad_norm": 0.2535797357559204, + "learning_rate": 3.900621893832619e-05, + "loss": 1.757, + "step": 19010 + }, + { + "epoch": 5.835174953959484, + "grad_norm": 0.24599581956863403, + "learning_rate": 3.900137007276605e-05, + "loss": 1.7266, + "step": 19011 + }, + { + "epoch": 5.8354818907305095, + "grad_norm": 0.25688427686691284, + "learning_rate": 3.8996521315900805e-05, + "loss": 1.7255, + "step": 19012 + }, + { + "epoch": 5.835788827501535, + "grad_norm": 0.24668128788471222, + "learning_rate": 3.8991672667778385e-05, + "loss": 1.737, + "step": 19013 + }, + { + "epoch": 5.83609576427256, + "grad_norm": 0.28365740180015564, + "learning_rate": 3.8986824128446695e-05, + "loss": 1.7129, + "step": 19014 + }, + { + "epoch": 5.8364027010435855, + "grad_norm": 0.2543952465057373, + "learning_rate": 3.89819756979537e-05, + "loss": 1.7249, + "step": 19015 + }, + { + "epoch": 5.83670963781461, + "grad_norm": 0.2868666350841522, + "learning_rate": 3.8977127376347245e-05, + "loss": 1.6985, + "step": 19016 + }, + { + "epoch": 5.837016574585635, + "grad_norm": 0.3818367123603821, + "learning_rate": 3.897227916367531e-05, + "loss": 1.6954, + "step": 19017 + }, + { + "epoch": 5.837323511356661, + "grad_norm": 0.20922113955020905, + "learning_rate": 3.896743105998574e-05, + "loss": 1.7571, + "step": 19018 + }, + { + "epoch": 5.837630448127686, + "grad_norm": 0.3669843375682831, + "learning_rate": 3.89625830653265e-05, + "loss": 1.8041, + "step": 19019 + }, + { 
+ "epoch": 5.83793738489871, + "grad_norm": 0.2889872193336487, + "learning_rate": 3.895773517974548e-05, + "loss": 1.7775, + "step": 19020 + }, + { + "epoch": 5.838244321669736, + "grad_norm": 0.22619491815567017, + "learning_rate": 3.89528874032906e-05, + "loss": 1.7019, + "step": 19021 + }, + { + "epoch": 5.838551258440761, + "grad_norm": 0.4169046878814697, + "learning_rate": 3.894803973600976e-05, + "loss": 1.8282, + "step": 19022 + }, + { + "epoch": 5.838858195211786, + "grad_norm": 0.2567043900489807, + "learning_rate": 3.894319217795087e-05, + "loss": 1.733, + "step": 19023 + }, + { + "epoch": 5.839165131982812, + "grad_norm": 0.2435060739517212, + "learning_rate": 3.8938344729161834e-05, + "loss": 1.7208, + "step": 19024 + }, + { + "epoch": 5.839472068753837, + "grad_norm": 0.2941838204860687, + "learning_rate": 3.893349738969055e-05, + "loss": 1.7202, + "step": 19025 + }, + { + "epoch": 5.8397790055248615, + "grad_norm": 0.23542317748069763, + "learning_rate": 3.892865015958495e-05, + "loss": 1.7571, + "step": 19026 + }, + { + "epoch": 5.840085942295887, + "grad_norm": 0.3248259723186493, + "learning_rate": 3.8923803038892897e-05, + "loss": 1.7118, + "step": 19027 + }, + { + "epoch": 5.840392879066912, + "grad_norm": 0.24359026551246643, + "learning_rate": 3.891895602766234e-05, + "loss": 1.8126, + "step": 19028 + }, + { + "epoch": 5.8406998158379375, + "grad_norm": 0.3053695559501648, + "learning_rate": 3.8914109125941126e-05, + "loss": 1.6632, + "step": 19029 + }, + { + "epoch": 5.841006752608963, + "grad_norm": 0.3194943368434906, + "learning_rate": 3.8909262333777195e-05, + "loss": 1.8432, + "step": 19030 + }, + { + "epoch": 5.841313689379987, + "grad_norm": 0.23532693088054657, + "learning_rate": 3.8904415651218426e-05, + "loss": 1.716, + "step": 19031 + }, + { + "epoch": 5.841620626151013, + "grad_norm": 0.2941347062587738, + "learning_rate": 3.889956907831275e-05, + "loss": 1.7737, + "step": 19032 + }, + { + "epoch": 5.841927562922038, + 
"grad_norm": 0.2265428602695465, + "learning_rate": 3.889472261510801e-05, + "loss": 1.7111, + "step": 19033 + }, + { + "epoch": 5.842234499693063, + "grad_norm": 0.3023710548877716, + "learning_rate": 3.888987626165216e-05, + "loss": 1.7845, + "step": 19034 + }, + { + "epoch": 5.842541436464089, + "grad_norm": 0.2855348289012909, + "learning_rate": 3.8885030017993026e-05, + "loss": 1.8009, + "step": 19035 + }, + { + "epoch": 5.842848373235114, + "grad_norm": 0.23046357929706573, + "learning_rate": 3.888018388417857e-05, + "loss": 1.8225, + "step": 19036 + }, + { + "epoch": 5.843155310006138, + "grad_norm": 0.23732341825962067, + "learning_rate": 3.8875337860256634e-05, + "loss": 1.7542, + "step": 19037 + }, + { + "epoch": 5.843462246777164, + "grad_norm": 0.18987004458904266, + "learning_rate": 3.887049194627516e-05, + "loss": 1.7327, + "step": 19038 + }, + { + "epoch": 5.843769183548189, + "grad_norm": 0.21539908647537231, + "learning_rate": 3.8865646142281974e-05, + "loss": 1.715, + "step": 19039 + }, + { + "epoch": 5.844076120319214, + "grad_norm": 0.2991954982280731, + "learning_rate": 3.8860800448325024e-05, + "loss": 1.7728, + "step": 19040 + }, + { + "epoch": 5.84438305709024, + "grad_norm": 0.19066409766674042, + "learning_rate": 3.885595486445216e-05, + "loss": 1.7128, + "step": 19041 + }, + { + "epoch": 5.844689993861264, + "grad_norm": 0.21643762290477753, + "learning_rate": 3.885110939071128e-05, + "loss": 1.7584, + "step": 19042 + }, + { + "epoch": 5.8449969306322895, + "grad_norm": 0.20227304100990295, + "learning_rate": 3.884626402715029e-05, + "loss": 1.7053, + "step": 19043 + }, + { + "epoch": 5.845303867403315, + "grad_norm": 0.20429107546806335, + "learning_rate": 3.884141877381703e-05, + "loss": 1.761, + "step": 19044 + }, + { + "epoch": 5.84561080417434, + "grad_norm": 0.1873873621225357, + "learning_rate": 3.8836573630759435e-05, + "loss": 1.7251, + "step": 19045 + }, + { + "epoch": 5.8459177409453655, + "grad_norm": 0.18025323748588562, + 
"learning_rate": 3.883172859802534e-05, + "loss": 1.6696, + "step": 19046 + }, + { + "epoch": 5.846224677716391, + "grad_norm": 0.22011777758598328, + "learning_rate": 3.8826883675662664e-05, + "loss": 1.7148, + "step": 19047 + }, + { + "epoch": 5.846531614487415, + "grad_norm": 0.17827673256397247, + "learning_rate": 3.882203886371925e-05, + "loss": 1.69, + "step": 19048 + }, + { + "epoch": 5.846838551258441, + "grad_norm": 0.200766459107399, + "learning_rate": 3.881719416224303e-05, + "loss": 1.7773, + "step": 19049 + }, + { + "epoch": 5.847145488029466, + "grad_norm": 0.22770950198173523, + "learning_rate": 3.8812349571281834e-05, + "loss": 1.7156, + "step": 19050 + }, + { + "epoch": 5.847452424800491, + "grad_norm": 0.19483895599842072, + "learning_rate": 3.880750509088357e-05, + "loss": 1.7304, + "step": 19051 + }, + { + "epoch": 5.847759361571516, + "grad_norm": 0.1988774836063385, + "learning_rate": 3.8802660721096086e-05, + "loss": 1.7428, + "step": 19052 + }, + { + "epoch": 5.848066298342541, + "grad_norm": 0.19881510734558105, + "learning_rate": 3.879781646196727e-05, + "loss": 1.7268, + "step": 19053 + }, + { + "epoch": 5.848373235113566, + "grad_norm": 0.21257543563842773, + "learning_rate": 3.8792972313545e-05, + "loss": 1.7532, + "step": 19054 + }, + { + "epoch": 5.848680171884592, + "grad_norm": 0.21000613272190094, + "learning_rate": 3.878812827587716e-05, + "loss": 1.7782, + "step": 19055 + }, + { + "epoch": 5.848987108655617, + "grad_norm": 0.2136746346950531, + "learning_rate": 3.878328434901159e-05, + "loss": 1.6875, + "step": 19056 + }, + { + "epoch": 5.849294045426642, + "grad_norm": 0.20291505753993988, + "learning_rate": 3.8778440532996204e-05, + "loss": 1.74, + "step": 19057 + }, + { + "epoch": 5.849600982197668, + "grad_norm": 0.22568103671073914, + "learning_rate": 3.877359682787883e-05, + "loss": 1.7074, + "step": 19058 + }, + { + "epoch": 5.849907918968692, + "grad_norm": 0.24398963153362274, + "learning_rate": 3.876875323370734e-05, + 
"loss": 1.6825, + "step": 19059 + }, + { + "epoch": 5.850214855739718, + "grad_norm": 0.19684453308582306, + "learning_rate": 3.876390975052964e-05, + "loss": 1.7143, + "step": 19060 + }, + { + "epoch": 5.850521792510743, + "grad_norm": 0.2786783277988434, + "learning_rate": 3.8759066378393544e-05, + "loss": 1.8339, + "step": 19061 + }, + { + "epoch": 5.850828729281768, + "grad_norm": 0.1977633833885193, + "learning_rate": 3.875422311734697e-05, + "loss": 1.742, + "step": 19062 + }, + { + "epoch": 5.851135666052793, + "grad_norm": 0.260643869638443, + "learning_rate": 3.874937996743772e-05, + "loss": 1.7728, + "step": 19063 + }, + { + "epoch": 5.851442602823818, + "grad_norm": 0.20998433232307434, + "learning_rate": 3.874453692871372e-05, + "loss": 1.768, + "step": 19064 + }, + { + "epoch": 5.851749539594843, + "grad_norm": 0.2603224217891693, + "learning_rate": 3.873969400122278e-05, + "loss": 1.8015, + "step": 19065 + }, + { + "epoch": 5.852056476365869, + "grad_norm": 0.24428118765354156, + "learning_rate": 3.87348511850128e-05, + "loss": 1.8133, + "step": 19066 + }, + { + "epoch": 5.852363413136894, + "grad_norm": 0.19380085170269012, + "learning_rate": 3.873000848013161e-05, + "loss": 1.7331, + "step": 19067 + }, + { + "epoch": 5.852670349907919, + "grad_norm": 0.20088011026382446, + "learning_rate": 3.87251658866271e-05, + "loss": 1.7501, + "step": 19068 + }, + { + "epoch": 5.852977286678944, + "grad_norm": 0.21920672059059143, + "learning_rate": 3.8720323404547095e-05, + "loss": 1.6848, + "step": 19069 + }, + { + "epoch": 5.853284223449969, + "grad_norm": 0.21692565083503723, + "learning_rate": 3.871548103393947e-05, + "loss": 1.7132, + "step": 19070 + }, + { + "epoch": 5.8535911602209945, + "grad_norm": 0.19463133811950684, + "learning_rate": 3.871063877485207e-05, + "loss": 1.7263, + "step": 19071 + }, + { + "epoch": 5.85389809699202, + "grad_norm": 0.21563300490379333, + "learning_rate": 3.870579662733277e-05, + "loss": 1.7271, + "step": 19072 + }, + { + 
"epoch": 5.854205033763045, + "grad_norm": 0.19901902973651886, + "learning_rate": 3.870095459142939e-05, + "loss": 1.7153, + "step": 19073 + }, + { + "epoch": 5.85451197053407, + "grad_norm": 0.2053879052400589, + "learning_rate": 3.869611266718982e-05, + "loss": 1.7769, + "step": 19074 + }, + { + "epoch": 5.854818907305095, + "grad_norm": 0.18877504765987396, + "learning_rate": 3.869127085466188e-05, + "loss": 1.7427, + "step": 19075 + }, + { + "epoch": 5.85512584407612, + "grad_norm": 0.2000892460346222, + "learning_rate": 3.8686429153893414e-05, + "loss": 1.7245, + "step": 19076 + }, + { + "epoch": 5.855432780847146, + "grad_norm": 0.23791030049324036, + "learning_rate": 3.868158756493231e-05, + "loss": 1.7128, + "step": 19077 + }, + { + "epoch": 5.855739717618171, + "grad_norm": 0.20807631313800812, + "learning_rate": 3.8676746087826374e-05, + "loss": 1.7235, + "step": 19078 + }, + { + "epoch": 5.856046654389196, + "grad_norm": 0.2603290379047394, + "learning_rate": 3.867190472262349e-05, + "loss": 1.7272, + "step": 19079 + }, + { + "epoch": 5.856353591160221, + "grad_norm": 0.25234153866767883, + "learning_rate": 3.8667063469371456e-05, + "loss": 1.7818, + "step": 19080 + }, + { + "epoch": 5.856660527931246, + "grad_norm": 0.20621159672737122, + "learning_rate": 3.866222232811816e-05, + "loss": 1.7318, + "step": 19081 + }, + { + "epoch": 5.856967464702271, + "grad_norm": 0.19565562903881073, + "learning_rate": 3.865738129891141e-05, + "loss": 1.6364, + "step": 19082 + }, + { + "epoch": 5.857274401473297, + "grad_norm": 0.2090953141450882, + "learning_rate": 3.86525403817991e-05, + "loss": 1.7763, + "step": 19083 + }, + { + "epoch": 5.857581338244322, + "grad_norm": 0.21286322176456451, + "learning_rate": 3.864769957682901e-05, + "loss": 1.7652, + "step": 19084 + }, + { + "epoch": 5.8578882750153465, + "grad_norm": 0.20606130361557007, + "learning_rate": 3.864285888404902e-05, + "loss": 1.7267, + "step": 19085 + }, + { + "epoch": 5.858195211786372, + 
"grad_norm": 0.18837152421474457, + "learning_rate": 3.863801830350694e-05, + "loss": 1.7013, + "step": 19086 + }, + { + "epoch": 5.858502148557397, + "grad_norm": 0.19374001026153564, + "learning_rate": 3.8633177835250636e-05, + "loss": 1.7462, + "step": 19087 + }, + { + "epoch": 5.8588090853284225, + "grad_norm": 0.19090552628040314, + "learning_rate": 3.8628337479327914e-05, + "loss": 1.7321, + "step": 19088 + }, + { + "epoch": 5.859116022099448, + "grad_norm": 0.19487829506397247, + "learning_rate": 3.8623497235786656e-05, + "loss": 1.7323, + "step": 19089 + }, + { + "epoch": 5.859422958870473, + "grad_norm": 0.23836077749729156, + "learning_rate": 3.861865710467464e-05, + "loss": 1.7277, + "step": 19090 + }, + { + "epoch": 5.859729895641498, + "grad_norm": 0.22283829748630524, + "learning_rate": 3.861381708603974e-05, + "loss": 1.7521, + "step": 19091 + }, + { + "epoch": 5.860036832412523, + "grad_norm": 0.2094828337430954, + "learning_rate": 3.8608977179929774e-05, + "loss": 1.763, + "step": 19092 + }, + { + "epoch": 5.860343769183548, + "grad_norm": 0.30857667326927185, + "learning_rate": 3.860413738639256e-05, + "loss": 1.7112, + "step": 19093 + }, + { + "epoch": 5.860650705954574, + "grad_norm": 0.22634989023208618, + "learning_rate": 3.8599297705475954e-05, + "loss": 1.7076, + "step": 19094 + }, + { + "epoch": 5.860957642725598, + "grad_norm": 0.20488132536411285, + "learning_rate": 3.8594458137227757e-05, + "loss": 1.6821, + "step": 19095 + }, + { + "epoch": 5.861264579496623, + "grad_norm": 0.22760719060897827, + "learning_rate": 3.8589618681695826e-05, + "loss": 1.6981, + "step": 19096 + }, + { + "epoch": 5.861571516267649, + "grad_norm": 0.21168997883796692, + "learning_rate": 3.858477933892795e-05, + "loss": 1.7396, + "step": 19097 + }, + { + "epoch": 5.861878453038674, + "grad_norm": 0.24725143611431122, + "learning_rate": 3.8579940108971984e-05, + "loss": 1.791, + "step": 19098 + }, + { + "epoch": 5.862185389809699, + "grad_norm": 
0.2245369702577591, + "learning_rate": 3.857510099187573e-05, + "loss": 1.7643, + "step": 19099 + }, + { + "epoch": 5.862492326580725, + "grad_norm": 0.20065639913082123, + "learning_rate": 3.8570261987687056e-05, + "loss": 1.715, + "step": 19100 + }, + { + "epoch": 5.862799263351749, + "grad_norm": 0.1857454925775528, + "learning_rate": 3.856542309645373e-05, + "loss": 1.6833, + "step": 19101 + }, + { + "epoch": 5.8631062001227745, + "grad_norm": 0.18816804885864258, + "learning_rate": 3.856058431822361e-05, + "loss": 1.7049, + "step": 19102 + }, + { + "epoch": 5.8634131368938, + "grad_norm": 0.2861626148223877, + "learning_rate": 3.855574565304448e-05, + "loss": 1.8275, + "step": 19103 + }, + { + "epoch": 5.863720073664825, + "grad_norm": 0.19937226176261902, + "learning_rate": 3.8550907100964196e-05, + "loss": 1.7137, + "step": 19104 + }, + { + "epoch": 5.8640270104358505, + "grad_norm": 0.2040586620569229, + "learning_rate": 3.854606866203055e-05, + "loss": 1.725, + "step": 19105 + }, + { + "epoch": 5.864333947206875, + "grad_norm": 0.21082650125026703, + "learning_rate": 3.854123033629137e-05, + "loss": 1.7143, + "step": 19106 + }, + { + "epoch": 5.8646408839779, + "grad_norm": 0.1977517306804657, + "learning_rate": 3.853639212379446e-05, + "loss": 1.7482, + "step": 19107 + }, + { + "epoch": 5.864947820748926, + "grad_norm": 0.2272191196680069, + "learning_rate": 3.8531554024587655e-05, + "loss": 1.7678, + "step": 19108 + }, + { + "epoch": 5.865254757519951, + "grad_norm": 0.22765736281871796, + "learning_rate": 3.852671603871876e-05, + "loss": 1.7721, + "step": 19109 + }, + { + "epoch": 5.865561694290976, + "grad_norm": 0.20707197487354279, + "learning_rate": 3.852187816623556e-05, + "loss": 1.7509, + "step": 19110 + }, + { + "epoch": 5.865868631062002, + "grad_norm": 0.2699931561946869, + "learning_rate": 3.851704040718591e-05, + "loss": 1.6845, + "step": 19111 + }, + { + "epoch": 5.866175567833026, + "grad_norm": 0.24394196271896362, + "learning_rate": 
3.8512202761617575e-05, + "loss": 1.6895, + "step": 19112 + }, + { + "epoch": 5.866482504604051, + "grad_norm": 0.21921835839748383, + "learning_rate": 3.850736522957841e-05, + "loss": 1.7739, + "step": 19113 + }, + { + "epoch": 5.866789441375077, + "grad_norm": 0.2268306314945221, + "learning_rate": 3.8502527811116175e-05, + "loss": 1.7773, + "step": 19114 + }, + { + "epoch": 5.867096378146102, + "grad_norm": 0.2165728509426117, + "learning_rate": 3.84976905062787e-05, + "loss": 1.7567, + "step": 19115 + }, + { + "epoch": 5.867403314917127, + "grad_norm": 0.188106968998909, + "learning_rate": 3.8492853315113804e-05, + "loss": 1.7209, + "step": 19116 + }, + { + "epoch": 5.867710251688152, + "grad_norm": 0.20750530064105988, + "learning_rate": 3.848801623766927e-05, + "loss": 1.6999, + "step": 19117 + }, + { + "epoch": 5.868017188459177, + "grad_norm": 0.2475438266992569, + "learning_rate": 3.84831792739929e-05, + "loss": 1.7535, + "step": 19118 + }, + { + "epoch": 5.8683241252302025, + "grad_norm": 0.23291872441768646, + "learning_rate": 3.847834242413252e-05, + "loss": 1.7137, + "step": 19119 + }, + { + "epoch": 5.868631062001228, + "grad_norm": 0.18381048738956451, + "learning_rate": 3.847350568813589e-05, + "loss": 1.7657, + "step": 19120 + }, + { + "epoch": 5.868937998772253, + "grad_norm": 0.19330385327339172, + "learning_rate": 3.8468669066050845e-05, + "loss": 1.7109, + "step": 19121 + }, + { + "epoch": 5.8692449355432785, + "grad_norm": 0.22503000497817993, + "learning_rate": 3.846383255792517e-05, + "loss": 1.7668, + "step": 19122 + }, + { + "epoch": 5.869551872314303, + "grad_norm": 0.2147306352853775, + "learning_rate": 3.845899616380667e-05, + "loss": 1.74, + "step": 19123 + }, + { + "epoch": 5.869858809085328, + "grad_norm": 0.18493011593818665, + "learning_rate": 3.845415988374312e-05, + "loss": 1.7066, + "step": 19124 + }, + { + "epoch": 5.870165745856354, + "grad_norm": 0.28276753425598145, + "learning_rate": 3.844932371778235e-05, + "loss": 1.7925, 
+ "step": 19125 + }, + { + "epoch": 5.870472682627379, + "grad_norm": 0.23486676812171936, + "learning_rate": 3.844448766597212e-05, + "loss": 1.8216, + "step": 19126 + }, + { + "epoch": 5.870779619398404, + "grad_norm": 0.24370723962783813, + "learning_rate": 3.843965172836024e-05, + "loss": 1.709, + "step": 19127 + }, + { + "epoch": 5.871086556169429, + "grad_norm": 0.22540852427482605, + "learning_rate": 3.843481590499449e-05, + "loss": 1.7608, + "step": 19128 + }, + { + "epoch": 5.871393492940454, + "grad_norm": 0.20578467845916748, + "learning_rate": 3.8429980195922666e-05, + "loss": 1.7288, + "step": 19129 + }, + { + "epoch": 5.871700429711479, + "grad_norm": 0.265325129032135, + "learning_rate": 3.842514460119258e-05, + "loss": 1.7711, + "step": 19130 + }, + { + "epoch": 5.872007366482505, + "grad_norm": 0.20076121389865875, + "learning_rate": 3.842030912085197e-05, + "loss": 1.6764, + "step": 19131 + }, + { + "epoch": 5.87231430325353, + "grad_norm": 0.23941899836063385, + "learning_rate": 3.841547375494868e-05, + "loss": 1.8157, + "step": 19132 + }, + { + "epoch": 5.872621240024555, + "grad_norm": 0.23184041678905487, + "learning_rate": 3.841063850353044e-05, + "loss": 1.6948, + "step": 19133 + }, + { + "epoch": 5.87292817679558, + "grad_norm": 0.20299546420574188, + "learning_rate": 3.840580336664508e-05, + "loss": 1.7812, + "step": 19134 + }, + { + "epoch": 5.873235113566605, + "grad_norm": 0.24654673039913177, + "learning_rate": 3.840096834434036e-05, + "loss": 1.7999, + "step": 19135 + }, + { + "epoch": 5.8735420503376305, + "grad_norm": 0.21144285798072815, + "learning_rate": 3.8396133436664085e-05, + "loss": 1.7033, + "step": 19136 + }, + { + "epoch": 5.873848987108656, + "grad_norm": 0.22186708450317383, + "learning_rate": 3.8391298643663997e-05, + "loss": 1.7292, + "step": 19137 + }, + { + "epoch": 5.87415592387968, + "grad_norm": 0.21017275750637054, + "learning_rate": 3.838646396538793e-05, + "loss": 1.6989, + "step": 19138 + }, + { + "epoch": 
5.874462860650706, + "grad_norm": 0.19430704414844513, + "learning_rate": 3.83816294018836e-05, + "loss": 1.7446, + "step": 19139 + }, + { + "epoch": 5.874769797421731, + "grad_norm": 0.25048547983169556, + "learning_rate": 3.8376794953198836e-05, + "loss": 1.7358, + "step": 19140 + }, + { + "epoch": 5.875076734192756, + "grad_norm": 0.21869583427906036, + "learning_rate": 3.8371960619381406e-05, + "loss": 1.7017, + "step": 19141 + }, + { + "epoch": 5.875383670963782, + "grad_norm": 0.2053002119064331, + "learning_rate": 3.836712640047905e-05, + "loss": 1.7077, + "step": 19142 + }, + { + "epoch": 5.875690607734807, + "grad_norm": 0.2222425490617752, + "learning_rate": 3.83622922965396e-05, + "loss": 1.7259, + "step": 19143 + }, + { + "epoch": 5.8759975445058314, + "grad_norm": 0.20682495832443237, + "learning_rate": 3.8357458307610774e-05, + "loss": 1.7597, + "step": 19144 + }, + { + "epoch": 5.876304481276857, + "grad_norm": 0.2001802772283554, + "learning_rate": 3.835262443374038e-05, + "loss": 1.7546, + "step": 19145 + }, + { + "epoch": 5.876611418047882, + "grad_norm": 0.20499882102012634, + "learning_rate": 3.8347790674976166e-05, + "loss": 1.6741, + "step": 19146 + }, + { + "epoch": 5.8769183548189075, + "grad_norm": 0.17830348014831543, + "learning_rate": 3.834295703136593e-05, + "loss": 1.7067, + "step": 19147 + }, + { + "epoch": 5.877225291589933, + "grad_norm": 0.25055429339408875, + "learning_rate": 3.833812350295741e-05, + "loss": 1.753, + "step": 19148 + }, + { + "epoch": 5.877532228360957, + "grad_norm": 0.19037213921546936, + "learning_rate": 3.8333290089798415e-05, + "loss": 1.7336, + "step": 19149 + }, + { + "epoch": 5.877839165131983, + "grad_norm": 0.18041233718395233, + "learning_rate": 3.8328456791936656e-05, + "loss": 1.7172, + "step": 19150 + }, + { + "epoch": 5.878146101903008, + "grad_norm": 0.21531802415847778, + "learning_rate": 3.832362360941994e-05, + "loss": 1.7328, + "step": 19151 + }, + { + "epoch": 5.878453038674033, + "grad_norm": 
0.23101283609867096, + "learning_rate": 3.831879054229601e-05, + "loss": 1.7548, + "step": 19152 + }, + { + "epoch": 5.878759975445059, + "grad_norm": 0.19029635190963745, + "learning_rate": 3.831395759061266e-05, + "loss": 1.6852, + "step": 19153 + }, + { + "epoch": 5.879066912216084, + "grad_norm": 0.20305602252483368, + "learning_rate": 3.830912475441761e-05, + "loss": 1.6982, + "step": 19154 + }, + { + "epoch": 5.879373848987108, + "grad_norm": 0.19752593338489532, + "learning_rate": 3.830429203375866e-05, + "loss": 1.7726, + "step": 19155 + }, + { + "epoch": 5.879680785758134, + "grad_norm": 0.2109406590461731, + "learning_rate": 3.8299459428683526e-05, + "loss": 1.7629, + "step": 19156 + }, + { + "epoch": 5.879987722529159, + "grad_norm": 0.19448740780353546, + "learning_rate": 3.829462693924001e-05, + "loss": 1.6981, + "step": 19157 + }, + { + "epoch": 5.880294659300184, + "grad_norm": 0.19344154000282288, + "learning_rate": 3.828979456547586e-05, + "loss": 1.6822, + "step": 19158 + }, + { + "epoch": 5.88060159607121, + "grad_norm": 0.24466145038604736, + "learning_rate": 3.82849623074388e-05, + "loss": 1.7575, + "step": 19159 + }, + { + "epoch": 5.880908532842234, + "grad_norm": 0.20174476504325867, + "learning_rate": 3.828013016517663e-05, + "loss": 1.7267, + "step": 19160 + }, + { + "epoch": 5.8812154696132595, + "grad_norm": 0.23560820519924164, + "learning_rate": 3.827529813873706e-05, + "loss": 1.7125, + "step": 19161 + }, + { + "epoch": 5.881522406384285, + "grad_norm": 0.18118280172348022, + "learning_rate": 3.827046622816789e-05, + "loss": 1.7436, + "step": 19162 + }, + { + "epoch": 5.88182934315531, + "grad_norm": 0.27250152826309204, + "learning_rate": 3.8265634433516824e-05, + "loss": 1.7249, + "step": 19163 + }, + { + "epoch": 5.8821362799263355, + "grad_norm": 0.23510734736919403, + "learning_rate": 3.826080275483166e-05, + "loss": 1.7502, + "step": 19164 + }, + { + "epoch": 5.882443216697361, + "grad_norm": 0.22708909213542938, + 
"learning_rate": 3.82559711921601e-05, + "loss": 1.7478, + "step": 19165 + }, + { + "epoch": 5.882750153468385, + "grad_norm": 0.292584627866745, + "learning_rate": 3.825113974554995e-05, + "loss": 1.6757, + "step": 19166 + }, + { + "epoch": 5.883057090239411, + "grad_norm": 0.22186334431171417, + "learning_rate": 3.8246308415048884e-05, + "loss": 1.7061, + "step": 19167 + }, + { + "epoch": 5.883364027010436, + "grad_norm": 0.23995520174503326, + "learning_rate": 3.8241477200704714e-05, + "loss": 1.6962, + "step": 19168 + }, + { + "epoch": 5.883670963781461, + "grad_norm": 0.25545260310173035, + "learning_rate": 3.823664610256513e-05, + "loss": 1.7582, + "step": 19169 + }, + { + "epoch": 5.883977900552486, + "grad_norm": 0.2209167629480362, + "learning_rate": 3.823181512067794e-05, + "loss": 1.7212, + "step": 19170 + }, + { + "epoch": 5.884284837323511, + "grad_norm": 0.24626508355140686, + "learning_rate": 3.8226984255090824e-05, + "loss": 1.7356, + "step": 19171 + }, + { + "epoch": 5.884591774094536, + "grad_norm": 0.22982320189476013, + "learning_rate": 3.822215350585157e-05, + "loss": 1.7516, + "step": 19172 + }, + { + "epoch": 5.884898710865562, + "grad_norm": 0.19458627700805664, + "learning_rate": 3.8217322873007874e-05, + "loss": 1.7097, + "step": 19173 + }, + { + "epoch": 5.885205647636587, + "grad_norm": 0.2030913233757019, + "learning_rate": 3.8212492356607524e-05, + "loss": 1.7273, + "step": 19174 + }, + { + "epoch": 5.885512584407612, + "grad_norm": 0.20174767076969147, + "learning_rate": 3.820766195669823e-05, + "loss": 1.7167, + "step": 19175 + }, + { + "epoch": 5.885819521178637, + "grad_norm": 0.22572553157806396, + "learning_rate": 3.820283167332772e-05, + "loss": 1.8034, + "step": 19176 + }, + { + "epoch": 5.886126457949662, + "grad_norm": 0.24423041939735413, + "learning_rate": 3.819800150654376e-05, + "loss": 1.7188, + "step": 19177 + }, + { + "epoch": 5.8864333947206875, + "grad_norm": 0.20805509388446808, + "learning_rate": 
3.819317145639404e-05, + "loss": 1.7252, + "step": 19178 + }, + { + "epoch": 5.886740331491713, + "grad_norm": 0.2731400728225708, + "learning_rate": 3.8188341522926334e-05, + "loss": 1.7778, + "step": 19179 + }, + { + "epoch": 5.887047268262738, + "grad_norm": 0.2604491412639618, + "learning_rate": 3.818351170618835e-05, + "loss": 1.7524, + "step": 19180 + }, + { + "epoch": 5.887354205033763, + "grad_norm": 0.20043112337589264, + "learning_rate": 3.817868200622785e-05, + "loss": 1.7176, + "step": 19181 + }, + { + "epoch": 5.887661141804788, + "grad_norm": 0.2224988341331482, + "learning_rate": 3.817385242309253e-05, + "loss": 1.7267, + "step": 19182 + }, + { + "epoch": 5.887968078575813, + "grad_norm": 0.24603894352912903, + "learning_rate": 3.8169022956830135e-05, + "loss": 1.716, + "step": 19183 + }, + { + "epoch": 5.888275015346839, + "grad_norm": 0.19959969818592072, + "learning_rate": 3.816419360748839e-05, + "loss": 1.7461, + "step": 19184 + }, + { + "epoch": 5.888581952117864, + "grad_norm": 0.21907947957515717, + "learning_rate": 3.815936437511501e-05, + "loss": 1.6982, + "step": 19185 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.1920289248228073, + "learning_rate": 3.8154535259757735e-05, + "loss": 1.7213, + "step": 19186 + }, + { + "epoch": 5.889195825659914, + "grad_norm": 0.21930737793445587, + "learning_rate": 3.81497062614643e-05, + "loss": 1.7389, + "step": 19187 + }, + { + "epoch": 5.889502762430939, + "grad_norm": 0.1972137838602066, + "learning_rate": 3.814487738028239e-05, + "loss": 1.7317, + "step": 19188 + }, + { + "epoch": 5.889809699201964, + "grad_norm": 0.20000529289245605, + "learning_rate": 3.8140048616259785e-05, + "loss": 1.7148, + "step": 19189 + }, + { + "epoch": 5.89011663597299, + "grad_norm": 0.18828663229942322, + "learning_rate": 3.8135219969444135e-05, + "loss": 1.725, + "step": 19190 + }, + { + "epoch": 5.890423572744015, + "grad_norm": 0.2237224131822586, + "learning_rate": 3.8130391439883216e-05, + "loss": 1.7252, 
+ "step": 19191 + }, + { + "epoch": 5.8907305095150395, + "grad_norm": 0.19954712688922882, + "learning_rate": 3.812556302762473e-05, + "loss": 1.7071, + "step": 19192 + }, + { + "epoch": 5.891037446286065, + "grad_norm": 0.23509685695171356, + "learning_rate": 3.812073473271637e-05, + "loss": 1.7603, + "step": 19193 + }, + { + "epoch": 5.89134438305709, + "grad_norm": 0.28477707505226135, + "learning_rate": 3.81159065552059e-05, + "loss": 1.8193, + "step": 19194 + }, + { + "epoch": 5.8916513198281155, + "grad_norm": 0.1936045140028, + "learning_rate": 3.811107849514098e-05, + "loss": 1.7438, + "step": 19195 + }, + { + "epoch": 5.891958256599141, + "grad_norm": 0.288253515958786, + "learning_rate": 3.810625055256936e-05, + "loss": 1.8042, + "step": 19196 + }, + { + "epoch": 5.892265193370166, + "grad_norm": 0.19256485998630524, + "learning_rate": 3.810142272753873e-05, + "loss": 1.6997, + "step": 19197 + }, + { + "epoch": 5.892572130141191, + "grad_norm": 0.2823546826839447, + "learning_rate": 3.809659502009684e-05, + "loss": 1.7133, + "step": 19198 + }, + { + "epoch": 5.892879066912216, + "grad_norm": 0.25116851925849915, + "learning_rate": 3.809176743029136e-05, + "loss": 1.7402, + "step": 19199 + }, + { + "epoch": 5.893186003683241, + "grad_norm": 0.19840675592422485, + "learning_rate": 3.808693995817003e-05, + "loss": 1.7009, + "step": 19200 + }, + { + "epoch": 5.893492940454267, + "grad_norm": 0.2703700363636017, + "learning_rate": 3.808211260378051e-05, + "loss": 1.741, + "step": 19201 + }, + { + "epoch": 5.893799877225292, + "grad_norm": 0.25683698058128357, + "learning_rate": 3.807728536717056e-05, + "loss": 1.7431, + "step": 19202 + }, + { + "epoch": 5.894106813996316, + "grad_norm": 0.19033822417259216, + "learning_rate": 3.8072458248387855e-05, + "loss": 1.7423, + "step": 19203 + }, + { + "epoch": 5.894413750767342, + "grad_norm": 0.2771024703979492, + "learning_rate": 3.806763124748012e-05, + "loss": 1.7376, + "step": 19204 + }, + { + "epoch": 
5.894720687538367, + "grad_norm": 0.30265524983406067, + "learning_rate": 3.806280436449504e-05, + "loss": 1.7124, + "step": 19205 + }, + { + "epoch": 5.895027624309392, + "grad_norm": 0.21838776767253876, + "learning_rate": 3.805797759948033e-05, + "loss": 1.7319, + "step": 19206 + }, + { + "epoch": 5.895334561080418, + "grad_norm": 0.22244395315647125, + "learning_rate": 3.805315095248368e-05, + "loss": 1.7034, + "step": 19207 + }, + { + "epoch": 5.895641497851443, + "grad_norm": 0.20621941983699799, + "learning_rate": 3.8048324423552786e-05, + "loss": 1.7231, + "step": 19208 + }, + { + "epoch": 5.8959484346224675, + "grad_norm": 0.23735111951828003, + "learning_rate": 3.804349801273538e-05, + "loss": 1.7484, + "step": 19209 + }, + { + "epoch": 5.896255371393493, + "grad_norm": 0.33221447467803955, + "learning_rate": 3.803867172007911e-05, + "loss": 1.7782, + "step": 19210 + }, + { + "epoch": 5.896562308164518, + "grad_norm": 0.20859810709953308, + "learning_rate": 3.803384554563172e-05, + "loss": 1.688, + "step": 19211 + }, + { + "epoch": 5.8968692449355435, + "grad_norm": 0.25731268525123596, + "learning_rate": 3.8029019489440855e-05, + "loss": 1.7463, + "step": 19212 + }, + { + "epoch": 5.897176181706568, + "grad_norm": 0.26556700468063354, + "learning_rate": 3.802419355155425e-05, + "loss": 1.7251, + "step": 19213 + }, + { + "epoch": 5.897483118477593, + "grad_norm": 0.20397205650806427, + "learning_rate": 3.801936773201957e-05, + "loss": 1.6785, + "step": 19214 + }, + { + "epoch": 5.897790055248619, + "grad_norm": 0.2198234349489212, + "learning_rate": 3.8014542030884544e-05, + "loss": 1.7608, + "step": 19215 + }, + { + "epoch": 5.898096992019644, + "grad_norm": 0.22619546949863434, + "learning_rate": 3.800971644819681e-05, + "loss": 1.8034, + "step": 19216 + }, + { + "epoch": 5.898403928790669, + "grad_norm": 0.22074444591999054, + "learning_rate": 3.800489098400412e-05, + "loss": 1.777, + "step": 19217 + }, + { + "epoch": 5.898710865561695, + "grad_norm": 
0.2555946707725525, + "learning_rate": 3.80000656383541e-05, + "loss": 1.7578, + "step": 19218 + }, + { + "epoch": 5.899017802332719, + "grad_norm": 0.2130863517522812, + "learning_rate": 3.7995240411294474e-05, + "loss": 1.7312, + "step": 19219 + }, + { + "epoch": 5.899324739103744, + "grad_norm": 0.2574099898338318, + "learning_rate": 3.799041530287291e-05, + "loss": 1.7509, + "step": 19220 + }, + { + "epoch": 5.89963167587477, + "grad_norm": 0.2556573152542114, + "learning_rate": 3.798559031313712e-05, + "loss": 1.7624, + "step": 19221 + }, + { + "epoch": 5.899938612645795, + "grad_norm": 0.19909335672855377, + "learning_rate": 3.798076544213475e-05, + "loss": 1.7466, + "step": 19222 + }, + { + "epoch": 5.9002455494168204, + "grad_norm": 0.19832594692707062, + "learning_rate": 3.7975940689913526e-05, + "loss": 1.6896, + "step": 19223 + }, + { + "epoch": 5.900552486187845, + "grad_norm": 0.18473665416240692, + "learning_rate": 3.7971116056521076e-05, + "loss": 1.7167, + "step": 19224 + }, + { + "epoch": 5.90085942295887, + "grad_norm": 0.21106892824172974, + "learning_rate": 3.796629154200512e-05, + "loss": 1.8071, + "step": 19225 + }, + { + "epoch": 5.901166359729896, + "grad_norm": 0.20903728902339935, + "learning_rate": 3.796146714641333e-05, + "loss": 1.6946, + "step": 19226 + }, + { + "epoch": 5.901473296500921, + "grad_norm": 0.21518728137016296, + "learning_rate": 3.795664286979336e-05, + "loss": 1.6899, + "step": 19227 + }, + { + "epoch": 5.901780233271946, + "grad_norm": 0.1948135644197464, + "learning_rate": 3.7951818712192926e-05, + "loss": 1.7568, + "step": 19228 + }, + { + "epoch": 5.902087170042972, + "grad_norm": 0.2222091257572174, + "learning_rate": 3.7946994673659667e-05, + "loss": 1.8118, + "step": 19229 + }, + { + "epoch": 5.902394106813996, + "grad_norm": 0.2173513025045395, + "learning_rate": 3.794217075424127e-05, + "loss": 1.7194, + "step": 19230 + }, + { + "epoch": 5.902701043585021, + "grad_norm": 0.2026323676109314, + "learning_rate": 
3.79373469539854e-05, + "loss": 1.6944, + "step": 19231 + }, + { + "epoch": 5.903007980356047, + "grad_norm": 0.22178098559379578, + "learning_rate": 3.7932523272939765e-05, + "loss": 1.7328, + "step": 19232 + }, + { + "epoch": 5.903314917127072, + "grad_norm": 0.22846719622612, + "learning_rate": 3.792769971115198e-05, + "loss": 1.8065, + "step": 19233 + }, + { + "epoch": 5.903621853898097, + "grad_norm": 0.2086053490638733, + "learning_rate": 3.792287626866977e-05, + "loss": 1.7511, + "step": 19234 + }, + { + "epoch": 5.903928790669122, + "grad_norm": 0.22444705665111542, + "learning_rate": 3.791805294554075e-05, + "loss": 1.742, + "step": 19235 + }, + { + "epoch": 5.904235727440147, + "grad_norm": 0.24630236625671387, + "learning_rate": 3.7913229741812625e-05, + "loss": 1.7531, + "step": 19236 + }, + { + "epoch": 5.9045426642111725, + "grad_norm": 0.2618274986743927, + "learning_rate": 3.7908406657533036e-05, + "loss": 1.7387, + "step": 19237 + }, + { + "epoch": 5.904849600982198, + "grad_norm": 0.25871509313583374, + "learning_rate": 3.790358369274968e-05, + "loss": 1.7822, + "step": 19238 + }, + { + "epoch": 5.905156537753223, + "grad_norm": 0.22675062716007233, + "learning_rate": 3.789876084751018e-05, + "loss": 1.7788, + "step": 19239 + }, + { + "epoch": 5.9054634745242485, + "grad_norm": 0.26623663306236267, + "learning_rate": 3.789393812186224e-05, + "loss": 1.7092, + "step": 19240 + }, + { + "epoch": 5.905770411295273, + "grad_norm": 0.19448868930339813, + "learning_rate": 3.788911551585348e-05, + "loss": 1.7164, + "step": 19241 + }, + { + "epoch": 5.906077348066298, + "grad_norm": 0.22451938688755035, + "learning_rate": 3.788429302953158e-05, + "loss": 1.667, + "step": 19242 + }, + { + "epoch": 5.906384284837324, + "grad_norm": 0.2323608547449112, + "learning_rate": 3.7879470662944214e-05, + "loss": 1.7992, + "step": 19243 + }, + { + "epoch": 5.906691221608349, + "grad_norm": 0.2508258819580078, + "learning_rate": 3.7874648416139e-05, + "loss": 1.7681, + 
"step": 19244 + }, + { + "epoch": 5.906998158379373, + "grad_norm": 0.22333547472953796, + "learning_rate": 3.786982628916364e-05, + "loss": 1.7006, + "step": 19245 + }, + { + "epoch": 5.907305095150399, + "grad_norm": 0.19816327095031738, + "learning_rate": 3.786500428206575e-05, + "loss": 1.7458, + "step": 19246 + }, + { + "epoch": 5.907612031921424, + "grad_norm": 0.2047683447599411, + "learning_rate": 3.7860182394893006e-05, + "loss": 1.7385, + "step": 19247 + }, + { + "epoch": 5.907918968692449, + "grad_norm": 0.2124621719121933, + "learning_rate": 3.785536062769304e-05, + "loss": 1.7373, + "step": 19248 + }, + { + "epoch": 5.908225905463475, + "grad_norm": 0.200453981757164, + "learning_rate": 3.785053898051355e-05, + "loss": 1.7754, + "step": 19249 + }, + { + "epoch": 5.9085328422345, + "grad_norm": 0.19543224573135376, + "learning_rate": 3.784571745340212e-05, + "loss": 1.724, + "step": 19250 + }, + { + "epoch": 5.9088397790055245, + "grad_norm": 0.17079658806324005, + "learning_rate": 3.784089604640647e-05, + "loss": 1.6843, + "step": 19251 + }, + { + "epoch": 5.90914671577655, + "grad_norm": 0.22792236506938934, + "learning_rate": 3.783607475957418e-05, + "loss": 1.7442, + "step": 19252 + }, + { + "epoch": 5.909453652547575, + "grad_norm": 0.20699752867221832, + "learning_rate": 3.783125359295294e-05, + "loss": 1.7868, + "step": 19253 + }, + { + "epoch": 5.9097605893186005, + "grad_norm": 0.2156144678592682, + "learning_rate": 3.782643254659038e-05, + "loss": 1.7443, + "step": 19254 + }, + { + "epoch": 5.910067526089626, + "grad_norm": 0.2021300345659256, + "learning_rate": 3.782161162053417e-05, + "loss": 1.7749, + "step": 19255 + }, + { + "epoch": 5.91037446286065, + "grad_norm": 0.17613129317760468, + "learning_rate": 3.7816790814831905e-05, + "loss": 1.7001, + "step": 19256 + }, + { + "epoch": 5.910681399631676, + "grad_norm": 0.18911564350128174, + "learning_rate": 3.781197012953128e-05, + "loss": 1.6817, + "step": 19257 + }, + { + "epoch": 
5.910988336402701, + "grad_norm": 0.18920689821243286, + "learning_rate": 3.780714956467989e-05, + "loss": 1.7554, + "step": 19258 + }, + { + "epoch": 5.911295273173726, + "grad_norm": 0.22030571103096008, + "learning_rate": 3.7802329120325396e-05, + "loss": 1.7554, + "step": 19259 + }, + { + "epoch": 5.911602209944752, + "grad_norm": 0.21164962649345398, + "learning_rate": 3.779750879651545e-05, + "loss": 1.74, + "step": 19260 + }, + { + "epoch": 5.911909146715777, + "grad_norm": 0.2205103188753128, + "learning_rate": 3.779268859329766e-05, + "loss": 1.7424, + "step": 19261 + }, + { + "epoch": 5.912216083486801, + "grad_norm": 0.19262658059597015, + "learning_rate": 3.7787868510719685e-05, + "loss": 1.7157, + "step": 19262 + }, + { + "epoch": 5.912523020257827, + "grad_norm": 0.19583287835121155, + "learning_rate": 3.778304854882914e-05, + "loss": 1.7343, + "step": 19263 + }, + { + "epoch": 5.912829957028852, + "grad_norm": 0.18275529146194458, + "learning_rate": 3.777822870767368e-05, + "loss": 1.6938, + "step": 19264 + }, + { + "epoch": 5.913136893799877, + "grad_norm": 0.21268916130065918, + "learning_rate": 3.7773408987300914e-05, + "loss": 1.7546, + "step": 19265 + }, + { + "epoch": 5.913443830570903, + "grad_norm": 0.20878887176513672, + "learning_rate": 3.77685893877585e-05, + "loss": 1.8109, + "step": 19266 + }, + { + "epoch": 5.913750767341927, + "grad_norm": 0.2326175421476364, + "learning_rate": 3.776376990909404e-05, + "loss": 1.7248, + "step": 19267 + }, + { + "epoch": 5.9140577041129525, + "grad_norm": 0.28189611434936523, + "learning_rate": 3.7758950551355204e-05, + "loss": 1.7796, + "step": 19268 + }, + { + "epoch": 5.914364640883978, + "grad_norm": 0.1922682821750641, + "learning_rate": 3.775413131458957e-05, + "loss": 1.7096, + "step": 19269 + }, + { + "epoch": 5.914671577655003, + "grad_norm": 0.2839193642139435, + "learning_rate": 3.774931219884479e-05, + "loss": 1.7341, + "step": 19270 + }, + { + "epoch": 5.9149785144260285, + "grad_norm": 
0.2075256109237671, + "learning_rate": 3.7744493204168495e-05, + "loss": 1.7565, + "step": 19271 + }, + { + "epoch": 5.915285451197054, + "grad_norm": 0.2780497372150421, + "learning_rate": 3.7739674330608306e-05, + "loss": 1.7186, + "step": 19272 + }, + { + "epoch": 5.915592387968078, + "grad_norm": 0.26129212975502014, + "learning_rate": 3.773485557821182e-05, + "loss": 1.8468, + "step": 19273 + }, + { + "epoch": 5.915899324739104, + "grad_norm": 0.3299194276332855, + "learning_rate": 3.773003694702671e-05, + "loss": 1.7705, + "step": 19274 + }, + { + "epoch": 5.916206261510129, + "grad_norm": 0.3011106848716736, + "learning_rate": 3.772521843710054e-05, + "loss": 1.748, + "step": 19275 + }, + { + "epoch": 5.916513198281154, + "grad_norm": 0.21370603144168854, + "learning_rate": 3.7720400048480966e-05, + "loss": 1.7709, + "step": 19276 + }, + { + "epoch": 5.91682013505218, + "grad_norm": 0.29374879598617554, + "learning_rate": 3.771558178121561e-05, + "loss": 1.6948, + "step": 19277 + }, + { + "epoch": 5.917127071823204, + "grad_norm": 0.2545807659626007, + "learning_rate": 3.771076363535205e-05, + "loss": 1.7974, + "step": 19278 + }, + { + "epoch": 5.917434008594229, + "grad_norm": 0.24210263788700104, + "learning_rate": 3.7705945610937954e-05, + "loss": 1.7438, + "step": 19279 + }, + { + "epoch": 5.917740945365255, + "grad_norm": 0.26224827766418457, + "learning_rate": 3.770112770802088e-05, + "loss": 1.7294, + "step": 19280 + }, + { + "epoch": 5.91804788213628, + "grad_norm": 0.23358991742134094, + "learning_rate": 3.7696309926648486e-05, + "loss": 1.7973, + "step": 19281 + }, + { + "epoch": 5.918354818907305, + "grad_norm": 0.3466563820838928, + "learning_rate": 3.769149226686837e-05, + "loss": 1.784, + "step": 19282 + }, + { + "epoch": 5.918661755678331, + "grad_norm": 0.2416994869709015, + "learning_rate": 3.768667472872814e-05, + "loss": 1.6957, + "step": 19283 + }, + { + "epoch": 5.918968692449355, + "grad_norm": 0.2285085767507553, + "learning_rate": 
3.768185731227539e-05, + "loss": 1.71, + "step": 19284 + }, + { + "epoch": 5.9192756292203805, + "grad_norm": 0.2566430866718292, + "learning_rate": 3.7677040017557775e-05, + "loss": 1.792, + "step": 19285 + }, + { + "epoch": 5.919582565991406, + "grad_norm": 0.21566689014434814, + "learning_rate": 3.767222284462285e-05, + "loss": 1.8085, + "step": 19286 + }, + { + "epoch": 5.919889502762431, + "grad_norm": 0.24078889191150665, + "learning_rate": 3.7667405793518264e-05, + "loss": 1.7221, + "step": 19287 + }, + { + "epoch": 5.920196439533456, + "grad_norm": 0.22127531468868256, + "learning_rate": 3.7662588864291584e-05, + "loss": 1.7173, + "step": 19288 + }, + { + "epoch": 5.920503376304481, + "grad_norm": 0.18165946006774902, + "learning_rate": 3.765777205699045e-05, + "loss": 1.7518, + "step": 19289 + }, + { + "epoch": 5.920810313075506, + "grad_norm": 0.2569290101528168, + "learning_rate": 3.765295537166242e-05, + "loss": 1.7716, + "step": 19290 + }, + { + "epoch": 5.921117249846532, + "grad_norm": 0.19010202586650848, + "learning_rate": 3.764813880835515e-05, + "loss": 1.7146, + "step": 19291 + }, + { + "epoch": 5.921424186617557, + "grad_norm": 0.2882116436958313, + "learning_rate": 3.7643322367116195e-05, + "loss": 1.7677, + "step": 19292 + }, + { + "epoch": 5.921731123388582, + "grad_norm": 0.30711185932159424, + "learning_rate": 3.763850604799319e-05, + "loss": 1.7506, + "step": 19293 + }, + { + "epoch": 5.922038060159607, + "grad_norm": 0.19295164942741394, + "learning_rate": 3.76336898510337e-05, + "loss": 1.715, + "step": 19294 + }, + { + "epoch": 5.922344996930632, + "grad_norm": 0.24849168956279755, + "learning_rate": 3.762887377628533e-05, + "loss": 1.6807, + "step": 19295 + }, + { + "epoch": 5.922651933701657, + "grad_norm": 0.23573634028434753, + "learning_rate": 3.7624057823795696e-05, + "loss": 1.7363, + "step": 19296 + }, + { + "epoch": 5.922958870472683, + "grad_norm": 0.24384267628192902, + "learning_rate": 3.761924199361235e-05, + "loss": 
1.726, + "step": 19297 + }, + { + "epoch": 5.923265807243708, + "grad_norm": 0.2589210271835327, + "learning_rate": 3.761442628578294e-05, + "loss": 1.7771, + "step": 19298 + }, + { + "epoch": 5.9235727440147325, + "grad_norm": 0.23527951538562775, + "learning_rate": 3.760961070035501e-05, + "loss": 1.6561, + "step": 19299 + }, + { + "epoch": 5.923879680785758, + "grad_norm": 0.20286870002746582, + "learning_rate": 3.7604795237376175e-05, + "loss": 1.7464, + "step": 19300 + }, + { + "epoch": 5.924186617556783, + "grad_norm": 0.22705033421516418, + "learning_rate": 3.759997989689401e-05, + "loss": 1.7814, + "step": 19301 + }, + { + "epoch": 5.9244935543278086, + "grad_norm": 0.21780981123447418, + "learning_rate": 3.7595164678956135e-05, + "loss": 1.7601, + "step": 19302 + }, + { + "epoch": 5.924800491098834, + "grad_norm": 0.2030021697282791, + "learning_rate": 3.759034958361009e-05, + "loss": 1.7222, + "step": 19303 + }, + { + "epoch": 5.925107427869859, + "grad_norm": 0.22956500947475433, + "learning_rate": 3.758553461090351e-05, + "loss": 1.674, + "step": 19304 + }, + { + "epoch": 5.925414364640884, + "grad_norm": 0.2368287444114685, + "learning_rate": 3.758071976088392e-05, + "loss": 1.7483, + "step": 19305 + }, + { + "epoch": 5.925721301411909, + "grad_norm": 0.22852632403373718, + "learning_rate": 3.757590503359896e-05, + "loss": 1.7561, + "step": 19306 + }, + { + "epoch": 5.926028238182934, + "grad_norm": 0.21657361090183258, + "learning_rate": 3.757109042909617e-05, + "loss": 1.7814, + "step": 19307 + }, + { + "epoch": 5.92633517495396, + "grad_norm": 0.21996551752090454, + "learning_rate": 3.756627594742317e-05, + "loss": 1.732, + "step": 19308 + }, + { + "epoch": 5.926642111724985, + "grad_norm": 0.23319712281227112, + "learning_rate": 3.75614615886275e-05, + "loss": 1.6807, + "step": 19309 + }, + { + "epoch": 5.9269490484960095, + "grad_norm": 0.17926698923110962, + "learning_rate": 3.755664735275677e-05, + "loss": 1.6925, + "step": 19310 + }, + { + 
"epoch": 5.927255985267035, + "grad_norm": 0.18986931443214417, + "learning_rate": 3.755183323985855e-05, + "loss": 1.7002, + "step": 19311 + }, + { + "epoch": 5.92756292203806, + "grad_norm": 0.18753086030483246, + "learning_rate": 3.7547019249980385e-05, + "loss": 1.695, + "step": 19312 + }, + { + "epoch": 5.9278698588090855, + "grad_norm": 0.21354973316192627, + "learning_rate": 3.7542205383169904e-05, + "loss": 1.6629, + "step": 19313 + }, + { + "epoch": 5.928176795580111, + "grad_norm": 0.19713245332241058, + "learning_rate": 3.753739163947463e-05, + "loss": 1.707, + "step": 19314 + }, + { + "epoch": 5.928483732351136, + "grad_norm": 0.2122458517551422, + "learning_rate": 3.753257801894217e-05, + "loss": 1.7309, + "step": 19315 + }, + { + "epoch": 5.928790669122161, + "grad_norm": 0.20360666513442993, + "learning_rate": 3.7527764521620065e-05, + "loss": 1.6861, + "step": 19316 + }, + { + "epoch": 5.929097605893186, + "grad_norm": 0.2652932405471802, + "learning_rate": 3.752295114755592e-05, + "loss": 1.7662, + "step": 19317 + }, + { + "epoch": 5.929404542664211, + "grad_norm": 0.18292152881622314, + "learning_rate": 3.751813789679726e-05, + "loss": 1.6691, + "step": 19318 + }, + { + "epoch": 5.929711479435237, + "grad_norm": 0.25630465149879456, + "learning_rate": 3.75133247693917e-05, + "loss": 1.7647, + "step": 19319 + }, + { + "epoch": 5.930018416206261, + "grad_norm": 0.2463291883468628, + "learning_rate": 3.750851176538677e-05, + "loss": 1.7252, + "step": 19320 + }, + { + "epoch": 5.930325352977286, + "grad_norm": 0.19977931678295135, + "learning_rate": 3.750369888483007e-05, + "loss": 1.7694, + "step": 19321 + }, + { + "epoch": 5.930632289748312, + "grad_norm": 0.19523118436336517, + "learning_rate": 3.7498886127769116e-05, + "loss": 1.7095, + "step": 19322 + }, + { + "epoch": 5.930939226519337, + "grad_norm": 0.19273912906646729, + "learning_rate": 3.749407349425151e-05, + "loss": 1.7009, + "step": 19323 + }, + { + "epoch": 5.931246163290362, + 
"grad_norm": 0.2419402152299881, + "learning_rate": 3.748926098432479e-05, + "loss": 1.7167, + "step": 19324 + }, + { + "epoch": 5.931553100061388, + "grad_norm": 0.22429771721363068, + "learning_rate": 3.7484448598036534e-05, + "loss": 1.6957, + "step": 19325 + }, + { + "epoch": 5.931860036832412, + "grad_norm": 0.23211807012557983, + "learning_rate": 3.747963633543429e-05, + "loss": 1.767, + "step": 19326 + }, + { + "epoch": 5.9321669736034375, + "grad_norm": 0.23204533755779266, + "learning_rate": 3.7474824196565625e-05, + "loss": 1.7405, + "step": 19327 + }, + { + "epoch": 5.932473910374463, + "grad_norm": 0.24068887531757355, + "learning_rate": 3.747001218147809e-05, + "loss": 1.7539, + "step": 19328 + }, + { + "epoch": 5.932780847145488, + "grad_norm": 0.18140049278736115, + "learning_rate": 3.746520029021922e-05, + "loss": 1.6956, + "step": 19329 + }, + { + "epoch": 5.9330877839165135, + "grad_norm": 0.28421929478645325, + "learning_rate": 3.746038852283661e-05, + "loss": 1.8539, + "step": 19330 + }, + { + "epoch": 5.933394720687538, + "grad_norm": 0.21984805166721344, + "learning_rate": 3.745557687937777e-05, + "loss": 1.7469, + "step": 19331 + }, + { + "epoch": 5.933701657458563, + "grad_norm": 0.2500358819961548, + "learning_rate": 3.7450765359890294e-05, + "loss": 1.7184, + "step": 19332 + }, + { + "epoch": 5.934008594229589, + "grad_norm": 0.2608816623687744, + "learning_rate": 3.744595396442169e-05, + "loss": 1.6825, + "step": 19333 + }, + { + "epoch": 5.934315531000614, + "grad_norm": 0.20359274744987488, + "learning_rate": 3.7441142693019526e-05, + "loss": 1.7535, + "step": 19334 + }, + { + "epoch": 5.934622467771639, + "grad_norm": 0.24795760214328766, + "learning_rate": 3.743633154573135e-05, + "loss": 1.7829, + "step": 19335 + }, + { + "epoch": 5.934929404542665, + "grad_norm": 0.20762503147125244, + "learning_rate": 3.7431520522604736e-05, + "loss": 1.7657, + "step": 19336 + }, + { + "epoch": 5.935236341313689, + "grad_norm": 0.24349527060985565, 
+ "learning_rate": 3.7426709623687174e-05, + "loss": 1.7037, + "step": 19337 + }, + { + "epoch": 5.935543278084714, + "grad_norm": 0.2138780951499939, + "learning_rate": 3.742189884902626e-05, + "loss": 1.7302, + "step": 19338 + }, + { + "epoch": 5.93585021485574, + "grad_norm": 0.24776574969291687, + "learning_rate": 3.741708819866949e-05, + "loss": 1.7293, + "step": 19339 + }, + { + "epoch": 5.936157151626765, + "grad_norm": 0.297888845205307, + "learning_rate": 3.7412277672664444e-05, + "loss": 1.8341, + "step": 19340 + }, + { + "epoch": 5.93646408839779, + "grad_norm": 0.2811104953289032, + "learning_rate": 3.740746727105864e-05, + "loss": 1.7188, + "step": 19341 + }, + { + "epoch": 5.936771025168815, + "grad_norm": 0.37908127903938293, + "learning_rate": 3.740265699389964e-05, + "loss": 1.765, + "step": 19342 + }, + { + "epoch": 5.93707796193984, + "grad_norm": 0.24403691291809082, + "learning_rate": 3.739784684123495e-05, + "loss": 1.6897, + "step": 19343 + }, + { + "epoch": 5.9373848987108655, + "grad_norm": 0.2393181174993515, + "learning_rate": 3.7393036813112135e-05, + "loss": 1.6843, + "step": 19344 + }, + { + "epoch": 5.937691835481891, + "grad_norm": 0.2927580177783966, + "learning_rate": 3.738822690957872e-05, + "loss": 1.6946, + "step": 19345 + }, + { + "epoch": 5.937998772252916, + "grad_norm": 0.23423373699188232, + "learning_rate": 3.738341713068223e-05, + "loss": 1.7409, + "step": 19346 + }, + { + "epoch": 5.9383057090239415, + "grad_norm": 0.2544272840023041, + "learning_rate": 3.7378607476470216e-05, + "loss": 1.698, + "step": 19347 + }, + { + "epoch": 5.938612645794966, + "grad_norm": 0.2120404839515686, + "learning_rate": 3.737379794699019e-05, + "loss": 1.7412, + "step": 19348 + }, + { + "epoch": 5.938919582565991, + "grad_norm": 0.2076033353805542, + "learning_rate": 3.736898854228971e-05, + "loss": 1.752, + "step": 19349 + }, + { + "epoch": 5.939226519337017, + "grad_norm": 0.20122376084327698, + "learning_rate": 3.736417926241627e-05, + 
"loss": 1.6741, + "step": 19350 + }, + { + "epoch": 5.939533456108042, + "grad_norm": 0.1856858730316162, + "learning_rate": 3.735937010741742e-05, + "loss": 1.6959, + "step": 19351 + }, + { + "epoch": 5.939840392879067, + "grad_norm": 0.22192558646202087, + "learning_rate": 3.7354561077340684e-05, + "loss": 1.7597, + "step": 19352 + }, + { + "epoch": 5.940147329650092, + "grad_norm": 0.2653545141220093, + "learning_rate": 3.73497521722336e-05, + "loss": 1.7324, + "step": 19353 + }, + { + "epoch": 5.940454266421117, + "grad_norm": 0.1975676715373993, + "learning_rate": 3.734494339214366e-05, + "loss": 1.6852, + "step": 19354 + }, + { + "epoch": 5.940761203192142, + "grad_norm": 0.26949796080589294, + "learning_rate": 3.734013473711843e-05, + "loss": 1.7695, + "step": 19355 + }, + { + "epoch": 5.941068139963168, + "grad_norm": 0.2272176742553711, + "learning_rate": 3.733532620720539e-05, + "loss": 1.745, + "step": 19356 + }, + { + "epoch": 5.941375076734193, + "grad_norm": 0.25740066170692444, + "learning_rate": 3.733051780245208e-05, + "loss": 1.7701, + "step": 19357 + }, + { + "epoch": 5.941682013505218, + "grad_norm": 0.1910635381937027, + "learning_rate": 3.732570952290602e-05, + "loss": 1.7276, + "step": 19358 + }, + { + "epoch": 5.941988950276243, + "grad_norm": 0.24896447360515594, + "learning_rate": 3.732090136861474e-05, + "loss": 1.7717, + "step": 19359 + }, + { + "epoch": 5.942295887047268, + "grad_norm": 0.20696721971035004, + "learning_rate": 3.731609333962572e-05, + "loss": 1.7053, + "step": 19360 + }, + { + "epoch": 5.9426028238182935, + "grad_norm": 0.18822510540485382, + "learning_rate": 3.731128543598653e-05, + "loss": 1.6869, + "step": 19361 + }, + { + "epoch": 5.942909760589319, + "grad_norm": 0.20757299661636353, + "learning_rate": 3.730647765774464e-05, + "loss": 1.7214, + "step": 19362 + }, + { + "epoch": 5.943216697360343, + "grad_norm": 0.21238471567630768, + "learning_rate": 3.7301670004947574e-05, + "loss": 1.6953, + "step": 19363 + }, + { 
+ "epoch": 5.943523634131369, + "grad_norm": 0.19326119124889374, + "learning_rate": 3.729686247764286e-05, + "loss": 1.7224, + "step": 19364 + }, + { + "epoch": 5.943830570902394, + "grad_norm": 0.17631326615810394, + "learning_rate": 3.729205507587798e-05, + "loss": 1.6471, + "step": 19365 + }, + { + "epoch": 5.944137507673419, + "grad_norm": 0.1741493195295334, + "learning_rate": 3.728724779970048e-05, + "loss": 1.7169, + "step": 19366 + }, + { + "epoch": 5.944444444444445, + "grad_norm": 0.18203428387641907, + "learning_rate": 3.728244064915782e-05, + "loss": 1.7301, + "step": 19367 + }, + { + "epoch": 5.94475138121547, + "grad_norm": 0.2063162475824356, + "learning_rate": 3.727763362429756e-05, + "loss": 1.7274, + "step": 19368 + }, + { + "epoch": 5.945058317986494, + "grad_norm": 0.17239537835121155, + "learning_rate": 3.7272826725167164e-05, + "loss": 1.7194, + "step": 19369 + }, + { + "epoch": 5.94536525475752, + "grad_norm": 0.1910972148180008, + "learning_rate": 3.726801995181418e-05, + "loss": 1.7017, + "step": 19370 + }, + { + "epoch": 5.945672191528545, + "grad_norm": 0.18822111189365387, + "learning_rate": 3.726321330428606e-05, + "loss": 1.723, + "step": 19371 + }, + { + "epoch": 5.94597912829957, + "grad_norm": 0.19680333137512207, + "learning_rate": 3.725840678263035e-05, + "loss": 1.685, + "step": 19372 + }, + { + "epoch": 5.946286065070596, + "grad_norm": 0.19016215205192566, + "learning_rate": 3.725360038689451e-05, + "loss": 1.7148, + "step": 19373 + }, + { + "epoch": 5.94659300184162, + "grad_norm": 0.1992037147283554, + "learning_rate": 3.7248794117126075e-05, + "loss": 1.7278, + "step": 19374 + }, + { + "epoch": 5.9468999386126455, + "grad_norm": 0.1892910748720169, + "learning_rate": 3.724398797337252e-05, + "loss": 1.7093, + "step": 19375 + }, + { + "epoch": 5.947206875383671, + "grad_norm": 0.23379561305046082, + "learning_rate": 3.723918195568137e-05, + "loss": 1.768, + "step": 19376 + }, + { + "epoch": 5.947513812154696, + "grad_norm": 
0.1986081600189209, + "learning_rate": 3.7234376064100104e-05, + "loss": 1.719, + "step": 19377 + }, + { + "epoch": 5.9478207489257215, + "grad_norm": 0.20901642739772797, + "learning_rate": 3.7229570298676195e-05, + "loss": 1.7066, + "step": 19378 + }, + { + "epoch": 5.948127685696747, + "grad_norm": 0.2102847546339035, + "learning_rate": 3.722476465945718e-05, + "loss": 1.7354, + "step": 19379 + }, + { + "epoch": 5.948434622467771, + "grad_norm": 0.1857316792011261, + "learning_rate": 3.72199591464905e-05, + "loss": 1.7159, + "step": 19380 + }, + { + "epoch": 5.948741559238797, + "grad_norm": 0.3045661151409149, + "learning_rate": 3.721515375982371e-05, + "loss": 1.8782, + "step": 19381 + }, + { + "epoch": 5.949048496009822, + "grad_norm": 0.24114711582660675, + "learning_rate": 3.7210348499504236e-05, + "loss": 1.6819, + "step": 19382 + }, + { + "epoch": 5.949355432780847, + "grad_norm": 0.20186996459960938, + "learning_rate": 3.720554336557961e-05, + "loss": 1.8028, + "step": 19383 + }, + { + "epoch": 5.949662369551873, + "grad_norm": 0.25385335087776184, + "learning_rate": 3.7200738358097295e-05, + "loss": 1.7278, + "step": 19384 + }, + { + "epoch": 5.949969306322897, + "grad_norm": 0.23390468955039978, + "learning_rate": 3.719593347710478e-05, + "loss": 1.7775, + "step": 19385 + }, + { + "epoch": 5.9502762430939224, + "grad_norm": 0.22577936947345734, + "learning_rate": 3.719112872264956e-05, + "loss": 1.7567, + "step": 19386 + }, + { + "epoch": 5.950583179864948, + "grad_norm": 0.2540932297706604, + "learning_rate": 3.718632409477912e-05, + "loss": 1.6749, + "step": 19387 + }, + { + "epoch": 5.950890116635973, + "grad_norm": 0.1994820535182953, + "learning_rate": 3.718151959354093e-05, + "loss": 1.6809, + "step": 19388 + }, + { + "epoch": 5.9511970534069984, + "grad_norm": 0.27669432759284973, + "learning_rate": 3.717671521898249e-05, + "loss": 1.7633, + "step": 19389 + }, + { + "epoch": 5.951503990178024, + "grad_norm": 0.2533062994480133, + 
"learning_rate": 3.717191097115125e-05, + "loss": 1.7536, + "step": 19390 + }, + { + "epoch": 5.951810926949048, + "grad_norm": 0.22249148786067963, + "learning_rate": 3.716710685009471e-05, + "loss": 1.7325, + "step": 19391 + }, + { + "epoch": 5.952117863720074, + "grad_norm": 0.3085922598838806, + "learning_rate": 3.716230285586033e-05, + "loss": 1.7046, + "step": 19392 + }, + { + "epoch": 5.952424800491099, + "grad_norm": 0.2591574192047119, + "learning_rate": 3.715749898849562e-05, + "loss": 1.7165, + "step": 19393 + }, + { + "epoch": 5.952731737262124, + "grad_norm": 0.24586348235607147, + "learning_rate": 3.715269524804803e-05, + "loss": 1.749, + "step": 19394 + }, + { + "epoch": 5.953038674033149, + "grad_norm": 0.3424640893936157, + "learning_rate": 3.714789163456502e-05, + "loss": 1.7143, + "step": 19395 + }, + { + "epoch": 5.953345610804174, + "grad_norm": 0.24856910109519958, + "learning_rate": 3.714308814809408e-05, + "loss": 1.868, + "step": 19396 + }, + { + "epoch": 5.953652547575199, + "grad_norm": 0.2758113145828247, + "learning_rate": 3.7138284788682676e-05, + "loss": 1.6722, + "step": 19397 + }, + { + "epoch": 5.953959484346225, + "grad_norm": 0.25981786847114563, + "learning_rate": 3.71334815563783e-05, + "loss": 1.764, + "step": 19398 + }, + { + "epoch": 5.95426642111725, + "grad_norm": 0.27885568141937256, + "learning_rate": 3.7128678451228385e-05, + "loss": 1.7422, + "step": 19399 + }, + { + "epoch": 5.954573357888275, + "grad_norm": 0.2909421920776367, + "learning_rate": 3.712387547328042e-05, + "loss": 1.7862, + "step": 19400 + }, + { + "epoch": 5.9548802946593, + "grad_norm": 0.2288074642419815, + "learning_rate": 3.711907262258185e-05, + "loss": 1.7054, + "step": 19401 + }, + { + "epoch": 5.955187231430325, + "grad_norm": 0.2986883819103241, + "learning_rate": 3.711426989918017e-05, + "loss": 1.7555, + "step": 19402 + }, + { + "epoch": 5.9554941682013505, + "grad_norm": 0.23201194405555725, + "learning_rate": 3.710946730312281e-05, + 
"loss": 1.8186, + "step": 19403 + }, + { + "epoch": 5.955801104972376, + "grad_norm": 0.2609403431415558, + "learning_rate": 3.710466483445728e-05, + "loss": 1.7743, + "step": 19404 + }, + { + "epoch": 5.956108041743401, + "grad_norm": 0.31131741404533386, + "learning_rate": 3.709986249323098e-05, + "loss": 1.7938, + "step": 19405 + }, + { + "epoch": 5.956414978514426, + "grad_norm": 0.20544753968715668, + "learning_rate": 3.7095060279491424e-05, + "loss": 1.7278, + "step": 19406 + }, + { + "epoch": 5.956721915285451, + "grad_norm": 0.3063479959964752, + "learning_rate": 3.709025819328602e-05, + "loss": 1.7544, + "step": 19407 + }, + { + "epoch": 5.957028852056476, + "grad_norm": 0.34868693351745605, + "learning_rate": 3.708545623466227e-05, + "loss": 1.7536, + "step": 19408 + }, + { + "epoch": 5.957335788827502, + "grad_norm": 0.20847822725772858, + "learning_rate": 3.70806544036676e-05, + "loss": 1.7003, + "step": 19409 + }, + { + "epoch": 5.957642725598527, + "grad_norm": 0.3250095844268799, + "learning_rate": 3.707585270034949e-05, + "loss": 1.6815, + "step": 19410 + }, + { + "epoch": 5.957949662369552, + "grad_norm": 0.24854284524917603, + "learning_rate": 3.707105112475539e-05, + "loss": 1.7665, + "step": 19411 + }, + { + "epoch": 5.958256599140577, + "grad_norm": 0.2921455502510071, + "learning_rate": 3.706624967693271e-05, + "loss": 1.7039, + "step": 19412 + }, + { + "epoch": 5.958563535911602, + "grad_norm": 0.2659071385860443, + "learning_rate": 3.706144835692894e-05, + "loss": 1.7641, + "step": 19413 + }, + { + "epoch": 5.958870472682627, + "grad_norm": 0.30329519510269165, + "learning_rate": 3.7056647164791516e-05, + "loss": 1.7962, + "step": 19414 + }, + { + "epoch": 5.959177409453653, + "grad_norm": 0.4023756682872772, + "learning_rate": 3.7051846100567906e-05, + "loss": 1.7624, + "step": 19415 + }, + { + "epoch": 5.959484346224678, + "grad_norm": 0.24528828263282776, + "learning_rate": 3.704704516430553e-05, + "loss": 1.8156, + "step": 19416 + }, + { 
+ "epoch": 5.9597912829957025, + "grad_norm": 0.46833130717277527, + "learning_rate": 3.704224435605186e-05, + "loss": 1.798, + "step": 19417 + }, + { + "epoch": 5.960098219766728, + "grad_norm": 0.26952674984931946, + "learning_rate": 3.70374436758543e-05, + "loss": 1.743, + "step": 19418 + }, + { + "epoch": 5.960405156537753, + "grad_norm": 0.3126155734062195, + "learning_rate": 3.703264312376034e-05, + "loss": 1.8003, + "step": 19419 + }, + { + "epoch": 5.9607120933087785, + "grad_norm": 0.2833348512649536, + "learning_rate": 3.702784269981738e-05, + "loss": 1.7524, + "step": 19420 + }, + { + "epoch": 5.961019030079804, + "grad_norm": 0.25425654649734497, + "learning_rate": 3.7023042404072916e-05, + "loss": 1.7241, + "step": 19421 + }, + { + "epoch": 5.961325966850829, + "grad_norm": 0.29460933804512024, + "learning_rate": 3.701824223657433e-05, + "loss": 1.676, + "step": 19422 + }, + { + "epoch": 5.961632903621854, + "grad_norm": 0.21040670573711395, + "learning_rate": 3.7013442197369094e-05, + "loss": 1.71, + "step": 19423 + }, + { + "epoch": 5.961939840392879, + "grad_norm": 0.3200007379055023, + "learning_rate": 3.7008642286504624e-05, + "loss": 1.7108, + "step": 19424 + }, + { + "epoch": 5.962246777163904, + "grad_norm": 0.20397430658340454, + "learning_rate": 3.7003842504028366e-05, + "loss": 1.7472, + "step": 19425 + }, + { + "epoch": 5.96255371393493, + "grad_norm": 0.24811354279518127, + "learning_rate": 3.699904284998776e-05, + "loss": 1.7116, + "step": 19426 + }, + { + "epoch": 5.962860650705955, + "grad_norm": 0.20980580151081085, + "learning_rate": 3.699424332443023e-05, + "loss": 1.786, + "step": 19427 + }, + { + "epoch": 5.963167587476979, + "grad_norm": 0.1967400163412094, + "learning_rate": 3.698944392740322e-05, + "loss": 1.7141, + "step": 19428 + }, + { + "epoch": 5.963474524248005, + "grad_norm": 0.21907822787761688, + "learning_rate": 3.698464465895414e-05, + "loss": 1.6983, + "step": 19429 + }, + { + "epoch": 5.96378146101903, + 
"grad_norm": 0.19938960671424866, + "learning_rate": 3.697984551913043e-05, + "loss": 1.6811, + "step": 19430 + }, + { + "epoch": 5.964088397790055, + "grad_norm": 0.22280220687389374, + "learning_rate": 3.6975046507979506e-05, + "loss": 1.6838, + "step": 19431 + }, + { + "epoch": 5.964395334561081, + "grad_norm": 0.2530672550201416, + "learning_rate": 3.697024762554883e-05, + "loss": 1.8116, + "step": 19432 + }, + { + "epoch": 5.964702271332106, + "grad_norm": 0.21853135526180267, + "learning_rate": 3.696544887188579e-05, + "loss": 1.692, + "step": 19433 + }, + { + "epoch": 5.9650092081031305, + "grad_norm": 0.18738535046577454, + "learning_rate": 3.696065024703783e-05, + "loss": 1.6971, + "step": 19434 + }, + { + "epoch": 5.965316144874156, + "grad_norm": 0.21199190616607666, + "learning_rate": 3.695585175105236e-05, + "loss": 1.7526, + "step": 19435 + }, + { + "epoch": 5.965623081645181, + "grad_norm": 0.22184251248836517, + "learning_rate": 3.695105338397681e-05, + "loss": 1.8075, + "step": 19436 + }, + { + "epoch": 5.9659300184162065, + "grad_norm": 0.20191644132137299, + "learning_rate": 3.6946255145858605e-05, + "loss": 1.7427, + "step": 19437 + }, + { + "epoch": 5.966236955187231, + "grad_norm": 0.2113640457391739, + "learning_rate": 3.694145703674515e-05, + "loss": 1.7556, + "step": 19438 + }, + { + "epoch": 5.966543891958256, + "grad_norm": 0.21834735572338104, + "learning_rate": 3.693665905668387e-05, + "loss": 1.7673, + "step": 19439 + }, + { + "epoch": 5.966850828729282, + "grad_norm": 0.2260274887084961, + "learning_rate": 3.6931861205722197e-05, + "loss": 1.8168, + "step": 19440 + }, + { + "epoch": 5.967157765500307, + "grad_norm": 0.24090524017810822, + "learning_rate": 3.692706348390751e-05, + "loss": 1.821, + "step": 19441 + }, + { + "epoch": 5.967464702271332, + "grad_norm": 0.27469882369041443, + "learning_rate": 3.6922265891287256e-05, + "loss": 1.7114, + "step": 19442 + }, + { + "epoch": 5.967771639042358, + "grad_norm": 0.23479801416397095, + 
"learning_rate": 3.6917468427908833e-05, + "loss": 1.7334, + "step": 19443 + }, + { + "epoch": 5.968078575813382, + "grad_norm": 0.21109704673290253, + "learning_rate": 3.6912671093819663e-05, + "loss": 1.7047, + "step": 19444 + }, + { + "epoch": 5.968385512584407, + "grad_norm": 0.21141986548900604, + "learning_rate": 3.690787388906715e-05, + "loss": 1.6868, + "step": 19445 + }, + { + "epoch": 5.968692449355433, + "grad_norm": 0.21836397051811218, + "learning_rate": 3.690307681369868e-05, + "loss": 1.6923, + "step": 19446 + }, + { + "epoch": 5.968999386126458, + "grad_norm": 0.21733662486076355, + "learning_rate": 3.6898279867761695e-05, + "loss": 1.7699, + "step": 19447 + }, + { + "epoch": 5.969306322897483, + "grad_norm": 0.19220437109470367, + "learning_rate": 3.689348305130359e-05, + "loss": 1.7002, + "step": 19448 + }, + { + "epoch": 5.969613259668508, + "grad_norm": 0.22644726932048798, + "learning_rate": 3.688868636437176e-05, + "loss": 1.7024, + "step": 19449 + }, + { + "epoch": 5.969920196439533, + "grad_norm": 0.1832779198884964, + "learning_rate": 3.688388980701361e-05, + "loss": 1.699, + "step": 19450 + }, + { + "epoch": 5.9702271332105585, + "grad_norm": 0.20793284475803375, + "learning_rate": 3.687909337927658e-05, + "loss": 1.7557, + "step": 19451 + }, + { + "epoch": 5.970534069981584, + "grad_norm": 0.19485175609588623, + "learning_rate": 3.6874297081207995e-05, + "loss": 1.7641, + "step": 19452 + }, + { + "epoch": 5.970841006752609, + "grad_norm": 0.20980949699878693, + "learning_rate": 3.686950091285534e-05, + "loss": 1.7542, + "step": 19453 + }, + { + "epoch": 5.9711479435236345, + "grad_norm": 0.24902600049972534, + "learning_rate": 3.686470487426594e-05, + "loss": 1.7342, + "step": 19454 + }, + { + "epoch": 5.971454880294659, + "grad_norm": 0.20191124081611633, + "learning_rate": 3.685990896548724e-05, + "loss": 1.6844, + "step": 19455 + }, + { + "epoch": 5.971761817065684, + "grad_norm": 0.23217806220054626, + "learning_rate": 
3.685511318656662e-05, + "loss": 1.7054, + "step": 19456 + }, + { + "epoch": 5.97206875383671, + "grad_norm": 0.23383383452892303, + "learning_rate": 3.6850317537551484e-05, + "loss": 1.6903, + "step": 19457 + }, + { + "epoch": 5.972375690607735, + "grad_norm": 0.2147756665945053, + "learning_rate": 3.6845522018489196e-05, + "loss": 1.736, + "step": 19458 + }, + { + "epoch": 5.97268262737876, + "grad_norm": 0.23864400386810303, + "learning_rate": 3.68407266294272e-05, + "loss": 1.7483, + "step": 19459 + }, + { + "epoch": 5.972989564149785, + "grad_norm": 0.18702742457389832, + "learning_rate": 3.6835931370412836e-05, + "loss": 1.6874, + "step": 19460 + }, + { + "epoch": 5.97329650092081, + "grad_norm": 0.2167401760816574, + "learning_rate": 3.683113624149351e-05, + "loss": 1.652, + "step": 19461 + }, + { + "epoch": 5.973603437691835, + "grad_norm": 0.17105139791965485, + "learning_rate": 3.6826341242716636e-05, + "loss": 1.7029, + "step": 19462 + }, + { + "epoch": 5.973910374462861, + "grad_norm": 0.2189798206090927, + "learning_rate": 3.682154637412956e-05, + "loss": 1.7203, + "step": 19463 + }, + { + "epoch": 5.974217311233886, + "grad_norm": 0.17864444851875305, + "learning_rate": 3.68167516357797e-05, + "loss": 1.7176, + "step": 19464 + }, + { + "epoch": 5.974524248004911, + "grad_norm": 0.22356030344963074, + "learning_rate": 3.681195702771442e-05, + "loss": 1.7492, + "step": 19465 + }, + { + "epoch": 5.974831184775936, + "grad_norm": 0.19020728766918182, + "learning_rate": 3.68071625499811e-05, + "loss": 1.6925, + "step": 19466 + }, + { + "epoch": 5.975138121546961, + "grad_norm": 0.19092151522636414, + "learning_rate": 3.680236820262714e-05, + "loss": 1.7253, + "step": 19467 + }, + { + "epoch": 5.975445058317987, + "grad_norm": 0.20842085778713226, + "learning_rate": 3.6797573985699926e-05, + "loss": 1.7251, + "step": 19468 + }, + { + "epoch": 5.975751995089012, + "grad_norm": 0.2245844155550003, + "learning_rate": 3.6792779899246796e-05, + "loss": 1.7351, + 
"step": 19469 + }, + { + "epoch": 5.976058931860036, + "grad_norm": 0.18867328763008118, + "learning_rate": 3.678798594331519e-05, + "loss": 1.6646, + "step": 19470 + }, + { + "epoch": 5.976365868631062, + "grad_norm": 0.2892500162124634, + "learning_rate": 3.678319211795242e-05, + "loss": 1.7146, + "step": 19471 + }, + { + "epoch": 5.976672805402087, + "grad_norm": 0.22490514814853668, + "learning_rate": 3.677839842320591e-05, + "loss": 1.7147, + "step": 19472 + }, + { + "epoch": 5.976979742173112, + "grad_norm": 0.296724796295166, + "learning_rate": 3.677360485912301e-05, + "loss": 1.7714, + "step": 19473 + }, + { + "epoch": 5.977286678944138, + "grad_norm": 0.2784444987773895, + "learning_rate": 3.676881142575111e-05, + "loss": 1.7198, + "step": 19474 + }, + { + "epoch": 5.977593615715163, + "grad_norm": 0.20270293951034546, + "learning_rate": 3.676401812313755e-05, + "loss": 1.7336, + "step": 19475 + }, + { + "epoch": 5.9779005524861875, + "grad_norm": 0.23352907598018646, + "learning_rate": 3.6759224951329745e-05, + "loss": 1.7428, + "step": 19476 + }, + { + "epoch": 5.978207489257213, + "grad_norm": 0.1892426460981369, + "learning_rate": 3.675443191037502e-05, + "loss": 1.6636, + "step": 19477 + }, + { + "epoch": 5.978514426028238, + "grad_norm": 0.22216783463954926, + "learning_rate": 3.6749639000320766e-05, + "loss": 1.7446, + "step": 19478 + }, + { + "epoch": 5.9788213627992635, + "grad_norm": 0.19465389847755432, + "learning_rate": 3.6744846221214364e-05, + "loss": 1.7403, + "step": 19479 + }, + { + "epoch": 5.979128299570289, + "grad_norm": 0.1918177455663681, + "learning_rate": 3.674005357310314e-05, + "loss": 1.6974, + "step": 19480 + }, + { + "epoch": 5.979435236341313, + "grad_norm": 0.19065791368484497, + "learning_rate": 3.673526105603449e-05, + "loss": 1.7299, + "step": 19481 + }, + { + "epoch": 5.979742173112339, + "grad_norm": 0.24036844074726105, + "learning_rate": 3.673046867005575e-05, + "loss": 1.7441, + "step": 19482 + }, + { + "epoch": 
5.980049109883364, + "grad_norm": 0.22352568805217743, + "learning_rate": 3.6725676415214305e-05, + "loss": 1.7556, + "step": 19483 + }, + { + "epoch": 5.980356046654389, + "grad_norm": 0.2492935210466385, + "learning_rate": 3.67208842915575e-05, + "loss": 1.6833, + "step": 19484 + }, + { + "epoch": 5.980662983425415, + "grad_norm": 0.2554415762424469, + "learning_rate": 3.671609229913272e-05, + "loss": 1.7426, + "step": 19485 + }, + { + "epoch": 5.98096992019644, + "grad_norm": 0.24076475203037262, + "learning_rate": 3.671130043798728e-05, + "loss": 1.7362, + "step": 19486 + }, + { + "epoch": 5.981276856967464, + "grad_norm": 0.24297118186950684, + "learning_rate": 3.670650870816858e-05, + "loss": 1.7493, + "step": 19487 + }, + { + "epoch": 5.98158379373849, + "grad_norm": 0.19533030688762665, + "learning_rate": 3.6701717109723924e-05, + "loss": 1.7397, + "step": 19488 + }, + { + "epoch": 5.981890730509515, + "grad_norm": 0.24731193482875824, + "learning_rate": 3.669692564270071e-05, + "loss": 1.7483, + "step": 19489 + }, + { + "epoch": 5.98219766728054, + "grad_norm": 0.23274390399456024, + "learning_rate": 3.669213430714626e-05, + "loss": 1.7677, + "step": 19490 + }, + { + "epoch": 5.982504604051566, + "grad_norm": 0.180234894156456, + "learning_rate": 3.668734310310796e-05, + "loss": 1.7065, + "step": 19491 + }, + { + "epoch": 5.98281154082259, + "grad_norm": 0.19045281410217285, + "learning_rate": 3.6682552030633125e-05, + "loss": 1.7089, + "step": 19492 + }, + { + "epoch": 5.9831184775936155, + "grad_norm": 0.17261318862438202, + "learning_rate": 3.667776108976914e-05, + "loss": 1.7227, + "step": 19493 + }, + { + "epoch": 5.983425414364641, + "grad_norm": 0.2156316339969635, + "learning_rate": 3.667297028056329e-05, + "loss": 1.7025, + "step": 19494 + }, + { + "epoch": 5.983732351135666, + "grad_norm": 0.22288112342357635, + "learning_rate": 3.666817960306298e-05, + "loss": 1.7123, + "step": 19495 + }, + { + "epoch": 5.9840392879066915, + "grad_norm": 
0.21983082592487335, + "learning_rate": 3.6663389057315543e-05, + "loss": 1.7688, + "step": 19496 + }, + { + "epoch": 5.984346224677717, + "grad_norm": 0.1804746687412262, + "learning_rate": 3.665859864336829e-05, + "loss": 1.759, + "step": 19497 + }, + { + "epoch": 5.984653161448741, + "grad_norm": 0.22762230038642883, + "learning_rate": 3.6653808361268605e-05, + "loss": 1.8128, + "step": 19498 + }, + { + "epoch": 5.984960098219767, + "grad_norm": 0.21779340505599976, + "learning_rate": 3.664901821106379e-05, + "loss": 1.7316, + "step": 19499 + }, + { + "epoch": 5.985267034990792, + "grad_norm": 0.18899449706077576, + "learning_rate": 3.664422819280121e-05, + "loss": 1.7535, + "step": 19500 + }, + { + "epoch": 5.985573971761817, + "grad_norm": 0.22799427807331085, + "learning_rate": 3.663943830652819e-05, + "loss": 1.7626, + "step": 19501 + }, + { + "epoch": 5.985880908532843, + "grad_norm": 0.19936929643154144, + "learning_rate": 3.6634648552292086e-05, + "loss": 1.6887, + "step": 19502 + }, + { + "epoch": 5.986187845303867, + "grad_norm": 0.22482532262802124, + "learning_rate": 3.6629858930140206e-05, + "loss": 1.6867, + "step": 19503 + }, + { + "epoch": 5.986494782074892, + "grad_norm": 0.23543842136859894, + "learning_rate": 3.662506944011991e-05, + "loss": 1.7715, + "step": 19504 + }, + { + "epoch": 5.986801718845918, + "grad_norm": 0.230603888630867, + "learning_rate": 3.6620280082278495e-05, + "loss": 1.7514, + "step": 19505 + }, + { + "epoch": 5.987108655616943, + "grad_norm": 0.26767033338546753, + "learning_rate": 3.6615490856663334e-05, + "loss": 1.6862, + "step": 19506 + }, + { + "epoch": 5.987415592387968, + "grad_norm": 0.18282492458820343, + "learning_rate": 3.661070176332172e-05, + "loss": 1.6569, + "step": 19507 + }, + { + "epoch": 5.987722529158994, + "grad_norm": 0.255426824092865, + "learning_rate": 3.6605912802301016e-05, + "loss": 1.7623, + "step": 19508 + }, + { + "epoch": 5.988029465930018, + "grad_norm": 0.25026118755340576, + 
"learning_rate": 3.6601123973648524e-05, + "loss": 1.6907, + "step": 19509 + }, + { + "epoch": 5.9883364027010435, + "grad_norm": 0.19193407893180847, + "learning_rate": 3.659633527741159e-05, + "loss": 1.7647, + "step": 19510 + }, + { + "epoch": 5.988643339472069, + "grad_norm": 0.25562727451324463, + "learning_rate": 3.6591546713637506e-05, + "loss": 1.6806, + "step": 19511 + }, + { + "epoch": 5.988950276243094, + "grad_norm": 0.2296016663312912, + "learning_rate": 3.6586758282373624e-05, + "loss": 1.7747, + "step": 19512 + }, + { + "epoch": 5.989257213014119, + "grad_norm": 0.22875753045082092, + "learning_rate": 3.6581969983667275e-05, + "loss": 1.7847, + "step": 19513 + }, + { + "epoch": 5.989564149785144, + "grad_norm": 0.24469317495822906, + "learning_rate": 3.6577181817565736e-05, + "loss": 1.6784, + "step": 19514 + }, + { + "epoch": 5.989871086556169, + "grad_norm": 0.22855928540229797, + "learning_rate": 3.657239378411638e-05, + "loss": 1.788, + "step": 19515 + }, + { + "epoch": 5.990178023327195, + "grad_norm": 0.28745612502098083, + "learning_rate": 3.656760588336647e-05, + "loss": 1.6836, + "step": 19516 + }, + { + "epoch": 5.99048496009822, + "grad_norm": 0.18221193552017212, + "learning_rate": 3.656281811536337e-05, + "loss": 1.6687, + "step": 19517 + }, + { + "epoch": 5.990791896869245, + "grad_norm": 0.2556660771369934, + "learning_rate": 3.655803048015437e-05, + "loss": 1.7351, + "step": 19518 + }, + { + "epoch": 5.99109883364027, + "grad_norm": 0.18791422247886658, + "learning_rate": 3.6553242977786803e-05, + "loss": 1.6749, + "step": 19519 + }, + { + "epoch": 5.991405770411295, + "grad_norm": 0.28149592876434326, + "learning_rate": 3.654845560830796e-05, + "loss": 1.7333, + "step": 19520 + }, + { + "epoch": 5.99171270718232, + "grad_norm": 0.24631322920322418, + "learning_rate": 3.654366837176517e-05, + "loss": 1.7672, + "step": 19521 + }, + { + "epoch": 5.992019643953346, + "grad_norm": 0.22054782509803772, + "learning_rate": 
3.653888126820573e-05, + "loss": 1.7499, + "step": 19522 + }, + { + "epoch": 5.992326580724371, + "grad_norm": 0.23334862291812897, + "learning_rate": 3.653409429767696e-05, + "loss": 1.7133, + "step": 19523 + }, + { + "epoch": 5.9926335174953955, + "grad_norm": 0.19809292256832123, + "learning_rate": 3.6529307460226145e-05, + "loss": 1.6965, + "step": 19524 + }, + { + "epoch": 5.992940454266421, + "grad_norm": 0.23769772052764893, + "learning_rate": 3.652452075590064e-05, + "loss": 1.699, + "step": 19525 + }, + { + "epoch": 5.993247391037446, + "grad_norm": 0.19045031070709229, + "learning_rate": 3.6519734184747686e-05, + "loss": 1.7043, + "step": 19526 + }, + { + "epoch": 5.9935543278084715, + "grad_norm": 0.20795129239559174, + "learning_rate": 3.651494774681465e-05, + "loss": 1.7159, + "step": 19527 + }, + { + "epoch": 5.993861264579497, + "grad_norm": 0.1933370679616928, + "learning_rate": 3.651016144214878e-05, + "loss": 1.6999, + "step": 19528 + }, + { + "epoch": 5.994168201350522, + "grad_norm": 0.18360544741153717, + "learning_rate": 3.650537527079742e-05, + "loss": 1.7525, + "step": 19529 + }, + { + "epoch": 5.994475138121547, + "grad_norm": 0.21080785989761353, + "learning_rate": 3.650058923280786e-05, + "loss": 1.6832, + "step": 19530 + }, + { + "epoch": 5.994782074892572, + "grad_norm": 0.19701606035232544, + "learning_rate": 3.649580332822736e-05, + "loss": 1.7104, + "step": 19531 + }, + { + "epoch": 5.995089011663597, + "grad_norm": 0.24208703637123108, + "learning_rate": 3.6491017557103266e-05, + "loss": 1.726, + "step": 19532 + }, + { + "epoch": 5.995395948434623, + "grad_norm": 0.25981345772743225, + "learning_rate": 3.648623191948284e-05, + "loss": 1.7644, + "step": 19533 + }, + { + "epoch": 5.995702885205648, + "grad_norm": 0.24137455224990845, + "learning_rate": 3.64814464154134e-05, + "loss": 1.7354, + "step": 19534 + }, + { + "epoch": 5.996009821976672, + "grad_norm": 0.2140759378671646, + "learning_rate": 3.647666104494222e-05, + "loss": 
1.7244, + "step": 19535 + }, + { + "epoch": 5.996316758747698, + "grad_norm": 0.2801622748374939, + "learning_rate": 3.647187580811663e-05, + "loss": 1.6996, + "step": 19536 + }, + { + "epoch": 5.996623695518723, + "grad_norm": 0.21048817038536072, + "learning_rate": 3.6467090704983856e-05, + "loss": 1.7378, + "step": 19537 + }, + { + "epoch": 5.996930632289748, + "grad_norm": 0.2935819625854492, + "learning_rate": 3.6462305735591254e-05, + "loss": 1.7066, + "step": 19538 + }, + { + "epoch": 5.997237569060774, + "grad_norm": 0.22473880648612976, + "learning_rate": 3.645752089998606e-05, + "loss": 1.7539, + "step": 19539 + }, + { + "epoch": 5.997544505831799, + "grad_norm": 0.20606113970279694, + "learning_rate": 3.6452736198215585e-05, + "loss": 1.7338, + "step": 19540 + }, + { + "epoch": 5.9978514426028235, + "grad_norm": 0.2702842950820923, + "learning_rate": 3.6447951630327116e-05, + "loss": 1.7171, + "step": 19541 + }, + { + "epoch": 5.998158379373849, + "grad_norm": 0.19971637427806854, + "learning_rate": 3.6443167196367946e-05, + "loss": 1.7132, + "step": 19542 + }, + { + "epoch": 5.998465316144874, + "grad_norm": 0.2352653592824936, + "learning_rate": 3.643838289638531e-05, + "loss": 1.787, + "step": 19543 + }, + { + "epoch": 5.9987722529158995, + "grad_norm": 0.2324669510126114, + "learning_rate": 3.643359873042656e-05, + "loss": 1.7039, + "step": 19544 + }, + { + "epoch": 5.999079189686924, + "grad_norm": 0.1935029774904251, + "learning_rate": 3.6428814698538914e-05, + "loss": 1.6846, + "step": 19545 + }, + { + "epoch": 5.999386126457949, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.642403080076968e-05, + "loss": 1.7018, + "step": 19546 + }, + { + "epoch": 5.999693063228975, + "grad_norm": 0.19364693760871887, + "learning_rate": 3.6419247037166146e-05, + "loss": 1.6901, + "step": 19547 + }, + { + "epoch": 6.0, + "grad_norm": 0.23718556761741638, + "learning_rate": 3.641446340777556e-05, + "loss": 1.7743, + "step": 19548 + }, + { + "epoch": 
6.000306936771025, + "grad_norm": 0.23907634615898132, + "learning_rate": 3.640967991264521e-05, + "loss": 1.8225, + "step": 19549 + }, + { + "epoch": 6.000613873542051, + "grad_norm": 0.18895737826824188, + "learning_rate": 3.6404896551822365e-05, + "loss": 1.7004, + "step": 19550 + }, + { + "epoch": 6.000920810313075, + "grad_norm": 0.20192188024520874, + "learning_rate": 3.64001133253543e-05, + "loss": 1.7304, + "step": 19551 + }, + { + "epoch": 6.0012277470841005, + "grad_norm": 0.1961488425731659, + "learning_rate": 3.6395330233288285e-05, + "loss": 1.6839, + "step": 19552 + }, + { + "epoch": 6.001534683855126, + "grad_norm": 0.271635502576828, + "learning_rate": 3.639054727567161e-05, + "loss": 1.8182, + "step": 19553 + }, + { + "epoch": 6.001841620626151, + "grad_norm": 0.20838679373264313, + "learning_rate": 3.63857644525515e-05, + "loss": 1.7688, + "step": 19554 + }, + { + "epoch": 6.0021485573971765, + "grad_norm": 0.23661796748638153, + "learning_rate": 3.6380981763975266e-05, + "loss": 1.6785, + "step": 19555 + }, + { + "epoch": 6.002455494168202, + "grad_norm": 0.1728433072566986, + "learning_rate": 3.637619920999013e-05, + "loss": 1.6648, + "step": 19556 + }, + { + "epoch": 6.002762430939226, + "grad_norm": 0.2845853269100189, + "learning_rate": 3.6371416790643395e-05, + "loss": 1.7592, + "step": 19557 + }, + { + "epoch": 6.003069367710252, + "grad_norm": 0.3246566951274872, + "learning_rate": 3.636663450598229e-05, + "loss": 1.7045, + "step": 19558 + }, + { + "epoch": 6.003376304481277, + "grad_norm": 0.21857120096683502, + "learning_rate": 3.636185235605412e-05, + "loss": 1.756, + "step": 19559 + }, + { + "epoch": 6.003683241252302, + "grad_norm": 0.3583754599094391, + "learning_rate": 3.63570703409061e-05, + "loss": 1.6828, + "step": 19560 + }, + { + "epoch": 6.003990178023328, + "grad_norm": 0.25527241826057434, + "learning_rate": 3.635228846058552e-05, + "loss": 1.7611, + "step": 19561 + }, + { + "epoch": 6.004297114794352, + "grad_norm": 
0.29662930965423584, + "learning_rate": 3.6347506715139604e-05, + "loss": 1.747, + "step": 19562 + }, + { + "epoch": 6.004604051565377, + "grad_norm": 0.2588978707790375, + "learning_rate": 3.634272510461564e-05, + "loss": 1.7153, + "step": 19563 + }, + { + "epoch": 6.004910988336403, + "grad_norm": 0.23874366283416748, + "learning_rate": 3.633794362906089e-05, + "loss": 1.7285, + "step": 19564 + }, + { + "epoch": 6.005217925107428, + "grad_norm": 0.2898634374141693, + "learning_rate": 3.633316228852256e-05, + "loss": 1.7539, + "step": 19565 + }, + { + "epoch": 6.005524861878453, + "grad_norm": 0.2578127682209015, + "learning_rate": 3.6328381083047946e-05, + "loss": 1.7504, + "step": 19566 + }, + { + "epoch": 6.005831798649478, + "grad_norm": 0.3094595968723297, + "learning_rate": 3.632360001268427e-05, + "loss": 1.7076, + "step": 19567 + }, + { + "epoch": 6.006138735420503, + "grad_norm": 0.27825623750686646, + "learning_rate": 3.63188190774788e-05, + "loss": 1.7651, + "step": 19568 + }, + { + "epoch": 6.0064456721915285, + "grad_norm": 0.27732032537460327, + "learning_rate": 3.631403827747878e-05, + "loss": 1.7209, + "step": 19569 + }, + { + "epoch": 6.006752608962554, + "grad_norm": 0.36446672677993774, + "learning_rate": 3.6309257612731475e-05, + "loss": 1.7191, + "step": 19570 + }, + { + "epoch": 6.007059545733579, + "grad_norm": 0.19071432948112488, + "learning_rate": 3.6304477083284076e-05, + "loss": 1.6981, + "step": 19571 + }, + { + "epoch": 6.0073664825046045, + "grad_norm": 0.40523234009742737, + "learning_rate": 3.6299696689183895e-05, + "loss": 1.7259, + "step": 19572 + }, + { + "epoch": 6.007673419275629, + "grad_norm": 0.30279576778411865, + "learning_rate": 3.6294916430478116e-05, + "loss": 1.8017, + "step": 19573 + }, + { + "epoch": 6.007980356046654, + "grad_norm": 0.2944689989089966, + "learning_rate": 3.629013630721402e-05, + "loss": 1.7347, + "step": 19574 + }, + { + "epoch": 6.00828729281768, + "grad_norm": 0.3557213246822357, + 
"learning_rate": 3.6285356319438814e-05, + "loss": 1.7308, + "step": 19575 + }, + { + "epoch": 6.008594229588705, + "grad_norm": 0.19888661801815033, + "learning_rate": 3.628057646719978e-05, + "loss": 1.7571, + "step": 19576 + }, + { + "epoch": 6.00890116635973, + "grad_norm": 0.34002986550331116, + "learning_rate": 3.627579675054411e-05, + "loss": 1.7417, + "step": 19577 + }, + { + "epoch": 6.009208103130755, + "grad_norm": 0.2756921350955963, + "learning_rate": 3.627101716951908e-05, + "loss": 1.7351, + "step": 19578 + }, + { + "epoch": 6.00951503990178, + "grad_norm": 0.3520946502685547, + "learning_rate": 3.6266237724171885e-05, + "loss": 1.7056, + "step": 19579 + }, + { + "epoch": 6.009821976672805, + "grad_norm": 0.3673728406429291, + "learning_rate": 3.6261458414549786e-05, + "loss": 1.6388, + "step": 19580 + }, + { + "epoch": 6.010128913443831, + "grad_norm": 0.2247757613658905, + "learning_rate": 3.625667924070003e-05, + "loss": 1.7772, + "step": 19581 + }, + { + "epoch": 6.010435850214856, + "grad_norm": 0.4387452006340027, + "learning_rate": 3.6251900202669795e-05, + "loss": 1.7629, + "step": 19582 + }, + { + "epoch": 6.0107427869858805, + "grad_norm": 0.23595796525478363, + "learning_rate": 3.624712130050636e-05, + "loss": 1.8044, + "step": 19583 + }, + { + "epoch": 6.011049723756906, + "grad_norm": 0.31198835372924805, + "learning_rate": 3.624234253425691e-05, + "loss": 1.7623, + "step": 19584 + }, + { + "epoch": 6.011356660527931, + "grad_norm": 0.25283896923065186, + "learning_rate": 3.6237563903968705e-05, + "loss": 1.7771, + "step": 19585 + }, + { + "epoch": 6.0116635972989565, + "grad_norm": 0.2595483064651489, + "learning_rate": 3.6232785409688954e-05, + "loss": 1.7405, + "step": 19586 + }, + { + "epoch": 6.011970534069982, + "grad_norm": 0.302273690700531, + "learning_rate": 3.622800705146491e-05, + "loss": 1.7236, + "step": 19587 + }, + { + "epoch": 6.012277470841007, + "grad_norm": 0.20444928109645844, + "learning_rate": 
3.622322882934375e-05, + "loss": 1.6863, + "step": 19588 + }, + { + "epoch": 6.012584407612032, + "grad_norm": 0.2682531774044037, + "learning_rate": 3.621845074337273e-05, + "loss": 1.752, + "step": 19589 + }, + { + "epoch": 6.012891344383057, + "grad_norm": 0.25617173314094543, + "learning_rate": 3.621367279359905e-05, + "loss": 1.7496, + "step": 19590 + }, + { + "epoch": 6.013198281154082, + "grad_norm": 0.24514207243919373, + "learning_rate": 3.620889498006994e-05, + "loss": 1.6568, + "step": 19591 + }, + { + "epoch": 6.013505217925108, + "grad_norm": 0.2799128293991089, + "learning_rate": 3.6204117302832616e-05, + "loss": 1.7284, + "step": 19592 + }, + { + "epoch": 6.013812154696133, + "grad_norm": 0.2025543451309204, + "learning_rate": 3.619933976193428e-05, + "loss": 1.7172, + "step": 19593 + }, + { + "epoch": 6.014119091467157, + "grad_norm": 0.24697700142860413, + "learning_rate": 3.619456235742216e-05, + "loss": 1.7316, + "step": 19594 + }, + { + "epoch": 6.014426028238183, + "grad_norm": 0.2518150210380554, + "learning_rate": 3.618978508934348e-05, + "loss": 1.8183, + "step": 19595 + }, + { + "epoch": 6.014732965009208, + "grad_norm": 0.165326327085495, + "learning_rate": 3.618500795774542e-05, + "loss": 1.665, + "step": 19596 + }, + { + "epoch": 6.015039901780233, + "grad_norm": 0.19158180058002472, + "learning_rate": 3.6180230962675216e-05, + "loss": 1.7232, + "step": 19597 + }, + { + "epoch": 6.015346838551259, + "grad_norm": 0.19456413388252258, + "learning_rate": 3.6175454104180086e-05, + "loss": 1.7153, + "step": 19598 + }, + { + "epoch": 6.015653775322283, + "grad_norm": 0.233373761177063, + "learning_rate": 3.6170677382307195e-05, + "loss": 1.7914, + "step": 19599 + }, + { + "epoch": 6.0159607120933085, + "grad_norm": 0.18567882478237152, + "learning_rate": 3.6165900797103796e-05, + "loss": 1.6793, + "step": 19600 + }, + { + "epoch": 6.016267648864334, + "grad_norm": 0.2119273990392685, + "learning_rate": 3.616112434861706e-05, + "loss": 1.689, + 
"step": 19601 + }, + { + "epoch": 6.016574585635359, + "grad_norm": 0.1915217787027359, + "learning_rate": 3.61563480368942e-05, + "loss": 1.6835, + "step": 19602 + }, + { + "epoch": 6.0168815224063845, + "grad_norm": 0.24824760854244232, + "learning_rate": 3.615157186198244e-05, + "loss": 1.8411, + "step": 19603 + }, + { + "epoch": 6.01718845917741, + "grad_norm": 0.2198900282382965, + "learning_rate": 3.6146795823928955e-05, + "loss": 1.7311, + "step": 19604 + }, + { + "epoch": 6.017495395948434, + "grad_norm": 0.22993668913841248, + "learning_rate": 3.614201992278095e-05, + "loss": 1.7249, + "step": 19605 + }, + { + "epoch": 6.01780233271946, + "grad_norm": 0.20677974820137024, + "learning_rate": 3.613724415858564e-05, + "loss": 1.7137, + "step": 19606 + }, + { + "epoch": 6.018109269490485, + "grad_norm": 0.1844938099384308, + "learning_rate": 3.6132468531390184e-05, + "loss": 1.6512, + "step": 19607 + }, + { + "epoch": 6.01841620626151, + "grad_norm": 0.224154993891716, + "learning_rate": 3.6127693041241815e-05, + "loss": 1.7116, + "step": 19608 + }, + { + "epoch": 6.018723143032536, + "grad_norm": 0.17322199046611786, + "learning_rate": 3.612291768818772e-05, + "loss": 1.6743, + "step": 19609 + }, + { + "epoch": 6.01903007980356, + "grad_norm": 0.24451903998851776, + "learning_rate": 3.611814247227508e-05, + "loss": 1.8332, + "step": 19610 + }, + { + "epoch": 6.019337016574585, + "grad_norm": 0.1911642849445343, + "learning_rate": 3.611336739355109e-05, + "loss": 1.707, + "step": 19611 + }, + { + "epoch": 6.019643953345611, + "grad_norm": 0.20917518436908722, + "learning_rate": 3.6108592452062954e-05, + "loss": 1.7328, + "step": 19612 + }, + { + "epoch": 6.019950890116636, + "grad_norm": 0.2314450889825821, + "learning_rate": 3.610381764785784e-05, + "loss": 1.7575, + "step": 19613 + }, + { + "epoch": 6.020257826887661, + "grad_norm": 0.20701734721660614, + "learning_rate": 3.609904298098296e-05, + "loss": 1.6958, + "step": 19614 + }, + { + "epoch": 
6.020564763658686, + "grad_norm": 0.2494465857744217, + "learning_rate": 3.609426845148547e-05, + "loss": 1.706, + "step": 19615 + }, + { + "epoch": 6.020871700429711, + "grad_norm": 0.25842729210853577, + "learning_rate": 3.608949405941256e-05, + "loss": 1.7667, + "step": 19616 + }, + { + "epoch": 6.0211786372007365, + "grad_norm": 0.19831863045692444, + "learning_rate": 3.608471980481145e-05, + "loss": 1.7135, + "step": 19617 + }, + { + "epoch": 6.021485573971762, + "grad_norm": 0.21611735224723816, + "learning_rate": 3.607994568772927e-05, + "loss": 1.7416, + "step": 19618 + }, + { + "epoch": 6.021792510742787, + "grad_norm": 0.2356715202331543, + "learning_rate": 3.607517170821324e-05, + "loss": 1.7696, + "step": 19619 + }, + { + "epoch": 6.0220994475138125, + "grad_norm": 0.24737675487995148, + "learning_rate": 3.6070397866310514e-05, + "loss": 1.7189, + "step": 19620 + }, + { + "epoch": 6.022406384284837, + "grad_norm": 0.19260701537132263, + "learning_rate": 3.6065624162068284e-05, + "loss": 1.7292, + "step": 19621 + }, + { + "epoch": 6.022713321055862, + "grad_norm": 0.29366952180862427, + "learning_rate": 3.6060850595533716e-05, + "loss": 1.7875, + "step": 19622 + }, + { + "epoch": 6.023020257826888, + "grad_norm": 0.2038174718618393, + "learning_rate": 3.605607716675401e-05, + "loss": 1.6777, + "step": 19623 + }, + { + "epoch": 6.023327194597913, + "grad_norm": 0.28923583030700684, + "learning_rate": 3.605130387577631e-05, + "loss": 1.7175, + "step": 19624 + }, + { + "epoch": 6.023634131368938, + "grad_norm": 0.3004317283630371, + "learning_rate": 3.6046530722647816e-05, + "loss": 1.8059, + "step": 19625 + }, + { + "epoch": 6.023941068139963, + "grad_norm": 0.19832390546798706, + "learning_rate": 3.6041757707415666e-05, + "loss": 1.7197, + "step": 19626 + }, + { + "epoch": 6.024248004910988, + "grad_norm": 0.2782927453517914, + "learning_rate": 3.6036984830127054e-05, + "loss": 1.6563, + "step": 19627 + }, + { + "epoch": 6.024554941682013, + "grad_norm": 
0.20395785570144653, + "learning_rate": 3.603221209082913e-05, + "loss": 1.6972, + "step": 19628 + }, + { + "epoch": 6.024861878453039, + "grad_norm": 0.26302096247673035, + "learning_rate": 3.60274394895691e-05, + "loss": 1.7348, + "step": 19629 + }, + { + "epoch": 6.025168815224064, + "grad_norm": 0.26376327872276306, + "learning_rate": 3.6022667026394095e-05, + "loss": 1.7183, + "step": 19630 + }, + { + "epoch": 6.0254757519950894, + "grad_norm": 0.20590877532958984, + "learning_rate": 3.601789470135127e-05, + "loss": 1.7114, + "step": 19631 + }, + { + "epoch": 6.025782688766114, + "grad_norm": 0.2873607277870178, + "learning_rate": 3.6013122514487815e-05, + "loss": 1.7598, + "step": 19632 + }, + { + "epoch": 6.026089625537139, + "grad_norm": 0.24324963986873627, + "learning_rate": 3.600835046585087e-05, + "loss": 1.8844, + "step": 19633 + }, + { + "epoch": 6.026396562308165, + "grad_norm": 0.27910730242729187, + "learning_rate": 3.6003578555487624e-05, + "loss": 1.8598, + "step": 19634 + }, + { + "epoch": 6.02670349907919, + "grad_norm": 0.22766844928264618, + "learning_rate": 3.59988067834452e-05, + "loss": 1.7281, + "step": 19635 + }, + { + "epoch": 6.027010435850215, + "grad_norm": 0.2390190064907074, + "learning_rate": 3.5994035149770804e-05, + "loss": 1.7355, + "step": 19636 + }, + { + "epoch": 6.02731737262124, + "grad_norm": 0.23422548174858093, + "learning_rate": 3.598926365451153e-05, + "loss": 1.7226, + "step": 19637 + }, + { + "epoch": 6.027624309392265, + "grad_norm": 0.20240288972854614, + "learning_rate": 3.598449229771458e-05, + "loss": 1.7523, + "step": 19638 + }, + { + "epoch": 6.02793124616329, + "grad_norm": 0.26388832926750183, + "learning_rate": 3.597972107942708e-05, + "loss": 1.7003, + "step": 19639 + }, + { + "epoch": 6.028238182934316, + "grad_norm": 0.19814053177833557, + "learning_rate": 3.597494999969622e-05, + "loss": 1.7087, + "step": 19640 + }, + { + "epoch": 6.028545119705341, + "grad_norm": 0.2779136896133423, + "learning_rate": 
3.5970179058569095e-05, + "loss": 1.7581, + "step": 19641 + }, + { + "epoch": 6.0288520564763655, + "grad_norm": 0.220394566655159, + "learning_rate": 3.5965408256092905e-05, + "loss": 1.7236, + "step": 19642 + }, + { + "epoch": 6.029158993247391, + "grad_norm": 0.28568828105926514, + "learning_rate": 3.596063759231476e-05, + "loss": 1.7933, + "step": 19643 + }, + { + "epoch": 6.029465930018416, + "grad_norm": 0.19509564340114594, + "learning_rate": 3.595586706728183e-05, + "loss": 1.6803, + "step": 19644 + }, + { + "epoch": 6.0297728667894415, + "grad_norm": 0.30855104327201843, + "learning_rate": 3.595109668104124e-05, + "loss": 1.7345, + "step": 19645 + }, + { + "epoch": 6.030079803560467, + "grad_norm": 0.24195496737957, + "learning_rate": 3.5946326433640174e-05, + "loss": 1.7493, + "step": 19646 + }, + { + "epoch": 6.030386740331492, + "grad_norm": 0.28324684500694275, + "learning_rate": 3.5941556325125744e-05, + "loss": 1.7959, + "step": 19647 + }, + { + "epoch": 6.030693677102517, + "grad_norm": 0.25351646542549133, + "learning_rate": 3.593678635554508e-05, + "loss": 1.7298, + "step": 19648 + }, + { + "epoch": 6.031000613873542, + "grad_norm": 0.2608177959918976, + "learning_rate": 3.593201652494534e-05, + "loss": 1.7072, + "step": 19649 + }, + { + "epoch": 6.031307550644567, + "grad_norm": 0.3182333707809448, + "learning_rate": 3.592724683337365e-05, + "loss": 1.6976, + "step": 19650 + }, + { + "epoch": 6.031614487415593, + "grad_norm": 0.19296859204769135, + "learning_rate": 3.592247728087717e-05, + "loss": 1.6879, + "step": 19651 + }, + { + "epoch": 6.031921424186618, + "grad_norm": 0.3927764594554901, + "learning_rate": 3.591770786750301e-05, + "loss": 1.6824, + "step": 19652 + }, + { + "epoch": 6.032228360957642, + "grad_norm": 0.23609496653079987, + "learning_rate": 3.591293859329833e-05, + "loss": 1.7224, + "step": 19653 + }, + { + "epoch": 6.032535297728668, + "grad_norm": 0.40787333250045776, + "learning_rate": 3.590816945831023e-05, + "loss": 
1.7206, + "step": 19654 + }, + { + "epoch": 6.032842234499693, + "grad_norm": 0.31101885437965393, + "learning_rate": 3.590340046258586e-05, + "loss": 1.7446, + "step": 19655 + }, + { + "epoch": 6.033149171270718, + "grad_norm": 0.19401656091213226, + "learning_rate": 3.589863160617235e-05, + "loss": 1.6778, + "step": 19656 + }, + { + "epoch": 6.033456108041744, + "grad_norm": 0.3309115469455719, + "learning_rate": 3.589386288911684e-05, + "loss": 1.7196, + "step": 19657 + }, + { + "epoch": 6.033763044812768, + "grad_norm": 0.22281408309936523, + "learning_rate": 3.588909431146643e-05, + "loss": 1.7122, + "step": 19658 + }, + { + "epoch": 6.0340699815837935, + "grad_norm": 0.2903781831264496, + "learning_rate": 3.5884325873268275e-05, + "loss": 1.7428, + "step": 19659 + }, + { + "epoch": 6.034376918354819, + "grad_norm": 0.2529856562614441, + "learning_rate": 3.587955757456947e-05, + "loss": 1.7075, + "step": 19660 + }, + { + "epoch": 6.034683855125844, + "grad_norm": 0.2445102334022522, + "learning_rate": 3.587478941541716e-05, + "loss": 1.6631, + "step": 19661 + }, + { + "epoch": 6.0349907918968695, + "grad_norm": 0.31834688782691956, + "learning_rate": 3.5870021395858454e-05, + "loss": 1.7009, + "step": 19662 + }, + { + "epoch": 6.035297728667895, + "grad_norm": 0.20666317641735077, + "learning_rate": 3.5865253515940496e-05, + "loss": 1.7252, + "step": 19663 + }, + { + "epoch": 6.035604665438919, + "grad_norm": 0.3070019483566284, + "learning_rate": 3.586048577571039e-05, + "loss": 1.7139, + "step": 19664 + }, + { + "epoch": 6.035911602209945, + "grad_norm": 0.22463096678256989, + "learning_rate": 3.585571817521522e-05, + "loss": 1.7574, + "step": 19665 + }, + { + "epoch": 6.03621853898097, + "grad_norm": 0.25405722856521606, + "learning_rate": 3.585095071450216e-05, + "loss": 1.7135, + "step": 19666 + }, + { + "epoch": 6.036525475751995, + "grad_norm": 0.24543432891368866, + "learning_rate": 3.584618339361828e-05, + "loss": 1.7312, + "step": 19667 + }, + { + 
"epoch": 6.036832412523021, + "grad_norm": 0.2454189658164978, + "learning_rate": 3.584141621261073e-05, + "loss": 1.7905, + "step": 19668 + }, + { + "epoch": 6.037139349294045, + "grad_norm": 0.2163272649049759, + "learning_rate": 3.583664917152658e-05, + "loss": 1.7042, + "step": 19669 + }, + { + "epoch": 6.03744628606507, + "grad_norm": 0.2088690549135208, + "learning_rate": 3.5831882270412994e-05, + "loss": 1.7905, + "step": 19670 + }, + { + "epoch": 6.037753222836096, + "grad_norm": 0.26145869493484497, + "learning_rate": 3.5827115509317024e-05, + "loss": 1.7487, + "step": 19671 + }, + { + "epoch": 6.038060159607121, + "grad_norm": 0.20306496322155, + "learning_rate": 3.582234888828582e-05, + "loss": 1.7103, + "step": 19672 + }, + { + "epoch": 6.038367096378146, + "grad_norm": 0.2504192292690277, + "learning_rate": 3.5817582407366454e-05, + "loss": 1.7397, + "step": 19673 + }, + { + "epoch": 6.038674033149171, + "grad_norm": 0.22803208231925964, + "learning_rate": 3.5812816066606084e-05, + "loss": 1.7105, + "step": 19674 + }, + { + "epoch": 6.038980969920196, + "grad_norm": 0.24963071942329407, + "learning_rate": 3.580804986605176e-05, + "loss": 1.734, + "step": 19675 + }, + { + "epoch": 6.0392879066912215, + "grad_norm": 0.2468494027853012, + "learning_rate": 3.580328380575062e-05, + "loss": 1.6866, + "step": 19676 + }, + { + "epoch": 6.039594843462247, + "grad_norm": 0.17628586292266846, + "learning_rate": 3.579851788574973e-05, + "loss": 1.7106, + "step": 19677 + }, + { + "epoch": 6.039901780233272, + "grad_norm": 0.23965299129486084, + "learning_rate": 3.579375210609622e-05, + "loss": 1.7675, + "step": 19678 + }, + { + "epoch": 6.0402087170042975, + "grad_norm": 0.19638453423976898, + "learning_rate": 3.5788986466837175e-05, + "loss": 1.7242, + "step": 19679 + }, + { + "epoch": 6.040515653775322, + "grad_norm": 0.2602851092815399, + "learning_rate": 3.578422096801971e-05, + "loss": 1.7287, + "step": 19680 + }, + { + "epoch": 6.040822590546347, + 
"grad_norm": 0.25868186354637146, + "learning_rate": 3.577945560969091e-05, + "loss": 1.7604, + "step": 19681 + }, + { + "epoch": 6.041129527317373, + "grad_norm": 0.1996527463197708, + "learning_rate": 3.577469039189784e-05, + "loss": 1.7469, + "step": 19682 + }, + { + "epoch": 6.041436464088398, + "grad_norm": 0.29909980297088623, + "learning_rate": 3.576992531468763e-05, + "loss": 1.682, + "step": 19683 + }, + { + "epoch": 6.041743400859423, + "grad_norm": 0.20064286887645721, + "learning_rate": 3.576516037810734e-05, + "loss": 1.7125, + "step": 19684 + }, + { + "epoch": 6.042050337630448, + "grad_norm": 0.2134515345096588, + "learning_rate": 3.576039558220411e-05, + "loss": 1.7371, + "step": 19685 + }, + { + "epoch": 6.042357274401473, + "grad_norm": 0.20365437865257263, + "learning_rate": 3.575563092702497e-05, + "loss": 1.7446, + "step": 19686 + }, + { + "epoch": 6.042664211172498, + "grad_norm": 0.24526065587997437, + "learning_rate": 3.5750866412617054e-05, + "loss": 1.759, + "step": 19687 + }, + { + "epoch": 6.042971147943524, + "grad_norm": 0.24521295726299286, + "learning_rate": 3.5746102039027414e-05, + "loss": 1.7589, + "step": 19688 + }, + { + "epoch": 6.043278084714549, + "grad_norm": 0.2151515632867813, + "learning_rate": 3.5741337806303155e-05, + "loss": 1.761, + "step": 19689 + }, + { + "epoch": 6.043585021485574, + "grad_norm": 0.25733521580696106, + "learning_rate": 3.573657371449134e-05, + "loss": 1.7171, + "step": 19690 + }, + { + "epoch": 6.043891958256599, + "grad_norm": 0.18520839512348175, + "learning_rate": 3.5731809763639084e-05, + "loss": 1.6691, + "step": 19691 + }, + { + "epoch": 6.044198895027624, + "grad_norm": 0.24617944657802582, + "learning_rate": 3.572704595379342e-05, + "loss": 1.7869, + "step": 19692 + }, + { + "epoch": 6.0445058317986495, + "grad_norm": 0.20246629416942596, + "learning_rate": 3.5722282285001493e-05, + "loss": 1.7667, + "step": 19693 + }, + { + "epoch": 6.044812768569675, + "grad_norm": 0.21190209686756134, + 
"learning_rate": 3.5717518757310305e-05, + "loss": 1.6839, + "step": 19694 + }, + { + "epoch": 6.0451197053407, + "grad_norm": 0.19021087884902954, + "learning_rate": 3.571275537076699e-05, + "loss": 1.7023, + "step": 19695 + }, + { + "epoch": 6.045426642111725, + "grad_norm": 0.1793040931224823, + "learning_rate": 3.570799212541858e-05, + "loss": 1.7022, + "step": 19696 + }, + { + "epoch": 6.04573357888275, + "grad_norm": 0.19105301797389984, + "learning_rate": 3.570322902131219e-05, + "loss": 1.7151, + "step": 19697 + }, + { + "epoch": 6.046040515653775, + "grad_norm": 0.22083842754364014, + "learning_rate": 3.569846605849487e-05, + "loss": 1.7097, + "step": 19698 + }, + { + "epoch": 6.046347452424801, + "grad_norm": 0.2607622444629669, + "learning_rate": 3.569370323701368e-05, + "loss": 1.7508, + "step": 19699 + }, + { + "epoch": 6.046654389195826, + "grad_norm": 0.22349929809570312, + "learning_rate": 3.56889405569157e-05, + "loss": 1.7131, + "step": 19700 + }, + { + "epoch": 6.04696132596685, + "grad_norm": 0.19442661106586456, + "learning_rate": 3.5684178018247996e-05, + "loss": 1.7476, + "step": 19701 + }, + { + "epoch": 6.047268262737876, + "grad_norm": 0.2002776861190796, + "learning_rate": 3.5679415621057646e-05, + "loss": 1.7982, + "step": 19702 + }, + { + "epoch": 6.047575199508901, + "grad_norm": 0.21558646857738495, + "learning_rate": 3.567465336539169e-05, + "loss": 1.7231, + "step": 19703 + }, + { + "epoch": 6.047882136279926, + "grad_norm": 0.20468449592590332, + "learning_rate": 3.5669891251297224e-05, + "loss": 1.6426, + "step": 19704 + }, + { + "epoch": 6.048189073050952, + "grad_norm": 0.23098553717136383, + "learning_rate": 3.566512927882127e-05, + "loss": 1.7763, + "step": 19705 + }, + { + "epoch": 6.048496009821977, + "grad_norm": 0.22959274053573608, + "learning_rate": 3.566036744801092e-05, + "loss": 1.7663, + "step": 19706 + }, + { + "epoch": 6.0488029465930016, + "grad_norm": 0.18519435822963715, + "learning_rate": 
3.5655605758913215e-05, + "loss": 1.6995, + "step": 19707 + }, + { + "epoch": 6.049109883364027, + "grad_norm": 0.2529381513595581, + "learning_rate": 3.565084421157524e-05, + "loss": 1.754, + "step": 19708 + }, + { + "epoch": 6.049416820135052, + "grad_norm": 0.2208617776632309, + "learning_rate": 3.5646082806044015e-05, + "loss": 1.6939, + "step": 19709 + }, + { + "epoch": 6.0497237569060776, + "grad_norm": 0.18433862924575806, + "learning_rate": 3.564132154236663e-05, + "loss": 1.7145, + "step": 19710 + }, + { + "epoch": 6.050030693677103, + "grad_norm": 0.1963127702474594, + "learning_rate": 3.563656042059011e-05, + "loss": 1.7101, + "step": 19711 + }, + { + "epoch": 6.050337630448127, + "grad_norm": 0.19860461354255676, + "learning_rate": 3.5631799440761526e-05, + "loss": 1.7218, + "step": 19712 + }, + { + "epoch": 6.050644567219153, + "grad_norm": 0.19304174184799194, + "learning_rate": 3.5627038602927905e-05, + "loss": 1.7575, + "step": 19713 + }, + { + "epoch": 6.050951503990178, + "grad_norm": 0.20402809977531433, + "learning_rate": 3.5622277907136335e-05, + "loss": 1.7438, + "step": 19714 + }, + { + "epoch": 6.051258440761203, + "grad_norm": 0.20821911096572876, + "learning_rate": 3.5617517353433844e-05, + "loss": 1.7381, + "step": 19715 + }, + { + "epoch": 6.051565377532229, + "grad_norm": 0.24375931918621063, + "learning_rate": 3.561275694186745e-05, + "loss": 1.8377, + "step": 19716 + }, + { + "epoch": 6.051872314303253, + "grad_norm": 0.19745339453220367, + "learning_rate": 3.560799667248424e-05, + "loss": 1.6839, + "step": 19717 + }, + { + "epoch": 6.0521792510742785, + "grad_norm": 0.2039431631565094, + "learning_rate": 3.560323654533124e-05, + "loss": 1.692, + "step": 19718 + }, + { + "epoch": 6.052486187845304, + "grad_norm": 0.23229047656059265, + "learning_rate": 3.559847656045551e-05, + "loss": 1.7408, + "step": 19719 + }, + { + "epoch": 6.052793124616329, + "grad_norm": 0.20387259125709534, + "learning_rate": 3.559371671790404e-05, + "loss": 
1.7215, + "step": 19720 + }, + { + "epoch": 6.0531000613873545, + "grad_norm": 0.23960062861442566, + "learning_rate": 3.5588957017723944e-05, + "loss": 1.8048, + "step": 19721 + }, + { + "epoch": 6.05340699815838, + "grad_norm": 0.1979944109916687, + "learning_rate": 3.5584197459962196e-05, + "loss": 1.7307, + "step": 19722 + }, + { + "epoch": 6.053713934929404, + "grad_norm": 0.21914203464984894, + "learning_rate": 3.557943804466586e-05, + "loss": 1.6999, + "step": 19723 + }, + { + "epoch": 6.05402087170043, + "grad_norm": 0.22338175773620605, + "learning_rate": 3.557467877188197e-05, + "loss": 1.6977, + "step": 19724 + }, + { + "epoch": 6.054327808471455, + "grad_norm": 0.2692863643169403, + "learning_rate": 3.5569919641657576e-05, + "loss": 1.7664, + "step": 19725 + }, + { + "epoch": 6.05463474524248, + "grad_norm": 0.2882823944091797, + "learning_rate": 3.5565160654039675e-05, + "loss": 1.6943, + "step": 19726 + }, + { + "epoch": 6.054941682013506, + "grad_norm": 0.2114996612071991, + "learning_rate": 3.5560401809075336e-05, + "loss": 1.7426, + "step": 19727 + }, + { + "epoch": 6.05524861878453, + "grad_norm": 0.19616106152534485, + "learning_rate": 3.5555643106811546e-05, + "loss": 1.6616, + "step": 19728 + }, + { + "epoch": 6.055555555555555, + "grad_norm": 0.241346076130867, + "learning_rate": 3.555088454729537e-05, + "loss": 1.7423, + "step": 19729 + }, + { + "epoch": 6.055862492326581, + "grad_norm": 0.24495846033096313, + "learning_rate": 3.554612613057381e-05, + "loss": 1.7699, + "step": 19730 + }, + { + "epoch": 6.056169429097606, + "grad_norm": 0.233306422829628, + "learning_rate": 3.554136785669393e-05, + "loss": 1.7201, + "step": 19731 + }, + { + "epoch": 6.056476365868631, + "grad_norm": 0.23820927739143372, + "learning_rate": 3.553660972570272e-05, + "loss": 1.7694, + "step": 19732 + }, + { + "epoch": 6.056783302639656, + "grad_norm": 0.20664167404174805, + "learning_rate": 3.553185173764719e-05, + "loss": 1.7151, + "step": 19733 + }, + { + 
"epoch": 6.057090239410681, + "grad_norm": 0.22572578489780426, + "learning_rate": 3.5527093892574394e-05, + "loss": 1.7715, + "step": 19734 + }, + { + "epoch": 6.0573971761817065, + "grad_norm": 0.18554186820983887, + "learning_rate": 3.552233619053133e-05, + "loss": 1.7481, + "step": 19735 + }, + { + "epoch": 6.057704112952732, + "grad_norm": 0.2434636950492859, + "learning_rate": 3.551757863156504e-05, + "loss": 1.7992, + "step": 19736 + }, + { + "epoch": 6.058011049723757, + "grad_norm": 0.1949392408132553, + "learning_rate": 3.5512821215722514e-05, + "loss": 1.7439, + "step": 19737 + }, + { + "epoch": 6.0583179864947825, + "grad_norm": 0.2696731686592102, + "learning_rate": 3.55080639430508e-05, + "loss": 1.7092, + "step": 19738 + }, + { + "epoch": 6.058624923265807, + "grad_norm": 0.1963263303041458, + "learning_rate": 3.550330681359686e-05, + "loss": 1.6726, + "step": 19739 + }, + { + "epoch": 6.058931860036832, + "grad_norm": 0.20115122199058533, + "learning_rate": 3.549854982740776e-05, + "loss": 1.7459, + "step": 19740 + }, + { + "epoch": 6.059238796807858, + "grad_norm": 0.21378284692764282, + "learning_rate": 3.549379298453048e-05, + "loss": 1.7028, + "step": 19741 + }, + { + "epoch": 6.059545733578883, + "grad_norm": 0.21954336762428284, + "learning_rate": 3.5489036285012055e-05, + "loss": 1.7209, + "step": 19742 + }, + { + "epoch": 6.059852670349908, + "grad_norm": 0.20117704570293427, + "learning_rate": 3.548427972889946e-05, + "loss": 1.7273, + "step": 19743 + }, + { + "epoch": 6.060159607120933, + "grad_norm": 0.23786263167858124, + "learning_rate": 3.5479523316239745e-05, + "loss": 1.7519, + "step": 19744 + }, + { + "epoch": 6.060466543891958, + "grad_norm": 0.17704391479492188, + "learning_rate": 3.5474767047079864e-05, + "loss": 1.6644, + "step": 19745 + }, + { + "epoch": 6.060773480662983, + "grad_norm": 0.1883699744939804, + "learning_rate": 3.547001092146687e-05, + "loss": 1.6586, + "step": 19746 + }, + { + "epoch": 6.061080417434009, + 
"grad_norm": 0.19101519882678986, + "learning_rate": 3.546525493944773e-05, + "loss": 1.7575, + "step": 19747 + }, + { + "epoch": 6.061387354205034, + "grad_norm": 0.1924263834953308, + "learning_rate": 3.546049910106947e-05, + "loss": 1.743, + "step": 19748 + }, + { + "epoch": 6.0616942909760585, + "grad_norm": 0.1853020042181015, + "learning_rate": 3.5455743406379084e-05, + "loss": 1.7466, + "step": 19749 + }, + { + "epoch": 6.062001227747084, + "grad_norm": 0.21322499215602875, + "learning_rate": 3.545098785542355e-05, + "loss": 1.7625, + "step": 19750 + }, + { + "epoch": 6.062308164518109, + "grad_norm": 0.1567271500825882, + "learning_rate": 3.544623244824989e-05, + "loss": 1.6531, + "step": 19751 + }, + { + "epoch": 6.0626151012891345, + "grad_norm": 0.2125476449728012, + "learning_rate": 3.544147718490508e-05, + "loss": 1.7547, + "step": 19752 + }, + { + "epoch": 6.06292203806016, + "grad_norm": 0.19470059871673584, + "learning_rate": 3.543672206543615e-05, + "loss": 1.7327, + "step": 19753 + }, + { + "epoch": 6.063228974831185, + "grad_norm": 0.1690339744091034, + "learning_rate": 3.543196708989004e-05, + "loss": 1.6621, + "step": 19754 + }, + { + "epoch": 6.06353591160221, + "grad_norm": 0.17322230339050293, + "learning_rate": 3.54272122583138e-05, + "loss": 1.7018, + "step": 19755 + }, + { + "epoch": 6.063842848373235, + "grad_norm": 0.22174575924873352, + "learning_rate": 3.5422457570754365e-05, + "loss": 1.724, + "step": 19756 + }, + { + "epoch": 6.06414978514426, + "grad_norm": 0.20233364403247833, + "learning_rate": 3.541770302725875e-05, + "loss": 1.6518, + "step": 19757 + }, + { + "epoch": 6.064456721915286, + "grad_norm": 0.1585279405117035, + "learning_rate": 3.541294862787395e-05, + "loss": 1.6985, + "step": 19758 + }, + { + "epoch": 6.064763658686311, + "grad_norm": 0.2180105745792389, + "learning_rate": 3.540819437264694e-05, + "loss": 1.6728, + "step": 19759 + }, + { + "epoch": 6.065070595457335, + "grad_norm": 0.2295975238084793, + 
"learning_rate": 3.5403440261624696e-05, + "loss": 1.7566, + "step": 19760 + }, + { + "epoch": 6.065377532228361, + "grad_norm": 0.17460396885871887, + "learning_rate": 3.5398686294854234e-05, + "loss": 1.6977, + "step": 19761 + }, + { + "epoch": 6.065684468999386, + "grad_norm": 0.20828662812709808, + "learning_rate": 3.539393247238249e-05, + "loss": 1.7789, + "step": 19762 + }, + { + "epoch": 6.065991405770411, + "grad_norm": 0.2273385375738144, + "learning_rate": 3.5389178794256476e-05, + "loss": 1.7316, + "step": 19763 + }, + { + "epoch": 6.066298342541437, + "grad_norm": 0.2332257330417633, + "learning_rate": 3.538442526052316e-05, + "loss": 1.7355, + "step": 19764 + }, + { + "epoch": 6.066605279312462, + "grad_norm": 0.17953866720199585, + "learning_rate": 3.537967187122952e-05, + "loss": 1.7107, + "step": 19765 + }, + { + "epoch": 6.0669122160834865, + "grad_norm": 0.2334052473306656, + "learning_rate": 3.537491862642254e-05, + "loss": 1.7572, + "step": 19766 + }, + { + "epoch": 6.067219152854512, + "grad_norm": 0.2427968829870224, + "learning_rate": 3.5370165526149165e-05, + "loss": 1.7254, + "step": 19767 + }, + { + "epoch": 6.067526089625537, + "grad_norm": 0.2701692283153534, + "learning_rate": 3.53654125704564e-05, + "loss": 1.7525, + "step": 19768 + }, + { + "epoch": 6.0678330263965625, + "grad_norm": 0.3775569796562195, + "learning_rate": 3.536065975939121e-05, + "loss": 1.7516, + "step": 19769 + }, + { + "epoch": 6.068139963167588, + "grad_norm": 0.18971984088420868, + "learning_rate": 3.535590709300056e-05, + "loss": 1.6777, + "step": 19770 + }, + { + "epoch": 6.068446899938612, + "grad_norm": 0.2710094749927521, + "learning_rate": 3.535115457133141e-05, + "loss": 1.7612, + "step": 19771 + }, + { + "epoch": 6.068753836709638, + "grad_norm": 0.19414621591567993, + "learning_rate": 3.534640219443075e-05, + "loss": 1.6795, + "step": 19772 + }, + { + "epoch": 6.069060773480663, + "grad_norm": 0.2384893298149109, + "learning_rate": 3.534164996234552e-05, 
+ "loss": 1.7869, + "step": 19773 + }, + { + "epoch": 6.069367710251688, + "grad_norm": 0.2206166833639145, + "learning_rate": 3.533689787512271e-05, + "loss": 1.7332, + "step": 19774 + }, + { + "epoch": 6.069674647022714, + "grad_norm": 0.19740800559520721, + "learning_rate": 3.533214593280926e-05, + "loss": 1.6744, + "step": 19775 + }, + { + "epoch": 6.069981583793738, + "grad_norm": 0.2098212093114853, + "learning_rate": 3.532739413545214e-05, + "loss": 1.731, + "step": 19776 + }, + { + "epoch": 6.070288520564763, + "grad_norm": 0.2508943974971771, + "learning_rate": 3.5322642483098304e-05, + "loss": 1.7682, + "step": 19777 + }, + { + "epoch": 6.070595457335789, + "grad_norm": 0.22202368080615997, + "learning_rate": 3.531789097579474e-05, + "loss": 1.6965, + "step": 19778 + }, + { + "epoch": 6.070902394106814, + "grad_norm": 0.19276803731918335, + "learning_rate": 3.5313139613588355e-05, + "loss": 1.6855, + "step": 19779 + }, + { + "epoch": 6.071209330877839, + "grad_norm": 0.23910140991210938, + "learning_rate": 3.530838839652616e-05, + "loss": 1.8099, + "step": 19780 + }, + { + "epoch": 6.071516267648865, + "grad_norm": 0.19440437853336334, + "learning_rate": 3.530363732465506e-05, + "loss": 1.67, + "step": 19781 + }, + { + "epoch": 6.071823204419889, + "grad_norm": 0.1954154074192047, + "learning_rate": 3.529888639802204e-05, + "loss": 1.7154, + "step": 19782 + }, + { + "epoch": 6.0721301411909145, + "grad_norm": 0.20836392045021057, + "learning_rate": 3.529413561667405e-05, + "loss": 1.7451, + "step": 19783 + }, + { + "epoch": 6.07243707796194, + "grad_norm": 0.20521731674671173, + "learning_rate": 3.5289384980658016e-05, + "loss": 1.7008, + "step": 19784 + }, + { + "epoch": 6.072744014732965, + "grad_norm": 0.22885540127754211, + "learning_rate": 3.528463449002092e-05, + "loss": 1.7605, + "step": 19785 + }, + { + "epoch": 6.0730509515039905, + "grad_norm": 0.27740219235420227, + "learning_rate": 3.5279884144809664e-05, + "loss": 1.7816, + "step": 19786 + }, 
+ { + "epoch": 6.073357888275015, + "grad_norm": 0.24747557938098907, + "learning_rate": 3.527513394507124e-05, + "loss": 1.7207, + "step": 19787 + }, + { + "epoch": 6.07366482504604, + "grad_norm": 0.20127782225608826, + "learning_rate": 3.527038389085256e-05, + "loss": 1.702, + "step": 19788 + }, + { + "epoch": 6.073971761817066, + "grad_norm": 0.20683316886425018, + "learning_rate": 3.5265633982200595e-05, + "loss": 1.7022, + "step": 19789 + }, + { + "epoch": 6.074278698588091, + "grad_norm": 0.17829765379428864, + "learning_rate": 3.5260884219162256e-05, + "loss": 1.7099, + "step": 19790 + }, + { + "epoch": 6.074585635359116, + "grad_norm": 0.256964772939682, + "learning_rate": 3.525613460178452e-05, + "loss": 1.7226, + "step": 19791 + }, + { + "epoch": 6.074892572130141, + "grad_norm": 0.22840122878551483, + "learning_rate": 3.525138513011428e-05, + "loss": 1.7738, + "step": 19792 + }, + { + "epoch": 6.075199508901166, + "grad_norm": 0.18988655507564545, + "learning_rate": 3.52466358041985e-05, + "loss": 1.6775, + "step": 19793 + }, + { + "epoch": 6.0755064456721914, + "grad_norm": 0.21857139468193054, + "learning_rate": 3.524188662408411e-05, + "loss": 1.7596, + "step": 19794 + }, + { + "epoch": 6.075813382443217, + "grad_norm": 0.22910535335540771, + "learning_rate": 3.523713758981807e-05, + "loss": 1.7969, + "step": 19795 + }, + { + "epoch": 6.076120319214242, + "grad_norm": 0.20885716378688812, + "learning_rate": 3.523238870144726e-05, + "loss": 1.7407, + "step": 19796 + }, + { + "epoch": 6.0764272559852675, + "grad_norm": 0.2056209295988083, + "learning_rate": 3.5227639959018666e-05, + "loss": 1.759, + "step": 19797 + }, + { + "epoch": 6.076734192756292, + "grad_norm": 0.17485356330871582, + "learning_rate": 3.522289136257917e-05, + "loss": 1.6988, + "step": 19798 + }, + { + "epoch": 6.077041129527317, + "grad_norm": 0.2103404402732849, + "learning_rate": 3.521814291217573e-05, + "loss": 1.766, + "step": 19799 + }, + { + "epoch": 6.077348066298343, + 
"grad_norm": 0.21852105855941772, + "learning_rate": 3.521339460785528e-05, + "loss": 1.7435, + "step": 19800 + }, + { + "epoch": 6.077655003069368, + "grad_norm": 0.21578362584114075, + "learning_rate": 3.520864644966471e-05, + "loss": 1.7281, + "step": 19801 + }, + { + "epoch": 6.077961939840393, + "grad_norm": 0.20405036211013794, + "learning_rate": 3.520389843765099e-05, + "loss": 1.7367, + "step": 19802 + }, + { + "epoch": 6.078268876611418, + "grad_norm": 0.2578286826610565, + "learning_rate": 3.5199150571860996e-05, + "loss": 1.7625, + "step": 19803 + }, + { + "epoch": 6.078575813382443, + "grad_norm": 0.240324467420578, + "learning_rate": 3.519440285234168e-05, + "loss": 1.6979, + "step": 19804 + }, + { + "epoch": 6.078882750153468, + "grad_norm": 0.220765620470047, + "learning_rate": 3.5189655279139935e-05, + "loss": 1.7679, + "step": 19805 + }, + { + "epoch": 6.079189686924494, + "grad_norm": 0.2731996774673462, + "learning_rate": 3.518490785230273e-05, + "loss": 1.6723, + "step": 19806 + }, + { + "epoch": 6.079496623695519, + "grad_norm": 0.2593478262424469, + "learning_rate": 3.518016057187692e-05, + "loss": 1.7232, + "step": 19807 + }, + { + "epoch": 6.0798035604665435, + "grad_norm": 0.34642404317855835, + "learning_rate": 3.517541343790947e-05, + "loss": 1.8265, + "step": 19808 + }, + { + "epoch": 6.080110497237569, + "grad_norm": 0.3187299370765686, + "learning_rate": 3.5170666450447255e-05, + "loss": 1.6847, + "step": 19809 + }, + { + "epoch": 6.080417434008594, + "grad_norm": 0.20413202047348022, + "learning_rate": 3.5165919609537215e-05, + "loss": 1.6533, + "step": 19810 + }, + { + "epoch": 6.0807243707796195, + "grad_norm": 0.2753545343875885, + "learning_rate": 3.516117291522625e-05, + "loss": 1.7491, + "step": 19811 + }, + { + "epoch": 6.081031307550645, + "grad_norm": 0.20174793899059296, + "learning_rate": 3.515642636756128e-05, + "loss": 1.6902, + "step": 19812 + }, + { + "epoch": 6.08133824432167, + "grad_norm": 0.22567492723464966, + 
"learning_rate": 3.515167996658919e-05, + "loss": 1.7165, + "step": 19813 + }, + { + "epoch": 6.081645181092695, + "grad_norm": 0.2115732729434967, + "learning_rate": 3.514693371235692e-05, + "loss": 1.6888, + "step": 19814 + }, + { + "epoch": 6.08195211786372, + "grad_norm": 0.2141808122396469, + "learning_rate": 3.514218760491134e-05, + "loss": 1.7152, + "step": 19815 + }, + { + "epoch": 6.082259054634745, + "grad_norm": 0.19767558574676514, + "learning_rate": 3.513744164429938e-05, + "loss": 1.6926, + "step": 19816 + }, + { + "epoch": 6.082565991405771, + "grad_norm": 0.20220023393630981, + "learning_rate": 3.5132695830567944e-05, + "loss": 1.6727, + "step": 19817 + }, + { + "epoch": 6.082872928176796, + "grad_norm": 0.19589759409427643, + "learning_rate": 3.5127950163763896e-05, + "loss": 1.7545, + "step": 19818 + }, + { + "epoch": 6.08317986494782, + "grad_norm": 0.21303611993789673, + "learning_rate": 3.512320464393418e-05, + "loss": 1.753, + "step": 19819 + }, + { + "epoch": 6.083486801718846, + "grad_norm": 0.19438377022743225, + "learning_rate": 3.511845927112566e-05, + "loss": 1.7022, + "step": 19820 + }, + { + "epoch": 6.083793738489871, + "grad_norm": 0.21282976865768433, + "learning_rate": 3.511371404538526e-05, + "loss": 1.7099, + "step": 19821 + }, + { + "epoch": 6.084100675260896, + "grad_norm": 0.1874496042728424, + "learning_rate": 3.5108968966759846e-05, + "loss": 1.7033, + "step": 19822 + }, + { + "epoch": 6.084407612031922, + "grad_norm": 0.21199075877666473, + "learning_rate": 3.510422403529636e-05, + "loss": 1.7088, + "step": 19823 + }, + { + "epoch": 6.084714548802946, + "grad_norm": 0.21847110986709595, + "learning_rate": 3.5099479251041634e-05, + "loss": 1.7395, + "step": 19824 + }, + { + "epoch": 6.0850214855739715, + "grad_norm": 0.201395645737648, + "learning_rate": 3.509473461404261e-05, + "loss": 1.7522, + "step": 19825 + }, + { + "epoch": 6.085328422344997, + "grad_norm": 0.19637656211853027, + "learning_rate": 
3.5089990124346135e-05, + "loss": 1.6774, + "step": 19826 + }, + { + "epoch": 6.085635359116022, + "grad_norm": 0.25918442010879517, + "learning_rate": 3.5085245781999124e-05, + "loss": 1.7704, + "step": 19827 + }, + { + "epoch": 6.0859422958870475, + "grad_norm": 0.21271947026252747, + "learning_rate": 3.508050158704844e-05, + "loss": 1.6902, + "step": 19828 + }, + { + "epoch": 6.086249232658073, + "grad_norm": 0.2065698802471161, + "learning_rate": 3.5075757539541024e-05, + "loss": 1.7945, + "step": 19829 + }, + { + "epoch": 6.086556169429097, + "grad_norm": 0.20247824490070343, + "learning_rate": 3.5071013639523684e-05, + "loss": 1.7532, + "step": 19830 + }, + { + "epoch": 6.086863106200123, + "grad_norm": 0.19705431163311005, + "learning_rate": 3.506626988704336e-05, + "loss": 1.6353, + "step": 19831 + }, + { + "epoch": 6.087170042971148, + "grad_norm": 0.20158523321151733, + "learning_rate": 3.5061526282146886e-05, + "loss": 1.6596, + "step": 19832 + }, + { + "epoch": 6.087476979742173, + "grad_norm": 0.19492848217487335, + "learning_rate": 3.505678282488118e-05, + "loss": 1.7107, + "step": 19833 + }, + { + "epoch": 6.087783916513199, + "grad_norm": 0.2403736114501953, + "learning_rate": 3.505203951529312e-05, + "loss": 1.7456, + "step": 19834 + }, + { + "epoch": 6.088090853284223, + "grad_norm": 0.25649771094322205, + "learning_rate": 3.504729635342954e-05, + "loss": 1.7513, + "step": 19835 + }, + { + "epoch": 6.088397790055248, + "grad_norm": 0.20172113180160522, + "learning_rate": 3.504255333933736e-05, + "loss": 1.7737, + "step": 19836 + }, + { + "epoch": 6.088704726826274, + "grad_norm": 0.2715936303138733, + "learning_rate": 3.5037810473063414e-05, + "loss": 1.759, + "step": 19837 + }, + { + "epoch": 6.089011663597299, + "grad_norm": 0.23145076632499695, + "learning_rate": 3.503306775465461e-05, + "loss": 1.7811, + "step": 19838 + }, + { + "epoch": 6.089318600368324, + "grad_norm": 0.1953691691160202, + "learning_rate": 3.502832518415778e-05, + "loss": 
1.752, + "step": 19839 + }, + { + "epoch": 6.08962553713935, + "grad_norm": 0.1927584707736969, + "learning_rate": 3.502358276161986e-05, + "loss": 1.6865, + "step": 19840 + }, + { + "epoch": 6.089932473910374, + "grad_norm": 0.19294732809066772, + "learning_rate": 3.501884048708763e-05, + "loss": 1.6838, + "step": 19841 + }, + { + "epoch": 6.0902394106813995, + "grad_norm": 0.23351021111011505, + "learning_rate": 3.501409836060803e-05, + "loss": 1.8029, + "step": 19842 + }, + { + "epoch": 6.090546347452425, + "grad_norm": 0.21615718305110931, + "learning_rate": 3.5009356382227877e-05, + "loss": 1.7441, + "step": 19843 + }, + { + "epoch": 6.09085328422345, + "grad_norm": 0.19091549515724182, + "learning_rate": 3.500461455199405e-05, + "loss": 1.7056, + "step": 19844 + }, + { + "epoch": 6.0911602209944755, + "grad_norm": 0.21189090609550476, + "learning_rate": 3.499987286995341e-05, + "loss": 1.6853, + "step": 19845 + }, + { + "epoch": 6.0914671577655, + "grad_norm": 0.22545887529850006, + "learning_rate": 3.499513133615283e-05, + "loss": 1.7854, + "step": 19846 + }, + { + "epoch": 6.091774094536525, + "grad_norm": 0.21960650384426117, + "learning_rate": 3.4990389950639144e-05, + "loss": 1.7558, + "step": 19847 + }, + { + "epoch": 6.092081031307551, + "grad_norm": 0.20825782418251038, + "learning_rate": 3.4985648713459244e-05, + "loss": 1.7103, + "step": 19848 + }, + { + "epoch": 6.092387968078576, + "grad_norm": 0.20886415243148804, + "learning_rate": 3.498090762465993e-05, + "loss": 1.6897, + "step": 19849 + }, + { + "epoch": 6.092694904849601, + "grad_norm": 0.19306892156600952, + "learning_rate": 3.4976166684288115e-05, + "loss": 1.7506, + "step": 19850 + }, + { + "epoch": 6.093001841620626, + "grad_norm": 0.2178204357624054, + "learning_rate": 3.497142589239063e-05, + "loss": 1.6774, + "step": 19851 + }, + { + "epoch": 6.093308778391651, + "grad_norm": 0.1914307177066803, + "learning_rate": 3.4966685249014294e-05, + "loss": 1.7182, + "step": 19852 + }, + { + 
"epoch": 6.093615715162676, + "grad_norm": 0.22006092965602875, + "learning_rate": 3.496194475420602e-05, + "loss": 1.7209, + "step": 19853 + }, + { + "epoch": 6.093922651933702, + "grad_norm": 0.20621439814567566, + "learning_rate": 3.49572044080126e-05, + "loss": 1.7403, + "step": 19854 + }, + { + "epoch": 6.094229588704727, + "grad_norm": 0.24079272150993347, + "learning_rate": 3.495246421048091e-05, + "loss": 1.7619, + "step": 19855 + }, + { + "epoch": 6.094536525475752, + "grad_norm": 0.19073884189128876, + "learning_rate": 3.494772416165777e-05, + "loss": 1.6677, + "step": 19856 + }, + { + "epoch": 6.094843462246777, + "grad_norm": 0.18217229843139648, + "learning_rate": 3.494298426159007e-05, + "loss": 1.7162, + "step": 19857 + }, + { + "epoch": 6.095150399017802, + "grad_norm": 0.21901506185531616, + "learning_rate": 3.493824451032461e-05, + "loss": 1.7173, + "step": 19858 + }, + { + "epoch": 6.0954573357888275, + "grad_norm": 0.22156217694282532, + "learning_rate": 3.493350490790826e-05, + "loss": 1.8029, + "step": 19859 + }, + { + "epoch": 6.095764272559853, + "grad_norm": 0.1663675606250763, + "learning_rate": 3.4928765454387824e-05, + "loss": 1.7306, + "step": 19860 + }, + { + "epoch": 6.096071209330878, + "grad_norm": 0.19684657454490662, + "learning_rate": 3.4924026149810175e-05, + "loss": 1.6944, + "step": 19861 + }, + { + "epoch": 6.096378146101903, + "grad_norm": 0.19163468480110168, + "learning_rate": 3.4919286994222125e-05, + "loss": 1.7331, + "step": 19862 + }, + { + "epoch": 6.096685082872928, + "grad_norm": 0.20134083926677704, + "learning_rate": 3.491454798767054e-05, + "loss": 1.7365, + "step": 19863 + }, + { + "epoch": 6.096992019643953, + "grad_norm": 0.23877696692943573, + "learning_rate": 3.490980913020221e-05, + "loss": 1.753, + "step": 19864 + }, + { + "epoch": 6.097298956414979, + "grad_norm": 0.207699254155159, + "learning_rate": 3.490507042186402e-05, + "loss": 1.6835, + "step": 19865 + }, + { + "epoch": 6.097605893186004, + 
"grad_norm": 0.20608612895011902, + "learning_rate": 3.490033186270274e-05, + "loss": 1.7379, + "step": 19866 + }, + { + "epoch": 6.097912829957028, + "grad_norm": 0.25086313486099243, + "learning_rate": 3.489559345276524e-05, + "loss": 1.7692, + "step": 19867 + }, + { + "epoch": 6.098219766728054, + "grad_norm": 0.22025549411773682, + "learning_rate": 3.489085519209836e-05, + "loss": 1.6579, + "step": 19868 + }, + { + "epoch": 6.098526703499079, + "grad_norm": 0.23805730044841766, + "learning_rate": 3.4886117080748875e-05, + "loss": 1.7695, + "step": 19869 + }, + { + "epoch": 6.098833640270104, + "grad_norm": 0.23271869122982025, + "learning_rate": 3.4881379118763666e-05, + "loss": 1.7268, + "step": 19870 + }, + { + "epoch": 6.09914057704113, + "grad_norm": 0.21795618534088135, + "learning_rate": 3.4876641306189505e-05, + "loss": 1.6996, + "step": 19871 + }, + { + "epoch": 6.099447513812155, + "grad_norm": 0.22064761817455292, + "learning_rate": 3.487190364307326e-05, + "loss": 1.7032, + "step": 19872 + }, + { + "epoch": 6.0997544505831796, + "grad_norm": 0.23834183812141418, + "learning_rate": 3.4867166129461706e-05, + "loss": 1.6942, + "step": 19873 + }, + { + "epoch": 6.100061387354205, + "grad_norm": 0.21143686771392822, + "learning_rate": 3.486242876540171e-05, + "loss": 1.6904, + "step": 19874 + }, + { + "epoch": 6.10036832412523, + "grad_norm": 0.18099969625473022, + "learning_rate": 3.485769155094004e-05, + "loss": 1.6669, + "step": 19875 + }, + { + "epoch": 6.100675260896256, + "grad_norm": 0.25324884057044983, + "learning_rate": 3.4852954486123566e-05, + "loss": 1.7878, + "step": 19876 + }, + { + "epoch": 6.100982197667281, + "grad_norm": 0.2252139449119568, + "learning_rate": 3.4848217570999055e-05, + "loss": 1.7674, + "step": 19877 + }, + { + "epoch": 6.101289134438305, + "grad_norm": 0.19629882276058197, + "learning_rate": 3.4843480805613346e-05, + "loss": 1.6898, + "step": 19878 + }, + { + "epoch": 6.101596071209331, + "grad_norm": 
0.1858786642551422, + "learning_rate": 3.483874419001323e-05, + "loss": 1.6856, + "step": 19879 + }, + { + "epoch": 6.101903007980356, + "grad_norm": 0.1842946857213974, + "learning_rate": 3.483400772424555e-05, + "loss": 1.7229, + "step": 19880 + }, + { + "epoch": 6.102209944751381, + "grad_norm": 0.18981511890888214, + "learning_rate": 3.482927140835708e-05, + "loss": 1.75, + "step": 19881 + }, + { + "epoch": 6.102516881522407, + "grad_norm": 0.19914525747299194, + "learning_rate": 3.482453524239466e-05, + "loss": 1.7702, + "step": 19882 + }, + { + "epoch": 6.102823818293431, + "grad_norm": 0.1960345208644867, + "learning_rate": 3.481979922640507e-05, + "loss": 1.7189, + "step": 19883 + }, + { + "epoch": 6.1031307550644565, + "grad_norm": 0.20309221744537354, + "learning_rate": 3.48150633604351e-05, + "loss": 1.7888, + "step": 19884 + }, + { + "epoch": 6.103437691835482, + "grad_norm": 0.20090891420841217, + "learning_rate": 3.48103276445316e-05, + "loss": 1.8017, + "step": 19885 + }, + { + "epoch": 6.103744628606507, + "grad_norm": 0.22500385344028473, + "learning_rate": 3.480559207874133e-05, + "loss": 1.7061, + "step": 19886 + }, + { + "epoch": 6.1040515653775325, + "grad_norm": 0.22594885528087616, + "learning_rate": 3.480085666311113e-05, + "loss": 1.7659, + "step": 19887 + }, + { + "epoch": 6.104358502148558, + "grad_norm": 0.2769651710987091, + "learning_rate": 3.479612139768774e-05, + "loss": 1.7668, + "step": 19888 + }, + { + "epoch": 6.104665438919582, + "grad_norm": 0.24251700937747955, + "learning_rate": 3.4791386282518e-05, + "loss": 1.8068, + "step": 19889 + }, + { + "epoch": 6.104972375690608, + "grad_norm": 0.23325790464878082, + "learning_rate": 3.478665131764869e-05, + "loss": 1.7116, + "step": 19890 + }, + { + "epoch": 6.105279312461633, + "grad_norm": 0.19998812675476074, + "learning_rate": 3.478191650312663e-05, + "loss": 1.7116, + "step": 19891 + }, + { + "epoch": 6.105586249232658, + "grad_norm": 0.20933640003204346, + "learning_rate": 
3.4777181838998566e-05, + "loss": 1.7138, + "step": 19892 + }, + { + "epoch": 6.105893186003684, + "grad_norm": 0.24344035983085632, + "learning_rate": 3.477244732531134e-05, + "loss": 1.784, + "step": 19893 + }, + { + "epoch": 6.106200122774708, + "grad_norm": 0.2220575362443924, + "learning_rate": 3.4767712962111686e-05, + "loss": 1.7479, + "step": 19894 + }, + { + "epoch": 6.106507059545733, + "grad_norm": 0.2222832590341568, + "learning_rate": 3.476297874944644e-05, + "loss": 1.7278, + "step": 19895 + }, + { + "epoch": 6.106813996316759, + "grad_norm": 0.222265362739563, + "learning_rate": 3.4758244687362353e-05, + "loss": 1.7321, + "step": 19896 + }, + { + "epoch": 6.107120933087784, + "grad_norm": 0.2921304702758789, + "learning_rate": 3.475351077590625e-05, + "loss": 1.7848, + "step": 19897 + }, + { + "epoch": 6.107427869858809, + "grad_norm": 0.21015208959579468, + "learning_rate": 3.4748777015124856e-05, + "loss": 1.7987, + "step": 19898 + }, + { + "epoch": 6.107734806629834, + "grad_norm": 0.19510969519615173, + "learning_rate": 3.474404340506502e-05, + "loss": 1.7317, + "step": 19899 + }, + { + "epoch": 6.108041743400859, + "grad_norm": 0.21978609263896942, + "learning_rate": 3.473930994577348e-05, + "loss": 1.6943, + "step": 19900 + }, + { + "epoch": 6.1083486801718845, + "grad_norm": 0.1793510913848877, + "learning_rate": 3.4734576637297004e-05, + "loss": 1.6659, + "step": 19901 + }, + { + "epoch": 6.10865561694291, + "grad_norm": 0.2029319554567337, + "learning_rate": 3.4729843479682414e-05, + "loss": 1.7127, + "step": 19902 + }, + { + "epoch": 6.108962553713935, + "grad_norm": 0.2001914530992508, + "learning_rate": 3.472511047297644e-05, + "loss": 1.691, + "step": 19903 + }, + { + "epoch": 6.1092694904849605, + "grad_norm": 0.2194693237543106, + "learning_rate": 3.47203776172259e-05, + "loss": 1.7181, + "step": 19904 + }, + { + "epoch": 6.109576427255985, + "grad_norm": 0.1865277737379074, + "learning_rate": 3.4715644912477515e-05, + "loss": 1.6786, 
+ "step": 19905 + }, + { + "epoch": 6.10988336402701, + "grad_norm": 0.20574906468391418, + "learning_rate": 3.471091235877811e-05, + "loss": 1.7681, + "step": 19906 + }, + { + "epoch": 6.110190300798036, + "grad_norm": 0.21072493493556976, + "learning_rate": 3.470617995617441e-05, + "loss": 1.7494, + "step": 19907 + }, + { + "epoch": 6.110497237569061, + "grad_norm": 0.2411658763885498, + "learning_rate": 3.470144770471323e-05, + "loss": 1.7183, + "step": 19908 + }, + { + "epoch": 6.110804174340086, + "grad_norm": 0.19782759249210358, + "learning_rate": 3.4696715604441285e-05, + "loss": 1.6823, + "step": 19909 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.315026193857193, + "learning_rate": 3.469198365540539e-05, + "loss": 1.691, + "step": 19910 + }, + { + "epoch": 6.111418047882136, + "grad_norm": 0.19840773940086365, + "learning_rate": 3.468725185765226e-05, + "loss": 1.7413, + "step": 19911 + }, + { + "epoch": 6.111724984653161, + "grad_norm": 0.1813160926103592, + "learning_rate": 3.46825202112287e-05, + "loss": 1.7095, + "step": 19912 + }, + { + "epoch": 6.112031921424187, + "grad_norm": 0.21025459468364716, + "learning_rate": 3.467778871618145e-05, + "loss": 1.7783, + "step": 19913 + }, + { + "epoch": 6.112338858195212, + "grad_norm": 0.20088298618793488, + "learning_rate": 3.4673057372557265e-05, + "loss": 1.7671, + "step": 19914 + }, + { + "epoch": 6.112645794966237, + "grad_norm": 0.21919472515583038, + "learning_rate": 3.466832618040291e-05, + "loss": 1.7052, + "step": 19915 + }, + { + "epoch": 6.112952731737262, + "grad_norm": 0.19135436415672302, + "learning_rate": 3.466359513976516e-05, + "loss": 1.7862, + "step": 19916 + }, + { + "epoch": 6.113259668508287, + "grad_norm": 0.19943594932556152, + "learning_rate": 3.465886425069074e-05, + "loss": 1.6926, + "step": 19917 + }, + { + "epoch": 6.1135666052793125, + "grad_norm": 0.19390980899333954, + "learning_rate": 3.46541335132264e-05, + "loss": 1.761, + "step": 19918 + }, + { + "epoch": 
6.113873542050338, + "grad_norm": 0.22745995223522186, + "learning_rate": 3.4649402927418935e-05, + "loss": 1.7147, + "step": 19919 + }, + { + "epoch": 6.114180478821363, + "grad_norm": 0.17792920768260956, + "learning_rate": 3.4644672493315045e-05, + "loss": 1.6946, + "step": 19920 + }, + { + "epoch": 6.114487415592388, + "grad_norm": 0.2009986788034439, + "learning_rate": 3.463994221096152e-05, + "loss": 1.6977, + "step": 19921 + }, + { + "epoch": 6.114794352363413, + "grad_norm": 0.2448386251926422, + "learning_rate": 3.4635212080405066e-05, + "loss": 1.7169, + "step": 19922 + }, + { + "epoch": 6.115101289134438, + "grad_norm": 0.21506112813949585, + "learning_rate": 3.463048210169247e-05, + "loss": 1.6632, + "step": 19923 + }, + { + "epoch": 6.115408225905464, + "grad_norm": 0.1805233359336853, + "learning_rate": 3.462575227487045e-05, + "loss": 1.6742, + "step": 19924 + }, + { + "epoch": 6.115715162676489, + "grad_norm": 0.20023848116397858, + "learning_rate": 3.4621022599985766e-05, + "loss": 1.7106, + "step": 19925 + }, + { + "epoch": 6.116022099447513, + "grad_norm": 0.20388077199459076, + "learning_rate": 3.461629307708513e-05, + "loss": 1.7065, + "step": 19926 + }, + { + "epoch": 6.116329036218539, + "grad_norm": 0.23886005580425262, + "learning_rate": 3.461156370621533e-05, + "loss": 1.7177, + "step": 19927 + }, + { + "epoch": 6.116635972989564, + "grad_norm": 0.2054048627614975, + "learning_rate": 3.460683448742306e-05, + "loss": 1.6773, + "step": 19928 + }, + { + "epoch": 6.116942909760589, + "grad_norm": 0.1909634917974472, + "learning_rate": 3.460210542075508e-05, + "loss": 1.7562, + "step": 19929 + }, + { + "epoch": 6.117249846531615, + "grad_norm": 0.20221595466136932, + "learning_rate": 3.459737650625812e-05, + "loss": 1.7948, + "step": 19930 + }, + { + "epoch": 6.11755678330264, + "grad_norm": 0.25445356965065, + "learning_rate": 3.459264774397891e-05, + "loss": 1.7964, + "step": 19931 + }, + { + "epoch": 6.1178637200736645, + "grad_norm": 
0.2227735072374344, + "learning_rate": 3.4587919133964176e-05, + "loss": 1.7833, + "step": 19932 + }, + { + "epoch": 6.11817065684469, + "grad_norm": 0.20591853559017181, + "learning_rate": 3.458319067626068e-05, + "loss": 1.7535, + "step": 19933 + }, + { + "epoch": 6.118477593615715, + "grad_norm": 0.22087402641773224, + "learning_rate": 3.4578462370915115e-05, + "loss": 1.7228, + "step": 19934 + }, + { + "epoch": 6.1187845303867405, + "grad_norm": 0.234156996011734, + "learning_rate": 3.457373421797423e-05, + "loss": 1.7167, + "step": 19935 + }, + { + "epoch": 6.119091467157766, + "grad_norm": 0.209685817360878, + "learning_rate": 3.4569006217484746e-05, + "loss": 1.6633, + "step": 19936 + }, + { + "epoch": 6.11939840392879, + "grad_norm": 0.18499237298965454, + "learning_rate": 3.4564278369493366e-05, + "loss": 1.6769, + "step": 19937 + }, + { + "epoch": 6.119705340699816, + "grad_norm": 0.2600767910480499, + "learning_rate": 3.455955067404686e-05, + "loss": 1.7788, + "step": 19938 + }, + { + "epoch": 6.120012277470841, + "grad_norm": 0.21499377489089966, + "learning_rate": 3.455482313119191e-05, + "loss": 1.789, + "step": 19939 + }, + { + "epoch": 6.120319214241866, + "grad_norm": 0.19618432223796844, + "learning_rate": 3.455009574097527e-05, + "loss": 1.7162, + "step": 19940 + }, + { + "epoch": 6.120626151012892, + "grad_norm": 0.23219916224479675, + "learning_rate": 3.4545368503443616e-05, + "loss": 1.7871, + "step": 19941 + }, + { + "epoch": 6.120933087783916, + "grad_norm": 0.22315794229507446, + "learning_rate": 3.45406414186437e-05, + "loss": 1.6944, + "step": 19942 + }, + { + "epoch": 6.121240024554941, + "grad_norm": 0.22536693513393402, + "learning_rate": 3.453591448662221e-05, + "loss": 1.7727, + "step": 19943 + }, + { + "epoch": 6.121546961325967, + "grad_norm": 0.21811100840568542, + "learning_rate": 3.45311877074259e-05, + "loss": 1.7037, + "step": 19944 + }, + { + "epoch": 6.121853898096992, + "grad_norm": 0.1957094967365265, + "learning_rate": 
3.452646108110145e-05, + "loss": 1.7734, + "step": 19945 + }, + { + "epoch": 6.122160834868017, + "grad_norm": 0.185706228017807, + "learning_rate": 3.452173460769559e-05, + "loss": 1.6715, + "step": 19946 + }, + { + "epoch": 6.122467771639043, + "grad_norm": 0.21081562340259552, + "learning_rate": 3.4517008287255005e-05, + "loss": 1.7798, + "step": 19947 + }, + { + "epoch": 6.122774708410067, + "grad_norm": 0.24175535142421722, + "learning_rate": 3.451228211982642e-05, + "loss": 1.7111, + "step": 19948 + }, + { + "epoch": 6.1230816451810925, + "grad_norm": 0.244124636054039, + "learning_rate": 3.450755610545654e-05, + "loss": 1.7263, + "step": 19949 + }, + { + "epoch": 6.123388581952118, + "grad_norm": 0.21109984815120697, + "learning_rate": 3.45028302441921e-05, + "loss": 1.7556, + "step": 19950 + }, + { + "epoch": 6.123695518723143, + "grad_norm": 0.21721722185611725, + "learning_rate": 3.449810453607976e-05, + "loss": 1.7416, + "step": 19951 + }, + { + "epoch": 6.1240024554941686, + "grad_norm": 0.18695317208766937, + "learning_rate": 3.4493378981166216e-05, + "loss": 1.7128, + "step": 19952 + }, + { + "epoch": 6.124309392265193, + "grad_norm": 0.19175554811954498, + "learning_rate": 3.4488653579498206e-05, + "loss": 1.7014, + "step": 19953 + }, + { + "epoch": 6.124616329036218, + "grad_norm": 0.22297006845474243, + "learning_rate": 3.4483928331122405e-05, + "loss": 1.7231, + "step": 19954 + }, + { + "epoch": 6.124923265807244, + "grad_norm": 0.2407974898815155, + "learning_rate": 3.447920323608553e-05, + "loss": 1.7354, + "step": 19955 + }, + { + "epoch": 6.125230202578269, + "grad_norm": 0.19767232239246368, + "learning_rate": 3.447447829443425e-05, + "loss": 1.7487, + "step": 19956 + }, + { + "epoch": 6.125537139349294, + "grad_norm": 0.20033477246761322, + "learning_rate": 3.446975350621529e-05, + "loss": 1.7232, + "step": 19957 + }, + { + "epoch": 6.12584407612032, + "grad_norm": 0.20310243964195251, + "learning_rate": 3.446502887147532e-05, + "loss": 
1.6946, + "step": 19958 + }, + { + "epoch": 6.126151012891344, + "grad_norm": 0.2322724461555481, + "learning_rate": 3.446030439026104e-05, + "loss": 1.7071, + "step": 19959 + }, + { + "epoch": 6.1264579496623695, + "grad_norm": 0.24134255945682526, + "learning_rate": 3.445558006261914e-05, + "loss": 1.7259, + "step": 19960 + }, + { + "epoch": 6.126764886433395, + "grad_norm": 0.22821731865406036, + "learning_rate": 3.445085588859632e-05, + "loss": 1.7488, + "step": 19961 + }, + { + "epoch": 6.12707182320442, + "grad_norm": 0.258241206407547, + "learning_rate": 3.444613186823924e-05, + "loss": 1.7403, + "step": 19962 + }, + { + "epoch": 6.1273787599754455, + "grad_norm": 0.18758481740951538, + "learning_rate": 3.4441408001594625e-05, + "loss": 1.7079, + "step": 19963 + }, + { + "epoch": 6.12768569674647, + "grad_norm": 0.24032682180404663, + "learning_rate": 3.443668428870911e-05, + "loss": 1.7377, + "step": 19964 + }, + { + "epoch": 6.127992633517495, + "grad_norm": 0.24468545615673065, + "learning_rate": 3.4431960729629406e-05, + "loss": 1.7724, + "step": 19965 + }, + { + "epoch": 6.128299570288521, + "grad_norm": 0.23840154707431793, + "learning_rate": 3.4427237324402197e-05, + "loss": 1.7813, + "step": 19966 + }, + { + "epoch": 6.128606507059546, + "grad_norm": 0.2476109117269516, + "learning_rate": 3.4422514073074165e-05, + "loss": 1.7578, + "step": 19967 + }, + { + "epoch": 6.128913443830571, + "grad_norm": 0.2109041064977646, + "learning_rate": 3.4417790975691974e-05, + "loss": 1.6917, + "step": 19968 + }, + { + "epoch": 6.129220380601596, + "grad_norm": 0.21841584146022797, + "learning_rate": 3.4413068032302296e-05, + "loss": 1.7511, + "step": 19969 + }, + { + "epoch": 6.129527317372621, + "grad_norm": 0.2111930102109909, + "learning_rate": 3.440834524295182e-05, + "loss": 1.7194, + "step": 19970 + }, + { + "epoch": 6.129834254143646, + "grad_norm": 0.21868006885051727, + "learning_rate": 3.440362260768721e-05, + "loss": 1.7933, + "step": 19971 + }, + { + 
"epoch": 6.130141190914672, + "grad_norm": 0.19846780598163605, + "learning_rate": 3.439890012655516e-05, + "loss": 1.6985, + "step": 19972 + }, + { + "epoch": 6.130448127685697, + "grad_norm": 0.218460813164711, + "learning_rate": 3.439417779960231e-05, + "loss": 1.7205, + "step": 19973 + }, + { + "epoch": 6.1307550644567215, + "grad_norm": 0.22504402697086334, + "learning_rate": 3.438945562687535e-05, + "loss": 1.7437, + "step": 19974 + }, + { + "epoch": 6.131062001227747, + "grad_norm": 0.35414671897888184, + "learning_rate": 3.438473360842093e-05, + "loss": 1.7641, + "step": 19975 + }, + { + "epoch": 6.131368937998772, + "grad_norm": 0.21090710163116455, + "learning_rate": 3.4380011744285726e-05, + "loss": 1.6817, + "step": 19976 + }, + { + "epoch": 6.1316758747697975, + "grad_norm": 0.19118748605251312, + "learning_rate": 3.437529003451639e-05, + "loss": 1.694, + "step": 19977 + }, + { + "epoch": 6.131982811540823, + "grad_norm": 0.2341139018535614, + "learning_rate": 3.437056847915962e-05, + "loss": 1.781, + "step": 19978 + }, + { + "epoch": 6.132289748311848, + "grad_norm": 0.19120962917804718, + "learning_rate": 3.4365847078262033e-05, + "loss": 1.6974, + "step": 19979 + }, + { + "epoch": 6.132596685082873, + "grad_norm": 0.1998066008090973, + "learning_rate": 3.436112583187033e-05, + "loss": 1.6933, + "step": 19980 + }, + { + "epoch": 6.132903621853898, + "grad_norm": 0.19839663803577423, + "learning_rate": 3.4356404740031123e-05, + "loss": 1.6867, + "step": 19981 + }, + { + "epoch": 6.133210558624923, + "grad_norm": 0.19892877340316772, + "learning_rate": 3.4351683802791114e-05, + "loss": 1.7349, + "step": 19982 + }, + { + "epoch": 6.133517495395949, + "grad_norm": 0.23215502500534058, + "learning_rate": 3.434696302019692e-05, + "loss": 1.7411, + "step": 19983 + }, + { + "epoch": 6.133824432166974, + "grad_norm": 0.21246971189975739, + "learning_rate": 3.4342242392295225e-05, + "loss": 1.6918, + "step": 19984 + }, + { + "epoch": 6.134131368937998, + 
"grad_norm": 0.18585935235023499, + "learning_rate": 3.4337521919132675e-05, + "loss": 1.71, + "step": 19985 + }, + { + "epoch": 6.134438305709024, + "grad_norm": 0.24194715917110443, + "learning_rate": 3.4332801600755896e-05, + "loss": 1.7314, + "step": 19986 + }, + { + "epoch": 6.134745242480049, + "grad_norm": 0.19925665855407715, + "learning_rate": 3.432808143721156e-05, + "loss": 1.7425, + "step": 19987 + }, + { + "epoch": 6.135052179251074, + "grad_norm": 0.22253449261188507, + "learning_rate": 3.43233614285463e-05, + "loss": 1.702, + "step": 19988 + }, + { + "epoch": 6.1353591160221, + "grad_norm": 0.22180478274822235, + "learning_rate": 3.4318641574806796e-05, + "loss": 1.6659, + "step": 19989 + }, + { + "epoch": 6.135666052793125, + "grad_norm": 0.19818264245986938, + "learning_rate": 3.431392187603964e-05, + "loss": 1.8057, + "step": 19990 + }, + { + "epoch": 6.1359729895641495, + "grad_norm": 0.34630170464515686, + "learning_rate": 3.4309202332291526e-05, + "loss": 1.7233, + "step": 19991 + }, + { + "epoch": 6.136279926335175, + "grad_norm": 0.2633006274700165, + "learning_rate": 3.430448294360905e-05, + "loss": 1.7421, + "step": 19992 + }, + { + "epoch": 6.1365868631062, + "grad_norm": 0.1976388394832611, + "learning_rate": 3.429976371003888e-05, + "loss": 1.7474, + "step": 19993 + }, + { + "epoch": 6.1368937998772255, + "grad_norm": 0.2386583834886551, + "learning_rate": 3.429504463162764e-05, + "loss": 1.7026, + "step": 19994 + }, + { + "epoch": 6.137200736648251, + "grad_norm": 0.20853812992572784, + "learning_rate": 3.4290325708422e-05, + "loss": 1.7846, + "step": 19995 + }, + { + "epoch": 6.137507673419275, + "grad_norm": 0.24667194485664368, + "learning_rate": 3.428560694046854e-05, + "loss": 1.6446, + "step": 19996 + }, + { + "epoch": 6.137814610190301, + "grad_norm": 0.24396342039108276, + "learning_rate": 3.428088832781394e-05, + "loss": 1.7368, + "step": 19997 + }, + { + "epoch": 6.138121546961326, + "grad_norm": 0.1958172619342804, + 
"learning_rate": 3.4276169870504804e-05, + "loss": 1.7197, + "step": 19998 + }, + { + "epoch": 6.138428483732351, + "grad_norm": 0.21487464010715485, + "learning_rate": 3.427145156858778e-05, + "loss": 1.7318, + "step": 19999 + }, + { + "epoch": 6.138735420503377, + "grad_norm": 0.2152775675058365, + "learning_rate": 3.4266733422109476e-05, + "loss": 1.7924, + "step": 20000 + }, + { + "epoch": 6.139042357274401, + "grad_norm": 0.17151346802711487, + "learning_rate": 3.426201543111656e-05, + "loss": 1.6915, + "step": 20001 + }, + { + "epoch": 6.139349294045426, + "grad_norm": 0.22197338938713074, + "learning_rate": 3.425729759565563e-05, + "loss": 1.8028, + "step": 20002 + }, + { + "epoch": 6.139656230816452, + "grad_norm": 0.23111973702907562, + "learning_rate": 3.42525799157733e-05, + "loss": 1.7515, + "step": 20003 + }, + { + "epoch": 6.139963167587477, + "grad_norm": 0.2829805314540863, + "learning_rate": 3.42478623915162e-05, + "loss": 1.8379, + "step": 20004 + }, + { + "epoch": 6.140270104358502, + "grad_norm": 0.23467600345611572, + "learning_rate": 3.424314502293096e-05, + "loss": 1.7755, + "step": 20005 + }, + { + "epoch": 6.140577041129528, + "grad_norm": 0.2047930657863617, + "learning_rate": 3.42384278100642e-05, + "loss": 1.7198, + "step": 20006 + }, + { + "epoch": 6.140883977900552, + "grad_norm": 0.1893673986196518, + "learning_rate": 3.423371075296253e-05, + "loss": 1.7318, + "step": 20007 + }, + { + "epoch": 6.1411909146715775, + "grad_norm": 0.21514710783958435, + "learning_rate": 3.422899385167259e-05, + "loss": 1.7499, + "step": 20008 + }, + { + "epoch": 6.141497851442603, + "grad_norm": 0.20030297338962555, + "learning_rate": 3.422427710624095e-05, + "loss": 1.7109, + "step": 20009 + }, + { + "epoch": 6.141804788213628, + "grad_norm": 0.23581266403198242, + "learning_rate": 3.421956051671426e-05, + "loss": 1.7834, + "step": 20010 + }, + { + "epoch": 6.1421117249846535, + "grad_norm": 0.22492484748363495, + "learning_rate": 3.421484408313911e-05, 
+ "loss": 1.785, + "step": 20011 + }, + { + "epoch": 6.142418661755678, + "grad_norm": 0.34137019515037537, + "learning_rate": 3.421012780556215e-05, + "loss": 1.8101, + "step": 20012 + }, + { + "epoch": 6.142725598526703, + "grad_norm": 0.28489169478416443, + "learning_rate": 3.420541168402994e-05, + "loss": 1.7945, + "step": 20013 + }, + { + "epoch": 6.143032535297729, + "grad_norm": 0.259362131357193, + "learning_rate": 3.420069571858913e-05, + "loss": 1.7011, + "step": 20014 + }, + { + "epoch": 6.143339472068754, + "grad_norm": 0.3628309667110443, + "learning_rate": 3.419597990928628e-05, + "loss": 1.8273, + "step": 20015 + }, + { + "epoch": 6.143646408839779, + "grad_norm": 0.22306841611862183, + "learning_rate": 3.419126425616803e-05, + "loss": 1.7447, + "step": 20016 + }, + { + "epoch": 6.143953345610804, + "grad_norm": 0.36336812376976013, + "learning_rate": 3.4186548759280964e-05, + "loss": 1.7076, + "step": 20017 + }, + { + "epoch": 6.144260282381829, + "grad_norm": 0.23167413473129272, + "learning_rate": 3.418183341867172e-05, + "loss": 1.6924, + "step": 20018 + }, + { + "epoch": 6.144567219152854, + "grad_norm": 0.2541113495826721, + "learning_rate": 3.417711823438686e-05, + "loss": 1.755, + "step": 20019 + }, + { + "epoch": 6.14487415592388, + "grad_norm": 0.3733784854412079, + "learning_rate": 3.4172403206472975e-05, + "loss": 1.7087, + "step": 20020 + }, + { + "epoch": 6.145181092694905, + "grad_norm": 0.1940508335828781, + "learning_rate": 3.416768833497669e-05, + "loss": 1.717, + "step": 20021 + }, + { + "epoch": 6.14548802946593, + "grad_norm": 0.2707524001598358, + "learning_rate": 3.416297361994457e-05, + "loss": 1.7422, + "step": 20022 + }, + { + "epoch": 6.145794966236955, + "grad_norm": 0.25535452365875244, + "learning_rate": 3.415825906142326e-05, + "loss": 1.6915, + "step": 20023 + }, + { + "epoch": 6.14610190300798, + "grad_norm": 0.24094220995903015, + "learning_rate": 3.415354465945929e-05, + "loss": 1.7192, + "step": 20024 + }, + { + 
"epoch": 6.1464088397790055, + "grad_norm": 0.28329676389694214, + "learning_rate": 3.4148830414099306e-05, + "loss": 1.7272, + "step": 20025 + }, + { + "epoch": 6.146715776550031, + "grad_norm": 0.217180535197258, + "learning_rate": 3.414411632538984e-05, + "loss": 1.7195, + "step": 20026 + }, + { + "epoch": 6.147022713321056, + "grad_norm": 0.22693867981433868, + "learning_rate": 3.413940239337753e-05, + "loss": 1.6889, + "step": 20027 + }, + { + "epoch": 6.147329650092081, + "grad_norm": 0.30376315116882324, + "learning_rate": 3.413468861810892e-05, + "loss": 1.7741, + "step": 20028 + }, + { + "epoch": 6.147636586863106, + "grad_norm": 0.1928185671567917, + "learning_rate": 3.412997499963065e-05, + "loss": 1.6986, + "step": 20029 + }, + { + "epoch": 6.147943523634131, + "grad_norm": 0.260929137468338, + "learning_rate": 3.412526153798924e-05, + "loss": 1.7044, + "step": 20030 + }, + { + "epoch": 6.148250460405157, + "grad_norm": 0.23274847865104675, + "learning_rate": 3.4120548233231326e-05, + "loss": 1.7626, + "step": 20031 + }, + { + "epoch": 6.148557397176182, + "grad_norm": 0.2389308512210846, + "learning_rate": 3.411583508540344e-05, + "loss": 1.71, + "step": 20032 + }, + { + "epoch": 6.148864333947207, + "grad_norm": 0.2745562195777893, + "learning_rate": 3.411112209455219e-05, + "loss": 1.7144, + "step": 20033 + }, + { + "epoch": 6.149171270718232, + "grad_norm": 0.2369096428155899, + "learning_rate": 3.4106409260724135e-05, + "loss": 1.7879, + "step": 20034 + }, + { + "epoch": 6.149478207489257, + "grad_norm": 0.3103141486644745, + "learning_rate": 3.4101696583965874e-05, + "loss": 1.7862, + "step": 20035 + }, + { + "epoch": 6.149785144260282, + "grad_norm": 0.18625277280807495, + "learning_rate": 3.409698406432397e-05, + "loss": 1.7717, + "step": 20036 + }, + { + "epoch": 6.150092081031308, + "grad_norm": 0.2539508640766144, + "learning_rate": 3.409227170184497e-05, + "loss": 1.7023, + "step": 20037 + }, + { + "epoch": 6.150399017802333, + "grad_norm": 
0.2185351699590683, + "learning_rate": 3.4087559496575474e-05, + "loss": 1.7283, + "step": 20038 + }, + { + "epoch": 6.150705954573358, + "grad_norm": 0.21225227415561676, + "learning_rate": 3.408284744856204e-05, + "loss": 1.7055, + "step": 20039 + }, + { + "epoch": 6.151012891344383, + "grad_norm": 0.23623189330101013, + "learning_rate": 3.407813555785125e-05, + "loss": 1.6862, + "step": 20040 + }, + { + "epoch": 6.151319828115408, + "grad_norm": 0.19061312079429626, + "learning_rate": 3.4073423824489634e-05, + "loss": 1.7501, + "step": 20041 + }, + { + "epoch": 6.151626764886434, + "grad_norm": 0.22176402807235718, + "learning_rate": 3.4068712248523804e-05, + "loss": 1.7417, + "step": 20042 + }, + { + "epoch": 6.151933701657459, + "grad_norm": 0.20093770325183868, + "learning_rate": 3.406400083000028e-05, + "loss": 1.7283, + "step": 20043 + }, + { + "epoch": 6.152240638428483, + "grad_norm": 0.21968910098075867, + "learning_rate": 3.4059289568965635e-05, + "loss": 1.7187, + "step": 20044 + }, + { + "epoch": 6.152547575199509, + "grad_norm": 0.19038841128349304, + "learning_rate": 3.4054578465466435e-05, + "loss": 1.7131, + "step": 20045 + }, + { + "epoch": 6.152854511970534, + "grad_norm": 0.2239457368850708, + "learning_rate": 3.404986751954925e-05, + "loss": 1.7643, + "step": 20046 + }, + { + "epoch": 6.153161448741559, + "grad_norm": 0.2357017546892166, + "learning_rate": 3.404515673126061e-05, + "loss": 1.7196, + "step": 20047 + }, + { + "epoch": 6.153468385512585, + "grad_norm": 0.2633310556411743, + "learning_rate": 3.4040446100647104e-05, + "loss": 1.7613, + "step": 20048 + }, + { + "epoch": 6.153775322283609, + "grad_norm": 0.28470975160598755, + "learning_rate": 3.403573562775524e-05, + "loss": 1.7564, + "step": 20049 + }, + { + "epoch": 6.1540822590546345, + "grad_norm": 0.37435805797576904, + "learning_rate": 3.40310253126316e-05, + "loss": 1.8365, + "step": 20050 + }, + { + "epoch": 6.15438919582566, + "grad_norm": 0.1706259697675705, + 
"learning_rate": 3.402631515532272e-05, + "loss": 1.7373, + "step": 20051 + }, + { + "epoch": 6.154696132596685, + "grad_norm": 0.30885928869247437, + "learning_rate": 3.402160515587518e-05, + "loss": 1.7152, + "step": 20052 + }, + { + "epoch": 6.1550030693677105, + "grad_norm": 0.21448500454425812, + "learning_rate": 3.40168953143355e-05, + "loss": 1.7463, + "step": 20053 + }, + { + "epoch": 6.155310006138736, + "grad_norm": 0.23774586617946625, + "learning_rate": 3.4012185630750204e-05, + "loss": 1.7268, + "step": 20054 + }, + { + "epoch": 6.15561694290976, + "grad_norm": 0.1943385899066925, + "learning_rate": 3.400747610516588e-05, + "loss": 1.6578, + "step": 20055 + }, + { + "epoch": 6.155923879680786, + "grad_norm": 0.27488210797309875, + "learning_rate": 3.400276673762903e-05, + "loss": 1.8204, + "step": 20056 + }, + { + "epoch": 6.156230816451811, + "grad_norm": 0.1871461570262909, + "learning_rate": 3.3998057528186244e-05, + "loss": 1.6775, + "step": 20057 + }, + { + "epoch": 6.156537753222836, + "grad_norm": 0.23566775023937225, + "learning_rate": 3.399334847688401e-05, + "loss": 1.7089, + "step": 20058 + }, + { + "epoch": 6.156844689993862, + "grad_norm": 0.26842471957206726, + "learning_rate": 3.398863958376891e-05, + "loss": 1.7554, + "step": 20059 + }, + { + "epoch": 6.157151626764886, + "grad_norm": 0.19267809391021729, + "learning_rate": 3.3983930848887435e-05, + "loss": 1.6709, + "step": 20060 + }, + { + "epoch": 6.157458563535911, + "grad_norm": 0.21130084991455078, + "learning_rate": 3.3979222272286156e-05, + "loss": 1.7312, + "step": 20061 + }, + { + "epoch": 6.157765500306937, + "grad_norm": 0.2322172224521637, + "learning_rate": 3.397451385401158e-05, + "loss": 1.8069, + "step": 20062 + }, + { + "epoch": 6.158072437077962, + "grad_norm": 0.21852418780326843, + "learning_rate": 3.396980559411027e-05, + "loss": 1.715, + "step": 20063 + }, + { + "epoch": 6.158379373848987, + "grad_norm": 0.21385829150676727, + "learning_rate": 
3.3965097492628714e-05, + "loss": 1.6804, + "step": 20064 + }, + { + "epoch": 6.158686310620013, + "grad_norm": 0.21639080345630646, + "learning_rate": 3.3960389549613494e-05, + "loss": 1.655, + "step": 20065 + }, + { + "epoch": 6.158993247391037, + "grad_norm": 0.19219942390918732, + "learning_rate": 3.395568176511107e-05, + "loss": 1.7325, + "step": 20066 + }, + { + "epoch": 6.1593001841620625, + "grad_norm": 0.21853557229042053, + "learning_rate": 3.3950974139168024e-05, + "loss": 1.7204, + "step": 20067 + }, + { + "epoch": 6.159607120933088, + "grad_norm": 0.24144381284713745, + "learning_rate": 3.3946266671830854e-05, + "loss": 1.754, + "step": 20068 + }, + { + "epoch": 6.159914057704113, + "grad_norm": 0.2014230340719223, + "learning_rate": 3.394155936314609e-05, + "loss": 1.6905, + "step": 20069 + }, + { + "epoch": 6.1602209944751385, + "grad_norm": 0.26940762996673584, + "learning_rate": 3.393685221316025e-05, + "loss": 1.729, + "step": 20070 + }, + { + "epoch": 6.160527931246163, + "grad_norm": 0.1937808394432068, + "learning_rate": 3.3932145221919843e-05, + "loss": 1.7492, + "step": 20071 + }, + { + "epoch": 6.160834868017188, + "grad_norm": 0.2586243450641632, + "learning_rate": 3.39274383894714e-05, + "loss": 1.7706, + "step": 20072 + }, + { + "epoch": 6.161141804788214, + "grad_norm": 0.21995361149311066, + "learning_rate": 3.3922731715861416e-05, + "loss": 1.7716, + "step": 20073 + }, + { + "epoch": 6.161448741559239, + "grad_norm": 0.22915497422218323, + "learning_rate": 3.391802520113645e-05, + "loss": 1.716, + "step": 20074 + }, + { + "epoch": 6.161755678330264, + "grad_norm": 0.24317315220832825, + "learning_rate": 3.3913318845342956e-05, + "loss": 1.7392, + "step": 20075 + }, + { + "epoch": 6.162062615101289, + "grad_norm": 0.20439307391643524, + "learning_rate": 3.390861264852749e-05, + "loss": 1.7076, + "step": 20076 + }, + { + "epoch": 6.162369551872314, + "grad_norm": 0.2197176069021225, + "learning_rate": 3.3903906610736534e-05, + "loss": 
1.7334, + "step": 20077 + }, + { + "epoch": 6.162676488643339, + "grad_norm": 0.21651993691921234, + "learning_rate": 3.389920073201662e-05, + "loss": 1.7651, + "step": 20078 + }, + { + "epoch": 6.162983425414365, + "grad_norm": 0.1999540627002716, + "learning_rate": 3.389449501241424e-05, + "loss": 1.7031, + "step": 20079 + }, + { + "epoch": 6.16329036218539, + "grad_norm": 0.21965044736862183, + "learning_rate": 3.38897894519759e-05, + "loss": 1.7243, + "step": 20080 + }, + { + "epoch": 6.163597298956415, + "grad_norm": 0.20127563178539276, + "learning_rate": 3.388508405074808e-05, + "loss": 1.693, + "step": 20081 + }, + { + "epoch": 6.16390423572744, + "grad_norm": 0.2143397182226181, + "learning_rate": 3.3880378808777336e-05, + "loss": 1.7304, + "step": 20082 + }, + { + "epoch": 6.164211172498465, + "grad_norm": 0.23116083443164825, + "learning_rate": 3.387567372611012e-05, + "loss": 1.7558, + "step": 20083 + }, + { + "epoch": 6.1645181092694905, + "grad_norm": 0.25513985753059387, + "learning_rate": 3.3870968802792946e-05, + "loss": 1.7169, + "step": 20084 + }, + { + "epoch": 6.164825046040516, + "grad_norm": 0.20549121499061584, + "learning_rate": 3.386626403887232e-05, + "loss": 1.7147, + "step": 20085 + }, + { + "epoch": 6.165131982811541, + "grad_norm": 0.2850625514984131, + "learning_rate": 3.386155943439473e-05, + "loss": 1.7865, + "step": 20086 + }, + { + "epoch": 6.165438919582566, + "grad_norm": 0.2689895033836365, + "learning_rate": 3.3856854989406675e-05, + "loss": 1.7576, + "step": 20087 + }, + { + "epoch": 6.165745856353591, + "grad_norm": 0.21677634119987488, + "learning_rate": 3.385215070395462e-05, + "loss": 1.7186, + "step": 20088 + }, + { + "epoch": 6.166052793124616, + "grad_norm": 0.19525155425071716, + "learning_rate": 3.384744657808509e-05, + "loss": 1.6713, + "step": 20089 + }, + { + "epoch": 6.166359729895642, + "grad_norm": 0.23097296059131622, + "learning_rate": 3.3842742611844555e-05, + "loss": 1.6975, + "step": 20090 + }, + { + 
"epoch": 6.166666666666667, + "grad_norm": 0.22210827469825745, + "learning_rate": 3.3838038805279516e-05, + "loss": 1.733, + "step": 20091 + }, + { + "epoch": 6.166973603437691, + "grad_norm": 0.3336607813835144, + "learning_rate": 3.383333515843643e-05, + "loss": 1.7441, + "step": 20092 + }, + { + "epoch": 6.167280540208717, + "grad_norm": 0.25274014472961426, + "learning_rate": 3.382863167136183e-05, + "loss": 1.7235, + "step": 20093 + }, + { + "epoch": 6.167587476979742, + "grad_norm": 0.3228790760040283, + "learning_rate": 3.3823928344102144e-05, + "loss": 1.8096, + "step": 20094 + }, + { + "epoch": 6.167894413750767, + "grad_norm": 0.34542208909988403, + "learning_rate": 3.381922517670389e-05, + "loss": 1.7431, + "step": 20095 + }, + { + "epoch": 6.168201350521793, + "grad_norm": 0.1921117901802063, + "learning_rate": 3.381452216921355e-05, + "loss": 1.787, + "step": 20096 + }, + { + "epoch": 6.168508287292818, + "grad_norm": 0.29019802808761597, + "learning_rate": 3.380981932167757e-05, + "loss": 1.7122, + "step": 20097 + }, + { + "epoch": 6.1688152240638425, + "grad_norm": 0.17999929189682007, + "learning_rate": 3.380511663414244e-05, + "loss": 1.7153, + "step": 20098 + }, + { + "epoch": 6.169122160834868, + "grad_norm": 0.2641841471195221, + "learning_rate": 3.380041410665466e-05, + "loss": 1.7317, + "step": 20099 + }, + { + "epoch": 6.169429097605893, + "grad_norm": 0.25492918491363525, + "learning_rate": 3.379571173926067e-05, + "loss": 1.6975, + "step": 20100 + }, + { + "epoch": 6.1697360343769185, + "grad_norm": 0.2554764151573181, + "learning_rate": 3.379100953200697e-05, + "loss": 1.7539, + "step": 20101 + }, + { + "epoch": 6.170042971147944, + "grad_norm": 0.2339072823524475, + "learning_rate": 3.378630748493999e-05, + "loss": 1.6871, + "step": 20102 + }, + { + "epoch": 6.170349907918968, + "grad_norm": 0.19663162529468536, + "learning_rate": 3.3781605598106236e-05, + "loss": 1.7419, + "step": 20103 + }, + { + "epoch": 6.170656844689994, + 
"grad_norm": 0.2479846328496933, + "learning_rate": 3.3776903871552166e-05, + "loss": 1.7849, + "step": 20104 + }, + { + "epoch": 6.170963781461019, + "grad_norm": 0.18630735576152802, + "learning_rate": 3.377220230532423e-05, + "loss": 1.7412, + "step": 20105 + }, + { + "epoch": 6.171270718232044, + "grad_norm": 0.2211095094680786, + "learning_rate": 3.376750089946892e-05, + "loss": 1.7445, + "step": 20106 + }, + { + "epoch": 6.17157765500307, + "grad_norm": 0.20783299207687378, + "learning_rate": 3.3762799654032653e-05, + "loss": 1.7346, + "step": 20107 + }, + { + "epoch": 6.171884591774095, + "grad_norm": 0.18022862076759338, + "learning_rate": 3.3758098569061934e-05, + "loss": 1.7083, + "step": 20108 + }, + { + "epoch": 6.172191528545119, + "grad_norm": 0.23707088828086853, + "learning_rate": 3.375339764460319e-05, + "loss": 1.8542, + "step": 20109 + }, + { + "epoch": 6.172498465316145, + "grad_norm": 0.2289234846830368, + "learning_rate": 3.3748696880702913e-05, + "loss": 1.7564, + "step": 20110 + }, + { + "epoch": 6.17280540208717, + "grad_norm": 0.28396767377853394, + "learning_rate": 3.374399627740752e-05, + "loss": 1.7349, + "step": 20111 + }, + { + "epoch": 6.173112338858195, + "grad_norm": 0.20154817402362823, + "learning_rate": 3.373929583476351e-05, + "loss": 1.7356, + "step": 20112 + }, + { + "epoch": 6.173419275629221, + "grad_norm": 0.22590605914592743, + "learning_rate": 3.373459555281728e-05, + "loss": 1.7291, + "step": 20113 + }, + { + "epoch": 6.173726212400245, + "grad_norm": 0.2145034223794937, + "learning_rate": 3.372989543161532e-05, + "loss": 1.7544, + "step": 20114 + }, + { + "epoch": 6.1740331491712706, + "grad_norm": 0.26797109842300415, + "learning_rate": 3.372519547120407e-05, + "loss": 1.743, + "step": 20115 + }, + { + "epoch": 6.174340085942296, + "grad_norm": 0.2795363664627075, + "learning_rate": 3.372049567162999e-05, + "loss": 1.7278, + "step": 20116 + }, + { + "epoch": 6.174647022713321, + "grad_norm": 0.21436716616153717, + 
"learning_rate": 3.3715796032939494e-05, + "loss": 1.7306, + "step": 20117 + }, + { + "epoch": 6.1749539594843466, + "grad_norm": 0.2593919336795807, + "learning_rate": 3.3711096555179064e-05, + "loss": 1.7323, + "step": 20118 + }, + { + "epoch": 6.175260896255371, + "grad_norm": 0.19639115035533905, + "learning_rate": 3.3706397238395124e-05, + "loss": 1.7444, + "step": 20119 + }, + { + "epoch": 6.175567833026396, + "grad_norm": 0.23408278822898865, + "learning_rate": 3.370169808263409e-05, + "loss": 1.7461, + "step": 20120 + }, + { + "epoch": 6.175874769797422, + "grad_norm": 0.21200022101402283, + "learning_rate": 3.369699908794246e-05, + "loss": 1.7588, + "step": 20121 + }, + { + "epoch": 6.176181706568447, + "grad_norm": 0.17609953880310059, + "learning_rate": 3.369230025436662e-05, + "loss": 1.6608, + "step": 20122 + }, + { + "epoch": 6.176488643339472, + "grad_norm": 0.19895964860916138, + "learning_rate": 3.3687601581953046e-05, + "loss": 1.729, + "step": 20123 + }, + { + "epoch": 6.176795580110497, + "grad_norm": 0.22833310067653656, + "learning_rate": 3.368290307074814e-05, + "loss": 1.7148, + "step": 20124 + }, + { + "epoch": 6.177102516881522, + "grad_norm": 0.1847219169139862, + "learning_rate": 3.367820472079835e-05, + "loss": 1.6894, + "step": 20125 + }, + { + "epoch": 6.1774094536525475, + "grad_norm": 0.20269884169101715, + "learning_rate": 3.36735065321501e-05, + "loss": 1.794, + "step": 20126 + }, + { + "epoch": 6.177716390423573, + "grad_norm": 0.19277122616767883, + "learning_rate": 3.3668808504849845e-05, + "loss": 1.6936, + "step": 20127 + }, + { + "epoch": 6.178023327194598, + "grad_norm": 0.23804394900798798, + "learning_rate": 3.3664110638943985e-05, + "loss": 1.746, + "step": 20128 + }, + { + "epoch": 6.1783302639656235, + "grad_norm": 0.20946018397808075, + "learning_rate": 3.365941293447897e-05, + "loss": 1.6952, + "step": 20129 + }, + { + "epoch": 6.178637200736648, + "grad_norm": 0.21680596470832825, + "learning_rate": 
3.36547153915012e-05, + "loss": 1.7709, + "step": 20130 + }, + { + "epoch": 6.178944137507673, + "grad_norm": 0.22549709677696228, + "learning_rate": 3.365001801005712e-05, + "loss": 1.6814, + "step": 20131 + }, + { + "epoch": 6.179251074278699, + "grad_norm": 0.20660072565078735, + "learning_rate": 3.3645320790193136e-05, + "loss": 1.6992, + "step": 20132 + }, + { + "epoch": 6.179558011049724, + "grad_norm": 0.23697195947170258, + "learning_rate": 3.36406237319557e-05, + "loss": 1.7325, + "step": 20133 + }, + { + "epoch": 6.179864947820749, + "grad_norm": 0.20847748219966888, + "learning_rate": 3.363592683539118e-05, + "loss": 1.7066, + "step": 20134 + }, + { + "epoch": 6.180171884591774, + "grad_norm": 0.24317312240600586, + "learning_rate": 3.363123010054605e-05, + "loss": 1.7259, + "step": 20135 + }, + { + "epoch": 6.180478821362799, + "grad_norm": 0.22137925028800964, + "learning_rate": 3.3626533527466686e-05, + "loss": 1.7492, + "step": 20136 + }, + { + "epoch": 6.180785758133824, + "grad_norm": 0.23857460916042328, + "learning_rate": 3.362183711619951e-05, + "loss": 1.6671, + "step": 20137 + }, + { + "epoch": 6.18109269490485, + "grad_norm": 0.20017468929290771, + "learning_rate": 3.361714086679095e-05, + "loss": 1.7151, + "step": 20138 + }, + { + "epoch": 6.181399631675875, + "grad_norm": 0.21566617488861084, + "learning_rate": 3.361244477928739e-05, + "loss": 1.7659, + "step": 20139 + }, + { + "epoch": 6.1817065684469, + "grad_norm": 0.21695555746555328, + "learning_rate": 3.360774885373528e-05, + "loss": 1.7463, + "step": 20140 + }, + { + "epoch": 6.182013505217925, + "grad_norm": 0.19326116144657135, + "learning_rate": 3.360305309018098e-05, + "loss": 1.7182, + "step": 20141 + }, + { + "epoch": 6.18232044198895, + "grad_norm": 0.2135429084300995, + "learning_rate": 3.359835748867093e-05, + "loss": 1.8001, + "step": 20142 + }, + { + "epoch": 6.1826273787599755, + "grad_norm": 0.20097343623638153, + "learning_rate": 3.359366204925151e-05, + "loss": 1.7442, 
+ "step": 20143 + }, + { + "epoch": 6.182934315531001, + "grad_norm": 0.212847501039505, + "learning_rate": 3.358896677196916e-05, + "loss": 1.7418, + "step": 20144 + }, + { + "epoch": 6.183241252302026, + "grad_norm": 0.18414677679538727, + "learning_rate": 3.358427165687024e-05, + "loss": 1.6813, + "step": 20145 + }, + { + "epoch": 6.183548189073051, + "grad_norm": 0.23170427978038788, + "learning_rate": 3.357957670400119e-05, + "loss": 1.7722, + "step": 20146 + }, + { + "epoch": 6.183855125844076, + "grad_norm": 0.28952550888061523, + "learning_rate": 3.357488191340837e-05, + "loss": 1.7785, + "step": 20147 + }, + { + "epoch": 6.184162062615101, + "grad_norm": 0.2126605361700058, + "learning_rate": 3.35701872851382e-05, + "loss": 1.7064, + "step": 20148 + }, + { + "epoch": 6.184468999386127, + "grad_norm": 0.2376919537782669, + "learning_rate": 3.356549281923706e-05, + "loss": 1.7322, + "step": 20149 + }, + { + "epoch": 6.184775936157152, + "grad_norm": 0.24168729782104492, + "learning_rate": 3.3560798515751375e-05, + "loss": 1.7296, + "step": 20150 + }, + { + "epoch": 6.185082872928176, + "grad_norm": 0.19746467471122742, + "learning_rate": 3.355610437472749e-05, + "loss": 1.7816, + "step": 20151 + }, + { + "epoch": 6.185389809699202, + "grad_norm": 0.2399774193763733, + "learning_rate": 3.3551410396211844e-05, + "loss": 1.7309, + "step": 20152 + }, + { + "epoch": 6.185696746470227, + "grad_norm": 0.20560777187347412, + "learning_rate": 3.3546716580250785e-05, + "loss": 1.7134, + "step": 20153 + }, + { + "epoch": 6.186003683241252, + "grad_norm": 0.22640523314476013, + "learning_rate": 3.354202292689072e-05, + "loss": 1.7572, + "step": 20154 + }, + { + "epoch": 6.186310620012278, + "grad_norm": 0.20796974003314972, + "learning_rate": 3.353732943617803e-05, + "loss": 1.6897, + "step": 20155 + }, + { + "epoch": 6.186617556783303, + "grad_norm": 0.19902797043323517, + "learning_rate": 3.35326361081591e-05, + "loss": 1.6836, + "step": 20156 + }, + { + "epoch": 
6.1869244935543275, + "grad_norm": 0.30999818444252014, + "learning_rate": 3.352794294288032e-05, + "loss": 1.7704, + "step": 20157 + }, + { + "epoch": 6.187231430325353, + "grad_norm": 0.20634675025939941, + "learning_rate": 3.3523249940388045e-05, + "loss": 1.7599, + "step": 20158 + }, + { + "epoch": 6.187538367096378, + "grad_norm": 0.25650453567504883, + "learning_rate": 3.3518557100728674e-05, + "loss": 1.7441, + "step": 20159 + }, + { + "epoch": 6.1878453038674035, + "grad_norm": 0.2400079369544983, + "learning_rate": 3.351386442394858e-05, + "loss": 1.6836, + "step": 20160 + }, + { + "epoch": 6.188152240638429, + "grad_norm": 0.23734217882156372, + "learning_rate": 3.350917191009416e-05, + "loss": 1.7, + "step": 20161 + }, + { + "epoch": 6.188459177409453, + "grad_norm": 0.29579323530197144, + "learning_rate": 3.3504479559211755e-05, + "loss": 1.71, + "step": 20162 + }, + { + "epoch": 6.188766114180479, + "grad_norm": 0.18999184668064117, + "learning_rate": 3.349978737134776e-05, + "loss": 1.7396, + "step": 20163 + }, + { + "epoch": 6.189073050951504, + "grad_norm": 0.26760223507881165, + "learning_rate": 3.3495095346548525e-05, + "loss": 1.7846, + "step": 20164 + }, + { + "epoch": 6.189379987722529, + "grad_norm": 0.18416397273540497, + "learning_rate": 3.349040348486044e-05, + "loss": 1.6911, + "step": 20165 + }, + { + "epoch": 6.189686924493555, + "grad_norm": 0.23761679232120514, + "learning_rate": 3.348571178632986e-05, + "loss": 1.6776, + "step": 20166 + }, + { + "epoch": 6.189993861264579, + "grad_norm": 0.2056473195552826, + "learning_rate": 3.348102025100316e-05, + "loss": 1.697, + "step": 20167 + }, + { + "epoch": 6.190300798035604, + "grad_norm": 0.23916250467300415, + "learning_rate": 3.3476328878926685e-05, + "loss": 1.7943, + "step": 20168 + }, + { + "epoch": 6.19060773480663, + "grad_norm": 0.2205415964126587, + "learning_rate": 3.347163767014684e-05, + "loss": 1.8037, + "step": 20169 + }, + { + "epoch": 6.190914671577655, + "grad_norm": 
0.28907346725463867, + "learning_rate": 3.346694662470995e-05, + "loss": 1.6875, + "step": 20170 + }, + { + "epoch": 6.19122160834868, + "grad_norm": 0.2382480502128601, + "learning_rate": 3.3462255742662364e-05, + "loss": 1.7116, + "step": 20171 + }, + { + "epoch": 6.191528545119706, + "grad_norm": 0.25309205055236816, + "learning_rate": 3.3457565024050485e-05, + "loss": 1.7584, + "step": 20172 + }, + { + "epoch": 6.19183548189073, + "grad_norm": 0.3959091901779175, + "learning_rate": 3.3452874468920626e-05, + "loss": 1.7054, + "step": 20173 + }, + { + "epoch": 6.1921424186617555, + "grad_norm": 0.22697016596794128, + "learning_rate": 3.344818407731918e-05, + "loss": 1.7373, + "step": 20174 + }, + { + "epoch": 6.192449355432781, + "grad_norm": 0.298178493976593, + "learning_rate": 3.3443493849292465e-05, + "loss": 1.7192, + "step": 20175 + }, + { + "epoch": 6.192756292203806, + "grad_norm": 0.2742854058742523, + "learning_rate": 3.343880378488685e-05, + "loss": 1.7538, + "step": 20176 + }, + { + "epoch": 6.1930632289748315, + "grad_norm": 0.23367546498775482, + "learning_rate": 3.343411388414867e-05, + "loss": 1.694, + "step": 20177 + }, + { + "epoch": 6.193370165745856, + "grad_norm": 0.2932305932044983, + "learning_rate": 3.342942414712431e-05, + "loss": 1.7291, + "step": 20178 + }, + { + "epoch": 6.193677102516881, + "grad_norm": 0.24306413531303406, + "learning_rate": 3.342473457386007e-05, + "loss": 1.6959, + "step": 20179 + }, + { + "epoch": 6.193984039287907, + "grad_norm": 0.30828577280044556, + "learning_rate": 3.3420045164402344e-05, + "loss": 1.6848, + "step": 20180 + }, + { + "epoch": 6.194290976058932, + "grad_norm": 0.18766994774341583, + "learning_rate": 3.341535591879743e-05, + "loss": 1.7261, + "step": 20181 + }, + { + "epoch": 6.194597912829957, + "grad_norm": 0.300778329372406, + "learning_rate": 3.3410666837091696e-05, + "loss": 1.7539, + "step": 20182 + }, + { + "epoch": 6.194904849600983, + "grad_norm": 0.20148977637290955, + "learning_rate": 
3.340597791933147e-05, + "loss": 1.7496, + "step": 20183 + }, + { + "epoch": 6.195211786372007, + "grad_norm": 0.2746329605579376, + "learning_rate": 3.340128916556311e-05, + "loss": 1.6458, + "step": 20184 + }, + { + "epoch": 6.195518723143032, + "grad_norm": 0.2715265452861786, + "learning_rate": 3.339660057583292e-05, + "loss": 1.7799, + "step": 20185 + }, + { + "epoch": 6.195825659914058, + "grad_norm": 0.2145555317401886, + "learning_rate": 3.339191215018728e-05, + "loss": 1.6854, + "step": 20186 + }, + { + "epoch": 6.196132596685083, + "grad_norm": 0.3018960654735565, + "learning_rate": 3.338722388867248e-05, + "loss": 1.7569, + "step": 20187 + }, + { + "epoch": 6.196439533456108, + "grad_norm": 0.24876931309700012, + "learning_rate": 3.338253579133487e-05, + "loss": 1.7434, + "step": 20188 + }, + { + "epoch": 6.196746470227133, + "grad_norm": 0.3609273433685303, + "learning_rate": 3.337784785822079e-05, + "loss": 1.737, + "step": 20189 + }, + { + "epoch": 6.197053406998158, + "grad_norm": 0.21586830914020538, + "learning_rate": 3.337316008937655e-05, + "loss": 1.7553, + "step": 20190 + }, + { + "epoch": 6.1973603437691835, + "grad_norm": 0.23542988300323486, + "learning_rate": 3.3368472484848504e-05, + "loss": 1.7174, + "step": 20191 + }, + { + "epoch": 6.197667280540209, + "grad_norm": 0.19861294329166412, + "learning_rate": 3.336378504468294e-05, + "loss": 1.7268, + "step": 20192 + }, + { + "epoch": 6.197974217311234, + "grad_norm": 0.26865682005882263, + "learning_rate": 3.335909776892622e-05, + "loss": 1.7656, + "step": 20193 + }, + { + "epoch": 6.198281154082259, + "grad_norm": 0.343078076839447, + "learning_rate": 3.3354410657624624e-05, + "loss": 1.734, + "step": 20194 + }, + { + "epoch": 6.198588090853284, + "grad_norm": 0.21613667905330658, + "learning_rate": 3.334972371082453e-05, + "loss": 1.7777, + "step": 20195 + }, + { + "epoch": 6.198895027624309, + "grad_norm": 0.22268854081630707, + "learning_rate": 3.3345036928572207e-05, + "loss": 1.667, + 
"step": 20196 + }, + { + "epoch": 6.199201964395335, + "grad_norm": 0.22870087623596191, + "learning_rate": 3.3340350310914e-05, + "loss": 1.7532, + "step": 20197 + }, + { + "epoch": 6.19950890116636, + "grad_norm": 0.1969831883907318, + "learning_rate": 3.3335663857896205e-05, + "loss": 1.7821, + "step": 20198 + }, + { + "epoch": 6.199815837937384, + "grad_norm": 0.20414133369922638, + "learning_rate": 3.3330977569565154e-05, + "loss": 1.7449, + "step": 20199 + }, + { + "epoch": 6.20012277470841, + "grad_norm": 0.21947748959064484, + "learning_rate": 3.332629144596714e-05, + "loss": 1.6888, + "step": 20200 + }, + { + "epoch": 6.200429711479435, + "grad_norm": 0.20943035185337067, + "learning_rate": 3.332160548714851e-05, + "loss": 1.7278, + "step": 20201 + }, + { + "epoch": 6.2007366482504604, + "grad_norm": 0.22410117089748383, + "learning_rate": 3.331691969315553e-05, + "loss": 1.721, + "step": 20202 + }, + { + "epoch": 6.201043585021486, + "grad_norm": 0.21422281861305237, + "learning_rate": 3.3312234064034555e-05, + "loss": 1.7199, + "step": 20203 + }, + { + "epoch": 6.201350521792511, + "grad_norm": 0.21021418273448944, + "learning_rate": 3.330754859983184e-05, + "loss": 1.7972, + "step": 20204 + }, + { + "epoch": 6.201657458563536, + "grad_norm": 0.21155185997486115, + "learning_rate": 3.330286330059371e-05, + "loss": 1.7463, + "step": 20205 + }, + { + "epoch": 6.201964395334561, + "grad_norm": 0.20241162180900574, + "learning_rate": 3.329817816636649e-05, + "loss": 1.7804, + "step": 20206 + }, + { + "epoch": 6.202271332105586, + "grad_norm": 0.19882376492023468, + "learning_rate": 3.329349319719644e-05, + "loss": 1.7564, + "step": 20207 + }, + { + "epoch": 6.202578268876612, + "grad_norm": 0.20528686046600342, + "learning_rate": 3.328880839312991e-05, + "loss": 1.751, + "step": 20208 + }, + { + "epoch": 6.202885205647637, + "grad_norm": 0.2708488404750824, + "learning_rate": 3.328412375421315e-05, + "loss": 1.8008, + "step": 20209 + }, + { + "epoch": 
6.203192142418661, + "grad_norm": 0.1986229121685028, + "learning_rate": 3.3279439280492486e-05, + "loss": 1.6833, + "step": 20210 + }, + { + "epoch": 6.203499079189687, + "grad_norm": 0.2700355350971222, + "learning_rate": 3.3274754972014186e-05, + "loss": 1.8071, + "step": 20211 + }, + { + "epoch": 6.203806015960712, + "grad_norm": 0.23060421645641327, + "learning_rate": 3.327007082882458e-05, + "loss": 1.6856, + "step": 20212 + }, + { + "epoch": 6.204112952731737, + "grad_norm": 0.20798510313034058, + "learning_rate": 3.3265386850969926e-05, + "loss": 1.7421, + "step": 20213 + }, + { + "epoch": 6.204419889502763, + "grad_norm": 0.21828265488147736, + "learning_rate": 3.3260703038496556e-05, + "loss": 1.7212, + "step": 20214 + }, + { + "epoch": 6.204726826273788, + "grad_norm": 0.1965378224849701, + "learning_rate": 3.325601939145069e-05, + "loss": 1.6987, + "step": 20215 + }, + { + "epoch": 6.2050337630448125, + "grad_norm": 0.23897121846675873, + "learning_rate": 3.325133590987868e-05, + "loss": 1.7501, + "step": 20216 + }, + { + "epoch": 6.205340699815838, + "grad_norm": 0.18647781014442444, + "learning_rate": 3.324665259382676e-05, + "loss": 1.688, + "step": 20217 + }, + { + "epoch": 6.205647636586863, + "grad_norm": 0.19906121492385864, + "learning_rate": 3.324196944334127e-05, + "loss": 1.749, + "step": 20218 + }, + { + "epoch": 6.2059545733578885, + "grad_norm": 0.2061154991388321, + "learning_rate": 3.3237286458468444e-05, + "loss": 1.757, + "step": 20219 + }, + { + "epoch": 6.206261510128914, + "grad_norm": 0.19410182535648346, + "learning_rate": 3.323260363925459e-05, + "loss": 1.6826, + "step": 20220 + }, + { + "epoch": 6.206568446899938, + "grad_norm": 0.2017979919910431, + "learning_rate": 3.322792098574597e-05, + "loss": 1.7568, + "step": 20221 + }, + { + "epoch": 6.206875383670964, + "grad_norm": 0.19491736590862274, + "learning_rate": 3.322323849798885e-05, + "loss": 1.7082, + "step": 20222 + }, + { + "epoch": 6.207182320441989, + "grad_norm": 
0.19826333224773407, + "learning_rate": 3.321855617602954e-05, + "loss": 1.7654, + "step": 20223 + }, + { + "epoch": 6.207489257213014, + "grad_norm": 0.18185383081436157, + "learning_rate": 3.321387401991428e-05, + "loss": 1.6826, + "step": 20224 + }, + { + "epoch": 6.20779619398404, + "grad_norm": 0.22402678430080414, + "learning_rate": 3.320919202968937e-05, + "loss": 1.795, + "step": 20225 + }, + { + "epoch": 6.208103130755064, + "grad_norm": 0.201541468501091, + "learning_rate": 3.320451020540105e-05, + "loss": 1.6838, + "step": 20226 + }, + { + "epoch": 6.208410067526089, + "grad_norm": 0.25479504466056824, + "learning_rate": 3.3199828547095616e-05, + "loss": 1.7881, + "step": 20227 + }, + { + "epoch": 6.208717004297115, + "grad_norm": 0.2057993859052658, + "learning_rate": 3.31951470548193e-05, + "loss": 1.737, + "step": 20228 + }, + { + "epoch": 6.20902394106814, + "grad_norm": 0.183469757437706, + "learning_rate": 3.319046572861842e-05, + "loss": 1.6989, + "step": 20229 + }, + { + "epoch": 6.209330877839165, + "grad_norm": 0.21723738312721252, + "learning_rate": 3.318578456853919e-05, + "loss": 1.7537, + "step": 20230 + }, + { + "epoch": 6.209637814610191, + "grad_norm": 0.21919457614421844, + "learning_rate": 3.318110357462791e-05, + "loss": 1.7444, + "step": 20231 + }, + { + "epoch": 6.209944751381215, + "grad_norm": 0.17009909451007843, + "learning_rate": 3.317642274693081e-05, + "loss": 1.6885, + "step": 20232 + }, + { + "epoch": 6.2102516881522405, + "grad_norm": 0.19625195860862732, + "learning_rate": 3.317174208549416e-05, + "loss": 1.7255, + "step": 20233 + }, + { + "epoch": 6.210558624923266, + "grad_norm": 0.2131364941596985, + "learning_rate": 3.316706159036422e-05, + "loss": 1.7047, + "step": 20234 + }, + { + "epoch": 6.210865561694291, + "grad_norm": 0.18454425036907196, + "learning_rate": 3.316238126158725e-05, + "loss": 1.7536, + "step": 20235 + }, + { + "epoch": 6.2111724984653165, + "grad_norm": 0.2124820202589035, + "learning_rate": 
3.3157701099209485e-05, + "loss": 1.7456, + "step": 20236 + }, + { + "epoch": 6.211479435236341, + "grad_norm": 0.1929594725370407, + "learning_rate": 3.3153021103277206e-05, + "loss": 1.7118, + "step": 20237 + }, + { + "epoch": 6.211786372007366, + "grad_norm": 0.19876480102539062, + "learning_rate": 3.314834127383664e-05, + "loss": 1.6855, + "step": 20238 + }, + { + "epoch": 6.212093308778392, + "grad_norm": 0.18902665376663208, + "learning_rate": 3.314366161093403e-05, + "loss": 1.7052, + "step": 20239 + }, + { + "epoch": 6.212400245549417, + "grad_norm": 0.1859758198261261, + "learning_rate": 3.313898211461566e-05, + "loss": 1.7277, + "step": 20240 + }, + { + "epoch": 6.212707182320442, + "grad_norm": 0.2160472422838211, + "learning_rate": 3.313430278492773e-05, + "loss": 1.6787, + "step": 20241 + }, + { + "epoch": 6.213014119091467, + "grad_norm": 0.24482262134552002, + "learning_rate": 3.312962362191652e-05, + "loss": 1.7439, + "step": 20242 + }, + { + "epoch": 6.213321055862492, + "grad_norm": 0.2343531847000122, + "learning_rate": 3.312494462562824e-05, + "loss": 1.7981, + "step": 20243 + }, + { + "epoch": 6.213627992633517, + "grad_norm": 0.2385960817337036, + "learning_rate": 3.3120265796109163e-05, + "loss": 1.7144, + "step": 20244 + }, + { + "epoch": 6.213934929404543, + "grad_norm": 0.21878042817115784, + "learning_rate": 3.3115587133405503e-05, + "loss": 1.7057, + "step": 20245 + }, + { + "epoch": 6.214241866175568, + "grad_norm": 0.23426075279712677, + "learning_rate": 3.311090863756351e-05, + "loss": 1.7372, + "step": 20246 + }, + { + "epoch": 6.214548802946593, + "grad_norm": 0.2369524985551834, + "learning_rate": 3.310623030862942e-05, + "loss": 1.7502, + "step": 20247 + }, + { + "epoch": 6.214855739717618, + "grad_norm": 0.31635788083076477, + "learning_rate": 3.3101552146649474e-05, + "loss": 1.7616, + "step": 20248 + }, + { + "epoch": 6.215162676488643, + "grad_norm": 0.2312999814748764, + "learning_rate": 3.309687415166986e-05, + "loss": 
1.6991, + "step": 20249 + }, + { + "epoch": 6.2154696132596685, + "grad_norm": 0.23423358798027039, + "learning_rate": 3.309219632373688e-05, + "loss": 1.7737, + "step": 20250 + }, + { + "epoch": 6.215776550030694, + "grad_norm": 0.28763437271118164, + "learning_rate": 3.308751866289671e-05, + "loss": 1.7822, + "step": 20251 + }, + { + "epoch": 6.216083486801719, + "grad_norm": 0.20754525065422058, + "learning_rate": 3.30828411691956e-05, + "loss": 1.7427, + "step": 20252 + }, + { + "epoch": 6.216390423572744, + "grad_norm": 0.31858858466148376, + "learning_rate": 3.307816384267975e-05, + "loss": 1.7384, + "step": 20253 + }, + { + "epoch": 6.216697360343769, + "grad_norm": 0.21968062222003937, + "learning_rate": 3.307348668339543e-05, + "loss": 1.6896, + "step": 20254 + }, + { + "epoch": 6.217004297114794, + "grad_norm": 0.21643556654453278, + "learning_rate": 3.306880969138882e-05, + "loss": 1.7353, + "step": 20255 + }, + { + "epoch": 6.21731123388582, + "grad_norm": 0.22141097486019135, + "learning_rate": 3.306413286670616e-05, + "loss": 1.7254, + "step": 20256 + }, + { + "epoch": 6.217618170656845, + "grad_norm": 0.17666983604431152, + "learning_rate": 3.305945620939367e-05, + "loss": 1.7198, + "step": 20257 + }, + { + "epoch": 6.21792510742787, + "grad_norm": 0.25182467699050903, + "learning_rate": 3.3054779719497544e-05, + "loss": 1.7562, + "step": 20258 + }, + { + "epoch": 6.218232044198895, + "grad_norm": 0.23481281101703644, + "learning_rate": 3.305010339706404e-05, + "loss": 1.8293, + "step": 20259 + }, + { + "epoch": 6.21853898096992, + "grad_norm": 0.23981143534183502, + "learning_rate": 3.304542724213933e-05, + "loss": 1.7619, + "step": 20260 + }, + { + "epoch": 6.218845917740945, + "grad_norm": 0.2388351708650589, + "learning_rate": 3.3040751254769665e-05, + "loss": 1.7471, + "step": 20261 + }, + { + "epoch": 6.219152854511971, + "grad_norm": 0.2039698362350464, + "learning_rate": 3.3036075435001216e-05, + "loss": 1.6893, + "step": 20262 + }, + { + 
"epoch": 6.219459791282996, + "grad_norm": 0.218357652425766, + "learning_rate": 3.3031399782880224e-05, + "loss": 1.753, + "step": 20263 + }, + { + "epoch": 6.2197667280540205, + "grad_norm": 0.25466734170913696, + "learning_rate": 3.302672429845288e-05, + "loss": 1.7496, + "step": 20264 + }, + { + "epoch": 6.220073664825046, + "grad_norm": 0.1853330284357071, + "learning_rate": 3.302204898176541e-05, + "loss": 1.7779, + "step": 20265 + }, + { + "epoch": 6.220380601596071, + "grad_norm": 0.24044091999530792, + "learning_rate": 3.3017373832863976e-05, + "loss": 1.8226, + "step": 20266 + }, + { + "epoch": 6.2206875383670965, + "grad_norm": 0.2209070324897766, + "learning_rate": 3.3012698851794835e-05, + "loss": 1.7069, + "step": 20267 + }, + { + "epoch": 6.220994475138122, + "grad_norm": 0.2775282561779022, + "learning_rate": 3.3008024038604135e-05, + "loss": 1.7048, + "step": 20268 + }, + { + "epoch": 6.221301411909146, + "grad_norm": 0.22873717546463013, + "learning_rate": 3.3003349393338116e-05, + "loss": 1.7956, + "step": 20269 + }, + { + "epoch": 6.221608348680172, + "grad_norm": 0.27883464097976685, + "learning_rate": 3.2998674916042946e-05, + "loss": 1.6955, + "step": 20270 + }, + { + "epoch": 6.221915285451197, + "grad_norm": 0.2383071482181549, + "learning_rate": 3.2994000606764865e-05, + "loss": 1.7645, + "step": 20271 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.26280200481414795, + "learning_rate": 3.298932646555003e-05, + "loss": 1.7854, + "step": 20272 + }, + { + "epoch": 6.222529158993248, + "grad_norm": 0.2387673407793045, + "learning_rate": 3.2984652492444625e-05, + "loss": 1.679, + "step": 20273 + }, + { + "epoch": 6.222836095764273, + "grad_norm": 0.2136983871459961, + "learning_rate": 3.297997868749486e-05, + "loss": 1.7313, + "step": 20274 + }, + { + "epoch": 6.223143032535297, + "grad_norm": 0.2629627585411072, + "learning_rate": 3.297530505074692e-05, + "loss": 1.7452, + "step": 20275 + }, + { + "epoch": 6.223449969306323, + 
"grad_norm": 0.22018705308437347, + "learning_rate": 3.2970631582247e-05, + "loss": 1.7368, + "step": 20276 + }, + { + "epoch": 6.223756906077348, + "grad_norm": 0.19277356564998627, + "learning_rate": 3.296595828204128e-05, + "loss": 1.7084, + "step": 20277 + }, + { + "epoch": 6.224063842848373, + "grad_norm": 0.18806682527065277, + "learning_rate": 3.2961285150175944e-05, + "loss": 1.6576, + "step": 20278 + }, + { + "epoch": 6.224370779619399, + "grad_norm": 0.2019709348678589, + "learning_rate": 3.295661218669717e-05, + "loss": 1.7594, + "step": 20279 + }, + { + "epoch": 6.224677716390423, + "grad_norm": 0.19662119448184967, + "learning_rate": 3.295193939165114e-05, + "loss": 1.6946, + "step": 20280 + }, + { + "epoch": 6.2249846531614486, + "grad_norm": 0.1880662590265274, + "learning_rate": 3.294726676508404e-05, + "loss": 1.7232, + "step": 20281 + }, + { + "epoch": 6.225291589932474, + "grad_norm": 0.23242273926734924, + "learning_rate": 3.294259430704206e-05, + "loss": 1.7331, + "step": 20282 + }, + { + "epoch": 6.225598526703499, + "grad_norm": 0.19915202260017395, + "learning_rate": 3.293792201757134e-05, + "loss": 1.7844, + "step": 20283 + }, + { + "epoch": 6.225905463474525, + "grad_norm": 0.1845373958349228, + "learning_rate": 3.2933249896718097e-05, + "loss": 1.6803, + "step": 20284 + }, + { + "epoch": 6.226212400245549, + "grad_norm": 0.19340910017490387, + "learning_rate": 3.292857794452846e-05, + "loss": 1.6929, + "step": 20285 + }, + { + "epoch": 6.226519337016574, + "grad_norm": 0.21429216861724854, + "learning_rate": 3.292390616104863e-05, + "loss": 1.6833, + "step": 20286 + }, + { + "epoch": 6.2268262737876, + "grad_norm": 0.2267037034034729, + "learning_rate": 3.291923454632476e-05, + "loss": 1.7271, + "step": 20287 + }, + { + "epoch": 6.227133210558625, + "grad_norm": 0.23121988773345947, + "learning_rate": 3.2914563100403054e-05, + "loss": 1.8443, + "step": 20288 + }, + { + "epoch": 6.22744014732965, + "grad_norm": 0.20980899035930634, + 
"learning_rate": 3.290989182332964e-05, + "loss": 1.6907, + "step": 20289 + }, + { + "epoch": 6.227747084100676, + "grad_norm": 0.28162500262260437, + "learning_rate": 3.290522071515067e-05, + "loss": 1.7497, + "step": 20290 + }, + { + "epoch": 6.2280540208717, + "grad_norm": 0.2163640707731247, + "learning_rate": 3.290054977591234e-05, + "loss": 1.736, + "step": 20291 + }, + { + "epoch": 6.2283609576427255, + "grad_norm": 0.19144479930400848, + "learning_rate": 3.289587900566079e-05, + "loss": 1.7222, + "step": 20292 + }, + { + "epoch": 6.228667894413751, + "grad_norm": 0.24952897429466248, + "learning_rate": 3.2891208404442216e-05, + "loss": 1.7095, + "step": 20293 + }, + { + "epoch": 6.228974831184776, + "grad_norm": 0.19421981275081635, + "learning_rate": 3.288653797230272e-05, + "loss": 1.7231, + "step": 20294 + }, + { + "epoch": 6.2292817679558015, + "grad_norm": 0.22837944328784943, + "learning_rate": 3.288186770928851e-05, + "loss": 1.7404, + "step": 20295 + }, + { + "epoch": 6.229588704726826, + "grad_norm": 0.2292151004076004, + "learning_rate": 3.2877197615445685e-05, + "loss": 1.6999, + "step": 20296 + }, + { + "epoch": 6.229895641497851, + "grad_norm": 0.18376365303993225, + "learning_rate": 3.2872527690820456e-05, + "loss": 1.681, + "step": 20297 + }, + { + "epoch": 6.230202578268877, + "grad_norm": 0.21331918239593506, + "learning_rate": 3.286785793545893e-05, + "loss": 1.7362, + "step": 20298 + }, + { + "epoch": 6.230509515039902, + "grad_norm": 0.21247150003910065, + "learning_rate": 3.286318834940729e-05, + "loss": 1.7816, + "step": 20299 + }, + { + "epoch": 6.230816451810927, + "grad_norm": 0.19166043400764465, + "learning_rate": 3.285851893271165e-05, + "loss": 1.7209, + "step": 20300 + }, + { + "epoch": 6.231123388581952, + "grad_norm": 0.2139919251203537, + "learning_rate": 3.2853849685418195e-05, + "loss": 1.6946, + "step": 20301 + }, + { + "epoch": 6.231430325352977, + "grad_norm": 0.20296575129032135, + "learning_rate": 
3.284918060757303e-05, + "loss": 1.6829, + "step": 20302 + }, + { + "epoch": 6.231737262124002, + "grad_norm": 0.2465996891260147, + "learning_rate": 3.2844511699222314e-05, + "loss": 1.751, + "step": 20303 + }, + { + "epoch": 6.232044198895028, + "grad_norm": 0.23327109217643738, + "learning_rate": 3.283984296041219e-05, + "loss": 1.736, + "step": 20304 + }, + { + "epoch": 6.232351135666053, + "grad_norm": 0.24316997826099396, + "learning_rate": 3.2835174391188806e-05, + "loss": 1.7187, + "step": 20305 + }, + { + "epoch": 6.232658072437078, + "grad_norm": 0.25280308723449707, + "learning_rate": 3.2830505991598294e-05, + "loss": 1.7087, + "step": 20306 + }, + { + "epoch": 6.232965009208103, + "grad_norm": 0.19143202900886536, + "learning_rate": 3.282583776168676e-05, + "loss": 1.674, + "step": 20307 + }, + { + "epoch": 6.233271945979128, + "grad_norm": 0.2667979598045349, + "learning_rate": 3.282116970150038e-05, + "loss": 1.7978, + "step": 20308 + }, + { + "epoch": 6.2335788827501535, + "grad_norm": 0.18397411704063416, + "learning_rate": 3.281650181108526e-05, + "loss": 1.7669, + "step": 20309 + }, + { + "epoch": 6.233885819521179, + "grad_norm": 0.2842588722705841, + "learning_rate": 3.281183409048756e-05, + "loss": 1.8238, + "step": 20310 + }, + { + "epoch": 6.234192756292204, + "grad_norm": 0.20290467143058777, + "learning_rate": 3.280716653975336e-05, + "loss": 1.7317, + "step": 20311 + }, + { + "epoch": 6.234499693063229, + "grad_norm": 0.224524587392807, + "learning_rate": 3.280249915892885e-05, + "loss": 1.8166, + "step": 20312 + }, + { + "epoch": 6.234806629834254, + "grad_norm": 0.28204405307769775, + "learning_rate": 3.2797831948060096e-05, + "loss": 1.7435, + "step": 20313 + }, + { + "epoch": 6.235113566605279, + "grad_norm": 0.2101798951625824, + "learning_rate": 3.2793164907193264e-05, + "loss": 1.6747, + "step": 20314 + }, + { + "epoch": 6.235420503376305, + "grad_norm": 0.1961289346218109, + "learning_rate": 3.278849803637445e-05, + "loss": 1.7131, 
+ "step": 20315 + }, + { + "epoch": 6.23572744014733, + "grad_norm": 0.30541354417800903, + "learning_rate": 3.27838313356498e-05, + "loss": 1.8036, + "step": 20316 + }, + { + "epoch": 6.236034376918354, + "grad_norm": 0.21517200767993927, + "learning_rate": 3.277916480506541e-05, + "loss": 1.7684, + "step": 20317 + }, + { + "epoch": 6.23634131368938, + "grad_norm": 0.22871750593185425, + "learning_rate": 3.2774498444667426e-05, + "loss": 1.7545, + "step": 20318 + }, + { + "epoch": 6.236648250460405, + "grad_norm": 0.24596424400806427, + "learning_rate": 3.276983225450192e-05, + "loss": 1.6705, + "step": 20319 + }, + { + "epoch": 6.23695518723143, + "grad_norm": 0.19123119115829468, + "learning_rate": 3.2765166234615044e-05, + "loss": 1.7402, + "step": 20320 + }, + { + "epoch": 6.237262124002456, + "grad_norm": 0.25287121534347534, + "learning_rate": 3.276050038505288e-05, + "loss": 1.741, + "step": 20321 + }, + { + "epoch": 6.237569060773481, + "grad_norm": 0.19741536676883698, + "learning_rate": 3.275583470586158e-05, + "loss": 1.736, + "step": 20322 + }, + { + "epoch": 6.2378759975445055, + "grad_norm": 0.24529922008514404, + "learning_rate": 3.275116919708723e-05, + "loss": 1.6696, + "step": 20323 + }, + { + "epoch": 6.238182934315531, + "grad_norm": 0.25428420305252075, + "learning_rate": 3.274650385877591e-05, + "loss": 1.696, + "step": 20324 + }, + { + "epoch": 6.238489871086556, + "grad_norm": 0.19502994418144226, + "learning_rate": 3.274183869097377e-05, + "loss": 1.6976, + "step": 20325 + }, + { + "epoch": 6.2387968078575815, + "grad_norm": 0.23710335791110992, + "learning_rate": 3.273717369372688e-05, + "loss": 1.7395, + "step": 20326 + }, + { + "epoch": 6.239103744628607, + "grad_norm": 0.20904341340065002, + "learning_rate": 3.273250886708138e-05, + "loss": 1.7455, + "step": 20327 + }, + { + "epoch": 6.239410681399631, + "grad_norm": 0.2112383097410202, + "learning_rate": 3.272784421108332e-05, + "loss": 1.7401, + "step": 20328 + }, + { + "epoch": 
6.239717618170657, + "grad_norm": 0.2310914695262909, + "learning_rate": 3.272317972577886e-05, + "loss": 1.8049, + "step": 20329 + }, + { + "epoch": 6.240024554941682, + "grad_norm": 0.18222108483314514, + "learning_rate": 3.271851541121404e-05, + "loss": 1.7119, + "step": 20330 + }, + { + "epoch": 6.240331491712707, + "grad_norm": 0.18739092350006104, + "learning_rate": 3.2713851267434984e-05, + "loss": 1.744, + "step": 20331 + }, + { + "epoch": 6.240638428483733, + "grad_norm": 0.17722012102603912, + "learning_rate": 3.2709187294487775e-05, + "loss": 1.7054, + "step": 20332 + }, + { + "epoch": 6.240945365254758, + "grad_norm": 0.18650192022323608, + "learning_rate": 3.270452349241854e-05, + "loss": 1.7272, + "step": 20333 + }, + { + "epoch": 6.241252302025782, + "grad_norm": 0.2004886120557785, + "learning_rate": 3.269985986127331e-05, + "loss": 1.6777, + "step": 20334 + }, + { + "epoch": 6.241559238796808, + "grad_norm": 0.1855446845293045, + "learning_rate": 3.269519640109823e-05, + "loss": 1.6823, + "step": 20335 + }, + { + "epoch": 6.241866175567833, + "grad_norm": 0.1950632780790329, + "learning_rate": 3.269053311193934e-05, + "loss": 1.7052, + "step": 20336 + }, + { + "epoch": 6.242173112338858, + "grad_norm": 0.19386698305606842, + "learning_rate": 3.268586999384276e-05, + "loss": 1.7431, + "step": 20337 + }, + { + "epoch": 6.242480049109884, + "grad_norm": 0.2266446053981781, + "learning_rate": 3.268120704685454e-05, + "loss": 1.735, + "step": 20338 + }, + { + "epoch": 6.242786985880908, + "grad_norm": 0.24133828282356262, + "learning_rate": 3.2676544271020814e-05, + "loss": 1.7707, + "step": 20339 + }, + { + "epoch": 6.2430939226519335, + "grad_norm": 0.22397162020206451, + "learning_rate": 3.267188166638763e-05, + "loss": 1.6943, + "step": 20340 + }, + { + "epoch": 6.243400859422959, + "grad_norm": 0.1614205688238144, + "learning_rate": 3.266721923300104e-05, + "loss": 1.6801, + "step": 20341 + }, + { + "epoch": 6.243707796193984, + "grad_norm": 
0.22376522421836853, + "learning_rate": 3.2662556970907166e-05, + "loss": 1.6933, + "step": 20342 + }, + { + "epoch": 6.2440147329650095, + "grad_norm": 0.18614265322685242, + "learning_rate": 3.265789488015205e-05, + "loss": 1.7396, + "step": 20343 + }, + { + "epoch": 6.244321669736034, + "grad_norm": 0.2385358214378357, + "learning_rate": 3.265323296078181e-05, + "loss": 1.7782, + "step": 20344 + }, + { + "epoch": 6.244628606507059, + "grad_norm": 0.24316444993019104, + "learning_rate": 3.264857121284246e-05, + "loss": 1.7443, + "step": 20345 + }, + { + "epoch": 6.244935543278085, + "grad_norm": 0.184532031416893, + "learning_rate": 3.264390963638012e-05, + "loss": 1.7603, + "step": 20346 + }, + { + "epoch": 6.24524248004911, + "grad_norm": 0.2018461376428604, + "learning_rate": 3.2639248231440825e-05, + "loss": 1.7289, + "step": 20347 + }, + { + "epoch": 6.245549416820135, + "grad_norm": 0.23732338845729828, + "learning_rate": 3.263458699807066e-05, + "loss": 1.7924, + "step": 20348 + }, + { + "epoch": 6.245856353591161, + "grad_norm": 0.19645710289478302, + "learning_rate": 3.2629925936315674e-05, + "loss": 1.6855, + "step": 20349 + }, + { + "epoch": 6.246163290362185, + "grad_norm": 0.20730608701705933, + "learning_rate": 3.262526504622196e-05, + "loss": 1.7238, + "step": 20350 + }, + { + "epoch": 6.24647022713321, + "grad_norm": 0.21139587461948395, + "learning_rate": 3.2620604327835545e-05, + "loss": 1.7173, + "step": 20351 + }, + { + "epoch": 6.246777163904236, + "grad_norm": 0.22644877433776855, + "learning_rate": 3.261594378120252e-05, + "loss": 1.7976, + "step": 20352 + }, + { + "epoch": 6.247084100675261, + "grad_norm": 0.23719535768032074, + "learning_rate": 3.2611283406368906e-05, + "loss": 1.7549, + "step": 20353 + }, + { + "epoch": 6.247391037446286, + "grad_norm": 0.2046387791633606, + "learning_rate": 3.2606623203380807e-05, + "loss": 1.7343, + "step": 20354 + }, + { + "epoch": 6.247697974217311, + "grad_norm": 0.19325366616249084, + 
"learning_rate": 3.260196317228422e-05, + "loss": 1.7352, + "step": 20355 + }, + { + "epoch": 6.248004910988336, + "grad_norm": 0.2315458059310913, + "learning_rate": 3.259730331312526e-05, + "loss": 1.7838, + "step": 20356 + }, + { + "epoch": 6.2483118477593615, + "grad_norm": 0.24549536406993866, + "learning_rate": 3.2592643625949956e-05, + "loss": 1.7418, + "step": 20357 + }, + { + "epoch": 6.248618784530387, + "grad_norm": 0.2702246606349945, + "learning_rate": 3.258798411080432e-05, + "loss": 1.7651, + "step": 20358 + }, + { + "epoch": 6.248925721301412, + "grad_norm": 0.20515258610248566, + "learning_rate": 3.2583324767734444e-05, + "loss": 1.6866, + "step": 20359 + }, + { + "epoch": 6.249232658072437, + "grad_norm": 0.2696690261363983, + "learning_rate": 3.257866559678635e-05, + "loss": 1.7446, + "step": 20360 + }, + { + "epoch": 6.249539594843462, + "grad_norm": 0.19707174599170685, + "learning_rate": 3.2574006598006114e-05, + "loss": 1.6835, + "step": 20361 + }, + { + "epoch": 6.249846531614487, + "grad_norm": 0.23478952050209045, + "learning_rate": 3.256934777143974e-05, + "loss": 1.7344, + "step": 20362 + }, + { + "epoch": 6.250153468385513, + "grad_norm": 0.24214082956314087, + "learning_rate": 3.2564689117133306e-05, + "loss": 1.722, + "step": 20363 + }, + { + "epoch": 6.250460405156538, + "grad_norm": 0.18361221253871918, + "learning_rate": 3.256003063513281e-05, + "loss": 1.7336, + "step": 20364 + }, + { + "epoch": 6.250767341927563, + "grad_norm": 0.18548928201198578, + "learning_rate": 3.255537232548433e-05, + "loss": 1.6586, + "step": 20365 + }, + { + "epoch": 6.251074278698588, + "grad_norm": 0.2121812105178833, + "learning_rate": 3.2550714188233874e-05, + "loss": 1.7273, + "step": 20366 + }, + { + "epoch": 6.251381215469613, + "grad_norm": 0.2351878583431244, + "learning_rate": 3.25460562234275e-05, + "loss": 1.7101, + "step": 20367 + }, + { + "epoch": 6.2516881522406385, + "grad_norm": 0.20723144710063934, + "learning_rate": 
3.2541398431111216e-05, + "loss": 1.7042, + "step": 20368 + }, + { + "epoch": 6.251995089011664, + "grad_norm": 0.19093643128871918, + "learning_rate": 3.2536740811331084e-05, + "loss": 1.7585, + "step": 20369 + }, + { + "epoch": 6.252302025782689, + "grad_norm": 0.27191361784935, + "learning_rate": 3.2532083364133094e-05, + "loss": 1.7734, + "step": 20370 + }, + { + "epoch": 6.252608962553714, + "grad_norm": 0.21019349992275238, + "learning_rate": 3.2527426089563306e-05, + "loss": 1.7015, + "step": 20371 + }, + { + "epoch": 6.252915899324739, + "grad_norm": 0.2300454080104828, + "learning_rate": 3.2522768987667744e-05, + "loss": 1.7311, + "step": 20372 + }, + { + "epoch": 6.253222836095764, + "grad_norm": 0.24723999202251434, + "learning_rate": 3.25181120584924e-05, + "loss": 1.674, + "step": 20373 + }, + { + "epoch": 6.25352977286679, + "grad_norm": 0.20302192866802216, + "learning_rate": 3.251345530208335e-05, + "loss": 1.6999, + "step": 20374 + }, + { + "epoch": 6.253836709637815, + "grad_norm": 0.25393861532211304, + "learning_rate": 3.250879871848655e-05, + "loss": 1.6761, + "step": 20375 + }, + { + "epoch": 6.25414364640884, + "grad_norm": 0.1879536211490631, + "learning_rate": 3.2504142307748064e-05, + "loss": 1.7233, + "step": 20376 + }, + { + "epoch": 6.254450583179865, + "grad_norm": 0.22197771072387695, + "learning_rate": 3.24994860699139e-05, + "loss": 1.6994, + "step": 20377 + }, + { + "epoch": 6.25475751995089, + "grad_norm": 0.24946242570877075, + "learning_rate": 3.249483000503008e-05, + "loss": 1.8488, + "step": 20378 + }, + { + "epoch": 6.255064456721915, + "grad_norm": 0.25218987464904785, + "learning_rate": 3.2490174113142594e-05, + "loss": 1.7947, + "step": 20379 + }, + { + "epoch": 6.255371393492941, + "grad_norm": 0.23970970511436462, + "learning_rate": 3.248551839429749e-05, + "loss": 1.785, + "step": 20380 + }, + { + "epoch": 6.255678330263966, + "grad_norm": 0.243649423122406, + "learning_rate": 3.248086284854074e-05, + "loss": 1.8089, + 
"step": 20381 + }, + { + "epoch": 6.2559852670349905, + "grad_norm": 0.18813125789165497, + "learning_rate": 3.247620747591838e-05, + "loss": 1.6892, + "step": 20382 + }, + { + "epoch": 6.256292203806016, + "grad_norm": 0.2495514154434204, + "learning_rate": 3.2471552276476404e-05, + "loss": 1.7573, + "step": 20383 + }, + { + "epoch": 6.256599140577041, + "grad_norm": 0.200107604265213, + "learning_rate": 3.2466897250260835e-05, + "loss": 1.7292, + "step": 20384 + }, + { + "epoch": 6.2569060773480665, + "grad_norm": 0.25782206654548645, + "learning_rate": 3.246224239731765e-05, + "loss": 1.8533, + "step": 20385 + }, + { + "epoch": 6.257213014119092, + "grad_norm": 0.1966158151626587, + "learning_rate": 3.245758771769288e-05, + "loss": 1.648, + "step": 20386 + }, + { + "epoch": 6.257519950890116, + "grad_norm": 0.23248116672039032, + "learning_rate": 3.245293321143249e-05, + "loss": 1.7277, + "step": 20387 + }, + { + "epoch": 6.257826887661142, + "grad_norm": 0.26347780227661133, + "learning_rate": 3.244827887858251e-05, + "loss": 1.7429, + "step": 20388 + }, + { + "epoch": 6.258133824432167, + "grad_norm": 0.20794285833835602, + "learning_rate": 3.244362471918894e-05, + "loss": 1.7358, + "step": 20389 + }, + { + "epoch": 6.258440761203192, + "grad_norm": 0.200898677110672, + "learning_rate": 3.243897073329774e-05, + "loss": 1.6661, + "step": 20390 + }, + { + "epoch": 6.258747697974218, + "grad_norm": 0.20945283770561218, + "learning_rate": 3.2434316920954935e-05, + "loss": 1.7036, + "step": 20391 + }, + { + "epoch": 6.259054634745242, + "grad_norm": 0.3154161274433136, + "learning_rate": 3.242966328220649e-05, + "loss": 1.8174, + "step": 20392 + }, + { + "epoch": 6.259361571516267, + "grad_norm": 0.19321799278259277, + "learning_rate": 3.242500981709843e-05, + "loss": 1.6823, + "step": 20393 + }, + { + "epoch": 6.259668508287293, + "grad_norm": 0.22610130906105042, + "learning_rate": 3.2420356525676696e-05, + "loss": 1.6865, + "step": 20394 + }, + { + "epoch": 
6.259975445058318, + "grad_norm": 0.19190505146980286, + "learning_rate": 3.241570340798734e-05, + "loss": 1.6663, + "step": 20395 + }, + { + "epoch": 6.260282381829343, + "grad_norm": 0.21956418454647064, + "learning_rate": 3.2411050464076276e-05, + "loss": 1.7279, + "step": 20396 + }, + { + "epoch": 6.260589318600369, + "grad_norm": 0.2448553591966629, + "learning_rate": 3.240639769398956e-05, + "loss": 1.7438, + "step": 20397 + }, + { + "epoch": 6.260896255371393, + "grad_norm": 0.19194214046001434, + "learning_rate": 3.2401745097773096e-05, + "loss": 1.7429, + "step": 20398 + }, + { + "epoch": 6.2612031921424185, + "grad_norm": 0.2567521333694458, + "learning_rate": 3.239709267547291e-05, + "loss": 1.7051, + "step": 20399 + }, + { + "epoch": 6.261510128913444, + "grad_norm": 0.18335886299610138, + "learning_rate": 3.239244042713498e-05, + "loss": 1.6828, + "step": 20400 + }, + { + "epoch": 6.261817065684469, + "grad_norm": 0.20112362504005432, + "learning_rate": 3.238778835280527e-05, + "loss": 1.6887, + "step": 20401 + }, + { + "epoch": 6.2621240024554945, + "grad_norm": 0.17095179855823517, + "learning_rate": 3.238313645252975e-05, + "loss": 1.7202, + "step": 20402 + }, + { + "epoch": 6.262430939226519, + "grad_norm": 0.24681979417800903, + "learning_rate": 3.237848472635442e-05, + "loss": 1.7196, + "step": 20403 + }, + { + "epoch": 6.262737875997544, + "grad_norm": 0.2022300660610199, + "learning_rate": 3.237383317432522e-05, + "loss": 1.7265, + "step": 20404 + }, + { + "epoch": 6.26304481276857, + "grad_norm": 0.2900621294975281, + "learning_rate": 3.236918179648813e-05, + "loss": 1.7051, + "step": 20405 + }, + { + "epoch": 6.263351749539595, + "grad_norm": 0.37675586342811584, + "learning_rate": 3.2364530592889135e-05, + "loss": 1.7747, + "step": 20406 + }, + { + "epoch": 6.26365868631062, + "grad_norm": 0.19033703207969666, + "learning_rate": 3.235987956357416e-05, + "loss": 1.7529, + "step": 20407 + }, + { + "epoch": 6.263965623081646, + "grad_norm": 
0.2877013385295868, + "learning_rate": 3.235522870858922e-05, + "loss": 1.6942, + "step": 20408 + }, + { + "epoch": 6.26427255985267, + "grad_norm": 0.22717125713825226, + "learning_rate": 3.235057802798023e-05, + "loss": 1.7302, + "step": 20409 + }, + { + "epoch": 6.264579496623695, + "grad_norm": 0.2571920156478882, + "learning_rate": 3.2345927521793185e-05, + "loss": 1.6782, + "step": 20410 + }, + { + "epoch": 6.264886433394721, + "grad_norm": 0.43085625767707825, + "learning_rate": 3.234127719007403e-05, + "loss": 1.7946, + "step": 20411 + }, + { + "epoch": 6.265193370165746, + "grad_norm": 0.19355928897857666, + "learning_rate": 3.2336627032868726e-05, + "loss": 1.7288, + "step": 20412 + }, + { + "epoch": 6.265500306936771, + "grad_norm": 0.24871474504470825, + "learning_rate": 3.233197705022322e-05, + "loss": 1.6862, + "step": 20413 + }, + { + "epoch": 6.265807243707796, + "grad_norm": 0.26919320225715637, + "learning_rate": 3.232732724218348e-05, + "loss": 1.8061, + "step": 20414 + }, + { + "epoch": 6.266114180478821, + "grad_norm": 0.21714363992214203, + "learning_rate": 3.2322677608795436e-05, + "loss": 1.7036, + "step": 20415 + }, + { + "epoch": 6.2664211172498465, + "grad_norm": 0.24496719241142273, + "learning_rate": 3.231802815010506e-05, + "loss": 1.7334, + "step": 20416 + }, + { + "epoch": 6.266728054020872, + "grad_norm": 0.22501519322395325, + "learning_rate": 3.231337886615831e-05, + "loss": 1.7545, + "step": 20417 + }, + { + "epoch": 6.267034990791897, + "grad_norm": 0.2683655917644501, + "learning_rate": 3.23087297570011e-05, + "loss": 1.7235, + "step": 20418 + }, + { + "epoch": 6.267341927562922, + "grad_norm": 0.23341359198093414, + "learning_rate": 3.230408082267938e-05, + "loss": 1.7389, + "step": 20419 + }, + { + "epoch": 6.267648864333947, + "grad_norm": 0.2914128601551056, + "learning_rate": 3.229943206323913e-05, + "loss": 1.7223, + "step": 20420 + }, + { + "epoch": 6.267955801104972, + "grad_norm": 0.2072528451681137, + "learning_rate": 
3.229478347872625e-05, + "loss": 1.7422, + "step": 20421 + }, + { + "epoch": 6.268262737875998, + "grad_norm": 0.22678662836551666, + "learning_rate": 3.229013506918671e-05, + "loss": 1.6973, + "step": 20422 + }, + { + "epoch": 6.268569674647023, + "grad_norm": 0.1928883194923401, + "learning_rate": 3.228548683466643e-05, + "loss": 1.7235, + "step": 20423 + }, + { + "epoch": 6.268876611418047, + "grad_norm": 0.2402963638305664, + "learning_rate": 3.2280838775211345e-05, + "loss": 1.7587, + "step": 20424 + }, + { + "epoch": 6.269183548189073, + "grad_norm": 0.20416294038295746, + "learning_rate": 3.227619089086742e-05, + "loss": 1.7591, + "step": 20425 + }, + { + "epoch": 6.269490484960098, + "grad_norm": 0.20308947563171387, + "learning_rate": 3.227154318168053e-05, + "loss": 1.7264, + "step": 20426 + }, + { + "epoch": 6.269797421731123, + "grad_norm": 0.18733863532543182, + "learning_rate": 3.226689564769667e-05, + "loss": 1.6943, + "step": 20427 + }, + { + "epoch": 6.270104358502149, + "grad_norm": 0.183793842792511, + "learning_rate": 3.226224828896173e-05, + "loss": 1.7082, + "step": 20428 + }, + { + "epoch": 6.270411295273174, + "grad_norm": 0.20471547544002533, + "learning_rate": 3.225760110552165e-05, + "loss": 1.7352, + "step": 20429 + }, + { + "epoch": 6.2707182320441985, + "grad_norm": 0.23386713862419128, + "learning_rate": 3.225295409742234e-05, + "loss": 1.7666, + "step": 20430 + }, + { + "epoch": 6.271025168815224, + "grad_norm": 0.2024994194507599, + "learning_rate": 3.224830726470976e-05, + "loss": 1.6573, + "step": 20431 + }, + { + "epoch": 6.271332105586249, + "grad_norm": 0.2352776825428009, + "learning_rate": 3.2243660607429805e-05, + "loss": 1.7884, + "step": 20432 + }, + { + "epoch": 6.2716390423572745, + "grad_norm": 0.19755585491657257, + "learning_rate": 3.223901412562841e-05, + "loss": 1.6964, + "step": 20433 + }, + { + "epoch": 6.2719459791283, + "grad_norm": 0.25833839178085327, + "learning_rate": 3.223436781935148e-05, + "loss": 1.715, 
+ "step": 20434 + }, + { + "epoch": 6.272252915899324, + "grad_norm": 0.2110220193862915, + "learning_rate": 3.222972168864493e-05, + "loss": 1.7617, + "step": 20435 + }, + { + "epoch": 6.27255985267035, + "grad_norm": 0.23262515664100647, + "learning_rate": 3.2225075733554685e-05, + "loss": 1.7616, + "step": 20436 + }, + { + "epoch": 6.272866789441375, + "grad_norm": 0.1926576942205429, + "learning_rate": 3.222042995412669e-05, + "loss": 1.6956, + "step": 20437 + }, + { + "epoch": 6.2731737262124, + "grad_norm": 0.20662757754325867, + "learning_rate": 3.22157843504068e-05, + "loss": 1.703, + "step": 20438 + }, + { + "epoch": 6.273480662983426, + "grad_norm": 0.22137406468391418, + "learning_rate": 3.2211138922440975e-05, + "loss": 1.6961, + "step": 20439 + }, + { + "epoch": 6.273787599754451, + "grad_norm": 0.25777003169059753, + "learning_rate": 3.2206493670275086e-05, + "loss": 1.704, + "step": 20440 + }, + { + "epoch": 6.274094536525475, + "grad_norm": 0.20540094375610352, + "learning_rate": 3.2201848593955046e-05, + "loss": 1.6759, + "step": 20441 + }, + { + "epoch": 6.274401473296501, + "grad_norm": 0.2447255402803421, + "learning_rate": 3.21972036935268e-05, + "loss": 1.7379, + "step": 20442 + }, + { + "epoch": 6.274708410067526, + "grad_norm": 0.2017194777727127, + "learning_rate": 3.219255896903619e-05, + "loss": 1.6518, + "step": 20443 + }, + { + "epoch": 6.2750153468385514, + "grad_norm": 0.22742003202438354, + "learning_rate": 3.2187914420529174e-05, + "loss": 1.7568, + "step": 20444 + }, + { + "epoch": 6.275322283609577, + "grad_norm": 0.2065356969833374, + "learning_rate": 3.218327004805161e-05, + "loss": 1.643, + "step": 20445 + }, + { + "epoch": 6.275629220380601, + "grad_norm": 0.18083053827285767, + "learning_rate": 3.217862585164942e-05, + "loss": 1.77, + "step": 20446 + }, + { + "epoch": 6.275936157151627, + "grad_norm": 0.2175968736410141, + "learning_rate": 3.2173981831368484e-05, + "loss": 1.738, + "step": 20447 + }, + { + "epoch": 
6.276243093922652, + "grad_norm": 0.17635080218315125, + "learning_rate": 3.216933798725473e-05, + "loss": 1.7109, + "step": 20448 + }, + { + "epoch": 6.276550030693677, + "grad_norm": 0.22289423644542694, + "learning_rate": 3.216469431935401e-05, + "loss": 1.7853, + "step": 20449 + }, + { + "epoch": 6.276856967464703, + "grad_norm": 0.21214549243450165, + "learning_rate": 3.216005082771225e-05, + "loss": 1.8196, + "step": 20450 + }, + { + "epoch": 6.277163904235728, + "grad_norm": 0.21992212533950806, + "learning_rate": 3.215540751237531e-05, + "loss": 1.7445, + "step": 20451 + }, + { + "epoch": 6.277470841006752, + "grad_norm": 0.16256563365459442, + "learning_rate": 3.2150764373389096e-05, + "loss": 1.6582, + "step": 20452 + }, + { + "epoch": 6.277777777777778, + "grad_norm": 0.1885976791381836, + "learning_rate": 3.214612141079949e-05, + "loss": 1.7491, + "step": 20453 + }, + { + "epoch": 6.278084714548803, + "grad_norm": 0.24101774394512177, + "learning_rate": 3.2141478624652386e-05, + "loss": 1.7476, + "step": 20454 + }, + { + "epoch": 6.278391651319828, + "grad_norm": 0.23378998041152954, + "learning_rate": 3.213683601499364e-05, + "loss": 1.7575, + "step": 20455 + }, + { + "epoch": 6.278698588090854, + "grad_norm": 0.2032867670059204, + "learning_rate": 3.213219358186917e-05, + "loss": 1.6999, + "step": 20456 + }, + { + "epoch": 6.279005524861878, + "grad_norm": 0.21332181990146637, + "learning_rate": 3.2127551325324836e-05, + "loss": 1.6634, + "step": 20457 + }, + { + "epoch": 6.2793124616329035, + "grad_norm": 0.23767098784446716, + "learning_rate": 3.2122909245406494e-05, + "loss": 1.8023, + "step": 20458 + }, + { + "epoch": 6.279619398403929, + "grad_norm": 0.19987638294696808, + "learning_rate": 3.211826734216007e-05, + "loss": 1.6848, + "step": 20459 + }, + { + "epoch": 6.279926335174954, + "grad_norm": 0.22169579565525055, + "learning_rate": 3.2113625615631385e-05, + "loss": 1.7599, + "step": 20460 + }, + { + "epoch": 6.2802332719459795, + 
"grad_norm": 0.1768191009759903, + "learning_rate": 3.210898406586634e-05, + "loss": 1.6894, + "step": 20461 + }, + { + "epoch": 6.280540208717004, + "grad_norm": 0.1923041045665741, + "learning_rate": 3.21043426929108e-05, + "loss": 1.7379, + "step": 20462 + }, + { + "epoch": 6.280847145488029, + "grad_norm": 0.1836252212524414, + "learning_rate": 3.2099701496810644e-05, + "loss": 1.6748, + "step": 20463 + }, + { + "epoch": 6.281154082259055, + "grad_norm": 0.2203192561864853, + "learning_rate": 3.2095060477611705e-05, + "loss": 1.6969, + "step": 20464 + }, + { + "epoch": 6.28146101903008, + "grad_norm": 0.25511759519577026, + "learning_rate": 3.20904196353599e-05, + "loss": 1.7806, + "step": 20465 + }, + { + "epoch": 6.281767955801105, + "grad_norm": 0.19464822113513947, + "learning_rate": 3.208577897010106e-05, + "loss": 1.6784, + "step": 20466 + }, + { + "epoch": 6.28207489257213, + "grad_norm": 0.1949714869260788, + "learning_rate": 3.208113848188105e-05, + "loss": 1.713, + "step": 20467 + }, + { + "epoch": 6.282381829343155, + "grad_norm": 0.22094127535820007, + "learning_rate": 3.207649817074572e-05, + "loss": 1.7397, + "step": 20468 + }, + { + "epoch": 6.28268876611418, + "grad_norm": 0.22343899309635162, + "learning_rate": 3.2071858036740954e-05, + "loss": 1.717, + "step": 20469 + }, + { + "epoch": 6.282995702885206, + "grad_norm": 0.20854893326759338, + "learning_rate": 3.2067218079912584e-05, + "loss": 1.7255, + "step": 20470 + }, + { + "epoch": 6.283302639656231, + "grad_norm": 0.21306286752223969, + "learning_rate": 3.206257830030649e-05, + "loss": 1.7251, + "step": 20471 + }, + { + "epoch": 6.283609576427256, + "grad_norm": 0.24995777010917664, + "learning_rate": 3.20579386979685e-05, + "loss": 1.7892, + "step": 20472 + }, + { + "epoch": 6.283916513198281, + "grad_norm": 0.23720023036003113, + "learning_rate": 3.2053299272944486e-05, + "loss": 1.7843, + "step": 20473 + }, + { + "epoch": 6.284223449969306, + "grad_norm": 0.2042113095521927, + 
"learning_rate": 3.204866002528029e-05, + "loss": 1.7318, + "step": 20474 + }, + { + "epoch": 6.2845303867403315, + "grad_norm": 0.22996367514133453, + "learning_rate": 3.2044020955021735e-05, + "loss": 1.6875, + "step": 20475 + }, + { + "epoch": 6.284837323511357, + "grad_norm": 0.187749981880188, + "learning_rate": 3.203938206221471e-05, + "loss": 1.7297, + "step": 20476 + }, + { + "epoch": 6.285144260282382, + "grad_norm": 0.18279509246349335, + "learning_rate": 3.2034743346905025e-05, + "loss": 1.6858, + "step": 20477 + }, + { + "epoch": 6.285451197053407, + "grad_norm": 0.1871512532234192, + "learning_rate": 3.203010480913855e-05, + "loss": 1.7224, + "step": 20478 + }, + { + "epoch": 6.285758133824432, + "grad_norm": 0.17732922732830048, + "learning_rate": 3.202546644896109e-05, + "loss": 1.6872, + "step": 20479 + }, + { + "epoch": 6.286065070595457, + "grad_norm": 0.21146097779273987, + "learning_rate": 3.2020828266418527e-05, + "loss": 1.797, + "step": 20480 + }, + { + "epoch": 6.286372007366483, + "grad_norm": 0.18914340436458588, + "learning_rate": 3.201619026155666e-05, + "loss": 1.7149, + "step": 20481 + }, + { + "epoch": 6.286678944137508, + "grad_norm": 0.20919133722782135, + "learning_rate": 3.2011552434421364e-05, + "loss": 1.7803, + "step": 20482 + }, + { + "epoch": 6.286985880908533, + "grad_norm": 0.17882505059242249, + "learning_rate": 3.200691478505843e-05, + "loss": 1.757, + "step": 20483 + }, + { + "epoch": 6.287292817679558, + "grad_norm": 0.1850014477968216, + "learning_rate": 3.200227731351373e-05, + "loss": 1.7006, + "step": 20484 + }, + { + "epoch": 6.287599754450583, + "grad_norm": 0.19999323785305023, + "learning_rate": 3.1997640019833056e-05, + "loss": 1.702, + "step": 20485 + }, + { + "epoch": 6.287906691221608, + "grad_norm": 0.20464713871479034, + "learning_rate": 3.1993002904062255e-05, + "loss": 1.7272, + "step": 20486 + }, + { + "epoch": 6.288213627992634, + "grad_norm": 0.2105564922094345, + "learning_rate": 
3.1988365966247154e-05, + "loss": 1.7062, + "step": 20487 + }, + { + "epoch": 6.288520564763659, + "grad_norm": 0.26322871446609497, + "learning_rate": 3.198372920643359e-05, + "loss": 1.7309, + "step": 20488 + }, + { + "epoch": 6.2888275015346835, + "grad_norm": 0.22787201404571533, + "learning_rate": 3.197909262466736e-05, + "loss": 1.7797, + "step": 20489 + }, + { + "epoch": 6.289134438305709, + "grad_norm": 0.21409621834754944, + "learning_rate": 3.1974456220994314e-05, + "loss": 1.8211, + "step": 20490 + }, + { + "epoch": 6.289441375076734, + "grad_norm": 0.2241450846195221, + "learning_rate": 3.196981999546025e-05, + "loss": 1.7255, + "step": 20491 + }, + { + "epoch": 6.2897483118477595, + "grad_norm": 0.23141883313655853, + "learning_rate": 3.1965183948110985e-05, + "loss": 1.7695, + "step": 20492 + }, + { + "epoch": 6.290055248618785, + "grad_norm": 0.209358349442482, + "learning_rate": 3.196054807899236e-05, + "loss": 1.6808, + "step": 20493 + }, + { + "epoch": 6.290362185389809, + "grad_norm": 0.20730538666248322, + "learning_rate": 3.195591238815015e-05, + "loss": 1.6847, + "step": 20494 + }, + { + "epoch": 6.290669122160835, + "grad_norm": 0.2568998634815216, + "learning_rate": 3.195127687563021e-05, + "loss": 1.664, + "step": 20495 + }, + { + "epoch": 6.29097605893186, + "grad_norm": 0.238932803273201, + "learning_rate": 3.1946641541478316e-05, + "loss": 1.7166, + "step": 20496 + }, + { + "epoch": 6.291282995702885, + "grad_norm": 0.235393688082695, + "learning_rate": 3.19420063857403e-05, + "loss": 1.6572, + "step": 20497 + }, + { + "epoch": 6.291589932473911, + "grad_norm": 0.2888807952404022, + "learning_rate": 3.1937371408461944e-05, + "loss": 1.7484, + "step": 20498 + }, + { + "epoch": 6.291896869244935, + "grad_norm": 0.18588709831237793, + "learning_rate": 3.1932736609689096e-05, + "loss": 1.7027, + "step": 20499 + }, + { + "epoch": 6.29220380601596, + "grad_norm": 0.3065604865550995, + "learning_rate": 3.1928101989467514e-05, + "loss": 1.8051, 
+ "step": 20500 + }, + { + "epoch": 6.292510742786986, + "grad_norm": 0.2480497658252716, + "learning_rate": 3.192346754784304e-05, + "loss": 1.7749, + "step": 20501 + }, + { + "epoch": 6.292817679558011, + "grad_norm": 0.268686443567276, + "learning_rate": 3.1918833284861436e-05, + "loss": 1.7062, + "step": 20502 + }, + { + "epoch": 6.293124616329036, + "grad_norm": 0.337510883808136, + "learning_rate": 3.191419920056853e-05, + "loss": 1.745, + "step": 20503 + }, + { + "epoch": 6.293431553100062, + "grad_norm": 0.18532821536064148, + "learning_rate": 3.190956529501009e-05, + "loss": 1.7098, + "step": 20504 + }, + { + "epoch": 6.293738489871086, + "grad_norm": 0.27805468440055847, + "learning_rate": 3.1904931568231956e-05, + "loss": 1.7252, + "step": 20505 + }, + { + "epoch": 6.2940454266421115, + "grad_norm": 0.22137443721294403, + "learning_rate": 3.190029802027987e-05, + "loss": 1.7595, + "step": 20506 + }, + { + "epoch": 6.294352363413137, + "grad_norm": 0.23159445822238922, + "learning_rate": 3.189566465119968e-05, + "loss": 1.7503, + "step": 20507 + }, + { + "epoch": 6.294659300184162, + "grad_norm": 0.2089100182056427, + "learning_rate": 3.189103146103712e-05, + "loss": 1.7021, + "step": 20508 + }, + { + "epoch": 6.2949662369551875, + "grad_norm": 0.1985119879245758, + "learning_rate": 3.1886398449838e-05, + "loss": 1.7468, + "step": 20509 + }, + { + "epoch": 6.295273173726212, + "grad_norm": 0.18612028658390045, + "learning_rate": 3.188176561764812e-05, + "loss": 1.6657, + "step": 20510 + }, + { + "epoch": 6.295580110497237, + "grad_norm": 0.22453728318214417, + "learning_rate": 3.1877132964513226e-05, + "loss": 1.7223, + "step": 20511 + }, + { + "epoch": 6.295887047268263, + "grad_norm": 0.270304799079895, + "learning_rate": 3.187250049047916e-05, + "loss": 1.7548, + "step": 20512 + }, + { + "epoch": 6.296193984039288, + "grad_norm": 0.19762152433395386, + "learning_rate": 3.1867868195591643e-05, + "loss": 1.6945, + "step": 20513 + }, + { + "epoch": 
6.296500920810313, + "grad_norm": 0.25173795223236084, + "learning_rate": 3.1863236079896486e-05, + "loss": 1.7303, + "step": 20514 + }, + { + "epoch": 6.296807857581339, + "grad_norm": 0.2073308676481247, + "learning_rate": 3.185860414343945e-05, + "loss": 1.7327, + "step": 20515 + }, + { + "epoch": 6.297114794352363, + "grad_norm": 0.24174070358276367, + "learning_rate": 3.185397238626635e-05, + "loss": 1.7577, + "step": 20516 + }, + { + "epoch": 6.297421731123388, + "grad_norm": 0.1950366348028183, + "learning_rate": 3.1849340808422905e-05, + "loss": 1.7137, + "step": 20517 + }, + { + "epoch": 6.297728667894414, + "grad_norm": 0.23416653275489807, + "learning_rate": 3.1844709409954936e-05, + "loss": 1.7547, + "step": 20518 + }, + { + "epoch": 6.298035604665439, + "grad_norm": 0.1939592808485031, + "learning_rate": 3.184007819090817e-05, + "loss": 1.7215, + "step": 20519 + }, + { + "epoch": 6.298342541436464, + "grad_norm": 0.21807245910167694, + "learning_rate": 3.1835447151328405e-05, + "loss": 1.7021, + "step": 20520 + }, + { + "epoch": 6.298649478207489, + "grad_norm": 0.21653762459754944, + "learning_rate": 3.183081629126138e-05, + "loss": 1.7426, + "step": 20521 + }, + { + "epoch": 6.298956414978514, + "grad_norm": 0.20749153196811676, + "learning_rate": 3.18261856107529e-05, + "loss": 1.7302, + "step": 20522 + }, + { + "epoch": 6.2992633517495396, + "grad_norm": 0.23450545966625214, + "learning_rate": 3.182155510984869e-05, + "loss": 1.7414, + "step": 20523 + }, + { + "epoch": 6.299570288520565, + "grad_norm": 0.17081578075885773, + "learning_rate": 3.181692478859455e-05, + "loss": 1.7017, + "step": 20524 + }, + { + "epoch": 6.29987722529159, + "grad_norm": 0.20244698226451874, + "learning_rate": 3.18122946470362e-05, + "loss": 1.6765, + "step": 20525 + }, + { + "epoch": 6.300184162062616, + "grad_norm": 0.20153406262397766, + "learning_rate": 3.180766468521941e-05, + "loss": 1.7437, + "step": 20526 + }, + { + "epoch": 6.30049109883364, + "grad_norm": 
0.21135647594928741, + "learning_rate": 3.180303490318996e-05, + "loss": 1.7202, + "step": 20527 + }, + { + "epoch": 6.300798035604665, + "grad_norm": 0.20342735946178436, + "learning_rate": 3.1798405300993555e-05, + "loss": 1.7268, + "step": 20528 + }, + { + "epoch": 6.301104972375691, + "grad_norm": 0.21153734624385834, + "learning_rate": 3.1793775878676e-05, + "loss": 1.7455, + "step": 20529 + }, + { + "epoch": 6.301411909146716, + "grad_norm": 0.2197744995355606, + "learning_rate": 3.1789146636283015e-05, + "loss": 1.7876, + "step": 20530 + }, + { + "epoch": 6.301718845917741, + "grad_norm": 0.2236124575138092, + "learning_rate": 3.1784517573860356e-05, + "loss": 1.7454, + "step": 20531 + }, + { + "epoch": 6.302025782688766, + "grad_norm": 0.22071333229541779, + "learning_rate": 3.177988869145376e-05, + "loss": 1.7197, + "step": 20532 + }, + { + "epoch": 6.302332719459791, + "grad_norm": 0.20137591660022736, + "learning_rate": 3.177525998910901e-05, + "loss": 1.7153, + "step": 20533 + }, + { + "epoch": 6.3026396562308165, + "grad_norm": 0.18981720507144928, + "learning_rate": 3.17706314668718e-05, + "loss": 1.6948, + "step": 20534 + }, + { + "epoch": 6.302946593001842, + "grad_norm": 0.20803335309028625, + "learning_rate": 3.176600312478791e-05, + "loss": 1.7454, + "step": 20535 + }, + { + "epoch": 6.303253529772867, + "grad_norm": 0.2224191278219223, + "learning_rate": 3.176137496290305e-05, + "loss": 1.708, + "step": 20536 + }, + { + "epoch": 6.303560466543892, + "grad_norm": 0.21110501885414124, + "learning_rate": 3.175674698126298e-05, + "loss": 1.6976, + "step": 20537 + }, + { + "epoch": 6.303867403314917, + "grad_norm": 0.19902437925338745, + "learning_rate": 3.175211917991342e-05, + "loss": 1.7246, + "step": 20538 + }, + { + "epoch": 6.304174340085942, + "grad_norm": 0.1930927336215973, + "learning_rate": 3.174749155890013e-05, + "loss": 1.7849, + "step": 20539 + }, + { + "epoch": 6.304481276856968, + "grad_norm": 0.19350691139698029, + "learning_rate": 
3.174286411826881e-05, + "loss": 1.7441, + "step": 20540 + }, + { + "epoch": 6.304788213627993, + "grad_norm": 0.18532924354076385, + "learning_rate": 3.173823685806523e-05, + "loss": 1.6675, + "step": 20541 + }, + { + "epoch": 6.305095150399017, + "grad_norm": 0.18890263140201569, + "learning_rate": 3.173360977833508e-05, + "loss": 1.7889, + "step": 20542 + }, + { + "epoch": 6.305402087170043, + "grad_norm": 0.20418904721736908, + "learning_rate": 3.17289828791241e-05, + "loss": 1.8298, + "step": 20543 + }, + { + "epoch": 6.305709023941068, + "grad_norm": 0.2298857718706131, + "learning_rate": 3.172435616047804e-05, + "loss": 1.7889, + "step": 20544 + }, + { + "epoch": 6.306015960712093, + "grad_norm": 0.20661889016628265, + "learning_rate": 3.171972962244258e-05, + "loss": 1.74, + "step": 20545 + }, + { + "epoch": 6.306322897483119, + "grad_norm": 0.17712774872779846, + "learning_rate": 3.1715103265063496e-05, + "loss": 1.72, + "step": 20546 + }, + { + "epoch": 6.306629834254144, + "grad_norm": 0.16776354610919952, + "learning_rate": 3.1710477088386456e-05, + "loss": 1.6715, + "step": 20547 + }, + { + "epoch": 6.3069367710251685, + "grad_norm": 0.21919682621955872, + "learning_rate": 3.170585109245721e-05, + "loss": 1.7232, + "step": 20548 + }, + { + "epoch": 6.307243707796194, + "grad_norm": 0.2026829719543457, + "learning_rate": 3.170122527732144e-05, + "loss": 1.7551, + "step": 20549 + }, + { + "epoch": 6.307550644567219, + "grad_norm": 0.18783780932426453, + "learning_rate": 3.169659964302493e-05, + "loss": 1.7024, + "step": 20550 + }, + { + "epoch": 6.3078575813382445, + "grad_norm": 0.2058420479297638, + "learning_rate": 3.1691974189613316e-05, + "loss": 1.7006, + "step": 20551 + }, + { + "epoch": 6.30816451810927, + "grad_norm": 0.21351832151412964, + "learning_rate": 3.168734891713237e-05, + "loss": 1.7586, + "step": 20552 + }, + { + "epoch": 6.308471454880294, + "grad_norm": 0.19816654920578003, + "learning_rate": 3.168272382562776e-05, + "loss": 1.7532, 
+ "step": 20553 + }, + { + "epoch": 6.30877839165132, + "grad_norm": 0.18253186345100403, + "learning_rate": 3.16780989151452e-05, + "loss": 1.7413, + "step": 20554 + }, + { + "epoch": 6.309085328422345, + "grad_norm": 0.23097483813762665, + "learning_rate": 3.167347418573042e-05, + "loss": 1.7355, + "step": 20555 + }, + { + "epoch": 6.30939226519337, + "grad_norm": 0.1984725296497345, + "learning_rate": 3.166884963742911e-05, + "loss": 1.6754, + "step": 20556 + }, + { + "epoch": 6.309699201964396, + "grad_norm": 0.2385166734457016, + "learning_rate": 3.166422527028696e-05, + "loss": 1.7322, + "step": 20557 + }, + { + "epoch": 6.310006138735421, + "grad_norm": 0.23216524720191956, + "learning_rate": 3.165960108434971e-05, + "loss": 1.7426, + "step": 20558 + }, + { + "epoch": 6.310313075506445, + "grad_norm": 0.22017790377140045, + "learning_rate": 3.165497707966301e-05, + "loss": 1.6977, + "step": 20559 + }, + { + "epoch": 6.310620012277471, + "grad_norm": 0.2934584617614746, + "learning_rate": 3.165035325627257e-05, + "loss": 1.7252, + "step": 20560 + }, + { + "epoch": 6.310926949048496, + "grad_norm": 0.21830198168754578, + "learning_rate": 3.1645729614224126e-05, + "loss": 1.781, + "step": 20561 + }, + { + "epoch": 6.311233885819521, + "grad_norm": 0.3082836866378784, + "learning_rate": 3.1641106153563306e-05, + "loss": 1.8015, + "step": 20562 + }, + { + "epoch": 6.311540822590547, + "grad_norm": 0.22441358864307404, + "learning_rate": 3.163648287433586e-05, + "loss": 1.8058, + "step": 20563 + }, + { + "epoch": 6.311847759361571, + "grad_norm": 0.36623889207839966, + "learning_rate": 3.163185977658744e-05, + "loss": 1.7092, + "step": 20564 + }, + { + "epoch": 6.3121546961325965, + "grad_norm": 0.22231145203113556, + "learning_rate": 3.1627236860363755e-05, + "loss": 1.6432, + "step": 20565 + }, + { + "epoch": 6.312461632903622, + "grad_norm": 0.25871971249580383, + "learning_rate": 3.162261412571047e-05, + "loss": 1.7156, + "step": 20566 + }, + { + "epoch": 
6.312768569674647, + "grad_norm": 0.24574241042137146, + "learning_rate": 3.16179915726733e-05, + "loss": 1.7977, + "step": 20567 + }, + { + "epoch": 6.3130755064456725, + "grad_norm": 0.197379007935524, + "learning_rate": 3.1613369201297895e-05, + "loss": 1.6966, + "step": 20568 + }, + { + "epoch": 6.313382443216697, + "grad_norm": 0.2149469256401062, + "learning_rate": 3.1608747011629975e-05, + "loss": 1.7385, + "step": 20569 + }, + { + "epoch": 6.313689379987722, + "grad_norm": 0.21942345798015594, + "learning_rate": 3.1604125003715174e-05, + "loss": 1.7369, + "step": 20570 + }, + { + "epoch": 6.313996316758748, + "grad_norm": 0.20977036654949188, + "learning_rate": 3.1599503177599197e-05, + "loss": 1.7429, + "step": 20571 + }, + { + "epoch": 6.314303253529773, + "grad_norm": 0.20113405585289001, + "learning_rate": 3.159488153332772e-05, + "loss": 1.7163, + "step": 20572 + }, + { + "epoch": 6.314610190300798, + "grad_norm": 0.22031868994235992, + "learning_rate": 3.1590260070946414e-05, + "loss": 1.7085, + "step": 20573 + }, + { + "epoch": 6.314917127071823, + "grad_norm": 0.24137777090072632, + "learning_rate": 3.158563879050094e-05, + "loss": 1.7169, + "step": 20574 + }, + { + "epoch": 6.315224063842848, + "grad_norm": 0.20265905559062958, + "learning_rate": 3.1581017692036985e-05, + "loss": 1.7466, + "step": 20575 + }, + { + "epoch": 6.315531000613873, + "grad_norm": 0.2997782528400421, + "learning_rate": 3.1576396775600206e-05, + "loss": 1.7287, + "step": 20576 + }, + { + "epoch": 6.315837937384899, + "grad_norm": 0.19672340154647827, + "learning_rate": 3.157177604123628e-05, + "loss": 1.7121, + "step": 20577 + }, + { + "epoch": 6.316144874155924, + "grad_norm": 0.26618507504463196, + "learning_rate": 3.156715548899085e-05, + "loss": 1.6958, + "step": 20578 + }, + { + "epoch": 6.316451810926949, + "grad_norm": 0.18854503333568573, + "learning_rate": 3.156253511890959e-05, + "loss": 1.7751, + "step": 20579 + }, + { + "epoch": 6.316758747697974, + "grad_norm": 
0.2306061089038849, + "learning_rate": 3.155791493103819e-05, + "loss": 1.6853, + "step": 20580 + }, + { + "epoch": 6.317065684468999, + "grad_norm": 0.20650778710842133, + "learning_rate": 3.1553294925422254e-05, + "loss": 1.7021, + "step": 20581 + }, + { + "epoch": 6.3173726212400245, + "grad_norm": 0.19474658370018005, + "learning_rate": 3.1548675102107494e-05, + "loss": 1.7146, + "step": 20582 + }, + { + "epoch": 6.31767955801105, + "grad_norm": 0.2150747925043106, + "learning_rate": 3.154405546113952e-05, + "loss": 1.7473, + "step": 20583 + }, + { + "epoch": 6.317986494782075, + "grad_norm": 0.19304975867271423, + "learning_rate": 3.153943600256402e-05, + "loss": 1.7209, + "step": 20584 + }, + { + "epoch": 6.3182934315531, + "grad_norm": 0.22610948979854584, + "learning_rate": 3.153481672642662e-05, + "loss": 1.717, + "step": 20585 + }, + { + "epoch": 6.318600368324125, + "grad_norm": 0.18705105781555176, + "learning_rate": 3.1530197632773006e-05, + "loss": 1.7326, + "step": 20586 + }, + { + "epoch": 6.31890730509515, + "grad_norm": 0.25632867217063904, + "learning_rate": 3.152557872164878e-05, + "loss": 1.7391, + "step": 20587 + }, + { + "epoch": 6.319214241866176, + "grad_norm": 0.18723119795322418, + "learning_rate": 3.152095999309964e-05, + "loss": 1.7193, + "step": 20588 + }, + { + "epoch": 6.319521178637201, + "grad_norm": 0.1759091317653656, + "learning_rate": 3.1516341447171184e-05, + "loss": 1.7024, + "step": 20589 + }, + { + "epoch": 6.319828115408226, + "grad_norm": 0.1838626265525818, + "learning_rate": 3.1511723083909084e-05, + "loss": 1.7027, + "step": 20590 + }, + { + "epoch": 6.320135052179251, + "grad_norm": 0.2615656554698944, + "learning_rate": 3.1507104903358964e-05, + "loss": 1.7798, + "step": 20591 + }, + { + "epoch": 6.320441988950276, + "grad_norm": 0.18816477060317993, + "learning_rate": 3.150248690556649e-05, + "loss": 1.6778, + "step": 20592 + }, + { + "epoch": 6.320748925721301, + "grad_norm": 0.20011866092681885, + "learning_rate": 
3.149786909057728e-05, + "loss": 1.6653, + "step": 20593 + }, + { + "epoch": 6.321055862492327, + "grad_norm": 0.26681140065193176, + "learning_rate": 3.149325145843696e-05, + "loss": 1.7523, + "step": 20594 + }, + { + "epoch": 6.321362799263352, + "grad_norm": 0.2062411904335022, + "learning_rate": 3.1488634009191177e-05, + "loss": 1.7584, + "step": 20595 + }, + { + "epoch": 6.3216697360343765, + "grad_norm": 0.22355243563652039, + "learning_rate": 3.148401674288556e-05, + "loss": 1.7106, + "step": 20596 + }, + { + "epoch": 6.321976672805402, + "grad_norm": 0.20189255475997925, + "learning_rate": 3.147939965956576e-05, + "loss": 1.6775, + "step": 20597 + }, + { + "epoch": 6.322283609576427, + "grad_norm": 0.23753875494003296, + "learning_rate": 3.147478275927736e-05, + "loss": 1.7661, + "step": 20598 + }, + { + "epoch": 6.3225905463474525, + "grad_norm": 0.18658648431301117, + "learning_rate": 3.147016604206604e-05, + "loss": 1.7562, + "step": 20599 + }, + { + "epoch": 6.322897483118478, + "grad_norm": 0.2610020637512207, + "learning_rate": 3.146554950797738e-05, + "loss": 1.7217, + "step": 20600 + }, + { + "epoch": 6.323204419889503, + "grad_norm": 0.18329289555549622, + "learning_rate": 3.146093315705704e-05, + "loss": 1.7206, + "step": 20601 + }, + { + "epoch": 6.323511356660528, + "grad_norm": 0.2393725961446762, + "learning_rate": 3.1456316989350606e-05, + "loss": 1.7646, + "step": 20602 + }, + { + "epoch": 6.323818293431553, + "grad_norm": 0.23535947501659393, + "learning_rate": 3.1451701004903736e-05, + "loss": 1.7718, + "step": 20603 + }, + { + "epoch": 6.324125230202578, + "grad_norm": 0.23179253935813904, + "learning_rate": 3.1447085203762014e-05, + "loss": 1.7311, + "step": 20604 + }, + { + "epoch": 6.324432166973604, + "grad_norm": 0.24929681420326233, + "learning_rate": 3.144246958597109e-05, + "loss": 1.7728, + "step": 20605 + }, + { + "epoch": 6.324739103744629, + "grad_norm": 0.22520960867404938, + "learning_rate": 3.1437854151576526e-05, + "loss": 
1.749, + "step": 20606 + }, + { + "epoch": 6.3250460405156534, + "grad_norm": 0.3005391061306, + "learning_rate": 3.1433238900623997e-05, + "loss": 1.7725, + "step": 20607 + }, + { + "epoch": 6.325352977286679, + "grad_norm": 0.22625432908535004, + "learning_rate": 3.142862383315908e-05, + "loss": 1.7083, + "step": 20608 + }, + { + "epoch": 6.325659914057704, + "grad_norm": 0.28015029430389404, + "learning_rate": 3.142400894922737e-05, + "loss": 1.6862, + "step": 20609 + }, + { + "epoch": 6.3259668508287294, + "grad_norm": 0.2520587146282196, + "learning_rate": 3.141939424887451e-05, + "loss": 1.7059, + "step": 20610 + }, + { + "epoch": 6.326273787599755, + "grad_norm": 0.24668551981449127, + "learning_rate": 3.141477973214607e-05, + "loss": 1.6858, + "step": 20611 + }, + { + "epoch": 6.326580724370779, + "grad_norm": 0.2524704337120056, + "learning_rate": 3.1410165399087675e-05, + "loss": 1.6884, + "step": 20612 + }, + { + "epoch": 6.326887661141805, + "grad_norm": 0.18849264085292816, + "learning_rate": 3.1405551249744916e-05, + "loss": 1.6984, + "step": 20613 + }, + { + "epoch": 6.32719459791283, + "grad_norm": 0.2411552518606186, + "learning_rate": 3.140093728416342e-05, + "loss": 1.7455, + "step": 20614 + }, + { + "epoch": 6.327501534683855, + "grad_norm": 0.2268913835287094, + "learning_rate": 3.139632350238874e-05, + "loss": 1.7124, + "step": 20615 + }, + { + "epoch": 6.327808471454881, + "grad_norm": 0.3118770718574524, + "learning_rate": 3.1391709904466515e-05, + "loss": 1.7322, + "step": 20616 + }, + { + "epoch": 6.328115408225905, + "grad_norm": 0.25166428089141846, + "learning_rate": 3.1387096490442294e-05, + "loss": 1.7136, + "step": 20617 + }, + { + "epoch": 6.32842234499693, + "grad_norm": 0.2733297049999237, + "learning_rate": 3.138248326036172e-05, + "loss": 1.7939, + "step": 20618 + }, + { + "epoch": 6.328729281767956, + "grad_norm": 0.24583236873149872, + "learning_rate": 3.1377870214270334e-05, + "loss": 1.7105, + "step": 20619 + }, + { + 
"epoch": 6.329036218538981, + "grad_norm": 0.2533528506755829, + "learning_rate": 3.137325735221377e-05, + "loss": 1.7828, + "step": 20620 + }, + { + "epoch": 6.329343155310006, + "grad_norm": 0.27662715315818787, + "learning_rate": 3.136864467423758e-05, + "loss": 1.6969, + "step": 20621 + }, + { + "epoch": 6.329650092081032, + "grad_norm": 0.20107655227184296, + "learning_rate": 3.136403218038738e-05, + "loss": 1.6659, + "step": 20622 + }, + { + "epoch": 6.329957028852056, + "grad_norm": 0.21126115322113037, + "learning_rate": 3.135941987070872e-05, + "loss": 1.7372, + "step": 20623 + }, + { + "epoch": 6.3302639656230815, + "grad_norm": 0.1840609908103943, + "learning_rate": 3.1354807745247206e-05, + "loss": 1.7219, + "step": 20624 + }, + { + "epoch": 6.330570902394107, + "grad_norm": 0.23623648285865784, + "learning_rate": 3.135019580404842e-05, + "loss": 1.8059, + "step": 20625 + }, + { + "epoch": 6.330877839165132, + "grad_norm": 0.19853124022483826, + "learning_rate": 3.134558404715792e-05, + "loss": 1.7336, + "step": 20626 + }, + { + "epoch": 6.3311847759361575, + "grad_norm": 0.2261304259300232, + "learning_rate": 3.13409724746213e-05, + "loss": 1.7508, + "step": 20627 + }, + { + "epoch": 6.331491712707182, + "grad_norm": 0.1797952800989151, + "learning_rate": 3.1336361086484104e-05, + "loss": 1.6569, + "step": 20628 + }, + { + "epoch": 6.331798649478207, + "grad_norm": 0.21610359847545624, + "learning_rate": 3.133174988279195e-05, + "loss": 1.7093, + "step": 20629 + }, + { + "epoch": 6.332105586249233, + "grad_norm": 0.1818271279335022, + "learning_rate": 3.1327138863590365e-05, + "loss": 1.6951, + "step": 20630 + }, + { + "epoch": 6.332412523020258, + "grad_norm": 0.20425963401794434, + "learning_rate": 3.1322528028924956e-05, + "loss": 1.7399, + "step": 20631 + }, + { + "epoch": 6.332719459791283, + "grad_norm": 0.20357854664325714, + "learning_rate": 3.131791737884126e-05, + "loss": 1.693, + "step": 20632 + }, + { + "epoch": 6.333026396562309, + 
"grad_norm": 0.25307130813598633, + "learning_rate": 3.1313306913384874e-05, + "loss": 1.674, + "step": 20633 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.21596084535121918, + "learning_rate": 3.130869663260132e-05, + "loss": 1.7521, + "step": 20634 + }, + { + "epoch": 6.333640270104358, + "grad_norm": 0.24110902845859528, + "learning_rate": 3.1304086536536194e-05, + "loss": 1.6723, + "step": 20635 + }, + { + "epoch": 6.333947206875384, + "grad_norm": 0.21365956962108612, + "learning_rate": 3.129947662523503e-05, + "loss": 1.7702, + "step": 20636 + }, + { + "epoch": 6.334254143646409, + "grad_norm": 0.21873877942562103, + "learning_rate": 3.129486689874341e-05, + "loss": 1.7176, + "step": 20637 + }, + { + "epoch": 6.334561080417434, + "grad_norm": 0.2543679475784302, + "learning_rate": 3.129025735710687e-05, + "loss": 1.7733, + "step": 20638 + }, + { + "epoch": 6.334868017188459, + "grad_norm": 0.24591630697250366, + "learning_rate": 3.1285648000370996e-05, + "loss": 1.7212, + "step": 20639 + }, + { + "epoch": 6.335174953959484, + "grad_norm": 0.2453039139509201, + "learning_rate": 3.128103882858129e-05, + "loss": 1.7316, + "step": 20640 + }, + { + "epoch": 6.3354818907305095, + "grad_norm": 0.239897683262825, + "learning_rate": 3.127642984178334e-05, + "loss": 1.7495, + "step": 20641 + }, + { + "epoch": 6.335788827501535, + "grad_norm": 0.20719192922115326, + "learning_rate": 3.12718210400227e-05, + "loss": 1.7242, + "step": 20642 + }, + { + "epoch": 6.33609576427256, + "grad_norm": 0.1813955008983612, + "learning_rate": 3.126721242334487e-05, + "loss": 1.672, + "step": 20643 + }, + { + "epoch": 6.336402701043585, + "grad_norm": 0.20045650005340576, + "learning_rate": 3.126260399179546e-05, + "loss": 1.7854, + "step": 20644 + }, + { + "epoch": 6.33670963781461, + "grad_norm": 0.23010976612567902, + "learning_rate": 3.125799574541995e-05, + "loss": 1.7508, + "step": 20645 + }, + { + "epoch": 6.337016574585635, + "grad_norm": 0.1854519248008728, + 
"learning_rate": 3.1253387684263924e-05, + "loss": 1.7049, + "step": 20646 + }, + { + "epoch": 6.337323511356661, + "grad_norm": 0.2062511295080185, + "learning_rate": 3.1248779808372894e-05, + "loss": 1.6894, + "step": 20647 + }, + { + "epoch": 6.337630448127686, + "grad_norm": 0.19851341843605042, + "learning_rate": 3.124417211779244e-05, + "loss": 1.7332, + "step": 20648 + }, + { + "epoch": 6.337937384898711, + "grad_norm": 0.2099175751209259, + "learning_rate": 3.1239564612568054e-05, + "loss": 1.7577, + "step": 20649 + }, + { + "epoch": 6.338244321669736, + "grad_norm": 0.2152891904115677, + "learning_rate": 3.123495729274529e-05, + "loss": 1.7691, + "step": 20650 + }, + { + "epoch": 6.338551258440761, + "grad_norm": 0.19431835412979126, + "learning_rate": 3.123035015836967e-05, + "loss": 1.7035, + "step": 20651 + }, + { + "epoch": 6.338858195211786, + "grad_norm": 0.20863930881023407, + "learning_rate": 3.122574320948674e-05, + "loss": 1.7166, + "step": 20652 + }, + { + "epoch": 6.339165131982812, + "grad_norm": 0.17948369681835175, + "learning_rate": 3.122113644614201e-05, + "loss": 1.732, + "step": 20653 + }, + { + "epoch": 6.339472068753837, + "grad_norm": 0.2329161912202835, + "learning_rate": 3.121652986838103e-05, + "loss": 1.6934, + "step": 20654 + }, + { + "epoch": 6.3397790055248615, + "grad_norm": 0.23563681542873383, + "learning_rate": 3.12119234762493e-05, + "loss": 1.7329, + "step": 20655 + }, + { + "epoch": 6.340085942295887, + "grad_norm": 0.22654885053634644, + "learning_rate": 3.120731726979236e-05, + "loss": 1.767, + "step": 20656 + }, + { + "epoch": 6.340392879066912, + "grad_norm": 0.2507181465625763, + "learning_rate": 3.1202711249055715e-05, + "loss": 1.7071, + "step": 20657 + }, + { + "epoch": 6.3406998158379375, + "grad_norm": 0.20573864877223969, + "learning_rate": 3.1198105414084906e-05, + "loss": 1.7566, + "step": 20658 + }, + { + "epoch": 6.341006752608963, + "grad_norm": 0.23311644792556763, + "learning_rate": 
3.119349976492545e-05, + "loss": 1.6778, + "step": 20659 + }, + { + "epoch": 6.341313689379987, + "grad_norm": 0.18166053295135498, + "learning_rate": 3.118889430162283e-05, + "loss": 1.7109, + "step": 20660 + }, + { + "epoch": 6.341620626151013, + "grad_norm": 0.21054090559482574, + "learning_rate": 3.11842890242226e-05, + "loss": 1.7255, + "step": 20661 + }, + { + "epoch": 6.341927562922038, + "grad_norm": 0.19898973405361176, + "learning_rate": 3.1179683932770235e-05, + "loss": 1.7017, + "step": 20662 + }, + { + "epoch": 6.342234499693063, + "grad_norm": 0.17782434821128845, + "learning_rate": 3.117507902731127e-05, + "loss": 1.6858, + "step": 20663 + }, + { + "epoch": 6.342541436464089, + "grad_norm": 0.19286927580833435, + "learning_rate": 3.117047430789121e-05, + "loss": 1.707, + "step": 20664 + }, + { + "epoch": 6.342848373235114, + "grad_norm": 0.18578651547431946, + "learning_rate": 3.1165869774555565e-05, + "loss": 1.7331, + "step": 20665 + }, + { + "epoch": 6.343155310006138, + "grad_norm": 0.19728249311447144, + "learning_rate": 3.1161265427349826e-05, + "loss": 1.7165, + "step": 20666 + }, + { + "epoch": 6.343462246777164, + "grad_norm": 0.18240176141262054, + "learning_rate": 3.115666126631952e-05, + "loss": 1.7167, + "step": 20667 + }, + { + "epoch": 6.343769183548189, + "grad_norm": 0.1928495317697525, + "learning_rate": 3.115205729151011e-05, + "loss": 1.7431, + "step": 20668 + }, + { + "epoch": 6.344076120319214, + "grad_norm": 0.19459952414035797, + "learning_rate": 3.1147453502967125e-05, + "loss": 1.7294, + "step": 20669 + }, + { + "epoch": 6.34438305709024, + "grad_norm": 0.18829894065856934, + "learning_rate": 3.1142849900736046e-05, + "loss": 1.7512, + "step": 20670 + }, + { + "epoch": 6.344689993861264, + "grad_norm": 0.19678451120853424, + "learning_rate": 3.11382464848624e-05, + "loss": 1.673, + "step": 20671 + }, + { + "epoch": 6.3449969306322895, + "grad_norm": 0.22256550192832947, + "learning_rate": 3.1133643255391635e-05, + "loss": 
1.7044, + "step": 20672 + }, + { + "epoch": 6.345303867403315, + "grad_norm": 0.24741628766059875, + "learning_rate": 3.112904021236929e-05, + "loss": 1.7904, + "step": 20673 + }, + { + "epoch": 6.34561080417434, + "grad_norm": 0.20286159217357635, + "learning_rate": 3.11244373558408e-05, + "loss": 1.6976, + "step": 20674 + }, + { + "epoch": 6.3459177409453655, + "grad_norm": 0.2005387842655182, + "learning_rate": 3.11198346858517e-05, + "loss": 1.7083, + "step": 20675 + }, + { + "epoch": 6.346224677716391, + "grad_norm": 0.22312256693840027, + "learning_rate": 3.111523220244747e-05, + "loss": 1.7575, + "step": 20676 + }, + { + "epoch": 6.346531614487415, + "grad_norm": 0.2968841791152954, + "learning_rate": 3.111062990567356e-05, + "loss": 1.7813, + "step": 20677 + }, + { + "epoch": 6.346838551258441, + "grad_norm": 0.22900697588920593, + "learning_rate": 3.1106027795575496e-05, + "loss": 1.6818, + "step": 20678 + }, + { + "epoch": 6.347145488029466, + "grad_norm": 0.1912240833044052, + "learning_rate": 3.110142587219873e-05, + "loss": 1.7174, + "step": 20679 + }, + { + "epoch": 6.347452424800491, + "grad_norm": 0.20461280643939972, + "learning_rate": 3.1096824135588754e-05, + "loss": 1.6945, + "step": 20680 + }, + { + "epoch": 6.347759361571517, + "grad_norm": 0.19344913959503174, + "learning_rate": 3.109222258579103e-05, + "loss": 1.7064, + "step": 20681 + }, + { + "epoch": 6.348066298342541, + "grad_norm": 0.1833983063697815, + "learning_rate": 3.108762122285106e-05, + "loss": 1.702, + "step": 20682 + }, + { + "epoch": 6.348373235113566, + "grad_norm": 0.20344893634319305, + "learning_rate": 3.108302004681429e-05, + "loss": 1.7323, + "step": 20683 + }, + { + "epoch": 6.348680171884592, + "grad_norm": 0.18629617989063263, + "learning_rate": 3.107841905772622e-05, + "loss": 1.6841, + "step": 20684 + }, + { + "epoch": 6.348987108655617, + "grad_norm": 0.19279471039772034, + "learning_rate": 3.107381825563228e-05, + "loss": 1.7581, + "step": 20685 + }, + { + 
"epoch": 6.349294045426642, + "grad_norm": 0.21727058291435242, + "learning_rate": 3.106921764057798e-05, + "loss": 1.7231, + "step": 20686 + }, + { + "epoch": 6.349600982197667, + "grad_norm": 0.20952723920345306, + "learning_rate": 3.1064617212608747e-05, + "loss": 1.713, + "step": 20687 + }, + { + "epoch": 6.349907918968692, + "grad_norm": 0.2358582466840744, + "learning_rate": 3.10600169717701e-05, + "loss": 1.7291, + "step": 20688 + }, + { + "epoch": 6.350214855739718, + "grad_norm": 0.21846619248390198, + "learning_rate": 3.105541691810743e-05, + "loss": 1.7365, + "step": 20689 + }, + { + "epoch": 6.350521792510743, + "grad_norm": 0.22137843072414398, + "learning_rate": 3.1050817051666256e-05, + "loss": 1.7404, + "step": 20690 + }, + { + "epoch": 6.350828729281768, + "grad_norm": 0.2301674485206604, + "learning_rate": 3.1046217372492e-05, + "loss": 1.7422, + "step": 20691 + }, + { + "epoch": 6.351135666052793, + "grad_norm": 0.18955166637897491, + "learning_rate": 3.104161788063015e-05, + "loss": 1.7063, + "step": 20692 + }, + { + "epoch": 6.351442602823818, + "grad_norm": 0.21172095835208893, + "learning_rate": 3.103701857612614e-05, + "loss": 1.6856, + "step": 20693 + }, + { + "epoch": 6.351749539594843, + "grad_norm": 0.20921260118484497, + "learning_rate": 3.103241945902541e-05, + "loss": 1.7384, + "step": 20694 + }, + { + "epoch": 6.352056476365869, + "grad_norm": 0.21005603671073914, + "learning_rate": 3.102782052937345e-05, + "loss": 1.7118, + "step": 20695 + }, + { + "epoch": 6.352363413136894, + "grad_norm": 0.20888659358024597, + "learning_rate": 3.102322178721567e-05, + "loss": 1.7172, + "step": 20696 + }, + { + "epoch": 6.352670349907919, + "grad_norm": 0.194463849067688, + "learning_rate": 3.101862323259754e-05, + "loss": 1.6909, + "step": 20697 + }, + { + "epoch": 6.352977286678944, + "grad_norm": 0.20848685503005981, + "learning_rate": 3.1014024865564494e-05, + "loss": 1.7846, + "step": 20698 + }, + { + "epoch": 6.353284223449969, + 
"grad_norm": 0.18669761717319489, + "learning_rate": 3.100942668616201e-05, + "loss": 1.7542, + "step": 20699 + }, + { + "epoch": 6.3535911602209945, + "grad_norm": 0.23618464171886444, + "learning_rate": 3.100482869443547e-05, + "loss": 1.7292, + "step": 20700 + }, + { + "epoch": 6.35389809699202, + "grad_norm": 0.19389905035495758, + "learning_rate": 3.100023089043037e-05, + "loss": 1.6847, + "step": 20701 + }, + { + "epoch": 6.354205033763045, + "grad_norm": 0.20346343517303467, + "learning_rate": 3.09956332741921e-05, + "loss": 1.7096, + "step": 20702 + }, + { + "epoch": 6.35451197053407, + "grad_norm": 0.20825842022895813, + "learning_rate": 3.099103584576614e-05, + "loss": 1.6974, + "step": 20703 + }, + { + "epoch": 6.354818907305095, + "grad_norm": 0.2093508094549179, + "learning_rate": 3.0986438605197895e-05, + "loss": 1.6849, + "step": 20704 + }, + { + "epoch": 6.35512584407612, + "grad_norm": 0.2576633393764496, + "learning_rate": 3.098184155253282e-05, + "loss": 1.7974, + "step": 20705 + }, + { + "epoch": 6.355432780847146, + "grad_norm": 0.18197253346443176, + "learning_rate": 3.097724468781632e-05, + "loss": 1.6723, + "step": 20706 + }, + { + "epoch": 6.355739717618171, + "grad_norm": 0.24809512495994568, + "learning_rate": 3.0972648011093855e-05, + "loss": 1.7378, + "step": 20707 + }, + { + "epoch": 6.356046654389196, + "grad_norm": 0.2046923190355301, + "learning_rate": 3.0968051522410814e-05, + "loss": 1.7502, + "step": 20708 + }, + { + "epoch": 6.356353591160221, + "grad_norm": 0.20443019270896912, + "learning_rate": 3.096345522181265e-05, + "loss": 1.7179, + "step": 20709 + }, + { + "epoch": 6.356660527931246, + "grad_norm": 0.1906277984380722, + "learning_rate": 3.09588591093448e-05, + "loss": 1.7167, + "step": 20710 + }, + { + "epoch": 6.356967464702271, + "grad_norm": 0.20729197561740875, + "learning_rate": 3.095426318505263e-05, + "loss": 1.7193, + "step": 20711 + }, + { + "epoch": 6.357274401473297, + "grad_norm": 0.23446644842624664, + 
"learning_rate": 3.094966744898162e-05, + "loss": 1.7341, + "step": 20712 + }, + { + "epoch": 6.357581338244322, + "grad_norm": 0.18882590532302856, + "learning_rate": 3.094507190117715e-05, + "loss": 1.7001, + "step": 20713 + }, + { + "epoch": 6.3578882750153465, + "grad_norm": 0.27240705490112305, + "learning_rate": 3.094047654168465e-05, + "loss": 1.7641, + "step": 20714 + }, + { + "epoch": 6.358195211786372, + "grad_norm": 0.19616954028606415, + "learning_rate": 3.093588137054952e-05, + "loss": 1.751, + "step": 20715 + }, + { + "epoch": 6.358502148557397, + "grad_norm": 0.23402562737464905, + "learning_rate": 3.093128638781721e-05, + "loss": 1.7274, + "step": 20716 + }, + { + "epoch": 6.3588090853284225, + "grad_norm": 0.18189528584480286, + "learning_rate": 3.092669159353309e-05, + "loss": 1.7079, + "step": 20717 + }, + { + "epoch": 6.359116022099448, + "grad_norm": 0.21583771705627441, + "learning_rate": 3.092209698774259e-05, + "loss": 1.6811, + "step": 20718 + }, + { + "epoch": 6.359422958870473, + "grad_norm": 0.2477681040763855, + "learning_rate": 3.091750257049109e-05, + "loss": 1.6963, + "step": 20719 + }, + { + "epoch": 6.359729895641498, + "grad_norm": 0.2883109152317047, + "learning_rate": 3.091290834182403e-05, + "loss": 1.8349, + "step": 20720 + }, + { + "epoch": 6.360036832412523, + "grad_norm": 0.23407170176506042, + "learning_rate": 3.09083143017868e-05, + "loss": 1.7271, + "step": 20721 + }, + { + "epoch": 6.360343769183548, + "grad_norm": 0.2818833589553833, + "learning_rate": 3.090372045042479e-05, + "loss": 1.7852, + "step": 20722 + }, + { + "epoch": 6.360650705954574, + "grad_norm": 0.24415317177772522, + "learning_rate": 3.089912678778341e-05, + "loss": 1.6826, + "step": 20723 + }, + { + "epoch": 6.360957642725599, + "grad_norm": 0.26786303520202637, + "learning_rate": 3.0894533313908056e-05, + "loss": 1.7616, + "step": 20724 + }, + { + "epoch": 6.361264579496623, + "grad_norm": 0.3235633969306946, + "learning_rate": 3.088994002884411e-05, 
+ "loss": 1.7637, + "step": 20725 + }, + { + "epoch": 6.361571516267649, + "grad_norm": 0.18675416707992554, + "learning_rate": 3.0885346932637e-05, + "loss": 1.7037, + "step": 20726 + }, + { + "epoch": 6.361878453038674, + "grad_norm": 0.295802503824234, + "learning_rate": 3.0880754025332084e-05, + "loss": 1.7435, + "step": 20727 + }, + { + "epoch": 6.362185389809699, + "grad_norm": 0.18665561079978943, + "learning_rate": 3.0876161306974756e-05, + "loss": 1.684, + "step": 20728 + }, + { + "epoch": 6.362492326580725, + "grad_norm": 0.2530463635921478, + "learning_rate": 3.087156877761043e-05, + "loss": 1.7934, + "step": 20729 + }, + { + "epoch": 6.362799263351749, + "grad_norm": 0.17860126495361328, + "learning_rate": 3.086697643728445e-05, + "loss": 1.6977, + "step": 20730 + }, + { + "epoch": 6.3631062001227745, + "grad_norm": 0.20118845999240875, + "learning_rate": 3.086238428604223e-05, + "loss": 1.7241, + "step": 20731 + }, + { + "epoch": 6.3634131368938, + "grad_norm": 0.18811924755573273, + "learning_rate": 3.085779232392915e-05, + "loss": 1.6918, + "step": 20732 + }, + { + "epoch": 6.363720073664825, + "grad_norm": 0.1841908097267151, + "learning_rate": 3.085320055099058e-05, + "loss": 1.735, + "step": 20733 + }, + { + "epoch": 6.3640270104358505, + "grad_norm": 0.1956033855676651, + "learning_rate": 3.08486089672719e-05, + "loss": 1.7203, + "step": 20734 + }, + { + "epoch": 6.364333947206875, + "grad_norm": 0.19844500720500946, + "learning_rate": 3.084401757281851e-05, + "loss": 1.6767, + "step": 20735 + }, + { + "epoch": 6.3646408839779, + "grad_norm": 0.2018919438123703, + "learning_rate": 3.083942636767575e-05, + "loss": 1.6912, + "step": 20736 + }, + { + "epoch": 6.364947820748926, + "grad_norm": 0.18929271399974823, + "learning_rate": 3.083483535188901e-05, + "loss": 1.6838, + "step": 20737 + }, + { + "epoch": 6.365254757519951, + "grad_norm": 0.19833499193191528, + "learning_rate": 3.0830244525503674e-05, + "loss": 1.7139, + "step": 20738 + }, + { + 
"epoch": 6.365561694290976, + "grad_norm": 0.17029902338981628, + "learning_rate": 3.082565388856509e-05, + "loss": 1.6665, + "step": 20739 + }, + { + "epoch": 6.365868631062002, + "grad_norm": 0.19526802003383636, + "learning_rate": 3.082106344111861e-05, + "loss": 1.7021, + "step": 20740 + }, + { + "epoch": 6.366175567833026, + "grad_norm": 0.19061279296875, + "learning_rate": 3.081647318320966e-05, + "loss": 1.7134, + "step": 20741 + }, + { + "epoch": 6.366482504604051, + "grad_norm": 0.17782293260097504, + "learning_rate": 3.081188311488354e-05, + "loss": 1.741, + "step": 20742 + }, + { + "epoch": 6.366789441375077, + "grad_norm": 0.20002372562885284, + "learning_rate": 3.080729323618565e-05, + "loss": 1.6943, + "step": 20743 + }, + { + "epoch": 6.367096378146102, + "grad_norm": 0.22873486578464508, + "learning_rate": 3.080270354716134e-05, + "loss": 1.7223, + "step": 20744 + }, + { + "epoch": 6.367403314917127, + "grad_norm": 0.191136434674263, + "learning_rate": 3.079811404785595e-05, + "loss": 1.6774, + "step": 20745 + }, + { + "epoch": 6.367710251688152, + "grad_norm": 0.20446795225143433, + "learning_rate": 3.0793524738314874e-05, + "loss": 1.7443, + "step": 20746 + }, + { + "epoch": 6.368017188459177, + "grad_norm": 0.20668596029281616, + "learning_rate": 3.078893561858341e-05, + "loss": 1.7553, + "step": 20747 + }, + { + "epoch": 6.3683241252302025, + "grad_norm": 0.18445394933223724, + "learning_rate": 3.078434668870698e-05, + "loss": 1.7365, + "step": 20748 + }, + { + "epoch": 6.368631062001228, + "grad_norm": 0.1824318915605545, + "learning_rate": 3.077975794873088e-05, + "loss": 1.7248, + "step": 20749 + }, + { + "epoch": 6.368937998772253, + "grad_norm": 0.18452249467372894, + "learning_rate": 3.077516939870047e-05, + "loss": 1.7095, + "step": 20750 + }, + { + "epoch": 6.3692449355432785, + "grad_norm": 0.17254458367824554, + "learning_rate": 3.077058103866112e-05, + "loss": 1.6937, + "step": 20751 + }, + { + "epoch": 6.369551872314303, + 
"grad_norm": 0.2022976130247116, + "learning_rate": 3.0765992868658154e-05, + "loss": 1.7593, + "step": 20752 + }, + { + "epoch": 6.369858809085328, + "grad_norm": 0.19274397194385529, + "learning_rate": 3.076140488873691e-05, + "loss": 1.7288, + "step": 20753 + }, + { + "epoch": 6.370165745856354, + "grad_norm": 0.18847523629665375, + "learning_rate": 3.075681709894276e-05, + "loss": 1.7293, + "step": 20754 + }, + { + "epoch": 6.370472682627379, + "grad_norm": 0.21054589748382568, + "learning_rate": 3.075222949932101e-05, + "loss": 1.7688, + "step": 20755 + }, + { + "epoch": 6.370779619398404, + "grad_norm": 0.16934558749198914, + "learning_rate": 3.0747642089917005e-05, + "loss": 1.7092, + "step": 20756 + }, + { + "epoch": 6.371086556169429, + "grad_norm": 0.19154684245586395, + "learning_rate": 3.0743054870776075e-05, + "loss": 1.6827, + "step": 20757 + }, + { + "epoch": 6.371393492940454, + "grad_norm": 0.2622900605201721, + "learning_rate": 3.0738467841943594e-05, + "loss": 1.748, + "step": 20758 + }, + { + "epoch": 6.371700429711479, + "grad_norm": 0.1767888218164444, + "learning_rate": 3.073388100346484e-05, + "loss": 1.717, + "step": 20759 + }, + { + "epoch": 6.372007366482505, + "grad_norm": 0.21692602336406708, + "learning_rate": 3.072929435538518e-05, + "loss": 1.7543, + "step": 20760 + }, + { + "epoch": 6.37231430325353, + "grad_norm": 0.19853977859020233, + "learning_rate": 3.0724707897749926e-05, + "loss": 1.7599, + "step": 20761 + }, + { + "epoch": 6.3726212400245545, + "grad_norm": 0.1904703676700592, + "learning_rate": 3.0720121630604396e-05, + "loss": 1.7094, + "step": 20762 + }, + { + "epoch": 6.37292817679558, + "grad_norm": 0.1961483359336853, + "learning_rate": 3.071553555399395e-05, + "loss": 1.7363, + "step": 20763 + }, + { + "epoch": 6.373235113566605, + "grad_norm": 0.16419392824172974, + "learning_rate": 3.071094966796385e-05, + "loss": 1.7073, + "step": 20764 + }, + { + "epoch": 6.3735420503376305, + "grad_norm": 0.1784946471452713, + 
"learning_rate": 3.0706363972559476e-05, + "loss": 1.699, + "step": 20765 + }, + { + "epoch": 6.373848987108656, + "grad_norm": 0.19472888112068176, + "learning_rate": 3.070177846782611e-05, + "loss": 1.7541, + "step": 20766 + }, + { + "epoch": 6.37415592387968, + "grad_norm": 0.2355004847049713, + "learning_rate": 3.0697193153809076e-05, + "loss": 1.7389, + "step": 20767 + }, + { + "epoch": 6.374462860650706, + "grad_norm": 0.1956906020641327, + "learning_rate": 3.069260803055369e-05, + "loss": 1.7197, + "step": 20768 + }, + { + "epoch": 6.374769797421731, + "grad_norm": 0.21212655305862427, + "learning_rate": 3.068802309810529e-05, + "loss": 1.7291, + "step": 20769 + }, + { + "epoch": 6.375076734192756, + "grad_norm": 0.22920182347297668, + "learning_rate": 3.068343835650914e-05, + "loss": 1.7397, + "step": 20770 + }, + { + "epoch": 6.375383670963782, + "grad_norm": 0.2143404483795166, + "learning_rate": 3.0678853805810605e-05, + "loss": 1.76, + "step": 20771 + }, + { + "epoch": 6.375690607734807, + "grad_norm": 0.1848321557044983, + "learning_rate": 3.067426944605492e-05, + "loss": 1.7127, + "step": 20772 + }, + { + "epoch": 6.3759975445058314, + "grad_norm": 0.23339331150054932, + "learning_rate": 3.0669685277287465e-05, + "loss": 1.7828, + "step": 20773 + }, + { + "epoch": 6.376304481276857, + "grad_norm": 0.19590741395950317, + "learning_rate": 3.066510129955349e-05, + "loss": 1.7224, + "step": 20774 + }, + { + "epoch": 6.376611418047882, + "grad_norm": 0.19986604154109955, + "learning_rate": 3.066051751289833e-05, + "loss": 1.7412, + "step": 20775 + }, + { + "epoch": 6.3769183548189075, + "grad_norm": 0.18629087507724762, + "learning_rate": 3.0655933917367266e-05, + "loss": 1.695, + "step": 20776 + }, + { + "epoch": 6.377225291589933, + "grad_norm": 0.2248111218214035, + "learning_rate": 3.0651350513005605e-05, + "loss": 1.7685, + "step": 20777 + }, + { + "epoch": 6.377532228360957, + "grad_norm": 0.1803683638572693, + "learning_rate": 3.064676729985864e-05, 
+ "loss": 1.7206, + "step": 20778 + }, + { + "epoch": 6.377839165131983, + "grad_norm": 0.23836754262447357, + "learning_rate": 3.064218427797165e-05, + "loss": 1.7428, + "step": 20779 + }, + { + "epoch": 6.378146101903008, + "grad_norm": 0.22549279034137726, + "learning_rate": 3.063760144738996e-05, + "loss": 1.7314, + "step": 20780 + }, + { + "epoch": 6.378453038674033, + "grad_norm": 0.20714345574378967, + "learning_rate": 3.063301880815882e-05, + "loss": 1.7179, + "step": 20781 + }, + { + "epoch": 6.378759975445059, + "grad_norm": 0.17024052143096924, + "learning_rate": 3.0628436360323565e-05, + "loss": 1.6602, + "step": 20782 + }, + { + "epoch": 6.379066912216084, + "grad_norm": 0.20378601551055908, + "learning_rate": 3.062385410392943e-05, + "loss": 1.7708, + "step": 20783 + }, + { + "epoch": 6.379373848987108, + "grad_norm": 0.1885673850774765, + "learning_rate": 3.0619272039021734e-05, + "loss": 1.7034, + "step": 20784 + }, + { + "epoch": 6.379680785758134, + "grad_norm": 0.18746556341648102, + "learning_rate": 3.0614690165645746e-05, + "loss": 1.6946, + "step": 20785 + }, + { + "epoch": 6.379987722529159, + "grad_norm": 0.19569392502307892, + "learning_rate": 3.061010848384677e-05, + "loss": 1.7298, + "step": 20786 + }, + { + "epoch": 6.380294659300184, + "grad_norm": 0.21114139258861542, + "learning_rate": 3.0605526993670046e-05, + "loss": 1.795, + "step": 20787 + }, + { + "epoch": 6.38060159607121, + "grad_norm": 0.20940302312374115, + "learning_rate": 3.06009456951609e-05, + "loss": 1.6747, + "step": 20788 + }, + { + "epoch": 6.380908532842234, + "grad_norm": 0.21008993685245514, + "learning_rate": 3.059636458836455e-05, + "loss": 1.7219, + "step": 20789 + }, + { + "epoch": 6.3812154696132595, + "grad_norm": 0.17642457783222198, + "learning_rate": 3.0591783673326304e-05, + "loss": 1.6555, + "step": 20790 + }, + { + "epoch": 6.381522406384285, + "grad_norm": 0.2786177396774292, + "learning_rate": 3.058720295009143e-05, + "loss": 1.8463, + "step": 20791 + 
}, + { + "epoch": 6.38182934315531, + "grad_norm": 0.21209503710269928, + "learning_rate": 3.058262241870521e-05, + "loss": 1.6848, + "step": 20792 + }, + { + "epoch": 6.3821362799263355, + "grad_norm": 0.1880561262369156, + "learning_rate": 3.057804207921287e-05, + "loss": 1.7401, + "step": 20793 + }, + { + "epoch": 6.382443216697361, + "grad_norm": 0.22108516097068787, + "learning_rate": 3.0573461931659726e-05, + "loss": 1.7482, + "step": 20794 + }, + { + "epoch": 6.382750153468385, + "grad_norm": 0.2161533385515213, + "learning_rate": 3.0568881976091006e-05, + "loss": 1.7425, + "step": 20795 + }, + { + "epoch": 6.383057090239411, + "grad_norm": 0.22933612763881683, + "learning_rate": 3.0564302212551975e-05, + "loss": 1.7424, + "step": 20796 + }, + { + "epoch": 6.383364027010436, + "grad_norm": 0.19572989642620087, + "learning_rate": 3.0559722641087916e-05, + "loss": 1.6763, + "step": 20797 + }, + { + "epoch": 6.383670963781461, + "grad_norm": 0.2181084007024765, + "learning_rate": 3.0555143261744056e-05, + "loss": 1.7164, + "step": 20798 + }, + { + "epoch": 6.383977900552487, + "grad_norm": 0.1927991509437561, + "learning_rate": 3.055056407456569e-05, + "loss": 1.6833, + "step": 20799 + }, + { + "epoch": 6.384284837323511, + "grad_norm": 0.20569704473018646, + "learning_rate": 3.0545985079598025e-05, + "loss": 1.7716, + "step": 20800 + }, + { + "epoch": 6.384591774094536, + "grad_norm": 0.1856541931629181, + "learning_rate": 3.054140627688635e-05, + "loss": 1.6939, + "step": 20801 + }, + { + "epoch": 6.384898710865562, + "grad_norm": 0.2450970858335495, + "learning_rate": 3.05368276664759e-05, + "loss": 1.8197, + "step": 20802 + }, + { + "epoch": 6.385205647636587, + "grad_norm": 0.23325784504413605, + "learning_rate": 3.053224924841194e-05, + "loss": 1.7195, + "step": 20803 + }, + { + "epoch": 6.385512584407612, + "grad_norm": 0.19614358246326447, + "learning_rate": 3.052767102273968e-05, + "loss": 1.6966, + "step": 20804 + }, + { + "epoch": 6.385819521178637, 
+ "grad_norm": 0.20615628361701965, + "learning_rate": 3.0523092989504415e-05, + "loss": 1.7429, + "step": 20805 + }, + { + "epoch": 6.386126457949662, + "grad_norm": 0.18418943881988525, + "learning_rate": 3.0518515148751336e-05, + "loss": 1.7612, + "step": 20806 + }, + { + "epoch": 6.3864333947206875, + "grad_norm": 0.17176245152950287, + "learning_rate": 3.0513937500525725e-05, + "loss": 1.6918, + "step": 20807 + }, + { + "epoch": 6.386740331491713, + "grad_norm": 0.22239255905151367, + "learning_rate": 3.0509360044872787e-05, + "loss": 1.8072, + "step": 20808 + }, + { + "epoch": 6.387047268262738, + "grad_norm": 0.20312704145908356, + "learning_rate": 3.0504782781837798e-05, + "loss": 1.7348, + "step": 20809 + }, + { + "epoch": 6.387354205033763, + "grad_norm": 0.23198208212852478, + "learning_rate": 3.0500205711465958e-05, + "loss": 1.7516, + "step": 20810 + }, + { + "epoch": 6.387661141804788, + "grad_norm": 0.2244081050157547, + "learning_rate": 3.0495628833802526e-05, + "loss": 1.731, + "step": 20811 + }, + { + "epoch": 6.387968078575813, + "grad_norm": 0.18282169103622437, + "learning_rate": 3.0491052148892717e-05, + "loss": 1.6743, + "step": 20812 + }, + { + "epoch": 6.388275015346839, + "grad_norm": 0.19108405709266663, + "learning_rate": 3.0486475656781753e-05, + "loss": 1.7485, + "step": 20813 + }, + { + "epoch": 6.388581952117864, + "grad_norm": 0.20574834942817688, + "learning_rate": 3.0481899357514898e-05, + "loss": 1.6979, + "step": 20814 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 0.21263298392295837, + "learning_rate": 3.047732325113733e-05, + "loss": 1.687, + "step": 20815 + }, + { + "epoch": 6.389195825659914, + "grad_norm": 0.22646664083003998, + "learning_rate": 3.047274733769432e-05, + "loss": 1.7593, + "step": 20816 + }, + { + "epoch": 6.389502762430939, + "grad_norm": 0.1846906542778015, + "learning_rate": 3.046817161723104e-05, + "loss": 1.7271, + "step": 20817 + }, + { + "epoch": 6.389809699201964, + "grad_norm": 
0.1965247541666031, + "learning_rate": 3.0463596089792746e-05, + "loss": 1.7121, + "step": 20818 + }, + { + "epoch": 6.39011663597299, + "grad_norm": 0.255577951669693, + "learning_rate": 3.045902075542464e-05, + "loss": 1.7311, + "step": 20819 + }, + { + "epoch": 6.390423572744015, + "grad_norm": 0.1837676465511322, + "learning_rate": 3.0454445614171966e-05, + "loss": 1.7177, + "step": 20820 + }, + { + "epoch": 6.3907305095150395, + "grad_norm": 0.24845893681049347, + "learning_rate": 3.0449870666079895e-05, + "loss": 1.6902, + "step": 20821 + }, + { + "epoch": 6.391037446286065, + "grad_norm": 0.28572577238082886, + "learning_rate": 3.0445295911193678e-05, + "loss": 1.7942, + "step": 20822 + }, + { + "epoch": 6.39134438305709, + "grad_norm": 0.20460839569568634, + "learning_rate": 3.044072134955849e-05, + "loss": 1.6747, + "step": 20823 + }, + { + "epoch": 6.3916513198281155, + "grad_norm": 0.3547010123729706, + "learning_rate": 3.0436146981219565e-05, + "loss": 1.7359, + "step": 20824 + }, + { + "epoch": 6.391958256599141, + "grad_norm": 0.20490451157093048, + "learning_rate": 3.04315728062221e-05, + "loss": 1.6863, + "step": 20825 + }, + { + "epoch": 6.392265193370166, + "grad_norm": 0.25874415040016174, + "learning_rate": 3.0426998824611307e-05, + "loss": 1.6798, + "step": 20826 + }, + { + "epoch": 6.392572130141191, + "grad_norm": 0.27858632802963257, + "learning_rate": 3.0422425036432378e-05, + "loss": 1.6943, + "step": 20827 + }, + { + "epoch": 6.392879066912216, + "grad_norm": 0.20951922237873077, + "learning_rate": 3.041785144173054e-05, + "loss": 1.7025, + "step": 20828 + }, + { + "epoch": 6.393186003683241, + "grad_norm": 0.3158397674560547, + "learning_rate": 3.0413278040550952e-05, + "loss": 1.7193, + "step": 20829 + }, + { + "epoch": 6.393492940454267, + "grad_norm": 0.18556484580039978, + "learning_rate": 3.0408704832938824e-05, + "loss": 1.7017, + "step": 20830 + }, + { + "epoch": 6.393799877225292, + "grad_norm": 0.31651169061660767, + 
"learning_rate": 3.0404131818939376e-05, + "loss": 1.7716, + "step": 20831 + }, + { + "epoch": 6.394106813996316, + "grad_norm": 0.2850388288497925, + "learning_rate": 3.0399558998597765e-05, + "loss": 1.7144, + "step": 20832 + }, + { + "epoch": 6.394413750767342, + "grad_norm": 0.19256308674812317, + "learning_rate": 3.0394986371959223e-05, + "loss": 1.6603, + "step": 20833 + }, + { + "epoch": 6.394720687538367, + "grad_norm": 0.2654922604560852, + "learning_rate": 3.0390413939068896e-05, + "loss": 1.6825, + "step": 20834 + }, + { + "epoch": 6.395027624309392, + "grad_norm": 0.19514231383800507, + "learning_rate": 3.0385841699971997e-05, + "loss": 1.7226, + "step": 20835 + }, + { + "epoch": 6.395334561080418, + "grad_norm": 0.27765151858329773, + "learning_rate": 3.0381269654713702e-05, + "loss": 1.7599, + "step": 20836 + }, + { + "epoch": 6.395641497851442, + "grad_norm": 0.2056504338979721, + "learning_rate": 3.0376697803339215e-05, + "loss": 1.7237, + "step": 20837 + }, + { + "epoch": 6.3959484346224675, + "grad_norm": 0.22516649961471558, + "learning_rate": 3.0372126145893688e-05, + "loss": 1.7566, + "step": 20838 + }, + { + "epoch": 6.396255371393493, + "grad_norm": 0.17632099986076355, + "learning_rate": 3.0367554682422327e-05, + "loss": 1.7014, + "step": 20839 + }, + { + "epoch": 6.396562308164518, + "grad_norm": 0.21872831881046295, + "learning_rate": 3.036298341297028e-05, + "loss": 1.6935, + "step": 20840 + }, + { + "epoch": 6.3968692449355435, + "grad_norm": 0.22132672369480133, + "learning_rate": 3.0358412337582752e-05, + "loss": 1.6735, + "step": 20841 + }, + { + "epoch": 6.397176181706568, + "grad_norm": 0.17865684628486633, + "learning_rate": 3.0353841456304895e-05, + "loss": 1.7097, + "step": 20842 + }, + { + "epoch": 6.397483118477593, + "grad_norm": 0.2069701999425888, + "learning_rate": 3.0349270769181914e-05, + "loss": 1.7592, + "step": 20843 + }, + { + "epoch": 6.397790055248619, + "grad_norm": 0.19800925254821777, + "learning_rate": 
3.034470027625893e-05, + "loss": 1.6943, + "step": 20844 + }, + { + "epoch": 6.398096992019644, + "grad_norm": 0.24116787314414978, + "learning_rate": 3.0340129977581165e-05, + "loss": 1.7126, + "step": 20845 + }, + { + "epoch": 6.398403928790669, + "grad_norm": 0.1995212435722351, + "learning_rate": 3.033555987319375e-05, + "loss": 1.75, + "step": 20846 + }, + { + "epoch": 6.398710865561695, + "grad_norm": 0.23717111349105835, + "learning_rate": 3.0330989963141843e-05, + "loss": 1.7338, + "step": 20847 + }, + { + "epoch": 6.399017802332719, + "grad_norm": 0.18372474610805511, + "learning_rate": 3.0326420247470643e-05, + "loss": 1.7034, + "step": 20848 + }, + { + "epoch": 6.399324739103744, + "grad_norm": 0.25953924655914307, + "learning_rate": 3.0321850726225265e-05, + "loss": 1.731, + "step": 20849 + }, + { + "epoch": 6.39963167587477, + "grad_norm": 0.24846702814102173, + "learning_rate": 3.031728139945092e-05, + "loss": 1.7559, + "step": 20850 + }, + { + "epoch": 6.399938612645795, + "grad_norm": 0.20783887803554535, + "learning_rate": 3.0312712267192713e-05, + "loss": 1.7229, + "step": 20851 + }, + { + "epoch": 6.4002455494168204, + "grad_norm": 0.1904737949371338, + "learning_rate": 3.030814332949583e-05, + "loss": 1.6986, + "step": 20852 + }, + { + "epoch": 6.400552486187845, + "grad_norm": 0.2275397777557373, + "learning_rate": 3.030357458640541e-05, + "loss": 1.708, + "step": 20853 + }, + { + "epoch": 6.40085942295887, + "grad_norm": 0.20119737088680267, + "learning_rate": 3.0299006037966628e-05, + "loss": 1.7727, + "step": 20854 + }, + { + "epoch": 6.401166359729896, + "grad_norm": 0.17214249074459076, + "learning_rate": 3.0294437684224596e-05, + "loss": 1.6674, + "step": 20855 + }, + { + "epoch": 6.401473296500921, + "grad_norm": 0.21268978714942932, + "learning_rate": 3.02898695252245e-05, + "loss": 1.7182, + "step": 20856 + }, + { + "epoch": 6.401780233271946, + "grad_norm": 0.19911682605743408, + "learning_rate": 3.0285301561011448e-05, + "loss": 
1.6861, + "step": 20857 + }, + { + "epoch": 6.402087170042972, + "grad_norm": 0.194064199924469, + "learning_rate": 3.0280733791630613e-05, + "loss": 1.6768, + "step": 20858 + }, + { + "epoch": 6.402394106813996, + "grad_norm": 0.17554323375225067, + "learning_rate": 3.027616621712711e-05, + "loss": 1.6987, + "step": 20859 + }, + { + "epoch": 6.402701043585021, + "grad_norm": 0.205257385969162, + "learning_rate": 3.027159883754611e-05, + "loss": 1.7951, + "step": 20860 + }, + { + "epoch": 6.403007980356047, + "grad_norm": 0.1766849011182785, + "learning_rate": 3.0267031652932743e-05, + "loss": 1.7157, + "step": 20861 + }, + { + "epoch": 6.403314917127072, + "grad_norm": 0.17106789350509644, + "learning_rate": 3.0262464663332106e-05, + "loss": 1.685, + "step": 20862 + }, + { + "epoch": 6.403621853898097, + "grad_norm": 0.17380768060684204, + "learning_rate": 3.0257897868789377e-05, + "loss": 1.708, + "step": 20863 + }, + { + "epoch": 6.403928790669122, + "grad_norm": 0.15817396342754364, + "learning_rate": 3.0253331269349662e-05, + "loss": 1.6629, + "step": 20864 + }, + { + "epoch": 6.404235727440147, + "grad_norm": 0.18253934383392334, + "learning_rate": 3.0248764865058122e-05, + "loss": 1.6877, + "step": 20865 + }, + { + "epoch": 6.4045426642111725, + "grad_norm": 0.20645618438720703, + "learning_rate": 3.0244198655959843e-05, + "loss": 1.7238, + "step": 20866 + }, + { + "epoch": 6.404849600982198, + "grad_norm": 0.2216680645942688, + "learning_rate": 3.0239632642099992e-05, + "loss": 1.7721, + "step": 20867 + }, + { + "epoch": 6.405156537753223, + "grad_norm": 0.21479755640029907, + "learning_rate": 3.023506682352365e-05, + "loss": 1.6686, + "step": 20868 + }, + { + "epoch": 6.4054634745242485, + "grad_norm": 0.21274925768375397, + "learning_rate": 3.0230501200275974e-05, + "loss": 1.7245, + "step": 20869 + }, + { + "epoch": 6.405770411295273, + "grad_norm": 0.19894039630889893, + "learning_rate": 3.0225935772402064e-05, + "loss": 1.6734, + "step": 20870 + }, + { 
+ "epoch": 6.406077348066298, + "grad_norm": 0.24450170993804932, + "learning_rate": 3.022137053994707e-05, + "loss": 1.7103, + "step": 20871 + }, + { + "epoch": 6.406384284837324, + "grad_norm": 0.18289846181869507, + "learning_rate": 3.0216805502956057e-05, + "loss": 1.7866, + "step": 20872 + }, + { + "epoch": 6.406691221608349, + "grad_norm": 0.2884466350078583, + "learning_rate": 3.021224066147419e-05, + "loss": 1.7817, + "step": 20873 + }, + { + "epoch": 6.406998158379374, + "grad_norm": 0.21871373057365417, + "learning_rate": 3.0207676015546537e-05, + "loss": 1.6871, + "step": 20874 + }, + { + "epoch": 6.407305095150399, + "grad_norm": 0.239889994263649, + "learning_rate": 3.0203111565218244e-05, + "loss": 1.6412, + "step": 20875 + }, + { + "epoch": 6.407612031921424, + "grad_norm": 0.26960206031799316, + "learning_rate": 3.019854731053441e-05, + "loss": 1.7537, + "step": 20876 + }, + { + "epoch": 6.407918968692449, + "grad_norm": 0.32872483134269714, + "learning_rate": 3.019398325154013e-05, + "loss": 1.7718, + "step": 20877 + }, + { + "epoch": 6.408225905463475, + "grad_norm": 0.27766308188438416, + "learning_rate": 3.018941938828053e-05, + "loss": 1.7537, + "step": 20878 + }, + { + "epoch": 6.4085328422345, + "grad_norm": 0.1989286094903946, + "learning_rate": 3.0184855720800674e-05, + "loss": 1.7373, + "step": 20879 + }, + { + "epoch": 6.4088397790055245, + "grad_norm": 0.19748768210411072, + "learning_rate": 3.0180292249145703e-05, + "loss": 1.6821, + "step": 20880 + }, + { + "epoch": 6.40914671577655, + "grad_norm": 0.20632879436016083, + "learning_rate": 3.0175728973360694e-05, + "loss": 1.7641, + "step": 20881 + }, + { + "epoch": 6.409453652547575, + "grad_norm": 0.23808124661445618, + "learning_rate": 3.017116589349076e-05, + "loss": 1.7434, + "step": 20882 + }, + { + "epoch": 6.4097605893186005, + "grad_norm": 0.265514612197876, + "learning_rate": 3.0166603009580974e-05, + "loss": 1.7877, + "step": 20883 + }, + { + "epoch": 6.410067526089626, + 
"grad_norm": 0.21031250059604645, + "learning_rate": 3.0162040321676465e-05, + "loss": 1.738, + "step": 20884 + }, + { + "epoch": 6.41037446286065, + "grad_norm": 0.3011578619480133, + "learning_rate": 3.015747782982228e-05, + "loss": 1.7063, + "step": 20885 + }, + { + "epoch": 6.410681399631676, + "grad_norm": 0.28601503372192383, + "learning_rate": 3.015291553406353e-05, + "loss": 1.7021, + "step": 20886 + }, + { + "epoch": 6.410988336402701, + "grad_norm": 0.2433992624282837, + "learning_rate": 3.014835343444531e-05, + "loss": 1.6887, + "step": 20887 + }, + { + "epoch": 6.411295273173726, + "grad_norm": 0.3342660963535309, + "learning_rate": 3.014379153101269e-05, + "loss": 1.7798, + "step": 20888 + }, + { + "epoch": 6.411602209944752, + "grad_norm": 0.2390800267457962, + "learning_rate": 3.0139229823810757e-05, + "loss": 1.774, + "step": 20889 + }, + { + "epoch": 6.411909146715777, + "grad_norm": 0.2659217417240143, + "learning_rate": 3.0134668312884613e-05, + "loss": 1.7396, + "step": 20890 + }, + { + "epoch": 6.412216083486801, + "grad_norm": 0.22885620594024658, + "learning_rate": 3.0130106998279294e-05, + "loss": 1.7303, + "step": 20891 + }, + { + "epoch": 6.412523020257827, + "grad_norm": 0.20651856064796448, + "learning_rate": 3.0125545880039925e-05, + "loss": 1.7796, + "step": 20892 + }, + { + "epoch": 6.412829957028852, + "grad_norm": 0.26611828804016113, + "learning_rate": 3.0120984958211552e-05, + "loss": 1.7019, + "step": 20893 + }, + { + "epoch": 6.413136893799877, + "grad_norm": 0.2526776194572449, + "learning_rate": 3.0116424232839258e-05, + "loss": 1.7062, + "step": 20894 + }, + { + "epoch": 6.413443830570903, + "grad_norm": 0.2087634801864624, + "learning_rate": 3.0111863703968128e-05, + "loss": 1.7011, + "step": 20895 + }, + { + "epoch": 6.413750767341927, + "grad_norm": 0.20656780898571014, + "learning_rate": 3.0107303371643197e-05, + "loss": 1.7637, + "step": 20896 + }, + { + "epoch": 6.4140577041129525, + "grad_norm": 0.2083009034395218, + 
"learning_rate": 3.010274323590956e-05, + "loss": 1.7213, + "step": 20897 + }, + { + "epoch": 6.414364640883978, + "grad_norm": 0.22496090829372406, + "learning_rate": 3.0098183296812277e-05, + "loss": 1.7793, + "step": 20898 + }, + { + "epoch": 6.414671577655003, + "grad_norm": 0.2601132392883301, + "learning_rate": 3.0093623554396416e-05, + "loss": 1.8358, + "step": 20899 + }, + { + "epoch": 6.4149785144260285, + "grad_norm": 0.2364497184753418, + "learning_rate": 3.0089064008707026e-05, + "loss": 1.7299, + "step": 20900 + }, + { + "epoch": 6.415285451197054, + "grad_norm": 0.2011861503124237, + "learning_rate": 3.0084504659789186e-05, + "loss": 1.7521, + "step": 20901 + }, + { + "epoch": 6.415592387968078, + "grad_norm": 0.20605513453483582, + "learning_rate": 3.007994550768793e-05, + "loss": 1.7099, + "step": 20902 + }, + { + "epoch": 6.415899324739104, + "grad_norm": 0.20890796184539795, + "learning_rate": 3.0075386552448337e-05, + "loss": 1.7383, + "step": 20903 + }, + { + "epoch": 6.416206261510129, + "grad_norm": 0.20005083084106445, + "learning_rate": 3.0070827794115452e-05, + "loss": 1.6999, + "step": 20904 + }, + { + "epoch": 6.416513198281154, + "grad_norm": 0.20547670125961304, + "learning_rate": 3.006626923273433e-05, + "loss": 1.7424, + "step": 20905 + }, + { + "epoch": 6.41682013505218, + "grad_norm": 0.20799006521701813, + "learning_rate": 3.0061710868350003e-05, + "loss": 1.7266, + "step": 20906 + }, + { + "epoch": 6.417127071823204, + "grad_norm": 0.22234687209129333, + "learning_rate": 3.0057152701007563e-05, + "loss": 1.7755, + "step": 20907 + }, + { + "epoch": 6.417434008594229, + "grad_norm": 0.21947267651557922, + "learning_rate": 3.0052594730752005e-05, + "loss": 1.826, + "step": 20908 + }, + { + "epoch": 6.417740945365255, + "grad_norm": 0.2183268964290619, + "learning_rate": 3.0048036957628416e-05, + "loss": 1.7772, + "step": 20909 + }, + { + "epoch": 6.41804788213628, + "grad_norm": 0.1967134177684784, + "learning_rate": 
3.0043479381681805e-05, + "loss": 1.6833, + "step": 20910 + }, + { + "epoch": 6.418354818907305, + "grad_norm": 0.2016787827014923, + "learning_rate": 3.003892200295723e-05, + "loss": 1.773, + "step": 20911 + }, + { + "epoch": 6.41866175567833, + "grad_norm": 0.2192344218492508, + "learning_rate": 3.0034364821499745e-05, + "loss": 1.7124, + "step": 20912 + }, + { + "epoch": 6.418968692449355, + "grad_norm": 0.24924327433109283, + "learning_rate": 3.002980783735434e-05, + "loss": 1.6882, + "step": 20913 + }, + { + "epoch": 6.4192756292203805, + "grad_norm": 0.2221844494342804, + "learning_rate": 3.0025251050566106e-05, + "loss": 1.8028, + "step": 20914 + }, + { + "epoch": 6.419582565991406, + "grad_norm": 0.27141162753105164, + "learning_rate": 3.0020694461180033e-05, + "loss": 1.698, + "step": 20915 + }, + { + "epoch": 6.419889502762431, + "grad_norm": 0.18856655061244965, + "learning_rate": 3.001613806924117e-05, + "loss": 1.7112, + "step": 20916 + }, + { + "epoch": 6.420196439533456, + "grad_norm": 0.2226688265800476, + "learning_rate": 3.0011581874794537e-05, + "loss": 1.6967, + "step": 20917 + }, + { + "epoch": 6.420503376304481, + "grad_norm": 0.2070344239473343, + "learning_rate": 3.000702587788518e-05, + "loss": 1.742, + "step": 20918 + }, + { + "epoch": 6.420810313075506, + "grad_norm": 0.22616387903690338, + "learning_rate": 3.00024700785581e-05, + "loss": 1.6865, + "step": 20919 + }, + { + "epoch": 6.421117249846532, + "grad_norm": 0.19745604693889618, + "learning_rate": 2.9997914476858348e-05, + "loss": 1.7328, + "step": 20920 + }, + { + "epoch": 6.421424186617557, + "grad_norm": 0.20654593408107758, + "learning_rate": 2.9993359072830906e-05, + "loss": 1.7811, + "step": 20921 + }, + { + "epoch": 6.421731123388582, + "grad_norm": 0.19188611209392548, + "learning_rate": 2.9988803866520832e-05, + "loss": 1.6808, + "step": 20922 + }, + { + "epoch": 6.422038060159607, + "grad_norm": 0.19907493889331818, + "learning_rate": 2.9984248857973118e-05, + "loss": 
1.7326, + "step": 20923 + }, + { + "epoch": 6.422344996930632, + "grad_norm": 0.17484794557094574, + "learning_rate": 2.9979694047232804e-05, + "loss": 1.7166, + "step": 20924 + }, + { + "epoch": 6.422651933701657, + "grad_norm": 0.21412795782089233, + "learning_rate": 2.997513943434487e-05, + "loss": 1.7926, + "step": 20925 + }, + { + "epoch": 6.422958870472683, + "grad_norm": 0.17554008960723877, + "learning_rate": 2.9970585019354357e-05, + "loss": 1.6931, + "step": 20926 + }, + { + "epoch": 6.423265807243708, + "grad_norm": 0.16687868535518646, + "learning_rate": 2.9966030802306256e-05, + "loss": 1.6911, + "step": 20927 + }, + { + "epoch": 6.4235727440147325, + "grad_norm": 0.1802106350660324, + "learning_rate": 2.9961476783245578e-05, + "loss": 1.6921, + "step": 20928 + }, + { + "epoch": 6.423879680785758, + "grad_norm": 0.1968134343624115, + "learning_rate": 2.9956922962217347e-05, + "loss": 1.7035, + "step": 20929 + }, + { + "epoch": 6.424186617556783, + "grad_norm": 0.17703908681869507, + "learning_rate": 2.9952369339266538e-05, + "loss": 1.7122, + "step": 20930 + }, + { + "epoch": 6.4244935543278086, + "grad_norm": 0.22176744043827057, + "learning_rate": 2.9947815914438175e-05, + "loss": 1.7189, + "step": 20931 + }, + { + "epoch": 6.424800491098834, + "grad_norm": 0.19128306210041046, + "learning_rate": 2.9943262687777236e-05, + "loss": 1.7208, + "step": 20932 + }, + { + "epoch": 6.425107427869859, + "grad_norm": 0.2285725623369217, + "learning_rate": 2.9938709659328735e-05, + "loss": 1.7859, + "step": 20933 + }, + { + "epoch": 6.425414364640884, + "grad_norm": 0.1998651921749115, + "learning_rate": 2.9934156829137653e-05, + "loss": 1.6912, + "step": 20934 + }, + { + "epoch": 6.425721301411909, + "grad_norm": 0.1879023313522339, + "learning_rate": 2.9929604197249016e-05, + "loss": 1.7164, + "step": 20935 + }, + { + "epoch": 6.426028238182934, + "grad_norm": 0.2675700783729553, + "learning_rate": 2.992505176370778e-05, + "loss": 1.7475, + "step": 20936 + }, 
+ { + "epoch": 6.42633517495396, + "grad_norm": 0.22345949709415436, + "learning_rate": 2.992049952855896e-05, + "loss": 1.6867, + "step": 20937 + }, + { + "epoch": 6.426642111724985, + "grad_norm": 0.17801997065544128, + "learning_rate": 2.9915947491847517e-05, + "loss": 1.736, + "step": 20938 + }, + { + "epoch": 6.4269490484960095, + "grad_norm": 0.22132502496242523, + "learning_rate": 2.991139565361846e-05, + "loss": 1.7244, + "step": 20939 + }, + { + "epoch": 6.427255985267035, + "grad_norm": 0.1899508535861969, + "learning_rate": 2.9906844013916758e-05, + "loss": 1.6781, + "step": 20940 + }, + { + "epoch": 6.42756292203806, + "grad_norm": 0.21948131918907166, + "learning_rate": 2.9902292572787414e-05, + "loss": 1.6911, + "step": 20941 + }, + { + "epoch": 6.4278698588090855, + "grad_norm": 0.16277503967285156, + "learning_rate": 2.9897741330275387e-05, + "loss": 1.702, + "step": 20942 + }, + { + "epoch": 6.428176795580111, + "grad_norm": 0.22303056716918945, + "learning_rate": 2.989319028642567e-05, + "loss": 1.7573, + "step": 20943 + }, + { + "epoch": 6.428483732351136, + "grad_norm": 0.21077899634838104, + "learning_rate": 2.9888639441283217e-05, + "loss": 1.7903, + "step": 20944 + }, + { + "epoch": 6.428790669122161, + "grad_norm": 0.23918256163597107, + "learning_rate": 2.988408879489303e-05, + "loss": 1.7112, + "step": 20945 + }, + { + "epoch": 6.429097605893186, + "grad_norm": 0.22226610779762268, + "learning_rate": 2.9879538347300074e-05, + "loss": 1.7039, + "step": 20946 + }, + { + "epoch": 6.429404542664211, + "grad_norm": 0.18605270981788635, + "learning_rate": 2.987498809854929e-05, + "loss": 1.7102, + "step": 20947 + }, + { + "epoch": 6.429711479435237, + "grad_norm": 0.24812746047973633, + "learning_rate": 2.987043804868569e-05, + "loss": 1.7112, + "step": 20948 + }, + { + "epoch": 6.430018416206262, + "grad_norm": 0.1869048923254013, + "learning_rate": 2.9865888197754206e-05, + "loss": 1.6946, + "step": 20949 + }, + { + "epoch": 6.430325352977286, 
+ "grad_norm": 0.30707576870918274, + "learning_rate": 2.986133854579982e-05, + "loss": 1.7596, + "step": 20950 + }, + { + "epoch": 6.430632289748312, + "grad_norm": 0.20475640892982483, + "learning_rate": 2.985678909286748e-05, + "loss": 1.7162, + "step": 20951 + }, + { + "epoch": 6.430939226519337, + "grad_norm": 0.24273128807544708, + "learning_rate": 2.9852239839002182e-05, + "loss": 1.6803, + "step": 20952 + }, + { + "epoch": 6.431246163290362, + "grad_norm": 0.27484890818595886, + "learning_rate": 2.9847690784248834e-05, + "loss": 1.7948, + "step": 20953 + }, + { + "epoch": 6.431553100061388, + "grad_norm": 0.2204331010580063, + "learning_rate": 2.984314192865244e-05, + "loss": 1.769, + "step": 20954 + }, + { + "epoch": 6.431860036832412, + "grad_norm": 0.262463241815567, + "learning_rate": 2.9838593272257907e-05, + "loss": 1.7483, + "step": 20955 + }, + { + "epoch": 6.4321669736034375, + "grad_norm": 0.225942924618721, + "learning_rate": 2.983404481511023e-05, + "loss": 1.7228, + "step": 20956 + }, + { + "epoch": 6.432473910374463, + "grad_norm": 0.22381044924259186, + "learning_rate": 2.982949655725432e-05, + "loss": 1.7579, + "step": 20957 + }, + { + "epoch": 6.432780847145488, + "grad_norm": 0.1937711238861084, + "learning_rate": 2.982494849873518e-05, + "loss": 1.6833, + "step": 20958 + }, + { + "epoch": 6.4330877839165135, + "grad_norm": 0.2609664499759674, + "learning_rate": 2.9820400639597702e-05, + "loss": 1.7524, + "step": 20959 + }, + { + "epoch": 6.433394720687538, + "grad_norm": 0.2891463041305542, + "learning_rate": 2.981585297988686e-05, + "loss": 1.7672, + "step": 20960 + }, + { + "epoch": 6.433701657458563, + "grad_norm": 0.19604064524173737, + "learning_rate": 2.9811305519647582e-05, + "loss": 1.6684, + "step": 20961 + }, + { + "epoch": 6.434008594229589, + "grad_norm": 0.23522239923477173, + "learning_rate": 2.9806758258924822e-05, + "loss": 1.7461, + "step": 20962 + }, + { + "epoch": 6.434315531000614, + "grad_norm": 0.24907514452934265, + 
"learning_rate": 2.9802211197763525e-05, + "loss": 1.7702, + "step": 20963 + }, + { + "epoch": 6.434622467771639, + "grad_norm": 0.21963126957416534, + "learning_rate": 2.9797664336208592e-05, + "loss": 1.7263, + "step": 20964 + }, + { + "epoch": 6.434929404542665, + "grad_norm": 0.23124000430107117, + "learning_rate": 2.9793117674305004e-05, + "loss": 1.7362, + "step": 20965 + }, + { + "epoch": 6.435236341313689, + "grad_norm": 0.1917882263660431, + "learning_rate": 2.978857121209765e-05, + "loss": 1.7505, + "step": 20966 + }, + { + "epoch": 6.435543278084714, + "grad_norm": 0.24407804012298584, + "learning_rate": 2.9784024949631484e-05, + "loss": 1.7898, + "step": 20967 + }, + { + "epoch": 6.43585021485574, + "grad_norm": 0.210384339094162, + "learning_rate": 2.977947888695143e-05, + "loss": 1.7515, + "step": 20968 + }, + { + "epoch": 6.436157151626765, + "grad_norm": 0.20764803886413574, + "learning_rate": 2.9774933024102436e-05, + "loss": 1.7628, + "step": 20969 + }, + { + "epoch": 6.43646408839779, + "grad_norm": 0.21542097628116608, + "learning_rate": 2.9770387361129387e-05, + "loss": 1.7882, + "step": 20970 + }, + { + "epoch": 6.436771025168815, + "grad_norm": 0.1768570989370346, + "learning_rate": 2.976584189807725e-05, + "loss": 1.7471, + "step": 20971 + }, + { + "epoch": 6.43707796193984, + "grad_norm": 0.2398732751607895, + "learning_rate": 2.97612966349909e-05, + "loss": 1.6676, + "step": 20972 + }, + { + "epoch": 6.4373848987108655, + "grad_norm": 0.18291664123535156, + "learning_rate": 2.9756751571915286e-05, + "loss": 1.6791, + "step": 20973 + }, + { + "epoch": 6.437691835481891, + "grad_norm": 0.2769327759742737, + "learning_rate": 2.9752206708895314e-05, + "loss": 1.7675, + "step": 20974 + }, + { + "epoch": 6.437998772252916, + "grad_norm": 0.24859526753425598, + "learning_rate": 2.974766204597592e-05, + "loss": 1.7661, + "step": 20975 + }, + { + "epoch": 6.4383057090239415, + "grad_norm": 0.20495273172855377, + "learning_rate": 
2.9743117583201984e-05, + "loss": 1.6774, + "step": 20976 + }, + { + "epoch": 6.438612645794966, + "grad_norm": 0.24650859832763672, + "learning_rate": 2.9738573320618447e-05, + "loss": 1.759, + "step": 20977 + }, + { + "epoch": 6.438919582565991, + "grad_norm": 0.21430176496505737, + "learning_rate": 2.973402925827019e-05, + "loss": 1.7273, + "step": 20978 + }, + { + "epoch": 6.439226519337017, + "grad_norm": 0.22392596304416656, + "learning_rate": 2.972948539620214e-05, + "loss": 1.7506, + "step": 20979 + }, + { + "epoch": 6.439533456108042, + "grad_norm": 0.24393923580646515, + "learning_rate": 2.9724941734459205e-05, + "loss": 1.7815, + "step": 20980 + }, + { + "epoch": 6.439840392879067, + "grad_norm": 0.2873772084712982, + "learning_rate": 2.9720398273086264e-05, + "loss": 1.7863, + "step": 20981 + }, + { + "epoch": 6.440147329650092, + "grad_norm": 0.218470498919487, + "learning_rate": 2.9715855012128246e-05, + "loss": 1.7347, + "step": 20982 + }, + { + "epoch": 6.440454266421117, + "grad_norm": 0.24520666897296906, + "learning_rate": 2.971131195163003e-05, + "loss": 1.6892, + "step": 20983 + }, + { + "epoch": 6.440761203192142, + "grad_norm": 0.2255270928144455, + "learning_rate": 2.970676909163652e-05, + "loss": 1.7179, + "step": 20984 + }, + { + "epoch": 6.441068139963168, + "grad_norm": 0.25171026587486267, + "learning_rate": 2.9702226432192604e-05, + "loss": 1.7087, + "step": 20985 + }, + { + "epoch": 6.441375076734193, + "grad_norm": 0.27045872807502747, + "learning_rate": 2.9697683973343204e-05, + "loss": 1.732, + "step": 20986 + }, + { + "epoch": 6.4416820135052175, + "grad_norm": 0.25374144315719604, + "learning_rate": 2.9693141715133177e-05, + "loss": 1.7688, + "step": 20987 + }, + { + "epoch": 6.441988950276243, + "grad_norm": 0.22694779932498932, + "learning_rate": 2.9688599657607442e-05, + "loss": 1.7105, + "step": 20988 + }, + { + "epoch": 6.442295887047268, + "grad_norm": 0.23455791175365448, + "learning_rate": 2.9684057800810845e-05, + 
"loss": 1.8007, + "step": 20989 + }, + { + "epoch": 6.4426028238182935, + "grad_norm": 0.23054158687591553, + "learning_rate": 2.9679516144788312e-05, + "loss": 1.6787, + "step": 20990 + }, + { + "epoch": 6.442909760589319, + "grad_norm": 0.22110030055046082, + "learning_rate": 2.9674974689584696e-05, + "loss": 1.8048, + "step": 20991 + }, + { + "epoch": 6.443216697360343, + "grad_norm": 0.22141657769680023, + "learning_rate": 2.9670433435244915e-05, + "loss": 1.7691, + "step": 20992 + }, + { + "epoch": 6.443523634131369, + "grad_norm": 0.18511974811553955, + "learning_rate": 2.9665892381813807e-05, + "loss": 1.6825, + "step": 20993 + }, + { + "epoch": 6.443830570902394, + "grad_norm": 0.21904997527599335, + "learning_rate": 2.966135152933629e-05, + "loss": 1.7711, + "step": 20994 + }, + { + "epoch": 6.444137507673419, + "grad_norm": 0.19334301352500916, + "learning_rate": 2.9656810877857196e-05, + "loss": 1.687, + "step": 20995 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.1766969859600067, + "learning_rate": 2.9652270427421426e-05, + "loss": 1.7211, + "step": 20996 + }, + { + "epoch": 6.44475138121547, + "grad_norm": 0.1821468323469162, + "learning_rate": 2.9647730178073864e-05, + "loss": 1.7086, + "step": 20997 + }, + { + "epoch": 6.445058317986494, + "grad_norm": 0.20812760293483734, + "learning_rate": 2.9643190129859333e-05, + "loss": 1.6844, + "step": 20998 + }, + { + "epoch": 6.44536525475752, + "grad_norm": 0.259042352437973, + "learning_rate": 2.9638650282822754e-05, + "loss": 1.7971, + "step": 20999 + }, + { + "epoch": 6.445672191528545, + "grad_norm": 0.2134076952934265, + "learning_rate": 2.9634110637008948e-05, + "loss": 1.7061, + "step": 21000 + }, + { + "epoch": 6.44597912829957, + "grad_norm": 0.21120613813400269, + "learning_rate": 2.962957119246281e-05, + "loss": 1.6708, + "step": 21001 + }, + { + "epoch": 6.446286065070596, + "grad_norm": 0.18577797710895538, + "learning_rate": 2.9625031949229176e-05, + "loss": 1.719, + "step": 21002 + 
}, + { + "epoch": 6.44659300184162, + "grad_norm": 0.21755708754062653, + "learning_rate": 2.962049290735294e-05, + "loss": 1.7203, + "step": 21003 + }, + { + "epoch": 6.4468999386126455, + "grad_norm": 0.2161538451910019, + "learning_rate": 2.961595406687891e-05, + "loss": 1.7254, + "step": 21004 + }, + { + "epoch": 6.447206875383671, + "grad_norm": 0.19979329407215118, + "learning_rate": 2.9611415427851995e-05, + "loss": 1.7203, + "step": 21005 + }, + { + "epoch": 6.447513812154696, + "grad_norm": 0.2103399932384491, + "learning_rate": 2.9606876990317e-05, + "loss": 1.7291, + "step": 21006 + }, + { + "epoch": 6.4478207489257215, + "grad_norm": 0.19513745605945587, + "learning_rate": 2.9602338754318815e-05, + "loss": 1.7574, + "step": 21007 + }, + { + "epoch": 6.448127685696747, + "grad_norm": 0.19819851219654083, + "learning_rate": 2.9597800719902256e-05, + "loss": 1.6913, + "step": 21008 + }, + { + "epoch": 6.448434622467771, + "grad_norm": 0.1847768872976303, + "learning_rate": 2.9593262887112215e-05, + "loss": 1.6987, + "step": 21009 + }, + { + "epoch": 6.448741559238797, + "grad_norm": 0.22399301826953888, + "learning_rate": 2.9588725255993487e-05, + "loss": 1.8328, + "step": 21010 + }, + { + "epoch": 6.449048496009822, + "grad_norm": 0.20540264248847961, + "learning_rate": 2.958418782659097e-05, + "loss": 1.765, + "step": 21011 + }, + { + "epoch": 6.449355432780847, + "grad_norm": 0.183661550283432, + "learning_rate": 2.9579650598949442e-05, + "loss": 1.7128, + "step": 21012 + }, + { + "epoch": 6.449662369551873, + "grad_norm": 0.1972927302122116, + "learning_rate": 2.9575113573113788e-05, + "loss": 1.717, + "step": 21013 + }, + { + "epoch": 6.449969306322897, + "grad_norm": 0.20188379287719727, + "learning_rate": 2.9570576749128846e-05, + "loss": 1.7603, + "step": 21014 + }, + { + "epoch": 6.4502762430939224, + "grad_norm": 0.20789781212806702, + "learning_rate": 2.9566040127039418e-05, + "loss": 1.7142, + "step": 21015 + }, + { + "epoch": 
6.450583179864948, + "grad_norm": 0.19319608807563782, + "learning_rate": 2.956150370689038e-05, + "loss": 1.7524, + "step": 21016 + }, + { + "epoch": 6.450890116635973, + "grad_norm": 0.2153816968202591, + "learning_rate": 2.9556967488726516e-05, + "loss": 1.7325, + "step": 21017 + }, + { + "epoch": 6.4511970534069984, + "grad_norm": 0.19134823977947235, + "learning_rate": 2.9552431472592702e-05, + "loss": 1.7547, + "step": 21018 + }, + { + "epoch": 6.451503990178024, + "grad_norm": 0.21069955825805664, + "learning_rate": 2.9547895658533725e-05, + "loss": 1.7038, + "step": 21019 + }, + { + "epoch": 6.451810926949048, + "grad_norm": 0.20742546021938324, + "learning_rate": 2.9543360046594455e-05, + "loss": 1.7151, + "step": 21020 + }, + { + "epoch": 6.452117863720074, + "grad_norm": 0.16917672753334045, + "learning_rate": 2.9538824636819666e-05, + "loss": 1.6957, + "step": 21021 + }, + { + "epoch": 6.452424800491099, + "grad_norm": 0.21134577691555023, + "learning_rate": 2.953428942925423e-05, + "loss": 1.711, + "step": 21022 + }, + { + "epoch": 6.452731737262124, + "grad_norm": 0.19403810799121857, + "learning_rate": 2.9529754423942918e-05, + "loss": 1.734, + "step": 21023 + }, + { + "epoch": 6.45303867403315, + "grad_norm": 0.18534770607948303, + "learning_rate": 2.9525219620930582e-05, + "loss": 1.6857, + "step": 21024 + }, + { + "epoch": 6.453345610804174, + "grad_norm": 0.24268858134746552, + "learning_rate": 2.9520685020262016e-05, + "loss": 1.7316, + "step": 21025 + }, + { + "epoch": 6.453652547575199, + "grad_norm": 0.17590615153312683, + "learning_rate": 2.9516150621982063e-05, + "loss": 1.6608, + "step": 21026 + }, + { + "epoch": 6.453959484346225, + "grad_norm": 0.1949763298034668, + "learning_rate": 2.9511616426135504e-05, + "loss": 1.7955, + "step": 21027 + }, + { + "epoch": 6.45426642111725, + "grad_norm": 0.2424435019493103, + "learning_rate": 2.950708243276717e-05, + "loss": 1.7334, + "step": 21028 + }, + { + "epoch": 6.454573357888275, + 
"grad_norm": 0.22753369808197021, + "learning_rate": 2.950254864192184e-05, + "loss": 1.733, + "step": 21029 + }, + { + "epoch": 6.4548802946593, + "grad_norm": 0.1706271469593048, + "learning_rate": 2.949801505364435e-05, + "loss": 1.7424, + "step": 21030 + }, + { + "epoch": 6.455187231430325, + "grad_norm": 0.21614442765712738, + "learning_rate": 2.9493481667979506e-05, + "loss": 1.7813, + "step": 21031 + }, + { + "epoch": 6.4554941682013505, + "grad_norm": 0.1793162226676941, + "learning_rate": 2.9488948484972068e-05, + "loss": 1.7076, + "step": 21032 + }, + { + "epoch": 6.455801104972376, + "grad_norm": 0.19251759350299835, + "learning_rate": 2.9484415504666885e-05, + "loss": 1.7487, + "step": 21033 + }, + { + "epoch": 6.456108041743401, + "grad_norm": 0.1817556619644165, + "learning_rate": 2.947988272710871e-05, + "loss": 1.6958, + "step": 21034 + }, + { + "epoch": 6.456414978514426, + "grad_norm": 0.24368418753147125, + "learning_rate": 2.9475350152342378e-05, + "loss": 1.7867, + "step": 21035 + }, + { + "epoch": 6.456721915285451, + "grad_norm": 0.2362157702445984, + "learning_rate": 2.9470817780412653e-05, + "loss": 1.7241, + "step": 21036 + }, + { + "epoch": 6.457028852056476, + "grad_norm": 0.21049003303050995, + "learning_rate": 2.9466285611364358e-05, + "loss": 1.7146, + "step": 21037 + }, + { + "epoch": 6.457335788827502, + "grad_norm": 0.2516530454158783, + "learning_rate": 2.9461753645242246e-05, + "loss": 1.7349, + "step": 21038 + }, + { + "epoch": 6.457642725598527, + "grad_norm": 0.23165179789066315, + "learning_rate": 2.945722188209114e-05, + "loss": 1.7285, + "step": 21039 + }, + { + "epoch": 6.457949662369552, + "grad_norm": 0.27345010638237, + "learning_rate": 2.945269032195579e-05, + "loss": 1.7266, + "step": 21040 + }, + { + "epoch": 6.458256599140577, + "grad_norm": 0.16312900185585022, + "learning_rate": 2.9448158964881e-05, + "loss": 1.6781, + "step": 21041 + }, + { + "epoch": 6.458563535911602, + "grad_norm": 0.238658607006073, + 
"learning_rate": 2.9443627810911557e-05, + "loss": 1.6819, + "step": 21042 + }, + { + "epoch": 6.458870472682627, + "grad_norm": 0.19861388206481934, + "learning_rate": 2.943909686009223e-05, + "loss": 1.7397, + "step": 21043 + }, + { + "epoch": 6.459177409453653, + "grad_norm": 0.22675637900829315, + "learning_rate": 2.9434566112467793e-05, + "loss": 1.7231, + "step": 21044 + }, + { + "epoch": 6.459484346224678, + "grad_norm": 0.22638066112995148, + "learning_rate": 2.9430035568083043e-05, + "loss": 1.7466, + "step": 21045 + }, + { + "epoch": 6.4597912829957025, + "grad_norm": 0.2237064391374588, + "learning_rate": 2.942550522698272e-05, + "loss": 1.7373, + "step": 21046 + }, + { + "epoch": 6.460098219766728, + "grad_norm": 0.2613731324672699, + "learning_rate": 2.942097508921162e-05, + "loss": 1.7567, + "step": 21047 + }, + { + "epoch": 6.460405156537753, + "grad_norm": 0.21602070331573486, + "learning_rate": 2.941644515481452e-05, + "loss": 1.7512, + "step": 21048 + }, + { + "epoch": 6.4607120933087785, + "grad_norm": 0.30129116773605347, + "learning_rate": 2.941191542383615e-05, + "loss": 1.761, + "step": 21049 + }, + { + "epoch": 6.461019030079804, + "grad_norm": 0.2303919792175293, + "learning_rate": 2.940738589632132e-05, + "loss": 1.742, + "step": 21050 + }, + { + "epoch": 6.461325966850829, + "grad_norm": 0.2195158153772354, + "learning_rate": 2.940285657231475e-05, + "loss": 1.7169, + "step": 21051 + }, + { + "epoch": 6.461632903621854, + "grad_norm": 0.19029918313026428, + "learning_rate": 2.9398327451861242e-05, + "loss": 1.6721, + "step": 21052 + }, + { + "epoch": 6.461939840392879, + "grad_norm": 0.2006317377090454, + "learning_rate": 2.939379853500553e-05, + "loss": 1.7393, + "step": 21053 + }, + { + "epoch": 6.462246777163904, + "grad_norm": 0.222677081823349, + "learning_rate": 2.9389269821792377e-05, + "loss": 1.7858, + "step": 21054 + }, + { + "epoch": 6.46255371393493, + "grad_norm": 0.20772451162338257, + "learning_rate": 2.938474131226654e-05, 
+ "loss": 1.735, + "step": 21055 + }, + { + "epoch": 6.462860650705955, + "grad_norm": 0.21006503701210022, + "learning_rate": 2.9380213006472778e-05, + "loss": 1.7197, + "step": 21056 + }, + { + "epoch": 6.463167587476979, + "grad_norm": 0.23545250296592712, + "learning_rate": 2.9375684904455825e-05, + "loss": 1.8278, + "step": 21057 + }, + { + "epoch": 6.463474524248005, + "grad_norm": 0.24590329825878143, + "learning_rate": 2.937115700626045e-05, + "loss": 1.6411, + "step": 21058 + }, + { + "epoch": 6.46378146101903, + "grad_norm": 0.22359445691108704, + "learning_rate": 2.9366629311931393e-05, + "loss": 1.7901, + "step": 21059 + }, + { + "epoch": 6.464088397790055, + "grad_norm": 0.22807523608207703, + "learning_rate": 2.93621018215134e-05, + "loss": 1.7472, + "step": 21060 + }, + { + "epoch": 6.464395334561081, + "grad_norm": 0.24183115363121033, + "learning_rate": 2.93575745350512e-05, + "loss": 1.7553, + "step": 21061 + }, + { + "epoch": 6.464702271332105, + "grad_norm": 0.23809055984020233, + "learning_rate": 2.935304745258958e-05, + "loss": 1.7451, + "step": 21062 + }, + { + "epoch": 6.4650092081031305, + "grad_norm": 0.28455644845962524, + "learning_rate": 2.934852057417321e-05, + "loss": 1.8112, + "step": 21063 + }, + { + "epoch": 6.465316144874156, + "grad_norm": 0.22193321585655212, + "learning_rate": 2.9343993899846888e-05, + "loss": 1.747, + "step": 21064 + }, + { + "epoch": 6.465623081645181, + "grad_norm": 0.30524322390556335, + "learning_rate": 2.933946742965532e-05, + "loss": 1.7117, + "step": 21065 + }, + { + "epoch": 6.4659300184162065, + "grad_norm": 0.19748717546463013, + "learning_rate": 2.9334941163643233e-05, + "loss": 1.6899, + "step": 21066 + }, + { + "epoch": 6.466236955187231, + "grad_norm": 0.25551193952560425, + "learning_rate": 2.933041510185539e-05, + "loss": 1.7264, + "step": 21067 + }, + { + "epoch": 6.466543891958256, + "grad_norm": 0.20016206800937653, + "learning_rate": 2.932588924433648e-05, + "loss": 1.6613, + "step": 21068 
+ }, + { + "epoch": 6.466850828729282, + "grad_norm": 0.31049394607543945, + "learning_rate": 2.932136359113127e-05, + "loss": 1.6575, + "step": 21069 + }, + { + "epoch": 6.467157765500307, + "grad_norm": 0.29408347606658936, + "learning_rate": 2.9316838142284436e-05, + "loss": 1.72, + "step": 21070 + }, + { + "epoch": 6.467464702271332, + "grad_norm": 0.18981193006038666, + "learning_rate": 2.9312312897840748e-05, + "loss": 1.6799, + "step": 21071 + }, + { + "epoch": 6.467771639042358, + "grad_norm": 0.26828575134277344, + "learning_rate": 2.9307787857844905e-05, + "loss": 1.6983, + "step": 21072 + }, + { + "epoch": 6.468078575813382, + "grad_norm": 0.2605530321598053, + "learning_rate": 2.9303263022341642e-05, + "loss": 1.7973, + "step": 21073 + }, + { + "epoch": 6.468385512584407, + "grad_norm": 0.389957070350647, + "learning_rate": 2.9298738391375648e-05, + "loss": 1.7288, + "step": 21074 + }, + { + "epoch": 6.468692449355433, + "grad_norm": 0.20525416731834412, + "learning_rate": 2.9294213964991667e-05, + "loss": 1.7526, + "step": 21075 + }, + { + "epoch": 6.468999386126458, + "grad_norm": 0.3628186285495758, + "learning_rate": 2.9289689743234387e-05, + "loss": 1.7055, + "step": 21076 + }, + { + "epoch": 6.469306322897483, + "grad_norm": 0.21661829948425293, + "learning_rate": 2.9285165726148545e-05, + "loss": 1.7806, + "step": 21077 + }, + { + "epoch": 6.469613259668508, + "grad_norm": 0.3815501034259796, + "learning_rate": 2.9280641913778816e-05, + "loss": 1.7257, + "step": 21078 + }, + { + "epoch": 6.469920196439533, + "grad_norm": 0.19470983743667603, + "learning_rate": 2.9276118306169957e-05, + "loss": 1.7055, + "step": 21079 + }, + { + "epoch": 6.4702271332105585, + "grad_norm": 0.36236056685447693, + "learning_rate": 2.927159490336662e-05, + "loss": 1.6748, + "step": 21080 + }, + { + "epoch": 6.470534069981584, + "grad_norm": 0.201282799243927, + "learning_rate": 2.9267071705413552e-05, + "loss": 1.6987, + "step": 21081 + }, + { + "epoch": 
6.470841006752609, + "grad_norm": 0.3806697130203247, + "learning_rate": 2.9262548712355425e-05, + "loss": 1.7386, + "step": 21082 + }, + { + "epoch": 6.4711479435236345, + "grad_norm": 0.3023025691509247, + "learning_rate": 2.9258025924236933e-05, + "loss": 1.7183, + "step": 21083 + }, + { + "epoch": 6.471454880294659, + "grad_norm": 0.2648932635784149, + "learning_rate": 2.9253503341102806e-05, + "loss": 1.6755, + "step": 21084 + }, + { + "epoch": 6.471761817065684, + "grad_norm": 0.2647169828414917, + "learning_rate": 2.9248980962997707e-05, + "loss": 1.7326, + "step": 21085 + }, + { + "epoch": 6.47206875383671, + "grad_norm": 0.23535950481891632, + "learning_rate": 2.9244458789966355e-05, + "loss": 1.7541, + "step": 21086 + }, + { + "epoch": 6.472375690607735, + "grad_norm": 0.2551584541797638, + "learning_rate": 2.9239936822053403e-05, + "loss": 1.6907, + "step": 21087 + }, + { + "epoch": 6.47268262737876, + "grad_norm": 0.23313823342323303, + "learning_rate": 2.923541505930357e-05, + "loss": 1.705, + "step": 21088 + }, + { + "epoch": 6.472989564149785, + "grad_norm": 0.2368597686290741, + "learning_rate": 2.9230893501761534e-05, + "loss": 1.6666, + "step": 21089 + }, + { + "epoch": 6.47329650092081, + "grad_norm": 0.17861969769001007, + "learning_rate": 2.9226372149472003e-05, + "loss": 1.6927, + "step": 21090 + }, + { + "epoch": 6.473603437691835, + "grad_norm": 0.2212727665901184, + "learning_rate": 2.9221851002479616e-05, + "loss": 1.6972, + "step": 21091 + }, + { + "epoch": 6.473910374462861, + "grad_norm": 0.19382402300834656, + "learning_rate": 2.9217330060829096e-05, + "loss": 1.7602, + "step": 21092 + }, + { + "epoch": 6.474217311233886, + "grad_norm": 0.2762092053890228, + "learning_rate": 2.9212809324565076e-05, + "loss": 1.7642, + "step": 21093 + }, + { + "epoch": 6.474524248004911, + "grad_norm": 0.22068747878074646, + "learning_rate": 2.9208288793732274e-05, + "loss": 1.7477, + "step": 21094 + }, + { + "epoch": 6.474831184775936, + "grad_norm": 
0.19979839026927948, + "learning_rate": 2.9203768468375337e-05, + "loss": 1.7266, + "step": 21095 + }, + { + "epoch": 6.475138121546961, + "grad_norm": 0.23038682341575623, + "learning_rate": 2.9199248348538965e-05, + "loss": 1.7428, + "step": 21096 + }, + { + "epoch": 6.475445058317987, + "grad_norm": 0.16841283440589905, + "learning_rate": 2.91947284342678e-05, + "loss": 1.6788, + "step": 21097 + }, + { + "epoch": 6.475751995089012, + "grad_norm": 0.22812627255916595, + "learning_rate": 2.9190208725606528e-05, + "loss": 1.7513, + "step": 21098 + }, + { + "epoch": 6.476058931860037, + "grad_norm": 0.18409393727779388, + "learning_rate": 2.9185689222599832e-05, + "loss": 1.6834, + "step": 21099 + }, + { + "epoch": 6.476365868631062, + "grad_norm": 0.26226910948753357, + "learning_rate": 2.9181169925292313e-05, + "loss": 1.7375, + "step": 21100 + }, + { + "epoch": 6.476672805402087, + "grad_norm": 0.1915685385465622, + "learning_rate": 2.9176650833728697e-05, + "loss": 1.7521, + "step": 21101 + }, + { + "epoch": 6.476979742173112, + "grad_norm": 0.22342176735401154, + "learning_rate": 2.917213194795362e-05, + "loss": 1.8018, + "step": 21102 + }, + { + "epoch": 6.477286678944138, + "grad_norm": 0.18338742852210999, + "learning_rate": 2.9167613268011745e-05, + "loss": 1.6817, + "step": 21103 + }, + { + "epoch": 6.477593615715163, + "grad_norm": 0.23008635640144348, + "learning_rate": 2.9163094793947728e-05, + "loss": 1.7037, + "step": 21104 + }, + { + "epoch": 6.4779005524861875, + "grad_norm": 0.20954197645187378, + "learning_rate": 2.9158576525806215e-05, + "loss": 1.7565, + "step": 21105 + }, + { + "epoch": 6.478207489257213, + "grad_norm": 0.21065562963485718, + "learning_rate": 2.9154058463631874e-05, + "loss": 1.6899, + "step": 21106 + }, + { + "epoch": 6.478514426028238, + "grad_norm": 0.20217828452587128, + "learning_rate": 2.9149540607469335e-05, + "loss": 1.7055, + "step": 21107 + }, + { + "epoch": 6.4788213627992635, + "grad_norm": 0.19058823585510254, + 
"learning_rate": 2.9145022957363244e-05, + "loss": 1.6794, + "step": 21108 + }, + { + "epoch": 6.479128299570289, + "grad_norm": 0.2308664619922638, + "learning_rate": 2.9140505513358297e-05, + "loss": 1.7322, + "step": 21109 + }, + { + "epoch": 6.479435236341313, + "grad_norm": 0.18911845982074738, + "learning_rate": 2.9135988275499056e-05, + "loss": 1.7255, + "step": 21110 + }, + { + "epoch": 6.479742173112339, + "grad_norm": 0.21459296345710754, + "learning_rate": 2.9131471243830256e-05, + "loss": 1.6599, + "step": 21111 + }, + { + "epoch": 6.480049109883364, + "grad_norm": 0.20521530508995056, + "learning_rate": 2.912695441839644e-05, + "loss": 1.7564, + "step": 21112 + }, + { + "epoch": 6.480356046654389, + "grad_norm": 0.21924994885921478, + "learning_rate": 2.912243779924232e-05, + "loss": 1.6922, + "step": 21113 + }, + { + "epoch": 6.480662983425415, + "grad_norm": 0.18219491839408875, + "learning_rate": 2.911792138641253e-05, + "loss": 1.6907, + "step": 21114 + }, + { + "epoch": 6.48096992019644, + "grad_norm": 0.23122453689575195, + "learning_rate": 2.9113405179951626e-05, + "loss": 1.7665, + "step": 21115 + }, + { + "epoch": 6.481276856967464, + "grad_norm": 0.18411210179328918, + "learning_rate": 2.9108889179904348e-05, + "loss": 1.7216, + "step": 21116 + }, + { + "epoch": 6.48158379373849, + "grad_norm": 0.2251562923192978, + "learning_rate": 2.9104373386315225e-05, + "loss": 1.7605, + "step": 21117 + }, + { + "epoch": 6.481890730509515, + "grad_norm": 0.2252185344696045, + "learning_rate": 2.9099857799228957e-05, + "loss": 1.7345, + "step": 21118 + }, + { + "epoch": 6.48219766728054, + "grad_norm": 0.20799386501312256, + "learning_rate": 2.909534241869014e-05, + "loss": 1.7497, + "step": 21119 + }, + { + "epoch": 6.482504604051566, + "grad_norm": 0.2059052586555481, + "learning_rate": 2.90908272447434e-05, + "loss": 1.7444, + "step": 21120 + }, + { + "epoch": 6.48281154082259, + "grad_norm": 0.17851221561431885, + "learning_rate": 
2.9086312277433362e-05, + "loss": 1.7208, + "step": 21121 + }, + { + "epoch": 6.4831184775936155, + "grad_norm": 0.20561498403549194, + "learning_rate": 2.908179751680465e-05, + "loss": 1.731, + "step": 21122 + }, + { + "epoch": 6.483425414364641, + "grad_norm": 0.2386128008365631, + "learning_rate": 2.9077282962901868e-05, + "loss": 1.7493, + "step": 21123 + }, + { + "epoch": 6.483732351135666, + "grad_norm": 0.21024827659130096, + "learning_rate": 2.9072768615769642e-05, + "loss": 1.7353, + "step": 21124 + }, + { + "epoch": 6.4840392879066915, + "grad_norm": 0.23443256318569183, + "learning_rate": 2.9068254475452582e-05, + "loss": 1.7419, + "step": 21125 + }, + { + "epoch": 6.484346224677717, + "grad_norm": 0.1849295198917389, + "learning_rate": 2.90637405419953e-05, + "loss": 1.7239, + "step": 21126 + }, + { + "epoch": 6.484653161448741, + "grad_norm": 0.1967659890651703, + "learning_rate": 2.9059226815442385e-05, + "loss": 1.7163, + "step": 21127 + }, + { + "epoch": 6.484960098219767, + "grad_norm": 0.20395416021347046, + "learning_rate": 2.9054713295838505e-05, + "loss": 1.7108, + "step": 21128 + }, + { + "epoch": 6.485267034990792, + "grad_norm": 0.24162746965885162, + "learning_rate": 2.9050199983228184e-05, + "loss": 1.7666, + "step": 21129 + }, + { + "epoch": 6.485573971761817, + "grad_norm": 0.18104900419712067, + "learning_rate": 2.9045686877656086e-05, + "loss": 1.6863, + "step": 21130 + }, + { + "epoch": 6.485880908532843, + "grad_norm": 0.18469318747520447, + "learning_rate": 2.9041173979166813e-05, + "loss": 1.7344, + "step": 21131 + }, + { + "epoch": 6.486187845303867, + "grad_norm": 0.18488821387290955, + "learning_rate": 2.90366612878049e-05, + "loss": 1.694, + "step": 21132 + }, + { + "epoch": 6.486494782074892, + "grad_norm": 0.2030600905418396, + "learning_rate": 2.903214880361503e-05, + "loss": 1.7079, + "step": 21133 + }, + { + "epoch": 6.486801718845918, + "grad_norm": 0.2222873419523239, + "learning_rate": 2.902763652664171e-05, + "loss": 
1.7193, + "step": 21134 + }, + { + "epoch": 6.487108655616943, + "grad_norm": 0.1936846524477005, + "learning_rate": 2.9023124456929608e-05, + "loss": 1.7152, + "step": 21135 + }, + { + "epoch": 6.487415592387968, + "grad_norm": 0.25259360671043396, + "learning_rate": 2.9018612594523274e-05, + "loss": 1.776, + "step": 21136 + }, + { + "epoch": 6.487722529158993, + "grad_norm": 0.22994543612003326, + "learning_rate": 2.9014100939467316e-05, + "loss": 1.7437, + "step": 21137 + }, + { + "epoch": 6.488029465930018, + "grad_norm": 0.2646990716457367, + "learning_rate": 2.900958949180631e-05, + "loss": 1.7535, + "step": 21138 + }, + { + "epoch": 6.4883364027010435, + "grad_norm": 0.22973869740962982, + "learning_rate": 2.9005078251584843e-05, + "loss": 1.6772, + "step": 21139 + }, + { + "epoch": 6.488643339472069, + "grad_norm": 0.21261750161647797, + "learning_rate": 2.9000567218847497e-05, + "loss": 1.6899, + "step": 21140 + }, + { + "epoch": 6.488950276243094, + "grad_norm": 0.24828271567821503, + "learning_rate": 2.8996056393638858e-05, + "loss": 1.7994, + "step": 21141 + }, + { + "epoch": 6.4892572130141195, + "grad_norm": 0.18308857083320618, + "learning_rate": 2.8991545776003497e-05, + "loss": 1.7847, + "step": 21142 + }, + { + "epoch": 6.489564149785144, + "grad_norm": 0.22744092345237732, + "learning_rate": 2.8987035365985994e-05, + "loss": 1.7789, + "step": 21143 + }, + { + "epoch": 6.489871086556169, + "grad_norm": 0.18573936820030212, + "learning_rate": 2.8982525163630903e-05, + "loss": 1.6649, + "step": 21144 + }, + { + "epoch": 6.490178023327195, + "grad_norm": 0.26056674122810364, + "learning_rate": 2.8978015168982863e-05, + "loss": 1.68, + "step": 21145 + }, + { + "epoch": 6.49048496009822, + "grad_norm": 0.1912553906440735, + "learning_rate": 2.897350538208635e-05, + "loss": 1.7011, + "step": 21146 + }, + { + "epoch": 6.490791896869245, + "grad_norm": 0.25937187671661377, + "learning_rate": 2.896899580298603e-05, + "loss": 1.7409, + "step": 21147 + }, + 
{ + "epoch": 6.49109883364027, + "grad_norm": 0.22148750722408295, + "learning_rate": 2.8964486431726397e-05, + "loss": 1.6921, + "step": 21148 + }, + { + "epoch": 6.491405770411295, + "grad_norm": 0.23678559064865112, + "learning_rate": 2.8959977268352012e-05, + "loss": 1.6833, + "step": 21149 + }, + { + "epoch": 6.49171270718232, + "grad_norm": 0.2942093312740326, + "learning_rate": 2.8955468312907506e-05, + "loss": 1.7119, + "step": 21150 + }, + { + "epoch": 6.492019643953346, + "grad_norm": 0.18726128339767456, + "learning_rate": 2.8950959565437365e-05, + "loss": 1.7067, + "step": 21151 + }, + { + "epoch": 6.492326580724371, + "grad_norm": 0.23851951956748962, + "learning_rate": 2.894645102598621e-05, + "loss": 1.73, + "step": 21152 + }, + { + "epoch": 6.4926335174953955, + "grad_norm": 0.18054445087909698, + "learning_rate": 2.8941942694598533e-05, + "loss": 1.7243, + "step": 21153 + }, + { + "epoch": 6.492940454266421, + "grad_norm": 0.21889349818229675, + "learning_rate": 2.8937434571318934e-05, + "loss": 1.7789, + "step": 21154 + }, + { + "epoch": 6.493247391037446, + "grad_norm": 0.18788981437683105, + "learning_rate": 2.893292665619195e-05, + "loss": 1.7496, + "step": 21155 + }, + { + "epoch": 6.4935543278084715, + "grad_norm": 0.1964103877544403, + "learning_rate": 2.8928418949262138e-05, + "loss": 1.6732, + "step": 21156 + }, + { + "epoch": 6.493861264579497, + "grad_norm": 0.21939502656459808, + "learning_rate": 2.8923911450574043e-05, + "loss": 1.7149, + "step": 21157 + }, + { + "epoch": 6.494168201350522, + "grad_norm": 0.16927817463874817, + "learning_rate": 2.8919404160172203e-05, + "loss": 1.7093, + "step": 21158 + }, + { + "epoch": 6.494475138121547, + "grad_norm": 0.19907668232917786, + "learning_rate": 2.8914897078101166e-05, + "loss": 1.718, + "step": 21159 + }, + { + "epoch": 6.494782074892572, + "grad_norm": 0.18071576952934265, + "learning_rate": 2.891039020440548e-05, + "loss": 1.7241, + "step": 21160 + }, + { + "epoch": 6.495089011663597, 
+ "grad_norm": 0.17780692875385284, + "learning_rate": 2.890588353912965e-05, + "loss": 1.7013, + "step": 21161 + }, + { + "epoch": 6.495395948434623, + "grad_norm": 0.20762500166893005, + "learning_rate": 2.8901377082318292e-05, + "loss": 1.8149, + "step": 21162 + }, + { + "epoch": 6.495702885205648, + "grad_norm": 0.21616768836975098, + "learning_rate": 2.889687083401585e-05, + "loss": 1.7467, + "step": 21163 + }, + { + "epoch": 6.496009821976672, + "grad_norm": 0.20075570046901703, + "learning_rate": 2.8892364794266935e-05, + "loss": 1.6643, + "step": 21164 + }, + { + "epoch": 6.496316758747698, + "grad_norm": 0.18893925845623016, + "learning_rate": 2.8887858963116028e-05, + "loss": 1.7362, + "step": 21165 + }, + { + "epoch": 6.496623695518723, + "grad_norm": 0.20031611621379852, + "learning_rate": 2.888335334060765e-05, + "loss": 1.6902, + "step": 21166 + }, + { + "epoch": 6.496930632289748, + "grad_norm": 0.2959407866001129, + "learning_rate": 2.887884792678639e-05, + "loss": 1.7874, + "step": 21167 + }, + { + "epoch": 6.497237569060774, + "grad_norm": 0.17434875667095184, + "learning_rate": 2.8874342721696697e-05, + "loss": 1.7353, + "step": 21168 + }, + { + "epoch": 6.497544505831799, + "grad_norm": 0.19451481103897095, + "learning_rate": 2.8869837725383163e-05, + "loss": 1.6942, + "step": 21169 + }, + { + "epoch": 6.4978514426028235, + "grad_norm": 0.17984920740127563, + "learning_rate": 2.886533293789025e-05, + "loss": 1.7461, + "step": 21170 + }, + { + "epoch": 6.498158379373849, + "grad_norm": 0.18166208267211914, + "learning_rate": 2.8860828359262516e-05, + "loss": 1.7202, + "step": 21171 + }, + { + "epoch": 6.498465316144874, + "grad_norm": 0.1849331557750702, + "learning_rate": 2.8856323989544472e-05, + "loss": 1.6862, + "step": 21172 + }, + { + "epoch": 6.4987722529158995, + "grad_norm": 0.17846204340457916, + "learning_rate": 2.8851819828780623e-05, + "loss": 1.7446, + "step": 21173 + }, + { + "epoch": 6.499079189686925, + "grad_norm": 
0.1963818222284317, + "learning_rate": 2.8847315877015486e-05, + "loss": 1.7366, + "step": 21174 + }, + { + "epoch": 6.499386126457949, + "grad_norm": 0.1917402446269989, + "learning_rate": 2.8842812134293574e-05, + "loss": 1.7362, + "step": 21175 + }, + { + "epoch": 6.499693063228975, + "grad_norm": 0.16559138894081116, + "learning_rate": 2.883830860065939e-05, + "loss": 1.6735, + "step": 21176 + }, + { + "epoch": 6.5, + "grad_norm": 0.1820032149553299, + "learning_rate": 2.8833805276157442e-05, + "loss": 1.7107, + "step": 21177 + }, + { + "epoch": 6.500306936771025, + "grad_norm": 0.23760980367660522, + "learning_rate": 2.882930216083222e-05, + "loss": 1.7024, + "step": 21178 + }, + { + "epoch": 6.500613873542051, + "grad_norm": 0.22314296662807465, + "learning_rate": 2.8824799254728285e-05, + "loss": 1.714, + "step": 21179 + }, + { + "epoch": 6.500920810313076, + "grad_norm": 0.21919335424900055, + "learning_rate": 2.8820296557890046e-05, + "loss": 1.7625, + "step": 21180 + }, + { + "epoch": 6.5012277470841005, + "grad_norm": 0.21632128953933716, + "learning_rate": 2.88157940703621e-05, + "loss": 1.6589, + "step": 21181 + }, + { + "epoch": 6.501534683855126, + "grad_norm": 0.17998506128787994, + "learning_rate": 2.8811291792188867e-05, + "loss": 1.7528, + "step": 21182 + }, + { + "epoch": 6.501841620626151, + "grad_norm": 0.19783075153827667, + "learning_rate": 2.880678972341485e-05, + "loss": 1.6908, + "step": 21183 + }, + { + "epoch": 6.5021485573971765, + "grad_norm": 0.20510388910770416, + "learning_rate": 2.88022878640846e-05, + "loss": 1.7342, + "step": 21184 + }, + { + "epoch": 6.502455494168201, + "grad_norm": 0.24218666553497314, + "learning_rate": 2.879778621424253e-05, + "loss": 1.8, + "step": 21185 + }, + { + "epoch": 6.502762430939226, + "grad_norm": 0.1901179403066635, + "learning_rate": 2.8793284773933195e-05, + "loss": 1.699, + "step": 21186 + }, + { + "epoch": 6.503069367710252, + "grad_norm": 0.2652232348918915, + "learning_rate": 
2.8788783543201007e-05, + "loss": 1.8394, + "step": 21187 + }, + { + "epoch": 6.503376304481277, + "grad_norm": 0.17701558768749237, + "learning_rate": 2.878428252209052e-05, + "loss": 1.6674, + "step": 21188 + }, + { + "epoch": 6.503683241252302, + "grad_norm": 0.17464707791805267, + "learning_rate": 2.8779781710646185e-05, + "loss": 1.6894, + "step": 21189 + }, + { + "epoch": 6.503990178023328, + "grad_norm": 0.19469478726387024, + "learning_rate": 2.877528110891249e-05, + "loss": 1.7487, + "step": 21190 + }, + { + "epoch": 6.504297114794352, + "grad_norm": 0.21656417846679688, + "learning_rate": 2.87707807169339e-05, + "loss": 1.641, + "step": 21191 + }, + { + "epoch": 6.504604051565377, + "grad_norm": 0.20374895632266998, + "learning_rate": 2.8766280534754896e-05, + "loss": 1.6692, + "step": 21192 + }, + { + "epoch": 6.504910988336403, + "grad_norm": 0.26638445258140564, + "learning_rate": 2.876178056241996e-05, + "loss": 1.7415, + "step": 21193 + }, + { + "epoch": 6.505217925107428, + "grad_norm": 0.1852893978357315, + "learning_rate": 2.8757280799973557e-05, + "loss": 1.6981, + "step": 21194 + }, + { + "epoch": 6.505524861878453, + "grad_norm": 0.20518383383750916, + "learning_rate": 2.875278124746013e-05, + "loss": 1.781, + "step": 21195 + }, + { + "epoch": 6.505831798649478, + "grad_norm": 0.19968904554843903, + "learning_rate": 2.874828190492422e-05, + "loss": 1.6813, + "step": 21196 + }, + { + "epoch": 6.506138735420503, + "grad_norm": 0.19164247810840607, + "learning_rate": 2.87437827724102e-05, + "loss": 1.6833, + "step": 21197 + }, + { + "epoch": 6.5064456721915285, + "grad_norm": 0.19305361807346344, + "learning_rate": 2.873928384996262e-05, + "loss": 1.7164, + "step": 21198 + }, + { + "epoch": 6.506752608962554, + "grad_norm": 0.1853758841753006, + "learning_rate": 2.873478513762587e-05, + "loss": 1.7481, + "step": 21199 + }, + { + "epoch": 6.507059545733579, + "grad_norm": 0.20187529921531677, + "learning_rate": 2.8730286635444425e-05, + "loss": 
1.7666, + "step": 21200 + }, + { + "epoch": 6.5073664825046045, + "grad_norm": 0.19769401848316193, + "learning_rate": 2.872578834346279e-05, + "loss": 1.798, + "step": 21201 + }, + { + "epoch": 6.507673419275629, + "grad_norm": 0.1936112940311432, + "learning_rate": 2.8721290261725342e-05, + "loss": 1.6992, + "step": 21202 + }, + { + "epoch": 6.507980356046654, + "grad_norm": 0.17090481519699097, + "learning_rate": 2.871679239027662e-05, + "loss": 1.6802, + "step": 21203 + }, + { + "epoch": 6.50828729281768, + "grad_norm": 0.19443605840206146, + "learning_rate": 2.8712294729160987e-05, + "loss": 1.736, + "step": 21204 + }, + { + "epoch": 6.508594229588705, + "grad_norm": 0.19216817617416382, + "learning_rate": 2.8707797278422954e-05, + "loss": 1.7109, + "step": 21205 + }, + { + "epoch": 6.50890116635973, + "grad_norm": 0.19900040328502655, + "learning_rate": 2.8703300038106952e-05, + "loss": 1.7158, + "step": 21206 + }, + { + "epoch": 6.509208103130755, + "grad_norm": 0.17810803651809692, + "learning_rate": 2.8698803008257425e-05, + "loss": 1.6886, + "step": 21207 + }, + { + "epoch": 6.50951503990178, + "grad_norm": 0.1890508532524109, + "learning_rate": 2.8694306188918807e-05, + "loss": 1.7447, + "step": 21208 + }, + { + "epoch": 6.509821976672805, + "grad_norm": 0.17456012964248657, + "learning_rate": 2.868980958013554e-05, + "loss": 1.7094, + "step": 21209 + }, + { + "epoch": 6.510128913443831, + "grad_norm": 0.17089629173278809, + "learning_rate": 2.8685313181952066e-05, + "loss": 1.6827, + "step": 21210 + }, + { + "epoch": 6.510435850214856, + "grad_norm": 0.22681273519992828, + "learning_rate": 2.8680816994412823e-05, + "loss": 1.7374, + "step": 21211 + }, + { + "epoch": 6.510742786985881, + "grad_norm": 0.20642207562923431, + "learning_rate": 2.8676321017562225e-05, + "loss": 1.7609, + "step": 21212 + }, + { + "epoch": 6.511049723756906, + "grad_norm": 0.2360219657421112, + "learning_rate": 2.867182525144475e-05, + "loss": 1.7577, + "step": 21213 + }, + { + 
"epoch": 6.511356660527931, + "grad_norm": 0.19686923921108246, + "learning_rate": 2.8667329696104766e-05, + "loss": 1.7459, + "step": 21214 + }, + { + "epoch": 6.5116635972989565, + "grad_norm": 0.21280834078788757, + "learning_rate": 2.8662834351586777e-05, + "loss": 1.7837, + "step": 21215 + }, + { + "epoch": 6.511970534069982, + "grad_norm": 0.19297273457050323, + "learning_rate": 2.8658339217935136e-05, + "loss": 1.734, + "step": 21216 + }, + { + "epoch": 6.512277470841006, + "grad_norm": 0.1937931329011917, + "learning_rate": 2.8653844295194283e-05, + "loss": 1.6631, + "step": 21217 + }, + { + "epoch": 6.512584407612032, + "grad_norm": 0.2061077207326889, + "learning_rate": 2.8649349583408692e-05, + "loss": 1.7324, + "step": 21218 + }, + { + "epoch": 6.512891344383057, + "grad_norm": 0.19711358845233917, + "learning_rate": 2.8644855082622695e-05, + "loss": 1.7024, + "step": 21219 + }, + { + "epoch": 6.513198281154082, + "grad_norm": 0.17352496087551117, + "learning_rate": 2.8640360792880804e-05, + "loss": 1.7261, + "step": 21220 + }, + { + "epoch": 6.513505217925108, + "grad_norm": 0.181448295712471, + "learning_rate": 2.8635866714227344e-05, + "loss": 1.7147, + "step": 21221 + }, + { + "epoch": 6.513812154696133, + "grad_norm": 0.1827932894229889, + "learning_rate": 2.8631372846706787e-05, + "loss": 1.7338, + "step": 21222 + }, + { + "epoch": 6.514119091467157, + "grad_norm": 0.20659075677394867, + "learning_rate": 2.862687919036353e-05, + "loss": 1.6611, + "step": 21223 + }, + { + "epoch": 6.514426028238183, + "grad_norm": 0.19185996055603027, + "learning_rate": 2.8622385745241987e-05, + "loss": 1.7834, + "step": 21224 + }, + { + "epoch": 6.514732965009208, + "grad_norm": 0.19825506210327148, + "learning_rate": 2.8617892511386558e-05, + "loss": 1.7608, + "step": 21225 + }, + { + "epoch": 6.515039901780233, + "grad_norm": 0.16927020251750946, + "learning_rate": 2.861339948884164e-05, + "loss": 1.6651, + "step": 21226 + }, + { + "epoch": 6.515346838551259, + 
"grad_norm": 0.19211016595363617, + "learning_rate": 2.8608906677651646e-05, + "loss": 1.6673, + "step": 21227 + }, + { + "epoch": 6.515653775322283, + "grad_norm": 0.20192545652389526, + "learning_rate": 2.8604414077860974e-05, + "loss": 1.7301, + "step": 21228 + }, + { + "epoch": 6.5159607120933085, + "grad_norm": 0.2075425237417221, + "learning_rate": 2.8599921689514002e-05, + "loss": 1.783, + "step": 21229 + }, + { + "epoch": 6.516267648864334, + "grad_norm": 0.21261392533779144, + "learning_rate": 2.8595429512655192e-05, + "loss": 1.7277, + "step": 21230 + }, + { + "epoch": 6.516574585635359, + "grad_norm": 0.21201452612876892, + "learning_rate": 2.8590937547328844e-05, + "loss": 1.6582, + "step": 21231 + }, + { + "epoch": 6.5168815224063845, + "grad_norm": 0.2071799635887146, + "learning_rate": 2.858644579357944e-05, + "loss": 1.7559, + "step": 21232 + }, + { + "epoch": 6.51718845917741, + "grad_norm": 0.20225903391838074, + "learning_rate": 2.858195425145132e-05, + "loss": 1.7507, + "step": 21233 + }, + { + "epoch": 6.517495395948434, + "grad_norm": 0.2738147974014282, + "learning_rate": 2.8577462920988852e-05, + "loss": 1.7073, + "step": 21234 + }, + { + "epoch": 6.51780233271946, + "grad_norm": 0.17878220975399017, + "learning_rate": 2.8572971802236498e-05, + "loss": 1.6598, + "step": 21235 + }, + { + "epoch": 6.518109269490485, + "grad_norm": 0.21365594863891602, + "learning_rate": 2.8568480895238552e-05, + "loss": 1.7404, + "step": 21236 + }, + { + "epoch": 6.51841620626151, + "grad_norm": 0.18392804265022278, + "learning_rate": 2.856399020003948e-05, + "loss": 1.706, + "step": 21237 + }, + { + "epoch": 6.518723143032536, + "grad_norm": 0.16268405318260193, + "learning_rate": 2.855949971668358e-05, + "loss": 1.6725, + "step": 21238 + }, + { + "epoch": 6.51903007980356, + "grad_norm": 0.19590096175670624, + "learning_rate": 2.855500944521529e-05, + "loss": 1.7269, + "step": 21239 + }, + { + "epoch": 6.519337016574585, + "grad_norm": 0.19443263113498688, + 
"learning_rate": 2.8550519385678965e-05, + "loss": 1.686, + "step": 21240 + }, + { + "epoch": 6.519643953345611, + "grad_norm": 0.2112705111503601, + "learning_rate": 2.8546029538118985e-05, + "loss": 1.6904, + "step": 21241 + }, + { + "epoch": 6.519950890116636, + "grad_norm": 0.21015888452529907, + "learning_rate": 2.8541539902579712e-05, + "loss": 1.6972, + "step": 21242 + }, + { + "epoch": 6.520257826887661, + "grad_norm": 0.2853320837020874, + "learning_rate": 2.853705047910552e-05, + "loss": 1.7415, + "step": 21243 + }, + { + "epoch": 6.520564763658687, + "grad_norm": 0.20927128195762634, + "learning_rate": 2.853256126774077e-05, + "loss": 1.6955, + "step": 21244 + }, + { + "epoch": 6.520871700429711, + "grad_norm": 0.27824920415878296, + "learning_rate": 2.8528072268529836e-05, + "loss": 1.7666, + "step": 21245 + }, + { + "epoch": 6.5211786372007365, + "grad_norm": 0.21164646744728088, + "learning_rate": 2.8523583481517057e-05, + "loss": 1.75, + "step": 21246 + }, + { + "epoch": 6.521485573971762, + "grad_norm": 0.249397411942482, + "learning_rate": 2.851909490674686e-05, + "loss": 1.6767, + "step": 21247 + }, + { + "epoch": 6.521792510742787, + "grad_norm": 0.2311551868915558, + "learning_rate": 2.8514606544263507e-05, + "loss": 1.8071, + "step": 21248 + }, + { + "epoch": 6.5220994475138125, + "grad_norm": 0.21878042817115784, + "learning_rate": 2.8510118394111453e-05, + "loss": 1.6881, + "step": 21249 + }, + { + "epoch": 6.522406384284837, + "grad_norm": 0.2095690816640854, + "learning_rate": 2.8505630456334974e-05, + "loss": 1.6526, + "step": 21250 + }, + { + "epoch": 6.522713321055862, + "grad_norm": 0.2303982526063919, + "learning_rate": 2.850114273097844e-05, + "loss": 1.7256, + "step": 21251 + }, + { + "epoch": 6.523020257826888, + "grad_norm": 0.22640225291252136, + "learning_rate": 2.8496655218086255e-05, + "loss": 1.7797, + "step": 21252 + }, + { + "epoch": 6.523327194597913, + "grad_norm": 0.24268805980682373, + "learning_rate": 
2.8492167917702683e-05, + "loss": 1.7673, + "step": 21253 + }, + { + "epoch": 6.523634131368938, + "grad_norm": 0.1988469958305359, + "learning_rate": 2.8487680829872158e-05, + "loss": 1.7126, + "step": 21254 + }, + { + "epoch": 6.523941068139964, + "grad_norm": 0.18385496735572815, + "learning_rate": 2.8483193954638942e-05, + "loss": 1.7113, + "step": 21255 + }, + { + "epoch": 6.524248004910988, + "grad_norm": 0.21865327656269073, + "learning_rate": 2.847870729204743e-05, + "loss": 1.6686, + "step": 21256 + }, + { + "epoch": 6.524554941682013, + "grad_norm": 0.16982951760292053, + "learning_rate": 2.8474220842141946e-05, + "loss": 1.6865, + "step": 21257 + }, + { + "epoch": 6.524861878453039, + "grad_norm": 0.23028478026390076, + "learning_rate": 2.8469734604966834e-05, + "loss": 1.7647, + "step": 21258 + }, + { + "epoch": 6.525168815224064, + "grad_norm": 0.1805485039949417, + "learning_rate": 2.8465248580566415e-05, + "loss": 1.7524, + "step": 21259 + }, + { + "epoch": 6.525475751995089, + "grad_norm": 0.18652063608169556, + "learning_rate": 2.8460762768985037e-05, + "loss": 1.7028, + "step": 21260 + }, + { + "epoch": 6.525782688766114, + "grad_norm": 0.22772997617721558, + "learning_rate": 2.845627717026703e-05, + "loss": 1.7866, + "step": 21261 + }, + { + "epoch": 6.526089625537139, + "grad_norm": 0.19889821112155914, + "learning_rate": 2.8451791784456718e-05, + "loss": 1.7076, + "step": 21262 + }, + { + "epoch": 6.526396562308165, + "grad_norm": 0.24747174978256226, + "learning_rate": 2.8447306611598402e-05, + "loss": 1.7615, + "step": 21263 + }, + { + "epoch": 6.52670349907919, + "grad_norm": 0.1988009363412857, + "learning_rate": 2.8442821651736473e-05, + "loss": 1.7853, + "step": 21264 + }, + { + "epoch": 6.527010435850215, + "grad_norm": 0.250032901763916, + "learning_rate": 2.8438336904915185e-05, + "loss": 1.6906, + "step": 21265 + }, + { + "epoch": 6.52731737262124, + "grad_norm": 0.15398284792900085, + "learning_rate": 2.8433852371178925e-05, + 
"loss": 1.6437, + "step": 21266 + }, + { + "epoch": 6.527624309392265, + "grad_norm": 0.33137503266334534, + "learning_rate": 2.8429368050571958e-05, + "loss": 1.8213, + "step": 21267 + }, + { + "epoch": 6.52793124616329, + "grad_norm": 0.23827852308750153, + "learning_rate": 2.8424883943138593e-05, + "loss": 1.7148, + "step": 21268 + }, + { + "epoch": 6.528238182934316, + "grad_norm": 0.21171489357948303, + "learning_rate": 2.8420400048923217e-05, + "loss": 1.7729, + "step": 21269 + }, + { + "epoch": 6.528545119705341, + "grad_norm": 0.21698513627052307, + "learning_rate": 2.8415916367970053e-05, + "loss": 1.7267, + "step": 21270 + }, + { + "epoch": 6.5288520564763655, + "grad_norm": 0.2217913120985031, + "learning_rate": 2.8411432900323498e-05, + "loss": 1.7259, + "step": 21271 + }, + { + "epoch": 6.529158993247391, + "grad_norm": 0.25518202781677246, + "learning_rate": 2.8406949646027768e-05, + "loss": 1.7754, + "step": 21272 + }, + { + "epoch": 6.529465930018416, + "grad_norm": 0.22206325829029083, + "learning_rate": 2.8402466605127247e-05, + "loss": 1.755, + "step": 21273 + }, + { + "epoch": 6.5297728667894415, + "grad_norm": 0.26918017864227295, + "learning_rate": 2.8397983777666206e-05, + "loss": 1.783, + "step": 21274 + }, + { + "epoch": 6.530079803560467, + "grad_norm": 0.19280646741390228, + "learning_rate": 2.8393501163688952e-05, + "loss": 1.6942, + "step": 21275 + }, + { + "epoch": 6.530386740331492, + "grad_norm": 0.24567140638828278, + "learning_rate": 2.8389018763239784e-05, + "loss": 1.7316, + "step": 21276 + }, + { + "epoch": 6.530693677102517, + "grad_norm": 0.21791695058345795, + "learning_rate": 2.8384536576362997e-05, + "loss": 1.7627, + "step": 21277 + }, + { + "epoch": 6.531000613873542, + "grad_norm": 0.2441660761833191, + "learning_rate": 2.8380054603102885e-05, + "loss": 1.7112, + "step": 21278 + }, + { + "epoch": 6.531307550644567, + "grad_norm": 0.1768653243780136, + "learning_rate": 2.837557284350375e-05, + "loss": 1.6906, + "step": 
21279 + }, + { + "epoch": 6.531614487415593, + "grad_norm": 0.21037769317626953, + "learning_rate": 2.8371091297609877e-05, + "loss": 1.7197, + "step": 21280 + }, + { + "epoch": 6.531921424186618, + "grad_norm": 0.23989829421043396, + "learning_rate": 2.8366609965465563e-05, + "loss": 1.7693, + "step": 21281 + }, + { + "epoch": 6.532228360957642, + "grad_norm": 0.18302181363105774, + "learning_rate": 2.836212884711506e-05, + "loss": 1.6643, + "step": 21282 + }, + { + "epoch": 6.532535297728668, + "grad_norm": 0.2068471908569336, + "learning_rate": 2.835764794260273e-05, + "loss": 1.7431, + "step": 21283 + }, + { + "epoch": 6.532842234499693, + "grad_norm": 0.18803778290748596, + "learning_rate": 2.8353167251972777e-05, + "loss": 1.7506, + "step": 21284 + }, + { + "epoch": 6.533149171270718, + "grad_norm": 0.20789632201194763, + "learning_rate": 2.8348686775269507e-05, + "loss": 1.7174, + "step": 21285 + }, + { + "epoch": 6.533456108041744, + "grad_norm": 0.18927012383937836, + "learning_rate": 2.834420651253723e-05, + "loss": 1.6723, + "step": 21286 + }, + { + "epoch": 6.533763044812769, + "grad_norm": 0.22616887092590332, + "learning_rate": 2.8339726463820172e-05, + "loss": 1.7045, + "step": 21287 + }, + { + "epoch": 6.5340699815837935, + "grad_norm": 0.23880253732204437, + "learning_rate": 2.8335246629162658e-05, + "loss": 1.7255, + "step": 21288 + }, + { + "epoch": 6.534376918354819, + "grad_norm": 0.24279431998729706, + "learning_rate": 2.8330767008608904e-05, + "loss": 1.7548, + "step": 21289 + }, + { + "epoch": 6.534683855125844, + "grad_norm": 0.20542044937610626, + "learning_rate": 2.832628760220323e-05, + "loss": 1.6851, + "step": 21290 + }, + { + "epoch": 6.5349907918968695, + "grad_norm": 0.19426794350147247, + "learning_rate": 2.832180840998988e-05, + "loss": 1.7528, + "step": 21291 + }, + { + "epoch": 6.535297728667894, + "grad_norm": 0.2744491398334503, + "learning_rate": 2.8317329432013136e-05, + "loss": 1.7821, + "step": 21292 + }, + { + "epoch": 
6.535604665438919, + "grad_norm": 0.2692170739173889, + "learning_rate": 2.8312850668317243e-05, + "loss": 1.6626, + "step": 21293 + }, + { + "epoch": 6.535911602209945, + "grad_norm": 0.24998809397220612, + "learning_rate": 2.830837211894647e-05, + "loss": 1.7031, + "step": 21294 + }, + { + "epoch": 6.53621853898097, + "grad_norm": 0.22888946533203125, + "learning_rate": 2.830389378394508e-05, + "loss": 1.7706, + "step": 21295 + }, + { + "epoch": 6.536525475751995, + "grad_norm": 0.21685005724430084, + "learning_rate": 2.8299415663357332e-05, + "loss": 1.681, + "step": 21296 + }, + { + "epoch": 6.536832412523021, + "grad_norm": 0.23309725522994995, + "learning_rate": 2.8294937757227475e-05, + "loss": 1.7781, + "step": 21297 + }, + { + "epoch": 6.537139349294045, + "grad_norm": 0.26712173223495483, + "learning_rate": 2.829046006559976e-05, + "loss": 1.6966, + "step": 21298 + }, + { + "epoch": 6.53744628606507, + "grad_norm": 0.1836499124765396, + "learning_rate": 2.8285982588518428e-05, + "loss": 1.7192, + "step": 21299 + }, + { + "epoch": 6.537753222836096, + "grad_norm": 0.24073021113872528, + "learning_rate": 2.828150532602778e-05, + "loss": 1.6997, + "step": 21300 + }, + { + "epoch": 6.538060159607121, + "grad_norm": 0.16308051347732544, + "learning_rate": 2.8277028278172014e-05, + "loss": 1.6901, + "step": 21301 + }, + { + "epoch": 6.538367096378146, + "grad_norm": 0.2330634444952011, + "learning_rate": 2.8272551444995376e-05, + "loss": 1.7426, + "step": 21302 + }, + { + "epoch": 6.538674033149171, + "grad_norm": 0.18600425124168396, + "learning_rate": 2.8268074826542123e-05, + "loss": 1.6906, + "step": 21303 + }, + { + "epoch": 6.538980969920196, + "grad_norm": 0.24717238545417786, + "learning_rate": 2.8263598422856475e-05, + "loss": 1.6962, + "step": 21304 + }, + { + "epoch": 6.5392879066912215, + "grad_norm": 0.1907368302345276, + "learning_rate": 2.8259122233982727e-05, + "loss": 1.7083, + "step": 21305 + }, + { + "epoch": 6.539594843462247, + "grad_norm": 
0.22698798775672913, + "learning_rate": 2.8254646259965035e-05, + "loss": 1.7377, + "step": 21306 + }, + { + "epoch": 6.539901780233272, + "grad_norm": 0.19169457256793976, + "learning_rate": 2.8250170500847696e-05, + "loss": 1.7416, + "step": 21307 + }, + { + "epoch": 6.5402087170042975, + "grad_norm": 0.18730394542217255, + "learning_rate": 2.8245694956674918e-05, + "loss": 1.7273, + "step": 21308 + }, + { + "epoch": 6.540515653775322, + "grad_norm": 0.19813422858715057, + "learning_rate": 2.8241219627490927e-05, + "loss": 1.7638, + "step": 21309 + }, + { + "epoch": 6.540822590546347, + "grad_norm": 0.20460368692874908, + "learning_rate": 2.8236744513339965e-05, + "loss": 1.7266, + "step": 21310 + }, + { + "epoch": 6.541129527317373, + "grad_norm": 0.20448380708694458, + "learning_rate": 2.823226961426625e-05, + "loss": 1.7335, + "step": 21311 + }, + { + "epoch": 6.541436464088398, + "grad_norm": 0.21458712220191956, + "learning_rate": 2.8227794930314e-05, + "loss": 1.7274, + "step": 21312 + }, + { + "epoch": 6.541743400859423, + "grad_norm": 0.1964675635099411, + "learning_rate": 2.8223320461527442e-05, + "loss": 1.7514, + "step": 21313 + }, + { + "epoch": 6.542050337630448, + "grad_norm": 0.18982458114624023, + "learning_rate": 2.82188462079508e-05, + "loss": 1.6858, + "step": 21314 + }, + { + "epoch": 6.542357274401473, + "grad_norm": 0.21377761662006378, + "learning_rate": 2.8214372169628277e-05, + "loss": 1.727, + "step": 21315 + }, + { + "epoch": 6.542664211172498, + "grad_norm": 0.19484922289848328, + "learning_rate": 2.8209898346604087e-05, + "loss": 1.7646, + "step": 21316 + }, + { + "epoch": 6.542971147943524, + "grad_norm": 0.20614980161190033, + "learning_rate": 2.8205424738922488e-05, + "loss": 1.6705, + "step": 21317 + }, + { + "epoch": 6.543278084714549, + "grad_norm": 0.1888885796070099, + "learning_rate": 2.8200951346627636e-05, + "loss": 1.7854, + "step": 21318 + }, + { + "epoch": 6.543585021485574, + "grad_norm": 0.20957863330841064, + 
"learning_rate": 2.8196478169763763e-05, + "loss": 1.6971, + "step": 21319 + }, + { + "epoch": 6.543891958256599, + "grad_norm": 0.20744509994983673, + "learning_rate": 2.8192005208375073e-05, + "loss": 1.7408, + "step": 21320 + }, + { + "epoch": 6.544198895027624, + "grad_norm": 0.20038767158985138, + "learning_rate": 2.818753246250574e-05, + "loss": 1.7355, + "step": 21321 + }, + { + "epoch": 6.5445058317986495, + "grad_norm": 0.18535862863063812, + "learning_rate": 2.818305993220004e-05, + "loss": 1.7229, + "step": 21322 + }, + { + "epoch": 6.544812768569675, + "grad_norm": 0.2191225290298462, + "learning_rate": 2.8178587617502095e-05, + "loss": 1.7364, + "step": 21323 + }, + { + "epoch": 6.5451197053407, + "grad_norm": 0.2055424451828003, + "learning_rate": 2.8174115518456175e-05, + "loss": 1.7488, + "step": 21324 + }, + { + "epoch": 6.545426642111725, + "grad_norm": 0.22267968952655792, + "learning_rate": 2.8169643635106398e-05, + "loss": 1.6936, + "step": 21325 + }, + { + "epoch": 6.54573357888275, + "grad_norm": 0.20295512676239014, + "learning_rate": 2.8165171967497018e-05, + "loss": 1.7651, + "step": 21326 + }, + { + "epoch": 6.546040515653775, + "grad_norm": 0.25859618186950684, + "learning_rate": 2.81607005156722e-05, + "loss": 1.7264, + "step": 21327 + }, + { + "epoch": 6.546347452424801, + "grad_norm": 0.22232379019260406, + "learning_rate": 2.8156229279676143e-05, + "loss": 1.7282, + "step": 21328 + }, + { + "epoch": 6.546654389195826, + "grad_norm": 0.2548457682132721, + "learning_rate": 2.8151758259553035e-05, + "loss": 1.7137, + "step": 21329 + }, + { + "epoch": 6.546961325966851, + "grad_norm": 0.22040672600269318, + "learning_rate": 2.8147287455347055e-05, + "loss": 1.7553, + "step": 21330 + }, + { + "epoch": 6.547268262737876, + "grad_norm": 0.19622360169887543, + "learning_rate": 2.8142816867102388e-05, + "loss": 1.6502, + "step": 21331 + }, + { + "epoch": 6.547575199508901, + "grad_norm": 0.20849336683750153, + "learning_rate": 
2.813834649486322e-05, + "loss": 1.6824, + "step": 21332 + }, + { + "epoch": 6.547882136279926, + "grad_norm": 0.18474788963794708, + "learning_rate": 2.8133876338673703e-05, + "loss": 1.7136, + "step": 21333 + }, + { + "epoch": 6.548189073050952, + "grad_norm": 0.2421834021806717, + "learning_rate": 2.8129406398578074e-05, + "loss": 1.7841, + "step": 21334 + }, + { + "epoch": 6.548496009821976, + "grad_norm": 0.18089748919010162, + "learning_rate": 2.812493667462045e-05, + "loss": 1.6918, + "step": 21335 + }, + { + "epoch": 6.5488029465930016, + "grad_norm": 0.18575069308280945, + "learning_rate": 2.8120467166845022e-05, + "loss": 1.7098, + "step": 21336 + }, + { + "epoch": 6.549109883364027, + "grad_norm": 0.20840388536453247, + "learning_rate": 2.811599787529596e-05, + "loss": 1.7405, + "step": 21337 + }, + { + "epoch": 6.549416820135052, + "grad_norm": 0.19018858671188354, + "learning_rate": 2.811152880001742e-05, + "loss": 1.7098, + "step": 21338 + }, + { + "epoch": 6.5497237569060776, + "grad_norm": 0.22326117753982544, + "learning_rate": 2.8107059941053627e-05, + "loss": 1.7452, + "step": 21339 + }, + { + "epoch": 6.550030693677103, + "grad_norm": 0.26071304082870483, + "learning_rate": 2.8102591298448643e-05, + "loss": 1.7685, + "step": 21340 + }, + { + "epoch": 6.550337630448127, + "grad_norm": 0.2253575623035431, + "learning_rate": 2.8098122872246734e-05, + "loss": 1.8025, + "step": 21341 + }, + { + "epoch": 6.550644567219153, + "grad_norm": 0.2503850758075714, + "learning_rate": 2.8093654662491975e-05, + "loss": 1.7453, + "step": 21342 + }, + { + "epoch": 6.550951503990178, + "grad_norm": 0.18953700363636017, + "learning_rate": 2.808918666922858e-05, + "loss": 1.7549, + "step": 21343 + }, + { + "epoch": 6.551258440761203, + "grad_norm": 0.21360619366168976, + "learning_rate": 2.8084718892500685e-05, + "loss": 1.7363, + "step": 21344 + }, + { + "epoch": 6.551565377532229, + "grad_norm": 0.24622702598571777, + "learning_rate": 2.8080251332352437e-05, + 
"loss": 1.7325, + "step": 21345 + }, + { + "epoch": 6.551872314303253, + "grad_norm": 0.20079167187213898, + "learning_rate": 2.8075783988827997e-05, + "loss": 1.7478, + "step": 21346 + }, + { + "epoch": 6.5521792510742785, + "grad_norm": 0.2337643951177597, + "learning_rate": 2.807131686197151e-05, + "loss": 1.6683, + "step": 21347 + }, + { + "epoch": 6.552486187845304, + "grad_norm": 0.20815308392047882, + "learning_rate": 2.8066849951827123e-05, + "loss": 1.7436, + "step": 21348 + }, + { + "epoch": 6.552793124616329, + "grad_norm": 0.2450367957353592, + "learning_rate": 2.8062383258438972e-05, + "loss": 1.7464, + "step": 21349 + }, + { + "epoch": 6.5531000613873545, + "grad_norm": 0.232087641954422, + "learning_rate": 2.8057916781851222e-05, + "loss": 1.7378, + "step": 21350 + }, + { + "epoch": 6.55340699815838, + "grad_norm": 0.2254600077867508, + "learning_rate": 2.8053450522107993e-05, + "loss": 1.7299, + "step": 21351 + }, + { + "epoch": 6.553713934929404, + "grad_norm": 0.23282572627067566, + "learning_rate": 2.8048984479253425e-05, + "loss": 1.7512, + "step": 21352 + }, + { + "epoch": 6.55402087170043, + "grad_norm": 0.21826763451099396, + "learning_rate": 2.8044518653331665e-05, + "loss": 1.706, + "step": 21353 + }, + { + "epoch": 6.554327808471455, + "grad_norm": 0.20807425677776337, + "learning_rate": 2.804005304438683e-05, + "loss": 1.7013, + "step": 21354 + }, + { + "epoch": 6.55463474524248, + "grad_norm": 0.21791879832744598, + "learning_rate": 2.8035587652463046e-05, + "loss": 1.7312, + "step": 21355 + }, + { + "epoch": 6.554941682013506, + "grad_norm": 0.23205329477787018, + "learning_rate": 2.8031122477604505e-05, + "loss": 1.7166, + "step": 21356 + }, + { + "epoch": 6.55524861878453, + "grad_norm": 0.1910320371389389, + "learning_rate": 2.802665751985525e-05, + "loss": 1.694, + "step": 21357 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.24150735139846802, + "learning_rate": 2.8022192779259472e-05, + "loss": 1.7934, + "step": 21358 + }, 
+ { + "epoch": 6.555862492326581, + "grad_norm": 0.18308573961257935, + "learning_rate": 2.801772825586123e-05, + "loss": 1.6851, + "step": 21359 + }, + { + "epoch": 6.556169429097606, + "grad_norm": 0.28410083055496216, + "learning_rate": 2.8013263949704705e-05, + "loss": 1.7687, + "step": 21360 + }, + { + "epoch": 6.556476365868631, + "grad_norm": 0.21073146164417267, + "learning_rate": 2.8008799860833996e-05, + "loss": 1.711, + "step": 21361 + }, + { + "epoch": 6.556783302639657, + "grad_norm": 0.22758159041404724, + "learning_rate": 2.8004335989293213e-05, + "loss": 1.7495, + "step": 21362 + }, + { + "epoch": 6.557090239410681, + "grad_norm": 0.2112412452697754, + "learning_rate": 2.799987233512647e-05, + "loss": 1.7125, + "step": 21363 + }, + { + "epoch": 6.5573971761817065, + "grad_norm": 0.1804153323173523, + "learning_rate": 2.7995408898377884e-05, + "loss": 1.689, + "step": 21364 + }, + { + "epoch": 6.557704112952732, + "grad_norm": 0.17632657289505005, + "learning_rate": 2.7990945679091572e-05, + "loss": 1.6868, + "step": 21365 + }, + { + "epoch": 6.558011049723757, + "grad_norm": 0.17942996323108673, + "learning_rate": 2.7986482677311632e-05, + "loss": 1.7082, + "step": 21366 + }, + { + "epoch": 6.558317986494782, + "grad_norm": 0.278486967086792, + "learning_rate": 2.7982019893082167e-05, + "loss": 1.7909, + "step": 21367 + }, + { + "epoch": 6.558624923265807, + "grad_norm": 0.208990678191185, + "learning_rate": 2.797755732644729e-05, + "loss": 1.7643, + "step": 21368 + }, + { + "epoch": 6.558931860036832, + "grad_norm": 0.20375309884548187, + "learning_rate": 2.7973094977451096e-05, + "loss": 1.6957, + "step": 21369 + }, + { + "epoch": 6.559238796807858, + "grad_norm": 0.24685338139533997, + "learning_rate": 2.7968632846137694e-05, + "loss": 1.7574, + "step": 21370 + }, + { + "epoch": 6.559545733578883, + "grad_norm": 0.2237502634525299, + "learning_rate": 2.796417093255117e-05, + "loss": 1.7422, + "step": 21371 + }, + { + "epoch": 6.559852670349908, + 
"grad_norm": 0.22731846570968628, + "learning_rate": 2.795970923673561e-05, + "loss": 1.7594, + "step": 21372 + }, + { + "epoch": 6.560159607120933, + "grad_norm": 0.2518742084503174, + "learning_rate": 2.7955247758735158e-05, + "loss": 1.6817, + "step": 21373 + }, + { + "epoch": 6.560466543891958, + "grad_norm": 0.21982096135616302, + "learning_rate": 2.7950786498593827e-05, + "loss": 1.7289, + "step": 21374 + }, + { + "epoch": 6.560773480662983, + "grad_norm": 0.19061018526554108, + "learning_rate": 2.7946325456355787e-05, + "loss": 1.6809, + "step": 21375 + }, + { + "epoch": 6.561080417434009, + "grad_norm": 0.2023245394229889, + "learning_rate": 2.794186463206505e-05, + "loss": 1.7053, + "step": 21376 + }, + { + "epoch": 6.561387354205034, + "grad_norm": 0.18003186583518982, + "learning_rate": 2.7937404025765752e-05, + "loss": 1.6447, + "step": 21377 + }, + { + "epoch": 6.5616942909760585, + "grad_norm": 0.19133709371089935, + "learning_rate": 2.7932943637501956e-05, + "loss": 1.7677, + "step": 21378 + }, + { + "epoch": 6.562001227747084, + "grad_norm": 0.18476714193820953, + "learning_rate": 2.7928483467317746e-05, + "loss": 1.685, + "step": 21379 + }, + { + "epoch": 6.562308164518109, + "grad_norm": 0.2065780758857727, + "learning_rate": 2.79240235152572e-05, + "loss": 1.6827, + "step": 21380 + }, + { + "epoch": 6.5626151012891345, + "grad_norm": 0.1885409951210022, + "learning_rate": 2.79195637813644e-05, + "loss": 1.6819, + "step": 21381 + }, + { + "epoch": 6.56292203806016, + "grad_norm": 0.18055391311645508, + "learning_rate": 2.79151042656834e-05, + "loss": 1.7007, + "step": 21382 + }, + { + "epoch": 6.563228974831185, + "grad_norm": 0.25148439407348633, + "learning_rate": 2.7910644968258294e-05, + "loss": 1.7723, + "step": 21383 + }, + { + "epoch": 6.56353591160221, + "grad_norm": 0.2308066487312317, + "learning_rate": 2.7906185889133134e-05, + "loss": 1.7525, + "step": 21384 + }, + { + "epoch": 6.563842848373235, + "grad_norm": 0.19580784440040588, + 
"learning_rate": 2.7901727028351997e-05, + "loss": 1.7197, + "step": 21385 + }, + { + "epoch": 6.56414978514426, + "grad_norm": 0.19686979055404663, + "learning_rate": 2.7897268385958952e-05, + "loss": 1.6873, + "step": 21386 + }, + { + "epoch": 6.564456721915286, + "grad_norm": 0.2657351493835449, + "learning_rate": 2.7892809961998045e-05, + "loss": 1.7005, + "step": 21387 + }, + { + "epoch": 6.564763658686311, + "grad_norm": 0.20131130516529083, + "learning_rate": 2.7888351756513353e-05, + "loss": 1.7211, + "step": 21388 + }, + { + "epoch": 6.565070595457335, + "grad_norm": 0.2524282932281494, + "learning_rate": 2.7883893769548908e-05, + "loss": 1.7038, + "step": 21389 + }, + { + "epoch": 6.565377532228361, + "grad_norm": 0.1601654291152954, + "learning_rate": 2.787943600114883e-05, + "loss": 1.691, + "step": 21390 + }, + { + "epoch": 6.565684468999386, + "grad_norm": 0.25074124336242676, + "learning_rate": 2.787497845135709e-05, + "loss": 1.688, + "step": 21391 + }, + { + "epoch": 6.565991405770411, + "grad_norm": 0.19491349160671234, + "learning_rate": 2.787052112021782e-05, + "loss": 1.7108, + "step": 21392 + }, + { + "epoch": 6.566298342541437, + "grad_norm": 0.23931637406349182, + "learning_rate": 2.786606400777499e-05, + "loss": 1.7315, + "step": 21393 + }, + { + "epoch": 6.566605279312462, + "grad_norm": 0.1643616110086441, + "learning_rate": 2.786160711407271e-05, + "loss": 1.6745, + "step": 21394 + }, + { + "epoch": 6.5669122160834865, + "grad_norm": 0.17805394530296326, + "learning_rate": 2.7857150439155e-05, + "loss": 1.6817, + "step": 21395 + }, + { + "epoch": 6.567219152854512, + "grad_norm": 0.20370139181613922, + "learning_rate": 2.7852693983065913e-05, + "loss": 1.7173, + "step": 21396 + }, + { + "epoch": 6.567526089625537, + "grad_norm": 0.1620296984910965, + "learning_rate": 2.784823774584948e-05, + "loss": 1.7135, + "step": 21397 + }, + { + "epoch": 6.5678330263965625, + "grad_norm": 0.19116036593914032, + "learning_rate": 
2.7843781727549752e-05, + "loss": 1.6815, + "step": 21398 + }, + { + "epoch": 6.568139963167588, + "grad_norm": 0.20118895173072815, + "learning_rate": 2.7839325928210757e-05, + "loss": 1.7336, + "step": 21399 + }, + { + "epoch": 6.568446899938612, + "grad_norm": 0.198282390832901, + "learning_rate": 2.7834870347876528e-05, + "loss": 1.7379, + "step": 21400 + }, + { + "epoch": 6.568753836709638, + "grad_norm": 0.19203920662403107, + "learning_rate": 2.7830414986591104e-05, + "loss": 1.6913, + "step": 21401 + }, + { + "epoch": 6.569060773480663, + "grad_norm": 0.24601610004901886, + "learning_rate": 2.7825959844398507e-05, + "loss": 1.7842, + "step": 21402 + }, + { + "epoch": 6.569367710251688, + "grad_norm": 0.19069935381412506, + "learning_rate": 2.7821504921342777e-05, + "loss": 1.706, + "step": 21403 + }, + { + "epoch": 6.569674647022714, + "grad_norm": 0.20221085846424103, + "learning_rate": 2.7817050217467945e-05, + "loss": 1.7223, + "step": 21404 + }, + { + "epoch": 6.569981583793739, + "grad_norm": 0.2129664123058319, + "learning_rate": 2.781259573281801e-05, + "loss": 1.7429, + "step": 21405 + }, + { + "epoch": 6.570288520564763, + "grad_norm": 0.20684000849723816, + "learning_rate": 2.7808141467436993e-05, + "loss": 1.7349, + "step": 21406 + }, + { + "epoch": 6.570595457335789, + "grad_norm": 0.2153804898262024, + "learning_rate": 2.7803687421368968e-05, + "loss": 1.7245, + "step": 21407 + }, + { + "epoch": 6.570902394106814, + "grad_norm": 0.245448499917984, + "learning_rate": 2.7799233594657875e-05, + "loss": 1.7102, + "step": 21408 + }, + { + "epoch": 6.571209330877839, + "grad_norm": 0.18146783113479614, + "learning_rate": 2.7794779987347807e-05, + "loss": 1.6777, + "step": 21409 + }, + { + "epoch": 6.571516267648864, + "grad_norm": 0.21388854086399078, + "learning_rate": 2.7790326599482698e-05, + "loss": 1.7263, + "step": 21410 + }, + { + "epoch": 6.571823204419889, + "grad_norm": 0.2242165058851242, + "learning_rate": 2.7785873431106625e-05, + 
"loss": 1.7624, + "step": 21411 + }, + { + "epoch": 6.5721301411909145, + "grad_norm": 0.23132537305355072, + "learning_rate": 2.7781420482263565e-05, + "loss": 1.7013, + "step": 21412 + }, + { + "epoch": 6.57243707796194, + "grad_norm": 0.21074987947940826, + "learning_rate": 2.777696775299753e-05, + "loss": 1.7111, + "step": 21413 + }, + { + "epoch": 6.572744014732965, + "grad_norm": 0.2933674156665802, + "learning_rate": 2.7772515243352525e-05, + "loss": 1.7515, + "step": 21414 + }, + { + "epoch": 6.5730509515039905, + "grad_norm": 0.2100256085395813, + "learning_rate": 2.7768062953372552e-05, + "loss": 1.7425, + "step": 21415 + }, + { + "epoch": 6.573357888275015, + "grad_norm": 0.21765680611133575, + "learning_rate": 2.776361088310161e-05, + "loss": 1.7064, + "step": 21416 + }, + { + "epoch": 6.57366482504604, + "grad_norm": 0.205422043800354, + "learning_rate": 2.7759159032583702e-05, + "loss": 1.7458, + "step": 21417 + }, + { + "epoch": 6.573971761817066, + "grad_norm": 0.2009960114955902, + "learning_rate": 2.775470740186282e-05, + "loss": 1.7111, + "step": 21418 + }, + { + "epoch": 6.574278698588091, + "grad_norm": 0.18974804878234863, + "learning_rate": 2.7750255990982955e-05, + "loss": 1.7385, + "step": 21419 + }, + { + "epoch": 6.574585635359116, + "grad_norm": 0.1784054934978485, + "learning_rate": 2.7745804799988106e-05, + "loss": 1.7129, + "step": 21420 + }, + { + "epoch": 6.574892572130141, + "grad_norm": 0.2047782689332962, + "learning_rate": 2.7741353828922258e-05, + "loss": 1.6972, + "step": 21421 + }, + { + "epoch": 6.575199508901166, + "grad_norm": 0.18886682391166687, + "learning_rate": 2.773690307782939e-05, + "loss": 1.6564, + "step": 21422 + }, + { + "epoch": 6.5755064456721914, + "grad_norm": 0.2088952213525772, + "learning_rate": 2.7732452546753484e-05, + "loss": 1.7309, + "step": 21423 + }, + { + "epoch": 6.575813382443217, + "grad_norm": 0.20526883006095886, + "learning_rate": 2.7728002235738565e-05, + "loss": 1.6811, + "step": 21424 + 
}, + { + "epoch": 6.576120319214242, + "grad_norm": 0.19648446142673492, + "learning_rate": 2.7723552144828545e-05, + "loss": 1.7237, + "step": 21425 + }, + { + "epoch": 6.5764272559852675, + "grad_norm": 0.22405673563480377, + "learning_rate": 2.7719102274067484e-05, + "loss": 1.7454, + "step": 21426 + }, + { + "epoch": 6.576734192756292, + "grad_norm": 0.24119171500205994, + "learning_rate": 2.7714652623499265e-05, + "loss": 1.7106, + "step": 21427 + }, + { + "epoch": 6.577041129527317, + "grad_norm": 0.2127196192741394, + "learning_rate": 2.771020319316794e-05, + "loss": 1.7895, + "step": 21428 + }, + { + "epoch": 6.577348066298343, + "grad_norm": 0.23805706202983856, + "learning_rate": 2.7705753983117443e-05, + "loss": 1.739, + "step": 21429 + }, + { + "epoch": 6.577655003069368, + "grad_norm": 0.24212954938411713, + "learning_rate": 2.7701304993391753e-05, + "loss": 1.683, + "step": 21430 + }, + { + "epoch": 6.577961939840393, + "grad_norm": 0.1946132481098175, + "learning_rate": 2.769685622403484e-05, + "loss": 1.6953, + "step": 21431 + }, + { + "epoch": 6.578268876611418, + "grad_norm": 0.2465951144695282, + "learning_rate": 2.769240767509067e-05, + "loss": 1.6594, + "step": 21432 + }, + { + "epoch": 6.578575813382443, + "grad_norm": 0.17029622197151184, + "learning_rate": 2.76879593466032e-05, + "loss": 1.6977, + "step": 21433 + }, + { + "epoch": 6.578882750153468, + "grad_norm": 0.23793117702007294, + "learning_rate": 2.7683511238616388e-05, + "loss": 1.6709, + "step": 21434 + }, + { + "epoch": 6.579189686924494, + "grad_norm": 0.20149341225624084, + "learning_rate": 2.76790633511742e-05, + "loss": 1.8074, + "step": 21435 + }, + { + "epoch": 6.579496623695519, + "grad_norm": 0.25029948353767395, + "learning_rate": 2.7674615684320593e-05, + "loss": 1.6649, + "step": 21436 + }, + { + "epoch": 6.579803560466544, + "grad_norm": 0.22212490439414978, + "learning_rate": 2.7670168238099515e-05, + "loss": 1.7322, + "step": 21437 + }, + { + "epoch": 
6.580110497237569, + "grad_norm": 0.26087918877601624, + "learning_rate": 2.7665721012554925e-05, + "loss": 1.7285, + "step": 21438 + }, + { + "epoch": 6.580417434008594, + "grad_norm": 0.19286726415157318, + "learning_rate": 2.7661274007730776e-05, + "loss": 1.6912, + "step": 21439 + }, + { + "epoch": 6.5807243707796195, + "grad_norm": 0.23935118317604065, + "learning_rate": 2.7656827223670982e-05, + "loss": 1.6929, + "step": 21440 + }, + { + "epoch": 6.581031307550645, + "grad_norm": 0.2263423204421997, + "learning_rate": 2.7652380660419563e-05, + "loss": 1.6786, + "step": 21441 + }, + { + "epoch": 6.581338244321669, + "grad_norm": 0.19788038730621338, + "learning_rate": 2.7647934318020373e-05, + "loss": 1.7906, + "step": 21442 + }, + { + "epoch": 6.581645181092695, + "grad_norm": 0.25891759991645813, + "learning_rate": 2.7643488196517435e-05, + "loss": 1.7691, + "step": 21443 + }, + { + "epoch": 6.58195211786372, + "grad_norm": 0.25175485014915466, + "learning_rate": 2.7639042295954615e-05, + "loss": 1.7329, + "step": 21444 + }, + { + "epoch": 6.582259054634745, + "grad_norm": 0.1860336810350418, + "learning_rate": 2.7634596616375908e-05, + "loss": 1.7348, + "step": 21445 + }, + { + "epoch": 6.582565991405771, + "grad_norm": 0.2704271972179413, + "learning_rate": 2.7630151157825218e-05, + "loss": 1.7199, + "step": 21446 + }, + { + "epoch": 6.582872928176796, + "grad_norm": 0.16306720674037933, + "learning_rate": 2.762570592034649e-05, + "loss": 1.7174, + "step": 21447 + }, + { + "epoch": 6.58317986494782, + "grad_norm": 0.2585636079311371, + "learning_rate": 2.7621260903983648e-05, + "loss": 1.7392, + "step": 21448 + }, + { + "epoch": 6.583486801718846, + "grad_norm": 0.2086072564125061, + "learning_rate": 2.7616816108780623e-05, + "loss": 1.7417, + "step": 21449 + }, + { + "epoch": 6.583793738489871, + "grad_norm": 0.1747613251209259, + "learning_rate": 2.7612371534781343e-05, + "loss": 1.6607, + "step": 21450 + }, + { + "epoch": 6.584100675260896, + 
"grad_norm": 0.21026404201984406, + "learning_rate": 2.7607927182029726e-05, + "loss": 1.7725, + "step": 21451 + }, + { + "epoch": 6.584407612031922, + "grad_norm": 0.17881789803504944, + "learning_rate": 2.76034830505697e-05, + "loss": 1.7502, + "step": 21452 + }, + { + "epoch": 6.584714548802946, + "grad_norm": 0.2503713369369507, + "learning_rate": 2.7599039140445182e-05, + "loss": 1.798, + "step": 21453 + }, + { + "epoch": 6.5850214855739715, + "grad_norm": 0.22163939476013184, + "learning_rate": 2.7594595451700083e-05, + "loss": 1.725, + "step": 21454 + }, + { + "epoch": 6.585328422344997, + "grad_norm": 0.2154664546251297, + "learning_rate": 2.759015198437833e-05, + "loss": 1.7917, + "step": 21455 + }, + { + "epoch": 6.585635359116022, + "grad_norm": 0.1814090609550476, + "learning_rate": 2.7585708738523823e-05, + "loss": 1.6562, + "step": 21456 + }, + { + "epoch": 6.5859422958870475, + "grad_norm": 0.18815121054649353, + "learning_rate": 2.758126571418049e-05, + "loss": 1.6833, + "step": 21457 + }, + { + "epoch": 6.586249232658073, + "grad_norm": 0.19383473694324493, + "learning_rate": 2.757682291139222e-05, + "loss": 1.6987, + "step": 21458 + }, + { + "epoch": 6.586556169429097, + "grad_norm": 0.19574831426143646, + "learning_rate": 2.7572380330202912e-05, + "loss": 1.7231, + "step": 21459 + }, + { + "epoch": 6.586863106200123, + "grad_norm": 0.17509032785892487, + "learning_rate": 2.7567937970656527e-05, + "loss": 1.6452, + "step": 21460 + }, + { + "epoch": 6.587170042971148, + "grad_norm": 0.19439785182476044, + "learning_rate": 2.7563495832796886e-05, + "loss": 1.7168, + "step": 21461 + }, + { + "epoch": 6.587476979742173, + "grad_norm": 0.17384520173072815, + "learning_rate": 2.7559053916667953e-05, + "loss": 1.7128, + "step": 21462 + }, + { + "epoch": 6.587783916513199, + "grad_norm": 0.18308506906032562, + "learning_rate": 2.7554612222313597e-05, + "loss": 1.7184, + "step": 21463 + }, + { + "epoch": 6.588090853284223, + "grad_norm": 
0.20052805542945862, + "learning_rate": 2.7550170749777726e-05, + "loss": 1.7239, + "step": 21464 + }, + { + "epoch": 6.588397790055248, + "grad_norm": 0.21892015635967255, + "learning_rate": 2.7545729499104215e-05, + "loss": 1.7297, + "step": 21465 + }, + { + "epoch": 6.588704726826274, + "grad_norm": 0.19819483160972595, + "learning_rate": 2.7541288470336973e-05, + "loss": 1.7303, + "step": 21466 + }, + { + "epoch": 6.589011663597299, + "grad_norm": 0.24296818673610687, + "learning_rate": 2.7536847663519884e-05, + "loss": 1.8525, + "step": 21467 + }, + { + "epoch": 6.589318600368324, + "grad_norm": 0.1971593201160431, + "learning_rate": 2.753240707869683e-05, + "loss": 1.7396, + "step": 21468 + }, + { + "epoch": 6.58962553713935, + "grad_norm": 0.24418935179710388, + "learning_rate": 2.7527966715911696e-05, + "loss": 1.7414, + "step": 21469 + }, + { + "epoch": 6.589932473910374, + "grad_norm": 0.2193990796804428, + "learning_rate": 2.7523526575208368e-05, + "loss": 1.7243, + "step": 21470 + }, + { + "epoch": 6.5902394106813995, + "grad_norm": 0.23612114787101746, + "learning_rate": 2.7519086656630722e-05, + "loss": 1.7072, + "step": 21471 + }, + { + "epoch": 6.590546347452425, + "grad_norm": 0.22282655537128448, + "learning_rate": 2.751464696022264e-05, + "loss": 1.7423, + "step": 21472 + }, + { + "epoch": 6.59085328422345, + "grad_norm": 0.21411976218223572, + "learning_rate": 2.7510207486027995e-05, + "loss": 1.7397, + "step": 21473 + }, + { + "epoch": 6.5911602209944755, + "grad_norm": 0.2244768589735031, + "learning_rate": 2.7505768234090663e-05, + "loss": 1.6964, + "step": 21474 + }, + { + "epoch": 6.5914671577655, + "grad_norm": 0.2250032275915146, + "learning_rate": 2.7501329204454512e-05, + "loss": 1.7307, + "step": 21475 + }, + { + "epoch": 6.591774094536525, + "grad_norm": 0.2643435299396515, + "learning_rate": 2.7496890397163395e-05, + "loss": 1.7298, + "step": 21476 + }, + { + "epoch": 6.592081031307551, + "grad_norm": 0.2204463928937912, + 
"learning_rate": 2.7492451812261232e-05, + "loss": 1.723, + "step": 21477 + }, + { + "epoch": 6.592387968078576, + "grad_norm": 0.2278377115726471, + "learning_rate": 2.7488013449791816e-05, + "loss": 1.7597, + "step": 21478 + }, + { + "epoch": 6.592694904849601, + "grad_norm": 0.18430690467357635, + "learning_rate": 2.7483575309799086e-05, + "loss": 1.6314, + "step": 21479 + }, + { + "epoch": 6.593001841620627, + "grad_norm": 0.26019781827926636, + "learning_rate": 2.7479137392326827e-05, + "loss": 1.7362, + "step": 21480 + }, + { + "epoch": 6.593308778391651, + "grad_norm": 0.2103995382785797, + "learning_rate": 2.7474699697418936e-05, + "loss": 1.7137, + "step": 21481 + }, + { + "epoch": 6.593615715162676, + "grad_norm": 0.220427006483078, + "learning_rate": 2.747026222511928e-05, + "loss": 1.7323, + "step": 21482 + }, + { + "epoch": 6.593922651933702, + "grad_norm": 0.21523109078407288, + "learning_rate": 2.7465824975471693e-05, + "loss": 1.7572, + "step": 21483 + }, + { + "epoch": 6.594229588704727, + "grad_norm": 0.21639512479305267, + "learning_rate": 2.7461387948520033e-05, + "loss": 1.7275, + "step": 21484 + }, + { + "epoch": 6.5945365254757515, + "grad_norm": 0.2043544203042984, + "learning_rate": 2.7456951144308147e-05, + "loss": 1.7454, + "step": 21485 + }, + { + "epoch": 6.594843462246777, + "grad_norm": 0.17847217619419098, + "learning_rate": 2.7452514562879882e-05, + "loss": 1.7356, + "step": 21486 + }, + { + "epoch": 6.595150399017802, + "grad_norm": 0.20756758749485016, + "learning_rate": 2.744807820427908e-05, + "loss": 1.7557, + "step": 21487 + }, + { + "epoch": 6.5954573357888275, + "grad_norm": 0.23579071462154388, + "learning_rate": 2.744364206854959e-05, + "loss": 1.7855, + "step": 21488 + }, + { + "epoch": 6.595764272559853, + "grad_norm": 0.1947307586669922, + "learning_rate": 2.7439206155735254e-05, + "loss": 1.7105, + "step": 21489 + }, + { + "epoch": 6.596071209330878, + "grad_norm": 0.1900642365217209, + "learning_rate": 
2.74347704658799e-05, + "loss": 1.6692, + "step": 21490 + }, + { + "epoch": 6.596378146101903, + "grad_norm": 0.16756244003772736, + "learning_rate": 2.7430334999027375e-05, + "loss": 1.7175, + "step": 21491 + }, + { + "epoch": 6.596685082872928, + "grad_norm": 0.18581146001815796, + "learning_rate": 2.7425899755221506e-05, + "loss": 1.72, + "step": 21492 + }, + { + "epoch": 6.596992019643953, + "grad_norm": 0.2384853959083557, + "learning_rate": 2.7421464734506107e-05, + "loss": 1.718, + "step": 21493 + }, + { + "epoch": 6.597298956414979, + "grad_norm": 0.16853606700897217, + "learning_rate": 2.7417029936925065e-05, + "loss": 1.6819, + "step": 21494 + }, + { + "epoch": 6.597605893186004, + "grad_norm": 0.2273230254650116, + "learning_rate": 2.741259536252213e-05, + "loss": 1.7158, + "step": 21495 + }, + { + "epoch": 6.597912829957028, + "grad_norm": 0.2291530966758728, + "learning_rate": 2.7408161011341205e-05, + "loss": 1.7804, + "step": 21496 + }, + { + "epoch": 6.598219766728054, + "grad_norm": 0.17676831781864166, + "learning_rate": 2.740372688342604e-05, + "loss": 1.6693, + "step": 21497 + }, + { + "epoch": 6.598526703499079, + "grad_norm": 0.2386767417192459, + "learning_rate": 2.7399292978820508e-05, + "loss": 1.6932, + "step": 21498 + }, + { + "epoch": 6.598833640270104, + "grad_norm": 0.21329782903194427, + "learning_rate": 2.739485929756841e-05, + "loss": 1.7811, + "step": 21499 + }, + { + "epoch": 6.59914057704113, + "grad_norm": 0.19382116198539734, + "learning_rate": 2.7390425839713556e-05, + "loss": 1.7152, + "step": 21500 + }, + { + "epoch": 6.599447513812155, + "grad_norm": 0.1819920688867569, + "learning_rate": 2.738599260529977e-05, + "loss": 1.6571, + "step": 21501 + }, + { + "epoch": 6.5997544505831796, + "grad_norm": 0.19947806000709534, + "learning_rate": 2.738155959437086e-05, + "loss": 1.7138, + "step": 21502 + }, + { + "epoch": 6.600061387354205, + "grad_norm": 0.1851014792919159, + "learning_rate": 2.7377126806970634e-05, + "loss": 
1.7109, + "step": 21503 + }, + { + "epoch": 6.60036832412523, + "grad_norm": 0.20365974307060242, + "learning_rate": 2.7372694243142905e-05, + "loss": 1.7145, + "step": 21504 + }, + { + "epoch": 6.600675260896256, + "grad_norm": 0.2070893943309784, + "learning_rate": 2.736826190293147e-05, + "loss": 1.7172, + "step": 21505 + }, + { + "epoch": 6.600982197667281, + "grad_norm": 0.19077777862548828, + "learning_rate": 2.7363829786380136e-05, + "loss": 1.7059, + "step": 21506 + }, + { + "epoch": 6.601289134438305, + "grad_norm": 0.21168744564056396, + "learning_rate": 2.73593978935327e-05, + "loss": 1.7483, + "step": 21507 + }, + { + "epoch": 6.601596071209331, + "grad_norm": 0.20746631920337677, + "learning_rate": 2.7354966224432965e-05, + "loss": 1.7165, + "step": 21508 + }, + { + "epoch": 6.601903007980356, + "grad_norm": 0.19440631568431854, + "learning_rate": 2.7350534779124732e-05, + "loss": 1.694, + "step": 21509 + }, + { + "epoch": 6.602209944751381, + "grad_norm": 0.20699405670166016, + "learning_rate": 2.7346103557651765e-05, + "loss": 1.7077, + "step": 21510 + }, + { + "epoch": 6.602516881522407, + "grad_norm": 0.19856512546539307, + "learning_rate": 2.7341672560057917e-05, + "loss": 1.77, + "step": 21511 + }, + { + "epoch": 6.602823818293432, + "grad_norm": 0.23978421092033386, + "learning_rate": 2.7337241786386915e-05, + "loss": 1.7531, + "step": 21512 + }, + { + "epoch": 6.6031307550644565, + "grad_norm": 0.1834867000579834, + "learning_rate": 2.73328112366826e-05, + "loss": 1.751, + "step": 21513 + }, + { + "epoch": 6.603437691835482, + "grad_norm": 0.2154606282711029, + "learning_rate": 2.7328380910988694e-05, + "loss": 1.737, + "step": 21514 + }, + { + "epoch": 6.603744628606507, + "grad_norm": 0.20554645359516144, + "learning_rate": 2.7323950809349035e-05, + "loss": 1.7629, + "step": 21515 + }, + { + "epoch": 6.6040515653775325, + "grad_norm": 0.20497548580169678, + "learning_rate": 2.7319520931807386e-05, + "loss": 1.7001, + "step": 21516 + }, + { + 
"epoch": 6.604358502148557, + "grad_norm": 0.18628253042697906, + "learning_rate": 2.7315091278407523e-05, + "loss": 1.7477, + "step": 21517 + }, + { + "epoch": 6.604665438919582, + "grad_norm": 0.20788705348968506, + "learning_rate": 2.731066184919323e-05, + "loss": 1.7185, + "step": 21518 + }, + { + "epoch": 6.604972375690608, + "grad_norm": 0.17834967374801636, + "learning_rate": 2.730623264420827e-05, + "loss": 1.67, + "step": 21519 + }, + { + "epoch": 6.605279312461633, + "grad_norm": 0.2183784693479538, + "learning_rate": 2.7301803663496417e-05, + "loss": 1.6983, + "step": 21520 + }, + { + "epoch": 6.605586249232658, + "grad_norm": 0.1735544204711914, + "learning_rate": 2.7297374907101447e-05, + "loss": 1.7352, + "step": 21521 + }, + { + "epoch": 6.605893186003684, + "grad_norm": 0.2504538893699646, + "learning_rate": 2.729294637506713e-05, + "loss": 1.7332, + "step": 21522 + }, + { + "epoch": 6.606200122774708, + "grad_norm": 0.1801074892282486, + "learning_rate": 2.728851806743722e-05, + "loss": 1.7251, + "step": 21523 + }, + { + "epoch": 6.606507059545733, + "grad_norm": 0.25701379776000977, + "learning_rate": 2.728408998425549e-05, + "loss": 1.732, + "step": 21524 + }, + { + "epoch": 6.606813996316759, + "grad_norm": 0.1801779717206955, + "learning_rate": 2.7279662125565697e-05, + "loss": 1.6793, + "step": 21525 + }, + { + "epoch": 6.607120933087784, + "grad_norm": 0.21244947612285614, + "learning_rate": 2.7275234491411595e-05, + "loss": 1.7493, + "step": 21526 + }, + { + "epoch": 6.607427869858809, + "grad_norm": 0.20944559574127197, + "learning_rate": 2.7270807081836924e-05, + "loss": 1.722, + "step": 21527 + }, + { + "epoch": 6.607734806629834, + "grad_norm": 0.2526783049106598, + "learning_rate": 2.7266379896885508e-05, + "loss": 1.7628, + "step": 21528 + }, + { + "epoch": 6.608041743400859, + "grad_norm": 0.19788937270641327, + "learning_rate": 2.7261952936601002e-05, + "loss": 1.6538, + "step": 21529 + }, + { + "epoch": 6.6083486801718845, + 
"grad_norm": 0.2623229920864105, + "learning_rate": 2.725752620102725e-05, + "loss": 1.7694, + "step": 21530 + }, + { + "epoch": 6.60865561694291, + "grad_norm": 0.21503256261348724, + "learning_rate": 2.7253099690207913e-05, + "loss": 1.7553, + "step": 21531 + }, + { + "epoch": 6.608962553713935, + "grad_norm": 0.2114928811788559, + "learning_rate": 2.724867340418679e-05, + "loss": 1.7067, + "step": 21532 + }, + { + "epoch": 6.6092694904849605, + "grad_norm": 0.17945198714733124, + "learning_rate": 2.7244247343007623e-05, + "loss": 1.7419, + "step": 21533 + }, + { + "epoch": 6.609576427255985, + "grad_norm": 0.19239214062690735, + "learning_rate": 2.7239821506714137e-05, + "loss": 1.7644, + "step": 21534 + }, + { + "epoch": 6.60988336402701, + "grad_norm": 0.22906997799873352, + "learning_rate": 2.7235395895350068e-05, + "loss": 1.8063, + "step": 21535 + }, + { + "epoch": 6.610190300798036, + "grad_norm": 0.1965717375278473, + "learning_rate": 2.7230970508959162e-05, + "loss": 1.7841, + "step": 21536 + }, + { + "epoch": 6.610497237569061, + "grad_norm": 0.19944418966770172, + "learning_rate": 2.7226545347585158e-05, + "loss": 1.7382, + "step": 21537 + }, + { + "epoch": 6.610804174340086, + "grad_norm": 0.17155805230140686, + "learning_rate": 2.722212041127178e-05, + "loss": 1.6621, + "step": 21538 + }, + { + "epoch": 6.611111111111111, + "grad_norm": 0.20459938049316406, + "learning_rate": 2.721769570006275e-05, + "loss": 1.7481, + "step": 21539 + }, + { + "epoch": 6.611418047882136, + "grad_norm": 0.1991354376077652, + "learning_rate": 2.7213271214001813e-05, + "loss": 1.7874, + "step": 21540 + }, + { + "epoch": 6.611724984653161, + "grad_norm": 0.25073128938674927, + "learning_rate": 2.7208846953132682e-05, + "loss": 1.7921, + "step": 21541 + }, + { + "epoch": 6.612031921424187, + "grad_norm": 0.24456258118152618, + "learning_rate": 2.7204422917499085e-05, + "loss": 1.7564, + "step": 21542 + }, + { + "epoch": 6.612338858195212, + "grad_norm": 
0.18416531383991241, + "learning_rate": 2.7199999107144736e-05, + "loss": 1.7247, + "step": 21543 + }, + { + "epoch": 6.612645794966237, + "grad_norm": 0.18439221382141113, + "learning_rate": 2.7195575522113347e-05, + "loss": 1.6607, + "step": 21544 + }, + { + "epoch": 6.612952731737262, + "grad_norm": 0.20334671437740326, + "learning_rate": 2.7191152162448685e-05, + "loss": 1.7487, + "step": 21545 + }, + { + "epoch": 6.613259668508287, + "grad_norm": 0.17871633172035217, + "learning_rate": 2.718672902819438e-05, + "loss": 1.7355, + "step": 21546 + }, + { + "epoch": 6.6135666052793125, + "grad_norm": 0.23006688058376312, + "learning_rate": 2.718230611939424e-05, + "loss": 1.6489, + "step": 21547 + }, + { + "epoch": 6.613873542050338, + "grad_norm": 0.19141538441181183, + "learning_rate": 2.7177883436091877e-05, + "loss": 1.6793, + "step": 21548 + }, + { + "epoch": 6.614180478821363, + "grad_norm": 0.20549756288528442, + "learning_rate": 2.7173460978331068e-05, + "loss": 1.8331, + "step": 21549 + }, + { + "epoch": 6.614487415592388, + "grad_norm": 0.19106455147266388, + "learning_rate": 2.7169038746155495e-05, + "loss": 1.7295, + "step": 21550 + }, + { + "epoch": 6.614794352363413, + "grad_norm": 0.20190143585205078, + "learning_rate": 2.7164616739608866e-05, + "loss": 1.7032, + "step": 21551 + }, + { + "epoch": 6.615101289134438, + "grad_norm": 0.1969708949327469, + "learning_rate": 2.716019495873488e-05, + "loss": 1.6935, + "step": 21552 + }, + { + "epoch": 6.615408225905464, + "grad_norm": 0.23748311400413513, + "learning_rate": 2.7155773403577235e-05, + "loss": 1.7942, + "step": 21553 + }, + { + "epoch": 6.615715162676489, + "grad_norm": 0.29168081283569336, + "learning_rate": 2.715135207417962e-05, + "loss": 1.7121, + "step": 21554 + }, + { + "epoch": 6.616022099447514, + "grad_norm": 0.2428344041109085, + "learning_rate": 2.7146930970585738e-05, + "loss": 1.7287, + "step": 21555 + }, + { + "epoch": 6.616329036218539, + "grad_norm": 0.2520657479763031, + 
"learning_rate": 2.714251009283928e-05, + "loss": 1.8462, + "step": 21556 + }, + { + "epoch": 6.616635972989564, + "grad_norm": 0.2426053285598755, + "learning_rate": 2.713808944098394e-05, + "loss": 1.7094, + "step": 21557 + }, + { + "epoch": 6.616942909760589, + "grad_norm": 0.17593255639076233, + "learning_rate": 2.713366901506339e-05, + "loss": 1.6891, + "step": 21558 + }, + { + "epoch": 6.617249846531615, + "grad_norm": 0.20620940625667572, + "learning_rate": 2.7129248815121332e-05, + "loss": 1.7277, + "step": 21559 + }, + { + "epoch": 6.617556783302639, + "grad_norm": 0.21467719972133636, + "learning_rate": 2.7124828841201445e-05, + "loss": 1.7543, + "step": 21560 + }, + { + "epoch": 6.6178637200736645, + "grad_norm": 0.21372607350349426, + "learning_rate": 2.7120409093347378e-05, + "loss": 1.7207, + "step": 21561 + }, + { + "epoch": 6.61817065684469, + "grad_norm": 0.2123684585094452, + "learning_rate": 2.7115989571602884e-05, + "loss": 1.71, + "step": 21562 + }, + { + "epoch": 6.618477593615715, + "grad_norm": 0.19155478477478027, + "learning_rate": 2.711157027601155e-05, + "loss": 1.7182, + "step": 21563 + }, + { + "epoch": 6.6187845303867405, + "grad_norm": 0.23053184151649475, + "learning_rate": 2.7107151206617136e-05, + "loss": 1.7147, + "step": 21564 + }, + { + "epoch": 6.619091467157766, + "grad_norm": 0.1635691374540329, + "learning_rate": 2.7102732363463235e-05, + "loss": 1.6913, + "step": 21565 + }, + { + "epoch": 6.61939840392879, + "grad_norm": 0.19415298104286194, + "learning_rate": 2.709831374659357e-05, + "loss": 1.6813, + "step": 21566 + }, + { + "epoch": 6.619705340699816, + "grad_norm": 0.19547943770885468, + "learning_rate": 2.709389535605179e-05, + "loss": 1.6988, + "step": 21567 + }, + { + "epoch": 6.620012277470841, + "grad_norm": 0.1921805888414383, + "learning_rate": 2.7089477191881564e-05, + "loss": 1.6931, + "step": 21568 + }, + { + "epoch": 6.620319214241866, + "grad_norm": 0.18463274836540222, + "learning_rate": 
2.7085059254126554e-05, + "loss": 1.7168, + "step": 21569 + }, + { + "epoch": 6.620626151012892, + "grad_norm": 0.2078532725572586, + "learning_rate": 2.7080641542830414e-05, + "loss": 1.7248, + "step": 21570 + }, + { + "epoch": 6.620933087783916, + "grad_norm": 0.18778283894062042, + "learning_rate": 2.7076224058036813e-05, + "loss": 1.6745, + "step": 21571 + }, + { + "epoch": 6.621240024554941, + "grad_norm": 0.26190707087516785, + "learning_rate": 2.70718067997894e-05, + "loss": 1.7317, + "step": 21572 + }, + { + "epoch": 6.621546961325967, + "grad_norm": 0.20449557900428772, + "learning_rate": 2.7067389768131836e-05, + "loss": 1.7167, + "step": 21573 + }, + { + "epoch": 6.621853898096992, + "grad_norm": 0.22722119092941284, + "learning_rate": 2.706297296310776e-05, + "loss": 1.7262, + "step": 21574 + }, + { + "epoch": 6.622160834868017, + "grad_norm": 0.24897173047065735, + "learning_rate": 2.7058556384760825e-05, + "loss": 1.7273, + "step": 21575 + }, + { + "epoch": 6.622467771639043, + "grad_norm": 0.19774340093135834, + "learning_rate": 2.705414003313469e-05, + "loss": 1.6765, + "step": 21576 + }, + { + "epoch": 6.622774708410067, + "grad_norm": 0.2661767303943634, + "learning_rate": 2.7049723908272995e-05, + "loss": 1.7046, + "step": 21577 + }, + { + "epoch": 6.6230816451810925, + "grad_norm": 0.2013266384601593, + "learning_rate": 2.7045308010219356e-05, + "loss": 1.7156, + "step": 21578 + }, + { + "epoch": 6.623388581952118, + "grad_norm": 0.22952915728092194, + "learning_rate": 2.7040892339017475e-05, + "loss": 1.7601, + "step": 21579 + }, + { + "epoch": 6.623695518723143, + "grad_norm": 0.18262411653995514, + "learning_rate": 2.7036476894710916e-05, + "loss": 1.7334, + "step": 21580 + }, + { + "epoch": 6.6240024554941686, + "grad_norm": 0.18907666206359863, + "learning_rate": 2.703206167734339e-05, + "loss": 1.7196, + "step": 21581 + }, + { + "epoch": 6.624309392265193, + "grad_norm": 0.2192571759223938, + "learning_rate": 2.7027646686958453e-05, + 
"loss": 1.7046, + "step": 21582 + }, + { + "epoch": 6.624616329036218, + "grad_norm": 0.165769562125206, + "learning_rate": 2.70232319235998e-05, + "loss": 1.7028, + "step": 21583 + }, + { + "epoch": 6.624923265807244, + "grad_norm": 0.19245828688144684, + "learning_rate": 2.701881738731103e-05, + "loss": 1.7153, + "step": 21584 + }, + { + "epoch": 6.625230202578269, + "grad_norm": 0.17638756334781647, + "learning_rate": 2.7014403078135776e-05, + "loss": 1.7071, + "step": 21585 + }, + { + "epoch": 6.625537139349294, + "grad_norm": 0.17205210030078888, + "learning_rate": 2.700998899611767e-05, + "loss": 1.6706, + "step": 21586 + }, + { + "epoch": 6.62584407612032, + "grad_norm": 0.24107681214809418, + "learning_rate": 2.700557514130032e-05, + "loss": 1.8013, + "step": 21587 + }, + { + "epoch": 6.626151012891344, + "grad_norm": 0.1839917004108429, + "learning_rate": 2.7001161513727358e-05, + "loss": 1.7381, + "step": 21588 + }, + { + "epoch": 6.6264579496623695, + "grad_norm": 0.24043352901935577, + "learning_rate": 2.6996748113442394e-05, + "loss": 1.7523, + "step": 21589 + }, + { + "epoch": 6.626764886433395, + "grad_norm": 0.23488068580627441, + "learning_rate": 2.6992334940489056e-05, + "loss": 1.7587, + "step": 21590 + }, + { + "epoch": 6.62707182320442, + "grad_norm": 0.18784530460834503, + "learning_rate": 2.698792199491094e-05, + "loss": 1.7053, + "step": 21591 + }, + { + "epoch": 6.627378759975445, + "grad_norm": 0.2758429944515228, + "learning_rate": 2.6983509276751673e-05, + "loss": 1.6927, + "step": 21592 + }, + { + "epoch": 6.62768569674647, + "grad_norm": 0.2731272280216217, + "learning_rate": 2.697909678605486e-05, + "loss": 1.7351, + "step": 21593 + }, + { + "epoch": 6.627992633517495, + "grad_norm": 0.24450576305389404, + "learning_rate": 2.6974684522864098e-05, + "loss": 1.7126, + "step": 21594 + }, + { + "epoch": 6.628299570288521, + "grad_norm": 0.21820391714572906, + "learning_rate": 2.6970272487222982e-05, + "loss": 1.7075, + "step": 21595 + }, 
+ { + "epoch": 6.628606507059546, + "grad_norm": 0.23647959530353546, + "learning_rate": 2.696586067917517e-05, + "loss": 1.7369, + "step": 21596 + }, + { + "epoch": 6.628913443830571, + "grad_norm": 0.2665121555328369, + "learning_rate": 2.696144909876419e-05, + "loss": 1.7575, + "step": 21597 + }, + { + "epoch": 6.629220380601596, + "grad_norm": 0.19871680438518524, + "learning_rate": 2.695703774603371e-05, + "loss": 1.7334, + "step": 21598 + }, + { + "epoch": 6.629527317372621, + "grad_norm": 0.2363109588623047, + "learning_rate": 2.6952626621027245e-05, + "loss": 1.6878, + "step": 21599 + }, + { + "epoch": 6.629834254143646, + "grad_norm": 0.21958591043949127, + "learning_rate": 2.694821572378845e-05, + "loss": 1.6828, + "step": 21600 + }, + { + "epoch": 6.630141190914672, + "grad_norm": 0.20437858998775482, + "learning_rate": 2.6943805054360906e-05, + "loss": 1.7138, + "step": 21601 + }, + { + "epoch": 6.630448127685697, + "grad_norm": 0.27741923928260803, + "learning_rate": 2.6939394612788193e-05, + "loss": 1.7506, + "step": 21602 + }, + { + "epoch": 6.6307550644567215, + "grad_norm": 0.1885133981704712, + "learning_rate": 2.6934984399113917e-05, + "loss": 1.7669, + "step": 21603 + }, + { + "epoch": 6.631062001227747, + "grad_norm": 0.19453810155391693, + "learning_rate": 2.6930574413381604e-05, + "loss": 1.6837, + "step": 21604 + }, + { + "epoch": 6.631368937998772, + "grad_norm": 0.1685735285282135, + "learning_rate": 2.6926164655634894e-05, + "loss": 1.7045, + "step": 21605 + }, + { + "epoch": 6.6316758747697975, + "grad_norm": 0.2507462203502655, + "learning_rate": 2.6921755125917347e-05, + "loss": 1.7754, + "step": 21606 + }, + { + "epoch": 6.631982811540823, + "grad_norm": 0.1725471317768097, + "learning_rate": 2.691734582427255e-05, + "loss": 1.7219, + "step": 21607 + }, + { + "epoch": 6.632289748311848, + "grad_norm": 0.2633528709411621, + "learning_rate": 2.6912936750744068e-05, + "loss": 1.7362, + "step": 21608 + }, + { + "epoch": 6.632596685082873, 
+ "grad_norm": 0.1808360069990158, + "learning_rate": 2.6908527905375474e-05, + "loss": 1.7338, + "step": 21609 + }, + { + "epoch": 6.632903621853898, + "grad_norm": 0.16186563670635223, + "learning_rate": 2.6904119288210344e-05, + "loss": 1.6752, + "step": 21610 + }, + { + "epoch": 6.633210558624923, + "grad_norm": 0.1954091340303421, + "learning_rate": 2.689971089929224e-05, + "loss": 1.714, + "step": 21611 + }, + { + "epoch": 6.633517495395949, + "grad_norm": 0.18954069912433624, + "learning_rate": 2.689530273866474e-05, + "loss": 1.7869, + "step": 21612 + }, + { + "epoch": 6.633824432166974, + "grad_norm": 0.182058185338974, + "learning_rate": 2.6890894806371392e-05, + "loss": 1.7708, + "step": 21613 + }, + { + "epoch": 6.634131368937998, + "grad_norm": 0.17313501238822937, + "learning_rate": 2.6886487102455755e-05, + "loss": 1.7064, + "step": 21614 + }, + { + "epoch": 6.634438305709024, + "grad_norm": 0.1732148379087448, + "learning_rate": 2.688207962696143e-05, + "loss": 1.7378, + "step": 21615 + }, + { + "epoch": 6.634745242480049, + "grad_norm": 0.17057274281978607, + "learning_rate": 2.687767237993191e-05, + "loss": 1.671, + "step": 21616 + }, + { + "epoch": 6.635052179251074, + "grad_norm": 0.17723220586776733, + "learning_rate": 2.6873265361410805e-05, + "loss": 1.7179, + "step": 21617 + }, + { + "epoch": 6.6353591160221, + "grad_norm": 0.18634437024593353, + "learning_rate": 2.6868858571441645e-05, + "loss": 1.7355, + "step": 21618 + }, + { + "epoch": 6.635666052793125, + "grad_norm": 0.205010786652565, + "learning_rate": 2.6864452010067985e-05, + "loss": 1.7399, + "step": 21619 + }, + { + "epoch": 6.6359729895641495, + "grad_norm": 0.2071879357099533, + "learning_rate": 2.6860045677333383e-05, + "loss": 1.7199, + "step": 21620 + }, + { + "epoch": 6.636279926335175, + "grad_norm": 0.17309685051441193, + "learning_rate": 2.685563957328134e-05, + "loss": 1.6595, + "step": 21621 + }, + { + "epoch": 6.6365868631062, + "grad_norm": 0.3505750000476837, + 
"learning_rate": 2.685123369795545e-05, + "loss": 1.7601, + "step": 21622 + }, + { + "epoch": 6.6368937998772255, + "grad_norm": 0.19184419512748718, + "learning_rate": 2.684682805139923e-05, + "loss": 1.7225, + "step": 21623 + }, + { + "epoch": 6.637200736648251, + "grad_norm": 0.20142409205436707, + "learning_rate": 2.6842422633656233e-05, + "loss": 1.7201, + "step": 21624 + }, + { + "epoch": 6.637507673419275, + "grad_norm": 0.18348537385463715, + "learning_rate": 2.6838017444769993e-05, + "loss": 1.6983, + "step": 21625 + }, + { + "epoch": 6.637814610190301, + "grad_norm": 0.19275228679180145, + "learning_rate": 2.6833612484784033e-05, + "loss": 1.7028, + "step": 21626 + }, + { + "epoch": 6.638121546961326, + "grad_norm": 0.21269574761390686, + "learning_rate": 2.682920775374189e-05, + "loss": 1.7888, + "step": 21627 + }, + { + "epoch": 6.638428483732351, + "grad_norm": 0.17470422387123108, + "learning_rate": 2.68248032516871e-05, + "loss": 1.7147, + "step": 21628 + }, + { + "epoch": 6.638735420503377, + "grad_norm": 0.15697288513183594, + "learning_rate": 2.6820398978663185e-05, + "loss": 1.6544, + "step": 21629 + }, + { + "epoch": 6.639042357274402, + "grad_norm": 0.18636487424373627, + "learning_rate": 2.6815994934713677e-05, + "loss": 1.721, + "step": 21630 + }, + { + "epoch": 6.639349294045426, + "grad_norm": 0.18091215193271637, + "learning_rate": 2.681159111988208e-05, + "loss": 1.6973, + "step": 21631 + }, + { + "epoch": 6.639656230816452, + "grad_norm": 0.21360217034816742, + "learning_rate": 2.6807187534211965e-05, + "loss": 1.7379, + "step": 21632 + }, + { + "epoch": 6.639963167587477, + "grad_norm": 0.20027592778205872, + "learning_rate": 2.6802784177746777e-05, + "loss": 1.7207, + "step": 21633 + }, + { + "epoch": 6.640270104358502, + "grad_norm": 0.21839644014835358, + "learning_rate": 2.679838105053011e-05, + "loss": 1.715, + "step": 21634 + }, + { + "epoch": 6.640577041129527, + "grad_norm": 0.19237302243709564, + "learning_rate": 
2.6793978152605404e-05, + "loss": 1.7415, + "step": 21635 + }, + { + "epoch": 6.640883977900552, + "grad_norm": 0.1979883313179016, + "learning_rate": 2.678957548401623e-05, + "loss": 1.7005, + "step": 21636 + }, + { + "epoch": 6.6411909146715775, + "grad_norm": 0.21867144107818604, + "learning_rate": 2.678517304480609e-05, + "loss": 1.8008, + "step": 21637 + }, + { + "epoch": 6.641497851442603, + "grad_norm": 0.17232954502105713, + "learning_rate": 2.6780770835018433e-05, + "loss": 1.6867, + "step": 21638 + }, + { + "epoch": 6.641804788213628, + "grad_norm": 0.21535196900367737, + "learning_rate": 2.6776368854696853e-05, + "loss": 1.7545, + "step": 21639 + }, + { + "epoch": 6.6421117249846535, + "grad_norm": 0.18891240656375885, + "learning_rate": 2.6771967103884766e-05, + "loss": 1.7164, + "step": 21640 + }, + { + "epoch": 6.642418661755678, + "grad_norm": 0.2558320462703705, + "learning_rate": 2.6767565582625743e-05, + "loss": 1.8125, + "step": 21641 + }, + { + "epoch": 6.642725598526703, + "grad_norm": 0.20400027930736542, + "learning_rate": 2.6763164290963244e-05, + "loss": 1.7335, + "step": 21642 + }, + { + "epoch": 6.643032535297729, + "grad_norm": 0.21388766169548035, + "learning_rate": 2.6758763228940775e-05, + "loss": 1.7788, + "step": 21643 + }, + { + "epoch": 6.643339472068754, + "grad_norm": 0.20607435703277588, + "learning_rate": 2.6754362396601834e-05, + "loss": 1.7481, + "step": 21644 + }, + { + "epoch": 6.643646408839779, + "grad_norm": 0.1608831286430359, + "learning_rate": 2.6749961793989907e-05, + "loss": 1.6577, + "step": 21645 + }, + { + "epoch": 6.643953345610804, + "grad_norm": 0.19074808061122894, + "learning_rate": 2.6745561421148485e-05, + "loss": 1.7335, + "step": 21646 + }, + { + "epoch": 6.644260282381829, + "grad_norm": 0.16517756879329681, + "learning_rate": 2.6741161278121053e-05, + "loss": 1.6663, + "step": 21647 + }, + { + "epoch": 6.644567219152854, + "grad_norm": 0.18976998329162598, + "learning_rate": 2.673676136495108e-05, + 
"loss": 1.7231, + "step": 21648 + }, + { + "epoch": 6.64487415592388, + "grad_norm": 0.20694875717163086, + "learning_rate": 2.6732361681682106e-05, + "loss": 1.7469, + "step": 21649 + }, + { + "epoch": 6.645181092694905, + "grad_norm": 0.1994311809539795, + "learning_rate": 2.6727962228357533e-05, + "loss": 1.6864, + "step": 21650 + }, + { + "epoch": 6.64548802946593, + "grad_norm": 0.18886511027812958, + "learning_rate": 2.672356300502091e-05, + "loss": 1.6874, + "step": 21651 + }, + { + "epoch": 6.645794966236955, + "grad_norm": 0.2152819186449051, + "learning_rate": 2.6719164011715653e-05, + "loss": 1.7327, + "step": 21652 + }, + { + "epoch": 6.64610190300798, + "grad_norm": 0.20525617897510529, + "learning_rate": 2.6714765248485275e-05, + "loss": 1.7409, + "step": 21653 + }, + { + "epoch": 6.6464088397790055, + "grad_norm": 0.21892790496349335, + "learning_rate": 2.6710366715373254e-05, + "loss": 1.7281, + "step": 21654 + }, + { + "epoch": 6.646715776550031, + "grad_norm": 0.20156462490558624, + "learning_rate": 2.6705968412423e-05, + "loss": 1.7211, + "step": 21655 + }, + { + "epoch": 6.647022713321056, + "grad_norm": 0.19993625581264496, + "learning_rate": 2.670157033967806e-05, + "loss": 1.8058, + "step": 21656 + }, + { + "epoch": 6.647329650092081, + "grad_norm": 0.1970909684896469, + "learning_rate": 2.669717249718182e-05, + "loss": 1.7707, + "step": 21657 + }, + { + "epoch": 6.647636586863106, + "grad_norm": 0.19287796318531036, + "learning_rate": 2.6692774884977796e-05, + "loss": 1.688, + "step": 21658 + }, + { + "epoch": 6.647943523634131, + "grad_norm": 0.17658226191997528, + "learning_rate": 2.668837750310943e-05, + "loss": 1.6936, + "step": 21659 + }, + { + "epoch": 6.648250460405157, + "grad_norm": 0.20234479010105133, + "learning_rate": 2.6683980351620184e-05, + "loss": 1.7069, + "step": 21660 + }, + { + "epoch": 6.648557397176182, + "grad_norm": 0.1957871913909912, + "learning_rate": 2.6679583430553513e-05, + "loss": 1.736, + "step": 21661 + }, + 
{ + "epoch": 6.648864333947207, + "grad_norm": 0.20084553956985474, + "learning_rate": 2.667518673995286e-05, + "loss": 1.7262, + "step": 21662 + }, + { + "epoch": 6.649171270718232, + "grad_norm": 0.18749211728572845, + "learning_rate": 2.667079027986169e-05, + "loss": 1.7127, + "step": 21663 + }, + { + "epoch": 6.649478207489257, + "grad_norm": 0.1747027188539505, + "learning_rate": 2.666639405032344e-05, + "loss": 1.6922, + "step": 21664 + }, + { + "epoch": 6.649785144260282, + "grad_norm": 0.3119397759437561, + "learning_rate": 2.666199805138154e-05, + "loss": 1.7373, + "step": 21665 + }, + { + "epoch": 6.650092081031308, + "grad_norm": 0.25986436009407043, + "learning_rate": 2.6657602283079498e-05, + "loss": 1.7521, + "step": 21666 + }, + { + "epoch": 6.650399017802332, + "grad_norm": 0.20535705983638763, + "learning_rate": 2.6653206745460663e-05, + "loss": 1.7144, + "step": 21667 + }, + { + "epoch": 6.650705954573358, + "grad_norm": 0.20804347097873688, + "learning_rate": 2.6648811438568566e-05, + "loss": 1.7186, + "step": 21668 + }, + { + "epoch": 6.651012891344383, + "grad_norm": 0.20753289759159088, + "learning_rate": 2.6644416362446566e-05, + "loss": 1.7098, + "step": 21669 + }, + { + "epoch": 6.651319828115408, + "grad_norm": 0.18725311756134033, + "learning_rate": 2.6640021517138148e-05, + "loss": 1.7331, + "step": 21670 + }, + { + "epoch": 6.651626764886434, + "grad_norm": 0.1907210648059845, + "learning_rate": 2.663562690268675e-05, + "loss": 1.6677, + "step": 21671 + }, + { + "epoch": 6.651933701657459, + "grad_norm": 0.19124922156333923, + "learning_rate": 2.6631232519135747e-05, + "loss": 1.7337, + "step": 21672 + }, + { + "epoch": 6.652240638428484, + "grad_norm": 0.21045447885990143, + "learning_rate": 2.6626838366528633e-05, + "loss": 1.7028, + "step": 21673 + }, + { + "epoch": 6.652547575199509, + "grad_norm": 0.1891855001449585, + "learning_rate": 2.6622444444908767e-05, + "loss": 1.7247, + "step": 21674 + }, + { + "epoch": 6.652854511970534, 
+ "grad_norm": 0.2236541211605072, + "learning_rate": 2.6618050754319623e-05, + "loss": 1.6986, + "step": 21675 + }, + { + "epoch": 6.653161448741559, + "grad_norm": 0.19088539481163025, + "learning_rate": 2.6613657294804604e-05, + "loss": 1.7118, + "step": 21676 + }, + { + "epoch": 6.653468385512585, + "grad_norm": 0.26210764050483704, + "learning_rate": 2.660926406640714e-05, + "loss": 1.7542, + "step": 21677 + }, + { + "epoch": 6.653775322283609, + "grad_norm": 0.2564029097557068, + "learning_rate": 2.6604871069170632e-05, + "loss": 1.7395, + "step": 21678 + }, + { + "epoch": 6.6540822590546345, + "grad_norm": 0.22974301874637604, + "learning_rate": 2.6600478303138503e-05, + "loss": 1.6905, + "step": 21679 + }, + { + "epoch": 6.65438919582566, + "grad_norm": 0.299772173166275, + "learning_rate": 2.659608576835416e-05, + "loss": 1.7875, + "step": 21680 + }, + { + "epoch": 6.654696132596685, + "grad_norm": 0.26459556818008423, + "learning_rate": 2.6591693464861018e-05, + "loss": 1.7185, + "step": 21681 + }, + { + "epoch": 6.6550030693677105, + "grad_norm": 0.24505311250686646, + "learning_rate": 2.6587301392702457e-05, + "loss": 1.7105, + "step": 21682 + }, + { + "epoch": 6.655310006138736, + "grad_norm": 0.1626308262348175, + "learning_rate": 2.6582909551921953e-05, + "loss": 1.6668, + "step": 21683 + }, + { + "epoch": 6.65561694290976, + "grad_norm": 0.20354291796684265, + "learning_rate": 2.6578517942562813e-05, + "loss": 1.7437, + "step": 21684 + }, + { + "epoch": 6.655923879680786, + "grad_norm": 0.18618443608283997, + "learning_rate": 2.6574126564668532e-05, + "loss": 1.6757, + "step": 21685 + }, + { + "epoch": 6.656230816451811, + "grad_norm": 0.1863735467195511, + "learning_rate": 2.656973541828242e-05, + "loss": 1.6549, + "step": 21686 + }, + { + "epoch": 6.656537753222836, + "grad_norm": 0.2118620127439499, + "learning_rate": 2.6565344503447935e-05, + "loss": 1.6927, + "step": 21687 + }, + { + "epoch": 6.656844689993862, + "grad_norm": 
0.24023136496543884, + "learning_rate": 2.6560953820208478e-05, + "loss": 1.6969, + "step": 21688 + }, + { + "epoch": 6.657151626764886, + "grad_norm": 0.21124204993247986, + "learning_rate": 2.6556563368607368e-05, + "loss": 1.6662, + "step": 21689 + }, + { + "epoch": 6.657458563535911, + "grad_norm": 0.16295355558395386, + "learning_rate": 2.6552173148688075e-05, + "loss": 1.7203, + "step": 21690 + }, + { + "epoch": 6.657765500306937, + "grad_norm": 0.18650858104228973, + "learning_rate": 2.6547783160493916e-05, + "loss": 1.7177, + "step": 21691 + }, + { + "epoch": 6.658072437077962, + "grad_norm": 0.20509213209152222, + "learning_rate": 2.6543393404068328e-05, + "loss": 1.723, + "step": 21692 + }, + { + "epoch": 6.658379373848987, + "grad_norm": 0.20985513925552368, + "learning_rate": 2.6539003879454678e-05, + "loss": 1.6679, + "step": 21693 + }, + { + "epoch": 6.658686310620013, + "grad_norm": 0.19907233119010925, + "learning_rate": 2.6534614586696338e-05, + "loss": 1.7028, + "step": 21694 + }, + { + "epoch": 6.658993247391037, + "grad_norm": 0.21793772280216217, + "learning_rate": 2.6530225525836692e-05, + "loss": 1.7706, + "step": 21695 + }, + { + "epoch": 6.6593001841620625, + "grad_norm": 0.24162191152572632, + "learning_rate": 2.6525836696919117e-05, + "loss": 1.806, + "step": 21696 + }, + { + "epoch": 6.659607120933088, + "grad_norm": 0.1735360324382782, + "learning_rate": 2.652144809998698e-05, + "loss": 1.7047, + "step": 21697 + }, + { + "epoch": 6.659914057704113, + "grad_norm": 0.18471799790859222, + "learning_rate": 2.651705973508365e-05, + "loss": 1.7306, + "step": 21698 + }, + { + "epoch": 6.6602209944751385, + "grad_norm": 0.17422814667224884, + "learning_rate": 2.6512671602252482e-05, + "loss": 1.6666, + "step": 21699 + }, + { + "epoch": 6.660527931246163, + "grad_norm": 0.19209833443164825, + "learning_rate": 2.6508283701536897e-05, + "loss": 1.6966, + "step": 21700 + }, + { + "epoch": 6.660834868017188, + "grad_norm": 0.1902640461921692, + 
"learning_rate": 2.650389603298019e-05, + "loss": 1.7887, + "step": 21701 + }, + { + "epoch": 6.661141804788214, + "grad_norm": 0.18551218509674072, + "learning_rate": 2.6499508596625787e-05, + "loss": 1.6851, + "step": 21702 + }, + { + "epoch": 6.661448741559239, + "grad_norm": 0.2165011614561081, + "learning_rate": 2.6495121392516976e-05, + "loss": 1.7465, + "step": 21703 + }, + { + "epoch": 6.661755678330264, + "grad_norm": 0.22871245443820953, + "learning_rate": 2.6490734420697172e-05, + "loss": 1.7487, + "step": 21704 + }, + { + "epoch": 6.66206261510129, + "grad_norm": 0.21275551617145538, + "learning_rate": 2.6486347681209723e-05, + "loss": 1.7782, + "step": 21705 + }, + { + "epoch": 6.662369551872314, + "grad_norm": 0.2926945984363556, + "learning_rate": 2.6481961174097937e-05, + "loss": 1.7413, + "step": 21706 + }, + { + "epoch": 6.662676488643339, + "grad_norm": 0.17143094539642334, + "learning_rate": 2.6477574899405233e-05, + "loss": 1.6639, + "step": 21707 + }, + { + "epoch": 6.662983425414365, + "grad_norm": 0.22194001078605652, + "learning_rate": 2.647318885717488e-05, + "loss": 1.7035, + "step": 21708 + }, + { + "epoch": 6.66329036218539, + "grad_norm": 0.18232671916484833, + "learning_rate": 2.6468803047450286e-05, + "loss": 1.6977, + "step": 21709 + }, + { + "epoch": 6.6635972989564145, + "grad_norm": 0.2626599371433258, + "learning_rate": 2.6464417470274773e-05, + "loss": 1.7422, + "step": 21710 + }, + { + "epoch": 6.66390423572744, + "grad_norm": 0.2034282237291336, + "learning_rate": 2.6460032125691668e-05, + "loss": 1.7531, + "step": 21711 + }, + { + "epoch": 6.664211172498465, + "grad_norm": 0.2308860868215561, + "learning_rate": 2.645564701374434e-05, + "loss": 1.7271, + "step": 21712 + }, + { + "epoch": 6.6645181092694905, + "grad_norm": 0.2163545936346054, + "learning_rate": 2.64512621344761e-05, + "loss": 1.7632, + "step": 21713 + }, + { + "epoch": 6.664825046040516, + "grad_norm": 0.2566233277320862, + "learning_rate": 
2.644687748793029e-05, + "loss": 1.7573, + "step": 21714 + }, + { + "epoch": 6.665131982811541, + "grad_norm": 0.21093623340129852, + "learning_rate": 2.6442493074150244e-05, + "loss": 1.6703, + "step": 21715 + }, + { + "epoch": 6.665438919582566, + "grad_norm": 0.2083086222410202, + "learning_rate": 2.643810889317927e-05, + "loss": 1.6672, + "step": 21716 + }, + { + "epoch": 6.665745856353591, + "grad_norm": 0.20711155235767365, + "learning_rate": 2.643372494506075e-05, + "loss": 1.7276, + "step": 21717 + }, + { + "epoch": 6.666052793124616, + "grad_norm": 0.18977457284927368, + "learning_rate": 2.6429341229837935e-05, + "loss": 1.7207, + "step": 21718 + }, + { + "epoch": 6.666359729895642, + "grad_norm": 0.28336507081985474, + "learning_rate": 2.6424957747554224e-05, + "loss": 1.7473, + "step": 21719 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.1761232167482376, + "learning_rate": 2.642057449825286e-05, + "loss": 1.7172, + "step": 21720 + }, + { + "epoch": 6.666973603437691, + "grad_norm": 0.21672405302524567, + "learning_rate": 2.6416191481977215e-05, + "loss": 1.6561, + "step": 21721 + }, + { + "epoch": 6.667280540208717, + "grad_norm": 0.226834237575531, + "learning_rate": 2.6411808698770613e-05, + "loss": 1.7315, + "step": 21722 + }, + { + "epoch": 6.667587476979742, + "grad_norm": 0.22553586959838867, + "learning_rate": 2.6407426148676307e-05, + "loss": 1.7301, + "step": 21723 + }, + { + "epoch": 6.667894413750767, + "grad_norm": 0.1913517564535141, + "learning_rate": 2.6403043831737672e-05, + "loss": 1.6739, + "step": 21724 + }, + { + "epoch": 6.668201350521793, + "grad_norm": 0.24560052156448364, + "learning_rate": 2.6398661747997955e-05, + "loss": 1.7347, + "step": 21725 + }, + { + "epoch": 6.668508287292818, + "grad_norm": 0.27361172437667847, + "learning_rate": 2.6394279897500517e-05, + "loss": 1.7713, + "step": 21726 + }, + { + "epoch": 6.6688152240638425, + "grad_norm": 0.21486583352088928, + "learning_rate": 2.6389898280288638e-05, + 
"loss": 1.7504, + "step": 21727 + }, + { + "epoch": 6.669122160834868, + "grad_norm": 0.19056405127048492, + "learning_rate": 2.6385516896405627e-05, + "loss": 1.7457, + "step": 21728 + }, + { + "epoch": 6.669429097605893, + "grad_norm": 0.19316376745700836, + "learning_rate": 2.638113574589478e-05, + "loss": 1.6969, + "step": 21729 + }, + { + "epoch": 6.6697360343769185, + "grad_norm": 0.21700869500637054, + "learning_rate": 2.637675482879939e-05, + "loss": 1.7055, + "step": 21730 + }, + { + "epoch": 6.670042971147944, + "grad_norm": 0.19720883667469025, + "learning_rate": 2.637237414516275e-05, + "loss": 1.7029, + "step": 21731 + }, + { + "epoch": 6.670349907918968, + "grad_norm": 0.16528408229351044, + "learning_rate": 2.6367993695028158e-05, + "loss": 1.6915, + "step": 21732 + }, + { + "epoch": 6.670656844689994, + "grad_norm": 0.19576294720172882, + "learning_rate": 2.636361347843889e-05, + "loss": 1.7034, + "step": 21733 + }, + { + "epoch": 6.670963781461019, + "grad_norm": 0.16859273612499237, + "learning_rate": 2.6359233495438285e-05, + "loss": 1.7114, + "step": 21734 + }, + { + "epoch": 6.671270718232044, + "grad_norm": 0.20480163395404816, + "learning_rate": 2.6354853746069553e-05, + "loss": 1.7304, + "step": 21735 + }, + { + "epoch": 6.67157765500307, + "grad_norm": 0.19104263186454773, + "learning_rate": 2.6350474230376048e-05, + "loss": 1.7026, + "step": 21736 + }, + { + "epoch": 6.671884591774095, + "grad_norm": 0.18243174254894257, + "learning_rate": 2.634609494840098e-05, + "loss": 1.6769, + "step": 21737 + }, + { + "epoch": 6.672191528545119, + "grad_norm": 0.20766063034534454, + "learning_rate": 2.634171590018769e-05, + "loss": 1.7436, + "step": 21738 + }, + { + "epoch": 6.672498465316145, + "grad_norm": 0.22035297751426697, + "learning_rate": 2.6337337085779444e-05, + "loss": 1.8211, + "step": 21739 + }, + { + "epoch": 6.67280540208717, + "grad_norm": 0.18965984880924225, + "learning_rate": 2.6332958505219475e-05, + "loss": 1.7067, + "step": 
21740 + }, + { + "epoch": 6.673112338858195, + "grad_norm": 0.21209993958473206, + "learning_rate": 2.632858015855111e-05, + "loss": 1.7743, + "step": 21741 + }, + { + "epoch": 6.67341927562922, + "grad_norm": 0.18409015238285065, + "learning_rate": 2.6324202045817547e-05, + "loss": 1.7494, + "step": 21742 + }, + { + "epoch": 6.673726212400245, + "grad_norm": 0.23252969980239868, + "learning_rate": 2.6319824167062125e-05, + "loss": 1.7459, + "step": 21743 + }, + { + "epoch": 6.6740331491712706, + "grad_norm": 0.16296416521072388, + "learning_rate": 2.631544652232808e-05, + "loss": 1.648, + "step": 21744 + }, + { + "epoch": 6.674340085942296, + "grad_norm": 0.2458602488040924, + "learning_rate": 2.631106911165867e-05, + "loss": 1.6847, + "step": 21745 + }, + { + "epoch": 6.674647022713321, + "grad_norm": 0.21203550696372986, + "learning_rate": 2.6306691935097162e-05, + "loss": 1.713, + "step": 21746 + }, + { + "epoch": 6.6749539594843466, + "grad_norm": 0.19969885051250458, + "learning_rate": 2.6302314992686804e-05, + "loss": 1.7445, + "step": 21747 + }, + { + "epoch": 6.675260896255372, + "grad_norm": 0.21001017093658447, + "learning_rate": 2.629793828447087e-05, + "loss": 1.703, + "step": 21748 + }, + { + "epoch": 6.675567833026396, + "grad_norm": 0.18607214093208313, + "learning_rate": 2.6293561810492595e-05, + "loss": 1.6765, + "step": 21749 + }, + { + "epoch": 6.675874769797422, + "grad_norm": 0.21806176006793976, + "learning_rate": 2.6289185570795223e-05, + "loss": 1.7099, + "step": 21750 + }, + { + "epoch": 6.676181706568447, + "grad_norm": 0.1861930787563324, + "learning_rate": 2.6284809565422052e-05, + "loss": 1.6978, + "step": 21751 + }, + { + "epoch": 6.676488643339472, + "grad_norm": 0.18779867887496948, + "learning_rate": 2.6280433794416254e-05, + "loss": 1.7132, + "step": 21752 + }, + { + "epoch": 6.676795580110497, + "grad_norm": 0.18255293369293213, + "learning_rate": 2.627605825782115e-05, + "loss": 1.7045, + "step": 21753 + }, + { + "epoch": 
6.677102516881522, + "grad_norm": 0.22258871793746948, + "learning_rate": 2.6271682955679904e-05, + "loss": 1.7159, + "step": 21754 + }, + { + "epoch": 6.6774094536525475, + "grad_norm": 0.17425768077373505, + "learning_rate": 2.626730788803582e-05, + "loss": 1.6571, + "step": 21755 + }, + { + "epoch": 6.677716390423573, + "grad_norm": 0.1921091377735138, + "learning_rate": 2.6262933054932122e-05, + "loss": 1.8178, + "step": 21756 + }, + { + "epoch": 6.678023327194598, + "grad_norm": 0.16262951493263245, + "learning_rate": 2.6258558456411996e-05, + "loss": 1.6586, + "step": 21757 + }, + { + "epoch": 6.6783302639656235, + "grad_norm": 0.1853780597448349, + "learning_rate": 2.6254184092518752e-05, + "loss": 1.7116, + "step": 21758 + }, + { + "epoch": 6.678637200736648, + "grad_norm": 0.17973974347114563, + "learning_rate": 2.6249809963295536e-05, + "loss": 1.7317, + "step": 21759 + }, + { + "epoch": 6.678944137507673, + "grad_norm": 0.21258050203323364, + "learning_rate": 2.6245436068785634e-05, + "loss": 1.7852, + "step": 21760 + }, + { + "epoch": 6.679251074278699, + "grad_norm": 0.18741287291049957, + "learning_rate": 2.6241062409032262e-05, + "loss": 1.7071, + "step": 21761 + }, + { + "epoch": 6.679558011049724, + "grad_norm": 0.20436155796051025, + "learning_rate": 2.623668898407864e-05, + "loss": 1.7683, + "step": 21762 + }, + { + "epoch": 6.679864947820749, + "grad_norm": 0.18840116262435913, + "learning_rate": 2.6232315793967977e-05, + "loss": 1.7335, + "step": 21763 + }, + { + "epoch": 6.680171884591774, + "grad_norm": 0.1968357264995575, + "learning_rate": 2.62279428387435e-05, + "loss": 1.6848, + "step": 21764 + }, + { + "epoch": 6.680478821362799, + "grad_norm": 0.1774388998746872, + "learning_rate": 2.622357011844844e-05, + "loss": 1.6943, + "step": 21765 + }, + { + "epoch": 6.680785758133824, + "grad_norm": 0.2424328327178955, + "learning_rate": 2.621919763312598e-05, + "loss": 1.7479, + "step": 21766 + }, + { + "epoch": 6.68109269490485, + "grad_norm": 
0.21220771968364716, + "learning_rate": 2.6214825382819353e-05, + "loss": 1.7384, + "step": 21767 + }, + { + "epoch": 6.681399631675875, + "grad_norm": 0.23322279751300812, + "learning_rate": 2.6210453367571764e-05, + "loss": 1.6625, + "step": 21768 + }, + { + "epoch": 6.6817065684469, + "grad_norm": 0.1726260483264923, + "learning_rate": 2.620608158742639e-05, + "loss": 1.7055, + "step": 21769 + }, + { + "epoch": 6.682013505217925, + "grad_norm": 0.25436410307884216, + "learning_rate": 2.6201710042426512e-05, + "loss": 1.7449, + "step": 21770 + }, + { + "epoch": 6.68232044198895, + "grad_norm": 0.20275171101093292, + "learning_rate": 2.619733873261524e-05, + "loss": 1.7575, + "step": 21771 + }, + { + "epoch": 6.6826273787599755, + "grad_norm": 0.24221903085708618, + "learning_rate": 2.6192967658035846e-05, + "loss": 1.7312, + "step": 21772 + }, + { + "epoch": 6.682934315531001, + "grad_norm": 0.30804362893104553, + "learning_rate": 2.6188596818731507e-05, + "loss": 1.7669, + "step": 21773 + }, + { + "epoch": 6.683241252302026, + "grad_norm": 0.1818273365497589, + "learning_rate": 2.6184226214745377e-05, + "loss": 1.7102, + "step": 21774 + }, + { + "epoch": 6.683548189073051, + "grad_norm": 0.28026455640792847, + "learning_rate": 2.6179855846120727e-05, + "loss": 1.7313, + "step": 21775 + }, + { + "epoch": 6.683855125844076, + "grad_norm": 0.26503586769104004, + "learning_rate": 2.6175485712900655e-05, + "loss": 1.7622, + "step": 21776 + }, + { + "epoch": 6.684162062615101, + "grad_norm": 0.19122248888015747, + "learning_rate": 2.6171115815128423e-05, + "loss": 1.7347, + "step": 21777 + }, + { + "epoch": 6.684468999386127, + "grad_norm": 0.18789063394069672, + "learning_rate": 2.6166746152847187e-05, + "loss": 1.7158, + "step": 21778 + }, + { + "epoch": 6.684775936157152, + "grad_norm": 0.17315362393856049, + "learning_rate": 2.6162376726100135e-05, + "loss": 1.6561, + "step": 21779 + }, + { + "epoch": 6.685082872928177, + "grad_norm": 0.20659680664539337, + 
"learning_rate": 2.615800753493045e-05, + "loss": 1.7063, + "step": 21780 + }, + { + "epoch": 6.685389809699202, + "grad_norm": 0.2051183432340622, + "learning_rate": 2.6153638579381307e-05, + "loss": 1.7213, + "step": 21781 + }, + { + "epoch": 6.685696746470227, + "grad_norm": 0.23349207639694214, + "learning_rate": 2.6149269859495884e-05, + "loss": 1.7453, + "step": 21782 + }, + { + "epoch": 6.686003683241252, + "grad_norm": 0.1979275941848755, + "learning_rate": 2.6144901375317355e-05, + "loss": 1.7482, + "step": 21783 + }, + { + "epoch": 6.686310620012278, + "grad_norm": 0.2742067873477936, + "learning_rate": 2.61405331268889e-05, + "loss": 1.7114, + "step": 21784 + }, + { + "epoch": 6.686617556783302, + "grad_norm": 0.18656300008296967, + "learning_rate": 2.6136165114253675e-05, + "loss": 1.7114, + "step": 21785 + }, + { + "epoch": 6.6869244935543275, + "grad_norm": 0.19345268607139587, + "learning_rate": 2.6131797337454834e-05, + "loss": 1.6818, + "step": 21786 + }, + { + "epoch": 6.687231430325353, + "grad_norm": 0.2194962054491043, + "learning_rate": 2.6127429796535597e-05, + "loss": 1.7519, + "step": 21787 + }, + { + "epoch": 6.687538367096378, + "grad_norm": 0.21714645624160767, + "learning_rate": 2.6123062491539054e-05, + "loss": 1.7334, + "step": 21788 + }, + { + "epoch": 6.6878453038674035, + "grad_norm": 0.1684521585702896, + "learning_rate": 2.6118695422508444e-05, + "loss": 1.6843, + "step": 21789 + }, + { + "epoch": 6.688152240638429, + "grad_norm": 0.16155442595481873, + "learning_rate": 2.6114328589486865e-05, + "loss": 1.6541, + "step": 21790 + }, + { + "epoch": 6.688459177409453, + "grad_norm": 0.18483634293079376, + "learning_rate": 2.6109961992517462e-05, + "loss": 1.688, + "step": 21791 + }, + { + "epoch": 6.688766114180479, + "grad_norm": 0.23146624863147736, + "learning_rate": 2.6105595631643466e-05, + "loss": 1.8006, + "step": 21792 + }, + { + "epoch": 6.689073050951504, + "grad_norm": 0.1852748543024063, + "learning_rate": 
2.6101229506907937e-05, + "loss": 1.6624, + "step": 21793 + }, + { + "epoch": 6.689379987722529, + "grad_norm": 0.23809482157230377, + "learning_rate": 2.6096863618354105e-05, + "loss": 1.7313, + "step": 21794 + }, + { + "epoch": 6.689686924493555, + "grad_norm": 0.17145361006259918, + "learning_rate": 2.609249796602503e-05, + "loss": 1.6966, + "step": 21795 + }, + { + "epoch": 6.689993861264579, + "grad_norm": 0.1842796355485916, + "learning_rate": 2.6088132549963933e-05, + "loss": 1.6871, + "step": 21796 + }, + { + "epoch": 6.690300798035604, + "grad_norm": 0.1810201108455658, + "learning_rate": 2.608376737021392e-05, + "loss": 1.7509, + "step": 21797 + }, + { + "epoch": 6.69060773480663, + "grad_norm": 0.20428195595741272, + "learning_rate": 2.607940242681814e-05, + "loss": 1.7102, + "step": 21798 + }, + { + "epoch": 6.690914671577655, + "grad_norm": 0.1659073680639267, + "learning_rate": 2.6075037719819716e-05, + "loss": 1.7053, + "step": 21799 + }, + { + "epoch": 6.69122160834868, + "grad_norm": 0.19351087510585785, + "learning_rate": 2.60706732492618e-05, + "loss": 1.6847, + "step": 21800 + }, + { + "epoch": 6.691528545119706, + "grad_norm": 0.1734616905450821, + "learning_rate": 2.6066309015187517e-05, + "loss": 1.6989, + "step": 21801 + }, + { + "epoch": 6.69183548189073, + "grad_norm": 0.1863887459039688, + "learning_rate": 2.6061945017639995e-05, + "loss": 1.665, + "step": 21802 + }, + { + "epoch": 6.6921424186617555, + "grad_norm": 0.20225204527378082, + "learning_rate": 2.6057581256662344e-05, + "loss": 1.718, + "step": 21803 + }, + { + "epoch": 6.692449355432781, + "grad_norm": 0.22148309648036957, + "learning_rate": 2.605321773229774e-05, + "loss": 1.7801, + "step": 21804 + }, + { + "epoch": 6.692756292203806, + "grad_norm": 0.1870507448911667, + "learning_rate": 2.6048854444589242e-05, + "loss": 1.6613, + "step": 21805 + }, + { + "epoch": 6.6930632289748315, + "grad_norm": 0.18597224354743958, + "learning_rate": 2.604449139358004e-05, + "loss": 
1.7284, + "step": 21806 + }, + { + "epoch": 6.693370165745856, + "grad_norm": 0.2082163542509079, + "learning_rate": 2.6040128579313193e-05, + "loss": 1.7456, + "step": 21807 + }, + { + "epoch": 6.693677102516881, + "grad_norm": 0.22506757080554962, + "learning_rate": 2.603576600183183e-05, + "loss": 1.7369, + "step": 21808 + }, + { + "epoch": 6.693984039287907, + "grad_norm": 0.20707464218139648, + "learning_rate": 2.60314036611791e-05, + "loss": 1.7176, + "step": 21809 + }, + { + "epoch": 6.694290976058932, + "grad_norm": 0.2306852787733078, + "learning_rate": 2.6027041557398053e-05, + "loss": 1.7582, + "step": 21810 + }, + { + "epoch": 6.694597912829957, + "grad_norm": 0.23120234906673431, + "learning_rate": 2.602267969053187e-05, + "loss": 1.7169, + "step": 21811 + }, + { + "epoch": 6.694904849600983, + "grad_norm": 0.24841509759426117, + "learning_rate": 2.6018318060623582e-05, + "loss": 1.7636, + "step": 21812 + }, + { + "epoch": 6.695211786372007, + "grad_norm": 0.22443681955337524, + "learning_rate": 2.601395666771635e-05, + "loss": 1.7465, + "step": 21813 + }, + { + "epoch": 6.695518723143032, + "grad_norm": 0.2905699908733368, + "learning_rate": 2.6009595511853257e-05, + "loss": 1.779, + "step": 21814 + }, + { + "epoch": 6.695825659914058, + "grad_norm": 0.18677717447280884, + "learning_rate": 2.60052345930774e-05, + "loss": 1.711, + "step": 21815 + }, + { + "epoch": 6.696132596685083, + "grad_norm": 0.2150946855545044, + "learning_rate": 2.6000873911431883e-05, + "loss": 1.7254, + "step": 21816 + }, + { + "epoch": 6.696439533456108, + "grad_norm": 0.20066408812999725, + "learning_rate": 2.5996513466959794e-05, + "loss": 1.7198, + "step": 21817 + }, + { + "epoch": 6.696746470227133, + "grad_norm": 0.23815886676311493, + "learning_rate": 2.5992153259704228e-05, + "loss": 1.749, + "step": 21818 + }, + { + "epoch": 6.697053406998158, + "grad_norm": 0.2067428082227707, + "learning_rate": 2.5987793289708273e-05, + "loss": 1.736, + "step": 21819 + }, + { + 
"epoch": 6.6973603437691835, + "grad_norm": 0.2126816362142563, + "learning_rate": 2.5983433557015e-05, + "loss": 1.6804, + "step": 21820 + }, + { + "epoch": 6.697667280540209, + "grad_norm": 0.2003033310174942, + "learning_rate": 2.597907406166756e-05, + "loss": 1.7303, + "step": 21821 + }, + { + "epoch": 6.697974217311234, + "grad_norm": 0.238821879029274, + "learning_rate": 2.5974714803708946e-05, + "loss": 1.7399, + "step": 21822 + }, + { + "epoch": 6.6982811540822595, + "grad_norm": 0.21327996253967285, + "learning_rate": 2.597035578318231e-05, + "loss": 1.766, + "step": 21823 + }, + { + "epoch": 6.698588090853284, + "grad_norm": 0.19689476490020752, + "learning_rate": 2.5965997000130694e-05, + "loss": 1.7621, + "step": 21824 + }, + { + "epoch": 6.698895027624309, + "grad_norm": 0.18349261581897736, + "learning_rate": 2.5961638454597158e-05, + "loss": 1.6339, + "step": 21825 + }, + { + "epoch": 6.699201964395335, + "grad_norm": 0.21475930511951447, + "learning_rate": 2.595728014662484e-05, + "loss": 1.6973, + "step": 21826 + }, + { + "epoch": 6.69950890116636, + "grad_norm": 0.2711705267429352, + "learning_rate": 2.5952922076256737e-05, + "loss": 1.7801, + "step": 21827 + }, + { + "epoch": 6.699815837937384, + "grad_norm": 0.2601792514324188, + "learning_rate": 2.5948564243535988e-05, + "loss": 1.7508, + "step": 21828 + }, + { + "epoch": 6.70012277470841, + "grad_norm": 0.206949844956398, + "learning_rate": 2.5944206648505586e-05, + "loss": 1.7853, + "step": 21829 + }, + { + "epoch": 6.700429711479435, + "grad_norm": 0.25003641843795776, + "learning_rate": 2.5939849291208653e-05, + "loss": 1.766, + "step": 21830 + }, + { + "epoch": 6.7007366482504604, + "grad_norm": 0.25864318013191223, + "learning_rate": 2.593549217168823e-05, + "loss": 1.7778, + "step": 21831 + }, + { + "epoch": 6.701043585021486, + "grad_norm": 0.20212729275226593, + "learning_rate": 2.593113528998738e-05, + "loss": 1.7249, + "step": 21832 + }, + { + "epoch": 6.701350521792511, + 
"grad_norm": 0.2518431842327118, + "learning_rate": 2.5926778646149154e-05, + "loss": 1.7466, + "step": 21833 + }, + { + "epoch": 6.701657458563536, + "grad_norm": 0.24284590780735016, + "learning_rate": 2.5922422240216614e-05, + "loss": 1.8309, + "step": 21834 + }, + { + "epoch": 6.701964395334561, + "grad_norm": 0.21829955279827118, + "learning_rate": 2.5918066072232817e-05, + "loss": 1.7458, + "step": 21835 + }, + { + "epoch": 6.702271332105586, + "grad_norm": 0.2842165231704712, + "learning_rate": 2.5913710142240792e-05, + "loss": 1.7379, + "step": 21836 + }, + { + "epoch": 6.702578268876612, + "grad_norm": 0.19648514688014984, + "learning_rate": 2.590935445028359e-05, + "loss": 1.7141, + "step": 21837 + }, + { + "epoch": 6.702885205647637, + "grad_norm": 0.24336646497249603, + "learning_rate": 2.5904998996404305e-05, + "loss": 1.6719, + "step": 21838 + }, + { + "epoch": 6.703192142418661, + "grad_norm": 0.17288628220558167, + "learning_rate": 2.5900643780645905e-05, + "loss": 1.6982, + "step": 21839 + }, + { + "epoch": 6.703499079189687, + "grad_norm": 0.24906334280967712, + "learning_rate": 2.5896288803051505e-05, + "loss": 1.6873, + "step": 21840 + }, + { + "epoch": 6.703806015960712, + "grad_norm": 0.2177029550075531, + "learning_rate": 2.5891934063664085e-05, + "loss": 1.6884, + "step": 21841 + }, + { + "epoch": 6.704112952731737, + "grad_norm": 0.20478956401348114, + "learning_rate": 2.5887579562526688e-05, + "loss": 1.7342, + "step": 21842 + }, + { + "epoch": 6.704419889502763, + "grad_norm": 0.26212164759635925, + "learning_rate": 2.58832252996824e-05, + "loss": 1.7304, + "step": 21843 + }, + { + "epoch": 6.704726826273788, + "grad_norm": 0.2049340009689331, + "learning_rate": 2.587887127517418e-05, + "loss": 1.7472, + "step": 21844 + }, + { + "epoch": 6.7050337630448125, + "grad_norm": 0.2453075796365738, + "learning_rate": 2.587451748904512e-05, + "loss": 1.7443, + "step": 21845 + }, + { + "epoch": 6.705340699815838, + "grad_norm": 
0.19545187056064606, + "learning_rate": 2.5870163941338188e-05, + "loss": 1.7328, + "step": 21846 + }, + { + "epoch": 6.705647636586863, + "grad_norm": 0.24424482882022858, + "learning_rate": 2.5865810632096456e-05, + "loss": 1.6876, + "step": 21847 + }, + { + "epoch": 6.7059545733578885, + "grad_norm": 0.2150830626487732, + "learning_rate": 2.5861457561362922e-05, + "loss": 1.7272, + "step": 21848 + }, + { + "epoch": 6.706261510128914, + "grad_norm": 0.2632520794868469, + "learning_rate": 2.5857104729180626e-05, + "loss": 1.7542, + "step": 21849 + }, + { + "epoch": 6.706568446899938, + "grad_norm": 0.21789421141147614, + "learning_rate": 2.5852752135592563e-05, + "loss": 1.6856, + "step": 21850 + }, + { + "epoch": 6.706875383670964, + "grad_norm": 0.2227005511522293, + "learning_rate": 2.5848399780641758e-05, + "loss": 1.7473, + "step": 21851 + }, + { + "epoch": 6.707182320441989, + "grad_norm": 0.23424866795539856, + "learning_rate": 2.5844047664371218e-05, + "loss": 1.7016, + "step": 21852 + }, + { + "epoch": 6.707489257213014, + "grad_norm": 0.2125028669834137, + "learning_rate": 2.5839695786823964e-05, + "loss": 1.8296, + "step": 21853 + }, + { + "epoch": 6.70779619398404, + "grad_norm": 0.2533423900604248, + "learning_rate": 2.5835344148042972e-05, + "loss": 1.7237, + "step": 21854 + }, + { + "epoch": 6.708103130755065, + "grad_norm": 0.1951744705438614, + "learning_rate": 2.583099274807132e-05, + "loss": 1.6685, + "step": 21855 + }, + { + "epoch": 6.708410067526089, + "grad_norm": 0.2564519941806793, + "learning_rate": 2.5826641586951938e-05, + "loss": 1.7542, + "step": 21856 + }, + { + "epoch": 6.708717004297115, + "grad_norm": 0.2586502134799957, + "learning_rate": 2.5822290664727856e-05, + "loss": 1.7477, + "step": 21857 + }, + { + "epoch": 6.70902394106814, + "grad_norm": 0.30357107520103455, + "learning_rate": 2.5817939981442062e-05, + "loss": 1.7454, + "step": 21858 + }, + { + "epoch": 6.709330877839165, + "grad_norm": 0.20547500252723694, + 
"learning_rate": 2.5813589537137544e-05, + "loss": 1.7517, + "step": 21859 + }, + { + "epoch": 6.70963781461019, + "grad_norm": 0.2961783707141876, + "learning_rate": 2.5809239331857348e-05, + "loss": 1.698, + "step": 21860 + }, + { + "epoch": 6.709944751381215, + "grad_norm": 0.2062019556760788, + "learning_rate": 2.580488936564439e-05, + "loss": 1.7358, + "step": 21861 + }, + { + "epoch": 6.7102516881522405, + "grad_norm": 0.22287480533123016, + "learning_rate": 2.580053963854173e-05, + "loss": 1.7099, + "step": 21862 + }, + { + "epoch": 6.710558624923266, + "grad_norm": 0.1853112131357193, + "learning_rate": 2.579619015059229e-05, + "loss": 1.7493, + "step": 21863 + }, + { + "epoch": 6.710865561694291, + "grad_norm": 0.24855247139930725, + "learning_rate": 2.5791840901839105e-05, + "loss": 1.7248, + "step": 21864 + }, + { + "epoch": 6.7111724984653165, + "grad_norm": 0.18156948685646057, + "learning_rate": 2.5787491892325126e-05, + "loss": 1.6744, + "step": 21865 + }, + { + "epoch": 6.711479435236341, + "grad_norm": 0.3272082209587097, + "learning_rate": 2.5783143122093357e-05, + "loss": 1.7546, + "step": 21866 + }, + { + "epoch": 6.711786372007366, + "grad_norm": 0.2875421643257141, + "learning_rate": 2.577879459118675e-05, + "loss": 1.6477, + "step": 21867 + }, + { + "epoch": 6.712093308778392, + "grad_norm": 0.19682031869888306, + "learning_rate": 2.5774446299648297e-05, + "loss": 1.7455, + "step": 21868 + }, + { + "epoch": 6.712400245549417, + "grad_norm": 0.32829195261001587, + "learning_rate": 2.5770098247520968e-05, + "loss": 1.7817, + "step": 21869 + }, + { + "epoch": 6.712707182320442, + "grad_norm": 0.26227760314941406, + "learning_rate": 2.5765750434847724e-05, + "loss": 1.763, + "step": 21870 + }, + { + "epoch": 6.713014119091467, + "grad_norm": 0.2902637720108032, + "learning_rate": 2.576140286167152e-05, + "loss": 1.7432, + "step": 21871 + }, + { + "epoch": 6.713321055862492, + "grad_norm": 0.2290763407945633, + "learning_rate": 
2.5757055528035377e-05, + "loss": 1.7149, + "step": 21872 + }, + { + "epoch": 6.713627992633517, + "grad_norm": 0.3445907533168793, + "learning_rate": 2.575270843398221e-05, + "loss": 1.7874, + "step": 21873 + }, + { + "epoch": 6.713934929404543, + "grad_norm": 0.1841191053390503, + "learning_rate": 2.574836157955498e-05, + "loss": 1.6954, + "step": 21874 + }, + { + "epoch": 6.714241866175568, + "grad_norm": 0.24168385565280914, + "learning_rate": 2.5744014964796657e-05, + "loss": 1.7153, + "step": 21875 + }, + { + "epoch": 6.714548802946593, + "grad_norm": 0.17855188250541687, + "learning_rate": 2.5739668589750175e-05, + "loss": 1.7329, + "step": 21876 + }, + { + "epoch": 6.714855739717618, + "grad_norm": 0.189789280295372, + "learning_rate": 2.5735322454458554e-05, + "loss": 1.6854, + "step": 21877 + }, + { + "epoch": 6.715162676488643, + "grad_norm": 0.1792519986629486, + "learning_rate": 2.5730976558964647e-05, + "loss": 1.7483, + "step": 21878 + }, + { + "epoch": 6.7154696132596685, + "grad_norm": 0.24460360407829285, + "learning_rate": 2.5726630903311504e-05, + "loss": 1.8337, + "step": 21879 + }, + { + "epoch": 6.715776550030694, + "grad_norm": 0.21612058579921722, + "learning_rate": 2.572228548754198e-05, + "loss": 1.7293, + "step": 21880 + }, + { + "epoch": 6.716083486801719, + "grad_norm": 0.22057892382144928, + "learning_rate": 2.5717940311699078e-05, + "loss": 1.7269, + "step": 21881 + }, + { + "epoch": 6.716390423572744, + "grad_norm": 0.19635777175426483, + "learning_rate": 2.571359537582572e-05, + "loss": 1.6744, + "step": 21882 + }, + { + "epoch": 6.716697360343769, + "grad_norm": 0.20406895875930786, + "learning_rate": 2.570925067996485e-05, + "loss": 1.6866, + "step": 21883 + }, + { + "epoch": 6.717004297114794, + "grad_norm": 0.1942419856786728, + "learning_rate": 2.5704906224159407e-05, + "loss": 1.724, + "step": 21884 + }, + { + "epoch": 6.71731123388582, + "grad_norm": 0.20423445105552673, + "learning_rate": 2.570056200845231e-05, + "loss": 
1.6709, + "step": 21885 + }, + { + "epoch": 6.717618170656845, + "grad_norm": 0.27171632647514343, + "learning_rate": 2.569621803288651e-05, + "loss": 1.7532, + "step": 21886 + }, + { + "epoch": 6.71792510742787, + "grad_norm": 0.22753871977329254, + "learning_rate": 2.5691874297504926e-05, + "loss": 1.7534, + "step": 21887 + }, + { + "epoch": 6.718232044198895, + "grad_norm": 0.1907290369272232, + "learning_rate": 2.5687530802350468e-05, + "loss": 1.6696, + "step": 21888 + }, + { + "epoch": 6.71853898096992, + "grad_norm": 0.2226637750864029, + "learning_rate": 2.568318754746612e-05, + "loss": 1.7194, + "step": 21889 + }, + { + "epoch": 6.718845917740945, + "grad_norm": 0.20878726243972778, + "learning_rate": 2.5678844532894742e-05, + "loss": 1.6878, + "step": 21890 + }, + { + "epoch": 6.719152854511971, + "grad_norm": 0.18087267875671387, + "learning_rate": 2.567450175867928e-05, + "loss": 1.7432, + "step": 21891 + }, + { + "epoch": 6.719459791282996, + "grad_norm": 0.19818328320980072, + "learning_rate": 2.567015922486265e-05, + "loss": 1.6959, + "step": 21892 + }, + { + "epoch": 6.7197667280540205, + "grad_norm": 0.19593466818332672, + "learning_rate": 2.566581693148775e-05, + "loss": 1.7357, + "step": 21893 + }, + { + "epoch": 6.720073664825046, + "grad_norm": 0.24518795311450958, + "learning_rate": 2.5661474878597546e-05, + "loss": 1.7948, + "step": 21894 + }, + { + "epoch": 6.720380601596071, + "grad_norm": 0.18471074104309082, + "learning_rate": 2.5657133066234872e-05, + "loss": 1.6983, + "step": 21895 + }, + { + "epoch": 6.7206875383670965, + "grad_norm": 0.20073382556438446, + "learning_rate": 2.5652791494442718e-05, + "loss": 1.7241, + "step": 21896 + }, + { + "epoch": 6.720994475138122, + "grad_norm": 0.21688152849674225, + "learning_rate": 2.5648450163263903e-05, + "loss": 1.7073, + "step": 21897 + }, + { + "epoch": 6.721301411909147, + "grad_norm": 0.17722688615322113, + "learning_rate": 2.5644109072741406e-05, + "loss": 1.7047, + "step": 21898 + }, + 
{ + "epoch": 6.721608348680172, + "grad_norm": 0.2060708999633789, + "learning_rate": 2.5639768222918093e-05, + "loss": 1.7246, + "step": 21899 + }, + { + "epoch": 6.721915285451197, + "grad_norm": 0.26590242981910706, + "learning_rate": 2.563542761383687e-05, + "loss": 1.8141, + "step": 21900 + }, + { + "epoch": 6.722222222222222, + "grad_norm": 0.22498780488967896, + "learning_rate": 2.5631087245540632e-05, + "loss": 1.7211, + "step": 21901 + }, + { + "epoch": 6.722529158993248, + "grad_norm": 0.20546968281269073, + "learning_rate": 2.562674711807227e-05, + "loss": 1.8001, + "step": 21902 + }, + { + "epoch": 6.722836095764272, + "grad_norm": 0.19668535888195038, + "learning_rate": 2.5622407231474683e-05, + "loss": 1.7443, + "step": 21903 + }, + { + "epoch": 6.723143032535297, + "grad_norm": 0.18932129442691803, + "learning_rate": 2.5618067585790752e-05, + "loss": 1.7307, + "step": 21904 + }, + { + "epoch": 6.723449969306323, + "grad_norm": 0.19501622021198273, + "learning_rate": 2.561372818106335e-05, + "loss": 1.7016, + "step": 21905 + }, + { + "epoch": 6.723756906077348, + "grad_norm": 0.21313562989234924, + "learning_rate": 2.5609389017335416e-05, + "loss": 1.8012, + "step": 21906 + }, + { + "epoch": 6.724063842848373, + "grad_norm": 0.174738347530365, + "learning_rate": 2.560505009464978e-05, + "loss": 1.6824, + "step": 21907 + }, + { + "epoch": 6.724370779619399, + "grad_norm": 0.20349650084972382, + "learning_rate": 2.560071141304934e-05, + "loss": 1.7813, + "step": 21908 + }, + { + "epoch": 6.724677716390423, + "grad_norm": 0.21878227591514587, + "learning_rate": 2.5596372972576967e-05, + "loss": 1.8166, + "step": 21909 + }, + { + "epoch": 6.7249846531614486, + "grad_norm": 0.2082633078098297, + "learning_rate": 2.559203477327552e-05, + "loss": 1.7197, + "step": 21910 + }, + { + "epoch": 6.725291589932474, + "grad_norm": 0.17738287150859833, + "learning_rate": 2.558769681518792e-05, + "loss": 1.7093, + "step": 21911 + }, + { + "epoch": 6.725598526703499, + 
"grad_norm": 0.1930074542760849, + "learning_rate": 2.5583359098356986e-05, + "loss": 1.7702, + "step": 21912 + }, + { + "epoch": 6.725905463474525, + "grad_norm": 0.17668531835079193, + "learning_rate": 2.5579021622825638e-05, + "loss": 1.7466, + "step": 21913 + }, + { + "epoch": 6.726212400245549, + "grad_norm": 0.1737186163663864, + "learning_rate": 2.5574684388636677e-05, + "loss": 1.6876, + "step": 21914 + }, + { + "epoch": 6.726519337016574, + "grad_norm": 0.18352502584457397, + "learning_rate": 2.5570347395833018e-05, + "loss": 1.6745, + "step": 21915 + }, + { + "epoch": 6.7268262737876, + "grad_norm": 0.19047673046588898, + "learning_rate": 2.5566010644457506e-05, + "loss": 1.7465, + "step": 21916 + }, + { + "epoch": 6.727133210558625, + "grad_norm": 0.1762397438287735, + "learning_rate": 2.5561674134553005e-05, + "loss": 1.6767, + "step": 21917 + }, + { + "epoch": 6.72744014732965, + "grad_norm": 0.22884784638881683, + "learning_rate": 2.5557337866162358e-05, + "loss": 1.7054, + "step": 21918 + }, + { + "epoch": 6.727747084100676, + "grad_norm": 0.17476098239421844, + "learning_rate": 2.5553001839328417e-05, + "loss": 1.721, + "step": 21919 + }, + { + "epoch": 6.7280540208717, + "grad_norm": 0.1827213317155838, + "learning_rate": 2.554866605409405e-05, + "loss": 1.78, + "step": 21920 + }, + { + "epoch": 6.7283609576427255, + "grad_norm": 0.21709343791007996, + "learning_rate": 2.554433051050209e-05, + "loss": 1.8064, + "step": 21921 + }, + { + "epoch": 6.728667894413751, + "grad_norm": 0.1972692310810089, + "learning_rate": 2.5539995208595398e-05, + "loss": 1.7231, + "step": 21922 + }, + { + "epoch": 6.728974831184776, + "grad_norm": 0.19464808702468872, + "learning_rate": 2.5535660148416802e-05, + "loss": 1.7931, + "step": 21923 + }, + { + "epoch": 6.7292817679558015, + "grad_norm": 0.19610099494457245, + "learning_rate": 2.5531325330009158e-05, + "loss": 1.7467, + "step": 21924 + }, + { + "epoch": 6.729588704726826, + "grad_norm": 0.21104763448238373, + 
"learning_rate": 2.5526990753415292e-05, + "loss": 1.7543, + "step": 21925 + }, + { + "epoch": 6.729895641497851, + "grad_norm": 0.1881588101387024, + "learning_rate": 2.5522656418678047e-05, + "loss": 1.7666, + "step": 21926 + }, + { + "epoch": 6.730202578268877, + "grad_norm": 0.2163291722536087, + "learning_rate": 2.551832232584025e-05, + "loss": 1.7321, + "step": 21927 + }, + { + "epoch": 6.730509515039902, + "grad_norm": 0.19252021610736847, + "learning_rate": 2.551398847494477e-05, + "loss": 1.7287, + "step": 21928 + }, + { + "epoch": 6.730816451810927, + "grad_norm": 0.22602233290672302, + "learning_rate": 2.550965486603437e-05, + "loss": 1.767, + "step": 21929 + }, + { + "epoch": 6.731123388581953, + "grad_norm": 0.21509617567062378, + "learning_rate": 2.5505321499151957e-05, + "loss": 1.7637, + "step": 21930 + }, + { + "epoch": 6.731430325352977, + "grad_norm": 0.24291658401489258, + "learning_rate": 2.5500988374340274e-05, + "loss": 1.7312, + "step": 21931 + }, + { + "epoch": 6.731737262124002, + "grad_norm": 0.26562216877937317, + "learning_rate": 2.5496655491642195e-05, + "loss": 1.7763, + "step": 21932 + }, + { + "epoch": 6.732044198895028, + "grad_norm": 0.19785790145397186, + "learning_rate": 2.5492322851100535e-05, + "loss": 1.6979, + "step": 21933 + }, + { + "epoch": 6.732351135666053, + "grad_norm": 0.20044486224651337, + "learning_rate": 2.5487990452758104e-05, + "loss": 1.7359, + "step": 21934 + }, + { + "epoch": 6.7326580724370775, + "grad_norm": 0.20468659698963165, + "learning_rate": 2.548365829665772e-05, + "loss": 1.6996, + "step": 21935 + }, + { + "epoch": 6.732965009208103, + "grad_norm": 0.16516120731830597, + "learning_rate": 2.5479326382842195e-05, + "loss": 1.717, + "step": 21936 + }, + { + "epoch": 6.733271945979128, + "grad_norm": 0.22404411435127258, + "learning_rate": 2.547499471135433e-05, + "loss": 1.7261, + "step": 21937 + }, + { + "epoch": 6.7335788827501535, + "grad_norm": 0.21485663950443268, + "learning_rate": 
2.547066328223695e-05, + "loss": 1.7463, + "step": 21938 + }, + { + "epoch": 6.733885819521179, + "grad_norm": 0.330018550157547, + "learning_rate": 2.5466332095532853e-05, + "loss": 1.854, + "step": 21939 + }, + { + "epoch": 6.734192756292204, + "grad_norm": 0.25225213170051575, + "learning_rate": 2.5462001151284842e-05, + "loss": 1.722, + "step": 21940 + }, + { + "epoch": 6.734499693063229, + "grad_norm": 0.2422008365392685, + "learning_rate": 2.5457670449535713e-05, + "loss": 1.6996, + "step": 21941 + }, + { + "epoch": 6.734806629834254, + "grad_norm": 0.2421465814113617, + "learning_rate": 2.5453339990328275e-05, + "loss": 1.7014, + "step": 21942 + }, + { + "epoch": 6.735113566605279, + "grad_norm": 0.2520611882209778, + "learning_rate": 2.5449009773705313e-05, + "loss": 1.7149, + "step": 21943 + }, + { + "epoch": 6.735420503376305, + "grad_norm": 0.24940338730812073, + "learning_rate": 2.5444679799709626e-05, + "loss": 1.7423, + "step": 21944 + }, + { + "epoch": 6.73572744014733, + "grad_norm": 0.2328663021326065, + "learning_rate": 2.544035006838401e-05, + "loss": 1.6893, + "step": 21945 + }, + { + "epoch": 6.736034376918354, + "grad_norm": 0.2190757393836975, + "learning_rate": 2.5436020579771226e-05, + "loss": 1.7375, + "step": 21946 + }, + { + "epoch": 6.73634131368938, + "grad_norm": 0.2204900085926056, + "learning_rate": 2.543169133391413e-05, + "loss": 1.6971, + "step": 21947 + }, + { + "epoch": 6.736648250460405, + "grad_norm": 0.29192328453063965, + "learning_rate": 2.5427362330855415e-05, + "loss": 1.7633, + "step": 21948 + }, + { + "epoch": 6.73695518723143, + "grad_norm": 0.19859355688095093, + "learning_rate": 2.542303357063793e-05, + "loss": 1.7515, + "step": 21949 + }, + { + "epoch": 6.737262124002456, + "grad_norm": 0.23010417819023132, + "learning_rate": 2.5418705053304425e-05, + "loss": 1.7282, + "step": 21950 + }, + { + "epoch": 6.737569060773481, + "grad_norm": 0.2168324589729309, + "learning_rate": 2.5414376778897698e-05, + "loss": 1.7347, 
+ "step": 21951 + }, + { + "epoch": 6.7378759975445055, + "grad_norm": 0.2190646231174469, + "learning_rate": 2.54100487474605e-05, + "loss": 1.7893, + "step": 21952 + }, + { + "epoch": 6.738182934315531, + "grad_norm": 0.23925794661045074, + "learning_rate": 2.5405720959035617e-05, + "loss": 1.7825, + "step": 21953 + }, + { + "epoch": 6.738489871086556, + "grad_norm": 0.17987917363643646, + "learning_rate": 2.5401393413665807e-05, + "loss": 1.724, + "step": 21954 + }, + { + "epoch": 6.7387968078575815, + "grad_norm": 0.2300983965396881, + "learning_rate": 2.5397066111393853e-05, + "loss": 1.7023, + "step": 21955 + }, + { + "epoch": 6.739103744628607, + "grad_norm": 0.2128167450428009, + "learning_rate": 2.539273905226251e-05, + "loss": 1.7218, + "step": 21956 + }, + { + "epoch": 6.739410681399631, + "grad_norm": 0.19105537235736847, + "learning_rate": 2.538841223631454e-05, + "loss": 1.7781, + "step": 21957 + }, + { + "epoch": 6.739717618170657, + "grad_norm": 0.22985289990901947, + "learning_rate": 2.5384085663592704e-05, + "loss": 1.7362, + "step": 21958 + }, + { + "epoch": 6.740024554941682, + "grad_norm": 0.18608705699443817, + "learning_rate": 2.5379759334139768e-05, + "loss": 1.7174, + "step": 21959 + }, + { + "epoch": 6.740331491712707, + "grad_norm": 0.2659450173377991, + "learning_rate": 2.5375433247998482e-05, + "loss": 1.8118, + "step": 21960 + }, + { + "epoch": 6.740638428483733, + "grad_norm": 0.1904401034116745, + "learning_rate": 2.537110740521159e-05, + "loss": 1.6789, + "step": 21961 + }, + { + "epoch": 6.740945365254758, + "grad_norm": 0.1826045662164688, + "learning_rate": 2.5366781805821847e-05, + "loss": 1.6906, + "step": 21962 + }, + { + "epoch": 6.741252302025782, + "grad_norm": 0.1919000893831253, + "learning_rate": 2.5362456449871995e-05, + "loss": 1.7412, + "step": 21963 + }, + { + "epoch": 6.741559238796808, + "grad_norm": 0.1921864151954651, + "learning_rate": 2.5358131337404822e-05, + "loss": 1.7023, + "step": 21964 + }, + { + "epoch": 
6.741866175567833, + "grad_norm": 0.1628783494234085, + "learning_rate": 2.5353806468463004e-05, + "loss": 1.6842, + "step": 21965 + }, + { + "epoch": 6.742173112338858, + "grad_norm": 0.19764694571495056, + "learning_rate": 2.534948184308935e-05, + "loss": 1.7238, + "step": 21966 + }, + { + "epoch": 6.742480049109884, + "grad_norm": 0.1845860630273819, + "learning_rate": 2.534515746132653e-05, + "loss": 1.728, + "step": 21967 + }, + { + "epoch": 6.742786985880908, + "grad_norm": 0.20269328355789185, + "learning_rate": 2.5340833323217327e-05, + "loss": 1.7541, + "step": 21968 + }, + { + "epoch": 6.7430939226519335, + "grad_norm": 0.16586242616176605, + "learning_rate": 2.5336509428804468e-05, + "loss": 1.7025, + "step": 21969 + }, + { + "epoch": 6.743400859422959, + "grad_norm": 0.1693086177110672, + "learning_rate": 2.533218577813068e-05, + "loss": 1.6975, + "step": 21970 + }, + { + "epoch": 6.743707796193984, + "grad_norm": 0.2206759750843048, + "learning_rate": 2.5327862371238686e-05, + "loss": 1.764, + "step": 21971 + }, + { + "epoch": 6.7440147329650095, + "grad_norm": 0.1915574073791504, + "learning_rate": 2.532353920817122e-05, + "loss": 1.7576, + "step": 21972 + }, + { + "epoch": 6.744321669736035, + "grad_norm": 0.1741783618927002, + "learning_rate": 2.5319216288971003e-05, + "loss": 1.7394, + "step": 21973 + }, + { + "epoch": 6.744628606507059, + "grad_norm": 0.21624934673309326, + "learning_rate": 2.5314893613680755e-05, + "loss": 1.7358, + "step": 21974 + }, + { + "epoch": 6.744935543278085, + "grad_norm": 0.2350481003522873, + "learning_rate": 2.5310571182343197e-05, + "loss": 1.7801, + "step": 21975 + }, + { + "epoch": 6.74524248004911, + "grad_norm": 0.18618559837341309, + "learning_rate": 2.5306248995001048e-05, + "loss": 1.7012, + "step": 21976 + }, + { + "epoch": 6.745549416820135, + "grad_norm": 0.18479639291763306, + "learning_rate": 2.5301927051697016e-05, + "loss": 1.7238, + "step": 21977 + }, + { + "epoch": 6.74585635359116, + "grad_norm": 
0.19978758692741394, + "learning_rate": 2.5297605352473818e-05, + "loss": 1.6636, + "step": 21978 + }, + { + "epoch": 6.746163290362185, + "grad_norm": 0.23122164607048035, + "learning_rate": 2.529328389737416e-05, + "loss": 1.7455, + "step": 21979 + }, + { + "epoch": 6.74647022713321, + "grad_norm": 0.20423240959644318, + "learning_rate": 2.5288962686440732e-05, + "loss": 1.7516, + "step": 21980 + }, + { + "epoch": 6.746777163904236, + "grad_norm": 0.18271920084953308, + "learning_rate": 2.52846417197163e-05, + "loss": 1.762, + "step": 21981 + }, + { + "epoch": 6.747084100675261, + "grad_norm": 0.19280247390270233, + "learning_rate": 2.528032099724349e-05, + "loss": 1.7298, + "step": 21982 + }, + { + "epoch": 6.747391037446286, + "grad_norm": 0.20908337831497192, + "learning_rate": 2.527600051906507e-05, + "loss": 1.7323, + "step": 21983 + }, + { + "epoch": 6.747697974217311, + "grad_norm": 0.18399856984615326, + "learning_rate": 2.5271680285223663e-05, + "loss": 1.6795, + "step": 21984 + }, + { + "epoch": 6.748004910988336, + "grad_norm": 0.2273191213607788, + "learning_rate": 2.5267360295762033e-05, + "loss": 1.6811, + "step": 21985 + }, + { + "epoch": 6.7483118477593615, + "grad_norm": 0.1844841092824936, + "learning_rate": 2.526304055072284e-05, + "loss": 1.7404, + "step": 21986 + }, + { + "epoch": 6.748618784530387, + "grad_norm": 0.25975871086120605, + "learning_rate": 2.5258721050148775e-05, + "loss": 1.6994, + "step": 21987 + }, + { + "epoch": 6.748925721301412, + "grad_norm": 0.1664818376302719, + "learning_rate": 2.5254401794082532e-05, + "loss": 1.6722, + "step": 21988 + }, + { + "epoch": 6.749232658072437, + "grad_norm": 0.2597639560699463, + "learning_rate": 2.5250082782566796e-05, + "loss": 1.7654, + "step": 21989 + }, + { + "epoch": 6.749539594843462, + "grad_norm": 0.19326356053352356, + "learning_rate": 2.5245764015644248e-05, + "loss": 1.668, + "step": 21990 + }, + { + "epoch": 6.749846531614487, + "grad_norm": 0.22924599051475525, + 
"learning_rate": 2.5241445493357574e-05, + "loss": 1.7522, + "step": 21991 + }, + { + "epoch": 6.750153468385513, + "grad_norm": 0.24588358402252197, + "learning_rate": 2.523712721574944e-05, + "loss": 1.7396, + "step": 21992 + }, + { + "epoch": 6.750460405156538, + "grad_norm": 0.1988971084356308, + "learning_rate": 2.5232809182862526e-05, + "loss": 1.7338, + "step": 21993 + }, + { + "epoch": 6.750767341927563, + "grad_norm": 0.18566425144672394, + "learning_rate": 2.5228491394739518e-05, + "loss": 1.7135, + "step": 21994 + }, + { + "epoch": 6.751074278698588, + "grad_norm": 0.22216622531414032, + "learning_rate": 2.5224173851423073e-05, + "loss": 1.744, + "step": 21995 + }, + { + "epoch": 6.751381215469613, + "grad_norm": 0.18695887923240662, + "learning_rate": 2.5219856552955863e-05, + "loss": 1.7324, + "step": 21996 + }, + { + "epoch": 6.7516881522406385, + "grad_norm": 0.1866987645626068, + "learning_rate": 2.5215539499380535e-05, + "loss": 1.6855, + "step": 21997 + }, + { + "epoch": 6.751995089011664, + "grad_norm": 0.1743573248386383, + "learning_rate": 2.521122269073981e-05, + "loss": 1.6833, + "step": 21998 + }, + { + "epoch": 6.752302025782689, + "grad_norm": 0.2173541784286499, + "learning_rate": 2.5206906127076274e-05, + "loss": 1.7434, + "step": 21999 + }, + { + "epoch": 6.752608962553714, + "grad_norm": 0.17558147013187408, + "learning_rate": 2.5202589808432665e-05, + "loss": 1.6884, + "step": 22000 + }, + { + "epoch": 6.752915899324739, + "grad_norm": 0.16630353033542633, + "learning_rate": 2.5198273734851553e-05, + "loss": 1.7005, + "step": 22001 + }, + { + "epoch": 6.753222836095764, + "grad_norm": 0.1834949105978012, + "learning_rate": 2.519395790637566e-05, + "loss": 1.7123, + "step": 22002 + }, + { + "epoch": 6.75352977286679, + "grad_norm": 0.1806751936674118, + "learning_rate": 2.5189642323047614e-05, + "loss": 1.7305, + "step": 22003 + }, + { + "epoch": 6.753836709637815, + "grad_norm": 0.2350265085697174, + "learning_rate": 
2.5185326984910062e-05, + "loss": 1.772, + "step": 22004 + }, + { + "epoch": 6.75414364640884, + "grad_norm": 0.18105818331241608, + "learning_rate": 2.518101189200566e-05, + "loss": 1.7487, + "step": 22005 + }, + { + "epoch": 6.754450583179865, + "grad_norm": 0.17640845477581024, + "learning_rate": 2.517669704437704e-05, + "loss": 1.7178, + "step": 22006 + }, + { + "epoch": 6.75475751995089, + "grad_norm": 0.21648885309696198, + "learning_rate": 2.5172382442066845e-05, + "loss": 1.7144, + "step": 22007 + }, + { + "epoch": 6.755064456721915, + "grad_norm": 0.2042703926563263, + "learning_rate": 2.5168068085117724e-05, + "loss": 1.7476, + "step": 22008 + }, + { + "epoch": 6.755371393492941, + "grad_norm": 0.24397306144237518, + "learning_rate": 2.5163753973572306e-05, + "loss": 1.7033, + "step": 22009 + }, + { + "epoch": 6.755678330263965, + "grad_norm": 0.2030377835035324, + "learning_rate": 2.5159440107473232e-05, + "loss": 1.7353, + "step": 22010 + }, + { + "epoch": 6.7559852670349905, + "grad_norm": 0.2493598908185959, + "learning_rate": 2.5155126486863127e-05, + "loss": 1.7346, + "step": 22011 + }, + { + "epoch": 6.756292203806016, + "grad_norm": 0.17272062599658966, + "learning_rate": 2.5150813111784627e-05, + "loss": 1.7095, + "step": 22012 + }, + { + "epoch": 6.756599140577041, + "grad_norm": 0.2417706698179245, + "learning_rate": 2.514649998228036e-05, + "loss": 1.7631, + "step": 22013 + }, + { + "epoch": 6.7569060773480665, + "grad_norm": 0.17753612995147705, + "learning_rate": 2.5142187098392915e-05, + "loss": 1.697, + "step": 22014 + }, + { + "epoch": 6.757213014119092, + "grad_norm": 0.2246367186307907, + "learning_rate": 2.5137874460164995e-05, + "loss": 1.7216, + "step": 22015 + }, + { + "epoch": 6.757519950890116, + "grad_norm": 0.24141135811805725, + "learning_rate": 2.5133562067639134e-05, + "loss": 1.7368, + "step": 22016 + }, + { + "epoch": 6.757826887661142, + "grad_norm": 0.21253570914268494, + "learning_rate": 2.5129249920858022e-05, + "loss": 
1.7029, + "step": 22017 + }, + { + "epoch": 6.758133824432167, + "grad_norm": 0.21176676452159882, + "learning_rate": 2.5124938019864198e-05, + "loss": 1.7472, + "step": 22018 + }, + { + "epoch": 6.758440761203192, + "grad_norm": 0.1990927904844284, + "learning_rate": 2.5120626364700338e-05, + "loss": 1.6686, + "step": 22019 + }, + { + "epoch": 6.758747697974218, + "grad_norm": 0.1736145317554474, + "learning_rate": 2.5116314955409038e-05, + "loss": 1.6984, + "step": 22020 + }, + { + "epoch": 6.759054634745242, + "grad_norm": 0.2618037462234497, + "learning_rate": 2.511200379203289e-05, + "loss": 1.7374, + "step": 22021 + }, + { + "epoch": 6.759361571516267, + "grad_norm": 0.25363266468048096, + "learning_rate": 2.5107692874614507e-05, + "loss": 1.7001, + "step": 22022 + }, + { + "epoch": 6.759668508287293, + "grad_norm": 0.20287153124809265, + "learning_rate": 2.51033822031965e-05, + "loss": 1.7704, + "step": 22023 + }, + { + "epoch": 6.759975445058318, + "grad_norm": 0.2401949167251587, + "learning_rate": 2.509907177782146e-05, + "loss": 1.7157, + "step": 22024 + }, + { + "epoch": 6.760282381829343, + "grad_norm": 0.177081897854805, + "learning_rate": 2.5094761598531985e-05, + "loss": 1.7572, + "step": 22025 + }, + { + "epoch": 6.760589318600369, + "grad_norm": 0.2641974687576294, + "learning_rate": 2.5090451665370674e-05, + "loss": 1.725, + "step": 22026 + }, + { + "epoch": 6.760896255371393, + "grad_norm": 0.20262297987937927, + "learning_rate": 2.5086141978380116e-05, + "loss": 1.6591, + "step": 22027 + }, + { + "epoch": 6.7612031921424185, + "grad_norm": 0.19107301533222198, + "learning_rate": 2.5081832537602913e-05, + "loss": 1.6914, + "step": 22028 + }, + { + "epoch": 6.761510128913444, + "grad_norm": 0.28122687339782715, + "learning_rate": 2.5077523343081643e-05, + "loss": 1.7759, + "step": 22029 + }, + { + "epoch": 6.761817065684469, + "grad_norm": 0.16575101017951965, + "learning_rate": 2.5073214394858897e-05, + "loss": 1.6994, + "step": 22030 + }, + { + 
"epoch": 6.7621240024554945, + "grad_norm": 0.26933449506759644, + "learning_rate": 2.506890569297723e-05, + "loss": 1.7565, + "step": 22031 + }, + { + "epoch": 6.762430939226519, + "grad_norm": 0.2452966868877411, + "learning_rate": 2.5064597237479292e-05, + "loss": 1.7442, + "step": 22032 + }, + { + "epoch": 6.762737875997544, + "grad_norm": 0.20781855285167694, + "learning_rate": 2.5060289028407585e-05, + "loss": 1.714, + "step": 22033 + }, + { + "epoch": 6.76304481276857, + "grad_norm": 0.1997823268175125, + "learning_rate": 2.5055981065804756e-05, + "loss": 1.7318, + "step": 22034 + }, + { + "epoch": 6.763351749539595, + "grad_norm": 0.2080194652080536, + "learning_rate": 2.50516733497133e-05, + "loss": 1.7466, + "step": 22035 + }, + { + "epoch": 6.76365868631062, + "grad_norm": 0.17558889091014862, + "learning_rate": 2.504736588017585e-05, + "loss": 1.7049, + "step": 22036 + }, + { + "epoch": 6.763965623081646, + "grad_norm": 0.1999572217464447, + "learning_rate": 2.5043058657234957e-05, + "loss": 1.7121, + "step": 22037 + }, + { + "epoch": 6.76427255985267, + "grad_norm": 0.16219176352024078, + "learning_rate": 2.5038751680933185e-05, + "loss": 1.698, + "step": 22038 + }, + { + "epoch": 6.764579496623695, + "grad_norm": 0.17965151369571686, + "learning_rate": 2.50344449513131e-05, + "loss": 1.7021, + "step": 22039 + }, + { + "epoch": 6.764886433394721, + "grad_norm": 0.18831093609333038, + "learning_rate": 2.5030138468417263e-05, + "loss": 1.7049, + "step": 22040 + }, + { + "epoch": 6.765193370165746, + "grad_norm": 0.20622828602790833, + "learning_rate": 2.5025832232288236e-05, + "loss": 1.7834, + "step": 22041 + }, + { + "epoch": 6.765500306936771, + "grad_norm": 0.22746746242046356, + "learning_rate": 2.5021526242968574e-05, + "loss": 1.7426, + "step": 22042 + }, + { + "epoch": 6.765807243707796, + "grad_norm": 0.2048977166414261, + "learning_rate": 2.5017220500500828e-05, + "loss": 1.7192, + "step": 22043 + }, + { + "epoch": 6.766114180478821, + 
"grad_norm": 0.19647538661956787, + "learning_rate": 2.5012915004927546e-05, + "loss": 1.6738, + "step": 22044 + }, + { + "epoch": 6.7664211172498465, + "grad_norm": 0.2133142054080963, + "learning_rate": 2.5008609756291284e-05, + "loss": 1.7482, + "step": 22045 + }, + { + "epoch": 6.766728054020872, + "grad_norm": 0.23578259348869324, + "learning_rate": 2.500430475463459e-05, + "loss": 1.696, + "step": 22046 + }, + { + "epoch": 6.767034990791897, + "grad_norm": 0.24862529337406158, + "learning_rate": 2.500000000000001e-05, + "loss": 1.7508, + "step": 22047 + }, + { + "epoch": 6.7673419275629225, + "grad_norm": 0.22704963386058807, + "learning_rate": 2.4995695492430066e-05, + "loss": 1.7739, + "step": 22048 + }, + { + "epoch": 6.767648864333947, + "grad_norm": 0.20216481387615204, + "learning_rate": 2.4991391231967347e-05, + "loss": 1.7406, + "step": 22049 + }, + { + "epoch": 6.767955801104972, + "grad_norm": 0.18778519332408905, + "learning_rate": 2.498708721865432e-05, + "loss": 1.683, + "step": 22050 + }, + { + "epoch": 6.768262737875998, + "grad_norm": 0.21680599451065063, + "learning_rate": 2.4982783452533597e-05, + "loss": 1.7652, + "step": 22051 + }, + { + "epoch": 6.768569674647023, + "grad_norm": 0.16952121257781982, + "learning_rate": 2.4978479933647637e-05, + "loss": 1.6551, + "step": 22052 + }, + { + "epoch": 6.768876611418047, + "grad_norm": 0.1979489028453827, + "learning_rate": 2.4974176662039017e-05, + "loss": 1.7399, + "step": 22053 + }, + { + "epoch": 6.769183548189073, + "grad_norm": 0.18934862315654755, + "learning_rate": 2.496987363775025e-05, + "loss": 1.7228, + "step": 22054 + }, + { + "epoch": 6.769490484960098, + "grad_norm": 0.17551462352275848, + "learning_rate": 2.496557086082387e-05, + "loss": 1.6725, + "step": 22055 + }, + { + "epoch": 6.769797421731123, + "grad_norm": 0.23561003804206848, + "learning_rate": 2.496126833130239e-05, + "loss": 1.7606, + "step": 22056 + }, + { + "epoch": 6.770104358502149, + "grad_norm": 
0.19105803966522217, + "learning_rate": 2.4956966049228324e-05, + "loss": 1.6975, + "step": 22057 + }, + { + "epoch": 6.770411295273174, + "grad_norm": 0.28581124544143677, + "learning_rate": 2.4952664014644204e-05, + "loss": 1.7408, + "step": 22058 + }, + { + "epoch": 6.7707182320441985, + "grad_norm": 0.20723536610603333, + "learning_rate": 2.494836222759254e-05, + "loss": 1.752, + "step": 22059 + }, + { + "epoch": 6.771025168815224, + "grad_norm": 0.2089354693889618, + "learning_rate": 2.4944060688115846e-05, + "loss": 1.6662, + "step": 22060 + }, + { + "epoch": 6.771332105586249, + "grad_norm": 0.2299557626247406, + "learning_rate": 2.4939759396256625e-05, + "loss": 1.7978, + "step": 22061 + }, + { + "epoch": 6.7716390423572745, + "grad_norm": 0.17900820076465607, + "learning_rate": 2.493545835205739e-05, + "loss": 1.6876, + "step": 22062 + }, + { + "epoch": 6.7719459791283, + "grad_norm": 0.21412713825702667, + "learning_rate": 2.4931157555560648e-05, + "loss": 1.7347, + "step": 22063 + }, + { + "epoch": 6.772252915899324, + "grad_norm": 0.24448172748088837, + "learning_rate": 2.49268570068089e-05, + "loss": 1.7611, + "step": 22064 + }, + { + "epoch": 6.77255985267035, + "grad_norm": 0.20153972506523132, + "learning_rate": 2.4922556705844624e-05, + "loss": 1.7347, + "step": 22065 + }, + { + "epoch": 6.772866789441375, + "grad_norm": 0.2142268568277359, + "learning_rate": 2.4918256652710387e-05, + "loss": 1.7548, + "step": 22066 + }, + { + "epoch": 6.7731737262124, + "grad_norm": 0.19735601544380188, + "learning_rate": 2.4913956847448595e-05, + "loss": 1.7138, + "step": 22067 + }, + { + "epoch": 6.773480662983426, + "grad_norm": 0.1847008913755417, + "learning_rate": 2.4909657290101824e-05, + "loss": 1.6812, + "step": 22068 + }, + { + "epoch": 6.773787599754451, + "grad_norm": 0.18406464159488678, + "learning_rate": 2.4905357980712486e-05, + "loss": 1.6992, + "step": 22069 + }, + { + "epoch": 6.774094536525475, + "grad_norm": 0.19595865905284882, + 
"learning_rate": 2.490105891932313e-05, + "loss": 1.7118, + "step": 22070 + }, + { + "epoch": 6.774401473296501, + "grad_norm": 0.1929878294467926, + "learning_rate": 2.4896760105976218e-05, + "loss": 1.7187, + "step": 22071 + }, + { + "epoch": 6.774708410067526, + "grad_norm": 0.23972687125205994, + "learning_rate": 2.4892461540714242e-05, + "loss": 1.7293, + "step": 22072 + }, + { + "epoch": 6.7750153468385514, + "grad_norm": 0.18744204938411713, + "learning_rate": 2.4888163223579675e-05, + "loss": 1.7102, + "step": 22073 + }, + { + "epoch": 6.775322283609577, + "grad_norm": 0.20168112218379974, + "learning_rate": 2.4883865154614994e-05, + "loss": 1.7655, + "step": 22074 + }, + { + "epoch": 6.775629220380601, + "grad_norm": 0.22825658321380615, + "learning_rate": 2.487956733386268e-05, + "loss": 1.7251, + "step": 22075 + }, + { + "epoch": 6.775936157151627, + "grad_norm": 0.19441691040992737, + "learning_rate": 2.4875269761365205e-05, + "loss": 1.7657, + "step": 22076 + }, + { + "epoch": 6.776243093922652, + "grad_norm": 0.22861605882644653, + "learning_rate": 2.487097243716504e-05, + "loss": 1.7132, + "step": 22077 + }, + { + "epoch": 6.776550030693677, + "grad_norm": 0.19157674908638, + "learning_rate": 2.486667536130466e-05, + "loss": 1.7448, + "step": 22078 + }, + { + "epoch": 6.776856967464703, + "grad_norm": 0.2203369438648224, + "learning_rate": 2.486237853382652e-05, + "loss": 1.7535, + "step": 22079 + }, + { + "epoch": 6.777163904235728, + "grad_norm": 0.16477027535438538, + "learning_rate": 2.4858081954773088e-05, + "loss": 1.706, + "step": 22080 + }, + { + "epoch": 6.777470841006752, + "grad_norm": 0.16536933183670044, + "learning_rate": 2.4853785624186827e-05, + "loss": 1.6725, + "step": 22081 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.18266050517559052, + "learning_rate": 2.4849489542110176e-05, + "loss": 1.6799, + "step": 22082 + }, + { + "epoch": 6.778084714548803, + "grad_norm": 0.21422190964221954, + "learning_rate": 
2.4845193708585647e-05, + "loss": 1.7275, + "step": 22083 + }, + { + "epoch": 6.778391651319828, + "grad_norm": 0.19356754422187805, + "learning_rate": 2.4840898123655622e-05, + "loss": 1.7172, + "step": 22084 + }, + { + "epoch": 6.778698588090853, + "grad_norm": 0.21090209484100342, + "learning_rate": 2.4836602787362628e-05, + "loss": 1.6581, + "step": 22085 + }, + { + "epoch": 6.779005524861878, + "grad_norm": 0.20072491466999054, + "learning_rate": 2.483230769974903e-05, + "loss": 1.7398, + "step": 22086 + }, + { + "epoch": 6.7793124616329035, + "grad_norm": 0.20642702281475067, + "learning_rate": 2.482801286085734e-05, + "loss": 1.7505, + "step": 22087 + }, + { + "epoch": 6.779619398403929, + "grad_norm": 0.20322991907596588, + "learning_rate": 2.4823718270729985e-05, + "loss": 1.6693, + "step": 22088 + }, + { + "epoch": 6.779926335174954, + "grad_norm": 0.17060843110084534, + "learning_rate": 2.4819423929409396e-05, + "loss": 1.6746, + "step": 22089 + }, + { + "epoch": 6.7802332719459795, + "grad_norm": 0.20697785913944244, + "learning_rate": 2.4815129836938024e-05, + "loss": 1.7413, + "step": 22090 + }, + { + "epoch": 6.780540208717004, + "grad_norm": 0.19845673441886902, + "learning_rate": 2.48108359933583e-05, + "loss": 1.694, + "step": 22091 + }, + { + "epoch": 6.780847145488029, + "grad_norm": 0.24547794461250305, + "learning_rate": 2.4806542398712657e-05, + "loss": 1.7316, + "step": 22092 + }, + { + "epoch": 6.781154082259055, + "grad_norm": 0.15587118268013, + "learning_rate": 2.4802249053043526e-05, + "loss": 1.667, + "step": 22093 + }, + { + "epoch": 6.78146101903008, + "grad_norm": 0.22754593193531036, + "learning_rate": 2.4797955956393336e-05, + "loss": 1.7504, + "step": 22094 + }, + { + "epoch": 6.781767955801105, + "grad_norm": 0.201420396566391, + "learning_rate": 2.4793663108804528e-05, + "loss": 1.749, + "step": 22095 + }, + { + "epoch": 6.78207489257213, + "grad_norm": 0.1952153891324997, + "learning_rate": 2.4789370510319504e-05, + "loss": 
1.7306, + "step": 22096 + }, + { + "epoch": 6.782381829343155, + "grad_norm": 0.16750730574131012, + "learning_rate": 2.4785078160980703e-05, + "loss": 1.6775, + "step": 22097 + }, + { + "epoch": 6.78268876611418, + "grad_norm": 0.19943620264530182, + "learning_rate": 2.4780786060830535e-05, + "loss": 1.7233, + "step": 22098 + }, + { + "epoch": 6.782995702885206, + "grad_norm": 0.21302999556064606, + "learning_rate": 2.4776494209911423e-05, + "loss": 1.798, + "step": 22099 + }, + { + "epoch": 6.783302639656231, + "grad_norm": 0.22949734330177307, + "learning_rate": 2.4772202608265776e-05, + "loss": 1.7678, + "step": 22100 + }, + { + "epoch": 6.783609576427256, + "grad_norm": 0.20945954322814941, + "learning_rate": 2.4767911255935993e-05, + "loss": 1.701, + "step": 22101 + }, + { + "epoch": 6.783916513198281, + "grad_norm": 0.189425989985466, + "learning_rate": 2.476362015296454e-05, + "loss": 1.7152, + "step": 22102 + }, + { + "epoch": 6.784223449969306, + "grad_norm": 0.18826924264431, + "learning_rate": 2.4759329299393747e-05, + "loss": 1.7004, + "step": 22103 + }, + { + "epoch": 6.7845303867403315, + "grad_norm": 0.20359934866428375, + "learning_rate": 2.475503869526607e-05, + "loss": 1.705, + "step": 22104 + }, + { + "epoch": 6.784837323511357, + "grad_norm": 0.22381560504436493, + "learning_rate": 2.4750748340623896e-05, + "loss": 1.7345, + "step": 22105 + }, + { + "epoch": 6.785144260282382, + "grad_norm": 0.1750476062297821, + "learning_rate": 2.474645823550963e-05, + "loss": 1.7084, + "step": 22106 + }, + { + "epoch": 6.785451197053407, + "grad_norm": 0.17943856120109558, + "learning_rate": 2.4742168379965662e-05, + "loss": 1.7417, + "step": 22107 + }, + { + "epoch": 6.785758133824432, + "grad_norm": 0.21809861063957214, + "learning_rate": 2.4737878774034397e-05, + "loss": 1.7197, + "step": 22108 + }, + { + "epoch": 6.786065070595457, + "grad_norm": 0.19761307537555695, + "learning_rate": 2.473358941775821e-05, + "loss": 1.6763, + "step": 22109 + }, + { + 
"epoch": 6.786372007366483, + "grad_norm": 0.19513878226280212, + "learning_rate": 2.472930031117951e-05, + "loss": 1.6859, + "step": 22110 + }, + { + "epoch": 6.786678944137508, + "grad_norm": 0.21796870231628418, + "learning_rate": 2.4725011454340675e-05, + "loss": 1.6957, + "step": 22111 + }, + { + "epoch": 6.786985880908533, + "grad_norm": 0.1885530948638916, + "learning_rate": 2.4720722847284088e-05, + "loss": 1.731, + "step": 22112 + }, + { + "epoch": 6.787292817679558, + "grad_norm": 0.2108110785484314, + "learning_rate": 2.4716434490052137e-05, + "loss": 1.7985, + "step": 22113 + }, + { + "epoch": 6.787599754450583, + "grad_norm": 0.23425176739692688, + "learning_rate": 2.4712146382687194e-05, + "loss": 1.7177, + "step": 22114 + }, + { + "epoch": 6.787906691221608, + "grad_norm": 0.17368707060813904, + "learning_rate": 2.4707858525231652e-05, + "loss": 1.7158, + "step": 22115 + }, + { + "epoch": 6.788213627992634, + "grad_norm": 0.22731448709964752, + "learning_rate": 2.470357091772787e-05, + "loss": 1.7037, + "step": 22116 + }, + { + "epoch": 6.788520564763659, + "grad_norm": 0.19142407178878784, + "learning_rate": 2.469928356021823e-05, + "loss": 1.7283, + "step": 22117 + }, + { + "epoch": 6.7888275015346835, + "grad_norm": 0.17515631020069122, + "learning_rate": 2.4694996452745072e-05, + "loss": 1.6812, + "step": 22118 + }, + { + "epoch": 6.789134438305709, + "grad_norm": 0.17932391166687012, + "learning_rate": 2.4690709595350838e-05, + "loss": 1.6832, + "step": 22119 + }, + { + "epoch": 6.789441375076734, + "grad_norm": 0.21177144348621368, + "learning_rate": 2.4686422988077802e-05, + "loss": 1.7443, + "step": 22120 + }, + { + "epoch": 6.7897483118477595, + "grad_norm": 0.17952793836593628, + "learning_rate": 2.4682136630968412e-05, + "loss": 1.6794, + "step": 22121 + }, + { + "epoch": 6.790055248618785, + "grad_norm": 0.18464395403862, + "learning_rate": 2.467785052406495e-05, + "loss": 1.6316, + "step": 22122 + }, + { + "epoch": 6.79036218538981, + 
"grad_norm": 0.1936565786600113, + "learning_rate": 2.4673564667409828e-05, + "loss": 1.6935, + "step": 22123 + }, + { + "epoch": 6.790669122160835, + "grad_norm": 0.21169735491275787, + "learning_rate": 2.4669279061045387e-05, + "loss": 1.7232, + "step": 22124 + }, + { + "epoch": 6.79097605893186, + "grad_norm": 0.199925035238266, + "learning_rate": 2.466499370501397e-05, + "loss": 1.8242, + "step": 22125 + }, + { + "epoch": 6.791282995702885, + "grad_norm": 0.19049705564975739, + "learning_rate": 2.4660708599357963e-05, + "loss": 1.7342, + "step": 22126 + }, + { + "epoch": 6.791589932473911, + "grad_norm": 0.16483616828918457, + "learning_rate": 2.465642374411964e-05, + "loss": 1.7144, + "step": 22127 + }, + { + "epoch": 6.791896869244935, + "grad_norm": 0.17355477809906006, + "learning_rate": 2.4652139139341413e-05, + "loss": 1.6715, + "step": 22128 + }, + { + "epoch": 6.79220380601596, + "grad_norm": 0.17448700964450836, + "learning_rate": 2.4647854785065605e-05, + "loss": 1.6669, + "step": 22129 + }, + { + "epoch": 6.792510742786986, + "grad_norm": 0.19858810305595398, + "learning_rate": 2.4643570681334553e-05, + "loss": 1.6781, + "step": 22130 + }, + { + "epoch": 6.792817679558011, + "grad_norm": 0.17350561916828156, + "learning_rate": 2.46392868281906e-05, + "loss": 1.7005, + "step": 22131 + }, + { + "epoch": 6.793124616329036, + "grad_norm": 0.17494787275791168, + "learning_rate": 2.4635003225676078e-05, + "loss": 1.7204, + "step": 22132 + }, + { + "epoch": 6.793431553100062, + "grad_norm": 0.1988590806722641, + "learning_rate": 2.463071987383332e-05, + "loss": 1.7314, + "step": 22133 + }, + { + "epoch": 6.793738489871086, + "grad_norm": 0.18046239018440247, + "learning_rate": 2.4626436772704658e-05, + "loss": 1.706, + "step": 22134 + }, + { + "epoch": 6.7940454266421115, + "grad_norm": 0.21060462296009064, + "learning_rate": 2.4622153922332402e-05, + "loss": 1.6967, + "step": 22135 + }, + { + "epoch": 6.794352363413137, + "grad_norm": 0.22328679263591766, 
+ "learning_rate": 2.4617871322758934e-05, + "loss": 1.7502, + "step": 22136 + }, + { + "epoch": 6.794659300184162, + "grad_norm": 0.18324224650859833, + "learning_rate": 2.46135889740265e-05, + "loss": 1.7183, + "step": 22137 + }, + { + "epoch": 6.7949662369551875, + "grad_norm": 0.2381133884191513, + "learning_rate": 2.4609306876177496e-05, + "loss": 1.739, + "step": 22138 + }, + { + "epoch": 6.795273173726212, + "grad_norm": 0.21471738815307617, + "learning_rate": 2.4605025029254164e-05, + "loss": 1.7466, + "step": 22139 + }, + { + "epoch": 6.795580110497237, + "grad_norm": 0.209581658244133, + "learning_rate": 2.4600743433298885e-05, + "loss": 1.7495, + "step": 22140 + }, + { + "epoch": 6.795887047268263, + "grad_norm": 0.1806897670030594, + "learning_rate": 2.459646208835394e-05, + "loss": 1.7137, + "step": 22141 + }, + { + "epoch": 6.796193984039288, + "grad_norm": 0.19036264717578888, + "learning_rate": 2.4592180994461644e-05, + "loss": 1.6993, + "step": 22142 + }, + { + "epoch": 6.796500920810313, + "grad_norm": 0.17937630414962769, + "learning_rate": 2.4587900151664335e-05, + "loss": 1.7102, + "step": 22143 + }, + { + "epoch": 6.796807857581339, + "grad_norm": 0.19278483092784882, + "learning_rate": 2.4583619560004244e-05, + "loss": 1.7058, + "step": 22144 + }, + { + "epoch": 6.797114794352363, + "grad_norm": 0.19507993757724762, + "learning_rate": 2.4579339219523744e-05, + "loss": 1.7137, + "step": 22145 + }, + { + "epoch": 6.797421731123388, + "grad_norm": 0.20417597889900208, + "learning_rate": 2.4575059130265115e-05, + "loss": 1.7156, + "step": 22146 + }, + { + "epoch": 6.797728667894414, + "grad_norm": 0.1898338943719864, + "learning_rate": 2.4570779292270658e-05, + "loss": 1.7501, + "step": 22147 + }, + { + "epoch": 6.798035604665439, + "grad_norm": 0.18777382373809814, + "learning_rate": 2.4566499705582656e-05, + "loss": 1.7192, + "step": 22148 + }, + { + "epoch": 6.798342541436464, + "grad_norm": 0.19526423513889313, + "learning_rate": 
2.4562220370243415e-05, + "loss": 1.6637, + "step": 22149 + }, + { + "epoch": 6.798649478207489, + "grad_norm": 0.23661594092845917, + "learning_rate": 2.455794128629522e-05, + "loss": 1.7557, + "step": 22150 + }, + { + "epoch": 6.798956414978514, + "grad_norm": 0.27043846249580383, + "learning_rate": 2.4553662453780362e-05, + "loss": 1.7712, + "step": 22151 + }, + { + "epoch": 6.7992633517495396, + "grad_norm": 0.17968088388442993, + "learning_rate": 2.454938387274111e-05, + "loss": 1.6721, + "step": 22152 + }, + { + "epoch": 6.799570288520565, + "grad_norm": 0.21456219255924225, + "learning_rate": 2.45451055432198e-05, + "loss": 1.7249, + "step": 22153 + }, + { + "epoch": 6.79987722529159, + "grad_norm": 0.22433941066265106, + "learning_rate": 2.4540827465258638e-05, + "loss": 1.7319, + "step": 22154 + }, + { + "epoch": 6.800184162062616, + "grad_norm": 0.2808871567249298, + "learning_rate": 2.4536549638899976e-05, + "loss": 1.7802, + "step": 22155 + }, + { + "epoch": 6.80049109883364, + "grad_norm": 0.28654494881629944, + "learning_rate": 2.4532272064186018e-05, + "loss": 1.7431, + "step": 22156 + }, + { + "epoch": 6.800798035604665, + "grad_norm": 0.19476976990699768, + "learning_rate": 2.45279947411591e-05, + "loss": 1.6792, + "step": 22157 + }, + { + "epoch": 6.801104972375691, + "grad_norm": 0.25114744901657104, + "learning_rate": 2.452371766986146e-05, + "loss": 1.7458, + "step": 22158 + }, + { + "epoch": 6.801411909146716, + "grad_norm": 0.18099439144134521, + "learning_rate": 2.451944085033538e-05, + "loss": 1.6952, + "step": 22159 + }, + { + "epoch": 6.8017188459177405, + "grad_norm": 0.21425777673721313, + "learning_rate": 2.4515164282623138e-05, + "loss": 1.7593, + "step": 22160 + }, + { + "epoch": 6.802025782688766, + "grad_norm": 0.19833709299564362, + "learning_rate": 2.4510887966766937e-05, + "loss": 1.6643, + "step": 22161 + }, + { + "epoch": 6.802332719459791, + "grad_norm": 0.20073090493679047, + "learning_rate": 2.45066119028091e-05, + "loss": 
1.7112, + "step": 22162 + }, + { + "epoch": 6.8026396562308165, + "grad_norm": 0.18599852919578552, + "learning_rate": 2.4502336090791872e-05, + "loss": 1.7121, + "step": 22163 + }, + { + "epoch": 6.802946593001842, + "grad_norm": 0.22036875784397125, + "learning_rate": 2.4498060530757498e-05, + "loss": 1.7944, + "step": 22164 + }, + { + "epoch": 6.803253529772867, + "grad_norm": 0.19521577656269073, + "learning_rate": 2.4493785222748243e-05, + "loss": 1.7463, + "step": 22165 + }, + { + "epoch": 6.803560466543892, + "grad_norm": 0.22010843455791473, + "learning_rate": 2.448951016680635e-05, + "loss": 1.6951, + "step": 22166 + }, + { + "epoch": 6.803867403314917, + "grad_norm": 0.20490090548992157, + "learning_rate": 2.448523536297407e-05, + "loss": 1.7723, + "step": 22167 + }, + { + "epoch": 6.804174340085942, + "grad_norm": 0.2298613339662552, + "learning_rate": 2.4480960811293648e-05, + "loss": 1.7644, + "step": 22168 + }, + { + "epoch": 6.804481276856968, + "grad_norm": 0.18560375273227692, + "learning_rate": 2.4476686511807306e-05, + "loss": 1.686, + "step": 22169 + }, + { + "epoch": 6.804788213627993, + "grad_norm": 0.24295780062675476, + "learning_rate": 2.4472412464557347e-05, + "loss": 1.7561, + "step": 22170 + }, + { + "epoch": 6.805095150399017, + "grad_norm": 0.1962144672870636, + "learning_rate": 2.4468138669585932e-05, + "loss": 1.7438, + "step": 22171 + }, + { + "epoch": 6.805402087170043, + "grad_norm": 0.21924439072608948, + "learning_rate": 2.4463865126935377e-05, + "loss": 1.7488, + "step": 22172 + }, + { + "epoch": 6.805709023941068, + "grad_norm": 0.1777856945991516, + "learning_rate": 2.4459591836647833e-05, + "loss": 1.6664, + "step": 22173 + }, + { + "epoch": 6.806015960712093, + "grad_norm": 0.24367454648017883, + "learning_rate": 2.4455318798765593e-05, + "loss": 1.7441, + "step": 22174 + }, + { + "epoch": 6.806322897483119, + "grad_norm": 0.2269427478313446, + "learning_rate": 2.4451046013330865e-05, + "loss": 1.7809, + "step": 22175 + }, 
+ { + "epoch": 6.806629834254144, + "grad_norm": 0.21986174583435059, + "learning_rate": 2.444677348038587e-05, + "loss": 1.7453, + "step": 22176 + }, + { + "epoch": 6.8069367710251685, + "grad_norm": 0.1773367077112198, + "learning_rate": 2.4442501199972862e-05, + "loss": 1.6927, + "step": 22177 + }, + { + "epoch": 6.807243707796194, + "grad_norm": 0.20545031130313873, + "learning_rate": 2.4438229172133997e-05, + "loss": 1.7782, + "step": 22178 + }, + { + "epoch": 6.807550644567219, + "grad_norm": 0.1997014880180359, + "learning_rate": 2.443395739691155e-05, + "loss": 1.7295, + "step": 22179 + }, + { + "epoch": 6.8078575813382445, + "grad_norm": 0.19634006917476654, + "learning_rate": 2.4429685874347723e-05, + "loss": 1.7017, + "step": 22180 + }, + { + "epoch": 6.80816451810927, + "grad_norm": 0.2007836550474167, + "learning_rate": 2.442541460448473e-05, + "loss": 1.7252, + "step": 22181 + }, + { + "epoch": 6.808471454880294, + "grad_norm": 0.22204343974590302, + "learning_rate": 2.4421143587364775e-05, + "loss": 1.7526, + "step": 22182 + }, + { + "epoch": 6.80877839165132, + "grad_norm": 0.1906677633523941, + "learning_rate": 2.4416872823030073e-05, + "loss": 1.7121, + "step": 22183 + }, + { + "epoch": 6.809085328422345, + "grad_norm": 0.17165397107601166, + "learning_rate": 2.441260231152283e-05, + "loss": 1.6942, + "step": 22184 + }, + { + "epoch": 6.80939226519337, + "grad_norm": 0.17022575438022614, + "learning_rate": 2.4408332052885246e-05, + "loss": 1.6973, + "step": 22185 + }, + { + "epoch": 6.809699201964396, + "grad_norm": 0.16693587601184845, + "learning_rate": 2.4404062047159503e-05, + "loss": 1.6996, + "step": 22186 + }, + { + "epoch": 6.810006138735421, + "grad_norm": 0.2251187264919281, + "learning_rate": 2.4399792294387864e-05, + "loss": 1.778, + "step": 22187 + }, + { + "epoch": 6.810313075506445, + "grad_norm": 0.20622244477272034, + "learning_rate": 2.439552279461244e-05, + "loss": 1.7273, + "step": 22188 + }, + { + "epoch": 6.810620012277471, + 
"grad_norm": 0.19736994802951813, + "learning_rate": 2.439125354787551e-05, + "loss": 1.7096, + "step": 22189 + }, + { + "epoch": 6.810926949048496, + "grad_norm": 0.22955237329006195, + "learning_rate": 2.4386984554219182e-05, + "loss": 1.7859, + "step": 22190 + }, + { + "epoch": 6.811233885819521, + "grad_norm": 0.2283364087343216, + "learning_rate": 2.43827158136857e-05, + "loss": 1.6999, + "step": 22191 + }, + { + "epoch": 6.811540822590547, + "grad_norm": 0.18393704295158386, + "learning_rate": 2.4378447326317243e-05, + "loss": 1.654, + "step": 22192 + }, + { + "epoch": 6.811847759361571, + "grad_norm": 0.2031537890434265, + "learning_rate": 2.4374179092155986e-05, + "loss": 1.7353, + "step": 22193 + }, + { + "epoch": 6.8121546961325965, + "grad_norm": 0.1849071979522705, + "learning_rate": 2.4369911111244125e-05, + "loss": 1.7157, + "step": 22194 + }, + { + "epoch": 6.812461632903622, + "grad_norm": 0.20584192872047424, + "learning_rate": 2.4365643383623787e-05, + "loss": 1.7529, + "step": 22195 + }, + { + "epoch": 6.812768569674647, + "grad_norm": 0.24152903258800507, + "learning_rate": 2.436137590933721e-05, + "loss": 1.7662, + "step": 22196 + }, + { + "epoch": 6.8130755064456725, + "grad_norm": 0.26625362038612366, + "learning_rate": 2.4357108688426532e-05, + "loss": 1.7624, + "step": 22197 + }, + { + "epoch": 6.813382443216698, + "grad_norm": 0.27122190594673157, + "learning_rate": 2.435284172093395e-05, + "loss": 1.747, + "step": 22198 + }, + { + "epoch": 6.813689379987722, + "grad_norm": 0.18996810913085938, + "learning_rate": 2.434857500690161e-05, + "loss": 1.7377, + "step": 22199 + }, + { + "epoch": 6.813996316758748, + "grad_norm": 0.22355122864246368, + "learning_rate": 2.4344308546371686e-05, + "loss": 1.6865, + "step": 22200 + }, + { + "epoch": 6.814303253529773, + "grad_norm": 0.18468965590000153, + "learning_rate": 2.4340042339386348e-05, + "loss": 1.7091, + "step": 22201 + }, + { + "epoch": 6.814610190300798, + "grad_norm": 
0.25356602668762207, + "learning_rate": 2.4335776385987747e-05, + "loss": 1.7482, + "step": 22202 + }, + { + "epoch": 6.814917127071823, + "grad_norm": 0.22462932765483856, + "learning_rate": 2.433151068621803e-05, + "loss": 1.6985, + "step": 22203 + }, + { + "epoch": 6.815224063842848, + "grad_norm": 0.2540687024593353, + "learning_rate": 2.43272452401194e-05, + "loss": 1.7878, + "step": 22204 + }, + { + "epoch": 6.815531000613873, + "grad_norm": 0.267811119556427, + "learning_rate": 2.432298004773395e-05, + "loss": 1.7862, + "step": 22205 + }, + { + "epoch": 6.815837937384899, + "grad_norm": 0.23089277744293213, + "learning_rate": 2.4318715109103894e-05, + "loss": 1.6892, + "step": 22206 + }, + { + "epoch": 6.816144874155924, + "grad_norm": 0.22740885615348816, + "learning_rate": 2.431445042427131e-05, + "loss": 1.6934, + "step": 22207 + }, + { + "epoch": 6.816451810926949, + "grad_norm": 0.18555034697055817, + "learning_rate": 2.4310185993278405e-05, + "loss": 1.6747, + "step": 22208 + }, + { + "epoch": 6.816758747697974, + "grad_norm": 0.23693101108074188, + "learning_rate": 2.430592181616729e-05, + "loss": 1.7212, + "step": 22209 + }, + { + "epoch": 6.817065684468999, + "grad_norm": 0.20551325380802155, + "learning_rate": 2.4301657892980128e-05, + "loss": 1.711, + "step": 22210 + }, + { + "epoch": 6.8173726212400245, + "grad_norm": 0.20047837495803833, + "learning_rate": 2.4297394223759056e-05, + "loss": 1.729, + "step": 22211 + }, + { + "epoch": 6.81767955801105, + "grad_norm": 0.22111602127552032, + "learning_rate": 2.4293130808546167e-05, + "loss": 1.706, + "step": 22212 + }, + { + "epoch": 6.817986494782075, + "grad_norm": 0.18199655413627625, + "learning_rate": 2.428886764738364e-05, + "loss": 1.7082, + "step": 22213 + }, + { + "epoch": 6.8182934315531, + "grad_norm": 0.18591821193695068, + "learning_rate": 2.4284604740313595e-05, + "loss": 1.6957, + "step": 22214 + }, + { + "epoch": 6.818600368324125, + "grad_norm": 0.19427789747714996, + 
"learning_rate": 2.4280342087378154e-05, + "loss": 1.7396, + "step": 22215 + }, + { + "epoch": 6.81890730509515, + "grad_norm": 0.233908548951149, + "learning_rate": 2.427607968861945e-05, + "loss": 1.741, + "step": 22216 + }, + { + "epoch": 6.819214241866176, + "grad_norm": 0.168926402926445, + "learning_rate": 2.4271817544079606e-05, + "loss": 1.7023, + "step": 22217 + }, + { + "epoch": 6.819521178637201, + "grad_norm": 0.34345322847366333, + "learning_rate": 2.426755565380074e-05, + "loss": 1.7201, + "step": 22218 + }, + { + "epoch": 6.819828115408226, + "grad_norm": 0.21531274914741516, + "learning_rate": 2.4263294017824974e-05, + "loss": 1.725, + "step": 22219 + }, + { + "epoch": 6.820135052179251, + "grad_norm": 0.25251755118370056, + "learning_rate": 2.4259032636194395e-05, + "loss": 1.6764, + "step": 22220 + }, + { + "epoch": 6.820441988950276, + "grad_norm": 0.246616929769516, + "learning_rate": 2.4254771508951186e-05, + "loss": 1.7971, + "step": 22221 + }, + { + "epoch": 6.820748925721301, + "grad_norm": 0.20998120307922363, + "learning_rate": 2.4250510636137375e-05, + "loss": 1.723, + "step": 22222 + }, + { + "epoch": 6.821055862492327, + "grad_norm": 0.28388240933418274, + "learning_rate": 2.4246250017795148e-05, + "loss": 1.7508, + "step": 22223 + }, + { + "epoch": 6.821362799263352, + "grad_norm": 0.18146218359470367, + "learning_rate": 2.4241989653966535e-05, + "loss": 1.7254, + "step": 22224 + }, + { + "epoch": 6.8216697360343765, + "grad_norm": 0.2384043037891388, + "learning_rate": 2.4237729544693694e-05, + "loss": 1.7624, + "step": 22225 + }, + { + "epoch": 6.821976672805402, + "grad_norm": 0.21908332407474518, + "learning_rate": 2.4233469690018714e-05, + "loss": 1.7595, + "step": 22226 + }, + { + "epoch": 6.822283609576427, + "grad_norm": 0.20963989198207855, + "learning_rate": 2.422921008998369e-05, + "loss": 1.6679, + "step": 22227 + }, + { + "epoch": 6.8225905463474525, + "grad_norm": 0.21045777201652527, + "learning_rate": 
2.4224950744630732e-05, + "loss": 1.657, + "step": 22228 + }, + { + "epoch": 6.822897483118478, + "grad_norm": 0.21567417681217194, + "learning_rate": 2.4220691654001883e-05, + "loss": 1.7788, + "step": 22229 + }, + { + "epoch": 6.823204419889503, + "grad_norm": 0.2908889055252075, + "learning_rate": 2.4216432818139283e-05, + "loss": 1.7633, + "step": 22230 + }, + { + "epoch": 6.823511356660528, + "grad_norm": 0.22683843970298767, + "learning_rate": 2.4212174237085007e-05, + "loss": 1.7974, + "step": 22231 + }, + { + "epoch": 6.823818293431553, + "grad_norm": 0.25254085659980774, + "learning_rate": 2.420791591088114e-05, + "loss": 1.6871, + "step": 22232 + }, + { + "epoch": 6.824125230202578, + "grad_norm": 0.1804734766483307, + "learning_rate": 2.420365783956977e-05, + "loss": 1.7331, + "step": 22233 + }, + { + "epoch": 6.824432166973604, + "grad_norm": 0.21634186804294586, + "learning_rate": 2.419940002319297e-05, + "loss": 1.6641, + "step": 22234 + }, + { + "epoch": 6.824739103744628, + "grad_norm": 0.1941644847393036, + "learning_rate": 2.4195142461792818e-05, + "loss": 1.7198, + "step": 22235 + }, + { + "epoch": 6.8250460405156534, + "grad_norm": 0.20209947228431702, + "learning_rate": 2.4190885155411398e-05, + "loss": 1.7137, + "step": 22236 + }, + { + "epoch": 6.825352977286679, + "grad_norm": 0.17161925137043, + "learning_rate": 2.4186628104090757e-05, + "loss": 1.7059, + "step": 22237 + }, + { + "epoch": 6.825659914057704, + "grad_norm": 0.19352135062217712, + "learning_rate": 2.4182371307873025e-05, + "loss": 1.6699, + "step": 22238 + }, + { + "epoch": 6.8259668508287294, + "grad_norm": 0.20384716987609863, + "learning_rate": 2.417811476680019e-05, + "loss": 1.7167, + "step": 22239 + }, + { + "epoch": 6.826273787599755, + "grad_norm": 0.22764970362186432, + "learning_rate": 2.4173858480914402e-05, + "loss": 1.7085, + "step": 22240 + }, + { + "epoch": 6.82658072437078, + "grad_norm": 0.1988842487335205, + "learning_rate": 2.4169602450257645e-05, + "loss": 
1.7458, + "step": 22241 + }, + { + "epoch": 6.826887661141805, + "grad_norm": 0.20511481165885925, + "learning_rate": 2.416534667487203e-05, + "loss": 1.7597, + "step": 22242 + }, + { + "epoch": 6.82719459791283, + "grad_norm": 0.20906902849674225, + "learning_rate": 2.4161091154799608e-05, + "loss": 1.7418, + "step": 22243 + }, + { + "epoch": 6.827501534683855, + "grad_norm": 0.22555884718894958, + "learning_rate": 2.4156835890082426e-05, + "loss": 1.8198, + "step": 22244 + }, + { + "epoch": 6.827808471454881, + "grad_norm": 0.25855058431625366, + "learning_rate": 2.4152580880762553e-05, + "loss": 1.7588, + "step": 22245 + }, + { + "epoch": 6.828115408225905, + "grad_norm": 0.16975226998329163, + "learning_rate": 2.4148326126881993e-05, + "loss": 1.6897, + "step": 22246 + }, + { + "epoch": 6.82842234499693, + "grad_norm": 0.2336781919002533, + "learning_rate": 2.414407162848284e-05, + "loss": 1.7412, + "step": 22247 + }, + { + "epoch": 6.828729281767956, + "grad_norm": 0.1660032868385315, + "learning_rate": 2.4139817385607126e-05, + "loss": 1.6221, + "step": 22248 + }, + { + "epoch": 6.829036218538981, + "grad_norm": 0.22926606237888336, + "learning_rate": 2.41355633982969e-05, + "loss": 1.7201, + "step": 22249 + }, + { + "epoch": 6.829343155310006, + "grad_norm": 0.1759374737739563, + "learning_rate": 2.4131309666594193e-05, + "loss": 1.6842, + "step": 22250 + }, + { + "epoch": 6.829650092081032, + "grad_norm": 0.23005764186382294, + "learning_rate": 2.4127056190541042e-05, + "loss": 1.7327, + "step": 22251 + }, + { + "epoch": 6.829957028852056, + "grad_norm": 0.2216579169034958, + "learning_rate": 2.412280297017949e-05, + "loss": 1.7856, + "step": 22252 + }, + { + "epoch": 6.8302639656230815, + "grad_norm": 0.22133000195026398, + "learning_rate": 2.4118550005551565e-05, + "loss": 1.7711, + "step": 22253 + }, + { + "epoch": 6.830570902394107, + "grad_norm": 0.21860742568969727, + "learning_rate": 2.41142972966993e-05, + "loss": 1.7276, + "step": 22254 + }, + { + 
"epoch": 6.830877839165132, + "grad_norm": 0.2484082579612732, + "learning_rate": 2.4110044843664726e-05, + "loss": 1.7038, + "step": 22255 + }, + { + "epoch": 6.8311847759361575, + "grad_norm": 0.22288921475410461, + "learning_rate": 2.410579264648984e-05, + "loss": 1.7149, + "step": 22256 + }, + { + "epoch": 6.831491712707182, + "grad_norm": 0.23635484278202057, + "learning_rate": 2.4101540705216724e-05, + "loss": 1.7296, + "step": 22257 + }, + { + "epoch": 6.831798649478207, + "grad_norm": 0.24334096908569336, + "learning_rate": 2.4097289019887324e-05, + "loss": 1.7458, + "step": 22258 + }, + { + "epoch": 6.832105586249233, + "grad_norm": 0.23019789159297943, + "learning_rate": 2.4093037590543716e-05, + "loss": 1.7296, + "step": 22259 + }, + { + "epoch": 6.832412523020258, + "grad_norm": 0.23739024996757507, + "learning_rate": 2.4088786417227895e-05, + "loss": 1.7844, + "step": 22260 + }, + { + "epoch": 6.832719459791283, + "grad_norm": 0.1969252973794937, + "learning_rate": 2.4084535499981873e-05, + "loss": 1.6692, + "step": 22261 + }, + { + "epoch": 6.833026396562309, + "grad_norm": 0.20111167430877686, + "learning_rate": 2.4080284838847682e-05, + "loss": 1.7813, + "step": 22262 + }, + { + "epoch": 6.833333333333333, + "grad_norm": 0.26112934947013855, + "learning_rate": 2.4076034433867268e-05, + "loss": 1.6852, + "step": 22263 + }, + { + "epoch": 6.833640270104358, + "grad_norm": 0.24244411289691925, + "learning_rate": 2.40717842850827e-05, + "loss": 1.7054, + "step": 22264 + }, + { + "epoch": 6.833947206875384, + "grad_norm": 0.22703053057193756, + "learning_rate": 2.406753439253595e-05, + "loss": 1.7655, + "step": 22265 + }, + { + "epoch": 6.834254143646409, + "grad_norm": 0.23935651779174805, + "learning_rate": 2.4063284756269027e-05, + "loss": 1.7462, + "step": 22266 + }, + { + "epoch": 6.834561080417434, + "grad_norm": 0.2169155478477478, + "learning_rate": 2.4059035376323928e-05, + "loss": 1.7059, + "step": 22267 + }, + { + "epoch": 6.834868017188459, + 
"grad_norm": 0.2045663446187973, + "learning_rate": 2.4054786252742645e-05, + "loss": 1.7166, + "step": 22268 + }, + { + "epoch": 6.835174953959484, + "grad_norm": 0.22796253859996796, + "learning_rate": 2.4050537385567172e-05, + "loss": 1.7361, + "step": 22269 + }, + { + "epoch": 6.8354818907305095, + "grad_norm": 0.20807915925979614, + "learning_rate": 2.4046288774839497e-05, + "loss": 1.7007, + "step": 22270 + }, + { + "epoch": 6.835788827501535, + "grad_norm": 0.22157903015613556, + "learning_rate": 2.4042040420601607e-05, + "loss": 1.7409, + "step": 22271 + }, + { + "epoch": 6.83609576427256, + "grad_norm": 0.21494148671627045, + "learning_rate": 2.4037792322895492e-05, + "loss": 1.7975, + "step": 22272 + }, + { + "epoch": 6.8364027010435855, + "grad_norm": 0.2275875061750412, + "learning_rate": 2.403354448176311e-05, + "loss": 1.6759, + "step": 22273 + }, + { + "epoch": 6.83670963781461, + "grad_norm": 0.21105073392391205, + "learning_rate": 2.4029296897246496e-05, + "loss": 1.7229, + "step": 22274 + }, + { + "epoch": 6.837016574585635, + "grad_norm": 0.21957579255104065, + "learning_rate": 2.4025049569387553e-05, + "loss": 1.737, + "step": 22275 + }, + { + "epoch": 6.837323511356661, + "grad_norm": 0.2291470617055893, + "learning_rate": 2.4020802498228335e-05, + "loss": 1.6731, + "step": 22276 + }, + { + "epoch": 6.837630448127686, + "grad_norm": 0.18196065723896027, + "learning_rate": 2.401655568381074e-05, + "loss": 1.6823, + "step": 22277 + }, + { + "epoch": 6.83793738489871, + "grad_norm": 0.20915214717388153, + "learning_rate": 2.401230912617678e-05, + "loss": 1.7038, + "step": 22278 + }, + { + "epoch": 6.838244321669736, + "grad_norm": 0.2060854732990265, + "learning_rate": 2.4008062825368437e-05, + "loss": 1.7514, + "step": 22279 + }, + { + "epoch": 6.838551258440761, + "grad_norm": 0.20858527719974518, + "learning_rate": 2.400381678142762e-05, + "loss": 1.7494, + "step": 22280 + }, + { + "epoch": 6.838858195211786, + "grad_norm": 0.19124718010425568, 
+ "learning_rate": 2.3999570994396352e-05, + "loss": 1.7641, + "step": 22281 + }, + { + "epoch": 6.839165131982812, + "grad_norm": 0.28222304582595825, + "learning_rate": 2.3995325464316525e-05, + "loss": 1.7204, + "step": 22282 + }, + { + "epoch": 6.839472068753837, + "grad_norm": 0.20047026872634888, + "learning_rate": 2.399108019123016e-05, + "loss": 1.7261, + "step": 22283 + }, + { + "epoch": 6.8397790055248615, + "grad_norm": 0.2758225202560425, + "learning_rate": 2.3986835175179178e-05, + "loss": 1.6903, + "step": 22284 + }, + { + "epoch": 6.840085942295887, + "grad_norm": 0.2719727158546448, + "learning_rate": 2.3982590416205535e-05, + "loss": 1.8716, + "step": 22285 + }, + { + "epoch": 6.840392879066912, + "grad_norm": 0.3524060845375061, + "learning_rate": 2.3978345914351193e-05, + "loss": 1.7778, + "step": 22286 + }, + { + "epoch": 6.8406998158379375, + "grad_norm": 0.2711596190929413, + "learning_rate": 2.397410166965808e-05, + "loss": 1.7111, + "step": 22287 + }, + { + "epoch": 6.841006752608963, + "grad_norm": 0.2818336486816406, + "learning_rate": 2.396985768216815e-05, + "loss": 1.7292, + "step": 22288 + }, + { + "epoch": 6.841313689379987, + "grad_norm": 0.19677700102329254, + "learning_rate": 2.3965613951923343e-05, + "loss": 1.6975, + "step": 22289 + }, + { + "epoch": 6.841620626151013, + "grad_norm": 0.300997257232666, + "learning_rate": 2.3961370478965583e-05, + "loss": 1.7014, + "step": 22290 + }, + { + "epoch": 6.841927562922038, + "grad_norm": 0.23549453914165497, + "learning_rate": 2.395712726333686e-05, + "loss": 1.7052, + "step": 22291 + }, + { + "epoch": 6.842234499693063, + "grad_norm": 0.29898303747177124, + "learning_rate": 2.3952884305079026e-05, + "loss": 1.7828, + "step": 22292 + }, + { + "epoch": 6.842541436464089, + "grad_norm": 0.26108843088150024, + "learning_rate": 2.3948641604234096e-05, + "loss": 1.7023, + "step": 22293 + }, + { + "epoch": 6.842848373235114, + "grad_norm": 0.18781059980392456, + "learning_rate": 
2.394439916084392e-05, + "loss": 1.6808, + "step": 22294 + }, + { + "epoch": 6.843155310006138, + "grad_norm": 0.22659730911254883, + "learning_rate": 2.3940156974950485e-05, + "loss": 1.7224, + "step": 22295 + }, + { + "epoch": 6.843462246777164, + "grad_norm": 0.17422057688236237, + "learning_rate": 2.3935915046595713e-05, + "loss": 1.668, + "step": 22296 + }, + { + "epoch": 6.843769183548189, + "grad_norm": 0.2008846402168274, + "learning_rate": 2.393167337582146e-05, + "loss": 1.7283, + "step": 22297 + }, + { + "epoch": 6.844076120319214, + "grad_norm": 0.20376072824001312, + "learning_rate": 2.392743196266973e-05, + "loss": 1.74, + "step": 22298 + }, + { + "epoch": 6.84438305709024, + "grad_norm": 0.16353756189346313, + "learning_rate": 2.3923190807182372e-05, + "loss": 1.717, + "step": 22299 + }, + { + "epoch": 6.844689993861264, + "grad_norm": 0.18436652421951294, + "learning_rate": 2.3918949909401335e-05, + "loss": 1.7257, + "step": 22300 + }, + { + "epoch": 6.8449969306322895, + "grad_norm": 0.2038460522890091, + "learning_rate": 2.3914709269368523e-05, + "loss": 1.7254, + "step": 22301 + }, + { + "epoch": 6.845303867403315, + "grad_norm": 0.17111587524414062, + "learning_rate": 2.3910468887125842e-05, + "loss": 1.6993, + "step": 22302 + }, + { + "epoch": 6.84561080417434, + "grad_norm": 0.20049406588077545, + "learning_rate": 2.3906228762715207e-05, + "loss": 1.7099, + "step": 22303 + }, + { + "epoch": 6.8459177409453655, + "grad_norm": 0.2168554663658142, + "learning_rate": 2.39019888961785e-05, + "loss": 1.725, + "step": 22304 + }, + { + "epoch": 6.846224677716391, + "grad_norm": 0.2228514850139618, + "learning_rate": 2.3897749287557647e-05, + "loss": 1.7348, + "step": 22305 + }, + { + "epoch": 6.846531614487415, + "grad_norm": 0.17166151106357574, + "learning_rate": 2.3893509936894532e-05, + "loss": 1.7451, + "step": 22306 + }, + { + "epoch": 6.846838551258441, + "grad_norm": 0.24896936118602753, + "learning_rate": 2.3889270844231026e-05, + "loss": 
1.7397, + "step": 22307 + }, + { + "epoch": 6.847145488029466, + "grad_norm": 0.1984332948923111, + "learning_rate": 2.3885032009609098e-05, + "loss": 1.7167, + "step": 22308 + }, + { + "epoch": 6.847452424800491, + "grad_norm": 0.20763449370861053, + "learning_rate": 2.388079343307055e-05, + "loss": 1.7154, + "step": 22309 + }, + { + "epoch": 6.847759361571516, + "grad_norm": 0.21818630397319794, + "learning_rate": 2.3876555114657346e-05, + "loss": 1.7364, + "step": 22310 + }, + { + "epoch": 6.848066298342541, + "grad_norm": 0.21220166981220245, + "learning_rate": 2.3872317054411298e-05, + "loss": 1.74, + "step": 22311 + }, + { + "epoch": 6.848373235113566, + "grad_norm": 0.17486892640590668, + "learning_rate": 2.3868079252374343e-05, + "loss": 1.68, + "step": 22312 + }, + { + "epoch": 6.848680171884592, + "grad_norm": 0.20809298753738403, + "learning_rate": 2.386384170858837e-05, + "loss": 1.8102, + "step": 22313 + }, + { + "epoch": 6.848987108655617, + "grad_norm": 0.19927671551704407, + "learning_rate": 2.385960442309519e-05, + "loss": 1.7742, + "step": 22314 + }, + { + "epoch": 6.849294045426642, + "grad_norm": 0.18705040216445923, + "learning_rate": 2.3855367395936757e-05, + "loss": 1.689, + "step": 22315 + }, + { + "epoch": 6.849600982197668, + "grad_norm": 0.22023466229438782, + "learning_rate": 2.385113062715487e-05, + "loss": 1.7819, + "step": 22316 + }, + { + "epoch": 6.849907918968692, + "grad_norm": 0.24443435668945312, + "learning_rate": 2.384689411679146e-05, + "loss": 1.6533, + "step": 22317 + }, + { + "epoch": 6.850214855739718, + "grad_norm": 0.20103834569454193, + "learning_rate": 2.3842657864888368e-05, + "loss": 1.7274, + "step": 22318 + }, + { + "epoch": 6.850521792510743, + "grad_norm": 0.2265254408121109, + "learning_rate": 2.3838421871487465e-05, + "loss": 1.7874, + "step": 22319 + }, + { + "epoch": 6.850828729281768, + "grad_norm": 0.2775460183620453, + "learning_rate": 2.383418613663061e-05, + "loss": 1.8038, + "step": 22320 + }, + { + 
"epoch": 6.851135666052793, + "grad_norm": 0.2001011073589325, + "learning_rate": 2.3829950660359663e-05, + "loss": 1.7135, + "step": 22321 + }, + { + "epoch": 6.851442602823818, + "grad_norm": 0.21427330374717712, + "learning_rate": 2.382571544271648e-05, + "loss": 1.7155, + "step": 22322 + }, + { + "epoch": 6.851749539594843, + "grad_norm": 0.18420884013175964, + "learning_rate": 2.382148048374292e-05, + "loss": 1.7178, + "step": 22323 + }, + { + "epoch": 6.852056476365869, + "grad_norm": 0.19436471164226532, + "learning_rate": 2.3817245783480813e-05, + "loss": 1.7396, + "step": 22324 + }, + { + "epoch": 6.852363413136894, + "grad_norm": 0.23191674053668976, + "learning_rate": 2.381301134197207e-05, + "loss": 1.7102, + "step": 22325 + }, + { + "epoch": 6.852670349907919, + "grad_norm": 0.20381706953048706, + "learning_rate": 2.3808777159258462e-05, + "loss": 1.7671, + "step": 22326 + }, + { + "epoch": 6.852977286678944, + "grad_norm": 0.20202197134494781, + "learning_rate": 2.3804543235381897e-05, + "loss": 1.6774, + "step": 22327 + }, + { + "epoch": 6.853284223449969, + "grad_norm": 0.23496322333812714, + "learning_rate": 2.380030957038416e-05, + "loss": 1.7745, + "step": 22328 + }, + { + "epoch": 6.8535911602209945, + "grad_norm": 0.22473813593387604, + "learning_rate": 2.379607616430714e-05, + "loss": 1.7319, + "step": 22329 + }, + { + "epoch": 6.85389809699202, + "grad_norm": 0.2149224430322647, + "learning_rate": 2.3791843017192667e-05, + "loss": 1.77, + "step": 22330 + }, + { + "epoch": 6.854205033763045, + "grad_norm": 0.21146108210086823, + "learning_rate": 2.378761012908253e-05, + "loss": 1.762, + "step": 22331 + }, + { + "epoch": 6.85451197053407, + "grad_norm": 0.2031458169221878, + "learning_rate": 2.3783377500018626e-05, + "loss": 1.7007, + "step": 22332 + }, + { + "epoch": 6.854818907305095, + "grad_norm": 0.19763319194316864, + "learning_rate": 2.377914513004272e-05, + "loss": 1.6899, + "step": 22333 + }, + { + "epoch": 6.85512584407612, + 
"grad_norm": 0.17337046563625336, + "learning_rate": 2.3774913019196688e-05, + "loss": 1.683, + "step": 22334 + }, + { + "epoch": 6.855432780847146, + "grad_norm": 0.1850815862417221, + "learning_rate": 2.3770681167522328e-05, + "loss": 1.7284, + "step": 22335 + }, + { + "epoch": 6.855739717618171, + "grad_norm": 0.19693362712860107, + "learning_rate": 2.3766449575061477e-05, + "loss": 1.7694, + "step": 22336 + }, + { + "epoch": 6.856046654389196, + "grad_norm": 0.1981547325849533, + "learning_rate": 2.376221824185595e-05, + "loss": 1.736, + "step": 22337 + }, + { + "epoch": 6.856353591160221, + "grad_norm": 0.17638558149337769, + "learning_rate": 2.375798716794756e-05, + "loss": 1.6979, + "step": 22338 + }, + { + "epoch": 6.856660527931246, + "grad_norm": 0.20189990103244781, + "learning_rate": 2.3753756353378116e-05, + "loss": 1.7876, + "step": 22339 + }, + { + "epoch": 6.856967464702271, + "grad_norm": 0.1880224347114563, + "learning_rate": 2.3749525798189438e-05, + "loss": 1.7134, + "step": 22340 + }, + { + "epoch": 6.857274401473297, + "grad_norm": 0.2464265078306198, + "learning_rate": 2.3745295502423316e-05, + "loss": 1.7782, + "step": 22341 + }, + { + "epoch": 6.857581338244322, + "grad_norm": 0.19218963384628296, + "learning_rate": 2.3741065466121604e-05, + "loss": 1.7027, + "step": 22342 + }, + { + "epoch": 6.8578882750153465, + "grad_norm": 0.27446448802948, + "learning_rate": 2.3736835689326043e-05, + "loss": 1.772, + "step": 22343 + }, + { + "epoch": 6.858195211786372, + "grad_norm": 0.19315828382968903, + "learning_rate": 2.3732606172078497e-05, + "loss": 1.6855, + "step": 22344 + }, + { + "epoch": 6.858502148557397, + "grad_norm": 0.2668892741203308, + "learning_rate": 2.372837691442072e-05, + "loss": 1.7703, + "step": 22345 + }, + { + "epoch": 6.8588090853284225, + "grad_norm": 0.23552054166793823, + "learning_rate": 2.3724147916394497e-05, + "loss": 1.7184, + "step": 22346 + }, + { + "epoch": 6.859116022099448, + "grad_norm": 0.3194984793663025, + 
"learning_rate": 2.3719919178041682e-05, + "loss": 1.7531, + "step": 22347 + }, + { + "epoch": 6.859422958870473, + "grad_norm": 0.19298717379570007, + "learning_rate": 2.371569069940399e-05, + "loss": 1.7064, + "step": 22348 + }, + { + "epoch": 6.859729895641498, + "grad_norm": 0.2990693151950836, + "learning_rate": 2.3711462480523293e-05, + "loss": 1.7434, + "step": 22349 + }, + { + "epoch": 6.860036832412523, + "grad_norm": 0.1976640820503235, + "learning_rate": 2.370723452144129e-05, + "loss": 1.6881, + "step": 22350 + }, + { + "epoch": 6.860343769183548, + "grad_norm": 0.24306917190551758, + "learning_rate": 2.3703006822199825e-05, + "loss": 1.7791, + "step": 22351 + }, + { + "epoch": 6.860650705954574, + "grad_norm": 0.20065687596797943, + "learning_rate": 2.3698779382840657e-05, + "loss": 1.7162, + "step": 22352 + }, + { + "epoch": 6.860957642725598, + "grad_norm": 0.21599936485290527, + "learning_rate": 2.3694552203405574e-05, + "loss": 1.7702, + "step": 22353 + }, + { + "epoch": 6.861264579496623, + "grad_norm": 0.16836890578269958, + "learning_rate": 2.3690325283936338e-05, + "loss": 1.6676, + "step": 22354 + }, + { + "epoch": 6.861571516267649, + "grad_norm": 0.1756831407546997, + "learning_rate": 2.368609862447473e-05, + "loss": 1.6934, + "step": 22355 + }, + { + "epoch": 6.861878453038674, + "grad_norm": 0.18676789104938507, + "learning_rate": 2.3681872225062517e-05, + "loss": 1.6879, + "step": 22356 + }, + { + "epoch": 6.862185389809699, + "grad_norm": 0.18018634617328644, + "learning_rate": 2.3677646085741473e-05, + "loss": 1.7143, + "step": 22357 + }, + { + "epoch": 6.862492326580725, + "grad_norm": 0.1789008378982544, + "learning_rate": 2.3673420206553332e-05, + "loss": 1.6914, + "step": 22358 + }, + { + "epoch": 6.862799263351749, + "grad_norm": 0.1869693398475647, + "learning_rate": 2.366919458753993e-05, + "loss": 1.7431, + "step": 22359 + }, + { + "epoch": 6.8631062001227745, + "grad_norm": 0.1958019733428955, + "learning_rate": 
2.3664969228742934e-05, + "loss": 1.7132, + "step": 22360 + }, + { + "epoch": 6.8634131368938, + "grad_norm": 0.199384868144989, + "learning_rate": 2.366074413020419e-05, + "loss": 1.7095, + "step": 22361 + }, + { + "epoch": 6.863720073664825, + "grad_norm": 0.2125246673822403, + "learning_rate": 2.365651929196539e-05, + "loss": 1.7125, + "step": 22362 + }, + { + "epoch": 6.8640270104358505, + "grad_norm": 0.1574707180261612, + "learning_rate": 2.3652294714068284e-05, + "loss": 1.6386, + "step": 22363 + }, + { + "epoch": 6.864333947206875, + "grad_norm": 0.30648529529571533, + "learning_rate": 2.364807039655469e-05, + "loss": 1.7665, + "step": 22364 + }, + { + "epoch": 6.8646408839779, + "grad_norm": 0.19746489822864532, + "learning_rate": 2.364384633946627e-05, + "loss": 1.6736, + "step": 22365 + }, + { + "epoch": 6.864947820748926, + "grad_norm": 0.25084391236305237, + "learning_rate": 2.3639622542844842e-05, + "loss": 1.7346, + "step": 22366 + }, + { + "epoch": 6.865254757519951, + "grad_norm": 0.1884133219718933, + "learning_rate": 2.3635399006732077e-05, + "loss": 1.6868, + "step": 22367 + }, + { + "epoch": 6.865561694290976, + "grad_norm": 0.21225856244564056, + "learning_rate": 2.3631175731169774e-05, + "loss": 1.7438, + "step": 22368 + }, + { + "epoch": 6.865868631062002, + "grad_norm": 0.1863771378993988, + "learning_rate": 2.3626952716199647e-05, + "loss": 1.7677, + "step": 22369 + }, + { + "epoch": 6.866175567833026, + "grad_norm": 0.1839088648557663, + "learning_rate": 2.362272996186343e-05, + "loss": 1.6902, + "step": 22370 + }, + { + "epoch": 6.866482504604051, + "grad_norm": 0.18304915726184845, + "learning_rate": 2.3618507468202856e-05, + "loss": 1.7142, + "step": 22371 + }, + { + "epoch": 6.866789441375077, + "grad_norm": 0.21228280663490295, + "learning_rate": 2.3614285235259655e-05, + "loss": 1.8277, + "step": 22372 + }, + { + "epoch": 6.867096378146102, + "grad_norm": 0.19515320658683777, + "learning_rate": 2.361006326307555e-05, + "loss": 
1.7029, + "step": 22373 + }, + { + "epoch": 6.867403314917127, + "grad_norm": 0.16277433931827545, + "learning_rate": 2.360584155169227e-05, + "loss": 1.672, + "step": 22374 + }, + { + "epoch": 6.867710251688152, + "grad_norm": 0.2180202454328537, + "learning_rate": 2.360162010115151e-05, + "loss": 1.7516, + "step": 22375 + }, + { + "epoch": 6.868017188459177, + "grad_norm": 0.17940378189086914, + "learning_rate": 2.3597398911495055e-05, + "loss": 1.6782, + "step": 22376 + }, + { + "epoch": 6.8683241252302025, + "grad_norm": 0.20751933753490448, + "learning_rate": 2.3593177982764543e-05, + "loss": 1.7954, + "step": 22377 + }, + { + "epoch": 6.868631062001228, + "grad_norm": 0.23098444938659668, + "learning_rate": 2.3588957315001758e-05, + "loss": 1.7472, + "step": 22378 + }, + { + "epoch": 6.868937998772253, + "grad_norm": 0.2351236343383789, + "learning_rate": 2.358473690824836e-05, + "loss": 1.7959, + "step": 22379 + }, + { + "epoch": 6.8692449355432785, + "grad_norm": 0.1890626847743988, + "learning_rate": 2.3580516762546055e-05, + "loss": 1.7015, + "step": 22380 + }, + { + "epoch": 6.869551872314303, + "grad_norm": 0.21120475232601166, + "learning_rate": 2.3576296877936604e-05, + "loss": 1.7998, + "step": 22381 + }, + { + "epoch": 6.869858809085328, + "grad_norm": 0.18141280114650726, + "learning_rate": 2.3572077254461638e-05, + "loss": 1.6973, + "step": 22382 + }, + { + "epoch": 6.870165745856354, + "grad_norm": 0.19084444642066956, + "learning_rate": 2.356785789216293e-05, + "loss": 1.6853, + "step": 22383 + }, + { + "epoch": 6.870472682627379, + "grad_norm": 0.18046700954437256, + "learning_rate": 2.356363879108211e-05, + "loss": 1.7476, + "step": 22384 + }, + { + "epoch": 6.870779619398404, + "grad_norm": 0.19875061511993408, + "learning_rate": 2.3559419951260926e-05, + "loss": 1.7223, + "step": 22385 + }, + { + "epoch": 6.871086556169429, + "grad_norm": 0.2377827763557434, + "learning_rate": 2.3555201372741047e-05, + "loss": 1.7976, + "step": 22386 + }, + 
{ + "epoch": 6.871393492940454, + "grad_norm": 0.17645993828773499, + "learning_rate": 2.3550983055564168e-05, + "loss": 1.6726, + "step": 22387 + }, + { + "epoch": 6.871700429711479, + "grad_norm": 0.19499735534191132, + "learning_rate": 2.3546764999771976e-05, + "loss": 1.67, + "step": 22388 + }, + { + "epoch": 6.872007366482505, + "grad_norm": 0.22010546922683716, + "learning_rate": 2.3542547205406163e-05, + "loss": 1.8461, + "step": 22389 + }, + { + "epoch": 6.87231430325353, + "grad_norm": 0.2101692259311676, + "learning_rate": 2.3538329672508396e-05, + "loss": 1.6922, + "step": 22390 + }, + { + "epoch": 6.872621240024555, + "grad_norm": 0.1926269382238388, + "learning_rate": 2.3534112401120372e-05, + "loss": 1.6934, + "step": 22391 + }, + { + "epoch": 6.87292817679558, + "grad_norm": 0.20662687718868256, + "learning_rate": 2.3529895391283742e-05, + "loss": 1.7284, + "step": 22392 + }, + { + "epoch": 6.873235113566605, + "grad_norm": 0.2392960786819458, + "learning_rate": 2.3525678643040235e-05, + "loss": 1.7207, + "step": 22393 + }, + { + "epoch": 6.8735420503376305, + "grad_norm": 0.2067870795726776, + "learning_rate": 2.3521462156431452e-05, + "loss": 1.7269, + "step": 22394 + }, + { + "epoch": 6.873848987108656, + "grad_norm": 0.2544265687465668, + "learning_rate": 2.351724593149914e-05, + "loss": 1.7358, + "step": 22395 + }, + { + "epoch": 6.87415592387968, + "grad_norm": 0.2243366837501526, + "learning_rate": 2.3513029968284907e-05, + "loss": 1.7625, + "step": 22396 + }, + { + "epoch": 6.874462860650706, + "grad_norm": 0.23003467917442322, + "learning_rate": 2.3508814266830414e-05, + "loss": 1.6943, + "step": 22397 + }, + { + "epoch": 6.874769797421731, + "grad_norm": 0.19257886707782745, + "learning_rate": 2.3504598827177383e-05, + "loss": 1.7393, + "step": 22398 + }, + { + "epoch": 6.875076734192756, + "grad_norm": 0.23782171308994293, + "learning_rate": 2.3500383649367404e-05, + "loss": 1.7758, + "step": 22399 + }, + { + "epoch": 6.875383670963782, + 
"grad_norm": 0.18137066066265106, + "learning_rate": 2.3496168733442197e-05, + "loss": 1.7083, + "step": 22400 + }, + { + "epoch": 6.875690607734807, + "grad_norm": 0.21970662474632263, + "learning_rate": 2.3491954079443344e-05, + "loss": 1.7552, + "step": 22401 + }, + { + "epoch": 6.8759975445058314, + "grad_norm": 0.2032134085893631, + "learning_rate": 2.3487739687412562e-05, + "loss": 1.7653, + "step": 22402 + }, + { + "epoch": 6.876304481276857, + "grad_norm": 0.22016118466854095, + "learning_rate": 2.348352555739148e-05, + "loss": 1.7277, + "step": 22403 + }, + { + "epoch": 6.876611418047882, + "grad_norm": 0.2250203788280487, + "learning_rate": 2.3479311689421736e-05, + "loss": 1.7451, + "step": 22404 + }, + { + "epoch": 6.8769183548189075, + "grad_norm": 0.19726359844207764, + "learning_rate": 2.3475098083544977e-05, + "loss": 1.728, + "step": 22405 + }, + { + "epoch": 6.877225291589933, + "grad_norm": 0.21295994520187378, + "learning_rate": 2.3470884739802844e-05, + "loss": 1.7438, + "step": 22406 + }, + { + "epoch": 6.877532228360957, + "grad_norm": 0.19653508067131042, + "learning_rate": 2.346667165823698e-05, + "loss": 1.7189, + "step": 22407 + }, + { + "epoch": 6.877839165131983, + "grad_norm": 0.21406517922878265, + "learning_rate": 2.3462458838889016e-05, + "loss": 1.7475, + "step": 22408 + }, + { + "epoch": 6.878146101903008, + "grad_norm": 0.20569753646850586, + "learning_rate": 2.3458246281800595e-05, + "loss": 1.7262, + "step": 22409 + }, + { + "epoch": 6.878453038674033, + "grad_norm": 0.19365517795085907, + "learning_rate": 2.3454033987013334e-05, + "loss": 1.6938, + "step": 22410 + }, + { + "epoch": 6.878759975445059, + "grad_norm": 0.20935405790805817, + "learning_rate": 2.344982195456885e-05, + "loss": 1.724, + "step": 22411 + }, + { + "epoch": 6.879066912216084, + "grad_norm": 0.2104228436946869, + "learning_rate": 2.3445610184508826e-05, + "loss": 1.7474, + "step": 22412 + }, + { + "epoch": 6.879373848987108, + "grad_norm": 
0.19795742630958557, + "learning_rate": 2.3441398676874826e-05, + "loss": 1.7572, + "step": 22413 + }, + { + "epoch": 6.879680785758134, + "grad_norm": 0.20640577375888824, + "learning_rate": 2.3437187431708472e-05, + "loss": 1.7258, + "step": 22414 + }, + { + "epoch": 6.879987722529159, + "grad_norm": 0.2092565894126892, + "learning_rate": 2.3432976449051442e-05, + "loss": 1.7437, + "step": 22415 + }, + { + "epoch": 6.880294659300184, + "grad_norm": 0.2083825170993805, + "learning_rate": 2.3428765728945275e-05, + "loss": 1.7127, + "step": 22416 + }, + { + "epoch": 6.88060159607121, + "grad_norm": 0.20619866251945496, + "learning_rate": 2.3424555271431647e-05, + "loss": 1.7729, + "step": 22417 + }, + { + "epoch": 6.880908532842234, + "grad_norm": 0.22689959406852722, + "learning_rate": 2.3420345076552107e-05, + "loss": 1.7142, + "step": 22418 + }, + { + "epoch": 6.8812154696132595, + "grad_norm": 0.16664449870586395, + "learning_rate": 2.3416135144348316e-05, + "loss": 1.6857, + "step": 22419 + }, + { + "epoch": 6.881522406384285, + "grad_norm": 0.1895827353000641, + "learning_rate": 2.3411925474861856e-05, + "loss": 1.7075, + "step": 22420 + }, + { + "epoch": 6.88182934315531, + "grad_norm": 0.2058400958776474, + "learning_rate": 2.3407716068134334e-05, + "loss": 1.7623, + "step": 22421 + }, + { + "epoch": 6.8821362799263355, + "grad_norm": 0.18390826880931854, + "learning_rate": 2.3403506924207346e-05, + "loss": 1.6686, + "step": 22422 + }, + { + "epoch": 6.882443216697361, + "grad_norm": 0.1742098331451416, + "learning_rate": 2.3399298043122497e-05, + "loss": 1.6846, + "step": 22423 + }, + { + "epoch": 6.882750153468385, + "grad_norm": 0.18958622217178345, + "learning_rate": 2.3395089424921368e-05, + "loss": 1.7603, + "step": 22424 + }, + { + "epoch": 6.883057090239411, + "grad_norm": 0.21827174723148346, + "learning_rate": 2.3390881069645564e-05, + "loss": 1.6706, + "step": 22425 + }, + { + "epoch": 6.883364027010436, + "grad_norm": 0.17859303951263428, + 
"learning_rate": 2.338667297733667e-05, + "loss": 1.7612, + "step": 22426 + }, + { + "epoch": 6.883670963781461, + "grad_norm": 0.22383756935596466, + "learning_rate": 2.338246514803627e-05, + "loss": 1.7507, + "step": 22427 + }, + { + "epoch": 6.883977900552486, + "grad_norm": 0.20317313075065613, + "learning_rate": 2.3378257581785934e-05, + "loss": 1.6912, + "step": 22428 + }, + { + "epoch": 6.884284837323511, + "grad_norm": 0.20238614082336426, + "learning_rate": 2.3374050278627297e-05, + "loss": 1.7336, + "step": 22429 + }, + { + "epoch": 6.884591774094536, + "grad_norm": 0.2134159654378891, + "learning_rate": 2.336984323860188e-05, + "loss": 1.7252, + "step": 22430 + }, + { + "epoch": 6.884898710865562, + "grad_norm": 0.17153076827526093, + "learning_rate": 2.3365636461751277e-05, + "loss": 1.6769, + "step": 22431 + }, + { + "epoch": 6.885205647636587, + "grad_norm": 0.19001254439353943, + "learning_rate": 2.3361429948117075e-05, + "loss": 1.7812, + "step": 22432 + }, + { + "epoch": 6.885512584407612, + "grad_norm": 0.2074522078037262, + "learning_rate": 2.335722369774081e-05, + "loss": 1.7433, + "step": 22433 + }, + { + "epoch": 6.885819521178637, + "grad_norm": 0.22863705456256866, + "learning_rate": 2.3353017710664117e-05, + "loss": 1.7476, + "step": 22434 + }, + { + "epoch": 6.886126457949662, + "grad_norm": 0.19350804388523102, + "learning_rate": 2.334881198692848e-05, + "loss": 1.7071, + "step": 22435 + }, + { + "epoch": 6.8864333947206875, + "grad_norm": 0.22915633022785187, + "learning_rate": 2.3344606526575524e-05, + "loss": 1.7283, + "step": 22436 + }, + { + "epoch": 6.886740331491713, + "grad_norm": 0.21576058864593506, + "learning_rate": 2.3340401329646795e-05, + "loss": 1.7062, + "step": 22437 + }, + { + "epoch": 6.887047268262738, + "grad_norm": 0.17844067513942719, + "learning_rate": 2.333619639618384e-05, + "loss": 1.6994, + "step": 22438 + }, + { + "epoch": 6.887354205033763, + "grad_norm": 0.21019738912582397, + "learning_rate": 
2.333199172622822e-05, + "loss": 1.6654, + "step": 22439 + }, + { + "epoch": 6.887661141804788, + "grad_norm": 0.1901654452085495, + "learning_rate": 2.3327787319821486e-05, + "loss": 1.7847, + "step": 22440 + }, + { + "epoch": 6.887968078575813, + "grad_norm": 0.21838930249214172, + "learning_rate": 2.3323583177005198e-05, + "loss": 1.6517, + "step": 22441 + }, + { + "epoch": 6.888275015346839, + "grad_norm": 0.16078172624111176, + "learning_rate": 2.3319379297820892e-05, + "loss": 1.7052, + "step": 22442 + }, + { + "epoch": 6.888581952117864, + "grad_norm": 0.19161897897720337, + "learning_rate": 2.331517568231012e-05, + "loss": 1.675, + "step": 22443 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 0.1874416172504425, + "learning_rate": 2.331097233051442e-05, + "loss": 1.7025, + "step": 22444 + }, + { + "epoch": 6.889195825659914, + "grad_norm": 0.1817546933889389, + "learning_rate": 2.3306769242475318e-05, + "loss": 1.7103, + "step": 22445 + }, + { + "epoch": 6.889502762430939, + "grad_norm": 0.18423372507095337, + "learning_rate": 2.3302566418234406e-05, + "loss": 1.6883, + "step": 22446 + }, + { + "epoch": 6.889809699201964, + "grad_norm": 0.1712140440940857, + "learning_rate": 2.3298363857833162e-05, + "loss": 1.7076, + "step": 22447 + }, + { + "epoch": 6.89011663597299, + "grad_norm": 0.15992864966392517, + "learning_rate": 2.3294161561313133e-05, + "loss": 1.6514, + "step": 22448 + }, + { + "epoch": 6.890423572744015, + "grad_norm": 0.24126072227954865, + "learning_rate": 2.3289959528715855e-05, + "loss": 1.7385, + "step": 22449 + }, + { + "epoch": 6.8907305095150395, + "grad_norm": 0.18130798637866974, + "learning_rate": 2.3285757760082832e-05, + "loss": 1.691, + "step": 22450 + }, + { + "epoch": 6.891037446286065, + "grad_norm": 0.20070049166679382, + "learning_rate": 2.3281556255455644e-05, + "loss": 1.7166, + "step": 22451 + }, + { + "epoch": 6.89134438305709, + "grad_norm": 0.20706996321678162, + "learning_rate": 2.327735501487574e-05, + "loss": 
1.6763, + "step": 22452 + }, + { + "epoch": 6.8916513198281155, + "grad_norm": 0.22404810786247253, + "learning_rate": 2.327315403838472e-05, + "loss": 1.761, + "step": 22453 + }, + { + "epoch": 6.891958256599141, + "grad_norm": 0.21240194141864777, + "learning_rate": 2.3268953326024013e-05, + "loss": 1.7038, + "step": 22454 + }, + { + "epoch": 6.892265193370166, + "grad_norm": 0.24251966178417206, + "learning_rate": 2.32647528778352e-05, + "loss": 1.7829, + "step": 22455 + }, + { + "epoch": 6.892572130141191, + "grad_norm": 0.21213467419147491, + "learning_rate": 2.3260552693859765e-05, + "loss": 1.7433, + "step": 22456 + }, + { + "epoch": 6.892879066912216, + "grad_norm": 0.18008530139923096, + "learning_rate": 2.325635277413922e-05, + "loss": 1.7238, + "step": 22457 + }, + { + "epoch": 6.893186003683241, + "grad_norm": 0.18252789974212646, + "learning_rate": 2.325215311871508e-05, + "loss": 1.7143, + "step": 22458 + }, + { + "epoch": 6.893492940454267, + "grad_norm": 0.17830567061901093, + "learning_rate": 2.3247953727628833e-05, + "loss": 1.687, + "step": 22459 + }, + { + "epoch": 6.893799877225292, + "grad_norm": 0.19980686902999878, + "learning_rate": 2.3243754600921992e-05, + "loss": 1.7096, + "step": 22460 + }, + { + "epoch": 6.894106813996316, + "grad_norm": 0.1713438183069229, + "learning_rate": 2.3239555738636044e-05, + "loss": 1.6791, + "step": 22461 + }, + { + "epoch": 6.894413750767342, + "grad_norm": 0.17678281664848328, + "learning_rate": 2.3235357140812475e-05, + "loss": 1.6689, + "step": 22462 + }, + { + "epoch": 6.894720687538367, + "grad_norm": 0.20409992337226868, + "learning_rate": 2.3231158807492837e-05, + "loss": 1.7746, + "step": 22463 + }, + { + "epoch": 6.895027624309392, + "grad_norm": 0.19227825105190277, + "learning_rate": 2.3226960738718552e-05, + "loss": 1.7101, + "step": 22464 + }, + { + "epoch": 6.895334561080418, + "grad_norm": 0.24029433727264404, + "learning_rate": 2.3222762934531132e-05, + "loss": 1.7842, + "step": 22465 + }, + 
{ + "epoch": 6.895641497851443, + "grad_norm": 0.21887856721878052, + "learning_rate": 2.321856539497207e-05, + "loss": 1.7032, + "step": 22466 + }, + { + "epoch": 6.8959484346224675, + "grad_norm": 0.17346082627773285, + "learning_rate": 2.321436812008282e-05, + "loss": 1.683, + "step": 22467 + }, + { + "epoch": 6.896255371393493, + "grad_norm": 0.18920177221298218, + "learning_rate": 2.3210171109904914e-05, + "loss": 1.7057, + "step": 22468 + }, + { + "epoch": 6.896562308164518, + "grad_norm": 0.21199388802051544, + "learning_rate": 2.320597436447977e-05, + "loss": 1.7534, + "step": 22469 + }, + { + "epoch": 6.8968692449355435, + "grad_norm": 0.1867530792951584, + "learning_rate": 2.320177788384893e-05, + "loss": 1.7185, + "step": 22470 + }, + { + "epoch": 6.897176181706568, + "grad_norm": 0.21009495854377747, + "learning_rate": 2.3197581668053785e-05, + "loss": 1.7379, + "step": 22471 + }, + { + "epoch": 6.897483118477593, + "grad_norm": 0.20078743994235992, + "learning_rate": 2.3193385717135874e-05, + "loss": 1.7226, + "step": 22472 + }, + { + "epoch": 6.897790055248619, + "grad_norm": 0.2135045975446701, + "learning_rate": 2.318919003113663e-05, + "loss": 1.7531, + "step": 22473 + }, + { + "epoch": 6.898096992019644, + "grad_norm": 0.18811136484146118, + "learning_rate": 2.3184994610097526e-05, + "loss": 1.6542, + "step": 22474 + }, + { + "epoch": 6.898403928790669, + "grad_norm": 0.2323937565088272, + "learning_rate": 2.3180799454060025e-05, + "loss": 1.7369, + "step": 22475 + }, + { + "epoch": 6.898710865561695, + "grad_norm": 0.19270992279052734, + "learning_rate": 2.317660456306558e-05, + "loss": 1.6818, + "step": 22476 + }, + { + "epoch": 6.899017802332719, + "grad_norm": 0.18951043486595154, + "learning_rate": 2.3172409937155654e-05, + "loss": 1.7183, + "step": 22477 + }, + { + "epoch": 6.899324739103744, + "grad_norm": 0.1758934110403061, + "learning_rate": 2.3168215576371694e-05, + "loss": 1.6826, + "step": 22478 + }, + { + "epoch": 6.89963167587477, + 
"grad_norm": 0.2048143893480301, + "learning_rate": 2.3164021480755133e-05, + "loss": 1.7769, + "step": 22479 + }, + { + "epoch": 6.899938612645795, + "grad_norm": 0.20538486540317535, + "learning_rate": 2.315982765034748e-05, + "loss": 1.7035, + "step": 22480 + }, + { + "epoch": 6.9002455494168204, + "grad_norm": 0.18417708575725555, + "learning_rate": 2.3155634085190124e-05, + "loss": 1.7533, + "step": 22481 + }, + { + "epoch": 6.900552486187845, + "grad_norm": 0.1978628784418106, + "learning_rate": 2.315144078532453e-05, + "loss": 1.691, + "step": 22482 + }, + { + "epoch": 6.90085942295887, + "grad_norm": 0.17665794491767883, + "learning_rate": 2.3147247750792128e-05, + "loss": 1.7018, + "step": 22483 + }, + { + "epoch": 6.901166359729896, + "grad_norm": 0.20218273997306824, + "learning_rate": 2.314305498163435e-05, + "loss": 1.7277, + "step": 22484 + }, + { + "epoch": 6.901473296500921, + "grad_norm": 0.18791642785072327, + "learning_rate": 2.3138862477892674e-05, + "loss": 1.7247, + "step": 22485 + }, + { + "epoch": 6.901780233271946, + "grad_norm": 0.1945842206478119, + "learning_rate": 2.313467023960847e-05, + "loss": 1.6648, + "step": 22486 + }, + { + "epoch": 6.902087170042972, + "grad_norm": 0.1871321201324463, + "learning_rate": 2.3130478266823237e-05, + "loss": 1.6978, + "step": 22487 + }, + { + "epoch": 6.902394106813996, + "grad_norm": 0.20094287395477295, + "learning_rate": 2.312628655957833e-05, + "loss": 1.7763, + "step": 22488 + }, + { + "epoch": 6.902701043585021, + "grad_norm": 0.1804366111755371, + "learning_rate": 2.3122095117915226e-05, + "loss": 1.689, + "step": 22489 + }, + { + "epoch": 6.903007980356047, + "grad_norm": 0.1846652776002884, + "learning_rate": 2.311790394187534e-05, + "loss": 1.7088, + "step": 22490 + }, + { + "epoch": 6.903314917127072, + "grad_norm": 0.18339675664901733, + "learning_rate": 2.311371303150008e-05, + "loss": 1.6974, + "step": 22491 + }, + { + "epoch": 6.903621853898097, + "grad_norm": 0.21333162486553192, + 
"learning_rate": 2.3109522386830863e-05, + "loss": 1.7614, + "step": 22492 + }, + { + "epoch": 6.903928790669122, + "grad_norm": 0.19845318794250488, + "learning_rate": 2.3105332007909104e-05, + "loss": 1.6895, + "step": 22493 + }, + { + "epoch": 6.904235727440147, + "grad_norm": 0.21082347631454468, + "learning_rate": 2.3101141894776224e-05, + "loss": 1.7397, + "step": 22494 + }, + { + "epoch": 6.9045426642111725, + "grad_norm": 0.16360893845558167, + "learning_rate": 2.3096952047473623e-05, + "loss": 1.6716, + "step": 22495 + }, + { + "epoch": 6.904849600982198, + "grad_norm": 0.2287478744983673, + "learning_rate": 2.3092762466042687e-05, + "loss": 1.7673, + "step": 22496 + }, + { + "epoch": 6.905156537753223, + "grad_norm": 0.17231078445911407, + "learning_rate": 2.308857315052489e-05, + "loss": 1.6744, + "step": 22497 + }, + { + "epoch": 6.9054634745242485, + "grad_norm": 0.2887173295021057, + "learning_rate": 2.3084384100961565e-05, + "loss": 1.7358, + "step": 22498 + }, + { + "epoch": 6.905770411295273, + "grad_norm": 0.1977192759513855, + "learning_rate": 2.3080195317394127e-05, + "loss": 1.7514, + "step": 22499 + }, + { + "epoch": 6.906077348066298, + "grad_norm": 0.24933035671710968, + "learning_rate": 2.307600679986398e-05, + "loss": 1.6845, + "step": 22500 + }, + { + "epoch": 6.906384284837324, + "grad_norm": 0.17288708686828613, + "learning_rate": 2.30718185484125e-05, + "loss": 1.7211, + "step": 22501 + }, + { + "epoch": 6.906691221608349, + "grad_norm": 0.22192007303237915, + "learning_rate": 2.306763056308112e-05, + "loss": 1.6924, + "step": 22502 + }, + { + "epoch": 6.906998158379373, + "grad_norm": 0.20500123500823975, + "learning_rate": 2.3063442843911172e-05, + "loss": 1.7412, + "step": 22503 + }, + { + "epoch": 6.907305095150399, + "grad_norm": 0.30658698081970215, + "learning_rate": 2.30592553909441e-05, + "loss": 1.7965, + "step": 22504 + }, + { + "epoch": 6.907612031921424, + "grad_norm": 0.177829772233963, + "learning_rate": 
2.3055068204221224e-05, + "loss": 1.6914, + "step": 22505 + }, + { + "epoch": 6.907918968692449, + "grad_norm": 0.20281876623630524, + "learning_rate": 2.3050881283783977e-05, + "loss": 1.6946, + "step": 22506 + }, + { + "epoch": 6.908225905463475, + "grad_norm": 0.16111700236797333, + "learning_rate": 2.3046694629673716e-05, + "loss": 1.7004, + "step": 22507 + }, + { + "epoch": 6.9085328422345, + "grad_norm": 0.1911575049161911, + "learning_rate": 2.3042508241931814e-05, + "loss": 1.7013, + "step": 22508 + }, + { + "epoch": 6.9088397790055245, + "grad_norm": 0.17862342298030853, + "learning_rate": 2.303832212059965e-05, + "loss": 1.7053, + "step": 22509 + }, + { + "epoch": 6.90914671577655, + "grad_norm": 0.2268948256969452, + "learning_rate": 2.303413626571858e-05, + "loss": 1.7241, + "step": 22510 + }, + { + "epoch": 6.909453652547575, + "grad_norm": 0.1997457593679428, + "learning_rate": 2.3029950677329992e-05, + "loss": 1.6927, + "step": 22511 + }, + { + "epoch": 6.9097605893186005, + "grad_norm": 0.22120819985866547, + "learning_rate": 2.3025765355475232e-05, + "loss": 1.7447, + "step": 22512 + }, + { + "epoch": 6.910067526089626, + "grad_norm": 0.22097964584827423, + "learning_rate": 2.302158030019565e-05, + "loss": 1.7399, + "step": 22513 + }, + { + "epoch": 6.91037446286065, + "grad_norm": 0.2171044498682022, + "learning_rate": 2.3017395511532664e-05, + "loss": 1.7252, + "step": 22514 + }, + { + "epoch": 6.910681399631676, + "grad_norm": 0.1987348347902298, + "learning_rate": 2.301321098952757e-05, + "loss": 1.7071, + "step": 22515 + }, + { + "epoch": 6.910988336402701, + "grad_norm": 0.2131081372499466, + "learning_rate": 2.3009026734221746e-05, + "loss": 1.7314, + "step": 22516 + }, + { + "epoch": 6.911295273173726, + "grad_norm": 0.18867900967597961, + "learning_rate": 2.3004842745656536e-05, + "loss": 1.7431, + "step": 22517 + }, + { + "epoch": 6.911602209944752, + "grad_norm": 0.22853058576583862, + "learning_rate": 2.3000659023873277e-05, + "loss": 
1.7234, + "step": 22518 + }, + { + "epoch": 6.911909146715777, + "grad_norm": 0.23441165685653687, + "learning_rate": 2.2996475568913366e-05, + "loss": 1.7535, + "step": 22519 + }, + { + "epoch": 6.912216083486801, + "grad_norm": 0.2376382052898407, + "learning_rate": 2.299229238081807e-05, + "loss": 1.7582, + "step": 22520 + }, + { + "epoch": 6.912523020257827, + "grad_norm": 0.2571510076522827, + "learning_rate": 2.2988109459628814e-05, + "loss": 1.722, + "step": 22521 + }, + { + "epoch": 6.912829957028852, + "grad_norm": 0.19782103598117828, + "learning_rate": 2.298392680538685e-05, + "loss": 1.7052, + "step": 22522 + }, + { + "epoch": 6.913136893799877, + "grad_norm": 0.24070625007152557, + "learning_rate": 2.297974441813358e-05, + "loss": 1.7306, + "step": 22523 + }, + { + "epoch": 6.913443830570903, + "grad_norm": 0.1783500611782074, + "learning_rate": 2.2975562297910307e-05, + "loss": 1.7077, + "step": 22524 + }, + { + "epoch": 6.913750767341927, + "grad_norm": 0.19469089806079865, + "learning_rate": 2.2971380444758373e-05, + "loss": 1.7275, + "step": 22525 + }, + { + "epoch": 6.9140577041129525, + "grad_norm": 0.21449480950832367, + "learning_rate": 2.2967198858719092e-05, + "loss": 1.7682, + "step": 22526 + }, + { + "epoch": 6.914364640883978, + "grad_norm": 0.21686261892318726, + "learning_rate": 2.2963017539833803e-05, + "loss": 1.6794, + "step": 22527 + }, + { + "epoch": 6.914671577655003, + "grad_norm": 0.2061273604631424, + "learning_rate": 2.2958836488143813e-05, + "loss": 1.7612, + "step": 22528 + }, + { + "epoch": 6.9149785144260285, + "grad_norm": 0.2708517611026764, + "learning_rate": 2.295465570369046e-05, + "loss": 1.7291, + "step": 22529 + }, + { + "epoch": 6.915285451197054, + "grad_norm": 0.17011860013008118, + "learning_rate": 2.295047518651503e-05, + "loss": 1.6541, + "step": 22530 + }, + { + "epoch": 6.915592387968078, + "grad_norm": 0.255305677652359, + "learning_rate": 2.294629493665889e-05, + "loss": 1.7063, + "step": 22531 + }, + { + 
"epoch": 6.915899324739104, + "grad_norm": 0.20172207057476044, + "learning_rate": 2.2942114954163306e-05, + "loss": 1.6678, + "step": 22532 + }, + { + "epoch": 6.916206261510129, + "grad_norm": 0.23726679384708405, + "learning_rate": 2.2937935239069603e-05, + "loss": 1.6762, + "step": 22533 + }, + { + "epoch": 6.916513198281154, + "grad_norm": 0.17716684937477112, + "learning_rate": 2.2933755791419082e-05, + "loss": 1.7302, + "step": 22534 + }, + { + "epoch": 6.91682013505218, + "grad_norm": 0.2513270974159241, + "learning_rate": 2.2929576611253035e-05, + "loss": 1.7371, + "step": 22535 + }, + { + "epoch": 6.917127071823204, + "grad_norm": 0.21994394063949585, + "learning_rate": 2.292539769861281e-05, + "loss": 1.7007, + "step": 22536 + }, + { + "epoch": 6.917434008594229, + "grad_norm": 0.2095540314912796, + "learning_rate": 2.292121905353964e-05, + "loss": 1.71, + "step": 22537 + }, + { + "epoch": 6.917740945365255, + "grad_norm": 0.24400855600833893, + "learning_rate": 2.2917040676074892e-05, + "loss": 1.7859, + "step": 22538 + }, + { + "epoch": 6.91804788213628, + "grad_norm": 0.23217935860157013, + "learning_rate": 2.2912862566259785e-05, + "loss": 1.8218, + "step": 22539 + }, + { + "epoch": 6.918354818907305, + "grad_norm": 0.23555497825145721, + "learning_rate": 2.2908684724135666e-05, + "loss": 1.7145, + "step": 22540 + }, + { + "epoch": 6.918661755678331, + "grad_norm": 0.17844347655773163, + "learning_rate": 2.2904507149743804e-05, + "loss": 1.6767, + "step": 22541 + }, + { + "epoch": 6.918968692449355, + "grad_norm": 0.20810428261756897, + "learning_rate": 2.290032984312548e-05, + "loss": 1.7359, + "step": 22542 + }, + { + "epoch": 6.9192756292203805, + "grad_norm": 0.20082542300224304, + "learning_rate": 2.289615280432198e-05, + "loss": 1.7623, + "step": 22543 + }, + { + "epoch": 6.919582565991406, + "grad_norm": 0.2005007117986679, + "learning_rate": 2.2891976033374584e-05, + "loss": 1.745, + "step": 22544 + }, + { + "epoch": 6.919889502762431, + 
"grad_norm": 0.18054969608783722, + "learning_rate": 2.2887799530324572e-05, + "loss": 1.6959, + "step": 22545 + }, + { + "epoch": 6.920196439533456, + "grad_norm": 0.18410442769527435, + "learning_rate": 2.2883623295213214e-05, + "loss": 1.7052, + "step": 22546 + }, + { + "epoch": 6.920503376304481, + "grad_norm": 0.17380426824092865, + "learning_rate": 2.2879447328081765e-05, + "loss": 1.6735, + "step": 22547 + }, + { + "epoch": 6.920810313075506, + "grad_norm": 0.19082246720790863, + "learning_rate": 2.2875271628971557e-05, + "loss": 1.7192, + "step": 22548 + }, + { + "epoch": 6.921117249846532, + "grad_norm": 0.17682792246341705, + "learning_rate": 2.2871096197923784e-05, + "loss": 1.649, + "step": 22549 + }, + { + "epoch": 6.921424186617557, + "grad_norm": 0.19127340614795685, + "learning_rate": 2.286692103497975e-05, + "loss": 1.7366, + "step": 22550 + }, + { + "epoch": 6.921731123388582, + "grad_norm": 0.1636040210723877, + "learning_rate": 2.2862746140180696e-05, + "loss": 1.6749, + "step": 22551 + }, + { + "epoch": 6.922038060159607, + "grad_norm": 0.2121013104915619, + "learning_rate": 2.285857151356788e-05, + "loss": 1.7342, + "step": 22552 + }, + { + "epoch": 6.922344996930632, + "grad_norm": 0.19183295965194702, + "learning_rate": 2.28543971551826e-05, + "loss": 1.7506, + "step": 22553 + }, + { + "epoch": 6.922651933701657, + "grad_norm": 0.23838891088962555, + "learning_rate": 2.285022306506604e-05, + "loss": 1.6875, + "step": 22554 + }, + { + "epoch": 6.922958870472683, + "grad_norm": 0.17147624492645264, + "learning_rate": 2.2846049243259526e-05, + "loss": 1.7074, + "step": 22555 + }, + { + "epoch": 6.923265807243708, + "grad_norm": 0.2254270762205124, + "learning_rate": 2.2841875689804236e-05, + "loss": 1.7589, + "step": 22556 + }, + { + "epoch": 6.9235727440147325, + "grad_norm": 0.249015673995018, + "learning_rate": 2.2837702404741462e-05, + "loss": 1.7708, + "step": 22557 + }, + { + "epoch": 6.923879680785758, + "grad_norm": 0.19401927292346954, 
+ "learning_rate": 2.283352938811244e-05, + "loss": 1.696, + "step": 22558 + }, + { + "epoch": 6.924186617556783, + "grad_norm": 0.21134993433952332, + "learning_rate": 2.2829356639958398e-05, + "loss": 1.7136, + "step": 22559 + }, + { + "epoch": 6.9244935543278086, + "grad_norm": 0.17600105702877045, + "learning_rate": 2.2825184160320578e-05, + "loss": 1.679, + "step": 22560 + }, + { + "epoch": 6.924800491098834, + "grad_norm": 0.2426912486553192, + "learning_rate": 2.282101194924022e-05, + "loss": 1.7011, + "step": 22561 + }, + { + "epoch": 6.925107427869859, + "grad_norm": 0.20040342211723328, + "learning_rate": 2.281684000675855e-05, + "loss": 1.6844, + "step": 22562 + }, + { + "epoch": 6.925414364640884, + "grad_norm": 0.23790770769119263, + "learning_rate": 2.2812668332916798e-05, + "loss": 1.7318, + "step": 22563 + }, + { + "epoch": 6.925721301411909, + "grad_norm": 0.21387948095798492, + "learning_rate": 2.2808496927756196e-05, + "loss": 1.6903, + "step": 22564 + }, + { + "epoch": 6.926028238182934, + "grad_norm": 0.20471405982971191, + "learning_rate": 2.280432579131796e-05, + "loss": 1.7231, + "step": 22565 + }, + { + "epoch": 6.92633517495396, + "grad_norm": 0.1953156590461731, + "learning_rate": 2.280015492364332e-05, + "loss": 1.7322, + "step": 22566 + }, + { + "epoch": 6.926642111724985, + "grad_norm": 0.3107415437698364, + "learning_rate": 2.279598432477349e-05, + "loss": 1.7833, + "step": 22567 + }, + { + "epoch": 6.9269490484960095, + "grad_norm": 0.2114095836877823, + "learning_rate": 2.279181399474969e-05, + "loss": 1.6923, + "step": 22568 + }, + { + "epoch": 6.927255985267035, + "grad_norm": 0.21373972296714783, + "learning_rate": 2.2787643933613107e-05, + "loss": 1.6897, + "step": 22569 + }, + { + "epoch": 6.92756292203806, + "grad_norm": 0.17955096065998077, + "learning_rate": 2.278347414140502e-05, + "loss": 1.7443, + "step": 22570 + }, + { + "epoch": 6.9278698588090855, + "grad_norm": 0.19275230169296265, + "learning_rate": 
2.2779304618166554e-05, + "loss": 1.7109, + "step": 22571 + }, + { + "epoch": 6.928176795580111, + "grad_norm": 0.16774436831474304, + "learning_rate": 2.277513536393899e-05, + "loss": 1.7059, + "step": 22572 + }, + { + "epoch": 6.928483732351136, + "grad_norm": 0.25093573331832886, + "learning_rate": 2.2770966378763457e-05, + "loss": 1.7501, + "step": 22573 + }, + { + "epoch": 6.928790669122161, + "grad_norm": 0.24859540164470673, + "learning_rate": 2.2766797662681216e-05, + "loss": 1.7315, + "step": 22574 + }, + { + "epoch": 6.929097605893186, + "grad_norm": 0.1736115962266922, + "learning_rate": 2.2762629215733438e-05, + "loss": 1.7422, + "step": 22575 + }, + { + "epoch": 6.929404542664211, + "grad_norm": 0.23705001175403595, + "learning_rate": 2.2758461037961326e-05, + "loss": 1.7818, + "step": 22576 + }, + { + "epoch": 6.929711479435237, + "grad_norm": 0.21123656630516052, + "learning_rate": 2.2754293129406073e-05, + "loss": 1.7652, + "step": 22577 + }, + { + "epoch": 6.930018416206261, + "grad_norm": 0.2195751667022705, + "learning_rate": 2.2750125490108858e-05, + "loss": 1.7103, + "step": 22578 + }, + { + "epoch": 6.930325352977286, + "grad_norm": 0.17324887216091156, + "learning_rate": 2.274595812011088e-05, + "loss": 1.7386, + "step": 22579 + }, + { + "epoch": 6.930632289748312, + "grad_norm": 0.3175726532936096, + "learning_rate": 2.2741791019453313e-05, + "loss": 1.7608, + "step": 22580 + }, + { + "epoch": 6.930939226519337, + "grad_norm": 0.26266980171203613, + "learning_rate": 2.273762418817734e-05, + "loss": 1.691, + "step": 22581 + }, + { + "epoch": 6.931246163290362, + "grad_norm": 0.21905983984470367, + "learning_rate": 2.273345762632415e-05, + "loss": 1.6886, + "step": 22582 + }, + { + "epoch": 6.931553100061388, + "grad_norm": 0.2201247364282608, + "learning_rate": 2.2729291333934914e-05, + "loss": 1.7313, + "step": 22583 + }, + { + "epoch": 6.931860036832412, + "grad_norm": 0.2844204306602478, + "learning_rate": 2.2725125311050805e-05, + "loss": 
1.6918, + "step": 22584 + }, + { + "epoch": 6.9321669736034375, + "grad_norm": 0.22451715171337128, + "learning_rate": 2.272095955771299e-05, + "loss": 1.699, + "step": 22585 + }, + { + "epoch": 6.932473910374463, + "grad_norm": 0.27357545495033264, + "learning_rate": 2.2716794073962645e-05, + "loss": 1.7709, + "step": 22586 + }, + { + "epoch": 6.932780847145488, + "grad_norm": 0.2605188190937042, + "learning_rate": 2.271262885984093e-05, + "loss": 1.7812, + "step": 22587 + }, + { + "epoch": 6.9330877839165135, + "grad_norm": 0.1866278201341629, + "learning_rate": 2.270846391538899e-05, + "loss": 1.7204, + "step": 22588 + }, + { + "epoch": 6.933394720687538, + "grad_norm": 0.24624690413475037, + "learning_rate": 2.2704299240648043e-05, + "loss": 1.7345, + "step": 22589 + }, + { + "epoch": 6.933701657458563, + "grad_norm": 0.18003861606121063, + "learning_rate": 2.2700134835659175e-05, + "loss": 1.73, + "step": 22590 + }, + { + "epoch": 6.934008594229589, + "grad_norm": 0.2330949604511261, + "learning_rate": 2.269597070046359e-05, + "loss": 1.7614, + "step": 22591 + }, + { + "epoch": 6.934315531000614, + "grad_norm": 0.18806515634059906, + "learning_rate": 2.269180683510243e-05, + "loss": 1.7364, + "step": 22592 + }, + { + "epoch": 6.934622467771639, + "grad_norm": 0.23998546600341797, + "learning_rate": 2.268764323961684e-05, + "loss": 1.6858, + "step": 22593 + }, + { + "epoch": 6.934929404542665, + "grad_norm": 0.1707296371459961, + "learning_rate": 2.268347991404797e-05, + "loss": 1.6703, + "step": 22594 + }, + { + "epoch": 6.935236341313689, + "grad_norm": 0.19724871218204498, + "learning_rate": 2.267931685843696e-05, + "loss": 1.7338, + "step": 22595 + }, + { + "epoch": 6.935543278084714, + "grad_norm": 0.20384611189365387, + "learning_rate": 2.2675154072824955e-05, + "loss": 1.7224, + "step": 22596 + }, + { + "epoch": 6.93585021485574, + "grad_norm": 0.18632391095161438, + "learning_rate": 2.2670991557253092e-05, + "loss": 1.7006, + "step": 22597 + }, + { + 
"epoch": 6.936157151626765, + "grad_norm": 0.22928105294704437, + "learning_rate": 2.2666829311762505e-05, + "loss": 1.7462, + "step": 22598 + }, + { + "epoch": 6.93646408839779, + "grad_norm": 0.1905689388513565, + "learning_rate": 2.266266733639434e-05, + "loss": 1.7071, + "step": 22599 + }, + { + "epoch": 6.936771025168815, + "grad_norm": 0.2051437795162201, + "learning_rate": 2.2658505631189708e-05, + "loss": 1.6872, + "step": 22600 + }, + { + "epoch": 6.93707796193984, + "grad_norm": 0.178196981549263, + "learning_rate": 2.265434419618976e-05, + "loss": 1.7044, + "step": 22601 + }, + { + "epoch": 6.9373848987108655, + "grad_norm": 0.21399027109146118, + "learning_rate": 2.26501830314356e-05, + "loss": 1.7529, + "step": 22602 + }, + { + "epoch": 6.937691835481891, + "grad_norm": 0.21747443079948425, + "learning_rate": 2.264602213696837e-05, + "loss": 1.7662, + "step": 22603 + }, + { + "epoch": 6.937998772252916, + "grad_norm": 0.1939898282289505, + "learning_rate": 2.2641861512829177e-05, + "loss": 1.7194, + "step": 22604 + }, + { + "epoch": 6.9383057090239415, + "grad_norm": 0.2183499038219452, + "learning_rate": 2.2637701159059128e-05, + "loss": 1.6659, + "step": 22605 + }, + { + "epoch": 6.938612645794966, + "grad_norm": 0.21971984207630157, + "learning_rate": 2.2633541075699387e-05, + "loss": 1.7729, + "step": 22606 + }, + { + "epoch": 6.938919582565991, + "grad_norm": 0.2611743211746216, + "learning_rate": 2.2629381262790998e-05, + "loss": 1.8, + "step": 22607 + }, + { + "epoch": 6.939226519337017, + "grad_norm": 0.22962158918380737, + "learning_rate": 2.2625221720375144e-05, + "loss": 1.7244, + "step": 22608 + }, + { + "epoch": 6.939533456108042, + "grad_norm": 0.20961032807826996, + "learning_rate": 2.2621062448492858e-05, + "loss": 1.7107, + "step": 22609 + }, + { + "epoch": 6.939840392879067, + "grad_norm": 0.2370155155658722, + "learning_rate": 2.2616903447185293e-05, + "loss": 1.7185, + "step": 22610 + }, + { + "epoch": 6.940147329650092, + 
"grad_norm": 0.19033893942832947, + "learning_rate": 2.2612744716493544e-05, + "loss": 1.7034, + "step": 22611 + }, + { + "epoch": 6.940454266421117, + "grad_norm": 0.22657649219036102, + "learning_rate": 2.2608586256458704e-05, + "loss": 1.6987, + "step": 22612 + }, + { + "epoch": 6.940761203192142, + "grad_norm": 0.17767953872680664, + "learning_rate": 2.2604428067121862e-05, + "loss": 1.6934, + "step": 22613 + }, + { + "epoch": 6.941068139963168, + "grad_norm": 0.209768146276474, + "learning_rate": 2.2600270148524123e-05, + "loss": 1.7148, + "step": 22614 + }, + { + "epoch": 6.941375076734193, + "grad_norm": 0.21234147250652313, + "learning_rate": 2.2596112500706574e-05, + "loss": 1.7147, + "step": 22615 + }, + { + "epoch": 6.941682013505218, + "grad_norm": 0.17608872056007385, + "learning_rate": 2.2591955123710307e-05, + "loss": 1.6873, + "step": 22616 + }, + { + "epoch": 6.941988950276243, + "grad_norm": 0.1743561178445816, + "learning_rate": 2.25877980175764e-05, + "loss": 1.7273, + "step": 22617 + }, + { + "epoch": 6.942295887047268, + "grad_norm": 0.22064091265201569, + "learning_rate": 2.258364118234594e-05, + "loss": 1.7785, + "step": 22618 + }, + { + "epoch": 6.9426028238182935, + "grad_norm": 0.20353585481643677, + "learning_rate": 2.2579484618060005e-05, + "loss": 1.7518, + "step": 22619 + }, + { + "epoch": 6.942909760589319, + "grad_norm": 0.23978710174560547, + "learning_rate": 2.2575328324759676e-05, + "loss": 1.7576, + "step": 22620 + }, + { + "epoch": 6.943216697360343, + "grad_norm": 0.24991966784000397, + "learning_rate": 2.257117230248603e-05, + "loss": 1.7383, + "step": 22621 + }, + { + "epoch": 6.943523634131369, + "grad_norm": 0.20734381675720215, + "learning_rate": 2.256701655128011e-05, + "loss": 1.7063, + "step": 22622 + }, + { + "epoch": 6.943830570902394, + "grad_norm": 0.20097215473651886, + "learning_rate": 2.2562861071183057e-05, + "loss": 1.7647, + "step": 22623 + }, + { + "epoch": 6.944137507673419, + "grad_norm": 
0.20144836604595184, + "learning_rate": 2.2558705862235852e-05, + "loss": 1.7165, + "step": 22624 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.20394138991832733, + "learning_rate": 2.255455092447964e-05, + "loss": 1.7048, + "step": 22625 + }, + { + "epoch": 6.94475138121547, + "grad_norm": 0.21430160105228424, + "learning_rate": 2.2550396257955396e-05, + "loss": 1.7233, + "step": 22626 + }, + { + "epoch": 6.945058317986494, + "grad_norm": 0.19071494042873383, + "learning_rate": 2.254624186270425e-05, + "loss": 1.7407, + "step": 22627 + }, + { + "epoch": 6.94536525475752, + "grad_norm": 0.19658641517162323, + "learning_rate": 2.2542087738767232e-05, + "loss": 1.6371, + "step": 22628 + }, + { + "epoch": 6.945672191528545, + "grad_norm": 0.19009098410606384, + "learning_rate": 2.25379338861854e-05, + "loss": 1.7515, + "step": 22629 + }, + { + "epoch": 6.94597912829957, + "grad_norm": 0.21250933408737183, + "learning_rate": 2.2533780304999796e-05, + "loss": 1.7308, + "step": 22630 + }, + { + "epoch": 6.946286065070596, + "grad_norm": 0.22148491442203522, + "learning_rate": 2.2529626995251475e-05, + "loss": 1.705, + "step": 22631 + }, + { + "epoch": 6.94659300184162, + "grad_norm": 0.190248504281044, + "learning_rate": 2.252547395698148e-05, + "loss": 1.7507, + "step": 22632 + }, + { + "epoch": 6.9468999386126455, + "grad_norm": 0.20005743205547333, + "learning_rate": 2.2521321190230855e-05, + "loss": 1.7622, + "step": 22633 + }, + { + "epoch": 6.947206875383671, + "grad_norm": 0.24233438074588776, + "learning_rate": 2.251716869504064e-05, + "loss": 1.7119, + "step": 22634 + }, + { + "epoch": 6.947513812154696, + "grad_norm": 0.20823299884796143, + "learning_rate": 2.2513016471451874e-05, + "loss": 1.69, + "step": 22635 + }, + { + "epoch": 6.9478207489257215, + "grad_norm": 0.21486341953277588, + "learning_rate": 2.250886451950559e-05, + "loss": 1.6528, + "step": 22636 + }, + { + "epoch": 6.948127685696747, + "grad_norm": 0.22201848030090332, + 
"learning_rate": 2.2504712839242813e-05, + "loss": 1.7454, + "step": 22637 + }, + { + "epoch": 6.948434622467771, + "grad_norm": 0.25179341435432434, + "learning_rate": 2.2500561430704588e-05, + "loss": 1.7226, + "step": 22638 + }, + { + "epoch": 6.948741559238797, + "grad_norm": 0.2510581910610199, + "learning_rate": 2.2496410293931913e-05, + "loss": 1.7048, + "step": 22639 + }, + { + "epoch": 6.949048496009822, + "grad_norm": 0.2406487911939621, + "learning_rate": 2.2492259428965866e-05, + "loss": 1.6751, + "step": 22640 + }, + { + "epoch": 6.949355432780847, + "grad_norm": 0.2555276155471802, + "learning_rate": 2.24881088358474e-05, + "loss": 1.7369, + "step": 22641 + }, + { + "epoch": 6.949662369551873, + "grad_norm": 0.19703364372253418, + "learning_rate": 2.2483958514617597e-05, + "loss": 1.7196, + "step": 22642 + }, + { + "epoch": 6.949969306322897, + "grad_norm": 0.18491938710212708, + "learning_rate": 2.2479808465317414e-05, + "loss": 1.6923, + "step": 22643 + }, + { + "epoch": 6.9502762430939224, + "grad_norm": 0.21588458120822906, + "learning_rate": 2.247565868798791e-05, + "loss": 1.6797, + "step": 22644 + }, + { + "epoch": 6.950583179864948, + "grad_norm": 0.18480601906776428, + "learning_rate": 2.247150918267008e-05, + "loss": 1.6672, + "step": 22645 + }, + { + "epoch": 6.950890116635973, + "grad_norm": 0.261846125125885, + "learning_rate": 2.246735994940493e-05, + "loss": 1.7594, + "step": 22646 + }, + { + "epoch": 6.9511970534069984, + "grad_norm": 0.24510261416435242, + "learning_rate": 2.2463210988233468e-05, + "loss": 1.7712, + "step": 22647 + }, + { + "epoch": 6.951503990178024, + "grad_norm": 0.25896379351615906, + "learning_rate": 2.24590622991967e-05, + "loss": 1.6811, + "step": 22648 + }, + { + "epoch": 6.951810926949048, + "grad_norm": 0.26284709572792053, + "learning_rate": 2.245491388233561e-05, + "loss": 1.7269, + "step": 22649 + }, + { + "epoch": 6.952117863720074, + "grad_norm": 0.1613062471151352, + "learning_rate": 
2.245076573769121e-05, + "loss": 1.6162, + "step": 22650 + }, + { + "epoch": 6.952424800491099, + "grad_norm": 0.203482523560524, + "learning_rate": 2.244661786530449e-05, + "loss": 1.7124, + "step": 22651 + }, + { + "epoch": 6.952731737262124, + "grad_norm": 0.18294258415699005, + "learning_rate": 2.2442470265216446e-05, + "loss": 1.7101, + "step": 22652 + }, + { + "epoch": 6.953038674033149, + "grad_norm": 0.1841319352388382, + "learning_rate": 2.2438322937468058e-05, + "loss": 1.723, + "step": 22653 + }, + { + "epoch": 6.953345610804174, + "grad_norm": 0.1600010097026825, + "learning_rate": 2.2434175882100322e-05, + "loss": 1.6867, + "step": 22654 + }, + { + "epoch": 6.953652547575199, + "grad_norm": 0.16904005408287048, + "learning_rate": 2.243002909915421e-05, + "loss": 1.6993, + "step": 22655 + }, + { + "epoch": 6.953959484346225, + "grad_norm": 0.20069406926631927, + "learning_rate": 2.2425882588670692e-05, + "loss": 1.6995, + "step": 22656 + }, + { + "epoch": 6.95426642111725, + "grad_norm": 0.170061394572258, + "learning_rate": 2.2421736350690808e-05, + "loss": 1.7217, + "step": 22657 + }, + { + "epoch": 6.954573357888275, + "grad_norm": 0.20549608767032623, + "learning_rate": 2.241759038525545e-05, + "loss": 1.7229, + "step": 22658 + }, + { + "epoch": 6.9548802946593, + "grad_norm": 0.20916205644607544, + "learning_rate": 2.241344469240566e-05, + "loss": 1.7499, + "step": 22659 + }, + { + "epoch": 6.955187231430325, + "grad_norm": 0.156641885638237, + "learning_rate": 2.2409299272182348e-05, + "loss": 1.6827, + "step": 22660 + }, + { + "epoch": 6.9554941682013505, + "grad_norm": 0.17876049876213074, + "learning_rate": 2.240515412462653e-05, + "loss": 1.6745, + "step": 22661 + }, + { + "epoch": 6.955801104972376, + "grad_norm": 0.17265759408473969, + "learning_rate": 2.2401009249779153e-05, + "loss": 1.7687, + "step": 22662 + }, + { + "epoch": 6.956108041743401, + "grad_norm": 0.18822525441646576, + "learning_rate": 2.2396864647681175e-05, + "loss": 
1.6974, + "step": 22663 + }, + { + "epoch": 6.956414978514426, + "grad_norm": 0.18686626851558685, + "learning_rate": 2.2392720318373567e-05, + "loss": 1.7522, + "step": 22664 + }, + { + "epoch": 6.956721915285451, + "grad_norm": 0.1668211668729782, + "learning_rate": 2.238857626189727e-05, + "loss": 1.7198, + "step": 22665 + }, + { + "epoch": 6.957028852056476, + "grad_norm": 0.23307017982006073, + "learning_rate": 2.238443247829325e-05, + "loss": 1.7377, + "step": 22666 + }, + { + "epoch": 6.957335788827502, + "grad_norm": 0.1771896481513977, + "learning_rate": 2.2380288967602453e-05, + "loss": 1.7626, + "step": 22667 + }, + { + "epoch": 6.957642725598527, + "grad_norm": 0.185984805226326, + "learning_rate": 2.237614572986583e-05, + "loss": 1.7328, + "step": 22668 + }, + { + "epoch": 6.957949662369552, + "grad_norm": 0.3076271414756775, + "learning_rate": 2.2372002765124327e-05, + "loss": 1.7081, + "step": 22669 + }, + { + "epoch": 6.958256599140577, + "grad_norm": 0.17874667048454285, + "learning_rate": 2.2367860073418885e-05, + "loss": 1.6752, + "step": 22670 + }, + { + "epoch": 6.958563535911602, + "grad_norm": 0.2044304609298706, + "learning_rate": 2.2363717654790445e-05, + "loss": 1.7325, + "step": 22671 + }, + { + "epoch": 6.958870472682627, + "grad_norm": 0.19335824251174927, + "learning_rate": 2.2359575509279945e-05, + "loss": 1.7192, + "step": 22672 + }, + { + "epoch": 6.959177409453653, + "grad_norm": 0.19514116644859314, + "learning_rate": 2.23554336369283e-05, + "loss": 1.7186, + "step": 22673 + }, + { + "epoch": 6.959484346224678, + "grad_norm": 0.2779110372066498, + "learning_rate": 2.23512920377765e-05, + "loss": 1.7391, + "step": 22674 + }, + { + "epoch": 6.9597912829957025, + "grad_norm": 0.17390480637550354, + "learning_rate": 2.2347150711865406e-05, + "loss": 1.6538, + "step": 22675 + }, + { + "epoch": 6.960098219766728, + "grad_norm": 0.1640262007713318, + "learning_rate": 2.234300965923601e-05, + "loss": 1.6534, + "step": 22676 + }, + { + 
"epoch": 6.960405156537753, + "grad_norm": 0.17519034445285797, + "learning_rate": 2.2338868879929165e-05, + "loss": 1.6931, + "step": 22677 + }, + { + "epoch": 6.9607120933087785, + "grad_norm": 0.16885873675346375, + "learning_rate": 2.2334728373985847e-05, + "loss": 1.7204, + "step": 22678 + }, + { + "epoch": 6.961019030079804, + "grad_norm": 0.16997110843658447, + "learning_rate": 2.2330588141446963e-05, + "loss": 1.7063, + "step": 22679 + }, + { + "epoch": 6.961325966850829, + "grad_norm": 0.17793773114681244, + "learning_rate": 2.2326448182353422e-05, + "loss": 1.7382, + "step": 22680 + }, + { + "epoch": 6.961632903621854, + "grad_norm": 0.1809101551771164, + "learning_rate": 2.2322308496746134e-05, + "loss": 1.6874, + "step": 22681 + }, + { + "epoch": 6.961939840392879, + "grad_norm": 0.19095295667648315, + "learning_rate": 2.2318169084666023e-05, + "loss": 1.7122, + "step": 22682 + }, + { + "epoch": 6.962246777163904, + "grad_norm": 0.19206218421459198, + "learning_rate": 2.2314029946153992e-05, + "loss": 1.6733, + "step": 22683 + }, + { + "epoch": 6.96255371393493, + "grad_norm": 0.21243152022361755, + "learning_rate": 2.2309891081250938e-05, + "loss": 1.7026, + "step": 22684 + }, + { + "epoch": 6.962860650705955, + "grad_norm": 0.17602933943271637, + "learning_rate": 2.2305752489997777e-05, + "loss": 1.7073, + "step": 22685 + }, + { + "epoch": 6.963167587476979, + "grad_norm": 0.21810807287693024, + "learning_rate": 2.2301614172435398e-05, + "loss": 1.7323, + "step": 22686 + }, + { + "epoch": 6.963474524248005, + "grad_norm": 0.20711791515350342, + "learning_rate": 2.2297476128604706e-05, + "loss": 1.7228, + "step": 22687 + }, + { + "epoch": 6.96378146101903, + "grad_norm": 0.20376695692539215, + "learning_rate": 2.2293338358546583e-05, + "loss": 1.715, + "step": 22688 + }, + { + "epoch": 6.964088397790055, + "grad_norm": 0.20096196234226227, + "learning_rate": 2.228920086230194e-05, + "loss": 1.7239, + "step": 22689 + }, + { + "epoch": 6.964395334561081, 
+ "grad_norm": 0.24215486645698547, + "learning_rate": 2.228506363991163e-05, + "loss": 1.7879, + "step": 22690 + }, + { + "epoch": 6.964702271332106, + "grad_norm": 0.1917567104101181, + "learning_rate": 2.2280926691416603e-05, + "loss": 1.6903, + "step": 22691 + }, + { + "epoch": 6.9650092081031305, + "grad_norm": 0.19827421009540558, + "learning_rate": 2.2276790016857673e-05, + "loss": 1.7654, + "step": 22692 + }, + { + "epoch": 6.965316144874156, + "grad_norm": 0.20852476358413696, + "learning_rate": 2.2272653616275784e-05, + "loss": 1.7452, + "step": 22693 + }, + { + "epoch": 6.965623081645181, + "grad_norm": 0.21223776042461395, + "learning_rate": 2.2268517489711755e-05, + "loss": 1.6973, + "step": 22694 + }, + { + "epoch": 6.9659300184162065, + "grad_norm": 0.1903543621301651, + "learning_rate": 2.22643816372065e-05, + "loss": 1.7398, + "step": 22695 + }, + { + "epoch": 6.966236955187231, + "grad_norm": 0.21726597845554352, + "learning_rate": 2.2260246058800888e-05, + "loss": 1.7813, + "step": 22696 + }, + { + "epoch": 6.966543891958256, + "grad_norm": 0.1710241734981537, + "learning_rate": 2.225611075453578e-05, + "loss": 1.6647, + "step": 22697 + }, + { + "epoch": 6.966850828729282, + "grad_norm": 0.199532151222229, + "learning_rate": 2.2251975724452045e-05, + "loss": 1.7503, + "step": 22698 + }, + { + "epoch": 6.967157765500307, + "grad_norm": 0.18966728448867798, + "learning_rate": 2.224784096859055e-05, + "loss": 1.8113, + "step": 22699 + }, + { + "epoch": 6.967464702271332, + "grad_norm": 0.1977413445711136, + "learning_rate": 2.2243706486992162e-05, + "loss": 1.7036, + "step": 22700 + }, + { + "epoch": 6.967771639042358, + "grad_norm": 0.1794840395450592, + "learning_rate": 2.223957227969773e-05, + "loss": 1.714, + "step": 22701 + }, + { + "epoch": 6.968078575813382, + "grad_norm": 0.1811632663011551, + "learning_rate": 2.2235438346748117e-05, + "loss": 1.6845, + "step": 22702 + }, + { + "epoch": 6.968385512584407, + "grad_norm": 0.17478540539741516, 
+ "learning_rate": 2.2231304688184172e-05, + "loss": 1.7078, + "step": 22703 + }, + { + "epoch": 6.968692449355433, + "grad_norm": 0.22631226480007172, + "learning_rate": 2.2227171304046756e-05, + "loss": 1.7576, + "step": 22704 + }, + { + "epoch": 6.968999386126458, + "grad_norm": 0.20498304069042206, + "learning_rate": 2.2223038194376712e-05, + "loss": 1.7342, + "step": 22705 + }, + { + "epoch": 6.969306322897483, + "grad_norm": 0.18556833267211914, + "learning_rate": 2.221890535921488e-05, + "loss": 1.6583, + "step": 22706 + }, + { + "epoch": 6.969613259668508, + "grad_norm": 0.19878216087818146, + "learning_rate": 2.221477279860209e-05, + "loss": 1.7536, + "step": 22707 + }, + { + "epoch": 6.969920196439533, + "grad_norm": 0.20304621756076813, + "learning_rate": 2.221064051257924e-05, + "loss": 1.7263, + "step": 22708 + }, + { + "epoch": 6.9702271332105585, + "grad_norm": 0.18725872039794922, + "learning_rate": 2.220650850118709e-05, + "loss": 1.7174, + "step": 22709 + }, + { + "epoch": 6.970534069981584, + "grad_norm": 0.28994759917259216, + "learning_rate": 2.2202376764466554e-05, + "loss": 1.7401, + "step": 22710 + }, + { + "epoch": 6.970841006752609, + "grad_norm": 0.19320951402187347, + "learning_rate": 2.2198245302458383e-05, + "loss": 1.7204, + "step": 22711 + }, + { + "epoch": 6.9711479435236345, + "grad_norm": 0.24737104773521423, + "learning_rate": 2.2194114115203464e-05, + "loss": 1.7418, + "step": 22712 + }, + { + "epoch": 6.971454880294659, + "grad_norm": 0.18811406195163727, + "learning_rate": 2.218998320274261e-05, + "loss": 1.6999, + "step": 22713 + }, + { + "epoch": 6.971761817065684, + "grad_norm": 0.20729362964630127, + "learning_rate": 2.2185852565116638e-05, + "loss": 1.6833, + "step": 22714 + }, + { + "epoch": 6.97206875383671, + "grad_norm": 0.1862284392118454, + "learning_rate": 2.2181722202366378e-05, + "loss": 1.7232, + "step": 22715 + }, + { + "epoch": 6.972375690607735, + "grad_norm": 0.24128347635269165, + "learning_rate": 
2.217759211453264e-05, + "loss": 1.7081, + "step": 22716 + }, + { + "epoch": 6.97268262737876, + "grad_norm": 0.2007059007883072, + "learning_rate": 2.217346230165625e-05, + "loss": 1.7383, + "step": 22717 + }, + { + "epoch": 6.972989564149785, + "grad_norm": 0.2177598625421524, + "learning_rate": 2.216933276377801e-05, + "loss": 1.7494, + "step": 22718 + }, + { + "epoch": 6.97329650092081, + "grad_norm": 0.20965704321861267, + "learning_rate": 2.2165203500938735e-05, + "loss": 1.7326, + "step": 22719 + }, + { + "epoch": 6.973603437691835, + "grad_norm": 0.17255879938602448, + "learning_rate": 2.2161074513179237e-05, + "loss": 1.6713, + "step": 22720 + }, + { + "epoch": 6.973910374462861, + "grad_norm": 0.21480637788772583, + "learning_rate": 2.215694580054032e-05, + "loss": 1.7248, + "step": 22721 + }, + { + "epoch": 6.974217311233886, + "grad_norm": 0.15835267305374146, + "learning_rate": 2.215281736306278e-05, + "loss": 1.7086, + "step": 22722 + }, + { + "epoch": 6.974524248004911, + "grad_norm": 0.20524290204048157, + "learning_rate": 2.2148689200787415e-05, + "loss": 1.7472, + "step": 22723 + }, + { + "epoch": 6.974831184775936, + "grad_norm": 0.16152524948120117, + "learning_rate": 2.214456131375502e-05, + "loss": 1.6373, + "step": 22724 + }, + { + "epoch": 6.975138121546961, + "grad_norm": 0.1995699107646942, + "learning_rate": 2.2140433702006425e-05, + "loss": 1.6949, + "step": 22725 + }, + { + "epoch": 6.975445058317987, + "grad_norm": 0.19927829504013062, + "learning_rate": 2.213630636558236e-05, + "loss": 1.7875, + "step": 22726 + }, + { + "epoch": 6.975751995089012, + "grad_norm": 0.19159351289272308, + "learning_rate": 2.213217930452368e-05, + "loss": 1.7067, + "step": 22727 + }, + { + "epoch": 6.976058931860036, + "grad_norm": 0.21832366287708282, + "learning_rate": 2.2128052518871107e-05, + "loss": 1.6952, + "step": 22728 + }, + { + "epoch": 6.976365868631062, + "grad_norm": 0.2433125376701355, + "learning_rate": 2.212392600866547e-05, + "loss": 
1.7503, + "step": 22729 + }, + { + "epoch": 6.976672805402087, + "grad_norm": 0.25504401326179504, + "learning_rate": 2.2119799773947535e-05, + "loss": 1.7289, + "step": 22730 + }, + { + "epoch": 6.976979742173112, + "grad_norm": 0.20463863015174866, + "learning_rate": 2.211567381475808e-05, + "loss": 1.7442, + "step": 22731 + }, + { + "epoch": 6.977286678944138, + "grad_norm": 0.21862375736236572, + "learning_rate": 2.2111548131137883e-05, + "loss": 1.7266, + "step": 22732 + }, + { + "epoch": 6.977593615715163, + "grad_norm": 0.2124018520116806, + "learning_rate": 2.210742272312771e-05, + "loss": 1.7555, + "step": 22733 + }, + { + "epoch": 6.9779005524861875, + "grad_norm": 0.2911135256290436, + "learning_rate": 2.2103297590768334e-05, + "loss": 1.711, + "step": 22734 + }, + { + "epoch": 6.978207489257213, + "grad_norm": 0.2172393649816513, + "learning_rate": 2.2099172734100525e-05, + "loss": 1.7054, + "step": 22735 + }, + { + "epoch": 6.978514426028238, + "grad_norm": 0.28964513540267944, + "learning_rate": 2.2095048153165043e-05, + "loss": 1.7231, + "step": 22736 + }, + { + "epoch": 6.9788213627992635, + "grad_norm": 0.2557905316352844, + "learning_rate": 2.209092384800265e-05, + "loss": 1.7219, + "step": 22737 + }, + { + "epoch": 6.979128299570289, + "grad_norm": 0.23358628153800964, + "learning_rate": 2.2086799818654102e-05, + "loss": 1.7627, + "step": 22738 + }, + { + "epoch": 6.979435236341313, + "grad_norm": 0.18856312334537506, + "learning_rate": 2.2082676065160163e-05, + "loss": 1.6577, + "step": 22739 + }, + { + "epoch": 6.979742173112339, + "grad_norm": 0.18412479758262634, + "learning_rate": 2.207855258756158e-05, + "loss": 1.6661, + "step": 22740 + }, + { + "epoch": 6.980049109883364, + "grad_norm": 0.20592401921749115, + "learning_rate": 2.207442938589911e-05, + "loss": 1.6737, + "step": 22741 + }, + { + "epoch": 6.980356046654389, + "grad_norm": 0.2015630006790161, + "learning_rate": 2.2070306460213493e-05, + "loss": 1.73, + "step": 22742 + }, + { + 
"epoch": 6.980662983425415, + "grad_norm": 0.23446126282215118, + "learning_rate": 2.2066183810545454e-05, + "loss": 1.7391, + "step": 22743 + }, + { + "epoch": 6.98096992019644, + "grad_norm": 0.1810954511165619, + "learning_rate": 2.2062061436935803e-05, + "loss": 1.689, + "step": 22744 + }, + { + "epoch": 6.981276856967464, + "grad_norm": 0.25031471252441406, + "learning_rate": 2.20579393394252e-05, + "loss": 1.8161, + "step": 22745 + }, + { + "epoch": 6.98158379373849, + "grad_norm": 0.183212012052536, + "learning_rate": 2.2053817518054433e-05, + "loss": 1.6494, + "step": 22746 + }, + { + "epoch": 6.981890730509515, + "grad_norm": 0.2115766555070877, + "learning_rate": 2.204969597286422e-05, + "loss": 1.6912, + "step": 22747 + }, + { + "epoch": 6.98219766728054, + "grad_norm": 0.19966226816177368, + "learning_rate": 2.2045574703895296e-05, + "loss": 1.7002, + "step": 22748 + }, + { + "epoch": 6.982504604051566, + "grad_norm": 0.20601172745227814, + "learning_rate": 2.2041453711188385e-05, + "loss": 1.7839, + "step": 22749 + }, + { + "epoch": 6.98281154082259, + "grad_norm": 0.2174808531999588, + "learning_rate": 2.2037332994784222e-05, + "loss": 1.7169, + "step": 22750 + }, + { + "epoch": 6.9831184775936155, + "grad_norm": 0.1921808421611786, + "learning_rate": 2.2033212554723514e-05, + "loss": 1.6754, + "step": 22751 + }, + { + "epoch": 6.983425414364641, + "grad_norm": 0.1977350264787674, + "learning_rate": 2.2029092391046997e-05, + "loss": 1.7408, + "step": 22752 + }, + { + "epoch": 6.983732351135666, + "grad_norm": 0.18366695940494537, + "learning_rate": 2.2024972503795383e-05, + "loss": 1.6818, + "step": 22753 + }, + { + "epoch": 6.9840392879066915, + "grad_norm": 0.18127809464931488, + "learning_rate": 2.2020852893009387e-05, + "loss": 1.7392, + "step": 22754 + }, + { + "epoch": 6.984346224677717, + "grad_norm": 0.1973503679037094, + "learning_rate": 2.2016733558729718e-05, + "loss": 1.7416, + "step": 22755 + }, + { + "epoch": 6.984653161448741, + 
"grad_norm": 0.1971634328365326, + "learning_rate": 2.2012614500997096e-05, + "loss": 1.7545, + "step": 22756 + }, + { + "epoch": 6.984960098219767, + "grad_norm": 0.17244087159633636, + "learning_rate": 2.2008495719852218e-05, + "loss": 1.7348, + "step": 22757 + }, + { + "epoch": 6.985267034990792, + "grad_norm": 0.19024424254894257, + "learning_rate": 2.200437721533579e-05, + "loss": 1.6647, + "step": 22758 + }, + { + "epoch": 6.985573971761817, + "grad_norm": 0.18455122411251068, + "learning_rate": 2.200025898748852e-05, + "loss": 1.7528, + "step": 22759 + }, + { + "epoch": 6.985880908532843, + "grad_norm": 0.24437187612056732, + "learning_rate": 2.199614103635108e-05, + "loss": 1.7101, + "step": 22760 + }, + { + "epoch": 6.986187845303867, + "grad_norm": 0.18844331800937653, + "learning_rate": 2.1992023361964224e-05, + "loss": 1.6864, + "step": 22761 + }, + { + "epoch": 6.986494782074892, + "grad_norm": 0.18768003582954407, + "learning_rate": 2.1987905964368576e-05, + "loss": 1.6482, + "step": 22762 + }, + { + "epoch": 6.986801718845918, + "grad_norm": 0.19491778314113617, + "learning_rate": 2.1983788843604898e-05, + "loss": 1.7106, + "step": 22763 + }, + { + "epoch": 6.987108655616943, + "grad_norm": 0.23565757274627686, + "learning_rate": 2.1979671999713797e-05, + "loss": 1.7362, + "step": 22764 + }, + { + "epoch": 6.987415592387968, + "grad_norm": 0.2097240835428238, + "learning_rate": 2.1975555432736018e-05, + "loss": 1.7305, + "step": 22765 + }, + { + "epoch": 6.987722529158994, + "grad_norm": 0.2171555608510971, + "learning_rate": 2.197143914271223e-05, + "loss": 1.7213, + "step": 22766 + }, + { + "epoch": 6.988029465930018, + "grad_norm": 0.1993926763534546, + "learning_rate": 2.196732312968311e-05, + "loss": 1.6901, + "step": 22767 + }, + { + "epoch": 6.9883364027010435, + "grad_norm": 0.2345978319644928, + "learning_rate": 2.1963207393689346e-05, + "loss": 1.7456, + "step": 22768 + }, + { + "epoch": 6.988643339472069, + "grad_norm": 
0.20831161737442017, + "learning_rate": 2.1959091934771564e-05, + "loss": 1.764, + "step": 22769 + }, + { + "epoch": 6.988950276243094, + "grad_norm": 0.24944809079170227, + "learning_rate": 2.195497675297049e-05, + "loss": 1.7398, + "step": 22770 + }, + { + "epoch": 6.989257213014119, + "grad_norm": 0.25463199615478516, + "learning_rate": 2.1950861848326777e-05, + "loss": 1.7002, + "step": 22771 + }, + { + "epoch": 6.989564149785144, + "grad_norm": 0.2298898696899414, + "learning_rate": 2.194674722088108e-05, + "loss": 1.755, + "step": 22772 + }, + { + "epoch": 6.989871086556169, + "grad_norm": 0.21839721500873566, + "learning_rate": 2.194263287067408e-05, + "loss": 1.6667, + "step": 22773 + }, + { + "epoch": 6.990178023327195, + "grad_norm": 0.2197437435388565, + "learning_rate": 2.1938518797746417e-05, + "loss": 1.6774, + "step": 22774 + }, + { + "epoch": 6.99048496009822, + "grad_norm": 0.23588024079799652, + "learning_rate": 2.1934405002138763e-05, + "loss": 1.6916, + "step": 22775 + }, + { + "epoch": 6.990791896869245, + "grad_norm": 0.20632316172122955, + "learning_rate": 2.1930291483891767e-05, + "loss": 1.6682, + "step": 22776 + }, + { + "epoch": 6.99109883364027, + "grad_norm": 0.22786293923854828, + "learning_rate": 2.192617824304607e-05, + "loss": 1.7138, + "step": 22777 + }, + { + "epoch": 6.991405770411295, + "grad_norm": 0.3235599994659424, + "learning_rate": 2.1922065279642363e-05, + "loss": 1.7545, + "step": 22778 + }, + { + "epoch": 6.99171270718232, + "grad_norm": 0.1919393390417099, + "learning_rate": 2.191795259372123e-05, + "loss": 1.7422, + "step": 22779 + }, + { + "epoch": 6.992019643953346, + "grad_norm": 0.16472585499286652, + "learning_rate": 2.1913840185323385e-05, + "loss": 1.6824, + "step": 22780 + }, + { + "epoch": 6.992326580724371, + "grad_norm": 0.21422579884529114, + "learning_rate": 2.1909728054489397e-05, + "loss": 1.696, + "step": 22781 + }, + { + "epoch": 6.9926335174953955, + "grad_norm": 0.18965782225131989, + 
"learning_rate": 2.190561620125996e-05, + "loss": 1.7026, + "step": 22782 + }, + { + "epoch": 6.992940454266421, + "grad_norm": 0.184856116771698, + "learning_rate": 2.190150462567569e-05, + "loss": 1.7202, + "step": 22783 + }, + { + "epoch": 6.993247391037446, + "grad_norm": 0.18382076919078827, + "learning_rate": 2.1897393327777223e-05, + "loss": 1.7525, + "step": 22784 + }, + { + "epoch": 6.9935543278084715, + "grad_norm": 0.17239750921726227, + "learning_rate": 2.1893282307605202e-05, + "loss": 1.7297, + "step": 22785 + }, + { + "epoch": 6.993861264579497, + "grad_norm": 0.18522322177886963, + "learning_rate": 2.18891715652002e-05, + "loss": 1.6952, + "step": 22786 + }, + { + "epoch": 6.994168201350522, + "grad_norm": 0.1946135014295578, + "learning_rate": 2.18850611006029e-05, + "loss": 1.6879, + "step": 22787 + }, + { + "epoch": 6.994475138121547, + "grad_norm": 0.2028069645166397, + "learning_rate": 2.188095091385391e-05, + "loss": 1.7412, + "step": 22788 + }, + { + "epoch": 6.994782074892572, + "grad_norm": 0.18794523179531097, + "learning_rate": 2.1876841004993838e-05, + "loss": 1.6936, + "step": 22789 + }, + { + "epoch": 6.995089011663597, + "grad_norm": 0.1912194788455963, + "learning_rate": 2.187273137406331e-05, + "loss": 1.7051, + "step": 22790 + }, + { + "epoch": 6.995395948434623, + "grad_norm": 0.1528688222169876, + "learning_rate": 2.1868622021102934e-05, + "loss": 1.6816, + "step": 22791 + }, + { + "epoch": 6.995702885205648, + "grad_norm": 0.2108357548713684, + "learning_rate": 2.1864512946153325e-05, + "loss": 1.7018, + "step": 22792 + }, + { + "epoch": 6.996009821976672, + "grad_norm": 0.16667310893535614, + "learning_rate": 2.1860404149255092e-05, + "loss": 1.7235, + "step": 22793 + }, + { + "epoch": 6.996316758747698, + "grad_norm": 0.16995872557163239, + "learning_rate": 2.185629563044882e-05, + "loss": 1.7086, + "step": 22794 + }, + { + "epoch": 6.996623695518723, + "grad_norm": 0.1962304711341858, + "learning_rate": 
2.1852187389775165e-05, + "loss": 1.7523, + "step": 22795 + }, + { + "epoch": 6.996930632289748, + "grad_norm": 0.17774102091789246, + "learning_rate": 2.1848079427274655e-05, + "loss": 1.6649, + "step": 22796 + }, + { + "epoch": 6.997237569060774, + "grad_norm": 0.18844567239284515, + "learning_rate": 2.184397174298796e-05, + "loss": 1.7281, + "step": 22797 + }, + { + "epoch": 6.997544505831799, + "grad_norm": 0.15324150025844574, + "learning_rate": 2.1839864336955607e-05, + "loss": 1.6496, + "step": 22798 + }, + { + "epoch": 6.9978514426028235, + "grad_norm": 0.25148099660873413, + "learning_rate": 2.1835757209218233e-05, + "loss": 1.7889, + "step": 22799 + }, + { + "epoch": 6.998158379373849, + "grad_norm": 0.22258763015270233, + "learning_rate": 2.1831650359816414e-05, + "loss": 1.7303, + "step": 22800 + }, + { + "epoch": 6.998465316144874, + "grad_norm": 0.21465472877025604, + "learning_rate": 2.182754378879074e-05, + "loss": 1.733, + "step": 22801 + }, + { + "epoch": 6.9987722529158995, + "grad_norm": 0.1894017904996872, + "learning_rate": 2.182343749618181e-05, + "loss": 1.7104, + "step": 22802 + }, + { + "epoch": 6.999079189686924, + "grad_norm": 0.19616369903087616, + "learning_rate": 2.181933148203014e-05, + "loss": 1.7015, + "step": 22803 + }, + { + "epoch": 6.999386126457949, + "grad_norm": 0.1720295250415802, + "learning_rate": 2.181522574637638e-05, + "loss": 1.6609, + "step": 22804 + }, + { + "epoch": 6.999693063228975, + "grad_norm": 0.2508579194545746, + "learning_rate": 2.1811120289261077e-05, + "loss": 1.7485, + "step": 22805 + }, + { + "epoch": 7.0, + "grad_norm": 0.1701229363679886, + "learning_rate": 2.1807015110724805e-05, + "loss": 1.6822, + "step": 22806 + }, + { + "epoch": 7.000306936771025, + "grad_norm": 0.17413921654224396, + "learning_rate": 2.1802910210808135e-05, + "loss": 1.6944, + "step": 22807 + }, + { + "epoch": 7.000613873542051, + "grad_norm": 0.22573722898960114, + "learning_rate": 2.179880558955163e-05, + "loss": 1.7499, + 
"step": 22808 + }, + { + "epoch": 7.000920810313075, + "grad_norm": 0.2477746456861496, + "learning_rate": 2.1794701246995857e-05, + "loss": 1.7663, + "step": 22809 + }, + { + "epoch": 7.0012277470841005, + "grad_norm": 0.15338411927223206, + "learning_rate": 2.1790597183181384e-05, + "loss": 1.6425, + "step": 22810 + }, + { + "epoch": 7.001534683855126, + "grad_norm": 0.2119540572166443, + "learning_rate": 2.1786493398148738e-05, + "loss": 1.6695, + "step": 22811 + }, + { + "epoch": 7.001841620626151, + "grad_norm": 0.283037930727005, + "learning_rate": 2.178238989193854e-05, + "loss": 1.7479, + "step": 22812 + }, + { + "epoch": 7.0021485573971765, + "grad_norm": 0.2939838767051697, + "learning_rate": 2.1778286664591276e-05, + "loss": 1.733, + "step": 22813 + }, + { + "epoch": 7.002455494168202, + "grad_norm": 0.21681749820709229, + "learning_rate": 2.1774183716147552e-05, + "loss": 1.6804, + "step": 22814 + }, + { + "epoch": 7.002762430939226, + "grad_norm": 0.29066696763038635, + "learning_rate": 2.177008104664785e-05, + "loss": 1.7435, + "step": 22815 + }, + { + "epoch": 7.003069367710252, + "grad_norm": 0.17104873061180115, + "learning_rate": 2.1765978656132773e-05, + "loss": 1.6637, + "step": 22816 + }, + { + "epoch": 7.003376304481277, + "grad_norm": 0.29808685183525085, + "learning_rate": 2.1761876544642846e-05, + "loss": 1.7342, + "step": 22817 + }, + { + "epoch": 7.003683241252302, + "grad_norm": 0.20467214286327362, + "learning_rate": 2.1757774712218603e-05, + "loss": 1.7638, + "step": 22818 + }, + { + "epoch": 7.003990178023328, + "grad_norm": 0.23166583478450775, + "learning_rate": 2.1753673158900607e-05, + "loss": 1.6972, + "step": 22819 + }, + { + "epoch": 7.004297114794352, + "grad_norm": 0.20098255574703217, + "learning_rate": 2.1749571884729332e-05, + "loss": 1.6973, + "step": 22820 + }, + { + "epoch": 7.004604051565377, + "grad_norm": 0.212421715259552, + "learning_rate": 2.1745470889745358e-05, + "loss": 1.7183, + "step": 22821 + }, + { + 
"epoch": 7.004910988336403, + "grad_norm": 0.2496720403432846, + "learning_rate": 2.17413701739892e-05, + "loss": 1.7928, + "step": 22822 + }, + { + "epoch": 7.005217925107428, + "grad_norm": 0.21050602197647095, + "learning_rate": 2.1737269737501394e-05, + "loss": 1.7379, + "step": 22823 + }, + { + "epoch": 7.005524861878453, + "grad_norm": 0.18321558833122253, + "learning_rate": 2.1733169580322448e-05, + "loss": 1.733, + "step": 22824 + }, + { + "epoch": 7.005831798649478, + "grad_norm": 0.19890302419662476, + "learning_rate": 2.1729069702492887e-05, + "loss": 1.6799, + "step": 22825 + }, + { + "epoch": 7.006138735420503, + "grad_norm": 0.19961030781269073, + "learning_rate": 2.172497010405323e-05, + "loss": 1.6754, + "step": 22826 + }, + { + "epoch": 7.0064456721915285, + "grad_norm": 0.19672131538391113, + "learning_rate": 2.1720870785043988e-05, + "loss": 1.7099, + "step": 22827 + }, + { + "epoch": 7.006752608962554, + "grad_norm": 0.16798892617225647, + "learning_rate": 2.1716771745505666e-05, + "loss": 1.7096, + "step": 22828 + }, + { + "epoch": 7.007059545733579, + "grad_norm": 0.2276654690504074, + "learning_rate": 2.1712672985478815e-05, + "loss": 1.7627, + "step": 22829 + }, + { + "epoch": 7.0073664825046045, + "grad_norm": 0.17108316719532013, + "learning_rate": 2.1708574505003872e-05, + "loss": 1.6941, + "step": 22830 + }, + { + "epoch": 7.007673419275629, + "grad_norm": 0.2094760239124298, + "learning_rate": 2.1704476304121413e-05, + "loss": 1.7152, + "step": 22831 + }, + { + "epoch": 7.007980356046654, + "grad_norm": 0.17183393239974976, + "learning_rate": 2.1700378382871872e-05, + "loss": 1.6668, + "step": 22832 + }, + { + "epoch": 7.00828729281768, + "grad_norm": 0.2075900435447693, + "learning_rate": 2.1696280741295795e-05, + "loss": 1.7732, + "step": 22833 + }, + { + "epoch": 7.008594229588705, + "grad_norm": 0.20075511932373047, + "learning_rate": 2.169218337943368e-05, + "loss": 1.7228, + "step": 22834 + }, + { + "epoch": 7.00890116635973, + 
"grad_norm": 0.19461359083652496, + "learning_rate": 2.168808629732596e-05, + "loss": 1.6942, + "step": 22835 + }, + { + "epoch": 7.009208103130755, + "grad_norm": 0.18972480297088623, + "learning_rate": 2.16839894950132e-05, + "loss": 1.7087, + "step": 22836 + }, + { + "epoch": 7.00951503990178, + "grad_norm": 0.19522632658481598, + "learning_rate": 2.167989297253582e-05, + "loss": 1.7427, + "step": 22837 + }, + { + "epoch": 7.009821976672805, + "grad_norm": 0.2088990956544876, + "learning_rate": 2.1675796729934355e-05, + "loss": 1.786, + "step": 22838 + }, + { + "epoch": 7.010128913443831, + "grad_norm": 0.2052021473646164, + "learning_rate": 2.167170076724927e-05, + "loss": 1.765, + "step": 22839 + }, + { + "epoch": 7.010435850214856, + "grad_norm": 0.19566771388053894, + "learning_rate": 2.1667605084521043e-05, + "loss": 1.703, + "step": 22840 + }, + { + "epoch": 7.0107427869858805, + "grad_norm": 0.24589677155017853, + "learning_rate": 2.166350968179014e-05, + "loss": 1.7544, + "step": 22841 + }, + { + "epoch": 7.011049723756906, + "grad_norm": 0.28059569001197815, + "learning_rate": 2.1659414559097053e-05, + "loss": 1.7081, + "step": 22842 + }, + { + "epoch": 7.011356660527931, + "grad_norm": 0.20781446993350983, + "learning_rate": 2.1655319716482237e-05, + "loss": 1.6968, + "step": 22843 + }, + { + "epoch": 7.0116635972989565, + "grad_norm": 0.31703317165374756, + "learning_rate": 2.1651225153986167e-05, + "loss": 1.704, + "step": 22844 + }, + { + "epoch": 7.011970534069982, + "grad_norm": 0.19668029248714447, + "learning_rate": 2.1647130871649283e-05, + "loss": 1.738, + "step": 22845 + }, + { + "epoch": 7.012277470841007, + "grad_norm": 0.3768141567707062, + "learning_rate": 2.1643036869512105e-05, + "loss": 1.7407, + "step": 22846 + }, + { + "epoch": 7.012584407612032, + "grad_norm": 0.22228674590587616, + "learning_rate": 2.1638943147615032e-05, + "loss": 1.7162, + "step": 22847 + }, + { + "epoch": 7.012891344383057, + "grad_norm": 0.26087433099746704, + 
"learning_rate": 2.1634849705998572e-05, + "loss": 1.6916, + "step": 22848 + }, + { + "epoch": 7.013198281154082, + "grad_norm": 0.19660449028015137, + "learning_rate": 2.1630756544703117e-05, + "loss": 1.7024, + "step": 22849 + }, + { + "epoch": 7.013505217925108, + "grad_norm": 0.2287406474351883, + "learning_rate": 2.1626663663769176e-05, + "loss": 1.6761, + "step": 22850 + }, + { + "epoch": 7.013812154696133, + "grad_norm": 0.18974192440509796, + "learning_rate": 2.162257106323719e-05, + "loss": 1.6721, + "step": 22851 + }, + { + "epoch": 7.014119091467157, + "grad_norm": 0.25081944465637207, + "learning_rate": 2.1618478743147558e-05, + "loss": 1.7042, + "step": 22852 + }, + { + "epoch": 7.014426028238183, + "grad_norm": 0.187479630112648, + "learning_rate": 2.1614386703540785e-05, + "loss": 1.7057, + "step": 22853 + }, + { + "epoch": 7.014732965009208, + "grad_norm": 0.24785932898521423, + "learning_rate": 2.1610294944457243e-05, + "loss": 1.8033, + "step": 22854 + }, + { + "epoch": 7.015039901780233, + "grad_norm": 0.21570228040218353, + "learning_rate": 2.160620346593743e-05, + "loss": 1.7129, + "step": 22855 + }, + { + "epoch": 7.015346838551259, + "grad_norm": 0.19304436445236206, + "learning_rate": 2.160211226802175e-05, + "loss": 1.7384, + "step": 22856 + }, + { + "epoch": 7.015653775322283, + "grad_norm": 0.18901783227920532, + "learning_rate": 2.1598021350750648e-05, + "loss": 1.6851, + "step": 22857 + }, + { + "epoch": 7.0159607120933085, + "grad_norm": 0.21754276752471924, + "learning_rate": 2.159393071416454e-05, + "loss": 1.7242, + "step": 22858 + }, + { + "epoch": 7.016267648864334, + "grad_norm": 0.18334844708442688, + "learning_rate": 2.1589840358303858e-05, + "loss": 1.66, + "step": 22859 + }, + { + "epoch": 7.016574585635359, + "grad_norm": 0.17688371241092682, + "learning_rate": 2.1585750283209026e-05, + "loss": 1.6693, + "step": 22860 + }, + { + "epoch": 7.0168815224063845, + "grad_norm": 0.17173215746879578, + "learning_rate": 
2.158166048892047e-05, + "loss": 1.675, + "step": 22861 + }, + { + "epoch": 7.01718845917741, + "grad_norm": 0.2144075632095337, + "learning_rate": 2.157757097547857e-05, + "loss": 1.7843, + "step": 22862 + }, + { + "epoch": 7.017495395948434, + "grad_norm": 0.18811818957328796, + "learning_rate": 2.1573481742923824e-05, + "loss": 1.6932, + "step": 22863 + }, + { + "epoch": 7.01780233271946, + "grad_norm": 0.19978533685207367, + "learning_rate": 2.1569392791296548e-05, + "loss": 1.7426, + "step": 22864 + }, + { + "epoch": 7.018109269490485, + "grad_norm": 0.19639068841934204, + "learning_rate": 2.1565304120637237e-05, + "loss": 1.7479, + "step": 22865 + }, + { + "epoch": 7.01841620626151, + "grad_norm": 0.2269967794418335, + "learning_rate": 2.1561215730986212e-05, + "loss": 1.7507, + "step": 22866 + }, + { + "epoch": 7.018723143032536, + "grad_norm": 0.19511014223098755, + "learning_rate": 2.1557127622383948e-05, + "loss": 1.7317, + "step": 22867 + }, + { + "epoch": 7.01903007980356, + "grad_norm": 0.23975026607513428, + "learning_rate": 2.1553039794870834e-05, + "loss": 1.7901, + "step": 22868 + }, + { + "epoch": 7.019337016574585, + "grad_norm": 0.20757955312728882, + "learning_rate": 2.154895224848722e-05, + "loss": 1.7823, + "step": 22869 + }, + { + "epoch": 7.019643953345611, + "grad_norm": 0.1893112063407898, + "learning_rate": 2.154486498327357e-05, + "loss": 1.6939, + "step": 22870 + }, + { + "epoch": 7.019950890116636, + "grad_norm": 0.23006685078144073, + "learning_rate": 2.1540777999270205e-05, + "loss": 1.8061, + "step": 22871 + }, + { + "epoch": 7.020257826887661, + "grad_norm": 0.25516194105148315, + "learning_rate": 2.1536691296517573e-05, + "loss": 1.6801, + "step": 22872 + }, + { + "epoch": 7.020564763658686, + "grad_norm": 0.2138557732105255, + "learning_rate": 2.153260487505604e-05, + "loss": 1.7689, + "step": 22873 + }, + { + "epoch": 7.020871700429711, + "grad_norm": 0.2618521749973297, + "learning_rate": 2.152851873492599e-05, + "loss": 
1.712, + "step": 22874 + }, + { + "epoch": 7.0211786372007365, + "grad_norm": 0.19639171659946442, + "learning_rate": 2.1524432876167812e-05, + "loss": 1.6883, + "step": 22875 + }, + { + "epoch": 7.021485573971762, + "grad_norm": 0.20283572375774384, + "learning_rate": 2.152034729882187e-05, + "loss": 1.7259, + "step": 22876 + }, + { + "epoch": 7.021792510742787, + "grad_norm": 0.247970849275589, + "learning_rate": 2.151626200292855e-05, + "loss": 1.6714, + "step": 22877 + }, + { + "epoch": 7.0220994475138125, + "grad_norm": 0.20877771079540253, + "learning_rate": 2.1512176988528227e-05, + "loss": 1.7378, + "step": 22878 + }, + { + "epoch": 7.022406384284837, + "grad_norm": 0.2515791356563568, + "learning_rate": 2.1508092255661245e-05, + "loss": 1.743, + "step": 22879 + }, + { + "epoch": 7.022713321055862, + "grad_norm": 0.21451319754123688, + "learning_rate": 2.150400780436804e-05, + "loss": 1.7102, + "step": 22880 + }, + { + "epoch": 7.023020257826888, + "grad_norm": 0.23944756388664246, + "learning_rate": 2.1499923634688886e-05, + "loss": 1.7739, + "step": 22881 + }, + { + "epoch": 7.023327194597913, + "grad_norm": 0.22423309087753296, + "learning_rate": 2.149583974666423e-05, + "loss": 1.7598, + "step": 22882 + }, + { + "epoch": 7.023634131368938, + "grad_norm": 0.31337371468544006, + "learning_rate": 2.1491756140334358e-05, + "loss": 1.7417, + "step": 22883 + }, + { + "epoch": 7.023941068139963, + "grad_norm": 0.22430868446826935, + "learning_rate": 2.148767281573968e-05, + "loss": 1.712, + "step": 22884 + }, + { + "epoch": 7.024248004910988, + "grad_norm": 0.26083487272262573, + "learning_rate": 2.148358977292054e-05, + "loss": 1.6816, + "step": 22885 + }, + { + "epoch": 7.024554941682013, + "grad_norm": 0.2283557504415512, + "learning_rate": 2.1479507011917255e-05, + "loss": 1.7539, + "step": 22886 + }, + { + "epoch": 7.024861878453039, + "grad_norm": 0.22732317447662354, + "learning_rate": 2.1475424532770232e-05, + "loss": 1.697, + "step": 22887 + }, + { + 
"epoch": 7.025168815224064, + "grad_norm": 0.19614318013191223, + "learning_rate": 2.1471342335519746e-05, + "loss": 1.7267, + "step": 22888 + }, + { + "epoch": 7.0254757519950894, + "grad_norm": 0.23076513409614563, + "learning_rate": 2.1467260420206192e-05, + "loss": 1.7749, + "step": 22889 + }, + { + "epoch": 7.025782688766114, + "grad_norm": 0.1969364732503891, + "learning_rate": 2.1463178786869892e-05, + "loss": 1.6975, + "step": 22890 + }, + { + "epoch": 7.026089625537139, + "grad_norm": 0.2126578837633133, + "learning_rate": 2.145909743555119e-05, + "loss": 1.6815, + "step": 22891 + }, + { + "epoch": 7.026396562308165, + "grad_norm": 0.20841559767723083, + "learning_rate": 2.1455016366290414e-05, + "loss": 1.727, + "step": 22892 + }, + { + "epoch": 7.02670349907919, + "grad_norm": 0.2523893713951111, + "learning_rate": 2.1450935579127896e-05, + "loss": 1.7213, + "step": 22893 + }, + { + "epoch": 7.027010435850215, + "grad_norm": 0.16219666600227356, + "learning_rate": 2.1446855074103968e-05, + "loss": 1.6406, + "step": 22894 + }, + { + "epoch": 7.02731737262124, + "grad_norm": 0.28709226846694946, + "learning_rate": 2.144277485125895e-05, + "loss": 1.7021, + "step": 22895 + }, + { + "epoch": 7.027624309392265, + "grad_norm": 0.23238243162631989, + "learning_rate": 2.1438694910633174e-05, + "loss": 1.7347, + "step": 22896 + }, + { + "epoch": 7.02793124616329, + "grad_norm": 0.2692428231239319, + "learning_rate": 2.1434615252266948e-05, + "loss": 1.7192, + "step": 22897 + }, + { + "epoch": 7.028238182934316, + "grad_norm": 0.21163232624530792, + "learning_rate": 2.1430535876200584e-05, + "loss": 1.7437, + "step": 22898 + }, + { + "epoch": 7.028545119705341, + "grad_norm": 0.23896420001983643, + "learning_rate": 2.1426456782474446e-05, + "loss": 1.6773, + "step": 22899 + }, + { + "epoch": 7.0288520564763655, + "grad_norm": 0.19021281599998474, + "learning_rate": 2.142237797112877e-05, + "loss": 1.7084, + "step": 22900 + }, + { + "epoch": 7.029158993247391, + 
"grad_norm": 0.23483091592788696, + "learning_rate": 2.1418299442203926e-05, + "loss": 1.7678, + "step": 22901 + }, + { + "epoch": 7.029465930018416, + "grad_norm": 0.20831161737442017, + "learning_rate": 2.1414221195740213e-05, + "loss": 1.7454, + "step": 22902 + }, + { + "epoch": 7.0297728667894415, + "grad_norm": 0.1961016207933426, + "learning_rate": 2.141014323177789e-05, + "loss": 1.7231, + "step": 22903 + }, + { + "epoch": 7.030079803560467, + "grad_norm": 0.1877545267343521, + "learning_rate": 2.1406065550357322e-05, + "loss": 1.6925, + "step": 22904 + }, + { + "epoch": 7.030386740331492, + "grad_norm": 0.20815789699554443, + "learning_rate": 2.1401988151518738e-05, + "loss": 1.7762, + "step": 22905 + }, + { + "epoch": 7.030693677102517, + "grad_norm": 0.1902543157339096, + "learning_rate": 2.1397911035302487e-05, + "loss": 1.7663, + "step": 22906 + }, + { + "epoch": 7.031000613873542, + "grad_norm": 0.20552431046962738, + "learning_rate": 2.1393834201748846e-05, + "loss": 1.7048, + "step": 22907 + }, + { + "epoch": 7.031307550644567, + "grad_norm": 0.2380477488040924, + "learning_rate": 2.13897576508981e-05, + "loss": 1.7685, + "step": 22908 + }, + { + "epoch": 7.031614487415593, + "grad_norm": 0.18351083993911743, + "learning_rate": 2.1385681382790536e-05, + "loss": 1.7058, + "step": 22909 + }, + { + "epoch": 7.031921424186618, + "grad_norm": 0.21992792189121246, + "learning_rate": 2.1381605397466442e-05, + "loss": 1.7608, + "step": 22910 + }, + { + "epoch": 7.032228360957642, + "grad_norm": 0.24412932991981506, + "learning_rate": 2.1377529694966097e-05, + "loss": 1.7205, + "step": 22911 + }, + { + "epoch": 7.032535297728668, + "grad_norm": 0.20398534834384918, + "learning_rate": 2.137345427532978e-05, + "loss": 1.7318, + "step": 22912 + }, + { + "epoch": 7.032842234499693, + "grad_norm": 0.2346884161233902, + "learning_rate": 2.136937913859776e-05, + "loss": 1.7159, + "step": 22913 + }, + { + "epoch": 7.033149171270718, + "grad_norm": 
0.19422392547130585, + "learning_rate": 2.1365304284810327e-05, + "loss": 1.7229, + "step": 22914 + }, + { + "epoch": 7.033456108041744, + "grad_norm": 0.24088126420974731, + "learning_rate": 2.1361229714007714e-05, + "loss": 1.77, + "step": 22915 + }, + { + "epoch": 7.033763044812768, + "grad_norm": 0.18886598944664001, + "learning_rate": 2.135715542623026e-05, + "loss": 1.7724, + "step": 22916 + }, + { + "epoch": 7.0340699815837935, + "grad_norm": 0.18816733360290527, + "learning_rate": 2.135308142151814e-05, + "loss": 1.7174, + "step": 22917 + }, + { + "epoch": 7.034376918354819, + "grad_norm": 0.184849813580513, + "learning_rate": 2.1349007699911694e-05, + "loss": 1.7026, + "step": 22918 + }, + { + "epoch": 7.034683855125844, + "grad_norm": 0.1638055443763733, + "learning_rate": 2.134493426145113e-05, + "loss": 1.683, + "step": 22919 + }, + { + "epoch": 7.0349907918968695, + "grad_norm": 0.18030275404453278, + "learning_rate": 2.1340861106176713e-05, + "loss": 1.6963, + "step": 22920 + }, + { + "epoch": 7.035297728667895, + "grad_norm": 0.221226304769516, + "learning_rate": 2.133678823412873e-05, + "loss": 1.7851, + "step": 22921 + }, + { + "epoch": 7.035604665438919, + "grad_norm": 0.18877451121807098, + "learning_rate": 2.1332715645347373e-05, + "loss": 1.7111, + "step": 22922 + }, + { + "epoch": 7.035911602209945, + "grad_norm": 0.17179232835769653, + "learning_rate": 2.1328643339872938e-05, + "loss": 1.6737, + "step": 22923 + }, + { + "epoch": 7.03621853898097, + "grad_norm": 0.17912441492080688, + "learning_rate": 2.1324571317745657e-05, + "loss": 1.7798, + "step": 22924 + }, + { + "epoch": 7.036525475751995, + "grad_norm": 0.2120780050754547, + "learning_rate": 2.132049957900577e-05, + "loss": 1.7353, + "step": 22925 + }, + { + "epoch": 7.036832412523021, + "grad_norm": 0.17286419868469238, + "learning_rate": 2.1316428123693517e-05, + "loss": 1.667, + "step": 22926 + }, + { + "epoch": 7.037139349294045, + "grad_norm": 0.1824301779270172, + 
"learning_rate": 2.1312356951849126e-05, + "loss": 1.6925, + "step": 22927 + }, + { + "epoch": 7.03744628606507, + "grad_norm": 0.16392327845096588, + "learning_rate": 2.1308286063512843e-05, + "loss": 1.7145, + "step": 22928 + }, + { + "epoch": 7.037753222836096, + "grad_norm": 0.18268297612667084, + "learning_rate": 2.1304215458724895e-05, + "loss": 1.7251, + "step": 22929 + }, + { + "epoch": 7.038060159607121, + "grad_norm": 0.19878868758678436, + "learning_rate": 2.1300145137525505e-05, + "loss": 1.7192, + "step": 22930 + }, + { + "epoch": 7.038367096378146, + "grad_norm": 0.18570293486118317, + "learning_rate": 2.1296075099954908e-05, + "loss": 1.718, + "step": 22931 + }, + { + "epoch": 7.038674033149171, + "grad_norm": 0.16497015953063965, + "learning_rate": 2.12920053460533e-05, + "loss": 1.6914, + "step": 22932 + }, + { + "epoch": 7.038980969920196, + "grad_norm": 0.20224586129188538, + "learning_rate": 2.128793587586096e-05, + "loss": 1.6941, + "step": 22933 + }, + { + "epoch": 7.0392879066912215, + "grad_norm": 0.22124920785427094, + "learning_rate": 2.1283866689418024e-05, + "loss": 1.7921, + "step": 22934 + }, + { + "epoch": 7.039594843462247, + "grad_norm": 0.20548123121261597, + "learning_rate": 2.127979778676479e-05, + "loss": 1.7488, + "step": 22935 + }, + { + "epoch": 7.039901780233272, + "grad_norm": 0.17604656517505646, + "learning_rate": 2.1275729167941405e-05, + "loss": 1.7145, + "step": 22936 + }, + { + "epoch": 7.0402087170042975, + "grad_norm": 0.17899781465530396, + "learning_rate": 2.127166083298809e-05, + "loss": 1.6703, + "step": 22937 + }, + { + "epoch": 7.040515653775322, + "grad_norm": 0.16101998090744019, + "learning_rate": 2.126759278194509e-05, + "loss": 1.715, + "step": 22938 + }, + { + "epoch": 7.040822590546347, + "grad_norm": 0.22807051241397858, + "learning_rate": 2.1263525014852542e-05, + "loss": 1.7409, + "step": 22939 + }, + { + "epoch": 7.041129527317373, + "grad_norm": 0.19442932307720184, + "learning_rate": 
2.125945753175072e-05, + "loss": 1.6953, + "step": 22940 + }, + { + "epoch": 7.041436464088398, + "grad_norm": 0.24816946685314178, + "learning_rate": 2.1255390332679755e-05, + "loss": 1.7527, + "step": 22941 + }, + { + "epoch": 7.041743400859423, + "grad_norm": 0.26748740673065186, + "learning_rate": 2.1251323417679882e-05, + "loss": 1.7703, + "step": 22942 + }, + { + "epoch": 7.042050337630448, + "grad_norm": 0.19965825974941254, + "learning_rate": 2.124725678679128e-05, + "loss": 1.7303, + "step": 22943 + }, + { + "epoch": 7.042357274401473, + "grad_norm": 0.2442217618227005, + "learning_rate": 2.124319044005414e-05, + "loss": 1.7183, + "step": 22944 + }, + { + "epoch": 7.042664211172498, + "grad_norm": 0.21421664953231812, + "learning_rate": 2.1239124377508646e-05, + "loss": 1.7348, + "step": 22945 + }, + { + "epoch": 7.042971147943524, + "grad_norm": 0.26072144508361816, + "learning_rate": 2.1235058599194984e-05, + "loss": 1.7396, + "step": 22946 + }, + { + "epoch": 7.043278084714549, + "grad_norm": 0.20694412291049957, + "learning_rate": 2.1230993105153335e-05, + "loss": 1.7871, + "step": 22947 + }, + { + "epoch": 7.043585021485574, + "grad_norm": 0.298551082611084, + "learning_rate": 2.122692789542387e-05, + "loss": 1.7051, + "step": 22948 + }, + { + "epoch": 7.043891958256599, + "grad_norm": 0.22547855973243713, + "learning_rate": 2.1222862970046752e-05, + "loss": 1.7392, + "step": 22949 + }, + { + "epoch": 7.044198895027624, + "grad_norm": 0.3150571882724762, + "learning_rate": 2.1218798329062205e-05, + "loss": 1.6705, + "step": 22950 + }, + { + "epoch": 7.0445058317986495, + "grad_norm": 0.2025378942489624, + "learning_rate": 2.1214733972510327e-05, + "loss": 1.7114, + "step": 22951 + }, + { + "epoch": 7.044812768569675, + "grad_norm": 0.29046711325645447, + "learning_rate": 2.1210669900431353e-05, + "loss": 1.7745, + "step": 22952 + }, + { + "epoch": 7.0451197053407, + "grad_norm": 0.23395368456840515, + "learning_rate": 2.1206606112865396e-05, + "loss": 
1.7829, + "step": 22953 + }, + { + "epoch": 7.045426642111725, + "grad_norm": 0.21395133435726166, + "learning_rate": 2.1202542609852616e-05, + "loss": 1.7211, + "step": 22954 + }, + { + "epoch": 7.04573357888275, + "grad_norm": 0.18077452480793, + "learning_rate": 2.1198479391433223e-05, + "loss": 1.7584, + "step": 22955 + }, + { + "epoch": 7.046040515653775, + "grad_norm": 0.17318682372570038, + "learning_rate": 2.1194416457647302e-05, + "loss": 1.7525, + "step": 22956 + }, + { + "epoch": 7.046347452424801, + "grad_norm": 0.18798092007637024, + "learning_rate": 2.119035380853508e-05, + "loss": 1.7525, + "step": 22957 + }, + { + "epoch": 7.046654389195826, + "grad_norm": 0.18679840862751007, + "learning_rate": 2.118629144413663e-05, + "loss": 1.7729, + "step": 22958 + }, + { + "epoch": 7.04696132596685, + "grad_norm": 0.17846907675266266, + "learning_rate": 2.1182229364492156e-05, + "loss": 1.7354, + "step": 22959 + }, + { + "epoch": 7.047268262737876, + "grad_norm": 0.22771520912647247, + "learning_rate": 2.1178167569641783e-05, + "loss": 1.7086, + "step": 22960 + }, + { + "epoch": 7.047575199508901, + "grad_norm": 0.1541738212108612, + "learning_rate": 2.1174106059625642e-05, + "loss": 1.67, + "step": 22961 + }, + { + "epoch": 7.047882136279926, + "grad_norm": 0.17698390781879425, + "learning_rate": 2.117004483448389e-05, + "loss": 1.68, + "step": 22962 + }, + { + "epoch": 7.048189073050952, + "grad_norm": 0.2220597118139267, + "learning_rate": 2.1165983894256647e-05, + "loss": 1.7783, + "step": 22963 + }, + { + "epoch": 7.048496009821977, + "grad_norm": 0.20971544086933136, + "learning_rate": 2.1161923238984055e-05, + "loss": 1.7318, + "step": 22964 + }, + { + "epoch": 7.0488029465930016, + "grad_norm": 0.2032100409269333, + "learning_rate": 2.1157862868706242e-05, + "loss": 1.6736, + "step": 22965 + }, + { + "epoch": 7.049109883364027, + "grad_norm": 0.19177256524562836, + "learning_rate": 2.115380278346331e-05, + "loss": 1.74, + "step": 22966 + }, + { + 
"epoch": 7.049416820135052, + "grad_norm": 0.1956746131181717, + "learning_rate": 2.1149742983295446e-05, + "loss": 1.7251, + "step": 22967 + }, + { + "epoch": 7.0497237569060776, + "grad_norm": 0.16200929880142212, + "learning_rate": 2.114568346824269e-05, + "loss": 1.6735, + "step": 22968 + }, + { + "epoch": 7.050030693677103, + "grad_norm": 0.19551095366477966, + "learning_rate": 2.1141624238345242e-05, + "loss": 1.7185, + "step": 22969 + }, + { + "epoch": 7.050337630448127, + "grad_norm": 0.17967839539051056, + "learning_rate": 2.1137565293643158e-05, + "loss": 1.7262, + "step": 22970 + }, + { + "epoch": 7.050644567219153, + "grad_norm": 0.15093082189559937, + "learning_rate": 2.1133506634176552e-05, + "loss": 1.6695, + "step": 22971 + }, + { + "epoch": 7.050951503990178, + "grad_norm": 0.20207351446151733, + "learning_rate": 2.1129448259985595e-05, + "loss": 1.7448, + "step": 22972 + }, + { + "epoch": 7.051258440761203, + "grad_norm": 0.20243801176548004, + "learning_rate": 2.112539017111031e-05, + "loss": 1.7496, + "step": 22973 + }, + { + "epoch": 7.051565377532229, + "grad_norm": 0.1967451572418213, + "learning_rate": 2.112133236759088e-05, + "loss": 1.718, + "step": 22974 + }, + { + "epoch": 7.051872314303253, + "grad_norm": 0.17668583989143372, + "learning_rate": 2.1117274849467334e-05, + "loss": 1.7295, + "step": 22975 + }, + { + "epoch": 7.0521792510742785, + "grad_norm": 0.17461778223514557, + "learning_rate": 2.1113217616779824e-05, + "loss": 1.7166, + "step": 22976 + }, + { + "epoch": 7.052486187845304, + "grad_norm": 0.18184112012386322, + "learning_rate": 2.110916066956843e-05, + "loss": 1.7092, + "step": 22977 + }, + { + "epoch": 7.052793124616329, + "grad_norm": 0.18001540005207062, + "learning_rate": 2.1105104007873246e-05, + "loss": 1.7129, + "step": 22978 + }, + { + "epoch": 7.0531000613873545, + "grad_norm": 0.15966519713401794, + "learning_rate": 2.1101047631734355e-05, + "loss": 1.7121, + "step": 22979 + }, + { + "epoch": 7.05340699815838, 
+ "grad_norm": 0.20201170444488525, + "learning_rate": 2.109699154119185e-05, + "loss": 1.7266, + "step": 22980 + }, + { + "epoch": 7.053713934929404, + "grad_norm": 0.19559438526630402, + "learning_rate": 2.1092935736285817e-05, + "loss": 1.7492, + "step": 22981 + }, + { + "epoch": 7.05402087170043, + "grad_norm": 0.17783302068710327, + "learning_rate": 2.108888021705634e-05, + "loss": 1.6901, + "step": 22982 + }, + { + "epoch": 7.054327808471455, + "grad_norm": 0.22052957117557526, + "learning_rate": 2.108482498354347e-05, + "loss": 1.6771, + "step": 22983 + }, + { + "epoch": 7.05463474524248, + "grad_norm": 0.1899181455373764, + "learning_rate": 2.1080770035787346e-05, + "loss": 1.7011, + "step": 22984 + }, + { + "epoch": 7.054941682013506, + "grad_norm": 0.19773316383361816, + "learning_rate": 2.1076715373827964e-05, + "loss": 1.7535, + "step": 22985 + }, + { + "epoch": 7.05524861878453, + "grad_norm": 0.2244229018688202, + "learning_rate": 2.1072660997705475e-05, + "loss": 1.7938, + "step": 22986 + }, + { + "epoch": 7.055555555555555, + "grad_norm": 0.18881015479564667, + "learning_rate": 2.106860690745988e-05, + "loss": 1.6753, + "step": 22987 + }, + { + "epoch": 7.055862492326581, + "grad_norm": 0.19642052054405212, + "learning_rate": 2.106455310313126e-05, + "loss": 1.735, + "step": 22988 + }, + { + "epoch": 7.056169429097606, + "grad_norm": 0.23549412190914154, + "learning_rate": 2.106049958475971e-05, + "loss": 1.7705, + "step": 22989 + }, + { + "epoch": 7.056476365868631, + "grad_norm": 0.21001911163330078, + "learning_rate": 2.1056446352385235e-05, + "loss": 1.6802, + "step": 22990 + }, + { + "epoch": 7.056783302639656, + "grad_norm": 0.1821003556251526, + "learning_rate": 2.1052393406047953e-05, + "loss": 1.7144, + "step": 22991 + }, + { + "epoch": 7.057090239410681, + "grad_norm": 0.1979309767484665, + "learning_rate": 2.104834074578786e-05, + "loss": 1.6983, + "step": 22992 + }, + { + "epoch": 7.0573971761817065, + "grad_norm": 0.18264134228229523, + 
"learning_rate": 2.1044288371645045e-05, + "loss": 1.7001, + "step": 22993 + }, + { + "epoch": 7.057704112952732, + "grad_norm": 0.17276059091091156, + "learning_rate": 2.104023628365954e-05, + "loss": 1.6976, + "step": 22994 + }, + { + "epoch": 7.058011049723757, + "grad_norm": 0.18879400193691254, + "learning_rate": 2.1036184481871402e-05, + "loss": 1.6954, + "step": 22995 + }, + { + "epoch": 7.0583179864947825, + "grad_norm": 0.1956210434436798, + "learning_rate": 2.103213296632066e-05, + "loss": 1.7329, + "step": 22996 + }, + { + "epoch": 7.058624923265807, + "grad_norm": 0.21108154952526093, + "learning_rate": 2.1028081737047356e-05, + "loss": 1.7299, + "step": 22997 + }, + { + "epoch": 7.058931860036832, + "grad_norm": 0.17981186509132385, + "learning_rate": 2.1024030794091537e-05, + "loss": 1.7162, + "step": 22998 + }, + { + "epoch": 7.059238796807858, + "grad_norm": 0.1699269711971283, + "learning_rate": 2.101998013749322e-05, + "loss": 1.6842, + "step": 22999 + }, + { + "epoch": 7.059545733578883, + "grad_norm": 0.17033198475837708, + "learning_rate": 2.1015929767292435e-05, + "loss": 1.6735, + "step": 23000 + }, + { + "epoch": 7.059852670349908, + "grad_norm": 0.18620076775550842, + "learning_rate": 2.101187968352925e-05, + "loss": 1.7328, + "step": 23001 + }, + { + "epoch": 7.060159607120933, + "grad_norm": 0.17528964579105377, + "learning_rate": 2.100782988624363e-05, + "loss": 1.6567, + "step": 23002 + }, + { + "epoch": 7.060466543891958, + "grad_norm": 0.1946999728679657, + "learning_rate": 2.100378037547566e-05, + "loss": 1.7349, + "step": 23003 + }, + { + "epoch": 7.060773480662983, + "grad_norm": 0.23345647752285004, + "learning_rate": 2.0999731151265312e-05, + "loss": 1.7185, + "step": 23004 + }, + { + "epoch": 7.061080417434009, + "grad_norm": 0.20169813930988312, + "learning_rate": 2.0995682213652603e-05, + "loss": 1.7223, + "step": 23005 + }, + { + "epoch": 7.061387354205034, + "grad_norm": 0.2397730052471161, + "learning_rate": 
2.0991633562677594e-05, + "loss": 1.7542, + "step": 23006 + }, + { + "epoch": 7.0616942909760585, + "grad_norm": 0.20421954989433289, + "learning_rate": 2.0987585198380227e-05, + "loss": 1.6888, + "step": 23007 + }, + { + "epoch": 7.062001227747084, + "grad_norm": 0.21555101871490479, + "learning_rate": 2.0983537120800584e-05, + "loss": 1.6796, + "step": 23008 + }, + { + "epoch": 7.062308164518109, + "grad_norm": 0.17311134934425354, + "learning_rate": 2.0979489329978603e-05, + "loss": 1.7199, + "step": 23009 + }, + { + "epoch": 7.0626151012891345, + "grad_norm": 0.25064393877983093, + "learning_rate": 2.0975441825954334e-05, + "loss": 1.6947, + "step": 23010 + }, + { + "epoch": 7.06292203806016, + "grad_norm": 0.19135847687721252, + "learning_rate": 2.0971394608767757e-05, + "loss": 1.702, + "step": 23011 + }, + { + "epoch": 7.063228974831185, + "grad_norm": 0.22994364798069, + "learning_rate": 2.0967347678458876e-05, + "loss": 1.6814, + "step": 23012 + }, + { + "epoch": 7.06353591160221, + "grad_norm": 0.21897611021995544, + "learning_rate": 2.0963301035067685e-05, + "loss": 1.7063, + "step": 23013 + }, + { + "epoch": 7.063842848373235, + "grad_norm": 0.23615150153636932, + "learning_rate": 2.0959254678634166e-05, + "loss": 1.7299, + "step": 23014 + }, + { + "epoch": 7.06414978514426, + "grad_norm": 0.1837770640850067, + "learning_rate": 2.0955208609198314e-05, + "loss": 1.7236, + "step": 23015 + }, + { + "epoch": 7.064456721915286, + "grad_norm": 0.16823385655879974, + "learning_rate": 2.0951162826800118e-05, + "loss": 1.6687, + "step": 23016 + }, + { + "epoch": 7.064763658686311, + "grad_norm": 0.17042338848114014, + "learning_rate": 2.094711733147954e-05, + "loss": 1.6907, + "step": 23017 + }, + { + "epoch": 7.065070595457335, + "grad_norm": 0.1753006875514984, + "learning_rate": 2.094307212327661e-05, + "loss": 1.7313, + "step": 23018 + }, + { + "epoch": 7.065377532228361, + "grad_norm": 0.19618375599384308, + "learning_rate": 2.093902720223123e-05, + "loss": 
1.7147, + "step": 23019 + }, + { + "epoch": 7.065684468999386, + "grad_norm": 0.20214296877384186, + "learning_rate": 2.093498256838346e-05, + "loss": 1.7056, + "step": 23020 + }, + { + "epoch": 7.065991405770411, + "grad_norm": 0.20230883359909058, + "learning_rate": 2.093093822177321e-05, + "loss": 1.6628, + "step": 23021 + }, + { + "epoch": 7.066298342541437, + "grad_norm": 0.19913128018379211, + "learning_rate": 2.0926894162440446e-05, + "loss": 1.7286, + "step": 23022 + }, + { + "epoch": 7.066605279312462, + "grad_norm": 0.19535091519355774, + "learning_rate": 2.0922850390425193e-05, + "loss": 1.745, + "step": 23023 + }, + { + "epoch": 7.0669122160834865, + "grad_norm": 0.19679825007915497, + "learning_rate": 2.0918806905767337e-05, + "loss": 1.694, + "step": 23024 + }, + { + "epoch": 7.067219152854512, + "grad_norm": 0.1821403056383133, + "learning_rate": 2.0914763708506913e-05, + "loss": 1.7163, + "step": 23025 + }, + { + "epoch": 7.067526089625537, + "grad_norm": 0.17138415575027466, + "learning_rate": 2.0910720798683803e-05, + "loss": 1.6946, + "step": 23026 + }, + { + "epoch": 7.0678330263965625, + "grad_norm": 0.20219111442565918, + "learning_rate": 2.0906678176338017e-05, + "loss": 1.7437, + "step": 23027 + }, + { + "epoch": 7.068139963167588, + "grad_norm": 0.1985882669687271, + "learning_rate": 2.0902635841509494e-05, + "loss": 1.6762, + "step": 23028 + }, + { + "epoch": 7.068446899938612, + "grad_norm": 0.18586322665214539, + "learning_rate": 2.0898593794238174e-05, + "loss": 1.7296, + "step": 23029 + }, + { + "epoch": 7.068753836709638, + "grad_norm": 0.19222751259803772, + "learning_rate": 2.0894552034564013e-05, + "loss": 1.7186, + "step": 23030 + }, + { + "epoch": 7.069060773480663, + "grad_norm": 0.16107569634914398, + "learning_rate": 2.0890510562526944e-05, + "loss": 1.6898, + "step": 23031 + }, + { + "epoch": 7.069367710251688, + "grad_norm": 0.23859064280986786, + "learning_rate": 2.088646937816691e-05, + "loss": 1.7992, + "step": 23032 + }, 
+ { + "epoch": 7.069674647022714, + "grad_norm": 0.22927051782608032, + "learning_rate": 2.0882428481523853e-05, + "loss": 1.7162, + "step": 23033 + }, + { + "epoch": 7.069981583793738, + "grad_norm": 0.18094350397586823, + "learning_rate": 2.0878387872637684e-05, + "loss": 1.7297, + "step": 23034 + }, + { + "epoch": 7.070288520564763, + "grad_norm": 0.20562811195850372, + "learning_rate": 2.087434755154839e-05, + "loss": 1.7475, + "step": 23035 + }, + { + "epoch": 7.070595457335789, + "grad_norm": 0.18405984342098236, + "learning_rate": 2.087030751829583e-05, + "loss": 1.6954, + "step": 23036 + }, + { + "epoch": 7.070902394106814, + "grad_norm": 0.26286160945892334, + "learning_rate": 2.0866267772919994e-05, + "loss": 1.7406, + "step": 23037 + }, + { + "epoch": 7.071209330877839, + "grad_norm": 0.1688467413187027, + "learning_rate": 2.086222831546077e-05, + "loss": 1.7375, + "step": 23038 + }, + { + "epoch": 7.071516267648865, + "grad_norm": 0.25445011258125305, + "learning_rate": 2.0858189145958057e-05, + "loss": 1.7479, + "step": 23039 + }, + { + "epoch": 7.071823204419889, + "grad_norm": 0.20637978613376617, + "learning_rate": 2.085415026445184e-05, + "loss": 1.7653, + "step": 23040 + }, + { + "epoch": 7.0721301411909145, + "grad_norm": 0.21693937480449677, + "learning_rate": 2.0850111670981952e-05, + "loss": 1.7392, + "step": 23041 + }, + { + "epoch": 7.07243707796194, + "grad_norm": 0.1999017745256424, + "learning_rate": 2.0846073365588388e-05, + "loss": 1.753, + "step": 23042 + }, + { + "epoch": 7.072744014732965, + "grad_norm": 0.2271260917186737, + "learning_rate": 2.0842035348310973e-05, + "loss": 1.7136, + "step": 23043 + }, + { + "epoch": 7.0730509515039905, + "grad_norm": 0.1915169358253479, + "learning_rate": 2.0837997619189675e-05, + "loss": 1.7142, + "step": 23044 + }, + { + "epoch": 7.073357888275015, + "grad_norm": 0.2250204086303711, + "learning_rate": 2.0833960178264377e-05, + "loss": 1.8039, + "step": 23045 + }, + { + "epoch": 7.07366482504604, 
+ "grad_norm": 0.20920081436634064, + "learning_rate": 2.0829923025574976e-05, + "loss": 1.767, + "step": 23046 + }, + { + "epoch": 7.073971761817066, + "grad_norm": 0.16039173305034637, + "learning_rate": 2.082588616116138e-05, + "loss": 1.6895, + "step": 23047 + }, + { + "epoch": 7.074278698588091, + "grad_norm": 0.1849806159734726, + "learning_rate": 2.082184958506347e-05, + "loss": 1.7323, + "step": 23048 + }, + { + "epoch": 7.074585635359116, + "grad_norm": 0.22370420396327972, + "learning_rate": 2.081781329732115e-05, + "loss": 1.7478, + "step": 23049 + }, + { + "epoch": 7.074892572130141, + "grad_norm": 0.1600474864244461, + "learning_rate": 2.0813777297974296e-05, + "loss": 1.6754, + "step": 23050 + }, + { + "epoch": 7.075199508901166, + "grad_norm": 0.18357187509536743, + "learning_rate": 2.080974158706281e-05, + "loss": 1.694, + "step": 23051 + }, + { + "epoch": 7.0755064456721914, + "grad_norm": 0.17667005956172943, + "learning_rate": 2.080570616462656e-05, + "loss": 1.7053, + "step": 23052 + }, + { + "epoch": 7.075813382443217, + "grad_norm": 0.19393591582775116, + "learning_rate": 2.0801671030705417e-05, + "loss": 1.7917, + "step": 23053 + }, + { + "epoch": 7.076120319214242, + "grad_norm": 0.19432564079761505, + "learning_rate": 2.0797636185339307e-05, + "loss": 1.7276, + "step": 23054 + }, + { + "epoch": 7.0764272559852675, + "grad_norm": 0.17960594594478607, + "learning_rate": 2.079360162856806e-05, + "loss": 1.6988, + "step": 23055 + }, + { + "epoch": 7.076734192756292, + "grad_norm": 0.183505579829216, + "learning_rate": 2.0789567360431538e-05, + "loss": 1.7106, + "step": 23056 + }, + { + "epoch": 7.077041129527317, + "grad_norm": 0.27859750390052795, + "learning_rate": 2.0785533380969673e-05, + "loss": 1.779, + "step": 23057 + }, + { + "epoch": 7.077348066298343, + "grad_norm": 0.1903255134820938, + "learning_rate": 2.078149969022225e-05, + "loss": 1.7334, + "step": 23058 + }, + { + "epoch": 7.077655003069368, + "grad_norm": 0.2221076786518097, + 
"learning_rate": 2.0777466288229207e-05, + "loss": 1.6863, + "step": 23059 + }, + { + "epoch": 7.077961939840393, + "grad_norm": 0.15516065061092377, + "learning_rate": 2.0773433175030336e-05, + "loss": 1.6633, + "step": 23060 + }, + { + "epoch": 7.078268876611418, + "grad_norm": 0.20073910057544708, + "learning_rate": 2.0769400350665553e-05, + "loss": 1.7057, + "step": 23061 + }, + { + "epoch": 7.078575813382443, + "grad_norm": 0.1680205762386322, + "learning_rate": 2.076536781517468e-05, + "loss": 1.6659, + "step": 23062 + }, + { + "epoch": 7.078882750153468, + "grad_norm": 0.20825456082820892, + "learning_rate": 2.0761335568597584e-05, + "loss": 1.751, + "step": 23063 + }, + { + "epoch": 7.079189686924494, + "grad_norm": 0.17365674674510956, + "learning_rate": 2.0757303610974098e-05, + "loss": 1.6591, + "step": 23064 + }, + { + "epoch": 7.079496623695519, + "grad_norm": 0.21712929010391235, + "learning_rate": 2.0753271942344087e-05, + "loss": 1.7357, + "step": 23065 + }, + { + "epoch": 7.0798035604665435, + "grad_norm": 0.1841089278459549, + "learning_rate": 2.074924056274738e-05, + "loss": 1.6818, + "step": 23066 + }, + { + "epoch": 7.080110497237569, + "grad_norm": 0.20433486998081207, + "learning_rate": 2.074520947222382e-05, + "loss": 1.76, + "step": 23067 + }, + { + "epoch": 7.080417434008594, + "grad_norm": 0.1712963879108429, + "learning_rate": 2.074117867081325e-05, + "loss": 1.6426, + "step": 23068 + }, + { + "epoch": 7.0807243707796195, + "grad_norm": 0.19894109666347504, + "learning_rate": 2.0737148158555504e-05, + "loss": 1.7529, + "step": 23069 + }, + { + "epoch": 7.081031307550645, + "grad_norm": 0.19338269531726837, + "learning_rate": 2.0733117935490386e-05, + "loss": 1.8274, + "step": 23070 + }, + { + "epoch": 7.08133824432167, + "grad_norm": 0.20883139967918396, + "learning_rate": 2.0729088001657794e-05, + "loss": 1.7275, + "step": 23071 + }, + { + "epoch": 7.081645181092695, + "grad_norm": 0.18498694896697998, + "learning_rate": 
2.0725058357097487e-05, + "loss": 1.6648, + "step": 23072 + }, + { + "epoch": 7.08195211786372, + "grad_norm": 0.1727421134710312, + "learning_rate": 2.0721029001849313e-05, + "loss": 1.7709, + "step": 23073 + }, + { + "epoch": 7.082259054634745, + "grad_norm": 0.16965949535369873, + "learning_rate": 2.0716999935953096e-05, + "loss": 1.6876, + "step": 23074 + }, + { + "epoch": 7.082565991405771, + "grad_norm": 0.16905519366264343, + "learning_rate": 2.0712971159448623e-05, + "loss": 1.6576, + "step": 23075 + }, + { + "epoch": 7.082872928176796, + "grad_norm": 0.2863580882549286, + "learning_rate": 2.0708942672375776e-05, + "loss": 1.7631, + "step": 23076 + }, + { + "epoch": 7.08317986494782, + "grad_norm": 0.26248931884765625, + "learning_rate": 2.070491447477429e-05, + "loss": 1.7692, + "step": 23077 + }, + { + "epoch": 7.083486801718846, + "grad_norm": 0.17670878767967224, + "learning_rate": 2.0700886566684024e-05, + "loss": 1.6725, + "step": 23078 + }, + { + "epoch": 7.083793738489871, + "grad_norm": 0.19245800375938416, + "learning_rate": 2.0696858948144775e-05, + "loss": 1.7249, + "step": 23079 + }, + { + "epoch": 7.084100675260896, + "grad_norm": 0.18651939928531647, + "learning_rate": 2.0692831619196335e-05, + "loss": 1.7616, + "step": 23080 + }, + { + "epoch": 7.084407612031922, + "grad_norm": 0.21432510018348694, + "learning_rate": 2.0688804579878514e-05, + "loss": 1.743, + "step": 23081 + }, + { + "epoch": 7.084714548802946, + "grad_norm": 0.18530069291591644, + "learning_rate": 2.0684777830231106e-05, + "loss": 1.7257, + "step": 23082 + }, + { + "epoch": 7.0850214855739715, + "grad_norm": 0.1974172443151474, + "learning_rate": 2.0680751370293903e-05, + "loss": 1.6918, + "step": 23083 + }, + { + "epoch": 7.085328422344997, + "grad_norm": 0.19517268240451813, + "learning_rate": 2.0676725200106706e-05, + "loss": 1.7421, + "step": 23084 + }, + { + "epoch": 7.085635359116022, + "grad_norm": 0.28572699427604675, + "learning_rate": 2.067269931970929e-05, + 
"loss": 1.7575, + "step": 23085 + }, + { + "epoch": 7.0859422958870475, + "grad_norm": 0.2062397003173828, + "learning_rate": 2.0668673729141452e-05, + "loss": 1.7085, + "step": 23086 + }, + { + "epoch": 7.086249232658073, + "grad_norm": 0.21619725227355957, + "learning_rate": 2.0664648428442973e-05, + "loss": 1.7783, + "step": 23087 + }, + { + "epoch": 7.086556169429097, + "grad_norm": 0.2732481360435486, + "learning_rate": 2.066062341765363e-05, + "loss": 1.7089, + "step": 23088 + }, + { + "epoch": 7.086863106200123, + "grad_norm": 0.19897356629371643, + "learning_rate": 2.06565986968132e-05, + "loss": 1.6487, + "step": 23089 + }, + { + "epoch": 7.087170042971148, + "grad_norm": 0.2578796148300171, + "learning_rate": 2.0652574265961466e-05, + "loss": 1.7385, + "step": 23090 + }, + { + "epoch": 7.087476979742173, + "grad_norm": 0.18980316817760468, + "learning_rate": 2.0648550125138195e-05, + "loss": 1.6651, + "step": 23091 + }, + { + "epoch": 7.087783916513199, + "grad_norm": 0.279580682516098, + "learning_rate": 2.064452627438313e-05, + "loss": 1.7189, + "step": 23092 + }, + { + "epoch": 7.088090853284223, + "grad_norm": 0.18652775883674622, + "learning_rate": 2.0640502713736103e-05, + "loss": 1.7085, + "step": 23093 + }, + { + "epoch": 7.088397790055248, + "grad_norm": 0.2729358673095703, + "learning_rate": 2.06364794432368e-05, + "loss": 1.6812, + "step": 23094 + }, + { + "epoch": 7.088704726826274, + "grad_norm": 0.1756472885608673, + "learning_rate": 2.0632456462925053e-05, + "loss": 1.6835, + "step": 23095 + }, + { + "epoch": 7.089011663597299, + "grad_norm": 0.2352994978427887, + "learning_rate": 2.062843377284055e-05, + "loss": 1.6898, + "step": 23096 + }, + { + "epoch": 7.089318600368324, + "grad_norm": 0.20231495797634125, + "learning_rate": 2.0624411373023093e-05, + "loss": 1.7294, + "step": 23097 + }, + { + "epoch": 7.08962553713935, + "grad_norm": 0.276114821434021, + "learning_rate": 2.0620389263512424e-05, + "loss": 1.6864, + "step": 23098 + }, + { 
+ "epoch": 7.089932473910374, + "grad_norm": 0.2178632766008377, + "learning_rate": 2.0616367444348288e-05, + "loss": 1.7353, + "step": 23099 + }, + { + "epoch": 7.0902394106813995, + "grad_norm": 0.20966552197933197, + "learning_rate": 2.061234591557043e-05, + "loss": 1.6579, + "step": 23100 + }, + { + "epoch": 7.090546347452425, + "grad_norm": 0.16496559977531433, + "learning_rate": 2.0608324677218592e-05, + "loss": 1.7137, + "step": 23101 + }, + { + "epoch": 7.09085328422345, + "grad_norm": 0.19176827371120453, + "learning_rate": 2.0604303729332525e-05, + "loss": 1.6996, + "step": 23102 + }, + { + "epoch": 7.0911602209944755, + "grad_norm": 0.20933480560779572, + "learning_rate": 2.060028307195195e-05, + "loss": 1.7887, + "step": 23103 + }, + { + "epoch": 7.0914671577655, + "grad_norm": 0.1925809681415558, + "learning_rate": 2.0596262705116613e-05, + "loss": 1.6974, + "step": 23104 + }, + { + "epoch": 7.091774094536525, + "grad_norm": 0.1582585573196411, + "learning_rate": 2.0592242628866236e-05, + "loss": 1.6731, + "step": 23105 + }, + { + "epoch": 7.092081031307551, + "grad_norm": 0.20380592346191406, + "learning_rate": 2.058822284324056e-05, + "loss": 1.6911, + "step": 23106 + }, + { + "epoch": 7.092387968078576, + "grad_norm": 0.17984862625598907, + "learning_rate": 2.0584203348279307e-05, + "loss": 1.7218, + "step": 23107 + }, + { + "epoch": 7.092694904849601, + "grad_norm": 0.22097790241241455, + "learning_rate": 2.058018414402219e-05, + "loss": 1.7223, + "step": 23108 + }, + { + "epoch": 7.093001841620626, + "grad_norm": 0.20519912242889404, + "learning_rate": 2.0576165230508926e-05, + "loss": 1.7197, + "step": 23109 + }, + { + "epoch": 7.093308778391651, + "grad_norm": 0.2156807780265808, + "learning_rate": 2.0572146607779274e-05, + "loss": 1.7079, + "step": 23110 + }, + { + "epoch": 7.093615715162676, + "grad_norm": 0.21810726821422577, + "learning_rate": 2.056812827587288e-05, + "loss": 1.7456, + "step": 23111 + }, + { + "epoch": 7.093922651933702, + 
"grad_norm": 0.2288726568222046, + "learning_rate": 2.0564110234829536e-05, + "loss": 1.8113, + "step": 23112 + }, + { + "epoch": 7.094229588704727, + "grad_norm": 0.21279199421405792, + "learning_rate": 2.056009248468887e-05, + "loss": 1.7554, + "step": 23113 + }, + { + "epoch": 7.094536525475752, + "grad_norm": 0.18577606976032257, + "learning_rate": 2.055607502549064e-05, + "loss": 1.661, + "step": 23114 + }, + { + "epoch": 7.094843462246777, + "grad_norm": 0.17938728630542755, + "learning_rate": 2.0552057857274536e-05, + "loss": 1.6998, + "step": 23115 + }, + { + "epoch": 7.095150399017802, + "grad_norm": 0.1946432888507843, + "learning_rate": 2.0548040980080258e-05, + "loss": 1.7146, + "step": 23116 + }, + { + "epoch": 7.0954573357888275, + "grad_norm": 0.21220463514328003, + "learning_rate": 2.0544024393947496e-05, + "loss": 1.7345, + "step": 23117 + }, + { + "epoch": 7.095764272559853, + "grad_norm": 0.2006370723247528, + "learning_rate": 2.0540008098915954e-05, + "loss": 1.7636, + "step": 23118 + }, + { + "epoch": 7.096071209330878, + "grad_norm": 0.17251192033290863, + "learning_rate": 2.0535992095025312e-05, + "loss": 1.7103, + "step": 23119 + }, + { + "epoch": 7.096378146101903, + "grad_norm": 0.2393570840358734, + "learning_rate": 2.0531976382315277e-05, + "loss": 1.7636, + "step": 23120 + }, + { + "epoch": 7.096685082872928, + "grad_norm": 0.16999265551567078, + "learning_rate": 2.0527960960825516e-05, + "loss": 1.6571, + "step": 23121 + }, + { + "epoch": 7.096992019643953, + "grad_norm": 0.17626826465129852, + "learning_rate": 2.052394583059572e-05, + "loss": 1.713, + "step": 23122 + }, + { + "epoch": 7.097298956414979, + "grad_norm": 0.18373346328735352, + "learning_rate": 2.051993099166557e-05, + "loss": 1.7102, + "step": 23123 + }, + { + "epoch": 7.097605893186004, + "grad_norm": 0.1913219541311264, + "learning_rate": 2.0515916444074734e-05, + "loss": 1.7441, + "step": 23124 + }, + { + "epoch": 7.097912829957028, + "grad_norm": 0.19664399325847626, 
+ "learning_rate": 2.0511902187862903e-05, + "loss": 1.6866, + "step": 23125 + }, + { + "epoch": 7.098219766728054, + "grad_norm": 0.16524936258792877, + "learning_rate": 2.050788822306971e-05, + "loss": 1.6709, + "step": 23126 + }, + { + "epoch": 7.098526703499079, + "grad_norm": 0.19291190803050995, + "learning_rate": 2.050387454973489e-05, + "loss": 1.7033, + "step": 23127 + }, + { + "epoch": 7.098833640270104, + "grad_norm": 0.19915525615215302, + "learning_rate": 2.0499861167898037e-05, + "loss": 1.7425, + "step": 23128 + }, + { + "epoch": 7.09914057704113, + "grad_norm": 0.21295227110385895, + "learning_rate": 2.0495848077598883e-05, + "loss": 1.7516, + "step": 23129 + }, + { + "epoch": 7.099447513812155, + "grad_norm": 0.21469831466674805, + "learning_rate": 2.0491835278877014e-05, + "loss": 1.7129, + "step": 23130 + }, + { + "epoch": 7.0997544505831796, + "grad_norm": 0.16860374808311462, + "learning_rate": 2.0487822771772143e-05, + "loss": 1.7172, + "step": 23131 + }, + { + "epoch": 7.100061387354205, + "grad_norm": 0.22386015951633453, + "learning_rate": 2.04838105563239e-05, + "loss": 1.7829, + "step": 23132 + }, + { + "epoch": 7.10036832412523, + "grad_norm": 0.22635474801063538, + "learning_rate": 2.047979863257195e-05, + "loss": 1.6956, + "step": 23133 + }, + { + "epoch": 7.100675260896256, + "grad_norm": 0.20508790016174316, + "learning_rate": 2.0475787000555924e-05, + "loss": 1.7404, + "step": 23134 + }, + { + "epoch": 7.100982197667281, + "grad_norm": 0.2055993378162384, + "learning_rate": 2.047177566031548e-05, + "loss": 1.7064, + "step": 23135 + }, + { + "epoch": 7.101289134438305, + "grad_norm": 0.19258326292037964, + "learning_rate": 2.0467764611890254e-05, + "loss": 1.7078, + "step": 23136 + }, + { + "epoch": 7.101596071209331, + "grad_norm": 0.20766718685626984, + "learning_rate": 2.046375385531989e-05, + "loss": 1.6854, + "step": 23137 + }, + { + "epoch": 7.101903007980356, + "grad_norm": 0.17945602536201477, + "learning_rate": 
2.045974339064402e-05, + "loss": 1.6986, + "step": 23138 + }, + { + "epoch": 7.102209944751381, + "grad_norm": 0.17283397912979126, + "learning_rate": 2.045573321790228e-05, + "loss": 1.7296, + "step": 23139 + }, + { + "epoch": 7.102516881522407, + "grad_norm": 0.19000805914402008, + "learning_rate": 2.0451723337134298e-05, + "loss": 1.7005, + "step": 23140 + }, + { + "epoch": 7.102823818293431, + "grad_norm": 0.1966131180524826, + "learning_rate": 2.044771374837971e-05, + "loss": 1.7574, + "step": 23141 + }, + { + "epoch": 7.1031307550644565, + "grad_norm": 0.2411719709634781, + "learning_rate": 2.0443704451678137e-05, + "loss": 1.7599, + "step": 23142 + }, + { + "epoch": 7.103437691835482, + "grad_norm": 0.23902751505374908, + "learning_rate": 2.0439695447069173e-05, + "loss": 1.6805, + "step": 23143 + }, + { + "epoch": 7.103744628606507, + "grad_norm": 0.19117529690265656, + "learning_rate": 2.0435686734592508e-05, + "loss": 1.7482, + "step": 23144 + }, + { + "epoch": 7.1040515653775325, + "grad_norm": 0.18491674959659576, + "learning_rate": 2.0431678314287678e-05, + "loss": 1.6764, + "step": 23145 + }, + { + "epoch": 7.104358502148558, + "grad_norm": 0.21000699698925018, + "learning_rate": 2.042767018619437e-05, + "loss": 1.7185, + "step": 23146 + }, + { + "epoch": 7.104665438919582, + "grad_norm": 0.17373491823673248, + "learning_rate": 2.0423662350352117e-05, + "loss": 1.6945, + "step": 23147 + }, + { + "epoch": 7.104972375690608, + "grad_norm": 0.18387937545776367, + "learning_rate": 2.041965480680059e-05, + "loss": 1.766, + "step": 23148 + }, + { + "epoch": 7.105279312461633, + "grad_norm": 0.15976013243198395, + "learning_rate": 2.0415647555579376e-05, + "loss": 1.6446, + "step": 23149 + }, + { + "epoch": 7.105586249232658, + "grad_norm": 0.19251346588134766, + "learning_rate": 2.0411640596728066e-05, + "loss": 1.7122, + "step": 23150 + }, + { + "epoch": 7.105893186003684, + "grad_norm": 0.1640147864818573, + "learning_rate": 2.040763393028627e-05, + 
"loss": 1.7057, + "step": 23151 + }, + { + "epoch": 7.106200122774708, + "grad_norm": 0.20366166532039642, + "learning_rate": 2.0403627556293577e-05, + "loss": 1.7173, + "step": 23152 + }, + { + "epoch": 7.106507059545733, + "grad_norm": 0.18549348413944244, + "learning_rate": 2.039962147478958e-05, + "loss": 1.7215, + "step": 23153 + }, + { + "epoch": 7.106813996316759, + "grad_norm": 0.16964925825595856, + "learning_rate": 2.039561568581388e-05, + "loss": 1.6931, + "step": 23154 + }, + { + "epoch": 7.107120933087784, + "grad_norm": 0.16923274099826813, + "learning_rate": 2.0391610189406058e-05, + "loss": 1.6976, + "step": 23155 + }, + { + "epoch": 7.107427869858809, + "grad_norm": 0.17707234621047974, + "learning_rate": 2.038760498560569e-05, + "loss": 1.7102, + "step": 23156 + }, + { + "epoch": 7.107734806629834, + "grad_norm": 0.2048260122537613, + "learning_rate": 2.0383600074452376e-05, + "loss": 1.7116, + "step": 23157 + }, + { + "epoch": 7.108041743400859, + "grad_norm": 0.17328095436096191, + "learning_rate": 2.037959545598568e-05, + "loss": 1.6683, + "step": 23158 + }, + { + "epoch": 7.1083486801718845, + "grad_norm": 0.15829013288021088, + "learning_rate": 2.037559113024518e-05, + "loss": 1.6617, + "step": 23159 + }, + { + "epoch": 7.10865561694291, + "grad_norm": 0.21150968968868256, + "learning_rate": 2.037158709727044e-05, + "loss": 1.7057, + "step": 23160 + }, + { + "epoch": 7.108962553713935, + "grad_norm": 0.20321892201900482, + "learning_rate": 2.0367583357101072e-05, + "loss": 1.6811, + "step": 23161 + }, + { + "epoch": 7.1092694904849605, + "grad_norm": 0.19491781294345856, + "learning_rate": 2.0363579909776583e-05, + "loss": 1.6794, + "step": 23162 + }, + { + "epoch": 7.109576427255985, + "grad_norm": 0.155877947807312, + "learning_rate": 2.0359576755336594e-05, + "loss": 1.7434, + "step": 23163 + }, + { + "epoch": 7.10988336402701, + "grad_norm": 0.17822639644145966, + "learning_rate": 2.0355573893820613e-05, + "loss": 1.7029, + "step": 23164 
+ }, + { + "epoch": 7.110190300798036, + "grad_norm": 0.18152910470962524, + "learning_rate": 2.0351571325268242e-05, + "loss": 1.7277, + "step": 23165 + }, + { + "epoch": 7.110497237569061, + "grad_norm": 0.19928498566150665, + "learning_rate": 2.034756904971902e-05, + "loss": 1.7852, + "step": 23166 + }, + { + "epoch": 7.110804174340086, + "grad_norm": 0.19099318981170654, + "learning_rate": 2.0343567067212504e-05, + "loss": 1.7258, + "step": 23167 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.19800841808319092, + "learning_rate": 2.033956537778824e-05, + "loss": 1.7647, + "step": 23168 + }, + { + "epoch": 7.111418047882136, + "grad_norm": 0.20110327005386353, + "learning_rate": 2.0335563981485768e-05, + "loss": 1.7111, + "step": 23169 + }, + { + "epoch": 7.111724984653161, + "grad_norm": 0.1875200718641281, + "learning_rate": 2.0331562878344645e-05, + "loss": 1.7145, + "step": 23170 + }, + { + "epoch": 7.112031921424187, + "grad_norm": 0.17586658895015717, + "learning_rate": 2.032756206840441e-05, + "loss": 1.663, + "step": 23171 + }, + { + "epoch": 7.112338858195212, + "grad_norm": 0.1783432811498642, + "learning_rate": 2.032356155170459e-05, + "loss": 1.7146, + "step": 23172 + }, + { + "epoch": 7.112645794966237, + "grad_norm": 0.16075368225574493, + "learning_rate": 2.0319561328284737e-05, + "loss": 1.6414, + "step": 23173 + }, + { + "epoch": 7.112952731737262, + "grad_norm": 0.22822627425193787, + "learning_rate": 2.0315561398184367e-05, + "loss": 1.7363, + "step": 23174 + }, + { + "epoch": 7.113259668508287, + "grad_norm": 0.1882331818342209, + "learning_rate": 2.0311561761443026e-05, + "loss": 1.7384, + "step": 23175 + }, + { + "epoch": 7.1135666052793125, + "grad_norm": 0.21478623151779175, + "learning_rate": 2.0307562418100228e-05, + "loss": 1.7314, + "step": 23176 + }, + { + "epoch": 7.113873542050338, + "grad_norm": 0.18545235693454742, + "learning_rate": 2.0303563368195483e-05, + "loss": 1.7046, + "step": 23177 + }, + { + "epoch": 
7.114180478821363, + "grad_norm": 0.1965286284685135, + "learning_rate": 2.0299564611768367e-05, + "loss": 1.7423, + "step": 23178 + }, + { + "epoch": 7.114487415592388, + "grad_norm": 0.1679733693599701, + "learning_rate": 2.0295566148858332e-05, + "loss": 1.6861, + "step": 23179 + }, + { + "epoch": 7.114794352363413, + "grad_norm": 0.18930186331272125, + "learning_rate": 2.029156797950495e-05, + "loss": 1.6609, + "step": 23180 + }, + { + "epoch": 7.115101289134438, + "grad_norm": 0.20774266123771667, + "learning_rate": 2.0287570103747672e-05, + "loss": 1.6919, + "step": 23181 + }, + { + "epoch": 7.115408225905464, + "grad_norm": 0.1866706907749176, + "learning_rate": 2.028357252162606e-05, + "loss": 1.7385, + "step": 23182 + }, + { + "epoch": 7.115715162676489, + "grad_norm": 0.21728016436100006, + "learning_rate": 2.0279575233179605e-05, + "loss": 1.7574, + "step": 23183 + }, + { + "epoch": 7.116022099447513, + "grad_norm": 0.16665934026241302, + "learning_rate": 2.02755782384478e-05, + "loss": 1.7046, + "step": 23184 + }, + { + "epoch": 7.116329036218539, + "grad_norm": 0.17275744676589966, + "learning_rate": 2.027158153747016e-05, + "loss": 1.6914, + "step": 23185 + }, + { + "epoch": 7.116635972989564, + "grad_norm": 0.15803802013397217, + "learning_rate": 2.026758513028617e-05, + "loss": 1.6932, + "step": 23186 + }, + { + "epoch": 7.116942909760589, + "grad_norm": 0.17434535920619965, + "learning_rate": 2.0263589016935336e-05, + "loss": 1.6714, + "step": 23187 + }, + { + "epoch": 7.117249846531615, + "grad_norm": 0.18005578219890594, + "learning_rate": 2.025959319745714e-05, + "loss": 1.6728, + "step": 23188 + }, + { + "epoch": 7.11755678330264, + "grad_norm": 0.19545695185661316, + "learning_rate": 2.025559767189108e-05, + "loss": 1.7475, + "step": 23189 + }, + { + "epoch": 7.1178637200736645, + "grad_norm": 0.19226810336112976, + "learning_rate": 2.025160244027663e-05, + "loss": 1.7447, + "step": 23190 + }, + { + "epoch": 7.11817065684469, + "grad_norm": 
0.1682211458683014, + "learning_rate": 2.0247607502653286e-05, + "loss": 1.687, + "step": 23191 + }, + { + "epoch": 7.118477593615715, + "grad_norm": 0.1883849948644638, + "learning_rate": 2.0243612859060524e-05, + "loss": 1.7556, + "step": 23192 + }, + { + "epoch": 7.1187845303867405, + "grad_norm": 0.16668641567230225, + "learning_rate": 2.0239618509537817e-05, + "loss": 1.6683, + "step": 23193 + }, + { + "epoch": 7.119091467157766, + "grad_norm": 0.21448664367198944, + "learning_rate": 2.023562445412463e-05, + "loss": 1.709, + "step": 23194 + }, + { + "epoch": 7.11939840392879, + "grad_norm": 0.24347564578056335, + "learning_rate": 2.0231630692860476e-05, + "loss": 1.7775, + "step": 23195 + }, + { + "epoch": 7.119705340699816, + "grad_norm": 0.20289309322834015, + "learning_rate": 2.0227637225784767e-05, + "loss": 1.8258, + "step": 23196 + }, + { + "epoch": 7.120012277470841, + "grad_norm": 0.20075447857379913, + "learning_rate": 2.022364405293703e-05, + "loss": 1.686, + "step": 23197 + }, + { + "epoch": 7.120319214241866, + "grad_norm": 0.17129302024841309, + "learning_rate": 2.021965117435666e-05, + "loss": 1.6937, + "step": 23198 + }, + { + "epoch": 7.120626151012892, + "grad_norm": 0.222218856215477, + "learning_rate": 2.0215658590083164e-05, + "loss": 1.6812, + "step": 23199 + }, + { + "epoch": 7.120933087783916, + "grad_norm": 0.1955309957265854, + "learning_rate": 2.0211666300155996e-05, + "loss": 1.7652, + "step": 23200 + }, + { + "epoch": 7.121240024554941, + "grad_norm": 0.20479047298431396, + "learning_rate": 2.0207674304614595e-05, + "loss": 1.7393, + "step": 23201 + }, + { + "epoch": 7.121546961325967, + "grad_norm": 0.14726878702640533, + "learning_rate": 2.020368260349842e-05, + "loss": 1.6766, + "step": 23202 + }, + { + "epoch": 7.121853898096992, + "grad_norm": 0.19149260222911835, + "learning_rate": 2.0199691196846914e-05, + "loss": 1.7176, + "step": 23203 + }, + { + "epoch": 7.122160834868017, + "grad_norm": 0.17182055115699768, + 
"learning_rate": 2.019570008469953e-05, + "loss": 1.6828, + "step": 23204 + }, + { + "epoch": 7.122467771639043, + "grad_norm": 0.16044408082962036, + "learning_rate": 2.019170926709571e-05, + "loss": 1.6595, + "step": 23205 + }, + { + "epoch": 7.122774708410067, + "grad_norm": 0.21787980198860168, + "learning_rate": 2.0187718744074885e-05, + "loss": 1.7114, + "step": 23206 + }, + { + "epoch": 7.1230816451810925, + "grad_norm": 0.16959737241268158, + "learning_rate": 2.01837285156765e-05, + "loss": 1.7128, + "step": 23207 + }, + { + "epoch": 7.123388581952118, + "grad_norm": 0.28120318055152893, + "learning_rate": 2.0179738581939983e-05, + "loss": 1.8386, + "step": 23208 + }, + { + "epoch": 7.123695518723143, + "grad_norm": 0.19752691686153412, + "learning_rate": 2.017574894290477e-05, + "loss": 1.7123, + "step": 23209 + }, + { + "epoch": 7.1240024554941686, + "grad_norm": 0.19860398769378662, + "learning_rate": 2.0171759598610286e-05, + "loss": 1.7041, + "step": 23210 + }, + { + "epoch": 7.124309392265193, + "grad_norm": 0.17429523169994354, + "learning_rate": 2.0167770549095937e-05, + "loss": 1.6963, + "step": 23211 + }, + { + "epoch": 7.124616329036218, + "grad_norm": 0.27635815739631653, + "learning_rate": 2.01637817944012e-05, + "loss": 1.8261, + "step": 23212 + }, + { + "epoch": 7.124923265807244, + "grad_norm": 0.17512556910514832, + "learning_rate": 2.0159793334565424e-05, + "loss": 1.7311, + "step": 23213 + }, + { + "epoch": 7.125230202578269, + "grad_norm": 0.1964988112449646, + "learning_rate": 2.01558051696281e-05, + "loss": 1.6829, + "step": 23214 + }, + { + "epoch": 7.125537139349294, + "grad_norm": 0.20796819031238556, + "learning_rate": 2.0151817299628563e-05, + "loss": 1.7084, + "step": 23215 + }, + { + "epoch": 7.12584407612032, + "grad_norm": 0.19875051081180573, + "learning_rate": 2.0147829724606278e-05, + "loss": 1.7197, + "step": 23216 + }, + { + "epoch": 7.126151012891344, + "grad_norm": 0.22590650618076324, + "learning_rate": 
2.0143842444600635e-05, + "loss": 1.7923, + "step": 23217 + }, + { + "epoch": 7.1264579496623695, + "grad_norm": 0.19106422364711761, + "learning_rate": 2.0139855459651042e-05, + "loss": 1.7096, + "step": 23218 + }, + { + "epoch": 7.126764886433395, + "grad_norm": 0.2105991542339325, + "learning_rate": 2.01358687697969e-05, + "loss": 1.6836, + "step": 23219 + }, + { + "epoch": 7.12707182320442, + "grad_norm": 0.18826960027217865, + "learning_rate": 2.013188237507761e-05, + "loss": 1.7347, + "step": 23220 + }, + { + "epoch": 7.1273787599754455, + "grad_norm": 0.1865578591823578, + "learning_rate": 2.012789627553256e-05, + "loss": 1.7115, + "step": 23221 + }, + { + "epoch": 7.12768569674647, + "grad_norm": 0.18389549851417542, + "learning_rate": 2.0123910471201145e-05, + "loss": 1.6817, + "step": 23222 + }, + { + "epoch": 7.127992633517495, + "grad_norm": 0.18351595103740692, + "learning_rate": 2.0119924962122766e-05, + "loss": 1.6898, + "step": 23223 + }, + { + "epoch": 7.128299570288521, + "grad_norm": 0.1913219839334488, + "learning_rate": 2.01159397483368e-05, + "loss": 1.7536, + "step": 23224 + }, + { + "epoch": 7.128606507059546, + "grad_norm": 0.17707225680351257, + "learning_rate": 2.0111954829882628e-05, + "loss": 1.6894, + "step": 23225 + }, + { + "epoch": 7.128913443830571, + "grad_norm": 0.17774651944637299, + "learning_rate": 2.0107970206799637e-05, + "loss": 1.6599, + "step": 23226 + }, + { + "epoch": 7.129220380601596, + "grad_norm": 0.14530350267887115, + "learning_rate": 2.0103985879127207e-05, + "loss": 1.6264, + "step": 23227 + }, + { + "epoch": 7.129527317372621, + "grad_norm": 0.15673531591892242, + "learning_rate": 2.010000184690471e-05, + "loss": 1.6577, + "step": 23228 + }, + { + "epoch": 7.129834254143646, + "grad_norm": 0.20691752433776855, + "learning_rate": 2.009601811017152e-05, + "loss": 1.7129, + "step": 23229 + }, + { + "epoch": 7.130141190914672, + "grad_norm": 0.16686022281646729, + "learning_rate": 2.0092034668966987e-05, + "loss": 
1.6738, + "step": 23230 + }, + { + "epoch": 7.130448127685697, + "grad_norm": 0.17799030244350433, + "learning_rate": 2.0088051523330536e-05, + "loss": 1.7312, + "step": 23231 + }, + { + "epoch": 7.1307550644567215, + "grad_norm": 0.16749511659145355, + "learning_rate": 2.0084068673301454e-05, + "loss": 1.6616, + "step": 23232 + }, + { + "epoch": 7.131062001227747, + "grad_norm": 0.18347670137882233, + "learning_rate": 2.0080086118919156e-05, + "loss": 1.6622, + "step": 23233 + }, + { + "epoch": 7.131368937998772, + "grad_norm": 0.19747060537338257, + "learning_rate": 2.007610386022299e-05, + "loss": 1.7341, + "step": 23234 + }, + { + "epoch": 7.1316758747697975, + "grad_norm": 0.21067634224891663, + "learning_rate": 2.0072121897252295e-05, + "loss": 1.7252, + "step": 23235 + }, + { + "epoch": 7.131982811540823, + "grad_norm": 0.2095600962638855, + "learning_rate": 2.006814023004644e-05, + "loss": 1.7769, + "step": 23236 + }, + { + "epoch": 7.132289748311848, + "grad_norm": 0.23090791702270508, + "learning_rate": 2.0064158858644765e-05, + "loss": 1.7734, + "step": 23237 + }, + { + "epoch": 7.132596685082873, + "grad_norm": 0.19060610234737396, + "learning_rate": 2.0060177783086614e-05, + "loss": 1.7209, + "step": 23238 + }, + { + "epoch": 7.132903621853898, + "grad_norm": 0.18050087988376617, + "learning_rate": 2.0056197003411342e-05, + "loss": 1.6882, + "step": 23239 + }, + { + "epoch": 7.133210558624923, + "grad_norm": 0.1504158228635788, + "learning_rate": 2.005221651965828e-05, + "loss": 1.687, + "step": 23240 + }, + { + "epoch": 7.133517495395949, + "grad_norm": 0.22980810701847076, + "learning_rate": 2.004823633186676e-05, + "loss": 1.7254, + "step": 23241 + }, + { + "epoch": 7.133824432166974, + "grad_norm": 0.20092199742794037, + "learning_rate": 2.004425644007613e-05, + "loss": 1.7234, + "step": 23242 + }, + { + "epoch": 7.134131368937998, + "grad_norm": 0.21002927422523499, + "learning_rate": 2.0040276844325718e-05, + "loss": 1.7272, + "step": 23243 + }, 
+ { + "epoch": 7.134438305709024, + "grad_norm": 0.18524625897407532, + "learning_rate": 2.003629754465484e-05, + "loss": 1.7189, + "step": 23244 + }, + { + "epoch": 7.134745242480049, + "grad_norm": 0.21095192432403564, + "learning_rate": 2.0032318541102845e-05, + "loss": 1.7177, + "step": 23245 + }, + { + "epoch": 7.135052179251074, + "grad_norm": 0.1700662076473236, + "learning_rate": 2.0028339833709037e-05, + "loss": 1.6925, + "step": 23246 + }, + { + "epoch": 7.1353591160221, + "grad_norm": 0.2123938947916031, + "learning_rate": 2.002436142251272e-05, + "loss": 1.7623, + "step": 23247 + }, + { + "epoch": 7.135666052793125, + "grad_norm": 0.194299578666687, + "learning_rate": 2.0020383307553275e-05, + "loss": 1.6898, + "step": 23248 + }, + { + "epoch": 7.1359729895641495, + "grad_norm": 0.18740688264369965, + "learning_rate": 2.001640548886993e-05, + "loss": 1.6519, + "step": 23249 + }, + { + "epoch": 7.136279926335175, + "grad_norm": 0.18891027569770813, + "learning_rate": 2.0012427966502085e-05, + "loss": 1.6895, + "step": 23250 + }, + { + "epoch": 7.1365868631062, + "grad_norm": 0.21313735842704773, + "learning_rate": 2.000845074048896e-05, + "loss": 1.6829, + "step": 23251 + }, + { + "epoch": 7.1368937998772255, + "grad_norm": 0.2438332885503769, + "learning_rate": 2.0004473810869923e-05, + "loss": 1.7723, + "step": 23252 + }, + { + "epoch": 7.137200736648251, + "grad_norm": 0.24475115537643433, + "learning_rate": 2.0000497177684257e-05, + "loss": 1.7192, + "step": 23253 + }, + { + "epoch": 7.137507673419275, + "grad_norm": 0.1936563402414322, + "learning_rate": 1.9996520840971267e-05, + "loss": 1.7462, + "step": 23254 + }, + { + "epoch": 7.137814610190301, + "grad_norm": 0.22365616261959076, + "learning_rate": 1.9992544800770236e-05, + "loss": 1.7405, + "step": 23255 + }, + { + "epoch": 7.138121546961326, + "grad_norm": 0.191316619515419, + "learning_rate": 1.9988569057120472e-05, + "loss": 1.6466, + "step": 23256 + }, + { + "epoch": 7.138428483732351, + 
"grad_norm": 0.24758055806159973, + "learning_rate": 1.9984593610061253e-05, + "loss": 1.7689, + "step": 23257 + }, + { + "epoch": 7.138735420503377, + "grad_norm": 0.2144414782524109, + "learning_rate": 1.9980618459631874e-05, + "loss": 1.7158, + "step": 23258 + }, + { + "epoch": 7.139042357274401, + "grad_norm": 0.24254034459590912, + "learning_rate": 1.9976643605871614e-05, + "loss": 1.7998, + "step": 23259 + }, + { + "epoch": 7.139349294045426, + "grad_norm": 0.21013480424880981, + "learning_rate": 1.9972669048819765e-05, + "loss": 1.7231, + "step": 23260 + }, + { + "epoch": 7.139656230816452, + "grad_norm": 0.2169421911239624, + "learning_rate": 1.9968694788515603e-05, + "loss": 1.7182, + "step": 23261 + }, + { + "epoch": 7.139963167587477, + "grad_norm": 0.19591476023197174, + "learning_rate": 1.9964720824998395e-05, + "loss": 1.7114, + "step": 23262 + }, + { + "epoch": 7.140270104358502, + "grad_norm": 0.1775221824645996, + "learning_rate": 1.9960747158307417e-05, + "loss": 1.6754, + "step": 23263 + }, + { + "epoch": 7.140577041129528, + "grad_norm": 0.19318300485610962, + "learning_rate": 1.995677378848193e-05, + "loss": 1.6794, + "step": 23264 + }, + { + "epoch": 7.140883977900552, + "grad_norm": 0.19659662246704102, + "learning_rate": 1.995280071556125e-05, + "loss": 1.703, + "step": 23265 + }, + { + "epoch": 7.1411909146715775, + "grad_norm": 0.22100697457790375, + "learning_rate": 1.994882793958457e-05, + "loss": 1.6821, + "step": 23266 + }, + { + "epoch": 7.141497851442603, + "grad_norm": 0.20475365221500397, + "learning_rate": 1.9944855460591217e-05, + "loss": 1.727, + "step": 23267 + }, + { + "epoch": 7.141804788213628, + "grad_norm": 0.2202025055885315, + "learning_rate": 1.9940883278620383e-05, + "loss": 1.7248, + "step": 23268 + }, + { + "epoch": 7.1421117249846535, + "grad_norm": 0.1800462007522583, + "learning_rate": 1.993691139371138e-05, + "loss": 1.7276, + "step": 23269 + }, + { + "epoch": 7.142418661755678, + "grad_norm": 0.2896895110607147, 
+ "learning_rate": 1.9932939805903433e-05, + "loss": 1.7275, + "step": 23270 + }, + { + "epoch": 7.142725598526703, + "grad_norm": 0.21308782696723938, + "learning_rate": 1.99289685152358e-05, + "loss": 1.6645, + "step": 23271 + }, + { + "epoch": 7.143032535297729, + "grad_norm": 0.20210005342960358, + "learning_rate": 1.992499752174773e-05, + "loss": 1.6899, + "step": 23272 + }, + { + "epoch": 7.143339472068754, + "grad_norm": 0.18419797718524933, + "learning_rate": 1.9921026825478455e-05, + "loss": 1.7088, + "step": 23273 + }, + { + "epoch": 7.143646408839779, + "grad_norm": 0.19155149161815643, + "learning_rate": 1.9917056426467227e-05, + "loss": 1.719, + "step": 23274 + }, + { + "epoch": 7.143953345610804, + "grad_norm": 0.17220313847064972, + "learning_rate": 1.9913086324753278e-05, + "loss": 1.7408, + "step": 23275 + }, + { + "epoch": 7.144260282381829, + "grad_norm": 0.18474969267845154, + "learning_rate": 1.990911652037585e-05, + "loss": 1.7189, + "step": 23276 + }, + { + "epoch": 7.144567219152854, + "grad_norm": 0.18529154360294342, + "learning_rate": 1.9905147013374165e-05, + "loss": 1.7075, + "step": 23277 + }, + { + "epoch": 7.14487415592388, + "grad_norm": 0.18569569289684296, + "learning_rate": 1.9901177803787452e-05, + "loss": 1.7116, + "step": 23278 + }, + { + "epoch": 7.145181092694905, + "grad_norm": 0.17149175703525543, + "learning_rate": 1.9897208891654946e-05, + "loss": 1.6873, + "step": 23279 + }, + { + "epoch": 7.14548802946593, + "grad_norm": 0.18012240529060364, + "learning_rate": 1.9893240277015868e-05, + "loss": 1.709, + "step": 23280 + }, + { + "epoch": 7.145794966236955, + "grad_norm": 0.18372172117233276, + "learning_rate": 1.9889271959909412e-05, + "loss": 1.7134, + "step": 23281 + }, + { + "epoch": 7.14610190300798, + "grad_norm": 0.20667128264904022, + "learning_rate": 1.9885303940374856e-05, + "loss": 1.7452, + "step": 23282 + }, + { + "epoch": 7.1464088397790055, + "grad_norm": 0.18145184218883514, + "learning_rate": 
1.9881336218451346e-05, + "loss": 1.7358, + "step": 23283 + }, + { + "epoch": 7.146715776550031, + "grad_norm": 0.179911807179451, + "learning_rate": 1.987736879417816e-05, + "loss": 1.6698, + "step": 23284 + }, + { + "epoch": 7.147022713321056, + "grad_norm": 0.18944865465164185, + "learning_rate": 1.9873401667594426e-05, + "loss": 1.7725, + "step": 23285 + }, + { + "epoch": 7.147329650092081, + "grad_norm": 0.1926117241382599, + "learning_rate": 1.986943483873942e-05, + "loss": 1.7829, + "step": 23286 + }, + { + "epoch": 7.147636586863106, + "grad_norm": 0.330503910779953, + "learning_rate": 1.9865468307652318e-05, + "loss": 1.7408, + "step": 23287 + }, + { + "epoch": 7.147943523634131, + "grad_norm": 0.22677597403526306, + "learning_rate": 1.9861502074372324e-05, + "loss": 1.7013, + "step": 23288 + }, + { + "epoch": 7.148250460405157, + "grad_norm": 0.1859201192855835, + "learning_rate": 1.9857536138938627e-05, + "loss": 1.7215, + "step": 23289 + }, + { + "epoch": 7.148557397176182, + "grad_norm": 0.22151269018650055, + "learning_rate": 1.9853570501390427e-05, + "loss": 1.6781, + "step": 23290 + }, + { + "epoch": 7.148864333947207, + "grad_norm": 0.16455405950546265, + "learning_rate": 1.984960516176691e-05, + "loss": 1.6518, + "step": 23291 + }, + { + "epoch": 7.149171270718232, + "grad_norm": 0.19687162339687347, + "learning_rate": 1.9845640120107267e-05, + "loss": 1.7375, + "step": 23292 + }, + { + "epoch": 7.149478207489257, + "grad_norm": 0.19174890220165253, + "learning_rate": 1.9841675376450686e-05, + "loss": 1.7017, + "step": 23293 + }, + { + "epoch": 7.149785144260282, + "grad_norm": 0.18458877503871918, + "learning_rate": 1.983771093083634e-05, + "loss": 1.7256, + "step": 23294 + }, + { + "epoch": 7.150092081031308, + "grad_norm": 0.212035670876503, + "learning_rate": 1.983374678330342e-05, + "loss": 1.698, + "step": 23295 + }, + { + "epoch": 7.150399017802333, + "grad_norm": 0.1793123185634613, + "learning_rate": 1.982978293389109e-05, + "loss": 
1.7012, + "step": 23296 + }, + { + "epoch": 7.150705954573358, + "grad_norm": 0.2359405905008316, + "learning_rate": 1.9825819382638526e-05, + "loss": 1.7423, + "step": 23297 + }, + { + "epoch": 7.151012891344383, + "grad_norm": 0.17125526070594788, + "learning_rate": 1.9821856129584888e-05, + "loss": 1.6825, + "step": 23298 + }, + { + "epoch": 7.151319828115408, + "grad_norm": 0.2084828019142151, + "learning_rate": 1.9817893174769392e-05, + "loss": 1.6991, + "step": 23299 + }, + { + "epoch": 7.151626764886434, + "grad_norm": 0.27647483348846436, + "learning_rate": 1.9813930518231127e-05, + "loss": 1.7425, + "step": 23300 + }, + { + "epoch": 7.151933701657459, + "grad_norm": 0.23517926037311554, + "learning_rate": 1.980996816000933e-05, + "loss": 1.8411, + "step": 23301 + }, + { + "epoch": 7.152240638428483, + "grad_norm": 0.19960010051727295, + "learning_rate": 1.980600610014309e-05, + "loss": 1.7302, + "step": 23302 + }, + { + "epoch": 7.152547575199509, + "grad_norm": 0.18953165411949158, + "learning_rate": 1.9802044338671604e-05, + "loss": 1.7252, + "step": 23303 + }, + { + "epoch": 7.152854511970534, + "grad_norm": 0.1718905121088028, + "learning_rate": 1.979808287563402e-05, + "loss": 1.656, + "step": 23304 + }, + { + "epoch": 7.153161448741559, + "grad_norm": 0.17233465611934662, + "learning_rate": 1.9794121711069487e-05, + "loss": 1.6732, + "step": 23305 + }, + { + "epoch": 7.153468385512585, + "grad_norm": 0.17677003145217896, + "learning_rate": 1.979016084501714e-05, + "loss": 1.7266, + "step": 23306 + }, + { + "epoch": 7.153775322283609, + "grad_norm": 0.1815326064825058, + "learning_rate": 1.9786200277516136e-05, + "loss": 1.7029, + "step": 23307 + }, + { + "epoch": 7.1540822590546345, + "grad_norm": 0.20937341451644897, + "learning_rate": 1.978224000860561e-05, + "loss": 1.711, + "step": 23308 + }, + { + "epoch": 7.15438919582566, + "grad_norm": 0.2045155018568039, + "learning_rate": 1.97782800383247e-05, + "loss": 1.7557, + "step": 23309 + }, + { + 
"epoch": 7.154696132596685, + "grad_norm": 0.16426041722297668, + "learning_rate": 1.9774320366712533e-05, + "loss": 1.7373, + "step": 23310 + }, + { + "epoch": 7.1550030693677105, + "grad_norm": 0.18058224022388458, + "learning_rate": 1.977036099380825e-05, + "loss": 1.6957, + "step": 23311 + }, + { + "epoch": 7.155310006138736, + "grad_norm": 0.23552078008651733, + "learning_rate": 1.9766401919650983e-05, + "loss": 1.8032, + "step": 23312 + }, + { + "epoch": 7.15561694290976, + "grad_norm": 0.19097596406936646, + "learning_rate": 1.9762443144279852e-05, + "loss": 1.7447, + "step": 23313 + }, + { + "epoch": 7.155923879680786, + "grad_norm": 0.17892403900623322, + "learning_rate": 1.975848466773398e-05, + "loss": 1.7117, + "step": 23314 + }, + { + "epoch": 7.156230816451811, + "grad_norm": 0.18331217765808105, + "learning_rate": 1.9754526490052467e-05, + "loss": 1.6669, + "step": 23315 + }, + { + "epoch": 7.156537753222836, + "grad_norm": 0.19914311170578003, + "learning_rate": 1.975056861127449e-05, + "loss": 1.6731, + "step": 23316 + }, + { + "epoch": 7.156844689993862, + "grad_norm": 0.21710485219955444, + "learning_rate": 1.9746611031439083e-05, + "loss": 1.7214, + "step": 23317 + }, + { + "epoch": 7.157151626764886, + "grad_norm": 0.19703111052513123, + "learning_rate": 1.9742653750585437e-05, + "loss": 1.7185, + "step": 23318 + }, + { + "epoch": 7.157458563535911, + "grad_norm": 0.18581365048885345, + "learning_rate": 1.9738696768752585e-05, + "loss": 1.7113, + "step": 23319 + }, + { + "epoch": 7.157765500306937, + "grad_norm": 0.1703677624464035, + "learning_rate": 1.9734740085979687e-05, + "loss": 1.6755, + "step": 23320 + }, + { + "epoch": 7.158072437077962, + "grad_norm": 0.16760937869548798, + "learning_rate": 1.9730783702305826e-05, + "loss": 1.7082, + "step": 23321 + }, + { + "epoch": 7.158379373848987, + "grad_norm": 0.20183983445167542, + "learning_rate": 1.97268276177701e-05, + "loss": 1.7503, + "step": 23322 + }, + { + "epoch": 7.158686310620013, + 
"grad_norm": 0.18407952785491943, + "learning_rate": 1.972287183241163e-05, + "loss": 1.6807, + "step": 23323 + }, + { + "epoch": 7.158993247391037, + "grad_norm": 0.20135276019573212, + "learning_rate": 1.9718916346269446e-05, + "loss": 1.8001, + "step": 23324 + }, + { + "epoch": 7.1593001841620625, + "grad_norm": 0.1781267672777176, + "learning_rate": 1.9714961159382693e-05, + "loss": 1.683, + "step": 23325 + }, + { + "epoch": 7.159607120933088, + "grad_norm": 0.24990373849868774, + "learning_rate": 1.971100627179045e-05, + "loss": 1.7235, + "step": 23326 + }, + { + "epoch": 7.159914057704113, + "grad_norm": 0.19463174045085907, + "learning_rate": 1.9707051683531796e-05, + "loss": 1.735, + "step": 23327 + }, + { + "epoch": 7.1602209944751385, + "grad_norm": 0.1988895982503891, + "learning_rate": 1.9703097394645813e-05, + "loss": 1.7495, + "step": 23328 + }, + { + "epoch": 7.160527931246163, + "grad_norm": 0.1760931760072708, + "learning_rate": 1.9699143405171576e-05, + "loss": 1.6914, + "step": 23329 + }, + { + "epoch": 7.160834868017188, + "grad_norm": 0.18537557125091553, + "learning_rate": 1.9695189715148166e-05, + "loss": 1.7601, + "step": 23330 + }, + { + "epoch": 7.161141804788214, + "grad_norm": 0.2476375252008438, + "learning_rate": 1.9691236324614654e-05, + "loss": 1.8218, + "step": 23331 + }, + { + "epoch": 7.161448741559239, + "grad_norm": 0.17736093699932098, + "learning_rate": 1.968728323361009e-05, + "loss": 1.6872, + "step": 23332 + }, + { + "epoch": 7.161755678330264, + "grad_norm": 0.1851162612438202, + "learning_rate": 1.9683330442173598e-05, + "loss": 1.712, + "step": 23333 + }, + { + "epoch": 7.162062615101289, + "grad_norm": 0.20326650142669678, + "learning_rate": 1.967937795034417e-05, + "loss": 1.7668, + "step": 23334 + }, + { + "epoch": 7.162369551872314, + "grad_norm": 0.21020451188087463, + "learning_rate": 1.9675425758160925e-05, + "loss": 1.7135, + "step": 23335 + }, + { + "epoch": 7.162676488643339, + "grad_norm": 0.21629111468791962, 
+ "learning_rate": 1.967147386566287e-05, + "loss": 1.7181, + "step": 23336 + }, + { + "epoch": 7.162983425414365, + "grad_norm": 0.18086732923984528, + "learning_rate": 1.9667522272889104e-05, + "loss": 1.7107, + "step": 23337 + }, + { + "epoch": 7.16329036218539, + "grad_norm": 0.16542381048202515, + "learning_rate": 1.9663570979878658e-05, + "loss": 1.7156, + "step": 23338 + }, + { + "epoch": 7.163597298956415, + "grad_norm": 0.18775032460689545, + "learning_rate": 1.9659619986670587e-05, + "loss": 1.6955, + "step": 23339 + }, + { + "epoch": 7.16390423572744, + "grad_norm": 0.19227592647075653, + "learning_rate": 1.9655669293303953e-05, + "loss": 1.7545, + "step": 23340 + }, + { + "epoch": 7.164211172498465, + "grad_norm": 0.1935085654258728, + "learning_rate": 1.9651718899817746e-05, + "loss": 1.7183, + "step": 23341 + }, + { + "epoch": 7.1645181092694905, + "grad_norm": 0.17873792350292206, + "learning_rate": 1.9647768806251056e-05, + "loss": 1.6644, + "step": 23342 + }, + { + "epoch": 7.164825046040516, + "grad_norm": 0.25024256110191345, + "learning_rate": 1.96438190126429e-05, + "loss": 1.7621, + "step": 23343 + }, + { + "epoch": 7.165131982811541, + "grad_norm": 0.15957331657409668, + "learning_rate": 1.9639869519032323e-05, + "loss": 1.6525, + "step": 23344 + }, + { + "epoch": 7.165438919582566, + "grad_norm": 0.19967027008533478, + "learning_rate": 1.9635920325458347e-05, + "loss": 1.7533, + "step": 23345 + }, + { + "epoch": 7.165745856353591, + "grad_norm": 0.17413713037967682, + "learning_rate": 1.9631971431960005e-05, + "loss": 1.6962, + "step": 23346 + }, + { + "epoch": 7.166052793124616, + "grad_norm": 0.19787384569644928, + "learning_rate": 1.9628022838576315e-05, + "loss": 1.7369, + "step": 23347 + }, + { + "epoch": 7.166359729895642, + "grad_norm": 0.1726577877998352, + "learning_rate": 1.962407454534631e-05, + "loss": 1.7004, + "step": 23348 + }, + { + "epoch": 7.166666666666667, + "grad_norm": 0.2136315256357193, + "learning_rate": 
1.962012655230899e-05, + "loss": 1.7411, + "step": 23349 + }, + { + "epoch": 7.166973603437691, + "grad_norm": 0.18257126212120056, + "learning_rate": 1.9616178859503414e-05, + "loss": 1.7155, + "step": 23350 + }, + { + "epoch": 7.167280540208717, + "grad_norm": 0.18696577847003937, + "learning_rate": 1.961223146696854e-05, + "loss": 1.7272, + "step": 23351 + }, + { + "epoch": 7.167587476979742, + "grad_norm": 0.16375793516635895, + "learning_rate": 1.9608284374743435e-05, + "loss": 1.6706, + "step": 23352 + }, + { + "epoch": 7.167894413750767, + "grad_norm": 0.19589200615882874, + "learning_rate": 1.960433758286704e-05, + "loss": 1.7018, + "step": 23353 + }, + { + "epoch": 7.168201350521793, + "grad_norm": 0.18434208631515503, + "learning_rate": 1.9600391091378417e-05, + "loss": 1.6776, + "step": 23354 + }, + { + "epoch": 7.168508287292818, + "grad_norm": 0.23839476704597473, + "learning_rate": 1.9596444900316545e-05, + "loss": 1.7501, + "step": 23355 + }, + { + "epoch": 7.1688152240638425, + "grad_norm": 0.20229686796665192, + "learning_rate": 1.9592499009720428e-05, + "loss": 1.7249, + "step": 23356 + }, + { + "epoch": 7.169122160834868, + "grad_norm": 0.2422642856836319, + "learning_rate": 1.9588553419629076e-05, + "loss": 1.7621, + "step": 23357 + }, + { + "epoch": 7.169429097605893, + "grad_norm": 0.21856555342674255, + "learning_rate": 1.9584608130081422e-05, + "loss": 1.7362, + "step": 23358 + }, + { + "epoch": 7.1697360343769185, + "grad_norm": 0.19434040784835815, + "learning_rate": 1.958066314111652e-05, + "loss": 1.6888, + "step": 23359 + }, + { + "epoch": 7.170042971147944, + "grad_norm": 0.19806630909442902, + "learning_rate": 1.9576718452773335e-05, + "loss": 1.7461, + "step": 23360 + }, + { + "epoch": 7.170349907918968, + "grad_norm": 0.19190531969070435, + "learning_rate": 1.957277406509085e-05, + "loss": 1.6992, + "step": 23361 + }, + { + "epoch": 7.170656844689994, + "grad_norm": 0.20990152657032013, + "learning_rate": 1.9568829978108044e-05, + 
"loss": 1.7095, + "step": 23362 + }, + { + "epoch": 7.170963781461019, + "grad_norm": 0.18638263642787933, + "learning_rate": 1.9564886191863897e-05, + "loss": 1.7024, + "step": 23363 + }, + { + "epoch": 7.171270718232044, + "grad_norm": 0.1974666863679886, + "learning_rate": 1.9560942706397383e-05, + "loss": 1.6901, + "step": 23364 + }, + { + "epoch": 7.17157765500307, + "grad_norm": 0.171469047665596, + "learning_rate": 1.955699952174747e-05, + "loss": 1.717, + "step": 23365 + }, + { + "epoch": 7.171884591774095, + "grad_norm": 0.17386725544929504, + "learning_rate": 1.955305663795312e-05, + "loss": 1.7069, + "step": 23366 + }, + { + "epoch": 7.172191528545119, + "grad_norm": 0.1869814246892929, + "learning_rate": 1.954911405505334e-05, + "loss": 1.7478, + "step": 23367 + }, + { + "epoch": 7.172498465316145, + "grad_norm": 0.19253556430339813, + "learning_rate": 1.9545171773087033e-05, + "loss": 1.7129, + "step": 23368 + }, + { + "epoch": 7.17280540208717, + "grad_norm": 0.1625998616218567, + "learning_rate": 1.954122979209322e-05, + "loss": 1.7055, + "step": 23369 + }, + { + "epoch": 7.173112338858195, + "grad_norm": 0.172325998544693, + "learning_rate": 1.953728811211079e-05, + "loss": 1.71, + "step": 23370 + }, + { + "epoch": 7.173419275629221, + "grad_norm": 0.22542965412139893, + "learning_rate": 1.9533346733178753e-05, + "loss": 1.7548, + "step": 23371 + }, + { + "epoch": 7.173726212400245, + "grad_norm": 0.1547299474477768, + "learning_rate": 1.9529405655336042e-05, + "loss": 1.6509, + "step": 23372 + }, + { + "epoch": 7.1740331491712706, + "grad_norm": 0.21720515191555023, + "learning_rate": 1.95254648786216e-05, + "loss": 1.7427, + "step": 23373 + }, + { + "epoch": 7.174340085942296, + "grad_norm": 0.18855944275856018, + "learning_rate": 1.95215244030744e-05, + "loss": 1.7471, + "step": 23374 + }, + { + "epoch": 7.174647022713321, + "grad_norm": 0.21088628470897675, + "learning_rate": 1.951758422873332e-05, + "loss": 1.7457, + "step": 23375 + }, + { + 
"epoch": 7.1749539594843466, + "grad_norm": 0.20596840977668762, + "learning_rate": 1.951364435563736e-05, + "loss": 1.7098, + "step": 23376 + }, + { + "epoch": 7.175260896255371, + "grad_norm": 0.20098064839839935, + "learning_rate": 1.9509704783825433e-05, + "loss": 1.7225, + "step": 23377 + }, + { + "epoch": 7.175567833026396, + "grad_norm": 0.20860125124454498, + "learning_rate": 1.950576551333647e-05, + "loss": 1.7071, + "step": 23378 + }, + { + "epoch": 7.175874769797422, + "grad_norm": 0.1914912760257721, + "learning_rate": 1.950182654420941e-05, + "loss": 1.7262, + "step": 23379 + }, + { + "epoch": 7.176181706568447, + "grad_norm": 0.21109424531459808, + "learning_rate": 1.9497887876483178e-05, + "loss": 1.6601, + "step": 23380 + }, + { + "epoch": 7.176488643339472, + "grad_norm": 0.20514877140522003, + "learning_rate": 1.949394951019669e-05, + "loss": 1.7612, + "step": 23381 + }, + { + "epoch": 7.176795580110497, + "grad_norm": 0.20280246436595917, + "learning_rate": 1.949001144538888e-05, + "loss": 1.6754, + "step": 23382 + }, + { + "epoch": 7.177102516881522, + "grad_norm": 0.1724841594696045, + "learning_rate": 1.9486073682098654e-05, + "loss": 1.7252, + "step": 23383 + }, + { + "epoch": 7.1774094536525475, + "grad_norm": 0.16961625218391418, + "learning_rate": 1.948213622036493e-05, + "loss": 1.6835, + "step": 23384 + }, + { + "epoch": 7.177716390423573, + "grad_norm": 0.17938925325870514, + "learning_rate": 1.947819906022661e-05, + "loss": 1.6909, + "step": 23385 + }, + { + "epoch": 7.178023327194598, + "grad_norm": 0.19711901247501373, + "learning_rate": 1.9474262201722655e-05, + "loss": 1.7275, + "step": 23386 + }, + { + "epoch": 7.1783302639656235, + "grad_norm": 0.19549165666103363, + "learning_rate": 1.947032564489189e-05, + "loss": 1.7609, + "step": 23387 + }, + { + "epoch": 7.178637200736648, + "grad_norm": 0.20358525216579437, + "learning_rate": 1.9466389389773284e-05, + "loss": 1.7127, + "step": 23388 + }, + { + "epoch": 7.178944137507673, + 
"grad_norm": 0.18345355987548828, + "learning_rate": 1.946245343640571e-05, + "loss": 1.6807, + "step": 23389 + }, + { + "epoch": 7.179251074278699, + "grad_norm": 0.20261847972869873, + "learning_rate": 1.9458517784828074e-05, + "loss": 1.717, + "step": 23390 + }, + { + "epoch": 7.179558011049724, + "grad_norm": 0.18042106926441193, + "learning_rate": 1.9454582435079275e-05, + "loss": 1.7415, + "step": 23391 + }, + { + "epoch": 7.179864947820749, + "grad_norm": 0.1731836199760437, + "learning_rate": 1.945064738719817e-05, + "loss": 1.6661, + "step": 23392 + }, + { + "epoch": 7.180171884591774, + "grad_norm": 0.1971052885055542, + "learning_rate": 1.9446712641223685e-05, + "loss": 1.753, + "step": 23393 + }, + { + "epoch": 7.180478821362799, + "grad_norm": 0.22370313107967377, + "learning_rate": 1.94427781971947e-05, + "loss": 1.7118, + "step": 23394 + }, + { + "epoch": 7.180785758133824, + "grad_norm": 0.23129026591777802, + "learning_rate": 1.9438844055150086e-05, + "loss": 1.8087, + "step": 23395 + }, + { + "epoch": 7.18109269490485, + "grad_norm": 0.26353758573532104, + "learning_rate": 1.9434910215128727e-05, + "loss": 1.7147, + "step": 23396 + }, + { + "epoch": 7.181399631675875, + "grad_norm": 0.22333624958992004, + "learning_rate": 1.9430976677169504e-05, + "loss": 1.7403, + "step": 23397 + }, + { + "epoch": 7.1817065684469, + "grad_norm": 0.22191296517848969, + "learning_rate": 1.9427043441311284e-05, + "loss": 1.7125, + "step": 23398 + }, + { + "epoch": 7.182013505217925, + "grad_norm": 0.19174177944660187, + "learning_rate": 1.942311050759294e-05, + "loss": 1.7026, + "step": 23399 + }, + { + "epoch": 7.18232044198895, + "grad_norm": 0.2175525426864624, + "learning_rate": 1.9419177876053342e-05, + "loss": 1.6947, + "step": 23400 + }, + { + "epoch": 7.1826273787599755, + "grad_norm": 0.19419047236442566, + "learning_rate": 1.9415245546731348e-05, + "loss": 1.7309, + "step": 23401 + }, + { + "epoch": 7.182934315531001, + "grad_norm": 0.22568467259407043, + 
"learning_rate": 1.9411313519665806e-05, + "loss": 1.7177, + "step": 23402 + }, + { + "epoch": 7.183241252302026, + "grad_norm": 0.26983609795570374, + "learning_rate": 1.9407381794895635e-05, + "loss": 1.6779, + "step": 23403 + }, + { + "epoch": 7.183548189073051, + "grad_norm": 0.1651962548494339, + "learning_rate": 1.9403450372459602e-05, + "loss": 1.6718, + "step": 23404 + }, + { + "epoch": 7.183855125844076, + "grad_norm": 0.2337920367717743, + "learning_rate": 1.9399519252396653e-05, + "loss": 1.7271, + "step": 23405 + }, + { + "epoch": 7.184162062615101, + "grad_norm": 0.20093166828155518, + "learning_rate": 1.9395588434745547e-05, + "loss": 1.7274, + "step": 23406 + }, + { + "epoch": 7.184468999386127, + "grad_norm": 0.22497716546058655, + "learning_rate": 1.9391657919545193e-05, + "loss": 1.7419, + "step": 23407 + }, + { + "epoch": 7.184775936157152, + "grad_norm": 0.22474822402000427, + "learning_rate": 1.938772770683443e-05, + "loss": 1.8317, + "step": 23408 + }, + { + "epoch": 7.185082872928176, + "grad_norm": 0.18015392124652863, + "learning_rate": 1.9383797796652052e-05, + "loss": 1.6568, + "step": 23409 + }, + { + "epoch": 7.185389809699202, + "grad_norm": 0.18696026504039764, + "learning_rate": 1.9379868189036947e-05, + "loss": 1.6722, + "step": 23410 + }, + { + "epoch": 7.185696746470227, + "grad_norm": 0.1828698217868805, + "learning_rate": 1.9375938884027934e-05, + "loss": 1.7477, + "step": 23411 + }, + { + "epoch": 7.186003683241252, + "grad_norm": 0.20442047715187073, + "learning_rate": 1.937200988166384e-05, + "loss": 1.7269, + "step": 23412 + }, + { + "epoch": 7.186310620012278, + "grad_norm": 0.17201031744480133, + "learning_rate": 1.9368081181983494e-05, + "loss": 1.6893, + "step": 23413 + }, + { + "epoch": 7.186617556783303, + "grad_norm": 0.21501687169075012, + "learning_rate": 1.9364152785025723e-05, + "loss": 1.771, + "step": 23414 + }, + { + "epoch": 7.1869244935543275, + "grad_norm": 0.18059030175209045, + "learning_rate": 
1.936022469082936e-05, + "loss": 1.7088, + "step": 23415 + }, + { + "epoch": 7.187231430325353, + "grad_norm": 0.18079128861427307, + "learning_rate": 1.9356296899433206e-05, + "loss": 1.764, + "step": 23416 + }, + { + "epoch": 7.187538367096378, + "grad_norm": 0.1960453987121582, + "learning_rate": 1.9352369410876086e-05, + "loss": 1.7302, + "step": 23417 + }, + { + "epoch": 7.1878453038674035, + "grad_norm": 0.19896337389945984, + "learning_rate": 1.9348442225196815e-05, + "loss": 1.7228, + "step": 23418 + }, + { + "epoch": 7.188152240638429, + "grad_norm": 0.19272227585315704, + "learning_rate": 1.9344515342434192e-05, + "loss": 1.7164, + "step": 23419 + }, + { + "epoch": 7.188459177409453, + "grad_norm": 0.16746973991394043, + "learning_rate": 1.9340588762627066e-05, + "loss": 1.696, + "step": 23420 + }, + { + "epoch": 7.188766114180479, + "grad_norm": 0.2421095222234726, + "learning_rate": 1.9336662485814178e-05, + "loss": 1.766, + "step": 23421 + }, + { + "epoch": 7.189073050951504, + "grad_norm": 0.17857256531715393, + "learning_rate": 1.93327365120344e-05, + "loss": 1.7216, + "step": 23422 + }, + { + "epoch": 7.189379987722529, + "grad_norm": 0.19336672127246857, + "learning_rate": 1.932881084132646e-05, + "loss": 1.7124, + "step": 23423 + }, + { + "epoch": 7.189686924493555, + "grad_norm": 0.1555519700050354, + "learning_rate": 1.9324885473729204e-05, + "loss": 1.6491, + "step": 23424 + }, + { + "epoch": 7.189993861264579, + "grad_norm": 0.17879530787467957, + "learning_rate": 1.9320960409281425e-05, + "loss": 1.697, + "step": 23425 + }, + { + "epoch": 7.190300798035604, + "grad_norm": 0.17966939508914948, + "learning_rate": 1.9317035648021862e-05, + "loss": 1.6786, + "step": 23426 + }, + { + "epoch": 7.19060773480663, + "grad_norm": 0.21742603182792664, + "learning_rate": 1.9313111189989375e-05, + "loss": 1.734, + "step": 23427 + }, + { + "epoch": 7.190914671577655, + "grad_norm": 0.22135521471500397, + "learning_rate": 1.9309187035222675e-05, + "loss": 
1.7154, + "step": 23428 + }, + { + "epoch": 7.19122160834868, + "grad_norm": 0.17866137623786926, + "learning_rate": 1.930526318376059e-05, + "loss": 1.6723, + "step": 23429 + }, + { + "epoch": 7.191528545119706, + "grad_norm": 0.26034823060035706, + "learning_rate": 1.9301339635641887e-05, + "loss": 1.6975, + "step": 23430 + }, + { + "epoch": 7.19183548189073, + "grad_norm": 0.21550825238227844, + "learning_rate": 1.929741639090534e-05, + "loss": 1.7401, + "step": 23431 + }, + { + "epoch": 7.1921424186617555, + "grad_norm": 0.19205132126808167, + "learning_rate": 1.9293493449589718e-05, + "loss": 1.6543, + "step": 23432 + }, + { + "epoch": 7.192449355432781, + "grad_norm": 0.18724635243415833, + "learning_rate": 1.928957081173379e-05, + "loss": 1.7752, + "step": 23433 + }, + { + "epoch": 7.192756292203806, + "grad_norm": 0.2392650544643402, + "learning_rate": 1.928564847737633e-05, + "loss": 1.7008, + "step": 23434 + }, + { + "epoch": 7.1930632289748315, + "grad_norm": 0.18950903415679932, + "learning_rate": 1.9281726446556088e-05, + "loss": 1.7193, + "step": 23435 + }, + { + "epoch": 7.193370165745856, + "grad_norm": 0.2542276978492737, + "learning_rate": 1.9277804719311808e-05, + "loss": 1.7192, + "step": 23436 + }, + { + "epoch": 7.193677102516881, + "grad_norm": 0.1987142711877823, + "learning_rate": 1.927388329568231e-05, + "loss": 1.6943, + "step": 23437 + }, + { + "epoch": 7.193984039287907, + "grad_norm": 0.18837273120880127, + "learning_rate": 1.9269962175706275e-05, + "loss": 1.7443, + "step": 23438 + }, + { + "epoch": 7.194290976058932, + "grad_norm": 0.20432044565677643, + "learning_rate": 1.9266041359422514e-05, + "loss": 1.741, + "step": 23439 + }, + { + "epoch": 7.194597912829957, + "grad_norm": 0.17763052880764008, + "learning_rate": 1.9262120846869715e-05, + "loss": 1.6696, + "step": 23440 + }, + { + "epoch": 7.194904849600983, + "grad_norm": 0.1747766137123108, + "learning_rate": 1.9258200638086665e-05, + "loss": 1.6727, + "step": 23441 + }, + { 
+ "epoch": 7.195211786372007, + "grad_norm": 0.22058527171611786, + "learning_rate": 1.9254280733112117e-05, + "loss": 1.7387, + "step": 23442 + }, + { + "epoch": 7.195518723143032, + "grad_norm": 0.2247757911682129, + "learning_rate": 1.925036113198475e-05, + "loss": 1.7828, + "step": 23443 + }, + { + "epoch": 7.195825659914058, + "grad_norm": 0.16923101246356964, + "learning_rate": 1.924644183474337e-05, + "loss": 1.6655, + "step": 23444 + }, + { + "epoch": 7.196132596685083, + "grad_norm": 0.1599757820367813, + "learning_rate": 1.924252284142665e-05, + "loss": 1.7002, + "step": 23445 + }, + { + "epoch": 7.196439533456108, + "grad_norm": 0.1916438341140747, + "learning_rate": 1.9238604152073358e-05, + "loss": 1.71, + "step": 23446 + }, + { + "epoch": 7.196746470227133, + "grad_norm": 0.18037991225719452, + "learning_rate": 1.9234685766722216e-05, + "loss": 1.6786, + "step": 23447 + }, + { + "epoch": 7.197053406998158, + "grad_norm": 0.20671263337135315, + "learning_rate": 1.9230767685411938e-05, + "loss": 1.7228, + "step": 23448 + }, + { + "epoch": 7.1973603437691835, + "grad_norm": 0.18949514627456665, + "learning_rate": 1.9226849908181243e-05, + "loss": 1.7794, + "step": 23449 + }, + { + "epoch": 7.197667280540209, + "grad_norm": 0.19457660615444183, + "learning_rate": 1.9222932435068857e-05, + "loss": 1.7153, + "step": 23450 + }, + { + "epoch": 7.197974217311234, + "grad_norm": 0.16834792494773865, + "learning_rate": 1.9219015266113494e-05, + "loss": 1.646, + "step": 23451 + }, + { + "epoch": 7.198281154082259, + "grad_norm": 0.21668508648872375, + "learning_rate": 1.9215098401353866e-05, + "loss": 1.7232, + "step": 23452 + }, + { + "epoch": 7.198588090853284, + "grad_norm": 0.1675579994916916, + "learning_rate": 1.9211181840828656e-05, + "loss": 1.6963, + "step": 23453 + }, + { + "epoch": 7.198895027624309, + "grad_norm": 0.19915352761745453, + "learning_rate": 1.9207265584576627e-05, + "loss": 1.7043, + "step": 23454 + }, + { + "epoch": 7.199201964395335, + 
"grad_norm": 0.23872216045856476, + "learning_rate": 1.920334963263642e-05, + "loss": 1.7784, + "step": 23455 + }, + { + "epoch": 7.19950890116636, + "grad_norm": 0.261321485042572, + "learning_rate": 1.919943398504679e-05, + "loss": 1.8024, + "step": 23456 + }, + { + "epoch": 7.199815837937384, + "grad_norm": 0.17026741802692413, + "learning_rate": 1.9195518641846377e-05, + "loss": 1.7451, + "step": 23457 + }, + { + "epoch": 7.20012277470841, + "grad_norm": 0.20935678482055664, + "learning_rate": 1.9191603603073915e-05, + "loss": 1.752, + "step": 23458 + }, + { + "epoch": 7.200429711479435, + "grad_norm": 0.1756788194179535, + "learning_rate": 1.9187688868768107e-05, + "loss": 1.7008, + "step": 23459 + }, + { + "epoch": 7.2007366482504604, + "grad_norm": 0.23286345601081848, + "learning_rate": 1.9183774438967577e-05, + "loss": 1.7603, + "step": 23460 + }, + { + "epoch": 7.201043585021486, + "grad_norm": 0.17519986629486084, + "learning_rate": 1.917986031371109e-05, + "loss": 1.7127, + "step": 23461 + }, + { + "epoch": 7.201350521792511, + "grad_norm": 0.2603212893009186, + "learning_rate": 1.917594649303725e-05, + "loss": 1.7169, + "step": 23462 + }, + { + "epoch": 7.201657458563536, + "grad_norm": 0.2664981484413147, + "learning_rate": 1.9172032976984792e-05, + "loss": 1.7349, + "step": 23463 + }, + { + "epoch": 7.201964395334561, + "grad_norm": 0.15484265983104706, + "learning_rate": 1.9168119765592375e-05, + "loss": 1.6753, + "step": 23464 + }, + { + "epoch": 7.202271332105586, + "grad_norm": 0.22310250997543335, + "learning_rate": 1.9164206858898664e-05, + "loss": 1.6994, + "step": 23465 + }, + { + "epoch": 7.202578268876612, + "grad_norm": 0.1998710036277771, + "learning_rate": 1.9160294256942336e-05, + "loss": 1.7556, + "step": 23466 + }, + { + "epoch": 7.202885205647637, + "grad_norm": 0.2092670500278473, + "learning_rate": 1.9156381959762058e-05, + "loss": 1.6883, + "step": 23467 + }, + { + "epoch": 7.203192142418661, + "grad_norm": 0.20657336711883545, + 
"learning_rate": 1.915246996739649e-05, + "loss": 1.8035, + "step": 23468 + }, + { + "epoch": 7.203499079189687, + "grad_norm": 0.2175077497959137, + "learning_rate": 1.9148558279884294e-05, + "loss": 1.7173, + "step": 23469 + }, + { + "epoch": 7.203806015960712, + "grad_norm": 0.16851630806922913, + "learning_rate": 1.9144646897264114e-05, + "loss": 1.6874, + "step": 23470 + }, + { + "epoch": 7.204112952731737, + "grad_norm": 0.23194117844104767, + "learning_rate": 1.9140735819574647e-05, + "loss": 1.7156, + "step": 23471 + }, + { + "epoch": 7.204419889502763, + "grad_norm": 0.17139053344726562, + "learning_rate": 1.9136825046854483e-05, + "loss": 1.6997, + "step": 23472 + }, + { + "epoch": 7.204726826273788, + "grad_norm": 0.18561725318431854, + "learning_rate": 1.913291457914234e-05, + "loss": 1.6575, + "step": 23473 + }, + { + "epoch": 7.2050337630448125, + "grad_norm": 0.2333156019449234, + "learning_rate": 1.9129004416476793e-05, + "loss": 1.7453, + "step": 23474 + }, + { + "epoch": 7.205340699815838, + "grad_norm": 0.2594338655471802, + "learning_rate": 1.9125094558896534e-05, + "loss": 1.7087, + "step": 23475 + }, + { + "epoch": 7.205647636586863, + "grad_norm": 0.16303664445877075, + "learning_rate": 1.91211850064402e-05, + "loss": 1.6985, + "step": 23476 + }, + { + "epoch": 7.2059545733578885, + "grad_norm": 0.2592144012451172, + "learning_rate": 1.9117275759146387e-05, + "loss": 1.7196, + "step": 23477 + }, + { + "epoch": 7.206261510128914, + "grad_norm": 0.1643611341714859, + "learning_rate": 1.9113366817053784e-05, + "loss": 1.686, + "step": 23478 + }, + { + "epoch": 7.206568446899938, + "grad_norm": 0.19730710983276367, + "learning_rate": 1.9109458180200966e-05, + "loss": 1.6883, + "step": 23479 + }, + { + "epoch": 7.206875383670964, + "grad_norm": 0.16942749917507172, + "learning_rate": 1.9105549848626602e-05, + "loss": 1.7272, + "step": 23480 + }, + { + "epoch": 7.207182320441989, + "grad_norm": 0.21967467665672302, + "learning_rate": 
1.91016418223693e-05, + "loss": 1.7501, + "step": 23481 + }, + { + "epoch": 7.207489257213014, + "grad_norm": 0.17037035524845123, + "learning_rate": 1.9097734101467684e-05, + "loss": 1.72, + "step": 23482 + }, + { + "epoch": 7.20779619398404, + "grad_norm": 0.21497979760169983, + "learning_rate": 1.9093826685960374e-05, + "loss": 1.6993, + "step": 23483 + }, + { + "epoch": 7.208103130755064, + "grad_norm": 0.1462371051311493, + "learning_rate": 1.9089919575885985e-05, + "loss": 1.6249, + "step": 23484 + }, + { + "epoch": 7.208410067526089, + "grad_norm": 0.1863165646791458, + "learning_rate": 1.9086012771283122e-05, + "loss": 1.6343, + "step": 23485 + }, + { + "epoch": 7.208717004297115, + "grad_norm": 0.1705196648836136, + "learning_rate": 1.9082106272190403e-05, + "loss": 1.7115, + "step": 23486 + }, + { + "epoch": 7.20902394106814, + "grad_norm": 0.20928895473480225, + "learning_rate": 1.9078200078646413e-05, + "loss": 1.6953, + "step": 23487 + }, + { + "epoch": 7.209330877839165, + "grad_norm": 0.2172931581735611, + "learning_rate": 1.9074294190689812e-05, + "loss": 1.7436, + "step": 23488 + }, + { + "epoch": 7.209637814610191, + "grad_norm": 0.1760822981595993, + "learning_rate": 1.9070388608359124e-05, + "loss": 1.6898, + "step": 23489 + }, + { + "epoch": 7.209944751381215, + "grad_norm": 0.28154727816581726, + "learning_rate": 1.9066483331693018e-05, + "loss": 1.7583, + "step": 23490 + }, + { + "epoch": 7.2102516881522405, + "grad_norm": 0.28375890851020813, + "learning_rate": 1.9062578360730027e-05, + "loss": 1.7428, + "step": 23491 + }, + { + "epoch": 7.210558624923266, + "grad_norm": 0.2173614352941513, + "learning_rate": 1.905867369550878e-05, + "loss": 1.6902, + "step": 23492 + }, + { + "epoch": 7.210865561694291, + "grad_norm": 0.2525392174720764, + "learning_rate": 1.9054769336067875e-05, + "loss": 1.7205, + "step": 23493 + }, + { + "epoch": 7.2111724984653165, + "grad_norm": 0.22913219034671783, + "learning_rate": 1.905086528244584e-05, + "loss": 
1.7269, + "step": 23494 + }, + { + "epoch": 7.211479435236341, + "grad_norm": 0.2174263298511505, + "learning_rate": 1.9046961534681327e-05, + "loss": 1.7058, + "step": 23495 + }, + { + "epoch": 7.211786372007366, + "grad_norm": 0.2277042120695114, + "learning_rate": 1.9043058092812848e-05, + "loss": 1.7048, + "step": 23496 + }, + { + "epoch": 7.212093308778392, + "grad_norm": 0.17835062742233276, + "learning_rate": 1.9039154956879036e-05, + "loss": 1.7258, + "step": 23497 + }, + { + "epoch": 7.212400245549417, + "grad_norm": 0.22751156985759735, + "learning_rate": 1.903525212691844e-05, + "loss": 1.708, + "step": 23498 + }, + { + "epoch": 7.212707182320442, + "grad_norm": 0.21247950196266174, + "learning_rate": 1.903134960296963e-05, + "loss": 1.7142, + "step": 23499 + }, + { + "epoch": 7.213014119091467, + "grad_norm": 0.2256091684103012, + "learning_rate": 1.9027447385071175e-05, + "loss": 1.6826, + "step": 23500 + }, + { + "epoch": 7.213321055862492, + "grad_norm": 0.16704921424388885, + "learning_rate": 1.902354547326164e-05, + "loss": 1.6639, + "step": 23501 + }, + { + "epoch": 7.213627992633517, + "grad_norm": 0.20211774110794067, + "learning_rate": 1.901964386757958e-05, + "loss": 1.7448, + "step": 23502 + }, + { + "epoch": 7.213934929404543, + "grad_norm": 0.2090187519788742, + "learning_rate": 1.901574256806356e-05, + "loss": 1.7425, + "step": 23503 + }, + { + "epoch": 7.214241866175568, + "grad_norm": 0.1942494809627533, + "learning_rate": 1.9011841574752114e-05, + "loss": 1.721, + "step": 23504 + }, + { + "epoch": 7.214548802946593, + "grad_norm": 0.1842714548110962, + "learning_rate": 1.900794088768385e-05, + "loss": 1.7092, + "step": 23505 + }, + { + "epoch": 7.214855739717618, + "grad_norm": 0.16807401180267334, + "learning_rate": 1.900404050689724e-05, + "loss": 1.6788, + "step": 23506 + }, + { + "epoch": 7.215162676488643, + "grad_norm": 0.16467349231243134, + "learning_rate": 1.9000140432430907e-05, + "loss": 1.6544, + "step": 23507 + }, + { + 
"epoch": 7.2154696132596685, + "grad_norm": 0.1806645542383194, + "learning_rate": 1.899624066432332e-05, + "loss": 1.6871, + "step": 23508 + }, + { + "epoch": 7.215776550030694, + "grad_norm": 0.16891708970069885, + "learning_rate": 1.8992341202613073e-05, + "loss": 1.6912, + "step": 23509 + }, + { + "epoch": 7.216083486801719, + "grad_norm": 0.21191391348838806, + "learning_rate": 1.89884420473387e-05, + "loss": 1.7843, + "step": 23510 + }, + { + "epoch": 7.216390423572744, + "grad_norm": 0.18484020233154297, + "learning_rate": 1.8984543198538684e-05, + "loss": 1.699, + "step": 23511 + }, + { + "epoch": 7.216697360343769, + "grad_norm": 0.2106105536222458, + "learning_rate": 1.8980644656251627e-05, + "loss": 1.7239, + "step": 23512 + }, + { + "epoch": 7.217004297114794, + "grad_norm": 0.19923320412635803, + "learning_rate": 1.8976746420515988e-05, + "loss": 1.7989, + "step": 23513 + }, + { + "epoch": 7.21731123388582, + "grad_norm": 0.21371988952159882, + "learning_rate": 1.897284849137034e-05, + "loss": 1.7071, + "step": 23514 + }, + { + "epoch": 7.217618170656845, + "grad_norm": 0.20450851321220398, + "learning_rate": 1.8968950868853184e-05, + "loss": 1.7051, + "step": 23515 + }, + { + "epoch": 7.21792510742787, + "grad_norm": 0.22700995206832886, + "learning_rate": 1.8965053553003055e-05, + "loss": 1.7556, + "step": 23516 + }, + { + "epoch": 7.218232044198895, + "grad_norm": 0.26295945048332214, + "learning_rate": 1.896115654385845e-05, + "loss": 1.7893, + "step": 23517 + }, + { + "epoch": 7.21853898096992, + "grad_norm": 0.17091867327690125, + "learning_rate": 1.8957259841457885e-05, + "loss": 1.7289, + "step": 23518 + }, + { + "epoch": 7.218845917740945, + "grad_norm": 0.24840304255485535, + "learning_rate": 1.8953363445839877e-05, + "loss": 1.6958, + "step": 23519 + }, + { + "epoch": 7.219152854511971, + "grad_norm": 0.20042046904563904, + "learning_rate": 1.8949467357042926e-05, + "loss": 1.743, + "step": 23520 + }, + { + "epoch": 7.219459791282996, + 
"grad_norm": 0.18286047875881195, + "learning_rate": 1.894557157510552e-05, + "loss": 1.7065, + "step": 23521 + }, + { + "epoch": 7.2197667280540205, + "grad_norm": 0.18324656784534454, + "learning_rate": 1.894167610006622e-05, + "loss": 1.7083, + "step": 23522 + }, + { + "epoch": 7.220073664825046, + "grad_norm": 0.17110426723957062, + "learning_rate": 1.8937780931963432e-05, + "loss": 1.7016, + "step": 23523 + }, + { + "epoch": 7.220380601596071, + "grad_norm": 0.19164881110191345, + "learning_rate": 1.8933886070835743e-05, + "loss": 1.7011, + "step": 23524 + }, + { + "epoch": 7.2206875383670965, + "grad_norm": 0.16899923980236053, + "learning_rate": 1.892999151672157e-05, + "loss": 1.7227, + "step": 23525 + }, + { + "epoch": 7.220994475138122, + "grad_norm": 0.18763495981693268, + "learning_rate": 1.8926097269659437e-05, + "loss": 1.6956, + "step": 23526 + }, + { + "epoch": 7.221301411909146, + "grad_norm": 0.1665162295103073, + "learning_rate": 1.8922203329687847e-05, + "loss": 1.7039, + "step": 23527 + }, + { + "epoch": 7.221608348680172, + "grad_norm": 0.20766250789165497, + "learning_rate": 1.8918309696845226e-05, + "loss": 1.7703, + "step": 23528 + }, + { + "epoch": 7.221915285451197, + "grad_norm": 0.1813010275363922, + "learning_rate": 1.891441637117012e-05, + "loss": 1.6709, + "step": 23529 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 0.15327073633670807, + "learning_rate": 1.891052335270094e-05, + "loss": 1.6518, + "step": 23530 + }, + { + "epoch": 7.222529158993248, + "grad_norm": 0.17191094160079956, + "learning_rate": 1.8906630641476203e-05, + "loss": 1.7193, + "step": 23531 + }, + { + "epoch": 7.222836095764273, + "grad_norm": 0.17976176738739014, + "learning_rate": 1.8902738237534363e-05, + "loss": 1.7162, + "step": 23532 + }, + { + "epoch": 7.223143032535297, + "grad_norm": 0.1828993558883667, + "learning_rate": 1.8898846140913894e-05, + "loss": 1.7163, + "step": 23533 + }, + { + "epoch": 7.223449969306323, + "grad_norm": 
0.15828034281730652, + "learning_rate": 1.889495435165326e-05, + "loss": 1.6734, + "step": 23534 + }, + { + "epoch": 7.223756906077348, + "grad_norm": 0.2171369194984436, + "learning_rate": 1.8891062869790915e-05, + "loss": 1.7508, + "step": 23535 + }, + { + "epoch": 7.224063842848373, + "grad_norm": 0.18747110664844513, + "learning_rate": 1.888717169536532e-05, + "loss": 1.7162, + "step": 23536 + }, + { + "epoch": 7.224370779619399, + "grad_norm": 0.19177328050136566, + "learning_rate": 1.8883280828414927e-05, + "loss": 1.7044, + "step": 23537 + }, + { + "epoch": 7.224677716390423, + "grad_norm": 0.175906702876091, + "learning_rate": 1.88793902689782e-05, + "loss": 1.7126, + "step": 23538 + }, + { + "epoch": 7.2249846531614486, + "grad_norm": 0.17842896282672882, + "learning_rate": 1.887550001709357e-05, + "loss": 1.7469, + "step": 23539 + }, + { + "epoch": 7.225291589932474, + "grad_norm": 0.23797607421875, + "learning_rate": 1.8871610072799478e-05, + "loss": 1.7343, + "step": 23540 + }, + { + "epoch": 7.225598526703499, + "grad_norm": 0.2297922819852829, + "learning_rate": 1.8867720436134412e-05, + "loss": 1.7453, + "step": 23541 + }, + { + "epoch": 7.225905463474525, + "grad_norm": 0.19950568675994873, + "learning_rate": 1.8863831107136748e-05, + "loss": 1.6984, + "step": 23542 + }, + { + "epoch": 7.226212400245549, + "grad_norm": 0.2809087038040161, + "learning_rate": 1.8859942085844974e-05, + "loss": 1.7815, + "step": 23543 + }, + { + "epoch": 7.226519337016574, + "grad_norm": 0.20534642040729523, + "learning_rate": 1.8856053372297515e-05, + "loss": 1.7455, + "step": 23544 + }, + { + "epoch": 7.2268262737876, + "grad_norm": 0.20052307844161987, + "learning_rate": 1.885216496653276e-05, + "loss": 1.6655, + "step": 23545 + }, + { + "epoch": 7.227133210558625, + "grad_norm": 0.1948573738336563, + "learning_rate": 1.8848276868589205e-05, + "loss": 1.7036, + "step": 23546 + }, + { + "epoch": 7.22744014732965, + "grad_norm": 0.16764269769191742, + "learning_rate": 
1.8844389078505197e-05, + "loss": 1.6605, + "step": 23547 + }, + { + "epoch": 7.227747084100676, + "grad_norm": 0.17951633036136627, + "learning_rate": 1.8840501596319214e-05, + "loss": 1.6948, + "step": 23548 + }, + { + "epoch": 7.2280540208717, + "grad_norm": 0.1906418353319168, + "learning_rate": 1.883661442206966e-05, + "loss": 1.7122, + "step": 23549 + }, + { + "epoch": 7.2283609576427255, + "grad_norm": 0.19535204768180847, + "learning_rate": 1.8832727555794943e-05, + "loss": 1.7089, + "step": 23550 + }, + { + "epoch": 7.228667894413751, + "grad_norm": 0.20654071867465973, + "learning_rate": 1.8828840997533488e-05, + "loss": 1.7113, + "step": 23551 + }, + { + "epoch": 7.228974831184776, + "grad_norm": 0.18860456347465515, + "learning_rate": 1.8824954747323692e-05, + "loss": 1.7475, + "step": 23552 + }, + { + "epoch": 7.2292817679558015, + "grad_norm": 0.21949729323387146, + "learning_rate": 1.882106880520396e-05, + "loss": 1.7819, + "step": 23553 + }, + { + "epoch": 7.229588704726826, + "grad_norm": 0.2177286595106125, + "learning_rate": 1.881718317121271e-05, + "loss": 1.7554, + "step": 23554 + }, + { + "epoch": 7.229895641497851, + "grad_norm": 0.21143296360969543, + "learning_rate": 1.8813297845388328e-05, + "loss": 1.7811, + "step": 23555 + }, + { + "epoch": 7.230202578268877, + "grad_norm": 0.24787208437919617, + "learning_rate": 1.880941282776922e-05, + "loss": 1.707, + "step": 23556 + }, + { + "epoch": 7.230509515039902, + "grad_norm": 0.18048164248466492, + "learning_rate": 1.880552811839375e-05, + "loss": 1.6841, + "step": 23557 + }, + { + "epoch": 7.230816451810927, + "grad_norm": 0.24056772887706757, + "learning_rate": 1.8801643717300375e-05, + "loss": 1.7868, + "step": 23558 + }, + { + "epoch": 7.231123388581952, + "grad_norm": 0.18564146757125854, + "learning_rate": 1.879775962452741e-05, + "loss": 1.7506, + "step": 23559 + }, + { + "epoch": 7.231430325352977, + "grad_norm": 0.25965458154678345, + "learning_rate": 1.87938758401133e-05, + "loss": 
1.7307, + "step": 23560 + }, + { + "epoch": 7.231737262124002, + "grad_norm": 0.17774315178394318, + "learning_rate": 1.8789992364096394e-05, + "loss": 1.7089, + "step": 23561 + }, + { + "epoch": 7.232044198895028, + "grad_norm": 0.2488560527563095, + "learning_rate": 1.878610919651505e-05, + "loss": 1.6811, + "step": 23562 + }, + { + "epoch": 7.232351135666053, + "grad_norm": 0.1963108628988266, + "learning_rate": 1.8782226337407703e-05, + "loss": 1.6512, + "step": 23563 + }, + { + "epoch": 7.232658072437078, + "grad_norm": 0.25702449679374695, + "learning_rate": 1.8778343786812663e-05, + "loss": 1.7697, + "step": 23564 + }, + { + "epoch": 7.232965009208103, + "grad_norm": 0.18145591020584106, + "learning_rate": 1.8774461544768347e-05, + "loss": 1.6842, + "step": 23565 + }, + { + "epoch": 7.233271945979128, + "grad_norm": 0.2482728213071823, + "learning_rate": 1.87705796113131e-05, + "loss": 1.7028, + "step": 23566 + }, + { + "epoch": 7.2335788827501535, + "grad_norm": 0.16365976631641388, + "learning_rate": 1.8766697986485293e-05, + "loss": 1.7266, + "step": 23567 + }, + { + "epoch": 7.233885819521179, + "grad_norm": 0.1877463459968567, + "learning_rate": 1.876281667032328e-05, + "loss": 1.6909, + "step": 23568 + }, + { + "epoch": 7.234192756292204, + "grad_norm": 0.19121702015399933, + "learning_rate": 1.8758935662865423e-05, + "loss": 1.7303, + "step": 23569 + }, + { + "epoch": 7.234499693063229, + "grad_norm": 0.1783505082130432, + "learning_rate": 1.8755054964150072e-05, + "loss": 1.7209, + "step": 23570 + }, + { + "epoch": 7.234806629834254, + "grad_norm": 0.172771617770195, + "learning_rate": 1.8751174574215585e-05, + "loss": 1.6824, + "step": 23571 + }, + { + "epoch": 7.235113566605279, + "grad_norm": 0.1675102859735489, + "learning_rate": 1.8747294493100304e-05, + "loss": 1.6664, + "step": 23572 + }, + { + "epoch": 7.235420503376305, + "grad_norm": 0.18213391304016113, + "learning_rate": 1.8743414720842578e-05, + "loss": 1.6725, + "step": 23573 + }, + { + 
"epoch": 7.23572744014733, + "grad_norm": 0.2204304337501526, + "learning_rate": 1.8739535257480728e-05, + "loss": 1.7662, + "step": 23574 + }, + { + "epoch": 7.236034376918354, + "grad_norm": 0.22732098400592804, + "learning_rate": 1.873565610305315e-05, + "loss": 1.7808, + "step": 23575 + }, + { + "epoch": 7.23634131368938, + "grad_norm": 0.17859263718128204, + "learning_rate": 1.8731777257598128e-05, + "loss": 1.6767, + "step": 23576 + }, + { + "epoch": 7.236648250460405, + "grad_norm": 0.16690675914287567, + "learning_rate": 1.8727898721154007e-05, + "loss": 1.6523, + "step": 23577 + }, + { + "epoch": 7.23695518723143, + "grad_norm": 0.17576774954795837, + "learning_rate": 1.872402049375912e-05, + "loss": 1.6951, + "step": 23578 + }, + { + "epoch": 7.237262124002456, + "grad_norm": 0.20455172657966614, + "learning_rate": 1.8720142575451777e-05, + "loss": 1.7402, + "step": 23579 + }, + { + "epoch": 7.237569060773481, + "grad_norm": 0.2122879922389984, + "learning_rate": 1.8716264966270352e-05, + "loss": 1.7571, + "step": 23580 + }, + { + "epoch": 7.2378759975445055, + "grad_norm": 0.17752611637115479, + "learning_rate": 1.87123876662531e-05, + "loss": 1.7185, + "step": 23581 + }, + { + "epoch": 7.238182934315531, + "grad_norm": 0.21253602206707, + "learning_rate": 1.87085106754384e-05, + "loss": 1.7281, + "step": 23582 + }, + { + "epoch": 7.238489871086556, + "grad_norm": 0.19470329582691193, + "learning_rate": 1.8704633993864514e-05, + "loss": 1.6772, + "step": 23583 + }, + { + "epoch": 7.2387968078575815, + "grad_norm": 0.19556869566440582, + "learning_rate": 1.8700757621569786e-05, + "loss": 1.6888, + "step": 23584 + }, + { + "epoch": 7.239103744628607, + "grad_norm": 0.20525780320167542, + "learning_rate": 1.869688155859252e-05, + "loss": 1.7517, + "step": 23585 + }, + { + "epoch": 7.239410681399631, + "grad_norm": 0.23367032408714294, + "learning_rate": 1.869300580497102e-05, + "loss": 1.781, + "step": 23586 + }, + { + "epoch": 7.239717618170657, + 
"grad_norm": 0.1893240362405777, + "learning_rate": 1.8689130360743583e-05, + "loss": 1.7265, + "step": 23587 + }, + { + "epoch": 7.240024554941682, + "grad_norm": 0.17136700451374054, + "learning_rate": 1.868525522594851e-05, + "loss": 1.6631, + "step": 23588 + }, + { + "epoch": 7.240331491712707, + "grad_norm": 0.1984632909297943, + "learning_rate": 1.8681380400624103e-05, + "loss": 1.7337, + "step": 23589 + }, + { + "epoch": 7.240638428483733, + "grad_norm": 0.19046886265277863, + "learning_rate": 1.867750588480865e-05, + "loss": 1.7094, + "step": 23590 + }, + { + "epoch": 7.240945365254758, + "grad_norm": 0.18242189288139343, + "learning_rate": 1.8673631678540427e-05, + "loss": 1.692, + "step": 23591 + }, + { + "epoch": 7.241252302025782, + "grad_norm": 0.1741522252559662, + "learning_rate": 1.8669757781857768e-05, + "loss": 1.6975, + "step": 23592 + }, + { + "epoch": 7.241559238796808, + "grad_norm": 0.1778191328048706, + "learning_rate": 1.866588419479891e-05, + "loss": 1.7092, + "step": 23593 + }, + { + "epoch": 7.241866175567833, + "grad_norm": 0.17402158677577972, + "learning_rate": 1.866201091740215e-05, + "loss": 1.7072, + "step": 23594 + }, + { + "epoch": 7.242173112338858, + "grad_norm": 0.22215119004249573, + "learning_rate": 1.8658137949705763e-05, + "loss": 1.7205, + "step": 23595 + }, + { + "epoch": 7.242480049109884, + "grad_norm": 0.15291182696819305, + "learning_rate": 1.8654265291748013e-05, + "loss": 1.7341, + "step": 23596 + }, + { + "epoch": 7.242786985880908, + "grad_norm": 0.18226875364780426, + "learning_rate": 1.8650392943567217e-05, + "loss": 1.6731, + "step": 23597 + }, + { + "epoch": 7.2430939226519335, + "grad_norm": 0.19169047474861145, + "learning_rate": 1.864652090520158e-05, + "loss": 1.777, + "step": 23598 + }, + { + "epoch": 7.243400859422959, + "grad_norm": 0.2063349187374115, + "learning_rate": 1.8642649176689437e-05, + "loss": 1.7258, + "step": 23599 + }, + { + "epoch": 7.243707796193984, + "grad_norm": 0.18550212681293488, 
+ "learning_rate": 1.863877775806898e-05, + "loss": 1.7041, + "step": 23600 + }, + { + "epoch": 7.2440147329650095, + "grad_norm": 0.21196649968624115, + "learning_rate": 1.8634906649378514e-05, + "loss": 1.6672, + "step": 23601 + }, + { + "epoch": 7.244321669736034, + "grad_norm": 0.26801541447639465, + "learning_rate": 1.863103585065629e-05, + "loss": 1.6981, + "step": 23602 + }, + { + "epoch": 7.244628606507059, + "grad_norm": 0.1854090690612793, + "learning_rate": 1.862716536194055e-05, + "loss": 1.7406, + "step": 23603 + }, + { + "epoch": 7.244935543278085, + "grad_norm": 0.15906888246536255, + "learning_rate": 1.8623295183269556e-05, + "loss": 1.6721, + "step": 23604 + }, + { + "epoch": 7.24524248004911, + "grad_norm": 0.2210245132446289, + "learning_rate": 1.8619425314681547e-05, + "loss": 1.7717, + "step": 23605 + }, + { + "epoch": 7.245549416820135, + "grad_norm": 0.17654140293598175, + "learning_rate": 1.861555575621477e-05, + "loss": 1.7428, + "step": 23606 + }, + { + "epoch": 7.245856353591161, + "grad_norm": 0.1582319736480713, + "learning_rate": 1.8611686507907466e-05, + "loss": 1.6814, + "step": 23607 + }, + { + "epoch": 7.246163290362185, + "grad_norm": 0.18817248940467834, + "learning_rate": 1.8607817569797852e-05, + "loss": 1.74, + "step": 23608 + }, + { + "epoch": 7.24647022713321, + "grad_norm": 0.26141074299812317, + "learning_rate": 1.8603948941924227e-05, + "loss": 1.6966, + "step": 23609 + }, + { + "epoch": 7.246777163904236, + "grad_norm": 0.16877111792564392, + "learning_rate": 1.8600080624324757e-05, + "loss": 1.6849, + "step": 23610 + }, + { + "epoch": 7.247084100675261, + "grad_norm": 0.16188141703605652, + "learning_rate": 1.8596212617037694e-05, + "loss": 1.6342, + "step": 23611 + }, + { + "epoch": 7.247391037446286, + "grad_norm": 0.19506491720676422, + "learning_rate": 1.8592344920101267e-05, + "loss": 1.6874, + "step": 23612 + }, + { + "epoch": 7.247697974217311, + "grad_norm": 0.1865006536245346, + "learning_rate": 
1.8588477533553677e-05, + "loss": 1.7365, + "step": 23613 + }, + { + "epoch": 7.248004910988336, + "grad_norm": 0.16737428307533264, + "learning_rate": 1.85846104574332e-05, + "loss": 1.6971, + "step": 23614 + }, + { + "epoch": 7.2483118477593615, + "grad_norm": 0.1754695028066635, + "learning_rate": 1.858074369177798e-05, + "loss": 1.7133, + "step": 23615 + }, + { + "epoch": 7.248618784530387, + "grad_norm": 0.21066173911094666, + "learning_rate": 1.85768772366263e-05, + "loss": 1.7737, + "step": 23616 + }, + { + "epoch": 7.248925721301412, + "grad_norm": 0.2530418932437897, + "learning_rate": 1.8573011092016303e-05, + "loss": 1.7962, + "step": 23617 + }, + { + "epoch": 7.249232658072437, + "grad_norm": 0.17780029773712158, + "learning_rate": 1.8569145257986247e-05, + "loss": 1.6691, + "step": 23618 + }, + { + "epoch": 7.249539594843462, + "grad_norm": 0.2105826437473297, + "learning_rate": 1.856527973457432e-05, + "loss": 1.6943, + "step": 23619 + }, + { + "epoch": 7.249846531614487, + "grad_norm": 0.20929837226867676, + "learning_rate": 1.856141452181872e-05, + "loss": 1.7223, + "step": 23620 + }, + { + "epoch": 7.250153468385513, + "grad_norm": 0.17105531692504883, + "learning_rate": 1.8557549619757653e-05, + "loss": 1.6956, + "step": 23621 + }, + { + "epoch": 7.250460405156538, + "grad_norm": 0.21282736957073212, + "learning_rate": 1.8553685028429306e-05, + "loss": 1.7299, + "step": 23622 + }, + { + "epoch": 7.250767341927563, + "grad_norm": 0.1673511266708374, + "learning_rate": 1.8549820747871882e-05, + "loss": 1.7184, + "step": 23623 + }, + { + "epoch": 7.251074278698588, + "grad_norm": 0.1877487152814865, + "learning_rate": 1.854595677812356e-05, + "loss": 1.6989, + "step": 23624 + }, + { + "epoch": 7.251381215469613, + "grad_norm": 0.1709173619747162, + "learning_rate": 1.8542093119222504e-05, + "loss": 1.6994, + "step": 23625 + }, + { + "epoch": 7.2516881522406385, + "grad_norm": 0.18894633650779724, + "learning_rate": 1.8538229771206962e-05, + "loss": 
1.665, + "step": 23626 + }, + { + "epoch": 7.251995089011664, + "grad_norm": 0.17623448371887207, + "learning_rate": 1.8534366734115056e-05, + "loss": 1.6999, + "step": 23627 + }, + { + "epoch": 7.252302025782689, + "grad_norm": 0.20008981227874756, + "learning_rate": 1.8530504007984982e-05, + "loss": 1.7147, + "step": 23628 + }, + { + "epoch": 7.252608962553714, + "grad_norm": 0.2506260573863983, + "learning_rate": 1.852664159285491e-05, + "loss": 1.7485, + "step": 23629 + }, + { + "epoch": 7.252915899324739, + "grad_norm": 0.17746438086032867, + "learning_rate": 1.8522779488763e-05, + "loss": 1.7534, + "step": 23630 + }, + { + "epoch": 7.253222836095764, + "grad_norm": 0.1910836547613144, + "learning_rate": 1.8518917695747462e-05, + "loss": 1.7167, + "step": 23631 + }, + { + "epoch": 7.25352977286679, + "grad_norm": 0.18009543418884277, + "learning_rate": 1.8515056213846398e-05, + "loss": 1.6849, + "step": 23632 + }, + { + "epoch": 7.253836709637815, + "grad_norm": 0.18150615692138672, + "learning_rate": 1.851119504309804e-05, + "loss": 1.7077, + "step": 23633 + }, + { + "epoch": 7.25414364640884, + "grad_norm": 0.1874052882194519, + "learning_rate": 1.850733418354047e-05, + "loss": 1.7398, + "step": 23634 + }, + { + "epoch": 7.254450583179865, + "grad_norm": 0.18285217881202698, + "learning_rate": 1.8503473635211897e-05, + "loss": 1.7433, + "step": 23635 + }, + { + "epoch": 7.25475751995089, + "grad_norm": 0.19326861202716827, + "learning_rate": 1.8499613398150463e-05, + "loss": 1.7095, + "step": 23636 + }, + { + "epoch": 7.255064456721915, + "grad_norm": 0.21128259599208832, + "learning_rate": 1.849575347239431e-05, + "loss": 1.7352, + "step": 23637 + }, + { + "epoch": 7.255371393492941, + "grad_norm": 0.19309113919734955, + "learning_rate": 1.849189385798159e-05, + "loss": 1.7098, + "step": 23638 + }, + { + "epoch": 7.255678330263966, + "grad_norm": 0.1877751648426056, + "learning_rate": 1.848803455495044e-05, + "loss": 1.7279, + "step": 23639 + }, + { + 
"epoch": 7.2559852670349905, + "grad_norm": 0.18840502202510834, + "learning_rate": 1.8484175563339e-05, + "loss": 1.7174, + "step": 23640 + }, + { + "epoch": 7.256292203806016, + "grad_norm": 0.1912582963705063, + "learning_rate": 1.848031688318541e-05, + "loss": 1.6964, + "step": 23641 + }, + { + "epoch": 7.256599140577041, + "grad_norm": 0.188243106007576, + "learning_rate": 1.847645851452779e-05, + "loss": 1.7296, + "step": 23642 + }, + { + "epoch": 7.2569060773480665, + "grad_norm": 0.15838554501533508, + "learning_rate": 1.8472600457404317e-05, + "loss": 1.6276, + "step": 23643 + }, + { + "epoch": 7.257213014119092, + "grad_norm": 0.1605941653251648, + "learning_rate": 1.8468742711853065e-05, + "loss": 1.7015, + "step": 23644 + }, + { + "epoch": 7.257519950890116, + "grad_norm": 0.23647825419902802, + "learning_rate": 1.846488527791218e-05, + "loss": 1.775, + "step": 23645 + }, + { + "epoch": 7.257826887661142, + "grad_norm": 0.2414257973432541, + "learning_rate": 1.846102815561978e-05, + "loss": 1.7456, + "step": 23646 + }, + { + "epoch": 7.258133824432167, + "grad_norm": 0.221851646900177, + "learning_rate": 1.845717134501397e-05, + "loss": 1.6875, + "step": 23647 + }, + { + "epoch": 7.258440761203192, + "grad_norm": 0.20732705295085907, + "learning_rate": 1.8453314846132914e-05, + "loss": 1.6619, + "step": 23648 + }, + { + "epoch": 7.258747697974218, + "grad_norm": 0.18818728625774384, + "learning_rate": 1.8449458659014657e-05, + "loss": 1.6961, + "step": 23649 + }, + { + "epoch": 7.259054634745242, + "grad_norm": 0.19335074722766876, + "learning_rate": 1.8445602783697374e-05, + "loss": 1.6816, + "step": 23650 + }, + { + "epoch": 7.259361571516267, + "grad_norm": 0.27334100008010864, + "learning_rate": 1.844174722021911e-05, + "loss": 1.7435, + "step": 23651 + }, + { + "epoch": 7.259668508287293, + "grad_norm": 0.18763858079910278, + "learning_rate": 1.843789196861801e-05, + "loss": 1.713, + "step": 23652 + }, + { + "epoch": 7.259975445058318, + 
"grad_norm": 0.2585131525993347, + "learning_rate": 1.843403702893216e-05, + "loss": 1.7151, + "step": 23653 + }, + { + "epoch": 7.260282381829343, + "grad_norm": 0.182148277759552, + "learning_rate": 1.843018240119966e-05, + "loss": 1.7018, + "step": 23654 + }, + { + "epoch": 7.260589318600369, + "grad_norm": 0.31881436705589294, + "learning_rate": 1.84263280854586e-05, + "loss": 1.7428, + "step": 23655 + }, + { + "epoch": 7.260896255371393, + "grad_norm": 0.20997895300388336, + "learning_rate": 1.8422474081747073e-05, + "loss": 1.724, + "step": 23656 + }, + { + "epoch": 7.2612031921424185, + "grad_norm": 0.25038522481918335, + "learning_rate": 1.8418620390103163e-05, + "loss": 1.739, + "step": 23657 + }, + { + "epoch": 7.261510128913444, + "grad_norm": 0.22313323616981506, + "learning_rate": 1.841476701056496e-05, + "loss": 1.7493, + "step": 23658 + }, + { + "epoch": 7.261817065684469, + "grad_norm": 0.22516389191150665, + "learning_rate": 1.8410913943170522e-05, + "loss": 1.79, + "step": 23659 + }, + { + "epoch": 7.2621240024554945, + "grad_norm": 0.1966279298067093, + "learning_rate": 1.8407061187957982e-05, + "loss": 1.7418, + "step": 23660 + }, + { + "epoch": 7.262430939226519, + "grad_norm": 0.18697889149188995, + "learning_rate": 1.840320874496536e-05, + "loss": 1.7347, + "step": 23661 + }, + { + "epoch": 7.262737875997544, + "grad_norm": 0.18226566910743713, + "learning_rate": 1.8399356614230755e-05, + "loss": 1.6979, + "step": 23662 + }, + { + "epoch": 7.26304481276857, + "grad_norm": 0.18880577385425568, + "learning_rate": 1.839550479579223e-05, + "loss": 1.6612, + "step": 23663 + }, + { + "epoch": 7.263351749539595, + "grad_norm": 0.2048085480928421, + "learning_rate": 1.8391653289687826e-05, + "loss": 1.7313, + "step": 23664 + }, + { + "epoch": 7.26365868631062, + "grad_norm": 0.238912895321846, + "learning_rate": 1.838780209595567e-05, + "loss": 1.7522, + "step": 23665 + }, + { + "epoch": 7.263965623081646, + "grad_norm": 0.1656452864408493, + 
"learning_rate": 1.838395121463375e-05, + "loss": 1.6742, + "step": 23666 + }, + { + "epoch": 7.26427255985267, + "grad_norm": 0.2209266573190689, + "learning_rate": 1.8380100645760186e-05, + "loss": 1.6592, + "step": 23667 + }, + { + "epoch": 7.264579496623695, + "grad_norm": 0.19701217114925385, + "learning_rate": 1.8376250389372967e-05, + "loss": 1.7211, + "step": 23668 + }, + { + "epoch": 7.264886433394721, + "grad_norm": 0.229326069355011, + "learning_rate": 1.837240044551019e-05, + "loss": 1.7044, + "step": 23669 + }, + { + "epoch": 7.265193370165746, + "grad_norm": 0.18499960005283356, + "learning_rate": 1.8368550814209894e-05, + "loss": 1.705, + "step": 23670 + }, + { + "epoch": 7.265500306936771, + "grad_norm": 0.25504955649375916, + "learning_rate": 1.8364701495510117e-05, + "loss": 1.7246, + "step": 23671 + }, + { + "epoch": 7.265807243707796, + "grad_norm": 0.25998997688293457, + "learning_rate": 1.8360852489448903e-05, + "loss": 1.8311, + "step": 23672 + }, + { + "epoch": 7.266114180478821, + "grad_norm": 0.2437162697315216, + "learning_rate": 1.8357003796064294e-05, + "loss": 1.6467, + "step": 23673 + }, + { + "epoch": 7.2664211172498465, + "grad_norm": 0.20784614980220795, + "learning_rate": 1.8353155415394315e-05, + "loss": 1.7361, + "step": 23674 + }, + { + "epoch": 7.266728054020872, + "grad_norm": 0.22633932530879974, + "learning_rate": 1.8349307347476998e-05, + "loss": 1.6518, + "step": 23675 + }, + { + "epoch": 7.267034990791897, + "grad_norm": 0.19307547807693481, + "learning_rate": 1.8345459592350367e-05, + "loss": 1.7469, + "step": 23676 + }, + { + "epoch": 7.267341927562922, + "grad_norm": 0.20418168604373932, + "learning_rate": 1.8341612150052483e-05, + "loss": 1.6892, + "step": 23677 + }, + { + "epoch": 7.267648864333947, + "grad_norm": 0.1574825942516327, + "learning_rate": 1.8337765020621332e-05, + "loss": 1.6682, + "step": 23678 + }, + { + "epoch": 7.267955801104972, + "grad_norm": 0.31023111939430237, + "learning_rate": 
1.8333918204094947e-05, + "loss": 1.7382, + "step": 23679 + }, + { + "epoch": 7.268262737875998, + "grad_norm": 0.18148623406887054, + "learning_rate": 1.833007170051134e-05, + "loss": 1.726, + "step": 23680 + }, + { + "epoch": 7.268569674647023, + "grad_norm": 0.19278696179389954, + "learning_rate": 1.832622550990851e-05, + "loss": 1.7176, + "step": 23681 + }, + { + "epoch": 7.268876611418047, + "grad_norm": 0.18298377096652985, + "learning_rate": 1.832237963232452e-05, + "loss": 1.6703, + "step": 23682 + }, + { + "epoch": 7.269183548189073, + "grad_norm": 0.2019357681274414, + "learning_rate": 1.8318534067797304e-05, + "loss": 1.7771, + "step": 23683 + }, + { + "epoch": 7.269490484960098, + "grad_norm": 0.21978864073753357, + "learning_rate": 1.8314688816364944e-05, + "loss": 1.7938, + "step": 23684 + }, + { + "epoch": 7.269797421731123, + "grad_norm": 0.20009377598762512, + "learning_rate": 1.831084387806536e-05, + "loss": 1.7312, + "step": 23685 + }, + { + "epoch": 7.270104358502149, + "grad_norm": 0.16587263345718384, + "learning_rate": 1.8306999252936608e-05, + "loss": 1.7098, + "step": 23686 + }, + { + "epoch": 7.270411295273174, + "grad_norm": 0.20567362010478973, + "learning_rate": 1.8303154941016666e-05, + "loss": 1.6893, + "step": 23687 + }, + { + "epoch": 7.2707182320441985, + "grad_norm": 0.1916830986738205, + "learning_rate": 1.8299310942343527e-05, + "loss": 1.7995, + "step": 23688 + }, + { + "epoch": 7.271025168815224, + "grad_norm": 0.18361486494541168, + "learning_rate": 1.8295467256955174e-05, + "loss": 1.6708, + "step": 23689 + }, + { + "epoch": 7.271332105586249, + "grad_norm": 0.20620734989643097, + "learning_rate": 1.8291623884889597e-05, + "loss": 1.7314, + "step": 23690 + }, + { + "epoch": 7.2716390423572745, + "grad_norm": 0.22560660541057587, + "learning_rate": 1.828778082618478e-05, + "loss": 1.7418, + "step": 23691 + }, + { + "epoch": 7.2719459791283, + "grad_norm": 0.2113492786884308, + "learning_rate": 1.8283938080878697e-05, + 
"loss": 1.724, + "step": 23692 + }, + { + "epoch": 7.272252915899324, + "grad_norm": 0.26234012842178345, + "learning_rate": 1.8280095649009334e-05, + "loss": 1.7723, + "step": 23693 + }, + { + "epoch": 7.27255985267035, + "grad_norm": 0.1675095111131668, + "learning_rate": 1.827625353061465e-05, + "loss": 1.7473, + "step": 23694 + }, + { + "epoch": 7.272866789441375, + "grad_norm": 0.17751236259937286, + "learning_rate": 1.8272411725732623e-05, + "loss": 1.7374, + "step": 23695 + }, + { + "epoch": 7.2731737262124, + "grad_norm": 0.23158904910087585, + "learning_rate": 1.826857023440122e-05, + "loss": 1.8111, + "step": 23696 + }, + { + "epoch": 7.273480662983426, + "grad_norm": 0.17262183129787445, + "learning_rate": 1.8264729056658407e-05, + "loss": 1.7546, + "step": 23697 + }, + { + "epoch": 7.273787599754451, + "grad_norm": 0.20811094343662262, + "learning_rate": 1.8260888192542126e-05, + "loss": 1.8059, + "step": 23698 + }, + { + "epoch": 7.274094536525475, + "grad_norm": 0.17156411707401276, + "learning_rate": 1.825704764209038e-05, + "loss": 1.7261, + "step": 23699 + }, + { + "epoch": 7.274401473296501, + "grad_norm": 0.18523572385311127, + "learning_rate": 1.8253207405341067e-05, + "loss": 1.7139, + "step": 23700 + }, + { + "epoch": 7.274708410067526, + "grad_norm": 0.20626066625118256, + "learning_rate": 1.824936748233219e-05, + "loss": 1.7269, + "step": 23701 + }, + { + "epoch": 7.2750153468385514, + "grad_norm": 0.1717548966407776, + "learning_rate": 1.8245527873101647e-05, + "loss": 1.7168, + "step": 23702 + }, + { + "epoch": 7.275322283609577, + "grad_norm": 0.16322405636310577, + "learning_rate": 1.8241688577687426e-05, + "loss": 1.7392, + "step": 23703 + }, + { + "epoch": 7.275629220380601, + "grad_norm": 0.19775766134262085, + "learning_rate": 1.8237849596127447e-05, + "loss": 1.7055, + "step": 23704 + }, + { + "epoch": 7.275936157151627, + "grad_norm": 0.1969427913427353, + "learning_rate": 1.823401092845966e-05, + "loss": 1.7418, + "step": 23705 + 
}, + { + "epoch": 7.276243093922652, + "grad_norm": 0.1791812628507614, + "learning_rate": 1.8230172574721992e-05, + "loss": 1.6512, + "step": 23706 + }, + { + "epoch": 7.276550030693677, + "grad_norm": 0.18583156168460846, + "learning_rate": 1.8226334534952384e-05, + "loss": 1.7357, + "step": 23707 + }, + { + "epoch": 7.276856967464703, + "grad_norm": 0.20729652047157288, + "learning_rate": 1.822249680918876e-05, + "loss": 1.7323, + "step": 23708 + }, + { + "epoch": 7.277163904235728, + "grad_norm": 0.20089028775691986, + "learning_rate": 1.8218659397469045e-05, + "loss": 1.6835, + "step": 23709 + }, + { + "epoch": 7.277470841006752, + "grad_norm": 0.16569854319095612, + "learning_rate": 1.8214822299831168e-05, + "loss": 1.7486, + "step": 23710 + }, + { + "epoch": 7.277777777777778, + "grad_norm": 0.19979944825172424, + "learning_rate": 1.8210985516313044e-05, + "loss": 1.7338, + "step": 23711 + }, + { + "epoch": 7.278084714548803, + "grad_norm": 0.23528912663459778, + "learning_rate": 1.82071490469526e-05, + "loss": 1.8086, + "step": 23712 + }, + { + "epoch": 7.278391651319828, + "grad_norm": 0.18231599032878876, + "learning_rate": 1.8203312891787737e-05, + "loss": 1.744, + "step": 23713 + }, + { + "epoch": 7.278698588090854, + "grad_norm": 0.2208651602268219, + "learning_rate": 1.8199477050856374e-05, + "loss": 1.7592, + "step": 23714 + }, + { + "epoch": 7.279005524861878, + "grad_norm": 0.22329792380332947, + "learning_rate": 1.8195641524196417e-05, + "loss": 1.7242, + "step": 23715 + }, + { + "epoch": 7.2793124616329035, + "grad_norm": 0.17745757102966309, + "learning_rate": 1.8191806311845778e-05, + "loss": 1.7162, + "step": 23716 + }, + { + "epoch": 7.279619398403929, + "grad_norm": 0.19536735117435455, + "learning_rate": 1.8187971413842324e-05, + "loss": 1.6814, + "step": 23717 + }, + { + "epoch": 7.279926335174954, + "grad_norm": 0.21853455901145935, + "learning_rate": 1.8184136830224025e-05, + "loss": 1.7049, + "step": 23718 + }, + { + "epoch": 
7.2802332719459795, + "grad_norm": 0.1701575070619583, + "learning_rate": 1.8180302561028696e-05, + "loss": 1.6879, + "step": 23719 + }, + { + "epoch": 7.280540208717004, + "grad_norm": 0.18729525804519653, + "learning_rate": 1.8176468606294288e-05, + "loss": 1.6944, + "step": 23720 + }, + { + "epoch": 7.280847145488029, + "grad_norm": 0.20020832121372223, + "learning_rate": 1.8172634966058667e-05, + "loss": 1.7415, + "step": 23721 + }, + { + "epoch": 7.281154082259055, + "grad_norm": 0.1983461081981659, + "learning_rate": 1.8168801640359724e-05, + "loss": 1.7198, + "step": 23722 + }, + { + "epoch": 7.28146101903008, + "grad_norm": 0.17578791081905365, + "learning_rate": 1.8164968629235334e-05, + "loss": 1.7155, + "step": 23723 + }, + { + "epoch": 7.281767955801105, + "grad_norm": 0.1944401115179062, + "learning_rate": 1.8161135932723388e-05, + "loss": 1.7579, + "step": 23724 + }, + { + "epoch": 7.28207489257213, + "grad_norm": 0.20413067936897278, + "learning_rate": 1.8157303550861753e-05, + "loss": 1.7105, + "step": 23725 + }, + { + "epoch": 7.282381829343155, + "grad_norm": 0.17515964806079865, + "learning_rate": 1.8153471483688318e-05, + "loss": 1.7448, + "step": 23726 + }, + { + "epoch": 7.28268876611418, + "grad_norm": 0.2039034515619278, + "learning_rate": 1.8149639731240938e-05, + "loss": 1.691, + "step": 23727 + }, + { + "epoch": 7.282995702885206, + "grad_norm": 0.2136354148387909, + "learning_rate": 1.8145808293557483e-05, + "loss": 1.656, + "step": 23728 + }, + { + "epoch": 7.283302639656231, + "grad_norm": 0.23029537498950958, + "learning_rate": 1.814197717067582e-05, + "loss": 1.7588, + "step": 23729 + }, + { + "epoch": 7.283609576427256, + "grad_norm": 0.371910035610199, + "learning_rate": 1.8138146362633816e-05, + "loss": 1.8138, + "step": 23730 + }, + { + "epoch": 7.283916513198281, + "grad_norm": 0.2273472249507904, + "learning_rate": 1.8134315869469327e-05, + "loss": 1.6985, + "step": 23731 + }, + { + "epoch": 7.284223449969306, + "grad_norm": 
0.33206698298454285, + "learning_rate": 1.81304856912202e-05, + "loss": 1.7015, + "step": 23732 + }, + { + "epoch": 7.2845303867403315, + "grad_norm": 0.20799405872821808, + "learning_rate": 1.8126655827924295e-05, + "loss": 1.6932, + "step": 23733 + }, + { + "epoch": 7.284837323511357, + "grad_norm": 0.28721246123313904, + "learning_rate": 1.8122826279619437e-05, + "loss": 1.7726, + "step": 23734 + }, + { + "epoch": 7.285144260282382, + "grad_norm": 0.2365201711654663, + "learning_rate": 1.8118997046343533e-05, + "loss": 1.7609, + "step": 23735 + }, + { + "epoch": 7.285451197053407, + "grad_norm": 0.24772630631923676, + "learning_rate": 1.811516812813435e-05, + "loss": 1.7057, + "step": 23736 + }, + { + "epoch": 7.285758133824432, + "grad_norm": 0.19344007968902588, + "learning_rate": 1.8111339525029802e-05, + "loss": 1.7526, + "step": 23737 + }, + { + "epoch": 7.286065070595457, + "grad_norm": 0.2454877346754074, + "learning_rate": 1.8107511237067648e-05, + "loss": 1.6474, + "step": 23738 + }, + { + "epoch": 7.286372007366483, + "grad_norm": 0.18084865808486938, + "learning_rate": 1.810368326428578e-05, + "loss": 1.7381, + "step": 23739 + }, + { + "epoch": 7.286678944137508, + "grad_norm": 0.26264744997024536, + "learning_rate": 1.8099855606722012e-05, + "loss": 1.6585, + "step": 23740 + }, + { + "epoch": 7.286985880908533, + "grad_norm": 0.20219333469867706, + "learning_rate": 1.809602826441416e-05, + "loss": 1.7552, + "step": 23741 + }, + { + "epoch": 7.287292817679558, + "grad_norm": 0.23982326686382294, + "learning_rate": 1.8092201237400064e-05, + "loss": 1.6784, + "step": 23742 + }, + { + "epoch": 7.287599754450583, + "grad_norm": 0.22838538885116577, + "learning_rate": 1.8088374525717534e-05, + "loss": 1.6976, + "step": 23743 + }, + { + "epoch": 7.287906691221608, + "grad_norm": 0.22077307105064392, + "learning_rate": 1.8084548129404395e-05, + "loss": 1.721, + "step": 23744 + }, + { + "epoch": 7.288213627992634, + "grad_norm": 0.19811047613620758, + 
"learning_rate": 1.8080722048498448e-05, + "loss": 1.7317, + "step": 23745 + }, + { + "epoch": 7.288520564763659, + "grad_norm": 0.25160667300224304, + "learning_rate": 1.8076896283037525e-05, + "loss": 1.7725, + "step": 23746 + }, + { + "epoch": 7.2888275015346835, + "grad_norm": 0.19819392263889313, + "learning_rate": 1.807307083305942e-05, + "loss": 1.7243, + "step": 23747 + }, + { + "epoch": 7.289134438305709, + "grad_norm": 0.21769097447395325, + "learning_rate": 1.806924569860194e-05, + "loss": 1.74, + "step": 23748 + }, + { + "epoch": 7.289441375076734, + "grad_norm": 0.23126530647277832, + "learning_rate": 1.806542087970289e-05, + "loss": 1.7479, + "step": 23749 + }, + { + "epoch": 7.2897483118477595, + "grad_norm": 0.21002748608589172, + "learning_rate": 1.8061596376400065e-05, + "loss": 1.6547, + "step": 23750 + }, + { + "epoch": 7.290055248618785, + "grad_norm": 0.242569699883461, + "learning_rate": 1.8057772188731255e-05, + "loss": 1.7587, + "step": 23751 + }, + { + "epoch": 7.290362185389809, + "grad_norm": 0.19619157910346985, + "learning_rate": 1.8053948316734287e-05, + "loss": 1.6619, + "step": 23752 + }, + { + "epoch": 7.290669122160835, + "grad_norm": 0.2086232304573059, + "learning_rate": 1.8050124760446896e-05, + "loss": 1.6535, + "step": 23753 + }, + { + "epoch": 7.29097605893186, + "grad_norm": 0.1955464631319046, + "learning_rate": 1.8046301519906932e-05, + "loss": 1.6814, + "step": 23754 + }, + { + "epoch": 7.291282995702885, + "grad_norm": 0.20373155176639557, + "learning_rate": 1.8042478595152117e-05, + "loss": 1.7006, + "step": 23755 + }, + { + "epoch": 7.291589932473911, + "grad_norm": 0.20233015716075897, + "learning_rate": 1.8038655986220272e-05, + "loss": 1.7478, + "step": 23756 + }, + { + "epoch": 7.291896869244935, + "grad_norm": 0.18800894916057587, + "learning_rate": 1.803483369314916e-05, + "loss": 1.747, + "step": 23757 + }, + { + "epoch": 7.29220380601596, + "grad_norm": 0.1838926076889038, + "learning_rate": 
1.8031011715976558e-05, + "loss": 1.7086, + "step": 23758 + }, + { + "epoch": 7.292510742786986, + "grad_norm": 0.1806635707616806, + "learning_rate": 1.8027190054740234e-05, + "loss": 1.6682, + "step": 23759 + }, + { + "epoch": 7.292817679558011, + "grad_norm": 0.19762687385082245, + "learning_rate": 1.802336870947796e-05, + "loss": 1.7514, + "step": 23760 + }, + { + "epoch": 7.293124616329036, + "grad_norm": 0.1739082932472229, + "learning_rate": 1.80195476802275e-05, + "loss": 1.7031, + "step": 23761 + }, + { + "epoch": 7.293431553100062, + "grad_norm": 0.18887469172477722, + "learning_rate": 1.8015726967026615e-05, + "loss": 1.7199, + "step": 23762 + }, + { + "epoch": 7.293738489871086, + "grad_norm": 0.17344269156455994, + "learning_rate": 1.8011906569913056e-05, + "loss": 1.693, + "step": 23763 + }, + { + "epoch": 7.2940454266421115, + "grad_norm": 0.16480129957199097, + "learning_rate": 1.800808648892459e-05, + "loss": 1.722, + "step": 23764 + }, + { + "epoch": 7.294352363413137, + "grad_norm": 0.17336638271808624, + "learning_rate": 1.8004266724098963e-05, + "loss": 1.6635, + "step": 23765 + }, + { + "epoch": 7.294659300184162, + "grad_norm": 0.16539151966571808, + "learning_rate": 1.8000447275473925e-05, + "loss": 1.7709, + "step": 23766 + }, + { + "epoch": 7.2949662369551875, + "grad_norm": 0.20660065114498138, + "learning_rate": 1.7996628143087226e-05, + "loss": 1.7262, + "step": 23767 + }, + { + "epoch": 7.295273173726212, + "grad_norm": 0.2292039543390274, + "learning_rate": 1.7992809326976584e-05, + "loss": 1.7444, + "step": 23768 + }, + { + "epoch": 7.295580110497237, + "grad_norm": 0.20323103666305542, + "learning_rate": 1.7988990827179795e-05, + "loss": 1.7456, + "step": 23769 + }, + { + "epoch": 7.295887047268263, + "grad_norm": 0.16919885575771332, + "learning_rate": 1.7985172643734532e-05, + "loss": 1.7304, + "step": 23770 + }, + { + "epoch": 7.296193984039288, + "grad_norm": 0.19135236740112305, + "learning_rate": 1.798135477667859e-05, + 
"loss": 1.7067, + "step": 23771 + }, + { + "epoch": 7.296500920810313, + "grad_norm": 0.19812993705272675, + "learning_rate": 1.7977537226049627e-05, + "loss": 1.7701, + "step": 23772 + }, + { + "epoch": 7.296807857581339, + "grad_norm": 0.22823916375637054, + "learning_rate": 1.797371999188543e-05, + "loss": 1.737, + "step": 23773 + }, + { + "epoch": 7.297114794352363, + "grad_norm": 0.1862197369337082, + "learning_rate": 1.7969903074223705e-05, + "loss": 1.675, + "step": 23774 + }, + { + "epoch": 7.297421731123388, + "grad_norm": 0.18780425190925598, + "learning_rate": 1.7966086473102168e-05, + "loss": 1.7237, + "step": 23775 + }, + { + "epoch": 7.297728667894414, + "grad_norm": 0.174093559384346, + "learning_rate": 1.7962270188558543e-05, + "loss": 1.7129, + "step": 23776 + }, + { + "epoch": 7.298035604665439, + "grad_norm": 0.22659943997859955, + "learning_rate": 1.7958454220630543e-05, + "loss": 1.7257, + "step": 23777 + }, + { + "epoch": 7.298342541436464, + "grad_norm": 0.18077917397022247, + "learning_rate": 1.7954638569355875e-05, + "loss": 1.6972, + "step": 23778 + }, + { + "epoch": 7.298649478207489, + "grad_norm": 0.18380658328533173, + "learning_rate": 1.795082323477225e-05, + "loss": 1.6577, + "step": 23779 + }, + { + "epoch": 7.298956414978514, + "grad_norm": 0.17016704380512238, + "learning_rate": 1.7947008216917384e-05, + "loss": 1.7222, + "step": 23780 + }, + { + "epoch": 7.2992633517495396, + "grad_norm": 0.2016153484582901, + "learning_rate": 1.794319351582896e-05, + "loss": 1.6833, + "step": 23781 + }, + { + "epoch": 7.299570288520565, + "grad_norm": 0.26723918318748474, + "learning_rate": 1.7939379131544687e-05, + "loss": 1.7417, + "step": 23782 + }, + { + "epoch": 7.29987722529159, + "grad_norm": 0.2555576264858246, + "learning_rate": 1.7935565064102267e-05, + "loss": 1.7373, + "step": 23783 + }, + { + "epoch": 7.300184162062616, + "grad_norm": 0.2036418914794922, + "learning_rate": 1.793175131353938e-05, + "loss": 1.7052, + "step": 23784 + 
}, + { + "epoch": 7.30049109883364, + "grad_norm": 0.1789570152759552, + "learning_rate": 1.792793787989371e-05, + "loss": 1.6327, + "step": 23785 + }, + { + "epoch": 7.300798035604665, + "grad_norm": 0.2353249490261078, + "learning_rate": 1.7924124763202987e-05, + "loss": 1.7771, + "step": 23786 + }, + { + "epoch": 7.301104972375691, + "grad_norm": 0.19072949886322021, + "learning_rate": 1.792031196350483e-05, + "loss": 1.7095, + "step": 23787 + }, + { + "epoch": 7.301411909146716, + "grad_norm": 0.24063248932361603, + "learning_rate": 1.791649948083699e-05, + "loss": 1.7247, + "step": 23788 + }, + { + "epoch": 7.301718845917741, + "grad_norm": 0.1916036456823349, + "learning_rate": 1.791268731523707e-05, + "loss": 1.6844, + "step": 23789 + }, + { + "epoch": 7.302025782688766, + "grad_norm": 0.2606290876865387, + "learning_rate": 1.7908875466742797e-05, + "loss": 1.771, + "step": 23790 + }, + { + "epoch": 7.302332719459791, + "grad_norm": 0.23444804549217224, + "learning_rate": 1.7905063935391824e-05, + "loss": 1.747, + "step": 23791 + }, + { + "epoch": 7.3026396562308165, + "grad_norm": 0.28058725595474243, + "learning_rate": 1.7901252721221822e-05, + "loss": 1.7284, + "step": 23792 + }, + { + "epoch": 7.302946593001842, + "grad_norm": 0.23268578946590424, + "learning_rate": 1.7897441824270456e-05, + "loss": 1.7222, + "step": 23793 + }, + { + "epoch": 7.303253529772867, + "grad_norm": 0.275336354970932, + "learning_rate": 1.789363124457539e-05, + "loss": 1.7495, + "step": 23794 + }, + { + "epoch": 7.303560466543892, + "grad_norm": 0.21838977932929993, + "learning_rate": 1.788982098217427e-05, + "loss": 1.725, + "step": 23795 + }, + { + "epoch": 7.303867403314917, + "grad_norm": 0.24108058214187622, + "learning_rate": 1.7886011037104767e-05, + "loss": 1.7804, + "step": 23796 + }, + { + "epoch": 7.304174340085942, + "grad_norm": 0.23003144562244415, + "learning_rate": 1.788220140940452e-05, + "loss": 1.8189, + "step": 23797 + }, + { + "epoch": 7.304481276856968, + 
"grad_norm": 0.20129653811454773, + "learning_rate": 1.7878392099111186e-05, + "loss": 1.6603, + "step": 23798 + }, + { + "epoch": 7.304788213627993, + "grad_norm": 0.26172930002212524, + "learning_rate": 1.7874583106262404e-05, + "loss": 1.7095, + "step": 23799 + }, + { + "epoch": 7.305095150399017, + "grad_norm": 0.212156742811203, + "learning_rate": 1.7870774430895825e-05, + "loss": 1.7272, + "step": 23800 + }, + { + "epoch": 7.305402087170043, + "grad_norm": 0.2775247097015381, + "learning_rate": 1.7866966073049084e-05, + "loss": 1.773, + "step": 23801 + }, + { + "epoch": 7.305709023941068, + "grad_norm": 0.23456308245658875, + "learning_rate": 1.7863158032759803e-05, + "loss": 1.7173, + "step": 23802 + }, + { + "epoch": 7.306015960712093, + "grad_norm": 0.23986588418483734, + "learning_rate": 1.785935031006566e-05, + "loss": 1.6924, + "step": 23803 + }, + { + "epoch": 7.306322897483119, + "grad_norm": 0.1909915804862976, + "learning_rate": 1.7855542905004225e-05, + "loss": 1.7047, + "step": 23804 + }, + { + "epoch": 7.306629834254144, + "grad_norm": 0.20676325261592865, + "learning_rate": 1.7851735817613192e-05, + "loss": 1.6606, + "step": 23805 + }, + { + "epoch": 7.3069367710251685, + "grad_norm": 0.1910121887922287, + "learning_rate": 1.7847929047930106e-05, + "loss": 1.7555, + "step": 23806 + }, + { + "epoch": 7.307243707796194, + "grad_norm": 0.22737936675548553, + "learning_rate": 1.784412259599265e-05, + "loss": 1.7346, + "step": 23807 + }, + { + "epoch": 7.307550644567219, + "grad_norm": 0.1553424894809723, + "learning_rate": 1.7840316461838426e-05, + "loss": 1.6755, + "step": 23808 + }, + { + "epoch": 7.3078575813382445, + "grad_norm": 0.17937089502811432, + "learning_rate": 1.7836510645505044e-05, + "loss": 1.684, + "step": 23809 + }, + { + "epoch": 7.30816451810927, + "grad_norm": 0.20183639228343964, + "learning_rate": 1.783270514703011e-05, + "loss": 1.7617, + "step": 23810 + }, + { + "epoch": 7.308471454880294, + "grad_norm": 0.21359068155288696, 
+ "learning_rate": 1.782889996645124e-05, + "loss": 1.6897, + "step": 23811 + }, + { + "epoch": 7.30877839165132, + "grad_norm": 0.19640007615089417, + "learning_rate": 1.782509510380604e-05, + "loss": 1.7029, + "step": 23812 + }, + { + "epoch": 7.309085328422345, + "grad_norm": 0.22678261995315552, + "learning_rate": 1.7821290559132104e-05, + "loss": 1.7241, + "step": 23813 + }, + { + "epoch": 7.30939226519337, + "grad_norm": 0.1797642707824707, + "learning_rate": 1.7817486332467037e-05, + "loss": 1.7127, + "step": 23814 + }, + { + "epoch": 7.309699201964396, + "grad_norm": 0.18758134543895721, + "learning_rate": 1.7813682423848432e-05, + "loss": 1.7394, + "step": 23815 + }, + { + "epoch": 7.310006138735421, + "grad_norm": 0.2064354121685028, + "learning_rate": 1.7809878833313887e-05, + "loss": 1.7477, + "step": 23816 + }, + { + "epoch": 7.310313075506445, + "grad_norm": 0.30564701557159424, + "learning_rate": 1.780607556090098e-05, + "loss": 1.7006, + "step": 23817 + }, + { + "epoch": 7.310620012277471, + "grad_norm": 0.23694200813770294, + "learning_rate": 1.7802272606647308e-05, + "loss": 1.7821, + "step": 23818 + }, + { + "epoch": 7.310926949048496, + "grad_norm": 0.20436422526836395, + "learning_rate": 1.779846997059043e-05, + "loss": 1.6681, + "step": 23819 + }, + { + "epoch": 7.311233885819521, + "grad_norm": 0.21899428963661194, + "learning_rate": 1.779466765276798e-05, + "loss": 1.7416, + "step": 23820 + }, + { + "epoch": 7.311540822590547, + "grad_norm": 0.24186378717422485, + "learning_rate": 1.779086565321747e-05, + "loss": 1.7258, + "step": 23821 + }, + { + "epoch": 7.311847759361571, + "grad_norm": 0.22940407693386078, + "learning_rate": 1.778706397197653e-05, + "loss": 1.7211, + "step": 23822 + }, + { + "epoch": 7.3121546961325965, + "grad_norm": 0.18643233180046082, + "learning_rate": 1.778326260908268e-05, + "loss": 1.6778, + "step": 23823 + }, + { + "epoch": 7.312461632903622, + "grad_norm": 0.25372037291526794, + "learning_rate": 
1.7779461564573526e-05, + "loss": 1.7252, + "step": 23824 + }, + { + "epoch": 7.312768569674647, + "grad_norm": 0.21126380562782288, + "learning_rate": 1.7775660838486612e-05, + "loss": 1.6655, + "step": 23825 + }, + { + "epoch": 7.3130755064456725, + "grad_norm": 0.19614748656749725, + "learning_rate": 1.777186043085951e-05, + "loss": 1.7223, + "step": 23826 + }, + { + "epoch": 7.313382443216697, + "grad_norm": 0.2111951857805252, + "learning_rate": 1.7768060341729768e-05, + "loss": 1.708, + "step": 23827 + }, + { + "epoch": 7.313689379987722, + "grad_norm": 0.2675856053829193, + "learning_rate": 1.7764260571134956e-05, + "loss": 1.7387, + "step": 23828 + }, + { + "epoch": 7.313996316758748, + "grad_norm": 0.19827900826931, + "learning_rate": 1.7760461119112603e-05, + "loss": 1.6809, + "step": 23829 + }, + { + "epoch": 7.314303253529773, + "grad_norm": 0.24213160574436188, + "learning_rate": 1.775666198570028e-05, + "loss": 1.7064, + "step": 23830 + }, + { + "epoch": 7.314610190300798, + "grad_norm": 0.20035916566848755, + "learning_rate": 1.7752863170935514e-05, + "loss": 1.6874, + "step": 23831 + }, + { + "epoch": 7.314917127071823, + "grad_norm": 0.23662878572940826, + "learning_rate": 1.774906467485586e-05, + "loss": 1.7651, + "step": 23832 + }, + { + "epoch": 7.315224063842848, + "grad_norm": 0.18523871898651123, + "learning_rate": 1.7745266497498847e-05, + "loss": 1.7003, + "step": 23833 + }, + { + "epoch": 7.315531000613873, + "grad_norm": 0.21452756226062775, + "learning_rate": 1.7741468638902016e-05, + "loss": 1.7012, + "step": 23834 + }, + { + "epoch": 7.315837937384899, + "grad_norm": 0.17513468861579895, + "learning_rate": 1.7737671099102904e-05, + "loss": 1.6965, + "step": 23835 + }, + { + "epoch": 7.316144874155924, + "grad_norm": 0.29025998711586, + "learning_rate": 1.7733873878139012e-05, + "loss": 1.7347, + "step": 23836 + }, + { + "epoch": 7.316451810926949, + "grad_norm": 0.14812500774860382, + "learning_rate": 1.7730076976047926e-05, + "loss": 
1.6469, + "step": 23837 + }, + { + "epoch": 7.316758747697974, + "grad_norm": 0.23575027287006378, + "learning_rate": 1.77262803928671e-05, + "loss": 1.7267, + "step": 23838 + }, + { + "epoch": 7.317065684468999, + "grad_norm": 0.17986448109149933, + "learning_rate": 1.7722484128634125e-05, + "loss": 1.7206, + "step": 23839 + }, + { + "epoch": 7.3173726212400245, + "grad_norm": 0.22515927255153656, + "learning_rate": 1.7718688183386446e-05, + "loss": 1.7216, + "step": 23840 + }, + { + "epoch": 7.31767955801105, + "grad_norm": 0.1903398036956787, + "learning_rate": 1.7714892557161624e-05, + "loss": 1.7108, + "step": 23841 + }, + { + "epoch": 7.317986494782075, + "grad_norm": 0.23623183369636536, + "learning_rate": 1.7711097249997162e-05, + "loss": 1.6866, + "step": 23842 + }, + { + "epoch": 7.3182934315531, + "grad_norm": 0.18501855432987213, + "learning_rate": 1.7707302261930554e-05, + "loss": 1.6643, + "step": 23843 + }, + { + "epoch": 7.318600368324125, + "grad_norm": 0.21865275502204895, + "learning_rate": 1.770350759299932e-05, + "loss": 1.6932, + "step": 23844 + }, + { + "epoch": 7.31890730509515, + "grad_norm": 0.22363261878490448, + "learning_rate": 1.7699713243240945e-05, + "loss": 1.721, + "step": 23845 + }, + { + "epoch": 7.319214241866176, + "grad_norm": 0.25587835907936096, + "learning_rate": 1.769591921269294e-05, + "loss": 1.7375, + "step": 23846 + }, + { + "epoch": 7.319521178637201, + "grad_norm": 0.22086483240127563, + "learning_rate": 1.76921255013928e-05, + "loss": 1.6957, + "step": 23847 + }, + { + "epoch": 7.319828115408226, + "grad_norm": 0.21197499334812164, + "learning_rate": 1.7688332109378007e-05, + "loss": 1.6993, + "step": 23848 + }, + { + "epoch": 7.320135052179251, + "grad_norm": 0.21211451292037964, + "learning_rate": 1.7684539036686054e-05, + "loss": 1.7329, + "step": 23849 + }, + { + "epoch": 7.320441988950276, + "grad_norm": 0.16938872635364532, + "learning_rate": 1.7680746283354433e-05, + "loss": 1.6895, + "step": 23850 + }, + { + 
"epoch": 7.320748925721301, + "grad_norm": 0.21465681493282318, + "learning_rate": 1.7676953849420613e-05, + "loss": 1.7156, + "step": 23851 + }, + { + "epoch": 7.321055862492327, + "grad_norm": 0.16188180446624756, + "learning_rate": 1.7673161734922084e-05, + "loss": 1.6307, + "step": 23852 + }, + { + "epoch": 7.321362799263352, + "grad_norm": 0.2152155190706253, + "learning_rate": 1.7669369939896302e-05, + "loss": 1.7135, + "step": 23853 + }, + { + "epoch": 7.3216697360343765, + "grad_norm": 0.15789814293384552, + "learning_rate": 1.7665578464380788e-05, + "loss": 1.7269, + "step": 23854 + }, + { + "epoch": 7.321976672805402, + "grad_norm": 0.17263127863407135, + "learning_rate": 1.7661787308412948e-05, + "loss": 1.6624, + "step": 23855 + }, + { + "epoch": 7.322283609576427, + "grad_norm": 0.19711650907993317, + "learning_rate": 1.7657996472030308e-05, + "loss": 1.7837, + "step": 23856 + }, + { + "epoch": 7.3225905463474525, + "grad_norm": 0.1847725212574005, + "learning_rate": 1.765420595527027e-05, + "loss": 1.707, + "step": 23857 + }, + { + "epoch": 7.322897483118478, + "grad_norm": 0.21316368877887726, + "learning_rate": 1.7650415758170345e-05, + "loss": 1.715, + "step": 23858 + }, + { + "epoch": 7.323204419889503, + "grad_norm": 0.1912030428647995, + "learning_rate": 1.7646625880767976e-05, + "loss": 1.7465, + "step": 23859 + }, + { + "epoch": 7.323511356660528, + "grad_norm": 0.16245616972446442, + "learning_rate": 1.7642836323100614e-05, + "loss": 1.7365, + "step": 23860 + }, + { + "epoch": 7.323818293431553, + "grad_norm": 0.20665429532527924, + "learning_rate": 1.76390470852057e-05, + "loss": 1.7435, + "step": 23861 + }, + { + "epoch": 7.324125230202578, + "grad_norm": 0.17079970240592957, + "learning_rate": 1.76352581671207e-05, + "loss": 1.7094, + "step": 23862 + }, + { + "epoch": 7.324432166973604, + "grad_norm": 0.17388395965099335, + "learning_rate": 1.7631469568883042e-05, + "loss": 1.7275, + "step": 23863 + }, + { + "epoch": 7.324739103744629, + 
"grad_norm": 0.20209765434265137, + "learning_rate": 1.7627681290530175e-05, + "loss": 1.7755, + "step": 23864 + }, + { + "epoch": 7.3250460405156534, + "grad_norm": 0.16459977626800537, + "learning_rate": 1.7623893332099538e-05, + "loss": 1.6765, + "step": 23865 + }, + { + "epoch": 7.325352977286679, + "grad_norm": 0.18313255906105042, + "learning_rate": 1.7620105693628556e-05, + "loss": 1.6792, + "step": 23866 + }, + { + "epoch": 7.325659914057704, + "grad_norm": 0.1651672124862671, + "learning_rate": 1.761631837515468e-05, + "loss": 1.6999, + "step": 23867 + }, + { + "epoch": 7.3259668508287294, + "grad_norm": 0.17414255440235138, + "learning_rate": 1.7612531376715317e-05, + "loss": 1.69, + "step": 23868 + }, + { + "epoch": 7.326273787599755, + "grad_norm": 0.1824718415737152, + "learning_rate": 1.7608744698347908e-05, + "loss": 1.6822, + "step": 23869 + }, + { + "epoch": 7.326580724370779, + "grad_norm": 0.19557121396064758, + "learning_rate": 1.760495834008986e-05, + "loss": 1.6852, + "step": 23870 + }, + { + "epoch": 7.326887661141805, + "grad_norm": 0.17803436517715454, + "learning_rate": 1.7601172301978606e-05, + "loss": 1.7523, + "step": 23871 + }, + { + "epoch": 7.32719459791283, + "grad_norm": 0.24077050387859344, + "learning_rate": 1.7597386584051545e-05, + "loss": 1.8044, + "step": 23872 + }, + { + "epoch": 7.327501534683855, + "grad_norm": 0.20061948895454407, + "learning_rate": 1.7593601186346127e-05, + "loss": 1.7298, + "step": 23873 + }, + { + "epoch": 7.327808471454881, + "grad_norm": 0.17362944781780243, + "learning_rate": 1.758981610889971e-05, + "loss": 1.7116, + "step": 23874 + }, + { + "epoch": 7.328115408225905, + "grad_norm": 0.20858663320541382, + "learning_rate": 1.758603135174974e-05, + "loss": 1.6765, + "step": 23875 + }, + { + "epoch": 7.32842234499693, + "grad_norm": 0.1805036962032318, + "learning_rate": 1.7582246914933604e-05, + "loss": 1.694, + "step": 23876 + }, + { + "epoch": 7.328729281767956, + "grad_norm": 0.26010429859161377, 
+ "learning_rate": 1.7578462798488704e-05, + "loss": 1.7373, + "step": 23877 + }, + { + "epoch": 7.329036218538981, + "grad_norm": 0.19902443885803223, + "learning_rate": 1.7574679002452444e-05, + "loss": 1.72, + "step": 23878 + }, + { + "epoch": 7.329343155310006, + "grad_norm": 0.21231114864349365, + "learning_rate": 1.7570895526862202e-05, + "loss": 1.7526, + "step": 23879 + }, + { + "epoch": 7.329650092081032, + "grad_norm": 0.2075740098953247, + "learning_rate": 1.7567112371755384e-05, + "loss": 1.773, + "step": 23880 + }, + { + "epoch": 7.329957028852056, + "grad_norm": 0.21381771564483643, + "learning_rate": 1.756332953716937e-05, + "loss": 1.733, + "step": 23881 + }, + { + "epoch": 7.3302639656230815, + "grad_norm": 0.21689461171627045, + "learning_rate": 1.755954702314155e-05, + "loss": 1.7234, + "step": 23882 + }, + { + "epoch": 7.330570902394107, + "grad_norm": 0.21094383299350739, + "learning_rate": 1.755576482970929e-05, + "loss": 1.7074, + "step": 23883 + }, + { + "epoch": 7.330877839165132, + "grad_norm": 0.18460774421691895, + "learning_rate": 1.7551982956909985e-05, + "loss": 1.6706, + "step": 23884 + }, + { + "epoch": 7.3311847759361575, + "grad_norm": 0.18868015706539154, + "learning_rate": 1.7548201404781e-05, + "loss": 1.6371, + "step": 23885 + }, + { + "epoch": 7.331491712707182, + "grad_norm": 0.18036094307899475, + "learning_rate": 1.7544420173359715e-05, + "loss": 1.7115, + "step": 23886 + }, + { + "epoch": 7.331798649478207, + "grad_norm": 0.17143553495407104, + "learning_rate": 1.754063926268349e-05, + "loss": 1.668, + "step": 23887 + }, + { + "epoch": 7.332105586249233, + "grad_norm": 0.1700706034898758, + "learning_rate": 1.7536858672789684e-05, + "loss": 1.7244, + "step": 23888 + }, + { + "epoch": 7.332412523020258, + "grad_norm": 0.1740385890007019, + "learning_rate": 1.7533078403715665e-05, + "loss": 1.7163, + "step": 23889 + }, + { + "epoch": 7.332719459791283, + "grad_norm": 0.206922248005867, + "learning_rate": 
1.752929845549882e-05, + "loss": 1.7572, + "step": 23890 + }, + { + "epoch": 7.333026396562309, + "grad_norm": 0.22770223021507263, + "learning_rate": 1.7525518828176445e-05, + "loss": 1.7391, + "step": 23891 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.203486829996109, + "learning_rate": 1.7521739521785962e-05, + "loss": 1.7664, + "step": 23892 + }, + { + "epoch": 7.333640270104358, + "grad_norm": 0.15539827942848206, + "learning_rate": 1.7517960536364652e-05, + "loss": 1.675, + "step": 23893 + }, + { + "epoch": 7.333947206875384, + "grad_norm": 0.18226636946201324, + "learning_rate": 1.7514181871949913e-05, + "loss": 1.7097, + "step": 23894 + }, + { + "epoch": 7.334254143646409, + "grad_norm": 0.1522573083639145, + "learning_rate": 1.751040352857907e-05, + "loss": 1.6783, + "step": 23895 + }, + { + "epoch": 7.334561080417434, + "grad_norm": 0.18082024157047272, + "learning_rate": 1.750662550628946e-05, + "loss": 1.752, + "step": 23896 + }, + { + "epoch": 7.334868017188459, + "grad_norm": 0.1968161165714264, + "learning_rate": 1.750284780511844e-05, + "loss": 1.7773, + "step": 23897 + }, + { + "epoch": 7.335174953959484, + "grad_norm": 0.17520470917224884, + "learning_rate": 1.7499070425103286e-05, + "loss": 1.7244, + "step": 23898 + }, + { + "epoch": 7.3354818907305095, + "grad_norm": 0.32224342226982117, + "learning_rate": 1.749529336628139e-05, + "loss": 1.8087, + "step": 23899 + }, + { + "epoch": 7.335788827501535, + "grad_norm": 0.25473707914352417, + "learning_rate": 1.7491516628690053e-05, + "loss": 1.7677, + "step": 23900 + }, + { + "epoch": 7.33609576427256, + "grad_norm": 0.20730654895305634, + "learning_rate": 1.7487740212366604e-05, + "loss": 1.7261, + "step": 23901 + }, + { + "epoch": 7.336402701043585, + "grad_norm": 0.22070205211639404, + "learning_rate": 1.748396411734836e-05, + "loss": 1.8024, + "step": 23902 + }, + { + "epoch": 7.33670963781461, + "grad_norm": 0.16921460628509521, + "learning_rate": 1.7480188343672647e-05, + "loss": 
1.6823, + "step": 23903 + }, + { + "epoch": 7.337016574585635, + "grad_norm": 0.16576658189296722, + "learning_rate": 1.747641289137677e-05, + "loss": 1.6563, + "step": 23904 + }, + { + "epoch": 7.337323511356661, + "grad_norm": 0.19541388750076294, + "learning_rate": 1.7472637760498046e-05, + "loss": 1.8023, + "step": 23905 + }, + { + "epoch": 7.337630448127686, + "grad_norm": 0.19848179817199707, + "learning_rate": 1.7468862951073754e-05, + "loss": 1.7395, + "step": 23906 + }, + { + "epoch": 7.337937384898711, + "grad_norm": 0.1627921313047409, + "learning_rate": 1.746508846314127e-05, + "loss": 1.6569, + "step": 23907 + }, + { + "epoch": 7.338244321669736, + "grad_norm": 0.1798046976327896, + "learning_rate": 1.7461314296737813e-05, + "loss": 1.6927, + "step": 23908 + }, + { + "epoch": 7.338551258440761, + "grad_norm": 0.17935742437839508, + "learning_rate": 1.7457540451900757e-05, + "loss": 1.701, + "step": 23909 + }, + { + "epoch": 7.338858195211786, + "grad_norm": 0.16761814057826996, + "learning_rate": 1.745376692866732e-05, + "loss": 1.6701, + "step": 23910 + }, + { + "epoch": 7.339165131982812, + "grad_norm": 0.1733570694923401, + "learning_rate": 1.7449993727074855e-05, + "loss": 1.705, + "step": 23911 + }, + { + "epoch": 7.339472068753837, + "grad_norm": 0.21162372827529907, + "learning_rate": 1.7446220847160626e-05, + "loss": 1.7703, + "step": 23912 + }, + { + "epoch": 7.3397790055248615, + "grad_norm": 0.18743988871574402, + "learning_rate": 1.7442448288961928e-05, + "loss": 1.6899, + "step": 23913 + }, + { + "epoch": 7.340085942295887, + "grad_norm": 0.19185546040534973, + "learning_rate": 1.743867605251605e-05, + "loss": 1.7483, + "step": 23914 + }, + { + "epoch": 7.340392879066912, + "grad_norm": 0.23066233098506927, + "learning_rate": 1.7434904137860232e-05, + "loss": 1.7564, + "step": 23915 + }, + { + "epoch": 7.3406998158379375, + "grad_norm": 0.18159757554531097, + "learning_rate": 1.743113254503179e-05, + "loss": 1.7136, + "step": 23916 + }, + 
{ + "epoch": 7.341006752608963, + "grad_norm": 0.22666020691394806, + "learning_rate": 1.7427361274067995e-05, + "loss": 1.7589, + "step": 23917 + }, + { + "epoch": 7.341313689379987, + "grad_norm": 0.18986108899116516, + "learning_rate": 1.74235903250061e-05, + "loss": 1.7429, + "step": 23918 + }, + { + "epoch": 7.341620626151013, + "grad_norm": 0.17987726628780365, + "learning_rate": 1.741981969788338e-05, + "loss": 1.7457, + "step": 23919 + }, + { + "epoch": 7.341927562922038, + "grad_norm": 0.2370992749929428, + "learning_rate": 1.7416049392737093e-05, + "loss": 1.7594, + "step": 23920 + }, + { + "epoch": 7.342234499693063, + "grad_norm": 0.18698690831661224, + "learning_rate": 1.7412279409604508e-05, + "loss": 1.7555, + "step": 23921 + }, + { + "epoch": 7.342541436464089, + "grad_norm": 0.18401117622852325, + "learning_rate": 1.7408509748522882e-05, + "loss": 1.7355, + "step": 23922 + }, + { + "epoch": 7.342848373235114, + "grad_norm": 0.22045543789863586, + "learning_rate": 1.7404740409529448e-05, + "loss": 1.7227, + "step": 23923 + }, + { + "epoch": 7.343155310006138, + "grad_norm": 0.24414709210395813, + "learning_rate": 1.7400971392661502e-05, + "loss": 1.7551, + "step": 23924 + }, + { + "epoch": 7.343462246777164, + "grad_norm": 0.1906892955303192, + "learning_rate": 1.739720269795623e-05, + "loss": 1.7204, + "step": 23925 + }, + { + "epoch": 7.343769183548189, + "grad_norm": 0.1840149164199829, + "learning_rate": 1.7393434325450948e-05, + "loss": 1.74, + "step": 23926 + }, + { + "epoch": 7.344076120319214, + "grad_norm": 0.21434549987316132, + "learning_rate": 1.7389666275182825e-05, + "loss": 1.6961, + "step": 23927 + }, + { + "epoch": 7.34438305709024, + "grad_norm": 0.19110503792762756, + "learning_rate": 1.7385898547189146e-05, + "loss": 1.7731, + "step": 23928 + }, + { + "epoch": 7.344689993861264, + "grad_norm": 0.18905460834503174, + "learning_rate": 1.7382131141507136e-05, + "loss": 1.6925, + "step": 23929 + }, + { + "epoch": 7.3449969306322895, 
+ "grad_norm": 0.16336308419704437, + "learning_rate": 1.7378364058174024e-05, + "loss": 1.7073, + "step": 23930 + }, + { + "epoch": 7.345303867403315, + "grad_norm": 0.16707782447338104, + "learning_rate": 1.7374597297227056e-05, + "loss": 1.7036, + "step": 23931 + }, + { + "epoch": 7.34561080417434, + "grad_norm": 0.19958938658237457, + "learning_rate": 1.7370830858703406e-05, + "loss": 1.7035, + "step": 23932 + }, + { + "epoch": 7.3459177409453655, + "grad_norm": 0.18446899950504303, + "learning_rate": 1.7367064742640348e-05, + "loss": 1.754, + "step": 23933 + }, + { + "epoch": 7.346224677716391, + "grad_norm": 0.19238999485969543, + "learning_rate": 1.736329894907508e-05, + "loss": 1.6903, + "step": 23934 + }, + { + "epoch": 7.346531614487415, + "grad_norm": 0.1985396146774292, + "learning_rate": 1.7359533478044825e-05, + "loss": 1.7342, + "step": 23935 + }, + { + "epoch": 7.346838551258441, + "grad_norm": 0.19200150668621063, + "learning_rate": 1.7355768329586784e-05, + "loss": 1.6915, + "step": 23936 + }, + { + "epoch": 7.347145488029466, + "grad_norm": 0.19772231578826904, + "learning_rate": 1.7352003503738186e-05, + "loss": 1.7341, + "step": 23937 + }, + { + "epoch": 7.347452424800491, + "grad_norm": 0.1961035579442978, + "learning_rate": 1.7348239000536214e-05, + "loss": 1.7395, + "step": 23938 + }, + { + "epoch": 7.347759361571517, + "grad_norm": 0.15188434720039368, + "learning_rate": 1.7344474820018087e-05, + "loss": 1.635, + "step": 23939 + }, + { + "epoch": 7.348066298342541, + "grad_norm": 0.18748410046100616, + "learning_rate": 1.734071096222098e-05, + "loss": 1.6878, + "step": 23940 + }, + { + "epoch": 7.348373235113566, + "grad_norm": 0.19337952136993408, + "learning_rate": 1.7336947427182143e-05, + "loss": 1.7532, + "step": 23941 + }, + { + "epoch": 7.348680171884592, + "grad_norm": 0.14804427325725555, + "learning_rate": 1.73331842149387e-05, + "loss": 1.683, + "step": 23942 + }, + { + "epoch": 7.348987108655617, + "grad_norm": 
0.18310968577861786, + "learning_rate": 1.7329421325527916e-05, + "loss": 1.718, + "step": 23943 + }, + { + "epoch": 7.349294045426642, + "grad_norm": 0.18589583039283752, + "learning_rate": 1.7325658758986906e-05, + "loss": 1.7115, + "step": 23944 + }, + { + "epoch": 7.349600982197667, + "grad_norm": 0.1618955284357071, + "learning_rate": 1.7321896515352904e-05, + "loss": 1.6757, + "step": 23945 + }, + { + "epoch": 7.349907918968692, + "grad_norm": 0.20092655718326569, + "learning_rate": 1.731813459466307e-05, + "loss": 1.7537, + "step": 23946 + }, + { + "epoch": 7.350214855739718, + "grad_norm": 0.17287038266658783, + "learning_rate": 1.7314372996954592e-05, + "loss": 1.6744, + "step": 23947 + }, + { + "epoch": 7.350521792510743, + "grad_norm": 0.19176220893859863, + "learning_rate": 1.731061172226465e-05, + "loss": 1.7279, + "step": 23948 + }, + { + "epoch": 7.350828729281768, + "grad_norm": 0.2060871571302414, + "learning_rate": 1.7306850770630367e-05, + "loss": 1.7802, + "step": 23949 + }, + { + "epoch": 7.351135666052793, + "grad_norm": 0.27185341715812683, + "learning_rate": 1.7303090142088967e-05, + "loss": 1.7234, + "step": 23950 + }, + { + "epoch": 7.351442602823818, + "grad_norm": 0.19845733046531677, + "learning_rate": 1.729932983667759e-05, + "loss": 1.7503, + "step": 23951 + }, + { + "epoch": 7.351749539594843, + "grad_norm": 0.19455648958683014, + "learning_rate": 1.729556985443341e-05, + "loss": 1.8096, + "step": 23952 + }, + { + "epoch": 7.352056476365869, + "grad_norm": 0.19090545177459717, + "learning_rate": 1.729181019539357e-05, + "loss": 1.6776, + "step": 23953 + }, + { + "epoch": 7.352363413136894, + "grad_norm": 0.16086700558662415, + "learning_rate": 1.728805085959524e-05, + "loss": 1.6829, + "step": 23954 + }, + { + "epoch": 7.352670349907919, + "grad_norm": 0.2156524360179901, + "learning_rate": 1.7284291847075555e-05, + "loss": 1.7147, + "step": 23955 + }, + { + "epoch": 7.352977286678944, + "grad_norm": 0.20258861780166626, + 
"learning_rate": 1.728053315787168e-05, + "loss": 1.7085, + "step": 23956 + }, + { + "epoch": 7.353284223449969, + "grad_norm": 0.1877330094575882, + "learning_rate": 1.7276774792020735e-05, + "loss": 1.7311, + "step": 23957 + }, + { + "epoch": 7.3535911602209945, + "grad_norm": 0.22096484899520874, + "learning_rate": 1.727301674955992e-05, + "loss": 1.6712, + "step": 23958 + }, + { + "epoch": 7.35389809699202, + "grad_norm": 0.21456706523895264, + "learning_rate": 1.726925903052629e-05, + "loss": 1.7773, + "step": 23959 + }, + { + "epoch": 7.354205033763045, + "grad_norm": 0.2114667296409607, + "learning_rate": 1.7265501634957072e-05, + "loss": 1.669, + "step": 23960 + }, + { + "epoch": 7.35451197053407, + "grad_norm": 0.1676410287618637, + "learning_rate": 1.726174456288931e-05, + "loss": 1.6673, + "step": 23961 + }, + { + "epoch": 7.354818907305095, + "grad_norm": 0.19883838295936584, + "learning_rate": 1.72579878143602e-05, + "loss": 1.6821, + "step": 23962 + }, + { + "epoch": 7.35512584407612, + "grad_norm": 0.19240599870681763, + "learning_rate": 1.725423138940684e-05, + "loss": 1.741, + "step": 23963 + }, + { + "epoch": 7.355432780847146, + "grad_norm": 0.230613574385643, + "learning_rate": 1.7250475288066363e-05, + "loss": 1.6937, + "step": 23964 + }, + { + "epoch": 7.355739717618171, + "grad_norm": 0.17126981914043427, + "learning_rate": 1.7246719510375898e-05, + "loss": 1.6791, + "step": 23965 + }, + { + "epoch": 7.356046654389196, + "grad_norm": 0.1852734386920929, + "learning_rate": 1.7242964056372518e-05, + "loss": 1.7196, + "step": 23966 + }, + { + "epoch": 7.356353591160221, + "grad_norm": 0.1922985464334488, + "learning_rate": 1.723920892609338e-05, + "loss": 1.794, + "step": 23967 + }, + { + "epoch": 7.356660527931246, + "grad_norm": 0.1918993592262268, + "learning_rate": 1.7235454119575582e-05, + "loss": 1.7725, + "step": 23968 + }, + { + "epoch": 7.356967464702271, + "grad_norm": 0.21787014603614807, + "learning_rate": 1.723169963685623e-05, + 
"loss": 1.7382, + "step": 23969 + }, + { + "epoch": 7.357274401473297, + "grad_norm": 0.23753544688224792, + "learning_rate": 1.722794547797243e-05, + "loss": 1.7924, + "step": 23970 + }, + { + "epoch": 7.357581338244322, + "grad_norm": 0.2251000851392746, + "learning_rate": 1.722419164296128e-05, + "loss": 1.6794, + "step": 23971 + }, + { + "epoch": 7.3578882750153465, + "grad_norm": 0.21573983132839203, + "learning_rate": 1.7220438131859878e-05, + "loss": 1.796, + "step": 23972 + }, + { + "epoch": 7.358195211786372, + "grad_norm": 0.217384472489357, + "learning_rate": 1.721668494470532e-05, + "loss": 1.7305, + "step": 23973 + }, + { + "epoch": 7.358502148557397, + "grad_norm": 0.21815331280231476, + "learning_rate": 1.7212932081534677e-05, + "loss": 1.7348, + "step": 23974 + }, + { + "epoch": 7.3588090853284225, + "grad_norm": 0.19974499940872192, + "learning_rate": 1.7209179542385097e-05, + "loss": 1.7383, + "step": 23975 + }, + { + "epoch": 7.359116022099448, + "grad_norm": 0.20518191158771515, + "learning_rate": 1.7205427327293582e-05, + "loss": 1.7087, + "step": 23976 + }, + { + "epoch": 7.359422958870473, + "grad_norm": 0.17104744911193848, + "learning_rate": 1.7201675436297293e-05, + "loss": 1.718, + "step": 23977 + }, + { + "epoch": 7.359729895641498, + "grad_norm": 0.2165975421667099, + "learning_rate": 1.7197923869433235e-05, + "loss": 1.7907, + "step": 23978 + }, + { + "epoch": 7.360036832412523, + "grad_norm": 0.1784742921590805, + "learning_rate": 1.719417262673854e-05, + "loss": 1.6354, + "step": 23979 + }, + { + "epoch": 7.360343769183548, + "grad_norm": 0.1867162138223648, + "learning_rate": 1.719042170825026e-05, + "loss": 1.7264, + "step": 23980 + }, + { + "epoch": 7.360650705954574, + "grad_norm": 0.19704937934875488, + "learning_rate": 1.7186671114005458e-05, + "loss": 1.72, + "step": 23981 + }, + { + "epoch": 7.360957642725599, + "grad_norm": 0.20316866040229797, + "learning_rate": 1.718292084404123e-05, + "loss": 1.759, + "step": 23982 + }, + 
{ + "epoch": 7.361264579496623, + "grad_norm": 0.20339833199977875, + "learning_rate": 1.717917089839457e-05, + "loss": 1.7537, + "step": 23983 + }, + { + "epoch": 7.361571516267649, + "grad_norm": 0.18114012479782104, + "learning_rate": 1.71754212771026e-05, + "loss": 1.7207, + "step": 23984 + }, + { + "epoch": 7.361878453038674, + "grad_norm": 0.16071686148643494, + "learning_rate": 1.7171671980202353e-05, + "loss": 1.6534, + "step": 23985 + }, + { + "epoch": 7.362185389809699, + "grad_norm": 0.15212370455265045, + "learning_rate": 1.7167923007730892e-05, + "loss": 1.6638, + "step": 23986 + }, + { + "epoch": 7.362492326580725, + "grad_norm": 0.16284595429897308, + "learning_rate": 1.7164174359725253e-05, + "loss": 1.7442, + "step": 23987 + }, + { + "epoch": 7.362799263351749, + "grad_norm": 0.18302884697914124, + "learning_rate": 1.7160426036222494e-05, + "loss": 1.7087, + "step": 23988 + }, + { + "epoch": 7.3631062001227745, + "grad_norm": 0.18764640390872955, + "learning_rate": 1.715667803725965e-05, + "loss": 1.702, + "step": 23989 + }, + { + "epoch": 7.3634131368938, + "grad_norm": 0.16912522912025452, + "learning_rate": 1.7152930362873758e-05, + "loss": 1.742, + "step": 23990 + }, + { + "epoch": 7.363720073664825, + "grad_norm": 0.21137015521526337, + "learning_rate": 1.714918301310185e-05, + "loss": 1.7074, + "step": 23991 + }, + { + "epoch": 7.3640270104358505, + "grad_norm": 0.17562401294708252, + "learning_rate": 1.7145435987981008e-05, + "loss": 1.69, + "step": 23992 + }, + { + "epoch": 7.364333947206875, + "grad_norm": 0.15575642883777618, + "learning_rate": 1.714168928754818e-05, + "loss": 1.6986, + "step": 23993 + }, + { + "epoch": 7.3646408839779, + "grad_norm": 0.18057680130004883, + "learning_rate": 1.7137942911840477e-05, + "loss": 1.7661, + "step": 23994 + }, + { + "epoch": 7.364947820748926, + "grad_norm": 0.18899883329868317, + "learning_rate": 1.7134196860894853e-05, + "loss": 1.6841, + "step": 23995 + }, + { + "epoch": 7.365254757519951, + 
"grad_norm": 0.15350781381130219, + "learning_rate": 1.7130451134748367e-05, + "loss": 1.7005, + "step": 23996 + }, + { + "epoch": 7.365561694290976, + "grad_norm": 0.20394811034202576, + "learning_rate": 1.7126705733438037e-05, + "loss": 1.7342, + "step": 23997 + }, + { + "epoch": 7.365868631062002, + "grad_norm": 0.1881636083126068, + "learning_rate": 1.7122960657000864e-05, + "loss": 1.6985, + "step": 23998 + }, + { + "epoch": 7.366175567833026, + "grad_norm": 0.1619534194469452, + "learning_rate": 1.711921590547388e-05, + "loss": 1.6579, + "step": 23999 + }, + { + "epoch": 7.366482504604051, + "grad_norm": 0.16795861721038818, + "learning_rate": 1.711547147889404e-05, + "loss": 1.717, + "step": 24000 + }, + { + "epoch": 7.366789441375077, + "grad_norm": 0.1452684998512268, + "learning_rate": 1.711172737729841e-05, + "loss": 1.6792, + "step": 24001 + }, + { + "epoch": 7.367096378146102, + "grad_norm": 0.14940062165260315, + "learning_rate": 1.710798360072396e-05, + "loss": 1.6731, + "step": 24002 + }, + { + "epoch": 7.367403314917127, + "grad_norm": 0.21277321875095367, + "learning_rate": 1.7104240149207694e-05, + "loss": 1.7145, + "step": 24003 + }, + { + "epoch": 7.367710251688152, + "grad_norm": 0.17097726464271545, + "learning_rate": 1.710049702278661e-05, + "loss": 1.7052, + "step": 24004 + }, + { + "epoch": 7.368017188459177, + "grad_norm": 0.15970511734485626, + "learning_rate": 1.7096754221497702e-05, + "loss": 1.6586, + "step": 24005 + }, + { + "epoch": 7.3683241252302025, + "grad_norm": 0.198451429605484, + "learning_rate": 1.7093011745377945e-05, + "loss": 1.7449, + "step": 24006 + }, + { + "epoch": 7.368631062001228, + "grad_norm": 0.19554266333580017, + "learning_rate": 1.7089269594464342e-05, + "loss": 1.7455, + "step": 24007 + }, + { + "epoch": 7.368937998772253, + "grad_norm": 0.1854190230369568, + "learning_rate": 1.7085527768793847e-05, + "loss": 1.7355, + "step": 24008 + }, + { + "epoch": 7.3692449355432785, + "grad_norm": 0.17093004286289215, 
+ "learning_rate": 1.708178626840349e-05, + "loss": 1.6813, + "step": 24009 + }, + { + "epoch": 7.369551872314303, + "grad_norm": 0.15385115146636963, + "learning_rate": 1.707804509333018e-05, + "loss": 1.664, + "step": 24010 + }, + { + "epoch": 7.369858809085328, + "grad_norm": 0.18747489154338837, + "learning_rate": 1.7074304243610963e-05, + "loss": 1.787, + "step": 24011 + }, + { + "epoch": 7.370165745856354, + "grad_norm": 0.21749509871006012, + "learning_rate": 1.7070563719282734e-05, + "loss": 1.723, + "step": 24012 + }, + { + "epoch": 7.370472682627379, + "grad_norm": 0.18973985314369202, + "learning_rate": 1.7066823520382508e-05, + "loss": 1.7415, + "step": 24013 + }, + { + "epoch": 7.370779619398404, + "grad_norm": 0.24844922125339508, + "learning_rate": 1.706308364694724e-05, + "loss": 1.7617, + "step": 24014 + }, + { + "epoch": 7.371086556169429, + "grad_norm": 0.16565518081188202, + "learning_rate": 1.705934409901388e-05, + "loss": 1.6781, + "step": 24015 + }, + { + "epoch": 7.371393492940454, + "grad_norm": 0.22595234215259552, + "learning_rate": 1.705560487661941e-05, + "loss": 1.7706, + "step": 24016 + }, + { + "epoch": 7.371700429711479, + "grad_norm": 0.2452661544084549, + "learning_rate": 1.7051865979800723e-05, + "loss": 1.8227, + "step": 24017 + }, + { + "epoch": 7.372007366482505, + "grad_norm": 0.2285550981760025, + "learning_rate": 1.7048127408594834e-05, + "loss": 1.7554, + "step": 24018 + }, + { + "epoch": 7.37231430325353, + "grad_norm": 0.22723950445652008, + "learning_rate": 1.7044389163038656e-05, + "loss": 1.7152, + "step": 24019 + }, + { + "epoch": 7.3726212400245545, + "grad_norm": 0.20335997641086578, + "learning_rate": 1.7040651243169143e-05, + "loss": 1.6661, + "step": 24020 + }, + { + "epoch": 7.37292817679558, + "grad_norm": 0.27618682384490967, + "learning_rate": 1.703691364902323e-05, + "loss": 1.8375, + "step": 24021 + }, + { + "epoch": 7.373235113566605, + "grad_norm": 0.24076996743679047, + "learning_rate": 
1.7033176380637856e-05, + "loss": 1.7581, + "step": 24022 + }, + { + "epoch": 7.3735420503376305, + "grad_norm": 0.21615716814994812, + "learning_rate": 1.702943943804996e-05, + "loss": 1.7047, + "step": 24023 + }, + { + "epoch": 7.373848987108656, + "grad_norm": 0.23503927886486053, + "learning_rate": 1.7025702821296462e-05, + "loss": 1.7926, + "step": 24024 + }, + { + "epoch": 7.37415592387968, + "grad_norm": 0.2344675064086914, + "learning_rate": 1.7021966530414303e-05, + "loss": 1.747, + "step": 24025 + }, + { + "epoch": 7.374462860650706, + "grad_norm": 0.20946700870990753, + "learning_rate": 1.701823056544039e-05, + "loss": 1.746, + "step": 24026 + }, + { + "epoch": 7.374769797421731, + "grad_norm": 0.26749730110168457, + "learning_rate": 1.7014494926411645e-05, + "loss": 1.7375, + "step": 24027 + }, + { + "epoch": 7.375076734192756, + "grad_norm": 0.19716335833072662, + "learning_rate": 1.701075961336503e-05, + "loss": 1.6677, + "step": 24028 + }, + { + "epoch": 7.375383670963782, + "grad_norm": 0.1999496966600418, + "learning_rate": 1.7007024626337382e-05, + "loss": 1.6665, + "step": 24029 + }, + { + "epoch": 7.375690607734807, + "grad_norm": 0.188812255859375, + "learning_rate": 1.7003289965365676e-05, + "loss": 1.7344, + "step": 24030 + }, + { + "epoch": 7.3759975445058314, + "grad_norm": 0.20171904563903809, + "learning_rate": 1.6999555630486795e-05, + "loss": 1.7452, + "step": 24031 + }, + { + "epoch": 7.376304481276857, + "grad_norm": 0.21260966360569, + "learning_rate": 1.6995821621737655e-05, + "loss": 1.7759, + "step": 24032 + }, + { + "epoch": 7.376611418047882, + "grad_norm": 0.1913561075925827, + "learning_rate": 1.699208793915516e-05, + "loss": 1.7342, + "step": 24033 + }, + { + "epoch": 7.3769183548189075, + "grad_norm": 0.1907757967710495, + "learning_rate": 1.6988354582776166e-05, + "loss": 1.6511, + "step": 24034 + }, + { + "epoch": 7.377225291589933, + "grad_norm": 0.15012076497077942, + "learning_rate": 1.6984621552637625e-05, + "loss": 
1.6638, + "step": 24035 + }, + { + "epoch": 7.377532228360957, + "grad_norm": 0.17761732637882233, + "learning_rate": 1.6980888848776394e-05, + "loss": 1.7035, + "step": 24036 + }, + { + "epoch": 7.377839165131983, + "grad_norm": 0.15940140187740326, + "learning_rate": 1.6977156471229376e-05, + "loss": 1.6532, + "step": 24037 + }, + { + "epoch": 7.378146101903008, + "grad_norm": 0.19022013247013092, + "learning_rate": 1.6973424420033455e-05, + "loss": 1.7545, + "step": 24038 + }, + { + "epoch": 7.378453038674033, + "grad_norm": 0.1900233030319214, + "learning_rate": 1.6969692695225513e-05, + "loss": 1.7051, + "step": 24039 + }, + { + "epoch": 7.378759975445059, + "grad_norm": 0.17687582969665527, + "learning_rate": 1.6965961296842425e-05, + "loss": 1.6819, + "step": 24040 + }, + { + "epoch": 7.379066912216084, + "grad_norm": 0.16323260962963104, + "learning_rate": 1.696223022492107e-05, + "loss": 1.6642, + "step": 24041 + }, + { + "epoch": 7.379373848987108, + "grad_norm": 0.21163886785507202, + "learning_rate": 1.695849947949832e-05, + "loss": 1.6973, + "step": 24042 + }, + { + "epoch": 7.379680785758134, + "grad_norm": 0.1713307648897171, + "learning_rate": 1.6954769060611043e-05, + "loss": 1.677, + "step": 24043 + }, + { + "epoch": 7.379987722529159, + "grad_norm": 0.19575951993465424, + "learning_rate": 1.695103896829609e-05, + "loss": 1.7305, + "step": 24044 + }, + { + "epoch": 7.380294659300184, + "grad_norm": 0.16087177395820618, + "learning_rate": 1.6947309202590377e-05, + "loss": 1.6435, + "step": 24045 + }, + { + "epoch": 7.38060159607121, + "grad_norm": 0.2088652402162552, + "learning_rate": 1.6943579763530692e-05, + "loss": 1.7136, + "step": 24046 + }, + { + "epoch": 7.380908532842234, + "grad_norm": 0.18253973126411438, + "learning_rate": 1.693985065115396e-05, + "loss": 1.7461, + "step": 24047 + }, + { + "epoch": 7.3812154696132595, + "grad_norm": 0.272062212228775, + "learning_rate": 1.6936121865496967e-05, + "loss": 1.7455, + "step": 24048 + }, + { 
+ "epoch": 7.381522406384285, + "grad_norm": 0.1884320080280304, + "learning_rate": 1.6932393406596613e-05, + "loss": 1.7242, + "step": 24049 + }, + { + "epoch": 7.38182934315531, + "grad_norm": 0.22986121475696564, + "learning_rate": 1.6928665274489748e-05, + "loss": 1.7461, + "step": 24050 + }, + { + "epoch": 7.3821362799263355, + "grad_norm": 0.19400665163993835, + "learning_rate": 1.6924937469213158e-05, + "loss": 1.7468, + "step": 24051 + }, + { + "epoch": 7.382443216697361, + "grad_norm": 0.1990167796611786, + "learning_rate": 1.6921209990803744e-05, + "loss": 1.7253, + "step": 24052 + }, + { + "epoch": 7.382750153468385, + "grad_norm": 0.16667480766773224, + "learning_rate": 1.691748283929832e-05, + "loss": 1.6763, + "step": 24053 + }, + { + "epoch": 7.383057090239411, + "grad_norm": 0.20539991557598114, + "learning_rate": 1.691375601473372e-05, + "loss": 1.7408, + "step": 24054 + }, + { + "epoch": 7.383364027010436, + "grad_norm": 0.18021859228610992, + "learning_rate": 1.6910029517146776e-05, + "loss": 1.7075, + "step": 24055 + }, + { + "epoch": 7.383670963781461, + "grad_norm": 0.17450939118862152, + "learning_rate": 1.6906303346574314e-05, + "loss": 1.7074, + "step": 24056 + }, + { + "epoch": 7.383977900552487, + "grad_norm": 0.1690986454486847, + "learning_rate": 1.690257750305316e-05, + "loss": 1.6911, + "step": 24057 + }, + { + "epoch": 7.384284837323511, + "grad_norm": 0.19716380536556244, + "learning_rate": 1.6898851986620136e-05, + "loss": 1.7075, + "step": 24058 + }, + { + "epoch": 7.384591774094536, + "grad_norm": 0.20165397226810455, + "learning_rate": 1.6895126797312054e-05, + "loss": 1.7201, + "step": 24059 + }, + { + "epoch": 7.384898710865562, + "grad_norm": 0.22149543464183807, + "learning_rate": 1.6891401935165734e-05, + "loss": 1.7407, + "step": 24060 + }, + { + "epoch": 7.385205647636587, + "grad_norm": 0.1575438529253006, + "learning_rate": 1.6887677400217966e-05, + "loss": 1.6451, + "step": 24061 + }, + { + "epoch": 7.385512584407612, 
+ "grad_norm": 0.18075503408908844, + "learning_rate": 1.688395319250562e-05, + "loss": 1.7084, + "step": 24062 + }, + { + "epoch": 7.385819521178637, + "grad_norm": 0.16428421437740326, + "learning_rate": 1.6880229312065414e-05, + "loss": 1.7047, + "step": 24063 + }, + { + "epoch": 7.386126457949662, + "grad_norm": 0.18372805416584015, + "learning_rate": 1.6876505758934237e-05, + "loss": 1.6726, + "step": 24064 + }, + { + "epoch": 7.3864333947206875, + "grad_norm": 0.199292853474617, + "learning_rate": 1.687278253314882e-05, + "loss": 1.7472, + "step": 24065 + }, + { + "epoch": 7.386740331491713, + "grad_norm": 0.20381483435630798, + "learning_rate": 1.686905963474597e-05, + "loss": 1.7128, + "step": 24066 + }, + { + "epoch": 7.387047268262738, + "grad_norm": 0.18497546017169952, + "learning_rate": 1.6865337063762527e-05, + "loss": 1.736, + "step": 24067 + }, + { + "epoch": 7.387354205033763, + "grad_norm": 0.21320439875125885, + "learning_rate": 1.6861614820235206e-05, + "loss": 1.7391, + "step": 24068 + }, + { + "epoch": 7.387661141804788, + "grad_norm": 0.22324618697166443, + "learning_rate": 1.6857892904200863e-05, + "loss": 1.7384, + "step": 24069 + }, + { + "epoch": 7.387968078575813, + "grad_norm": 0.18035978078842163, + "learning_rate": 1.6854171315696216e-05, + "loss": 1.7029, + "step": 24070 + }, + { + "epoch": 7.388275015346839, + "grad_norm": 0.1727912276983261, + "learning_rate": 1.6850450054758092e-05, + "loss": 1.6649, + "step": 24071 + }, + { + "epoch": 7.388581952117864, + "grad_norm": 0.19713124632835388, + "learning_rate": 1.6846729121423256e-05, + "loss": 1.7508, + "step": 24072 + }, + { + "epoch": 7.388888888888889, + "grad_norm": 0.19403581321239471, + "learning_rate": 1.6843008515728464e-05, + "loss": 1.7807, + "step": 24073 + }, + { + "epoch": 7.389195825659914, + "grad_norm": 0.20204444229602814, + "learning_rate": 1.6839288237710503e-05, + "loss": 1.778, + "step": 24074 + }, + { + "epoch": 7.389502762430939, + "grad_norm": 
0.20021478831768036, + "learning_rate": 1.6835568287406127e-05, + "loss": 1.7544, + "step": 24075 + }, + { + "epoch": 7.389809699201964, + "grad_norm": 0.2247730791568756, + "learning_rate": 1.6831848664852107e-05, + "loss": 1.7422, + "step": 24076 + }, + { + "epoch": 7.39011663597299, + "grad_norm": 0.21600402891635895, + "learning_rate": 1.68281293700852e-05, + "loss": 1.7491, + "step": 24077 + }, + { + "epoch": 7.390423572744015, + "grad_norm": 0.1854497194290161, + "learning_rate": 1.6824410403142145e-05, + "loss": 1.7292, + "step": 24078 + }, + { + "epoch": 7.3907305095150395, + "grad_norm": 0.21738949418067932, + "learning_rate": 1.6820691764059736e-05, + "loss": 1.6996, + "step": 24079 + }, + { + "epoch": 7.391037446286065, + "grad_norm": 0.20114775002002716, + "learning_rate": 1.6816973452874674e-05, + "loss": 1.7299, + "step": 24080 + }, + { + "epoch": 7.39134438305709, + "grad_norm": 0.17267082631587982, + "learning_rate": 1.681325546962376e-05, + "loss": 1.7181, + "step": 24081 + }, + { + "epoch": 7.3916513198281155, + "grad_norm": 0.1681009829044342, + "learning_rate": 1.680953781434369e-05, + "loss": 1.6826, + "step": 24082 + }, + { + "epoch": 7.391958256599141, + "grad_norm": 0.18807077407836914, + "learning_rate": 1.6805820487071205e-05, + "loss": 1.6934, + "step": 24083 + }, + { + "epoch": 7.392265193370166, + "grad_norm": 0.1859835982322693, + "learning_rate": 1.680210348784309e-05, + "loss": 1.7065, + "step": 24084 + }, + { + "epoch": 7.392572130141191, + "grad_norm": 0.20433956384658813, + "learning_rate": 1.679838681669601e-05, + "loss": 1.7934, + "step": 24085 + }, + { + "epoch": 7.392879066912216, + "grad_norm": 0.2428809553384781, + "learning_rate": 1.679467047366677e-05, + "loss": 1.7619, + "step": 24086 + }, + { + "epoch": 7.393186003683241, + "grad_norm": 0.25117191672325134, + "learning_rate": 1.6790954458792025e-05, + "loss": 1.7254, + "step": 24087 + }, + { + "epoch": 7.393492940454267, + "grad_norm": 0.19429172575473785, + 
"learning_rate": 1.6787238772108544e-05, + "loss": 1.6946, + "step": 24088 + }, + { + "epoch": 7.393799877225292, + "grad_norm": 0.18574993312358856, + "learning_rate": 1.678352341365304e-05, + "loss": 1.6953, + "step": 24089 + }, + { + "epoch": 7.394106813996316, + "grad_norm": 0.21022208034992218, + "learning_rate": 1.6779808383462227e-05, + "loss": 1.7866, + "step": 24090 + }, + { + "epoch": 7.394413750767342, + "grad_norm": 0.16711890697479248, + "learning_rate": 1.6776093681572818e-05, + "loss": 1.6988, + "step": 24091 + }, + { + "epoch": 7.394720687538367, + "grad_norm": 0.23661695420742035, + "learning_rate": 1.6772379308021524e-05, + "loss": 1.7152, + "step": 24092 + }, + { + "epoch": 7.395027624309392, + "grad_norm": 0.18410098552703857, + "learning_rate": 1.6768665262845052e-05, + "loss": 1.6643, + "step": 24093 + }, + { + "epoch": 7.395334561080418, + "grad_norm": 0.19566760957241058, + "learning_rate": 1.676495154608011e-05, + "loss": 1.7371, + "step": 24094 + }, + { + "epoch": 7.395641497851442, + "grad_norm": 0.18130381405353546, + "learning_rate": 1.6761238157763375e-05, + "loss": 1.6934, + "step": 24095 + }, + { + "epoch": 7.3959484346224675, + "grad_norm": 0.16141927242279053, + "learning_rate": 1.6757525097931603e-05, + "loss": 1.6629, + "step": 24096 + }, + { + "epoch": 7.396255371393493, + "grad_norm": 0.18370656669139862, + "learning_rate": 1.6753812366621418e-05, + "loss": 1.6931, + "step": 24097 + }, + { + "epoch": 7.396562308164518, + "grad_norm": 0.17368416488170624, + "learning_rate": 1.675009996386958e-05, + "loss": 1.7028, + "step": 24098 + }, + { + "epoch": 7.3968692449355435, + "grad_norm": 0.1704222410917282, + "learning_rate": 1.6746387889712722e-05, + "loss": 1.7241, + "step": 24099 + }, + { + "epoch": 7.397176181706568, + "grad_norm": 0.19127961993217468, + "learning_rate": 1.674267614418754e-05, + "loss": 1.6606, + "step": 24100 + }, + { + "epoch": 7.397483118477593, + "grad_norm": 0.20173178613185883, + "learning_rate": 
1.673896472733075e-05, + "loss": 1.7293, + "step": 24101 + }, + { + "epoch": 7.397790055248619, + "grad_norm": 0.194651797413826, + "learning_rate": 1.6735253639178977e-05, + "loss": 1.6889, + "step": 24102 + }, + { + "epoch": 7.398096992019644, + "grad_norm": 0.16184480488300323, + "learning_rate": 1.6731542879768957e-05, + "loss": 1.6929, + "step": 24103 + }, + { + "epoch": 7.398403928790669, + "grad_norm": 0.21806742250919342, + "learning_rate": 1.67278324491373e-05, + "loss": 1.6944, + "step": 24104 + }, + { + "epoch": 7.398710865561695, + "grad_norm": 0.1599469929933548, + "learning_rate": 1.6724122347320715e-05, + "loss": 1.7107, + "step": 24105 + }, + { + "epoch": 7.399017802332719, + "grad_norm": 0.18621234595775604, + "learning_rate": 1.672041257435586e-05, + "loss": 1.6856, + "step": 24106 + }, + { + "epoch": 7.399324739103744, + "grad_norm": 0.20682603120803833, + "learning_rate": 1.6716703130279393e-05, + "loss": 1.7699, + "step": 24107 + }, + { + "epoch": 7.39963167587477, + "grad_norm": 0.19649554789066315, + "learning_rate": 1.6712994015127976e-05, + "loss": 1.7049, + "step": 24108 + }, + { + "epoch": 7.399938612645795, + "grad_norm": 0.15894706547260284, + "learning_rate": 1.6709285228938255e-05, + "loss": 1.7352, + "step": 24109 + }, + { + "epoch": 7.4002455494168204, + "grad_norm": 0.22186337411403656, + "learning_rate": 1.6705576771746896e-05, + "loss": 1.7353, + "step": 24110 + }, + { + "epoch": 7.400552486187845, + "grad_norm": 0.14689651131629944, + "learning_rate": 1.670186864359054e-05, + "loss": 1.7155, + "step": 24111 + }, + { + "epoch": 7.40085942295887, + "grad_norm": 0.2055603563785553, + "learning_rate": 1.6698160844505817e-05, + "loss": 1.6897, + "step": 24112 + }, + { + "epoch": 7.401166359729896, + "grad_norm": 0.1641531139612198, + "learning_rate": 1.6694453374529423e-05, + "loss": 1.67, + "step": 24113 + }, + { + "epoch": 7.401473296500921, + "grad_norm": 0.21150687336921692, + "learning_rate": 1.6690746233697923e-05, + "loss": 
1.7507, + "step": 24114 + }, + { + "epoch": 7.401780233271946, + "grad_norm": 0.1844765543937683, + "learning_rate": 1.6687039422048035e-05, + "loss": 1.702, + "step": 24115 + }, + { + "epoch": 7.402087170042972, + "grad_norm": 0.1695966124534607, + "learning_rate": 1.6683332939616326e-05, + "loss": 1.6683, + "step": 24116 + }, + { + "epoch": 7.402394106813996, + "grad_norm": 0.17938567698001862, + "learning_rate": 1.667962678643943e-05, + "loss": 1.6947, + "step": 24117 + }, + { + "epoch": 7.402701043585021, + "grad_norm": 0.16420964896678925, + "learning_rate": 1.6675920962554027e-05, + "loss": 1.755, + "step": 24118 + }, + { + "epoch": 7.403007980356047, + "grad_norm": 0.16095438599586487, + "learning_rate": 1.667221546799667e-05, + "loss": 1.6855, + "step": 24119 + }, + { + "epoch": 7.403314917127072, + "grad_norm": 0.2089291363954544, + "learning_rate": 1.6668510302804052e-05, + "loss": 1.7213, + "step": 24120 + }, + { + "epoch": 7.403621853898097, + "grad_norm": 0.18369436264038086, + "learning_rate": 1.6664805467012717e-05, + "loss": 1.6913, + "step": 24121 + }, + { + "epoch": 7.403928790669122, + "grad_norm": 0.16405323147773743, + "learning_rate": 1.6661100960659326e-05, + "loss": 1.6529, + "step": 24122 + }, + { + "epoch": 7.404235727440147, + "grad_norm": 0.20792648196220398, + "learning_rate": 1.6657396783780477e-05, + "loss": 1.6855, + "step": 24123 + }, + { + "epoch": 7.4045426642111725, + "grad_norm": 0.17733097076416016, + "learning_rate": 1.6653692936412773e-05, + "loss": 1.727, + "step": 24124 + }, + { + "epoch": 7.404849600982198, + "grad_norm": 0.16196851432323456, + "learning_rate": 1.6649989418592825e-05, + "loss": 1.7376, + "step": 24125 + }, + { + "epoch": 7.405156537753223, + "grad_norm": 0.17193716764450073, + "learning_rate": 1.664628623035723e-05, + "loss": 1.6802, + "step": 24126 + }, + { + "epoch": 7.4054634745242485, + "grad_norm": 0.22076182067394257, + "learning_rate": 1.6642583371742576e-05, + "loss": 1.7512, + "step": 24127 + }, + 
{ + "epoch": 7.405770411295273, + "grad_norm": 0.20766951143741608, + "learning_rate": 1.663888084278547e-05, + "loss": 1.7457, + "step": 24128 + }, + { + "epoch": 7.406077348066298, + "grad_norm": 0.16815492510795593, + "learning_rate": 1.663517864352248e-05, + "loss": 1.6867, + "step": 24129 + }, + { + "epoch": 7.406384284837324, + "grad_norm": 0.19644804298877716, + "learning_rate": 1.6631476773990246e-05, + "loss": 1.6996, + "step": 24130 + }, + { + "epoch": 7.406691221608349, + "grad_norm": 0.18717117607593536, + "learning_rate": 1.662777523422528e-05, + "loss": 1.7745, + "step": 24131 + }, + { + "epoch": 7.406998158379374, + "grad_norm": 0.1679331511259079, + "learning_rate": 1.662407402426423e-05, + "loss": 1.7213, + "step": 24132 + }, + { + "epoch": 7.407305095150399, + "grad_norm": 0.1721929907798767, + "learning_rate": 1.662037314414363e-05, + "loss": 1.6759, + "step": 24133 + }, + { + "epoch": 7.407612031921424, + "grad_norm": 0.15507890284061432, + "learning_rate": 1.661667259390005e-05, + "loss": 1.6658, + "step": 24134 + }, + { + "epoch": 7.407918968692449, + "grad_norm": 0.20528049767017365, + "learning_rate": 1.6612972373570114e-05, + "loss": 1.7508, + "step": 24135 + }, + { + "epoch": 7.408225905463475, + "grad_norm": 0.20593658089637756, + "learning_rate": 1.6609272483190315e-05, + "loss": 1.8078, + "step": 24136 + }, + { + "epoch": 7.4085328422345, + "grad_norm": 0.19905441999435425, + "learning_rate": 1.6605572922797292e-05, + "loss": 1.7933, + "step": 24137 + }, + { + "epoch": 7.4088397790055245, + "grad_norm": 0.17571881413459778, + "learning_rate": 1.6601873692427537e-05, + "loss": 1.6908, + "step": 24138 + }, + { + "epoch": 7.40914671577655, + "grad_norm": 0.2244982272386551, + "learning_rate": 1.6598174792117655e-05, + "loss": 1.6998, + "step": 24139 + }, + { + "epoch": 7.409453652547575, + "grad_norm": 0.15267951786518097, + "learning_rate": 1.6594476221904193e-05, + "loss": 1.6399, + "step": 24140 + }, + { + "epoch": 7.4097605893186005, + 
"grad_norm": 0.24161390960216522, + "learning_rate": 1.659077798182369e-05, + "loss": 1.6776, + "step": 24141 + }, + { + "epoch": 7.410067526089626, + "grad_norm": 0.17184343934059143, + "learning_rate": 1.658708007191271e-05, + "loss": 1.7169, + "step": 24142 + }, + { + "epoch": 7.41037446286065, + "grad_norm": 0.1589801162481308, + "learning_rate": 1.6583382492207778e-05, + "loss": 1.6727, + "step": 24143 + }, + { + "epoch": 7.410681399631676, + "grad_norm": 0.18666890263557434, + "learning_rate": 1.6579685242745452e-05, + "loss": 1.7429, + "step": 24144 + }, + { + "epoch": 7.410988336402701, + "grad_norm": 0.22418901324272156, + "learning_rate": 1.6575988323562265e-05, + "loss": 1.7834, + "step": 24145 + }, + { + "epoch": 7.411295273173726, + "grad_norm": 0.1897875964641571, + "learning_rate": 1.6572291734694734e-05, + "loss": 1.7271, + "step": 24146 + }, + { + "epoch": 7.411602209944752, + "grad_norm": 0.18204644322395325, + "learning_rate": 1.6568595476179445e-05, + "loss": 1.7003, + "step": 24147 + }, + { + "epoch": 7.411909146715777, + "grad_norm": 0.19130240380764008, + "learning_rate": 1.6564899548052853e-05, + "loss": 1.6803, + "step": 24148 + }, + { + "epoch": 7.412216083486801, + "grad_norm": 0.19467706978321075, + "learning_rate": 1.6561203950351554e-05, + "loss": 1.7529, + "step": 24149 + }, + { + "epoch": 7.412523020257827, + "grad_norm": 0.20290352404117584, + "learning_rate": 1.655750868311202e-05, + "loss": 1.7742, + "step": 24150 + }, + { + "epoch": 7.412829957028852, + "grad_norm": 0.18538729846477509, + "learning_rate": 1.6553813746370772e-05, + "loss": 1.68, + "step": 24151 + }, + { + "epoch": 7.413136893799877, + "grad_norm": 0.23339742422103882, + "learning_rate": 1.655011914016437e-05, + "loss": 1.7499, + "step": 24152 + }, + { + "epoch": 7.413443830570903, + "grad_norm": 0.21964092552661896, + "learning_rate": 1.654642486452927e-05, + "loss": 1.7394, + "step": 24153 + }, + { + "epoch": 7.413750767341927, + "grad_norm": 0.2131531536579132, 
+ "learning_rate": 1.6542730919502032e-05, + "loss": 1.6928, + "step": 24154 + }, + { + "epoch": 7.4140577041129525, + "grad_norm": 0.20840130746364594, + "learning_rate": 1.653903730511911e-05, + "loss": 1.6785, + "step": 24155 + }, + { + "epoch": 7.414364640883978, + "grad_norm": 0.1519836038351059, + "learning_rate": 1.653534402141705e-05, + "loss": 1.6882, + "step": 24156 + }, + { + "epoch": 7.414671577655003, + "grad_norm": 0.21539351344108582, + "learning_rate": 1.653165106843233e-05, + "loss": 1.7041, + "step": 24157 + }, + { + "epoch": 7.4149785144260285, + "grad_norm": 0.2050703912973404, + "learning_rate": 1.6527958446201453e-05, + "loss": 1.7854, + "step": 24158 + }, + { + "epoch": 7.415285451197054, + "grad_norm": 0.21595771610736847, + "learning_rate": 1.652426615476091e-05, + "loss": 1.7305, + "step": 24159 + }, + { + "epoch": 7.415592387968078, + "grad_norm": 0.19248713552951813, + "learning_rate": 1.6520574194147186e-05, + "loss": 1.6834, + "step": 24160 + }, + { + "epoch": 7.415899324739104, + "grad_norm": 0.178158700466156, + "learning_rate": 1.6516882564396774e-05, + "loss": 1.7312, + "step": 24161 + }, + { + "epoch": 7.416206261510129, + "grad_norm": 0.18686197698116302, + "learning_rate": 1.6513191265546152e-05, + "loss": 1.7025, + "step": 24162 + }, + { + "epoch": 7.416513198281154, + "grad_norm": 0.1544325053691864, + "learning_rate": 1.6509500297631787e-05, + "loss": 1.6773, + "step": 24163 + }, + { + "epoch": 7.41682013505218, + "grad_norm": 0.1787567138671875, + "learning_rate": 1.6505809660690197e-05, + "loss": 1.6941, + "step": 24164 + }, + { + "epoch": 7.417127071823204, + "grad_norm": 0.16545183956623077, + "learning_rate": 1.65021193547578e-05, + "loss": 1.6618, + "step": 24165 + }, + { + "epoch": 7.417434008594229, + "grad_norm": 0.23889821767807007, + "learning_rate": 1.6498429379871126e-05, + "loss": 1.7651, + "step": 24166 + }, + { + "epoch": 7.417740945365255, + "grad_norm": 0.2012832909822464, + "learning_rate": 
1.649473973606659e-05, + "loss": 1.7477, + "step": 24167 + }, + { + "epoch": 7.41804788213628, + "grad_norm": 0.18035975098609924, + "learning_rate": 1.6491050423380662e-05, + "loss": 1.6747, + "step": 24168 + }, + { + "epoch": 7.418354818907305, + "grad_norm": 0.14925292134284973, + "learning_rate": 1.6487361441849842e-05, + "loss": 1.6817, + "step": 24169 + }, + { + "epoch": 7.41866175567833, + "grad_norm": 0.19253355264663696, + "learning_rate": 1.6483672791510523e-05, + "loss": 1.6943, + "step": 24170 + }, + { + "epoch": 7.418968692449355, + "grad_norm": 0.17203082144260406, + "learning_rate": 1.6479984472399234e-05, + "loss": 1.692, + "step": 24171 + }, + { + "epoch": 7.4192756292203805, + "grad_norm": 0.19132022559642792, + "learning_rate": 1.647629648455235e-05, + "loss": 1.7029, + "step": 24172 + }, + { + "epoch": 7.419582565991406, + "grad_norm": 0.17949101328849792, + "learning_rate": 1.647260882800637e-05, + "loss": 1.6944, + "step": 24173 + }, + { + "epoch": 7.419889502762431, + "grad_norm": 0.17752930521965027, + "learning_rate": 1.646892150279772e-05, + "loss": 1.6875, + "step": 24174 + }, + { + "epoch": 7.420196439533456, + "grad_norm": 0.19464492797851562, + "learning_rate": 1.6465234508962836e-05, + "loss": 1.6988, + "step": 24175 + }, + { + "epoch": 7.420503376304481, + "grad_norm": 0.20154574513435364, + "learning_rate": 1.6461547846538168e-05, + "loss": 1.7305, + "step": 24176 + }, + { + "epoch": 7.420810313075506, + "grad_norm": 0.20944970846176147, + "learning_rate": 1.6457861515560136e-05, + "loss": 1.7699, + "step": 24177 + }, + { + "epoch": 7.421117249846532, + "grad_norm": 0.22422203421592712, + "learning_rate": 1.6454175516065175e-05, + "loss": 1.6607, + "step": 24178 + }, + { + "epoch": 7.421424186617557, + "grad_norm": 0.16106431186199188, + "learning_rate": 1.6450489848089717e-05, + "loss": 1.7204, + "step": 24179 + }, + { + "epoch": 7.421731123388582, + "grad_norm": 0.24394269287586212, + "learning_rate": 1.644680451167018e-05, + 
"loss": 1.7161, + "step": 24180 + }, + { + "epoch": 7.422038060159607, + "grad_norm": 0.1999186873435974, + "learning_rate": 1.644311950684299e-05, + "loss": 1.7486, + "step": 24181 + }, + { + "epoch": 7.422344996930632, + "grad_norm": 0.1865876019001007, + "learning_rate": 1.6439434833644545e-05, + "loss": 1.737, + "step": 24182 + }, + { + "epoch": 7.422651933701657, + "grad_norm": 0.18088236451148987, + "learning_rate": 1.643575049211131e-05, + "loss": 1.6821, + "step": 24183 + }, + { + "epoch": 7.422958870472683, + "grad_norm": 0.17456914484500885, + "learning_rate": 1.643206648227964e-05, + "loss": 1.7379, + "step": 24184 + }, + { + "epoch": 7.423265807243708, + "grad_norm": 0.18160004913806915, + "learning_rate": 1.642838280418595e-05, + "loss": 1.7364, + "step": 24185 + }, + { + "epoch": 7.4235727440147325, + "grad_norm": 0.18081973493099213, + "learning_rate": 1.6424699457866688e-05, + "loss": 1.7591, + "step": 24186 + }, + { + "epoch": 7.423879680785758, + "grad_norm": 0.20753513276576996, + "learning_rate": 1.6421016443358195e-05, + "loss": 1.7299, + "step": 24187 + }, + { + "epoch": 7.424186617556783, + "grad_norm": 0.2102874517440796, + "learning_rate": 1.641733376069693e-05, + "loss": 1.7876, + "step": 24188 + }, + { + "epoch": 7.4244935543278086, + "grad_norm": 0.19360920786857605, + "learning_rate": 1.6413651409919224e-05, + "loss": 1.7578, + "step": 24189 + }, + { + "epoch": 7.424800491098834, + "grad_norm": 0.1954938918352127, + "learning_rate": 1.6409969391061514e-05, + "loss": 1.7074, + "step": 24190 + }, + { + "epoch": 7.425107427869859, + "grad_norm": 0.2228705734014511, + "learning_rate": 1.6406287704160177e-05, + "loss": 1.7261, + "step": 24191 + }, + { + "epoch": 7.425414364640884, + "grad_norm": 0.18695802986621857, + "learning_rate": 1.6402606349251597e-05, + "loss": 1.7074, + "step": 24192 + }, + { + "epoch": 7.425721301411909, + "grad_norm": 0.19026046991348267, + "learning_rate": 1.639892532637215e-05, + "loss": 1.7546, + "step": 24193 + 
}, + { + "epoch": 7.426028238182934, + "grad_norm": 0.2086167335510254, + "learning_rate": 1.639524463555822e-05, + "loss": 1.7551, + "step": 24194 + }, + { + "epoch": 7.42633517495396, + "grad_norm": 0.201420396566391, + "learning_rate": 1.639156427684618e-05, + "loss": 1.6961, + "step": 24195 + }, + { + "epoch": 7.426642111724985, + "grad_norm": 0.1735599786043167, + "learning_rate": 1.6387884250272394e-05, + "loss": 1.7461, + "step": 24196 + }, + { + "epoch": 7.4269490484960095, + "grad_norm": 0.23944853246212006, + "learning_rate": 1.6384204555873238e-05, + "loss": 1.7001, + "step": 24197 + }, + { + "epoch": 7.427255985267035, + "grad_norm": 0.15605413913726807, + "learning_rate": 1.638052519368508e-05, + "loss": 1.7105, + "step": 24198 + }, + { + "epoch": 7.42756292203806, + "grad_norm": 0.21450987458229065, + "learning_rate": 1.6376846163744257e-05, + "loss": 1.7309, + "step": 24199 + }, + { + "epoch": 7.4278698588090855, + "grad_norm": 0.20542307198047638, + "learning_rate": 1.637316746608718e-05, + "loss": 1.72, + "step": 24200 + }, + { + "epoch": 7.428176795580111, + "grad_norm": 0.18612053990364075, + "learning_rate": 1.6369489100750157e-05, + "loss": 1.6714, + "step": 24201 + }, + { + "epoch": 7.428483732351136, + "grad_norm": 0.16587957739830017, + "learning_rate": 1.6365811067769553e-05, + "loss": 1.7494, + "step": 24202 + }, + { + "epoch": 7.428790669122161, + "grad_norm": 0.247777059674263, + "learning_rate": 1.636213336718172e-05, + "loss": 1.7048, + "step": 24203 + }, + { + "epoch": 7.429097605893186, + "grad_norm": 0.2000289410352707, + "learning_rate": 1.635845599902298e-05, + "loss": 1.7568, + "step": 24204 + }, + { + "epoch": 7.429404542664211, + "grad_norm": 0.21887128055095673, + "learning_rate": 1.6354778963329732e-05, + "loss": 1.6708, + "step": 24205 + }, + { + "epoch": 7.429711479435237, + "grad_norm": 0.18932145833969116, + "learning_rate": 1.6351102260138247e-05, + "loss": 1.7184, + "step": 24206 + }, + { + "epoch": 7.430018416206262, + 
"grad_norm": 0.20103856921195984, + "learning_rate": 1.63474258894849e-05, + "loss": 1.7031, + "step": 24207 + }, + { + "epoch": 7.430325352977286, + "grad_norm": 0.22598737478256226, + "learning_rate": 1.634374985140602e-05, + "loss": 1.7803, + "step": 24208 + }, + { + "epoch": 7.430632289748312, + "grad_norm": 0.22468316555023193, + "learning_rate": 1.6340074145937934e-05, + "loss": 1.7635, + "step": 24209 + }, + { + "epoch": 7.430939226519337, + "grad_norm": 0.16173744201660156, + "learning_rate": 1.6336398773116962e-05, + "loss": 1.6877, + "step": 24210 + }, + { + "epoch": 7.431246163290362, + "grad_norm": 0.17869406938552856, + "learning_rate": 1.6332723732979426e-05, + "loss": 1.6436, + "step": 24211 + }, + { + "epoch": 7.431553100061388, + "grad_norm": 0.1828129142522812, + "learning_rate": 1.6329049025561648e-05, + "loss": 1.7191, + "step": 24212 + }, + { + "epoch": 7.431860036832412, + "grad_norm": 0.19169248640537262, + "learning_rate": 1.6325374650899944e-05, + "loss": 1.7607, + "step": 24213 + }, + { + "epoch": 7.4321669736034375, + "grad_norm": 0.1680343598127365, + "learning_rate": 1.632170060903062e-05, + "loss": 1.6736, + "step": 24214 + }, + { + "epoch": 7.432473910374463, + "grad_norm": 0.20647180080413818, + "learning_rate": 1.6318026899989996e-05, + "loss": 1.7875, + "step": 24215 + }, + { + "epoch": 7.432780847145488, + "grad_norm": 0.29225587844848633, + "learning_rate": 1.6314353523814352e-05, + "loss": 1.8164, + "step": 24216 + }, + { + "epoch": 7.4330877839165135, + "grad_norm": 0.1633446216583252, + "learning_rate": 1.6310680480540048e-05, + "loss": 1.6529, + "step": 24217 + }, + { + "epoch": 7.433394720687538, + "grad_norm": 0.21215081214904785, + "learning_rate": 1.6307007770203326e-05, + "loss": 1.6323, + "step": 24218 + }, + { + "epoch": 7.433701657458563, + "grad_norm": 0.1934979110956192, + "learning_rate": 1.63033353928405e-05, + "loss": 1.7299, + "step": 24219 + }, + { + "epoch": 7.434008594229589, + "grad_norm": 
0.2581390142440796, + "learning_rate": 1.6299663348487865e-05, + "loss": 1.7308, + "step": 24220 + }, + { + "epoch": 7.434315531000614, + "grad_norm": 0.2711075246334076, + "learning_rate": 1.629599163718169e-05, + "loss": 1.8736, + "step": 24221 + }, + { + "epoch": 7.434622467771639, + "grad_norm": 0.2620790898799896, + "learning_rate": 1.6292320258958316e-05, + "loss": 1.7326, + "step": 24222 + }, + { + "epoch": 7.434929404542665, + "grad_norm": 0.16254334151744843, + "learning_rate": 1.6288649213853958e-05, + "loss": 1.6996, + "step": 24223 + }, + { + "epoch": 7.435236341313689, + "grad_norm": 0.22968515753746033, + "learning_rate": 1.628497850190496e-05, + "loss": 1.694, + "step": 24224 + }, + { + "epoch": 7.435543278084714, + "grad_norm": 0.20458953082561493, + "learning_rate": 1.6281308123147533e-05, + "loss": 1.7558, + "step": 24225 + }, + { + "epoch": 7.43585021485574, + "grad_norm": 0.2327413409948349, + "learning_rate": 1.6277638077617995e-05, + "loss": 1.7581, + "step": 24226 + }, + { + "epoch": 7.436157151626765, + "grad_norm": 0.18312111496925354, + "learning_rate": 1.6273968365352604e-05, + "loss": 1.6713, + "step": 24227 + }, + { + "epoch": 7.43646408839779, + "grad_norm": 0.15935418009757996, + "learning_rate": 1.6270298986387628e-05, + "loss": 1.6996, + "step": 24228 + }, + { + "epoch": 7.436771025168815, + "grad_norm": 0.17424416542053223, + "learning_rate": 1.6266629940759322e-05, + "loss": 1.6826, + "step": 24229 + }, + { + "epoch": 7.43707796193984, + "grad_norm": 0.18982923030853271, + "learning_rate": 1.6262961228503953e-05, + "loss": 1.741, + "step": 24230 + }, + { + "epoch": 7.4373848987108655, + "grad_norm": 0.16608789563179016, + "learning_rate": 1.6259292849657777e-05, + "loss": 1.7205, + "step": 24231 + }, + { + "epoch": 7.437691835481891, + "grad_norm": 0.19830825924873352, + "learning_rate": 1.625562480425704e-05, + "loss": 1.7159, + "step": 24232 + }, + { + "epoch": 7.437998772252916, + "grad_norm": 0.1889072209596634, + 
"learning_rate": 1.6251957092337988e-05, + "loss": 1.7427, + "step": 24233 + }, + { + "epoch": 7.4383057090239415, + "grad_norm": 0.18454046547412872, + "learning_rate": 1.6248289713936903e-05, + "loss": 1.6962, + "step": 24234 + }, + { + "epoch": 7.438612645794966, + "grad_norm": 0.20041033625602722, + "learning_rate": 1.6244622669089987e-05, + "loss": 1.7763, + "step": 24235 + }, + { + "epoch": 7.438919582565991, + "grad_norm": 0.17226676642894745, + "learning_rate": 1.62409559578335e-05, + "loss": 1.6783, + "step": 24236 + }, + { + "epoch": 7.439226519337017, + "grad_norm": 0.1761687994003296, + "learning_rate": 1.6237289580203662e-05, + "loss": 1.6761, + "step": 24237 + }, + { + "epoch": 7.439533456108042, + "grad_norm": 0.24213027954101562, + "learning_rate": 1.6233623536236707e-05, + "loss": 1.724, + "step": 24238 + }, + { + "epoch": 7.439840392879067, + "grad_norm": 0.15541739761829376, + "learning_rate": 1.6229957825968913e-05, + "loss": 1.6594, + "step": 24239 + }, + { + "epoch": 7.440147329650092, + "grad_norm": 0.20755749940872192, + "learning_rate": 1.622629244943643e-05, + "loss": 1.7229, + "step": 24240 + }, + { + "epoch": 7.440454266421117, + "grad_norm": 0.20716612040996552, + "learning_rate": 1.6222627406675555e-05, + "loss": 1.699, + "step": 24241 + }, + { + "epoch": 7.440761203192142, + "grad_norm": 0.17423541843891144, + "learning_rate": 1.621896269772244e-05, + "loss": 1.7175, + "step": 24242 + }, + { + "epoch": 7.441068139963168, + "grad_norm": 0.17913730442523956, + "learning_rate": 1.6215298322613347e-05, + "loss": 1.7287, + "step": 24243 + }, + { + "epoch": 7.441375076734193, + "grad_norm": 0.21801607310771942, + "learning_rate": 1.6211634281384486e-05, + "loss": 1.8157, + "step": 24244 + }, + { + "epoch": 7.4416820135052175, + "grad_norm": 0.23132582008838654, + "learning_rate": 1.6207970574072056e-05, + "loss": 1.7921, + "step": 24245 + }, + { + "epoch": 7.441988950276243, + "grad_norm": 0.18289685249328613, + "learning_rate": 
1.6204307200712266e-05, + "loss": 1.7222, + "step": 24246 + }, + { + "epoch": 7.442295887047268, + "grad_norm": 0.15289388597011566, + "learning_rate": 1.620064416134132e-05, + "loss": 1.6409, + "step": 24247 + }, + { + "epoch": 7.4426028238182935, + "grad_norm": 0.1684839129447937, + "learning_rate": 1.619698145599542e-05, + "loss": 1.7362, + "step": 24248 + }, + { + "epoch": 7.442909760589319, + "grad_norm": 0.16812102496623993, + "learning_rate": 1.619331908471076e-05, + "loss": 1.6849, + "step": 24249 + }, + { + "epoch": 7.443216697360343, + "grad_norm": 0.16095775365829468, + "learning_rate": 1.6189657047523526e-05, + "loss": 1.7032, + "step": 24250 + }, + { + "epoch": 7.443523634131369, + "grad_norm": 0.167144313454628, + "learning_rate": 1.6185995344469946e-05, + "loss": 1.6539, + "step": 24251 + }, + { + "epoch": 7.443830570902394, + "grad_norm": 0.18129989504814148, + "learning_rate": 1.618233397558616e-05, + "loss": 1.7057, + "step": 24252 + }, + { + "epoch": 7.444137507673419, + "grad_norm": 0.17299556732177734, + "learning_rate": 1.6178672940908374e-05, + "loss": 1.6965, + "step": 24253 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 0.14944438636302948, + "learning_rate": 1.6175012240472765e-05, + "loss": 1.6666, + "step": 24254 + }, + { + "epoch": 7.44475138121547, + "grad_norm": 0.20333626866340637, + "learning_rate": 1.6171351874315494e-05, + "loss": 1.748, + "step": 24255 + }, + { + "epoch": 7.445058317986494, + "grad_norm": 0.2233068197965622, + "learning_rate": 1.6167691842472783e-05, + "loss": 1.7662, + "step": 24256 + }, + { + "epoch": 7.44536525475752, + "grad_norm": 0.22628507018089294, + "learning_rate": 1.6164032144980738e-05, + "loss": 1.747, + "step": 24257 + }, + { + "epoch": 7.445672191528545, + "grad_norm": 0.18167820572853088, + "learning_rate": 1.6160372781875594e-05, + "loss": 1.7311, + "step": 24258 + }, + { + "epoch": 7.44597912829957, + "grad_norm": 0.1975218504667282, + "learning_rate": 1.6156713753193446e-05, + "loss": 
1.7334, + "step": 24259 + }, + { + "epoch": 7.446286065070596, + "grad_norm": 0.18606813251972198, + "learning_rate": 1.6153055058970508e-05, + "loss": 1.7118, + "step": 24260 + }, + { + "epoch": 7.44659300184162, + "grad_norm": 0.14817847311496735, + "learning_rate": 1.6149396699242914e-05, + "loss": 1.6385, + "step": 24261 + }, + { + "epoch": 7.4468999386126455, + "grad_norm": 0.19018684327602386, + "learning_rate": 1.6145738674046825e-05, + "loss": 1.7511, + "step": 24262 + }, + { + "epoch": 7.447206875383671, + "grad_norm": 0.17089374363422394, + "learning_rate": 1.6142080983418385e-05, + "loss": 1.7523, + "step": 24263 + }, + { + "epoch": 7.447513812154696, + "grad_norm": 0.16370832920074463, + "learning_rate": 1.613842362739375e-05, + "loss": 1.6636, + "step": 24264 + }, + { + "epoch": 7.4478207489257215, + "grad_norm": 0.16432829201221466, + "learning_rate": 1.6134766606009055e-05, + "loss": 1.7355, + "step": 24265 + }, + { + "epoch": 7.448127685696747, + "grad_norm": 0.15270906686782837, + "learning_rate": 1.6131109919300453e-05, + "loss": 1.7169, + "step": 24266 + }, + { + "epoch": 7.448434622467771, + "grad_norm": 0.14986950159072876, + "learning_rate": 1.6127453567304053e-05, + "loss": 1.7021, + "step": 24267 + }, + { + "epoch": 7.448741559238797, + "grad_norm": 0.17727383971214294, + "learning_rate": 1.6123797550056042e-05, + "loss": 1.7144, + "step": 24268 + }, + { + "epoch": 7.449048496009822, + "grad_norm": 0.1471523940563202, + "learning_rate": 1.6120141867592504e-05, + "loss": 1.694, + "step": 24269 + }, + { + "epoch": 7.449355432780847, + "grad_norm": 0.15561319887638092, + "learning_rate": 1.611648651994958e-05, + "loss": 1.6672, + "step": 24270 + }, + { + "epoch": 7.449662369551873, + "grad_norm": 0.19121745228767395, + "learning_rate": 1.61128315071634e-05, + "loss": 1.7317, + "step": 24271 + }, + { + "epoch": 7.449969306322897, + "grad_norm": 0.27333202958106995, + "learning_rate": 1.6109176829270062e-05, + "loss": 1.7943, + "step": 24272 + }, 
+ { + "epoch": 7.4502762430939224, + "grad_norm": 0.16996058821678162, + "learning_rate": 1.6105522486305736e-05, + "loss": 1.6883, + "step": 24273 + }, + { + "epoch": 7.450583179864948, + "grad_norm": 0.17687207460403442, + "learning_rate": 1.610186847830647e-05, + "loss": 1.6967, + "step": 24274 + }, + { + "epoch": 7.450890116635973, + "grad_norm": 0.2191249281167984, + "learning_rate": 1.6098214805308436e-05, + "loss": 1.7644, + "step": 24275 + }, + { + "epoch": 7.4511970534069984, + "grad_norm": 0.17267808318138123, + "learning_rate": 1.6094561467347684e-05, + "loss": 1.6963, + "step": 24276 + }, + { + "epoch": 7.451503990178024, + "grad_norm": 0.16276031732559204, + "learning_rate": 1.609090846446037e-05, + "loss": 1.6795, + "step": 24277 + }, + { + "epoch": 7.451810926949048, + "grad_norm": 0.16677677631378174, + "learning_rate": 1.6087255796682572e-05, + "loss": 1.699, + "step": 24278 + }, + { + "epoch": 7.452117863720074, + "grad_norm": 0.17163679003715515, + "learning_rate": 1.6083603464050383e-05, + "loss": 1.6906, + "step": 24279 + }, + { + "epoch": 7.452424800491099, + "grad_norm": 0.16087757050991058, + "learning_rate": 1.6079951466599908e-05, + "loss": 1.7173, + "step": 24280 + }, + { + "epoch": 7.452731737262124, + "grad_norm": 0.19389556348323822, + "learning_rate": 1.6076299804367228e-05, + "loss": 1.6985, + "step": 24281 + }, + { + "epoch": 7.45303867403315, + "grad_norm": 0.20400559902191162, + "learning_rate": 1.6072648477388447e-05, + "loss": 1.7336, + "step": 24282 + }, + { + "epoch": 7.453345610804174, + "grad_norm": 0.16443994641304016, + "learning_rate": 1.6068997485699632e-05, + "loss": 1.6909, + "step": 24283 + }, + { + "epoch": 7.453652547575199, + "grad_norm": 0.18333028256893158, + "learning_rate": 1.606534682933686e-05, + "loss": 1.6749, + "step": 24284 + }, + { + "epoch": 7.453959484346225, + "grad_norm": 0.21596840023994446, + "learning_rate": 1.6061696508336244e-05, + "loss": 1.7856, + "step": 24285 + }, + { + "epoch": 
7.45426642111725, + "grad_norm": 0.18656609952449799, + "learning_rate": 1.6058046522733827e-05, + "loss": 1.6892, + "step": 24286 + }, + { + "epoch": 7.454573357888275, + "grad_norm": 0.18110665678977966, + "learning_rate": 1.6054396872565687e-05, + "loss": 1.7063, + "step": 24287 + }, + { + "epoch": 7.4548802946593, + "grad_norm": 0.19452248513698578, + "learning_rate": 1.605074755786789e-05, + "loss": 1.7637, + "step": 24288 + }, + { + "epoch": 7.455187231430325, + "grad_norm": 0.18945640325546265, + "learning_rate": 1.604709857867649e-05, + "loss": 1.7498, + "step": 24289 + }, + { + "epoch": 7.4554941682013505, + "grad_norm": 0.1847696155309677, + "learning_rate": 1.6043449935027592e-05, + "loss": 1.702, + "step": 24290 + }, + { + "epoch": 7.455801104972376, + "grad_norm": 0.18882444500923157, + "learning_rate": 1.6039801626957197e-05, + "loss": 1.728, + "step": 24291 + }, + { + "epoch": 7.456108041743401, + "grad_norm": 0.1981150358915329, + "learning_rate": 1.603615365450142e-05, + "loss": 1.7114, + "step": 24292 + }, + { + "epoch": 7.456414978514426, + "grad_norm": 0.2305375188589096, + "learning_rate": 1.6032506017696242e-05, + "loss": 1.7234, + "step": 24293 + }, + { + "epoch": 7.456721915285451, + "grad_norm": 0.17539730668067932, + "learning_rate": 1.6028858716577764e-05, + "loss": 1.6305, + "step": 24294 + }, + { + "epoch": 7.457028852056476, + "grad_norm": 0.19684432446956635, + "learning_rate": 1.602521175118202e-05, + "loss": 1.6958, + "step": 24295 + }, + { + "epoch": 7.457335788827502, + "grad_norm": 0.20957234501838684, + "learning_rate": 1.602156512154504e-05, + "loss": 1.6984, + "step": 24296 + }, + { + "epoch": 7.457642725598527, + "grad_norm": 0.18523702025413513, + "learning_rate": 1.6017918827702877e-05, + "loss": 1.7817, + "step": 24297 + }, + { + "epoch": 7.457949662369552, + "grad_norm": 0.1964758187532425, + "learning_rate": 1.601427286969155e-05, + "loss": 1.7597, + "step": 24298 + }, + { + "epoch": 7.458256599140577, + "grad_norm": 
0.199961856007576, + "learning_rate": 1.6010627247547106e-05, + "loss": 1.6988, + "step": 24299 + }, + { + "epoch": 7.458563535911602, + "grad_norm": 0.16149461269378662, + "learning_rate": 1.6006981961305555e-05, + "loss": 1.6673, + "step": 24300 + }, + { + "epoch": 7.458870472682627, + "grad_norm": 0.2198258489370346, + "learning_rate": 1.600333701100293e-05, + "loss": 1.7159, + "step": 24301 + }, + { + "epoch": 7.459177409453653, + "grad_norm": 0.157994344830513, + "learning_rate": 1.5999692396675277e-05, + "loss": 1.7118, + "step": 24302 + }, + { + "epoch": 7.459484346224678, + "grad_norm": 0.21911758184432983, + "learning_rate": 1.5996048118358575e-05, + "loss": 1.7209, + "step": 24303 + }, + { + "epoch": 7.4597912829957025, + "grad_norm": 0.20648738741874695, + "learning_rate": 1.599240417608886e-05, + "loss": 1.7844, + "step": 24304 + }, + { + "epoch": 7.460098219766728, + "grad_norm": 0.18746837973594666, + "learning_rate": 1.598876056990214e-05, + "loss": 1.7079, + "step": 24305 + }, + { + "epoch": 7.460405156537753, + "grad_norm": 0.17767341434955597, + "learning_rate": 1.5985117299834407e-05, + "loss": 1.7579, + "step": 24306 + }, + { + "epoch": 7.4607120933087785, + "grad_norm": 0.18997585773468018, + "learning_rate": 1.598147436592171e-05, + "loss": 1.7556, + "step": 24307 + }, + { + "epoch": 7.461019030079804, + "grad_norm": 0.19356711208820343, + "learning_rate": 1.597783176819999e-05, + "loss": 1.7315, + "step": 24308 + }, + { + "epoch": 7.461325966850829, + "grad_norm": 0.23354102671146393, + "learning_rate": 1.597418950670531e-05, + "loss": 1.7622, + "step": 24309 + }, + { + "epoch": 7.461632903621854, + "grad_norm": 0.18773409724235535, + "learning_rate": 1.5970547581473604e-05, + "loss": 1.6582, + "step": 24310 + }, + { + "epoch": 7.461939840392879, + "grad_norm": 0.23704196512699127, + "learning_rate": 1.596690599254091e-05, + "loss": 1.7207, + "step": 24311 + }, + { + "epoch": 7.462246777163904, + "grad_norm": 0.1943788379430771, + 
"learning_rate": 1.596326473994319e-05, + "loss": 1.696, + "step": 24312 + }, + { + "epoch": 7.46255371393493, + "grad_norm": 0.22303985059261322, + "learning_rate": 1.595962382371644e-05, + "loss": 1.6963, + "step": 24313 + }, + { + "epoch": 7.462860650705955, + "grad_norm": 0.20158524811267853, + "learning_rate": 1.5955983243896643e-05, + "loss": 1.7017, + "step": 24314 + }, + { + "epoch": 7.463167587476979, + "grad_norm": 0.18768194317817688, + "learning_rate": 1.595234300051977e-05, + "loss": 1.6743, + "step": 24315 + }, + { + "epoch": 7.463474524248005, + "grad_norm": 0.27407020330429077, + "learning_rate": 1.5948703093621803e-05, + "loss": 1.7522, + "step": 24316 + }, + { + "epoch": 7.46378146101903, + "grad_norm": 0.2027997523546219, + "learning_rate": 1.5945063523238706e-05, + "loss": 1.7515, + "step": 24317 + }, + { + "epoch": 7.464088397790055, + "grad_norm": 0.2728271782398224, + "learning_rate": 1.5941424289406454e-05, + "loss": 1.7611, + "step": 24318 + }, + { + "epoch": 7.464395334561081, + "grad_norm": 0.1704578548669815, + "learning_rate": 1.593778539216101e-05, + "loss": 1.6602, + "step": 24319 + }, + { + "epoch": 7.464702271332105, + "grad_norm": 0.19684311747550964, + "learning_rate": 1.5934146831538332e-05, + "loss": 1.6824, + "step": 24320 + }, + { + "epoch": 7.4650092081031305, + "grad_norm": 0.196905255317688, + "learning_rate": 1.5930508607574386e-05, + "loss": 1.691, + "step": 24321 + }, + { + "epoch": 7.465316144874156, + "grad_norm": 0.18543855845928192, + "learning_rate": 1.5926870720305122e-05, + "loss": 1.6936, + "step": 24322 + }, + { + "epoch": 7.465623081645181, + "grad_norm": 0.24634000658988953, + "learning_rate": 1.592323316976647e-05, + "loss": 1.6857, + "step": 24323 + }, + { + "epoch": 7.4659300184162065, + "grad_norm": 0.1976090669631958, + "learning_rate": 1.5919595955994444e-05, + "loss": 1.7248, + "step": 24324 + }, + { + "epoch": 7.466236955187231, + "grad_norm": 0.21902409195899963, + "learning_rate": 
1.5915959079024907e-05, + "loss": 1.7184, + "step": 24325 + }, + { + "epoch": 7.466543891958256, + "grad_norm": 0.14501455426216125, + "learning_rate": 1.591232253889387e-05, + "loss": 1.6351, + "step": 24326 + }, + { + "epoch": 7.466850828729282, + "grad_norm": 0.20591090619564056, + "learning_rate": 1.5908686335637213e-05, + "loss": 1.7188, + "step": 24327 + }, + { + "epoch": 7.467157765500307, + "grad_norm": 0.17669445276260376, + "learning_rate": 1.590505046929091e-05, + "loss": 1.6735, + "step": 24328 + }, + { + "epoch": 7.467464702271332, + "grad_norm": 0.19642697274684906, + "learning_rate": 1.590141493989089e-05, + "loss": 1.6599, + "step": 24329 + }, + { + "epoch": 7.467771639042358, + "grad_norm": 0.2049490511417389, + "learning_rate": 1.589777974747307e-05, + "loss": 1.77, + "step": 24330 + }, + { + "epoch": 7.468078575813382, + "grad_norm": 0.1877276450395584, + "learning_rate": 1.5894144892073377e-05, + "loss": 1.6774, + "step": 24331 + }, + { + "epoch": 7.468385512584407, + "grad_norm": 0.18437768518924713, + "learning_rate": 1.5890510373727735e-05, + "loss": 1.7054, + "step": 24332 + }, + { + "epoch": 7.468692449355433, + "grad_norm": 0.1850978136062622, + "learning_rate": 1.5886876192472062e-05, + "loss": 1.6664, + "step": 24333 + }, + { + "epoch": 7.468999386126458, + "grad_norm": 0.16257111728191376, + "learning_rate": 1.588324234834227e-05, + "loss": 1.7438, + "step": 24334 + }, + { + "epoch": 7.469306322897483, + "grad_norm": 0.1776656061410904, + "learning_rate": 1.5879608841374277e-05, + "loss": 1.6913, + "step": 24335 + }, + { + "epoch": 7.469613259668508, + "grad_norm": 0.183144673705101, + "learning_rate": 1.587597567160398e-05, + "loss": 1.6737, + "step": 24336 + }, + { + "epoch": 7.469920196439533, + "grad_norm": 0.15030701458454132, + "learning_rate": 1.5872342839067306e-05, + "loss": 1.6776, + "step": 24337 + }, + { + "epoch": 7.4702271332105585, + "grad_norm": 0.1987701952457428, + "learning_rate": 1.586871034380013e-05, + "loss": 
1.7119, + "step": 24338 + }, + { + "epoch": 7.470534069981584, + "grad_norm": 0.20000997185707092, + "learning_rate": 1.5865078185838373e-05, + "loss": 1.6794, + "step": 24339 + }, + { + "epoch": 7.470841006752609, + "grad_norm": 0.1674201786518097, + "learning_rate": 1.5861446365217902e-05, + "loss": 1.6826, + "step": 24340 + }, + { + "epoch": 7.4711479435236345, + "grad_norm": 0.22385969758033752, + "learning_rate": 1.585781488197466e-05, + "loss": 1.7012, + "step": 24341 + }, + { + "epoch": 7.471454880294659, + "grad_norm": 0.18635201454162598, + "learning_rate": 1.585418373614446e-05, + "loss": 1.7086, + "step": 24342 + }, + { + "epoch": 7.471761817065684, + "grad_norm": 0.17345300316810608, + "learning_rate": 1.5850552927763274e-05, + "loss": 1.7068, + "step": 24343 + }, + { + "epoch": 7.47206875383671, + "grad_norm": 0.1777433305978775, + "learning_rate": 1.5846922456866904e-05, + "loss": 1.6618, + "step": 24344 + }, + { + "epoch": 7.472375690607735, + "grad_norm": 0.1821276843547821, + "learning_rate": 1.584329232349128e-05, + "loss": 1.7451, + "step": 24345 + }, + { + "epoch": 7.47268262737876, + "grad_norm": 0.1714404970407486, + "learning_rate": 1.5839662527672262e-05, + "loss": 1.7289, + "step": 24346 + }, + { + "epoch": 7.472989564149785, + "grad_norm": 0.159423828125, + "learning_rate": 1.583603306944572e-05, + "loss": 1.667, + "step": 24347 + }, + { + "epoch": 7.47329650092081, + "grad_norm": 0.22563552856445312, + "learning_rate": 1.5832403948847523e-05, + "loss": 1.7755, + "step": 24348 + }, + { + "epoch": 7.473603437691835, + "grad_norm": 0.17239433526992798, + "learning_rate": 1.582877516591354e-05, + "loss": 1.6577, + "step": 24349 + }, + { + "epoch": 7.473910374462861, + "grad_norm": 0.1671951860189438, + "learning_rate": 1.5825146720679624e-05, + "loss": 1.7438, + "step": 24350 + }, + { + "epoch": 7.474217311233886, + "grad_norm": 0.1802397519350052, + "learning_rate": 1.582151861318164e-05, + "loss": 1.686, + "step": 24351 + }, + { + "epoch": 
7.474524248004911, + "grad_norm": 0.21424922347068787, + "learning_rate": 1.5817890843455442e-05, + "loss": 1.7871, + "step": 24352 + }, + { + "epoch": 7.474831184775936, + "grad_norm": 0.2275305986404419, + "learning_rate": 1.5814263411536884e-05, + "loss": 1.7461, + "step": 24353 + }, + { + "epoch": 7.475138121546961, + "grad_norm": 0.1682458072900772, + "learning_rate": 1.581063631746181e-05, + "loss": 1.6362, + "step": 24354 + }, + { + "epoch": 7.475445058317987, + "grad_norm": 0.165358304977417, + "learning_rate": 1.5807009561266068e-05, + "loss": 1.7057, + "step": 24355 + }, + { + "epoch": 7.475751995089012, + "grad_norm": 0.18032164871692657, + "learning_rate": 1.5803383142985496e-05, + "loss": 1.7645, + "step": 24356 + }, + { + "epoch": 7.476058931860037, + "grad_norm": 0.1694670170545578, + "learning_rate": 1.5799757062655935e-05, + "loss": 1.6848, + "step": 24357 + }, + { + "epoch": 7.476365868631062, + "grad_norm": 0.17879679799079895, + "learning_rate": 1.5796131320313225e-05, + "loss": 1.7425, + "step": 24358 + }, + { + "epoch": 7.476672805402087, + "grad_norm": 0.16042493283748627, + "learning_rate": 1.579250591599317e-05, + "loss": 1.6389, + "step": 24359 + }, + { + "epoch": 7.476979742173112, + "grad_norm": 0.19134685397148132, + "learning_rate": 1.5788880849731658e-05, + "loss": 1.7504, + "step": 24360 + }, + { + "epoch": 7.477286678944138, + "grad_norm": 0.16545429825782776, + "learning_rate": 1.578525612156444e-05, + "loss": 1.7184, + "step": 24361 + }, + { + "epoch": 7.477593615715163, + "grad_norm": 0.18139231204986572, + "learning_rate": 1.5781631731527397e-05, + "loss": 1.6794, + "step": 24362 + }, + { + "epoch": 7.4779005524861875, + "grad_norm": 0.19043901562690735, + "learning_rate": 1.5778007679656326e-05, + "loss": 1.7184, + "step": 24363 + }, + { + "epoch": 7.478207489257213, + "grad_norm": 0.19410157203674316, + "learning_rate": 1.577438396598703e-05, + "loss": 1.7599, + "step": 24364 + }, + { + "epoch": 7.478514426028238, + 
"grad_norm": 0.18464741110801697, + "learning_rate": 1.5770760590555344e-05, + "loss": 1.652, + "step": 24365 + }, + { + "epoch": 7.4788213627992635, + "grad_norm": 0.19959059357643127, + "learning_rate": 1.576713755339706e-05, + "loss": 1.7509, + "step": 24366 + }, + { + "epoch": 7.479128299570289, + "grad_norm": 0.20312312245368958, + "learning_rate": 1.576351485454799e-05, + "loss": 1.758, + "step": 24367 + }, + { + "epoch": 7.479435236341313, + "grad_norm": 0.23994365334510803, + "learning_rate": 1.5759892494043933e-05, + "loss": 1.7124, + "step": 24368 + }, + { + "epoch": 7.479742173112339, + "grad_norm": 0.22661323845386505, + "learning_rate": 1.575627047192068e-05, + "loss": 1.7251, + "step": 24369 + }, + { + "epoch": 7.480049109883364, + "grad_norm": 0.2599529027938843, + "learning_rate": 1.5752648788214038e-05, + "loss": 1.7351, + "step": 24370 + }, + { + "epoch": 7.480356046654389, + "grad_norm": 0.17298145592212677, + "learning_rate": 1.5749027442959795e-05, + "loss": 1.681, + "step": 24371 + }, + { + "epoch": 7.480662983425415, + "grad_norm": 0.18189257383346558, + "learning_rate": 1.574540643619373e-05, + "loss": 1.6938, + "step": 24372 + }, + { + "epoch": 7.48096992019644, + "grad_norm": 0.2658606767654419, + "learning_rate": 1.5741785767951645e-05, + "loss": 1.7043, + "step": 24373 + }, + { + "epoch": 7.481276856967464, + "grad_norm": 0.17898595333099365, + "learning_rate": 1.573816543826931e-05, + "loss": 1.7299, + "step": 24374 + }, + { + "epoch": 7.48158379373849, + "grad_norm": 0.2529693841934204, + "learning_rate": 1.573454544718251e-05, + "loss": 1.6378, + "step": 24375 + }, + { + "epoch": 7.481890730509515, + "grad_norm": 0.1542833298444748, + "learning_rate": 1.5730925794726993e-05, + "loss": 1.6847, + "step": 24376 + }, + { + "epoch": 7.48219766728054, + "grad_norm": 0.24731594324111938, + "learning_rate": 1.5727306480938586e-05, + "loss": 1.7028, + "step": 24377 + }, + { + "epoch": 7.482504604051566, + "grad_norm": 0.21095556020736694, + 
"learning_rate": 1.572368750585299e-05, + "loss": 1.7371, + "step": 24378 + }, + { + "epoch": 7.48281154082259, + "grad_norm": 0.24208855628967285, + "learning_rate": 1.5720068869506037e-05, + "loss": 1.7982, + "step": 24379 + }, + { + "epoch": 7.4831184775936155, + "grad_norm": 0.23290614783763885, + "learning_rate": 1.571645057193343e-05, + "loss": 1.7443, + "step": 24380 + }, + { + "epoch": 7.483425414364641, + "grad_norm": 0.2146376222372055, + "learning_rate": 1.5712832613170963e-05, + "loss": 1.7258, + "step": 24381 + }, + { + "epoch": 7.483732351135666, + "grad_norm": 0.20540264248847961, + "learning_rate": 1.5709214993254385e-05, + "loss": 1.6495, + "step": 24382 + }, + { + "epoch": 7.4840392879066915, + "grad_norm": 0.16472755372524261, + "learning_rate": 1.570559771221944e-05, + "loss": 1.7118, + "step": 24383 + }, + { + "epoch": 7.484346224677717, + "grad_norm": 0.194668248295784, + "learning_rate": 1.5701980770101876e-05, + "loss": 1.6948, + "step": 24384 + }, + { + "epoch": 7.484653161448741, + "grad_norm": 0.19188909232616425, + "learning_rate": 1.569836416693744e-05, + "loss": 1.7376, + "step": 24385 + }, + { + "epoch": 7.484960098219767, + "grad_norm": 0.1935901939868927, + "learning_rate": 1.569474790276188e-05, + "loss": 1.7009, + "step": 24386 + }, + { + "epoch": 7.485267034990792, + "grad_norm": 0.18449221551418304, + "learning_rate": 1.5691131977610924e-05, + "loss": 1.7542, + "step": 24387 + }, + { + "epoch": 7.485573971761817, + "grad_norm": 0.18543820083141327, + "learning_rate": 1.568751639152031e-05, + "loss": 1.7125, + "step": 24388 + }, + { + "epoch": 7.485880908532843, + "grad_norm": 0.17343461513519287, + "learning_rate": 1.5683901144525776e-05, + "loss": 1.7189, + "step": 24389 + }, + { + "epoch": 7.486187845303867, + "grad_norm": 0.16813276708126068, + "learning_rate": 1.568028623666304e-05, + "loss": 1.6416, + "step": 24390 + }, + { + "epoch": 7.486494782074892, + "grad_norm": 0.16296882927417755, + "learning_rate": 
1.567667166796783e-05, + "loss": 1.6971, + "step": 24391 + }, + { + "epoch": 7.486801718845918, + "grad_norm": 0.206793412566185, + "learning_rate": 1.5673057438475875e-05, + "loss": 1.8139, + "step": 24392 + }, + { + "epoch": 7.487108655616943, + "grad_norm": 0.1937340795993805, + "learning_rate": 1.566944354822286e-05, + "loss": 1.7606, + "step": 24393 + }, + { + "epoch": 7.487415592387968, + "grad_norm": 0.19251857697963715, + "learning_rate": 1.566582999724456e-05, + "loss": 1.7225, + "step": 24394 + }, + { + "epoch": 7.487722529158993, + "grad_norm": 0.1551857739686966, + "learning_rate": 1.566221678557663e-05, + "loss": 1.6546, + "step": 24395 + }, + { + "epoch": 7.488029465930018, + "grad_norm": 0.19435563683509827, + "learning_rate": 1.565860391325482e-05, + "loss": 1.7444, + "step": 24396 + }, + { + "epoch": 7.4883364027010435, + "grad_norm": 0.21196971833705902, + "learning_rate": 1.565499138031479e-05, + "loss": 1.7124, + "step": 24397 + }, + { + "epoch": 7.488643339472069, + "grad_norm": 0.2145242542028427, + "learning_rate": 1.5651379186792276e-05, + "loss": 1.7571, + "step": 24398 + }, + { + "epoch": 7.488950276243094, + "grad_norm": 0.17056338489055634, + "learning_rate": 1.5647767332722964e-05, + "loss": 1.6514, + "step": 24399 + }, + { + "epoch": 7.4892572130141195, + "grad_norm": 0.17161786556243896, + "learning_rate": 1.5644155818142553e-05, + "loss": 1.675, + "step": 24400 + }, + { + "epoch": 7.489564149785144, + "grad_norm": 0.18978877365589142, + "learning_rate": 1.564054464308673e-05, + "loss": 1.7123, + "step": 24401 + }, + { + "epoch": 7.489871086556169, + "grad_norm": 0.16004881262779236, + "learning_rate": 1.5636933807591186e-05, + "loss": 1.6555, + "step": 24402 + }, + { + "epoch": 7.490178023327195, + "grad_norm": 0.19739225506782532, + "learning_rate": 1.56333233116916e-05, + "loss": 1.7441, + "step": 24403 + }, + { + "epoch": 7.49048496009822, + "grad_norm": 0.20770032703876495, + "learning_rate": 1.5629713155423657e-05, + "loss": 
1.6704, + "step": 24404 + }, + { + "epoch": 7.490791896869245, + "grad_norm": 0.17897675931453705, + "learning_rate": 1.5626103338823033e-05, + "loss": 1.7281, + "step": 24405 + }, + { + "epoch": 7.49109883364027, + "grad_norm": 0.20801669359207153, + "learning_rate": 1.5622493861925402e-05, + "loss": 1.7008, + "step": 24406 + }, + { + "epoch": 7.491405770411295, + "grad_norm": 0.2027266025543213, + "learning_rate": 1.5618884724766442e-05, + "loss": 1.7619, + "step": 24407 + }, + { + "epoch": 7.49171270718232, + "grad_norm": 0.19207318127155304, + "learning_rate": 1.5615275927381806e-05, + "loss": 1.6985, + "step": 24408 + }, + { + "epoch": 7.492019643953346, + "grad_norm": 0.19694732129573822, + "learning_rate": 1.5611667469807175e-05, + "loss": 1.7455, + "step": 24409 + }, + { + "epoch": 7.492326580724371, + "grad_norm": 0.170238196849823, + "learning_rate": 1.560805935207818e-05, + "loss": 1.7179, + "step": 24410 + }, + { + "epoch": 7.4926335174953955, + "grad_norm": 0.16890759766101837, + "learning_rate": 1.5604451574230532e-05, + "loss": 1.7323, + "step": 24411 + }, + { + "epoch": 7.492940454266421, + "grad_norm": 0.18043142557144165, + "learning_rate": 1.5600844136299824e-05, + "loss": 1.6958, + "step": 24412 + }, + { + "epoch": 7.493247391037446, + "grad_norm": 0.23966364562511444, + "learning_rate": 1.5597237038321764e-05, + "loss": 1.754, + "step": 24413 + }, + { + "epoch": 7.4935543278084715, + "grad_norm": 0.23342584073543549, + "learning_rate": 1.5593630280331945e-05, + "loss": 1.8008, + "step": 24414 + }, + { + "epoch": 7.493861264579497, + "grad_norm": 0.17365418374538422, + "learning_rate": 1.5590023862366054e-05, + "loss": 1.7166, + "step": 24415 + }, + { + "epoch": 7.494168201350522, + "grad_norm": 0.1934911608695984, + "learning_rate": 1.558641778445971e-05, + "loss": 1.7113, + "step": 24416 + }, + { + "epoch": 7.494475138121547, + "grad_norm": 0.1935805231332779, + "learning_rate": 1.558281204664856e-05, + "loss": 1.7549, + "step": 24417 + }, + { 
+ "epoch": 7.494782074892572, + "grad_norm": 0.18467992544174194, + "learning_rate": 1.5579206648968236e-05, + "loss": 1.6889, + "step": 24418 + }, + { + "epoch": 7.495089011663597, + "grad_norm": 0.17173317074775696, + "learning_rate": 1.5575601591454365e-05, + "loss": 1.686, + "step": 24419 + }, + { + "epoch": 7.495395948434623, + "grad_norm": 0.1706855744123459, + "learning_rate": 1.5571996874142574e-05, + "loss": 1.6747, + "step": 24420 + }, + { + "epoch": 7.495702885205648, + "grad_norm": 0.2233184576034546, + "learning_rate": 1.556839249706849e-05, + "loss": 1.7855, + "step": 24421 + }, + { + "epoch": 7.496009821976672, + "grad_norm": 0.22118456661701202, + "learning_rate": 1.5564788460267733e-05, + "loss": 1.7487, + "step": 24422 + }, + { + "epoch": 7.496316758747698, + "grad_norm": 0.21284142136573792, + "learning_rate": 1.5561184763775916e-05, + "loss": 1.7367, + "step": 24423 + }, + { + "epoch": 7.496623695518723, + "grad_norm": 0.17366403341293335, + "learning_rate": 1.5557581407628656e-05, + "loss": 1.655, + "step": 24424 + }, + { + "epoch": 7.496930632289748, + "grad_norm": 0.19864381849765778, + "learning_rate": 1.555397839186157e-05, + "loss": 1.6691, + "step": 24425 + }, + { + "epoch": 7.497237569060774, + "grad_norm": 0.1787605881690979, + "learning_rate": 1.555037571651025e-05, + "loss": 1.7063, + "step": 24426 + }, + { + "epoch": 7.497544505831799, + "grad_norm": 0.19520068168640137, + "learning_rate": 1.5546773381610302e-05, + "loss": 1.7044, + "step": 24427 + }, + { + "epoch": 7.4978514426028235, + "grad_norm": 0.18771123886108398, + "learning_rate": 1.5543171387197362e-05, + "loss": 1.6959, + "step": 24428 + }, + { + "epoch": 7.498158379373849, + "grad_norm": 0.21876849234104156, + "learning_rate": 1.5539569733306964e-05, + "loss": 1.7486, + "step": 24429 + }, + { + "epoch": 7.498465316144874, + "grad_norm": 0.21685563027858734, + "learning_rate": 1.5535968419974772e-05, + "loss": 1.7541, + "step": 24430 + }, + { + "epoch": 7.4987722529158995, 
+ "grad_norm": 0.19595225155353546, + "learning_rate": 1.5532367447236307e-05, + "loss": 1.6882, + "step": 24431 + }, + { + "epoch": 7.499079189686925, + "grad_norm": 0.18359199166297913, + "learning_rate": 1.5528766815127198e-05, + "loss": 1.687, + "step": 24432 + }, + { + "epoch": 7.499386126457949, + "grad_norm": 0.17955231666564941, + "learning_rate": 1.5525166523683028e-05, + "loss": 1.6759, + "step": 24433 + }, + { + "epoch": 7.499693063228975, + "grad_norm": 0.18786758184432983, + "learning_rate": 1.5521566572939368e-05, + "loss": 1.7118, + "step": 24434 + }, + { + "epoch": 7.5, + "grad_norm": 0.16672605276107788, + "learning_rate": 1.551796696293179e-05, + "loss": 1.6618, + "step": 24435 + }, + { + "epoch": 7.500306936771025, + "grad_norm": 0.17066839337348938, + "learning_rate": 1.5514367693695875e-05, + "loss": 1.6974, + "step": 24436 + }, + { + "epoch": 7.500613873542051, + "grad_norm": 0.17299650609493256, + "learning_rate": 1.5510768765267193e-05, + "loss": 1.7074, + "step": 24437 + }, + { + "epoch": 7.500920810313076, + "grad_norm": 0.17507639527320862, + "learning_rate": 1.5507170177681306e-05, + "loss": 1.7295, + "step": 24438 + }, + { + "epoch": 7.5012277470841005, + "grad_norm": 0.1909082531929016, + "learning_rate": 1.5503571930973786e-05, + "loss": 1.7153, + "step": 24439 + }, + { + "epoch": 7.501534683855126, + "grad_norm": 0.2334289401769638, + "learning_rate": 1.5499974025180185e-05, + "loss": 1.713, + "step": 24440 + }, + { + "epoch": 7.501841620626151, + "grad_norm": 0.18382340669631958, + "learning_rate": 1.5496376460336058e-05, + "loss": 1.6706, + "step": 24441 + }, + { + "epoch": 7.5021485573971765, + "grad_norm": 0.1901310533285141, + "learning_rate": 1.5492779236476967e-05, + "loss": 1.7106, + "step": 24442 + }, + { + "epoch": 7.502455494168201, + "grad_norm": 0.17336180806159973, + "learning_rate": 1.5489182353638452e-05, + "loss": 1.7467, + "step": 24443 + }, + { + "epoch": 7.502762430939226, + "grad_norm": 0.18670998513698578, + 
"learning_rate": 1.548558581185605e-05, + "loss": 1.7101, + "step": 24444 + }, + { + "epoch": 7.503069367710252, + "grad_norm": 0.18341238796710968, + "learning_rate": 1.5481989611165353e-05, + "loss": 1.719, + "step": 24445 + }, + { + "epoch": 7.503376304481277, + "grad_norm": 0.21832694113254547, + "learning_rate": 1.5478393751601833e-05, + "loss": 1.7143, + "step": 24446 + }, + { + "epoch": 7.503683241252302, + "grad_norm": 0.1715303659439087, + "learning_rate": 1.5474798233201094e-05, + "loss": 1.6962, + "step": 24447 + }, + { + "epoch": 7.503990178023328, + "grad_norm": 0.26411953568458557, + "learning_rate": 1.5471203055998595e-05, + "loss": 1.7182, + "step": 24448 + }, + { + "epoch": 7.504297114794352, + "grad_norm": 0.1646965742111206, + "learning_rate": 1.5467608220029926e-05, + "loss": 1.6979, + "step": 24449 + }, + { + "epoch": 7.504604051565377, + "grad_norm": 0.1664915233850479, + "learning_rate": 1.5464013725330595e-05, + "loss": 1.6809, + "step": 24450 + }, + { + "epoch": 7.504910988336403, + "grad_norm": 0.1711970716714859, + "learning_rate": 1.5460419571936125e-05, + "loss": 1.6975, + "step": 24451 + }, + { + "epoch": 7.505217925107428, + "grad_norm": 0.19235998392105103, + "learning_rate": 1.5456825759882028e-05, + "loss": 1.7515, + "step": 24452 + }, + { + "epoch": 7.505524861878453, + "grad_norm": 0.2137441486120224, + "learning_rate": 1.5453232289203822e-05, + "loss": 1.7575, + "step": 24453 + }, + { + "epoch": 7.505831798649478, + "grad_norm": 0.19337041676044464, + "learning_rate": 1.544963915993703e-05, + "loss": 1.776, + "step": 24454 + }, + { + "epoch": 7.506138735420503, + "grad_norm": 0.227366104722023, + "learning_rate": 1.5446046372117152e-05, + "loss": 1.7736, + "step": 24455 + }, + { + "epoch": 7.5064456721915285, + "grad_norm": 0.1712712198495865, + "learning_rate": 1.5442453925779694e-05, + "loss": 1.6663, + "step": 24456 + }, + { + "epoch": 7.506752608962554, + "grad_norm": 0.19359993934631348, + "learning_rate": 
1.5438861820960164e-05, + "loss": 1.6826, + "step": 24457 + }, + { + "epoch": 7.507059545733579, + "grad_norm": 0.22883851826190948, + "learning_rate": 1.5435270057694056e-05, + "loss": 1.7782, + "step": 24458 + }, + { + "epoch": 7.5073664825046045, + "grad_norm": 0.17109328508377075, + "learning_rate": 1.543167863601687e-05, + "loss": 1.7435, + "step": 24459 + }, + { + "epoch": 7.507673419275629, + "grad_norm": 0.21545098721981049, + "learning_rate": 1.54280875559641e-05, + "loss": 1.7277, + "step": 24460 + }, + { + "epoch": 7.507980356046654, + "grad_norm": 0.18345774710178375, + "learning_rate": 1.542449681757121e-05, + "loss": 1.7255, + "step": 24461 + }, + { + "epoch": 7.50828729281768, + "grad_norm": 0.15472757816314697, + "learning_rate": 1.5420906420873744e-05, + "loss": 1.6615, + "step": 24462 + }, + { + "epoch": 7.508594229588705, + "grad_norm": 0.2084251195192337, + "learning_rate": 1.5417316365907113e-05, + "loss": 1.6747, + "step": 24463 + }, + { + "epoch": 7.50890116635973, + "grad_norm": 0.19010984897613525, + "learning_rate": 1.5413726652706868e-05, + "loss": 1.7188, + "step": 24464 + }, + { + "epoch": 7.509208103130755, + "grad_norm": 0.22481444478034973, + "learning_rate": 1.5410137281308408e-05, + "loss": 1.8028, + "step": 24465 + }, + { + "epoch": 7.50951503990178, + "grad_norm": 0.22309516370296478, + "learning_rate": 1.5406548251747266e-05, + "loss": 1.7806, + "step": 24466 + }, + { + "epoch": 7.509821976672805, + "grad_norm": 0.19050204753875732, + "learning_rate": 1.540295956405889e-05, + "loss": 1.7188, + "step": 24467 + }, + { + "epoch": 7.510128913443831, + "grad_norm": 0.1956445276737213, + "learning_rate": 1.5399371218278745e-05, + "loss": 1.7468, + "step": 24468 + }, + { + "epoch": 7.510435850214856, + "grad_norm": 0.3492142856121063, + "learning_rate": 1.5395783214442294e-05, + "loss": 1.7502, + "step": 24469 + }, + { + "epoch": 7.510742786985881, + "grad_norm": 0.15318654477596283, + "learning_rate": 1.5392195552584997e-05, + "loss": 
1.6782, + "step": 24470 + }, + { + "epoch": 7.511049723756906, + "grad_norm": 0.18576723337173462, + "learning_rate": 1.5388608232742308e-05, + "loss": 1.7455, + "step": 24471 + }, + { + "epoch": 7.511356660527931, + "grad_norm": 0.14923253655433655, + "learning_rate": 1.5385021254949677e-05, + "loss": 1.687, + "step": 24472 + }, + { + "epoch": 7.5116635972989565, + "grad_norm": 0.17453742027282715, + "learning_rate": 1.5381434619242553e-05, + "loss": 1.7072, + "step": 24473 + }, + { + "epoch": 7.511970534069982, + "grad_norm": 0.18869875371456146, + "learning_rate": 1.5377848325656384e-05, + "loss": 1.7681, + "step": 24474 + }, + { + "epoch": 7.512277470841006, + "grad_norm": 0.22205953299999237, + "learning_rate": 1.5374262374226612e-05, + "loss": 1.7526, + "step": 24475 + }, + { + "epoch": 7.512584407612032, + "grad_norm": 0.1634155809879303, + "learning_rate": 1.537067676498867e-05, + "loss": 1.704, + "step": 24476 + }, + { + "epoch": 7.512891344383057, + "grad_norm": 0.19530873000621796, + "learning_rate": 1.5367091497978004e-05, + "loss": 1.7469, + "step": 24477 + }, + { + "epoch": 7.513198281154082, + "grad_norm": 0.17038139700889587, + "learning_rate": 1.5363506573230017e-05, + "loss": 1.6363, + "step": 24478 + }, + { + "epoch": 7.513505217925108, + "grad_norm": 0.17695361375808716, + "learning_rate": 1.535992199078019e-05, + "loss": 1.7191, + "step": 24479 + }, + { + "epoch": 7.513812154696133, + "grad_norm": 0.2216692715883255, + "learning_rate": 1.535633775066389e-05, + "loss": 1.8042, + "step": 24480 + }, + { + "epoch": 7.514119091467157, + "grad_norm": 0.16862058639526367, + "learning_rate": 1.5352753852916595e-05, + "loss": 1.697, + "step": 24481 + }, + { + "epoch": 7.514426028238183, + "grad_norm": 0.20376496016979218, + "learning_rate": 1.5349170297573662e-05, + "loss": 1.7274, + "step": 24482 + }, + { + "epoch": 7.514732965009208, + "grad_norm": 0.16290763020515442, + "learning_rate": 1.5345587084670554e-05, + "loss": 1.6929, + "step": 24483 + }, + 
{ + "epoch": 7.515039901780233, + "grad_norm": 0.21416328847408295, + "learning_rate": 1.5342004214242667e-05, + "loss": 1.756, + "step": 24484 + }, + { + "epoch": 7.515346838551259, + "grad_norm": 0.14708222448825836, + "learning_rate": 1.533842168632541e-05, + "loss": 1.6816, + "step": 24485 + }, + { + "epoch": 7.515653775322283, + "grad_norm": 0.1860494166612625, + "learning_rate": 1.5334839500954178e-05, + "loss": 1.7114, + "step": 24486 + }, + { + "epoch": 7.5159607120933085, + "grad_norm": 0.16551998257637024, + "learning_rate": 1.533125765816439e-05, + "loss": 1.6564, + "step": 24487 + }, + { + "epoch": 7.516267648864334, + "grad_norm": 0.16971731185913086, + "learning_rate": 1.5327676157991428e-05, + "loss": 1.6722, + "step": 24488 + }, + { + "epoch": 7.516574585635359, + "grad_norm": 0.17433905601501465, + "learning_rate": 1.532409500047069e-05, + "loss": 1.6944, + "step": 24489 + }, + { + "epoch": 7.5168815224063845, + "grad_norm": 0.15625490248203278, + "learning_rate": 1.5320514185637575e-05, + "loss": 1.6997, + "step": 24490 + }, + { + "epoch": 7.51718845917741, + "grad_norm": 0.19038623571395874, + "learning_rate": 1.531693371352746e-05, + "loss": 1.6999, + "step": 24491 + }, + { + "epoch": 7.517495395948434, + "grad_norm": 0.16037517786026, + "learning_rate": 1.5313353584175736e-05, + "loss": 1.6568, + "step": 24492 + }, + { + "epoch": 7.51780233271946, + "grad_norm": 0.1515430361032486, + "learning_rate": 1.5309773797617787e-05, + "loss": 1.693, + "step": 24493 + }, + { + "epoch": 7.518109269490485, + "grad_norm": 0.1792028695344925, + "learning_rate": 1.530619435388898e-05, + "loss": 1.7034, + "step": 24494 + }, + { + "epoch": 7.51841620626151, + "grad_norm": 0.18456964194774628, + "learning_rate": 1.530261525302468e-05, + "loss": 1.7565, + "step": 24495 + }, + { + "epoch": 7.518723143032536, + "grad_norm": 0.17504090070724487, + "learning_rate": 1.529903649506031e-05, + "loss": 1.7121, + "step": 24496 + }, + { + "epoch": 7.51903007980356, + 
"grad_norm": 0.19688715040683746, + "learning_rate": 1.529545808003116e-05, + "loss": 1.7507, + "step": 24497 + }, + { + "epoch": 7.519337016574585, + "grad_norm": 0.21039338409900665, + "learning_rate": 1.529188000797267e-05, + "loss": 1.709, + "step": 24498 + }, + { + "epoch": 7.519643953345611, + "grad_norm": 0.18255522847175598, + "learning_rate": 1.5288302278920136e-05, + "loss": 1.7497, + "step": 24499 + }, + { + "epoch": 7.519950890116636, + "grad_norm": 0.19913412630558014, + "learning_rate": 1.5284724892908958e-05, + "loss": 1.7244, + "step": 24500 + }, + { + "epoch": 7.520257826887661, + "grad_norm": 0.15792223811149597, + "learning_rate": 1.5281147849974476e-05, + "loss": 1.6916, + "step": 24501 + }, + { + "epoch": 7.520564763658687, + "grad_norm": 0.2078406661748886, + "learning_rate": 1.5277571150152038e-05, + "loss": 1.6959, + "step": 24502 + }, + { + "epoch": 7.520871700429711, + "grad_norm": 0.15596020221710205, + "learning_rate": 1.5273994793477e-05, + "loss": 1.7217, + "step": 24503 + }, + { + "epoch": 7.5211786372007365, + "grad_norm": 0.18951189517974854, + "learning_rate": 1.527041877998469e-05, + "loss": 1.7322, + "step": 24504 + }, + { + "epoch": 7.521485573971762, + "grad_norm": 0.16445964574813843, + "learning_rate": 1.526684310971046e-05, + "loss": 1.6668, + "step": 24505 + }, + { + "epoch": 7.521792510742787, + "grad_norm": 0.19513604044914246, + "learning_rate": 1.5263267782689644e-05, + "loss": 1.7464, + "step": 24506 + }, + { + "epoch": 7.5220994475138125, + "grad_norm": 0.20289716124534607, + "learning_rate": 1.525969279895758e-05, + "loss": 1.7472, + "step": 24507 + }, + { + "epoch": 7.522406384284837, + "grad_norm": 0.1716226041316986, + "learning_rate": 1.5256118158549588e-05, + "loss": 1.6872, + "step": 24508 + }, + { + "epoch": 7.522713321055862, + "grad_norm": 0.18939872086048126, + "learning_rate": 1.5252543861501006e-05, + "loss": 1.7365, + "step": 24509 + }, + { + "epoch": 7.523020257826888, + "grad_norm": 
0.21382616460323334, + "learning_rate": 1.524896990784715e-05, + "loss": 1.7129, + "step": 24510 + }, + { + "epoch": 7.523327194597913, + "grad_norm": 0.18226614594459534, + "learning_rate": 1.5245396297623338e-05, + "loss": 1.7426, + "step": 24511 + }, + { + "epoch": 7.523634131368938, + "grad_norm": 0.15880146622657776, + "learning_rate": 1.5241823030864893e-05, + "loss": 1.6848, + "step": 24512 + }, + { + "epoch": 7.523941068139964, + "grad_norm": 0.1782255917787552, + "learning_rate": 1.5238250107607121e-05, + "loss": 1.7263, + "step": 24513 + }, + { + "epoch": 7.524248004910988, + "grad_norm": 0.20365844666957855, + "learning_rate": 1.5234677527885328e-05, + "loss": 1.7035, + "step": 24514 + }, + { + "epoch": 7.524554941682013, + "grad_norm": 0.1776183694601059, + "learning_rate": 1.5231105291734855e-05, + "loss": 1.6837, + "step": 24515 + }, + { + "epoch": 7.524861878453039, + "grad_norm": 0.14594987034797668, + "learning_rate": 1.5227533399190946e-05, + "loss": 1.6428, + "step": 24516 + }, + { + "epoch": 7.525168815224064, + "grad_norm": 0.19371397793293, + "learning_rate": 1.5223961850288947e-05, + "loss": 1.7108, + "step": 24517 + }, + { + "epoch": 7.525475751995089, + "grad_norm": 0.1695355474948883, + "learning_rate": 1.5220390645064148e-05, + "loss": 1.6777, + "step": 24518 + }, + { + "epoch": 7.525782688766114, + "grad_norm": 0.14815635979175568, + "learning_rate": 1.5216819783551828e-05, + "loss": 1.6967, + "step": 24519 + }, + { + "epoch": 7.526089625537139, + "grad_norm": 0.19655495882034302, + "learning_rate": 1.5213249265787283e-05, + "loss": 1.7358, + "step": 24520 + }, + { + "epoch": 7.526396562308165, + "grad_norm": 0.1817864030599594, + "learning_rate": 1.5209679091805795e-05, + "loss": 1.7132, + "step": 24521 + }, + { + "epoch": 7.52670349907919, + "grad_norm": 0.209315687417984, + "learning_rate": 1.5206109261642654e-05, + "loss": 1.7161, + "step": 24522 + }, + { + "epoch": 7.527010435850215, + "grad_norm": 0.18493252992630005, + 
"learning_rate": 1.520253977533313e-05, + "loss": 1.7136, + "step": 24523 + }, + { + "epoch": 7.52731737262124, + "grad_norm": 0.21916678547859192, + "learning_rate": 1.5198970632912508e-05, + "loss": 1.7464, + "step": 24524 + }, + { + "epoch": 7.527624309392265, + "grad_norm": 0.14470849931240082, + "learning_rate": 1.519540183441605e-05, + "loss": 1.6676, + "step": 24525 + }, + { + "epoch": 7.52793124616329, + "grad_norm": 0.20077016949653625, + "learning_rate": 1.5191833379879033e-05, + "loss": 1.7052, + "step": 24526 + }, + { + "epoch": 7.528238182934316, + "grad_norm": 0.17593151330947876, + "learning_rate": 1.5188265269336722e-05, + "loss": 1.7309, + "step": 24527 + }, + { + "epoch": 7.528545119705341, + "grad_norm": 0.20170791447162628, + "learning_rate": 1.518469750282438e-05, + "loss": 1.7335, + "step": 24528 + }, + { + "epoch": 7.5288520564763655, + "grad_norm": 0.1703701615333557, + "learning_rate": 1.518113008037726e-05, + "loss": 1.7141, + "step": 24529 + }, + { + "epoch": 7.529158993247391, + "grad_norm": 0.1897478848695755, + "learning_rate": 1.517756300203062e-05, + "loss": 1.7059, + "step": 24530 + }, + { + "epoch": 7.529465930018416, + "grad_norm": 0.17487141489982605, + "learning_rate": 1.5173996267819695e-05, + "loss": 1.7559, + "step": 24531 + }, + { + "epoch": 7.5297728667894415, + "grad_norm": 0.19167299568653107, + "learning_rate": 1.5170429877779785e-05, + "loss": 1.7287, + "step": 24532 + }, + { + "epoch": 7.530079803560467, + "grad_norm": 0.19433172047138214, + "learning_rate": 1.5166863831946072e-05, + "loss": 1.7182, + "step": 24533 + }, + { + "epoch": 7.530386740331492, + "grad_norm": 0.293734073638916, + "learning_rate": 1.5163298130353853e-05, + "loss": 1.7362, + "step": 24534 + }, + { + "epoch": 7.530693677102517, + "grad_norm": 0.18647685647010803, + "learning_rate": 1.515973277303831e-05, + "loss": 1.7271, + "step": 24535 + }, + { + "epoch": 7.531000613873542, + "grad_norm": 0.20918485522270203, + "learning_rate": 
1.5156167760034729e-05, + "loss": 1.7225, + "step": 24536 + }, + { + "epoch": 7.531307550644567, + "grad_norm": 0.22056303918361664, + "learning_rate": 1.5152603091378315e-05, + "loss": 1.6524, + "step": 24537 + }, + { + "epoch": 7.531614487415593, + "grad_norm": 0.13695760071277618, + "learning_rate": 1.5149038767104307e-05, + "loss": 1.6639, + "step": 24538 + }, + { + "epoch": 7.531921424186618, + "grad_norm": 0.25396111607551575, + "learning_rate": 1.514547478724792e-05, + "loss": 1.7025, + "step": 24539 + }, + { + "epoch": 7.532228360957642, + "grad_norm": 0.18192961812019348, + "learning_rate": 1.5141911151844384e-05, + "loss": 1.7288, + "step": 24540 + }, + { + "epoch": 7.532535297728668, + "grad_norm": 0.24748951196670532, + "learning_rate": 1.5138347860928908e-05, + "loss": 1.7379, + "step": 24541 + }, + { + "epoch": 7.532842234499693, + "grad_norm": 0.1841045767068863, + "learning_rate": 1.5134784914536715e-05, + "loss": 1.7876, + "step": 24542 + }, + { + "epoch": 7.533149171270718, + "grad_norm": 0.21867021918296814, + "learning_rate": 1.5131222312703014e-05, + "loss": 1.7608, + "step": 24543 + }, + { + "epoch": 7.533456108041744, + "grad_norm": 0.1972149908542633, + "learning_rate": 1.512766005546301e-05, + "loss": 1.6927, + "step": 24544 + }, + { + "epoch": 7.533763044812769, + "grad_norm": 0.1728486567735672, + "learning_rate": 1.5124098142851906e-05, + "loss": 1.7656, + "step": 24545 + }, + { + "epoch": 7.5340699815837935, + "grad_norm": 0.2591659724712372, + "learning_rate": 1.512053657490491e-05, + "loss": 1.6844, + "step": 24546 + }, + { + "epoch": 7.534376918354819, + "grad_norm": 0.17187906801700592, + "learning_rate": 1.5116975351657215e-05, + "loss": 1.707, + "step": 24547 + }, + { + "epoch": 7.534683855125844, + "grad_norm": 0.26111504435539246, + "learning_rate": 1.5113414473143993e-05, + "loss": 1.7273, + "step": 24548 + }, + { + "epoch": 7.5349907918968695, + "grad_norm": 0.2153446227312088, + "learning_rate": 1.5109853939400498e-05, + 
"loss": 1.7458, + "step": 24549 + }, + { + "epoch": 7.535297728667894, + "grad_norm": 0.20768530666828156, + "learning_rate": 1.5106293750461835e-05, + "loss": 1.749, + "step": 24550 + }, + { + "epoch": 7.535604665438919, + "grad_norm": 0.2211574763059616, + "learning_rate": 1.5102733906363264e-05, + "loss": 1.7236, + "step": 24551 + }, + { + "epoch": 7.535911602209945, + "grad_norm": 0.15983305871486664, + "learning_rate": 1.5099174407139905e-05, + "loss": 1.6682, + "step": 24552 + }, + { + "epoch": 7.53621853898097, + "grad_norm": 0.23821383714675903, + "learning_rate": 1.5095615252826967e-05, + "loss": 1.7173, + "step": 24553 + }, + { + "epoch": 7.536525475751995, + "grad_norm": 0.1726350039243698, + "learning_rate": 1.5092056443459624e-05, + "loss": 1.7566, + "step": 24554 + }, + { + "epoch": 7.536832412523021, + "grad_norm": 0.19859814643859863, + "learning_rate": 1.5088497979073035e-05, + "loss": 1.7005, + "step": 24555 + }, + { + "epoch": 7.537139349294045, + "grad_norm": 0.14776331186294556, + "learning_rate": 1.508493985970239e-05, + "loss": 1.68, + "step": 24556 + }, + { + "epoch": 7.53744628606507, + "grad_norm": 0.20928993821144104, + "learning_rate": 1.50813820853828e-05, + "loss": 1.7536, + "step": 24557 + }, + { + "epoch": 7.537753222836096, + "grad_norm": 0.18914662301540375, + "learning_rate": 1.5077824656149475e-05, + "loss": 1.7476, + "step": 24558 + }, + { + "epoch": 7.538060159607121, + "grad_norm": 0.24415937066078186, + "learning_rate": 1.5074267572037554e-05, + "loss": 1.7225, + "step": 24559 + }, + { + "epoch": 7.538367096378146, + "grad_norm": 0.18504458665847778, + "learning_rate": 1.5070710833082196e-05, + "loss": 1.7028, + "step": 24560 + }, + { + "epoch": 7.538674033149171, + "grad_norm": 0.1846696138381958, + "learning_rate": 1.5067154439318542e-05, + "loss": 1.7204, + "step": 24561 + }, + { + "epoch": 7.538980969920196, + "grad_norm": 0.20846717059612274, + "learning_rate": 1.5063598390781747e-05, + "loss": 1.73, + "step": 24562 + }, 
+ { + "epoch": 7.5392879066912215, + "grad_norm": 0.1950647234916687, + "learning_rate": 1.5060042687506943e-05, + "loss": 1.7008, + "step": 24563 + }, + { + "epoch": 7.539594843462247, + "grad_norm": 0.1880638748407364, + "learning_rate": 1.5056487329529278e-05, + "loss": 1.6965, + "step": 24564 + }, + { + "epoch": 7.539901780233272, + "grad_norm": 0.24405652284622192, + "learning_rate": 1.5052932316883872e-05, + "loss": 1.7407, + "step": 24565 + }, + { + "epoch": 7.5402087170042975, + "grad_norm": 0.15719062089920044, + "learning_rate": 1.5049377649605906e-05, + "loss": 1.6613, + "step": 24566 + }, + { + "epoch": 7.540515653775322, + "grad_norm": 0.20888090133666992, + "learning_rate": 1.5045823327730441e-05, + "loss": 1.7805, + "step": 24567 + }, + { + "epoch": 7.540822590546347, + "grad_norm": 0.1656443029642105, + "learning_rate": 1.504226935129267e-05, + "loss": 1.7047, + "step": 24568 + }, + { + "epoch": 7.541129527317373, + "grad_norm": 0.28847959637641907, + "learning_rate": 1.503871572032765e-05, + "loss": 1.8711, + "step": 24569 + }, + { + "epoch": 7.541436464088398, + "grad_norm": 0.1724858433008194, + "learning_rate": 1.5035162434870548e-05, + "loss": 1.6734, + "step": 24570 + }, + { + "epoch": 7.541743400859423, + "grad_norm": 0.2064351737499237, + "learning_rate": 1.5031609494956484e-05, + "loss": 1.7032, + "step": 24571 + }, + { + "epoch": 7.542050337630448, + "grad_norm": 0.175388365983963, + "learning_rate": 1.5028056900620513e-05, + "loss": 1.6606, + "step": 24572 + }, + { + "epoch": 7.542357274401473, + "grad_norm": 0.20802471041679382, + "learning_rate": 1.5024504651897814e-05, + "loss": 1.7324, + "step": 24573 + }, + { + "epoch": 7.542664211172498, + "grad_norm": 0.187152698636055, + "learning_rate": 1.502095274882343e-05, + "loss": 1.7222, + "step": 24574 + }, + { + "epoch": 7.542971147943524, + "grad_norm": 0.20112092792987823, + "learning_rate": 1.5017401191432511e-05, + "loss": 1.6959, + "step": 24575 + }, + { + "epoch": 7.543278084714549, 
+ "grad_norm": 0.17968857288360596, + "learning_rate": 1.5013849979760136e-05, + "loss": 1.6957, + "step": 24576 + }, + { + "epoch": 7.543585021485574, + "grad_norm": 0.20532584190368652, + "learning_rate": 1.5010299113841397e-05, + "loss": 1.7471, + "step": 24577 + }, + { + "epoch": 7.543891958256599, + "grad_norm": 0.16475969552993774, + "learning_rate": 1.5006748593711394e-05, + "loss": 1.7665, + "step": 24578 + }, + { + "epoch": 7.544198895027624, + "grad_norm": 0.17632076144218445, + "learning_rate": 1.5003198419405213e-05, + "loss": 1.7317, + "step": 24579 + }, + { + "epoch": 7.5445058317986495, + "grad_norm": 0.18197286128997803, + "learning_rate": 1.4999648590957937e-05, + "loss": 1.7278, + "step": 24580 + }, + { + "epoch": 7.544812768569675, + "grad_norm": 0.18043744564056396, + "learning_rate": 1.4996099108404648e-05, + "loss": 1.7335, + "step": 24581 + }, + { + "epoch": 7.5451197053407, + "grad_norm": 0.17072297632694244, + "learning_rate": 1.4992549971780407e-05, + "loss": 1.7236, + "step": 24582 + }, + { + "epoch": 7.545426642111725, + "grad_norm": 0.17413046956062317, + "learning_rate": 1.4989001181120338e-05, + "loss": 1.6794, + "step": 24583 + }, + { + "epoch": 7.54573357888275, + "grad_norm": 0.1684887856245041, + "learning_rate": 1.4985452736459443e-05, + "loss": 1.718, + "step": 24584 + }, + { + "epoch": 7.546040515653775, + "grad_norm": 0.19497069716453552, + "learning_rate": 1.4981904637832866e-05, + "loss": 1.7323, + "step": 24585 + }, + { + "epoch": 7.546347452424801, + "grad_norm": 0.24838820099830627, + "learning_rate": 1.4978356885275596e-05, + "loss": 1.7584, + "step": 24586 + }, + { + "epoch": 7.546654389195826, + "grad_norm": 0.20870071649551392, + "learning_rate": 1.4974809478822749e-05, + "loss": 1.738, + "step": 24587 + }, + { + "epoch": 7.546961325966851, + "grad_norm": 0.21980242431163788, + "learning_rate": 1.497126241850938e-05, + "loss": 1.763, + "step": 24588 + }, + { + "epoch": 7.547268262737876, + "grad_norm": 
0.2156188189983368, + "learning_rate": 1.4967715704370488e-05, + "loss": 1.7357, + "step": 24589 + }, + { + "epoch": 7.547575199508901, + "grad_norm": 0.1864207684993744, + "learning_rate": 1.4964169336441202e-05, + "loss": 1.676, + "step": 24590 + }, + { + "epoch": 7.547882136279926, + "grad_norm": 0.18940003216266632, + "learning_rate": 1.4960623314756494e-05, + "loss": 1.7614, + "step": 24591 + }, + { + "epoch": 7.548189073050952, + "grad_norm": 0.19220350682735443, + "learning_rate": 1.4957077639351463e-05, + "loss": 1.7266, + "step": 24592 + }, + { + "epoch": 7.548496009821976, + "grad_norm": 0.15492811799049377, + "learning_rate": 1.4953532310261126e-05, + "loss": 1.7359, + "step": 24593 + }, + { + "epoch": 7.5488029465930016, + "grad_norm": 0.25591567158699036, + "learning_rate": 1.4949987327520526e-05, + "loss": 1.7, + "step": 24594 + }, + { + "epoch": 7.549109883364027, + "grad_norm": 0.18157868087291718, + "learning_rate": 1.4946442691164697e-05, + "loss": 1.7204, + "step": 24595 + }, + { + "epoch": 7.549416820135052, + "grad_norm": 0.17679910361766815, + "learning_rate": 1.4942898401228662e-05, + "loss": 1.6871, + "step": 24596 + }, + { + "epoch": 7.5497237569060776, + "grad_norm": 0.2000853717327118, + "learning_rate": 1.4939354457747456e-05, + "loss": 1.7186, + "step": 24597 + }, + { + "epoch": 7.550030693677103, + "grad_norm": 0.19947710633277893, + "learning_rate": 1.49358108607561e-05, + "loss": 1.6853, + "step": 24598 + }, + { + "epoch": 7.550337630448127, + "grad_norm": 0.16325148940086365, + "learning_rate": 1.4932267610289596e-05, + "loss": 1.7027, + "step": 24599 + }, + { + "epoch": 7.550644567219153, + "grad_norm": 0.22839638590812683, + "learning_rate": 1.4928724706383007e-05, + "loss": 1.7887, + "step": 24600 + }, + { + "epoch": 7.550951503990178, + "grad_norm": 0.16242358088493347, + "learning_rate": 1.4925182149071286e-05, + "loss": 1.6617, + "step": 24601 + }, + { + "epoch": 7.551258440761203, + "grad_norm": 0.1674090027809143, + 
"learning_rate": 1.4921639938389504e-05, + "loss": 1.656, + "step": 24602 + }, + { + "epoch": 7.551565377532229, + "grad_norm": 0.1628156453371048, + "learning_rate": 1.4918098074372605e-05, + "loss": 1.683, + "step": 24603 + }, + { + "epoch": 7.551872314303253, + "grad_norm": 0.19156567752361298, + "learning_rate": 1.4914556557055637e-05, + "loss": 1.7174, + "step": 24604 + }, + { + "epoch": 7.5521792510742785, + "grad_norm": 0.19634003937244415, + "learning_rate": 1.4911015386473603e-05, + "loss": 1.6605, + "step": 24605 + }, + { + "epoch": 7.552486187845304, + "grad_norm": 0.19273599982261658, + "learning_rate": 1.490747456266145e-05, + "loss": 1.7092, + "step": 24606 + }, + { + "epoch": 7.552793124616329, + "grad_norm": 0.23641756176948547, + "learning_rate": 1.4903934085654231e-05, + "loss": 1.7524, + "step": 24607 + }, + { + "epoch": 7.5531000613873545, + "grad_norm": 0.19623206555843353, + "learning_rate": 1.490039395548688e-05, + "loss": 1.7281, + "step": 24608 + }, + { + "epoch": 7.55340699815838, + "grad_norm": 0.1978278011083603, + "learning_rate": 1.489685417219442e-05, + "loss": 1.7099, + "step": 24609 + }, + { + "epoch": 7.553713934929404, + "grad_norm": 0.19635866582393646, + "learning_rate": 1.489331473581182e-05, + "loss": 1.7146, + "step": 24610 + }, + { + "epoch": 7.55402087170043, + "grad_norm": 0.2121066302061081, + "learning_rate": 1.4889775646374065e-05, + "loss": 1.7598, + "step": 24611 + }, + { + "epoch": 7.554327808471455, + "grad_norm": 0.17944596707820892, + "learning_rate": 1.4886236903916122e-05, + "loss": 1.6778, + "step": 24612 + }, + { + "epoch": 7.55463474524248, + "grad_norm": 0.15834666788578033, + "learning_rate": 1.488269850847297e-05, + "loss": 1.6498, + "step": 24613 + }, + { + "epoch": 7.554941682013506, + "grad_norm": 0.18597754836082458, + "learning_rate": 1.4879160460079573e-05, + "loss": 1.7145, + "step": 24614 + }, + { + "epoch": 7.55524861878453, + "grad_norm": 0.18300876021385193, + "learning_rate": 
1.4875622758770897e-05, + "loss": 1.7253, + "step": 24615 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 0.17805244028568268, + "learning_rate": 1.4872085404581887e-05, + "loss": 1.7152, + "step": 24616 + }, + { + "epoch": 7.555862492326581, + "grad_norm": 0.1987949162721634, + "learning_rate": 1.486854839754755e-05, + "loss": 1.7501, + "step": 24617 + }, + { + "epoch": 7.556169429097606, + "grad_norm": 0.17301858961582184, + "learning_rate": 1.4865011737702777e-05, + "loss": 1.7122, + "step": 24618 + }, + { + "epoch": 7.556476365868631, + "grad_norm": 0.180507093667984, + "learning_rate": 1.4861475425082583e-05, + "loss": 1.7192, + "step": 24619 + }, + { + "epoch": 7.556783302639657, + "grad_norm": 0.16658489406108856, + "learning_rate": 1.4857939459721854e-05, + "loss": 1.6879, + "step": 24620 + }, + { + "epoch": 7.557090239410681, + "grad_norm": 0.19498902559280396, + "learning_rate": 1.4854403841655578e-05, + "loss": 1.7395, + "step": 24621 + }, + { + "epoch": 7.5573971761817065, + "grad_norm": 0.1737620085477829, + "learning_rate": 1.4850868570918702e-05, + "loss": 1.7029, + "step": 24622 + }, + { + "epoch": 7.557704112952732, + "grad_norm": 0.1600165218114853, + "learning_rate": 1.4847333647546113e-05, + "loss": 1.7194, + "step": 24623 + }, + { + "epoch": 7.558011049723757, + "grad_norm": 0.18392407894134521, + "learning_rate": 1.4843799071572806e-05, + "loss": 1.6838, + "step": 24624 + }, + { + "epoch": 7.558317986494782, + "grad_norm": 0.19074605405330658, + "learning_rate": 1.4840264843033651e-05, + "loss": 1.7069, + "step": 24625 + }, + { + "epoch": 7.558624923265807, + "grad_norm": 0.18156903982162476, + "learning_rate": 1.4836730961963619e-05, + "loss": 1.6494, + "step": 24626 + }, + { + "epoch": 7.558931860036832, + "grad_norm": 0.16716471314430237, + "learning_rate": 1.4833197428397627e-05, + "loss": 1.7516, + "step": 24627 + }, + { + "epoch": 7.559238796807858, + "grad_norm": 0.18882833421230316, + "learning_rate": 1.4829664242370588e-05, + 
"loss": 1.7117, + "step": 24628 + }, + { + "epoch": 7.559545733578883, + "grad_norm": 0.19933676719665527, + "learning_rate": 1.482613140391742e-05, + "loss": 1.6928, + "step": 24629 + }, + { + "epoch": 7.559852670349908, + "grad_norm": 0.15574946999549866, + "learning_rate": 1.4822598913073039e-05, + "loss": 1.702, + "step": 24630 + }, + { + "epoch": 7.560159607120933, + "grad_norm": 0.1953001618385315, + "learning_rate": 1.4819066769872353e-05, + "loss": 1.75, + "step": 24631 + }, + { + "epoch": 7.560466543891958, + "grad_norm": 0.18364208936691284, + "learning_rate": 1.481553497435027e-05, + "loss": 1.6697, + "step": 24632 + }, + { + "epoch": 7.560773480662983, + "grad_norm": 0.16670002043247223, + "learning_rate": 1.4812003526541673e-05, + "loss": 1.6919, + "step": 24633 + }, + { + "epoch": 7.561080417434009, + "grad_norm": 0.19388388097286224, + "learning_rate": 1.4808472426481518e-05, + "loss": 1.7412, + "step": 24634 + }, + { + "epoch": 7.561387354205034, + "grad_norm": 0.19203592836856842, + "learning_rate": 1.4804941674204631e-05, + "loss": 1.7128, + "step": 24635 + }, + { + "epoch": 7.5616942909760585, + "grad_norm": 0.18893340229988098, + "learning_rate": 1.4801411269745974e-05, + "loss": 1.7018, + "step": 24636 + }, + { + "epoch": 7.562001227747084, + "grad_norm": 0.1825447529554367, + "learning_rate": 1.4797881213140363e-05, + "loss": 1.7216, + "step": 24637 + }, + { + "epoch": 7.562308164518109, + "grad_norm": 0.19031697511672974, + "learning_rate": 1.4794351504422743e-05, + "loss": 1.7479, + "step": 24638 + }, + { + "epoch": 7.5626151012891345, + "grad_norm": 0.18328487873077393, + "learning_rate": 1.4790822143627991e-05, + "loss": 1.7222, + "step": 24639 + }, + { + "epoch": 7.56292203806016, + "grad_norm": 0.17531271278858185, + "learning_rate": 1.4787293130790941e-05, + "loss": 1.7197, + "step": 24640 + }, + { + "epoch": 7.563228974831185, + "grad_norm": 0.17078469693660736, + "learning_rate": 1.4783764465946526e-05, + "loss": 1.7715, + "step": 
24641 + }, + { + "epoch": 7.56353591160221, + "grad_norm": 0.1859765648841858, + "learning_rate": 1.4780236149129567e-05, + "loss": 1.698, + "step": 24642 + }, + { + "epoch": 7.563842848373235, + "grad_norm": 0.18488194048404694, + "learning_rate": 1.4776708180374965e-05, + "loss": 1.6943, + "step": 24643 + }, + { + "epoch": 7.56414978514426, + "grad_norm": 0.1741705685853958, + "learning_rate": 1.4773180559717586e-05, + "loss": 1.6966, + "step": 24644 + }, + { + "epoch": 7.564456721915286, + "grad_norm": 0.20310313999652863, + "learning_rate": 1.476965328719228e-05, + "loss": 1.7572, + "step": 24645 + }, + { + "epoch": 7.564763658686311, + "grad_norm": 0.20557743310928345, + "learning_rate": 1.476612636283391e-05, + "loss": 1.7419, + "step": 24646 + }, + { + "epoch": 7.565070595457335, + "grad_norm": 0.20597940683364868, + "learning_rate": 1.4762599786677329e-05, + "loss": 1.7147, + "step": 24647 + }, + { + "epoch": 7.565377532228361, + "grad_norm": 0.21609526872634888, + "learning_rate": 1.4759073558757391e-05, + "loss": 1.7678, + "step": 24648 + }, + { + "epoch": 7.565684468999386, + "grad_norm": 0.2233472615480423, + "learning_rate": 1.4755547679108945e-05, + "loss": 1.7381, + "step": 24649 + }, + { + "epoch": 7.565991405770411, + "grad_norm": 0.19561493396759033, + "learning_rate": 1.4752022147766814e-05, + "loss": 1.7254, + "step": 24650 + }, + { + "epoch": 7.566298342541437, + "grad_norm": 0.16491469740867615, + "learning_rate": 1.4748496964765896e-05, + "loss": 1.6834, + "step": 24651 + }, + { + "epoch": 7.566605279312462, + "grad_norm": 0.16946618258953094, + "learning_rate": 1.4744972130140955e-05, + "loss": 1.7154, + "step": 24652 + }, + { + "epoch": 7.5669122160834865, + "grad_norm": 0.1625654697418213, + "learning_rate": 1.4741447643926904e-05, + "loss": 1.6941, + "step": 24653 + }, + { + "epoch": 7.567219152854512, + "grad_norm": 0.16875535249710083, + "learning_rate": 1.4737923506158491e-05, + "loss": 1.6875, + "step": 24654 + }, + { + "epoch": 
7.567526089625537, + "grad_norm": 0.1625872105360031, + "learning_rate": 1.4734399716870607e-05, + "loss": 1.6558, + "step": 24655 + }, + { + "epoch": 7.5678330263965625, + "grad_norm": 0.17323140799999237, + "learning_rate": 1.4730876276098071e-05, + "loss": 1.7468, + "step": 24656 + }, + { + "epoch": 7.568139963167588, + "grad_norm": 0.18788693845272064, + "learning_rate": 1.472735318387566e-05, + "loss": 1.7345, + "step": 24657 + }, + { + "epoch": 7.568446899938612, + "grad_norm": 0.18096889555454254, + "learning_rate": 1.472383044023824e-05, + "loss": 1.725, + "step": 24658 + }, + { + "epoch": 7.568753836709638, + "grad_norm": 0.2327791154384613, + "learning_rate": 1.4720308045220577e-05, + "loss": 1.7367, + "step": 24659 + }, + { + "epoch": 7.569060773480663, + "grad_norm": 0.187728151679039, + "learning_rate": 1.4716785998857525e-05, + "loss": 1.6967, + "step": 24660 + }, + { + "epoch": 7.569367710251688, + "grad_norm": 0.18520617485046387, + "learning_rate": 1.4713264301183876e-05, + "loss": 1.6576, + "step": 24661 + }, + { + "epoch": 7.569674647022714, + "grad_norm": 0.20537808537483215, + "learning_rate": 1.4709742952234428e-05, + "loss": 1.6911, + "step": 24662 + }, + { + "epoch": 7.569981583793739, + "grad_norm": 0.18872039020061493, + "learning_rate": 1.4706221952043986e-05, + "loss": 1.745, + "step": 24663 + }, + { + "epoch": 7.570288520564763, + "grad_norm": 0.16083933413028717, + "learning_rate": 1.4702701300647343e-05, + "loss": 1.6875, + "step": 24664 + }, + { + "epoch": 7.570595457335789, + "grad_norm": 0.19390366971492767, + "learning_rate": 1.4699180998079293e-05, + "loss": 1.6996, + "step": 24665 + }, + { + "epoch": 7.570902394106814, + "grad_norm": 0.20478816330432892, + "learning_rate": 1.4695661044374632e-05, + "loss": 1.7359, + "step": 24666 + }, + { + "epoch": 7.571209330877839, + "grad_norm": 0.17485570907592773, + "learning_rate": 1.4692141439568136e-05, + "loss": 1.696, + "step": 24667 + }, + { + "epoch": 7.571516267648864, + 
"grad_norm": 0.18266968429088593, + "learning_rate": 1.4688622183694594e-05, + "loss": 1.713, + "step": 24668 + }, + { + "epoch": 7.571823204419889, + "grad_norm": 0.14412200450897217, + "learning_rate": 1.468510327678877e-05, + "loss": 1.6938, + "step": 24669 + }, + { + "epoch": 7.5721301411909145, + "grad_norm": 0.18144819140434265, + "learning_rate": 1.4681584718885488e-05, + "loss": 1.7523, + "step": 24670 + }, + { + "epoch": 7.57243707796194, + "grad_norm": 0.32198768854141235, + "learning_rate": 1.467806651001945e-05, + "loss": 1.71, + "step": 24671 + }, + { + "epoch": 7.572744014732965, + "grad_norm": 0.1535005122423172, + "learning_rate": 1.4674548650225483e-05, + "loss": 1.6912, + "step": 24672 + }, + { + "epoch": 7.5730509515039905, + "grad_norm": 0.17982423305511475, + "learning_rate": 1.4671031139538343e-05, + "loss": 1.6928, + "step": 24673 + }, + { + "epoch": 7.573357888275015, + "grad_norm": 0.16811783611774445, + "learning_rate": 1.4667513977992747e-05, + "loss": 1.6954, + "step": 24674 + }, + { + "epoch": 7.57366482504604, + "grad_norm": 0.18918997049331665, + "learning_rate": 1.4663997165623522e-05, + "loss": 1.6967, + "step": 24675 + }, + { + "epoch": 7.573971761817066, + "grad_norm": 0.16559816896915436, + "learning_rate": 1.4660480702465357e-05, + "loss": 1.7097, + "step": 24676 + }, + { + "epoch": 7.574278698588091, + "grad_norm": 0.20471042394638062, + "learning_rate": 1.4656964588553046e-05, + "loss": 1.7032, + "step": 24677 + }, + { + "epoch": 7.574585635359116, + "grad_norm": 0.16387851536273956, + "learning_rate": 1.4653448823921329e-05, + "loss": 1.7066, + "step": 24678 + }, + { + "epoch": 7.574892572130141, + "grad_norm": 0.19144418835639954, + "learning_rate": 1.4649933408604949e-05, + "loss": 1.7272, + "step": 24679 + }, + { + "epoch": 7.575199508901166, + "grad_norm": 0.17270216345787048, + "learning_rate": 1.4646418342638646e-05, + "loss": 1.7456, + "step": 24680 + }, + { + "epoch": 7.5755064456721914, + "grad_norm": 
0.1937440037727356, + "learning_rate": 1.4642903626057159e-05, + "loss": 1.6973, + "step": 24681 + }, + { + "epoch": 7.575813382443217, + "grad_norm": 0.18958482146263123, + "learning_rate": 1.463938925889522e-05, + "loss": 1.7549, + "step": 24682 + }, + { + "epoch": 7.576120319214242, + "grad_norm": 0.20584101974964142, + "learning_rate": 1.4635875241187558e-05, + "loss": 1.7013, + "step": 24683 + }, + { + "epoch": 7.5764272559852675, + "grad_norm": 0.22839057445526123, + "learning_rate": 1.463236157296891e-05, + "loss": 1.7282, + "step": 24684 + }, + { + "epoch": 7.576734192756292, + "grad_norm": 0.19894570112228394, + "learning_rate": 1.4628848254273996e-05, + "loss": 1.7115, + "step": 24685 + }, + { + "epoch": 7.577041129527317, + "grad_norm": 0.1880837082862854, + "learning_rate": 1.4625335285137515e-05, + "loss": 1.6526, + "step": 24686 + }, + { + "epoch": 7.577348066298343, + "grad_norm": 0.21545001864433289, + "learning_rate": 1.4621822665594238e-05, + "loss": 1.6709, + "step": 24687 + }, + { + "epoch": 7.577655003069368, + "grad_norm": 0.2091502994298935, + "learning_rate": 1.4618310395678813e-05, + "loss": 1.6792, + "step": 24688 + }, + { + "epoch": 7.577961939840393, + "grad_norm": 0.2100556343793869, + "learning_rate": 1.4614798475426018e-05, + "loss": 1.7112, + "step": 24689 + }, + { + "epoch": 7.578268876611418, + "grad_norm": 0.17702727019786835, + "learning_rate": 1.4611286904870502e-05, + "loss": 1.6353, + "step": 24690 + }, + { + "epoch": 7.578575813382443, + "grad_norm": 0.1935967355966568, + "learning_rate": 1.4607775684046975e-05, + "loss": 1.6638, + "step": 24691 + }, + { + "epoch": 7.578882750153468, + "grad_norm": 0.13495506346225739, + "learning_rate": 1.4604264812990193e-05, + "loss": 1.6526, + "step": 24692 + }, + { + "epoch": 7.579189686924494, + "grad_norm": 0.20418134331703186, + "learning_rate": 1.4600754291734774e-05, + "loss": 1.731, + "step": 24693 + }, + { + "epoch": 7.579496623695519, + "grad_norm": 0.1541702151298523, + 
"learning_rate": 1.4597244120315467e-05, + "loss": 1.7047, + "step": 24694 + }, + { + "epoch": 7.579803560466544, + "grad_norm": 0.2106262892484665, + "learning_rate": 1.4593734298766942e-05, + "loss": 1.696, + "step": 24695 + }, + { + "epoch": 7.580110497237569, + "grad_norm": 0.15727077424526215, + "learning_rate": 1.4590224827123889e-05, + "loss": 1.6782, + "step": 24696 + }, + { + "epoch": 7.580417434008594, + "grad_norm": 0.19231721758842468, + "learning_rate": 1.4586715705420983e-05, + "loss": 1.7832, + "step": 24697 + }, + { + "epoch": 7.5807243707796195, + "grad_norm": 0.18290117383003235, + "learning_rate": 1.4583206933692916e-05, + "loss": 1.6715, + "step": 24698 + }, + { + "epoch": 7.581031307550645, + "grad_norm": 0.21551427245140076, + "learning_rate": 1.4579698511974355e-05, + "loss": 1.7326, + "step": 24699 + }, + { + "epoch": 7.581338244321669, + "grad_norm": 0.21561767160892487, + "learning_rate": 1.457619044029997e-05, + "loss": 1.6682, + "step": 24700 + }, + { + "epoch": 7.581645181092695, + "grad_norm": 0.15537963807582855, + "learning_rate": 1.457268271870444e-05, + "loss": 1.719, + "step": 24701 + }, + { + "epoch": 7.58195211786372, + "grad_norm": 0.18738612532615662, + "learning_rate": 1.456917534722242e-05, + "loss": 1.7415, + "step": 24702 + }, + { + "epoch": 7.582259054634745, + "grad_norm": 0.15522584319114685, + "learning_rate": 1.456566832588856e-05, + "loss": 1.6931, + "step": 24703 + }, + { + "epoch": 7.582565991405771, + "grad_norm": 0.192890003323555, + "learning_rate": 1.4562161654737567e-05, + "loss": 1.7726, + "step": 24704 + }, + { + "epoch": 7.582872928176796, + "grad_norm": 0.2163987159729004, + "learning_rate": 1.4558655333804028e-05, + "loss": 1.7459, + "step": 24705 + }, + { + "epoch": 7.58317986494782, + "grad_norm": 0.1635672152042389, + "learning_rate": 1.4555149363122667e-05, + "loss": 1.7407, + "step": 24706 + }, + { + "epoch": 7.583486801718846, + "grad_norm": 0.1858159899711609, + "learning_rate": 
1.4551643742728072e-05, + "loss": 1.7175, + "step": 24707 + }, + { + "epoch": 7.583793738489871, + "grad_norm": 0.23077011108398438, + "learning_rate": 1.4548138472654904e-05, + "loss": 1.7739, + "step": 24708 + }, + { + "epoch": 7.584100675260896, + "grad_norm": 0.22413180768489838, + "learning_rate": 1.4544633552937836e-05, + "loss": 1.7208, + "step": 24709 + }, + { + "epoch": 7.584407612031922, + "grad_norm": 0.16147246956825256, + "learning_rate": 1.4541128983611445e-05, + "loss": 1.7021, + "step": 24710 + }, + { + "epoch": 7.584714548802946, + "grad_norm": 0.17363815009593964, + "learning_rate": 1.4537624764710439e-05, + "loss": 1.6863, + "step": 24711 + }, + { + "epoch": 7.5850214855739715, + "grad_norm": 0.14971798658370972, + "learning_rate": 1.4534120896269377e-05, + "loss": 1.655, + "step": 24712 + }, + { + "epoch": 7.585328422344997, + "grad_norm": 0.15934213995933533, + "learning_rate": 1.4530617378322937e-05, + "loss": 1.6771, + "step": 24713 + }, + { + "epoch": 7.585635359116022, + "grad_norm": 0.17807291448116302, + "learning_rate": 1.4527114210905724e-05, + "loss": 1.7419, + "step": 24714 + }, + { + "epoch": 7.5859422958870475, + "grad_norm": 0.1727002114057541, + "learning_rate": 1.4523611394052356e-05, + "loss": 1.7232, + "step": 24715 + }, + { + "epoch": 7.586249232658073, + "grad_norm": 0.1625738888978958, + "learning_rate": 1.452010892779746e-05, + "loss": 1.6967, + "step": 24716 + }, + { + "epoch": 7.586556169429097, + "grad_norm": 0.2153816670179367, + "learning_rate": 1.4516606812175636e-05, + "loss": 1.7339, + "step": 24717 + }, + { + "epoch": 7.586863106200123, + "grad_norm": 0.19343912601470947, + "learning_rate": 1.451310504722151e-05, + "loss": 1.7059, + "step": 24718 + }, + { + "epoch": 7.587170042971148, + "grad_norm": 0.16220279037952423, + "learning_rate": 1.450960363296967e-05, + "loss": 1.6825, + "step": 24719 + }, + { + "epoch": 7.587476979742173, + "grad_norm": 0.1678459346294403, + "learning_rate": 1.4506102569454716e-05, + 
"loss": 1.728, + "step": 24720 + }, + { + "epoch": 7.587783916513199, + "grad_norm": 0.19833502173423767, + "learning_rate": 1.4502601856711295e-05, + "loss": 1.7733, + "step": 24721 + }, + { + "epoch": 7.588090853284223, + "grad_norm": 0.1593111902475357, + "learning_rate": 1.4499101494773931e-05, + "loss": 1.7017, + "step": 24722 + }, + { + "epoch": 7.588397790055248, + "grad_norm": 0.2083328664302826, + "learning_rate": 1.449560148367729e-05, + "loss": 1.7661, + "step": 24723 + }, + { + "epoch": 7.588704726826274, + "grad_norm": 0.19797182083129883, + "learning_rate": 1.4492101823455906e-05, + "loss": 1.788, + "step": 24724 + }, + { + "epoch": 7.589011663597299, + "grad_norm": 0.15613096952438354, + "learning_rate": 1.4488602514144373e-05, + "loss": 1.7295, + "step": 24725 + }, + { + "epoch": 7.589318600368324, + "grad_norm": 0.18078529834747314, + "learning_rate": 1.4485103555777307e-05, + "loss": 1.7165, + "step": 24726 + }, + { + "epoch": 7.58962553713935, + "grad_norm": 0.14951148629188538, + "learning_rate": 1.4481604948389238e-05, + "loss": 1.6431, + "step": 24727 + }, + { + "epoch": 7.589932473910374, + "grad_norm": 0.19518490135669708, + "learning_rate": 1.4478106692014797e-05, + "loss": 1.7332, + "step": 24728 + }, + { + "epoch": 7.5902394106813995, + "grad_norm": 0.17438004910945892, + "learning_rate": 1.4474608786688493e-05, + "loss": 1.6677, + "step": 24729 + }, + { + "epoch": 7.590546347452425, + "grad_norm": 0.2767544090747833, + "learning_rate": 1.4471111232444944e-05, + "loss": 1.7649, + "step": 24730 + }, + { + "epoch": 7.59085328422345, + "grad_norm": 0.21649987995624542, + "learning_rate": 1.4467614029318699e-05, + "loss": 1.7349, + "step": 24731 + }, + { + "epoch": 7.5911602209944755, + "grad_norm": 0.26566463708877563, + "learning_rate": 1.4464117177344316e-05, + "loss": 1.7474, + "step": 24732 + }, + { + "epoch": 7.5914671577655, + "grad_norm": 0.19050925970077515, + "learning_rate": 1.4460620676556358e-05, + "loss": 1.7066, + "step": 24733 
+ }, + { + "epoch": 7.591774094536525, + "grad_norm": 0.20030665397644043, + "learning_rate": 1.4457124526989375e-05, + "loss": 1.6589, + "step": 24734 + }, + { + "epoch": 7.592081031307551, + "grad_norm": 0.18715742230415344, + "learning_rate": 1.4453628728677921e-05, + "loss": 1.7186, + "step": 24735 + }, + { + "epoch": 7.592387968078576, + "grad_norm": 0.241498664021492, + "learning_rate": 1.4450133281656542e-05, + "loss": 1.6686, + "step": 24736 + }, + { + "epoch": 7.592694904849601, + "grad_norm": 0.20305299758911133, + "learning_rate": 1.4446638185959765e-05, + "loss": 1.7351, + "step": 24737 + }, + { + "epoch": 7.593001841620627, + "grad_norm": 0.177521750330925, + "learning_rate": 1.444314344162218e-05, + "loss": 1.6383, + "step": 24738 + }, + { + "epoch": 7.593308778391651, + "grad_norm": 0.19877439737319946, + "learning_rate": 1.443964904867826e-05, + "loss": 1.7335, + "step": 24739 + }, + { + "epoch": 7.593615715162676, + "grad_norm": 0.16544201970100403, + "learning_rate": 1.4436155007162605e-05, + "loss": 1.6952, + "step": 24740 + }, + { + "epoch": 7.593922651933702, + "grad_norm": 0.20925499498844147, + "learning_rate": 1.443266131710969e-05, + "loss": 1.7042, + "step": 24741 + }, + { + "epoch": 7.594229588704727, + "grad_norm": 0.16688574850559235, + "learning_rate": 1.4429167978554054e-05, + "loss": 1.6797, + "step": 24742 + }, + { + "epoch": 7.5945365254757515, + "grad_norm": 0.2231293022632599, + "learning_rate": 1.4425674991530258e-05, + "loss": 1.8697, + "step": 24743 + }, + { + "epoch": 7.594843462246777, + "grad_norm": 0.2114260196685791, + "learning_rate": 1.442218235607276e-05, + "loss": 1.7404, + "step": 24744 + }, + { + "epoch": 7.595150399017802, + "grad_norm": 0.1842830628156662, + "learning_rate": 1.441869007221614e-05, + "loss": 1.7687, + "step": 24745 + }, + { + "epoch": 7.5954573357888275, + "grad_norm": 0.17780441045761108, + "learning_rate": 1.4415198139994846e-05, + "loss": 1.7492, + "step": 24746 + }, + { + "epoch": 
7.595764272559853, + "grad_norm": 0.18805068731307983, + "learning_rate": 1.4411706559443438e-05, + "loss": 1.757, + "step": 24747 + }, + { + "epoch": 7.596071209330878, + "grad_norm": 0.18918974697589874, + "learning_rate": 1.4408215330596403e-05, + "loss": 1.7006, + "step": 24748 + }, + { + "epoch": 7.596378146101903, + "grad_norm": 0.17850689589977264, + "learning_rate": 1.440472445348825e-05, + "loss": 1.6565, + "step": 24749 + }, + { + "epoch": 7.596685082872928, + "grad_norm": 0.20043544471263885, + "learning_rate": 1.4401233928153468e-05, + "loss": 1.7314, + "step": 24750 + }, + { + "epoch": 7.596992019643953, + "grad_norm": 0.1963229477405548, + "learning_rate": 1.4397743754626564e-05, + "loss": 1.6946, + "step": 24751 + }, + { + "epoch": 7.597298956414979, + "grad_norm": 0.2203695923089981, + "learning_rate": 1.4394253932942014e-05, + "loss": 1.7128, + "step": 24752 + }, + { + "epoch": 7.597605893186004, + "grad_norm": 0.19254128634929657, + "learning_rate": 1.4390764463134322e-05, + "loss": 1.6748, + "step": 24753 + }, + { + "epoch": 7.597912829957028, + "grad_norm": 0.19880495965480804, + "learning_rate": 1.438727534523795e-05, + "loss": 1.7155, + "step": 24754 + }, + { + "epoch": 7.598219766728054, + "grad_norm": 0.17486177384853363, + "learning_rate": 1.4383786579287428e-05, + "loss": 1.7484, + "step": 24755 + }, + { + "epoch": 7.598526703499079, + "grad_norm": 0.17247791588306427, + "learning_rate": 1.4380298165317168e-05, + "loss": 1.7225, + "step": 24756 + }, + { + "epoch": 7.598833640270104, + "grad_norm": 0.1802847534418106, + "learning_rate": 1.4376810103361714e-05, + "loss": 1.7009, + "step": 24757 + }, + { + "epoch": 7.59914057704113, + "grad_norm": 0.1934153437614441, + "learning_rate": 1.4373322393455485e-05, + "loss": 1.6957, + "step": 24758 + }, + { + "epoch": 7.599447513812155, + "grad_norm": 0.1508229374885559, + "learning_rate": 1.436983503563295e-05, + "loss": 1.6677, + "step": 24759 + }, + { + "epoch": 7.5997544505831796, + 
"grad_norm": 0.16684283316135406, + "learning_rate": 1.4366348029928623e-05, + "loss": 1.7394, + "step": 24760 + }, + { + "epoch": 7.600061387354205, + "grad_norm": 0.22492031753063202, + "learning_rate": 1.4362861376376896e-05, + "loss": 1.7302, + "step": 24761 + }, + { + "epoch": 7.60036832412523, + "grad_norm": 0.1654716283082962, + "learning_rate": 1.4359375075012294e-05, + "loss": 1.6487, + "step": 24762 + }, + { + "epoch": 7.600675260896256, + "grad_norm": 0.17514392733573914, + "learning_rate": 1.4355889125869198e-05, + "loss": 1.6952, + "step": 24763 + }, + { + "epoch": 7.600982197667281, + "grad_norm": 0.21000738441944122, + "learning_rate": 1.4352403528982123e-05, + "loss": 1.714, + "step": 24764 + }, + { + "epoch": 7.601289134438305, + "grad_norm": 0.18791960179805756, + "learning_rate": 1.4348918284385481e-05, + "loss": 1.7334, + "step": 24765 + }, + { + "epoch": 7.601596071209331, + "grad_norm": 0.267089307308197, + "learning_rate": 1.4345433392113734e-05, + "loss": 1.7567, + "step": 24766 + }, + { + "epoch": 7.601903007980356, + "grad_norm": 0.1814621239900589, + "learning_rate": 1.4341948852201304e-05, + "loss": 1.7031, + "step": 24767 + }, + { + "epoch": 7.602209944751381, + "grad_norm": 0.16144737601280212, + "learning_rate": 1.4338464664682639e-05, + "loss": 1.6844, + "step": 24768 + }, + { + "epoch": 7.602516881522407, + "grad_norm": 0.14824162423610687, + "learning_rate": 1.433498082959217e-05, + "loss": 1.6854, + "step": 24769 + }, + { + "epoch": 7.602823818293432, + "grad_norm": 0.1837405115365982, + "learning_rate": 1.4331497346964318e-05, + "loss": 1.7087, + "step": 24770 + }, + { + "epoch": 7.6031307550644565, + "grad_norm": 0.20706148445606232, + "learning_rate": 1.4328014216833508e-05, + "loss": 1.7816, + "step": 24771 + }, + { + "epoch": 7.603437691835482, + "grad_norm": 0.16134382784366608, + "learning_rate": 1.4324531439234196e-05, + "loss": 1.7095, + "step": 24772 + }, + { + "epoch": 7.603744628606507, + "grad_norm": 
0.15924426913261414, + "learning_rate": 1.4321049014200737e-05, + "loss": 1.7115, + "step": 24773 + }, + { + "epoch": 7.6040515653775325, + "grad_norm": 0.14942041039466858, + "learning_rate": 1.4317566941767625e-05, + "loss": 1.6872, + "step": 24774 + }, + { + "epoch": 7.604358502148557, + "grad_norm": 0.1646505445241928, + "learning_rate": 1.4314085221969209e-05, + "loss": 1.663, + "step": 24775 + }, + { + "epoch": 7.604665438919582, + "grad_norm": 0.17342600226402283, + "learning_rate": 1.4310603854839904e-05, + "loss": 1.7702, + "step": 24776 + }, + { + "epoch": 7.604972375690608, + "grad_norm": 0.17148490250110626, + "learning_rate": 1.4307122840414167e-05, + "loss": 1.7392, + "step": 24777 + }, + { + "epoch": 7.605279312461633, + "grad_norm": 0.22112305462360382, + "learning_rate": 1.4303642178726328e-05, + "loss": 1.6784, + "step": 24778 + }, + { + "epoch": 7.605586249232658, + "grad_norm": 0.22548529505729675, + "learning_rate": 1.4300161869810846e-05, + "loss": 1.7405, + "step": 24779 + }, + { + "epoch": 7.605893186003684, + "grad_norm": 0.179958313703537, + "learning_rate": 1.4296681913702065e-05, + "loss": 1.6848, + "step": 24780 + }, + { + "epoch": 7.606200122774708, + "grad_norm": 0.16872282326221466, + "learning_rate": 1.4293202310434407e-05, + "loss": 1.6973, + "step": 24781 + }, + { + "epoch": 7.606507059545733, + "grad_norm": 0.20554648339748383, + "learning_rate": 1.428972306004226e-05, + "loss": 1.7111, + "step": 24782 + }, + { + "epoch": 7.606813996316759, + "grad_norm": 0.1803034543991089, + "learning_rate": 1.4286244162559993e-05, + "loss": 1.6895, + "step": 24783 + }, + { + "epoch": 7.607120933087784, + "grad_norm": 0.18902915716171265, + "learning_rate": 1.4282765618021999e-05, + "loss": 1.766, + "step": 24784 + }, + { + "epoch": 7.607427869858809, + "grad_norm": 0.16692081093788147, + "learning_rate": 1.4279287426462646e-05, + "loss": 1.688, + "step": 24785 + }, + { + "epoch": 7.607734806629834, + "grad_norm": 0.1538083851337433, + 
"learning_rate": 1.4275809587916317e-05, + "loss": 1.6611, + "step": 24786 + }, + { + "epoch": 7.608041743400859, + "grad_norm": 0.1921710968017578, + "learning_rate": 1.4272332102417369e-05, + "loss": 1.7338, + "step": 24787 + }, + { + "epoch": 7.6083486801718845, + "grad_norm": 0.1812380999326706, + "learning_rate": 1.4268854970000167e-05, + "loss": 1.7613, + "step": 24788 + }, + { + "epoch": 7.60865561694291, + "grad_norm": 0.1762949675321579, + "learning_rate": 1.4265378190699108e-05, + "loss": 1.6796, + "step": 24789 + }, + { + "epoch": 7.608962553713935, + "grad_norm": 0.17698180675506592, + "learning_rate": 1.4261901764548497e-05, + "loss": 1.7065, + "step": 24790 + }, + { + "epoch": 7.6092694904849605, + "grad_norm": 0.18398644030094147, + "learning_rate": 1.4258425691582756e-05, + "loss": 1.7322, + "step": 24791 + }, + { + "epoch": 7.609576427255985, + "grad_norm": 0.18370044231414795, + "learning_rate": 1.425494997183618e-05, + "loss": 1.7565, + "step": 24792 + }, + { + "epoch": 7.60988336402701, + "grad_norm": 0.19615988433361053, + "learning_rate": 1.4251474605343124e-05, + "loss": 1.7507, + "step": 24793 + }, + { + "epoch": 7.610190300798036, + "grad_norm": 0.17218533158302307, + "learning_rate": 1.4247999592137979e-05, + "loss": 1.6692, + "step": 24794 + }, + { + "epoch": 7.610497237569061, + "grad_norm": 0.19105172157287598, + "learning_rate": 1.4244524932255027e-05, + "loss": 1.7421, + "step": 24795 + }, + { + "epoch": 7.610804174340086, + "grad_norm": 0.21565218269824982, + "learning_rate": 1.424105062572867e-05, + "loss": 1.7143, + "step": 24796 + }, + { + "epoch": 7.611111111111111, + "grad_norm": 0.17394152283668518, + "learning_rate": 1.4237576672593178e-05, + "loss": 1.7202, + "step": 24797 + }, + { + "epoch": 7.611418047882136, + "grad_norm": 0.18680404126644135, + "learning_rate": 1.4234103072882926e-05, + "loss": 1.7155, + "step": 24798 + }, + { + "epoch": 7.611724984653161, + "grad_norm": 0.16173312067985535, + "learning_rate": 
1.4230629826632237e-05, + "loss": 1.6549, + "step": 24799 + }, + { + "epoch": 7.612031921424187, + "grad_norm": 0.2055300772190094, + "learning_rate": 1.4227156933875423e-05, + "loss": 1.7382, + "step": 24800 + }, + { + "epoch": 7.612338858195212, + "grad_norm": 0.17331050336360931, + "learning_rate": 1.4223684394646813e-05, + "loss": 1.719, + "step": 24801 + }, + { + "epoch": 7.612645794966237, + "grad_norm": 0.23106786608695984, + "learning_rate": 1.4220212208980727e-05, + "loss": 1.7083, + "step": 24802 + }, + { + "epoch": 7.612952731737262, + "grad_norm": 0.21011751890182495, + "learning_rate": 1.4216740376911469e-05, + "loss": 1.7629, + "step": 24803 + }, + { + "epoch": 7.613259668508287, + "grad_norm": 0.15120279788970947, + "learning_rate": 1.4213268898473359e-05, + "loss": 1.673, + "step": 24804 + }, + { + "epoch": 7.6135666052793125, + "grad_norm": 0.17431862652301788, + "learning_rate": 1.4209797773700684e-05, + "loss": 1.672, + "step": 24805 + }, + { + "epoch": 7.613873542050338, + "grad_norm": 0.1592133790254593, + "learning_rate": 1.42063270026278e-05, + "loss": 1.7102, + "step": 24806 + }, + { + "epoch": 7.614180478821363, + "grad_norm": 0.22535641491413116, + "learning_rate": 1.4202856585288954e-05, + "loss": 1.7177, + "step": 24807 + }, + { + "epoch": 7.614487415592388, + "grad_norm": 0.2111314982175827, + "learning_rate": 1.4199386521718455e-05, + "loss": 1.7399, + "step": 24808 + }, + { + "epoch": 7.614794352363413, + "grad_norm": 0.18377532064914703, + "learning_rate": 1.419591681195061e-05, + "loss": 1.6713, + "step": 24809 + }, + { + "epoch": 7.615101289134438, + "grad_norm": 0.19743949174880981, + "learning_rate": 1.4192447456019681e-05, + "loss": 1.7761, + "step": 24810 + }, + { + "epoch": 7.615408225905464, + "grad_norm": 0.17827409505844116, + "learning_rate": 1.4188978453960006e-05, + "loss": 1.7091, + "step": 24811 + }, + { + "epoch": 7.615715162676489, + "grad_norm": 0.18304505944252014, + "learning_rate": 1.4185509805805802e-05, + 
"loss": 1.7496, + "step": 24812 + }, + { + "epoch": 7.616022099447514, + "grad_norm": 0.19510503113269806, + "learning_rate": 1.4182041511591415e-05, + "loss": 1.7436, + "step": 24813 + }, + { + "epoch": 7.616329036218539, + "grad_norm": 0.17127136886119843, + "learning_rate": 1.4178573571351056e-05, + "loss": 1.6598, + "step": 24814 + }, + { + "epoch": 7.616635972989564, + "grad_norm": 0.20133370161056519, + "learning_rate": 1.4175105985119041e-05, + "loss": 1.7802, + "step": 24815 + }, + { + "epoch": 7.616942909760589, + "grad_norm": 0.17706145346164703, + "learning_rate": 1.4171638752929634e-05, + "loss": 1.7105, + "step": 24816 + }, + { + "epoch": 7.617249846531615, + "grad_norm": 0.179647758603096, + "learning_rate": 1.4168171874817088e-05, + "loss": 1.732, + "step": 24817 + }, + { + "epoch": 7.617556783302639, + "grad_norm": 0.16380085051059723, + "learning_rate": 1.4164705350815665e-05, + "loss": 1.6671, + "step": 24818 + }, + { + "epoch": 7.6178637200736645, + "grad_norm": 0.19407404959201813, + "learning_rate": 1.4161239180959635e-05, + "loss": 1.7261, + "step": 24819 + }, + { + "epoch": 7.61817065684469, + "grad_norm": 0.1647375524044037, + "learning_rate": 1.415777336528324e-05, + "loss": 1.7438, + "step": 24820 + }, + { + "epoch": 7.618477593615715, + "grad_norm": 0.21532754600048065, + "learning_rate": 1.4154307903820735e-05, + "loss": 1.7674, + "step": 24821 + }, + { + "epoch": 7.6187845303867405, + "grad_norm": 0.1834939867258072, + "learning_rate": 1.4150842796606372e-05, + "loss": 1.7027, + "step": 24822 + }, + { + "epoch": 7.619091467157766, + "grad_norm": 0.15102218091487885, + "learning_rate": 1.4147378043674397e-05, + "loss": 1.6858, + "step": 24823 + }, + { + "epoch": 7.61939840392879, + "grad_norm": 0.161713644862175, + "learning_rate": 1.4143913645059038e-05, + "loss": 1.7149, + "step": 24824 + }, + { + "epoch": 7.619705340699816, + "grad_norm": 0.15568867325782776, + "learning_rate": 1.4140449600794547e-05, + "loss": 1.6642, + "step": 24825 
+ }, + { + "epoch": 7.620012277470841, + "grad_norm": 0.15993504226207733, + "learning_rate": 1.4136985910915147e-05, + "loss": 1.6497, + "step": 24826 + }, + { + "epoch": 7.620319214241866, + "grad_norm": 0.16981028020381927, + "learning_rate": 1.4133522575455055e-05, + "loss": 1.7347, + "step": 24827 + }, + { + "epoch": 7.620626151012892, + "grad_norm": 0.16143053770065308, + "learning_rate": 1.4130059594448547e-05, + "loss": 1.7166, + "step": 24828 + }, + { + "epoch": 7.620933087783916, + "grad_norm": 0.16914571821689606, + "learning_rate": 1.4126596967929789e-05, + "loss": 1.7008, + "step": 24829 + }, + { + "epoch": 7.621240024554941, + "grad_norm": 0.20040032267570496, + "learning_rate": 1.4123134695933049e-05, + "loss": 1.7099, + "step": 24830 + }, + { + "epoch": 7.621546961325967, + "grad_norm": 0.17086143791675568, + "learning_rate": 1.4119672778492493e-05, + "loss": 1.6913, + "step": 24831 + }, + { + "epoch": 7.621853898096992, + "grad_norm": 0.16268399357795715, + "learning_rate": 1.4116211215642378e-05, + "loss": 1.6919, + "step": 24832 + }, + { + "epoch": 7.622160834868017, + "grad_norm": 0.21211197972297668, + "learning_rate": 1.4112750007416891e-05, + "loss": 1.7493, + "step": 24833 + }, + { + "epoch": 7.622467771639043, + "grad_norm": 0.16767694056034088, + "learning_rate": 1.4109289153850247e-05, + "loss": 1.6863, + "step": 24834 + }, + { + "epoch": 7.622774708410067, + "grad_norm": 0.1769869178533554, + "learning_rate": 1.4105828654976639e-05, + "loss": 1.7303, + "step": 24835 + }, + { + "epoch": 7.6230816451810925, + "grad_norm": 0.2202748954296112, + "learning_rate": 1.4102368510830278e-05, + "loss": 1.7648, + "step": 24836 + }, + { + "epoch": 7.623388581952118, + "grad_norm": 0.18347454071044922, + "learning_rate": 1.4098908721445342e-05, + "loss": 1.7615, + "step": 24837 + }, + { + "epoch": 7.623695518723143, + "grad_norm": 0.17966698110103607, + "learning_rate": 1.4095449286856039e-05, + "loss": 1.7031, + "step": 24838 + }, + { + "epoch": 
7.6240024554941686, + "grad_norm": 0.1794397532939911, + "learning_rate": 1.409199020709655e-05, + "loss": 1.7129, + "step": 24839 + }, + { + "epoch": 7.624309392265193, + "grad_norm": 0.1838780641555786, + "learning_rate": 1.4088531482201056e-05, + "loss": 1.6936, + "step": 24840 + }, + { + "epoch": 7.624616329036218, + "grad_norm": 0.1940378099679947, + "learning_rate": 1.4085073112203745e-05, + "loss": 1.71, + "step": 24841 + }, + { + "epoch": 7.624923265807244, + "grad_norm": 0.17340345680713654, + "learning_rate": 1.4081615097138796e-05, + "loss": 1.711, + "step": 24842 + }, + { + "epoch": 7.625230202578269, + "grad_norm": 0.23193266987800598, + "learning_rate": 1.4078157437040374e-05, + "loss": 1.7366, + "step": 24843 + }, + { + "epoch": 7.625537139349294, + "grad_norm": 0.1742531955242157, + "learning_rate": 1.4074700131942653e-05, + "loss": 1.7179, + "step": 24844 + }, + { + "epoch": 7.62584407612032, + "grad_norm": 0.22453147172927856, + "learning_rate": 1.4071243181879806e-05, + "loss": 1.708, + "step": 24845 + }, + { + "epoch": 7.626151012891344, + "grad_norm": 0.16176854074001312, + "learning_rate": 1.4067786586885977e-05, + "loss": 1.7012, + "step": 24846 + }, + { + "epoch": 7.6264579496623695, + "grad_norm": 0.16796015202999115, + "learning_rate": 1.4064330346995369e-05, + "loss": 1.6918, + "step": 24847 + }, + { + "epoch": 7.626764886433395, + "grad_norm": 0.1737142950296402, + "learning_rate": 1.4060874462242085e-05, + "loss": 1.6908, + "step": 24848 + }, + { + "epoch": 7.62707182320442, + "grad_norm": 0.1697089523077011, + "learning_rate": 1.4057418932660315e-05, + "loss": 1.6811, + "step": 24849 + }, + { + "epoch": 7.627378759975445, + "grad_norm": 0.19860011339187622, + "learning_rate": 1.40539637582842e-05, + "loss": 1.7803, + "step": 24850 + }, + { + "epoch": 7.62768569674647, + "grad_norm": 0.16383512318134308, + "learning_rate": 1.4050508939147883e-05, + "loss": 1.7004, + "step": 24851 + }, + { + "epoch": 7.627992633517495, + "grad_norm": 
0.18878768384456635, + "learning_rate": 1.404705447528551e-05, + "loss": 1.6916, + "step": 24852 + }, + { + "epoch": 7.628299570288521, + "grad_norm": 0.1417449563741684, + "learning_rate": 1.4043600366731213e-05, + "loss": 1.6908, + "step": 24853 + }, + { + "epoch": 7.628606507059546, + "grad_norm": 0.19786077737808228, + "learning_rate": 1.4040146613519134e-05, + "loss": 1.7307, + "step": 24854 + }, + { + "epoch": 7.628913443830571, + "grad_norm": 0.17295710742473602, + "learning_rate": 1.40366932156834e-05, + "loss": 1.7111, + "step": 24855 + }, + { + "epoch": 7.629220380601596, + "grad_norm": 0.2160167098045349, + "learning_rate": 1.4033240173258144e-05, + "loss": 1.71, + "step": 24856 + }, + { + "epoch": 7.629527317372621, + "grad_norm": 0.1741226315498352, + "learning_rate": 1.402978748627749e-05, + "loss": 1.7024, + "step": 24857 + }, + { + "epoch": 7.629834254143646, + "grad_norm": 0.18043182790279388, + "learning_rate": 1.4026335154775561e-05, + "loss": 1.7046, + "step": 24858 + }, + { + "epoch": 7.630141190914672, + "grad_norm": 0.1592903584241867, + "learning_rate": 1.4022883178786472e-05, + "loss": 1.6913, + "step": 24859 + }, + { + "epoch": 7.630448127685697, + "grad_norm": 0.25504007935523987, + "learning_rate": 1.4019431558344337e-05, + "loss": 1.7221, + "step": 24860 + }, + { + "epoch": 7.6307550644567215, + "grad_norm": 0.15307627618312836, + "learning_rate": 1.4015980293483272e-05, + "loss": 1.6725, + "step": 24861 + }, + { + "epoch": 7.631062001227747, + "grad_norm": 0.2595232129096985, + "learning_rate": 1.4012529384237372e-05, + "loss": 1.7309, + "step": 24862 + }, + { + "epoch": 7.631368937998772, + "grad_norm": 0.19494156539440155, + "learning_rate": 1.4009078830640743e-05, + "loss": 1.737, + "step": 24863 + }, + { + "epoch": 7.6316758747697975, + "grad_norm": 0.19264118373394012, + "learning_rate": 1.4005628632727518e-05, + "loss": 1.7337, + "step": 24864 + }, + { + "epoch": 7.631982811540823, + "grad_norm": 0.18758688867092133, + 
"learning_rate": 1.400217879053174e-05, + "loss": 1.684, + "step": 24865 + }, + { + "epoch": 7.632289748311848, + "grad_norm": 0.17094476521015167, + "learning_rate": 1.399872930408756e-05, + "loss": 1.6724, + "step": 24866 + }, + { + "epoch": 7.632596685082873, + "grad_norm": 0.18967430293560028, + "learning_rate": 1.3995280173429003e-05, + "loss": 1.6852, + "step": 24867 + }, + { + "epoch": 7.632903621853898, + "grad_norm": 0.1686837375164032, + "learning_rate": 1.399183139859021e-05, + "loss": 1.6673, + "step": 24868 + }, + { + "epoch": 7.633210558624923, + "grad_norm": 0.19091126322746277, + "learning_rate": 1.398838297960524e-05, + "loss": 1.7423, + "step": 24869 + }, + { + "epoch": 7.633517495395949, + "grad_norm": 0.20197629928588867, + "learning_rate": 1.3984934916508186e-05, + "loss": 1.7217, + "step": 24870 + }, + { + "epoch": 7.633824432166974, + "grad_norm": 0.1490679830312729, + "learning_rate": 1.3981487209333105e-05, + "loss": 1.6367, + "step": 24871 + }, + { + "epoch": 7.634131368937998, + "grad_norm": 0.14664824306964874, + "learning_rate": 1.3978039858114084e-05, + "loss": 1.68, + "step": 24872 + }, + { + "epoch": 7.634438305709024, + "grad_norm": 0.19181138277053833, + "learning_rate": 1.3974592862885182e-05, + "loss": 1.766, + "step": 24873 + }, + { + "epoch": 7.634745242480049, + "grad_norm": 0.17716391384601593, + "learning_rate": 1.397114622368047e-05, + "loss": 1.7479, + "step": 24874 + }, + { + "epoch": 7.635052179251074, + "grad_norm": 0.16603589057922363, + "learning_rate": 1.3967699940534006e-05, + "loss": 1.6455, + "step": 24875 + }, + { + "epoch": 7.6353591160221, + "grad_norm": 0.19060885906219482, + "learning_rate": 1.3964254013479855e-05, + "loss": 1.7367, + "step": 24876 + }, + { + "epoch": 7.635666052793125, + "grad_norm": 0.18182092905044556, + "learning_rate": 1.3960808442552064e-05, + "loss": 1.7235, + "step": 24877 + }, + { + "epoch": 7.6359729895641495, + "grad_norm": 0.22578656673431396, + "learning_rate": 
1.3957363227784691e-05, + "loss": 1.7229, + "step": 24878 + }, + { + "epoch": 7.636279926335175, + "grad_norm": 0.25397053360939026, + "learning_rate": 1.3953918369211776e-05, + "loss": 1.7094, + "step": 24879 + }, + { + "epoch": 7.6365868631062, + "grad_norm": 0.164917454123497, + "learning_rate": 1.3950473866867353e-05, + "loss": 1.695, + "step": 24880 + }, + { + "epoch": 7.6368937998772255, + "grad_norm": 0.18737520277500153, + "learning_rate": 1.3947029720785503e-05, + "loss": 1.6719, + "step": 24881 + }, + { + "epoch": 7.637200736648251, + "grad_norm": 0.1839492917060852, + "learning_rate": 1.3943585931000213e-05, + "loss": 1.7136, + "step": 24882 + }, + { + "epoch": 7.637507673419275, + "grad_norm": 0.17182856798171997, + "learning_rate": 1.3940142497545566e-05, + "loss": 1.678, + "step": 24883 + }, + { + "epoch": 7.637814610190301, + "grad_norm": 0.20733827352523804, + "learning_rate": 1.393669942045554e-05, + "loss": 1.6398, + "step": 24884 + }, + { + "epoch": 7.638121546961326, + "grad_norm": 0.19326196610927582, + "learning_rate": 1.3933256699764196e-05, + "loss": 1.7351, + "step": 24885 + }, + { + "epoch": 7.638428483732351, + "grad_norm": 0.2368818074464798, + "learning_rate": 1.3929814335505552e-05, + "loss": 1.7567, + "step": 24886 + }, + { + "epoch": 7.638735420503377, + "grad_norm": 0.16702532768249512, + "learning_rate": 1.3926372327713626e-05, + "loss": 1.6791, + "step": 24887 + }, + { + "epoch": 7.639042357274402, + "grad_norm": 0.18634511530399323, + "learning_rate": 1.3922930676422435e-05, + "loss": 1.691, + "step": 24888 + }, + { + "epoch": 7.639349294045426, + "grad_norm": 0.19349521398544312, + "learning_rate": 1.3919489381665985e-05, + "loss": 1.7037, + "step": 24889 + }, + { + "epoch": 7.639656230816452, + "grad_norm": 0.16760465502738953, + "learning_rate": 1.3916048443478286e-05, + "loss": 1.6871, + "step": 24890 + }, + { + "epoch": 7.639963167587477, + "grad_norm": 0.25489017367362976, + "learning_rate": 1.3912607861893351e-05, + 
"loss": 1.6914, + "step": 24891 + }, + { + "epoch": 7.640270104358502, + "grad_norm": 0.17488406598567963, + "learning_rate": 1.390916763694517e-05, + "loss": 1.6826, + "step": 24892 + }, + { + "epoch": 7.640577041129527, + "grad_norm": 0.2128411829471588, + "learning_rate": 1.3905727768667753e-05, + "loss": 1.711, + "step": 24893 + }, + { + "epoch": 7.640883977900552, + "grad_norm": 0.17478415369987488, + "learning_rate": 1.3902288257095087e-05, + "loss": 1.7174, + "step": 24894 + }, + { + "epoch": 7.6411909146715775, + "grad_norm": 0.20493042469024658, + "learning_rate": 1.3898849102261168e-05, + "loss": 1.7649, + "step": 24895 + }, + { + "epoch": 7.641497851442603, + "grad_norm": 0.16712170839309692, + "learning_rate": 1.3895410304199979e-05, + "loss": 1.6785, + "step": 24896 + }, + { + "epoch": 7.641804788213628, + "grad_norm": 0.18580594658851624, + "learning_rate": 1.3891971862945497e-05, + "loss": 1.7001, + "step": 24897 + }, + { + "epoch": 7.6421117249846535, + "grad_norm": 0.19040817022323608, + "learning_rate": 1.3888533778531737e-05, + "loss": 1.709, + "step": 24898 + }, + { + "epoch": 7.642418661755678, + "grad_norm": 0.17573465406894684, + "learning_rate": 1.3885096050992624e-05, + "loss": 1.7205, + "step": 24899 + }, + { + "epoch": 7.642725598526703, + "grad_norm": 0.19123490154743195, + "learning_rate": 1.3881658680362186e-05, + "loss": 1.6882, + "step": 24900 + }, + { + "epoch": 7.643032535297729, + "grad_norm": 0.18465565145015717, + "learning_rate": 1.387822166667434e-05, + "loss": 1.7294, + "step": 24901 + }, + { + "epoch": 7.643339472068754, + "grad_norm": 0.17927341163158417, + "learning_rate": 1.3874785009963098e-05, + "loss": 1.7625, + "step": 24902 + }, + { + "epoch": 7.643646408839779, + "grad_norm": 0.15983298420906067, + "learning_rate": 1.38713487102624e-05, + "loss": 1.6939, + "step": 24903 + }, + { + "epoch": 7.643953345610804, + "grad_norm": 0.20288127660751343, + "learning_rate": 1.3867912767606211e-05, + "loss": 1.7461, + "step": 
24904 + }, + { + "epoch": 7.644260282381829, + "grad_norm": 0.18587160110473633, + "learning_rate": 1.3864477182028484e-05, + "loss": 1.7389, + "step": 24905 + }, + { + "epoch": 7.644567219152854, + "grad_norm": 0.17089903354644775, + "learning_rate": 1.3861041953563175e-05, + "loss": 1.6697, + "step": 24906 + }, + { + "epoch": 7.64487415592388, + "grad_norm": 0.20302993059158325, + "learning_rate": 1.3857607082244228e-05, + "loss": 1.7199, + "step": 24907 + }, + { + "epoch": 7.645181092694905, + "grad_norm": 0.14781002700328827, + "learning_rate": 1.3854172568105594e-05, + "loss": 1.687, + "step": 24908 + }, + { + "epoch": 7.64548802946593, + "grad_norm": 0.17847368121147156, + "learning_rate": 1.3850738411181214e-05, + "loss": 1.6511, + "step": 24909 + }, + { + "epoch": 7.645794966236955, + "grad_norm": 0.1448936015367508, + "learning_rate": 1.3847304611505019e-05, + "loss": 1.6601, + "step": 24910 + }, + { + "epoch": 7.64610190300798, + "grad_norm": 0.19413447380065918, + "learning_rate": 1.3843871169110955e-05, + "loss": 1.6901, + "step": 24911 + }, + { + "epoch": 7.6464088397790055, + "grad_norm": 0.18118292093276978, + "learning_rate": 1.3840438084032947e-05, + "loss": 1.7574, + "step": 24912 + }, + { + "epoch": 7.646715776550031, + "grad_norm": 0.16136041283607483, + "learning_rate": 1.3837005356304921e-05, + "loss": 1.6826, + "step": 24913 + }, + { + "epoch": 7.647022713321056, + "grad_norm": 0.1773926019668579, + "learning_rate": 1.3833572985960792e-05, + "loss": 1.7136, + "step": 24914 + }, + { + "epoch": 7.647329650092081, + "grad_norm": 0.15100078284740448, + "learning_rate": 1.3830140973034522e-05, + "loss": 1.7331, + "step": 24915 + }, + { + "epoch": 7.647636586863106, + "grad_norm": 0.16588352620601654, + "learning_rate": 1.3826709317559966e-05, + "loss": 1.6883, + "step": 24916 + }, + { + "epoch": 7.647943523634131, + "grad_norm": 0.14271478354930878, + "learning_rate": 1.3823278019571106e-05, + "loss": 1.6566, + "step": 24917 + }, + { + "epoch": 
7.648250460405157, + "grad_norm": 0.18383146822452545, + "learning_rate": 1.3819847079101782e-05, + "loss": 1.7006, + "step": 24918 + }, + { + "epoch": 7.648557397176182, + "grad_norm": 0.20069970190525055, + "learning_rate": 1.3816416496185952e-05, + "loss": 1.696, + "step": 24919 + }, + { + "epoch": 7.648864333947207, + "grad_norm": 0.15686273574829102, + "learning_rate": 1.3812986270857497e-05, + "loss": 1.6998, + "step": 24920 + }, + { + "epoch": 7.649171270718232, + "grad_norm": 0.14733602106571198, + "learning_rate": 1.3809556403150326e-05, + "loss": 1.6692, + "step": 24921 + }, + { + "epoch": 7.649478207489257, + "grad_norm": 0.16720153391361237, + "learning_rate": 1.3806126893098332e-05, + "loss": 1.6841, + "step": 24922 + }, + { + "epoch": 7.649785144260282, + "grad_norm": 0.1548861712217331, + "learning_rate": 1.3802697740735404e-05, + "loss": 1.6914, + "step": 24923 + }, + { + "epoch": 7.650092081031308, + "grad_norm": 0.1591617912054062, + "learning_rate": 1.3799268946095433e-05, + "loss": 1.7121, + "step": 24924 + }, + { + "epoch": 7.650399017802332, + "grad_norm": 0.19735665619373322, + "learning_rate": 1.3795840509212305e-05, + "loss": 1.741, + "step": 24925 + }, + { + "epoch": 7.650705954573358, + "grad_norm": 0.16886921226978302, + "learning_rate": 1.37924124301199e-05, + "loss": 1.7166, + "step": 24926 + }, + { + "epoch": 7.651012891344383, + "grad_norm": 0.2084806114435196, + "learning_rate": 1.3788984708852098e-05, + "loss": 1.7525, + "step": 24927 + }, + { + "epoch": 7.651319828115408, + "grad_norm": 0.15286533534526825, + "learning_rate": 1.3785557345442773e-05, + "loss": 1.6754, + "step": 24928 + }, + { + "epoch": 7.651626764886434, + "grad_norm": 0.19647163152694702, + "learning_rate": 1.3782130339925792e-05, + "loss": 1.7114, + "step": 24929 + }, + { + "epoch": 7.651933701657459, + "grad_norm": 0.18526645004749298, + "learning_rate": 1.3778703692335031e-05, + "loss": 1.7258, + "step": 24930 + }, + { + "epoch": 7.652240638428484, + 
"grad_norm": 0.19880451261997223, + "learning_rate": 1.3775277402704334e-05, + "loss": 1.7065, + "step": 24931 + }, + { + "epoch": 7.652547575199509, + "grad_norm": 0.18702107667922974, + "learning_rate": 1.377185147106761e-05, + "loss": 1.7171, + "step": 24932 + }, + { + "epoch": 7.652854511970534, + "grad_norm": 0.1455291509628296, + "learning_rate": 1.3768425897458654e-05, + "loss": 1.6824, + "step": 24933 + }, + { + "epoch": 7.653161448741559, + "grad_norm": 0.16770213842391968, + "learning_rate": 1.3765000681911377e-05, + "loss": 1.6544, + "step": 24934 + }, + { + "epoch": 7.653468385512585, + "grad_norm": 0.18496285378932953, + "learning_rate": 1.3761575824459572e-05, + "loss": 1.7206, + "step": 24935 + }, + { + "epoch": 7.653775322283609, + "grad_norm": 0.1832813024520874, + "learning_rate": 1.3758151325137131e-05, + "loss": 1.7673, + "step": 24936 + }, + { + "epoch": 7.6540822590546345, + "grad_norm": 0.20916350185871124, + "learning_rate": 1.3754727183977878e-05, + "loss": 1.7224, + "step": 24937 + }, + { + "epoch": 7.65438919582566, + "grad_norm": 0.1878765970468521, + "learning_rate": 1.3751303401015653e-05, + "loss": 1.6966, + "step": 24938 + }, + { + "epoch": 7.654696132596685, + "grad_norm": 0.17944355309009552, + "learning_rate": 1.37478799762843e-05, + "loss": 1.6752, + "step": 24939 + }, + { + "epoch": 7.6550030693677105, + "grad_norm": 0.20930083096027374, + "learning_rate": 1.3744456909817638e-05, + "loss": 1.7632, + "step": 24940 + }, + { + "epoch": 7.655310006138736, + "grad_norm": 0.19838237762451172, + "learning_rate": 1.3741034201649511e-05, + "loss": 1.7039, + "step": 24941 + }, + { + "epoch": 7.65561694290976, + "grad_norm": 0.233023539185524, + "learning_rate": 1.373761185181373e-05, + "loss": 1.7117, + "step": 24942 + }, + { + "epoch": 7.655923879680786, + "grad_norm": 0.16270874440670013, + "learning_rate": 1.3734189860344127e-05, + "loss": 1.6603, + "step": 24943 + }, + { + "epoch": 7.656230816451811, + "grad_norm": 
0.18456563353538513, + "learning_rate": 1.373076822727451e-05, + "loss": 1.6891, + "step": 24944 + }, + { + "epoch": 7.656537753222836, + "grad_norm": 0.17064985632896423, + "learning_rate": 1.3727346952638703e-05, + "loss": 1.6788, + "step": 24945 + }, + { + "epoch": 7.656844689993862, + "grad_norm": 0.17548689246177673, + "learning_rate": 1.3723926036470513e-05, + "loss": 1.6699, + "step": 24946 + }, + { + "epoch": 7.657151626764886, + "grad_norm": 0.1660275012254715, + "learning_rate": 1.3720505478803753e-05, + "loss": 1.6706, + "step": 24947 + }, + { + "epoch": 7.657458563535911, + "grad_norm": 0.2977990508079529, + "learning_rate": 1.3717085279672199e-05, + "loss": 1.7463, + "step": 24948 + }, + { + "epoch": 7.657765500306937, + "grad_norm": 0.24440810084342957, + "learning_rate": 1.3713665439109708e-05, + "loss": 1.7528, + "step": 24949 + }, + { + "epoch": 7.658072437077962, + "grad_norm": 0.1579941064119339, + "learning_rate": 1.3710245957150015e-05, + "loss": 1.6902, + "step": 24950 + }, + { + "epoch": 7.658379373848987, + "grad_norm": 0.197731152176857, + "learning_rate": 1.3706826833826968e-05, + "loss": 1.7377, + "step": 24951 + }, + { + "epoch": 7.658686310620013, + "grad_norm": 0.16704770922660828, + "learning_rate": 1.3703408069174301e-05, + "loss": 1.7057, + "step": 24952 + }, + { + "epoch": 7.658993247391037, + "grad_norm": 0.2167888730764389, + "learning_rate": 1.3699989663225848e-05, + "loss": 1.7668, + "step": 24953 + }, + { + "epoch": 7.6593001841620625, + "grad_norm": 0.16870343685150146, + "learning_rate": 1.369657161601537e-05, + "loss": 1.6781, + "step": 24954 + }, + { + "epoch": 7.659607120933088, + "grad_norm": 0.22422032058238983, + "learning_rate": 1.3693153927576646e-05, + "loss": 1.7034, + "step": 24955 + }, + { + "epoch": 7.659914057704113, + "grad_norm": 0.20777738094329834, + "learning_rate": 1.3689736597943465e-05, + "loss": 1.7401, + "step": 24956 + }, + { + "epoch": 7.6602209944751385, + "grad_norm": 0.17802980542182922, + 
"learning_rate": 1.3686319627149579e-05, + "loss": 1.7067, + "step": 24957 + }, + { + "epoch": 7.660527931246163, + "grad_norm": 0.21444065868854523, + "learning_rate": 1.368290301522877e-05, + "loss": 1.6731, + "step": 24958 + }, + { + "epoch": 7.660834868017188, + "grad_norm": 0.17638131976127625, + "learning_rate": 1.3679486762214805e-05, + "loss": 1.738, + "step": 24959 + }, + { + "epoch": 7.661141804788214, + "grad_norm": 0.1900044083595276, + "learning_rate": 1.3676070868141432e-05, + "loss": 1.7673, + "step": 24960 + }, + { + "epoch": 7.661448741559239, + "grad_norm": 0.20749469101428986, + "learning_rate": 1.3672655333042422e-05, + "loss": 1.7341, + "step": 24961 + }, + { + "epoch": 7.661755678330264, + "grad_norm": 0.21292604506015778, + "learning_rate": 1.3669240156951518e-05, + "loss": 1.7114, + "step": 24962 + }, + { + "epoch": 7.66206261510129, + "grad_norm": 0.21506401896476746, + "learning_rate": 1.3665825339902482e-05, + "loss": 1.7412, + "step": 24963 + }, + { + "epoch": 7.662369551872314, + "grad_norm": 0.21838976442813873, + "learning_rate": 1.3662410881929055e-05, + "loss": 1.7178, + "step": 24964 + }, + { + "epoch": 7.662676488643339, + "grad_norm": 0.18973253667354584, + "learning_rate": 1.365899678306497e-05, + "loss": 1.7161, + "step": 24965 + }, + { + "epoch": 7.662983425414365, + "grad_norm": 0.19278603792190552, + "learning_rate": 1.3655583043344006e-05, + "loss": 1.6952, + "step": 24966 + }, + { + "epoch": 7.66329036218539, + "grad_norm": 0.2025471180677414, + "learning_rate": 1.365216966279984e-05, + "loss": 1.6893, + "step": 24967 + }, + { + "epoch": 7.6635972989564145, + "grad_norm": 0.14461325109004974, + "learning_rate": 1.364875664146627e-05, + "loss": 1.6762, + "step": 24968 + }, + { + "epoch": 7.66390423572744, + "grad_norm": 0.22851425409317017, + "learning_rate": 1.3645343979376962e-05, + "loss": 1.7743, + "step": 24969 + }, + { + "epoch": 7.664211172498465, + "grad_norm": 0.16862350702285767, + "learning_rate": 
1.3641931676565688e-05, + "loss": 1.6385, + "step": 24970 + }, + { + "epoch": 7.6645181092694905, + "grad_norm": 0.20482461154460907, + "learning_rate": 1.3638519733066157e-05, + "loss": 1.7824, + "step": 24971 + }, + { + "epoch": 7.664825046040516, + "grad_norm": 0.18505734205245972, + "learning_rate": 1.3635108148912085e-05, + "loss": 1.6845, + "step": 24972 + }, + { + "epoch": 7.665131982811541, + "grad_norm": 0.18774990737438202, + "learning_rate": 1.3631696924137189e-05, + "loss": 1.7091, + "step": 24973 + }, + { + "epoch": 7.665438919582566, + "grad_norm": 0.1967296153306961, + "learning_rate": 1.362828605877518e-05, + "loss": 1.6953, + "step": 24974 + }, + { + "epoch": 7.665745856353591, + "grad_norm": 0.16951262950897217, + "learning_rate": 1.3624875552859767e-05, + "loss": 1.7302, + "step": 24975 + }, + { + "epoch": 7.666052793124616, + "grad_norm": 0.21003109216690063, + "learning_rate": 1.3621465406424656e-05, + "loss": 1.7567, + "step": 24976 + }, + { + "epoch": 7.666359729895642, + "grad_norm": 0.19087877869606018, + "learning_rate": 1.361805561950354e-05, + "loss": 1.7373, + "step": 24977 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 0.17799946665763855, + "learning_rate": 1.3614646192130126e-05, + "loss": 1.7121, + "step": 24978 + }, + { + "epoch": 7.666973603437691, + "grad_norm": 0.15956062078475952, + "learning_rate": 1.3611237124338105e-05, + "loss": 1.6654, + "step": 24979 + }, + { + "epoch": 7.667280540208717, + "grad_norm": 0.1963697075843811, + "learning_rate": 1.3607828416161167e-05, + "loss": 1.7902, + "step": 24980 + }, + { + "epoch": 7.667587476979742, + "grad_norm": 0.22204460203647614, + "learning_rate": 1.3604420067632995e-05, + "loss": 1.8199, + "step": 24981 + }, + { + "epoch": 7.667894413750767, + "grad_norm": 0.20523740351200104, + "learning_rate": 1.3601012078787268e-05, + "loss": 1.7253, + "step": 24982 + }, + { + "epoch": 7.668201350521793, + "grad_norm": 0.18693773448467255, + "learning_rate": 1.3597604449657697e-05, + 
"loss": 1.7032, + "step": 24983 + }, + { + "epoch": 7.668508287292818, + "grad_norm": 0.17661312222480774, + "learning_rate": 1.3594197180277906e-05, + "loss": 1.6648, + "step": 24984 + }, + { + "epoch": 7.6688152240638425, + "grad_norm": 0.19099490344524384, + "learning_rate": 1.3590790270681631e-05, + "loss": 1.7107, + "step": 24985 + }, + { + "epoch": 7.669122160834868, + "grad_norm": 0.1854488104581833, + "learning_rate": 1.3587383720902469e-05, + "loss": 1.7241, + "step": 24986 + }, + { + "epoch": 7.669429097605893, + "grad_norm": 0.18763068318367004, + "learning_rate": 1.3583977530974146e-05, + "loss": 1.7207, + "step": 24987 + }, + { + "epoch": 7.6697360343769185, + "grad_norm": 0.15608854591846466, + "learning_rate": 1.3580571700930295e-05, + "loss": 1.6835, + "step": 24988 + }, + { + "epoch": 7.670042971147944, + "grad_norm": 0.1587948501110077, + "learning_rate": 1.3577166230804584e-05, + "loss": 1.6801, + "step": 24989 + }, + { + "epoch": 7.670349907918968, + "grad_norm": 0.21106089651584625, + "learning_rate": 1.3573761120630668e-05, + "loss": 1.7411, + "step": 24990 + }, + { + "epoch": 7.670656844689994, + "grad_norm": 0.17361705005168915, + "learning_rate": 1.3570356370442188e-05, + "loss": 1.7123, + "step": 24991 + }, + { + "epoch": 7.670963781461019, + "grad_norm": 0.16272610425949097, + "learning_rate": 1.3566951980272802e-05, + "loss": 1.7002, + "step": 24992 + }, + { + "epoch": 7.671270718232044, + "grad_norm": 0.18787643313407898, + "learning_rate": 1.3563547950156147e-05, + "loss": 1.7364, + "step": 24993 + }, + { + "epoch": 7.67157765500307, + "grad_norm": 0.18257403373718262, + "learning_rate": 1.3560144280125869e-05, + "loss": 1.6783, + "step": 24994 + }, + { + "epoch": 7.671884591774095, + "grad_norm": 0.21298269927501678, + "learning_rate": 1.3556740970215608e-05, + "loss": 1.815, + "step": 24995 + }, + { + "epoch": 7.672191528545119, + "grad_norm": 0.1805877983570099, + "learning_rate": 1.3553338020458988e-05, + "loss": 1.719, + "step": 
24996 + }, + { + "epoch": 7.672498465316145, + "grad_norm": 0.210116446018219, + "learning_rate": 1.3549935430889643e-05, + "loss": 1.7603, + "step": 24997 + }, + { + "epoch": 7.67280540208717, + "grad_norm": 0.18893682956695557, + "learning_rate": 1.35465332015412e-05, + "loss": 1.6681, + "step": 24998 + }, + { + "epoch": 7.673112338858195, + "grad_norm": 0.17718489468097687, + "learning_rate": 1.354313133244729e-05, + "loss": 1.6799, + "step": 24999 + }, + { + "epoch": 7.67341927562922, + "grad_norm": 0.20092631876468658, + "learning_rate": 1.3539729823641517e-05, + "loss": 1.7273, + "step": 25000 + }, + { + "epoch": 7.673726212400245, + "grad_norm": 0.20800542831420898, + "learning_rate": 1.353632867515749e-05, + "loss": 1.7214, + "step": 25001 + }, + { + "epoch": 7.6740331491712706, + "grad_norm": 0.2119656354188919, + "learning_rate": 1.3532927887028861e-05, + "loss": 1.6701, + "step": 25002 + }, + { + "epoch": 7.674340085942296, + "grad_norm": 0.1645115315914154, + "learning_rate": 1.3529527459289188e-05, + "loss": 1.7199, + "step": 25003 + }, + { + "epoch": 7.674647022713321, + "grad_norm": 0.24434153735637665, + "learning_rate": 1.3526127391972116e-05, + "loss": 1.7295, + "step": 25004 + }, + { + "epoch": 7.6749539594843466, + "grad_norm": 0.20978261530399323, + "learning_rate": 1.3522727685111231e-05, + "loss": 1.8069, + "step": 25005 + }, + { + "epoch": 7.675260896255372, + "grad_norm": 0.19354932010173798, + "learning_rate": 1.3519328338740128e-05, + "loss": 1.7601, + "step": 25006 + }, + { + "epoch": 7.675567833026396, + "grad_norm": 0.19636447727680206, + "learning_rate": 1.3515929352892403e-05, + "loss": 1.7871, + "step": 25007 + }, + { + "epoch": 7.675874769797422, + "grad_norm": 0.18915504217147827, + "learning_rate": 1.3512530727601653e-05, + "loss": 1.6926, + "step": 25008 + }, + { + "epoch": 7.676181706568447, + "grad_norm": 0.18168985843658447, + "learning_rate": 1.3509132462901458e-05, + "loss": 1.7272, + "step": 25009 + }, + { + "epoch": 
7.676488643339472, + "grad_norm": 0.17246222496032715, + "learning_rate": 1.3505734558825406e-05, + "loss": 1.7186, + "step": 25010 + }, + { + "epoch": 7.676795580110497, + "grad_norm": 0.2694617211818695, + "learning_rate": 1.3502337015407074e-05, + "loss": 1.8334, + "step": 25011 + }, + { + "epoch": 7.677102516881522, + "grad_norm": 0.1549377590417862, + "learning_rate": 1.3498939832680035e-05, + "loss": 1.7003, + "step": 25012 + }, + { + "epoch": 7.6774094536525475, + "grad_norm": 0.1559179425239563, + "learning_rate": 1.349554301067787e-05, + "loss": 1.7028, + "step": 25013 + }, + { + "epoch": 7.677716390423573, + "grad_norm": 0.17349909245967865, + "learning_rate": 1.3492146549434149e-05, + "loss": 1.6749, + "step": 25014 + }, + { + "epoch": 7.678023327194598, + "grad_norm": 0.19697749614715576, + "learning_rate": 1.348875044898243e-05, + "loss": 1.8291, + "step": 25015 + }, + { + "epoch": 7.6783302639656235, + "grad_norm": 0.17260968685150146, + "learning_rate": 1.3485354709356279e-05, + "loss": 1.6686, + "step": 25016 + }, + { + "epoch": 7.678637200736648, + "grad_norm": 0.16892582178115845, + "learning_rate": 1.3481959330589255e-05, + "loss": 1.755, + "step": 25017 + }, + { + "epoch": 7.678944137507673, + "grad_norm": 0.17961645126342773, + "learning_rate": 1.3478564312714898e-05, + "loss": 1.6937, + "step": 25018 + }, + { + "epoch": 7.679251074278699, + "grad_norm": 0.20795513689517975, + "learning_rate": 1.34751696557668e-05, + "loss": 1.799, + "step": 25019 + }, + { + "epoch": 7.679558011049724, + "grad_norm": 0.16439545154571533, + "learning_rate": 1.3471775359778461e-05, + "loss": 1.6942, + "step": 25020 + }, + { + "epoch": 7.679864947820749, + "grad_norm": 0.19526144862174988, + "learning_rate": 1.3468381424783472e-05, + "loss": 1.7255, + "step": 25021 + }, + { + "epoch": 7.680171884591774, + "grad_norm": 0.18183457851409912, + "learning_rate": 1.3464987850815319e-05, + "loss": 1.7027, + "step": 25022 + }, + { + "epoch": 7.680478821362799, + 
"grad_norm": 0.18443404138088226, + "learning_rate": 1.3461594637907587e-05, + "loss": 1.6973, + "step": 25023 + }, + { + "epoch": 7.680785758133824, + "grad_norm": 0.18545331060886383, + "learning_rate": 1.3458201786093794e-05, + "loss": 1.7479, + "step": 25024 + }, + { + "epoch": 7.68109269490485, + "grad_norm": 0.18329958617687225, + "learning_rate": 1.3454809295407467e-05, + "loss": 1.7301, + "step": 25025 + }, + { + "epoch": 7.681399631675875, + "grad_norm": 0.19131959974765778, + "learning_rate": 1.3451417165882136e-05, + "loss": 1.7402, + "step": 25026 + }, + { + "epoch": 7.6817065684469, + "grad_norm": 0.1782912164926529, + "learning_rate": 1.3448025397551323e-05, + "loss": 1.6771, + "step": 25027 + }, + { + "epoch": 7.682013505217925, + "grad_norm": 0.1757265031337738, + "learning_rate": 1.3444633990448546e-05, + "loss": 1.7336, + "step": 25028 + }, + { + "epoch": 7.68232044198895, + "grad_norm": 0.16550128161907196, + "learning_rate": 1.3441242944607318e-05, + "loss": 1.6335, + "step": 25029 + }, + { + "epoch": 7.6826273787599755, + "grad_norm": 0.18069832026958466, + "learning_rate": 1.3437852260061162e-05, + "loss": 1.7172, + "step": 25030 + }, + { + "epoch": 7.682934315531001, + "grad_norm": 0.21195535361766815, + "learning_rate": 1.3434461936843573e-05, + "loss": 1.7248, + "step": 25031 + }, + { + "epoch": 7.683241252302026, + "grad_norm": 0.17209839820861816, + "learning_rate": 1.3431071974988068e-05, + "loss": 1.666, + "step": 25032 + }, + { + "epoch": 7.683548189073051, + "grad_norm": 0.20565249025821686, + "learning_rate": 1.342768237452814e-05, + "loss": 1.7839, + "step": 25033 + }, + { + "epoch": 7.683855125844076, + "grad_norm": 0.2549617290496826, + "learning_rate": 1.342429313549729e-05, + "loss": 1.714, + "step": 25034 + }, + { + "epoch": 7.684162062615101, + "grad_norm": 0.1980191171169281, + "learning_rate": 1.3420904257929001e-05, + "loss": 1.7267, + "step": 25035 + }, + { + "epoch": 7.684468999386127, + "grad_norm": 0.1763298362493515, + 
"learning_rate": 1.3417515741856806e-05, + "loss": 1.6754, + "step": 25036 + }, + { + "epoch": 7.684775936157152, + "grad_norm": 0.15831413865089417, + "learning_rate": 1.341412758731413e-05, + "loss": 1.6885, + "step": 25037 + }, + { + "epoch": 7.685082872928177, + "grad_norm": 0.15696564316749573, + "learning_rate": 1.341073979433452e-05, + "loss": 1.7032, + "step": 25038 + }, + { + "epoch": 7.685389809699202, + "grad_norm": 0.19193214178085327, + "learning_rate": 1.3407352362951392e-05, + "loss": 1.7708, + "step": 25039 + }, + { + "epoch": 7.685696746470227, + "grad_norm": 0.1886630803346634, + "learning_rate": 1.3403965293198273e-05, + "loss": 1.7323, + "step": 25040 + }, + { + "epoch": 7.686003683241252, + "grad_norm": 0.16137991845607758, + "learning_rate": 1.340057858510862e-05, + "loss": 1.703, + "step": 25041 + }, + { + "epoch": 7.686310620012278, + "grad_norm": 0.21111373603343964, + "learning_rate": 1.33971922387159e-05, + "loss": 1.7428, + "step": 25042 + }, + { + "epoch": 7.686617556783302, + "grad_norm": 0.20256482064723969, + "learning_rate": 1.3393806254053582e-05, + "loss": 1.7651, + "step": 25043 + }, + { + "epoch": 7.6869244935543275, + "grad_norm": 0.19125118851661682, + "learning_rate": 1.3390420631155121e-05, + "loss": 1.7253, + "step": 25044 + }, + { + "epoch": 7.687231430325353, + "grad_norm": 0.22446562349796295, + "learning_rate": 1.3387035370053985e-05, + "loss": 1.7363, + "step": 25045 + }, + { + "epoch": 7.687538367096378, + "grad_norm": 0.17356424033641815, + "learning_rate": 1.3383650470783621e-05, + "loss": 1.7384, + "step": 25046 + }, + { + "epoch": 7.6878453038674035, + "grad_norm": 0.27287909388542175, + "learning_rate": 1.3380265933377489e-05, + "loss": 1.6754, + "step": 25047 + }, + { + "epoch": 7.688152240638429, + "grad_norm": 0.14978452026844025, + "learning_rate": 1.3376881757869032e-05, + "loss": 1.6693, + "step": 25048 + }, + { + "epoch": 7.688459177409453, + "grad_norm": 0.1746874898672104, + "learning_rate": 
1.3373497944291691e-05, + "loss": 1.6878, + "step": 25049 + }, + { + "epoch": 7.688766114180479, + "grad_norm": 0.18032371997833252, + "learning_rate": 1.3370114492678915e-05, + "loss": 1.7153, + "step": 25050 + }, + { + "epoch": 7.689073050951504, + "grad_norm": 0.23111680150032043, + "learning_rate": 1.3366731403064131e-05, + "loss": 1.7132, + "step": 25051 + }, + { + "epoch": 7.689379987722529, + "grad_norm": 0.1587868630886078, + "learning_rate": 1.3363348675480768e-05, + "loss": 1.6692, + "step": 25052 + }, + { + "epoch": 7.689686924493555, + "grad_norm": 0.14336444437503815, + "learning_rate": 1.3359966309962301e-05, + "loss": 1.6648, + "step": 25053 + }, + { + "epoch": 7.689993861264579, + "grad_norm": 0.3048984408378601, + "learning_rate": 1.3356584306542086e-05, + "loss": 1.8109, + "step": 25054 + }, + { + "epoch": 7.690300798035604, + "grad_norm": 0.19389018416404724, + "learning_rate": 1.3353202665253617e-05, + "loss": 1.6725, + "step": 25055 + }, + { + "epoch": 7.69060773480663, + "grad_norm": 0.19246982038021088, + "learning_rate": 1.3349821386130246e-05, + "loss": 1.726, + "step": 25056 + }, + { + "epoch": 7.690914671577655, + "grad_norm": 0.19062727689743042, + "learning_rate": 1.3346440469205435e-05, + "loss": 1.7685, + "step": 25057 + }, + { + "epoch": 7.69122160834868, + "grad_norm": 0.16987577080726624, + "learning_rate": 1.3343059914512585e-05, + "loss": 1.7032, + "step": 25058 + }, + { + "epoch": 7.691528545119706, + "grad_norm": 0.17328599095344543, + "learning_rate": 1.3339679722085103e-05, + "loss": 1.7271, + "step": 25059 + }, + { + "epoch": 7.69183548189073, + "grad_norm": 0.2677443325519562, + "learning_rate": 1.3336299891956405e-05, + "loss": 1.8, + "step": 25060 + }, + { + "epoch": 7.6921424186617555, + "grad_norm": 0.18369975686073303, + "learning_rate": 1.333292042415985e-05, + "loss": 1.7483, + "step": 25061 + }, + { + "epoch": 7.692449355432781, + "grad_norm": 0.17269635200500488, + "learning_rate": 1.3329541318728883e-05, + "loss": 
1.7016, + "step": 25062 + }, + { + "epoch": 7.692756292203806, + "grad_norm": 0.17280563712120056, + "learning_rate": 1.3326162575696889e-05, + "loss": 1.742, + "step": 25063 + }, + { + "epoch": 7.6930632289748315, + "grad_norm": 0.2000025361776352, + "learning_rate": 1.3322784195097243e-05, + "loss": 1.6947, + "step": 25064 + }, + { + "epoch": 7.693370165745856, + "grad_norm": 0.17853626608848572, + "learning_rate": 1.3319406176963344e-05, + "loss": 1.7075, + "step": 25065 + }, + { + "epoch": 7.693677102516881, + "grad_norm": 0.18445543944835663, + "learning_rate": 1.3316028521328571e-05, + "loss": 1.7138, + "step": 25066 + }, + { + "epoch": 7.693984039287907, + "grad_norm": 0.1965894103050232, + "learning_rate": 1.3312651228226302e-05, + "loss": 1.6904, + "step": 25067 + }, + { + "epoch": 7.694290976058932, + "grad_norm": 0.1890837699174881, + "learning_rate": 1.3309274297689923e-05, + "loss": 1.7307, + "step": 25068 + }, + { + "epoch": 7.694597912829957, + "grad_norm": 0.2157326638698578, + "learning_rate": 1.3305897729752787e-05, + "loss": 1.7466, + "step": 25069 + }, + { + "epoch": 7.694904849600983, + "grad_norm": 0.19773493707180023, + "learning_rate": 1.3302521524448302e-05, + "loss": 1.7265, + "step": 25070 + }, + { + "epoch": 7.695211786372007, + "grad_norm": 0.16688357293605804, + "learning_rate": 1.3299145681809776e-05, + "loss": 1.7049, + "step": 25071 + }, + { + "epoch": 7.695518723143032, + "grad_norm": 0.24347764253616333, + "learning_rate": 1.3295770201870639e-05, + "loss": 1.7706, + "step": 25072 + }, + { + "epoch": 7.695825659914058, + "grad_norm": 0.16198144853115082, + "learning_rate": 1.3292395084664183e-05, + "loss": 1.6873, + "step": 25073 + }, + { + "epoch": 7.696132596685083, + "grad_norm": 0.17321841418743134, + "learning_rate": 1.3289020330223806e-05, + "loss": 1.7463, + "step": 25074 + }, + { + "epoch": 7.696439533456108, + "grad_norm": 0.2611647844314575, + "learning_rate": 1.3285645938582847e-05, + "loss": 1.811, + "step": 25075 + }, 
+ { + "epoch": 7.696746470227133, + "grad_norm": 0.18129383027553558, + "learning_rate": 1.3282271909774657e-05, + "loss": 1.7257, + "step": 25076 + }, + { + "epoch": 7.697053406998158, + "grad_norm": 0.19985437393188477, + "learning_rate": 1.3278898243832588e-05, + "loss": 1.7311, + "step": 25077 + }, + { + "epoch": 7.6973603437691835, + "grad_norm": 0.21517722308635712, + "learning_rate": 1.3275524940789941e-05, + "loss": 1.7582, + "step": 25078 + }, + { + "epoch": 7.697667280540209, + "grad_norm": 0.2302769422531128, + "learning_rate": 1.32721520006801e-05, + "loss": 1.7192, + "step": 25079 + }, + { + "epoch": 7.697974217311234, + "grad_norm": 0.18356913328170776, + "learning_rate": 1.3268779423536375e-05, + "loss": 1.6916, + "step": 25080 + }, + { + "epoch": 7.6982811540822595, + "grad_norm": 0.19134142994880676, + "learning_rate": 1.3265407209392105e-05, + "loss": 1.7309, + "step": 25081 + }, + { + "epoch": 7.698588090853284, + "grad_norm": 0.17634150385856628, + "learning_rate": 1.3262035358280605e-05, + "loss": 1.7537, + "step": 25082 + }, + { + "epoch": 7.698895027624309, + "grad_norm": 0.1921558827161789, + "learning_rate": 1.325866387023521e-05, + "loss": 1.7102, + "step": 25083 + }, + { + "epoch": 7.699201964395335, + "grad_norm": 0.15972480177879333, + "learning_rate": 1.3255292745289233e-05, + "loss": 1.6759, + "step": 25084 + }, + { + "epoch": 7.69950890116636, + "grad_norm": 0.15172120928764343, + "learning_rate": 1.325192198347599e-05, + "loss": 1.6766, + "step": 25085 + }, + { + "epoch": 7.699815837937384, + "grad_norm": 0.17827558517456055, + "learning_rate": 1.3248551584828777e-05, + "loss": 1.7421, + "step": 25086 + }, + { + "epoch": 7.70012277470841, + "grad_norm": 0.1675274819135666, + "learning_rate": 1.3245181549380948e-05, + "loss": 1.701, + "step": 25087 + }, + { + "epoch": 7.700429711479435, + "grad_norm": 0.17937950789928436, + "learning_rate": 1.3241811877165744e-05, + "loss": 1.7284, + "step": 25088 + }, + { + "epoch": 
7.7007366482504604, + "grad_norm": 0.16373637318611145, + "learning_rate": 1.3238442568216535e-05, + "loss": 1.6834, + "step": 25089 + }, + { + "epoch": 7.701043585021486, + "grad_norm": 0.16055652499198914, + "learning_rate": 1.3235073622566552e-05, + "loss": 1.7087, + "step": 25090 + }, + { + "epoch": 7.701350521792511, + "grad_norm": 0.15083225071430206, + "learning_rate": 1.3231705040249131e-05, + "loss": 1.7313, + "step": 25091 + }, + { + "epoch": 7.701657458563536, + "grad_norm": 0.21110820770263672, + "learning_rate": 1.322833682129756e-05, + "loss": 1.6758, + "step": 25092 + }, + { + "epoch": 7.701964395334561, + "grad_norm": 0.18439972400665283, + "learning_rate": 1.322496896574511e-05, + "loss": 1.737, + "step": 25093 + }, + { + "epoch": 7.702271332105586, + "grad_norm": 0.18655124306678772, + "learning_rate": 1.322160147362509e-05, + "loss": 1.7268, + "step": 25094 + }, + { + "epoch": 7.702578268876612, + "grad_norm": 0.17620640993118286, + "learning_rate": 1.3218234344970725e-05, + "loss": 1.6829, + "step": 25095 + }, + { + "epoch": 7.702885205647637, + "grad_norm": 0.19085893034934998, + "learning_rate": 1.3214867579815343e-05, + "loss": 1.7382, + "step": 25096 + }, + { + "epoch": 7.703192142418661, + "grad_norm": 0.2206689864397049, + "learning_rate": 1.3211501178192203e-05, + "loss": 1.7666, + "step": 25097 + }, + { + "epoch": 7.703499079189687, + "grad_norm": 0.2047509402036667, + "learning_rate": 1.320813514013457e-05, + "loss": 1.7209, + "step": 25098 + }, + { + "epoch": 7.703806015960712, + "grad_norm": 0.22249147295951843, + "learning_rate": 1.3204769465675709e-05, + "loss": 1.8067, + "step": 25099 + }, + { + "epoch": 7.704112952731737, + "grad_norm": 0.16225707530975342, + "learning_rate": 1.3201404154848885e-05, + "loss": 1.6715, + "step": 25100 + }, + { + "epoch": 7.704419889502763, + "grad_norm": 0.19165070354938507, + "learning_rate": 1.3198039207687352e-05, + "loss": 1.7233, + "step": 25101 + }, + { + "epoch": 7.704726826273788, + 
"grad_norm": 0.18720564246177673, + "learning_rate": 1.3194674624224368e-05, + "loss": 1.7129, + "step": 25102 + }, + { + "epoch": 7.7050337630448125, + "grad_norm": 0.16703814268112183, + "learning_rate": 1.3191310404493163e-05, + "loss": 1.7314, + "step": 25103 + }, + { + "epoch": 7.705340699815838, + "grad_norm": 0.20206168293952942, + "learning_rate": 1.3187946548527036e-05, + "loss": 1.7278, + "step": 25104 + }, + { + "epoch": 7.705647636586863, + "grad_norm": 0.1774030476808548, + "learning_rate": 1.3184583056359163e-05, + "loss": 1.6986, + "step": 25105 + }, + { + "epoch": 7.7059545733578885, + "grad_norm": 0.1729336827993393, + "learning_rate": 1.3181219928022853e-05, + "loss": 1.7251, + "step": 25106 + }, + { + "epoch": 7.706261510128914, + "grad_norm": 0.23351258039474487, + "learning_rate": 1.3177857163551276e-05, + "loss": 1.7311, + "step": 25107 + }, + { + "epoch": 7.706568446899938, + "grad_norm": 0.2041054517030716, + "learning_rate": 1.3174494762977713e-05, + "loss": 1.7122, + "step": 25108 + }, + { + "epoch": 7.706875383670964, + "grad_norm": 0.178013876080513, + "learning_rate": 1.3171132726335373e-05, + "loss": 1.7255, + "step": 25109 + }, + { + "epoch": 7.707182320441989, + "grad_norm": 0.19265221059322357, + "learning_rate": 1.3167771053657491e-05, + "loss": 1.6747, + "step": 25110 + }, + { + "epoch": 7.707489257213014, + "grad_norm": 0.18968601524829865, + "learning_rate": 1.3164409744977297e-05, + "loss": 1.71, + "step": 25111 + }, + { + "epoch": 7.70779619398404, + "grad_norm": 0.17041562497615814, + "learning_rate": 1.3161048800327963e-05, + "loss": 1.7202, + "step": 25112 + }, + { + "epoch": 7.708103130755065, + "grad_norm": 0.20094618201255798, + "learning_rate": 1.3157688219742754e-05, + "loss": 1.7375, + "step": 25113 + }, + { + "epoch": 7.708410067526089, + "grad_norm": 0.14012686908245087, + "learning_rate": 1.3154328003254862e-05, + "loss": 1.6426, + "step": 25114 + }, + { + "epoch": 7.708717004297115, + "grad_norm": 
0.18826791644096375, + "learning_rate": 1.3150968150897497e-05, + "loss": 1.7114, + "step": 25115 + }, + { + "epoch": 7.70902394106814, + "grad_norm": 0.15521864593029022, + "learning_rate": 1.3147608662703864e-05, + "loss": 1.7031, + "step": 25116 + }, + { + "epoch": 7.709330877839165, + "grad_norm": 0.19424815475940704, + "learning_rate": 1.314424953870716e-05, + "loss": 1.6815, + "step": 25117 + }, + { + "epoch": 7.70963781461019, + "grad_norm": 0.30089494585990906, + "learning_rate": 1.3140890778940584e-05, + "loss": 1.7444, + "step": 25118 + }, + { + "epoch": 7.709944751381215, + "grad_norm": 0.1784239560365677, + "learning_rate": 1.3137532383437334e-05, + "loss": 1.6659, + "step": 25119 + }, + { + "epoch": 7.7102516881522405, + "grad_norm": 0.18670935928821564, + "learning_rate": 1.3134174352230571e-05, + "loss": 1.7007, + "step": 25120 + }, + { + "epoch": 7.710558624923266, + "grad_norm": 0.21140475571155548, + "learning_rate": 1.3130816685353541e-05, + "loss": 1.7716, + "step": 25121 + }, + { + "epoch": 7.710865561694291, + "grad_norm": 0.20546187460422516, + "learning_rate": 1.3127459382839363e-05, + "loss": 1.6434, + "step": 25122 + }, + { + "epoch": 7.7111724984653165, + "grad_norm": 0.15188902616500854, + "learning_rate": 1.312410244472127e-05, + "loss": 1.6843, + "step": 25123 + }, + { + "epoch": 7.711479435236341, + "grad_norm": 0.2020019143819809, + "learning_rate": 1.3120745871032375e-05, + "loss": 1.6846, + "step": 25124 + }, + { + "epoch": 7.711786372007366, + "grad_norm": 0.19839881360530853, + "learning_rate": 1.3117389661805907e-05, + "loss": 1.7026, + "step": 25125 + }, + { + "epoch": 7.712093308778392, + "grad_norm": 0.19400818645954132, + "learning_rate": 1.311403381707501e-05, + "loss": 1.705, + "step": 25126 + }, + { + "epoch": 7.712400245549417, + "grad_norm": 0.21366959810256958, + "learning_rate": 1.311067833687285e-05, + "loss": 1.7184, + "step": 25127 + }, + { + "epoch": 7.712707182320442, + "grad_norm": 0.17402227222919464, + 
"learning_rate": 1.3107323221232604e-05, + "loss": 1.6613, + "step": 25128 + }, + { + "epoch": 7.713014119091467, + "grad_norm": 0.24356254935264587, + "learning_rate": 1.3103968470187384e-05, + "loss": 1.7343, + "step": 25129 + }, + { + "epoch": 7.713321055862492, + "grad_norm": 0.18612951040267944, + "learning_rate": 1.3100614083770386e-05, + "loss": 1.7298, + "step": 25130 + }, + { + "epoch": 7.713627992633517, + "grad_norm": 0.27073535323143005, + "learning_rate": 1.3097260062014743e-05, + "loss": 1.7554, + "step": 25131 + }, + { + "epoch": 7.713934929404543, + "grad_norm": 0.1498921662569046, + "learning_rate": 1.309390640495361e-05, + "loss": 1.6506, + "step": 25132 + }, + { + "epoch": 7.714241866175568, + "grad_norm": 0.2159748524427414, + "learning_rate": 1.309055311262013e-05, + "loss": 1.6549, + "step": 25133 + }, + { + "epoch": 7.714548802946593, + "grad_norm": 0.2060365229845047, + "learning_rate": 1.3087200185047433e-05, + "loss": 1.7224, + "step": 25134 + }, + { + "epoch": 7.714855739717618, + "grad_norm": 0.22525639832019806, + "learning_rate": 1.3083847622268659e-05, + "loss": 1.7508, + "step": 25135 + }, + { + "epoch": 7.715162676488643, + "grad_norm": 0.20023567974567413, + "learning_rate": 1.3080495424316936e-05, + "loss": 1.7277, + "step": 25136 + }, + { + "epoch": 7.7154696132596685, + "grad_norm": 0.19702760875225067, + "learning_rate": 1.3077143591225389e-05, + "loss": 1.7291, + "step": 25137 + }, + { + "epoch": 7.715776550030694, + "grad_norm": 0.1713123917579651, + "learning_rate": 1.3073792123027173e-05, + "loss": 1.689, + "step": 25138 + }, + { + "epoch": 7.716083486801719, + "grad_norm": 0.17696695029735565, + "learning_rate": 1.3070441019755358e-05, + "loss": 1.6816, + "step": 25139 + }, + { + "epoch": 7.716390423572744, + "grad_norm": 0.1802004724740982, + "learning_rate": 1.3067090281443122e-05, + "loss": 1.754, + "step": 25140 + }, + { + "epoch": 7.716697360343769, + "grad_norm": 0.1829070895910263, + "learning_rate": 
1.3063739908123518e-05, + "loss": 1.7389, + "step": 25141 + }, + { + "epoch": 7.717004297114794, + "grad_norm": 0.16842049360275269, + "learning_rate": 1.30603898998297e-05, + "loss": 1.7257, + "step": 25142 + }, + { + "epoch": 7.71731123388582, + "grad_norm": 0.18215791881084442, + "learning_rate": 1.305704025659476e-05, + "loss": 1.6765, + "step": 25143 + }, + { + "epoch": 7.717618170656845, + "grad_norm": 0.16992273926734924, + "learning_rate": 1.3053690978451799e-05, + "loss": 1.6729, + "step": 25144 + }, + { + "epoch": 7.71792510742787, + "grad_norm": 0.1847899854183197, + "learning_rate": 1.3050342065433935e-05, + "loss": 1.6972, + "step": 25145 + }, + { + "epoch": 7.718232044198895, + "grad_norm": 0.18730273842811584, + "learning_rate": 1.3046993517574219e-05, + "loss": 1.6996, + "step": 25146 + }, + { + "epoch": 7.71853898096992, + "grad_norm": 0.1695355772972107, + "learning_rate": 1.304364533490578e-05, + "loss": 1.7581, + "step": 25147 + }, + { + "epoch": 7.718845917740945, + "grad_norm": 0.17106328904628754, + "learning_rate": 1.3040297517461709e-05, + "loss": 1.6479, + "step": 25148 + }, + { + "epoch": 7.719152854511971, + "grad_norm": 0.1726374626159668, + "learning_rate": 1.3036950065275072e-05, + "loss": 1.7078, + "step": 25149 + }, + { + "epoch": 7.719459791282996, + "grad_norm": 0.21725010871887207, + "learning_rate": 1.3033602978378962e-05, + "loss": 1.8195, + "step": 25150 + }, + { + "epoch": 7.7197667280540205, + "grad_norm": 0.24786241352558136, + "learning_rate": 1.3030256256806455e-05, + "loss": 1.7439, + "step": 25151 + }, + { + "epoch": 7.720073664825046, + "grad_norm": 0.16550323367118835, + "learning_rate": 1.3026909900590622e-05, + "loss": 1.7267, + "step": 25152 + }, + { + "epoch": 7.720380601596071, + "grad_norm": 0.1833605021238327, + "learning_rate": 1.3023563909764542e-05, + "loss": 1.6675, + "step": 25153 + }, + { + "epoch": 7.7206875383670965, + "grad_norm": 0.16360491514205933, + "learning_rate": 1.3020218284361268e-05, + 
"loss": 1.684, + "step": 25154 + }, + { + "epoch": 7.720994475138122, + "grad_norm": 0.20423299074172974, + "learning_rate": 1.3016873024413878e-05, + "loss": 1.708, + "step": 25155 + }, + { + "epoch": 7.721301411909147, + "grad_norm": 0.1743123084306717, + "learning_rate": 1.301352812995541e-05, + "loss": 1.7497, + "step": 25156 + }, + { + "epoch": 7.721608348680172, + "grad_norm": 0.237883523106575, + "learning_rate": 1.301018360101896e-05, + "loss": 1.6859, + "step": 25157 + }, + { + "epoch": 7.721915285451197, + "grad_norm": 0.17953886091709137, + "learning_rate": 1.300683943763753e-05, + "loss": 1.6948, + "step": 25158 + }, + { + "epoch": 7.722222222222222, + "grad_norm": 0.19036953151226044, + "learning_rate": 1.3003495639844209e-05, + "loss": 1.7207, + "step": 25159 + }, + { + "epoch": 7.722529158993248, + "grad_norm": 0.17385275661945343, + "learning_rate": 1.3000152207672028e-05, + "loss": 1.7088, + "step": 25160 + }, + { + "epoch": 7.722836095764272, + "grad_norm": 0.1848379373550415, + "learning_rate": 1.2996809141154031e-05, + "loss": 1.7351, + "step": 25161 + }, + { + "epoch": 7.723143032535297, + "grad_norm": 0.1964390128850937, + "learning_rate": 1.2993466440323271e-05, + "loss": 1.7243, + "step": 25162 + }, + { + "epoch": 7.723449969306323, + "grad_norm": 0.23729266226291656, + "learning_rate": 1.299012410521273e-05, + "loss": 1.7588, + "step": 25163 + }, + { + "epoch": 7.723756906077348, + "grad_norm": 0.16980098187923431, + "learning_rate": 1.2986782135855496e-05, + "loss": 1.7092, + "step": 25164 + }, + { + "epoch": 7.724063842848373, + "grad_norm": 0.1993054747581482, + "learning_rate": 1.2983440532284568e-05, + "loss": 1.7245, + "step": 25165 + }, + { + "epoch": 7.724370779619399, + "grad_norm": 0.18817138671875, + "learning_rate": 1.2980099294532982e-05, + "loss": 1.7019, + "step": 25166 + }, + { + "epoch": 7.724677716390423, + "grad_norm": 0.20675966143608093, + "learning_rate": 1.297675842263375e-05, + "loss": 1.6949, + "step": 25167 + }, + 
{ + "epoch": 7.7249846531614486, + "grad_norm": 0.21214626729488373, + "learning_rate": 1.2973417916619895e-05, + "loss": 1.7056, + "step": 25168 + }, + { + "epoch": 7.725291589932474, + "grad_norm": 0.1676976978778839, + "learning_rate": 1.2970077776524426e-05, + "loss": 1.7183, + "step": 25169 + }, + { + "epoch": 7.725598526703499, + "grad_norm": 0.2368413507938385, + "learning_rate": 1.2966738002380347e-05, + "loss": 1.7868, + "step": 25170 + }, + { + "epoch": 7.725905463474525, + "grad_norm": 0.22054153680801392, + "learning_rate": 1.2963398594220672e-05, + "loss": 1.7214, + "step": 25171 + }, + { + "epoch": 7.726212400245549, + "grad_norm": 0.20026426017284393, + "learning_rate": 1.2960059552078402e-05, + "loss": 1.7703, + "step": 25172 + }, + { + "epoch": 7.726519337016574, + "grad_norm": 0.1900193840265274, + "learning_rate": 1.2956720875986516e-05, + "loss": 1.7513, + "step": 25173 + }, + { + "epoch": 7.7268262737876, + "grad_norm": 0.17151880264282227, + "learning_rate": 1.2953382565978057e-05, + "loss": 1.7382, + "step": 25174 + }, + { + "epoch": 7.727133210558625, + "grad_norm": 0.2654723525047302, + "learning_rate": 1.2950044622085955e-05, + "loss": 1.7526, + "step": 25175 + }, + { + "epoch": 7.72744014732965, + "grad_norm": 0.19927532970905304, + "learning_rate": 1.2946707044343259e-05, + "loss": 1.7208, + "step": 25176 + }, + { + "epoch": 7.727747084100676, + "grad_norm": 0.3037160038948059, + "learning_rate": 1.2943369832782887e-05, + "loss": 1.8081, + "step": 25177 + }, + { + "epoch": 7.7280540208717, + "grad_norm": 0.20067723095417023, + "learning_rate": 1.2940032987437873e-05, + "loss": 1.685, + "step": 25178 + }, + { + "epoch": 7.7283609576427255, + "grad_norm": 0.16820429265499115, + "learning_rate": 1.2936696508341189e-05, + "loss": 1.7328, + "step": 25179 + }, + { + "epoch": 7.728667894413751, + "grad_norm": 0.15474672615528107, + "learning_rate": 1.2933360395525763e-05, + "loss": 1.708, + "step": 25180 + }, + { + "epoch": 7.728974831184776, + 
"grad_norm": 0.17825615406036377, + "learning_rate": 1.2930024649024609e-05, + "loss": 1.7416, + "step": 25181 + }, + { + "epoch": 7.7292817679558015, + "grad_norm": 0.20498061180114746, + "learning_rate": 1.292668926887068e-05, + "loss": 1.736, + "step": 25182 + }, + { + "epoch": 7.729588704726826, + "grad_norm": 0.22965869307518005, + "learning_rate": 1.2923354255096937e-05, + "loss": 1.7167, + "step": 25183 + }, + { + "epoch": 7.729895641497851, + "grad_norm": 0.1687164008617401, + "learning_rate": 1.2920019607736338e-05, + "loss": 1.6988, + "step": 25184 + }, + { + "epoch": 7.730202578268877, + "grad_norm": 0.18255390226840973, + "learning_rate": 1.2916685326821842e-05, + "loss": 1.6891, + "step": 25185 + }, + { + "epoch": 7.730509515039902, + "grad_norm": 0.1519697606563568, + "learning_rate": 1.2913351412386393e-05, + "loss": 1.6553, + "step": 25186 + }, + { + "epoch": 7.730816451810927, + "grad_norm": 0.19137845933437347, + "learning_rate": 1.2910017864462942e-05, + "loss": 1.7246, + "step": 25187 + }, + { + "epoch": 7.731123388581953, + "grad_norm": 0.19998718798160553, + "learning_rate": 1.2906684683084436e-05, + "loss": 1.7324, + "step": 25188 + }, + { + "epoch": 7.731430325352977, + "grad_norm": 0.18066956102848053, + "learning_rate": 1.2903351868283808e-05, + "loss": 1.7299, + "step": 25189 + }, + { + "epoch": 7.731737262124002, + "grad_norm": 0.18489640951156616, + "learning_rate": 1.290001942009399e-05, + "loss": 1.7249, + "step": 25190 + }, + { + "epoch": 7.732044198895028, + "grad_norm": 0.14994095265865326, + "learning_rate": 1.2896687338547958e-05, + "loss": 1.6466, + "step": 25191 + }, + { + "epoch": 7.732351135666053, + "grad_norm": 0.19937917590141296, + "learning_rate": 1.2893355623678571e-05, + "loss": 1.7298, + "step": 25192 + }, + { + "epoch": 7.7326580724370775, + "grad_norm": 0.1435725837945938, + "learning_rate": 1.2890024275518826e-05, + "loss": 1.7384, + "step": 25193 + }, + { + "epoch": 7.732965009208103, + "grad_norm": 
0.23283594846725464, + "learning_rate": 1.2886693294101582e-05, + "loss": 1.7765, + "step": 25194 + }, + { + "epoch": 7.733271945979128, + "grad_norm": 0.15489891171455383, + "learning_rate": 1.2883362679459803e-05, + "loss": 1.6911, + "step": 25195 + }, + { + "epoch": 7.7335788827501535, + "grad_norm": 0.17880970239639282, + "learning_rate": 1.2880032431626404e-05, + "loss": 1.6557, + "step": 25196 + }, + { + "epoch": 7.733885819521179, + "grad_norm": 0.1717783808708191, + "learning_rate": 1.287670255063425e-05, + "loss": 1.7112, + "step": 25197 + }, + { + "epoch": 7.734192756292204, + "grad_norm": 0.17371709644794464, + "learning_rate": 1.2873373036516313e-05, + "loss": 1.7591, + "step": 25198 + }, + { + "epoch": 7.734499693063229, + "grad_norm": 0.15894445776939392, + "learning_rate": 1.2870043889305432e-05, + "loss": 1.6615, + "step": 25199 + }, + { + "epoch": 7.734806629834254, + "grad_norm": 0.17047199606895447, + "learning_rate": 1.2866715109034554e-05, + "loss": 1.7376, + "step": 25200 + }, + { + "epoch": 7.735113566605279, + "grad_norm": 0.17434459924697876, + "learning_rate": 1.2863386695736562e-05, + "loss": 1.6871, + "step": 25201 + }, + { + "epoch": 7.735420503376305, + "grad_norm": 0.18515460193157196, + "learning_rate": 1.2860058649444351e-05, + "loss": 1.7475, + "step": 25202 + }, + { + "epoch": 7.73572744014733, + "grad_norm": 0.1510036140680313, + "learning_rate": 1.2856730970190806e-05, + "loss": 1.7101, + "step": 25203 + }, + { + "epoch": 7.736034376918354, + "grad_norm": 0.1886061728000641, + "learning_rate": 1.2853403658008817e-05, + "loss": 1.7253, + "step": 25204 + }, + { + "epoch": 7.73634131368938, + "grad_norm": 0.15830372273921967, + "learning_rate": 1.2850076712931269e-05, + "loss": 1.7024, + "step": 25205 + }, + { + "epoch": 7.736648250460405, + "grad_norm": 0.3030432462692261, + "learning_rate": 1.2846750134991031e-05, + "loss": 1.7702, + "step": 25206 + }, + { + "epoch": 7.73695518723143, + "grad_norm": 0.1946970373392105, + 
"learning_rate": 1.2843423924220977e-05, + "loss": 1.7199, + "step": 25207 + }, + { + "epoch": 7.737262124002456, + "grad_norm": 0.19842801988124847, + "learning_rate": 1.2840098080654012e-05, + "loss": 1.7435, + "step": 25208 + }, + { + "epoch": 7.737569060773481, + "grad_norm": 0.17269715666770935, + "learning_rate": 1.2836772604322945e-05, + "loss": 1.6837, + "step": 25209 + }, + { + "epoch": 7.7378759975445055, + "grad_norm": 0.14366893470287323, + "learning_rate": 1.2833447495260703e-05, + "loss": 1.6453, + "step": 25210 + }, + { + "epoch": 7.738182934315531, + "grad_norm": 0.2189856618642807, + "learning_rate": 1.283012275350009e-05, + "loss": 1.7341, + "step": 25211 + }, + { + "epoch": 7.738489871086556, + "grad_norm": 0.14334678649902344, + "learning_rate": 1.2826798379074007e-05, + "loss": 1.6505, + "step": 25212 + }, + { + "epoch": 7.7387968078575815, + "grad_norm": 0.2020469605922699, + "learning_rate": 1.2823474372015304e-05, + "loss": 1.7915, + "step": 25213 + }, + { + "epoch": 7.739103744628607, + "grad_norm": 0.14702250063419342, + "learning_rate": 1.2820150732356783e-05, + "loss": 1.6682, + "step": 25214 + }, + { + "epoch": 7.739410681399631, + "grad_norm": 0.2310563623905182, + "learning_rate": 1.281682746013136e-05, + "loss": 1.7447, + "step": 25215 + }, + { + "epoch": 7.739717618170657, + "grad_norm": 0.16534216701984406, + "learning_rate": 1.2813504555371808e-05, + "loss": 1.6641, + "step": 25216 + }, + { + "epoch": 7.740024554941682, + "grad_norm": 0.1390565037727356, + "learning_rate": 1.2810182018111012e-05, + "loss": 1.6912, + "step": 25217 + }, + { + "epoch": 7.740331491712707, + "grad_norm": 0.16568928956985474, + "learning_rate": 1.2806859848381797e-05, + "loss": 1.7375, + "step": 25218 + }, + { + "epoch": 7.740638428483733, + "grad_norm": 0.18870174884796143, + "learning_rate": 1.2803538046216995e-05, + "loss": 1.7158, + "step": 25219 + }, + { + "epoch": 7.740945365254758, + "grad_norm": 0.18347607553005219, + "learning_rate": 
1.2800216611649429e-05, + "loss": 1.7766, + "step": 25220 + }, + { + "epoch": 7.741252302025782, + "grad_norm": 0.21285377442836761, + "learning_rate": 1.2796895544711929e-05, + "loss": 1.6876, + "step": 25221 + }, + { + "epoch": 7.741559238796808, + "grad_norm": 0.26524603366851807, + "learning_rate": 1.2793574845437311e-05, + "loss": 1.6679, + "step": 25222 + }, + { + "epoch": 7.741866175567833, + "grad_norm": 0.1671147346496582, + "learning_rate": 1.2790254513858397e-05, + "loss": 1.6853, + "step": 25223 + }, + { + "epoch": 7.742173112338858, + "grad_norm": 0.21713866293430328, + "learning_rate": 1.2786934550007979e-05, + "loss": 1.8124, + "step": 25224 + }, + { + "epoch": 7.742480049109884, + "grad_norm": 0.17161360383033752, + "learning_rate": 1.2783614953918916e-05, + "loss": 1.6862, + "step": 25225 + }, + { + "epoch": 7.742786985880908, + "grad_norm": 0.1513087898492813, + "learning_rate": 1.2780295725623947e-05, + "loss": 1.6644, + "step": 25226 + }, + { + "epoch": 7.7430939226519335, + "grad_norm": 0.13013005256652832, + "learning_rate": 1.2776976865155948e-05, + "loss": 1.6612, + "step": 25227 + }, + { + "epoch": 7.743400859422959, + "grad_norm": 0.15204063057899475, + "learning_rate": 1.2773658372547648e-05, + "loss": 1.6391, + "step": 25228 + }, + { + "epoch": 7.743707796193984, + "grad_norm": 0.15421196818351746, + "learning_rate": 1.2770340247831891e-05, + "loss": 1.7005, + "step": 25229 + }, + { + "epoch": 7.7440147329650095, + "grad_norm": 0.14045587182044983, + "learning_rate": 1.276702249104147e-05, + "loss": 1.6448, + "step": 25230 + }, + { + "epoch": 7.744321669736035, + "grad_norm": 0.17244049906730652, + "learning_rate": 1.2763705102209123e-05, + "loss": 1.6737, + "step": 25231 + }, + { + "epoch": 7.744628606507059, + "grad_norm": 0.16891124844551086, + "learning_rate": 1.2760388081367697e-05, + "loss": 1.6625, + "step": 25232 + }, + { + "epoch": 7.744935543278085, + "grad_norm": 0.18271134793758392, + "learning_rate": 1.275707142854991e-05, + 
"loss": 1.6963, + "step": 25233 + }, + { + "epoch": 7.74524248004911, + "grad_norm": 0.18582625687122345, + "learning_rate": 1.2753755143788593e-05, + "loss": 1.6731, + "step": 25234 + }, + { + "epoch": 7.745549416820135, + "grad_norm": 0.17610707879066467, + "learning_rate": 1.2750439227116495e-05, + "loss": 1.6976, + "step": 25235 + }, + { + "epoch": 7.74585635359116, + "grad_norm": 0.20406337082386017, + "learning_rate": 1.2747123678566391e-05, + "loss": 1.7287, + "step": 25236 + }, + { + "epoch": 7.746163290362185, + "grad_norm": 0.16879913210868835, + "learning_rate": 1.2743808498171046e-05, + "loss": 1.6594, + "step": 25237 + }, + { + "epoch": 7.74647022713321, + "grad_norm": 0.1405191272497177, + "learning_rate": 1.2740493685963217e-05, + "loss": 1.6565, + "step": 25238 + }, + { + "epoch": 7.746777163904236, + "grad_norm": 0.1460784375667572, + "learning_rate": 1.2737179241975671e-05, + "loss": 1.6336, + "step": 25239 + }, + { + "epoch": 7.747084100675261, + "grad_norm": 0.16206084191799164, + "learning_rate": 1.273386516624116e-05, + "loss": 1.7501, + "step": 25240 + }, + { + "epoch": 7.747391037446286, + "grad_norm": 0.17040394246578217, + "learning_rate": 1.2730551458792422e-05, + "loss": 1.7532, + "step": 25241 + }, + { + "epoch": 7.747697974217311, + "grad_norm": 0.15487439930438995, + "learning_rate": 1.2727238119662243e-05, + "loss": 1.6757, + "step": 25242 + }, + { + "epoch": 7.748004910988336, + "grad_norm": 0.139495387673378, + "learning_rate": 1.272392514888332e-05, + "loss": 1.6431, + "step": 25243 + }, + { + "epoch": 7.7483118477593615, + "grad_norm": 0.16329489648342133, + "learning_rate": 1.2720612546488447e-05, + "loss": 1.7353, + "step": 25244 + }, + { + "epoch": 7.748618784530387, + "grad_norm": 0.14997398853302002, + "learning_rate": 1.27173003125103e-05, + "loss": 1.6977, + "step": 25245 + }, + { + "epoch": 7.748925721301412, + "grad_norm": 0.2005717009305954, + "learning_rate": 1.2713988446981656e-05, + "loss": 1.757, + "step": 25246 + 
}, + { + "epoch": 7.749232658072437, + "grad_norm": 0.2027040272951126, + "learning_rate": 1.2710676949935246e-05, + "loss": 1.7506, + "step": 25247 + }, + { + "epoch": 7.749539594843462, + "grad_norm": 0.18176981806755066, + "learning_rate": 1.2707365821403755e-05, + "loss": 1.7132, + "step": 25248 + }, + { + "epoch": 7.749846531614487, + "grad_norm": 0.18690772354602814, + "learning_rate": 1.2704055061419961e-05, + "loss": 1.7725, + "step": 25249 + }, + { + "epoch": 7.750153468385513, + "grad_norm": 0.18360945582389832, + "learning_rate": 1.270074467001653e-05, + "loss": 1.6779, + "step": 25250 + }, + { + "epoch": 7.750460405156538, + "grad_norm": 0.18498149514198303, + "learning_rate": 1.269743464722621e-05, + "loss": 1.7105, + "step": 25251 + }, + { + "epoch": 7.750767341927563, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.2694124993081707e-05, + "loss": 1.7273, + "step": 25252 + }, + { + "epoch": 7.751074278698588, + "grad_norm": 0.17312094569206238, + "learning_rate": 1.2690815707615727e-05, + "loss": 1.7532, + "step": 25253 + }, + { + "epoch": 7.751381215469613, + "grad_norm": 0.18758632242679596, + "learning_rate": 1.2687506790860976e-05, + "loss": 1.7394, + "step": 25254 + }, + { + "epoch": 7.7516881522406385, + "grad_norm": 0.1642044633626938, + "learning_rate": 1.2684198242850149e-05, + "loss": 1.6699, + "step": 25255 + }, + { + "epoch": 7.751995089011664, + "grad_norm": 0.34566664695739746, + "learning_rate": 1.2680890063615947e-05, + "loss": 1.7048, + "step": 25256 + }, + { + "epoch": 7.752302025782689, + "grad_norm": 0.15046556293964386, + "learning_rate": 1.2677582253191066e-05, + "loss": 1.659, + "step": 25257 + }, + { + "epoch": 7.752608962553714, + "grad_norm": 0.1504966914653778, + "learning_rate": 1.2674274811608171e-05, + "loss": 1.6841, + "step": 25258 + }, + { + "epoch": 7.752915899324739, + "grad_norm": 0.2226656973361969, + "learning_rate": 1.2670967738900009e-05, + "loss": 1.7139, + "step": 25259 + }, + { + "epoch": 
7.753222836095764, + "grad_norm": 0.18797673285007477, + "learning_rate": 1.2667661035099188e-05, + "loss": 1.7726, + "step": 25260 + }, + { + "epoch": 7.75352977286679, + "grad_norm": 0.15428531169891357, + "learning_rate": 1.266435470023845e-05, + "loss": 1.6831, + "step": 25261 + }, + { + "epoch": 7.753836709637815, + "grad_norm": 0.20027057826519012, + "learning_rate": 1.2661048734350412e-05, + "loss": 1.741, + "step": 25262 + }, + { + "epoch": 7.75414364640884, + "grad_norm": 0.14779487252235413, + "learning_rate": 1.2657743137467793e-05, + "loss": 1.6974, + "step": 25263 + }, + { + "epoch": 7.754450583179865, + "grad_norm": 0.17618241906166077, + "learning_rate": 1.2654437909623258e-05, + "loss": 1.7374, + "step": 25264 + }, + { + "epoch": 7.75475751995089, + "grad_norm": 0.18769881129264832, + "learning_rate": 1.2651133050849423e-05, + "loss": 1.7241, + "step": 25265 + }, + { + "epoch": 7.755064456721915, + "grad_norm": 0.18645870685577393, + "learning_rate": 1.2647828561179015e-05, + "loss": 1.7176, + "step": 25266 + }, + { + "epoch": 7.755371393492941, + "grad_norm": 0.17507290840148926, + "learning_rate": 1.2644524440644628e-05, + "loss": 1.6994, + "step": 25267 + }, + { + "epoch": 7.755678330263965, + "grad_norm": 0.15264524519443512, + "learning_rate": 1.264122068927896e-05, + "loss": 1.6993, + "step": 25268 + }, + { + "epoch": 7.7559852670349905, + "grad_norm": 0.1749732941389084, + "learning_rate": 1.263791730711465e-05, + "loss": 1.7265, + "step": 25269 + }, + { + "epoch": 7.756292203806016, + "grad_norm": 0.15777049958705902, + "learning_rate": 1.2634614294184332e-05, + "loss": 1.6219, + "step": 25270 + }, + { + "epoch": 7.756599140577041, + "grad_norm": 0.17740310728549957, + "learning_rate": 1.263131165052066e-05, + "loss": 1.7373, + "step": 25271 + }, + { + "epoch": 7.7569060773480665, + "grad_norm": 0.22577044367790222, + "learning_rate": 1.262800937615627e-05, + "loss": 1.7492, + "step": 25272 + }, + { + "epoch": 7.757213014119092, + 
"grad_norm": 0.155413419008255, + "learning_rate": 1.2624707471123791e-05, + "loss": 1.7037, + "step": 25273 + }, + { + "epoch": 7.757519950890116, + "grad_norm": 0.1755802482366562, + "learning_rate": 1.2621405935455866e-05, + "loss": 1.7057, + "step": 25274 + }, + { + "epoch": 7.757826887661142, + "grad_norm": 0.15870101749897003, + "learning_rate": 1.2618104769185096e-05, + "loss": 1.6951, + "step": 25275 + }, + { + "epoch": 7.758133824432167, + "grad_norm": 0.18285419046878815, + "learning_rate": 1.2614803972344158e-05, + "loss": 1.7443, + "step": 25276 + }, + { + "epoch": 7.758440761203192, + "grad_norm": 0.1669059544801712, + "learning_rate": 1.2611503544965609e-05, + "loss": 1.6442, + "step": 25277 + }, + { + "epoch": 7.758747697974218, + "grad_norm": 0.17830590903759003, + "learning_rate": 1.2608203487082121e-05, + "loss": 1.7432, + "step": 25278 + }, + { + "epoch": 7.759054634745242, + "grad_norm": 0.18318989872932434, + "learning_rate": 1.2604903798726259e-05, + "loss": 1.7128, + "step": 25279 + }, + { + "epoch": 7.759361571516267, + "grad_norm": 0.17735294997692108, + "learning_rate": 1.2601604479930663e-05, + "loss": 1.6719, + "step": 25280 + }, + { + "epoch": 7.759668508287293, + "grad_norm": 0.14324752986431122, + "learning_rate": 1.2598305530727949e-05, + "loss": 1.688, + "step": 25281 + }, + { + "epoch": 7.759975445058318, + "grad_norm": 0.17677859961986542, + "learning_rate": 1.2595006951150678e-05, + "loss": 1.7016, + "step": 25282 + }, + { + "epoch": 7.760282381829343, + "grad_norm": 0.16832831501960754, + "learning_rate": 1.2591708741231495e-05, + "loss": 1.6669, + "step": 25283 + }, + { + "epoch": 7.760589318600369, + "grad_norm": 0.20717547833919525, + "learning_rate": 1.2588410901002944e-05, + "loss": 1.7275, + "step": 25284 + }, + { + "epoch": 7.760896255371393, + "grad_norm": 0.2471853792667389, + "learning_rate": 1.2585113430497658e-05, + "loss": 1.779, + "step": 25285 + }, + { + "epoch": 7.7612031921424185, + "grad_norm": 
0.2646878957748413, + "learning_rate": 1.2581816329748214e-05, + "loss": 1.8003, + "step": 25286 + }, + { + "epoch": 7.761510128913444, + "grad_norm": 0.2102949321269989, + "learning_rate": 1.2578519598787191e-05, + "loss": 1.764, + "step": 25287 + }, + { + "epoch": 7.761817065684469, + "grad_norm": 0.16151423752307892, + "learning_rate": 1.2575223237647171e-05, + "loss": 1.7233, + "step": 25288 + }, + { + "epoch": 7.7621240024554945, + "grad_norm": 0.22221817076206207, + "learning_rate": 1.2571927246360727e-05, + "loss": 1.7485, + "step": 25289 + }, + { + "epoch": 7.762430939226519, + "grad_norm": 0.16470851004123688, + "learning_rate": 1.2568631624960441e-05, + "loss": 1.6844, + "step": 25290 + }, + { + "epoch": 7.762737875997544, + "grad_norm": 0.17529261112213135, + "learning_rate": 1.256533637347887e-05, + "loss": 1.7409, + "step": 25291 + }, + { + "epoch": 7.76304481276857, + "grad_norm": 0.19055718183517456, + "learning_rate": 1.2562041491948579e-05, + "loss": 1.6861, + "step": 25292 + }, + { + "epoch": 7.763351749539595, + "grad_norm": 0.19183041155338287, + "learning_rate": 1.2558746980402159e-05, + "loss": 1.7493, + "step": 25293 + }, + { + "epoch": 7.76365868631062, + "grad_norm": 0.20031596720218658, + "learning_rate": 1.2555452838872123e-05, + "loss": 1.705, + "step": 25294 + }, + { + "epoch": 7.763965623081646, + "grad_norm": 0.16234149038791656, + "learning_rate": 1.2552159067391072e-05, + "loss": 1.7407, + "step": 25295 + }, + { + "epoch": 7.76427255985267, + "grad_norm": 0.15412569046020508, + "learning_rate": 1.254886566599151e-05, + "loss": 1.6599, + "step": 25296 + }, + { + "epoch": 7.764579496623695, + "grad_norm": 0.17393885552883148, + "learning_rate": 1.2545572634706022e-05, + "loss": 1.7372, + "step": 25297 + }, + { + "epoch": 7.764886433394721, + "grad_norm": 0.18662036955356598, + "learning_rate": 1.254227997356715e-05, + "loss": 1.7681, + "step": 25298 + }, + { + "epoch": 7.765193370165746, + "grad_norm": 0.16661690175533295, + 
"learning_rate": 1.2538987682607395e-05, + "loss": 1.754, + "step": 25299 + }, + { + "epoch": 7.765500306936771, + "grad_norm": 0.21453191339969635, + "learning_rate": 1.253569576185935e-05, + "loss": 1.7802, + "step": 25300 + }, + { + "epoch": 7.765807243707796, + "grad_norm": 0.14639903604984283, + "learning_rate": 1.2532404211355486e-05, + "loss": 1.6478, + "step": 25301 + }, + { + "epoch": 7.766114180478821, + "grad_norm": 0.17430682480335236, + "learning_rate": 1.2529113031128382e-05, + "loss": 1.687, + "step": 25302 + }, + { + "epoch": 7.7664211172498465, + "grad_norm": 0.21582552790641785, + "learning_rate": 1.2525822221210543e-05, + "loss": 1.7723, + "step": 25303 + }, + { + "epoch": 7.766728054020872, + "grad_norm": 0.21142803132534027, + "learning_rate": 1.2522531781634495e-05, + "loss": 1.7986, + "step": 25304 + }, + { + "epoch": 7.767034990791897, + "grad_norm": 0.1637791097164154, + "learning_rate": 1.251924171243275e-05, + "loss": 1.6884, + "step": 25305 + }, + { + "epoch": 7.7673419275629225, + "grad_norm": 0.19218359887599945, + "learning_rate": 1.2515952013637832e-05, + "loss": 1.7972, + "step": 25306 + }, + { + "epoch": 7.767648864333947, + "grad_norm": 0.14534975588321686, + "learning_rate": 1.2512662685282245e-05, + "loss": 1.6602, + "step": 25307 + }, + { + "epoch": 7.767955801104972, + "grad_norm": 0.2955080568790436, + "learning_rate": 1.2509373727398494e-05, + "loss": 1.763, + "step": 25308 + }, + { + "epoch": 7.768262737875998, + "grad_norm": 0.17220059037208557, + "learning_rate": 1.2506085140019086e-05, + "loss": 1.672, + "step": 25309 + }, + { + "epoch": 7.768569674647023, + "grad_norm": 0.17092043161392212, + "learning_rate": 1.2502796923176524e-05, + "loss": 1.7014, + "step": 25310 + }, + { + "epoch": 7.768876611418047, + "grad_norm": 0.2363509237766266, + "learning_rate": 1.2499509076903288e-05, + "loss": 1.7489, + "step": 25311 + }, + { + "epoch": 7.769183548189073, + "grad_norm": 0.19223156571388245, + "learning_rate": 
1.2496221601231906e-05, + "loss": 1.7194, + "step": 25312 + }, + { + "epoch": 7.769490484960098, + "grad_norm": 0.18292652070522308, + "learning_rate": 1.249293449619483e-05, + "loss": 1.7422, + "step": 25313 + }, + { + "epoch": 7.769797421731123, + "grad_norm": 0.17120866477489471, + "learning_rate": 1.2489647761824547e-05, + "loss": 1.7367, + "step": 25314 + }, + { + "epoch": 7.770104358502149, + "grad_norm": 0.22178049385547638, + "learning_rate": 1.248636139815358e-05, + "loss": 1.7451, + "step": 25315 + }, + { + "epoch": 7.770411295273174, + "grad_norm": 0.15707750618457794, + "learning_rate": 1.2483075405214346e-05, + "loss": 1.6748, + "step": 25316 + }, + { + "epoch": 7.7707182320441985, + "grad_norm": 0.1570693850517273, + "learning_rate": 1.2479789783039381e-05, + "loss": 1.6895, + "step": 25317 + }, + { + "epoch": 7.771025168815224, + "grad_norm": 0.1687897890806198, + "learning_rate": 1.2476504531661093e-05, + "loss": 1.7145, + "step": 25318 + }, + { + "epoch": 7.771332105586249, + "grad_norm": 0.16047275066375732, + "learning_rate": 1.2473219651112e-05, + "loss": 1.6675, + "step": 25319 + }, + { + "epoch": 7.7716390423572745, + "grad_norm": 0.16817785799503326, + "learning_rate": 1.2469935141424544e-05, + "loss": 1.6678, + "step": 25320 + }, + { + "epoch": 7.7719459791283, + "grad_norm": 0.1511528342962265, + "learning_rate": 1.246665100263118e-05, + "loss": 1.7054, + "step": 25321 + }, + { + "epoch": 7.772252915899324, + "grad_norm": 0.145367830991745, + "learning_rate": 1.2463367234764373e-05, + "loss": 1.7037, + "step": 25322 + }, + { + "epoch": 7.77255985267035, + "grad_norm": 0.1794048696756363, + "learning_rate": 1.2460083837856573e-05, + "loss": 1.7372, + "step": 25323 + }, + { + "epoch": 7.772866789441375, + "grad_norm": 0.21238376200199127, + "learning_rate": 1.2456800811940227e-05, + "loss": 1.7796, + "step": 25324 + }, + { + "epoch": 7.7731737262124, + "grad_norm": 0.23305723071098328, + "learning_rate": 1.2453518157047784e-05, + "loss": 
1.7124, + "step": 25325 + }, + { + "epoch": 7.773480662983426, + "grad_norm": 0.18229269981384277, + "learning_rate": 1.2450235873211673e-05, + "loss": 1.7202, + "step": 25326 + }, + { + "epoch": 7.773787599754451, + "grad_norm": 0.19145874679088593, + "learning_rate": 1.2446953960464346e-05, + "loss": 1.6701, + "step": 25327 + }, + { + "epoch": 7.774094536525475, + "grad_norm": 0.26310765743255615, + "learning_rate": 1.2443672418838215e-05, + "loss": 1.7674, + "step": 25328 + }, + { + "epoch": 7.774401473296501, + "grad_norm": 0.18370535969734192, + "learning_rate": 1.2440391248365756e-05, + "loss": 1.7027, + "step": 25329 + }, + { + "epoch": 7.774708410067526, + "grad_norm": 0.24704128503799438, + "learning_rate": 1.2437110449079348e-05, + "loss": 1.7238, + "step": 25330 + }, + { + "epoch": 7.7750153468385514, + "grad_norm": 0.194215789437294, + "learning_rate": 1.2433830021011433e-05, + "loss": 1.735, + "step": 25331 + }, + { + "epoch": 7.775322283609577, + "grad_norm": 0.24099037051200867, + "learning_rate": 1.2430549964194427e-05, + "loss": 1.7335, + "step": 25332 + }, + { + "epoch": 7.775629220380601, + "grad_norm": 0.1665026843547821, + "learning_rate": 1.242727027866073e-05, + "loss": 1.7245, + "step": 25333 + }, + { + "epoch": 7.775936157151627, + "grad_norm": 0.18005968630313873, + "learning_rate": 1.24239909644428e-05, + "loss": 1.6227, + "step": 25334 + }, + { + "epoch": 7.776243093922652, + "grad_norm": 0.2306728959083557, + "learning_rate": 1.2420712021572983e-05, + "loss": 1.7136, + "step": 25335 + }, + { + "epoch": 7.776550030693677, + "grad_norm": 0.1916062831878662, + "learning_rate": 1.2417433450083738e-05, + "loss": 1.7912, + "step": 25336 + }, + { + "epoch": 7.776856967464703, + "grad_norm": 0.1999555081129074, + "learning_rate": 1.2414155250007437e-05, + "loss": 1.7685, + "step": 25337 + }, + { + "epoch": 7.777163904235728, + "grad_norm": 0.18222710490226746, + "learning_rate": 1.2410877421376488e-05, + "loss": 1.7024, + "step": 25338 + }, + { 
+ "epoch": 7.777470841006752, + "grad_norm": 0.22534650564193726, + "learning_rate": 1.2407599964223276e-05, + "loss": 1.7263, + "step": 25339 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.3313053250312805, + "learning_rate": 1.2404322878580199e-05, + "loss": 1.6988, + "step": 25340 + }, + { + "epoch": 7.778084714548803, + "grad_norm": 0.23691575229167938, + "learning_rate": 1.2401046164479635e-05, + "loss": 1.7771, + "step": 25341 + }, + { + "epoch": 7.778391651319828, + "grad_norm": 0.2119995355606079, + "learning_rate": 1.2397769821953976e-05, + "loss": 1.709, + "step": 25342 + }, + { + "epoch": 7.778698588090853, + "grad_norm": 0.20468266308307648, + "learning_rate": 1.2394493851035588e-05, + "loss": 1.7914, + "step": 25343 + }, + { + "epoch": 7.779005524861878, + "grad_norm": 0.19825033843517303, + "learning_rate": 1.2391218251756854e-05, + "loss": 1.727, + "step": 25344 + }, + { + "epoch": 7.7793124616329035, + "grad_norm": 0.19072072207927704, + "learning_rate": 1.2387943024150134e-05, + "loss": 1.7498, + "step": 25345 + }, + { + "epoch": 7.779619398403929, + "grad_norm": 0.15986371040344238, + "learning_rate": 1.2384668168247832e-05, + "loss": 1.6807, + "step": 25346 + }, + { + "epoch": 7.779926335174954, + "grad_norm": 0.1731162816286087, + "learning_rate": 1.238139368408227e-05, + "loss": 1.7, + "step": 25347 + }, + { + "epoch": 7.7802332719459795, + "grad_norm": 0.1496593952178955, + "learning_rate": 1.237811957168583e-05, + "loss": 1.6558, + "step": 25348 + }, + { + "epoch": 7.780540208717004, + "grad_norm": 0.1982542872428894, + "learning_rate": 1.2374845831090859e-05, + "loss": 1.7888, + "step": 25349 + }, + { + "epoch": 7.780847145488029, + "grad_norm": 0.1517801433801651, + "learning_rate": 1.2371572462329706e-05, + "loss": 1.6743, + "step": 25350 + }, + { + "epoch": 7.781154082259055, + "grad_norm": 0.23794496059417725, + "learning_rate": 1.2368299465434752e-05, + "loss": 1.7332, + "step": 25351 + }, + { + "epoch": 7.78146101903008, + 
"grad_norm": 0.20220822095870972, + "learning_rate": 1.2365026840438288e-05, + "loss": 1.7444, + "step": 25352 + }, + { + "epoch": 7.781767955801105, + "grad_norm": 0.18997377157211304, + "learning_rate": 1.236175458737272e-05, + "loss": 1.771, + "step": 25353 + }, + { + "epoch": 7.78207489257213, + "grad_norm": 0.15465202927589417, + "learning_rate": 1.2358482706270325e-05, + "loss": 1.7072, + "step": 25354 + }, + { + "epoch": 7.782381829343155, + "grad_norm": 0.1759808510541916, + "learning_rate": 1.235521119716348e-05, + "loss": 1.6761, + "step": 25355 + }, + { + "epoch": 7.78268876611418, + "grad_norm": 0.17520606517791748, + "learning_rate": 1.2351940060084505e-05, + "loss": 1.6702, + "step": 25356 + }, + { + "epoch": 7.782995702885206, + "grad_norm": 0.20305509865283966, + "learning_rate": 1.2348669295065717e-05, + "loss": 1.746, + "step": 25357 + }, + { + "epoch": 7.783302639656231, + "grad_norm": 0.14459536969661713, + "learning_rate": 1.2345398902139454e-05, + "loss": 1.6907, + "step": 25358 + }, + { + "epoch": 7.783609576427256, + "grad_norm": 0.18058347702026367, + "learning_rate": 1.2342128881338027e-05, + "loss": 1.796, + "step": 25359 + }, + { + "epoch": 7.783916513198281, + "grad_norm": 0.1778976023197174, + "learning_rate": 1.2338859232693756e-05, + "loss": 1.715, + "step": 25360 + }, + { + "epoch": 7.784223449969306, + "grad_norm": 0.1644120067358017, + "learning_rate": 1.2335589956238953e-05, + "loss": 1.6786, + "step": 25361 + }, + { + "epoch": 7.7845303867403315, + "grad_norm": 0.15315432846546173, + "learning_rate": 1.2332321052005907e-05, + "loss": 1.6503, + "step": 25362 + }, + { + "epoch": 7.784837323511357, + "grad_norm": 0.19160087406635284, + "learning_rate": 1.2329052520026973e-05, + "loss": 1.7131, + "step": 25363 + }, + { + "epoch": 7.785144260282382, + "grad_norm": 0.1778041124343872, + "learning_rate": 1.2325784360334408e-05, + "loss": 1.754, + "step": 25364 + }, + { + "epoch": 7.785451197053407, + "grad_norm": 0.17478828132152557, + 
"learning_rate": 1.2322516572960519e-05, + "loss": 1.7122, + "step": 25365 + }, + { + "epoch": 7.785758133824432, + "grad_norm": 0.2239549458026886, + "learning_rate": 1.2319249157937612e-05, + "loss": 1.7589, + "step": 25366 + }, + { + "epoch": 7.786065070595457, + "grad_norm": 0.21565821766853333, + "learning_rate": 1.2315982115297953e-05, + "loss": 1.7468, + "step": 25367 + }, + { + "epoch": 7.786372007366483, + "grad_norm": 0.1859208643436432, + "learning_rate": 1.231271544507387e-05, + "loss": 1.7289, + "step": 25368 + }, + { + "epoch": 7.786678944137508, + "grad_norm": 0.14813102781772614, + "learning_rate": 1.2309449147297596e-05, + "loss": 1.6543, + "step": 25369 + }, + { + "epoch": 7.786985880908533, + "grad_norm": 0.14101989567279816, + "learning_rate": 1.2306183222001472e-05, + "loss": 1.6775, + "step": 25370 + }, + { + "epoch": 7.787292817679558, + "grad_norm": 0.2041245847940445, + "learning_rate": 1.2302917669217701e-05, + "loss": 1.6874, + "step": 25371 + }, + { + "epoch": 7.787599754450583, + "grad_norm": 0.17343124747276306, + "learning_rate": 1.2299652488978614e-05, + "loss": 1.7005, + "step": 25372 + }, + { + "epoch": 7.787906691221608, + "grad_norm": 0.20174655318260193, + "learning_rate": 1.2296387681316451e-05, + "loss": 1.8073, + "step": 25373 + }, + { + "epoch": 7.788213627992634, + "grad_norm": 0.21615192294120789, + "learning_rate": 1.2293123246263488e-05, + "loss": 1.7045, + "step": 25374 + }, + { + "epoch": 7.788520564763659, + "grad_norm": 0.18587705492973328, + "learning_rate": 1.2289859183851981e-05, + "loss": 1.7497, + "step": 25375 + }, + { + "epoch": 7.7888275015346835, + "grad_norm": 0.16649113595485687, + "learning_rate": 1.228659549411419e-05, + "loss": 1.6695, + "step": 25376 + }, + { + "epoch": 7.789134438305709, + "grad_norm": 0.16547587513923645, + "learning_rate": 1.2283332177082362e-05, + "loss": 1.7119, + "step": 25377 + }, + { + "epoch": 7.789441375076734, + "grad_norm": 0.17672663927078247, + "learning_rate": 
1.2280069232788755e-05, + "loss": 1.7458, + "step": 25378 + }, + { + "epoch": 7.7897483118477595, + "grad_norm": 0.15436655282974243, + "learning_rate": 1.22768066612656e-05, + "loss": 1.723, + "step": 25379 + }, + { + "epoch": 7.790055248618785, + "grad_norm": 0.1699141561985016, + "learning_rate": 1.2273544462545178e-05, + "loss": 1.7083, + "step": 25380 + }, + { + "epoch": 7.79036218538981, + "grad_norm": 0.18014399707317352, + "learning_rate": 1.2270282636659686e-05, + "loss": 1.7512, + "step": 25381 + }, + { + "epoch": 7.790669122160835, + "grad_norm": 0.1807268261909485, + "learning_rate": 1.2267021183641375e-05, + "loss": 1.7404, + "step": 25382 + }, + { + "epoch": 7.79097605893186, + "grad_norm": 0.16704204678535461, + "learning_rate": 1.2263760103522481e-05, + "loss": 1.6723, + "step": 25383 + }, + { + "epoch": 7.791282995702885, + "grad_norm": 0.1551518738269806, + "learning_rate": 1.2260499396335206e-05, + "loss": 1.7, + "step": 25384 + }, + { + "epoch": 7.791589932473911, + "grad_norm": 0.16270415484905243, + "learning_rate": 1.225723906211183e-05, + "loss": 1.7238, + "step": 25385 + }, + { + "epoch": 7.791896869244935, + "grad_norm": 0.19548700749874115, + "learning_rate": 1.225397910088451e-05, + "loss": 1.7192, + "step": 25386 + }, + { + "epoch": 7.79220380601596, + "grad_norm": 0.19115851819515228, + "learning_rate": 1.225071951268552e-05, + "loss": 1.753, + "step": 25387 + }, + { + "epoch": 7.792510742786986, + "grad_norm": 0.1557070016860962, + "learning_rate": 1.224746029754702e-05, + "loss": 1.6791, + "step": 25388 + }, + { + "epoch": 7.792817679558011, + "grad_norm": 0.16580358147621155, + "learning_rate": 1.2244201455501252e-05, + "loss": 1.6799, + "step": 25389 + }, + { + "epoch": 7.793124616329036, + "grad_norm": 0.18099573254585266, + "learning_rate": 1.2240942986580422e-05, + "loss": 1.7546, + "step": 25390 + }, + { + "epoch": 7.793431553100062, + "grad_norm": 0.2411479502916336, + "learning_rate": 1.223768489081672e-05, + "loss": 1.7315, 
+ "step": 25391 + }, + { + "epoch": 7.793738489871086, + "grad_norm": 0.14678087830543518, + "learning_rate": 1.2234427168242351e-05, + "loss": 1.6733, + "step": 25392 + }, + { + "epoch": 7.7940454266421115, + "grad_norm": 0.17501497268676758, + "learning_rate": 1.223116981888951e-05, + "loss": 1.7416, + "step": 25393 + }, + { + "epoch": 7.794352363413137, + "grad_norm": 0.25460878014564514, + "learning_rate": 1.2227912842790384e-05, + "loss": 1.7873, + "step": 25394 + }, + { + "epoch": 7.794659300184162, + "grad_norm": 0.1701650321483612, + "learning_rate": 1.2224656239977161e-05, + "loss": 1.686, + "step": 25395 + }, + { + "epoch": 7.7949662369551875, + "grad_norm": 0.15684448182582855, + "learning_rate": 1.2221400010482009e-05, + "loss": 1.6768, + "step": 25396 + }, + { + "epoch": 7.795273173726212, + "grad_norm": 0.19048964977264404, + "learning_rate": 1.2218144154337158e-05, + "loss": 1.744, + "step": 25397 + }, + { + "epoch": 7.795580110497237, + "grad_norm": 0.20939184725284576, + "learning_rate": 1.2214888671574737e-05, + "loss": 1.818, + "step": 25398 + }, + { + "epoch": 7.795887047268263, + "grad_norm": 0.18450765311717987, + "learning_rate": 1.2211633562226932e-05, + "loss": 1.6972, + "step": 25399 + }, + { + "epoch": 7.796193984039288, + "grad_norm": 0.20349545776844025, + "learning_rate": 1.2208378826325912e-05, + "loss": 1.7784, + "step": 25400 + }, + { + "epoch": 7.796500920810313, + "grad_norm": 0.17835615575313568, + "learning_rate": 1.2205124463903828e-05, + "loss": 1.7203, + "step": 25401 + }, + { + "epoch": 7.796807857581339, + "grad_norm": 0.1525154411792755, + "learning_rate": 1.2201870474992882e-05, + "loss": 1.7194, + "step": 25402 + }, + { + "epoch": 7.797114794352363, + "grad_norm": 0.15197598934173584, + "learning_rate": 1.2198616859625184e-05, + "loss": 1.6787, + "step": 25403 + }, + { + "epoch": 7.797421731123388, + "grad_norm": 0.1602524071931839, + "learning_rate": 1.2195363617832934e-05, + "loss": 1.6919, + "step": 25404 + }, + { + 
"epoch": 7.797728667894414, + "grad_norm": 0.15638625621795654, + "learning_rate": 1.2192110749648233e-05, + "loss": 1.6945, + "step": 25405 + }, + { + "epoch": 7.798035604665439, + "grad_norm": 0.15247012674808502, + "learning_rate": 1.2188858255103264e-05, + "loss": 1.673, + "step": 25406 + }, + { + "epoch": 7.798342541436464, + "grad_norm": 0.16753807663917542, + "learning_rate": 1.218560613423016e-05, + "loss": 1.7088, + "step": 25407 + }, + { + "epoch": 7.798649478207489, + "grad_norm": 0.17434635758399963, + "learning_rate": 1.2182354387061063e-05, + "loss": 1.7279, + "step": 25408 + }, + { + "epoch": 7.798956414978514, + "grad_norm": 0.21984371542930603, + "learning_rate": 1.2179103013628108e-05, + "loss": 1.7203, + "step": 25409 + }, + { + "epoch": 7.7992633517495396, + "grad_norm": 0.18304525315761566, + "learning_rate": 1.2175852013963418e-05, + "loss": 1.6937, + "step": 25410 + }, + { + "epoch": 7.799570288520565, + "grad_norm": 0.20372866094112396, + "learning_rate": 1.2172601388099131e-05, + "loss": 1.6911, + "step": 25411 + }, + { + "epoch": 7.79987722529159, + "grad_norm": 0.2012174129486084, + "learning_rate": 1.216935113606737e-05, + "loss": 1.7365, + "step": 25412 + }, + { + "epoch": 7.800184162062616, + "grad_norm": 0.2146923542022705, + "learning_rate": 1.2166101257900236e-05, + "loss": 1.711, + "step": 25413 + }, + { + "epoch": 7.80049109883364, + "grad_norm": 0.202762633562088, + "learning_rate": 1.2162851753629895e-05, + "loss": 1.7459, + "step": 25414 + }, + { + "epoch": 7.800798035604665, + "grad_norm": 0.19161204993724823, + "learning_rate": 1.2159602623288418e-05, + "loss": 1.687, + "step": 25415 + }, + { + "epoch": 7.801104972375691, + "grad_norm": 0.2027188539505005, + "learning_rate": 1.2156353866907927e-05, + "loss": 1.7482, + "step": 25416 + }, + { + "epoch": 7.801411909146716, + "grad_norm": 0.17790403962135315, + "learning_rate": 1.2153105484520521e-05, + "loss": 1.7047, + "step": 25417 + }, + { + "epoch": 7.8017188459177405, + 
"grad_norm": 0.18325060606002808, + "learning_rate": 1.21498574761583e-05, + "loss": 1.693, + "step": 25418 + }, + { + "epoch": 7.802025782688766, + "grad_norm": 0.14223991334438324, + "learning_rate": 1.2146609841853401e-05, + "loss": 1.7168, + "step": 25419 + }, + { + "epoch": 7.802332719459791, + "grad_norm": 0.18397340178489685, + "learning_rate": 1.2143362581637863e-05, + "loss": 1.7234, + "step": 25420 + }, + { + "epoch": 7.8026396562308165, + "grad_norm": 0.16903668642044067, + "learning_rate": 1.214011569554383e-05, + "loss": 1.6884, + "step": 25421 + }, + { + "epoch": 7.802946593001842, + "grad_norm": 0.15086103975772858, + "learning_rate": 1.2136869183603339e-05, + "loss": 1.6712, + "step": 25422 + }, + { + "epoch": 7.803253529772867, + "grad_norm": 0.1743185818195343, + "learning_rate": 1.2133623045848507e-05, + "loss": 1.7167, + "step": 25423 + }, + { + "epoch": 7.803560466543892, + "grad_norm": 0.160976842045784, + "learning_rate": 1.2130377282311411e-05, + "loss": 1.7749, + "step": 25424 + }, + { + "epoch": 7.803867403314917, + "grad_norm": 0.2554323971271515, + "learning_rate": 1.2127131893024123e-05, + "loss": 1.7156, + "step": 25425 + }, + { + "epoch": 7.804174340085942, + "grad_norm": 0.1582731157541275, + "learning_rate": 1.2123886878018714e-05, + "loss": 1.7088, + "step": 25426 + }, + { + "epoch": 7.804481276856968, + "grad_norm": 0.18008622527122498, + "learning_rate": 1.2120642237327257e-05, + "loss": 1.6928, + "step": 25427 + }, + { + "epoch": 7.804788213627993, + "grad_norm": 0.29349491000175476, + "learning_rate": 1.2117397970981815e-05, + "loss": 1.7596, + "step": 25428 + }, + { + "epoch": 7.805095150399017, + "grad_norm": 0.20927627384662628, + "learning_rate": 1.211415407901445e-05, + "loss": 1.7113, + "step": 25429 + }, + { + "epoch": 7.805402087170043, + "grad_norm": 0.2126142680644989, + "learning_rate": 1.21109105614572e-05, + "loss": 1.7125, + "step": 25430 + }, + { + "epoch": 7.805709023941068, + "grad_norm": 0.20456665754318237, + 
"learning_rate": 1.2107667418342172e-05, + "loss": 1.7619, + "step": 25431 + }, + { + "epoch": 7.806015960712093, + "grad_norm": 0.17268066108226776, + "learning_rate": 1.2104424649701373e-05, + "loss": 1.6462, + "step": 25432 + }, + { + "epoch": 7.806322897483119, + "grad_norm": 0.16213946044445038, + "learning_rate": 1.2101182255566856e-05, + "loss": 1.6787, + "step": 25433 + }, + { + "epoch": 7.806629834254144, + "grad_norm": 0.17202046513557434, + "learning_rate": 1.2097940235970673e-05, + "loss": 1.7081, + "step": 25434 + }, + { + "epoch": 7.8069367710251685, + "grad_norm": 0.2076229751110077, + "learning_rate": 1.2094698590944842e-05, + "loss": 1.6832, + "step": 25435 + }, + { + "epoch": 7.807243707796194, + "grad_norm": 0.17209482192993164, + "learning_rate": 1.2091457320521448e-05, + "loss": 1.7722, + "step": 25436 + }, + { + "epoch": 7.807550644567219, + "grad_norm": 0.2185208946466446, + "learning_rate": 1.2088216424732463e-05, + "loss": 1.7536, + "step": 25437 + }, + { + "epoch": 7.8078575813382445, + "grad_norm": 0.1812329739332199, + "learning_rate": 1.2084975903609968e-05, + "loss": 1.7275, + "step": 25438 + }, + { + "epoch": 7.80816451810927, + "grad_norm": 0.20143690705299377, + "learning_rate": 1.208173575718594e-05, + "loss": 1.7533, + "step": 25439 + }, + { + "epoch": 7.808471454880294, + "grad_norm": 0.18351776897907257, + "learning_rate": 1.2078495985492433e-05, + "loss": 1.6831, + "step": 25440 + }, + { + "epoch": 7.80877839165132, + "grad_norm": 0.15470999479293823, + "learning_rate": 1.2075256588561462e-05, + "loss": 1.6862, + "step": 25441 + }, + { + "epoch": 7.809085328422345, + "grad_norm": 0.1751607209444046, + "learning_rate": 1.2072017566425032e-05, + "loss": 1.7182, + "step": 25442 + }, + { + "epoch": 7.80939226519337, + "grad_norm": 0.16465237736701965, + "learning_rate": 1.2068778919115153e-05, + "loss": 1.7055, + "step": 25443 + }, + { + "epoch": 7.809699201964396, + "grad_norm": 0.13899528980255127, + "learning_rate": 
1.2065540646663832e-05, + "loss": 1.634, + "step": 25444 + }, + { + "epoch": 7.810006138735421, + "grad_norm": 0.21526047587394714, + "learning_rate": 1.2062302749103072e-05, + "loss": 1.759, + "step": 25445 + }, + { + "epoch": 7.810313075506445, + "grad_norm": 0.1628599315881729, + "learning_rate": 1.2059065226464872e-05, + "loss": 1.6782, + "step": 25446 + }, + { + "epoch": 7.810620012277471, + "grad_norm": 0.16853751242160797, + "learning_rate": 1.2055828078781217e-05, + "loss": 1.7123, + "step": 25447 + }, + { + "epoch": 7.810926949048496, + "grad_norm": 0.17399325966835022, + "learning_rate": 1.2052591306084138e-05, + "loss": 1.7394, + "step": 25448 + }, + { + "epoch": 7.811233885819521, + "grad_norm": 0.16147997975349426, + "learning_rate": 1.2049354908405574e-05, + "loss": 1.66, + "step": 25449 + }, + { + "epoch": 7.811540822590547, + "grad_norm": 0.1806066632270813, + "learning_rate": 1.204611888577753e-05, + "loss": 1.7193, + "step": 25450 + }, + { + "epoch": 7.811847759361571, + "grad_norm": 0.14491340517997742, + "learning_rate": 1.2042883238231984e-05, + "loss": 1.6996, + "step": 25451 + }, + { + "epoch": 7.8121546961325965, + "grad_norm": 0.24257591366767883, + "learning_rate": 1.2039647965800905e-05, + "loss": 1.734, + "step": 25452 + }, + { + "epoch": 7.812461632903622, + "grad_norm": 0.17281031608581543, + "learning_rate": 1.2036413068516295e-05, + "loss": 1.7469, + "step": 25453 + }, + { + "epoch": 7.812768569674647, + "grad_norm": 0.16350387036800385, + "learning_rate": 1.2033178546410073e-05, + "loss": 1.6755, + "step": 25454 + }, + { + "epoch": 7.8130755064456725, + "grad_norm": 0.21092571318149567, + "learning_rate": 1.202994439951427e-05, + "loss": 1.7538, + "step": 25455 + }, + { + "epoch": 7.813382443216698, + "grad_norm": 0.13705989718437195, + "learning_rate": 1.2026710627860777e-05, + "loss": 1.6563, + "step": 25456 + }, + { + "epoch": 7.813689379987722, + "grad_norm": 0.2368711531162262, + "learning_rate": 1.20234772314816e-05, + "loss": 
1.7685, + "step": 25457 + }, + { + "epoch": 7.813996316758748, + "grad_norm": 0.19303718209266663, + "learning_rate": 1.2020244210408682e-05, + "loss": 1.7286, + "step": 25458 + }, + { + "epoch": 7.814303253529773, + "grad_norm": 0.17113862931728363, + "learning_rate": 1.2017011564673974e-05, + "loss": 1.6336, + "step": 25459 + }, + { + "epoch": 7.814610190300798, + "grad_norm": 0.2151467204093933, + "learning_rate": 1.2013779294309418e-05, + "loss": 1.7585, + "step": 25460 + }, + { + "epoch": 7.814917127071823, + "grad_norm": 0.21620413661003113, + "learning_rate": 1.2010547399346961e-05, + "loss": 1.7058, + "step": 25461 + }, + { + "epoch": 7.815224063842848, + "grad_norm": 0.20134735107421875, + "learning_rate": 1.2007315879818537e-05, + "loss": 1.7833, + "step": 25462 + }, + { + "epoch": 7.815531000613873, + "grad_norm": 0.16653650999069214, + "learning_rate": 1.2004084735756088e-05, + "loss": 1.7022, + "step": 25463 + }, + { + "epoch": 7.815837937384899, + "grad_norm": 0.2135760486125946, + "learning_rate": 1.2000853967191527e-05, + "loss": 1.7502, + "step": 25464 + }, + { + "epoch": 7.816144874155924, + "grad_norm": 0.19773945212364197, + "learning_rate": 1.199762357415683e-05, + "loss": 1.7369, + "step": 25465 + }, + { + "epoch": 7.816451810926949, + "grad_norm": 0.1873825341463089, + "learning_rate": 1.1994393556683876e-05, + "loss": 1.6921, + "step": 25466 + }, + { + "epoch": 7.816758747697974, + "grad_norm": 0.19304445385932922, + "learning_rate": 1.1991163914804604e-05, + "loss": 1.6934, + "step": 25467 + }, + { + "epoch": 7.817065684468999, + "grad_norm": 0.16338905692100525, + "learning_rate": 1.1987934648550924e-05, + "loss": 1.6523, + "step": 25468 + }, + { + "epoch": 7.8173726212400245, + "grad_norm": 0.16972069442272186, + "learning_rate": 1.198470575795474e-05, + "loss": 1.6907, + "step": 25469 + }, + { + "epoch": 7.81767955801105, + "grad_norm": 0.17251834273338318, + "learning_rate": 1.1981477243048e-05, + "loss": 1.7336, + "step": 25470 + }, + 
{ + "epoch": 7.817986494782075, + "grad_norm": 0.17767611145973206, + "learning_rate": 1.197824910386256e-05, + "loss": 1.6809, + "step": 25471 + }, + { + "epoch": 7.8182934315531, + "grad_norm": 0.1854296773672104, + "learning_rate": 1.197502134043038e-05, + "loss": 1.6938, + "step": 25472 + }, + { + "epoch": 7.818600368324125, + "grad_norm": 0.15811395645141602, + "learning_rate": 1.1971793952783295e-05, + "loss": 1.6346, + "step": 25473 + }, + { + "epoch": 7.81890730509515, + "grad_norm": 0.1668241322040558, + "learning_rate": 1.196856694095324e-05, + "loss": 1.7014, + "step": 25474 + }, + { + "epoch": 7.819214241866176, + "grad_norm": 0.16705112159252167, + "learning_rate": 1.1965340304972105e-05, + "loss": 1.7509, + "step": 25475 + }, + { + "epoch": 7.819521178637201, + "grad_norm": 0.1737189143896103, + "learning_rate": 1.1962114044871764e-05, + "loss": 1.6934, + "step": 25476 + }, + { + "epoch": 7.819828115408226, + "grad_norm": 0.21887148916721344, + "learning_rate": 1.1958888160684112e-05, + "loss": 1.7163, + "step": 25477 + }, + { + "epoch": 7.820135052179251, + "grad_norm": 0.19267810881137848, + "learning_rate": 1.1955662652441018e-05, + "loss": 1.6941, + "step": 25478 + }, + { + "epoch": 7.820441988950276, + "grad_norm": 0.19797572493553162, + "learning_rate": 1.195243752017437e-05, + "loss": 1.7067, + "step": 25479 + }, + { + "epoch": 7.820748925721301, + "grad_norm": 0.20177066326141357, + "learning_rate": 1.1949212763916035e-05, + "loss": 1.7186, + "step": 25480 + }, + { + "epoch": 7.821055862492327, + "grad_norm": 0.1789240539073944, + "learning_rate": 1.1945988383697876e-05, + "loss": 1.7533, + "step": 25481 + }, + { + "epoch": 7.821362799263352, + "grad_norm": 0.2210909128189087, + "learning_rate": 1.1942764379551769e-05, + "loss": 1.7255, + "step": 25482 + }, + { + "epoch": 7.8216697360343765, + "grad_norm": 0.17705149948596954, + "learning_rate": 1.193954075150957e-05, + "loss": 1.6797, + "step": 25483 + }, + { + "epoch": 7.821976672805402, + 
"grad_norm": 0.17962488532066345, + "learning_rate": 1.1936317499603134e-05, + "loss": 1.7134, + "step": 25484 + }, + { + "epoch": 7.822283609576427, + "grad_norm": 0.2144375741481781, + "learning_rate": 1.193309462386432e-05, + "loss": 1.6837, + "step": 25485 + }, + { + "epoch": 7.8225905463474525, + "grad_norm": 0.19018805027008057, + "learning_rate": 1.1929872124324976e-05, + "loss": 1.7377, + "step": 25486 + }, + { + "epoch": 7.822897483118478, + "grad_norm": 0.2281246781349182, + "learning_rate": 1.1926650001016953e-05, + "loss": 1.755, + "step": 25487 + }, + { + "epoch": 7.823204419889503, + "grad_norm": 0.17724375426769257, + "learning_rate": 1.1923428253972069e-05, + "loss": 1.7018, + "step": 25488 + }, + { + "epoch": 7.823511356660528, + "grad_norm": 0.19313837587833405, + "learning_rate": 1.1920206883222218e-05, + "loss": 1.705, + "step": 25489 + }, + { + "epoch": 7.823818293431553, + "grad_norm": 0.1883455514907837, + "learning_rate": 1.191698588879917e-05, + "loss": 1.66, + "step": 25490 + }, + { + "epoch": 7.824125230202578, + "grad_norm": 0.20110155642032623, + "learning_rate": 1.1913765270734805e-05, + "loss": 1.7456, + "step": 25491 + }, + { + "epoch": 7.824432166973604, + "grad_norm": 0.23234841227531433, + "learning_rate": 1.1910545029060938e-05, + "loss": 1.6987, + "step": 25492 + }, + { + "epoch": 7.824739103744628, + "grad_norm": 0.208989679813385, + "learning_rate": 1.1907325163809386e-05, + "loss": 1.7753, + "step": 25493 + }, + { + "epoch": 7.8250460405156534, + "grad_norm": 0.19063059985637665, + "learning_rate": 1.1904105675011972e-05, + "loss": 1.6664, + "step": 25494 + }, + { + "epoch": 7.825352977286679, + "grad_norm": 0.16878041625022888, + "learning_rate": 1.1900886562700519e-05, + "loss": 1.6886, + "step": 25495 + }, + { + "epoch": 7.825659914057704, + "grad_norm": 0.19139298796653748, + "learning_rate": 1.1897667826906834e-05, + "loss": 1.7195, + "step": 25496 + }, + { + "epoch": 7.8259668508287294, + "grad_norm": 0.255795419216156, 
+ "learning_rate": 1.1894449467662728e-05, + "loss": 1.7835, + "step": 25497 + }, + { + "epoch": 7.826273787599755, + "grad_norm": 0.17967084050178528, + "learning_rate": 1.1891231485000004e-05, + "loss": 1.6959, + "step": 25498 + }, + { + "epoch": 7.82658072437078, + "grad_norm": 0.23582984507083893, + "learning_rate": 1.1888013878950471e-05, + "loss": 1.7252, + "step": 25499 + }, + { + "epoch": 7.826887661141805, + "grad_norm": 0.189914271235466, + "learning_rate": 1.188479664954592e-05, + "loss": 1.7216, + "step": 25500 + }, + { + "epoch": 7.82719459791283, + "grad_norm": 0.19840605556964874, + "learning_rate": 1.1881579796818148e-05, + "loss": 1.714, + "step": 25501 + }, + { + "epoch": 7.827501534683855, + "grad_norm": 0.25255537033081055, + "learning_rate": 1.1878363320798946e-05, + "loss": 1.7008, + "step": 25502 + }, + { + "epoch": 7.827808471454881, + "grad_norm": 0.1863456666469574, + "learning_rate": 1.1875147221520105e-05, + "loss": 1.7804, + "step": 25503 + }, + { + "epoch": 7.828115408225905, + "grad_norm": 0.2700684368610382, + "learning_rate": 1.1871931499013405e-05, + "loss": 1.6756, + "step": 25504 + }, + { + "epoch": 7.82842234499693, + "grad_norm": 0.19838537275791168, + "learning_rate": 1.1868716153310604e-05, + "loss": 1.6828, + "step": 25505 + }, + { + "epoch": 7.828729281767956, + "grad_norm": 0.1896767020225525, + "learning_rate": 1.1865501184443533e-05, + "loss": 1.7014, + "step": 25506 + }, + { + "epoch": 7.829036218538981, + "grad_norm": 0.2330249398946762, + "learning_rate": 1.1862286592443905e-05, + "loss": 1.7509, + "step": 25507 + }, + { + "epoch": 7.829343155310006, + "grad_norm": 0.17078560590744019, + "learning_rate": 1.1859072377343539e-05, + "loss": 1.6742, + "step": 25508 + }, + { + "epoch": 7.829650092081032, + "grad_norm": 0.2834900915622711, + "learning_rate": 1.1855858539174146e-05, + "loss": 1.7676, + "step": 25509 + }, + { + "epoch": 7.829957028852056, + "grad_norm": 0.18936461210250854, + "learning_rate": 
1.1852645077967533e-05, + "loss": 1.7374, + "step": 25510 + }, + { + "epoch": 7.8302639656230815, + "grad_norm": 0.2720448970794678, + "learning_rate": 1.1849431993755439e-05, + "loss": 1.7001, + "step": 25511 + }, + { + "epoch": 7.830570902394107, + "grad_norm": 0.18198262155056, + "learning_rate": 1.184621928656962e-05, + "loss": 1.6679, + "step": 25512 + }, + { + "epoch": 7.830877839165132, + "grad_norm": 0.16957701742649078, + "learning_rate": 1.1843006956441821e-05, + "loss": 1.7064, + "step": 25513 + }, + { + "epoch": 7.8311847759361575, + "grad_norm": 0.18632464110851288, + "learning_rate": 1.1839795003403798e-05, + "loss": 1.6857, + "step": 25514 + }, + { + "epoch": 7.831491712707182, + "grad_norm": 0.15639352798461914, + "learning_rate": 1.183658342748728e-05, + "loss": 1.695, + "step": 25515 + }, + { + "epoch": 7.831798649478207, + "grad_norm": 0.17000986635684967, + "learning_rate": 1.1833372228724016e-05, + "loss": 1.696, + "step": 25516 + }, + { + "epoch": 7.832105586249233, + "grad_norm": 0.23334810137748718, + "learning_rate": 1.1830161407145735e-05, + "loss": 1.7574, + "step": 25517 + }, + { + "epoch": 7.832412523020258, + "grad_norm": 0.16260294616222382, + "learning_rate": 1.1826950962784177e-05, + "loss": 1.667, + "step": 25518 + }, + { + "epoch": 7.832719459791283, + "grad_norm": 0.18244150280952454, + "learning_rate": 1.1823740895671059e-05, + "loss": 1.6836, + "step": 25519 + }, + { + "epoch": 7.833026396562309, + "grad_norm": 0.18404243886470795, + "learning_rate": 1.182053120583811e-05, + "loss": 1.6922, + "step": 25520 + }, + { + "epoch": 7.833333333333333, + "grad_norm": 0.22713635861873627, + "learning_rate": 1.1817321893317052e-05, + "loss": 1.8055, + "step": 25521 + }, + { + "epoch": 7.833640270104358, + "grad_norm": 0.14314736425876617, + "learning_rate": 1.1814112958139577e-05, + "loss": 1.6624, + "step": 25522 + }, + { + "epoch": 7.833947206875384, + "grad_norm": 0.1947709321975708, + "learning_rate": 1.1810904400337458e-05, + 
"loss": 1.8108, + "step": 25523 + }, + { + "epoch": 7.834254143646409, + "grad_norm": 0.1811491698026657, + "learning_rate": 1.1807696219942326e-05, + "loss": 1.7258, + "step": 25524 + }, + { + "epoch": 7.834561080417434, + "grad_norm": 0.16776522994041443, + "learning_rate": 1.1804488416985966e-05, + "loss": 1.6834, + "step": 25525 + }, + { + "epoch": 7.834868017188459, + "grad_norm": 0.1590484231710434, + "learning_rate": 1.1801280991500002e-05, + "loss": 1.6797, + "step": 25526 + }, + { + "epoch": 7.835174953959484, + "grad_norm": 0.1564435064792633, + "learning_rate": 1.179807394351618e-05, + "loss": 1.7035, + "step": 25527 + }, + { + "epoch": 7.8354818907305095, + "grad_norm": 0.17740637063980103, + "learning_rate": 1.1794867273066184e-05, + "loss": 1.6844, + "step": 25528 + }, + { + "epoch": 7.835788827501535, + "grad_norm": 0.17152990400791168, + "learning_rate": 1.1791660980181707e-05, + "loss": 1.6745, + "step": 25529 + }, + { + "epoch": 7.83609576427256, + "grad_norm": 0.17763324081897736, + "learning_rate": 1.1788455064894427e-05, + "loss": 1.6941, + "step": 25530 + }, + { + "epoch": 7.8364027010435855, + "grad_norm": 0.16168560087680817, + "learning_rate": 1.178524952723603e-05, + "loss": 1.6955, + "step": 25531 + }, + { + "epoch": 7.83670963781461, + "grad_norm": 0.1819266527891159, + "learning_rate": 1.1782044367238199e-05, + "loss": 1.6838, + "step": 25532 + }, + { + "epoch": 7.837016574585635, + "grad_norm": 0.16239593923091888, + "learning_rate": 1.1778839584932605e-05, + "loss": 1.7045, + "step": 25533 + }, + { + "epoch": 7.837323511356661, + "grad_norm": 0.18346372246742249, + "learning_rate": 1.177563518035092e-05, + "loss": 1.7418, + "step": 25534 + }, + { + "epoch": 7.837630448127686, + "grad_norm": 0.18437781929969788, + "learning_rate": 1.177243115352481e-05, + "loss": 1.7138, + "step": 25535 + }, + { + "epoch": 7.83793738489871, + "grad_norm": 0.16199420392513275, + "learning_rate": 1.1769227504485942e-05, + "loss": 1.7115, + "step": 25536 
+ }, + { + "epoch": 7.838244321669736, + "grad_norm": 0.174173504114151, + "learning_rate": 1.1766024233265977e-05, + "loss": 1.7115, + "step": 25537 + }, + { + "epoch": 7.838551258440761, + "grad_norm": 0.1924828737974167, + "learning_rate": 1.1762821339896567e-05, + "loss": 1.7343, + "step": 25538 + }, + { + "epoch": 7.838858195211786, + "grad_norm": 0.20509763062000275, + "learning_rate": 1.1759618824409357e-05, + "loss": 1.7296, + "step": 25539 + }, + { + "epoch": 7.839165131982812, + "grad_norm": 0.1762499213218689, + "learning_rate": 1.1756416686836035e-05, + "loss": 1.6721, + "step": 25540 + }, + { + "epoch": 7.839472068753837, + "grad_norm": 0.17260326445102692, + "learning_rate": 1.175321492720819e-05, + "loss": 1.7238, + "step": 25541 + }, + { + "epoch": 7.8397790055248615, + "grad_norm": 0.21378587186336517, + "learning_rate": 1.175001354555752e-05, + "loss": 1.7442, + "step": 25542 + }, + { + "epoch": 7.840085942295887, + "grad_norm": 0.20900048315525055, + "learning_rate": 1.1746812541915608e-05, + "loss": 1.7426, + "step": 25543 + }, + { + "epoch": 7.840392879066912, + "grad_norm": 0.2082734853029251, + "learning_rate": 1.1743611916314129e-05, + "loss": 1.7209, + "step": 25544 + }, + { + "epoch": 7.8406998158379375, + "grad_norm": 0.1696191281080246, + "learning_rate": 1.1740411668784701e-05, + "loss": 1.7039, + "step": 25545 + }, + { + "epoch": 7.841006752608963, + "grad_norm": 0.18812915682792664, + "learning_rate": 1.173721179935895e-05, + "loss": 1.6873, + "step": 25546 + }, + { + "epoch": 7.841313689379987, + "grad_norm": 0.19983457028865814, + "learning_rate": 1.1734012308068493e-05, + "loss": 1.701, + "step": 25547 + }, + { + "epoch": 7.841620626151013, + "grad_norm": 0.18811485171318054, + "learning_rate": 1.1730813194944962e-05, + "loss": 1.7466, + "step": 25548 + }, + { + "epoch": 7.841927562922038, + "grad_norm": 0.16648226976394653, + "learning_rate": 1.172761446001996e-05, + "loss": 1.7449, + "step": 25549 + }, + { + "epoch": 
7.842234499693063, + "grad_norm": 0.17902494966983795, + "learning_rate": 1.1724416103325104e-05, + "loss": 1.7395, + "step": 25550 + }, + { + "epoch": 7.842541436464089, + "grad_norm": 0.2420952469110489, + "learning_rate": 1.1721218124892003e-05, + "loss": 1.728, + "step": 25551 + }, + { + "epoch": 7.842848373235114, + "grad_norm": 0.16240666806697845, + "learning_rate": 1.1718020524752266e-05, + "loss": 1.6368, + "step": 25552 + }, + { + "epoch": 7.843155310006138, + "grad_norm": 0.17968396842479706, + "learning_rate": 1.1714823302937483e-05, + "loss": 1.729, + "step": 25553 + }, + { + "epoch": 7.843462246777164, + "grad_norm": 0.17617417871952057, + "learning_rate": 1.1711626459479252e-05, + "loss": 1.6975, + "step": 25554 + }, + { + "epoch": 7.843769183548189, + "grad_norm": 0.1679859161376953, + "learning_rate": 1.1708429994409176e-05, + "loss": 1.6955, + "step": 25555 + }, + { + "epoch": 7.844076120319214, + "grad_norm": 0.1653962880373001, + "learning_rate": 1.1705233907758823e-05, + "loss": 1.7107, + "step": 25556 + }, + { + "epoch": 7.84438305709024, + "grad_norm": 0.190699502825737, + "learning_rate": 1.1702038199559817e-05, + "loss": 1.75, + "step": 25557 + }, + { + "epoch": 7.844689993861264, + "grad_norm": 0.17185768485069275, + "learning_rate": 1.1698842869843696e-05, + "loss": 1.7087, + "step": 25558 + }, + { + "epoch": 7.8449969306322895, + "grad_norm": 0.17880931496620178, + "learning_rate": 1.1695647918642084e-05, + "loss": 1.7082, + "step": 25559 + }, + { + "epoch": 7.845303867403315, + "grad_norm": 0.15360671281814575, + "learning_rate": 1.1692453345986498e-05, + "loss": 1.7028, + "step": 25560 + }, + { + "epoch": 7.84561080417434, + "grad_norm": 0.16576705873012543, + "learning_rate": 1.168925915190856e-05, + "loss": 1.7147, + "step": 25561 + }, + { + "epoch": 7.8459177409453655, + "grad_norm": 0.14623773097991943, + "learning_rate": 1.1686065336439817e-05, + "loss": 1.682, + "step": 25562 + }, + { + "epoch": 7.846224677716391, + "grad_norm": 
0.16677425801753998, + "learning_rate": 1.168287189961183e-05, + "loss": 1.7089, + "step": 25563 + }, + { + "epoch": 7.846531614487415, + "grad_norm": 0.160381019115448, + "learning_rate": 1.1679678841456164e-05, + "loss": 1.6929, + "step": 25564 + }, + { + "epoch": 7.846838551258441, + "grad_norm": 0.1775302290916443, + "learning_rate": 1.1676486162004374e-05, + "loss": 1.6947, + "step": 25565 + }, + { + "epoch": 7.847145488029466, + "grad_norm": 0.1681419014930725, + "learning_rate": 1.1673293861288003e-05, + "loss": 1.7173, + "step": 25566 + }, + { + "epoch": 7.847452424800491, + "grad_norm": 0.18374401330947876, + "learning_rate": 1.1670101939338613e-05, + "loss": 1.7175, + "step": 25567 + }, + { + "epoch": 7.847759361571516, + "grad_norm": 0.19383086264133453, + "learning_rate": 1.1666910396187736e-05, + "loss": 1.6962, + "step": 25568 + }, + { + "epoch": 7.848066298342541, + "grad_norm": 0.16849574446678162, + "learning_rate": 1.1663719231866921e-05, + "loss": 1.6717, + "step": 25569 + }, + { + "epoch": 7.848373235113566, + "grad_norm": 0.2510664165019989, + "learning_rate": 1.1660528446407703e-05, + "loss": 1.7983, + "step": 25570 + }, + { + "epoch": 7.848680171884592, + "grad_norm": 0.21037714183330536, + "learning_rate": 1.1657338039841614e-05, + "loss": 1.7287, + "step": 25571 + }, + { + "epoch": 7.848987108655617, + "grad_norm": 0.15170596539974213, + "learning_rate": 1.1654148012200184e-05, + "loss": 1.7076, + "step": 25572 + }, + { + "epoch": 7.849294045426642, + "grad_norm": 0.2093864530324936, + "learning_rate": 1.1650958363514919e-05, + "loss": 1.7469, + "step": 25573 + }, + { + "epoch": 7.849600982197668, + "grad_norm": 0.15684813261032104, + "learning_rate": 1.1647769093817395e-05, + "loss": 1.6731, + "step": 25574 + }, + { + "epoch": 7.849907918968692, + "grad_norm": 0.1600468009710312, + "learning_rate": 1.1644580203139066e-05, + "loss": 1.6394, + "step": 25575 + }, + { + "epoch": 7.850214855739718, + "grad_norm": 0.1863955557346344, + 
"learning_rate": 1.1641391691511505e-05, + "loss": 1.7025, + "step": 25576 + }, + { + "epoch": 7.850521792510743, + "grad_norm": 0.189132422208786, + "learning_rate": 1.1638203558966166e-05, + "loss": 1.7095, + "step": 25577 + }, + { + "epoch": 7.850828729281768, + "grad_norm": 0.166460782289505, + "learning_rate": 1.1635015805534593e-05, + "loss": 1.6756, + "step": 25578 + }, + { + "epoch": 7.851135666052793, + "grad_norm": 0.15910424292087555, + "learning_rate": 1.1631828431248288e-05, + "loss": 1.6664, + "step": 25579 + }, + { + "epoch": 7.851442602823818, + "grad_norm": 0.14848501980304718, + "learning_rate": 1.1628641436138738e-05, + "loss": 1.6434, + "step": 25580 + }, + { + "epoch": 7.851749539594843, + "grad_norm": 0.1700928956270218, + "learning_rate": 1.1625454820237446e-05, + "loss": 1.7039, + "step": 25581 + }, + { + "epoch": 7.852056476365869, + "grad_norm": 0.17468976974487305, + "learning_rate": 1.1622268583575902e-05, + "loss": 1.7073, + "step": 25582 + }, + { + "epoch": 7.852363413136894, + "grad_norm": 0.18980912864208221, + "learning_rate": 1.1619082726185587e-05, + "loss": 1.6939, + "step": 25583 + }, + { + "epoch": 7.852670349907919, + "grad_norm": 0.1658385694026947, + "learning_rate": 1.1615897248098e-05, + "loss": 1.6892, + "step": 25584 + }, + { + "epoch": 7.852977286678944, + "grad_norm": 0.18137763440608978, + "learning_rate": 1.1612712149344612e-05, + "loss": 1.6608, + "step": 25585 + }, + { + "epoch": 7.853284223449969, + "grad_norm": 0.1642989218235016, + "learning_rate": 1.16095274299569e-05, + "loss": 1.6527, + "step": 25586 + }, + { + "epoch": 7.8535911602209945, + "grad_norm": 0.17476631700992584, + "learning_rate": 1.1606343089966343e-05, + "loss": 1.6622, + "step": 25587 + }, + { + "epoch": 7.85389809699202, + "grad_norm": 0.14995649456977844, + "learning_rate": 1.16031591294044e-05, + "loss": 1.6382, + "step": 25588 + }, + { + "epoch": 7.854205033763045, + "grad_norm": 0.16073103249073029, + "learning_rate": 
1.1599975548302549e-05, + "loss": 1.6888, + "step": 25589 + }, + { + "epoch": 7.85451197053407, + "grad_norm": 0.1630357801914215, + "learning_rate": 1.159679234669223e-05, + "loss": 1.6717, + "step": 25590 + }, + { + "epoch": 7.854818907305095, + "grad_norm": 0.1537420153617859, + "learning_rate": 1.1593609524604948e-05, + "loss": 1.6836, + "step": 25591 + }, + { + "epoch": 7.85512584407612, + "grad_norm": 0.16389401257038116, + "learning_rate": 1.1590427082072103e-05, + "loss": 1.6941, + "step": 25592 + }, + { + "epoch": 7.855432780847146, + "grad_norm": 0.24554979801177979, + "learning_rate": 1.1587245019125192e-05, + "loss": 1.8018, + "step": 25593 + }, + { + "epoch": 7.855739717618171, + "grad_norm": 0.15020978450775146, + "learning_rate": 1.1584063335795614e-05, + "loss": 1.6815, + "step": 25594 + }, + { + "epoch": 7.856046654389196, + "grad_norm": 0.1830887496471405, + "learning_rate": 1.1580882032114853e-05, + "loss": 1.7134, + "step": 25595 + }, + { + "epoch": 7.856353591160221, + "grad_norm": 0.2381841540336609, + "learning_rate": 1.157770110811433e-05, + "loss": 1.7505, + "step": 25596 + }, + { + "epoch": 7.856660527931246, + "grad_norm": 0.210253044962883, + "learning_rate": 1.1574520563825491e-05, + "loss": 1.8048, + "step": 25597 + }, + { + "epoch": 7.856967464702271, + "grad_norm": 0.15428896248340607, + "learning_rate": 1.1571340399279756e-05, + "loss": 1.6624, + "step": 25598 + }, + { + "epoch": 7.857274401473297, + "grad_norm": 0.2932582199573517, + "learning_rate": 1.1568160614508567e-05, + "loss": 1.7192, + "step": 25599 + }, + { + "epoch": 7.857581338244322, + "grad_norm": 0.19450223445892334, + "learning_rate": 1.156498120954333e-05, + "loss": 1.753, + "step": 25600 + }, + { + "epoch": 7.8578882750153465, + "grad_norm": 0.16950540244579315, + "learning_rate": 1.1561802184415482e-05, + "loss": 1.7107, + "step": 25601 + }, + { + "epoch": 7.858195211786372, + "grad_norm": 0.18616287410259247, + "learning_rate": 1.1558623539156433e-05, + "loss": 
1.6747, + "step": 25602 + }, + { + "epoch": 7.858502148557397, + "grad_norm": 0.20991890132427216, + "learning_rate": 1.1555445273797599e-05, + "loss": 1.6635, + "step": 25603 + }, + { + "epoch": 7.8588090853284225, + "grad_norm": 0.18592311441898346, + "learning_rate": 1.1552267388370386e-05, + "loss": 1.7327, + "step": 25604 + }, + { + "epoch": 7.859116022099448, + "grad_norm": 0.16478584706783295, + "learning_rate": 1.1549089882906206e-05, + "loss": 1.6523, + "step": 25605 + }, + { + "epoch": 7.859422958870473, + "grad_norm": 0.17281852662563324, + "learning_rate": 1.154591275743645e-05, + "loss": 1.7282, + "step": 25606 + }, + { + "epoch": 7.859729895641498, + "grad_norm": 0.17098689079284668, + "learning_rate": 1.1542736011992512e-05, + "loss": 1.7533, + "step": 25607 + }, + { + "epoch": 7.860036832412523, + "grad_norm": 0.1766287386417389, + "learning_rate": 1.1539559646605824e-05, + "loss": 1.6338, + "step": 25608 + }, + { + "epoch": 7.860343769183548, + "grad_norm": 0.15519756078720093, + "learning_rate": 1.1536383661307726e-05, + "loss": 1.6908, + "step": 25609 + }, + { + "epoch": 7.860650705954574, + "grad_norm": 0.18422503769397736, + "learning_rate": 1.1533208056129651e-05, + "loss": 1.6983, + "step": 25610 + }, + { + "epoch": 7.860957642725598, + "grad_norm": 0.1900123953819275, + "learning_rate": 1.1530032831102933e-05, + "loss": 1.7082, + "step": 25611 + }, + { + "epoch": 7.861264579496623, + "grad_norm": 0.15542784333229065, + "learning_rate": 1.1526857986259e-05, + "loss": 1.6979, + "step": 25612 + }, + { + "epoch": 7.861571516267649, + "grad_norm": 0.17173884809017181, + "learning_rate": 1.1523683521629197e-05, + "loss": 1.7329, + "step": 25613 + }, + { + "epoch": 7.861878453038674, + "grad_norm": 0.2399773746728897, + "learning_rate": 1.1520509437244908e-05, + "loss": 1.7224, + "step": 25614 + }, + { + "epoch": 7.862185389809699, + "grad_norm": 0.14101925492286682, + "learning_rate": 1.1517335733137502e-05, + "loss": 1.6676, + "step": 25615 + }, 
+ { + "epoch": 7.862492326580725, + "grad_norm": 0.18625333905220032, + "learning_rate": 1.1514162409338336e-05, + "loss": 1.7269, + "step": 25616 + }, + { + "epoch": 7.862799263351749, + "grad_norm": 0.18385125696659088, + "learning_rate": 1.1510989465878774e-05, + "loss": 1.7197, + "step": 25617 + }, + { + "epoch": 7.8631062001227745, + "grad_norm": 0.16189569234848022, + "learning_rate": 1.1507816902790176e-05, + "loss": 1.662, + "step": 25618 + }, + { + "epoch": 7.8634131368938, + "grad_norm": 0.18526791036128998, + "learning_rate": 1.1504644720103885e-05, + "loss": 1.7521, + "step": 25619 + }, + { + "epoch": 7.863720073664825, + "grad_norm": 0.16588367521762848, + "learning_rate": 1.1501472917851263e-05, + "loss": 1.7238, + "step": 25620 + }, + { + "epoch": 7.8640270104358505, + "grad_norm": 0.15427199006080627, + "learning_rate": 1.1498301496063652e-05, + "loss": 1.6566, + "step": 25621 + }, + { + "epoch": 7.864333947206875, + "grad_norm": 0.1694655865430832, + "learning_rate": 1.149513045477239e-05, + "loss": 1.7446, + "step": 25622 + }, + { + "epoch": 7.8646408839779, + "grad_norm": 0.18305882811546326, + "learning_rate": 1.1491959794008823e-05, + "loss": 1.7093, + "step": 25623 + }, + { + "epoch": 7.864947820748926, + "grad_norm": 0.15975148975849152, + "learning_rate": 1.148878951380426e-05, + "loss": 1.6911, + "step": 25624 + }, + { + "epoch": 7.865254757519951, + "grad_norm": 0.18298782408237457, + "learning_rate": 1.148561961419008e-05, + "loss": 1.7188, + "step": 25625 + }, + { + "epoch": 7.865561694290976, + "grad_norm": 0.16258102655410767, + "learning_rate": 1.148245009519755e-05, + "loss": 1.6901, + "step": 25626 + }, + { + "epoch": 7.865868631062002, + "grad_norm": 0.19591568410396576, + "learning_rate": 1.1479280956858057e-05, + "loss": 1.7521, + "step": 25627 + }, + { + "epoch": 7.866175567833026, + "grad_norm": 0.15821373462677002, + "learning_rate": 1.1476112199202853e-05, + "loss": 1.6503, + "step": 25628 + }, + { + "epoch": 
7.866482504604051, + "grad_norm": 0.1531122773885727, + "learning_rate": 1.147294382226331e-05, + "loss": 1.6802, + "step": 25629 + }, + { + "epoch": 7.866789441375077, + "grad_norm": 0.2105177342891693, + "learning_rate": 1.1469775826070711e-05, + "loss": 1.7705, + "step": 25630 + }, + { + "epoch": 7.867096378146102, + "grad_norm": 0.22782234847545624, + "learning_rate": 1.1466608210656377e-05, + "loss": 1.6813, + "step": 25631 + }, + { + "epoch": 7.867403314917127, + "grad_norm": 0.1824047863483429, + "learning_rate": 1.1463440976051598e-05, + "loss": 1.7149, + "step": 25632 + }, + { + "epoch": 7.867710251688152, + "grad_norm": 0.19195812940597534, + "learning_rate": 1.1460274122287685e-05, + "loss": 1.6912, + "step": 25633 + }, + { + "epoch": 7.868017188459177, + "grad_norm": 0.22274719178676605, + "learning_rate": 1.1457107649395937e-05, + "loss": 1.8499, + "step": 25634 + }, + { + "epoch": 7.8683241252302025, + "grad_norm": 0.21217535436153412, + "learning_rate": 1.1453941557407638e-05, + "loss": 1.7345, + "step": 25635 + }, + { + "epoch": 7.868631062001228, + "grad_norm": 0.20042434334754944, + "learning_rate": 1.1450775846354078e-05, + "loss": 1.6902, + "step": 25636 + }, + { + "epoch": 7.868937998772253, + "grad_norm": 0.17045147716999054, + "learning_rate": 1.1447610516266548e-05, + "loss": 1.6641, + "step": 25637 + }, + { + "epoch": 7.8692449355432785, + "grad_norm": 0.18817269802093506, + "learning_rate": 1.1444445567176326e-05, + "loss": 1.7063, + "step": 25638 + }, + { + "epoch": 7.869551872314303, + "grad_norm": 0.1746743619441986, + "learning_rate": 1.1441280999114694e-05, + "loss": 1.6838, + "step": 25639 + }, + { + "epoch": 7.869858809085328, + "grad_norm": 0.1734321415424347, + "learning_rate": 1.1438116812112925e-05, + "loss": 1.6939, + "step": 25640 + }, + { + "epoch": 7.870165745856354, + "grad_norm": 0.1745334416627884, + "learning_rate": 1.1434953006202281e-05, + "loss": 1.71, + "step": 25641 + }, + { + "epoch": 7.870472682627379, + 
"grad_norm": 0.20883594453334808, + "learning_rate": 1.1431789581414043e-05, + "loss": 1.6941, + "step": 25642 + }, + { + "epoch": 7.870779619398404, + "grad_norm": 0.1664251685142517, + "learning_rate": 1.1428626537779447e-05, + "loss": 1.6995, + "step": 25643 + }, + { + "epoch": 7.871086556169429, + "grad_norm": 0.16561046242713928, + "learning_rate": 1.1425463875329795e-05, + "loss": 1.7093, + "step": 25644 + }, + { + "epoch": 7.871393492940454, + "grad_norm": 0.21409009397029877, + "learning_rate": 1.1422301594096297e-05, + "loss": 1.6919, + "step": 25645 + }, + { + "epoch": 7.871700429711479, + "grad_norm": 0.19574479758739471, + "learning_rate": 1.1419139694110236e-05, + "loss": 1.777, + "step": 25646 + }, + { + "epoch": 7.872007366482505, + "grad_norm": 0.15032227337360382, + "learning_rate": 1.1415978175402853e-05, + "loss": 1.6759, + "step": 25647 + }, + { + "epoch": 7.87231430325353, + "grad_norm": 0.18372420966625214, + "learning_rate": 1.1412817038005386e-05, + "loss": 1.7304, + "step": 25648 + }, + { + "epoch": 7.872621240024555, + "grad_norm": 0.16073383390903473, + "learning_rate": 1.1409656281949077e-05, + "loss": 1.6784, + "step": 25649 + }, + { + "epoch": 7.87292817679558, + "grad_norm": 0.15698374807834625, + "learning_rate": 1.1406495907265163e-05, + "loss": 1.6877, + "step": 25650 + }, + { + "epoch": 7.873235113566605, + "grad_norm": 0.18749327957630157, + "learning_rate": 1.140333591398488e-05, + "loss": 1.708, + "step": 25651 + }, + { + "epoch": 7.8735420503376305, + "grad_norm": 0.15412451326847076, + "learning_rate": 1.1400176302139448e-05, + "loss": 1.6661, + "step": 25652 + }, + { + "epoch": 7.873848987108656, + "grad_norm": 0.22467148303985596, + "learning_rate": 1.1397017071760102e-05, + "loss": 1.8204, + "step": 25653 + }, + { + "epoch": 7.87415592387968, + "grad_norm": 0.14625288546085358, + "learning_rate": 1.1393858222878063e-05, + "loss": 1.7008, + "step": 25654 + }, + { + "epoch": 7.874462860650706, + "grad_norm": 
0.14440159499645233, + "learning_rate": 1.1390699755524537e-05, + "loss": 1.652, + "step": 25655 + }, + { + "epoch": 7.874769797421731, + "grad_norm": 0.14738808572292328, + "learning_rate": 1.138754166973075e-05, + "loss": 1.6305, + "step": 25656 + }, + { + "epoch": 7.875076734192756, + "grad_norm": 0.17714212834835052, + "learning_rate": 1.1384383965527906e-05, + "loss": 1.7011, + "step": 25657 + }, + { + "epoch": 7.875383670963782, + "grad_norm": 0.17601121962070465, + "learning_rate": 1.1381226642947213e-05, + "loss": 1.7425, + "step": 25658 + }, + { + "epoch": 7.875690607734807, + "grad_norm": 0.1893182396888733, + "learning_rate": 1.1378069702019877e-05, + "loss": 1.7215, + "step": 25659 + }, + { + "epoch": 7.8759975445058314, + "grad_norm": 0.20073552429676056, + "learning_rate": 1.1374913142777077e-05, + "loss": 1.7025, + "step": 25660 + }, + { + "epoch": 7.876304481276857, + "grad_norm": 0.17025165259838104, + "learning_rate": 1.1371756965250052e-05, + "loss": 1.7046, + "step": 25661 + }, + { + "epoch": 7.876611418047882, + "grad_norm": 0.17612501978874207, + "learning_rate": 1.1368601169469933e-05, + "loss": 1.7452, + "step": 25662 + }, + { + "epoch": 7.8769183548189075, + "grad_norm": 0.2542072534561157, + "learning_rate": 1.1365445755467974e-05, + "loss": 1.765, + "step": 25663 + }, + { + "epoch": 7.877225291589933, + "grad_norm": 0.25291866064071655, + "learning_rate": 1.1362290723275293e-05, + "loss": 1.7477, + "step": 25664 + }, + { + "epoch": 7.877532228360957, + "grad_norm": 0.1848495602607727, + "learning_rate": 1.1359136072923121e-05, + "loss": 1.7278, + "step": 25665 + }, + { + "epoch": 7.877839165131983, + "grad_norm": 0.18354780972003937, + "learning_rate": 1.1355981804442605e-05, + "loss": 1.7469, + "step": 25666 + }, + { + "epoch": 7.878146101903008, + "grad_norm": 0.1843772530555725, + "learning_rate": 1.1352827917864934e-05, + "loss": 1.7654, + "step": 25667 + }, + { + "epoch": 7.878453038674033, + "grad_norm": 0.144758403301239, + 
"learning_rate": 1.1349674413221267e-05, + "loss": 1.6649, + "step": 25668 + }, + { + "epoch": 7.878759975445059, + "grad_norm": 0.15747511386871338, + "learning_rate": 1.1346521290542772e-05, + "loss": 1.6386, + "step": 25669 + }, + { + "epoch": 7.879066912216084, + "grad_norm": 0.17898736894130707, + "learning_rate": 1.134336854986061e-05, + "loss": 1.7, + "step": 25670 + }, + { + "epoch": 7.879373848987108, + "grad_norm": 0.19453589618206024, + "learning_rate": 1.1340216191205939e-05, + "loss": 1.7108, + "step": 25671 + }, + { + "epoch": 7.879680785758134, + "grad_norm": 0.17470498383045197, + "learning_rate": 1.1337064214609905e-05, + "loss": 1.7705, + "step": 25672 + }, + { + "epoch": 7.879987722529159, + "grad_norm": 0.1897793561220169, + "learning_rate": 1.1333912620103665e-05, + "loss": 1.7358, + "step": 25673 + }, + { + "epoch": 7.880294659300184, + "grad_norm": 0.1659744381904602, + "learning_rate": 1.1330761407718366e-05, + "loss": 1.724, + "step": 25674 + }, + { + "epoch": 7.88060159607121, + "grad_norm": 0.15303891897201538, + "learning_rate": 1.1327610577485148e-05, + "loss": 1.6878, + "step": 25675 + }, + { + "epoch": 7.880908532842234, + "grad_norm": 0.16346490383148193, + "learning_rate": 1.1324460129435144e-05, + "loss": 1.6544, + "step": 25676 + }, + { + "epoch": 7.8812154696132595, + "grad_norm": 0.19887791574001312, + "learning_rate": 1.1321310063599483e-05, + "loss": 1.7169, + "step": 25677 + }, + { + "epoch": 7.881522406384285, + "grad_norm": 0.1658533811569214, + "learning_rate": 1.1318160380009334e-05, + "loss": 1.6902, + "step": 25678 + }, + { + "epoch": 7.88182934315531, + "grad_norm": 0.16859948635101318, + "learning_rate": 1.131501107869577e-05, + "loss": 1.7015, + "step": 25679 + }, + { + "epoch": 7.8821362799263355, + "grad_norm": 0.20775821805000305, + "learning_rate": 1.1311862159689968e-05, + "loss": 1.7519, + "step": 25680 + }, + { + "epoch": 7.882443216697361, + "grad_norm": 0.18174295127391815, + "learning_rate": 
1.1308713623022987e-05, + "loss": 1.7161, + "step": 25681 + }, + { + "epoch": 7.882750153468385, + "grad_norm": 0.1843954473733902, + "learning_rate": 1.1305565468725993e-05, + "loss": 1.6753, + "step": 25682 + }, + { + "epoch": 7.883057090239411, + "grad_norm": 0.1856461614370346, + "learning_rate": 1.130241769683008e-05, + "loss": 1.7139, + "step": 25683 + }, + { + "epoch": 7.883364027010436, + "grad_norm": 0.15803632140159607, + "learning_rate": 1.129927030736636e-05, + "loss": 1.6705, + "step": 25684 + }, + { + "epoch": 7.883670963781461, + "grad_norm": 0.1680101901292801, + "learning_rate": 1.1296123300365947e-05, + "loss": 1.6757, + "step": 25685 + }, + { + "epoch": 7.883977900552486, + "grad_norm": 0.157195046544075, + "learning_rate": 1.1292976675859895e-05, + "loss": 1.6922, + "step": 25686 + }, + { + "epoch": 7.884284837323511, + "grad_norm": 0.17270046472549438, + "learning_rate": 1.1289830433879356e-05, + "loss": 1.6909, + "step": 25687 + }, + { + "epoch": 7.884591774094536, + "grad_norm": 0.1880030781030655, + "learning_rate": 1.1286684574455398e-05, + "loss": 1.7139, + "step": 25688 + }, + { + "epoch": 7.884898710865562, + "grad_norm": 0.1882653832435608, + "learning_rate": 1.1283539097619112e-05, + "loss": 1.7464, + "step": 25689 + }, + { + "epoch": 7.885205647636587, + "grad_norm": 0.2060890644788742, + "learning_rate": 1.128039400340159e-05, + "loss": 1.6749, + "step": 25690 + }, + { + "epoch": 7.885512584407612, + "grad_norm": 0.20780493319034576, + "learning_rate": 1.1277249291833903e-05, + "loss": 1.7581, + "step": 25691 + }, + { + "epoch": 7.885819521178637, + "grad_norm": 0.1929686814546585, + "learning_rate": 1.1274104962947135e-05, + "loss": 1.6962, + "step": 25692 + }, + { + "epoch": 7.886126457949662, + "grad_norm": 0.21474432945251465, + "learning_rate": 1.1270961016772363e-05, + "loss": 1.6984, + "step": 25693 + }, + { + "epoch": 7.8864333947206875, + "grad_norm": 0.17453257739543915, + "learning_rate": 1.126781745334064e-05, + "loss": 
1.679, + "step": 25694 + }, + { + "epoch": 7.886740331491713, + "grad_norm": 0.21506772935390472, + "learning_rate": 1.1264674272683073e-05, + "loss": 1.7209, + "step": 25695 + }, + { + "epoch": 7.887047268262738, + "grad_norm": 0.2470129430294037, + "learning_rate": 1.1261531474830672e-05, + "loss": 1.7183, + "step": 25696 + }, + { + "epoch": 7.887354205033763, + "grad_norm": 0.2026570737361908, + "learning_rate": 1.1258389059814545e-05, + "loss": 1.6579, + "step": 25697 + }, + { + "epoch": 7.887661141804788, + "grad_norm": 0.18859948217868805, + "learning_rate": 1.1255247027665699e-05, + "loss": 1.6831, + "step": 25698 + }, + { + "epoch": 7.887968078575813, + "grad_norm": 0.2106257677078247, + "learning_rate": 1.1252105378415229e-05, + "loss": 1.724, + "step": 25699 + }, + { + "epoch": 7.888275015346839, + "grad_norm": 0.17260697484016418, + "learning_rate": 1.1248964112094162e-05, + "loss": 1.6875, + "step": 25700 + }, + { + "epoch": 7.888581952117864, + "grad_norm": 0.20596550405025482, + "learning_rate": 1.1245823228733542e-05, + "loss": 1.7569, + "step": 25701 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 0.1724967509508133, + "learning_rate": 1.1242682728364428e-05, + "loss": 1.7063, + "step": 25702 + }, + { + "epoch": 7.889195825659914, + "grad_norm": 0.2189379185438156, + "learning_rate": 1.123954261101781e-05, + "loss": 1.789, + "step": 25703 + }, + { + "epoch": 7.889502762430939, + "grad_norm": 0.1539442539215088, + "learning_rate": 1.1236402876724766e-05, + "loss": 1.6573, + "step": 25704 + }, + { + "epoch": 7.889809699201964, + "grad_norm": 0.2854970693588257, + "learning_rate": 1.1233263525516313e-05, + "loss": 1.7683, + "step": 25705 + }, + { + "epoch": 7.89011663597299, + "grad_norm": 0.18263237178325653, + "learning_rate": 1.1230124557423465e-05, + "loss": 1.6911, + "step": 25706 + }, + { + "epoch": 7.890423572744015, + "grad_norm": 0.2098342627286911, + "learning_rate": 1.122698597247725e-05, + "loss": 1.7306, + "step": 25707 + }, + { + 
"epoch": 7.8907305095150395, + "grad_norm": 0.20822781324386597, + "learning_rate": 1.122384777070869e-05, + "loss": 1.7777, + "step": 25708 + }, + { + "epoch": 7.891037446286065, + "grad_norm": 0.24466483294963837, + "learning_rate": 1.122070995214879e-05, + "loss": 1.6966, + "step": 25709 + }, + { + "epoch": 7.89134438305709, + "grad_norm": 0.1500372439622879, + "learning_rate": 1.1217572516828561e-05, + "loss": 1.6787, + "step": 25710 + }, + { + "epoch": 7.8916513198281155, + "grad_norm": 0.2238166481256485, + "learning_rate": 1.1214435464779006e-05, + "loss": 1.7957, + "step": 25711 + }, + { + "epoch": 7.891958256599141, + "grad_norm": 0.22993433475494385, + "learning_rate": 1.1211298796031156e-05, + "loss": 1.7142, + "step": 25712 + }, + { + "epoch": 7.892265193370166, + "grad_norm": 0.15912945568561554, + "learning_rate": 1.1208162510615955e-05, + "loss": 1.7188, + "step": 25713 + }, + { + "epoch": 7.892572130141191, + "grad_norm": 0.2096986174583435, + "learning_rate": 1.1205026608564461e-05, + "loss": 1.7409, + "step": 25714 + }, + { + "epoch": 7.892879066912216, + "grad_norm": 0.18928684294223785, + "learning_rate": 1.1201891089907601e-05, + "loss": 1.6703, + "step": 25715 + }, + { + "epoch": 7.893186003683241, + "grad_norm": 0.19096077978610992, + "learning_rate": 1.119875595467641e-05, + "loss": 1.7393, + "step": 25716 + }, + { + "epoch": 7.893492940454267, + "grad_norm": 0.2286420315504074, + "learning_rate": 1.1195621202901851e-05, + "loss": 1.6995, + "step": 25717 + }, + { + "epoch": 7.893799877225292, + "grad_norm": 0.16288414597511292, + "learning_rate": 1.1192486834614912e-05, + "loss": 1.7334, + "step": 25718 + }, + { + "epoch": 7.894106813996316, + "grad_norm": 0.17358547449111938, + "learning_rate": 1.118935284984658e-05, + "loss": 1.7114, + "step": 25719 + }, + { + "epoch": 7.894413750767342, + "grad_norm": 0.16833151876926422, + "learning_rate": 1.1186219248627777e-05, + "loss": 1.6998, + "step": 25720 + }, + { + "epoch": 7.894720687538367, + 
"grad_norm": 0.14409767091274261, + "learning_rate": 1.118308603098952e-05, + "loss": 1.713, + "step": 25721 + }, + { + "epoch": 7.895027624309392, + "grad_norm": 0.18832024931907654, + "learning_rate": 1.1179953196962761e-05, + "loss": 1.6862, + "step": 25722 + }, + { + "epoch": 7.895334561080418, + "grad_norm": 0.1837761402130127, + "learning_rate": 1.1176820746578454e-05, + "loss": 1.6674, + "step": 25723 + }, + { + "epoch": 7.895641497851443, + "grad_norm": 0.14717474579811096, + "learning_rate": 1.1173688679867561e-05, + "loss": 1.6619, + "step": 25724 + }, + { + "epoch": 7.8959484346224675, + "grad_norm": 0.13512545824050903, + "learning_rate": 1.1170556996861032e-05, + "loss": 1.664, + "step": 25725 + }, + { + "epoch": 7.896255371393493, + "grad_norm": 0.21533837914466858, + "learning_rate": 1.1167425697589817e-05, + "loss": 1.7205, + "step": 25726 + }, + { + "epoch": 7.896562308164518, + "grad_norm": 0.15241803228855133, + "learning_rate": 1.1164294782084866e-05, + "loss": 1.6838, + "step": 25727 + }, + { + "epoch": 7.8968692449355435, + "grad_norm": 0.14889933168888092, + "learning_rate": 1.1161164250377099e-05, + "loss": 1.7197, + "step": 25728 + }, + { + "epoch": 7.897176181706568, + "grad_norm": 0.15948614478111267, + "learning_rate": 1.11580341024975e-05, + "loss": 1.6948, + "step": 25729 + }, + { + "epoch": 7.897483118477593, + "grad_norm": 0.17862235009670258, + "learning_rate": 1.1154904338476946e-05, + "loss": 1.743, + "step": 25730 + }, + { + "epoch": 7.897790055248619, + "grad_norm": 0.18168844282627106, + "learning_rate": 1.1151774958346422e-05, + "loss": 1.7291, + "step": 25731 + }, + { + "epoch": 7.898096992019644, + "grad_norm": 0.17636772990226746, + "learning_rate": 1.11486459621368e-05, + "loss": 1.7428, + "step": 25732 + }, + { + "epoch": 7.898403928790669, + "grad_norm": 0.1677904576063156, + "learning_rate": 1.1145517349879048e-05, + "loss": 1.7026, + "step": 25733 + }, + { + "epoch": 7.898710865561695, + "grad_norm": 
0.1851150244474411, + "learning_rate": 1.1142389121604063e-05, + "loss": 1.7743, + "step": 25734 + }, + { + "epoch": 7.899017802332719, + "grad_norm": 0.19713786244392395, + "learning_rate": 1.1139261277342767e-05, + "loss": 1.7287, + "step": 25735 + }, + { + "epoch": 7.899324739103744, + "grad_norm": 0.2060006707906723, + "learning_rate": 1.1136133817126076e-05, + "loss": 1.7377, + "step": 25736 + }, + { + "epoch": 7.89963167587477, + "grad_norm": 0.18026013672351837, + "learning_rate": 1.1133006740984864e-05, + "loss": 1.7322, + "step": 25737 + }, + { + "epoch": 7.899938612645795, + "grad_norm": 0.1787644922733307, + "learning_rate": 1.1129880048950075e-05, + "loss": 1.7457, + "step": 25738 + }, + { + "epoch": 7.9002455494168204, + "grad_norm": 0.16092467308044434, + "learning_rate": 1.1126753741052593e-05, + "loss": 1.7451, + "step": 25739 + }, + { + "epoch": 7.900552486187845, + "grad_norm": 0.15322941541671753, + "learning_rate": 1.1123627817323318e-05, + "loss": 1.667, + "step": 25740 + }, + { + "epoch": 7.90085942295887, + "grad_norm": 0.1488087922334671, + "learning_rate": 1.1120502277793137e-05, + "loss": 1.684, + "step": 25741 + }, + { + "epoch": 7.901166359729896, + "grad_norm": 0.15332907438278198, + "learning_rate": 1.111737712249294e-05, + "loss": 1.6646, + "step": 25742 + }, + { + "epoch": 7.901473296500921, + "grad_norm": 0.19801980257034302, + "learning_rate": 1.1114252351453614e-05, + "loss": 1.7469, + "step": 25743 + }, + { + "epoch": 7.901780233271946, + "grad_norm": 0.17123407125473022, + "learning_rate": 1.1111127964706035e-05, + "loss": 1.7319, + "step": 25744 + }, + { + "epoch": 7.902087170042972, + "grad_norm": 0.1753319650888443, + "learning_rate": 1.1108003962281066e-05, + "loss": 1.7212, + "step": 25745 + }, + { + "epoch": 7.902394106813996, + "grad_norm": 0.1598043441772461, + "learning_rate": 1.1104880344209634e-05, + "loss": 1.6823, + "step": 25746 + }, + { + "epoch": 7.902701043585021, + "grad_norm": 0.14227038621902466, + 
"learning_rate": 1.1101757110522538e-05, + "loss": 1.6665, + "step": 25747 + }, + { + "epoch": 7.903007980356047, + "grad_norm": 0.1531791388988495, + "learning_rate": 1.1098634261250706e-05, + "loss": 1.717, + "step": 25748 + }, + { + "epoch": 7.903314917127072, + "grad_norm": 0.18077540397644043, + "learning_rate": 1.109551179642494e-05, + "loss": 1.7237, + "step": 25749 + }, + { + "epoch": 7.903621853898097, + "grad_norm": 0.22373250126838684, + "learning_rate": 1.1092389716076145e-05, + "loss": 1.7678, + "step": 25750 + }, + { + "epoch": 7.903928790669122, + "grad_norm": 0.16022193431854248, + "learning_rate": 1.1089268020235166e-05, + "loss": 1.6985, + "step": 25751 + }, + { + "epoch": 7.904235727440147, + "grad_norm": 0.17306078970432281, + "learning_rate": 1.1086146708932837e-05, + "loss": 1.6653, + "step": 25752 + }, + { + "epoch": 7.9045426642111725, + "grad_norm": 0.16284874081611633, + "learning_rate": 1.1083025782200035e-05, + "loss": 1.6762, + "step": 25753 + }, + { + "epoch": 7.904849600982198, + "grad_norm": 0.17309556901454926, + "learning_rate": 1.107990524006755e-05, + "loss": 1.7103, + "step": 25754 + }, + { + "epoch": 7.905156537753223, + "grad_norm": 0.1508374810218811, + "learning_rate": 1.107678508256627e-05, + "loss": 1.6932, + "step": 25755 + }, + { + "epoch": 7.9054634745242485, + "grad_norm": 0.1941400021314621, + "learning_rate": 1.1073665309727016e-05, + "loss": 1.7922, + "step": 25756 + }, + { + "epoch": 7.905770411295273, + "grad_norm": 0.1890190988779068, + "learning_rate": 1.107054592158061e-05, + "loss": 1.6765, + "step": 25757 + }, + { + "epoch": 7.906077348066298, + "grad_norm": 0.19425363838672638, + "learning_rate": 1.1067426918157892e-05, + "loss": 1.7284, + "step": 25758 + }, + { + "epoch": 7.906384284837324, + "grad_norm": 0.18147888779640198, + "learning_rate": 1.1064308299489678e-05, + "loss": 1.7099, + "step": 25759 + }, + { + "epoch": 7.906691221608349, + "grad_norm": 0.19644278287887573, + "learning_rate": 
1.106119006560679e-05, + "loss": 1.7691, + "step": 25760 + }, + { + "epoch": 7.906998158379373, + "grad_norm": 0.14809735119342804, + "learning_rate": 1.1058072216540045e-05, + "loss": 1.6735, + "step": 25761 + }, + { + "epoch": 7.907305095150399, + "grad_norm": 0.17835088074207306, + "learning_rate": 1.105495475232024e-05, + "loss": 1.6928, + "step": 25762 + }, + { + "epoch": 7.907612031921424, + "grad_norm": 0.18341144919395447, + "learning_rate": 1.1051837672978227e-05, + "loss": 1.7393, + "step": 25763 + }, + { + "epoch": 7.907918968692449, + "grad_norm": 0.2026391327381134, + "learning_rate": 1.1048720978544753e-05, + "loss": 1.7037, + "step": 25764 + }, + { + "epoch": 7.908225905463475, + "grad_norm": 0.19855152070522308, + "learning_rate": 1.104560466905068e-05, + "loss": 1.7341, + "step": 25765 + }, + { + "epoch": 7.9085328422345, + "grad_norm": 0.18974080681800842, + "learning_rate": 1.1042488744526741e-05, + "loss": 1.6717, + "step": 25766 + }, + { + "epoch": 7.9088397790055245, + "grad_norm": 0.1727920025587082, + "learning_rate": 1.1039373205003784e-05, + "loss": 1.6994, + "step": 25767 + }, + { + "epoch": 7.90914671577655, + "grad_norm": 0.20549818873405457, + "learning_rate": 1.1036258050512566e-05, + "loss": 1.7055, + "step": 25768 + }, + { + "epoch": 7.909453652547575, + "grad_norm": 0.15696507692337036, + "learning_rate": 1.1033143281083891e-05, + "loss": 1.678, + "step": 25769 + }, + { + "epoch": 7.9097605893186005, + "grad_norm": 0.1568988859653473, + "learning_rate": 1.1030028896748546e-05, + "loss": 1.6855, + "step": 25770 + }, + { + "epoch": 7.910067526089626, + "grad_norm": 0.17795592546463013, + "learning_rate": 1.1026914897537266e-05, + "loss": 1.7306, + "step": 25771 + }, + { + "epoch": 7.91037446286065, + "grad_norm": 0.19906511902809143, + "learning_rate": 1.1023801283480872e-05, + "loss": 1.7125, + "step": 25772 + }, + { + "epoch": 7.910681399631676, + "grad_norm": 0.16972185671329498, + "learning_rate": 1.1020688054610118e-05, + 
"loss": 1.714, + "step": 25773 + }, + { + "epoch": 7.910988336402701, + "grad_norm": 0.20585502684116364, + "learning_rate": 1.1017575210955772e-05, + "loss": 1.7342, + "step": 25774 + }, + { + "epoch": 7.911295273173726, + "grad_norm": 0.1772177368402481, + "learning_rate": 1.1014462752548592e-05, + "loss": 1.7091, + "step": 25775 + }, + { + "epoch": 7.911602209944752, + "grad_norm": 0.1818380057811737, + "learning_rate": 1.1011350679419341e-05, + "loss": 1.7131, + "step": 25776 + }, + { + "epoch": 7.911909146715777, + "grad_norm": 0.17451459169387817, + "learning_rate": 1.1008238991598779e-05, + "loss": 1.6633, + "step": 25777 + }, + { + "epoch": 7.912216083486801, + "grad_norm": 0.18837687373161316, + "learning_rate": 1.100512768911765e-05, + "loss": 1.7132, + "step": 25778 + }, + { + "epoch": 7.912523020257827, + "grad_norm": 0.15283817052841187, + "learning_rate": 1.1002016772006695e-05, + "loss": 1.6833, + "step": 25779 + }, + { + "epoch": 7.912829957028852, + "grad_norm": 0.15264299511909485, + "learning_rate": 1.0998906240296692e-05, + "loss": 1.7098, + "step": 25780 + }, + { + "epoch": 7.913136893799877, + "grad_norm": 0.18866822123527527, + "learning_rate": 1.099579609401833e-05, + "loss": 1.7173, + "step": 25781 + }, + { + "epoch": 7.913443830570903, + "grad_norm": 0.19261083006858826, + "learning_rate": 1.0992686333202401e-05, + "loss": 1.7269, + "step": 25782 + }, + { + "epoch": 7.913750767341927, + "grad_norm": 0.19681799411773682, + "learning_rate": 1.0989576957879577e-05, + "loss": 1.6594, + "step": 25783 + }, + { + "epoch": 7.9140577041129525, + "grad_norm": 0.21298938989639282, + "learning_rate": 1.0986467968080639e-05, + "loss": 1.8509, + "step": 25784 + }, + { + "epoch": 7.914364640883978, + "grad_norm": 0.17769277095794678, + "learning_rate": 1.0983359363836287e-05, + "loss": 1.7177, + "step": 25785 + }, + { + "epoch": 7.914671577655003, + "grad_norm": 0.19831274449825287, + "learning_rate": 1.0980251145177246e-05, + "loss": 1.7107, + "step": 
25786 + }, + { + "epoch": 7.9149785144260285, + "grad_norm": 0.16204139590263367, + "learning_rate": 1.0977143312134248e-05, + "loss": 1.7052, + "step": 25787 + }, + { + "epoch": 7.915285451197054, + "grad_norm": 0.1709459275007248, + "learning_rate": 1.0974035864737958e-05, + "loss": 1.6944, + "step": 25788 + }, + { + "epoch": 7.915592387968078, + "grad_norm": 0.17710284888744354, + "learning_rate": 1.0970928803019142e-05, + "loss": 1.7253, + "step": 25789 + }, + { + "epoch": 7.915899324739104, + "grad_norm": 0.17316623032093048, + "learning_rate": 1.0967822127008481e-05, + "loss": 1.6458, + "step": 25790 + }, + { + "epoch": 7.916206261510129, + "grad_norm": 0.15644441545009613, + "learning_rate": 1.0964715836736677e-05, + "loss": 1.6749, + "step": 25791 + }, + { + "epoch": 7.916513198281154, + "grad_norm": 0.1425870954990387, + "learning_rate": 1.096160993223443e-05, + "loss": 1.7283, + "step": 25792 + }, + { + "epoch": 7.91682013505218, + "grad_norm": 0.1724596619606018, + "learning_rate": 1.0958504413532438e-05, + "loss": 1.7152, + "step": 25793 + }, + { + "epoch": 7.917127071823204, + "grad_norm": 0.20472319424152374, + "learning_rate": 1.0955399280661383e-05, + "loss": 1.7818, + "step": 25794 + }, + { + "epoch": 7.917434008594229, + "grad_norm": 0.18012158572673798, + "learning_rate": 1.0952294533651963e-05, + "loss": 1.6995, + "step": 25795 + }, + { + "epoch": 7.917740945365255, + "grad_norm": 0.1460564136505127, + "learning_rate": 1.0949190172534851e-05, + "loss": 1.6752, + "step": 25796 + }, + { + "epoch": 7.91804788213628, + "grad_norm": 0.16467545926570892, + "learning_rate": 1.0946086197340733e-05, + "loss": 1.7, + "step": 25797 + }, + { + "epoch": 7.918354818907305, + "grad_norm": 0.20123273134231567, + "learning_rate": 1.0942982608100266e-05, + "loss": 1.7423, + "step": 25798 + }, + { + "epoch": 7.918661755678331, + "grad_norm": 0.160671204328537, + "learning_rate": 1.0939879404844167e-05, + "loss": 1.6992, + "step": 25799 + }, + { + "epoch": 
7.918968692449355, + "grad_norm": 0.18679293990135193, + "learning_rate": 1.0936776587603043e-05, + "loss": 1.7789, + "step": 25800 + }, + { + "epoch": 7.9192756292203805, + "grad_norm": 0.1598452925682068, + "learning_rate": 1.0933674156407602e-05, + "loss": 1.6961, + "step": 25801 + }, + { + "epoch": 7.919582565991406, + "grad_norm": 0.13918142020702362, + "learning_rate": 1.0930572111288506e-05, + "loss": 1.6727, + "step": 25802 + }, + { + "epoch": 7.919889502762431, + "grad_norm": 0.16652320325374603, + "learning_rate": 1.0927470452276367e-05, + "loss": 1.7135, + "step": 25803 + }, + { + "epoch": 7.920196439533456, + "grad_norm": 0.1637706309556961, + "learning_rate": 1.0924369179401893e-05, + "loss": 1.7078, + "step": 25804 + }, + { + "epoch": 7.920503376304481, + "grad_norm": 0.19709086418151855, + "learning_rate": 1.092126829269568e-05, + "loss": 1.7425, + "step": 25805 + }, + { + "epoch": 7.920810313075506, + "grad_norm": 0.13402192294597626, + "learning_rate": 1.091816779218841e-05, + "loss": 1.663, + "step": 25806 + }, + { + "epoch": 7.921117249846532, + "grad_norm": 0.18932323157787323, + "learning_rate": 1.0915067677910718e-05, + "loss": 1.7651, + "step": 25807 + }, + { + "epoch": 7.921424186617557, + "grad_norm": 0.1586374193429947, + "learning_rate": 1.0911967949893231e-05, + "loss": 1.6709, + "step": 25808 + }, + { + "epoch": 7.921731123388582, + "grad_norm": 0.1570933312177658, + "learning_rate": 1.0908868608166589e-05, + "loss": 1.7166, + "step": 25809 + }, + { + "epoch": 7.922038060159607, + "grad_norm": 0.19786952435970306, + "learning_rate": 1.0905769652761416e-05, + "loss": 1.7347, + "step": 25810 + }, + { + "epoch": 7.922344996930632, + "grad_norm": 0.14969857037067413, + "learning_rate": 1.0902671083708343e-05, + "loss": 1.6471, + "step": 25811 + }, + { + "epoch": 7.922651933701657, + "grad_norm": 0.17460933327674866, + "learning_rate": 1.089957290103799e-05, + "loss": 1.7594, + "step": 25812 + }, + { + "epoch": 7.922958870472683, + 
"grad_norm": 0.17380566895008087, + "learning_rate": 1.0896475104780974e-05, + "loss": 1.6721, + "step": 25813 + }, + { + "epoch": 7.923265807243708, + "grad_norm": 0.1599249392747879, + "learning_rate": 1.0893377694967916e-05, + "loss": 1.6842, + "step": 25814 + }, + { + "epoch": 7.9235727440147325, + "grad_norm": 0.15319927036762238, + "learning_rate": 1.0890280671629398e-05, + "loss": 1.6529, + "step": 25815 + }, + { + "epoch": 7.923879680785758, + "grad_norm": 0.20122043788433075, + "learning_rate": 1.0887184034796082e-05, + "loss": 1.8009, + "step": 25816 + }, + { + "epoch": 7.924186617556783, + "grad_norm": 0.1726430058479309, + "learning_rate": 1.0884087784498515e-05, + "loss": 1.7595, + "step": 25817 + }, + { + "epoch": 7.9244935543278086, + "grad_norm": 0.1657346487045288, + "learning_rate": 1.0880991920767336e-05, + "loss": 1.7051, + "step": 25818 + }, + { + "epoch": 7.924800491098834, + "grad_norm": 0.19500960409641266, + "learning_rate": 1.0877896443633117e-05, + "loss": 1.6809, + "step": 25819 + }, + { + "epoch": 7.925107427869859, + "grad_norm": 0.18751180171966553, + "learning_rate": 1.087480135312644e-05, + "loss": 1.7613, + "step": 25820 + }, + { + "epoch": 7.925414364640884, + "grad_norm": 0.20735877752304077, + "learning_rate": 1.0871706649277935e-05, + "loss": 1.7515, + "step": 25821 + }, + { + "epoch": 7.925721301411909, + "grad_norm": 0.19349408149719238, + "learning_rate": 1.0868612332118133e-05, + "loss": 1.7053, + "step": 25822 + }, + { + "epoch": 7.926028238182934, + "grad_norm": 0.15639854967594147, + "learning_rate": 1.0865518401677649e-05, + "loss": 1.6907, + "step": 25823 + }, + { + "epoch": 7.92633517495396, + "grad_norm": 0.18366692960262299, + "learning_rate": 1.0862424857987059e-05, + "loss": 1.6791, + "step": 25824 + }, + { + "epoch": 7.926642111724985, + "grad_norm": 0.1648077666759491, + "learning_rate": 1.0859331701076913e-05, + "loss": 1.6671, + "step": 25825 + }, + { + "epoch": 7.9269490484960095, + "grad_norm": 
0.17894984781742096, + "learning_rate": 1.0856238930977802e-05, + "loss": 1.736, + "step": 25826 + }, + { + "epoch": 7.927255985267035, + "grad_norm": 0.13542817533016205, + "learning_rate": 1.0853146547720278e-05, + "loss": 1.6613, + "step": 25827 + }, + { + "epoch": 7.92756292203806, + "grad_norm": 0.1598762571811676, + "learning_rate": 1.0850054551334905e-05, + "loss": 1.6828, + "step": 25828 + }, + { + "epoch": 7.9278698588090855, + "grad_norm": 0.19212616980075836, + "learning_rate": 1.0846962941852235e-05, + "loss": 1.8198, + "step": 25829 + }, + { + "epoch": 7.928176795580111, + "grad_norm": 0.19344113767147064, + "learning_rate": 1.0843871719302829e-05, + "loss": 1.7804, + "step": 25830 + }, + { + "epoch": 7.928483732351136, + "grad_norm": 0.15460920333862305, + "learning_rate": 1.0840780883717233e-05, + "loss": 1.7372, + "step": 25831 + }, + { + "epoch": 7.928790669122161, + "grad_norm": 0.19987867772579193, + "learning_rate": 1.083769043512598e-05, + "loss": 1.6923, + "step": 25832 + }, + { + "epoch": 7.929097605893186, + "grad_norm": 0.15390315651893616, + "learning_rate": 1.083460037355965e-05, + "loss": 1.6864, + "step": 25833 + }, + { + "epoch": 7.929404542664211, + "grad_norm": 0.18596698343753815, + "learning_rate": 1.0831510699048724e-05, + "loss": 1.7135, + "step": 25834 + }, + { + "epoch": 7.929711479435237, + "grad_norm": 0.172935351729393, + "learning_rate": 1.0828421411623796e-05, + "loss": 1.7426, + "step": 25835 + }, + { + "epoch": 7.930018416206261, + "grad_norm": 0.2046828418970108, + "learning_rate": 1.0825332511315356e-05, + "loss": 1.7178, + "step": 25836 + }, + { + "epoch": 7.930325352977286, + "grad_norm": 0.1382901519536972, + "learning_rate": 1.0822243998153925e-05, + "loss": 1.6811, + "step": 25837 + }, + { + "epoch": 7.930632289748312, + "grad_norm": 0.1675405353307724, + "learning_rate": 1.0819155872170068e-05, + "loss": 1.7278, + "step": 25838 + }, + { + "epoch": 7.930939226519337, + "grad_norm": 0.16732639074325562, + 
"learning_rate": 1.0816068133394252e-05, + "loss": 1.6847, + "step": 25839 + }, + { + "epoch": 7.931246163290362, + "grad_norm": 0.17154982686042786, + "learning_rate": 1.0812980781857047e-05, + "loss": 1.7411, + "step": 25840 + }, + { + "epoch": 7.931553100061388, + "grad_norm": 0.16475310921669006, + "learning_rate": 1.08098938175889e-05, + "loss": 1.7222, + "step": 25841 + }, + { + "epoch": 7.931860036832412, + "grad_norm": 0.1613023579120636, + "learning_rate": 1.080680724062037e-05, + "loss": 1.718, + "step": 25842 + }, + { + "epoch": 7.9321669736034375, + "grad_norm": 0.16330939531326294, + "learning_rate": 1.0803721050981941e-05, + "loss": 1.7087, + "step": 25843 + }, + { + "epoch": 7.932473910374463, + "grad_norm": 0.15881259739398956, + "learning_rate": 1.0800635248704117e-05, + "loss": 1.7309, + "step": 25844 + }, + { + "epoch": 7.932780847145488, + "grad_norm": 0.19191724061965942, + "learning_rate": 1.0797549833817389e-05, + "loss": 1.7131, + "step": 25845 + }, + { + "epoch": 7.9330877839165135, + "grad_norm": 0.17083698511123657, + "learning_rate": 1.079446480635225e-05, + "loss": 1.7117, + "step": 25846 + }, + { + "epoch": 7.933394720687538, + "grad_norm": 0.18097929656505585, + "learning_rate": 1.0791380166339193e-05, + "loss": 1.7017, + "step": 25847 + }, + { + "epoch": 7.933701657458563, + "grad_norm": 0.1556827276945114, + "learning_rate": 1.0788295913808694e-05, + "loss": 1.7589, + "step": 25848 + }, + { + "epoch": 7.934008594229589, + "grad_norm": 0.1667819619178772, + "learning_rate": 1.0785212048791226e-05, + "loss": 1.6735, + "step": 25849 + }, + { + "epoch": 7.934315531000614, + "grad_norm": 0.18772241473197937, + "learning_rate": 1.0782128571317302e-05, + "loss": 1.6984, + "step": 25850 + }, + { + "epoch": 7.934622467771639, + "grad_norm": 0.1752445250749588, + "learning_rate": 1.0779045481417343e-05, + "loss": 1.6662, + "step": 25851 + }, + { + "epoch": 7.934929404542665, + "grad_norm": 0.16619165241718292, + "learning_rate": 
1.0775962779121873e-05, + "loss": 1.765, + "step": 25852 + }, + { + "epoch": 7.935236341313689, + "grad_norm": 0.1685585081577301, + "learning_rate": 1.0772880464461316e-05, + "loss": 1.6692, + "step": 25853 + }, + { + "epoch": 7.935543278084714, + "grad_norm": 0.16806848347187042, + "learning_rate": 1.076979853746613e-05, + "loss": 1.7081, + "step": 25854 + }, + { + "epoch": 7.93585021485574, + "grad_norm": 0.14273032546043396, + "learning_rate": 1.076671699816682e-05, + "loss": 1.6668, + "step": 25855 + }, + { + "epoch": 7.936157151626765, + "grad_norm": 0.24727863073349, + "learning_rate": 1.0763635846593778e-05, + "loss": 1.7624, + "step": 25856 + }, + { + "epoch": 7.93646408839779, + "grad_norm": 0.15679748356342316, + "learning_rate": 1.0760555082777506e-05, + "loss": 1.6851, + "step": 25857 + }, + { + "epoch": 7.936771025168815, + "grad_norm": 0.23388828337192535, + "learning_rate": 1.075747470674841e-05, + "loss": 1.7557, + "step": 25858 + }, + { + "epoch": 7.93707796193984, + "grad_norm": 0.15266747772693634, + "learning_rate": 1.0754394718536958e-05, + "loss": 1.6559, + "step": 25859 + }, + { + "epoch": 7.9373848987108655, + "grad_norm": 0.1945476084947586, + "learning_rate": 1.0751315118173577e-05, + "loss": 1.745, + "step": 25860 + }, + { + "epoch": 7.937691835481891, + "grad_norm": 0.18018878996372223, + "learning_rate": 1.0748235905688709e-05, + "loss": 1.7016, + "step": 25861 + }, + { + "epoch": 7.937998772252916, + "grad_norm": 0.1748870611190796, + "learning_rate": 1.0745157081112777e-05, + "loss": 1.6989, + "step": 25862 + }, + { + "epoch": 7.9383057090239415, + "grad_norm": 0.18253664672374725, + "learning_rate": 1.0742078644476217e-05, + "loss": 1.7554, + "step": 25863 + }, + { + "epoch": 7.938612645794966, + "grad_norm": 0.17009632289409637, + "learning_rate": 1.073900059580944e-05, + "loss": 1.7244, + "step": 25864 + }, + { + "epoch": 7.938919582565991, + "grad_norm": 0.17612707614898682, + "learning_rate": 1.0735922935142873e-05, + "loss": 
1.6939, + "step": 25865 + }, + { + "epoch": 7.939226519337017, + "grad_norm": 0.21207575500011444, + "learning_rate": 1.0732845662506913e-05, + "loss": 1.7097, + "step": 25866 + }, + { + "epoch": 7.939533456108042, + "grad_norm": 0.2073012739419937, + "learning_rate": 1.0729768777932014e-05, + "loss": 1.7658, + "step": 25867 + }, + { + "epoch": 7.939840392879067, + "grad_norm": 0.18888477981090546, + "learning_rate": 1.072669228144853e-05, + "loss": 1.7496, + "step": 25868 + }, + { + "epoch": 7.940147329650092, + "grad_norm": 0.1822361946105957, + "learning_rate": 1.0723616173086926e-05, + "loss": 1.7344, + "step": 25869 + }, + { + "epoch": 7.940454266421117, + "grad_norm": 0.18642890453338623, + "learning_rate": 1.0720540452877547e-05, + "loss": 1.7135, + "step": 25870 + }, + { + "epoch": 7.940761203192142, + "grad_norm": 0.19198815524578094, + "learning_rate": 1.0717465120850795e-05, + "loss": 1.7128, + "step": 25871 + }, + { + "epoch": 7.941068139963168, + "grad_norm": 0.1886969953775406, + "learning_rate": 1.0714390177037109e-05, + "loss": 1.7161, + "step": 25872 + }, + { + "epoch": 7.941375076734193, + "grad_norm": 0.19693820178508759, + "learning_rate": 1.0711315621466816e-05, + "loss": 1.7086, + "step": 25873 + }, + { + "epoch": 7.941682013505218, + "grad_norm": 0.19052870571613312, + "learning_rate": 1.0708241454170353e-05, + "loss": 1.7274, + "step": 25874 + }, + { + "epoch": 7.941988950276243, + "grad_norm": 0.23586300015449524, + "learning_rate": 1.0705167675178057e-05, + "loss": 1.7169, + "step": 25875 + }, + { + "epoch": 7.942295887047268, + "grad_norm": 0.2077670842409134, + "learning_rate": 1.0702094284520336e-05, + "loss": 1.7573, + "step": 25876 + }, + { + "epoch": 7.9426028238182935, + "grad_norm": 0.20345431566238403, + "learning_rate": 1.069902128222755e-05, + "loss": 1.6821, + "step": 25877 + }, + { + "epoch": 7.942909760589319, + "grad_norm": 0.1869240552186966, + "learning_rate": 1.0695948668330075e-05, + "loss": 1.6978, + "step": 25878 + }, 
+ { + "epoch": 7.943216697360343, + "grad_norm": 0.17814506590366364, + "learning_rate": 1.0692876442858274e-05, + "loss": 1.7027, + "step": 25879 + }, + { + "epoch": 7.943523634131369, + "grad_norm": 0.19093535840511322, + "learning_rate": 1.0689804605842502e-05, + "loss": 1.7863, + "step": 25880 + }, + { + "epoch": 7.943830570902394, + "grad_norm": 0.17859873175621033, + "learning_rate": 1.0686733157313123e-05, + "loss": 1.7431, + "step": 25881 + }, + { + "epoch": 7.944137507673419, + "grad_norm": 0.16613568365573883, + "learning_rate": 1.0683662097300484e-05, + "loss": 1.7517, + "step": 25882 + }, + { + "epoch": 7.944444444444445, + "grad_norm": 0.1588357836008072, + "learning_rate": 1.0680591425834934e-05, + "loss": 1.7017, + "step": 25883 + }, + { + "epoch": 7.94475138121547, + "grad_norm": 0.1667826622724533, + "learning_rate": 1.067752114294685e-05, + "loss": 1.6965, + "step": 25884 + }, + { + "epoch": 7.945058317986494, + "grad_norm": 0.2015296071767807, + "learning_rate": 1.0674451248666522e-05, + "loss": 1.7625, + "step": 25885 + }, + { + "epoch": 7.94536525475752, + "grad_norm": 0.17073483765125275, + "learning_rate": 1.0671381743024344e-05, + "loss": 1.7194, + "step": 25886 + }, + { + "epoch": 7.945672191528545, + "grad_norm": 0.16649815440177917, + "learning_rate": 1.0668312626050608e-05, + "loss": 1.7233, + "step": 25887 + }, + { + "epoch": 7.94597912829957, + "grad_norm": 0.14395855367183685, + "learning_rate": 1.0665243897775645e-05, + "loss": 1.6859, + "step": 25888 + }, + { + "epoch": 7.946286065070596, + "grad_norm": 0.18934515118598938, + "learning_rate": 1.0662175558229826e-05, + "loss": 1.6832, + "step": 25889 + }, + { + "epoch": 7.94659300184162, + "grad_norm": 0.16819562017917633, + "learning_rate": 1.0659107607443419e-05, + "loss": 1.7592, + "step": 25890 + }, + { + "epoch": 7.9468999386126455, + "grad_norm": 0.1701207458972931, + "learning_rate": 1.0656040045446798e-05, + "loss": 1.6909, + "step": 25891 + }, + { + "epoch": 
7.947206875383671, + "grad_norm": 0.18011561036109924, + "learning_rate": 1.0652972872270217e-05, + "loss": 1.7687, + "step": 25892 + }, + { + "epoch": 7.947513812154696, + "grad_norm": 0.15422853827476501, + "learning_rate": 1.0649906087944034e-05, + "loss": 1.6957, + "step": 25893 + }, + { + "epoch": 7.9478207489257215, + "grad_norm": 0.17223568260669708, + "learning_rate": 1.0646839692498545e-05, + "loss": 1.7368, + "step": 25894 + }, + { + "epoch": 7.948127685696747, + "grad_norm": 0.16706988215446472, + "learning_rate": 1.0643773685964053e-05, + "loss": 1.6981, + "step": 25895 + }, + { + "epoch": 7.948434622467771, + "grad_norm": 0.15490150451660156, + "learning_rate": 1.0640708068370853e-05, + "loss": 1.705, + "step": 25896 + }, + { + "epoch": 7.948741559238797, + "grad_norm": 0.16119123995304108, + "learning_rate": 1.0637642839749246e-05, + "loss": 1.7519, + "step": 25897 + }, + { + "epoch": 7.949048496009822, + "grad_norm": 0.1669061779975891, + "learning_rate": 1.0634578000129524e-05, + "loss": 1.7228, + "step": 25898 + }, + { + "epoch": 7.949355432780847, + "grad_norm": 0.1974606215953827, + "learning_rate": 1.0631513549541976e-05, + "loss": 1.7188, + "step": 25899 + }, + { + "epoch": 7.949662369551873, + "grad_norm": 0.204077810049057, + "learning_rate": 1.0628449488016873e-05, + "loss": 1.7397, + "step": 25900 + }, + { + "epoch": 7.949969306322897, + "grad_norm": 0.13561539351940155, + "learning_rate": 1.0625385815584537e-05, + "loss": 1.6457, + "step": 25901 + }, + { + "epoch": 7.9502762430939224, + "grad_norm": 0.1736447811126709, + "learning_rate": 1.0622322532275186e-05, + "loss": 1.7278, + "step": 25902 + }, + { + "epoch": 7.950583179864948, + "grad_norm": 0.1712762862443924, + "learning_rate": 1.061925963811915e-05, + "loss": 1.7208, + "step": 25903 + }, + { + "epoch": 7.950890116635973, + "grad_norm": 0.15313011407852173, + "learning_rate": 1.0616197133146661e-05, + "loss": 1.671, + "step": 25904 + }, + { + "epoch": 7.9511970534069984, + 
"grad_norm": 0.15110735595226288, + "learning_rate": 1.0613135017387981e-05, + "loss": 1.6568, + "step": 25905 + }, + { + "epoch": 7.951503990178024, + "grad_norm": 0.22678901255130768, + "learning_rate": 1.0610073290873413e-05, + "loss": 1.7415, + "step": 25906 + }, + { + "epoch": 7.951810926949048, + "grad_norm": 0.16936101019382477, + "learning_rate": 1.0607011953633162e-05, + "loss": 1.6983, + "step": 25907 + }, + { + "epoch": 7.952117863720074, + "grad_norm": 0.18443427979946136, + "learning_rate": 1.0603951005697533e-05, + "loss": 1.7334, + "step": 25908 + }, + { + "epoch": 7.952424800491099, + "grad_norm": 0.2290949672460556, + "learning_rate": 1.0600890447096729e-05, + "loss": 1.7219, + "step": 25909 + }, + { + "epoch": 7.952731737262124, + "grad_norm": 0.19244399666786194, + "learning_rate": 1.0597830277861026e-05, + "loss": 1.7047, + "step": 25910 + }, + { + "epoch": 7.953038674033149, + "grad_norm": 0.15806549787521362, + "learning_rate": 1.0594770498020657e-05, + "loss": 1.667, + "step": 25911 + }, + { + "epoch": 7.953345610804174, + "grad_norm": 0.23782655596733093, + "learning_rate": 1.0591711107605867e-05, + "loss": 1.7271, + "step": 25912 + }, + { + "epoch": 7.953652547575199, + "grad_norm": 0.18427079916000366, + "learning_rate": 1.0588652106646885e-05, + "loss": 1.7644, + "step": 25913 + }, + { + "epoch": 7.953959484346225, + "grad_norm": 0.18687991797924042, + "learning_rate": 1.058559349517394e-05, + "loss": 1.7045, + "step": 25914 + }, + { + "epoch": 7.95426642111725, + "grad_norm": 0.17435906827449799, + "learning_rate": 1.0582535273217265e-05, + "loss": 1.6681, + "step": 25915 + }, + { + "epoch": 7.954573357888275, + "grad_norm": 0.17601260542869568, + "learning_rate": 1.0579477440807079e-05, + "loss": 1.7141, + "step": 25916 + }, + { + "epoch": 7.9548802946593, + "grad_norm": 0.19225506484508514, + "learning_rate": 1.0576419997973586e-05, + "loss": 1.7224, + "step": 25917 + }, + { + "epoch": 7.955187231430325, + "grad_norm": 
0.18801991641521454, + "learning_rate": 1.0573362944747045e-05, + "loss": 1.715, + "step": 25918 + }, + { + "epoch": 7.9554941682013505, + "grad_norm": 0.21490465104579926, + "learning_rate": 1.0570306281157616e-05, + "loss": 1.7931, + "step": 25919 + }, + { + "epoch": 7.955801104972376, + "grad_norm": 0.1877163052558899, + "learning_rate": 1.0567250007235557e-05, + "loss": 1.7365, + "step": 25920 + }, + { + "epoch": 7.956108041743401, + "grad_norm": 0.18460121750831604, + "learning_rate": 1.0564194123011029e-05, + "loss": 1.7092, + "step": 25921 + }, + { + "epoch": 7.956414978514426, + "grad_norm": 0.1663859337568283, + "learning_rate": 1.0561138628514239e-05, + "loss": 1.6847, + "step": 25922 + }, + { + "epoch": 7.956721915285451, + "grad_norm": 0.1676093488931656, + "learning_rate": 1.0558083523775413e-05, + "loss": 1.6788, + "step": 25923 + }, + { + "epoch": 7.957028852056476, + "grad_norm": 0.17470842599868774, + "learning_rate": 1.0555028808824702e-05, + "loss": 1.7658, + "step": 25924 + }, + { + "epoch": 7.957335788827502, + "grad_norm": 0.17770788073539734, + "learning_rate": 1.0551974483692346e-05, + "loss": 1.6875, + "step": 25925 + }, + { + "epoch": 7.957642725598527, + "grad_norm": 0.17924711108207703, + "learning_rate": 1.054892054840847e-05, + "loss": 1.7024, + "step": 25926 + }, + { + "epoch": 7.957949662369552, + "grad_norm": 0.19387175142765045, + "learning_rate": 1.0545867003003296e-05, + "loss": 1.7806, + "step": 25927 + }, + { + "epoch": 7.958256599140577, + "grad_norm": 0.176667258143425, + "learning_rate": 1.0542813847506988e-05, + "loss": 1.7187, + "step": 25928 + }, + { + "epoch": 7.958563535911602, + "grad_norm": 0.1730370670557022, + "learning_rate": 1.0539761081949723e-05, + "loss": 1.6912, + "step": 25929 + }, + { + "epoch": 7.958870472682627, + "grad_norm": 0.1836516112089157, + "learning_rate": 1.0536708706361665e-05, + "loss": 1.684, + "step": 25930 + }, + { + "epoch": 7.959177409453653, + "grad_norm": 0.17236517369747162, + 
"learning_rate": 1.0533656720772983e-05, + "loss": 1.6799, + "step": 25931 + }, + { + "epoch": 7.959484346224678, + "grad_norm": 0.1655581295490265, + "learning_rate": 1.0530605125213832e-05, + "loss": 1.755, + "step": 25932 + }, + { + "epoch": 7.9597912829957025, + "grad_norm": 0.1801871806383133, + "learning_rate": 1.0527553919714383e-05, + "loss": 1.6998, + "step": 25933 + }, + { + "epoch": 7.960098219766728, + "grad_norm": 0.20504651963710785, + "learning_rate": 1.052450310430476e-05, + "loss": 1.7793, + "step": 25934 + }, + { + "epoch": 7.960405156537753, + "grad_norm": 0.2522159516811371, + "learning_rate": 1.052145267901517e-05, + "loss": 1.754, + "step": 25935 + }, + { + "epoch": 7.9607120933087785, + "grad_norm": 0.18074269592761993, + "learning_rate": 1.0518402643875691e-05, + "loss": 1.717, + "step": 25936 + }, + { + "epoch": 7.961019030079804, + "grad_norm": 0.16463595628738403, + "learning_rate": 1.0515352998916527e-05, + "loss": 1.6994, + "step": 25937 + }, + { + "epoch": 7.961325966850829, + "grad_norm": 0.17102178931236267, + "learning_rate": 1.0512303744167778e-05, + "loss": 1.6571, + "step": 25938 + }, + { + "epoch": 7.961632903621854, + "grad_norm": 0.14453014731407166, + "learning_rate": 1.0509254879659569e-05, + "loss": 1.6725, + "step": 25939 + }, + { + "epoch": 7.961939840392879, + "grad_norm": 0.1980808526277542, + "learning_rate": 1.050620640542208e-05, + "loss": 1.6847, + "step": 25940 + }, + { + "epoch": 7.962246777163904, + "grad_norm": 0.15021857619285583, + "learning_rate": 1.0503158321485378e-05, + "loss": 1.6896, + "step": 25941 + }, + { + "epoch": 7.96255371393493, + "grad_norm": 0.2223394513130188, + "learning_rate": 1.0500110627879639e-05, + "loss": 1.7167, + "step": 25942 + }, + { + "epoch": 7.962860650705955, + "grad_norm": 0.17636358737945557, + "learning_rate": 1.0497063324634937e-05, + "loss": 1.6625, + "step": 25943 + }, + { + "epoch": 7.963167587476979, + "grad_norm": 0.1823662370443344, + "learning_rate": 
1.049401641178142e-05, + "loss": 1.7139, + "step": 25944 + }, + { + "epoch": 7.963474524248005, + "grad_norm": 0.1740594059228897, + "learning_rate": 1.0490969889349189e-05, + "loss": 1.7447, + "step": 25945 + }, + { + "epoch": 7.96378146101903, + "grad_norm": 0.15838129818439484, + "learning_rate": 1.0487923757368351e-05, + "loss": 1.7051, + "step": 25946 + }, + { + "epoch": 7.964088397790055, + "grad_norm": 0.4309011399745941, + "learning_rate": 1.0484878015869005e-05, + "loss": 1.7442, + "step": 25947 + }, + { + "epoch": 7.964395334561081, + "grad_norm": 0.17090202867984772, + "learning_rate": 1.0481832664881257e-05, + "loss": 1.652, + "step": 25948 + }, + { + "epoch": 7.964702271332106, + "grad_norm": 0.16977159678936005, + "learning_rate": 1.0478787704435206e-05, + "loss": 1.6894, + "step": 25949 + }, + { + "epoch": 7.9650092081031305, + "grad_norm": 0.20473513007164001, + "learning_rate": 1.0475743134560934e-05, + "loss": 1.8141, + "step": 25950 + }, + { + "epoch": 7.965316144874156, + "grad_norm": 0.1775660663843155, + "learning_rate": 1.0472698955288535e-05, + "loss": 1.7204, + "step": 25951 + }, + { + "epoch": 7.965623081645181, + "grad_norm": 0.21351923048496246, + "learning_rate": 1.046965516664809e-05, + "loss": 1.7364, + "step": 25952 + }, + { + "epoch": 7.9659300184162065, + "grad_norm": 0.2034255862236023, + "learning_rate": 1.0466611768669671e-05, + "loss": 1.7096, + "step": 25953 + }, + { + "epoch": 7.966236955187231, + "grad_norm": 0.17075900733470917, + "learning_rate": 1.0463568761383396e-05, + "loss": 1.6928, + "step": 25954 + }, + { + "epoch": 7.966543891958256, + "grad_norm": 0.18142712116241455, + "learning_rate": 1.0460526144819288e-05, + "loss": 1.7146, + "step": 25955 + }, + { + "epoch": 7.966850828729282, + "grad_norm": 0.14901846647262573, + "learning_rate": 1.0457483919007427e-05, + "loss": 1.6841, + "step": 25956 + }, + { + "epoch": 7.967157765500307, + "grad_norm": 0.17380031943321228, + "learning_rate": 1.0454442083977912e-05, + 
"loss": 1.6911, + "step": 25957 + }, + { + "epoch": 7.967464702271332, + "grad_norm": 0.15983760356903076, + "learning_rate": 1.045140063976075e-05, + "loss": 1.6866, + "step": 25958 + }, + { + "epoch": 7.967771639042358, + "grad_norm": 0.1559101641178131, + "learning_rate": 1.0448359586386058e-05, + "loss": 1.6793, + "step": 25959 + }, + { + "epoch": 7.968078575813382, + "grad_norm": 0.14843949675559998, + "learning_rate": 1.0445318923883829e-05, + "loss": 1.6835, + "step": 25960 + }, + { + "epoch": 7.968385512584407, + "grad_norm": 0.16452330350875854, + "learning_rate": 1.0442278652284155e-05, + "loss": 1.7304, + "step": 25961 + }, + { + "epoch": 7.968692449355433, + "grad_norm": 0.18997763097286224, + "learning_rate": 1.0439238771617066e-05, + "loss": 1.7425, + "step": 25962 + }, + { + "epoch": 7.968999386126458, + "grad_norm": 0.1654025912284851, + "learning_rate": 1.0436199281912611e-05, + "loss": 1.6909, + "step": 25963 + }, + { + "epoch": 7.969306322897483, + "grad_norm": 0.1313011646270752, + "learning_rate": 1.0433160183200823e-05, + "loss": 1.6572, + "step": 25964 + }, + { + "epoch": 7.969613259668508, + "grad_norm": 0.1584165096282959, + "learning_rate": 1.043012147551174e-05, + "loss": 1.7257, + "step": 25965 + }, + { + "epoch": 7.969920196439533, + "grad_norm": 0.17830775678157806, + "learning_rate": 1.0427083158875384e-05, + "loss": 1.7382, + "step": 25966 + }, + { + "epoch": 7.9702271332105585, + "grad_norm": 0.19006042182445526, + "learning_rate": 1.0424045233321788e-05, + "loss": 1.7366, + "step": 25967 + }, + { + "epoch": 7.970534069981584, + "grad_norm": 0.15366297960281372, + "learning_rate": 1.0421007698880974e-05, + "loss": 1.7235, + "step": 25968 + }, + { + "epoch": 7.970841006752609, + "grad_norm": 0.14415831863880157, + "learning_rate": 1.0417970555582963e-05, + "loss": 1.6945, + "step": 25969 + }, + { + "epoch": 7.9711479435236345, + "grad_norm": 0.16916446387767792, + "learning_rate": 1.041493380345775e-05, + "loss": 1.7099, + "step": 
25970 + }, + { + "epoch": 7.971454880294659, + "grad_norm": 0.1456119269132614, + "learning_rate": 1.041189744253539e-05, + "loss": 1.6544, + "step": 25971 + }, + { + "epoch": 7.971761817065684, + "grad_norm": 0.20085962116718292, + "learning_rate": 1.040886147284585e-05, + "loss": 1.699, + "step": 25972 + }, + { + "epoch": 7.97206875383671, + "grad_norm": 0.1815454363822937, + "learning_rate": 1.0405825894419141e-05, + "loss": 1.7503, + "step": 25973 + }, + { + "epoch": 7.972375690607735, + "grad_norm": 0.2010805308818817, + "learning_rate": 1.040279070728527e-05, + "loss": 1.7061, + "step": 25974 + }, + { + "epoch": 7.97268262737876, + "grad_norm": 0.22105813026428223, + "learning_rate": 1.0399755911474218e-05, + "loss": 1.7262, + "step": 25975 + }, + { + "epoch": 7.972989564149785, + "grad_norm": 0.16186046600341797, + "learning_rate": 1.0396721507016017e-05, + "loss": 1.7229, + "step": 25976 + }, + { + "epoch": 7.97329650092081, + "grad_norm": 0.19990484416484833, + "learning_rate": 1.0393687493940597e-05, + "loss": 1.7006, + "step": 25977 + }, + { + "epoch": 7.973603437691835, + "grad_norm": 0.2377716600894928, + "learning_rate": 1.0390653872277983e-05, + "loss": 1.7302, + "step": 25978 + }, + { + "epoch": 7.973910374462861, + "grad_norm": 0.14087189733982086, + "learning_rate": 1.0387620642058148e-05, + "loss": 1.6563, + "step": 25979 + }, + { + "epoch": 7.974217311233886, + "grad_norm": 0.246252179145813, + "learning_rate": 1.0384587803311063e-05, + "loss": 1.6661, + "step": 25980 + }, + { + "epoch": 7.974524248004911, + "grad_norm": 0.18734396994113922, + "learning_rate": 1.0381555356066697e-05, + "loss": 1.7566, + "step": 25981 + }, + { + "epoch": 7.974831184775936, + "grad_norm": 0.1621570736169815, + "learning_rate": 1.0378523300355025e-05, + "loss": 1.6863, + "step": 25982 + }, + { + "epoch": 7.975138121546961, + "grad_norm": 0.2571845054626465, + "learning_rate": 1.0375491636206002e-05, + "loss": 1.7589, + "step": 25983 + }, + { + "epoch": 
7.975445058317987, + "grad_norm": 0.1880367249250412, + "learning_rate": 1.0372460363649606e-05, + "loss": 1.6999, + "step": 25984 + }, + { + "epoch": 7.975751995089012, + "grad_norm": 0.20473778247833252, + "learning_rate": 1.0369429482715776e-05, + "loss": 1.749, + "step": 25985 + }, + { + "epoch": 7.976058931860036, + "grad_norm": 0.19917427003383636, + "learning_rate": 1.0366398993434473e-05, + "loss": 1.701, + "step": 25986 + }, + { + "epoch": 7.976365868631062, + "grad_norm": 0.1758740097284317, + "learning_rate": 1.0363368895835635e-05, + "loss": 1.6774, + "step": 25987 + }, + { + "epoch": 7.976672805402087, + "grad_norm": 0.26412737369537354, + "learning_rate": 1.0360339189949242e-05, + "loss": 1.6778, + "step": 25988 + }, + { + "epoch": 7.976979742173112, + "grad_norm": 0.19599425792694092, + "learning_rate": 1.0357309875805194e-05, + "loss": 1.777, + "step": 25989 + }, + { + "epoch": 7.977286678944138, + "grad_norm": 0.2095821648836136, + "learning_rate": 1.0354280953433449e-05, + "loss": 1.7106, + "step": 25990 + }, + { + "epoch": 7.977593615715163, + "grad_norm": 0.1743748039007187, + "learning_rate": 1.0351252422863934e-05, + "loss": 1.6891, + "step": 25991 + }, + { + "epoch": 7.9779005524861875, + "grad_norm": 0.17273737490177155, + "learning_rate": 1.0348224284126573e-05, + "loss": 1.7254, + "step": 25992 + }, + { + "epoch": 7.978207489257213, + "grad_norm": 0.2032385915517807, + "learning_rate": 1.0345196537251322e-05, + "loss": 1.707, + "step": 25993 + }, + { + "epoch": 7.978514426028238, + "grad_norm": 0.17978399991989136, + "learning_rate": 1.0342169182268057e-05, + "loss": 1.695, + "step": 25994 + }, + { + "epoch": 7.9788213627992635, + "grad_norm": 0.20567134022712708, + "learning_rate": 1.0339142219206744e-05, + "loss": 1.6726, + "step": 25995 + }, + { + "epoch": 7.979128299570289, + "grad_norm": 0.19649706780910492, + "learning_rate": 1.033611564809725e-05, + "loss": 1.737, + "step": 25996 + }, + { + "epoch": 7.979435236341313, + "grad_norm": 
0.1640859991312027, + "learning_rate": 1.033308946896952e-05, + "loss": 1.6993, + "step": 25997 + }, + { + "epoch": 7.979742173112339, + "grad_norm": 0.21497343480587006, + "learning_rate": 1.0330063681853452e-05, + "loss": 1.7387, + "step": 25998 + }, + { + "epoch": 7.980049109883364, + "grad_norm": 0.14995479583740234, + "learning_rate": 1.0327038286778946e-05, + "loss": 1.6671, + "step": 25999 + }, + { + "epoch": 7.980356046654389, + "grad_norm": 0.1836833655834198, + "learning_rate": 1.0324013283775895e-05, + "loss": 1.7279, + "step": 26000 + }, + { + "epoch": 7.980662983425415, + "grad_norm": 0.14769285917282104, + "learning_rate": 1.032098867287421e-05, + "loss": 1.707, + "step": 26001 + }, + { + "epoch": 7.98096992019644, + "grad_norm": 0.24206426739692688, + "learning_rate": 1.0317964454103762e-05, + "loss": 1.8122, + "step": 26002 + }, + { + "epoch": 7.981276856967464, + "grad_norm": 0.16573204100131989, + "learning_rate": 1.0314940627494451e-05, + "loss": 1.7079, + "step": 26003 + }, + { + "epoch": 7.98158379373849, + "grad_norm": 0.1825968325138092, + "learning_rate": 1.0311917193076143e-05, + "loss": 1.6795, + "step": 26004 + }, + { + "epoch": 7.981890730509515, + "grad_norm": 0.14462140202522278, + "learning_rate": 1.0308894150878761e-05, + "loss": 1.7152, + "step": 26005 + }, + { + "epoch": 7.98219766728054, + "grad_norm": 0.15220513939857483, + "learning_rate": 1.0305871500932135e-05, + "loss": 1.6657, + "step": 26006 + }, + { + "epoch": 7.982504604051566, + "grad_norm": 0.17780731618404388, + "learning_rate": 1.030284924326615e-05, + "loss": 1.6852, + "step": 26007 + }, + { + "epoch": 7.98281154082259, + "grad_norm": 0.13492488861083984, + "learning_rate": 1.0299827377910681e-05, + "loss": 1.6331, + "step": 26008 + }, + { + "epoch": 7.9831184775936155, + "grad_norm": 0.1566525399684906, + "learning_rate": 1.0296805904895568e-05, + "loss": 1.6918, + "step": 26009 + }, + { + "epoch": 7.983425414364641, + "grad_norm": 0.17075398564338684, + 
"learning_rate": 1.0293784824250725e-05, + "loss": 1.7107, + "step": 26010 + }, + { + "epoch": 7.983732351135666, + "grad_norm": 0.16693715751171112, + "learning_rate": 1.0290764136005937e-05, + "loss": 1.6773, + "step": 26011 + }, + { + "epoch": 7.9840392879066915, + "grad_norm": 0.23020583391189575, + "learning_rate": 1.0287743840191122e-05, + "loss": 1.7389, + "step": 26012 + }, + { + "epoch": 7.984346224677717, + "grad_norm": 0.2185986489057541, + "learning_rate": 1.0284723936836071e-05, + "loss": 1.7039, + "step": 26013 + }, + { + "epoch": 7.984653161448741, + "grad_norm": 0.1527925282716751, + "learning_rate": 1.0281704425970673e-05, + "loss": 1.6981, + "step": 26014 + }, + { + "epoch": 7.984960098219767, + "grad_norm": 0.23389141261577606, + "learning_rate": 1.0278685307624747e-05, + "loss": 1.7511, + "step": 26015 + }, + { + "epoch": 7.985267034990792, + "grad_norm": 0.1481025218963623, + "learning_rate": 1.0275666581828137e-05, + "loss": 1.6551, + "step": 26016 + }, + { + "epoch": 7.985573971761817, + "grad_norm": 0.18131811916828156, + "learning_rate": 1.0272648248610672e-05, + "loss": 1.7024, + "step": 26017 + }, + { + "epoch": 7.985880908532843, + "grad_norm": 0.15969321131706238, + "learning_rate": 1.0269630308002182e-05, + "loss": 1.7269, + "step": 26018 + }, + { + "epoch": 7.986187845303867, + "grad_norm": 0.16655376553535461, + "learning_rate": 1.026661276003249e-05, + "loss": 1.6649, + "step": 26019 + }, + { + "epoch": 7.986494782074892, + "grad_norm": 0.16438528895378113, + "learning_rate": 1.0263595604731425e-05, + "loss": 1.6901, + "step": 26020 + }, + { + "epoch": 7.986801718845918, + "grad_norm": 0.23586809635162354, + "learning_rate": 1.0260578842128782e-05, + "loss": 1.7983, + "step": 26021 + }, + { + "epoch": 7.987108655616943, + "grad_norm": 0.15142324566841125, + "learning_rate": 1.0257562472254417e-05, + "loss": 1.6327, + "step": 26022 + }, + { + "epoch": 7.987415592387968, + "grad_norm": 0.17198510468006134, + "learning_rate": 
1.0254546495138096e-05, + "loss": 1.7119, + "step": 26023 + }, + { + "epoch": 7.987722529158994, + "grad_norm": 0.1675531417131424, + "learning_rate": 1.0251530910809648e-05, + "loss": 1.695, + "step": 26024 + }, + { + "epoch": 7.988029465930018, + "grad_norm": 0.17403315007686615, + "learning_rate": 1.0248515719298867e-05, + "loss": 1.7216, + "step": 26025 + }, + { + "epoch": 7.9883364027010435, + "grad_norm": 0.16039720177650452, + "learning_rate": 1.0245500920635537e-05, + "loss": 1.7315, + "step": 26026 + }, + { + "epoch": 7.988643339472069, + "grad_norm": 0.19715416431427002, + "learning_rate": 1.0242486514849498e-05, + "loss": 1.7308, + "step": 26027 + }, + { + "epoch": 7.988950276243094, + "grad_norm": 0.14576783776283264, + "learning_rate": 1.0239472501970482e-05, + "loss": 1.6589, + "step": 26028 + }, + { + "epoch": 7.989257213014119, + "grad_norm": 0.1631615310907364, + "learning_rate": 1.0236458882028333e-05, + "loss": 1.7494, + "step": 26029 + }, + { + "epoch": 7.989564149785144, + "grad_norm": 0.19368192553520203, + "learning_rate": 1.023344565505277e-05, + "loss": 1.735, + "step": 26030 + }, + { + "epoch": 7.989871086556169, + "grad_norm": 0.1902317851781845, + "learning_rate": 1.023043282107362e-05, + "loss": 1.7573, + "step": 26031 + }, + { + "epoch": 7.990178023327195, + "grad_norm": 0.18496233224868774, + "learning_rate": 1.0227420380120651e-05, + "loss": 1.7368, + "step": 26032 + }, + { + "epoch": 7.99048496009822, + "grad_norm": 0.172613263130188, + "learning_rate": 1.0224408332223617e-05, + "loss": 1.6943, + "step": 26033 + }, + { + "epoch": 7.990791896869245, + "grad_norm": 0.19840112328529358, + "learning_rate": 1.0221396677412293e-05, + "loss": 1.7562, + "step": 26034 + }, + { + "epoch": 7.99109883364027, + "grad_norm": 0.18129339814186096, + "learning_rate": 1.0218385415716441e-05, + "loss": 1.6746, + "step": 26035 + }, + { + "epoch": 7.991405770411295, + "grad_norm": 0.17933470010757446, + "learning_rate": 1.021537454716583e-05, + "loss": 
1.7324, + "step": 26036 + }, + { + "epoch": 7.99171270718232, + "grad_norm": 0.14947326481342316, + "learning_rate": 1.0212364071790198e-05, + "loss": 1.632, + "step": 26037 + }, + { + "epoch": 7.992019643953346, + "grad_norm": 0.18452878296375275, + "learning_rate": 1.0209353989619291e-05, + "loss": 1.6737, + "step": 26038 + }, + { + "epoch": 7.992326580724371, + "grad_norm": 0.18882198631763458, + "learning_rate": 1.0206344300682901e-05, + "loss": 1.7529, + "step": 26039 + }, + { + "epoch": 7.9926335174953955, + "grad_norm": 0.1855655312538147, + "learning_rate": 1.0203335005010722e-05, + "loss": 1.7347, + "step": 26040 + }, + { + "epoch": 7.992940454266421, + "grad_norm": 0.16447728872299194, + "learning_rate": 1.0200326102632518e-05, + "loss": 1.6659, + "step": 26041 + }, + { + "epoch": 7.993247391037446, + "grad_norm": 0.17379891872406006, + "learning_rate": 1.0197317593578016e-05, + "loss": 1.6962, + "step": 26042 + }, + { + "epoch": 7.9935543278084715, + "grad_norm": 0.16298875212669373, + "learning_rate": 1.0194309477876934e-05, + "loss": 1.6815, + "step": 26043 + }, + { + "epoch": 7.993861264579497, + "grad_norm": 0.1883227378129959, + "learning_rate": 1.0191301755559047e-05, + "loss": 1.7053, + "step": 26044 + }, + { + "epoch": 7.994168201350522, + "grad_norm": 0.20746919512748718, + "learning_rate": 1.0188294426654021e-05, + "loss": 1.7476, + "step": 26045 + }, + { + "epoch": 7.994475138121547, + "grad_norm": 0.1882137805223465, + "learning_rate": 1.0185287491191631e-05, + "loss": 1.7078, + "step": 26046 + }, + { + "epoch": 7.994782074892572, + "grad_norm": 0.21140792965888977, + "learning_rate": 1.0182280949201539e-05, + "loss": 1.7729, + "step": 26047 + }, + { + "epoch": 7.995089011663597, + "grad_norm": 0.18779736757278442, + "learning_rate": 1.0179274800713501e-05, + "loss": 1.7413, + "step": 26048 + }, + { + "epoch": 7.995395948434623, + "grad_norm": 0.1841782033443451, + "learning_rate": 1.0176269045757202e-05, + "loss": 1.7058, + "step": 26049 + 
}, + { + "epoch": 7.995702885205648, + "grad_norm": 0.19872064888477325, + "learning_rate": 1.017326368436236e-05, + "loss": 1.7522, + "step": 26050 + }, + { + "epoch": 7.996009821976672, + "grad_norm": 0.1763429492712021, + "learning_rate": 1.0170258716558667e-05, + "loss": 1.7178, + "step": 26051 + }, + { + "epoch": 7.996316758747698, + "grad_norm": 0.20209169387817383, + "learning_rate": 1.0167254142375826e-05, + "loss": 1.723, + "step": 26052 + }, + { + "epoch": 7.996623695518723, + "grad_norm": 0.15985172986984253, + "learning_rate": 1.0164249961843519e-05, + "loss": 1.6985, + "step": 26053 + }, + { + "epoch": 7.996930632289748, + "grad_norm": 0.1985132247209549, + "learning_rate": 1.0161246174991451e-05, + "loss": 1.7982, + "step": 26054 + }, + { + "epoch": 7.997237569060774, + "grad_norm": 0.17600803077220917, + "learning_rate": 1.0158242781849292e-05, + "loss": 1.7009, + "step": 26055 + }, + { + "epoch": 7.997544505831799, + "grad_norm": 0.15485480427742004, + "learning_rate": 1.015523978244673e-05, + "loss": 1.675, + "step": 26056 + }, + { + "epoch": 7.9978514426028235, + "grad_norm": 0.18465322256088257, + "learning_rate": 1.0152237176813446e-05, + "loss": 1.7156, + "step": 26057 + }, + { + "epoch": 7.998158379373849, + "grad_norm": 0.2183876633644104, + "learning_rate": 1.014923496497911e-05, + "loss": 1.7805, + "step": 26058 + }, + { + "epoch": 7.998465316144874, + "grad_norm": 0.18724960088729858, + "learning_rate": 1.014623314697339e-05, + "loss": 1.7047, + "step": 26059 + }, + { + "epoch": 7.9987722529158995, + "grad_norm": 0.15459159016609192, + "learning_rate": 1.0143231722825936e-05, + "loss": 1.6595, + "step": 26060 + }, + { + "epoch": 7.999079189686924, + "grad_norm": 0.16338171064853668, + "learning_rate": 1.0140230692566454e-05, + "loss": 1.6907, + "step": 26061 + }, + { + "epoch": 7.999386126457949, + "grad_norm": 0.16223935782909393, + "learning_rate": 1.013723005622455e-05, + "loss": 1.6866, + "step": 26062 + }, + { + "epoch": 
7.999693063228975, + "grad_norm": 0.18934771418571472, + "learning_rate": 1.0134229813829931e-05, + "loss": 1.706, + "step": 26063 + }, + { + "epoch": 8.0, + "grad_norm": 0.19117574393749237, + "learning_rate": 1.0131229965412191e-05, + "loss": 1.7392, + "step": 26064 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.706030188146333e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-26064/training_args.bin b/checkpoint-26064/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db7ad91da5423a229826113feb3e9db3ef40c31 --- /dev/null +++ b/checkpoint-26064/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682b697e933b6e2693e5f9af9a0654effab1ca392c8500bf8af0eb089116a263 +size 7288 diff --git a/checkpoint-26064/zero_to_fp32.py b/checkpoint-26064/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-26064/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-29322/config.json b/checkpoint-29322/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-29322/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-29322/generation_config.json b/checkpoint-29322/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-29322/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-29322/latest b/checkpoint-29322/latest new file mode 100644 index 0000000000000000000000000000000000000000..f1ad1f94884ec552257d6ca8dc30775b67e37af5 --- /dev/null +++ b/checkpoint-29322/latest @@ -0,0 +1 @@ +global_step29322 \ No newline at end of file diff --git a/checkpoint-29322/model-00001-of-00003.safetensors b/checkpoint-29322/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..96f2959c3414c2e3e98b007f7394f89870ef063c --- /dev/null +++ b/checkpoint-29322/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec87b75ef3e511a62b16dddbd005d9b768aec82d0d4f0947d8e638b9e39ccd1 +size 4955415870 diff --git a/checkpoint-29322/model-00002-of-00003.safetensors b/checkpoint-29322/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-29322/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d 
+size 4971563008 diff --git a/checkpoint-29322/model-00003-of-00003.safetensors b/checkpoint-29322/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3f3a35ef6b0516414d32d180b5e99337b93b93a0 --- /dev/null +++ b/checkpoint-29322/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22ff86b3e4325f0087c8523875edd02eec0da43f4126b14d3800b221b8c0a4ca +size 4180840856 diff --git a/checkpoint-29322/model.safetensors.index.json b/checkpoint-29322/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-29322/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-29322/rng_state_0.pth b/checkpoint-29322/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..586ada19879c8e5bfb531243769aacf12a10b59f --- /dev/null +++ b/checkpoint-29322/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426839dc43a2348d6701d753c63b5de5c7151ec6e18123f0c6b621343110507f +size 15984 diff --git a/checkpoint-29322/rng_state_1.pth b/checkpoint-29322/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7921c4f9cb7917c074b030fd06d954703e951c76 --- /dev/null +++ b/checkpoint-29322/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27c7d5dab66416dddd85b92278d21b04107dda0a5fcc19080193b0985d0f07d7 +size 15984 diff --git a/checkpoint-29322/rng_state_10.pth b/checkpoint-29322/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..f393eb62e4d7f85fc32e1ad02b62b46ba95097f2 --- /dev/null +++ b/checkpoint-29322/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:367f2dfb4b40a7fffd3c25830c699f2c2a6664fc2deac32ba4f87d394d46df3c +size 15997 diff --git a/checkpoint-29322/rng_state_11.pth b/checkpoint-29322/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..cdaa4a1518a560e15db9a9af74b03a7105f2d403 --- /dev/null +++ b/checkpoint-29322/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41696ea4a7618d698a0750d131899d7e6dac262fa73c1f2ddee412a744f49f0a +size 15997 diff --git a/checkpoint-29322/rng_state_12.pth b/checkpoint-29322/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..53e5cc15d152b0acd35d0e510848f8fad69a8f45 --- /dev/null +++ b/checkpoint-29322/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d5a3277ed918f26e95ab3847465cd3c0e8efb2fb7fba757ed27c984173bbb72 +size 15997 diff --git a/checkpoint-29322/rng_state_13.pth b/checkpoint-29322/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..596ca3cb4cc17ec74fd722b3e652995346e9c4c4 --- /dev/null +++ b/checkpoint-29322/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef6628714a757cf49a462ec382daacae849ea5b707307985349fa05e9452d6d2 +size 15997 diff --git a/checkpoint-29322/rng_state_14.pth b/checkpoint-29322/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf3f7ae607d21715d5ce2634a9078f0aff327f4b --- /dev/null +++ b/checkpoint-29322/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38a84757da67c3f8e393f6ade156f1180899e49cc6461a1150f2e2a400d77956 +size 15997 diff --git a/checkpoint-29322/rng_state_15.pth b/checkpoint-29322/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..db4f8a33baecc0a0b8ae19999b1c815c403d660e --- /dev/null +++ b/checkpoint-29322/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:53cfa2de0cee12c8ffc427b71d9eb77391878f68726e847e76f6963e250384f2 +size 15997 diff --git a/checkpoint-29322/rng_state_16.pth b/checkpoint-29322/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..8bad095076516d69db716f2cd9f878a8b3fd1cee --- /dev/null +++ b/checkpoint-29322/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94030f49066980e741aa1e7537262400d8b05a3d9b02233c294721c501e4f177 +size 15997 diff --git a/checkpoint-29322/rng_state_17.pth b/checkpoint-29322/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..50f37f4baa2002169fc8c89f739b2e20894ab1a6 --- /dev/null +++ b/checkpoint-29322/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d3dd9c52973c5709f1b77df92be90c212b53dc61c47bce82dcfdb2413f0420e +size 15997 diff --git a/checkpoint-29322/rng_state_18.pth b/checkpoint-29322/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..08a64629b27bf93bcbd0a53798e1e0f932504f00 --- /dev/null +++ b/checkpoint-29322/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f94d74e040b44695db4aac20f7a8a9dbc09db3338e89346c77e78fc9bf9e1cb +size 15997 diff --git a/checkpoint-29322/rng_state_19.pth b/checkpoint-29322/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..99758d47cccbc466ce935193ef26c7a55fdfc9e1 --- /dev/null +++ b/checkpoint-29322/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9141875aaf965b4137b83c9446520c138b748f7ea31e3296eb99521a05efca7d +size 15997 diff --git a/checkpoint-29322/rng_state_2.pth b/checkpoint-29322/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..43632a4c1a9855d64ecfcab2c6bcd46e1ce5c1b4 --- /dev/null +++ b/checkpoint-29322/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a74653c5984ca08b3a66d50b733c0c6c198c2a48ed9f7b71035fa64d328673e5 +size 15984 diff --git a/checkpoint-29322/rng_state_20.pth b/checkpoint-29322/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..685f696b0aa253016cb2c91c4565a6e7e7b0cf82 --- /dev/null +++ b/checkpoint-29322/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c244d5eebd2565e89728410e292845f556e45558208c48928547acf0d519da9e +size 15997 diff --git a/checkpoint-29322/rng_state_21.pth b/checkpoint-29322/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f636ae4dbdef3b38bba72592bed0ed727ba0e51 --- /dev/null +++ b/checkpoint-29322/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:278b5f8e1e018ae2c9917a5658f0296f08bb0fefa3d898d5776cfbbb93ad970b +size 15997 diff --git a/checkpoint-29322/rng_state_22.pth b/checkpoint-29322/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..d59dce19e4b29b824ce2ff599818a133384ed587 --- /dev/null +++ b/checkpoint-29322/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef5d13e90b4fc992bb1ccea723b7d90856c9444edaa4b5f227ba02f442e1a018 +size 15997 diff --git a/checkpoint-29322/rng_state_23.pth b/checkpoint-29322/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..5dfdf2ac87a653729bee3d32bc5da6979ea7f931 --- /dev/null +++ b/checkpoint-29322/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42200c85aada2ecfece7722ad202b6d2c509e37e35aba29c7a78912f21f0f8c7 +size 15997 diff --git a/checkpoint-29322/rng_state_24.pth b/checkpoint-29322/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..21490ae92a006af61046082c03b953e40187e818 --- /dev/null +++ b/checkpoint-29322/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:757a5ce7c63985a7c7b0c4bb374264a0019c2ed7228dfeb7319aa08e2642bb25 +size 15997 diff --git a/checkpoint-29322/rng_state_25.pth b/checkpoint-29322/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..898da4b9480ac15a8e0e6440d6d741598730e519 --- /dev/null +++ b/checkpoint-29322/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03782f5ee63dd383e8db3f67aa404695d09c3d575bd840a9288403fedbb4d6a2 +size 15997 diff --git a/checkpoint-29322/rng_state_26.pth b/checkpoint-29322/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..431f5ae8f77f91413d8ed4ba2b60adac9eb1ce92 --- /dev/null +++ b/checkpoint-29322/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1ecd30f1a2d3f88a1e02ae000b37ad56205f22c341060bfd7cf15bf0485807 +size 15997 diff --git a/checkpoint-29322/rng_state_27.pth b/checkpoint-29322/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..3d9ff5398345ab9a9d54b8a2d055925a54ddc44f --- /dev/null +++ b/checkpoint-29322/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a524892a3ad365738b6aba5c80b9655a966f51aa559d6edbbf66ba35da09326 +size 15997 diff --git a/checkpoint-29322/rng_state_28.pth b/checkpoint-29322/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..99683511724cee47c99f9c8bcde9eac49c54afe0 --- /dev/null +++ b/checkpoint-29322/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5950f6d024cb705aad8430dff8cfd8442e97c5c7a0d48fcfed46d2a34c366f9a +size 15997 diff --git a/checkpoint-29322/rng_state_29.pth b/checkpoint-29322/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..507a91be297bd8ddde1c18d609f506716a3e0992 --- /dev/null +++ b/checkpoint-29322/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bb6e66f95dfc7966f5285f9f25f73aa2c6f7ed8c7061ff3cfdd0c355f8cf937f +size 15997 diff --git a/checkpoint-29322/rng_state_3.pth b/checkpoint-29322/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..39b4a3526db16b3b6b2076d5eaca36b0234001c6 --- /dev/null +++ b/checkpoint-29322/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71392f6d8b3e07e0b0310db7d9d7b5d624133659ab3a5086d4d82eda3a04222b +size 15984 diff --git a/checkpoint-29322/rng_state_30.pth b/checkpoint-29322/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e4b5c2d8a39b24b78dbac4c5f6305b5d9c5f00d --- /dev/null +++ b/checkpoint-29322/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c593e01a56f5ff5da813f502bff4d4960e6f5dbd67ee6d78c8f198a994e832 +size 15997 diff --git a/checkpoint-29322/rng_state_31.pth b/checkpoint-29322/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..65b32a55ccf1e2afad57a9d38617a4c8544671c6 --- /dev/null +++ b/checkpoint-29322/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:917efa1e33700ab456633dc1d7c1ea69a1a8df6c32a12b371263a7b493809479 +size 15997 diff --git a/checkpoint-29322/rng_state_32.pth b/checkpoint-29322/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..f51f941f21314e8d0945ae34a2932462f9acb2b2 --- /dev/null +++ b/checkpoint-29322/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9eb5d4b3c57b8228fbe1214f8cfe9f0122620ec96f27b9fc7cb8f5bc393b21b +size 15997 diff --git a/checkpoint-29322/rng_state_33.pth b/checkpoint-29322/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..51d1fe68239527a993d6b1ffba2f8f4a7331130c --- /dev/null +++ b/checkpoint-29322/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0e49f60f4c7bda2f4d8c80ccfde55bf701b6be122f8d935136fdf6a8d46e0fe4 +size 15997 diff --git a/checkpoint-29322/rng_state_34.pth b/checkpoint-29322/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ad6b217436e044a9db82837aec6c66e4bb87f9a --- /dev/null +++ b/checkpoint-29322/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d85adce830725bc2760d2fb96a440f9240af6bb17077b4f659ef5e2e56dc726 +size 15997 diff --git a/checkpoint-29322/rng_state_35.pth b/checkpoint-29322/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..697eab06b81696467ebe96b397163ab4d3200888 --- /dev/null +++ b/checkpoint-29322/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef7a3a0ad3a14538ccb390785d5947f8dcb10dfaae6dfc52bb523c4a04256c1c +size 15997 diff --git a/checkpoint-29322/rng_state_36.pth b/checkpoint-29322/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..95796d122ada3fd3177cbb57cde335df1b222d0b --- /dev/null +++ b/checkpoint-29322/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12a6d23dde2b8c4d41431e7744e619523dde1eb63d6ec0463ee12f3669e06ef6 +size 15997 diff --git a/checkpoint-29322/rng_state_37.pth b/checkpoint-29322/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..dddd43b59a30d3435355d1dc7a67226cbf094d79 --- /dev/null +++ b/checkpoint-29322/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eed90822f970878d10681d3d8e6c7ed487ddf5bfcf14a154b312496221b2d29 +size 15997 diff --git a/checkpoint-29322/rng_state_38.pth b/checkpoint-29322/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..89a9cad018d033314b7f4b7a43bfd16b42d58b6b --- /dev/null +++ b/checkpoint-29322/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8aabc82223fd9ce0e491c619222cee3f6232086a84ca452af0727abe5efbeed7 +size 15997 diff --git a/checkpoint-29322/rng_state_39.pth b/checkpoint-29322/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..3d3baf2b4938aabd572b52aaf1ea20fff565809a --- /dev/null +++ b/checkpoint-29322/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7b1b4c08f9292c9fe024634bfa99685d9984ee19152e5328031d2f894ef2975 +size 15997 diff --git a/checkpoint-29322/rng_state_4.pth b/checkpoint-29322/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c00e92e42d35831240028b8ce2fab974d387a569 --- /dev/null +++ b/checkpoint-29322/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be5778a2119fdb9e682e12c015580ac4b1bd10ac0fefb2569cb990a7167862a7 +size 15984 diff --git a/checkpoint-29322/rng_state_40.pth b/checkpoint-29322/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..c0e59b0812e7b1d6eb1de32fc66a3397b653aa75 --- /dev/null +++ b/checkpoint-29322/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3618ce7f27bb1405926236d7c15692986967531ac60a6c9e611b689281d4e84f +size 15997 diff --git a/checkpoint-29322/rng_state_41.pth b/checkpoint-29322/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..06e6334bf083ece31a32819820ab6d61756f82c2 --- /dev/null +++ b/checkpoint-29322/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:392fed3765ecdf22e6cb850fd21cb1492d818e3ae93297445471061cdeb22325 +size 15997 diff --git a/checkpoint-29322/rng_state_42.pth b/checkpoint-29322/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..903d747135a1c73c77207a0c84aabc6165f13060 --- /dev/null +++ b/checkpoint-29322/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:af26eeab5bead3c84f606f4d842805eb663b0713e792cd5f296344b109269edf +size 15997 diff --git a/checkpoint-29322/rng_state_43.pth b/checkpoint-29322/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..c965ab154875a7a4cab5ed524e6b68763d0ada88 --- /dev/null +++ b/checkpoint-29322/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a67a06ec59932813ed5dd5c030076cc4b01316f24cf666a2b0d5d10cd79a1f3 +size 15997 diff --git a/checkpoint-29322/rng_state_44.pth b/checkpoint-29322/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dd983aa7fb42ec5c4d191cad1ee1d293113e0f7 --- /dev/null +++ b/checkpoint-29322/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eee7b9a598873c58547f2aa0137f10f3ab7c2c435d49f59bfdb0c8062f035a8 +size 15997 diff --git a/checkpoint-29322/rng_state_45.pth b/checkpoint-29322/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..5ba44db66330a3df5e58cc379bd911ea48bbd562 --- /dev/null +++ b/checkpoint-29322/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d829236abe2199fbf4f43f14a721fc79f35807d8743a5b595404030d0f0d40f5 +size 15997 diff --git a/checkpoint-29322/rng_state_46.pth b/checkpoint-29322/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f17979cd7c692f389e0d5ce9453307f4fa1616 --- /dev/null +++ b/checkpoint-29322/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb498b8816b1f3a1ad415e3ce5ef5de10059029312727a37839c99a0259196de +size 15997 diff --git a/checkpoint-29322/rng_state_47.pth b/checkpoint-29322/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f1572a0702cf1fd102e045b8d91106160b6685b --- /dev/null +++ b/checkpoint-29322/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c70a92c8f8f9894540cd83faf002f335f671dc0a2ebb9d553ffd1481263aa535 +size 15997 diff --git a/checkpoint-29322/rng_state_48.pth b/checkpoint-29322/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..73301e4fbb1c76086b1a295424a0774d8fab5862 --- /dev/null +++ b/checkpoint-29322/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95b780e770088f56672e8b17b077f53e135ab554cfaa7a00bc62b9c26581be40 +size 15997 diff --git a/checkpoint-29322/rng_state_49.pth b/checkpoint-29322/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..f05b9943deaeb0c3634e1224b2a1db50cbfe2983 --- /dev/null +++ b/checkpoint-29322/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb04a39dc365e7161dc1fe4c5f23306b9c4e0f95615c486426e2bb306e6ac20 +size 15997 diff --git a/checkpoint-29322/rng_state_5.pth b/checkpoint-29322/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c4e40e2a55b1ac0f3b121f7c995f767eec3f4d53 --- /dev/null +++ b/checkpoint-29322/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:354e7d08f632113a03996c4628dd6977547e2415a2ab1e4222cf9d01b9894c51 +size 15984 diff --git a/checkpoint-29322/rng_state_50.pth b/checkpoint-29322/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..42bcc324de1f7be79d3a9b42b625de87aac670c0 --- /dev/null +++ b/checkpoint-29322/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e2cfff60d464b6b6dd3f1dcbad2aea3947069dd333ff8c2d6a10a480408cd4a +size 15997 diff --git a/checkpoint-29322/rng_state_51.pth b/checkpoint-29322/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c7deb20c5a4637c0cfaf33d4f416401e9bb2c48 --- /dev/null +++ b/checkpoint-29322/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:319dd657c6238849f7a9b8c32a13448413eb7465e3e5d1e29eadaf3a06157987 +size 15997 diff --git a/checkpoint-29322/rng_state_52.pth b/checkpoint-29322/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..76d1980c6704ef13fec06c87967151697ca2cc66 --- /dev/null +++ b/checkpoint-29322/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:420c344e4facf6010ff7db01f77403b5d3f3e101e961c839f57f8a1262c46d35 +size 15997 diff --git a/checkpoint-29322/rng_state_53.pth b/checkpoint-29322/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e8770df8e28206698a955096c3b32f6424887d0 --- /dev/null +++ b/checkpoint-29322/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:560fb850582816920911b8d29da5706a4469037900872647e24cebfe35f019f2 +size 15997 diff --git a/checkpoint-29322/rng_state_54.pth b/checkpoint-29322/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..f5fd3b2d2635a131321ec91faf54460a0c661ba3 --- /dev/null +++ b/checkpoint-29322/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bcf8c1fdaf526b31560890b3a1d63a3b32b056f06d08c1f994ab082ca076a00 +size 15997 diff --git a/checkpoint-29322/rng_state_55.pth b/checkpoint-29322/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..55e321c069590af9585803c0874982bd3594c7d5 --- /dev/null +++ b/checkpoint-29322/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df13a56f4e8d289ceab6d23a2738e5e86789e8645a30e2ca99728606923f1de2 +size 15997 diff --git a/checkpoint-29322/rng_state_56.pth b/checkpoint-29322/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea8659eb8fe3ea0bf9bbc8bea4b031289fae0564 --- /dev/null +++ b/checkpoint-29322/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9357ad739e9ea67ee396e05e27e929a9d0cb9e1906d5f0f086f1cb4a801eec67 +size 15997 diff --git a/checkpoint-29322/rng_state_57.pth b/checkpoint-29322/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..26618a83f6390f06e8eba7508eb7d093349cbe67 --- /dev/null +++ b/checkpoint-29322/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ea40af452e0d7d430be90ec6978d4819ceb576ac6c0650ba60800400571c638 +size 15997 diff --git a/checkpoint-29322/rng_state_58.pth b/checkpoint-29322/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..53416d56ccfe26a3f190ea65017bb8722563bcf7 --- /dev/null +++ b/checkpoint-29322/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c51b918e9fa3d45bae8c23ae1a485d1485bc1f221a8733f511693ae3c43e7c +size 15997 diff --git a/checkpoint-29322/rng_state_59.pth b/checkpoint-29322/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..efbde27cbb262d04847e75aa499a34381b2a077d --- /dev/null +++ b/checkpoint-29322/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4addfe6654ac31dca0ee7af59aef27ec180add66497fbaff0443cfb11df86827 +size 15997 diff --git a/checkpoint-29322/rng_state_6.pth b/checkpoint-29322/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..df0ebe9f8f79d151e49bdb34628fa866394c7436 --- /dev/null +++ b/checkpoint-29322/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a292c5529381d3460a7518ebb1f15c3e95ed2ffcb188c745aba169ea3acce43f +size 15984 diff --git a/checkpoint-29322/rng_state_60.pth b/checkpoint-29322/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..e57df6d7f4304809aab0e76e6ec153ad20220fc1 --- /dev/null +++ b/checkpoint-29322/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bcc5e02d7055b295f883921bcb7482843862059f4e5cd23e10c3937f2cba64a7 +size 15997 diff --git a/checkpoint-29322/rng_state_61.pth b/checkpoint-29322/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1e4bc75be15d8a5f9fd728f9fd0d60184d3f102 --- /dev/null +++ b/checkpoint-29322/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa9311316d86b8ca14aaaf697a60937afeb4aa17b45182b4606f6ede44896f7 +size 15997 diff --git a/checkpoint-29322/rng_state_62.pth b/checkpoint-29322/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..53a0ff006fcf2571c5c831149405f0d695c68270 --- /dev/null +++ b/checkpoint-29322/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30cce336e6a69c7fe5a2c14dd6c35c3e67e9346dff650d0e19e6c8638c177aca +size 15997 diff --git a/checkpoint-29322/rng_state_63.pth b/checkpoint-29322/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..b01efb160d1820e120d182efc9683b9df2adf90f --- /dev/null +++ b/checkpoint-29322/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40256fb77ce4a8c7c59acadc3f240f1a52e3db2f8b4e78c629d1aee5403a88c6 +size 15997 diff --git a/checkpoint-29322/rng_state_7.pth b/checkpoint-29322/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..91ca6e53fbfd1425e7b817bc3719d2b4c06d440b --- /dev/null +++ b/checkpoint-29322/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ca77f7f5f3ba7af700e23f1eace0f44c5762b47dc791e610aa33f2beaa6402 +size 15984 diff --git a/checkpoint-29322/rng_state_8.pth b/checkpoint-29322/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..c542547ecf12d13c65a5e919d22377f59a9aa192 --- /dev/null +++ b/checkpoint-29322/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:529228ec237e5b5c86c4c9c07cd759a62ec221d65692e9b319efaa0ea302df7f +size 15984 diff --git a/checkpoint-29322/rng_state_9.pth b/checkpoint-29322/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..003f98141b6678343c87b6aee2dbc4cfc5dbce9e --- /dev/null +++ b/checkpoint-29322/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:591ec2b2a1a16fb04ad2955eac166e83acfc33bdbc74d9f959bf2bb3ac8b87ee +size 15984 diff --git a/checkpoint-29322/scheduler.pt b/checkpoint-29322/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec961bcad7def4a08355a65a0450fee825a2b24f --- /dev/null +++ b/checkpoint-29322/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a65476b9e523bb98efca19d63854bb2b897318c19360c526776f9fd63d3b24f6 +size 1064 diff --git a/checkpoint-29322/special_tokens_map.json b/checkpoint-29322/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-29322/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-29322/tokenizer.json 
b/checkpoint-29322/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-29322/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-29322/tokenizer_config.json b/checkpoint-29322/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-29322/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": 
"<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": 
"<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": 
"<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128257": { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-29322/trainer_state.json b/checkpoint-29322/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..870f602d1158160daed776f32470fed50738656e --- /dev/null +++ b/checkpoint-29322/trainer_state.json @@ -0,0 +1,205288 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": 
null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 29322, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 
0.003990178023327195, + "grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 
1.2865383625030518, + "learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 
3.885480572597138e-06, + "loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + 
}, + { + "epoch": 0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 
0.4801484942436218, + "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 
7.975460122699386e-06, + "loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 
92 + }, + { + "epoch": 0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + 
"grad_norm": 0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 
0.5069209933280945, + "learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 
1.3394683026584867e-05, + "loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + 
"step": 145 + }, + { + "epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 
0.04880294659300184, + "grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 
1.3880311250686646, + "learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + }, + { + "epoch": 3.0003069367710253, + "grad_norm": 0.43365660309791565, + "learning_rate": 8.20720621201063e-05, + "loss": 1.8074, + "step": 9775 + }, + { + "epoch": 3.0006138735420502, + "grad_norm": 0.461374968290329, + "learning_rate": 8.206824868646064e-05, + "loss": 1.9089, + "step": 9776 + }, + { + "epoch": 3.0009208103130756, + "grad_norm": 0.3747929632663727, + "learning_rate": 8.206443493589776e-05, + "loss": 1.8358, + "step": 9777 + }, + { + "epoch": 3.001227747084101, + "grad_norm": 0.28436774015426636, + "learning_rate": 8.206062086845532e-05, + "loss": 1.8527, + "step": 9778 + }, + { + "epoch": 3.001534683855126, + "grad_norm": 0.33642131090164185, + "learning_rate": 
8.205680648417106e-05, + "loss": 1.8142, + "step": 9779 + }, + { + "epoch": 3.001841620626151, + "grad_norm": 0.4283481240272522, + "learning_rate": 8.205299178308263e-05, + "loss": 1.9006, + "step": 9780 + }, + { + "epoch": 3.002148557397176, + "grad_norm": 0.34405630826950073, + "learning_rate": 8.204917676522777e-05, + "loss": 1.7988, + "step": 9781 + }, + { + "epoch": 3.0024554941682013, + "grad_norm": 0.3161070942878723, + "learning_rate": 8.204536143064414e-05, + "loss": 1.8271, + "step": 9782 + }, + { + "epoch": 3.0027624309392267, + "grad_norm": 0.42518749833106995, + "learning_rate": 8.204154577936946e-05, + "loss": 1.864, + "step": 9783 + }, + { + "epoch": 3.0030693677102516, + "grad_norm": 0.3760852813720703, + "learning_rate": 8.203772981144146e-05, + "loss": 1.8543, + "step": 9784 + }, + { + "epoch": 3.003376304481277, + "grad_norm": 0.32794755697250366, + "learning_rate": 8.203391352689784e-05, + "loss": 1.8776, + "step": 9785 + }, + { + "epoch": 3.0036832412523022, + "grad_norm": 0.3053889274597168, + "learning_rate": 8.20300969257763e-05, + "loss": 1.8064, + "step": 9786 + }, + { + "epoch": 3.003990178023327, + "grad_norm": 0.40283143520355225, + "learning_rate": 8.202628000811456e-05, + "loss": 1.8083, + "step": 9787 + }, + { + "epoch": 3.0042971147943525, + "grad_norm": 0.49270665645599365, + "learning_rate": 8.202246277395038e-05, + "loss": 1.802, + "step": 9788 + }, + { + "epoch": 3.0046040515653774, + "grad_norm": 0.4373023211956024, + "learning_rate": 8.201864522332143e-05, + "loss": 1.8429, + "step": 9789 + }, + { + "epoch": 3.0049109883364027, + "grad_norm": 0.3136310875415802, + "learning_rate": 8.201482735626547e-05, + "loss": 1.8224, + "step": 9790 + }, + { + "epoch": 3.005217925107428, + "grad_norm": 0.3306807279586792, + "learning_rate": 8.201100917282023e-05, + "loss": 1.8463, + "step": 9791 + }, + { + "epoch": 3.005524861878453, + "grad_norm": 0.45082196593284607, + "learning_rate": 8.200719067302342e-05, + "loss": 1.7587, + "step": 
9792 + }, + { + "epoch": 3.0058317986494782, + "grad_norm": 0.49246448278427124, + "learning_rate": 8.20033718569128e-05, + "loss": 1.8245, + "step": 9793 + }, + { + "epoch": 3.0061387354205036, + "grad_norm": 0.3040246367454529, + "learning_rate": 8.199955272452609e-05, + "loss": 1.8309, + "step": 9794 + }, + { + "epoch": 3.0064456721915285, + "grad_norm": 0.3909318149089813, + "learning_rate": 8.199573327590105e-05, + "loss": 1.8187, + "step": 9795 + }, + { + "epoch": 3.006752608962554, + "grad_norm": 0.5753183960914612, + "learning_rate": 8.199191351107543e-05, + "loss": 1.826, + "step": 9796 + }, + { + "epoch": 3.0070595457335787, + "grad_norm": 0.48908689618110657, + "learning_rate": 8.198809343008695e-05, + "loss": 1.8475, + "step": 9797 + }, + { + "epoch": 3.007366482504604, + "grad_norm": 0.31570208072662354, + "learning_rate": 8.198427303297341e-05, + "loss": 1.8046, + "step": 9798 + }, + { + "epoch": 3.0076734192756294, + "grad_norm": 0.39205440878868103, + "learning_rate": 8.198045231977251e-05, + "loss": 1.8413, + "step": 9799 + }, + { + "epoch": 3.0079803560466543, + "grad_norm": 0.5117597579956055, + "learning_rate": 8.197663129052204e-05, + "loss": 1.8184, + "step": 9800 + }, + { + "epoch": 3.0082872928176796, + "grad_norm": 0.3623514175415039, + "learning_rate": 8.197280994525978e-05, + "loss": 1.8292, + "step": 9801 + }, + { + "epoch": 3.008594229588705, + "grad_norm": 0.2826726734638214, + "learning_rate": 8.196898828402344e-05, + "loss": 1.8216, + "step": 9802 + }, + { + "epoch": 3.00890116635973, + "grad_norm": 0.38658398389816284, + "learning_rate": 8.196516630685085e-05, + "loss": 1.867, + "step": 9803 + }, + { + "epoch": 3.009208103130755, + "grad_norm": 0.3371698260307312, + "learning_rate": 8.196134401377973e-05, + "loss": 1.8077, + "step": 9804 + }, + { + "epoch": 3.00951503990178, + "grad_norm": 0.24108785390853882, + "learning_rate": 8.195752140484789e-05, + "loss": 1.7858, + "step": 9805 + }, + { + "epoch": 3.0098219766728054, + 
"grad_norm": 0.34410104155540466, + "learning_rate": 8.195369848009309e-05, + "loss": 1.801, + "step": 9806 + }, + { + "epoch": 3.0101289134438307, + "grad_norm": 0.3412116467952728, + "learning_rate": 8.194987523955311e-05, + "loss": 1.7905, + "step": 9807 + }, + { + "epoch": 3.0104358502148556, + "grad_norm": 0.2473030537366867, + "learning_rate": 8.194605168326573e-05, + "loss": 1.7765, + "step": 9808 + }, + { + "epoch": 3.010742786985881, + "grad_norm": 0.28590065240859985, + "learning_rate": 8.194222781126875e-05, + "loss": 1.7897, + "step": 9809 + }, + { + "epoch": 3.0110497237569063, + "grad_norm": 0.2994272708892822, + "learning_rate": 8.193840362359994e-05, + "loss": 1.7976, + "step": 9810 + }, + { + "epoch": 3.011356660527931, + "grad_norm": 0.2971307635307312, + "learning_rate": 8.193457912029713e-05, + "loss": 1.829, + "step": 9811 + }, + { + "epoch": 3.0116635972989565, + "grad_norm": 0.25149810314178467, + "learning_rate": 8.193075430139809e-05, + "loss": 1.7709, + "step": 9812 + }, + { + "epoch": 3.0119705340699814, + "grad_norm": 0.2561332583427429, + "learning_rate": 8.19269291669406e-05, + "loss": 1.7689, + "step": 9813 + }, + { + "epoch": 3.0122774708410067, + "grad_norm": 0.2658882141113281, + "learning_rate": 8.192310371696249e-05, + "loss": 1.8497, + "step": 9814 + }, + { + "epoch": 3.012584407612032, + "grad_norm": 0.2873780429363251, + "learning_rate": 8.191927795150156e-05, + "loss": 1.8217, + "step": 9815 + }, + { + "epoch": 3.012891344383057, + "grad_norm": 0.2181183248758316, + "learning_rate": 8.191545187059562e-05, + "loss": 1.7261, + "step": 9816 + }, + { + "epoch": 3.0131982811540823, + "grad_norm": 0.2414858490228653, + "learning_rate": 8.191162547428248e-05, + "loss": 1.8035, + "step": 9817 + }, + { + "epoch": 3.0135052179251076, + "grad_norm": 0.2799840271472931, + "learning_rate": 8.190779876259995e-05, + "loss": 1.8279, + "step": 9818 + }, + { + "epoch": 3.0138121546961325, + "grad_norm": 0.2669760584831238, + "learning_rate": 
8.190397173558584e-05, + "loss": 1.8155, + "step": 9819 + }, + { + "epoch": 3.014119091467158, + "grad_norm": 0.28857991099357605, + "learning_rate": 8.1900144393278e-05, + "loss": 1.8479, + "step": 9820 + }, + { + "epoch": 3.0144260282381827, + "grad_norm": 0.30534693598747253, + "learning_rate": 8.189631673571422e-05, + "loss": 1.8609, + "step": 9821 + }, + { + "epoch": 3.014732965009208, + "grad_norm": 0.3238218128681183, + "learning_rate": 8.189248876293236e-05, + "loss": 1.9292, + "step": 9822 + }, + { + "epoch": 3.0150399017802334, + "grad_norm": 0.3000536561012268, + "learning_rate": 8.188866047497022e-05, + "loss": 1.8214, + "step": 9823 + }, + { + "epoch": 3.0153468385512583, + "grad_norm": 0.2960065007209778, + "learning_rate": 8.188483187186565e-05, + "loss": 1.8316, + "step": 9824 + }, + { + "epoch": 3.0156537753222836, + "grad_norm": 0.28609779477119446, + "learning_rate": 8.188100295365648e-05, + "loss": 1.8002, + "step": 9825 + }, + { + "epoch": 3.015960712093309, + "grad_norm": 0.31390634179115295, + "learning_rate": 8.187717372038057e-05, + "loss": 1.8134, + "step": 9826 + }, + { + "epoch": 3.016267648864334, + "grad_norm": 0.28550946712493896, + "learning_rate": 8.187334417207573e-05, + "loss": 1.8359, + "step": 9827 + }, + { + "epoch": 3.016574585635359, + "grad_norm": 0.3085210621356964, + "learning_rate": 8.186951430877982e-05, + "loss": 1.813, + "step": 9828 + }, + { + "epoch": 3.016881522406384, + "grad_norm": 0.3043847978115082, + "learning_rate": 8.18656841305307e-05, + "loss": 1.8222, + "step": 9829 + }, + { + "epoch": 3.0171884591774094, + "grad_norm": 0.32524731755256653, + "learning_rate": 8.18618536373662e-05, + "loss": 1.8258, + "step": 9830 + }, + { + "epoch": 3.0174953959484347, + "grad_norm": 0.2690991461277008, + "learning_rate": 8.18580228293242e-05, + "loss": 1.8492, + "step": 9831 + }, + { + "epoch": 3.0178023327194596, + "grad_norm": 0.34936225414276123, + "learning_rate": 8.185419170644253e-05, + "loss": 1.8363, + "step": 
9832 + }, + { + "epoch": 3.018109269490485, + "grad_norm": 0.3274296820163727, + "learning_rate": 8.185036026875908e-05, + "loss": 1.7789, + "step": 9833 + }, + { + "epoch": 3.0184162062615103, + "grad_norm": 0.2729836106300354, + "learning_rate": 8.184652851631169e-05, + "loss": 1.8264, + "step": 9834 + }, + { + "epoch": 3.018723143032535, + "grad_norm": 0.28682780265808105, + "learning_rate": 8.184269644913826e-05, + "loss": 1.8399, + "step": 9835 + }, + { + "epoch": 3.0190300798035605, + "grad_norm": 0.3224826455116272, + "learning_rate": 8.183886406727662e-05, + "loss": 1.8338, + "step": 9836 + }, + { + "epoch": 3.0193370165745854, + "grad_norm": 0.30945318937301636, + "learning_rate": 8.183503137076467e-05, + "loss": 1.8248, + "step": 9837 + }, + { + "epoch": 3.0196439533456108, + "grad_norm": 0.27580398321151733, + "learning_rate": 8.183119835964029e-05, + "loss": 1.8096, + "step": 9838 + }, + { + "epoch": 3.019950890116636, + "grad_norm": 0.28927183151245117, + "learning_rate": 8.182736503394132e-05, + "loss": 1.825, + "step": 9839 + }, + { + "epoch": 3.020257826887661, + "grad_norm": 0.253000408411026, + "learning_rate": 8.182353139370571e-05, + "loss": 1.7678, + "step": 9840 + }, + { + "epoch": 3.0205647636586863, + "grad_norm": 0.2882022559642792, + "learning_rate": 8.18196974389713e-05, + "loss": 1.8895, + "step": 9841 + }, + { + "epoch": 3.0208717004297116, + "grad_norm": 0.26864609122276306, + "learning_rate": 8.1815863169776e-05, + "loss": 1.7674, + "step": 9842 + }, + { + "epoch": 3.0211786372007365, + "grad_norm": 0.27344849705696106, + "learning_rate": 8.181202858615769e-05, + "loss": 1.8146, + "step": 9843 + }, + { + "epoch": 3.021485573971762, + "grad_norm": 0.31659772992134094, + "learning_rate": 8.180819368815425e-05, + "loss": 1.8485, + "step": 9844 + }, + { + "epoch": 3.021792510742787, + "grad_norm": 0.3163176476955414, + "learning_rate": 8.18043584758036e-05, + "loss": 1.8994, + "step": 9845 + }, + { + "epoch": 3.022099447513812, + 
"grad_norm": 0.2583829462528229, + "learning_rate": 8.180052294914365e-05, + "loss": 1.764, + "step": 9846 + }, + { + "epoch": 3.0224063842848374, + "grad_norm": 0.3006649315357208, + "learning_rate": 8.179668710821227e-05, + "loss": 1.9232, + "step": 9847 + }, + { + "epoch": 3.0227133210558623, + "grad_norm": 0.35702988505363464, + "learning_rate": 8.179285095304741e-05, + "loss": 1.8403, + "step": 9848 + }, + { + "epoch": 3.0230202578268877, + "grad_norm": 0.29699379205703735, + "learning_rate": 8.178901448368697e-05, + "loss": 1.8412, + "step": 9849 + }, + { + "epoch": 3.023327194597913, + "grad_norm": 0.3022700548171997, + "learning_rate": 8.178517770016885e-05, + "loss": 1.8197, + "step": 9850 + }, + { + "epoch": 3.023634131368938, + "grad_norm": 0.2943836748600006, + "learning_rate": 8.178134060253097e-05, + "loss": 1.8127, + "step": 9851 + }, + { + "epoch": 3.023941068139963, + "grad_norm": 0.31290489435195923, + "learning_rate": 8.177750319081126e-05, + "loss": 1.821, + "step": 9852 + }, + { + "epoch": 3.0242480049109886, + "grad_norm": 0.30308374762535095, + "learning_rate": 8.177366546504763e-05, + "loss": 1.8522, + "step": 9853 + }, + { + "epoch": 3.0245549416820134, + "grad_norm": 0.301559716463089, + "learning_rate": 8.176982742527802e-05, + "loss": 1.8758, + "step": 9854 + }, + { + "epoch": 3.0248618784530388, + "grad_norm": 0.33314836025238037, + "learning_rate": 8.176598907154034e-05, + "loss": 1.8178, + "step": 9855 + }, + { + "epoch": 3.0251688152240637, + "grad_norm": 0.3567935526371002, + "learning_rate": 8.176215040387255e-05, + "loss": 1.7847, + "step": 9856 + }, + { + "epoch": 3.025475751995089, + "grad_norm": 0.27716195583343506, + "learning_rate": 8.175831142231258e-05, + "loss": 1.772, + "step": 9857 + }, + { + "epoch": 3.0257826887661143, + "grad_norm": 0.24568212032318115, + "learning_rate": 8.175447212689836e-05, + "loss": 1.8171, + "step": 9858 + }, + { + "epoch": 3.0260896255371392, + "grad_norm": 0.25368261337280273, + 
"learning_rate": 8.175063251766784e-05, + "loss": 1.852, + "step": 9859 + }, + { + "epoch": 3.0263965623081646, + "grad_norm": 0.2509497404098511, + "learning_rate": 8.174679259465894e-05, + "loss": 1.7737, + "step": 9860 + }, + { + "epoch": 3.02670349907919, + "grad_norm": 0.3539343774318695, + "learning_rate": 8.174295235790963e-05, + "loss": 1.8663, + "step": 9861 + }, + { + "epoch": 3.027010435850215, + "grad_norm": 0.36450034379959106, + "learning_rate": 8.173911180745788e-05, + "loss": 1.8179, + "step": 9862 + }, + { + "epoch": 3.02731737262124, + "grad_norm": 0.3550017178058624, + "learning_rate": 8.173527094334162e-05, + "loss": 1.8256, + "step": 9863 + }, + { + "epoch": 3.027624309392265, + "grad_norm": 0.33518701791763306, + "learning_rate": 8.17314297655988e-05, + "loss": 1.7842, + "step": 9864 + }, + { + "epoch": 3.0279312461632903, + "grad_norm": 0.2522886097431183, + "learning_rate": 8.172758827426739e-05, + "loss": 1.7688, + "step": 9865 + }, + { + "epoch": 3.0282381829343157, + "grad_norm": 0.26222914457321167, + "learning_rate": 8.172374646938536e-05, + "loss": 1.8517, + "step": 9866 + }, + { + "epoch": 3.0285451197053406, + "grad_norm": 0.3355788588523865, + "learning_rate": 8.171990435099068e-05, + "loss": 1.9002, + "step": 9867 + }, + { + "epoch": 3.028852056476366, + "grad_norm": 0.32907500863075256, + "learning_rate": 8.171606191912131e-05, + "loss": 1.7801, + "step": 9868 + }, + { + "epoch": 3.0291589932473912, + "grad_norm": 0.29234179854393005, + "learning_rate": 8.171221917381523e-05, + "loss": 1.8055, + "step": 9869 + }, + { + "epoch": 3.029465930018416, + "grad_norm": 0.26374876499176025, + "learning_rate": 8.170837611511041e-05, + "loss": 1.781, + "step": 9870 + }, + { + "epoch": 3.0297728667894415, + "grad_norm": 0.311282217502594, + "learning_rate": 8.170453274304483e-05, + "loss": 1.839, + "step": 9871 + }, + { + "epoch": 3.0300798035604664, + "grad_norm": 0.24225831031799316, + "learning_rate": 8.170068905765648e-05, + "loss": 
1.804, + "step": 9872 + }, + { + "epoch": 3.0303867403314917, + "grad_norm": 0.29383334517478943, + "learning_rate": 8.169684505898335e-05, + "loss": 1.7817, + "step": 9873 + }, + { + "epoch": 3.030693677102517, + "grad_norm": 0.2607928514480591, + "learning_rate": 8.169300074706339e-05, + "loss": 1.8379, + "step": 9874 + }, + { + "epoch": 3.031000613873542, + "grad_norm": 0.283028244972229, + "learning_rate": 8.168915612193464e-05, + "loss": 1.7797, + "step": 9875 + }, + { + "epoch": 3.0313075506445673, + "grad_norm": 0.27675309777259827, + "learning_rate": 8.168531118363508e-05, + "loss": 1.8355, + "step": 9876 + }, + { + "epoch": 3.0316144874155926, + "grad_norm": 0.2598227262496948, + "learning_rate": 8.16814659322027e-05, + "loss": 1.7898, + "step": 9877 + }, + { + "epoch": 3.0319214241866175, + "grad_norm": 0.24715003371238708, + "learning_rate": 8.16776203676755e-05, + "loss": 1.7791, + "step": 9878 + }, + { + "epoch": 3.032228360957643, + "grad_norm": 0.2749374210834503, + "learning_rate": 8.167377449009149e-05, + "loss": 1.8303, + "step": 9879 + }, + { + "epoch": 3.0325352977286677, + "grad_norm": 0.26150834560394287, + "learning_rate": 8.166992829948868e-05, + "loss": 1.8462, + "step": 9880 + }, + { + "epoch": 3.032842234499693, + "grad_norm": 0.3044755160808563, + "learning_rate": 8.166608179590506e-05, + "loss": 1.806, + "step": 9881 + }, + { + "epoch": 3.0331491712707184, + "grad_norm": 0.2949555516242981, + "learning_rate": 8.166223497937868e-05, + "loss": 1.8785, + "step": 9882 + }, + { + "epoch": 3.0334561080417433, + "grad_norm": 0.33206698298454285, + "learning_rate": 8.165838784994752e-05, + "loss": 1.8476, + "step": 9883 + }, + { + "epoch": 3.0337630448127686, + "grad_norm": 0.2720400094985962, + "learning_rate": 8.165454040764962e-05, + "loss": 1.843, + "step": 9884 + }, + { + "epoch": 3.034069981583794, + "grad_norm": 0.29340869188308716, + "learning_rate": 8.1650692652523e-05, + "loss": 1.7761, + "step": 9885 + }, + { + "epoch": 
3.034376918354819, + "grad_norm": 0.35155293345451355, + "learning_rate": 8.16468445846057e-05, + "loss": 1.8887, + "step": 9886 + }, + { + "epoch": 3.034683855125844, + "grad_norm": 0.2688990831375122, + "learning_rate": 8.164299620393571e-05, + "loss": 1.8001, + "step": 9887 + }, + { + "epoch": 3.034990791896869, + "grad_norm": 0.2921253442764282, + "learning_rate": 8.16391475105511e-05, + "loss": 1.7951, + "step": 9888 + }, + { + "epoch": 3.0352977286678944, + "grad_norm": 0.28100699186325073, + "learning_rate": 8.163529850448988e-05, + "loss": 1.8041, + "step": 9889 + }, + { + "epoch": 3.0356046654389197, + "grad_norm": 0.3155081868171692, + "learning_rate": 8.16314491857901e-05, + "loss": 1.8026, + "step": 9890 + }, + { + "epoch": 3.0359116022099446, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.16275995544898e-05, + "loss": 1.8502, + "step": 9891 + }, + { + "epoch": 3.03621853898097, + "grad_norm": 0.2732076644897461, + "learning_rate": 8.162374961062704e-05, + "loss": 1.8424, + "step": 9892 + }, + { + "epoch": 3.0365254757519953, + "grad_norm": 0.2943679690361023, + "learning_rate": 8.161989935423984e-05, + "loss": 1.7635, + "step": 9893 + }, + { + "epoch": 3.03683241252302, + "grad_norm": 0.28894683718681335, + "learning_rate": 8.161604878536626e-05, + "loss": 1.78, + "step": 9894 + }, + { + "epoch": 3.0371393492940455, + "grad_norm": 0.2718082666397095, + "learning_rate": 8.161219790404435e-05, + "loss": 1.7664, + "step": 9895 + }, + { + "epoch": 3.0374462860650704, + "grad_norm": 0.29092124104499817, + "learning_rate": 8.160834671031216e-05, + "loss": 1.8621, + "step": 9896 + }, + { + "epoch": 3.0377532228360957, + "grad_norm": 0.284665584564209, + "learning_rate": 8.160449520420779e-05, + "loss": 1.8607, + "step": 9897 + }, + { + "epoch": 3.038060159607121, + "grad_norm": 0.23676982522010803, + "learning_rate": 8.160064338576925e-05, + "loss": 1.7137, + "step": 9898 + }, + { + "epoch": 3.038367096378146, + "grad_norm": 0.2666932940483093, + 
"learning_rate": 8.159679125503466e-05, + "loss": 1.8038, + "step": 9899 + }, + { + "epoch": 3.0386740331491713, + "grad_norm": 0.36214375495910645, + "learning_rate": 8.159293881204204e-05, + "loss": 1.8902, + "step": 9900 + }, + { + "epoch": 3.0389809699201966, + "grad_norm": 0.30301332473754883, + "learning_rate": 8.158908605682948e-05, + "loss": 1.8456, + "step": 9901 + }, + { + "epoch": 3.0392879066912215, + "grad_norm": 0.32190418243408203, + "learning_rate": 8.158523298943506e-05, + "loss": 1.8246, + "step": 9902 + }, + { + "epoch": 3.039594843462247, + "grad_norm": 0.2938043475151062, + "learning_rate": 8.158137960989685e-05, + "loss": 1.8324, + "step": 9903 + }, + { + "epoch": 3.0399017802332717, + "grad_norm": 0.29493969678878784, + "learning_rate": 8.157752591825294e-05, + "loss": 1.8458, + "step": 9904 + }, + { + "epoch": 3.040208717004297, + "grad_norm": 0.2681889832019806, + "learning_rate": 8.157367191454141e-05, + "loss": 1.889, + "step": 9905 + }, + { + "epoch": 3.0405156537753224, + "grad_norm": 0.3111969232559204, + "learning_rate": 8.156981759880035e-05, + "loss": 1.8966, + "step": 9906 + }, + { + "epoch": 3.0408225905463473, + "grad_norm": 0.345262736082077, + "learning_rate": 8.156596297106784e-05, + "loss": 1.8174, + "step": 9907 + }, + { + "epoch": 3.0411295273173726, + "grad_norm": 0.30156534910202026, + "learning_rate": 8.156210803138199e-05, + "loss": 1.766, + "step": 9908 + }, + { + "epoch": 3.041436464088398, + "grad_norm": 0.28691565990448, + "learning_rate": 8.15582527797809e-05, + "loss": 1.8436, + "step": 9909 + }, + { + "epoch": 3.041743400859423, + "grad_norm": 0.33418282866477966, + "learning_rate": 8.155439721630264e-05, + "loss": 1.8939, + "step": 9910 + }, + { + "epoch": 3.042050337630448, + "grad_norm": 0.25496938824653625, + "learning_rate": 8.155054134098535e-05, + "loss": 1.8368, + "step": 9911 + }, + { + "epoch": 3.042357274401473, + "grad_norm": 0.3806788921356201, + "learning_rate": 8.154668515386711e-05, + "loss": 
1.8635, + "step": 9912 + }, + { + "epoch": 3.0426642111724984, + "grad_norm": 0.42668119072914124, + "learning_rate": 8.154282865498603e-05, + "loss": 1.76, + "step": 9913 + }, + { + "epoch": 3.0429711479435237, + "grad_norm": 0.35945314168930054, + "learning_rate": 8.153897184438024e-05, + "loss": 1.8275, + "step": 9914 + }, + { + "epoch": 3.0432780847145486, + "grad_norm": 0.3225449323654175, + "learning_rate": 8.153511472208784e-05, + "loss": 1.7901, + "step": 9915 + }, + { + "epoch": 3.043585021485574, + "grad_norm": 0.2905425727367401, + "learning_rate": 8.153125728814694e-05, + "loss": 1.8021, + "step": 9916 + }, + { + "epoch": 3.0438919582565993, + "grad_norm": 0.3315529525279999, + "learning_rate": 8.15273995425957e-05, + "loss": 1.8003, + "step": 9917 + }, + { + "epoch": 3.044198895027624, + "grad_norm": 0.30256444215774536, + "learning_rate": 8.152354148547221e-05, + "loss": 1.8243, + "step": 9918 + }, + { + "epoch": 3.0445058317986495, + "grad_norm": 0.2563035190105438, + "learning_rate": 8.15196831168146e-05, + "loss": 1.7877, + "step": 9919 + }, + { + "epoch": 3.044812768569675, + "grad_norm": 0.25705814361572266, + "learning_rate": 8.151582443666101e-05, + "loss": 1.813, + "step": 9920 + }, + { + "epoch": 3.0451197053406998, + "grad_norm": 0.3649071455001831, + "learning_rate": 8.151196544504957e-05, + "loss": 1.8114, + "step": 9921 + }, + { + "epoch": 3.045426642111725, + "grad_norm": 0.4076193571090698, + "learning_rate": 8.150810614201841e-05, + "loss": 1.7869, + "step": 9922 + }, + { + "epoch": 3.04573357888275, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.150424652760569e-05, + "loss": 1.7878, + "step": 9923 + }, + { + "epoch": 3.0460405156537753, + "grad_norm": 0.2243243157863617, + "learning_rate": 8.150038660184955e-05, + "loss": 1.8224, + "step": 9924 + }, + { + "epoch": 3.0463474524248007, + "grad_norm": 0.3295031487941742, + "learning_rate": 8.149652636478811e-05, + "loss": 1.8685, + "step": 9925 + }, + { + "epoch": 
3.0466543891958255, + "grad_norm": 0.2973531186580658, + "learning_rate": 8.149266581645954e-05, + "loss": 1.8082, + "step": 9926 + }, + { + "epoch": 3.046961325966851, + "grad_norm": 0.25648918747901917, + "learning_rate": 8.148880495690199e-05, + "loss": 1.8089, + "step": 9927 + }, + { + "epoch": 3.047268262737876, + "grad_norm": 0.2845752537250519, + "learning_rate": 8.148494378615361e-05, + "loss": 1.8726, + "step": 9928 + }, + { + "epoch": 3.047575199508901, + "grad_norm": 0.2917105555534363, + "learning_rate": 8.148108230425255e-05, + "loss": 1.8035, + "step": 9929 + }, + { + "epoch": 3.0478821362799264, + "grad_norm": 0.2775834798812866, + "learning_rate": 8.1477220511237e-05, + "loss": 1.8545, + "step": 9930 + }, + { + "epoch": 3.0481890730509513, + "grad_norm": 0.3522767424583435, + "learning_rate": 8.14733584071451e-05, + "loss": 1.8261, + "step": 9931 + }, + { + "epoch": 3.0484960098219767, + "grad_norm": 0.3759000599384308, + "learning_rate": 8.146949599201503e-05, + "loss": 1.8405, + "step": 9932 + }, + { + "epoch": 3.048802946593002, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.146563326588496e-05, + "loss": 1.7762, + "step": 9933 + }, + { + "epoch": 3.049109883364027, + "grad_norm": 0.263810932636261, + "learning_rate": 8.146177022879304e-05, + "loss": 1.7546, + "step": 9934 + }, + { + "epoch": 3.049416820135052, + "grad_norm": 0.24064256250858307, + "learning_rate": 8.14579068807775e-05, + "loss": 1.7903, + "step": 9935 + }, + { + "epoch": 3.0497237569060776, + "grad_norm": 0.3144194781780243, + "learning_rate": 8.145404322187645e-05, + "loss": 1.8011, + "step": 9936 + }, + { + "epoch": 3.0500306936771024, + "grad_norm": 0.3362879455089569, + "learning_rate": 8.145017925212812e-05, + "loss": 1.8224, + "step": 9937 + }, + { + "epoch": 3.050337630448128, + "grad_norm": 0.33979395031929016, + "learning_rate": 8.144631497157071e-05, + "loss": 1.8415, + "step": 9938 + }, + { + "epoch": 3.0506445672191527, + "grad_norm": 0.33391237258911133, + 
"learning_rate": 8.144245038024235e-05, + "loss": 1.7983, + "step": 9939 + }, + { + "epoch": 3.050951503990178, + "grad_norm": 0.34034964442253113, + "learning_rate": 8.143858547818128e-05, + "loss": 1.8635, + "step": 9940 + }, + { + "epoch": 3.0512584407612033, + "grad_norm": 0.3472529947757721, + "learning_rate": 8.143472026542569e-05, + "loss": 1.8067, + "step": 9941 + }, + { + "epoch": 3.0515653775322282, + "grad_norm": 0.3369109630584717, + "learning_rate": 8.143085474201376e-05, + "loss": 1.7933, + "step": 9942 + }, + { + "epoch": 3.0518723143032536, + "grad_norm": 0.3055182993412018, + "learning_rate": 8.14269889079837e-05, + "loss": 1.7358, + "step": 9943 + }, + { + "epoch": 3.052179251074279, + "grad_norm": 0.26729708909988403, + "learning_rate": 8.142312276337372e-05, + "loss": 1.8315, + "step": 9944 + }, + { + "epoch": 3.052486187845304, + "grad_norm": 0.3626720607280731, + "learning_rate": 8.141925630822203e-05, + "loss": 1.7593, + "step": 9945 + }, + { + "epoch": 3.052793124616329, + "grad_norm": 0.3673512637615204, + "learning_rate": 8.141538954256683e-05, + "loss": 1.8414, + "step": 9946 + }, + { + "epoch": 3.053100061387354, + "grad_norm": 0.30554768443107605, + "learning_rate": 8.141152246644632e-05, + "loss": 1.7504, + "step": 9947 + }, + { + "epoch": 3.0534069981583793, + "grad_norm": 0.41163405776023865, + "learning_rate": 8.140765507989875e-05, + "loss": 1.8794, + "step": 9948 + }, + { + "epoch": 3.0537139349294047, + "grad_norm": 0.592751145362854, + "learning_rate": 8.140378738296233e-05, + "loss": 1.8538, + "step": 9949 + }, + { + "epoch": 3.0540208717004296, + "grad_norm": 0.483828604221344, + "learning_rate": 8.139991937567527e-05, + "loss": 1.7952, + "step": 9950 + }, + { + "epoch": 3.054327808471455, + "grad_norm": 0.26665306091308594, + "learning_rate": 8.13960510580758e-05, + "loss": 1.8268, + "step": 9951 + }, + { + "epoch": 3.0546347452424802, + "grad_norm": 0.42917072772979736, + "learning_rate": 8.139218243020215e-05, + "loss": 
1.843, + "step": 9952 + }, + { + "epoch": 3.054941682013505, + "grad_norm": 0.47911396622657776, + "learning_rate": 8.138831349209256e-05, + "loss": 1.8223, + "step": 9953 + }, + { + "epoch": 3.0552486187845305, + "grad_norm": 0.4540431797504425, + "learning_rate": 8.138444424378524e-05, + "loss": 1.9198, + "step": 9954 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.29719051718711853, + "learning_rate": 8.138057468531845e-05, + "loss": 1.7873, + "step": 9955 + }, + { + "epoch": 3.0558624923265807, + "grad_norm": 0.35133618116378784, + "learning_rate": 8.137670481673045e-05, + "loss": 1.8459, + "step": 9956 + }, + { + "epoch": 3.056169429097606, + "grad_norm": 0.42896488308906555, + "learning_rate": 8.137283463805945e-05, + "loss": 1.7814, + "step": 9957 + }, + { + "epoch": 3.056476365868631, + "grad_norm": 0.38993972539901733, + "learning_rate": 8.136896414934372e-05, + "loss": 1.7636, + "step": 9958 + }, + { + "epoch": 3.0567833026396563, + "grad_norm": 0.31362372636795044, + "learning_rate": 8.13650933506215e-05, + "loss": 1.8021, + "step": 9959 + }, + { + "epoch": 3.0570902394106816, + "grad_norm": 0.27980196475982666, + "learning_rate": 8.136122224193103e-05, + "loss": 1.8445, + "step": 9960 + }, + { + "epoch": 3.0573971761817065, + "grad_norm": 0.2721461057662964, + "learning_rate": 8.135735082331059e-05, + "loss": 1.7614, + "step": 9961 + }, + { + "epoch": 3.057704112952732, + "grad_norm": 0.25157424807548523, + "learning_rate": 8.135347909479843e-05, + "loss": 1.7598, + "step": 9962 + }, + { + "epoch": 3.0580110497237567, + "grad_norm": 0.25798025727272034, + "learning_rate": 8.13496070564328e-05, + "loss": 1.7823, + "step": 9963 + }, + { + "epoch": 3.058317986494782, + "grad_norm": 0.30775198340415955, + "learning_rate": 8.134573470825199e-05, + "loss": 1.7755, + "step": 9964 + }, + { + "epoch": 3.0586249232658074, + "grad_norm": 0.28916797041893005, + "learning_rate": 8.134186205029426e-05, + "loss": 1.8189, + "step": 9965 + }, + { + "epoch": 
3.0589318600368323, + "grad_norm": 0.2829149067401886, + "learning_rate": 8.133798908259787e-05, + "loss": 1.8546, + "step": 9966 + }, + { + "epoch": 3.0592387968078576, + "grad_norm": 0.2884117662906647, + "learning_rate": 8.13341158052011e-05, + "loss": 1.7705, + "step": 9967 + }, + { + "epoch": 3.059545733578883, + "grad_norm": 0.28311973810195923, + "learning_rate": 8.133024221814225e-05, + "loss": 1.8147, + "step": 9968 + }, + { + "epoch": 3.059852670349908, + "grad_norm": 0.25405213236808777, + "learning_rate": 8.132636832145957e-05, + "loss": 1.7813, + "step": 9969 + }, + { + "epoch": 3.060159607120933, + "grad_norm": 0.3082229793071747, + "learning_rate": 8.132249411519137e-05, + "loss": 1.8536, + "step": 9970 + }, + { + "epoch": 3.060466543891958, + "grad_norm": 0.29918181896209717, + "learning_rate": 8.13186195993759e-05, + "loss": 1.8181, + "step": 9971 + }, + { + "epoch": 3.0607734806629834, + "grad_norm": 0.3025238811969757, + "learning_rate": 8.13147447740515e-05, + "loss": 1.7785, + "step": 9972 + }, + { + "epoch": 3.0610804174340087, + "grad_norm": 0.2798222303390503, + "learning_rate": 8.131086963925643e-05, + "loss": 1.7873, + "step": 9973 + }, + { + "epoch": 3.0613873542050336, + "grad_norm": 0.32636210322380066, + "learning_rate": 8.130699419502898e-05, + "loss": 1.882, + "step": 9974 + }, + { + "epoch": 3.061694290976059, + "grad_norm": 0.27722054719924927, + "learning_rate": 8.130311844140748e-05, + "loss": 1.7788, + "step": 9975 + }, + { + "epoch": 3.0620012277470843, + "grad_norm": 0.289156436920166, + "learning_rate": 8.129924237843023e-05, + "loss": 1.8591, + "step": 9976 + }, + { + "epoch": 3.062308164518109, + "grad_norm": 0.2839665412902832, + "learning_rate": 8.12953660061355e-05, + "loss": 1.8255, + "step": 9977 + }, + { + "epoch": 3.0626151012891345, + "grad_norm": 0.2650148272514343, + "learning_rate": 8.129148932456161e-05, + "loss": 1.8353, + "step": 9978 + }, + { + "epoch": 3.06292203806016, + "grad_norm": 0.2884560227394104, + 
"learning_rate": 8.128761233374691e-05, + "loss": 1.8099, + "step": 9979 + }, + { + "epoch": 3.0632289748311847, + "grad_norm": 0.2610029876232147, + "learning_rate": 8.128373503372967e-05, + "loss": 1.8173, + "step": 9980 + }, + { + "epoch": 3.06353591160221, + "grad_norm": 0.32512393593788147, + "learning_rate": 8.127985742454822e-05, + "loss": 1.8619, + "step": 9981 + }, + { + "epoch": 3.063842848373235, + "grad_norm": 0.3382968604564667, + "learning_rate": 8.127597950624091e-05, + "loss": 1.831, + "step": 9982 + }, + { + "epoch": 3.0641497851442603, + "grad_norm": 0.33773133158683777, + "learning_rate": 8.127210127884602e-05, + "loss": 1.8194, + "step": 9983 + }, + { + "epoch": 3.0644567219152856, + "grad_norm": 0.31642746925354004, + "learning_rate": 8.126822274240188e-05, + "loss": 1.8782, + "step": 9984 + }, + { + "epoch": 3.0647636586863105, + "grad_norm": 0.2476506233215332, + "learning_rate": 8.126434389694686e-05, + "loss": 1.7866, + "step": 9985 + }, + { + "epoch": 3.065070595457336, + "grad_norm": 0.27296319603919983, + "learning_rate": 8.126046474251927e-05, + "loss": 1.8276, + "step": 9986 + }, + { + "epoch": 3.0653775322283607, + "grad_norm": 0.353865385055542, + "learning_rate": 8.125658527915744e-05, + "loss": 1.9525, + "step": 9987 + }, + { + "epoch": 3.065684468999386, + "grad_norm": 0.370256632566452, + "learning_rate": 8.12527055068997e-05, + "loss": 1.8514, + "step": 9988 + }, + { + "epoch": 3.0659914057704114, + "grad_norm": 0.30738842487335205, + "learning_rate": 8.124882542578442e-05, + "loss": 1.8125, + "step": 9989 + }, + { + "epoch": 3.0662983425414363, + "grad_norm": 0.3151233494281769, + "learning_rate": 8.124494503584995e-05, + "loss": 1.8165, + "step": 9990 + }, + { + "epoch": 3.0666052793124616, + "grad_norm": 0.29071590304374695, + "learning_rate": 8.124106433713458e-05, + "loss": 1.7617, + "step": 9991 + }, + { + "epoch": 3.066912216083487, + "grad_norm": 0.2898697853088379, + "learning_rate": 8.123718332967672e-05, + "loss": 
1.7779, + "step": 9992 + }, + { + "epoch": 3.067219152854512, + "grad_norm": 0.26601701974868774, + "learning_rate": 8.123330201351471e-05, + "loss": 1.8307, + "step": 9993 + }, + { + "epoch": 3.067526089625537, + "grad_norm": 0.2622119188308716, + "learning_rate": 8.12294203886869e-05, + "loss": 1.7958, + "step": 9994 + }, + { + "epoch": 3.0678330263965625, + "grad_norm": 0.29709386825561523, + "learning_rate": 8.122553845523166e-05, + "loss": 1.7799, + "step": 9995 + }, + { + "epoch": 3.0681399631675874, + "grad_norm": 0.31267789006233215, + "learning_rate": 8.122165621318733e-05, + "loss": 1.8149, + "step": 9996 + }, + { + "epoch": 3.0684468999386127, + "grad_norm": 0.3076523244380951, + "learning_rate": 8.121777366259232e-05, + "loss": 1.7701, + "step": 9997 + }, + { + "epoch": 3.0687538367096376, + "grad_norm": 0.30096009373664856, + "learning_rate": 8.121389080348496e-05, + "loss": 1.8323, + "step": 9998 + }, + { + "epoch": 3.069060773480663, + "grad_norm": 0.25739142298698425, + "learning_rate": 8.121000763590363e-05, + "loss": 1.8105, + "step": 9999 + }, + { + "epoch": 3.0693677102516883, + "grad_norm": 0.2780844271183014, + "learning_rate": 8.120612415988671e-05, + "loss": 1.8502, + "step": 10000 + }, + { + "epoch": 3.069674647022713, + "grad_norm": 0.3316378593444824, + "learning_rate": 8.120224037547259e-05, + "loss": 1.8244, + "step": 10001 + }, + { + "epoch": 3.0699815837937385, + "grad_norm": 0.261129766702652, + "learning_rate": 8.119835628269964e-05, + "loss": 1.7769, + "step": 10002 + }, + { + "epoch": 3.070288520564764, + "grad_norm": 0.29213985800743103, + "learning_rate": 8.119447188160625e-05, + "loss": 1.7717, + "step": 10003 + }, + { + "epoch": 3.0705954573357888, + "grad_norm": 0.38545623421669006, + "learning_rate": 8.11905871722308e-05, + "loss": 1.8433, + "step": 10004 + }, + { + "epoch": 3.070902394106814, + "grad_norm": 0.3617223799228668, + "learning_rate": 8.118670215461168e-05, + "loss": 1.8172, + "step": 10005 + }, + { + "epoch": 
3.071209330877839, + "grad_norm": 0.3241543769836426, + "learning_rate": 8.11828168287873e-05, + "loss": 1.8325, + "step": 10006 + }, + { + "epoch": 3.0715162676488643, + "grad_norm": 0.3538578152656555, + "learning_rate": 8.117893119479605e-05, + "loss": 1.8188, + "step": 10007 + }, + { + "epoch": 3.0718232044198897, + "grad_norm": 0.3861970603466034, + "learning_rate": 8.117504525267632e-05, + "loss": 1.8518, + "step": 10008 + }, + { + "epoch": 3.0721301411909145, + "grad_norm": 0.35433146357536316, + "learning_rate": 8.117115900246652e-05, + "loss": 1.8601, + "step": 10009 + }, + { + "epoch": 3.07243707796194, + "grad_norm": 0.29796987771987915, + "learning_rate": 8.116727244420507e-05, + "loss": 1.7934, + "step": 10010 + }, + { + "epoch": 3.072744014732965, + "grad_norm": 0.3091779947280884, + "learning_rate": 8.116338557793035e-05, + "loss": 1.8111, + "step": 10011 + }, + { + "epoch": 3.07305095150399, + "grad_norm": 0.2741319537162781, + "learning_rate": 8.11594984036808e-05, + "loss": 1.8079, + "step": 10012 + }, + { + "epoch": 3.0733578882750154, + "grad_norm": 0.28905320167541504, + "learning_rate": 8.115561092149482e-05, + "loss": 1.8475, + "step": 10013 + }, + { + "epoch": 3.0736648250460403, + "grad_norm": 0.2897081673145294, + "learning_rate": 8.115172313141081e-05, + "loss": 1.838, + "step": 10014 + }, + { + "epoch": 3.0739717618170657, + "grad_norm": 0.2620783746242523, + "learning_rate": 8.114783503346725e-05, + "loss": 1.8024, + "step": 10015 + }, + { + "epoch": 3.074278698588091, + "grad_norm": 0.26478636264801025, + "learning_rate": 8.11439466277025e-05, + "loss": 1.8137, + "step": 10016 + }, + { + "epoch": 3.074585635359116, + "grad_norm": 0.2796174883842468, + "learning_rate": 8.114005791415502e-05, + "loss": 1.7976, + "step": 10017 + }, + { + "epoch": 3.074892572130141, + "grad_norm": 0.26813286542892456, + "learning_rate": 8.113616889286325e-05, + "loss": 1.7945, + "step": 10018 + }, + { + "epoch": 3.0751995089011666, + "grad_norm": 
0.2443828582763672, + "learning_rate": 8.11322795638656e-05, + "loss": 1.7829, + "step": 10019 + }, + { + "epoch": 3.0755064456721914, + "grad_norm": 0.2981395423412323, + "learning_rate": 8.112838992720053e-05, + "loss": 1.7928, + "step": 10020 + }, + { + "epoch": 3.075813382443217, + "grad_norm": 0.25605037808418274, + "learning_rate": 8.112449998290644e-05, + "loss": 1.8129, + "step": 10021 + }, + { + "epoch": 3.0761203192142417, + "grad_norm": 0.31180307269096375, + "learning_rate": 8.112060973102181e-05, + "loss": 1.7393, + "step": 10022 + }, + { + "epoch": 3.076427255985267, + "grad_norm": 0.3230421543121338, + "learning_rate": 8.111671917158508e-05, + "loss": 1.818, + "step": 10023 + }, + { + "epoch": 3.0767341927562923, + "grad_norm": 0.3158549964427948, + "learning_rate": 8.111282830463468e-05, + "loss": 1.7582, + "step": 10024 + }, + { + "epoch": 3.0770411295273172, + "grad_norm": 0.24524325132369995, + "learning_rate": 8.110893713020908e-05, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 3.0773480662983426, + "grad_norm": 0.2793932259082794, + "learning_rate": 8.110504564834675e-05, + "loss": 1.8551, + "step": 10026 + }, + { + "epoch": 3.077655003069368, + "grad_norm": 0.29629403352737427, + "learning_rate": 8.110115385908612e-05, + "loss": 1.8019, + "step": 10027 + }, + { + "epoch": 3.077961939840393, + "grad_norm": 0.3138490915298462, + "learning_rate": 8.109726176246564e-05, + "loss": 1.8436, + "step": 10028 + }, + { + "epoch": 3.078268876611418, + "grad_norm": 0.29802024364471436, + "learning_rate": 8.10933693585238e-05, + "loss": 1.8158, + "step": 10029 + }, + { + "epoch": 3.078575813382443, + "grad_norm": 0.30785220861434937, + "learning_rate": 8.108947664729907e-05, + "loss": 1.8674, + "step": 10030 + }, + { + "epoch": 3.0788827501534684, + "grad_norm": 0.277662992477417, + "learning_rate": 8.10855836288299e-05, + "loss": 1.8253, + "step": 10031 + }, + { + "epoch": 3.0791896869244937, + "grad_norm": 0.27399590611457825, + "learning_rate": 
8.108169030315477e-05, + "loss": 1.8587, + "step": 10032 + }, + { + "epoch": 3.0794966236955186, + "grad_norm": 0.28398239612579346, + "learning_rate": 8.107779667031217e-05, + "loss": 1.8326, + "step": 10033 + }, + { + "epoch": 3.079803560466544, + "grad_norm": 0.2882741093635559, + "learning_rate": 8.107390273034057e-05, + "loss": 1.785, + "step": 10034 + }, + { + "epoch": 3.0801104972375692, + "grad_norm": 0.271043598651886, + "learning_rate": 8.107000848327843e-05, + "loss": 1.765, + "step": 10035 + }, + { + "epoch": 3.080417434008594, + "grad_norm": 0.2589638829231262, + "learning_rate": 8.106611392916427e-05, + "loss": 1.8136, + "step": 10036 + }, + { + "epoch": 3.0807243707796195, + "grad_norm": 0.3068227469921112, + "learning_rate": 8.106221906803656e-05, + "loss": 1.8034, + "step": 10037 + }, + { + "epoch": 3.0810313075506444, + "grad_norm": 0.2714168131351471, + "learning_rate": 8.105832389993379e-05, + "loss": 1.8007, + "step": 10038 + }, + { + "epoch": 3.0813382443216697, + "grad_norm": 0.2747504711151123, + "learning_rate": 8.105442842489447e-05, + "loss": 1.8135, + "step": 10039 + }, + { + "epoch": 3.081645181092695, + "grad_norm": 0.2719285488128662, + "learning_rate": 8.105053264295708e-05, + "loss": 1.7629, + "step": 10040 + }, + { + "epoch": 3.08195211786372, + "grad_norm": 0.3119582235813141, + "learning_rate": 8.104663655416014e-05, + "loss": 1.7887, + "step": 10041 + }, + { + "epoch": 3.0822590546347453, + "grad_norm": 0.35965192317962646, + "learning_rate": 8.104274015854212e-05, + "loss": 1.8484, + "step": 10042 + }, + { + "epoch": 3.0825659914057706, + "grad_norm": 0.3045980632305145, + "learning_rate": 8.103884345614157e-05, + "loss": 1.8625, + "step": 10043 + }, + { + "epoch": 3.0828729281767955, + "grad_norm": 0.2925138473510742, + "learning_rate": 8.103494644699696e-05, + "loss": 1.9306, + "step": 10044 + }, + { + "epoch": 3.083179864947821, + "grad_norm": 0.2894277274608612, + "learning_rate": 8.103104913114681e-05, + "loss": 1.7796, + 
"step": 10045 + }, + { + "epoch": 3.0834868017188457, + "grad_norm": 0.2776826322078705, + "learning_rate": 8.102715150862967e-05, + "loss": 1.8169, + "step": 10046 + }, + { + "epoch": 3.083793738489871, + "grad_norm": 0.3315230906009674, + "learning_rate": 8.102325357948402e-05, + "loss": 1.8139, + "step": 10047 + }, + { + "epoch": 3.0841006752608964, + "grad_norm": 0.2906761169433594, + "learning_rate": 8.10193553437484e-05, + "loss": 1.8162, + "step": 10048 + }, + { + "epoch": 3.0844076120319213, + "grad_norm": 0.32681339979171753, + "learning_rate": 8.101545680146132e-05, + "loss": 1.8245, + "step": 10049 + }, + { + "epoch": 3.0847145488029466, + "grad_norm": 0.32525795698165894, + "learning_rate": 8.101155795266131e-05, + "loss": 1.8605, + "step": 10050 + }, + { + "epoch": 3.085021485573972, + "grad_norm": 0.31705379486083984, + "learning_rate": 8.100765879738692e-05, + "loss": 1.8214, + "step": 10051 + }, + { + "epoch": 3.085328422344997, + "grad_norm": 0.27772918343544006, + "learning_rate": 8.100375933567668e-05, + "loss": 1.7822, + "step": 10052 + }, + { + "epoch": 3.085635359116022, + "grad_norm": 0.2877809405326843, + "learning_rate": 8.09998595675691e-05, + "loss": 1.7935, + "step": 10053 + }, + { + "epoch": 3.0859422958870475, + "grad_norm": 0.29759806394577026, + "learning_rate": 8.099595949310276e-05, + "loss": 1.8041, + "step": 10054 + }, + { + "epoch": 3.0862492326580724, + "grad_norm": 0.2715320289134979, + "learning_rate": 8.099205911231617e-05, + "loss": 1.7923, + "step": 10055 + }, + { + "epoch": 3.0865561694290977, + "grad_norm": 0.33566340804100037, + "learning_rate": 8.098815842524789e-05, + "loss": 1.7953, + "step": 10056 + }, + { + "epoch": 3.0868631062001226, + "grad_norm": 0.3360871970653534, + "learning_rate": 8.098425743193645e-05, + "loss": 1.8275, + "step": 10057 + }, + { + "epoch": 3.087170042971148, + "grad_norm": 0.2797739803791046, + "learning_rate": 8.098035613242043e-05, + "loss": 1.7597, + "step": 10058 + }, + { + "epoch": 
3.0874769797421733, + "grad_norm": 0.25500187277793884, + "learning_rate": 8.097645452673837e-05, + "loss": 1.8059, + "step": 10059 + }, + { + "epoch": 3.087783916513198, + "grad_norm": 0.28042587637901306, + "learning_rate": 8.097255261492884e-05, + "loss": 1.7954, + "step": 10060 + }, + { + "epoch": 3.0880908532842235, + "grad_norm": 0.3616262376308441, + "learning_rate": 8.096865039703038e-05, + "loss": 1.8605, + "step": 10061 + }, + { + "epoch": 3.0883977900552484, + "grad_norm": 0.3453714847564697, + "learning_rate": 8.096474787308157e-05, + "loss": 1.7643, + "step": 10062 + }, + { + "epoch": 3.0887047268262737, + "grad_norm": 0.3192278742790222, + "learning_rate": 8.096084504312098e-05, + "loss": 1.8415, + "step": 10063 + }, + { + "epoch": 3.089011663597299, + "grad_norm": 0.2714482545852661, + "learning_rate": 8.095694190718715e-05, + "loss": 1.8204, + "step": 10064 + }, + { + "epoch": 3.089318600368324, + "grad_norm": 0.26562005281448364, + "learning_rate": 8.09530384653187e-05, + "loss": 1.7322, + "step": 10065 + }, + { + "epoch": 3.0896255371393493, + "grad_norm": 0.33727800846099854, + "learning_rate": 8.094913471755417e-05, + "loss": 1.8221, + "step": 10066 + }, + { + "epoch": 3.0899324739103746, + "grad_norm": 0.3561044931411743, + "learning_rate": 8.094523066393215e-05, + "loss": 1.8879, + "step": 10067 + }, + { + "epoch": 3.0902394106813995, + "grad_norm": 0.2568742334842682, + "learning_rate": 8.094132630449122e-05, + "loss": 1.8178, + "step": 10068 + }, + { + "epoch": 3.090546347452425, + "grad_norm": 0.4025525450706482, + "learning_rate": 8.093742163926998e-05, + "loss": 1.8186, + "step": 10069 + }, + { + "epoch": 3.09085328422345, + "grad_norm": 0.43863433599472046, + "learning_rate": 8.0933516668307e-05, + "loss": 1.8371, + "step": 10070 + }, + { + "epoch": 3.091160220994475, + "grad_norm": 0.34873950481414795, + "learning_rate": 8.092961139164087e-05, + "loss": 1.8083, + "step": 10071 + }, + { + "epoch": 3.0914671577655004, + "grad_norm": 
0.31433534622192383, + "learning_rate": 8.092570580931021e-05, + "loss": 1.8154, + "step": 10072 + }, + { + "epoch": 3.0917740945365253, + "grad_norm": 0.25523966550827026, + "learning_rate": 8.092179992135358e-05, + "loss": 1.8158, + "step": 10073 + }, + { + "epoch": 3.0920810313075506, + "grad_norm": 0.348469078540802, + "learning_rate": 8.09178937278096e-05, + "loss": 1.8358, + "step": 10074 + }, + { + "epoch": 3.092387968078576, + "grad_norm": 0.33455297350883484, + "learning_rate": 8.091398722871688e-05, + "loss": 1.7779, + "step": 10075 + }, + { + "epoch": 3.092694904849601, + "grad_norm": 0.36544880270957947, + "learning_rate": 8.091008042411403e-05, + "loss": 1.9186, + "step": 10076 + }, + { + "epoch": 3.093001841620626, + "grad_norm": 0.29165831208229065, + "learning_rate": 8.090617331403965e-05, + "loss": 1.8964, + "step": 10077 + }, + { + "epoch": 3.0933087783916515, + "grad_norm": 0.31011059880256653, + "learning_rate": 8.090226589853234e-05, + "loss": 1.8453, + "step": 10078 + }, + { + "epoch": 3.0936157151626764, + "grad_norm": 0.2835703492164612, + "learning_rate": 8.089835817763071e-05, + "loss": 1.7718, + "step": 10079 + }, + { + "epoch": 3.0939226519337018, + "grad_norm": 0.2910583019256592, + "learning_rate": 8.08944501513734e-05, + "loss": 1.7881, + "step": 10080 + }, + { + "epoch": 3.0942295887047266, + "grad_norm": 0.391303688287735, + "learning_rate": 8.089054181979905e-05, + "loss": 1.7915, + "step": 10081 + }, + { + "epoch": 3.094536525475752, + "grad_norm": 0.4119330048561096, + "learning_rate": 8.088663318294623e-05, + "loss": 1.7975, + "step": 10082 + }, + { + "epoch": 3.0948434622467773, + "grad_norm": 0.2980102002620697, + "learning_rate": 8.088272424085361e-05, + "loss": 1.805, + "step": 10083 + }, + { + "epoch": 3.095150399017802, + "grad_norm": 0.3089980483055115, + "learning_rate": 8.087881499355983e-05, + "loss": 1.8265, + "step": 10084 + }, + { + "epoch": 3.0954573357888275, + "grad_norm": 0.3851003348827362, + "learning_rate": 
8.087490544110348e-05, + "loss": 1.8174, + "step": 10085 + }, + { + "epoch": 3.095764272559853, + "grad_norm": 0.42357420921325684, + "learning_rate": 8.08709955835232e-05, + "loss": 1.8083, + "step": 10086 + }, + { + "epoch": 3.0960712093308778, + "grad_norm": 0.291777640581131, + "learning_rate": 8.086708542085768e-05, + "loss": 1.7713, + "step": 10087 + }, + { + "epoch": 3.096378146101903, + "grad_norm": 0.2563805878162384, + "learning_rate": 8.086317495314552e-05, + "loss": 1.7691, + "step": 10088 + }, + { + "epoch": 3.096685082872928, + "grad_norm": 0.3418877422809601, + "learning_rate": 8.085926418042536e-05, + "loss": 1.8547, + "step": 10089 + }, + { + "epoch": 3.0969920196439533, + "grad_norm": 0.3859385550022125, + "learning_rate": 8.085535310273589e-05, + "loss": 1.8226, + "step": 10090 + }, + { + "epoch": 3.0972989564149787, + "grad_norm": 0.3427267372608185, + "learning_rate": 8.085144172011571e-05, + "loss": 1.837, + "step": 10091 + }, + { + "epoch": 3.0976058931860035, + "grad_norm": 0.29290953278541565, + "learning_rate": 8.084753003260352e-05, + "loss": 1.8392, + "step": 10092 + }, + { + "epoch": 3.097912829957029, + "grad_norm": 0.33282020688056946, + "learning_rate": 8.084361804023795e-05, + "loss": 1.8351, + "step": 10093 + }, + { + "epoch": 3.098219766728054, + "grad_norm": 0.3802134394645691, + "learning_rate": 8.083970574305768e-05, + "loss": 1.7467, + "step": 10094 + }, + { + "epoch": 3.098526703499079, + "grad_norm": 0.3142111897468567, + "learning_rate": 8.083579314110135e-05, + "loss": 1.7966, + "step": 10095 + }, + { + "epoch": 3.0988336402701044, + "grad_norm": 0.2956278324127197, + "learning_rate": 8.083188023440765e-05, + "loss": 1.8724, + "step": 10096 + }, + { + "epoch": 3.0991405770411293, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.082796702301522e-05, + "loss": 1.8448, + "step": 10097 + }, + { + "epoch": 3.0994475138121547, + "grad_norm": 0.29358017444610596, + "learning_rate": 8.082405350696276e-05, + "loss": 1.8679, 
+ "step": 10098 + }, + { + "epoch": 3.09975445058318, + "grad_norm": 0.36439722776412964, + "learning_rate": 8.082013968628893e-05, + "loss": 1.8801, + "step": 10099 + }, + { + "epoch": 3.100061387354205, + "grad_norm": 0.3565322458744049, + "learning_rate": 8.081622556103244e-05, + "loss": 1.794, + "step": 10100 + }, + { + "epoch": 3.1003683241252302, + "grad_norm": 0.2841760814189911, + "learning_rate": 8.081231113123191e-05, + "loss": 1.7593, + "step": 10101 + }, + { + "epoch": 3.1006752608962556, + "grad_norm": 0.28589630126953125, + "learning_rate": 8.080839639692608e-05, + "loss": 1.864, + "step": 10102 + }, + { + "epoch": 3.1009821976672804, + "grad_norm": 0.3595057427883148, + "learning_rate": 8.080448135815362e-05, + "loss": 1.8067, + "step": 10103 + }, + { + "epoch": 3.101289134438306, + "grad_norm": 0.3909708261489868, + "learning_rate": 8.080056601495322e-05, + "loss": 1.8601, + "step": 10104 + }, + { + "epoch": 3.1015960712093307, + "grad_norm": 0.35180148482322693, + "learning_rate": 8.079665036736358e-05, + "loss": 1.8328, + "step": 10105 + }, + { + "epoch": 3.101903007980356, + "grad_norm": 0.3065175712108612, + "learning_rate": 8.079273441542338e-05, + "loss": 1.8449, + "step": 10106 + }, + { + "epoch": 3.1022099447513813, + "grad_norm": 0.31358617544174194, + "learning_rate": 8.078881815917134e-05, + "loss": 1.8325, + "step": 10107 + }, + { + "epoch": 3.1025168815224062, + "grad_norm": 0.4737118184566498, + "learning_rate": 8.078490159864614e-05, + "loss": 1.8232, + "step": 10108 + }, + { + "epoch": 3.1028238182934316, + "grad_norm": 0.435148686170578, + "learning_rate": 8.078098473388651e-05, + "loss": 1.8227, + "step": 10109 + }, + { + "epoch": 3.103130755064457, + "grad_norm": 0.3080987334251404, + "learning_rate": 8.077706756493115e-05, + "loss": 1.8072, + "step": 10110 + }, + { + "epoch": 3.103437691835482, + "grad_norm": 0.3225170075893402, + "learning_rate": 8.077315009181876e-05, + "loss": 1.7716, + "step": 10111 + }, + { + "epoch": 
3.103744628606507, + "grad_norm": 0.46642443537712097, + "learning_rate": 8.076923231458808e-05, + "loss": 1.8295, + "step": 10112 + }, + { + "epoch": 3.104051565377532, + "grad_norm": 0.42561766505241394, + "learning_rate": 8.07653142332778e-05, + "loss": 1.8553, + "step": 10113 + }, + { + "epoch": 3.1043585021485574, + "grad_norm": 0.27187541127204895, + "learning_rate": 8.076139584792664e-05, + "loss": 1.7937, + "step": 10114 + }, + { + "epoch": 3.1046654389195827, + "grad_norm": 0.27822238206863403, + "learning_rate": 8.075747715857335e-05, + "loss": 1.8151, + "step": 10115 + }, + { + "epoch": 3.1049723756906076, + "grad_norm": 0.40106478333473206, + "learning_rate": 8.075355816525665e-05, + "loss": 1.8637, + "step": 10116 + }, + { + "epoch": 3.105279312461633, + "grad_norm": 0.33455124497413635, + "learning_rate": 8.074963886801525e-05, + "loss": 1.8543, + "step": 10117 + }, + { + "epoch": 3.1055862492326582, + "grad_norm": 0.32246437668800354, + "learning_rate": 8.07457192668879e-05, + "loss": 1.7907, + "step": 10118 + }, + { + "epoch": 3.105893186003683, + "grad_norm": 0.45360109210014343, + "learning_rate": 8.074179936191332e-05, + "loss": 1.7404, + "step": 10119 + }, + { + "epoch": 3.1062001227747085, + "grad_norm": 0.445916086435318, + "learning_rate": 8.07378791531303e-05, + "loss": 1.778, + "step": 10120 + }, + { + "epoch": 3.1065070595457334, + "grad_norm": 0.28561538457870483, + "learning_rate": 8.073395864057751e-05, + "loss": 1.8723, + "step": 10121 + }, + { + "epoch": 3.1068139963167587, + "grad_norm": 0.3258218467235565, + "learning_rate": 8.073003782429373e-05, + "loss": 1.8106, + "step": 10122 + }, + { + "epoch": 3.107120933087784, + "grad_norm": 0.5459560751914978, + "learning_rate": 8.07261167043177e-05, + "loss": 1.8022, + "step": 10123 + }, + { + "epoch": 3.107427869858809, + "grad_norm": 0.4828549921512604, + "learning_rate": 8.072219528068819e-05, + "loss": 1.7556, + "step": 10124 + }, + { + "epoch": 3.1077348066298343, + "grad_norm": 
0.24075324833393097, + "learning_rate": 8.071827355344393e-05, + "loss": 1.7901, + "step": 10125 + }, + { + "epoch": 3.1080417434008596, + "grad_norm": 0.44677188992500305, + "learning_rate": 8.071435152262367e-05, + "loss": 1.7858, + "step": 10126 + }, + { + "epoch": 3.1083486801718845, + "grad_norm": 0.49862590432167053, + "learning_rate": 8.071042918826622e-05, + "loss": 1.805, + "step": 10127 + }, + { + "epoch": 3.10865561694291, + "grad_norm": 0.30883491039276123, + "learning_rate": 8.07065065504103e-05, + "loss": 1.7693, + "step": 10128 + }, + { + "epoch": 3.108962553713935, + "grad_norm": 0.29583030939102173, + "learning_rate": 8.070258360909467e-05, + "loss": 1.8141, + "step": 10129 + }, + { + "epoch": 3.10926949048496, + "grad_norm": 0.3595346510410309, + "learning_rate": 8.069866036435812e-05, + "loss": 1.8286, + "step": 10130 + }, + { + "epoch": 3.1095764272559854, + "grad_norm": 0.3215504288673401, + "learning_rate": 8.069473681623942e-05, + "loss": 1.8557, + "step": 10131 + }, + { + "epoch": 3.1098833640270103, + "grad_norm": 0.29734939336776733, + "learning_rate": 8.069081296477734e-05, + "loss": 1.7996, + "step": 10132 + }, + { + "epoch": 3.1101903007980356, + "grad_norm": 0.33546003699302673, + "learning_rate": 8.068688881001065e-05, + "loss": 1.8307, + "step": 10133 + }, + { + "epoch": 3.110497237569061, + "grad_norm": 0.3886832296848297, + "learning_rate": 8.068296435197814e-05, + "loss": 1.751, + "step": 10134 + }, + { + "epoch": 3.110804174340086, + "grad_norm": 0.34505394101142883, + "learning_rate": 8.06790395907186e-05, + "loss": 1.7543, + "step": 10135 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.27018141746520996, + "learning_rate": 8.06751145262708e-05, + "loss": 1.8109, + "step": 10136 + }, + { + "epoch": 3.1114180478821365, + "grad_norm": 0.3367149531841278, + "learning_rate": 8.067118915867355e-05, + "loss": 1.8025, + "step": 10137 + }, + { + "epoch": 3.1117249846531614, + "grad_norm": 0.40811091661453247, + "learning_rate": 
8.066726348796562e-05, + "loss": 1.7327, + "step": 10138 + }, + { + "epoch": 3.1120319214241867, + "grad_norm": 0.3511471152305603, + "learning_rate": 8.066333751418583e-05, + "loss": 1.8711, + "step": 10139 + }, + { + "epoch": 3.1123388581952116, + "grad_norm": 0.3112446367740631, + "learning_rate": 8.065941123737295e-05, + "loss": 1.8621, + "step": 10140 + }, + { + "epoch": 3.112645794966237, + "grad_norm": 0.3424238860607147, + "learning_rate": 8.065548465756581e-05, + "loss": 1.8383, + "step": 10141 + }, + { + "epoch": 3.1129527317372623, + "grad_norm": 0.380013108253479, + "learning_rate": 8.06515577748032e-05, + "loss": 1.8121, + "step": 10142 + }, + { + "epoch": 3.113259668508287, + "grad_norm": 0.2650558650493622, + "learning_rate": 8.064763058912393e-05, + "loss": 1.866, + "step": 10143 + }, + { + "epoch": 3.1135666052793125, + "grad_norm": 0.30580762028694153, + "learning_rate": 8.06437031005668e-05, + "loss": 1.7769, + "step": 10144 + }, + { + "epoch": 3.113873542050338, + "grad_norm": 0.29927194118499756, + "learning_rate": 8.063977530917066e-05, + "loss": 1.7897, + "step": 10145 + }, + { + "epoch": 3.1141804788213627, + "grad_norm": 0.24322012066841125, + "learning_rate": 8.063584721497429e-05, + "loss": 1.7968, + "step": 10146 + }, + { + "epoch": 3.114487415592388, + "grad_norm": 0.3082945644855499, + "learning_rate": 8.063191881801651e-05, + "loss": 1.8456, + "step": 10147 + }, + { + "epoch": 3.114794352363413, + "grad_norm": 0.3247329890727997, + "learning_rate": 8.062799011833617e-05, + "loss": 1.7436, + "step": 10148 + }, + { + "epoch": 3.1151012891344383, + "grad_norm": 0.27591946721076965, + "learning_rate": 8.062406111597207e-05, + "loss": 1.7976, + "step": 10149 + }, + { + "epoch": 3.1154082259054636, + "grad_norm": 0.2752058804035187, + "learning_rate": 8.062013181096306e-05, + "loss": 1.7814, + "step": 10150 + }, + { + "epoch": 3.1157151626764885, + "grad_norm": 0.3207196891307831, + "learning_rate": 8.061620220334795e-05, + "loss": 1.7767, 
+ "step": 10151 + }, + { + "epoch": 3.116022099447514, + "grad_norm": 0.2895309627056122, + "learning_rate": 8.061227229316559e-05, + "loss": 1.8588, + "step": 10152 + }, + { + "epoch": 3.116329036218539, + "grad_norm": 0.333843469619751, + "learning_rate": 8.060834208045481e-05, + "loss": 1.7871, + "step": 10153 + }, + { + "epoch": 3.116635972989564, + "grad_norm": 0.43877774477005005, + "learning_rate": 8.060441156525445e-05, + "loss": 1.8165, + "step": 10154 + }, + { + "epoch": 3.1169429097605894, + "grad_norm": 0.35700589418411255, + "learning_rate": 8.060048074760337e-05, + "loss": 1.777, + "step": 10155 + }, + { + "epoch": 3.1172498465316143, + "grad_norm": 0.26124534010887146, + "learning_rate": 8.059654962754039e-05, + "loss": 1.8343, + "step": 10156 + }, + { + "epoch": 3.1175567833026396, + "grad_norm": 0.331444650888443, + "learning_rate": 8.059261820510438e-05, + "loss": 1.9437, + "step": 10157 + }, + { + "epoch": 3.117863720073665, + "grad_norm": 0.31657731533050537, + "learning_rate": 8.058868648033419e-05, + "loss": 1.7621, + "step": 10158 + }, + { + "epoch": 3.11817065684469, + "grad_norm": 0.2785957455635071, + "learning_rate": 8.058475445326867e-05, + "loss": 1.9049, + "step": 10159 + }, + { + "epoch": 3.118477593615715, + "grad_norm": 0.2605743408203125, + "learning_rate": 8.058082212394667e-05, + "loss": 1.7895, + "step": 10160 + }, + { + "epoch": 3.1187845303867405, + "grad_norm": 0.2981378138065338, + "learning_rate": 8.057688949240707e-05, + "loss": 1.8373, + "step": 10161 + }, + { + "epoch": 3.1190914671577654, + "grad_norm": 0.2944273054599762, + "learning_rate": 8.057295655868873e-05, + "loss": 1.8373, + "step": 10162 + }, + { + "epoch": 3.1193984039287908, + "grad_norm": 0.2696721851825714, + "learning_rate": 8.056902332283052e-05, + "loss": 1.8023, + "step": 10163 + }, + { + "epoch": 3.1197053406998156, + "grad_norm": 0.27659857273101807, + "learning_rate": 8.056508978487128e-05, + "loss": 1.8453, + "step": 10164 + }, + { + "epoch": 
3.120012277470841, + "grad_norm": 0.2982441186904907, + "learning_rate": 8.056115594484992e-05, + "loss": 1.9072, + "step": 10165 + }, + { + "epoch": 3.1203192142418663, + "grad_norm": 0.3136404752731323, + "learning_rate": 8.055722180280531e-05, + "loss": 1.8585, + "step": 10166 + }, + { + "epoch": 3.120626151012891, + "grad_norm": 0.2979940176010132, + "learning_rate": 8.055328735877631e-05, + "loss": 1.8699, + "step": 10167 + }, + { + "epoch": 3.1209330877839165, + "grad_norm": 0.2585618793964386, + "learning_rate": 8.054935261280184e-05, + "loss": 1.8323, + "step": 10168 + }, + { + "epoch": 3.121240024554942, + "grad_norm": 0.28734859824180603, + "learning_rate": 8.054541756492075e-05, + "loss": 1.8694, + "step": 10169 + }, + { + "epoch": 3.1215469613259668, + "grad_norm": 0.30582788586616516, + "learning_rate": 8.054148221517193e-05, + "loss": 1.856, + "step": 10170 + }, + { + "epoch": 3.121853898096992, + "grad_norm": 0.3128255009651184, + "learning_rate": 8.053754656359429e-05, + "loss": 1.8329, + "step": 10171 + }, + { + "epoch": 3.122160834868017, + "grad_norm": 0.2845318615436554, + "learning_rate": 8.053361061022671e-05, + "loss": 1.8111, + "step": 10172 + }, + { + "epoch": 3.1224677716390423, + "grad_norm": 0.2994609773159027, + "learning_rate": 8.05296743551081e-05, + "loss": 1.8157, + "step": 10173 + }, + { + "epoch": 3.1227747084100677, + "grad_norm": 0.26397961378097534, + "learning_rate": 8.052573779827737e-05, + "loss": 1.8572, + "step": 10174 + }, + { + "epoch": 3.1230816451810925, + "grad_norm": 0.2911500334739685, + "learning_rate": 8.052180093977339e-05, + "loss": 1.8312, + "step": 10175 + }, + { + "epoch": 3.123388581952118, + "grad_norm": 0.33455008268356323, + "learning_rate": 8.051786377963509e-05, + "loss": 1.8748, + "step": 10176 + }, + { + "epoch": 3.123695518723143, + "grad_norm": 0.3127586841583252, + "learning_rate": 8.051392631790135e-05, + "loss": 1.8224, + "step": 10177 + }, + { + "epoch": 3.124002455494168, + "grad_norm": 
0.2910686433315277, + "learning_rate": 8.050998855461113e-05, + "loss": 1.8557, + "step": 10178 + }, + { + "epoch": 3.1243093922651934, + "grad_norm": 0.2849208414554596, + "learning_rate": 8.050605048980333e-05, + "loss": 1.82, + "step": 10179 + }, + { + "epoch": 3.1246163290362183, + "grad_norm": 0.35189691185951233, + "learning_rate": 8.050211212351683e-05, + "loss": 1.7884, + "step": 10180 + }, + { + "epoch": 3.1249232658072437, + "grad_norm": 0.3641110360622406, + "learning_rate": 8.04981734557906e-05, + "loss": 1.7984, + "step": 10181 + }, + { + "epoch": 3.125230202578269, + "grad_norm": 0.3111717700958252, + "learning_rate": 8.049423448666353e-05, + "loss": 1.8134, + "step": 10182 + }, + { + "epoch": 3.125537139349294, + "grad_norm": 0.2608453631401062, + "learning_rate": 8.049029521617457e-05, + "loss": 1.765, + "step": 10183 + }, + { + "epoch": 3.1258440761203192, + "grad_norm": 0.28779423236846924, + "learning_rate": 8.048635564436265e-05, + "loss": 1.8355, + "step": 10184 + }, + { + "epoch": 3.1261510128913446, + "grad_norm": 0.38227665424346924, + "learning_rate": 8.048241577126668e-05, + "loss": 1.8487, + "step": 10185 + }, + { + "epoch": 3.1264579496623695, + "grad_norm": 0.3603171706199646, + "learning_rate": 8.047847559692562e-05, + "loss": 1.8035, + "step": 10186 + }, + { + "epoch": 3.126764886433395, + "grad_norm": 0.21950066089630127, + "learning_rate": 8.04745351213784e-05, + "loss": 1.7399, + "step": 10187 + }, + { + "epoch": 3.12707182320442, + "grad_norm": 0.2796075642108917, + "learning_rate": 8.047059434466395e-05, + "loss": 1.8229, + "step": 10188 + }, + { + "epoch": 3.127378759975445, + "grad_norm": 0.3382907807826996, + "learning_rate": 8.046665326682125e-05, + "loss": 1.7713, + "step": 10189 + }, + { + "epoch": 3.1276856967464703, + "grad_norm": 0.36472463607788086, + "learning_rate": 8.04627118878892e-05, + "loss": 1.8129, + "step": 10190 + }, + { + "epoch": 3.1279926335174952, + "grad_norm": 0.2971884310245514, + "learning_rate": 
8.045877020790679e-05, + "loss": 1.7894, + "step": 10191 + }, + { + "epoch": 3.1282995702885206, + "grad_norm": 0.2292303442955017, + "learning_rate": 8.045482822691297e-05, + "loss": 1.7637, + "step": 10192 + }, + { + "epoch": 3.128606507059546, + "grad_norm": 0.300750732421875, + "learning_rate": 8.045088594494668e-05, + "loss": 1.7678, + "step": 10193 + }, + { + "epoch": 3.128913443830571, + "grad_norm": 0.3121531009674072, + "learning_rate": 8.044694336204688e-05, + "loss": 1.8651, + "step": 10194 + }, + { + "epoch": 3.129220380601596, + "grad_norm": 0.2456093430519104, + "learning_rate": 8.044300047825254e-05, + "loss": 1.7769, + "step": 10195 + }, + { + "epoch": 3.129527317372621, + "grad_norm": 0.25085800886154175, + "learning_rate": 8.043905729360264e-05, + "loss": 1.7723, + "step": 10196 + }, + { + "epoch": 3.1298342541436464, + "grad_norm": 0.2505287826061249, + "learning_rate": 8.043511380813612e-05, + "loss": 1.7943, + "step": 10197 + }, + { + "epoch": 3.1301411909146717, + "grad_norm": 0.27144530415534973, + "learning_rate": 8.043117002189198e-05, + "loss": 1.8119, + "step": 10198 + }, + { + "epoch": 3.1304481276856966, + "grad_norm": 0.2702989876270294, + "learning_rate": 8.042722593490916e-05, + "loss": 1.8517, + "step": 10199 + }, + { + "epoch": 3.130755064456722, + "grad_norm": 0.2585136890411377, + "learning_rate": 8.042328154722667e-05, + "loss": 1.8382, + "step": 10200 + }, + { + "epoch": 3.1310620012277472, + "grad_norm": 0.26306065917015076, + "learning_rate": 8.041933685888348e-05, + "loss": 1.8211, + "step": 10201 + }, + { + "epoch": 3.131368937998772, + "grad_norm": 0.2208927720785141, + "learning_rate": 8.041539186991858e-05, + "loss": 1.7765, + "step": 10202 + }, + { + "epoch": 3.1316758747697975, + "grad_norm": 0.2756440043449402, + "learning_rate": 8.041144658037095e-05, + "loss": 1.898, + "step": 10203 + }, + { + "epoch": 3.131982811540823, + "grad_norm": 0.29718101024627686, + "learning_rate": 8.040750099027958e-05, + "loss": 1.8226, 
+ "step": 10204 + }, + { + "epoch": 3.1322897483118477, + "grad_norm": 0.3166738748550415, + "learning_rate": 8.040355509968345e-05, + "loss": 1.8129, + "step": 10205 + }, + { + "epoch": 3.132596685082873, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.039960890862158e-05, + "loss": 1.8915, + "step": 10206 + }, + { + "epoch": 3.132903621853898, + "grad_norm": 0.3015006184577942, + "learning_rate": 8.039566241713297e-05, + "loss": 1.8389, + "step": 10207 + }, + { + "epoch": 3.1332105586249233, + "grad_norm": 0.35226619243621826, + "learning_rate": 8.039171562525659e-05, + "loss": 1.7287, + "step": 10208 + }, + { + "epoch": 3.1335174953959486, + "grad_norm": 0.4290136694908142, + "learning_rate": 8.038776853303146e-05, + "loss": 1.8768, + "step": 10209 + }, + { + "epoch": 3.1338244321669735, + "grad_norm": 0.2828960418701172, + "learning_rate": 8.03838211404966e-05, + "loss": 1.7552, + "step": 10210 + }, + { + "epoch": 3.134131368937999, + "grad_norm": 0.3781953752040863, + "learning_rate": 8.0379873447691e-05, + "loss": 1.7812, + "step": 10211 + }, + { + "epoch": 3.1344383057090237, + "grad_norm": 0.4282926023006439, + "learning_rate": 8.037592545465371e-05, + "loss": 1.84, + "step": 10212 + }, + { + "epoch": 3.134745242480049, + "grad_norm": 0.2622411251068115, + "learning_rate": 8.03719771614237e-05, + "loss": 1.8114, + "step": 10213 + }, + { + "epoch": 3.1350521792510744, + "grad_norm": 0.34881457686424255, + "learning_rate": 8.036802856804001e-05, + "loss": 1.7694, + "step": 10214 + }, + { + "epoch": 3.1353591160220993, + "grad_norm": 0.40797632932662964, + "learning_rate": 8.036407967454167e-05, + "loss": 1.7595, + "step": 10215 + }, + { + "epoch": 3.1356660527931246, + "grad_norm": 0.24902814626693726, + "learning_rate": 8.036013048096769e-05, + "loss": 1.8068, + "step": 10216 + }, + { + "epoch": 3.13597298956415, + "grad_norm": 0.3682909607887268, + "learning_rate": 8.035618098735711e-05, + "loss": 1.8519, + "step": 10217 + }, + { + "epoch": 
3.136279926335175, + "grad_norm": 0.6111233234405518, + "learning_rate": 8.035223119374895e-05, + "loss": 1.9254, + "step": 10218 + }, + { + "epoch": 3.1365868631062, + "grad_norm": 0.4793062210083008, + "learning_rate": 8.034828110018227e-05, + "loss": 1.786, + "step": 10219 + }, + { + "epoch": 3.1368937998772255, + "grad_norm": 0.3074932396411896, + "learning_rate": 8.034433070669607e-05, + "loss": 1.8495, + "step": 10220 + }, + { + "epoch": 3.1372007366482504, + "grad_norm": 0.4366479218006134, + "learning_rate": 8.034038001332942e-05, + "loss": 1.8501, + "step": 10221 + }, + { + "epoch": 3.1375076734192757, + "grad_norm": 0.4660070538520813, + "learning_rate": 8.033642902012135e-05, + "loss": 1.8317, + "step": 10222 + }, + { + "epoch": 3.1378146101903006, + "grad_norm": 0.3452899158000946, + "learning_rate": 8.03324777271109e-05, + "loss": 1.8702, + "step": 10223 + }, + { + "epoch": 3.138121546961326, + "grad_norm": 0.3658824563026428, + "learning_rate": 8.032852613433713e-05, + "loss": 1.8754, + "step": 10224 + }, + { + "epoch": 3.1384284837323513, + "grad_norm": 0.3777768909931183, + "learning_rate": 8.03245742418391e-05, + "loss": 1.8613, + "step": 10225 + }, + { + "epoch": 3.138735420503376, + "grad_norm": 0.3873192071914673, + "learning_rate": 8.032062204965582e-05, + "loss": 1.8438, + "step": 10226 + }, + { + "epoch": 3.1390423572744015, + "grad_norm": 0.30686715245246887, + "learning_rate": 8.031666955782641e-05, + "loss": 1.811, + "step": 10227 + }, + { + "epoch": 3.139349294045427, + "grad_norm": 0.2738516330718994, + "learning_rate": 8.03127167663899e-05, + "loss": 1.757, + "step": 10228 + }, + { + "epoch": 3.1396562308164517, + "grad_norm": 0.3093133270740509, + "learning_rate": 8.030876367538536e-05, + "loss": 1.8181, + "step": 10229 + }, + { + "epoch": 3.139963167587477, + "grad_norm": 0.3247159719467163, + "learning_rate": 8.030481028485185e-05, + "loss": 1.7798, + "step": 10230 + }, + { + "epoch": 3.140270104358502, + "grad_norm": 
0.2855088412761688, + "learning_rate": 8.030085659482845e-05, + "loss": 1.825, + "step": 10231 + }, + { + "epoch": 3.1405770411295273, + "grad_norm": 0.2818242907524109, + "learning_rate": 8.02969026053542e-05, + "loss": 1.7737, + "step": 10232 + }, + { + "epoch": 3.1408839779005526, + "grad_norm": 0.27074751257896423, + "learning_rate": 8.029294831646822e-05, + "loss": 1.8306, + "step": 10233 + }, + { + "epoch": 3.1411909146715775, + "grad_norm": 0.29740920662879944, + "learning_rate": 8.028899372820954e-05, + "loss": 1.8157, + "step": 10234 + }, + { + "epoch": 3.141497851442603, + "grad_norm": 0.30743202567100525, + "learning_rate": 8.028503884061731e-05, + "loss": 1.7626, + "step": 10235 + }, + { + "epoch": 3.141804788213628, + "grad_norm": 0.27812567353248596, + "learning_rate": 8.028108365373058e-05, + "loss": 1.7604, + "step": 10236 + }, + { + "epoch": 3.142111724984653, + "grad_norm": 0.26212629675865173, + "learning_rate": 8.027712816758839e-05, + "loss": 1.8161, + "step": 10237 + }, + { + "epoch": 3.1424186617556784, + "grad_norm": 0.3611658811569214, + "learning_rate": 8.02731723822299e-05, + "loss": 1.8283, + "step": 10238 + }, + { + "epoch": 3.1427255985267033, + "grad_norm": 0.31705498695373535, + "learning_rate": 8.026921629769418e-05, + "loss": 1.7986, + "step": 10239 + }, + { + "epoch": 3.1430325352977286, + "grad_norm": 0.25905972719192505, + "learning_rate": 8.026525991402032e-05, + "loss": 1.7926, + "step": 10240 + }, + { + "epoch": 3.143339472068754, + "grad_norm": 0.42376595735549927, + "learning_rate": 8.026130323124741e-05, + "loss": 1.8275, + "step": 10241 + }, + { + "epoch": 3.143646408839779, + "grad_norm": 0.415556401014328, + "learning_rate": 8.025734624941458e-05, + "loss": 1.7938, + "step": 10242 + }, + { + "epoch": 3.143953345610804, + "grad_norm": 0.3558904528617859, + "learning_rate": 8.025338896856091e-05, + "loss": 1.836, + "step": 10243 + }, + { + "epoch": 3.1442602823818295, + "grad_norm": 0.3091062307357788, + "learning_rate": 
8.024943138872553e-05, + "loss": 1.8285, + "step": 10244 + }, + { + "epoch": 3.1445672191528544, + "grad_norm": 0.2620905041694641, + "learning_rate": 8.024547350994753e-05, + "loss": 1.7115, + "step": 10245 + }, + { + "epoch": 3.1448741559238798, + "grad_norm": 0.25716835260391235, + "learning_rate": 8.024151533226604e-05, + "loss": 1.7702, + "step": 10246 + }, + { + "epoch": 3.1451810926949046, + "grad_norm": 0.250844269990921, + "learning_rate": 8.023755685572017e-05, + "loss": 1.7617, + "step": 10247 + }, + { + "epoch": 3.14548802946593, + "grad_norm": 0.23898956179618835, + "learning_rate": 8.023359808034903e-05, + "loss": 1.7872, + "step": 10248 + }, + { + "epoch": 3.1457949662369553, + "grad_norm": 0.2335387021303177, + "learning_rate": 8.022963900619176e-05, + "loss": 1.7656, + "step": 10249 + }, + { + "epoch": 3.14610190300798, + "grad_norm": 0.21822704374790192, + "learning_rate": 8.022567963328749e-05, + "loss": 1.7706, + "step": 10250 + }, + { + "epoch": 3.1464088397790055, + "grad_norm": 0.2627898156642914, + "learning_rate": 8.022171996167531e-05, + "loss": 1.8559, + "step": 10251 + }, + { + "epoch": 3.146715776550031, + "grad_norm": 0.2530064582824707, + "learning_rate": 8.021775999139441e-05, + "loss": 1.788, + "step": 10252 + }, + { + "epoch": 3.1470227133210558, + "grad_norm": 0.2293635457754135, + "learning_rate": 8.021379972248387e-05, + "loss": 1.8129, + "step": 10253 + }, + { + "epoch": 3.147329650092081, + "grad_norm": 0.27753588557243347, + "learning_rate": 8.020983915498286e-05, + "loss": 1.7957, + "step": 10254 + }, + { + "epoch": 3.147636586863106, + "grad_norm": 0.24507668614387512, + "learning_rate": 8.020587828893051e-05, + "loss": 1.7969, + "step": 10255 + }, + { + "epoch": 3.1479435236341313, + "grad_norm": 0.24818891286849976, + "learning_rate": 8.020191712436598e-05, + "loss": 1.8412, + "step": 10256 + }, + { + "epoch": 3.1482504604051567, + "grad_norm": 0.2463149130344391, + "learning_rate": 8.01979556613284e-05, + "loss": 1.8097, 
+ "step": 10257 + }, + { + "epoch": 3.1485573971761815, + "grad_norm": 0.26742151379585266, + "learning_rate": 8.019399389985692e-05, + "loss": 1.8487, + "step": 10258 + }, + { + "epoch": 3.148864333947207, + "grad_norm": 0.3078254461288452, + "learning_rate": 8.01900318399907e-05, + "loss": 1.8189, + "step": 10259 + }, + { + "epoch": 3.149171270718232, + "grad_norm": 0.3819321393966675, + "learning_rate": 8.018606948176887e-05, + "loss": 1.8019, + "step": 10260 + }, + { + "epoch": 3.149478207489257, + "grad_norm": 0.3932126462459564, + "learning_rate": 8.018210682523061e-05, + "loss": 1.787, + "step": 10261 + }, + { + "epoch": 3.1497851442602824, + "grad_norm": 0.2696186900138855, + "learning_rate": 8.017814387041511e-05, + "loss": 1.8345, + "step": 10262 + }, + { + "epoch": 3.150092081031308, + "grad_norm": 0.32631832361221313, + "learning_rate": 8.017418061736149e-05, + "loss": 1.7724, + "step": 10263 + }, + { + "epoch": 3.1503990178023327, + "grad_norm": 0.36187833547592163, + "learning_rate": 8.017021706610893e-05, + "loss": 1.7829, + "step": 10264 + }, + { + "epoch": 3.150705954573358, + "grad_norm": 0.29678142070770264, + "learning_rate": 8.01662532166966e-05, + "loss": 1.7896, + "step": 10265 + }, + { + "epoch": 3.151012891344383, + "grad_norm": 0.2997078001499176, + "learning_rate": 8.016228906916368e-05, + "loss": 1.8401, + "step": 10266 + }, + { + "epoch": 3.1513198281154082, + "grad_norm": 0.4688792824745178, + "learning_rate": 8.015832462354933e-05, + "loss": 1.8263, + "step": 10267 + }, + { + "epoch": 3.1516267648864336, + "grad_norm": 0.42710503935813904, + "learning_rate": 8.015435987989275e-05, + "loss": 1.8233, + "step": 10268 + }, + { + "epoch": 3.1519337016574585, + "grad_norm": 0.2490987628698349, + "learning_rate": 8.01503948382331e-05, + "loss": 1.7792, + "step": 10269 + }, + { + "epoch": 3.152240638428484, + "grad_norm": 0.400836706161499, + "learning_rate": 8.014642949860957e-05, + "loss": 1.8113, + "step": 10270 + }, + { + "epoch": 
3.1525475751995087, + "grad_norm": 0.47995972633361816, + "learning_rate": 8.014246386106138e-05, + "loss": 1.8754, + "step": 10271 + }, + { + "epoch": 3.152854511970534, + "grad_norm": 0.39069879055023193, + "learning_rate": 8.013849792562769e-05, + "loss": 1.8541, + "step": 10272 + }, + { + "epoch": 3.1531614487415593, + "grad_norm": 0.27174463868141174, + "learning_rate": 8.013453169234768e-05, + "loss": 1.8018, + "step": 10273 + }, + { + "epoch": 3.1534683855125842, + "grad_norm": 0.37808045744895935, + "learning_rate": 8.013056516126058e-05, + "loss": 1.8346, + "step": 10274 + }, + { + "epoch": 3.1537753222836096, + "grad_norm": 0.43864908814430237, + "learning_rate": 8.012659833240557e-05, + "loss": 1.7626, + "step": 10275 + }, + { + "epoch": 3.154082259054635, + "grad_norm": 0.3592168688774109, + "learning_rate": 8.012263120582187e-05, + "loss": 1.8261, + "step": 10276 + }, + { + "epoch": 3.15438919582566, + "grad_norm": 0.3056562542915344, + "learning_rate": 8.011866378154866e-05, + "loss": 1.903, + "step": 10277 + }, + { + "epoch": 3.154696132596685, + "grad_norm": 0.2898549735546112, + "learning_rate": 8.011469605962517e-05, + "loss": 1.7781, + "step": 10278 + }, + { + "epoch": 3.1550030693677105, + "grad_norm": 0.3498871624469757, + "learning_rate": 8.011072804009059e-05, + "loss": 1.7571, + "step": 10279 + }, + { + "epoch": 3.1553100061387354, + "grad_norm": 0.3330932557582855, + "learning_rate": 8.010675972298416e-05, + "loss": 1.8298, + "step": 10280 + }, + { + "epoch": 3.1556169429097607, + "grad_norm": 0.2540839910507202, + "learning_rate": 8.010279110834507e-05, + "loss": 1.8327, + "step": 10281 + }, + { + "epoch": 3.1559238796807856, + "grad_norm": 0.3557111322879791, + "learning_rate": 8.009882219621257e-05, + "loss": 1.7611, + "step": 10282 + }, + { + "epoch": 3.156230816451811, + "grad_norm": 0.28293952345848083, + "learning_rate": 8.009485298662584e-05, + "loss": 1.7761, + "step": 10283 + }, + { + "epoch": 3.1565377532228363, + "grad_norm": 
0.27089303731918335, + "learning_rate": 8.009088347962416e-05, + "loss": 1.8081, + "step": 10284 + }, + { + "epoch": 3.156844689993861, + "grad_norm": 0.2689332664012909, + "learning_rate": 8.008691367524673e-05, + "loss": 1.7458, + "step": 10285 + }, + { + "epoch": 3.1571516267648865, + "grad_norm": 0.2495841234922409, + "learning_rate": 8.008294357353278e-05, + "loss": 1.8307, + "step": 10286 + }, + { + "epoch": 3.1574585635359114, + "grad_norm": 0.29242852330207825, + "learning_rate": 8.007897317452156e-05, + "loss": 1.9216, + "step": 10287 + }, + { + "epoch": 3.1577655003069367, + "grad_norm": 0.26574134826660156, + "learning_rate": 8.007500247825229e-05, + "loss": 1.8392, + "step": 10288 + }, + { + "epoch": 3.158072437077962, + "grad_norm": 0.2503872811794281, + "learning_rate": 8.00710314847642e-05, + "loss": 1.7742, + "step": 10289 + }, + { + "epoch": 3.158379373848987, + "grad_norm": 0.25614771246910095, + "learning_rate": 8.006706019409658e-05, + "loss": 1.828, + "step": 10290 + }, + { + "epoch": 3.1586863106200123, + "grad_norm": 0.259369820356369, + "learning_rate": 8.006308860628863e-05, + "loss": 1.8328, + "step": 10291 + }, + { + "epoch": 3.1589932473910376, + "grad_norm": 0.28183647990226746, + "learning_rate": 8.005911672137962e-05, + "loss": 1.8269, + "step": 10292 + }, + { + "epoch": 3.1593001841620625, + "grad_norm": 0.2926514446735382, + "learning_rate": 8.005514453940881e-05, + "loss": 1.8334, + "step": 10293 + }, + { + "epoch": 3.159607120933088, + "grad_norm": 0.34313449263572693, + "learning_rate": 8.005117206041543e-05, + "loss": 1.7866, + "step": 10294 + }, + { + "epoch": 3.159914057704113, + "grad_norm": 0.30971628427505493, + "learning_rate": 8.004719928443875e-05, + "loss": 1.7827, + "step": 10295 + }, + { + "epoch": 3.160220994475138, + "grad_norm": 0.23955371975898743, + "learning_rate": 8.004322621151807e-05, + "loss": 1.7619, + "step": 10296 + }, + { + "epoch": 3.1605279312461634, + "grad_norm": 0.31311795115470886, + 
"learning_rate": 8.003925284169261e-05, + "loss": 1.8247, + "step": 10297 + }, + { + "epoch": 3.1608348680171883, + "grad_norm": 0.3408358097076416, + "learning_rate": 8.003527917500163e-05, + "loss": 1.8146, + "step": 10298 + }, + { + "epoch": 3.1611418047882136, + "grad_norm": 0.3030858337879181, + "learning_rate": 8.003130521148442e-05, + "loss": 1.857, + "step": 10299 + }, + { + "epoch": 3.161448741559239, + "grad_norm": 0.25168511271476746, + "learning_rate": 8.002733095118025e-05, + "loss": 1.8404, + "step": 10300 + }, + { + "epoch": 3.161755678330264, + "grad_norm": 0.2956216335296631, + "learning_rate": 8.002335639412839e-05, + "loss": 1.7352, + "step": 10301 + }, + { + "epoch": 3.162062615101289, + "grad_norm": 0.27791857719421387, + "learning_rate": 8.001938154036814e-05, + "loss": 1.7797, + "step": 10302 + }, + { + "epoch": 3.1623695518723145, + "grad_norm": 0.3106420040130615, + "learning_rate": 8.001540638993876e-05, + "loss": 1.8434, + "step": 10303 + }, + { + "epoch": 3.1626764886433394, + "grad_norm": 0.2940445840358734, + "learning_rate": 8.001143094287954e-05, + "loss": 1.8459, + "step": 10304 + }, + { + "epoch": 3.1629834254143647, + "grad_norm": 0.3857429325580597, + "learning_rate": 8.000745519922977e-05, + "loss": 1.7853, + "step": 10305 + }, + { + "epoch": 3.1632903621853896, + "grad_norm": 0.3585071861743927, + "learning_rate": 8.000347915902874e-05, + "loss": 1.8905, + "step": 10306 + }, + { + "epoch": 3.163597298956415, + "grad_norm": 0.320003867149353, + "learning_rate": 7.999950282231574e-05, + "loss": 1.8397, + "step": 10307 + }, + { + "epoch": 3.1639042357274403, + "grad_norm": 0.24986252188682556, + "learning_rate": 7.999552618913009e-05, + "loss": 1.7916, + "step": 10308 + }, + { + "epoch": 3.164211172498465, + "grad_norm": 0.33077237010002136, + "learning_rate": 7.999154925951104e-05, + "loss": 1.8334, + "step": 10309 + }, + { + "epoch": 3.1645181092694905, + "grad_norm": 0.35700327157974243, + "learning_rate": 
7.998757203349794e-05, + "loss": 1.7773, + "step": 10310 + }, + { + "epoch": 3.164825046040516, + "grad_norm": 0.3095493018627167, + "learning_rate": 7.998359451113007e-05, + "loss": 1.8156, + "step": 10311 + }, + { + "epoch": 3.1651319828115407, + "grad_norm": 0.3004748225212097, + "learning_rate": 7.997961669244673e-05, + "loss": 1.7862, + "step": 10312 + }, + { + "epoch": 3.165438919582566, + "grad_norm": 0.39382806420326233, + "learning_rate": 7.99756385774873e-05, + "loss": 1.764, + "step": 10313 + }, + { + "epoch": 3.165745856353591, + "grad_norm": 0.3109463155269623, + "learning_rate": 7.997166016629099e-05, + "loss": 1.8006, + "step": 10314 + }, + { + "epoch": 3.1660527931246163, + "grad_norm": 0.2896469235420227, + "learning_rate": 7.996768145889717e-05, + "loss": 1.8373, + "step": 10315 + }, + { + "epoch": 3.1663597298956416, + "grad_norm": 0.35024940967559814, + "learning_rate": 7.996370245534517e-05, + "loss": 1.797, + "step": 10316 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.3228827714920044, + "learning_rate": 7.995972315567431e-05, + "loss": 1.7757, + "step": 10317 + }, + { + "epoch": 3.166973603437692, + "grad_norm": 0.27102410793304443, + "learning_rate": 7.995574355992388e-05, + "loss": 1.7786, + "step": 10318 + }, + { + "epoch": 3.167280540208717, + "grad_norm": 0.2556116580963135, + "learning_rate": 7.995176366813325e-05, + "loss": 1.7621, + "step": 10319 + }, + { + "epoch": 3.167587476979742, + "grad_norm": 0.28279444575309753, + "learning_rate": 7.994778348034173e-05, + "loss": 1.7954, + "step": 10320 + }, + { + "epoch": 3.1678944137507674, + "grad_norm": 0.31778639554977417, + "learning_rate": 7.994380299658867e-05, + "loss": 1.7657, + "step": 10321 + }, + { + "epoch": 3.1682013505217923, + "grad_norm": 0.27935469150543213, + "learning_rate": 7.993982221691339e-05, + "loss": 1.7502, + "step": 10322 + }, + { + "epoch": 3.1685082872928176, + "grad_norm": 0.29012617468833923, + "learning_rate": 7.993584114135524e-05, + "loss": 
1.8497, + "step": 10323 + }, + { + "epoch": 3.168815224063843, + "grad_norm": 0.2674056887626648, + "learning_rate": 7.993185976995356e-05, + "loss": 1.7875, + "step": 10324 + }, + { + "epoch": 3.169122160834868, + "grad_norm": 0.2667328417301178, + "learning_rate": 7.992787810274771e-05, + "loss": 1.771, + "step": 10325 + }, + { + "epoch": 3.169429097605893, + "grad_norm": 0.25807151198387146, + "learning_rate": 7.992389613977702e-05, + "loss": 1.7638, + "step": 10326 + }, + { + "epoch": 3.1697360343769185, + "grad_norm": 0.2572930157184601, + "learning_rate": 7.991991388108084e-05, + "loss": 1.8218, + "step": 10327 + }, + { + "epoch": 3.1700429711479434, + "grad_norm": 0.3955067992210388, + "learning_rate": 7.991593132669855e-05, + "loss": 1.8458, + "step": 10328 + }, + { + "epoch": 3.1703499079189688, + "grad_norm": 0.2813466489315033, + "learning_rate": 7.991194847666948e-05, + "loss": 1.8042, + "step": 10329 + }, + { + "epoch": 3.1706568446899936, + "grad_norm": 0.2645012140274048, + "learning_rate": 7.990796533103302e-05, + "loss": 1.8241, + "step": 10330 + }, + { + "epoch": 3.170963781461019, + "grad_norm": 0.28462091088294983, + "learning_rate": 7.99039818898285e-05, + "loss": 1.8853, + "step": 10331 + }, + { + "epoch": 3.1712707182320443, + "grad_norm": 0.2727372944355011, + "learning_rate": 7.98999981530953e-05, + "loss": 1.7564, + "step": 10332 + }, + { + "epoch": 3.171577655003069, + "grad_norm": 0.2658170759677887, + "learning_rate": 7.989601412087281e-05, + "loss": 1.8344, + "step": 10333 + }, + { + "epoch": 3.1718845917740945, + "grad_norm": 0.29713502526283264, + "learning_rate": 7.989202979320039e-05, + "loss": 1.8721, + "step": 10334 + }, + { + "epoch": 3.17219152854512, + "grad_norm": 0.26609495282173157, + "learning_rate": 7.98880451701174e-05, + "loss": 1.7991, + "step": 10335 + }, + { + "epoch": 3.1724984653161448, + "grad_norm": 0.29779741168022156, + "learning_rate": 7.988406025166322e-05, + "loss": 1.8182, + "step": 10336 + }, + { + 
"epoch": 3.17280540208717, + "grad_norm": 0.2771340012550354, + "learning_rate": 7.988007503787724e-05, + "loss": 1.8034, + "step": 10337 + }, + { + "epoch": 3.1731123388581954, + "grad_norm": 0.30510422587394714, + "learning_rate": 7.987608952879886e-05, + "loss": 1.8477, + "step": 10338 + }, + { + "epoch": 3.1734192756292203, + "grad_norm": 0.3097476363182068, + "learning_rate": 7.987210372446745e-05, + "loss": 1.7572, + "step": 10339 + }, + { + "epoch": 3.1737262124002457, + "grad_norm": 0.2553942799568176, + "learning_rate": 7.986811762492239e-05, + "loss": 1.7837, + "step": 10340 + }, + { + "epoch": 3.1740331491712706, + "grad_norm": 0.26546719670295715, + "learning_rate": 7.986413123020312e-05, + "loss": 1.7893, + "step": 10341 + }, + { + "epoch": 3.174340085942296, + "grad_norm": 0.37721553444862366, + "learning_rate": 7.986014454034895e-05, + "loss": 1.8475, + "step": 10342 + }, + { + "epoch": 3.174647022713321, + "grad_norm": 0.3215494453907013, + "learning_rate": 7.985615755539937e-05, + "loss": 1.7806, + "step": 10343 + }, + { + "epoch": 3.174953959484346, + "grad_norm": 0.2662442922592163, + "learning_rate": 7.985217027539373e-05, + "loss": 1.8116, + "step": 10344 + }, + { + "epoch": 3.1752608962553714, + "grad_norm": 0.23334236443042755, + "learning_rate": 7.984818270037145e-05, + "loss": 1.7929, + "step": 10345 + }, + { + "epoch": 3.1755678330263963, + "grad_norm": 0.2873367667198181, + "learning_rate": 7.98441948303719e-05, + "loss": 1.7808, + "step": 10346 + }, + { + "epoch": 3.1758747697974217, + "grad_norm": 0.3623826801776886, + "learning_rate": 7.984020666543458e-05, + "loss": 1.8817, + "step": 10347 + }, + { + "epoch": 3.176181706568447, + "grad_norm": 0.3060589134693146, + "learning_rate": 7.983621820559881e-05, + "loss": 1.796, + "step": 10348 + }, + { + "epoch": 3.176488643339472, + "grad_norm": 0.2396882325410843, + "learning_rate": 7.983222945090407e-05, + "loss": 1.7455, + "step": 10349 + }, + { + "epoch": 3.1767955801104972, + 
"grad_norm": 0.24811476469039917, + "learning_rate": 7.982824040138974e-05, + "loss": 1.7907, + "step": 10350 + }, + { + "epoch": 3.1771025168815226, + "grad_norm": 0.32749706506729126, + "learning_rate": 7.982425105709524e-05, + "loss": 1.8553, + "step": 10351 + }, + { + "epoch": 3.1774094536525475, + "grad_norm": 0.3648095726966858, + "learning_rate": 7.982026141806003e-05, + "loss": 1.8387, + "step": 10352 + }, + { + "epoch": 3.177716390423573, + "grad_norm": 0.2749348282814026, + "learning_rate": 7.981627148432352e-05, + "loss": 1.7676, + "step": 10353 + }, + { + "epoch": 3.178023327194598, + "grad_norm": 0.2735142409801483, + "learning_rate": 7.981228125592513e-05, + "loss": 1.822, + "step": 10354 + }, + { + "epoch": 3.178330263965623, + "grad_norm": 0.28759655356407166, + "learning_rate": 7.98082907329043e-05, + "loss": 1.8113, + "step": 10355 + }, + { + "epoch": 3.1786372007366483, + "grad_norm": 0.33661654591560364, + "learning_rate": 7.980429991530048e-05, + "loss": 1.8036, + "step": 10356 + }, + { + "epoch": 3.1789441375076732, + "grad_norm": 0.2634892761707306, + "learning_rate": 7.98003088031531e-05, + "loss": 1.8323, + "step": 10357 + }, + { + "epoch": 3.1792510742786986, + "grad_norm": 0.25864094495773315, + "learning_rate": 7.979631739650158e-05, + "loss": 1.8199, + "step": 10358 + }, + { + "epoch": 3.179558011049724, + "grad_norm": 0.27368444204330444, + "learning_rate": 7.979232569538541e-05, + "loss": 1.7673, + "step": 10359 + }, + { + "epoch": 3.179864947820749, + "grad_norm": 0.2506616413593292, + "learning_rate": 7.9788333699844e-05, + "loss": 1.7912, + "step": 10360 + }, + { + "epoch": 3.180171884591774, + "grad_norm": 0.2539178133010864, + "learning_rate": 7.978434140991684e-05, + "loss": 1.7934, + "step": 10361 + }, + { + "epoch": 3.1804788213627995, + "grad_norm": 0.2605626881122589, + "learning_rate": 7.978034882564334e-05, + "loss": 1.8031, + "step": 10362 + }, + { + "epoch": 3.1807857581338244, + "grad_norm": 0.2610207796096802, + 
"learning_rate": 7.977635594706299e-05, + "loss": 1.8664, + "step": 10363 + }, + { + "epoch": 3.1810926949048497, + "grad_norm": 0.26164132356643677, + "learning_rate": 7.977236277421523e-05, + "loss": 1.7758, + "step": 10364 + }, + { + "epoch": 3.1813996316758746, + "grad_norm": 0.3122340142726898, + "learning_rate": 7.976836930713953e-05, + "loss": 1.9033, + "step": 10365 + }, + { + "epoch": 3.1817065684469, + "grad_norm": 0.3317202031612396, + "learning_rate": 7.976437554587537e-05, + "loss": 1.7899, + "step": 10366 + }, + { + "epoch": 3.1820135052179253, + "grad_norm": 0.28612568974494934, + "learning_rate": 7.97603814904622e-05, + "loss": 1.8145, + "step": 10367 + }, + { + "epoch": 3.18232044198895, + "grad_norm": 0.349917471408844, + "learning_rate": 7.975638714093949e-05, + "loss": 1.877, + "step": 10368 + }, + { + "epoch": 3.1826273787599755, + "grad_norm": 0.3737771809101105, + "learning_rate": 7.975239249734672e-05, + "loss": 1.8204, + "step": 10369 + }, + { + "epoch": 3.182934315531001, + "grad_norm": 0.3688446879386902, + "learning_rate": 7.974839755972339e-05, + "loss": 1.8487, + "step": 10370 + }, + { + "epoch": 3.1832412523020257, + "grad_norm": 0.2934897541999817, + "learning_rate": 7.974440232810894e-05, + "loss": 1.8243, + "step": 10371 + }, + { + "epoch": 3.183548189073051, + "grad_norm": 0.2596173882484436, + "learning_rate": 7.974040680254287e-05, + "loss": 1.7887, + "step": 10372 + }, + { + "epoch": 3.183855125844076, + "grad_norm": 0.35686594247817993, + "learning_rate": 7.973641098306468e-05, + "loss": 1.8653, + "step": 10373 + }, + { + "epoch": 3.1841620626151013, + "grad_norm": 0.3187713921070099, + "learning_rate": 7.973241486971383e-05, + "loss": 1.8767, + "step": 10374 + }, + { + "epoch": 3.1844689993861266, + "grad_norm": 0.2596273124217987, + "learning_rate": 7.972841846252985e-05, + "loss": 1.8028, + "step": 10375 + }, + { + "epoch": 3.1847759361571515, + "grad_norm": 0.2637474834918976, + "learning_rate": 7.972442176155221e-05, + 
"loss": 1.802, + "step": 10376 + }, + { + "epoch": 3.185082872928177, + "grad_norm": 0.2641126215457916, + "learning_rate": 7.97204247668204e-05, + "loss": 1.7931, + "step": 10377 + }, + { + "epoch": 3.185389809699202, + "grad_norm": 0.25594159960746765, + "learning_rate": 7.971642747837393e-05, + "loss": 1.818, + "step": 10378 + }, + { + "epoch": 3.185696746470227, + "grad_norm": 0.26567938923835754, + "learning_rate": 7.971242989625233e-05, + "loss": 1.8174, + "step": 10379 + }, + { + "epoch": 3.1860036832412524, + "grad_norm": 0.29580214619636536, + "learning_rate": 7.970843202049508e-05, + "loss": 1.869, + "step": 10380 + }, + { + "epoch": 3.1863106200122773, + "grad_norm": 0.2657530605792999, + "learning_rate": 7.970443385114168e-05, + "loss": 1.8352, + "step": 10381 + }, + { + "epoch": 3.1866175567833026, + "grad_norm": 0.2468358278274536, + "learning_rate": 7.970043538823165e-05, + "loss": 1.7851, + "step": 10382 + }, + { + "epoch": 3.186924493554328, + "grad_norm": 0.26464715600013733, + "learning_rate": 7.969643663180451e-05, + "loss": 1.8208, + "step": 10383 + }, + { + "epoch": 3.187231430325353, + "grad_norm": 0.26035723090171814, + "learning_rate": 7.969243758189979e-05, + "loss": 1.8089, + "step": 10384 + }, + { + "epoch": 3.187538367096378, + "grad_norm": 0.2644619941711426, + "learning_rate": 7.968843823855699e-05, + "loss": 1.8379, + "step": 10385 + }, + { + "epoch": 3.1878453038674035, + "grad_norm": 0.25576624274253845, + "learning_rate": 7.968443860181565e-05, + "loss": 1.7932, + "step": 10386 + }, + { + "epoch": 3.1881522406384284, + "grad_norm": 0.24276074767112732, + "learning_rate": 7.968043867171528e-05, + "loss": 1.8037, + "step": 10387 + }, + { + "epoch": 3.1884591774094537, + "grad_norm": 0.27156540751457214, + "learning_rate": 7.967643844829543e-05, + "loss": 1.7998, + "step": 10388 + }, + { + "epoch": 3.1887661141804786, + "grad_norm": 0.2555428743362427, + "learning_rate": 7.96724379315956e-05, + "loss": 1.7612, + "step": 10389 + }, + 
{ + "epoch": 3.189073050951504, + "grad_norm": 0.3358438014984131, + "learning_rate": 7.966843712165537e-05, + "loss": 1.8543, + "step": 10390 + }, + { + "epoch": 3.1893799877225293, + "grad_norm": 0.2799586355686188, + "learning_rate": 7.966443601851424e-05, + "loss": 1.819, + "step": 10391 + }, + { + "epoch": 3.189686924493554, + "grad_norm": 0.2364189177751541, + "learning_rate": 7.966043462221178e-05, + "loss": 1.8537, + "step": 10392 + }, + { + "epoch": 3.1899938612645795, + "grad_norm": 0.23849403858184814, + "learning_rate": 7.96564329327875e-05, + "loss": 1.8125, + "step": 10393 + }, + { + "epoch": 3.190300798035605, + "grad_norm": 0.2371583878993988, + "learning_rate": 7.965243095028098e-05, + "loss": 1.7352, + "step": 10394 + }, + { + "epoch": 3.1906077348066297, + "grad_norm": 0.2584737539291382, + "learning_rate": 7.964842867473176e-05, + "loss": 1.8801, + "step": 10395 + }, + { + "epoch": 3.190914671577655, + "grad_norm": 0.27768051624298096, + "learning_rate": 7.964442610617939e-05, + "loss": 1.8221, + "step": 10396 + }, + { + "epoch": 3.1912216083486804, + "grad_norm": 0.2680891752243042, + "learning_rate": 7.964042324466341e-05, + "loss": 1.8371, + "step": 10397 + }, + { + "epoch": 3.1915285451197053, + "grad_norm": 0.25301921367645264, + "learning_rate": 7.963642009022343e-05, + "loss": 1.7972, + "step": 10398 + }, + { + "epoch": 3.1918354818907306, + "grad_norm": 0.2589731216430664, + "learning_rate": 7.963241664289896e-05, + "loss": 1.8145, + "step": 10399 + }, + { + "epoch": 3.1921424186617555, + "grad_norm": 0.2611297369003296, + "learning_rate": 7.962841290272956e-05, + "loss": 1.8736, + "step": 10400 + }, + { + "epoch": 3.192449355432781, + "grad_norm": 0.2812272906303406, + "learning_rate": 7.962440886975483e-05, + "loss": 1.8116, + "step": 10401 + }, + { + "epoch": 3.192756292203806, + "grad_norm": 0.3261657655239105, + "learning_rate": 7.962040454401434e-05, + "loss": 1.7935, + "step": 10402 + }, + { + "epoch": 3.193063228974831, + 
"grad_norm": 0.3355373442173004, + "learning_rate": 7.961639992554764e-05, + "loss": 1.7957, + "step": 10403 + }, + { + "epoch": 3.1933701657458564, + "grad_norm": 0.2811843156814575, + "learning_rate": 7.961239501439432e-05, + "loss": 1.797, + "step": 10404 + }, + { + "epoch": 3.1936771025168813, + "grad_norm": 0.24933238327503204, + "learning_rate": 7.960838981059395e-05, + "loss": 1.7594, + "step": 10405 + }, + { + "epoch": 3.1939840392879066, + "grad_norm": 0.29110121726989746, + "learning_rate": 7.960438431418613e-05, + "loss": 1.8268, + "step": 10406 + }, + { + "epoch": 3.194290976058932, + "grad_norm": 0.3702283799648285, + "learning_rate": 7.960037852521043e-05, + "loss": 1.7629, + "step": 10407 + }, + { + "epoch": 3.194597912829957, + "grad_norm": 0.33275437355041504, + "learning_rate": 7.959637244370644e-05, + "loss": 1.8507, + "step": 10408 + }, + { + "epoch": 3.194904849600982, + "grad_norm": 0.2691981792449951, + "learning_rate": 7.959236606971375e-05, + "loss": 1.8084, + "step": 10409 + }, + { + "epoch": 3.1952117863720075, + "grad_norm": 0.30108413100242615, + "learning_rate": 7.958835940327194e-05, + "loss": 1.8525, + "step": 10410 + }, + { + "epoch": 3.1955187231430324, + "grad_norm": 0.32112306356430054, + "learning_rate": 7.958435244442064e-05, + "loss": 1.7431, + "step": 10411 + }, + { + "epoch": 3.1958256599140578, + "grad_norm": 0.2795291543006897, + "learning_rate": 7.958034519319942e-05, + "loss": 1.7985, + "step": 10412 + }, + { + "epoch": 3.196132596685083, + "grad_norm": 0.2485792338848114, + "learning_rate": 7.957633764964788e-05, + "loss": 1.7363, + "step": 10413 + }, + { + "epoch": 3.196439533456108, + "grad_norm": 0.3552432358264923, + "learning_rate": 7.957232981380565e-05, + "loss": 1.8174, + "step": 10414 + }, + { + "epoch": 3.1967464702271333, + "grad_norm": 0.3829655051231384, + "learning_rate": 7.956832168571234e-05, + "loss": 1.9249, + "step": 10415 + }, + { + "epoch": 3.197053406998158, + "grad_norm": 0.2498074769973755, + 
"learning_rate": 7.956431326540752e-05, + "loss": 1.8104, + "step": 10416 + }, + { + "epoch": 3.1973603437691835, + "grad_norm": 0.24596504867076874, + "learning_rate": 7.956030455293082e-05, + "loss": 1.8007, + "step": 10417 + }, + { + "epoch": 3.197667280540209, + "grad_norm": 0.2795363664627075, + "learning_rate": 7.95562955483219e-05, + "loss": 1.775, + "step": 10418 + }, + { + "epoch": 3.1979742173112338, + "grad_norm": 0.3581138253211975, + "learning_rate": 7.95522862516203e-05, + "loss": 1.8567, + "step": 10419 + }, + { + "epoch": 3.198281154082259, + "grad_norm": 0.36102500557899475, + "learning_rate": 7.95482766628657e-05, + "loss": 1.8509, + "step": 10420 + }, + { + "epoch": 3.198588090853284, + "grad_norm": 0.4717029929161072, + "learning_rate": 7.954426678209774e-05, + "loss": 1.8218, + "step": 10421 + }, + { + "epoch": 3.1988950276243093, + "grad_norm": 0.3211984932422638, + "learning_rate": 7.9540256609356e-05, + "loss": 1.8696, + "step": 10422 + }, + { + "epoch": 3.1992019643953347, + "grad_norm": 0.30094626545906067, + "learning_rate": 7.953624614468011e-05, + "loss": 1.8714, + "step": 10423 + }, + { + "epoch": 3.1995089011663596, + "grad_norm": 0.267578125, + "learning_rate": 7.953223538810976e-05, + "loss": 1.7903, + "step": 10424 + }, + { + "epoch": 3.199815837937385, + "grad_norm": 0.35577845573425293, + "learning_rate": 7.952822433968453e-05, + "loss": 1.7808, + "step": 10425 + }, + { + "epoch": 3.2001227747084102, + "grad_norm": 0.4117741882801056, + "learning_rate": 7.952421299944408e-05, + "loss": 1.7856, + "step": 10426 + }, + { + "epoch": 3.200429711479435, + "grad_norm": 0.35202035307884216, + "learning_rate": 7.952020136742806e-05, + "loss": 1.8112, + "step": 10427 + }, + { + "epoch": 3.2007366482504604, + "grad_norm": 0.26514917612075806, + "learning_rate": 7.951618944367611e-05, + "loss": 1.828, + "step": 10428 + }, + { + "epoch": 3.201043585021486, + "grad_norm": 0.29219159483909607, + "learning_rate": 7.951217722822786e-05, + "loss": 
1.9366, + "step": 10429 + }, + { + "epoch": 3.2013505217925107, + "grad_norm": 0.2929961383342743, + "learning_rate": 7.950816472112298e-05, + "loss": 1.8006, + "step": 10430 + }, + { + "epoch": 3.201657458563536, + "grad_norm": 0.28339722752571106, + "learning_rate": 7.950415192240114e-05, + "loss": 1.7411, + "step": 10431 + }, + { + "epoch": 3.201964395334561, + "grad_norm": 0.258884996175766, + "learning_rate": 7.950013883210196e-05, + "loss": 1.8153, + "step": 10432 + }, + { + "epoch": 3.2022713321055862, + "grad_norm": 0.3065929114818573, + "learning_rate": 7.949612545026512e-05, + "loss": 1.7918, + "step": 10433 + }, + { + "epoch": 3.2025782688766116, + "grad_norm": 0.289874404668808, + "learning_rate": 7.949211177693029e-05, + "loss": 1.7975, + "step": 10434 + }, + { + "epoch": 3.2028852056476365, + "grad_norm": 0.27025631070137024, + "learning_rate": 7.948809781213711e-05, + "loss": 1.8129, + "step": 10435 + }, + { + "epoch": 3.203192142418662, + "grad_norm": 0.2501074969768524, + "learning_rate": 7.948408355592528e-05, + "loss": 1.7653, + "step": 10436 + }, + { + "epoch": 3.203499079189687, + "grad_norm": 0.30402958393096924, + "learning_rate": 7.948006900833445e-05, + "loss": 1.8311, + "step": 10437 + }, + { + "epoch": 3.203806015960712, + "grad_norm": 0.28783223032951355, + "learning_rate": 7.94760541694043e-05, + "loss": 1.82, + "step": 10438 + }, + { + "epoch": 3.2041129527317374, + "grad_norm": 0.30428317189216614, + "learning_rate": 7.947203903917451e-05, + "loss": 1.8673, + "step": 10439 + }, + { + "epoch": 3.2044198895027622, + "grad_norm": 0.2860367000102997, + "learning_rate": 7.946802361768473e-05, + "loss": 1.824, + "step": 10440 + }, + { + "epoch": 3.2047268262737876, + "grad_norm": 0.2995273172855377, + "learning_rate": 7.946400790497469e-05, + "loss": 1.7342, + "step": 10441 + }, + { + "epoch": 3.205033763044813, + "grad_norm": 0.4374088943004608, + "learning_rate": 7.945999190108407e-05, + "loss": 1.8522, + "step": 10442 + }, + { + "epoch": 
3.205340699815838, + "grad_norm": 0.37659478187561035, + "learning_rate": 7.945597560605252e-05, + "loss": 1.7518, + "step": 10443 + }, + { + "epoch": 3.205647636586863, + "grad_norm": 0.24257932603359222, + "learning_rate": 7.945195901991975e-05, + "loss": 1.7892, + "step": 10444 + }, + { + "epoch": 3.2059545733578885, + "grad_norm": 0.3682694435119629, + "learning_rate": 7.944794214272546e-05, + "loss": 1.7757, + "step": 10445 + }, + { + "epoch": 3.2062615101289134, + "grad_norm": 0.434692919254303, + "learning_rate": 7.944392497450936e-05, + "loss": 1.8207, + "step": 10446 + }, + { + "epoch": 3.2065684468999387, + "grad_norm": 0.3982211947441101, + "learning_rate": 7.943990751531113e-05, + "loss": 1.8303, + "step": 10447 + }, + { + "epoch": 3.2068753836709636, + "grad_norm": 0.2877334654331207, + "learning_rate": 7.943588976517049e-05, + "loss": 1.8495, + "step": 10448 + }, + { + "epoch": 3.207182320441989, + "grad_norm": 0.34589654207229614, + "learning_rate": 7.943187172412712e-05, + "loss": 1.7773, + "step": 10449 + }, + { + "epoch": 3.2074892572130143, + "grad_norm": 0.4727517366409302, + "learning_rate": 7.942785339222074e-05, + "loss": 1.8702, + "step": 10450 + }, + { + "epoch": 3.207796193984039, + "grad_norm": 0.4019354581832886, + "learning_rate": 7.942383476949107e-05, + "loss": 1.8095, + "step": 10451 + }, + { + "epoch": 3.2081031307550645, + "grad_norm": 0.2726243734359741, + "learning_rate": 7.941981585597782e-05, + "loss": 1.7273, + "step": 10452 + }, + { + "epoch": 3.20841006752609, + "grad_norm": 0.2944760024547577, + "learning_rate": 7.941579665172072e-05, + "loss": 1.7507, + "step": 10453 + }, + { + "epoch": 3.2087170042971147, + "grad_norm": 0.3530777096748352, + "learning_rate": 7.941177715675945e-05, + "loss": 1.8434, + "step": 10454 + }, + { + "epoch": 3.20902394106814, + "grad_norm": 0.28612539172172546, + "learning_rate": 7.940775737113378e-05, + "loss": 1.8094, + "step": 10455 + }, + { + "epoch": 3.209330877839165, + "grad_norm": 
0.27006468176841736, + "learning_rate": 7.94037372948834e-05, + "loss": 1.7854, + "step": 10456 + }, + { + "epoch": 3.2096378146101903, + "grad_norm": 0.3027147054672241, + "learning_rate": 7.939971692804806e-05, + "loss": 1.7596, + "step": 10457 + }, + { + "epoch": 3.2099447513812156, + "grad_norm": 0.31999528408050537, + "learning_rate": 7.939569627066749e-05, + "loss": 1.8836, + "step": 10458 + }, + { + "epoch": 3.2102516881522405, + "grad_norm": 0.267600417137146, + "learning_rate": 7.939167532278142e-05, + "loss": 1.8508, + "step": 10459 + }, + { + "epoch": 3.210558624923266, + "grad_norm": 0.3171706795692444, + "learning_rate": 7.938765408442958e-05, + "loss": 1.7507, + "step": 10460 + }, + { + "epoch": 3.210865561694291, + "grad_norm": 0.2955280840396881, + "learning_rate": 7.938363255565171e-05, + "loss": 1.733, + "step": 10461 + }, + { + "epoch": 3.211172498465316, + "grad_norm": 0.3427969217300415, + "learning_rate": 7.937961073648759e-05, + "loss": 1.9208, + "step": 10462 + }, + { + "epoch": 3.2114794352363414, + "grad_norm": 0.28788647055625916, + "learning_rate": 7.937558862697692e-05, + "loss": 1.7723, + "step": 10463 + }, + { + "epoch": 3.2117863720073663, + "grad_norm": 0.26093682646751404, + "learning_rate": 7.937156622715945e-05, + "loss": 1.803, + "step": 10464 + }, + { + "epoch": 3.2120933087783916, + "grad_norm": 0.2791301906108856, + "learning_rate": 7.936754353707497e-05, + "loss": 1.7601, + "step": 10465 + }, + { + "epoch": 3.212400245549417, + "grad_norm": 0.3039831519126892, + "learning_rate": 7.93635205567632e-05, + "loss": 1.7864, + "step": 10466 + }, + { + "epoch": 3.212707182320442, + "grad_norm": 0.28498128056526184, + "learning_rate": 7.935949728626392e-05, + "loss": 1.7745, + "step": 10467 + }, + { + "epoch": 3.213014119091467, + "grad_norm": 0.2908780872821808, + "learning_rate": 7.935547372561687e-05, + "loss": 1.8281, + "step": 10468 + }, + { + "epoch": 3.2133210558624925, + "grad_norm": 0.26148509979248047, + "learning_rate": 
7.935144987486183e-05, + "loss": 1.8545, + "step": 10469 + }, + { + "epoch": 3.2136279926335174, + "grad_norm": 0.2853962481021881, + "learning_rate": 7.934742573403856e-05, + "loss": 1.7765, + "step": 10470 + }, + { + "epoch": 3.2139349294045427, + "grad_norm": 0.26497501134872437, + "learning_rate": 7.934340130318681e-05, + "loss": 1.7472, + "step": 10471 + }, + { + "epoch": 3.214241866175568, + "grad_norm": 0.2806912660598755, + "learning_rate": 7.933937658234638e-05, + "loss": 1.7879, + "step": 10472 + }, + { + "epoch": 3.214548802946593, + "grad_norm": 0.2699974477291107, + "learning_rate": 7.933535157155705e-05, + "loss": 1.7539, + "step": 10473 + }, + { + "epoch": 3.2148557397176183, + "grad_norm": 0.22714731097221375, + "learning_rate": 7.933132627085856e-05, + "loss": 1.7861, + "step": 10474 + }, + { + "epoch": 3.215162676488643, + "grad_norm": 0.291340708732605, + "learning_rate": 7.932730068029072e-05, + "loss": 1.8381, + "step": 10475 + }, + { + "epoch": 3.2154696132596685, + "grad_norm": 0.3257324695587158, + "learning_rate": 7.93232747998933e-05, + "loss": 1.8293, + "step": 10476 + }, + { + "epoch": 3.215776550030694, + "grad_norm": 0.3518911600112915, + "learning_rate": 7.93192486297061e-05, + "loss": 1.853, + "step": 10477 + }, + { + "epoch": 3.2160834868017187, + "grad_norm": 0.27663540840148926, + "learning_rate": 7.93152221697689e-05, + "loss": 1.7831, + "step": 10478 + }, + { + "epoch": 3.216390423572744, + "grad_norm": 0.3153248429298401, + "learning_rate": 7.931119542012149e-05, + "loss": 1.7443, + "step": 10479 + }, + { + "epoch": 3.216697360343769, + "grad_norm": 0.2919597029685974, + "learning_rate": 7.930716838080368e-05, + "loss": 1.8108, + "step": 10480 + }, + { + "epoch": 3.2170042971147943, + "grad_norm": 0.26892516016960144, + "learning_rate": 7.930314105185524e-05, + "loss": 1.7791, + "step": 10481 + }, + { + "epoch": 3.2173112338858196, + "grad_norm": 0.2486005276441574, + "learning_rate": 7.929911343331599e-05, + "loss": 1.8184, + 
"step": 10482 + }, + { + "epoch": 3.2176181706568445, + "grad_norm": 0.260728120803833, + "learning_rate": 7.929508552522571e-05, + "loss": 1.7933, + "step": 10483 + }, + { + "epoch": 3.21792510742787, + "grad_norm": 0.3081948757171631, + "learning_rate": 7.929105732762425e-05, + "loss": 1.7732, + "step": 10484 + }, + { + "epoch": 3.218232044198895, + "grad_norm": 0.3807671368122101, + "learning_rate": 7.928702884055138e-05, + "loss": 1.7652, + "step": 10485 + }, + { + "epoch": 3.21853898096992, + "grad_norm": 0.31637755036354065, + "learning_rate": 7.928300006404692e-05, + "loss": 1.7605, + "step": 10486 + }, + { + "epoch": 3.2188459177409454, + "grad_norm": 0.2812853455543518, + "learning_rate": 7.927897099815071e-05, + "loss": 1.7925, + "step": 10487 + }, + { + "epoch": 3.2191528545119708, + "grad_norm": 0.3472350239753723, + "learning_rate": 7.927494164290253e-05, + "loss": 1.8252, + "step": 10488 + }, + { + "epoch": 3.2194597912829956, + "grad_norm": 0.4202714264392853, + "learning_rate": 7.927091199834222e-05, + "loss": 1.7993, + "step": 10489 + }, + { + "epoch": 3.219766728054021, + "grad_norm": 0.44552353024482727, + "learning_rate": 7.92668820645096e-05, + "loss": 1.8609, + "step": 10490 + }, + { + "epoch": 3.220073664825046, + "grad_norm": 0.38964664936065674, + "learning_rate": 7.926285184144451e-05, + "loss": 1.864, + "step": 10491 + }, + { + "epoch": 3.220380601596071, + "grad_norm": 0.2978462278842926, + "learning_rate": 7.925882132918676e-05, + "loss": 1.7892, + "step": 10492 + }, + { + "epoch": 3.2206875383670965, + "grad_norm": 0.2520316243171692, + "learning_rate": 7.925479052777619e-05, + "loss": 1.7702, + "step": 10493 + }, + { + "epoch": 3.2209944751381214, + "grad_norm": 0.28151068091392517, + "learning_rate": 7.925075943725263e-05, + "loss": 1.7613, + "step": 10494 + }, + { + "epoch": 3.2213014119091468, + "grad_norm": 0.3346099555492401, + "learning_rate": 7.924672805765592e-05, + "loss": 1.894, + "step": 10495 + }, + { + "epoch": 
3.2216083486801717, + "grad_norm": 0.2981362044811249, + "learning_rate": 7.924269638902591e-05, + "loss": 1.8157, + "step": 10496 + }, + { + "epoch": 3.221915285451197, + "grad_norm": 0.2561499774456024, + "learning_rate": 7.923866443140242e-05, + "loss": 1.8259, + "step": 10497 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.26480481028556824, + "learning_rate": 7.923463218482532e-05, + "loss": 1.7856, + "step": 10498 + }, + { + "epoch": 3.222529158993247, + "grad_norm": 0.24103692173957825, + "learning_rate": 7.923059964933446e-05, + "loss": 1.7765, + "step": 10499 + }, + { + "epoch": 3.2228360957642725, + "grad_norm": 0.2399173080921173, + "learning_rate": 7.922656682496967e-05, + "loss": 1.8216, + "step": 10500 + }, + { + "epoch": 3.223143032535298, + "grad_norm": 0.24530018866062164, + "learning_rate": 7.922253371177082e-05, + "loss": 1.8155, + "step": 10501 + }, + { + "epoch": 3.2234499693063228, + "grad_norm": 0.23298653960227966, + "learning_rate": 7.921850030977775e-05, + "loss": 1.7843, + "step": 10502 + }, + { + "epoch": 3.223756906077348, + "grad_norm": 0.3053973317146301, + "learning_rate": 7.921446661903035e-05, + "loss": 1.8113, + "step": 10503 + }, + { + "epoch": 3.2240638428483734, + "grad_norm": 0.261336088180542, + "learning_rate": 7.921043263956847e-05, + "loss": 1.8073, + "step": 10504 + }, + { + "epoch": 3.2243707796193983, + "grad_norm": 0.24877268075942993, + "learning_rate": 7.920639837143195e-05, + "loss": 1.8344, + "step": 10505 + }, + { + "epoch": 3.2246777163904237, + "grad_norm": 0.26784422993659973, + "learning_rate": 7.920236381466071e-05, + "loss": 1.7757, + "step": 10506 + }, + { + "epoch": 3.2249846531614486, + "grad_norm": 0.2672121226787567, + "learning_rate": 7.919832896929458e-05, + "loss": 1.8384, + "step": 10507 + }, + { + "epoch": 3.225291589932474, + "grad_norm": 0.27254921197891235, + "learning_rate": 7.919429383537346e-05, + "loss": 1.8056, + "step": 10508 + }, + { + "epoch": 3.2255985267034992, + "grad_norm": 
0.24467822909355164, + "learning_rate": 7.91902584129372e-05, + "loss": 1.8109, + "step": 10509 + }, + { + "epoch": 3.225905463474524, + "grad_norm": 0.25966358184814453, + "learning_rate": 7.918622270202571e-05, + "loss": 1.82, + "step": 10510 + }, + { + "epoch": 3.2262124002455494, + "grad_norm": 0.28601330518722534, + "learning_rate": 7.918218670267886e-05, + "loss": 1.7266, + "step": 10511 + }, + { + "epoch": 3.226519337016575, + "grad_norm": 0.4017516076564789, + "learning_rate": 7.917815041493653e-05, + "loss": 1.8408, + "step": 10512 + }, + { + "epoch": 3.2268262737875997, + "grad_norm": 0.3995787501335144, + "learning_rate": 7.917411383883862e-05, + "loss": 1.8441, + "step": 10513 + }, + { + "epoch": 3.227133210558625, + "grad_norm": 0.26997458934783936, + "learning_rate": 7.917007697442502e-05, + "loss": 1.8078, + "step": 10514 + }, + { + "epoch": 3.22744014732965, + "grad_norm": 0.34353014826774597, + "learning_rate": 7.916603982173562e-05, + "loss": 1.7523, + "step": 10515 + }, + { + "epoch": 3.2277470841006752, + "grad_norm": 0.39522337913513184, + "learning_rate": 7.916200238081032e-05, + "loss": 1.7532, + "step": 10516 + }, + { + "epoch": 3.2280540208717006, + "grad_norm": 0.4176923334598541, + "learning_rate": 7.915796465168903e-05, + "loss": 1.8895, + "step": 10517 + }, + { + "epoch": 3.2283609576427255, + "grad_norm": 0.30232906341552734, + "learning_rate": 7.915392663441164e-05, + "loss": 1.8223, + "step": 10518 + }, + { + "epoch": 3.228667894413751, + "grad_norm": 0.230951726436615, + "learning_rate": 7.914988832901805e-05, + "loss": 1.7265, + "step": 10519 + }, + { + "epoch": 3.228974831184776, + "grad_norm": 0.26381877064704895, + "learning_rate": 7.914584973554819e-05, + "loss": 1.7858, + "step": 10520 + }, + { + "epoch": 3.229281767955801, + "grad_norm": 0.2500905394554138, + "learning_rate": 7.914181085404194e-05, + "loss": 1.7606, + "step": 10521 + }, + { + "epoch": 3.2295887047268264, + "grad_norm": 0.2585415840148926, + "learning_rate": 
7.913777168453925e-05, + "loss": 1.787, + "step": 10522 + }, + { + "epoch": 3.2298956414978512, + "grad_norm": 0.24236604571342468, + "learning_rate": 7.913373222708001e-05, + "loss": 1.7623, + "step": 10523 + }, + { + "epoch": 3.2302025782688766, + "grad_norm": 0.3113093078136444, + "learning_rate": 7.912969248170416e-05, + "loss": 1.7736, + "step": 10524 + }, + { + "epoch": 3.230509515039902, + "grad_norm": 0.3341342806816101, + "learning_rate": 7.912565244845163e-05, + "loss": 1.8583, + "step": 10525 + }, + { + "epoch": 3.230816451810927, + "grad_norm": 0.2644478678703308, + "learning_rate": 7.912161212736231e-05, + "loss": 1.7891, + "step": 10526 + }, + { + "epoch": 3.231123388581952, + "grad_norm": 0.22916561365127563, + "learning_rate": 7.911757151847616e-05, + "loss": 1.7642, + "step": 10527 + }, + { + "epoch": 3.2314303253529775, + "grad_norm": 0.24204877018928528, + "learning_rate": 7.911353062183309e-05, + "loss": 1.8522, + "step": 10528 + }, + { + "epoch": 3.2317372621240024, + "grad_norm": 0.25339365005493164, + "learning_rate": 7.910948943747307e-05, + "loss": 1.8391, + "step": 10529 + }, + { + "epoch": 3.2320441988950277, + "grad_norm": 0.2652709186077118, + "learning_rate": 7.9105447965436e-05, + "loss": 1.7735, + "step": 10530 + }, + { + "epoch": 3.2323511356660526, + "grad_norm": 0.2711019217967987, + "learning_rate": 7.910140620576183e-05, + "loss": 1.8491, + "step": 10531 + }, + { + "epoch": 3.232658072437078, + "grad_norm": 0.2598389685153961, + "learning_rate": 7.909736415849052e-05, + "loss": 1.8417, + "step": 10532 + }, + { + "epoch": 3.2329650092081033, + "grad_norm": 0.278037428855896, + "learning_rate": 7.9093321823662e-05, + "loss": 1.8774, + "step": 10533 + }, + { + "epoch": 3.233271945979128, + "grad_norm": 0.32015568017959595, + "learning_rate": 7.90892792013162e-05, + "loss": 1.8873, + "step": 10534 + }, + { + "epoch": 3.2335788827501535, + "grad_norm": 0.3098098635673523, + "learning_rate": 7.908523629149312e-05, + "loss": 1.8141, + 
"step": 10535 + }, + { + "epoch": 3.233885819521179, + "grad_norm": 0.3127266764640808, + "learning_rate": 7.908119309423267e-05, + "loss": 1.8587, + "step": 10536 + }, + { + "epoch": 3.2341927562922037, + "grad_norm": 0.3085545301437378, + "learning_rate": 7.907714960957483e-05, + "loss": 1.8544, + "step": 10537 + }, + { + "epoch": 3.234499693063229, + "grad_norm": 0.3051004409790039, + "learning_rate": 7.907310583755956e-05, + "loss": 1.8144, + "step": 10538 + }, + { + "epoch": 3.234806629834254, + "grad_norm": 0.3458186686038971, + "learning_rate": 7.906906177822682e-05, + "loss": 1.8388, + "step": 10539 + }, + { + "epoch": 3.2351135666052793, + "grad_norm": 0.37064439058303833, + "learning_rate": 7.906501743161656e-05, + "loss": 1.7574, + "step": 10540 + }, + { + "epoch": 3.2354205033763046, + "grad_norm": 0.3382316827774048, + "learning_rate": 7.906097279776876e-05, + "loss": 1.8785, + "step": 10541 + }, + { + "epoch": 3.2357274401473295, + "grad_norm": 0.254802942276001, + "learning_rate": 7.905692787672341e-05, + "loss": 1.8276, + "step": 10542 + }, + { + "epoch": 3.236034376918355, + "grad_norm": 0.3362341523170471, + "learning_rate": 7.905288266852047e-05, + "loss": 1.8057, + "step": 10543 + }, + { + "epoch": 3.23634131368938, + "grad_norm": 0.38821661472320557, + "learning_rate": 7.904883717319988e-05, + "loss": 1.7841, + "step": 10544 + }, + { + "epoch": 3.236648250460405, + "grad_norm": 0.33889076113700867, + "learning_rate": 7.90447913908017e-05, + "loss": 1.7892, + "step": 10545 + }, + { + "epoch": 3.2369551872314304, + "grad_norm": 0.2741014361381531, + "learning_rate": 7.904074532136585e-05, + "loss": 1.7611, + "step": 10546 + }, + { + "epoch": 3.2372621240024557, + "grad_norm": 0.28950995206832886, + "learning_rate": 7.903669896493233e-05, + "loss": 1.7963, + "step": 10547 + }, + { + "epoch": 3.2375690607734806, + "grad_norm": 0.30647143721580505, + "learning_rate": 7.903265232154113e-05, + "loss": 1.7522, + "step": 10548 + }, + { + "epoch": 
3.237875997544506, + "grad_norm": 0.30428263545036316, + "learning_rate": 7.902860539123225e-05, + "loss": 1.7383, + "step": 10549 + }, + { + "epoch": 3.238182934315531, + "grad_norm": 0.2357146292924881, + "learning_rate": 7.902455817404569e-05, + "loss": 1.7243, + "step": 10550 + }, + { + "epoch": 3.238489871086556, + "grad_norm": 0.3125104606151581, + "learning_rate": 7.90205106700214e-05, + "loss": 1.8542, + "step": 10551 + }, + { + "epoch": 3.2387968078575815, + "grad_norm": 0.25797244906425476, + "learning_rate": 7.901646287919944e-05, + "loss": 1.8374, + "step": 10552 + }, + { + "epoch": 3.2391037446286064, + "grad_norm": 0.3127591907978058, + "learning_rate": 7.901241480161978e-05, + "loss": 1.9457, + "step": 10553 + }, + { + "epoch": 3.2394106813996317, + "grad_norm": 0.2971835434436798, + "learning_rate": 7.900836643732243e-05, + "loss": 1.7933, + "step": 10554 + }, + { + "epoch": 3.2397176181706566, + "grad_norm": 0.28931814432144165, + "learning_rate": 7.90043177863474e-05, + "loss": 1.8201, + "step": 10555 + }, + { + "epoch": 3.240024554941682, + "grad_norm": 0.3348724842071533, + "learning_rate": 7.90002688487347e-05, + "loss": 1.8718, + "step": 10556 + }, + { + "epoch": 3.2403314917127073, + "grad_norm": 0.28566426038742065, + "learning_rate": 7.899621962452436e-05, + "loss": 1.805, + "step": 10557 + }, + { + "epoch": 3.240638428483732, + "grad_norm": 0.27074119448661804, + "learning_rate": 7.899217011375637e-05, + "loss": 1.842, + "step": 10558 + }, + { + "epoch": 3.2409453652547575, + "grad_norm": 0.27014291286468506, + "learning_rate": 7.898812031647076e-05, + "loss": 1.8156, + "step": 10559 + }, + { + "epoch": 3.241252302025783, + "grad_norm": 0.28087863326072693, + "learning_rate": 7.898407023270756e-05, + "loss": 1.8399, + "step": 10560 + }, + { + "epoch": 3.2415592387968077, + "grad_norm": 0.2641037404537201, + "learning_rate": 7.898001986250679e-05, + "loss": 1.7977, + "step": 10561 + }, + { + "epoch": 3.241866175567833, + "grad_norm": 
0.2843858301639557, + "learning_rate": 7.897596920590848e-05, + "loss": 1.834, + "step": 10562 + }, + { + "epoch": 3.2421731123388584, + "grad_norm": 0.2724611163139343, + "learning_rate": 7.897191826295266e-05, + "loss": 1.7547, + "step": 10563 + }, + { + "epoch": 3.2424800491098833, + "grad_norm": 0.2583858370780945, + "learning_rate": 7.896786703367935e-05, + "loss": 1.7658, + "step": 10564 + }, + { + "epoch": 3.2427869858809086, + "grad_norm": 0.2666650712490082, + "learning_rate": 7.896381551812861e-05, + "loss": 1.8017, + "step": 10565 + }, + { + "epoch": 3.2430939226519335, + "grad_norm": 0.23269347846508026, + "learning_rate": 7.895976371634047e-05, + "loss": 1.8267, + "step": 10566 + }, + { + "epoch": 3.243400859422959, + "grad_norm": 0.27865225076675415, + "learning_rate": 7.895571162835496e-05, + "loss": 1.8093, + "step": 10567 + }, + { + "epoch": 3.243707796193984, + "grad_norm": 0.29445022344589233, + "learning_rate": 7.895165925421216e-05, + "loss": 1.7999, + "step": 10568 + }, + { + "epoch": 3.244014732965009, + "grad_norm": 0.32135528326034546, + "learning_rate": 7.894760659395206e-05, + "loss": 1.8405, + "step": 10569 + }, + { + "epoch": 3.2443216697360344, + "grad_norm": 0.3409091532230377, + "learning_rate": 7.894355364761477e-05, + "loss": 1.7861, + "step": 10570 + }, + { + "epoch": 3.2446286065070598, + "grad_norm": 0.3379025459289551, + "learning_rate": 7.893950041524032e-05, + "loss": 1.8495, + "step": 10571 + }, + { + "epoch": 3.2449355432780846, + "grad_norm": 0.2843063473701477, + "learning_rate": 7.893544689686874e-05, + "loss": 1.7888, + "step": 10572 + }, + { + "epoch": 3.24524248004911, + "grad_norm": 0.2914074957370758, + "learning_rate": 7.893139309254013e-05, + "loss": 1.7866, + "step": 10573 + }, + { + "epoch": 3.245549416820135, + "grad_norm": 0.39855021238327026, + "learning_rate": 7.892733900229454e-05, + "loss": 1.7865, + "step": 10574 + }, + { + "epoch": 3.24585635359116, + "grad_norm": 0.4232102632522583, + "learning_rate": 
7.892328462617203e-05, + "loss": 1.8443, + "step": 10575 + }, + { + "epoch": 3.2461632903621855, + "grad_norm": 0.390794962644577, + "learning_rate": 7.891922996421267e-05, + "loss": 1.8735, + "step": 10576 + }, + { + "epoch": 3.2464702271332104, + "grad_norm": 0.3051595687866211, + "learning_rate": 7.891517501645653e-05, + "loss": 1.8654, + "step": 10577 + }, + { + "epoch": 3.2467771639042358, + "grad_norm": 0.25363096594810486, + "learning_rate": 7.891111978294367e-05, + "loss": 1.7602, + "step": 10578 + }, + { + "epoch": 3.247084100675261, + "grad_norm": 0.29785794019699097, + "learning_rate": 7.890706426371419e-05, + "loss": 1.8242, + "step": 10579 + }, + { + "epoch": 3.247391037446286, + "grad_norm": 0.346162885427475, + "learning_rate": 7.890300845880816e-05, + "loss": 1.8551, + "step": 10580 + }, + { + "epoch": 3.2476979742173113, + "grad_norm": 0.33906155824661255, + "learning_rate": 7.889895236826566e-05, + "loss": 1.765, + "step": 10581 + }, + { + "epoch": 3.248004910988336, + "grad_norm": 0.26083165407180786, + "learning_rate": 7.889489599212676e-05, + "loss": 1.8246, + "step": 10582 + }, + { + "epoch": 3.2483118477593615, + "grad_norm": 0.3042019009590149, + "learning_rate": 7.889083933043157e-05, + "loss": 1.9017, + "step": 10583 + }, + { + "epoch": 3.248618784530387, + "grad_norm": 0.34833577275276184, + "learning_rate": 7.888678238322018e-05, + "loss": 1.7863, + "step": 10584 + }, + { + "epoch": 3.2489257213014118, + "grad_norm": 0.34436655044555664, + "learning_rate": 7.888272515053267e-05, + "loss": 1.7937, + "step": 10585 + }, + { + "epoch": 3.249232658072437, + "grad_norm": 0.2550172507762909, + "learning_rate": 7.887866763240914e-05, + "loss": 1.7615, + "step": 10586 + }, + { + "epoch": 3.2495395948434624, + "grad_norm": 0.3334405720233917, + "learning_rate": 7.88746098288897e-05, + "loss": 1.7465, + "step": 10587 + }, + { + "epoch": 3.2498465316144873, + "grad_norm": 0.4668157696723938, + "learning_rate": 7.887055174001443e-05, + "loss": 
1.7836, + "step": 10588 + }, + { + "epoch": 3.2501534683855127, + "grad_norm": 0.524680495262146, + "learning_rate": 7.886649336582344e-05, + "loss": 1.844, + "step": 10589 + }, + { + "epoch": 3.250460405156538, + "grad_norm": 0.36859074234962463, + "learning_rate": 7.886243470635685e-05, + "loss": 1.8072, + "step": 10590 + }, + { + "epoch": 3.250767341927563, + "grad_norm": 0.32370296120643616, + "learning_rate": 7.885837576165478e-05, + "loss": 1.802, + "step": 10591 + }, + { + "epoch": 3.2510742786985882, + "grad_norm": 0.3506374955177307, + "learning_rate": 7.88543165317573e-05, + "loss": 1.7965, + "step": 10592 + }, + { + "epoch": 3.251381215469613, + "grad_norm": 0.39058688282966614, + "learning_rate": 7.885025701670457e-05, + "loss": 1.7987, + "step": 10593 + }, + { + "epoch": 3.2516881522406385, + "grad_norm": 0.3042154014110565, + "learning_rate": 7.884619721653669e-05, + "loss": 1.8345, + "step": 10594 + }, + { + "epoch": 3.251995089011664, + "grad_norm": 0.2249498963356018, + "learning_rate": 7.884213713129378e-05, + "loss": 1.7796, + "step": 10595 + }, + { + "epoch": 3.2523020257826887, + "grad_norm": 0.2701997458934784, + "learning_rate": 7.883807676101595e-05, + "loss": 1.8027, + "step": 10596 + }, + { + "epoch": 3.252608962553714, + "grad_norm": 0.2574785053730011, + "learning_rate": 7.883401610574336e-05, + "loss": 1.7878, + "step": 10597 + }, + { + "epoch": 3.252915899324739, + "grad_norm": 0.24964739382266998, + "learning_rate": 7.882995516551613e-05, + "loss": 1.7612, + "step": 10598 + }, + { + "epoch": 3.2532228360957642, + "grad_norm": 0.2519865930080414, + "learning_rate": 7.882589394037437e-05, + "loss": 1.7583, + "step": 10599 + }, + { + "epoch": 3.2535297728667896, + "grad_norm": 0.23174463212490082, + "learning_rate": 7.882183243035823e-05, + "loss": 1.7607, + "step": 10600 + }, + { + "epoch": 3.2538367096378145, + "grad_norm": 0.28103554248809814, + "learning_rate": 7.881777063550786e-05, + "loss": 1.904, + "step": 10601 + }, + { + 
"epoch": 3.25414364640884, + "grad_norm": 0.265677809715271, + "learning_rate": 7.881370855586339e-05, + "loss": 1.8169, + "step": 10602 + }, + { + "epoch": 3.254450583179865, + "grad_norm": 0.2539603114128113, + "learning_rate": 7.880964619146493e-05, + "loss": 1.8439, + "step": 10603 + }, + { + "epoch": 3.25475751995089, + "grad_norm": 0.2741886377334595, + "learning_rate": 7.88055835423527e-05, + "loss": 1.8737, + "step": 10604 + }, + { + "epoch": 3.2550644567219154, + "grad_norm": 0.27548348903656006, + "learning_rate": 7.88015206085668e-05, + "loss": 1.8385, + "step": 10605 + }, + { + "epoch": 3.2553713934929407, + "grad_norm": 0.2958502769470215, + "learning_rate": 7.879745739014739e-05, + "loss": 1.8603, + "step": 10606 + }, + { + "epoch": 3.2556783302639656, + "grad_norm": 0.2728644907474518, + "learning_rate": 7.879339388713462e-05, + "loss": 1.8, + "step": 10607 + }, + { + "epoch": 3.255985267034991, + "grad_norm": 0.28718289732933044, + "learning_rate": 7.878933009956866e-05, + "loss": 1.7803, + "step": 10608 + }, + { + "epoch": 3.256292203806016, + "grad_norm": 0.2989691197872162, + "learning_rate": 7.878526602748967e-05, + "loss": 1.8155, + "step": 10609 + }, + { + "epoch": 3.256599140577041, + "grad_norm": 0.24515527486801147, + "learning_rate": 7.87812016709378e-05, + "loss": 1.7623, + "step": 10610 + }, + { + "epoch": 3.2569060773480665, + "grad_norm": 0.29946041107177734, + "learning_rate": 7.877713702995324e-05, + "loss": 1.8097, + "step": 10611 + }, + { + "epoch": 3.2572130141190914, + "grad_norm": 0.2854483723640442, + "learning_rate": 7.877307210457613e-05, + "loss": 1.8088, + "step": 10612 + }, + { + "epoch": 3.2575199508901167, + "grad_norm": 0.27812930941581726, + "learning_rate": 7.876900689484668e-05, + "loss": 1.8151, + "step": 10613 + }, + { + "epoch": 3.2578268876611416, + "grad_norm": 0.2658015787601471, + "learning_rate": 7.876494140080503e-05, + "loss": 1.8314, + "step": 10614 + }, + { + "epoch": 3.258133824432167, + "grad_norm": 
0.28935661911964417, + "learning_rate": 7.876087562249137e-05, + "loss": 1.7948, + "step": 10615 + }, + { + "epoch": 3.2584407612031923, + "grad_norm": 0.27497121691703796, + "learning_rate": 7.875680955994587e-05, + "loss": 1.7964, + "step": 10616 + }, + { + "epoch": 3.258747697974217, + "grad_norm": 0.3313405513763428, + "learning_rate": 7.875274321320873e-05, + "loss": 1.8143, + "step": 10617 + }, + { + "epoch": 3.2590546347452425, + "grad_norm": 0.3217218816280365, + "learning_rate": 7.874867658232013e-05, + "loss": 1.7749, + "step": 10618 + }, + { + "epoch": 3.259361571516268, + "grad_norm": 0.25105544924736023, + "learning_rate": 7.874460966732025e-05, + "loss": 1.7834, + "step": 10619 + }, + { + "epoch": 3.2596685082872927, + "grad_norm": 0.2931382358074188, + "learning_rate": 7.874054246824931e-05, + "loss": 1.8252, + "step": 10620 + }, + { + "epoch": 3.259975445058318, + "grad_norm": 0.2803363502025604, + "learning_rate": 7.873647498514747e-05, + "loss": 1.7527, + "step": 10621 + }, + { + "epoch": 3.2602823818293434, + "grad_norm": 0.29857927560806274, + "learning_rate": 7.873240721805492e-05, + "loss": 1.8085, + "step": 10622 + }, + { + "epoch": 3.2605893186003683, + "grad_norm": 0.24864110350608826, + "learning_rate": 7.872833916701192e-05, + "loss": 1.7509, + "step": 10623 + }, + { + "epoch": 3.2608962553713936, + "grad_norm": 0.24105949699878693, + "learning_rate": 7.872427083205862e-05, + "loss": 1.7871, + "step": 10624 + }, + { + "epoch": 3.2612031921424185, + "grad_norm": 0.2429245114326477, + "learning_rate": 7.872020221323523e-05, + "loss": 1.777, + "step": 10625 + }, + { + "epoch": 3.261510128913444, + "grad_norm": 0.234287828207016, + "learning_rate": 7.871613331058197e-05, + "loss": 1.8001, + "step": 10626 + }, + { + "epoch": 3.261817065684469, + "grad_norm": 0.3463406264781952, + "learning_rate": 7.871206412413905e-05, + "loss": 1.8925, + "step": 10627 + }, + { + "epoch": 3.262124002455494, + "grad_norm": 0.26798921823501587, + 
"learning_rate": 7.87079946539467e-05, + "loss": 1.7963, + "step": 10628 + }, + { + "epoch": 3.2624309392265194, + "grad_norm": 0.28603312373161316, + "learning_rate": 7.87039249000451e-05, + "loss": 1.8308, + "step": 10629 + }, + { + "epoch": 3.2627378759975443, + "grad_norm": 0.2717527747154236, + "learning_rate": 7.86998548624745e-05, + "loss": 1.8246, + "step": 10630 + }, + { + "epoch": 3.2630448127685696, + "grad_norm": 0.32215580344200134, + "learning_rate": 7.86957845412751e-05, + "loss": 1.7278, + "step": 10631 + }, + { + "epoch": 3.263351749539595, + "grad_norm": 0.3578735589981079, + "learning_rate": 7.869171393648717e-05, + "loss": 1.7288, + "step": 10632 + }, + { + "epoch": 3.26365868631062, + "grad_norm": 0.3120707869529724, + "learning_rate": 7.868764304815089e-05, + "loss": 1.7971, + "step": 10633 + }, + { + "epoch": 3.263965623081645, + "grad_norm": 0.27419236302375793, + "learning_rate": 7.86835718763065e-05, + "loss": 1.8529, + "step": 10634 + }, + { + "epoch": 3.2642725598526705, + "grad_norm": 0.3200531601905823, + "learning_rate": 7.867950042099423e-05, + "loss": 1.7892, + "step": 10635 + }, + { + "epoch": 3.2645794966236954, + "grad_norm": 0.325706422328949, + "learning_rate": 7.867542868225435e-05, + "loss": 1.8236, + "step": 10636 + }, + { + "epoch": 3.2648864333947207, + "grad_norm": 0.2950136065483093, + "learning_rate": 7.867135666012707e-05, + "loss": 1.8163, + "step": 10637 + }, + { + "epoch": 3.265193370165746, + "grad_norm": 0.2772117257118225, + "learning_rate": 7.866728435465263e-05, + "loss": 1.8373, + "step": 10638 + }, + { + "epoch": 3.265500306936771, + "grad_norm": 0.2887401580810547, + "learning_rate": 7.866321176587129e-05, + "loss": 1.7756, + "step": 10639 + }, + { + "epoch": 3.2658072437077963, + "grad_norm": 0.3474489152431488, + "learning_rate": 7.865913889382329e-05, + "loss": 1.7539, + "step": 10640 + }, + { + "epoch": 3.266114180478821, + "grad_norm": 0.3433493971824646, + "learning_rate": 7.865506573854888e-05, + 
"loss": 1.7987, + "step": 10641 + }, + { + "epoch": 3.2664211172498465, + "grad_norm": 0.3075394630432129, + "learning_rate": 7.865099230008832e-05, + "loss": 1.7907, + "step": 10642 + }, + { + "epoch": 3.266728054020872, + "grad_norm": 0.24817697703838348, + "learning_rate": 7.864691857848187e-05, + "loss": 1.7941, + "step": 10643 + }, + { + "epoch": 3.2670349907918967, + "grad_norm": 0.290147602558136, + "learning_rate": 7.864284457376976e-05, + "loss": 1.9125, + "step": 10644 + }, + { + "epoch": 3.267341927562922, + "grad_norm": 0.253684937953949, + "learning_rate": 7.863877028599229e-05, + "loss": 1.8084, + "step": 10645 + }, + { + "epoch": 3.267648864333947, + "grad_norm": 0.26349252462387085, + "learning_rate": 7.863469571518969e-05, + "loss": 1.7548, + "step": 10646 + }, + { + "epoch": 3.2679558011049723, + "grad_norm": 0.30568864941596985, + "learning_rate": 7.863062086140224e-05, + "loss": 1.8551, + "step": 10647 + }, + { + "epoch": 3.2682627378759976, + "grad_norm": 0.2866690456867218, + "learning_rate": 7.862654572467024e-05, + "loss": 1.8145, + "step": 10648 + }, + { + "epoch": 3.2685696746470225, + "grad_norm": 0.32022854685783386, + "learning_rate": 7.862247030503391e-05, + "loss": 1.896, + "step": 10649 + }, + { + "epoch": 3.268876611418048, + "grad_norm": 0.25260284543037415, + "learning_rate": 7.861839460253356e-05, + "loss": 1.814, + "step": 10650 + }, + { + "epoch": 3.269183548189073, + "grad_norm": 0.26776066422462463, + "learning_rate": 7.861431861720947e-05, + "loss": 1.7755, + "step": 10651 + }, + { + "epoch": 3.269490484960098, + "grad_norm": 0.26514193415641785, + "learning_rate": 7.861024234910191e-05, + "loss": 1.7606, + "step": 10652 + }, + { + "epoch": 3.2697974217311234, + "grad_norm": 0.27213940024375916, + "learning_rate": 7.860616579825116e-05, + "loss": 1.8074, + "step": 10653 + }, + { + "epoch": 3.2701043585021488, + "grad_norm": 0.29192888736724854, + "learning_rate": 7.860208896469752e-05, + "loss": 1.8436, + "step": 10654 + }, 
+ { + "epoch": 3.2704112952731736, + "grad_norm": 0.3772370219230652, + "learning_rate": 7.859801184848127e-05, + "loss": 1.8096, + "step": 10655 + }, + { + "epoch": 3.270718232044199, + "grad_norm": 0.4574970006942749, + "learning_rate": 7.859393444964269e-05, + "loss": 1.7612, + "step": 10656 + }, + { + "epoch": 3.271025168815224, + "grad_norm": 0.4614393413066864, + "learning_rate": 7.858985676822211e-05, + "loss": 1.8529, + "step": 10657 + }, + { + "epoch": 3.271332105586249, + "grad_norm": 0.33567267656326294, + "learning_rate": 7.85857788042598e-05, + "loss": 1.8391, + "step": 10658 + }, + { + "epoch": 3.2716390423572745, + "grad_norm": 0.2564064860343933, + "learning_rate": 7.858170055779609e-05, + "loss": 1.7621, + "step": 10659 + }, + { + "epoch": 3.2719459791282994, + "grad_norm": 0.26769882440567017, + "learning_rate": 7.857762202887122e-05, + "loss": 1.8145, + "step": 10660 + }, + { + "epoch": 3.2722529158993248, + "grad_norm": 0.262008935213089, + "learning_rate": 7.857354321752558e-05, + "loss": 1.7513, + "step": 10661 + }, + { + "epoch": 3.27255985267035, + "grad_norm": 0.26494377851486206, + "learning_rate": 7.856946412379942e-05, + "loss": 1.8071, + "step": 10662 + }, + { + "epoch": 3.272866789441375, + "grad_norm": 0.25613999366760254, + "learning_rate": 7.856538474773307e-05, + "loss": 1.8775, + "step": 10663 + }, + { + "epoch": 3.2731737262124003, + "grad_norm": 0.24789929389953613, + "learning_rate": 7.856130508936684e-05, + "loss": 1.8055, + "step": 10664 + }, + { + "epoch": 3.2734806629834257, + "grad_norm": 0.29111939668655396, + "learning_rate": 7.855722514874107e-05, + "loss": 1.8114, + "step": 10665 + }, + { + "epoch": 3.2737875997544506, + "grad_norm": 0.30511030554771423, + "learning_rate": 7.855314492589605e-05, + "loss": 1.8131, + "step": 10666 + }, + { + "epoch": 3.274094536525476, + "grad_norm": 0.2545989453792572, + "learning_rate": 7.854906442087212e-05, + "loss": 1.7933, + "step": 10667 + }, + { + "epoch": 3.2744014732965008, + 
"grad_norm": 0.26684823632240295, + "learning_rate": 7.85449836337096e-05, + "loss": 1.7604, + "step": 10668 + }, + { + "epoch": 3.274708410067526, + "grad_norm": 0.5097808837890625, + "learning_rate": 7.854090256444881e-05, + "loss": 1.777, + "step": 10669 + }, + { + "epoch": 3.2750153468385514, + "grad_norm": 0.27828142046928406, + "learning_rate": 7.853682121313011e-05, + "loss": 1.7885, + "step": 10670 + }, + { + "epoch": 3.2753222836095763, + "grad_norm": 0.2925552725791931, + "learning_rate": 7.853273957979381e-05, + "loss": 1.7962, + "step": 10671 + }, + { + "epoch": 3.2756292203806017, + "grad_norm": 0.284574955701828, + "learning_rate": 7.852865766448025e-05, + "loss": 1.8645, + "step": 10672 + }, + { + "epoch": 3.2759361571516266, + "grad_norm": 0.23407664895057678, + "learning_rate": 7.85245754672298e-05, + "loss": 1.7106, + "step": 10673 + }, + { + "epoch": 3.276243093922652, + "grad_norm": 0.2555919885635376, + "learning_rate": 7.852049298808274e-05, + "loss": 1.8237, + "step": 10674 + }, + { + "epoch": 3.2765500306936772, + "grad_norm": 0.26703694462776184, + "learning_rate": 7.851641022707947e-05, + "loss": 1.7844, + "step": 10675 + }, + { + "epoch": 3.276856967464702, + "grad_norm": 0.24889135360717773, + "learning_rate": 7.851232718426033e-05, + "loss": 1.7783, + "step": 10676 + }, + { + "epoch": 3.2771639042357275, + "grad_norm": 0.25770726799964905, + "learning_rate": 7.850824385966564e-05, + "loss": 1.8007, + "step": 10677 + }, + { + "epoch": 3.277470841006753, + "grad_norm": 0.31806984543800354, + "learning_rate": 7.850416025333578e-05, + "loss": 1.8623, + "step": 10678 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.2906930148601532, + "learning_rate": 7.850007636531111e-05, + "loss": 1.8315, + "step": 10679 + }, + { + "epoch": 3.278084714548803, + "grad_norm": 0.2802525460720062, + "learning_rate": 7.849599219563197e-05, + "loss": 1.8488, + "step": 10680 + }, + { + "epoch": 3.2783916513198283, + "grad_norm": 0.26150405406951904, + 
"learning_rate": 7.849190774433874e-05, + "loss": 1.7967, + "step": 10681 + }, + { + "epoch": 3.2786985880908532, + "grad_norm": 0.25863370299339294, + "learning_rate": 7.848782301147178e-05, + "loss": 1.864, + "step": 10682 + }, + { + "epoch": 3.2790055248618786, + "grad_norm": 0.25381043553352356, + "learning_rate": 7.848373799707145e-05, + "loss": 1.8239, + "step": 10683 + }, + { + "epoch": 3.2793124616329035, + "grad_norm": 0.2583387792110443, + "learning_rate": 7.847965270117814e-05, + "loss": 1.8449, + "step": 10684 + }, + { + "epoch": 3.279619398403929, + "grad_norm": 0.30759841203689575, + "learning_rate": 7.84755671238322e-05, + "loss": 1.7992, + "step": 10685 + }, + { + "epoch": 3.279926335174954, + "grad_norm": 0.4316023588180542, + "learning_rate": 7.847148126507402e-05, + "loss": 1.7912, + "step": 10686 + }, + { + "epoch": 3.280233271945979, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.846739512494396e-05, + "loss": 1.8831, + "step": 10687 + }, + { + "epoch": 3.2805402087170044, + "grad_norm": 0.318934828042984, + "learning_rate": 7.846330870348244e-05, + "loss": 1.8411, + "step": 10688 + }, + { + "epoch": 3.2808471454880292, + "grad_norm": 0.27755632996559143, + "learning_rate": 7.84592220007298e-05, + "loss": 1.8763, + "step": 10689 + }, + { + "epoch": 3.2811540822590546, + "grad_norm": 0.33544883131980896, + "learning_rate": 7.845513501672646e-05, + "loss": 1.731, + "step": 10690 + }, + { + "epoch": 3.28146101903008, + "grad_norm": 0.28299057483673096, + "learning_rate": 7.845104775151278e-05, + "loss": 1.813, + "step": 10691 + }, + { + "epoch": 3.281767955801105, + "grad_norm": 0.2761382460594177, + "learning_rate": 7.844696020512918e-05, + "loss": 1.8018, + "step": 10692 + }, + { + "epoch": 3.28207489257213, + "grad_norm": 0.2919033169746399, + "learning_rate": 7.844287237761605e-05, + "loss": 1.793, + "step": 10693 + }, + { + "epoch": 3.2823818293431555, + "grad_norm": 0.32922014594078064, + "learning_rate": 7.843878426901378e-05, + 
"loss": 1.8186, + "step": 10694 + }, + { + "epoch": 3.2826887661141804, + "grad_norm": 0.2818562090396881, + "learning_rate": 7.843469587936279e-05, + "loss": 1.7794, + "step": 10695 + }, + { + "epoch": 3.2829957028852057, + "grad_norm": 0.26414254307746887, + "learning_rate": 7.843060720870345e-05, + "loss": 1.7854, + "step": 10696 + }, + { + "epoch": 3.283302639656231, + "grad_norm": 0.28345760703086853, + "learning_rate": 7.842651825707618e-05, + "loss": 1.7659, + "step": 10697 + }, + { + "epoch": 3.283609576427256, + "grad_norm": 0.3522340655326843, + "learning_rate": 7.842242902452141e-05, + "loss": 1.8427, + "step": 10698 + }, + { + "epoch": 3.2839165131982813, + "grad_norm": 0.2861590087413788, + "learning_rate": 7.841833951107954e-05, + "loss": 1.7539, + "step": 10699 + }, + { + "epoch": 3.284223449969306, + "grad_norm": 0.2596624493598938, + "learning_rate": 7.841424971679099e-05, + "loss": 1.8407, + "step": 10700 + }, + { + "epoch": 3.2845303867403315, + "grad_norm": 0.2847718298435211, + "learning_rate": 7.841015964169616e-05, + "loss": 1.8085, + "step": 10701 + }, + { + "epoch": 3.284837323511357, + "grad_norm": 0.29566115140914917, + "learning_rate": 7.840606928583547e-05, + "loss": 1.7873, + "step": 10702 + }, + { + "epoch": 3.2851442602823817, + "grad_norm": 0.2752111256122589, + "learning_rate": 7.840197864924936e-05, + "loss": 1.8186, + "step": 10703 + }, + { + "epoch": 3.285451197053407, + "grad_norm": 0.2907958924770355, + "learning_rate": 7.839788773197826e-05, + "loss": 1.8081, + "step": 10704 + }, + { + "epoch": 3.285758133824432, + "grad_norm": 0.25808724761009216, + "learning_rate": 7.839379653406258e-05, + "loss": 1.7635, + "step": 10705 + }, + { + "epoch": 3.2860650705954573, + "grad_norm": 0.2732730507850647, + "learning_rate": 7.838970505554277e-05, + "loss": 1.8061, + "step": 10706 + }, + { + "epoch": 3.2863720073664826, + "grad_norm": 0.23820067942142487, + "learning_rate": 7.838561329645923e-05, + "loss": 1.8091, + "step": 10707 + }, 
+ { + "epoch": 3.2866789441375075, + "grad_norm": 0.24179396033287048, + "learning_rate": 7.838152125685245e-05, + "loss": 1.7513, + "step": 10708 + }, + { + "epoch": 3.286985880908533, + "grad_norm": 0.2627546787261963, + "learning_rate": 7.837742893676283e-05, + "loss": 1.8741, + "step": 10709 + }, + { + "epoch": 3.287292817679558, + "grad_norm": 0.2827817499637604, + "learning_rate": 7.837333633623083e-05, + "loss": 1.8387, + "step": 10710 + }, + { + "epoch": 3.287599754450583, + "grad_norm": 0.2666749060153961, + "learning_rate": 7.836924345529688e-05, + "loss": 1.8319, + "step": 10711 + }, + { + "epoch": 3.2879066912216084, + "grad_norm": 0.3403390944004059, + "learning_rate": 7.836515029400145e-05, + "loss": 1.7827, + "step": 10712 + }, + { + "epoch": 3.2882136279926337, + "grad_norm": 0.30646705627441406, + "learning_rate": 7.836105685238497e-05, + "loss": 1.8612, + "step": 10713 + }, + { + "epoch": 3.2885205647636586, + "grad_norm": 0.2580253481864929, + "learning_rate": 7.83569631304879e-05, + "loss": 1.7332, + "step": 10714 + }, + { + "epoch": 3.288827501534684, + "grad_norm": 0.23734542727470398, + "learning_rate": 7.835286912835071e-05, + "loss": 1.7899, + "step": 10715 + }, + { + "epoch": 3.289134438305709, + "grad_norm": 0.2457810491323471, + "learning_rate": 7.834877484601384e-05, + "loss": 1.8059, + "step": 10716 + }, + { + "epoch": 3.289441375076734, + "grad_norm": 0.2558443248271942, + "learning_rate": 7.834468028351778e-05, + "loss": 1.8689, + "step": 10717 + }, + { + "epoch": 3.2897483118477595, + "grad_norm": 0.26596710085868835, + "learning_rate": 7.834058544090298e-05, + "loss": 1.816, + "step": 10718 + }, + { + "epoch": 3.2900552486187844, + "grad_norm": 0.25424903631210327, + "learning_rate": 7.833649031820987e-05, + "loss": 1.7907, + "step": 10719 + }, + { + "epoch": 3.2903621853898097, + "grad_norm": 0.23873139917850494, + "learning_rate": 7.833239491547896e-05, + "loss": 1.7666, + "step": 10720 + }, + { + "epoch": 3.2906691221608346, + 
"grad_norm": 0.23292972147464752, + "learning_rate": 7.832829923275073e-05, + "loss": 1.7674, + "step": 10721 + }, + { + "epoch": 3.29097605893186, + "grad_norm": 0.30133312940597534, + "learning_rate": 7.832420327006566e-05, + "loss": 1.8229, + "step": 10722 + }, + { + "epoch": 3.2912829957028853, + "grad_norm": 0.2882522642612457, + "learning_rate": 7.83201070274642e-05, + "loss": 1.7855, + "step": 10723 + }, + { + "epoch": 3.29158993247391, + "grad_norm": 0.2578088045120239, + "learning_rate": 7.831601050498683e-05, + "loss": 1.7276, + "step": 10724 + }, + { + "epoch": 3.2918968692449355, + "grad_norm": 0.29511600732803345, + "learning_rate": 7.831191370267406e-05, + "loss": 1.8085, + "step": 10725 + }, + { + "epoch": 3.292203806015961, + "grad_norm": 0.29557499289512634, + "learning_rate": 7.830781662056634e-05, + "loss": 1.815, + "step": 10726 + }, + { + "epoch": 3.2925107427869857, + "grad_norm": 0.32722121477127075, + "learning_rate": 7.830371925870422e-05, + "loss": 1.7889, + "step": 10727 + }, + { + "epoch": 3.292817679558011, + "grad_norm": 0.3124488592147827, + "learning_rate": 7.829962161712814e-05, + "loss": 1.8063, + "step": 10728 + }, + { + "epoch": 3.2931246163290364, + "grad_norm": 0.311334490776062, + "learning_rate": 7.829552369587861e-05, + "loss": 1.8852, + "step": 10729 + }, + { + "epoch": 3.2934315531000613, + "grad_norm": 0.28010860085487366, + "learning_rate": 7.829142549499613e-05, + "loss": 1.8274, + "step": 10730 + }, + { + "epoch": 3.2937384898710866, + "grad_norm": 0.3453529477119446, + "learning_rate": 7.828732701452119e-05, + "loss": 1.8618, + "step": 10731 + }, + { + "epoch": 3.2940454266421115, + "grad_norm": 0.2946802079677582, + "learning_rate": 7.828322825449432e-05, + "loss": 1.7123, + "step": 10732 + }, + { + "epoch": 3.294352363413137, + "grad_norm": 0.2467648684978485, + "learning_rate": 7.827912921495601e-05, + "loss": 1.7786, + "step": 10733 + }, + { + "epoch": 3.294659300184162, + "grad_norm": 0.2957034707069397, + 
"learning_rate": 7.827502989594677e-05, + "loss": 1.7817, + "step": 10734 + }, + { + "epoch": 3.294966236955187, + "grad_norm": 0.300905704498291, + "learning_rate": 7.827093029750713e-05, + "loss": 1.7582, + "step": 10735 + }, + { + "epoch": 3.2952731737262124, + "grad_norm": 0.28935131430625916, + "learning_rate": 7.826683041967757e-05, + "loss": 1.7766, + "step": 10736 + }, + { + "epoch": 3.2955801104972378, + "grad_norm": 0.26046010851860046, + "learning_rate": 7.826273026249861e-05, + "loss": 1.8152, + "step": 10737 + }, + { + "epoch": 3.2958870472682626, + "grad_norm": 0.24247924983501434, + "learning_rate": 7.82586298260108e-05, + "loss": 1.8679, + "step": 10738 + }, + { + "epoch": 3.296193984039288, + "grad_norm": 0.25977620482444763, + "learning_rate": 7.825452911025466e-05, + "loss": 1.8108, + "step": 10739 + }, + { + "epoch": 3.2965009208103133, + "grad_norm": 0.2732592821121216, + "learning_rate": 7.825042811527068e-05, + "loss": 1.7355, + "step": 10740 + }, + { + "epoch": 3.296807857581338, + "grad_norm": 0.38407859206199646, + "learning_rate": 7.824632684109941e-05, + "loss": 1.8418, + "step": 10741 + }, + { + "epoch": 3.2971147943523635, + "grad_norm": 0.4239252805709839, + "learning_rate": 7.82422252877814e-05, + "loss": 1.7655, + "step": 10742 + }, + { + "epoch": 3.2974217311233884, + "grad_norm": 0.3810526132583618, + "learning_rate": 7.823812345535716e-05, + "loss": 1.8804, + "step": 10743 + }, + { + "epoch": 3.2977286678944138, + "grad_norm": 0.29939520359039307, + "learning_rate": 7.823402134386722e-05, + "loss": 1.8207, + "step": 10744 + }, + { + "epoch": 3.298035604665439, + "grad_norm": 0.4053972065448761, + "learning_rate": 7.822991895335215e-05, + "loss": 1.7901, + "step": 10745 + }, + { + "epoch": 3.298342541436464, + "grad_norm": 0.4975005090236664, + "learning_rate": 7.822581628385247e-05, + "loss": 1.8344, + "step": 10746 + }, + { + "epoch": 3.2986494782074893, + "grad_norm": 0.4100436270236969, + "learning_rate": 
7.822171333540874e-05, + "loss": 1.7891, + "step": 10747 + }, + { + "epoch": 3.298956414978514, + "grad_norm": 0.2817644476890564, + "learning_rate": 7.821761010806147e-05, + "loss": 1.7895, + "step": 10748 + }, + { + "epoch": 3.2992633517495396, + "grad_norm": 0.332660973072052, + "learning_rate": 7.821350660185125e-05, + "loss": 1.7281, + "step": 10749 + }, + { + "epoch": 3.299570288520565, + "grad_norm": 0.42652732133865356, + "learning_rate": 7.820940281681863e-05, + "loss": 1.7855, + "step": 10750 + }, + { + "epoch": 3.2998772252915898, + "grad_norm": 0.35700714588165283, + "learning_rate": 7.820529875300415e-05, + "loss": 1.8722, + "step": 10751 + }, + { + "epoch": 3.300184162062615, + "grad_norm": 0.25305211544036865, + "learning_rate": 7.820119441044838e-05, + "loss": 1.7696, + "step": 10752 + }, + { + "epoch": 3.3004910988336404, + "grad_norm": 0.280205637216568, + "learning_rate": 7.819708978919188e-05, + "loss": 1.756, + "step": 10753 + }, + { + "epoch": 3.3007980356046653, + "grad_norm": 0.4176226854324341, + "learning_rate": 7.819298488927521e-05, + "loss": 1.7731, + "step": 10754 + }, + { + "epoch": 3.3011049723756907, + "grad_norm": 0.4264865517616272, + "learning_rate": 7.818887971073894e-05, + "loss": 1.7851, + "step": 10755 + }, + { + "epoch": 3.301411909146716, + "grad_norm": 0.2901221215724945, + "learning_rate": 7.818477425362363e-05, + "loss": 1.7356, + "step": 10756 + }, + { + "epoch": 3.301718845917741, + "grad_norm": 0.29583361744880676, + "learning_rate": 7.818066851796986e-05, + "loss": 1.8269, + "step": 10757 + }, + { + "epoch": 3.3020257826887662, + "grad_norm": 0.38592997193336487, + "learning_rate": 7.817656250381821e-05, + "loss": 1.7515, + "step": 10758 + }, + { + "epoch": 3.302332719459791, + "grad_norm": 0.29301533102989197, + "learning_rate": 7.817245621120927e-05, + "loss": 1.7955, + "step": 10759 + }, + { + "epoch": 3.3026396562308165, + "grad_norm": 0.2770880162715912, + "learning_rate": 7.816834964018359e-05, + "loss": 
1.7899, + "step": 10760 + }, + { + "epoch": 3.302946593001842, + "grad_norm": 0.32566413283348083, + "learning_rate": 7.816424279078176e-05, + "loss": 1.74, + "step": 10761 + }, + { + "epoch": 3.3032535297728667, + "grad_norm": 0.3077750504016876, + "learning_rate": 7.81601356630444e-05, + "loss": 1.8123, + "step": 10762 + }, + { + "epoch": 3.303560466543892, + "grad_norm": 0.2826370298862457, + "learning_rate": 7.815602825701206e-05, + "loss": 1.865, + "step": 10763 + }, + { + "epoch": 3.303867403314917, + "grad_norm": 0.31700822710990906, + "learning_rate": 7.815192057272534e-05, + "loss": 1.8021, + "step": 10764 + }, + { + "epoch": 3.3041743400859422, + "grad_norm": 0.33182790875434875, + "learning_rate": 7.814781261022486e-05, + "loss": 1.818, + "step": 10765 + }, + { + "epoch": 3.3044812768569676, + "grad_norm": 0.2720039486885071, + "learning_rate": 7.814370436955118e-05, + "loss": 1.8369, + "step": 10766 + }, + { + "epoch": 3.3047882136279925, + "grad_norm": 0.28134068846702576, + "learning_rate": 7.813959585074493e-05, + "loss": 1.8391, + "step": 10767 + }, + { + "epoch": 3.305095150399018, + "grad_norm": 0.25748828053474426, + "learning_rate": 7.813548705384667e-05, + "loss": 1.7987, + "step": 10768 + }, + { + "epoch": 3.305402087170043, + "grad_norm": 0.26187625527381897, + "learning_rate": 7.813137797889708e-05, + "loss": 1.7645, + "step": 10769 + }, + { + "epoch": 3.305709023941068, + "grad_norm": 0.297262579202652, + "learning_rate": 7.812726862593671e-05, + "loss": 1.771, + "step": 10770 + }, + { + "epoch": 3.3060159607120934, + "grad_norm": 0.2987872064113617, + "learning_rate": 7.812315899500618e-05, + "loss": 1.8115, + "step": 10771 + }, + { + "epoch": 3.3063228974831187, + "grad_norm": 0.31963878870010376, + "learning_rate": 7.81190490861461e-05, + "loss": 1.7685, + "step": 10772 + }, + { + "epoch": 3.3066298342541436, + "grad_norm": 0.27007177472114563, + "learning_rate": 7.81149388993971e-05, + "loss": 1.8272, + "step": 10773 + }, + { + "epoch": 
3.306936771025169, + "grad_norm": 0.26818498969078064, + "learning_rate": 7.811082843479981e-05, + "loss": 1.7894, + "step": 10774 + }, + { + "epoch": 3.307243707796194, + "grad_norm": 0.28857091069221497, + "learning_rate": 7.810671769239483e-05, + "loss": 1.8769, + "step": 10775 + }, + { + "epoch": 3.307550644567219, + "grad_norm": 0.26983144879341125, + "learning_rate": 7.810260667222277e-05, + "loss": 1.796, + "step": 10776 + }, + { + "epoch": 3.3078575813382445, + "grad_norm": 0.2566467225551605, + "learning_rate": 7.809849537432432e-05, + "loss": 1.848, + "step": 10777 + }, + { + "epoch": 3.3081645181092694, + "grad_norm": 0.25607848167419434, + "learning_rate": 7.809438379874005e-05, + "loss": 1.8072, + "step": 10778 + }, + { + "epoch": 3.3084714548802947, + "grad_norm": 0.29158470034599304, + "learning_rate": 7.809027194551059e-05, + "loss": 1.7772, + "step": 10779 + }, + { + "epoch": 3.3087783916513196, + "grad_norm": 0.360897421836853, + "learning_rate": 7.808615981467664e-05, + "loss": 1.8404, + "step": 10780 + }, + { + "epoch": 3.309085328422345, + "grad_norm": 0.31121253967285156, + "learning_rate": 7.808204740627877e-05, + "loss": 1.8137, + "step": 10781 + }, + { + "epoch": 3.3093922651933703, + "grad_norm": 0.2846451699733734, + "learning_rate": 7.807793472035765e-05, + "loss": 1.8367, + "step": 10782 + }, + { + "epoch": 3.309699201964395, + "grad_norm": 0.2711004316806793, + "learning_rate": 7.807382175695393e-05, + "loss": 1.7728, + "step": 10783 + }, + { + "epoch": 3.3100061387354205, + "grad_norm": 0.2693859338760376, + "learning_rate": 7.806970851610824e-05, + "loss": 1.7026, + "step": 10784 + }, + { + "epoch": 3.310313075506446, + "grad_norm": 0.3050517439842224, + "learning_rate": 7.806559499786125e-05, + "loss": 1.8041, + "step": 10785 + }, + { + "epoch": 3.3106200122774707, + "grad_norm": 0.27304747700691223, + "learning_rate": 7.80614812022536e-05, + "loss": 1.8182, + "step": 10786 + }, + { + "epoch": 3.310926949048496, + "grad_norm": 
0.28378555178642273, + "learning_rate": 7.805736712932594e-05, + "loss": 1.8519, + "step": 10787 + }, + { + "epoch": 3.3112338858195214, + "grad_norm": 0.30620133876800537, + "learning_rate": 7.805325277911892e-05, + "loss": 1.8594, + "step": 10788 + }, + { + "epoch": 3.3115408225905463, + "grad_norm": 0.2580169141292572, + "learning_rate": 7.804913815167325e-05, + "loss": 1.7897, + "step": 10789 + }, + { + "epoch": 3.3118477593615716, + "grad_norm": 0.28937023878097534, + "learning_rate": 7.804502324702951e-05, + "loss": 1.8362, + "step": 10790 + }, + { + "epoch": 3.3121546961325965, + "grad_norm": 0.28032705187797546, + "learning_rate": 7.804090806522844e-05, + "loss": 1.8168, + "step": 10791 + }, + { + "epoch": 3.312461632903622, + "grad_norm": 0.33712559938430786, + "learning_rate": 7.803679260631069e-05, + "loss": 1.7489, + "step": 10792 + }, + { + "epoch": 3.312768569674647, + "grad_norm": 0.40536820888519287, + "learning_rate": 7.80326768703169e-05, + "loss": 1.8413, + "step": 10793 + }, + { + "epoch": 3.313075506445672, + "grad_norm": 0.34967559576034546, + "learning_rate": 7.802856085728778e-05, + "loss": 1.8076, + "step": 10794 + }, + { + "epoch": 3.3133824432166974, + "grad_norm": 0.2429870367050171, + "learning_rate": 7.8024444567264e-05, + "loss": 1.8002, + "step": 10795 + }, + { + "epoch": 3.3136893799877223, + "grad_norm": 0.40956684947013855, + "learning_rate": 7.802032800028621e-05, + "loss": 1.8151, + "step": 10796 + }, + { + "epoch": 3.3139963167587476, + "grad_norm": 0.4908781945705414, + "learning_rate": 7.801621115639512e-05, + "loss": 1.8124, + "step": 10797 + }, + { + "epoch": 3.314303253529773, + "grad_norm": 0.3922197222709656, + "learning_rate": 7.801209403563143e-05, + "loss": 1.7911, + "step": 10798 + }, + { + "epoch": 3.314610190300798, + "grad_norm": 0.29467105865478516, + "learning_rate": 7.800797663803578e-05, + "loss": 1.8472, + "step": 10799 + }, + { + "epoch": 3.314917127071823, + "grad_norm": 0.384974867105484, + 
"learning_rate": 7.800385896364891e-05, + "loss": 1.8139, + "step": 10800 + }, + { + "epoch": 3.3152240638428485, + "grad_norm": 0.4605129063129425, + "learning_rate": 7.79997410125115e-05, + "loss": 1.7982, + "step": 10801 + }, + { + "epoch": 3.3155310006138734, + "grad_norm": 0.2982464134693146, + "learning_rate": 7.799562278466423e-05, + "loss": 1.8496, + "step": 10802 + }, + { + "epoch": 3.3158379373848987, + "grad_norm": 0.3101392984390259, + "learning_rate": 7.79915042801478e-05, + "loss": 1.8172, + "step": 10803 + }, + { + "epoch": 3.316144874155924, + "grad_norm": 0.3651282489299774, + "learning_rate": 7.798738549900292e-05, + "loss": 1.7497, + "step": 10804 + }, + { + "epoch": 3.316451810926949, + "grad_norm": 0.28504419326782227, + "learning_rate": 7.79832664412703e-05, + "loss": 1.8027, + "step": 10805 + }, + { + "epoch": 3.3167587476979743, + "grad_norm": 0.28333309292793274, + "learning_rate": 7.797914710699063e-05, + "loss": 1.8121, + "step": 10806 + }, + { + "epoch": 3.317065684468999, + "grad_norm": 0.37549784779548645, + "learning_rate": 7.797502749620462e-05, + "loss": 1.817, + "step": 10807 + }, + { + "epoch": 3.3173726212400245, + "grad_norm": 0.3864210844039917, + "learning_rate": 7.797090760895301e-05, + "loss": 1.852, + "step": 10808 + }, + { + "epoch": 3.31767955801105, + "grad_norm": 0.2422102987766266, + "learning_rate": 7.79667874452765e-05, + "loss": 1.7523, + "step": 10809 + }, + { + "epoch": 3.3179864947820747, + "grad_norm": 0.307892382144928, + "learning_rate": 7.79626670052158e-05, + "loss": 1.7436, + "step": 10810 + }, + { + "epoch": 3.3182934315531, + "grad_norm": 0.29607462882995605, + "learning_rate": 7.795854628881162e-05, + "loss": 1.768, + "step": 10811 + }, + { + "epoch": 3.3186003683241254, + "grad_norm": 0.23334427177906036, + "learning_rate": 7.795442529610471e-05, + "loss": 1.7687, + "step": 10812 + }, + { + "epoch": 3.3189073050951503, + "grad_norm": 0.26257455348968506, + "learning_rate": 7.795030402713578e-05, + 
"loss": 1.8266, + "step": 10813 + }, + { + "epoch": 3.3192142418661756, + "grad_norm": 0.3252788782119751, + "learning_rate": 7.794618248194556e-05, + "loss": 1.8645, + "step": 10814 + }, + { + "epoch": 3.319521178637201, + "grad_norm": 0.3807232975959778, + "learning_rate": 7.79420606605748e-05, + "loss": 1.8154, + "step": 10815 + }, + { + "epoch": 3.319828115408226, + "grad_norm": 0.3395625948905945, + "learning_rate": 7.793793856306422e-05, + "loss": 1.8002, + "step": 10816 + }, + { + "epoch": 3.320135052179251, + "grad_norm": 0.2896415889263153, + "learning_rate": 7.793381618945455e-05, + "loss": 1.8077, + "step": 10817 + }, + { + "epoch": 3.320441988950276, + "grad_norm": 0.27733489871025085, + "learning_rate": 7.792969353978652e-05, + "loss": 1.7976, + "step": 10818 + }, + { + "epoch": 3.3207489257213014, + "grad_norm": 0.36985141038894653, + "learning_rate": 7.79255706141009e-05, + "loss": 1.8724, + "step": 10819 + }, + { + "epoch": 3.3210558624923268, + "grad_norm": 0.37886983156204224, + "learning_rate": 7.792144741243843e-05, + "loss": 1.8249, + "step": 10820 + }, + { + "epoch": 3.3213627992633517, + "grad_norm": 0.3030721843242645, + "learning_rate": 7.791732393483986e-05, + "loss": 1.7975, + "step": 10821 + }, + { + "epoch": 3.321669736034377, + "grad_norm": 0.2637709081172943, + "learning_rate": 7.791320018134592e-05, + "loss": 1.7205, + "step": 10822 + }, + { + "epoch": 3.321976672805402, + "grad_norm": 0.35307520627975464, + "learning_rate": 7.790907615199736e-05, + "loss": 1.8786, + "step": 10823 + }, + { + "epoch": 3.322283609576427, + "grad_norm": 0.3333272635936737, + "learning_rate": 7.790495184683497e-05, + "loss": 1.7715, + "step": 10824 + }, + { + "epoch": 3.3225905463474525, + "grad_norm": 0.2597469091415405, + "learning_rate": 7.790082726589948e-05, + "loss": 1.8379, + "step": 10825 + }, + { + "epoch": 3.3228974831184774, + "grad_norm": 0.34176257252693176, + "learning_rate": 7.789670240923168e-05, + "loss": 1.8305, + "step": 10826 + }, + { 
+ "epoch": 3.3232044198895028, + "grad_norm": 0.37954533100128174, + "learning_rate": 7.789257727687229e-05, + "loss": 1.7728, + "step": 10827 + }, + { + "epoch": 3.323511356660528, + "grad_norm": 0.2840248644351959, + "learning_rate": 7.788845186886212e-05, + "loss": 1.8059, + "step": 10828 + }, + { + "epoch": 3.323818293431553, + "grad_norm": 0.3650275766849518, + "learning_rate": 7.788432618524193e-05, + "loss": 1.8127, + "step": 10829 + }, + { + "epoch": 3.3241252302025783, + "grad_norm": 0.4869692623615265, + "learning_rate": 7.788020022605247e-05, + "loss": 1.833, + "step": 10830 + }, + { + "epoch": 3.3244321669736037, + "grad_norm": 0.3419482707977295, + "learning_rate": 7.787607399133453e-05, + "loss": 1.7812, + "step": 10831 + }, + { + "epoch": 3.3247391037446286, + "grad_norm": 0.27625617384910583, + "learning_rate": 7.787194748112889e-05, + "loss": 1.8513, + "step": 10832 + }, + { + "epoch": 3.325046040515654, + "grad_norm": 0.4287806749343872, + "learning_rate": 7.786782069547633e-05, + "loss": 1.836, + "step": 10833 + }, + { + "epoch": 3.325352977286679, + "grad_norm": 0.4345545172691345, + "learning_rate": 7.786369363441763e-05, + "loss": 1.8027, + "step": 10834 + }, + { + "epoch": 3.325659914057704, + "grad_norm": 0.32976534962654114, + "learning_rate": 7.78595662979936e-05, + "loss": 1.7987, + "step": 10835 + }, + { + "epoch": 3.3259668508287294, + "grad_norm": 0.2677469849586487, + "learning_rate": 7.785543868624498e-05, + "loss": 1.8312, + "step": 10836 + }, + { + "epoch": 3.3262737875997543, + "grad_norm": 0.2547740638256073, + "learning_rate": 7.785131079921259e-05, + "loss": 1.7844, + "step": 10837 + }, + { + "epoch": 3.3265807243707797, + "grad_norm": 0.26755592226982117, + "learning_rate": 7.784718263693725e-05, + "loss": 1.8263, + "step": 10838 + }, + { + "epoch": 3.3268876611418046, + "grad_norm": 0.23884403705596924, + "learning_rate": 7.784305419945969e-05, + "loss": 1.7862, + "step": 10839 + }, + { + "epoch": 3.32719459791283, + 
"grad_norm": 0.2896903157234192, + "learning_rate": 7.783892548682077e-05, + "loss": 1.9138, + "step": 10840 + }, + { + "epoch": 3.3275015346838552, + "grad_norm": 0.3201359510421753, + "learning_rate": 7.783479649906127e-05, + "loss": 1.8382, + "step": 10841 + }, + { + "epoch": 3.32780847145488, + "grad_norm": 0.39285311102867126, + "learning_rate": 7.7830667236222e-05, + "loss": 1.7763, + "step": 10842 + }, + { + "epoch": 3.3281154082259055, + "grad_norm": 0.435007244348526, + "learning_rate": 7.782653769834376e-05, + "loss": 1.8415, + "step": 10843 + }, + { + "epoch": 3.328422344996931, + "grad_norm": 0.34605318307876587, + "learning_rate": 7.782240788546736e-05, + "loss": 1.757, + "step": 10844 + }, + { + "epoch": 3.3287292817679557, + "grad_norm": 0.26830604672431946, + "learning_rate": 7.781827779763362e-05, + "loss": 1.7779, + "step": 10845 + }, + { + "epoch": 3.329036218538981, + "grad_norm": 0.41851529479026794, + "learning_rate": 7.781414743488336e-05, + "loss": 1.8609, + "step": 10846 + }, + { + "epoch": 3.3293431553100064, + "grad_norm": 0.5058079361915588, + "learning_rate": 7.78100167972574e-05, + "loss": 1.8146, + "step": 10847 + }, + { + "epoch": 3.3296500920810312, + "grad_norm": 0.34394967555999756, + "learning_rate": 7.780588588479654e-05, + "loss": 1.8079, + "step": 10848 + }, + { + "epoch": 3.3299570288520566, + "grad_norm": 0.3033885061740875, + "learning_rate": 7.780175469754161e-05, + "loss": 1.8223, + "step": 10849 + }, + { + "epoch": 3.3302639656230815, + "grad_norm": 0.4431045651435852, + "learning_rate": 7.779762323553347e-05, + "loss": 1.8841, + "step": 10850 + }, + { + "epoch": 3.330570902394107, + "grad_norm": 0.3451448976993561, + "learning_rate": 7.77934914988129e-05, + "loss": 1.8092, + "step": 10851 + }, + { + "epoch": 3.330877839165132, + "grad_norm": 0.26580891013145447, + "learning_rate": 7.778935948742077e-05, + "loss": 1.8244, + "step": 10852 + }, + { + "epoch": 3.331184775936157, + "grad_norm": 0.32079070806503296, + 
"learning_rate": 7.778522720139792e-05, + "loss": 1.7816, + "step": 10853 + }, + { + "epoch": 3.3314917127071824, + "grad_norm": 0.35789042711257935, + "learning_rate": 7.778109464078514e-05, + "loss": 1.8211, + "step": 10854 + }, + { + "epoch": 3.3317986494782073, + "grad_norm": 0.2808612585067749, + "learning_rate": 7.77769618056233e-05, + "loss": 1.8387, + "step": 10855 + }, + { + "epoch": 3.3321055862492326, + "grad_norm": 0.24760548770427704, + "learning_rate": 7.777282869595326e-05, + "loss": 1.7795, + "step": 10856 + }, + { + "epoch": 3.332412523020258, + "grad_norm": 0.2840912640094757, + "learning_rate": 7.776869531181583e-05, + "loss": 1.7492, + "step": 10857 + }, + { + "epoch": 3.332719459791283, + "grad_norm": 0.2881413698196411, + "learning_rate": 7.77645616532519e-05, + "loss": 1.8157, + "step": 10858 + }, + { + "epoch": 3.333026396562308, + "grad_norm": 0.2508779764175415, + "learning_rate": 7.776042772030228e-05, + "loss": 1.8196, + "step": 10859 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3307822048664093, + "learning_rate": 7.775629351300785e-05, + "loss": 1.8195, + "step": 10860 + }, + { + "epoch": 3.3336402701043584, + "grad_norm": 0.34392043948173523, + "learning_rate": 7.775215903140946e-05, + "loss": 1.7775, + "step": 10861 + }, + { + "epoch": 3.3339472068753837, + "grad_norm": 0.2594252824783325, + "learning_rate": 7.774802427554796e-05, + "loss": 1.7687, + "step": 10862 + }, + { + "epoch": 3.334254143646409, + "grad_norm": 0.3109053075313568, + "learning_rate": 7.774388924546423e-05, + "loss": 1.7908, + "step": 10863 + }, + { + "epoch": 3.334561080417434, + "grad_norm": 0.4801923930644989, + "learning_rate": 7.773975394119913e-05, + "loss": 1.8316, + "step": 10864 + }, + { + "epoch": 3.3348680171884593, + "grad_norm": 0.4754973351955414, + "learning_rate": 7.77356183627935e-05, + "loss": 1.8015, + "step": 10865 + }, + { + "epoch": 3.335174953959484, + "grad_norm": 0.29624658823013306, + "learning_rate": 7.773148251028825e-05, + 
"loss": 1.8179, + "step": 10866 + }, + { + "epoch": 3.3354818907305095, + "grad_norm": 0.32207581400871277, + "learning_rate": 7.772734638372423e-05, + "loss": 1.799, + "step": 10867 + }, + { + "epoch": 3.335788827501535, + "grad_norm": 0.5227517485618591, + "learning_rate": 7.772320998314233e-05, + "loss": 1.8452, + "step": 10868 + }, + { + "epoch": 3.3360957642725597, + "grad_norm": 0.4081100523471832, + "learning_rate": 7.771907330858341e-05, + "loss": 1.8182, + "step": 10869 + }, + { + "epoch": 3.336402701043585, + "grad_norm": 0.23786653578281403, + "learning_rate": 7.771493636008838e-05, + "loss": 1.7392, + "step": 10870 + }, + { + "epoch": 3.33670963781461, + "grad_norm": 0.37913820147514343, + "learning_rate": 7.771079913769807e-05, + "loss": 1.7559, + "step": 10871 + }, + { + "epoch": 3.3370165745856353, + "grad_norm": 0.4939163625240326, + "learning_rate": 7.770666164145344e-05, + "loss": 1.8076, + "step": 10872 + }, + { + "epoch": 3.3373235113566606, + "grad_norm": 0.3322528302669525, + "learning_rate": 7.770252387139532e-05, + "loss": 1.8045, + "step": 10873 + }, + { + "epoch": 3.337630448127686, + "grad_norm": 0.3685782849788666, + "learning_rate": 7.769838582756461e-05, + "loss": 1.7703, + "step": 10874 + }, + { + "epoch": 3.337937384898711, + "grad_norm": 0.5564271807670593, + "learning_rate": 7.769424751000224e-05, + "loss": 1.7697, + "step": 10875 + }, + { + "epoch": 3.338244321669736, + "grad_norm": 0.38610726594924927, + "learning_rate": 7.769010891874906e-05, + "loss": 1.7944, + "step": 10876 + }, + { + "epoch": 3.338551258440761, + "grad_norm": 0.23838558793067932, + "learning_rate": 7.768597005384602e-05, + "loss": 1.765, + "step": 10877 + }, + { + "epoch": 3.3388581952117864, + "grad_norm": 0.4334571063518524, + "learning_rate": 7.768183091533399e-05, + "loss": 1.7854, + "step": 10878 + }, + { + "epoch": 3.3391651319828117, + "grad_norm": 0.44844719767570496, + "learning_rate": 7.767769150325386e-05, + "loss": 1.7955, + "step": 10879 + }, + { 
+ "epoch": 3.3394720687538366, + "grad_norm": 0.26543378829956055, + "learning_rate": 7.767355181764659e-05, + "loss": 1.8311, + "step": 10880 + }, + { + "epoch": 3.339779005524862, + "grad_norm": 0.39401358366012573, + "learning_rate": 7.766941185855304e-05, + "loss": 1.8264, + "step": 10881 + }, + { + "epoch": 3.340085942295887, + "grad_norm": 0.5476824045181274, + "learning_rate": 7.766527162601416e-05, + "loss": 1.8051, + "step": 10882 + }, + { + "epoch": 3.340392879066912, + "grad_norm": 0.4021138548851013, + "learning_rate": 7.766113112007084e-05, + "loss": 1.7941, + "step": 10883 + }, + { + "epoch": 3.3406998158379375, + "grad_norm": 0.3262040317058563, + "learning_rate": 7.765699034076402e-05, + "loss": 1.8317, + "step": 10884 + }, + { + "epoch": 3.3410067526089624, + "grad_norm": 0.5461146831512451, + "learning_rate": 7.765284928813459e-05, + "loss": 1.833, + "step": 10885 + }, + { + "epoch": 3.3413136893799877, + "grad_norm": 0.5067405700683594, + "learning_rate": 7.764870796222351e-05, + "loss": 1.7862, + "step": 10886 + }, + { + "epoch": 3.341620626151013, + "grad_norm": 0.2731069028377533, + "learning_rate": 7.76445663630717e-05, + "loss": 1.8173, + "step": 10887 + }, + { + "epoch": 3.341927562922038, + "grad_norm": 0.48928195238113403, + "learning_rate": 7.764042449072008e-05, + "loss": 1.7992, + "step": 10888 + }, + { + "epoch": 3.3422344996930633, + "grad_norm": 0.5338504910469055, + "learning_rate": 7.763628234520958e-05, + "loss": 1.7891, + "step": 10889 + }, + { + "epoch": 3.3425414364640886, + "grad_norm": 0.3136523365974426, + "learning_rate": 7.763213992658114e-05, + "loss": 1.8623, + "step": 10890 + }, + { + "epoch": 3.3428483732351135, + "grad_norm": 0.36551395058631897, + "learning_rate": 7.762799723487568e-05, + "loss": 1.8474, + "step": 10891 + }, + { + "epoch": 3.343155310006139, + "grad_norm": 0.35772353410720825, + "learning_rate": 7.762385427013419e-05, + "loss": 1.8625, + "step": 10892 + }, + { + "epoch": 3.3434622467771637, + 
"grad_norm": 0.29944708943367004, + "learning_rate": 7.761971103239755e-05, + "loss": 1.8181, + "step": 10893 + }, + { + "epoch": 3.343769183548189, + "grad_norm": 0.3395330309867859, + "learning_rate": 7.761556752170676e-05, + "loss": 1.7943, + "step": 10894 + }, + { + "epoch": 3.3440761203192144, + "grad_norm": 0.3624265193939209, + "learning_rate": 7.761142373810274e-05, + "loss": 1.8234, + "step": 10895 + }, + { + "epoch": 3.3443830570902393, + "grad_norm": 0.25409621000289917, + "learning_rate": 7.760727968162644e-05, + "loss": 1.7532, + "step": 10896 + }, + { + "epoch": 3.3446899938612646, + "grad_norm": 0.321437805891037, + "learning_rate": 7.760313535231883e-05, + "loss": 1.8808, + "step": 10897 + }, + { + "epoch": 3.3449969306322895, + "grad_norm": 0.2919142544269562, + "learning_rate": 7.759899075022086e-05, + "loss": 1.7677, + "step": 10898 + }, + { + "epoch": 3.345303867403315, + "grad_norm": 0.26515716314315796, + "learning_rate": 7.759484587537346e-05, + "loss": 1.8118, + "step": 10899 + }, + { + "epoch": 3.34561080417434, + "grad_norm": 0.2963240146636963, + "learning_rate": 7.759070072781764e-05, + "loss": 1.8329, + "step": 10900 + }, + { + "epoch": 3.345917740945365, + "grad_norm": 0.3186480700969696, + "learning_rate": 7.758655530759435e-05, + "loss": 1.8013, + "step": 10901 + }, + { + "epoch": 3.3462246777163904, + "grad_norm": 0.256145715713501, + "learning_rate": 7.758240961474454e-05, + "loss": 1.7865, + "step": 10902 + }, + { + "epoch": 3.3465316144874158, + "grad_norm": 0.28951629996299744, + "learning_rate": 7.757826364930921e-05, + "loss": 1.8091, + "step": 10903 + }, + { + "epoch": 3.3468385512584407, + "grad_norm": 0.2692483365535736, + "learning_rate": 7.75741174113293e-05, + "loss": 1.8308, + "step": 10904 + }, + { + "epoch": 3.347145488029466, + "grad_norm": 0.27615389227867126, + "learning_rate": 7.75699709008458e-05, + "loss": 1.7888, + "step": 10905 + }, + { + "epoch": 3.3474524248004913, + "grad_norm": 0.2819034457206726, + 
"learning_rate": 7.75658241178997e-05, + "loss": 1.7624, + "step": 10906 + }, + { + "epoch": 3.347759361571516, + "grad_norm": 0.2627592086791992, + "learning_rate": 7.756167706253196e-05, + "loss": 1.7696, + "step": 10907 + }, + { + "epoch": 3.3480662983425415, + "grad_norm": 0.3528621196746826, + "learning_rate": 7.755752973478356e-05, + "loss": 1.7725, + "step": 10908 + }, + { + "epoch": 3.3483732351135664, + "grad_norm": 0.35949698090553284, + "learning_rate": 7.755338213469552e-05, + "loss": 1.8163, + "step": 10909 + }, + { + "epoch": 3.3486801718845918, + "grad_norm": 0.25142577290534973, + "learning_rate": 7.75492342623088e-05, + "loss": 1.7879, + "step": 10910 + }, + { + "epoch": 3.348987108655617, + "grad_norm": 0.25766023993492126, + "learning_rate": 7.75450861176644e-05, + "loss": 1.8143, + "step": 10911 + }, + { + "epoch": 3.349294045426642, + "grad_norm": 0.2736956477165222, + "learning_rate": 7.754093770080331e-05, + "loss": 1.8907, + "step": 10912 + }, + { + "epoch": 3.3496009821976673, + "grad_norm": 0.23700755834579468, + "learning_rate": 7.753678901176654e-05, + "loss": 1.813, + "step": 10913 + }, + { + "epoch": 3.349907918968692, + "grad_norm": 0.245509073138237, + "learning_rate": 7.753264005059507e-05, + "loss": 1.8019, + "step": 10914 + }, + { + "epoch": 3.3502148557397176, + "grad_norm": 0.232910618185997, + "learning_rate": 7.752849081732993e-05, + "loss": 1.784, + "step": 10915 + }, + { + "epoch": 3.350521792510743, + "grad_norm": 0.22989360988140106, + "learning_rate": 7.75243413120121e-05, + "loss": 1.7597, + "step": 10916 + }, + { + "epoch": 3.350828729281768, + "grad_norm": 0.2093925178050995, + "learning_rate": 7.752019153468258e-05, + "loss": 1.7698, + "step": 10917 + }, + { + "epoch": 3.351135666052793, + "grad_norm": 0.25539630651474, + "learning_rate": 7.751604148538241e-05, + "loss": 1.8287, + "step": 10918 + }, + { + "epoch": 3.3514426028238185, + "grad_norm": 0.2731820046901703, + "learning_rate": 7.75118911641526e-05, + "loss": 
1.8862, + "step": 10919 + }, + { + "epoch": 3.3517495395948433, + "grad_norm": 0.2464541345834732, + "learning_rate": 7.750774057103416e-05, + "loss": 1.8165, + "step": 10920 + }, + { + "epoch": 3.3520564763658687, + "grad_norm": 0.26380276679992676, + "learning_rate": 7.75035897060681e-05, + "loss": 1.78, + "step": 10921 + }, + { + "epoch": 3.352363413136894, + "grad_norm": 0.3080748915672302, + "learning_rate": 7.749943856929542e-05, + "loss": 1.7925, + "step": 10922 + }, + { + "epoch": 3.352670349907919, + "grad_norm": 0.317754864692688, + "learning_rate": 7.74952871607572e-05, + "loss": 1.8248, + "step": 10923 + }, + { + "epoch": 3.3529772866789442, + "grad_norm": 0.2525196373462677, + "learning_rate": 7.749113548049442e-05, + "loss": 1.762, + "step": 10924 + }, + { + "epoch": 3.353284223449969, + "grad_norm": 0.3149549961090088, + "learning_rate": 7.748698352854814e-05, + "loss": 1.8289, + "step": 10925 + }, + { + "epoch": 3.3535911602209945, + "grad_norm": 0.35744383931159973, + "learning_rate": 7.748283130495937e-05, + "loss": 1.8132, + "step": 10926 + }, + { + "epoch": 3.35389809699202, + "grad_norm": 0.28599128127098083, + "learning_rate": 7.747867880976916e-05, + "loss": 1.7351, + "step": 10927 + }, + { + "epoch": 3.3542050337630447, + "grad_norm": 0.24428869783878326, + "learning_rate": 7.747452604301852e-05, + "loss": 1.794, + "step": 10928 + }, + { + "epoch": 3.35451197053407, + "grad_norm": 0.29067808389663696, + "learning_rate": 7.747037300474854e-05, + "loss": 1.8181, + "step": 10929 + }, + { + "epoch": 3.354818907305095, + "grad_norm": 0.32417505979537964, + "learning_rate": 7.746621969500021e-05, + "loss": 1.8338, + "step": 10930 + }, + { + "epoch": 3.3551258440761202, + "grad_norm": 0.29536551237106323, + "learning_rate": 7.746206611381462e-05, + "loss": 1.8732, + "step": 10931 + }, + { + "epoch": 3.3554327808471456, + "grad_norm": 0.3169345259666443, + "learning_rate": 7.745791226123278e-05, + "loss": 1.876, + "step": 10932 + }, + { + "epoch": 
3.3557397176181705, + "grad_norm": 0.2680271565914154, + "learning_rate": 7.745375813729576e-05, + "loss": 1.7347, + "step": 10933 + }, + { + "epoch": 3.356046654389196, + "grad_norm": 0.28339266777038574, + "learning_rate": 7.74496037420446e-05, + "loss": 1.8507, + "step": 10934 + }, + { + "epoch": 3.356353591160221, + "grad_norm": 0.2567409574985504, + "learning_rate": 7.744544907552038e-05, + "loss": 1.8244, + "step": 10935 + }, + { + "epoch": 3.356660527931246, + "grad_norm": 0.266063928604126, + "learning_rate": 7.744129413776416e-05, + "loss": 1.7864, + "step": 10936 + }, + { + "epoch": 3.3569674647022714, + "grad_norm": 0.2490999698638916, + "learning_rate": 7.743713892881696e-05, + "loss": 1.7637, + "step": 10937 + }, + { + "epoch": 3.3572744014732967, + "grad_norm": 0.25857025384902954, + "learning_rate": 7.743298344871988e-05, + "loss": 1.8101, + "step": 10938 + }, + { + "epoch": 3.3575813382443216, + "grad_norm": 0.2549006938934326, + "learning_rate": 7.742882769751398e-05, + "loss": 1.7782, + "step": 10939 + }, + { + "epoch": 3.357888275015347, + "grad_norm": 0.23915350437164307, + "learning_rate": 7.742467167524035e-05, + "loss": 1.7822, + "step": 10940 + }, + { + "epoch": 3.358195211786372, + "grad_norm": 0.25501590967178345, + "learning_rate": 7.742051538194e-05, + "loss": 1.798, + "step": 10941 + }, + { + "epoch": 3.358502148557397, + "grad_norm": 0.29332005977630615, + "learning_rate": 7.741635881765408e-05, + "loss": 1.8334, + "step": 10942 + }, + { + "epoch": 3.3588090853284225, + "grad_norm": 0.28878241777420044, + "learning_rate": 7.741220198242362e-05, + "loss": 1.8266, + "step": 10943 + }, + { + "epoch": 3.3591160220994474, + "grad_norm": 0.3068650960922241, + "learning_rate": 7.740804487628971e-05, + "loss": 1.8562, + "step": 10944 + }, + { + "epoch": 3.3594229588704727, + "grad_norm": 0.2522405683994293, + "learning_rate": 7.740388749929343e-05, + "loss": 1.8001, + "step": 10945 + }, + { + "epoch": 3.359729895641498, + "grad_norm": 
0.3073521554470062, + "learning_rate": 7.739972985147588e-05, + "loss": 1.7454, + "step": 10946 + }, + { + "epoch": 3.360036832412523, + "grad_norm": 0.3018052577972412, + "learning_rate": 7.739557193287815e-05, + "loss": 1.7888, + "step": 10947 + }, + { + "epoch": 3.3603437691835483, + "grad_norm": 0.2738604247570038, + "learning_rate": 7.73914137435413e-05, + "loss": 1.7208, + "step": 10948 + }, + { + "epoch": 3.3606507059545736, + "grad_norm": 0.37699586153030396, + "learning_rate": 7.738725528350646e-05, + "loss": 1.8175, + "step": 10949 + }, + { + "epoch": 3.3609576427255985, + "grad_norm": 0.3479778468608856, + "learning_rate": 7.738309655281471e-05, + "loss": 1.818, + "step": 10950 + }, + { + "epoch": 3.361264579496624, + "grad_norm": 0.24871166050434113, + "learning_rate": 7.737893755150715e-05, + "loss": 1.7046, + "step": 10951 + }, + { + "epoch": 3.3615715162676487, + "grad_norm": 0.45015642046928406, + "learning_rate": 7.737477827962488e-05, + "loss": 1.8517, + "step": 10952 + }, + { + "epoch": 3.361878453038674, + "grad_norm": 0.4149077534675598, + "learning_rate": 7.7370618737209e-05, + "loss": 1.7403, + "step": 10953 + }, + { + "epoch": 3.3621853898096994, + "grad_norm": 0.2556059658527374, + "learning_rate": 7.736645892430064e-05, + "loss": 1.8167, + "step": 10954 + }, + { + "epoch": 3.3624923265807243, + "grad_norm": 0.3153657615184784, + "learning_rate": 7.736229884094088e-05, + "loss": 1.8471, + "step": 10955 + }, + { + "epoch": 3.3627992633517496, + "grad_norm": 0.27943772077560425, + "learning_rate": 7.735813848717084e-05, + "loss": 1.7742, + "step": 10956 + }, + { + "epoch": 3.3631062001227745, + "grad_norm": 0.28270283341407776, + "learning_rate": 7.735397786303164e-05, + "loss": 1.8418, + "step": 10957 + }, + { + "epoch": 3.3634131368938, + "grad_norm": 0.3596261441707611, + "learning_rate": 7.734981696856442e-05, + "loss": 1.8213, + "step": 10958 + }, + { + "epoch": 3.363720073664825, + "grad_norm": 0.3678492307662964, + "learning_rate": 
7.734565580381026e-05, + "loss": 1.806, + "step": 10959 + }, + { + "epoch": 3.36402701043585, + "grad_norm": 0.27758681774139404, + "learning_rate": 7.734149436881031e-05, + "loss": 1.7832, + "step": 10960 + }, + { + "epoch": 3.3643339472068754, + "grad_norm": 0.2821379005908966, + "learning_rate": 7.733733266360568e-05, + "loss": 1.8888, + "step": 10961 + }, + { + "epoch": 3.3646408839779007, + "grad_norm": 0.33676958084106445, + "learning_rate": 7.733317068823751e-05, + "loss": 1.902, + "step": 10962 + }, + { + "epoch": 3.3649478207489256, + "grad_norm": 0.3116114139556885, + "learning_rate": 7.732900844274691e-05, + "loss": 1.8228, + "step": 10963 + }, + { + "epoch": 3.365254757519951, + "grad_norm": 0.3286324143409729, + "learning_rate": 7.732484592717506e-05, + "loss": 1.8707, + "step": 10964 + }, + { + "epoch": 3.3655616942909763, + "grad_norm": 0.2732192873954773, + "learning_rate": 7.732068314156304e-05, + "loss": 1.773, + "step": 10965 + }, + { + "epoch": 3.365868631062001, + "grad_norm": 0.26663896441459656, + "learning_rate": 7.731652008595204e-05, + "loss": 1.7837, + "step": 10966 + }, + { + "epoch": 3.3661755678330265, + "grad_norm": 0.27447745203971863, + "learning_rate": 7.731235676038317e-05, + "loss": 1.9103, + "step": 10967 + }, + { + "epoch": 3.3664825046040514, + "grad_norm": 0.30832916498184204, + "learning_rate": 7.730819316489757e-05, + "loss": 1.7552, + "step": 10968 + }, + { + "epoch": 3.3667894413750767, + "grad_norm": 0.29657161235809326, + "learning_rate": 7.73040292995364e-05, + "loss": 1.7654, + "step": 10969 + }, + { + "epoch": 3.367096378146102, + "grad_norm": 0.30434274673461914, + "learning_rate": 7.729986516434082e-05, + "loss": 1.8646, + "step": 10970 + }, + { + "epoch": 3.367403314917127, + "grad_norm": 0.25926661491394043, + "learning_rate": 7.729570075935198e-05, + "loss": 1.7555, + "step": 10971 + }, + { + "epoch": 3.3677102516881523, + "grad_norm": 0.2775980532169342, + "learning_rate": 7.729153608461102e-05, + "loss": 
1.8427, + "step": 10972 + }, + { + "epoch": 3.368017188459177, + "grad_norm": 0.23915666341781616, + "learning_rate": 7.72873711401591e-05, + "loss": 1.7902, + "step": 10973 + }, + { + "epoch": 3.3683241252302025, + "grad_norm": 0.2603691518306732, + "learning_rate": 7.728320592603737e-05, + "loss": 1.8587, + "step": 10974 + }, + { + "epoch": 3.368631062001228, + "grad_norm": 0.2579508125782013, + "learning_rate": 7.727904044228703e-05, + "loss": 1.7617, + "step": 10975 + }, + { + "epoch": 3.3689379987722528, + "grad_norm": 0.3384297788143158, + "learning_rate": 7.72748746889492e-05, + "loss": 1.8499, + "step": 10976 + }, + { + "epoch": 3.369244935543278, + "grad_norm": 0.36756646633148193, + "learning_rate": 7.727070866606509e-05, + "loss": 1.808, + "step": 10977 + }, + { + "epoch": 3.3695518723143034, + "grad_norm": 0.3212372958660126, + "learning_rate": 7.726654237367587e-05, + "loss": 1.8245, + "step": 10978 + }, + { + "epoch": 3.3698588090853283, + "grad_norm": 0.23782415688037872, + "learning_rate": 7.726237581182267e-05, + "loss": 1.7629, + "step": 10979 + }, + { + "epoch": 3.3701657458563536, + "grad_norm": 0.2782919108867645, + "learning_rate": 7.725820898054669e-05, + "loss": 1.8, + "step": 10980 + }, + { + "epoch": 3.370472682627379, + "grad_norm": 0.2973455488681793, + "learning_rate": 7.725404187988914e-05, + "loss": 1.7949, + "step": 10981 + }, + { + "epoch": 3.370779619398404, + "grad_norm": 0.2875392735004425, + "learning_rate": 7.724987450989114e-05, + "loss": 1.8019, + "step": 10982 + }, + { + "epoch": 3.371086556169429, + "grad_norm": 0.26133236289024353, + "learning_rate": 7.724570687059394e-05, + "loss": 1.7984, + "step": 10983 + }, + { + "epoch": 3.371393492940454, + "grad_norm": 0.2760173976421356, + "learning_rate": 7.724153896203867e-05, + "loss": 1.8082, + "step": 10984 + }, + { + "epoch": 3.3717004297114794, + "grad_norm": 0.26373061537742615, + "learning_rate": 7.723737078426656e-05, + "loss": 1.8408, + "step": 10985 + }, + { + "epoch": 
3.3720073664825048, + "grad_norm": 0.29425618052482605, + "learning_rate": 7.723320233731879e-05, + "loss": 1.7992, + "step": 10986 + }, + { + "epoch": 3.3723143032535297, + "grad_norm": 0.29822099208831787, + "learning_rate": 7.722903362123655e-05, + "loss": 1.8204, + "step": 10987 + }, + { + "epoch": 3.372621240024555, + "grad_norm": 0.25945618748664856, + "learning_rate": 7.722486463606104e-05, + "loss": 1.7376, + "step": 10988 + }, + { + "epoch": 3.37292817679558, + "grad_norm": 0.26367196440696716, + "learning_rate": 7.722069538183345e-05, + "loss": 1.814, + "step": 10989 + }, + { + "epoch": 3.373235113566605, + "grad_norm": 0.25015249848365784, + "learning_rate": 7.7216525858595e-05, + "loss": 1.8199, + "step": 10990 + }, + { + "epoch": 3.3735420503376305, + "grad_norm": 0.3035781681537628, + "learning_rate": 7.72123560663869e-05, + "loss": 1.739, + "step": 10991 + }, + { + "epoch": 3.3738489871086554, + "grad_norm": 0.2847912013530731, + "learning_rate": 7.720818600525033e-05, + "loss": 1.8754, + "step": 10992 + }, + { + "epoch": 3.3741559238796808, + "grad_norm": 0.2533976435661316, + "learning_rate": 7.720401567522653e-05, + "loss": 1.7616, + "step": 10993 + }, + { + "epoch": 3.374462860650706, + "grad_norm": 0.250828355550766, + "learning_rate": 7.719984507635669e-05, + "loss": 1.7973, + "step": 10994 + }, + { + "epoch": 3.374769797421731, + "grad_norm": 0.3019898235797882, + "learning_rate": 7.719567420868206e-05, + "loss": 1.7563, + "step": 10995 + }, + { + "epoch": 3.3750767341927563, + "grad_norm": 0.2703310549259186, + "learning_rate": 7.719150307224382e-05, + "loss": 1.8183, + "step": 10996 + }, + { + "epoch": 3.3753836709637817, + "grad_norm": 0.2434745579957962, + "learning_rate": 7.718733166708321e-05, + "loss": 1.7913, + "step": 10997 + }, + { + "epoch": 3.3756906077348066, + "grad_norm": 0.28036773204803467, + "learning_rate": 7.718315999324146e-05, + "loss": 1.7884, + "step": 10998 + }, + { + "epoch": 3.375997544505832, + "grad_norm": 
0.25123077630996704, + "learning_rate": 7.717898805075978e-05, + "loss": 1.7394, + "step": 10999 + }, + { + "epoch": 3.376304481276857, + "grad_norm": 0.2313947230577469, + "learning_rate": 7.717481583967943e-05, + "loss": 1.7537, + "step": 11000 + }, + { + "epoch": 3.376611418047882, + "grad_norm": 0.27152860164642334, + "learning_rate": 7.71706433600416e-05, + "loss": 1.8596, + "step": 11001 + }, + { + "epoch": 3.3769183548189075, + "grad_norm": 0.32866382598876953, + "learning_rate": 7.716647061188757e-05, + "loss": 1.9007, + "step": 11002 + }, + { + "epoch": 3.3772252915899323, + "grad_norm": 0.2842368185520172, + "learning_rate": 7.716229759525854e-05, + "loss": 1.7781, + "step": 11003 + }, + { + "epoch": 3.3775322283609577, + "grad_norm": 0.30411216616630554, + "learning_rate": 7.715812431019576e-05, + "loss": 1.7403, + "step": 11004 + }, + { + "epoch": 3.3778391651319826, + "grad_norm": 0.31848132610321045, + "learning_rate": 7.71539507567405e-05, + "loss": 1.817, + "step": 11005 + }, + { + "epoch": 3.378146101903008, + "grad_norm": 0.24206148087978363, + "learning_rate": 7.714977693493397e-05, + "loss": 1.7796, + "step": 11006 + }, + { + "epoch": 3.3784530386740332, + "grad_norm": 0.2982998490333557, + "learning_rate": 7.714560284481742e-05, + "loss": 1.7883, + "step": 11007 + }, + { + "epoch": 3.378759975445058, + "grad_norm": 0.24857483804225922, + "learning_rate": 7.714142848643213e-05, + "loss": 1.7447, + "step": 11008 + }, + { + "epoch": 3.3790669122160835, + "grad_norm": 0.2509039044380188, + "learning_rate": 7.713725385981932e-05, + "loss": 1.8362, + "step": 11009 + }, + { + "epoch": 3.379373848987109, + "grad_norm": 0.2759779095649719, + "learning_rate": 7.713307896502027e-05, + "loss": 1.8655, + "step": 11010 + }, + { + "epoch": 3.3796807857581337, + "grad_norm": 0.264776349067688, + "learning_rate": 7.712890380207623e-05, + "loss": 1.8221, + "step": 11011 + }, + { + "epoch": 3.379987722529159, + "grad_norm": 0.2771971821784973, + "learning_rate": 
7.712472837102846e-05, + "loss": 1.6992, + "step": 11012 + }, + { + "epoch": 3.3802946593001844, + "grad_norm": 0.2749316096305847, + "learning_rate": 7.712055267191822e-05, + "loss": 1.8128, + "step": 11013 + }, + { + "epoch": 3.3806015960712092, + "grad_norm": 0.256656289100647, + "learning_rate": 7.71163767047868e-05, + "loss": 1.8382, + "step": 11014 + }, + { + "epoch": 3.3809085328422346, + "grad_norm": 0.27646976709365845, + "learning_rate": 7.711220046967545e-05, + "loss": 1.8321, + "step": 11015 + }, + { + "epoch": 3.3812154696132595, + "grad_norm": 0.3083149194717407, + "learning_rate": 7.710802396662542e-05, + "loss": 1.904, + "step": 11016 + }, + { + "epoch": 3.381522406384285, + "grad_norm": 0.2750856280326843, + "learning_rate": 7.710384719567803e-05, + "loss": 1.7596, + "step": 11017 + }, + { + "epoch": 3.38182934315531, + "grad_norm": 0.3029455244541168, + "learning_rate": 7.709967015687452e-05, + "loss": 1.8542, + "step": 11018 + }, + { + "epoch": 3.382136279926335, + "grad_norm": 0.3144093453884125, + "learning_rate": 7.709549285025622e-05, + "loss": 1.7489, + "step": 11019 + }, + { + "epoch": 3.3824432166973604, + "grad_norm": 0.2675442099571228, + "learning_rate": 7.709131527586433e-05, + "loss": 1.7324, + "step": 11020 + }, + { + "epoch": 3.3827501534683857, + "grad_norm": 0.2906095087528229, + "learning_rate": 7.708713743374021e-05, + "loss": 1.7848, + "step": 11021 + }, + { + "epoch": 3.3830570902394106, + "grad_norm": 0.25141623616218567, + "learning_rate": 7.708295932392513e-05, + "loss": 1.7423, + "step": 11022 + }, + { + "epoch": 3.383364027010436, + "grad_norm": 0.25832003355026245, + "learning_rate": 7.707878094646037e-05, + "loss": 1.7792, + "step": 11023 + }, + { + "epoch": 3.3836709637814613, + "grad_norm": 0.23710070550441742, + "learning_rate": 7.70746023013872e-05, + "loss": 1.7916, + "step": 11024 + }, + { + "epoch": 3.383977900552486, + "grad_norm": 0.286735862493515, + "learning_rate": 7.707042338874697e-05, + "loss": 1.8272, + 
"step": 11025 + }, + { + "epoch": 3.3842848373235115, + "grad_norm": 0.2536577582359314, + "learning_rate": 7.706624420858094e-05, + "loss": 1.7839, + "step": 11026 + }, + { + "epoch": 3.3845917740945364, + "grad_norm": 0.5564702749252319, + "learning_rate": 7.706206476093043e-05, + "loss": 1.7832, + "step": 11027 + }, + { + "epoch": 3.3848987108655617, + "grad_norm": 0.34694772958755493, + "learning_rate": 7.705788504583671e-05, + "loss": 1.8668, + "step": 11028 + }, + { + "epoch": 3.385205647636587, + "grad_norm": 0.30388176441192627, + "learning_rate": 7.705370506334113e-05, + "loss": 1.8244, + "step": 11029 + }, + { + "epoch": 3.385512584407612, + "grad_norm": 0.2998919188976288, + "learning_rate": 7.704952481348497e-05, + "loss": 1.7927, + "step": 11030 + }, + { + "epoch": 3.3858195211786373, + "grad_norm": 0.2714936435222626, + "learning_rate": 7.704534429630955e-05, + "loss": 1.8757, + "step": 11031 + }, + { + "epoch": 3.386126457949662, + "grad_norm": 0.26670241355895996, + "learning_rate": 7.704116351185619e-05, + "loss": 1.8146, + "step": 11032 + }, + { + "epoch": 3.3864333947206875, + "grad_norm": 0.2500552833080292, + "learning_rate": 7.703698246016621e-05, + "loss": 1.7984, + "step": 11033 + }, + { + "epoch": 3.386740331491713, + "grad_norm": 0.2494918406009674, + "learning_rate": 7.703280114128091e-05, + "loss": 1.7433, + "step": 11034 + }, + { + "epoch": 3.3870472682627377, + "grad_norm": 0.25658491253852844, + "learning_rate": 7.702861955524163e-05, + "loss": 1.8487, + "step": 11035 + }, + { + "epoch": 3.387354205033763, + "grad_norm": 0.2871410548686981, + "learning_rate": 7.702443770208969e-05, + "loss": 1.7919, + "step": 11036 + }, + { + "epoch": 3.3876611418047884, + "grad_norm": 0.3347938060760498, + "learning_rate": 7.702025558186643e-05, + "loss": 1.8091, + "step": 11037 + }, + { + "epoch": 3.3879680785758133, + "grad_norm": 0.39016643166542053, + "learning_rate": 7.701607319461315e-05, + "loss": 1.7816, + "step": 11038 + }, + { + "epoch": 
3.3882750153468386, + "grad_norm": 0.3423028290271759, + "learning_rate": 7.701189054037121e-05, + "loss": 1.8454, + "step": 11039 + }, + { + "epoch": 3.388581952117864, + "grad_norm": 0.27592089772224426, + "learning_rate": 7.700770761918192e-05, + "loss": 1.8431, + "step": 11040 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.46047264337539673, + "learning_rate": 7.700352443108665e-05, + "loss": 1.8412, + "step": 11041 + }, + { + "epoch": 3.389195825659914, + "grad_norm": 0.49226754903793335, + "learning_rate": 7.699934097612673e-05, + "loss": 1.8212, + "step": 11042 + }, + { + "epoch": 3.389502762430939, + "grad_norm": 0.3958778381347656, + "learning_rate": 7.699515725434348e-05, + "loss": 1.747, + "step": 11043 + }, + { + "epoch": 3.3898096992019644, + "grad_norm": 0.26097169518470764, + "learning_rate": 7.699097326577827e-05, + "loss": 1.7631, + "step": 11044 + }, + { + "epoch": 3.3901166359729897, + "grad_norm": 0.2922612130641937, + "learning_rate": 7.698678901047245e-05, + "loss": 1.7891, + "step": 11045 + }, + { + "epoch": 3.3904235727440146, + "grad_norm": 0.4195055365562439, + "learning_rate": 7.698260448846734e-05, + "loss": 1.7765, + "step": 11046 + }, + { + "epoch": 3.39073050951504, + "grad_norm": 0.4572988450527191, + "learning_rate": 7.697841969980434e-05, + "loss": 1.8085, + "step": 11047 + }, + { + "epoch": 3.391037446286065, + "grad_norm": 0.38819587230682373, + "learning_rate": 7.697423464452478e-05, + "loss": 1.8854, + "step": 11048 + }, + { + "epoch": 3.39134438305709, + "grad_norm": 0.27421653270721436, + "learning_rate": 7.697004932267003e-05, + "loss": 1.8327, + "step": 11049 + }, + { + "epoch": 3.3916513198281155, + "grad_norm": 0.33559146523475647, + "learning_rate": 7.696586373428142e-05, + "loss": 1.8109, + "step": 11050 + }, + { + "epoch": 3.3919582565991404, + "grad_norm": 0.39438655972480774, + "learning_rate": 7.696167787940037e-05, + "loss": 1.7909, + "step": 11051 + }, + { + "epoch": 3.3922651933701657, + "grad_norm": 
0.3425842523574829, + "learning_rate": 7.695749175806819e-05, + "loss": 1.8571, + "step": 11052 + }, + { + "epoch": 3.392572130141191, + "grad_norm": 0.2860080301761627, + "learning_rate": 7.695330537032628e-05, + "loss": 1.8546, + "step": 11053 + }, + { + "epoch": 3.392879066912216, + "grad_norm": 0.35894665122032166, + "learning_rate": 7.694911871621601e-05, + "loss": 1.7895, + "step": 11054 + }, + { + "epoch": 3.3931860036832413, + "grad_norm": 0.351193904876709, + "learning_rate": 7.694493179577879e-05, + "loss": 1.7453, + "step": 11055 + }, + { + "epoch": 3.3934929404542666, + "grad_norm": 0.24812865257263184, + "learning_rate": 7.694074460905592e-05, + "loss": 1.8131, + "step": 11056 + }, + { + "epoch": 3.3937998772252915, + "grad_norm": 0.38620972633361816, + "learning_rate": 7.693655715608883e-05, + "loss": 1.8346, + "step": 11057 + }, + { + "epoch": 3.394106813996317, + "grad_norm": 0.5005692839622498, + "learning_rate": 7.69323694369189e-05, + "loss": 1.9031, + "step": 11058 + }, + { + "epoch": 3.3944137507673418, + "grad_norm": 0.4321887791156769, + "learning_rate": 7.692818145158751e-05, + "loss": 1.8783, + "step": 11059 + }, + { + "epoch": 3.394720687538367, + "grad_norm": 0.269307017326355, + "learning_rate": 7.692399320013603e-05, + "loss": 1.8075, + "step": 11060 + }, + { + "epoch": 3.3950276243093924, + "grad_norm": 0.2945556342601776, + "learning_rate": 7.69198046826059e-05, + "loss": 1.8366, + "step": 11061 + }, + { + "epoch": 3.3953345610804173, + "grad_norm": 0.30531853437423706, + "learning_rate": 7.691561589903847e-05, + "loss": 1.7665, + "step": 11062 + }, + { + "epoch": 3.3956414978514426, + "grad_norm": 0.25105199217796326, + "learning_rate": 7.691142684947513e-05, + "loss": 1.782, + "step": 11063 + }, + { + "epoch": 3.3959484346224675, + "grad_norm": 0.3373202085494995, + "learning_rate": 7.69072375339573e-05, + "loss": 1.8148, + "step": 11064 + }, + { + "epoch": 3.396255371393493, + "grad_norm": 0.34207093715667725, + "learning_rate": 
7.690304795252638e-05, + "loss": 1.8287, + "step": 11065 + }, + { + "epoch": 3.396562308164518, + "grad_norm": 0.26281681656837463, + "learning_rate": 7.68988581052238e-05, + "loss": 1.8551, + "step": 11066 + }, + { + "epoch": 3.396869244935543, + "grad_norm": 0.3091152608394623, + "learning_rate": 7.689466799209091e-05, + "loss": 1.7689, + "step": 11067 + }, + { + "epoch": 3.3971761817065684, + "grad_norm": 0.37421298027038574, + "learning_rate": 7.689047761316914e-05, + "loss": 1.7908, + "step": 11068 + }, + { + "epoch": 3.3974831184775938, + "grad_norm": 0.3745511770248413, + "learning_rate": 7.688628696849993e-05, + "loss": 1.8408, + "step": 11069 + }, + { + "epoch": 3.3977900552486187, + "grad_norm": 0.3003663122653961, + "learning_rate": 7.688209605812467e-05, + "loss": 1.9109, + "step": 11070 + }, + { + "epoch": 3.398096992019644, + "grad_norm": 0.3437681496143341, + "learning_rate": 7.687790488208478e-05, + "loss": 1.811, + "step": 11071 + }, + { + "epoch": 3.3984039287906693, + "grad_norm": 0.3480641841888428, + "learning_rate": 7.687371344042168e-05, + "loss": 1.8114, + "step": 11072 + }, + { + "epoch": 3.398710865561694, + "grad_norm": 0.24670913815498352, + "learning_rate": 7.686952173317679e-05, + "loss": 1.7959, + "step": 11073 + }, + { + "epoch": 3.3990178023327196, + "grad_norm": 0.2939499020576477, + "learning_rate": 7.686532976039154e-05, + "loss": 1.7518, + "step": 11074 + }, + { + "epoch": 3.3993247391037444, + "grad_norm": 0.3332279622554779, + "learning_rate": 7.686113752210736e-05, + "loss": 1.843, + "step": 11075 + }, + { + "epoch": 3.3996316758747698, + "grad_norm": 0.22967280447483063, + "learning_rate": 7.685694501836566e-05, + "loss": 1.7408, + "step": 11076 + }, + { + "epoch": 3.399938612645795, + "grad_norm": 0.3443470001220703, + "learning_rate": 7.685275224920789e-05, + "loss": 1.8004, + "step": 11077 + }, + { + "epoch": 3.40024554941682, + "grad_norm": 0.3725457489490509, + "learning_rate": 7.684855921467548e-05, + "loss": 1.833, + 
"step": 11078 + }, + { + "epoch": 3.4005524861878453, + "grad_norm": 0.3178638219833374, + "learning_rate": 7.68443659148099e-05, + "loss": 1.8055, + "step": 11079 + }, + { + "epoch": 3.4008594229588702, + "grad_norm": 0.2609167695045471, + "learning_rate": 7.684017234965254e-05, + "loss": 1.7881, + "step": 11080 + }, + { + "epoch": 3.4011663597298956, + "grad_norm": 0.26975762844085693, + "learning_rate": 7.683597851924486e-05, + "loss": 1.8424, + "step": 11081 + }, + { + "epoch": 3.401473296500921, + "grad_norm": 0.266661673784256, + "learning_rate": 7.683178442362832e-05, + "loss": 1.7785, + "step": 11082 + }, + { + "epoch": 3.401780233271946, + "grad_norm": 0.27915671467781067, + "learning_rate": 7.682759006284436e-05, + "loss": 1.8241, + "step": 11083 + }, + { + "epoch": 3.402087170042971, + "grad_norm": 0.25167274475097656, + "learning_rate": 7.682339543693444e-05, + "loss": 1.7637, + "step": 11084 + }, + { + "epoch": 3.4023941068139965, + "grad_norm": 0.2439529299736023, + "learning_rate": 7.681920054593999e-05, + "loss": 1.7796, + "step": 11085 + }, + { + "epoch": 3.4027010435850213, + "grad_norm": 0.26224252581596375, + "learning_rate": 7.681500538990249e-05, + "loss": 1.8018, + "step": 11086 + }, + { + "epoch": 3.4030079803560467, + "grad_norm": 0.25093868374824524, + "learning_rate": 7.681080996886336e-05, + "loss": 1.7664, + "step": 11087 + }, + { + "epoch": 3.403314917127072, + "grad_norm": 0.26393210887908936, + "learning_rate": 7.680661428286413e-05, + "loss": 1.8389, + "step": 11088 + }, + { + "epoch": 3.403621853898097, + "grad_norm": 0.24750283360481262, + "learning_rate": 7.680241833194622e-05, + "loss": 1.8358, + "step": 11089 + }, + { + "epoch": 3.4039287906691222, + "grad_norm": 0.21568982303142548, + "learning_rate": 7.67982221161511e-05, + "loss": 1.7874, + "step": 11090 + }, + { + "epoch": 3.404235727440147, + "grad_norm": 0.24407126009464264, + "learning_rate": 7.679402563552023e-05, + "loss": 1.7753, + "step": 11091 + }, + { + "epoch": 
3.4045426642111725, + "grad_norm": 0.23288260400295258, + "learning_rate": 7.67898288900951e-05, + "loss": 1.8046, + "step": 11092 + }, + { + "epoch": 3.404849600982198, + "grad_norm": 0.2548544108867645, + "learning_rate": 7.678563187991718e-05, + "loss": 1.8778, + "step": 11093 + }, + { + "epoch": 3.4051565377532227, + "grad_norm": 0.24008090794086456, + "learning_rate": 7.678143460502796e-05, + "loss": 1.7912, + "step": 11094 + }, + { + "epoch": 3.405463474524248, + "grad_norm": 0.26085031032562256, + "learning_rate": 7.677723706546889e-05, + "loss": 1.849, + "step": 11095 + }, + { + "epoch": 3.4057704112952734, + "grad_norm": 0.2830932140350342, + "learning_rate": 7.677303926128147e-05, + "loss": 1.8265, + "step": 11096 + }, + { + "epoch": 3.4060773480662982, + "grad_norm": 0.27593597769737244, + "learning_rate": 7.676884119250718e-05, + "loss": 1.8555, + "step": 11097 + }, + { + "epoch": 3.4063842848373236, + "grad_norm": 0.2403372824192047, + "learning_rate": 7.676464285918751e-05, + "loss": 1.7243, + "step": 11098 + }, + { + "epoch": 3.406691221608349, + "grad_norm": 0.28830090165138245, + "learning_rate": 7.676044426136397e-05, + "loss": 1.8108, + "step": 11099 + }, + { + "epoch": 3.406998158379374, + "grad_norm": 0.2918153405189514, + "learning_rate": 7.675624539907802e-05, + "loss": 1.7875, + "step": 11100 + }, + { + "epoch": 3.407305095150399, + "grad_norm": 0.2609013020992279, + "learning_rate": 7.675204627237117e-05, + "loss": 1.778, + "step": 11101 + }, + { + "epoch": 3.407612031921424, + "grad_norm": 0.2714763283729553, + "learning_rate": 7.674784688128494e-05, + "loss": 1.8472, + "step": 11102 + }, + { + "epoch": 3.4079189686924494, + "grad_norm": 0.25857117772102356, + "learning_rate": 7.674364722586078e-05, + "loss": 1.7495, + "step": 11103 + }, + { + "epoch": 3.4082259054634747, + "grad_norm": 0.25485143065452576, + "learning_rate": 7.673944730614023e-05, + "loss": 1.7817, + "step": 11104 + }, + { + "epoch": 3.4085328422344996, + "grad_norm": 
0.2735857665538788, + "learning_rate": 7.67352471221648e-05, + "loss": 1.7522, + "step": 11105 + }, + { + "epoch": 3.408839779005525, + "grad_norm": 0.25079572200775146, + "learning_rate": 7.6731046673976e-05, + "loss": 1.765, + "step": 11106 + }, + { + "epoch": 3.40914671577655, + "grad_norm": 0.3080148696899414, + "learning_rate": 7.672684596161532e-05, + "loss": 1.8305, + "step": 11107 + }, + { + "epoch": 3.409453652547575, + "grad_norm": 0.23771968483924866, + "learning_rate": 7.672264498512427e-05, + "loss": 1.7837, + "step": 11108 + }, + { + "epoch": 3.4097605893186005, + "grad_norm": 0.29941999912261963, + "learning_rate": 7.671844374454437e-05, + "loss": 1.8013, + "step": 11109 + }, + { + "epoch": 3.4100675260896254, + "grad_norm": 0.27871644496917725, + "learning_rate": 7.671424223991717e-05, + "loss": 1.8598, + "step": 11110 + }, + { + "epoch": 3.4103744628606507, + "grad_norm": 0.2751443684101105, + "learning_rate": 7.671004047128416e-05, + "loss": 1.8341, + "step": 11111 + }, + { + "epoch": 3.410681399631676, + "grad_norm": 0.27227312326431274, + "learning_rate": 7.670583843868688e-05, + "loss": 1.81, + "step": 11112 + }, + { + "epoch": 3.410988336402701, + "grad_norm": 0.29617756605148315, + "learning_rate": 7.670163614216685e-05, + "loss": 1.8795, + "step": 11113 + }, + { + "epoch": 3.4112952731737263, + "grad_norm": 0.268920361995697, + "learning_rate": 7.669743358176563e-05, + "loss": 1.7659, + "step": 11114 + }, + { + "epoch": 3.4116022099447516, + "grad_norm": 0.2875109314918518, + "learning_rate": 7.669323075752467e-05, + "loss": 1.8263, + "step": 11115 + }, + { + "epoch": 3.4119091467157765, + "grad_norm": 0.34703585505485535, + "learning_rate": 7.668902766948558e-05, + "loss": 1.7622, + "step": 11116 + }, + { + "epoch": 3.412216083486802, + "grad_norm": 0.3090265393257141, + "learning_rate": 7.668482431768989e-05, + "loss": 1.7381, + "step": 11117 + }, + { + "epoch": 3.4125230202578267, + "grad_norm": 0.2619737684726715, + "learning_rate": 
7.668062070217911e-05, + "loss": 1.8004, + "step": 11118 + }, + { + "epoch": 3.412829957028852, + "grad_norm": 0.289815217256546, + "learning_rate": 7.667641682299482e-05, + "loss": 1.7946, + "step": 11119 + }, + { + "epoch": 3.4131368937998774, + "grad_norm": 0.28732073307037354, + "learning_rate": 7.667221268017852e-05, + "loss": 1.8746, + "step": 11120 + }, + { + "epoch": 3.4134438305709023, + "grad_norm": 0.23232576251029968, + "learning_rate": 7.666800827377178e-05, + "loss": 1.7403, + "step": 11121 + }, + { + "epoch": 3.4137507673419276, + "grad_norm": 0.22903507947921753, + "learning_rate": 7.666380360381616e-05, + "loss": 1.7785, + "step": 11122 + }, + { + "epoch": 3.4140577041129525, + "grad_norm": 0.25023025274276733, + "learning_rate": 7.665959867035321e-05, + "loss": 1.7881, + "step": 11123 + }, + { + "epoch": 3.414364640883978, + "grad_norm": 0.2199166864156723, + "learning_rate": 7.665539347342449e-05, + "loss": 1.7522, + "step": 11124 + }, + { + "epoch": 3.414671577655003, + "grad_norm": 0.2539862394332886, + "learning_rate": 7.665118801307152e-05, + "loss": 1.7964, + "step": 11125 + }, + { + "epoch": 3.414978514426028, + "grad_norm": 0.22670161724090576, + "learning_rate": 7.664698228933591e-05, + "loss": 1.7071, + "step": 11126 + }, + { + "epoch": 3.4152854511970534, + "grad_norm": 0.24827396869659424, + "learning_rate": 7.664277630225919e-05, + "loss": 1.7897, + "step": 11127 + }, + { + "epoch": 3.4155923879680787, + "grad_norm": 0.29391366243362427, + "learning_rate": 7.663857005188296e-05, + "loss": 1.7967, + "step": 11128 + }, + { + "epoch": 3.4158993247391036, + "grad_norm": 0.3201812505722046, + "learning_rate": 7.663436353824874e-05, + "loss": 1.7681, + "step": 11129 + }, + { + "epoch": 3.416206261510129, + "grad_norm": 0.2274552583694458, + "learning_rate": 7.663015676139814e-05, + "loss": 1.7535, + "step": 11130 + }, + { + "epoch": 3.4165131982811543, + "grad_norm": 0.3955044150352478, + "learning_rate": 7.662594972137273e-05, + "loss": 
1.8175, + "step": 11131 + }, + { + "epoch": 3.416820135052179, + "grad_norm": 0.46493569016456604, + "learning_rate": 7.662174241821406e-05, + "loss": 1.7806, + "step": 11132 + }, + { + "epoch": 3.4171270718232045, + "grad_norm": 0.37731611728668213, + "learning_rate": 7.661753485196375e-05, + "loss": 1.7555, + "step": 11133 + }, + { + "epoch": 3.4174340085942294, + "grad_norm": 0.23983556032180786, + "learning_rate": 7.661332702266334e-05, + "loss": 1.7662, + "step": 11134 + }, + { + "epoch": 3.4177409453652547, + "grad_norm": 0.34964314103126526, + "learning_rate": 7.660911893035445e-05, + "loss": 1.7786, + "step": 11135 + }, + { + "epoch": 3.41804788213628, + "grad_norm": 0.44820764660835266, + "learning_rate": 7.660491057507864e-05, + "loss": 1.778, + "step": 11136 + }, + { + "epoch": 3.418354818907305, + "grad_norm": 0.32936233282089233, + "learning_rate": 7.660070195687752e-05, + "loss": 1.8181, + "step": 11137 + }, + { + "epoch": 3.4186617556783303, + "grad_norm": 0.2874850332736969, + "learning_rate": 7.659649307579266e-05, + "loss": 1.8733, + "step": 11138 + }, + { + "epoch": 3.418968692449355, + "grad_norm": 0.46269866824150085, + "learning_rate": 7.659228393186566e-05, + "loss": 1.8566, + "step": 11139 + }, + { + "epoch": 3.4192756292203805, + "grad_norm": 0.5873839855194092, + "learning_rate": 7.658807452513816e-05, + "loss": 1.8317, + "step": 11140 + }, + { + "epoch": 3.419582565991406, + "grad_norm": 0.43150341510772705, + "learning_rate": 7.65838648556517e-05, + "loss": 1.7702, + "step": 11141 + }, + { + "epoch": 3.4198895027624308, + "grad_norm": 0.2803891599178314, + "learning_rate": 7.65796549234479e-05, + "loss": 1.8043, + "step": 11142 + }, + { + "epoch": 3.420196439533456, + "grad_norm": 0.37295013666152954, + "learning_rate": 7.657544472856838e-05, + "loss": 1.7923, + "step": 11143 + }, + { + "epoch": 3.4205033763044814, + "grad_norm": 0.3922573924064636, + "learning_rate": 7.657123427105473e-05, + "loss": 1.8231, + "step": 11144 + }, + { + 
"epoch": 3.4208103130755063, + "grad_norm": 0.27254152297973633, + "learning_rate": 7.656702355094859e-05, + "loss": 1.8168, + "step": 11145 + }, + { + "epoch": 3.4211172498465316, + "grad_norm": 0.28005337715148926, + "learning_rate": 7.656281256829152e-05, + "loss": 1.8047, + "step": 11146 + }, + { + "epoch": 3.421424186617557, + "grad_norm": 0.4369073808193207, + "learning_rate": 7.655860132312519e-05, + "loss": 1.7243, + "step": 11147 + }, + { + "epoch": 3.421731123388582, + "grad_norm": 0.4127553701400757, + "learning_rate": 7.655438981549119e-05, + "loss": 1.8148, + "step": 11148 + }, + { + "epoch": 3.422038060159607, + "grad_norm": 0.3131798207759857, + "learning_rate": 7.655017804543114e-05, + "loss": 1.789, + "step": 11149 + }, + { + "epoch": 3.422344996930632, + "grad_norm": 0.2947194576263428, + "learning_rate": 7.654596601298666e-05, + "loss": 1.8221, + "step": 11150 + }, + { + "epoch": 3.4226519337016574, + "grad_norm": 0.3072497546672821, + "learning_rate": 7.654175371819941e-05, + "loss": 1.7747, + "step": 11151 + }, + { + "epoch": 3.4229588704726828, + "grad_norm": 0.29408320784568787, + "learning_rate": 7.653754116111099e-05, + "loss": 1.9009, + "step": 11152 + }, + { + "epoch": 3.4232658072437077, + "grad_norm": 0.2629215717315674, + "learning_rate": 7.653332834176303e-05, + "loss": 1.7354, + "step": 11153 + }, + { + "epoch": 3.423572744014733, + "grad_norm": 0.2850257456302643, + "learning_rate": 7.652911526019716e-05, + "loss": 1.8422, + "step": 11154 + }, + { + "epoch": 3.423879680785758, + "grad_norm": 0.29787111282348633, + "learning_rate": 7.652490191645503e-05, + "loss": 1.8122, + "step": 11155 + }, + { + "epoch": 3.424186617556783, + "grad_norm": 0.2670947015285492, + "learning_rate": 7.652068831057826e-05, + "loss": 1.7734, + "step": 11156 + }, + { + "epoch": 3.4244935543278086, + "grad_norm": 0.26415133476257324, + "learning_rate": 7.651647444260853e-05, + "loss": 1.7661, + "step": 11157 + }, + { + "epoch": 3.424800491098834, + 
"grad_norm": 0.2614886164665222, + "learning_rate": 7.651226031258745e-05, + "loss": 1.6918, + "step": 11158 + }, + { + "epoch": 3.425107427869859, + "grad_norm": 0.28485649824142456, + "learning_rate": 7.650804592055667e-05, + "loss": 1.7771, + "step": 11159 + }, + { + "epoch": 3.425414364640884, + "grad_norm": 0.26080289483070374, + "learning_rate": 7.650383126655784e-05, + "loss": 1.7637, + "step": 11160 + }, + { + "epoch": 3.425721301411909, + "grad_norm": 0.2503695487976074, + "learning_rate": 7.649961635063261e-05, + "loss": 1.7864, + "step": 11161 + }, + { + "epoch": 3.4260282381829343, + "grad_norm": 0.3165570795536041, + "learning_rate": 7.649540117282263e-05, + "loss": 1.8107, + "step": 11162 + }, + { + "epoch": 3.4263351749539597, + "grad_norm": 0.28411731123924255, + "learning_rate": 7.649118573316959e-05, + "loss": 1.7557, + "step": 11163 + }, + { + "epoch": 3.4266421117249846, + "grad_norm": 0.24469570815563202, + "learning_rate": 7.648697003171512e-05, + "loss": 1.7597, + "step": 11164 + }, + { + "epoch": 3.42694904849601, + "grad_norm": 0.31968292593955994, + "learning_rate": 7.648275406850087e-05, + "loss": 1.7796, + "step": 11165 + }, + { + "epoch": 3.427255985267035, + "grad_norm": 0.24520765244960785, + "learning_rate": 7.647853784356856e-05, + "loss": 1.7931, + "step": 11166 + }, + { + "epoch": 3.42756292203806, + "grad_norm": 0.23946821689605713, + "learning_rate": 7.647432135695977e-05, + "loss": 1.7143, + "step": 11167 + }, + { + "epoch": 3.4278698588090855, + "grad_norm": 0.321455180644989, + "learning_rate": 7.647010460871624e-05, + "loss": 1.8682, + "step": 11168 + }, + { + "epoch": 3.4281767955801103, + "grad_norm": 0.2803197503089905, + "learning_rate": 7.646588759887964e-05, + "loss": 1.8, + "step": 11169 + }, + { + "epoch": 3.4284837323511357, + "grad_norm": 0.2597559988498688, + "learning_rate": 7.64616703274916e-05, + "loss": 1.8027, + "step": 11170 + }, + { + "epoch": 3.428790669122161, + "grad_norm": 0.25055503845214844, + 
"learning_rate": 7.645745279459384e-05, + "loss": 1.7659, + "step": 11171 + }, + { + "epoch": 3.429097605893186, + "grad_norm": 0.34582629799842834, + "learning_rate": 7.645323500022803e-05, + "loss": 1.7868, + "step": 11172 + }, + { + "epoch": 3.4294045426642112, + "grad_norm": 0.32845041155815125, + "learning_rate": 7.644901694443584e-05, + "loss": 1.8247, + "step": 11173 + }, + { + "epoch": 3.4297114794352366, + "grad_norm": 0.2570398449897766, + "learning_rate": 7.644479862725896e-05, + "loss": 1.7802, + "step": 11174 + }, + { + "epoch": 3.4300184162062615, + "grad_norm": 0.23117294907569885, + "learning_rate": 7.644058004873908e-05, + "loss": 1.7575, + "step": 11175 + }, + { + "epoch": 3.430325352977287, + "grad_norm": 0.2417830377817154, + "learning_rate": 7.64363612089179e-05, + "loss": 1.7954, + "step": 11176 + }, + { + "epoch": 3.4306322897483117, + "grad_norm": 0.249378964304924, + "learning_rate": 7.643214210783708e-05, + "loss": 1.8161, + "step": 11177 + }, + { + "epoch": 3.430939226519337, + "grad_norm": 0.24494746327400208, + "learning_rate": 7.642792274553836e-05, + "loss": 1.825, + "step": 11178 + }, + { + "epoch": 3.4312461632903624, + "grad_norm": 0.2663760185241699, + "learning_rate": 7.642370312206342e-05, + "loss": 1.7589, + "step": 11179 + }, + { + "epoch": 3.4315531000613873, + "grad_norm": 0.2819322645664215, + "learning_rate": 7.641948323745395e-05, + "loss": 1.8097, + "step": 11180 + }, + { + "epoch": 3.4318600368324126, + "grad_norm": 0.26917630434036255, + "learning_rate": 7.641526309175166e-05, + "loss": 1.7934, + "step": 11181 + }, + { + "epoch": 3.4321669736034375, + "grad_norm": 0.31618112325668335, + "learning_rate": 7.641104268499826e-05, + "loss": 1.8522, + "step": 11182 + }, + { + "epoch": 3.432473910374463, + "grad_norm": 0.29209139943122864, + "learning_rate": 7.640682201723546e-05, + "loss": 1.7499, + "step": 11183 + }, + { + "epoch": 3.432780847145488, + "grad_norm": 0.24831914901733398, + "learning_rate": 
7.640260108850496e-05, + "loss": 1.7897, + "step": 11184 + }, + { + "epoch": 3.433087783916513, + "grad_norm": 0.2459818720817566, + "learning_rate": 7.639837989884849e-05, + "loss": 1.7604, + "step": 11185 + }, + { + "epoch": 3.4333947206875384, + "grad_norm": 0.27157485485076904, + "learning_rate": 7.639415844830774e-05, + "loss": 1.7776, + "step": 11186 + }, + { + "epoch": 3.4337016574585637, + "grad_norm": 0.3021515905857086, + "learning_rate": 7.638993673692445e-05, + "loss": 1.7771, + "step": 11187 + }, + { + "epoch": 3.4340085942295886, + "grad_norm": 0.2591722309589386, + "learning_rate": 7.638571476474036e-05, + "loss": 1.8333, + "step": 11188 + }, + { + "epoch": 3.434315531000614, + "grad_norm": 0.2255258709192276, + "learning_rate": 7.638149253179717e-05, + "loss": 1.7647, + "step": 11189 + }, + { + "epoch": 3.4346224677716393, + "grad_norm": 0.2585793733596802, + "learning_rate": 7.637727003813658e-05, + "loss": 1.786, + "step": 11190 + }, + { + "epoch": 3.434929404542664, + "grad_norm": 0.23649543523788452, + "learning_rate": 7.637304728380036e-05, + "loss": 1.822, + "step": 11191 + }, + { + "epoch": 3.4352363413136895, + "grad_norm": 0.2610832452774048, + "learning_rate": 7.636882426883023e-05, + "loss": 1.7925, + "step": 11192 + }, + { + "epoch": 3.4355432780847144, + "grad_norm": 0.26230642199516296, + "learning_rate": 7.636460099326793e-05, + "loss": 1.8169, + "step": 11193 + }, + { + "epoch": 3.4358502148557397, + "grad_norm": 0.2800561189651489, + "learning_rate": 7.636037745715518e-05, + "loss": 1.845, + "step": 11194 + }, + { + "epoch": 3.436157151626765, + "grad_norm": 0.27790409326553345, + "learning_rate": 7.635615366053372e-05, + "loss": 1.8141, + "step": 11195 + }, + { + "epoch": 3.43646408839779, + "grad_norm": 0.2894865870475769, + "learning_rate": 7.635192960344533e-05, + "loss": 1.7916, + "step": 11196 + }, + { + "epoch": 3.4367710251688153, + "grad_norm": 0.22310738265514374, + "learning_rate": 7.634770528593171e-05, + "loss": 1.79, + 
"step": 11197 + }, + { + "epoch": 3.43707796193984, + "grad_norm": 0.2837755084037781, + "learning_rate": 7.634348070803463e-05, + "loss": 1.8763, + "step": 11198 + }, + { + "epoch": 3.4373848987108655, + "grad_norm": 0.32488104701042175, + "learning_rate": 7.633925586979583e-05, + "loss": 1.8331, + "step": 11199 + }, + { + "epoch": 3.437691835481891, + "grad_norm": 0.2708779573440552, + "learning_rate": 7.633503077125706e-05, + "loss": 1.761, + "step": 11200 + }, + { + "epoch": 3.4379987722529157, + "grad_norm": 0.23929642140865326, + "learning_rate": 7.633080541246008e-05, + "loss": 1.8217, + "step": 11201 + }, + { + "epoch": 3.438305709023941, + "grad_norm": 0.3213331997394562, + "learning_rate": 7.632657979344667e-05, + "loss": 1.8375, + "step": 11202 + }, + { + "epoch": 3.4386126457949664, + "grad_norm": 0.38420629501342773, + "learning_rate": 7.632235391425854e-05, + "loss": 1.765, + "step": 11203 + }, + { + "epoch": 3.4389195825659913, + "grad_norm": 0.40466073155403137, + "learning_rate": 7.631812777493749e-05, + "loss": 1.8262, + "step": 11204 + }, + { + "epoch": 3.4392265193370166, + "grad_norm": 0.35904639959335327, + "learning_rate": 7.631390137552527e-05, + "loss": 1.894, + "step": 11205 + }, + { + "epoch": 3.439533456108042, + "grad_norm": 0.28880515694618225, + "learning_rate": 7.630967471606368e-05, + "loss": 1.87, + "step": 11206 + }, + { + "epoch": 3.439840392879067, + "grad_norm": 0.2878882884979248, + "learning_rate": 7.630544779659444e-05, + "loss": 1.7841, + "step": 11207 + }, + { + "epoch": 3.440147329650092, + "grad_norm": 0.36002418398857117, + "learning_rate": 7.630122061715935e-05, + "loss": 1.7318, + "step": 11208 + }, + { + "epoch": 3.440454266421117, + "grad_norm": 0.3304644227027893, + "learning_rate": 7.629699317780019e-05, + "loss": 1.8581, + "step": 11209 + }, + { + "epoch": 3.4407612031921424, + "grad_norm": 0.23396331071853638, + "learning_rate": 7.629276547855872e-05, + "loss": 1.7897, + "step": 11210 + }, + { + "epoch": 
3.4410681399631677, + "grad_norm": 0.34914183616638184, + "learning_rate": 7.628853751947674e-05, + "loss": 1.8531, + "step": 11211 + }, + { + "epoch": 3.4413750767341926, + "grad_norm": 0.3700502812862396, + "learning_rate": 7.6284309300596e-05, + "loss": 1.7884, + "step": 11212 + }, + { + "epoch": 3.441682013505218, + "grad_norm": 0.24606801569461823, + "learning_rate": 7.628008082195835e-05, + "loss": 1.7292, + "step": 11213 + }, + { + "epoch": 3.441988950276243, + "grad_norm": 0.26344993710517883, + "learning_rate": 7.627585208360551e-05, + "loss": 1.7832, + "step": 11214 + }, + { + "epoch": 3.442295887047268, + "grad_norm": 0.4034743010997772, + "learning_rate": 7.62716230855793e-05, + "loss": 1.8164, + "step": 11215 + }, + { + "epoch": 3.4426028238182935, + "grad_norm": 0.4508039355278015, + "learning_rate": 7.626739382792152e-05, + "loss": 1.7855, + "step": 11216 + }, + { + "epoch": 3.4429097605893184, + "grad_norm": 0.2963111400604248, + "learning_rate": 7.626316431067395e-05, + "loss": 1.7995, + "step": 11217 + }, + { + "epoch": 3.4432166973603437, + "grad_norm": 0.35248515009880066, + "learning_rate": 7.625893453387841e-05, + "loss": 1.8761, + "step": 11218 + }, + { + "epoch": 3.443523634131369, + "grad_norm": 0.4032224416732788, + "learning_rate": 7.625470449757668e-05, + "loss": 1.7746, + "step": 11219 + }, + { + "epoch": 3.443830570902394, + "grad_norm": 0.3505195081233978, + "learning_rate": 7.625047420181057e-05, + "loss": 1.851, + "step": 11220 + }, + { + "epoch": 3.4441375076734193, + "grad_norm": 0.288968563079834, + "learning_rate": 7.62462436466219e-05, + "loss": 1.8055, + "step": 11221 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.43141910433769226, + "learning_rate": 7.624201283205246e-05, + "loss": 1.816, + "step": 11222 + }, + { + "epoch": 3.4447513812154695, + "grad_norm": 0.46902137994766235, + "learning_rate": 7.623778175814407e-05, + "loss": 1.8478, + "step": 11223 + }, + { + "epoch": 3.445058317986495, + "grad_norm": 
0.3333328366279602, + "learning_rate": 7.623355042493854e-05, + "loss": 1.7949, + "step": 11224 + }, + { + "epoch": 3.4453652547575198, + "grad_norm": 0.2625340521335602, + "learning_rate": 7.622931883247768e-05, + "loss": 1.745, + "step": 11225 + }, + { + "epoch": 3.445672191528545, + "grad_norm": 0.4565848410129547, + "learning_rate": 7.622508698080333e-05, + "loss": 1.796, + "step": 11226 + }, + { + "epoch": 3.4459791282995704, + "grad_norm": 0.4676518738269806, + "learning_rate": 7.622085486995729e-05, + "loss": 1.8115, + "step": 11227 + }, + { + "epoch": 3.4462860650705953, + "grad_norm": 0.3828938603401184, + "learning_rate": 7.62166224999814e-05, + "loss": 1.8758, + "step": 11228 + }, + { + "epoch": 3.4465930018416207, + "grad_norm": 0.2786383628845215, + "learning_rate": 7.621238987091747e-05, + "loss": 1.7616, + "step": 11229 + }, + { + "epoch": 3.446899938612646, + "grad_norm": 0.4442835748195648, + "learning_rate": 7.620815698280734e-05, + "loss": 1.8342, + "step": 11230 + }, + { + "epoch": 3.447206875383671, + "grad_norm": 0.45760586857795715, + "learning_rate": 7.620392383569286e-05, + "loss": 1.8159, + "step": 11231 + }, + { + "epoch": 3.447513812154696, + "grad_norm": 0.2567009925842285, + "learning_rate": 7.619969042961583e-05, + "loss": 1.774, + "step": 11232 + }, + { + "epoch": 3.4478207489257215, + "grad_norm": 0.3720102310180664, + "learning_rate": 7.619545676461812e-05, + "loss": 1.8366, + "step": 11233 + }, + { + "epoch": 3.4481276856967464, + "grad_norm": 0.36436137557029724, + "learning_rate": 7.619122284074154e-05, + "loss": 1.832, + "step": 11234 + }, + { + "epoch": 3.4484346224677718, + "grad_norm": 0.310310959815979, + "learning_rate": 7.618698865802795e-05, + "loss": 1.9023, + "step": 11235 + }, + { + "epoch": 3.4487415592387967, + "grad_norm": 0.2693026661872864, + "learning_rate": 7.618275421651916e-05, + "loss": 1.7696, + "step": 11236 + }, + { + "epoch": 3.449048496009822, + "grad_norm": 0.2942425608634949, + "learning_rate": 
7.61785195162571e-05, + "loss": 1.822, + "step": 11237 + }, + { + "epoch": 3.4493554327808473, + "grad_norm": 0.22454749047756195, + "learning_rate": 7.617428455728353e-05, + "loss": 1.7011, + "step": 11238 + }, + { + "epoch": 3.449662369551872, + "grad_norm": 0.23345038294792175, + "learning_rate": 7.617004933964035e-05, + "loss": 1.7563, + "step": 11239 + }, + { + "epoch": 3.4499693063228976, + "grad_norm": 0.24990662932395935, + "learning_rate": 7.616581386336941e-05, + "loss": 1.8031, + "step": 11240 + }, + { + "epoch": 3.4502762430939224, + "grad_norm": 0.2919348478317261, + "learning_rate": 7.616157812851254e-05, + "loss": 1.7355, + "step": 11241 + }, + { + "epoch": 3.450583179864948, + "grad_norm": 0.2926909327507019, + "learning_rate": 7.615734213511165e-05, + "loss": 1.8341, + "step": 11242 + }, + { + "epoch": 3.450890116635973, + "grad_norm": 0.24316683411598206, + "learning_rate": 7.615310588320855e-05, + "loss": 1.8154, + "step": 11243 + }, + { + "epoch": 3.451197053406998, + "grad_norm": 0.23154498636722565, + "learning_rate": 7.614886937284513e-05, + "loss": 1.7904, + "step": 11244 + }, + { + "epoch": 3.4515039901780233, + "grad_norm": 0.25973939895629883, + "learning_rate": 7.614463260406327e-05, + "loss": 1.7598, + "step": 11245 + }, + { + "epoch": 3.4518109269490487, + "grad_norm": 0.22110119462013245, + "learning_rate": 7.614039557690482e-05, + "loss": 1.7903, + "step": 11246 + }, + { + "epoch": 3.4521178637200736, + "grad_norm": 0.26184993982315063, + "learning_rate": 7.613615829141165e-05, + "loss": 1.748, + "step": 11247 + }, + { + "epoch": 3.452424800491099, + "grad_norm": 0.26128727197647095, + "learning_rate": 7.613192074762565e-05, + "loss": 1.7786, + "step": 11248 + }, + { + "epoch": 3.4527317372621242, + "grad_norm": 0.23230813443660736, + "learning_rate": 7.612768294558871e-05, + "loss": 1.8114, + "step": 11249 + }, + { + "epoch": 3.453038674033149, + "grad_norm": 0.2686540186405182, + "learning_rate": 7.612344488534268e-05, + "loss": 
1.7311, + "step": 11250 + }, + { + "epoch": 3.4533456108041745, + "grad_norm": 0.25553348660469055, + "learning_rate": 7.611920656692946e-05, + "loss": 1.8468, + "step": 11251 + }, + { + "epoch": 3.4536525475751993, + "grad_norm": 0.2639308273792267, + "learning_rate": 7.611496799039092e-05, + "loss": 1.8292, + "step": 11252 + }, + { + "epoch": 3.4539594843462247, + "grad_norm": 0.2468358874320984, + "learning_rate": 7.611072915576895e-05, + "loss": 1.8173, + "step": 11253 + }, + { + "epoch": 3.45426642111725, + "grad_norm": 0.27236035466194153, + "learning_rate": 7.610649006310549e-05, + "loss": 1.8082, + "step": 11254 + }, + { + "epoch": 3.454573357888275, + "grad_norm": 0.2277914434671402, + "learning_rate": 7.610225071244237e-05, + "loss": 1.7483, + "step": 11255 + }, + { + "epoch": 3.4548802946593002, + "grad_norm": 0.2292868196964264, + "learning_rate": 7.60980111038215e-05, + "loss": 1.7716, + "step": 11256 + }, + { + "epoch": 3.455187231430325, + "grad_norm": 0.22116152942180634, + "learning_rate": 7.60937712372848e-05, + "loss": 1.773, + "step": 11257 + }, + { + "epoch": 3.4554941682013505, + "grad_norm": 0.23238304257392883, + "learning_rate": 7.608953111287416e-05, + "loss": 1.7602, + "step": 11258 + }, + { + "epoch": 3.455801104972376, + "grad_norm": 0.2810615003108978, + "learning_rate": 7.608529073063149e-05, + "loss": 1.8781, + "step": 11259 + }, + { + "epoch": 3.4561080417434007, + "grad_norm": 0.2516821324825287, + "learning_rate": 7.608105009059867e-05, + "loss": 1.835, + "step": 11260 + }, + { + "epoch": 3.456414978514426, + "grad_norm": 0.25698330998420715, + "learning_rate": 7.607680919281763e-05, + "loss": 1.7859, + "step": 11261 + }, + { + "epoch": 3.4567219152854514, + "grad_norm": 0.2597602903842926, + "learning_rate": 7.60725680373303e-05, + "loss": 1.8287, + "step": 11262 + }, + { + "epoch": 3.4570288520564763, + "grad_norm": 0.2564091980457306, + "learning_rate": 7.606832662417855e-05, + "loss": 1.8003, + "step": 11263 + }, + { + 
"epoch": 3.4573357888275016, + "grad_norm": 0.2872684597969055, + "learning_rate": 7.606408495340432e-05, + "loss": 1.8242, + "step": 11264 + }, + { + "epoch": 3.457642725598527, + "grad_norm": 0.27513590455055237, + "learning_rate": 7.605984302504952e-05, + "loss": 1.8605, + "step": 11265 + }, + { + "epoch": 3.457949662369552, + "grad_norm": 0.27768459916114807, + "learning_rate": 7.605560083915609e-05, + "loss": 1.7948, + "step": 11266 + }, + { + "epoch": 3.458256599140577, + "grad_norm": 0.23911382257938385, + "learning_rate": 7.605135839576593e-05, + "loss": 1.7575, + "step": 11267 + }, + { + "epoch": 3.458563535911602, + "grad_norm": 0.26773568987846375, + "learning_rate": 7.604711569492098e-05, + "loss": 1.752, + "step": 11268 + }, + { + "epoch": 3.4588704726826274, + "grad_norm": 0.30079394578933716, + "learning_rate": 7.604287273666316e-05, + "loss": 1.8022, + "step": 11269 + }, + { + "epoch": 3.4591774094536527, + "grad_norm": 0.27393853664398193, + "learning_rate": 7.603862952103441e-05, + "loss": 1.8054, + "step": 11270 + }, + { + "epoch": 3.4594843462246776, + "grad_norm": 0.2794870436191559, + "learning_rate": 7.603438604807667e-05, + "loss": 1.808, + "step": 11271 + }, + { + "epoch": 3.459791282995703, + "grad_norm": 0.26482146978378296, + "learning_rate": 7.603014231783185e-05, + "loss": 1.8696, + "step": 11272 + }, + { + "epoch": 3.460098219766728, + "grad_norm": 0.2755354344844818, + "learning_rate": 7.602589833034192e-05, + "loss": 1.8412, + "step": 11273 + }, + { + "epoch": 3.460405156537753, + "grad_norm": 0.2666642367839813, + "learning_rate": 7.602165408564883e-05, + "loss": 1.8333, + "step": 11274 + }, + { + "epoch": 3.4607120933087785, + "grad_norm": 0.26958519220352173, + "learning_rate": 7.601740958379448e-05, + "loss": 1.7943, + "step": 11275 + }, + { + "epoch": 3.4610190300798034, + "grad_norm": 0.2915789783000946, + "learning_rate": 7.601316482482084e-05, + "loss": 1.7519, + "step": 11276 + }, + { + "epoch": 3.4613259668508287, + 
"grad_norm": 0.2456950694322586, + "learning_rate": 7.600891980876985e-05, + "loss": 1.8064, + "step": 11277 + }, + { + "epoch": 3.461632903621854, + "grad_norm": 0.2517867088317871, + "learning_rate": 7.600467453568348e-05, + "loss": 1.7766, + "step": 11278 + }, + { + "epoch": 3.461939840392879, + "grad_norm": 0.24567969143390656, + "learning_rate": 7.600042900560368e-05, + "loss": 1.7331, + "step": 11279 + }, + { + "epoch": 3.4622467771639043, + "grad_norm": 0.23986820876598358, + "learning_rate": 7.599618321857239e-05, + "loss": 1.7477, + "step": 11280 + }, + { + "epoch": 3.4625537139349296, + "grad_norm": 0.2555375397205353, + "learning_rate": 7.599193717463158e-05, + "loss": 1.8154, + "step": 11281 + }, + { + "epoch": 3.4628606507059545, + "grad_norm": 0.2522781193256378, + "learning_rate": 7.598769087382323e-05, + "loss": 1.7821, + "step": 11282 + }, + { + "epoch": 3.46316758747698, + "grad_norm": 0.25631004571914673, + "learning_rate": 7.598344431618926e-05, + "loss": 1.8043, + "step": 11283 + }, + { + "epoch": 3.4634745242480047, + "grad_norm": 0.2611328661441803, + "learning_rate": 7.597919750177168e-05, + "loss": 1.8036, + "step": 11284 + }, + { + "epoch": 3.46378146101903, + "grad_norm": 0.255670428276062, + "learning_rate": 7.597495043061244e-05, + "loss": 1.7375, + "step": 11285 + }, + { + "epoch": 3.4640883977900554, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.597070310275353e-05, + "loss": 1.7496, + "step": 11286 + }, + { + "epoch": 3.4643953345610803, + "grad_norm": 0.2643752992153168, + "learning_rate": 7.596645551823688e-05, + "loss": 1.8444, + "step": 11287 + }, + { + "epoch": 3.4647022713321056, + "grad_norm": 0.2564511299133301, + "learning_rate": 7.596220767710452e-05, + "loss": 1.7557, + "step": 11288 + }, + { + "epoch": 3.4650092081031305, + "grad_norm": 0.2510208487510681, + "learning_rate": 7.59579595793984e-05, + "loss": 1.7234, + "step": 11289 + }, + { + "epoch": 3.465316144874156, + "grad_norm": 0.2765158712863922, + 
"learning_rate": 7.595371122516051e-05, + "loss": 1.8215, + "step": 11290 + }, + { + "epoch": 3.465623081645181, + "grad_norm": 0.28233039379119873, + "learning_rate": 7.594946261443286e-05, + "loss": 1.7752, + "step": 11291 + }, + { + "epoch": 3.465930018416206, + "grad_norm": 0.26971468329429626, + "learning_rate": 7.594521374725735e-05, + "loss": 1.7924, + "step": 11292 + }, + { + "epoch": 3.4662369551872314, + "grad_norm": 0.29425930976867676, + "learning_rate": 7.594096462367608e-05, + "loss": 1.8144, + "step": 11293 + }, + { + "epoch": 3.4665438919582567, + "grad_norm": 0.233150452375412, + "learning_rate": 7.593671524373098e-05, + "loss": 1.7741, + "step": 11294 + }, + { + "epoch": 3.4668508287292816, + "grad_norm": 0.2947762608528137, + "learning_rate": 7.593246560746406e-05, + "loss": 1.8031, + "step": 11295 + }, + { + "epoch": 3.467157765500307, + "grad_norm": 0.250552773475647, + "learning_rate": 7.59282157149173e-05, + "loss": 1.7501, + "step": 11296 + }, + { + "epoch": 3.4674647022713323, + "grad_norm": 0.26091331243515015, + "learning_rate": 7.592396556613274e-05, + "loss": 1.836, + "step": 11297 + }, + { + "epoch": 3.467771639042357, + "grad_norm": 0.28625619411468506, + "learning_rate": 7.591971516115233e-05, + "loss": 1.7555, + "step": 11298 + }, + { + "epoch": 3.4680785758133825, + "grad_norm": 0.2723398804664612, + "learning_rate": 7.591546450001811e-05, + "loss": 1.825, + "step": 11299 + }, + { + "epoch": 3.4683855125844074, + "grad_norm": 0.24289946258068085, + "learning_rate": 7.591121358277211e-05, + "loss": 1.7441, + "step": 11300 + }, + { + "epoch": 3.4686924493554327, + "grad_norm": 0.2706952691078186, + "learning_rate": 7.590696240945629e-05, + "loss": 1.8651, + "step": 11301 + }, + { + "epoch": 3.468999386126458, + "grad_norm": 0.24632862210273743, + "learning_rate": 7.590271098011268e-05, + "loss": 1.8229, + "step": 11302 + }, + { + "epoch": 3.469306322897483, + "grad_norm": 0.29275211691856384, + "learning_rate": 7.58984592947833e-05, 
+ "loss": 1.7591, + "step": 11303 + }, + { + "epoch": 3.4696132596685083, + "grad_norm": 0.29228144884109497, + "learning_rate": 7.589420735351016e-05, + "loss": 1.8395, + "step": 11304 + }, + { + "epoch": 3.4699201964395336, + "grad_norm": 0.28339114785194397, + "learning_rate": 7.588995515633528e-05, + "loss": 1.8543, + "step": 11305 + }, + { + "epoch": 3.4702271332105585, + "grad_norm": 0.2834693193435669, + "learning_rate": 7.588570270330071e-05, + "loss": 1.826, + "step": 11306 + }, + { + "epoch": 3.470534069981584, + "grad_norm": 0.26130759716033936, + "learning_rate": 7.588144999444844e-05, + "loss": 1.7887, + "step": 11307 + }, + { + "epoch": 3.470841006752609, + "grad_norm": 0.29554685950279236, + "learning_rate": 7.587719702982052e-05, + "loss": 1.819, + "step": 11308 + }, + { + "epoch": 3.471147943523634, + "grad_norm": 0.2687968611717224, + "learning_rate": 7.587294380945898e-05, + "loss": 1.7354, + "step": 11309 + }, + { + "epoch": 3.4714548802946594, + "grad_norm": 0.28795287013053894, + "learning_rate": 7.586869033340582e-05, + "loss": 1.8267, + "step": 11310 + }, + { + "epoch": 3.4717618170656843, + "grad_norm": 0.33244553208351135, + "learning_rate": 7.58644366017031e-05, + "loss": 1.86, + "step": 11311 + }, + { + "epoch": 3.4720687538367097, + "grad_norm": 0.2878025472164154, + "learning_rate": 7.586018261439288e-05, + "loss": 1.7587, + "step": 11312 + }, + { + "epoch": 3.472375690607735, + "grad_norm": 0.26856711506843567, + "learning_rate": 7.585592837151716e-05, + "loss": 1.7351, + "step": 11313 + }, + { + "epoch": 3.47268262737876, + "grad_norm": 0.2554367780685425, + "learning_rate": 7.585167387311802e-05, + "loss": 1.7664, + "step": 11314 + }, + { + "epoch": 3.472989564149785, + "grad_norm": 0.3193204700946808, + "learning_rate": 7.584741911923748e-05, + "loss": 1.7487, + "step": 11315 + }, + { + "epoch": 3.47329650092081, + "grad_norm": 0.3227958679199219, + "learning_rate": 7.584316410991759e-05, + "loss": 1.8107, + "step": 11316 + }, + { 
+ "epoch": 3.4736034376918354, + "grad_norm": 0.33891916275024414, + "learning_rate": 7.58389088452004e-05, + "loss": 1.8466, + "step": 11317 + }, + { + "epoch": 3.4739103744628608, + "grad_norm": 0.27050724625587463, + "learning_rate": 7.583465332512797e-05, + "loss": 1.7877, + "step": 11318 + }, + { + "epoch": 3.4742173112338857, + "grad_norm": 0.2935837209224701, + "learning_rate": 7.583039754974235e-05, + "loss": 1.7932, + "step": 11319 + }, + { + "epoch": 3.474524248004911, + "grad_norm": 0.27780550718307495, + "learning_rate": 7.582614151908561e-05, + "loss": 1.8374, + "step": 11320 + }, + { + "epoch": 3.4748311847759363, + "grad_norm": 0.2579033076763153, + "learning_rate": 7.58218852331998e-05, + "loss": 1.7305, + "step": 11321 + }, + { + "epoch": 3.4751381215469612, + "grad_norm": 0.2531716227531433, + "learning_rate": 7.581762869212699e-05, + "loss": 1.8136, + "step": 11322 + }, + { + "epoch": 3.4754450583179866, + "grad_norm": 0.25504544377326965, + "learning_rate": 7.581337189590924e-05, + "loss": 1.787, + "step": 11323 + }, + { + "epoch": 3.475751995089012, + "grad_norm": 0.23659855127334595, + "learning_rate": 7.580911484458861e-05, + "loss": 1.77, + "step": 11324 + }, + { + "epoch": 3.476058931860037, + "grad_norm": 0.22556856274604797, + "learning_rate": 7.580485753820721e-05, + "loss": 1.7808, + "step": 11325 + }, + { + "epoch": 3.476365868631062, + "grad_norm": 0.2860291600227356, + "learning_rate": 7.580059997680705e-05, + "loss": 1.8224, + "step": 11326 + }, + { + "epoch": 3.476672805402087, + "grad_norm": 0.3134596645832062, + "learning_rate": 7.579634216043023e-05, + "loss": 1.8278, + "step": 11327 + }, + { + "epoch": 3.4769797421731123, + "grad_norm": 0.2883087992668152, + "learning_rate": 7.579208408911887e-05, + "loss": 1.7917, + "step": 11328 + }, + { + "epoch": 3.4772866789441377, + "grad_norm": 0.2743333578109741, + "learning_rate": 7.578782576291501e-05, + "loss": 1.8228, + "step": 11329 + }, + { + "epoch": 3.4775936157151626, + 
"grad_norm": 0.25026053190231323, + "learning_rate": 7.578356718186073e-05, + "loss": 1.7717, + "step": 11330 + }, + { + "epoch": 3.477900552486188, + "grad_norm": 0.246905118227005, + "learning_rate": 7.577930834599813e-05, + "loss": 1.7979, + "step": 11331 + }, + { + "epoch": 3.478207489257213, + "grad_norm": 0.24709418416023254, + "learning_rate": 7.577504925536929e-05, + "loss": 1.8111, + "step": 11332 + }, + { + "epoch": 3.478514426028238, + "grad_norm": 0.25685814023017883, + "learning_rate": 7.577078991001632e-05, + "loss": 1.8255, + "step": 11333 + }, + { + "epoch": 3.4788213627992635, + "grad_norm": 0.23937836289405823, + "learning_rate": 7.576653030998129e-05, + "loss": 1.7254, + "step": 11334 + }, + { + "epoch": 3.4791282995702884, + "grad_norm": 0.22638650238513947, + "learning_rate": 7.57622704553063e-05, + "loss": 1.7847, + "step": 11335 + }, + { + "epoch": 3.4794352363413137, + "grad_norm": 0.26083993911743164, + "learning_rate": 7.575801034603347e-05, + "loss": 1.7947, + "step": 11336 + }, + { + "epoch": 3.479742173112339, + "grad_norm": 0.2715466022491455, + "learning_rate": 7.575374998220488e-05, + "loss": 1.848, + "step": 11337 + }, + { + "epoch": 3.480049109883364, + "grad_norm": 0.25554224848747253, + "learning_rate": 7.574948936386262e-05, + "loss": 1.7811, + "step": 11338 + }, + { + "epoch": 3.4803560466543892, + "grad_norm": 0.2689397931098938, + "learning_rate": 7.574522849104882e-05, + "loss": 1.82, + "step": 11339 + }, + { + "epoch": 3.4806629834254146, + "grad_norm": 0.25027474761009216, + "learning_rate": 7.57409673638056e-05, + "loss": 1.775, + "step": 11340 + }, + { + "epoch": 3.4809699201964395, + "grad_norm": 0.2545457184314728, + "learning_rate": 7.573670598217504e-05, + "loss": 1.8056, + "step": 11341 + }, + { + "epoch": 3.481276856967465, + "grad_norm": 0.28404027223587036, + "learning_rate": 7.573244434619928e-05, + "loss": 1.8372, + "step": 11342 + }, + { + "epoch": 3.4815837937384897, + "grad_norm": 0.28046950697898865, + 
"learning_rate": 7.572818245592041e-05, + "loss": 1.7851, + "step": 11343 + }, + { + "epoch": 3.481890730509515, + "grad_norm": 0.23005759716033936, + "learning_rate": 7.572392031138056e-05, + "loss": 1.7059, + "step": 11344 + }, + { + "epoch": 3.4821976672805404, + "grad_norm": 0.2931719124317169, + "learning_rate": 7.571965791262185e-05, + "loss": 1.84, + "step": 11345 + }, + { + "epoch": 3.4825046040515653, + "grad_norm": 0.4399266242980957, + "learning_rate": 7.571539525968642e-05, + "loss": 1.7465, + "step": 11346 + }, + { + "epoch": 3.4828115408225906, + "grad_norm": 0.48957565426826477, + "learning_rate": 7.571113235261638e-05, + "loss": 1.8494, + "step": 11347 + }, + { + "epoch": 3.4831184775936155, + "grad_norm": 0.37828895449638367, + "learning_rate": 7.570686919145385e-05, + "loss": 1.7598, + "step": 11348 + }, + { + "epoch": 3.483425414364641, + "grad_norm": 0.22943973541259766, + "learning_rate": 7.570260577624098e-05, + "loss": 1.7443, + "step": 11349 + }, + { + "epoch": 3.483732351135666, + "grad_norm": 0.3245384991168976, + "learning_rate": 7.569834210701987e-05, + "loss": 1.7232, + "step": 11350 + }, + { + "epoch": 3.484039287906691, + "grad_norm": 0.4419693648815155, + "learning_rate": 7.569407818383271e-05, + "loss": 1.841, + "step": 11351 + }, + { + "epoch": 3.4843462246777164, + "grad_norm": 0.4061864912509918, + "learning_rate": 7.568981400672159e-05, + "loss": 1.8274, + "step": 11352 + }, + { + "epoch": 3.4846531614487417, + "grad_norm": 0.2609417736530304, + "learning_rate": 7.56855495757287e-05, + "loss": 1.8631, + "step": 11353 + }, + { + "epoch": 3.4849600982197666, + "grad_norm": 0.28758567571640015, + "learning_rate": 7.568128489089612e-05, + "loss": 1.8169, + "step": 11354 + }, + { + "epoch": 3.485267034990792, + "grad_norm": 0.40643060207366943, + "learning_rate": 7.567701995226606e-05, + "loss": 1.809, + "step": 11355 + }, + { + "epoch": 3.4855739717618173, + "grad_norm": 0.37649446725845337, + "learning_rate": 7.56727547598806e-05, 
+ "loss": 1.7661, + "step": 11356 + }, + { + "epoch": 3.485880908532842, + "grad_norm": 0.22863779962062836, + "learning_rate": 7.566848931378197e-05, + "loss": 1.808, + "step": 11357 + }, + { + "epoch": 3.4861878453038675, + "grad_norm": 0.4487019181251526, + "learning_rate": 7.566422361401226e-05, + "loss": 1.7627, + "step": 11358 + }, + { + "epoch": 3.4864947820748924, + "grad_norm": 0.4583640694618225, + "learning_rate": 7.565995766061367e-05, + "loss": 1.8186, + "step": 11359 + }, + { + "epoch": 3.4868017188459177, + "grad_norm": 0.27231526374816895, + "learning_rate": 7.565569145362833e-05, + "loss": 1.8465, + "step": 11360 + }, + { + "epoch": 3.487108655616943, + "grad_norm": 0.3877887725830078, + "learning_rate": 7.565142499309841e-05, + "loss": 1.7668, + "step": 11361 + }, + { + "epoch": 3.487415592387968, + "grad_norm": 0.5511242747306824, + "learning_rate": 7.564715827906606e-05, + "loss": 1.8417, + "step": 11362 + }, + { + "epoch": 3.4877225291589933, + "grad_norm": 0.5112231373786926, + "learning_rate": 7.564289131157348e-05, + "loss": 1.8038, + "step": 11363 + }, + { + "epoch": 3.488029465930018, + "grad_norm": 0.279502809047699, + "learning_rate": 7.56386240906628e-05, + "loss": 1.7545, + "step": 11364 + }, + { + "epoch": 3.4883364027010435, + "grad_norm": 0.30080464482307434, + "learning_rate": 7.563435661637623e-05, + "loss": 1.8136, + "step": 11365 + }, + { + "epoch": 3.488643339472069, + "grad_norm": 0.4424717128276825, + "learning_rate": 7.563008888875591e-05, + "loss": 1.7542, + "step": 11366 + }, + { + "epoch": 3.4889502762430937, + "grad_norm": 0.42144715785980225, + "learning_rate": 7.562582090784403e-05, + "loss": 1.8245, + "step": 11367 + }, + { + "epoch": 3.489257213014119, + "grad_norm": 0.2533668875694275, + "learning_rate": 7.562155267368277e-05, + "loss": 1.8654, + "step": 11368 + }, + { + "epoch": 3.4895641497851444, + "grad_norm": 0.3327534794807434, + "learning_rate": 7.56172841863143e-05, + "loss": 1.7882, + "step": 11369 + }, + { 
+ "epoch": 3.4898710865561693, + "grad_norm": 0.44001486897468567, + "learning_rate": 7.561301544578081e-05, + "loss": 1.8397, + "step": 11370 + }, + { + "epoch": 3.4901780233271946, + "grad_norm": 0.2779090106487274, + "learning_rate": 7.56087464521245e-05, + "loss": 1.7398, + "step": 11371 + }, + { + "epoch": 3.49048496009822, + "grad_norm": 0.3018067479133606, + "learning_rate": 7.560447720538755e-05, + "loss": 1.8076, + "step": 11372 + }, + { + "epoch": 3.490791896869245, + "grad_norm": 0.4370935261249542, + "learning_rate": 7.560020770561216e-05, + "loss": 1.8057, + "step": 11373 + }, + { + "epoch": 3.49109883364027, + "grad_norm": 0.2936978042125702, + "learning_rate": 7.559593795284047e-05, + "loss": 1.7726, + "step": 11374 + }, + { + "epoch": 3.491405770411295, + "grad_norm": 0.28825095295906067, + "learning_rate": 7.559166794711476e-05, + "loss": 1.8039, + "step": 11375 + }, + { + "epoch": 3.4917127071823204, + "grad_norm": 0.39334073662757874, + "learning_rate": 7.55873976884772e-05, + "loss": 1.8388, + "step": 11376 + }, + { + "epoch": 3.4920196439533457, + "grad_norm": 0.33880460262298584, + "learning_rate": 7.558312717696995e-05, + "loss": 1.7791, + "step": 11377 + }, + { + "epoch": 3.4923265807243706, + "grad_norm": 0.4433762729167938, + "learning_rate": 7.557885641263524e-05, + "loss": 1.7786, + "step": 11378 + }, + { + "epoch": 3.492633517495396, + "grad_norm": 0.4710264205932617, + "learning_rate": 7.557458539551527e-05, + "loss": 1.7193, + "step": 11379 + }, + { + "epoch": 3.4929404542664213, + "grad_norm": 0.27514326572418213, + "learning_rate": 7.557031412565228e-05, + "loss": 1.823, + "step": 11380 + }, + { + "epoch": 3.493247391037446, + "grad_norm": 0.4681413471698761, + "learning_rate": 7.556604260308846e-05, + "loss": 1.7598, + "step": 11381 + }, + { + "epoch": 3.4935543278084715, + "grad_norm": 0.5032503604888916, + "learning_rate": 7.556177082786602e-05, + "loss": 1.741, + "step": 11382 + }, + { + "epoch": 3.493861264579497, + 
"grad_norm": 0.2677086889743805, + "learning_rate": 7.555749880002716e-05, + "loss": 1.8528, + "step": 11383 + }, + { + "epoch": 3.4941682013505218, + "grad_norm": 0.43870940804481506, + "learning_rate": 7.555322651961414e-05, + "loss": 1.7632, + "step": 11384 + }, + { + "epoch": 3.494475138121547, + "grad_norm": 0.5403209924697876, + "learning_rate": 7.554895398666914e-05, + "loss": 1.8181, + "step": 11385 + }, + { + "epoch": 3.494782074892572, + "grad_norm": 0.2714318335056305, + "learning_rate": 7.554468120123441e-05, + "loss": 1.8151, + "step": 11386 + }, + { + "epoch": 3.4950890116635973, + "grad_norm": 0.49661698937416077, + "learning_rate": 7.554040816335217e-05, + "loss": 1.8116, + "step": 11387 + }, + { + "epoch": 3.4953959484346226, + "grad_norm": 0.49954715371131897, + "learning_rate": 7.553613487306465e-05, + "loss": 1.8841, + "step": 11388 + }, + { + "epoch": 3.4957028852056475, + "grad_norm": 0.28189441561698914, + "learning_rate": 7.553186133041406e-05, + "loss": 1.7834, + "step": 11389 + }, + { + "epoch": 3.496009821976673, + "grad_norm": 0.36029115319252014, + "learning_rate": 7.552758753544267e-05, + "loss": 1.7796, + "step": 11390 + }, + { + "epoch": 3.4963167587476978, + "grad_norm": 0.45023465156555176, + "learning_rate": 7.552331348819268e-05, + "loss": 1.8773, + "step": 11391 + }, + { + "epoch": 3.496623695518723, + "grad_norm": 0.3235788643360138, + "learning_rate": 7.551903918870636e-05, + "loss": 1.7984, + "step": 11392 + }, + { + "epoch": 3.4969306322897484, + "grad_norm": 0.25656190514564514, + "learning_rate": 7.551476463702596e-05, + "loss": 1.8403, + "step": 11393 + }, + { + "epoch": 3.4972375690607733, + "grad_norm": 0.2866458594799042, + "learning_rate": 7.551048983319366e-05, + "loss": 1.7428, + "step": 11394 + }, + { + "epoch": 3.4975445058317987, + "grad_norm": 0.2713877856731415, + "learning_rate": 7.550621477725177e-05, + "loss": 1.8508, + "step": 11395 + }, + { + "epoch": 3.497851442602824, + "grad_norm": 0.27978867292404175, 
+ "learning_rate": 7.55019394692425e-05, + "loss": 1.8049, + "step": 11396 + }, + { + "epoch": 3.498158379373849, + "grad_norm": 0.3275020122528076, + "learning_rate": 7.549766390920814e-05, + "loss": 1.8553, + "step": 11397 + }, + { + "epoch": 3.498465316144874, + "grad_norm": 0.29947492480278015, + "learning_rate": 7.54933880971909e-05, + "loss": 1.7614, + "step": 11398 + }, + { + "epoch": 3.4987722529158995, + "grad_norm": 0.25790849328041077, + "learning_rate": 7.548911203323308e-05, + "loss": 1.8223, + "step": 11399 + }, + { + "epoch": 3.4990791896869244, + "grad_norm": 0.3145451545715332, + "learning_rate": 7.54848357173769e-05, + "loss": 1.7642, + "step": 11400 + }, + { + "epoch": 3.4993861264579498, + "grad_norm": 0.29052913188934326, + "learning_rate": 7.548055914966463e-05, + "loss": 1.7728, + "step": 11401 + }, + { + "epoch": 3.4996930632289747, + "grad_norm": 0.2741037905216217, + "learning_rate": 7.547628233013854e-05, + "loss": 1.7382, + "step": 11402 + }, + { + "epoch": 3.5, + "grad_norm": 0.2562723755836487, + "learning_rate": 7.54720052588409e-05, + "loss": 1.7455, + "step": 11403 + }, + { + "epoch": 3.5003069367710253, + "grad_norm": 0.27649983763694763, + "learning_rate": 7.546772793581398e-05, + "loss": 1.7194, + "step": 11404 + }, + { + "epoch": 3.5006138735420502, + "grad_norm": 0.27290579676628113, + "learning_rate": 7.546345036110004e-05, + "loss": 1.87, + "step": 11405 + }, + { + "epoch": 3.5009208103130756, + "grad_norm": 0.33585605025291443, + "learning_rate": 7.545917253474136e-05, + "loss": 1.7703, + "step": 11406 + }, + { + "epoch": 3.5012277470841005, + "grad_norm": 0.2592691481113434, + "learning_rate": 7.545489445678022e-05, + "loss": 1.7657, + "step": 11407 + }, + { + "epoch": 3.501534683855126, + "grad_norm": 0.3081367015838623, + "learning_rate": 7.545061612725888e-05, + "loss": 1.8067, + "step": 11408 + }, + { + "epoch": 3.501841620626151, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.544633754621965e-05, + "loss": 
1.8009, + "step": 11409 + }, + { + "epoch": 3.5021485573971765, + "grad_norm": 0.28232479095458984, + "learning_rate": 7.54420587137048e-05, + "loss": 1.8124, + "step": 11410 + }, + { + "epoch": 3.5024554941682013, + "grad_norm": 0.24079222977161407, + "learning_rate": 7.54377796297566e-05, + "loss": 1.789, + "step": 11411 + }, + { + "epoch": 3.5027624309392267, + "grad_norm": 0.27347204089164734, + "learning_rate": 7.543350029441737e-05, + "loss": 1.7704, + "step": 11412 + }, + { + "epoch": 3.5030693677102516, + "grad_norm": 0.25545811653137207, + "learning_rate": 7.542922070772935e-05, + "loss": 1.7871, + "step": 11413 + }, + { + "epoch": 3.503376304481277, + "grad_norm": 0.2507263123989105, + "learning_rate": 7.54249408697349e-05, + "loss": 1.8424, + "step": 11414 + }, + { + "epoch": 3.5036832412523022, + "grad_norm": 0.2776084244251251, + "learning_rate": 7.542066078047627e-05, + "loss": 1.8246, + "step": 11415 + }, + { + "epoch": 3.503990178023327, + "grad_norm": 0.32833749055862427, + "learning_rate": 7.541638043999577e-05, + "loss": 1.7785, + "step": 11416 + }, + { + "epoch": 3.5042971147943525, + "grad_norm": 0.258486270904541, + "learning_rate": 7.541209984833571e-05, + "loss": 1.7543, + "step": 11417 + }, + { + "epoch": 3.5046040515653774, + "grad_norm": 0.25825178623199463, + "learning_rate": 7.540781900553837e-05, + "loss": 1.7939, + "step": 11418 + }, + { + "epoch": 3.5049109883364027, + "grad_norm": 0.26980888843536377, + "learning_rate": 7.540353791164606e-05, + "loss": 1.7777, + "step": 11419 + }, + { + "epoch": 3.505217925107428, + "grad_norm": 0.24103333055973053, + "learning_rate": 7.539925656670111e-05, + "loss": 1.7565, + "step": 11420 + }, + { + "epoch": 3.505524861878453, + "grad_norm": 0.25192007422447205, + "learning_rate": 7.539497497074584e-05, + "loss": 1.7696, + "step": 11421 + }, + { + "epoch": 3.5058317986494782, + "grad_norm": 0.218489870429039, + "learning_rate": 7.539069312382252e-05, + "loss": 1.761, + "step": 11422 + }, + { + 
"epoch": 3.506138735420503, + "grad_norm": 0.27533552050590515, + "learning_rate": 7.53864110259735e-05, + "loss": 1.7374, + "step": 11423 + }, + { + "epoch": 3.5064456721915285, + "grad_norm": 0.2603490650653839, + "learning_rate": 7.538212867724108e-05, + "loss": 1.8342, + "step": 11424 + }, + { + "epoch": 3.506752608962554, + "grad_norm": 0.27340635657310486, + "learning_rate": 7.537784607766758e-05, + "loss": 1.8099, + "step": 11425 + }, + { + "epoch": 3.507059545733579, + "grad_norm": 0.25342679023742676, + "learning_rate": 7.537356322729537e-05, + "loss": 1.7949, + "step": 11426 + }, + { + "epoch": 3.507366482504604, + "grad_norm": 0.292819082736969, + "learning_rate": 7.536928012616669e-05, + "loss": 1.9049, + "step": 11427 + }, + { + "epoch": 3.5076734192756294, + "grad_norm": 0.28256532549858093, + "learning_rate": 7.536499677432393e-05, + "loss": 1.8464, + "step": 11428 + }, + { + "epoch": 3.5079803560466543, + "grad_norm": 0.2672989070415497, + "learning_rate": 7.536071317180942e-05, + "loss": 1.8301, + "step": 11429 + }, + { + "epoch": 3.5082872928176796, + "grad_norm": 0.2525518238544464, + "learning_rate": 7.535642931866546e-05, + "loss": 1.8054, + "step": 11430 + }, + { + "epoch": 3.508594229588705, + "grad_norm": 0.2622447609901428, + "learning_rate": 7.535214521493442e-05, + "loss": 1.8293, + "step": 11431 + }, + { + "epoch": 3.50890116635973, + "grad_norm": 0.27057385444641113, + "learning_rate": 7.534786086065859e-05, + "loss": 1.7426, + "step": 11432 + }, + { + "epoch": 3.509208103130755, + "grad_norm": 0.27363866567611694, + "learning_rate": 7.534357625588038e-05, + "loss": 1.7138, + "step": 11433 + }, + { + "epoch": 3.50951503990178, + "grad_norm": 0.3029060363769531, + "learning_rate": 7.533929140064207e-05, + "loss": 1.864, + "step": 11434 + }, + { + "epoch": 3.5098219766728054, + "grad_norm": 0.3144821524620056, + "learning_rate": 7.533500629498604e-05, + "loss": 1.7846, + "step": 11435 + }, + { + "epoch": 3.5101289134438307, + "grad_norm": 
0.44535213708877563, + "learning_rate": 7.533072093895461e-05, + "loss": 1.799, + "step": 11436 + }, + { + "epoch": 3.5104358502148556, + "grad_norm": 0.25344160199165344, + "learning_rate": 7.532643533259017e-05, + "loss": 1.7391, + "step": 11437 + }, + { + "epoch": 3.510742786985881, + "grad_norm": 0.286026269197464, + "learning_rate": 7.532214947593506e-05, + "loss": 1.8436, + "step": 11438 + }, + { + "epoch": 3.511049723756906, + "grad_norm": 0.3317352533340454, + "learning_rate": 7.53178633690316e-05, + "loss": 1.8507, + "step": 11439 + }, + { + "epoch": 3.511356660527931, + "grad_norm": 0.2547265589237213, + "learning_rate": 7.53135770119222e-05, + "loss": 1.7483, + "step": 11440 + }, + { + "epoch": 3.5116635972989565, + "grad_norm": 0.24281835556030273, + "learning_rate": 7.530929040464917e-05, + "loss": 1.759, + "step": 11441 + }, + { + "epoch": 3.511970534069982, + "grad_norm": 0.2935381829738617, + "learning_rate": 7.530500354725491e-05, + "loss": 1.8235, + "step": 11442 + }, + { + "epoch": 3.5122774708410067, + "grad_norm": 0.26642969250679016, + "learning_rate": 7.53007164397818e-05, + "loss": 1.8324, + "step": 11443 + }, + { + "epoch": 3.512584407612032, + "grad_norm": 0.24830882251262665, + "learning_rate": 7.529642908227215e-05, + "loss": 1.8132, + "step": 11444 + }, + { + "epoch": 3.512891344383057, + "grad_norm": 0.3100191056728363, + "learning_rate": 7.529214147476838e-05, + "loss": 1.8453, + "step": 11445 + }, + { + "epoch": 3.5131982811540823, + "grad_norm": 0.27948811650276184, + "learning_rate": 7.528785361731282e-05, + "loss": 1.7792, + "step": 11446 + }, + { + "epoch": 3.5135052179251076, + "grad_norm": 0.26978832483291626, + "learning_rate": 7.528356550994787e-05, + "loss": 1.7857, + "step": 11447 + }, + { + "epoch": 3.5138121546961325, + "grad_norm": 0.30527836084365845, + "learning_rate": 7.527927715271592e-05, + "loss": 1.807, + "step": 11448 + }, + { + "epoch": 3.514119091467158, + "grad_norm": 0.2915664315223694, + "learning_rate": 
7.527498854565934e-05, + "loss": 1.8414, + "step": 11449 + }, + { + "epoch": 3.5144260282381827, + "grad_norm": 0.2854034900665283, + "learning_rate": 7.52706996888205e-05, + "loss": 1.793, + "step": 11450 + }, + { + "epoch": 3.514732965009208, + "grad_norm": 0.30281978845596313, + "learning_rate": 7.52664105822418e-05, + "loss": 1.7896, + "step": 11451 + }, + { + "epoch": 3.5150399017802334, + "grad_norm": 0.3317166566848755, + "learning_rate": 7.526212122596561e-05, + "loss": 1.7776, + "step": 11452 + }, + { + "epoch": 3.5153468385512583, + "grad_norm": 0.3400021195411682, + "learning_rate": 7.525783162003434e-05, + "loss": 1.8411, + "step": 11453 + }, + { + "epoch": 3.5156537753222836, + "grad_norm": 0.25169485807418823, + "learning_rate": 7.525354176449037e-05, + "loss": 1.7871, + "step": 11454 + }, + { + "epoch": 3.5159607120933085, + "grad_norm": 0.3442455530166626, + "learning_rate": 7.52492516593761e-05, + "loss": 1.7644, + "step": 11455 + }, + { + "epoch": 3.516267648864334, + "grad_norm": 0.35644033551216125, + "learning_rate": 7.524496130473394e-05, + "loss": 1.801, + "step": 11456 + }, + { + "epoch": 3.516574585635359, + "grad_norm": 0.3180185854434967, + "learning_rate": 7.524067070060625e-05, + "loss": 1.7897, + "step": 11457 + }, + { + "epoch": 3.5168815224063845, + "grad_norm": 0.2417978048324585, + "learning_rate": 7.523637984703548e-05, + "loss": 1.8527, + "step": 11458 + }, + { + "epoch": 3.5171884591774094, + "grad_norm": 0.29661375284194946, + "learning_rate": 7.5232088744064e-05, + "loss": 1.8276, + "step": 11459 + }, + { + "epoch": 3.5174953959484347, + "grad_norm": 0.2467545121908188, + "learning_rate": 7.522779739173424e-05, + "loss": 1.7819, + "step": 11460 + }, + { + "epoch": 3.5178023327194596, + "grad_norm": 0.26177898049354553, + "learning_rate": 7.522350579008859e-05, + "loss": 1.8017, + "step": 11461 + }, + { + "epoch": 3.518109269490485, + "grad_norm": 0.28740498423576355, + "learning_rate": 7.521921393916948e-05, + "loss": 1.7863, 
+ "step": 11462 + }, + { + "epoch": 3.5184162062615103, + "grad_norm": 0.28685200214385986, + "learning_rate": 7.521492183901932e-05, + "loss": 1.8069, + "step": 11463 + }, + { + "epoch": 3.518723143032535, + "grad_norm": 0.24174338579177856, + "learning_rate": 7.521062948968051e-05, + "loss": 1.7523, + "step": 11464 + }, + { + "epoch": 3.5190300798035605, + "grad_norm": 0.23273243010044098, + "learning_rate": 7.520633689119548e-05, + "loss": 1.7827, + "step": 11465 + }, + { + "epoch": 3.5193370165745854, + "grad_norm": 0.22708217799663544, + "learning_rate": 7.520204404360667e-05, + "loss": 1.7377, + "step": 11466 + }, + { + "epoch": 3.5196439533456108, + "grad_norm": 0.24725353717803955, + "learning_rate": 7.519775094695649e-05, + "loss": 1.7828, + "step": 11467 + }, + { + "epoch": 3.519950890116636, + "grad_norm": 0.23046265542507172, + "learning_rate": 7.519345760128736e-05, + "loss": 1.7427, + "step": 11468 + }, + { + "epoch": 3.520257826887661, + "grad_norm": 0.2618728280067444, + "learning_rate": 7.518916400664171e-05, + "loss": 1.8133, + "step": 11469 + }, + { + "epoch": 3.5205647636586863, + "grad_norm": 0.23232363164424896, + "learning_rate": 7.5184870163062e-05, + "loss": 1.7468, + "step": 11470 + }, + { + "epoch": 3.520871700429711, + "grad_norm": 0.21993626654148102, + "learning_rate": 7.51805760705906e-05, + "loss": 1.7565, + "step": 11471 + }, + { + "epoch": 3.5211786372007365, + "grad_norm": 0.23563124239444733, + "learning_rate": 7.517628172927001e-05, + "loss": 1.7795, + "step": 11472 + }, + { + "epoch": 3.521485573971762, + "grad_norm": 0.24502862989902496, + "learning_rate": 7.517198713914266e-05, + "loss": 1.813, + "step": 11473 + }, + { + "epoch": 3.521792510742787, + "grad_norm": 0.24745969474315643, + "learning_rate": 7.516769230025097e-05, + "loss": 1.7601, + "step": 11474 + }, + { + "epoch": 3.522099447513812, + "grad_norm": 0.27686986327171326, + "learning_rate": 7.516339721263739e-05, + "loss": 1.8121, + "step": 11475 + }, + { + "epoch": 
3.5224063842848374, + "grad_norm": 0.3110332787036896, + "learning_rate": 7.515910187634439e-05, + "loss": 1.7978, + "step": 11476 + }, + { + "epoch": 3.5227133210558623, + "grad_norm": 0.3394792377948761, + "learning_rate": 7.515480629141436e-05, + "loss": 1.8427, + "step": 11477 + }, + { + "epoch": 3.5230202578268877, + "grad_norm": 0.2802537679672241, + "learning_rate": 7.515051045788984e-05, + "loss": 1.7343, + "step": 11478 + }, + { + "epoch": 3.523327194597913, + "grad_norm": 0.23687711358070374, + "learning_rate": 7.514621437581319e-05, + "loss": 1.7786, + "step": 11479 + }, + { + "epoch": 3.523634131368938, + "grad_norm": 0.31114310026168823, + "learning_rate": 7.514191804522693e-05, + "loss": 1.8137, + "step": 11480 + }, + { + "epoch": 3.523941068139963, + "grad_norm": 0.3257891833782196, + "learning_rate": 7.513762146617351e-05, + "loss": 1.8015, + "step": 11481 + }, + { + "epoch": 3.524248004910988, + "grad_norm": 0.24353443086147308, + "learning_rate": 7.513332463869536e-05, + "loss": 1.7485, + "step": 11482 + }, + { + "epoch": 3.5245549416820134, + "grad_norm": 0.29861485958099365, + "learning_rate": 7.512902756283498e-05, + "loss": 1.7993, + "step": 11483 + }, + { + "epoch": 3.5248618784530388, + "grad_norm": 0.40380924940109253, + "learning_rate": 7.51247302386348e-05, + "loss": 1.7664, + "step": 11484 + }, + { + "epoch": 3.525168815224064, + "grad_norm": 0.3365862965583801, + "learning_rate": 7.512043266613733e-05, + "loss": 1.7512, + "step": 11485 + }, + { + "epoch": 3.525475751995089, + "grad_norm": 0.2502824068069458, + "learning_rate": 7.511613484538502e-05, + "loss": 1.8414, + "step": 11486 + }, + { + "epoch": 3.5257826887661143, + "grad_norm": 0.2598603069782257, + "learning_rate": 7.511183677642034e-05, + "loss": 1.7358, + "step": 11487 + }, + { + "epoch": 3.5260896255371392, + "grad_norm": 0.30246880650520325, + "learning_rate": 7.510753845928576e-05, + "loss": 1.791, + "step": 11488 + }, + { + "epoch": 3.5263965623081646, + "grad_norm": 
0.25170832872390747, + "learning_rate": 7.510323989402378e-05, + "loss": 1.7498, + "step": 11489 + }, + { + "epoch": 3.52670349907919, + "grad_norm": 0.2925282418727875, + "learning_rate": 7.509894108067688e-05, + "loss": 1.8413, + "step": 11490 + }, + { + "epoch": 3.527010435850215, + "grad_norm": 0.2643601596355438, + "learning_rate": 7.509464201928752e-05, + "loss": 1.8052, + "step": 11491 + }, + { + "epoch": 3.52731737262124, + "grad_norm": 0.2938917279243469, + "learning_rate": 7.50903427098982e-05, + "loss": 1.7308, + "step": 11492 + }, + { + "epoch": 3.527624309392265, + "grad_norm": 0.2978343367576599, + "learning_rate": 7.508604315255142e-05, + "loss": 1.8147, + "step": 11493 + }, + { + "epoch": 3.5279312461632903, + "grad_norm": 0.2507816255092621, + "learning_rate": 7.508174334728963e-05, + "loss": 1.774, + "step": 11494 + }, + { + "epoch": 3.5282381829343157, + "grad_norm": 0.32971861958503723, + "learning_rate": 7.507744329415538e-05, + "loss": 1.7634, + "step": 11495 + }, + { + "epoch": 3.5285451197053406, + "grad_norm": 0.3149639964103699, + "learning_rate": 7.507314299319113e-05, + "loss": 1.8032, + "step": 11496 + }, + { + "epoch": 3.528852056476366, + "grad_norm": 0.2721364498138428, + "learning_rate": 7.506884244443937e-05, + "loss": 1.7702, + "step": 11497 + }, + { + "epoch": 3.529158993247391, + "grad_norm": 0.29375985264778137, + "learning_rate": 7.506454164794263e-05, + "loss": 1.8673, + "step": 11498 + }, + { + "epoch": 3.529465930018416, + "grad_norm": 0.379944384098053, + "learning_rate": 7.50602406037434e-05, + "loss": 1.883, + "step": 11499 + }, + { + "epoch": 3.5297728667894415, + "grad_norm": 0.4041840136051178, + "learning_rate": 7.505593931188417e-05, + "loss": 1.7998, + "step": 11500 + }, + { + "epoch": 3.530079803560467, + "grad_norm": 0.30013784766197205, + "learning_rate": 7.505163777240747e-05, + "loss": 1.775, + "step": 11501 + }, + { + "epoch": 3.5303867403314917, + "grad_norm": 0.25161153078079224, + "learning_rate": 
7.50473359853558e-05, + "loss": 1.8609, + "step": 11502 + }, + { + "epoch": 3.530693677102517, + "grad_norm": 0.2803831100463867, + "learning_rate": 7.504303395077168e-05, + "loss": 1.8397, + "step": 11503 + }, + { + "epoch": 3.531000613873542, + "grad_norm": 0.26678118109703064, + "learning_rate": 7.503873166869762e-05, + "loss": 1.7877, + "step": 11504 + }, + { + "epoch": 3.5313075506445673, + "grad_norm": 0.24280449748039246, + "learning_rate": 7.503442913917613e-05, + "loss": 1.7891, + "step": 11505 + }, + { + "epoch": 3.5316144874155926, + "grad_norm": 0.26461485028266907, + "learning_rate": 7.503012636224976e-05, + "loss": 1.7993, + "step": 11506 + }, + { + "epoch": 3.5319214241866175, + "grad_norm": 0.27001824975013733, + "learning_rate": 7.502582333796098e-05, + "loss": 1.7719, + "step": 11507 + }, + { + "epoch": 3.532228360957643, + "grad_norm": 0.27585846185684204, + "learning_rate": 7.502152006635237e-05, + "loss": 1.7412, + "step": 11508 + }, + { + "epoch": 3.5325352977286677, + "grad_norm": 0.24896648526191711, + "learning_rate": 7.501721654746643e-05, + "loss": 1.7459, + "step": 11509 + }, + { + "epoch": 3.532842234499693, + "grad_norm": 0.2308502197265625, + "learning_rate": 7.501291278134569e-05, + "loss": 1.7717, + "step": 11510 + }, + { + "epoch": 3.5331491712707184, + "grad_norm": 0.3026069104671478, + "learning_rate": 7.500860876803267e-05, + "loss": 1.8578, + "step": 11511 + }, + { + "epoch": 3.5334561080417433, + "grad_norm": 0.30242082476615906, + "learning_rate": 7.500430450756995e-05, + "loss": 1.7793, + "step": 11512 + }, + { + "epoch": 3.5337630448127686, + "grad_norm": 0.2583339214324951, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8388, + "step": 11513 + }, + { + "epoch": 3.5340699815837935, + "grad_norm": 0.29673871397972107, + "learning_rate": 7.499569524536542e-05, + "loss": 1.7749, + "step": 11514 + }, + { + "epoch": 3.534376918354819, + "grad_norm": 0.35199788212776184, + "learning_rate": 7.499139024370874e-05, + "loss": 
1.7863, + "step": 11515 + }, + { + "epoch": 3.534683855125844, + "grad_norm": 0.25776436924934387, + "learning_rate": 7.498708499507247e-05, + "loss": 1.7568, + "step": 11516 + }, + { + "epoch": 3.5349907918968695, + "grad_norm": 0.26081520318984985, + "learning_rate": 7.498277949949919e-05, + "loss": 1.807, + "step": 11517 + }, + { + "epoch": 3.5352977286678944, + "grad_norm": 0.29247912764549255, + "learning_rate": 7.497847375703145e-05, + "loss": 1.7568, + "step": 11518 + }, + { + "epoch": 3.5356046654389197, + "grad_norm": 0.20964498817920685, + "learning_rate": 7.497416776771178e-05, + "loss": 1.7601, + "step": 11519 + }, + { + "epoch": 3.5359116022099446, + "grad_norm": 0.28739818930625916, + "learning_rate": 7.496986153158273e-05, + "loss": 1.7915, + "step": 11520 + }, + { + "epoch": 3.53621853898097, + "grad_norm": 0.3109932839870453, + "learning_rate": 7.496555504868691e-05, + "loss": 1.8046, + "step": 11521 + }, + { + "epoch": 3.5365254757519953, + "grad_norm": 0.259284108877182, + "learning_rate": 7.496124831906681e-05, + "loss": 1.7595, + "step": 11522 + }, + { + "epoch": 3.53683241252302, + "grad_norm": 0.265909343957901, + "learning_rate": 7.495694134276504e-05, + "loss": 1.8249, + "step": 11523 + }, + { + "epoch": 3.5371393492940455, + "grad_norm": 0.2478799819946289, + "learning_rate": 7.495263411982415e-05, + "loss": 1.8531, + "step": 11524 + }, + { + "epoch": 3.5374462860650704, + "grad_norm": 0.2636432945728302, + "learning_rate": 7.494832665028671e-05, + "loss": 1.8114, + "step": 11525 + }, + { + "epoch": 3.5377532228360957, + "grad_norm": 0.25323864817619324, + "learning_rate": 7.494401893419527e-05, + "loss": 1.8271, + "step": 11526 + }, + { + "epoch": 3.538060159607121, + "grad_norm": 0.2352467179298401, + "learning_rate": 7.493971097159241e-05, + "loss": 1.7524, + "step": 11527 + }, + { + "epoch": 3.538367096378146, + "grad_norm": 0.2788623869419098, + "learning_rate": 7.493540276252072e-05, + "loss": 1.8238, + "step": 11528 + }, + { + 
"epoch": 3.5386740331491713, + "grad_norm": 0.3506326377391815, + "learning_rate": 7.493109430702277e-05, + "loss": 1.8525, + "step": 11529 + }, + { + "epoch": 3.538980969920196, + "grad_norm": 0.3685263395309448, + "learning_rate": 7.492678560514113e-05, + "loss": 1.8497, + "step": 11530 + }, + { + "epoch": 3.5392879066912215, + "grad_norm": 0.32200056314468384, + "learning_rate": 7.492247665691837e-05, + "loss": 1.7587, + "step": 11531 + }, + { + "epoch": 3.539594843462247, + "grad_norm": 0.2800062894821167, + "learning_rate": 7.49181674623971e-05, + "loss": 1.8188, + "step": 11532 + }, + { + "epoch": 3.539901780233272, + "grad_norm": 0.24137580394744873, + "learning_rate": 7.491385802161989e-05, + "loss": 1.7947, + "step": 11533 + }, + { + "epoch": 3.540208717004297, + "grad_norm": 0.21900027990341187, + "learning_rate": 7.490954833462933e-05, + "loss": 1.7722, + "step": 11534 + }, + { + "epoch": 3.5405156537753224, + "grad_norm": 0.25009945034980774, + "learning_rate": 7.490523840146803e-05, + "loss": 1.8173, + "step": 11535 + }, + { + "epoch": 3.5408225905463473, + "grad_norm": 0.2778431475162506, + "learning_rate": 7.490092822217855e-05, + "loss": 1.8368, + "step": 11536 + }, + { + "epoch": 3.5411295273173726, + "grad_norm": 0.2845982611179352, + "learning_rate": 7.48966177968035e-05, + "loss": 1.7539, + "step": 11537 + }, + { + "epoch": 3.541436464088398, + "grad_norm": 0.27480921149253845, + "learning_rate": 7.48923071253855e-05, + "loss": 1.8494, + "step": 11538 + }, + { + "epoch": 3.541743400859423, + "grad_norm": 0.2722087502479553, + "learning_rate": 7.488799620796711e-05, + "loss": 1.8422, + "step": 11539 + }, + { + "epoch": 3.542050337630448, + "grad_norm": 0.2984340190887451, + "learning_rate": 7.488368504459097e-05, + "loss": 1.8042, + "step": 11540 + }, + { + "epoch": 3.542357274401473, + "grad_norm": 0.2405850738286972, + "learning_rate": 7.487937363529966e-05, + "loss": 1.749, + "step": 11541 + }, + { + "epoch": 3.5426642111724984, + "grad_norm": 
0.24816973507404327, + "learning_rate": 7.487506198013579e-05, + "loss": 1.8671, + "step": 11542 + }, + { + "epoch": 3.5429711479435237, + "grad_norm": 0.2796473503112793, + "learning_rate": 7.487075007914199e-05, + "loss": 1.8023, + "step": 11543 + }, + { + "epoch": 3.5432780847145486, + "grad_norm": 0.2600162625312805, + "learning_rate": 7.486643793236086e-05, + "loss": 1.7997, + "step": 11544 + }, + { + "epoch": 3.543585021485574, + "grad_norm": 0.2746226489543915, + "learning_rate": 7.486212553983503e-05, + "loss": 1.7773, + "step": 11545 + }, + { + "epoch": 3.5438919582565993, + "grad_norm": 0.24142079055309296, + "learning_rate": 7.485781290160708e-05, + "loss": 1.791, + "step": 11546 + }, + { + "epoch": 3.544198895027624, + "grad_norm": 0.2472934126853943, + "learning_rate": 7.485350001771966e-05, + "loss": 1.8183, + "step": 11547 + }, + { + "epoch": 3.5445058317986495, + "grad_norm": 0.26891404390335083, + "learning_rate": 7.48491868882154e-05, + "loss": 1.7421, + "step": 11548 + }, + { + "epoch": 3.544812768569675, + "grad_norm": 0.24820464849472046, + "learning_rate": 7.48448735131369e-05, + "loss": 1.7372, + "step": 11549 + }, + { + "epoch": 3.5451197053406998, + "grad_norm": 0.2456594705581665, + "learning_rate": 7.484055989252679e-05, + "loss": 1.7883, + "step": 11550 + }, + { + "epoch": 3.545426642111725, + "grad_norm": 0.32420551776885986, + "learning_rate": 7.48362460264277e-05, + "loss": 1.8363, + "step": 11551 + }, + { + "epoch": 3.54573357888275, + "grad_norm": 0.3187662661075592, + "learning_rate": 7.483193191488229e-05, + "loss": 1.7957, + "step": 11552 + }, + { + "epoch": 3.5460405156537753, + "grad_norm": 0.2845410108566284, + "learning_rate": 7.482761755793316e-05, + "loss": 1.8288, + "step": 11553 + }, + { + "epoch": 3.5463474524248007, + "grad_norm": 0.2816021740436554, + "learning_rate": 7.482330295562298e-05, + "loss": 1.7562, + "step": 11554 + }, + { + "epoch": 3.5466543891958255, + "grad_norm": 0.28938058018684387, + "learning_rate": 
7.481898810799435e-05, + "loss": 1.8139, + "step": 11555 + }, + { + "epoch": 3.546961325966851, + "grad_norm": 0.3305707573890686, + "learning_rate": 7.481467301508995e-05, + "loss": 1.8956, + "step": 11556 + }, + { + "epoch": 3.5472682627378758, + "grad_norm": 0.3890376091003418, + "learning_rate": 7.48103576769524e-05, + "loss": 1.8552, + "step": 11557 + }, + { + "epoch": 3.547575199508901, + "grad_norm": 0.3900652825832367, + "learning_rate": 7.480604209362434e-05, + "loss": 1.7748, + "step": 11558 + }, + { + "epoch": 3.5478821362799264, + "grad_norm": 0.3297326862812042, + "learning_rate": 7.480172626514845e-05, + "loss": 1.8201, + "step": 11559 + }, + { + "epoch": 3.5481890730509518, + "grad_norm": 0.28797218203544617, + "learning_rate": 7.479741019156737e-05, + "loss": 1.7652, + "step": 11560 + }, + { + "epoch": 3.5484960098219767, + "grad_norm": 0.2764691114425659, + "learning_rate": 7.479309387292373e-05, + "loss": 1.7534, + "step": 11561 + }, + { + "epoch": 3.548802946593002, + "grad_norm": 0.25067585706710815, + "learning_rate": 7.47887773092602e-05, + "loss": 1.7849, + "step": 11562 + }, + { + "epoch": 3.549109883364027, + "grad_norm": 0.29966798424720764, + "learning_rate": 7.478446050061947e-05, + "loss": 1.8299, + "step": 11563 + }, + { + "epoch": 3.549416820135052, + "grad_norm": 0.24068406224250793, + "learning_rate": 7.478014344704416e-05, + "loss": 1.8366, + "step": 11564 + }, + { + "epoch": 3.5497237569060776, + "grad_norm": 0.2559303641319275, + "learning_rate": 7.477582614857695e-05, + "loss": 1.7665, + "step": 11565 + }, + { + "epoch": 3.5500306936771024, + "grad_norm": 0.24617858231067657, + "learning_rate": 7.47715086052605e-05, + "loss": 1.8334, + "step": 11566 + }, + { + "epoch": 3.550337630448128, + "grad_norm": 0.2433501034975052, + "learning_rate": 7.476719081713749e-05, + "loss": 1.7963, + "step": 11567 + }, + { + "epoch": 3.5506445672191527, + "grad_norm": 0.2583518326282501, + "learning_rate": 7.476287278425057e-05, + "loss": 1.8311, 
+ "step": 11568 + }, + { + "epoch": 3.550951503990178, + "grad_norm": 0.3232485055923462, + "learning_rate": 7.475855450664244e-05, + "loss": 1.9162, + "step": 11569 + }, + { + "epoch": 3.5512584407612033, + "grad_norm": 0.28247153759002686, + "learning_rate": 7.475423598435576e-05, + "loss": 1.8027, + "step": 11570 + }, + { + "epoch": 3.5515653775322282, + "grad_norm": 0.27201834321022034, + "learning_rate": 7.47499172174332e-05, + "loss": 1.7822, + "step": 11571 + }, + { + "epoch": 3.5518723143032536, + "grad_norm": 0.2408471554517746, + "learning_rate": 7.474559820591748e-05, + "loss": 1.7735, + "step": 11572 + }, + { + "epoch": 3.5521792510742785, + "grad_norm": 0.24187393486499786, + "learning_rate": 7.474127894985124e-05, + "loss": 1.7931, + "step": 11573 + }, + { + "epoch": 3.552486187845304, + "grad_norm": 0.2759699523448944, + "learning_rate": 7.473695944927717e-05, + "loss": 1.8407, + "step": 11574 + }, + { + "epoch": 3.552793124616329, + "grad_norm": 0.2503111958503723, + "learning_rate": 7.473263970423797e-05, + "loss": 1.7613, + "step": 11575 + }, + { + "epoch": 3.5531000613873545, + "grad_norm": 0.24795177578926086, + "learning_rate": 7.472831971477633e-05, + "loss": 1.8221, + "step": 11576 + }, + { + "epoch": 3.5534069981583793, + "grad_norm": 0.23190177977085114, + "learning_rate": 7.472399948093494e-05, + "loss": 1.7541, + "step": 11577 + }, + { + "epoch": 3.5537139349294047, + "grad_norm": 0.24650825560092926, + "learning_rate": 7.471967900275653e-05, + "loss": 1.8002, + "step": 11578 + }, + { + "epoch": 3.5540208717004296, + "grad_norm": 0.256598562002182, + "learning_rate": 7.471535828028372e-05, + "loss": 1.7052, + "step": 11579 + }, + { + "epoch": 3.554327808471455, + "grad_norm": 0.2715381681919098, + "learning_rate": 7.471103731355926e-05, + "loss": 1.7701, + "step": 11580 + }, + { + "epoch": 3.5546347452424802, + "grad_norm": 0.29806044697761536, + "learning_rate": 7.470671610262586e-05, + "loss": 1.7614, + "step": 11581 + }, + { + "epoch": 
3.554941682013505, + "grad_norm": 0.26364314556121826, + "learning_rate": 7.470239464752621e-05, + "loss": 1.7957, + "step": 11582 + }, + { + "epoch": 3.5552486187845305, + "grad_norm": 0.29270800948143005, + "learning_rate": 7.4698072948303e-05, + "loss": 1.8263, + "step": 11583 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.25941839814186096, + "learning_rate": 7.469375100499898e-05, + "loss": 1.8517, + "step": 11584 + }, + { + "epoch": 3.5558624923265807, + "grad_norm": 0.29509237408638, + "learning_rate": 7.468942881765681e-05, + "loss": 1.8643, + "step": 11585 + }, + { + "epoch": 3.556169429097606, + "grad_norm": 0.23090367019176483, + "learning_rate": 7.468510638631926e-05, + "loss": 1.7239, + "step": 11586 + }, + { + "epoch": 3.556476365868631, + "grad_norm": 0.2696724236011505, + "learning_rate": 7.468078371102901e-05, + "loss": 1.848, + "step": 11587 + }, + { + "epoch": 3.5567833026396563, + "grad_norm": 0.2691192626953125, + "learning_rate": 7.46764607918288e-05, + "loss": 1.8194, + "step": 11588 + }, + { + "epoch": 3.557090239410681, + "grad_norm": 0.26616501808166504, + "learning_rate": 7.467213762876131e-05, + "loss": 1.8382, + "step": 11589 + }, + { + "epoch": 3.5573971761817065, + "grad_norm": 0.30629831552505493, + "learning_rate": 7.466781422186933e-05, + "loss": 1.8417, + "step": 11590 + }, + { + "epoch": 3.557704112952732, + "grad_norm": 0.27212417125701904, + "learning_rate": 7.466349057119552e-05, + "loss": 1.7612, + "step": 11591 + }, + { + "epoch": 3.558011049723757, + "grad_norm": 0.2872084379196167, + "learning_rate": 7.465916667678266e-05, + "loss": 1.7998, + "step": 11592 + }, + { + "epoch": 3.558317986494782, + "grad_norm": 0.3017117977142334, + "learning_rate": 7.465484253867348e-05, + "loss": 1.7996, + "step": 11593 + }, + { + "epoch": 3.5586249232658074, + "grad_norm": 0.2707957327365875, + "learning_rate": 7.465051815691066e-05, + "loss": 1.7678, + "step": 11594 + }, + { + "epoch": 3.5589318600368323, + "grad_norm": 
0.28932711482048035, + "learning_rate": 7.464619353153702e-05, + "loss": 1.8576, + "step": 11595 + }, + { + "epoch": 3.5592387968078576, + "grad_norm": 0.2585125267505646, + "learning_rate": 7.464186866259519e-05, + "loss": 1.8678, + "step": 11596 + }, + { + "epoch": 3.559545733578883, + "grad_norm": 0.24386851489543915, + "learning_rate": 7.4637543550128e-05, + "loss": 1.7778, + "step": 11597 + }, + { + "epoch": 3.559852670349908, + "grad_norm": 0.2375860959291458, + "learning_rate": 7.463321819417817e-05, + "loss": 1.8096, + "step": 11598 + }, + { + "epoch": 3.560159607120933, + "grad_norm": 0.2341299206018448, + "learning_rate": 7.462889259478842e-05, + "loss": 1.7191, + "step": 11599 + }, + { + "epoch": 3.560466543891958, + "grad_norm": 0.2510595917701721, + "learning_rate": 7.462456675200154e-05, + "loss": 1.7763, + "step": 11600 + }, + { + "epoch": 3.5607734806629834, + "grad_norm": 0.2554674744606018, + "learning_rate": 7.462024066586025e-05, + "loss": 1.7578, + "step": 11601 + }, + { + "epoch": 3.5610804174340087, + "grad_norm": 0.25040730834007263, + "learning_rate": 7.46159143364073e-05, + "loss": 1.8194, + "step": 11602 + }, + { + "epoch": 3.5613873542050336, + "grad_norm": 0.24294932186603546, + "learning_rate": 7.461158776368547e-05, + "loss": 1.8063, + "step": 11603 + }, + { + "epoch": 3.561694290976059, + "grad_norm": 0.2388325333595276, + "learning_rate": 7.46072609477375e-05, + "loss": 1.7942, + "step": 11604 + }, + { + "epoch": 3.562001227747084, + "grad_norm": 0.2569502890110016, + "learning_rate": 7.460293388860615e-05, + "loss": 1.7824, + "step": 11605 + }, + { + "epoch": 3.562308164518109, + "grad_norm": 0.24004346132278442, + "learning_rate": 7.45986065863342e-05, + "loss": 1.8676, + "step": 11606 + }, + { + "epoch": 3.5626151012891345, + "grad_norm": 0.25446319580078125, + "learning_rate": 7.45942790409644e-05, + "loss": 1.7726, + "step": 11607 + }, + { + "epoch": 3.56292203806016, + "grad_norm": 0.26257482171058655, + "learning_rate": 
7.458995125253951e-05, + "loss": 1.779, + "step": 11608 + }, + { + "epoch": 3.5632289748311847, + "grad_norm": 0.27703070640563965, + "learning_rate": 7.458562322110231e-05, + "loss": 1.8247, + "step": 11609 + }, + { + "epoch": 3.56353591160221, + "grad_norm": 0.25478535890579224, + "learning_rate": 7.458129494669556e-05, + "loss": 1.7794, + "step": 11610 + }, + { + "epoch": 3.563842848373235, + "grad_norm": 0.26173365116119385, + "learning_rate": 7.457696642936207e-05, + "loss": 1.758, + "step": 11611 + }, + { + "epoch": 3.5641497851442603, + "grad_norm": 0.25077274441719055, + "learning_rate": 7.45726376691446e-05, + "loss": 1.8234, + "step": 11612 + }, + { + "epoch": 3.5644567219152856, + "grad_norm": 0.2591109275817871, + "learning_rate": 7.456830866608589e-05, + "loss": 1.7723, + "step": 11613 + }, + { + "epoch": 3.5647636586863105, + "grad_norm": 0.2653447091579437, + "learning_rate": 7.456397942022877e-05, + "loss": 1.7839, + "step": 11614 + }, + { + "epoch": 3.565070595457336, + "grad_norm": 0.3203454911708832, + "learning_rate": 7.455964993161601e-05, + "loss": 1.8548, + "step": 11615 + }, + { + "epoch": 3.5653775322283607, + "grad_norm": 0.3041793704032898, + "learning_rate": 7.455532020029039e-05, + "loss": 1.7925, + "step": 11616 + }, + { + "epoch": 3.565684468999386, + "grad_norm": 0.26066139340400696, + "learning_rate": 7.45509902262947e-05, + "loss": 1.7905, + "step": 11617 + }, + { + "epoch": 3.5659914057704114, + "grad_norm": 0.2483314871788025, + "learning_rate": 7.454666000967174e-05, + "loss": 1.7658, + "step": 11618 + }, + { + "epoch": 3.5662983425414367, + "grad_norm": 0.24285900592803955, + "learning_rate": 7.45423295504643e-05, + "loss": 1.7575, + "step": 11619 + }, + { + "epoch": 3.5666052793124616, + "grad_norm": 0.27231669425964355, + "learning_rate": 7.453799884871517e-05, + "loss": 1.8389, + "step": 11620 + }, + { + "epoch": 3.566912216083487, + "grad_norm": 0.24324406683444977, + "learning_rate": 7.453366790446717e-05, + "loss": 
1.7775, + "step": 11621 + }, + { + "epoch": 3.567219152854512, + "grad_norm": 0.2724440097808838, + "learning_rate": 7.452933671776305e-05, + "loss": 1.8135, + "step": 11622 + }, + { + "epoch": 3.567526089625537, + "grad_norm": 0.22207655012607574, + "learning_rate": 7.452500528864568e-05, + "loss": 1.722, + "step": 11623 + }, + { + "epoch": 3.5678330263965625, + "grad_norm": 0.25650298595428467, + "learning_rate": 7.452067361715782e-05, + "loss": 1.7813, + "step": 11624 + }, + { + "epoch": 3.5681399631675874, + "grad_norm": 0.2582200765609741, + "learning_rate": 7.45163417033423e-05, + "loss": 1.8253, + "step": 11625 + }, + { + "epoch": 3.5684468999386127, + "grad_norm": 0.29545384645462036, + "learning_rate": 7.451200954724188e-05, + "loss": 1.8108, + "step": 11626 + }, + { + "epoch": 3.5687538367096376, + "grad_norm": 0.30457428097724915, + "learning_rate": 7.450767714889946e-05, + "loss": 1.8257, + "step": 11627 + }, + { + "epoch": 3.569060773480663, + "grad_norm": 0.2955166697502136, + "learning_rate": 7.450334450835781e-05, + "loss": 1.8172, + "step": 11628 + }, + { + "epoch": 3.5693677102516883, + "grad_norm": 0.2793857753276825, + "learning_rate": 7.449901162565974e-05, + "loss": 1.8493, + "step": 11629 + }, + { + "epoch": 3.569674647022713, + "grad_norm": 0.27154335379600525, + "learning_rate": 7.449467850084808e-05, + "loss": 1.8306, + "step": 11630 + }, + { + "epoch": 3.5699815837937385, + "grad_norm": 0.22336189448833466, + "learning_rate": 7.449034513396564e-05, + "loss": 1.7435, + "step": 11631 + }, + { + "epoch": 3.5702885205647634, + "grad_norm": 0.22799183428287506, + "learning_rate": 7.448601152505526e-05, + "loss": 1.7818, + "step": 11632 + }, + { + "epoch": 3.5705954573357888, + "grad_norm": 0.26670658588409424, + "learning_rate": 7.448167767415976e-05, + "loss": 1.7777, + "step": 11633 + }, + { + "epoch": 3.570902394106814, + "grad_norm": 0.2848666310310364, + "learning_rate": 7.447734358132196e-05, + "loss": 1.7572, + "step": 11634 + }, + { + 
"epoch": 3.5712093308778394, + "grad_norm": 0.26843544840812683, + "learning_rate": 7.447300924658473e-05, + "loss": 1.7642, + "step": 11635 + }, + { + "epoch": 3.5715162676488643, + "grad_norm": 0.24666404724121094, + "learning_rate": 7.446867466999087e-05, + "loss": 1.7533, + "step": 11636 + }, + { + "epoch": 3.5718232044198897, + "grad_norm": 0.31111210584640503, + "learning_rate": 7.44643398515832e-05, + "loss": 1.7875, + "step": 11637 + }, + { + "epoch": 3.5721301411909145, + "grad_norm": 0.3157108724117279, + "learning_rate": 7.446000479140462e-05, + "loss": 1.7879, + "step": 11638 + }, + { + "epoch": 3.57243707796194, + "grad_norm": 0.2935558259487152, + "learning_rate": 7.445566948949792e-05, + "loss": 1.7819, + "step": 11639 + }, + { + "epoch": 3.572744014732965, + "grad_norm": 0.2265472710132599, + "learning_rate": 7.445133394590597e-05, + "loss": 1.7518, + "step": 11640 + }, + { + "epoch": 3.57305095150399, + "grad_norm": 0.2564176023006439, + "learning_rate": 7.444699816067159e-05, + "loss": 1.7281, + "step": 11641 + }, + { + "epoch": 3.5733578882750154, + "grad_norm": 0.27933555841445923, + "learning_rate": 7.444266213383766e-05, + "loss": 1.7852, + "step": 11642 + }, + { + "epoch": 3.5736648250460403, + "grad_norm": 0.29105356335639954, + "learning_rate": 7.4438325865447e-05, + "loss": 1.8056, + "step": 11643 + }, + { + "epoch": 3.5739717618170657, + "grad_norm": 0.27665549516677856, + "learning_rate": 7.443398935554249e-05, + "loss": 1.7249, + "step": 11644 + }, + { + "epoch": 3.574278698588091, + "grad_norm": 0.21899232268333435, + "learning_rate": 7.442965260416698e-05, + "loss": 1.7689, + "step": 11645 + }, + { + "epoch": 3.574585635359116, + "grad_norm": 0.3250672221183777, + "learning_rate": 7.442531561136333e-05, + "loss": 1.8058, + "step": 11646 + }, + { + "epoch": 3.574892572130141, + "grad_norm": 0.42442524433135986, + "learning_rate": 7.442097837717438e-05, + "loss": 1.7887, + "step": 11647 + }, + { + "epoch": 3.575199508901166, + 
"grad_norm": 0.33108964562416077, + "learning_rate": 7.441664090164302e-05, + "loss": 1.7628, + "step": 11648 + }, + { + "epoch": 3.5755064456721914, + "grad_norm": 0.23050357401371002, + "learning_rate": 7.44123031848121e-05, + "loss": 1.8121, + "step": 11649 + }, + { + "epoch": 3.575813382443217, + "grad_norm": 0.29251593351364136, + "learning_rate": 7.440796522672448e-05, + "loss": 1.8051, + "step": 11650 + }, + { + "epoch": 3.576120319214242, + "grad_norm": 0.3764750063419342, + "learning_rate": 7.440362702742305e-05, + "loss": 1.9002, + "step": 11651 + }, + { + "epoch": 3.576427255985267, + "grad_norm": 0.3751949071884155, + "learning_rate": 7.439928858695069e-05, + "loss": 1.821, + "step": 11652 + }, + { + "epoch": 3.5767341927562923, + "grad_norm": 0.268476665019989, + "learning_rate": 7.439494990535024e-05, + "loss": 1.8241, + "step": 11653 + }, + { + "epoch": 3.5770411295273172, + "grad_norm": 0.3072795271873474, + "learning_rate": 7.439061098266459e-05, + "loss": 1.8169, + "step": 11654 + }, + { + "epoch": 3.5773480662983426, + "grad_norm": 0.4948901832103729, + "learning_rate": 7.438627181893664e-05, + "loss": 1.7706, + "step": 11655 + }, + { + "epoch": 3.577655003069368, + "grad_norm": 0.5892601013183594, + "learning_rate": 7.438193241420926e-05, + "loss": 1.7631, + "step": 11656 + }, + { + "epoch": 3.577961939840393, + "grad_norm": 0.4599401652812958, + "learning_rate": 7.437759276852533e-05, + "loss": 1.7471, + "step": 11657 + }, + { + "epoch": 3.578268876611418, + "grad_norm": 0.2545170783996582, + "learning_rate": 7.437325288192773e-05, + "loss": 1.7945, + "step": 11658 + }, + { + "epoch": 3.578575813382443, + "grad_norm": 0.3136496841907501, + "learning_rate": 7.436891275445938e-05, + "loss": 1.828, + "step": 11659 + }, + { + "epoch": 3.5788827501534684, + "grad_norm": 0.3631688058376312, + "learning_rate": 7.436457238616313e-05, + "loss": 1.8302, + "step": 11660 + }, + { + "epoch": 3.5791896869244937, + "grad_norm": 0.3097386658191681, + 
"learning_rate": 7.436023177708192e-05, + "loss": 1.8397, + "step": 11661 + }, + { + "epoch": 3.5794966236955186, + "grad_norm": 0.20948798954486847, + "learning_rate": 7.43558909272586e-05, + "loss": 1.7844, + "step": 11662 + }, + { + "epoch": 3.579803560466544, + "grad_norm": 0.24327392876148224, + "learning_rate": 7.43515498367361e-05, + "loss": 1.7827, + "step": 11663 + }, + { + "epoch": 3.580110497237569, + "grad_norm": 0.25268325209617615, + "learning_rate": 7.434720850555731e-05, + "loss": 1.8224, + "step": 11664 + }, + { + "epoch": 3.580417434008594, + "grad_norm": 0.24883607029914856, + "learning_rate": 7.434286693376513e-05, + "loss": 1.8189, + "step": 11665 + }, + { + "epoch": 3.5807243707796195, + "grad_norm": 0.2942518889904022, + "learning_rate": 7.433852512140248e-05, + "loss": 1.8325, + "step": 11666 + }, + { + "epoch": 3.581031307550645, + "grad_norm": 0.3556186556816101, + "learning_rate": 7.433418306851225e-05, + "loss": 1.7511, + "step": 11667 + }, + { + "epoch": 3.5813382443216697, + "grad_norm": 0.421220600605011, + "learning_rate": 7.432984077513738e-05, + "loss": 1.8081, + "step": 11668 + }, + { + "epoch": 3.581645181092695, + "grad_norm": 0.3338243067264557, + "learning_rate": 7.432549824132074e-05, + "loss": 1.8274, + "step": 11669 + }, + { + "epoch": 3.58195211786372, + "grad_norm": 0.25091543793678284, + "learning_rate": 7.432115546710528e-05, + "loss": 1.7637, + "step": 11670 + }, + { + "epoch": 3.5822590546347453, + "grad_norm": 0.29870370030403137, + "learning_rate": 7.431681245253389e-05, + "loss": 1.8036, + "step": 11671 + }, + { + "epoch": 3.5825659914057706, + "grad_norm": 0.2682137191295624, + "learning_rate": 7.431246919764953e-05, + "loss": 1.8252, + "step": 11672 + }, + { + "epoch": 3.5828729281767955, + "grad_norm": 0.28790801763534546, + "learning_rate": 7.430812570249508e-05, + "loss": 1.7713, + "step": 11673 + }, + { + "epoch": 3.583179864947821, + "grad_norm": 0.26357609033584595, + "learning_rate": 7.43037819671135e-05, 
+ "loss": 1.8388, + "step": 11674 + }, + { + "epoch": 3.5834868017188457, + "grad_norm": 0.2505483031272888, + "learning_rate": 7.42994379915477e-05, + "loss": 1.7722, + "step": 11675 + }, + { + "epoch": 3.583793738489871, + "grad_norm": 0.2535844147205353, + "learning_rate": 7.42950937758406e-05, + "loss": 1.756, + "step": 11676 + }, + { + "epoch": 3.5841006752608964, + "grad_norm": 0.23045027256011963, + "learning_rate": 7.429074932003515e-05, + "loss": 1.791, + "step": 11677 + }, + { + "epoch": 3.5844076120319213, + "grad_norm": 0.22525762021541595, + "learning_rate": 7.428640462417428e-05, + "loss": 1.7234, + "step": 11678 + }, + { + "epoch": 3.5847145488029466, + "grad_norm": 0.2402270883321762, + "learning_rate": 7.428205968830094e-05, + "loss": 1.845, + "step": 11679 + }, + { + "epoch": 3.5850214855739715, + "grad_norm": 0.24909646809101105, + "learning_rate": 7.427771451245802e-05, + "loss": 1.8537, + "step": 11680 + }, + { + "epoch": 3.585328422344997, + "grad_norm": 0.25813063979148865, + "learning_rate": 7.427336909668853e-05, + "loss": 1.7353, + "step": 11681 + }, + { + "epoch": 3.585635359116022, + "grad_norm": 0.26073768734931946, + "learning_rate": 7.426902344103534e-05, + "loss": 1.8142, + "step": 11682 + }, + { + "epoch": 3.5859422958870475, + "grad_norm": 0.2498280256986618, + "learning_rate": 7.426467754554147e-05, + "loss": 1.7996, + "step": 11683 + }, + { + "epoch": 3.5862492326580724, + "grad_norm": 0.3131188154220581, + "learning_rate": 7.426033141024981e-05, + "loss": 1.7793, + "step": 11684 + }, + { + "epoch": 3.5865561694290977, + "grad_norm": 0.24118199944496155, + "learning_rate": 7.425598503520337e-05, + "loss": 1.8249, + "step": 11685 + }, + { + "epoch": 3.5868631062001226, + "grad_norm": 0.2791197597980499, + "learning_rate": 7.425163842044504e-05, + "loss": 1.7966, + "step": 11686 + }, + { + "epoch": 3.587170042971148, + "grad_norm": 0.2298576384782791, + "learning_rate": 7.424729156601781e-05, + "loss": 1.7224, + "step": 11687 + }, 
+ { + "epoch": 3.5874769797421733, + "grad_norm": 0.23113438487052917, + "learning_rate": 7.424294447196462e-05, + "loss": 1.7641, + "step": 11688 + }, + { + "epoch": 3.587783916513198, + "grad_norm": 0.3064495027065277, + "learning_rate": 7.423859713832847e-05, + "loss": 1.8688, + "step": 11689 + }, + { + "epoch": 3.5880908532842235, + "grad_norm": 0.22847676277160645, + "learning_rate": 7.423424956515228e-05, + "loss": 1.7513, + "step": 11690 + }, + { + "epoch": 3.5883977900552484, + "grad_norm": 0.2797350585460663, + "learning_rate": 7.422990175247905e-05, + "loss": 1.8268, + "step": 11691 + }, + { + "epoch": 3.5887047268262737, + "grad_norm": 0.2753821313381195, + "learning_rate": 7.422555370035171e-05, + "loss": 1.7313, + "step": 11692 + }, + { + "epoch": 3.589011663597299, + "grad_norm": 0.2981179654598236, + "learning_rate": 7.422120540881326e-05, + "loss": 1.8455, + "step": 11693 + }, + { + "epoch": 3.5893186003683244, + "grad_norm": 0.33028867840766907, + "learning_rate": 7.421685687790667e-05, + "loss": 1.8397, + "step": 11694 + }, + { + "epoch": 3.5896255371393493, + "grad_norm": 0.409173846244812, + "learning_rate": 7.421250810767487e-05, + "loss": 1.8088, + "step": 11695 + }, + { + "epoch": 3.5899324739103746, + "grad_norm": 0.4118194878101349, + "learning_rate": 7.42081590981609e-05, + "loss": 1.7719, + "step": 11696 + }, + { + "epoch": 3.5902394106813995, + "grad_norm": 0.34716179966926575, + "learning_rate": 7.420380984940773e-05, + "loss": 1.8063, + "step": 11697 + }, + { + "epoch": 3.590546347452425, + "grad_norm": 0.27763083577156067, + "learning_rate": 7.419946036145829e-05, + "loss": 1.7777, + "step": 11698 + }, + { + "epoch": 3.59085328422345, + "grad_norm": 0.3175280690193176, + "learning_rate": 7.419511063435562e-05, + "loss": 1.697, + "step": 11699 + }, + { + "epoch": 3.591160220994475, + "grad_norm": 0.3151503801345825, + "learning_rate": 7.419076066814268e-05, + "loss": 1.8067, + "step": 11700 + }, + { + "epoch": 3.5914671577655004, + 
"grad_norm": 0.26914867758750916, + "learning_rate": 7.418641046286245e-05, + "loss": 1.7797, + "step": 11701 + }, + { + "epoch": 3.5917740945365253, + "grad_norm": 0.27231964468955994, + "learning_rate": 7.418206001855797e-05, + "loss": 1.7931, + "step": 11702 + }, + { + "epoch": 3.5920810313075506, + "grad_norm": 0.3352177143096924, + "learning_rate": 7.417770933527217e-05, + "loss": 1.9187, + "step": 11703 + }, + { + "epoch": 3.592387968078576, + "grad_norm": 0.3510081470012665, + "learning_rate": 7.417335841304808e-05, + "loss": 1.7889, + "step": 11704 + }, + { + "epoch": 3.592694904849601, + "grad_norm": 0.24949313700199127, + "learning_rate": 7.41690072519287e-05, + "loss": 1.7683, + "step": 11705 + }, + { + "epoch": 3.593001841620626, + "grad_norm": 0.28442221879959106, + "learning_rate": 7.416465585195702e-05, + "loss": 1.7889, + "step": 11706 + }, + { + "epoch": 3.593308778391651, + "grad_norm": 0.3355824649333954, + "learning_rate": 7.416030421317605e-05, + "loss": 1.7637, + "step": 11707 + }, + { + "epoch": 3.5936157151626764, + "grad_norm": 0.33569446206092834, + "learning_rate": 7.415595233562878e-05, + "loss": 1.919, + "step": 11708 + }, + { + "epoch": 3.5939226519337018, + "grad_norm": 0.2488354742527008, + "learning_rate": 7.415160021935825e-05, + "loss": 1.8424, + "step": 11709 + }, + { + "epoch": 3.594229588704727, + "grad_norm": 0.2701130509376526, + "learning_rate": 7.414724786440746e-05, + "loss": 1.7586, + "step": 11710 + }, + { + "epoch": 3.594536525475752, + "grad_norm": 0.26289790868759155, + "learning_rate": 7.414289527081939e-05, + "loss": 1.7975, + "step": 11711 + }, + { + "epoch": 3.5948434622467773, + "grad_norm": 0.25382301211357117, + "learning_rate": 7.413854243863707e-05, + "loss": 1.7393, + "step": 11712 + }, + { + "epoch": 3.595150399017802, + "grad_norm": 0.28282979130744934, + "learning_rate": 7.413418936790357e-05, + "loss": 1.8048, + "step": 11713 + }, + { + "epoch": 3.5954573357888275, + "grad_norm": 0.28001347184181213, + 
"learning_rate": 7.412983605866183e-05, + "loss": 1.7864, + "step": 11714 + }, + { + "epoch": 3.595764272559853, + "grad_norm": 0.26107707619667053, + "learning_rate": 7.412548251095491e-05, + "loss": 1.8016, + "step": 11715 + }, + { + "epoch": 3.5960712093308778, + "grad_norm": 0.2518761456012726, + "learning_rate": 7.412112872482583e-05, + "loss": 1.7565, + "step": 11716 + }, + { + "epoch": 3.596378146101903, + "grad_norm": 0.25911152362823486, + "learning_rate": 7.411677470031762e-05, + "loss": 1.8333, + "step": 11717 + }, + { + "epoch": 3.596685082872928, + "grad_norm": 0.3411506414413452, + "learning_rate": 7.41124204374733e-05, + "loss": 1.8027, + "step": 11718 + }, + { + "epoch": 3.5969920196439533, + "grad_norm": 0.28535547852516174, + "learning_rate": 7.410806593633593e-05, + "loss": 1.7596, + "step": 11719 + }, + { + "epoch": 3.5972989564149787, + "grad_norm": 0.24665530025959015, + "learning_rate": 7.410371119694852e-05, + "loss": 1.7777, + "step": 11720 + }, + { + "epoch": 3.5976058931860035, + "grad_norm": 0.29162275791168213, + "learning_rate": 7.40993562193541e-05, + "loss": 1.795, + "step": 11721 + }, + { + "epoch": 3.597912829957029, + "grad_norm": 0.2712220549583435, + "learning_rate": 7.409500100359573e-05, + "loss": 1.824, + "step": 11722 + }, + { + "epoch": 3.5982197667280538, + "grad_norm": 0.239755779504776, + "learning_rate": 7.40906455497164e-05, + "loss": 1.7534, + "step": 11723 + }, + { + "epoch": 3.598526703499079, + "grad_norm": 0.26056957244873047, + "learning_rate": 7.408628985775922e-05, + "loss": 1.757, + "step": 11724 + }, + { + "epoch": 3.5988336402701044, + "grad_norm": 0.3230258822441101, + "learning_rate": 7.40819339277672e-05, + "loss": 1.8684, + "step": 11725 + }, + { + "epoch": 3.5991405770411298, + "grad_norm": 0.26070696115493774, + "learning_rate": 7.407757775978339e-05, + "loss": 1.7868, + "step": 11726 + }, + { + "epoch": 3.5994475138121547, + "grad_norm": 0.24940893054008484, + "learning_rate": 7.407322135385085e-05, + 
"loss": 1.8391, + "step": 11727 + }, + { + "epoch": 3.59975445058318, + "grad_norm": 0.2717723250389099, + "learning_rate": 7.406886471001263e-05, + "loss": 1.7567, + "step": 11728 + }, + { + "epoch": 3.600061387354205, + "grad_norm": 0.2328445315361023, + "learning_rate": 7.406450782831177e-05, + "loss": 1.7761, + "step": 11729 + }, + { + "epoch": 3.6003683241252302, + "grad_norm": 0.2740287184715271, + "learning_rate": 7.406015070879136e-05, + "loss": 1.8599, + "step": 11730 + }, + { + "epoch": 3.6006752608962556, + "grad_norm": 0.2930558919906616, + "learning_rate": 7.405579335149441e-05, + "loss": 1.852, + "step": 11731 + }, + { + "epoch": 3.6009821976672804, + "grad_norm": 0.30175161361694336, + "learning_rate": 7.405143575646403e-05, + "loss": 1.8861, + "step": 11732 + }, + { + "epoch": 3.601289134438306, + "grad_norm": 0.2617531418800354, + "learning_rate": 7.404707792374328e-05, + "loss": 1.7598, + "step": 11733 + }, + { + "epoch": 3.6015960712093307, + "grad_norm": 0.25384122133255005, + "learning_rate": 7.404271985337517e-05, + "loss": 1.7634, + "step": 11734 + }, + { + "epoch": 3.601903007980356, + "grad_norm": 0.31706711649894714, + "learning_rate": 7.403836154540284e-05, + "loss": 1.8125, + "step": 11735 + }, + { + "epoch": 3.6022099447513813, + "grad_norm": 0.299662709236145, + "learning_rate": 7.403400299986932e-05, + "loss": 1.748, + "step": 11736 + }, + { + "epoch": 3.6025168815224062, + "grad_norm": 0.23828944563865662, + "learning_rate": 7.40296442168177e-05, + "loss": 1.7473, + "step": 11737 + }, + { + "epoch": 3.6028238182934316, + "grad_norm": 0.22611604630947113, + "learning_rate": 7.402528519629106e-05, + "loss": 1.7519, + "step": 11738 + }, + { + "epoch": 3.6031307550644565, + "grad_norm": 0.28498536348342896, + "learning_rate": 7.402092593833246e-05, + "loss": 1.7792, + "step": 11739 + }, + { + "epoch": 3.603437691835482, + "grad_norm": 0.2404283881187439, + "learning_rate": 7.4016566442985e-05, + "loss": 1.7434, + "step": 11740 + }, + { + 
"epoch": 3.603744628606507, + "grad_norm": 0.2291589230298996, + "learning_rate": 7.401220671029173e-05, + "loss": 1.7623, + "step": 11741 + }, + { + "epoch": 3.6040515653775325, + "grad_norm": 0.23962698876857758, + "learning_rate": 7.400784674029578e-05, + "loss": 1.7232, + "step": 11742 + }, + { + "epoch": 3.6043585021485574, + "grad_norm": 0.3015185594558716, + "learning_rate": 7.400348653304022e-05, + "loss": 1.7808, + "step": 11743 + }, + { + "epoch": 3.6046654389195827, + "grad_norm": 0.30623099207878113, + "learning_rate": 7.399912608856813e-05, + "loss": 1.8518, + "step": 11744 + }, + { + "epoch": 3.6049723756906076, + "grad_norm": 0.2698235511779785, + "learning_rate": 7.39947654069226e-05, + "loss": 1.7829, + "step": 11745 + }, + { + "epoch": 3.605279312461633, + "grad_norm": 0.2195274829864502, + "learning_rate": 7.399040448814674e-05, + "loss": 1.7709, + "step": 11746 + }, + { + "epoch": 3.6055862492326582, + "grad_norm": 0.22962357103824615, + "learning_rate": 7.398604333228366e-05, + "loss": 1.7482, + "step": 11747 + }, + { + "epoch": 3.605893186003683, + "grad_norm": 0.2403932511806488, + "learning_rate": 7.398168193937642e-05, + "loss": 1.8063, + "step": 11748 + }, + { + "epoch": 3.6062001227747085, + "grad_norm": 0.23542718589305878, + "learning_rate": 7.397732030946816e-05, + "loss": 1.7599, + "step": 11749 + }, + { + "epoch": 3.6065070595457334, + "grad_norm": 0.2462490350008011, + "learning_rate": 7.397295844260195e-05, + "loss": 1.8183, + "step": 11750 + }, + { + "epoch": 3.6068139963167587, + "grad_norm": 0.21428349614143372, + "learning_rate": 7.396859633882091e-05, + "loss": 1.6944, + "step": 11751 + }, + { + "epoch": 3.607120933087784, + "grad_norm": 0.21240907907485962, + "learning_rate": 7.396423399816817e-05, + "loss": 1.7795, + "step": 11752 + }, + { + "epoch": 3.607427869858809, + "grad_norm": 0.23413677513599396, + "learning_rate": 7.395987142068682e-05, + "loss": 1.8015, + "step": 11753 + }, + { + "epoch": 3.6077348066298343, + 
"grad_norm": 0.26724907755851746, + "learning_rate": 7.395550860641998e-05, + "loss": 1.8174, + "step": 11754 + }, + { + "epoch": 3.608041743400859, + "grad_norm": 0.22077679634094238, + "learning_rate": 7.395114555541077e-05, + "loss": 1.7929, + "step": 11755 + }, + { + "epoch": 3.6083486801718845, + "grad_norm": 0.2475263774394989, + "learning_rate": 7.394678226770228e-05, + "loss": 1.7744, + "step": 11756 + }, + { + "epoch": 3.60865561694291, + "grad_norm": 0.22579342126846313, + "learning_rate": 7.394241874333764e-05, + "loss": 1.79, + "step": 11757 + }, + { + "epoch": 3.608962553713935, + "grad_norm": 0.26798152923583984, + "learning_rate": 7.393805498236001e-05, + "loss": 1.8087, + "step": 11758 + }, + { + "epoch": 3.60926949048496, + "grad_norm": 0.2755621373653412, + "learning_rate": 7.393369098481248e-05, + "loss": 1.7834, + "step": 11759 + }, + { + "epoch": 3.6095764272559854, + "grad_norm": 0.2741812467575073, + "learning_rate": 7.39293267507382e-05, + "loss": 1.7948, + "step": 11760 + }, + { + "epoch": 3.6098833640270103, + "grad_norm": 0.2378924936056137, + "learning_rate": 7.392496228018028e-05, + "loss": 1.8317, + "step": 11761 + }, + { + "epoch": 3.6101903007980356, + "grad_norm": 0.2628132700920105, + "learning_rate": 7.392059757318187e-05, + "loss": 1.8123, + "step": 11762 + }, + { + "epoch": 3.610497237569061, + "grad_norm": 0.2613002359867096, + "learning_rate": 7.391623262978607e-05, + "loss": 1.795, + "step": 11763 + }, + { + "epoch": 3.610804174340086, + "grad_norm": 0.27272161841392517, + "learning_rate": 7.391186745003608e-05, + "loss": 1.7808, + "step": 11764 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.21366162598133087, + "learning_rate": 7.390750203397497e-05, + "loss": 1.77, + "step": 11765 + }, + { + "epoch": 3.611418047882136, + "grad_norm": 0.25559261441230774, + "learning_rate": 7.390313638164593e-05, + "loss": 1.8442, + "step": 11766 + }, + { + "epoch": 3.6117249846531614, + "grad_norm": 0.23794838786125183, + 
"learning_rate": 7.389877049309207e-05, + "loss": 1.8237, + "step": 11767 + }, + { + "epoch": 3.6120319214241867, + "grad_norm": 0.2690154016017914, + "learning_rate": 7.389440436835656e-05, + "loss": 1.8194, + "step": 11768 + }, + { + "epoch": 3.612338858195212, + "grad_norm": 0.26148009300231934, + "learning_rate": 7.389003800748254e-05, + "loss": 1.7862, + "step": 11769 + }, + { + "epoch": 3.612645794966237, + "grad_norm": 0.26414936780929565, + "learning_rate": 7.388567141051315e-05, + "loss": 1.7815, + "step": 11770 + }, + { + "epoch": 3.6129527317372623, + "grad_norm": 0.24473857879638672, + "learning_rate": 7.388130457749157e-05, + "loss": 1.801, + "step": 11771 + }, + { + "epoch": 3.613259668508287, + "grad_norm": 0.24356001615524292, + "learning_rate": 7.387693750846094e-05, + "loss": 1.8031, + "step": 11772 + }, + { + "epoch": 3.6135666052793125, + "grad_norm": 0.26716411113739014, + "learning_rate": 7.387257020346441e-05, + "loss": 1.7999, + "step": 11773 + }, + { + "epoch": 3.613873542050338, + "grad_norm": 0.2730760872364044, + "learning_rate": 7.386820266254516e-05, + "loss": 1.8079, + "step": 11774 + }, + { + "epoch": 3.6141804788213627, + "grad_norm": 0.2570728361606598, + "learning_rate": 7.386383488574635e-05, + "loss": 1.7374, + "step": 11775 + }, + { + "epoch": 3.614487415592388, + "grad_norm": 0.24992883205413818, + "learning_rate": 7.385946687311112e-05, + "loss": 1.8432, + "step": 11776 + }, + { + "epoch": 3.614794352363413, + "grad_norm": 0.28632259368896484, + "learning_rate": 7.385509862468266e-05, + "loss": 1.8014, + "step": 11777 + }, + { + "epoch": 3.6151012891344383, + "grad_norm": 0.257303923368454, + "learning_rate": 7.385073014050412e-05, + "loss": 1.8166, + "step": 11778 + }, + { + "epoch": 3.6154082259054636, + "grad_norm": 0.2791872024536133, + "learning_rate": 7.38463614206187e-05, + "loss": 1.7865, + "step": 11779 + }, + { + "epoch": 3.6157151626764885, + "grad_norm": 0.25708603858947754, + "learning_rate": 
7.384199246506956e-05, + "loss": 1.807, + "step": 11780 + }, + { + "epoch": 3.616022099447514, + "grad_norm": 0.28693172335624695, + "learning_rate": 7.383762327389988e-05, + "loss": 1.8049, + "step": 11781 + }, + { + "epoch": 3.6163290362185387, + "grad_norm": 0.2731167674064636, + "learning_rate": 7.383325384715283e-05, + "loss": 1.8937, + "step": 11782 + }, + { + "epoch": 3.616635972989564, + "grad_norm": 0.26151663064956665, + "learning_rate": 7.38288841848716e-05, + "loss": 1.8288, + "step": 11783 + }, + { + "epoch": 3.6169429097605894, + "grad_norm": 0.2732257843017578, + "learning_rate": 7.382451428709936e-05, + "loss": 1.7668, + "step": 11784 + }, + { + "epoch": 3.6172498465316147, + "grad_norm": 0.2747575640678406, + "learning_rate": 7.38201441538793e-05, + "loss": 1.7991, + "step": 11785 + }, + { + "epoch": 3.6175567833026396, + "grad_norm": 0.2884783446788788, + "learning_rate": 7.381577378525462e-05, + "loss": 1.7798, + "step": 11786 + }, + { + "epoch": 3.617863720073665, + "grad_norm": 0.2716344892978668, + "learning_rate": 7.381140318126851e-05, + "loss": 1.7923, + "step": 11787 + }, + { + "epoch": 3.61817065684469, + "grad_norm": 0.3007747232913971, + "learning_rate": 7.380703234196416e-05, + "loss": 1.8397, + "step": 11788 + }, + { + "epoch": 3.618477593615715, + "grad_norm": 0.39218056201934814, + "learning_rate": 7.380266126738476e-05, + "loss": 1.8517, + "step": 11789 + }, + { + "epoch": 3.6187845303867405, + "grad_norm": 0.43425866961479187, + "learning_rate": 7.379828995757351e-05, + "loss": 1.7518, + "step": 11790 + }, + { + "epoch": 3.6190914671577654, + "grad_norm": 0.34399518370628357, + "learning_rate": 7.37939184125736e-05, + "loss": 1.7607, + "step": 11791 + }, + { + "epoch": 3.6193984039287908, + "grad_norm": 0.23124302923679352, + "learning_rate": 7.378954663242825e-05, + "loss": 1.7898, + "step": 11792 + }, + { + "epoch": 3.6197053406998156, + "grad_norm": 0.32839757204055786, + "learning_rate": 7.378517461718066e-05, + "loss": 
1.7472, + "step": 11793 + }, + { + "epoch": 3.620012277470841, + "grad_norm": 0.38583460450172424, + "learning_rate": 7.378080236687403e-05, + "loss": 1.7947, + "step": 11794 + }, + { + "epoch": 3.6203192142418663, + "grad_norm": 0.4622896909713745, + "learning_rate": 7.377642988155157e-05, + "loss": 1.9023, + "step": 11795 + }, + { + "epoch": 3.620626151012891, + "grad_norm": 0.3783189058303833, + "learning_rate": 7.37720571612565e-05, + "loss": 1.7813, + "step": 11796 + }, + { + "epoch": 3.6209330877839165, + "grad_norm": 0.3468814790248871, + "learning_rate": 7.376768420603204e-05, + "loss": 1.7509, + "step": 11797 + }, + { + "epoch": 3.6212400245549414, + "grad_norm": 0.2602507174015045, + "learning_rate": 7.376331101592138e-05, + "loss": 1.8158, + "step": 11798 + }, + { + "epoch": 3.6215469613259668, + "grad_norm": 0.28337883949279785, + "learning_rate": 7.375893759096775e-05, + "loss": 1.7755, + "step": 11799 + }, + { + "epoch": 3.621853898096992, + "grad_norm": 0.3644609749317169, + "learning_rate": 7.375456393121437e-05, + "loss": 1.8193, + "step": 11800 + }, + { + "epoch": 3.6221608348680174, + "grad_norm": 0.338211327791214, + "learning_rate": 7.375019003670448e-05, + "loss": 1.821, + "step": 11801 + }, + { + "epoch": 3.6224677716390423, + "grad_norm": 0.23850654065608978, + "learning_rate": 7.374581590748129e-05, + "loss": 1.7317, + "step": 11802 + }, + { + "epoch": 3.6227747084100677, + "grad_norm": 0.3496716618537903, + "learning_rate": 7.374144154358801e-05, + "loss": 1.8361, + "step": 11803 + }, + { + "epoch": 3.6230816451810925, + "grad_norm": 0.5585216283798218, + "learning_rate": 7.37370669450679e-05, + "loss": 1.7667, + "step": 11804 + }, + { + "epoch": 3.623388581952118, + "grad_norm": 0.4578089714050293, + "learning_rate": 7.373269211196418e-05, + "loss": 1.8051, + "step": 11805 + }, + { + "epoch": 3.623695518723143, + "grad_norm": 0.28195759654045105, + "learning_rate": 7.37283170443201e-05, + "loss": 1.7823, + "step": 11806 + }, + { + 
"epoch": 3.624002455494168, + "grad_norm": 0.4066108465194702, + "learning_rate": 7.372394174217887e-05, + "loss": 1.7819, + "step": 11807 + }, + { + "epoch": 3.6243093922651934, + "grad_norm": 0.5368703007698059, + "learning_rate": 7.371956620558375e-05, + "loss": 1.8121, + "step": 11808 + }, + { + "epoch": 3.6246163290362183, + "grad_norm": 0.36627063155174255, + "learning_rate": 7.371519043457795e-05, + "loss": 1.7944, + "step": 11809 + }, + { + "epoch": 3.6249232658072437, + "grad_norm": 0.3100780248641968, + "learning_rate": 7.371081442920476e-05, + "loss": 1.783, + "step": 11810 + }, + { + "epoch": 3.625230202578269, + "grad_norm": 0.3277178704738617, + "learning_rate": 7.370643818950741e-05, + "loss": 1.8105, + "step": 11811 + }, + { + "epoch": 3.625537139349294, + "grad_norm": 0.3887772560119629, + "learning_rate": 7.370206171552914e-05, + "loss": 1.8136, + "step": 11812 + }, + { + "epoch": 3.6258440761203192, + "grad_norm": 0.2770824134349823, + "learning_rate": 7.36976850073132e-05, + "loss": 1.7852, + "step": 11813 + }, + { + "epoch": 3.626151012891344, + "grad_norm": 0.26357728242874146, + "learning_rate": 7.369330806490284e-05, + "loss": 1.7621, + "step": 11814 + }, + { + "epoch": 3.6264579496623695, + "grad_norm": 0.3387344181537628, + "learning_rate": 7.368893088834135e-05, + "loss": 1.7785, + "step": 11815 + }, + { + "epoch": 3.626764886433395, + "grad_norm": 0.35155174136161804, + "learning_rate": 7.368455347767193e-05, + "loss": 1.8081, + "step": 11816 + }, + { + "epoch": 3.62707182320442, + "grad_norm": 0.2855289876461029, + "learning_rate": 7.368017583293788e-05, + "loss": 1.8245, + "step": 11817 + }, + { + "epoch": 3.627378759975445, + "grad_norm": 0.28462162613868713, + "learning_rate": 7.367579795418245e-05, + "loss": 1.8066, + "step": 11818 + }, + { + "epoch": 3.6276856967464703, + "grad_norm": 0.40696555376052856, + "learning_rate": 7.367141984144891e-05, + "loss": 1.8897, + "step": 11819 + }, + { + "epoch": 3.6279926335174952, + 
"grad_norm": 0.472782701253891, + "learning_rate": 7.366704149478054e-05, + "loss": 1.8071, + "step": 11820 + }, + { + "epoch": 3.6282995702885206, + "grad_norm": 0.27022916078567505, + "learning_rate": 7.366266291422057e-05, + "loss": 1.8574, + "step": 11821 + }, + { + "epoch": 3.628606507059546, + "grad_norm": 0.4207148253917694, + "learning_rate": 7.365828409981231e-05, + "loss": 1.7759, + "step": 11822 + }, + { + "epoch": 3.628913443830571, + "grad_norm": 0.42866072058677673, + "learning_rate": 7.365390505159902e-05, + "loss": 1.7366, + "step": 11823 + }, + { + "epoch": 3.629220380601596, + "grad_norm": 0.28288859128952026, + "learning_rate": 7.364952576962398e-05, + "loss": 1.8591, + "step": 11824 + }, + { + "epoch": 3.629527317372621, + "grad_norm": 0.30544906854629517, + "learning_rate": 7.364514625393045e-05, + "loss": 1.7965, + "step": 11825 + }, + { + "epoch": 3.6298342541436464, + "grad_norm": 0.3251616954803467, + "learning_rate": 7.364076650456173e-05, + "loss": 1.8197, + "step": 11826 + }, + { + "epoch": 3.6301411909146717, + "grad_norm": 0.3133888840675354, + "learning_rate": 7.363638652156109e-05, + "loss": 1.7978, + "step": 11827 + }, + { + "epoch": 3.630448127685697, + "grad_norm": 0.29004594683647156, + "learning_rate": 7.363200630497185e-05, + "loss": 1.8035, + "step": 11828 + }, + { + "epoch": 3.630755064456722, + "grad_norm": 0.2781279683113098, + "learning_rate": 7.362762585483725e-05, + "loss": 1.8462, + "step": 11829 + }, + { + "epoch": 3.6310620012277472, + "grad_norm": 0.29003822803497314, + "learning_rate": 7.362324517120063e-05, + "loss": 1.7952, + "step": 11830 + }, + { + "epoch": 3.631368937998772, + "grad_norm": 0.2510940134525299, + "learning_rate": 7.361886425410524e-05, + "loss": 1.7645, + "step": 11831 + }, + { + "epoch": 3.6316758747697975, + "grad_norm": 0.23798540234565735, + "learning_rate": 7.361448310359438e-05, + "loss": 1.7329, + "step": 11832 + }, + { + "epoch": 3.631982811540823, + "grad_norm": 0.2711278796195984, + 
"learning_rate": 7.361010171971137e-05, + "loss": 1.8245, + "step": 11833 + }, + { + "epoch": 3.6322897483118477, + "grad_norm": 0.2895669639110565, + "learning_rate": 7.360572010249949e-05, + "loss": 1.7668, + "step": 11834 + }, + { + "epoch": 3.632596685082873, + "grad_norm": 0.2216273844242096, + "learning_rate": 7.360133825200205e-05, + "loss": 1.8164, + "step": 11835 + }, + { + "epoch": 3.632903621853898, + "grad_norm": 0.3075082302093506, + "learning_rate": 7.359695616826236e-05, + "loss": 1.8159, + "step": 11836 + }, + { + "epoch": 3.6332105586249233, + "grad_norm": 0.3208801746368408, + "learning_rate": 7.35925738513237e-05, + "loss": 1.8385, + "step": 11837 + }, + { + "epoch": 3.6335174953959486, + "grad_norm": 0.272517591714859, + "learning_rate": 7.35881913012294e-05, + "loss": 1.7653, + "step": 11838 + }, + { + "epoch": 3.6338244321669735, + "grad_norm": 0.23105360567569733, + "learning_rate": 7.358380851802277e-05, + "loss": 1.7697, + "step": 11839 + }, + { + "epoch": 3.634131368937999, + "grad_norm": 0.2643153667449951, + "learning_rate": 7.357942550174714e-05, + "loss": 1.7885, + "step": 11840 + }, + { + "epoch": 3.6344383057090237, + "grad_norm": 0.22643202543258667, + "learning_rate": 7.357504225244579e-05, + "loss": 1.746, + "step": 11841 + }, + { + "epoch": 3.634745242480049, + "grad_norm": 0.27782970666885376, + "learning_rate": 7.357065877016207e-05, + "loss": 1.794, + "step": 11842 + }, + { + "epoch": 3.6350521792510744, + "grad_norm": 0.3035561740398407, + "learning_rate": 7.356627505493925e-05, + "loss": 1.7892, + "step": 11843 + }, + { + "epoch": 3.6353591160220997, + "grad_norm": 0.31859731674194336, + "learning_rate": 7.356189110682072e-05, + "loss": 1.7636, + "step": 11844 + }, + { + "epoch": 3.6356660527931246, + "grad_norm": 0.2960890233516693, + "learning_rate": 7.355750692584977e-05, + "loss": 1.8294, + "step": 11845 + }, + { + "epoch": 3.63597298956415, + "grad_norm": 0.2544194459915161, + "learning_rate": 7.355312251206972e-05, + 
"loss": 1.7603, + "step": 11846 + }, + { + "epoch": 3.636279926335175, + "grad_norm": 0.27864789962768555, + "learning_rate": 7.354873786552391e-05, + "loss": 1.7917, + "step": 11847 + }, + { + "epoch": 3.6365868631062, + "grad_norm": 0.32552552223205566, + "learning_rate": 7.354435298625568e-05, + "loss": 1.7769, + "step": 11848 + }, + { + "epoch": 3.6368937998772255, + "grad_norm": 0.25094640254974365, + "learning_rate": 7.353996787430833e-05, + "loss": 1.8371, + "step": 11849 + }, + { + "epoch": 3.6372007366482504, + "grad_norm": 0.26656433939933777, + "learning_rate": 7.353558252972524e-05, + "loss": 1.7686, + "step": 11850 + }, + { + "epoch": 3.6375076734192757, + "grad_norm": 0.3023635745048523, + "learning_rate": 7.353119695254973e-05, + "loss": 1.7892, + "step": 11851 + }, + { + "epoch": 3.6378146101903006, + "grad_norm": 0.2822463810443878, + "learning_rate": 7.352681114282514e-05, + "loss": 1.8221, + "step": 11852 + }, + { + "epoch": 3.638121546961326, + "grad_norm": 0.31159496307373047, + "learning_rate": 7.35224251005948e-05, + "loss": 1.803, + "step": 11853 + }, + { + "epoch": 3.6384284837323513, + "grad_norm": 0.3133087158203125, + "learning_rate": 7.351803882590207e-05, + "loss": 1.744, + "step": 11854 + }, + { + "epoch": 3.638735420503376, + "grad_norm": 0.3050002455711365, + "learning_rate": 7.351365231879029e-05, + "loss": 1.7522, + "step": 11855 + }, + { + "epoch": 3.6390423572744015, + "grad_norm": 0.2729037404060364, + "learning_rate": 7.350926557930283e-05, + "loss": 1.7629, + "step": 11856 + }, + { + "epoch": 3.6393492940454264, + "grad_norm": 0.3181995153427124, + "learning_rate": 7.350487860748303e-05, + "loss": 1.7603, + "step": 11857 + }, + { + "epoch": 3.6396562308164517, + "grad_norm": 0.352651447057724, + "learning_rate": 7.350049140337423e-05, + "loss": 1.8177, + "step": 11858 + }, + { + "epoch": 3.639963167587477, + "grad_norm": 0.22935177385807037, + "learning_rate": 7.349610396701981e-05, + "loss": 1.7421, + "step": 11859 + }, + { 
+ "epoch": 3.6402701043585024, + "grad_norm": 0.26442599296569824, + "learning_rate": 7.349171629846312e-05, + "loss": 1.8026, + "step": 11860 + }, + { + "epoch": 3.6405770411295273, + "grad_norm": 0.25357648730278015, + "learning_rate": 7.348732839774751e-05, + "loss": 1.788, + "step": 11861 + }, + { + "epoch": 3.6408839779005526, + "grad_norm": 0.26959577202796936, + "learning_rate": 7.348294026491635e-05, + "loss": 1.884, + "step": 11862 + }, + { + "epoch": 3.6411909146715775, + "grad_norm": 0.2243001013994217, + "learning_rate": 7.347855190001304e-05, + "loss": 1.7765, + "step": 11863 + }, + { + "epoch": 3.641497851442603, + "grad_norm": 0.2480708807706833, + "learning_rate": 7.34741633030809e-05, + "loss": 1.7597, + "step": 11864 + }, + { + "epoch": 3.641804788213628, + "grad_norm": 0.22512994706630707, + "learning_rate": 7.346977447416332e-05, + "loss": 1.7647, + "step": 11865 + }, + { + "epoch": 3.642111724984653, + "grad_norm": 0.24961981177330017, + "learning_rate": 7.346538541330368e-05, + "loss": 1.8178, + "step": 11866 + }, + { + "epoch": 3.6424186617556784, + "grad_norm": 0.320896714925766, + "learning_rate": 7.346099612054533e-05, + "loss": 1.85, + "step": 11867 + }, + { + "epoch": 3.6427255985267033, + "grad_norm": 0.3420880436897278, + "learning_rate": 7.345660659593167e-05, + "loss": 1.8661, + "step": 11868 + }, + { + "epoch": 3.6430325352977286, + "grad_norm": 0.2675844132900238, + "learning_rate": 7.34522168395061e-05, + "loss": 1.8177, + "step": 11869 + }, + { + "epoch": 3.643339472068754, + "grad_norm": 0.23993943631649017, + "learning_rate": 7.344782685131195e-05, + "loss": 1.7365, + "step": 11870 + }, + { + "epoch": 3.643646408839779, + "grad_norm": 0.21805813908576965, + "learning_rate": 7.344343663139264e-05, + "loss": 1.7813, + "step": 11871 + }, + { + "epoch": 3.643953345610804, + "grad_norm": 0.24334421753883362, + "learning_rate": 7.343904617979154e-05, + "loss": 1.7763, + "step": 11872 + }, + { + "epoch": 3.644260282381829, + 
"grad_norm": 0.22768431901931763, + "learning_rate": 7.343465549655206e-05, + "loss": 1.7817, + "step": 11873 + }, + { + "epoch": 3.6445672191528544, + "grad_norm": 0.23828962445259094, + "learning_rate": 7.343026458171757e-05, + "loss": 1.8391, + "step": 11874 + }, + { + "epoch": 3.6448741559238798, + "grad_norm": 0.24838197231292725, + "learning_rate": 7.342587343533149e-05, + "loss": 1.759, + "step": 11875 + }, + { + "epoch": 3.645181092694905, + "grad_norm": 0.22732019424438477, + "learning_rate": 7.342148205743718e-05, + "loss": 1.7348, + "step": 11876 + }, + { + "epoch": 3.64548802946593, + "grad_norm": 0.25106775760650635, + "learning_rate": 7.341709044807807e-05, + "loss": 1.8121, + "step": 11877 + }, + { + "epoch": 3.6457949662369553, + "grad_norm": 0.28532838821411133, + "learning_rate": 7.341269860729753e-05, + "loss": 1.7147, + "step": 11878 + }, + { + "epoch": 3.64610190300798, + "grad_norm": 0.3041890859603882, + "learning_rate": 7.340830653513899e-05, + "loss": 1.7666, + "step": 11879 + }, + { + "epoch": 3.6464088397790055, + "grad_norm": 0.3142147958278656, + "learning_rate": 7.340391423164585e-05, + "loss": 1.8707, + "step": 11880 + }, + { + "epoch": 3.646715776550031, + "grad_norm": 0.28531381487846375, + "learning_rate": 7.339952169686151e-05, + "loss": 1.7961, + "step": 11881 + }, + { + "epoch": 3.6470227133210558, + "grad_norm": 0.33779671788215637, + "learning_rate": 7.339512893082938e-05, + "loss": 1.7428, + "step": 11882 + }, + { + "epoch": 3.647329650092081, + "grad_norm": 0.29611849784851074, + "learning_rate": 7.339073593359287e-05, + "loss": 1.8803, + "step": 11883 + }, + { + "epoch": 3.647636586863106, + "grad_norm": 0.31248557567596436, + "learning_rate": 7.33863427051954e-05, + "loss": 1.7868, + "step": 11884 + }, + { + "epoch": 3.6479435236341313, + "grad_norm": 0.42829564213752747, + "learning_rate": 7.338194924568039e-05, + "loss": 1.8558, + "step": 11885 + }, + { + "epoch": 3.6482504604051567, + "grad_norm": 0.431023508310318, + 
"learning_rate": 7.337755555509126e-05, + "loss": 1.7565, + "step": 11886 + }, + { + "epoch": 3.6485573971761815, + "grad_norm": 0.2917975187301636, + "learning_rate": 7.33731616334714e-05, + "loss": 1.8067, + "step": 11887 + }, + { + "epoch": 3.648864333947207, + "grad_norm": 0.3072175085544586, + "learning_rate": 7.336876748086427e-05, + "loss": 1.782, + "step": 11888 + }, + { + "epoch": 3.6491712707182318, + "grad_norm": 0.33658862113952637, + "learning_rate": 7.336437309731327e-05, + "loss": 1.8007, + "step": 11889 + }, + { + "epoch": 3.649478207489257, + "grad_norm": 0.23774033784866333, + "learning_rate": 7.335997848286185e-05, + "loss": 1.7606, + "step": 11890 + }, + { + "epoch": 3.6497851442602824, + "grad_norm": 0.3373236358165741, + "learning_rate": 7.335558363755344e-05, + "loss": 1.7335, + "step": 11891 + }, + { + "epoch": 3.650092081031308, + "grad_norm": 0.3906517028808594, + "learning_rate": 7.335118856143145e-05, + "loss": 1.7974, + "step": 11892 + }, + { + "epoch": 3.6503990178023327, + "grad_norm": 0.37715303897857666, + "learning_rate": 7.334679325453934e-05, + "loss": 1.8875, + "step": 11893 + }, + { + "epoch": 3.650705954573358, + "grad_norm": 0.278540700674057, + "learning_rate": 7.334239771692053e-05, + "loss": 1.8165, + "step": 11894 + }, + { + "epoch": 3.651012891344383, + "grad_norm": 0.24434895813465118, + "learning_rate": 7.333800194861845e-05, + "loss": 1.7756, + "step": 11895 + }, + { + "epoch": 3.6513198281154082, + "grad_norm": 0.25057271122932434, + "learning_rate": 7.333360594967658e-05, + "loss": 1.7932, + "step": 11896 + }, + { + "epoch": 3.6516267648864336, + "grad_norm": 0.3277342617511749, + "learning_rate": 7.332920972013833e-05, + "loss": 1.7781, + "step": 11897 + }, + { + "epoch": 3.6519337016574585, + "grad_norm": 0.2754829525947571, + "learning_rate": 7.332481326004715e-05, + "loss": 1.7916, + "step": 11898 + }, + { + "epoch": 3.652240638428484, + "grad_norm": 0.24490588903427124, + "learning_rate": 7.332041656944651e-05, 
+ "loss": 1.7904, + "step": 11899 + }, + { + "epoch": 3.6525475751995087, + "grad_norm": 0.3176959455013275, + "learning_rate": 7.331601964837982e-05, + "loss": 1.7379, + "step": 11900 + }, + { + "epoch": 3.652854511970534, + "grad_norm": 0.3435784876346588, + "learning_rate": 7.331162249689057e-05, + "loss": 1.7635, + "step": 11901 + }, + { + "epoch": 3.6531614487415593, + "grad_norm": 0.335697740316391, + "learning_rate": 7.330722511502221e-05, + "loss": 1.7903, + "step": 11902 + }, + { + "epoch": 3.6534683855125847, + "grad_norm": 0.2748894691467285, + "learning_rate": 7.330282750281819e-05, + "loss": 1.8259, + "step": 11903 + }, + { + "epoch": 3.6537753222836096, + "grad_norm": 0.36754751205444336, + "learning_rate": 7.329842966032197e-05, + "loss": 1.7728, + "step": 11904 + }, + { + "epoch": 3.654082259054635, + "grad_norm": 0.4355713129043579, + "learning_rate": 7.3294031587577e-05, + "loss": 1.7447, + "step": 11905 + }, + { + "epoch": 3.65438919582566, + "grad_norm": 0.3967476487159729, + "learning_rate": 7.328963328462677e-05, + "loss": 1.8299, + "step": 11906 + }, + { + "epoch": 3.654696132596685, + "grad_norm": 0.23805755376815796, + "learning_rate": 7.328523475151472e-05, + "loss": 1.7631, + "step": 11907 + }, + { + "epoch": 3.6550030693677105, + "grad_norm": 0.40350377559661865, + "learning_rate": 7.328083598828435e-05, + "loss": 1.8693, + "step": 11908 + }, + { + "epoch": 3.6553100061387354, + "grad_norm": 0.4743673801422119, + "learning_rate": 7.32764369949791e-05, + "loss": 1.7887, + "step": 11909 + }, + { + "epoch": 3.6556169429097607, + "grad_norm": 0.33830127120018005, + "learning_rate": 7.327203777164246e-05, + "loss": 1.7527, + "step": 11910 + }, + { + "epoch": 3.6559238796807856, + "grad_norm": 0.2465003877878189, + "learning_rate": 7.326763831831791e-05, + "loss": 1.7898, + "step": 11911 + }, + { + "epoch": 3.656230816451811, + "grad_norm": 0.31647852063179016, + "learning_rate": 7.326323863504892e-05, + "loss": 1.8056, + "step": 11912 + }, + 
{ + "epoch": 3.6565377532228363, + "grad_norm": 0.31436124444007874, + "learning_rate": 7.325883872187896e-05, + "loss": 1.7972, + "step": 11913 + }, + { + "epoch": 3.656844689993861, + "grad_norm": 0.260405957698822, + "learning_rate": 7.325443857885153e-05, + "loss": 1.8109, + "step": 11914 + }, + { + "epoch": 3.6571516267648865, + "grad_norm": 0.29312583804130554, + "learning_rate": 7.325003820601011e-05, + "loss": 1.8947, + "step": 11915 + }, + { + "epoch": 3.6574585635359114, + "grad_norm": 0.2641582190990448, + "learning_rate": 7.324563760339819e-05, + "loss": 1.7737, + "step": 11916 + }, + { + "epoch": 3.6577655003069367, + "grad_norm": 0.2338121086359024, + "learning_rate": 7.324123677105923e-05, + "loss": 1.7462, + "step": 11917 + }, + { + "epoch": 3.658072437077962, + "grad_norm": 0.27877378463745117, + "learning_rate": 7.323683570903676e-05, + "loss": 1.8371, + "step": 11918 + }, + { + "epoch": 3.6583793738489874, + "grad_norm": 0.24238766729831696, + "learning_rate": 7.323243441737427e-05, + "loss": 1.7304, + "step": 11919 + }, + { + "epoch": 3.6586863106200123, + "grad_norm": 0.2349759042263031, + "learning_rate": 7.322803289611525e-05, + "loss": 1.7422, + "step": 11920 + }, + { + "epoch": 3.6589932473910376, + "grad_norm": 0.2254217565059662, + "learning_rate": 7.322363114530318e-05, + "loss": 1.7296, + "step": 11921 + }, + { + "epoch": 3.6593001841620625, + "grad_norm": 0.24533270299434662, + "learning_rate": 7.321922916498158e-05, + "loss": 1.7834, + "step": 11922 + }, + { + "epoch": 3.659607120933088, + "grad_norm": 0.24993161857128143, + "learning_rate": 7.321482695519393e-05, + "loss": 1.8502, + "step": 11923 + }, + { + "epoch": 3.659914057704113, + "grad_norm": 0.2540178894996643, + "learning_rate": 7.321042451598378e-05, + "loss": 1.8372, + "step": 11924 + }, + { + "epoch": 3.660220994475138, + "grad_norm": 0.2241390198469162, + "learning_rate": 7.32060218473946e-05, + "loss": 1.7619, + "step": 11925 + }, + { + "epoch": 3.6605279312461634, + 
"grad_norm": 0.2137840837240219, + "learning_rate": 7.32016189494699e-05, + "loss": 1.751, + "step": 11926 + }, + { + "epoch": 3.6608348680171883, + "grad_norm": 0.2596585154533386, + "learning_rate": 7.319721582225323e-05, + "loss": 1.7773, + "step": 11927 + }, + { + "epoch": 3.6611418047882136, + "grad_norm": 0.24898354709148407, + "learning_rate": 7.319281246578806e-05, + "loss": 1.7347, + "step": 11928 + }, + { + "epoch": 3.661448741559239, + "grad_norm": 0.26553863286972046, + "learning_rate": 7.31884088801179e-05, + "loss": 1.7812, + "step": 11929 + }, + { + "epoch": 3.661755678330264, + "grad_norm": 0.2494438737630844, + "learning_rate": 7.318400506528633e-05, + "loss": 1.7554, + "step": 11930 + }, + { + "epoch": 3.662062615101289, + "grad_norm": 0.2794995903968811, + "learning_rate": 7.317960102133682e-05, + "loss": 1.7495, + "step": 11931 + }, + { + "epoch": 3.662369551872314, + "grad_norm": 0.2843860983848572, + "learning_rate": 7.317519674831293e-05, + "loss": 1.7734, + "step": 11932 + }, + { + "epoch": 3.6626764886433394, + "grad_norm": 0.28261128067970276, + "learning_rate": 7.317079224625813e-05, + "loss": 1.7794, + "step": 11933 + }, + { + "epoch": 3.6629834254143647, + "grad_norm": 0.2552426755428314, + "learning_rate": 7.316638751521599e-05, + "loss": 1.8397, + "step": 11934 + }, + { + "epoch": 3.66329036218539, + "grad_norm": 0.4140608608722687, + "learning_rate": 7.316198255523002e-05, + "loss": 1.848, + "step": 11935 + }, + { + "epoch": 3.663597298956415, + "grad_norm": 0.3709854483604431, + "learning_rate": 7.315757736634377e-05, + "loss": 1.8489, + "step": 11936 + }, + { + "epoch": 3.6639042357274403, + "grad_norm": 0.23637300729751587, + "learning_rate": 7.315317194860078e-05, + "loss": 1.7549, + "step": 11937 + }, + { + "epoch": 3.664211172498465, + "grad_norm": 0.32884421944618225, + "learning_rate": 7.314876630204456e-05, + "loss": 1.8061, + "step": 11938 + }, + { + "epoch": 3.6645181092694905, + "grad_norm": 0.33354130387306213, + 
"learning_rate": 7.314436042671867e-05, + "loss": 1.8346, + "step": 11939 + }, + { + "epoch": 3.664825046040516, + "grad_norm": 0.25776317715644836, + "learning_rate": 7.313995432266663e-05, + "loss": 1.8598, + "step": 11940 + }, + { + "epoch": 3.6651319828115407, + "grad_norm": 0.2910402715206146, + "learning_rate": 7.313554798993202e-05, + "loss": 1.7613, + "step": 11941 + }, + { + "epoch": 3.665438919582566, + "grad_norm": 0.3487538695335388, + "learning_rate": 7.313114142855836e-05, + "loss": 1.8105, + "step": 11942 + }, + { + "epoch": 3.665745856353591, + "grad_norm": 0.27271291613578796, + "learning_rate": 7.312673463858918e-05, + "loss": 1.8107, + "step": 11943 + }, + { + "epoch": 3.6660527931246163, + "grad_norm": 0.2613036632537842, + "learning_rate": 7.312232762006809e-05, + "loss": 1.7871, + "step": 11944 + }, + { + "epoch": 3.6663597298956416, + "grad_norm": 0.30594903230667114, + "learning_rate": 7.311792037303859e-05, + "loss": 1.8043, + "step": 11945 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.3960847854614258, + "learning_rate": 7.311351289754425e-05, + "loss": 1.8434, + "step": 11946 + }, + { + "epoch": 3.666973603437692, + "grad_norm": 0.33369311690330505, + "learning_rate": 7.310910519362861e-05, + "loss": 1.7496, + "step": 11947 + }, + { + "epoch": 3.6672805402087167, + "grad_norm": 0.29852384328842163, + "learning_rate": 7.310469726133528e-05, + "loss": 1.858, + "step": 11948 + }, + { + "epoch": 3.667587476979742, + "grad_norm": 0.2610527276992798, + "learning_rate": 7.310028910070777e-05, + "loss": 1.7642, + "step": 11949 + }, + { + "epoch": 3.6678944137507674, + "grad_norm": 0.3606704771518707, + "learning_rate": 7.309588071178967e-05, + "loss": 1.845, + "step": 11950 + }, + { + "epoch": 3.6682013505217927, + "grad_norm": 0.3157273828983307, + "learning_rate": 7.309147209462454e-05, + "loss": 1.7864, + "step": 11951 + }, + { + "epoch": 3.6685082872928176, + "grad_norm": 0.23907925188541412, + "learning_rate": 
7.308706324925594e-05, + "loss": 1.8363, + "step": 11952 + }, + { + "epoch": 3.668815224063843, + "grad_norm": 0.3365088999271393, + "learning_rate": 7.308265417572747e-05, + "loss": 1.8755, + "step": 11953 + }, + { + "epoch": 3.669122160834868, + "grad_norm": 0.29404979944229126, + "learning_rate": 7.307824487408266e-05, + "loss": 1.8128, + "step": 11954 + }, + { + "epoch": 3.669429097605893, + "grad_norm": 0.2689574658870697, + "learning_rate": 7.307383534436511e-05, + "loss": 1.8072, + "step": 11955 + }, + { + "epoch": 3.6697360343769185, + "grad_norm": 0.28394198417663574, + "learning_rate": 7.306942558661841e-05, + "loss": 1.7919, + "step": 11956 + }, + { + "epoch": 3.6700429711479434, + "grad_norm": 0.2594783902168274, + "learning_rate": 7.306501560088612e-05, + "loss": 1.7467, + "step": 11957 + }, + { + "epoch": 3.6703499079189688, + "grad_norm": 0.24765191972255707, + "learning_rate": 7.30606053872118e-05, + "loss": 1.7876, + "step": 11958 + }, + { + "epoch": 3.6706568446899936, + "grad_norm": 0.22157172858715057, + "learning_rate": 7.305619494563909e-05, + "loss": 1.7802, + "step": 11959 + }, + { + "epoch": 3.670963781461019, + "grad_norm": 0.270151287317276, + "learning_rate": 7.305178427621155e-05, + "loss": 1.7723, + "step": 11960 + }, + { + "epoch": 3.6712707182320443, + "grad_norm": 0.3163939118385315, + "learning_rate": 7.304737337897277e-05, + "loss": 1.8488, + "step": 11961 + }, + { + "epoch": 3.671577655003069, + "grad_norm": 0.2605706453323364, + "learning_rate": 7.304296225396632e-05, + "loss": 1.7442, + "step": 11962 + }, + { + "epoch": 3.6718845917740945, + "grad_norm": 0.31179291009902954, + "learning_rate": 7.303855090123582e-05, + "loss": 1.831, + "step": 11963 + }, + { + "epoch": 3.6721915285451194, + "grad_norm": 0.33365359902381897, + "learning_rate": 7.303413932082483e-05, + "loss": 1.8376, + "step": 11964 + }, + { + "epoch": 3.6724984653161448, + "grad_norm": 0.2952130138874054, + "learning_rate": 7.302972751277701e-05, + "loss": 
1.7733, + "step": 11965 + }, + { + "epoch": 3.67280540208717, + "grad_norm": 0.24270877242088318, + "learning_rate": 7.302531547713592e-05, + "loss": 1.8367, + "step": 11966 + }, + { + "epoch": 3.6731123388581954, + "grad_norm": 0.34315919876098633, + "learning_rate": 7.302090321394517e-05, + "loss": 1.7901, + "step": 11967 + }, + { + "epoch": 3.6734192756292203, + "grad_norm": 0.33511418104171753, + "learning_rate": 7.301649072324834e-05, + "loss": 1.7929, + "step": 11968 + }, + { + "epoch": 3.6737262124002457, + "grad_norm": 0.22397933900356293, + "learning_rate": 7.301207800508907e-05, + "loss": 1.7533, + "step": 11969 + }, + { + "epoch": 3.6740331491712706, + "grad_norm": 0.2882738411426544, + "learning_rate": 7.300766505951095e-05, + "loss": 1.8071, + "step": 11970 + }, + { + "epoch": 3.674340085942296, + "grad_norm": 0.242112398147583, + "learning_rate": 7.300325188655761e-05, + "loss": 1.7739, + "step": 11971 + }, + { + "epoch": 3.674647022713321, + "grad_norm": 0.27754491567611694, + "learning_rate": 7.299883848627265e-05, + "loss": 1.8295, + "step": 11972 + }, + { + "epoch": 3.674953959484346, + "grad_norm": 0.2787899076938629, + "learning_rate": 7.29944248586997e-05, + "loss": 1.7682, + "step": 11973 + }, + { + "epoch": 3.6752608962553714, + "grad_norm": 0.24448934197425842, + "learning_rate": 7.299001100388234e-05, + "loss": 1.7826, + "step": 11974 + }, + { + "epoch": 3.6755678330263963, + "grad_norm": 0.37869495153427124, + "learning_rate": 7.298559692186421e-05, + "loss": 1.8582, + "step": 11975 + }, + { + "epoch": 3.6758747697974217, + "grad_norm": 0.3299996256828308, + "learning_rate": 7.298118261268897e-05, + "loss": 1.7716, + "step": 11976 + }, + { + "epoch": 3.676181706568447, + "grad_norm": 0.278891384601593, + "learning_rate": 7.29767680764002e-05, + "loss": 1.879, + "step": 11977 + }, + { + "epoch": 3.6764886433394723, + "grad_norm": 0.29326459765434265, + "learning_rate": 7.297235331304155e-05, + "loss": 1.804, + "step": 11978 + }, + { + 
"epoch": 3.6767955801104972, + "grad_norm": 0.2697092592716217, + "learning_rate": 7.296793832265663e-05, + "loss": 1.7842, + "step": 11979 + }, + { + "epoch": 3.6771025168815226, + "grad_norm": 0.3045118749141693, + "learning_rate": 7.296352310528909e-05, + "loss": 1.7959, + "step": 11980 + }, + { + "epoch": 3.6774094536525475, + "grad_norm": 0.278647780418396, + "learning_rate": 7.295910766098252e-05, + "loss": 1.7907, + "step": 11981 + }, + { + "epoch": 3.677716390423573, + "grad_norm": 0.2370275855064392, + "learning_rate": 7.295469198978063e-05, + "loss": 1.757, + "step": 11982 + }, + { + "epoch": 3.678023327194598, + "grad_norm": 0.3061021566390991, + "learning_rate": 7.295027609172702e-05, + "loss": 1.7927, + "step": 11983 + }, + { + "epoch": 3.678330263965623, + "grad_norm": 0.2844544053077698, + "learning_rate": 7.294585996686532e-05, + "loss": 1.7705, + "step": 11984 + }, + { + "epoch": 3.6786372007366483, + "grad_norm": 0.31121113896369934, + "learning_rate": 7.29414436152392e-05, + "loss": 1.783, + "step": 11985 + }, + { + "epoch": 3.6789441375076732, + "grad_norm": 0.2566785514354706, + "learning_rate": 7.293702703689225e-05, + "loss": 1.7781, + "step": 11986 + }, + { + "epoch": 3.6792510742786986, + "grad_norm": 0.22176961600780487, + "learning_rate": 7.293261023186818e-05, + "loss": 1.7302, + "step": 11987 + }, + { + "epoch": 3.679558011049724, + "grad_norm": 0.21547441184520721, + "learning_rate": 7.292819320021062e-05, + "loss": 1.7666, + "step": 11988 + }, + { + "epoch": 3.679864947820749, + "grad_norm": 0.26309674978256226, + "learning_rate": 7.29237759419632e-05, + "loss": 1.7817, + "step": 11989 + }, + { + "epoch": 3.680171884591774, + "grad_norm": 0.2558063864707947, + "learning_rate": 7.29193584571696e-05, + "loss": 1.8257, + "step": 11990 + }, + { + "epoch": 3.680478821362799, + "grad_norm": 0.24516844749450684, + "learning_rate": 7.291494074587347e-05, + "loss": 1.7803, + "step": 11991 + }, + { + "epoch": 3.6807857581338244, + "grad_norm": 
0.22891047596931458, + "learning_rate": 7.291052280811843e-05, + "loss": 1.7977, + "step": 11992 + }, + { + "epoch": 3.6810926949048497, + "grad_norm": 0.2776026129722595, + "learning_rate": 7.290610464394822e-05, + "loss": 1.8486, + "step": 11993 + }, + { + "epoch": 3.681399631675875, + "grad_norm": 0.31472426652908325, + "learning_rate": 7.290168625340644e-05, + "loss": 1.7841, + "step": 11994 + }, + { + "epoch": 3.6817065684469, + "grad_norm": 0.3459274470806122, + "learning_rate": 7.289726763653677e-05, + "loss": 1.7458, + "step": 11995 + }, + { + "epoch": 3.6820135052179253, + "grad_norm": 0.23645849525928497, + "learning_rate": 7.289284879338289e-05, + "loss": 1.781, + "step": 11996 + }, + { + "epoch": 3.68232044198895, + "grad_norm": 0.3257114291191101, + "learning_rate": 7.288842972398845e-05, + "loss": 1.8269, + "step": 11997 + }, + { + "epoch": 3.6826273787599755, + "grad_norm": 0.5450126528739929, + "learning_rate": 7.288401042839713e-05, + "loss": 1.8342, + "step": 11998 + }, + { + "epoch": 3.682934315531001, + "grad_norm": 0.5080512762069702, + "learning_rate": 7.287959090665262e-05, + "loss": 1.8097, + "step": 11999 + }, + { + "epoch": 3.6832412523020257, + "grad_norm": 0.3005252480506897, + "learning_rate": 7.287517115879858e-05, + "loss": 1.8271, + "step": 12000 + }, + { + "epoch": 3.683548189073051, + "grad_norm": 0.2760924994945526, + "learning_rate": 7.287075118487869e-05, + "loss": 1.8267, + "step": 12001 + }, + { + "epoch": 3.683855125844076, + "grad_norm": 0.3475865423679352, + "learning_rate": 7.286633098493663e-05, + "loss": 1.785, + "step": 12002 + }, + { + "epoch": 3.6841620626151013, + "grad_norm": 0.2905690670013428, + "learning_rate": 7.286191055901608e-05, + "loss": 1.8283, + "step": 12003 + }, + { + "epoch": 3.6844689993861266, + "grad_norm": 0.23666246235370636, + "learning_rate": 7.285748990716072e-05, + "loss": 1.7665, + "step": 12004 + }, + { + "epoch": 3.6847759361571515, + "grad_norm": 0.32329514622688293, + "learning_rate": 
7.285306902941427e-05, + "loss": 1.7267, + "step": 12005 + }, + { + "epoch": 3.685082872928177, + "grad_norm": 0.32345879077911377, + "learning_rate": 7.28486479258204e-05, + "loss": 1.7529, + "step": 12006 + }, + { + "epoch": 3.6853898096992017, + "grad_norm": 0.2727855443954468, + "learning_rate": 7.284422659642279e-05, + "loss": 1.8279, + "step": 12007 + }, + { + "epoch": 3.685696746470227, + "grad_norm": 0.37847277522087097, + "learning_rate": 7.283980504126513e-05, + "loss": 1.7809, + "step": 12008 + }, + { + "epoch": 3.6860036832412524, + "grad_norm": 0.44694215059280396, + "learning_rate": 7.283538326039113e-05, + "loss": 1.8184, + "step": 12009 + }, + { + "epoch": 3.6863106200122777, + "grad_norm": 0.2868261933326721, + "learning_rate": 7.28309612538445e-05, + "loss": 1.7461, + "step": 12010 + }, + { + "epoch": 3.6866175567833026, + "grad_norm": 0.2601351737976074, + "learning_rate": 7.282653902166894e-05, + "loss": 1.8011, + "step": 12011 + }, + { + "epoch": 3.686924493554328, + "grad_norm": 0.328185498714447, + "learning_rate": 7.282211656390813e-05, + "loss": 1.7934, + "step": 12012 + }, + { + "epoch": 3.687231430325353, + "grad_norm": 0.2712559103965759, + "learning_rate": 7.281769388060578e-05, + "loss": 1.7566, + "step": 12013 + }, + { + "epoch": 3.687538367096378, + "grad_norm": 0.2725805938243866, + "learning_rate": 7.281327097180562e-05, + "loss": 1.8024, + "step": 12014 + }, + { + "epoch": 3.6878453038674035, + "grad_norm": 0.37282630801200867, + "learning_rate": 7.280884783755133e-05, + "loss": 1.7624, + "step": 12015 + }, + { + "epoch": 3.6881522406384284, + "grad_norm": 0.36519256234169006, + "learning_rate": 7.280442447788664e-05, + "loss": 1.8691, + "step": 12016 + }, + { + "epoch": 3.6884591774094537, + "grad_norm": 0.21699345111846924, + "learning_rate": 7.280000089285528e-05, + "loss": 1.7308, + "step": 12017 + }, + { + "epoch": 3.6887661141804786, + "grad_norm": 0.3159945011138916, + "learning_rate": 7.279557708250094e-05, + "loss": 
1.8144, + "step": 12018 + }, + { + "epoch": 3.689073050951504, + "grad_norm": 0.2927449643611908, + "learning_rate": 7.279115304686735e-05, + "loss": 1.7746, + "step": 12019 + }, + { + "epoch": 3.6893799877225293, + "grad_norm": 0.279208242893219, + "learning_rate": 7.278672878599819e-05, + "loss": 1.7678, + "step": 12020 + }, + { + "epoch": 3.689686924493554, + "grad_norm": 0.40005648136138916, + "learning_rate": 7.278230429993725e-05, + "loss": 1.7876, + "step": 12021 + }, + { + "epoch": 3.6899938612645795, + "grad_norm": 0.3444392681121826, + "learning_rate": 7.277787958872824e-05, + "loss": 1.7591, + "step": 12022 + }, + { + "epoch": 3.6903007980356044, + "grad_norm": 0.21841467916965485, + "learning_rate": 7.277345465241485e-05, + "loss": 1.785, + "step": 12023 + }, + { + "epoch": 3.6906077348066297, + "grad_norm": 0.32463181018829346, + "learning_rate": 7.276902949104084e-05, + "loss": 1.8164, + "step": 12024 + }, + { + "epoch": 3.690914671577655, + "grad_norm": 0.36221247911453247, + "learning_rate": 7.276460410464994e-05, + "loss": 1.7529, + "step": 12025 + }, + { + "epoch": 3.6912216083486804, + "grad_norm": 0.24451927840709686, + "learning_rate": 7.276017849328588e-05, + "loss": 1.8031, + "step": 12026 + }, + { + "epoch": 3.6915285451197053, + "grad_norm": 0.3055694103240967, + "learning_rate": 7.275575265699239e-05, + "loss": 1.8158, + "step": 12027 + }, + { + "epoch": 3.6918354818907306, + "grad_norm": 0.4315083622932434, + "learning_rate": 7.27513265958132e-05, + "loss": 1.8322, + "step": 12028 + }, + { + "epoch": 3.6921424186617555, + "grad_norm": 0.3391095697879791, + "learning_rate": 7.274690030979209e-05, + "loss": 1.8214, + "step": 12029 + }, + { + "epoch": 3.692449355432781, + "grad_norm": 0.22714883089065552, + "learning_rate": 7.274247379897277e-05, + "loss": 1.7312, + "step": 12030 + }, + { + "epoch": 3.692756292203806, + "grad_norm": 0.24982765316963196, + "learning_rate": 7.273804706339899e-05, + "loss": 1.738, + "step": 12031 + }, + { + 
"epoch": 3.693063228974831, + "grad_norm": 0.32509860396385193, + "learning_rate": 7.273362010311451e-05, + "loss": 1.7773, + "step": 12032 + }, + { + "epoch": 3.6933701657458564, + "grad_norm": 0.2643086612224579, + "learning_rate": 7.272919291816307e-05, + "loss": 1.7545, + "step": 12033 + }, + { + "epoch": 3.6936771025168813, + "grad_norm": 0.2568800747394562, + "learning_rate": 7.272476550858842e-05, + "loss": 1.8055, + "step": 12034 + }, + { + "epoch": 3.6939840392879066, + "grad_norm": 0.27418240904808044, + "learning_rate": 7.272033787443433e-05, + "loss": 1.7769, + "step": 12035 + }, + { + "epoch": 3.694290976058932, + "grad_norm": 0.2459677755832672, + "learning_rate": 7.271591001574453e-05, + "loss": 1.7971, + "step": 12036 + }, + { + "epoch": 3.694597912829957, + "grad_norm": 0.22349393367767334, + "learning_rate": 7.27114819325628e-05, + "loss": 1.7791, + "step": 12037 + }, + { + "epoch": 3.694904849600982, + "grad_norm": 0.25321197509765625, + "learning_rate": 7.270705362493288e-05, + "loss": 1.7475, + "step": 12038 + }, + { + "epoch": 3.695211786372007, + "grad_norm": 0.2585916519165039, + "learning_rate": 7.270262509289855e-05, + "loss": 1.7801, + "step": 12039 + }, + { + "epoch": 3.6955187231430324, + "grad_norm": 0.2673574686050415, + "learning_rate": 7.269819633650359e-05, + "loss": 1.7578, + "step": 12040 + }, + { + "epoch": 3.6958256599140578, + "grad_norm": 0.2509469985961914, + "learning_rate": 7.269376735579175e-05, + "loss": 1.7994, + "step": 12041 + }, + { + "epoch": 3.696132596685083, + "grad_norm": 0.28527703881263733, + "learning_rate": 7.268933815080679e-05, + "loss": 1.7752, + "step": 12042 + }, + { + "epoch": 3.696439533456108, + "grad_norm": 0.22716578841209412, + "learning_rate": 7.268490872159248e-05, + "loss": 1.7186, + "step": 12043 + }, + { + "epoch": 3.6967464702271333, + "grad_norm": 0.24888403713703156, + "learning_rate": 7.268047906819262e-05, + "loss": 1.7882, + "step": 12044 + }, + { + "epoch": 3.697053406998158, + 
"grad_norm": 0.28976112604141235, + "learning_rate": 7.267604919065096e-05, + "loss": 1.7655, + "step": 12045 + }, + { + "epoch": 3.6973603437691835, + "grad_norm": 0.24668502807617188, + "learning_rate": 7.267161908901131e-05, + "loss": 1.8051, + "step": 12046 + }, + { + "epoch": 3.697667280540209, + "grad_norm": 0.2464776188135147, + "learning_rate": 7.266718876331742e-05, + "loss": 1.809, + "step": 12047 + }, + { + "epoch": 3.6979742173112338, + "grad_norm": 0.27648577094078064, + "learning_rate": 7.266275821361309e-05, + "loss": 1.7869, + "step": 12048 + }, + { + "epoch": 3.698281154082259, + "grad_norm": 0.26427242159843445, + "learning_rate": 7.26583274399421e-05, + "loss": 1.7681, + "step": 12049 + }, + { + "epoch": 3.698588090853284, + "grad_norm": 0.24595285952091217, + "learning_rate": 7.265389644234823e-05, + "loss": 1.7209, + "step": 12050 + }, + { + "epoch": 3.6988950276243093, + "grad_norm": 0.32514405250549316, + "learning_rate": 7.26494652208753e-05, + "loss": 1.8702, + "step": 12051 + }, + { + "epoch": 3.6992019643953347, + "grad_norm": 0.24512936174869537, + "learning_rate": 7.264503377556705e-05, + "loss": 1.784, + "step": 12052 + }, + { + "epoch": 3.69950890116636, + "grad_norm": 0.28698310256004333, + "learning_rate": 7.264060210646733e-05, + "loss": 1.905, + "step": 12053 + }, + { + "epoch": 3.699815837937385, + "grad_norm": 0.2995007336139679, + "learning_rate": 7.263617021361989e-05, + "loss": 1.7822, + "step": 12054 + }, + { + "epoch": 3.7001227747084102, + "grad_norm": 0.25869423151016235, + "learning_rate": 7.263173809706855e-05, + "loss": 1.7988, + "step": 12055 + }, + { + "epoch": 3.700429711479435, + "grad_norm": 0.350918710231781, + "learning_rate": 7.262730575685711e-05, + "loss": 1.9504, + "step": 12056 + }, + { + "epoch": 3.7007366482504604, + "grad_norm": 0.3407665491104126, + "learning_rate": 7.262287319302937e-05, + "loss": 1.8506, + "step": 12057 + }, + { + "epoch": 3.701043585021486, + "grad_norm": 0.3039441704750061, + 
"learning_rate": 7.261844040562915e-05, + "loss": 1.7841, + "step": 12058 + }, + { + "epoch": 3.7013505217925107, + "grad_norm": 0.23483428359031677, + "learning_rate": 7.261400739470023e-05, + "loss": 1.7899, + "step": 12059 + }, + { + "epoch": 3.701657458563536, + "grad_norm": 0.30779507756233215, + "learning_rate": 7.260957416028645e-05, + "loss": 1.8131, + "step": 12060 + }, + { + "epoch": 3.701964395334561, + "grad_norm": 0.29901376366615295, + "learning_rate": 7.26051407024316e-05, + "loss": 1.7861, + "step": 12061 + }, + { + "epoch": 3.7022713321055862, + "grad_norm": 0.30058762431144714, + "learning_rate": 7.260070702117949e-05, + "loss": 1.7485, + "step": 12062 + }, + { + "epoch": 3.7025782688766116, + "grad_norm": 0.24523651599884033, + "learning_rate": 7.259627311657396e-05, + "loss": 1.772, + "step": 12063 + }, + { + "epoch": 3.7028852056476365, + "grad_norm": 0.24375474452972412, + "learning_rate": 7.259183898865882e-05, + "loss": 1.7848, + "step": 12064 + }, + { + "epoch": 3.703192142418662, + "grad_norm": 0.2562403380870819, + "learning_rate": 7.258740463747788e-05, + "loss": 1.7447, + "step": 12065 + }, + { + "epoch": 3.7034990791896867, + "grad_norm": 0.265229195356369, + "learning_rate": 7.258297006307496e-05, + "loss": 1.8111, + "step": 12066 + }, + { + "epoch": 3.703806015960712, + "grad_norm": 0.2836552858352661, + "learning_rate": 7.25785352654939e-05, + "loss": 1.7952, + "step": 12067 + }, + { + "epoch": 3.7041129527317374, + "grad_norm": 0.3269572854042053, + "learning_rate": 7.257410024477852e-05, + "loss": 1.8604, + "step": 12068 + }, + { + "epoch": 3.7044198895027627, + "grad_norm": 0.2391490638256073, + "learning_rate": 7.256966500097264e-05, + "loss": 1.7417, + "step": 12069 + }, + { + "epoch": 3.7047268262737876, + "grad_norm": 0.2610675096511841, + "learning_rate": 7.256522953412011e-05, + "loss": 1.7712, + "step": 12070 + }, + { + "epoch": 3.705033763044813, + "grad_norm": 0.24954774975776672, + "learning_rate": 
7.256079384426477e-05, + "loss": 1.7506, + "step": 12071 + }, + { + "epoch": 3.705340699815838, + "grad_norm": 0.2603892385959625, + "learning_rate": 7.255635793145042e-05, + "loss": 1.8105, + "step": 12072 + }, + { + "epoch": 3.705647636586863, + "grad_norm": 0.32728591561317444, + "learning_rate": 7.255192179572092e-05, + "loss": 1.8448, + "step": 12073 + }, + { + "epoch": 3.7059545733578885, + "grad_norm": 0.4559340178966522, + "learning_rate": 7.254748543712013e-05, + "loss": 1.7232, + "step": 12074 + }, + { + "epoch": 3.7062615101289134, + "grad_norm": 0.36526206135749817, + "learning_rate": 7.254304885569186e-05, + "loss": 1.7874, + "step": 12075 + }, + { + "epoch": 3.7065684468999387, + "grad_norm": 0.21606837213039398, + "learning_rate": 7.253861205147998e-05, + "loss": 1.7266, + "step": 12076 + }, + { + "epoch": 3.7068753836709636, + "grad_norm": 0.3629585802555084, + "learning_rate": 7.253417502452831e-05, + "loss": 1.7722, + "step": 12077 + }, + { + "epoch": 3.707182320441989, + "grad_norm": 0.4224923551082611, + "learning_rate": 7.252973777488072e-05, + "loss": 1.7369, + "step": 12078 + }, + { + "epoch": 3.7074892572130143, + "grad_norm": 0.32245784997940063, + "learning_rate": 7.252530030258106e-05, + "loss": 1.7836, + "step": 12079 + }, + { + "epoch": 3.707796193984039, + "grad_norm": 0.29909494519233704, + "learning_rate": 7.252086260767317e-05, + "loss": 1.8718, + "step": 12080 + }, + { + "epoch": 3.7081031307550645, + "grad_norm": 0.21995799243450165, + "learning_rate": 7.251642469020093e-05, + "loss": 1.7103, + "step": 12081 + }, + { + "epoch": 3.7084100675260894, + "grad_norm": 0.2737572193145752, + "learning_rate": 7.251198655020818e-05, + "loss": 1.7787, + "step": 12082 + }, + { + "epoch": 3.7087170042971147, + "grad_norm": 0.22417058050632477, + "learning_rate": 7.250754818773879e-05, + "loss": 1.7782, + "step": 12083 + }, + { + "epoch": 3.70902394106814, + "grad_norm": 0.3350662887096405, + "learning_rate": 7.25031096028366e-05, + "loss": 
1.8193, + "step": 12084 + }, + { + "epoch": 3.7093308778391654, + "grad_norm": 0.3199101686477661, + "learning_rate": 7.24986707955455e-05, + "loss": 1.831, + "step": 12085 + }, + { + "epoch": 3.7096378146101903, + "grad_norm": 0.2513977289199829, + "learning_rate": 7.249423176590936e-05, + "loss": 1.8288, + "step": 12086 + }, + { + "epoch": 3.7099447513812156, + "grad_norm": 0.30411866307258606, + "learning_rate": 7.248979251397203e-05, + "loss": 1.7837, + "step": 12087 + }, + { + "epoch": 3.7102516881522405, + "grad_norm": 0.30755332112312317, + "learning_rate": 7.248535303977738e-05, + "loss": 1.8016, + "step": 12088 + }, + { + "epoch": 3.710558624923266, + "grad_norm": 0.25746986269950867, + "learning_rate": 7.248091334336929e-05, + "loss": 1.8014, + "step": 12089 + }, + { + "epoch": 3.710865561694291, + "grad_norm": 0.3327447772026062, + "learning_rate": 7.247647342479164e-05, + "loss": 1.752, + "step": 12090 + }, + { + "epoch": 3.711172498465316, + "grad_norm": 0.3101816475391388, + "learning_rate": 7.247203328408832e-05, + "loss": 1.7867, + "step": 12091 + }, + { + "epoch": 3.7114794352363414, + "grad_norm": 0.2168906182050705, + "learning_rate": 7.246759292130318e-05, + "loss": 1.7452, + "step": 12092 + }, + { + "epoch": 3.7117863720073663, + "grad_norm": 0.34260258078575134, + "learning_rate": 7.246315233648013e-05, + "loss": 1.8156, + "step": 12093 + }, + { + "epoch": 3.7120933087783916, + "grad_norm": 0.2730714976787567, + "learning_rate": 7.245871152966303e-05, + "loss": 1.7429, + "step": 12094 + }, + { + "epoch": 3.712400245549417, + "grad_norm": 0.2560936212539673, + "learning_rate": 7.245427050089578e-05, + "loss": 1.7969, + "step": 12095 + }, + { + "epoch": 3.712707182320442, + "grad_norm": 0.27510303258895874, + "learning_rate": 7.244982925022228e-05, + "loss": 1.7981, + "step": 12096 + }, + { + "epoch": 3.713014119091467, + "grad_norm": 0.29171642661094666, + "learning_rate": 7.24453877776864e-05, + "loss": 1.7913, + "step": 12097 + }, + { + 
"epoch": 3.713321055862492, + "grad_norm": 0.26431843638420105, + "learning_rate": 7.244094608333206e-05, + "loss": 1.8262, + "step": 12098 + }, + { + "epoch": 3.7136279926335174, + "grad_norm": 0.30747905373573303, + "learning_rate": 7.243650416720311e-05, + "loss": 1.7951, + "step": 12099 + }, + { + "epoch": 3.7139349294045427, + "grad_norm": 0.346443772315979, + "learning_rate": 7.24320620293435e-05, + "loss": 1.7677, + "step": 12100 + }, + { + "epoch": 3.714241866175568, + "grad_norm": 0.2910652458667755, + "learning_rate": 7.242761966979709e-05, + "loss": 1.7887, + "step": 12101 + }, + { + "epoch": 3.714548802946593, + "grad_norm": 0.22342006862163544, + "learning_rate": 7.24231770886078e-05, + "loss": 1.7678, + "step": 12102 + }, + { + "epoch": 3.7148557397176183, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.241873428581954e-05, + "loss": 1.7436, + "step": 12103 + }, + { + "epoch": 3.715162676488643, + "grad_norm": 0.23542635142803192, + "learning_rate": 7.24142912614762e-05, + "loss": 1.7942, + "step": 12104 + }, + { + "epoch": 3.7154696132596685, + "grad_norm": 0.22476384043693542, + "learning_rate": 7.240984801562169e-05, + "loss": 1.8235, + "step": 12105 + }, + { + "epoch": 3.715776550030694, + "grad_norm": 0.25123465061187744, + "learning_rate": 7.240540454829992e-05, + "loss": 1.8112, + "step": 12106 + }, + { + "epoch": 3.7160834868017187, + "grad_norm": 0.27230000495910645, + "learning_rate": 7.240096085955483e-05, + "loss": 1.8312, + "step": 12107 + }, + { + "epoch": 3.716390423572744, + "grad_norm": 0.2722976803779602, + "learning_rate": 7.239651694943031e-05, + "loss": 1.8368, + "step": 12108 + }, + { + "epoch": 3.716697360343769, + "grad_norm": 0.264138400554657, + "learning_rate": 7.239207281797028e-05, + "loss": 1.8206, + "step": 12109 + }, + { + "epoch": 3.7170042971147943, + "grad_norm": 0.28813931345939636, + "learning_rate": 7.238762846521866e-05, + "loss": 1.7391, + "step": 12110 + }, + { + "epoch": 3.7173112338858196, + 
"grad_norm": 0.2319631576538086, + "learning_rate": 7.238318389121939e-05, + "loss": 1.7574, + "step": 12111 + }, + { + "epoch": 3.717618170656845, + "grad_norm": 0.2507809102535248, + "learning_rate": 7.237873909601635e-05, + "loss": 1.7359, + "step": 12112 + }, + { + "epoch": 3.71792510742787, + "grad_norm": 0.2717304825782776, + "learning_rate": 7.237429407965351e-05, + "loss": 1.774, + "step": 12113 + }, + { + "epoch": 3.718232044198895, + "grad_norm": 0.2619280517101288, + "learning_rate": 7.236984884217478e-05, + "loss": 1.8083, + "step": 12114 + }, + { + "epoch": 3.71853898096992, + "grad_norm": 0.22268806397914886, + "learning_rate": 7.23654033836241e-05, + "loss": 1.7436, + "step": 12115 + }, + { + "epoch": 3.7188459177409454, + "grad_norm": 0.2341407984495163, + "learning_rate": 7.236095770404539e-05, + "loss": 1.7807, + "step": 12116 + }, + { + "epoch": 3.7191528545119708, + "grad_norm": 0.23519712686538696, + "learning_rate": 7.235651180348258e-05, + "loss": 1.8051, + "step": 12117 + }, + { + "epoch": 3.7194597912829956, + "grad_norm": 0.2391074150800705, + "learning_rate": 7.235206568197963e-05, + "loss": 1.8377, + "step": 12118 + }, + { + "epoch": 3.719766728054021, + "grad_norm": 0.26821592450141907, + "learning_rate": 7.234761933958045e-05, + "loss": 1.8586, + "step": 12119 + }, + { + "epoch": 3.720073664825046, + "grad_norm": 0.24971134960651398, + "learning_rate": 7.234317277632902e-05, + "loss": 1.8404, + "step": 12120 + }, + { + "epoch": 3.720380601596071, + "grad_norm": 0.20817919075489044, + "learning_rate": 7.233872599226926e-05, + "loss": 1.7204, + "step": 12121 + }, + { + "epoch": 3.7206875383670965, + "grad_norm": 0.29301291704177856, + "learning_rate": 7.233427898744509e-05, + "loss": 1.8528, + "step": 12122 + }, + { + "epoch": 3.7209944751381214, + "grad_norm": 0.22214651107788086, + "learning_rate": 7.23298317619005e-05, + "loss": 1.748, + "step": 12123 + }, + { + "epoch": 3.7213014119091468, + "grad_norm": 0.2511044442653656, + 
"learning_rate": 7.232538431567941e-05, + "loss": 1.8146, + "step": 12124 + }, + { + "epoch": 3.7216083486801717, + "grad_norm": 0.26976367831230164, + "learning_rate": 7.232093664882581e-05, + "loss": 1.8483, + "step": 12125 + }, + { + "epoch": 3.721915285451197, + "grad_norm": 0.2538089156150818, + "learning_rate": 7.231648876138361e-05, + "loss": 1.8097, + "step": 12126 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.2353016883134842, + "learning_rate": 7.231204065339682e-05, + "loss": 1.737, + "step": 12127 + }, + { + "epoch": 3.7225291589932477, + "grad_norm": 0.3205147981643677, + "learning_rate": 7.230759232490935e-05, + "loss": 1.8116, + "step": 12128 + }, + { + "epoch": 3.7228360957642725, + "grad_norm": 0.39056599140167236, + "learning_rate": 7.230314377596516e-05, + "loss": 1.7785, + "step": 12129 + }, + { + "epoch": 3.723143032535298, + "grad_norm": 0.3846863806247711, + "learning_rate": 7.229869500660825e-05, + "loss": 1.738, + "step": 12130 + }, + { + "epoch": 3.7234499693063228, + "grad_norm": 0.24412120878696442, + "learning_rate": 7.229424601688256e-05, + "loss": 1.7351, + "step": 12131 + }, + { + "epoch": 3.723756906077348, + "grad_norm": 0.2978009581565857, + "learning_rate": 7.228979680683206e-05, + "loss": 1.8037, + "step": 12132 + }, + { + "epoch": 3.7240638428483734, + "grad_norm": 0.33787262439727783, + "learning_rate": 7.228534737650074e-05, + "loss": 1.8421, + "step": 12133 + }, + { + "epoch": 3.7243707796193983, + "grad_norm": 0.2536921203136444, + "learning_rate": 7.228089772593254e-05, + "loss": 1.7472, + "step": 12134 + }, + { + "epoch": 3.7246777163904237, + "grad_norm": 0.24103601276874542, + "learning_rate": 7.227644785517144e-05, + "loss": 1.8011, + "step": 12135 + }, + { + "epoch": 3.7249846531614486, + "grad_norm": 0.3653033375740051, + "learning_rate": 7.227199776426146e-05, + "loss": 1.8018, + "step": 12136 + }, + { + "epoch": 3.725291589932474, + "grad_norm": 0.35728752613067627, + "learning_rate": 
7.226754745324652e-05, + "loss": 1.7684, + "step": 12137 + }, + { + "epoch": 3.7255985267034992, + "grad_norm": 0.262018620967865, + "learning_rate": 7.226309692217063e-05, + "loss": 1.8124, + "step": 12138 + }, + { + "epoch": 3.725905463474524, + "grad_norm": 0.3467118442058563, + "learning_rate": 7.225864617107776e-05, + "loss": 1.8761, + "step": 12139 + }, + { + "epoch": 3.7262124002455494, + "grad_norm": 0.4365626871585846, + "learning_rate": 7.22541952000119e-05, + "loss": 1.7159, + "step": 12140 + }, + { + "epoch": 3.7265193370165743, + "grad_norm": 0.2819811999797821, + "learning_rate": 7.224974400901705e-05, + "loss": 1.8051, + "step": 12141 + }, + { + "epoch": 3.7268262737875997, + "grad_norm": 0.39062437415122986, + "learning_rate": 7.224529259813719e-05, + "loss": 1.8517, + "step": 12142 + }, + { + "epoch": 3.727133210558625, + "grad_norm": 0.4383927285671234, + "learning_rate": 7.22408409674163e-05, + "loss": 1.8295, + "step": 12143 + }, + { + "epoch": 3.7274401473296503, + "grad_norm": 0.3043094575405121, + "learning_rate": 7.223638911689839e-05, + "loss": 1.7653, + "step": 12144 + }, + { + "epoch": 3.7277470841006752, + "grad_norm": 0.25198984146118164, + "learning_rate": 7.223193704662746e-05, + "loss": 1.7561, + "step": 12145 + }, + { + "epoch": 3.7280540208717006, + "grad_norm": 0.353565514087677, + "learning_rate": 7.222748475664749e-05, + "loss": 1.8077, + "step": 12146 + }, + { + "epoch": 3.7283609576427255, + "grad_norm": 0.39757224917411804, + "learning_rate": 7.222303224700248e-05, + "loss": 1.7622, + "step": 12147 + }, + { + "epoch": 3.728667894413751, + "grad_norm": 0.35595703125, + "learning_rate": 7.221857951773644e-05, + "loss": 1.8436, + "step": 12148 + }, + { + "epoch": 3.728974831184776, + "grad_norm": 0.2469715029001236, + "learning_rate": 7.221412656889338e-05, + "loss": 1.8531, + "step": 12149 + }, + { + "epoch": 3.729281767955801, + "grad_norm": 0.35324424505233765, + "learning_rate": 7.22096734005173e-05, + "loss": 1.7361, + 
"step": 12150 + }, + { + "epoch": 3.7295887047268264, + "grad_norm": 0.3783365488052368, + "learning_rate": 7.220522001265223e-05, + "loss": 1.7459, + "step": 12151 + }, + { + "epoch": 3.7298956414978512, + "grad_norm": 0.27526360750198364, + "learning_rate": 7.220076640534212e-05, + "loss": 1.8867, + "step": 12152 + }, + { + "epoch": 3.7302025782688766, + "grad_norm": 0.30863118171691895, + "learning_rate": 7.219631257863105e-05, + "loss": 1.7363, + "step": 12153 + }, + { + "epoch": 3.730509515039902, + "grad_norm": 0.38505107164382935, + "learning_rate": 7.219185853256301e-05, + "loss": 1.764, + "step": 12154 + }, + { + "epoch": 3.730816451810927, + "grad_norm": 0.2925978899002075, + "learning_rate": 7.218740426718202e-05, + "loss": 1.7693, + "step": 12155 + }, + { + "epoch": 3.731123388581952, + "grad_norm": 0.24510078132152557, + "learning_rate": 7.218294978253209e-05, + "loss": 1.8089, + "step": 12156 + }, + { + "epoch": 3.731430325352977, + "grad_norm": 0.33029109239578247, + "learning_rate": 7.217849507865724e-05, + "loss": 1.6885, + "step": 12157 + }, + { + "epoch": 3.7317372621240024, + "grad_norm": 0.333970308303833, + "learning_rate": 7.217404015560149e-05, + "loss": 1.8132, + "step": 12158 + }, + { + "epoch": 3.7320441988950277, + "grad_norm": 0.2467660754919052, + "learning_rate": 7.216958501340891e-05, + "loss": 1.8021, + "step": 12159 + }, + { + "epoch": 3.732351135666053, + "grad_norm": 0.2701449990272522, + "learning_rate": 7.216512965212348e-05, + "loss": 1.7006, + "step": 12160 + }, + { + "epoch": 3.732658072437078, + "grad_norm": 0.2784138023853302, + "learning_rate": 7.216067407178926e-05, + "loss": 1.7616, + "step": 12161 + }, + { + "epoch": 3.7329650092081033, + "grad_norm": 0.2082870900630951, + "learning_rate": 7.215621827245026e-05, + "loss": 1.7391, + "step": 12162 + }, + { + "epoch": 3.733271945979128, + "grad_norm": 0.2477869987487793, + "learning_rate": 7.215176225415053e-05, + "loss": 1.7761, + "step": 12163 + }, + { + "epoch": 
3.7335788827501535, + "grad_norm": 0.28395572304725647, + "learning_rate": 7.21473060169341e-05, + "loss": 1.8181, + "step": 12164 + }, + { + "epoch": 3.733885819521179, + "grad_norm": 0.20430058240890503, + "learning_rate": 7.2142849560845e-05, + "loss": 1.7035, + "step": 12165 + }, + { + "epoch": 3.7341927562922037, + "grad_norm": 0.30061420798301697, + "learning_rate": 7.21383928859273e-05, + "loss": 1.7703, + "step": 12166 + }, + { + "epoch": 3.734499693063229, + "grad_norm": 0.33865803480148315, + "learning_rate": 7.2133935992225e-05, + "loss": 1.8204, + "step": 12167 + }, + { + "epoch": 3.734806629834254, + "grad_norm": 0.29172980785369873, + "learning_rate": 7.212947887978221e-05, + "loss": 1.739, + "step": 12168 + }, + { + "epoch": 3.7351135666052793, + "grad_norm": 0.2799396812915802, + "learning_rate": 7.212502154864291e-05, + "loss": 1.8503, + "step": 12169 + }, + { + "epoch": 3.7354205033763046, + "grad_norm": 0.2945539355278015, + "learning_rate": 7.212056399885118e-05, + "loss": 1.7523, + "step": 12170 + }, + { + "epoch": 3.7357274401473295, + "grad_norm": 0.2395290732383728, + "learning_rate": 7.211610623045108e-05, + "loss": 1.7728, + "step": 12171 + }, + { + "epoch": 3.736034376918355, + "grad_norm": 0.24369286000728607, + "learning_rate": 7.211164824348667e-05, + "loss": 1.7725, + "step": 12172 + }, + { + "epoch": 3.7363413136893797, + "grad_norm": 0.3272435963153839, + "learning_rate": 7.210719003800197e-05, + "loss": 1.8531, + "step": 12173 + }, + { + "epoch": 3.736648250460405, + "grad_norm": 0.23954182863235474, + "learning_rate": 7.210273161404107e-05, + "loss": 1.7807, + "step": 12174 + }, + { + "epoch": 3.7369551872314304, + "grad_norm": 0.24547603726387024, + "learning_rate": 7.209827297164801e-05, + "loss": 1.8481, + "step": 12175 + }, + { + "epoch": 3.7372621240024557, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.209381411086687e-05, + "loss": 1.7496, + "step": 12176 + }, + { + "epoch": 3.7375690607734806, + "grad_norm": 
0.22948235273361206, + "learning_rate": 7.208935503174172e-05, + "loss": 1.7681, + "step": 12177 + }, + { + "epoch": 3.737875997544506, + "grad_norm": 0.2697654664516449, + "learning_rate": 7.20848957343166e-05, + "loss": 1.789, + "step": 12178 + }, + { + "epoch": 3.738182934315531, + "grad_norm": 0.235344797372818, + "learning_rate": 7.208043621863562e-05, + "loss": 1.8309, + "step": 12179 + }, + { + "epoch": 3.738489871086556, + "grad_norm": 0.2688879072666168, + "learning_rate": 7.20759764847428e-05, + "loss": 1.7898, + "step": 12180 + }, + { + "epoch": 3.7387968078575815, + "grad_norm": 0.26818978786468506, + "learning_rate": 7.207151653268226e-05, + "loss": 1.7882, + "step": 12181 + }, + { + "epoch": 3.7391037446286064, + "grad_norm": 0.2612875998020172, + "learning_rate": 7.206705636249804e-05, + "loss": 1.7352, + "step": 12182 + }, + { + "epoch": 3.7394106813996317, + "grad_norm": 0.22547565400600433, + "learning_rate": 7.206259597423425e-05, + "loss": 1.733, + "step": 12183 + }, + { + "epoch": 3.7397176181706566, + "grad_norm": 0.24645474553108215, + "learning_rate": 7.205813536793495e-05, + "loss": 1.8064, + "step": 12184 + }, + { + "epoch": 3.740024554941682, + "grad_norm": 0.25879329442977905, + "learning_rate": 7.205367454364424e-05, + "loss": 1.8134, + "step": 12185 + }, + { + "epoch": 3.7403314917127073, + "grad_norm": 0.22420097887516022, + "learning_rate": 7.204921350140617e-05, + "loss": 1.7819, + "step": 12186 + }, + { + "epoch": 3.7406384284837326, + "grad_norm": 0.2569858431816101, + "learning_rate": 7.204475224126487e-05, + "loss": 1.784, + "step": 12187 + }, + { + "epoch": 3.7409453652547575, + "grad_norm": 0.23769912123680115, + "learning_rate": 7.20402907632644e-05, + "loss": 1.7853, + "step": 12188 + }, + { + "epoch": 3.741252302025783, + "grad_norm": 0.26935988664627075, + "learning_rate": 7.203582906744885e-05, + "loss": 1.806, + "step": 12189 + }, + { + "epoch": 3.7415592387968077, + "grad_norm": 0.2544274628162384, + "learning_rate": 
7.203136715386233e-05, + "loss": 1.7988, + "step": 12190 + }, + { + "epoch": 3.741866175567833, + "grad_norm": 0.22665882110595703, + "learning_rate": 7.202690502254892e-05, + "loss": 1.7798, + "step": 12191 + }, + { + "epoch": 3.7421731123388584, + "grad_norm": 0.24512888491153717, + "learning_rate": 7.202244267355273e-05, + "loss": 1.816, + "step": 12192 + }, + { + "epoch": 3.7424800491098833, + "grad_norm": 0.2408553808927536, + "learning_rate": 7.201798010691785e-05, + "loss": 1.7417, + "step": 12193 + }, + { + "epoch": 3.7427869858809086, + "grad_norm": 0.23142600059509277, + "learning_rate": 7.201351732268838e-05, + "loss": 1.7771, + "step": 12194 + }, + { + "epoch": 3.7430939226519335, + "grad_norm": 0.245071142911911, + "learning_rate": 7.200905432090844e-05, + "loss": 1.7556, + "step": 12195 + }, + { + "epoch": 3.743400859422959, + "grad_norm": 0.2623934745788574, + "learning_rate": 7.200459110162211e-05, + "loss": 1.8042, + "step": 12196 + }, + { + "epoch": 3.743707796193984, + "grad_norm": 0.2531217038631439, + "learning_rate": 7.200012766487353e-05, + "loss": 1.7709, + "step": 12197 + }, + { + "epoch": 3.744014732965009, + "grad_norm": 0.23839864134788513, + "learning_rate": 7.19956640107068e-05, + "loss": 1.8202, + "step": 12198 + }, + { + "epoch": 3.7443216697360344, + "grad_norm": 0.2342260777950287, + "learning_rate": 7.1991200139166e-05, + "loss": 1.827, + "step": 12199 + }, + { + "epoch": 3.7446286065070593, + "grad_norm": 0.25511276721954346, + "learning_rate": 7.198673605029528e-05, + "loss": 1.7766, + "step": 12200 + }, + { + "epoch": 3.7449355432780846, + "grad_norm": 0.27601274847984314, + "learning_rate": 7.198227174413876e-05, + "loss": 1.7716, + "step": 12201 + }, + { + "epoch": 3.74524248004911, + "grad_norm": 0.3027385175228119, + "learning_rate": 7.197780722074056e-05, + "loss": 1.8007, + "step": 12202 + }, + { + "epoch": 3.7455494168201353, + "grad_norm": 0.31242382526397705, + "learning_rate": 7.197334248014477e-05, + "loss": 1.8089, 
+ "step": 12203 + }, + { + "epoch": 3.74585635359116, + "grad_norm": 0.3673859238624573, + "learning_rate": 7.196887752239551e-05, + "loss": 1.8017, + "step": 12204 + }, + { + "epoch": 3.7461632903621855, + "grad_norm": 0.3152726888656616, + "learning_rate": 7.196441234753695e-05, + "loss": 1.7108, + "step": 12205 + }, + { + "epoch": 3.7464702271332104, + "grad_norm": 0.2606927156448364, + "learning_rate": 7.195994695561319e-05, + "loss": 1.8066, + "step": 12206 + }, + { + "epoch": 3.7467771639042358, + "grad_norm": 0.37624871730804443, + "learning_rate": 7.195548134666836e-05, + "loss": 1.725, + "step": 12207 + }, + { + "epoch": 3.747084100675261, + "grad_norm": 0.4138187766075134, + "learning_rate": 7.195101552074658e-05, + "loss": 1.7838, + "step": 12208 + }, + { + "epoch": 3.747391037446286, + "grad_norm": 0.3668459951877594, + "learning_rate": 7.194654947789204e-05, + "loss": 1.7575, + "step": 12209 + }, + { + "epoch": 3.7476979742173113, + "grad_norm": 0.27947792410850525, + "learning_rate": 7.19420832181488e-05, + "loss": 1.792, + "step": 12210 + }, + { + "epoch": 3.748004910988336, + "grad_norm": 0.2507692873477936, + "learning_rate": 7.193761674156103e-05, + "loss": 1.7752, + "step": 12211 + }, + { + "epoch": 3.7483118477593615, + "grad_norm": 0.3209949731826782, + "learning_rate": 7.193315004817289e-05, + "loss": 1.8491, + "step": 12212 + }, + { + "epoch": 3.748618784530387, + "grad_norm": 0.32883042097091675, + "learning_rate": 7.192868313802849e-05, + "loss": 1.8135, + "step": 12213 + }, + { + "epoch": 3.7489257213014118, + "grad_norm": 0.2450616955757141, + "learning_rate": 7.192421601117201e-05, + "loss": 1.7722, + "step": 12214 + }, + { + "epoch": 3.749232658072437, + "grad_norm": 0.2545110285282135, + "learning_rate": 7.191974866764757e-05, + "loss": 1.7866, + "step": 12215 + }, + { + "epoch": 3.749539594843462, + "grad_norm": 0.264017790555954, + "learning_rate": 7.191528110749932e-05, + "loss": 1.778, + "step": 12216 + }, + { + "epoch": 
3.7498465316144873, + "grad_norm": 0.3156309425830841, + "learning_rate": 7.191081333077142e-05, + "loss": 1.7917, + "step": 12217 + }, + { + "epoch": 3.7501534683855127, + "grad_norm": 0.3578774631023407, + "learning_rate": 7.190634533750802e-05, + "loss": 1.8468, + "step": 12218 + }, + { + "epoch": 3.750460405156538, + "grad_norm": 0.30735981464385986, + "learning_rate": 7.19018771277533e-05, + "loss": 1.7502, + "step": 12219 + }, + { + "epoch": 3.750767341927563, + "grad_norm": 0.22870220243930817, + "learning_rate": 7.189740870155135e-05, + "loss": 1.7686, + "step": 12220 + }, + { + "epoch": 3.7510742786985882, + "grad_norm": 0.30297720432281494, + "learning_rate": 7.18929400589464e-05, + "loss": 1.826, + "step": 12221 + }, + { + "epoch": 3.751381215469613, + "grad_norm": 0.2735389173030853, + "learning_rate": 7.188847119998257e-05, + "loss": 1.8142, + "step": 12222 + }, + { + "epoch": 3.7516881522406385, + "grad_norm": 0.2823885679244995, + "learning_rate": 7.188400212470405e-05, + "loss": 1.8028, + "step": 12223 + }, + { + "epoch": 3.751995089011664, + "grad_norm": 0.4184139370918274, + "learning_rate": 7.187953283315499e-05, + "loss": 1.8467, + "step": 12224 + }, + { + "epoch": 3.7523020257826887, + "grad_norm": 0.3559226095676422, + "learning_rate": 7.187506332537957e-05, + "loss": 1.7416, + "step": 12225 + }, + { + "epoch": 3.752608962553714, + "grad_norm": 0.26055800914764404, + "learning_rate": 7.187059360142194e-05, + "loss": 1.8309, + "step": 12226 + }, + { + "epoch": 3.752915899324739, + "grad_norm": 0.28032660484313965, + "learning_rate": 7.186612366132629e-05, + "loss": 1.7926, + "step": 12227 + }, + { + "epoch": 3.7532228360957642, + "grad_norm": 0.26229965686798096, + "learning_rate": 7.18616535051368e-05, + "loss": 1.7368, + "step": 12228 + }, + { + "epoch": 3.7535297728667896, + "grad_norm": 0.2779417634010315, + "learning_rate": 7.185718313289763e-05, + "loss": 1.8418, + "step": 12229 + }, + { + "epoch": 3.7538367096378145, + "grad_norm": 
0.26164770126342773, + "learning_rate": 7.185271254465295e-05, + "loss": 1.7511, + "step": 12230 + }, + { + "epoch": 3.75414364640884, + "grad_norm": 0.30725157260894775, + "learning_rate": 7.184824174044698e-05, + "loss": 1.7661, + "step": 12231 + }, + { + "epoch": 3.7544505831798647, + "grad_norm": 0.33111417293548584, + "learning_rate": 7.184377072032386e-05, + "loss": 1.7341, + "step": 12232 + }, + { + "epoch": 3.75475751995089, + "grad_norm": 0.23978343605995178, + "learning_rate": 7.183929948432779e-05, + "loss": 1.7151, + "step": 12233 + }, + { + "epoch": 3.7550644567219154, + "grad_norm": 0.3057664632797241, + "learning_rate": 7.183482803250299e-05, + "loss": 1.8446, + "step": 12234 + }, + { + "epoch": 3.7553713934929407, + "grad_norm": 0.2629055678844452, + "learning_rate": 7.18303563648936e-05, + "loss": 1.7415, + "step": 12235 + }, + { + "epoch": 3.7556783302639656, + "grad_norm": 0.22703498601913452, + "learning_rate": 7.182588448154386e-05, + "loss": 1.8188, + "step": 12236 + }, + { + "epoch": 3.755985267034991, + "grad_norm": 0.3014034032821655, + "learning_rate": 7.182141238249792e-05, + "loss": 1.8634, + "step": 12237 + }, + { + "epoch": 3.756292203806016, + "grad_norm": 0.28859084844589233, + "learning_rate": 7.181694006779998e-05, + "loss": 1.7509, + "step": 12238 + }, + { + "epoch": 3.756599140577041, + "grad_norm": 0.293720543384552, + "learning_rate": 7.181246753749426e-05, + "loss": 1.777, + "step": 12239 + }, + { + "epoch": 3.7569060773480665, + "grad_norm": 0.2374580055475235, + "learning_rate": 7.180799479162496e-05, + "loss": 1.7492, + "step": 12240 + }, + { + "epoch": 3.7572130141190914, + "grad_norm": 0.30106452107429504, + "learning_rate": 7.180352183023627e-05, + "loss": 1.7538, + "step": 12241 + }, + { + "epoch": 3.7575199508901167, + "grad_norm": 0.3504682183265686, + "learning_rate": 7.179904865337238e-05, + "loss": 1.7477, + "step": 12242 + }, + { + "epoch": 3.7578268876611416, + "grad_norm": 0.2901679575443268, + "learning_rate": 
7.179457526107754e-05, + "loss": 1.9412, + "step": 12243 + }, + { + "epoch": 3.758133824432167, + "grad_norm": 0.37690606713294983, + "learning_rate": 7.179010165339591e-05, + "loss": 1.8222, + "step": 12244 + }, + { + "epoch": 3.7584407612031923, + "grad_norm": 0.45126965641975403, + "learning_rate": 7.178562783037172e-05, + "loss": 1.8563, + "step": 12245 + }, + { + "epoch": 3.758747697974217, + "grad_norm": 0.2747548818588257, + "learning_rate": 7.178115379204921e-05, + "loss": 1.7179, + "step": 12246 + }, + { + "epoch": 3.7590546347452425, + "grad_norm": 0.43243977427482605, + "learning_rate": 7.177667953847257e-05, + "loss": 1.8157, + "step": 12247 + }, + { + "epoch": 3.7593615715162674, + "grad_norm": 0.529448390007019, + "learning_rate": 7.177220506968602e-05, + "loss": 1.8113, + "step": 12248 + }, + { + "epoch": 3.7596685082872927, + "grad_norm": 0.3099314868450165, + "learning_rate": 7.176773038573377e-05, + "loss": 1.7833, + "step": 12249 + }, + { + "epoch": 3.759975445058318, + "grad_norm": 0.3111872375011444, + "learning_rate": 7.176325548666004e-05, + "loss": 1.7965, + "step": 12250 + }, + { + "epoch": 3.7602823818293434, + "grad_norm": 0.38437551259994507, + "learning_rate": 7.175878037250907e-05, + "loss": 1.7822, + "step": 12251 + }, + { + "epoch": 3.7605893186003683, + "grad_norm": 0.33643704652786255, + "learning_rate": 7.175430504332509e-05, + "loss": 1.7839, + "step": 12252 + }, + { + "epoch": 3.7608962553713936, + "grad_norm": 0.24705304205417633, + "learning_rate": 7.174982949915232e-05, + "loss": 1.8302, + "step": 12253 + }, + { + "epoch": 3.7612031921424185, + "grad_norm": 0.3615458309650421, + "learning_rate": 7.174535374003497e-05, + "loss": 1.7963, + "step": 12254 + }, + { + "epoch": 3.761510128913444, + "grad_norm": 0.36486589908599854, + "learning_rate": 7.17408777660173e-05, + "loss": 1.7933, + "step": 12255 + }, + { + "epoch": 3.761817065684469, + "grad_norm": 0.2566867172718048, + "learning_rate": 7.173640157714352e-05, + "loss": 
1.7254, + "step": 12256 + }, + { + "epoch": 3.762124002455494, + "grad_norm": 0.2602523863315582, + "learning_rate": 7.17319251734579e-05, + "loss": 1.7357, + "step": 12257 + }, + { + "epoch": 3.7624309392265194, + "grad_norm": 0.3626105785369873, + "learning_rate": 7.172744855500464e-05, + "loss": 1.7971, + "step": 12258 + }, + { + "epoch": 3.7627378759975443, + "grad_norm": 0.36327603459358215, + "learning_rate": 7.172297172182802e-05, + "loss": 1.7819, + "step": 12259 + }, + { + "epoch": 3.7630448127685696, + "grad_norm": 0.25935736298561096, + "learning_rate": 7.171849467397224e-05, + "loss": 1.8112, + "step": 12260 + }, + { + "epoch": 3.763351749539595, + "grad_norm": 0.2779700756072998, + "learning_rate": 7.171401741148156e-05, + "loss": 1.786, + "step": 12261 + }, + { + "epoch": 3.7636586863106203, + "grad_norm": 0.3089013695716858, + "learning_rate": 7.170953993440025e-05, + "loss": 1.7808, + "step": 12262 + }, + { + "epoch": 3.763965623081645, + "grad_norm": 0.2562308609485626, + "learning_rate": 7.170506224277253e-05, + "loss": 1.8207, + "step": 12263 + }, + { + "epoch": 3.7642725598526705, + "grad_norm": 0.2907634973526001, + "learning_rate": 7.170058433664268e-05, + "loss": 1.7638, + "step": 12264 + }, + { + "epoch": 3.7645794966236954, + "grad_norm": 0.30341312289237976, + "learning_rate": 7.169610621605493e-05, + "loss": 1.7827, + "step": 12265 + }, + { + "epoch": 3.7648864333947207, + "grad_norm": 0.27091866731643677, + "learning_rate": 7.169162788105353e-05, + "loss": 1.786, + "step": 12266 + }, + { + "epoch": 3.765193370165746, + "grad_norm": 0.234042689204216, + "learning_rate": 7.168714933168277e-05, + "loss": 1.7638, + "step": 12267 + }, + { + "epoch": 3.765500306936771, + "grad_norm": 0.2477465271949768, + "learning_rate": 7.168267056798686e-05, + "loss": 1.7275, + "step": 12268 + }, + { + "epoch": 3.7658072437077963, + "grad_norm": 0.25578543543815613, + "learning_rate": 7.167819159001012e-05, + "loss": 1.7831, + "step": 12269 + }, + { + 
"epoch": 3.766114180478821, + "grad_norm": 0.26629674434661865, + "learning_rate": 7.167371239779678e-05, + "loss": 1.7866, + "step": 12270 + }, + { + "epoch": 3.7664211172498465, + "grad_norm": 0.31350967288017273, + "learning_rate": 7.16692329913911e-05, + "loss": 1.7755, + "step": 12271 + }, + { + "epoch": 3.766728054020872, + "grad_norm": 0.2670116126537323, + "learning_rate": 7.166475337083735e-05, + "loss": 1.7524, + "step": 12272 + }, + { + "epoch": 3.7670349907918967, + "grad_norm": 0.26503682136535645, + "learning_rate": 7.166027353617983e-05, + "loss": 1.7867, + "step": 12273 + }, + { + "epoch": 3.767341927562922, + "grad_norm": 0.3674192428588867, + "learning_rate": 7.165579348746278e-05, + "loss": 1.7604, + "step": 12274 + }, + { + "epoch": 3.767648864333947, + "grad_norm": 0.4120824337005615, + "learning_rate": 7.16513132247305e-05, + "loss": 1.7905, + "step": 12275 + }, + { + "epoch": 3.7679558011049723, + "grad_norm": 0.29074826836586, + "learning_rate": 7.164683274802723e-05, + "loss": 1.7539, + "step": 12276 + }, + { + "epoch": 3.7682627378759976, + "grad_norm": 0.22223204374313354, + "learning_rate": 7.164235205739729e-05, + "loss": 1.755, + "step": 12277 + }, + { + "epoch": 3.768569674647023, + "grad_norm": 0.23997461795806885, + "learning_rate": 7.163787115288494e-05, + "loss": 1.8024, + "step": 12278 + }, + { + "epoch": 3.768876611418048, + "grad_norm": 0.2556418776512146, + "learning_rate": 7.163339003453445e-05, + "loss": 1.7717, + "step": 12279 + }, + { + "epoch": 3.769183548189073, + "grad_norm": 0.3107141852378845, + "learning_rate": 7.162890870239013e-05, + "loss": 1.8257, + "step": 12280 + }, + { + "epoch": 3.769490484960098, + "grad_norm": 0.35293644666671753, + "learning_rate": 7.162442715649627e-05, + "loss": 1.7855, + "step": 12281 + }, + { + "epoch": 3.7697974217311234, + "grad_norm": 0.25989311933517456, + "learning_rate": 7.161994539689713e-05, + "loss": 1.7816, + "step": 12282 + }, + { + "epoch": 3.7701043585021488, + 
"grad_norm": 0.25615137815475464, + "learning_rate": 7.161546342363701e-05, + "loss": 1.7738, + "step": 12283 + }, + { + "epoch": 3.7704112952731736, + "grad_norm": 0.29345229268074036, + "learning_rate": 7.161098123676023e-05, + "loss": 1.8496, + "step": 12284 + }, + { + "epoch": 3.770718232044199, + "grad_norm": 0.2975969612598419, + "learning_rate": 7.160649883631105e-05, + "loss": 1.7342, + "step": 12285 + }, + { + "epoch": 3.771025168815224, + "grad_norm": 0.28458064794540405, + "learning_rate": 7.16020162223338e-05, + "loss": 1.8253, + "step": 12286 + }, + { + "epoch": 3.771332105586249, + "grad_norm": 0.2798703908920288, + "learning_rate": 7.159753339487276e-05, + "loss": 1.746, + "step": 12287 + }, + { + "epoch": 3.7716390423572745, + "grad_norm": 0.380044549703598, + "learning_rate": 7.159305035397223e-05, + "loss": 1.769, + "step": 12288 + }, + { + "epoch": 3.7719459791282994, + "grad_norm": 0.28760263323783875, + "learning_rate": 7.158856709967654e-05, + "loss": 1.7466, + "step": 12289 + }, + { + "epoch": 3.7722529158993248, + "grad_norm": 0.23314130306243896, + "learning_rate": 7.158408363202996e-05, + "loss": 1.7545, + "step": 12290 + }, + { + "epoch": 3.7725598526703497, + "grad_norm": 0.2864209711551666, + "learning_rate": 7.15795999510768e-05, + "loss": 1.7549, + "step": 12291 + }, + { + "epoch": 3.772866789441375, + "grad_norm": 0.2605510354042053, + "learning_rate": 7.15751160568614e-05, + "loss": 1.7684, + "step": 12292 + }, + { + "epoch": 3.7731737262124003, + "grad_norm": 0.2475409358739853, + "learning_rate": 7.157063194942806e-05, + "loss": 1.7841, + "step": 12293 + }, + { + "epoch": 3.7734806629834257, + "grad_norm": 0.22479289770126343, + "learning_rate": 7.15661476288211e-05, + "loss": 1.7592, + "step": 12294 + }, + { + "epoch": 3.7737875997544506, + "grad_norm": 0.22076937556266785, + "learning_rate": 7.156166309508482e-05, + "loss": 1.7853, + "step": 12295 + }, + { + "epoch": 3.774094536525476, + "grad_norm": 0.26082465052604675, + 
"learning_rate": 7.155717834826353e-05, + "loss": 1.7828, + "step": 12296 + }, + { + "epoch": 3.7744014732965008, + "grad_norm": 0.24771755933761597, + "learning_rate": 7.15526933884016e-05, + "loss": 1.758, + "step": 12297 + }, + { + "epoch": 3.774708410067526, + "grad_norm": 0.23806311190128326, + "learning_rate": 7.15482082155433e-05, + "loss": 1.7237, + "step": 12298 + }, + { + "epoch": 3.7750153468385514, + "grad_norm": 0.24822844564914703, + "learning_rate": 7.154372282973299e-05, + "loss": 1.7828, + "step": 12299 + }, + { + "epoch": 3.7753222836095763, + "grad_norm": 0.24423740804195404, + "learning_rate": 7.153923723101496e-05, + "loss": 1.8014, + "step": 12300 + }, + { + "epoch": 3.7756292203806017, + "grad_norm": 0.24966634809970856, + "learning_rate": 7.15347514194336e-05, + "loss": 1.8005, + "step": 12301 + }, + { + "epoch": 3.7759361571516266, + "grad_norm": 0.2549348473548889, + "learning_rate": 7.153026539503317e-05, + "loss": 1.8473, + "step": 12302 + }, + { + "epoch": 3.776243093922652, + "grad_norm": 0.23709465563297272, + "learning_rate": 7.152577915785807e-05, + "loss": 1.8031, + "step": 12303 + }, + { + "epoch": 3.7765500306936772, + "grad_norm": 0.28554168343544006, + "learning_rate": 7.152129270795258e-05, + "loss": 1.7836, + "step": 12304 + }, + { + "epoch": 3.776856967464702, + "grad_norm": 0.2568756639957428, + "learning_rate": 7.151680604536107e-05, + "loss": 1.7345, + "step": 12305 + }, + { + "epoch": 3.7771639042357275, + "grad_norm": 0.23883797228336334, + "learning_rate": 7.151231917012787e-05, + "loss": 1.7342, + "step": 12306 + }, + { + "epoch": 3.7774708410067523, + "grad_norm": 0.24026677012443542, + "learning_rate": 7.150783208229732e-05, + "loss": 1.8156, + "step": 12307 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.25756222009658813, + "learning_rate": 7.150334478191376e-05, + "loss": 1.8204, + "step": 12308 + }, + { + "epoch": 3.778084714548803, + "grad_norm": 0.24917428195476532, + "learning_rate": 
7.149885726902156e-05, + "loss": 1.7867, + "step": 12309 + }, + { + "epoch": 3.7783916513198283, + "grad_norm": 0.26269277930259705, + "learning_rate": 7.149436954366504e-05, + "loss": 1.8233, + "step": 12310 + }, + { + "epoch": 3.7786985880908532, + "grad_norm": 0.2502293586730957, + "learning_rate": 7.148988160588857e-05, + "loss": 1.8329, + "step": 12311 + }, + { + "epoch": 3.7790055248618786, + "grad_norm": 0.24845796823501587, + "learning_rate": 7.14853934557365e-05, + "loss": 1.7936, + "step": 12312 + }, + { + "epoch": 3.7793124616329035, + "grad_norm": 0.2453537881374359, + "learning_rate": 7.148090509325315e-05, + "loss": 1.8149, + "step": 12313 + }, + { + "epoch": 3.779619398403929, + "grad_norm": 0.2336922138929367, + "learning_rate": 7.147641651848293e-05, + "loss": 1.7826, + "step": 12314 + }, + { + "epoch": 3.779926335174954, + "grad_norm": 0.25542667508125305, + "learning_rate": 7.147192773147017e-05, + "loss": 1.801, + "step": 12315 + }, + { + "epoch": 3.780233271945979, + "grad_norm": 0.2301866114139557, + "learning_rate": 7.146743873225923e-05, + "loss": 1.7302, + "step": 12316 + }, + { + "epoch": 3.7805402087170044, + "grad_norm": 0.25821468234062195, + "learning_rate": 7.14629495208945e-05, + "loss": 1.7704, + "step": 12317 + }, + { + "epoch": 3.7808471454880292, + "grad_norm": 0.22537970542907715, + "learning_rate": 7.145846009742029e-05, + "loss": 1.7281, + "step": 12318 + }, + { + "epoch": 3.7811540822590546, + "grad_norm": 0.2565869688987732, + "learning_rate": 7.145397046188102e-05, + "loss": 1.8077, + "step": 12319 + }, + { + "epoch": 3.78146101903008, + "grad_norm": 0.2588396966457367, + "learning_rate": 7.144948061432105e-05, + "loss": 1.7438, + "step": 12320 + }, + { + "epoch": 3.781767955801105, + "grad_norm": 0.2538135349750519, + "learning_rate": 7.144499055478472e-05, + "loss": 1.8253, + "step": 12321 + }, + { + "epoch": 3.78207489257213, + "grad_norm": 0.2272680401802063, + "learning_rate": 7.144050028331644e-05, + "loss": 1.7408, + 
"step": 12322 + }, + { + "epoch": 3.782381829343155, + "grad_norm": 0.25010406970977783, + "learning_rate": 7.143600979996055e-05, + "loss": 1.8219, + "step": 12323 + }, + { + "epoch": 3.7826887661141804, + "grad_norm": 0.2560291290283203, + "learning_rate": 7.143151910476144e-05, + "loss": 1.7734, + "step": 12324 + }, + { + "epoch": 3.7829957028852057, + "grad_norm": 0.24927431344985962, + "learning_rate": 7.142702819776352e-05, + "loss": 1.7682, + "step": 12325 + }, + { + "epoch": 3.783302639656231, + "grad_norm": 0.2501368224620819, + "learning_rate": 7.142253707901114e-05, + "loss": 1.818, + "step": 12326 + }, + { + "epoch": 3.783609576427256, + "grad_norm": 0.3132917284965515, + "learning_rate": 7.141804574854871e-05, + "loss": 1.7793, + "step": 12327 + }, + { + "epoch": 3.7839165131982813, + "grad_norm": 0.24229925870895386, + "learning_rate": 7.141355420642057e-05, + "loss": 1.7585, + "step": 12328 + }, + { + "epoch": 3.784223449969306, + "grad_norm": 0.22612906992435455, + "learning_rate": 7.140906245267116e-05, + "loss": 1.7374, + "step": 12329 + }, + { + "epoch": 3.7845303867403315, + "grad_norm": 0.26354333758354187, + "learning_rate": 7.140457048734482e-05, + "loss": 1.7751, + "step": 12330 + }, + { + "epoch": 3.784837323511357, + "grad_norm": 0.21500451862812042, + "learning_rate": 7.140007831048599e-05, + "loss": 1.7827, + "step": 12331 + }, + { + "epoch": 3.7851442602823817, + "grad_norm": 0.2826332151889801, + "learning_rate": 7.139558592213904e-05, + "loss": 1.7522, + "step": 12332 + }, + { + "epoch": 3.785451197053407, + "grad_norm": 0.3217725455760956, + "learning_rate": 7.139109332234837e-05, + "loss": 1.8758, + "step": 12333 + }, + { + "epoch": 3.785758133824432, + "grad_norm": 0.26934614777565, + "learning_rate": 7.138660051115837e-05, + "loss": 1.8322, + "step": 12334 + }, + { + "epoch": 3.7860650705954573, + "grad_norm": 0.2653827667236328, + "learning_rate": 7.138210748861346e-05, + "loss": 1.7651, + "step": 12335 + }, + { + "epoch": 
3.7863720073664826, + "grad_norm": 0.30470311641693115, + "learning_rate": 7.137761425475802e-05, + "loss": 1.855, + "step": 12336 + }, + { + "epoch": 3.786678944137508, + "grad_norm": 0.2558726370334625, + "learning_rate": 7.137312080963647e-05, + "loss": 1.7174, + "step": 12337 + }, + { + "epoch": 3.786985880908533, + "grad_norm": 0.24025602638721466, + "learning_rate": 7.136862715329322e-05, + "loss": 1.7565, + "step": 12338 + }, + { + "epoch": 3.787292817679558, + "grad_norm": 0.34205392003059387, + "learning_rate": 7.136413328577267e-05, + "loss": 1.8116, + "step": 12339 + }, + { + "epoch": 3.787599754450583, + "grad_norm": 0.4069152772426605, + "learning_rate": 7.135963920711923e-05, + "loss": 1.7662, + "step": 12340 + }, + { + "epoch": 3.7879066912216084, + "grad_norm": 0.3915627598762512, + "learning_rate": 7.13551449173773e-05, + "loss": 1.81, + "step": 12341 + }, + { + "epoch": 3.7882136279926337, + "grad_norm": 0.27136507630348206, + "learning_rate": 7.135065041659134e-05, + "loss": 1.7845, + "step": 12342 + }, + { + "epoch": 3.7885205647636586, + "grad_norm": 0.2924078106880188, + "learning_rate": 7.134615570480572e-05, + "loss": 1.8606, + "step": 12343 + }, + { + "epoch": 3.788827501534684, + "grad_norm": 0.35581526160240173, + "learning_rate": 7.134166078206488e-05, + "loss": 1.7785, + "step": 12344 + }, + { + "epoch": 3.789134438305709, + "grad_norm": 0.3003756105899811, + "learning_rate": 7.133716564841324e-05, + "loss": 1.7321, + "step": 12345 + }, + { + "epoch": 3.789441375076734, + "grad_norm": 0.2586000859737396, + "learning_rate": 7.133267030389524e-05, + "loss": 1.7889, + "step": 12346 + }, + { + "epoch": 3.7897483118477595, + "grad_norm": 0.28053075075149536, + "learning_rate": 7.132817474855527e-05, + "loss": 1.8216, + "step": 12347 + }, + { + "epoch": 3.7900552486187844, + "grad_norm": 0.3064870834350586, + "learning_rate": 7.132367898243777e-05, + "loss": 1.7528, + "step": 12348 + }, + { + "epoch": 3.7903621853898097, + "grad_norm": 
0.3045158386230469, + "learning_rate": 7.131918300558719e-05, + "loss": 1.8251, + "step": 12349 + }, + { + "epoch": 3.7906691221608346, + "grad_norm": 0.2438485324382782, + "learning_rate": 7.131468681804794e-05, + "loss": 1.7505, + "step": 12350 + }, + { + "epoch": 3.79097605893186, + "grad_norm": 0.24239958822727203, + "learning_rate": 7.131019041986447e-05, + "loss": 1.7544, + "step": 12351 + }, + { + "epoch": 3.7912829957028853, + "grad_norm": 0.24632441997528076, + "learning_rate": 7.130569381108121e-05, + "loss": 1.7485, + "step": 12352 + }, + { + "epoch": 3.7915899324739106, + "grad_norm": 0.22553624212741852, + "learning_rate": 7.13011969917426e-05, + "loss": 1.803, + "step": 12353 + }, + { + "epoch": 3.7918968692449355, + "grad_norm": 0.2164420485496521, + "learning_rate": 7.129669996189306e-05, + "loss": 1.7307, + "step": 12354 + }, + { + "epoch": 3.792203806015961, + "grad_norm": 0.25104281306266785, + "learning_rate": 7.129220272157705e-05, + "loss": 1.8154, + "step": 12355 + }, + { + "epoch": 3.7925107427869857, + "grad_norm": 0.25533202290534973, + "learning_rate": 7.128770527083903e-05, + "loss": 1.8046, + "step": 12356 + }, + { + "epoch": 3.792817679558011, + "grad_norm": 0.24428130686283112, + "learning_rate": 7.128320760972341e-05, + "loss": 1.7984, + "step": 12357 + }, + { + "epoch": 3.7931246163290364, + "grad_norm": 0.2366408109664917, + "learning_rate": 7.127870973827467e-05, + "loss": 1.7781, + "step": 12358 + }, + { + "epoch": 3.7934315531000613, + "grad_norm": 0.2558888792991638, + "learning_rate": 7.127421165653722e-05, + "loss": 1.7858, + "step": 12359 + }, + { + "epoch": 3.7937384898710866, + "grad_norm": 0.25825443863868713, + "learning_rate": 7.126971336455558e-05, + "loss": 1.8292, + "step": 12360 + }, + { + "epoch": 3.7940454266421115, + "grad_norm": 0.2554624080657959, + "learning_rate": 7.126521486237415e-05, + "loss": 1.822, + "step": 12361 + }, + { + "epoch": 3.794352363413137, + "grad_norm": 0.3030763268470764, + 
"learning_rate": 7.126071615003742e-05, + "loss": 1.8261, + "step": 12362 + }, + { + "epoch": 3.794659300184162, + "grad_norm": 0.3047907054424286, + "learning_rate": 7.125621722758981e-05, + "loss": 1.8419, + "step": 12363 + }, + { + "epoch": 3.794966236955187, + "grad_norm": 0.27782654762268066, + "learning_rate": 7.12517180950758e-05, + "loss": 1.7959, + "step": 12364 + }, + { + "epoch": 3.7952731737262124, + "grad_norm": 0.24526572227478027, + "learning_rate": 7.124721875253986e-05, + "loss": 1.7313, + "step": 12365 + }, + { + "epoch": 3.7955801104972373, + "grad_norm": 0.23718179762363434, + "learning_rate": 7.124271920002646e-05, + "loss": 1.7479, + "step": 12366 + }, + { + "epoch": 3.7958870472682626, + "grad_norm": 0.2880019247531891, + "learning_rate": 7.123821943758004e-05, + "loss": 1.7792, + "step": 12367 + }, + { + "epoch": 3.796193984039288, + "grad_norm": 0.28923723101615906, + "learning_rate": 7.123371946524511e-05, + "loss": 1.7474, + "step": 12368 + }, + { + "epoch": 3.7965009208103133, + "grad_norm": 0.2281525880098343, + "learning_rate": 7.122921928306612e-05, + "loss": 1.8106, + "step": 12369 + }, + { + "epoch": 3.796807857581338, + "grad_norm": 0.34825438261032104, + "learning_rate": 7.122471889108752e-05, + "loss": 1.8076, + "step": 12370 + }, + { + "epoch": 3.7971147943523635, + "grad_norm": 0.41145995259284973, + "learning_rate": 7.122021828935382e-05, + "loss": 1.7692, + "step": 12371 + }, + { + "epoch": 3.7974217311233884, + "grad_norm": 0.31711262464523315, + "learning_rate": 7.12157174779095e-05, + "loss": 1.8101, + "step": 12372 + }, + { + "epoch": 3.7977286678944138, + "grad_norm": 0.3044308125972748, + "learning_rate": 7.1211216456799e-05, + "loss": 1.8238, + "step": 12373 + }, + { + "epoch": 3.798035604665439, + "grad_norm": 0.3750055134296417, + "learning_rate": 7.120671522606683e-05, + "loss": 1.7323, + "step": 12374 + }, + { + "epoch": 3.798342541436464, + "grad_norm": 0.38852599263191223, + "learning_rate": 
7.120221378575749e-05, + "loss": 1.8402, + "step": 12375 + }, + { + "epoch": 3.7986494782074893, + "grad_norm": 0.3430371582508087, + "learning_rate": 7.119771213591541e-05, + "loss": 1.8369, + "step": 12376 + }, + { + "epoch": 3.798956414978514, + "grad_norm": 0.4787428677082062, + "learning_rate": 7.119321027658515e-05, + "loss": 1.7977, + "step": 12377 + }, + { + "epoch": 3.7992633517495396, + "grad_norm": 0.4263977110385895, + "learning_rate": 7.118870820781114e-05, + "loss": 1.8208, + "step": 12378 + }, + { + "epoch": 3.799570288520565, + "grad_norm": 0.28649669885635376, + "learning_rate": 7.118420592963793e-05, + "loss": 1.773, + "step": 12379 + }, + { + "epoch": 3.7998772252915898, + "grad_norm": 0.26070261001586914, + "learning_rate": 7.117970344210996e-05, + "loss": 1.6866, + "step": 12380 + }, + { + "epoch": 3.800184162062615, + "grad_norm": 0.30127593874931335, + "learning_rate": 7.117520074527173e-05, + "loss": 1.7208, + "step": 12381 + }, + { + "epoch": 3.80049109883364, + "grad_norm": 0.23639258742332458, + "learning_rate": 7.117069783916777e-05, + "loss": 1.7504, + "step": 12382 + }, + { + "epoch": 3.8007980356046653, + "grad_norm": 0.2852858901023865, + "learning_rate": 7.116619472384256e-05, + "loss": 1.7954, + "step": 12383 + }, + { + "epoch": 3.8011049723756907, + "grad_norm": 0.2673225998878479, + "learning_rate": 7.116169139934063e-05, + "loss": 1.7562, + "step": 12384 + }, + { + "epoch": 3.801411909146716, + "grad_norm": 0.21615394949913025, + "learning_rate": 7.115718786570644e-05, + "loss": 1.7126, + "step": 12385 + }, + { + "epoch": 3.801718845917741, + "grad_norm": 0.2165435254573822, + "learning_rate": 7.115268412298453e-05, + "loss": 1.7171, + "step": 12386 + }, + { + "epoch": 3.8020257826887662, + "grad_norm": 0.280564546585083, + "learning_rate": 7.114818017121939e-05, + "loss": 1.7711, + "step": 12387 + }, + { + "epoch": 3.802332719459791, + "grad_norm": 0.3023521304130554, + "learning_rate": 7.114367601045555e-05, + "loss": 1.7538, 
+ "step": 12388 + }, + { + "epoch": 3.8026396562308165, + "grad_norm": 0.27252480387687683, + "learning_rate": 7.11391716407375e-05, + "loss": 1.7604, + "step": 12389 + }, + { + "epoch": 3.802946593001842, + "grad_norm": 0.2122909128665924, + "learning_rate": 7.113466706210976e-05, + "loss": 1.716, + "step": 12390 + }, + { + "epoch": 3.8032535297728667, + "grad_norm": 0.30141574144363403, + "learning_rate": 7.113016227461686e-05, + "loss": 1.7636, + "step": 12391 + }, + { + "epoch": 3.803560466543892, + "grad_norm": 0.33359697461128235, + "learning_rate": 7.112565727830331e-05, + "loss": 1.7805, + "step": 12392 + }, + { + "epoch": 3.803867403314917, + "grad_norm": 0.3161376714706421, + "learning_rate": 7.112115207321364e-05, + "loss": 1.7974, + "step": 12393 + }, + { + "epoch": 3.8041743400859422, + "grad_norm": 0.29028698801994324, + "learning_rate": 7.111664665939235e-05, + "loss": 1.83, + "step": 12394 + }, + { + "epoch": 3.8044812768569676, + "grad_norm": 0.38829556107521057, + "learning_rate": 7.1112141036884e-05, + "loss": 1.8684, + "step": 12395 + }, + { + "epoch": 3.804788213627993, + "grad_norm": 0.4118283987045288, + "learning_rate": 7.110763520573309e-05, + "loss": 1.7812, + "step": 12396 + }, + { + "epoch": 3.805095150399018, + "grad_norm": 0.3907717168331146, + "learning_rate": 7.110312916598416e-05, + "loss": 1.7789, + "step": 12397 + }, + { + "epoch": 3.805402087170043, + "grad_norm": 0.2768644690513611, + "learning_rate": 7.109862291768173e-05, + "loss": 1.8575, + "step": 12398 + }, + { + "epoch": 3.805709023941068, + "grad_norm": 0.3234006464481354, + "learning_rate": 7.109411646087035e-05, + "loss": 1.7485, + "step": 12399 + }, + { + "epoch": 3.8060159607120934, + "grad_norm": 0.415475994348526, + "learning_rate": 7.108960979559454e-05, + "loss": 1.7363, + "step": 12400 + }, + { + "epoch": 3.8063228974831187, + "grad_norm": 0.38654613494873047, + "learning_rate": 7.108510292189884e-05, + "loss": 1.7907, + "step": 12401 + }, + { + "epoch": 
3.8066298342541436, + "grad_norm": 0.2541481852531433, + "learning_rate": 7.10805958398278e-05, + "loss": 1.8458, + "step": 12402 + }, + { + "epoch": 3.806936771025169, + "grad_norm": 0.32562851905822754, + "learning_rate": 7.107608854942597e-05, + "loss": 1.7989, + "step": 12403 + }, + { + "epoch": 3.807243707796194, + "grad_norm": 0.3628395199775696, + "learning_rate": 7.107158105073786e-05, + "loss": 1.8044, + "step": 12404 + }, + { + "epoch": 3.807550644567219, + "grad_norm": 0.3363969027996063, + "learning_rate": 7.106707334380805e-05, + "loss": 1.8078, + "step": 12405 + }, + { + "epoch": 3.8078575813382445, + "grad_norm": 0.2853989601135254, + "learning_rate": 7.106256542868108e-05, + "loss": 1.7913, + "step": 12406 + }, + { + "epoch": 3.8081645181092694, + "grad_norm": 0.33455806970596313, + "learning_rate": 7.105805730540148e-05, + "loss": 1.7252, + "step": 12407 + }, + { + "epoch": 3.8084714548802947, + "grad_norm": 0.28103405237197876, + "learning_rate": 7.105354897401382e-05, + "loss": 1.6942, + "step": 12408 + }, + { + "epoch": 3.8087783916513196, + "grad_norm": 0.23230718076229095, + "learning_rate": 7.104904043456264e-05, + "loss": 1.7723, + "step": 12409 + }, + { + "epoch": 3.809085328422345, + "grad_norm": 0.2883053421974182, + "learning_rate": 7.104453168709251e-05, + "loss": 1.8015, + "step": 12410 + }, + { + "epoch": 3.8093922651933703, + "grad_norm": 0.28462252020835876, + "learning_rate": 7.104002273164798e-05, + "loss": 1.791, + "step": 12411 + }, + { + "epoch": 3.8096992019643956, + "grad_norm": 0.3004699647426605, + "learning_rate": 7.103551356827363e-05, + "loss": 1.8401, + "step": 12412 + }, + { + "epoch": 3.8100061387354205, + "grad_norm": 0.2546156048774719, + "learning_rate": 7.1031004197014e-05, + "loss": 1.7645, + "step": 12413 + }, + { + "epoch": 3.810313075506446, + "grad_norm": 0.24532915651798248, + "learning_rate": 7.102649461791364e-05, + "loss": 1.8, + "step": 12414 + }, + { + "epoch": 3.8106200122774707, + "grad_norm": 
0.2432405799627304, + "learning_rate": 7.102198483101716e-05, + "loss": 1.7957, + "step": 12415 + }, + { + "epoch": 3.810926949048496, + "grad_norm": 0.24405215680599213, + "learning_rate": 7.101747483636908e-05, + "loss": 1.79, + "step": 12416 + }, + { + "epoch": 3.8112338858195214, + "grad_norm": 0.29519838094711304, + "learning_rate": 7.101296463401401e-05, + "loss": 1.8087, + "step": 12417 + }, + { + "epoch": 3.8115408225905463, + "grad_norm": 0.28205612301826477, + "learning_rate": 7.100845422399652e-05, + "loss": 1.7897, + "step": 12418 + }, + { + "epoch": 3.8118477593615716, + "grad_norm": 0.25014567375183105, + "learning_rate": 7.100394360636115e-05, + "loss": 1.7574, + "step": 12419 + }, + { + "epoch": 3.8121546961325965, + "grad_norm": 0.3133499026298523, + "learning_rate": 7.099943278115251e-05, + "loss": 1.7957, + "step": 12420 + }, + { + "epoch": 3.812461632903622, + "grad_norm": 0.3706473708152771, + "learning_rate": 7.099492174841516e-05, + "loss": 1.8519, + "step": 12421 + }, + { + "epoch": 3.812768569674647, + "grad_norm": 0.30085715651512146, + "learning_rate": 7.09904105081937e-05, + "loss": 1.778, + "step": 12422 + }, + { + "epoch": 3.813075506445672, + "grad_norm": 0.23897981643676758, + "learning_rate": 7.09858990605327e-05, + "loss": 1.7289, + "step": 12423 + }, + { + "epoch": 3.8133824432166974, + "grad_norm": 0.30046290159225464, + "learning_rate": 7.098138740547673e-05, + "loss": 1.8838, + "step": 12424 + }, + { + "epoch": 3.8136893799877223, + "grad_norm": 0.32126328349113464, + "learning_rate": 7.097687554307041e-05, + "loss": 1.7916, + "step": 12425 + }, + { + "epoch": 3.8139963167587476, + "grad_norm": 0.2922256886959076, + "learning_rate": 7.097236347335829e-05, + "loss": 1.8305, + "step": 12426 + }, + { + "epoch": 3.814303253529773, + "grad_norm": 0.2772706151008606, + "learning_rate": 7.0967851196385e-05, + "loss": 1.7694, + "step": 12427 + }, + { + "epoch": 3.8146101903007983, + "grad_norm": 0.25763455033302307, + "learning_rate": 
7.096333871219511e-05, + "loss": 1.8716, + "step": 12428 + }, + { + "epoch": 3.814917127071823, + "grad_norm": 0.2631739377975464, + "learning_rate": 7.095882602083322e-05, + "loss": 1.7771, + "step": 12429 + }, + { + "epoch": 3.8152240638428485, + "grad_norm": 0.29229632019996643, + "learning_rate": 7.095431312234392e-05, + "loss": 1.7865, + "step": 12430 + }, + { + "epoch": 3.8155310006138734, + "grad_norm": 0.2672729790210724, + "learning_rate": 7.094980001677181e-05, + "loss": 1.7848, + "step": 12431 + }, + { + "epoch": 3.8158379373848987, + "grad_norm": 0.2388373166322708, + "learning_rate": 7.094528670416152e-05, + "loss": 1.75, + "step": 12432 + }, + { + "epoch": 3.816144874155924, + "grad_norm": 0.2385305017232895, + "learning_rate": 7.094077318455762e-05, + "loss": 1.748, + "step": 12433 + }, + { + "epoch": 3.816451810926949, + "grad_norm": 0.25421401858329773, + "learning_rate": 7.093625945800471e-05, + "loss": 1.779, + "step": 12434 + }, + { + "epoch": 3.8167587476979743, + "grad_norm": 0.2785158157348633, + "learning_rate": 7.093174552454743e-05, + "loss": 1.8295, + "step": 12435 + }, + { + "epoch": 3.817065684468999, + "grad_norm": 0.2907472252845764, + "learning_rate": 7.092723138423036e-05, + "loss": 1.8216, + "step": 12436 + }, + { + "epoch": 3.8173726212400245, + "grad_norm": 0.253955215215683, + "learning_rate": 7.092271703709814e-05, + "loss": 1.8394, + "step": 12437 + }, + { + "epoch": 3.81767955801105, + "grad_norm": 0.32139912247657776, + "learning_rate": 7.091820248319537e-05, + "loss": 1.8634, + "step": 12438 + }, + { + "epoch": 3.8179864947820747, + "grad_norm": 0.25890466570854187, + "learning_rate": 7.091368772256664e-05, + "loss": 1.7336, + "step": 12439 + }, + { + "epoch": 3.8182934315531, + "grad_norm": 0.2823775112628937, + "learning_rate": 7.090917275525661e-05, + "loss": 1.7927, + "step": 12440 + }, + { + "epoch": 3.818600368324125, + "grad_norm": 0.28739333152770996, + "learning_rate": 7.090465758130988e-05, + "loss": 1.7807, + 
"step": 12441 + }, + { + "epoch": 3.8189073050951503, + "grad_norm": 0.36823949217796326, + "learning_rate": 7.090014220077106e-05, + "loss": 1.7288, + "step": 12442 + }, + { + "epoch": 3.8192142418661756, + "grad_norm": 0.3061312735080719, + "learning_rate": 7.089562661368479e-05, + "loss": 1.8039, + "step": 12443 + }, + { + "epoch": 3.819521178637201, + "grad_norm": 0.25867924094200134, + "learning_rate": 7.089111082009569e-05, + "loss": 1.7678, + "step": 12444 + }, + { + "epoch": 3.819828115408226, + "grad_norm": 0.26834985613822937, + "learning_rate": 7.088659482004837e-05, + "loss": 1.7592, + "step": 12445 + }, + { + "epoch": 3.820135052179251, + "grad_norm": 0.25608211755752563, + "learning_rate": 7.08820786135875e-05, + "loss": 1.7622, + "step": 12446 + }, + { + "epoch": 3.820441988950276, + "grad_norm": 0.2512456774711609, + "learning_rate": 7.087756220075769e-05, + "loss": 1.7648, + "step": 12447 + }, + { + "epoch": 3.8207489257213014, + "grad_norm": 0.2434878647327423, + "learning_rate": 7.087304558160355e-05, + "loss": 1.7435, + "step": 12448 + }, + { + "epoch": 3.8210558624923268, + "grad_norm": 0.26456570625305176, + "learning_rate": 7.086852875616978e-05, + "loss": 1.7342, + "step": 12449 + }, + { + "epoch": 3.8213627992633517, + "grad_norm": 0.2958984971046448, + "learning_rate": 7.086401172450095e-05, + "loss": 1.8532, + "step": 12450 + }, + { + "epoch": 3.821669736034377, + "grad_norm": 0.25939157605171204, + "learning_rate": 7.085949448664172e-05, + "loss": 1.7746, + "step": 12451 + }, + { + "epoch": 3.821976672805402, + "grad_norm": 0.2210223525762558, + "learning_rate": 7.085497704263675e-05, + "loss": 1.7745, + "step": 12452 + }, + { + "epoch": 3.822283609576427, + "grad_norm": 0.2409319430589676, + "learning_rate": 7.085045939253068e-05, + "loss": 1.7981, + "step": 12453 + }, + { + "epoch": 3.8225905463474525, + "grad_norm": 0.26331812143325806, + "learning_rate": 7.084594153636815e-05, + "loss": 1.8163, + "step": 12454 + }, + { + "epoch": 
3.8228974831184774, + "grad_norm": 0.2613828480243683, + "learning_rate": 7.08414234741938e-05, + "loss": 1.8362, + "step": 12455 + }, + { + "epoch": 3.8232044198895028, + "grad_norm": 0.3139529228210449, + "learning_rate": 7.083690520605228e-05, + "loss": 1.8247, + "step": 12456 + }, + { + "epoch": 3.8235113566605277, + "grad_norm": 0.2958570718765259, + "learning_rate": 7.083238673198826e-05, + "loss": 1.8011, + "step": 12457 + }, + { + "epoch": 3.823818293431553, + "grad_norm": 0.2517626881599426, + "learning_rate": 7.082786805204639e-05, + "loss": 1.7353, + "step": 12458 + }, + { + "epoch": 3.8241252302025783, + "grad_norm": 0.2443888783454895, + "learning_rate": 7.082334916627132e-05, + "loss": 1.7916, + "step": 12459 + }, + { + "epoch": 3.8244321669736037, + "grad_norm": 0.283514142036438, + "learning_rate": 7.08188300747077e-05, + "loss": 1.8048, + "step": 12460 + }, + { + "epoch": 3.8247391037446286, + "grad_norm": 0.24775351583957672, + "learning_rate": 7.08143107774002e-05, + "loss": 1.8145, + "step": 12461 + }, + { + "epoch": 3.825046040515654, + "grad_norm": 0.27904003858566284, + "learning_rate": 7.080979127439347e-05, + "loss": 1.8003, + "step": 12462 + }, + { + "epoch": 3.825352977286679, + "grad_norm": 0.24997512996196747, + "learning_rate": 7.08052715657322e-05, + "loss": 1.7962, + "step": 12463 + }, + { + "epoch": 3.825659914057704, + "grad_norm": 0.25874343514442444, + "learning_rate": 7.080075165146104e-05, + "loss": 1.7861, + "step": 12464 + }, + { + "epoch": 3.8259668508287294, + "grad_norm": 0.2964434027671814, + "learning_rate": 7.079623153162467e-05, + "loss": 1.7618, + "step": 12465 + }, + { + "epoch": 3.8262737875997543, + "grad_norm": 0.26403337717056274, + "learning_rate": 7.079171120626774e-05, + "loss": 1.8016, + "step": 12466 + }, + { + "epoch": 3.8265807243707797, + "grad_norm": 0.28369295597076416, + "learning_rate": 7.078719067543494e-05, + "loss": 1.7517, + "step": 12467 + }, + { + "epoch": 3.8268876611418046, + "grad_norm": 
0.254312127828598, + "learning_rate": 7.078266993917093e-05, + "loss": 1.8085, + "step": 12468 + }, + { + "epoch": 3.82719459791283, + "grad_norm": 0.24992622435092926, + "learning_rate": 7.077814899752038e-05, + "loss": 1.7657, + "step": 12469 + }, + { + "epoch": 3.8275015346838552, + "grad_norm": 0.26485762000083923, + "learning_rate": 7.077362785052802e-05, + "loss": 1.7303, + "step": 12470 + }, + { + "epoch": 3.8278084714548806, + "grad_norm": 0.29864901304244995, + "learning_rate": 7.076910649823846e-05, + "loss": 1.7734, + "step": 12471 + }, + { + "epoch": 3.8281154082259055, + "grad_norm": 0.2973599433898926, + "learning_rate": 7.076458494069644e-05, + "loss": 1.8055, + "step": 12472 + }, + { + "epoch": 3.828422344996931, + "grad_norm": 0.2150362730026245, + "learning_rate": 7.07600631779466e-05, + "loss": 1.7377, + "step": 12473 + }, + { + "epoch": 3.8287292817679557, + "grad_norm": 0.26443010568618774, + "learning_rate": 7.075554121003367e-05, + "loss": 1.837, + "step": 12474 + }, + { + "epoch": 3.829036218538981, + "grad_norm": 0.27365007996559143, + "learning_rate": 7.075101903700231e-05, + "loss": 1.7784, + "step": 12475 + }, + { + "epoch": 3.8293431553100064, + "grad_norm": 0.22037263214588165, + "learning_rate": 7.074649665889721e-05, + "loss": 1.8182, + "step": 12476 + }, + { + "epoch": 3.8296500920810312, + "grad_norm": 0.29614946246147156, + "learning_rate": 7.074197407576308e-05, + "loss": 1.7993, + "step": 12477 + }, + { + "epoch": 3.8299570288520566, + "grad_norm": 0.25135520100593567, + "learning_rate": 7.07374512876446e-05, + "loss": 1.8211, + "step": 12478 + }, + { + "epoch": 3.8302639656230815, + "grad_norm": 0.2711503207683563, + "learning_rate": 7.073292829458645e-05, + "loss": 1.8274, + "step": 12479 + }, + { + "epoch": 3.830570902394107, + "grad_norm": 0.38659265637397766, + "learning_rate": 7.072840509663338e-05, + "loss": 1.796, + "step": 12480 + }, + { + "epoch": 3.830877839165132, + "grad_norm": 0.39382728934288025, + 
"learning_rate": 7.072388169383005e-05, + "loss": 1.8439, + "step": 12481 + }, + { + "epoch": 3.831184775936157, + "grad_norm": 0.27570033073425293, + "learning_rate": 7.071935808622118e-05, + "loss": 1.8155, + "step": 12482 + }, + { + "epoch": 3.8314917127071824, + "grad_norm": 0.29054465889930725, + "learning_rate": 7.071483427385147e-05, + "loss": 1.754, + "step": 12483 + }, + { + "epoch": 3.8317986494782073, + "grad_norm": 0.4138031303882599, + "learning_rate": 7.071031025676562e-05, + "loss": 1.7686, + "step": 12484 + }, + { + "epoch": 3.8321055862492326, + "grad_norm": 0.3447251617908478, + "learning_rate": 7.070578603500833e-05, + "loss": 1.8135, + "step": 12485 + }, + { + "epoch": 3.832412523020258, + "grad_norm": 0.265115886926651, + "learning_rate": 7.070126160862436e-05, + "loss": 1.803, + "step": 12486 + }, + { + "epoch": 3.8327194597912833, + "grad_norm": 0.4288817346096039, + "learning_rate": 7.069673697765837e-05, + "loss": 1.7814, + "step": 12487 + }, + { + "epoch": 3.833026396562308, + "grad_norm": 0.4890103340148926, + "learning_rate": 7.06922121421551e-05, + "loss": 1.8318, + "step": 12488 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.3676142990589142, + "learning_rate": 7.068768710215928e-05, + "loss": 1.7792, + "step": 12489 + }, + { + "epoch": 3.8336402701043584, + "grad_norm": 0.23254090547561646, + "learning_rate": 7.068316185771557e-05, + "loss": 1.7154, + "step": 12490 + }, + { + "epoch": 3.8339472068753837, + "grad_norm": 0.35014036297798157, + "learning_rate": 7.067863640886876e-05, + "loss": 1.7031, + "step": 12491 + }, + { + "epoch": 3.834254143646409, + "grad_norm": 0.32155317068099976, + "learning_rate": 7.067411075566353e-05, + "loss": 1.7692, + "step": 12492 + }, + { + "epoch": 3.834561080417434, + "grad_norm": 0.260772705078125, + "learning_rate": 7.066958489814463e-05, + "loss": 1.7488, + "step": 12493 + }, + { + "epoch": 3.8348680171884593, + "grad_norm": 0.2624910771846771, + "learning_rate": 7.066505883635678e-05, + 
"loss": 1.7436, + "step": 12494 + }, + { + "epoch": 3.835174953959484, + "grad_norm": 0.2782299220561981, + "learning_rate": 7.066053257034471e-05, + "loss": 1.8219, + "step": 12495 + }, + { + "epoch": 3.8354818907305095, + "grad_norm": 0.2749497890472412, + "learning_rate": 7.065600610015312e-05, + "loss": 1.8068, + "step": 12496 + }, + { + "epoch": 3.835788827501535, + "grad_norm": 0.2730359733104706, + "learning_rate": 7.06514794258268e-05, + "loss": 1.7588, + "step": 12497 + }, + { + "epoch": 3.8360957642725597, + "grad_norm": 0.3606291711330414, + "learning_rate": 7.064695254741044e-05, + "loss": 1.8509, + "step": 12498 + }, + { + "epoch": 3.836402701043585, + "grad_norm": 0.23282989859580994, + "learning_rate": 7.064242546494879e-05, + "loss": 1.7444, + "step": 12499 + }, + { + "epoch": 3.83670963781461, + "grad_norm": 0.2554507255554199, + "learning_rate": 7.06378981784866e-05, + "loss": 1.7486, + "step": 12500 + }, + { + "epoch": 3.8370165745856353, + "grad_norm": 0.2916143834590912, + "learning_rate": 7.06333706880686e-05, + "loss": 1.8035, + "step": 12501 + }, + { + "epoch": 3.8373235113566606, + "grad_norm": 0.23719090223312378, + "learning_rate": 7.062884299373955e-05, + "loss": 1.7896, + "step": 12502 + }, + { + "epoch": 3.837630448127686, + "grad_norm": 0.2596152126789093, + "learning_rate": 7.062431509554417e-05, + "loss": 1.7944, + "step": 12503 + }, + { + "epoch": 3.837937384898711, + "grad_norm": 0.29140764474868774, + "learning_rate": 7.061978699352723e-05, + "loss": 1.7988, + "step": 12504 + }, + { + "epoch": 3.838244321669736, + "grad_norm": 0.3421068489551544, + "learning_rate": 7.061525868773347e-05, + "loss": 1.751, + "step": 12505 + }, + { + "epoch": 3.838551258440761, + "grad_norm": 0.2705349624156952, + "learning_rate": 7.061073017820764e-05, + "loss": 1.7578, + "step": 12506 + }, + { + "epoch": 3.8388581952117864, + "grad_norm": 0.2403286248445511, + "learning_rate": 7.060620146499448e-05, + "loss": 1.8422, + "step": 12507 + }, + { + 
"epoch": 3.8391651319828117, + "grad_norm": 0.3860442042350769, + "learning_rate": 7.060167254813876e-05, + "loss": 1.8168, + "step": 12508 + }, + { + "epoch": 3.8394720687538366, + "grad_norm": 0.4729512631893158, + "learning_rate": 7.059714342768526e-05, + "loss": 1.7786, + "step": 12509 + }, + { + "epoch": 3.839779005524862, + "grad_norm": 0.3522968888282776, + "learning_rate": 7.059261410367871e-05, + "loss": 1.8749, + "step": 12510 + }, + { + "epoch": 3.840085942295887, + "grad_norm": 0.28071436285972595, + "learning_rate": 7.058808457616386e-05, + "loss": 1.7959, + "step": 12511 + }, + { + "epoch": 3.840392879066912, + "grad_norm": 0.4356439411640167, + "learning_rate": 7.05835548451855e-05, + "loss": 1.8045, + "step": 12512 + }, + { + "epoch": 3.8406998158379375, + "grad_norm": 0.4051562249660492, + "learning_rate": 7.057902491078839e-05, + "loss": 1.7909, + "step": 12513 + }, + { + "epoch": 3.8410067526089624, + "grad_norm": 0.2817205488681793, + "learning_rate": 7.057449477301728e-05, + "loss": 1.8736, + "step": 12514 + }, + { + "epoch": 3.8413136893799877, + "grad_norm": 0.33369559049606323, + "learning_rate": 7.056996443191697e-05, + "loss": 1.7799, + "step": 12515 + }, + { + "epoch": 3.8416206261510126, + "grad_norm": 0.369954913854599, + "learning_rate": 7.056543388753221e-05, + "loss": 1.795, + "step": 12516 + }, + { + "epoch": 3.841927562922038, + "grad_norm": 0.289474755525589, + "learning_rate": 7.056090313990778e-05, + "loss": 1.786, + "step": 12517 + }, + { + "epoch": 3.8422344996930633, + "grad_norm": 0.2431849092245102, + "learning_rate": 7.055637218908845e-05, + "loss": 1.7363, + "step": 12518 + }, + { + "epoch": 3.8425414364640886, + "grad_norm": 0.3736060857772827, + "learning_rate": 7.0551841035119e-05, + "loss": 1.8234, + "step": 12519 + }, + { + "epoch": 3.8428483732351135, + "grad_norm": 0.34008854627609253, + "learning_rate": 7.054730967804422e-05, + "loss": 1.8001, + "step": 12520 + }, + { + "epoch": 3.843155310006139, + "grad_norm": 
0.24852876365184784, + "learning_rate": 7.054277811790887e-05, + "loss": 1.8298, + "step": 12521 + }, + { + "epoch": 3.8434622467771637, + "grad_norm": 0.3491046726703644, + "learning_rate": 7.053824635475777e-05, + "loss": 1.7336, + "step": 12522 + }, + { + "epoch": 3.843769183548189, + "grad_norm": 0.38757824897766113, + "learning_rate": 7.053371438863566e-05, + "loss": 1.8241, + "step": 12523 + }, + { + "epoch": 3.8440761203192144, + "grad_norm": 0.2607647180557251, + "learning_rate": 7.052918221958735e-05, + "loss": 1.7813, + "step": 12524 + }, + { + "epoch": 3.8443830570902393, + "grad_norm": 0.25634410977363586, + "learning_rate": 7.052464984765764e-05, + "loss": 1.7836, + "step": 12525 + }, + { + "epoch": 3.8446899938612646, + "grad_norm": 0.3113503158092499, + "learning_rate": 7.052011727289129e-05, + "loss": 1.8477, + "step": 12526 + }, + { + "epoch": 3.8449969306322895, + "grad_norm": 0.2852596044540405, + "learning_rate": 7.051558449533313e-05, + "loss": 1.7607, + "step": 12527 + }, + { + "epoch": 3.845303867403315, + "grad_norm": 0.24841541051864624, + "learning_rate": 7.051105151502795e-05, + "loss": 1.8109, + "step": 12528 + }, + { + "epoch": 3.84561080417434, + "grad_norm": 0.2231549620628357, + "learning_rate": 7.050651833202053e-05, + "loss": 1.7245, + "step": 12529 + }, + { + "epoch": 3.845917740945365, + "grad_norm": 0.21975892782211304, + "learning_rate": 7.050198494635566e-05, + "loss": 1.7512, + "step": 12530 + }, + { + "epoch": 3.8462246777163904, + "grad_norm": 0.2546280324459076, + "learning_rate": 7.049745135807816e-05, + "loss": 1.8003, + "step": 12531 + }, + { + "epoch": 3.8465316144874153, + "grad_norm": 0.21507929265499115, + "learning_rate": 7.049291756723284e-05, + "loss": 1.7616, + "step": 12532 + }, + { + "epoch": 3.8468385512584407, + "grad_norm": 0.24927987158298492, + "learning_rate": 7.04883835738645e-05, + "loss": 1.7519, + "step": 12533 + }, + { + "epoch": 3.847145488029466, + "grad_norm": 0.24988602101802826, + 
"learning_rate": 7.048384937801793e-05, + "loss": 1.7966, + "step": 12534 + }, + { + "epoch": 3.8474524248004913, + "grad_norm": 0.24039845168590546, + "learning_rate": 7.047931497973798e-05, + "loss": 1.7834, + "step": 12535 + }, + { + "epoch": 3.847759361571516, + "grad_norm": 0.22826696932315826, + "learning_rate": 7.047478037906943e-05, + "loss": 1.7334, + "step": 12536 + }, + { + "epoch": 3.8480662983425415, + "grad_norm": 0.22260744869709015, + "learning_rate": 7.047024557605708e-05, + "loss": 1.787, + "step": 12537 + }, + { + "epoch": 3.8483732351135664, + "grad_norm": 0.2457917332649231, + "learning_rate": 7.046571057074578e-05, + "loss": 1.7865, + "step": 12538 + }, + { + "epoch": 3.8486801718845918, + "grad_norm": 0.23952928185462952, + "learning_rate": 7.046117536318035e-05, + "loss": 1.7764, + "step": 12539 + }, + { + "epoch": 3.848987108655617, + "grad_norm": 0.22186748683452606, + "learning_rate": 7.045663995340557e-05, + "loss": 1.7917, + "step": 12540 + }, + { + "epoch": 3.849294045426642, + "grad_norm": 0.24234962463378906, + "learning_rate": 7.045210434146629e-05, + "loss": 1.7697, + "step": 12541 + }, + { + "epoch": 3.8496009821976673, + "grad_norm": 0.2510770857334137, + "learning_rate": 7.044756852740732e-05, + "loss": 1.8012, + "step": 12542 + }, + { + "epoch": 3.849907918968692, + "grad_norm": 0.24910703301429749, + "learning_rate": 7.044303251127349e-05, + "loss": 1.831, + "step": 12543 + }, + { + "epoch": 3.8502148557397176, + "grad_norm": 0.3159966468811035, + "learning_rate": 7.043849629310964e-05, + "loss": 1.8029, + "step": 12544 + }, + { + "epoch": 3.850521792510743, + "grad_norm": 0.3155403733253479, + "learning_rate": 7.04339598729606e-05, + "loss": 1.7429, + "step": 12545 + }, + { + "epoch": 3.8508287292817682, + "grad_norm": 0.3037515878677368, + "learning_rate": 7.042942325087117e-05, + "loss": 1.8186, + "step": 12546 + }, + { + "epoch": 3.851135666052793, + "grad_norm": 0.2319766730070114, + "learning_rate": 
7.042488642688621e-05, + "loss": 1.7853, + "step": 12547 + }, + { + "epoch": 3.8514426028238185, + "grad_norm": 0.23911969363689423, + "learning_rate": 7.042034940105055e-05, + "loss": 1.8314, + "step": 12548 + }, + { + "epoch": 3.8517495395948433, + "grad_norm": 0.2541846036911011, + "learning_rate": 7.041581217340905e-05, + "loss": 1.8289, + "step": 12549 + }, + { + "epoch": 3.8520564763658687, + "grad_norm": 0.22234943509101868, + "learning_rate": 7.04112747440065e-05, + "loss": 1.7847, + "step": 12550 + }, + { + "epoch": 3.852363413136894, + "grad_norm": 0.2747870981693268, + "learning_rate": 7.04067371128878e-05, + "loss": 1.7875, + "step": 12551 + }, + { + "epoch": 3.852670349907919, + "grad_norm": 0.28589147329330444, + "learning_rate": 7.040219928009775e-05, + "loss": 1.7289, + "step": 12552 + }, + { + "epoch": 3.8529772866789442, + "grad_norm": 0.21180351078510284, + "learning_rate": 7.039766124568119e-05, + "loss": 1.7611, + "step": 12553 + }, + { + "epoch": 3.853284223449969, + "grad_norm": 0.27751782536506653, + "learning_rate": 7.0393123009683e-05, + "loss": 1.7481, + "step": 12554 + }, + { + "epoch": 3.8535911602209945, + "grad_norm": 0.32883307337760925, + "learning_rate": 7.038858457214802e-05, + "loss": 1.7271, + "step": 12555 + }, + { + "epoch": 3.85389809699202, + "grad_norm": 0.30965641140937805, + "learning_rate": 7.03840459331211e-05, + "loss": 1.81, + "step": 12556 + }, + { + "epoch": 3.8542050337630447, + "grad_norm": 0.25184348225593567, + "learning_rate": 7.037950709264709e-05, + "loss": 1.7642, + "step": 12557 + }, + { + "epoch": 3.85451197053407, + "grad_norm": 0.2376822829246521, + "learning_rate": 7.037496805077084e-05, + "loss": 1.7774, + "step": 12558 + }, + { + "epoch": 3.854818907305095, + "grad_norm": 0.2395993024110794, + "learning_rate": 7.03704288075372e-05, + "loss": 1.8397, + "step": 12559 + }, + { + "epoch": 3.8551258440761202, + "grad_norm": 0.26460394263267517, + "learning_rate": 7.036588936299107e-05, + "loss": 1.7472, + 
"step": 12560 + }, + { + "epoch": 3.8554327808471456, + "grad_norm": 0.34742459654808044, + "learning_rate": 7.036134971717725e-05, + "loss": 1.8003, + "step": 12561 + }, + { + "epoch": 3.855739717618171, + "grad_norm": 0.2829316556453705, + "learning_rate": 7.035680987014068e-05, + "loss": 1.7765, + "step": 12562 + }, + { + "epoch": 3.856046654389196, + "grad_norm": 0.3087223172187805, + "learning_rate": 7.035226982192615e-05, + "loss": 1.8462, + "step": 12563 + }, + { + "epoch": 3.856353591160221, + "grad_norm": 0.2806380093097687, + "learning_rate": 7.034772957257858e-05, + "loss": 1.7704, + "step": 12564 + }, + { + "epoch": 3.856660527931246, + "grad_norm": 0.25598087906837463, + "learning_rate": 7.03431891221428e-05, + "loss": 1.7843, + "step": 12565 + }, + { + "epoch": 3.8569674647022714, + "grad_norm": 0.30833700299263, + "learning_rate": 7.033864847066373e-05, + "loss": 1.8404, + "step": 12566 + }, + { + "epoch": 3.8572744014732967, + "grad_norm": 0.29562532901763916, + "learning_rate": 7.03341076181862e-05, + "loss": 1.8044, + "step": 12567 + }, + { + "epoch": 3.8575813382443216, + "grad_norm": 0.2901719808578491, + "learning_rate": 7.03295665647551e-05, + "loss": 1.7789, + "step": 12568 + }, + { + "epoch": 3.857888275015347, + "grad_norm": 0.25453686714172363, + "learning_rate": 7.03250253104153e-05, + "loss": 1.6792, + "step": 12569 + }, + { + "epoch": 3.858195211786372, + "grad_norm": 0.26009416580200195, + "learning_rate": 7.03204838552117e-05, + "loss": 1.7835, + "step": 12570 + }, + { + "epoch": 3.858502148557397, + "grad_norm": 0.28074127435684204, + "learning_rate": 7.031594219918916e-05, + "loss": 1.7932, + "step": 12571 + }, + { + "epoch": 3.8588090853284225, + "grad_norm": 0.3341725170612335, + "learning_rate": 7.031140034239258e-05, + "loss": 1.7439, + "step": 12572 + }, + { + "epoch": 3.8591160220994474, + "grad_norm": 0.28142449259757996, + "learning_rate": 7.030685828486684e-05, + "loss": 1.8263, + "step": 12573 + }, + { + "epoch": 
3.8594229588704727, + "grad_norm": 0.2571438252925873, + "learning_rate": 7.030231602665681e-05, + "loss": 1.7628, + "step": 12574 + }, + { + "epoch": 3.8597298956414976, + "grad_norm": 0.3079041838645935, + "learning_rate": 7.029777356780741e-05, + "loss": 1.7879, + "step": 12575 + }, + { + "epoch": 3.860036832412523, + "grad_norm": 0.2605433464050293, + "learning_rate": 7.029323090836349e-05, + "loss": 1.7841, + "step": 12576 + }, + { + "epoch": 3.8603437691835483, + "grad_norm": 0.24069640040397644, + "learning_rate": 7.028868804836999e-05, + "loss": 1.7939, + "step": 12577 + }, + { + "epoch": 3.8606507059545736, + "grad_norm": 0.26801639795303345, + "learning_rate": 7.028414498787177e-05, + "loss": 1.8082, + "step": 12578 + }, + { + "epoch": 3.8609576427255985, + "grad_norm": 0.28828585147857666, + "learning_rate": 7.027960172691375e-05, + "loss": 1.8094, + "step": 12579 + }, + { + "epoch": 3.861264579496624, + "grad_norm": 0.22927051782608032, + "learning_rate": 7.027505826554082e-05, + "loss": 1.7758, + "step": 12580 + }, + { + "epoch": 3.8615715162676487, + "grad_norm": 0.25755998492240906, + "learning_rate": 7.027051460379788e-05, + "loss": 1.8429, + "step": 12581 + }, + { + "epoch": 3.861878453038674, + "grad_norm": 0.23636581003665924, + "learning_rate": 7.026597074172982e-05, + "loss": 1.7662, + "step": 12582 + }, + { + "epoch": 3.8621853898096994, + "grad_norm": 0.22599349915981293, + "learning_rate": 7.026142667938156e-05, + "loss": 1.7199, + "step": 12583 + }, + { + "epoch": 3.8624923265807243, + "grad_norm": 0.2504875659942627, + "learning_rate": 7.025688241679802e-05, + "loss": 1.8473, + "step": 12584 + }, + { + "epoch": 3.8627992633517496, + "grad_norm": 0.3012976348400116, + "learning_rate": 7.025233795402408e-05, + "loss": 1.8715, + "step": 12585 + }, + { + "epoch": 3.8631062001227745, + "grad_norm": 0.31703677773475647, + "learning_rate": 7.024779329110469e-05, + "loss": 1.8143, + "step": 12586 + }, + { + "epoch": 3.8634131368938, + "grad_norm": 
0.27287593483924866, + "learning_rate": 7.024324842808472e-05, + "loss": 1.7227, + "step": 12587 + }, + { + "epoch": 3.863720073664825, + "grad_norm": 0.24663801491260529, + "learning_rate": 7.02387033650091e-05, + "loss": 1.7529, + "step": 12588 + }, + { + "epoch": 3.86402701043585, + "grad_norm": 0.26127147674560547, + "learning_rate": 7.023415810192277e-05, + "loss": 1.7629, + "step": 12589 + }, + { + "epoch": 3.8643339472068754, + "grad_norm": 0.3457142114639282, + "learning_rate": 7.022961263887062e-05, + "loss": 1.8212, + "step": 12590 + }, + { + "epoch": 3.8646408839779003, + "grad_norm": 0.3296070694923401, + "learning_rate": 7.022506697589759e-05, + "loss": 1.7907, + "step": 12591 + }, + { + "epoch": 3.8649478207489256, + "grad_norm": 0.29474303126335144, + "learning_rate": 7.022052111304858e-05, + "loss": 1.7866, + "step": 12592 + }, + { + "epoch": 3.865254757519951, + "grad_norm": 0.2535403072834015, + "learning_rate": 7.021597505036852e-05, + "loss": 1.7607, + "step": 12593 + }, + { + "epoch": 3.8655616942909763, + "grad_norm": 0.26691222190856934, + "learning_rate": 7.021142878790237e-05, + "loss": 1.8063, + "step": 12594 + }, + { + "epoch": 3.865868631062001, + "grad_norm": 0.2784755229949951, + "learning_rate": 7.020688232569502e-05, + "loss": 1.8065, + "step": 12595 + }, + { + "epoch": 3.8661755678330265, + "grad_norm": 0.23714317381381989, + "learning_rate": 7.020233566379142e-05, + "loss": 1.8317, + "step": 12596 + }, + { + "epoch": 3.8664825046040514, + "grad_norm": 0.25010553002357483, + "learning_rate": 7.019778880223649e-05, + "loss": 1.8493, + "step": 12597 + }, + { + "epoch": 3.8667894413750767, + "grad_norm": 0.2798489034175873, + "learning_rate": 7.01932417410752e-05, + "loss": 1.8134, + "step": 12598 + }, + { + "epoch": 3.867096378146102, + "grad_norm": 0.26199260354042053, + "learning_rate": 7.018869448035243e-05, + "loss": 1.6931, + "step": 12599 + }, + { + "epoch": 3.867403314917127, + "grad_norm": 0.24582891166210175, + 
"learning_rate": 7.018414702011314e-05, + "loss": 1.8076, + "step": 12600 + }, + { + "epoch": 3.8677102516881523, + "grad_norm": 0.25493237376213074, + "learning_rate": 7.01795993604023e-05, + "loss": 1.7851, + "step": 12601 + }, + { + "epoch": 3.868017188459177, + "grad_norm": 0.2607674300670624, + "learning_rate": 7.017505150126483e-05, + "loss": 1.7285, + "step": 12602 + }, + { + "epoch": 3.8683241252302025, + "grad_norm": 0.23629581928253174, + "learning_rate": 7.017050344274568e-05, + "loss": 1.8254, + "step": 12603 + }, + { + "epoch": 3.868631062001228, + "grad_norm": 0.3129318058490753, + "learning_rate": 7.016595518488979e-05, + "loss": 1.7914, + "step": 12604 + }, + { + "epoch": 3.8689379987722528, + "grad_norm": 0.3178271949291229, + "learning_rate": 7.01614067277421e-05, + "loss": 1.8139, + "step": 12605 + }, + { + "epoch": 3.869244935543278, + "grad_norm": 0.3230711817741394, + "learning_rate": 7.015685807134757e-05, + "loss": 1.8203, + "step": 12606 + }, + { + "epoch": 3.869551872314303, + "grad_norm": 0.26339825987815857, + "learning_rate": 7.015230921575118e-05, + "loss": 1.8022, + "step": 12607 + }, + { + "epoch": 3.8698588090853283, + "grad_norm": 0.25337356328964233, + "learning_rate": 7.014776016099785e-05, + "loss": 1.7779, + "step": 12608 + }, + { + "epoch": 3.8701657458563536, + "grad_norm": 0.2506195306777954, + "learning_rate": 7.014321090713253e-05, + "loss": 1.7858, + "step": 12609 + }, + { + "epoch": 3.870472682627379, + "grad_norm": 0.26249951124191284, + "learning_rate": 7.013866145420021e-05, + "loss": 1.8051, + "step": 12610 + }, + { + "epoch": 3.870779619398404, + "grad_norm": 0.25666534900665283, + "learning_rate": 7.013411180224581e-05, + "loss": 1.7945, + "step": 12611 + }, + { + "epoch": 3.871086556169429, + "grad_norm": 0.23901648819446564, + "learning_rate": 7.012956195131433e-05, + "loss": 1.7844, + "step": 12612 + }, + { + "epoch": 3.871393492940454, + "grad_norm": 0.26814451813697815, + "learning_rate": 
7.012501190145071e-05, + "loss": 1.7713, + "step": 12613 + }, + { + "epoch": 3.8717004297114794, + "grad_norm": 0.28377315402030945, + "learning_rate": 7.012046165269995e-05, + "loss": 1.7866, + "step": 12614 + }, + { + "epoch": 3.8720073664825048, + "grad_norm": 0.2751680612564087, + "learning_rate": 7.011591120510699e-05, + "loss": 1.7215, + "step": 12615 + }, + { + "epoch": 3.8723143032535297, + "grad_norm": 0.21988113224506378, + "learning_rate": 7.011136055871679e-05, + "loss": 1.8009, + "step": 12616 + }, + { + "epoch": 3.872621240024555, + "grad_norm": 0.26462143659591675, + "learning_rate": 7.010680971357434e-05, + "loss": 1.7618, + "step": 12617 + }, + { + "epoch": 3.87292817679558, + "grad_norm": 0.29054632782936096, + "learning_rate": 7.010225866972462e-05, + "loss": 1.7549, + "step": 12618 + }, + { + "epoch": 3.873235113566605, + "grad_norm": 0.31341224908828735, + "learning_rate": 7.00977074272126e-05, + "loss": 1.8827, + "step": 12619 + }, + { + "epoch": 3.8735420503376305, + "grad_norm": 0.24252115190029144, + "learning_rate": 7.009315598608324e-05, + "loss": 1.7544, + "step": 12620 + }, + { + "epoch": 3.873848987108656, + "grad_norm": 0.30036893486976624, + "learning_rate": 7.008860434638154e-05, + "loss": 1.7465, + "step": 12621 + }, + { + "epoch": 3.8741559238796808, + "grad_norm": 0.3217438757419586, + "learning_rate": 7.00840525081525e-05, + "loss": 1.72, + "step": 12622 + }, + { + "epoch": 3.874462860650706, + "grad_norm": 0.22507290542125702, + "learning_rate": 7.007950047144105e-05, + "loss": 1.7177, + "step": 12623 + }, + { + "epoch": 3.874769797421731, + "grad_norm": 0.3014441728591919, + "learning_rate": 7.007494823629224e-05, + "loss": 1.7502, + "step": 12624 + }, + { + "epoch": 3.8750767341927563, + "grad_norm": 0.3836904466152191, + "learning_rate": 7.0070395802751e-05, + "loss": 1.7971, + "step": 12625 + }, + { + "epoch": 3.8753836709637817, + "grad_norm": 0.33565691113471985, + "learning_rate": 7.006584317086235e-05, + "loss": 1.7439, 
+ "step": 12626 + }, + { + "epoch": 3.8756906077348066, + "grad_norm": 0.2292134314775467, + "learning_rate": 7.006129034067128e-05, + "loss": 1.7998, + "step": 12627 + }, + { + "epoch": 3.875997544505832, + "grad_norm": 0.26385873556137085, + "learning_rate": 7.005673731222277e-05, + "loss": 1.7914, + "step": 12628 + }, + { + "epoch": 3.876304481276857, + "grad_norm": 0.2854950428009033, + "learning_rate": 7.005218408556184e-05, + "loss": 1.7761, + "step": 12629 + }, + { + "epoch": 3.876611418047882, + "grad_norm": 0.34260645508766174, + "learning_rate": 7.004763066073348e-05, + "loss": 1.8015, + "step": 12630 + }, + { + "epoch": 3.8769183548189075, + "grad_norm": 0.3223683834075928, + "learning_rate": 7.004307703778267e-05, + "loss": 1.7453, + "step": 12631 + }, + { + "epoch": 3.8772252915899323, + "grad_norm": 0.24715089797973633, + "learning_rate": 7.003852321675442e-05, + "loss": 1.7813, + "step": 12632 + }, + { + "epoch": 3.8775322283609577, + "grad_norm": 0.22822390496730804, + "learning_rate": 7.003396919769377e-05, + "loss": 1.7982, + "step": 12633 + }, + { + "epoch": 3.8778391651319826, + "grad_norm": 0.24125081300735474, + "learning_rate": 7.002941498064565e-05, + "loss": 1.8606, + "step": 12634 + }, + { + "epoch": 3.878146101903008, + "grad_norm": 0.23512506484985352, + "learning_rate": 7.002486056565513e-05, + "loss": 1.7469, + "step": 12635 + }, + { + "epoch": 3.8784530386740332, + "grad_norm": 0.2908322215080261, + "learning_rate": 7.00203059527672e-05, + "loss": 1.796, + "step": 12636 + }, + { + "epoch": 3.8787599754450586, + "grad_norm": 0.22931252419948578, + "learning_rate": 7.001575114202689e-05, + "loss": 1.7482, + "step": 12637 + }, + { + "epoch": 3.8790669122160835, + "grad_norm": 0.22574284672737122, + "learning_rate": 7.001119613347917e-05, + "loss": 1.7698, + "step": 12638 + }, + { + "epoch": 3.879373848987109, + "grad_norm": 0.23129726946353912, + "learning_rate": 7.000664092716909e-05, + "loss": 1.776, + "step": 12639 + }, + { + "epoch": 
3.8796807857581337, + "grad_norm": 0.2763366401195526, + "learning_rate": 7.000208552314165e-05, + "loss": 1.7814, + "step": 12640 + }, + { + "epoch": 3.879987722529159, + "grad_norm": 0.29870158433914185, + "learning_rate": 6.99975299214419e-05, + "loss": 1.7467, + "step": 12641 + }, + { + "epoch": 3.8802946593001844, + "grad_norm": 0.33574381470680237, + "learning_rate": 6.999297412211484e-05, + "loss": 1.8159, + "step": 12642 + }, + { + "epoch": 3.8806015960712092, + "grad_norm": 0.30309897661209106, + "learning_rate": 6.998841812520547e-05, + "loss": 1.8454, + "step": 12643 + }, + { + "epoch": 3.8809085328422346, + "grad_norm": 0.27399247884750366, + "learning_rate": 6.998386193075886e-05, + "loss": 1.7956, + "step": 12644 + }, + { + "epoch": 3.8812154696132595, + "grad_norm": 0.28649580478668213, + "learning_rate": 6.997930553881998e-05, + "loss": 1.8308, + "step": 12645 + }, + { + "epoch": 3.881522406384285, + "grad_norm": 0.2716052532196045, + "learning_rate": 6.997474894943392e-05, + "loss": 1.7698, + "step": 12646 + }, + { + "epoch": 3.88182934315531, + "grad_norm": 0.21380536258220673, + "learning_rate": 6.997019216264567e-05, + "loss": 1.7028, + "step": 12647 + }, + { + "epoch": 3.882136279926335, + "grad_norm": 0.25262731313705444, + "learning_rate": 6.996563517850028e-05, + "loss": 1.8236, + "step": 12648 + }, + { + "epoch": 3.8824432166973604, + "grad_norm": 0.21150052547454834, + "learning_rate": 6.996107799704277e-05, + "loss": 1.7437, + "step": 12649 + }, + { + "epoch": 3.8827501534683853, + "grad_norm": 0.2614554464817047, + "learning_rate": 6.995652061831821e-05, + "loss": 1.7575, + "step": 12650 + }, + { + "epoch": 3.8830570902394106, + "grad_norm": 0.214684396982193, + "learning_rate": 6.995196304237159e-05, + "loss": 1.8195, + "step": 12651 + }, + { + "epoch": 3.883364027010436, + "grad_norm": 0.2226872444152832, + "learning_rate": 6.994740526924798e-05, + "loss": 1.7556, + "step": 12652 + }, + { + "epoch": 3.8836709637814613, + "grad_norm": 
0.22270764410495758, + "learning_rate": 6.994284729899246e-05, + "loss": 1.7536, + "step": 12653 + }, + { + "epoch": 3.883977900552486, + "grad_norm": 0.20683564245700836, + "learning_rate": 6.993828913165e-05, + "loss": 1.7728, + "step": 12654 + }, + { + "epoch": 3.8842848373235115, + "grad_norm": 0.23667018115520477, + "learning_rate": 6.993373076726568e-05, + "loss": 1.7819, + "step": 12655 + }, + { + "epoch": 3.8845917740945364, + "grad_norm": 0.2265234887599945, + "learning_rate": 6.992917220588455e-05, + "loss": 1.7502, + "step": 12656 + }, + { + "epoch": 3.8848987108655617, + "grad_norm": 0.24490754306316376, + "learning_rate": 6.992461344755168e-05, + "loss": 1.7513, + "step": 12657 + }, + { + "epoch": 3.885205647636587, + "grad_norm": 0.23001348972320557, + "learning_rate": 6.992005449231208e-05, + "loss": 1.733, + "step": 12658 + }, + { + "epoch": 3.885512584407612, + "grad_norm": 0.25424695014953613, + "learning_rate": 6.991549534021084e-05, + "loss": 1.7621, + "step": 12659 + }, + { + "epoch": 3.8858195211786373, + "grad_norm": 0.25552862882614136, + "learning_rate": 6.991093599129299e-05, + "loss": 1.7974, + "step": 12660 + }, + { + "epoch": 3.886126457949662, + "grad_norm": 0.26876959204673767, + "learning_rate": 6.99063764456036e-05, + "loss": 1.7924, + "step": 12661 + }, + { + "epoch": 3.8864333947206875, + "grad_norm": 0.2754429578781128, + "learning_rate": 6.990181670318772e-05, + "loss": 1.7981, + "step": 12662 + }, + { + "epoch": 3.886740331491713, + "grad_norm": 0.281818687915802, + "learning_rate": 6.989725676409044e-05, + "loss": 1.7328, + "step": 12663 + }, + { + "epoch": 3.8870472682627377, + "grad_norm": 0.21676552295684814, + "learning_rate": 6.989269662835681e-05, + "loss": 1.7376, + "step": 12664 + }, + { + "epoch": 3.887354205033763, + "grad_norm": 0.276115745306015, + "learning_rate": 6.98881362960319e-05, + "loss": 1.7784, + "step": 12665 + }, + { + "epoch": 3.887661141804788, + "grad_norm": 0.2806364893913269, + "learning_rate": 
6.988357576716075e-05, + "loss": 1.8078, + "step": 12666 + }, + { + "epoch": 3.8879680785758133, + "grad_norm": 0.27620184421539307, + "learning_rate": 6.987901504178845e-05, + "loss": 1.8115, + "step": 12667 + }, + { + "epoch": 3.8882750153468386, + "grad_norm": 0.23845402896404266, + "learning_rate": 6.987445411996009e-05, + "loss": 1.7485, + "step": 12668 + }, + { + "epoch": 3.888581952117864, + "grad_norm": 0.25063586235046387, + "learning_rate": 6.986989300172071e-05, + "loss": 1.7663, + "step": 12669 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2417975515127182, + "learning_rate": 6.98653316871154e-05, + "loss": 1.7562, + "step": 12670 + }, + { + "epoch": 3.889195825659914, + "grad_norm": 0.24952733516693115, + "learning_rate": 6.986077017618923e-05, + "loss": 1.8063, + "step": 12671 + }, + { + "epoch": 3.889502762430939, + "grad_norm": 0.25847554206848145, + "learning_rate": 6.985620846898732e-05, + "loss": 1.7722, + "step": 12672 + }, + { + "epoch": 3.8898096992019644, + "grad_norm": 0.23762650787830353, + "learning_rate": 6.985164656555471e-05, + "loss": 1.8368, + "step": 12673 + }, + { + "epoch": 3.8901166359729897, + "grad_norm": 0.25346314907073975, + "learning_rate": 6.984708446593648e-05, + "loss": 1.7957, + "step": 12674 + }, + { + "epoch": 3.8904235727440146, + "grad_norm": 0.2466745674610138, + "learning_rate": 6.984252217017774e-05, + "loss": 1.8286, + "step": 12675 + }, + { + "epoch": 3.89073050951504, + "grad_norm": 0.25413215160369873, + "learning_rate": 6.983795967832356e-05, + "loss": 1.7711, + "step": 12676 + }, + { + "epoch": 3.891037446286065, + "grad_norm": 0.2315925806760788, + "learning_rate": 6.983339699041903e-05, + "loss": 1.7546, + "step": 12677 + }, + { + "epoch": 3.89134438305709, + "grad_norm": 0.26473405957221985, + "learning_rate": 6.982883410650925e-05, + "loss": 1.7563, + "step": 12678 + }, + { + "epoch": 3.8916513198281155, + "grad_norm": 0.24176491796970367, + "learning_rate": 6.982427102663932e-05, + "loss": 
1.7734, + "step": 12679 + }, + { + "epoch": 3.891958256599141, + "grad_norm": 0.25444844365119934, + "learning_rate": 6.98197077508543e-05, + "loss": 1.803, + "step": 12680 + }, + { + "epoch": 3.8922651933701657, + "grad_norm": 0.25234144926071167, + "learning_rate": 6.981514427919933e-05, + "loss": 1.8099, + "step": 12681 + }, + { + "epoch": 3.892572130141191, + "grad_norm": 0.2571142315864563, + "learning_rate": 6.98105806117195e-05, + "loss": 1.8618, + "step": 12682 + }, + { + "epoch": 3.892879066912216, + "grad_norm": 0.21235275268554688, + "learning_rate": 6.980601674845988e-05, + "loss": 1.7121, + "step": 12683 + }, + { + "epoch": 3.8931860036832413, + "grad_norm": 0.27078527212142944, + "learning_rate": 6.98014526894656e-05, + "loss": 1.8103, + "step": 12684 + }, + { + "epoch": 3.8934929404542666, + "grad_norm": 0.3198096454143524, + "learning_rate": 6.979688843478176e-05, + "loss": 1.7529, + "step": 12685 + }, + { + "epoch": 3.8937998772252915, + "grad_norm": 0.3170493245124817, + "learning_rate": 6.979232398445345e-05, + "loss": 1.7629, + "step": 12686 + }, + { + "epoch": 3.894106813996317, + "grad_norm": 0.2495265007019043, + "learning_rate": 6.978775933852582e-05, + "loss": 1.7407, + "step": 12687 + }, + { + "epoch": 3.8944137507673418, + "grad_norm": 0.24570141732692719, + "learning_rate": 6.978319449704395e-05, + "loss": 1.7688, + "step": 12688 + }, + { + "epoch": 3.894720687538367, + "grad_norm": 0.23956388235092163, + "learning_rate": 6.977862946005295e-05, + "loss": 1.7115, + "step": 12689 + }, + { + "epoch": 3.8950276243093924, + "grad_norm": 0.21548940241336823, + "learning_rate": 6.977406422759793e-05, + "loss": 1.7611, + "step": 12690 + }, + { + "epoch": 3.8953345610804173, + "grad_norm": 0.25797295570373535, + "learning_rate": 6.976949879972403e-05, + "loss": 1.7688, + "step": 12691 + }, + { + "epoch": 3.8956414978514426, + "grad_norm": 0.28257784247398376, + "learning_rate": 6.976493317647636e-05, + "loss": 1.7517, + "step": 12692 + }, + { + 
"epoch": 3.8959484346224675, + "grad_norm": 0.23828580975532532, + "learning_rate": 6.976036735790004e-05, + "loss": 1.7877, + "step": 12693 + }, + { + "epoch": 3.896255371393493, + "grad_norm": 0.22915001213550568, + "learning_rate": 6.975580134404017e-05, + "loss": 1.7741, + "step": 12694 + }, + { + "epoch": 3.896562308164518, + "grad_norm": 0.22975030541419983, + "learning_rate": 6.97512351349419e-05, + "loss": 1.772, + "step": 12695 + }, + { + "epoch": 3.8968692449355435, + "grad_norm": 0.29515185952186584, + "learning_rate": 6.974666873065034e-05, + "loss": 1.8001, + "step": 12696 + }, + { + "epoch": 3.8971761817065684, + "grad_norm": 0.26904794573783875, + "learning_rate": 6.974210213121064e-05, + "loss": 1.7069, + "step": 12697 + }, + { + "epoch": 3.8974831184775938, + "grad_norm": 0.2549479603767395, + "learning_rate": 6.97375353366679e-05, + "loss": 1.7419, + "step": 12698 + }, + { + "epoch": 3.8977900552486187, + "grad_norm": 0.23750101029872894, + "learning_rate": 6.973296834706729e-05, + "loss": 1.7815, + "step": 12699 + }, + { + "epoch": 3.898096992019644, + "grad_norm": 0.23529762029647827, + "learning_rate": 6.972840116245389e-05, + "loss": 1.8139, + "step": 12700 + }, + { + "epoch": 3.8984039287906693, + "grad_norm": 0.3212098777294159, + "learning_rate": 6.97238337828729e-05, + "loss": 1.7507, + "step": 12701 + }, + { + "epoch": 3.898710865561694, + "grad_norm": 0.3167687952518463, + "learning_rate": 6.971926620836941e-05, + "loss": 1.8062, + "step": 12702 + }, + { + "epoch": 3.8990178023327196, + "grad_norm": 0.31298309564590454, + "learning_rate": 6.971469843898855e-05, + "loss": 1.8127, + "step": 12703 + }, + { + "epoch": 3.8993247391037444, + "grad_norm": 0.2537378668785095, + "learning_rate": 6.971013047477551e-05, + "loss": 1.7675, + "step": 12704 + }, + { + "epoch": 3.8996316758747698, + "grad_norm": 0.24292805790901184, + "learning_rate": 6.97055623157754e-05, + "loss": 1.8004, + "step": 12705 + }, + { + "epoch": 3.899938612645795, + 
"grad_norm": 0.2929537296295166, + "learning_rate": 6.970099396203338e-05, + "loss": 1.7963, + "step": 12706 + }, + { + "epoch": 3.90024554941682, + "grad_norm": 0.30531612038612366, + "learning_rate": 6.969642541359459e-05, + "loss": 1.7347, + "step": 12707 + }, + { + "epoch": 3.9005524861878453, + "grad_norm": 0.3138202726840973, + "learning_rate": 6.969185667050417e-05, + "loss": 1.7987, + "step": 12708 + }, + { + "epoch": 3.9008594229588702, + "grad_norm": 0.2366247922182083, + "learning_rate": 6.96872877328073e-05, + "loss": 1.7671, + "step": 12709 + }, + { + "epoch": 3.9011663597298956, + "grad_norm": 0.26251721382141113, + "learning_rate": 6.96827186005491e-05, + "loss": 1.7657, + "step": 12710 + }, + { + "epoch": 3.901473296500921, + "grad_norm": 0.32497119903564453, + "learning_rate": 6.967814927377474e-05, + "loss": 1.7873, + "step": 12711 + }, + { + "epoch": 3.9017802332719462, + "grad_norm": 0.3290228843688965, + "learning_rate": 6.967357975252939e-05, + "loss": 1.8076, + "step": 12712 + }, + { + "epoch": 3.902087170042971, + "grad_norm": 0.2737300992012024, + "learning_rate": 6.966901003685817e-05, + "loss": 1.7405, + "step": 12713 + }, + { + "epoch": 3.9023941068139965, + "grad_norm": 0.25465309619903564, + "learning_rate": 6.966444012680626e-05, + "loss": 1.8063, + "step": 12714 + }, + { + "epoch": 3.9027010435850213, + "grad_norm": 0.2397255003452301, + "learning_rate": 6.965987002241885e-05, + "loss": 1.8079, + "step": 12715 + }, + { + "epoch": 3.9030079803560467, + "grad_norm": 0.23115718364715576, + "learning_rate": 6.965529972374108e-05, + "loss": 1.8032, + "step": 12716 + }, + { + "epoch": 3.903314917127072, + "grad_norm": 0.2536461055278778, + "learning_rate": 6.96507292308181e-05, + "loss": 1.7477, + "step": 12717 + }, + { + "epoch": 3.903621853898097, + "grad_norm": 0.27151185274124146, + "learning_rate": 6.96461585436951e-05, + "loss": 1.75, + "step": 12718 + }, + { + "epoch": 3.9039287906691222, + "grad_norm": 0.26894113421440125, + 
"learning_rate": 6.964158766241726e-05, + "loss": 1.7816, + "step": 12719 + }, + { + "epoch": 3.904235727440147, + "grad_norm": 0.23541375994682312, + "learning_rate": 6.963701658702972e-05, + "loss": 1.7991, + "step": 12720 + }, + { + "epoch": 3.9045426642111725, + "grad_norm": 0.22142915427684784, + "learning_rate": 6.96324453175777e-05, + "loss": 1.7245, + "step": 12721 + }, + { + "epoch": 3.904849600982198, + "grad_norm": 0.32864269614219666, + "learning_rate": 6.962787385410632e-05, + "loss": 1.7631, + "step": 12722 + }, + { + "epoch": 3.9051565377532227, + "grad_norm": 0.23657776415348053, + "learning_rate": 6.96233021966608e-05, + "loss": 1.8081, + "step": 12723 + }, + { + "epoch": 3.905463474524248, + "grad_norm": 0.24790632724761963, + "learning_rate": 6.961873034528629e-05, + "loss": 1.7193, + "step": 12724 + }, + { + "epoch": 3.905770411295273, + "grad_norm": 0.2517886459827423, + "learning_rate": 6.961415830002801e-05, + "loss": 1.7785, + "step": 12725 + }, + { + "epoch": 3.9060773480662982, + "grad_norm": 0.2340923547744751, + "learning_rate": 6.960958606093113e-05, + "loss": 1.7632, + "step": 12726 + }, + { + "epoch": 3.9063842848373236, + "grad_norm": 0.23260441422462463, + "learning_rate": 6.960501362804079e-05, + "loss": 1.7865, + "step": 12727 + }, + { + "epoch": 3.906691221608349, + "grad_norm": 0.22616329789161682, + "learning_rate": 6.960044100140224e-05, + "loss": 1.7851, + "step": 12728 + }, + { + "epoch": 3.906998158379374, + "grad_norm": 0.2849951982498169, + "learning_rate": 6.959586818106064e-05, + "loss": 1.8618, + "step": 12729 + }, + { + "epoch": 3.907305095150399, + "grad_norm": 0.3279374837875366, + "learning_rate": 6.95912951670612e-05, + "loss": 1.8563, + "step": 12730 + }, + { + "epoch": 3.907612031921424, + "grad_norm": 0.24359555542469025, + "learning_rate": 6.958672195944906e-05, + "loss": 1.7604, + "step": 12731 + }, + { + "epoch": 3.9079189686924494, + "grad_norm": 0.30881935358047485, + "learning_rate": 
6.958214855826947e-05, + "loss": 1.8463, + "step": 12732 + }, + { + "epoch": 3.9082259054634747, + "grad_norm": 0.25361543893814087, + "learning_rate": 6.957757496356763e-05, + "loss": 1.7831, + "step": 12733 + }, + { + "epoch": 3.9085328422344996, + "grad_norm": 0.26763513684272766, + "learning_rate": 6.957300117538869e-05, + "loss": 1.8383, + "step": 12734 + }, + { + "epoch": 3.908839779005525, + "grad_norm": 0.2238057255744934, + "learning_rate": 6.95684271937779e-05, + "loss": 1.7702, + "step": 12735 + }, + { + "epoch": 3.90914671577655, + "grad_norm": 0.22110232710838318, + "learning_rate": 6.956385301878045e-05, + "loss": 1.7931, + "step": 12736 + }, + { + "epoch": 3.909453652547575, + "grad_norm": 0.23765070736408234, + "learning_rate": 6.955927865044152e-05, + "loss": 1.7212, + "step": 12737 + }, + { + "epoch": 3.9097605893186005, + "grad_norm": 0.22324508428573608, + "learning_rate": 6.955470408880633e-05, + "loss": 1.7161, + "step": 12738 + }, + { + "epoch": 3.9100675260896254, + "grad_norm": 0.22485347092151642, + "learning_rate": 6.955012933392012e-05, + "loss": 1.7374, + "step": 12739 + }, + { + "epoch": 3.9103744628606507, + "grad_norm": 0.28046715259552, + "learning_rate": 6.954555438582806e-05, + "loss": 1.9264, + "step": 12740 + }, + { + "epoch": 3.9106813996316756, + "grad_norm": 0.26391276717185974, + "learning_rate": 6.954097924457536e-05, + "loss": 1.7343, + "step": 12741 + }, + { + "epoch": 3.910988336402701, + "grad_norm": 0.29596614837646484, + "learning_rate": 6.953640391020726e-05, + "loss": 1.8111, + "step": 12742 + }, + { + "epoch": 3.9112952731737263, + "grad_norm": 0.2709808051586151, + "learning_rate": 6.953182838276896e-05, + "loss": 1.7776, + "step": 12743 + }, + { + "epoch": 3.9116022099447516, + "grad_norm": 0.2585100531578064, + "learning_rate": 6.952725266230571e-05, + "loss": 1.7774, + "step": 12744 + }, + { + "epoch": 3.9119091467157765, + "grad_norm": 0.26490530371665955, + "learning_rate": 6.952267674886268e-05, + "loss": 
1.78, + "step": 12745 + }, + { + "epoch": 3.912216083486802, + "grad_norm": 0.23654767870903015, + "learning_rate": 6.951810064248512e-05, + "loss": 1.8263, + "step": 12746 + }, + { + "epoch": 3.9125230202578267, + "grad_norm": 0.2495296597480774, + "learning_rate": 6.951352434321826e-05, + "loss": 1.787, + "step": 12747 + }, + { + "epoch": 3.912829957028852, + "grad_norm": 0.24038313329219818, + "learning_rate": 6.950894785110728e-05, + "loss": 1.774, + "step": 12748 + }, + { + "epoch": 3.9131368937998774, + "grad_norm": 0.23738732933998108, + "learning_rate": 6.950437116619749e-05, + "loss": 1.7401, + "step": 12749 + }, + { + "epoch": 3.9134438305709023, + "grad_norm": 0.28192025423049927, + "learning_rate": 6.949979428853405e-05, + "loss": 1.8416, + "step": 12750 + }, + { + "epoch": 3.9137507673419276, + "grad_norm": 0.30579057335853577, + "learning_rate": 6.949521721816221e-05, + "loss": 1.7404, + "step": 12751 + }, + { + "epoch": 3.9140577041129525, + "grad_norm": 0.23972894251346588, + "learning_rate": 6.949063995512721e-05, + "loss": 1.7543, + "step": 12752 + }, + { + "epoch": 3.914364640883978, + "grad_norm": 0.2837793231010437, + "learning_rate": 6.94860624994743e-05, + "loss": 1.7779, + "step": 12753 + }, + { + "epoch": 3.914671577655003, + "grad_norm": 0.3344916105270386, + "learning_rate": 6.948148485124868e-05, + "loss": 1.7803, + "step": 12754 + }, + { + "epoch": 3.9149785144260285, + "grad_norm": 0.24271291494369507, + "learning_rate": 6.94769070104956e-05, + "loss": 1.7362, + "step": 12755 + }, + { + "epoch": 3.9152854511970534, + "grad_norm": 0.25299304723739624, + "learning_rate": 6.947232897726031e-05, + "loss": 1.7685, + "step": 12756 + }, + { + "epoch": 3.9155923879680787, + "grad_norm": 0.24766205251216888, + "learning_rate": 6.946775075158807e-05, + "loss": 1.829, + "step": 12757 + }, + { + "epoch": 3.9158993247391036, + "grad_norm": 0.2508428692817688, + "learning_rate": 6.94631723335241e-05, + "loss": 1.809, + "step": 12758 + }, + { + 
"epoch": 3.916206261510129, + "grad_norm": 0.2172096222639084, + "learning_rate": 6.945859372311365e-05, + "loss": 1.7376, + "step": 12759 + }, + { + "epoch": 3.9165131982811543, + "grad_norm": 0.28976425528526306, + "learning_rate": 6.945401492040198e-05, + "loss": 1.8229, + "step": 12760 + }, + { + "epoch": 3.916820135052179, + "grad_norm": 0.3528063893318176, + "learning_rate": 6.944943592543432e-05, + "loss": 1.7559, + "step": 12761 + }, + { + "epoch": 3.9171270718232045, + "grad_norm": 0.46312370896339417, + "learning_rate": 6.944485673825595e-05, + "loss": 1.7664, + "step": 12762 + }, + { + "epoch": 3.9174340085942294, + "grad_norm": 0.4466164708137512, + "learning_rate": 6.94402773589121e-05, + "loss": 1.7833, + "step": 12763 + }, + { + "epoch": 3.9177409453652547, + "grad_norm": 0.2637740969657898, + "learning_rate": 6.943569778744804e-05, + "loss": 1.818, + "step": 12764 + }, + { + "epoch": 3.91804788213628, + "grad_norm": 0.37515267729759216, + "learning_rate": 6.943111802390901e-05, + "loss": 1.7898, + "step": 12765 + }, + { + "epoch": 3.918354818907305, + "grad_norm": 0.45146289467811584, + "learning_rate": 6.942653806834029e-05, + "loss": 1.7797, + "step": 12766 + }, + { + "epoch": 3.9186617556783303, + "grad_norm": 0.2809859812259674, + "learning_rate": 6.942195792078712e-05, + "loss": 1.7836, + "step": 12767 + }, + { + "epoch": 3.918968692449355, + "grad_norm": 0.3606306314468384, + "learning_rate": 6.94173775812948e-05, + "loss": 1.7657, + "step": 12768 + }, + { + "epoch": 3.9192756292203805, + "grad_norm": 0.49528738856315613, + "learning_rate": 6.941279704990857e-05, + "loss": 1.7628, + "step": 12769 + }, + { + "epoch": 3.919582565991406, + "grad_norm": 0.3484322428703308, + "learning_rate": 6.940821632667371e-05, + "loss": 1.7939, + "step": 12770 + }, + { + "epoch": 3.919889502762431, + "grad_norm": 0.2479606419801712, + "learning_rate": 6.940363541163546e-05, + "loss": 1.813, + "step": 12771 + }, + { + "epoch": 3.920196439533456, + "grad_norm": 
0.3491765558719635, + "learning_rate": 6.939905430483911e-05, + "loss": 1.7338, + "step": 12772 + }, + { + "epoch": 3.9205033763044814, + "grad_norm": 0.291810005903244, + "learning_rate": 6.939447300632995e-05, + "loss": 1.7445, + "step": 12773 + }, + { + "epoch": 3.9208103130755063, + "grad_norm": 0.2467527985572815, + "learning_rate": 6.938989151615324e-05, + "loss": 1.8462, + "step": 12774 + }, + { + "epoch": 3.9211172498465316, + "grad_norm": 0.35656824707984924, + "learning_rate": 6.938530983435426e-05, + "loss": 1.7751, + "step": 12775 + }, + { + "epoch": 3.921424186617557, + "grad_norm": 0.31269776821136475, + "learning_rate": 6.938072796097828e-05, + "loss": 1.7714, + "step": 12776 + }, + { + "epoch": 3.921731123388582, + "grad_norm": 0.2082831859588623, + "learning_rate": 6.937614589607058e-05, + "loss": 1.7263, + "step": 12777 + }, + { + "epoch": 3.922038060159607, + "grad_norm": 0.27583765983581543, + "learning_rate": 6.937156363967646e-05, + "loss": 1.6822, + "step": 12778 + }, + { + "epoch": 3.922344996930632, + "grad_norm": 0.32773876190185547, + "learning_rate": 6.93669811918412e-05, + "loss": 1.7792, + "step": 12779 + }, + { + "epoch": 3.9226519337016574, + "grad_norm": 0.2583121657371521, + "learning_rate": 6.936239855261007e-05, + "loss": 1.7812, + "step": 12780 + }, + { + "epoch": 3.9229588704726828, + "grad_norm": 0.245570570230484, + "learning_rate": 6.935781572202836e-05, + "loss": 1.7252, + "step": 12781 + }, + { + "epoch": 3.9232658072437077, + "grad_norm": 0.2379419505596161, + "learning_rate": 6.935323270014138e-05, + "loss": 1.7485, + "step": 12782 + }, + { + "epoch": 3.923572744014733, + "grad_norm": 0.2239784598350525, + "learning_rate": 6.934864948699439e-05, + "loss": 1.7444, + "step": 12783 + }, + { + "epoch": 3.923879680785758, + "grad_norm": 0.2366618812084198, + "learning_rate": 6.934406608263274e-05, + "loss": 1.777, + "step": 12784 + }, + { + "epoch": 3.924186617556783, + "grad_norm": 0.22583791613578796, + "learning_rate": 
6.933948248710169e-05, + "loss": 1.7291, + "step": 12785 + }, + { + "epoch": 3.9244935543278086, + "grad_norm": 0.24141047894954681, + "learning_rate": 6.933489870044651e-05, + "loss": 1.7748, + "step": 12786 + }, + { + "epoch": 3.924800491098834, + "grad_norm": 0.2389962524175644, + "learning_rate": 6.933031472271255e-05, + "loss": 1.7957, + "step": 12787 + }, + { + "epoch": 3.925107427869859, + "grad_norm": 0.25230300426483154, + "learning_rate": 6.932573055394509e-05, + "loss": 1.7621, + "step": 12788 + }, + { + "epoch": 3.925414364640884, + "grad_norm": 0.23894043266773224, + "learning_rate": 6.932114619418941e-05, + "loss": 1.7285, + "step": 12789 + }, + { + "epoch": 3.925721301411909, + "grad_norm": 0.2650291919708252, + "learning_rate": 6.931656164349086e-05, + "loss": 1.7613, + "step": 12790 + }, + { + "epoch": 3.9260282381829343, + "grad_norm": 0.20616789162158966, + "learning_rate": 6.931197690189472e-05, + "loss": 1.7505, + "step": 12791 + }, + { + "epoch": 3.9263351749539597, + "grad_norm": 0.23915675282478333, + "learning_rate": 6.930739196944633e-05, + "loss": 1.7477, + "step": 12792 + }, + { + "epoch": 3.9266421117249846, + "grad_norm": 0.2522687613964081, + "learning_rate": 6.930280684619094e-05, + "loss": 1.8, + "step": 12793 + }, + { + "epoch": 3.92694904849601, + "grad_norm": 0.264167845249176, + "learning_rate": 6.929822153217391e-05, + "loss": 1.7516, + "step": 12794 + }, + { + "epoch": 3.927255985267035, + "grad_norm": 0.21358054876327515, + "learning_rate": 6.929363602744054e-05, + "loss": 1.7207, + "step": 12795 + }, + { + "epoch": 3.92756292203806, + "grad_norm": 0.25632721185684204, + "learning_rate": 6.928905033203617e-05, + "loss": 1.7446, + "step": 12796 + }, + { + "epoch": 3.9278698588090855, + "grad_norm": 0.2717185318470001, + "learning_rate": 6.928446444600608e-05, + "loss": 1.8555, + "step": 12797 + }, + { + "epoch": 3.9281767955801103, + "grad_norm": 0.2871767282485962, + "learning_rate": 6.927987836939561e-05, + "loss": 1.7861, + 
"step": 12798 + }, + { + "epoch": 3.9284837323511357, + "grad_norm": 0.282507061958313, + "learning_rate": 6.927529210225009e-05, + "loss": 1.7683, + "step": 12799 + }, + { + "epoch": 3.9287906691221606, + "grad_norm": 0.24870644509792328, + "learning_rate": 6.927070564461482e-05, + "loss": 1.7355, + "step": 12800 + }, + { + "epoch": 3.929097605893186, + "grad_norm": 0.2093631625175476, + "learning_rate": 6.926611899653516e-05, + "loss": 1.7691, + "step": 12801 + }, + { + "epoch": 3.9294045426642112, + "grad_norm": 0.34258076548576355, + "learning_rate": 6.926153215805642e-05, + "loss": 1.8398, + "step": 12802 + }, + { + "epoch": 3.9297114794352366, + "grad_norm": 0.39179500937461853, + "learning_rate": 6.925694512922391e-05, + "loss": 1.8229, + "step": 12803 + }, + { + "epoch": 3.9300184162062615, + "grad_norm": 0.36814743280410767, + "learning_rate": 6.9252357910083e-05, + "loss": 1.7759, + "step": 12804 + }, + { + "epoch": 3.930325352977287, + "grad_norm": 0.2659403085708618, + "learning_rate": 6.924777050067902e-05, + "loss": 1.7553, + "step": 12805 + }, + { + "epoch": 3.9306322897483117, + "grad_norm": 0.20617491006851196, + "learning_rate": 6.924318290105724e-05, + "loss": 1.7398, + "step": 12806 + }, + { + "epoch": 3.930939226519337, + "grad_norm": 0.23730522394180298, + "learning_rate": 6.923859511126309e-05, + "loss": 1.699, + "step": 12807 + }, + { + "epoch": 3.9312461632903624, + "grad_norm": 0.24865423142910004, + "learning_rate": 6.923400713134184e-05, + "loss": 1.7801, + "step": 12808 + }, + { + "epoch": 3.9315531000613873, + "grad_norm": 0.2495356798171997, + "learning_rate": 6.92294189613389e-05, + "loss": 1.803, + "step": 12809 + }, + { + "epoch": 3.9318600368324126, + "grad_norm": 0.24223244190216064, + "learning_rate": 6.922483060129955e-05, + "loss": 1.751, + "step": 12810 + }, + { + "epoch": 3.9321669736034375, + "grad_norm": 0.2541450262069702, + "learning_rate": 6.922024205126913e-05, + "loss": 1.7721, + "step": 12811 + }, + { + "epoch": 
3.932473910374463, + "grad_norm": 0.24528831243515015, + "learning_rate": 6.921565331129304e-05, + "loss": 1.792, + "step": 12812 + }, + { + "epoch": 3.932780847145488, + "grad_norm": 0.22789500653743744, + "learning_rate": 6.921106438141659e-05, + "loss": 1.8455, + "step": 12813 + }, + { + "epoch": 3.933087783916513, + "grad_norm": 0.26267170906066895, + "learning_rate": 6.920647526168515e-05, + "loss": 1.7254, + "step": 12814 + }, + { + "epoch": 3.9333947206875384, + "grad_norm": 0.23044808208942413, + "learning_rate": 6.920188595214406e-05, + "loss": 1.7217, + "step": 12815 + }, + { + "epoch": 3.9337016574585633, + "grad_norm": 0.2304011732339859, + "learning_rate": 6.919729645283867e-05, + "loss": 1.8121, + "step": 12816 + }, + { + "epoch": 3.9340085942295886, + "grad_norm": 0.21516792476177216, + "learning_rate": 6.919270676381435e-05, + "loss": 1.7305, + "step": 12817 + }, + { + "epoch": 3.934315531000614, + "grad_norm": 0.24698840081691742, + "learning_rate": 6.918811688511646e-05, + "loss": 1.7967, + "step": 12818 + }, + { + "epoch": 3.9346224677716393, + "grad_norm": 0.23132537305355072, + "learning_rate": 6.918352681679035e-05, + "loss": 1.7439, + "step": 12819 + }, + { + "epoch": 3.934929404542664, + "grad_norm": 0.2597793936729431, + "learning_rate": 6.917893655888139e-05, + "loss": 1.7882, + "step": 12820 + }, + { + "epoch": 3.9352363413136895, + "grad_norm": 0.23946607112884521, + "learning_rate": 6.917434611143493e-05, + "loss": 1.7991, + "step": 12821 + }, + { + "epoch": 3.9355432780847144, + "grad_norm": 0.25808244943618774, + "learning_rate": 6.916975547449634e-05, + "loss": 1.845, + "step": 12822 + }, + { + "epoch": 3.9358502148557397, + "grad_norm": 0.26082557439804077, + "learning_rate": 6.9165164648111e-05, + "loss": 1.7562, + "step": 12823 + }, + { + "epoch": 3.936157151626765, + "grad_norm": 0.24810053408145905, + "learning_rate": 6.916057363232425e-05, + "loss": 1.778, + "step": 12824 + }, + { + "epoch": 3.93646408839779, + "grad_norm": 
0.24168157577514648, + "learning_rate": 6.91559824271815e-05, + "loss": 1.7628, + "step": 12825 + }, + { + "epoch": 3.9367710251688153, + "grad_norm": 0.23800434172153473, + "learning_rate": 6.91513910327281e-05, + "loss": 1.8063, + "step": 12826 + }, + { + "epoch": 3.93707796193984, + "grad_norm": 0.23055073618888855, + "learning_rate": 6.914679944900944e-05, + "loss": 1.749, + "step": 12827 + }, + { + "epoch": 3.9373848987108655, + "grad_norm": 0.22455987334251404, + "learning_rate": 6.914220767607088e-05, + "loss": 1.7471, + "step": 12828 + }, + { + "epoch": 3.937691835481891, + "grad_norm": 0.21808378398418427, + "learning_rate": 6.913761571395778e-05, + "loss": 1.7503, + "step": 12829 + }, + { + "epoch": 3.937998772252916, + "grad_norm": 0.23136213421821594, + "learning_rate": 6.913302356271556e-05, + "loss": 1.752, + "step": 12830 + }, + { + "epoch": 3.938305709023941, + "grad_norm": 0.29579970240592957, + "learning_rate": 6.912843122238959e-05, + "loss": 1.8028, + "step": 12831 + }, + { + "epoch": 3.9386126457949664, + "grad_norm": 0.28578072786331177, + "learning_rate": 6.912383869302526e-05, + "loss": 1.8183, + "step": 12832 + }, + { + "epoch": 3.9389195825659913, + "grad_norm": 0.2616737186908722, + "learning_rate": 6.911924597466793e-05, + "loss": 1.8366, + "step": 12833 + }, + { + "epoch": 3.9392265193370166, + "grad_norm": 0.29275768995285034, + "learning_rate": 6.911465306736302e-05, + "loss": 1.731, + "step": 12834 + }, + { + "epoch": 3.939533456108042, + "grad_norm": 0.3300873041152954, + "learning_rate": 6.91100599711559e-05, + "loss": 1.8713, + "step": 12835 + }, + { + "epoch": 3.939840392879067, + "grad_norm": 0.2744643986225128, + "learning_rate": 6.910546668609195e-05, + "loss": 1.8479, + "step": 12836 + }, + { + "epoch": 3.940147329650092, + "grad_norm": 0.25248417258262634, + "learning_rate": 6.91008732122166e-05, + "loss": 1.7962, + "step": 12837 + }, + { + "epoch": 3.940454266421117, + "grad_norm": 0.3068546652793884, + "learning_rate": 
6.909627954957521e-05, + "loss": 1.759, + "step": 12838 + }, + { + "epoch": 3.9407612031921424, + "grad_norm": 0.3273559808731079, + "learning_rate": 6.909168569821321e-05, + "loss": 1.814, + "step": 12839 + }, + { + "epoch": 3.9410681399631677, + "grad_norm": 0.31192758679389954, + "learning_rate": 6.908709165817597e-05, + "loss": 1.7906, + "step": 12840 + }, + { + "epoch": 3.9413750767341926, + "grad_norm": 0.24487090110778809, + "learning_rate": 6.90824974295089e-05, + "loss": 1.8238, + "step": 12841 + }, + { + "epoch": 3.941682013505218, + "grad_norm": 0.24863721430301666, + "learning_rate": 6.907790301225743e-05, + "loss": 1.7651, + "step": 12842 + }, + { + "epoch": 3.941988950276243, + "grad_norm": 0.26555630564689636, + "learning_rate": 6.907330840646693e-05, + "loss": 1.8268, + "step": 12843 + }, + { + "epoch": 3.942295887047268, + "grad_norm": 0.2439817190170288, + "learning_rate": 6.906871361218281e-05, + "loss": 1.7291, + "step": 12844 + }, + { + "epoch": 3.9426028238182935, + "grad_norm": 0.2410304993391037, + "learning_rate": 6.906411862945048e-05, + "loss": 1.712, + "step": 12845 + }, + { + "epoch": 3.942909760589319, + "grad_norm": 0.28575149178504944, + "learning_rate": 6.905952345831537e-05, + "loss": 1.7269, + "step": 12846 + }, + { + "epoch": 3.9432166973603437, + "grad_norm": 0.3055815100669861, + "learning_rate": 6.905492809882286e-05, + "loss": 1.7234, + "step": 12847 + }, + { + "epoch": 3.943523634131369, + "grad_norm": 0.2762533724308014, + "learning_rate": 6.905033255101839e-05, + "loss": 1.7768, + "step": 12848 + }, + { + "epoch": 3.943830570902394, + "grad_norm": 0.22819125652313232, + "learning_rate": 6.904573681494738e-05, + "loss": 1.7416, + "step": 12849 + }, + { + "epoch": 3.9441375076734193, + "grad_norm": 0.21664194762706757, + "learning_rate": 6.904114089065523e-05, + "loss": 1.7506, + "step": 12850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.21935151517391205, + "learning_rate": 6.903654477818735e-05, + "loss": 
1.7522, + "step": 12851 + }, + { + "epoch": 3.9447513812154695, + "grad_norm": 0.2204175442457199, + "learning_rate": 6.903194847758918e-05, + "loss": 1.7753, + "step": 12852 + }, + { + "epoch": 3.945058317986495, + "grad_norm": 0.23130151629447937, + "learning_rate": 6.902735198890615e-05, + "loss": 1.7743, + "step": 12853 + }, + { + "epoch": 3.9453652547575198, + "grad_norm": 0.2548399567604065, + "learning_rate": 6.902275531218368e-05, + "loss": 1.8373, + "step": 12854 + }, + { + "epoch": 3.945672191528545, + "grad_norm": 0.2905479371547699, + "learning_rate": 6.901815844746718e-05, + "loss": 1.8336, + "step": 12855 + }, + { + "epoch": 3.9459791282995704, + "grad_norm": 0.2698945105075836, + "learning_rate": 6.90135613948021e-05, + "loss": 1.7498, + "step": 12856 + }, + { + "epoch": 3.9462860650705953, + "grad_norm": 0.24966828525066376, + "learning_rate": 6.900896415423387e-05, + "loss": 1.7664, + "step": 12857 + }, + { + "epoch": 3.9465930018416207, + "grad_norm": 0.23272784054279327, + "learning_rate": 6.90043667258079e-05, + "loss": 1.7742, + "step": 12858 + }, + { + "epoch": 3.9468999386126455, + "grad_norm": 0.2277698516845703, + "learning_rate": 6.899976910956965e-05, + "loss": 1.7465, + "step": 12859 + }, + { + "epoch": 3.947206875383671, + "grad_norm": 0.2376442402601242, + "learning_rate": 6.899517130556454e-05, + "loss": 1.7995, + "step": 12860 + }, + { + "epoch": 3.947513812154696, + "grad_norm": 0.25591593980789185, + "learning_rate": 6.899057331383802e-05, + "loss": 1.8017, + "step": 12861 + }, + { + "epoch": 3.9478207489257215, + "grad_norm": 0.2715262472629547, + "learning_rate": 6.898597513443551e-05, + "loss": 1.7967, + "step": 12862 + }, + { + "epoch": 3.9481276856967464, + "grad_norm": 0.20916256308555603, + "learning_rate": 6.898137676740246e-05, + "loss": 1.7711, + "step": 12863 + }, + { + "epoch": 3.9484346224677718, + "grad_norm": 0.2570229768753052, + "learning_rate": 6.897677821278435e-05, + "loss": 1.833, + "step": 12864 + }, + { + 
"epoch": 3.9487415592387967, + "grad_norm": 0.26343438029289246, + "learning_rate": 6.897217947062657e-05, + "loss": 1.7625, + "step": 12865 + }, + { + "epoch": 3.949048496009822, + "grad_norm": 0.23407024145126343, + "learning_rate": 6.896758054097459e-05, + "loss": 1.7211, + "step": 12866 + }, + { + "epoch": 3.9493554327808473, + "grad_norm": 0.2554715573787689, + "learning_rate": 6.896298142387387e-05, + "loss": 1.8548, + "step": 12867 + }, + { + "epoch": 3.949662369551872, + "grad_norm": 0.24143370985984802, + "learning_rate": 6.895838211936986e-05, + "loss": 1.7635, + "step": 12868 + }, + { + "epoch": 3.9499693063228976, + "grad_norm": 0.24634715914726257, + "learning_rate": 6.8953782627508e-05, + "loss": 1.8012, + "step": 12869 + }, + { + "epoch": 3.9502762430939224, + "grad_norm": 0.22740426659584045, + "learning_rate": 6.894918294833375e-05, + "loss": 1.7294, + "step": 12870 + }, + { + "epoch": 3.950583179864948, + "grad_norm": 0.2651631832122803, + "learning_rate": 6.894458308189257e-05, + "loss": 1.8289, + "step": 12871 + }, + { + "epoch": 3.950890116635973, + "grad_norm": 0.28693267703056335, + "learning_rate": 6.893998302822991e-05, + "loss": 1.8462, + "step": 12872 + }, + { + "epoch": 3.951197053406998, + "grad_norm": 0.26584213972091675, + "learning_rate": 6.893538278739125e-05, + "loss": 1.7621, + "step": 12873 + }, + { + "epoch": 3.9515039901780233, + "grad_norm": 0.29970669746398926, + "learning_rate": 6.893078235942203e-05, + "loss": 1.7659, + "step": 12874 + }, + { + "epoch": 3.9518109269490482, + "grad_norm": 0.2271152138710022, + "learning_rate": 6.892618174436771e-05, + "loss": 1.7151, + "step": 12875 + }, + { + "epoch": 3.9521178637200736, + "grad_norm": 0.24783682823181152, + "learning_rate": 6.892158094227379e-05, + "loss": 1.761, + "step": 12876 + }, + { + "epoch": 3.952424800491099, + "grad_norm": 0.2371140718460083, + "learning_rate": 6.891697995318573e-05, + "loss": 1.7557, + "step": 12877 + }, + { + "epoch": 3.9527317372621242, + 
"grad_norm": 0.29708394408226013, + "learning_rate": 6.891237877714896e-05, + "loss": 1.8629, + "step": 12878 + }, + { + "epoch": 3.953038674033149, + "grad_norm": 0.2724219262599945, + "learning_rate": 6.890777741420899e-05, + "loss": 1.7378, + "step": 12879 + }, + { + "epoch": 3.9533456108041745, + "grad_norm": 0.2227276861667633, + "learning_rate": 6.890317586441126e-05, + "loss": 1.6989, + "step": 12880 + }, + { + "epoch": 3.9536525475751993, + "grad_norm": 0.2546161413192749, + "learning_rate": 6.889857412780128e-05, + "loss": 1.8688, + "step": 12881 + }, + { + "epoch": 3.9539594843462247, + "grad_norm": 0.24882884323596954, + "learning_rate": 6.889397220442452e-05, + "loss": 1.8137, + "step": 12882 + }, + { + "epoch": 3.95426642111725, + "grad_norm": 0.2549113929271698, + "learning_rate": 6.888937009432644e-05, + "loss": 1.8366, + "step": 12883 + }, + { + "epoch": 3.954573357888275, + "grad_norm": 0.30032673478126526, + "learning_rate": 6.888476779755255e-05, + "loss": 1.8267, + "step": 12884 + }, + { + "epoch": 3.9548802946593002, + "grad_norm": 0.2887294292449951, + "learning_rate": 6.888016531414832e-05, + "loss": 1.8295, + "step": 12885 + }, + { + "epoch": 3.955187231430325, + "grad_norm": 0.2947406470775604, + "learning_rate": 6.88755626441592e-05, + "loss": 1.7713, + "step": 12886 + }, + { + "epoch": 3.9554941682013505, + "grad_norm": 0.2967108190059662, + "learning_rate": 6.887095978763072e-05, + "loss": 1.7636, + "step": 12887 + }, + { + "epoch": 3.955801104972376, + "grad_norm": 0.2495311200618744, + "learning_rate": 6.886635674460836e-05, + "loss": 1.8148, + "step": 12888 + }, + { + "epoch": 3.9561080417434007, + "grad_norm": 0.23367099463939667, + "learning_rate": 6.88617535151376e-05, + "loss": 1.7353, + "step": 12889 + }, + { + "epoch": 3.956414978514426, + "grad_norm": 0.36790570616722107, + "learning_rate": 6.885715009926395e-05, + "loss": 1.7853, + "step": 12890 + }, + { + "epoch": 3.9567219152854514, + "grad_norm": 0.5013020038604736, + 
"learning_rate": 6.885254649703287e-05, + "loss": 1.7923, + "step": 12891 + }, + { + "epoch": 3.9570288520564763, + "grad_norm": 0.4446276128292084, + "learning_rate": 6.884794270848988e-05, + "loss": 1.7504, + "step": 12892 + }, + { + "epoch": 3.9573357888275016, + "grad_norm": 0.2478526383638382, + "learning_rate": 6.88433387336805e-05, + "loss": 1.7629, + "step": 12893 + }, + { + "epoch": 3.957642725598527, + "grad_norm": 0.30111798644065857, + "learning_rate": 6.883873457265019e-05, + "loss": 1.8291, + "step": 12894 + }, + { + "epoch": 3.957949662369552, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.883413022544445e-05, + "loss": 1.7919, + "step": 12895 + }, + { + "epoch": 3.958256599140577, + "grad_norm": 0.2895318269729614, + "learning_rate": 6.882952569210881e-05, + "loss": 1.7467, + "step": 12896 + }, + { + "epoch": 3.958563535911602, + "grad_norm": 0.30391454696655273, + "learning_rate": 6.882492097268873e-05, + "loss": 1.8145, + "step": 12897 + }, + { + "epoch": 3.9588704726826274, + "grad_norm": 0.5033623576164246, + "learning_rate": 6.882031606722977e-05, + "loss": 1.8231, + "step": 12898 + }, + { + "epoch": 3.9591774094536527, + "grad_norm": 0.5351777672767639, + "learning_rate": 6.881571097577742e-05, + "loss": 1.807, + "step": 12899 + }, + { + "epoch": 3.9594843462246776, + "grad_norm": 0.35540491342544556, + "learning_rate": 6.881110569837719e-05, + "loss": 1.7626, + "step": 12900 + }, + { + "epoch": 3.959791282995703, + "grad_norm": 0.22447600960731506, + "learning_rate": 6.880650023507457e-05, + "loss": 1.7392, + "step": 12901 + }, + { + "epoch": 3.960098219766728, + "grad_norm": 0.44619202613830566, + "learning_rate": 6.88018945859151e-05, + "loss": 1.8138, + "step": 12902 + }, + { + "epoch": 3.960405156537753, + "grad_norm": 0.41381633281707764, + "learning_rate": 6.879728875094428e-05, + "loss": 1.7676, + "step": 12903 + }, + { + "epoch": 3.9607120933087785, + "grad_norm": 0.2601528465747833, + "learning_rate": 6.879268273020764e-05, 
+ "loss": 1.8406, + "step": 12904 + }, + { + "epoch": 3.961019030079804, + "grad_norm": 0.3309035003185272, + "learning_rate": 6.878807652375071e-05, + "loss": 1.7673, + "step": 12905 + }, + { + "epoch": 3.9613259668508287, + "grad_norm": 0.5281669497489929, + "learning_rate": 6.878347013161899e-05, + "loss": 1.7686, + "step": 12906 + }, + { + "epoch": 3.961632903621854, + "grad_norm": 0.5397645831108093, + "learning_rate": 6.8778863553858e-05, + "loss": 1.8575, + "step": 12907 + }, + { + "epoch": 3.961939840392879, + "grad_norm": 0.329485684633255, + "learning_rate": 6.877425679051327e-05, + "loss": 1.8185, + "step": 12908 + }, + { + "epoch": 3.9622467771639043, + "grad_norm": 0.3012789487838745, + "learning_rate": 6.876964984163034e-05, + "loss": 1.7962, + "step": 12909 + }, + { + "epoch": 3.9625537139349296, + "grad_norm": 0.5596817135810852, + "learning_rate": 6.876504270725472e-05, + "loss": 1.7972, + "step": 12910 + }, + { + "epoch": 3.9628606507059545, + "grad_norm": 0.5374729633331299, + "learning_rate": 6.876043538743197e-05, + "loss": 1.7863, + "step": 12911 + }, + { + "epoch": 3.96316758747698, + "grad_norm": 0.24617290496826172, + "learning_rate": 6.875582788220757e-05, + "loss": 1.7555, + "step": 12912 + }, + { + "epoch": 3.9634745242480047, + "grad_norm": 0.3493972420692444, + "learning_rate": 6.875122019162712e-05, + "loss": 1.8595, + "step": 12913 + }, + { + "epoch": 3.96378146101903, + "grad_norm": 0.4293089807033539, + "learning_rate": 6.874661231573609e-05, + "loss": 1.7647, + "step": 12914 + }, + { + "epoch": 3.9640883977900554, + "grad_norm": 0.30602574348449707, + "learning_rate": 6.874200425458006e-05, + "loss": 1.7122, + "step": 12915 + }, + { + "epoch": 3.9643953345610803, + "grad_norm": 0.22776013612747192, + "learning_rate": 6.873739600820457e-05, + "loss": 1.7136, + "step": 12916 + }, + { + "epoch": 3.9647022713321056, + "grad_norm": 0.3727327585220337, + "learning_rate": 6.873278757665513e-05, + "loss": 1.8314, + "step": 12917 + }, + { 
+ "epoch": 3.9650092081031305, + "grad_norm": 0.35110536217689514, + "learning_rate": 6.872817895997733e-05, + "loss": 1.7506, + "step": 12918 + }, + { + "epoch": 3.965316144874156, + "grad_norm": 0.275560587644577, + "learning_rate": 6.872357015821666e-05, + "loss": 1.7865, + "step": 12919 + }, + { + "epoch": 3.965623081645181, + "grad_norm": 0.2686980366706848, + "learning_rate": 6.871896117141873e-05, + "loss": 1.8431, + "step": 12920 + }, + { + "epoch": 3.9659300184162065, + "grad_norm": 0.3299664556980133, + "learning_rate": 6.871435199962901e-05, + "loss": 1.7988, + "step": 12921 + }, + { + "epoch": 3.9662369551872314, + "grad_norm": 0.2833637297153473, + "learning_rate": 6.870974264289313e-05, + "loss": 1.6993, + "step": 12922 + }, + { + "epoch": 3.9665438919582567, + "grad_norm": 0.25062620639801025, + "learning_rate": 6.870513310125659e-05, + "loss": 1.7814, + "step": 12923 + }, + { + "epoch": 3.9668508287292816, + "grad_norm": 0.26609909534454346, + "learning_rate": 6.870052337476498e-05, + "loss": 1.7871, + "step": 12924 + }, + { + "epoch": 3.967157765500307, + "grad_norm": 0.22760890424251556, + "learning_rate": 6.869591346346382e-05, + "loss": 1.7941, + "step": 12925 + }, + { + "epoch": 3.9674647022713323, + "grad_norm": 0.2845582067966461, + "learning_rate": 6.869130336739869e-05, + "loss": 1.8215, + "step": 12926 + }, + { + "epoch": 3.967771639042357, + "grad_norm": 0.254948228597641, + "learning_rate": 6.868669308661514e-05, + "loss": 1.7515, + "step": 12927 + }, + { + "epoch": 3.9680785758133825, + "grad_norm": 0.2372167855501175, + "learning_rate": 6.868208262115875e-05, + "loss": 1.7524, + "step": 12928 + }, + { + "epoch": 3.9683855125844074, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.867747197107506e-05, + "loss": 1.8139, + "step": 12929 + }, + { + "epoch": 3.9686924493554327, + "grad_norm": 0.2617839276790619, + "learning_rate": 6.867286113640965e-05, + "loss": 1.7388, + "step": 12930 + }, + { + "epoch": 3.968999386126458, + 
"grad_norm": 0.22749558091163635, + "learning_rate": 6.866825011720807e-05, + "loss": 1.7421, + "step": 12931 + }, + { + "epoch": 3.969306322897483, + "grad_norm": 0.27737462520599365, + "learning_rate": 6.86636389135159e-05, + "loss": 1.7977, + "step": 12932 + }, + { + "epoch": 3.9696132596685083, + "grad_norm": 0.3331063985824585, + "learning_rate": 6.865902752537871e-05, + "loss": 1.7925, + "step": 12933 + }, + { + "epoch": 3.969920196439533, + "grad_norm": 0.24229519069194794, + "learning_rate": 6.86544159528421e-05, + "loss": 1.7782, + "step": 12934 + }, + { + "epoch": 3.9702271332105585, + "grad_norm": 0.29494860768318176, + "learning_rate": 6.86498041959516e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 3.970534069981584, + "grad_norm": 0.26064008474349976, + "learning_rate": 6.86451922547528e-05, + "loss": 1.7161, + "step": 12936 + }, + { + "epoch": 3.970841006752609, + "grad_norm": 0.2656785547733307, + "learning_rate": 6.864058012929129e-05, + "loss": 1.8154, + "step": 12937 + }, + { + "epoch": 3.971147943523634, + "grad_norm": 0.21170997619628906, + "learning_rate": 6.863596781961263e-05, + "loss": 1.7614, + "step": 12938 + }, + { + "epoch": 3.9714548802946594, + "grad_norm": 0.21709072589874268, + "learning_rate": 6.863135532576241e-05, + "loss": 1.7896, + "step": 12939 + }, + { + "epoch": 3.9717618170656843, + "grad_norm": 0.2361367791891098, + "learning_rate": 6.862674264778623e-05, + "loss": 1.7775, + "step": 12940 + }, + { + "epoch": 3.9720687538367097, + "grad_norm": 0.22042550146579742, + "learning_rate": 6.862212978572967e-05, + "loss": 1.7781, + "step": 12941 + }, + { + "epoch": 3.972375690607735, + "grad_norm": 0.2535422146320343, + "learning_rate": 6.86175167396383e-05, + "loss": 1.7665, + "step": 12942 + }, + { + "epoch": 3.97268262737876, + "grad_norm": 0.23741906881332397, + "learning_rate": 6.861290350955771e-05, + "loss": 1.7829, + "step": 12943 + }, + { + "epoch": 3.972989564149785, + "grad_norm": 0.23789910972118378, + 
"learning_rate": 6.860829009553351e-05, + "loss": 1.7745, + "step": 12944 + }, + { + "epoch": 3.97329650092081, + "grad_norm": 0.26867765188217163, + "learning_rate": 6.860367649761127e-05, + "loss": 1.7239, + "step": 12945 + }, + { + "epoch": 3.9736034376918354, + "grad_norm": 0.3211663067340851, + "learning_rate": 6.85990627158366e-05, + "loss": 1.7976, + "step": 12946 + }, + { + "epoch": 3.9739103744628608, + "grad_norm": 0.26177310943603516, + "learning_rate": 6.85944487502551e-05, + "loss": 1.7446, + "step": 12947 + }, + { + "epoch": 3.9742173112338857, + "grad_norm": 0.23622745275497437, + "learning_rate": 6.858983460091234e-05, + "loss": 1.7824, + "step": 12948 + }, + { + "epoch": 3.974524248004911, + "grad_norm": 0.24372988939285278, + "learning_rate": 6.858522026785395e-05, + "loss": 1.8014, + "step": 12949 + }, + { + "epoch": 3.974831184775936, + "grad_norm": 0.2566998600959778, + "learning_rate": 6.85806057511255e-05, + "loss": 1.742, + "step": 12950 + }, + { + "epoch": 3.9751381215469612, + "grad_norm": 0.24418365955352783, + "learning_rate": 6.857599105077264e-05, + "loss": 1.7331, + "step": 12951 + }, + { + "epoch": 3.9754450583179866, + "grad_norm": 0.2260327935218811, + "learning_rate": 6.857137616684094e-05, + "loss": 1.7173, + "step": 12952 + }, + { + "epoch": 3.975751995089012, + "grad_norm": 0.277044415473938, + "learning_rate": 6.856676109937602e-05, + "loss": 1.7255, + "step": 12953 + }, + { + "epoch": 3.976058931860037, + "grad_norm": 0.228300079703331, + "learning_rate": 6.856214584842348e-05, + "loss": 1.7796, + "step": 12954 + }, + { + "epoch": 3.976365868631062, + "grad_norm": 0.2246638983488083, + "learning_rate": 6.855753041402893e-05, + "loss": 1.7458, + "step": 12955 + }, + { + "epoch": 3.976672805402087, + "grad_norm": 0.22235621511936188, + "learning_rate": 6.855291479623799e-05, + "loss": 1.7585, + "step": 12956 + }, + { + "epoch": 3.9769797421731123, + "grad_norm": 0.23710694909095764, + "learning_rate": 6.854829899509627e-05, + 
"loss": 1.767, + "step": 12957 + }, + { + "epoch": 3.9772866789441377, + "grad_norm": 0.2527346611022949, + "learning_rate": 6.854368301064939e-05, + "loss": 1.828, + "step": 12958 + }, + { + "epoch": 3.9775936157151626, + "grad_norm": 0.25032514333724976, + "learning_rate": 6.853906684294298e-05, + "loss": 1.8533, + "step": 12959 + }, + { + "epoch": 3.977900552486188, + "grad_norm": 0.2346320003271103, + "learning_rate": 6.853445049202262e-05, + "loss": 1.8046, + "step": 12960 + }, + { + "epoch": 3.978207489257213, + "grad_norm": 0.22576460242271423, + "learning_rate": 6.852983395793398e-05, + "loss": 1.7502, + "step": 12961 + }, + { + "epoch": 3.978514426028238, + "grad_norm": 0.2230147123336792, + "learning_rate": 6.852521724072266e-05, + "loss": 1.7362, + "step": 12962 + }, + { + "epoch": 3.9788213627992635, + "grad_norm": 0.2339705526828766, + "learning_rate": 6.852060034043425e-05, + "loss": 1.763, + "step": 12963 + }, + { + "epoch": 3.979128299570289, + "grad_norm": 0.24511271715164185, + "learning_rate": 6.851598325711446e-05, + "loss": 1.7988, + "step": 12964 + }, + { + "epoch": 3.9794352363413137, + "grad_norm": 0.2927285134792328, + "learning_rate": 6.851136599080885e-05, + "loss": 1.8346, + "step": 12965 + }, + { + "epoch": 3.979742173112339, + "grad_norm": 0.2593212425708771, + "learning_rate": 6.850674854156305e-05, + "loss": 1.7368, + "step": 12966 + }, + { + "epoch": 3.980049109883364, + "grad_norm": 0.3013291656970978, + "learning_rate": 6.850213090942275e-05, + "loss": 1.7911, + "step": 12967 + }, + { + "epoch": 3.9803560466543892, + "grad_norm": 0.3420047163963318, + "learning_rate": 6.849751309443352e-05, + "loss": 1.7899, + "step": 12968 + }, + { + "epoch": 3.9806629834254146, + "grad_norm": 0.2901746928691864, + "learning_rate": 6.849289509664105e-05, + "loss": 1.8244, + "step": 12969 + }, + { + "epoch": 3.9809699201964395, + "grad_norm": 0.2389298677444458, + "learning_rate": 6.848827691609093e-05, + "loss": 1.7116, + "step": 12970 + }, + { + 
"epoch": 3.981276856967465, + "grad_norm": 0.3153960704803467, + "learning_rate": 6.848365855282882e-05, + "loss": 1.7665, + "step": 12971 + }, + { + "epoch": 3.9815837937384897, + "grad_norm": 0.3162175118923187, + "learning_rate": 6.847904000690036e-05, + "loss": 1.7722, + "step": 12972 + }, + { + "epoch": 3.981890730509515, + "grad_norm": 0.27458643913269043, + "learning_rate": 6.847442127835122e-05, + "loss": 1.8095, + "step": 12973 + }, + { + "epoch": 3.9821976672805404, + "grad_norm": 0.22330710291862488, + "learning_rate": 6.846980236722699e-05, + "loss": 1.7179, + "step": 12974 + }, + { + "epoch": 3.9825046040515653, + "grad_norm": 0.2940923869609833, + "learning_rate": 6.846518327357339e-05, + "loss": 1.7363, + "step": 12975 + }, + { + "epoch": 3.9828115408225906, + "grad_norm": 0.26479849219322205, + "learning_rate": 6.846056399743599e-05, + "loss": 1.7788, + "step": 12976 + }, + { + "epoch": 3.9831184775936155, + "grad_norm": 0.24145057797431946, + "learning_rate": 6.845594453886048e-05, + "loss": 1.7825, + "step": 12977 + }, + { + "epoch": 3.983425414364641, + "grad_norm": 0.2795869708061218, + "learning_rate": 6.845132489789252e-05, + "loss": 1.7705, + "step": 12978 + }, + { + "epoch": 3.983732351135666, + "grad_norm": 0.3117202818393707, + "learning_rate": 6.844670507457776e-05, + "loss": 1.8183, + "step": 12979 + }, + { + "epoch": 3.9840392879066915, + "grad_norm": 0.2666899263858795, + "learning_rate": 6.844208506896184e-05, + "loss": 1.7434, + "step": 12980 + }, + { + "epoch": 3.9843462246777164, + "grad_norm": 0.24682332575321198, + "learning_rate": 6.843746488109042e-05, + "loss": 1.751, + "step": 12981 + }, + { + "epoch": 3.9846531614487417, + "grad_norm": 0.2558208703994751, + "learning_rate": 6.843284451100916e-05, + "loss": 1.7983, + "step": 12982 + }, + { + "epoch": 3.9849600982197666, + "grad_norm": 0.4236481189727783, + "learning_rate": 6.842822395876374e-05, + "loss": 1.8584, + "step": 12983 + }, + { + "epoch": 3.985267034990792, + 
"grad_norm": 0.4931485950946808, + "learning_rate": 6.84236032243998e-05, + "loss": 1.7617, + "step": 12984 + }, + { + "epoch": 3.9855739717618173, + "grad_norm": 0.37793654203414917, + "learning_rate": 6.841898230796302e-05, + "loss": 1.7411, + "step": 12985 + }, + { + "epoch": 3.985880908532842, + "grad_norm": 0.2093842774629593, + "learning_rate": 6.841436120949906e-05, + "loss": 1.772, + "step": 12986 + }, + { + "epoch": 3.9861878453038675, + "grad_norm": 0.4065552055835724, + "learning_rate": 6.840973992905359e-05, + "loss": 1.7675, + "step": 12987 + }, + { + "epoch": 3.9864947820748924, + "grad_norm": 0.5334183573722839, + "learning_rate": 6.840511846667228e-05, + "loss": 1.7872, + "step": 12988 + }, + { + "epoch": 3.9868017188459177, + "grad_norm": 0.378974974155426, + "learning_rate": 6.84004968224008e-05, + "loss": 1.8288, + "step": 12989 + }, + { + "epoch": 3.987108655616943, + "grad_norm": 0.22518309950828552, + "learning_rate": 6.839587499628483e-05, + "loss": 1.7715, + "step": 12990 + }, + { + "epoch": 3.987415592387968, + "grad_norm": 0.4270850718021393, + "learning_rate": 6.839125298837003e-05, + "loss": 1.7797, + "step": 12991 + }, + { + "epoch": 3.9877225291589933, + "grad_norm": 0.4629896879196167, + "learning_rate": 6.838663079870211e-05, + "loss": 1.7936, + "step": 12992 + }, + { + "epoch": 3.988029465930018, + "grad_norm": 0.29273948073387146, + "learning_rate": 6.838200842732672e-05, + "loss": 1.8264, + "step": 12993 + }, + { + "epoch": 3.9883364027010435, + "grad_norm": 0.31575852632522583, + "learning_rate": 6.837738587428954e-05, + "loss": 1.8043, + "step": 12994 + }, + { + "epoch": 3.988643339472069, + "grad_norm": 0.40602433681488037, + "learning_rate": 6.837276313963627e-05, + "loss": 1.7409, + "step": 12995 + }, + { + "epoch": 3.988950276243094, + "grad_norm": 0.23413142561912537, + "learning_rate": 6.836814022341259e-05, + "loss": 1.8585, + "step": 12996 + }, + { + "epoch": 3.989257213014119, + "grad_norm": 0.3518814444541931, + 
"learning_rate": 6.836351712566416e-05, + "loss": 1.7768, + "step": 12997 + }, + { + "epoch": 3.9895641497851444, + "grad_norm": 0.3811505436897278, + "learning_rate": 6.83588938464367e-05, + "loss": 1.7738, + "step": 12998 + }, + { + "epoch": 3.9898710865561693, + "grad_norm": 0.2516780197620392, + "learning_rate": 6.835427038577589e-05, + "loss": 1.7351, + "step": 12999 + }, + { + "epoch": 3.9901780233271946, + "grad_norm": 0.23704510927200317, + "learning_rate": 6.834964674372744e-05, + "loss": 1.7907, + "step": 13000 + }, + { + "epoch": 3.99048496009822, + "grad_norm": 0.2890201807022095, + "learning_rate": 6.8345022920337e-05, + "loss": 1.9546, + "step": 13001 + }, + { + "epoch": 3.990791896869245, + "grad_norm": 0.2678101360797882, + "learning_rate": 6.834039891565031e-05, + "loss": 1.7338, + "step": 13002 + }, + { + "epoch": 3.99109883364027, + "grad_norm": 0.31726256012916565, + "learning_rate": 6.833577472971304e-05, + "loss": 1.8464, + "step": 13003 + }, + { + "epoch": 3.991405770411295, + "grad_norm": 0.28112682700157166, + "learning_rate": 6.83311503625709e-05, + "loss": 1.7427, + "step": 13004 + }, + { + "epoch": 3.9917127071823204, + "grad_norm": 0.2651563584804535, + "learning_rate": 6.832652581426958e-05, + "loss": 1.8117, + "step": 13005 + }, + { + "epoch": 3.9920196439533457, + "grad_norm": 0.3095388114452362, + "learning_rate": 6.83219010848548e-05, + "loss": 1.8286, + "step": 13006 + }, + { + "epoch": 3.9923265807243706, + "grad_norm": 0.24704942107200623, + "learning_rate": 6.831727617437225e-05, + "loss": 1.77, + "step": 13007 + }, + { + "epoch": 3.992633517495396, + "grad_norm": 0.24868519604206085, + "learning_rate": 6.831265108286764e-05, + "loss": 1.8129, + "step": 13008 + }, + { + "epoch": 3.992940454266421, + "grad_norm": 0.26511049270629883, + "learning_rate": 6.830802581038669e-05, + "loss": 1.7539, + "step": 13009 + }, + { + "epoch": 3.993247391037446, + "grad_norm": 0.2823421061038971, + "learning_rate": 6.830340035697508e-05, + 
"loss": 1.8068, + "step": 13010 + }, + { + "epoch": 3.9935543278084715, + "grad_norm": 0.28526121377944946, + "learning_rate": 6.829877472267856e-05, + "loss": 1.764, + "step": 13011 + }, + { + "epoch": 3.993861264579497, + "grad_norm": 0.2576456069946289, + "learning_rate": 6.829414890754281e-05, + "loss": 1.728, + "step": 13012 + }, + { + "epoch": 3.9941682013505218, + "grad_norm": 0.27154842019081116, + "learning_rate": 6.828952291161356e-05, + "loss": 1.797, + "step": 13013 + }, + { + "epoch": 3.994475138121547, + "grad_norm": 0.3129710555076599, + "learning_rate": 6.828489673493652e-05, + "loss": 1.769, + "step": 13014 + }, + { + "epoch": 3.994782074892572, + "grad_norm": 0.40118902921676636, + "learning_rate": 6.828027037755742e-05, + "loss": 1.8029, + "step": 13015 + }, + { + "epoch": 3.9950890116635973, + "grad_norm": 0.33228442072868347, + "learning_rate": 6.827564383952197e-05, + "loss": 1.7295, + "step": 13016 + }, + { + "epoch": 3.9953959484346226, + "grad_norm": 0.218771830201149, + "learning_rate": 6.827101712087591e-05, + "loss": 1.7693, + "step": 13017 + }, + { + "epoch": 3.9957028852056475, + "grad_norm": 0.31354373693466187, + "learning_rate": 6.826639022166492e-05, + "loss": 1.743, + "step": 13018 + }, + { + "epoch": 3.996009821976673, + "grad_norm": 0.3584701418876648, + "learning_rate": 6.826176314193478e-05, + "loss": 1.7597, + "step": 13019 + }, + { + "epoch": 3.9963167587476978, + "grad_norm": 0.2692064344882965, + "learning_rate": 6.82571358817312e-05, + "loss": 1.7871, + "step": 13020 + }, + { + "epoch": 3.996623695518723, + "grad_norm": 0.3064020276069641, + "learning_rate": 6.825250844109987e-05, + "loss": 1.7858, + "step": 13021 + }, + { + "epoch": 3.9969306322897484, + "grad_norm": 0.29913413524627686, + "learning_rate": 6.824788082008657e-05, + "loss": 1.7773, + "step": 13022 + }, + { + "epoch": 3.9972375690607733, + "grad_norm": 0.2682165801525116, + "learning_rate": 6.824325301873703e-05, + "loss": 1.8321, + "step": 13023 + }, + { + 
"epoch": 3.9975445058317987, + "grad_norm": 0.3274376690387726, + "learning_rate": 6.823862503709694e-05, + "loss": 1.8514, + "step": 13024 + }, + { + "epoch": 3.9978514426028235, + "grad_norm": 0.29828041791915894, + "learning_rate": 6.823399687521211e-05, + "loss": 1.7923, + "step": 13025 + }, + { + "epoch": 3.998158379373849, + "grad_norm": 0.22339288890361786, + "learning_rate": 6.82293685331282e-05, + "loss": 1.756, + "step": 13026 + }, + { + "epoch": 3.998465316144874, + "grad_norm": 0.2254658192396164, + "learning_rate": 6.8224740010891e-05, + "loss": 1.7392, + "step": 13027 + }, + { + "epoch": 3.9987722529158995, + "grad_norm": 0.24932752549648285, + "learning_rate": 6.822011130854624e-05, + "loss": 1.7538, + "step": 13028 + }, + { + "epoch": 3.9990791896869244, + "grad_norm": 0.21429690718650818, + "learning_rate": 6.821548242613966e-05, + "loss": 1.7746, + "step": 13029 + }, + { + "epoch": 3.9993861264579498, + "grad_norm": 0.25503116846084595, + "learning_rate": 6.8210853363717e-05, + "loss": 1.814, + "step": 13030 + }, + { + "epoch": 3.9996930632289747, + "grad_norm": 0.23168155550956726, + "learning_rate": 6.820622412132402e-05, + "loss": 1.769, + "step": 13031 + }, + { + "epoch": 4.0, + "grad_norm": 0.2252223789691925, + "learning_rate": 6.820159469900645e-05, + "loss": 1.7782, + "step": 13032 + }, + { + "epoch": 4.000306936771025, + "grad_norm": 0.1996588408946991, + "learning_rate": 6.819696509681007e-05, + "loss": 1.6839, + "step": 13033 + }, + { + "epoch": 4.000613873542051, + "grad_norm": 0.22297053039073944, + "learning_rate": 6.81923353147806e-05, + "loss": 1.7767, + "step": 13034 + }, + { + "epoch": 4.000920810313075, + "grad_norm": 0.25867611169815063, + "learning_rate": 6.818770535296381e-05, + "loss": 1.8623, + "step": 13035 + }, + { + "epoch": 4.0012277470841005, + "grad_norm": 0.2173648178577423, + "learning_rate": 6.818307521140547e-05, + "loss": 1.8034, + "step": 13036 + }, + { + "epoch": 4.001534683855126, + "grad_norm": 
0.23634609580039978, + "learning_rate": 6.81784448901513e-05, + "loss": 1.7503, + "step": 13037 + }, + { + "epoch": 4.001841620626151, + "grad_norm": 0.2626810073852539, + "learning_rate": 6.81738143892471e-05, + "loss": 1.8116, + "step": 13038 + }, + { + "epoch": 4.0021485573971765, + "grad_norm": 0.27888983488082886, + "learning_rate": 6.816918370873861e-05, + "loss": 1.8032, + "step": 13039 + }, + { + "epoch": 4.002455494168202, + "grad_norm": 0.275038480758667, + "learning_rate": 6.816455284867162e-05, + "loss": 1.7445, + "step": 13040 + }, + { + "epoch": 4.002762430939226, + "grad_norm": 0.3475828170776367, + "learning_rate": 6.815992180909184e-05, + "loss": 1.7404, + "step": 13041 + }, + { + "epoch": 4.003069367710252, + "grad_norm": 0.27314287424087524, + "learning_rate": 6.815529059004507e-05, + "loss": 1.8333, + "step": 13042 + }, + { + "epoch": 4.003376304481277, + "grad_norm": 0.34846973419189453, + "learning_rate": 6.815065919157709e-05, + "loss": 1.7921, + "step": 13043 + }, + { + "epoch": 4.003683241252302, + "grad_norm": 0.4191788136959076, + "learning_rate": 6.814602761373365e-05, + "loss": 1.8018, + "step": 13044 + }, + { + "epoch": 4.003990178023328, + "grad_norm": 0.2655608057975769, + "learning_rate": 6.814139585656055e-05, + "loss": 1.7638, + "step": 13045 + }, + { + "epoch": 4.004297114794352, + "grad_norm": 0.25938618183135986, + "learning_rate": 6.813676392010353e-05, + "loss": 1.794, + "step": 13046 + }, + { + "epoch": 4.004604051565377, + "grad_norm": 0.3464813828468323, + "learning_rate": 6.813213180440837e-05, + "loss": 1.8662, + "step": 13047 + }, + { + "epoch": 4.004910988336403, + "grad_norm": 0.30185338854789734, + "learning_rate": 6.812749950952087e-05, + "loss": 1.8029, + "step": 13048 + }, + { + "epoch": 4.005217925107428, + "grad_norm": 0.23291908204555511, + "learning_rate": 6.812286703548678e-05, + "loss": 1.7365, + "step": 13049 + }, + { + "epoch": 4.005524861878453, + "grad_norm": 0.3542841374874115, + "learning_rate": 
6.811823438235189e-05, + "loss": 1.8674, + "step": 13050 + }, + { + "epoch": 4.005831798649478, + "grad_norm": 0.2914685606956482, + "learning_rate": 6.811360155016202e-05, + "loss": 1.8306, + "step": 13051 + }, + { + "epoch": 4.006138735420503, + "grad_norm": 0.24888737499713898, + "learning_rate": 6.810896853896289e-05, + "loss": 1.7767, + "step": 13052 + }, + { + "epoch": 4.0064456721915285, + "grad_norm": 0.2977537512779236, + "learning_rate": 6.810433534880033e-05, + "loss": 1.8227, + "step": 13053 + }, + { + "epoch": 4.006752608962554, + "grad_norm": 0.3367510735988617, + "learning_rate": 6.809970197972013e-05, + "loss": 1.734, + "step": 13054 + }, + { + "epoch": 4.007059545733579, + "grad_norm": 0.28098800778388977, + "learning_rate": 6.809506843176806e-05, + "loss": 1.7032, + "step": 13055 + }, + { + "epoch": 4.0073664825046045, + "grad_norm": 0.24016784131526947, + "learning_rate": 6.809043470498991e-05, + "loss": 1.7863, + "step": 13056 + }, + { + "epoch": 4.007673419275629, + "grad_norm": 0.2883957624435425, + "learning_rate": 6.808580079943148e-05, + "loss": 1.7342, + "step": 13057 + }, + { + "epoch": 4.007980356046654, + "grad_norm": 0.3069116473197937, + "learning_rate": 6.808116671513856e-05, + "loss": 1.8544, + "step": 13058 + }, + { + "epoch": 4.00828729281768, + "grad_norm": 0.24113236367702484, + "learning_rate": 6.807653245215697e-05, + "loss": 1.7692, + "step": 13059 + }, + { + "epoch": 4.008594229588705, + "grad_norm": 0.2651619017124176, + "learning_rate": 6.807189801053249e-05, + "loss": 1.8096, + "step": 13060 + }, + { + "epoch": 4.00890116635973, + "grad_norm": 0.2636481523513794, + "learning_rate": 6.806726339031092e-05, + "loss": 1.8062, + "step": 13061 + }, + { + "epoch": 4.009208103130755, + "grad_norm": 0.22691169381141663, + "learning_rate": 6.806262859153807e-05, + "loss": 1.7001, + "step": 13062 + }, + { + "epoch": 4.00951503990178, + "grad_norm": 0.23288170993328094, + "learning_rate": 6.805799361425972e-05, + "loss": 1.7508, + 
"step": 13063 + }, + { + "epoch": 4.009821976672805, + "grad_norm": 0.243272602558136, + "learning_rate": 6.80533584585217e-05, + "loss": 1.7797, + "step": 13064 + }, + { + "epoch": 4.010128913443831, + "grad_norm": 0.24594646692276, + "learning_rate": 6.80487231243698e-05, + "loss": 1.7894, + "step": 13065 + }, + { + "epoch": 4.010435850214856, + "grad_norm": 0.21726086735725403, + "learning_rate": 6.804408761184986e-05, + "loss": 1.7472, + "step": 13066 + }, + { + "epoch": 4.0107427869858805, + "grad_norm": 0.2262321561574936, + "learning_rate": 6.803945192100767e-05, + "loss": 1.7563, + "step": 13067 + }, + { + "epoch": 4.011049723756906, + "grad_norm": 0.2449522763490677, + "learning_rate": 6.803481605188903e-05, + "loss": 1.7282, + "step": 13068 + }, + { + "epoch": 4.011356660527931, + "grad_norm": 0.2281760573387146, + "learning_rate": 6.803018000453975e-05, + "loss": 1.8191, + "step": 13069 + }, + { + "epoch": 4.0116635972989565, + "grad_norm": 0.3039850890636444, + "learning_rate": 6.80255437790057e-05, + "loss": 1.8258, + "step": 13070 + }, + { + "epoch": 4.011970534069982, + "grad_norm": 0.3978467881679535, + "learning_rate": 6.802090737533264e-05, + "loss": 1.7338, + "step": 13071 + }, + { + "epoch": 4.012277470841007, + "grad_norm": 0.29175812005996704, + "learning_rate": 6.801627079356641e-05, + "loss": 1.7754, + "step": 13072 + }, + { + "epoch": 4.012584407612032, + "grad_norm": 0.24228449165821075, + "learning_rate": 6.801163403375285e-05, + "loss": 1.7624, + "step": 13073 + }, + { + "epoch": 4.012891344383057, + "grad_norm": 0.34527531266212463, + "learning_rate": 6.800699709593776e-05, + "loss": 1.87, + "step": 13074 + }, + { + "epoch": 4.013198281154082, + "grad_norm": 0.1995161920785904, + "learning_rate": 6.800235998016696e-05, + "loss": 1.7253, + "step": 13075 + }, + { + "epoch": 4.013505217925108, + "grad_norm": 0.3509151339530945, + "learning_rate": 6.799772268648628e-05, + "loss": 1.8013, + "step": 13076 + }, + { + "epoch": 
4.013812154696133, + "grad_norm": 0.38569679856300354, + "learning_rate": 6.799308521494156e-05, + "loss": 1.7761, + "step": 13077 + }, + { + "epoch": 4.014119091467157, + "grad_norm": 0.2636256814002991, + "learning_rate": 6.798844756557865e-05, + "loss": 1.8101, + "step": 13078 + }, + { + "epoch": 4.014426028238183, + "grad_norm": 0.2570696473121643, + "learning_rate": 6.798380973844335e-05, + "loss": 1.7561, + "step": 13079 + }, + { + "epoch": 4.014732965009208, + "grad_norm": 0.38540002703666687, + "learning_rate": 6.797917173358148e-05, + "loss": 1.7893, + "step": 13080 + }, + { + "epoch": 4.015039901780233, + "grad_norm": 0.2974525988101959, + "learning_rate": 6.79745335510389e-05, + "loss": 1.8331, + "step": 13081 + }, + { + "epoch": 4.015346838551259, + "grad_norm": 0.2563362419605255, + "learning_rate": 6.796989519086146e-05, + "loss": 1.7784, + "step": 13082 + }, + { + "epoch": 4.015653775322283, + "grad_norm": 0.37037795782089233, + "learning_rate": 6.7965256653095e-05, + "loss": 1.7947, + "step": 13083 + }, + { + "epoch": 4.0159607120933085, + "grad_norm": 0.4145336449146271, + "learning_rate": 6.796061793778531e-05, + "loss": 1.7633, + "step": 13084 + }, + { + "epoch": 4.016267648864334, + "grad_norm": 0.32278406620025635, + "learning_rate": 6.795597904497828e-05, + "loss": 1.7827, + "step": 13085 + }, + { + "epoch": 4.016574585635359, + "grad_norm": 0.26466837525367737, + "learning_rate": 6.795133997471974e-05, + "loss": 1.7441, + "step": 13086 + }, + { + "epoch": 4.0168815224063845, + "grad_norm": 0.3212043344974518, + "learning_rate": 6.794670072705553e-05, + "loss": 1.7602, + "step": 13087 + }, + { + "epoch": 4.01718845917741, + "grad_norm": 0.3054736852645874, + "learning_rate": 6.79420613020315e-05, + "loss": 1.7417, + "step": 13088 + }, + { + "epoch": 4.017495395948434, + "grad_norm": 0.22281476855278015, + "learning_rate": 6.793742169969351e-05, + "loss": 1.7675, + "step": 13089 + }, + { + "epoch": 4.01780233271946, + "grad_norm": 
0.32630839943885803, + "learning_rate": 6.793278192008742e-05, + "loss": 1.8409, + "step": 13090 + }, + { + "epoch": 4.018109269490485, + "grad_norm": 0.2658778429031372, + "learning_rate": 6.792814196325905e-05, + "loss": 1.7718, + "step": 13091 + }, + { + "epoch": 4.01841620626151, + "grad_norm": 0.24016901850700378, + "learning_rate": 6.792350182925429e-05, + "loss": 1.8393, + "step": 13092 + }, + { + "epoch": 4.018723143032536, + "grad_norm": 0.2882223427295685, + "learning_rate": 6.791886151811897e-05, + "loss": 1.7497, + "step": 13093 + }, + { + "epoch": 4.01903007980356, + "grad_norm": 0.24340751767158508, + "learning_rate": 6.791422102989895e-05, + "loss": 1.72, + "step": 13094 + }, + { + "epoch": 4.019337016574585, + "grad_norm": 0.235665962100029, + "learning_rate": 6.79095803646401e-05, + "loss": 1.7269, + "step": 13095 + }, + { + "epoch": 4.019643953345611, + "grad_norm": 0.32772955298423767, + "learning_rate": 6.79049395223883e-05, + "loss": 1.7916, + "step": 13096 + }, + { + "epoch": 4.019950890116636, + "grad_norm": 0.3189625144004822, + "learning_rate": 6.790029850318938e-05, + "loss": 1.7571, + "step": 13097 + }, + { + "epoch": 4.020257826887661, + "grad_norm": 0.2211185097694397, + "learning_rate": 6.789565730708921e-05, + "loss": 1.793, + "step": 13098 + }, + { + "epoch": 4.020564763658686, + "grad_norm": 0.2840392291545868, + "learning_rate": 6.789101593413367e-05, + "loss": 1.7434, + "step": 13099 + }, + { + "epoch": 4.020871700429711, + "grad_norm": 0.27857357263565063, + "learning_rate": 6.788637438436863e-05, + "loss": 1.742, + "step": 13100 + }, + { + "epoch": 4.0211786372007365, + "grad_norm": 0.314628005027771, + "learning_rate": 6.788173265783996e-05, + "loss": 1.7881, + "step": 13101 + }, + { + "epoch": 4.021485573971762, + "grad_norm": 0.2994774580001831, + "learning_rate": 6.787709075459352e-05, + "loss": 1.7741, + "step": 13102 + }, + { + "epoch": 4.021792510742787, + "grad_norm": 0.3256312310695648, + "learning_rate": 
6.787244867467519e-05, + "loss": 1.7758, + "step": 13103 + }, + { + "epoch": 4.0220994475138125, + "grad_norm": 0.2332412451505661, + "learning_rate": 6.786780641813083e-05, + "loss": 1.7654, + "step": 13104 + }, + { + "epoch": 4.022406384284837, + "grad_norm": 0.23226258158683777, + "learning_rate": 6.786316398500636e-05, + "loss": 1.7605, + "step": 13105 + }, + { + "epoch": 4.022713321055862, + "grad_norm": 0.24631965160369873, + "learning_rate": 6.785852137534763e-05, + "loss": 1.7469, + "step": 13106 + }, + { + "epoch": 4.023020257826888, + "grad_norm": 0.1969226449728012, + "learning_rate": 6.785387858920051e-05, + "loss": 1.8151, + "step": 13107 + }, + { + "epoch": 4.023327194597913, + "grad_norm": 0.22769485414028168, + "learning_rate": 6.784923562661091e-05, + "loss": 1.7024, + "step": 13108 + }, + { + "epoch": 4.023634131368938, + "grad_norm": 0.2174670249223709, + "learning_rate": 6.78445924876247e-05, + "loss": 1.8094, + "step": 13109 + }, + { + "epoch": 4.023941068139963, + "grad_norm": 0.2606858015060425, + "learning_rate": 6.783994917228775e-05, + "loss": 1.8043, + "step": 13110 + }, + { + "epoch": 4.024248004910988, + "grad_norm": 0.24721349775791168, + "learning_rate": 6.783530568064599e-05, + "loss": 1.842, + "step": 13111 + }, + { + "epoch": 4.024554941682013, + "grad_norm": 0.2353603094816208, + "learning_rate": 6.783066201274529e-05, + "loss": 1.76, + "step": 13112 + }, + { + "epoch": 4.024861878453039, + "grad_norm": 0.22285830974578857, + "learning_rate": 6.782601816863153e-05, + "loss": 1.8014, + "step": 13113 + }, + { + "epoch": 4.025168815224064, + "grad_norm": 0.2482440173625946, + "learning_rate": 6.782137414835061e-05, + "loss": 1.7552, + "step": 13114 + }, + { + "epoch": 4.0254757519950894, + "grad_norm": 0.19926191866397858, + "learning_rate": 6.781672995194842e-05, + "loss": 1.7549, + "step": 13115 + }, + { + "epoch": 4.025782688766114, + "grad_norm": 0.2342877984046936, + "learning_rate": 6.781208557947086e-05, + "loss": 1.8622, + 
"step": 13116 + }, + { + "epoch": 4.026089625537139, + "grad_norm": 0.24096547067165375, + "learning_rate": 6.780744103096382e-05, + "loss": 1.7795, + "step": 13117 + }, + { + "epoch": 4.026396562308165, + "grad_norm": 0.23714657127857208, + "learning_rate": 6.780279630647322e-05, + "loss": 1.799, + "step": 13118 + }, + { + "epoch": 4.02670349907919, + "grad_norm": 0.28252026438713074, + "learning_rate": 6.779815140604496e-05, + "loss": 1.7573, + "step": 13119 + }, + { + "epoch": 4.027010435850215, + "grad_norm": 0.28028404712677, + "learning_rate": 6.779350632972493e-05, + "loss": 1.8103, + "step": 13120 + }, + { + "epoch": 4.02731737262124, + "grad_norm": 0.21088312566280365, + "learning_rate": 6.778886107755904e-05, + "loss": 1.7169, + "step": 13121 + }, + { + "epoch": 4.027624309392265, + "grad_norm": 0.22282038629055023, + "learning_rate": 6.77842156495932e-05, + "loss": 1.7206, + "step": 13122 + }, + { + "epoch": 4.02793124616329, + "grad_norm": 0.3281327784061432, + "learning_rate": 6.777957004587331e-05, + "loss": 1.8664, + "step": 13123 + }, + { + "epoch": 4.028238182934316, + "grad_norm": 0.29496827721595764, + "learning_rate": 6.77749242664453e-05, + "loss": 1.7532, + "step": 13124 + }, + { + "epoch": 4.028545119705341, + "grad_norm": 0.25299328565597534, + "learning_rate": 6.777027831135508e-05, + "loss": 1.7836, + "step": 13125 + }, + { + "epoch": 4.0288520564763655, + "grad_norm": 0.3000280559062958, + "learning_rate": 6.776563218064854e-05, + "loss": 1.8079, + "step": 13126 + }, + { + "epoch": 4.029158993247391, + "grad_norm": 0.3613673448562622, + "learning_rate": 6.77609858743716e-05, + "loss": 1.7931, + "step": 13127 + }, + { + "epoch": 4.029465930018416, + "grad_norm": 0.25613468885421753, + "learning_rate": 6.77563393925702e-05, + "loss": 1.7522, + "step": 13128 + }, + { + "epoch": 4.0297728667894415, + "grad_norm": 0.24391578137874603, + "learning_rate": 6.775169273529026e-05, + "loss": 1.818, + "step": 13129 + }, + { + "epoch": 
4.030079803560467, + "grad_norm": 0.2806173264980316, + "learning_rate": 6.774704590257768e-05, + "loss": 1.7349, + "step": 13130 + }, + { + "epoch": 4.030386740331492, + "grad_norm": 0.22214172780513763, + "learning_rate": 6.774239889447838e-05, + "loss": 1.759, + "step": 13131 + }, + { + "epoch": 4.030693677102517, + "grad_norm": 0.27285513281822205, + "learning_rate": 6.773775171103828e-05, + "loss": 1.742, + "step": 13132 + }, + { + "epoch": 4.031000613873542, + "grad_norm": 0.22302402555942535, + "learning_rate": 6.773310435230334e-05, + "loss": 1.7277, + "step": 13133 + }, + { + "epoch": 4.031307550644567, + "grad_norm": 0.2350187450647354, + "learning_rate": 6.772845681831947e-05, + "loss": 1.8648, + "step": 13134 + }, + { + "epoch": 4.031614487415593, + "grad_norm": 0.2665547728538513, + "learning_rate": 6.772380910913261e-05, + "loss": 1.776, + "step": 13135 + }, + { + "epoch": 4.031921424186618, + "grad_norm": 0.30652403831481934, + "learning_rate": 6.771916122478867e-05, + "loss": 1.7884, + "step": 13136 + }, + { + "epoch": 4.032228360957642, + "grad_norm": 0.29372814297676086, + "learning_rate": 6.771451316533359e-05, + "loss": 1.8203, + "step": 13137 + }, + { + "epoch": 4.032535297728668, + "grad_norm": 0.2244873046875, + "learning_rate": 6.770986493081329e-05, + "loss": 1.7869, + "step": 13138 + }, + { + "epoch": 4.032842234499693, + "grad_norm": 0.25075265765190125, + "learning_rate": 6.770521652127375e-05, + "loss": 1.772, + "step": 13139 + }, + { + "epoch": 4.033149171270718, + "grad_norm": 0.28118211030960083, + "learning_rate": 6.770056793676087e-05, + "loss": 1.7922, + "step": 13140 + }, + { + "epoch": 4.033456108041744, + "grad_norm": 0.25199100375175476, + "learning_rate": 6.769591917732062e-05, + "loss": 1.7526, + "step": 13141 + }, + { + "epoch": 4.033763044812768, + "grad_norm": 0.2920379638671875, + "learning_rate": 6.769127024299892e-05, + "loss": 1.8365, + "step": 13142 + }, + { + "epoch": 4.0340699815837935, + "grad_norm": 
0.23018018901348114, + "learning_rate": 6.768662113384171e-05, + "loss": 1.7411, + "step": 13143 + }, + { + "epoch": 4.034376918354819, + "grad_norm": 0.23253841698169708, + "learning_rate": 6.768197184989494e-05, + "loss": 1.7921, + "step": 13144 + }, + { + "epoch": 4.034683855125844, + "grad_norm": 0.22618864476680756, + "learning_rate": 6.767732239120456e-05, + "loss": 1.7421, + "step": 13145 + }, + { + "epoch": 4.0349907918968695, + "grad_norm": 0.24552187323570251, + "learning_rate": 6.767267275781655e-05, + "loss": 1.7299, + "step": 13146 + }, + { + "epoch": 4.035297728667895, + "grad_norm": 0.22562766075134277, + "learning_rate": 6.76680229497768e-05, + "loss": 1.766, + "step": 13147 + }, + { + "epoch": 4.035604665438919, + "grad_norm": 0.28718629479408264, + "learning_rate": 6.76633729671313e-05, + "loss": 1.7366, + "step": 13148 + }, + { + "epoch": 4.035911602209945, + "grad_norm": 0.38769885897636414, + "learning_rate": 6.765872280992598e-05, + "loss": 1.8244, + "step": 13149 + }, + { + "epoch": 4.03621853898097, + "grad_norm": 0.4232725501060486, + "learning_rate": 6.765407247820683e-05, + "loss": 1.8244, + "step": 13150 + }, + { + "epoch": 4.036525475751995, + "grad_norm": 0.2771088778972626, + "learning_rate": 6.764942197201977e-05, + "loss": 1.7863, + "step": 13151 + }, + { + "epoch": 4.036832412523021, + "grad_norm": 0.2917862832546234, + "learning_rate": 6.76447712914108e-05, + "loss": 1.791, + "step": 13152 + }, + { + "epoch": 4.037139349294045, + "grad_norm": 0.37355467677116394, + "learning_rate": 6.764012043642584e-05, + "loss": 1.74, + "step": 13153 + }, + { + "epoch": 4.03744628606507, + "grad_norm": 0.35664018988609314, + "learning_rate": 6.763546940711089e-05, + "loss": 1.7734, + "step": 13154 + }, + { + "epoch": 4.037753222836096, + "grad_norm": 0.2335754930973053, + "learning_rate": 6.763081820351188e-05, + "loss": 1.7765, + "step": 13155 + }, + { + "epoch": 4.038060159607121, + "grad_norm": 0.2825562357902527, + "learning_rate": 
6.762616682567478e-05, + "loss": 1.7867, + "step": 13156 + }, + { + "epoch": 4.038367096378146, + "grad_norm": 0.3103202283382416, + "learning_rate": 6.762151527364559e-05, + "loss": 1.7331, + "step": 13157 + }, + { + "epoch": 4.038674033149171, + "grad_norm": 0.2897353172302246, + "learning_rate": 6.761686354747025e-05, + "loss": 1.7638, + "step": 13158 + }, + { + "epoch": 4.038980969920196, + "grad_norm": 0.21260851621627808, + "learning_rate": 6.761221164719474e-05, + "loss": 1.7302, + "step": 13159 + }, + { + "epoch": 4.0392879066912215, + "grad_norm": 0.2878021001815796, + "learning_rate": 6.760755957286503e-05, + "loss": 1.7368, + "step": 13160 + }, + { + "epoch": 4.039594843462247, + "grad_norm": 0.2785978317260742, + "learning_rate": 6.76029073245271e-05, + "loss": 1.7258, + "step": 13161 + }, + { + "epoch": 4.039901780233272, + "grad_norm": 0.1963953971862793, + "learning_rate": 6.759825490222692e-05, + "loss": 1.755, + "step": 13162 + }, + { + "epoch": 4.0402087170042975, + "grad_norm": 0.26776790618896484, + "learning_rate": 6.759360230601047e-05, + "loss": 1.7676, + "step": 13163 + }, + { + "epoch": 4.040515653775322, + "grad_norm": 0.2751332223415375, + "learning_rate": 6.758894953592373e-05, + "loss": 1.7313, + "step": 13164 + }, + { + "epoch": 4.040822590546347, + "grad_norm": 0.2339213341474533, + "learning_rate": 6.758429659201269e-05, + "loss": 1.714, + "step": 13165 + }, + { + "epoch": 4.041129527317373, + "grad_norm": 0.2624664008617401, + "learning_rate": 6.75796434743233e-05, + "loss": 1.8296, + "step": 13166 + }, + { + "epoch": 4.041436464088398, + "grad_norm": 0.40156883001327515, + "learning_rate": 6.757499018290159e-05, + "loss": 1.8228, + "step": 13167 + }, + { + "epoch": 4.041743400859423, + "grad_norm": 0.32976576685905457, + "learning_rate": 6.757033671779352e-05, + "loss": 1.7403, + "step": 13168 + }, + { + "epoch": 4.042050337630448, + "grad_norm": 0.2343887835741043, + "learning_rate": 6.756568307904508e-05, + "loss": 1.7837, + 
"step": 13169 + }, + { + "epoch": 4.042357274401473, + "grad_norm": 0.36174145340919495, + "learning_rate": 6.756102926670227e-05, + "loss": 1.7291, + "step": 13170 + }, + { + "epoch": 4.042664211172498, + "grad_norm": 0.3324793577194214, + "learning_rate": 6.755637528081108e-05, + "loss": 1.7414, + "step": 13171 + }, + { + "epoch": 4.042971147943524, + "grad_norm": 0.21945348381996155, + "learning_rate": 6.75517211214175e-05, + "loss": 1.7762, + "step": 13172 + }, + { + "epoch": 4.043278084714549, + "grad_norm": 0.31069812178611755, + "learning_rate": 6.75470667885675e-05, + "loss": 1.7666, + "step": 13173 + }, + { + "epoch": 4.043585021485574, + "grad_norm": 0.3931153118610382, + "learning_rate": 6.754241228230713e-05, + "loss": 1.7871, + "step": 13174 + }, + { + "epoch": 4.043891958256599, + "grad_norm": 0.25559595227241516, + "learning_rate": 6.753775760268234e-05, + "loss": 1.7916, + "step": 13175 + }, + { + "epoch": 4.044198895027624, + "grad_norm": 0.3686937391757965, + "learning_rate": 6.753310274973917e-05, + "loss": 1.7642, + "step": 13176 + }, + { + "epoch": 4.0445058317986495, + "grad_norm": 0.4793247580528259, + "learning_rate": 6.75284477235236e-05, + "loss": 1.739, + "step": 13177 + }, + { + "epoch": 4.044812768569675, + "grad_norm": 0.36179354786872864, + "learning_rate": 6.752379252408164e-05, + "loss": 1.7993, + "step": 13178 + }, + { + "epoch": 4.0451197053407, + "grad_norm": 0.22559234499931335, + "learning_rate": 6.751913715145926e-05, + "loss": 1.7401, + "step": 13179 + }, + { + "epoch": 4.045426642111725, + "grad_norm": 0.29058873653411865, + "learning_rate": 6.751448160570253e-05, + "loss": 1.8089, + "step": 13180 + }, + { + "epoch": 4.04573357888275, + "grad_norm": 0.3069808781147003, + "learning_rate": 6.750982588685742e-05, + "loss": 1.7587, + "step": 13181 + }, + { + "epoch": 4.046040515653775, + "grad_norm": 0.2292155921459198, + "learning_rate": 6.750516999496994e-05, + "loss": 1.7429, + "step": 13182 + }, + { + "epoch": 
4.046347452424801, + "grad_norm": 0.2520677149295807, + "learning_rate": 6.750051393008612e-05, + "loss": 1.7842, + "step": 13183 + }, + { + "epoch": 4.046654389195826, + "grad_norm": 0.32546502351760864, + "learning_rate": 6.749585769225194e-05, + "loss": 1.8057, + "step": 13184 + }, + { + "epoch": 4.04696132596685, + "grad_norm": 0.27634644508361816, + "learning_rate": 6.749120128151346e-05, + "loss": 1.7708, + "step": 13185 + }, + { + "epoch": 4.047268262737876, + "grad_norm": 0.2546750009059906, + "learning_rate": 6.748654469791668e-05, + "loss": 1.8744, + "step": 13186 + }, + { + "epoch": 4.047575199508901, + "grad_norm": 0.43873605132102966, + "learning_rate": 6.748188794150761e-05, + "loss": 1.8573, + "step": 13187 + }, + { + "epoch": 4.047882136279926, + "grad_norm": 0.45526960492134094, + "learning_rate": 6.747723101233227e-05, + "loss": 1.7761, + "step": 13188 + }, + { + "epoch": 4.048189073050952, + "grad_norm": 0.24995557963848114, + "learning_rate": 6.74725739104367e-05, + "loss": 1.7679, + "step": 13189 + }, + { + "epoch": 4.048496009821977, + "grad_norm": 0.3203068971633911, + "learning_rate": 6.74679166358669e-05, + "loss": 1.7772, + "step": 13190 + }, + { + "epoch": 4.0488029465930016, + "grad_norm": 0.37020671367645264, + "learning_rate": 6.746325918866893e-05, + "loss": 1.8002, + "step": 13191 + }, + { + "epoch": 4.049109883364027, + "grad_norm": 0.2543959319591522, + "learning_rate": 6.745860156888878e-05, + "loss": 1.8057, + "step": 13192 + }, + { + "epoch": 4.049416820135052, + "grad_norm": 0.2566509246826172, + "learning_rate": 6.74539437765725e-05, + "loss": 1.7853, + "step": 13193 + }, + { + "epoch": 4.0497237569060776, + "grad_norm": 0.2545804977416992, + "learning_rate": 6.744928581176612e-05, + "loss": 1.8136, + "step": 13194 + }, + { + "epoch": 4.050030693677103, + "grad_norm": 0.24307197332382202, + "learning_rate": 6.744462767451568e-05, + "loss": 1.7919, + "step": 13195 + }, + { + "epoch": 4.050337630448127, + "grad_norm": 
0.24427616596221924, + "learning_rate": 6.743996936486719e-05, + "loss": 1.8037, + "step": 13196 + }, + { + "epoch": 4.050644567219153, + "grad_norm": 0.2154439389705658, + "learning_rate": 6.743531088286673e-05, + "loss": 1.7088, + "step": 13197 + }, + { + "epoch": 4.050951503990178, + "grad_norm": 0.22251558303833008, + "learning_rate": 6.743065222856027e-05, + "loss": 1.7512, + "step": 13198 + }, + { + "epoch": 4.051258440761203, + "grad_norm": 0.2373272329568863, + "learning_rate": 6.74259934019939e-05, + "loss": 1.8056, + "step": 13199 + }, + { + "epoch": 4.051565377532229, + "grad_norm": 0.23308727145195007, + "learning_rate": 6.742133440321366e-05, + "loss": 1.731, + "step": 13200 + }, + { + "epoch": 4.051872314303253, + "grad_norm": 0.2438805252313614, + "learning_rate": 6.741667523226557e-05, + "loss": 1.7938, + "step": 13201 + }, + { + "epoch": 4.0521792510742785, + "grad_norm": 0.22354702651500702, + "learning_rate": 6.741201588919569e-05, + "loss": 1.762, + "step": 13202 + }, + { + "epoch": 4.052486187845304, + "grad_norm": 0.2505488097667694, + "learning_rate": 6.740735637405006e-05, + "loss": 1.7627, + "step": 13203 + }, + { + "epoch": 4.052793124616329, + "grad_norm": 0.21378709375858307, + "learning_rate": 6.740269668687474e-05, + "loss": 1.7598, + "step": 13204 + }, + { + "epoch": 4.0531000613873545, + "grad_norm": 0.24863660335540771, + "learning_rate": 6.739803682771577e-05, + "loss": 1.7665, + "step": 13205 + }, + { + "epoch": 4.05340699815838, + "grad_norm": 0.3041808605194092, + "learning_rate": 6.739337679661921e-05, + "loss": 1.7909, + "step": 13206 + }, + { + "epoch": 4.053713934929404, + "grad_norm": 0.2745797634124756, + "learning_rate": 6.738871659363109e-05, + "loss": 1.7547, + "step": 13207 + }, + { + "epoch": 4.05402087170043, + "grad_norm": 0.2610073387622833, + "learning_rate": 6.738405621879748e-05, + "loss": 1.7723, + "step": 13208 + }, + { + "epoch": 4.054327808471455, + "grad_norm": 0.22728075087070465, + "learning_rate": 
6.737939567216446e-05, + "loss": 1.7865, + "step": 13209 + }, + { + "epoch": 4.05463474524248, + "grad_norm": 0.2877669930458069, + "learning_rate": 6.737473495377804e-05, + "loss": 1.8352, + "step": 13210 + }, + { + "epoch": 4.054941682013506, + "grad_norm": 0.35316282510757446, + "learning_rate": 6.737007406368432e-05, + "loss": 1.8202, + "step": 13211 + }, + { + "epoch": 4.05524861878453, + "grad_norm": 0.34625691175460815, + "learning_rate": 6.736541300192936e-05, + "loss": 1.8456, + "step": 13212 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.2432134598493576, + "learning_rate": 6.736075176855917e-05, + "loss": 1.8237, + "step": 13213 + }, + { + "epoch": 4.055862492326581, + "grad_norm": 0.27446529269218445, + "learning_rate": 6.735609036361989e-05, + "loss": 1.71, + "step": 13214 + }, + { + "epoch": 4.056169429097606, + "grad_norm": 0.2870408892631531, + "learning_rate": 6.735142878715754e-05, + "loss": 1.7473, + "step": 13215 + }, + { + "epoch": 4.056476365868631, + "grad_norm": 0.22249078750610352, + "learning_rate": 6.734676703921822e-05, + "loss": 1.7462, + "step": 13216 + }, + { + "epoch": 4.056783302639656, + "grad_norm": 0.25519105792045593, + "learning_rate": 6.734210511984796e-05, + "loss": 1.7022, + "step": 13217 + }, + { + "epoch": 4.057090239410681, + "grad_norm": 0.3366561830043793, + "learning_rate": 6.733744302909285e-05, + "loss": 1.787, + "step": 13218 + }, + { + "epoch": 4.0573971761817065, + "grad_norm": 0.2443208247423172, + "learning_rate": 6.733278076699897e-05, + "loss": 1.8048, + "step": 13219 + }, + { + "epoch": 4.057704112952732, + "grad_norm": 0.2893153131008148, + "learning_rate": 6.73281183336124e-05, + "loss": 1.7805, + "step": 13220 + }, + { + "epoch": 4.058011049723757, + "grad_norm": 0.3178043067455292, + "learning_rate": 6.73234557289792e-05, + "loss": 1.8264, + "step": 13221 + }, + { + "epoch": 4.0583179864947825, + "grad_norm": 0.27355703711509705, + "learning_rate": 6.731879295314546e-05, + "loss": 1.8427, + 
"step": 13222 + }, + { + "epoch": 4.058624923265807, + "grad_norm": 0.32180166244506836, + "learning_rate": 6.731413000615726e-05, + "loss": 1.7332, + "step": 13223 + }, + { + "epoch": 4.058931860036832, + "grad_norm": 0.3736574351787567, + "learning_rate": 6.730946688806067e-05, + "loss": 1.7447, + "step": 13224 + }, + { + "epoch": 4.059238796807858, + "grad_norm": 0.2526068687438965, + "learning_rate": 6.73048035989018e-05, + "loss": 1.8104, + "step": 13225 + }, + { + "epoch": 4.059545733578883, + "grad_norm": 0.29076167941093445, + "learning_rate": 6.73001401387267e-05, + "loss": 1.7977, + "step": 13226 + }, + { + "epoch": 4.059852670349908, + "grad_norm": 0.37963762879371643, + "learning_rate": 6.729547650758148e-05, + "loss": 1.8336, + "step": 13227 + }, + { + "epoch": 4.060159607120933, + "grad_norm": 0.31584078073501587, + "learning_rate": 6.729081270551222e-05, + "loss": 1.7843, + "step": 13228 + }, + { + "epoch": 4.060466543891958, + "grad_norm": 0.22793468832969666, + "learning_rate": 6.728614873256502e-05, + "loss": 1.7444, + "step": 13229 + }, + { + "epoch": 4.060773480662983, + "grad_norm": 0.3114435076713562, + "learning_rate": 6.728148458878596e-05, + "loss": 1.8012, + "step": 13230 + }, + { + "epoch": 4.061080417434009, + "grad_norm": 0.29843854904174805, + "learning_rate": 6.727682027422116e-05, + "loss": 1.8014, + "step": 13231 + }, + { + "epoch": 4.061387354205034, + "grad_norm": 0.22745616734027863, + "learning_rate": 6.727215578891668e-05, + "loss": 1.7303, + "step": 13232 + }, + { + "epoch": 4.0616942909760585, + "grad_norm": 0.2701241970062256, + "learning_rate": 6.726749113291864e-05, + "loss": 1.7665, + "step": 13233 + }, + { + "epoch": 4.062001227747084, + "grad_norm": 0.29304635524749756, + "learning_rate": 6.726282630627313e-05, + "loss": 1.875, + "step": 13234 + }, + { + "epoch": 4.062308164518109, + "grad_norm": 0.21467708051204681, + "learning_rate": 6.725816130902625e-05, + "loss": 1.7442, + "step": 13235 + }, + { + "epoch": 
4.0626151012891345, + "grad_norm": 0.23517470061779022, + "learning_rate": 6.72534961412241e-05, + "loss": 1.7154, + "step": 13236 + }, + { + "epoch": 4.06292203806016, + "grad_norm": 0.21483808755874634, + "learning_rate": 6.724883080291278e-05, + "loss": 1.7162, + "step": 13237 + }, + { + "epoch": 4.063228974831185, + "grad_norm": 0.2274744212627411, + "learning_rate": 6.724416529413843e-05, + "loss": 1.8066, + "step": 13238 + }, + { + "epoch": 4.06353591160221, + "grad_norm": 0.24682378768920898, + "learning_rate": 6.723949961494712e-05, + "loss": 1.7905, + "step": 13239 + }, + { + "epoch": 4.063842848373235, + "grad_norm": 0.2516227066516876, + "learning_rate": 6.723483376538498e-05, + "loss": 1.7693, + "step": 13240 + }, + { + "epoch": 4.06414978514426, + "grad_norm": 0.22076398134231567, + "learning_rate": 6.723016774549808e-05, + "loss": 1.7357, + "step": 13241 + }, + { + "epoch": 4.064456721915286, + "grad_norm": 0.20741026103496552, + "learning_rate": 6.722550155533258e-05, + "loss": 1.8082, + "step": 13242 + }, + { + "epoch": 4.064763658686311, + "grad_norm": 0.2074010819196701, + "learning_rate": 6.722083519493458e-05, + "loss": 1.71, + "step": 13243 + }, + { + "epoch": 4.065070595457335, + "grad_norm": 0.2661527991294861, + "learning_rate": 6.72161686643502e-05, + "loss": 1.7448, + "step": 13244 + }, + { + "epoch": 4.065377532228361, + "grad_norm": 0.2877216935157776, + "learning_rate": 6.721150196362555e-05, + "loss": 1.7574, + "step": 13245 + }, + { + "epoch": 4.065684468999386, + "grad_norm": 0.2520955801010132, + "learning_rate": 6.720683509280675e-05, + "loss": 1.7717, + "step": 13246 + }, + { + "epoch": 4.065991405770411, + "grad_norm": 0.2219560444355011, + "learning_rate": 6.72021680519399e-05, + "loss": 1.7355, + "step": 13247 + }, + { + "epoch": 4.066298342541437, + "grad_norm": 0.24671706557273865, + "learning_rate": 6.719750084107117e-05, + "loss": 1.8204, + "step": 13248 + }, + { + "epoch": 4.066605279312462, + "grad_norm": 
0.24512135982513428, + "learning_rate": 6.719283346024664e-05, + "loss": 1.826, + "step": 13249 + }, + { + "epoch": 4.0669122160834865, + "grad_norm": 0.24370841681957245, + "learning_rate": 6.718816590951247e-05, + "loss": 1.8322, + "step": 13250 + }, + { + "epoch": 4.067219152854512, + "grad_norm": 0.2312363088130951, + "learning_rate": 6.718349818891475e-05, + "loss": 1.7621, + "step": 13251 + }, + { + "epoch": 4.067526089625537, + "grad_norm": 0.2500494420528412, + "learning_rate": 6.717883029849965e-05, + "loss": 1.829, + "step": 13252 + }, + { + "epoch": 4.0678330263965625, + "grad_norm": 0.29882633686065674, + "learning_rate": 6.717416223831324e-05, + "loss": 1.799, + "step": 13253 + }, + { + "epoch": 4.068139963167588, + "grad_norm": 0.21962928771972656, + "learning_rate": 6.716949400840172e-05, + "loss": 1.7714, + "step": 13254 + }, + { + "epoch": 4.068446899938612, + "grad_norm": 0.25544899702072144, + "learning_rate": 6.716482560881121e-05, + "loss": 1.7911, + "step": 13255 + }, + { + "epoch": 4.068753836709638, + "grad_norm": 0.24865686893463135, + "learning_rate": 6.716015703958781e-05, + "loss": 1.7107, + "step": 13256 + }, + { + "epoch": 4.069060773480663, + "grad_norm": 0.22669239342212677, + "learning_rate": 6.715548830077769e-05, + "loss": 1.8503, + "step": 13257 + }, + { + "epoch": 4.069367710251688, + "grad_norm": 0.2973819077014923, + "learning_rate": 6.715081939242698e-05, + "loss": 1.7859, + "step": 13258 + }, + { + "epoch": 4.069674647022714, + "grad_norm": 0.3178746700286865, + "learning_rate": 6.714615031458181e-05, + "loss": 1.7705, + "step": 13259 + }, + { + "epoch": 4.069981583793738, + "grad_norm": 0.20452535152435303, + "learning_rate": 6.714148106728835e-05, + "loss": 1.7386, + "step": 13260 + }, + { + "epoch": 4.070288520564763, + "grad_norm": 0.30288320779800415, + "learning_rate": 6.713681165059271e-05, + "loss": 1.7823, + "step": 13261 + }, + { + "epoch": 4.070595457335789, + "grad_norm": 0.30014416575431824, + "learning_rate": 
6.713214206454107e-05, + "loss": 1.7626, + "step": 13262 + }, + { + "epoch": 4.070902394106814, + "grad_norm": 0.25144243240356445, + "learning_rate": 6.712747230917956e-05, + "loss": 1.8359, + "step": 13263 + }, + { + "epoch": 4.071209330877839, + "grad_norm": 0.308148592710495, + "learning_rate": 6.712280238455432e-05, + "loss": 1.7226, + "step": 13264 + }, + { + "epoch": 4.071516267648865, + "grad_norm": 0.2704198658466339, + "learning_rate": 6.711813229071151e-05, + "loss": 1.7982, + "step": 13265 + }, + { + "epoch": 4.071823204419889, + "grad_norm": 0.3928656280040741, + "learning_rate": 6.711346202769729e-05, + "loss": 1.7987, + "step": 13266 + }, + { + "epoch": 4.0721301411909145, + "grad_norm": 0.3603350520133972, + "learning_rate": 6.71087915955578e-05, + "loss": 1.7963, + "step": 13267 + }, + { + "epoch": 4.07243707796194, + "grad_norm": 0.2673214077949524, + "learning_rate": 6.710412099433921e-05, + "loss": 1.8011, + "step": 13268 + }, + { + "epoch": 4.072744014732965, + "grad_norm": 0.2523653209209442, + "learning_rate": 6.709945022408768e-05, + "loss": 1.755, + "step": 13269 + }, + { + "epoch": 4.0730509515039905, + "grad_norm": 0.3818903863430023, + "learning_rate": 6.709477928484934e-05, + "loss": 1.7968, + "step": 13270 + }, + { + "epoch": 4.073357888275015, + "grad_norm": 0.31509929895401, + "learning_rate": 6.709010817667039e-05, + "loss": 1.744, + "step": 13271 + }, + { + "epoch": 4.07366482504604, + "grad_norm": 0.21875518560409546, + "learning_rate": 6.708543689959697e-05, + "loss": 1.7511, + "step": 13272 + }, + { + "epoch": 4.073971761817066, + "grad_norm": 0.25381338596343994, + "learning_rate": 6.708076545367523e-05, + "loss": 1.7523, + "step": 13273 + }, + { + "epoch": 4.074278698588091, + "grad_norm": 0.24193842709064484, + "learning_rate": 6.707609383895137e-05, + "loss": 1.7713, + "step": 13274 + }, + { + "epoch": 4.074585635359116, + "grad_norm": 0.21972359716892242, + "learning_rate": 6.707142205547154e-05, + "loss": 1.7329, + "step": 
13275 + }, + { + "epoch": 4.074892572130141, + "grad_norm": 0.22188499569892883, + "learning_rate": 6.706675010328192e-05, + "loss": 1.7507, + "step": 13276 + }, + { + "epoch": 4.075199508901166, + "grad_norm": 0.23344436287879944, + "learning_rate": 6.706207798242865e-05, + "loss": 1.771, + "step": 13277 + }, + { + "epoch": 4.0755064456721914, + "grad_norm": 0.3008805513381958, + "learning_rate": 6.705740569295795e-05, + "loss": 1.775, + "step": 13278 + }, + { + "epoch": 4.075813382443217, + "grad_norm": 0.31407982110977173, + "learning_rate": 6.705273323491595e-05, + "loss": 1.7625, + "step": 13279 + }, + { + "epoch": 4.076120319214242, + "grad_norm": 0.2430381178855896, + "learning_rate": 6.704806060834886e-05, + "loss": 1.7706, + "step": 13280 + }, + { + "epoch": 4.0764272559852675, + "grad_norm": 0.23250171542167664, + "learning_rate": 6.704338781330284e-05, + "loss": 1.7977, + "step": 13281 + }, + { + "epoch": 4.076734192756292, + "grad_norm": 0.22073723375797272, + "learning_rate": 6.703871484982407e-05, + "loss": 1.7686, + "step": 13282 + }, + { + "epoch": 4.077041129527317, + "grad_norm": 0.24987035989761353, + "learning_rate": 6.703404171795874e-05, + "loss": 1.736, + "step": 13283 + }, + { + "epoch": 4.077348066298343, + "grad_norm": 0.2697623670101166, + "learning_rate": 6.702936841775301e-05, + "loss": 1.8367, + "step": 13284 + }, + { + "epoch": 4.077655003069368, + "grad_norm": 0.21592749655246735, + "learning_rate": 6.702469494925309e-05, + "loss": 1.7467, + "step": 13285 + }, + { + "epoch": 4.077961939840393, + "grad_norm": 0.2612052261829376, + "learning_rate": 6.702002131250515e-05, + "loss": 1.7689, + "step": 13286 + }, + { + "epoch": 4.078268876611418, + "grad_norm": 0.3004797697067261, + "learning_rate": 6.701534750755539e-05, + "loss": 1.7586, + "step": 13287 + }, + { + "epoch": 4.078575813382443, + "grad_norm": 0.24615366756916046, + "learning_rate": 6.701067353444998e-05, + "loss": 1.7636, + "step": 13288 + }, + { + "epoch": 
4.078882750153468, + "grad_norm": 0.23401159048080444, + "learning_rate": 6.700599939323515e-05, + "loss": 1.8015, + "step": 13289 + }, + { + "epoch": 4.079189686924494, + "grad_norm": 0.24546295404434204, + "learning_rate": 6.700132508395705e-05, + "loss": 1.7606, + "step": 13290 + }, + { + "epoch": 4.079496623695519, + "grad_norm": 0.24664412438869476, + "learning_rate": 6.69966506066619e-05, + "loss": 1.7994, + "step": 13291 + }, + { + "epoch": 4.0798035604665435, + "grad_norm": 0.2780163288116455, + "learning_rate": 6.699197596139587e-05, + "loss": 1.7972, + "step": 13292 + }, + { + "epoch": 4.080110497237569, + "grad_norm": 0.2554188668727875, + "learning_rate": 6.698730114820517e-05, + "loss": 1.7928, + "step": 13293 + }, + { + "epoch": 4.080417434008594, + "grad_norm": 0.2471141666173935, + "learning_rate": 6.698262616713602e-05, + "loss": 1.7948, + "step": 13294 + }, + { + "epoch": 4.0807243707796195, + "grad_norm": 0.2556581199169159, + "learning_rate": 6.697795101823461e-05, + "loss": 1.7942, + "step": 13295 + }, + { + "epoch": 4.081031307550645, + "grad_norm": 0.24462421238422394, + "learning_rate": 6.697327570154712e-05, + "loss": 1.7336, + "step": 13296 + }, + { + "epoch": 4.08133824432167, + "grad_norm": 0.22378689050674438, + "learning_rate": 6.696860021711978e-05, + "loss": 1.7703, + "step": 13297 + }, + { + "epoch": 4.081645181092695, + "grad_norm": 0.23949933052062988, + "learning_rate": 6.69639245649988e-05, + "loss": 1.7651, + "step": 13298 + }, + { + "epoch": 4.08195211786372, + "grad_norm": 0.27751216292381287, + "learning_rate": 6.695924874523035e-05, + "loss": 1.7866, + "step": 13299 + }, + { + "epoch": 4.082259054634745, + "grad_norm": 0.22700226306915283, + "learning_rate": 6.695457275786068e-05, + "loss": 1.79, + "step": 13300 + }, + { + "epoch": 4.082565991405771, + "grad_norm": 0.2138090431690216, + "learning_rate": 6.694989660293598e-05, + "loss": 1.7882, + "step": 13301 + }, + { + "epoch": 4.082872928176796, + "grad_norm": 
0.2963469326496124, + "learning_rate": 6.694522028050246e-05, + "loss": 1.8779, + "step": 13302 + }, + { + "epoch": 4.08317986494782, + "grad_norm": 0.31833669543266296, + "learning_rate": 6.694054379060634e-05, + "loss": 1.7923, + "step": 13303 + }, + { + "epoch": 4.083486801718846, + "grad_norm": 0.27751585841178894, + "learning_rate": 6.693586713329385e-05, + "loss": 1.7557, + "step": 13304 + }, + { + "epoch": 4.083793738489871, + "grad_norm": 0.23790816962718964, + "learning_rate": 6.69311903086112e-05, + "loss": 1.7587, + "step": 13305 + }, + { + "epoch": 4.084100675260896, + "grad_norm": 0.24153777956962585, + "learning_rate": 6.692651331660458e-05, + "loss": 1.7573, + "step": 13306 + }, + { + "epoch": 4.084407612031922, + "grad_norm": 0.26607179641723633, + "learning_rate": 6.692183615732025e-05, + "loss": 1.7823, + "step": 13307 + }, + { + "epoch": 4.084714548802946, + "grad_norm": 0.26670268177986145, + "learning_rate": 6.691715883080442e-05, + "loss": 1.784, + "step": 13308 + }, + { + "epoch": 4.0850214855739715, + "grad_norm": 0.25980666279792786, + "learning_rate": 6.69124813371033e-05, + "loss": 1.797, + "step": 13309 + }, + { + "epoch": 4.085328422344997, + "grad_norm": 0.2805597484111786, + "learning_rate": 6.690780367626314e-05, + "loss": 1.8298, + "step": 13310 + }, + { + "epoch": 4.085635359116022, + "grad_norm": 0.27198413014411926, + "learning_rate": 6.690312584833012e-05, + "loss": 1.8104, + "step": 13311 + }, + { + "epoch": 4.0859422958870475, + "grad_norm": 0.2619116008281708, + "learning_rate": 6.689844785335054e-05, + "loss": 1.771, + "step": 13312 + }, + { + "epoch": 4.086249232658073, + "grad_norm": 0.22647863626480103, + "learning_rate": 6.689376969137057e-05, + "loss": 1.8114, + "step": 13313 + }, + { + "epoch": 4.086556169429097, + "grad_norm": 1.469475507736206, + "learning_rate": 6.68890913624365e-05, + "loss": 1.8796, + "step": 13314 + }, + { + "epoch": 4.086863106200123, + "grad_norm": 0.4577515423297882, + "learning_rate": 
6.68844128665945e-05, + "loss": 1.716, + "step": 13315 + }, + { + "epoch": 4.087170042971148, + "grad_norm": 0.5830543637275696, + "learning_rate": 6.687973420389085e-05, + "loss": 1.7692, + "step": 13316 + }, + { + "epoch": 4.087476979742173, + "grad_norm": 0.4404197037220001, + "learning_rate": 6.687505537437178e-05, + "loss": 1.7909, + "step": 13317 + }, + { + "epoch": 4.087783916513199, + "grad_norm": 0.31379908323287964, + "learning_rate": 6.68703763780835e-05, + "loss": 1.7957, + "step": 13318 + }, + { + "epoch": 4.088090853284223, + "grad_norm": 0.49588730931282043, + "learning_rate": 6.686569721507229e-05, + "loss": 1.7126, + "step": 13319 + }, + { + "epoch": 4.088397790055248, + "grad_norm": 0.3690234124660492, + "learning_rate": 6.686101788538437e-05, + "loss": 1.8233, + "step": 13320 + }, + { + "epoch": 4.088704726826274, + "grad_norm": 0.337310254573822, + "learning_rate": 6.685633838906598e-05, + "loss": 1.6886, + "step": 13321 + }, + { + "epoch": 4.089011663597299, + "grad_norm": 0.5164821147918701, + "learning_rate": 6.685165872616337e-05, + "loss": 1.7967, + "step": 13322 + }, + { + "epoch": 4.089318600368324, + "grad_norm": 0.36501309275627136, + "learning_rate": 6.68469788967228e-05, + "loss": 1.755, + "step": 13323 + }, + { + "epoch": 4.08962553713935, + "grad_norm": 0.35017216205596924, + "learning_rate": 6.684229890079052e-05, + "loss": 1.7595, + "step": 13324 + }, + { + "epoch": 4.089932473910374, + "grad_norm": 0.5622650980949402, + "learning_rate": 6.683761873841277e-05, + "loss": 1.7841, + "step": 13325 + }, + { + "epoch": 4.0902394106813995, + "grad_norm": 0.47010260820388794, + "learning_rate": 6.683293840963578e-05, + "loss": 1.7537, + "step": 13326 + }, + { + "epoch": 4.090546347452425, + "grad_norm": 0.25515374541282654, + "learning_rate": 6.682825791450584e-05, + "loss": 1.7692, + "step": 13327 + }, + { + "epoch": 4.09085328422345, + "grad_norm": 0.5063003897666931, + "learning_rate": 6.682357725306919e-05, + "loss": 1.7454, + "step": 
13328 + }, + { + "epoch": 4.0911602209944755, + "grad_norm": 0.4197622835636139, + "learning_rate": 6.681889642537209e-05, + "loss": 1.7792, + "step": 13329 + }, + { + "epoch": 4.0914671577655, + "grad_norm": 0.24038295447826385, + "learning_rate": 6.68142154314608e-05, + "loss": 1.7631, + "step": 13330 + }, + { + "epoch": 4.091774094536525, + "grad_norm": 0.42108532786369324, + "learning_rate": 6.680953427138159e-05, + "loss": 1.7784, + "step": 13331 + }, + { + "epoch": 4.092081031307551, + "grad_norm": 0.33729633688926697, + "learning_rate": 6.68048529451807e-05, + "loss": 1.8057, + "step": 13332 + }, + { + "epoch": 4.092387968078576, + "grad_norm": 0.31847241520881653, + "learning_rate": 6.68001714529044e-05, + "loss": 1.7375, + "step": 13333 + }, + { + "epoch": 4.092694904849601, + "grad_norm": 0.45276644825935364, + "learning_rate": 6.679548979459896e-05, + "loss": 1.7507, + "step": 13334 + }, + { + "epoch": 4.093001841620626, + "grad_norm": 0.3781665861606598, + "learning_rate": 6.679080797031065e-05, + "loss": 1.7718, + "step": 13335 + }, + { + "epoch": 4.093308778391651, + "grad_norm": 0.25868359208106995, + "learning_rate": 6.678612598008573e-05, + "loss": 1.8105, + "step": 13336 + }, + { + "epoch": 4.093615715162676, + "grad_norm": 0.32834702730178833, + "learning_rate": 6.678144382397048e-05, + "loss": 1.7883, + "step": 13337 + }, + { + "epoch": 4.093922651933702, + "grad_norm": 0.2830568253993988, + "learning_rate": 6.677676150201116e-05, + "loss": 1.7994, + "step": 13338 + }, + { + "epoch": 4.094229588704727, + "grad_norm": 0.219541534781456, + "learning_rate": 6.677207901425405e-05, + "loss": 1.7344, + "step": 13339 + }, + { + "epoch": 4.094536525475752, + "grad_norm": 0.2557326555252075, + "learning_rate": 6.676739636074542e-05, + "loss": 1.7734, + "step": 13340 + }, + { + "epoch": 4.094843462246777, + "grad_norm": 0.2741365432739258, + "learning_rate": 6.676271354153156e-05, + "loss": 1.7912, + "step": 13341 + }, + { + "epoch": 4.095150399017802, + 
"grad_norm": 0.31258970499038696, + "learning_rate": 6.675803055665874e-05, + "loss": 1.7798, + "step": 13342 + }, + { + "epoch": 4.0954573357888275, + "grad_norm": 0.30181947350502014, + "learning_rate": 6.675334740617322e-05, + "loss": 1.7746, + "step": 13343 + }, + { + "epoch": 4.095764272559853, + "grad_norm": 0.3000102937221527, + "learning_rate": 6.674866409012133e-05, + "loss": 1.7842, + "step": 13344 + }, + { + "epoch": 4.096071209330878, + "grad_norm": 0.22871005535125732, + "learning_rate": 6.674398060854931e-05, + "loss": 1.7473, + "step": 13345 + }, + { + "epoch": 4.096378146101903, + "grad_norm": 0.2700810432434082, + "learning_rate": 6.673929696150346e-05, + "loss": 1.7862, + "step": 13346 + }, + { + "epoch": 4.096685082872928, + "grad_norm": 0.27537551522254944, + "learning_rate": 6.673461314903007e-05, + "loss": 1.7843, + "step": 13347 + }, + { + "epoch": 4.096992019643953, + "grad_norm": 0.23700574040412903, + "learning_rate": 6.672992917117542e-05, + "loss": 1.765, + "step": 13348 + }, + { + "epoch": 4.097298956414979, + "grad_norm": 0.23331589996814728, + "learning_rate": 6.672524502798583e-05, + "loss": 1.7894, + "step": 13349 + }, + { + "epoch": 4.097605893186004, + "grad_norm": 0.28591978549957275, + "learning_rate": 6.672056071950753e-05, + "loss": 1.7736, + "step": 13350 + }, + { + "epoch": 4.097912829957028, + "grad_norm": 0.3000452518463135, + "learning_rate": 6.671587624578685e-05, + "loss": 1.7635, + "step": 13351 + }, + { + "epoch": 4.098219766728054, + "grad_norm": 0.21877998113632202, + "learning_rate": 6.67111916068701e-05, + "loss": 1.7225, + "step": 13352 + }, + { + "epoch": 4.098526703499079, + "grad_norm": 0.2598817050457001, + "learning_rate": 6.670650680280358e-05, + "loss": 1.6874, + "step": 13353 + }, + { + "epoch": 4.098833640270104, + "grad_norm": 0.3063203692436218, + "learning_rate": 6.670182183363353e-05, + "loss": 1.7821, + "step": 13354 + }, + { + "epoch": 4.09914057704113, + "grad_norm": 0.2328508347272873, + 
"learning_rate": 6.66971366994063e-05, + "loss": 1.788, + "step": 13355 + }, + { + "epoch": 4.099447513812155, + "grad_norm": 0.33936765789985657, + "learning_rate": 6.669245140016817e-05, + "loss": 1.8159, + "step": 13356 + }, + { + "epoch": 4.0997544505831796, + "grad_norm": 0.27464553713798523, + "learning_rate": 6.668776593596546e-05, + "loss": 1.7371, + "step": 13357 + }, + { + "epoch": 4.100061387354205, + "grad_norm": 0.24255812168121338, + "learning_rate": 6.668308030684447e-05, + "loss": 1.7993, + "step": 13358 + }, + { + "epoch": 4.10036832412523, + "grad_norm": 0.27203628420829773, + "learning_rate": 6.667839451285149e-05, + "loss": 1.8253, + "step": 13359 + }, + { + "epoch": 4.100675260896256, + "grad_norm": 0.2503862679004669, + "learning_rate": 6.667370855403286e-05, + "loss": 1.7927, + "step": 13360 + }, + { + "epoch": 4.100982197667281, + "grad_norm": 0.2616904377937317, + "learning_rate": 6.666902243043486e-05, + "loss": 1.8226, + "step": 13361 + }, + { + "epoch": 4.101289134438305, + "grad_norm": 0.26707521080970764, + "learning_rate": 6.666433614210379e-05, + "loss": 1.8485, + "step": 13362 + }, + { + "epoch": 4.101596071209331, + "grad_norm": 0.2427528202533722, + "learning_rate": 6.6659649689086e-05, + "loss": 1.7387, + "step": 13363 + }, + { + "epoch": 4.101903007980356, + "grad_norm": 0.2319549173116684, + "learning_rate": 6.66549630714278e-05, + "loss": 1.7396, + "step": 13364 + }, + { + "epoch": 4.102209944751381, + "grad_norm": 0.2248002141714096, + "learning_rate": 6.665027628917548e-05, + "loss": 1.7817, + "step": 13365 + }, + { + "epoch": 4.102516881522407, + "grad_norm": 0.21929535269737244, + "learning_rate": 6.664558934237538e-05, + "loss": 1.7478, + "step": 13366 + }, + { + "epoch": 4.102823818293431, + "grad_norm": 0.21144583821296692, + "learning_rate": 6.66409022310738e-05, + "loss": 1.7602, + "step": 13367 + }, + { + "epoch": 4.1031307550644565, + "grad_norm": 0.21984660625457764, + "learning_rate": 6.663621495531707e-05, + 
"loss": 1.7541, + "step": 13368 + }, + { + "epoch": 4.103437691835482, + "grad_norm": 0.2075357735157013, + "learning_rate": 6.663152751515152e-05, + "loss": 1.7362, + "step": 13369 + }, + { + "epoch": 4.103744628606507, + "grad_norm": 0.23316961526870728, + "learning_rate": 6.662683991062347e-05, + "loss": 1.8273, + "step": 13370 + }, + { + "epoch": 4.1040515653775325, + "grad_norm": 0.23142337799072266, + "learning_rate": 6.662215214177922e-05, + "loss": 1.7543, + "step": 13371 + }, + { + "epoch": 4.104358502148558, + "grad_norm": 0.24335260689258575, + "learning_rate": 6.661746420866515e-05, + "loss": 1.8328, + "step": 13372 + }, + { + "epoch": 4.104665438919582, + "grad_norm": 0.2440192997455597, + "learning_rate": 6.661277611132753e-05, + "loss": 1.8114, + "step": 13373 + }, + { + "epoch": 4.104972375690608, + "grad_norm": 0.252808541059494, + "learning_rate": 6.660808784981273e-05, + "loss": 1.8556, + "step": 13374 + }, + { + "epoch": 4.105279312461633, + "grad_norm": 0.24564477801322937, + "learning_rate": 6.660339942416708e-05, + "loss": 1.8231, + "step": 13375 + }, + { + "epoch": 4.105586249232658, + "grad_norm": 0.2371874898672104, + "learning_rate": 6.65987108344369e-05, + "loss": 1.7763, + "step": 13376 + }, + { + "epoch": 4.105893186003684, + "grad_norm": 0.22882802784442902, + "learning_rate": 6.659402208066854e-05, + "loss": 1.7388, + "step": 13377 + }, + { + "epoch": 4.106200122774708, + "grad_norm": 0.24857540428638458, + "learning_rate": 6.658933316290832e-05, + "loss": 1.7735, + "step": 13378 + }, + { + "epoch": 4.106507059545733, + "grad_norm": 0.22574029862880707, + "learning_rate": 6.658464408120257e-05, + "loss": 1.7403, + "step": 13379 + }, + { + "epoch": 4.106813996316759, + "grad_norm": 0.24944272637367249, + "learning_rate": 6.657995483559767e-05, + "loss": 1.7827, + "step": 13380 + }, + { + "epoch": 4.107120933087784, + "grad_norm": 0.27386224269866943, + "learning_rate": 6.657526542613992e-05, + "loss": 1.7673, + "step": 13381 + }, + { 
+ "epoch": 4.107427869858809, + "grad_norm": 0.29222097992897034, + "learning_rate": 6.65705758528757e-05, + "loss": 1.7958, + "step": 13382 + }, + { + "epoch": 4.107734806629834, + "grad_norm": 0.2471150904893875, + "learning_rate": 6.656588611585133e-05, + "loss": 1.7706, + "step": 13383 + }, + { + "epoch": 4.108041743400859, + "grad_norm": 0.289316862821579, + "learning_rate": 6.656119621511317e-05, + "loss": 1.7828, + "step": 13384 + }, + { + "epoch": 4.1083486801718845, + "grad_norm": 0.36710497736930847, + "learning_rate": 6.655650615070756e-05, + "loss": 1.712, + "step": 13385 + }, + { + "epoch": 4.10865561694291, + "grad_norm": 0.2999880611896515, + "learning_rate": 6.655181592268084e-05, + "loss": 1.7711, + "step": 13386 + }, + { + "epoch": 4.108962553713935, + "grad_norm": 0.332011342048645, + "learning_rate": 6.654712553107939e-05, + "loss": 1.907, + "step": 13387 + }, + { + "epoch": 4.1092694904849605, + "grad_norm": 0.43125995993614197, + "learning_rate": 6.654243497594953e-05, + "loss": 1.7819, + "step": 13388 + }, + { + "epoch": 4.109576427255985, + "grad_norm": 0.33719149231910706, + "learning_rate": 6.653774425733765e-05, + "loss": 1.797, + "step": 13389 + }, + { + "epoch": 4.10988336402701, + "grad_norm": 0.23091599345207214, + "learning_rate": 6.653305337529006e-05, + "loss": 1.7384, + "step": 13390 + }, + { + "epoch": 4.110190300798036, + "grad_norm": 0.4283982515335083, + "learning_rate": 6.652836232985317e-05, + "loss": 1.8284, + "step": 13391 + }, + { + "epoch": 4.110497237569061, + "grad_norm": 0.43575870990753174, + "learning_rate": 6.652367112107332e-05, + "loss": 1.7235, + "step": 13392 + }, + { + "epoch": 4.110804174340086, + "grad_norm": 0.246877059340477, + "learning_rate": 6.651897974899685e-05, + "loss": 1.7174, + "step": 13393 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.36063629388809204, + "learning_rate": 6.651428821367015e-05, + "loss": 1.8064, + "step": 13394 + }, + { + "epoch": 4.111418047882136, + "grad_norm": 
0.4454420804977417, + "learning_rate": 6.650959651513957e-05, + "loss": 1.7575, + "step": 13395 + }, + { + "epoch": 4.111724984653161, + "grad_norm": 0.2788856327533722, + "learning_rate": 6.650490465345149e-05, + "loss": 1.7696, + "step": 13396 + }, + { + "epoch": 4.112031921424187, + "grad_norm": 0.40281879901885986, + "learning_rate": 6.650021262865225e-05, + "loss": 1.8368, + "step": 13397 + }, + { + "epoch": 4.112338858195212, + "grad_norm": 0.5151103138923645, + "learning_rate": 6.649552044078825e-05, + "loss": 1.8224, + "step": 13398 + }, + { + "epoch": 4.112645794966237, + "grad_norm": 0.29390639066696167, + "learning_rate": 6.649082808990586e-05, + "loss": 1.7846, + "step": 13399 + }, + { + "epoch": 4.112952731737262, + "grad_norm": 0.3061942458152771, + "learning_rate": 6.648613557605142e-05, + "loss": 1.7954, + "step": 13400 + }, + { + "epoch": 4.113259668508287, + "grad_norm": 0.47628748416900635, + "learning_rate": 6.648144289927132e-05, + "loss": 1.7782, + "step": 13401 + }, + { + "epoch": 4.1135666052793125, + "grad_norm": 0.4299588203430176, + "learning_rate": 6.647675005961197e-05, + "loss": 1.7459, + "step": 13402 + }, + { + "epoch": 4.113873542050338, + "grad_norm": 0.24556589126586914, + "learning_rate": 6.64720570571197e-05, + "loss": 1.753, + "step": 13403 + }, + { + "epoch": 4.114180478821363, + "grad_norm": 0.29620522260665894, + "learning_rate": 6.646736389184092e-05, + "loss": 1.773, + "step": 13404 + }, + { + "epoch": 4.114487415592388, + "grad_norm": 0.37710070610046387, + "learning_rate": 6.646267056382199e-05, + "loss": 1.8389, + "step": 13405 + }, + { + "epoch": 4.114794352363413, + "grad_norm": 0.2562984824180603, + "learning_rate": 6.64579770731093e-05, + "loss": 1.7905, + "step": 13406 + }, + { + "epoch": 4.115101289134438, + "grad_norm": 0.3999946713447571, + "learning_rate": 6.645328341974924e-05, + "loss": 1.7734, + "step": 13407 + }, + { + "epoch": 4.115408225905464, + "grad_norm": 0.36087217926979065, + "learning_rate": 
6.644858960378817e-05, + "loss": 1.801, + "step": 13408 + }, + { + "epoch": 4.115715162676489, + "grad_norm": 0.2520254850387573, + "learning_rate": 6.644389562527251e-05, + "loss": 1.7394, + "step": 13409 + }, + { + "epoch": 4.116022099447513, + "grad_norm": 0.4321835935115814, + "learning_rate": 6.643920148424864e-05, + "loss": 1.8091, + "step": 13410 + }, + { + "epoch": 4.116329036218539, + "grad_norm": 0.40900173783302307, + "learning_rate": 6.643450718076294e-05, + "loss": 1.8198, + "step": 13411 + }, + { + "epoch": 4.116635972989564, + "grad_norm": 0.23693956434726715, + "learning_rate": 6.642981271486182e-05, + "loss": 1.6807, + "step": 13412 + }, + { + "epoch": 4.116942909760589, + "grad_norm": 0.33526891469955444, + "learning_rate": 6.642511808659164e-05, + "loss": 1.8673, + "step": 13413 + }, + { + "epoch": 4.117249846531615, + "grad_norm": 0.4037325382232666, + "learning_rate": 6.642042329599883e-05, + "loss": 1.743, + "step": 13414 + }, + { + "epoch": 4.11755678330264, + "grad_norm": 0.25629740953445435, + "learning_rate": 6.641572834312975e-05, + "loss": 1.6904, + "step": 13415 + }, + { + "epoch": 4.1178637200736645, + "grad_norm": 0.29203253984451294, + "learning_rate": 6.641103322803087e-05, + "loss": 1.7811, + "step": 13416 + }, + { + "epoch": 4.11817065684469, + "grad_norm": 0.423926442861557, + "learning_rate": 6.64063379507485e-05, + "loss": 1.7341, + "step": 13417 + }, + { + "epoch": 4.118477593615715, + "grad_norm": 0.29561251401901245, + "learning_rate": 6.64016425113291e-05, + "loss": 1.7915, + "step": 13418 + }, + { + "epoch": 4.1187845303867405, + "grad_norm": 0.2536832094192505, + "learning_rate": 6.639694690981903e-05, + "loss": 1.7628, + "step": 13419 + }, + { + "epoch": 4.119091467157766, + "grad_norm": 0.2931392192840576, + "learning_rate": 6.639225114626475e-05, + "loss": 1.7877, + "step": 13420 + }, + { + "epoch": 4.11939840392879, + "grad_norm": 0.2219499796628952, + "learning_rate": 6.638755522071263e-05, + "loss": 1.7183, + 
"step": 13421 + }, + { + "epoch": 4.119705340699816, + "grad_norm": 0.2951931953430176, + "learning_rate": 6.638285913320908e-05, + "loss": 1.7983, + "step": 13422 + }, + { + "epoch": 4.120012277470841, + "grad_norm": 0.3495960533618927, + "learning_rate": 6.63781628838005e-05, + "loss": 1.7531, + "step": 13423 + }, + { + "epoch": 4.120319214241866, + "grad_norm": 0.2389262616634369, + "learning_rate": 6.637346647253333e-05, + "loss": 1.7454, + "step": 13424 + }, + { + "epoch": 4.120626151012892, + "grad_norm": 0.28729167580604553, + "learning_rate": 6.636876989945395e-05, + "loss": 1.8105, + "step": 13425 + }, + { + "epoch": 4.120933087783916, + "grad_norm": 0.2620082199573517, + "learning_rate": 6.636407316460882e-05, + "loss": 1.7948, + "step": 13426 + }, + { + "epoch": 4.121240024554941, + "grad_norm": 0.2694189250469208, + "learning_rate": 6.635937626804432e-05, + "loss": 1.809, + "step": 13427 + }, + { + "epoch": 4.121546961325967, + "grad_norm": 0.2660866379737854, + "learning_rate": 6.635467920980687e-05, + "loss": 1.7431, + "step": 13428 + }, + { + "epoch": 4.121853898096992, + "grad_norm": 0.2579907774925232, + "learning_rate": 6.634998198994289e-05, + "loss": 1.7941, + "step": 13429 + }, + { + "epoch": 4.122160834868017, + "grad_norm": 0.28349989652633667, + "learning_rate": 6.634528460849881e-05, + "loss": 1.8142, + "step": 13430 + }, + { + "epoch": 4.122467771639043, + "grad_norm": 0.28716522455215454, + "learning_rate": 6.634058706552104e-05, + "loss": 1.7496, + "step": 13431 + }, + { + "epoch": 4.122774708410067, + "grad_norm": 0.23228077590465546, + "learning_rate": 6.633588936105601e-05, + "loss": 1.7399, + "step": 13432 + }, + { + "epoch": 4.1230816451810925, + "grad_norm": 0.3649841248989105, + "learning_rate": 6.633119149515017e-05, + "loss": 1.7696, + "step": 13433 + }, + { + "epoch": 4.123388581952118, + "grad_norm": 0.2757830321788788, + "learning_rate": 6.632649346784992e-05, + "loss": 1.8329, + "step": 13434 + }, + { + "epoch": 
4.123695518723143, + "grad_norm": 0.28163692355155945, + "learning_rate": 6.632179527920167e-05, + "loss": 1.7761, + "step": 13435 + }, + { + "epoch": 4.1240024554941686, + "grad_norm": 0.3453187048435211, + "learning_rate": 6.631709692925188e-05, + "loss": 1.7843, + "step": 13436 + }, + { + "epoch": 4.124309392265193, + "grad_norm": 0.2792697250843048, + "learning_rate": 6.631239841804698e-05, + "loss": 1.7889, + "step": 13437 + }, + { + "epoch": 4.124616329036218, + "grad_norm": 0.21881693601608276, + "learning_rate": 6.630769974563339e-05, + "loss": 1.8015, + "step": 13438 + }, + { + "epoch": 4.124923265807244, + "grad_norm": 0.4464910328388214, + "learning_rate": 6.630300091205756e-05, + "loss": 1.7851, + "step": 13439 + }, + { + "epoch": 4.125230202578269, + "grad_norm": 0.40191107988357544, + "learning_rate": 6.629830191736591e-05, + "loss": 1.8608, + "step": 13440 + }, + { + "epoch": 4.125537139349294, + "grad_norm": 0.2809060513973236, + "learning_rate": 6.62936027616049e-05, + "loss": 1.7374, + "step": 13441 + }, + { + "epoch": 4.12584407612032, + "grad_norm": 0.24980643391609192, + "learning_rate": 6.628890344482095e-05, + "loss": 1.8152, + "step": 13442 + }, + { + "epoch": 4.126151012891344, + "grad_norm": 0.24538342654705048, + "learning_rate": 6.62842039670605e-05, + "loss": 1.7687, + "step": 13443 + }, + { + "epoch": 4.1264579496623695, + "grad_norm": 0.24684634804725647, + "learning_rate": 6.627950432837002e-05, + "loss": 1.787, + "step": 13444 + }, + { + "epoch": 4.126764886433395, + "grad_norm": 0.22724607586860657, + "learning_rate": 6.627480452879593e-05, + "loss": 1.7871, + "step": 13445 + }, + { + "epoch": 4.12707182320442, + "grad_norm": 0.24724406003952026, + "learning_rate": 6.627010456838469e-05, + "loss": 1.7524, + "step": 13446 + }, + { + "epoch": 4.1273787599754455, + "grad_norm": 0.24219536781311035, + "learning_rate": 6.626540444718274e-05, + "loss": 1.7754, + "step": 13447 + }, + { + "epoch": 4.12768569674647, + "grad_norm": 
0.24857915937900543, + "learning_rate": 6.626070416523652e-05, + "loss": 1.7839, + "step": 13448 + }, + { + "epoch": 4.127992633517495, + "grad_norm": 0.2639105021953583, + "learning_rate": 6.625600372259248e-05, + "loss": 1.7546, + "step": 13449 + }, + { + "epoch": 4.128299570288521, + "grad_norm": 0.23598137497901917, + "learning_rate": 6.62513031192971e-05, + "loss": 1.7957, + "step": 13450 + }, + { + "epoch": 4.128606507059546, + "grad_norm": 0.3038909137248993, + "learning_rate": 6.624660235539682e-05, + "loss": 1.8117, + "step": 13451 + }, + { + "epoch": 4.128913443830571, + "grad_norm": 0.27671241760253906, + "learning_rate": 6.624190143093809e-05, + "loss": 1.729, + "step": 13452 + }, + { + "epoch": 4.129220380601596, + "grad_norm": 0.24638360738754272, + "learning_rate": 6.623720034596735e-05, + "loss": 1.7414, + "step": 13453 + }, + { + "epoch": 4.129527317372621, + "grad_norm": 0.24073924124240875, + "learning_rate": 6.623249910053111e-05, + "loss": 1.8046, + "step": 13454 + }, + { + "epoch": 4.129834254143646, + "grad_norm": 0.29734376072883606, + "learning_rate": 6.622779769467578e-05, + "loss": 1.8336, + "step": 13455 + }, + { + "epoch": 4.130141190914672, + "grad_norm": 0.23182810842990875, + "learning_rate": 6.622309612844785e-05, + "loss": 1.7742, + "step": 13456 + }, + { + "epoch": 4.130448127685697, + "grad_norm": 0.2179390788078308, + "learning_rate": 6.621839440189378e-05, + "loss": 1.7656, + "step": 13457 + }, + { + "epoch": 4.1307550644567215, + "grad_norm": 0.21389013528823853, + "learning_rate": 6.621369251506002e-05, + "loss": 1.7504, + "step": 13458 + }, + { + "epoch": 4.131062001227747, + "grad_norm": 0.22306203842163086, + "learning_rate": 6.620899046799305e-05, + "loss": 1.7573, + "step": 13459 + }, + { + "epoch": 4.131368937998772, + "grad_norm": 0.2699708938598633, + "learning_rate": 6.620428826073934e-05, + "loss": 1.7419, + "step": 13460 + }, + { + "epoch": 4.1316758747697975, + "grad_norm": 0.34087565541267395, + "learning_rate": 
6.619958589334534e-05, + "loss": 1.7545, + "step": 13461 + }, + { + "epoch": 4.131982811540823, + "grad_norm": 0.2934977412223816, + "learning_rate": 6.619488336585755e-05, + "loss": 1.7611, + "step": 13462 + }, + { + "epoch": 4.132289748311848, + "grad_norm": 0.22545567154884338, + "learning_rate": 6.619018067832243e-05, + "loss": 1.7562, + "step": 13463 + }, + { + "epoch": 4.132596685082873, + "grad_norm": 0.23334743082523346, + "learning_rate": 6.618547783078647e-05, + "loss": 1.7784, + "step": 13464 + }, + { + "epoch": 4.132903621853898, + "grad_norm": 0.22466403245925903, + "learning_rate": 6.618077482329612e-05, + "loss": 1.7277, + "step": 13465 + }, + { + "epoch": 4.133210558624923, + "grad_norm": 0.23504197597503662, + "learning_rate": 6.617607165589785e-05, + "loss": 1.7983, + "step": 13466 + }, + { + "epoch": 4.133517495395949, + "grad_norm": 0.2500833570957184, + "learning_rate": 6.617136832863819e-05, + "loss": 1.7826, + "step": 13467 + }, + { + "epoch": 4.133824432166974, + "grad_norm": 0.22398658096790314, + "learning_rate": 6.616666484156357e-05, + "loss": 1.7281, + "step": 13468 + }, + { + "epoch": 4.134131368937998, + "grad_norm": 0.2537873089313507, + "learning_rate": 6.616196119472052e-05, + "loss": 1.7598, + "step": 13469 + }, + { + "epoch": 4.134438305709024, + "grad_norm": 0.26881173253059387, + "learning_rate": 6.615725738815546e-05, + "loss": 1.8161, + "step": 13470 + }, + { + "epoch": 4.134745242480049, + "grad_norm": 0.3311346471309662, + "learning_rate": 6.615255342191492e-05, + "loss": 1.7954, + "step": 13471 + }, + { + "epoch": 4.135052179251074, + "grad_norm": 0.2562953233718872, + "learning_rate": 6.614784929604539e-05, + "loss": 1.7284, + "step": 13472 + }, + { + "epoch": 4.1353591160221, + "grad_norm": 0.2563154101371765, + "learning_rate": 6.614314501059334e-05, + "loss": 1.7995, + "step": 13473 + }, + { + "epoch": 4.135666052793125, + "grad_norm": 0.24861161410808563, + "learning_rate": 6.613844056560527e-05, + "loss": 1.7589, + 
"step": 13474 + }, + { + "epoch": 4.1359729895641495, + "grad_norm": 0.23815487325191498, + "learning_rate": 6.613373596112769e-05, + "loss": 1.6906, + "step": 13475 + }, + { + "epoch": 4.136279926335175, + "grad_norm": 0.25394049286842346, + "learning_rate": 6.612903119720705e-05, + "loss": 1.781, + "step": 13476 + }, + { + "epoch": 4.1365868631062, + "grad_norm": 0.24501466751098633, + "learning_rate": 6.612432627388988e-05, + "loss": 1.797, + "step": 13477 + }, + { + "epoch": 4.1368937998772255, + "grad_norm": 0.24909707903862, + "learning_rate": 6.611962119122267e-05, + "loss": 1.7643, + "step": 13478 + }, + { + "epoch": 4.137200736648251, + "grad_norm": 0.24954476952552795, + "learning_rate": 6.611491594925192e-05, + "loss": 1.8219, + "step": 13479 + }, + { + "epoch": 4.137507673419275, + "grad_norm": 0.30572372674942017, + "learning_rate": 6.611021054802411e-05, + "loss": 1.8039, + "step": 13480 + }, + { + "epoch": 4.137814610190301, + "grad_norm": 0.27466365694999695, + "learning_rate": 6.610550498758577e-05, + "loss": 1.6945, + "step": 13481 + }, + { + "epoch": 4.138121546961326, + "grad_norm": 0.2614271640777588, + "learning_rate": 6.610079926798339e-05, + "loss": 1.8648, + "step": 13482 + }, + { + "epoch": 4.138428483732351, + "grad_norm": 0.23645827174186707, + "learning_rate": 6.609609338926346e-05, + "loss": 1.7424, + "step": 13483 + }, + { + "epoch": 4.138735420503377, + "grad_norm": 0.24473626911640167, + "learning_rate": 6.609138735147253e-05, + "loss": 1.8036, + "step": 13484 + }, + { + "epoch": 4.139042357274401, + "grad_norm": 0.2472417950630188, + "learning_rate": 6.608668115465706e-05, + "loss": 1.794, + "step": 13485 + }, + { + "epoch": 4.139349294045426, + "grad_norm": 0.25330284237861633, + "learning_rate": 6.608197479886358e-05, + "loss": 1.8052, + "step": 13486 + }, + { + "epoch": 4.139656230816452, + "grad_norm": 0.24279309809207916, + "learning_rate": 6.60772682841386e-05, + "loss": 1.7375, + "step": 13487 + }, + { + "epoch": 
4.139963167587477, + "grad_norm": 0.22319461405277252, + "learning_rate": 6.607256161052862e-05, + "loss": 1.7696, + "step": 13488 + }, + { + "epoch": 4.140270104358502, + "grad_norm": 0.25261563062667847, + "learning_rate": 6.606785477808017e-05, + "loss": 1.7646, + "step": 13489 + }, + { + "epoch": 4.140577041129528, + "grad_norm": 0.3127744793891907, + "learning_rate": 6.606314778683977e-05, + "loss": 1.7899, + "step": 13490 + }, + { + "epoch": 4.140883977900552, + "grad_norm": 0.3550816774368286, + "learning_rate": 6.605844063685392e-05, + "loss": 1.7971, + "step": 13491 + }, + { + "epoch": 4.1411909146715775, + "grad_norm": 0.20977813005447388, + "learning_rate": 6.605373332816916e-05, + "loss": 1.7416, + "step": 13492 + }, + { + "epoch": 4.141497851442603, + "grad_norm": 0.26593849062919617, + "learning_rate": 6.6049025860832e-05, + "loss": 1.7586, + "step": 13493 + }, + { + "epoch": 4.141804788213628, + "grad_norm": 0.2452937364578247, + "learning_rate": 6.604431823488893e-05, + "loss": 1.757, + "step": 13494 + }, + { + "epoch": 4.1421117249846535, + "grad_norm": 0.21029168367385864, + "learning_rate": 6.603961045038652e-05, + "loss": 1.7665, + "step": 13495 + }, + { + "epoch": 4.142418661755678, + "grad_norm": 0.2396312952041626, + "learning_rate": 6.603490250737128e-05, + "loss": 1.7609, + "step": 13496 + }, + { + "epoch": 4.142725598526703, + "grad_norm": 0.23266808688640594, + "learning_rate": 6.603019440588975e-05, + "loss": 1.7893, + "step": 13497 + }, + { + "epoch": 4.143032535297729, + "grad_norm": 0.25235217809677124, + "learning_rate": 6.602548614598842e-05, + "loss": 1.7465, + "step": 13498 + }, + { + "epoch": 4.143339472068754, + "grad_norm": 0.22944024205207825, + "learning_rate": 6.602077772771386e-05, + "loss": 1.7052, + "step": 13499 + }, + { + "epoch": 4.143646408839779, + "grad_norm": 0.2116660475730896, + "learning_rate": 6.601606915111257e-05, + "loss": 1.7042, + "step": 13500 + }, + { + "epoch": 4.143953345610804, + "grad_norm": 
0.21777184307575226, + "learning_rate": 6.601136041623111e-05, + "loss": 1.7938, + "step": 13501 + }, + { + "epoch": 4.144260282381829, + "grad_norm": 0.23663075268268585, + "learning_rate": 6.600665152311601e-05, + "loss": 1.7475, + "step": 13502 + }, + { + "epoch": 4.144567219152854, + "grad_norm": 0.20644642412662506, + "learning_rate": 6.600194247181377e-05, + "loss": 1.7992, + "step": 13503 + }, + { + "epoch": 4.14487415592388, + "grad_norm": 0.21479010581970215, + "learning_rate": 6.599723326237098e-05, + "loss": 1.7877, + "step": 13504 + }, + { + "epoch": 4.145181092694905, + "grad_norm": 0.2266562283039093, + "learning_rate": 6.599252389483413e-05, + "loss": 1.8097, + "step": 13505 + }, + { + "epoch": 4.14548802946593, + "grad_norm": 0.2053738683462143, + "learning_rate": 6.59878143692498e-05, + "loss": 1.6878, + "step": 13506 + }, + { + "epoch": 4.145794966236955, + "grad_norm": 0.19583995640277863, + "learning_rate": 6.598310468566452e-05, + "loss": 1.7547, + "step": 13507 + }, + { + "epoch": 4.14610190300798, + "grad_norm": 0.23421542346477509, + "learning_rate": 6.597839484412484e-05, + "loss": 1.7926, + "step": 13508 + }, + { + "epoch": 4.1464088397790055, + "grad_norm": 0.24575260281562805, + "learning_rate": 6.597368484467728e-05, + "loss": 1.7311, + "step": 13509 + }, + { + "epoch": 4.146715776550031, + "grad_norm": 0.27519574761390686, + "learning_rate": 6.596897468736842e-05, + "loss": 1.7858, + "step": 13510 + }, + { + "epoch": 4.147022713321056, + "grad_norm": 0.26434022188186646, + "learning_rate": 6.596426437224477e-05, + "loss": 1.7387, + "step": 13511 + }, + { + "epoch": 4.147329650092081, + "grad_norm": 0.2192772775888443, + "learning_rate": 6.595955389935291e-05, + "loss": 1.7565, + "step": 13512 + }, + { + "epoch": 4.147636586863106, + "grad_norm": 0.21047350764274597, + "learning_rate": 6.595484326873938e-05, + "loss": 1.7234, + "step": 13513 + }, + { + "epoch": 4.147943523634131, + "grad_norm": 0.22838951647281647, + "learning_rate": 
6.595013248045075e-05, + "loss": 1.8205, + "step": 13514 + }, + { + "epoch": 4.148250460405157, + "grad_norm": 0.3467923402786255, + "learning_rate": 6.594542153453356e-05, + "loss": 1.7973, + "step": 13515 + }, + { + "epoch": 4.148557397176182, + "grad_norm": 0.241237074136734, + "learning_rate": 6.594071043103438e-05, + "loss": 1.7764, + "step": 13516 + }, + { + "epoch": 4.148864333947207, + "grad_norm": 0.22543516755104065, + "learning_rate": 6.593599916999973e-05, + "loss": 1.7528, + "step": 13517 + }, + { + "epoch": 4.149171270718232, + "grad_norm": 0.24590276181697845, + "learning_rate": 6.593128775147623e-05, + "loss": 1.7422, + "step": 13518 + }, + { + "epoch": 4.149478207489257, + "grad_norm": 0.2434391975402832, + "learning_rate": 6.592657617551038e-05, + "loss": 1.7523, + "step": 13519 + }, + { + "epoch": 4.149785144260282, + "grad_norm": 0.23169009387493134, + "learning_rate": 6.592186444214877e-05, + "loss": 1.8158, + "step": 13520 + }, + { + "epoch": 4.150092081031308, + "grad_norm": 0.2217840999364853, + "learning_rate": 6.591715255143798e-05, + "loss": 1.7487, + "step": 13521 + }, + { + "epoch": 4.150399017802333, + "grad_norm": 0.2405092418193817, + "learning_rate": 6.591244050342454e-05, + "loss": 1.7726, + "step": 13522 + }, + { + "epoch": 4.150705954573358, + "grad_norm": 0.29432612657546997, + "learning_rate": 6.590772829815504e-05, + "loss": 1.7841, + "step": 13523 + }, + { + "epoch": 4.151012891344383, + "grad_norm": 0.2708737850189209, + "learning_rate": 6.590301593567605e-05, + "loss": 1.8551, + "step": 13524 + }, + { + "epoch": 4.151319828115408, + "grad_norm": 0.26643216609954834, + "learning_rate": 6.589830341603413e-05, + "loss": 1.7697, + "step": 13525 + }, + { + "epoch": 4.151626764886434, + "grad_norm": 0.3672652840614319, + "learning_rate": 6.589359073927587e-05, + "loss": 1.8292, + "step": 13526 + }, + { + "epoch": 4.151933701657459, + "grad_norm": 0.2413325160741806, + "learning_rate": 6.588887790544782e-05, + "loss": 1.7514, + 
"step": 13527 + }, + { + "epoch": 4.152240638428483, + "grad_norm": 0.3248155117034912, + "learning_rate": 6.588416491459657e-05, + "loss": 1.7437, + "step": 13528 + }, + { + "epoch": 4.152547575199509, + "grad_norm": 0.40951836109161377, + "learning_rate": 6.587945176676869e-05, + "loss": 1.7779, + "step": 13529 + }, + { + "epoch": 4.152854511970534, + "grad_norm": 0.23874351382255554, + "learning_rate": 6.587473846201075e-05, + "loss": 1.8343, + "step": 13530 + }, + { + "epoch": 4.153161448741559, + "grad_norm": 0.4535207450389862, + "learning_rate": 6.587002500036936e-05, + "loss": 1.8301, + "step": 13531 + }, + { + "epoch": 4.153468385512585, + "grad_norm": 0.458003968000412, + "learning_rate": 6.586531138189108e-05, + "loss": 1.7053, + "step": 13532 + }, + { + "epoch": 4.153775322283609, + "grad_norm": 0.24350887537002563, + "learning_rate": 6.586059760662248e-05, + "loss": 1.7642, + "step": 13533 + }, + { + "epoch": 4.1540822590546345, + "grad_norm": 0.46951553225517273, + "learning_rate": 6.585588367461017e-05, + "loss": 1.7345, + "step": 13534 + }, + { + "epoch": 4.15438919582566, + "grad_norm": 0.5524527430534363, + "learning_rate": 6.585116958590072e-05, + "loss": 1.7677, + "step": 13535 + }, + { + "epoch": 4.154696132596685, + "grad_norm": 0.2887112498283386, + "learning_rate": 6.584645534054072e-05, + "loss": 1.7704, + "step": 13536 + }, + { + "epoch": 4.1550030693677105, + "grad_norm": 0.36243724822998047, + "learning_rate": 6.584174093857675e-05, + "loss": 1.8133, + "step": 13537 + }, + { + "epoch": 4.155310006138736, + "grad_norm": 0.3869550824165344, + "learning_rate": 6.583702638005543e-05, + "loss": 1.7253, + "step": 13538 + }, + { + "epoch": 4.15561694290976, + "grad_norm": 0.25859662890434265, + "learning_rate": 6.583231166502333e-05, + "loss": 1.7683, + "step": 13539 + }, + { + "epoch": 4.155923879680786, + "grad_norm": 0.3011144995689392, + "learning_rate": 6.582759679352704e-05, + "loss": 1.7139, + "step": 13540 + }, + { + "epoch": 
4.156230816451811, + "grad_norm": 0.38033372163772583, + "learning_rate": 6.582288176561316e-05, + "loss": 1.8182, + "step": 13541 + }, + { + "epoch": 4.156537753222836, + "grad_norm": 0.2224060595035553, + "learning_rate": 6.581816658132829e-05, + "loss": 1.7527, + "step": 13542 + }, + { + "epoch": 4.156844689993862, + "grad_norm": 0.4147234261035919, + "learning_rate": 6.581345124071903e-05, + "loss": 1.7339, + "step": 13543 + }, + { + "epoch": 4.157151626764886, + "grad_norm": 0.45334625244140625, + "learning_rate": 6.580873574383198e-05, + "loss": 1.8166, + "step": 13544 + }, + { + "epoch": 4.157458563535911, + "grad_norm": 0.3050530254840851, + "learning_rate": 6.580402009071372e-05, + "loss": 1.7967, + "step": 13545 + }, + { + "epoch": 4.157765500306937, + "grad_norm": 0.25901293754577637, + "learning_rate": 6.579930428141088e-05, + "loss": 1.7806, + "step": 13546 + }, + { + "epoch": 4.158072437077962, + "grad_norm": 0.3142934739589691, + "learning_rate": 6.579458831597006e-05, + "loss": 1.7724, + "step": 13547 + }, + { + "epoch": 4.158379373848987, + "grad_norm": 0.23943179845809937, + "learning_rate": 6.578987219443787e-05, + "loss": 1.7515, + "step": 13548 + }, + { + "epoch": 4.158686310620013, + "grad_norm": 0.2838635742664337, + "learning_rate": 6.578515591686089e-05, + "loss": 1.7707, + "step": 13549 + }, + { + "epoch": 4.158993247391037, + "grad_norm": 0.3064457178115845, + "learning_rate": 6.578043948328575e-05, + "loss": 1.7839, + "step": 13550 + }, + { + "epoch": 4.1593001841620625, + "grad_norm": 0.2311718463897705, + "learning_rate": 6.577572289375907e-05, + "loss": 1.8298, + "step": 13551 + }, + { + "epoch": 4.159607120933088, + "grad_norm": 0.35726481676101685, + "learning_rate": 6.577100614832743e-05, + "loss": 1.811, + "step": 13552 + }, + { + "epoch": 4.159914057704113, + "grad_norm": 0.3176140785217285, + "learning_rate": 6.576628924703749e-05, + "loss": 1.732, + "step": 13553 + }, + { + "epoch": 4.1602209944751385, + "grad_norm": 
0.2325647473335266, + "learning_rate": 6.576157218993582e-05, + "loss": 1.827, + "step": 13554 + }, + { + "epoch": 4.160527931246163, + "grad_norm": 0.32260453701019287, + "learning_rate": 6.575685497706905e-05, + "loss": 1.8218, + "step": 13555 + }, + { + "epoch": 4.160834868017188, + "grad_norm": 0.2638537287712097, + "learning_rate": 6.575213760848382e-05, + "loss": 1.7091, + "step": 13556 + }, + { + "epoch": 4.161141804788214, + "grad_norm": 0.2501799762248993, + "learning_rate": 6.574742008422671e-05, + "loss": 1.7707, + "step": 13557 + }, + { + "epoch": 4.161448741559239, + "grad_norm": 0.3212645649909973, + "learning_rate": 6.574270240434439e-05, + "loss": 1.7541, + "step": 13558 + }, + { + "epoch": 4.161755678330264, + "grad_norm": 0.25915586948394775, + "learning_rate": 6.573798456888345e-05, + "loss": 1.7597, + "step": 13559 + }, + { + "epoch": 4.162062615101289, + "grad_norm": 0.2538192868232727, + "learning_rate": 6.573326657789052e-05, + "loss": 1.8507, + "step": 13560 + }, + { + "epoch": 4.162369551872314, + "grad_norm": 0.2542131543159485, + "learning_rate": 6.572854843141223e-05, + "loss": 1.782, + "step": 13561 + }, + { + "epoch": 4.162676488643339, + "grad_norm": 0.26163414120674133, + "learning_rate": 6.572383012949521e-05, + "loss": 1.8482, + "step": 13562 + }, + { + "epoch": 4.162983425414365, + "grad_norm": 0.2566238343715668, + "learning_rate": 6.571911167218608e-05, + "loss": 1.7284, + "step": 13563 + }, + { + "epoch": 4.16329036218539, + "grad_norm": 0.28413113951683044, + "learning_rate": 6.571439305953147e-05, + "loss": 1.7473, + "step": 13564 + }, + { + "epoch": 4.163597298956415, + "grad_norm": 0.20399242639541626, + "learning_rate": 6.570967429157802e-05, + "loss": 1.6942, + "step": 13565 + }, + { + "epoch": 4.16390423572744, + "grad_norm": 0.256104439496994, + "learning_rate": 6.570495536837235e-05, + "loss": 1.7346, + "step": 13566 + }, + { + "epoch": 4.164211172498465, + "grad_norm": 0.350909560918808, + "learning_rate": 
6.570023628996112e-05, + "loss": 1.8284, + "step": 13567 + }, + { + "epoch": 4.1645181092694905, + "grad_norm": 0.23500367999076843, + "learning_rate": 6.569551705639096e-05, + "loss": 1.7504, + "step": 13568 + }, + { + "epoch": 4.164825046040516, + "grad_norm": 0.26683783531188965, + "learning_rate": 6.569079766770849e-05, + "loss": 1.7293, + "step": 13569 + }, + { + "epoch": 4.165131982811541, + "grad_norm": 0.3145855963230133, + "learning_rate": 6.568607812396037e-05, + "loss": 1.8171, + "step": 13570 + }, + { + "epoch": 4.165438919582566, + "grad_norm": 0.2354860156774521, + "learning_rate": 6.568135842519324e-05, + "loss": 1.7555, + "step": 13571 + }, + { + "epoch": 4.165745856353591, + "grad_norm": 0.2893243730068207, + "learning_rate": 6.56766385714537e-05, + "loss": 1.7636, + "step": 13572 + }, + { + "epoch": 4.166052793124616, + "grad_norm": 0.20707663893699646, + "learning_rate": 6.567191856278846e-05, + "loss": 1.7239, + "step": 13573 + }, + { + "epoch": 4.166359729895642, + "grad_norm": 0.34200331568717957, + "learning_rate": 6.566719839924412e-05, + "loss": 1.7848, + "step": 13574 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.23326615989208221, + "learning_rate": 6.566247808086734e-05, + "loss": 1.7447, + "step": 13575 + }, + { + "epoch": 4.166973603437691, + "grad_norm": 0.22375629842281342, + "learning_rate": 6.565775760770479e-05, + "loss": 1.7429, + "step": 13576 + }, + { + "epoch": 4.167280540208717, + "grad_norm": 0.2412862777709961, + "learning_rate": 6.565303697980308e-05, + "loss": 1.7671, + "step": 13577 + }, + { + "epoch": 4.167587476979742, + "grad_norm": 0.2482215315103531, + "learning_rate": 6.56483161972089e-05, + "loss": 1.812, + "step": 13578 + }, + { + "epoch": 4.167894413750767, + "grad_norm": 0.2252974659204483, + "learning_rate": 6.564359525996889e-05, + "loss": 1.8173, + "step": 13579 + }, + { + "epoch": 4.168201350521793, + "grad_norm": 0.23497292399406433, + "learning_rate": 6.563887416812969e-05, + "loss": 1.7945, + 
"step": 13580 + }, + { + "epoch": 4.168508287292818, + "grad_norm": 0.24911245703697205, + "learning_rate": 6.563415292173796e-05, + "loss": 1.7516, + "step": 13581 + }, + { + "epoch": 4.1688152240638425, + "grad_norm": 0.20920930802822113, + "learning_rate": 6.562943152084039e-05, + "loss": 1.765, + "step": 13582 + }, + { + "epoch": 4.169122160834868, + "grad_norm": 0.26001816987991333, + "learning_rate": 6.562470996548361e-05, + "loss": 1.7504, + "step": 13583 + }, + { + "epoch": 4.169429097605893, + "grad_norm": 0.2504529058933258, + "learning_rate": 6.561998825571429e-05, + "loss": 1.7689, + "step": 13584 + }, + { + "epoch": 4.1697360343769185, + "grad_norm": 0.2210187464952469, + "learning_rate": 6.561526639157908e-05, + "loss": 1.752, + "step": 13585 + }, + { + "epoch": 4.170042971147944, + "grad_norm": 0.26323240995407104, + "learning_rate": 6.561054437312467e-05, + "loss": 1.8104, + "step": 13586 + }, + { + "epoch": 4.170349907918968, + "grad_norm": 0.20436744391918182, + "learning_rate": 6.560582220039771e-05, + "loss": 1.7281, + "step": 13587 + }, + { + "epoch": 4.170656844689994, + "grad_norm": 0.2053878903388977, + "learning_rate": 6.560109987344487e-05, + "loss": 1.7192, + "step": 13588 + }, + { + "epoch": 4.170963781461019, + "grad_norm": 0.2416568547487259, + "learning_rate": 6.559637739231281e-05, + "loss": 1.7679, + "step": 13589 + }, + { + "epoch": 4.171270718232044, + "grad_norm": 0.23847989737987518, + "learning_rate": 6.55916547570482e-05, + "loss": 1.7182, + "step": 13590 + }, + { + "epoch": 4.17157765500307, + "grad_norm": 0.2057785540819168, + "learning_rate": 6.558693196769772e-05, + "loss": 1.816, + "step": 13591 + }, + { + "epoch": 4.171884591774095, + "grad_norm": 0.2270805537700653, + "learning_rate": 6.558220902430804e-05, + "loss": 1.7091, + "step": 13592 + }, + { + "epoch": 4.172191528545119, + "grad_norm": 0.22143644094467163, + "learning_rate": 6.557748592692585e-05, + "loss": 1.7446, + "step": 13593 + }, + { + "epoch": 
4.172498465316145, + "grad_norm": 0.2032770961523056, + "learning_rate": 6.557276267559781e-05, + "loss": 1.7501, + "step": 13594 + }, + { + "epoch": 4.17280540208717, + "grad_norm": 0.20851244032382965, + "learning_rate": 6.55680392703706e-05, + "loss": 1.8283, + "step": 13595 + }, + { + "epoch": 4.173112338858195, + "grad_norm": 0.2603934109210968, + "learning_rate": 6.55633157112909e-05, + "loss": 1.8523, + "step": 13596 + }, + { + "epoch": 4.173419275629221, + "grad_norm": 0.2232515811920166, + "learning_rate": 6.55585919984054e-05, + "loss": 1.7803, + "step": 13597 + }, + { + "epoch": 4.173726212400245, + "grad_norm": 0.2541115880012512, + "learning_rate": 6.555386813176075e-05, + "loss": 1.7407, + "step": 13598 + }, + { + "epoch": 4.1740331491712706, + "grad_norm": 0.3044603765010834, + "learning_rate": 6.55491441114037e-05, + "loss": 1.8257, + "step": 13599 + }, + { + "epoch": 4.174340085942296, + "grad_norm": 0.29227301478385925, + "learning_rate": 6.554441993738086e-05, + "loss": 1.7998, + "step": 13600 + }, + { + "epoch": 4.174647022713321, + "grad_norm": 0.25166594982147217, + "learning_rate": 6.553969560973896e-05, + "loss": 1.8258, + "step": 13601 + }, + { + "epoch": 4.1749539594843466, + "grad_norm": 0.22973991930484772, + "learning_rate": 6.55349711285247e-05, + "loss": 1.7871, + "step": 13602 + }, + { + "epoch": 4.175260896255371, + "grad_norm": 0.2615009844303131, + "learning_rate": 6.553024649378473e-05, + "loss": 1.7572, + "step": 13603 + }, + { + "epoch": 4.175567833026396, + "grad_norm": 0.24145473539829254, + "learning_rate": 6.552552170556576e-05, + "loss": 1.7546, + "step": 13604 + }, + { + "epoch": 4.175874769797422, + "grad_norm": 0.21989156305789948, + "learning_rate": 6.55207967639145e-05, + "loss": 1.6939, + "step": 13605 + }, + { + "epoch": 4.176181706568447, + "grad_norm": 0.206025168299675, + "learning_rate": 6.551607166887761e-05, + "loss": 1.7531, + "step": 13606 + }, + { + "epoch": 4.176488643339472, + "grad_norm": 
0.2175903469324112, + "learning_rate": 6.551134642050181e-05, + "loss": 1.7631, + "step": 13607 + }, + { + "epoch": 4.176795580110497, + "grad_norm": 0.23259282112121582, + "learning_rate": 6.550662101883379e-05, + "loss": 1.7773, + "step": 13608 + }, + { + "epoch": 4.177102516881522, + "grad_norm": 0.23955227434635162, + "learning_rate": 6.550189546392025e-05, + "loss": 1.7321, + "step": 13609 + }, + { + "epoch": 4.1774094536525475, + "grad_norm": 0.23614998161792755, + "learning_rate": 6.549716975580792e-05, + "loss": 1.7855, + "step": 13610 + }, + { + "epoch": 4.177716390423573, + "grad_norm": 0.2274426817893982, + "learning_rate": 6.549244389454345e-05, + "loss": 1.7778, + "step": 13611 + }, + { + "epoch": 4.178023327194598, + "grad_norm": 0.2204308807849884, + "learning_rate": 6.548771788017358e-05, + "loss": 1.7175, + "step": 13612 + }, + { + "epoch": 4.1783302639656235, + "grad_norm": 0.2283930778503418, + "learning_rate": 6.548299171274501e-05, + "loss": 1.8081, + "step": 13613 + }, + { + "epoch": 4.178637200736648, + "grad_norm": 0.25433486700057983, + "learning_rate": 6.547826539230442e-05, + "loss": 1.8009, + "step": 13614 + }, + { + "epoch": 4.178944137507673, + "grad_norm": 0.24452579021453857, + "learning_rate": 6.547353891889856e-05, + "loss": 1.7244, + "step": 13615 + }, + { + "epoch": 4.179251074278699, + "grad_norm": 0.20611275732517242, + "learning_rate": 6.546881229257411e-05, + "loss": 1.7566, + "step": 13616 + }, + { + "epoch": 4.179558011049724, + "grad_norm": 0.24557232856750488, + "learning_rate": 6.546408551337779e-05, + "loss": 1.7638, + "step": 13617 + }, + { + "epoch": 4.179864947820749, + "grad_norm": 0.2158801257610321, + "learning_rate": 6.545935858135631e-05, + "loss": 1.7659, + "step": 13618 + }, + { + "epoch": 4.180171884591774, + "grad_norm": 0.23800688982009888, + "learning_rate": 6.54546314965564e-05, + "loss": 1.7468, + "step": 13619 + }, + { + "epoch": 4.180478821362799, + "grad_norm": 0.2504122853279114, + "learning_rate": 
6.544990425902476e-05, + "loss": 1.7682, + "step": 13620 + }, + { + "epoch": 4.180785758133824, + "grad_norm": 0.21556814014911652, + "learning_rate": 6.54451768688081e-05, + "loss": 1.772, + "step": 13621 + }, + { + "epoch": 4.18109269490485, + "grad_norm": 0.23404552042484283, + "learning_rate": 6.544044932595315e-05, + "loss": 1.7844, + "step": 13622 + }, + { + "epoch": 4.181399631675875, + "grad_norm": 0.22129055857658386, + "learning_rate": 6.543572163050664e-05, + "loss": 1.7725, + "step": 13623 + }, + { + "epoch": 4.1817065684469, + "grad_norm": 0.2533521354198456, + "learning_rate": 6.543099378251528e-05, + "loss": 1.7908, + "step": 13624 + }, + { + "epoch": 4.182013505217925, + "grad_norm": 0.2905815541744232, + "learning_rate": 6.542626578202579e-05, + "loss": 1.7913, + "step": 13625 + }, + { + "epoch": 4.18232044198895, + "grad_norm": 0.3330783247947693, + "learning_rate": 6.54215376290849e-05, + "loss": 1.8374, + "step": 13626 + }, + { + "epoch": 4.1826273787599755, + "grad_norm": 0.29268717765808105, + "learning_rate": 6.541680932373933e-05, + "loss": 1.8714, + "step": 13627 + }, + { + "epoch": 4.182934315531001, + "grad_norm": 0.2820781171321869, + "learning_rate": 6.541208086603584e-05, + "loss": 1.8089, + "step": 13628 + }, + { + "epoch": 4.183241252302026, + "grad_norm": 0.3062323033809662, + "learning_rate": 6.54073522560211e-05, + "loss": 1.7307, + "step": 13629 + }, + { + "epoch": 4.183548189073051, + "grad_norm": 0.3010510504245758, + "learning_rate": 6.54026234937419e-05, + "loss": 1.7523, + "step": 13630 + }, + { + "epoch": 4.183855125844076, + "grad_norm": 0.21932095289230347, + "learning_rate": 6.539789457924493e-05, + "loss": 1.737, + "step": 13631 + }, + { + "epoch": 4.184162062615101, + "grad_norm": 0.2710212469100952, + "learning_rate": 6.539316551257695e-05, + "loss": 1.7228, + "step": 13632 + }, + { + "epoch": 4.184468999386127, + "grad_norm": 0.2885816991329193, + "learning_rate": 6.538843629378469e-05, + "loss": 1.8734, + "step": 
13633 + }, + { + "epoch": 4.184775936157152, + "grad_norm": 0.2621026635169983, + "learning_rate": 6.538370692291487e-05, + "loss": 1.7884, + "step": 13634 + }, + { + "epoch": 4.185082872928176, + "grad_norm": 0.30503126978874207, + "learning_rate": 6.537897740001426e-05, + "loss": 1.7833, + "step": 13635 + }, + { + "epoch": 4.185389809699202, + "grad_norm": 0.29491373896598816, + "learning_rate": 6.537424772512955e-05, + "loss": 1.7894, + "step": 13636 + }, + { + "epoch": 4.185696746470227, + "grad_norm": 0.24423296749591827, + "learning_rate": 6.536951789830754e-05, + "loss": 1.7409, + "step": 13637 + }, + { + "epoch": 4.186003683241252, + "grad_norm": 0.2184748351573944, + "learning_rate": 6.536478791959495e-05, + "loss": 1.747, + "step": 13638 + }, + { + "epoch": 4.186310620012278, + "grad_norm": 0.2348455935716629, + "learning_rate": 6.53600577890385e-05, + "loss": 1.7422, + "step": 13639 + }, + { + "epoch": 4.186617556783303, + "grad_norm": 0.2554566264152527, + "learning_rate": 6.535532750668497e-05, + "loss": 1.7623, + "step": 13640 + }, + { + "epoch": 4.1869244935543275, + "grad_norm": 0.26424553990364075, + "learning_rate": 6.535059707258109e-05, + "loss": 1.8408, + "step": 13641 + }, + { + "epoch": 4.187231430325353, + "grad_norm": 0.35363274812698364, + "learning_rate": 6.534586648677361e-05, + "loss": 1.7435, + "step": 13642 + }, + { + "epoch": 4.187538367096378, + "grad_norm": 0.3225265443325043, + "learning_rate": 6.534113574930926e-05, + "loss": 1.7181, + "step": 13643 + }, + { + "epoch": 4.1878453038674035, + "grad_norm": 0.23529650270938873, + "learning_rate": 6.533640486023485e-05, + "loss": 1.7712, + "step": 13644 + }, + { + "epoch": 4.188152240638429, + "grad_norm": 0.3490132987499237, + "learning_rate": 6.53316738195971e-05, + "loss": 1.7329, + "step": 13645 + }, + { + "epoch": 4.188459177409453, + "grad_norm": 0.3759285509586334, + "learning_rate": 6.532694262744274e-05, + "loss": 1.802, + "step": 13646 + }, + { + "epoch": 4.188766114180479, 
+ "grad_norm": 0.27383577823638916, + "learning_rate": 6.532221128381858e-05, + "loss": 1.801, + "step": 13647 + }, + { + "epoch": 4.189073050951504, + "grad_norm": 0.23240652680397034, + "learning_rate": 6.531747978877132e-05, + "loss": 1.8415, + "step": 13648 + }, + { + "epoch": 4.189379987722529, + "grad_norm": 0.3302704989910126, + "learning_rate": 6.531274814234773e-05, + "loss": 1.7765, + "step": 13649 + }, + { + "epoch": 4.189686924493555, + "grad_norm": 0.3209368586540222, + "learning_rate": 6.530801634459463e-05, + "loss": 1.6935, + "step": 13650 + }, + { + "epoch": 4.189993861264579, + "grad_norm": 0.26643648743629456, + "learning_rate": 6.530328439555872e-05, + "loss": 1.8159, + "step": 13651 + }, + { + "epoch": 4.190300798035604, + "grad_norm": 0.22594431042671204, + "learning_rate": 6.529855229528679e-05, + "loss": 1.7764, + "step": 13652 + }, + { + "epoch": 4.19060773480663, + "grad_norm": 0.3288109302520752, + "learning_rate": 6.529382004382561e-05, + "loss": 1.7963, + "step": 13653 + }, + { + "epoch": 4.190914671577655, + "grad_norm": 0.3067106604576111, + "learning_rate": 6.528908764122191e-05, + "loss": 1.7564, + "step": 13654 + }, + { + "epoch": 4.19122160834868, + "grad_norm": 0.23437078297138214, + "learning_rate": 6.528435508752249e-05, + "loss": 1.759, + "step": 13655 + }, + { + "epoch": 4.191528545119706, + "grad_norm": 0.30662333965301514, + "learning_rate": 6.527962238277413e-05, + "loss": 1.7549, + "step": 13656 + }, + { + "epoch": 4.19183548189073, + "grad_norm": 0.3545009195804596, + "learning_rate": 6.527488952702356e-05, + "loss": 1.7761, + "step": 13657 + }, + { + "epoch": 4.1921424186617555, + "grad_norm": 0.2509438991546631, + "learning_rate": 6.52701565203176e-05, + "loss": 1.7162, + "step": 13658 + }, + { + "epoch": 4.192449355432781, + "grad_norm": 0.24423806369304657, + "learning_rate": 6.5265423362703e-05, + "loss": 1.735, + "step": 13659 + }, + { + "epoch": 4.192756292203806, + "grad_norm": 0.37365156412124634, + 
"learning_rate": 6.526069005422654e-05, + "loss": 1.7697, + "step": 13660 + }, + { + "epoch": 4.1930632289748315, + "grad_norm": 0.4025731682777405, + "learning_rate": 6.525595659493499e-05, + "loss": 1.7931, + "step": 13661 + }, + { + "epoch": 4.193370165745856, + "grad_norm": 0.31360915303230286, + "learning_rate": 6.525122298487514e-05, + "loss": 1.8014, + "step": 13662 + }, + { + "epoch": 4.193677102516881, + "grad_norm": 0.2480524778366089, + "learning_rate": 6.524648922409376e-05, + "loss": 1.7753, + "step": 13663 + }, + { + "epoch": 4.193984039287907, + "grad_norm": 0.33740919828414917, + "learning_rate": 6.524175531263765e-05, + "loss": 1.7296, + "step": 13664 + }, + { + "epoch": 4.194290976058932, + "grad_norm": 0.26871639490127563, + "learning_rate": 6.523702125055358e-05, + "loss": 1.7113, + "step": 13665 + }, + { + "epoch": 4.194597912829957, + "grad_norm": 0.2687455415725708, + "learning_rate": 6.52322870378883e-05, + "loss": 1.7645, + "step": 13666 + }, + { + "epoch": 4.194904849600983, + "grad_norm": 0.4207400679588318, + "learning_rate": 6.522755267468868e-05, + "loss": 1.7758, + "step": 13667 + }, + { + "epoch": 4.195211786372007, + "grad_norm": 0.36043494939804077, + "learning_rate": 6.522281816100142e-05, + "loss": 1.7433, + "step": 13668 + }, + { + "epoch": 4.195518723143032, + "grad_norm": 0.2515890598297119, + "learning_rate": 6.52180834968734e-05, + "loss": 1.7646, + "step": 13669 + }, + { + "epoch": 4.195825659914058, + "grad_norm": 0.2871458828449249, + "learning_rate": 6.521334868235132e-05, + "loss": 1.8147, + "step": 13670 + }, + { + "epoch": 4.196132596685083, + "grad_norm": 0.28454354405403137, + "learning_rate": 6.5208613717482e-05, + "loss": 1.8576, + "step": 13671 + }, + { + "epoch": 4.196439533456108, + "grad_norm": 0.2520541548728943, + "learning_rate": 6.520387860231227e-05, + "loss": 1.7513, + "step": 13672 + }, + { + "epoch": 4.196746470227133, + "grad_norm": 0.22782307863235474, + "learning_rate": 6.51991433368889e-05, + 
"loss": 1.7737, + "step": 13673 + }, + { + "epoch": 4.197053406998158, + "grad_norm": 0.2451259195804596, + "learning_rate": 6.519440792125869e-05, + "loss": 1.7483, + "step": 13674 + }, + { + "epoch": 4.1973603437691835, + "grad_norm": 0.21915963292121887, + "learning_rate": 6.518967235546841e-05, + "loss": 1.718, + "step": 13675 + }, + { + "epoch": 4.197667280540209, + "grad_norm": 0.23005805909633636, + "learning_rate": 6.51849366395649e-05, + "loss": 1.7786, + "step": 13676 + }, + { + "epoch": 4.197974217311234, + "grad_norm": 0.25039517879486084, + "learning_rate": 6.518020077359494e-05, + "loss": 1.7785, + "step": 13677 + }, + { + "epoch": 4.198281154082259, + "grad_norm": 0.26631081104278564, + "learning_rate": 6.517546475760535e-05, + "loss": 1.7921, + "step": 13678 + }, + { + "epoch": 4.198588090853284, + "grad_norm": 0.2220793515443802, + "learning_rate": 6.517072859164292e-05, + "loss": 1.7696, + "step": 13679 + }, + { + "epoch": 4.198895027624309, + "grad_norm": 0.24681030213832855, + "learning_rate": 6.516599227575446e-05, + "loss": 1.7702, + "step": 13680 + }, + { + "epoch": 4.199201964395335, + "grad_norm": 0.2421828955411911, + "learning_rate": 6.516125580998678e-05, + "loss": 1.8058, + "step": 13681 + }, + { + "epoch": 4.19950890116636, + "grad_norm": 0.2170087695121765, + "learning_rate": 6.515651919438667e-05, + "loss": 1.7271, + "step": 13682 + }, + { + "epoch": 4.199815837937384, + "grad_norm": 0.23383566737174988, + "learning_rate": 6.515178242900096e-05, + "loss": 1.7515, + "step": 13683 + }, + { + "epoch": 4.20012277470841, + "grad_norm": 0.2522997558116913, + "learning_rate": 6.514704551387645e-05, + "loss": 1.7619, + "step": 13684 + }, + { + "epoch": 4.200429711479435, + "grad_norm": 0.20973703265190125, + "learning_rate": 6.514230844905995e-05, + "loss": 1.7326, + "step": 13685 + }, + { + "epoch": 4.2007366482504604, + "grad_norm": 0.2308073341846466, + "learning_rate": 6.513757123459832e-05, + "loss": 1.811, + "step": 13686 + }, + { + 
"epoch": 4.201043585021486, + "grad_norm": 0.21751229465007782, + "learning_rate": 6.51328338705383e-05, + "loss": 1.7795, + "step": 13687 + }, + { + "epoch": 4.201350521792511, + "grad_norm": 0.2357407957315445, + "learning_rate": 6.512809635692675e-05, + "loss": 1.8069, + "step": 13688 + }, + { + "epoch": 4.201657458563536, + "grad_norm": 0.32245033979415894, + "learning_rate": 6.51233586938105e-05, + "loss": 1.8179, + "step": 13689 + }, + { + "epoch": 4.201964395334561, + "grad_norm": 0.22740167379379272, + "learning_rate": 6.511862088123635e-05, + "loss": 1.7482, + "step": 13690 + }, + { + "epoch": 4.202271332105586, + "grad_norm": 0.26880496740341187, + "learning_rate": 6.511388291925114e-05, + "loss": 1.7919, + "step": 13691 + }, + { + "epoch": 4.202578268876612, + "grad_norm": 0.2261822521686554, + "learning_rate": 6.510914480790166e-05, + "loss": 1.7543, + "step": 13692 + }, + { + "epoch": 4.202885205647637, + "grad_norm": 0.2635782063007355, + "learning_rate": 6.510440654723477e-05, + "loss": 1.7874, + "step": 13693 + }, + { + "epoch": 4.203192142418661, + "grad_norm": 0.2505982518196106, + "learning_rate": 6.509966813729726e-05, + "loss": 1.8016, + "step": 13694 + }, + { + "epoch": 4.203499079189687, + "grad_norm": 0.23177236318588257, + "learning_rate": 6.5094929578136e-05, + "loss": 1.7582, + "step": 13695 + }, + { + "epoch": 4.203806015960712, + "grad_norm": 0.2315056324005127, + "learning_rate": 6.509019086979779e-05, + "loss": 1.7418, + "step": 13696 + }, + { + "epoch": 4.204112952731737, + "grad_norm": 0.25565484166145325, + "learning_rate": 6.508545201232947e-05, + "loss": 1.7476, + "step": 13697 + }, + { + "epoch": 4.204419889502763, + "grad_norm": 0.29210081696510315, + "learning_rate": 6.508071300577787e-05, + "loss": 1.8397, + "step": 13698 + }, + { + "epoch": 4.204726826273788, + "grad_norm": 0.2830582559108734, + "learning_rate": 6.507597385018984e-05, + "loss": 1.834, + "step": 13699 + }, + { + "epoch": 4.2050337630448125, + "grad_norm": 
0.23013398051261902, + "learning_rate": 6.507123454561217e-05, + "loss": 1.7593, + "step": 13700 + }, + { + "epoch": 4.205340699815838, + "grad_norm": 0.21970276534557343, + "learning_rate": 6.506649509209174e-05, + "loss": 1.754, + "step": 13701 + }, + { + "epoch": 4.205647636586863, + "grad_norm": 0.32052233815193176, + "learning_rate": 6.50617554896754e-05, + "loss": 1.7531, + "step": 13702 + }, + { + "epoch": 4.2059545733578885, + "grad_norm": 0.2597332000732422, + "learning_rate": 6.505701573840995e-05, + "loss": 1.7836, + "step": 13703 + }, + { + "epoch": 4.206261510128914, + "grad_norm": 0.22070355713367462, + "learning_rate": 6.505227583834224e-05, + "loss": 1.7225, + "step": 13704 + }, + { + "epoch": 4.206568446899938, + "grad_norm": 0.27219358086586, + "learning_rate": 6.50475357895191e-05, + "loss": 1.8215, + "step": 13705 + }, + { + "epoch": 4.206875383670964, + "grad_norm": 0.32541659474372864, + "learning_rate": 6.504279559198741e-05, + "loss": 1.7786, + "step": 13706 + }, + { + "epoch": 4.207182320441989, + "grad_norm": 0.25871729850769043, + "learning_rate": 6.5038055245794e-05, + "loss": 1.7621, + "step": 13707 + }, + { + "epoch": 4.207489257213014, + "grad_norm": 0.2190464735031128, + "learning_rate": 6.50333147509857e-05, + "loss": 1.7612, + "step": 13708 + }, + { + "epoch": 4.20779619398404, + "grad_norm": 0.19565832614898682, + "learning_rate": 6.50285741076094e-05, + "loss": 1.7581, + "step": 13709 + }, + { + "epoch": 4.208103130755064, + "grad_norm": 0.1889251321554184, + "learning_rate": 6.50238333157119e-05, + "loss": 1.7611, + "step": 13710 + }, + { + "epoch": 4.208410067526089, + "grad_norm": 0.2013053596019745, + "learning_rate": 6.501909237534008e-05, + "loss": 1.7393, + "step": 13711 + }, + { + "epoch": 4.208717004297115, + "grad_norm": 0.1899433434009552, + "learning_rate": 6.501435128654077e-05, + "loss": 1.7122, + "step": 13712 + }, + { + "epoch": 4.20902394106814, + "grad_norm": 0.19337882101535797, + "learning_rate": 
6.500961004936085e-05, + "loss": 1.7538, + "step": 13713 + }, + { + "epoch": 4.209330877839165, + "grad_norm": 0.20419920980930328, + "learning_rate": 6.500486866384718e-05, + "loss": 1.728, + "step": 13714 + }, + { + "epoch": 4.209637814610191, + "grad_norm": 0.20615679025650024, + "learning_rate": 6.50001271300466e-05, + "loss": 1.7843, + "step": 13715 + }, + { + "epoch": 4.209944751381215, + "grad_norm": 0.22178977727890015, + "learning_rate": 6.499538544800596e-05, + "loss": 1.7751, + "step": 13716 + }, + { + "epoch": 4.2102516881522405, + "grad_norm": 0.23703891038894653, + "learning_rate": 6.499064361777214e-05, + "loss": 1.7304, + "step": 13717 + }, + { + "epoch": 4.210558624923266, + "grad_norm": 0.2785723805427551, + "learning_rate": 6.498590163939198e-05, + "loss": 1.802, + "step": 13718 + }, + { + "epoch": 4.210865561694291, + "grad_norm": 0.23277060687541962, + "learning_rate": 6.498115951291237e-05, + "loss": 1.7316, + "step": 13719 + }, + { + "epoch": 4.2111724984653165, + "grad_norm": 0.22289474308490753, + "learning_rate": 6.497641723838017e-05, + "loss": 1.8469, + "step": 13720 + }, + { + "epoch": 4.211479435236341, + "grad_norm": 0.2715846002101898, + "learning_rate": 6.497167481584221e-05, + "loss": 1.7919, + "step": 13721 + }, + { + "epoch": 4.211786372007366, + "grad_norm": 0.29262226819992065, + "learning_rate": 6.49669322453454e-05, + "loss": 1.8379, + "step": 13722 + }, + { + "epoch": 4.212093308778392, + "grad_norm": 0.29136186838150024, + "learning_rate": 6.49621895269366e-05, + "loss": 1.789, + "step": 13723 + }, + { + "epoch": 4.212400245549417, + "grad_norm": 0.25110194087028503, + "learning_rate": 6.495744666066266e-05, + "loss": 1.7574, + "step": 13724 + }, + { + "epoch": 4.212707182320442, + "grad_norm": 0.2301366776227951, + "learning_rate": 6.495270364657048e-05, + "loss": 1.7637, + "step": 13725 + }, + { + "epoch": 4.213014119091467, + "grad_norm": 0.2556478977203369, + "learning_rate": 6.49479604847069e-05, + "loss": 1.7975, + 
"step": 13726 + }, + { + "epoch": 4.213321055862492, + "grad_norm": 0.2645667493343353, + "learning_rate": 6.494321717511884e-05, + "loss": 1.7594, + "step": 13727 + }, + { + "epoch": 4.213627992633517, + "grad_norm": 0.23664188385009766, + "learning_rate": 6.493847371785312e-05, + "loss": 1.7963, + "step": 13728 + }, + { + "epoch": 4.213934929404543, + "grad_norm": 0.2947930693626404, + "learning_rate": 6.493373011295665e-05, + "loss": 1.7477, + "step": 13729 + }, + { + "epoch": 4.214241866175568, + "grad_norm": 0.34598737955093384, + "learning_rate": 6.492898636047631e-05, + "loss": 1.7014, + "step": 13730 + }, + { + "epoch": 4.214548802946593, + "grad_norm": 0.24406935274600983, + "learning_rate": 6.4924242460459e-05, + "loss": 1.7436, + "step": 13731 + }, + { + "epoch": 4.214855739717618, + "grad_norm": 0.27176225185394287, + "learning_rate": 6.491949841295156e-05, + "loss": 1.8429, + "step": 13732 + }, + { + "epoch": 4.215162676488643, + "grad_norm": 0.2506968080997467, + "learning_rate": 6.491475421800089e-05, + "loss": 1.7519, + "step": 13733 + }, + { + "epoch": 4.2154696132596685, + "grad_norm": 0.2240980863571167, + "learning_rate": 6.491000987565387e-05, + "loss": 1.7595, + "step": 13734 + }, + { + "epoch": 4.215776550030694, + "grad_norm": 0.23201732337474823, + "learning_rate": 6.490526538595741e-05, + "loss": 1.7466, + "step": 13735 + }, + { + "epoch": 4.216083486801719, + "grad_norm": 0.24624750018119812, + "learning_rate": 6.490052074895836e-05, + "loss": 1.7364, + "step": 13736 + }, + { + "epoch": 4.216390423572744, + "grad_norm": 0.22936980426311493, + "learning_rate": 6.489577596470366e-05, + "loss": 1.7095, + "step": 13737 + }, + { + "epoch": 4.216697360343769, + "grad_norm": 0.2106638103723526, + "learning_rate": 6.489103103324016e-05, + "loss": 1.7387, + "step": 13738 + }, + { + "epoch": 4.217004297114794, + "grad_norm": 0.2936140298843384, + "learning_rate": 6.488628595461477e-05, + "loss": 1.9129, + "step": 13739 + }, + { + "epoch": 
4.21731123388582, + "grad_norm": 0.21871696412563324, + "learning_rate": 6.488154072887435e-05, + "loss": 1.7489, + "step": 13740 + }, + { + "epoch": 4.217618170656845, + "grad_norm": 0.25941070914268494, + "learning_rate": 6.487679535606583e-05, + "loss": 1.7788, + "step": 13741 + }, + { + "epoch": 4.21792510742787, + "grad_norm": 0.2540862560272217, + "learning_rate": 6.487204983623612e-05, + "loss": 1.8074, + "step": 13742 + }, + { + "epoch": 4.218232044198895, + "grad_norm": 0.25180327892303467, + "learning_rate": 6.486730416943207e-05, + "loss": 1.7503, + "step": 13743 + }, + { + "epoch": 4.21853898096992, + "grad_norm": 0.26625585556030273, + "learning_rate": 6.486255835570063e-05, + "loss": 1.8149, + "step": 13744 + }, + { + "epoch": 4.218845917740945, + "grad_norm": 0.3023914396762848, + "learning_rate": 6.485781239508867e-05, + "loss": 1.8599, + "step": 13745 + }, + { + "epoch": 4.219152854511971, + "grad_norm": 0.2683780789375305, + "learning_rate": 6.48530662876431e-05, + "loss": 1.7911, + "step": 13746 + }, + { + "epoch": 4.219459791282996, + "grad_norm": 0.20747442543506622, + "learning_rate": 6.484832003341081e-05, + "loss": 1.7343, + "step": 13747 + }, + { + "epoch": 4.2197667280540205, + "grad_norm": 0.29284465312957764, + "learning_rate": 6.484357363243873e-05, + "loss": 1.7917, + "step": 13748 + }, + { + "epoch": 4.220073664825046, + "grad_norm": 0.24303840100765228, + "learning_rate": 6.483882708477376e-05, + "loss": 1.7921, + "step": 13749 + }, + { + "epoch": 4.220380601596071, + "grad_norm": 0.26253026723861694, + "learning_rate": 6.48340803904628e-05, + "loss": 1.7971, + "step": 13750 + }, + { + "epoch": 4.2206875383670965, + "grad_norm": 0.23888511955738068, + "learning_rate": 6.482933354955275e-05, + "loss": 1.7967, + "step": 13751 + }, + { + "epoch": 4.220994475138122, + "grad_norm": 0.24966883659362793, + "learning_rate": 6.482458656209054e-05, + "loss": 1.7924, + "step": 13752 + }, + { + "epoch": 4.221301411909146, + "grad_norm": 
0.26556864380836487, + "learning_rate": 6.481983942812309e-05, + "loss": 1.8608, + "step": 13753 + }, + { + "epoch": 4.221608348680172, + "grad_norm": 0.29064711928367615, + "learning_rate": 6.48150921476973e-05, + "loss": 1.7785, + "step": 13754 + }, + { + "epoch": 4.221915285451197, + "grad_norm": 0.30876123905181885, + "learning_rate": 6.481034472086008e-05, + "loss": 1.8287, + "step": 13755 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.2622467875480652, + "learning_rate": 6.480559714765835e-05, + "loss": 1.8336, + "step": 13756 + }, + { + "epoch": 4.222529158993248, + "grad_norm": 0.2502644956111908, + "learning_rate": 6.480084942813902e-05, + "loss": 1.7803, + "step": 13757 + }, + { + "epoch": 4.222836095764273, + "grad_norm": 0.2879922688007355, + "learning_rate": 6.479610156234903e-05, + "loss": 1.7544, + "step": 13758 + }, + { + "epoch": 4.223143032535297, + "grad_norm": 0.2831384241580963, + "learning_rate": 6.47913535503353e-05, + "loss": 1.887, + "step": 13759 + }, + { + "epoch": 4.223449969306323, + "grad_norm": 0.3221064805984497, + "learning_rate": 6.478660539214474e-05, + "loss": 1.7455, + "step": 13760 + }, + { + "epoch": 4.223756906077348, + "grad_norm": 0.4231930673122406, + "learning_rate": 6.478185708782427e-05, + "loss": 1.8209, + "step": 13761 + }, + { + "epoch": 4.224063842848373, + "grad_norm": 0.34327802062034607, + "learning_rate": 6.477710863742083e-05, + "loss": 1.7754, + "step": 13762 + }, + { + "epoch": 4.224370779619399, + "grad_norm": 0.21713349223136902, + "learning_rate": 6.477236004098135e-05, + "loss": 1.7576, + "step": 13763 + }, + { + "epoch": 4.224677716390423, + "grad_norm": 0.3262602388858795, + "learning_rate": 6.476761129855275e-05, + "loss": 1.7772, + "step": 13764 + }, + { + "epoch": 4.2249846531614486, + "grad_norm": 0.3231413662433624, + "learning_rate": 6.476286241018195e-05, + "loss": 1.7821, + "step": 13765 + }, + { + "epoch": 4.225291589932474, + "grad_norm": 0.2440098226070404, + "learning_rate": 
6.475811337591588e-05, + "loss": 1.7684, + "step": 13766 + }, + { + "epoch": 4.225598526703499, + "grad_norm": 0.329949289560318, + "learning_rate": 6.475336419580151e-05, + "loss": 1.8564, + "step": 13767 + }, + { + "epoch": 4.225905463474525, + "grad_norm": 0.3567483425140381, + "learning_rate": 6.474861486988574e-05, + "loss": 1.7625, + "step": 13768 + }, + { + "epoch": 4.226212400245549, + "grad_norm": 0.25257283449172974, + "learning_rate": 6.47438653982155e-05, + "loss": 1.823, + "step": 13769 + }, + { + "epoch": 4.226519337016574, + "grad_norm": 0.31542617082595825, + "learning_rate": 6.473911578083776e-05, + "loss": 1.7817, + "step": 13770 + }, + { + "epoch": 4.2268262737876, + "grad_norm": 0.29670149087905884, + "learning_rate": 6.473436601779944e-05, + "loss": 1.7493, + "step": 13771 + }, + { + "epoch": 4.227133210558625, + "grad_norm": 0.2635453939437866, + "learning_rate": 6.472961610914745e-05, + "loss": 1.792, + "step": 13772 + }, + { + "epoch": 4.22744014732965, + "grad_norm": 0.25017979741096497, + "learning_rate": 6.472486605492878e-05, + "loss": 1.7183, + "step": 13773 + }, + { + "epoch": 4.227747084100676, + "grad_norm": 0.3766646087169647, + "learning_rate": 6.472011585519034e-05, + "loss": 1.8039, + "step": 13774 + }, + { + "epoch": 4.2280540208717, + "grad_norm": 0.29860204458236694, + "learning_rate": 6.47153655099791e-05, + "loss": 1.8016, + "step": 13775 + }, + { + "epoch": 4.2283609576427255, + "grad_norm": 0.2540898323059082, + "learning_rate": 6.4710615019342e-05, + "loss": 1.8481, + "step": 13776 + }, + { + "epoch": 4.228667894413751, + "grad_norm": 0.3677786886692047, + "learning_rate": 6.470586438332597e-05, + "loss": 1.7663, + "step": 13777 + }, + { + "epoch": 4.228974831184776, + "grad_norm": 0.35693466663360596, + "learning_rate": 6.470111360197797e-05, + "loss": 1.7733, + "step": 13778 + }, + { + "epoch": 4.2292817679558015, + "grad_norm": 0.23747926950454712, + "learning_rate": 6.469636267534496e-05, + "loss": 1.7938, + "step": 
13779 + }, + { + "epoch": 4.229588704726826, + "grad_norm": 0.32890695333480835, + "learning_rate": 6.469161160347386e-05, + "loss": 1.7233, + "step": 13780 + }, + { + "epoch": 4.229895641497851, + "grad_norm": 0.3437706530094147, + "learning_rate": 6.468686038641164e-05, + "loss": 1.7716, + "step": 13781 + }, + { + "epoch": 4.230202578268877, + "grad_norm": 0.23452162742614746, + "learning_rate": 6.468210902420527e-05, + "loss": 1.764, + "step": 13782 + }, + { + "epoch": 4.230509515039902, + "grad_norm": 0.3205265402793884, + "learning_rate": 6.46773575169017e-05, + "loss": 1.7464, + "step": 13783 + }, + { + "epoch": 4.230816451810927, + "grad_norm": 0.4234732985496521, + "learning_rate": 6.467260586454787e-05, + "loss": 1.7786, + "step": 13784 + }, + { + "epoch": 4.231123388581952, + "grad_norm": 0.2484128773212433, + "learning_rate": 6.466785406719076e-05, + "loss": 1.8125, + "step": 13785 + }, + { + "epoch": 4.231430325352977, + "grad_norm": 0.3696556091308594, + "learning_rate": 6.46631021248773e-05, + "loss": 1.7974, + "step": 13786 + }, + { + "epoch": 4.231737262124002, + "grad_norm": 0.4251437485218048, + "learning_rate": 6.465835003765449e-05, + "loss": 1.7486, + "step": 13787 + }, + { + "epoch": 4.232044198895028, + "grad_norm": 0.2507621943950653, + "learning_rate": 6.465359780556927e-05, + "loss": 1.829, + "step": 13788 + }, + { + "epoch": 4.232351135666053, + "grad_norm": 0.2911818325519562, + "learning_rate": 6.464884542866861e-05, + "loss": 1.7401, + "step": 13789 + }, + { + "epoch": 4.232658072437078, + "grad_norm": 0.35354506969451904, + "learning_rate": 6.464409290699946e-05, + "loss": 1.7848, + "step": 13790 + }, + { + "epoch": 4.232965009208103, + "grad_norm": 0.2659081518650055, + "learning_rate": 6.46393402406088e-05, + "loss": 1.7408, + "step": 13791 + }, + { + "epoch": 4.233271945979128, + "grad_norm": 0.22676481306552887, + "learning_rate": 6.46345874295436e-05, + "loss": 1.7542, + "step": 13792 + }, + { + "epoch": 4.2335788827501535, + 
"grad_norm": 0.2549789845943451, + "learning_rate": 6.462983447385085e-05, + "loss": 1.8095, + "step": 13793 + }, + { + "epoch": 4.233885819521179, + "grad_norm": 0.2157238870859146, + "learning_rate": 6.462508137357748e-05, + "loss": 1.7529, + "step": 13794 + }, + { + "epoch": 4.234192756292204, + "grad_norm": 0.2494724988937378, + "learning_rate": 6.46203281287705e-05, + "loss": 1.7839, + "step": 13795 + }, + { + "epoch": 4.234499693063229, + "grad_norm": 0.29560065269470215, + "learning_rate": 6.461557473947685e-05, + "loss": 1.7239, + "step": 13796 + }, + { + "epoch": 4.234806629834254, + "grad_norm": 0.23693916201591492, + "learning_rate": 6.461082120574354e-05, + "loss": 1.8074, + "step": 13797 + }, + { + "epoch": 4.235113566605279, + "grad_norm": 0.2538869082927704, + "learning_rate": 6.460606752761752e-05, + "loss": 1.8319, + "step": 13798 + }, + { + "epoch": 4.235420503376305, + "grad_norm": 0.3186401426792145, + "learning_rate": 6.460131370514578e-05, + "loss": 1.7877, + "step": 13799 + }, + { + "epoch": 4.23572744014733, + "grad_norm": 0.2473619133234024, + "learning_rate": 6.45965597383753e-05, + "loss": 1.8323, + "step": 13800 + }, + { + "epoch": 4.236034376918354, + "grad_norm": 0.32806503772735596, + "learning_rate": 6.459180562735307e-05, + "loss": 1.744, + "step": 13801 + }, + { + "epoch": 4.23634131368938, + "grad_norm": 0.3975784480571747, + "learning_rate": 6.458705137212606e-05, + "loss": 1.7216, + "step": 13802 + }, + { + "epoch": 4.236648250460405, + "grad_norm": 0.2946135997772217, + "learning_rate": 6.458229697274125e-05, + "loss": 1.8781, + "step": 13803 + }, + { + "epoch": 4.23695518723143, + "grad_norm": 0.25109192728996277, + "learning_rate": 6.457754242924565e-05, + "loss": 1.7458, + "step": 13804 + }, + { + "epoch": 4.237262124002456, + "grad_norm": 0.2763883173465729, + "learning_rate": 6.457278774168623e-05, + "loss": 1.7612, + "step": 13805 + }, + { + "epoch": 4.237569060773481, + "grad_norm": 0.22427856922149658, + 
"learning_rate": 6.456803291010996e-05, + "loss": 1.8049, + "step": 13806 + }, + { + "epoch": 4.2378759975445055, + "grad_norm": 0.28295788168907166, + "learning_rate": 6.456327793456387e-05, + "loss": 1.7608, + "step": 13807 + }, + { + "epoch": 4.238182934315531, + "grad_norm": 0.27857527136802673, + "learning_rate": 6.455852281509493e-05, + "loss": 1.7281, + "step": 13808 + }, + { + "epoch": 4.238489871086556, + "grad_norm": 0.24014849960803986, + "learning_rate": 6.455376755175012e-05, + "loss": 1.7247, + "step": 13809 + }, + { + "epoch": 4.2387968078575815, + "grad_norm": 0.25149038434028625, + "learning_rate": 6.454901214457646e-05, + "loss": 1.8575, + "step": 13810 + }, + { + "epoch": 4.239103744628607, + "grad_norm": 0.32072681188583374, + "learning_rate": 6.454425659362093e-05, + "loss": 1.7421, + "step": 13811 + }, + { + "epoch": 4.239410681399631, + "grad_norm": 0.28418242931365967, + "learning_rate": 6.453950089893054e-05, + "loss": 1.7031, + "step": 13812 + }, + { + "epoch": 4.239717618170657, + "grad_norm": 0.23725132644176483, + "learning_rate": 6.453474506055228e-05, + "loss": 1.7901, + "step": 13813 + }, + { + "epoch": 4.240024554941682, + "grad_norm": 0.3056317865848541, + "learning_rate": 6.452998907853315e-05, + "loss": 1.7414, + "step": 13814 + }, + { + "epoch": 4.240331491712707, + "grad_norm": 0.3111891448497772, + "learning_rate": 6.452523295292013e-05, + "loss": 1.7532, + "step": 13815 + }, + { + "epoch": 4.240638428483733, + "grad_norm": 0.2126779705286026, + "learning_rate": 6.452047668376027e-05, + "loss": 1.6779, + "step": 13816 + }, + { + "epoch": 4.240945365254758, + "grad_norm": 0.26660779118537903, + "learning_rate": 6.451572027110054e-05, + "loss": 1.7162, + "step": 13817 + }, + { + "epoch": 4.241252302025782, + "grad_norm": 0.25901922583580017, + "learning_rate": 6.451096371498794e-05, + "loss": 1.7784, + "step": 13818 + }, + { + "epoch": 4.241559238796808, + "grad_norm": 0.24091807007789612, + "learning_rate": 
6.450620701546953e-05, + "loss": 1.7928, + "step": 13819 + }, + { + "epoch": 4.241866175567833, + "grad_norm": 0.25097009539604187, + "learning_rate": 6.450145017259225e-05, + "loss": 1.761, + "step": 13820 + }, + { + "epoch": 4.242173112338858, + "grad_norm": 0.22978942096233368, + "learning_rate": 6.449669318640315e-05, + "loss": 1.7891, + "step": 13821 + }, + { + "epoch": 4.242480049109884, + "grad_norm": 0.27255937457084656, + "learning_rate": 6.449193605694923e-05, + "loss": 1.7964, + "step": 13822 + }, + { + "epoch": 4.242786985880908, + "grad_norm": 0.2210773378610611, + "learning_rate": 6.44871787842775e-05, + "loss": 1.7628, + "step": 13823 + }, + { + "epoch": 4.2430939226519335, + "grad_norm": 0.25784751772880554, + "learning_rate": 6.448242136843497e-05, + "loss": 1.7596, + "step": 13824 + }, + { + "epoch": 4.243400859422959, + "grad_norm": 0.23475486040115356, + "learning_rate": 6.447766380946868e-05, + "loss": 1.8174, + "step": 13825 + }, + { + "epoch": 4.243707796193984, + "grad_norm": 0.2567705512046814, + "learning_rate": 6.447290610742561e-05, + "loss": 1.737, + "step": 13826 + }, + { + "epoch": 4.2440147329650095, + "grad_norm": 0.23973144590854645, + "learning_rate": 6.446814826235281e-05, + "loss": 1.7881, + "step": 13827 + }, + { + "epoch": 4.244321669736034, + "grad_norm": 0.25584739446640015, + "learning_rate": 6.446339027429729e-05, + "loss": 1.7673, + "step": 13828 + }, + { + "epoch": 4.244628606507059, + "grad_norm": 0.2653748393058777, + "learning_rate": 6.445863214330608e-05, + "loss": 1.7443, + "step": 13829 + }, + { + "epoch": 4.244935543278085, + "grad_norm": 0.2492038607597351, + "learning_rate": 6.445387386942619e-05, + "loss": 1.7223, + "step": 13830 + }, + { + "epoch": 4.24524248004911, + "grad_norm": 0.2282228320837021, + "learning_rate": 6.444911545270464e-05, + "loss": 1.7577, + "step": 13831 + }, + { + "epoch": 4.245549416820135, + "grad_norm": 0.2411092072725296, + "learning_rate": 6.444435689318845e-05, + "loss": 1.7324, + 
"step": 13832 + }, + { + "epoch": 4.245856353591161, + "grad_norm": 0.21557089686393738, + "learning_rate": 6.443959819092468e-05, + "loss": 1.7355, + "step": 13833 + }, + { + "epoch": 4.246163290362185, + "grad_norm": 0.2500394880771637, + "learning_rate": 6.443483934596033e-05, + "loss": 1.775, + "step": 13834 + }, + { + "epoch": 4.24647022713321, + "grad_norm": 0.24135248363018036, + "learning_rate": 6.443008035834244e-05, + "loss": 1.7885, + "step": 13835 + }, + { + "epoch": 4.246777163904236, + "grad_norm": 0.22860904037952423, + "learning_rate": 6.442532122811803e-05, + "loss": 1.7891, + "step": 13836 + }, + { + "epoch": 4.247084100675261, + "grad_norm": 0.2277665138244629, + "learning_rate": 6.442056195533415e-05, + "loss": 1.7583, + "step": 13837 + }, + { + "epoch": 4.247391037446286, + "grad_norm": 0.22822454571723938, + "learning_rate": 6.441580254003782e-05, + "loss": 1.7777, + "step": 13838 + }, + { + "epoch": 4.247697974217311, + "grad_norm": 0.24274896085262299, + "learning_rate": 6.441104298227608e-05, + "loss": 1.7537, + "step": 13839 + }, + { + "epoch": 4.248004910988336, + "grad_norm": 0.25080999732017517, + "learning_rate": 6.440628328209598e-05, + "loss": 1.7537, + "step": 13840 + }, + { + "epoch": 4.2483118477593615, + "grad_norm": 0.22409579157829285, + "learning_rate": 6.440152343954453e-05, + "loss": 1.7652, + "step": 13841 + }, + { + "epoch": 4.248618784530387, + "grad_norm": 0.24028798937797546, + "learning_rate": 6.439676345466877e-05, + "loss": 1.7512, + "step": 13842 + }, + { + "epoch": 4.248925721301412, + "grad_norm": 0.28739503026008606, + "learning_rate": 6.439200332751576e-05, + "loss": 1.8034, + "step": 13843 + }, + { + "epoch": 4.249232658072437, + "grad_norm": 0.2244807928800583, + "learning_rate": 6.438724305813255e-05, + "loss": 1.7243, + "step": 13844 + }, + { + "epoch": 4.249539594843462, + "grad_norm": 0.24478118121623993, + "learning_rate": 6.438248264656618e-05, + "loss": 1.7754, + "step": 13845 + }, + { + "epoch": 
4.249846531614487, + "grad_norm": 0.25554370880126953, + "learning_rate": 6.437772209286368e-05, + "loss": 1.7845, + "step": 13846 + }, + { + "epoch": 4.250153468385513, + "grad_norm": 0.24478472769260406, + "learning_rate": 6.43729613970721e-05, + "loss": 1.7954, + "step": 13847 + }, + { + "epoch": 4.250460405156538, + "grad_norm": 0.22287282347679138, + "learning_rate": 6.436820055923849e-05, + "loss": 1.7379, + "step": 13848 + }, + { + "epoch": 4.250767341927563, + "grad_norm": 0.2810569703578949, + "learning_rate": 6.43634395794099e-05, + "loss": 1.8492, + "step": 13849 + }, + { + "epoch": 4.251074278698588, + "grad_norm": 0.2544163465499878, + "learning_rate": 6.435867845763337e-05, + "loss": 1.7846, + "step": 13850 + }, + { + "epoch": 4.251381215469613, + "grad_norm": 0.27879175543785095, + "learning_rate": 6.435391719395598e-05, + "loss": 1.767, + "step": 13851 + }, + { + "epoch": 4.2516881522406385, + "grad_norm": 0.2876715362071991, + "learning_rate": 6.434915578842477e-05, + "loss": 1.8048, + "step": 13852 + }, + { + "epoch": 4.251995089011664, + "grad_norm": 0.27844297885894775, + "learning_rate": 6.434439424108678e-05, + "loss": 1.7472, + "step": 13853 + }, + { + "epoch": 4.252302025782689, + "grad_norm": 0.2417020946741104, + "learning_rate": 6.43396325519891e-05, + "loss": 1.8481, + "step": 13854 + }, + { + "epoch": 4.252608962553714, + "grad_norm": 0.23828522861003876, + "learning_rate": 6.433487072117874e-05, + "loss": 1.7536, + "step": 13855 + }, + { + "epoch": 4.252915899324739, + "grad_norm": 0.22304333746433258, + "learning_rate": 6.43301087487028e-05, + "loss": 1.741, + "step": 13856 + }, + { + "epoch": 4.253222836095764, + "grad_norm": 0.27089163661003113, + "learning_rate": 6.432534663460832e-05, + "loss": 1.7974, + "step": 13857 + }, + { + "epoch": 4.25352977286679, + "grad_norm": 0.2439592182636261, + "learning_rate": 6.432058437894237e-05, + "loss": 1.7713, + "step": 13858 + }, + { + "epoch": 4.253836709637815, + "grad_norm": 
0.2368553727865219, + "learning_rate": 6.431582198175203e-05, + "loss": 1.6915, + "step": 13859 + }, + { + "epoch": 4.25414364640884, + "grad_norm": 0.25248441100120544, + "learning_rate": 6.431105944308431e-05, + "loss": 1.7286, + "step": 13860 + }, + { + "epoch": 4.254450583179865, + "grad_norm": 0.20928484201431274, + "learning_rate": 6.430629676298634e-05, + "loss": 1.79, + "step": 13861 + }, + { + "epoch": 4.25475751995089, + "grad_norm": 0.25262540578842163, + "learning_rate": 6.430153394150514e-05, + "loss": 1.7443, + "step": 13862 + }, + { + "epoch": 4.255064456721915, + "grad_norm": 0.27508237957954407, + "learning_rate": 6.429677097868783e-05, + "loss": 1.8207, + "step": 13863 + }, + { + "epoch": 4.255371393492941, + "grad_norm": 0.28129303455352783, + "learning_rate": 6.429200787458141e-05, + "loss": 1.7589, + "step": 13864 + }, + { + "epoch": 4.255678330263966, + "grad_norm": 0.3205658495426178, + "learning_rate": 6.428724462923302e-05, + "loss": 1.8037, + "step": 13865 + }, + { + "epoch": 4.2559852670349905, + "grad_norm": 0.24048078060150146, + "learning_rate": 6.428248124268969e-05, + "loss": 1.7303, + "step": 13866 + }, + { + "epoch": 4.256292203806016, + "grad_norm": 0.24742475152015686, + "learning_rate": 6.427771771499852e-05, + "loss": 1.7753, + "step": 13867 + }, + { + "epoch": 4.256599140577041, + "grad_norm": 0.3082354962825775, + "learning_rate": 6.427295404620656e-05, + "loss": 1.7275, + "step": 13868 + }, + { + "epoch": 4.2569060773480665, + "grad_norm": 0.23319822549819946, + "learning_rate": 6.426819023636093e-05, + "loss": 1.7562, + "step": 13869 + }, + { + "epoch": 4.257213014119092, + "grad_norm": 0.2611405551433563, + "learning_rate": 6.426342628550866e-05, + "loss": 1.7417, + "step": 13870 + }, + { + "epoch": 4.257519950890116, + "grad_norm": 0.2577543258666992, + "learning_rate": 6.425866219369686e-05, + "loss": 1.6906, + "step": 13871 + }, + { + "epoch": 4.257826887661142, + "grad_norm": 0.31353357434272766, + "learning_rate": 
6.42538979609726e-05, + "loss": 1.7155, + "step": 13872 + }, + { + "epoch": 4.258133824432167, + "grad_norm": 0.23280073702335358, + "learning_rate": 6.424913358738296e-05, + "loss": 1.7576, + "step": 13873 + }, + { + "epoch": 4.258440761203192, + "grad_norm": 0.24087542295455933, + "learning_rate": 6.424436907297504e-05, + "loss": 1.7622, + "step": 13874 + }, + { + "epoch": 4.258747697974218, + "grad_norm": 0.3146509826183319, + "learning_rate": 6.42396044177959e-05, + "loss": 1.769, + "step": 13875 + }, + { + "epoch": 4.259054634745242, + "grad_norm": 0.2645811438560486, + "learning_rate": 6.423483962189268e-05, + "loss": 1.7713, + "step": 13876 + }, + { + "epoch": 4.259361571516267, + "grad_norm": 0.2166455090045929, + "learning_rate": 6.423007468531238e-05, + "loss": 1.7705, + "step": 13877 + }, + { + "epoch": 4.259668508287293, + "grad_norm": 0.29142528772354126, + "learning_rate": 6.422530960810217e-05, + "loss": 1.7725, + "step": 13878 + }, + { + "epoch": 4.259975445058318, + "grad_norm": 0.28777652978897095, + "learning_rate": 6.422054439030911e-05, + "loss": 1.7853, + "step": 13879 + }, + { + "epoch": 4.260282381829343, + "grad_norm": 0.2285117357969284, + "learning_rate": 6.42157790319803e-05, + "loss": 1.7034, + "step": 13880 + }, + { + "epoch": 4.260589318600369, + "grad_norm": 0.32407644391059875, + "learning_rate": 6.421101353316282e-05, + "loss": 1.7858, + "step": 13881 + }, + { + "epoch": 4.260896255371393, + "grad_norm": 0.4803469777107239, + "learning_rate": 6.420624789390378e-05, + "loss": 1.7337, + "step": 13882 + }, + { + "epoch": 4.2612031921424185, + "grad_norm": 0.4245823919773102, + "learning_rate": 6.420148211425027e-05, + "loss": 1.8024, + "step": 13883 + }, + { + "epoch": 4.261510128913444, + "grad_norm": 0.22298674285411835, + "learning_rate": 6.419671619424938e-05, + "loss": 1.7129, + "step": 13884 + }, + { + "epoch": 4.261817065684469, + "grad_norm": 0.46955862641334534, + "learning_rate": 6.419195013394824e-05, + "loss": 1.7151, + 
"step": 13885 + }, + { + "epoch": 4.2621240024554945, + "grad_norm": 0.4809224009513855, + "learning_rate": 6.418718393339392e-05, + "loss": 1.7697, + "step": 13886 + }, + { + "epoch": 4.262430939226519, + "grad_norm": 0.2741130292415619, + "learning_rate": 6.418241759263353e-05, + "loss": 1.8133, + "step": 13887 + }, + { + "epoch": 4.262737875997544, + "grad_norm": 0.3673117756843567, + "learning_rate": 6.417765111171419e-05, + "loss": 1.7424, + "step": 13888 + }, + { + "epoch": 4.26304481276857, + "grad_norm": 0.4609327018260956, + "learning_rate": 6.417288449068299e-05, + "loss": 1.741, + "step": 13889 + }, + { + "epoch": 4.263351749539595, + "grad_norm": 0.2929460406303406, + "learning_rate": 6.416811772958702e-05, + "loss": 1.8385, + "step": 13890 + }, + { + "epoch": 4.26365868631062, + "grad_norm": 0.2727305293083191, + "learning_rate": 6.416335082847342e-05, + "loss": 1.794, + "step": 13891 + }, + { + "epoch": 4.263965623081646, + "grad_norm": 0.26089411973953247, + "learning_rate": 6.41585837873893e-05, + "loss": 1.7907, + "step": 13892 + }, + { + "epoch": 4.26427255985267, + "grad_norm": 0.24655573070049286, + "learning_rate": 6.415381660638174e-05, + "loss": 1.7481, + "step": 13893 + }, + { + "epoch": 4.264579496623695, + "grad_norm": 0.4186919629573822, + "learning_rate": 6.414904928549787e-05, + "loss": 1.8021, + "step": 13894 + }, + { + "epoch": 4.264886433394721, + "grad_norm": 0.38188236951828003, + "learning_rate": 6.414428182478478e-05, + "loss": 1.75, + "step": 13895 + }, + { + "epoch": 4.265193370165746, + "grad_norm": 0.23686440289020538, + "learning_rate": 6.413951422428963e-05, + "loss": 1.7882, + "step": 13896 + }, + { + "epoch": 4.265500306936771, + "grad_norm": 0.35963737964630127, + "learning_rate": 6.413474648405952e-05, + "loss": 1.7427, + "step": 13897 + }, + { + "epoch": 4.265807243707796, + "grad_norm": 0.38558289408683777, + "learning_rate": 6.412997860414155e-05, + "loss": 1.7622, + "step": 13898 + }, + { + "epoch": 
4.266114180478821, + "grad_norm": 0.2311459481716156, + "learning_rate": 6.412521058458285e-05, + "loss": 1.7894, + "step": 13899 + }, + { + "epoch": 4.2664211172498465, + "grad_norm": 0.2647818624973297, + "learning_rate": 6.412044242543054e-05, + "loss": 1.7399, + "step": 13900 + }, + { + "epoch": 4.266728054020872, + "grad_norm": 0.3174133002758026, + "learning_rate": 6.411567412673174e-05, + "loss": 1.7552, + "step": 13901 + }, + { + "epoch": 4.267034990791897, + "grad_norm": 0.25207316875457764, + "learning_rate": 6.411090568853358e-05, + "loss": 1.7876, + "step": 13902 + }, + { + "epoch": 4.267341927562922, + "grad_norm": 0.24549202620983124, + "learning_rate": 6.410613711088317e-05, + "loss": 1.8554, + "step": 13903 + }, + { + "epoch": 4.267648864333947, + "grad_norm": 0.26293641328811646, + "learning_rate": 6.410136839382765e-05, + "loss": 1.8553, + "step": 13904 + }, + { + "epoch": 4.267955801104972, + "grad_norm": 0.20258362591266632, + "learning_rate": 6.409659953741416e-05, + "loss": 1.7205, + "step": 13905 + }, + { + "epoch": 4.268262737875998, + "grad_norm": 0.24885907769203186, + "learning_rate": 6.409183054168979e-05, + "loss": 1.7718, + "step": 13906 + }, + { + "epoch": 4.268569674647023, + "grad_norm": 0.22737209498882294, + "learning_rate": 6.408706140670169e-05, + "loss": 1.7228, + "step": 13907 + }, + { + "epoch": 4.268876611418047, + "grad_norm": 0.2201235145330429, + "learning_rate": 6.4082292132497e-05, + "loss": 1.7451, + "step": 13908 + }, + { + "epoch": 4.269183548189073, + "grad_norm": 0.24108454585075378, + "learning_rate": 6.407752271912285e-05, + "loss": 1.7531, + "step": 13909 + }, + { + "epoch": 4.269490484960098, + "grad_norm": 0.21723641455173492, + "learning_rate": 6.407275316662636e-05, + "loss": 1.7139, + "step": 13910 + }, + { + "epoch": 4.269797421731123, + "grad_norm": 0.22557848691940308, + "learning_rate": 6.406798347505469e-05, + "loss": 1.7633, + "step": 13911 + }, + { + "epoch": 4.270104358502149, + "grad_norm": 
0.24664700031280518, + "learning_rate": 6.406321364445494e-05, + "loss": 1.7854, + "step": 13912 + }, + { + "epoch": 4.270411295273174, + "grad_norm": 0.2599056661128998, + "learning_rate": 6.405844367487428e-05, + "loss": 1.7662, + "step": 13913 + }, + { + "epoch": 4.2707182320441985, + "grad_norm": 0.2378663718700409, + "learning_rate": 6.405367356635982e-05, + "loss": 1.7477, + "step": 13914 + }, + { + "epoch": 4.271025168815224, + "grad_norm": 0.27158626914024353, + "learning_rate": 6.404890331895876e-05, + "loss": 1.7426, + "step": 13915 + }, + { + "epoch": 4.271332105586249, + "grad_norm": 0.28585317730903625, + "learning_rate": 6.404413293271818e-05, + "loss": 1.7492, + "step": 13916 + }, + { + "epoch": 4.2716390423572745, + "grad_norm": 0.2321750968694687, + "learning_rate": 6.403936240768526e-05, + "loss": 1.8594, + "step": 13917 + }, + { + "epoch": 4.2719459791283, + "grad_norm": 0.25824111700057983, + "learning_rate": 6.40345917439071e-05, + "loss": 1.7622, + "step": 13918 + }, + { + "epoch": 4.272252915899324, + "grad_norm": 0.24641194939613342, + "learning_rate": 6.40298209414309e-05, + "loss": 1.7519, + "step": 13919 + }, + { + "epoch": 4.27255985267035, + "grad_norm": 0.2132398933172226, + "learning_rate": 6.40250500003038e-05, + "loss": 1.7339, + "step": 13920 + }, + { + "epoch": 4.272866789441375, + "grad_norm": 0.22630736231803894, + "learning_rate": 6.402027892057292e-05, + "loss": 1.7396, + "step": 13921 + }, + { + "epoch": 4.2731737262124, + "grad_norm": 0.295163631439209, + "learning_rate": 6.401550770228543e-05, + "loss": 1.8063, + "step": 13922 + }, + { + "epoch": 4.273480662983426, + "grad_norm": 0.2722746729850769, + "learning_rate": 6.401073634548848e-05, + "loss": 1.7775, + "step": 13923 + }, + { + "epoch": 4.273787599754451, + "grad_norm": 0.23201976716518402, + "learning_rate": 6.400596485022922e-05, + "loss": 1.7755, + "step": 13924 + }, + { + "epoch": 4.274094536525475, + "grad_norm": 0.23880761861801147, + "learning_rate": 
6.40011932165548e-05, + "loss": 1.778, + "step": 13925 + }, + { + "epoch": 4.274401473296501, + "grad_norm": 0.22305625677108765, + "learning_rate": 6.399642144451239e-05, + "loss": 1.761, + "step": 13926 + }, + { + "epoch": 4.274708410067526, + "grad_norm": 0.21874886751174927, + "learning_rate": 6.399164953414914e-05, + "loss": 1.7148, + "step": 13927 + }, + { + "epoch": 4.2750153468385514, + "grad_norm": 0.2003604918718338, + "learning_rate": 6.398687748551221e-05, + "loss": 1.8049, + "step": 13928 + }, + { + "epoch": 4.275322283609577, + "grad_norm": 0.2443511188030243, + "learning_rate": 6.398210529864875e-05, + "loss": 1.782, + "step": 13929 + }, + { + "epoch": 4.275629220380601, + "grad_norm": 0.2297198623418808, + "learning_rate": 6.397733297360594e-05, + "loss": 1.7682, + "step": 13930 + }, + { + "epoch": 4.275936157151627, + "grad_norm": 0.23474562168121338, + "learning_rate": 6.39725605104309e-05, + "loss": 1.7809, + "step": 13931 + }, + { + "epoch": 4.276243093922652, + "grad_norm": 0.25908544659614563, + "learning_rate": 6.396778790917087e-05, + "loss": 1.7343, + "step": 13932 + }, + { + "epoch": 4.276550030693677, + "grad_norm": 0.2440379112958908, + "learning_rate": 6.396301516987295e-05, + "loss": 1.786, + "step": 13933 + }, + { + "epoch": 4.276856967464703, + "grad_norm": 0.26185858249664307, + "learning_rate": 6.395824229258435e-05, + "loss": 1.7863, + "step": 13934 + }, + { + "epoch": 4.277163904235728, + "grad_norm": 0.24470919370651245, + "learning_rate": 6.39534692773522e-05, + "loss": 1.7774, + "step": 13935 + }, + { + "epoch": 4.277470841006752, + "grad_norm": 0.2612632215023041, + "learning_rate": 6.39486961242237e-05, + "loss": 1.7536, + "step": 13936 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.26870301365852356, + "learning_rate": 6.3943922833246e-05, + "loss": 1.8177, + "step": 13937 + }, + { + "epoch": 4.278084714548803, + "grad_norm": 0.24445784091949463, + "learning_rate": 6.393914940446628e-05, + "loss": 1.7539, + "step": 
13938 + }, + { + "epoch": 4.278391651319828, + "grad_norm": 0.2622319757938385, + "learning_rate": 6.393437583793174e-05, + "loss": 1.8252, + "step": 13939 + }, + { + "epoch": 4.278698588090854, + "grad_norm": 0.2586652636528015, + "learning_rate": 6.39296021336895e-05, + "loss": 1.7975, + "step": 13940 + }, + { + "epoch": 4.279005524861878, + "grad_norm": 0.19488228857517242, + "learning_rate": 6.392482829178678e-05, + "loss": 1.7678, + "step": 13941 + }, + { + "epoch": 4.2793124616329035, + "grad_norm": 0.23956604301929474, + "learning_rate": 6.392005431227074e-05, + "loss": 1.7444, + "step": 13942 + }, + { + "epoch": 4.279619398403929, + "grad_norm": 0.24195842444896698, + "learning_rate": 6.391528019518857e-05, + "loss": 1.8116, + "step": 13943 + }, + { + "epoch": 4.279926335174954, + "grad_norm": 0.21479523181915283, + "learning_rate": 6.391050594058746e-05, + "loss": 1.7351, + "step": 13944 + }, + { + "epoch": 4.2802332719459795, + "grad_norm": 0.2309941202402115, + "learning_rate": 6.390573154851456e-05, + "loss": 1.8245, + "step": 13945 + }, + { + "epoch": 4.280540208717004, + "grad_norm": 0.2375536412000656, + "learning_rate": 6.390095701901706e-05, + "loss": 1.7921, + "step": 13946 + }, + { + "epoch": 4.280847145488029, + "grad_norm": 0.25518664717674255, + "learning_rate": 6.389618235214216e-05, + "loss": 1.7549, + "step": 13947 + }, + { + "epoch": 4.281154082259055, + "grad_norm": 0.2579016089439392, + "learning_rate": 6.389140754793705e-05, + "loss": 1.7637, + "step": 13948 + }, + { + "epoch": 4.28146101903008, + "grad_norm": 0.25350916385650635, + "learning_rate": 6.388663260644892e-05, + "loss": 1.746, + "step": 13949 + }, + { + "epoch": 4.281767955801105, + "grad_norm": 0.2994026839733124, + "learning_rate": 6.388185752772493e-05, + "loss": 1.8196, + "step": 13950 + }, + { + "epoch": 4.28207489257213, + "grad_norm": 0.29938533902168274, + "learning_rate": 6.387708231181229e-05, + "loss": 1.7187, + "step": 13951 + }, + { + "epoch": 4.282381829343155, 
+ "grad_norm": 0.23865137994289398, + "learning_rate": 6.387230695875819e-05, + "loss": 1.7317, + "step": 13952 + }, + { + "epoch": 4.28268876611418, + "grad_norm": 0.23812857270240784, + "learning_rate": 6.386753146860982e-05, + "loss": 1.7536, + "step": 13953 + }, + { + "epoch": 4.282995702885206, + "grad_norm": 0.3395650088787079, + "learning_rate": 6.386275584141438e-05, + "loss": 1.7932, + "step": 13954 + }, + { + "epoch": 4.283302639656231, + "grad_norm": 0.38207507133483887, + "learning_rate": 6.385798007721906e-05, + "loss": 1.8196, + "step": 13955 + }, + { + "epoch": 4.283609576427256, + "grad_norm": 0.32960978150367737, + "learning_rate": 6.385320417607107e-05, + "loss": 1.7898, + "step": 13956 + }, + { + "epoch": 4.283916513198281, + "grad_norm": 0.22978928685188293, + "learning_rate": 6.384842813801757e-05, + "loss": 1.7835, + "step": 13957 + }, + { + "epoch": 4.284223449969306, + "grad_norm": 0.24607588350772858, + "learning_rate": 6.38436519631058e-05, + "loss": 1.7829, + "step": 13958 + }, + { + "epoch": 4.2845303867403315, + "grad_norm": 0.2770270109176636, + "learning_rate": 6.383887565138295e-05, + "loss": 1.7294, + "step": 13959 + }, + { + "epoch": 4.284837323511357, + "grad_norm": 0.27644863724708557, + "learning_rate": 6.383409920289622e-05, + "loss": 1.829, + "step": 13960 + }, + { + "epoch": 4.285144260282382, + "grad_norm": 0.3870919942855835, + "learning_rate": 6.382932261769282e-05, + "loss": 1.8146, + "step": 13961 + }, + { + "epoch": 4.285451197053407, + "grad_norm": 0.3562348186969757, + "learning_rate": 6.382454589581994e-05, + "loss": 1.8225, + "step": 13962 + }, + { + "epoch": 4.285758133824432, + "grad_norm": 0.28444886207580566, + "learning_rate": 6.38197690373248e-05, + "loss": 1.7734, + "step": 13963 + }, + { + "epoch": 4.286065070595457, + "grad_norm": 0.27935758233070374, + "learning_rate": 6.381499204225459e-05, + "loss": 1.7402, + "step": 13964 + }, + { + "epoch": 4.286372007366483, + "grad_norm": 0.34188997745513916, + 
"learning_rate": 6.381021491065653e-05, + "loss": 1.7661, + "step": 13965 + }, + { + "epoch": 4.286678944137508, + "grad_norm": 0.28648918867111206, + "learning_rate": 6.380543764257785e-05, + "loss": 1.8312, + "step": 13966 + }, + { + "epoch": 4.286985880908533, + "grad_norm": 0.2733290493488312, + "learning_rate": 6.380066023806572e-05, + "loss": 1.7505, + "step": 13967 + }, + { + "epoch": 4.287292817679558, + "grad_norm": 0.3344273865222931, + "learning_rate": 6.37958826971674e-05, + "loss": 1.8392, + "step": 13968 + }, + { + "epoch": 4.287599754450583, + "grad_norm": 0.2655799090862274, + "learning_rate": 6.379110501993006e-05, + "loss": 1.7575, + "step": 13969 + }, + { + "epoch": 4.287906691221608, + "grad_norm": 0.2569151818752289, + "learning_rate": 6.378632720640095e-05, + "loss": 1.6619, + "step": 13970 + }, + { + "epoch": 4.288213627992634, + "grad_norm": 0.2477198988199234, + "learning_rate": 6.378154925662727e-05, + "loss": 1.7532, + "step": 13971 + }, + { + "epoch": 4.288520564763659, + "grad_norm": 0.2867630422115326, + "learning_rate": 6.377677117065624e-05, + "loss": 1.7725, + "step": 13972 + }, + { + "epoch": 4.2888275015346835, + "grad_norm": 0.28316137194633484, + "learning_rate": 6.37719929485351e-05, + "loss": 1.7628, + "step": 13973 + }, + { + "epoch": 4.289134438305709, + "grad_norm": 0.2934304475784302, + "learning_rate": 6.376721459031106e-05, + "loss": 1.7346, + "step": 13974 + }, + { + "epoch": 4.289441375076734, + "grad_norm": 0.22847147285938263, + "learning_rate": 6.376243609603129e-05, + "loss": 1.7409, + "step": 13975 + }, + { + "epoch": 4.2897483118477595, + "grad_norm": 0.360441118478775, + "learning_rate": 6.375765746574311e-05, + "loss": 1.808, + "step": 13976 + }, + { + "epoch": 4.290055248618785, + "grad_norm": 0.2750907242298126, + "learning_rate": 6.375287869949367e-05, + "loss": 1.8046, + "step": 13977 + }, + { + "epoch": 4.290362185389809, + "grad_norm": 0.26193201541900635, + "learning_rate": 6.374809979733022e-05, + 
"loss": 1.7097, + "step": 13978 + }, + { + "epoch": 4.290669122160835, + "grad_norm": 0.3282175064086914, + "learning_rate": 6.37433207593e-05, + "loss": 1.7924, + "step": 13979 + }, + { + "epoch": 4.29097605893186, + "grad_norm": 0.2845167815685272, + "learning_rate": 6.373854158545021e-05, + "loss": 1.7663, + "step": 13980 + }, + { + "epoch": 4.291282995702885, + "grad_norm": 0.21816621720790863, + "learning_rate": 6.37337622758281e-05, + "loss": 1.7368, + "step": 13981 + }, + { + "epoch": 4.291589932473911, + "grad_norm": 0.264272540807724, + "learning_rate": 6.372898283048094e-05, + "loss": 1.7377, + "step": 13982 + }, + { + "epoch": 4.291896869244935, + "grad_norm": 0.2182006686925888, + "learning_rate": 6.37242032494559e-05, + "loss": 1.8107, + "step": 13983 + }, + { + "epoch": 4.29220380601596, + "grad_norm": 0.26856422424316406, + "learning_rate": 6.371942353280023e-05, + "loss": 1.7708, + "step": 13984 + }, + { + "epoch": 4.292510742786986, + "grad_norm": 0.3025323748588562, + "learning_rate": 6.37146436805612e-05, + "loss": 1.7768, + "step": 13985 + }, + { + "epoch": 4.292817679558011, + "grad_norm": 0.2949144244194031, + "learning_rate": 6.3709863692786e-05, + "loss": 1.7848, + "step": 13986 + }, + { + "epoch": 4.293124616329036, + "grad_norm": 0.20670418441295624, + "learning_rate": 6.370508356952188e-05, + "loss": 1.7367, + "step": 13987 + }, + { + "epoch": 4.293431553100062, + "grad_norm": 0.2453860342502594, + "learning_rate": 6.370030331081611e-05, + "loss": 1.7246, + "step": 13988 + }, + { + "epoch": 4.293738489871086, + "grad_norm": 0.3413507044315338, + "learning_rate": 6.369552291671592e-05, + "loss": 1.7829, + "step": 13989 + }, + { + "epoch": 4.2940454266421115, + "grad_norm": 0.28352782130241394, + "learning_rate": 6.369074238726856e-05, + "loss": 1.7755, + "step": 13990 + }, + { + "epoch": 4.294352363413137, + "grad_norm": 0.21408751606941223, + "learning_rate": 6.368596172252124e-05, + "loss": 1.7292, + "step": 13991 + }, + { + "epoch": 
4.294659300184162, + "grad_norm": 0.28372085094451904, + "learning_rate": 6.36811809225212e-05, + "loss": 1.8197, + "step": 13992 + }, + { + "epoch": 4.2949662369551875, + "grad_norm": 0.2400829792022705, + "learning_rate": 6.367639998731573e-05, + "loss": 1.7559, + "step": 13993 + }, + { + "epoch": 4.295273173726212, + "grad_norm": 0.22853593528270721, + "learning_rate": 6.367161891695207e-05, + "loss": 1.8116, + "step": 13994 + }, + { + "epoch": 4.295580110497237, + "grad_norm": 0.22098208963871002, + "learning_rate": 6.366683771147745e-05, + "loss": 1.7269, + "step": 13995 + }, + { + "epoch": 4.295887047268263, + "grad_norm": 0.22293934226036072, + "learning_rate": 6.366205637093914e-05, + "loss": 1.7944, + "step": 13996 + }, + { + "epoch": 4.296193984039288, + "grad_norm": 0.26120004057884216, + "learning_rate": 6.365727489538437e-05, + "loss": 1.7581, + "step": 13997 + }, + { + "epoch": 4.296500920810313, + "grad_norm": 0.2568937838077545, + "learning_rate": 6.365249328486041e-05, + "loss": 1.7356, + "step": 13998 + }, + { + "epoch": 4.296807857581339, + "grad_norm": 0.2419043630361557, + "learning_rate": 6.364771153941449e-05, + "loss": 1.8127, + "step": 13999 + }, + { + "epoch": 4.297114794352363, + "grad_norm": 0.2521972060203552, + "learning_rate": 6.364292965909391e-05, + "loss": 1.7445, + "step": 14000 + }, + { + "epoch": 4.297421731123388, + "grad_norm": 0.3269292414188385, + "learning_rate": 6.363814764394589e-05, + "loss": 1.7835, + "step": 14001 + }, + { + "epoch": 4.297728667894414, + "grad_norm": 0.258405864238739, + "learning_rate": 6.36333654940177e-05, + "loss": 1.7407, + "step": 14002 + }, + { + "epoch": 4.298035604665439, + "grad_norm": 0.21527236700057983, + "learning_rate": 6.362858320935662e-05, + "loss": 1.7729, + "step": 14003 + }, + { + "epoch": 4.298342541436464, + "grad_norm": 0.25343602895736694, + "learning_rate": 6.362380079000988e-05, + "loss": 1.8087, + "step": 14004 + }, + { + "epoch": 4.298649478207489, + "grad_norm": 
0.26110637187957764, + "learning_rate": 6.361901823602474e-05, + "loss": 1.813, + "step": 14005 + }, + { + "epoch": 4.298956414978514, + "grad_norm": 0.26749926805496216, + "learning_rate": 6.361423554744851e-05, + "loss": 1.8193, + "step": 14006 + }, + { + "epoch": 4.2992633517495396, + "grad_norm": 0.22357676923274994, + "learning_rate": 6.360945272432841e-05, + "loss": 1.7498, + "step": 14007 + }, + { + "epoch": 4.299570288520565, + "grad_norm": 0.2367832362651825, + "learning_rate": 6.360466976671172e-05, + "loss": 1.7843, + "step": 14008 + }, + { + "epoch": 4.29987722529159, + "grad_norm": 0.23594366014003754, + "learning_rate": 6.35998866746457e-05, + "loss": 1.7442, + "step": 14009 + }, + { + "epoch": 4.300184162062616, + "grad_norm": 0.2660543918609619, + "learning_rate": 6.359510344817765e-05, + "loss": 1.7557, + "step": 14010 + }, + { + "epoch": 4.30049109883364, + "grad_norm": 0.191593199968338, + "learning_rate": 6.359032008735481e-05, + "loss": 1.7988, + "step": 14011 + }, + { + "epoch": 4.300798035604665, + "grad_norm": 0.2755490243434906, + "learning_rate": 6.358553659222447e-05, + "loss": 1.7551, + "step": 14012 + }, + { + "epoch": 4.301104972375691, + "grad_norm": 0.2900530993938446, + "learning_rate": 6.358075296283387e-05, + "loss": 1.7523, + "step": 14013 + }, + { + "epoch": 4.301411909146716, + "grad_norm": 0.22242774069309235, + "learning_rate": 6.357596919923033e-05, + "loss": 1.7626, + "step": 14014 + }, + { + "epoch": 4.301718845917741, + "grad_norm": 0.26636210083961487, + "learning_rate": 6.357118530146108e-05, + "loss": 1.7855, + "step": 14015 + }, + { + "epoch": 4.302025782688766, + "grad_norm": 0.3055269718170166, + "learning_rate": 6.356640126957344e-05, + "loss": 1.7528, + "step": 14016 + }, + { + "epoch": 4.302332719459791, + "grad_norm": 0.29695719480514526, + "learning_rate": 6.356161710361468e-05, + "loss": 1.7482, + "step": 14017 + }, + { + "epoch": 4.3026396562308165, + "grad_norm": 0.2369711697101593, + "learning_rate": 
6.355683280363207e-05, + "loss": 1.7635, + "step": 14018 + }, + { + "epoch": 4.302946593001842, + "grad_norm": 0.26681363582611084, + "learning_rate": 6.35520483696729e-05, + "loss": 1.8814, + "step": 14019 + }, + { + "epoch": 4.303253529772867, + "grad_norm": 0.2623308598995209, + "learning_rate": 6.354726380178442e-05, + "loss": 1.8645, + "step": 14020 + }, + { + "epoch": 4.303560466543892, + "grad_norm": 0.23326413333415985, + "learning_rate": 6.354247910001394e-05, + "loss": 1.8093, + "step": 14021 + }, + { + "epoch": 4.303867403314917, + "grad_norm": 0.3037295639514923, + "learning_rate": 6.353769426440875e-05, + "loss": 1.8556, + "step": 14022 + }, + { + "epoch": 4.304174340085942, + "grad_norm": 0.23624882102012634, + "learning_rate": 6.353290929501616e-05, + "loss": 1.803, + "step": 14023 + }, + { + "epoch": 4.304481276856968, + "grad_norm": 0.22106927633285522, + "learning_rate": 6.35281241918834e-05, + "loss": 1.7133, + "step": 14024 + }, + { + "epoch": 4.304788213627993, + "grad_norm": 0.2374040186405182, + "learning_rate": 6.352333895505778e-05, + "loss": 1.8127, + "step": 14025 + }, + { + "epoch": 4.305095150399017, + "grad_norm": 0.2782450318336487, + "learning_rate": 6.35185535845866e-05, + "loss": 1.8613, + "step": 14026 + }, + { + "epoch": 4.305402087170043, + "grad_norm": 0.2527763843536377, + "learning_rate": 6.351376808051717e-05, + "loss": 1.7533, + "step": 14027 + }, + { + "epoch": 4.305709023941068, + "grad_norm": 0.2462318390607834, + "learning_rate": 6.350898244289675e-05, + "loss": 1.8075, + "step": 14028 + }, + { + "epoch": 4.306015960712093, + "grad_norm": 0.2646189332008362, + "learning_rate": 6.350419667177265e-05, + "loss": 1.8261, + "step": 14029 + }, + { + "epoch": 4.306322897483119, + "grad_norm": 0.24918611347675323, + "learning_rate": 6.349941076719218e-05, + "loss": 1.7542, + "step": 14030 + }, + { + "epoch": 4.306629834254144, + "grad_norm": 0.22440841794013977, + "learning_rate": 6.349462472920259e-05, + "loss": 1.7897, + 
"step": 14031 + }, + { + "epoch": 4.3069367710251685, + "grad_norm": 0.28614330291748047, + "learning_rate": 6.348983855785121e-05, + "loss": 1.88, + "step": 14032 + }, + { + "epoch": 4.307243707796194, + "grad_norm": 0.25015848875045776, + "learning_rate": 6.348505225318535e-05, + "loss": 1.8008, + "step": 14033 + }, + { + "epoch": 4.307550644567219, + "grad_norm": 0.2468707263469696, + "learning_rate": 6.34802658152523e-05, + "loss": 1.8025, + "step": 14034 + }, + { + "epoch": 4.3078575813382445, + "grad_norm": 0.30504748225212097, + "learning_rate": 6.347547924409937e-05, + "loss": 1.8765, + "step": 14035 + }, + { + "epoch": 4.30816451810927, + "grad_norm": 0.35419392585754395, + "learning_rate": 6.347069253977385e-05, + "loss": 1.7807, + "step": 14036 + }, + { + "epoch": 4.308471454880294, + "grad_norm": 0.33683931827545166, + "learning_rate": 6.346590570232305e-05, + "loss": 1.7244, + "step": 14037 + }, + { + "epoch": 4.30877839165132, + "grad_norm": 0.3339467942714691, + "learning_rate": 6.346111873179427e-05, + "loss": 1.7642, + "step": 14038 + }, + { + "epoch": 4.309085328422345, + "grad_norm": 0.2369392216205597, + "learning_rate": 6.345633162823484e-05, + "loss": 1.7127, + "step": 14039 + }, + { + "epoch": 4.30939226519337, + "grad_norm": 0.26469686627388, + "learning_rate": 6.345154439169206e-05, + "loss": 1.7235, + "step": 14040 + }, + { + "epoch": 4.309699201964396, + "grad_norm": 0.2737344205379486, + "learning_rate": 6.344675702221321e-05, + "loss": 1.783, + "step": 14041 + }, + { + "epoch": 4.310006138735421, + "grad_norm": 0.2381773442029953, + "learning_rate": 6.344196951984565e-05, + "loss": 1.7172, + "step": 14042 + }, + { + "epoch": 4.310313075506445, + "grad_norm": 0.28199076652526855, + "learning_rate": 6.343718188463663e-05, + "loss": 1.8315, + "step": 14043 + }, + { + "epoch": 4.310620012277471, + "grad_norm": 0.24378590285778046, + "learning_rate": 6.343239411663353e-05, + "loss": 1.7828, + "step": 14044 + }, + { + "epoch": 
4.310926949048496, + "grad_norm": 0.26343944668769836, + "learning_rate": 6.342760621588365e-05, + "loss": 1.7679, + "step": 14045 + }, + { + "epoch": 4.311233885819521, + "grad_norm": 0.23703521490097046, + "learning_rate": 6.342281818243427e-05, + "loss": 1.7885, + "step": 14046 + }, + { + "epoch": 4.311540822590547, + "grad_norm": 0.2230173498392105, + "learning_rate": 6.341803001633276e-05, + "loss": 1.767, + "step": 14047 + }, + { + "epoch": 4.311847759361571, + "grad_norm": 0.249002143740654, + "learning_rate": 6.34132417176264e-05, + "loss": 1.8032, + "step": 14048 + }, + { + "epoch": 4.3121546961325965, + "grad_norm": 0.2383791208267212, + "learning_rate": 6.34084532863625e-05, + "loss": 1.7558, + "step": 14049 + }, + { + "epoch": 4.312461632903622, + "grad_norm": 0.2783047556877136, + "learning_rate": 6.340366472258843e-05, + "loss": 1.8389, + "step": 14050 + }, + { + "epoch": 4.312768569674647, + "grad_norm": 0.2654891312122345, + "learning_rate": 6.339887602635148e-05, + "loss": 1.7989, + "step": 14051 + }, + { + "epoch": 4.3130755064456725, + "grad_norm": 0.2638411521911621, + "learning_rate": 6.3394087197699e-05, + "loss": 1.8707, + "step": 14052 + }, + { + "epoch": 4.313382443216697, + "grad_norm": 0.3026179075241089, + "learning_rate": 6.338929823667829e-05, + "loss": 1.7892, + "step": 14053 + }, + { + "epoch": 4.313689379987722, + "grad_norm": 0.27496880292892456, + "learning_rate": 6.338450914333668e-05, + "loss": 1.7398, + "step": 14054 + }, + { + "epoch": 4.313996316758748, + "grad_norm": 0.2601073086261749, + "learning_rate": 6.337971991772151e-05, + "loss": 1.7646, + "step": 14055 + }, + { + "epoch": 4.314303253529773, + "grad_norm": 0.2061719298362732, + "learning_rate": 6.337493055988011e-05, + "loss": 1.7372, + "step": 14056 + }, + { + "epoch": 4.314610190300798, + "grad_norm": 0.23722340166568756, + "learning_rate": 6.337014106985981e-05, + "loss": 1.7457, + "step": 14057 + }, + { + "epoch": 4.314917127071823, + "grad_norm": 
0.2729428708553314, + "learning_rate": 6.336535144770793e-05, + "loss": 1.8423, + "step": 14058 + }, + { + "epoch": 4.315224063842848, + "grad_norm": 0.23520450294017792, + "learning_rate": 6.336056169347182e-05, + "loss": 1.8124, + "step": 14059 + }, + { + "epoch": 4.315531000613873, + "grad_norm": 0.25142738223075867, + "learning_rate": 6.33557718071988e-05, + "loss": 1.7285, + "step": 14060 + }, + { + "epoch": 4.315837937384899, + "grad_norm": 0.24833035469055176, + "learning_rate": 6.335098178893621e-05, + "loss": 1.766, + "step": 14061 + }, + { + "epoch": 4.316144874155924, + "grad_norm": 0.2406177669763565, + "learning_rate": 6.334619163873141e-05, + "loss": 1.8824, + "step": 14062 + }, + { + "epoch": 4.316451810926949, + "grad_norm": 0.23077574372291565, + "learning_rate": 6.334140135663172e-05, + "loss": 1.7589, + "step": 14063 + }, + { + "epoch": 4.316758747697974, + "grad_norm": 0.20476560294628143, + "learning_rate": 6.333661094268448e-05, + "loss": 1.7331, + "step": 14064 + }, + { + "epoch": 4.317065684468999, + "grad_norm": 0.207991823554039, + "learning_rate": 6.333182039693704e-05, + "loss": 1.6876, + "step": 14065 + }, + { + "epoch": 4.3173726212400245, + "grad_norm": 0.20813052356243134, + "learning_rate": 6.332702971943671e-05, + "loss": 1.775, + "step": 14066 + }, + { + "epoch": 4.31767955801105, + "grad_norm": 0.2470991462469101, + "learning_rate": 6.332223891023087e-05, + "loss": 1.7673, + "step": 14067 + }, + { + "epoch": 4.317986494782075, + "grad_norm": 0.23855723440647125, + "learning_rate": 6.331744796936687e-05, + "loss": 1.7842, + "step": 14068 + }, + { + "epoch": 4.3182934315531, + "grad_norm": 0.21852652728557587, + "learning_rate": 6.331265689689204e-05, + "loss": 1.7727, + "step": 14069 + }, + { + "epoch": 4.318600368324125, + "grad_norm": 0.284496545791626, + "learning_rate": 6.330786569285374e-05, + "loss": 1.8248, + "step": 14070 + }, + { + "epoch": 4.31890730509515, + "grad_norm": 0.21709981560707092, + "learning_rate": 
6.33030743572993e-05, + "loss": 1.7547, + "step": 14071 + }, + { + "epoch": 4.319214241866176, + "grad_norm": 0.24209457635879517, + "learning_rate": 6.329828289027608e-05, + "loss": 1.7695, + "step": 14072 + }, + { + "epoch": 4.319521178637201, + "grad_norm": 0.24869373440742493, + "learning_rate": 6.329349129183144e-05, + "loss": 1.8204, + "step": 14073 + }, + { + "epoch": 4.319828115408226, + "grad_norm": 0.21702703833580017, + "learning_rate": 6.328869956201274e-05, + "loss": 1.779, + "step": 14074 + }, + { + "epoch": 4.320135052179251, + "grad_norm": 0.22993850708007812, + "learning_rate": 6.328390770086731e-05, + "loss": 1.7935, + "step": 14075 + }, + { + "epoch": 4.320441988950276, + "grad_norm": 0.23491734266281128, + "learning_rate": 6.327911570844252e-05, + "loss": 1.7261, + "step": 14076 + }, + { + "epoch": 4.320748925721301, + "grad_norm": 0.2479303777217865, + "learning_rate": 6.327432358478571e-05, + "loss": 1.7683, + "step": 14077 + }, + { + "epoch": 4.321055862492327, + "grad_norm": 0.24261580407619476, + "learning_rate": 6.326953132994427e-05, + "loss": 1.7147, + "step": 14078 + }, + { + "epoch": 4.321362799263352, + "grad_norm": 0.24627646803855896, + "learning_rate": 6.326473894396553e-05, + "loss": 1.7976, + "step": 14079 + }, + { + "epoch": 4.3216697360343765, + "grad_norm": 0.269149512052536, + "learning_rate": 6.325994642689688e-05, + "loss": 1.7247, + "step": 14080 + }, + { + "epoch": 4.321976672805402, + "grad_norm": 0.4162158966064453, + "learning_rate": 6.325515377878566e-05, + "loss": 1.7485, + "step": 14081 + }, + { + "epoch": 4.322283609576427, + "grad_norm": 0.366459459066391, + "learning_rate": 6.325036099967925e-05, + "loss": 1.7286, + "step": 14082 + }, + { + "epoch": 4.3225905463474525, + "grad_norm": 0.2465270757675171, + "learning_rate": 6.324556808962499e-05, + "loss": 1.8097, + "step": 14083 + }, + { + "epoch": 4.322897483118478, + "grad_norm": 0.2911076843738556, + "learning_rate": 6.324077504867026e-05, + "loss": 1.7979, + 
"step": 14084 + }, + { + "epoch": 4.323204419889503, + "grad_norm": 0.33455169200897217, + "learning_rate": 6.323598187686245e-05, + "loss": 1.7988, + "step": 14085 + }, + { + "epoch": 4.323511356660528, + "grad_norm": 0.25020337104797363, + "learning_rate": 6.32311885742489e-05, + "loss": 1.7184, + "step": 14086 + }, + { + "epoch": 4.323818293431553, + "grad_norm": 0.23941513895988464, + "learning_rate": 6.322639514087699e-05, + "loss": 1.7672, + "step": 14087 + }, + { + "epoch": 4.324125230202578, + "grad_norm": 0.35258981585502625, + "learning_rate": 6.32216015767941e-05, + "loss": 1.7571, + "step": 14088 + }, + { + "epoch": 4.324432166973604, + "grad_norm": 0.2854993939399719, + "learning_rate": 6.321680788204758e-05, + "loss": 1.8096, + "step": 14089 + }, + { + "epoch": 4.324739103744629, + "grad_norm": 0.24422863125801086, + "learning_rate": 6.321201405668482e-05, + "loss": 1.778, + "step": 14090 + }, + { + "epoch": 4.3250460405156534, + "grad_norm": 0.36629122495651245, + "learning_rate": 6.320722010075321e-05, + "loss": 1.716, + "step": 14091 + }, + { + "epoch": 4.325352977286679, + "grad_norm": 0.37115517258644104, + "learning_rate": 6.32024260143001e-05, + "loss": 1.77, + "step": 14092 + }, + { + "epoch": 4.325659914057704, + "grad_norm": 0.21540327370166779, + "learning_rate": 6.319763179737288e-05, + "loss": 1.7529, + "step": 14093 + }, + { + "epoch": 4.3259668508287294, + "grad_norm": 0.2573898732662201, + "learning_rate": 6.319283745001892e-05, + "loss": 1.8101, + "step": 14094 + }, + { + "epoch": 4.326273787599755, + "grad_norm": 0.29481247067451477, + "learning_rate": 6.31880429722856e-05, + "loss": 1.7459, + "step": 14095 + }, + { + "epoch": 4.326580724370779, + "grad_norm": 0.23474647104740143, + "learning_rate": 6.318324836422031e-05, + "loss": 1.786, + "step": 14096 + }, + { + "epoch": 4.326887661141805, + "grad_norm": 0.2884673476219177, + "learning_rate": 6.317845362587045e-05, + "loss": 1.8123, + "step": 14097 + }, + { + "epoch": 
4.32719459791283, + "grad_norm": 0.39008447527885437, + "learning_rate": 6.317365875728338e-05, + "loss": 1.7729, + "step": 14098 + }, + { + "epoch": 4.327501534683855, + "grad_norm": 0.30568063259124756, + "learning_rate": 6.316886375850651e-05, + "loss": 1.7088, + "step": 14099 + }, + { + "epoch": 4.327808471454881, + "grad_norm": 0.2538018524646759, + "learning_rate": 6.316406862958718e-05, + "loss": 1.8028, + "step": 14100 + }, + { + "epoch": 4.328115408225905, + "grad_norm": 0.3815068006515503, + "learning_rate": 6.315927337057281e-05, + "loss": 1.7143, + "step": 14101 + }, + { + "epoch": 4.32842234499693, + "grad_norm": 0.3813243508338928, + "learning_rate": 6.31544779815108e-05, + "loss": 1.7072, + "step": 14102 + }, + { + "epoch": 4.328729281767956, + "grad_norm": 0.22438868880271912, + "learning_rate": 6.314968246244852e-05, + "loss": 1.7445, + "step": 14103 + }, + { + "epoch": 4.329036218538981, + "grad_norm": 0.3818886876106262, + "learning_rate": 6.314488681343337e-05, + "loss": 1.8292, + "step": 14104 + }, + { + "epoch": 4.329343155310006, + "grad_norm": 0.4376567006111145, + "learning_rate": 6.314009103451277e-05, + "loss": 1.8224, + "step": 14105 + }, + { + "epoch": 4.329650092081032, + "grad_norm": 0.2741515636444092, + "learning_rate": 6.313529512573406e-05, + "loss": 1.8078, + "step": 14106 + }, + { + "epoch": 4.329957028852056, + "grad_norm": 0.264343798160553, + "learning_rate": 6.313049908714467e-05, + "loss": 1.7314, + "step": 14107 + }, + { + "epoch": 4.3302639656230815, + "grad_norm": 0.3601943552494049, + "learning_rate": 6.312570291879201e-05, + "loss": 1.7351, + "step": 14108 + }, + { + "epoch": 4.330570902394107, + "grad_norm": 0.2931751012802124, + "learning_rate": 6.312090662072345e-05, + "loss": 1.8117, + "step": 14109 + }, + { + "epoch": 4.330877839165132, + "grad_norm": 0.27670225501060486, + "learning_rate": 6.31161101929864e-05, + "loss": 1.7707, + "step": 14110 + }, + { + "epoch": 4.3311847759361575, + "grad_norm": 
0.33669596910476685, + "learning_rate": 6.311131363562825e-05, + "loss": 1.7337, + "step": 14111 + }, + { + "epoch": 4.331491712707182, + "grad_norm": 0.232634037733078, + "learning_rate": 6.310651694869643e-05, + "loss": 1.7372, + "step": 14112 + }, + { + "epoch": 4.331798649478207, + "grad_norm": 0.28611311316490173, + "learning_rate": 6.310172013223832e-05, + "loss": 1.6977, + "step": 14113 + }, + { + "epoch": 4.332105586249233, + "grad_norm": 0.30207201838493347, + "learning_rate": 6.309692318630132e-05, + "loss": 1.7765, + "step": 14114 + }, + { + "epoch": 4.332412523020258, + "grad_norm": 0.20757484436035156, + "learning_rate": 6.309212611093287e-05, + "loss": 1.697, + "step": 14115 + }, + { + "epoch": 4.332719459791283, + "grad_norm": 0.31472963094711304, + "learning_rate": 6.308732890618034e-05, + "loss": 1.7757, + "step": 14116 + }, + { + "epoch": 4.333026396562309, + "grad_norm": 0.37042325735092163, + "learning_rate": 6.308253157209117e-05, + "loss": 1.7745, + "step": 14117 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.25001442432403564, + "learning_rate": 6.307773410871275e-05, + "loss": 1.7461, + "step": 14118 + }, + { + "epoch": 4.333640270104358, + "grad_norm": 0.2691943347454071, + "learning_rate": 6.307293651609248e-05, + "loss": 1.7539, + "step": 14119 + }, + { + "epoch": 4.333947206875384, + "grad_norm": 0.30845868587493896, + "learning_rate": 6.306813879427782e-05, + "loss": 1.7559, + "step": 14120 + }, + { + "epoch": 4.334254143646409, + "grad_norm": 0.2244730293750763, + "learning_rate": 6.306334094331613e-05, + "loss": 1.7609, + "step": 14121 + }, + { + "epoch": 4.334561080417434, + "grad_norm": 0.32132062315940857, + "learning_rate": 6.305854296325485e-05, + "loss": 1.7837, + "step": 14122 + }, + { + "epoch": 4.334868017188459, + "grad_norm": 0.3762948513031006, + "learning_rate": 6.30537448541414e-05, + "loss": 1.7631, + "step": 14123 + }, + { + "epoch": 4.335174953959484, + "grad_norm": 0.24174273014068604, + "learning_rate": 
6.30489466160232e-05, + "loss": 1.7532, + "step": 14124 + }, + { + "epoch": 4.3354818907305095, + "grad_norm": 0.23468497395515442, + "learning_rate": 6.304414824894765e-05, + "loss": 1.7731, + "step": 14125 + }, + { + "epoch": 4.335788827501535, + "grad_norm": 0.29086077213287354, + "learning_rate": 6.303934975296218e-05, + "loss": 1.7668, + "step": 14126 + }, + { + "epoch": 4.33609576427256, + "grad_norm": 0.2889879643917084, + "learning_rate": 6.303455112811422e-05, + "loss": 1.8188, + "step": 14127 + }, + { + "epoch": 4.336402701043585, + "grad_norm": 0.2335619181394577, + "learning_rate": 6.302975237445119e-05, + "loss": 1.7944, + "step": 14128 + }, + { + "epoch": 4.33670963781461, + "grad_norm": 0.29027310013771057, + "learning_rate": 6.302495349202051e-05, + "loss": 1.7771, + "step": 14129 + }, + { + "epoch": 4.337016574585635, + "grad_norm": 0.31961241364479065, + "learning_rate": 6.302015448086959e-05, + "loss": 1.8187, + "step": 14130 + }, + { + "epoch": 4.337323511356661, + "grad_norm": 0.26015788316726685, + "learning_rate": 6.301535534104587e-05, + "loss": 1.7819, + "step": 14131 + }, + { + "epoch": 4.337630448127686, + "grad_norm": 0.2440631091594696, + "learning_rate": 6.30105560725968e-05, + "loss": 1.7127, + "step": 14132 + }, + { + "epoch": 4.337937384898711, + "grad_norm": 0.304441899061203, + "learning_rate": 6.300575667556979e-05, + "loss": 1.7619, + "step": 14133 + }, + { + "epoch": 4.338244321669736, + "grad_norm": 0.3085228204727173, + "learning_rate": 6.300095715001226e-05, + "loss": 1.8287, + "step": 14134 + }, + { + "epoch": 4.338551258440761, + "grad_norm": 0.2863372564315796, + "learning_rate": 6.299615749597165e-05, + "loss": 1.8068, + "step": 14135 + }, + { + "epoch": 4.338858195211786, + "grad_norm": 0.25255265831947327, + "learning_rate": 6.299135771349537e-05, + "loss": 1.7506, + "step": 14136 + }, + { + "epoch": 4.339165131982812, + "grad_norm": 0.30224961042404175, + "learning_rate": 6.298655780263092e-05, + "loss": 1.7292, + 
"step": 14137 + }, + { + "epoch": 4.339472068753837, + "grad_norm": 0.24222104251384735, + "learning_rate": 6.298175776342567e-05, + "loss": 1.7616, + "step": 14138 + }, + { + "epoch": 4.3397790055248615, + "grad_norm": 0.3236368000507355, + "learning_rate": 6.29769575959271e-05, + "loss": 1.787, + "step": 14139 + }, + { + "epoch": 4.340085942295887, + "grad_norm": 0.26049408316612244, + "learning_rate": 6.297215730018261e-05, + "loss": 1.7108, + "step": 14140 + }, + { + "epoch": 4.340392879066912, + "grad_norm": 0.22833532094955444, + "learning_rate": 6.296735687623967e-05, + "loss": 1.7661, + "step": 14141 + }, + { + "epoch": 4.3406998158379375, + "grad_norm": 0.28397905826568604, + "learning_rate": 6.296255632414571e-05, + "loss": 1.7163, + "step": 14142 + }, + { + "epoch": 4.341006752608963, + "grad_norm": 0.3072611093521118, + "learning_rate": 6.295775564394817e-05, + "loss": 1.857, + "step": 14143 + }, + { + "epoch": 4.341313689379987, + "grad_norm": 0.22901058197021484, + "learning_rate": 6.295295483569448e-05, + "loss": 1.7325, + "step": 14144 + }, + { + "epoch": 4.341620626151013, + "grad_norm": 0.27433091402053833, + "learning_rate": 6.294815389943212e-05, + "loss": 1.8229, + "step": 14145 + }, + { + "epoch": 4.341927562922038, + "grad_norm": 0.2635616958141327, + "learning_rate": 6.29433528352085e-05, + "loss": 1.7585, + "step": 14146 + }, + { + "epoch": 4.342234499693063, + "grad_norm": 0.29129260778427124, + "learning_rate": 6.293855164307108e-05, + "loss": 1.8294, + "step": 14147 + }, + { + "epoch": 4.342541436464089, + "grad_norm": 0.3429001569747925, + "learning_rate": 6.293375032306731e-05, + "loss": 1.7725, + "step": 14148 + }, + { + "epoch": 4.342848373235114, + "grad_norm": 0.22407259047031403, + "learning_rate": 6.292894887524464e-05, + "loss": 1.7018, + "step": 14149 + }, + { + "epoch": 4.343155310006138, + "grad_norm": 0.3319321274757385, + "learning_rate": 6.292414729965053e-05, + "loss": 1.8472, + "step": 14150 + }, + { + "epoch": 
4.343462246777164, + "grad_norm": 0.42744341492652893, + "learning_rate": 6.291934559633241e-05, + "loss": 1.8118, + "step": 14151 + }, + { + "epoch": 4.343769183548189, + "grad_norm": 0.24572840332984924, + "learning_rate": 6.291454376533774e-05, + "loss": 1.7184, + "step": 14152 + }, + { + "epoch": 4.344076120319214, + "grad_norm": 0.2485980987548828, + "learning_rate": 6.290974180671397e-05, + "loss": 1.7649, + "step": 14153 + }, + { + "epoch": 4.34438305709024, + "grad_norm": 0.3911706209182739, + "learning_rate": 6.29049397205086e-05, + "loss": 1.8105, + "step": 14154 + }, + { + "epoch": 4.344689993861264, + "grad_norm": 0.3008342981338501, + "learning_rate": 6.290013750676902e-05, + "loss": 1.7671, + "step": 14155 + }, + { + "epoch": 4.3449969306322895, + "grad_norm": 0.2072051614522934, + "learning_rate": 6.289533516554274e-05, + "loss": 1.7406, + "step": 14156 + }, + { + "epoch": 4.345303867403315, + "grad_norm": 0.3047312796115875, + "learning_rate": 6.289053269687719e-05, + "loss": 1.8133, + "step": 14157 + }, + { + "epoch": 4.34561080417434, + "grad_norm": 0.28260552883148193, + "learning_rate": 6.288573010081984e-05, + "loss": 1.7253, + "step": 14158 + }, + { + "epoch": 4.3459177409453655, + "grad_norm": 0.2474137246608734, + "learning_rate": 6.288092737741815e-05, + "loss": 1.822, + "step": 14159 + }, + { + "epoch": 4.346224677716391, + "grad_norm": 0.23717878758907318, + "learning_rate": 6.287612452671961e-05, + "loss": 1.7826, + "step": 14160 + }, + { + "epoch": 4.346531614487415, + "grad_norm": 0.2646107077598572, + "learning_rate": 6.287132154877163e-05, + "loss": 1.8118, + "step": 14161 + }, + { + "epoch": 4.346838551258441, + "grad_norm": 0.22026480734348297, + "learning_rate": 6.286651844362172e-05, + "loss": 1.7767, + "step": 14162 + }, + { + "epoch": 4.347145488029466, + "grad_norm": 0.2692350447177887, + "learning_rate": 6.286171521131733e-05, + "loss": 1.8718, + "step": 14163 + }, + { + "epoch": 4.347452424800491, + "grad_norm": 
0.2749998867511749, + "learning_rate": 6.285691185190592e-05, + "loss": 1.7689, + "step": 14164 + }, + { + "epoch": 4.347759361571517, + "grad_norm": 0.24552448093891144, + "learning_rate": 6.2852108365435e-05, + "loss": 1.8049, + "step": 14165 + }, + { + "epoch": 4.348066298342541, + "grad_norm": 0.20530807971954346, + "learning_rate": 6.2847304751952e-05, + "loss": 1.7606, + "step": 14166 + }, + { + "epoch": 4.348373235113566, + "grad_norm": 0.23396088182926178, + "learning_rate": 6.28425010115044e-05, + "loss": 1.7482, + "step": 14167 + }, + { + "epoch": 4.348680171884592, + "grad_norm": 0.20512452721595764, + "learning_rate": 6.283769714413968e-05, + "loss": 1.6976, + "step": 14168 + }, + { + "epoch": 4.348987108655617, + "grad_norm": 0.20287172496318817, + "learning_rate": 6.283289314990531e-05, + "loss": 1.7439, + "step": 14169 + }, + { + "epoch": 4.349294045426642, + "grad_norm": 0.2193746268749237, + "learning_rate": 6.282808902884876e-05, + "loss": 1.763, + "step": 14170 + }, + { + "epoch": 4.349600982197667, + "grad_norm": 0.20415273308753967, + "learning_rate": 6.282328478101753e-05, + "loss": 1.7025, + "step": 14171 + }, + { + "epoch": 4.349907918968692, + "grad_norm": 0.19286803901195526, + "learning_rate": 6.281848040645907e-05, + "loss": 1.7529, + "step": 14172 + }, + { + "epoch": 4.350214855739718, + "grad_norm": 0.20908218622207642, + "learning_rate": 6.281367590522088e-05, + "loss": 1.7896, + "step": 14173 + }, + { + "epoch": 4.350521792510743, + "grad_norm": 0.2599989175796509, + "learning_rate": 6.280887127735045e-05, + "loss": 1.764, + "step": 14174 + }, + { + "epoch": 4.350828729281768, + "grad_norm": 0.23955710232257843, + "learning_rate": 6.280406652289523e-05, + "loss": 1.7321, + "step": 14175 + }, + { + "epoch": 4.351135666052793, + "grad_norm": 0.2311990112066269, + "learning_rate": 6.279926164190272e-05, + "loss": 1.7338, + "step": 14176 + }, + { + "epoch": 4.351442602823818, + "grad_norm": 0.2599658966064453, + "learning_rate": 
6.27944566344204e-05, + "loss": 1.7444, + "step": 14177 + }, + { + "epoch": 4.351749539594843, + "grad_norm": 0.23079386353492737, + "learning_rate": 6.278965150049579e-05, + "loss": 1.7011, + "step": 14178 + }, + { + "epoch": 4.352056476365869, + "grad_norm": 0.24844171106815338, + "learning_rate": 6.278484624017631e-05, + "loss": 1.7298, + "step": 14179 + }, + { + "epoch": 4.352363413136894, + "grad_norm": 0.24839860200881958, + "learning_rate": 6.27800408535095e-05, + "loss": 1.7717, + "step": 14180 + }, + { + "epoch": 4.352670349907919, + "grad_norm": 0.2652966380119324, + "learning_rate": 6.277523534054284e-05, + "loss": 1.7759, + "step": 14181 + }, + { + "epoch": 4.352977286678944, + "grad_norm": 0.2787603735923767, + "learning_rate": 6.277042970132381e-05, + "loss": 1.8981, + "step": 14182 + }, + { + "epoch": 4.353284223449969, + "grad_norm": 0.2535475194454193, + "learning_rate": 6.276562393589991e-05, + "loss": 1.7538, + "step": 14183 + }, + { + "epoch": 4.3535911602209945, + "grad_norm": 0.3210967183113098, + "learning_rate": 6.276081804431863e-05, + "loss": 1.7087, + "step": 14184 + }, + { + "epoch": 4.35389809699202, + "grad_norm": 0.29936519265174866, + "learning_rate": 6.275601202662749e-05, + "loss": 1.7647, + "step": 14185 + }, + { + "epoch": 4.354205033763045, + "grad_norm": 0.21980762481689453, + "learning_rate": 6.275120588287394e-05, + "loss": 1.7759, + "step": 14186 + }, + { + "epoch": 4.35451197053407, + "grad_norm": 0.26833051443099976, + "learning_rate": 6.274639961310549e-05, + "loss": 1.7648, + "step": 14187 + }, + { + "epoch": 4.354818907305095, + "grad_norm": 0.27998095750808716, + "learning_rate": 6.274159321736966e-05, + "loss": 1.746, + "step": 14188 + }, + { + "epoch": 4.35512584407612, + "grad_norm": 0.21354494988918304, + "learning_rate": 6.273678669571395e-05, + "loss": 1.7417, + "step": 14189 + }, + { + "epoch": 4.355432780847146, + "grad_norm": 0.2295297235250473, + "learning_rate": 6.273198004818583e-05, + "loss": 1.7805, + 
"step": 14190 + }, + { + "epoch": 4.355739717618171, + "grad_norm": 0.2416422963142395, + "learning_rate": 6.272717327483283e-05, + "loss": 1.73, + "step": 14191 + }, + { + "epoch": 4.356046654389196, + "grad_norm": 0.2685304880142212, + "learning_rate": 6.272236637570244e-05, + "loss": 1.7936, + "step": 14192 + }, + { + "epoch": 4.356353591160221, + "grad_norm": 0.32481294870376587, + "learning_rate": 6.271755935084218e-05, + "loss": 1.7192, + "step": 14193 + }, + { + "epoch": 4.356660527931246, + "grad_norm": 0.2428581267595291, + "learning_rate": 6.271275220029954e-05, + "loss": 1.7428, + "step": 14194 + }, + { + "epoch": 4.356967464702271, + "grad_norm": 0.2266654521226883, + "learning_rate": 6.270794492412203e-05, + "loss": 1.7266, + "step": 14195 + }, + { + "epoch": 4.357274401473297, + "grad_norm": 0.25062093138694763, + "learning_rate": 6.270313752235716e-05, + "loss": 1.7476, + "step": 14196 + }, + { + "epoch": 4.357581338244322, + "grad_norm": 0.24085770547389984, + "learning_rate": 6.269832999505244e-05, + "loss": 1.7981, + "step": 14197 + }, + { + "epoch": 4.3578882750153465, + "grad_norm": 0.27035796642303467, + "learning_rate": 6.269352234225536e-05, + "loss": 1.8867, + "step": 14198 + }, + { + "epoch": 4.358195211786372, + "grad_norm": 0.22464458644390106, + "learning_rate": 6.268871456401348e-05, + "loss": 1.7514, + "step": 14199 + }, + { + "epoch": 4.358502148557397, + "grad_norm": 0.22485734522342682, + "learning_rate": 6.268390666037427e-05, + "loss": 1.7558, + "step": 14200 + }, + { + "epoch": 4.3588090853284225, + "grad_norm": 0.2052135169506073, + "learning_rate": 6.267909863138527e-05, + "loss": 1.7453, + "step": 14201 + }, + { + "epoch": 4.359116022099448, + "grad_norm": 0.2130763679742813, + "learning_rate": 6.267429047709397e-05, + "loss": 1.7712, + "step": 14202 + }, + { + "epoch": 4.359422958870473, + "grad_norm": 0.23146997392177582, + "learning_rate": 6.266948219754793e-05, + "loss": 1.6978, + "step": 14203 + }, + { + "epoch": 
4.359729895641498, + "grad_norm": 0.21657225489616394, + "learning_rate": 6.266467379279463e-05, + "loss": 1.7641, + "step": 14204 + }, + { + "epoch": 4.360036832412523, + "grad_norm": 0.2598700523376465, + "learning_rate": 6.265986526288158e-05, + "loss": 1.7956, + "step": 14205 + }, + { + "epoch": 4.360343769183548, + "grad_norm": 0.23497453331947327, + "learning_rate": 6.265505660785633e-05, + "loss": 1.7835, + "step": 14206 + }, + { + "epoch": 4.360650705954574, + "grad_norm": 0.2491760104894638, + "learning_rate": 6.265024782776641e-05, + "loss": 1.8454, + "step": 14207 + }, + { + "epoch": 4.360957642725599, + "grad_norm": 0.224884033203125, + "learning_rate": 6.264543892265932e-05, + "loss": 1.8383, + "step": 14208 + }, + { + "epoch": 4.361264579496623, + "grad_norm": 0.24057646095752716, + "learning_rate": 6.264062989258259e-05, + "loss": 1.7437, + "step": 14209 + }, + { + "epoch": 4.361571516267649, + "grad_norm": 0.24661841988563538, + "learning_rate": 6.263582073758374e-05, + "loss": 1.8151, + "step": 14210 + }, + { + "epoch": 4.361878453038674, + "grad_norm": 0.24618980288505554, + "learning_rate": 6.263101145771031e-05, + "loss": 1.7955, + "step": 14211 + }, + { + "epoch": 4.362185389809699, + "grad_norm": 0.2615448236465454, + "learning_rate": 6.262620205300981e-05, + "loss": 1.7819, + "step": 14212 + }, + { + "epoch": 4.362492326580725, + "grad_norm": 0.3528309464454651, + "learning_rate": 6.26213925235298e-05, + "loss": 1.7723, + "step": 14213 + }, + { + "epoch": 4.362799263351749, + "grad_norm": 0.3099561035633087, + "learning_rate": 6.261658286931779e-05, + "loss": 1.7361, + "step": 14214 + }, + { + "epoch": 4.3631062001227745, + "grad_norm": 0.23693235218524933, + "learning_rate": 6.26117730904213e-05, + "loss": 1.8117, + "step": 14215 + }, + { + "epoch": 4.3634131368938, + "grad_norm": 0.4164150655269623, + "learning_rate": 6.260696318688786e-05, + "loss": 1.7908, + "step": 14216 + }, + { + "epoch": 4.363720073664825, + "grad_norm": 
0.39376336336135864, + "learning_rate": 6.260215315876506e-05, + "loss": 1.7832, + "step": 14217 + }, + { + "epoch": 4.3640270104358505, + "grad_norm": 0.24071799218654633, + "learning_rate": 6.259734300610037e-05, + "loss": 1.7569, + "step": 14218 + }, + { + "epoch": 4.364333947206875, + "grad_norm": 0.4305122494697571, + "learning_rate": 6.259253272894136e-05, + "loss": 1.7974, + "step": 14219 + }, + { + "epoch": 4.3646408839779, + "grad_norm": 0.3023197054862976, + "learning_rate": 6.258772232733556e-05, + "loss": 1.7589, + "step": 14220 + }, + { + "epoch": 4.364947820748926, + "grad_norm": 0.23253366351127625, + "learning_rate": 6.258291180133052e-05, + "loss": 1.7138, + "step": 14221 + }, + { + "epoch": 4.365254757519951, + "grad_norm": 0.41141277551651, + "learning_rate": 6.257810115097376e-05, + "loss": 1.7608, + "step": 14222 + }, + { + "epoch": 4.365561694290976, + "grad_norm": 0.3308235704898834, + "learning_rate": 6.257329037631284e-05, + "loss": 1.8006, + "step": 14223 + }, + { + "epoch": 4.365868631062002, + "grad_norm": 0.2635105848312378, + "learning_rate": 6.256847947739528e-05, + "loss": 1.7275, + "step": 14224 + }, + { + "epoch": 4.366175567833026, + "grad_norm": 0.45886602997779846, + "learning_rate": 6.256366845426864e-05, + "loss": 1.7701, + "step": 14225 + }, + { + "epoch": 4.366482504604051, + "grad_norm": 0.48503565788269043, + "learning_rate": 6.255885730698049e-05, + "loss": 1.7409, + "step": 14226 + }, + { + "epoch": 4.366789441375077, + "grad_norm": 0.26727184653282166, + "learning_rate": 6.255404603557833e-05, + "loss": 1.7288, + "step": 14227 + }, + { + "epoch": 4.367096378146102, + "grad_norm": 0.3343912363052368, + "learning_rate": 6.254923464010974e-05, + "loss": 1.764, + "step": 14228 + }, + { + "epoch": 4.367403314917127, + "grad_norm": 0.40050622820854187, + "learning_rate": 6.254442312062224e-05, + "loss": 1.7653, + "step": 14229 + }, + { + "epoch": 4.367710251688152, + "grad_norm": 0.23941144347190857, + "learning_rate": 
6.253961147716341e-05, + "loss": 1.6886, + "step": 14230 + }, + { + "epoch": 4.368017188459177, + "grad_norm": 0.25737255811691284, + "learning_rate": 6.253479970978079e-05, + "loss": 1.8047, + "step": 14231 + }, + { + "epoch": 4.3683241252302025, + "grad_norm": 0.28780993819236755, + "learning_rate": 6.252998781852192e-05, + "loss": 1.7453, + "step": 14232 + }, + { + "epoch": 4.368631062001228, + "grad_norm": 0.2362327128648758, + "learning_rate": 6.252517580343438e-05, + "loss": 1.7963, + "step": 14233 + }, + { + "epoch": 4.368937998772253, + "grad_norm": 0.263013631105423, + "learning_rate": 6.252036366456571e-05, + "loss": 1.7837, + "step": 14234 + }, + { + "epoch": 4.3692449355432785, + "grad_norm": 0.27674412727355957, + "learning_rate": 6.251555140196347e-05, + "loss": 1.767, + "step": 14235 + }, + { + "epoch": 4.369551872314303, + "grad_norm": 0.2360621690750122, + "learning_rate": 6.251073901567522e-05, + "loss": 1.7806, + "step": 14236 + }, + { + "epoch": 4.369858809085328, + "grad_norm": 0.2568018138408661, + "learning_rate": 6.25059265057485e-05, + "loss": 1.7672, + "step": 14237 + }, + { + "epoch": 4.370165745856354, + "grad_norm": 0.2512381374835968, + "learning_rate": 6.25011138722309e-05, + "loss": 1.7506, + "step": 14238 + }, + { + "epoch": 4.370472682627379, + "grad_norm": 0.21587291359901428, + "learning_rate": 6.249630111516994e-05, + "loss": 1.7336, + "step": 14239 + }, + { + "epoch": 4.370779619398404, + "grad_norm": 0.21791933476924896, + "learning_rate": 6.249148823461323e-05, + "loss": 1.7588, + "step": 14240 + }, + { + "epoch": 4.371086556169429, + "grad_norm": 0.23061512410640717, + "learning_rate": 6.248667523060831e-05, + "loss": 1.742, + "step": 14241 + }, + { + "epoch": 4.371393492940454, + "grad_norm": 0.2007007598876953, + "learning_rate": 6.248186210320274e-05, + "loss": 1.7227, + "step": 14242 + }, + { + "epoch": 4.371700429711479, + "grad_norm": 0.2564350366592407, + "learning_rate": 6.247704885244411e-05, + "loss": 1.7529, + 
"step": 14243 + }, + { + "epoch": 4.372007366482505, + "grad_norm": 0.21880537271499634, + "learning_rate": 6.247223547837995e-05, + "loss": 1.7828, + "step": 14244 + }, + { + "epoch": 4.37231430325353, + "grad_norm": 0.26154282689094543, + "learning_rate": 6.246742198105785e-05, + "loss": 1.7895, + "step": 14245 + }, + { + "epoch": 4.3726212400245545, + "grad_norm": 0.2652645707130432, + "learning_rate": 6.24626083605254e-05, + "loss": 1.8038, + "step": 14246 + }, + { + "epoch": 4.37292817679558, + "grad_norm": 0.21463751792907715, + "learning_rate": 6.245779461683013e-05, + "loss": 1.7139, + "step": 14247 + }, + { + "epoch": 4.373235113566605, + "grad_norm": 0.21285851299762726, + "learning_rate": 6.245298075001961e-05, + "loss": 1.7686, + "step": 14248 + }, + { + "epoch": 4.3735420503376305, + "grad_norm": 0.258602499961853, + "learning_rate": 6.244816676014149e-05, + "loss": 1.8518, + "step": 14249 + }, + { + "epoch": 4.373848987108656, + "grad_norm": 0.25747501850128174, + "learning_rate": 6.244335264724323e-05, + "loss": 1.8019, + "step": 14250 + }, + { + "epoch": 4.37415592387968, + "grad_norm": 0.24678784608840942, + "learning_rate": 6.243853841137251e-05, + "loss": 1.7846, + "step": 14251 + }, + { + "epoch": 4.374462860650706, + "grad_norm": 0.31382107734680176, + "learning_rate": 6.243372405257685e-05, + "loss": 1.8389, + "step": 14252 + }, + { + "epoch": 4.374769797421731, + "grad_norm": 0.30522868037223816, + "learning_rate": 6.242890957090383e-05, + "loss": 1.8057, + "step": 14253 + }, + { + "epoch": 4.375076734192756, + "grad_norm": 0.2449347972869873, + "learning_rate": 6.242409496640106e-05, + "loss": 1.7144, + "step": 14254 + }, + { + "epoch": 4.375383670963782, + "grad_norm": 0.3193594217300415, + "learning_rate": 6.241928023911609e-05, + "loss": 1.7404, + "step": 14255 + }, + { + "epoch": 4.375690607734807, + "grad_norm": 0.23948179185390472, + "learning_rate": 6.241446538909651e-05, + "loss": 1.7338, + "step": 14256 + }, + { + "epoch": 
4.3759975445058314, + "grad_norm": 0.35325706005096436, + "learning_rate": 6.240965041638991e-05, + "loss": 1.7673, + "step": 14257 + }, + { + "epoch": 4.376304481276857, + "grad_norm": 0.38753262162208557, + "learning_rate": 6.240483532104387e-05, + "loss": 1.769, + "step": 14258 + }, + { + "epoch": 4.376611418047882, + "grad_norm": 0.2749052941799164, + "learning_rate": 6.2400020103106e-05, + "loss": 1.8086, + "step": 14259 + }, + { + "epoch": 4.3769183548189075, + "grad_norm": 0.2553126811981201, + "learning_rate": 6.239520476262384e-05, + "loss": 1.7733, + "step": 14260 + }, + { + "epoch": 4.377225291589933, + "grad_norm": 0.2854517698287964, + "learning_rate": 6.2390389299645e-05, + "loss": 1.7926, + "step": 14261 + }, + { + "epoch": 4.377532228360957, + "grad_norm": 0.24617259204387665, + "learning_rate": 6.238557371421708e-05, + "loss": 1.7297, + "step": 14262 + }, + { + "epoch": 4.377839165131983, + "grad_norm": 0.2555331289768219, + "learning_rate": 6.238075800638765e-05, + "loss": 1.7566, + "step": 14263 + }, + { + "epoch": 4.378146101903008, + "grad_norm": 0.31666773557662964, + "learning_rate": 6.237594217620432e-05, + "loss": 1.8003, + "step": 14264 + }, + { + "epoch": 4.378453038674033, + "grad_norm": 0.24166476726531982, + "learning_rate": 6.237112622371468e-05, + "loss": 1.7425, + "step": 14265 + }, + { + "epoch": 4.378759975445059, + "grad_norm": 0.21237102150917053, + "learning_rate": 6.236631014896633e-05, + "loss": 1.73, + "step": 14266 + }, + { + "epoch": 4.379066912216084, + "grad_norm": 0.2739151120185852, + "learning_rate": 6.236149395200683e-05, + "loss": 1.7113, + "step": 14267 + }, + { + "epoch": 4.379373848987108, + "grad_norm": 0.23700746893882751, + "learning_rate": 6.23566776328838e-05, + "loss": 1.7256, + "step": 14268 + }, + { + "epoch": 4.379680785758134, + "grad_norm": 0.22366748750209808, + "learning_rate": 6.235186119164485e-05, + "loss": 1.7981, + "step": 14269 + }, + { + "epoch": 4.379987722529159, + "grad_norm": 
0.28440114855766296, + "learning_rate": 6.234704462833758e-05, + "loss": 1.8087, + "step": 14270 + }, + { + "epoch": 4.380294659300184, + "grad_norm": 0.2706616520881653, + "learning_rate": 6.234222794300957e-05, + "loss": 1.7502, + "step": 14271 + }, + { + "epoch": 4.38060159607121, + "grad_norm": 0.21666266024112701, + "learning_rate": 6.233741113570843e-05, + "loss": 1.7639, + "step": 14272 + }, + { + "epoch": 4.380908532842234, + "grad_norm": 0.26790255308151245, + "learning_rate": 6.233259420648175e-05, + "loss": 1.796, + "step": 14273 + }, + { + "epoch": 4.3812154696132595, + "grad_norm": 0.22233673930168152, + "learning_rate": 6.232777715537715e-05, + "loss": 1.7661, + "step": 14274 + }, + { + "epoch": 4.381522406384285, + "grad_norm": 0.3277546763420105, + "learning_rate": 6.232295998244223e-05, + "loss": 1.7932, + "step": 14275 + }, + { + "epoch": 4.38182934315531, + "grad_norm": 0.2907596826553345, + "learning_rate": 6.231814268772463e-05, + "loss": 1.7103, + "step": 14276 + }, + { + "epoch": 4.3821362799263355, + "grad_norm": 0.2318384349346161, + "learning_rate": 6.231332527127188e-05, + "loss": 1.7351, + "step": 14277 + }, + { + "epoch": 4.382443216697361, + "grad_norm": 0.32904061675071716, + "learning_rate": 6.230850773313163e-05, + "loss": 1.7967, + "step": 14278 + }, + { + "epoch": 4.382750153468385, + "grad_norm": 0.2455490082502365, + "learning_rate": 6.230369007335153e-05, + "loss": 1.7474, + "step": 14279 + }, + { + "epoch": 4.383057090239411, + "grad_norm": 0.23648180067539215, + "learning_rate": 6.229887229197913e-05, + "loss": 1.7106, + "step": 14280 + }, + { + "epoch": 4.383364027010436, + "grad_norm": 0.29552599787712097, + "learning_rate": 6.229405438906207e-05, + "loss": 1.7765, + "step": 14281 + }, + { + "epoch": 4.383670963781461, + "grad_norm": 0.2094641923904419, + "learning_rate": 6.228923636464796e-05, + "loss": 1.7105, + "step": 14282 + }, + { + "epoch": 4.383977900552487, + "grad_norm": 0.24632154405117035, + "learning_rate": 
6.228441821878441e-05, + "loss": 1.7913, + "step": 14283 + }, + { + "epoch": 4.384284837323511, + "grad_norm": 0.28114691376686096, + "learning_rate": 6.227959995151904e-05, + "loss": 1.7456, + "step": 14284 + }, + { + "epoch": 4.384591774094536, + "grad_norm": 0.24226875603199005, + "learning_rate": 6.227478156289946e-05, + "loss": 1.797, + "step": 14285 + }, + { + "epoch": 4.384898710865562, + "grad_norm": 0.2526854872703552, + "learning_rate": 6.22699630529733e-05, + "loss": 1.7155, + "step": 14286 + }, + { + "epoch": 4.385205647636587, + "grad_norm": 0.312916100025177, + "learning_rate": 6.226514442178818e-05, + "loss": 1.7808, + "step": 14287 + }, + { + "epoch": 4.385512584407612, + "grad_norm": 0.23087100684642792, + "learning_rate": 6.22603256693917e-05, + "loss": 1.7543, + "step": 14288 + }, + { + "epoch": 4.385819521178637, + "grad_norm": 0.3042476177215576, + "learning_rate": 6.22555067958315e-05, + "loss": 1.747, + "step": 14289 + }, + { + "epoch": 4.386126457949662, + "grad_norm": 0.2604007422924042, + "learning_rate": 6.225068780115522e-05, + "loss": 1.7262, + "step": 14290 + }, + { + "epoch": 4.3864333947206875, + "grad_norm": 0.2200118750333786, + "learning_rate": 6.224586868541044e-05, + "loss": 1.75, + "step": 14291 + }, + { + "epoch": 4.386740331491713, + "grad_norm": 0.3452017307281494, + "learning_rate": 6.224104944864481e-05, + "loss": 1.7598, + "step": 14292 + }, + { + "epoch": 4.387047268262738, + "grad_norm": 0.3169453740119934, + "learning_rate": 6.223623009090597e-05, + "loss": 1.7939, + "step": 14293 + }, + { + "epoch": 4.387354205033763, + "grad_norm": 0.23640502989292145, + "learning_rate": 6.223141061224151e-05, + "loss": 1.8005, + "step": 14294 + }, + { + "epoch": 4.387661141804788, + "grad_norm": 0.26212456822395325, + "learning_rate": 6.22265910126991e-05, + "loss": 1.7951, + "step": 14295 + }, + { + "epoch": 4.387968078575813, + "grad_norm": 0.2687644362449646, + "learning_rate": 6.222177129232634e-05, + "loss": 1.7674, + "step": 
14296 + }, + { + "epoch": 4.388275015346839, + "grad_norm": 0.2553202211856842, + "learning_rate": 6.221695145117086e-05, + "loss": 1.8142, + "step": 14297 + }, + { + "epoch": 4.388581952117864, + "grad_norm": 0.3317619264125824, + "learning_rate": 6.221213148928034e-05, + "loss": 1.7884, + "step": 14298 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.3059331476688385, + "learning_rate": 6.220731140670235e-05, + "loss": 1.7377, + "step": 14299 + }, + { + "epoch": 4.389195825659914, + "grad_norm": 0.21544015407562256, + "learning_rate": 6.220249120348457e-05, + "loss": 1.6818, + "step": 14300 + }, + { + "epoch": 4.389502762430939, + "grad_norm": 0.3112640380859375, + "learning_rate": 6.219767087967461e-05, + "loss": 1.72, + "step": 14301 + }, + { + "epoch": 4.389809699201964, + "grad_norm": 0.2572654187679291, + "learning_rate": 6.219285043532011e-05, + "loss": 1.793, + "step": 14302 + }, + { + "epoch": 4.39011663597299, + "grad_norm": 0.2621476948261261, + "learning_rate": 6.218802987046874e-05, + "loss": 1.8301, + "step": 14303 + }, + { + "epoch": 4.390423572744015, + "grad_norm": 0.2592658996582031, + "learning_rate": 6.218320918516809e-05, + "loss": 1.7219, + "step": 14304 + }, + { + "epoch": 4.3907305095150395, + "grad_norm": 0.25503265857696533, + "learning_rate": 6.217838837946584e-05, + "loss": 1.8149, + "step": 14305 + }, + { + "epoch": 4.391037446286065, + "grad_norm": 0.21944166719913483, + "learning_rate": 6.217356745340962e-05, + "loss": 1.7174, + "step": 14306 + }, + { + "epoch": 4.39134438305709, + "grad_norm": 0.2937396466732025, + "learning_rate": 6.216874640704707e-05, + "loss": 1.8562, + "step": 14307 + }, + { + "epoch": 4.3916513198281155, + "grad_norm": 0.22520211338996887, + "learning_rate": 6.216392524042581e-05, + "loss": 1.7701, + "step": 14308 + }, + { + "epoch": 4.391958256599141, + "grad_norm": 0.24397830665111542, + "learning_rate": 6.215910395359355e-05, + "loss": 1.7794, + "step": 14309 + }, + { + "epoch": 4.392265193370166, + 
"grad_norm": 0.2867623567581177, + "learning_rate": 6.215428254659788e-05, + "loss": 1.7275, + "step": 14310 + }, + { + "epoch": 4.392572130141191, + "grad_norm": 0.2632426917552948, + "learning_rate": 6.214946101948648e-05, + "loss": 1.7919, + "step": 14311 + }, + { + "epoch": 4.392879066912216, + "grad_norm": 0.23146092891693115, + "learning_rate": 6.214463937230696e-05, + "loss": 1.744, + "step": 14312 + }, + { + "epoch": 4.393186003683241, + "grad_norm": 0.21877676248550415, + "learning_rate": 6.213981760510701e-05, + "loss": 1.7577, + "step": 14313 + }, + { + "epoch": 4.393492940454267, + "grad_norm": 0.2320399284362793, + "learning_rate": 6.213499571793426e-05, + "loss": 1.7864, + "step": 14314 + }, + { + "epoch": 4.393799877225292, + "grad_norm": 0.2951548993587494, + "learning_rate": 6.213017371083638e-05, + "loss": 1.8257, + "step": 14315 + }, + { + "epoch": 4.394106813996316, + "grad_norm": 0.26062941551208496, + "learning_rate": 6.212535158386102e-05, + "loss": 1.7448, + "step": 14316 + }, + { + "epoch": 4.394413750767342, + "grad_norm": 0.24760986864566803, + "learning_rate": 6.21205293370558e-05, + "loss": 1.7902, + "step": 14317 + }, + { + "epoch": 4.394720687538367, + "grad_norm": 0.2686399221420288, + "learning_rate": 6.211570697046844e-05, + "loss": 1.8209, + "step": 14318 + }, + { + "epoch": 4.395027624309392, + "grad_norm": 0.2599134147167206, + "learning_rate": 6.211088448414653e-05, + "loss": 1.8231, + "step": 14319 + }, + { + "epoch": 4.395334561080418, + "grad_norm": 0.254044771194458, + "learning_rate": 6.210606187813778e-05, + "loss": 1.806, + "step": 14320 + }, + { + "epoch": 4.395641497851442, + "grad_norm": 0.262229323387146, + "learning_rate": 6.210123915248982e-05, + "loss": 1.7857, + "step": 14321 + }, + { + "epoch": 4.3959484346224675, + "grad_norm": 0.2849259078502655, + "learning_rate": 6.209641630725033e-05, + "loss": 1.8005, + "step": 14322 + }, + { + "epoch": 4.396255371393493, + "grad_norm": 0.35480254888534546, + 
"learning_rate": 6.209159334246697e-05, + "loss": 1.8189, + "step": 14323 + }, + { + "epoch": 4.396562308164518, + "grad_norm": 0.2599184215068817, + "learning_rate": 6.20867702581874e-05, + "loss": 1.7384, + "step": 14324 + }, + { + "epoch": 4.3968692449355435, + "grad_norm": 0.23994222283363342, + "learning_rate": 6.208194705445926e-05, + "loss": 1.7566, + "step": 14325 + }, + { + "epoch": 4.397176181706568, + "grad_norm": 0.24361753463745117, + "learning_rate": 6.207712373133024e-05, + "loss": 1.6965, + "step": 14326 + }, + { + "epoch": 4.397483118477593, + "grad_norm": 0.23925161361694336, + "learning_rate": 6.207230028884803e-05, + "loss": 1.7596, + "step": 14327 + }, + { + "epoch": 4.397790055248619, + "grad_norm": 0.24365897476673126, + "learning_rate": 6.206747672706025e-05, + "loss": 1.7951, + "step": 14328 + }, + { + "epoch": 4.398096992019644, + "grad_norm": 0.25245413184165955, + "learning_rate": 6.206265304601461e-05, + "loss": 1.8086, + "step": 14329 + }, + { + "epoch": 4.398403928790669, + "grad_norm": 0.24272513389587402, + "learning_rate": 6.205782924575874e-05, + "loss": 1.8148, + "step": 14330 + }, + { + "epoch": 4.398710865561695, + "grad_norm": 0.21299590170383453, + "learning_rate": 6.205300532634036e-05, + "loss": 1.7666, + "step": 14331 + }, + { + "epoch": 4.399017802332719, + "grad_norm": 0.23543189465999603, + "learning_rate": 6.20481812878071e-05, + "loss": 1.7629, + "step": 14332 + }, + { + "epoch": 4.399324739103744, + "grad_norm": 0.2284495085477829, + "learning_rate": 6.204335713020665e-05, + "loss": 1.768, + "step": 14333 + }, + { + "epoch": 4.39963167587477, + "grad_norm": 0.23158542811870575, + "learning_rate": 6.20385328535867e-05, + "loss": 1.7761, + "step": 14334 + }, + { + "epoch": 4.399938612645795, + "grad_norm": 0.2378150224685669, + "learning_rate": 6.20337084579949e-05, + "loss": 1.8483, + "step": 14335 + }, + { + "epoch": 4.4002455494168204, + "grad_norm": 0.2407436966896057, + "learning_rate": 6.202888394347892e-05, + 
"loss": 1.7364, + "step": 14336 + }, + { + "epoch": 4.400552486187845, + "grad_norm": 0.256259560585022, + "learning_rate": 6.202405931008649e-05, + "loss": 1.7376, + "step": 14337 + }, + { + "epoch": 4.40085942295887, + "grad_norm": 0.29293057322502136, + "learning_rate": 6.201923455786524e-05, + "loss": 1.7493, + "step": 14338 + }, + { + "epoch": 4.401166359729896, + "grad_norm": 0.24025334417819977, + "learning_rate": 6.201440968686288e-05, + "loss": 1.7522, + "step": 14339 + }, + { + "epoch": 4.401473296500921, + "grad_norm": 0.3215656280517578, + "learning_rate": 6.200958469712708e-05, + "loss": 1.7748, + "step": 14340 + }, + { + "epoch": 4.401780233271946, + "grad_norm": 0.43553170561790466, + "learning_rate": 6.200475958870553e-05, + "loss": 1.771, + "step": 14341 + }, + { + "epoch": 4.402087170042972, + "grad_norm": 0.3112131953239441, + "learning_rate": 6.19999343616459e-05, + "loss": 1.7655, + "step": 14342 + }, + { + "epoch": 4.402394106813996, + "grad_norm": 0.25197842717170715, + "learning_rate": 6.199510901599589e-05, + "loss": 1.7214, + "step": 14343 + }, + { + "epoch": 4.402701043585021, + "grad_norm": 0.33227142691612244, + "learning_rate": 6.19902835518032e-05, + "loss": 1.7332, + "step": 14344 + }, + { + "epoch": 4.403007980356047, + "grad_norm": 0.27962982654571533, + "learning_rate": 6.198545796911548e-05, + "loss": 1.6943, + "step": 14345 + }, + { + "epoch": 4.403314917127072, + "grad_norm": 0.24374182522296906, + "learning_rate": 6.198063226798044e-05, + "loss": 1.7222, + "step": 14346 + }, + { + "epoch": 4.403621853898097, + "grad_norm": 0.3101944625377655, + "learning_rate": 6.197580644844576e-05, + "loss": 1.7113, + "step": 14347 + }, + { + "epoch": 4.403928790669122, + "grad_norm": 0.25919321179389954, + "learning_rate": 6.197098051055916e-05, + "loss": 1.71, + "step": 14348 + }, + { + "epoch": 4.404235727440147, + "grad_norm": 0.23140330612659454, + "learning_rate": 6.19661544543683e-05, + "loss": 1.7472, + "step": 14349 + }, + { + 
"epoch": 4.4045426642111725, + "grad_norm": 0.3274286687374115, + "learning_rate": 6.19613282799209e-05, + "loss": 1.7093, + "step": 14350 + }, + { + "epoch": 4.404849600982198, + "grad_norm": 0.3187442123889923, + "learning_rate": 6.195650198726464e-05, + "loss": 1.7488, + "step": 14351 + }, + { + "epoch": 4.405156537753223, + "grad_norm": 0.20547433197498322, + "learning_rate": 6.195167557644722e-05, + "loss": 1.7295, + "step": 14352 + }, + { + "epoch": 4.4054634745242485, + "grad_norm": 0.2623414993286133, + "learning_rate": 6.194684904751633e-05, + "loss": 1.8258, + "step": 14353 + }, + { + "epoch": 4.405770411295273, + "grad_norm": 0.2468457818031311, + "learning_rate": 6.194202240051967e-05, + "loss": 1.6957, + "step": 14354 + }, + { + "epoch": 4.406077348066298, + "grad_norm": 0.2082364559173584, + "learning_rate": 6.193719563550496e-05, + "loss": 1.7596, + "step": 14355 + }, + { + "epoch": 4.406384284837324, + "grad_norm": 0.27072983980178833, + "learning_rate": 6.193236875251988e-05, + "loss": 1.7341, + "step": 14356 + }, + { + "epoch": 4.406691221608349, + "grad_norm": 0.2630362808704376, + "learning_rate": 6.192754175161215e-05, + "loss": 1.7664, + "step": 14357 + }, + { + "epoch": 4.406998158379374, + "grad_norm": 0.25400006771087646, + "learning_rate": 6.192271463282944e-05, + "loss": 1.7582, + "step": 14358 + }, + { + "epoch": 4.407305095150399, + "grad_norm": 0.22256311774253845, + "learning_rate": 6.191788739621949e-05, + "loss": 1.7389, + "step": 14359 + }, + { + "epoch": 4.407612031921424, + "grad_norm": 0.2160387486219406, + "learning_rate": 6.191306004182999e-05, + "loss": 1.7051, + "step": 14360 + }, + { + "epoch": 4.407918968692449, + "grad_norm": 0.20665684342384338, + "learning_rate": 6.190823256970865e-05, + "loss": 1.7606, + "step": 14361 + }, + { + "epoch": 4.408225905463475, + "grad_norm": 0.2173188328742981, + "learning_rate": 6.190340497990318e-05, + "loss": 1.7944, + "step": 14362 + }, + { + "epoch": 4.4085328422345, + "grad_norm": 
0.189287930727005, + "learning_rate": 6.189857727246127e-05, + "loss": 1.7283, + "step": 14363 + }, + { + "epoch": 4.4088397790055245, + "grad_norm": 0.2531645596027374, + "learning_rate": 6.189374944743065e-05, + "loss": 1.7554, + "step": 14364 + }, + { + "epoch": 4.40914671577655, + "grad_norm": 0.25439125299453735, + "learning_rate": 6.188892150485903e-05, + "loss": 1.8032, + "step": 14365 + }, + { + "epoch": 4.409453652547575, + "grad_norm": 0.20938685536384583, + "learning_rate": 6.188409344479412e-05, + "loss": 1.7385, + "step": 14366 + }, + { + "epoch": 4.4097605893186005, + "grad_norm": 0.20471477508544922, + "learning_rate": 6.187926526728364e-05, + "loss": 1.7487, + "step": 14367 + }, + { + "epoch": 4.410067526089626, + "grad_norm": 0.2381851226091385, + "learning_rate": 6.187443697237529e-05, + "loss": 1.7443, + "step": 14368 + }, + { + "epoch": 4.41037446286065, + "grad_norm": 0.21584098041057587, + "learning_rate": 6.18696085601168e-05, + "loss": 1.7818, + "step": 14369 + }, + { + "epoch": 4.410681399631676, + "grad_norm": 0.2575368583202362, + "learning_rate": 6.186478003055587e-05, + "loss": 1.8204, + "step": 14370 + }, + { + "epoch": 4.410988336402701, + "grad_norm": 0.21133238077163696, + "learning_rate": 6.185995138374024e-05, + "loss": 1.7274, + "step": 14371 + }, + { + "epoch": 4.411295273173726, + "grad_norm": 0.24918322265148163, + "learning_rate": 6.18551226197176e-05, + "loss": 1.8021, + "step": 14372 + }, + { + "epoch": 4.411602209944752, + "grad_norm": 0.2253655642271042, + "learning_rate": 6.185029373853572e-05, + "loss": 1.7308, + "step": 14373 + }, + { + "epoch": 4.411909146715777, + "grad_norm": 0.20098713040351868, + "learning_rate": 6.184546474024226e-05, + "loss": 1.7549, + "step": 14374 + }, + { + "epoch": 4.412216083486801, + "grad_norm": 0.25612789392471313, + "learning_rate": 6.1840635624885e-05, + "loss": 1.8305, + "step": 14375 + }, + { + "epoch": 4.412523020257827, + "grad_norm": 0.24287539720535278, + "learning_rate": 
6.183580639251164e-05, + "loss": 1.7339, + "step": 14376 + }, + { + "epoch": 4.412829957028852, + "grad_norm": 0.2304944545030594, + "learning_rate": 6.183097704316988e-05, + "loss": 1.7023, + "step": 14377 + }, + { + "epoch": 4.413136893799877, + "grad_norm": 0.21911773085594177, + "learning_rate": 6.18261475769075e-05, + "loss": 1.7305, + "step": 14378 + }, + { + "epoch": 4.413443830570903, + "grad_norm": 0.24207864701747894, + "learning_rate": 6.182131799377217e-05, + "loss": 1.7318, + "step": 14379 + }, + { + "epoch": 4.413750767341927, + "grad_norm": 0.2551634609699249, + "learning_rate": 6.181648829381165e-05, + "loss": 1.8101, + "step": 14380 + }, + { + "epoch": 4.4140577041129525, + "grad_norm": 0.4114011526107788, + "learning_rate": 6.181165847707368e-05, + "loss": 1.772, + "step": 14381 + }, + { + "epoch": 4.414364640883978, + "grad_norm": 0.4592796862125397, + "learning_rate": 6.180682854360598e-05, + "loss": 1.7359, + "step": 14382 + }, + { + "epoch": 4.414671577655003, + "grad_norm": 0.2599259614944458, + "learning_rate": 6.180199849345627e-05, + "loss": 1.7028, + "step": 14383 + }, + { + "epoch": 4.4149785144260285, + "grad_norm": 0.3489506244659424, + "learning_rate": 6.17971683266723e-05, + "loss": 1.8252, + "step": 14384 + }, + { + "epoch": 4.415285451197054, + "grad_norm": 0.44563809037208557, + "learning_rate": 6.179233804330179e-05, + "loss": 1.6894, + "step": 14385 + }, + { + "epoch": 4.415592387968078, + "grad_norm": 0.2596888542175293, + "learning_rate": 6.17875076433925e-05, + "loss": 1.8141, + "step": 14386 + }, + { + "epoch": 4.415899324739104, + "grad_norm": 0.3560626804828644, + "learning_rate": 6.178267712699213e-05, + "loss": 1.7764, + "step": 14387 + }, + { + "epoch": 4.416206261510129, + "grad_norm": 0.3746717572212219, + "learning_rate": 6.177784649414843e-05, + "loss": 1.7528, + "step": 14388 + }, + { + "epoch": 4.416513198281154, + "grad_norm": 0.23248885571956635, + "learning_rate": 6.177301574490918e-05, + "loss": 1.7148, + 
"step": 14389 + }, + { + "epoch": 4.41682013505218, + "grad_norm": 0.26936978101730347, + "learning_rate": 6.176818487932208e-05, + "loss": 1.7199, + "step": 14390 + }, + { + "epoch": 4.417127071823204, + "grad_norm": 0.3102504014968872, + "learning_rate": 6.176335389743486e-05, + "loss": 1.6886, + "step": 14391 + }, + { + "epoch": 4.417434008594229, + "grad_norm": 0.24406832456588745, + "learning_rate": 6.175852279929531e-05, + "loss": 1.7766, + "step": 14392 + }, + { + "epoch": 4.417740945365255, + "grad_norm": 0.271158903837204, + "learning_rate": 6.175369158495112e-05, + "loss": 1.8099, + "step": 14393 + }, + { + "epoch": 4.41804788213628, + "grad_norm": 0.343667209148407, + "learning_rate": 6.174886025445008e-05, + "loss": 1.779, + "step": 14394 + }, + { + "epoch": 4.418354818907305, + "grad_norm": 0.37423139810562134, + "learning_rate": 6.17440288078399e-05, + "loss": 1.7796, + "step": 14395 + }, + { + "epoch": 4.41866175567833, + "grad_norm": 0.3152335286140442, + "learning_rate": 6.173919724516836e-05, + "loss": 1.7388, + "step": 14396 + }, + { + "epoch": 4.418968692449355, + "grad_norm": 0.21467824280261993, + "learning_rate": 6.173436556648319e-05, + "loss": 1.7689, + "step": 14397 + }, + { + "epoch": 4.4192756292203805, + "grad_norm": 0.2861369848251343, + "learning_rate": 6.172953377183213e-05, + "loss": 1.819, + "step": 14398 + }, + { + "epoch": 4.419582565991406, + "grad_norm": 0.34777504205703735, + "learning_rate": 6.172470186126295e-05, + "loss": 1.7444, + "step": 14399 + }, + { + "epoch": 4.419889502762431, + "grad_norm": 0.2728833854198456, + "learning_rate": 6.171986983482339e-05, + "loss": 1.7637, + "step": 14400 + }, + { + "epoch": 4.420196439533456, + "grad_norm": 0.2593914270401001, + "learning_rate": 6.17150376925612e-05, + "loss": 1.8196, + "step": 14401 + }, + { + "epoch": 4.420503376304481, + "grad_norm": 0.29425305128097534, + "learning_rate": 6.171020543452416e-05, + "loss": 1.7511, + "step": 14402 + }, + { + "epoch": 
4.420810313075506, + "grad_norm": 0.2587110102176666, + "learning_rate": 6.170537306076e-05, + "loss": 1.8085, + "step": 14403 + }, + { + "epoch": 4.421117249846532, + "grad_norm": 0.22442933917045593, + "learning_rate": 6.170054057131648e-05, + "loss": 1.8023, + "step": 14404 + }, + { + "epoch": 4.421424186617557, + "grad_norm": 0.23302629590034485, + "learning_rate": 6.169570796624136e-05, + "loss": 1.7995, + "step": 14405 + }, + { + "epoch": 4.421731123388582, + "grad_norm": 0.2295885682106018, + "learning_rate": 6.169087524558239e-05, + "loss": 1.7948, + "step": 14406 + }, + { + "epoch": 4.422038060159607, + "grad_norm": 0.2161262482404709, + "learning_rate": 6.168604240938735e-05, + "loss": 1.7159, + "step": 14407 + }, + { + "epoch": 4.422344996930632, + "grad_norm": 0.20746205747127533, + "learning_rate": 6.1681209457704e-05, + "loss": 1.7703, + "step": 14408 + }, + { + "epoch": 4.422651933701657, + "grad_norm": 0.25677376985549927, + "learning_rate": 6.167637639058006e-05, + "loss": 1.7819, + "step": 14409 + }, + { + "epoch": 4.422958870472683, + "grad_norm": 0.226568341255188, + "learning_rate": 6.167154320806336e-05, + "loss": 1.7661, + "step": 14410 + }, + { + "epoch": 4.423265807243708, + "grad_norm": 0.22997824847698212, + "learning_rate": 6.166670991020162e-05, + "loss": 1.7364, + "step": 14411 + }, + { + "epoch": 4.4235727440147325, + "grad_norm": 0.2528770864009857, + "learning_rate": 6.166187649704261e-05, + "loss": 1.8505, + "step": 14412 + }, + { + "epoch": 4.423879680785758, + "grad_norm": 0.27278614044189453, + "learning_rate": 6.165704296863409e-05, + "loss": 1.7855, + "step": 14413 + }, + { + "epoch": 4.424186617556783, + "grad_norm": 0.23086364567279816, + "learning_rate": 6.165220932502385e-05, + "loss": 1.7489, + "step": 14414 + }, + { + "epoch": 4.4244935543278086, + "grad_norm": 0.2570587396621704, + "learning_rate": 6.164737556625965e-05, + "loss": 1.8008, + "step": 14415 + }, + { + "epoch": 4.424800491098834, + "grad_norm": 
0.2637264132499695, + "learning_rate": 6.164254169238923e-05, + "loss": 1.7563, + "step": 14416 + }, + { + "epoch": 4.425107427869859, + "grad_norm": 0.23046623170375824, + "learning_rate": 6.163770770346043e-05, + "loss": 1.7433, + "step": 14417 + }, + { + "epoch": 4.425414364640884, + "grad_norm": 0.2531467080116272, + "learning_rate": 6.163287359952095e-05, + "loss": 1.8122, + "step": 14418 + }, + { + "epoch": 4.425721301411909, + "grad_norm": 0.26507216691970825, + "learning_rate": 6.162803938061861e-05, + "loss": 1.7019, + "step": 14419 + }, + { + "epoch": 4.426028238182934, + "grad_norm": 0.229641854763031, + "learning_rate": 6.162320504680117e-05, + "loss": 1.7518, + "step": 14420 + }, + { + "epoch": 4.42633517495396, + "grad_norm": 0.22777152061462402, + "learning_rate": 6.161837059811641e-05, + "loss": 1.8094, + "step": 14421 + }, + { + "epoch": 4.426642111724985, + "grad_norm": 0.22121338546276093, + "learning_rate": 6.161353603461209e-05, + "loss": 1.7204, + "step": 14422 + }, + { + "epoch": 4.4269490484960095, + "grad_norm": 0.21914128959178925, + "learning_rate": 6.1608701356336e-05, + "loss": 1.7554, + "step": 14423 + }, + { + "epoch": 4.427255985267035, + "grad_norm": 0.22649390995502472, + "learning_rate": 6.160386656333593e-05, + "loss": 1.8058, + "step": 14424 + }, + { + "epoch": 4.42756292203806, + "grad_norm": 0.24529023468494415, + "learning_rate": 6.159903165565964e-05, + "loss": 1.7302, + "step": 14425 + }, + { + "epoch": 4.4278698588090855, + "grad_norm": 0.2726481854915619, + "learning_rate": 6.159419663335492e-05, + "loss": 1.825, + "step": 14426 + }, + { + "epoch": 4.428176795580111, + "grad_norm": 0.2772440016269684, + "learning_rate": 6.158936149646957e-05, + "loss": 1.7322, + "step": 14427 + }, + { + "epoch": 4.428483732351136, + "grad_norm": 0.29778853058815, + "learning_rate": 6.158452624505135e-05, + "loss": 1.7421, + "step": 14428 + }, + { + "epoch": 4.428790669122161, + "grad_norm": 0.21327480673789978, + "learning_rate": 
6.157969087914804e-05, + "loss": 1.7269, + "step": 14429 + }, + { + "epoch": 4.429097605893186, + "grad_norm": 0.2718868851661682, + "learning_rate": 6.157485539880744e-05, + "loss": 1.7817, + "step": 14430 + }, + { + "epoch": 4.429404542664211, + "grad_norm": 0.32242509722709656, + "learning_rate": 6.157001980407735e-05, + "loss": 1.7115, + "step": 14431 + }, + { + "epoch": 4.429711479435237, + "grad_norm": 0.2931978106498718, + "learning_rate": 6.156518409500553e-05, + "loss": 1.7822, + "step": 14432 + }, + { + "epoch": 4.430018416206262, + "grad_norm": 0.229528546333313, + "learning_rate": 6.156034827163977e-05, + "loss": 1.7623, + "step": 14433 + }, + { + "epoch": 4.430325352977286, + "grad_norm": 0.28702354431152344, + "learning_rate": 6.15555123340279e-05, + "loss": 1.8101, + "step": 14434 + }, + { + "epoch": 4.430632289748312, + "grad_norm": 0.27162131667137146, + "learning_rate": 6.155067628221766e-05, + "loss": 1.7525, + "step": 14435 + }, + { + "epoch": 4.430939226519337, + "grad_norm": 0.24290388822555542, + "learning_rate": 6.154584011625688e-05, + "loss": 1.8701, + "step": 14436 + }, + { + "epoch": 4.431246163290362, + "grad_norm": 0.3055405020713806, + "learning_rate": 6.154100383619334e-05, + "loss": 1.8659, + "step": 14437 + }, + { + "epoch": 4.431553100061388, + "grad_norm": 0.24528950452804565, + "learning_rate": 6.153616744207483e-05, + "loss": 1.8493, + "step": 14438 + }, + { + "epoch": 4.431860036832412, + "grad_norm": 0.2611897587776184, + "learning_rate": 6.153133093394917e-05, + "loss": 1.7905, + "step": 14439 + }, + { + "epoch": 4.4321669736034375, + "grad_norm": 0.2172730267047882, + "learning_rate": 6.15264943118641e-05, + "loss": 1.7087, + "step": 14440 + }, + { + "epoch": 4.432473910374463, + "grad_norm": 0.2320949286222458, + "learning_rate": 6.152165757586749e-05, + "loss": 1.7473, + "step": 14441 + }, + { + "epoch": 4.432780847145488, + "grad_norm": 0.2602086365222931, + "learning_rate": 6.15168207260071e-05, + "loss": 1.7365, + 
"step": 14442 + }, + { + "epoch": 4.4330877839165135, + "grad_norm": 0.25193190574645996, + "learning_rate": 6.151198376233074e-05, + "loss": 1.8205, + "step": 14443 + }, + { + "epoch": 4.433394720687538, + "grad_norm": 0.2894204556941986, + "learning_rate": 6.150714668488621e-05, + "loss": 1.7759, + "step": 14444 + }, + { + "epoch": 4.433701657458563, + "grad_norm": 0.24150310456752777, + "learning_rate": 6.150230949372131e-05, + "loss": 1.8415, + "step": 14445 + }, + { + "epoch": 4.434008594229589, + "grad_norm": 0.23475918173789978, + "learning_rate": 6.149747218888384e-05, + "loss": 1.7487, + "step": 14446 + }, + { + "epoch": 4.434315531000614, + "grad_norm": 0.29425546526908875, + "learning_rate": 6.149263477042162e-05, + "loss": 1.7538, + "step": 14447 + }, + { + "epoch": 4.434622467771639, + "grad_norm": 0.26241615414619446, + "learning_rate": 6.148779723838244e-05, + "loss": 1.7564, + "step": 14448 + }, + { + "epoch": 4.434929404542665, + "grad_norm": 0.23195287585258484, + "learning_rate": 6.148295959281411e-05, + "loss": 1.837, + "step": 14449 + }, + { + "epoch": 4.435236341313689, + "grad_norm": 0.34972792863845825, + "learning_rate": 6.147812183376445e-05, + "loss": 1.7632, + "step": 14450 + }, + { + "epoch": 4.435543278084714, + "grad_norm": 0.3536125719547272, + "learning_rate": 6.147328396128126e-05, + "loss": 1.8372, + "step": 14451 + }, + { + "epoch": 4.43585021485574, + "grad_norm": 0.2086079865694046, + "learning_rate": 6.146844597541235e-05, + "loss": 1.7014, + "step": 14452 + }, + { + "epoch": 4.436157151626765, + "grad_norm": 0.25547802448272705, + "learning_rate": 6.146360787620554e-05, + "loss": 1.7544, + "step": 14453 + }, + { + "epoch": 4.43646408839779, + "grad_norm": 0.26176998019218445, + "learning_rate": 6.145876966370864e-05, + "loss": 1.7617, + "step": 14454 + }, + { + "epoch": 4.436771025168815, + "grad_norm": 0.2672959566116333, + "learning_rate": 6.145393133796946e-05, + "loss": 1.8178, + "step": 14455 + }, + { + "epoch": 
4.43707796193984, + "grad_norm": 0.23373909294605255, + "learning_rate": 6.144909289903582e-05, + "loss": 1.7295, + "step": 14456 + }, + { + "epoch": 4.4373848987108655, + "grad_norm": 0.2369835078716278, + "learning_rate": 6.144425434695551e-05, + "loss": 1.8097, + "step": 14457 + }, + { + "epoch": 4.437691835481891, + "grad_norm": 0.25528979301452637, + "learning_rate": 6.14394156817764e-05, + "loss": 1.7523, + "step": 14458 + }, + { + "epoch": 4.437998772252916, + "grad_norm": 0.2541787624359131, + "learning_rate": 6.143457690354626e-05, + "loss": 1.7606, + "step": 14459 + }, + { + "epoch": 4.4383057090239415, + "grad_norm": 0.2032637745141983, + "learning_rate": 6.142973801231295e-05, + "loss": 1.7967, + "step": 14460 + }, + { + "epoch": 4.438612645794966, + "grad_norm": 0.2413996160030365, + "learning_rate": 6.142489900812426e-05, + "loss": 1.7688, + "step": 14461 + }, + { + "epoch": 4.438919582565991, + "grad_norm": 0.43451038002967834, + "learning_rate": 6.142005989102803e-05, + "loss": 1.8269, + "step": 14462 + }, + { + "epoch": 4.439226519337017, + "grad_norm": 0.23981481790542603, + "learning_rate": 6.141522066107206e-05, + "loss": 1.7628, + "step": 14463 + }, + { + "epoch": 4.439533456108042, + "grad_norm": 0.25396493077278137, + "learning_rate": 6.14103813183042e-05, + "loss": 1.7913, + "step": 14464 + }, + { + "epoch": 4.439840392879067, + "grad_norm": 0.2567536532878876, + "learning_rate": 6.140554186277225e-05, + "loss": 1.7612, + "step": 14465 + }, + { + "epoch": 4.440147329650092, + "grad_norm": 0.2201337069272995, + "learning_rate": 6.140070229452406e-05, + "loss": 1.7541, + "step": 14466 + }, + { + "epoch": 4.440454266421117, + "grad_norm": 0.24202953279018402, + "learning_rate": 6.139586261360746e-05, + "loss": 1.777, + "step": 14467 + }, + { + "epoch": 4.440761203192142, + "grad_norm": 0.23891687393188477, + "learning_rate": 6.139102282007024e-05, + "loss": 1.7509, + "step": 14468 + }, + { + "epoch": 4.441068139963168, + "grad_norm": 
0.21132555603981018, + "learning_rate": 6.138618291396026e-05, + "loss": 1.7362, + "step": 14469 + }, + { + "epoch": 4.441375076734193, + "grad_norm": 0.2731861472129822, + "learning_rate": 6.138134289532536e-05, + "loss": 1.8063, + "step": 14470 + }, + { + "epoch": 4.4416820135052175, + "grad_norm": 0.29503315687179565, + "learning_rate": 6.137650276421336e-05, + "loss": 1.7193, + "step": 14471 + }, + { + "epoch": 4.441988950276243, + "grad_norm": 0.2778526544570923, + "learning_rate": 6.137166252067208e-05, + "loss": 1.7507, + "step": 14472 + }, + { + "epoch": 4.442295887047268, + "grad_norm": 0.2907710075378418, + "learning_rate": 6.136682216474938e-05, + "loss": 1.7939, + "step": 14473 + }, + { + "epoch": 4.4426028238182935, + "grad_norm": 0.4133768379688263, + "learning_rate": 6.136198169649306e-05, + "loss": 1.8012, + "step": 14474 + }, + { + "epoch": 4.442909760589319, + "grad_norm": 0.2505052983760834, + "learning_rate": 6.135714111595099e-05, + "loss": 1.8426, + "step": 14475 + }, + { + "epoch": 4.443216697360343, + "grad_norm": 0.3884379267692566, + "learning_rate": 6.135230042317099e-05, + "loss": 1.7383, + "step": 14476 + }, + { + "epoch": 4.443523634131369, + "grad_norm": 0.42902377247810364, + "learning_rate": 6.134745961820091e-05, + "loss": 1.732, + "step": 14477 + }, + { + "epoch": 4.443830570902394, + "grad_norm": 0.21782708168029785, + "learning_rate": 6.134261870108858e-05, + "loss": 1.7369, + "step": 14478 + }, + { + "epoch": 4.444137507673419, + "grad_norm": 0.4160648286342621, + "learning_rate": 6.133777767188186e-05, + "loss": 1.8083, + "step": 14479 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.5057216882705688, + "learning_rate": 6.133293653062856e-05, + "loss": 1.8971, + "step": 14480 + }, + { + "epoch": 4.44475138121547, + "grad_norm": 0.2189750075340271, + "learning_rate": 6.132809527737654e-05, + "loss": 1.7508, + "step": 14481 + }, + { + "epoch": 4.445058317986494, + "grad_norm": 0.4415782392024994, + "learning_rate": 
6.132325391217364e-05, + "loss": 1.8548, + "step": 14482 + }, + { + "epoch": 4.44536525475752, + "grad_norm": 0.3907296359539032, + "learning_rate": 6.13184124350677e-05, + "loss": 1.7879, + "step": 14483 + }, + { + "epoch": 4.445672191528545, + "grad_norm": 0.24117955565452576, + "learning_rate": 6.131357084610659e-05, + "loss": 1.7227, + "step": 14484 + }, + { + "epoch": 4.44597912829957, + "grad_norm": 0.3083679974079132, + "learning_rate": 6.130872914533815e-05, + "loss": 1.7505, + "step": 14485 + }, + { + "epoch": 4.446286065070596, + "grad_norm": 0.27730658650398254, + "learning_rate": 6.13038873328102e-05, + "loss": 1.7485, + "step": 14486 + }, + { + "epoch": 4.44659300184162, + "grad_norm": 0.28548410534858704, + "learning_rate": 6.12990454085706e-05, + "loss": 1.8145, + "step": 14487 + }, + { + "epoch": 4.4468999386126455, + "grad_norm": 0.24743106961250305, + "learning_rate": 6.129420337266724e-05, + "loss": 1.7131, + "step": 14488 + }, + { + "epoch": 4.447206875383671, + "grad_norm": 0.2899693250656128, + "learning_rate": 6.128936122514794e-05, + "loss": 1.8567, + "step": 14489 + }, + { + "epoch": 4.447513812154696, + "grad_norm": 0.259916752576828, + "learning_rate": 6.128451896606053e-05, + "loss": 1.7563, + "step": 14490 + }, + { + "epoch": 4.4478207489257215, + "grad_norm": 0.21112586557865143, + "learning_rate": 6.12796765954529e-05, + "loss": 1.6975, + "step": 14491 + }, + { + "epoch": 4.448127685696747, + "grad_norm": 0.2890239953994751, + "learning_rate": 6.12748341133729e-05, + "loss": 1.7904, + "step": 14492 + }, + { + "epoch": 4.448434622467771, + "grad_norm": 0.23394012451171875, + "learning_rate": 6.126999151986839e-05, + "loss": 1.7559, + "step": 14493 + }, + { + "epoch": 4.448741559238797, + "grad_norm": 0.3492949903011322, + "learning_rate": 6.12651488149872e-05, + "loss": 1.7734, + "step": 14494 + }, + { + "epoch": 4.449048496009822, + "grad_norm": 0.48309218883514404, + "learning_rate": 6.126030599877723e-05, + "loss": 1.7798, + "step": 
14495 + }, + { + "epoch": 4.449355432780847, + "grad_norm": 0.341146320104599, + "learning_rate": 6.12554630712863e-05, + "loss": 1.7921, + "step": 14496 + }, + { + "epoch": 4.449662369551873, + "grad_norm": 0.223160982131958, + "learning_rate": 6.125062003256229e-05, + "loss": 1.7784, + "step": 14497 + }, + { + "epoch": 4.449969306322897, + "grad_norm": 0.32664811611175537, + "learning_rate": 6.124577688265306e-05, + "loss": 1.7353, + "step": 14498 + }, + { + "epoch": 4.4502762430939224, + "grad_norm": 0.215936541557312, + "learning_rate": 6.124093362160646e-05, + "loss": 1.68, + "step": 14499 + }, + { + "epoch": 4.450583179864948, + "grad_norm": 0.26081225275993347, + "learning_rate": 6.123609024947038e-05, + "loss": 1.7107, + "step": 14500 + }, + { + "epoch": 4.450890116635973, + "grad_norm": 0.3124069571495056, + "learning_rate": 6.123124676629267e-05, + "loss": 1.7338, + "step": 14501 + }, + { + "epoch": 4.4511970534069984, + "grad_norm": 0.23125620186328888, + "learning_rate": 6.122640317212118e-05, + "loss": 1.7842, + "step": 14502 + }, + { + "epoch": 4.451503990178024, + "grad_norm": 0.27065595984458923, + "learning_rate": 6.122155946700381e-05, + "loss": 1.7284, + "step": 14503 + }, + { + "epoch": 4.451810926949048, + "grad_norm": 0.4677436053752899, + "learning_rate": 6.121671565098841e-05, + "loss": 1.8156, + "step": 14504 + }, + { + "epoch": 4.452117863720074, + "grad_norm": 0.36325082182884216, + "learning_rate": 6.121187172412285e-05, + "loss": 1.7875, + "step": 14505 + }, + { + "epoch": 4.452424800491099, + "grad_norm": 0.23409567773342133, + "learning_rate": 6.1207027686455e-05, + "loss": 1.7421, + "step": 14506 + }, + { + "epoch": 4.452731737262124, + "grad_norm": 0.36919257044792175, + "learning_rate": 6.120218353803273e-05, + "loss": 1.7545, + "step": 14507 + }, + { + "epoch": 4.45303867403315, + "grad_norm": 0.318452388048172, + "learning_rate": 6.119733927890393e-05, + "loss": 1.7179, + "step": 14508 + }, + { + "epoch": 4.453345610804174, + 
"grad_norm": 0.21279768645763397, + "learning_rate": 6.119249490911643e-05, + "loss": 1.7534, + "step": 14509 + }, + { + "epoch": 4.453652547575199, + "grad_norm": 0.30565473437309265, + "learning_rate": 6.118765042871816e-05, + "loss": 1.7962, + "step": 14510 + }, + { + "epoch": 4.453959484346225, + "grad_norm": 0.2608480453491211, + "learning_rate": 6.118280583775697e-05, + "loss": 1.7336, + "step": 14511 + }, + { + "epoch": 4.45426642111725, + "grad_norm": 0.22978845238685608, + "learning_rate": 6.117796113628075e-05, + "loss": 1.8244, + "step": 14512 + }, + { + "epoch": 4.454573357888275, + "grad_norm": 0.26357781887054443, + "learning_rate": 6.117311632433735e-05, + "loss": 1.7425, + "step": 14513 + }, + { + "epoch": 4.4548802946593, + "grad_norm": 0.22127102315425873, + "learning_rate": 6.116827140197467e-05, + "loss": 1.7679, + "step": 14514 + }, + { + "epoch": 4.455187231430325, + "grad_norm": 0.2876584231853485, + "learning_rate": 6.116342636924058e-05, + "loss": 1.8104, + "step": 14515 + }, + { + "epoch": 4.4554941682013505, + "grad_norm": 0.28290677070617676, + "learning_rate": 6.115858122618297e-05, + "loss": 1.7485, + "step": 14516 + }, + { + "epoch": 4.455801104972376, + "grad_norm": 0.21914640069007874, + "learning_rate": 6.115373597284974e-05, + "loss": 1.7736, + "step": 14517 + }, + { + "epoch": 4.456108041743401, + "grad_norm": 0.2603909969329834, + "learning_rate": 6.114889060928873e-05, + "loss": 1.7446, + "step": 14518 + }, + { + "epoch": 4.456414978514426, + "grad_norm": 0.2157236635684967, + "learning_rate": 6.114404513554784e-05, + "loss": 1.7594, + "step": 14519 + }, + { + "epoch": 4.456721915285451, + "grad_norm": 0.27622368931770325, + "learning_rate": 6.113919955167499e-05, + "loss": 1.8154, + "step": 14520 + }, + { + "epoch": 4.457028852056476, + "grad_norm": 0.27298516035079956, + "learning_rate": 6.113435385771803e-05, + "loss": 1.7458, + "step": 14521 + }, + { + "epoch": 4.457335788827502, + "grad_norm": 0.22220586240291595, + 
"learning_rate": 6.112950805372485e-05, + "loss": 1.7102, + "step": 14522 + }, + { + "epoch": 4.457642725598527, + "grad_norm": 0.19480876624584198, + "learning_rate": 6.112466213974336e-05, + "loss": 1.7696, + "step": 14523 + }, + { + "epoch": 4.457949662369552, + "grad_norm": 0.24261653423309326, + "learning_rate": 6.111981611582144e-05, + "loss": 1.8193, + "step": 14524 + }, + { + "epoch": 4.458256599140577, + "grad_norm": 0.2502967417240143, + "learning_rate": 6.111496998200697e-05, + "loss": 1.7701, + "step": 14525 + }, + { + "epoch": 4.458563535911602, + "grad_norm": 0.25764599442481995, + "learning_rate": 6.111012373834786e-05, + "loss": 1.8055, + "step": 14526 + }, + { + "epoch": 4.458870472682627, + "grad_norm": 0.24085427820682526, + "learning_rate": 6.110527738489198e-05, + "loss": 1.7592, + "step": 14527 + }, + { + "epoch": 4.459177409453653, + "grad_norm": 0.2469809502363205, + "learning_rate": 6.110043092168727e-05, + "loss": 1.6977, + "step": 14528 + }, + { + "epoch": 4.459484346224678, + "grad_norm": 0.21888838708400726, + "learning_rate": 6.109558434878159e-05, + "loss": 1.777, + "step": 14529 + }, + { + "epoch": 4.4597912829957025, + "grad_norm": 0.2094014585018158, + "learning_rate": 6.109073766622281e-05, + "loss": 1.7041, + "step": 14530 + }, + { + "epoch": 4.460098219766728, + "grad_norm": 0.23801055550575256, + "learning_rate": 6.108589087405888e-05, + "loss": 1.8392, + "step": 14531 + }, + { + "epoch": 4.460405156537753, + "grad_norm": 0.2164965718984604, + "learning_rate": 6.108104397233769e-05, + "loss": 1.7643, + "step": 14532 + }, + { + "epoch": 4.4607120933087785, + "grad_norm": 0.21322336792945862, + "learning_rate": 6.107619696110712e-05, + "loss": 1.7063, + "step": 14533 + }, + { + "epoch": 4.461019030079804, + "grad_norm": 0.29019200801849365, + "learning_rate": 6.107134984041507e-05, + "loss": 1.8254, + "step": 14534 + }, + { + "epoch": 4.461325966850829, + "grad_norm": 0.2765025496482849, + "learning_rate": 6.106650261030947e-05, 
+ "loss": 1.7609, + "step": 14535 + }, + { + "epoch": 4.461632903621854, + "grad_norm": 0.20879749953746796, + "learning_rate": 6.106165527083818e-05, + "loss": 1.7387, + "step": 14536 + }, + { + "epoch": 4.461939840392879, + "grad_norm": 0.22295843064785004, + "learning_rate": 6.105680782204913e-05, + "loss": 1.7691, + "step": 14537 + }, + { + "epoch": 4.462246777163904, + "grad_norm": 0.23502351343631744, + "learning_rate": 6.105196026399025e-05, + "loss": 1.7335, + "step": 14538 + }, + { + "epoch": 4.46255371393493, + "grad_norm": 0.22143007814884186, + "learning_rate": 6.104711259670941e-05, + "loss": 1.7338, + "step": 14539 + }, + { + "epoch": 4.462860650705955, + "grad_norm": 0.22361041605472565, + "learning_rate": 6.104226482025453e-05, + "loss": 1.7033, + "step": 14540 + }, + { + "epoch": 4.463167587476979, + "grad_norm": 0.27104905247688293, + "learning_rate": 6.10374169346735e-05, + "loss": 1.7926, + "step": 14541 + }, + { + "epoch": 4.463474524248005, + "grad_norm": 0.23564264178276062, + "learning_rate": 6.103256894001427e-05, + "loss": 1.7522, + "step": 14542 + }, + { + "epoch": 4.46378146101903, + "grad_norm": 0.2585970163345337, + "learning_rate": 6.102772083632471e-05, + "loss": 1.7755, + "step": 14543 + }, + { + "epoch": 4.464088397790055, + "grad_norm": 0.358634889125824, + "learning_rate": 6.102287262365276e-05, + "loss": 1.8092, + "step": 14544 + }, + { + "epoch": 4.464395334561081, + "grad_norm": 0.2862946689128876, + "learning_rate": 6.1018024302046314e-05, + "loss": 1.7051, + "step": 14545 + }, + { + "epoch": 4.464702271332105, + "grad_norm": 0.21907158195972443, + "learning_rate": 6.101317587155331e-05, + "loss": 1.7882, + "step": 14546 + }, + { + "epoch": 4.4650092081031305, + "grad_norm": 0.24268488585948944, + "learning_rate": 6.100832733222164e-05, + "loss": 1.7756, + "step": 14547 + }, + { + "epoch": 4.465316144874156, + "grad_norm": 0.2350744605064392, + "learning_rate": 6.1003478684099214e-05, + "loss": 1.7483, + "step": 14548 + }, + 
{ + "epoch": 4.465623081645181, + "grad_norm": 0.22902250289916992, + "learning_rate": 6.099862992723397e-05, + "loss": 1.7687, + "step": 14549 + }, + { + "epoch": 4.4659300184162065, + "grad_norm": 0.23590944707393646, + "learning_rate": 6.099378106167382e-05, + "loss": 1.8481, + "step": 14550 + }, + { + "epoch": 4.466236955187231, + "grad_norm": 0.23644296824932098, + "learning_rate": 6.098893208746668e-05, + "loss": 1.7422, + "step": 14551 + }, + { + "epoch": 4.466543891958256, + "grad_norm": 0.23782360553741455, + "learning_rate": 6.0984083004660475e-05, + "loss": 1.7852, + "step": 14552 + }, + { + "epoch": 4.466850828729282, + "grad_norm": 0.2546575665473938, + "learning_rate": 6.097923381330313e-05, + "loss": 1.8483, + "step": 14553 + }, + { + "epoch": 4.467157765500307, + "grad_norm": 0.2555409371852875, + "learning_rate": 6.097438451344254e-05, + "loss": 1.7887, + "step": 14554 + }, + { + "epoch": 4.467464702271332, + "grad_norm": 0.28074198961257935, + "learning_rate": 6.0969535105126664e-05, + "loss": 1.7521, + "step": 14555 + }, + { + "epoch": 4.467771639042358, + "grad_norm": 0.22622554004192352, + "learning_rate": 6.096468558840341e-05, + "loss": 1.8088, + "step": 14556 + }, + { + "epoch": 4.468078575813382, + "grad_norm": 0.302749902009964, + "learning_rate": 6.095983596332071e-05, + "loss": 1.8192, + "step": 14557 + }, + { + "epoch": 4.468385512584407, + "grad_norm": 0.27925750613212585, + "learning_rate": 6.0954986229926494e-05, + "loss": 1.8453, + "step": 14558 + }, + { + "epoch": 4.468692449355433, + "grad_norm": 0.2246330976486206, + "learning_rate": 6.095013638826868e-05, + "loss": 1.744, + "step": 14559 + }, + { + "epoch": 4.468999386126458, + "grad_norm": 0.26677101850509644, + "learning_rate": 6.094528643839518e-05, + "loss": 1.708, + "step": 14560 + }, + { + "epoch": 4.469306322897483, + "grad_norm": 0.23684042692184448, + "learning_rate": 6.094043638035396e-05, + "loss": 1.713, + "step": 14561 + }, + { + "epoch": 4.469613259668508, + 
"grad_norm": 0.2470075935125351, + "learning_rate": 6.093558621419294e-05, + "loss": 1.8096, + "step": 14562 + }, + { + "epoch": 4.469920196439533, + "grad_norm": 0.2775517702102661, + "learning_rate": 6.093073593996005e-05, + "loss": 1.697, + "step": 14563 + }, + { + "epoch": 4.4702271332105585, + "grad_norm": 0.21053175628185272, + "learning_rate": 6.092588555770322e-05, + "loss": 1.6894, + "step": 14564 + }, + { + "epoch": 4.470534069981584, + "grad_norm": 0.2555869221687317, + "learning_rate": 6.0921035067470366e-05, + "loss": 1.7051, + "step": 14565 + }, + { + "epoch": 4.470841006752609, + "grad_norm": 0.34468984603881836, + "learning_rate": 6.0916184469309454e-05, + "loss": 1.7317, + "step": 14566 + }, + { + "epoch": 4.4711479435236345, + "grad_norm": 0.2517752945423126, + "learning_rate": 6.0911333763268407e-05, + "loss": 1.7524, + "step": 14567 + }, + { + "epoch": 4.471454880294659, + "grad_norm": 0.2749727666378021, + "learning_rate": 6.090648294939517e-05, + "loss": 1.7045, + "step": 14568 + }, + { + "epoch": 4.471761817065684, + "grad_norm": 0.36250773072242737, + "learning_rate": 6.0901632027737673e-05, + "loss": 1.7196, + "step": 14569 + }, + { + "epoch": 4.47206875383671, + "grad_norm": 0.2317698448896408, + "learning_rate": 6.089678099834386e-05, + "loss": 1.7318, + "step": 14570 + }, + { + "epoch": 4.472375690607735, + "grad_norm": 0.2863345444202423, + "learning_rate": 6.089192986126166e-05, + "loss": 1.7798, + "step": 14571 + }, + { + "epoch": 4.47268262737876, + "grad_norm": 0.3493366241455078, + "learning_rate": 6.088707861653904e-05, + "loss": 1.7749, + "step": 14572 + }, + { + "epoch": 4.472989564149785, + "grad_norm": 0.25718605518341064, + "learning_rate": 6.0882227264223924e-05, + "loss": 1.7683, + "step": 14573 + }, + { + "epoch": 4.47329650092081, + "grad_norm": 0.2320062816143036, + "learning_rate": 6.087737580436426e-05, + "loss": 1.8296, + "step": 14574 + }, + { + "epoch": 4.473603437691835, + "grad_norm": 0.29071560502052307, + 
"learning_rate": 6.087252423700799e-05, + "loss": 1.7428, + "step": 14575 + }, + { + "epoch": 4.473910374462861, + "grad_norm": 0.24233707785606384, + "learning_rate": 6.086767256220306e-05, + "loss": 1.7332, + "step": 14576 + }, + { + "epoch": 4.474217311233886, + "grad_norm": 0.228043332695961, + "learning_rate": 6.086282077999742e-05, + "loss": 1.7697, + "step": 14577 + }, + { + "epoch": 4.474524248004911, + "grad_norm": 0.29154402017593384, + "learning_rate": 6.085796889043902e-05, + "loss": 1.8043, + "step": 14578 + }, + { + "epoch": 4.474831184775936, + "grad_norm": 0.30543211102485657, + "learning_rate": 6.0853116893575814e-05, + "loss": 1.7665, + "step": 14579 + }, + { + "epoch": 4.475138121546961, + "grad_norm": 0.22792959213256836, + "learning_rate": 6.0848264789455754e-05, + "loss": 1.729, + "step": 14580 + }, + { + "epoch": 4.475445058317987, + "grad_norm": 0.2615707218647003, + "learning_rate": 6.084341257812677e-05, + "loss": 1.7438, + "step": 14581 + }, + { + "epoch": 4.475751995089012, + "grad_norm": 0.23342981934547424, + "learning_rate": 6.083856025963681e-05, + "loss": 1.7158, + "step": 14582 + }, + { + "epoch": 4.476058931860037, + "grad_norm": 0.22279240190982819, + "learning_rate": 6.083370783403387e-05, + "loss": 1.7413, + "step": 14583 + }, + { + "epoch": 4.476365868631062, + "grad_norm": 0.28867462277412415, + "learning_rate": 6.082885530136587e-05, + "loss": 1.7932, + "step": 14584 + }, + { + "epoch": 4.476672805402087, + "grad_norm": 0.2947152256965637, + "learning_rate": 6.082400266168078e-05, + "loss": 1.8986, + "step": 14585 + }, + { + "epoch": 4.476979742173112, + "grad_norm": 0.2948935627937317, + "learning_rate": 6.0819149915026555e-05, + "loss": 1.9134, + "step": 14586 + }, + { + "epoch": 4.477286678944138, + "grad_norm": 0.4436163902282715, + "learning_rate": 6.081429706145114e-05, + "loss": 1.7616, + "step": 14587 + }, + { + "epoch": 4.477593615715163, + "grad_norm": 0.4879693388938904, + "learning_rate": 6.080944410100249e-05, + 
"loss": 1.8155, + "step": 14588 + }, + { + "epoch": 4.4779005524861875, + "grad_norm": 0.29742667078971863, + "learning_rate": 6.08045910337286e-05, + "loss": 1.7428, + "step": 14589 + }, + { + "epoch": 4.478207489257213, + "grad_norm": 0.2994751036167145, + "learning_rate": 6.0799737859677395e-05, + "loss": 1.7764, + "step": 14590 + }, + { + "epoch": 4.478514426028238, + "grad_norm": 0.46379905939102173, + "learning_rate": 6.079488457889686e-05, + "loss": 1.7289, + "step": 14591 + }, + { + "epoch": 4.4788213627992635, + "grad_norm": 0.3511717617511749, + "learning_rate": 6.0790031191434946e-05, + "loss": 1.7658, + "step": 14592 + }, + { + "epoch": 4.479128299570289, + "grad_norm": 0.22678083181381226, + "learning_rate": 6.0785177697339626e-05, + "loss": 1.7973, + "step": 14593 + }, + { + "epoch": 4.479435236341313, + "grad_norm": 0.31201767921447754, + "learning_rate": 6.0780324096658837e-05, + "loss": 1.7542, + "step": 14594 + }, + { + "epoch": 4.479742173112339, + "grad_norm": 0.23759113252162933, + "learning_rate": 6.077547038944058e-05, + "loss": 1.7191, + "step": 14595 + }, + { + "epoch": 4.480049109883364, + "grad_norm": 0.25801756978034973, + "learning_rate": 6.077061657573282e-05, + "loss": 1.8229, + "step": 14596 + }, + { + "epoch": 4.480356046654389, + "grad_norm": 0.3435722887516022, + "learning_rate": 6.0765762655583514e-05, + "loss": 1.7633, + "step": 14597 + }, + { + "epoch": 4.480662983425415, + "grad_norm": 0.2710443437099457, + "learning_rate": 6.076090862904063e-05, + "loss": 1.8126, + "step": 14598 + }, + { + "epoch": 4.48096992019644, + "grad_norm": 0.25750285387039185, + "learning_rate": 6.075605449615212e-05, + "loss": 1.7382, + "step": 14599 + }, + { + "epoch": 4.481276856967464, + "grad_norm": 0.3638051152229309, + "learning_rate": 6.075120025696598e-05, + "loss": 1.8191, + "step": 14600 + }, + { + "epoch": 4.48158379373849, + "grad_norm": 0.24185293912887573, + "learning_rate": 6.074634591153019e-05, + "loss": 1.7637, + "step": 14601 + }, 
+ { + "epoch": 4.481890730509515, + "grad_norm": 0.317283570766449, + "learning_rate": 6.0741491459892707e-05, + "loss": 1.7805, + "step": 14602 + }, + { + "epoch": 4.48219766728054, + "grad_norm": 0.33884385228157043, + "learning_rate": 6.073663690210151e-05, + "loss": 1.7719, + "step": 14603 + }, + { + "epoch": 4.482504604051566, + "grad_norm": 0.2554258704185486, + "learning_rate": 6.073178223820457e-05, + "loss": 1.836, + "step": 14604 + }, + { + "epoch": 4.48281154082259, + "grad_norm": 0.3363535702228546, + "learning_rate": 6.072692746824987e-05, + "loss": 1.8249, + "step": 14605 + }, + { + "epoch": 4.4831184775936155, + "grad_norm": 0.36090195178985596, + "learning_rate": 6.072207259228537e-05, + "loss": 1.733, + "step": 14606 + }, + { + "epoch": 4.483425414364641, + "grad_norm": 0.21928483247756958, + "learning_rate": 6.071721761035909e-05, + "loss": 1.7413, + "step": 14607 + }, + { + "epoch": 4.483732351135666, + "grad_norm": 0.4256608486175537, + "learning_rate": 6.071236252251897e-05, + "loss": 1.7585, + "step": 14608 + }, + { + "epoch": 4.4840392879066915, + "grad_norm": 0.41980308294296265, + "learning_rate": 6.0707507328813007e-05, + "loss": 1.7584, + "step": 14609 + }, + { + "epoch": 4.484346224677717, + "grad_norm": 0.200295090675354, + "learning_rate": 6.0702652029289186e-05, + "loss": 1.7492, + "step": 14610 + }, + { + "epoch": 4.484653161448741, + "grad_norm": 0.41847771406173706, + "learning_rate": 6.069779662399549e-05, + "loss": 1.8101, + "step": 14611 + }, + { + "epoch": 4.484960098219767, + "grad_norm": 0.4846353530883789, + "learning_rate": 6.069294111297987e-05, + "loss": 1.8227, + "step": 14612 + }, + { + "epoch": 4.485267034990792, + "grad_norm": 0.23216098546981812, + "learning_rate": 6.068808549629036e-05, + "loss": 1.6811, + "step": 14613 + }, + { + "epoch": 4.485573971761817, + "grad_norm": 0.34903186559677124, + "learning_rate": 6.0683229773974934e-05, + "loss": 1.6858, + "step": 14614 + }, + { + "epoch": 4.485880908532843, + 
"grad_norm": 0.4349122941493988, + "learning_rate": 6.0678373946081556e-05, + "loss": 1.7704, + "step": 14615 + }, + { + "epoch": 4.486187845303867, + "grad_norm": 0.25738775730133057, + "learning_rate": 6.067351801265824e-05, + "loss": 1.7487, + "step": 14616 + }, + { + "epoch": 4.486494782074892, + "grad_norm": 0.3052736818790436, + "learning_rate": 6.0668661973752936e-05, + "loss": 1.7528, + "step": 14617 + }, + { + "epoch": 4.486801718845918, + "grad_norm": 0.3400498628616333, + "learning_rate": 6.066380582941368e-05, + "loss": 1.7414, + "step": 14618 + }, + { + "epoch": 4.487108655616943, + "grad_norm": 0.28251948952674866, + "learning_rate": 6.065894957968845e-05, + "loss": 1.8078, + "step": 14619 + }, + { + "epoch": 4.487415592387968, + "grad_norm": 0.26907965540885925, + "learning_rate": 6.0654093224625216e-05, + "loss": 1.8143, + "step": 14620 + }, + { + "epoch": 4.487722529158993, + "grad_norm": 0.2821955978870392, + "learning_rate": 6.064923676427201e-05, + "loss": 1.7163, + "step": 14621 + }, + { + "epoch": 4.488029465930018, + "grad_norm": 0.2223028987646103, + "learning_rate": 6.0644380198676786e-05, + "loss": 1.704, + "step": 14622 + }, + { + "epoch": 4.4883364027010435, + "grad_norm": 0.25243067741394043, + "learning_rate": 6.063952352788755e-05, + "loss": 1.7236, + "step": 14623 + }, + { + "epoch": 4.488643339472069, + "grad_norm": 0.30026015639305115, + "learning_rate": 6.063466675195233e-05, + "loss": 1.7575, + "step": 14624 + }, + { + "epoch": 4.488950276243094, + "grad_norm": 0.2055491805076599, + "learning_rate": 6.0629809870919085e-05, + "loss": 1.7294, + "step": 14625 + }, + { + "epoch": 4.4892572130141195, + "grad_norm": 0.2507593035697937, + "learning_rate": 6.0624952884835836e-05, + "loss": 1.762, + "step": 14626 + }, + { + "epoch": 4.489564149785144, + "grad_norm": 0.21385909616947174, + "learning_rate": 6.0620095793750576e-05, + "loss": 1.7396, + "step": 14627 + }, + { + "epoch": 4.489871086556169, + "grad_norm": 0.21926651895046234, + 
"learning_rate": 6.06152385977113e-05, + "loss": 1.7863, + "step": 14628 + }, + { + "epoch": 4.490178023327195, + "grad_norm": 0.21950845420360565, + "learning_rate": 6.0610381296766016e-05, + "loss": 1.7576, + "step": 14629 + }, + { + "epoch": 4.49048496009822, + "grad_norm": 0.2030971795320511, + "learning_rate": 6.0605523890962736e-05, + "loss": 1.7069, + "step": 14630 + }, + { + "epoch": 4.490791896869245, + "grad_norm": 0.23991432785987854, + "learning_rate": 6.0600666380349436e-05, + "loss": 1.7598, + "step": 14631 + }, + { + "epoch": 4.49109883364027, + "grad_norm": 0.23766861855983734, + "learning_rate": 6.059580876497415e-05, + "loss": 1.7687, + "step": 14632 + }, + { + "epoch": 4.491405770411295, + "grad_norm": 0.2361454963684082, + "learning_rate": 6.059095104488487e-05, + "loss": 1.7883, + "step": 14633 + }, + { + "epoch": 4.49171270718232, + "grad_norm": 0.3128328323364258, + "learning_rate": 6.058609322012958e-05, + "loss": 1.8087, + "step": 14634 + }, + { + "epoch": 4.492019643953346, + "grad_norm": 0.2958957850933075, + "learning_rate": 6.0581235290756335e-05, + "loss": 1.782, + "step": 14635 + }, + { + "epoch": 4.492326580724371, + "grad_norm": 0.2197243571281433, + "learning_rate": 6.057637725681312e-05, + "loss": 1.7408, + "step": 14636 + }, + { + "epoch": 4.4926335174953955, + "grad_norm": 0.22227831184864044, + "learning_rate": 6.0571519118347944e-05, + "loss": 1.734, + "step": 14637 + }, + { + "epoch": 4.492940454266421, + "grad_norm": 0.2784527540206909, + "learning_rate": 6.056666087540882e-05, + "loss": 1.8017, + "step": 14638 + }, + { + "epoch": 4.493247391037446, + "grad_norm": 0.21929821372032166, + "learning_rate": 6.056180252804377e-05, + "loss": 1.7271, + "step": 14639 + }, + { + "epoch": 4.4935543278084715, + "grad_norm": 0.2156134843826294, + "learning_rate": 6.055694407630077e-05, + "loss": 1.8082, + "step": 14640 + }, + { + "epoch": 4.493861264579497, + "grad_norm": 0.22672387957572937, + "learning_rate": 6.0552085520227875e-05, + 
"loss": 1.7506, + "step": 14641 + }, + { + "epoch": 4.494168201350522, + "grad_norm": 0.228785440325737, + "learning_rate": 6.0547226859873086e-05, + "loss": 1.7023, + "step": 14642 + }, + { + "epoch": 4.494475138121547, + "grad_norm": 0.19483685493469238, + "learning_rate": 6.054236809528443e-05, + "loss": 1.6879, + "step": 14643 + }, + { + "epoch": 4.494782074892572, + "grad_norm": 0.24911309778690338, + "learning_rate": 6.0537509226509904e-05, + "loss": 1.7856, + "step": 14644 + }, + { + "epoch": 4.495089011663597, + "grad_norm": 0.24811938405036926, + "learning_rate": 6.053265025359753e-05, + "loss": 1.7581, + "step": 14645 + }, + { + "epoch": 4.495395948434623, + "grad_norm": 0.2487260401248932, + "learning_rate": 6.052779117659534e-05, + "loss": 1.7536, + "step": 14646 + }, + { + "epoch": 4.495702885205648, + "grad_norm": 0.2594854235649109, + "learning_rate": 6.052293199555136e-05, + "loss": 1.7822, + "step": 14647 + }, + { + "epoch": 4.496009821976672, + "grad_norm": 0.22837325930595398, + "learning_rate": 6.051807271051359e-05, + "loss": 1.7542, + "step": 14648 + }, + { + "epoch": 4.496316758747698, + "grad_norm": 0.23106649518013, + "learning_rate": 6.051321332153005e-05, + "loss": 1.7758, + "step": 14649 + }, + { + "epoch": 4.496623695518723, + "grad_norm": 0.29424673318862915, + "learning_rate": 6.050835382864878e-05, + "loss": 1.8335, + "step": 14650 + }, + { + "epoch": 4.496930632289748, + "grad_norm": 0.28297343850135803, + "learning_rate": 6.050349423191779e-05, + "loss": 1.7711, + "step": 14651 + }, + { + "epoch": 4.497237569060774, + "grad_norm": 0.2001795768737793, + "learning_rate": 6.049863453138511e-05, + "loss": 1.7008, + "step": 14652 + }, + { + "epoch": 4.497544505831799, + "grad_norm": 0.35177022218704224, + "learning_rate": 6.04937747270988e-05, + "loss": 1.7763, + "step": 14653 + }, + { + "epoch": 4.4978514426028235, + "grad_norm": 0.28870898485183716, + "learning_rate": 6.0488914819106835e-05, + "loss": 1.7373, + "step": 14654 + }, + { 
+ "epoch": 4.498158379373849, + "grad_norm": 0.23962664604187012, + "learning_rate": 6.048405480745727e-05, + "loss": 1.7278, + "step": 14655 + }, + { + "epoch": 4.498465316144874, + "grad_norm": 0.324505478143692, + "learning_rate": 6.047919469219813e-05, + "loss": 1.7674, + "step": 14656 + }, + { + "epoch": 4.4987722529158995, + "grad_norm": 0.38313817977905273, + "learning_rate": 6.047433447337744e-05, + "loss": 1.789, + "step": 14657 + }, + { + "epoch": 4.499079189686925, + "grad_norm": 0.2101358324289322, + "learning_rate": 6.046947415104324e-05, + "loss": 1.7331, + "step": 14658 + }, + { + "epoch": 4.499386126457949, + "grad_norm": 0.3388524353504181, + "learning_rate": 6.046461372524357e-05, + "loss": 1.8467, + "step": 14659 + }, + { + "epoch": 4.499693063228975, + "grad_norm": 0.3360123634338379, + "learning_rate": 6.045975319602645e-05, + "loss": 1.8427, + "step": 14660 + }, + { + "epoch": 4.5, + "grad_norm": 0.27596545219421387, + "learning_rate": 6.0454892563439914e-05, + "loss": 1.7768, + "step": 14661 + }, + { + "epoch": 4.500306936771025, + "grad_norm": 0.2580861747264862, + "learning_rate": 6.0450031827532e-05, + "loss": 1.763, + "step": 14662 + }, + { + "epoch": 4.500613873542051, + "grad_norm": 0.3521091938018799, + "learning_rate": 6.044517098835074e-05, + "loss": 1.7118, + "step": 14663 + }, + { + "epoch": 4.500920810313076, + "grad_norm": 0.29412439465522766, + "learning_rate": 6.0440310045944204e-05, + "loss": 1.7252, + "step": 14664 + }, + { + "epoch": 4.5012277470841005, + "grad_norm": 0.23845252394676208, + "learning_rate": 6.043544900036039e-05, + "loss": 1.7622, + "step": 14665 + }, + { + "epoch": 4.501534683855126, + "grad_norm": 0.22957031428813934, + "learning_rate": 6.043058785164736e-05, + "loss": 1.7527, + "step": 14666 + }, + { + "epoch": 4.501841620626151, + "grad_norm": 0.2564462721347809, + "learning_rate": 6.042572659985314e-05, + "loss": 1.801, + "step": 14667 + }, + { + "epoch": 4.5021485573971765, + "grad_norm": 
0.22588051855564117, + "learning_rate": 6.042086524502576e-05, + "loss": 1.7387, + "step": 14668 + }, + { + "epoch": 4.502455494168201, + "grad_norm": 0.2609740197658539, + "learning_rate": 6.0416003787213306e-05, + "loss": 1.7615, + "step": 14669 + }, + { + "epoch": 4.502762430939226, + "grad_norm": 0.2535521984100342, + "learning_rate": 6.041114222646379e-05, + "loss": 1.7398, + "step": 14670 + }, + { + "epoch": 4.503069367710252, + "grad_norm": 0.2512127757072449, + "learning_rate": 6.040628056282527e-05, + "loss": 1.7679, + "step": 14671 + }, + { + "epoch": 4.503376304481277, + "grad_norm": 0.2438639998435974, + "learning_rate": 6.0401418796345774e-05, + "loss": 1.7, + "step": 14672 + }, + { + "epoch": 4.503683241252302, + "grad_norm": 0.23428042232990265, + "learning_rate": 6.0396556927073376e-05, + "loss": 1.7748, + "step": 14673 + }, + { + "epoch": 4.503990178023328, + "grad_norm": 0.22894345223903656, + "learning_rate": 6.03916949550561e-05, + "loss": 1.7881, + "step": 14674 + }, + { + "epoch": 4.504297114794352, + "grad_norm": 0.24813716113567352, + "learning_rate": 6.0386832880342006e-05, + "loss": 1.7676, + "step": 14675 + }, + { + "epoch": 4.504604051565377, + "grad_norm": 0.23448842763900757, + "learning_rate": 6.038197070297914e-05, + "loss": 1.7828, + "step": 14676 + }, + { + "epoch": 4.504910988336403, + "grad_norm": 0.25302332639694214, + "learning_rate": 6.037710842301556e-05, + "loss": 1.8061, + "step": 14677 + }, + { + "epoch": 4.505217925107428, + "grad_norm": 0.2411813735961914, + "learning_rate": 6.0372246040499305e-05, + "loss": 1.6901, + "step": 14678 + }, + { + "epoch": 4.505524861878453, + "grad_norm": 0.3154819905757904, + "learning_rate": 6.036738355547844e-05, + "loss": 1.7472, + "step": 14679 + }, + { + "epoch": 4.505831798649478, + "grad_norm": 0.2935639023780823, + "learning_rate": 6.0362520968001014e-05, + "loss": 1.7508, + "step": 14680 + }, + { + "epoch": 4.506138735420503, + "grad_norm": 0.27064070105552673, + "learning_rate": 
6.035765827811508e-05, + "loss": 1.8133, + "step": 14681 + }, + { + "epoch": 4.5064456721915285, + "grad_norm": 0.23748525977134705, + "learning_rate": 6.03527954858687e-05, + "loss": 1.7742, + "step": 14682 + }, + { + "epoch": 4.506752608962554, + "grad_norm": 0.216410830616951, + "learning_rate": 6.034793259130992e-05, + "loss": 1.7448, + "step": 14683 + }, + { + "epoch": 4.507059545733579, + "grad_norm": 0.23339977860450745, + "learning_rate": 6.034306959448681e-05, + "loss": 1.7437, + "step": 14684 + }, + { + "epoch": 4.5073664825046045, + "grad_norm": 0.23951120674610138, + "learning_rate": 6.0338206495447414e-05, + "loss": 1.7535, + "step": 14685 + }, + { + "epoch": 4.507673419275629, + "grad_norm": 0.22137518227100372, + "learning_rate": 6.0333343294239816e-05, + "loss": 1.7537, + "step": 14686 + }, + { + "epoch": 4.507980356046654, + "grad_norm": 0.2550075054168701, + "learning_rate": 6.032847999091206e-05, + "loss": 1.8069, + "step": 14687 + }, + { + "epoch": 4.50828729281768, + "grad_norm": 0.2166420966386795, + "learning_rate": 6.032361658551221e-05, + "loss": 1.7746, + "step": 14688 + }, + { + "epoch": 4.508594229588705, + "grad_norm": 0.21926096081733704, + "learning_rate": 6.031875307808833e-05, + "loss": 1.7848, + "step": 14689 + }, + { + "epoch": 4.50890116635973, + "grad_norm": 0.27769652009010315, + "learning_rate": 6.031388946868848e-05, + "loss": 1.7563, + "step": 14690 + }, + { + "epoch": 4.509208103130755, + "grad_norm": 0.23417410254478455, + "learning_rate": 6.030902575736074e-05, + "loss": 1.7475, + "step": 14691 + }, + { + "epoch": 4.50951503990178, + "grad_norm": 0.25454118847846985, + "learning_rate": 6.030416194415314e-05, + "loss": 1.7416, + "step": 14692 + }, + { + "epoch": 4.509821976672805, + "grad_norm": 0.3118220567703247, + "learning_rate": 6.029929802911379e-05, + "loss": 1.8001, + "step": 14693 + }, + { + "epoch": 4.510128913443831, + "grad_norm": 0.2338017225265503, + "learning_rate": 6.029443401229075e-05, + "loss": 1.7243, + 
"step": 14694 + }, + { + "epoch": 4.510435850214856, + "grad_norm": 0.2490454763174057, + "learning_rate": 6.028956989373207e-05, + "loss": 1.7866, + "step": 14695 + }, + { + "epoch": 4.510742786985881, + "grad_norm": 0.2579275369644165, + "learning_rate": 6.028470567348582e-05, + "loss": 1.7594, + "step": 14696 + }, + { + "epoch": 4.511049723756906, + "grad_norm": 0.23982174694538116, + "learning_rate": 6.0279841351600094e-05, + "loss": 1.7444, + "step": 14697 + }, + { + "epoch": 4.511356660527931, + "grad_norm": 0.2160159945487976, + "learning_rate": 6.027497692812295e-05, + "loss": 1.7002, + "step": 14698 + }, + { + "epoch": 4.5116635972989565, + "grad_norm": 0.24604511260986328, + "learning_rate": 6.0270112403102455e-05, + "loss": 1.7654, + "step": 14699 + }, + { + "epoch": 4.511970534069982, + "grad_norm": 0.21978263556957245, + "learning_rate": 6.026524777658669e-05, + "loss": 1.7278, + "step": 14700 + }, + { + "epoch": 4.512277470841006, + "grad_norm": 0.2814212441444397, + "learning_rate": 6.026038304862373e-05, + "loss": 1.7743, + "step": 14701 + }, + { + "epoch": 4.512584407612032, + "grad_norm": 0.23798944056034088, + "learning_rate": 6.025551821926165e-05, + "loss": 1.7348, + "step": 14702 + }, + { + "epoch": 4.512891344383057, + "grad_norm": 0.22415988147258759, + "learning_rate": 6.025065328854853e-05, + "loss": 1.7973, + "step": 14703 + }, + { + "epoch": 4.513198281154082, + "grad_norm": 0.34614792466163635, + "learning_rate": 6.0245788256532445e-05, + "loss": 1.7263, + "step": 14704 + }, + { + "epoch": 4.513505217925108, + "grad_norm": 0.333918958902359, + "learning_rate": 6.0240923123261485e-05, + "loss": 1.7305, + "step": 14705 + }, + { + "epoch": 4.513812154696133, + "grad_norm": 0.22231793403625488, + "learning_rate": 6.02360578887837e-05, + "loss": 1.806, + "step": 14706 + }, + { + "epoch": 4.514119091467157, + "grad_norm": 0.23323194682598114, + "learning_rate": 6.023119255314721e-05, + "loss": 1.7076, + "step": 14707 + }, + { + "epoch": 
4.514426028238183, + "grad_norm": 0.26695477962493896, + "learning_rate": 6.022632711640007e-05, + "loss": 1.775, + "step": 14708 + }, + { + "epoch": 4.514732965009208, + "grad_norm": 0.21446476876735687, + "learning_rate": 6.0221461578590364e-05, + "loss": 1.7524, + "step": 14709 + }, + { + "epoch": 4.515039901780233, + "grad_norm": 0.2677358090877533, + "learning_rate": 6.0216595939766204e-05, + "loss": 1.7513, + "step": 14710 + }, + { + "epoch": 4.515346838551259, + "grad_norm": 0.28648239374160767, + "learning_rate": 6.021173019997565e-05, + "loss": 1.7249, + "step": 14711 + }, + { + "epoch": 4.515653775322283, + "grad_norm": 0.2178548276424408, + "learning_rate": 6.020686435926678e-05, + "loss": 1.7502, + "step": 14712 + }, + { + "epoch": 4.5159607120933085, + "grad_norm": 0.3391740024089813, + "learning_rate": 6.02019984176877e-05, + "loss": 1.6828, + "step": 14713 + }, + { + "epoch": 4.516267648864334, + "grad_norm": 0.25222229957580566, + "learning_rate": 6.01971323752865e-05, + "loss": 1.6982, + "step": 14714 + }, + { + "epoch": 4.516574585635359, + "grad_norm": 0.28776636719703674, + "learning_rate": 6.019226623211125e-05, + "loss": 1.8595, + "step": 14715 + }, + { + "epoch": 4.5168815224063845, + "grad_norm": 0.3240084648132324, + "learning_rate": 6.018739998821006e-05, + "loss": 1.7461, + "step": 14716 + }, + { + "epoch": 4.51718845917741, + "grad_norm": 0.26735052466392517, + "learning_rate": 6.0182533643631015e-05, + "loss": 1.7955, + "step": 14717 + }, + { + "epoch": 4.517495395948434, + "grad_norm": 0.24573692679405212, + "learning_rate": 6.017766719842219e-05, + "loss": 1.7441, + "step": 14718 + }, + { + "epoch": 4.51780233271946, + "grad_norm": 0.27401313185691833, + "learning_rate": 6.01728006526317e-05, + "loss": 1.7399, + "step": 14719 + }, + { + "epoch": 4.518109269490485, + "grad_norm": 0.23578806221485138, + "learning_rate": 6.016793400630763e-05, + "loss": 1.7936, + "step": 14720 + }, + { + "epoch": 4.51841620626151, + "grad_norm": 
0.27763426303863525, + "learning_rate": 6.0163067259498074e-05, + "loss": 1.7263, + "step": 14721 + }, + { + "epoch": 4.518723143032536, + "grad_norm": 0.27102044224739075, + "learning_rate": 6.015820041225113e-05, + "loss": 1.7085, + "step": 14722 + }, + { + "epoch": 4.51903007980356, + "grad_norm": 0.2046152651309967, + "learning_rate": 6.01533334646149e-05, + "loss": 1.7602, + "step": 14723 + }, + { + "epoch": 4.519337016574585, + "grad_norm": 0.2645253837108612, + "learning_rate": 6.0148466416637484e-05, + "loss": 1.7729, + "step": 14724 + }, + { + "epoch": 4.519643953345611, + "grad_norm": 0.27467650175094604, + "learning_rate": 6.014359926836697e-05, + "loss": 1.7834, + "step": 14725 + }, + { + "epoch": 4.519950890116636, + "grad_norm": 0.30357635021209717, + "learning_rate": 6.013873201985145e-05, + "loss": 1.8685, + "step": 14726 + }, + { + "epoch": 4.520257826887661, + "grad_norm": 0.22923336923122406, + "learning_rate": 6.013386467113905e-05, + "loss": 1.7531, + "step": 14727 + }, + { + "epoch": 4.520564763658687, + "grad_norm": 0.2792156934738159, + "learning_rate": 6.012899722227786e-05, + "loss": 1.7927, + "step": 14728 + }, + { + "epoch": 4.520871700429711, + "grad_norm": 0.286161869764328, + "learning_rate": 6.012412967331598e-05, + "loss": 1.77, + "step": 14729 + }, + { + "epoch": 4.5211786372007365, + "grad_norm": 0.23964659869670868, + "learning_rate": 6.011926202430151e-05, + "loss": 1.7873, + "step": 14730 + }, + { + "epoch": 4.521485573971762, + "grad_norm": 0.2250162959098816, + "learning_rate": 6.011439427528258e-05, + "loss": 1.741, + "step": 14731 + }, + { + "epoch": 4.521792510742787, + "grad_norm": 0.2797175347805023, + "learning_rate": 6.010952642630726e-05, + "loss": 1.7482, + "step": 14732 + }, + { + "epoch": 4.5220994475138125, + "grad_norm": 0.22159560024738312, + "learning_rate": 6.010465847742368e-05, + "loss": 1.7591, + "step": 14733 + }, + { + "epoch": 4.522406384284837, + "grad_norm": 0.26638463139533997, + "learning_rate": 
6.009979042867995e-05, + "loss": 1.8564, + "step": 14734 + }, + { + "epoch": 4.522713321055862, + "grad_norm": 0.2972821891307831, + "learning_rate": 6.009492228012416e-05, + "loss": 1.7569, + "step": 14735 + }, + { + "epoch": 4.523020257826888, + "grad_norm": 0.28108885884284973, + "learning_rate": 6.0090054031804444e-05, + "loss": 1.7256, + "step": 14736 + }, + { + "epoch": 4.523327194597913, + "grad_norm": 0.22359851002693176, + "learning_rate": 6.008518568376888e-05, + "loss": 1.7342, + "step": 14737 + }, + { + "epoch": 4.523634131368938, + "grad_norm": 0.2620728015899658, + "learning_rate": 6.008031723606562e-05, + "loss": 1.7703, + "step": 14738 + }, + { + "epoch": 4.523941068139964, + "grad_norm": 0.2641485333442688, + "learning_rate": 6.007544868874274e-05, + "loss": 1.6944, + "step": 14739 + }, + { + "epoch": 4.524248004910988, + "grad_norm": 0.24957752227783203, + "learning_rate": 6.007058004184839e-05, + "loss": 1.7746, + "step": 14740 + }, + { + "epoch": 4.524554941682013, + "grad_norm": 0.29830998182296753, + "learning_rate": 6.006571129543065e-05, + "loss": 1.7718, + "step": 14741 + }, + { + "epoch": 4.524861878453039, + "grad_norm": 0.32740798592567444, + "learning_rate": 6.006084244953766e-05, + "loss": 1.8194, + "step": 14742 + }, + { + "epoch": 4.525168815224064, + "grad_norm": 0.2614956796169281, + "learning_rate": 6.005597350421751e-05, + "loss": 1.7078, + "step": 14743 + }, + { + "epoch": 4.525475751995089, + "grad_norm": 0.23940515518188477, + "learning_rate": 6.005110445951836e-05, + "loss": 1.7488, + "step": 14744 + }, + { + "epoch": 4.525782688766114, + "grad_norm": 0.25485914945602417, + "learning_rate": 6.004623531548829e-05, + "loss": 1.7705, + "step": 14745 + }, + { + "epoch": 4.526089625537139, + "grad_norm": 0.213532954454422, + "learning_rate": 6.0041366072175445e-05, + "loss": 1.7501, + "step": 14746 + }, + { + "epoch": 4.526396562308165, + "grad_norm": 0.2420104295015335, + "learning_rate": 6.003649672962792e-05, + "loss": 1.717, + 
"step": 14747 + }, + { + "epoch": 4.52670349907919, + "grad_norm": 0.26179102063179016, + "learning_rate": 6.0031627287893865e-05, + "loss": 1.7665, + "step": 14748 + }, + { + "epoch": 4.527010435850215, + "grad_norm": 0.22032082080841064, + "learning_rate": 6.002675774702139e-05, + "loss": 1.7555, + "step": 14749 + }, + { + "epoch": 4.52731737262124, + "grad_norm": 0.23915240168571472, + "learning_rate": 6.002188810705861e-05, + "loss": 1.8219, + "step": 14750 + }, + { + "epoch": 4.527624309392265, + "grad_norm": 0.2275150567293167, + "learning_rate": 6.0017018368053665e-05, + "loss": 1.7418, + "step": 14751 + }, + { + "epoch": 4.52793124616329, + "grad_norm": 0.2349669486284256, + "learning_rate": 6.001214853005467e-05, + "loss": 1.7814, + "step": 14752 + }, + { + "epoch": 4.528238182934316, + "grad_norm": 0.29985731840133667, + "learning_rate": 6.000727859310975e-05, + "loss": 1.7109, + "step": 14753 + }, + { + "epoch": 4.528545119705341, + "grad_norm": 0.27282044291496277, + "learning_rate": 6.0002408557267044e-05, + "loss": 1.7806, + "step": 14754 + }, + { + "epoch": 4.5288520564763655, + "grad_norm": 0.20906320214271545, + "learning_rate": 5.9997538422574675e-05, + "loss": 1.7221, + "step": 14755 + }, + { + "epoch": 4.529158993247391, + "grad_norm": 0.24553455412387848, + "learning_rate": 5.999266818908076e-05, + "loss": 1.793, + "step": 14756 + }, + { + "epoch": 4.529465930018416, + "grad_norm": 0.29730647802352905, + "learning_rate": 5.998779785683345e-05, + "loss": 1.7597, + "step": 14757 + }, + { + "epoch": 4.5297728667894415, + "grad_norm": 0.28297582268714905, + "learning_rate": 5.998292742588087e-05, + "loss": 1.7459, + "step": 14758 + }, + { + "epoch": 4.530079803560467, + "grad_norm": 0.21853844821453094, + "learning_rate": 5.997805689627115e-05, + "loss": 1.7234, + "step": 14759 + }, + { + "epoch": 4.530386740331492, + "grad_norm": 0.2997361421585083, + "learning_rate": 5.997318626805242e-05, + "loss": 1.7294, + "step": 14760 + }, + { + "epoch": 
4.530693677102517, + "grad_norm": 0.3298671543598175, + "learning_rate": 5.9968315541272804e-05, + "loss": 1.7837, + "step": 14761 + }, + { + "epoch": 4.531000613873542, + "grad_norm": 0.22812490165233612, + "learning_rate": 5.996344471598047e-05, + "loss": 1.7509, + "step": 14762 + }, + { + "epoch": 4.531307550644567, + "grad_norm": 0.3179669678211212, + "learning_rate": 5.995857379222354e-05, + "loss": 1.8354, + "step": 14763 + }, + { + "epoch": 4.531614487415593, + "grad_norm": 0.3072827458381653, + "learning_rate": 5.9953702770050135e-05, + "loss": 1.8051, + "step": 14764 + }, + { + "epoch": 4.531921424186618, + "grad_norm": 0.19386722147464752, + "learning_rate": 5.994883164950841e-05, + "loss": 1.7093, + "step": 14765 + }, + { + "epoch": 4.532228360957642, + "grad_norm": 0.2380950152873993, + "learning_rate": 5.99439604306465e-05, + "loss": 1.7547, + "step": 14766 + }, + { + "epoch": 4.532535297728668, + "grad_norm": 0.32604947686195374, + "learning_rate": 5.993908911351254e-05, + "loss": 1.8708, + "step": 14767 + }, + { + "epoch": 4.532842234499693, + "grad_norm": 0.2436954528093338, + "learning_rate": 5.993421769815468e-05, + "loss": 1.7272, + "step": 14768 + }, + { + "epoch": 4.533149171270718, + "grad_norm": 0.2470337301492691, + "learning_rate": 5.992934618462105e-05, + "loss": 1.7242, + "step": 14769 + }, + { + "epoch": 4.533456108041744, + "grad_norm": 0.25720325112342834, + "learning_rate": 5.992447457295981e-05, + "loss": 1.7219, + "step": 14770 + }, + { + "epoch": 4.533763044812769, + "grad_norm": 0.2518918812274933, + "learning_rate": 5.991960286321909e-05, + "loss": 1.7916, + "step": 14771 + }, + { + "epoch": 4.5340699815837935, + "grad_norm": 0.2561487853527069, + "learning_rate": 5.9914731055447037e-05, + "loss": 1.7695, + "step": 14772 + }, + { + "epoch": 4.534376918354819, + "grad_norm": 0.25361356139183044, + "learning_rate": 5.9909859149691804e-05, + "loss": 1.7464, + "step": 14773 + }, + { + "epoch": 4.534683855125844, + "grad_norm": 
0.22827522456645966, + "learning_rate": 5.9904987146001545e-05, + "loss": 1.7288, + "step": 14774 + }, + { + "epoch": 4.5349907918968695, + "grad_norm": 0.2417261302471161, + "learning_rate": 5.9900115044424385e-05, + "loss": 1.7311, + "step": 14775 + }, + { + "epoch": 4.535297728667894, + "grad_norm": 0.20756755769252777, + "learning_rate": 5.9895242845008495e-05, + "loss": 1.7799, + "step": 14776 + }, + { + "epoch": 4.535604665438919, + "grad_norm": 0.21999207139015198, + "learning_rate": 5.989037054780201e-05, + "loss": 1.7782, + "step": 14777 + }, + { + "epoch": 4.535911602209945, + "grad_norm": 0.22863444685935974, + "learning_rate": 5.988549815285308e-05, + "loss": 1.7869, + "step": 14778 + }, + { + "epoch": 4.53621853898097, + "grad_norm": 0.23033374547958374, + "learning_rate": 5.988062566020987e-05, + "loss": 1.7328, + "step": 14779 + }, + { + "epoch": 4.536525475751995, + "grad_norm": 0.21903404593467712, + "learning_rate": 5.987575306992053e-05, + "loss": 1.7689, + "step": 14780 + }, + { + "epoch": 4.536832412523021, + "grad_norm": 0.2433948963880539, + "learning_rate": 5.98708803820332e-05, + "loss": 1.7647, + "step": 14781 + }, + { + "epoch": 4.537139349294045, + "grad_norm": 0.2564239799976349, + "learning_rate": 5.986600759659606e-05, + "loss": 1.7958, + "step": 14782 + }, + { + "epoch": 4.53744628606507, + "grad_norm": 0.24009190499782562, + "learning_rate": 5.9861134713657244e-05, + "loss": 1.7511, + "step": 14783 + }, + { + "epoch": 4.537753222836096, + "grad_norm": 0.2578975558280945, + "learning_rate": 5.985626173326491e-05, + "loss": 1.8285, + "step": 14784 + }, + { + "epoch": 4.538060159607121, + "grad_norm": 0.24334335327148438, + "learning_rate": 5.9851388655467225e-05, + "loss": 1.7391, + "step": 14785 + }, + { + "epoch": 4.538367096378146, + "grad_norm": 0.26446983218193054, + "learning_rate": 5.9846515480312335e-05, + "loss": 1.8232, + "step": 14786 + }, + { + "epoch": 4.538674033149171, + "grad_norm": 0.3125670850276947, + 
"learning_rate": 5.9841642207848415e-05, + "loss": 1.7202, + "step": 14787 + }, + { + "epoch": 4.538980969920196, + "grad_norm": 0.2524511218070984, + "learning_rate": 5.983676883812361e-05, + "loss": 1.7653, + "step": 14788 + }, + { + "epoch": 4.5392879066912215, + "grad_norm": 0.3693946897983551, + "learning_rate": 5.98318953711861e-05, + "loss": 1.7457, + "step": 14789 + }, + { + "epoch": 4.539594843462247, + "grad_norm": 0.32625386118888855, + "learning_rate": 5.9827021807084026e-05, + "loss": 1.784, + "step": 14790 + }, + { + "epoch": 4.539901780233272, + "grad_norm": 0.24243168532848358, + "learning_rate": 5.9822148145865574e-05, + "loss": 1.7651, + "step": 14791 + }, + { + "epoch": 4.5402087170042975, + "grad_norm": 0.2950129210948944, + "learning_rate": 5.9817274387578895e-05, + "loss": 1.7316, + "step": 14792 + }, + { + "epoch": 4.540515653775322, + "grad_norm": 0.29455235600471497, + "learning_rate": 5.981240053227216e-05, + "loss": 1.7504, + "step": 14793 + }, + { + "epoch": 4.540822590546347, + "grad_norm": 0.23161925375461578, + "learning_rate": 5.980752657999352e-05, + "loss": 1.7663, + "step": 14794 + }, + { + "epoch": 4.541129527317373, + "grad_norm": 0.2725144922733307, + "learning_rate": 5.980265253079116e-05, + "loss": 1.765, + "step": 14795 + }, + { + "epoch": 4.541436464088398, + "grad_norm": 0.30911222100257874, + "learning_rate": 5.979777838471324e-05, + "loss": 1.7888, + "step": 14796 + }, + { + "epoch": 4.541743400859423, + "grad_norm": 0.2818063497543335, + "learning_rate": 5.979290414180794e-05, + "loss": 1.8047, + "step": 14797 + }, + { + "epoch": 4.542050337630448, + "grad_norm": 0.23335030674934387, + "learning_rate": 5.978802980212341e-05, + "loss": 1.8205, + "step": 14798 + }, + { + "epoch": 4.542357274401473, + "grad_norm": 0.24228201806545258, + "learning_rate": 5.9783155365707855e-05, + "loss": 1.7774, + "step": 14799 + }, + { + "epoch": 4.542664211172498, + "grad_norm": 0.2410847544670105, + "learning_rate": 5.97782808326094e-05, 
+ "loss": 1.6959, + "step": 14800 + }, + { + "epoch": 4.542971147943524, + "grad_norm": 0.24812567234039307, + "learning_rate": 5.9773406202876245e-05, + "loss": 1.8158, + "step": 14801 + }, + { + "epoch": 4.543278084714549, + "grad_norm": 0.2606147229671478, + "learning_rate": 5.9768531476556566e-05, + "loss": 1.7478, + "step": 14802 + }, + { + "epoch": 4.543585021485574, + "grad_norm": 0.24853013455867767, + "learning_rate": 5.976365665369854e-05, + "loss": 1.8158, + "step": 14803 + }, + { + "epoch": 4.543891958256599, + "grad_norm": 0.2320917695760727, + "learning_rate": 5.9758781734350334e-05, + "loss": 1.7812, + "step": 14804 + }, + { + "epoch": 4.544198895027624, + "grad_norm": 0.3460223376750946, + "learning_rate": 5.9753906718560127e-05, + "loss": 1.7562, + "step": 14805 + }, + { + "epoch": 4.5445058317986495, + "grad_norm": 0.2941136658191681, + "learning_rate": 5.9749031606376086e-05, + "loss": 1.7562, + "step": 14806 + }, + { + "epoch": 4.544812768569675, + "grad_norm": 0.2371312975883484, + "learning_rate": 5.9744156397846404e-05, + "loss": 1.7793, + "step": 14807 + }, + { + "epoch": 4.5451197053407, + "grad_norm": 0.2885094881057739, + "learning_rate": 5.973928109301926e-05, + "loss": 1.7564, + "step": 14808 + }, + { + "epoch": 4.545426642111725, + "grad_norm": 0.2369023859500885, + "learning_rate": 5.973440569194284e-05, + "loss": 1.7862, + "step": 14809 + }, + { + "epoch": 4.54573357888275, + "grad_norm": 0.26628994941711426, + "learning_rate": 5.972953019466531e-05, + "loss": 1.7828, + "step": 14810 + }, + { + "epoch": 4.546040515653775, + "grad_norm": 0.3091031610965729, + "learning_rate": 5.9724654601234864e-05, + "loss": 1.7623, + "step": 14811 + }, + { + "epoch": 4.546347452424801, + "grad_norm": 0.24652205407619476, + "learning_rate": 5.971977891169966e-05, + "loss": 1.6982, + "step": 14812 + }, + { + "epoch": 4.546654389195826, + "grad_norm": 0.21779046952724457, + "learning_rate": 5.971490312610793e-05, + "loss": 1.7363, + "step": 14813 + }, 
+ { + "epoch": 4.546961325966851, + "grad_norm": 0.24130751192569733, + "learning_rate": 5.971002724450783e-05, + "loss": 1.7014, + "step": 14814 + }, + { + "epoch": 4.547268262737876, + "grad_norm": 0.21868734061717987, + "learning_rate": 5.9705151266947534e-05, + "loss": 1.7872, + "step": 14815 + }, + { + "epoch": 4.547575199508901, + "grad_norm": 0.257376492023468, + "learning_rate": 5.9700275193475275e-05, + "loss": 1.75, + "step": 14816 + }, + { + "epoch": 4.547882136279926, + "grad_norm": 0.3182791769504547, + "learning_rate": 5.9695399024139174e-05, + "loss": 1.7965, + "step": 14817 + }, + { + "epoch": 4.548189073050952, + "grad_norm": 0.25553280115127563, + "learning_rate": 5.969052275898748e-05, + "loss": 1.8394, + "step": 14818 + }, + { + "epoch": 4.548496009821976, + "grad_norm": 0.2810833752155304, + "learning_rate": 5.9685646398068354e-05, + "loss": 1.704, + "step": 14819 + }, + { + "epoch": 4.5488029465930016, + "grad_norm": 0.21320512890815735, + "learning_rate": 5.9680769941429993e-05, + "loss": 1.7248, + "step": 14820 + }, + { + "epoch": 4.549109883364027, + "grad_norm": 0.3159593939781189, + "learning_rate": 5.96758933891206e-05, + "loss": 1.7885, + "step": 14821 + }, + { + "epoch": 4.549416820135052, + "grad_norm": 0.21894599497318268, + "learning_rate": 5.967101674118834e-05, + "loss": 1.7388, + "step": 14822 + }, + { + "epoch": 4.5497237569060776, + "grad_norm": 0.24804852902889252, + "learning_rate": 5.9666139997681424e-05, + "loss": 1.7631, + "step": 14823 + }, + { + "epoch": 4.550030693677103, + "grad_norm": 0.2678423523902893, + "learning_rate": 5.966126315864806e-05, + "loss": 1.7631, + "step": 14824 + }, + { + "epoch": 4.550337630448127, + "grad_norm": 0.229649156332016, + "learning_rate": 5.9656386224136426e-05, + "loss": 1.7292, + "step": 14825 + }, + { + "epoch": 4.550644567219153, + "grad_norm": 0.25248458981513977, + "learning_rate": 5.965150919419473e-05, + "loss": 1.8, + "step": 14826 + }, + { + "epoch": 4.550951503990178, + 
"grad_norm": 0.2583169937133789, + "learning_rate": 5.964663206887116e-05, + "loss": 1.7641, + "step": 14827 + }, + { + "epoch": 4.551258440761203, + "grad_norm": 0.21465209126472473, + "learning_rate": 5.964175484821392e-05, + "loss": 1.7475, + "step": 14828 + }, + { + "epoch": 4.551565377532229, + "grad_norm": 0.28028783202171326, + "learning_rate": 5.963687753227118e-05, + "loss": 1.7649, + "step": 14829 + }, + { + "epoch": 4.551872314303253, + "grad_norm": 0.30248284339904785, + "learning_rate": 5.9632000121091194e-05, + "loss": 1.6969, + "step": 14830 + }, + { + "epoch": 4.5521792510742785, + "grad_norm": 0.24335962533950806, + "learning_rate": 5.962712261472213e-05, + "loss": 1.7295, + "step": 14831 + }, + { + "epoch": 4.552486187845304, + "grad_norm": 0.21014504134655, + "learning_rate": 5.9622245013212206e-05, + "loss": 1.7508, + "step": 14832 + }, + { + "epoch": 4.552793124616329, + "grad_norm": 0.24892041087150574, + "learning_rate": 5.961736731660963e-05, + "loss": 1.7317, + "step": 14833 + }, + { + "epoch": 4.5531000613873545, + "grad_norm": 0.2159881740808487, + "learning_rate": 5.9612489524962556e-05, + "loss": 1.7114, + "step": 14834 + }, + { + "epoch": 4.55340699815838, + "grad_norm": 0.2952292263507843, + "learning_rate": 5.960761163831925e-05, + "loss": 1.8226, + "step": 14835 + }, + { + "epoch": 4.553713934929404, + "grad_norm": 0.3019000291824341, + "learning_rate": 5.9602733656727895e-05, + "loss": 1.7391, + "step": 14836 + }, + { + "epoch": 4.55402087170043, + "grad_norm": 0.2273966521024704, + "learning_rate": 5.9597855580236696e-05, + "loss": 1.7718, + "step": 14837 + }, + { + "epoch": 4.554327808471455, + "grad_norm": 0.2462005764245987, + "learning_rate": 5.959297740889386e-05, + "loss": 1.8428, + "step": 14838 + }, + { + "epoch": 4.55463474524248, + "grad_norm": 0.2773323059082031, + "learning_rate": 5.95880991427476e-05, + "loss": 1.6878, + "step": 14839 + }, + { + "epoch": 4.554941682013506, + "grad_norm": 0.26519861817359924, + 
"learning_rate": 5.958322078184611e-05, + "loss": 1.737, + "step": 14840 + }, + { + "epoch": 4.55524861878453, + "grad_norm": 0.20157647132873535, + "learning_rate": 5.9578342326237626e-05, + "loss": 1.7164, + "step": 14841 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.21715669333934784, + "learning_rate": 5.957346377597035e-05, + "loss": 1.705, + "step": 14842 + }, + { + "epoch": 4.555862492326581, + "grad_norm": 0.3056442439556122, + "learning_rate": 5.95685851310925e-05, + "loss": 1.7672, + "step": 14843 + }, + { + "epoch": 4.556169429097606, + "grad_norm": 0.24832262098789215, + "learning_rate": 5.956370639165228e-05, + "loss": 1.7305, + "step": 14844 + }, + { + "epoch": 4.556476365868631, + "grad_norm": 0.25814661383628845, + "learning_rate": 5.955882755769791e-05, + "loss": 1.7562, + "step": 14845 + }, + { + "epoch": 4.556783302639657, + "grad_norm": 0.38242629170417786, + "learning_rate": 5.95539486292776e-05, + "loss": 1.7077, + "step": 14846 + }, + { + "epoch": 4.557090239410681, + "grad_norm": 0.2901807427406311, + "learning_rate": 5.954906960643956e-05, + "loss": 1.7233, + "step": 14847 + }, + { + "epoch": 4.5573971761817065, + "grad_norm": 0.22636106610298157, + "learning_rate": 5.954419048923202e-05, + "loss": 1.777, + "step": 14848 + }, + { + "epoch": 4.557704112952732, + "grad_norm": 0.32392850518226624, + "learning_rate": 5.953931127770321e-05, + "loss": 1.7477, + "step": 14849 + }, + { + "epoch": 4.558011049723757, + "grad_norm": 0.3403460681438446, + "learning_rate": 5.953443197190134e-05, + "loss": 1.7712, + "step": 14850 + }, + { + "epoch": 4.558317986494782, + "grad_norm": 0.22923234105110168, + "learning_rate": 5.95295525718746e-05, + "loss": 1.8154, + "step": 14851 + }, + { + "epoch": 4.558624923265807, + "grad_norm": 0.25152841210365295, + "learning_rate": 5.952467307767124e-05, + "loss": 1.7091, + "step": 14852 + }, + { + "epoch": 4.558931860036832, + "grad_norm": 0.27743563055992126, + "learning_rate": 5.951979348933949e-05, + 
"loss": 1.7621, + "step": 14853 + }, + { + "epoch": 4.559238796807858, + "grad_norm": 0.25809308886528015, + "learning_rate": 5.951491380692756e-05, + "loss": 1.7669, + "step": 14854 + }, + { + "epoch": 4.559545733578883, + "grad_norm": 0.24863946437835693, + "learning_rate": 5.9510034030483676e-05, + "loss": 1.7354, + "step": 14855 + }, + { + "epoch": 4.559852670349908, + "grad_norm": 0.2896040380001068, + "learning_rate": 5.9505154160056066e-05, + "loss": 1.7878, + "step": 14856 + }, + { + "epoch": 4.560159607120933, + "grad_norm": 0.23814482986927032, + "learning_rate": 5.950027419569294e-05, + "loss": 1.7781, + "step": 14857 + }, + { + "epoch": 4.560466543891958, + "grad_norm": 0.2531175911426544, + "learning_rate": 5.949539413744253e-05, + "loss": 1.762, + "step": 14858 + }, + { + "epoch": 4.560773480662983, + "grad_norm": 0.2541767656803131, + "learning_rate": 5.949051398535308e-05, + "loss": 1.7722, + "step": 14859 + }, + { + "epoch": 4.561080417434009, + "grad_norm": 0.25216221809387207, + "learning_rate": 5.948563373947281e-05, + "loss": 1.754, + "step": 14860 + }, + { + "epoch": 4.561387354205034, + "grad_norm": 0.24421775341033936, + "learning_rate": 5.948075339984994e-05, + "loss": 1.7976, + "step": 14861 + }, + { + "epoch": 4.5616942909760585, + "grad_norm": 0.24435418844223022, + "learning_rate": 5.947587296653272e-05, + "loss": 1.79, + "step": 14862 + }, + { + "epoch": 4.562001227747084, + "grad_norm": 0.24471627175807953, + "learning_rate": 5.947099243956936e-05, + "loss": 1.755, + "step": 14863 + }, + { + "epoch": 4.562308164518109, + "grad_norm": 0.2762158215045929, + "learning_rate": 5.9466111819008096e-05, + "loss": 1.7695, + "step": 14864 + }, + { + "epoch": 4.5626151012891345, + "grad_norm": 0.23841319978237152, + "learning_rate": 5.9461231104897174e-05, + "loss": 1.7302, + "step": 14865 + }, + { + "epoch": 4.56292203806016, + "grad_norm": 0.260231077671051, + "learning_rate": 5.9456350297284826e-05, + "loss": 1.7917, + "step": 14866 + }, + { 
+ "epoch": 4.563228974831185, + "grad_norm": 0.2752247452735901, + "learning_rate": 5.945146939621929e-05, + "loss": 1.7953, + "step": 14867 + }, + { + "epoch": 4.56353591160221, + "grad_norm": 0.28760650753974915, + "learning_rate": 5.944658840174878e-05, + "loss": 1.8582, + "step": 14868 + }, + { + "epoch": 4.563842848373235, + "grad_norm": 0.24311676621437073, + "learning_rate": 5.944170731392153e-05, + "loss": 1.8006, + "step": 14869 + }, + { + "epoch": 4.56414978514426, + "grad_norm": 0.2692974805831909, + "learning_rate": 5.943682613278583e-05, + "loss": 1.6984, + "step": 14870 + }, + { + "epoch": 4.564456721915286, + "grad_norm": 0.2784348726272583, + "learning_rate": 5.943194485838985e-05, + "loss": 1.8082, + "step": 14871 + }, + { + "epoch": 4.564763658686311, + "grad_norm": 0.2557264268398285, + "learning_rate": 5.9427063490781885e-05, + "loss": 1.7715, + "step": 14872 + }, + { + "epoch": 4.565070595457335, + "grad_norm": 0.3738742470741272, + "learning_rate": 5.942218203001015e-05, + "loss": 1.7549, + "step": 14873 + }, + { + "epoch": 4.565377532228361, + "grad_norm": 0.2424495816230774, + "learning_rate": 5.941730047612288e-05, + "loss": 1.7388, + "step": 14874 + }, + { + "epoch": 4.565684468999386, + "grad_norm": 0.27020737528800964, + "learning_rate": 5.941241882916833e-05, + "loss": 1.752, + "step": 14875 + }, + { + "epoch": 4.565991405770411, + "grad_norm": 0.3763764798641205, + "learning_rate": 5.940753708919474e-05, + "loss": 1.7918, + "step": 14876 + }, + { + "epoch": 4.566298342541437, + "grad_norm": 0.26782163977622986, + "learning_rate": 5.940265525625036e-05, + "loss": 1.7244, + "step": 14877 + }, + { + "epoch": 4.566605279312462, + "grad_norm": 0.24978911876678467, + "learning_rate": 5.9397773330383434e-05, + "loss": 1.7706, + "step": 14878 + }, + { + "epoch": 4.5669122160834865, + "grad_norm": 0.32905304431915283, + "learning_rate": 5.93928913116422e-05, + "loss": 1.7381, + "step": 14879 + }, + { + "epoch": 4.567219152854512, + "grad_norm": 
0.2196444720029831, + "learning_rate": 5.93880092000749e-05, + "loss": 1.7605, + "step": 14880 + }, + { + "epoch": 4.567526089625537, + "grad_norm": 0.3156622350215912, + "learning_rate": 5.9383126995729786e-05, + "loss": 1.9181, + "step": 14881 + }, + { + "epoch": 4.5678330263965625, + "grad_norm": 0.2895203232765198, + "learning_rate": 5.937824469865513e-05, + "loss": 1.7967, + "step": 14882 + }, + { + "epoch": 4.568139963167588, + "grad_norm": 0.24854810535907745, + "learning_rate": 5.937336230889916e-05, + "loss": 1.7332, + "step": 14883 + }, + { + "epoch": 4.568446899938612, + "grad_norm": 0.3417081832885742, + "learning_rate": 5.936847982651013e-05, + "loss": 1.7525, + "step": 14884 + }, + { + "epoch": 4.568753836709638, + "grad_norm": 0.2874949276447296, + "learning_rate": 5.936359725153629e-05, + "loss": 1.7659, + "step": 14885 + }, + { + "epoch": 4.569060773480663, + "grad_norm": 0.25031307339668274, + "learning_rate": 5.935871458402588e-05, + "loss": 1.8061, + "step": 14886 + }, + { + "epoch": 4.569367710251688, + "grad_norm": 0.27047309279441833, + "learning_rate": 5.935383182402717e-05, + "loss": 1.7318, + "step": 14887 + }, + { + "epoch": 4.569674647022714, + "grad_norm": 0.2642819881439209, + "learning_rate": 5.9348948971588425e-05, + "loss": 1.849, + "step": 14888 + }, + { + "epoch": 4.569981583793739, + "grad_norm": 0.2452307790517807, + "learning_rate": 5.9344066026757886e-05, + "loss": 1.7491, + "step": 14889 + }, + { + "epoch": 4.570288520564763, + "grad_norm": 0.24055036902427673, + "learning_rate": 5.9339182989583795e-05, + "loss": 1.7573, + "step": 14890 + }, + { + "epoch": 4.570595457335789, + "grad_norm": 0.23036183416843414, + "learning_rate": 5.933429986011444e-05, + "loss": 1.7841, + "step": 14891 + }, + { + "epoch": 4.570902394106814, + "grad_norm": 0.27987608313560486, + "learning_rate": 5.932941663839805e-05, + "loss": 1.7835, + "step": 14892 + }, + { + "epoch": 4.571209330877839, + "grad_norm": 0.31747013330459595, + "learning_rate": 
5.93245333244829e-05, + "loss": 1.7905, + "step": 14893 + }, + { + "epoch": 4.571516267648864, + "grad_norm": 0.24841344356536865, + "learning_rate": 5.931964991841725e-05, + "loss": 1.8003, + "step": 14894 + }, + { + "epoch": 4.571823204419889, + "grad_norm": 0.2416950911283493, + "learning_rate": 5.9314766420249356e-05, + "loss": 1.7787, + "step": 14895 + }, + { + "epoch": 4.5721301411909145, + "grad_norm": 0.2322494238615036, + "learning_rate": 5.930988283002748e-05, + "loss": 1.8153, + "step": 14896 + }, + { + "epoch": 4.57243707796194, + "grad_norm": 0.22629016637802124, + "learning_rate": 5.930499914779989e-05, + "loss": 1.6743, + "step": 14897 + }, + { + "epoch": 4.572744014732965, + "grad_norm": 0.21481508016586304, + "learning_rate": 5.930011537361483e-05, + "loss": 1.7301, + "step": 14898 + }, + { + "epoch": 4.5730509515039905, + "grad_norm": 0.1993340700864792, + "learning_rate": 5.9295231507520586e-05, + "loss": 1.6796, + "step": 14899 + }, + { + "epoch": 4.573357888275015, + "grad_norm": 0.21681822836399078, + "learning_rate": 5.929034754956543e-05, + "loss": 1.7333, + "step": 14900 + }, + { + "epoch": 4.57366482504604, + "grad_norm": 0.23105305433273315, + "learning_rate": 5.928546349979761e-05, + "loss": 1.8207, + "step": 14901 + }, + { + "epoch": 4.573971761817066, + "grad_norm": 0.24656468629837036, + "learning_rate": 5.9280579358265384e-05, + "loss": 1.7805, + "step": 14902 + }, + { + "epoch": 4.574278698588091, + "grad_norm": 0.28564780950546265, + "learning_rate": 5.927569512501704e-05, + "loss": 1.7224, + "step": 14903 + }, + { + "epoch": 4.574585635359116, + "grad_norm": 0.26030251383781433, + "learning_rate": 5.927081080010084e-05, + "loss": 1.7417, + "step": 14904 + }, + { + "epoch": 4.574892572130141, + "grad_norm": 0.21427087485790253, + "learning_rate": 5.926592638356505e-05, + "loss": 1.7239, + "step": 14905 + }, + { + "epoch": 4.575199508901166, + "grad_norm": 0.2351662665605545, + "learning_rate": 5.9261041875457956e-05, + "loss": 
1.7711, + "step": 14906 + }, + { + "epoch": 4.5755064456721914, + "grad_norm": 0.27335020899772644, + "learning_rate": 5.925615727582781e-05, + "loss": 1.7496, + "step": 14907 + }, + { + "epoch": 4.575813382443217, + "grad_norm": 0.27849945425987244, + "learning_rate": 5.925127258472289e-05, + "loss": 1.7576, + "step": 14908 + }, + { + "epoch": 4.576120319214242, + "grad_norm": 0.27859339118003845, + "learning_rate": 5.924638780219147e-05, + "loss": 1.8076, + "step": 14909 + }, + { + "epoch": 4.5764272559852675, + "grad_norm": 0.24664369225502014, + "learning_rate": 5.9241502928281836e-05, + "loss": 1.7657, + "step": 14910 + }, + { + "epoch": 4.576734192756292, + "grad_norm": 0.29881149530410767, + "learning_rate": 5.923661796304224e-05, + "loss": 1.7611, + "step": 14911 + }, + { + "epoch": 4.577041129527317, + "grad_norm": 0.2672356367111206, + "learning_rate": 5.9231732906520984e-05, + "loss": 1.7605, + "step": 14912 + }, + { + "epoch": 4.577348066298343, + "grad_norm": 0.24282832443714142, + "learning_rate": 5.9226847758766336e-05, + "loss": 1.7037, + "step": 14913 + }, + { + "epoch": 4.577655003069368, + "grad_norm": 0.3822915852069855, + "learning_rate": 5.922196251982656e-05, + "loss": 1.7609, + "step": 14914 + }, + { + "epoch": 4.577961939840393, + "grad_norm": 0.30721214413642883, + "learning_rate": 5.921707718974994e-05, + "loss": 1.7398, + "step": 14915 + }, + { + "epoch": 4.578268876611418, + "grad_norm": 0.235477477312088, + "learning_rate": 5.921219176858477e-05, + "loss": 1.6869, + "step": 14916 + }, + { + "epoch": 4.578575813382443, + "grad_norm": 0.3752216100692749, + "learning_rate": 5.920730625637934e-05, + "loss": 1.7296, + "step": 14917 + }, + { + "epoch": 4.578882750153468, + "grad_norm": 0.36901310086250305, + "learning_rate": 5.920242065318189e-05, + "loss": 1.7405, + "step": 14918 + }, + { + "epoch": 4.579189686924494, + "grad_norm": 0.2308608740568161, + "learning_rate": 5.9197534959040725e-05, + "loss": 1.7953, + "step": 14919 + }, + { + 
"epoch": 4.579496623695519, + "grad_norm": 0.3286738991737366, + "learning_rate": 5.919264917400412e-05, + "loss": 1.7669, + "step": 14920 + }, + { + "epoch": 4.579803560466544, + "grad_norm": 0.3944021165370941, + "learning_rate": 5.918776329812039e-05, + "loss": 1.7165, + "step": 14921 + }, + { + "epoch": 4.580110497237569, + "grad_norm": 0.22054845094680786, + "learning_rate": 5.9182877331437795e-05, + "loss": 1.7739, + "step": 14922 + }, + { + "epoch": 4.580417434008594, + "grad_norm": 0.3467540740966797, + "learning_rate": 5.9177991274004605e-05, + "loss": 1.7713, + "step": 14923 + }, + { + "epoch": 4.5807243707796195, + "grad_norm": 0.4313695728778839, + "learning_rate": 5.917310512586914e-05, + "loss": 1.7654, + "step": 14924 + }, + { + "epoch": 4.581031307550645, + "grad_norm": 0.2723502814769745, + "learning_rate": 5.9168218887079685e-05, + "loss": 1.7314, + "step": 14925 + }, + { + "epoch": 4.581338244321669, + "grad_norm": 0.2641250789165497, + "learning_rate": 5.9163332557684504e-05, + "loss": 1.7303, + "step": 14926 + }, + { + "epoch": 4.581645181092695, + "grad_norm": 0.3780760169029236, + "learning_rate": 5.915844613773189e-05, + "loss": 1.7748, + "step": 14927 + }, + { + "epoch": 4.58195211786372, + "grad_norm": 0.23379632830619812, + "learning_rate": 5.915355962727015e-05, + "loss": 1.7482, + "step": 14928 + }, + { + "epoch": 4.582259054634745, + "grad_norm": 0.35227084159851074, + "learning_rate": 5.914867302634758e-05, + "loss": 1.8198, + "step": 14929 + }, + { + "epoch": 4.582565991405771, + "grad_norm": 0.34348124265670776, + "learning_rate": 5.914378633501245e-05, + "loss": 1.8364, + "step": 14930 + }, + { + "epoch": 4.582872928176796, + "grad_norm": 0.2446804940700531, + "learning_rate": 5.9138899553313066e-05, + "loss": 1.7779, + "step": 14931 + }, + { + "epoch": 4.58317986494782, + "grad_norm": 0.23893557488918304, + "learning_rate": 5.913401268129772e-05, + "loss": 1.7582, + "step": 14932 + }, + { + "epoch": 4.583486801718846, + 
"grad_norm": 0.3046814203262329, + "learning_rate": 5.912912571901471e-05, + "loss": 1.6871, + "step": 14933 + }, + { + "epoch": 4.583793738489871, + "grad_norm": 0.2232733964920044, + "learning_rate": 5.912423866651233e-05, + "loss": 1.7269, + "step": 14934 + }, + { + "epoch": 4.584100675260896, + "grad_norm": 0.18664126098155975, + "learning_rate": 5.911935152383888e-05, + "loss": 1.7155, + "step": 14935 + }, + { + "epoch": 4.584407612031922, + "grad_norm": 0.2573263347148895, + "learning_rate": 5.911446429104265e-05, + "loss": 1.7901, + "step": 14936 + }, + { + "epoch": 4.584714548802946, + "grad_norm": 0.2382393181324005, + "learning_rate": 5.910957696817194e-05, + "loss": 1.7407, + "step": 14937 + }, + { + "epoch": 4.5850214855739715, + "grad_norm": 0.28363972902297974, + "learning_rate": 5.910468955527504e-05, + "loss": 1.7971, + "step": 14938 + }, + { + "epoch": 4.585328422344997, + "grad_norm": 0.3173120617866516, + "learning_rate": 5.909980205240027e-05, + "loss": 1.744, + "step": 14939 + }, + { + "epoch": 4.585635359116022, + "grad_norm": 0.2281302511692047, + "learning_rate": 5.909491445959592e-05, + "loss": 1.6976, + "step": 14940 + }, + { + "epoch": 4.5859422958870475, + "grad_norm": 0.24962912499904633, + "learning_rate": 5.9090026776910304e-05, + "loss": 1.7979, + "step": 14941 + }, + { + "epoch": 4.586249232658073, + "grad_norm": 0.22330854833126068, + "learning_rate": 5.908513900439171e-05, + "loss": 1.7854, + "step": 14942 + }, + { + "epoch": 4.586556169429097, + "grad_norm": 0.20861582458019257, + "learning_rate": 5.908025114208845e-05, + "loss": 1.7133, + "step": 14943 + }, + { + "epoch": 4.586863106200123, + "grad_norm": 0.21838510036468506, + "learning_rate": 5.90753631900488e-05, + "loss": 1.6919, + "step": 14944 + }, + { + "epoch": 4.587170042971148, + "grad_norm": 0.252798467874527, + "learning_rate": 5.907047514832112e-05, + "loss": 1.838, + "step": 14945 + }, + { + "epoch": 4.587476979742173, + "grad_norm": 0.326893150806427, + 
"learning_rate": 5.906558701695369e-05, + "loss": 1.7303, + "step": 14946 + }, + { + "epoch": 4.587783916513199, + "grad_norm": 0.36489585041999817, + "learning_rate": 5.9060698795994804e-05, + "loss": 1.7631, + "step": 14947 + }, + { + "epoch": 4.588090853284223, + "grad_norm": 0.27491649985313416, + "learning_rate": 5.905581048549279e-05, + "loss": 1.7773, + "step": 14948 + }, + { + "epoch": 4.588397790055248, + "grad_norm": 0.2334890067577362, + "learning_rate": 5.905092208549595e-05, + "loss": 1.7254, + "step": 14949 + }, + { + "epoch": 4.588704726826274, + "grad_norm": 0.24383895099163055, + "learning_rate": 5.904603359605257e-05, + "loss": 1.7496, + "step": 14950 + }, + { + "epoch": 4.589011663597299, + "grad_norm": 0.2144637256860733, + "learning_rate": 5.904114501721102e-05, + "loss": 1.7028, + "step": 14951 + }, + { + "epoch": 4.589318600368324, + "grad_norm": 0.19675977528095245, + "learning_rate": 5.9036256349019555e-05, + "loss": 1.7548, + "step": 14952 + }, + { + "epoch": 4.58962553713935, + "grad_norm": 0.23712843656539917, + "learning_rate": 5.903136759152652e-05, + "loss": 1.7722, + "step": 14953 + }, + { + "epoch": 4.589932473910374, + "grad_norm": 0.20307733118534088, + "learning_rate": 5.902647874478021e-05, + "loss": 1.7177, + "step": 14954 + }, + { + "epoch": 4.5902394106813995, + "grad_norm": 0.21767669916152954, + "learning_rate": 5.9021589808828936e-05, + "loss": 1.7963, + "step": 14955 + }, + { + "epoch": 4.590546347452425, + "grad_norm": 0.2056351602077484, + "learning_rate": 5.9016700783721036e-05, + "loss": 1.7439, + "step": 14956 + }, + { + "epoch": 4.59085328422345, + "grad_norm": 0.20480911433696747, + "learning_rate": 5.90118116695048e-05, + "loss": 1.7122, + "step": 14957 + }, + { + "epoch": 4.5911602209944755, + "grad_norm": 0.24091731011867523, + "learning_rate": 5.900692246622858e-05, + "loss": 1.7862, + "step": 14958 + }, + { + "epoch": 4.5914671577655, + "grad_norm": 0.20246434211730957, + "learning_rate": 
5.900203317394066e-05, + "loss": 1.6895, + "step": 14959 + }, + { + "epoch": 4.591774094536525, + "grad_norm": 0.23771630227565765, + "learning_rate": 5.899714379268938e-05, + "loss": 1.7794, + "step": 14960 + }, + { + "epoch": 4.592081031307551, + "grad_norm": 0.2638718783855438, + "learning_rate": 5.899225432252303e-05, + "loss": 1.8059, + "step": 14961 + }, + { + "epoch": 4.592387968078576, + "grad_norm": 0.24251408874988556, + "learning_rate": 5.898736476348997e-05, + "loss": 1.8063, + "step": 14962 + }, + { + "epoch": 4.592694904849601, + "grad_norm": 0.2487735152244568, + "learning_rate": 5.8982475115638515e-05, + "loss": 1.7615, + "step": 14963 + }, + { + "epoch": 4.593001841620627, + "grad_norm": 0.23507241904735565, + "learning_rate": 5.897758537901696e-05, + "loss": 1.7496, + "step": 14964 + }, + { + "epoch": 4.593308778391651, + "grad_norm": 0.22354768216609955, + "learning_rate": 5.897269555367365e-05, + "loss": 1.7293, + "step": 14965 + }, + { + "epoch": 4.593615715162676, + "grad_norm": 0.2711353003978729, + "learning_rate": 5.89678056396569e-05, + "loss": 1.8127, + "step": 14966 + }, + { + "epoch": 4.593922651933702, + "grad_norm": 0.30061110854148865, + "learning_rate": 5.8962915637015036e-05, + "loss": 1.7653, + "step": 14967 + }, + { + "epoch": 4.594229588704727, + "grad_norm": 0.24577318131923676, + "learning_rate": 5.895802554579639e-05, + "loss": 1.7888, + "step": 14968 + }, + { + "epoch": 4.5945365254757515, + "grad_norm": 0.25568944215774536, + "learning_rate": 5.895313536604929e-05, + "loss": 1.7912, + "step": 14969 + }, + { + "epoch": 4.594843462246777, + "grad_norm": 0.2710168957710266, + "learning_rate": 5.894824509782206e-05, + "loss": 1.7681, + "step": 14970 + }, + { + "epoch": 4.595150399017802, + "grad_norm": 0.24056777358055115, + "learning_rate": 5.894335474116303e-05, + "loss": 1.7729, + "step": 14971 + }, + { + "epoch": 4.5954573357888275, + "grad_norm": 0.21956710517406464, + "learning_rate": 5.89384642961205e-05, + "loss": 
1.7576, + "step": 14972 + }, + { + "epoch": 4.595764272559853, + "grad_norm": 0.27499106526374817, + "learning_rate": 5.893357376274284e-05, + "loss": 1.7909, + "step": 14973 + }, + { + "epoch": 4.596071209330878, + "grad_norm": 0.28581273555755615, + "learning_rate": 5.8928683141078376e-05, + "loss": 1.7592, + "step": 14974 + }, + { + "epoch": 4.596378146101903, + "grad_norm": 0.23218442499637604, + "learning_rate": 5.892379243117543e-05, + "loss": 1.7142, + "step": 14975 + }, + { + "epoch": 4.596685082872928, + "grad_norm": 0.34015771746635437, + "learning_rate": 5.891890163308234e-05, + "loss": 1.7457, + "step": 14976 + }, + { + "epoch": 4.596992019643953, + "grad_norm": 0.2630012333393097, + "learning_rate": 5.8914010746847435e-05, + "loss": 1.7612, + "step": 14977 + }, + { + "epoch": 4.597298956414979, + "grad_norm": 0.2265843003988266, + "learning_rate": 5.890911977251904e-05, + "loss": 1.7272, + "step": 14978 + }, + { + "epoch": 4.597605893186004, + "grad_norm": 0.22325244545936584, + "learning_rate": 5.8904228710145505e-05, + "loss": 1.7447, + "step": 14979 + }, + { + "epoch": 4.597912829957028, + "grad_norm": 0.23512716591358185, + "learning_rate": 5.889933755977517e-05, + "loss": 1.7123, + "step": 14980 + }, + { + "epoch": 4.598219766728054, + "grad_norm": 0.22534869611263275, + "learning_rate": 5.8894446321456365e-05, + "loss": 1.785, + "step": 14981 + }, + { + "epoch": 4.598526703499079, + "grad_norm": 0.2447836697101593, + "learning_rate": 5.888955499523743e-05, + "loss": 1.7154, + "step": 14982 + }, + { + "epoch": 4.598833640270104, + "grad_norm": 0.2451140582561493, + "learning_rate": 5.88846635811667e-05, + "loss": 1.7494, + "step": 14983 + }, + { + "epoch": 4.59914057704113, + "grad_norm": 0.2253585308790207, + "learning_rate": 5.8879772079292504e-05, + "loss": 1.7591, + "step": 14984 + }, + { + "epoch": 4.599447513812155, + "grad_norm": 0.21714572608470917, + "learning_rate": 5.887488048966322e-05, + "loss": 1.7314, + "step": 14985 + }, + { + 
"epoch": 4.5997544505831796, + "grad_norm": 0.24897411465644836, + "learning_rate": 5.8869988812327145e-05, + "loss": 1.776, + "step": 14986 + }, + { + "epoch": 4.600061387354205, + "grad_norm": 0.22575093805789948, + "learning_rate": 5.8865097047332653e-05, + "loss": 1.7168, + "step": 14987 + }, + { + "epoch": 4.60036832412523, + "grad_norm": 0.22857412695884705, + "learning_rate": 5.886020519472808e-05, + "loss": 1.8262, + "step": 14988 + }, + { + "epoch": 4.600675260896256, + "grad_norm": 0.22741298377513885, + "learning_rate": 5.885531325456174e-05, + "loss": 1.6732, + "step": 14989 + }, + { + "epoch": 4.600982197667281, + "grad_norm": 0.2229645550251007, + "learning_rate": 5.885042122688202e-05, + "loss": 1.7384, + "step": 14990 + }, + { + "epoch": 4.601289134438305, + "grad_norm": 0.22609494626522064, + "learning_rate": 5.884552911173726e-05, + "loss": 1.714, + "step": 14991 + }, + { + "epoch": 4.601596071209331, + "grad_norm": 0.2629149854183197, + "learning_rate": 5.884063690917578e-05, + "loss": 1.8133, + "step": 14992 + }, + { + "epoch": 4.601903007980356, + "grad_norm": 0.220725417137146, + "learning_rate": 5.883574461924597e-05, + "loss": 1.6898, + "step": 14993 + }, + { + "epoch": 4.602209944751381, + "grad_norm": 0.207612082362175, + "learning_rate": 5.8830852241996135e-05, + "loss": 1.7302, + "step": 14994 + }, + { + "epoch": 4.602516881522407, + "grad_norm": 0.22418084740638733, + "learning_rate": 5.8825959777474625e-05, + "loss": 1.763, + "step": 14995 + }, + { + "epoch": 4.602823818293432, + "grad_norm": 0.30606865882873535, + "learning_rate": 5.882106722572983e-05, + "loss": 1.7657, + "step": 14996 + }, + { + "epoch": 4.6031307550644565, + "grad_norm": 0.2947966456413269, + "learning_rate": 5.881617458681008e-05, + "loss": 1.7796, + "step": 14997 + }, + { + "epoch": 4.603437691835482, + "grad_norm": 0.23430216312408447, + "learning_rate": 5.881128186076372e-05, + "loss": 1.78, + "step": 14998 + }, + { + "epoch": 4.603744628606507, + "grad_norm": 
0.28081849217414856, + "learning_rate": 5.880638904763911e-05, + "loss": 1.6791, + "step": 14999 + }, + { + "epoch": 4.6040515653775325, + "grad_norm": 0.25459226965904236, + "learning_rate": 5.88014961474846e-05, + "loss": 1.8064, + "step": 15000 + }, + { + "epoch": 4.604358502148557, + "grad_norm": 0.2358713001012802, + "learning_rate": 5.879660316034854e-05, + "loss": 1.763, + "step": 15001 + }, + { + "epoch": 4.604665438919582, + "grad_norm": 0.32954758405685425, + "learning_rate": 5.879171008627931e-05, + "loss": 1.7462, + "step": 15002 + }, + { + "epoch": 4.604972375690608, + "grad_norm": 0.2588615417480469, + "learning_rate": 5.878681692532523e-05, + "loss": 1.7771, + "step": 15003 + }, + { + "epoch": 4.605279312461633, + "grad_norm": 0.21216195821762085, + "learning_rate": 5.878192367753468e-05, + "loss": 1.7128, + "step": 15004 + }, + { + "epoch": 4.605586249232658, + "grad_norm": 0.26849040389060974, + "learning_rate": 5.8777030342956016e-05, + "loss": 1.7048, + "step": 15005 + }, + { + "epoch": 4.605893186003684, + "grad_norm": 0.22343295812606812, + "learning_rate": 5.877213692163759e-05, + "loss": 1.7695, + "step": 15006 + }, + { + "epoch": 4.606200122774708, + "grad_norm": 0.2794288694858551, + "learning_rate": 5.876724341362776e-05, + "loss": 1.7856, + "step": 15007 + }, + { + "epoch": 4.606507059545733, + "grad_norm": 0.3525427579879761, + "learning_rate": 5.8762349818974905e-05, + "loss": 1.7807, + "step": 15008 + }, + { + "epoch": 4.606813996316759, + "grad_norm": 0.25886499881744385, + "learning_rate": 5.875745613772736e-05, + "loss": 1.7818, + "step": 15009 + }, + { + "epoch": 4.607120933087784, + "grad_norm": 0.24822987616062164, + "learning_rate": 5.8752562369933515e-05, + "loss": 1.7369, + "step": 15010 + }, + { + "epoch": 4.607427869858809, + "grad_norm": 0.26067355275154114, + "learning_rate": 5.874766851564171e-05, + "loss": 1.7056, + "step": 15011 + }, + { + "epoch": 4.607734806629834, + "grad_norm": 0.2869747579097748, + "learning_rate": 
5.874277457490033e-05, + "loss": 1.7284, + "step": 15012 + }, + { + "epoch": 4.608041743400859, + "grad_norm": 0.23153580725193024, + "learning_rate": 5.87378805477577e-05, + "loss": 1.7331, + "step": 15013 + }, + { + "epoch": 4.6083486801718845, + "grad_norm": 0.29307299852371216, + "learning_rate": 5.873298643426223e-05, + "loss": 1.7376, + "step": 15014 + }, + { + "epoch": 4.60865561694291, + "grad_norm": 0.25638771057128906, + "learning_rate": 5.872809223446227e-05, + "loss": 1.7585, + "step": 15015 + }, + { + "epoch": 4.608962553713935, + "grad_norm": 0.2272702306509018, + "learning_rate": 5.872319794840618e-05, + "loss": 1.7482, + "step": 15016 + }, + { + "epoch": 4.6092694904849605, + "grad_norm": 0.2579486072063446, + "learning_rate": 5.8718303576142356e-05, + "loss": 1.778, + "step": 15017 + }, + { + "epoch": 4.609576427255985, + "grad_norm": 0.2216452956199646, + "learning_rate": 5.871340911771912e-05, + "loss": 1.7517, + "step": 15018 + }, + { + "epoch": 4.60988336402701, + "grad_norm": 0.22628961503505707, + "learning_rate": 5.870851457318488e-05, + "loss": 1.7579, + "step": 15019 + }, + { + "epoch": 4.610190300798036, + "grad_norm": 0.31018149852752686, + "learning_rate": 5.8703619942588e-05, + "loss": 1.7911, + "step": 15020 + }, + { + "epoch": 4.610497237569061, + "grad_norm": 0.2618122100830078, + "learning_rate": 5.869872522597683e-05, + "loss": 1.8121, + "step": 15021 + }, + { + "epoch": 4.610804174340086, + "grad_norm": 0.26085740327835083, + "learning_rate": 5.869383042339978e-05, + "loss": 1.7952, + "step": 15022 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.25237780809402466, + "learning_rate": 5.86889355349052e-05, + "loss": 1.7575, + "step": 15023 + }, + { + "epoch": 4.611418047882136, + "grad_norm": 0.27550897002220154, + "learning_rate": 5.868404056054144e-05, + "loss": 1.7816, + "step": 15024 + }, + { + "epoch": 4.611724984653161, + "grad_norm": 0.2458692342042923, + "learning_rate": 5.8679145500356926e-05, + "loss": 1.7783, + 
"step": 15025 + }, + { + "epoch": 4.612031921424187, + "grad_norm": 0.25606176257133484, + "learning_rate": 5.867425035439999e-05, + "loss": 1.7863, + "step": 15026 + }, + { + "epoch": 4.612338858195212, + "grad_norm": 0.3206995725631714, + "learning_rate": 5.866935512271905e-05, + "loss": 1.7468, + "step": 15027 + }, + { + "epoch": 4.612645794966237, + "grad_norm": 0.2754824459552765, + "learning_rate": 5.866445980536245e-05, + "loss": 1.793, + "step": 15028 + }, + { + "epoch": 4.612952731737262, + "grad_norm": 0.25168612599372864, + "learning_rate": 5.865956440237859e-05, + "loss": 1.7252, + "step": 15029 + }, + { + "epoch": 4.613259668508287, + "grad_norm": 0.3226735293865204, + "learning_rate": 5.8654668913815815e-05, + "loss": 1.7291, + "step": 15030 + }, + { + "epoch": 4.6135666052793125, + "grad_norm": 0.2580295503139496, + "learning_rate": 5.864977333972255e-05, + "loss": 1.7622, + "step": 15031 + }, + { + "epoch": 4.613873542050338, + "grad_norm": 0.21486075222492218, + "learning_rate": 5.864487768014715e-05, + "loss": 1.7662, + "step": 15032 + }, + { + "epoch": 4.614180478821363, + "grad_norm": 0.2331690639257431, + "learning_rate": 5.8639981935137996e-05, + "loss": 1.7389, + "step": 15033 + }, + { + "epoch": 4.614487415592388, + "grad_norm": 0.2573511302471161, + "learning_rate": 5.863508610474348e-05, + "loss": 1.7699, + "step": 15034 + }, + { + "epoch": 4.614794352363413, + "grad_norm": 0.2260694056749344, + "learning_rate": 5.863019018901199e-05, + "loss": 1.7784, + "step": 15035 + }, + { + "epoch": 4.615101289134438, + "grad_norm": 0.2283065915107727, + "learning_rate": 5.8625294187991895e-05, + "loss": 1.7061, + "step": 15036 + }, + { + "epoch": 4.615408225905464, + "grad_norm": 0.24772310256958008, + "learning_rate": 5.862039810173159e-05, + "loss": 1.7568, + "step": 15037 + }, + { + "epoch": 4.615715162676489, + "grad_norm": 0.2515513002872467, + "learning_rate": 5.861550193027945e-05, + "loss": 1.7445, + "step": 15038 + }, + { + "epoch": 
4.616022099447514, + "grad_norm": 0.26472151279449463, + "learning_rate": 5.8610605673683885e-05, + "loss": 1.7735, + "step": 15039 + }, + { + "epoch": 4.616329036218539, + "grad_norm": 0.24053528904914856, + "learning_rate": 5.8605709331993254e-05, + "loss": 1.8009, + "step": 15040 + }, + { + "epoch": 4.616635972989564, + "grad_norm": 0.25125381350517273, + "learning_rate": 5.860081290525596e-05, + "loss": 1.7712, + "step": 15041 + }, + { + "epoch": 4.616942909760589, + "grad_norm": 0.23056018352508545, + "learning_rate": 5.85959163935204e-05, + "loss": 1.7684, + "step": 15042 + }, + { + "epoch": 4.617249846531615, + "grad_norm": 0.2533007562160492, + "learning_rate": 5.859101979683494e-05, + "loss": 1.7793, + "step": 15043 + }, + { + "epoch": 4.617556783302639, + "grad_norm": 0.21007375419139862, + "learning_rate": 5.8586123115248e-05, + "loss": 1.7484, + "step": 15044 + }, + { + "epoch": 4.6178637200736645, + "grad_norm": 0.21329566836357117, + "learning_rate": 5.858122634880797e-05, + "loss": 1.7763, + "step": 15045 + }, + { + "epoch": 4.61817065684469, + "grad_norm": 0.2362898588180542, + "learning_rate": 5.857632949756322e-05, + "loss": 1.7484, + "step": 15046 + }, + { + "epoch": 4.618477593615715, + "grad_norm": 0.2168794423341751, + "learning_rate": 5.857143256156214e-05, + "loss": 1.7752, + "step": 15047 + }, + { + "epoch": 4.6187845303867405, + "grad_norm": 0.24761471152305603, + "learning_rate": 5.856653554085316e-05, + "loss": 1.7793, + "step": 15048 + }, + { + "epoch": 4.619091467157766, + "grad_norm": 0.23202158510684967, + "learning_rate": 5.856163843548466e-05, + "loss": 1.6862, + "step": 15049 + }, + { + "epoch": 4.61939840392879, + "grad_norm": 0.23868000507354736, + "learning_rate": 5.855674124550501e-05, + "loss": 1.8075, + "step": 15050 + }, + { + "epoch": 4.619705340699816, + "grad_norm": 0.3063114583492279, + "learning_rate": 5.855184397096265e-05, + "loss": 1.8051, + "step": 15051 + }, + { + "epoch": 4.620012277470841, + "grad_norm": 
0.22672493755817413, + "learning_rate": 5.854694661190594e-05, + "loss": 1.7478, + "step": 15052 + }, + { + "epoch": 4.620319214241866, + "grad_norm": 0.3403559923171997, + "learning_rate": 5.8542049168383296e-05, + "loss": 1.765, + "step": 15053 + }, + { + "epoch": 4.620626151012892, + "grad_norm": 0.33852189779281616, + "learning_rate": 5.853715164044312e-05, + "loss": 1.7602, + "step": 15054 + }, + { + "epoch": 4.620933087783916, + "grad_norm": 0.25166940689086914, + "learning_rate": 5.85322540281338e-05, + "loss": 1.7584, + "step": 15055 + }, + { + "epoch": 4.621240024554941, + "grad_norm": 0.3417987823486328, + "learning_rate": 5.8527356331503757e-05, + "loss": 1.8491, + "step": 15056 + }, + { + "epoch": 4.621546961325967, + "grad_norm": 0.3286994397640228, + "learning_rate": 5.852245855060138e-05, + "loss": 1.7146, + "step": 15057 + }, + { + "epoch": 4.621853898096992, + "grad_norm": 0.24394257366657257, + "learning_rate": 5.851756068547505e-05, + "loss": 1.8762, + "step": 15058 + }, + { + "epoch": 4.622160834868017, + "grad_norm": 0.34945347905158997, + "learning_rate": 5.851266273617321e-05, + "loss": 1.8086, + "step": 15059 + }, + { + "epoch": 4.622467771639043, + "grad_norm": 0.30189210176467896, + "learning_rate": 5.850776470274425e-05, + "loss": 1.7366, + "step": 15060 + }, + { + "epoch": 4.622774708410067, + "grad_norm": 0.24050579965114594, + "learning_rate": 5.850286658523657e-05, + "loss": 1.7599, + "step": 15061 + }, + { + "epoch": 4.6230816451810925, + "grad_norm": 0.33650726079940796, + "learning_rate": 5.849796838369857e-05, + "loss": 1.7343, + "step": 15062 + }, + { + "epoch": 4.623388581952118, + "grad_norm": 0.2855902910232544, + "learning_rate": 5.849307009817868e-05, + "loss": 1.7325, + "step": 15063 + }, + { + "epoch": 4.623695518723143, + "grad_norm": 0.2562592923641205, + "learning_rate": 5.8488171728725275e-05, + "loss": 1.7772, + "step": 15064 + }, + { + "epoch": 4.6240024554941686, + "grad_norm": 0.23494984209537506, + 
"learning_rate": 5.84832732753868e-05, + "loss": 1.7263, + "step": 15065 + }, + { + "epoch": 4.624309392265193, + "grad_norm": 0.23248226940631866, + "learning_rate": 5.847837473821164e-05, + "loss": 1.7441, + "step": 15066 + }, + { + "epoch": 4.624616329036218, + "grad_norm": 0.2291254848241806, + "learning_rate": 5.847347611724821e-05, + "loss": 1.7742, + "step": 15067 + }, + { + "epoch": 4.624923265807244, + "grad_norm": 0.28305280208587646, + "learning_rate": 5.8468577412544925e-05, + "loss": 1.8224, + "step": 15068 + }, + { + "epoch": 4.625230202578269, + "grad_norm": 0.25531691312789917, + "learning_rate": 5.84636786241502e-05, + "loss": 1.7458, + "step": 15069 + }, + { + "epoch": 4.625537139349294, + "grad_norm": 0.2363462746143341, + "learning_rate": 5.845877975211242e-05, + "loss": 1.7977, + "step": 15070 + }, + { + "epoch": 4.62584407612032, + "grad_norm": 0.2707001864910126, + "learning_rate": 5.845388079648004e-05, + "loss": 1.774, + "step": 15071 + }, + { + "epoch": 4.626151012891344, + "grad_norm": 0.22281844913959503, + "learning_rate": 5.844898175730146e-05, + "loss": 1.7888, + "step": 15072 + }, + { + "epoch": 4.6264579496623695, + "grad_norm": 0.24809995293617249, + "learning_rate": 5.8444082634625086e-05, + "loss": 1.7895, + "step": 15073 + }, + { + "epoch": 4.626764886433395, + "grad_norm": 0.2842096984386444, + "learning_rate": 5.843918342849933e-05, + "loss": 1.7323, + "step": 15074 + }, + { + "epoch": 4.62707182320442, + "grad_norm": 0.21343614161014557, + "learning_rate": 5.843428413897261e-05, + "loss": 1.7298, + "step": 15075 + }, + { + "epoch": 4.627378759975445, + "grad_norm": 0.2420526146888733, + "learning_rate": 5.842938476609336e-05, + "loss": 1.778, + "step": 15076 + }, + { + "epoch": 4.62768569674647, + "grad_norm": 0.22202003002166748, + "learning_rate": 5.842448530990999e-05, + "loss": 1.779, + "step": 15077 + }, + { + "epoch": 4.627992633517495, + "grad_norm": 0.26784011721611023, + "learning_rate": 5.841958577047092e-05, + 
"loss": 1.799, + "step": 15078 + }, + { + "epoch": 4.628299570288521, + "grad_norm": 0.3230212926864624, + "learning_rate": 5.841468614782457e-05, + "loss": 1.7789, + "step": 15079 + }, + { + "epoch": 4.628606507059546, + "grad_norm": 0.24062715470790863, + "learning_rate": 5.840978644201935e-05, + "loss": 1.7697, + "step": 15080 + }, + { + "epoch": 4.628913443830571, + "grad_norm": 0.2882130444049835, + "learning_rate": 5.84048866531037e-05, + "loss": 1.7946, + "step": 15081 + }, + { + "epoch": 4.629220380601596, + "grad_norm": 0.3145603537559509, + "learning_rate": 5.839998678112602e-05, + "loss": 1.7116, + "step": 15082 + }, + { + "epoch": 4.629527317372621, + "grad_norm": 0.270997017621994, + "learning_rate": 5.839508682613477e-05, + "loss": 1.8281, + "step": 15083 + }, + { + "epoch": 4.629834254143646, + "grad_norm": 0.27299395203590393, + "learning_rate": 5.839018678817834e-05, + "loss": 1.8233, + "step": 15084 + }, + { + "epoch": 4.630141190914672, + "grad_norm": 0.2684478461742401, + "learning_rate": 5.838528666730517e-05, + "loss": 1.8111, + "step": 15085 + }, + { + "epoch": 4.630448127685697, + "grad_norm": 0.2365201860666275, + "learning_rate": 5.838038646356367e-05, + "loss": 1.7475, + "step": 15086 + }, + { + "epoch": 4.6307550644567215, + "grad_norm": 0.2661258280277252, + "learning_rate": 5.8375486177002305e-05, + "loss": 1.748, + "step": 15087 + }, + { + "epoch": 4.631062001227747, + "grad_norm": 0.2865012586116791, + "learning_rate": 5.8370585807669455e-05, + "loss": 1.7525, + "step": 15088 + }, + { + "epoch": 4.631368937998772, + "grad_norm": 0.2445172518491745, + "learning_rate": 5.836568535561358e-05, + "loss": 1.7278, + "step": 15089 + }, + { + "epoch": 4.6316758747697975, + "grad_norm": 0.28192558884620667, + "learning_rate": 5.8360784820883083e-05, + "loss": 1.7371, + "step": 15090 + }, + { + "epoch": 4.631982811540823, + "grad_norm": 0.38927358388900757, + "learning_rate": 5.835588420352642e-05, + "loss": 1.8088, + "step": 15091 + }, + { + 
"epoch": 4.632289748311848, + "grad_norm": 0.3409229516983032, + "learning_rate": 5.8350983503592025e-05, + "loss": 1.8011, + "step": 15092 + }, + { + "epoch": 4.632596685082873, + "grad_norm": 0.2464994341135025, + "learning_rate": 5.8346082721128294e-05, + "loss": 1.8354, + "step": 15093 + }, + { + "epoch": 4.632903621853898, + "grad_norm": 0.38765814900398254, + "learning_rate": 5.834118185618369e-05, + "loss": 1.7811, + "step": 15094 + }, + { + "epoch": 4.633210558624923, + "grad_norm": 0.42435070872306824, + "learning_rate": 5.833628090880664e-05, + "loss": 1.7855, + "step": 15095 + }, + { + "epoch": 4.633517495395949, + "grad_norm": 0.244876891374588, + "learning_rate": 5.833137987904558e-05, + "loss": 1.7494, + "step": 15096 + }, + { + "epoch": 4.633824432166974, + "grad_norm": 0.30353477597236633, + "learning_rate": 5.8326478766948934e-05, + "loss": 1.7772, + "step": 15097 + }, + { + "epoch": 4.634131368937998, + "grad_norm": 0.38839244842529297, + "learning_rate": 5.8321577572565146e-05, + "loss": 1.7689, + "step": 15098 + }, + { + "epoch": 4.634438305709024, + "grad_norm": 0.357129842042923, + "learning_rate": 5.8316676295942644e-05, + "loss": 1.7777, + "step": 15099 + }, + { + "epoch": 4.634745242480049, + "grad_norm": 0.23458799719810486, + "learning_rate": 5.831177493712988e-05, + "loss": 1.7544, + "step": 15100 + }, + { + "epoch": 4.635052179251074, + "grad_norm": 0.23751308023929596, + "learning_rate": 5.830687349617529e-05, + "loss": 1.7491, + "step": 15101 + }, + { + "epoch": 4.6353591160221, + "grad_norm": 0.31978943943977356, + "learning_rate": 5.83019719731273e-05, + "loss": 1.7439, + "step": 15102 + }, + { + "epoch": 4.635666052793125, + "grad_norm": 0.2751142084598541, + "learning_rate": 5.829707036803438e-05, + "loss": 1.8598, + "step": 15103 + }, + { + "epoch": 4.6359729895641495, + "grad_norm": 0.23670406639575958, + "learning_rate": 5.8292168680944914e-05, + "loss": 1.7629, + "step": 15104 + }, + { + "epoch": 4.636279926335175, + 
"grad_norm": 0.2447349727153778, + "learning_rate": 5.828726691190739e-05, + "loss": 1.7606, + "step": 15105 + }, + { + "epoch": 4.6365868631062, + "grad_norm": 0.2739902436733246, + "learning_rate": 5.828236506097023e-05, + "loss": 1.707, + "step": 15106 + }, + { + "epoch": 4.6368937998772255, + "grad_norm": 0.2050863653421402, + "learning_rate": 5.82774631281819e-05, + "loss": 1.7235, + "step": 15107 + }, + { + "epoch": 4.637200736648251, + "grad_norm": 0.3005560338497162, + "learning_rate": 5.827256111359082e-05, + "loss": 1.7785, + "step": 15108 + }, + { + "epoch": 4.637507673419275, + "grad_norm": 0.27168264985084534, + "learning_rate": 5.8267659017245434e-05, + "loss": 1.7844, + "step": 15109 + }, + { + "epoch": 4.637814610190301, + "grad_norm": 0.2965840995311737, + "learning_rate": 5.82627568391942e-05, + "loss": 1.7631, + "step": 15110 + }, + { + "epoch": 4.638121546961326, + "grad_norm": 0.3114408552646637, + "learning_rate": 5.825785457948556e-05, + "loss": 1.77, + "step": 15111 + }, + { + "epoch": 4.638428483732351, + "grad_norm": 0.2638910114765167, + "learning_rate": 5.825295223816796e-05, + "loss": 1.9183, + "step": 15112 + }, + { + "epoch": 4.638735420503377, + "grad_norm": 0.3293665051460266, + "learning_rate": 5.824804981528986e-05, + "loss": 1.6779, + "step": 15113 + }, + { + "epoch": 4.639042357274402, + "grad_norm": 0.28586456179618835, + "learning_rate": 5.824314731089968e-05, + "loss": 1.7905, + "step": 15114 + }, + { + "epoch": 4.639349294045426, + "grad_norm": 0.2254554182291031, + "learning_rate": 5.8238244725045906e-05, + "loss": 1.7602, + "step": 15115 + }, + { + "epoch": 4.639656230816452, + "grad_norm": 0.2770406901836395, + "learning_rate": 5.823334205777695e-05, + "loss": 1.7789, + "step": 15116 + }, + { + "epoch": 4.639963167587477, + "grad_norm": 0.2867025136947632, + "learning_rate": 5.822843930914129e-05, + "loss": 1.7408, + "step": 15117 + }, + { + "epoch": 4.640270104358502, + "grad_norm": 0.23486989736557007, + 
"learning_rate": 5.822353647918737e-05, + "loss": 1.7489, + "step": 15118 + }, + { + "epoch": 4.640577041129527, + "grad_norm": 0.2274324595928192, + "learning_rate": 5.821863356796367e-05, + "loss": 1.768, + "step": 15119 + }, + { + "epoch": 4.640883977900552, + "grad_norm": 0.25032591819763184, + "learning_rate": 5.821373057551858e-05, + "loss": 1.7602, + "step": 15120 + }, + { + "epoch": 4.6411909146715775, + "grad_norm": 0.22332963347434998, + "learning_rate": 5.820882750190059e-05, + "loss": 1.756, + "step": 15121 + }, + { + "epoch": 4.641497851442603, + "grad_norm": 0.24975591897964478, + "learning_rate": 5.820392434715817e-05, + "loss": 1.6963, + "step": 15122 + }, + { + "epoch": 4.641804788213628, + "grad_norm": 0.27892687916755676, + "learning_rate": 5.819902111133976e-05, + "loss": 1.8295, + "step": 15123 + }, + { + "epoch": 4.6421117249846535, + "grad_norm": 0.23914897441864014, + "learning_rate": 5.819411779449381e-05, + "loss": 1.7636, + "step": 15124 + }, + { + "epoch": 4.642418661755678, + "grad_norm": 0.2349565476179123, + "learning_rate": 5.818921439666879e-05, + "loss": 1.7823, + "step": 15125 + }, + { + "epoch": 4.642725598526703, + "grad_norm": 0.2075800597667694, + "learning_rate": 5.818431091791315e-05, + "loss": 1.7282, + "step": 15126 + }, + { + "epoch": 4.643032535297729, + "grad_norm": 0.19781073927879333, + "learning_rate": 5.817940735827535e-05, + "loss": 1.7598, + "step": 15127 + }, + { + "epoch": 4.643339472068754, + "grad_norm": 0.21997439861297607, + "learning_rate": 5.8174503717803866e-05, + "loss": 1.766, + "step": 15128 + }, + { + "epoch": 4.643646408839779, + "grad_norm": 0.23971444368362427, + "learning_rate": 5.816959999654713e-05, + "loss": 1.7824, + "step": 15129 + }, + { + "epoch": 4.643953345610804, + "grad_norm": 0.23357853293418884, + "learning_rate": 5.816469619455363e-05, + "loss": 1.7353, + "step": 15130 + }, + { + "epoch": 4.644260282381829, + "grad_norm": 0.22030897438526154, + "learning_rate": 5.815979231187181e-05, 
+ "loss": 1.7413, + "step": 15131 + }, + { + "epoch": 4.644567219152854, + "grad_norm": 0.2322571873664856, + "learning_rate": 5.815488834855014e-05, + "loss": 1.7305, + "step": 15132 + }, + { + "epoch": 4.64487415592388, + "grad_norm": 0.25256821513175964, + "learning_rate": 5.814998430463709e-05, + "loss": 1.7533, + "step": 15133 + }, + { + "epoch": 4.645181092694905, + "grad_norm": 0.248504638671875, + "learning_rate": 5.81450801801811e-05, + "loss": 1.7345, + "step": 15134 + }, + { + "epoch": 4.64548802946593, + "grad_norm": 0.22850964963436127, + "learning_rate": 5.8140175975230673e-05, + "loss": 1.8308, + "step": 15135 + }, + { + "epoch": 4.645794966236955, + "grad_norm": 0.3517951965332031, + "learning_rate": 5.813527168983426e-05, + "loss": 1.811, + "step": 15136 + }, + { + "epoch": 4.64610190300798, + "grad_norm": 0.32132068276405334, + "learning_rate": 5.813036732404031e-05, + "loss": 1.7584, + "step": 15137 + }, + { + "epoch": 4.6464088397790055, + "grad_norm": 0.2349396049976349, + "learning_rate": 5.812546287789731e-05, + "loss": 1.7762, + "step": 15138 + }, + { + "epoch": 4.646715776550031, + "grad_norm": 0.23519493639469147, + "learning_rate": 5.812055835145372e-05, + "loss": 1.7428, + "step": 15139 + }, + { + "epoch": 4.647022713321056, + "grad_norm": 0.29277852177619934, + "learning_rate": 5.8115653744758016e-05, + "loss": 1.7599, + "step": 15140 + }, + { + "epoch": 4.647329650092081, + "grad_norm": 0.2347593754529953, + "learning_rate": 5.811074905785867e-05, + "loss": 1.7401, + "step": 15141 + }, + { + "epoch": 4.647636586863106, + "grad_norm": 0.23080264031887054, + "learning_rate": 5.8105844290804147e-05, + "loss": 1.7705, + "step": 15142 + }, + { + "epoch": 4.647943523634131, + "grad_norm": 0.24686801433563232, + "learning_rate": 5.810093944364291e-05, + "loss": 1.7409, + "step": 15143 + }, + { + "epoch": 4.648250460405157, + "grad_norm": 0.24098120629787445, + "learning_rate": 5.809603451642344e-05, + "loss": 1.7893, + "step": 15144 + }, + { 
+ "epoch": 4.648557397176182, + "grad_norm": 0.23020638525485992, + "learning_rate": 5.809112950919422e-05, + "loss": 1.7589, + "step": 15145 + }, + { + "epoch": 4.648864333947207, + "grad_norm": 0.3036736249923706, + "learning_rate": 5.808622442200371e-05, + "loss": 1.7964, + "step": 15146 + }, + { + "epoch": 4.649171270718232, + "grad_norm": 0.2965635657310486, + "learning_rate": 5.808131925490039e-05, + "loss": 1.7986, + "step": 15147 + }, + { + "epoch": 4.649478207489257, + "grad_norm": 0.22241640090942383, + "learning_rate": 5.8076414007932745e-05, + "loss": 1.749, + "step": 15148 + }, + { + "epoch": 4.649785144260282, + "grad_norm": 0.20304246246814728, + "learning_rate": 5.8071508681149246e-05, + "loss": 1.7374, + "step": 15149 + }, + { + "epoch": 4.650092081031308, + "grad_norm": 0.19534410536289215, + "learning_rate": 5.806660327459834e-05, + "loss": 1.7087, + "step": 15150 + }, + { + "epoch": 4.650399017802332, + "grad_norm": 0.2151753008365631, + "learning_rate": 5.806169778832856e-05, + "loss": 1.7409, + "step": 15151 + }, + { + "epoch": 4.650705954573358, + "grad_norm": 0.2180301696062088, + "learning_rate": 5.805679222238836e-05, + "loss": 1.7522, + "step": 15152 + }, + { + "epoch": 4.651012891344383, + "grad_norm": 0.19917607307434082, + "learning_rate": 5.8051886576826205e-05, + "loss": 1.768, + "step": 15153 + }, + { + "epoch": 4.651319828115408, + "grad_norm": 0.2312052994966507, + "learning_rate": 5.804698085169059e-05, + "loss": 1.7799, + "step": 15154 + }, + { + "epoch": 4.651626764886434, + "grad_norm": 0.21541514992713928, + "learning_rate": 5.804207504702999e-05, + "loss": 1.7595, + "step": 15155 + }, + { + "epoch": 4.651933701657459, + "grad_norm": 0.2029450386762619, + "learning_rate": 5.803716916289289e-05, + "loss": 1.7727, + "step": 15156 + }, + { + "epoch": 4.652240638428484, + "grad_norm": 0.21796850860118866, + "learning_rate": 5.8032263199327787e-05, + "loss": 1.7445, + "step": 15157 + }, + { + "epoch": 4.652547575199509, + 
"grad_norm": 0.20309078693389893, + "learning_rate": 5.802735715638314e-05, + "loss": 1.6971, + "step": 15158 + }, + { + "epoch": 4.652854511970534, + "grad_norm": 0.21270112693309784, + "learning_rate": 5.802245103410745e-05, + "loss": 1.7162, + "step": 15159 + }, + { + "epoch": 4.653161448741559, + "grad_norm": 0.25357750058174133, + "learning_rate": 5.8017544832549184e-05, + "loss": 1.7534, + "step": 15160 + }, + { + "epoch": 4.653468385512585, + "grad_norm": 0.24015015363693237, + "learning_rate": 5.8012638551756847e-05, + "loss": 1.7639, + "step": 15161 + }, + { + "epoch": 4.653775322283609, + "grad_norm": 0.20507018268108368, + "learning_rate": 5.800773219177893e-05, + "loss": 1.7293, + "step": 15162 + }, + { + "epoch": 4.6540822590546345, + "grad_norm": 0.23399868607521057, + "learning_rate": 5.800282575266389e-05, + "loss": 1.8286, + "step": 15163 + }, + { + "epoch": 4.65438919582566, + "grad_norm": 0.27126726508140564, + "learning_rate": 5.799791923446025e-05, + "loss": 1.8028, + "step": 15164 + }, + { + "epoch": 4.654696132596685, + "grad_norm": 0.23644569516181946, + "learning_rate": 5.7993012637216494e-05, + "loss": 1.7138, + "step": 15165 + }, + { + "epoch": 4.6550030693677105, + "grad_norm": 0.21557916700839996, + "learning_rate": 5.7988105960981086e-05, + "loss": 1.7703, + "step": 15166 + }, + { + "epoch": 4.655310006138736, + "grad_norm": 0.22030150890350342, + "learning_rate": 5.798319920580254e-05, + "loss": 1.7282, + "step": 15167 + }, + { + "epoch": 4.65561694290976, + "grad_norm": 0.2092939168214798, + "learning_rate": 5.7978292371729325e-05, + "loss": 1.7853, + "step": 15168 + }, + { + "epoch": 4.655923879680786, + "grad_norm": 0.21643707156181335, + "learning_rate": 5.797338545880997e-05, + "loss": 1.7582, + "step": 15169 + }, + { + "epoch": 4.656230816451811, + "grad_norm": 0.3064669668674469, + "learning_rate": 5.796847846709294e-05, + "loss": 1.8139, + "step": 15170 + }, + { + "epoch": 4.656537753222836, + "grad_norm": 0.3060479760169983, 
+ "learning_rate": 5.796357139662674e-05, + "loss": 1.7356, + "step": 15171 + }, + { + "epoch": 4.656844689993862, + "grad_norm": 0.23546656966209412, + "learning_rate": 5.7958664247459835e-05, + "loss": 1.7937, + "step": 15172 + }, + { + "epoch": 4.657151626764886, + "grad_norm": 0.2890888750553131, + "learning_rate": 5.795375701964077e-05, + "loss": 1.7305, + "step": 15173 + }, + { + "epoch": 4.657458563535911, + "grad_norm": 0.27948084473609924, + "learning_rate": 5.794884971321801e-05, + "loss": 1.7428, + "step": 15174 + }, + { + "epoch": 4.657765500306937, + "grad_norm": 0.2354089468717575, + "learning_rate": 5.794394232824007e-05, + "loss": 1.7622, + "step": 15175 + }, + { + "epoch": 4.658072437077962, + "grad_norm": 0.3271159827709198, + "learning_rate": 5.793903486475541e-05, + "loss": 1.7826, + "step": 15176 + }, + { + "epoch": 4.658379373848987, + "grad_norm": 0.3561338782310486, + "learning_rate": 5.793412732281257e-05, + "loss": 1.7698, + "step": 15177 + }, + { + "epoch": 4.658686310620013, + "grad_norm": 0.2913050949573517, + "learning_rate": 5.7929219702460035e-05, + "loss": 1.8156, + "step": 15178 + }, + { + "epoch": 4.658993247391037, + "grad_norm": 0.2345089465379715, + "learning_rate": 5.7924312003746294e-05, + "loss": 1.7859, + "step": 15179 + }, + { + "epoch": 4.6593001841620625, + "grad_norm": 0.3018132150173187, + "learning_rate": 5.7919404226719865e-05, + "loss": 1.7622, + "step": 15180 + }, + { + "epoch": 4.659607120933088, + "grad_norm": 0.29134172201156616, + "learning_rate": 5.791449637142924e-05, + "loss": 1.7287, + "step": 15181 + }, + { + "epoch": 4.659914057704113, + "grad_norm": 0.24126321077346802, + "learning_rate": 5.7909588437922924e-05, + "loss": 1.7969, + "step": 15182 + }, + { + "epoch": 4.6602209944751385, + "grad_norm": 0.27053284645080566, + "learning_rate": 5.7904680426249415e-05, + "loss": 1.7399, + "step": 15183 + }, + { + "epoch": 4.660527931246163, + "grad_norm": 0.2636512219905853, + "learning_rate": 
5.789977233645722e-05, + "loss": 1.7615, + "step": 15184 + }, + { + "epoch": 4.660834868017188, + "grad_norm": 0.2263207584619522, + "learning_rate": 5.789486416859484e-05, + "loss": 1.7668, + "step": 15185 + }, + { + "epoch": 4.661141804788214, + "grad_norm": 0.25387826561927795, + "learning_rate": 5.78899559227108e-05, + "loss": 1.7594, + "step": 15186 + }, + { + "epoch": 4.661448741559239, + "grad_norm": 0.2268977165222168, + "learning_rate": 5.7885047598853596e-05, + "loss": 1.75, + "step": 15187 + }, + { + "epoch": 4.661755678330264, + "grad_norm": 0.29093095660209656, + "learning_rate": 5.788013919707172e-05, + "loss": 1.7291, + "step": 15188 + }, + { + "epoch": 4.66206261510129, + "grad_norm": 0.26578736305236816, + "learning_rate": 5.7875230717413684e-05, + "loss": 1.7276, + "step": 15189 + }, + { + "epoch": 4.662369551872314, + "grad_norm": 0.2548983097076416, + "learning_rate": 5.7870322159928e-05, + "loss": 1.755, + "step": 15190 + }, + { + "epoch": 4.662676488643339, + "grad_norm": 0.2246701419353485, + "learning_rate": 5.7865413524663184e-05, + "loss": 1.751, + "step": 15191 + }, + { + "epoch": 4.662983425414365, + "grad_norm": 0.3069002032279968, + "learning_rate": 5.7860504811667747e-05, + "loss": 1.7522, + "step": 15192 + }, + { + "epoch": 4.66329036218539, + "grad_norm": 0.3081241250038147, + "learning_rate": 5.7855596020990186e-05, + "loss": 1.7152, + "step": 15193 + }, + { + "epoch": 4.6635972989564145, + "grad_norm": 0.29006731510162354, + "learning_rate": 5.7850687152679026e-05, + "loss": 1.8471, + "step": 15194 + }, + { + "epoch": 4.66390423572744, + "grad_norm": 0.24131664633750916, + "learning_rate": 5.7845778206782786e-05, + "loss": 1.763, + "step": 15195 + }, + { + "epoch": 4.664211172498465, + "grad_norm": 0.21808001399040222, + "learning_rate": 5.784086918334994e-05, + "loss": 1.6989, + "step": 15196 + }, + { + "epoch": 4.6645181092694905, + "grad_norm": 0.2413240373134613, + "learning_rate": 5.783596008242904e-05, + "loss": 1.7869, + 
"step": 15197 + }, + { + "epoch": 4.664825046040516, + "grad_norm": 0.23310934007167816, + "learning_rate": 5.7831050904068594e-05, + "loss": 1.8017, + "step": 15198 + }, + { + "epoch": 4.665131982811541, + "grad_norm": 0.2577926814556122, + "learning_rate": 5.7826141648317125e-05, + "loss": 1.6938, + "step": 15199 + }, + { + "epoch": 4.665438919582566, + "grad_norm": 0.22523443400859833, + "learning_rate": 5.782123231522312e-05, + "loss": 1.8104, + "step": 15200 + }, + { + "epoch": 4.665745856353591, + "grad_norm": 0.23603026568889618, + "learning_rate": 5.781632290483512e-05, + "loss": 1.7484, + "step": 15201 + }, + { + "epoch": 4.666052793124616, + "grad_norm": 0.23195989429950714, + "learning_rate": 5.781141341720162e-05, + "loss": 1.7786, + "step": 15202 + }, + { + "epoch": 4.666359729895642, + "grad_norm": 0.21838274598121643, + "learning_rate": 5.780650385237118e-05, + "loss": 1.7509, + "step": 15203 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.26656514406204224, + "learning_rate": 5.780159421039229e-05, + "loss": 1.7875, + "step": 15204 + }, + { + "epoch": 4.666973603437691, + "grad_norm": 0.2293243706226349, + "learning_rate": 5.7796684491313456e-05, + "loss": 1.7518, + "step": 15205 + }, + { + "epoch": 4.667280540208717, + "grad_norm": 0.24190817773342133, + "learning_rate": 5.779177469518323e-05, + "loss": 1.7593, + "step": 15206 + }, + { + "epoch": 4.667587476979742, + "grad_norm": 0.31113871932029724, + "learning_rate": 5.77868648220501e-05, + "loss": 1.7911, + "step": 15207 + }, + { + "epoch": 4.667894413750767, + "grad_norm": 0.2875262498855591, + "learning_rate": 5.778195487196263e-05, + "loss": 1.7871, + "step": 15208 + }, + { + "epoch": 4.668201350521793, + "grad_norm": 0.2172149419784546, + "learning_rate": 5.777704484496931e-05, + "loss": 1.7592, + "step": 15209 + }, + { + "epoch": 4.668508287292818, + "grad_norm": 0.3282458186149597, + "learning_rate": 5.7772134741118675e-05, + "loss": 1.7687, + "step": 15210 + }, + { + "epoch": 
4.6688152240638425, + "grad_norm": 0.36963000893592834, + "learning_rate": 5.7767224560459255e-05, + "loss": 1.812, + "step": 15211 + }, + { + "epoch": 4.669122160834868, + "grad_norm": 0.22387740015983582, + "learning_rate": 5.776231430303957e-05, + "loss": 1.7449, + "step": 15212 + }, + { + "epoch": 4.669429097605893, + "grad_norm": 0.21468734741210938, + "learning_rate": 5.775740396890813e-05, + "loss": 1.716, + "step": 15213 + }, + { + "epoch": 4.6697360343769185, + "grad_norm": 0.2478475719690323, + "learning_rate": 5.7752493558113486e-05, + "loss": 1.7182, + "step": 15214 + }, + { + "epoch": 4.670042971147944, + "grad_norm": 0.20924845337867737, + "learning_rate": 5.774758307070416e-05, + "loss": 1.784, + "step": 15215 + }, + { + "epoch": 4.670349907918968, + "grad_norm": 0.2933209538459778, + "learning_rate": 5.774267250672868e-05, + "loss": 1.8375, + "step": 15216 + }, + { + "epoch": 4.670656844689994, + "grad_norm": 0.2744538486003876, + "learning_rate": 5.7737761866235565e-05, + "loss": 1.7019, + "step": 15217 + }, + { + "epoch": 4.670963781461019, + "grad_norm": 0.20991720259189606, + "learning_rate": 5.773285114927336e-05, + "loss": 1.7189, + "step": 15218 + }, + { + "epoch": 4.671270718232044, + "grad_norm": 0.2873254716396332, + "learning_rate": 5.772794035589057e-05, + "loss": 1.7492, + "step": 15219 + }, + { + "epoch": 4.67157765500307, + "grad_norm": 0.2781519591808319, + "learning_rate": 5.772302948613576e-05, + "loss": 1.7342, + "step": 15220 + }, + { + "epoch": 4.671884591774095, + "grad_norm": 0.23288768529891968, + "learning_rate": 5.7718118540057455e-05, + "loss": 1.7245, + "step": 15221 + }, + { + "epoch": 4.672191528545119, + "grad_norm": 0.40817564725875854, + "learning_rate": 5.771320751770417e-05, + "loss": 1.7659, + "step": 15222 + }, + { + "epoch": 4.672498465316145, + "grad_norm": 0.45521771907806396, + "learning_rate": 5.770829641912444e-05, + "loss": 1.7875, + "step": 15223 + }, + { + "epoch": 4.67280540208717, + "grad_norm": 
0.22353248298168182, + "learning_rate": 5.77033852443668e-05, + "loss": 1.7098, + "step": 15224 + }, + { + "epoch": 4.673112338858195, + "grad_norm": 0.4066791534423828, + "learning_rate": 5.769847399347981e-05, + "loss": 1.7277, + "step": 15225 + }, + { + "epoch": 4.67341927562922, + "grad_norm": 0.4299545884132385, + "learning_rate": 5.769356266651198e-05, + "loss": 1.7777, + "step": 15226 + }, + { + "epoch": 4.673726212400245, + "grad_norm": 0.21037638187408447, + "learning_rate": 5.768865126351186e-05, + "loss": 1.7263, + "step": 15227 + }, + { + "epoch": 4.6740331491712706, + "grad_norm": 0.3390437066555023, + "learning_rate": 5.768373978452798e-05, + "loss": 1.7457, + "step": 15228 + }, + { + "epoch": 4.674340085942296, + "grad_norm": 0.40003323554992676, + "learning_rate": 5.767882822960887e-05, + "loss": 1.8137, + "step": 15229 + }, + { + "epoch": 4.674647022713321, + "grad_norm": 0.2212848961353302, + "learning_rate": 5.767391659880308e-05, + "loss": 1.7131, + "step": 15230 + }, + { + "epoch": 4.6749539594843466, + "grad_norm": 0.30634984374046326, + "learning_rate": 5.766900489215915e-05, + "loss": 1.7775, + "step": 15231 + }, + { + "epoch": 4.675260896255372, + "grad_norm": 0.31412798166275024, + "learning_rate": 5.766409310972563e-05, + "loss": 1.7383, + "step": 15232 + }, + { + "epoch": 4.675567833026396, + "grad_norm": 0.21125225722789764, + "learning_rate": 5.7659181251551045e-05, + "loss": 1.8046, + "step": 15233 + }, + { + "epoch": 4.675874769797422, + "grad_norm": 0.3234494924545288, + "learning_rate": 5.765426931768394e-05, + "loss": 1.7838, + "step": 15234 + }, + { + "epoch": 4.676181706568447, + "grad_norm": 0.2668779194355011, + "learning_rate": 5.764935730817286e-05, + "loss": 1.7464, + "step": 15235 + }, + { + "epoch": 4.676488643339472, + "grad_norm": 0.22423583269119263, + "learning_rate": 5.764444522306633e-05, + "loss": 1.7165, + "step": 15236 + }, + { + "epoch": 4.676795580110497, + "grad_norm": 0.29066675901412964, + "learning_rate": 
5.7639533062412945e-05, + "loss": 1.75, + "step": 15237 + }, + { + "epoch": 4.677102516881522, + "grad_norm": 0.2963598370552063, + "learning_rate": 5.76346208262612e-05, + "loss": 1.8168, + "step": 15238 + }, + { + "epoch": 4.6774094536525475, + "grad_norm": 0.21484358608722687, + "learning_rate": 5.7629708514659655e-05, + "loss": 1.71, + "step": 15239 + }, + { + "epoch": 4.677716390423573, + "grad_norm": 0.20657925307750702, + "learning_rate": 5.762479612765686e-05, + "loss": 1.7239, + "step": 15240 + }, + { + "epoch": 4.678023327194598, + "grad_norm": 0.21336235105991364, + "learning_rate": 5.761988366530136e-05, + "loss": 1.7952, + "step": 15241 + }, + { + "epoch": 4.6783302639656235, + "grad_norm": 0.24156586825847626, + "learning_rate": 5.7614971127641696e-05, + "loss": 1.7709, + "step": 15242 + }, + { + "epoch": 4.678637200736648, + "grad_norm": 0.2633824944496155, + "learning_rate": 5.761005851472643e-05, + "loss": 1.7404, + "step": 15243 + }, + { + "epoch": 4.678944137507673, + "grad_norm": 0.23302829265594482, + "learning_rate": 5.760514582660411e-05, + "loss": 1.7006, + "step": 15244 + }, + { + "epoch": 4.679251074278699, + "grad_norm": 0.22404874861240387, + "learning_rate": 5.7600233063323283e-05, + "loss": 1.7731, + "step": 15245 + }, + { + "epoch": 4.679558011049724, + "grad_norm": 0.23217839002609253, + "learning_rate": 5.7595320224932495e-05, + "loss": 1.7452, + "step": 15246 + }, + { + "epoch": 4.679864947820749, + "grad_norm": 0.23131491243839264, + "learning_rate": 5.7590407311480296e-05, + "loss": 1.7547, + "step": 15247 + }, + { + "epoch": 4.680171884591774, + "grad_norm": 0.21907350420951843, + "learning_rate": 5.7585494323015245e-05, + "loss": 1.7556, + "step": 15248 + }, + { + "epoch": 4.680478821362799, + "grad_norm": 0.22416768968105316, + "learning_rate": 5.7580581259585895e-05, + "loss": 1.7783, + "step": 15249 + }, + { + "epoch": 4.680785758133824, + "grad_norm": 0.20203055441379547, + "learning_rate": 5.75756681212408e-05, + "loss": 
1.7285, + "step": 15250 + }, + { + "epoch": 4.68109269490485, + "grad_norm": 0.27838602662086487, + "learning_rate": 5.75707549080285e-05, + "loss": 1.7489, + "step": 15251 + }, + { + "epoch": 4.681399631675875, + "grad_norm": 0.2415023297071457, + "learning_rate": 5.7565841619997586e-05, + "loss": 1.7453, + "step": 15252 + }, + { + "epoch": 4.6817065684469, + "grad_norm": 0.22986920177936554, + "learning_rate": 5.756092825719658e-05, + "loss": 1.7315, + "step": 15253 + }, + { + "epoch": 4.682013505217925, + "grad_norm": 0.2427850216627121, + "learning_rate": 5.755601481967404e-05, + "loss": 1.772, + "step": 15254 + }, + { + "epoch": 4.68232044198895, + "grad_norm": 0.24556589126586914, + "learning_rate": 5.755110130747854e-05, + "loss": 1.7475, + "step": 15255 + }, + { + "epoch": 4.6826273787599755, + "grad_norm": 0.25252529978752136, + "learning_rate": 5.754618772065864e-05, + "loss": 1.7152, + "step": 15256 + }, + { + "epoch": 4.682934315531001, + "grad_norm": 0.24599005281925201, + "learning_rate": 5.754127405926287e-05, + "loss": 1.7911, + "step": 15257 + }, + { + "epoch": 4.683241252302026, + "grad_norm": 0.18961480259895325, + "learning_rate": 5.7536360323339836e-05, + "loss": 1.681, + "step": 15258 + }, + { + "epoch": 4.683548189073051, + "grad_norm": 0.24372327327728271, + "learning_rate": 5.7531446512938035e-05, + "loss": 1.7771, + "step": 15259 + }, + { + "epoch": 4.683855125844076, + "grad_norm": 0.23239269852638245, + "learning_rate": 5.752653262810609e-05, + "loss": 1.7502, + "step": 15260 + }, + { + "epoch": 4.684162062615101, + "grad_norm": 0.25076135993003845, + "learning_rate": 5.752161866889254e-05, + "loss": 1.7974, + "step": 15261 + }, + { + "epoch": 4.684468999386127, + "grad_norm": 0.2703748941421509, + "learning_rate": 5.7516704635345945e-05, + "loss": 1.7245, + "step": 15262 + }, + { + "epoch": 4.684775936157152, + "grad_norm": 0.19247616827487946, + "learning_rate": 5.751179052751487e-05, + "loss": 1.7105, + "step": 15263 + }, + { + 
"epoch": 4.685082872928177, + "grad_norm": 0.23166817426681519, + "learning_rate": 5.750687634544787e-05, + "loss": 1.8026, + "step": 15264 + }, + { + "epoch": 4.685389809699202, + "grad_norm": 0.22434166073799133, + "learning_rate": 5.7501962089193507e-05, + "loss": 1.7779, + "step": 15265 + }, + { + "epoch": 4.685696746470227, + "grad_norm": 0.190699502825737, + "learning_rate": 5.749704775880037e-05, + "loss": 1.726, + "step": 15266 + }, + { + "epoch": 4.686003683241252, + "grad_norm": 0.22995290160179138, + "learning_rate": 5.749213335431702e-05, + "loss": 1.7495, + "step": 15267 + }, + { + "epoch": 4.686310620012278, + "grad_norm": 0.2712057828903198, + "learning_rate": 5.7487218875792016e-05, + "loss": 1.7862, + "step": 15268 + }, + { + "epoch": 4.686617556783302, + "grad_norm": 0.2524562180042267, + "learning_rate": 5.7482304323273913e-05, + "loss": 1.7092, + "step": 15269 + }, + { + "epoch": 4.6869244935543275, + "grad_norm": 0.23810559511184692, + "learning_rate": 5.747738969681131e-05, + "loss": 1.8049, + "step": 15270 + }, + { + "epoch": 4.687231430325353, + "grad_norm": 0.25521910190582275, + "learning_rate": 5.747247499645275e-05, + "loss": 1.8124, + "step": 15271 + }, + { + "epoch": 4.687538367096378, + "grad_norm": 0.27797845005989075, + "learning_rate": 5.746756022224682e-05, + "loss": 1.7694, + "step": 15272 + }, + { + "epoch": 4.6878453038674035, + "grad_norm": 0.23849260807037354, + "learning_rate": 5.746264537424208e-05, + "loss": 1.7771, + "step": 15273 + }, + { + "epoch": 4.688152240638429, + "grad_norm": 0.24368882179260254, + "learning_rate": 5.74577304524871e-05, + "loss": 1.8143, + "step": 15274 + }, + { + "epoch": 4.688459177409453, + "grad_norm": 0.2712198793888092, + "learning_rate": 5.745281545703045e-05, + "loss": 1.7683, + "step": 15275 + }, + { + "epoch": 4.688766114180479, + "grad_norm": 0.30913081765174866, + "learning_rate": 5.7447900387920716e-05, + "loss": 1.7111, + "step": 15276 + }, + { + "epoch": 4.689073050951504, + 
"grad_norm": 0.22123363614082336, + "learning_rate": 5.744298524520646e-05, + "loss": 1.7466, + "step": 15277 + }, + { + "epoch": 4.689379987722529, + "grad_norm": 0.32836318016052246, + "learning_rate": 5.743807002893628e-05, + "loss": 1.8083, + "step": 15278 + }, + { + "epoch": 4.689686924493555, + "grad_norm": 0.33319979906082153, + "learning_rate": 5.743315473915871e-05, + "loss": 1.7122, + "step": 15279 + }, + { + "epoch": 4.689993861264579, + "grad_norm": 0.252163290977478, + "learning_rate": 5.742823937592236e-05, + "loss": 1.7599, + "step": 15280 + }, + { + "epoch": 4.690300798035604, + "grad_norm": 0.23248571157455444, + "learning_rate": 5.7423323939275797e-05, + "loss": 1.7791, + "step": 15281 + }, + { + "epoch": 4.69060773480663, + "grad_norm": 0.27024057507514954, + "learning_rate": 5.741840842926759e-05, + "loss": 1.7608, + "step": 15282 + }, + { + "epoch": 4.690914671577655, + "grad_norm": 0.21888256072998047, + "learning_rate": 5.7413492845946326e-05, + "loss": 1.7407, + "step": 15283 + }, + { + "epoch": 4.69122160834868, + "grad_norm": 0.2574782073497772, + "learning_rate": 5.740857718936058e-05, + "loss": 1.707, + "step": 15284 + }, + { + "epoch": 4.691528545119706, + "grad_norm": 0.2541569769382477, + "learning_rate": 5.740366145955893e-05, + "loss": 1.7301, + "step": 15285 + }, + { + "epoch": 4.69183548189073, + "grad_norm": 0.23484647274017334, + "learning_rate": 5.7398745656589955e-05, + "loss": 1.772, + "step": 15286 + }, + { + "epoch": 4.6921424186617555, + "grad_norm": 0.2827093005180359, + "learning_rate": 5.739382978050225e-05, + "loss": 1.7745, + "step": 15287 + }, + { + "epoch": 4.692449355432781, + "grad_norm": 0.300387978553772, + "learning_rate": 5.738891383134437e-05, + "loss": 1.7966, + "step": 15288 + }, + { + "epoch": 4.692756292203806, + "grad_norm": 0.2414523959159851, + "learning_rate": 5.7383997809164926e-05, + "loss": 1.7355, + "step": 15289 + }, + { + "epoch": 4.6930632289748315, + "grad_norm": 0.21221841871738434, + 
"learning_rate": 5.737908171401248e-05, + "loss": 1.7935, + "step": 15290 + }, + { + "epoch": 4.693370165745856, + "grad_norm": 0.23488084971904755, + "learning_rate": 5.737416554593563e-05, + "loss": 1.7447, + "step": 15291 + }, + { + "epoch": 4.693677102516881, + "grad_norm": 0.26176631450653076, + "learning_rate": 5.7369249304982954e-05, + "loss": 1.769, + "step": 15292 + }, + { + "epoch": 4.693984039287907, + "grad_norm": 0.23060615360736847, + "learning_rate": 5.736433299120303e-05, + "loss": 1.7344, + "step": 15293 + }, + { + "epoch": 4.694290976058932, + "grad_norm": 0.2536846399307251, + "learning_rate": 5.7359416604644456e-05, + "loss": 1.7862, + "step": 15294 + }, + { + "epoch": 4.694597912829957, + "grad_norm": 0.23221342265605927, + "learning_rate": 5.735450014535581e-05, + "loss": 1.743, + "step": 15295 + }, + { + "epoch": 4.694904849600983, + "grad_norm": 0.25320062041282654, + "learning_rate": 5.734958361338568e-05, + "loss": 1.8001, + "step": 15296 + }, + { + "epoch": 4.695211786372007, + "grad_norm": 0.23132461309432983, + "learning_rate": 5.734466700878267e-05, + "loss": 1.7676, + "step": 15297 + }, + { + "epoch": 4.695518723143032, + "grad_norm": 0.2222728580236435, + "learning_rate": 5.7339750331595346e-05, + "loss": 1.7267, + "step": 15298 + }, + { + "epoch": 4.695825659914058, + "grad_norm": 0.2505118250846863, + "learning_rate": 5.733483358187231e-05, + "loss": 1.7467, + "step": 15299 + }, + { + "epoch": 4.696132596685083, + "grad_norm": 0.23609887063503265, + "learning_rate": 5.732991675966214e-05, + "loss": 1.7319, + "step": 15300 + }, + { + "epoch": 4.696439533456108, + "grad_norm": 0.2939738631248474, + "learning_rate": 5.732499986501345e-05, + "loss": 1.8676, + "step": 15301 + }, + { + "epoch": 4.696746470227133, + "grad_norm": 0.29868564009666443, + "learning_rate": 5.7320082897974814e-05, + "loss": 1.7541, + "step": 15302 + }, + { + "epoch": 4.697053406998158, + "grad_norm": 0.2366383820772171, + "learning_rate": 5.731516585859482e-05, 
+ "loss": 1.7531, + "step": 15303 + }, + { + "epoch": 4.6973603437691835, + "grad_norm": 0.2721317410469055, + "learning_rate": 5.731024874692208e-05, + "loss": 1.7444, + "step": 15304 + }, + { + "epoch": 4.697667280540209, + "grad_norm": 0.24925900995731354, + "learning_rate": 5.730533156300517e-05, + "loss": 1.7716, + "step": 15305 + }, + { + "epoch": 4.697974217311234, + "grad_norm": 0.23012754321098328, + "learning_rate": 5.7300414306892704e-05, + "loss": 1.7211, + "step": 15306 + }, + { + "epoch": 4.6982811540822595, + "grad_norm": 0.21274085342884064, + "learning_rate": 5.7295496978633254e-05, + "loss": 1.7853, + "step": 15307 + }, + { + "epoch": 4.698588090853284, + "grad_norm": 0.21799001097679138, + "learning_rate": 5.729057957827544e-05, + "loss": 1.7505, + "step": 15308 + }, + { + "epoch": 4.698895027624309, + "grad_norm": 0.22365793585777283, + "learning_rate": 5.728566210586783e-05, + "loss": 1.7934, + "step": 15309 + }, + { + "epoch": 4.699201964395335, + "grad_norm": 0.23325085639953613, + "learning_rate": 5.728074456145903e-05, + "loss": 1.7354, + "step": 15310 + }, + { + "epoch": 4.69950890116636, + "grad_norm": 0.2175164669752121, + "learning_rate": 5.7275826945097654e-05, + "loss": 1.7541, + "step": 15311 + }, + { + "epoch": 4.699815837937384, + "grad_norm": 0.24657388031482697, + "learning_rate": 5.727090925683231e-05, + "loss": 1.814, + "step": 15312 + }, + { + "epoch": 4.70012277470841, + "grad_norm": 0.2437550574541092, + "learning_rate": 5.726599149671156e-05, + "loss": 1.7234, + "step": 15313 + }, + { + "epoch": 4.700429711479435, + "grad_norm": 0.21053487062454224, + "learning_rate": 5.726107366478402e-05, + "loss": 1.7788, + "step": 15314 + }, + { + "epoch": 4.7007366482504604, + "grad_norm": 0.2007097452878952, + "learning_rate": 5.725615576109831e-05, + "loss": 1.7453, + "step": 15315 + }, + { + "epoch": 4.701043585021486, + "grad_norm": 0.19331564009189606, + "learning_rate": 5.725123778570299e-05, + "loss": 1.7142, + "step": 15316 + 
}, + { + "epoch": 4.701350521792511, + "grad_norm": 0.24291567504405975, + "learning_rate": 5.7246319738646706e-05, + "loss": 1.8081, + "step": 15317 + }, + { + "epoch": 4.701657458563536, + "grad_norm": 0.21423695981502533, + "learning_rate": 5.724140161997804e-05, + "loss": 1.7021, + "step": 15318 + }, + { + "epoch": 4.701964395334561, + "grad_norm": 0.20857618749141693, + "learning_rate": 5.72364834297456e-05, + "loss": 1.7447, + "step": 15319 + }, + { + "epoch": 4.702271332105586, + "grad_norm": 0.2547401487827301, + "learning_rate": 5.7231565167998e-05, + "loss": 1.7505, + "step": 15320 + }, + { + "epoch": 4.702578268876612, + "grad_norm": 0.2729472219944, + "learning_rate": 5.7226646834783825e-05, + "loss": 1.7974, + "step": 15321 + }, + { + "epoch": 4.702885205647637, + "grad_norm": 0.23258371651172638, + "learning_rate": 5.722172843015169e-05, + "loss": 1.7562, + "step": 15322 + }, + { + "epoch": 4.703192142418661, + "grad_norm": 0.23399893939495087, + "learning_rate": 5.72168099541502e-05, + "loss": 1.7674, + "step": 15323 + }, + { + "epoch": 4.703499079189687, + "grad_norm": 0.2678206264972687, + "learning_rate": 5.721189140682797e-05, + "loss": 1.7331, + "step": 15324 + }, + { + "epoch": 4.703806015960712, + "grad_norm": 0.19472146034240723, + "learning_rate": 5.7206972788233593e-05, + "loss": 1.7003, + "step": 15325 + }, + { + "epoch": 4.704112952731737, + "grad_norm": 0.2199394404888153, + "learning_rate": 5.72020540984157e-05, + "loss": 1.7072, + "step": 15326 + }, + { + "epoch": 4.704419889502763, + "grad_norm": 0.219175323843956, + "learning_rate": 5.719713533742287e-05, + "loss": 1.7591, + "step": 15327 + }, + { + "epoch": 4.704726826273788, + "grad_norm": 0.21127547323703766, + "learning_rate": 5.719221650530374e-05, + "loss": 1.8059, + "step": 15328 + }, + { + "epoch": 4.7050337630448125, + "grad_norm": 0.22189834713935852, + "learning_rate": 5.7187297602106905e-05, + "loss": 1.7529, + "step": 15329 + }, + { + "epoch": 4.705340699815838, + 
"grad_norm": 0.19945195317268372, + "learning_rate": 5.7182378627881e-05, + "loss": 1.7133, + "step": 15330 + }, + { + "epoch": 4.705647636586863, + "grad_norm": 0.2177499681711197, + "learning_rate": 5.7177459582674595e-05, + "loss": 1.7451, + "step": 15331 + }, + { + "epoch": 4.7059545733578885, + "grad_norm": 0.19489440321922302, + "learning_rate": 5.717254046653635e-05, + "loss": 1.7499, + "step": 15332 + }, + { + "epoch": 4.706261510128914, + "grad_norm": 0.21366968750953674, + "learning_rate": 5.716762127951485e-05, + "loss": 1.7683, + "step": 15333 + }, + { + "epoch": 4.706568446899938, + "grad_norm": 0.2894177734851837, + "learning_rate": 5.71627020216587e-05, + "loss": 1.8235, + "step": 15334 + }, + { + "epoch": 4.706875383670964, + "grad_norm": 0.22175677120685577, + "learning_rate": 5.7157782693016534e-05, + "loss": 1.7421, + "step": 15335 + }, + { + "epoch": 4.707182320441989, + "grad_norm": 0.23653541505336761, + "learning_rate": 5.715286329363698e-05, + "loss": 1.6937, + "step": 15336 + }, + { + "epoch": 4.707489257213014, + "grad_norm": 0.3015746772289276, + "learning_rate": 5.714794382356863e-05, + "loss": 1.7159, + "step": 15337 + }, + { + "epoch": 4.70779619398404, + "grad_norm": 0.24045881628990173, + "learning_rate": 5.714302428286011e-05, + "loss": 1.7263, + "step": 15338 + }, + { + "epoch": 4.708103130755065, + "grad_norm": 0.19836920499801636, + "learning_rate": 5.7138104671560035e-05, + "loss": 1.7604, + "step": 15339 + }, + { + "epoch": 4.708410067526089, + "grad_norm": 0.2430238276720047, + "learning_rate": 5.7133184989717036e-05, + "loss": 1.7147, + "step": 15340 + }, + { + "epoch": 4.708717004297115, + "grad_norm": 0.19388417899608612, + "learning_rate": 5.712826523737971e-05, + "loss": 1.7153, + "step": 15341 + }, + { + "epoch": 4.70902394106814, + "grad_norm": 0.19648151099681854, + "learning_rate": 5.7123345414596694e-05, + "loss": 1.7373, + "step": 15342 + }, + { + "epoch": 4.709330877839165, + "grad_norm": 0.20326325297355652, + 
"learning_rate": 5.711842552141661e-05, + "loss": 1.7012, + "step": 15343 + }, + { + "epoch": 4.70963781461019, + "grad_norm": 0.20798304677009583, + "learning_rate": 5.711350555788806e-05, + "loss": 1.7134, + "step": 15344 + }, + { + "epoch": 4.709944751381215, + "grad_norm": 0.29318806529045105, + "learning_rate": 5.7108585524059674e-05, + "loss": 1.7661, + "step": 15345 + }, + { + "epoch": 4.7102516881522405, + "grad_norm": 0.273318350315094, + "learning_rate": 5.710366541998009e-05, + "loss": 1.7329, + "step": 15346 + }, + { + "epoch": 4.710558624923266, + "grad_norm": 0.2306031584739685, + "learning_rate": 5.7098745245697925e-05, + "loss": 1.8152, + "step": 15347 + }, + { + "epoch": 4.710865561694291, + "grad_norm": 0.27630630135536194, + "learning_rate": 5.709382500126179e-05, + "loss": 1.7955, + "step": 15348 + }, + { + "epoch": 4.7111724984653165, + "grad_norm": 0.2366025298833847, + "learning_rate": 5.7088904686720326e-05, + "loss": 1.7943, + "step": 15349 + }, + { + "epoch": 4.711479435236341, + "grad_norm": 0.24196656048297882, + "learning_rate": 5.708398430212215e-05, + "loss": 1.698, + "step": 15350 + }, + { + "epoch": 4.711786372007366, + "grad_norm": 0.2770058512687683, + "learning_rate": 5.707906384751588e-05, + "loss": 1.7618, + "step": 15351 + }, + { + "epoch": 4.712093308778392, + "grad_norm": 0.20432323217391968, + "learning_rate": 5.7074143322950157e-05, + "loss": 1.7422, + "step": 15352 + }, + { + "epoch": 4.712400245549417, + "grad_norm": 0.25543150305747986, + "learning_rate": 5.70692227284736e-05, + "loss": 1.7744, + "step": 15353 + }, + { + "epoch": 4.712707182320442, + "grad_norm": 0.24315913021564484, + "learning_rate": 5.7064302064134855e-05, + "loss": 1.7127, + "step": 15354 + }, + { + "epoch": 4.713014119091467, + "grad_norm": 0.23636099696159363, + "learning_rate": 5.705938132998252e-05, + "loss": 1.7725, + "step": 15355 + }, + { + "epoch": 4.713321055862492, + "grad_norm": 0.26809820532798767, + "learning_rate": 
5.705446052606526e-05, + "loss": 1.8338, + "step": 15356 + }, + { + "epoch": 4.713627992633517, + "grad_norm": 0.24969002604484558, + "learning_rate": 5.704953965243167e-05, + "loss": 1.8225, + "step": 15357 + }, + { + "epoch": 4.713934929404543, + "grad_norm": 0.23189692199230194, + "learning_rate": 5.70446187091304e-05, + "loss": 1.7901, + "step": 15358 + }, + { + "epoch": 4.714241866175568, + "grad_norm": 0.22373750805854797, + "learning_rate": 5.703969769621008e-05, + "loss": 1.6919, + "step": 15359 + }, + { + "epoch": 4.714548802946593, + "grad_norm": 0.23963531851768494, + "learning_rate": 5.703477661371934e-05, + "loss": 1.7806, + "step": 15360 + }, + { + "epoch": 4.714855739717618, + "grad_norm": 0.20365150272846222, + "learning_rate": 5.702985546170683e-05, + "loss": 1.7207, + "step": 15361 + }, + { + "epoch": 4.715162676488643, + "grad_norm": 0.245658278465271, + "learning_rate": 5.702493424022114e-05, + "loss": 1.7589, + "step": 15362 + }, + { + "epoch": 4.7154696132596685, + "grad_norm": 0.22633756697177887, + "learning_rate": 5.702001294931094e-05, + "loss": 1.7893, + "step": 15363 + }, + { + "epoch": 4.715776550030694, + "grad_norm": 0.21587726473808289, + "learning_rate": 5.701509158902487e-05, + "loss": 1.8095, + "step": 15364 + }, + { + "epoch": 4.716083486801719, + "grad_norm": 0.22553963959217072, + "learning_rate": 5.701017015941155e-05, + "loss": 1.7419, + "step": 15365 + }, + { + "epoch": 4.716390423572744, + "grad_norm": 0.2276087999343872, + "learning_rate": 5.700524866051962e-05, + "loss": 1.7052, + "step": 15366 + }, + { + "epoch": 4.716697360343769, + "grad_norm": 0.22236761450767517, + "learning_rate": 5.700032709239771e-05, + "loss": 1.8612, + "step": 15367 + }, + { + "epoch": 4.717004297114794, + "grad_norm": 0.22816185653209686, + "learning_rate": 5.6995405455094465e-05, + "loss": 1.78, + "step": 15368 + }, + { + "epoch": 4.71731123388582, + "grad_norm": 0.21597479283809662, + "learning_rate": 5.6990483748658516e-05, + "loss": 1.8276, 
+ "step": 15369 + }, + { + "epoch": 4.717618170656845, + "grad_norm": 0.22209586203098297, + "learning_rate": 5.6985561973138533e-05, + "loss": 1.74, + "step": 15370 + }, + { + "epoch": 4.71792510742787, + "grad_norm": 0.24249997735023499, + "learning_rate": 5.6980640128583116e-05, + "loss": 1.8035, + "step": 15371 + }, + { + "epoch": 4.718232044198895, + "grad_norm": 0.23326106369495392, + "learning_rate": 5.6975718215040943e-05, + "loss": 1.7969, + "step": 15372 + }, + { + "epoch": 4.71853898096992, + "grad_norm": 0.215044766664505, + "learning_rate": 5.6970796232560596e-05, + "loss": 1.7345, + "step": 15373 + }, + { + "epoch": 4.718845917740945, + "grad_norm": 0.20231883227825165, + "learning_rate": 5.696587418119078e-05, + "loss": 1.7231, + "step": 15374 + }, + { + "epoch": 4.719152854511971, + "grad_norm": 0.2136038839817047, + "learning_rate": 5.696095206098011e-05, + "loss": 1.7421, + "step": 15375 + }, + { + "epoch": 4.719459791282996, + "grad_norm": 0.2662335932254791, + "learning_rate": 5.6956029871977235e-05, + "loss": 1.7518, + "step": 15376 + }, + { + "epoch": 4.7197667280540205, + "grad_norm": 0.25649648904800415, + "learning_rate": 5.6951107614230783e-05, + "loss": 1.8314, + "step": 15377 + }, + { + "epoch": 4.720073664825046, + "grad_norm": 0.21995560824871063, + "learning_rate": 5.6946185287789425e-05, + "loss": 1.7511, + "step": 15378 + }, + { + "epoch": 4.720380601596071, + "grad_norm": 0.3388935923576355, + "learning_rate": 5.694126289270177e-05, + "loss": 1.7975, + "step": 15379 + }, + { + "epoch": 4.7206875383670965, + "grad_norm": 0.32886409759521484, + "learning_rate": 5.693634042901651e-05, + "loss": 1.7153, + "step": 15380 + }, + { + "epoch": 4.720994475138122, + "grad_norm": 0.21727977693080902, + "learning_rate": 5.693141789678226e-05, + "loss": 1.7095, + "step": 15381 + }, + { + "epoch": 4.721301411909147, + "grad_norm": 0.2680833041667938, + "learning_rate": 5.6926495296047675e-05, + "loss": 1.696, + "step": 15382 + }, + { + "epoch": 
4.721608348680172, + "grad_norm": 0.2645499110221863, + "learning_rate": 5.692157262686141e-05, + "loss": 1.6889, + "step": 15383 + }, + { + "epoch": 4.721915285451197, + "grad_norm": 0.20362348854541779, + "learning_rate": 5.69166498892721e-05, + "loss": 1.7303, + "step": 15384 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.24259062111377716, + "learning_rate": 5.691172708332839e-05, + "loss": 1.7684, + "step": 15385 + }, + { + "epoch": 4.722529158993248, + "grad_norm": 0.24204276502132416, + "learning_rate": 5.690680420907897e-05, + "loss": 1.7728, + "step": 15386 + }, + { + "epoch": 4.722836095764272, + "grad_norm": 0.3038320243358612, + "learning_rate": 5.690188126657244e-05, + "loss": 1.7573, + "step": 15387 + }, + { + "epoch": 4.723143032535297, + "grad_norm": 0.24619868397712708, + "learning_rate": 5.689695825585749e-05, + "loss": 1.754, + "step": 15388 + }, + { + "epoch": 4.723449969306323, + "grad_norm": 0.19441325962543488, + "learning_rate": 5.689203517698276e-05, + "loss": 1.726, + "step": 15389 + }, + { + "epoch": 4.723756906077348, + "grad_norm": 0.2874276340007782, + "learning_rate": 5.688711202999688e-05, + "loss": 1.7704, + "step": 15390 + }, + { + "epoch": 4.724063842848373, + "grad_norm": 0.24488390982151031, + "learning_rate": 5.6882188814948535e-05, + "loss": 1.7477, + "step": 15391 + }, + { + "epoch": 4.724370779619399, + "grad_norm": 0.22674018144607544, + "learning_rate": 5.687726553188636e-05, + "loss": 1.7287, + "step": 15392 + }, + { + "epoch": 4.724677716390423, + "grad_norm": 0.2653258442878723, + "learning_rate": 5.687234218085902e-05, + "loss": 1.7415, + "step": 15393 + }, + { + "epoch": 4.7249846531614486, + "grad_norm": 0.20345374941825867, + "learning_rate": 5.686741876191516e-05, + "loss": 1.764, + "step": 15394 + }, + { + "epoch": 4.725291589932474, + "grad_norm": 0.23193977773189545, + "learning_rate": 5.686249527510345e-05, + "loss": 1.7557, + "step": 15395 + }, + { + "epoch": 4.725598526703499, + "grad_norm": 
0.26426708698272705, + "learning_rate": 5.685757172047253e-05, + "loss": 1.7708, + "step": 15396 + }, + { + "epoch": 4.725905463474525, + "grad_norm": 0.21377156674861908, + "learning_rate": 5.685264809807107e-05, + "loss": 1.6921, + "step": 15397 + }, + { + "epoch": 4.726212400245549, + "grad_norm": 0.21628457307815552, + "learning_rate": 5.684772440794773e-05, + "loss": 1.72, + "step": 15398 + }, + { + "epoch": 4.726519337016574, + "grad_norm": 0.19200581312179565, + "learning_rate": 5.684280065015116e-05, + "loss": 1.7311, + "step": 15399 + }, + { + "epoch": 4.7268262737876, + "grad_norm": 0.22227540612220764, + "learning_rate": 5.683787682473003e-05, + "loss": 1.7451, + "step": 15400 + }, + { + "epoch": 4.727133210558625, + "grad_norm": 0.18053604662418365, + "learning_rate": 5.683295293173299e-05, + "loss": 1.6816, + "step": 15401 + }, + { + "epoch": 4.72744014732965, + "grad_norm": 0.19827169179916382, + "learning_rate": 5.682802897120869e-05, + "loss": 1.7315, + "step": 15402 + }, + { + "epoch": 4.727747084100676, + "grad_norm": 0.2768021821975708, + "learning_rate": 5.682310494320582e-05, + "loss": 1.7714, + "step": 15403 + }, + { + "epoch": 4.7280540208717, + "grad_norm": 0.2613474428653717, + "learning_rate": 5.6818180847773027e-05, + "loss": 1.7332, + "step": 15404 + }, + { + "epoch": 4.7283609576427255, + "grad_norm": 0.21546787023544312, + "learning_rate": 5.681325668495898e-05, + "loss": 1.771, + "step": 15405 + }, + { + "epoch": 4.728667894413751, + "grad_norm": 0.24442137777805328, + "learning_rate": 5.680833245481234e-05, + "loss": 1.7296, + "step": 15406 + }, + { + "epoch": 4.728974831184776, + "grad_norm": 0.2622109055519104, + "learning_rate": 5.680340815738175e-05, + "loss": 1.7778, + "step": 15407 + }, + { + "epoch": 4.7292817679558015, + "grad_norm": 0.22379513084888458, + "learning_rate": 5.6798483792715904e-05, + "loss": 1.7953, + "step": 15408 + }, + { + "epoch": 4.729588704726826, + "grad_norm": 0.21901065111160278, + "learning_rate": 
5.679355936086346e-05, + "loss": 1.7287, + "step": 15409 + }, + { + "epoch": 4.729895641497851, + "grad_norm": 0.3023792505264282, + "learning_rate": 5.6788634861873066e-05, + "loss": 1.7851, + "step": 15410 + }, + { + "epoch": 4.730202578268877, + "grad_norm": 0.23882482945919037, + "learning_rate": 5.678371029579342e-05, + "loss": 1.7621, + "step": 15411 + }, + { + "epoch": 4.730509515039902, + "grad_norm": 0.2661043703556061, + "learning_rate": 5.6778785662673175e-05, + "loss": 1.7453, + "step": 15412 + }, + { + "epoch": 4.730816451810927, + "grad_norm": 0.330208957195282, + "learning_rate": 5.677386096256099e-05, + "loss": 1.761, + "step": 15413 + }, + { + "epoch": 4.731123388581953, + "grad_norm": 0.2686570882797241, + "learning_rate": 5.676893619550552e-05, + "loss": 1.7539, + "step": 15414 + }, + { + "epoch": 4.731430325352977, + "grad_norm": 0.24308046698570251, + "learning_rate": 5.676401136155548e-05, + "loss": 1.7345, + "step": 15415 + }, + { + "epoch": 4.731737262124002, + "grad_norm": 0.4137137830257416, + "learning_rate": 5.67590864607595e-05, + "loss": 1.7688, + "step": 15416 + }, + { + "epoch": 4.732044198895028, + "grad_norm": 0.32161539793014526, + "learning_rate": 5.675416149316628e-05, + "loss": 1.7881, + "step": 15417 + }, + { + "epoch": 4.732351135666053, + "grad_norm": 0.2336999475955963, + "learning_rate": 5.674923645882447e-05, + "loss": 1.755, + "step": 15418 + }, + { + "epoch": 4.7326580724370775, + "grad_norm": 0.32781684398651123, + "learning_rate": 5.6744311357782754e-05, + "loss": 1.8062, + "step": 15419 + }, + { + "epoch": 4.732965009208103, + "grad_norm": 0.2475704401731491, + "learning_rate": 5.6739386190089795e-05, + "loss": 1.725, + "step": 15420 + }, + { + "epoch": 4.733271945979128, + "grad_norm": 0.26295650005340576, + "learning_rate": 5.673446095579427e-05, + "loss": 1.7673, + "step": 15421 + }, + { + "epoch": 4.7335788827501535, + "grad_norm": 0.3454873859882355, + "learning_rate": 5.6729535654944864e-05, + "loss": 1.7523, + 
"step": 15422 + }, + { + "epoch": 4.733885819521179, + "grad_norm": 0.2306666374206543, + "learning_rate": 5.672461028759024e-05, + "loss": 1.7085, + "step": 15423 + }, + { + "epoch": 4.734192756292204, + "grad_norm": 0.30825871229171753, + "learning_rate": 5.671968485377908e-05, + "loss": 1.7642, + "step": 15424 + }, + { + "epoch": 4.734499693063229, + "grad_norm": 0.42611342668533325, + "learning_rate": 5.6714759353560045e-05, + "loss": 1.7832, + "step": 15425 + }, + { + "epoch": 4.734806629834254, + "grad_norm": 0.29502514004707336, + "learning_rate": 5.670983378698182e-05, + "loss": 1.8153, + "step": 15426 + }, + { + "epoch": 4.735113566605279, + "grad_norm": 0.28416305780410767, + "learning_rate": 5.6704908154093096e-05, + "loss": 1.756, + "step": 15427 + }, + { + "epoch": 4.735420503376305, + "grad_norm": 0.43111103773117065, + "learning_rate": 5.6699982454942534e-05, + "loss": 1.7797, + "step": 15428 + }, + { + "epoch": 4.73572744014733, + "grad_norm": 0.27667397260665894, + "learning_rate": 5.669505668957882e-05, + "loss": 1.7316, + "step": 15429 + }, + { + "epoch": 4.736034376918354, + "grad_norm": 0.3045295774936676, + "learning_rate": 5.669013085805063e-05, + "loss": 1.7591, + "step": 15430 + }, + { + "epoch": 4.73634131368938, + "grad_norm": 0.4494635760784149, + "learning_rate": 5.6685204960406635e-05, + "loss": 1.8295, + "step": 15431 + }, + { + "epoch": 4.736648250460405, + "grad_norm": 0.2951449453830719, + "learning_rate": 5.6680278996695544e-05, + "loss": 1.7857, + "step": 15432 + }, + { + "epoch": 4.73695518723143, + "grad_norm": 0.2714167535305023, + "learning_rate": 5.6675352966966014e-05, + "loss": 1.816, + "step": 15433 + }, + { + "epoch": 4.737262124002456, + "grad_norm": 0.32701000571250916, + "learning_rate": 5.667042687126673e-05, + "loss": 1.7637, + "step": 15434 + }, + { + "epoch": 4.737569060773481, + "grad_norm": 0.2466556429862976, + "learning_rate": 5.666550070964638e-05, + "loss": 1.7805, + "step": 15435 + }, + { + "epoch": 
4.7378759975445055, + "grad_norm": 0.3283855617046356, + "learning_rate": 5.666057448215365e-05, + "loss": 1.786, + "step": 15436 + }, + { + "epoch": 4.738182934315531, + "grad_norm": 0.35860660672187805, + "learning_rate": 5.6655648188837205e-05, + "loss": 1.8309, + "step": 15437 + }, + { + "epoch": 4.738489871086556, + "grad_norm": 0.22293898463249207, + "learning_rate": 5.665072182974576e-05, + "loss": 1.7317, + "step": 15438 + }, + { + "epoch": 4.7387968078575815, + "grad_norm": 0.3155089020729065, + "learning_rate": 5.664579540492798e-05, + "loss": 1.7202, + "step": 15439 + }, + { + "epoch": 4.739103744628607, + "grad_norm": 0.28723904490470886, + "learning_rate": 5.6640868914432566e-05, + "loss": 1.7788, + "step": 15440 + }, + { + "epoch": 4.739410681399631, + "grad_norm": 0.2461984008550644, + "learning_rate": 5.6635942358308183e-05, + "loss": 1.8504, + "step": 15441 + }, + { + "epoch": 4.739717618170657, + "grad_norm": 0.2503122091293335, + "learning_rate": 5.663101573660351e-05, + "loss": 1.7375, + "step": 15442 + }, + { + "epoch": 4.740024554941682, + "grad_norm": 0.24925372004508972, + "learning_rate": 5.662608904936727e-05, + "loss": 1.7152, + "step": 15443 + }, + { + "epoch": 4.740331491712707, + "grad_norm": 0.2734573483467102, + "learning_rate": 5.662116229664813e-05, + "loss": 1.7476, + "step": 15444 + }, + { + "epoch": 4.740638428483733, + "grad_norm": 0.38122060894966125, + "learning_rate": 5.661623547849479e-05, + "loss": 1.7682, + "step": 15445 + }, + { + "epoch": 4.740945365254758, + "grad_norm": 0.3786417245864868, + "learning_rate": 5.661130859495593e-05, + "loss": 1.7446, + "step": 15446 + }, + { + "epoch": 4.741252302025782, + "grad_norm": 0.22618255019187927, + "learning_rate": 5.6606381646080244e-05, + "loss": 1.7427, + "step": 15447 + }, + { + "epoch": 4.741559238796808, + "grad_norm": 0.3000899851322174, + "learning_rate": 5.6601454631916405e-05, + "loss": 1.7087, + "step": 15448 + }, + { + "epoch": 4.741866175567833, + "grad_norm": 
0.36542513966560364, + "learning_rate": 5.659652755251315e-05, + "loss": 1.7985, + "step": 15449 + }, + { + "epoch": 4.742173112338858, + "grad_norm": 0.23550496995449066, + "learning_rate": 5.659160040791912e-05, + "loss": 1.8163, + "step": 15450 + }, + { + "epoch": 4.742480049109884, + "grad_norm": 0.25615251064300537, + "learning_rate": 5.658667319818305e-05, + "loss": 1.7372, + "step": 15451 + }, + { + "epoch": 4.742786985880908, + "grad_norm": 0.28744083642959595, + "learning_rate": 5.6581745923353615e-05, + "loss": 1.7193, + "step": 15452 + }, + { + "epoch": 4.7430939226519335, + "grad_norm": 0.2500229775905609, + "learning_rate": 5.65768185834795e-05, + "loss": 1.7263, + "step": 15453 + }, + { + "epoch": 4.743400859422959, + "grad_norm": 0.21520425379276276, + "learning_rate": 5.6571891178609394e-05, + "loss": 1.7337, + "step": 15454 + }, + { + "epoch": 4.743707796193984, + "grad_norm": 0.212506502866745, + "learning_rate": 5.656696370879202e-05, + "loss": 1.7672, + "step": 15455 + }, + { + "epoch": 4.7440147329650095, + "grad_norm": 0.21143417060375214, + "learning_rate": 5.656203617407607e-05, + "loss": 1.7189, + "step": 15456 + }, + { + "epoch": 4.744321669736035, + "grad_norm": 0.18320922553539276, + "learning_rate": 5.6557108574510243e-05, + "loss": 1.7521, + "step": 15457 + }, + { + "epoch": 4.744628606507059, + "grad_norm": 0.19202999770641327, + "learning_rate": 5.655218091014321e-05, + "loss": 1.6756, + "step": 15458 + }, + { + "epoch": 4.744935543278085, + "grad_norm": 0.2152331918478012, + "learning_rate": 5.654725318102367e-05, + "loss": 1.7653, + "step": 15459 + }, + { + "epoch": 4.74524248004911, + "grad_norm": 0.24565903842449188, + "learning_rate": 5.6542325387200354e-05, + "loss": 1.7654, + "step": 15460 + }, + { + "epoch": 4.745549416820135, + "grad_norm": 0.2504819333553314, + "learning_rate": 5.653739752872195e-05, + "loss": 1.7073, + "step": 15461 + }, + { + "epoch": 4.74585635359116, + "grad_norm": 0.19258706271648407, + 
"learning_rate": 5.653246960563714e-05, + "loss": 1.7106, + "step": 15462 + }, + { + "epoch": 4.746163290362185, + "grad_norm": 0.22961968183517456, + "learning_rate": 5.652754161799465e-05, + "loss": 1.7868, + "step": 15463 + }, + { + "epoch": 4.74647022713321, + "grad_norm": 0.2763231098651886, + "learning_rate": 5.652261356584315e-05, + "loss": 1.7714, + "step": 15464 + }, + { + "epoch": 4.746777163904236, + "grad_norm": 0.23866096138954163, + "learning_rate": 5.651768544923136e-05, + "loss": 1.7537, + "step": 15465 + }, + { + "epoch": 4.747084100675261, + "grad_norm": 0.21851976215839386, + "learning_rate": 5.6512757268207997e-05, + "loss": 1.8109, + "step": 15466 + }, + { + "epoch": 4.747391037446286, + "grad_norm": 0.22249393165111542, + "learning_rate": 5.6507829022821745e-05, + "loss": 1.7357, + "step": 15467 + }, + { + "epoch": 4.747697974217311, + "grad_norm": 0.20202289521694183, + "learning_rate": 5.650290071312131e-05, + "loss": 1.7867, + "step": 15468 + }, + { + "epoch": 4.748004910988336, + "grad_norm": 0.20618727803230286, + "learning_rate": 5.649797233915539e-05, + "loss": 1.6904, + "step": 15469 + }, + { + "epoch": 4.7483118477593615, + "grad_norm": 0.25609052181243896, + "learning_rate": 5.649304390097272e-05, + "loss": 1.7287, + "step": 15470 + }, + { + "epoch": 4.748618784530387, + "grad_norm": 0.22966544330120087, + "learning_rate": 5.648811539862195e-05, + "loss": 1.7384, + "step": 15471 + }, + { + "epoch": 4.748925721301412, + "grad_norm": 0.24070143699645996, + "learning_rate": 5.6483186832151856e-05, + "loss": 1.7625, + "step": 15472 + }, + { + "epoch": 4.749232658072437, + "grad_norm": 0.22642426192760468, + "learning_rate": 5.647825820161109e-05, + "loss": 1.7291, + "step": 15473 + }, + { + "epoch": 4.749539594843462, + "grad_norm": 0.23255646228790283, + "learning_rate": 5.64733295070484e-05, + "loss": 1.8076, + "step": 15474 + }, + { + "epoch": 4.749846531614487, + "grad_norm": 0.20902042090892792, + "learning_rate": 
5.646840074851246e-05, + "loss": 1.6627, + "step": 15475 + }, + { + "epoch": 4.750153468385513, + "grad_norm": 0.21608836948871613, + "learning_rate": 5.646347192605198e-05, + "loss": 1.7458, + "step": 15476 + }, + { + "epoch": 4.750460405156538, + "grad_norm": 0.22368495166301727, + "learning_rate": 5.6458543039715694e-05, + "loss": 1.7601, + "step": 15477 + }, + { + "epoch": 4.750767341927563, + "grad_norm": 0.30586308240890503, + "learning_rate": 5.645361408955231e-05, + "loss": 1.8389, + "step": 15478 + }, + { + "epoch": 4.751074278698588, + "grad_norm": 0.25122150778770447, + "learning_rate": 5.644868507561052e-05, + "loss": 1.7509, + "step": 15479 + }, + { + "epoch": 4.751381215469613, + "grad_norm": 0.28435763716697693, + "learning_rate": 5.644375599793904e-05, + "loss": 1.7723, + "step": 15480 + }, + { + "epoch": 4.7516881522406385, + "grad_norm": 0.3111409842967987, + "learning_rate": 5.643882685658659e-05, + "loss": 1.7973, + "step": 15481 + }, + { + "epoch": 4.751995089011664, + "grad_norm": 0.3108380138874054, + "learning_rate": 5.6433897651601874e-05, + "loss": 1.8126, + "step": 15482 + }, + { + "epoch": 4.752302025782689, + "grad_norm": 0.25894731283187866, + "learning_rate": 5.642896838303362e-05, + "loss": 1.7849, + "step": 15483 + }, + { + "epoch": 4.752608962553714, + "grad_norm": 0.39321839809417725, + "learning_rate": 5.642403905093052e-05, + "loss": 1.7583, + "step": 15484 + }, + { + "epoch": 4.752915899324739, + "grad_norm": 0.3206121027469635, + "learning_rate": 5.6419109655341315e-05, + "loss": 1.8061, + "step": 15485 + }, + { + "epoch": 4.753222836095764, + "grad_norm": 0.2817624807357788, + "learning_rate": 5.64141801963147e-05, + "loss": 1.8252, + "step": 15486 + }, + { + "epoch": 4.75352977286679, + "grad_norm": 0.3344736397266388, + "learning_rate": 5.6409250673899405e-05, + "loss": 1.6975, + "step": 15487 + }, + { + "epoch": 4.753836709637815, + "grad_norm": 0.21873882412910461, + "learning_rate": 5.640432108814413e-05, + "loss": 
1.7126, + "step": 15488 + }, + { + "epoch": 4.75414364640884, + "grad_norm": 0.3317199945449829, + "learning_rate": 5.639939143909758e-05, + "loss": 1.7826, + "step": 15489 + }, + { + "epoch": 4.754450583179865, + "grad_norm": 0.34901630878448486, + "learning_rate": 5.639446172680854e-05, + "loss": 1.7411, + "step": 15490 + }, + { + "epoch": 4.75475751995089, + "grad_norm": 0.24015867710113525, + "learning_rate": 5.6389531951325645e-05, + "loss": 1.7514, + "step": 15491 + }, + { + "epoch": 4.755064456721915, + "grad_norm": 0.28364554047584534, + "learning_rate": 5.6384602112697674e-05, + "loss": 1.7569, + "step": 15492 + }, + { + "epoch": 4.755371393492941, + "grad_norm": 0.3561246693134308, + "learning_rate": 5.637967221097329e-05, + "loss": 1.7212, + "step": 15493 + }, + { + "epoch": 4.755678330263965, + "grad_norm": 0.3383684456348419, + "learning_rate": 5.637474224620126e-05, + "loss": 1.6866, + "step": 15494 + }, + { + "epoch": 4.7559852670349905, + "grad_norm": 0.2399235963821411, + "learning_rate": 5.63698122184303e-05, + "loss": 1.7609, + "step": 15495 + }, + { + "epoch": 4.756292203806016, + "grad_norm": 0.38559645414352417, + "learning_rate": 5.636488212770912e-05, + "loss": 1.7509, + "step": 15496 + }, + { + "epoch": 4.756599140577041, + "grad_norm": 0.365005224943161, + "learning_rate": 5.635995197408645e-05, + "loss": 1.7894, + "step": 15497 + }, + { + "epoch": 4.7569060773480665, + "grad_norm": 0.21254757046699524, + "learning_rate": 5.635502175761099e-05, + "loss": 1.6969, + "step": 15498 + }, + { + "epoch": 4.757213014119092, + "grad_norm": 0.42865821719169617, + "learning_rate": 5.635009147833149e-05, + "loss": 1.7989, + "step": 15499 + }, + { + "epoch": 4.757519950890116, + "grad_norm": 0.35717228055000305, + "learning_rate": 5.634516113629665e-05, + "loss": 1.7338, + "step": 15500 + }, + { + "epoch": 4.757826887661142, + "grad_norm": 0.21582463383674622, + "learning_rate": 5.634023073155523e-05, + "loss": 1.7429, + "step": 15501 + }, + { + 
"epoch": 4.758133824432167, + "grad_norm": 0.3376842141151428, + "learning_rate": 5.633530026415592e-05, + "loss": 1.7703, + "step": 15502 + }, + { + "epoch": 4.758440761203192, + "grad_norm": 0.2760981023311615, + "learning_rate": 5.633036973414747e-05, + "loss": 1.7389, + "step": 15503 + }, + { + "epoch": 4.758747697974218, + "grad_norm": 0.3808997571468353, + "learning_rate": 5.63254391415786e-05, + "loss": 1.7513, + "step": 15504 + }, + { + "epoch": 4.759054634745242, + "grad_norm": 0.5152496695518494, + "learning_rate": 5.6320508486498014e-05, + "loss": 1.7376, + "step": 15505 + }, + { + "epoch": 4.759361571516267, + "grad_norm": 0.33983346819877625, + "learning_rate": 5.6315577768954464e-05, + "loss": 1.7209, + "step": 15506 + }, + { + "epoch": 4.759668508287293, + "grad_norm": 0.27064043283462524, + "learning_rate": 5.631064698899669e-05, + "loss": 1.7808, + "step": 15507 + }, + { + "epoch": 4.759975445058318, + "grad_norm": 0.3659237027168274, + "learning_rate": 5.630571614667339e-05, + "loss": 1.7706, + "step": 15508 + }, + { + "epoch": 4.760282381829343, + "grad_norm": 0.246379554271698, + "learning_rate": 5.63007852420333e-05, + "loss": 1.7425, + "step": 15509 + }, + { + "epoch": 4.760589318600369, + "grad_norm": 0.2683795392513275, + "learning_rate": 5.629585427512518e-05, + "loss": 1.7332, + "step": 15510 + }, + { + "epoch": 4.760896255371393, + "grad_norm": 0.32626205682754517, + "learning_rate": 5.6290923245997704e-05, + "loss": 1.786, + "step": 15511 + }, + { + "epoch": 4.7612031921424185, + "grad_norm": 0.23723098635673523, + "learning_rate": 5.6285992154699666e-05, + "loss": 1.7305, + "step": 15512 + }, + { + "epoch": 4.761510128913444, + "grad_norm": 0.26316091418266296, + "learning_rate": 5.628106100127976e-05, + "loss": 1.7804, + "step": 15513 + }, + { + "epoch": 4.761817065684469, + "grad_norm": 0.24376356601715088, + "learning_rate": 5.6276129785786726e-05, + "loss": 1.738, + "step": 15514 + }, + { + "epoch": 4.7621240024554945, + 
"grad_norm": 0.27778422832489014, + "learning_rate": 5.627119850826931e-05, + "loss": 1.7444, + "step": 15515 + }, + { + "epoch": 4.762430939226519, + "grad_norm": 0.3134306073188782, + "learning_rate": 5.6266267168776224e-05, + "loss": 1.7696, + "step": 15516 + }, + { + "epoch": 4.762737875997544, + "grad_norm": 0.2354283481836319, + "learning_rate": 5.6261335767356195e-05, + "loss": 1.799, + "step": 15517 + }, + { + "epoch": 4.76304481276857, + "grad_norm": 0.26902756094932556, + "learning_rate": 5.6256404304058e-05, + "loss": 1.7091, + "step": 15518 + }, + { + "epoch": 4.763351749539595, + "grad_norm": 0.2760716676712036, + "learning_rate": 5.6251472778930345e-05, + "loss": 1.742, + "step": 15519 + }, + { + "epoch": 4.76365868631062, + "grad_norm": 0.2138829231262207, + "learning_rate": 5.624654119202197e-05, + "loss": 1.7093, + "step": 15520 + }, + { + "epoch": 4.763965623081646, + "grad_norm": 0.31404614448547363, + "learning_rate": 5.624160954338162e-05, + "loss": 1.7467, + "step": 15521 + }, + { + "epoch": 4.76427255985267, + "grad_norm": 0.24810083210468292, + "learning_rate": 5.623667783305803e-05, + "loss": 1.745, + "step": 15522 + }, + { + "epoch": 4.764579496623695, + "grad_norm": 0.23674242198467255, + "learning_rate": 5.6231746061099913e-05, + "loss": 1.7662, + "step": 15523 + }, + { + "epoch": 4.764886433394721, + "grad_norm": 0.264230877161026, + "learning_rate": 5.622681422755606e-05, + "loss": 1.7627, + "step": 15524 + }, + { + "epoch": 4.765193370165746, + "grad_norm": 0.2982041537761688, + "learning_rate": 5.6221882332475165e-05, + "loss": 1.7558, + "step": 15525 + }, + { + "epoch": 4.765500306936771, + "grad_norm": 0.29215967655181885, + "learning_rate": 5.6216950375905975e-05, + "loss": 1.7981, + "step": 15526 + }, + { + "epoch": 4.765807243707796, + "grad_norm": 0.20014487206935883, + "learning_rate": 5.6212018357897244e-05, + "loss": 1.7113, + "step": 15527 + }, + { + "epoch": 4.766114180478821, + "grad_norm": 0.22359825670719147, + 
"learning_rate": 5.620708627849769e-05, + "loss": 1.7356, + "step": 15528 + }, + { + "epoch": 4.7664211172498465, + "grad_norm": 0.2254783809185028, + "learning_rate": 5.620215413775609e-05, + "loss": 1.7397, + "step": 15529 + }, + { + "epoch": 4.766728054020872, + "grad_norm": 0.2827560305595398, + "learning_rate": 5.619722193572117e-05, + "loss": 1.732, + "step": 15530 + }, + { + "epoch": 4.767034990791897, + "grad_norm": 0.22591307759284973, + "learning_rate": 5.619228967244165e-05, + "loss": 1.7713, + "step": 15531 + }, + { + "epoch": 4.7673419275629225, + "grad_norm": 0.25872737169265747, + "learning_rate": 5.618735734796632e-05, + "loss": 1.7291, + "step": 15532 + }, + { + "epoch": 4.767648864333947, + "grad_norm": 0.24515275657176971, + "learning_rate": 5.6182424962343884e-05, + "loss": 1.8079, + "step": 15533 + }, + { + "epoch": 4.767955801104972, + "grad_norm": 0.2456643134355545, + "learning_rate": 5.617749251562309e-05, + "loss": 1.7082, + "step": 15534 + }, + { + "epoch": 4.768262737875998, + "grad_norm": 0.21684220433235168, + "learning_rate": 5.6172560007852716e-05, + "loss": 1.7563, + "step": 15535 + }, + { + "epoch": 4.768569674647023, + "grad_norm": 0.2141445428133011, + "learning_rate": 5.616762743908147e-05, + "loss": 1.7115, + "step": 15536 + }, + { + "epoch": 4.768876611418047, + "grad_norm": 0.22502638399600983, + "learning_rate": 5.616269480935812e-05, + "loss": 1.723, + "step": 15537 + }, + { + "epoch": 4.769183548189073, + "grad_norm": 0.23387989401817322, + "learning_rate": 5.6157762118731416e-05, + "loss": 1.7775, + "step": 15538 + }, + { + "epoch": 4.769490484960098, + "grad_norm": 0.19615057110786438, + "learning_rate": 5.6152829367250096e-05, + "loss": 1.7696, + "step": 15539 + }, + { + "epoch": 4.769797421731123, + "grad_norm": 0.2408154010772705, + "learning_rate": 5.614789655496289e-05, + "loss": 1.7758, + "step": 15540 + }, + { + "epoch": 4.770104358502149, + "grad_norm": 0.20994634926319122, + "learning_rate": 
5.614296368191859e-05, + "loss": 1.6935, + "step": 15541 + }, + { + "epoch": 4.770411295273174, + "grad_norm": 0.24135129153728485, + "learning_rate": 5.613803074816591e-05, + "loss": 1.7644, + "step": 15542 + }, + { + "epoch": 4.7707182320441985, + "grad_norm": 0.2380143105983734, + "learning_rate": 5.6133097753753625e-05, + "loss": 1.741, + "step": 15543 + }, + { + "epoch": 4.771025168815224, + "grad_norm": 0.30300623178482056, + "learning_rate": 5.6128164698730465e-05, + "loss": 1.7935, + "step": 15544 + }, + { + "epoch": 4.771332105586249, + "grad_norm": 0.2620760500431061, + "learning_rate": 5.612323158314519e-05, + "loss": 1.7436, + "step": 15545 + }, + { + "epoch": 4.7716390423572745, + "grad_norm": 0.3791491389274597, + "learning_rate": 5.6118298407046544e-05, + "loss": 1.7503, + "step": 15546 + }, + { + "epoch": 4.7719459791283, + "grad_norm": 0.3830909729003906, + "learning_rate": 5.61133651704833e-05, + "loss": 1.7651, + "step": 15547 + }, + { + "epoch": 4.772252915899324, + "grad_norm": 0.26680612564086914, + "learning_rate": 5.610843187350419e-05, + "loss": 1.8075, + "step": 15548 + }, + { + "epoch": 4.77255985267035, + "grad_norm": 0.38018953800201416, + "learning_rate": 5.610349851615798e-05, + "loss": 1.8301, + "step": 15549 + }, + { + "epoch": 4.772866789441375, + "grad_norm": 0.4514484107494354, + "learning_rate": 5.6098565098493414e-05, + "loss": 1.7709, + "step": 15550 + }, + { + "epoch": 4.7731737262124, + "grad_norm": 0.28267863392829895, + "learning_rate": 5.6093631620559254e-05, + "loss": 1.8087, + "step": 15551 + }, + { + "epoch": 4.773480662983426, + "grad_norm": 0.22541162371635437, + "learning_rate": 5.6088698082404256e-05, + "loss": 1.7457, + "step": 15552 + }, + { + "epoch": 4.773787599754451, + "grad_norm": 0.3012544512748718, + "learning_rate": 5.608376448407718e-05, + "loss": 1.7454, + "step": 15553 + }, + { + "epoch": 4.774094536525475, + "grad_norm": 0.2460169941186905, + "learning_rate": 5.607883082562677e-05, + "loss": 1.8237, + 
"step": 15554 + }, + { + "epoch": 4.774401473296501, + "grad_norm": 0.2918507158756256, + "learning_rate": 5.6073897107101804e-05, + "loss": 1.7416, + "step": 15555 + }, + { + "epoch": 4.774708410067526, + "grad_norm": 0.3104710280895233, + "learning_rate": 5.6068963328551016e-05, + "loss": 1.8162, + "step": 15556 + }, + { + "epoch": 4.7750153468385514, + "grad_norm": 0.2576459050178528, + "learning_rate": 5.606402949002317e-05, + "loss": 1.7732, + "step": 15557 + }, + { + "epoch": 4.775322283609577, + "grad_norm": 0.2373739629983902, + "learning_rate": 5.605909559156706e-05, + "loss": 1.7812, + "step": 15558 + }, + { + "epoch": 4.775629220380601, + "grad_norm": 0.30436694622039795, + "learning_rate": 5.6054161633231385e-05, + "loss": 1.7606, + "step": 15559 + }, + { + "epoch": 4.775936157151627, + "grad_norm": 0.3058558702468872, + "learning_rate": 5.604922761506495e-05, + "loss": 1.8384, + "step": 15560 + }, + { + "epoch": 4.776243093922652, + "grad_norm": 0.26421624422073364, + "learning_rate": 5.6044293537116496e-05, + "loss": 1.8041, + "step": 15561 + }, + { + "epoch": 4.776550030693677, + "grad_norm": 0.4945085346698761, + "learning_rate": 5.603935939943479e-05, + "loss": 1.7522, + "step": 15562 + }, + { + "epoch": 4.776856967464703, + "grad_norm": 0.41049134731292725, + "learning_rate": 5.6034425202068595e-05, + "loss": 1.7471, + "step": 15563 + }, + { + "epoch": 4.777163904235728, + "grad_norm": 0.22972853481769562, + "learning_rate": 5.602949094506668e-05, + "loss": 1.7041, + "step": 15564 + }, + { + "epoch": 4.777470841006752, + "grad_norm": 0.37373700737953186, + "learning_rate": 5.6024556628477785e-05, + "loss": 1.7811, + "step": 15565 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.3603375554084778, + "learning_rate": 5.6019622252350714e-05, + "loss": 1.8396, + "step": 15566 + }, + { + "epoch": 4.778084714548803, + "grad_norm": 0.2085956335067749, + "learning_rate": 5.601468781673419e-05, + "loss": 1.7453, + "step": 15567 + }, + { + "epoch": 
4.778391651319828, + "grad_norm": 0.28871405124664307, + "learning_rate": 5.6009753321677e-05, + "loss": 1.7135, + "step": 15568 + }, + { + "epoch": 4.778698588090853, + "grad_norm": 0.2378411591053009, + "learning_rate": 5.600481876722791e-05, + "loss": 1.77, + "step": 15569 + }, + { + "epoch": 4.779005524861878, + "grad_norm": 0.2902696430683136, + "learning_rate": 5.599988415343567e-05, + "loss": 1.7416, + "step": 15570 + }, + { + "epoch": 4.7793124616329035, + "grad_norm": 0.36155447363853455, + "learning_rate": 5.5994949480349066e-05, + "loss": 1.7095, + "step": 15571 + }, + { + "epoch": 4.779619398403929, + "grad_norm": 0.24867403507232666, + "learning_rate": 5.599001474801686e-05, + "loss": 1.8063, + "step": 15572 + }, + { + "epoch": 4.779926335174954, + "grad_norm": 0.24853186309337616, + "learning_rate": 5.5985079956487815e-05, + "loss": 1.7537, + "step": 15573 + }, + { + "epoch": 4.7802332719459795, + "grad_norm": 0.31984636187553406, + "learning_rate": 5.598014510581071e-05, + "loss": 1.7888, + "step": 15574 + }, + { + "epoch": 4.780540208717004, + "grad_norm": 0.23907123506069183, + "learning_rate": 5.597521019603429e-05, + "loss": 1.7157, + "step": 15575 + }, + { + "epoch": 4.780847145488029, + "grad_norm": 0.25759413838386536, + "learning_rate": 5.597027522720736e-05, + "loss": 1.7579, + "step": 15576 + }, + { + "epoch": 4.781154082259055, + "grad_norm": 0.34123921394348145, + "learning_rate": 5.5965340199378654e-05, + "loss": 1.838, + "step": 15577 + }, + { + "epoch": 4.78146101903008, + "grad_norm": 0.2769980728626251, + "learning_rate": 5.596040511259697e-05, + "loss": 1.7889, + "step": 15578 + }, + { + "epoch": 4.781767955801105, + "grad_norm": 0.21936915814876556, + "learning_rate": 5.5955469966911066e-05, + "loss": 1.7434, + "step": 15579 + }, + { + "epoch": 4.78207489257213, + "grad_norm": 0.27583181858062744, + "learning_rate": 5.59505347623697e-05, + "loss": 1.7229, + "step": 15580 + }, + { + "epoch": 4.782381829343155, + "grad_norm": 
0.24246171116828918, + "learning_rate": 5.594559949902168e-05, + "loss": 1.7368, + "step": 15581 + }, + { + "epoch": 4.78268876611418, + "grad_norm": 0.22705630958080292, + "learning_rate": 5.594066417691576e-05, + "loss": 1.7261, + "step": 15582 + }, + { + "epoch": 4.782995702885206, + "grad_norm": 0.23308728635311127, + "learning_rate": 5.593572879610072e-05, + "loss": 1.7451, + "step": 15583 + }, + { + "epoch": 4.783302639656231, + "grad_norm": 0.21654267609119415, + "learning_rate": 5.5930793356625324e-05, + "loss": 1.7133, + "step": 15584 + }, + { + "epoch": 4.783609576427256, + "grad_norm": 0.22884133458137512, + "learning_rate": 5.5925857858538347e-05, + "loss": 1.6899, + "step": 15585 + }, + { + "epoch": 4.783916513198281, + "grad_norm": 0.2396838665008545, + "learning_rate": 5.5920922301888555e-05, + "loss": 1.7837, + "step": 15586 + }, + { + "epoch": 4.784223449969306, + "grad_norm": 0.22941450774669647, + "learning_rate": 5.5915986686724765e-05, + "loss": 1.7443, + "step": 15587 + }, + { + "epoch": 4.7845303867403315, + "grad_norm": 0.23992502689361572, + "learning_rate": 5.591105101309572e-05, + "loss": 1.8054, + "step": 15588 + }, + { + "epoch": 4.784837323511357, + "grad_norm": 0.2540588974952698, + "learning_rate": 5.59061152810502e-05, + "loss": 1.855, + "step": 15589 + }, + { + "epoch": 4.785144260282382, + "grad_norm": 0.22691720724105835, + "learning_rate": 5.590117949063699e-05, + "loss": 1.7441, + "step": 15590 + }, + { + "epoch": 4.785451197053407, + "grad_norm": 0.23691289126873016, + "learning_rate": 5.5896243641904864e-05, + "loss": 1.8156, + "step": 15591 + }, + { + "epoch": 4.785758133824432, + "grad_norm": 0.2749332785606384, + "learning_rate": 5.589130773490261e-05, + "loss": 1.8157, + "step": 15592 + }, + { + "epoch": 4.786065070595457, + "grad_norm": 0.2435624748468399, + "learning_rate": 5.588637176967899e-05, + "loss": 1.7473, + "step": 15593 + }, + { + "epoch": 4.786372007366483, + "grad_norm": 0.22931383550167084, + 
"learning_rate": 5.5881435746282795e-05, + "loss": 1.7652, + "step": 15594 + }, + { + "epoch": 4.786678944137508, + "grad_norm": 0.23916593194007874, + "learning_rate": 5.587649966476282e-05, + "loss": 1.7415, + "step": 15595 + }, + { + "epoch": 4.786985880908533, + "grad_norm": 0.23483172059059143, + "learning_rate": 5.5871563525167814e-05, + "loss": 1.7308, + "step": 15596 + }, + { + "epoch": 4.787292817679558, + "grad_norm": 0.24850021302700043, + "learning_rate": 5.586662732754656e-05, + "loss": 1.8294, + "step": 15597 + }, + { + "epoch": 4.787599754450583, + "grad_norm": 0.2439260333776474, + "learning_rate": 5.586169107194788e-05, + "loss": 1.7599, + "step": 15598 + }, + { + "epoch": 4.787906691221608, + "grad_norm": 0.22379007935523987, + "learning_rate": 5.585675475842054e-05, + "loss": 1.7278, + "step": 15599 + }, + { + "epoch": 4.788213627992634, + "grad_norm": 0.2633908689022064, + "learning_rate": 5.58518183870133e-05, + "loss": 1.7318, + "step": 15600 + }, + { + "epoch": 4.788520564763659, + "grad_norm": 0.20992474257946014, + "learning_rate": 5.584688195777497e-05, + "loss": 1.7003, + "step": 15601 + }, + { + "epoch": 4.7888275015346835, + "grad_norm": 0.2460084706544876, + "learning_rate": 5.584194547075432e-05, + "loss": 1.78, + "step": 15602 + }, + { + "epoch": 4.789134438305709, + "grad_norm": 0.23955418169498444, + "learning_rate": 5.583700892600013e-05, + "loss": 1.7953, + "step": 15603 + }, + { + "epoch": 4.789441375076734, + "grad_norm": 0.2495713233947754, + "learning_rate": 5.583207232356121e-05, + "loss": 1.7874, + "step": 15604 + }, + { + "epoch": 4.7897483118477595, + "grad_norm": 0.22878028452396393, + "learning_rate": 5.5827135663486344e-05, + "loss": 1.7961, + "step": 15605 + }, + { + "epoch": 4.790055248618785, + "grad_norm": 0.2299363762140274, + "learning_rate": 5.582219894582429e-05, + "loss": 1.7497, + "step": 15606 + }, + { + "epoch": 4.79036218538981, + "grad_norm": 0.22896108031272888, + "learning_rate": 5.5817262170623865e-05, 
+ "loss": 1.7543, + "step": 15607 + }, + { + "epoch": 4.790669122160835, + "grad_norm": 0.2150495946407318, + "learning_rate": 5.581232533793383e-05, + "loss": 1.8034, + "step": 15608 + }, + { + "epoch": 4.79097605893186, + "grad_norm": 0.21317999064922333, + "learning_rate": 5.580738844780301e-05, + "loss": 1.7482, + "step": 15609 + }, + { + "epoch": 4.791282995702885, + "grad_norm": 0.21904391050338745, + "learning_rate": 5.580245150028016e-05, + "loss": 1.7647, + "step": 15610 + }, + { + "epoch": 4.791589932473911, + "grad_norm": 0.2026481032371521, + "learning_rate": 5.5797514495414095e-05, + "loss": 1.6997, + "step": 15611 + }, + { + "epoch": 4.791896869244935, + "grad_norm": 0.22508487105369568, + "learning_rate": 5.579257743325359e-05, + "loss": 1.8258, + "step": 15612 + }, + { + "epoch": 4.79220380601596, + "grad_norm": 0.2801211178302765, + "learning_rate": 5.5787640313847435e-05, + "loss": 1.6991, + "step": 15613 + }, + { + "epoch": 4.792510742786986, + "grad_norm": 0.2696724236011505, + "learning_rate": 5.578270313724442e-05, + "loss": 1.7339, + "step": 15614 + }, + { + "epoch": 4.792817679558011, + "grad_norm": 0.2909143269062042, + "learning_rate": 5.577776590349334e-05, + "loss": 1.8481, + "step": 15615 + }, + { + "epoch": 4.793124616329036, + "grad_norm": 0.21682757139205933, + "learning_rate": 5.5772828612643005e-05, + "loss": 1.759, + "step": 15616 + }, + { + "epoch": 4.793431553100062, + "grad_norm": 0.23074059188365936, + "learning_rate": 5.576789126474219e-05, + "loss": 1.7652, + "step": 15617 + }, + { + "epoch": 4.793738489871086, + "grad_norm": 0.24018999934196472, + "learning_rate": 5.576295385983969e-05, + "loss": 1.7986, + "step": 15618 + }, + { + "epoch": 4.7940454266421115, + "grad_norm": 0.23987948894500732, + "learning_rate": 5.575801639798431e-05, + "loss": 1.779, + "step": 15619 + }, + { + "epoch": 4.794352363413137, + "grad_norm": 0.2138533890247345, + "learning_rate": 5.575307887922482e-05, + "loss": 1.7097, + "step": 15620 + }, + { 
+ "epoch": 4.794659300184162, + "grad_norm": 0.1995106190443039, + "learning_rate": 5.5748141303610044e-05, + "loss": 1.6924, + "step": 15621 + }, + { + "epoch": 4.7949662369551875, + "grad_norm": 0.23547641932964325, + "learning_rate": 5.574320367118877e-05, + "loss": 1.8492, + "step": 15622 + }, + { + "epoch": 4.795273173726212, + "grad_norm": 0.22931239008903503, + "learning_rate": 5.5738265982009794e-05, + "loss": 1.8054, + "step": 15623 + }, + { + "epoch": 4.795580110497237, + "grad_norm": 0.19957222044467926, + "learning_rate": 5.573332823612191e-05, + "loss": 1.7464, + "step": 15624 + }, + { + "epoch": 4.795887047268263, + "grad_norm": 0.1990327090024948, + "learning_rate": 5.5728390433573905e-05, + "loss": 1.7438, + "step": 15625 + }, + { + "epoch": 4.796193984039288, + "grad_norm": 0.22276802361011505, + "learning_rate": 5.572345257441459e-05, + "loss": 1.7674, + "step": 15626 + }, + { + "epoch": 4.796500920810313, + "grad_norm": 0.2109617441892624, + "learning_rate": 5.571851465869277e-05, + "loss": 1.7577, + "step": 15627 + }, + { + "epoch": 4.796807857581339, + "grad_norm": 0.22917217016220093, + "learning_rate": 5.5713576686457234e-05, + "loss": 1.7478, + "step": 15628 + }, + { + "epoch": 4.797114794352363, + "grad_norm": 0.21016938984394073, + "learning_rate": 5.570863865775678e-05, + "loss": 1.8078, + "step": 15629 + }, + { + "epoch": 4.797421731123388, + "grad_norm": 0.22478216886520386, + "learning_rate": 5.5703700572640215e-05, + "loss": 1.7621, + "step": 15630 + }, + { + "epoch": 4.797728667894414, + "grad_norm": 0.26899904012680054, + "learning_rate": 5.569876243115634e-05, + "loss": 1.8065, + "step": 15631 + }, + { + "epoch": 4.798035604665439, + "grad_norm": 0.23187808692455292, + "learning_rate": 5.569382423335394e-05, + "loss": 1.7337, + "step": 15632 + }, + { + "epoch": 4.798342541436464, + "grad_norm": 0.2264855057001114, + "learning_rate": 5.568888597928185e-05, + "loss": 1.7879, + "step": 15633 + }, + { + "epoch": 4.798649478207489, + 
"grad_norm": 0.244137242436409, + "learning_rate": 5.568394766898886e-05, + "loss": 1.8307, + "step": 15634 + }, + { + "epoch": 4.798956414978514, + "grad_norm": 0.2400583177804947, + "learning_rate": 5.5679009302523744e-05, + "loss": 1.76, + "step": 15635 + }, + { + "epoch": 4.7992633517495396, + "grad_norm": 0.2324059158563614, + "learning_rate": 5.5674070879935347e-05, + "loss": 1.7594, + "step": 15636 + }, + { + "epoch": 4.799570288520565, + "grad_norm": 0.21753786504268646, + "learning_rate": 5.566913240127244e-05, + "loss": 1.7568, + "step": 15637 + }, + { + "epoch": 4.79987722529159, + "grad_norm": 0.21557624638080597, + "learning_rate": 5.566419386658386e-05, + "loss": 1.7733, + "step": 15638 + }, + { + "epoch": 4.800184162062616, + "grad_norm": 0.22795113921165466, + "learning_rate": 5.565925527591839e-05, + "loss": 1.7624, + "step": 15639 + }, + { + "epoch": 4.80049109883364, + "grad_norm": 0.23035180568695068, + "learning_rate": 5.565431662932484e-05, + "loss": 1.7436, + "step": 15640 + }, + { + "epoch": 4.800798035604665, + "grad_norm": 0.2569425404071808, + "learning_rate": 5.564937792685203e-05, + "loss": 1.7027, + "step": 15641 + }, + { + "epoch": 4.801104972375691, + "grad_norm": 0.20544980466365814, + "learning_rate": 5.564443916854875e-05, + "loss": 1.7125, + "step": 15642 + }, + { + "epoch": 4.801411909146716, + "grad_norm": 0.25040850043296814, + "learning_rate": 5.5639500354463815e-05, + "loss": 1.7646, + "step": 15643 + }, + { + "epoch": 4.8017188459177405, + "grad_norm": 0.1991344839334488, + "learning_rate": 5.563456148464602e-05, + "loss": 1.7206, + "step": 15644 + }, + { + "epoch": 4.802025782688766, + "grad_norm": 0.236537903547287, + "learning_rate": 5.56296225591442e-05, + "loss": 1.7288, + "step": 15645 + }, + { + "epoch": 4.802332719459791, + "grad_norm": 0.253619521856308, + "learning_rate": 5.562468357800714e-05, + "loss": 1.7347, + "step": 15646 + }, + { + "epoch": 4.8026396562308165, + "grad_norm": 0.22038741409778595, + 
"learning_rate": 5.561974454128367e-05, + "loss": 1.7854, + "step": 15647 + }, + { + "epoch": 4.802946593001842, + "grad_norm": 0.24848157167434692, + "learning_rate": 5.5614805449022576e-05, + "loss": 1.6904, + "step": 15648 + }, + { + "epoch": 4.803253529772867, + "grad_norm": 0.28735271096229553, + "learning_rate": 5.56098663012727e-05, + "loss": 1.7476, + "step": 15649 + }, + { + "epoch": 4.803560466543892, + "grad_norm": 0.2658432722091675, + "learning_rate": 5.5604927098082825e-05, + "loss": 1.7314, + "step": 15650 + }, + { + "epoch": 4.803867403314917, + "grad_norm": 0.20409154891967773, + "learning_rate": 5.559998783950179e-05, + "loss": 1.7698, + "step": 15651 + }, + { + "epoch": 4.804174340085942, + "grad_norm": 0.21932728588581085, + "learning_rate": 5.5595048525578384e-05, + "loss": 1.7808, + "step": 15652 + }, + { + "epoch": 4.804481276856968, + "grad_norm": 0.2549879848957062, + "learning_rate": 5.559010915636143e-05, + "loss": 1.8294, + "step": 15653 + }, + { + "epoch": 4.804788213627993, + "grad_norm": 0.2002289742231369, + "learning_rate": 5.5585169731899736e-05, + "loss": 1.732, + "step": 15654 + }, + { + "epoch": 4.805095150399017, + "grad_norm": 0.19988931715488434, + "learning_rate": 5.558023025224212e-05, + "loss": 1.7482, + "step": 15655 + }, + { + "epoch": 4.805402087170043, + "grad_norm": 0.21265259385108948, + "learning_rate": 5.55752907174374e-05, + "loss": 1.8003, + "step": 15656 + }, + { + "epoch": 4.805709023941068, + "grad_norm": 0.22365640103816986, + "learning_rate": 5.5570351127534395e-05, + "loss": 1.7536, + "step": 15657 + }, + { + "epoch": 4.806015960712093, + "grad_norm": 0.25516408681869507, + "learning_rate": 5.556541148258192e-05, + "loss": 1.7648, + "step": 15658 + }, + { + "epoch": 4.806322897483119, + "grad_norm": 0.24870765209197998, + "learning_rate": 5.5560471782628775e-05, + "loss": 1.7793, + "step": 15659 + }, + { + "epoch": 4.806629834254144, + "grad_norm": 0.22119416296482086, + "learning_rate": 
5.555553202772379e-05, + "loss": 1.7464, + "step": 15660 + }, + { + "epoch": 4.8069367710251685, + "grad_norm": 0.2781904637813568, + "learning_rate": 5.555059221791579e-05, + "loss": 1.7537, + "step": 15661 + }, + { + "epoch": 4.807243707796194, + "grad_norm": 0.2433774471282959, + "learning_rate": 5.5545652353253574e-05, + "loss": 1.74, + "step": 15662 + }, + { + "epoch": 4.807550644567219, + "grad_norm": 0.19932180643081665, + "learning_rate": 5.554071243378598e-05, + "loss": 1.75, + "step": 15663 + }, + { + "epoch": 4.8078575813382445, + "grad_norm": 0.2428865283727646, + "learning_rate": 5.553577245956182e-05, + "loss": 1.7198, + "step": 15664 + }, + { + "epoch": 4.80816451810927, + "grad_norm": 0.2914198338985443, + "learning_rate": 5.553083243062991e-05, + "loss": 1.7544, + "step": 15665 + }, + { + "epoch": 4.808471454880294, + "grad_norm": 0.2274291068315506, + "learning_rate": 5.5525892347039056e-05, + "loss": 1.8213, + "step": 15666 + }, + { + "epoch": 4.80877839165132, + "grad_norm": 0.23662471771240234, + "learning_rate": 5.552095220883811e-05, + "loss": 1.8025, + "step": 15667 + }, + { + "epoch": 4.809085328422345, + "grad_norm": 0.23062555491924286, + "learning_rate": 5.551601201607587e-05, + "loss": 1.7109, + "step": 15668 + }, + { + "epoch": 4.80939226519337, + "grad_norm": 0.19986943900585175, + "learning_rate": 5.551107176880117e-05, + "loss": 1.7442, + "step": 15669 + }, + { + "epoch": 4.809699201964396, + "grad_norm": 0.2545560300350189, + "learning_rate": 5.5506131467062836e-05, + "loss": 1.7609, + "step": 15670 + }, + { + "epoch": 4.810006138735421, + "grad_norm": 0.253296434879303, + "learning_rate": 5.550119111090968e-05, + "loss": 1.7307, + "step": 15671 + }, + { + "epoch": 4.810313075506445, + "grad_norm": 0.19617940485477448, + "learning_rate": 5.549625070039052e-05, + "loss": 1.7507, + "step": 15672 + }, + { + "epoch": 4.810620012277471, + "grad_norm": 0.2525297999382019, + "learning_rate": 5.5491310235554193e-05, + "loss": 1.8021, + 
"step": 15673 + }, + { + "epoch": 4.810926949048496, + "grad_norm": 0.20537389814853668, + "learning_rate": 5.548636971644953e-05, + "loss": 1.7432, + "step": 15674 + }, + { + "epoch": 4.811233885819521, + "grad_norm": 0.19924211502075195, + "learning_rate": 5.548142914312533e-05, + "loss": 1.7741, + "step": 15675 + }, + { + "epoch": 4.811540822590547, + "grad_norm": 0.21121448278427124, + "learning_rate": 5.547648851563046e-05, + "loss": 1.7198, + "step": 15676 + }, + { + "epoch": 4.811847759361571, + "grad_norm": 0.23504914343357086, + "learning_rate": 5.547154783401369e-05, + "loss": 1.7173, + "step": 15677 + }, + { + "epoch": 4.8121546961325965, + "grad_norm": 0.2362392097711563, + "learning_rate": 5.54666070983239e-05, + "loss": 1.7752, + "step": 15678 + }, + { + "epoch": 4.812461632903622, + "grad_norm": 0.2524966895580292, + "learning_rate": 5.5461666308609886e-05, + "loss": 1.7943, + "step": 15679 + }, + { + "epoch": 4.812768569674647, + "grad_norm": 0.2250952422618866, + "learning_rate": 5.5456725464920476e-05, + "loss": 1.7606, + "step": 15680 + }, + { + "epoch": 4.8130755064456725, + "grad_norm": 0.21753156185150146, + "learning_rate": 5.5451784567304524e-05, + "loss": 1.7846, + "step": 15681 + }, + { + "epoch": 4.813382443216698, + "grad_norm": 0.220795676112175, + "learning_rate": 5.5446843615810825e-05, + "loss": 1.7422, + "step": 15682 + }, + { + "epoch": 4.813689379987722, + "grad_norm": 0.23597733676433563, + "learning_rate": 5.544190261048823e-05, + "loss": 1.7818, + "step": 15683 + }, + { + "epoch": 4.813996316758748, + "grad_norm": 0.2625976502895355, + "learning_rate": 5.543696155138557e-05, + "loss": 1.7796, + "step": 15684 + }, + { + "epoch": 4.814303253529773, + "grad_norm": 0.20515871047973633, + "learning_rate": 5.5432020438551656e-05, + "loss": 1.7096, + "step": 15685 + }, + { + "epoch": 4.814610190300798, + "grad_norm": 0.19353924691677094, + "learning_rate": 5.542707927203536e-05, + "loss": 1.7541, + "step": 15686 + }, + { + "epoch": 
4.814917127071823, + "grad_norm": 0.21998172998428345, + "learning_rate": 5.5422138051885454e-05, + "loss": 1.7696, + "step": 15687 + }, + { + "epoch": 4.815224063842848, + "grad_norm": 0.27576857805252075, + "learning_rate": 5.5417196778150816e-05, + "loss": 1.7491, + "step": 15688 + }, + { + "epoch": 4.815531000613873, + "grad_norm": 0.28202036023139954, + "learning_rate": 5.5412255450880254e-05, + "loss": 1.8615, + "step": 15689 + }, + { + "epoch": 4.815837937384899, + "grad_norm": 0.29632845520973206, + "learning_rate": 5.540731407012263e-05, + "loss": 1.7698, + "step": 15690 + }, + { + "epoch": 4.816144874155924, + "grad_norm": 0.35393890738487244, + "learning_rate": 5.540237263592675e-05, + "loss": 1.7924, + "step": 15691 + }, + { + "epoch": 4.816451810926949, + "grad_norm": 0.23756493628025055, + "learning_rate": 5.5397431148341447e-05, + "loss": 1.8301, + "step": 15692 + }, + { + "epoch": 4.816758747697974, + "grad_norm": 0.310153603553772, + "learning_rate": 5.53924896074156e-05, + "loss": 1.8162, + "step": 15693 + }, + { + "epoch": 4.817065684468999, + "grad_norm": 0.3355565369129181, + "learning_rate": 5.538754801319797e-05, + "loss": 1.7738, + "step": 15694 + }, + { + "epoch": 4.8173726212400245, + "grad_norm": 0.2360079288482666, + "learning_rate": 5.5382606365737446e-05, + "loss": 1.6883, + "step": 15695 + }, + { + "epoch": 4.81767955801105, + "grad_norm": 0.2932819724082947, + "learning_rate": 5.537766466508286e-05, + "loss": 1.8045, + "step": 15696 + }, + { + "epoch": 4.817986494782075, + "grad_norm": 0.31298181414604187, + "learning_rate": 5.537272291128304e-05, + "loss": 1.7516, + "step": 15697 + }, + { + "epoch": 4.8182934315531, + "grad_norm": 0.22871924936771393, + "learning_rate": 5.5367781104386806e-05, + "loss": 1.7386, + "step": 15698 + }, + { + "epoch": 4.818600368324125, + "grad_norm": 0.27097782492637634, + "learning_rate": 5.5362839244443034e-05, + "loss": 1.733, + "step": 15699 + }, + { + "epoch": 4.81890730509515, + "grad_norm": 
0.23296736180782318, + "learning_rate": 5.535789733150052e-05, + "loss": 1.7735, + "step": 15700 + }, + { + "epoch": 4.819214241866176, + "grad_norm": 0.22650237381458282, + "learning_rate": 5.5352955365608125e-05, + "loss": 1.7443, + "step": 15701 + }, + { + "epoch": 4.819521178637201, + "grad_norm": 0.25525161623954773, + "learning_rate": 5.534801334681471e-05, + "loss": 1.7379, + "step": 15702 + }, + { + "epoch": 4.819828115408226, + "grad_norm": 0.2249457836151123, + "learning_rate": 5.534307127516908e-05, + "loss": 1.7393, + "step": 15703 + }, + { + "epoch": 4.820135052179251, + "grad_norm": 0.1995566338300705, + "learning_rate": 5.5338129150720084e-05, + "loss": 1.7411, + "step": 15704 + }, + { + "epoch": 4.820441988950276, + "grad_norm": 0.250851035118103, + "learning_rate": 5.533318697351657e-05, + "loss": 1.7801, + "step": 15705 + }, + { + "epoch": 4.820748925721301, + "grad_norm": 0.3175830543041229, + "learning_rate": 5.532824474360737e-05, + "loss": 1.7553, + "step": 15706 + }, + { + "epoch": 4.821055862492327, + "grad_norm": 0.22842039167881012, + "learning_rate": 5.532330246104134e-05, + "loss": 1.7489, + "step": 15707 + }, + { + "epoch": 4.821362799263352, + "grad_norm": 0.21125485002994537, + "learning_rate": 5.531836012586732e-05, + "loss": 1.7543, + "step": 15708 + }, + { + "epoch": 4.8216697360343765, + "grad_norm": 0.33028700947761536, + "learning_rate": 5.531341773813414e-05, + "loss": 1.8237, + "step": 15709 + }, + { + "epoch": 4.821976672805402, + "grad_norm": 0.324564129114151, + "learning_rate": 5.530847529789067e-05, + "loss": 1.7288, + "step": 15710 + }, + { + "epoch": 4.822283609576427, + "grad_norm": 0.3299528956413269, + "learning_rate": 5.530353280518571e-05, + "loss": 1.7536, + "step": 15711 + }, + { + "epoch": 4.8225905463474525, + "grad_norm": 0.3535030782222748, + "learning_rate": 5.5298590260068136e-05, + "loss": 1.7941, + "step": 15712 + }, + { + "epoch": 4.822897483118478, + "grad_norm": 0.2627669870853424, + "learning_rate": 
5.5293647662586804e-05, + "loss": 1.7638, + "step": 15713 + }, + { + "epoch": 4.823204419889503, + "grad_norm": 0.25569450855255127, + "learning_rate": 5.5288705012790535e-05, + "loss": 1.7396, + "step": 15714 + }, + { + "epoch": 4.823511356660528, + "grad_norm": 0.26099520921707153, + "learning_rate": 5.528376231072817e-05, + "loss": 1.7415, + "step": 15715 + }, + { + "epoch": 4.823818293431553, + "grad_norm": 0.31833693385124207, + "learning_rate": 5.527881955644858e-05, + "loss": 1.7683, + "step": 15716 + }, + { + "epoch": 4.824125230202578, + "grad_norm": 0.2753448188304901, + "learning_rate": 5.5273876750000594e-05, + "loss": 1.6653, + "step": 15717 + }, + { + "epoch": 4.824432166973604, + "grad_norm": 0.23816895484924316, + "learning_rate": 5.526893389143307e-05, + "loss": 1.7575, + "step": 15718 + }, + { + "epoch": 4.824739103744628, + "grad_norm": 0.25376051664352417, + "learning_rate": 5.5263990980794856e-05, + "loss": 1.755, + "step": 15719 + }, + { + "epoch": 4.8250460405156534, + "grad_norm": 0.2483726590871811, + "learning_rate": 5.52590480181348e-05, + "loss": 1.7566, + "step": 15720 + }, + { + "epoch": 4.825352977286679, + "grad_norm": 0.2073517143726349, + "learning_rate": 5.5254105003501746e-05, + "loss": 1.7069, + "step": 15721 + }, + { + "epoch": 4.825659914057704, + "grad_norm": 0.3166659474372864, + "learning_rate": 5.524916193694455e-05, + "loss": 1.7012, + "step": 15722 + }, + { + "epoch": 4.8259668508287294, + "grad_norm": 0.24518641829490662, + "learning_rate": 5.524421881851205e-05, + "loss": 1.7027, + "step": 15723 + }, + { + "epoch": 4.826273787599755, + "grad_norm": 0.23137906193733215, + "learning_rate": 5.523927564825311e-05, + "loss": 1.746, + "step": 15724 + }, + { + "epoch": 4.82658072437078, + "grad_norm": 0.27937051653862, + "learning_rate": 5.5234332426216586e-05, + "loss": 1.7064, + "step": 15725 + }, + { + "epoch": 4.826887661141805, + "grad_norm": 0.26408496499061584, + "learning_rate": 5.522938915245131e-05, + "loss": 
1.6598, + "step": 15726 + }, + { + "epoch": 4.82719459791283, + "grad_norm": 0.22269997000694275, + "learning_rate": 5.5224445827006164e-05, + "loss": 1.7166, + "step": 15727 + }, + { + "epoch": 4.827501534683855, + "grad_norm": 0.22687453031539917, + "learning_rate": 5.5219502449929964e-05, + "loss": 1.7156, + "step": 15728 + }, + { + "epoch": 4.827808471454881, + "grad_norm": 0.26355600357055664, + "learning_rate": 5.5214559021271585e-05, + "loss": 1.8016, + "step": 15729 + }, + { + "epoch": 4.828115408225905, + "grad_norm": 0.30103012919425964, + "learning_rate": 5.520961554107987e-05, + "loss": 1.7856, + "step": 15730 + }, + { + "epoch": 4.82842234499693, + "grad_norm": 0.22604018449783325, + "learning_rate": 5.520467200940369e-05, + "loss": 1.813, + "step": 15731 + }, + { + "epoch": 4.828729281767956, + "grad_norm": 0.25435203313827515, + "learning_rate": 5.51997284262919e-05, + "loss": 1.7511, + "step": 15732 + }, + { + "epoch": 4.829036218538981, + "grad_norm": 0.2740691304206848, + "learning_rate": 5.519478479179333e-05, + "loss": 1.7326, + "step": 15733 + }, + { + "epoch": 4.829343155310006, + "grad_norm": 0.19710861146450043, + "learning_rate": 5.5189841105956866e-05, + "loss": 1.7581, + "step": 15734 + }, + { + "epoch": 4.829650092081032, + "grad_norm": 0.2315293401479721, + "learning_rate": 5.518489736883132e-05, + "loss": 1.6796, + "step": 15735 + }, + { + "epoch": 4.829957028852056, + "grad_norm": 0.2465476542711258, + "learning_rate": 5.51799535804656e-05, + "loss": 1.7276, + "step": 15736 + }, + { + "epoch": 4.8302639656230815, + "grad_norm": 0.20438486337661743, + "learning_rate": 5.5175009740908546e-05, + "loss": 1.7188, + "step": 15737 + }, + { + "epoch": 4.830570902394107, + "grad_norm": 0.24328351020812988, + "learning_rate": 5.5170065850209016e-05, + "loss": 1.7165, + "step": 15738 + }, + { + "epoch": 4.830877839165132, + "grad_norm": 0.22486837208271027, + "learning_rate": 5.516512190841586e-05, + "loss": 1.7369, + "step": 15739 + }, + { + 
"epoch": 4.8311847759361575, + "grad_norm": 0.2065822333097458, + "learning_rate": 5.5160177915577934e-05, + "loss": 1.7125, + "step": 15740 + }, + { + "epoch": 4.831491712707182, + "grad_norm": 0.21223095059394836, + "learning_rate": 5.5155233871744104e-05, + "loss": 1.7319, + "step": 15741 + }, + { + "epoch": 4.831798649478207, + "grad_norm": 0.25712934136390686, + "learning_rate": 5.515028977696325e-05, + "loss": 1.7847, + "step": 15742 + }, + { + "epoch": 4.832105586249233, + "grad_norm": 0.21289978921413422, + "learning_rate": 5.5145345631284215e-05, + "loss": 1.7629, + "step": 15743 + }, + { + "epoch": 4.832412523020258, + "grad_norm": 0.22347134351730347, + "learning_rate": 5.514040143475585e-05, + "loss": 1.7491, + "step": 15744 + }, + { + "epoch": 4.832719459791283, + "grad_norm": 0.20660510659217834, + "learning_rate": 5.513545718742702e-05, + "loss": 1.7377, + "step": 15745 + }, + { + "epoch": 4.833026396562309, + "grad_norm": 0.21612273156642914, + "learning_rate": 5.513051288934658e-05, + "loss": 1.7973, + "step": 15746 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.22515933215618134, + "learning_rate": 5.512556854056342e-05, + "loss": 1.7774, + "step": 15747 + }, + { + "epoch": 4.833640270104358, + "grad_norm": 0.21075554192066193, + "learning_rate": 5.512062414112639e-05, + "loss": 1.7741, + "step": 15748 + }, + { + "epoch": 4.833947206875384, + "grad_norm": 0.2203720659017563, + "learning_rate": 5.511567969108436e-05, + "loss": 1.7902, + "step": 15749 + }, + { + "epoch": 4.834254143646409, + "grad_norm": 0.20247167348861694, + "learning_rate": 5.511073519048616e-05, + "loss": 1.7084, + "step": 15750 + }, + { + "epoch": 4.834561080417434, + "grad_norm": 0.247711181640625, + "learning_rate": 5.5105790639380695e-05, + "loss": 1.8465, + "step": 15751 + }, + { + "epoch": 4.834868017188459, + "grad_norm": 0.22866854071617126, + "learning_rate": 5.51008460378168e-05, + "loss": 1.7252, + "step": 15752 + }, + { + "epoch": 4.835174953959484, + 
"grad_norm": 0.2335643470287323, + "learning_rate": 5.5095901385843374e-05, + "loss": 1.703, + "step": 15753 + }, + { + "epoch": 4.8354818907305095, + "grad_norm": 0.20874348282814026, + "learning_rate": 5.509095668350926e-05, + "loss": 1.7114, + "step": 15754 + }, + { + "epoch": 4.835788827501535, + "grad_norm": 0.19156917929649353, + "learning_rate": 5.5086011930863314e-05, + "loss": 1.6975, + "step": 15755 + }, + { + "epoch": 4.83609576427256, + "grad_norm": 0.23480524122714996, + "learning_rate": 5.508106712795443e-05, + "loss": 1.8291, + "step": 15756 + }, + { + "epoch": 4.8364027010435855, + "grad_norm": 0.20430417358875275, + "learning_rate": 5.5076122274831454e-05, + "loss": 1.7605, + "step": 15757 + }, + { + "epoch": 4.83670963781461, + "grad_norm": 0.26790598034858704, + "learning_rate": 5.5071177371543256e-05, + "loss": 1.7541, + "step": 15758 + }, + { + "epoch": 4.837016574585635, + "grad_norm": 0.3339289724826813, + "learning_rate": 5.506623241813873e-05, + "loss": 1.7566, + "step": 15759 + }, + { + "epoch": 4.837323511356661, + "grad_norm": 0.30528193712234497, + "learning_rate": 5.5061287414666726e-05, + "loss": 1.7371, + "step": 15760 + }, + { + "epoch": 4.837630448127686, + "grad_norm": 0.21059657633304596, + "learning_rate": 5.5056342361176114e-05, + "loss": 1.7599, + "step": 15761 + }, + { + "epoch": 4.83793738489871, + "grad_norm": 0.27918973565101624, + "learning_rate": 5.5051397257715756e-05, + "loss": 1.7485, + "step": 15762 + }, + { + "epoch": 4.838244321669736, + "grad_norm": 0.23147793114185333, + "learning_rate": 5.5046452104334514e-05, + "loss": 1.7121, + "step": 15763 + }, + { + "epoch": 4.838551258440761, + "grad_norm": 0.22028742730617523, + "learning_rate": 5.5041506901081294e-05, + "loss": 1.803, + "step": 15764 + }, + { + "epoch": 4.838858195211786, + "grad_norm": 0.22840891778469086, + "learning_rate": 5.5036561648004946e-05, + "loss": 1.7555, + "step": 15765 + }, + { + "epoch": 4.839165131982812, + "grad_norm": 
0.2610893249511719, + "learning_rate": 5.503161634515433e-05, + "loss": 1.7873, + "step": 15766 + }, + { + "epoch": 4.839472068753837, + "grad_norm": 0.2530003786087036, + "learning_rate": 5.502667099257836e-05, + "loss": 1.7604, + "step": 15767 + }, + { + "epoch": 4.8397790055248615, + "grad_norm": 0.20120400190353394, + "learning_rate": 5.5021725590325854e-05, + "loss": 1.7476, + "step": 15768 + }, + { + "epoch": 4.840085942295887, + "grad_norm": 0.2189723700284958, + "learning_rate": 5.501678013844571e-05, + "loss": 1.7174, + "step": 15769 + }, + { + "epoch": 4.840392879066912, + "grad_norm": 0.2511899173259735, + "learning_rate": 5.501183463698683e-05, + "loss": 1.7589, + "step": 15770 + }, + { + "epoch": 4.8406998158379375, + "grad_norm": 0.24899333715438843, + "learning_rate": 5.5006889085998035e-05, + "loss": 1.7253, + "step": 15771 + }, + { + "epoch": 4.841006752608963, + "grad_norm": 0.21223559975624084, + "learning_rate": 5.5001943485528254e-05, + "loss": 1.6949, + "step": 15772 + }, + { + "epoch": 4.841313689379987, + "grad_norm": 0.21394596993923187, + "learning_rate": 5.499699783562632e-05, + "loss": 1.7827, + "step": 15773 + }, + { + "epoch": 4.841620626151013, + "grad_norm": 0.2379613220691681, + "learning_rate": 5.4992052136341134e-05, + "loss": 1.7968, + "step": 15774 + }, + { + "epoch": 4.841927562922038, + "grad_norm": 0.23748385906219482, + "learning_rate": 5.498710638772154e-05, + "loss": 1.797, + "step": 15775 + }, + { + "epoch": 4.842234499693063, + "grad_norm": 0.2502206265926361, + "learning_rate": 5.498216058981646e-05, + "loss": 1.7292, + "step": 15776 + }, + { + "epoch": 4.842541436464089, + "grad_norm": 0.23613516986370087, + "learning_rate": 5.497721474267475e-05, + "loss": 1.7353, + "step": 15777 + }, + { + "epoch": 4.842848373235114, + "grad_norm": 0.25274696946144104, + "learning_rate": 5.497226884634527e-05, + "loss": 1.7782, + "step": 15778 + }, + { + "epoch": 4.843155310006138, + "grad_norm": 0.19574183225631714, + 
"learning_rate": 5.496732290087694e-05, + "loss": 1.6926, + "step": 15779 + }, + { + "epoch": 4.843462246777164, + "grad_norm": 0.21040405333042145, + "learning_rate": 5.496237690631858e-05, + "loss": 1.7235, + "step": 15780 + }, + { + "epoch": 4.843769183548189, + "grad_norm": 0.22499679028987885, + "learning_rate": 5.495743086271913e-05, + "loss": 1.7889, + "step": 15781 + }, + { + "epoch": 4.844076120319214, + "grad_norm": 0.24623246490955353, + "learning_rate": 5.4952484770127433e-05, + "loss": 1.7357, + "step": 15782 + }, + { + "epoch": 4.84438305709024, + "grad_norm": 0.21706275641918182, + "learning_rate": 5.494753862859238e-05, + "loss": 1.7349, + "step": 15783 + }, + { + "epoch": 4.844689993861264, + "grad_norm": 0.20705166459083557, + "learning_rate": 5.4942592438162855e-05, + "loss": 1.7047, + "step": 15784 + }, + { + "epoch": 4.8449969306322895, + "grad_norm": 0.21216751635074615, + "learning_rate": 5.493764619888773e-05, + "loss": 1.7335, + "step": 15785 + }, + { + "epoch": 4.845303867403315, + "grad_norm": 0.2945895195007324, + "learning_rate": 5.493269991081588e-05, + "loss": 1.838, + "step": 15786 + }, + { + "epoch": 4.84561080417434, + "grad_norm": 0.22013652324676514, + "learning_rate": 5.492775357399621e-05, + "loss": 1.7541, + "step": 15787 + }, + { + "epoch": 4.8459177409453655, + "grad_norm": 0.25428512692451477, + "learning_rate": 5.4922807188477585e-05, + "loss": 1.7405, + "step": 15788 + }, + { + "epoch": 4.846224677716391, + "grad_norm": 0.23189012706279755, + "learning_rate": 5.49178607543089e-05, + "loss": 1.8075, + "step": 15789 + }, + { + "epoch": 4.846531614487415, + "grad_norm": 0.21637389063835144, + "learning_rate": 5.491291427153904e-05, + "loss": 1.7229, + "step": 15790 + }, + { + "epoch": 4.846838551258441, + "grad_norm": 0.20628009736537933, + "learning_rate": 5.490796774021687e-05, + "loss": 1.7605, + "step": 15791 + }, + { + "epoch": 4.847145488029466, + "grad_norm": 0.20845308899879456, + "learning_rate": 
5.4903021160391276e-05, + "loss": 1.7864, + "step": 15792 + }, + { + "epoch": 4.847452424800491, + "grad_norm": 0.20367322862148285, + "learning_rate": 5.4898074532111164e-05, + "loss": 1.733, + "step": 15793 + }, + { + "epoch": 4.847759361571516, + "grad_norm": 0.2066505253314972, + "learning_rate": 5.489312785542543e-05, + "loss": 1.7113, + "step": 15794 + }, + { + "epoch": 4.848066298342541, + "grad_norm": 0.23874987661838531, + "learning_rate": 5.488818113038292e-05, + "loss": 1.7735, + "step": 15795 + }, + { + "epoch": 4.848373235113566, + "grad_norm": 0.26583850383758545, + "learning_rate": 5.488323435703254e-05, + "loss": 1.8019, + "step": 15796 + }, + { + "epoch": 4.848680171884592, + "grad_norm": 0.25207552313804626, + "learning_rate": 5.487828753542317e-05, + "loss": 1.7491, + "step": 15797 + }, + { + "epoch": 4.848987108655617, + "grad_norm": 0.23065905272960663, + "learning_rate": 5.48733406656037e-05, + "loss": 1.7451, + "step": 15798 + }, + { + "epoch": 4.849294045426642, + "grad_norm": 0.26914483308792114, + "learning_rate": 5.486839374762304e-05, + "loss": 1.7553, + "step": 15799 + }, + { + "epoch": 4.849600982197668, + "grad_norm": 0.2509605884552002, + "learning_rate": 5.4863446781530046e-05, + "loss": 1.7124, + "step": 15800 + }, + { + "epoch": 4.849907918968692, + "grad_norm": 0.2618432343006134, + "learning_rate": 5.485849976737362e-05, + "loss": 1.7368, + "step": 15801 + }, + { + "epoch": 4.850214855739718, + "grad_norm": 0.46875160932540894, + "learning_rate": 5.485355270520266e-05, + "loss": 1.7883, + "step": 15802 + }, + { + "epoch": 4.850521792510743, + "grad_norm": 0.37585484981536865, + "learning_rate": 5.4848605595066025e-05, + "loss": 1.7894, + "step": 15803 + }, + { + "epoch": 4.850828729281768, + "grad_norm": 0.2244408279657364, + "learning_rate": 5.4843658437012646e-05, + "loss": 1.7394, + "step": 15804 + }, + { + "epoch": 4.851135666052793, + "grad_norm": 0.4061773419380188, + "learning_rate": 5.48387112310914e-05, + "loss": 
1.7703, + "step": 15805 + }, + { + "epoch": 4.851442602823818, + "grad_norm": 0.35925009846687317, + "learning_rate": 5.483376397735117e-05, + "loss": 1.7798, + "step": 15806 + }, + { + "epoch": 4.851749539594843, + "grad_norm": 0.23050184547901154, + "learning_rate": 5.482881667584084e-05, + "loss": 1.7984, + "step": 15807 + }, + { + "epoch": 4.852056476365869, + "grad_norm": 0.37308645248413086, + "learning_rate": 5.4823869326609335e-05, + "loss": 1.6747, + "step": 15808 + }, + { + "epoch": 4.852363413136894, + "grad_norm": 0.29826754331588745, + "learning_rate": 5.481892192970551e-05, + "loss": 1.7432, + "step": 15809 + }, + { + "epoch": 4.852670349907919, + "grad_norm": 0.23652370274066925, + "learning_rate": 5.4813974485178266e-05, + "loss": 1.7557, + "step": 15810 + }, + { + "epoch": 4.852977286678944, + "grad_norm": 0.40549808740615845, + "learning_rate": 5.4809026993076526e-05, + "loss": 1.7317, + "step": 15811 + }, + { + "epoch": 4.853284223449969, + "grad_norm": 0.3367961347103119, + "learning_rate": 5.4804079453449156e-05, + "loss": 1.7648, + "step": 15812 + }, + { + "epoch": 4.8535911602209945, + "grad_norm": 0.21629661321640015, + "learning_rate": 5.4799131866345055e-05, + "loss": 1.7986, + "step": 15813 + }, + { + "epoch": 4.85389809699202, + "grad_norm": 0.26381492614746094, + "learning_rate": 5.4794184231813105e-05, + "loss": 1.7401, + "step": 15814 + }, + { + "epoch": 4.854205033763045, + "grad_norm": 0.22319363057613373, + "learning_rate": 5.478923654990223e-05, + "loss": 1.7773, + "step": 15815 + }, + { + "epoch": 4.85451197053407, + "grad_norm": 0.2547159492969513, + "learning_rate": 5.4784288820661326e-05, + "loss": 1.8194, + "step": 15816 + }, + { + "epoch": 4.854818907305095, + "grad_norm": 0.29574522376060486, + "learning_rate": 5.477934104413925e-05, + "loss": 1.7351, + "step": 15817 + }, + { + "epoch": 4.85512584407612, + "grad_norm": 0.17389361560344696, + "learning_rate": 5.4774393220384945e-05, + "loss": 1.6957, + "step": 15818 + }, + { 
+ "epoch": 4.855432780847146, + "grad_norm": 0.23746751248836517, + "learning_rate": 5.476944534944728e-05, + "loss": 1.7713, + "step": 15819 + }, + { + "epoch": 4.855739717618171, + "grad_norm": 0.182356595993042, + "learning_rate": 5.476449743137516e-05, + "loss": 1.7144, + "step": 15820 + }, + { + "epoch": 4.856046654389196, + "grad_norm": 0.23716382682323456, + "learning_rate": 5.4759549466217475e-05, + "loss": 1.7451, + "step": 15821 + }, + { + "epoch": 4.856353591160221, + "grad_norm": 0.316806823015213, + "learning_rate": 5.475460145402313e-05, + "loss": 1.7823, + "step": 15822 + }, + { + "epoch": 4.856660527931246, + "grad_norm": 0.2333129197359085, + "learning_rate": 5.474965339484105e-05, + "loss": 1.7788, + "step": 15823 + }, + { + "epoch": 4.856967464702271, + "grad_norm": 0.21180212497711182, + "learning_rate": 5.47447052887201e-05, + "loss": 1.7513, + "step": 15824 + }, + { + "epoch": 4.857274401473297, + "grad_norm": 0.22641299664974213, + "learning_rate": 5.473975713570919e-05, + "loss": 1.7514, + "step": 15825 + }, + { + "epoch": 4.857581338244322, + "grad_norm": 0.3179668188095093, + "learning_rate": 5.473480893585723e-05, + "loss": 1.7939, + "step": 15826 + }, + { + "epoch": 4.8578882750153465, + "grad_norm": 0.27463147044181824, + "learning_rate": 5.472986068921309e-05, + "loss": 1.7487, + "step": 15827 + }, + { + "epoch": 4.858195211786372, + "grad_norm": 0.18621626496315002, + "learning_rate": 5.472491239582572e-05, + "loss": 1.7155, + "step": 15828 + }, + { + "epoch": 4.858502148557397, + "grad_norm": 0.2437327802181244, + "learning_rate": 5.471996405574399e-05, + "loss": 1.7586, + "step": 15829 + }, + { + "epoch": 4.8588090853284225, + "grad_norm": 0.26658934354782104, + "learning_rate": 5.47150156690168e-05, + "loss": 1.7331, + "step": 15830 + }, + { + "epoch": 4.859116022099448, + "grad_norm": 0.2257174700498581, + "learning_rate": 5.471006723569308e-05, + "loss": 1.7556, + "step": 15831 + }, + { + "epoch": 4.859422958870473, + 
"grad_norm": 0.25434550642967224, + "learning_rate": 5.470511875582168e-05, + "loss": 1.7196, + "step": 15832 + }, + { + "epoch": 4.859729895641498, + "grad_norm": 0.2251453697681427, + "learning_rate": 5.470017022945156e-05, + "loss": 1.7174, + "step": 15833 + }, + { + "epoch": 4.860036832412523, + "grad_norm": 0.2757972180843353, + "learning_rate": 5.469522165663161e-05, + "loss": 1.7701, + "step": 15834 + }, + { + "epoch": 4.860343769183548, + "grad_norm": 0.2771994173526764, + "learning_rate": 5.469027303741072e-05, + "loss": 1.8085, + "step": 15835 + }, + { + "epoch": 4.860650705954574, + "grad_norm": 0.23825454711914062, + "learning_rate": 5.468532437183781e-05, + "loss": 1.733, + "step": 15836 + }, + { + "epoch": 4.860957642725598, + "grad_norm": 0.18100066483020782, + "learning_rate": 5.468037565996177e-05, + "loss": 1.7012, + "step": 15837 + }, + { + "epoch": 4.861264579496623, + "grad_norm": 0.22552812099456787, + "learning_rate": 5.4675426901831506e-05, + "loss": 1.728, + "step": 15838 + }, + { + "epoch": 4.861571516267649, + "grad_norm": 0.2505643665790558, + "learning_rate": 5.467047809749595e-05, + "loss": 1.7219, + "step": 15839 + }, + { + "epoch": 4.861878453038674, + "grad_norm": 0.25920796394348145, + "learning_rate": 5.4665529247003975e-05, + "loss": 1.7945, + "step": 15840 + }, + { + "epoch": 4.862185389809699, + "grad_norm": 0.23549394309520721, + "learning_rate": 5.466058035040452e-05, + "loss": 1.7904, + "step": 15841 + }, + { + "epoch": 4.862492326580725, + "grad_norm": 0.26510992646217346, + "learning_rate": 5.465563140774648e-05, + "loss": 1.8051, + "step": 15842 + }, + { + "epoch": 4.862799263351749, + "grad_norm": 0.19175390899181366, + "learning_rate": 5.465068241907876e-05, + "loss": 1.6799, + "step": 15843 + }, + { + "epoch": 4.8631062001227745, + "grad_norm": 0.2588976323604584, + "learning_rate": 5.464573338445025e-05, + "loss": 1.7394, + "step": 15844 + }, + { + "epoch": 4.8634131368938, + "grad_norm": 0.28729483485221863, + 
"learning_rate": 5.464078430390991e-05, + "loss": 1.797, + "step": 15845 + }, + { + "epoch": 4.863720073664825, + "grad_norm": 0.21302445232868195, + "learning_rate": 5.463583517750661e-05, + "loss": 1.7303, + "step": 15846 + }, + { + "epoch": 4.8640270104358505, + "grad_norm": 0.2407636195421219, + "learning_rate": 5.463088600528926e-05, + "loss": 1.7175, + "step": 15847 + }, + { + "epoch": 4.864333947206875, + "grad_norm": 0.25653502345085144, + "learning_rate": 5.4625936787306784e-05, + "loss": 1.6996, + "step": 15848 + }, + { + "epoch": 4.8646408839779, + "grad_norm": 0.2100832760334015, + "learning_rate": 5.462098752360809e-05, + "loss": 1.7416, + "step": 15849 + }, + { + "epoch": 4.864947820748926, + "grad_norm": 0.2785186469554901, + "learning_rate": 5.461603821424208e-05, + "loss": 1.74, + "step": 15850 + }, + { + "epoch": 4.865254757519951, + "grad_norm": 0.2896614968776703, + "learning_rate": 5.4611088859257696e-05, + "loss": 1.7436, + "step": 15851 + }, + { + "epoch": 4.865561694290976, + "grad_norm": 0.18890418112277985, + "learning_rate": 5.460613945870382e-05, + "loss": 1.7093, + "step": 15852 + }, + { + "epoch": 4.865868631062002, + "grad_norm": 0.27681079506874084, + "learning_rate": 5.4601190012629364e-05, + "loss": 1.8772, + "step": 15853 + }, + { + "epoch": 4.866175567833026, + "grad_norm": 0.24658115208148956, + "learning_rate": 5.4596240521083265e-05, + "loss": 1.776, + "step": 15854 + }, + { + "epoch": 4.866482504604051, + "grad_norm": 0.21958144009113312, + "learning_rate": 5.459129098411441e-05, + "loss": 1.7503, + "step": 15855 + }, + { + "epoch": 4.866789441375077, + "grad_norm": 0.2778300642967224, + "learning_rate": 5.458634140177174e-05, + "loss": 1.8194, + "step": 15856 + }, + { + "epoch": 4.867096378146102, + "grad_norm": 0.28673580288887024, + "learning_rate": 5.458139177410414e-05, + "loss": 1.8033, + "step": 15857 + }, + { + "epoch": 4.867403314917127, + "grad_norm": 0.24472850561141968, + "learning_rate": 5.457644210116055e-05, + 
"loss": 1.7304, + "step": 15858 + }, + { + "epoch": 4.867710251688152, + "grad_norm": 0.24581189453601837, + "learning_rate": 5.4571492382989886e-05, + "loss": 1.7443, + "step": 15859 + }, + { + "epoch": 4.868017188459177, + "grad_norm": 0.22296221554279327, + "learning_rate": 5.4566542619641045e-05, + "loss": 1.7201, + "step": 15860 + }, + { + "epoch": 4.8683241252302025, + "grad_norm": 0.2378673404455185, + "learning_rate": 5.456159281116295e-05, + "loss": 1.7893, + "step": 15861 + }, + { + "epoch": 4.868631062001228, + "grad_norm": 0.3320823907852173, + "learning_rate": 5.4556642957604534e-05, + "loss": 1.7944, + "step": 15862 + }, + { + "epoch": 4.868937998772253, + "grad_norm": 0.3303453326225281, + "learning_rate": 5.45516930590147e-05, + "loss": 1.7267, + "step": 15863 + }, + { + "epoch": 4.8692449355432785, + "grad_norm": 0.223227858543396, + "learning_rate": 5.454674311544235e-05, + "loss": 1.7477, + "step": 15864 + }, + { + "epoch": 4.869551872314303, + "grad_norm": 0.3012549579143524, + "learning_rate": 5.454179312693643e-05, + "loss": 1.731, + "step": 15865 + }, + { + "epoch": 4.869858809085328, + "grad_norm": 0.3780311942100525, + "learning_rate": 5.453684309354585e-05, + "loss": 1.7296, + "step": 15866 + }, + { + "epoch": 4.870165745856354, + "grad_norm": 0.2753889262676239, + "learning_rate": 5.4531893015319526e-05, + "loss": 1.8024, + "step": 15867 + }, + { + "epoch": 4.870472682627379, + "grad_norm": 0.2270934134721756, + "learning_rate": 5.452694289230639e-05, + "loss": 1.7095, + "step": 15868 + }, + { + "epoch": 4.870779619398404, + "grad_norm": 0.2621576488018036, + "learning_rate": 5.452199272455534e-05, + "loss": 1.75, + "step": 15869 + }, + { + "epoch": 4.871086556169429, + "grad_norm": 0.22175776958465576, + "learning_rate": 5.45170425121153e-05, + "loss": 1.7658, + "step": 15870 + }, + { + "epoch": 4.871393492940454, + "grad_norm": 0.2038736790418625, + "learning_rate": 5.451209225503521e-05, + "loss": 1.6916, + "step": 15871 + }, + { + 
"epoch": 4.871700429711479, + "grad_norm": 0.2493467777967453, + "learning_rate": 5.450714195336397e-05, + "loss": 1.7408, + "step": 15872 + }, + { + "epoch": 4.872007366482505, + "grad_norm": 0.1966754049062729, + "learning_rate": 5.450219160715052e-05, + "loss": 1.7379, + "step": 15873 + }, + { + "epoch": 4.87231430325353, + "grad_norm": 0.23193517327308655, + "learning_rate": 5.4497241216443775e-05, + "loss": 1.7736, + "step": 15874 + }, + { + "epoch": 4.872621240024555, + "grad_norm": 0.2164391279220581, + "learning_rate": 5.4492290781292646e-05, + "loss": 1.7618, + "step": 15875 + }, + { + "epoch": 4.87292817679558, + "grad_norm": 0.286460816860199, + "learning_rate": 5.448734030174607e-05, + "loss": 1.7745, + "step": 15876 + }, + { + "epoch": 4.873235113566605, + "grad_norm": 0.3454538881778717, + "learning_rate": 5.448238977785298e-05, + "loss": 1.7605, + "step": 15877 + }, + { + "epoch": 4.8735420503376305, + "grad_norm": 0.26775062084198, + "learning_rate": 5.447743920966227e-05, + "loss": 1.7263, + "step": 15878 + }, + { + "epoch": 4.873848987108656, + "grad_norm": 0.2644907832145691, + "learning_rate": 5.447248859722289e-05, + "loss": 1.8489, + "step": 15879 + }, + { + "epoch": 4.87415592387968, + "grad_norm": 0.21646654605865479, + "learning_rate": 5.446753794058376e-05, + "loss": 1.7605, + "step": 15880 + }, + { + "epoch": 4.874462860650706, + "grad_norm": 0.23431318998336792, + "learning_rate": 5.446258723979381e-05, + "loss": 1.7209, + "step": 15881 + }, + { + "epoch": 4.874769797421731, + "grad_norm": 0.24665607511997223, + "learning_rate": 5.4457636494901934e-05, + "loss": 1.813, + "step": 15882 + }, + { + "epoch": 4.875076734192756, + "grad_norm": 0.26269975304603577, + "learning_rate": 5.445268570595708e-05, + "loss": 1.8255, + "step": 15883 + }, + { + "epoch": 4.875383670963782, + "grad_norm": 0.2722402811050415, + "learning_rate": 5.444773487300819e-05, + "loss": 1.7795, + "step": 15884 + }, + { + "epoch": 4.875690607734807, + "grad_norm": 
0.3235624134540558, + "learning_rate": 5.444278399610417e-05, + "loss": 1.7804, + "step": 15885 + }, + { + "epoch": 4.8759975445058314, + "grad_norm": 0.2647583782672882, + "learning_rate": 5.4437833075293964e-05, + "loss": 1.7359, + "step": 15886 + }, + { + "epoch": 4.876304481276857, + "grad_norm": 0.272370845079422, + "learning_rate": 5.443288211062649e-05, + "loss": 1.7605, + "step": 15887 + }, + { + "epoch": 4.876611418047882, + "grad_norm": 0.3147594630718231, + "learning_rate": 5.4427931102150675e-05, + "loss": 1.7118, + "step": 15888 + }, + { + "epoch": 4.8769183548189075, + "grad_norm": 0.22751441597938538, + "learning_rate": 5.442298004991544e-05, + "loss": 1.723, + "step": 15889 + }, + { + "epoch": 4.877225291589933, + "grad_norm": 0.2121521681547165, + "learning_rate": 5.441802895396972e-05, + "loss": 1.7485, + "step": 15890 + }, + { + "epoch": 4.877532228360957, + "grad_norm": 0.25370222330093384, + "learning_rate": 5.4413077814362466e-05, + "loss": 1.8064, + "step": 15891 + }, + { + "epoch": 4.877839165131983, + "grad_norm": 0.19492633640766144, + "learning_rate": 5.440812663114259e-05, + "loss": 1.6773, + "step": 15892 + }, + { + "epoch": 4.878146101903008, + "grad_norm": 0.2101750522851944, + "learning_rate": 5.440317540435901e-05, + "loss": 1.7215, + "step": 15893 + }, + { + "epoch": 4.878453038674033, + "grad_norm": 0.21150651574134827, + "learning_rate": 5.439822413406068e-05, + "loss": 1.7875, + "step": 15894 + }, + { + "epoch": 4.878759975445059, + "grad_norm": 0.21008379757404327, + "learning_rate": 5.439327282029651e-05, + "loss": 1.7108, + "step": 15895 + }, + { + "epoch": 4.879066912216084, + "grad_norm": 0.22885502874851227, + "learning_rate": 5.4388321463115453e-05, + "loss": 1.7899, + "step": 15896 + }, + { + "epoch": 4.879373848987108, + "grad_norm": 0.24868059158325195, + "learning_rate": 5.4383370062566444e-05, + "loss": 1.7368, + "step": 15897 + }, + { + "epoch": 4.879680785758134, + "grad_norm": 0.27225378155708313, + 
"learning_rate": 5.437841861869838e-05, + "loss": 1.7623, + "step": 15898 + }, + { + "epoch": 4.879987722529159, + "grad_norm": 0.23353120684623718, + "learning_rate": 5.437346713156023e-05, + "loss": 1.7908, + "step": 15899 + }, + { + "epoch": 4.880294659300184, + "grad_norm": 0.19032470881938934, + "learning_rate": 5.436851560120091e-05, + "loss": 1.7511, + "step": 15900 + }, + { + "epoch": 4.88060159607121, + "grad_norm": 0.23714862763881683, + "learning_rate": 5.4363564027669345e-05, + "loss": 1.7197, + "step": 15901 + }, + { + "epoch": 4.880908532842234, + "grad_norm": 0.24897022545337677, + "learning_rate": 5.4358612411014495e-05, + "loss": 1.7822, + "step": 15902 + }, + { + "epoch": 4.8812154696132595, + "grad_norm": 0.21433588862419128, + "learning_rate": 5.435366075128528e-05, + "loss": 1.7928, + "step": 15903 + }, + { + "epoch": 4.881522406384285, + "grad_norm": 0.30019649863243103, + "learning_rate": 5.4348709048530646e-05, + "loss": 1.8067, + "step": 15904 + }, + { + "epoch": 4.88182934315531, + "grad_norm": 0.20227669179439545, + "learning_rate": 5.4343757302799515e-05, + "loss": 1.7254, + "step": 15905 + }, + { + "epoch": 4.8821362799263355, + "grad_norm": 0.23447728157043457, + "learning_rate": 5.4338805514140836e-05, + "loss": 1.7314, + "step": 15906 + }, + { + "epoch": 4.882443216697361, + "grad_norm": 0.29545050859451294, + "learning_rate": 5.4333853682603506e-05, + "loss": 1.7659, + "step": 15907 + }, + { + "epoch": 4.882750153468385, + "grad_norm": 0.245390385389328, + "learning_rate": 5.432890180823652e-05, + "loss": 1.7264, + "step": 15908 + }, + { + "epoch": 4.883057090239411, + "grad_norm": 0.209987074136734, + "learning_rate": 5.432394989108879e-05, + "loss": 1.7174, + "step": 15909 + }, + { + "epoch": 4.883364027010436, + "grad_norm": 0.2402341365814209, + "learning_rate": 5.431899793120925e-05, + "loss": 1.7512, + "step": 15910 + }, + { + "epoch": 4.883670963781461, + "grad_norm": 0.26227688789367676, + "learning_rate": 
5.431404592864684e-05, + "loss": 1.7697, + "step": 15911 + }, + { + "epoch": 4.883977900552486, + "grad_norm": 0.2556503117084503, + "learning_rate": 5.4309093883450504e-05, + "loss": 1.8191, + "step": 15912 + }, + { + "epoch": 4.884284837323511, + "grad_norm": 0.24766884744167328, + "learning_rate": 5.4304141795669174e-05, + "loss": 1.7574, + "step": 15913 + }, + { + "epoch": 4.884591774094536, + "grad_norm": 0.19925951957702637, + "learning_rate": 5.429918966535179e-05, + "loss": 1.7249, + "step": 15914 + }, + { + "epoch": 4.884898710865562, + "grad_norm": 0.1899442970752716, + "learning_rate": 5.4294237492547294e-05, + "loss": 1.7446, + "step": 15915 + }, + { + "epoch": 4.885205647636587, + "grad_norm": 0.25900956988334656, + "learning_rate": 5.4289285277304636e-05, + "loss": 1.725, + "step": 15916 + }, + { + "epoch": 4.885512584407612, + "grad_norm": 0.2537781000137329, + "learning_rate": 5.428433301967274e-05, + "loss": 1.7861, + "step": 15917 + }, + { + "epoch": 4.885819521178637, + "grad_norm": 0.26432034373283386, + "learning_rate": 5.427938071970054e-05, + "loss": 1.7538, + "step": 15918 + }, + { + "epoch": 4.886126457949662, + "grad_norm": 0.22722363471984863, + "learning_rate": 5.4274428377437e-05, + "loss": 1.7631, + "step": 15919 + }, + { + "epoch": 4.8864333947206875, + "grad_norm": 0.24846172332763672, + "learning_rate": 5.426947599293106e-05, + "loss": 1.7833, + "step": 15920 + }, + { + "epoch": 4.886740331491713, + "grad_norm": 0.24821995198726654, + "learning_rate": 5.426452356623165e-05, + "loss": 1.7638, + "step": 15921 + }, + { + "epoch": 4.887047268262738, + "grad_norm": 0.2796781063079834, + "learning_rate": 5.425957109738773e-05, + "loss": 1.6982, + "step": 15922 + }, + { + "epoch": 4.887354205033763, + "grad_norm": 0.2875385284423828, + "learning_rate": 5.425461858644821e-05, + "loss": 1.7172, + "step": 15923 + }, + { + "epoch": 4.887661141804788, + "grad_norm": 0.21614491939544678, + "learning_rate": 5.424966603346207e-05, + "loss": 
1.7521, + "step": 15924 + }, + { + "epoch": 4.887968078575813, + "grad_norm": 0.22944390773773193, + "learning_rate": 5.4244713438478235e-05, + "loss": 1.772, + "step": 15925 + }, + { + "epoch": 4.888275015346839, + "grad_norm": 0.21566039323806763, + "learning_rate": 5.423976080154566e-05, + "loss": 1.734, + "step": 15926 + }, + { + "epoch": 4.888581952117864, + "grad_norm": 0.4253925383090973, + "learning_rate": 5.4234808122713275e-05, + "loss": 1.8017, + "step": 15927 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.239146426320076, + "learning_rate": 5.422985540203004e-05, + "loss": 1.7229, + "step": 15928 + }, + { + "epoch": 4.889195825659914, + "grad_norm": 0.2344054877758026, + "learning_rate": 5.42249026395449e-05, + "loss": 1.7111, + "step": 15929 + }, + { + "epoch": 4.889502762430939, + "grad_norm": 0.21717922389507294, + "learning_rate": 5.421994983530679e-05, + "loss": 1.7427, + "step": 15930 + }, + { + "epoch": 4.889809699201964, + "grad_norm": 0.26895472407341003, + "learning_rate": 5.421499698936466e-05, + "loss": 1.8402, + "step": 15931 + }, + { + "epoch": 4.89011663597299, + "grad_norm": 0.25761866569519043, + "learning_rate": 5.421004410176746e-05, + "loss": 1.7822, + "step": 15932 + }, + { + "epoch": 4.890423572744015, + "grad_norm": 0.24465128779411316, + "learning_rate": 5.420509117256415e-05, + "loss": 1.8074, + "step": 15933 + }, + { + "epoch": 4.8907305095150395, + "grad_norm": 0.2527398467063904, + "learning_rate": 5.4200138201803655e-05, + "loss": 1.7522, + "step": 15934 + }, + { + "epoch": 4.891037446286065, + "grad_norm": 0.23118112981319427, + "learning_rate": 5.4195185189534916e-05, + "loss": 1.7394, + "step": 15935 + }, + { + "epoch": 4.89134438305709, + "grad_norm": 0.2054537534713745, + "learning_rate": 5.419023213580691e-05, + "loss": 1.7096, + "step": 15936 + }, + { + "epoch": 4.8916513198281155, + "grad_norm": 0.2929638922214508, + "learning_rate": 5.418527904066858e-05, + "loss": 1.8733, + "step": 15937 + }, + { + 
"epoch": 4.891958256599141, + "grad_norm": 0.2957170009613037, + "learning_rate": 5.418032590416886e-05, + "loss": 1.7201, + "step": 15938 + }, + { + "epoch": 4.892265193370166, + "grad_norm": 0.2520081698894501, + "learning_rate": 5.417537272635672e-05, + "loss": 1.7034, + "step": 15939 + }, + { + "epoch": 4.892572130141191, + "grad_norm": 0.25217053294181824, + "learning_rate": 5.41704195072811e-05, + "loss": 1.8538, + "step": 15940 + }, + { + "epoch": 4.892879066912216, + "grad_norm": 0.23605379462242126, + "learning_rate": 5.416546624699093e-05, + "loss": 1.724, + "step": 15941 + }, + { + "epoch": 4.893186003683241, + "grad_norm": 0.321750283241272, + "learning_rate": 5.416051294553519e-05, + "loss": 1.806, + "step": 15942 + }, + { + "epoch": 4.893492940454267, + "grad_norm": 0.23800241947174072, + "learning_rate": 5.415555960296284e-05, + "loss": 1.7578, + "step": 15943 + }, + { + "epoch": 4.893799877225292, + "grad_norm": 0.3423094153404236, + "learning_rate": 5.4150606219322796e-05, + "loss": 1.7324, + "step": 15944 + }, + { + "epoch": 4.894106813996316, + "grad_norm": 0.453074187040329, + "learning_rate": 5.414565279466404e-05, + "loss": 1.7268, + "step": 15945 + }, + { + "epoch": 4.894413750767342, + "grad_norm": 0.21972697973251343, + "learning_rate": 5.4140699329035504e-05, + "loss": 1.6547, + "step": 15946 + }, + { + "epoch": 4.894720687538367, + "grad_norm": 0.32876282930374146, + "learning_rate": 5.413574582248616e-05, + "loss": 1.7527, + "step": 15947 + }, + { + "epoch": 4.895027624309392, + "grad_norm": 0.34035229682922363, + "learning_rate": 5.413079227506494e-05, + "loss": 1.7636, + "step": 15948 + }, + { + "epoch": 4.895334561080418, + "grad_norm": 0.2410411536693573, + "learning_rate": 5.412583868682082e-05, + "loss": 1.8114, + "step": 15949 + }, + { + "epoch": 4.895641497851443, + "grad_norm": 0.2787366211414337, + "learning_rate": 5.412088505780274e-05, + "loss": 1.7393, + "step": 15950 + }, + { + "epoch": 4.8959484346224675, + "grad_norm": 
0.23288428783416748, + "learning_rate": 5.411593138805966e-05, + "loss": 1.7413, + "step": 15951 + }, + { + "epoch": 4.896255371393493, + "grad_norm": 0.26302778720855713, + "learning_rate": 5.411097767764053e-05, + "loss": 1.7372, + "step": 15952 + }, + { + "epoch": 4.896562308164518, + "grad_norm": 0.31638020277023315, + "learning_rate": 5.410602392659431e-05, + "loss": 1.8114, + "step": 15953 + }, + { + "epoch": 4.8968692449355435, + "grad_norm": 0.23361825942993164, + "learning_rate": 5.410107013496996e-05, + "loss": 1.7592, + "step": 15954 + }, + { + "epoch": 4.897176181706568, + "grad_norm": 0.19887785613536835, + "learning_rate": 5.409611630281642e-05, + "loss": 1.7509, + "step": 15955 + }, + { + "epoch": 4.897483118477593, + "grad_norm": 0.22396783530712128, + "learning_rate": 5.409116243018266e-05, + "loss": 1.6841, + "step": 15956 + }, + { + "epoch": 4.897790055248619, + "grad_norm": 0.20397686958312988, + "learning_rate": 5.4086208517117645e-05, + "loss": 1.7427, + "step": 15957 + }, + { + "epoch": 4.898096992019644, + "grad_norm": 0.20848311483860016, + "learning_rate": 5.4081254563670314e-05, + "loss": 1.713, + "step": 15958 + }, + { + "epoch": 4.898403928790669, + "grad_norm": 0.2739275395870209, + "learning_rate": 5.407630056988964e-05, + "loss": 1.7673, + "step": 15959 + }, + { + "epoch": 4.898710865561695, + "grad_norm": 0.21485929191112518, + "learning_rate": 5.407134653582456e-05, + "loss": 1.7347, + "step": 15960 + }, + { + "epoch": 4.899017802332719, + "grad_norm": 0.26980286836624146, + "learning_rate": 5.406639246152406e-05, + "loss": 1.7158, + "step": 15961 + }, + { + "epoch": 4.899324739103744, + "grad_norm": 0.22327515482902527, + "learning_rate": 5.4061438347037084e-05, + "loss": 1.7387, + "step": 15962 + }, + { + "epoch": 4.89963167587477, + "grad_norm": 0.2542823553085327, + "learning_rate": 5.4056484192412603e-05, + "loss": 1.7826, + "step": 15963 + }, + { + "epoch": 4.899938612645795, + "grad_norm": 0.3248840868473053, + 
"learning_rate": 5.405152999769956e-05, + "loss": 1.7878, + "step": 15964 + }, + { + "epoch": 4.9002455494168204, + "grad_norm": 0.21210803091526031, + "learning_rate": 5.404657576294691e-05, + "loss": 1.7378, + "step": 15965 + }, + { + "epoch": 4.900552486187845, + "grad_norm": 0.25679782032966614, + "learning_rate": 5.404162148820365e-05, + "loss": 1.7493, + "step": 15966 + }, + { + "epoch": 4.90085942295887, + "grad_norm": 0.36698678135871887, + "learning_rate": 5.4036667173518704e-05, + "loss": 1.7662, + "step": 15967 + }, + { + "epoch": 4.901166359729896, + "grad_norm": 0.3396874964237213, + "learning_rate": 5.403171281894105e-05, + "loss": 1.7618, + "step": 15968 + }, + { + "epoch": 4.901473296500921, + "grad_norm": 0.2792030870914459, + "learning_rate": 5.402675842451964e-05, + "loss": 1.7858, + "step": 15969 + }, + { + "epoch": 4.901780233271946, + "grad_norm": 0.24499626457691193, + "learning_rate": 5.4021803990303454e-05, + "loss": 1.7503, + "step": 15970 + }, + { + "epoch": 4.902087170042972, + "grad_norm": 0.29185110330581665, + "learning_rate": 5.401684951634144e-05, + "loss": 1.7536, + "step": 15971 + }, + { + "epoch": 4.902394106813996, + "grad_norm": 0.2480020374059677, + "learning_rate": 5.401189500268256e-05, + "loss": 1.7877, + "step": 15972 + }, + { + "epoch": 4.902701043585021, + "grad_norm": 0.3302663564682007, + "learning_rate": 5.400694044937579e-05, + "loss": 1.8693, + "step": 15973 + }, + { + "epoch": 4.903007980356047, + "grad_norm": 0.2500915825366974, + "learning_rate": 5.400198585647008e-05, + "loss": 1.7489, + "step": 15974 + }, + { + "epoch": 4.903314917127072, + "grad_norm": 0.25079864263534546, + "learning_rate": 5.399703122401441e-05, + "loss": 1.7965, + "step": 15975 + }, + { + "epoch": 4.903621853898097, + "grad_norm": 0.2643207907676697, + "learning_rate": 5.399207655205771e-05, + "loss": 1.7696, + "step": 15976 + }, + { + "epoch": 4.903928790669122, + "grad_norm": 0.23719522356987, + "learning_rate": 5.398712184064899e-05, + 
"loss": 1.7608, + "step": 15977 + }, + { + "epoch": 4.904235727440147, + "grad_norm": 0.25226888060569763, + "learning_rate": 5.3982167089837184e-05, + "loss": 1.8055, + "step": 15978 + }, + { + "epoch": 4.9045426642111725, + "grad_norm": 0.21601852774620056, + "learning_rate": 5.39772122996713e-05, + "loss": 1.7553, + "step": 15979 + }, + { + "epoch": 4.904849600982198, + "grad_norm": 0.20275430381298065, + "learning_rate": 5.397225747020023e-05, + "loss": 1.7221, + "step": 15980 + }, + { + "epoch": 4.905156537753223, + "grad_norm": 0.24815937876701355, + "learning_rate": 5.3967302601473e-05, + "loss": 1.8098, + "step": 15981 + }, + { + "epoch": 4.9054634745242485, + "grad_norm": 0.2193612903356552, + "learning_rate": 5.3962347693538575e-05, + "loss": 1.7116, + "step": 15982 + }, + { + "epoch": 4.905770411295273, + "grad_norm": 0.21409118175506592, + "learning_rate": 5.395739274644589e-05, + "loss": 1.7503, + "step": 15983 + }, + { + "epoch": 4.906077348066298, + "grad_norm": 0.20907564461231232, + "learning_rate": 5.3952437760243935e-05, + "loss": 1.7518, + "step": 15984 + }, + { + "epoch": 4.906384284837324, + "grad_norm": 0.21193571388721466, + "learning_rate": 5.394748273498168e-05, + "loss": 1.6905, + "step": 15985 + }, + { + "epoch": 4.906691221608349, + "grad_norm": 0.19729891419410706, + "learning_rate": 5.394252767070808e-05, + "loss": 1.7398, + "step": 15986 + }, + { + "epoch": 4.906998158379373, + "grad_norm": 0.2654789686203003, + "learning_rate": 5.393757256747211e-05, + "loss": 1.7931, + "step": 15987 + }, + { + "epoch": 4.907305095150399, + "grad_norm": 0.2627345025539398, + "learning_rate": 5.3932617425322726e-05, + "loss": 1.8174, + "step": 15988 + }, + { + "epoch": 4.907612031921424, + "grad_norm": 0.27162298560142517, + "learning_rate": 5.392766224430894e-05, + "loss": 1.8015, + "step": 15989 + }, + { + "epoch": 4.907918968692449, + "grad_norm": 0.24248667061328888, + "learning_rate": 5.3922707024479676e-05, + "loss": 1.7457, + "step": 15990 + 
}, + { + "epoch": 4.908225905463475, + "grad_norm": 0.24715331196784973, + "learning_rate": 5.391775176588393e-05, + "loss": 1.7724, + "step": 15991 + }, + { + "epoch": 4.9085328422345, + "grad_norm": 0.26335644721984863, + "learning_rate": 5.3912796468570656e-05, + "loss": 1.7183, + "step": 15992 + }, + { + "epoch": 4.9088397790055245, + "grad_norm": 0.23459944128990173, + "learning_rate": 5.3907841132588843e-05, + "loss": 1.7245, + "step": 15993 + }, + { + "epoch": 4.90914671577655, + "grad_norm": 0.21779637038707733, + "learning_rate": 5.3902885757987444e-05, + "loss": 1.7485, + "step": 15994 + }, + { + "epoch": 4.909453652547575, + "grad_norm": 0.227664977312088, + "learning_rate": 5.389793034481545e-05, + "loss": 1.7418, + "step": 15995 + }, + { + "epoch": 4.9097605893186005, + "grad_norm": 0.26230278611183167, + "learning_rate": 5.389297489312183e-05, + "loss": 1.7619, + "step": 15996 + }, + { + "epoch": 4.910067526089626, + "grad_norm": 0.22563579678535461, + "learning_rate": 5.388801940295555e-05, + "loss": 1.7168, + "step": 15997 + }, + { + "epoch": 4.91037446286065, + "grad_norm": 0.24829435348510742, + "learning_rate": 5.388306387436556e-05, + "loss": 1.7422, + "step": 15998 + }, + { + "epoch": 4.910681399631676, + "grad_norm": 0.24395976960659027, + "learning_rate": 5.387810830740088e-05, + "loss": 1.7783, + "step": 15999 + }, + { + "epoch": 4.910988336402701, + "grad_norm": 0.2189297378063202, + "learning_rate": 5.387315270211044e-05, + "loss": 1.7885, + "step": 16000 + }, + { + "epoch": 4.911295273173726, + "grad_norm": 0.21750971674919128, + "learning_rate": 5.386819705854324e-05, + "loss": 1.7659, + "step": 16001 + }, + { + "epoch": 4.911602209944752, + "grad_norm": 0.21907657384872437, + "learning_rate": 5.386324137674826e-05, + "loss": 1.789, + "step": 16002 + }, + { + "epoch": 4.911909146715777, + "grad_norm": 0.18778781592845917, + "learning_rate": 5.3858285656774465e-05, + "loss": 1.7151, + "step": 16003 + }, + { + "epoch": 4.912216083486801, + 
"grad_norm": 0.24217712879180908, + "learning_rate": 5.385332989867082e-05, + "loss": 1.8108, + "step": 16004 + }, + { + "epoch": 4.912523020257827, + "grad_norm": 0.27637016773223877, + "learning_rate": 5.384837410248632e-05, + "loss": 1.8368, + "step": 16005 + }, + { + "epoch": 4.912829957028852, + "grad_norm": 0.22366084158420563, + "learning_rate": 5.3843418268269926e-05, + "loss": 1.7351, + "step": 16006 + }, + { + "epoch": 4.913136893799877, + "grad_norm": 0.2742357552051544, + "learning_rate": 5.383846239607062e-05, + "loss": 1.7599, + "step": 16007 + }, + { + "epoch": 4.913443830570903, + "grad_norm": 0.2288598269224167, + "learning_rate": 5.383350648593738e-05, + "loss": 1.7056, + "step": 16008 + }, + { + "epoch": 4.913750767341927, + "grad_norm": 0.23319020867347717, + "learning_rate": 5.382855053791919e-05, + "loss": 1.7356, + "step": 16009 + }, + { + "epoch": 4.9140577041129525, + "grad_norm": 0.2232198268175125, + "learning_rate": 5.382359455206499e-05, + "loss": 1.7375, + "step": 16010 + }, + { + "epoch": 4.914364640883978, + "grad_norm": 0.24420048296451569, + "learning_rate": 5.381863852842381e-05, + "loss": 1.8287, + "step": 16011 + }, + { + "epoch": 4.914671577655003, + "grad_norm": 0.22653080523014069, + "learning_rate": 5.381368246704461e-05, + "loss": 1.7137, + "step": 16012 + }, + { + "epoch": 4.9149785144260285, + "grad_norm": 0.20439405739307404, + "learning_rate": 5.380872636797637e-05, + "loss": 1.7688, + "step": 16013 + }, + { + "epoch": 4.915285451197054, + "grad_norm": 0.2602155804634094, + "learning_rate": 5.380377023126806e-05, + "loss": 1.7875, + "step": 16014 + }, + { + "epoch": 4.915592387968078, + "grad_norm": 0.2757892608642578, + "learning_rate": 5.3798814056968647e-05, + "loss": 1.7446, + "step": 16015 + }, + { + "epoch": 4.915899324739104, + "grad_norm": 0.25938209891319275, + "learning_rate": 5.379385784512714e-05, + "loss": 1.6997, + "step": 16016 + }, + { + "epoch": 4.916206261510129, + "grad_norm": 0.2056962549686432, + 
"learning_rate": 5.37889015957925e-05, + "loss": 1.6961, + "step": 16017 + }, + { + "epoch": 4.916513198281154, + "grad_norm": 0.24388402700424194, + "learning_rate": 5.3783945309013714e-05, + "loss": 1.712, + "step": 16018 + }, + { + "epoch": 4.91682013505218, + "grad_norm": 0.2381993532180786, + "learning_rate": 5.3778988984839775e-05, + "loss": 1.7444, + "step": 16019 + }, + { + "epoch": 4.917127071823204, + "grad_norm": 0.20201562345027924, + "learning_rate": 5.377403262331964e-05, + "loss": 1.7254, + "step": 16020 + }, + { + "epoch": 4.917434008594229, + "grad_norm": 0.24019409716129303, + "learning_rate": 5.376907622450229e-05, + "loss": 1.684, + "step": 16021 + }, + { + "epoch": 4.917740945365255, + "grad_norm": 0.2441694289445877, + "learning_rate": 5.376411978843674e-05, + "loss": 1.7334, + "step": 16022 + }, + { + "epoch": 4.91804788213628, + "grad_norm": 0.23866300284862518, + "learning_rate": 5.3759163315171945e-05, + "loss": 1.7258, + "step": 16023 + }, + { + "epoch": 4.918354818907305, + "grad_norm": 0.28068670630455017, + "learning_rate": 5.375420680475689e-05, + "loss": 1.8049, + "step": 16024 + }, + { + "epoch": 4.918661755678331, + "grad_norm": 0.2956274151802063, + "learning_rate": 5.3749250257240566e-05, + "loss": 1.8544, + "step": 16025 + }, + { + "epoch": 4.918968692449355, + "grad_norm": 0.1971627175807953, + "learning_rate": 5.374429367267196e-05, + "loss": 1.7314, + "step": 16026 + }, + { + "epoch": 4.9192756292203805, + "grad_norm": 0.28565749526023865, + "learning_rate": 5.373933705110004e-05, + "loss": 1.7587, + "step": 16027 + }, + { + "epoch": 4.919582565991406, + "grad_norm": 0.3087369501590729, + "learning_rate": 5.37343803925738e-05, + "loss": 1.7708, + "step": 16028 + }, + { + "epoch": 4.919889502762431, + "grad_norm": 0.22460010647773743, + "learning_rate": 5.372942369714223e-05, + "loss": 1.7401, + "step": 16029 + }, + { + "epoch": 4.920196439533456, + "grad_norm": 0.29492735862731934, + "learning_rate": 5.3724466964854326e-05, + 
"loss": 1.7033, + "step": 16030 + }, + { + "epoch": 4.920503376304481, + "grad_norm": 0.24452674388885498, + "learning_rate": 5.371951019575904e-05, + "loss": 1.7688, + "step": 16031 + }, + { + "epoch": 4.920810313075506, + "grad_norm": 0.24686957895755768, + "learning_rate": 5.3714553389905366e-05, + "loss": 1.7463, + "step": 16032 + }, + { + "epoch": 4.921117249846532, + "grad_norm": 0.23661597073078156, + "learning_rate": 5.37095965473423e-05, + "loss": 1.7256, + "step": 16033 + }, + { + "epoch": 4.921424186617557, + "grad_norm": 0.22861288487911224, + "learning_rate": 5.370463966811884e-05, + "loss": 1.7722, + "step": 16034 + }, + { + "epoch": 4.921731123388582, + "grad_norm": 0.2453136146068573, + "learning_rate": 5.3699682752283944e-05, + "loss": 1.7343, + "step": 16035 + }, + { + "epoch": 4.922038060159607, + "grad_norm": 0.25267064571380615, + "learning_rate": 5.369472579988663e-05, + "loss": 1.7817, + "step": 16036 + }, + { + "epoch": 4.922344996930632, + "grad_norm": 0.25301575660705566, + "learning_rate": 5.368976881097586e-05, + "loss": 1.8146, + "step": 16037 + }, + { + "epoch": 4.922651933701657, + "grad_norm": 0.23579831421375275, + "learning_rate": 5.368481178560062e-05, + "loss": 1.8089, + "step": 16038 + }, + { + "epoch": 4.922958870472683, + "grad_norm": 0.2181949019432068, + "learning_rate": 5.367985472380993e-05, + "loss": 1.7689, + "step": 16039 + }, + { + "epoch": 4.923265807243708, + "grad_norm": 0.24622827768325806, + "learning_rate": 5.367489762565276e-05, + "loss": 1.791, + "step": 16040 + }, + { + "epoch": 4.9235727440147325, + "grad_norm": 0.2545134723186493, + "learning_rate": 5.3669940491178084e-05, + "loss": 1.738, + "step": 16041 + }, + { + "epoch": 4.923879680785758, + "grad_norm": 0.258139431476593, + "learning_rate": 5.366498332043491e-05, + "loss": 1.8303, + "step": 16042 + }, + { + "epoch": 4.924186617556783, + "grad_norm": 0.23804105818271637, + "learning_rate": 5.366002611347223e-05, + "loss": 1.751, + "step": 16043 + }, + { 
+ "epoch": 4.9244935543278086, + "grad_norm": 0.2354477345943451, + "learning_rate": 5.365506887033901e-05, + "loss": 1.7911, + "step": 16044 + }, + { + "epoch": 4.924800491098834, + "grad_norm": 0.22212550044059753, + "learning_rate": 5.3650111591084276e-05, + "loss": 1.7439, + "step": 16045 + }, + { + "epoch": 4.925107427869859, + "grad_norm": 0.23621168732643127, + "learning_rate": 5.3645154275756984e-05, + "loss": 1.7339, + "step": 16046 + }, + { + "epoch": 4.925414364640884, + "grad_norm": 0.2163209468126297, + "learning_rate": 5.364019692440616e-05, + "loss": 1.7247, + "step": 16047 + }, + { + "epoch": 4.925721301411909, + "grad_norm": 0.21352291107177734, + "learning_rate": 5.3635239537080774e-05, + "loss": 1.7431, + "step": 16048 + }, + { + "epoch": 4.926028238182934, + "grad_norm": 0.3170754909515381, + "learning_rate": 5.36302821138298e-05, + "loss": 1.8075, + "step": 16049 + }, + { + "epoch": 4.92633517495396, + "grad_norm": 0.27073633670806885, + "learning_rate": 5.362532465470226e-05, + "loss": 1.7209, + "step": 16050 + }, + { + "epoch": 4.926642111724985, + "grad_norm": 0.2677803039550781, + "learning_rate": 5.362036715974714e-05, + "loss": 1.7454, + "step": 16051 + }, + { + "epoch": 4.9269490484960095, + "grad_norm": 0.3555704355239868, + "learning_rate": 5.3615409629013436e-05, + "loss": 1.7737, + "step": 16052 + }, + { + "epoch": 4.927255985267035, + "grad_norm": 0.2819947302341461, + "learning_rate": 5.3610452062550124e-05, + "loss": 1.7588, + "step": 16053 + }, + { + "epoch": 4.92756292203806, + "grad_norm": 0.26638996601104736, + "learning_rate": 5.360549446040621e-05, + "loss": 1.8078, + "step": 16054 + }, + { + "epoch": 4.9278698588090855, + "grad_norm": 0.37828773260116577, + "learning_rate": 5.360053682263069e-05, + "loss": 1.7527, + "step": 16055 + }, + { + "epoch": 4.928176795580111, + "grad_norm": 0.35836395621299744, + "learning_rate": 5.359557914927254e-05, + "loss": 1.7199, + "step": 16056 + }, + { + "epoch": 4.928483732351136, + 
"grad_norm": 0.2720802128314972, + "learning_rate": 5.359062144038078e-05, + "loss": 1.7598, + "step": 16057 + }, + { + "epoch": 4.928790669122161, + "grad_norm": 0.36662939190864563, + "learning_rate": 5.358566369600441e-05, + "loss": 1.7199, + "step": 16058 + }, + { + "epoch": 4.929097605893186, + "grad_norm": 0.42243221402168274, + "learning_rate": 5.3580705916192395e-05, + "loss": 1.7584, + "step": 16059 + }, + { + "epoch": 4.929404542664211, + "grad_norm": 0.21667765080928802, + "learning_rate": 5.357574810099375e-05, + "loss": 1.7608, + "step": 16060 + }, + { + "epoch": 4.929711479435237, + "grad_norm": 0.48101645708084106, + "learning_rate": 5.3570790250457456e-05, + "loss": 1.8157, + "step": 16061 + }, + { + "epoch": 4.930018416206261, + "grad_norm": 0.5289245843887329, + "learning_rate": 5.356583236463253e-05, + "loss": 1.7173, + "step": 16062 + }, + { + "epoch": 4.930325352977286, + "grad_norm": 0.21454930305480957, + "learning_rate": 5.356087444356795e-05, + "loss": 1.7399, + "step": 16063 + }, + { + "epoch": 4.930632289748312, + "grad_norm": 0.5648324489593506, + "learning_rate": 5.355591648731274e-05, + "loss": 1.7814, + "step": 16064 + }, + { + "epoch": 4.930939226519337, + "grad_norm": 0.5669483542442322, + "learning_rate": 5.355095849591587e-05, + "loss": 1.7769, + "step": 16065 + }, + { + "epoch": 4.931246163290362, + "grad_norm": 0.33108505606651306, + "learning_rate": 5.354600046942635e-05, + "loss": 1.7704, + "step": 16066 + }, + { + "epoch": 4.931553100061388, + "grad_norm": 0.31149306893348694, + "learning_rate": 5.3541042407893164e-05, + "loss": 1.7631, + "step": 16067 + }, + { + "epoch": 4.931860036832412, + "grad_norm": 0.30377596616744995, + "learning_rate": 5.353608431136532e-05, + "loss": 1.7888, + "step": 16068 + }, + { + "epoch": 4.9321669736034375, + "grad_norm": 0.25041452050209045, + "learning_rate": 5.3531126179891825e-05, + "loss": 1.7507, + "step": 16069 + }, + { + "epoch": 4.932473910374463, + "grad_norm": 0.33900725841522217, + 
"learning_rate": 5.352616801352167e-05, + "loss": 1.7365, + "step": 16070 + }, + { + "epoch": 4.932780847145488, + "grad_norm": 0.23939846456050873, + "learning_rate": 5.352120981230386e-05, + "loss": 1.7934, + "step": 16071 + }, + { + "epoch": 4.9330877839165135, + "grad_norm": 0.2419881969690323, + "learning_rate": 5.351625157628739e-05, + "loss": 1.7555, + "step": 16072 + }, + { + "epoch": 4.933394720687538, + "grad_norm": 0.3517596423625946, + "learning_rate": 5.351129330552125e-05, + "loss": 1.7102, + "step": 16073 + }, + { + "epoch": 4.933701657458563, + "grad_norm": 0.2660250663757324, + "learning_rate": 5.350633500005446e-05, + "loss": 1.7692, + "step": 16074 + }, + { + "epoch": 4.934008594229589, + "grad_norm": 0.20726454257965088, + "learning_rate": 5.350137665993601e-05, + "loss": 1.718, + "step": 16075 + }, + { + "epoch": 4.934315531000614, + "grad_norm": 0.28218522667884827, + "learning_rate": 5.3496418285214914e-05, + "loss": 1.8402, + "step": 16076 + }, + { + "epoch": 4.934622467771639, + "grad_norm": 0.2142515480518341, + "learning_rate": 5.349145987594015e-05, + "loss": 1.7571, + "step": 16077 + }, + { + "epoch": 4.934929404542665, + "grad_norm": 0.2777026891708374, + "learning_rate": 5.348650143216074e-05, + "loss": 1.7617, + "step": 16078 + }, + { + "epoch": 4.935236341313689, + "grad_norm": 0.24057620763778687, + "learning_rate": 5.348154295392567e-05, + "loss": 1.7149, + "step": 16079 + }, + { + "epoch": 4.935543278084714, + "grad_norm": 0.22220350801944733, + "learning_rate": 5.3476584441283964e-05, + "loss": 1.7402, + "step": 16080 + }, + { + "epoch": 4.93585021485574, + "grad_norm": 0.2451290488243103, + "learning_rate": 5.347162589428462e-05, + "loss": 1.7004, + "step": 16081 + }, + { + "epoch": 4.936157151626765, + "grad_norm": 0.25621771812438965, + "learning_rate": 5.3466667312976625e-05, + "loss": 1.7765, + "step": 16082 + }, + { + "epoch": 4.93646408839779, + "grad_norm": 0.217393159866333, + "learning_rate": 5.346170869740899e-05, + 
"loss": 1.7695, + "step": 16083 + }, + { + "epoch": 4.936771025168815, + "grad_norm": 0.21248537302017212, + "learning_rate": 5.345675004763071e-05, + "loss": 1.7277, + "step": 16084 + }, + { + "epoch": 4.93707796193984, + "grad_norm": 0.19431474804878235, + "learning_rate": 5.3451791363690805e-05, + "loss": 1.7352, + "step": 16085 + }, + { + "epoch": 4.9373848987108655, + "grad_norm": 0.20233909785747528, + "learning_rate": 5.344683264563829e-05, + "loss": 1.71, + "step": 16086 + }, + { + "epoch": 4.937691835481891, + "grad_norm": 0.2199622094631195, + "learning_rate": 5.344187389352214e-05, + "loss": 1.7443, + "step": 16087 + }, + { + "epoch": 4.937998772252916, + "grad_norm": 0.23495158553123474, + "learning_rate": 5.343691510739138e-05, + "loss": 1.7758, + "step": 16088 + }, + { + "epoch": 4.9383057090239415, + "grad_norm": 0.228348970413208, + "learning_rate": 5.3431956287295015e-05, + "loss": 1.7645, + "step": 16089 + }, + { + "epoch": 4.938612645794966, + "grad_norm": 0.2337537258863449, + "learning_rate": 5.342699743328203e-05, + "loss": 1.7353, + "step": 16090 + }, + { + "epoch": 4.938919582565991, + "grad_norm": 0.1899309754371643, + "learning_rate": 5.3422038545401454e-05, + "loss": 1.6907, + "step": 16091 + }, + { + "epoch": 4.939226519337017, + "grad_norm": 0.2479192316532135, + "learning_rate": 5.341707962370229e-05, + "loss": 1.7961, + "step": 16092 + }, + { + "epoch": 4.939533456108042, + "grad_norm": 0.2444314956665039, + "learning_rate": 5.341212066823355e-05, + "loss": 1.7768, + "step": 16093 + }, + { + "epoch": 4.939840392879067, + "grad_norm": 0.2123393714427948, + "learning_rate": 5.340716167904423e-05, + "loss": 1.7617, + "step": 16094 + }, + { + "epoch": 4.940147329650092, + "grad_norm": 0.20779116451740265, + "learning_rate": 5.340220265618334e-05, + "loss": 1.6951, + "step": 16095 + }, + { + "epoch": 4.940454266421117, + "grad_norm": 0.22189265489578247, + "learning_rate": 5.3397243599699884e-05, + "loss": 1.8368, + "step": 16096 + }, + { 
+ "epoch": 4.940761203192142, + "grad_norm": 0.22316497564315796, + "learning_rate": 5.3392284509642875e-05, + "loss": 1.7096, + "step": 16097 + }, + { + "epoch": 4.941068139963168, + "grad_norm": 0.20406664907932281, + "learning_rate": 5.3387325386061346e-05, + "loss": 1.7269, + "step": 16098 + }, + { + "epoch": 4.941375076734193, + "grad_norm": 0.263007789850235, + "learning_rate": 5.338236622900427e-05, + "loss": 1.7663, + "step": 16099 + }, + { + "epoch": 4.941682013505218, + "grad_norm": 0.24388311803340912, + "learning_rate": 5.3377407038520654e-05, + "loss": 1.7113, + "step": 16100 + }, + { + "epoch": 4.941988950276243, + "grad_norm": 0.21918313205242157, + "learning_rate": 5.3372447814659524e-05, + "loss": 1.775, + "step": 16101 + }, + { + "epoch": 4.942295887047268, + "grad_norm": 0.30842962861061096, + "learning_rate": 5.336748855746989e-05, + "loss": 1.8229, + "step": 16102 + }, + { + "epoch": 4.9426028238182935, + "grad_norm": 0.2875657379627228, + "learning_rate": 5.336252926700077e-05, + "loss": 1.7377, + "step": 16103 + }, + { + "epoch": 4.942909760589319, + "grad_norm": 0.23411425948143005, + "learning_rate": 5.3357569943301156e-05, + "loss": 1.754, + "step": 16104 + }, + { + "epoch": 4.943216697360343, + "grad_norm": 0.29758864641189575, + "learning_rate": 5.335261058642007e-05, + "loss": 1.7471, + "step": 16105 + }, + { + "epoch": 4.943523634131369, + "grad_norm": 0.31761085987091064, + "learning_rate": 5.3347651196406534e-05, + "loss": 1.7658, + "step": 16106 + }, + { + "epoch": 4.943830570902394, + "grad_norm": 0.2487023025751114, + "learning_rate": 5.334269177330952e-05, + "loss": 1.786, + "step": 16107 + }, + { + "epoch": 4.944137507673419, + "grad_norm": 0.23954913020133972, + "learning_rate": 5.333773231717808e-05, + "loss": 1.8486, + "step": 16108 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.24893096089363098, + "learning_rate": 5.3332772828061214e-05, + "loss": 1.7927, + "step": 16109 + }, + { + "epoch": 4.94475138121547, + 
"grad_norm": 0.28653839230537415, + "learning_rate": 5.332781330600795e-05, + "loss": 1.8331, + "step": 16110 + }, + { + "epoch": 4.945058317986494, + "grad_norm": 0.2597404718399048, + "learning_rate": 5.332285375106726e-05, + "loss": 1.7128, + "step": 16111 + }, + { + "epoch": 4.94536525475752, + "grad_norm": 0.23813198506832123, + "learning_rate": 5.3317894163288196e-05, + "loss": 1.7483, + "step": 16112 + }, + { + "epoch": 4.945672191528545, + "grad_norm": 0.2545793652534485, + "learning_rate": 5.331293454271974e-05, + "loss": 1.7987, + "step": 16113 + }, + { + "epoch": 4.94597912829957, + "grad_norm": 0.2453712821006775, + "learning_rate": 5.330797488941095e-05, + "loss": 1.7376, + "step": 16114 + }, + { + "epoch": 4.946286065070596, + "grad_norm": 0.20583751797676086, + "learning_rate": 5.33030152034108e-05, + "loss": 1.7038, + "step": 16115 + }, + { + "epoch": 4.94659300184162, + "grad_norm": 0.22557811439037323, + "learning_rate": 5.3298055484768313e-05, + "loss": 1.6999, + "step": 16116 + }, + { + "epoch": 4.9468999386126455, + "grad_norm": 0.23163801431655884, + "learning_rate": 5.329309573353252e-05, + "loss": 1.7575, + "step": 16117 + }, + { + "epoch": 4.947206875383671, + "grad_norm": 0.3560176491737366, + "learning_rate": 5.3288135949752394e-05, + "loss": 1.8494, + "step": 16118 + }, + { + "epoch": 4.947513812154696, + "grad_norm": 0.306379109621048, + "learning_rate": 5.328317613347701e-05, + "loss": 1.7229, + "step": 16119 + }, + { + "epoch": 4.9478207489257215, + "grad_norm": 0.24428823590278625, + "learning_rate": 5.3278216284755344e-05, + "loss": 1.7939, + "step": 16120 + }, + { + "epoch": 4.948127685696747, + "grad_norm": 0.22251521050930023, + "learning_rate": 5.327325640363643e-05, + "loss": 1.7624, + "step": 16121 + }, + { + "epoch": 4.948434622467771, + "grad_norm": 0.23310889303684235, + "learning_rate": 5.326829649016928e-05, + "loss": 1.7727, + "step": 16122 + }, + { + "epoch": 4.948741559238797, + "grad_norm": 0.22457881271839142, + 
"learning_rate": 5.326333654440291e-05, + "loss": 1.7602, + "step": 16123 + }, + { + "epoch": 4.949048496009822, + "grad_norm": 0.24032343924045563, + "learning_rate": 5.325837656638631e-05, + "loss": 1.7591, + "step": 16124 + }, + { + "epoch": 4.949355432780847, + "grad_norm": 0.25082892179489136, + "learning_rate": 5.3253416556168546e-05, + "loss": 1.7745, + "step": 16125 + }, + { + "epoch": 4.949662369551873, + "grad_norm": 0.22859038412570953, + "learning_rate": 5.3248456513798615e-05, + "loss": 1.7475, + "step": 16126 + }, + { + "epoch": 4.949969306322897, + "grad_norm": 0.27282553911209106, + "learning_rate": 5.3243496439325525e-05, + "loss": 1.7438, + "step": 16127 + }, + { + "epoch": 4.9502762430939224, + "grad_norm": 0.23622353374958038, + "learning_rate": 5.3238536332798303e-05, + "loss": 1.7625, + "step": 16128 + }, + { + "epoch": 4.950583179864948, + "grad_norm": 0.28060024976730347, + "learning_rate": 5.3233576194265975e-05, + "loss": 1.8028, + "step": 16129 + }, + { + "epoch": 4.950890116635973, + "grad_norm": 0.33281829953193665, + "learning_rate": 5.322861602377755e-05, + "loss": 1.7163, + "step": 16130 + }, + { + "epoch": 4.9511970534069984, + "grad_norm": 0.26457497477531433, + "learning_rate": 5.322365582138203e-05, + "loss": 1.7347, + "step": 16131 + }, + { + "epoch": 4.951503990178024, + "grad_norm": 0.21651674807071686, + "learning_rate": 5.3218695587128476e-05, + "loss": 1.7123, + "step": 16132 + }, + { + "epoch": 4.951810926949048, + "grad_norm": 0.2299882024526596, + "learning_rate": 5.3213735321065885e-05, + "loss": 1.775, + "step": 16133 + }, + { + "epoch": 4.952117863720074, + "grad_norm": 0.2252396047115326, + "learning_rate": 5.3208775023243265e-05, + "loss": 1.7598, + "step": 16134 + }, + { + "epoch": 4.952424800491099, + "grad_norm": 0.2263660430908203, + "learning_rate": 5.3203814693709655e-05, + "loss": 1.7519, + "step": 16135 + }, + { + "epoch": 4.952731737262124, + "grad_norm": 0.2425432950258255, + "learning_rate": 
5.3198854332514056e-05, + "loss": 1.7769, + "step": 16136 + }, + { + "epoch": 4.953038674033149, + "grad_norm": 0.22624996304512024, + "learning_rate": 5.319389393970553e-05, + "loss": 1.7686, + "step": 16137 + }, + { + "epoch": 4.953345610804174, + "grad_norm": 0.2240568846464157, + "learning_rate": 5.318893351533306e-05, + "loss": 1.7795, + "step": 16138 + }, + { + "epoch": 4.953652547575199, + "grad_norm": 0.21708132326602936, + "learning_rate": 5.318397305944568e-05, + "loss": 1.7348, + "step": 16139 + }, + { + "epoch": 4.953959484346225, + "grad_norm": 0.2263328731060028, + "learning_rate": 5.3179012572092415e-05, + "loss": 1.7645, + "step": 16140 + }, + { + "epoch": 4.95426642111725, + "grad_norm": 0.2541986107826233, + "learning_rate": 5.3174052053322274e-05, + "loss": 1.723, + "step": 16141 + }, + { + "epoch": 4.954573357888275, + "grad_norm": 0.25829461216926575, + "learning_rate": 5.316909150318429e-05, + "loss": 1.7469, + "step": 16142 + }, + { + "epoch": 4.9548802946593, + "grad_norm": 0.21251125633716583, + "learning_rate": 5.3164130921727494e-05, + "loss": 1.7699, + "step": 16143 + }, + { + "epoch": 4.955187231430325, + "grad_norm": 0.29195618629455566, + "learning_rate": 5.315917030900091e-05, + "loss": 1.7373, + "step": 16144 + }, + { + "epoch": 4.9554941682013505, + "grad_norm": 0.29457888007164, + "learning_rate": 5.315420966505355e-05, + "loss": 1.7202, + "step": 16145 + }, + { + "epoch": 4.955801104972376, + "grad_norm": 0.19679461419582367, + "learning_rate": 5.314924898993443e-05, + "loss": 1.75, + "step": 16146 + }, + { + "epoch": 4.956108041743401, + "grad_norm": 0.287955105304718, + "learning_rate": 5.314428828369259e-05, + "loss": 1.7385, + "step": 16147 + }, + { + "epoch": 4.956414978514426, + "grad_norm": 0.3081825375556946, + "learning_rate": 5.313932754637706e-05, + "loss": 1.7558, + "step": 16148 + }, + { + "epoch": 4.956721915285451, + "grad_norm": 0.25226521492004395, + "learning_rate": 5.3134366778036846e-05, + "loss": 1.8407, + 
"step": 16149 + }, + { + "epoch": 4.957028852056476, + "grad_norm": 0.43601852655410767, + "learning_rate": 5.3129405978720984e-05, + "loss": 1.7762, + "step": 16150 + }, + { + "epoch": 4.957335788827502, + "grad_norm": 0.3630274832248688, + "learning_rate": 5.31244451484785e-05, + "loss": 1.7802, + "step": 16151 + }, + { + "epoch": 4.957642725598527, + "grad_norm": 0.21337948739528656, + "learning_rate": 5.311948428735841e-05, + "loss": 1.7107, + "step": 16152 + }, + { + "epoch": 4.957949662369552, + "grad_norm": 0.38581085205078125, + "learning_rate": 5.311452339540974e-05, + "loss": 1.7583, + "step": 16153 + }, + { + "epoch": 4.958256599140577, + "grad_norm": 0.28447309136390686, + "learning_rate": 5.310956247268154e-05, + "loss": 1.6992, + "step": 16154 + }, + { + "epoch": 4.958563535911602, + "grad_norm": 0.24510730803012848, + "learning_rate": 5.310460151922283e-05, + "loss": 1.7059, + "step": 16155 + }, + { + "epoch": 4.958870472682627, + "grad_norm": 0.41670146584510803, + "learning_rate": 5.309964053508262e-05, + "loss": 1.7191, + "step": 16156 + }, + { + "epoch": 4.959177409453653, + "grad_norm": 0.3123849034309387, + "learning_rate": 5.309467952030993e-05, + "loss": 1.7161, + "step": 16157 + }, + { + "epoch": 4.959484346224678, + "grad_norm": 0.2275281697511673, + "learning_rate": 5.308971847495382e-05, + "loss": 1.722, + "step": 16158 + }, + { + "epoch": 4.9597912829957025, + "grad_norm": 0.40216436982154846, + "learning_rate": 5.308475739906329e-05, + "loss": 1.7477, + "step": 16159 + }, + { + "epoch": 4.960098219766728, + "grad_norm": 0.259981244802475, + "learning_rate": 5.307979629268739e-05, + "loss": 1.7384, + "step": 16160 + }, + { + "epoch": 4.960405156537753, + "grad_norm": 0.22969573736190796, + "learning_rate": 5.3074835155875134e-05, + "loss": 1.7328, + "step": 16161 + }, + { + "epoch": 4.9607120933087785, + "grad_norm": 0.2773746848106384, + "learning_rate": 5.3069873988675556e-05, + "loss": 1.7333, + "step": 16162 + }, + { + "epoch": 
4.961019030079804, + "grad_norm": 0.2764189541339874, + "learning_rate": 5.306491279113768e-05, + "loss": 1.7956, + "step": 16163 + }, + { + "epoch": 4.961325966850829, + "grad_norm": 0.3640958070755005, + "learning_rate": 5.305995156331054e-05, + "loss": 1.7464, + "step": 16164 + }, + { + "epoch": 4.961632903621854, + "grad_norm": 0.3573450446128845, + "learning_rate": 5.305499030524317e-05, + "loss": 1.75, + "step": 16165 + }, + { + "epoch": 4.961939840392879, + "grad_norm": 0.24313980340957642, + "learning_rate": 5.305002901698459e-05, + "loss": 1.7505, + "step": 16166 + }, + { + "epoch": 4.962246777163904, + "grad_norm": 0.3417615592479706, + "learning_rate": 5.304506769858384e-05, + "loss": 1.7387, + "step": 16167 + }, + { + "epoch": 4.96255371393493, + "grad_norm": 0.23209623992443085, + "learning_rate": 5.304010635008995e-05, + "loss": 1.7111, + "step": 16168 + }, + { + "epoch": 4.962860650705955, + "grad_norm": 0.2994776666164398, + "learning_rate": 5.3035144971551944e-05, + "loss": 1.75, + "step": 16169 + }, + { + "epoch": 4.963167587476979, + "grad_norm": 0.3147084712982178, + "learning_rate": 5.303018356301884e-05, + "loss": 1.7598, + "step": 16170 + }, + { + "epoch": 4.963474524248005, + "grad_norm": 0.20136526226997375, + "learning_rate": 5.30252221245397e-05, + "loss": 1.7217, + "step": 16171 + }, + { + "epoch": 4.96378146101903, + "grad_norm": 0.3308684229850769, + "learning_rate": 5.302026065616355e-05, + "loss": 1.7554, + "step": 16172 + }, + { + "epoch": 4.964088397790055, + "grad_norm": 0.22890877723693848, + "learning_rate": 5.30152991579394e-05, + "loss": 1.7598, + "step": 16173 + }, + { + "epoch": 4.964395334561081, + "grad_norm": 0.3036035895347595, + "learning_rate": 5.301033762991631e-05, + "loss": 1.758, + "step": 16174 + }, + { + "epoch": 4.964702271332106, + "grad_norm": 0.2983579933643341, + "learning_rate": 5.300537607214329e-05, + "loss": 1.8132, + "step": 16175 + }, + { + "epoch": 4.9650092081031305, + "grad_norm": 
0.21401815116405487, + "learning_rate": 5.300041448466937e-05, + "loss": 1.7179, + "step": 16176 + }, + { + "epoch": 4.965316144874156, + "grad_norm": 0.2939651608467102, + "learning_rate": 5.2995452867543606e-05, + "loss": 1.7928, + "step": 16177 + }, + { + "epoch": 4.965623081645181, + "grad_norm": 0.24803484976291656, + "learning_rate": 5.2990491220815034e-05, + "loss": 1.7366, + "step": 16178 + }, + { + "epoch": 4.9659300184162065, + "grad_norm": 0.1999569535255432, + "learning_rate": 5.2985529544532656e-05, + "loss": 1.6691, + "step": 16179 + }, + { + "epoch": 4.966236955187231, + "grad_norm": 0.22315269708633423, + "learning_rate": 5.298056783874553e-05, + "loss": 1.7693, + "step": 16180 + }, + { + "epoch": 4.966543891958256, + "grad_norm": 0.22688794136047363, + "learning_rate": 5.2975606103502694e-05, + "loss": 1.8401, + "step": 16181 + }, + { + "epoch": 4.966850828729282, + "grad_norm": 0.2592024505138397, + "learning_rate": 5.297064433885317e-05, + "loss": 1.8054, + "step": 16182 + }, + { + "epoch": 4.967157765500307, + "grad_norm": 0.2508920133113861, + "learning_rate": 5.2965682544846e-05, + "loss": 1.766, + "step": 16183 + }, + { + "epoch": 4.967464702271332, + "grad_norm": 0.22318799793720245, + "learning_rate": 5.296072072153022e-05, + "loss": 1.751, + "step": 16184 + }, + { + "epoch": 4.967771639042358, + "grad_norm": 0.2348448485136032, + "learning_rate": 5.2955758868954855e-05, + "loss": 1.7844, + "step": 16185 + }, + { + "epoch": 4.968078575813382, + "grad_norm": 0.23294343054294586, + "learning_rate": 5.295079698716895e-05, + "loss": 1.7685, + "step": 16186 + }, + { + "epoch": 4.968385512584407, + "grad_norm": 0.20854508876800537, + "learning_rate": 5.2945835076221526e-05, + "loss": 1.6914, + "step": 16187 + }, + { + "epoch": 4.968692449355433, + "grad_norm": 0.21952031552791595, + "learning_rate": 5.294087313616165e-05, + "loss": 1.7121, + "step": 16188 + }, + { + "epoch": 4.968999386126458, + "grad_norm": 0.24097788333892822, + 
"learning_rate": 5.2935911167038346e-05, + "loss": 1.7712, + "step": 16189 + }, + { + "epoch": 4.969306322897483, + "grad_norm": 0.24433603882789612, + "learning_rate": 5.293094916890063e-05, + "loss": 1.7608, + "step": 16190 + }, + { + "epoch": 4.969613259668508, + "grad_norm": 0.22209061682224274, + "learning_rate": 5.292598714179757e-05, + "loss": 1.7563, + "step": 16191 + }, + { + "epoch": 4.969920196439533, + "grad_norm": 0.24291595816612244, + "learning_rate": 5.29210250857782e-05, + "loss": 1.7765, + "step": 16192 + }, + { + "epoch": 4.9702271332105585, + "grad_norm": 0.3143673837184906, + "learning_rate": 5.291606300089151e-05, + "loss": 1.7945, + "step": 16193 + }, + { + "epoch": 4.970534069981584, + "grad_norm": 0.22693613171577454, + "learning_rate": 5.291110088718661e-05, + "loss": 1.7411, + "step": 16194 + }, + { + "epoch": 4.970841006752609, + "grad_norm": 0.2271365374326706, + "learning_rate": 5.2906138744712494e-05, + "loss": 1.7754, + "step": 16195 + }, + { + "epoch": 4.9711479435236345, + "grad_norm": 0.2428499162197113, + "learning_rate": 5.290117657351822e-05, + "loss": 1.8007, + "step": 16196 + }, + { + "epoch": 4.971454880294659, + "grad_norm": 0.21862711012363434, + "learning_rate": 5.289621437365281e-05, + "loss": 1.7484, + "step": 16197 + }, + { + "epoch": 4.971761817065684, + "grad_norm": 0.26744964718818665, + "learning_rate": 5.2891252145165315e-05, + "loss": 1.7759, + "step": 16198 + }, + { + "epoch": 4.97206875383671, + "grad_norm": 0.2608526647090912, + "learning_rate": 5.288628988810477e-05, + "loss": 1.8527, + "step": 16199 + }, + { + "epoch": 4.972375690607735, + "grad_norm": 0.2245805710554123, + "learning_rate": 5.2881327602520216e-05, + "loss": 1.7773, + "step": 16200 + }, + { + "epoch": 4.97268262737876, + "grad_norm": 0.22023041546344757, + "learning_rate": 5.2876365288460694e-05, + "loss": 1.7101, + "step": 16201 + }, + { + "epoch": 4.972989564149785, + "grad_norm": 0.22034525871276855, + "learning_rate": 
5.287140294597525e-05, + "loss": 1.7672, + "step": 16202 + }, + { + "epoch": 4.97329650092081, + "grad_norm": 0.23101158440113068, + "learning_rate": 5.286644057511292e-05, + "loss": 1.741, + "step": 16203 + }, + { + "epoch": 4.973603437691835, + "grad_norm": 0.23050430417060852, + "learning_rate": 5.286147817592273e-05, + "loss": 1.7727, + "step": 16204 + }, + { + "epoch": 4.973910374462861, + "grad_norm": 0.21803520619869232, + "learning_rate": 5.285651574845374e-05, + "loss": 1.7353, + "step": 16205 + }, + { + "epoch": 4.974217311233886, + "grad_norm": 0.22252169251441956, + "learning_rate": 5.2851553292754995e-05, + "loss": 1.7658, + "step": 16206 + }, + { + "epoch": 4.974524248004911, + "grad_norm": 0.22458864748477936, + "learning_rate": 5.284659080887552e-05, + "loss": 1.7157, + "step": 16207 + }, + { + "epoch": 4.974831184775936, + "grad_norm": 0.20769210159778595, + "learning_rate": 5.2841628296864376e-05, + "loss": 1.7731, + "step": 16208 + }, + { + "epoch": 4.975138121546961, + "grad_norm": 0.1952340304851532, + "learning_rate": 5.283666575677059e-05, + "loss": 1.6907, + "step": 16209 + }, + { + "epoch": 4.975445058317987, + "grad_norm": 0.21943804621696472, + "learning_rate": 5.28317031886432e-05, + "loss": 1.8007, + "step": 16210 + }, + { + "epoch": 4.975751995089012, + "grad_norm": 0.21987493336200714, + "learning_rate": 5.2826740592531276e-05, + "loss": 1.7205, + "step": 16211 + }, + { + "epoch": 4.976058931860036, + "grad_norm": 0.2076522558927536, + "learning_rate": 5.2821777968483845e-05, + "loss": 1.7063, + "step": 16212 + }, + { + "epoch": 4.976365868631062, + "grad_norm": 0.19126583635807037, + "learning_rate": 5.281681531654994e-05, + "loss": 1.7118, + "step": 16213 + }, + { + "epoch": 4.976672805402087, + "grad_norm": 0.22308050096035004, + "learning_rate": 5.2811852636778625e-05, + "loss": 1.7565, + "step": 16214 + }, + { + "epoch": 4.976979742173112, + "grad_norm": 0.23187528550624847, + "learning_rate": 5.280688992921893e-05, + "loss": 
1.8261, + "step": 16215 + }, + { + "epoch": 4.977286678944138, + "grad_norm": 0.21373791992664337, + "learning_rate": 5.28019271939199e-05, + "loss": 1.6974, + "step": 16216 + }, + { + "epoch": 4.977593615715163, + "grad_norm": 0.21647346019744873, + "learning_rate": 5.2796964430930585e-05, + "loss": 1.7967, + "step": 16217 + }, + { + "epoch": 4.9779005524861875, + "grad_norm": 0.2231660932302475, + "learning_rate": 5.279200164030002e-05, + "loss": 1.7495, + "step": 16218 + }, + { + "epoch": 4.978207489257213, + "grad_norm": 0.2810545563697815, + "learning_rate": 5.278703882207728e-05, + "loss": 1.875, + "step": 16219 + }, + { + "epoch": 4.978514426028238, + "grad_norm": 0.298984557390213, + "learning_rate": 5.2782075976311374e-05, + "loss": 1.7494, + "step": 16220 + }, + { + "epoch": 4.9788213627992635, + "grad_norm": 0.2530893385410309, + "learning_rate": 5.2777113103051365e-05, + "loss": 1.7594, + "step": 16221 + }, + { + "epoch": 4.979128299570289, + "grad_norm": 0.26165664196014404, + "learning_rate": 5.277215020234629e-05, + "loss": 1.7543, + "step": 16222 + }, + { + "epoch": 4.979435236341313, + "grad_norm": 0.25115957856178284, + "learning_rate": 5.276718727424521e-05, + "loss": 1.7925, + "step": 16223 + }, + { + "epoch": 4.979742173112339, + "grad_norm": 0.22134126722812653, + "learning_rate": 5.276222431879716e-05, + "loss": 1.8359, + "step": 16224 + }, + { + "epoch": 4.980049109883364, + "grad_norm": 0.24447613954544067, + "learning_rate": 5.275726133605119e-05, + "loss": 1.7693, + "step": 16225 + }, + { + "epoch": 4.980356046654389, + "grad_norm": 0.23025095462799072, + "learning_rate": 5.275229832605635e-05, + "loss": 1.7911, + "step": 16226 + }, + { + "epoch": 4.980662983425415, + "grad_norm": 0.23424232006072998, + "learning_rate": 5.2747335288861686e-05, + "loss": 1.7628, + "step": 16227 + }, + { + "epoch": 4.98096992019644, + "grad_norm": 0.24598535895347595, + "learning_rate": 5.2742372224516235e-05, + "loss": 1.7651, + "step": 16228 + }, + { + 
"epoch": 4.981276856967464, + "grad_norm": 0.262893944978714, + "learning_rate": 5.273740913306906e-05, + "loss": 1.7282, + "step": 16229 + }, + { + "epoch": 4.98158379373849, + "grad_norm": 0.21981783211231232, + "learning_rate": 5.2732446014569207e-05, + "loss": 1.7448, + "step": 16230 + }, + { + "epoch": 4.981890730509515, + "grad_norm": 0.24244973063468933, + "learning_rate": 5.272748286906573e-05, + "loss": 1.7216, + "step": 16231 + }, + { + "epoch": 4.98219766728054, + "grad_norm": 0.2365221232175827, + "learning_rate": 5.272251969660766e-05, + "loss": 1.7227, + "step": 16232 + }, + { + "epoch": 4.982504604051566, + "grad_norm": 0.2081129401922226, + "learning_rate": 5.271755649724405e-05, + "loss": 1.7184, + "step": 16233 + }, + { + "epoch": 4.98281154082259, + "grad_norm": 0.2256374955177307, + "learning_rate": 5.271259327102395e-05, + "loss": 1.7412, + "step": 16234 + }, + { + "epoch": 4.9831184775936155, + "grad_norm": 0.23727381229400635, + "learning_rate": 5.270763001799643e-05, + "loss": 1.8095, + "step": 16235 + }, + { + "epoch": 4.983425414364641, + "grad_norm": 0.21498435735702515, + "learning_rate": 5.2702666738210504e-05, + "loss": 1.744, + "step": 16236 + }, + { + "epoch": 4.983732351135666, + "grad_norm": 0.24772173166275024, + "learning_rate": 5.269770343171525e-05, + "loss": 1.741, + "step": 16237 + }, + { + "epoch": 4.9840392879066915, + "grad_norm": 0.2835623621940613, + "learning_rate": 5.269274009855971e-05, + "loss": 1.7765, + "step": 16238 + }, + { + "epoch": 4.984346224677717, + "grad_norm": 0.2570044696331024, + "learning_rate": 5.2687776738792926e-05, + "loss": 1.8206, + "step": 16239 + }, + { + "epoch": 4.984653161448741, + "grad_norm": 0.21549640595912933, + "learning_rate": 5.268281335246397e-05, + "loss": 1.7022, + "step": 16240 + }, + { + "epoch": 4.984960098219767, + "grad_norm": 0.23158684372901917, + "learning_rate": 5.267784993962187e-05, + "loss": 1.7882, + "step": 16241 + }, + { + "epoch": 4.985267034990792, + "grad_norm": 
0.22778423130512238, + "learning_rate": 5.26728865003157e-05, + "loss": 1.7358, + "step": 16242 + }, + { + "epoch": 4.985573971761817, + "grad_norm": 0.23197145760059357, + "learning_rate": 5.266792303459449e-05, + "loss": 1.7687, + "step": 16243 + }, + { + "epoch": 4.985880908532843, + "grad_norm": 0.19270172715187073, + "learning_rate": 5.26629595425073e-05, + "loss": 1.6999, + "step": 16244 + }, + { + "epoch": 4.986187845303867, + "grad_norm": 0.25262632966041565, + "learning_rate": 5.2657996024103175e-05, + "loss": 1.7536, + "step": 16245 + }, + { + "epoch": 4.986494782074892, + "grad_norm": 0.18620926141738892, + "learning_rate": 5.2653032479431185e-05, + "loss": 1.7033, + "step": 16246 + }, + { + "epoch": 4.986801718845918, + "grad_norm": 0.19537273049354553, + "learning_rate": 5.2648068908540374e-05, + "loss": 1.7457, + "step": 16247 + }, + { + "epoch": 4.987108655616943, + "grad_norm": 0.19447599351406097, + "learning_rate": 5.26431053114798e-05, + "loss": 1.7053, + "step": 16248 + }, + { + "epoch": 4.987415592387968, + "grad_norm": 0.20431137084960938, + "learning_rate": 5.263814168829852e-05, + "loss": 1.7695, + "step": 16249 + }, + { + "epoch": 4.987722529158994, + "grad_norm": 0.21123024821281433, + "learning_rate": 5.263317803904554e-05, + "loss": 1.7666, + "step": 16250 + }, + { + "epoch": 4.988029465930018, + "grad_norm": 0.21279335021972656, + "learning_rate": 5.262821436376998e-05, + "loss": 1.7231, + "step": 16251 + }, + { + "epoch": 4.9883364027010435, + "grad_norm": 0.22504910826683044, + "learning_rate": 5.262325066252085e-05, + "loss": 1.7657, + "step": 16252 + }, + { + "epoch": 4.988643339472069, + "grad_norm": 0.23505981266498566, + "learning_rate": 5.261828693534723e-05, + "loss": 1.7576, + "step": 16253 + }, + { + "epoch": 4.988950276243094, + "grad_norm": 0.21553601324558258, + "learning_rate": 5.261332318229817e-05, + "loss": 1.7782, + "step": 16254 + }, + { + "epoch": 4.989257213014119, + "grad_norm": 0.29189521074295044, + 
"learning_rate": 5.26083594034227e-05, + "loss": 1.7664, + "step": 16255 + }, + { + "epoch": 4.989564149785144, + "grad_norm": 0.38108906149864197, + "learning_rate": 5.26033955987699e-05, + "loss": 1.8573, + "step": 16256 + }, + { + "epoch": 4.989871086556169, + "grad_norm": 0.30329224467277527, + "learning_rate": 5.2598431768388824e-05, + "loss": 1.7584, + "step": 16257 + }, + { + "epoch": 4.990178023327195, + "grad_norm": 0.2437417358160019, + "learning_rate": 5.259346791232852e-05, + "loss": 1.7352, + "step": 16258 + }, + { + "epoch": 4.99048496009822, + "grad_norm": 0.3601737320423126, + "learning_rate": 5.258850403063804e-05, + "loss": 1.7206, + "step": 16259 + }, + { + "epoch": 4.990791896869245, + "grad_norm": 0.20259195566177368, + "learning_rate": 5.258354012336646e-05, + "loss": 1.7403, + "step": 16260 + }, + { + "epoch": 4.99109883364027, + "grad_norm": 0.38022148609161377, + "learning_rate": 5.257857619056281e-05, + "loss": 1.7783, + "step": 16261 + }, + { + "epoch": 4.991405770411295, + "grad_norm": 0.30131712555885315, + "learning_rate": 5.257361223227615e-05, + "loss": 1.7826, + "step": 16262 + }, + { + "epoch": 4.99171270718232, + "grad_norm": 0.24159663915634155, + "learning_rate": 5.2568648248555565e-05, + "loss": 1.7792, + "step": 16263 + }, + { + "epoch": 4.992019643953346, + "grad_norm": 0.4641213119029999, + "learning_rate": 5.2563684239450084e-05, + "loss": 1.7432, + "step": 16264 + }, + { + "epoch": 4.992326580724371, + "grad_norm": 0.3526865541934967, + "learning_rate": 5.255872020500877e-05, + "loss": 1.7736, + "step": 16265 + }, + { + "epoch": 4.9926335174953955, + "grad_norm": 0.2396051585674286, + "learning_rate": 5.255375614528071e-05, + "loss": 1.7505, + "step": 16266 + }, + { + "epoch": 4.992940454266421, + "grad_norm": 0.320987343788147, + "learning_rate": 5.25487920603149e-05, + "loss": 1.8229, + "step": 16267 + }, + { + "epoch": 4.993247391037446, + "grad_norm": 0.24689678847789764, + "learning_rate": 5.254382795016044e-05, + 
"loss": 1.7011, + "step": 16268 + }, + { + "epoch": 4.9935543278084715, + "grad_norm": 0.2407137155532837, + "learning_rate": 5.253886381486639e-05, + "loss": 1.741, + "step": 16269 + }, + { + "epoch": 4.993861264579497, + "grad_norm": 0.3677252531051636, + "learning_rate": 5.25338996544818e-05, + "loss": 1.7792, + "step": 16270 + }, + { + "epoch": 4.994168201350522, + "grad_norm": 0.25096553564071655, + "learning_rate": 5.252893546905573e-05, + "loss": 1.7523, + "step": 16271 + }, + { + "epoch": 4.994475138121547, + "grad_norm": 0.2966327965259552, + "learning_rate": 5.252397125863723e-05, + "loss": 1.7114, + "step": 16272 + }, + { + "epoch": 4.994782074892572, + "grad_norm": 0.36577650904655457, + "learning_rate": 5.2519007023275356e-05, + "loss": 1.7609, + "step": 16273 + }, + { + "epoch": 4.995089011663597, + "grad_norm": 0.2450687140226364, + "learning_rate": 5.25140427630192e-05, + "loss": 1.7452, + "step": 16274 + }, + { + "epoch": 4.995395948434623, + "grad_norm": 0.20782120525836945, + "learning_rate": 5.250907847791778e-05, + "loss": 1.7109, + "step": 16275 + }, + { + "epoch": 4.995702885205648, + "grad_norm": 0.2423330545425415, + "learning_rate": 5.25041141680202e-05, + "loss": 1.7234, + "step": 16276 + }, + { + "epoch": 4.996009821976672, + "grad_norm": 0.20855975151062012, + "learning_rate": 5.2499149833375484e-05, + "loss": 1.7734, + "step": 16277 + }, + { + "epoch": 4.996316758747698, + "grad_norm": 0.24400894343852997, + "learning_rate": 5.24941854740327e-05, + "loss": 1.7566, + "step": 16278 + }, + { + "epoch": 4.996623695518723, + "grad_norm": 0.4378018379211426, + "learning_rate": 5.2489221090040906e-05, + "loss": 1.7536, + "step": 16279 + }, + { + "epoch": 4.996930632289748, + "grad_norm": 0.20726722478866577, + "learning_rate": 5.248425668144918e-05, + "loss": 1.8008, + "step": 16280 + }, + { + "epoch": 4.997237569060774, + "grad_norm": 0.2506333589553833, + "learning_rate": 5.247929224830658e-05, + "loss": 1.7404, + "step": 16281 + }, + { + 
"epoch": 4.997544505831799, + "grad_norm": 0.24178004264831543, + "learning_rate": 5.247432779066216e-05, + "loss": 1.7517, + "step": 16282 + }, + { + "epoch": 4.9978514426028235, + "grad_norm": 0.2500220835208893, + "learning_rate": 5.246936330856499e-05, + "loss": 1.7705, + "step": 16283 + }, + { + "epoch": 4.998158379373849, + "grad_norm": 0.30043718218803406, + "learning_rate": 5.24643988020641e-05, + "loss": 1.8118, + "step": 16284 + }, + { + "epoch": 4.998465316144874, + "grad_norm": 0.284805566072464, + "learning_rate": 5.245943427120859e-05, + "loss": 1.7968, + "step": 16285 + }, + { + "epoch": 4.9987722529158995, + "grad_norm": 0.3652406632900238, + "learning_rate": 5.245446971604751e-05, + "loss": 1.7785, + "step": 16286 + }, + { + "epoch": 4.999079189686924, + "grad_norm": 0.24879656732082367, + "learning_rate": 5.244950513662992e-05, + "loss": 1.734, + "step": 16287 + }, + { + "epoch": 4.999386126457949, + "grad_norm": 0.2374224215745926, + "learning_rate": 5.244454053300488e-05, + "loss": 1.7394, + "step": 16288 + }, + { + "epoch": 4.999693063228975, + "grad_norm": 0.27090463042259216, + "learning_rate": 5.243957590522147e-05, + "loss": 1.7529, + "step": 16289 + }, + { + "epoch": 5.0, + "grad_norm": 0.23060791194438934, + "learning_rate": 5.243461125332873e-05, + "loss": 1.7599, + "step": 16290 + }, + { + "epoch": 5.000306936771025, + "grad_norm": 0.21159487962722778, + "learning_rate": 5.242964657737572e-05, + "loss": 1.747, + "step": 16291 + }, + { + "epoch": 5.000613873542051, + "grad_norm": 0.21556304395198822, + "learning_rate": 5.242468187741154e-05, + "loss": 1.7653, + "step": 16292 + }, + { + "epoch": 5.000920810313075, + "grad_norm": 0.2569669783115387, + "learning_rate": 5.241971715348524e-05, + "loss": 1.7284, + "step": 16293 + }, + { + "epoch": 5.0012277470841005, + "grad_norm": 0.2827381491661072, + "learning_rate": 5.241475240564586e-05, + "loss": 1.7765, + "step": 16294 + }, + { + "epoch": 5.001534683855126, + "grad_norm": 
0.22498267889022827, + "learning_rate": 5.240978763394249e-05, + "loss": 1.729, + "step": 16295 + }, + { + "epoch": 5.001841620626151, + "grad_norm": 0.23975814878940582, + "learning_rate": 5.240482283842418e-05, + "loss": 1.7968, + "step": 16296 + }, + { + "epoch": 5.0021485573971765, + "grad_norm": 0.20811420679092407, + "learning_rate": 5.239985801914e-05, + "loss": 1.6931, + "step": 16297 + }, + { + "epoch": 5.002455494168202, + "grad_norm": 0.22985060513019562, + "learning_rate": 5.2394893176139014e-05, + "loss": 1.7724, + "step": 16298 + }, + { + "epoch": 5.002762430939226, + "grad_norm": 0.22867995500564575, + "learning_rate": 5.2389928309470305e-05, + "loss": 1.7179, + "step": 16299 + }, + { + "epoch": 5.003069367710252, + "grad_norm": 0.2543974220752716, + "learning_rate": 5.238496341918293e-05, + "loss": 1.7859, + "step": 16300 + }, + { + "epoch": 5.003376304481277, + "grad_norm": 0.226583793759346, + "learning_rate": 5.237999850532592e-05, + "loss": 1.7567, + "step": 16301 + }, + { + "epoch": 5.003683241252302, + "grad_norm": 0.21744728088378906, + "learning_rate": 5.237503356794838e-05, + "loss": 1.7345, + "step": 16302 + }, + { + "epoch": 5.003990178023328, + "grad_norm": 0.25915467739105225, + "learning_rate": 5.2370068607099373e-05, + "loss": 1.7179, + "step": 16303 + }, + { + "epoch": 5.004297114794352, + "grad_norm": 0.20572461187839508, + "learning_rate": 5.236510362282796e-05, + "loss": 1.7211, + "step": 16304 + }, + { + "epoch": 5.004604051565377, + "grad_norm": 0.2821461856365204, + "learning_rate": 5.236013861518321e-05, + "loss": 1.7894, + "step": 16305 + }, + { + "epoch": 5.004910988336403, + "grad_norm": 0.22273759543895721, + "learning_rate": 5.235517358421417e-05, + "loss": 1.7919, + "step": 16306 + }, + { + "epoch": 5.005217925107428, + "grad_norm": 0.23875468969345093, + "learning_rate": 5.2350208529969935e-05, + "loss": 1.7558, + "step": 16307 + }, + { + "epoch": 5.005524861878453, + "grad_norm": 0.24673783779144287, + "learning_rate": 
5.234524345249955e-05, + "loss": 1.7705, + "step": 16308 + }, + { + "epoch": 5.005831798649478, + "grad_norm": 0.21992872655391693, + "learning_rate": 5.234027835185211e-05, + "loss": 1.7059, + "step": 16309 + }, + { + "epoch": 5.006138735420503, + "grad_norm": 0.19214966893196106, + "learning_rate": 5.233531322807667e-05, + "loss": 1.6647, + "step": 16310 + }, + { + "epoch": 5.0064456721915285, + "grad_norm": 0.18525120615959167, + "learning_rate": 5.233034808122228e-05, + "loss": 1.719, + "step": 16311 + }, + { + "epoch": 5.006752608962554, + "grad_norm": 0.25996243953704834, + "learning_rate": 5.232538291133804e-05, + "loss": 1.7227, + "step": 16312 + }, + { + "epoch": 5.007059545733579, + "grad_norm": 0.2163757085800171, + "learning_rate": 5.232041771847299e-05, + "loss": 1.6962, + "step": 16313 + }, + { + "epoch": 5.0073664825046045, + "grad_norm": 0.23484158515930176, + "learning_rate": 5.231545250267621e-05, + "loss": 1.7816, + "step": 16314 + }, + { + "epoch": 5.007673419275629, + "grad_norm": 0.2188636213541031, + "learning_rate": 5.2310487263996776e-05, + "loss": 1.7477, + "step": 16315 + }, + { + "epoch": 5.007980356046654, + "grad_norm": 0.1950213611125946, + "learning_rate": 5.230552200248377e-05, + "loss": 1.7165, + "step": 16316 + }, + { + "epoch": 5.00828729281768, + "grad_norm": 0.25340089201927185, + "learning_rate": 5.230055671818623e-05, + "loss": 1.7764, + "step": 16317 + }, + { + "epoch": 5.008594229588705, + "grad_norm": 0.23749271035194397, + "learning_rate": 5.2295591411153245e-05, + "loss": 1.7193, + "step": 16318 + }, + { + "epoch": 5.00890116635973, + "grad_norm": 0.2317294180393219, + "learning_rate": 5.229062608143387e-05, + "loss": 1.7607, + "step": 16319 + }, + { + "epoch": 5.009208103130755, + "grad_norm": 0.2751505672931671, + "learning_rate": 5.228566072907719e-05, + "loss": 1.7562, + "step": 16320 + }, + { + "epoch": 5.00951503990178, + "grad_norm": 0.29476025700569153, + "learning_rate": 5.2280695354132267e-05, + "loss": 1.687, 
+ "step": 16321 + }, + { + "epoch": 5.009821976672805, + "grad_norm": 0.20734120905399323, + "learning_rate": 5.227572995664819e-05, + "loss": 1.7608, + "step": 16322 + }, + { + "epoch": 5.010128913443831, + "grad_norm": 0.2537878155708313, + "learning_rate": 5.227076453667401e-05, + "loss": 1.7947, + "step": 16323 + }, + { + "epoch": 5.010435850214856, + "grad_norm": 0.23516076803207397, + "learning_rate": 5.2265799094258796e-05, + "loss": 1.7545, + "step": 16324 + }, + { + "epoch": 5.0107427869858805, + "grad_norm": 0.2581529915332794, + "learning_rate": 5.226083362945162e-05, + "loss": 1.7529, + "step": 16325 + }, + { + "epoch": 5.011049723756906, + "grad_norm": 0.2982035279273987, + "learning_rate": 5.225586814230158e-05, + "loss": 1.74, + "step": 16326 + }, + { + "epoch": 5.011356660527931, + "grad_norm": 0.2773981988430023, + "learning_rate": 5.225090263285772e-05, + "loss": 1.7562, + "step": 16327 + }, + { + "epoch": 5.0116635972989565, + "grad_norm": 0.19992689788341522, + "learning_rate": 5.2245937101169116e-05, + "loss": 1.6896, + "step": 16328 + }, + { + "epoch": 5.011970534069982, + "grad_norm": 0.2913428246974945, + "learning_rate": 5.224097154728486e-05, + "loss": 1.7574, + "step": 16329 + }, + { + "epoch": 5.012277470841007, + "grad_norm": 0.23173104226589203, + "learning_rate": 5.2236005971254e-05, + "loss": 1.6954, + "step": 16330 + }, + { + "epoch": 5.012584407612032, + "grad_norm": 0.2019525170326233, + "learning_rate": 5.2231040373125614e-05, + "loss": 1.7711, + "step": 16331 + }, + { + "epoch": 5.012891344383057, + "grad_norm": 0.29070746898651123, + "learning_rate": 5.222607475294878e-05, + "loss": 1.8201, + "step": 16332 + }, + { + "epoch": 5.013198281154082, + "grad_norm": 0.22005079686641693, + "learning_rate": 5.222110911077258e-05, + "loss": 1.7421, + "step": 16333 + }, + { + "epoch": 5.013505217925108, + "grad_norm": 0.24422192573547363, + "learning_rate": 5.2216143446646085e-05, + "loss": 1.7074, + "step": 16334 + }, + { + "epoch": 
5.013812154696133, + "grad_norm": 0.2417927384376526, + "learning_rate": 5.221117776061836e-05, + "loss": 1.7726, + "step": 16335 + }, + { + "epoch": 5.014119091467157, + "grad_norm": 0.245828777551651, + "learning_rate": 5.2206212052738454e-05, + "loss": 1.7932, + "step": 16336 + }, + { + "epoch": 5.014426028238183, + "grad_norm": 0.24054239690303802, + "learning_rate": 5.220124632305548e-05, + "loss": 1.727, + "step": 16337 + }, + { + "epoch": 5.014732965009208, + "grad_norm": 0.2572494149208069, + "learning_rate": 5.21962805716185e-05, + "loss": 1.7234, + "step": 16338 + }, + { + "epoch": 5.015039901780233, + "grad_norm": 0.33624622225761414, + "learning_rate": 5.2191314798476595e-05, + "loss": 1.7499, + "step": 16339 + }, + { + "epoch": 5.015346838551259, + "grad_norm": 0.22321413457393646, + "learning_rate": 5.218634900367883e-05, + "loss": 1.7155, + "step": 16340 + }, + { + "epoch": 5.015653775322283, + "grad_norm": 0.26709917187690735, + "learning_rate": 5.218138318727429e-05, + "loss": 1.8346, + "step": 16341 + }, + { + "epoch": 5.0159607120933085, + "grad_norm": 0.27600952982902527, + "learning_rate": 5.217641734931202e-05, + "loss": 1.789, + "step": 16342 + }, + { + "epoch": 5.016267648864334, + "grad_norm": 0.21392405033111572, + "learning_rate": 5.217145148984114e-05, + "loss": 1.7266, + "step": 16343 + }, + { + "epoch": 5.016574585635359, + "grad_norm": 0.3215450942516327, + "learning_rate": 5.2166485608910696e-05, + "loss": 1.7453, + "step": 16344 + }, + { + "epoch": 5.0168815224063845, + "grad_norm": 0.22328032553195953, + "learning_rate": 5.2161519706569776e-05, + "loss": 1.7209, + "step": 16345 + }, + { + "epoch": 5.01718845917741, + "grad_norm": 0.2438887059688568, + "learning_rate": 5.215655378286744e-05, + "loss": 1.7289, + "step": 16346 + }, + { + "epoch": 5.017495395948434, + "grad_norm": 0.30078747868537903, + "learning_rate": 5.2151587837852786e-05, + "loss": 1.7483, + "step": 16347 + }, + { + "epoch": 5.01780233271946, + "grad_norm": 
0.21723167598247528, + "learning_rate": 5.214662187157488e-05, + "loss": 1.7654, + "step": 16348 + }, + { + "epoch": 5.018109269490485, + "grad_norm": 0.26358669996261597, + "learning_rate": 5.2141655884082784e-05, + "loss": 1.7563, + "step": 16349 + }, + { + "epoch": 5.01841620626151, + "grad_norm": 0.24285505712032318, + "learning_rate": 5.2136689875425615e-05, + "loss": 1.7377, + "step": 16350 + }, + { + "epoch": 5.018723143032536, + "grad_norm": 0.2401108294725418, + "learning_rate": 5.2131723845652416e-05, + "loss": 1.7445, + "step": 16351 + }, + { + "epoch": 5.01903007980356, + "grad_norm": 0.3347793519496918, + "learning_rate": 5.212675779481226e-05, + "loss": 1.7872, + "step": 16352 + }, + { + "epoch": 5.019337016574585, + "grad_norm": 0.306728720664978, + "learning_rate": 5.212179172295424e-05, + "loss": 1.8051, + "step": 16353 + }, + { + "epoch": 5.019643953345611, + "grad_norm": 0.22297725081443787, + "learning_rate": 5.211682563012743e-05, + "loss": 1.7082, + "step": 16354 + }, + { + "epoch": 5.019950890116636, + "grad_norm": 0.24047277867794037, + "learning_rate": 5.211185951638091e-05, + "loss": 1.7024, + "step": 16355 + }, + { + "epoch": 5.020257826887661, + "grad_norm": 0.19570080935955048, + "learning_rate": 5.210689338176377e-05, + "loss": 1.6947, + "step": 16356 + }, + { + "epoch": 5.020564763658686, + "grad_norm": 0.2024889886379242, + "learning_rate": 5.2101927226325066e-05, + "loss": 1.7168, + "step": 16357 + }, + { + "epoch": 5.020871700429711, + "grad_norm": 0.23546278476715088, + "learning_rate": 5.209696105011388e-05, + "loss": 1.7697, + "step": 16358 + }, + { + "epoch": 5.0211786372007365, + "grad_norm": 0.21003498136997223, + "learning_rate": 5.209199485317928e-05, + "loss": 1.7198, + "step": 16359 + }, + { + "epoch": 5.021485573971762, + "grad_norm": 0.21375493705272675, + "learning_rate": 5.208702863557039e-05, + "loss": 1.7689, + "step": 16360 + }, + { + "epoch": 5.021792510742787, + "grad_norm": 0.21549762785434723, + 
"learning_rate": 5.2082062397336254e-05, + "loss": 1.6936, + "step": 16361 + }, + { + "epoch": 5.0220994475138125, + "grad_norm": 0.22633691132068634, + "learning_rate": 5.207709613852595e-05, + "loss": 1.7512, + "step": 16362 + }, + { + "epoch": 5.022406384284837, + "grad_norm": 0.21888238191604614, + "learning_rate": 5.2072129859188566e-05, + "loss": 1.7082, + "step": 16363 + }, + { + "epoch": 5.022713321055862, + "grad_norm": 0.2416619062423706, + "learning_rate": 5.206716355937318e-05, + "loss": 1.7938, + "step": 16364 + }, + { + "epoch": 5.023020257826888, + "grad_norm": 0.22451527416706085, + "learning_rate": 5.206219723912886e-05, + "loss": 1.7372, + "step": 16365 + }, + { + "epoch": 5.023327194597913, + "grad_norm": 0.19698494672775269, + "learning_rate": 5.2057230898504716e-05, + "loss": 1.7205, + "step": 16366 + }, + { + "epoch": 5.023634131368938, + "grad_norm": 0.2441127747297287, + "learning_rate": 5.205226453754982e-05, + "loss": 1.7625, + "step": 16367 + }, + { + "epoch": 5.023941068139963, + "grad_norm": 0.21940121054649353, + "learning_rate": 5.204729815631323e-05, + "loss": 1.7985, + "step": 16368 + }, + { + "epoch": 5.024248004910988, + "grad_norm": 0.21751399338245392, + "learning_rate": 5.204233175484403e-05, + "loss": 1.7759, + "step": 16369 + }, + { + "epoch": 5.024554941682013, + "grad_norm": 0.20261377096176147, + "learning_rate": 5.2037365333191315e-05, + "loss": 1.746, + "step": 16370 + }, + { + "epoch": 5.024861878453039, + "grad_norm": 0.2628774046897888, + "learning_rate": 5.2032398891404166e-05, + "loss": 1.8178, + "step": 16371 + }, + { + "epoch": 5.025168815224064, + "grad_norm": 0.20626378059387207, + "learning_rate": 5.2027432429531665e-05, + "loss": 1.7456, + "step": 16372 + }, + { + "epoch": 5.0254757519950894, + "grad_norm": 0.25548869371414185, + "learning_rate": 5.2022465947622876e-05, + "loss": 1.8098, + "step": 16373 + }, + { + "epoch": 5.025782688766114, + "grad_norm": 0.1978374719619751, + "learning_rate": 
5.20174994457269e-05, + "loss": 1.685, + "step": 16374 + }, + { + "epoch": 5.026089625537139, + "grad_norm": 0.2708980143070221, + "learning_rate": 5.201253292389282e-05, + "loss": 1.7464, + "step": 16375 + }, + { + "epoch": 5.026396562308165, + "grad_norm": 0.2730494737625122, + "learning_rate": 5.2007566382169706e-05, + "loss": 1.7391, + "step": 16376 + }, + { + "epoch": 5.02670349907919, + "grad_norm": 0.243557408452034, + "learning_rate": 5.2002599820606624e-05, + "loss": 1.7439, + "step": 16377 + }, + { + "epoch": 5.027010435850215, + "grad_norm": 0.2208259105682373, + "learning_rate": 5.19976332392527e-05, + "loss": 1.7612, + "step": 16378 + }, + { + "epoch": 5.02731737262124, + "grad_norm": 0.21288715302944183, + "learning_rate": 5.199266663815698e-05, + "loss": 1.7546, + "step": 16379 + }, + { + "epoch": 5.027624309392265, + "grad_norm": 0.2106054425239563, + "learning_rate": 5.198770001736857e-05, + "loss": 1.7281, + "step": 16380 + }, + { + "epoch": 5.02793124616329, + "grad_norm": 0.2247164249420166, + "learning_rate": 5.198273337693654e-05, + "loss": 1.8405, + "step": 16381 + }, + { + "epoch": 5.028238182934316, + "grad_norm": 0.21713724732398987, + "learning_rate": 5.197776671690998e-05, + "loss": 1.7333, + "step": 16382 + }, + { + "epoch": 5.028545119705341, + "grad_norm": 0.24063727259635925, + "learning_rate": 5.1972800037337956e-05, + "loss": 1.7608, + "step": 16383 + }, + { + "epoch": 5.0288520564763655, + "grad_norm": 0.22022177278995514, + "learning_rate": 5.196783333826959e-05, + "loss": 1.7045, + "step": 16384 + }, + { + "epoch": 5.029158993247391, + "grad_norm": 0.21348948776721954, + "learning_rate": 5.1962866619753927e-05, + "loss": 1.7516, + "step": 16385 + }, + { + "epoch": 5.029465930018416, + "grad_norm": 0.289315789937973, + "learning_rate": 5.195789988184007e-05, + "loss": 1.8555, + "step": 16386 + }, + { + "epoch": 5.0297728667894415, + "grad_norm": 0.30966848134994507, + "learning_rate": 5.19529331245771e-05, + "loss": 1.7245, + 
"step": 16387 + }, + { + "epoch": 5.030079803560467, + "grad_norm": 0.24625633656978607, + "learning_rate": 5.194796634801409e-05, + "loss": 1.7788, + "step": 16388 + }, + { + "epoch": 5.030386740331492, + "grad_norm": 0.25937986373901367, + "learning_rate": 5.1942999552200136e-05, + "loss": 1.7655, + "step": 16389 + }, + { + "epoch": 5.030693677102517, + "grad_norm": 0.3056741952896118, + "learning_rate": 5.1938032737184325e-05, + "loss": 1.7167, + "step": 16390 + }, + { + "epoch": 5.031000613873542, + "grad_norm": 0.29773563146591187, + "learning_rate": 5.1933065903015743e-05, + "loss": 1.7247, + "step": 16391 + }, + { + "epoch": 5.031307550644567, + "grad_norm": 0.26433971524238586, + "learning_rate": 5.192809904974347e-05, + "loss": 1.7779, + "step": 16392 + }, + { + "epoch": 5.031614487415593, + "grad_norm": 0.3308073580265045, + "learning_rate": 5.192313217741659e-05, + "loss": 1.7782, + "step": 16393 + }, + { + "epoch": 5.031921424186618, + "grad_norm": 0.2584165632724762, + "learning_rate": 5.1918165286084176e-05, + "loss": 1.7812, + "step": 16394 + }, + { + "epoch": 5.032228360957642, + "grad_norm": 0.31678953766822815, + "learning_rate": 5.1913198375795346e-05, + "loss": 1.7341, + "step": 16395 + }, + { + "epoch": 5.032535297728668, + "grad_norm": 0.3527325391769409, + "learning_rate": 5.190823144659916e-05, + "loss": 1.7844, + "step": 16396 + }, + { + "epoch": 5.032842234499693, + "grad_norm": 0.29233935475349426, + "learning_rate": 5.1903264498544724e-05, + "loss": 1.7993, + "step": 16397 + }, + { + "epoch": 5.033149171270718, + "grad_norm": 0.24549467861652374, + "learning_rate": 5.1898297531681106e-05, + "loss": 1.7294, + "step": 16398 + }, + { + "epoch": 5.033456108041744, + "grad_norm": 0.3446930944919586, + "learning_rate": 5.18933305460574e-05, + "loss": 1.6818, + "step": 16399 + }, + { + "epoch": 5.033763044812768, + "grad_norm": 0.2628229856491089, + "learning_rate": 5.188836354172268e-05, + "loss": 1.7867, + "step": 16400 + }, + { + "epoch": 
5.0340699815837935, + "grad_norm": 0.26548629999160767, + "learning_rate": 5.188339651872607e-05, + "loss": 1.7448, + "step": 16401 + }, + { + "epoch": 5.034376918354819, + "grad_norm": 0.29242032766342163, + "learning_rate": 5.187842947711662e-05, + "loss": 1.7103, + "step": 16402 + }, + { + "epoch": 5.034683855125844, + "grad_norm": 0.2515408992767334, + "learning_rate": 5.187346241694343e-05, + "loss": 1.7865, + "step": 16403 + }, + { + "epoch": 5.0349907918968695, + "grad_norm": 0.2253103256225586, + "learning_rate": 5.186849533825559e-05, + "loss": 1.6993, + "step": 16404 + }, + { + "epoch": 5.035297728667895, + "grad_norm": 0.2743360102176666, + "learning_rate": 5.1863528241102154e-05, + "loss": 1.7532, + "step": 16405 + }, + { + "epoch": 5.035604665438919, + "grad_norm": 0.22807851433753967, + "learning_rate": 5.185856112553227e-05, + "loss": 1.7873, + "step": 16406 + }, + { + "epoch": 5.035911602209945, + "grad_norm": 0.23719090223312378, + "learning_rate": 5.1853593991594985e-05, + "loss": 1.7555, + "step": 16407 + }, + { + "epoch": 5.03621853898097, + "grad_norm": 0.2964477241039276, + "learning_rate": 5.184862683933941e-05, + "loss": 1.7204, + "step": 16408 + }, + { + "epoch": 5.036525475751995, + "grad_norm": 0.23717865347862244, + "learning_rate": 5.18436596688146e-05, + "loss": 1.7239, + "step": 16409 + }, + { + "epoch": 5.036832412523021, + "grad_norm": 0.22650085389614105, + "learning_rate": 5.1838692480069686e-05, + "loss": 1.7148, + "step": 16410 + }, + { + "epoch": 5.037139349294045, + "grad_norm": 0.25606781244277954, + "learning_rate": 5.183372527315371e-05, + "loss": 1.7916, + "step": 16411 + }, + { + "epoch": 5.03744628606507, + "grad_norm": 0.22266390919685364, + "learning_rate": 5.182875804811581e-05, + "loss": 1.7481, + "step": 16412 + }, + { + "epoch": 5.037753222836096, + "grad_norm": 0.23481780290603638, + "learning_rate": 5.1823790805005045e-05, + "loss": 1.8014, + "step": 16413 + }, + { + "epoch": 5.038060159607121, + "grad_norm": 
0.2629338800907135, + "learning_rate": 5.1818823543870506e-05, + "loss": 1.81, + "step": 16414 + }, + { + "epoch": 5.038367096378146, + "grad_norm": 0.22891482710838318, + "learning_rate": 5.18138562647613e-05, + "loss": 1.757, + "step": 16415 + }, + { + "epoch": 5.038674033149171, + "grad_norm": 0.2666641175746918, + "learning_rate": 5.180888896772649e-05, + "loss": 1.7457, + "step": 16416 + }, + { + "epoch": 5.038980969920196, + "grad_norm": 0.37610310316085815, + "learning_rate": 5.180392165281517e-05, + "loss": 1.8214, + "step": 16417 + }, + { + "epoch": 5.0392879066912215, + "grad_norm": 0.2521277964115143, + "learning_rate": 5.1798954320076455e-05, + "loss": 1.7731, + "step": 16418 + }, + { + "epoch": 5.039594843462247, + "grad_norm": 0.25097090005874634, + "learning_rate": 5.1793986969559415e-05, + "loss": 1.8029, + "step": 16419 + }, + { + "epoch": 5.039901780233272, + "grad_norm": 0.2946726381778717, + "learning_rate": 5.178901960131315e-05, + "loss": 1.7483, + "step": 16420 + }, + { + "epoch": 5.0402087170042975, + "grad_norm": 0.24240419268608093, + "learning_rate": 5.1784052215386736e-05, + "loss": 1.731, + "step": 16421 + }, + { + "epoch": 5.040515653775322, + "grad_norm": 0.2403198480606079, + "learning_rate": 5.177908481182926e-05, + "loss": 1.722, + "step": 16422 + }, + { + "epoch": 5.040822590546347, + "grad_norm": 0.3451874554157257, + "learning_rate": 5.177411739068985e-05, + "loss": 1.7562, + "step": 16423 + }, + { + "epoch": 5.041129527317373, + "grad_norm": 0.3244951069355011, + "learning_rate": 5.176914995201756e-05, + "loss": 1.7321, + "step": 16424 + }, + { + "epoch": 5.041436464088398, + "grad_norm": 0.2346230000257492, + "learning_rate": 5.176418249586149e-05, + "loss": 1.7839, + "step": 16425 + }, + { + "epoch": 5.041743400859423, + "grad_norm": 0.357022225856781, + "learning_rate": 5.1759215022270744e-05, + "loss": 1.7776, + "step": 16426 + }, + { + "epoch": 5.042050337630448, + "grad_norm": 0.259007066488266, + "learning_rate": 
5.17542475312944e-05, + "loss": 1.7544, + "step": 16427 + }, + { + "epoch": 5.042357274401473, + "grad_norm": 0.2516533136367798, + "learning_rate": 5.174928002298154e-05, + "loss": 1.7269, + "step": 16428 + }, + { + "epoch": 5.042664211172498, + "grad_norm": 0.3393619954586029, + "learning_rate": 5.174431249738129e-05, + "loss": 1.7487, + "step": 16429 + }, + { + "epoch": 5.042971147943524, + "grad_norm": 0.2730594873428345, + "learning_rate": 5.1739344954542714e-05, + "loss": 1.7468, + "step": 16430 + }, + { + "epoch": 5.043278084714549, + "grad_norm": 0.21233965456485748, + "learning_rate": 5.1734377394514914e-05, + "loss": 1.783, + "step": 16431 + }, + { + "epoch": 5.043585021485574, + "grad_norm": 0.3460896909236908, + "learning_rate": 5.1729409817346974e-05, + "loss": 1.7497, + "step": 16432 + }, + { + "epoch": 5.043891958256599, + "grad_norm": 0.31918221712112427, + "learning_rate": 5.1724442223088e-05, + "loss": 1.7834, + "step": 16433 + }, + { + "epoch": 5.044198895027624, + "grad_norm": 0.23016802966594696, + "learning_rate": 5.171947461178706e-05, + "loss": 1.7348, + "step": 16434 + }, + { + "epoch": 5.0445058317986495, + "grad_norm": 0.35758304595947266, + "learning_rate": 5.171450698349329e-05, + "loss": 1.7734, + "step": 16435 + }, + { + "epoch": 5.044812768569675, + "grad_norm": 0.279725581407547, + "learning_rate": 5.170953933825574e-05, + "loss": 1.7283, + "step": 16436 + }, + { + "epoch": 5.0451197053407, + "grad_norm": 0.23965120315551758, + "learning_rate": 5.170457167612354e-05, + "loss": 1.7606, + "step": 16437 + }, + { + "epoch": 5.045426642111725, + "grad_norm": 0.28026309609413147, + "learning_rate": 5.169960399714574e-05, + "loss": 1.7872, + "step": 16438 + }, + { + "epoch": 5.04573357888275, + "grad_norm": 0.3262448012828827, + "learning_rate": 5.169463630137146e-05, + "loss": 1.8654, + "step": 16439 + }, + { + "epoch": 5.046040515653775, + "grad_norm": 0.4249584674835205, + "learning_rate": 5.168966858884979e-05, + "loss": 1.7244, + 
"step": 16440 + }, + { + "epoch": 5.046347452424801, + "grad_norm": 0.3385370969772339, + "learning_rate": 5.168470085962984e-05, + "loss": 1.7745, + "step": 16441 + }, + { + "epoch": 5.046654389195826, + "grad_norm": 0.2321811318397522, + "learning_rate": 5.1679733113760675e-05, + "loss": 1.8093, + "step": 16442 + }, + { + "epoch": 5.04696132596685, + "grad_norm": 0.3426755368709564, + "learning_rate": 5.167476535129141e-05, + "loss": 1.7752, + "step": 16443 + }, + { + "epoch": 5.047268262737876, + "grad_norm": 0.27672505378723145, + "learning_rate": 5.166979757227114e-05, + "loss": 1.7619, + "step": 16444 + }, + { + "epoch": 5.047575199508901, + "grad_norm": 0.4111184775829315, + "learning_rate": 5.1664829776748925e-05, + "loss": 1.7672, + "step": 16445 + }, + { + "epoch": 5.047882136279926, + "grad_norm": 0.40139874815940857, + "learning_rate": 5.1659861964773905e-05, + "loss": 1.7753, + "step": 16446 + }, + { + "epoch": 5.048189073050952, + "grad_norm": 0.28931725025177, + "learning_rate": 5.165489413639516e-05, + "loss": 1.7607, + "step": 16447 + }, + { + "epoch": 5.048496009821977, + "grad_norm": 0.297538161277771, + "learning_rate": 5.1649926291661775e-05, + "loss": 1.7661, + "step": 16448 + }, + { + "epoch": 5.0488029465930016, + "grad_norm": 0.4299027621746063, + "learning_rate": 5.1644958430622846e-05, + "loss": 1.6998, + "step": 16449 + }, + { + "epoch": 5.049109883364027, + "grad_norm": 0.2554767429828644, + "learning_rate": 5.163999055332749e-05, + "loss": 1.7716, + "step": 16450 + }, + { + "epoch": 5.049416820135052, + "grad_norm": 0.3561006486415863, + "learning_rate": 5.163502265982477e-05, + "loss": 1.7493, + "step": 16451 + }, + { + "epoch": 5.0497237569060776, + "grad_norm": 0.3839687407016754, + "learning_rate": 5.1630054750163806e-05, + "loss": 1.7314, + "step": 16452 + }, + { + "epoch": 5.050030693677103, + "grad_norm": 0.20022284984588623, + "learning_rate": 5.1625086824393684e-05, + "loss": 1.6992, + "step": 16453 + }, + { + "epoch": 
5.050337630448127, + "grad_norm": 0.36830398440361023, + "learning_rate": 5.162011888256349e-05, + "loss": 1.7339, + "step": 16454 + }, + { + "epoch": 5.050644567219153, + "grad_norm": 0.31947389245033264, + "learning_rate": 5.161515092472236e-05, + "loss": 1.7254, + "step": 16455 + }, + { + "epoch": 5.050951503990178, + "grad_norm": 0.2779252827167511, + "learning_rate": 5.161018295091933e-05, + "loss": 1.7941, + "step": 16456 + }, + { + "epoch": 5.051258440761203, + "grad_norm": 0.3796578347682953, + "learning_rate": 5.160521496120354e-05, + "loss": 1.7389, + "step": 16457 + }, + { + "epoch": 5.051565377532229, + "grad_norm": 0.23569442331790924, + "learning_rate": 5.1600246955624076e-05, + "loss": 1.7149, + "step": 16458 + }, + { + "epoch": 5.051872314303253, + "grad_norm": 0.27342507243156433, + "learning_rate": 5.159527893423004e-05, + "loss": 1.699, + "step": 16459 + }, + { + "epoch": 5.0521792510742785, + "grad_norm": 0.2877296209335327, + "learning_rate": 5.159031089707052e-05, + "loss": 1.7668, + "step": 16460 + }, + { + "epoch": 5.052486187845304, + "grad_norm": 0.21482446789741516, + "learning_rate": 5.1585342844194605e-05, + "loss": 1.7132, + "step": 16461 + }, + { + "epoch": 5.052793124616329, + "grad_norm": 0.23588669300079346, + "learning_rate": 5.158037477565142e-05, + "loss": 1.7267, + "step": 16462 + }, + { + "epoch": 5.0531000613873545, + "grad_norm": 0.20188623666763306, + "learning_rate": 5.157540669149003e-05, + "loss": 1.7486, + "step": 16463 + }, + { + "epoch": 5.05340699815838, + "grad_norm": 0.2012643963098526, + "learning_rate": 5.157043859175955e-05, + "loss": 1.718, + "step": 16464 + }, + { + "epoch": 5.053713934929404, + "grad_norm": 0.23133818805217743, + "learning_rate": 5.156547047650908e-05, + "loss": 1.7892, + "step": 16465 + }, + { + "epoch": 5.05402087170043, + "grad_norm": 0.2524542510509491, + "learning_rate": 5.156050234578771e-05, + "loss": 1.8034, + "step": 16466 + }, + { + "epoch": 5.054327808471455, + "grad_norm": 
0.20992529392242432, + "learning_rate": 5.155553419964454e-05, + "loss": 1.7158, + "step": 16467 + }, + { + "epoch": 5.05463474524248, + "grad_norm": 0.23815447092056274, + "learning_rate": 5.155056603812868e-05, + "loss": 1.7632, + "step": 16468 + }, + { + "epoch": 5.054941682013506, + "grad_norm": 0.3306051790714264, + "learning_rate": 5.1545597861289205e-05, + "loss": 1.7719, + "step": 16469 + }, + { + "epoch": 5.05524861878453, + "grad_norm": 0.287541925907135, + "learning_rate": 5.154062966917523e-05, + "loss": 1.7092, + "step": 16470 + }, + { + "epoch": 5.055555555555555, + "grad_norm": 0.28186658024787903, + "learning_rate": 5.153566146183586e-05, + "loss": 1.8548, + "step": 16471 + }, + { + "epoch": 5.055862492326581, + "grad_norm": 0.3511136472225189, + "learning_rate": 5.153069323932017e-05, + "loss": 1.8029, + "step": 16472 + }, + { + "epoch": 5.056169429097606, + "grad_norm": 0.32083824276924133, + "learning_rate": 5.152572500167728e-05, + "loss": 1.7321, + "step": 16473 + }, + { + "epoch": 5.056476365868631, + "grad_norm": 0.22571051120758057, + "learning_rate": 5.1520756748956265e-05, + "loss": 1.7218, + "step": 16474 + }, + { + "epoch": 5.056783302639656, + "grad_norm": 0.2902646064758301, + "learning_rate": 5.151578848120626e-05, + "loss": 1.7231, + "step": 16475 + }, + { + "epoch": 5.057090239410681, + "grad_norm": 0.20447610318660736, + "learning_rate": 5.1510820198476336e-05, + "loss": 1.6998, + "step": 16476 + }, + { + "epoch": 5.0573971761817065, + "grad_norm": 0.29436638951301575, + "learning_rate": 5.1505851900815606e-05, + "loss": 1.6793, + "step": 16477 + }, + { + "epoch": 5.057704112952732, + "grad_norm": 0.29718565940856934, + "learning_rate": 5.1500883588273164e-05, + "loss": 1.8322, + "step": 16478 + }, + { + "epoch": 5.058011049723757, + "grad_norm": 0.23530519008636475, + "learning_rate": 5.149591526089811e-05, + "loss": 1.7408, + "step": 16479 + }, + { + "epoch": 5.0583179864947825, + "grad_norm": 0.30735042691230774, + 
"learning_rate": 5.1490946918739536e-05, + "loss": 1.7454, + "step": 16480 + }, + { + "epoch": 5.058624923265807, + "grad_norm": 0.26151445508003235, + "learning_rate": 5.148597856184656e-05, + "loss": 1.7728, + "step": 16481 + }, + { + "epoch": 5.058931860036832, + "grad_norm": 0.2657756209373474, + "learning_rate": 5.1481010190268263e-05, + "loss": 1.7905, + "step": 16482 + }, + { + "epoch": 5.059238796807858, + "grad_norm": 0.25418251752853394, + "learning_rate": 5.147604180405376e-05, + "loss": 1.7676, + "step": 16483 + }, + { + "epoch": 5.059545733578883, + "grad_norm": 0.25486254692077637, + "learning_rate": 5.1471073403252154e-05, + "loss": 1.8347, + "step": 16484 + }, + { + "epoch": 5.059852670349908, + "grad_norm": 0.22693100571632385, + "learning_rate": 5.146610498791255e-05, + "loss": 1.7308, + "step": 16485 + }, + { + "epoch": 5.060159607120933, + "grad_norm": 0.22056837379932404, + "learning_rate": 5.146113655808401e-05, + "loss": 1.7158, + "step": 16486 + }, + { + "epoch": 5.060466543891958, + "grad_norm": 0.221246138215065, + "learning_rate": 5.1456168113815685e-05, + "loss": 1.6985, + "step": 16487 + }, + { + "epoch": 5.060773480662983, + "grad_norm": 0.2149408906698227, + "learning_rate": 5.145119965515664e-05, + "loss": 1.716, + "step": 16488 + }, + { + "epoch": 5.061080417434009, + "grad_norm": 0.23958513140678406, + "learning_rate": 5.144623118215599e-05, + "loss": 1.8092, + "step": 16489 + }, + { + "epoch": 5.061387354205034, + "grad_norm": 0.2870621085166931, + "learning_rate": 5.1441262694862836e-05, + "loss": 1.75, + "step": 16490 + }, + { + "epoch": 5.0616942909760585, + "grad_norm": 0.26755061745643616, + "learning_rate": 5.1436294193326276e-05, + "loss": 1.7848, + "step": 16491 + }, + { + "epoch": 5.062001227747084, + "grad_norm": 0.2434249073266983, + "learning_rate": 5.143132567759542e-05, + "loss": 1.7487, + "step": 16492 + }, + { + "epoch": 5.062308164518109, + "grad_norm": 0.3044668138027191, + "learning_rate": 5.142635714771936e-05, 
+ "loss": 1.741, + "step": 16493 + }, + { + "epoch": 5.0626151012891345, + "grad_norm": 0.2166958749294281, + "learning_rate": 5.142138860374721e-05, + "loss": 1.7232, + "step": 16494 + }, + { + "epoch": 5.06292203806016, + "grad_norm": 0.34558552503585815, + "learning_rate": 5.141642004572806e-05, + "loss": 1.7663, + "step": 16495 + }, + { + "epoch": 5.063228974831185, + "grad_norm": 0.330751895904541, + "learning_rate": 5.141145147371102e-05, + "loss": 1.6818, + "step": 16496 + }, + { + "epoch": 5.06353591160221, + "grad_norm": 0.21613973379135132, + "learning_rate": 5.140648288774518e-05, + "loss": 1.7914, + "step": 16497 + }, + { + "epoch": 5.063842848373235, + "grad_norm": 0.32759732007980347, + "learning_rate": 5.140151428787966e-05, + "loss": 1.7543, + "step": 16498 + }, + { + "epoch": 5.06414978514426, + "grad_norm": 0.3180293142795563, + "learning_rate": 5.1396545674163556e-05, + "loss": 1.8163, + "step": 16499 + }, + { + "epoch": 5.064456721915286, + "grad_norm": 0.19757944345474243, + "learning_rate": 5.1391577046645964e-05, + "loss": 1.71, + "step": 16500 + }, + { + "epoch": 5.064763658686311, + "grad_norm": 0.253366619348526, + "learning_rate": 5.1386608405376005e-05, + "loss": 1.7266, + "step": 16501 + }, + { + "epoch": 5.065070595457335, + "grad_norm": 0.24577608704566956, + "learning_rate": 5.1381639750402754e-05, + "loss": 1.7218, + "step": 16502 + }, + { + "epoch": 5.065377532228361, + "grad_norm": 0.22847014665603638, + "learning_rate": 5.137667108177533e-05, + "loss": 1.8025, + "step": 16503 + }, + { + "epoch": 5.065684468999386, + "grad_norm": 0.2089833766222, + "learning_rate": 5.137170239954284e-05, + "loss": 1.8032, + "step": 16504 + }, + { + "epoch": 5.065991405770411, + "grad_norm": 0.21528512239456177, + "learning_rate": 5.136673370375439e-05, + "loss": 1.7227, + "step": 16505 + }, + { + "epoch": 5.066298342541437, + "grad_norm": 0.2099117785692215, + "learning_rate": 5.1361764994459074e-05, + "loss": 1.7176, + "step": 16506 + }, + { + 
"epoch": 5.066605279312462, + "grad_norm": 0.2140430212020874, + "learning_rate": 5.135679627170599e-05, + "loss": 1.8195, + "step": 16507 + }, + { + "epoch": 5.0669122160834865, + "grad_norm": 0.20253533124923706, + "learning_rate": 5.135182753554424e-05, + "loss": 1.7284, + "step": 16508 + }, + { + "epoch": 5.067219152854512, + "grad_norm": 0.19945639371871948, + "learning_rate": 5.134685878602295e-05, + "loss": 1.6915, + "step": 16509 + }, + { + "epoch": 5.067526089625537, + "grad_norm": 0.20138494670391083, + "learning_rate": 5.1341890023191216e-05, + "loss": 1.7856, + "step": 16510 + }, + { + "epoch": 5.0678330263965625, + "grad_norm": 0.22124232351779938, + "learning_rate": 5.1336921247098136e-05, + "loss": 1.7674, + "step": 16511 + }, + { + "epoch": 5.068139963167588, + "grad_norm": 0.21564216911792755, + "learning_rate": 5.133195245779282e-05, + "loss": 1.6998, + "step": 16512 + }, + { + "epoch": 5.068446899938612, + "grad_norm": 0.21836799383163452, + "learning_rate": 5.1326983655324365e-05, + "loss": 1.7468, + "step": 16513 + }, + { + "epoch": 5.068753836709638, + "grad_norm": 0.2412201464176178, + "learning_rate": 5.132201483974187e-05, + "loss": 1.7433, + "step": 16514 + }, + { + "epoch": 5.069060773480663, + "grad_norm": 0.262054979801178, + "learning_rate": 5.131704601109446e-05, + "loss": 1.8315, + "step": 16515 + }, + { + "epoch": 5.069367710251688, + "grad_norm": 0.21573080122470856, + "learning_rate": 5.1312077169431225e-05, + "loss": 1.7668, + "step": 16516 + }, + { + "epoch": 5.069674647022714, + "grad_norm": 0.21407057344913483, + "learning_rate": 5.130710831480129e-05, + "loss": 1.7486, + "step": 16517 + }, + { + "epoch": 5.069981583793738, + "grad_norm": 0.2128407508134842, + "learning_rate": 5.130213944725373e-05, + "loss": 1.7618, + "step": 16518 + }, + { + "epoch": 5.070288520564763, + "grad_norm": 0.2034141719341278, + "learning_rate": 5.129717056683767e-05, + "loss": 1.726, + "step": 16519 + }, + { + "epoch": 5.070595457335789, + 
"grad_norm": 0.21474458277225494, + "learning_rate": 5.1292201673602205e-05, + "loss": 1.7883, + "step": 16520 + }, + { + "epoch": 5.070902394106814, + "grad_norm": 0.2102673202753067, + "learning_rate": 5.128723276759645e-05, + "loss": 1.7826, + "step": 16521 + }, + { + "epoch": 5.071209330877839, + "grad_norm": 0.21342496573925018, + "learning_rate": 5.1282263848869505e-05, + "loss": 1.7561, + "step": 16522 + }, + { + "epoch": 5.071516267648865, + "grad_norm": 0.21749620139598846, + "learning_rate": 5.1277294917470474e-05, + "loss": 1.7814, + "step": 16523 + }, + { + "epoch": 5.071823204419889, + "grad_norm": 0.20006774365901947, + "learning_rate": 5.1272325973448476e-05, + "loss": 1.6965, + "step": 16524 + }, + { + "epoch": 5.0721301411909145, + "grad_norm": 0.20878590643405914, + "learning_rate": 5.1267357016852593e-05, + "loss": 1.7426, + "step": 16525 + }, + { + "epoch": 5.07243707796194, + "grad_norm": 0.21824820339679718, + "learning_rate": 5.1262388047731946e-05, + "loss": 1.7704, + "step": 16526 + }, + { + "epoch": 5.072744014732965, + "grad_norm": 0.1992526650428772, + "learning_rate": 5.125741906613565e-05, + "loss": 1.7874, + "step": 16527 + }, + { + "epoch": 5.0730509515039905, + "grad_norm": 0.21028028428554535, + "learning_rate": 5.12524500721128e-05, + "loss": 1.7483, + "step": 16528 + }, + { + "epoch": 5.073357888275015, + "grad_norm": 0.21840833127498627, + "learning_rate": 5.12474810657125e-05, + "loss": 1.7763, + "step": 16529 + }, + { + "epoch": 5.07366482504604, + "grad_norm": 0.249269038438797, + "learning_rate": 5.124251204698387e-05, + "loss": 1.7451, + "step": 16530 + }, + { + "epoch": 5.073971761817066, + "grad_norm": 0.2176963835954666, + "learning_rate": 5.1237543015975986e-05, + "loss": 1.7079, + "step": 16531 + }, + { + "epoch": 5.074278698588091, + "grad_norm": 0.20284616947174072, + "learning_rate": 5.1232573972738e-05, + "loss": 1.7235, + "step": 16532 + }, + { + "epoch": 5.074585635359116, + "grad_norm": 0.20140530169010162, + 
"learning_rate": 5.1227604917318984e-05, + "loss": 1.7014, + "step": 16533 + }, + { + "epoch": 5.074892572130141, + "grad_norm": 0.2407023161649704, + "learning_rate": 5.1222635849768066e-05, + "loss": 1.7493, + "step": 16534 + }, + { + "epoch": 5.075199508901166, + "grad_norm": 0.2013770490884781, + "learning_rate": 5.121766677013433e-05, + "loss": 1.7601, + "step": 16535 + }, + { + "epoch": 5.0755064456721914, + "grad_norm": 0.23889221251010895, + "learning_rate": 5.1212697678466916e-05, + "loss": 1.7282, + "step": 16536 + }, + { + "epoch": 5.075813382443217, + "grad_norm": 0.2411198765039444, + "learning_rate": 5.120772857481489e-05, + "loss": 1.8138, + "step": 16537 + }, + { + "epoch": 5.076120319214242, + "grad_norm": 0.24521365761756897, + "learning_rate": 5.12027594592274e-05, + "loss": 1.7659, + "step": 16538 + }, + { + "epoch": 5.0764272559852675, + "grad_norm": 0.2841372787952423, + "learning_rate": 5.119779033175354e-05, + "loss": 1.7973, + "step": 16539 + }, + { + "epoch": 5.076734192756292, + "grad_norm": 0.21796928346157074, + "learning_rate": 5.1192821192442395e-05, + "loss": 1.6985, + "step": 16540 + }, + { + "epoch": 5.077041129527317, + "grad_norm": 0.2244848757982254, + "learning_rate": 5.118785204134311e-05, + "loss": 1.7413, + "step": 16541 + }, + { + "epoch": 5.077348066298343, + "grad_norm": 0.22581063210964203, + "learning_rate": 5.1182882878504766e-05, + "loss": 1.7706, + "step": 16542 + }, + { + "epoch": 5.077655003069368, + "grad_norm": 0.24478016793727875, + "learning_rate": 5.117791370397647e-05, + "loss": 1.7628, + "step": 16543 + }, + { + "epoch": 5.077961939840393, + "grad_norm": 0.31270188093185425, + "learning_rate": 5.117294451780734e-05, + "loss": 1.8254, + "step": 16544 + }, + { + "epoch": 5.078268876611418, + "grad_norm": 0.3547368049621582, + "learning_rate": 5.11679753200465e-05, + "loss": 1.781, + "step": 16545 + }, + { + "epoch": 5.078575813382443, + "grad_norm": 0.24920180439949036, + "learning_rate": 
5.116300611074304e-05, + "loss": 1.7748, + "step": 16546 + }, + { + "epoch": 5.078882750153468, + "grad_norm": 0.2368776649236679, + "learning_rate": 5.115803688994607e-05, + "loss": 1.7459, + "step": 16547 + }, + { + "epoch": 5.079189686924494, + "grad_norm": 0.28341975808143616, + "learning_rate": 5.115306765770471e-05, + "loss": 1.6694, + "step": 16548 + }, + { + "epoch": 5.079496623695519, + "grad_norm": 0.2521432936191559, + "learning_rate": 5.114809841406804e-05, + "loss": 1.7544, + "step": 16549 + }, + { + "epoch": 5.0798035604665435, + "grad_norm": 0.21199844777584076, + "learning_rate": 5.11431291590852e-05, + "loss": 1.7215, + "step": 16550 + }, + { + "epoch": 5.080110497237569, + "grad_norm": 0.25157347321510315, + "learning_rate": 5.113815989280528e-05, + "loss": 1.8021, + "step": 16551 + }, + { + "epoch": 5.080417434008594, + "grad_norm": 0.2284129559993744, + "learning_rate": 5.1133190615277414e-05, + "loss": 1.7125, + "step": 16552 + }, + { + "epoch": 5.0807243707796195, + "grad_norm": 0.2297726720571518, + "learning_rate": 5.11282213265507e-05, + "loss": 1.7602, + "step": 16553 + }, + { + "epoch": 5.081031307550645, + "grad_norm": 0.22392617166042328, + "learning_rate": 5.112325202667421e-05, + "loss": 1.7251, + "step": 16554 + }, + { + "epoch": 5.08133824432167, + "grad_norm": 0.22406147420406342, + "learning_rate": 5.11182827156971e-05, + "loss": 1.7232, + "step": 16555 + }, + { + "epoch": 5.081645181092695, + "grad_norm": 0.2547284960746765, + "learning_rate": 5.111331339366846e-05, + "loss": 1.7335, + "step": 16556 + }, + { + "epoch": 5.08195211786372, + "grad_norm": 0.216146782040596, + "learning_rate": 5.1108344060637415e-05, + "loss": 1.7469, + "step": 16557 + }, + { + "epoch": 5.082259054634745, + "grad_norm": 0.1926967352628708, + "learning_rate": 5.110337471665306e-05, + "loss": 1.7492, + "step": 16558 + }, + { + "epoch": 5.082565991405771, + "grad_norm": 0.30311331152915955, + "learning_rate": 5.109840536176451e-05, + "loss": 1.8129, + 
"step": 16559 + }, + { + "epoch": 5.082872928176796, + "grad_norm": 0.24273787438869476, + "learning_rate": 5.109343599602087e-05, + "loss": 1.7206, + "step": 16560 + }, + { + "epoch": 5.08317986494782, + "grad_norm": 0.22736592590808868, + "learning_rate": 5.1088466619471255e-05, + "loss": 1.732, + "step": 16561 + }, + { + "epoch": 5.083486801718846, + "grad_norm": 0.21457640826702118, + "learning_rate": 5.1083497232164777e-05, + "loss": 1.726, + "step": 16562 + }, + { + "epoch": 5.083793738489871, + "grad_norm": 0.20968590676784515, + "learning_rate": 5.107852783415055e-05, + "loss": 1.8095, + "step": 16563 + }, + { + "epoch": 5.084100675260896, + "grad_norm": 0.2846728265285492, + "learning_rate": 5.107355842547768e-05, + "loss": 1.7524, + "step": 16564 + }, + { + "epoch": 5.084407612031922, + "grad_norm": 0.21162885427474976, + "learning_rate": 5.106858900619526e-05, + "loss": 1.753, + "step": 16565 + }, + { + "epoch": 5.084714548802946, + "grad_norm": 0.24349012970924377, + "learning_rate": 5.106361957635242e-05, + "loss": 1.7003, + "step": 16566 + }, + { + "epoch": 5.0850214855739715, + "grad_norm": 0.24532537162303925, + "learning_rate": 5.105865013599828e-05, + "loss": 1.7818, + "step": 16567 + }, + { + "epoch": 5.085328422344997, + "grad_norm": 0.22788558900356293, + "learning_rate": 5.1053680685181926e-05, + "loss": 1.7291, + "step": 16568 + }, + { + "epoch": 5.085635359116022, + "grad_norm": 0.22402508556842804, + "learning_rate": 5.10487112239525e-05, + "loss": 1.8292, + "step": 16569 + }, + { + "epoch": 5.0859422958870475, + "grad_norm": 0.2396162748336792, + "learning_rate": 5.1043741752359085e-05, + "loss": 1.7441, + "step": 16570 + }, + { + "epoch": 5.086249232658073, + "grad_norm": 0.22364887595176697, + "learning_rate": 5.1038772270450796e-05, + "loss": 1.7356, + "step": 16571 + }, + { + "epoch": 5.086556169429097, + "grad_norm": 0.20385414361953735, + "learning_rate": 5.103380277827676e-05, + "loss": 1.774, + "step": 16572 + }, + { + "epoch": 
5.086863106200123, + "grad_norm": 0.2050715535879135, + "learning_rate": 5.102883327588608e-05, + "loss": 1.7217, + "step": 16573 + }, + { + "epoch": 5.087170042971148, + "grad_norm": 0.23750410974025726, + "learning_rate": 5.102386376332786e-05, + "loss": 1.7605, + "step": 16574 + }, + { + "epoch": 5.087476979742173, + "grad_norm": 0.24313338100910187, + "learning_rate": 5.101889424065122e-05, + "loss": 1.7498, + "step": 16575 + }, + { + "epoch": 5.087783916513199, + "grad_norm": 0.22145850956439972, + "learning_rate": 5.101392470790527e-05, + "loss": 1.7827, + "step": 16576 + }, + { + "epoch": 5.088090853284223, + "grad_norm": 0.23073779046535492, + "learning_rate": 5.100895516513912e-05, + "loss": 1.7722, + "step": 16577 + }, + { + "epoch": 5.088397790055248, + "grad_norm": 0.2112295925617218, + "learning_rate": 5.100398561240188e-05, + "loss": 1.7755, + "step": 16578 + }, + { + "epoch": 5.088704726826274, + "grad_norm": 0.23263800144195557, + "learning_rate": 5.0999016049742675e-05, + "loss": 1.7593, + "step": 16579 + }, + { + "epoch": 5.089011663597299, + "grad_norm": 0.23011381924152374, + "learning_rate": 5.09940464772106e-05, + "loss": 1.704, + "step": 16580 + }, + { + "epoch": 5.089318600368324, + "grad_norm": 0.1930779367685318, + "learning_rate": 5.0989076894854785e-05, + "loss": 1.7038, + "step": 16581 + }, + { + "epoch": 5.08962553713935, + "grad_norm": 0.2100505381822586, + "learning_rate": 5.098410730272433e-05, + "loss": 1.7671, + "step": 16582 + }, + { + "epoch": 5.089932473910374, + "grad_norm": 0.1919277459383011, + "learning_rate": 5.097913770086833e-05, + "loss": 1.651, + "step": 16583 + }, + { + "epoch": 5.0902394106813995, + "grad_norm": 0.23310615122318268, + "learning_rate": 5.097416808933594e-05, + "loss": 1.8294, + "step": 16584 + }, + { + "epoch": 5.090546347452425, + "grad_norm": 0.26191771030426025, + "learning_rate": 5.096919846817624e-05, + "loss": 1.7522, + "step": 16585 + }, + { + "epoch": 5.09085328422345, + "grad_norm": 
0.2508419156074524, + "learning_rate": 5.096422883743835e-05, + "loss": 1.8025, + "step": 16586 + }, + { + "epoch": 5.0911602209944755, + "grad_norm": 0.23192499577999115, + "learning_rate": 5.0959259197171414e-05, + "loss": 1.7885, + "step": 16587 + }, + { + "epoch": 5.0914671577655, + "grad_norm": 0.2164602279663086, + "learning_rate": 5.095428954742448e-05, + "loss": 1.7299, + "step": 16588 + }, + { + "epoch": 5.091774094536525, + "grad_norm": 0.21431668102741241, + "learning_rate": 5.094931988824671e-05, + "loss": 1.7122, + "step": 16589 + }, + { + "epoch": 5.092081031307551, + "grad_norm": 0.20563583076000214, + "learning_rate": 5.094435021968722e-05, + "loss": 1.7118, + "step": 16590 + }, + { + "epoch": 5.092387968078576, + "grad_norm": 0.20916326344013214, + "learning_rate": 5.093938054179509e-05, + "loss": 1.7639, + "step": 16591 + }, + { + "epoch": 5.092694904849601, + "grad_norm": 0.21197481453418732, + "learning_rate": 5.0934410854619454e-05, + "loss": 1.7357, + "step": 16592 + }, + { + "epoch": 5.093001841620626, + "grad_norm": 0.21085995435714722, + "learning_rate": 5.092944115820942e-05, + "loss": 1.6921, + "step": 16593 + }, + { + "epoch": 5.093308778391651, + "grad_norm": 0.2608145773410797, + "learning_rate": 5.09244714526141e-05, + "loss": 1.7541, + "step": 16594 + }, + { + "epoch": 5.093615715162676, + "grad_norm": 0.2138587087392807, + "learning_rate": 5.0919501737882624e-05, + "loss": 1.727, + "step": 16595 + }, + { + "epoch": 5.093922651933702, + "grad_norm": 0.230251282453537, + "learning_rate": 5.0914532014064084e-05, + "loss": 1.7828, + "step": 16596 + }, + { + "epoch": 5.094229588704727, + "grad_norm": 0.2162851244211197, + "learning_rate": 5.0909562281207614e-05, + "loss": 1.6905, + "step": 16597 + }, + { + "epoch": 5.094536525475752, + "grad_norm": 0.20637664198875427, + "learning_rate": 5.090459253936231e-05, + "loss": 1.7484, + "step": 16598 + }, + { + "epoch": 5.094843462246777, + "grad_norm": 0.19427815079689026, + "learning_rate": 
5.089962278857728e-05, + "loss": 1.7379, + "step": 16599 + }, + { + "epoch": 5.095150399017802, + "grad_norm": 0.1877593845129013, + "learning_rate": 5.089465302890165e-05, + "loss": 1.7017, + "step": 16600 + }, + { + "epoch": 5.0954573357888275, + "grad_norm": 0.19219037890434265, + "learning_rate": 5.0889683260384543e-05, + "loss": 1.7379, + "step": 16601 + }, + { + "epoch": 5.095764272559853, + "grad_norm": 0.19855685532093048, + "learning_rate": 5.088471348307507e-05, + "loss": 1.7171, + "step": 16602 + }, + { + "epoch": 5.096071209330878, + "grad_norm": 0.19119660556316376, + "learning_rate": 5.087974369702235e-05, + "loss": 1.6912, + "step": 16603 + }, + { + "epoch": 5.096378146101903, + "grad_norm": 0.2102670818567276, + "learning_rate": 5.0874773902275476e-05, + "loss": 1.6825, + "step": 16604 + }, + { + "epoch": 5.096685082872928, + "grad_norm": 0.2120765596628189, + "learning_rate": 5.0869804098883564e-05, + "loss": 1.7055, + "step": 16605 + }, + { + "epoch": 5.096992019643953, + "grad_norm": 0.25874772667884827, + "learning_rate": 5.0864834286895745e-05, + "loss": 1.7193, + "step": 16606 + }, + { + "epoch": 5.097298956414979, + "grad_norm": 0.20822012424468994, + "learning_rate": 5.085986446636113e-05, + "loss": 1.6748, + "step": 16607 + }, + { + "epoch": 5.097605893186004, + "grad_norm": 0.21364718675613403, + "learning_rate": 5.085489463732883e-05, + "loss": 1.7762, + "step": 16608 + }, + { + "epoch": 5.097912829957028, + "grad_norm": 0.21961788833141327, + "learning_rate": 5.084992479984796e-05, + "loss": 1.7243, + "step": 16609 + }, + { + "epoch": 5.098219766728054, + "grad_norm": 0.22056026756763458, + "learning_rate": 5.0844954953967624e-05, + "loss": 1.6983, + "step": 16610 + }, + { + "epoch": 5.098526703499079, + "grad_norm": 0.21347738802433014, + "learning_rate": 5.083998509973695e-05, + "loss": 1.7319, + "step": 16611 + }, + { + "epoch": 5.098833640270104, + "grad_norm": 0.23593664169311523, + "learning_rate": 5.083501523720506e-05, + "loss": 
1.7121, + "step": 16612 + }, + { + "epoch": 5.09914057704113, + "grad_norm": 0.2088623344898224, + "learning_rate": 5.0830045366421055e-05, + "loss": 1.72, + "step": 16613 + }, + { + "epoch": 5.099447513812155, + "grad_norm": 0.2293832004070282, + "learning_rate": 5.082507548743406e-05, + "loss": 1.7548, + "step": 16614 + }, + { + "epoch": 5.0997544505831796, + "grad_norm": 0.2509057819843292, + "learning_rate": 5.082010560029319e-05, + "loss": 1.7729, + "step": 16615 + }, + { + "epoch": 5.100061387354205, + "grad_norm": 0.1925390362739563, + "learning_rate": 5.081513570504755e-05, + "loss": 1.7109, + "step": 16616 + }, + { + "epoch": 5.10036832412523, + "grad_norm": 0.20876559615135193, + "learning_rate": 5.081016580174626e-05, + "loss": 1.7031, + "step": 16617 + }, + { + "epoch": 5.100675260896256, + "grad_norm": 0.2038683146238327, + "learning_rate": 5.080519589043842e-05, + "loss": 1.7489, + "step": 16618 + }, + { + "epoch": 5.100982197667281, + "grad_norm": 0.25018224120140076, + "learning_rate": 5.080022597117318e-05, + "loss": 1.7884, + "step": 16619 + }, + { + "epoch": 5.101289134438305, + "grad_norm": 0.24430342018604279, + "learning_rate": 5.079525604399965e-05, + "loss": 1.7558, + "step": 16620 + }, + { + "epoch": 5.101596071209331, + "grad_norm": 0.22151432931423187, + "learning_rate": 5.079028610896692e-05, + "loss": 1.7543, + "step": 16621 + }, + { + "epoch": 5.101903007980356, + "grad_norm": 0.2313055694103241, + "learning_rate": 5.0785316166124107e-05, + "loss": 1.7755, + "step": 16622 + }, + { + "epoch": 5.102209944751381, + "grad_norm": 0.27405816316604614, + "learning_rate": 5.0780346215520355e-05, + "loss": 1.7006, + "step": 16623 + }, + { + "epoch": 5.102516881522407, + "grad_norm": 0.2209920734167099, + "learning_rate": 5.077537625720476e-05, + "loss": 1.6877, + "step": 16624 + }, + { + "epoch": 5.102823818293431, + "grad_norm": 0.20993784070014954, + "learning_rate": 5.077040629122645e-05, + "loss": 1.7558, + "step": 16625 + }, + { + "epoch": 
5.1031307550644565, + "grad_norm": 0.25554344058036804, + "learning_rate": 5.076543631763453e-05, + "loss": 1.7142, + "step": 16626 + }, + { + "epoch": 5.103437691835482, + "grad_norm": 0.28980588912963867, + "learning_rate": 5.0760466336478116e-05, + "loss": 1.7632, + "step": 16627 + }, + { + "epoch": 5.103744628606507, + "grad_norm": 0.20144744217395782, + "learning_rate": 5.075549634780633e-05, + "loss": 1.7472, + "step": 16628 + }, + { + "epoch": 5.1040515653775325, + "grad_norm": 0.30335596203804016, + "learning_rate": 5.075052635166827e-05, + "loss": 1.7283, + "step": 16629 + }, + { + "epoch": 5.104358502148558, + "grad_norm": 0.3014097213745117, + "learning_rate": 5.074555634811309e-05, + "loss": 1.7273, + "step": 16630 + }, + { + "epoch": 5.104665438919582, + "grad_norm": 0.20123563706874847, + "learning_rate": 5.074058633718988e-05, + "loss": 1.7119, + "step": 16631 + }, + { + "epoch": 5.104972375690608, + "grad_norm": 0.3375137746334076, + "learning_rate": 5.073561631894776e-05, + "loss": 1.7594, + "step": 16632 + }, + { + "epoch": 5.105279312461633, + "grad_norm": 0.3471776247024536, + "learning_rate": 5.0730646293435846e-05, + "loss": 1.729, + "step": 16633 + }, + { + "epoch": 5.105586249232658, + "grad_norm": 0.26405471563339233, + "learning_rate": 5.072567626070327e-05, + "loss": 1.7472, + "step": 16634 + }, + { + "epoch": 5.105893186003684, + "grad_norm": 0.2339334636926651, + "learning_rate": 5.072070622079911e-05, + "loss": 1.7285, + "step": 16635 + }, + { + "epoch": 5.106200122774708, + "grad_norm": 0.26267752051353455, + "learning_rate": 5.0715736173772534e-05, + "loss": 1.7171, + "step": 16636 + }, + { + "epoch": 5.106507059545733, + "grad_norm": 0.22254765033721924, + "learning_rate": 5.0710766119672626e-05, + "loss": 1.7702, + "step": 16637 + }, + { + "epoch": 5.106813996316759, + "grad_norm": 0.2457888424396515, + "learning_rate": 5.070579605854852e-05, + "loss": 1.7987, + "step": 16638 + }, + { + "epoch": 5.107120933087784, + "grad_norm": 
0.24500930309295654, + "learning_rate": 5.070082599044931e-05, + "loss": 1.8103, + "step": 16639 + }, + { + "epoch": 5.107427869858809, + "grad_norm": 0.24446405470371246, + "learning_rate": 5.0695855915424116e-05, + "loss": 1.7058, + "step": 16640 + }, + { + "epoch": 5.107734806629834, + "grad_norm": 0.22352534532546997, + "learning_rate": 5.0690885833522086e-05, + "loss": 1.7503, + "step": 16641 + }, + { + "epoch": 5.108041743400859, + "grad_norm": 0.2308795005083084, + "learning_rate": 5.068591574479231e-05, + "loss": 1.8064, + "step": 16642 + }, + { + "epoch": 5.1083486801718845, + "grad_norm": 0.23804180324077606, + "learning_rate": 5.068094564928392e-05, + "loss": 1.7603, + "step": 16643 + }, + { + "epoch": 5.10865561694291, + "grad_norm": 0.1956508308649063, + "learning_rate": 5.0675975547046016e-05, + "loss": 1.7448, + "step": 16644 + }, + { + "epoch": 5.108962553713935, + "grad_norm": 0.24438725411891937, + "learning_rate": 5.067100543812773e-05, + "loss": 1.7706, + "step": 16645 + }, + { + "epoch": 5.1092694904849605, + "grad_norm": 0.26129621267318726, + "learning_rate": 5.066603532257817e-05, + "loss": 1.7321, + "step": 16646 + }, + { + "epoch": 5.109576427255985, + "grad_norm": 0.2024240493774414, + "learning_rate": 5.066106520044646e-05, + "loss": 1.7033, + "step": 16647 + }, + { + "epoch": 5.10988336402701, + "grad_norm": 0.2096802294254303, + "learning_rate": 5.0656095071781716e-05, + "loss": 1.716, + "step": 16648 + }, + { + "epoch": 5.110190300798036, + "grad_norm": 0.20643317699432373, + "learning_rate": 5.0651124936633054e-05, + "loss": 1.7473, + "step": 16649 + }, + { + "epoch": 5.110497237569061, + "grad_norm": 0.2268853783607483, + "learning_rate": 5.0646154795049604e-05, + "loss": 1.7844, + "step": 16650 + }, + { + "epoch": 5.110804174340086, + "grad_norm": 0.20215095579624176, + "learning_rate": 5.064118464708046e-05, + "loss": 1.7138, + "step": 16651 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.19411569833755493, + 
"learning_rate": 5.063621449277476e-05, + "loss": 1.7526, + "step": 16652 + }, + { + "epoch": 5.111418047882136, + "grad_norm": 0.20199783146381378, + "learning_rate": 5.063124433218161e-05, + "loss": 1.806, + "step": 16653 + }, + { + "epoch": 5.111724984653161, + "grad_norm": 0.23351836204528809, + "learning_rate": 5.0626274165350165e-05, + "loss": 1.7529, + "step": 16654 + }, + { + "epoch": 5.112031921424187, + "grad_norm": 0.21098989248275757, + "learning_rate": 5.062130399232948e-05, + "loss": 1.7647, + "step": 16655 + }, + { + "epoch": 5.112338858195212, + "grad_norm": 0.21959169209003448, + "learning_rate": 5.0616333813168714e-05, + "loss": 1.7462, + "step": 16656 + }, + { + "epoch": 5.112645794966237, + "grad_norm": 0.21173696219921112, + "learning_rate": 5.061136362791696e-05, + "loss": 1.7413, + "step": 16657 + }, + { + "epoch": 5.112952731737262, + "grad_norm": 0.22357577085494995, + "learning_rate": 5.0606393436623365e-05, + "loss": 1.7163, + "step": 16658 + }, + { + "epoch": 5.113259668508287, + "grad_norm": 0.24364936351776123, + "learning_rate": 5.060142323933704e-05, + "loss": 1.8139, + "step": 16659 + }, + { + "epoch": 5.1135666052793125, + "grad_norm": 0.21646073460578918, + "learning_rate": 5.05964530361071e-05, + "loss": 1.741, + "step": 16660 + }, + { + "epoch": 5.113873542050338, + "grad_norm": 0.24261775612831116, + "learning_rate": 5.059148282698265e-05, + "loss": 1.7162, + "step": 16661 + }, + { + "epoch": 5.114180478821363, + "grad_norm": 0.22883281111717224, + "learning_rate": 5.058651261201283e-05, + "loss": 1.7342, + "step": 16662 + }, + { + "epoch": 5.114487415592388, + "grad_norm": 0.2616727352142334, + "learning_rate": 5.058154239124674e-05, + "loss": 1.8054, + "step": 16663 + }, + { + "epoch": 5.114794352363413, + "grad_norm": 0.21293358504772186, + "learning_rate": 5.0576572164733505e-05, + "loss": 1.742, + "step": 16664 + }, + { + "epoch": 5.115101289134438, + "grad_norm": 0.20037685334682465, + "learning_rate": 
5.057160193252225e-05, + "loss": 1.7518, + "step": 16665 + }, + { + "epoch": 5.115408225905464, + "grad_norm": 0.19102689623832703, + "learning_rate": 5.056663169466209e-05, + "loss": 1.6892, + "step": 16666 + }, + { + "epoch": 5.115715162676489, + "grad_norm": 0.22261591255664825, + "learning_rate": 5.056166145120216e-05, + "loss": 1.7744, + "step": 16667 + }, + { + "epoch": 5.116022099447513, + "grad_norm": 0.23966702818870544, + "learning_rate": 5.055669120219154e-05, + "loss": 1.7786, + "step": 16668 + }, + { + "epoch": 5.116329036218539, + "grad_norm": 0.22008271515369415, + "learning_rate": 5.055172094767937e-05, + "loss": 1.7501, + "step": 16669 + }, + { + "epoch": 5.116635972989564, + "grad_norm": 0.21643415093421936, + "learning_rate": 5.054675068771478e-05, + "loss": 1.7548, + "step": 16670 + }, + { + "epoch": 5.116942909760589, + "grad_norm": 0.24661116302013397, + "learning_rate": 5.0541780422346894e-05, + "loss": 1.8117, + "step": 16671 + }, + { + "epoch": 5.117249846531615, + "grad_norm": 0.21393093466758728, + "learning_rate": 5.05368101516248e-05, + "loss": 1.7341, + "step": 16672 + }, + { + "epoch": 5.11755678330264, + "grad_norm": 0.30949896574020386, + "learning_rate": 5.053183987559763e-05, + "loss": 1.7703, + "step": 16673 + }, + { + "epoch": 5.1178637200736645, + "grad_norm": 0.22236786782741547, + "learning_rate": 5.052686959431451e-05, + "loss": 1.719, + "step": 16674 + }, + { + "epoch": 5.11817065684469, + "grad_norm": 0.26826921105384827, + "learning_rate": 5.052189930782455e-05, + "loss": 1.741, + "step": 16675 + }, + { + "epoch": 5.118477593615715, + "grad_norm": 0.2608947455883026, + "learning_rate": 5.051692901617688e-05, + "loss": 1.7062, + "step": 16676 + }, + { + "epoch": 5.1187845303867405, + "grad_norm": 0.20709002017974854, + "learning_rate": 5.051195871942063e-05, + "loss": 1.703, + "step": 16677 + }, + { + "epoch": 5.119091467157766, + "grad_norm": 0.18957734107971191, + "learning_rate": 5.0506988417604885e-05, + "loss": 1.762, 
+ "step": 16678 + }, + { + "epoch": 5.11939840392879, + "grad_norm": 0.21578781306743622, + "learning_rate": 5.050201811077879e-05, + "loss": 1.7167, + "step": 16679 + }, + { + "epoch": 5.119705340699816, + "grad_norm": 0.2253631353378296, + "learning_rate": 5.049704779899145e-05, + "loss": 1.7374, + "step": 16680 + }, + { + "epoch": 5.120012277470841, + "grad_norm": 0.1977664828300476, + "learning_rate": 5.049207748229199e-05, + "loss": 1.7399, + "step": 16681 + }, + { + "epoch": 5.120319214241866, + "grad_norm": 0.2964428663253784, + "learning_rate": 5.048710716072954e-05, + "loss": 1.8359, + "step": 16682 + }, + { + "epoch": 5.120626151012892, + "grad_norm": 0.24788637459278107, + "learning_rate": 5.0482136834353224e-05, + "loss": 1.7593, + "step": 16683 + }, + { + "epoch": 5.120933087783916, + "grad_norm": 0.21537743508815765, + "learning_rate": 5.0477166503212135e-05, + "loss": 1.7472, + "step": 16684 + }, + { + "epoch": 5.121240024554941, + "grad_norm": 0.2055196613073349, + "learning_rate": 5.047219616735541e-05, + "loss": 1.7106, + "step": 16685 + }, + { + "epoch": 5.121546961325967, + "grad_norm": 0.19770687818527222, + "learning_rate": 5.046722582683215e-05, + "loss": 1.6887, + "step": 16686 + }, + { + "epoch": 5.121853898096992, + "grad_norm": 0.20407389104366302, + "learning_rate": 5.046225548169151e-05, + "loss": 1.7412, + "step": 16687 + }, + { + "epoch": 5.122160834868017, + "grad_norm": 0.20153474807739258, + "learning_rate": 5.045728513198259e-05, + "loss": 1.7643, + "step": 16688 + }, + { + "epoch": 5.122467771639043, + "grad_norm": 0.18737752735614777, + "learning_rate": 5.045231477775452e-05, + "loss": 1.763, + "step": 16689 + }, + { + "epoch": 5.122774708410067, + "grad_norm": 0.19790658354759216, + "learning_rate": 5.0447344419056385e-05, + "loss": 1.7446, + "step": 16690 + }, + { + "epoch": 5.1230816451810925, + "grad_norm": 0.21496973931789398, + "learning_rate": 5.0442374055937336e-05, + "loss": 1.7756, + "step": 16691 + }, + { + "epoch": 
5.123388581952118, + "grad_norm": 0.19318655133247375, + "learning_rate": 5.043740368844649e-05, + "loss": 1.7687, + "step": 16692 + }, + { + "epoch": 5.123695518723143, + "grad_norm": 0.2237338423728943, + "learning_rate": 5.0432433316632976e-05, + "loss": 1.7258, + "step": 16693 + }, + { + "epoch": 5.1240024554941686, + "grad_norm": 0.2257162630558014, + "learning_rate": 5.042746294054589e-05, + "loss": 1.7462, + "step": 16694 + }, + { + "epoch": 5.124309392265193, + "grad_norm": 0.25666359066963196, + "learning_rate": 5.0422492560234366e-05, + "loss": 1.7318, + "step": 16695 + }, + { + "epoch": 5.124616329036218, + "grad_norm": 0.2615324556827545, + "learning_rate": 5.0417522175747536e-05, + "loss": 1.7533, + "step": 16696 + }, + { + "epoch": 5.124923265807244, + "grad_norm": 0.2372874766588211, + "learning_rate": 5.0412551787134475e-05, + "loss": 1.7361, + "step": 16697 + }, + { + "epoch": 5.125230202578269, + "grad_norm": 0.25976815819740295, + "learning_rate": 5.040758139444436e-05, + "loss": 1.7542, + "step": 16698 + }, + { + "epoch": 5.125537139349294, + "grad_norm": 0.36173003911972046, + "learning_rate": 5.040261099772629e-05, + "loss": 1.7421, + "step": 16699 + }, + { + "epoch": 5.12584407612032, + "grad_norm": 0.2767728269100189, + "learning_rate": 5.039764059702937e-05, + "loss": 1.7341, + "step": 16700 + }, + { + "epoch": 5.126151012891344, + "grad_norm": 0.20185241103172302, + "learning_rate": 5.039267019240275e-05, + "loss": 1.7068, + "step": 16701 + }, + { + "epoch": 5.1264579496623695, + "grad_norm": 0.26872581243515015, + "learning_rate": 5.0387699783895514e-05, + "loss": 1.7404, + "step": 16702 + }, + { + "epoch": 5.126764886433395, + "grad_norm": 0.2867858111858368, + "learning_rate": 5.038272937155682e-05, + "loss": 1.7702, + "step": 16703 + }, + { + "epoch": 5.12707182320442, + "grad_norm": 0.20939521491527557, + "learning_rate": 5.037775895543574e-05, + "loss": 1.7653, + "step": 16704 + }, + { + "epoch": 5.1273787599754455, + "grad_norm": 
0.2674047648906708, + "learning_rate": 5.037278853558146e-05, + "loss": 1.701, + "step": 16705 + }, + { + "epoch": 5.12768569674647, + "grad_norm": 0.20776906609535217, + "learning_rate": 5.036781811204304e-05, + "loss": 1.7476, + "step": 16706 + }, + { + "epoch": 5.127992633517495, + "grad_norm": 0.2695952355861664, + "learning_rate": 5.036284768486964e-05, + "loss": 1.7206, + "step": 16707 + }, + { + "epoch": 5.128299570288521, + "grad_norm": 0.30661383271217346, + "learning_rate": 5.0357877254110363e-05, + "loss": 1.72, + "step": 16708 + }, + { + "epoch": 5.128606507059546, + "grad_norm": 0.2527785003185272, + "learning_rate": 5.0352906819814316e-05, + "loss": 1.6936, + "step": 16709 + }, + { + "epoch": 5.128913443830571, + "grad_norm": 0.23000696301460266, + "learning_rate": 5.034793638203066e-05, + "loss": 1.7634, + "step": 16710 + }, + { + "epoch": 5.129220380601596, + "grad_norm": 0.33594760298728943, + "learning_rate": 5.0342965940808486e-05, + "loss": 1.6952, + "step": 16711 + }, + { + "epoch": 5.129527317372621, + "grad_norm": 0.22834168374538422, + "learning_rate": 5.033799549619692e-05, + "loss": 1.7537, + "step": 16712 + }, + { + "epoch": 5.129834254143646, + "grad_norm": 0.26585114002227783, + "learning_rate": 5.033302504824509e-05, + "loss": 1.7554, + "step": 16713 + }, + { + "epoch": 5.130141190914672, + "grad_norm": 0.25632211565971375, + "learning_rate": 5.032805459700211e-05, + "loss": 1.8141, + "step": 16714 + }, + { + "epoch": 5.130448127685697, + "grad_norm": 0.256523996591568, + "learning_rate": 5.0323084142517084e-05, + "loss": 1.777, + "step": 16715 + }, + { + "epoch": 5.1307550644567215, + "grad_norm": 0.31409457325935364, + "learning_rate": 5.0318113684839166e-05, + "loss": 1.7414, + "step": 16716 + }, + { + "epoch": 5.131062001227747, + "grad_norm": 0.21156816184520721, + "learning_rate": 5.0313143224017455e-05, + "loss": 1.7397, + "step": 16717 + }, + { + "epoch": 5.131368937998772, + "grad_norm": 0.23596547544002533, + "learning_rate": 
5.030817276010109e-05, + "loss": 1.752, + "step": 16718 + }, + { + "epoch": 5.1316758747697975, + "grad_norm": 0.2587638199329376, + "learning_rate": 5.0303202293139186e-05, + "loss": 1.7645, + "step": 16719 + }, + { + "epoch": 5.131982811540823, + "grad_norm": 0.2006666213274002, + "learning_rate": 5.029823182318084e-05, + "loss": 1.7009, + "step": 16720 + }, + { + "epoch": 5.132289748311848, + "grad_norm": 0.3075694739818573, + "learning_rate": 5.029326135027521e-05, + "loss": 1.749, + "step": 16721 + }, + { + "epoch": 5.132596685082873, + "grad_norm": 0.3116205334663391, + "learning_rate": 5.028829087447139e-05, + "loss": 1.7458, + "step": 16722 + }, + { + "epoch": 5.132903621853898, + "grad_norm": 0.17925913631916046, + "learning_rate": 5.028332039581851e-05, + "loss": 1.6502, + "step": 16723 + }, + { + "epoch": 5.133210558624923, + "grad_norm": 0.21779952943325043, + "learning_rate": 5.0278349914365694e-05, + "loss": 1.7656, + "step": 16724 + }, + { + "epoch": 5.133517495395949, + "grad_norm": 0.20085318386554718, + "learning_rate": 5.027337943016207e-05, + "loss": 1.7662, + "step": 16725 + }, + { + "epoch": 5.133824432166974, + "grad_norm": 0.19975553452968597, + "learning_rate": 5.026840894325673e-05, + "loss": 1.7392, + "step": 16726 + }, + { + "epoch": 5.134131368937998, + "grad_norm": 0.20610745251178741, + "learning_rate": 5.026343845369883e-05, + "loss": 1.7221, + "step": 16727 + }, + { + "epoch": 5.134438305709024, + "grad_norm": 0.21451768279075623, + "learning_rate": 5.025846796153747e-05, + "loss": 1.8381, + "step": 16728 + }, + { + "epoch": 5.134745242480049, + "grad_norm": 0.19518613815307617, + "learning_rate": 5.0253497466821786e-05, + "loss": 1.7483, + "step": 16729 + }, + { + "epoch": 5.135052179251074, + "grad_norm": 0.24284996092319489, + "learning_rate": 5.024852696960088e-05, + "loss": 1.7895, + "step": 16730 + }, + { + "epoch": 5.1353591160221, + "grad_norm": 0.23962461948394775, + "learning_rate": 5.0243556469923905e-05, + "loss": 
1.8468, + "step": 16731 + }, + { + "epoch": 5.135666052793125, + "grad_norm": 0.20455054938793182, + "learning_rate": 5.023858596783993e-05, + "loss": 1.6973, + "step": 16732 + }, + { + "epoch": 5.1359729895641495, + "grad_norm": 0.20629842579364777, + "learning_rate": 5.023361546339813e-05, + "loss": 1.7608, + "step": 16733 + }, + { + "epoch": 5.136279926335175, + "grad_norm": 0.19375818967819214, + "learning_rate": 5.0228644956647606e-05, + "loss": 1.7327, + "step": 16734 + }, + { + "epoch": 5.1365868631062, + "grad_norm": 0.20960548520088196, + "learning_rate": 5.022367444763748e-05, + "loss": 1.7227, + "step": 16735 + }, + { + "epoch": 5.1368937998772255, + "grad_norm": 0.24732786417007446, + "learning_rate": 5.021870393641687e-05, + "loss": 1.8144, + "step": 16736 + }, + { + "epoch": 5.137200736648251, + "grad_norm": 0.22190099954605103, + "learning_rate": 5.021373342303489e-05, + "loss": 1.705, + "step": 16737 + }, + { + "epoch": 5.137507673419275, + "grad_norm": 0.2091664969921112, + "learning_rate": 5.020876290754069e-05, + "loss": 1.7926, + "step": 16738 + }, + { + "epoch": 5.137814610190301, + "grad_norm": 0.22298938035964966, + "learning_rate": 5.020379238998335e-05, + "loss": 1.7782, + "step": 16739 + }, + { + "epoch": 5.138121546961326, + "grad_norm": 0.20843006670475006, + "learning_rate": 5.019882187041203e-05, + "loss": 1.7245, + "step": 16740 + }, + { + "epoch": 5.138428483732351, + "grad_norm": 0.23383544385433197, + "learning_rate": 5.019385134887583e-05, + "loss": 1.6834, + "step": 16741 + }, + { + "epoch": 5.138735420503377, + "grad_norm": 0.3015683889389038, + "learning_rate": 5.018888082542388e-05, + "loss": 1.7636, + "step": 16742 + }, + { + "epoch": 5.139042357274401, + "grad_norm": 0.2253810614347458, + "learning_rate": 5.0183910300105284e-05, + "loss": 1.7375, + "step": 16743 + }, + { + "epoch": 5.139349294045426, + "grad_norm": 0.2064623087644577, + "learning_rate": 5.01789397729692e-05, + "loss": 1.7683, + "step": 16744 + }, + { + 
"epoch": 5.139656230816452, + "grad_norm": 0.2106693685054779, + "learning_rate": 5.0173969244064724e-05, + "loss": 1.7432, + "step": 16745 + }, + { + "epoch": 5.139963167587477, + "grad_norm": 0.19944638013839722, + "learning_rate": 5.016899871344097e-05, + "loss": 1.701, + "step": 16746 + }, + { + "epoch": 5.140270104358502, + "grad_norm": 0.23210744559764862, + "learning_rate": 5.016402818114708e-05, + "loss": 1.8008, + "step": 16747 + }, + { + "epoch": 5.140577041129528, + "grad_norm": 0.26014089584350586, + "learning_rate": 5.015905764723217e-05, + "loss": 1.7131, + "step": 16748 + }, + { + "epoch": 5.140883977900552, + "grad_norm": 0.25526607036590576, + "learning_rate": 5.015408711174535e-05, + "loss": 1.7525, + "step": 16749 + }, + { + "epoch": 5.1411909146715775, + "grad_norm": 0.2092386782169342, + "learning_rate": 5.0149116574735756e-05, + "loss": 1.7502, + "step": 16750 + }, + { + "epoch": 5.141497851442603, + "grad_norm": 0.21560105681419373, + "learning_rate": 5.01441460362525e-05, + "loss": 1.7903, + "step": 16751 + }, + { + "epoch": 5.141804788213628, + "grad_norm": 0.23538467288017273, + "learning_rate": 5.013917549634471e-05, + "loss": 1.6995, + "step": 16752 + }, + { + "epoch": 5.1421117249846535, + "grad_norm": 0.26545262336730957, + "learning_rate": 5.0134204955061526e-05, + "loss": 1.7511, + "step": 16753 + }, + { + "epoch": 5.142418661755678, + "grad_norm": 0.23030948638916016, + "learning_rate": 5.012923441245203e-05, + "loss": 1.7271, + "step": 16754 + }, + { + "epoch": 5.142725598526703, + "grad_norm": 0.22395408153533936, + "learning_rate": 5.012426386856537e-05, + "loss": 1.7273, + "step": 16755 + }, + { + "epoch": 5.143032535297729, + "grad_norm": 0.21355997025966644, + "learning_rate": 5.011929332345066e-05, + "loss": 1.7347, + "step": 16756 + }, + { + "epoch": 5.143339472068754, + "grad_norm": 0.2355809509754181, + "learning_rate": 5.011432277715702e-05, + "loss": 1.8289, + "step": 16757 + }, + { + "epoch": 5.143646408839779, + 
"grad_norm": 0.24319802224636078, + "learning_rate": 5.0109352229733584e-05, + "loss": 1.7621, + "step": 16758 + }, + { + "epoch": 5.143953345610804, + "grad_norm": 0.2591453492641449, + "learning_rate": 5.010438168122946e-05, + "loss": 1.8043, + "step": 16759 + }, + { + "epoch": 5.144260282381829, + "grad_norm": 0.22595751285552979, + "learning_rate": 5.009941113169376e-05, + "loss": 1.8137, + "step": 16760 + }, + { + "epoch": 5.144567219152854, + "grad_norm": 0.220921128988266, + "learning_rate": 5.009444058117564e-05, + "loss": 1.7105, + "step": 16761 + }, + { + "epoch": 5.14487415592388, + "grad_norm": 0.25713789463043213, + "learning_rate": 5.0089470029724195e-05, + "loss": 1.8184, + "step": 16762 + }, + { + "epoch": 5.145181092694905, + "grad_norm": 0.19849328696727753, + "learning_rate": 5.008449947738856e-05, + "loss": 1.7331, + "step": 16763 + }, + { + "epoch": 5.14548802946593, + "grad_norm": 0.2073405385017395, + "learning_rate": 5.007952892421785e-05, + "loss": 1.7053, + "step": 16764 + }, + { + "epoch": 5.145794966236955, + "grad_norm": 0.22307951748371124, + "learning_rate": 5.007455837026119e-05, + "loss": 1.7724, + "step": 16765 + }, + { + "epoch": 5.14610190300798, + "grad_norm": 0.22160649299621582, + "learning_rate": 5.006958781556769e-05, + "loss": 1.7191, + "step": 16766 + }, + { + "epoch": 5.1464088397790055, + "grad_norm": 0.2202252298593521, + "learning_rate": 5.0064617260186487e-05, + "loss": 1.7339, + "step": 16767 + }, + { + "epoch": 5.146715776550031, + "grad_norm": 0.23693829774856567, + "learning_rate": 5.005964670416671e-05, + "loss": 1.7143, + "step": 16768 + }, + { + "epoch": 5.147022713321056, + "grad_norm": 0.22675764560699463, + "learning_rate": 5.005467614755746e-05, + "loss": 1.7913, + "step": 16769 + }, + { + "epoch": 5.147329650092081, + "grad_norm": 0.21288467943668365, + "learning_rate": 5.0049705590407866e-05, + "loss": 1.7581, + "step": 16770 + }, + { + "epoch": 5.147636586863106, + "grad_norm": 0.216839998960495, + 
"learning_rate": 5.0044735032767064e-05, + "loss": 1.7305, + "step": 16771 + }, + { + "epoch": 5.147943523634131, + "grad_norm": 0.2111063450574875, + "learning_rate": 5.003976447468416e-05, + "loss": 1.7444, + "step": 16772 + }, + { + "epoch": 5.148250460405157, + "grad_norm": 0.2536773085594177, + "learning_rate": 5.003479391620827e-05, + "loss": 1.6952, + "step": 16773 + }, + { + "epoch": 5.148557397176182, + "grad_norm": 0.23585477471351624, + "learning_rate": 5.002982335738854e-05, + "loss": 1.6921, + "step": 16774 + }, + { + "epoch": 5.148864333947207, + "grad_norm": 0.1927027702331543, + "learning_rate": 5.002485279827407e-05, + "loss": 1.7781, + "step": 16775 + }, + { + "epoch": 5.149171270718232, + "grad_norm": 0.22545355558395386, + "learning_rate": 5.001988223891399e-05, + "loss": 1.7582, + "step": 16776 + }, + { + "epoch": 5.149478207489257, + "grad_norm": 0.20837660133838654, + "learning_rate": 5.001491167935741e-05, + "loss": 1.7379, + "step": 16777 + }, + { + "epoch": 5.149785144260282, + "grad_norm": 0.20510734617710114, + "learning_rate": 5.000994111965348e-05, + "loss": 1.7568, + "step": 16778 + }, + { + "epoch": 5.150092081031308, + "grad_norm": 0.2629711329936981, + "learning_rate": 5.00049705598513e-05, + "loss": 1.7613, + "step": 16779 + }, + { + "epoch": 5.150399017802333, + "grad_norm": 0.2390555888414383, + "learning_rate": 5e-05, + "loss": 1.7099, + "step": 16780 + }, + { + "epoch": 5.150705954573358, + "grad_norm": 0.19643893837928772, + "learning_rate": 4.9995029440148715e-05, + "loss": 1.7012, + "step": 16781 + }, + { + "epoch": 5.151012891344383, + "grad_norm": 0.1881607472896576, + "learning_rate": 4.999005888034653e-05, + "loss": 1.705, + "step": 16782 + }, + { + "epoch": 5.151319828115408, + "grad_norm": 0.3219485282897949, + "learning_rate": 4.99850883206426e-05, + "loss": 1.8089, + "step": 16783 + }, + { + "epoch": 5.151626764886434, + "grad_norm": 0.22285562753677368, + "learning_rate": 4.998011776108602e-05, + "loss": 1.7343, + 
"step": 16784 + }, + { + "epoch": 5.151933701657459, + "grad_norm": 0.1981910616159439, + "learning_rate": 4.9975147201725955e-05, + "loss": 1.6939, + "step": 16785 + }, + { + "epoch": 5.152240638428483, + "grad_norm": 0.2338661551475525, + "learning_rate": 4.997017664261148e-05, + "loss": 1.6833, + "step": 16786 + }, + { + "epoch": 5.152547575199509, + "grad_norm": 0.2613268792629242, + "learning_rate": 4.996520608379175e-05, + "loss": 1.7251, + "step": 16787 + }, + { + "epoch": 5.152854511970534, + "grad_norm": 0.26063668727874756, + "learning_rate": 4.996023552531586e-05, + "loss": 1.8444, + "step": 16788 + }, + { + "epoch": 5.153161448741559, + "grad_norm": 0.2711321711540222, + "learning_rate": 4.9955264967232954e-05, + "loss": 1.7257, + "step": 16789 + }, + { + "epoch": 5.153468385512585, + "grad_norm": 0.30134227871894836, + "learning_rate": 4.995029440959213e-05, + "loss": 1.7599, + "step": 16790 + }, + { + "epoch": 5.153775322283609, + "grad_norm": 0.22983741760253906, + "learning_rate": 4.994532385244255e-05, + "loss": 1.7944, + "step": 16791 + }, + { + "epoch": 5.1540822590546345, + "grad_norm": 0.2992973327636719, + "learning_rate": 4.994035329583329e-05, + "loss": 1.7507, + "step": 16792 + }, + { + "epoch": 5.15438919582566, + "grad_norm": 0.2659669518470764, + "learning_rate": 4.993538273981352e-05, + "loss": 1.7246, + "step": 16793 + }, + { + "epoch": 5.154696132596685, + "grad_norm": 0.24235470592975616, + "learning_rate": 4.9930412184432315e-05, + "loss": 1.8378, + "step": 16794 + }, + { + "epoch": 5.1550030693677105, + "grad_norm": 0.30005061626434326, + "learning_rate": 4.992544162973882e-05, + "loss": 1.7526, + "step": 16795 + }, + { + "epoch": 5.155310006138736, + "grad_norm": 0.2183740884065628, + "learning_rate": 4.992047107578215e-05, + "loss": 1.7197, + "step": 16796 + }, + { + "epoch": 5.15561694290976, + "grad_norm": 0.35874706506729126, + "learning_rate": 4.991550052261145e-05, + "loss": 1.8196, + "step": 16797 + }, + { + "epoch": 
5.155923879680786, + "grad_norm": 0.42146921157836914, + "learning_rate": 4.991052997027583e-05, + "loss": 1.7165, + "step": 16798 + }, + { + "epoch": 5.156230816451811, + "grad_norm": 0.2738321125507355, + "learning_rate": 4.990555941882437e-05, + "loss": 1.7042, + "step": 16799 + }, + { + "epoch": 5.156537753222836, + "grad_norm": 0.26304566860198975, + "learning_rate": 4.990058886830625e-05, + "loss": 1.7551, + "step": 16800 + }, + { + "epoch": 5.156844689993862, + "grad_norm": 0.4301520586013794, + "learning_rate": 4.9895618318770556e-05, + "loss": 1.7219, + "step": 16801 + }, + { + "epoch": 5.157151626764886, + "grad_norm": 0.3316499590873718, + "learning_rate": 4.989064777026644e-05, + "loss": 1.8034, + "step": 16802 + }, + { + "epoch": 5.157458563535911, + "grad_norm": 0.30105581879615784, + "learning_rate": 4.9885677222842984e-05, + "loss": 1.7022, + "step": 16803 + }, + { + "epoch": 5.157765500306937, + "grad_norm": 0.3830905854701996, + "learning_rate": 4.988070667654937e-05, + "loss": 1.7898, + "step": 16804 + }, + { + "epoch": 5.158072437077962, + "grad_norm": 0.2204640656709671, + "learning_rate": 4.9875736131434644e-05, + "loss": 1.7081, + "step": 16805 + }, + { + "epoch": 5.158379373848987, + "grad_norm": 0.3620772063732147, + "learning_rate": 4.9870765587547976e-05, + "loss": 1.7345, + "step": 16806 + }, + { + "epoch": 5.158686310620013, + "grad_norm": 0.3268207907676697, + "learning_rate": 4.986579504493848e-05, + "loss": 1.7364, + "step": 16807 + }, + { + "epoch": 5.158993247391037, + "grad_norm": 0.2499808967113495, + "learning_rate": 4.986082450365529e-05, + "loss": 1.7836, + "step": 16808 + }, + { + "epoch": 5.1593001841620625, + "grad_norm": 0.3696226477622986, + "learning_rate": 4.98558539637475e-05, + "loss": 1.8094, + "step": 16809 + }, + { + "epoch": 5.159607120933088, + "grad_norm": 0.3239068388938904, + "learning_rate": 4.9850883425264256e-05, + "loss": 1.7448, + "step": 16810 + }, + { + "epoch": 5.159914057704113, + "grad_norm": 
0.19875772297382355, + "learning_rate": 4.9845912888254655e-05, + "loss": 1.6945, + "step": 16811 + }, + { + "epoch": 5.1602209944751385, + "grad_norm": 0.3952203691005707, + "learning_rate": 4.984094235276784e-05, + "loss": 1.8457, + "step": 16812 + }, + { + "epoch": 5.160527931246163, + "grad_norm": 0.3052334785461426, + "learning_rate": 4.9835971818852916e-05, + "loss": 1.7371, + "step": 16813 + }, + { + "epoch": 5.160834868017188, + "grad_norm": 0.2874486446380615, + "learning_rate": 4.983100128655904e-05, + "loss": 1.7194, + "step": 16814 + }, + { + "epoch": 5.161141804788214, + "grad_norm": 0.39117491245269775, + "learning_rate": 4.98260307559353e-05, + "loss": 1.7919, + "step": 16815 + }, + { + "epoch": 5.161448741559239, + "grad_norm": 0.2532150149345398, + "learning_rate": 4.982106022703081e-05, + "loss": 1.8103, + "step": 16816 + }, + { + "epoch": 5.161755678330264, + "grad_norm": 0.3545167148113251, + "learning_rate": 4.981608969989473e-05, + "loss": 1.8093, + "step": 16817 + }, + { + "epoch": 5.162062615101289, + "grad_norm": 0.397806316614151, + "learning_rate": 4.981111917457613e-05, + "loss": 1.7885, + "step": 16818 + }, + { + "epoch": 5.162369551872314, + "grad_norm": 0.2523536682128906, + "learning_rate": 4.980614865112419e-05, + "loss": 1.797, + "step": 16819 + }, + { + "epoch": 5.162676488643339, + "grad_norm": 0.3666839301586151, + "learning_rate": 4.980117812958798e-05, + "loss": 1.7859, + "step": 16820 + }, + { + "epoch": 5.162983425414365, + "grad_norm": 0.3392138183116913, + "learning_rate": 4.9796207610016664e-05, + "loss": 1.7717, + "step": 16821 + }, + { + "epoch": 5.16329036218539, + "grad_norm": 0.21040666103363037, + "learning_rate": 4.9791237092459325e-05, + "loss": 1.7447, + "step": 16822 + }, + { + "epoch": 5.163597298956415, + "grad_norm": 0.3140225112438202, + "learning_rate": 4.978626657696512e-05, + "loss": 1.7405, + "step": 16823 + }, + { + "epoch": 5.16390423572744, + "grad_norm": 0.23963581025600433, + "learning_rate": 
4.978129606358313e-05, + "loss": 1.7041, + "step": 16824 + }, + { + "epoch": 5.164211172498465, + "grad_norm": 0.32476937770843506, + "learning_rate": 4.977632555236253e-05, + "loss": 1.736, + "step": 16825 + }, + { + "epoch": 5.1645181092694905, + "grad_norm": 0.4362463653087616, + "learning_rate": 4.977135504335239e-05, + "loss": 1.7657, + "step": 16826 + }, + { + "epoch": 5.164825046040516, + "grad_norm": 0.26118260622024536, + "learning_rate": 4.976638453660188e-05, + "loss": 1.7339, + "step": 16827 + }, + { + "epoch": 5.165131982811541, + "grad_norm": 0.27284330129623413, + "learning_rate": 4.9761414032160065e-05, + "loss": 1.8086, + "step": 16828 + }, + { + "epoch": 5.165438919582566, + "grad_norm": 0.2942579388618469, + "learning_rate": 4.975644353007611e-05, + "loss": 1.7869, + "step": 16829 + }, + { + "epoch": 5.165745856353591, + "grad_norm": 0.23257993161678314, + "learning_rate": 4.975147303039912e-05, + "loss": 1.8048, + "step": 16830 + }, + { + "epoch": 5.166052793124616, + "grad_norm": 0.28638842701911926, + "learning_rate": 4.9746502533178225e-05, + "loss": 1.7744, + "step": 16831 + }, + { + "epoch": 5.166359729895642, + "grad_norm": 0.21571335196495056, + "learning_rate": 4.974153203846255e-05, + "loss": 1.7842, + "step": 16832 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.268883615732193, + "learning_rate": 4.9736561546301185e-05, + "loss": 1.7194, + "step": 16833 + }, + { + "epoch": 5.166973603437691, + "grad_norm": 0.22934168577194214, + "learning_rate": 4.9731591056743285e-05, + "loss": 1.757, + "step": 16834 + }, + { + "epoch": 5.167280540208717, + "grad_norm": 0.26321718096733093, + "learning_rate": 4.9726620569837946e-05, + "loss": 1.7675, + "step": 16835 + }, + { + "epoch": 5.167587476979742, + "grad_norm": 0.2893882393836975, + "learning_rate": 4.9721650085634325e-05, + "loss": 1.7134, + "step": 16836 + }, + { + "epoch": 5.167894413750767, + "grad_norm": 0.24130617082118988, + "learning_rate": 4.97166796041815e-05, + "loss": 
1.7119, + "step": 16837 + }, + { + "epoch": 5.168201350521793, + "grad_norm": 0.23614190518856049, + "learning_rate": 4.9711709125528635e-05, + "loss": 1.7556, + "step": 16838 + }, + { + "epoch": 5.168508287292818, + "grad_norm": 0.2031065821647644, + "learning_rate": 4.97067386497248e-05, + "loss": 1.7678, + "step": 16839 + }, + { + "epoch": 5.1688152240638425, + "grad_norm": 0.30695948004722595, + "learning_rate": 4.970176817681917e-05, + "loss": 1.7907, + "step": 16840 + }, + { + "epoch": 5.169122160834868, + "grad_norm": 0.31256723403930664, + "learning_rate": 4.969679770686082e-05, + "loss": 1.7448, + "step": 16841 + }, + { + "epoch": 5.169429097605893, + "grad_norm": 0.24183644354343414, + "learning_rate": 4.969182723989892e-05, + "loss": 1.7259, + "step": 16842 + }, + { + "epoch": 5.1697360343769185, + "grad_norm": 0.22440548241138458, + "learning_rate": 4.9686856775982536e-05, + "loss": 1.7949, + "step": 16843 + }, + { + "epoch": 5.170042971147944, + "grad_norm": 0.29006195068359375, + "learning_rate": 4.9681886315160846e-05, + "loss": 1.7128, + "step": 16844 + }, + { + "epoch": 5.170349907918968, + "grad_norm": 0.2189658135175705, + "learning_rate": 4.967691585748292e-05, + "loss": 1.7375, + "step": 16845 + }, + { + "epoch": 5.170656844689994, + "grad_norm": 0.289909690618515, + "learning_rate": 4.967194540299791e-05, + "loss": 1.779, + "step": 16846 + }, + { + "epoch": 5.170963781461019, + "grad_norm": 0.28279590606689453, + "learning_rate": 4.966697495175492e-05, + "loss": 1.7368, + "step": 16847 + }, + { + "epoch": 5.171270718232044, + "grad_norm": 0.2056259959936142, + "learning_rate": 4.966200450380309e-05, + "loss": 1.7548, + "step": 16848 + }, + { + "epoch": 5.17157765500307, + "grad_norm": 0.2607482969760895, + "learning_rate": 4.965703405919154e-05, + "loss": 1.7178, + "step": 16849 + }, + { + "epoch": 5.171884591774095, + "grad_norm": 0.26085609197616577, + "learning_rate": 4.965206361796935e-05, + "loss": 1.751, + "step": 16850 + }, + { + 
"epoch": 5.172191528545119, + "grad_norm": 0.17960335314273834, + "learning_rate": 4.964709318018569e-05, + "loss": 1.6932, + "step": 16851 + }, + { + "epoch": 5.172498465316145, + "grad_norm": 0.2617340385913849, + "learning_rate": 4.964212274588965e-05, + "loss": 1.7753, + "step": 16852 + }, + { + "epoch": 5.17280540208717, + "grad_norm": 0.2454555630683899, + "learning_rate": 4.9637152315130383e-05, + "loss": 1.7587, + "step": 16853 + }, + { + "epoch": 5.173112338858195, + "grad_norm": 0.19221605360507965, + "learning_rate": 4.963218188795696e-05, + "loss": 1.7337, + "step": 16854 + }, + { + "epoch": 5.173419275629221, + "grad_norm": 0.24314738810062408, + "learning_rate": 4.9627211464418565e-05, + "loss": 1.725, + "step": 16855 + }, + { + "epoch": 5.173726212400245, + "grad_norm": 0.2533986568450928, + "learning_rate": 4.962224104456426e-05, + "loss": 1.7502, + "step": 16856 + }, + { + "epoch": 5.1740331491712706, + "grad_norm": 0.21800079941749573, + "learning_rate": 4.9617270628443195e-05, + "loss": 1.7622, + "step": 16857 + }, + { + "epoch": 5.174340085942296, + "grad_norm": 0.22742362320423126, + "learning_rate": 4.96123002161045e-05, + "loss": 1.7078, + "step": 16858 + }, + { + "epoch": 5.174647022713321, + "grad_norm": 0.22729982435703278, + "learning_rate": 4.960732980759727e-05, + "loss": 1.8349, + "step": 16859 + }, + { + "epoch": 5.1749539594843466, + "grad_norm": 0.28869518637657166, + "learning_rate": 4.9602359402970625e-05, + "loss": 1.8932, + "step": 16860 + }, + { + "epoch": 5.175260896255371, + "grad_norm": 0.21931354701519012, + "learning_rate": 4.9597389002273725e-05, + "loss": 1.6989, + "step": 16861 + }, + { + "epoch": 5.175567833026396, + "grad_norm": 0.2130192667245865, + "learning_rate": 4.959241860555564e-05, + "loss": 1.752, + "step": 16862 + }, + { + "epoch": 5.175874769797422, + "grad_norm": 0.21272781491279602, + "learning_rate": 4.958744821286553e-05, + "loss": 1.7402, + "step": 16863 + }, + { + "epoch": 5.176181706568447, + 
"grad_norm": 0.20279285311698914, + "learning_rate": 4.958247782425248e-05, + "loss": 1.7103, + "step": 16864 + }, + { + "epoch": 5.176488643339472, + "grad_norm": 0.23561790585517883, + "learning_rate": 4.957750743976564e-05, + "loss": 1.7742, + "step": 16865 + }, + { + "epoch": 5.176795580110497, + "grad_norm": 0.27608510851860046, + "learning_rate": 4.957253705945413e-05, + "loss": 1.7505, + "step": 16866 + }, + { + "epoch": 5.177102516881522, + "grad_norm": 0.20624001324176788, + "learning_rate": 4.956756668336704e-05, + "loss": 1.7032, + "step": 16867 + }, + { + "epoch": 5.1774094536525475, + "grad_norm": 0.23743939399719238, + "learning_rate": 4.956259631155352e-05, + "loss": 1.7469, + "step": 16868 + }, + { + "epoch": 5.177716390423573, + "grad_norm": 0.27421119809150696, + "learning_rate": 4.9557625944062675e-05, + "loss": 1.7028, + "step": 16869 + }, + { + "epoch": 5.178023327194598, + "grad_norm": 0.23788046836853027, + "learning_rate": 4.955265558094363e-05, + "loss": 1.7468, + "step": 16870 + }, + { + "epoch": 5.1783302639656235, + "grad_norm": 0.24712958931922913, + "learning_rate": 4.95476852222455e-05, + "loss": 1.7348, + "step": 16871 + }, + { + "epoch": 5.178637200736648, + "grad_norm": 0.21558570861816406, + "learning_rate": 4.9542714868017424e-05, + "loss": 1.7599, + "step": 16872 + }, + { + "epoch": 5.178944137507673, + "grad_norm": 0.2561664283275604, + "learning_rate": 4.953774451830849e-05, + "loss": 1.7673, + "step": 16873 + }, + { + "epoch": 5.179251074278699, + "grad_norm": 0.19761815667152405, + "learning_rate": 4.953277417316786e-05, + "loss": 1.743, + "step": 16874 + }, + { + "epoch": 5.179558011049724, + "grad_norm": 0.24140769243240356, + "learning_rate": 4.95278038326446e-05, + "loss": 1.8229, + "step": 16875 + }, + { + "epoch": 5.179864947820749, + "grad_norm": 0.21686211228370667, + "learning_rate": 4.9522833496787876e-05, + "loss": 1.7914, + "step": 16876 + }, + { + "epoch": 5.180171884591774, + "grad_norm": 0.2537819743156433, + 
"learning_rate": 4.951786316564678e-05, + "loss": 1.7532, + "step": 16877 + }, + { + "epoch": 5.180478821362799, + "grad_norm": 0.24567632377147675, + "learning_rate": 4.951289283927046e-05, + "loss": 1.7528, + "step": 16878 + }, + { + "epoch": 5.180785758133824, + "grad_norm": 0.1958467960357666, + "learning_rate": 4.9507922517708e-05, + "loss": 1.6922, + "step": 16879 + }, + { + "epoch": 5.18109269490485, + "grad_norm": 0.2012091726064682, + "learning_rate": 4.950295220100857e-05, + "loss": 1.7509, + "step": 16880 + }, + { + "epoch": 5.181399631675875, + "grad_norm": 0.2416311800479889, + "learning_rate": 4.9497981889221226e-05, + "loss": 1.7341, + "step": 16881 + }, + { + "epoch": 5.1817065684469, + "grad_norm": 0.21407842636108398, + "learning_rate": 4.949301158239513e-05, + "loss": 1.7493, + "step": 16882 + }, + { + "epoch": 5.182013505217925, + "grad_norm": 0.2354930192232132, + "learning_rate": 4.94880412805794e-05, + "loss": 1.7726, + "step": 16883 + }, + { + "epoch": 5.18232044198895, + "grad_norm": 0.2168428748846054, + "learning_rate": 4.948307098382313e-05, + "loss": 1.77, + "step": 16884 + }, + { + "epoch": 5.1826273787599755, + "grad_norm": 0.19605880975723267, + "learning_rate": 4.947810069217547e-05, + "loss": 1.7292, + "step": 16885 + }, + { + "epoch": 5.182934315531001, + "grad_norm": 0.23066702485084534, + "learning_rate": 4.947313040568551e-05, + "loss": 1.7265, + "step": 16886 + }, + { + "epoch": 5.183241252302026, + "grad_norm": 0.20139534771442413, + "learning_rate": 4.9468160124402386e-05, + "loss": 1.7443, + "step": 16887 + }, + { + "epoch": 5.183548189073051, + "grad_norm": 0.25097572803497314, + "learning_rate": 4.946318984837521e-05, + "loss": 1.7537, + "step": 16888 + }, + { + "epoch": 5.183855125844076, + "grad_norm": 0.26215067505836487, + "learning_rate": 4.945821957765313e-05, + "loss": 1.8397, + "step": 16889 + }, + { + "epoch": 5.184162062615101, + "grad_norm": 0.22072140872478485, + "learning_rate": 4.9453249312285215e-05, + 
"loss": 1.7052, + "step": 16890 + }, + { + "epoch": 5.184468999386127, + "grad_norm": 0.20372305810451508, + "learning_rate": 4.944827905232064e-05, + "loss": 1.7228, + "step": 16891 + }, + { + "epoch": 5.184775936157152, + "grad_norm": 0.20383495092391968, + "learning_rate": 4.944330879780847e-05, + "loss": 1.7063, + "step": 16892 + }, + { + "epoch": 5.185082872928176, + "grad_norm": 0.1903693675994873, + "learning_rate": 4.943833854879786e-05, + "loss": 1.6435, + "step": 16893 + }, + { + "epoch": 5.185389809699202, + "grad_norm": 0.20357775688171387, + "learning_rate": 4.94333683053379e-05, + "loss": 1.7485, + "step": 16894 + }, + { + "epoch": 5.185696746470227, + "grad_norm": 0.24776104092597961, + "learning_rate": 4.942839806747775e-05, + "loss": 1.718, + "step": 16895 + }, + { + "epoch": 5.186003683241252, + "grad_norm": 0.2455051839351654, + "learning_rate": 4.942342783526649e-05, + "loss": 1.7124, + "step": 16896 + }, + { + "epoch": 5.186310620012278, + "grad_norm": 0.2102014273405075, + "learning_rate": 4.941845760875328e-05, + "loss": 1.7584, + "step": 16897 + }, + { + "epoch": 5.186617556783303, + "grad_norm": 0.2177651822566986, + "learning_rate": 4.941348738798718e-05, + "loss": 1.7019, + "step": 16898 + }, + { + "epoch": 5.1869244935543275, + "grad_norm": 0.21296697854995728, + "learning_rate": 4.9408517173017355e-05, + "loss": 1.7299, + "step": 16899 + }, + { + "epoch": 5.187231430325353, + "grad_norm": 0.23485495150089264, + "learning_rate": 4.940354696389292e-05, + "loss": 1.7271, + "step": 16900 + }, + { + "epoch": 5.187538367096378, + "grad_norm": 0.27287766337394714, + "learning_rate": 4.939857676066297e-05, + "loss": 1.7601, + "step": 16901 + }, + { + "epoch": 5.1878453038674035, + "grad_norm": 0.2060246467590332, + "learning_rate": 4.939360656337665e-05, + "loss": 1.7064, + "step": 16902 + }, + { + "epoch": 5.188152240638429, + "grad_norm": 0.25422418117523193, + "learning_rate": 4.938863637208305e-05, + "loss": 1.7423, + "step": 16903 + }, + { 
+ "epoch": 5.188459177409453, + "grad_norm": 0.2798483669757843, + "learning_rate": 4.9383666186831304e-05, + "loss": 1.7132, + "step": 16904 + }, + { + "epoch": 5.188766114180479, + "grad_norm": 0.23505693674087524, + "learning_rate": 4.9378696007670525e-05, + "loss": 1.7759, + "step": 16905 + }, + { + "epoch": 5.189073050951504, + "grad_norm": 0.23761989176273346, + "learning_rate": 4.937372583464987e-05, + "loss": 1.7076, + "step": 16906 + }, + { + "epoch": 5.189379987722529, + "grad_norm": 0.3005945086479187, + "learning_rate": 4.9368755667818385e-05, + "loss": 1.6957, + "step": 16907 + }, + { + "epoch": 5.189686924493555, + "grad_norm": 0.2502881586551666, + "learning_rate": 4.936378550722525e-05, + "loss": 1.7352, + "step": 16908 + }, + { + "epoch": 5.189993861264579, + "grad_norm": 0.24194179475307465, + "learning_rate": 4.9358815352919544e-05, + "loss": 1.738, + "step": 16909 + }, + { + "epoch": 5.190300798035604, + "grad_norm": 0.27478742599487305, + "learning_rate": 4.935384520495041e-05, + "loss": 1.7118, + "step": 16910 + }, + { + "epoch": 5.19060773480663, + "grad_norm": 0.22327560186386108, + "learning_rate": 4.9348875063366944e-05, + "loss": 1.7697, + "step": 16911 + }, + { + "epoch": 5.190914671577655, + "grad_norm": 0.21844418346881866, + "learning_rate": 4.9343904928218295e-05, + "loss": 1.7733, + "step": 16912 + }, + { + "epoch": 5.19122160834868, + "grad_norm": 0.25267866253852844, + "learning_rate": 4.933893479955354e-05, + "loss": 1.7313, + "step": 16913 + }, + { + "epoch": 5.191528545119706, + "grad_norm": 0.22045068442821503, + "learning_rate": 4.933396467742185e-05, + "loss": 1.7856, + "step": 16914 + }, + { + "epoch": 5.19183548189073, + "grad_norm": 0.22642305493354797, + "learning_rate": 4.932899456187229e-05, + "loss": 1.7326, + "step": 16915 + }, + { + "epoch": 5.1921424186617555, + "grad_norm": 0.20601733028888702, + "learning_rate": 4.9324024452953995e-05, + "loss": 1.7743, + "step": 16916 + }, + { + "epoch": 5.192449355432781, + 
"grad_norm": 0.25580671429634094, + "learning_rate": 4.931905435071611e-05, + "loss": 1.7705, + "step": 16917 + }, + { + "epoch": 5.192756292203806, + "grad_norm": 0.38173142075538635, + "learning_rate": 4.9314084255207706e-05, + "loss": 1.7504, + "step": 16918 + }, + { + "epoch": 5.1930632289748315, + "grad_norm": 0.2254420667886734, + "learning_rate": 4.930911416647794e-05, + "loss": 1.7344, + "step": 16919 + }, + { + "epoch": 5.193370165745856, + "grad_norm": 0.2354312688112259, + "learning_rate": 4.9304144084575896e-05, + "loss": 1.7607, + "step": 16920 + }, + { + "epoch": 5.193677102516881, + "grad_norm": 0.23879510164260864, + "learning_rate": 4.9299174009550716e-05, + "loss": 1.683, + "step": 16921 + }, + { + "epoch": 5.193984039287907, + "grad_norm": 0.228669211268425, + "learning_rate": 4.9294203941451494e-05, + "loss": 1.7776, + "step": 16922 + }, + { + "epoch": 5.194290976058932, + "grad_norm": 0.2266843616962433, + "learning_rate": 4.928923388032739e-05, + "loss": 1.7563, + "step": 16923 + }, + { + "epoch": 5.194597912829957, + "grad_norm": 0.2581404745578766, + "learning_rate": 4.928426382622747e-05, + "loss": 1.8112, + "step": 16924 + }, + { + "epoch": 5.194904849600983, + "grad_norm": 0.25179803371429443, + "learning_rate": 4.92792937792009e-05, + "loss": 1.7661, + "step": 16925 + }, + { + "epoch": 5.195211786372007, + "grad_norm": 0.23408514261245728, + "learning_rate": 4.9274323739296746e-05, + "loss": 1.7618, + "step": 16926 + }, + { + "epoch": 5.195518723143032, + "grad_norm": 0.23110872507095337, + "learning_rate": 4.926935370656416e-05, + "loss": 1.6945, + "step": 16927 + }, + { + "epoch": 5.195825659914058, + "grad_norm": 0.2863025665283203, + "learning_rate": 4.926438368105224e-05, + "loss": 1.8659, + "step": 16928 + }, + { + "epoch": 5.196132596685083, + "grad_norm": 0.2156454175710678, + "learning_rate": 4.925941366281013e-05, + "loss": 1.7281, + "step": 16929 + }, + { + "epoch": 5.196439533456108, + "grad_norm": 0.2338300198316574, + 
"learning_rate": 4.925444365188691e-05, + "loss": 1.7271, + "step": 16930 + }, + { + "epoch": 5.196746470227133, + "grad_norm": 0.21434102952480316, + "learning_rate": 4.924947364833173e-05, + "loss": 1.7342, + "step": 16931 + }, + { + "epoch": 5.197053406998158, + "grad_norm": 0.21619778871536255, + "learning_rate": 4.924450365219369e-05, + "loss": 1.7493, + "step": 16932 + }, + { + "epoch": 5.1973603437691835, + "grad_norm": 0.24532032012939453, + "learning_rate": 4.9239533663521896e-05, + "loss": 1.7707, + "step": 16933 + }, + { + "epoch": 5.197667280540209, + "grad_norm": 0.21795547008514404, + "learning_rate": 4.923456368236549e-05, + "loss": 1.7642, + "step": 16934 + }, + { + "epoch": 5.197974217311234, + "grad_norm": 0.2070101797580719, + "learning_rate": 4.922959370877356e-05, + "loss": 1.7377, + "step": 16935 + }, + { + "epoch": 5.198281154082259, + "grad_norm": 0.22546489536762238, + "learning_rate": 4.9224623742795256e-05, + "loss": 1.7766, + "step": 16936 + }, + { + "epoch": 5.198588090853284, + "grad_norm": 0.20723624527454376, + "learning_rate": 4.921965378447965e-05, + "loss": 1.7316, + "step": 16937 + }, + { + "epoch": 5.198895027624309, + "grad_norm": 0.21870547533035278, + "learning_rate": 4.9214683833875905e-05, + "loss": 1.7653, + "step": 16938 + }, + { + "epoch": 5.199201964395335, + "grad_norm": 0.19606490433216095, + "learning_rate": 4.920971389103309e-05, + "loss": 1.7181, + "step": 16939 + }, + { + "epoch": 5.19950890116636, + "grad_norm": 0.18372730910778046, + "learning_rate": 4.920474395600037e-05, + "loss": 1.7041, + "step": 16940 + }, + { + "epoch": 5.199815837937384, + "grad_norm": 0.22051765024662018, + "learning_rate": 4.919977402882682e-05, + "loss": 1.7172, + "step": 16941 + }, + { + "epoch": 5.20012277470841, + "grad_norm": 0.2135835587978363, + "learning_rate": 4.919480410956159e-05, + "loss": 1.6918, + "step": 16942 + }, + { + "epoch": 5.200429711479435, + "grad_norm": 0.19619768857955933, + "learning_rate": 
4.918983419825376e-05, + "loss": 1.7005, + "step": 16943 + }, + { + "epoch": 5.2007366482504604, + "grad_norm": 0.22726574540138245, + "learning_rate": 4.918486429495246e-05, + "loss": 1.6775, + "step": 16944 + }, + { + "epoch": 5.201043585021486, + "grad_norm": 0.21471361815929413, + "learning_rate": 4.9179894399706815e-05, + "loss": 1.7102, + "step": 16945 + }, + { + "epoch": 5.201350521792511, + "grad_norm": 0.20113740861415863, + "learning_rate": 4.917492451256595e-05, + "loss": 1.7548, + "step": 16946 + }, + { + "epoch": 5.201657458563536, + "grad_norm": 0.2337827831506729, + "learning_rate": 4.916995463357894e-05, + "loss": 1.818, + "step": 16947 + }, + { + "epoch": 5.201964395334561, + "grad_norm": 0.2649554908275604, + "learning_rate": 4.9164984762794955e-05, + "loss": 1.7784, + "step": 16948 + }, + { + "epoch": 5.202271332105586, + "grad_norm": 0.2297617793083191, + "learning_rate": 4.916001490026306e-05, + "loss": 1.7484, + "step": 16949 + }, + { + "epoch": 5.202578268876612, + "grad_norm": 0.20791979134082794, + "learning_rate": 4.915504504603238e-05, + "loss": 1.7164, + "step": 16950 + }, + { + "epoch": 5.202885205647637, + "grad_norm": 0.21769596636295319, + "learning_rate": 4.915007520015207e-05, + "loss": 1.7783, + "step": 16951 + }, + { + "epoch": 5.203192142418661, + "grad_norm": 0.21038469672203064, + "learning_rate": 4.914510536267118e-05, + "loss": 1.6863, + "step": 16952 + }, + { + "epoch": 5.203499079189687, + "grad_norm": 0.20725449919700623, + "learning_rate": 4.914013553363889e-05, + "loss": 1.6855, + "step": 16953 + }, + { + "epoch": 5.203806015960712, + "grad_norm": 0.23879854381084442, + "learning_rate": 4.9135165713104266e-05, + "loss": 1.6986, + "step": 16954 + }, + { + "epoch": 5.204112952731737, + "grad_norm": 0.20515915751457214, + "learning_rate": 4.913019590111645e-05, + "loss": 1.6912, + "step": 16955 + }, + { + "epoch": 5.204419889502763, + "grad_norm": 0.2252528965473175, + "learning_rate": 4.912522609772453e-05, + "loss": 
1.6974, + "step": 16956 + }, + { + "epoch": 5.204726826273788, + "grad_norm": 0.1946130096912384, + "learning_rate": 4.9120256302977665e-05, + "loss": 1.7009, + "step": 16957 + }, + { + "epoch": 5.2050337630448125, + "grad_norm": 0.21323645114898682, + "learning_rate": 4.9115286516924925e-05, + "loss": 1.7746, + "step": 16958 + }, + { + "epoch": 5.205340699815838, + "grad_norm": 0.20721712708473206, + "learning_rate": 4.911031673961546e-05, + "loss": 1.7103, + "step": 16959 + }, + { + "epoch": 5.205647636586863, + "grad_norm": 0.19630689918994904, + "learning_rate": 4.910534697109834e-05, + "loss": 1.7042, + "step": 16960 + }, + { + "epoch": 5.2059545733578885, + "grad_norm": 0.2036786526441574, + "learning_rate": 4.910037721142273e-05, + "loss": 1.7713, + "step": 16961 + }, + { + "epoch": 5.206261510128914, + "grad_norm": 0.20518352091312408, + "learning_rate": 4.9095407460637696e-05, + "loss": 1.7456, + "step": 16962 + }, + { + "epoch": 5.206568446899938, + "grad_norm": 0.199858620762825, + "learning_rate": 4.9090437718792404e-05, + "loss": 1.7598, + "step": 16963 + }, + { + "epoch": 5.206875383670964, + "grad_norm": 0.22860252857208252, + "learning_rate": 4.9085467985935914e-05, + "loss": 1.7947, + "step": 16964 + }, + { + "epoch": 5.207182320441989, + "grad_norm": 0.22179929912090302, + "learning_rate": 4.9080498262117395e-05, + "loss": 1.7537, + "step": 16965 + }, + { + "epoch": 5.207489257213014, + "grad_norm": 0.24737581610679626, + "learning_rate": 4.9075528547385906e-05, + "loss": 1.7932, + "step": 16966 + }, + { + "epoch": 5.20779619398404, + "grad_norm": 0.2653762400150299, + "learning_rate": 4.907055884179059e-05, + "loss": 1.7683, + "step": 16967 + }, + { + "epoch": 5.208103130755064, + "grad_norm": 0.2891876697540283, + "learning_rate": 4.9065589145380564e-05, + "loss": 1.7867, + "step": 16968 + }, + { + "epoch": 5.208410067526089, + "grad_norm": 0.23162086308002472, + "learning_rate": 4.906061945820492e-05, + "loss": 1.7981, + "step": 16969 + }, + { 
+ "epoch": 5.208717004297115, + "grad_norm": 0.2746187150478363, + "learning_rate": 4.9055649780312805e-05, + "loss": 1.7215, + "step": 16970 + }, + { + "epoch": 5.20902394106814, + "grad_norm": 0.3217853605747223, + "learning_rate": 4.905068011175329e-05, + "loss": 1.8027, + "step": 16971 + }, + { + "epoch": 5.209330877839165, + "grad_norm": 0.21517686545848846, + "learning_rate": 4.904571045257553e-05, + "loss": 1.7055, + "step": 16972 + }, + { + "epoch": 5.209637814610191, + "grad_norm": 0.23613709211349487, + "learning_rate": 4.90407408028286e-05, + "loss": 1.751, + "step": 16973 + }, + { + "epoch": 5.209944751381215, + "grad_norm": 0.35093945264816284, + "learning_rate": 4.903577116256165e-05, + "loss": 1.7749, + "step": 16974 + }, + { + "epoch": 5.2102516881522405, + "grad_norm": 0.3289217948913574, + "learning_rate": 4.903080153182376e-05, + "loss": 1.7722, + "step": 16975 + }, + { + "epoch": 5.210558624923266, + "grad_norm": 0.29387256503105164, + "learning_rate": 4.9025831910664074e-05, + "loss": 1.8121, + "step": 16976 + }, + { + "epoch": 5.210865561694291, + "grad_norm": 0.44418805837631226, + "learning_rate": 4.9020862299131664e-05, + "loss": 1.7744, + "step": 16977 + }, + { + "epoch": 5.2111724984653165, + "grad_norm": 0.39242252707481384, + "learning_rate": 4.901589269727568e-05, + "loss": 1.7183, + "step": 16978 + }, + { + "epoch": 5.211479435236341, + "grad_norm": 0.2028690129518509, + "learning_rate": 4.901092310514522e-05, + "loss": 1.7101, + "step": 16979 + }, + { + "epoch": 5.211786372007366, + "grad_norm": 0.4025843143463135, + "learning_rate": 4.900595352278941e-05, + "loss": 1.7545, + "step": 16980 + }, + { + "epoch": 5.212093308778392, + "grad_norm": 0.284568727016449, + "learning_rate": 4.900098395025733e-05, + "loss": 1.7758, + "step": 16981 + }, + { + "epoch": 5.212400245549417, + "grad_norm": 0.2527516484260559, + "learning_rate": 4.899601438759813e-05, + "loss": 1.695, + "step": 16982 + }, + { + "epoch": 5.212707182320442, + 
"grad_norm": 0.3063630759716034, + "learning_rate": 4.89910448348609e-05, + "loss": 1.714, + "step": 16983 + }, + { + "epoch": 5.213014119091467, + "grad_norm": 0.22754468023777008, + "learning_rate": 4.898607529209474e-05, + "loss": 1.8315, + "step": 16984 + }, + { + "epoch": 5.213321055862492, + "grad_norm": 0.29594969749450684, + "learning_rate": 4.89811057593488e-05, + "loss": 1.6669, + "step": 16985 + }, + { + "epoch": 5.213627992633517, + "grad_norm": 0.21486569941043854, + "learning_rate": 4.897613623667215e-05, + "loss": 1.7425, + "step": 16986 + }, + { + "epoch": 5.213934929404543, + "grad_norm": 0.30908775329589844, + "learning_rate": 4.897116672411395e-05, + "loss": 1.7915, + "step": 16987 + }, + { + "epoch": 5.214241866175568, + "grad_norm": 0.23515601456165314, + "learning_rate": 4.896619722172325e-05, + "loss": 1.7226, + "step": 16988 + }, + { + "epoch": 5.214548802946593, + "grad_norm": 0.2847287952899933, + "learning_rate": 4.8961227729549215e-05, + "loss": 1.7641, + "step": 16989 + }, + { + "epoch": 5.214855739717618, + "grad_norm": 0.2986287772655487, + "learning_rate": 4.895625824764092e-05, + "loss": 1.8025, + "step": 16990 + }, + { + "epoch": 5.215162676488643, + "grad_norm": 0.23454971611499786, + "learning_rate": 4.8951288776047514e-05, + "loss": 1.7057, + "step": 16991 + }, + { + "epoch": 5.2154696132596685, + "grad_norm": 0.2578633725643158, + "learning_rate": 4.894631931481807e-05, + "loss": 1.7267, + "step": 16992 + }, + { + "epoch": 5.215776550030694, + "grad_norm": 0.29975566267967224, + "learning_rate": 4.894134986400174e-05, + "loss": 1.7452, + "step": 16993 + }, + { + "epoch": 5.216083486801719, + "grad_norm": 0.22313638031482697, + "learning_rate": 4.893638042364758e-05, + "loss": 1.6917, + "step": 16994 + }, + { + "epoch": 5.216390423572744, + "grad_norm": 0.258297860622406, + "learning_rate": 4.893141099380475e-05, + "loss": 1.7816, + "step": 16995 + }, + { + "epoch": 5.216697360343769, + "grad_norm": 0.2656872272491455, + 
"learning_rate": 4.892644157452233e-05, + "loss": 1.7248, + "step": 16996 + }, + { + "epoch": 5.217004297114794, + "grad_norm": 0.20239698886871338, + "learning_rate": 4.8921472165849464e-05, + "loss": 1.7629, + "step": 16997 + }, + { + "epoch": 5.21731123388582, + "grad_norm": 0.2575492262840271, + "learning_rate": 4.891650276783523e-05, + "loss": 1.719, + "step": 16998 + }, + { + "epoch": 5.217618170656845, + "grad_norm": 0.27563637495040894, + "learning_rate": 4.8911533380528756e-05, + "loss": 1.718, + "step": 16999 + }, + { + "epoch": 5.21792510742787, + "grad_norm": 0.1969723105430603, + "learning_rate": 4.890656400397915e-05, + "loss": 1.7557, + "step": 17000 + }, + { + "epoch": 5.218232044198895, + "grad_norm": 0.24336831271648407, + "learning_rate": 4.89015946382355e-05, + "loss": 1.6861, + "step": 17001 + }, + { + "epoch": 5.21853898096992, + "grad_norm": 0.2804388403892517, + "learning_rate": 4.889662528334696e-05, + "loss": 1.7411, + "step": 17002 + }, + { + "epoch": 5.218845917740945, + "grad_norm": 0.21116352081298828, + "learning_rate": 4.8891655939362596e-05, + "loss": 1.7135, + "step": 17003 + }, + { + "epoch": 5.219152854511971, + "grad_norm": 0.21042904257774353, + "learning_rate": 4.8886686606331556e-05, + "loss": 1.7224, + "step": 17004 + }, + { + "epoch": 5.219459791282996, + "grad_norm": 0.22463755309581757, + "learning_rate": 4.888171728430291e-05, + "loss": 1.8272, + "step": 17005 + }, + { + "epoch": 5.2197667280540205, + "grad_norm": 0.25604158639907837, + "learning_rate": 4.8876747973325805e-05, + "loss": 1.674, + "step": 17006 + }, + { + "epoch": 5.220073664825046, + "grad_norm": 0.3108421564102173, + "learning_rate": 4.887177867344932e-05, + "loss": 1.761, + "step": 17007 + }, + { + "epoch": 5.220380601596071, + "grad_norm": 0.25135359168052673, + "learning_rate": 4.88668093847226e-05, + "loss": 1.7455, + "step": 17008 + }, + { + "epoch": 5.2206875383670965, + "grad_norm": 0.24508307874202728, + "learning_rate": 4.886184010719471e-05, + 
"loss": 1.7632, + "step": 17009 + }, + { + "epoch": 5.220994475138122, + "grad_norm": 0.26777148246765137, + "learning_rate": 4.8856870840914816e-05, + "loss": 1.7814, + "step": 17010 + }, + { + "epoch": 5.221301411909146, + "grad_norm": 0.22404739260673523, + "learning_rate": 4.8851901585931967e-05, + "loss": 1.7441, + "step": 17011 + }, + { + "epoch": 5.221608348680172, + "grad_norm": 0.2406606674194336, + "learning_rate": 4.884693234229531e-05, + "loss": 1.7789, + "step": 17012 + }, + { + "epoch": 5.221915285451197, + "grad_norm": 0.27320384979248047, + "learning_rate": 4.884196311005394e-05, + "loss": 1.8046, + "step": 17013 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.3393586277961731, + "learning_rate": 4.8836993889256965e-05, + "loss": 1.7155, + "step": 17014 + }, + { + "epoch": 5.222529158993248, + "grad_norm": 0.3069504499435425, + "learning_rate": 4.88320246799535e-05, + "loss": 1.6985, + "step": 17015 + }, + { + "epoch": 5.222836095764273, + "grad_norm": 0.22184616327285767, + "learning_rate": 4.8827055482192664e-05, + "loss": 1.7996, + "step": 17016 + }, + { + "epoch": 5.223143032535297, + "grad_norm": 0.2791864573955536, + "learning_rate": 4.8822086296023544e-05, + "loss": 1.7223, + "step": 17017 + }, + { + "epoch": 5.223449969306323, + "grad_norm": 0.259726345539093, + "learning_rate": 4.8817117121495245e-05, + "loss": 1.7481, + "step": 17018 + }, + { + "epoch": 5.223756906077348, + "grad_norm": 0.19968681037425995, + "learning_rate": 4.8812147958656916e-05, + "loss": 1.702, + "step": 17019 + }, + { + "epoch": 5.224063842848373, + "grad_norm": 0.20161856710910797, + "learning_rate": 4.8807178807557616e-05, + "loss": 1.6689, + "step": 17020 + }, + { + "epoch": 5.224370779619399, + "grad_norm": 0.2365240454673767, + "learning_rate": 4.880220966824649e-05, + "loss": 1.7742, + "step": 17021 + }, + { + "epoch": 5.224677716390423, + "grad_norm": 0.20116381347179413, + "learning_rate": 4.879724054077261e-05, + "loss": 1.7584, + "step": 17022 + }, 
+ { + "epoch": 5.2249846531614486, + "grad_norm": 0.22845037281513214, + "learning_rate": 4.879227142518511e-05, + "loss": 1.7794, + "step": 17023 + }, + { + "epoch": 5.225291589932474, + "grad_norm": 0.251724511384964, + "learning_rate": 4.87873023215331e-05, + "loss": 1.7722, + "step": 17024 + }, + { + "epoch": 5.225598526703499, + "grad_norm": 0.206145241856575, + "learning_rate": 4.878233322986568e-05, + "loss": 1.7452, + "step": 17025 + }, + { + "epoch": 5.225905463474525, + "grad_norm": 0.24065247178077698, + "learning_rate": 4.877736415023194e-05, + "loss": 1.8144, + "step": 17026 + }, + { + "epoch": 5.226212400245549, + "grad_norm": 0.2255484163761139, + "learning_rate": 4.877239508268103e-05, + "loss": 1.706, + "step": 17027 + }, + { + "epoch": 5.226519337016574, + "grad_norm": 0.21035850048065186, + "learning_rate": 4.8767426027262e-05, + "loss": 1.7167, + "step": 17028 + }, + { + "epoch": 5.2268262737876, + "grad_norm": 0.19618964195251465, + "learning_rate": 4.8762456984024025e-05, + "loss": 1.7063, + "step": 17029 + }, + { + "epoch": 5.227133210558625, + "grad_norm": 0.19595398008823395, + "learning_rate": 4.875748795301614e-05, + "loss": 1.7452, + "step": 17030 + }, + { + "epoch": 5.22744014732965, + "grad_norm": 0.22870996594429016, + "learning_rate": 4.8752518934287506e-05, + "loss": 1.8169, + "step": 17031 + }, + { + "epoch": 5.227747084100676, + "grad_norm": 0.24048443138599396, + "learning_rate": 4.87475499278872e-05, + "loss": 1.6988, + "step": 17032 + }, + { + "epoch": 5.2280540208717, + "grad_norm": 0.24177183210849762, + "learning_rate": 4.8742580933864356e-05, + "loss": 1.77, + "step": 17033 + }, + { + "epoch": 5.2283609576427255, + "grad_norm": 0.2023085057735443, + "learning_rate": 4.873761195226806e-05, + "loss": 1.7, + "step": 17034 + }, + { + "epoch": 5.228667894413751, + "grad_norm": 0.2614101767539978, + "learning_rate": 4.873264298314742e-05, + "loss": 1.767, + "step": 17035 + }, + { + "epoch": 5.228974831184776, + "grad_norm": 
0.19607602059841156, + "learning_rate": 4.872767402655154e-05, + "loss": 1.7391, + "step": 17036 + }, + { + "epoch": 5.2292817679558015, + "grad_norm": 0.2053994983434677, + "learning_rate": 4.872270508252953e-05, + "loss": 1.7155, + "step": 17037 + }, + { + "epoch": 5.229588704726826, + "grad_norm": 0.18256273865699768, + "learning_rate": 4.871773615113051e-05, + "loss": 1.6999, + "step": 17038 + }, + { + "epoch": 5.229895641497851, + "grad_norm": 0.21956393122673035, + "learning_rate": 4.871276723240356e-05, + "loss": 1.7946, + "step": 17039 + }, + { + "epoch": 5.230202578268877, + "grad_norm": 0.23779109120368958, + "learning_rate": 4.870779832639781e-05, + "loss": 1.8063, + "step": 17040 + }, + { + "epoch": 5.230509515039902, + "grad_norm": 0.21662941575050354, + "learning_rate": 4.8702829433162346e-05, + "loss": 1.7276, + "step": 17041 + }, + { + "epoch": 5.230816451810927, + "grad_norm": 0.21578755974769592, + "learning_rate": 4.869786055274628e-05, + "loss": 1.7577, + "step": 17042 + }, + { + "epoch": 5.231123388581952, + "grad_norm": 0.23229347169399261, + "learning_rate": 4.8692891685198715e-05, + "loss": 1.7884, + "step": 17043 + }, + { + "epoch": 5.231430325352977, + "grad_norm": 0.2302366942167282, + "learning_rate": 4.868792283056878e-05, + "loss": 1.7823, + "step": 17044 + }, + { + "epoch": 5.231737262124002, + "grad_norm": 0.2181033343076706, + "learning_rate": 4.868295398890554e-05, + "loss": 1.7027, + "step": 17045 + }, + { + "epoch": 5.232044198895028, + "grad_norm": 0.20863409340381622, + "learning_rate": 4.8677985160258135e-05, + "loss": 1.7247, + "step": 17046 + }, + { + "epoch": 5.232351135666053, + "grad_norm": 0.2242976278066635, + "learning_rate": 4.867301634467564e-05, + "loss": 1.7799, + "step": 17047 + }, + { + "epoch": 5.232658072437078, + "grad_norm": 0.19934964179992676, + "learning_rate": 4.866804754220719e-05, + "loss": 1.6973, + "step": 17048 + }, + { + "epoch": 5.232965009208103, + "grad_norm": 0.22056198120117188, + 
"learning_rate": 4.8663078752901855e-05, + "loss": 1.7677, + "step": 17049 + }, + { + "epoch": 5.233271945979128, + "grad_norm": 0.2303200513124466, + "learning_rate": 4.865810997680879e-05, + "loss": 1.7517, + "step": 17050 + }, + { + "epoch": 5.2335788827501535, + "grad_norm": 0.21193410456180573, + "learning_rate": 4.8653141213977066e-05, + "loss": 1.7478, + "step": 17051 + }, + { + "epoch": 5.233885819521179, + "grad_norm": 0.18498395383358002, + "learning_rate": 4.864817246445577e-05, + "loss": 1.6891, + "step": 17052 + }, + { + "epoch": 5.234192756292204, + "grad_norm": 0.22879233956336975, + "learning_rate": 4.8643203728294036e-05, + "loss": 1.7166, + "step": 17053 + }, + { + "epoch": 5.234499693063229, + "grad_norm": 0.2128525823354721, + "learning_rate": 4.8638235005540944e-05, + "loss": 1.7993, + "step": 17054 + }, + { + "epoch": 5.234806629834254, + "grad_norm": 0.21245025098323822, + "learning_rate": 4.8633266296245634e-05, + "loss": 1.7436, + "step": 17055 + }, + { + "epoch": 5.235113566605279, + "grad_norm": 0.20301629602909088, + "learning_rate": 4.8628297600457165e-05, + "loss": 1.7774, + "step": 17056 + }, + { + "epoch": 5.235420503376305, + "grad_norm": 0.23251961171627045, + "learning_rate": 4.8623328918224687e-05, + "loss": 1.7897, + "step": 17057 + }, + { + "epoch": 5.23572744014733, + "grad_norm": 0.2272956669330597, + "learning_rate": 4.861836024959726e-05, + "loss": 1.7668, + "step": 17058 + }, + { + "epoch": 5.236034376918354, + "grad_norm": 0.20540569722652435, + "learning_rate": 4.8613391594624013e-05, + "loss": 1.7549, + "step": 17059 + }, + { + "epoch": 5.23634131368938, + "grad_norm": 0.20306967198848724, + "learning_rate": 4.8608422953354034e-05, + "loss": 1.6993, + "step": 17060 + }, + { + "epoch": 5.236648250460405, + "grad_norm": 0.19415293633937836, + "learning_rate": 4.8603454325836455e-05, + "loss": 1.7313, + "step": 17061 + }, + { + "epoch": 5.23695518723143, + "grad_norm": 0.2058337777853012, + "learning_rate": 
4.859848571212034e-05, + "loss": 1.7994, + "step": 17062 + }, + { + "epoch": 5.237262124002456, + "grad_norm": 0.24489709734916687, + "learning_rate": 4.859351711225483e-05, + "loss": 1.7555, + "step": 17063 + }, + { + "epoch": 5.237569060773481, + "grad_norm": 0.22589795291423798, + "learning_rate": 4.858854852628899e-05, + "loss": 1.7136, + "step": 17064 + }, + { + "epoch": 5.2378759975445055, + "grad_norm": 0.21404492855072021, + "learning_rate": 4.858357995427195e-05, + "loss": 1.7598, + "step": 17065 + }, + { + "epoch": 5.238182934315531, + "grad_norm": 0.24936965107917786, + "learning_rate": 4.8578611396252786e-05, + "loss": 1.8027, + "step": 17066 + }, + { + "epoch": 5.238489871086556, + "grad_norm": 0.23391515016555786, + "learning_rate": 4.857364285228065e-05, + "loss": 1.7704, + "step": 17067 + }, + { + "epoch": 5.2387968078575815, + "grad_norm": 0.22633357346057892, + "learning_rate": 4.85686743224046e-05, + "loss": 1.7075, + "step": 17068 + }, + { + "epoch": 5.239103744628607, + "grad_norm": 0.221492201089859, + "learning_rate": 4.8563705806673736e-05, + "loss": 1.7755, + "step": 17069 + }, + { + "epoch": 5.239410681399631, + "grad_norm": 0.2381046712398529, + "learning_rate": 4.855873730513719e-05, + "loss": 1.7971, + "step": 17070 + }, + { + "epoch": 5.239717618170657, + "grad_norm": 0.21930988132953644, + "learning_rate": 4.855376881784402e-05, + "loss": 1.7295, + "step": 17071 + }, + { + "epoch": 5.240024554941682, + "grad_norm": 0.20897921919822693, + "learning_rate": 4.854880034484339e-05, + "loss": 1.7796, + "step": 17072 + }, + { + "epoch": 5.240331491712707, + "grad_norm": 0.26616254448890686, + "learning_rate": 4.8543831886184334e-05, + "loss": 1.7095, + "step": 17073 + }, + { + "epoch": 5.240638428483733, + "grad_norm": 0.19513870775699615, + "learning_rate": 4.853886344191601e-05, + "loss": 1.7181, + "step": 17074 + }, + { + "epoch": 5.240945365254758, + "grad_norm": 0.23476530611515045, + "learning_rate": 4.853389501208747e-05, + "loss": 
1.7928, + "step": 17075 + }, + { + "epoch": 5.241252302025782, + "grad_norm": 0.18197014927864075, + "learning_rate": 4.852892659674785e-05, + "loss": 1.6888, + "step": 17076 + }, + { + "epoch": 5.241559238796808, + "grad_norm": 0.20317208766937256, + "learning_rate": 4.852395819594623e-05, + "loss": 1.7828, + "step": 17077 + }, + { + "epoch": 5.241866175567833, + "grad_norm": 0.1953772008419037, + "learning_rate": 4.851898980973175e-05, + "loss": 1.7394, + "step": 17078 + }, + { + "epoch": 5.242173112338858, + "grad_norm": 0.19714407622814178, + "learning_rate": 4.851402143815345e-05, + "loss": 1.7261, + "step": 17079 + }, + { + "epoch": 5.242480049109884, + "grad_norm": 0.2196008861064911, + "learning_rate": 4.850905308126048e-05, + "loss": 1.7387, + "step": 17080 + }, + { + "epoch": 5.242786985880908, + "grad_norm": 0.2337818443775177, + "learning_rate": 4.85040847391019e-05, + "loss": 1.7448, + "step": 17081 + }, + { + "epoch": 5.2430939226519335, + "grad_norm": 0.20940040051937103, + "learning_rate": 4.849911641172685e-05, + "loss": 1.7354, + "step": 17082 + }, + { + "epoch": 5.243400859422959, + "grad_norm": 0.2242170125246048, + "learning_rate": 4.849414809918439e-05, + "loss": 1.7325, + "step": 17083 + }, + { + "epoch": 5.243707796193984, + "grad_norm": 0.2322687953710556, + "learning_rate": 4.8489179801523675e-05, + "loss": 1.7557, + "step": 17084 + }, + { + "epoch": 5.2440147329650095, + "grad_norm": 0.20303767919540405, + "learning_rate": 4.8484211518793764e-05, + "loss": 1.7063, + "step": 17085 + }, + { + "epoch": 5.244321669736034, + "grad_norm": 0.2446853369474411, + "learning_rate": 4.8479243251043746e-05, + "loss": 1.7587, + "step": 17086 + }, + { + "epoch": 5.244628606507059, + "grad_norm": 0.22901636362075806, + "learning_rate": 4.8474274998322735e-05, + "loss": 1.7992, + "step": 17087 + }, + { + "epoch": 5.244935543278085, + "grad_norm": 0.29676303267478943, + "learning_rate": 4.846930676067984e-05, + "loss": 1.7688, + "step": 17088 + }, + { + 
"epoch": 5.24524248004911, + "grad_norm": 0.24160240590572357, + "learning_rate": 4.846433853816416e-05, + "loss": 1.7367, + "step": 17089 + }, + { + "epoch": 5.245549416820135, + "grad_norm": 0.2097402662038803, + "learning_rate": 4.8459370330824774e-05, + "loss": 1.721, + "step": 17090 + }, + { + "epoch": 5.245856353591161, + "grad_norm": 0.26451143622398376, + "learning_rate": 4.8454402138710814e-05, + "loss": 1.7707, + "step": 17091 + }, + { + "epoch": 5.246163290362185, + "grad_norm": 0.30428358912467957, + "learning_rate": 4.844943396187133e-05, + "loss": 1.7232, + "step": 17092 + }, + { + "epoch": 5.24647022713321, + "grad_norm": 0.24332918226718903, + "learning_rate": 4.8444465800355466e-05, + "loss": 1.8215, + "step": 17093 + }, + { + "epoch": 5.246777163904236, + "grad_norm": 0.292703777551651, + "learning_rate": 4.843949765421229e-05, + "loss": 1.7199, + "step": 17094 + }, + { + "epoch": 5.247084100675261, + "grad_norm": 0.2458789199590683, + "learning_rate": 4.843452952349094e-05, + "loss": 1.7615, + "step": 17095 + }, + { + "epoch": 5.247391037446286, + "grad_norm": 0.22538037598133087, + "learning_rate": 4.842956140824045e-05, + "loss": 1.7279, + "step": 17096 + }, + { + "epoch": 5.247697974217311, + "grad_norm": 0.2959176003932953, + "learning_rate": 4.842459330850999e-05, + "loss": 1.767, + "step": 17097 + }, + { + "epoch": 5.248004910988336, + "grad_norm": 0.26158571243286133, + "learning_rate": 4.84196252243486e-05, + "loss": 1.7387, + "step": 17098 + }, + { + "epoch": 5.2483118477593615, + "grad_norm": 0.22855687141418457, + "learning_rate": 4.84146571558054e-05, + "loss": 1.7497, + "step": 17099 + }, + { + "epoch": 5.248618784530387, + "grad_norm": 0.22470593452453613, + "learning_rate": 4.840968910292949e-05, + "loss": 1.7705, + "step": 17100 + }, + { + "epoch": 5.248925721301412, + "grad_norm": 0.24680538475513458, + "learning_rate": 4.840472106576998e-05, + "loss": 1.7426, + "step": 17101 + }, + { + "epoch": 5.249232658072437, + "grad_norm": 
0.23919185996055603, + "learning_rate": 4.839975304437594e-05, + "loss": 1.78, + "step": 17102 + }, + { + "epoch": 5.249539594843462, + "grad_norm": 0.24717366695404053, + "learning_rate": 4.839478503879647e-05, + "loss": 1.7373, + "step": 17103 + }, + { + "epoch": 5.249846531614487, + "grad_norm": 0.20463785529136658, + "learning_rate": 4.838981704908068e-05, + "loss": 1.702, + "step": 17104 + }, + { + "epoch": 5.250153468385513, + "grad_norm": 0.19791419804096222, + "learning_rate": 4.838484907527766e-05, + "loss": 1.746, + "step": 17105 + }, + { + "epoch": 5.250460405156538, + "grad_norm": 0.26169353723526, + "learning_rate": 4.837988111743652e-05, + "loss": 1.7227, + "step": 17106 + }, + { + "epoch": 5.250767341927563, + "grad_norm": 0.23545648157596588, + "learning_rate": 4.837491317560633e-05, + "loss": 1.7104, + "step": 17107 + }, + { + "epoch": 5.251074278698588, + "grad_norm": 0.21569804847240448, + "learning_rate": 4.836994524983622e-05, + "loss": 1.7883, + "step": 17108 + }, + { + "epoch": 5.251381215469613, + "grad_norm": 0.2730300724506378, + "learning_rate": 4.836497734017524e-05, + "loss": 1.7105, + "step": 17109 + }, + { + "epoch": 5.2516881522406385, + "grad_norm": 0.2834697663784027, + "learning_rate": 4.836000944667253e-05, + "loss": 1.8041, + "step": 17110 + }, + { + "epoch": 5.251995089011664, + "grad_norm": 0.31536951661109924, + "learning_rate": 4.835504156937715e-05, + "loss": 1.7708, + "step": 17111 + }, + { + "epoch": 5.252302025782689, + "grad_norm": 0.3830285668373108, + "learning_rate": 4.835007370833824e-05, + "loss": 1.7464, + "step": 17112 + }, + { + "epoch": 5.252608962553714, + "grad_norm": 0.23248349130153656, + "learning_rate": 4.834510586360485e-05, + "loss": 1.7274, + "step": 17113 + }, + { + "epoch": 5.252915899324739, + "grad_norm": 0.4755091071128845, + "learning_rate": 4.834013803522611e-05, + "loss": 1.7853, + "step": 17114 + }, + { + "epoch": 5.253222836095764, + "grad_norm": 0.4267823398113251, + "learning_rate": 
4.8335170223251073e-05, + "loss": 1.7424, + "step": 17115 + }, + { + "epoch": 5.25352977286679, + "grad_norm": 0.17621731758117676, + "learning_rate": 4.8330202427728876e-05, + "loss": 1.7415, + "step": 17116 + }, + { + "epoch": 5.253836709637815, + "grad_norm": 0.37484630942344666, + "learning_rate": 4.832523464870859e-05, + "loss": 1.7357, + "step": 17117 + }, + { + "epoch": 5.25414364640884, + "grad_norm": 0.27773791551589966, + "learning_rate": 4.832026688623933e-05, + "loss": 1.717, + "step": 17118 + }, + { + "epoch": 5.254450583179865, + "grad_norm": 0.31190845370292664, + "learning_rate": 4.8315299140370183e-05, + "loss": 1.7226, + "step": 17119 + }, + { + "epoch": 5.25475751995089, + "grad_norm": 0.4321303367614746, + "learning_rate": 4.8310331411150215e-05, + "loss": 1.8003, + "step": 17120 + }, + { + "epoch": 5.255064456721915, + "grad_norm": 0.31622835993766785, + "learning_rate": 4.830536369862855e-05, + "loss": 1.8462, + "step": 17121 + }, + { + "epoch": 5.255371393492941, + "grad_norm": 0.2144850194454193, + "learning_rate": 4.830039600285427e-05, + "loss": 1.8153, + "step": 17122 + }, + { + "epoch": 5.255678330263966, + "grad_norm": 0.3107511103153229, + "learning_rate": 4.829542832387649e-05, + "loss": 1.7271, + "step": 17123 + }, + { + "epoch": 5.2559852670349905, + "grad_norm": 0.24607159197330475, + "learning_rate": 4.8290460661744265e-05, + "loss": 1.7946, + "step": 17124 + }, + { + "epoch": 5.256292203806016, + "grad_norm": 0.226362943649292, + "learning_rate": 4.828549301650673e-05, + "loss": 1.7338, + "step": 17125 + }, + { + "epoch": 5.256599140577041, + "grad_norm": 0.29993724822998047, + "learning_rate": 4.828052538821294e-05, + "loss": 1.8, + "step": 17126 + }, + { + "epoch": 5.2569060773480665, + "grad_norm": 0.25639984011650085, + "learning_rate": 4.8275557776912014e-05, + "loss": 1.8009, + "step": 17127 + }, + { + "epoch": 5.257213014119092, + "grad_norm": 0.2308105081319809, + "learning_rate": 4.8270590182653024e-05, + "loss": 1.7468, 
+ "step": 17128 + }, + { + "epoch": 5.257519950890116, + "grad_norm": 0.27337542176246643, + "learning_rate": 4.82656226054851e-05, + "loss": 1.7725, + "step": 17129 + }, + { + "epoch": 5.257826887661142, + "grad_norm": 0.24848094582557678, + "learning_rate": 4.826065504545729e-05, + "loss": 1.8084, + "step": 17130 + }, + { + "epoch": 5.258133824432167, + "grad_norm": 0.35026392340660095, + "learning_rate": 4.825568750261872e-05, + "loss": 1.7705, + "step": 17131 + }, + { + "epoch": 5.258440761203192, + "grad_norm": 0.3207968473434448, + "learning_rate": 4.825071997701846e-05, + "loss": 1.7329, + "step": 17132 + }, + { + "epoch": 5.258747697974218, + "grad_norm": 0.20949263870716095, + "learning_rate": 4.8245752468705614e-05, + "loss": 1.7658, + "step": 17133 + }, + { + "epoch": 5.259054634745242, + "grad_norm": 0.3158881366252899, + "learning_rate": 4.824078497772926e-05, + "loss": 1.7249, + "step": 17134 + }, + { + "epoch": 5.259361571516267, + "grad_norm": 0.2283414602279663, + "learning_rate": 4.823581750413852e-05, + "loss": 1.7177, + "step": 17135 + }, + { + "epoch": 5.259668508287293, + "grad_norm": 0.24753578007221222, + "learning_rate": 4.823085004798247e-05, + "loss": 1.7232, + "step": 17136 + }, + { + "epoch": 5.259975445058318, + "grad_norm": 0.20381587743759155, + "learning_rate": 4.822588260931017e-05, + "loss": 1.7049, + "step": 17137 + }, + { + "epoch": 5.260282381829343, + "grad_norm": 0.21220643818378448, + "learning_rate": 4.8220915188170746e-05, + "loss": 1.7221, + "step": 17138 + }, + { + "epoch": 5.260589318600369, + "grad_norm": 0.19324758648872375, + "learning_rate": 4.8215947784613276e-05, + "loss": 1.7168, + "step": 17139 + }, + { + "epoch": 5.260896255371393, + "grad_norm": 0.26500338315963745, + "learning_rate": 4.821098039868688e-05, + "loss": 1.7627, + "step": 17140 + }, + { + "epoch": 5.2612031921424185, + "grad_norm": 0.19597655534744263, + "learning_rate": 4.82060130304406e-05, + "loss": 1.7214, + "step": 17141 + }, + { + "epoch": 
5.261510128913444, + "grad_norm": 0.2105483114719391, + "learning_rate": 4.820104567992357e-05, + "loss": 1.6742, + "step": 17142 + }, + { + "epoch": 5.261817065684469, + "grad_norm": 0.20020028948783875, + "learning_rate": 4.8196078347184837e-05, + "loss": 1.7721, + "step": 17143 + }, + { + "epoch": 5.2621240024554945, + "grad_norm": 0.2313549965620041, + "learning_rate": 4.819111103227353e-05, + "loss": 1.7644, + "step": 17144 + }, + { + "epoch": 5.262430939226519, + "grad_norm": 0.31893789768218994, + "learning_rate": 4.818614373523871e-05, + "loss": 1.747, + "step": 17145 + }, + { + "epoch": 5.262737875997544, + "grad_norm": 0.2531197667121887, + "learning_rate": 4.8181176456129505e-05, + "loss": 1.7713, + "step": 17146 + }, + { + "epoch": 5.26304481276857, + "grad_norm": 0.2063976377248764, + "learning_rate": 4.817620919499496e-05, + "loss": 1.7254, + "step": 17147 + }, + { + "epoch": 5.263351749539595, + "grad_norm": 0.22220590710639954, + "learning_rate": 4.8171241951884204e-05, + "loss": 1.7345, + "step": 17148 + }, + { + "epoch": 5.26365868631062, + "grad_norm": 0.24240384995937347, + "learning_rate": 4.8166274726846286e-05, + "loss": 1.7302, + "step": 17149 + }, + { + "epoch": 5.263965623081646, + "grad_norm": 0.215829998254776, + "learning_rate": 4.8161307519930326e-05, + "loss": 1.7725, + "step": 17150 + }, + { + "epoch": 5.26427255985267, + "grad_norm": 0.2697906494140625, + "learning_rate": 4.815634033118541e-05, + "loss": 1.7156, + "step": 17151 + }, + { + "epoch": 5.264579496623695, + "grad_norm": 0.21649456024169922, + "learning_rate": 4.815137316066061e-05, + "loss": 1.745, + "step": 17152 + }, + { + "epoch": 5.264886433394721, + "grad_norm": 0.22773787379264832, + "learning_rate": 4.8146406008405033e-05, + "loss": 1.7592, + "step": 17153 + }, + { + "epoch": 5.265193370165746, + "grad_norm": 0.2920280396938324, + "learning_rate": 4.8141438874467745e-05, + "loss": 1.8301, + "step": 17154 + }, + { + "epoch": 5.265500306936771, + "grad_norm": 
0.23919162154197693, + "learning_rate": 4.813647175889785e-05, + "loss": 1.7687, + "step": 17155 + }, + { + "epoch": 5.265807243707796, + "grad_norm": 0.24617896974086761, + "learning_rate": 4.8131504661744425e-05, + "loss": 1.8279, + "step": 17156 + }, + { + "epoch": 5.266114180478821, + "grad_norm": 0.22756172716617584, + "learning_rate": 4.812653758305659e-05, + "loss": 1.7595, + "step": 17157 + }, + { + "epoch": 5.2664211172498465, + "grad_norm": 0.22939376533031464, + "learning_rate": 4.812157052288339e-05, + "loss": 1.7445, + "step": 17158 + }, + { + "epoch": 5.266728054020872, + "grad_norm": 0.21021319925785065, + "learning_rate": 4.811660348127395e-05, + "loss": 1.7875, + "step": 17159 + }, + { + "epoch": 5.267034990791897, + "grad_norm": 0.2271810919046402, + "learning_rate": 4.811163645827732e-05, + "loss": 1.74, + "step": 17160 + }, + { + "epoch": 5.267341927562922, + "grad_norm": 0.238374263048172, + "learning_rate": 4.81066694539426e-05, + "loss": 1.7717, + "step": 17161 + }, + { + "epoch": 5.267648864333947, + "grad_norm": 0.20655091106891632, + "learning_rate": 4.8101702468318885e-05, + "loss": 1.7447, + "step": 17162 + }, + { + "epoch": 5.267955801104972, + "grad_norm": 0.24652259051799774, + "learning_rate": 4.809673550145528e-05, + "loss": 1.7755, + "step": 17163 + }, + { + "epoch": 5.268262737875998, + "grad_norm": 0.20256781578063965, + "learning_rate": 4.809176855340083e-05, + "loss": 1.7689, + "step": 17164 + }, + { + "epoch": 5.268569674647023, + "grad_norm": 0.27023112773895264, + "learning_rate": 4.8086801624204665e-05, + "loss": 1.8364, + "step": 17165 + }, + { + "epoch": 5.268876611418047, + "grad_norm": 0.251638799905777, + "learning_rate": 4.808183471391582e-05, + "loss": 1.7924, + "step": 17166 + }, + { + "epoch": 5.269183548189073, + "grad_norm": 0.22897782921791077, + "learning_rate": 4.807686782258342e-05, + "loss": 1.7378, + "step": 17167 + }, + { + "epoch": 5.269490484960098, + "grad_norm": 0.19141456484794617, + "learning_rate": 
4.807190095025655e-05, + "loss": 1.6911, + "step": 17168 + }, + { + "epoch": 5.269797421731123, + "grad_norm": 0.19960568845272064, + "learning_rate": 4.806693409698427e-05, + "loss": 1.71, + "step": 17169 + }, + { + "epoch": 5.270104358502149, + "grad_norm": 0.23332087695598602, + "learning_rate": 4.8061967262815694e-05, + "loss": 1.7993, + "step": 17170 + }, + { + "epoch": 5.270411295273174, + "grad_norm": 0.24831432104110718, + "learning_rate": 4.8057000447799876e-05, + "loss": 1.7459, + "step": 17171 + }, + { + "epoch": 5.2707182320441985, + "grad_norm": 0.24735838174819946, + "learning_rate": 4.805203365198593e-05, + "loss": 1.7751, + "step": 17172 + }, + { + "epoch": 5.271025168815224, + "grad_norm": 0.32630103826522827, + "learning_rate": 4.804706687542291e-05, + "loss": 1.7885, + "step": 17173 + }, + { + "epoch": 5.271332105586249, + "grad_norm": 0.29055842757225037, + "learning_rate": 4.804210011815995e-05, + "loss": 1.6819, + "step": 17174 + }, + { + "epoch": 5.2716390423572745, + "grad_norm": 0.22968806326389313, + "learning_rate": 4.803713338024608e-05, + "loss": 1.8146, + "step": 17175 + }, + { + "epoch": 5.2719459791283, + "grad_norm": 0.23430144786834717, + "learning_rate": 4.8032166661730434e-05, + "loss": 1.7401, + "step": 17176 + }, + { + "epoch": 5.272252915899324, + "grad_norm": 0.26312723755836487, + "learning_rate": 4.802719996266204e-05, + "loss": 1.8319, + "step": 17177 + }, + { + "epoch": 5.27255985267035, + "grad_norm": 0.23715369403362274, + "learning_rate": 4.802223328309003e-05, + "loss": 1.8014, + "step": 17178 + }, + { + "epoch": 5.272866789441375, + "grad_norm": 0.23943877220153809, + "learning_rate": 4.801726662306347e-05, + "loss": 1.7181, + "step": 17179 + }, + { + "epoch": 5.2731737262124, + "grad_norm": 0.2366543412208557, + "learning_rate": 4.8012299982631435e-05, + "loss": 1.6685, + "step": 17180 + }, + { + "epoch": 5.273480662983426, + "grad_norm": 0.20688587427139282, + "learning_rate": 4.8007333361843016e-05, + "loss": 
1.7089, + "step": 17181 + }, + { + "epoch": 5.273787599754451, + "grad_norm": 0.2069951444864273, + "learning_rate": 4.8002366760747314e-05, + "loss": 1.7447, + "step": 17182 + }, + { + "epoch": 5.274094536525475, + "grad_norm": 0.26072344183921814, + "learning_rate": 4.7997400179393374e-05, + "loss": 1.7346, + "step": 17183 + }, + { + "epoch": 5.274401473296501, + "grad_norm": 0.2397938072681427, + "learning_rate": 4.799243361783031e-05, + "loss": 1.7556, + "step": 17184 + }, + { + "epoch": 5.274708410067526, + "grad_norm": 0.23606348037719727, + "learning_rate": 4.798746707610721e-05, + "loss": 1.732, + "step": 17185 + }, + { + "epoch": 5.2750153468385514, + "grad_norm": 0.21078252792358398, + "learning_rate": 4.798250055427311e-05, + "loss": 1.7571, + "step": 17186 + }, + { + "epoch": 5.275322283609577, + "grad_norm": 0.21331414580345154, + "learning_rate": 4.797753405237714e-05, + "loss": 1.732, + "step": 17187 + }, + { + "epoch": 5.275629220380601, + "grad_norm": 0.23700307309627533, + "learning_rate": 4.7972567570468354e-05, + "loss": 1.7354, + "step": 17188 + }, + { + "epoch": 5.275936157151627, + "grad_norm": 0.20519722998142242, + "learning_rate": 4.7967601108595845e-05, + "loss": 1.7435, + "step": 17189 + }, + { + "epoch": 5.276243093922652, + "grad_norm": 0.22358302772045135, + "learning_rate": 4.79626346668087e-05, + "loss": 1.7891, + "step": 17190 + }, + { + "epoch": 5.276550030693677, + "grad_norm": 0.2434413880109787, + "learning_rate": 4.795766824515598e-05, + "loss": 1.814, + "step": 17191 + }, + { + "epoch": 5.276856967464703, + "grad_norm": 0.2198423594236374, + "learning_rate": 4.795270184368678e-05, + "loss": 1.7212, + "step": 17192 + }, + { + "epoch": 5.277163904235728, + "grad_norm": 0.23587806522846222, + "learning_rate": 4.7947735462450205e-05, + "loss": 1.8337, + "step": 17193 + }, + { + "epoch": 5.277470841006752, + "grad_norm": 0.234666645526886, + "learning_rate": 4.794276910149528e-05, + "loss": 1.7548, + "step": 17194 + }, + { + 
"epoch": 5.277777777777778, + "grad_norm": 0.23363247513771057, + "learning_rate": 4.793780276087115e-05, + "loss": 1.7587, + "step": 17195 + }, + { + "epoch": 5.278084714548803, + "grad_norm": 0.23191119730472565, + "learning_rate": 4.793283644062683e-05, + "loss": 1.7691, + "step": 17196 + }, + { + "epoch": 5.278391651319828, + "grad_norm": 0.2363097071647644, + "learning_rate": 4.7927870140811445e-05, + "loss": 1.8139, + "step": 17197 + }, + { + "epoch": 5.278698588090854, + "grad_norm": 0.2852413058280945, + "learning_rate": 4.7922903861474056e-05, + "loss": 1.7905, + "step": 17198 + }, + { + "epoch": 5.279005524861878, + "grad_norm": 0.23633842170238495, + "learning_rate": 4.7917937602663764e-05, + "loss": 1.8014, + "step": 17199 + }, + { + "epoch": 5.2793124616329035, + "grad_norm": 0.27007919549942017, + "learning_rate": 4.791297136442961e-05, + "loss": 1.7242, + "step": 17200 + }, + { + "epoch": 5.279619398403929, + "grad_norm": 0.29482147097587585, + "learning_rate": 4.790800514682072e-05, + "loss": 1.7154, + "step": 17201 + }, + { + "epoch": 5.279926335174954, + "grad_norm": 0.27772340178489685, + "learning_rate": 4.790303894988614e-05, + "loss": 1.7771, + "step": 17202 + }, + { + "epoch": 5.2802332719459795, + "grad_norm": 0.21761848032474518, + "learning_rate": 4.789807277367495e-05, + "loss": 1.6983, + "step": 17203 + }, + { + "epoch": 5.280540208717004, + "grad_norm": 0.22621290385723114, + "learning_rate": 4.789310661823626e-05, + "loss": 1.7667, + "step": 17204 + }, + { + "epoch": 5.280847145488029, + "grad_norm": 0.2284683883190155, + "learning_rate": 4.7888140483619095e-05, + "loss": 1.7419, + "step": 17205 + }, + { + "epoch": 5.281154082259055, + "grad_norm": 0.20145639777183533, + "learning_rate": 4.788317436987259e-05, + "loss": 1.7068, + "step": 17206 + }, + { + "epoch": 5.28146101903008, + "grad_norm": 0.23146072030067444, + "learning_rate": 4.7878208277045775e-05, + "loss": 1.7195, + "step": 17207 + }, + { + "epoch": 5.281767955801105, + 
"grad_norm": 0.24014149606227875, + "learning_rate": 4.787324220518776e-05, + "loss": 1.8148, + "step": 17208 + }, + { + "epoch": 5.28207489257213, + "grad_norm": 0.21067874133586884, + "learning_rate": 4.7868276154347595e-05, + "loss": 1.7754, + "step": 17209 + }, + { + "epoch": 5.282381829343155, + "grad_norm": 0.2313496321439743, + "learning_rate": 4.786331012457441e-05, + "loss": 1.7693, + "step": 17210 + }, + { + "epoch": 5.28268876611418, + "grad_norm": 0.24190983176231384, + "learning_rate": 4.7858344115917214e-05, + "loss": 1.7342, + "step": 17211 + }, + { + "epoch": 5.282995702885206, + "grad_norm": 0.24541905522346497, + "learning_rate": 4.785337812842514e-05, + "loss": 1.7721, + "step": 17212 + }, + { + "epoch": 5.283302639656231, + "grad_norm": 0.21989032626152039, + "learning_rate": 4.784841216214722e-05, + "loss": 1.7522, + "step": 17213 + }, + { + "epoch": 5.283609576427256, + "grad_norm": 0.20637241005897522, + "learning_rate": 4.784344621713256e-05, + "loss": 1.7418, + "step": 17214 + }, + { + "epoch": 5.283916513198281, + "grad_norm": 0.22538220882415771, + "learning_rate": 4.783848029343023e-05, + "loss": 1.8287, + "step": 17215 + }, + { + "epoch": 5.284223449969306, + "grad_norm": 0.24478071928024292, + "learning_rate": 4.7833514391089315e-05, + "loss": 1.7419, + "step": 17216 + }, + { + "epoch": 5.2845303867403315, + "grad_norm": 0.22707650065422058, + "learning_rate": 4.782854851015886e-05, + "loss": 1.7831, + "step": 17217 + }, + { + "epoch": 5.284837323511357, + "grad_norm": 0.2843529284000397, + "learning_rate": 4.7823582650687984e-05, + "loss": 1.7704, + "step": 17218 + }, + { + "epoch": 5.285144260282382, + "grad_norm": 0.21647678315639496, + "learning_rate": 4.781861681272573e-05, + "loss": 1.7514, + "step": 17219 + }, + { + "epoch": 5.285451197053407, + "grad_norm": 0.2279205620288849, + "learning_rate": 4.781365099632117e-05, + "loss": 1.6803, + "step": 17220 + }, + { + "epoch": 5.285758133824432, + "grad_norm": 0.2287401556968689, + 
"learning_rate": 4.7808685201523417e-05, + "loss": 1.7278, + "step": 17221 + }, + { + "epoch": 5.286065070595457, + "grad_norm": 0.2103174477815628, + "learning_rate": 4.78037194283815e-05, + "loss": 1.7667, + "step": 17222 + }, + { + "epoch": 5.286372007366483, + "grad_norm": 0.24339279532432556, + "learning_rate": 4.7798753676944536e-05, + "loss": 1.7828, + "step": 17223 + }, + { + "epoch": 5.286678944137508, + "grad_norm": 0.2343035340309143, + "learning_rate": 4.779378794726156e-05, + "loss": 1.7277, + "step": 17224 + }, + { + "epoch": 5.286985880908533, + "grad_norm": 0.22456331551074982, + "learning_rate": 4.778882223938167e-05, + "loss": 1.756, + "step": 17225 + }, + { + "epoch": 5.287292817679558, + "grad_norm": 0.2211158126592636, + "learning_rate": 4.778385655335392e-05, + "loss": 1.7733, + "step": 17226 + }, + { + "epoch": 5.287599754450583, + "grad_norm": 0.2731948792934418, + "learning_rate": 4.777889088922743e-05, + "loss": 1.787, + "step": 17227 + }, + { + "epoch": 5.287906691221608, + "grad_norm": 0.19578024744987488, + "learning_rate": 4.7773925247051215e-05, + "loss": 1.7474, + "step": 17228 + }, + { + "epoch": 5.288213627992634, + "grad_norm": 0.277332067489624, + "learning_rate": 4.77689596268744e-05, + "loss": 1.7432, + "step": 17229 + }, + { + "epoch": 5.288520564763659, + "grad_norm": 0.2979765832424164, + "learning_rate": 4.7763994028746003e-05, + "loss": 1.8198, + "step": 17230 + }, + { + "epoch": 5.2888275015346835, + "grad_norm": 0.23176288604736328, + "learning_rate": 4.775902845271515e-05, + "loss": 1.7317, + "step": 17231 + }, + { + "epoch": 5.289134438305709, + "grad_norm": 0.35821911692619324, + "learning_rate": 4.7754062898830876e-05, + "loss": 1.7287, + "step": 17232 + }, + { + "epoch": 5.289441375076734, + "grad_norm": 0.2881525158882141, + "learning_rate": 4.7749097367142296e-05, + "loss": 1.7391, + "step": 17233 + }, + { + "epoch": 5.2897483118477595, + "grad_norm": 0.22021767497062683, + "learning_rate": 4.774413185769842e-05, 
+ "loss": 1.7462, + "step": 17234 + }, + { + "epoch": 5.290055248618785, + "grad_norm": 0.3286842703819275, + "learning_rate": 4.7739166370548385e-05, + "loss": 1.7749, + "step": 17235 + }, + { + "epoch": 5.290362185389809, + "grad_norm": 0.3298519253730774, + "learning_rate": 4.773420090574122e-05, + "loss": 1.7548, + "step": 17236 + }, + { + "epoch": 5.290669122160835, + "grad_norm": 0.20910575985908508, + "learning_rate": 4.7729235463326005e-05, + "loss": 1.7308, + "step": 17237 + }, + { + "epoch": 5.29097605893186, + "grad_norm": 0.3324633240699768, + "learning_rate": 4.7724270043351835e-05, + "loss": 1.7328, + "step": 17238 + }, + { + "epoch": 5.291282995702885, + "grad_norm": 0.21235628426074982, + "learning_rate": 4.771930464586774e-05, + "loss": 1.7186, + "step": 17239 + }, + { + "epoch": 5.291589932473911, + "grad_norm": 0.2971087694168091, + "learning_rate": 4.771433927092283e-05, + "loss": 1.7947, + "step": 17240 + }, + { + "epoch": 5.291896869244935, + "grad_norm": 0.3637695908546448, + "learning_rate": 4.770937391856614e-05, + "loss": 1.7753, + "step": 17241 + }, + { + "epoch": 5.29220380601596, + "grad_norm": 0.2503713369369507, + "learning_rate": 4.770440858884678e-05, + "loss": 1.684, + "step": 17242 + }, + { + "epoch": 5.292510742786986, + "grad_norm": 0.25510790944099426, + "learning_rate": 4.7699443281813774e-05, + "loss": 1.7517, + "step": 17243 + }, + { + "epoch": 5.292817679558011, + "grad_norm": 0.3189590871334076, + "learning_rate": 4.7694477997516244e-05, + "loss": 1.7488, + "step": 17244 + }, + { + "epoch": 5.293124616329036, + "grad_norm": 0.2807229161262512, + "learning_rate": 4.7689512736003215e-05, + "loss": 1.7962, + "step": 17245 + }, + { + "epoch": 5.293431553100062, + "grad_norm": 0.2166406810283661, + "learning_rate": 4.76845474973238e-05, + "loss": 1.7423, + "step": 17246 + }, + { + "epoch": 5.293738489871086, + "grad_norm": 0.29000815749168396, + "learning_rate": 4.767958228152702e-05, + "loss": 1.7508, + "step": 17247 + }, + { 
+ "epoch": 5.2940454266421115, + "grad_norm": 0.19301612675189972, + "learning_rate": 4.767461708866198e-05, + "loss": 1.7223, + "step": 17248 + }, + { + "epoch": 5.294352363413137, + "grad_norm": 0.2828899323940277, + "learning_rate": 4.766965191877772e-05, + "loss": 1.8139, + "step": 17249 + }, + { + "epoch": 5.294659300184162, + "grad_norm": 0.32610374689102173, + "learning_rate": 4.766468677192335e-05, + "loss": 1.7744, + "step": 17250 + }, + { + "epoch": 5.2949662369551875, + "grad_norm": 0.2175719439983368, + "learning_rate": 4.7659721648147895e-05, + "loss": 1.7345, + "step": 17251 + }, + { + "epoch": 5.295273173726212, + "grad_norm": 0.24777816236019135, + "learning_rate": 4.7654756547500457e-05, + "loss": 1.7382, + "step": 17252 + }, + { + "epoch": 5.295580110497237, + "grad_norm": 0.25927749276161194, + "learning_rate": 4.764979147003008e-05, + "loss": 1.7625, + "step": 17253 + }, + { + "epoch": 5.295887047268263, + "grad_norm": 0.2271798849105835, + "learning_rate": 4.7644826415785834e-05, + "loss": 1.6928, + "step": 17254 + }, + { + "epoch": 5.296193984039288, + "grad_norm": 0.30804958939552307, + "learning_rate": 4.763986138481682e-05, + "loss": 1.743, + "step": 17255 + }, + { + "epoch": 5.296500920810313, + "grad_norm": 0.2247130572795868, + "learning_rate": 4.763489637717205e-05, + "loss": 1.7593, + "step": 17256 + }, + { + "epoch": 5.296807857581339, + "grad_norm": 0.22203052043914795, + "learning_rate": 4.7629931392900645e-05, + "loss": 1.6923, + "step": 17257 + }, + { + "epoch": 5.297114794352363, + "grad_norm": 0.23044714331626892, + "learning_rate": 4.7624966432051624e-05, + "loss": 1.7676, + "step": 17258 + }, + { + "epoch": 5.297421731123388, + "grad_norm": 0.2824070155620575, + "learning_rate": 4.7620001494674096e-05, + "loss": 1.8272, + "step": 17259 + }, + { + "epoch": 5.297728667894414, + "grad_norm": 0.27077800035476685, + "learning_rate": 4.761503658081709e-05, + "loss": 1.8106, + "step": 17260 + }, + { + "epoch": 5.298035604665439, + 
"grad_norm": 0.2333833873271942, + "learning_rate": 4.7610071690529706e-05, + "loss": 1.6841, + "step": 17261 + }, + { + "epoch": 5.298342541436464, + "grad_norm": 0.2542032301425934, + "learning_rate": 4.760510682386098e-05, + "loss": 1.7656, + "step": 17262 + }, + { + "epoch": 5.298649478207489, + "grad_norm": 0.30680081248283386, + "learning_rate": 4.760014198086002e-05, + "loss": 1.7443, + "step": 17263 + }, + { + "epoch": 5.298956414978514, + "grad_norm": 0.21580225229263306, + "learning_rate": 4.759517716157583e-05, + "loss": 1.7907, + "step": 17264 + }, + { + "epoch": 5.2992633517495396, + "grad_norm": 0.2644323408603668, + "learning_rate": 4.7590212366057516e-05, + "loss": 1.6835, + "step": 17265 + }, + { + "epoch": 5.299570288520565, + "grad_norm": 0.23600110411643982, + "learning_rate": 4.758524759435414e-05, + "loss": 1.7481, + "step": 17266 + }, + { + "epoch": 5.29987722529159, + "grad_norm": 0.23825959861278534, + "learning_rate": 4.758028284651477e-05, + "loss": 1.7267, + "step": 17267 + }, + { + "epoch": 5.300184162062616, + "grad_norm": 0.2659476101398468, + "learning_rate": 4.757531812258845e-05, + "loss": 1.7303, + "step": 17268 + }, + { + "epoch": 5.30049109883364, + "grad_norm": 0.30770114064216614, + "learning_rate": 4.757035342262428e-05, + "loss": 1.7636, + "step": 17269 + }, + { + "epoch": 5.300798035604665, + "grad_norm": 0.27921241521835327, + "learning_rate": 4.756538874667129e-05, + "loss": 1.7736, + "step": 17270 + }, + { + "epoch": 5.301104972375691, + "grad_norm": 0.2518016993999481, + "learning_rate": 4.756042409477855e-05, + "loss": 1.7942, + "step": 17271 + }, + { + "epoch": 5.301411909146716, + "grad_norm": 0.2678029537200928, + "learning_rate": 4.755545946699514e-05, + "loss": 1.7179, + "step": 17272 + }, + { + "epoch": 5.301718845917741, + "grad_norm": 0.3082284927368164, + "learning_rate": 4.7550494863370094e-05, + "loss": 1.7282, + "step": 17273 + }, + { + "epoch": 5.302025782688766, + "grad_norm": 0.23269952833652496, + 
"learning_rate": 4.754553028395251e-05, + "loss": 1.755, + "step": 17274 + }, + { + "epoch": 5.302332719459791, + "grad_norm": 0.2273751199245453, + "learning_rate": 4.754056572879142e-05, + "loss": 1.7661, + "step": 17275 + }, + { + "epoch": 5.3026396562308165, + "grad_norm": 0.2175082415342331, + "learning_rate": 4.7535601197935915e-05, + "loss": 1.7034, + "step": 17276 + }, + { + "epoch": 5.302946593001842, + "grad_norm": 0.20551301538944244, + "learning_rate": 4.753063669143503e-05, + "loss": 1.7329, + "step": 17277 + }, + { + "epoch": 5.303253529772867, + "grad_norm": 0.2350638061761856, + "learning_rate": 4.752567220933785e-05, + "loss": 1.8361, + "step": 17278 + }, + { + "epoch": 5.303560466543892, + "grad_norm": 0.20268140733242035, + "learning_rate": 4.752070775169342e-05, + "loss": 1.6736, + "step": 17279 + }, + { + "epoch": 5.303867403314917, + "grad_norm": 0.1891544908285141, + "learning_rate": 4.7515743318550823e-05, + "loss": 1.7241, + "step": 17280 + }, + { + "epoch": 5.304174340085942, + "grad_norm": 0.22900860011577606, + "learning_rate": 4.751077890995909e-05, + "loss": 1.7321, + "step": 17281 + }, + { + "epoch": 5.304481276856968, + "grad_norm": 0.25827866792678833, + "learning_rate": 4.7505814525967304e-05, + "loss": 1.8021, + "step": 17282 + }, + { + "epoch": 5.304788213627993, + "grad_norm": 0.22459273040294647, + "learning_rate": 4.7500850166624514e-05, + "loss": 1.7845, + "step": 17283 + }, + { + "epoch": 5.305095150399017, + "grad_norm": 0.23737964034080505, + "learning_rate": 4.7495885831979816e-05, + "loss": 1.7274, + "step": 17284 + }, + { + "epoch": 5.305402087170043, + "grad_norm": 0.2267502397298813, + "learning_rate": 4.749092152208221e-05, + "loss": 1.7747, + "step": 17285 + }, + { + "epoch": 5.305709023941068, + "grad_norm": 0.31811007857322693, + "learning_rate": 4.748595723698081e-05, + "loss": 1.7852, + "step": 17286 + }, + { + "epoch": 5.306015960712093, + "grad_norm": 0.42865583300590515, + "learning_rate": 
4.7480992976724655e-05, + "loss": 1.7711, + "step": 17287 + }, + { + "epoch": 5.306322897483119, + "grad_norm": 0.3211027979850769, + "learning_rate": 4.747602874136278e-05, + "loss": 1.7813, + "step": 17288 + }, + { + "epoch": 5.306629834254144, + "grad_norm": 0.22552837431430817, + "learning_rate": 4.7471064530944295e-05, + "loss": 1.7407, + "step": 17289 + }, + { + "epoch": 5.3069367710251685, + "grad_norm": 0.3119906485080719, + "learning_rate": 4.746610034551821e-05, + "loss": 1.7255, + "step": 17290 + }, + { + "epoch": 5.307243707796194, + "grad_norm": 0.26405754685401917, + "learning_rate": 4.7461136185133623e-05, + "loss": 1.6945, + "step": 17291 + }, + { + "epoch": 5.307550644567219, + "grad_norm": 0.21759621798992157, + "learning_rate": 4.7456172049839566e-05, + "loss": 1.7319, + "step": 17292 + }, + { + "epoch": 5.3078575813382445, + "grad_norm": 0.26193925738334656, + "learning_rate": 4.745120793968511e-05, + "loss": 1.7508, + "step": 17293 + }, + { + "epoch": 5.30816451810927, + "grad_norm": 0.2549780011177063, + "learning_rate": 4.74462438547193e-05, + "loss": 1.7153, + "step": 17294 + }, + { + "epoch": 5.308471454880294, + "grad_norm": 0.21164020895957947, + "learning_rate": 4.7441279794991235e-05, + "loss": 1.7315, + "step": 17295 + }, + { + "epoch": 5.30877839165132, + "grad_norm": 0.20548345148563385, + "learning_rate": 4.7436315760549914e-05, + "loss": 1.68, + "step": 17296 + }, + { + "epoch": 5.309085328422345, + "grad_norm": 0.23997166752815247, + "learning_rate": 4.7431351751444446e-05, + "loss": 1.8528, + "step": 17297 + }, + { + "epoch": 5.30939226519337, + "grad_norm": 0.2639109194278717, + "learning_rate": 4.7426387767723845e-05, + "loss": 1.8041, + "step": 17298 + }, + { + "epoch": 5.309699201964396, + "grad_norm": 0.2285986840724945, + "learning_rate": 4.7421423809437196e-05, + "loss": 1.8188, + "step": 17299 + }, + { + "epoch": 5.310006138735421, + "grad_norm": 0.22183369100093842, + "learning_rate": 4.741645987663355e-05, + "loss": 
1.7581, + "step": 17300 + }, + { + "epoch": 5.310313075506445, + "grad_norm": 0.22716040909290314, + "learning_rate": 4.741149596936197e-05, + "loss": 1.7438, + "step": 17301 + }, + { + "epoch": 5.310620012277471, + "grad_norm": 0.24641327559947968, + "learning_rate": 4.740653208767148e-05, + "loss": 1.761, + "step": 17302 + }, + { + "epoch": 5.310926949048496, + "grad_norm": 0.28470689058303833, + "learning_rate": 4.7401568231611194e-05, + "loss": 1.7512, + "step": 17303 + }, + { + "epoch": 5.311233885819521, + "grad_norm": 0.23279942572116852, + "learning_rate": 4.739660440123012e-05, + "loss": 1.7797, + "step": 17304 + }, + { + "epoch": 5.311540822590547, + "grad_norm": 0.26397696137428284, + "learning_rate": 4.739164059657731e-05, + "loss": 1.748, + "step": 17305 + }, + { + "epoch": 5.311847759361571, + "grad_norm": 0.25072020292282104, + "learning_rate": 4.7386676817701856e-05, + "loss": 1.7571, + "step": 17306 + }, + { + "epoch": 5.3121546961325965, + "grad_norm": 0.20815810561180115, + "learning_rate": 4.7381713064652774e-05, + "loss": 1.7566, + "step": 17307 + }, + { + "epoch": 5.312461632903622, + "grad_norm": 0.23104289174079895, + "learning_rate": 4.7376749337479174e-05, + "loss": 1.7308, + "step": 17308 + }, + { + "epoch": 5.312768569674647, + "grad_norm": 0.21978867053985596, + "learning_rate": 4.737178563623004e-05, + "loss": 1.7997, + "step": 17309 + }, + { + "epoch": 5.3130755064456725, + "grad_norm": 0.34588614106178284, + "learning_rate": 4.736682196095447e-05, + "loss": 1.8414, + "step": 17310 + }, + { + "epoch": 5.313382443216697, + "grad_norm": 0.3475342094898224, + "learning_rate": 4.73618583117015e-05, + "loss": 1.7823, + "step": 17311 + }, + { + "epoch": 5.313689379987722, + "grad_norm": 0.1965305358171463, + "learning_rate": 4.7356894688520215e-05, + "loss": 1.7597, + "step": 17312 + }, + { + "epoch": 5.313996316758748, + "grad_norm": 0.3035048246383667, + "learning_rate": 4.7351931091459624e-05, + "loss": 1.6803, + "step": 17313 + }, + { + 
"epoch": 5.314303253529773, + "grad_norm": 0.27722910046577454, + "learning_rate": 4.7346967520568827e-05, + "loss": 1.7472, + "step": 17314 + }, + { + "epoch": 5.314610190300798, + "grad_norm": 0.21481415629386902, + "learning_rate": 4.734200397589682e-05, + "loss": 1.7319, + "step": 17315 + }, + { + "epoch": 5.314917127071823, + "grad_norm": 0.2570357918739319, + "learning_rate": 4.733704045749271e-05, + "loss": 1.7392, + "step": 17316 + }, + { + "epoch": 5.315224063842848, + "grad_norm": 0.2404400259256363, + "learning_rate": 4.733207696540551e-05, + "loss": 1.7231, + "step": 17317 + }, + { + "epoch": 5.315531000613873, + "grad_norm": 0.222911074757576, + "learning_rate": 4.732711349968432e-05, + "loss": 1.7584, + "step": 17318 + }, + { + "epoch": 5.315837937384899, + "grad_norm": 0.22908064723014832, + "learning_rate": 4.732215006037813e-05, + "loss": 1.7242, + "step": 17319 + }, + { + "epoch": 5.316144874155924, + "grad_norm": 0.2432398796081543, + "learning_rate": 4.7317186647536044e-05, + "loss": 1.7056, + "step": 17320 + }, + { + "epoch": 5.316451810926949, + "grad_norm": 0.1994420737028122, + "learning_rate": 4.7312223261207086e-05, + "loss": 1.6667, + "step": 17321 + }, + { + "epoch": 5.316758747697974, + "grad_norm": 0.22314350306987762, + "learning_rate": 4.73072599014403e-05, + "loss": 1.7945, + "step": 17322 + }, + { + "epoch": 5.317065684468999, + "grad_norm": 0.2309068888425827, + "learning_rate": 4.730229656828477e-05, + "loss": 1.7099, + "step": 17323 + }, + { + "epoch": 5.3173726212400245, + "grad_norm": 0.22388015687465668, + "learning_rate": 4.729733326178951e-05, + "loss": 1.7053, + "step": 17324 + }, + { + "epoch": 5.31767955801105, + "grad_norm": 0.20203040540218353, + "learning_rate": 4.72923699820036e-05, + "loss": 1.6992, + "step": 17325 + }, + { + "epoch": 5.317986494782075, + "grad_norm": 0.24416297674179077, + "learning_rate": 4.728740672897606e-05, + "loss": 1.7455, + "step": 17326 + }, + { + "epoch": 5.3182934315531, + "grad_norm": 
0.2501862049102783, + "learning_rate": 4.728244350275597e-05, + "loss": 1.7609, + "step": 17327 + }, + { + "epoch": 5.318600368324125, + "grad_norm": 0.21482665836811066, + "learning_rate": 4.727748030339235e-05, + "loss": 1.7614, + "step": 17328 + }, + { + "epoch": 5.31890730509515, + "grad_norm": 0.2241419404745102, + "learning_rate": 4.727251713093429e-05, + "loss": 1.736, + "step": 17329 + }, + { + "epoch": 5.319214241866176, + "grad_norm": 0.1757260262966156, + "learning_rate": 4.726755398543079e-05, + "loss": 1.6646, + "step": 17330 + }, + { + "epoch": 5.319521178637201, + "grad_norm": 0.18697243928909302, + "learning_rate": 4.726259086693095e-05, + "loss": 1.7512, + "step": 17331 + }, + { + "epoch": 5.319828115408226, + "grad_norm": 0.22584228217601776, + "learning_rate": 4.725762777548376e-05, + "loss": 1.7439, + "step": 17332 + }, + { + "epoch": 5.320135052179251, + "grad_norm": 0.18673470616340637, + "learning_rate": 4.725266471113832e-05, + "loss": 1.7007, + "step": 17333 + }, + { + "epoch": 5.320441988950276, + "grad_norm": 0.23030288517475128, + "learning_rate": 4.7247701673943656e-05, + "loss": 1.8021, + "step": 17334 + }, + { + "epoch": 5.320748925721301, + "grad_norm": 0.19333480298519135, + "learning_rate": 4.7242738663948813e-05, + "loss": 1.6659, + "step": 17335 + }, + { + "epoch": 5.321055862492327, + "grad_norm": 0.278097003698349, + "learning_rate": 4.723777568120284e-05, + "loss": 1.7302, + "step": 17336 + }, + { + "epoch": 5.321362799263352, + "grad_norm": 0.2146742343902588, + "learning_rate": 4.72328127257548e-05, + "loss": 1.7644, + "step": 17337 + }, + { + "epoch": 5.3216697360343765, + "grad_norm": 0.25582969188690186, + "learning_rate": 4.722784979765372e-05, + "loss": 1.7872, + "step": 17338 + }, + { + "epoch": 5.321976672805402, + "grad_norm": 0.20411577820777893, + "learning_rate": 4.722288689694864e-05, + "loss": 1.7167, + "step": 17339 + }, + { + "epoch": 5.322283609576427, + "grad_norm": 0.20894703269004822, + "learning_rate": 
4.7217924023688645e-05, + "loss": 1.7526, + "step": 17340 + }, + { + "epoch": 5.3225905463474525, + "grad_norm": 0.20197831094264984, + "learning_rate": 4.721296117792273e-05, + "loss": 1.711, + "step": 17341 + }, + { + "epoch": 5.322897483118478, + "grad_norm": 0.20490549504756927, + "learning_rate": 4.720799835969999e-05, + "loss": 1.7303, + "step": 17342 + }, + { + "epoch": 5.323204419889503, + "grad_norm": 0.20666229724884033, + "learning_rate": 4.720303556906943e-05, + "loss": 1.6738, + "step": 17343 + }, + { + "epoch": 5.323511356660528, + "grad_norm": 0.21899856626987457, + "learning_rate": 4.719807280608011e-05, + "loss": 1.7632, + "step": 17344 + }, + { + "epoch": 5.323818293431553, + "grad_norm": 0.2310410887002945, + "learning_rate": 4.719311007078108e-05, + "loss": 1.7568, + "step": 17345 + }, + { + "epoch": 5.324125230202578, + "grad_norm": 0.20057427883148193, + "learning_rate": 4.7188147363221394e-05, + "loss": 1.6716, + "step": 17346 + }, + { + "epoch": 5.324432166973604, + "grad_norm": 0.21361050009727478, + "learning_rate": 4.718318468345006e-05, + "loss": 1.7224, + "step": 17347 + }, + { + "epoch": 5.324739103744629, + "grad_norm": 0.28389376401901245, + "learning_rate": 4.7178222031516173e-05, + "loss": 1.8519, + "step": 17348 + }, + { + "epoch": 5.3250460405156534, + "grad_norm": 0.2094416618347168, + "learning_rate": 4.717325940746872e-05, + "loss": 1.7763, + "step": 17349 + }, + { + "epoch": 5.325352977286679, + "grad_norm": 0.2263312190771103, + "learning_rate": 4.716829681135681e-05, + "loss": 1.7961, + "step": 17350 + }, + { + "epoch": 5.325659914057704, + "grad_norm": 0.2685631811618805, + "learning_rate": 4.7163334243229417e-05, + "loss": 1.7763, + "step": 17351 + }, + { + "epoch": 5.3259668508287294, + "grad_norm": 0.2029418647289276, + "learning_rate": 4.7158371703135636e-05, + "loss": 1.7662, + "step": 17352 + }, + { + "epoch": 5.326273787599755, + "grad_norm": 0.3109094798564911, + "learning_rate": 4.715340919112447e-05, + "loss": 
1.7064, + "step": 17353 + }, + { + "epoch": 5.326580724370779, + "grad_norm": 0.24679912626743317, + "learning_rate": 4.714844670724502e-05, + "loss": 1.6903, + "step": 17354 + }, + { + "epoch": 5.326887661141805, + "grad_norm": 0.2004890739917755, + "learning_rate": 4.714348425154627e-05, + "loss": 1.7242, + "step": 17355 + }, + { + "epoch": 5.32719459791283, + "grad_norm": 0.27442196011543274, + "learning_rate": 4.7138521824077284e-05, + "loss": 1.826, + "step": 17356 + }, + { + "epoch": 5.327501534683855, + "grad_norm": 0.19933666288852692, + "learning_rate": 4.713355942488711e-05, + "loss": 1.748, + "step": 17357 + }, + { + "epoch": 5.327808471454881, + "grad_norm": 0.2306378185749054, + "learning_rate": 4.712859705402476e-05, + "loss": 1.7426, + "step": 17358 + }, + { + "epoch": 5.328115408225905, + "grad_norm": 0.22484014928340912, + "learning_rate": 4.7123634711539324e-05, + "loss": 1.7355, + "step": 17359 + }, + { + "epoch": 5.32842234499693, + "grad_norm": 0.2501749098300934, + "learning_rate": 4.711867239747979e-05, + "loss": 1.7502, + "step": 17360 + }, + { + "epoch": 5.328729281767956, + "grad_norm": 0.1940663903951645, + "learning_rate": 4.711371011189525e-05, + "loss": 1.7423, + "step": 17361 + }, + { + "epoch": 5.329036218538981, + "grad_norm": 0.28115448355674744, + "learning_rate": 4.71087478548347e-05, + "loss": 1.7134, + "step": 17362 + }, + { + "epoch": 5.329343155310006, + "grad_norm": 0.29717928171157837, + "learning_rate": 4.71037856263472e-05, + "loss": 1.8145, + "step": 17363 + }, + { + "epoch": 5.329650092081032, + "grad_norm": 0.24278375506401062, + "learning_rate": 4.709882342648179e-05, + "loss": 1.689, + "step": 17364 + }, + { + "epoch": 5.329957028852056, + "grad_norm": 0.26382890343666077, + "learning_rate": 4.709386125528751e-05, + "loss": 1.801, + "step": 17365 + }, + { + "epoch": 5.3302639656230815, + "grad_norm": 0.237087219953537, + "learning_rate": 4.708889911281339e-05, + "loss": 1.7019, + "step": 17366 + }, + { + "epoch": 
5.330570902394107, + "grad_norm": 0.21994253993034363, + "learning_rate": 4.7083936999108494e-05, + "loss": 1.707, + "step": 17367 + }, + { + "epoch": 5.330877839165132, + "grad_norm": 0.3028903901576996, + "learning_rate": 4.707897491422182e-05, + "loss": 1.7992, + "step": 17368 + }, + { + "epoch": 5.3311847759361575, + "grad_norm": 0.24991434812545776, + "learning_rate": 4.7074012858202435e-05, + "loss": 1.7894, + "step": 17369 + }, + { + "epoch": 5.331491712707182, + "grad_norm": 0.20631250739097595, + "learning_rate": 4.706905083109936e-05, + "loss": 1.6816, + "step": 17370 + }, + { + "epoch": 5.331798649478207, + "grad_norm": 0.23300573229789734, + "learning_rate": 4.7064088832961666e-05, + "loss": 1.7101, + "step": 17371 + }, + { + "epoch": 5.332105586249233, + "grad_norm": 0.22331316769123077, + "learning_rate": 4.705912686383837e-05, + "loss": 1.861, + "step": 17372 + }, + { + "epoch": 5.332412523020258, + "grad_norm": 0.204593226313591, + "learning_rate": 4.7054164923778485e-05, + "loss": 1.7062, + "step": 17373 + }, + { + "epoch": 5.332719459791283, + "grad_norm": 0.22207681834697723, + "learning_rate": 4.704920301283107e-05, + "loss": 1.7546, + "step": 17374 + }, + { + "epoch": 5.333026396562309, + "grad_norm": 0.2508530020713806, + "learning_rate": 4.7044241131045157e-05, + "loss": 1.7881, + "step": 17375 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.26084616780281067, + "learning_rate": 4.7039279278469804e-05, + "loss": 1.7292, + "step": 17376 + }, + { + "epoch": 5.333640270104358, + "grad_norm": 0.2122940719127655, + "learning_rate": 4.7034317455154006e-05, + "loss": 1.7493, + "step": 17377 + }, + { + "epoch": 5.333947206875384, + "grad_norm": 0.2627449333667755, + "learning_rate": 4.702935566114685e-05, + "loss": 1.759, + "step": 17378 + }, + { + "epoch": 5.334254143646409, + "grad_norm": 0.20637977123260498, + "learning_rate": 4.702439389649732e-05, + "loss": 1.8043, + "step": 17379 + }, + { + "epoch": 5.334561080417434, + "grad_norm": 
0.28783395886421204, + "learning_rate": 4.701943216125447e-05, + "loss": 1.7256, + "step": 17380 + }, + { + "epoch": 5.334868017188459, + "grad_norm": 0.21130618453025818, + "learning_rate": 4.701447045546734e-05, + "loss": 1.7161, + "step": 17381 + }, + { + "epoch": 5.335174953959484, + "grad_norm": 0.2793416678905487, + "learning_rate": 4.7009508779184984e-05, + "loss": 1.7659, + "step": 17382 + }, + { + "epoch": 5.3354818907305095, + "grad_norm": 0.3088020384311676, + "learning_rate": 4.700454713245639e-05, + "loss": 1.6877, + "step": 17383 + }, + { + "epoch": 5.335788827501535, + "grad_norm": 0.19697681069374084, + "learning_rate": 4.6999585515330646e-05, + "loss": 1.7111, + "step": 17384 + }, + { + "epoch": 5.33609576427256, + "grad_norm": 0.29234182834625244, + "learning_rate": 4.699462392785673e-05, + "loss": 1.7136, + "step": 17385 + }, + { + "epoch": 5.336402701043585, + "grad_norm": 0.2593611776828766, + "learning_rate": 4.698966237008371e-05, + "loss": 1.7531, + "step": 17386 + }, + { + "epoch": 5.33670963781461, + "grad_norm": 0.20024444162845612, + "learning_rate": 4.6984700842060604e-05, + "loss": 1.7035, + "step": 17387 + }, + { + "epoch": 5.337016574585635, + "grad_norm": 0.2929787039756775, + "learning_rate": 4.697973934383647e-05, + "loss": 1.7212, + "step": 17388 + }, + { + "epoch": 5.337323511356661, + "grad_norm": 0.2425665408372879, + "learning_rate": 4.697477787546032e-05, + "loss": 1.7191, + "step": 17389 + }, + { + "epoch": 5.337630448127686, + "grad_norm": 0.19175556302070618, + "learning_rate": 4.6969816436981176e-05, + "loss": 1.7291, + "step": 17390 + }, + { + "epoch": 5.337937384898711, + "grad_norm": 0.2602384686470032, + "learning_rate": 4.696485502844809e-05, + "loss": 1.7035, + "step": 17391 + }, + { + "epoch": 5.338244321669736, + "grad_norm": 0.19117408990859985, + "learning_rate": 4.695989364991006e-05, + "loss": 1.707, + "step": 17392 + }, + { + "epoch": 5.338551258440761, + "grad_norm": 0.31086108088493347, + "learning_rate": 
4.6954932301416174e-05, + "loss": 1.7397, + "step": 17393 + }, + { + "epoch": 5.338858195211786, + "grad_norm": 0.27402472496032715, + "learning_rate": 4.694997098301542e-05, + "loss": 1.7144, + "step": 17394 + }, + { + "epoch": 5.339165131982812, + "grad_norm": 0.20345155894756317, + "learning_rate": 4.694500969475685e-05, + "loss": 1.7492, + "step": 17395 + }, + { + "epoch": 5.339472068753837, + "grad_norm": 0.23786045610904694, + "learning_rate": 4.694004843668947e-05, + "loss": 1.7781, + "step": 17396 + }, + { + "epoch": 5.3397790055248615, + "grad_norm": 0.19747424125671387, + "learning_rate": 4.6935087208862335e-05, + "loss": 1.7353, + "step": 17397 + }, + { + "epoch": 5.340085942295887, + "grad_norm": 0.224543035030365, + "learning_rate": 4.693012601132445e-05, + "loss": 1.7229, + "step": 17398 + }, + { + "epoch": 5.340392879066912, + "grad_norm": 0.20840135216712952, + "learning_rate": 4.692516484412488e-05, + "loss": 1.7557, + "step": 17399 + }, + { + "epoch": 5.3406998158379375, + "grad_norm": 0.21019098162651062, + "learning_rate": 4.692020370731261e-05, + "loss": 1.7793, + "step": 17400 + }, + { + "epoch": 5.341006752608963, + "grad_norm": 0.20540091395378113, + "learning_rate": 4.691524260093672e-05, + "loss": 1.6925, + "step": 17401 + }, + { + "epoch": 5.341313689379987, + "grad_norm": 0.2414131462574005, + "learning_rate": 4.691028152504619e-05, + "loss": 1.7706, + "step": 17402 + }, + { + "epoch": 5.341620626151013, + "grad_norm": 0.19627155363559723, + "learning_rate": 4.6905320479690073e-05, + "loss": 1.6356, + "step": 17403 + }, + { + "epoch": 5.341927562922038, + "grad_norm": 0.20978952944278717, + "learning_rate": 4.690035946491741e-05, + "loss": 1.7487, + "step": 17404 + }, + { + "epoch": 5.342234499693063, + "grad_norm": 0.2524566054344177, + "learning_rate": 4.689539848077719e-05, + "loss": 1.7713, + "step": 17405 + }, + { + "epoch": 5.342541436464089, + "grad_norm": 0.1967654973268509, + "learning_rate": 4.689043752731847e-05, + "loss": 
1.7358, + "step": 17406 + }, + { + "epoch": 5.342848373235114, + "grad_norm": 0.2085377424955368, + "learning_rate": 4.688547660459026e-05, + "loss": 1.7104, + "step": 17407 + }, + { + "epoch": 5.343155310006138, + "grad_norm": 0.21294310688972473, + "learning_rate": 4.688051571264161e-05, + "loss": 1.7349, + "step": 17408 + }, + { + "epoch": 5.343462246777164, + "grad_norm": 0.23702891170978546, + "learning_rate": 4.6875554851521514e-05, + "loss": 1.8048, + "step": 17409 + }, + { + "epoch": 5.343769183548189, + "grad_norm": 0.2513964772224426, + "learning_rate": 4.687059402127904e-05, + "loss": 1.6669, + "step": 17410 + }, + { + "epoch": 5.344076120319214, + "grad_norm": 0.259540855884552, + "learning_rate": 4.6865633221963165e-05, + "loss": 1.7763, + "step": 17411 + }, + { + "epoch": 5.34438305709024, + "grad_norm": 0.28354617953300476, + "learning_rate": 4.6860672453622966e-05, + "loss": 1.7912, + "step": 17412 + }, + { + "epoch": 5.344689993861264, + "grad_norm": 0.2503860592842102, + "learning_rate": 4.685571171630742e-05, + "loss": 1.6817, + "step": 17413 + }, + { + "epoch": 5.3449969306322895, + "grad_norm": 0.2317555695772171, + "learning_rate": 4.685075101006558e-05, + "loss": 1.7652, + "step": 17414 + }, + { + "epoch": 5.345303867403315, + "grad_norm": 0.23333363234996796, + "learning_rate": 4.684579033494646e-05, + "loss": 1.722, + "step": 17415 + }, + { + "epoch": 5.34561080417434, + "grad_norm": 0.22507359087467194, + "learning_rate": 4.6840829690999104e-05, + "loss": 1.7522, + "step": 17416 + }, + { + "epoch": 5.3459177409453655, + "grad_norm": 0.2298288643360138, + "learning_rate": 4.6835869078272504e-05, + "loss": 1.7425, + "step": 17417 + }, + { + "epoch": 5.346224677716391, + "grad_norm": 0.2829224765300751, + "learning_rate": 4.683090849681572e-05, + "loss": 1.7798, + "step": 17418 + }, + { + "epoch": 5.346531614487415, + "grad_norm": 0.18153807520866394, + "learning_rate": 4.682594794667773e-05, + "loss": 1.6846, + "step": 17419 + }, + { + 
"epoch": 5.346838551258441, + "grad_norm": 0.24153028428554535, + "learning_rate": 4.6820987427907596e-05, + "loss": 1.7474, + "step": 17420 + }, + { + "epoch": 5.347145488029466, + "grad_norm": 0.2529772222042084, + "learning_rate": 4.681602694055434e-05, + "loss": 1.7465, + "step": 17421 + }, + { + "epoch": 5.347452424800491, + "grad_norm": 0.20414131879806519, + "learning_rate": 4.681106648466696e-05, + "loss": 1.7704, + "step": 17422 + }, + { + "epoch": 5.347759361571517, + "grad_norm": 0.27280452847480774, + "learning_rate": 4.68061060602945e-05, + "loss": 1.791, + "step": 17423 + }, + { + "epoch": 5.348066298342541, + "grad_norm": 0.20767468214035034, + "learning_rate": 4.680114566748595e-05, + "loss": 1.7744, + "step": 17424 + }, + { + "epoch": 5.348373235113566, + "grad_norm": 0.2661697566509247, + "learning_rate": 4.679618530629036e-05, + "loss": 1.7999, + "step": 17425 + }, + { + "epoch": 5.348680171884592, + "grad_norm": 0.23666872084140778, + "learning_rate": 4.679122497675674e-05, + "loss": 1.7204, + "step": 17426 + }, + { + "epoch": 5.348987108655617, + "grad_norm": 0.2688015401363373, + "learning_rate": 4.678626467893414e-05, + "loss": 1.7619, + "step": 17427 + }, + { + "epoch": 5.349294045426642, + "grad_norm": 0.23924420773983002, + "learning_rate": 4.678130441287153e-05, + "loss": 1.7754, + "step": 17428 + }, + { + "epoch": 5.349600982197667, + "grad_norm": 0.25724148750305176, + "learning_rate": 4.677634417861798e-05, + "loss": 1.761, + "step": 17429 + }, + { + "epoch": 5.349907918968692, + "grad_norm": 0.2633780241012573, + "learning_rate": 4.6771383976222464e-05, + "loss": 1.8705, + "step": 17430 + }, + { + "epoch": 5.350214855739718, + "grad_norm": 0.24774575233459473, + "learning_rate": 4.6766423805734036e-05, + "loss": 1.7127, + "step": 17431 + }, + { + "epoch": 5.350521792510743, + "grad_norm": 0.29887545108795166, + "learning_rate": 4.6761463667201695e-05, + "loss": 1.7651, + "step": 17432 + }, + { + "epoch": 5.350828729281768, + 
"grad_norm": 0.2231605499982834, + "learning_rate": 4.6756503560674486e-05, + "loss": 1.7636, + "step": 17433 + }, + { + "epoch": 5.351135666052793, + "grad_norm": 0.27977073192596436, + "learning_rate": 4.675154348620139e-05, + "loss": 1.7108, + "step": 17434 + }, + { + "epoch": 5.351442602823818, + "grad_norm": 0.26866039633750916, + "learning_rate": 4.674658344383146e-05, + "loss": 1.7593, + "step": 17435 + }, + { + "epoch": 5.351749539594843, + "grad_norm": 0.2154620885848999, + "learning_rate": 4.6741623433613685e-05, + "loss": 1.7536, + "step": 17436 + }, + { + "epoch": 5.352056476365869, + "grad_norm": 0.276656836271286, + "learning_rate": 4.673666345559711e-05, + "loss": 1.803, + "step": 17437 + }, + { + "epoch": 5.352363413136894, + "grad_norm": 0.22247640788555145, + "learning_rate": 4.6731703509830744e-05, + "loss": 1.7273, + "step": 17438 + }, + { + "epoch": 5.352670349907919, + "grad_norm": 0.2399090677499771, + "learning_rate": 4.6726743596363574e-05, + "loss": 1.7708, + "step": 17439 + }, + { + "epoch": 5.352977286678944, + "grad_norm": 0.2550101578235626, + "learning_rate": 4.6721783715244674e-05, + "loss": 1.7016, + "step": 17440 + }, + { + "epoch": 5.353284223449969, + "grad_norm": 0.19929546117782593, + "learning_rate": 4.6716823866523e-05, + "loss": 1.7417, + "step": 17441 + }, + { + "epoch": 5.3535911602209945, + "grad_norm": 0.2496672421693802, + "learning_rate": 4.671186405024761e-05, + "loss": 1.72, + "step": 17442 + }, + { + "epoch": 5.35389809699202, + "grad_norm": 0.19827665388584137, + "learning_rate": 4.67069042664675e-05, + "loss": 1.7515, + "step": 17443 + }, + { + "epoch": 5.354205033763045, + "grad_norm": 0.2528775930404663, + "learning_rate": 4.670194451523171e-05, + "loss": 1.7429, + "step": 17444 + }, + { + "epoch": 5.35451197053407, + "grad_norm": 0.19569729268550873, + "learning_rate": 4.6696984796589215e-05, + "loss": 1.7314, + "step": 17445 + }, + { + "epoch": 5.354818907305095, + "grad_norm": 0.21892370283603668, + 
"learning_rate": 4.669202511058908e-05, + "loss": 1.7331, + "step": 17446 + }, + { + "epoch": 5.35512584407612, + "grad_norm": 0.21609409153461456, + "learning_rate": 4.668706545728026e-05, + "loss": 1.7267, + "step": 17447 + }, + { + "epoch": 5.355432780847146, + "grad_norm": 0.2631370425224304, + "learning_rate": 4.668210583671182e-05, + "loss": 1.7513, + "step": 17448 + }, + { + "epoch": 5.355739717618171, + "grad_norm": 0.31327441334724426, + "learning_rate": 4.667714624893274e-05, + "loss": 1.7936, + "step": 17449 + }, + { + "epoch": 5.356046654389196, + "grad_norm": 0.21602430939674377, + "learning_rate": 4.667218669399207e-05, + "loss": 1.7387, + "step": 17450 + }, + { + "epoch": 5.356353591160221, + "grad_norm": 0.2895040214061737, + "learning_rate": 4.6667227171938784e-05, + "loss": 1.7293, + "step": 17451 + }, + { + "epoch": 5.356660527931246, + "grad_norm": 0.35150307416915894, + "learning_rate": 4.666226768282193e-05, + "loss": 1.8215, + "step": 17452 + }, + { + "epoch": 5.356967464702271, + "grad_norm": 0.19034281373023987, + "learning_rate": 4.665730822669048e-05, + "loss": 1.702, + "step": 17453 + }, + { + "epoch": 5.357274401473297, + "grad_norm": 0.25586241483688354, + "learning_rate": 4.6652348803593484e-05, + "loss": 1.7809, + "step": 17454 + }, + { + "epoch": 5.357581338244322, + "grad_norm": 0.23919305205345154, + "learning_rate": 4.6647389413579944e-05, + "loss": 1.7555, + "step": 17455 + }, + { + "epoch": 5.3578882750153465, + "grad_norm": 0.22707165777683258, + "learning_rate": 4.664243005669885e-05, + "loss": 1.7633, + "step": 17456 + }, + { + "epoch": 5.358195211786372, + "grad_norm": 0.20666839182376862, + "learning_rate": 4.663747073299925e-05, + "loss": 1.6522, + "step": 17457 + }, + { + "epoch": 5.358502148557397, + "grad_norm": 0.20557542145252228, + "learning_rate": 4.663251144253012e-05, + "loss": 1.73, + "step": 17458 + }, + { + "epoch": 5.3588090853284225, + "grad_norm": 0.22375571727752686, + "learning_rate": 
4.662755218534049e-05, + "loss": 1.7189, + "step": 17459 + }, + { + "epoch": 5.359116022099448, + "grad_norm": 0.261393278837204, + "learning_rate": 4.662259296147936e-05, + "loss": 1.6863, + "step": 17460 + }, + { + "epoch": 5.359422958870473, + "grad_norm": 0.2279379516839981, + "learning_rate": 4.6617633770995764e-05, + "loss": 1.7332, + "step": 17461 + }, + { + "epoch": 5.359729895641498, + "grad_norm": 0.2194606065750122, + "learning_rate": 4.6612674613938666e-05, + "loss": 1.7324, + "step": 17462 + }, + { + "epoch": 5.360036832412523, + "grad_norm": 0.27714410424232483, + "learning_rate": 4.660771549035713e-05, + "loss": 1.7386, + "step": 17463 + }, + { + "epoch": 5.360343769183548, + "grad_norm": 0.2118787169456482, + "learning_rate": 4.660275640030012e-05, + "loss": 1.7587, + "step": 17464 + }, + { + "epoch": 5.360650705954574, + "grad_norm": 0.2546979784965515, + "learning_rate": 4.6597797343816665e-05, + "loss": 1.7756, + "step": 17465 + }, + { + "epoch": 5.360957642725599, + "grad_norm": 0.194237619638443, + "learning_rate": 4.659283832095577e-05, + "loss": 1.7351, + "step": 17466 + }, + { + "epoch": 5.361264579496623, + "grad_norm": 0.23448583483695984, + "learning_rate": 4.658787933176646e-05, + "loss": 1.7051, + "step": 17467 + }, + { + "epoch": 5.361571516267649, + "grad_norm": 0.22796298563480377, + "learning_rate": 4.65829203762977e-05, + "loss": 1.7395, + "step": 17468 + }, + { + "epoch": 5.361878453038674, + "grad_norm": 0.22674904763698578, + "learning_rate": 4.657796145459855e-05, + "loss": 1.714, + "step": 17469 + }, + { + "epoch": 5.362185389809699, + "grad_norm": 0.2697311341762543, + "learning_rate": 4.657300256671797e-05, + "loss": 1.8271, + "step": 17470 + }, + { + "epoch": 5.362492326580725, + "grad_norm": 0.28040480613708496, + "learning_rate": 4.6568043712705004e-05, + "loss": 1.8192, + "step": 17471 + }, + { + "epoch": 5.362799263351749, + "grad_norm": 0.21100232005119324, + "learning_rate": 4.6563084892608644e-05, + "loss": 1.7285, + 
"step": 17472 + }, + { + "epoch": 5.3631062001227745, + "grad_norm": 0.23545897006988525, + "learning_rate": 4.655812610647787e-05, + "loss": 1.7302, + "step": 17473 + }, + { + "epoch": 5.3634131368938, + "grad_norm": 0.23278315365314484, + "learning_rate": 4.655316735436174e-05, + "loss": 1.7749, + "step": 17474 + }, + { + "epoch": 5.363720073664825, + "grad_norm": 0.333763986825943, + "learning_rate": 4.65482086363092e-05, + "loss": 1.7393, + "step": 17475 + }, + { + "epoch": 5.3640270104358505, + "grad_norm": 0.2743878662586212, + "learning_rate": 4.6543249952369306e-05, + "loss": 1.7274, + "step": 17476 + }, + { + "epoch": 5.364333947206875, + "grad_norm": 0.234402596950531, + "learning_rate": 4.6538291302591024e-05, + "loss": 1.7848, + "step": 17477 + }, + { + "epoch": 5.3646408839779, + "grad_norm": 0.29100897908210754, + "learning_rate": 4.65333326870234e-05, + "loss": 1.7698, + "step": 17478 + }, + { + "epoch": 5.364947820748926, + "grad_norm": 0.24178378283977509, + "learning_rate": 4.652837410571539e-05, + "loss": 1.8142, + "step": 17479 + }, + { + "epoch": 5.365254757519951, + "grad_norm": 0.4189155101776123, + "learning_rate": 4.652341555871605e-05, + "loss": 1.7435, + "step": 17480 + }, + { + "epoch": 5.365561694290976, + "grad_norm": 0.40106773376464844, + "learning_rate": 4.651845704607433e-05, + "loss": 1.837, + "step": 17481 + }, + { + "epoch": 5.365868631062002, + "grad_norm": 0.24127443134784698, + "learning_rate": 4.651349856783927e-05, + "loss": 1.7257, + "step": 17482 + }, + { + "epoch": 5.366175567833026, + "grad_norm": 0.412812739610672, + "learning_rate": 4.650854012405985e-05, + "loss": 1.762, + "step": 17483 + }, + { + "epoch": 5.366482504604051, + "grad_norm": 0.2636469602584839, + "learning_rate": 4.65035817147851e-05, + "loss": 1.7995, + "step": 17484 + }, + { + "epoch": 5.366789441375077, + "grad_norm": 0.282186895608902, + "learning_rate": 4.649862334006399e-05, + "loss": 1.75, + "step": 17485 + }, + { + "epoch": 5.367096378146102, + 
"grad_norm": 0.3280154764652252, + "learning_rate": 4.649366499994555e-05, + "loss": 1.7668, + "step": 17486 + }, + { + "epoch": 5.367403314917127, + "grad_norm": 0.24608035385608673, + "learning_rate": 4.648870669447875e-05, + "loss": 1.8332, + "step": 17487 + }, + { + "epoch": 5.367710251688152, + "grad_norm": 0.21927174925804138, + "learning_rate": 4.648374842371262e-05, + "loss": 1.7365, + "step": 17488 + }, + { + "epoch": 5.368017188459177, + "grad_norm": 0.2658425569534302, + "learning_rate": 4.6478790187696164e-05, + "loss": 1.841, + "step": 17489 + }, + { + "epoch": 5.3683241252302025, + "grad_norm": 0.2302858531475067, + "learning_rate": 4.647383198647834e-05, + "loss": 1.7882, + "step": 17490 + }, + { + "epoch": 5.368631062001228, + "grad_norm": 0.2562740743160248, + "learning_rate": 4.64688738201082e-05, + "loss": 1.7188, + "step": 17491 + }, + { + "epoch": 5.368937998772253, + "grad_norm": 0.28140220046043396, + "learning_rate": 4.646391568863469e-05, + "loss": 1.7482, + "step": 17492 + }, + { + "epoch": 5.3692449355432785, + "grad_norm": 0.21040008962154388, + "learning_rate": 4.6458957592106855e-05, + "loss": 1.7695, + "step": 17493 + }, + { + "epoch": 5.369551872314303, + "grad_norm": 0.25322291254997253, + "learning_rate": 4.645399953057367e-05, + "loss": 1.7127, + "step": 17494 + }, + { + "epoch": 5.369858809085328, + "grad_norm": 0.2239738404750824, + "learning_rate": 4.644904150408415e-05, + "loss": 1.7376, + "step": 17495 + }, + { + "epoch": 5.370165745856354, + "grad_norm": 0.21432901918888092, + "learning_rate": 4.644408351268727e-05, + "loss": 1.7156, + "step": 17496 + }, + { + "epoch": 5.370472682627379, + "grad_norm": 0.3057272732257843, + "learning_rate": 4.643912555643205e-05, + "loss": 1.7706, + "step": 17497 + }, + { + "epoch": 5.370779619398404, + "grad_norm": 0.2826928496360779, + "learning_rate": 4.643416763536748e-05, + "loss": 1.8298, + "step": 17498 + }, + { + "epoch": 5.371086556169429, + "grad_norm": 0.2395278513431549, + 
"learning_rate": 4.642920974954255e-05, + "loss": 1.7357, + "step": 17499 + }, + { + "epoch": 5.371393492940454, + "grad_norm": 0.21004743874073029, + "learning_rate": 4.642425189900626e-05, + "loss": 1.7263, + "step": 17500 + }, + { + "epoch": 5.371700429711479, + "grad_norm": 0.23981697857379913, + "learning_rate": 4.641929408380761e-05, + "loss": 1.7341, + "step": 17501 + }, + { + "epoch": 5.372007366482505, + "grad_norm": 0.1984727531671524, + "learning_rate": 4.641433630399559e-05, + "loss": 1.7133, + "step": 17502 + }, + { + "epoch": 5.37231430325353, + "grad_norm": 0.22153446078300476, + "learning_rate": 4.640937855961922e-05, + "loss": 1.8028, + "step": 17503 + }, + { + "epoch": 5.3726212400245545, + "grad_norm": 0.24257974326610565, + "learning_rate": 4.6404420850727455e-05, + "loss": 1.7842, + "step": 17504 + }, + { + "epoch": 5.37292817679558, + "grad_norm": 0.19444705545902252, + "learning_rate": 4.6399463177369316e-05, + "loss": 1.7296, + "step": 17505 + }, + { + "epoch": 5.373235113566605, + "grad_norm": 0.2068849354982376, + "learning_rate": 4.6394505539593806e-05, + "loss": 1.6949, + "step": 17506 + }, + { + "epoch": 5.3735420503376305, + "grad_norm": 0.21762309968471527, + "learning_rate": 4.638954793744989e-05, + "loss": 1.7556, + "step": 17507 + }, + { + "epoch": 5.373848987108656, + "grad_norm": 0.20791584253311157, + "learning_rate": 4.638459037098659e-05, + "loss": 1.7442, + "step": 17508 + }, + { + "epoch": 5.37415592387968, + "grad_norm": 0.27774497866630554, + "learning_rate": 4.6379632840252875e-05, + "loss": 1.7834, + "step": 17509 + }, + { + "epoch": 5.374462860650706, + "grad_norm": 0.24211421608924866, + "learning_rate": 4.637467534529775e-05, + "loss": 1.819, + "step": 17510 + }, + { + "epoch": 5.374769797421731, + "grad_norm": 0.24857789278030396, + "learning_rate": 4.636971788617022e-05, + "loss": 1.7483, + "step": 17511 + }, + { + "epoch": 5.375076734192756, + "grad_norm": 0.25142937898635864, + "learning_rate": 
4.636476046291925e-05, + "loss": 1.7405, + "step": 17512 + }, + { + "epoch": 5.375383670963782, + "grad_norm": 0.25860801339149475, + "learning_rate": 4.6359803075593846e-05, + "loss": 1.7821, + "step": 17513 + }, + { + "epoch": 5.375690607734807, + "grad_norm": 0.25223109126091003, + "learning_rate": 4.635484572424302e-05, + "loss": 1.738, + "step": 17514 + }, + { + "epoch": 5.3759975445058314, + "grad_norm": 0.22931768000125885, + "learning_rate": 4.634988840891573e-05, + "loss": 1.7717, + "step": 17515 + }, + { + "epoch": 5.376304481276857, + "grad_norm": 0.21371231973171234, + "learning_rate": 4.6344931129661e-05, + "loss": 1.7741, + "step": 17516 + }, + { + "epoch": 5.376611418047882, + "grad_norm": 0.2653632164001465, + "learning_rate": 4.633997388652778e-05, + "loss": 1.7548, + "step": 17517 + }, + { + "epoch": 5.3769183548189075, + "grad_norm": 0.2559951841831207, + "learning_rate": 4.6335016679565094e-05, + "loss": 1.7833, + "step": 17518 + }, + { + "epoch": 5.377225291589933, + "grad_norm": 0.22560031712055206, + "learning_rate": 4.6330059508821914e-05, + "loss": 1.6929, + "step": 17519 + }, + { + "epoch": 5.377532228360957, + "grad_norm": 0.3084852695465088, + "learning_rate": 4.6325102374347255e-05, + "loss": 1.8107, + "step": 17520 + }, + { + "epoch": 5.377839165131983, + "grad_norm": 0.3329267203807831, + "learning_rate": 4.632014527619007e-05, + "loss": 1.6791, + "step": 17521 + }, + { + "epoch": 5.378146101903008, + "grad_norm": 0.26274019479751587, + "learning_rate": 4.631518821439939e-05, + "loss": 1.7187, + "step": 17522 + }, + { + "epoch": 5.378453038674033, + "grad_norm": 0.3769492208957672, + "learning_rate": 4.6310231189024165e-05, + "loss": 1.8366, + "step": 17523 + }, + { + "epoch": 5.378759975445059, + "grad_norm": 0.2503921687602997, + "learning_rate": 4.6305274200113385e-05, + "loss": 1.7281, + "step": 17524 + }, + { + "epoch": 5.379066912216084, + "grad_norm": 0.26305708289146423, + "learning_rate": 4.6300317247716074e-05, + "loss": 
1.7231, + "step": 17525 + }, + { + "epoch": 5.379373848987108, + "grad_norm": 0.31899142265319824, + "learning_rate": 4.629536033188118e-05, + "loss": 1.8025, + "step": 17526 + }, + { + "epoch": 5.379680785758134, + "grad_norm": 0.21400104463100433, + "learning_rate": 4.629040345265772e-05, + "loss": 1.7481, + "step": 17527 + }, + { + "epoch": 5.379987722529159, + "grad_norm": 0.23147371411323547, + "learning_rate": 4.628544661009465e-05, + "loss": 1.7049, + "step": 17528 + }, + { + "epoch": 5.380294659300184, + "grad_norm": 0.21156759560108185, + "learning_rate": 4.628048980424099e-05, + "loss": 1.806, + "step": 17529 + }, + { + "epoch": 5.38060159607121, + "grad_norm": 0.22061556577682495, + "learning_rate": 4.6275533035145685e-05, + "loss": 1.7606, + "step": 17530 + }, + { + "epoch": 5.380908532842234, + "grad_norm": 0.23379987478256226, + "learning_rate": 4.6270576302857774e-05, + "loss": 1.7874, + "step": 17531 + }, + { + "epoch": 5.3812154696132595, + "grad_norm": 0.24738669395446777, + "learning_rate": 4.62656196074262e-05, + "loss": 1.7611, + "step": 17532 + }, + { + "epoch": 5.381522406384285, + "grad_norm": 0.19738905131816864, + "learning_rate": 4.6260662948899974e-05, + "loss": 1.7375, + "step": 17533 + }, + { + "epoch": 5.38182934315531, + "grad_norm": 0.2327810823917389, + "learning_rate": 4.6255706327328044e-05, + "loss": 1.7188, + "step": 17534 + }, + { + "epoch": 5.3821362799263355, + "grad_norm": 0.18944145739078522, + "learning_rate": 4.625074974275944e-05, + "loss": 1.6672, + "step": 17535 + }, + { + "epoch": 5.382443216697361, + "grad_norm": 0.20943734049797058, + "learning_rate": 4.624579319524311e-05, + "loss": 1.7238, + "step": 17536 + }, + { + "epoch": 5.382750153468385, + "grad_norm": 0.2060960829257965, + "learning_rate": 4.6240836684828074e-05, + "loss": 1.744, + "step": 17537 + }, + { + "epoch": 5.383057090239411, + "grad_norm": 0.19089816510677338, + "learning_rate": 4.6235880211563264e-05, + "loss": 1.6884, + "step": 17538 + }, + { + 
"epoch": 5.383364027010436, + "grad_norm": 0.22362665832042694, + "learning_rate": 4.623092377549772e-05, + "loss": 1.7076, + "step": 17539 + }, + { + "epoch": 5.383670963781461, + "grad_norm": 0.19429968297481537, + "learning_rate": 4.622596737668039e-05, + "loss": 1.7315, + "step": 17540 + }, + { + "epoch": 5.383977900552487, + "grad_norm": 0.20481903851032257, + "learning_rate": 4.622101101516024e-05, + "loss": 1.711, + "step": 17541 + }, + { + "epoch": 5.384284837323511, + "grad_norm": 0.19181163609027863, + "learning_rate": 4.6216054690986304e-05, + "loss": 1.6879, + "step": 17542 + }, + { + "epoch": 5.384591774094536, + "grad_norm": 0.23105846345424652, + "learning_rate": 4.6211098404207514e-05, + "loss": 1.7797, + "step": 17543 + }, + { + "epoch": 5.384898710865562, + "grad_norm": 0.2742008864879608, + "learning_rate": 4.6206142154872886e-05, + "loss": 1.7404, + "step": 17544 + }, + { + "epoch": 5.385205647636587, + "grad_norm": 0.2256750613451004, + "learning_rate": 4.6201185943031365e-05, + "loss": 1.7616, + "step": 17545 + }, + { + "epoch": 5.385512584407612, + "grad_norm": 0.23230868577957153, + "learning_rate": 4.6196229768731964e-05, + "loss": 1.7457, + "step": 17546 + }, + { + "epoch": 5.385819521178637, + "grad_norm": 0.2200126200914383, + "learning_rate": 4.6191273632023634e-05, + "loss": 1.7835, + "step": 17547 + }, + { + "epoch": 5.386126457949662, + "grad_norm": 0.21903863549232483, + "learning_rate": 4.6186317532955395e-05, + "loss": 1.7315, + "step": 17548 + }, + { + "epoch": 5.3864333947206875, + "grad_norm": 0.1915556788444519, + "learning_rate": 4.6181361471576186e-05, + "loss": 1.6786, + "step": 17549 + }, + { + "epoch": 5.386740331491713, + "grad_norm": 0.20177799463272095, + "learning_rate": 4.617640544793501e-05, + "loss": 1.7453, + "step": 17550 + }, + { + "epoch": 5.387047268262738, + "grad_norm": 0.2598256766796112, + "learning_rate": 4.617144946208083e-05, + "loss": 1.7931, + "step": 17551 + }, + { + "epoch": 5.387354205033763, + 
"grad_norm": 0.2357153594493866, + "learning_rate": 4.616649351406263e-05, + "loss": 1.7932, + "step": 17552 + }, + { + "epoch": 5.387661141804788, + "grad_norm": 0.2228964865207672, + "learning_rate": 4.616153760392938e-05, + "loss": 1.7725, + "step": 17553 + }, + { + "epoch": 5.387968078575813, + "grad_norm": 0.20811811089515686, + "learning_rate": 4.6156581731730085e-05, + "loss": 1.744, + "step": 17554 + }, + { + "epoch": 5.388275015346839, + "grad_norm": 0.20008429884910583, + "learning_rate": 4.615162589751369e-05, + "loss": 1.6973, + "step": 17555 + }, + { + "epoch": 5.388581952117864, + "grad_norm": 0.20487523078918457, + "learning_rate": 4.614667010132919e-05, + "loss": 1.7712, + "step": 17556 + }, + { + "epoch": 5.388888888888889, + "grad_norm": 0.21279677748680115, + "learning_rate": 4.6141714343225554e-05, + "loss": 1.7783, + "step": 17557 + }, + { + "epoch": 5.389195825659914, + "grad_norm": 0.28035736083984375, + "learning_rate": 4.613675862325174e-05, + "loss": 1.767, + "step": 17558 + }, + { + "epoch": 5.389502762430939, + "grad_norm": 0.27426794171333313, + "learning_rate": 4.613180294145677e-05, + "loss": 1.7909, + "step": 17559 + }, + { + "epoch": 5.389809699201964, + "grad_norm": 0.22420327365398407, + "learning_rate": 4.612684729788957e-05, + "loss": 1.6902, + "step": 17560 + }, + { + "epoch": 5.39011663597299, + "grad_norm": 0.19799382984638214, + "learning_rate": 4.612189169259915e-05, + "loss": 1.7276, + "step": 17561 + }, + { + "epoch": 5.390423572744015, + "grad_norm": 0.2508823573589325, + "learning_rate": 4.611693612563445e-05, + "loss": 1.7445, + "step": 17562 + }, + { + "epoch": 5.3907305095150395, + "grad_norm": 0.20835694670677185, + "learning_rate": 4.611198059704448e-05, + "loss": 1.696, + "step": 17563 + }, + { + "epoch": 5.391037446286065, + "grad_norm": 0.22136010229587555, + "learning_rate": 4.6107025106878176e-05, + "loss": 1.7701, + "step": 17564 + }, + { + "epoch": 5.39134438305709, + "grad_norm": 0.23835612833499908, + 
"learning_rate": 4.610206965518456e-05, + "loss": 1.7494, + "step": 17565 + }, + { + "epoch": 5.3916513198281155, + "grad_norm": 0.26142916083335876, + "learning_rate": 4.6097114242012554e-05, + "loss": 1.7616, + "step": 17566 + }, + { + "epoch": 5.391958256599141, + "grad_norm": 0.3366851806640625, + "learning_rate": 4.6092158867411175e-05, + "loss": 1.7409, + "step": 17567 + }, + { + "epoch": 5.392265193370166, + "grad_norm": 0.2592991292476654, + "learning_rate": 4.608720353142935e-05, + "loss": 1.7469, + "step": 17568 + }, + { + "epoch": 5.392572130141191, + "grad_norm": 0.25810322165489197, + "learning_rate": 4.608224823411608e-05, + "loss": 1.7345, + "step": 17569 + }, + { + "epoch": 5.392879066912216, + "grad_norm": 0.26776888966560364, + "learning_rate": 4.607729297552032e-05, + "loss": 1.7698, + "step": 17570 + }, + { + "epoch": 5.393186003683241, + "grad_norm": 0.21023939549922943, + "learning_rate": 4.607233775569107e-05, + "loss": 1.7681, + "step": 17571 + }, + { + "epoch": 5.393492940454267, + "grad_norm": 0.24452096223831177, + "learning_rate": 4.6067382574677265e-05, + "loss": 1.8154, + "step": 17572 + }, + { + "epoch": 5.393799877225292, + "grad_norm": 0.27084338665008545, + "learning_rate": 4.606242743252791e-05, + "loss": 1.7106, + "step": 17573 + }, + { + "epoch": 5.394106813996316, + "grad_norm": 0.24783825874328613, + "learning_rate": 4.605747232929195e-05, + "loss": 1.713, + "step": 17574 + }, + { + "epoch": 5.394413750767342, + "grad_norm": 0.2528151869773865, + "learning_rate": 4.6052517265018333e-05, + "loss": 1.8475, + "step": 17575 + }, + { + "epoch": 5.394720687538367, + "grad_norm": 0.24361065030097961, + "learning_rate": 4.604756223975609e-05, + "loss": 1.7414, + "step": 17576 + }, + { + "epoch": 5.395027624309392, + "grad_norm": 0.2751234769821167, + "learning_rate": 4.604260725355412e-05, + "loss": 1.7603, + "step": 17577 + }, + { + "epoch": 5.395334561080418, + "grad_norm": 0.23183637857437134, + "learning_rate": 
4.603765230646146e-05, + "loss": 1.7053, + "step": 17578 + }, + { + "epoch": 5.395641497851442, + "grad_norm": 0.27462145686149597, + "learning_rate": 4.6032697398527005e-05, + "loss": 1.746, + "step": 17579 + }, + { + "epoch": 5.3959484346224675, + "grad_norm": 0.3665321171283722, + "learning_rate": 4.602774252979978e-05, + "loss": 1.6883, + "step": 17580 + }, + { + "epoch": 5.396255371393493, + "grad_norm": 0.22438424825668335, + "learning_rate": 4.602278770032872e-05, + "loss": 1.7473, + "step": 17581 + }, + { + "epoch": 5.396562308164518, + "grad_norm": 0.38713687658309937, + "learning_rate": 4.601783291016282e-05, + "loss": 1.7993, + "step": 17582 + }, + { + "epoch": 5.3968692449355435, + "grad_norm": 0.3399868905544281, + "learning_rate": 4.6012878159351015e-05, + "loss": 1.7709, + "step": 17583 + }, + { + "epoch": 5.397176181706568, + "grad_norm": 0.21916119754314423, + "learning_rate": 4.60079234479423e-05, + "loss": 1.7351, + "step": 17584 + }, + { + "epoch": 5.397483118477593, + "grad_norm": 0.3796394467353821, + "learning_rate": 4.600296877598561e-05, + "loss": 1.7534, + "step": 17585 + }, + { + "epoch": 5.397790055248619, + "grad_norm": 0.27824562788009644, + "learning_rate": 4.599801414352993e-05, + "loss": 1.6962, + "step": 17586 + }, + { + "epoch": 5.398096992019644, + "grad_norm": 0.21037112176418304, + "learning_rate": 4.599305955062421e-05, + "loss": 1.7062, + "step": 17587 + }, + { + "epoch": 5.398403928790669, + "grad_norm": 0.3373035192489624, + "learning_rate": 4.598810499731745e-05, + "loss": 1.8263, + "step": 17588 + }, + { + "epoch": 5.398710865561695, + "grad_norm": 0.2560507357120514, + "learning_rate": 4.5983150483658564e-05, + "loss": 1.7232, + "step": 17589 + }, + { + "epoch": 5.399017802332719, + "grad_norm": 0.23010993003845215, + "learning_rate": 4.5978196009696564e-05, + "loss": 1.805, + "step": 17590 + }, + { + "epoch": 5.399324739103744, + "grad_norm": 0.32955634593963623, + "learning_rate": 4.597324157548037e-05, + "loss": 
1.7018, + "step": 17591 + }, + { + "epoch": 5.39963167587477, + "grad_norm": 0.2534363865852356, + "learning_rate": 4.5968287181058953e-05, + "loss": 1.6919, + "step": 17592 + }, + { + "epoch": 5.399938612645795, + "grad_norm": 0.23179130256175995, + "learning_rate": 4.5963332826481314e-05, + "loss": 1.7237, + "step": 17593 + }, + { + "epoch": 5.4002455494168204, + "grad_norm": 0.37712663412094116, + "learning_rate": 4.5958378511796365e-05, + "loss": 1.7694, + "step": 17594 + }, + { + "epoch": 5.400552486187845, + "grad_norm": 0.21228717267513275, + "learning_rate": 4.59534242370531e-05, + "loss": 1.7528, + "step": 17595 + }, + { + "epoch": 5.40085942295887, + "grad_norm": 0.2818812429904938, + "learning_rate": 4.5948470002300454e-05, + "loss": 1.8214, + "step": 17596 + }, + { + "epoch": 5.401166359729896, + "grad_norm": 0.24916675686836243, + "learning_rate": 4.5943515807587415e-05, + "loss": 1.7792, + "step": 17597 + }, + { + "epoch": 5.401473296500921, + "grad_norm": 0.2096913456916809, + "learning_rate": 4.593856165296291e-05, + "loss": 1.6983, + "step": 17598 + }, + { + "epoch": 5.401780233271946, + "grad_norm": 0.271124005317688, + "learning_rate": 4.593360753847595e-05, + "loss": 1.7534, + "step": 17599 + }, + { + "epoch": 5.402087170042972, + "grad_norm": 0.24798092246055603, + "learning_rate": 4.5928653464175435e-05, + "loss": 1.7783, + "step": 17600 + }, + { + "epoch": 5.402394106813996, + "grad_norm": 0.3531748056411743, + "learning_rate": 4.592369943011038e-05, + "loss": 1.7834, + "step": 17601 + }, + { + "epoch": 5.402701043585021, + "grad_norm": 0.29650232195854187, + "learning_rate": 4.591874543632969e-05, + "loss": 1.7186, + "step": 17602 + }, + { + "epoch": 5.403007980356047, + "grad_norm": 0.25578248500823975, + "learning_rate": 4.591379148288236e-05, + "loss": 1.7849, + "step": 17603 + }, + { + "epoch": 5.403314917127072, + "grad_norm": 0.3790532946586609, + "learning_rate": 4.590883756981733e-05, + "loss": 1.7192, + "step": 17604 + }, + { + 
"epoch": 5.403621853898097, + "grad_norm": 0.23684249818325043, + "learning_rate": 4.590388369718359e-05, + "loss": 1.7171, + "step": 17605 + }, + { + "epoch": 5.403928790669122, + "grad_norm": 0.267702579498291, + "learning_rate": 4.589892986503005e-05, + "loss": 1.7181, + "step": 17606 + }, + { + "epoch": 5.404235727440147, + "grad_norm": 0.29105648398399353, + "learning_rate": 4.5893976073405704e-05, + "loss": 1.7395, + "step": 17607 + }, + { + "epoch": 5.4045426642111725, + "grad_norm": 0.2266589254140854, + "learning_rate": 4.588902232235949e-05, + "loss": 1.7244, + "step": 17608 + }, + { + "epoch": 5.404849600982198, + "grad_norm": 0.24065524339675903, + "learning_rate": 4.588406861194035e-05, + "loss": 1.7398, + "step": 17609 + }, + { + "epoch": 5.405156537753223, + "grad_norm": 0.23166650533676147, + "learning_rate": 4.587911494219728e-05, + "loss": 1.7592, + "step": 17610 + }, + { + "epoch": 5.4054634745242485, + "grad_norm": 0.19882038235664368, + "learning_rate": 4.5874161313179186e-05, + "loss": 1.7087, + "step": 17611 + }, + { + "epoch": 5.405770411295273, + "grad_norm": 0.2688273787498474, + "learning_rate": 4.5869207724935076e-05, + "loss": 1.7791, + "step": 17612 + }, + { + "epoch": 5.406077348066298, + "grad_norm": 0.1970982402563095, + "learning_rate": 4.5864254177513855e-05, + "loss": 1.7079, + "step": 17613 + }, + { + "epoch": 5.406384284837324, + "grad_norm": 0.2531265318393707, + "learning_rate": 4.585930067096451e-05, + "loss": 1.716, + "step": 17614 + }, + { + "epoch": 5.406691221608349, + "grad_norm": 0.2610352337360382, + "learning_rate": 4.585434720533596e-05, + "loss": 1.7133, + "step": 17615 + }, + { + "epoch": 5.406998158379374, + "grad_norm": 0.2420870065689087, + "learning_rate": 4.5849393780677216e-05, + "loss": 1.7044, + "step": 17616 + }, + { + "epoch": 5.407305095150399, + "grad_norm": 0.24078647792339325, + "learning_rate": 4.584444039703717e-05, + "loss": 1.7486, + "step": 17617 + }, + { + "epoch": 5.407612031921424, + 
"grad_norm": 0.19324539601802826, + "learning_rate": 4.583948705446481e-05, + "loss": 1.7439, + "step": 17618 + }, + { + "epoch": 5.407918968692449, + "grad_norm": 0.2311750054359436, + "learning_rate": 4.5834533753009065e-05, + "loss": 1.7794, + "step": 17619 + }, + { + "epoch": 5.408225905463475, + "grad_norm": 0.2554466128349304, + "learning_rate": 4.5829580492718914e-05, + "loss": 1.7146, + "step": 17620 + }, + { + "epoch": 5.4085328422345, + "grad_norm": 0.2679688334465027, + "learning_rate": 4.582462727364328e-05, + "loss": 1.7677, + "step": 17621 + }, + { + "epoch": 5.4088397790055245, + "grad_norm": 0.19292913377285004, + "learning_rate": 4.5819674095831146e-05, + "loss": 1.7544, + "step": 17622 + }, + { + "epoch": 5.40914671577655, + "grad_norm": 0.2146623730659485, + "learning_rate": 4.5814720959331425e-05, + "loss": 1.7182, + "step": 17623 + }, + { + "epoch": 5.409453652547575, + "grad_norm": 0.23098216950893402, + "learning_rate": 4.5809767864193096e-05, + "loss": 1.6844, + "step": 17624 + }, + { + "epoch": 5.4097605893186005, + "grad_norm": 0.22482910752296448, + "learning_rate": 4.5804814810465096e-05, + "loss": 1.7921, + "step": 17625 + }, + { + "epoch": 5.410067526089626, + "grad_norm": 0.22098569571971893, + "learning_rate": 4.579986179819636e-05, + "loss": 1.7419, + "step": 17626 + }, + { + "epoch": 5.41037446286065, + "grad_norm": 0.2131706178188324, + "learning_rate": 4.579490882743588e-05, + "loss": 1.7587, + "step": 17627 + }, + { + "epoch": 5.410681399631676, + "grad_norm": 0.22448734939098358, + "learning_rate": 4.578995589823254e-05, + "loss": 1.6959, + "step": 17628 + }, + { + "epoch": 5.410988336402701, + "grad_norm": 0.22372964024543762, + "learning_rate": 4.578500301063536e-05, + "loss": 1.7462, + "step": 17629 + }, + { + "epoch": 5.411295273173726, + "grad_norm": 0.22140730917453766, + "learning_rate": 4.578005016469322e-05, + "loss": 1.8348, + "step": 17630 + }, + { + "epoch": 5.411602209944752, + "grad_norm": 0.21697622537612915, + 
"learning_rate": 4.577509736045511e-05, + "loss": 1.7634, + "step": 17631 + }, + { + "epoch": 5.411909146715777, + "grad_norm": 0.2044363021850586, + "learning_rate": 4.5770144597969954e-05, + "loss": 1.7095, + "step": 17632 + }, + { + "epoch": 5.412216083486801, + "grad_norm": 0.1910451501607895, + "learning_rate": 4.576519187728674e-05, + "loss": 1.7022, + "step": 17633 + }, + { + "epoch": 5.412523020257827, + "grad_norm": 0.21787554025650024, + "learning_rate": 4.576023919845434e-05, + "loss": 1.7206, + "step": 17634 + }, + { + "epoch": 5.412829957028852, + "grad_norm": 0.2363428920507431, + "learning_rate": 4.575528656152178e-05, + "loss": 1.8052, + "step": 17635 + }, + { + "epoch": 5.413136893799877, + "grad_norm": 0.22830195724964142, + "learning_rate": 4.575033396653793e-05, + "loss": 1.7432, + "step": 17636 + }, + { + "epoch": 5.413443830570903, + "grad_norm": 0.24867239594459534, + "learning_rate": 4.5745381413551794e-05, + "loss": 1.7011, + "step": 17637 + }, + { + "epoch": 5.413750767341927, + "grad_norm": 0.19329775869846344, + "learning_rate": 4.574042890261228e-05, + "loss": 1.7749, + "step": 17638 + }, + { + "epoch": 5.4140577041129525, + "grad_norm": 0.22917115688323975, + "learning_rate": 4.573547643376836e-05, + "loss": 1.7478, + "step": 17639 + }, + { + "epoch": 5.414364640883978, + "grad_norm": 0.23882724344730377, + "learning_rate": 4.573052400706894e-05, + "loss": 1.7396, + "step": 17640 + }, + { + "epoch": 5.414671577655003, + "grad_norm": 0.19127070903778076, + "learning_rate": 4.572557162256301e-05, + "loss": 1.6791, + "step": 17641 + }, + { + "epoch": 5.4149785144260285, + "grad_norm": 0.18385560810565948, + "learning_rate": 4.5720619280299475e-05, + "loss": 1.7288, + "step": 17642 + }, + { + "epoch": 5.415285451197054, + "grad_norm": 0.19845189154148102, + "learning_rate": 4.571566698032728e-05, + "loss": 1.7525, + "step": 17643 + }, + { + "epoch": 5.415592387968078, + "grad_norm": 0.18987210094928741, + "learning_rate": 
4.571071472269539e-05, + "loss": 1.7253, + "step": 17644 + }, + { + "epoch": 5.415899324739104, + "grad_norm": 0.18257199227809906, + "learning_rate": 4.570576250745271e-05, + "loss": 1.7051, + "step": 17645 + }, + { + "epoch": 5.416206261510129, + "grad_norm": 0.22803467512130737, + "learning_rate": 4.570081033464823e-05, + "loss": 1.7478, + "step": 17646 + }, + { + "epoch": 5.416513198281154, + "grad_norm": 0.18763841688632965, + "learning_rate": 4.569585820433084e-05, + "loss": 1.7316, + "step": 17647 + }, + { + "epoch": 5.41682013505218, + "grad_norm": 0.23974654078483582, + "learning_rate": 4.56909061165495e-05, + "loss": 1.7566, + "step": 17648 + }, + { + "epoch": 5.417127071823204, + "grad_norm": 0.24336253106594086, + "learning_rate": 4.568595407135315e-05, + "loss": 1.7468, + "step": 17649 + }, + { + "epoch": 5.417434008594229, + "grad_norm": 0.23891226947307587, + "learning_rate": 4.5681002068790755e-05, + "loss": 1.7201, + "step": 17650 + }, + { + "epoch": 5.417740945365255, + "grad_norm": 0.19209685921669006, + "learning_rate": 4.56760501089112e-05, + "loss": 1.713, + "step": 17651 + }, + { + "epoch": 5.41804788213628, + "grad_norm": 0.2407880276441574, + "learning_rate": 4.567109819176349e-05, + "loss": 1.7073, + "step": 17652 + }, + { + "epoch": 5.418354818907305, + "grad_norm": 0.2385055273771286, + "learning_rate": 4.5666146317396485e-05, + "loss": 1.7387, + "step": 17653 + }, + { + "epoch": 5.41866175567833, + "grad_norm": 0.22068475186824799, + "learning_rate": 4.566119448585918e-05, + "loss": 1.7116, + "step": 17654 + }, + { + "epoch": 5.418968692449355, + "grad_norm": 0.318375825881958, + "learning_rate": 4.5656242697200496e-05, + "loss": 1.7659, + "step": 17655 + }, + { + "epoch": 5.4192756292203805, + "grad_norm": 0.25311973690986633, + "learning_rate": 4.5651290951469366e-05, + "loss": 1.7814, + "step": 17656 + }, + { + "epoch": 5.419582565991406, + "grad_norm": 0.18701443076133728, + "learning_rate": 4.5646339248714735e-05, + "loss": 1.6993, 
+ "step": 17657 + }, + { + "epoch": 5.419889502762431, + "grad_norm": 0.2964496314525604, + "learning_rate": 4.5641387588985516e-05, + "loss": 1.8254, + "step": 17658 + }, + { + "epoch": 5.420196439533456, + "grad_norm": 0.19447220861911774, + "learning_rate": 4.563643597233067e-05, + "loss": 1.7208, + "step": 17659 + }, + { + "epoch": 5.420503376304481, + "grad_norm": 0.21666039526462555, + "learning_rate": 4.5631484398799105e-05, + "loss": 1.6695, + "step": 17660 + }, + { + "epoch": 5.420810313075506, + "grad_norm": 0.23104412853717804, + "learning_rate": 4.5626532868439796e-05, + "loss": 1.7449, + "step": 17661 + }, + { + "epoch": 5.421117249846532, + "grad_norm": 0.20463459193706512, + "learning_rate": 4.562158138130163e-05, + "loss": 1.6714, + "step": 17662 + }, + { + "epoch": 5.421424186617557, + "grad_norm": 0.21948079764842987, + "learning_rate": 4.561662993743359e-05, + "loss": 1.6957, + "step": 17663 + }, + { + "epoch": 5.421731123388582, + "grad_norm": 0.2672746777534485, + "learning_rate": 4.561167853688455e-05, + "loss": 1.7137, + "step": 17664 + }, + { + "epoch": 5.422038060159607, + "grad_norm": 0.2652325928211212, + "learning_rate": 4.5606727179703493e-05, + "loss": 1.7943, + "step": 17665 + }, + { + "epoch": 5.422344996930632, + "grad_norm": 0.17761313915252686, + "learning_rate": 4.560177586593933e-05, + "loss": 1.7072, + "step": 17666 + }, + { + "epoch": 5.422651933701657, + "grad_norm": 0.24759770929813385, + "learning_rate": 4.5596824595641e-05, + "loss": 1.7807, + "step": 17667 + }, + { + "epoch": 5.422958870472683, + "grad_norm": 0.22191929817199707, + "learning_rate": 4.5591873368857416e-05, + "loss": 1.7668, + "step": 17668 + }, + { + "epoch": 5.423265807243708, + "grad_norm": 0.21293842792510986, + "learning_rate": 4.5586922185637546e-05, + "loss": 1.7304, + "step": 17669 + }, + { + "epoch": 5.4235727440147325, + "grad_norm": 0.2646051049232483, + "learning_rate": 4.5581971046030277e-05, + "loss": 1.7258, + "step": 17670 + }, + { + 
"epoch": 5.423879680785758, + "grad_norm": 0.1894550621509552, + "learning_rate": 4.5577019950084574e-05, + "loss": 1.7066, + "step": 17671 + }, + { + "epoch": 5.424186617556783, + "grad_norm": 0.2533467710018158, + "learning_rate": 4.557206889784934e-05, + "loss": 1.7668, + "step": 17672 + }, + { + "epoch": 5.4244935543278086, + "grad_norm": 0.1972150355577469, + "learning_rate": 4.556711788937352e-05, + "loss": 1.7306, + "step": 17673 + }, + { + "epoch": 5.424800491098834, + "grad_norm": 0.2726735472679138, + "learning_rate": 4.5562166924706054e-05, + "loss": 1.7281, + "step": 17674 + }, + { + "epoch": 5.425107427869859, + "grad_norm": 0.2244454175233841, + "learning_rate": 4.555721600389584e-05, + "loss": 1.7461, + "step": 17675 + }, + { + "epoch": 5.425414364640884, + "grad_norm": 0.19486510753631592, + "learning_rate": 4.555226512699182e-05, + "loss": 1.7361, + "step": 17676 + }, + { + "epoch": 5.425721301411909, + "grad_norm": 0.18128283321857452, + "learning_rate": 4.554731429404293e-05, + "loss": 1.7637, + "step": 17677 + }, + { + "epoch": 5.426028238182934, + "grad_norm": 0.24709749221801758, + "learning_rate": 4.5542363505098084e-05, + "loss": 1.7928, + "step": 17678 + }, + { + "epoch": 5.42633517495396, + "grad_norm": 0.2236633151769638, + "learning_rate": 4.553741276020621e-05, + "loss": 1.8262, + "step": 17679 + }, + { + "epoch": 5.426642111724985, + "grad_norm": 0.2592087984085083, + "learning_rate": 4.553246205941626e-05, + "loss": 1.675, + "step": 17680 + }, + { + "epoch": 5.4269490484960095, + "grad_norm": 0.27751871943473816, + "learning_rate": 4.552751140277712e-05, + "loss": 1.7344, + "step": 17681 + }, + { + "epoch": 5.427255985267035, + "grad_norm": 0.23752287030220032, + "learning_rate": 4.5522560790337746e-05, + "loss": 1.7748, + "step": 17682 + }, + { + "epoch": 5.42756292203806, + "grad_norm": 0.3259925842285156, + "learning_rate": 4.5517610222147035e-05, + "loss": 1.7855, + "step": 17683 + }, + { + "epoch": 5.4278698588090855, + 
"grad_norm": 0.2579646706581116, + "learning_rate": 4.551265969825394e-05, + "loss": 1.7978, + "step": 17684 + }, + { + "epoch": 5.428176795580111, + "grad_norm": 0.3217744827270508, + "learning_rate": 4.550770921870735e-05, + "loss": 1.7793, + "step": 17685 + }, + { + "epoch": 5.428483732351136, + "grad_norm": 0.2930903434753418, + "learning_rate": 4.550275878355624e-05, + "loss": 1.7226, + "step": 17686 + }, + { + "epoch": 5.428790669122161, + "grad_norm": 0.1982879489660263, + "learning_rate": 4.549780839284948e-05, + "loss": 1.6841, + "step": 17687 + }, + { + "epoch": 5.429097605893186, + "grad_norm": 0.20843900740146637, + "learning_rate": 4.5492858046636046e-05, + "loss": 1.7201, + "step": 17688 + }, + { + "epoch": 5.429404542664211, + "grad_norm": 0.23116534948349, + "learning_rate": 4.5487907744964794e-05, + "loss": 1.7565, + "step": 17689 + }, + { + "epoch": 5.429711479435237, + "grad_norm": 0.19177772104740143, + "learning_rate": 4.548295748788471e-05, + "loss": 1.7479, + "step": 17690 + }, + { + "epoch": 5.430018416206262, + "grad_norm": 0.22261449694633484, + "learning_rate": 4.547800727544469e-05, + "loss": 1.7785, + "step": 17691 + }, + { + "epoch": 5.430325352977286, + "grad_norm": 0.20073406398296356, + "learning_rate": 4.547305710769363e-05, + "loss": 1.741, + "step": 17692 + }, + { + "epoch": 5.430632289748312, + "grad_norm": 0.21662208437919617, + "learning_rate": 4.546810698468049e-05, + "loss": 1.7269, + "step": 17693 + }, + { + "epoch": 5.430939226519337, + "grad_norm": 0.19540879130363464, + "learning_rate": 4.546315690645416e-05, + "loss": 1.7141, + "step": 17694 + }, + { + "epoch": 5.431246163290362, + "grad_norm": 0.20063656568527222, + "learning_rate": 4.545820687306358e-05, + "loss": 1.7244, + "step": 17695 + }, + { + "epoch": 5.431553100061388, + "grad_norm": 0.2172660082578659, + "learning_rate": 4.545325688455765e-05, + "loss": 1.7172, + "step": 17696 + }, + { + "epoch": 5.431860036832412, + "grad_norm": 0.2480388581752777, + 
"learning_rate": 4.5448306940985326e-05, + "loss": 1.6994, + "step": 17697 + }, + { + "epoch": 5.4321669736034375, + "grad_norm": 0.22499477863311768, + "learning_rate": 4.544335704239547e-05, + "loss": 1.7405, + "step": 17698 + }, + { + "epoch": 5.432473910374463, + "grad_norm": 0.20655590295791626, + "learning_rate": 4.5438407188837065e-05, + "loss": 1.6867, + "step": 17699 + }, + { + "epoch": 5.432780847145488, + "grad_norm": 0.2045906037092209, + "learning_rate": 4.543345738035896e-05, + "loss": 1.7752, + "step": 17700 + }, + { + "epoch": 5.4330877839165135, + "grad_norm": 0.2092052847146988, + "learning_rate": 4.542850761701013e-05, + "loss": 1.7389, + "step": 17701 + }, + { + "epoch": 5.433394720687538, + "grad_norm": 0.1943730264902115, + "learning_rate": 4.5423557898839446e-05, + "loss": 1.7276, + "step": 17702 + }, + { + "epoch": 5.433701657458563, + "grad_norm": 0.23487289249897003, + "learning_rate": 4.541860822589587e-05, + "loss": 1.8119, + "step": 17703 + }, + { + "epoch": 5.434008594229589, + "grad_norm": 0.204689159989357, + "learning_rate": 4.541365859822827e-05, + "loss": 1.7865, + "step": 17704 + }, + { + "epoch": 5.434315531000614, + "grad_norm": 0.20850931107997894, + "learning_rate": 4.5408709015885604e-05, + "loss": 1.7733, + "step": 17705 + }, + { + "epoch": 5.434622467771639, + "grad_norm": 0.18685877323150635, + "learning_rate": 4.540375947891675e-05, + "loss": 1.7526, + "step": 17706 + }, + { + "epoch": 5.434929404542665, + "grad_norm": 0.2009890079498291, + "learning_rate": 4.539880998737064e-05, + "loss": 1.6904, + "step": 17707 + }, + { + "epoch": 5.435236341313689, + "grad_norm": 0.16602718830108643, + "learning_rate": 4.5393860541296205e-05, + "loss": 1.689, + "step": 17708 + }, + { + "epoch": 5.435543278084714, + "grad_norm": 0.24318818747997284, + "learning_rate": 4.5388911140742315e-05, + "loss": 1.7993, + "step": 17709 + }, + { + "epoch": 5.43585021485574, + "grad_norm": 0.24094417691230774, + "learning_rate": 
4.538396178575793e-05, + "loss": 1.7235, + "step": 17710 + }, + { + "epoch": 5.436157151626765, + "grad_norm": 0.20361751317977905, + "learning_rate": 4.537901247639192e-05, + "loss": 1.7198, + "step": 17711 + }, + { + "epoch": 5.43646408839779, + "grad_norm": 0.2563718259334564, + "learning_rate": 4.537406321269323e-05, + "loss": 1.795, + "step": 17712 + }, + { + "epoch": 5.436771025168815, + "grad_norm": 0.29895591735839844, + "learning_rate": 4.536911399471075e-05, + "loss": 1.7515, + "step": 17713 + }, + { + "epoch": 5.43707796193984, + "grad_norm": 0.22535841166973114, + "learning_rate": 4.536416482249342e-05, + "loss": 1.6998, + "step": 17714 + }, + { + "epoch": 5.4373848987108655, + "grad_norm": 0.26025068759918213, + "learning_rate": 4.53592156960901e-05, + "loss": 1.7821, + "step": 17715 + }, + { + "epoch": 5.437691835481891, + "grad_norm": 0.3473168611526489, + "learning_rate": 4.535426661554975e-05, + "loss": 1.7035, + "step": 17716 + }, + { + "epoch": 5.437998772252916, + "grad_norm": 0.22207199037075043, + "learning_rate": 4.534931758092126e-05, + "loss": 1.7485, + "step": 17717 + }, + { + "epoch": 5.4383057090239415, + "grad_norm": 0.26839709281921387, + "learning_rate": 4.534436859225353e-05, + "loss": 1.7272, + "step": 17718 + }, + { + "epoch": 5.438612645794966, + "grad_norm": 0.37715891003608704, + "learning_rate": 4.5339419649595476e-05, + "loss": 1.7254, + "step": 17719 + }, + { + "epoch": 5.438919582565991, + "grad_norm": 0.21485768258571625, + "learning_rate": 4.533447075299603e-05, + "loss": 1.7349, + "step": 17720 + }, + { + "epoch": 5.439226519337017, + "grad_norm": 0.29502415657043457, + "learning_rate": 4.5329521902504055e-05, + "loss": 1.7325, + "step": 17721 + }, + { + "epoch": 5.439533456108042, + "grad_norm": 0.29448410868644714, + "learning_rate": 4.5324573098168505e-05, + "loss": 1.768, + "step": 17722 + }, + { + "epoch": 5.439840392879067, + "grad_norm": 0.1892058402299881, + "learning_rate": 4.5319624340038244e-05, + "loss": 
1.6866, + "step": 17723 + }, + { + "epoch": 5.440147329650092, + "grad_norm": 0.3365040123462677, + "learning_rate": 4.531467562816221e-05, + "loss": 1.7662, + "step": 17724 + }, + { + "epoch": 5.440454266421117, + "grad_norm": 0.2960789203643799, + "learning_rate": 4.53097269625893e-05, + "loss": 1.746, + "step": 17725 + }, + { + "epoch": 5.440761203192142, + "grad_norm": 0.21623700857162476, + "learning_rate": 4.530477834336841e-05, + "loss": 1.7619, + "step": 17726 + }, + { + "epoch": 5.441068139963168, + "grad_norm": 0.29010120034217834, + "learning_rate": 4.5299829770548456e-05, + "loss": 1.717, + "step": 17727 + }, + { + "epoch": 5.441375076734193, + "grad_norm": 0.18467605113983154, + "learning_rate": 4.529488124417833e-05, + "loss": 1.6938, + "step": 17728 + }, + { + "epoch": 5.4416820135052175, + "grad_norm": 0.2875411808490753, + "learning_rate": 4.528993276430695e-05, + "loss": 1.7633, + "step": 17729 + }, + { + "epoch": 5.441988950276243, + "grad_norm": 0.24252675473690033, + "learning_rate": 4.528498433098321e-05, + "loss": 1.6477, + "step": 17730 + }, + { + "epoch": 5.442295887047268, + "grad_norm": 0.18885886669158936, + "learning_rate": 4.5280035944256035e-05, + "loss": 1.7241, + "step": 17731 + }, + { + "epoch": 5.4426028238182935, + "grad_norm": 0.2594204246997833, + "learning_rate": 4.527508760417429e-05, + "loss": 1.6697, + "step": 17732 + }, + { + "epoch": 5.442909760589319, + "grad_norm": 0.23796287178993225, + "learning_rate": 4.527013931078692e-05, + "loss": 1.7035, + "step": 17733 + }, + { + "epoch": 5.443216697360343, + "grad_norm": 0.2591552436351776, + "learning_rate": 4.5265191064142787e-05, + "loss": 1.8014, + "step": 17734 + }, + { + "epoch": 5.443523634131369, + "grad_norm": 0.3316073417663574, + "learning_rate": 4.526024286429082e-05, + "loss": 1.752, + "step": 17735 + }, + { + "epoch": 5.443830570902394, + "grad_norm": 0.2409597635269165, + "learning_rate": 4.52552947112799e-05, + "loss": 1.7662, + "step": 17736 + }, + { + "epoch": 
5.444137507673419, + "grad_norm": 0.2896713614463806, + "learning_rate": 4.5250346605158964e-05, + "loss": 1.7168, + "step": 17737 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.30870527029037476, + "learning_rate": 4.524539854597686e-05, + "loss": 1.704, + "step": 17738 + }, + { + "epoch": 5.44475138121547, + "grad_norm": 0.2476932406425476, + "learning_rate": 4.524045053378254e-05, + "loss": 1.7649, + "step": 17739 + }, + { + "epoch": 5.445058317986494, + "grad_norm": 0.2937077283859253, + "learning_rate": 4.5235502568624855e-05, + "loss": 1.7028, + "step": 17740 + }, + { + "epoch": 5.44536525475752, + "grad_norm": 0.22881117463111877, + "learning_rate": 4.523055465055273e-05, + "loss": 1.7539, + "step": 17741 + }, + { + "epoch": 5.445672191528545, + "grad_norm": 0.2551842927932739, + "learning_rate": 4.522560677961508e-05, + "loss": 1.7601, + "step": 17742 + }, + { + "epoch": 5.44597912829957, + "grad_norm": 0.27533504366874695, + "learning_rate": 4.5220658955860754e-05, + "loss": 1.7695, + "step": 17743 + }, + { + "epoch": 5.446286065070596, + "grad_norm": 0.23387418687343597, + "learning_rate": 4.5215711179338706e-05, + "loss": 1.7218, + "step": 17744 + }, + { + "epoch": 5.44659300184162, + "grad_norm": 0.37932485342025757, + "learning_rate": 4.521076345009777e-05, + "loss": 1.7685, + "step": 17745 + }, + { + "epoch": 5.4468999386126455, + "grad_norm": 0.2668898105621338, + "learning_rate": 4.520581576818691e-05, + "loss": 1.7217, + "step": 17746 + }, + { + "epoch": 5.447206875383671, + "grad_norm": 0.2417856752872467, + "learning_rate": 4.520086813365496e-05, + "loss": 1.692, + "step": 17747 + }, + { + "epoch": 5.447513812154696, + "grad_norm": 0.3170008063316345, + "learning_rate": 4.519592054655086e-05, + "loss": 1.7565, + "step": 17748 + }, + { + "epoch": 5.4478207489257215, + "grad_norm": 0.20711660385131836, + "learning_rate": 4.519097300692348e-05, + "loss": 1.6708, + "step": 17749 + }, + { + "epoch": 5.448127685696747, + "grad_norm": 
0.2196272760629654, + "learning_rate": 4.5186025514821746e-05, + "loss": 1.7335, + "step": 17750 + }, + { + "epoch": 5.448434622467771, + "grad_norm": 0.27563074231147766, + "learning_rate": 4.5181078070294505e-05, + "loss": 1.7383, + "step": 17751 + }, + { + "epoch": 5.448741559238797, + "grad_norm": 0.185418501496315, + "learning_rate": 4.517613067339068e-05, + "loss": 1.6841, + "step": 17752 + }, + { + "epoch": 5.449048496009822, + "grad_norm": 0.26787856221199036, + "learning_rate": 4.517118332415915e-05, + "loss": 1.7733, + "step": 17753 + }, + { + "epoch": 5.449355432780847, + "grad_norm": 0.22114823758602142, + "learning_rate": 4.516623602264885e-05, + "loss": 1.7153, + "step": 17754 + }, + { + "epoch": 5.449662369551873, + "grad_norm": 0.23090483248233795, + "learning_rate": 4.51612887689086e-05, + "loss": 1.7063, + "step": 17755 + }, + { + "epoch": 5.449969306322897, + "grad_norm": 0.3227362632751465, + "learning_rate": 4.515634156298736e-05, + "loss": 1.7528, + "step": 17756 + }, + { + "epoch": 5.4502762430939224, + "grad_norm": 0.24202494323253632, + "learning_rate": 4.515139440493397e-05, + "loss": 1.8119, + "step": 17757 + }, + { + "epoch": 5.450583179864948, + "grad_norm": 0.3778383731842041, + "learning_rate": 4.5146447294797356e-05, + "loss": 1.7589, + "step": 17758 + }, + { + "epoch": 5.450890116635973, + "grad_norm": 0.3726772964000702, + "learning_rate": 4.51415002326264e-05, + "loss": 1.7095, + "step": 17759 + }, + { + "epoch": 5.4511970534069984, + "grad_norm": 0.2424323409795761, + "learning_rate": 4.5136553218469966e-05, + "loss": 1.7374, + "step": 17760 + }, + { + "epoch": 5.451503990178024, + "grad_norm": 0.4347550570964813, + "learning_rate": 4.513160625237699e-05, + "loss": 1.8339, + "step": 17761 + }, + { + "epoch": 5.451810926949048, + "grad_norm": 0.2556018829345703, + "learning_rate": 4.512665933439631e-05, + "loss": 1.7024, + "step": 17762 + }, + { + "epoch": 5.452117863720074, + "grad_norm": 0.36380240321159363, + "learning_rate": 
4.512171246457685e-05, + "loss": 1.7706, + "step": 17763 + }, + { + "epoch": 5.452424800491099, + "grad_norm": 0.42120790481567383, + "learning_rate": 4.5116765642967476e-05, + "loss": 1.7609, + "step": 17764 + }, + { + "epoch": 5.452731737262124, + "grad_norm": 0.20573028922080994, + "learning_rate": 4.51118188696171e-05, + "loss": 1.7521, + "step": 17765 + }, + { + "epoch": 5.45303867403315, + "grad_norm": 0.39001402258872986, + "learning_rate": 4.510687214457458e-05, + "loss": 1.7097, + "step": 17766 + }, + { + "epoch": 5.453345610804174, + "grad_norm": 0.2778739333152771, + "learning_rate": 4.510192546788884e-05, + "loss": 1.7677, + "step": 17767 + }, + { + "epoch": 5.453652547575199, + "grad_norm": 0.2500934600830078, + "learning_rate": 4.509697883960872e-05, + "loss": 1.7322, + "step": 17768 + }, + { + "epoch": 5.453959484346225, + "grad_norm": 0.23733557760715485, + "learning_rate": 4.509203225978314e-05, + "loss": 1.7426, + "step": 17769 + }, + { + "epoch": 5.45426642111725, + "grad_norm": 0.20033739507198334, + "learning_rate": 4.508708572846096e-05, + "loss": 1.7093, + "step": 17770 + }, + { + "epoch": 5.454573357888275, + "grad_norm": 0.202667698264122, + "learning_rate": 4.508213924569111e-05, + "loss": 1.6807, + "step": 17771 + }, + { + "epoch": 5.4548802946593, + "grad_norm": 0.1980566531419754, + "learning_rate": 4.507719281152241e-05, + "loss": 1.7102, + "step": 17772 + }, + { + "epoch": 5.455187231430325, + "grad_norm": 0.20612162351608276, + "learning_rate": 4.507224642600381e-05, + "loss": 1.7692, + "step": 17773 + }, + { + "epoch": 5.4554941682013505, + "grad_norm": 0.22859175503253937, + "learning_rate": 4.506730008918412e-05, + "loss": 1.7887, + "step": 17774 + }, + { + "epoch": 5.455801104972376, + "grad_norm": 0.19720709323883057, + "learning_rate": 4.5062353801112285e-05, + "loss": 1.7557, + "step": 17775 + }, + { + "epoch": 5.456108041743401, + "grad_norm": 0.23289217054843903, + "learning_rate": 4.505740756183717e-05, + "loss": 1.7023, + 
"step": 17776 + }, + { + "epoch": 5.456414978514426, + "grad_norm": 0.2120361477136612, + "learning_rate": 4.505246137140763e-05, + "loss": 1.7249, + "step": 17777 + }, + { + "epoch": 5.456721915285451, + "grad_norm": 0.2094341218471527, + "learning_rate": 4.504751522987259e-05, + "loss": 1.7586, + "step": 17778 + }, + { + "epoch": 5.457028852056476, + "grad_norm": 0.22361092269420624, + "learning_rate": 4.504256913728088e-05, + "loss": 1.737, + "step": 17779 + }, + { + "epoch": 5.457335788827502, + "grad_norm": 0.2100353240966797, + "learning_rate": 4.5037623093681424e-05, + "loss": 1.704, + "step": 17780 + }, + { + "epoch": 5.457642725598527, + "grad_norm": 0.20550231635570526, + "learning_rate": 4.503267709912308e-05, + "loss": 1.7732, + "step": 17781 + }, + { + "epoch": 5.457949662369552, + "grad_norm": 0.22843749821186066, + "learning_rate": 4.502773115365474e-05, + "loss": 1.6916, + "step": 17782 + }, + { + "epoch": 5.458256599140577, + "grad_norm": 0.2351907640695572, + "learning_rate": 4.502278525732526e-05, + "loss": 1.8043, + "step": 17783 + }, + { + "epoch": 5.458563535911602, + "grad_norm": 0.271028071641922, + "learning_rate": 4.501783941018355e-05, + "loss": 1.7665, + "step": 17784 + }, + { + "epoch": 5.458870472682627, + "grad_norm": 0.1974802166223526, + "learning_rate": 4.501289361227846e-05, + "loss": 1.718, + "step": 17785 + }, + { + "epoch": 5.459177409453653, + "grad_norm": 0.23726068437099457, + "learning_rate": 4.5007947863658884e-05, + "loss": 1.7507, + "step": 17786 + }, + { + "epoch": 5.459484346224678, + "grad_norm": 0.2112259715795517, + "learning_rate": 4.5003002164373684e-05, + "loss": 1.8116, + "step": 17787 + }, + { + "epoch": 5.4597912829957025, + "grad_norm": 0.2676105201244354, + "learning_rate": 4.4998056514471764e-05, + "loss": 1.7013, + "step": 17788 + }, + { + "epoch": 5.460098219766728, + "grad_norm": 0.2735576033592224, + "learning_rate": 4.4993110914001956e-05, + "loss": 1.7516, + "step": 17789 + }, + { + "epoch": 
5.460405156537753, + "grad_norm": 0.1925152987241745, + "learning_rate": 4.498816536301319e-05, + "loss": 1.7018, + "step": 17790 + }, + { + "epoch": 5.4607120933087785, + "grad_norm": 0.25037717819213867, + "learning_rate": 4.498321986155429e-05, + "loss": 1.7207, + "step": 17791 + }, + { + "epoch": 5.461019030079804, + "grad_norm": 0.20481008291244507, + "learning_rate": 4.497827440967415e-05, + "loss": 1.6988, + "step": 17792 + }, + { + "epoch": 5.461325966850829, + "grad_norm": 0.19434049725532532, + "learning_rate": 4.4973329007421673e-05, + "loss": 1.7363, + "step": 17793 + }, + { + "epoch": 5.461632903621854, + "grad_norm": 0.21797434985637665, + "learning_rate": 4.496838365484567e-05, + "loss": 1.7218, + "step": 17794 + }, + { + "epoch": 5.461939840392879, + "grad_norm": 0.18477453291416168, + "learning_rate": 4.496343835199508e-05, + "loss": 1.7204, + "step": 17795 + }, + { + "epoch": 5.462246777163904, + "grad_norm": 0.21657803654670715, + "learning_rate": 4.495849309891872e-05, + "loss": 1.7671, + "step": 17796 + }, + { + "epoch": 5.46255371393493, + "grad_norm": 0.21027342975139618, + "learning_rate": 4.495354789566549e-05, + "loss": 1.7424, + "step": 17797 + }, + { + "epoch": 5.462860650705955, + "grad_norm": 0.2016189992427826, + "learning_rate": 4.4948602742284256e-05, + "loss": 1.7706, + "step": 17798 + }, + { + "epoch": 5.463167587476979, + "grad_norm": 0.2155935913324356, + "learning_rate": 4.494365763882391e-05, + "loss": 1.7314, + "step": 17799 + }, + { + "epoch": 5.463474524248005, + "grad_norm": 0.22079701721668243, + "learning_rate": 4.493871258533328e-05, + "loss": 1.7938, + "step": 17800 + }, + { + "epoch": 5.46378146101903, + "grad_norm": 0.1907699704170227, + "learning_rate": 4.4933767581861283e-05, + "loss": 1.6958, + "step": 17801 + }, + { + "epoch": 5.464088397790055, + "grad_norm": 0.2784879207611084, + "learning_rate": 4.4928822628456735e-05, + "loss": 1.7285, + "step": 17802 + }, + { + "epoch": 5.464395334561081, + "grad_norm": 
0.29470255970954895, + "learning_rate": 4.492387772516855e-05, + "loss": 1.7363, + "step": 17803 + }, + { + "epoch": 5.464702271332105, + "grad_norm": 0.21387436985969543, + "learning_rate": 4.4918932872045575e-05, + "loss": 1.7414, + "step": 17804 + }, + { + "epoch": 5.4650092081031305, + "grad_norm": 0.3102552890777588, + "learning_rate": 4.49139880691367e-05, + "loss": 1.7359, + "step": 17805 + }, + { + "epoch": 5.465316144874156, + "grad_norm": 0.2312939465045929, + "learning_rate": 4.490904331649075e-05, + "loss": 1.7609, + "step": 17806 + }, + { + "epoch": 5.465623081645181, + "grad_norm": 0.323913037776947, + "learning_rate": 4.4904098614156645e-05, + "loss": 1.7693, + "step": 17807 + }, + { + "epoch": 5.4659300184162065, + "grad_norm": 0.2975599467754364, + "learning_rate": 4.48991539621832e-05, + "loss": 1.7506, + "step": 17808 + }, + { + "epoch": 5.466236955187231, + "grad_norm": 0.24702571332454681, + "learning_rate": 4.4894209360619316e-05, + "loss": 1.8258, + "step": 17809 + }, + { + "epoch": 5.466543891958256, + "grad_norm": 0.29016581177711487, + "learning_rate": 4.488926480951386e-05, + "loss": 1.7096, + "step": 17810 + }, + { + "epoch": 5.466850828729282, + "grad_norm": 0.2194555252790451, + "learning_rate": 4.488432030891566e-05, + "loss": 1.788, + "step": 17811 + }, + { + "epoch": 5.467157765500307, + "grad_norm": 0.2504041790962219, + "learning_rate": 4.487937585887363e-05, + "loss": 1.7672, + "step": 17812 + }, + { + "epoch": 5.467464702271332, + "grad_norm": 0.2362445741891861, + "learning_rate": 4.487443145943659e-05, + "loss": 1.7426, + "step": 17813 + }, + { + "epoch": 5.467771639042358, + "grad_norm": 0.20075896382331848, + "learning_rate": 4.486948711065343e-05, + "loss": 1.7406, + "step": 17814 + }, + { + "epoch": 5.468078575813382, + "grad_norm": 0.2219153791666031, + "learning_rate": 4.486454281257299e-05, + "loss": 1.683, + "step": 17815 + }, + { + "epoch": 5.468385512584407, + "grad_norm": 0.22551953792572021, + "learning_rate": 
4.4859598565244176e-05, + "loss": 1.7896, + "step": 17816 + }, + { + "epoch": 5.468692449355433, + "grad_norm": 0.2385476976633072, + "learning_rate": 4.48546543687158e-05, + "loss": 1.7799, + "step": 17817 + }, + { + "epoch": 5.468999386126458, + "grad_norm": 0.24263370037078857, + "learning_rate": 4.4849710223036764e-05, + "loss": 1.682, + "step": 17818 + }, + { + "epoch": 5.469306322897483, + "grad_norm": 0.24301160871982574, + "learning_rate": 4.484476612825589e-05, + "loss": 1.8121, + "step": 17819 + }, + { + "epoch": 5.469613259668508, + "grad_norm": 0.2516932487487793, + "learning_rate": 4.483982208442207e-05, + "loss": 1.7344, + "step": 17820 + }, + { + "epoch": 5.469920196439533, + "grad_norm": 0.24309395253658295, + "learning_rate": 4.4834878091584156e-05, + "loss": 1.7746, + "step": 17821 + }, + { + "epoch": 5.4702271332105585, + "grad_norm": 0.24711866676807404, + "learning_rate": 4.4829934149790996e-05, + "loss": 1.7887, + "step": 17822 + }, + { + "epoch": 5.470534069981584, + "grad_norm": 0.2923797369003296, + "learning_rate": 4.4824990259091445e-05, + "loss": 1.7017, + "step": 17823 + }, + { + "epoch": 5.470841006752609, + "grad_norm": 0.21658629179000854, + "learning_rate": 4.482004641953441e-05, + "loss": 1.725, + "step": 17824 + }, + { + "epoch": 5.4711479435236345, + "grad_norm": 0.233424574136734, + "learning_rate": 4.481510263116868e-05, + "loss": 1.74, + "step": 17825 + }, + { + "epoch": 5.471454880294659, + "grad_norm": 0.28997600078582764, + "learning_rate": 4.481015889404315e-05, + "loss": 1.8418, + "step": 17826 + }, + { + "epoch": 5.471761817065684, + "grad_norm": 0.2245558649301529, + "learning_rate": 4.480521520820669e-05, + "loss": 1.7519, + "step": 17827 + }, + { + "epoch": 5.47206875383671, + "grad_norm": 0.21008887887001038, + "learning_rate": 4.480027157370812e-05, + "loss": 1.6977, + "step": 17828 + }, + { + "epoch": 5.472375690607735, + "grad_norm": 0.1990261971950531, + "learning_rate": 4.479532799059633e-05, + "loss": 1.7004, + 
"step": 17829 + }, + { + "epoch": 5.47268262737876, + "grad_norm": 0.2354540079832077, + "learning_rate": 4.479038445892014e-05, + "loss": 1.7755, + "step": 17830 + }, + { + "epoch": 5.472989564149785, + "grad_norm": 0.21904973685741425, + "learning_rate": 4.478544097872843e-05, + "loss": 1.8328, + "step": 17831 + }, + { + "epoch": 5.47329650092081, + "grad_norm": 0.21188503503799438, + "learning_rate": 4.4780497550070055e-05, + "loss": 1.7105, + "step": 17832 + }, + { + "epoch": 5.473603437691835, + "grad_norm": 0.2196870595216751, + "learning_rate": 4.477555417299386e-05, + "loss": 1.7261, + "step": 17833 + }, + { + "epoch": 5.473910374462861, + "grad_norm": 0.24522331357002258, + "learning_rate": 4.477061084754869e-05, + "loss": 1.8101, + "step": 17834 + }, + { + "epoch": 5.474217311233886, + "grad_norm": 0.24073927104473114, + "learning_rate": 4.476566757378343e-05, + "loss": 1.8295, + "step": 17835 + }, + { + "epoch": 5.474524248004911, + "grad_norm": 0.3724605143070221, + "learning_rate": 4.476072435174689e-05, + "loss": 1.7785, + "step": 17836 + }, + { + "epoch": 5.474831184775936, + "grad_norm": 0.25552257895469666, + "learning_rate": 4.475578118148797e-05, + "loss": 1.6978, + "step": 17837 + }, + { + "epoch": 5.475138121546961, + "grad_norm": 0.22402255237102509, + "learning_rate": 4.475083806305546e-05, + "loss": 1.697, + "step": 17838 + }, + { + "epoch": 5.475445058317987, + "grad_norm": 0.25869324803352356, + "learning_rate": 4.474589499649826e-05, + "loss": 1.7026, + "step": 17839 + }, + { + "epoch": 5.475751995089012, + "grad_norm": 0.249742329120636, + "learning_rate": 4.47409519818652e-05, + "loss": 1.7738, + "step": 17840 + }, + { + "epoch": 5.476058931860037, + "grad_norm": 0.28722140192985535, + "learning_rate": 4.473600901920515e-05, + "loss": 1.7555, + "step": 17841 + }, + { + "epoch": 5.476365868631062, + "grad_norm": 0.250964879989624, + "learning_rate": 4.4731066108566926e-05, + "loss": 1.6951, + "step": 17842 + }, + { + "epoch": 
5.476672805402087, + "grad_norm": 0.20562006533145905, + "learning_rate": 4.472612324999942e-05, + "loss": 1.7109, + "step": 17843 + }, + { + "epoch": 5.476979742173112, + "grad_norm": 0.26964858174324036, + "learning_rate": 4.472118044355144e-05, + "loss": 1.7468, + "step": 17844 + }, + { + "epoch": 5.477286678944138, + "grad_norm": 0.25700438022613525, + "learning_rate": 4.471623768927184e-05, + "loss": 1.7046, + "step": 17845 + }, + { + "epoch": 5.477593615715163, + "grad_norm": 0.2152809500694275, + "learning_rate": 4.47112949872095e-05, + "loss": 1.7464, + "step": 17846 + }, + { + "epoch": 5.4779005524861875, + "grad_norm": 0.26429688930511475, + "learning_rate": 4.470635233741321e-05, + "loss": 1.7629, + "step": 17847 + }, + { + "epoch": 5.478207489257213, + "grad_norm": 0.18546637892723083, + "learning_rate": 4.470140973993188e-05, + "loss": 1.7143, + "step": 17848 + }, + { + "epoch": 5.478514426028238, + "grad_norm": 0.1927761435508728, + "learning_rate": 4.46964671948143e-05, + "loss": 1.6919, + "step": 17849 + }, + { + "epoch": 5.4788213627992635, + "grad_norm": 0.21581199765205383, + "learning_rate": 4.469152470210935e-05, + "loss": 1.7596, + "step": 17850 + }, + { + "epoch": 5.479128299570289, + "grad_norm": 0.20244133472442627, + "learning_rate": 4.468658226186586e-05, + "loss": 1.7372, + "step": 17851 + }, + { + "epoch": 5.479435236341313, + "grad_norm": 0.2467198520898819, + "learning_rate": 4.468163987413269e-05, + "loss": 1.7361, + "step": 17852 + }, + { + "epoch": 5.479742173112339, + "grad_norm": 0.22134411334991455, + "learning_rate": 4.467669753895866e-05, + "loss": 1.7276, + "step": 17853 + }, + { + "epoch": 5.480049109883364, + "grad_norm": 0.1953750103712082, + "learning_rate": 4.4671755256392636e-05, + "loss": 1.6931, + "step": 17854 + }, + { + "epoch": 5.480356046654389, + "grad_norm": 0.21492068469524384, + "learning_rate": 4.466681302648343e-05, + "loss": 1.7437, + "step": 17855 + }, + { + "epoch": 5.480662983425415, + "grad_norm": 
0.24377848207950592, + "learning_rate": 4.466187084927993e-05, + "loss": 1.7869, + "step": 17856 + }, + { + "epoch": 5.48096992019644, + "grad_norm": 0.23674219846725464, + "learning_rate": 4.465692872483093e-05, + "loss": 1.8142, + "step": 17857 + }, + { + "epoch": 5.481276856967464, + "grad_norm": 0.25036486983299255, + "learning_rate": 4.4651986653185304e-05, + "loss": 1.8075, + "step": 17858 + }, + { + "epoch": 5.48158379373849, + "grad_norm": 0.32649150490760803, + "learning_rate": 4.4647044634391867e-05, + "loss": 1.7177, + "step": 17859 + }, + { + "epoch": 5.481890730509515, + "grad_norm": 0.20300604403018951, + "learning_rate": 4.46421026684995e-05, + "loss": 1.6912, + "step": 17860 + }, + { + "epoch": 5.48219766728054, + "grad_norm": 0.24630679190158844, + "learning_rate": 4.4637160755557e-05, + "loss": 1.8312, + "step": 17861 + }, + { + "epoch": 5.482504604051566, + "grad_norm": 0.2263093739748001, + "learning_rate": 4.46322188956132e-05, + "loss": 1.7214, + "step": 17862 + }, + { + "epoch": 5.48281154082259, + "grad_norm": 0.22949177026748657, + "learning_rate": 4.462727708871699e-05, + "loss": 1.6882, + "step": 17863 + }, + { + "epoch": 5.4831184775936155, + "grad_norm": 0.23389381170272827, + "learning_rate": 4.4622335334917156e-05, + "loss": 1.7613, + "step": 17864 + }, + { + "epoch": 5.483425414364641, + "grad_norm": 0.2259683907032013, + "learning_rate": 4.461739363426257e-05, + "loss": 1.7021, + "step": 17865 + }, + { + "epoch": 5.483732351135666, + "grad_norm": 0.3213486969470978, + "learning_rate": 4.4612451986802036e-05, + "loss": 1.7469, + "step": 17866 + }, + { + "epoch": 5.4840392879066915, + "grad_norm": 0.3415670096874237, + "learning_rate": 4.4607510392584426e-05, + "loss": 1.7605, + "step": 17867 + }, + { + "epoch": 5.484346224677717, + "grad_norm": 0.2079494297504425, + "learning_rate": 4.460256885165855e-05, + "loss": 1.7832, + "step": 17868 + }, + { + "epoch": 5.484653161448741, + "grad_norm": 0.30334988236427307, + "learning_rate": 
4.459762736407327e-05, + "loss": 1.6825, + "step": 17869 + }, + { + "epoch": 5.484960098219767, + "grad_norm": 0.22320730984210968, + "learning_rate": 4.4592685929877374e-05, + "loss": 1.7452, + "step": 17870 + }, + { + "epoch": 5.485267034990792, + "grad_norm": 0.25325682759284973, + "learning_rate": 4.458774454911975e-05, + "loss": 1.7359, + "step": 17871 + }, + { + "epoch": 5.485573971761817, + "grad_norm": 0.305501788854599, + "learning_rate": 4.458280322184919e-05, + "loss": 1.7161, + "step": 17872 + }, + { + "epoch": 5.485880908532843, + "grad_norm": 0.19486182928085327, + "learning_rate": 4.457786194811455e-05, + "loss": 1.7097, + "step": 17873 + }, + { + "epoch": 5.486187845303867, + "grad_norm": 0.3306363821029663, + "learning_rate": 4.457292072796465e-05, + "loss": 1.7653, + "step": 17874 + }, + { + "epoch": 5.486494782074892, + "grad_norm": 0.25172874331474304, + "learning_rate": 4.456797956144835e-05, + "loss": 1.7289, + "step": 17875 + }, + { + "epoch": 5.486801718845918, + "grad_norm": 0.24508661031723022, + "learning_rate": 4.456303844861444e-05, + "loss": 1.7255, + "step": 17876 + }, + { + "epoch": 5.487108655616943, + "grad_norm": 0.3043360114097595, + "learning_rate": 4.455809738951178e-05, + "loss": 1.7852, + "step": 17877 + }, + { + "epoch": 5.487415592387968, + "grad_norm": 0.22181758284568787, + "learning_rate": 4.4553156384189186e-05, + "loss": 1.7887, + "step": 17878 + }, + { + "epoch": 5.487722529158993, + "grad_norm": 0.2174321413040161, + "learning_rate": 4.454821543269549e-05, + "loss": 1.7024, + "step": 17879 + }, + { + "epoch": 5.488029465930018, + "grad_norm": 0.19634750485420227, + "learning_rate": 4.4543274535079535e-05, + "loss": 1.7451, + "step": 17880 + }, + { + "epoch": 5.4883364027010435, + "grad_norm": 0.20481908321380615, + "learning_rate": 4.4538333691390125e-05, + "loss": 1.7068, + "step": 17881 + }, + { + "epoch": 5.488643339472069, + "grad_norm": 0.2025458663702011, + "learning_rate": 4.453339290167612e-05, + "loss": 
1.72, + "step": 17882 + }, + { + "epoch": 5.488950276243094, + "grad_norm": 0.21013019979000092, + "learning_rate": 4.452845216598632e-05, + "loss": 1.7113, + "step": 17883 + }, + { + "epoch": 5.4892572130141195, + "grad_norm": 0.2057499885559082, + "learning_rate": 4.452351148436956e-05, + "loss": 1.7007, + "step": 17884 + }, + { + "epoch": 5.489564149785144, + "grad_norm": 0.19957664608955383, + "learning_rate": 4.4518570856874666e-05, + "loss": 1.6999, + "step": 17885 + }, + { + "epoch": 5.489871086556169, + "grad_norm": 0.22609412670135498, + "learning_rate": 4.451363028355048e-05, + "loss": 1.8124, + "step": 17886 + }, + { + "epoch": 5.490178023327195, + "grad_norm": 0.27350863814353943, + "learning_rate": 4.4508689764445805e-05, + "loss": 1.8042, + "step": 17887 + }, + { + "epoch": 5.49048496009822, + "grad_norm": 0.23416854441165924, + "learning_rate": 4.450374929960949e-05, + "loss": 1.7607, + "step": 17888 + }, + { + "epoch": 5.490791896869245, + "grad_norm": 0.2891421318054199, + "learning_rate": 4.449880888909033e-05, + "loss": 1.7419, + "step": 17889 + }, + { + "epoch": 5.49109883364027, + "grad_norm": 0.2458745837211609, + "learning_rate": 4.449386853293717e-05, + "loss": 1.7234, + "step": 17890 + }, + { + "epoch": 5.491405770411295, + "grad_norm": 0.23390449583530426, + "learning_rate": 4.4488928231198826e-05, + "loss": 1.7482, + "step": 17891 + }, + { + "epoch": 5.49171270718232, + "grad_norm": 0.3509657084941864, + "learning_rate": 4.448398798392414e-05, + "loss": 1.7639, + "step": 17892 + }, + { + "epoch": 5.492019643953346, + "grad_norm": 0.2487955242395401, + "learning_rate": 4.4479047791161916e-05, + "loss": 1.7163, + "step": 17893 + }, + { + "epoch": 5.492326580724371, + "grad_norm": 0.22630274295806885, + "learning_rate": 4.4474107652960956e-05, + "loss": 1.7449, + "step": 17894 + }, + { + "epoch": 5.4926335174953955, + "grad_norm": 0.25909537076950073, + "learning_rate": 4.446916756937012e-05, + "loss": 1.7396, + "step": 17895 + }, + { + 
"epoch": 5.492940454266421, + "grad_norm": 0.29732683300971985, + "learning_rate": 4.446422754043819e-05, + "loss": 1.8109, + "step": 17896 + }, + { + "epoch": 5.493247391037446, + "grad_norm": 0.22436772286891937, + "learning_rate": 4.4459287566214035e-05, + "loss": 1.7657, + "step": 17897 + }, + { + "epoch": 5.4935543278084715, + "grad_norm": 0.24584892392158508, + "learning_rate": 4.445434764674643e-05, + "loss": 1.73, + "step": 17898 + }, + { + "epoch": 5.493861264579497, + "grad_norm": 0.27446454763412476, + "learning_rate": 4.444940778208423e-05, + "loss": 1.7428, + "step": 17899 + }, + { + "epoch": 5.494168201350522, + "grad_norm": 0.20442110300064087, + "learning_rate": 4.4444467972276215e-05, + "loss": 1.6911, + "step": 17900 + }, + { + "epoch": 5.494475138121547, + "grad_norm": 0.23089268803596497, + "learning_rate": 4.4439528217371236e-05, + "loss": 1.7192, + "step": 17901 + }, + { + "epoch": 5.494782074892572, + "grad_norm": 0.19402450323104858, + "learning_rate": 4.443458851741808e-05, + "loss": 1.7304, + "step": 17902 + }, + { + "epoch": 5.495089011663597, + "grad_norm": 0.2310219705104828, + "learning_rate": 4.442964887246561e-05, + "loss": 1.6963, + "step": 17903 + }, + { + "epoch": 5.495395948434623, + "grad_norm": 0.25573140382766724, + "learning_rate": 4.44247092825626e-05, + "loss": 1.7781, + "step": 17904 + }, + { + "epoch": 5.495702885205648, + "grad_norm": 0.20298753678798676, + "learning_rate": 4.4419769747757894e-05, + "loss": 1.763, + "step": 17905 + }, + { + "epoch": 5.496009821976672, + "grad_norm": 0.22243307530879974, + "learning_rate": 4.441483026810027e-05, + "loss": 1.7345, + "step": 17906 + }, + { + "epoch": 5.496316758747698, + "grad_norm": 0.19801411032676697, + "learning_rate": 4.4409890843638584e-05, + "loss": 1.7504, + "step": 17907 + }, + { + "epoch": 5.496623695518723, + "grad_norm": 0.2804374396800995, + "learning_rate": 4.440495147442162e-05, + "loss": 1.7985, + "step": 17908 + }, + { + "epoch": 5.496930632289748, + 
"grad_norm": 0.21824021637439728, + "learning_rate": 4.440001216049822e-05, + "loss": 1.6703, + "step": 17909 + }, + { + "epoch": 5.497237569060774, + "grad_norm": 0.23335935175418854, + "learning_rate": 4.439507290191719e-05, + "loss": 1.7426, + "step": 17910 + }, + { + "epoch": 5.497544505831799, + "grad_norm": 0.2093769609928131, + "learning_rate": 4.4390133698727315e-05, + "loss": 1.7178, + "step": 17911 + }, + { + "epoch": 5.4978514426028235, + "grad_norm": 0.18354324996471405, + "learning_rate": 4.438519455097743e-05, + "loss": 1.6849, + "step": 17912 + }, + { + "epoch": 5.498158379373849, + "grad_norm": 0.26826491951942444, + "learning_rate": 4.438025545871633e-05, + "loss": 1.7804, + "step": 17913 + }, + { + "epoch": 5.498465316144874, + "grad_norm": 0.29171738028526306, + "learning_rate": 4.437531642199288e-05, + "loss": 1.764, + "step": 17914 + }, + { + "epoch": 5.4987722529158995, + "grad_norm": 0.17870590090751648, + "learning_rate": 4.437037744085581e-05, + "loss": 1.6789, + "step": 17915 + }, + { + "epoch": 5.499079189686925, + "grad_norm": 0.25412192940711975, + "learning_rate": 4.4365438515354e-05, + "loss": 1.7536, + "step": 17916 + }, + { + "epoch": 5.499386126457949, + "grad_norm": 0.24465163052082062, + "learning_rate": 4.4360499645536203e-05, + "loss": 1.7582, + "step": 17917 + }, + { + "epoch": 5.499693063228975, + "grad_norm": 0.21248452365398407, + "learning_rate": 4.4355560831451264e-05, + "loss": 1.7209, + "step": 17918 + }, + { + "epoch": 5.5, + "grad_norm": 0.21018685400485992, + "learning_rate": 4.435062207314797e-05, + "loss": 1.7461, + "step": 17919 + }, + { + "epoch": 5.500306936771025, + "grad_norm": 0.1880551278591156, + "learning_rate": 4.434568337067517e-05, + "loss": 1.6818, + "step": 17920 + }, + { + "epoch": 5.500613873542051, + "grad_norm": 0.2224894016981125, + "learning_rate": 4.434074472408161e-05, + "loss": 1.8211, + "step": 17921 + }, + { + "epoch": 5.500920810313076, + "grad_norm": 0.19419749081134796, + 
"learning_rate": 4.433580613341615e-05, + "loss": 1.7625, + "step": 17922 + }, + { + "epoch": 5.5012277470841005, + "grad_norm": 0.2167430967092514, + "learning_rate": 4.433086759872756e-05, + "loss": 1.745, + "step": 17923 + }, + { + "epoch": 5.501534683855126, + "grad_norm": 0.1926383525133133, + "learning_rate": 4.4325929120064665e-05, + "loss": 1.7353, + "step": 17924 + }, + { + "epoch": 5.501841620626151, + "grad_norm": 0.22943224012851715, + "learning_rate": 4.432099069747625e-05, + "loss": 1.6903, + "step": 17925 + }, + { + "epoch": 5.5021485573971765, + "grad_norm": 0.18218693137168884, + "learning_rate": 4.431605233101116e-05, + "loss": 1.742, + "step": 17926 + }, + { + "epoch": 5.502455494168201, + "grad_norm": 0.2660788893699646, + "learning_rate": 4.431111402071817e-05, + "loss": 1.7208, + "step": 17927 + }, + { + "epoch": 5.502762430939226, + "grad_norm": 0.20015788078308105, + "learning_rate": 4.430617576664606e-05, + "loss": 1.721, + "step": 17928 + }, + { + "epoch": 5.503069367710252, + "grad_norm": 0.20011179149150848, + "learning_rate": 4.430123756884368e-05, + "loss": 1.7488, + "step": 17929 + }, + { + "epoch": 5.503376304481277, + "grad_norm": 0.22541452944278717, + "learning_rate": 4.429629942735979e-05, + "loss": 1.7997, + "step": 17930 + }, + { + "epoch": 5.503683241252302, + "grad_norm": 0.21067193150520325, + "learning_rate": 4.4291361342243236e-05, + "loss": 1.6652, + "step": 17931 + }, + { + "epoch": 5.503990178023328, + "grad_norm": 0.38401395082473755, + "learning_rate": 4.428642331354278e-05, + "loss": 1.815, + "step": 17932 + }, + { + "epoch": 5.504297114794352, + "grad_norm": 0.22600100934505463, + "learning_rate": 4.428148534130725e-05, + "loss": 1.7593, + "step": 17933 + }, + { + "epoch": 5.504604051565377, + "grad_norm": 0.21340666711330414, + "learning_rate": 4.427654742558542e-05, + "loss": 1.7447, + "step": 17934 + }, + { + "epoch": 5.504910988336403, + "grad_norm": 0.20676501095294952, + "learning_rate": 4.427160956642611e-05, 
+ "loss": 1.7174, + "step": 17935 + }, + { + "epoch": 5.505217925107428, + "grad_norm": 0.2374252825975418, + "learning_rate": 4.42666717638781e-05, + "loss": 1.703, + "step": 17936 + }, + { + "epoch": 5.505524861878453, + "grad_norm": 0.20975756645202637, + "learning_rate": 4.426173401799022e-05, + "loss": 1.7076, + "step": 17937 + }, + { + "epoch": 5.505831798649478, + "grad_norm": 0.23778517544269562, + "learning_rate": 4.4256796328811226e-05, + "loss": 1.7647, + "step": 17938 + }, + { + "epoch": 5.506138735420503, + "grad_norm": 0.2088557481765747, + "learning_rate": 4.425185869638996e-05, + "loss": 1.764, + "step": 17939 + }, + { + "epoch": 5.5064456721915285, + "grad_norm": 0.26953455805778503, + "learning_rate": 4.424692112077518e-05, + "loss": 1.7351, + "step": 17940 + }, + { + "epoch": 5.506752608962554, + "grad_norm": 0.2762589454650879, + "learning_rate": 4.42419836020157e-05, + "loss": 1.7051, + "step": 17941 + }, + { + "epoch": 5.507059545733579, + "grad_norm": 0.19611702859401703, + "learning_rate": 4.4237046140160306e-05, + "loss": 1.7445, + "step": 17942 + }, + { + "epoch": 5.5073664825046045, + "grad_norm": 0.2708270251750946, + "learning_rate": 4.4232108735257824e-05, + "loss": 1.7284, + "step": 17943 + }, + { + "epoch": 5.507673419275629, + "grad_norm": 0.24194146692752838, + "learning_rate": 4.422717138735701e-05, + "loss": 1.7302, + "step": 17944 + }, + { + "epoch": 5.507980356046654, + "grad_norm": 0.21558286249637604, + "learning_rate": 4.422223409650666e-05, + "loss": 1.7435, + "step": 17945 + }, + { + "epoch": 5.50828729281768, + "grad_norm": 0.1842707246541977, + "learning_rate": 4.4217296862755597e-05, + "loss": 1.6579, + "step": 17946 + }, + { + "epoch": 5.508594229588705, + "grad_norm": 0.20211941003799438, + "learning_rate": 4.4212359686152576e-05, + "loss": 1.8017, + "step": 17947 + }, + { + "epoch": 5.50890116635973, + "grad_norm": 0.23749016225337982, + "learning_rate": 4.420742256674644e-05, + "loss": 1.6721, + "step": 17948 + }, + 
{ + "epoch": 5.509208103130755, + "grad_norm": 0.2076852172613144, + "learning_rate": 4.420248550458592e-05, + "loss": 1.7102, + "step": 17949 + }, + { + "epoch": 5.50951503990178, + "grad_norm": 0.2599447965621948, + "learning_rate": 4.419754849971986e-05, + "loss": 1.7819, + "step": 17950 + }, + { + "epoch": 5.509821976672805, + "grad_norm": 0.2017187476158142, + "learning_rate": 4.4192611552197e-05, + "loss": 1.6812, + "step": 17951 + }, + { + "epoch": 5.510128913443831, + "grad_norm": 0.21972116827964783, + "learning_rate": 4.418767466206617e-05, + "loss": 1.7122, + "step": 17952 + }, + { + "epoch": 5.510435850214856, + "grad_norm": 0.21750569343566895, + "learning_rate": 4.418273782937613e-05, + "loss": 1.7285, + "step": 17953 + }, + { + "epoch": 5.510742786985881, + "grad_norm": 0.19349125027656555, + "learning_rate": 4.417780105417572e-05, + "loss": 1.7383, + "step": 17954 + }, + { + "epoch": 5.511049723756906, + "grad_norm": 0.2094268798828125, + "learning_rate": 4.417286433651366e-05, + "loss": 1.7107, + "step": 17955 + }, + { + "epoch": 5.511356660527931, + "grad_norm": 0.2684331238269806, + "learning_rate": 4.41679276764388e-05, + "loss": 1.7336, + "step": 17956 + }, + { + "epoch": 5.5116635972989565, + "grad_norm": 0.27616915106773376, + "learning_rate": 4.416299107399987e-05, + "loss": 1.7439, + "step": 17957 + }, + { + "epoch": 5.511970534069982, + "grad_norm": 0.23874540627002716, + "learning_rate": 4.415805452924569e-05, + "loss": 1.7979, + "step": 17958 + }, + { + "epoch": 5.512277470841006, + "grad_norm": 0.21870921552181244, + "learning_rate": 4.415311804222503e-05, + "loss": 1.6674, + "step": 17959 + }, + { + "epoch": 5.512584407612032, + "grad_norm": 0.23042429983615875, + "learning_rate": 4.414818161298671e-05, + "loss": 1.7588, + "step": 17960 + }, + { + "epoch": 5.512891344383057, + "grad_norm": 0.2957153916358948, + "learning_rate": 4.4143245241579486e-05, + "loss": 1.8412, + "step": 17961 + }, + { + "epoch": 5.513198281154082, + 
"grad_norm": 0.28292644023895264, + "learning_rate": 4.413830892805213e-05, + "loss": 1.7915, + "step": 17962 + }, + { + "epoch": 5.513505217925108, + "grad_norm": 0.26526281237602234, + "learning_rate": 4.413337267245344e-05, + "loss": 1.7199, + "step": 17963 + }, + { + "epoch": 5.513812154696133, + "grad_norm": 0.41243693232536316, + "learning_rate": 4.4128436474832204e-05, + "loss": 1.7419, + "step": 17964 + }, + { + "epoch": 5.514119091467157, + "grad_norm": 0.2747771739959717, + "learning_rate": 4.4123500335237214e-05, + "loss": 1.7449, + "step": 17965 + }, + { + "epoch": 5.514426028238183, + "grad_norm": 0.25944122672080994, + "learning_rate": 4.4118564253717216e-05, + "loss": 1.7667, + "step": 17966 + }, + { + "epoch": 5.514732965009208, + "grad_norm": 0.32558533549308777, + "learning_rate": 4.411362823032103e-05, + "loss": 1.7292, + "step": 17967 + }, + { + "epoch": 5.515039901780233, + "grad_norm": 0.20190958678722382, + "learning_rate": 4.4108692265097404e-05, + "loss": 1.7529, + "step": 17968 + }, + { + "epoch": 5.515346838551259, + "grad_norm": 0.35485807061195374, + "learning_rate": 4.410375635809514e-05, + "loss": 1.7335, + "step": 17969 + }, + { + "epoch": 5.515653775322283, + "grad_norm": 0.2670159935951233, + "learning_rate": 4.409882050936301e-05, + "loss": 1.6789, + "step": 17970 + }, + { + "epoch": 5.5159607120933085, + "grad_norm": 0.19106578826904297, + "learning_rate": 4.409388471894981e-05, + "loss": 1.708, + "step": 17971 + }, + { + "epoch": 5.516267648864334, + "grad_norm": 0.2707268297672272, + "learning_rate": 4.4088948986904286e-05, + "loss": 1.7917, + "step": 17972 + }, + { + "epoch": 5.516574585635359, + "grad_norm": 0.2329230159521103, + "learning_rate": 4.408401331327525e-05, + "loss": 1.7378, + "step": 17973 + }, + { + "epoch": 5.5168815224063845, + "grad_norm": 0.22164998948574066, + "learning_rate": 4.4079077698111436e-05, + "loss": 1.7287, + "step": 17974 + }, + { + "epoch": 5.51718845917741, + "grad_norm": 0.25895699858665466, 
+ "learning_rate": 4.4074142141461665e-05, + "loss": 1.7158, + "step": 17975 + }, + { + "epoch": 5.517495395948434, + "grad_norm": 0.2617860436439514, + "learning_rate": 4.4069206643374695e-05, + "loss": 1.7767, + "step": 17976 + }, + { + "epoch": 5.51780233271946, + "grad_norm": 0.20443588495254517, + "learning_rate": 4.40642712038993e-05, + "loss": 1.7371, + "step": 17977 + }, + { + "epoch": 5.518109269490485, + "grad_norm": 0.26251545548439026, + "learning_rate": 4.4059335823084266e-05, + "loss": 1.8154, + "step": 17978 + }, + { + "epoch": 5.51841620626151, + "grad_norm": 0.2315993458032608, + "learning_rate": 4.405440050097833e-05, + "loss": 1.7426, + "step": 17979 + }, + { + "epoch": 5.518723143032536, + "grad_norm": 0.19467706978321075, + "learning_rate": 4.404946523763031e-05, + "loss": 1.7418, + "step": 17980 + }, + { + "epoch": 5.51903007980356, + "grad_norm": 0.2387837916612625, + "learning_rate": 4.4044530033088946e-05, + "loss": 1.7648, + "step": 17981 + }, + { + "epoch": 5.519337016574585, + "grad_norm": 0.21097531914710999, + "learning_rate": 4.403959488740306e-05, + "loss": 1.7198, + "step": 17982 + }, + { + "epoch": 5.519643953345611, + "grad_norm": 0.22303247451782227, + "learning_rate": 4.403465980062136e-05, + "loss": 1.7679, + "step": 17983 + }, + { + "epoch": 5.519950890116636, + "grad_norm": 0.19705620408058167, + "learning_rate": 4.4029724772792666e-05, + "loss": 1.7747, + "step": 17984 + }, + { + "epoch": 5.520257826887661, + "grad_norm": 0.20864570140838623, + "learning_rate": 4.4024789803965715e-05, + "loss": 1.6797, + "step": 17985 + }, + { + "epoch": 5.520564763658687, + "grad_norm": 0.1917724758386612, + "learning_rate": 4.401985489418931e-05, + "loss": 1.7246, + "step": 17986 + }, + { + "epoch": 5.520871700429711, + "grad_norm": 0.25668975710868835, + "learning_rate": 4.401492004351219e-05, + "loss": 1.7245, + "step": 17987 + }, + { + "epoch": 5.5211786372007365, + "grad_norm": 0.22576093673706055, + "learning_rate": 
4.4009985251983146e-05, + "loss": 1.6766, + "step": 17988 + }, + { + "epoch": 5.521485573971762, + "grad_norm": 0.18614664673805237, + "learning_rate": 4.400505051965093e-05, + "loss": 1.7379, + "step": 17989 + }, + { + "epoch": 5.521792510742787, + "grad_norm": 0.21472783386707306, + "learning_rate": 4.4000115846564335e-05, + "loss": 1.7203, + "step": 17990 + }, + { + "epoch": 5.5220994475138125, + "grad_norm": 0.201142817735672, + "learning_rate": 4.39951812327721e-05, + "loss": 1.7049, + "step": 17991 + }, + { + "epoch": 5.522406384284837, + "grad_norm": 0.193614661693573, + "learning_rate": 4.3990246678323e-05, + "loss": 1.6938, + "step": 17992 + }, + { + "epoch": 5.522713321055862, + "grad_norm": 0.23343239724636078, + "learning_rate": 4.398531218326582e-05, + "loss": 1.744, + "step": 17993 + }, + { + "epoch": 5.523020257826888, + "grad_norm": 0.26271605491638184, + "learning_rate": 4.3980377747649305e-05, + "loss": 1.7458, + "step": 17994 + }, + { + "epoch": 5.523327194597913, + "grad_norm": 0.2048577219247818, + "learning_rate": 4.397544337152223e-05, + "loss": 1.763, + "step": 17995 + }, + { + "epoch": 5.523634131368938, + "grad_norm": 0.27748194336891174, + "learning_rate": 4.397050905493334e-05, + "loss": 1.7346, + "step": 17996 + }, + { + "epoch": 5.523941068139964, + "grad_norm": 0.3040253520011902, + "learning_rate": 4.3965574797931417e-05, + "loss": 1.7396, + "step": 17997 + }, + { + "epoch": 5.524248004910988, + "grad_norm": 0.3310317397117615, + "learning_rate": 4.396064060056523e-05, + "loss": 1.8094, + "step": 17998 + }, + { + "epoch": 5.524554941682013, + "grad_norm": 0.21845392882823944, + "learning_rate": 4.395570646288352e-05, + "loss": 1.7013, + "step": 17999 + }, + { + "epoch": 5.524861878453039, + "grad_norm": 0.319876492023468, + "learning_rate": 4.395077238493506e-05, + "loss": 1.7985, + "step": 18000 + }, + { + "epoch": 5.525168815224064, + "grad_norm": 0.28261950612068176, + "learning_rate": 4.394583836676863e-05, + "loss": 1.7979, + 
"step": 18001 + }, + { + "epoch": 5.525475751995089, + "grad_norm": 0.20874030888080597, + "learning_rate": 4.394090440843296e-05, + "loss": 1.7363, + "step": 18002 + }, + { + "epoch": 5.525782688766114, + "grad_norm": 0.28587406873703003, + "learning_rate": 4.393597050997684e-05, + "loss": 1.6787, + "step": 18003 + }, + { + "epoch": 5.526089625537139, + "grad_norm": 0.2719021439552307, + "learning_rate": 4.393103667144899e-05, + "loss": 1.7625, + "step": 18004 + }, + { + "epoch": 5.526396562308165, + "grad_norm": 0.22485414147377014, + "learning_rate": 4.392610289289821e-05, + "loss": 1.6847, + "step": 18005 + }, + { + "epoch": 5.52670349907919, + "grad_norm": 0.3500347435474396, + "learning_rate": 4.392116917437322e-05, + "loss": 1.7244, + "step": 18006 + }, + { + "epoch": 5.527010435850215, + "grad_norm": 0.26308783888816833, + "learning_rate": 4.3916235515922836e-05, + "loss": 1.7738, + "step": 18007 + }, + { + "epoch": 5.52731737262124, + "grad_norm": 0.27030646800994873, + "learning_rate": 4.391130191759574e-05, + "loss": 1.7149, + "step": 18008 + }, + { + "epoch": 5.527624309392265, + "grad_norm": 0.4137318730354309, + "learning_rate": 4.390636837944076e-05, + "loss": 1.7581, + "step": 18009 + }, + { + "epoch": 5.52793124616329, + "grad_norm": 0.2462068647146225, + "learning_rate": 4.390143490150659e-05, + "loss": 1.7767, + "step": 18010 + }, + { + "epoch": 5.528238182934316, + "grad_norm": 0.27424392104148865, + "learning_rate": 4.3896501483842036e-05, + "loss": 1.7701, + "step": 18011 + }, + { + "epoch": 5.528545119705341, + "grad_norm": 0.31268683075904846, + "learning_rate": 4.389156812649583e-05, + "loss": 1.7342, + "step": 18012 + }, + { + "epoch": 5.5288520564763655, + "grad_norm": 0.20428471267223358, + "learning_rate": 4.388663482951671e-05, + "loss": 1.7083, + "step": 18013 + }, + { + "epoch": 5.529158993247391, + "grad_norm": 0.322344034910202, + "learning_rate": 4.3881701592953475e-05, + "loss": 1.7423, + "step": 18014 + }, + { + "epoch": 
5.529465930018416, + "grad_norm": 0.2267894744873047, + "learning_rate": 4.387676841685483e-05, + "loss": 1.7309, + "step": 18015 + }, + { + "epoch": 5.5297728667894415, + "grad_norm": 0.23041954636573792, + "learning_rate": 4.387183530126955e-05, + "loss": 1.7352, + "step": 18016 + }, + { + "epoch": 5.530079803560467, + "grad_norm": 0.31139662861824036, + "learning_rate": 4.386690224624638e-05, + "loss": 1.7223, + "step": 18017 + }, + { + "epoch": 5.530386740331492, + "grad_norm": 0.20144063234329224, + "learning_rate": 4.38619692518341e-05, + "loss": 1.7607, + "step": 18018 + }, + { + "epoch": 5.530693677102517, + "grad_norm": 0.23812296986579895, + "learning_rate": 4.385703631808142e-05, + "loss": 1.7599, + "step": 18019 + }, + { + "epoch": 5.531000613873542, + "grad_norm": 0.2442231923341751, + "learning_rate": 4.385210344503712e-05, + "loss": 1.7094, + "step": 18020 + }, + { + "epoch": 5.531307550644567, + "grad_norm": 0.19497406482696533, + "learning_rate": 4.384717063274992e-05, + "loss": 1.7686, + "step": 18021 + }, + { + "epoch": 5.531614487415593, + "grad_norm": 0.29085835814476013, + "learning_rate": 4.38422378812686e-05, + "loss": 1.7454, + "step": 18022 + }, + { + "epoch": 5.531921424186618, + "grad_norm": 0.2701610028743744, + "learning_rate": 4.3837305190641876e-05, + "loss": 1.7376, + "step": 18023 + }, + { + "epoch": 5.532228360957642, + "grad_norm": 0.21232132613658905, + "learning_rate": 4.383237256091854e-05, + "loss": 1.7773, + "step": 18024 + }, + { + "epoch": 5.532535297728668, + "grad_norm": 0.24131610989570618, + "learning_rate": 4.382743999214729e-05, + "loss": 1.7899, + "step": 18025 + }, + { + "epoch": 5.532842234499693, + "grad_norm": 0.2752540409564972, + "learning_rate": 4.382250748437692e-05, + "loss": 1.7603, + "step": 18026 + }, + { + "epoch": 5.533149171270718, + "grad_norm": 0.2007865607738495, + "learning_rate": 4.381757503765613e-05, + "loss": 1.7553, + "step": 18027 + }, + { + "epoch": 5.533456108041744, + "grad_norm": 
0.23768723011016846, + "learning_rate": 4.38126426520337e-05, + "loss": 1.757, + "step": 18028 + }, + { + "epoch": 5.533763044812769, + "grad_norm": 0.22198502719402313, + "learning_rate": 4.3807710327558366e-05, + "loss": 1.7578, + "step": 18029 + }, + { + "epoch": 5.5340699815837935, + "grad_norm": 0.22432352602481842, + "learning_rate": 4.380277806427885e-05, + "loss": 1.75, + "step": 18030 + }, + { + "epoch": 5.534376918354819, + "grad_norm": 0.23029591143131256, + "learning_rate": 4.379784586224394e-05, + "loss": 1.7829, + "step": 18031 + }, + { + "epoch": 5.534683855125844, + "grad_norm": 0.23901896178722382, + "learning_rate": 4.379291372150232e-05, + "loss": 1.7461, + "step": 18032 + }, + { + "epoch": 5.5349907918968695, + "grad_norm": 0.20958681404590607, + "learning_rate": 4.378798164210278e-05, + "loss": 1.7224, + "step": 18033 + }, + { + "epoch": 5.535297728667894, + "grad_norm": 0.21619680523872375, + "learning_rate": 4.3783049624094036e-05, + "loss": 1.7605, + "step": 18034 + }, + { + "epoch": 5.535604665438919, + "grad_norm": 0.22988620400428772, + "learning_rate": 4.3778117667524867e-05, + "loss": 1.7668, + "step": 18035 + }, + { + "epoch": 5.535911602209945, + "grad_norm": 0.20107243955135345, + "learning_rate": 4.377318577244395e-05, + "loss": 1.7932, + "step": 18036 + }, + { + "epoch": 5.53621853898097, + "grad_norm": 0.25803956389427185, + "learning_rate": 4.376825393890009e-05, + "loss": 1.7409, + "step": 18037 + }, + { + "epoch": 5.536525475751995, + "grad_norm": 0.34292399883270264, + "learning_rate": 4.376332216694198e-05, + "loss": 1.8554, + "step": 18038 + }, + { + "epoch": 5.536832412523021, + "grad_norm": 0.23147790133953094, + "learning_rate": 4.375839045661839e-05, + "loss": 1.7918, + "step": 18039 + }, + { + "epoch": 5.537139349294045, + "grad_norm": 0.2387644350528717, + "learning_rate": 4.375345880797802e-05, + "loss": 1.7391, + "step": 18040 + }, + { + "epoch": 5.53744628606507, + "grad_norm": 0.21463727951049805, + 
"learning_rate": 4.374852722106966e-05, + "loss": 1.6812, + "step": 18041 + }, + { + "epoch": 5.537753222836096, + "grad_norm": 0.21994563937187195, + "learning_rate": 4.3743595695941994e-05, + "loss": 1.7727, + "step": 18042 + }, + { + "epoch": 5.538060159607121, + "grad_norm": 0.21102699637413025, + "learning_rate": 4.373866423264381e-05, + "loss": 1.7854, + "step": 18043 + }, + { + "epoch": 5.538367096378146, + "grad_norm": 0.21742786467075348, + "learning_rate": 4.3733732831223794e-05, + "loss": 1.7352, + "step": 18044 + }, + { + "epoch": 5.538674033149171, + "grad_norm": 0.20080791413784027, + "learning_rate": 4.372880149173071e-05, + "loss": 1.7264, + "step": 18045 + }, + { + "epoch": 5.538980969920196, + "grad_norm": 0.21027569472789764, + "learning_rate": 4.372387021421329e-05, + "loss": 1.766, + "step": 18046 + }, + { + "epoch": 5.5392879066912215, + "grad_norm": 0.22870683670043945, + "learning_rate": 4.371893899872025e-05, + "loss": 1.7746, + "step": 18047 + }, + { + "epoch": 5.539594843462247, + "grad_norm": 0.21248690783977509, + "learning_rate": 4.371400784530036e-05, + "loss": 1.7447, + "step": 18048 + }, + { + "epoch": 5.539901780233272, + "grad_norm": 0.23059454560279846, + "learning_rate": 4.37090767540023e-05, + "loss": 1.7827, + "step": 18049 + }, + { + "epoch": 5.5402087170042975, + "grad_norm": 0.2519036531448364, + "learning_rate": 4.370414572487485e-05, + "loss": 1.7984, + "step": 18050 + }, + { + "epoch": 5.540515653775322, + "grad_norm": 0.23621398210525513, + "learning_rate": 4.36992147579667e-05, + "loss": 1.7517, + "step": 18051 + }, + { + "epoch": 5.540822590546347, + "grad_norm": 0.24267609417438507, + "learning_rate": 4.3694283853326625e-05, + "loss": 1.8285, + "step": 18052 + }, + { + "epoch": 5.541129527317373, + "grad_norm": 0.23209960758686066, + "learning_rate": 4.368935301100332e-05, + "loss": 1.7765, + "step": 18053 + }, + { + "epoch": 5.541436464088398, + "grad_norm": 0.21277187764644623, + "learning_rate": 
4.368442223104555e-05, + "loss": 1.7182, + "step": 18054 + }, + { + "epoch": 5.541743400859423, + "grad_norm": 0.20821616053581238, + "learning_rate": 4.367949151350199e-05, + "loss": 1.6766, + "step": 18055 + }, + { + "epoch": 5.542050337630448, + "grad_norm": 0.23019999265670776, + "learning_rate": 4.3674560858421414e-05, + "loss": 1.7438, + "step": 18056 + }, + { + "epoch": 5.542357274401473, + "grad_norm": 0.21547134220600128, + "learning_rate": 4.366963026585253e-05, + "loss": 1.7003, + "step": 18057 + }, + { + "epoch": 5.542664211172498, + "grad_norm": 0.22454513609409332, + "learning_rate": 4.3664699735844084e-05, + "loss": 1.7072, + "step": 18058 + }, + { + "epoch": 5.542971147943524, + "grad_norm": 0.22228482365608215, + "learning_rate": 4.365976926844477e-05, + "loss": 1.7557, + "step": 18059 + }, + { + "epoch": 5.543278084714549, + "grad_norm": 0.25762560963630676, + "learning_rate": 4.365483886370335e-05, + "loss": 1.7751, + "step": 18060 + }, + { + "epoch": 5.543585021485574, + "grad_norm": 0.2086205631494522, + "learning_rate": 4.3649908521668516e-05, + "loss": 1.7399, + "step": 18061 + }, + { + "epoch": 5.543891958256599, + "grad_norm": 0.2759089767932892, + "learning_rate": 4.3644978242389014e-05, + "loss": 1.7503, + "step": 18062 + }, + { + "epoch": 5.544198895027624, + "grad_norm": 0.2235182225704193, + "learning_rate": 4.364004802591358e-05, + "loss": 1.7313, + "step": 18063 + }, + { + "epoch": 5.5445058317986495, + "grad_norm": 0.23074570298194885, + "learning_rate": 4.3635117872290885e-05, + "loss": 1.7649, + "step": 18064 + }, + { + "epoch": 5.544812768569675, + "grad_norm": 0.24929538369178772, + "learning_rate": 4.363018778156972e-05, + "loss": 1.732, + "step": 18065 + }, + { + "epoch": 5.5451197053407, + "grad_norm": 0.26422035694122314, + "learning_rate": 4.362525775379874e-05, + "loss": 1.7276, + "step": 18066 + }, + { + "epoch": 5.545426642111725, + "grad_norm": 0.3160388767719269, + "learning_rate": 4.362032778902672e-05, + "loss": 
1.7777, + "step": 18067 + }, + { + "epoch": 5.54573357888275, + "grad_norm": 0.20791196823120117, + "learning_rate": 4.3615397887302345e-05, + "loss": 1.7058, + "step": 18068 + }, + { + "epoch": 5.546040515653775, + "grad_norm": 0.31438156962394714, + "learning_rate": 4.361046804867437e-05, + "loss": 1.8102, + "step": 18069 + }, + { + "epoch": 5.546347452424801, + "grad_norm": 0.3008113205432892, + "learning_rate": 4.3605538273191475e-05, + "loss": 1.7297, + "step": 18070 + }, + { + "epoch": 5.546654389195826, + "grad_norm": 0.21147282421588898, + "learning_rate": 4.3600608560902425e-05, + "loss": 1.776, + "step": 18071 + }, + { + "epoch": 5.546961325966851, + "grad_norm": 0.25202393531799316, + "learning_rate": 4.3595678911855884e-05, + "loss": 1.7273, + "step": 18072 + }, + { + "epoch": 5.547268262737876, + "grad_norm": 0.18881210684776306, + "learning_rate": 4.3590749326100614e-05, + "loss": 1.7026, + "step": 18073 + }, + { + "epoch": 5.547575199508901, + "grad_norm": 0.25075671076774597, + "learning_rate": 4.3585819803685295e-05, + "loss": 1.7694, + "step": 18074 + }, + { + "epoch": 5.547882136279926, + "grad_norm": 0.2625887989997864, + "learning_rate": 4.358089034465869e-05, + "loss": 1.7338, + "step": 18075 + }, + { + "epoch": 5.548189073050952, + "grad_norm": 0.27278679609298706, + "learning_rate": 4.357596094906947e-05, + "loss": 1.7684, + "step": 18076 + }, + { + "epoch": 5.548496009821976, + "grad_norm": 0.283964604139328, + "learning_rate": 4.3571031616966396e-05, + "loss": 1.7539, + "step": 18077 + }, + { + "epoch": 5.5488029465930016, + "grad_norm": 0.2702009975910187, + "learning_rate": 4.3566102348398124e-05, + "loss": 1.8064, + "step": 18078 + }, + { + "epoch": 5.549109883364027, + "grad_norm": 0.449733167886734, + "learning_rate": 4.356117314341342e-05, + "loss": 1.7258, + "step": 18079 + }, + { + "epoch": 5.549416820135052, + "grad_norm": 0.3199995160102844, + "learning_rate": 4.3556244002060975e-05, + "loss": 1.7526, + "step": 18080 + }, + { + 
"epoch": 5.5497237569060776, + "grad_norm": 0.2803747355937958, + "learning_rate": 4.3551314924389494e-05, + "loss": 1.764, + "step": 18081 + }, + { + "epoch": 5.550030693677103, + "grad_norm": 0.28995978832244873, + "learning_rate": 4.3546385910447715e-05, + "loss": 1.7617, + "step": 18082 + }, + { + "epoch": 5.550337630448127, + "grad_norm": 0.24313311278820038, + "learning_rate": 4.354145696028431e-05, + "loss": 1.7515, + "step": 18083 + }, + { + "epoch": 5.550644567219153, + "grad_norm": 0.2668032944202423, + "learning_rate": 4.3536528073948025e-05, + "loss": 1.743, + "step": 18084 + }, + { + "epoch": 5.550951503990178, + "grad_norm": 0.22831310331821442, + "learning_rate": 4.353159925148755e-05, + "loss": 1.7971, + "step": 18085 + }, + { + "epoch": 5.551258440761203, + "grad_norm": 0.22047942876815796, + "learning_rate": 4.352667049295162e-05, + "loss": 1.6983, + "step": 18086 + }, + { + "epoch": 5.551565377532229, + "grad_norm": 0.22895069420337677, + "learning_rate": 4.35217417983889e-05, + "loss": 1.7866, + "step": 18087 + }, + { + "epoch": 5.551872314303253, + "grad_norm": 0.19946368038654327, + "learning_rate": 4.3516813167848156e-05, + "loss": 1.7129, + "step": 18088 + }, + { + "epoch": 5.5521792510742785, + "grad_norm": 0.21508903801441193, + "learning_rate": 4.351188460137804e-05, + "loss": 1.7154, + "step": 18089 + }, + { + "epoch": 5.552486187845304, + "grad_norm": 0.24813953042030334, + "learning_rate": 4.3506956099027294e-05, + "loss": 1.8326, + "step": 18090 + }, + { + "epoch": 5.552793124616329, + "grad_norm": 0.21306444704532623, + "learning_rate": 4.35020276608446e-05, + "loss": 1.7651, + "step": 18091 + }, + { + "epoch": 5.5531000613873545, + "grad_norm": 0.22041217982769012, + "learning_rate": 4.34970992868787e-05, + "loss": 1.6852, + "step": 18092 + }, + { + "epoch": 5.55340699815838, + "grad_norm": 0.21699896454811096, + "learning_rate": 4.349217097717826e-05, + "loss": 1.7524, + "step": 18093 + }, + { + "epoch": 5.553713934929404, + 
"grad_norm": 0.23086662590503693, + "learning_rate": 4.3487242731792015e-05, + "loss": 1.7441, + "step": 18094 + }, + { + "epoch": 5.55402087170043, + "grad_norm": 0.21898184716701508, + "learning_rate": 4.348231455076864e-05, + "loss": 1.7131, + "step": 18095 + }, + { + "epoch": 5.554327808471455, + "grad_norm": 0.17392560839653015, + "learning_rate": 4.3477386434156854e-05, + "loss": 1.7049, + "step": 18096 + }, + { + "epoch": 5.55463474524248, + "grad_norm": 0.1984172910451889, + "learning_rate": 4.3472458382005374e-05, + "loss": 1.7136, + "step": 18097 + }, + { + "epoch": 5.554941682013506, + "grad_norm": 0.19227837026119232, + "learning_rate": 4.3467530394362866e-05, + "loss": 1.7468, + "step": 18098 + }, + { + "epoch": 5.55524861878453, + "grad_norm": 0.2307087779045105, + "learning_rate": 4.346260247127807e-05, + "loss": 1.7004, + "step": 18099 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.21496252715587616, + "learning_rate": 4.345767461279965e-05, + "loss": 1.7508, + "step": 18100 + }, + { + "epoch": 5.555862492326581, + "grad_norm": 0.21119998395442963, + "learning_rate": 4.3452746818976333e-05, + "loss": 1.7965, + "step": 18101 + }, + { + "epoch": 5.556169429097606, + "grad_norm": 0.2416355311870575, + "learning_rate": 4.34478190898568e-05, + "loss": 1.7006, + "step": 18102 + }, + { + "epoch": 5.556476365868631, + "grad_norm": 0.2009642869234085, + "learning_rate": 4.344289142548978e-05, + "loss": 1.7567, + "step": 18103 + }, + { + "epoch": 5.556783302639657, + "grad_norm": 0.2387058436870575, + "learning_rate": 4.343796382592393e-05, + "loss": 1.7898, + "step": 18104 + }, + { + "epoch": 5.557090239410681, + "grad_norm": 0.19835951924324036, + "learning_rate": 4.343303629120798e-05, + "loss": 1.7888, + "step": 18105 + }, + { + "epoch": 5.5573971761817065, + "grad_norm": 0.23324637115001678, + "learning_rate": 4.3428108821390604e-05, + "loss": 1.7923, + "step": 18106 + }, + { + "epoch": 5.557704112952732, + "grad_norm": 0.22334477305412292, + 
"learning_rate": 4.342318141652052e-05, + "loss": 1.7234, + "step": 18107 + }, + { + "epoch": 5.558011049723757, + "grad_norm": 0.20220427215099335, + "learning_rate": 4.341825407664639e-05, + "loss": 1.7639, + "step": 18108 + }, + { + "epoch": 5.558317986494782, + "grad_norm": 0.23658546805381775, + "learning_rate": 4.3413326801816964e-05, + "loss": 1.7505, + "step": 18109 + }, + { + "epoch": 5.558624923265807, + "grad_norm": 0.21157726645469666, + "learning_rate": 4.3408399592080875e-05, + "loss": 1.7655, + "step": 18110 + }, + { + "epoch": 5.558931860036832, + "grad_norm": 0.2139829397201538, + "learning_rate": 4.340347244748687e-05, + "loss": 1.767, + "step": 18111 + }, + { + "epoch": 5.559238796807858, + "grad_norm": 0.17811299860477448, + "learning_rate": 4.339854536808359e-05, + "loss": 1.6629, + "step": 18112 + }, + { + "epoch": 5.559545733578883, + "grad_norm": 0.2005898356437683, + "learning_rate": 4.339361835391977e-05, + "loss": 1.7269, + "step": 18113 + }, + { + "epoch": 5.559852670349908, + "grad_norm": 0.21514086425304413, + "learning_rate": 4.338869140504409e-05, + "loss": 1.7806, + "step": 18114 + }, + { + "epoch": 5.560159607120933, + "grad_norm": 0.23163840174674988, + "learning_rate": 4.338376452150522e-05, + "loss": 1.7259, + "step": 18115 + }, + { + "epoch": 5.560466543891958, + "grad_norm": 0.23657509684562683, + "learning_rate": 4.337883770335189e-05, + "loss": 1.7778, + "step": 18116 + }, + { + "epoch": 5.560773480662983, + "grad_norm": 0.20135201513767242, + "learning_rate": 4.337391095063274e-05, + "loss": 1.7359, + "step": 18117 + }, + { + "epoch": 5.561080417434009, + "grad_norm": 0.22871774435043335, + "learning_rate": 4.33689842633965e-05, + "loss": 1.7658, + "step": 18118 + }, + { + "epoch": 5.561387354205034, + "grad_norm": 0.21755221486091614, + "learning_rate": 4.3364057641691835e-05, + "loss": 1.7408, + "step": 18119 + }, + { + "epoch": 5.5616942909760585, + "grad_norm": 0.215267151594162, + "learning_rate": 
4.335913108556746e-05, + "loss": 1.7175, + "step": 18120 + }, + { + "epoch": 5.562001227747084, + "grad_norm": 0.25724974274635315, + "learning_rate": 4.335420459507202e-05, + "loss": 1.7197, + "step": 18121 + }, + { + "epoch": 5.562308164518109, + "grad_norm": 0.25375521183013916, + "learning_rate": 4.3349278170254254e-05, + "loss": 1.7251, + "step": 18122 + }, + { + "epoch": 5.5626151012891345, + "grad_norm": 0.24768905341625214, + "learning_rate": 4.334435181116279e-05, + "loss": 1.7405, + "step": 18123 + }, + { + "epoch": 5.56292203806016, + "grad_norm": 0.21281081438064575, + "learning_rate": 4.333942551784636e-05, + "loss": 1.7131, + "step": 18124 + }, + { + "epoch": 5.563228974831185, + "grad_norm": 0.2129398137331009, + "learning_rate": 4.333449929035361e-05, + "loss": 1.7049, + "step": 18125 + }, + { + "epoch": 5.56353591160221, + "grad_norm": 0.24582397937774658, + "learning_rate": 4.332957312873328e-05, + "loss": 1.7205, + "step": 18126 + }, + { + "epoch": 5.563842848373235, + "grad_norm": 0.21282973885536194, + "learning_rate": 4.332464703303399e-05, + "loss": 1.7655, + "step": 18127 + }, + { + "epoch": 5.56414978514426, + "grad_norm": 0.2302251160144806, + "learning_rate": 4.331972100330447e-05, + "loss": 1.7597, + "step": 18128 + }, + { + "epoch": 5.564456721915286, + "grad_norm": 0.23453226685523987, + "learning_rate": 4.331479503959336e-05, + "loss": 1.7028, + "step": 18129 + }, + { + "epoch": 5.564763658686311, + "grad_norm": 0.19723562896251678, + "learning_rate": 4.330986914194938e-05, + "loss": 1.7101, + "step": 18130 + }, + { + "epoch": 5.565070595457335, + "grad_norm": 0.22021643817424774, + "learning_rate": 4.33049433104212e-05, + "loss": 1.7123, + "step": 18131 + }, + { + "epoch": 5.565377532228361, + "grad_norm": 0.25540977716445923, + "learning_rate": 4.3300017545057484e-05, + "loss": 1.7392, + "step": 18132 + }, + { + "epoch": 5.565684468999386, + "grad_norm": 0.23482176661491394, + "learning_rate": 4.329509184590693e-05, + "loss": 
1.7175, + "step": 18133 + }, + { + "epoch": 5.565991405770411, + "grad_norm": 0.19537311792373657, + "learning_rate": 4.329016621301819e-05, + "loss": 1.7583, + "step": 18134 + }, + { + "epoch": 5.566298342541437, + "grad_norm": 0.21828842163085938, + "learning_rate": 4.328524064643997e-05, + "loss": 1.7411, + "step": 18135 + }, + { + "epoch": 5.566605279312462, + "grad_norm": 0.24589122831821442, + "learning_rate": 4.328031514622093e-05, + "loss": 1.7769, + "step": 18136 + }, + { + "epoch": 5.5669122160834865, + "grad_norm": 0.20964545011520386, + "learning_rate": 4.327538971240978e-05, + "loss": 1.7743, + "step": 18137 + }, + { + "epoch": 5.567219152854512, + "grad_norm": 0.2210713028907776, + "learning_rate": 4.327046434505514e-05, + "loss": 1.7671, + "step": 18138 + }, + { + "epoch": 5.567526089625537, + "grad_norm": 0.21382687985897064, + "learning_rate": 4.3265539044205736e-05, + "loss": 1.793, + "step": 18139 + }, + { + "epoch": 5.5678330263965625, + "grad_norm": 0.23289678990840912, + "learning_rate": 4.326061380991021e-05, + "loss": 1.738, + "step": 18140 + }, + { + "epoch": 5.568139963167588, + "grad_norm": 0.23789258301258087, + "learning_rate": 4.325568864221725e-05, + "loss": 1.8315, + "step": 18141 + }, + { + "epoch": 5.568446899938612, + "grad_norm": 0.1925022453069687, + "learning_rate": 4.325076354117554e-05, + "loss": 1.6956, + "step": 18142 + }, + { + "epoch": 5.568753836709638, + "grad_norm": 0.22522561252117157, + "learning_rate": 4.324583850683373e-05, + "loss": 1.7957, + "step": 18143 + }, + { + "epoch": 5.569060773480663, + "grad_norm": 0.2787671387195587, + "learning_rate": 4.324091353924049e-05, + "loss": 1.7325, + "step": 18144 + }, + { + "epoch": 5.569367710251688, + "grad_norm": 0.2723194658756256, + "learning_rate": 4.3235988638444536e-05, + "loss": 1.7668, + "step": 18145 + }, + { + "epoch": 5.569674647022714, + "grad_norm": 0.2241704910993576, + "learning_rate": 4.3231063804494484e-05, + "loss": 1.7977, + "step": 18146 + }, + { + 
"epoch": 5.569981583793739, + "grad_norm": 0.2627747356891632, + "learning_rate": 4.322613903743903e-05, + "loss": 1.6775, + "step": 18147 + }, + { + "epoch": 5.570288520564763, + "grad_norm": 0.2644255757331848, + "learning_rate": 4.322121433732686e-05, + "loss": 1.7404, + "step": 18148 + }, + { + "epoch": 5.570595457335789, + "grad_norm": 0.2386743575334549, + "learning_rate": 4.321628970420659e-05, + "loss": 1.7386, + "step": 18149 + }, + { + "epoch": 5.570902394106814, + "grad_norm": 0.22444583475589752, + "learning_rate": 4.3211365138126945e-05, + "loss": 1.7482, + "step": 18150 + }, + { + "epoch": 5.571209330877839, + "grad_norm": 0.21770013868808746, + "learning_rate": 4.3206440639136554e-05, + "loss": 1.7322, + "step": 18151 + }, + { + "epoch": 5.571516267648864, + "grad_norm": 0.22356587648391724, + "learning_rate": 4.320151620728411e-05, + "loss": 1.751, + "step": 18152 + }, + { + "epoch": 5.571823204419889, + "grad_norm": 0.2040669322013855, + "learning_rate": 4.319659184261826e-05, + "loss": 1.712, + "step": 18153 + }, + { + "epoch": 5.5721301411909145, + "grad_norm": 0.20951713621616364, + "learning_rate": 4.319166754518768e-05, + "loss": 1.7308, + "step": 18154 + }, + { + "epoch": 5.57243707796194, + "grad_norm": 0.186195969581604, + "learning_rate": 4.3186743315041025e-05, + "loss": 1.7133, + "step": 18155 + }, + { + "epoch": 5.572744014732965, + "grad_norm": 0.2098865509033203, + "learning_rate": 4.318181915222698e-05, + "loss": 1.7645, + "step": 18156 + }, + { + "epoch": 5.5730509515039905, + "grad_norm": 0.20552097260951996, + "learning_rate": 4.317689505679418e-05, + "loss": 1.7156, + "step": 18157 + }, + { + "epoch": 5.573357888275015, + "grad_norm": 0.22506964206695557, + "learning_rate": 4.3171971028791314e-05, + "loss": 1.7192, + "step": 18158 + }, + { + "epoch": 5.57366482504604, + "grad_norm": 0.2296760082244873, + "learning_rate": 4.316704706826702e-05, + "loss": 1.7534, + "step": 18159 + }, + { + "epoch": 5.573971761817066, + "grad_norm": 
0.20140253007411957, + "learning_rate": 4.316212317526998e-05, + "loss": 1.6906, + "step": 18160 + }, + { + "epoch": 5.574278698588091, + "grad_norm": 0.23313316702842712, + "learning_rate": 4.315719934984884e-05, + "loss": 1.6929, + "step": 18161 + }, + { + "epoch": 5.574585635359116, + "grad_norm": 0.23398169875144958, + "learning_rate": 4.315227559205228e-05, + "loss": 1.7254, + "step": 18162 + }, + { + "epoch": 5.574892572130141, + "grad_norm": 0.20836731791496277, + "learning_rate": 4.314735190192894e-05, + "loss": 1.7335, + "step": 18163 + }, + { + "epoch": 5.575199508901166, + "grad_norm": 0.19899079203605652, + "learning_rate": 4.3142428279527485e-05, + "loss": 1.69, + "step": 18164 + }, + { + "epoch": 5.5755064456721914, + "grad_norm": 0.24623680114746094, + "learning_rate": 4.313750472489657e-05, + "loss": 1.7413, + "step": 18165 + }, + { + "epoch": 5.575813382443217, + "grad_norm": 0.2432616949081421, + "learning_rate": 4.313258123808484e-05, + "loss": 1.7426, + "step": 18166 + }, + { + "epoch": 5.576120319214242, + "grad_norm": 0.22773970663547516, + "learning_rate": 4.3127657819141006e-05, + "loss": 1.7986, + "step": 18167 + }, + { + "epoch": 5.5764272559852675, + "grad_norm": 0.19891540706157684, + "learning_rate": 4.312273446811366e-05, + "loss": 1.7007, + "step": 18168 + }, + { + "epoch": 5.576734192756292, + "grad_norm": 0.23402714729309082, + "learning_rate": 4.311781118505149e-05, + "loss": 1.7774, + "step": 18169 + }, + { + "epoch": 5.577041129527317, + "grad_norm": 0.2248220294713974, + "learning_rate": 4.3112887970003134e-05, + "loss": 1.7079, + "step": 18170 + }, + { + "epoch": 5.577348066298343, + "grad_norm": 0.20901209115982056, + "learning_rate": 4.310796482301726e-05, + "loss": 1.7336, + "step": 18171 + }, + { + "epoch": 5.577655003069368, + "grad_norm": 0.21872754395008087, + "learning_rate": 4.3103041744142516e-05, + "loss": 1.7742, + "step": 18172 + }, + { + "epoch": 5.577961939840393, + "grad_norm": 0.2567403018474579, + 
"learning_rate": 4.309811873342757e-05, + "loss": 1.7894, + "step": 18173 + }, + { + "epoch": 5.578268876611418, + "grad_norm": 0.219998300075531, + "learning_rate": 4.3093195790921035e-05, + "loss": 1.7283, + "step": 18174 + }, + { + "epoch": 5.578575813382443, + "grad_norm": 0.1944747269153595, + "learning_rate": 4.3088272916671614e-05, + "loss": 1.7129, + "step": 18175 + }, + { + "epoch": 5.578882750153468, + "grad_norm": 0.19492141902446747, + "learning_rate": 4.308335011072791e-05, + "loss": 1.7286, + "step": 18176 + }, + { + "epoch": 5.579189686924494, + "grad_norm": 0.22383002936840057, + "learning_rate": 4.3078427373138604e-05, + "loss": 1.733, + "step": 18177 + }, + { + "epoch": 5.579496623695519, + "grad_norm": 0.20238643884658813, + "learning_rate": 4.307350470395232e-05, + "loss": 1.7522, + "step": 18178 + }, + { + "epoch": 5.579803560466544, + "grad_norm": 0.21456125378608704, + "learning_rate": 4.3068582103217755e-05, + "loss": 1.7298, + "step": 18179 + }, + { + "epoch": 5.580110497237569, + "grad_norm": 0.28084230422973633, + "learning_rate": 4.3063659570983514e-05, + "loss": 1.7805, + "step": 18180 + }, + { + "epoch": 5.580417434008594, + "grad_norm": 0.21319706737995148, + "learning_rate": 4.305873710729824e-05, + "loss": 1.6801, + "step": 18181 + }, + { + "epoch": 5.5807243707796195, + "grad_norm": 0.2279660850763321, + "learning_rate": 4.30538147122106e-05, + "loss": 1.752, + "step": 18182 + }, + { + "epoch": 5.581031307550645, + "grad_norm": 0.1958594173192978, + "learning_rate": 4.304889238576922e-05, + "loss": 1.7487, + "step": 18183 + }, + { + "epoch": 5.581338244321669, + "grad_norm": 0.19484321773052216, + "learning_rate": 4.304397012802279e-05, + "loss": 1.7222, + "step": 18184 + }, + { + "epoch": 5.581645181092695, + "grad_norm": 0.19863305985927582, + "learning_rate": 4.3039047939019906e-05, + "loss": 1.7296, + "step": 18185 + }, + { + "epoch": 5.58195211786372, + "grad_norm": 0.18674087524414062, + "learning_rate": 
4.303412581880924e-05, + "loss": 1.6753, + "step": 18186 + }, + { + "epoch": 5.582259054634745, + "grad_norm": 0.22263208031654358, + "learning_rate": 4.302920376743941e-05, + "loss": 1.7431, + "step": 18187 + }, + { + "epoch": 5.582565991405771, + "grad_norm": 0.1926872879266739, + "learning_rate": 4.302428178495909e-05, + "loss": 1.7662, + "step": 18188 + }, + { + "epoch": 5.582872928176796, + "grad_norm": 0.23190459609031677, + "learning_rate": 4.301935987141689e-05, + "loss": 1.7271, + "step": 18189 + }, + { + "epoch": 5.58317986494782, + "grad_norm": 0.30057230591773987, + "learning_rate": 4.301443802686148e-05, + "loss": 1.7957, + "step": 18190 + }, + { + "epoch": 5.583486801718846, + "grad_norm": 0.2520695626735687, + "learning_rate": 4.3009516251341475e-05, + "loss": 1.7501, + "step": 18191 + }, + { + "epoch": 5.583793738489871, + "grad_norm": 0.19143317639827728, + "learning_rate": 4.300459454490555e-05, + "loss": 1.7091, + "step": 18192 + }, + { + "epoch": 5.584100675260896, + "grad_norm": 0.2064475119113922, + "learning_rate": 4.299967290760229e-05, + "loss": 1.6849, + "step": 18193 + }, + { + "epoch": 5.584407612031922, + "grad_norm": 0.3093598484992981, + "learning_rate": 4.299475133948039e-05, + "loss": 1.8479, + "step": 18194 + }, + { + "epoch": 5.584714548802946, + "grad_norm": 0.2875300943851471, + "learning_rate": 4.298982984058845e-05, + "loss": 1.7296, + "step": 18195 + }, + { + "epoch": 5.5850214855739715, + "grad_norm": 0.33194443583488464, + "learning_rate": 4.298490841097514e-05, + "loss": 1.7668, + "step": 18196 + }, + { + "epoch": 5.585328422344997, + "grad_norm": 0.20940829813480377, + "learning_rate": 4.297998705068908e-05, + "loss": 1.7316, + "step": 18197 + }, + { + "epoch": 5.585635359116022, + "grad_norm": 0.32381999492645264, + "learning_rate": 4.297506575977887e-05, + "loss": 1.7212, + "step": 18198 + }, + { + "epoch": 5.5859422958870475, + "grad_norm": 0.31585511565208435, + "learning_rate": 4.29701445382932e-05, + "loss": 1.7695, 
+ "step": 18199 + }, + { + "epoch": 5.586249232658073, + "grad_norm": 0.2272588014602661, + "learning_rate": 4.2965223386280664e-05, + "loss": 1.7105, + "step": 18200 + }, + { + "epoch": 5.586556169429097, + "grad_norm": 0.2949761152267456, + "learning_rate": 4.296030230378993e-05, + "loss": 1.803, + "step": 18201 + }, + { + "epoch": 5.586863106200123, + "grad_norm": 0.20512579381465912, + "learning_rate": 4.29553812908696e-05, + "loss": 1.759, + "step": 18202 + }, + { + "epoch": 5.587170042971148, + "grad_norm": 0.21143598854541779, + "learning_rate": 4.295046034756835e-05, + "loss": 1.7286, + "step": 18203 + }, + { + "epoch": 5.587476979742173, + "grad_norm": 0.22148001194000244, + "learning_rate": 4.294553947393476e-05, + "loss": 1.7258, + "step": 18204 + }, + { + "epoch": 5.587783916513199, + "grad_norm": 0.17245957255363464, + "learning_rate": 4.2940618670017484e-05, + "loss": 1.6863, + "step": 18205 + }, + { + "epoch": 5.588090853284223, + "grad_norm": 0.20260390639305115, + "learning_rate": 4.293569793586515e-05, + "loss": 1.6866, + "step": 18206 + }, + { + "epoch": 5.588397790055248, + "grad_norm": 0.20671936869621277, + "learning_rate": 4.293077727152641e-05, + "loss": 1.7849, + "step": 18207 + }, + { + "epoch": 5.588704726826274, + "grad_norm": 0.21415838599205017, + "learning_rate": 4.292585667704984e-05, + "loss": 1.7279, + "step": 18208 + }, + { + "epoch": 5.589011663597299, + "grad_norm": 0.18668091297149658, + "learning_rate": 4.2920936152484134e-05, + "loss": 1.7087, + "step": 18209 + }, + { + "epoch": 5.589318600368324, + "grad_norm": 0.2253870815038681, + "learning_rate": 4.291601569787786e-05, + "loss": 1.769, + "step": 18210 + }, + { + "epoch": 5.58962553713935, + "grad_norm": 0.22426939010620117, + "learning_rate": 4.291109531327968e-05, + "loss": 1.7382, + "step": 18211 + }, + { + "epoch": 5.589932473910374, + "grad_norm": 0.21552452445030212, + "learning_rate": 4.29061749987382e-05, + "loss": 1.7316, + "step": 18212 + }, + { + "epoch": 
5.5902394106813995, + "grad_norm": 0.2337147295475006, + "learning_rate": 4.290125475430209e-05, + "loss": 1.7836, + "step": 18213 + }, + { + "epoch": 5.590546347452425, + "grad_norm": 0.21780124306678772, + "learning_rate": 4.289633458001992e-05, + "loss": 1.6923, + "step": 18214 + }, + { + "epoch": 5.59085328422345, + "grad_norm": 0.20009608566761017, + "learning_rate": 4.289141447594033e-05, + "loss": 1.719, + "step": 18215 + }, + { + "epoch": 5.5911602209944755, + "grad_norm": 0.18165744841098785, + "learning_rate": 4.288649444211196e-05, + "loss": 1.6825, + "step": 18216 + }, + { + "epoch": 5.5914671577655, + "grad_norm": 0.2244826704263687, + "learning_rate": 4.288157447858341e-05, + "loss": 1.7323, + "step": 18217 + }, + { + "epoch": 5.591774094536525, + "grad_norm": 0.16875946521759033, + "learning_rate": 4.2876654585403325e-05, + "loss": 1.6787, + "step": 18218 + }, + { + "epoch": 5.592081031307551, + "grad_norm": 0.19244243204593658, + "learning_rate": 4.28717347626203e-05, + "loss": 1.7225, + "step": 18219 + }, + { + "epoch": 5.592387968078576, + "grad_norm": 0.21081633865833282, + "learning_rate": 4.286681501028299e-05, + "loss": 1.7063, + "step": 18220 + }, + { + "epoch": 5.592694904849601, + "grad_norm": 0.20926406979560852, + "learning_rate": 4.286189532843997e-05, + "loss": 1.7307, + "step": 18221 + }, + { + "epoch": 5.593001841620627, + "grad_norm": 0.20258775353431702, + "learning_rate": 4.28569757171399e-05, + "loss": 1.6917, + "step": 18222 + }, + { + "epoch": 5.593308778391651, + "grad_norm": 0.21956230700016022, + "learning_rate": 4.285205617643137e-05, + "loss": 1.7127, + "step": 18223 + }, + { + "epoch": 5.593615715162676, + "grad_norm": 0.2071436047554016, + "learning_rate": 4.284713670636303e-05, + "loss": 1.7487, + "step": 18224 + }, + { + "epoch": 5.593922651933702, + "grad_norm": 0.2002478390932083, + "learning_rate": 4.2842217306983464e-05, + "loss": 1.6544, + "step": 18225 + }, + { + "epoch": 5.594229588704727, + "grad_norm": 
0.20691382884979248, + "learning_rate": 4.283729797834132e-05, + "loss": 1.768, + "step": 18226 + }, + { + "epoch": 5.5945365254757515, + "grad_norm": 0.18423563241958618, + "learning_rate": 4.283237872048517e-05, + "loss": 1.7563, + "step": 18227 + }, + { + "epoch": 5.594843462246777, + "grad_norm": 0.23055453598499298, + "learning_rate": 4.2827459533463665e-05, + "loss": 1.8083, + "step": 18228 + }, + { + "epoch": 5.595150399017802, + "grad_norm": 0.20735648274421692, + "learning_rate": 4.2822540417325396e-05, + "loss": 1.7761, + "step": 18229 + }, + { + "epoch": 5.5954573357888275, + "grad_norm": 0.2919909656047821, + "learning_rate": 4.281762137211902e-05, + "loss": 1.7836, + "step": 18230 + }, + { + "epoch": 5.595764272559853, + "grad_norm": 0.22636881470680237, + "learning_rate": 4.2812702397893113e-05, + "loss": 1.7389, + "step": 18231 + }, + { + "epoch": 5.596071209330878, + "grad_norm": 0.23788630962371826, + "learning_rate": 4.280778349469627e-05, + "loss": 1.7536, + "step": 18232 + }, + { + "epoch": 5.596378146101903, + "grad_norm": 0.22089426219463348, + "learning_rate": 4.280286466257715e-05, + "loss": 1.7584, + "step": 18233 + }, + { + "epoch": 5.596685082872928, + "grad_norm": 0.20486171543598175, + "learning_rate": 4.279794590158431e-05, + "loss": 1.7182, + "step": 18234 + }, + { + "epoch": 5.596992019643953, + "grad_norm": 0.2343701422214508, + "learning_rate": 4.2793027211766425e-05, + "loss": 1.751, + "step": 18235 + }, + { + "epoch": 5.597298956414979, + "grad_norm": 0.21734023094177246, + "learning_rate": 4.2788108593172036e-05, + "loss": 1.7084, + "step": 18236 + }, + { + "epoch": 5.597605893186004, + "grad_norm": 0.20593903958797455, + "learning_rate": 4.278319004584982e-05, + "loss": 1.6805, + "step": 18237 + }, + { + "epoch": 5.597912829957028, + "grad_norm": 0.20877878367900848, + "learning_rate": 4.2778271569848324e-05, + "loss": 1.7011, + "step": 18238 + }, + { + "epoch": 5.598219766728054, + "grad_norm": 0.23915995657444, + 
"learning_rate": 4.277335316521619e-05, + "loss": 1.732, + "step": 18239 + }, + { + "epoch": 5.598526703499079, + "grad_norm": 0.24310529232025146, + "learning_rate": 4.2768434832002004e-05, + "loss": 1.7859, + "step": 18240 + }, + { + "epoch": 5.598833640270104, + "grad_norm": 0.23189407587051392, + "learning_rate": 4.27635165702544e-05, + "loss": 1.7237, + "step": 18241 + }, + { + "epoch": 5.59914057704113, + "grad_norm": 0.2708875834941864, + "learning_rate": 4.275859838002195e-05, + "loss": 1.7046, + "step": 18242 + }, + { + "epoch": 5.599447513812155, + "grad_norm": 0.23692840337753296, + "learning_rate": 4.27536802613533e-05, + "loss": 1.8556, + "step": 18243 + }, + { + "epoch": 5.5997544505831796, + "grad_norm": 0.28285983204841614, + "learning_rate": 4.274876221429701e-05, + "loss": 1.6734, + "step": 18244 + }, + { + "epoch": 5.600061387354205, + "grad_norm": 0.20602203905582428, + "learning_rate": 4.27438442389017e-05, + "loss": 1.7113, + "step": 18245 + }, + { + "epoch": 5.60036832412523, + "grad_norm": 0.19719314575195312, + "learning_rate": 4.273892633521598e-05, + "loss": 1.7229, + "step": 18246 + }, + { + "epoch": 5.600675260896256, + "grad_norm": 0.2396705001592636, + "learning_rate": 4.273400850328846e-05, + "loss": 1.6986, + "step": 18247 + }, + { + "epoch": 5.600982197667281, + "grad_norm": 0.1974172443151474, + "learning_rate": 4.2729090743167724e-05, + "loss": 1.7445, + "step": 18248 + }, + { + "epoch": 5.601289134438305, + "grad_norm": 0.2193709760904312, + "learning_rate": 4.272417305490235e-05, + "loss": 1.7657, + "step": 18249 + }, + { + "epoch": 5.601596071209331, + "grad_norm": 0.24138681590557098, + "learning_rate": 4.271925543854098e-05, + "loss": 1.7388, + "step": 18250 + }, + { + "epoch": 5.601903007980356, + "grad_norm": 0.19056223332881927, + "learning_rate": 4.271433789413219e-05, + "loss": 1.6897, + "step": 18251 + }, + { + "epoch": 5.602209944751381, + "grad_norm": 0.20533505082130432, + "learning_rate": 4.270942042172459e-05, + 
"loss": 1.7222, + "step": 18252 + }, + { + "epoch": 5.602516881522407, + "grad_norm": 0.20570224523544312, + "learning_rate": 4.270450302136675e-05, + "loss": 1.8089, + "step": 18253 + }, + { + "epoch": 5.602823818293432, + "grad_norm": 0.2822209298610687, + "learning_rate": 4.269958569310732e-05, + "loss": 1.7523, + "step": 18254 + }, + { + "epoch": 5.6031307550644565, + "grad_norm": 0.2994859218597412, + "learning_rate": 4.269466843699484e-05, + "loss": 1.7538, + "step": 18255 + }, + { + "epoch": 5.603437691835482, + "grad_norm": 0.24851159751415253, + "learning_rate": 4.2689751253077925e-05, + "loss": 1.8162, + "step": 18256 + }, + { + "epoch": 5.603744628606507, + "grad_norm": 0.20387138426303864, + "learning_rate": 4.268483414140517e-05, + "loss": 1.6803, + "step": 18257 + }, + { + "epoch": 5.6040515653775325, + "grad_norm": 0.21620385348796844, + "learning_rate": 4.2679917102025204e-05, + "loss": 1.7236, + "step": 18258 + }, + { + "epoch": 5.604358502148557, + "grad_norm": 0.1925734579563141, + "learning_rate": 4.267500013498655e-05, + "loss": 1.7295, + "step": 18259 + }, + { + "epoch": 5.604665438919582, + "grad_norm": 0.22216086089611053, + "learning_rate": 4.267008324033787e-05, + "loss": 1.6844, + "step": 18260 + }, + { + "epoch": 5.604972375690608, + "grad_norm": 0.20293502509593964, + "learning_rate": 4.26651664181277e-05, + "loss": 1.7065, + "step": 18261 + }, + { + "epoch": 5.605279312461633, + "grad_norm": 0.21269507706165314, + "learning_rate": 4.266024966840466e-05, + "loss": 1.7573, + "step": 18262 + }, + { + "epoch": 5.605586249232658, + "grad_norm": 0.23574227094650269, + "learning_rate": 4.2655332991217334e-05, + "loss": 1.7625, + "step": 18263 + }, + { + "epoch": 5.605893186003684, + "grad_norm": 0.1875103861093521, + "learning_rate": 4.265041638661433e-05, + "loss": 1.7266, + "step": 18264 + }, + { + "epoch": 5.606200122774708, + "grad_norm": 0.20348483324050903, + "learning_rate": 4.264549985464421e-05, + "loss": 1.731, + "step": 18265 + }, 
+ { + "epoch": 5.606507059545733, + "grad_norm": 0.2345927655696869, + "learning_rate": 4.264058339535556e-05, + "loss": 1.7809, + "step": 18266 + }, + { + "epoch": 5.606813996316759, + "grad_norm": 0.21142496168613434, + "learning_rate": 4.2635667008796985e-05, + "loss": 1.7362, + "step": 18267 + }, + { + "epoch": 5.607120933087784, + "grad_norm": 0.19670210778713226, + "learning_rate": 4.263075069501705e-05, + "loss": 1.7029, + "step": 18268 + }, + { + "epoch": 5.607427869858809, + "grad_norm": 0.20985090732574463, + "learning_rate": 4.262583445406439e-05, + "loss": 1.7478, + "step": 18269 + }, + { + "epoch": 5.607734806629834, + "grad_norm": 0.20972272753715515, + "learning_rate": 4.262091828598752e-05, + "loss": 1.7561, + "step": 18270 + }, + { + "epoch": 5.608041743400859, + "grad_norm": 0.20006676018238068, + "learning_rate": 4.261600219083509e-05, + "loss": 1.7584, + "step": 18271 + }, + { + "epoch": 5.6083486801718845, + "grad_norm": 0.21590086817741394, + "learning_rate": 4.2611086168655635e-05, + "loss": 1.7405, + "step": 18272 + }, + { + "epoch": 5.60865561694291, + "grad_norm": 0.19330906867980957, + "learning_rate": 4.260617021949776e-05, + "loss": 1.6797, + "step": 18273 + }, + { + "epoch": 5.608962553713935, + "grad_norm": 0.1955050528049469, + "learning_rate": 4.260125434341004e-05, + "loss": 1.7174, + "step": 18274 + }, + { + "epoch": 5.6092694904849605, + "grad_norm": 0.2117784321308136, + "learning_rate": 4.2596338540441086e-05, + "loss": 1.743, + "step": 18275 + }, + { + "epoch": 5.609576427255985, + "grad_norm": 0.21788950264453888, + "learning_rate": 4.2591422810639425e-05, + "loss": 1.7603, + "step": 18276 + }, + { + "epoch": 5.60988336402701, + "grad_norm": 0.2092670351266861, + "learning_rate": 4.258650715405369e-05, + "loss": 1.7379, + "step": 18277 + }, + { + "epoch": 5.610190300798036, + "grad_norm": 0.1941552758216858, + "learning_rate": 4.2581591570732414e-05, + "loss": 1.7547, + "step": 18278 + }, + { + "epoch": 5.610497237569061, + 
"grad_norm": 0.21306751668453217, + "learning_rate": 4.2576676060724215e-05, + "loss": 1.7284, + "step": 18279 + }, + { + "epoch": 5.610804174340086, + "grad_norm": 0.18618693947792053, + "learning_rate": 4.2571760624077635e-05, + "loss": 1.7268, + "step": 18280 + }, + { + "epoch": 5.611111111111111, + "grad_norm": 0.21530354022979736, + "learning_rate": 4.256684526084129e-05, + "loss": 1.7036, + "step": 18281 + }, + { + "epoch": 5.611418047882136, + "grad_norm": 0.23363792896270752, + "learning_rate": 4.256192997106375e-05, + "loss": 1.7797, + "step": 18282 + }, + { + "epoch": 5.611724984653161, + "grad_norm": 0.1786416620016098, + "learning_rate": 4.2557014754793544e-05, + "loss": 1.7008, + "step": 18283 + }, + { + "epoch": 5.612031921424187, + "grad_norm": 0.2042730301618576, + "learning_rate": 4.25520996120793e-05, + "loss": 1.7667, + "step": 18284 + }, + { + "epoch": 5.612338858195212, + "grad_norm": 0.2275264412164688, + "learning_rate": 4.2547184542969554e-05, + "loss": 1.8277, + "step": 18285 + }, + { + "epoch": 5.612645794966237, + "grad_norm": 0.21252553164958954, + "learning_rate": 4.2542269547512925e-05, + "loss": 1.7272, + "step": 18286 + }, + { + "epoch": 5.612952731737262, + "grad_norm": 0.20384398102760315, + "learning_rate": 4.2537354625757934e-05, + "loss": 1.6707, + "step": 18287 + }, + { + "epoch": 5.613259668508287, + "grad_norm": 0.19805553555488586, + "learning_rate": 4.253243977775321e-05, + "loss": 1.7443, + "step": 18288 + }, + { + "epoch": 5.6135666052793125, + "grad_norm": 0.20447707176208496, + "learning_rate": 4.2527525003547256e-05, + "loss": 1.7392, + "step": 18289 + }, + { + "epoch": 5.613873542050338, + "grad_norm": 0.21025662124156952, + "learning_rate": 4.25226103031887e-05, + "loss": 1.7856, + "step": 18290 + }, + { + "epoch": 5.614180478821363, + "grad_norm": 0.2131013125181198, + "learning_rate": 4.2517695676726085e-05, + "loss": 1.7521, + "step": 18291 + }, + { + "epoch": 5.614487415592388, + "grad_norm": 0.2511558532714844, 
+ "learning_rate": 4.2512781124208e-05, + "loss": 1.6873, + "step": 18292 + }, + { + "epoch": 5.614794352363413, + "grad_norm": 0.19668610394001007, + "learning_rate": 4.2507866645682984e-05, + "loss": 1.6808, + "step": 18293 + }, + { + "epoch": 5.615101289134438, + "grad_norm": 0.22313621640205383, + "learning_rate": 4.2502952241199637e-05, + "loss": 1.7794, + "step": 18294 + }, + { + "epoch": 5.615408225905464, + "grad_norm": 0.2053089439868927, + "learning_rate": 4.249803791080649e-05, + "loss": 1.7405, + "step": 18295 + }, + { + "epoch": 5.615715162676489, + "grad_norm": 0.2052931934595108, + "learning_rate": 4.249312365455215e-05, + "loss": 1.6698, + "step": 18296 + }, + { + "epoch": 5.616022099447514, + "grad_norm": 0.223783478140831, + "learning_rate": 4.248820947248515e-05, + "loss": 1.7696, + "step": 18297 + }, + { + "epoch": 5.616329036218539, + "grad_norm": 0.3424001932144165, + "learning_rate": 4.248329536465407e-05, + "loss": 1.7724, + "step": 18298 + }, + { + "epoch": 5.616635972989564, + "grad_norm": 0.25015103816986084, + "learning_rate": 4.247838133110749e-05, + "loss": 1.7188, + "step": 18299 + }, + { + "epoch": 5.616942909760589, + "grad_norm": 0.239765465259552, + "learning_rate": 4.247346737189392e-05, + "loss": 1.695, + "step": 18300 + }, + { + "epoch": 5.617249846531615, + "grad_norm": 0.42259401082992554, + "learning_rate": 4.246855348706197e-05, + "loss": 1.6882, + "step": 18301 + }, + { + "epoch": 5.617556783302639, + "grad_norm": 0.2985959053039551, + "learning_rate": 4.246363967666018e-05, + "loss": 1.7236, + "step": 18302 + }, + { + "epoch": 5.6178637200736645, + "grad_norm": 0.22437956929206848, + "learning_rate": 4.245872594073714e-05, + "loss": 1.7158, + "step": 18303 + }, + { + "epoch": 5.61817065684469, + "grad_norm": 0.3165835440158844, + "learning_rate": 4.245381227934138e-05, + "loss": 1.7543, + "step": 18304 + }, + { + "epoch": 5.618477593615715, + "grad_norm": 0.2565564513206482, + "learning_rate": 4.244889869252148e-05, + 
"loss": 1.7863, + "step": 18305 + }, + { + "epoch": 5.6187845303867405, + "grad_norm": 0.25741446018218994, + "learning_rate": 4.244398518032597e-05, + "loss": 1.721, + "step": 18306 + }, + { + "epoch": 5.619091467157766, + "grad_norm": 0.26492297649383545, + "learning_rate": 4.2439071742803435e-05, + "loss": 1.7697, + "step": 18307 + }, + { + "epoch": 5.61939840392879, + "grad_norm": 0.2086823433637619, + "learning_rate": 4.243415838000243e-05, + "loss": 1.7072, + "step": 18308 + }, + { + "epoch": 5.619705340699816, + "grad_norm": 0.26784422993659973, + "learning_rate": 4.24292450919715e-05, + "loss": 1.7826, + "step": 18309 + }, + { + "epoch": 5.620012277470841, + "grad_norm": 0.21774251759052277, + "learning_rate": 4.242433187875921e-05, + "loss": 1.7204, + "step": 18310 + }, + { + "epoch": 5.620319214241866, + "grad_norm": 0.29547446966171265, + "learning_rate": 4.241941874041412e-05, + "loss": 1.7303, + "step": 18311 + }, + { + "epoch": 5.620626151012892, + "grad_norm": 0.20278988778591156, + "learning_rate": 4.241450567698476e-05, + "loss": 1.692, + "step": 18312 + }, + { + "epoch": 5.620933087783916, + "grad_norm": 0.2084289938211441, + "learning_rate": 4.240959268851971e-05, + "loss": 1.7069, + "step": 18313 + }, + { + "epoch": 5.621240024554941, + "grad_norm": 0.19901904463768005, + "learning_rate": 4.240467977506752e-05, + "loss": 1.6798, + "step": 18314 + }, + { + "epoch": 5.621546961325967, + "grad_norm": 0.24629411101341248, + "learning_rate": 4.2399766936676735e-05, + "loss": 1.775, + "step": 18315 + }, + { + "epoch": 5.621853898096992, + "grad_norm": 0.2532403767108917, + "learning_rate": 4.239485417339591e-05, + "loss": 1.7669, + "step": 18316 + }, + { + "epoch": 5.622160834868017, + "grad_norm": 0.22495722770690918, + "learning_rate": 4.2389941485273576e-05, + "loss": 1.7772, + "step": 18317 + }, + { + "epoch": 5.622467771639043, + "grad_norm": 0.2789733111858368, + "learning_rate": 4.2385028872358316e-05, + "loss": 1.751, + "step": 18318 + }, + { 
+ "epoch": 5.622774708410067, + "grad_norm": 0.2266954481601715, + "learning_rate": 4.238011633469866e-05, + "loss": 1.7213, + "step": 18319 + }, + { + "epoch": 5.6230816451810925, + "grad_norm": 0.2163502722978592, + "learning_rate": 4.237520387234316e-05, + "loss": 1.7781, + "step": 18320 + }, + { + "epoch": 5.623388581952118, + "grad_norm": 0.25249144434928894, + "learning_rate": 4.237029148534036e-05, + "loss": 1.7293, + "step": 18321 + }, + { + "epoch": 5.623695518723143, + "grad_norm": 0.2320011854171753, + "learning_rate": 4.2365379173738826e-05, + "loss": 1.7909, + "step": 18322 + }, + { + "epoch": 5.6240024554941686, + "grad_norm": 0.22074681520462036, + "learning_rate": 4.2360466937587074e-05, + "loss": 1.743, + "step": 18323 + }, + { + "epoch": 5.624309392265193, + "grad_norm": 0.20864775776863098, + "learning_rate": 4.235555477693368e-05, + "loss": 1.726, + "step": 18324 + }, + { + "epoch": 5.624616329036218, + "grad_norm": 0.24547792971134186, + "learning_rate": 4.235064269182716e-05, + "loss": 1.7646, + "step": 18325 + }, + { + "epoch": 5.624923265807244, + "grad_norm": 0.29965806007385254, + "learning_rate": 4.234573068231607e-05, + "loss": 1.7789, + "step": 18326 + }, + { + "epoch": 5.625230202578269, + "grad_norm": 0.20844583213329315, + "learning_rate": 4.234081874844896e-05, + "loss": 1.7007, + "step": 18327 + }, + { + "epoch": 5.625537139349294, + "grad_norm": 0.2455398142337799, + "learning_rate": 4.2335906890274385e-05, + "loss": 1.7094, + "step": 18328 + }, + { + "epoch": 5.62584407612032, + "grad_norm": 0.17839518189430237, + "learning_rate": 4.233099510784085e-05, + "loss": 1.6849, + "step": 18329 + }, + { + "epoch": 5.626151012891344, + "grad_norm": 0.20219004154205322, + "learning_rate": 4.232608340119693e-05, + "loss": 1.716, + "step": 18330 + }, + { + "epoch": 5.6264579496623695, + "grad_norm": 0.23570619523525238, + "learning_rate": 4.232117177039114e-05, + "loss": 1.7622, + "step": 18331 + }, + { + "epoch": 5.626764886433395, + 
"grad_norm": 0.23534397780895233, + "learning_rate": 4.231626021547204e-05, + "loss": 1.7758, + "step": 18332 + }, + { + "epoch": 5.62707182320442, + "grad_norm": 0.2177352011203766, + "learning_rate": 4.231134873648817e-05, + "loss": 1.7102, + "step": 18333 + }, + { + "epoch": 5.627378759975445, + "grad_norm": 0.22886058688163757, + "learning_rate": 4.230643733348803e-05, + "loss": 1.7766, + "step": 18334 + }, + { + "epoch": 5.62768569674647, + "grad_norm": 0.20723696053028107, + "learning_rate": 4.2301526006520215e-05, + "loss": 1.7287, + "step": 18335 + }, + { + "epoch": 5.627992633517495, + "grad_norm": 0.18612104654312134, + "learning_rate": 4.229661475563321e-05, + "loss": 1.7255, + "step": 18336 + }, + { + "epoch": 5.628299570288521, + "grad_norm": 0.26456236839294434, + "learning_rate": 4.229170358087558e-05, + "loss": 1.7388, + "step": 18337 + }, + { + "epoch": 5.628606507059546, + "grad_norm": 0.25253555178642273, + "learning_rate": 4.2286792482295845e-05, + "loss": 1.7031, + "step": 18338 + }, + { + "epoch": 5.628913443830571, + "grad_norm": 0.23093348741531372, + "learning_rate": 4.228188145994257e-05, + "loss": 1.8032, + "step": 18339 + }, + { + "epoch": 5.629220380601596, + "grad_norm": 0.24142487347126007, + "learning_rate": 4.227697051386424e-05, + "loss": 1.6621, + "step": 18340 + }, + { + "epoch": 5.629527317372621, + "grad_norm": 0.2883392572402954, + "learning_rate": 4.227205964410944e-05, + "loss": 1.7125, + "step": 18341 + }, + { + "epoch": 5.629834254143646, + "grad_norm": 0.22670713067054749, + "learning_rate": 4.226714885072665e-05, + "loss": 1.7659, + "step": 18342 + }, + { + "epoch": 5.630141190914672, + "grad_norm": 0.2795337438583374, + "learning_rate": 4.226223813376444e-05, + "loss": 1.7559, + "step": 18343 + }, + { + "epoch": 5.630448127685697, + "grad_norm": 0.2513083219528198, + "learning_rate": 4.225732749327132e-05, + "loss": 1.6969, + "step": 18344 + }, + { + "epoch": 5.6307550644567215, + "grad_norm": 0.24588467180728912, + 
"learning_rate": 4.225241692929585e-05, + "loss": 1.7724, + "step": 18345 + }, + { + "epoch": 5.631062001227747, + "grad_norm": 0.41726353764533997, + "learning_rate": 4.224750644188651e-05, + "loss": 1.7308, + "step": 18346 + }, + { + "epoch": 5.631368937998772, + "grad_norm": 0.2512385845184326, + "learning_rate": 4.2242596031091886e-05, + "loss": 1.7068, + "step": 18347 + }, + { + "epoch": 5.6316758747697975, + "grad_norm": 0.3077464997768402, + "learning_rate": 4.223768569696044e-05, + "loss": 1.7383, + "step": 18348 + }, + { + "epoch": 5.631982811540823, + "grad_norm": 0.3460720479488373, + "learning_rate": 4.2232775439540756e-05, + "loss": 1.7317, + "step": 18349 + }, + { + "epoch": 5.632289748311848, + "grad_norm": 0.24827539920806885, + "learning_rate": 4.222786525888134e-05, + "loss": 1.6871, + "step": 18350 + }, + { + "epoch": 5.632596685082873, + "grad_norm": 0.24851584434509277, + "learning_rate": 4.22229551550307e-05, + "loss": 1.7058, + "step": 18351 + }, + { + "epoch": 5.632903621853898, + "grad_norm": 0.31132519245147705, + "learning_rate": 4.2218045128037396e-05, + "loss": 1.7523, + "step": 18352 + }, + { + "epoch": 5.633210558624923, + "grad_norm": 0.3104027807712555, + "learning_rate": 4.2213135177949906e-05, + "loss": 1.7669, + "step": 18353 + }, + { + "epoch": 5.633517495395949, + "grad_norm": 0.31351104378700256, + "learning_rate": 4.2208225304816795e-05, + "loss": 1.7031, + "step": 18354 + }, + { + "epoch": 5.633824432166974, + "grad_norm": 0.3217851221561432, + "learning_rate": 4.2203315508686555e-05, + "loss": 1.7694, + "step": 18355 + }, + { + "epoch": 5.634131368937998, + "grad_norm": 0.22287796437740326, + "learning_rate": 4.2198405789607745e-05, + "loss": 1.7742, + "step": 18356 + }, + { + "epoch": 5.634438305709024, + "grad_norm": 0.20288340747356415, + "learning_rate": 4.219349614762883e-05, + "loss": 1.7113, + "step": 18357 + }, + { + "epoch": 5.634745242480049, + "grad_norm": 0.19823449850082397, + "learning_rate": 
4.218858658279839e-05, + "loss": 1.7433, + "step": 18358 + }, + { + "epoch": 5.635052179251074, + "grad_norm": 0.2756347358226776, + "learning_rate": 4.2183677095164895e-05, + "loss": 1.8278, + "step": 18359 + }, + { + "epoch": 5.6353591160221, + "grad_norm": 0.2303706556558609, + "learning_rate": 4.2178767684776895e-05, + "loss": 1.6943, + "step": 18360 + }, + { + "epoch": 5.635666052793125, + "grad_norm": 0.25089216232299805, + "learning_rate": 4.217385835168288e-05, + "loss": 1.6562, + "step": 18361 + }, + { + "epoch": 5.6359729895641495, + "grad_norm": 0.3013486862182617, + "learning_rate": 4.216894909593141e-05, + "loss": 1.7323, + "step": 18362 + }, + { + "epoch": 5.636279926335175, + "grad_norm": 0.19471928477287292, + "learning_rate": 4.2164039917570956e-05, + "loss": 1.7301, + "step": 18363 + }, + { + "epoch": 5.6365868631062, + "grad_norm": 0.3257733881473541, + "learning_rate": 4.2159130816650075e-05, + "loss": 1.7522, + "step": 18364 + }, + { + "epoch": 5.6368937998772255, + "grad_norm": 0.3065868020057678, + "learning_rate": 4.215422179321723e-05, + "loss": 1.7077, + "step": 18365 + }, + { + "epoch": 5.637200736648251, + "grad_norm": 0.20643819868564606, + "learning_rate": 4.214931284732098e-05, + "loss": 1.8033, + "step": 18366 + }, + { + "epoch": 5.637507673419275, + "grad_norm": 0.23551981151103973, + "learning_rate": 4.2144403979009826e-05, + "loss": 1.7391, + "step": 18367 + }, + { + "epoch": 5.637814610190301, + "grad_norm": 0.20602314174175262, + "learning_rate": 4.2139495188332265e-05, + "loss": 1.7593, + "step": 18368 + }, + { + "epoch": 5.638121546961326, + "grad_norm": 0.27911239862442017, + "learning_rate": 4.2134586475336834e-05, + "loss": 1.7212, + "step": 18369 + }, + { + "epoch": 5.638428483732351, + "grad_norm": 0.2700496017932892, + "learning_rate": 4.212967784007201e-05, + "loss": 1.7755, + "step": 18370 + }, + { + "epoch": 5.638735420503377, + "grad_norm": 0.24988985061645508, + "learning_rate": 4.2124769282586334e-05, + "loss": 
1.7364, + "step": 18371 + }, + { + "epoch": 5.639042357274402, + "grad_norm": 0.20491284132003784, + "learning_rate": 4.211986080292829e-05, + "loss": 1.7477, + "step": 18372 + }, + { + "epoch": 5.639349294045426, + "grad_norm": 0.24953459203243256, + "learning_rate": 4.211495240114643e-05, + "loss": 1.7712, + "step": 18373 + }, + { + "epoch": 5.639656230816452, + "grad_norm": 0.2028491199016571, + "learning_rate": 4.2110044077289204e-05, + "loss": 1.701, + "step": 18374 + }, + { + "epoch": 5.639963167587477, + "grad_norm": 0.22320568561553955, + "learning_rate": 4.210513583140517e-05, + "loss": 1.7818, + "step": 18375 + }, + { + "epoch": 5.640270104358502, + "grad_norm": 0.22680947184562683, + "learning_rate": 4.210022766354278e-05, + "loss": 1.7631, + "step": 18376 + }, + { + "epoch": 5.640577041129527, + "grad_norm": 0.20724014937877655, + "learning_rate": 4.2095319573750596e-05, + "loss": 1.7757, + "step": 18377 + }, + { + "epoch": 5.640883977900552, + "grad_norm": 0.21785953640937805, + "learning_rate": 4.209041156207708e-05, + "loss": 1.7161, + "step": 18378 + }, + { + "epoch": 5.6411909146715775, + "grad_norm": 0.21751803159713745, + "learning_rate": 4.208550362857078e-05, + "loss": 1.7449, + "step": 18379 + }, + { + "epoch": 5.641497851442603, + "grad_norm": 0.1765962839126587, + "learning_rate": 4.208059577328014e-05, + "loss": 1.7191, + "step": 18380 + }, + { + "epoch": 5.641804788213628, + "grad_norm": 0.22720913589000702, + "learning_rate": 4.2075687996253724e-05, + "loss": 1.7037, + "step": 18381 + }, + { + "epoch": 5.6421117249846535, + "grad_norm": 0.23589655756950378, + "learning_rate": 4.2070780297539976e-05, + "loss": 1.8147, + "step": 18382 + }, + { + "epoch": 5.642418661755678, + "grad_norm": 0.21187056601047516, + "learning_rate": 4.2065872677187435e-05, + "loss": 1.7655, + "step": 18383 + }, + { + "epoch": 5.642725598526703, + "grad_norm": 0.24153946340084076, + "learning_rate": 4.2060965135244606e-05, + "loss": 1.7841, + "step": 18384 + }, + 
{ + "epoch": 5.643032535297729, + "grad_norm": 0.2059229612350464, + "learning_rate": 4.205605767175995e-05, + "loss": 1.6718, + "step": 18385 + }, + { + "epoch": 5.643339472068754, + "grad_norm": 0.20235973596572876, + "learning_rate": 4.205115028678201e-05, + "loss": 1.6931, + "step": 18386 + }, + { + "epoch": 5.643646408839779, + "grad_norm": 0.25149911642074585, + "learning_rate": 4.204624298035924e-05, + "loss": 1.7465, + "step": 18387 + }, + { + "epoch": 5.643953345610804, + "grad_norm": 0.2050812691450119, + "learning_rate": 4.204133575254017e-05, + "loss": 1.7147, + "step": 18388 + }, + { + "epoch": 5.644260282381829, + "grad_norm": 0.20906420052051544, + "learning_rate": 4.2036428603373274e-05, + "loss": 1.6762, + "step": 18389 + }, + { + "epoch": 5.644567219152854, + "grad_norm": 0.20150595903396606, + "learning_rate": 4.2031521532907075e-05, + "loss": 1.678, + "step": 18390 + }, + { + "epoch": 5.64487415592388, + "grad_norm": 0.2141568511724472, + "learning_rate": 4.202661454119004e-05, + "loss": 1.7274, + "step": 18391 + }, + { + "epoch": 5.645181092694905, + "grad_norm": 0.2641741931438446, + "learning_rate": 4.202170762827069e-05, + "loss": 1.7975, + "step": 18392 + }, + { + "epoch": 5.64548802946593, + "grad_norm": 0.22928468883037567, + "learning_rate": 4.201680079419747e-05, + "loss": 1.7687, + "step": 18393 + }, + { + "epoch": 5.645794966236955, + "grad_norm": 0.22713731229305267, + "learning_rate": 4.2011894039018925e-05, + "loss": 1.7475, + "step": 18394 + }, + { + "epoch": 5.64610190300798, + "grad_norm": 0.25602981448173523, + "learning_rate": 4.200698736278351e-05, + "loss": 1.7356, + "step": 18395 + }, + { + "epoch": 5.6464088397790055, + "grad_norm": 0.2619759738445282, + "learning_rate": 4.200208076553975e-05, + "loss": 1.7334, + "step": 18396 + }, + { + "epoch": 5.646715776550031, + "grad_norm": 0.24756783246994019, + "learning_rate": 4.19971742473361e-05, + "loss": 1.7253, + "step": 18397 + }, + { + "epoch": 5.647022713321056, + 
"grad_norm": 0.2068249136209488, + "learning_rate": 4.199226780822109e-05, + "loss": 1.7246, + "step": 18398 + }, + { + "epoch": 5.647329650092081, + "grad_norm": 0.23219087719917297, + "learning_rate": 4.1987361448243165e-05, + "loss": 1.7388, + "step": 18399 + }, + { + "epoch": 5.647636586863106, + "grad_norm": 0.2051403522491455, + "learning_rate": 4.198245516745082e-05, + "loss": 1.7775, + "step": 18400 + }, + { + "epoch": 5.647943523634131, + "grad_norm": 0.26408639550209045, + "learning_rate": 4.1977548965892575e-05, + "loss": 1.8069, + "step": 18401 + }, + { + "epoch": 5.648250460405157, + "grad_norm": 0.2104891538619995, + "learning_rate": 4.197264284361687e-05, + "loss": 1.7335, + "step": 18402 + }, + { + "epoch": 5.648557397176182, + "grad_norm": 0.23963849246501923, + "learning_rate": 4.196773680067224e-05, + "loss": 1.7254, + "step": 18403 + }, + { + "epoch": 5.648864333947207, + "grad_norm": 0.2770128846168518, + "learning_rate": 4.1962830837107117e-05, + "loss": 1.7848, + "step": 18404 + }, + { + "epoch": 5.649171270718232, + "grad_norm": 0.23342710733413696, + "learning_rate": 4.195792495297002e-05, + "loss": 1.7818, + "step": 18405 + }, + { + "epoch": 5.649478207489257, + "grad_norm": 0.23835061490535736, + "learning_rate": 4.195301914830941e-05, + "loss": 1.7453, + "step": 18406 + }, + { + "epoch": 5.649785144260282, + "grad_norm": 0.21896767616271973, + "learning_rate": 4.194811342317381e-05, + "loss": 1.7205, + "step": 18407 + }, + { + "epoch": 5.650092081031308, + "grad_norm": 0.20222818851470947, + "learning_rate": 4.1943207777611646e-05, + "loss": 1.6833, + "step": 18408 + }, + { + "epoch": 5.650399017802332, + "grad_norm": 0.2182089239358902, + "learning_rate": 4.193830221167146e-05, + "loss": 1.7296, + "step": 18409 + }, + { + "epoch": 5.650705954573358, + "grad_norm": 0.19981688261032104, + "learning_rate": 4.1933396725401655e-05, + "loss": 1.7327, + "step": 18410 + }, + { + "epoch": 5.651012891344383, + "grad_norm": 0.23925067484378815, + 
"learning_rate": 4.192849131885077e-05, + "loss": 1.7545, + "step": 18411 + }, + { + "epoch": 5.651319828115408, + "grad_norm": 0.21967993676662445, + "learning_rate": 4.192358599206725e-05, + "loss": 1.6973, + "step": 18412 + }, + { + "epoch": 5.651626764886434, + "grad_norm": 0.2273840606212616, + "learning_rate": 4.1918680745099614e-05, + "loss": 1.8229, + "step": 18413 + }, + { + "epoch": 5.651933701657459, + "grad_norm": 0.26950231194496155, + "learning_rate": 4.1913775577996286e-05, + "loss": 1.7666, + "step": 18414 + }, + { + "epoch": 5.652240638428484, + "grad_norm": 0.26608848571777344, + "learning_rate": 4.190887049080579e-05, + "loss": 1.8279, + "step": 18415 + }, + { + "epoch": 5.652547575199509, + "grad_norm": 0.20856785774230957, + "learning_rate": 4.190396548357658e-05, + "loss": 1.7224, + "step": 18416 + }, + { + "epoch": 5.652854511970534, + "grad_norm": 0.2894255816936493, + "learning_rate": 4.18990605563571e-05, + "loss": 1.7308, + "step": 18417 + }, + { + "epoch": 5.653161448741559, + "grad_norm": 0.2047591209411621, + "learning_rate": 4.189415570919588e-05, + "loss": 1.758, + "step": 18418 + }, + { + "epoch": 5.653468385512585, + "grad_norm": 0.37161269783973694, + "learning_rate": 4.1889250942141346e-05, + "loss": 1.7926, + "step": 18419 + }, + { + "epoch": 5.653775322283609, + "grad_norm": 0.37338340282440186, + "learning_rate": 4.1884346255242e-05, + "loss": 1.7491, + "step": 18420 + }, + { + "epoch": 5.6540822590546345, + "grad_norm": 0.24279838800430298, + "learning_rate": 4.187944164854629e-05, + "loss": 1.7103, + "step": 18421 + }, + { + "epoch": 5.65438919582566, + "grad_norm": 0.219639852643013, + "learning_rate": 4.18745371221027e-05, + "loss": 1.7824, + "step": 18422 + }, + { + "epoch": 5.654696132596685, + "grad_norm": 0.22248409688472748, + "learning_rate": 4.186963267595969e-05, + "loss": 1.8098, + "step": 18423 + }, + { + "epoch": 5.6550030693677105, + "grad_norm": 0.2115657478570938, + "learning_rate": 4.1864728310165755e-05, + 
"loss": 1.72, + "step": 18424 + }, + { + "epoch": 5.655310006138736, + "grad_norm": 0.19723005592823029, + "learning_rate": 4.1859824024769325e-05, + "loss": 1.6818, + "step": 18425 + }, + { + "epoch": 5.65561694290976, + "grad_norm": 0.1828317642211914, + "learning_rate": 4.185491981981891e-05, + "loss": 1.7243, + "step": 18426 + }, + { + "epoch": 5.655923879680786, + "grad_norm": 0.271781861782074, + "learning_rate": 4.185001569536292e-05, + "loss": 1.7688, + "step": 18427 + }, + { + "epoch": 5.656230816451811, + "grad_norm": 0.3140811324119568, + "learning_rate": 4.184511165144986e-05, + "loss": 1.7319, + "step": 18428 + }, + { + "epoch": 5.656537753222836, + "grad_norm": 0.20013047754764557, + "learning_rate": 4.184020768812818e-05, + "loss": 1.7104, + "step": 18429 + }, + { + "epoch": 5.656844689993862, + "grad_norm": 0.2615044414997101, + "learning_rate": 4.183530380544638e-05, + "loss": 1.7314, + "step": 18430 + }, + { + "epoch": 5.657151626764886, + "grad_norm": 0.2645856440067291, + "learning_rate": 4.183040000345287e-05, + "loss": 1.7431, + "step": 18431 + }, + { + "epoch": 5.657458563535911, + "grad_norm": 0.1916145384311676, + "learning_rate": 4.182549628219615e-05, + "loss": 1.7013, + "step": 18432 + }, + { + "epoch": 5.657765500306937, + "grad_norm": 0.2647114396095276, + "learning_rate": 4.182059264172466e-05, + "loss": 1.7278, + "step": 18433 + }, + { + "epoch": 5.658072437077962, + "grad_norm": 0.20201756060123444, + "learning_rate": 4.1815689082086854e-05, + "loss": 1.7065, + "step": 18434 + }, + { + "epoch": 5.658379373848987, + "grad_norm": 0.23892022669315338, + "learning_rate": 4.181078560333123e-05, + "loss": 1.7365, + "step": 18435 + }, + { + "epoch": 5.658686310620013, + "grad_norm": 0.3125975728034973, + "learning_rate": 4.18058822055062e-05, + "loss": 1.7152, + "step": 18436 + }, + { + "epoch": 5.658993247391037, + "grad_norm": 0.18924804031848907, + "learning_rate": 4.180097888866027e-05, + "loss": 1.7763, + "step": 18437 + }, + { + 
"epoch": 5.6593001841620625, + "grad_norm": 0.28476929664611816, + "learning_rate": 4.1796075652841845e-05, + "loss": 1.7517, + "step": 18438 + }, + { + "epoch": 5.659607120933088, + "grad_norm": 0.30616337060928345, + "learning_rate": 4.1791172498099416e-05, + "loss": 1.7446, + "step": 18439 + }, + { + "epoch": 5.659914057704113, + "grad_norm": 0.3219330608844757, + "learning_rate": 4.1786269424481426e-05, + "loss": 1.8374, + "step": 18440 + }, + { + "epoch": 5.6602209944751385, + "grad_norm": 0.34074151515960693, + "learning_rate": 4.1781366432036364e-05, + "loss": 1.7915, + "step": 18441 + }, + { + "epoch": 5.660527931246163, + "grad_norm": 0.2321610003709793, + "learning_rate": 4.177646352081263e-05, + "loss": 1.7361, + "step": 18442 + }, + { + "epoch": 5.660834868017188, + "grad_norm": 0.34283575415611267, + "learning_rate": 4.1771560690858716e-05, + "loss": 1.6859, + "step": 18443 + }, + { + "epoch": 5.661141804788214, + "grad_norm": 0.32274290919303894, + "learning_rate": 4.1766657942223055e-05, + "loss": 1.7376, + "step": 18444 + }, + { + "epoch": 5.661448741559239, + "grad_norm": 0.23960906267166138, + "learning_rate": 4.1761755274954105e-05, + "loss": 1.7198, + "step": 18445 + }, + { + "epoch": 5.661755678330264, + "grad_norm": 0.2622305154800415, + "learning_rate": 4.175685268910031e-05, + "loss": 1.6997, + "step": 18446 + }, + { + "epoch": 5.66206261510129, + "grad_norm": 0.19836951792240143, + "learning_rate": 4.1751950184710157e-05, + "loss": 1.6612, + "step": 18447 + }, + { + "epoch": 5.662369551872314, + "grad_norm": 0.29541507363319397, + "learning_rate": 4.174704776183204e-05, + "loss": 1.7606, + "step": 18448 + }, + { + "epoch": 5.662676488643339, + "grad_norm": 0.21632203459739685, + "learning_rate": 4.174214542051445e-05, + "loss": 1.7108, + "step": 18449 + }, + { + "epoch": 5.662983425414365, + "grad_norm": 0.2851164638996124, + "learning_rate": 4.173724316080582e-05, + "loss": 1.747, + "step": 18450 + }, + { + "epoch": 5.66329036218539, + 
"grad_norm": 0.30293309688568115, + "learning_rate": 4.173234098275458e-05, + "loss": 1.7549, + "step": 18451 + }, + { + "epoch": 5.6635972989564145, + "grad_norm": 0.2131963074207306, + "learning_rate": 4.172743888640921e-05, + "loss": 1.7804, + "step": 18452 + }, + { + "epoch": 5.66390423572744, + "grad_norm": 0.234910249710083, + "learning_rate": 4.172253687181812e-05, + "loss": 1.7149, + "step": 18453 + }, + { + "epoch": 5.664211172498465, + "grad_norm": 0.21238654851913452, + "learning_rate": 4.171763493902979e-05, + "loss": 1.7272, + "step": 18454 + }, + { + "epoch": 5.6645181092694905, + "grad_norm": 0.20571236312389374, + "learning_rate": 4.171273308809263e-05, + "loss": 1.713, + "step": 18455 + }, + { + "epoch": 5.664825046040516, + "grad_norm": 0.24867361783981323, + "learning_rate": 4.1707831319055104e-05, + "loss": 1.682, + "step": 18456 + }, + { + "epoch": 5.665131982811541, + "grad_norm": 0.20556440949440002, + "learning_rate": 4.170292963196564e-05, + "loss": 1.7126, + "step": 18457 + }, + { + "epoch": 5.665438919582566, + "grad_norm": 0.26431065797805786, + "learning_rate": 4.169802802687271e-05, + "loss": 1.8142, + "step": 18458 + }, + { + "epoch": 5.665745856353591, + "grad_norm": 0.26041486859321594, + "learning_rate": 4.169312650382471e-05, + "loss": 1.7206, + "step": 18459 + }, + { + "epoch": 5.666052793124616, + "grad_norm": 0.2190525084733963, + "learning_rate": 4.1688225062870126e-05, + "loss": 1.787, + "step": 18460 + }, + { + "epoch": 5.666359729895642, + "grad_norm": 0.24726425111293793, + "learning_rate": 4.1683323704057354e-05, + "loss": 1.7677, + "step": 18461 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.22206442058086395, + "learning_rate": 4.167842242743486e-05, + "loss": 1.73, + "step": 18462 + }, + { + "epoch": 5.666973603437691, + "grad_norm": 0.22501195967197418, + "learning_rate": 4.167352123305108e-05, + "loss": 1.7213, + "step": 18463 + }, + { + "epoch": 5.667280540208717, + "grad_norm": 0.26164770126342773, + 
"learning_rate": 4.166862012095443e-05, + "loss": 1.7839, + "step": 18464 + }, + { + "epoch": 5.667587476979742, + "grad_norm": 0.19480809569358826, + "learning_rate": 4.166371909119336e-05, + "loss": 1.7562, + "step": 18465 + }, + { + "epoch": 5.667894413750767, + "grad_norm": 0.26677292585372925, + "learning_rate": 4.165881814381632e-05, + "loss": 1.776, + "step": 18466 + }, + { + "epoch": 5.668201350521793, + "grad_norm": 0.22019581496715546, + "learning_rate": 4.165391727887172e-05, + "loss": 1.7575, + "step": 18467 + }, + { + "epoch": 5.668508287292818, + "grad_norm": 0.23851899802684784, + "learning_rate": 4.1649016496407986e-05, + "loss": 1.7346, + "step": 18468 + }, + { + "epoch": 5.6688152240638425, + "grad_norm": 0.3118130564689636, + "learning_rate": 4.1644115796473596e-05, + "loss": 1.7808, + "step": 18469 + }, + { + "epoch": 5.669122160834868, + "grad_norm": 0.22783879935741425, + "learning_rate": 4.163921517911692e-05, + "loss": 1.831, + "step": 18470 + }, + { + "epoch": 5.669429097605893, + "grad_norm": 0.2203773707151413, + "learning_rate": 4.163431464438645e-05, + "loss": 1.7034, + "step": 18471 + }, + { + "epoch": 5.6697360343769185, + "grad_norm": 0.21838103234767914, + "learning_rate": 4.162941419233056e-05, + "loss": 1.7553, + "step": 18472 + }, + { + "epoch": 5.670042971147944, + "grad_norm": 0.18453563749790192, + "learning_rate": 4.162451382299771e-05, + "loss": 1.7139, + "step": 18473 + }, + { + "epoch": 5.670349907918968, + "grad_norm": 0.25308313965797424, + "learning_rate": 4.161961353643633e-05, + "loss": 1.7291, + "step": 18474 + }, + { + "epoch": 5.670656844689994, + "grad_norm": 0.2528827488422394, + "learning_rate": 4.1614713332694845e-05, + "loss": 1.781, + "step": 18475 + }, + { + "epoch": 5.670963781461019, + "grad_norm": 0.24774135649204254, + "learning_rate": 4.160981321182166e-05, + "loss": 1.7808, + "step": 18476 + }, + { + "epoch": 5.671270718232044, + "grad_norm": 0.25225830078125, + "learning_rate": 4.160491317386524e-05, 
+ "loss": 1.739, + "step": 18477 + }, + { + "epoch": 5.67157765500307, + "grad_norm": 0.2095808982849121, + "learning_rate": 4.160001321887397e-05, + "loss": 1.7242, + "step": 18478 + }, + { + "epoch": 5.671884591774095, + "grad_norm": 0.23906216025352478, + "learning_rate": 4.159511334689631e-05, + "loss": 1.7071, + "step": 18479 + }, + { + "epoch": 5.672191528545119, + "grad_norm": 0.21851155161857605, + "learning_rate": 4.159021355798065e-05, + "loss": 1.7171, + "step": 18480 + }, + { + "epoch": 5.672498465316145, + "grad_norm": 0.2005140632390976, + "learning_rate": 4.158531385217544e-05, + "loss": 1.7483, + "step": 18481 + }, + { + "epoch": 5.67280540208717, + "grad_norm": 0.2230832278728485, + "learning_rate": 4.1580414229529074e-05, + "loss": 1.7386, + "step": 18482 + }, + { + "epoch": 5.673112338858195, + "grad_norm": 0.22402967512607574, + "learning_rate": 4.1575514690090014e-05, + "loss": 1.7989, + "step": 18483 + }, + { + "epoch": 5.67341927562922, + "grad_norm": 0.20350080728530884, + "learning_rate": 4.157061523390665e-05, + "loss": 1.6856, + "step": 18484 + }, + { + "epoch": 5.673726212400245, + "grad_norm": 0.2039422243833542, + "learning_rate": 4.15657158610274e-05, + "loss": 1.7262, + "step": 18485 + }, + { + "epoch": 5.6740331491712706, + "grad_norm": 0.20411522686481476, + "learning_rate": 4.156081657150069e-05, + "loss": 1.738, + "step": 18486 + }, + { + "epoch": 5.674340085942296, + "grad_norm": 0.2693086862564087, + "learning_rate": 4.155591736537493e-05, + "loss": 1.731, + "step": 18487 + }, + { + "epoch": 5.674647022713321, + "grad_norm": 0.20745019614696503, + "learning_rate": 4.1551018242698567e-05, + "loss": 1.7138, + "step": 18488 + }, + { + "epoch": 5.6749539594843466, + "grad_norm": 0.22033964097499847, + "learning_rate": 4.1546119203519964e-05, + "loss": 1.8144, + "step": 18489 + }, + { + "epoch": 5.675260896255372, + "grad_norm": 0.22859029471874237, + "learning_rate": 4.154122024788759e-05, + "loss": 1.6724, + "step": 18490 + }, + { 
+ "epoch": 5.675567833026396, + "grad_norm": 0.2226465791463852, + "learning_rate": 4.153632137584982e-05, + "loss": 1.731, + "step": 18491 + }, + { + "epoch": 5.675874769797422, + "grad_norm": 0.19657716155052185, + "learning_rate": 4.1531422587455086e-05, + "loss": 1.6937, + "step": 18492 + }, + { + "epoch": 5.676181706568447, + "grad_norm": 0.23167578876018524, + "learning_rate": 4.152652388275179e-05, + "loss": 1.7444, + "step": 18493 + }, + { + "epoch": 5.676488643339472, + "grad_norm": 0.24468563497066498, + "learning_rate": 4.1521625261788374e-05, + "loss": 1.7173, + "step": 18494 + }, + { + "epoch": 5.676795580110497, + "grad_norm": 0.27125802636146545, + "learning_rate": 4.1516726724613206e-05, + "loss": 1.7424, + "step": 18495 + }, + { + "epoch": 5.677102516881522, + "grad_norm": 0.23816901445388794, + "learning_rate": 4.151182827127473e-05, + "loss": 1.6911, + "step": 18496 + }, + { + "epoch": 5.6774094536525475, + "grad_norm": 0.26058733463287354, + "learning_rate": 4.150692990182133e-05, + "loss": 1.7142, + "step": 18497 + }, + { + "epoch": 5.677716390423573, + "grad_norm": 0.20207929611206055, + "learning_rate": 4.150203161630143e-05, + "loss": 1.7506, + "step": 18498 + }, + { + "epoch": 5.678023327194598, + "grad_norm": 0.259857714176178, + "learning_rate": 4.1497133414763435e-05, + "loss": 1.7181, + "step": 18499 + }, + { + "epoch": 5.6783302639656235, + "grad_norm": 0.2607496380805969, + "learning_rate": 4.149223529725577e-05, + "loss": 1.7829, + "step": 18500 + }, + { + "epoch": 5.678637200736648, + "grad_norm": 0.23265719413757324, + "learning_rate": 4.148733726382681e-05, + "loss": 1.7028, + "step": 18501 + }, + { + "epoch": 5.678944137507673, + "grad_norm": 0.26610276103019714, + "learning_rate": 4.1482439314524964e-05, + "loss": 1.8604, + "step": 18502 + }, + { + "epoch": 5.679251074278699, + "grad_norm": 0.24022582173347473, + "learning_rate": 4.147754144939865e-05, + "loss": 1.7142, + "step": 18503 + }, + { + "epoch": 5.679558011049724, + 
"grad_norm": 0.2849755585193634, + "learning_rate": 4.1472643668496255e-05, + "loss": 1.6956, + "step": 18504 + }, + { + "epoch": 5.679864947820749, + "grad_norm": 0.24330341815948486, + "learning_rate": 4.1467745971866216e-05, + "loss": 1.7617, + "step": 18505 + }, + { + "epoch": 5.680171884591774, + "grad_norm": 0.21072770655155182, + "learning_rate": 4.146284835955689e-05, + "loss": 1.6999, + "step": 18506 + }, + { + "epoch": 5.680478821362799, + "grad_norm": 0.1971336454153061, + "learning_rate": 4.145795083161673e-05, + "loss": 1.6756, + "step": 18507 + }, + { + "epoch": 5.680785758133824, + "grad_norm": 0.18576614558696747, + "learning_rate": 4.1453053388094073e-05, + "loss": 1.6885, + "step": 18508 + }, + { + "epoch": 5.68109269490485, + "grad_norm": 0.21335965394973755, + "learning_rate": 4.144815602903737e-05, + "loss": 1.7278, + "step": 18509 + }, + { + "epoch": 5.681399631675875, + "grad_norm": 0.21756233274936676, + "learning_rate": 4.1443258754494986e-05, + "loss": 1.7549, + "step": 18510 + }, + { + "epoch": 5.6817065684469, + "grad_norm": 0.2214142084121704, + "learning_rate": 4.143836156451536e-05, + "loss": 1.6654, + "step": 18511 + }, + { + "epoch": 5.682013505217925, + "grad_norm": 0.2230863869190216, + "learning_rate": 4.143346445914684e-05, + "loss": 1.7286, + "step": 18512 + }, + { + "epoch": 5.68232044198895, + "grad_norm": 0.2283746749162674, + "learning_rate": 4.142856743843787e-05, + "loss": 1.7652, + "step": 18513 + }, + { + "epoch": 5.6826273787599755, + "grad_norm": 0.20059749484062195, + "learning_rate": 4.142367050243679e-05, + "loss": 1.6854, + "step": 18514 + }, + { + "epoch": 5.682934315531001, + "grad_norm": 0.17887794971466064, + "learning_rate": 4.141877365119204e-05, + "loss": 1.6975, + "step": 18515 + }, + { + "epoch": 5.683241252302026, + "grad_norm": 0.21266087889671326, + "learning_rate": 4.141387688475199e-05, + "loss": 1.7361, + "step": 18516 + }, + { + "epoch": 5.683548189073051, + "grad_norm": 0.20075422525405884, + 
"learning_rate": 4.140898020316506e-05, + "loss": 1.7496, + "step": 18517 + }, + { + "epoch": 5.683855125844076, + "grad_norm": 0.21430443227291107, + "learning_rate": 4.140408360647963e-05, + "loss": 1.7481, + "step": 18518 + }, + { + "epoch": 5.684162062615101, + "grad_norm": 0.1951984018087387, + "learning_rate": 4.139918709474405e-05, + "loss": 1.713, + "step": 18519 + }, + { + "epoch": 5.684468999386127, + "grad_norm": 0.21636274456977844, + "learning_rate": 4.1394290668006764e-05, + "loss": 1.8169, + "step": 18520 + }, + { + "epoch": 5.684775936157152, + "grad_norm": 0.21003715693950653, + "learning_rate": 4.138939432631613e-05, + "loss": 1.7453, + "step": 18521 + }, + { + "epoch": 5.685082872928177, + "grad_norm": 0.23559699952602386, + "learning_rate": 4.138449806972057e-05, + "loss": 1.7534, + "step": 18522 + }, + { + "epoch": 5.685389809699202, + "grad_norm": 0.23322029411792755, + "learning_rate": 4.137960189826843e-05, + "loss": 1.7535, + "step": 18523 + }, + { + "epoch": 5.685696746470227, + "grad_norm": 0.1998462826013565, + "learning_rate": 4.137470581200813e-05, + "loss": 1.7025, + "step": 18524 + }, + { + "epoch": 5.686003683241252, + "grad_norm": 0.22321350872516632, + "learning_rate": 4.1369809810988025e-05, + "loss": 1.7666, + "step": 18525 + }, + { + "epoch": 5.686310620012278, + "grad_norm": 0.20851604640483856, + "learning_rate": 4.136491389525653e-05, + "loss": 1.6958, + "step": 18526 + }, + { + "epoch": 5.686617556783302, + "grad_norm": 0.21494868397712708, + "learning_rate": 4.136001806486201e-05, + "loss": 1.7703, + "step": 18527 + }, + { + "epoch": 5.6869244935543275, + "grad_norm": 0.19872798025608063, + "learning_rate": 4.135512231985287e-05, + "loss": 1.7451, + "step": 18528 + }, + { + "epoch": 5.687231430325353, + "grad_norm": 0.2424371987581253, + "learning_rate": 4.1350226660277456e-05, + "loss": 1.8153, + "step": 18529 + }, + { + "epoch": 5.687538367096378, + "grad_norm": 0.20388297736644745, + "learning_rate": 
4.1345331086184196e-05, + "loss": 1.6882, + "step": 18530 + }, + { + "epoch": 5.6878453038674035, + "grad_norm": 0.22662605345249176, + "learning_rate": 4.134043559762143e-05, + "loss": 1.7532, + "step": 18531 + }, + { + "epoch": 5.688152240638429, + "grad_norm": 0.2281452864408493, + "learning_rate": 4.133554019463756e-05, + "loss": 1.769, + "step": 18532 + }, + { + "epoch": 5.688459177409453, + "grad_norm": 0.2303505390882492, + "learning_rate": 4.1330644877280955e-05, + "loss": 1.7176, + "step": 18533 + }, + { + "epoch": 5.688766114180479, + "grad_norm": 0.24411743879318237, + "learning_rate": 4.132574964560001e-05, + "loss": 1.7557, + "step": 18534 + }, + { + "epoch": 5.689073050951504, + "grad_norm": 0.2674088776111603, + "learning_rate": 4.13208544996431e-05, + "loss": 1.6997, + "step": 18535 + }, + { + "epoch": 5.689379987722529, + "grad_norm": 0.22232958674430847, + "learning_rate": 4.1315959439458565e-05, + "loss": 1.7731, + "step": 18536 + }, + { + "epoch": 5.689686924493555, + "grad_norm": 0.23894453048706055, + "learning_rate": 4.131106446509483e-05, + "loss": 1.7454, + "step": 18537 + }, + { + "epoch": 5.689993861264579, + "grad_norm": 0.19710026681423187, + "learning_rate": 4.1306169576600226e-05, + "loss": 1.6872, + "step": 18538 + }, + { + "epoch": 5.690300798035604, + "grad_norm": 0.1879546344280243, + "learning_rate": 4.130127477402318e-05, + "loss": 1.6929, + "step": 18539 + }, + { + "epoch": 5.69060773480663, + "grad_norm": 0.1964653730392456, + "learning_rate": 4.129638005741201e-05, + "loss": 1.7778, + "step": 18540 + }, + { + "epoch": 5.690914671577655, + "grad_norm": 0.20161493122577667, + "learning_rate": 4.129148542681513e-05, + "loss": 1.7388, + "step": 18541 + }, + { + "epoch": 5.69122160834868, + "grad_norm": 0.26742830872535706, + "learning_rate": 4.1286590882280886e-05, + "loss": 1.7472, + "step": 18542 + }, + { + "epoch": 5.691528545119706, + "grad_norm": 0.2613312900066376, + "learning_rate": 4.128169642385766e-05, + "loss": 1.7656, 
+ "step": 18543 + }, + { + "epoch": 5.69183548189073, + "grad_norm": 0.17979474365711212, + "learning_rate": 4.127680205159381e-05, + "loss": 1.6992, + "step": 18544 + }, + { + "epoch": 5.6921424186617555, + "grad_norm": 0.23575037717819214, + "learning_rate": 4.1271907765537745e-05, + "loss": 1.7399, + "step": 18545 + }, + { + "epoch": 5.692449355432781, + "grad_norm": 0.19461458921432495, + "learning_rate": 4.126701356573777e-05, + "loss": 1.709, + "step": 18546 + }, + { + "epoch": 5.692756292203806, + "grad_norm": 0.19715365767478943, + "learning_rate": 4.1262119452242306e-05, + "loss": 1.7634, + "step": 18547 + }, + { + "epoch": 5.6930632289748315, + "grad_norm": 0.21454904973506927, + "learning_rate": 4.125722542509969e-05, + "loss": 1.7663, + "step": 18548 + }, + { + "epoch": 5.693370165745856, + "grad_norm": 0.19884896278381348, + "learning_rate": 4.12523314843583e-05, + "loss": 1.7618, + "step": 18549 + }, + { + "epoch": 5.693677102516881, + "grad_norm": 0.2080020159482956, + "learning_rate": 4.124743763006648e-05, + "loss": 1.7379, + "step": 18550 + }, + { + "epoch": 5.693984039287907, + "grad_norm": 0.18780875205993652, + "learning_rate": 4.124254386227264e-05, + "loss": 1.7036, + "step": 18551 + }, + { + "epoch": 5.694290976058932, + "grad_norm": 0.2114439308643341, + "learning_rate": 4.123765018102512e-05, + "loss": 1.6873, + "step": 18552 + }, + { + "epoch": 5.694597912829957, + "grad_norm": 0.1712789535522461, + "learning_rate": 4.123275658637225e-05, + "loss": 1.6772, + "step": 18553 + }, + { + "epoch": 5.694904849600983, + "grad_norm": 0.2435859888792038, + "learning_rate": 4.122786307836243e-05, + "loss": 1.7946, + "step": 18554 + }, + { + "epoch": 5.695211786372007, + "grad_norm": 0.20587889850139618, + "learning_rate": 4.122296965704399e-05, + "loss": 1.7459, + "step": 18555 + }, + { + "epoch": 5.695518723143032, + "grad_norm": 0.2183443009853363, + "learning_rate": 4.121807632246534e-05, + "loss": 1.7036, + "step": 18556 + }, + { + "epoch": 
5.695825659914058, + "grad_norm": 0.19276869297027588, + "learning_rate": 4.121318307467478e-05, + "loss": 1.7371, + "step": 18557 + }, + { + "epoch": 5.696132596685083, + "grad_norm": 0.19815512001514435, + "learning_rate": 4.120828991372072e-05, + "loss": 1.7038, + "step": 18558 + }, + { + "epoch": 5.696439533456108, + "grad_norm": 0.18509675562381744, + "learning_rate": 4.120339683965146e-05, + "loss": 1.6936, + "step": 18559 + }, + { + "epoch": 5.696746470227133, + "grad_norm": 0.2296193689107895, + "learning_rate": 4.1198503852515416e-05, + "loss": 1.7626, + "step": 18560 + }, + { + "epoch": 5.697053406998158, + "grad_norm": 0.2064799964427948, + "learning_rate": 4.11936109523609e-05, + "loss": 1.7387, + "step": 18561 + }, + { + "epoch": 5.6973603437691835, + "grad_norm": 0.20171360671520233, + "learning_rate": 4.1188718139236296e-05, + "loss": 1.7372, + "step": 18562 + }, + { + "epoch": 5.697667280540209, + "grad_norm": 0.19421936571598053, + "learning_rate": 4.118382541318993e-05, + "loss": 1.7187, + "step": 18563 + }, + { + "epoch": 5.697974217311234, + "grad_norm": 0.22517532110214233, + "learning_rate": 4.117893277427018e-05, + "loss": 1.7503, + "step": 18564 + }, + { + "epoch": 5.6982811540822595, + "grad_norm": 0.2293393909931183, + "learning_rate": 4.1174040222525366e-05, + "loss": 1.7174, + "step": 18565 + }, + { + "epoch": 5.698588090853284, + "grad_norm": 0.24003073573112488, + "learning_rate": 4.1169147758003876e-05, + "loss": 1.7829, + "step": 18566 + }, + { + "epoch": 5.698895027624309, + "grad_norm": 0.21476133167743683, + "learning_rate": 4.1164255380754034e-05, + "loss": 1.7906, + "step": 18567 + }, + { + "epoch": 5.699201964395335, + "grad_norm": 0.21347576379776, + "learning_rate": 4.115936309082422e-05, + "loss": 1.6986, + "step": 18568 + }, + { + "epoch": 5.69950890116636, + "grad_norm": 0.22650402784347534, + "learning_rate": 4.115447088826276e-05, + "loss": 1.7949, + "step": 18569 + }, + { + "epoch": 5.699815837937384, + "grad_norm": 
0.25815197825431824, + "learning_rate": 4.114957877311799e-05, + "loss": 1.7499, + "step": 18570 + }, + { + "epoch": 5.70012277470841, + "grad_norm": 0.22644442319869995, + "learning_rate": 4.1144686745438265e-05, + "loss": 1.7689, + "step": 18571 + }, + { + "epoch": 5.700429711479435, + "grad_norm": 0.241188645362854, + "learning_rate": 4.113979480527194e-05, + "loss": 1.7341, + "step": 18572 + }, + { + "epoch": 5.7007366482504604, + "grad_norm": 0.20984862744808197, + "learning_rate": 4.1134902952667365e-05, + "loss": 1.7091, + "step": 18573 + }, + { + "epoch": 5.701043585021486, + "grad_norm": 0.25150877237319946, + "learning_rate": 4.113001118767286e-05, + "loss": 1.723, + "step": 18574 + }, + { + "epoch": 5.701350521792511, + "grad_norm": 0.21693028509616852, + "learning_rate": 4.1125119510336804e-05, + "loss": 1.7483, + "step": 18575 + }, + { + "epoch": 5.701657458563536, + "grad_norm": 0.2620212733745575, + "learning_rate": 4.11202279207075e-05, + "loss": 1.8159, + "step": 18576 + }, + { + "epoch": 5.701964395334561, + "grad_norm": 0.18722239136695862, + "learning_rate": 4.111533641883332e-05, + "loss": 1.7197, + "step": 18577 + }, + { + "epoch": 5.702271332105586, + "grad_norm": 0.21321091055870056, + "learning_rate": 4.111044500476258e-05, + "loss": 1.7408, + "step": 18578 + }, + { + "epoch": 5.702578268876612, + "grad_norm": 0.24459265172481537, + "learning_rate": 4.110555367854365e-05, + "loss": 1.8304, + "step": 18579 + }, + { + "epoch": 5.702885205647637, + "grad_norm": 0.24987100064754486, + "learning_rate": 4.110066244022483e-05, + "loss": 1.7051, + "step": 18580 + }, + { + "epoch": 5.703192142418661, + "grad_norm": 0.19059090316295624, + "learning_rate": 4.1095771289854506e-05, + "loss": 1.7489, + "step": 18581 + }, + { + "epoch": 5.703499079189687, + "grad_norm": 0.23020480573177338, + "learning_rate": 4.1090880227480966e-05, + "loss": 1.7101, + "step": 18582 + }, + { + "epoch": 5.703806015960712, + "grad_norm": 0.18733634054660797, + 
"learning_rate": 4.108598925315258e-05, + "loss": 1.7116, + "step": 18583 + }, + { + "epoch": 5.704112952731737, + "grad_norm": 0.1959095001220703, + "learning_rate": 4.108109836691766e-05, + "loss": 1.7283, + "step": 18584 + }, + { + "epoch": 5.704419889502763, + "grad_norm": 0.22685091197490692, + "learning_rate": 4.107620756882457e-05, + "loss": 1.7588, + "step": 18585 + }, + { + "epoch": 5.704726826273788, + "grad_norm": 0.1998603790998459, + "learning_rate": 4.107131685892164e-05, + "loss": 1.7071, + "step": 18586 + }, + { + "epoch": 5.7050337630448125, + "grad_norm": 0.2018733024597168, + "learning_rate": 4.106642623725717e-05, + "loss": 1.6782, + "step": 18587 + }, + { + "epoch": 5.705340699815838, + "grad_norm": 0.21826615929603577, + "learning_rate": 4.106153570387951e-05, + "loss": 1.736, + "step": 18588 + }, + { + "epoch": 5.705647636586863, + "grad_norm": 0.20197603106498718, + "learning_rate": 4.105664525883699e-05, + "loss": 1.6921, + "step": 18589 + }, + { + "epoch": 5.7059545733578885, + "grad_norm": 0.20943905413150787, + "learning_rate": 4.105175490217796e-05, + "loss": 1.665, + "step": 18590 + }, + { + "epoch": 5.706261510128914, + "grad_norm": 0.202060267329216, + "learning_rate": 4.104686463395071e-05, + "loss": 1.714, + "step": 18591 + }, + { + "epoch": 5.706568446899938, + "grad_norm": 0.220698744058609, + "learning_rate": 4.1041974454203623e-05, + "loss": 1.8076, + "step": 18592 + }, + { + "epoch": 5.706875383670964, + "grad_norm": 0.21536946296691895, + "learning_rate": 4.103708436298497e-05, + "loss": 1.6801, + "step": 18593 + }, + { + "epoch": 5.707182320441989, + "grad_norm": 0.21442468464374542, + "learning_rate": 4.103219436034311e-05, + "loss": 1.6921, + "step": 18594 + }, + { + "epoch": 5.707489257213014, + "grad_norm": 0.2047559767961502, + "learning_rate": 4.1027304446326356e-05, + "loss": 1.7861, + "step": 18595 + }, + { + "epoch": 5.70779619398404, + "grad_norm": 0.20304669439792633, + "learning_rate": 4.102241462098305e-05, + 
"loss": 1.7751, + "step": 18596 + }, + { + "epoch": 5.708103130755065, + "grad_norm": 0.18702620267868042, + "learning_rate": 4.101752488436149e-05, + "loss": 1.6951, + "step": 18597 + }, + { + "epoch": 5.708410067526089, + "grad_norm": 0.1821923404932022, + "learning_rate": 4.1012635236510034e-05, + "loss": 1.711, + "step": 18598 + }, + { + "epoch": 5.708717004297115, + "grad_norm": 0.19422096014022827, + "learning_rate": 4.100774567747696e-05, + "loss": 1.7202, + "step": 18599 + }, + { + "epoch": 5.70902394106814, + "grad_norm": 0.20800530910491943, + "learning_rate": 4.100285620731063e-05, + "loss": 1.7403, + "step": 18600 + }, + { + "epoch": 5.709330877839165, + "grad_norm": 0.221746027469635, + "learning_rate": 4.099796682605934e-05, + "loss": 1.7769, + "step": 18601 + }, + { + "epoch": 5.70963781461019, + "grad_norm": 0.19284313917160034, + "learning_rate": 4.099307753377143e-05, + "loss": 1.692, + "step": 18602 + }, + { + "epoch": 5.709944751381215, + "grad_norm": 0.17635129392147064, + "learning_rate": 4.0988188330495216e-05, + "loss": 1.7212, + "step": 18603 + }, + { + "epoch": 5.7102516881522405, + "grad_norm": 0.17728061974048615, + "learning_rate": 4.098329921627898e-05, + "loss": 1.7217, + "step": 18604 + }, + { + "epoch": 5.710558624923266, + "grad_norm": 0.19998152554035187, + "learning_rate": 4.097841019117108e-05, + "loss": 1.7583, + "step": 18605 + }, + { + "epoch": 5.710865561694291, + "grad_norm": 0.18840095400810242, + "learning_rate": 4.09735212552198e-05, + "loss": 1.7353, + "step": 18606 + }, + { + "epoch": 5.7111724984653165, + "grad_norm": 0.2528367042541504, + "learning_rate": 4.09686324084735e-05, + "loss": 1.7576, + "step": 18607 + }, + { + "epoch": 5.711479435236341, + "grad_norm": 0.27240338921546936, + "learning_rate": 4.096374365098045e-05, + "loss": 1.7303, + "step": 18608 + }, + { + "epoch": 5.711786372007366, + "grad_norm": 0.20187151432037354, + "learning_rate": 4.0958854982789e-05, + "loss": 1.7599, + "step": 18609 + }, + { + 
"epoch": 5.712093308778392, + "grad_norm": 0.24890528619289398, + "learning_rate": 4.095396640394742e-05, + "loss": 1.7737, + "step": 18610 + }, + { + "epoch": 5.712400245549417, + "grad_norm": 0.21524454653263092, + "learning_rate": 4.094907791450406e-05, + "loss": 1.7704, + "step": 18611 + }, + { + "epoch": 5.712707182320442, + "grad_norm": 0.20070379972457886, + "learning_rate": 4.094418951450721e-05, + "loss": 1.7358, + "step": 18612 + }, + { + "epoch": 5.713014119091467, + "grad_norm": 0.2252196967601776, + "learning_rate": 4.09393012040052e-05, + "loss": 1.7262, + "step": 18613 + }, + { + "epoch": 5.713321055862492, + "grad_norm": 0.19511987268924713, + "learning_rate": 4.093441298304631e-05, + "loss": 1.7146, + "step": 18614 + }, + { + "epoch": 5.713627992633517, + "grad_norm": 0.2047072798013687, + "learning_rate": 4.092952485167888e-05, + "loss": 1.7864, + "step": 18615 + }, + { + "epoch": 5.713934929404543, + "grad_norm": 0.21794871985912323, + "learning_rate": 4.092463680995119e-05, + "loss": 1.7759, + "step": 18616 + }, + { + "epoch": 5.714241866175568, + "grad_norm": 0.23863841593265533, + "learning_rate": 4.0919748857911566e-05, + "loss": 1.7207, + "step": 18617 + }, + { + "epoch": 5.714548802946593, + "grad_norm": 0.19706958532333374, + "learning_rate": 4.09148609956083e-05, + "loss": 1.7247, + "step": 18618 + }, + { + "epoch": 5.714855739717618, + "grad_norm": 0.23663771152496338, + "learning_rate": 4.090997322308971e-05, + "loss": 1.7929, + "step": 18619 + }, + { + "epoch": 5.715162676488643, + "grad_norm": 0.23079079389572144, + "learning_rate": 4.09050855404041e-05, + "loss": 1.763, + "step": 18620 + }, + { + "epoch": 5.7154696132596685, + "grad_norm": 0.23883379995822906, + "learning_rate": 4.0900197947599736e-05, + "loss": 1.7995, + "step": 18621 + }, + { + "epoch": 5.715776550030694, + "grad_norm": 0.2125123143196106, + "learning_rate": 4.0895310444724974e-05, + "loss": 1.8045, + "step": 18622 + }, + { + "epoch": 5.716083486801719, + 
"grad_norm": 0.21062424778938293, + "learning_rate": 4.0890423031828076e-05, + "loss": 1.7348, + "step": 18623 + }, + { + "epoch": 5.716390423572744, + "grad_norm": 0.24079614877700806, + "learning_rate": 4.088553570895737e-05, + "loss": 1.7462, + "step": 18624 + }, + { + "epoch": 5.716697360343769, + "grad_norm": 0.2120666354894638, + "learning_rate": 4.088064847616113e-05, + "loss": 1.7235, + "step": 18625 + }, + { + "epoch": 5.717004297114794, + "grad_norm": 0.19663050770759583, + "learning_rate": 4.0875761333487685e-05, + "loss": 1.6743, + "step": 18626 + }, + { + "epoch": 5.71731123388582, + "grad_norm": 0.24010685086250305, + "learning_rate": 4.0870874280985295e-05, + "loss": 1.6742, + "step": 18627 + }, + { + "epoch": 5.717618170656845, + "grad_norm": 0.22140294313430786, + "learning_rate": 4.086598731870228e-05, + "loss": 1.7601, + "step": 18628 + }, + { + "epoch": 5.71792510742787, + "grad_norm": 0.2876693308353424, + "learning_rate": 4.086110044668694e-05, + "loss": 1.7601, + "step": 18629 + }, + { + "epoch": 5.718232044198895, + "grad_norm": 0.3103853464126587, + "learning_rate": 4.085621366498756e-05, + "loss": 1.6824, + "step": 18630 + }, + { + "epoch": 5.71853898096992, + "grad_norm": 0.18194396793842316, + "learning_rate": 4.0851326973652424e-05, + "loss": 1.6976, + "step": 18631 + }, + { + "epoch": 5.718845917740945, + "grad_norm": 0.28400903940200806, + "learning_rate": 4.0846440372729854e-05, + "loss": 1.7352, + "step": 18632 + }, + { + "epoch": 5.719152854511971, + "grad_norm": 0.23753583431243896, + "learning_rate": 4.084155386226811e-05, + "loss": 1.7418, + "step": 18633 + }, + { + "epoch": 5.719459791282996, + "grad_norm": 0.215620756149292, + "learning_rate": 4.0836667442315514e-05, + "loss": 1.7602, + "step": 18634 + }, + { + "epoch": 5.7197667280540205, + "grad_norm": 0.21057941019535065, + "learning_rate": 4.083178111292034e-05, + "loss": 1.6818, + "step": 18635 + }, + { + "epoch": 5.720073664825046, + "grad_norm": 0.2169445902109146, + 
"learning_rate": 4.0826894874130863e-05, + "loss": 1.7942, + "step": 18636 + }, + { + "epoch": 5.720380601596071, + "grad_norm": 0.2779453992843628, + "learning_rate": 4.082200872599541e-05, + "loss": 1.7432, + "step": 18637 + }, + { + "epoch": 5.7206875383670965, + "grad_norm": 0.22556698322296143, + "learning_rate": 4.0817122668562224e-05, + "loss": 1.7748, + "step": 18638 + }, + { + "epoch": 5.720994475138122, + "grad_norm": 0.2570365071296692, + "learning_rate": 4.081223670187962e-05, + "loss": 1.7314, + "step": 18639 + }, + { + "epoch": 5.721301411909147, + "grad_norm": 0.266176700592041, + "learning_rate": 4.080735082599588e-05, + "loss": 1.689, + "step": 18640 + }, + { + "epoch": 5.721608348680172, + "grad_norm": 0.20190037786960602, + "learning_rate": 4.080246504095929e-05, + "loss": 1.7467, + "step": 18641 + }, + { + "epoch": 5.721915285451197, + "grad_norm": 0.2498215138912201, + "learning_rate": 4.079757934681813e-05, + "loss": 1.7063, + "step": 18642 + }, + { + "epoch": 5.722222222222222, + "grad_norm": 0.25594204664230347, + "learning_rate": 4.0792693743620695e-05, + "loss": 1.7096, + "step": 18643 + }, + { + "epoch": 5.722529158993248, + "grad_norm": 0.22674626111984253, + "learning_rate": 4.0787808231415233e-05, + "loss": 1.715, + "step": 18644 + }, + { + "epoch": 5.722836095764272, + "grad_norm": 0.267140656709671, + "learning_rate": 4.078292281025007e-05, + "loss": 1.7747, + "step": 18645 + }, + { + "epoch": 5.723143032535297, + "grad_norm": 0.21161147952079773, + "learning_rate": 4.077803748017345e-05, + "loss": 1.7312, + "step": 18646 + }, + { + "epoch": 5.723449969306323, + "grad_norm": 0.2580260634422302, + "learning_rate": 4.077315224123368e-05, + "loss": 1.7246, + "step": 18647 + }, + { + "epoch": 5.723756906077348, + "grad_norm": 0.23766927421092987, + "learning_rate": 4.076826709347902e-05, + "loss": 1.7147, + "step": 18648 + }, + { + "epoch": 5.724063842848373, + "grad_norm": 0.22764286398887634, + "learning_rate": 4.076338203695776e-05, + 
"loss": 1.7034, + "step": 18649 + }, + { + "epoch": 5.724370779619399, + "grad_norm": 0.28205159306526184, + "learning_rate": 4.075849707171817e-05, + "loss": 1.7472, + "step": 18650 + }, + { + "epoch": 5.724677716390423, + "grad_norm": 0.2091183066368103, + "learning_rate": 4.075361219780854e-05, + "loss": 1.7693, + "step": 18651 + }, + { + "epoch": 5.7249846531614486, + "grad_norm": 0.29513829946517944, + "learning_rate": 4.074872741527713e-05, + "loss": 1.7286, + "step": 18652 + }, + { + "epoch": 5.725291589932474, + "grad_norm": 0.226357102394104, + "learning_rate": 4.07438427241722e-05, + "loss": 1.7658, + "step": 18653 + }, + { + "epoch": 5.725598526703499, + "grad_norm": 0.23732580244541168, + "learning_rate": 4.073895812454207e-05, + "loss": 1.7591, + "step": 18654 + }, + { + "epoch": 5.725905463474525, + "grad_norm": 0.2835488021373749, + "learning_rate": 4.0734073616434956e-05, + "loss": 1.757, + "step": 18655 + }, + { + "epoch": 5.726212400245549, + "grad_norm": 0.1986306756734848, + "learning_rate": 4.0729189199899186e-05, + "loss": 1.714, + "step": 18656 + }, + { + "epoch": 5.726519337016574, + "grad_norm": 0.25071820616722107, + "learning_rate": 4.072430487498298e-05, + "loss": 1.7334, + "step": 18657 + }, + { + "epoch": 5.7268262737876, + "grad_norm": 0.19989889860153198, + "learning_rate": 4.0719420641734634e-05, + "loss": 1.7472, + "step": 18658 + }, + { + "epoch": 5.727133210558625, + "grad_norm": 0.30006101727485657, + "learning_rate": 4.071453650020241e-05, + "loss": 1.7846, + "step": 18659 + }, + { + "epoch": 5.72744014732965, + "grad_norm": 0.19856922328472137, + "learning_rate": 4.070965245043459e-05, + "loss": 1.6965, + "step": 18660 + }, + { + "epoch": 5.727747084100676, + "grad_norm": 0.20139823853969574, + "learning_rate": 4.070476849247941e-05, + "loss": 1.7265, + "step": 18661 + }, + { + "epoch": 5.7280540208717, + "grad_norm": 0.21507953107357025, + "learning_rate": 4.0699884626385184e-05, + "loss": 1.762, + "step": 18662 + }, + { + 
"epoch": 5.7283609576427255, + "grad_norm": 0.1885843127965927, + "learning_rate": 4.069500085220013e-05, + "loss": 1.6721, + "step": 18663 + }, + { + "epoch": 5.728667894413751, + "grad_norm": 0.2076897919178009, + "learning_rate": 4.069011716997253e-05, + "loss": 1.7399, + "step": 18664 + }, + { + "epoch": 5.728974831184776, + "grad_norm": 0.21482045948505402, + "learning_rate": 4.068523357975065e-05, + "loss": 1.7105, + "step": 18665 + }, + { + "epoch": 5.7292817679558015, + "grad_norm": 0.20438800752162933, + "learning_rate": 4.0680350081582765e-05, + "loss": 1.7408, + "step": 18666 + }, + { + "epoch": 5.729588704726826, + "grad_norm": 0.2137845903635025, + "learning_rate": 4.0675466675517104e-05, + "loss": 1.7814, + "step": 18667 + }, + { + "epoch": 5.729895641497851, + "grad_norm": 0.23009657859802246, + "learning_rate": 4.067058336160197e-05, + "loss": 1.7311, + "step": 18668 + }, + { + "epoch": 5.730202578268877, + "grad_norm": 0.20602397620677948, + "learning_rate": 4.066570013988558e-05, + "loss": 1.741, + "step": 18669 + }, + { + "epoch": 5.730509515039902, + "grad_norm": 0.24884814023971558, + "learning_rate": 4.066081701041621e-05, + "loss": 1.7222, + "step": 18670 + }, + { + "epoch": 5.730816451810927, + "grad_norm": 0.17906342446804047, + "learning_rate": 4.065593397324214e-05, + "loss": 1.6879, + "step": 18671 + }, + { + "epoch": 5.731123388581953, + "grad_norm": 0.20345427095890045, + "learning_rate": 4.0651051028411586e-05, + "loss": 1.7713, + "step": 18672 + }, + { + "epoch": 5.731430325352977, + "grad_norm": 0.21115002036094666, + "learning_rate": 4.0646168175972846e-05, + "loss": 1.7666, + "step": 18673 + }, + { + "epoch": 5.731737262124002, + "grad_norm": 0.22189734876155853, + "learning_rate": 4.064128541597413e-05, + "loss": 1.6989, + "step": 18674 + }, + { + "epoch": 5.732044198895028, + "grad_norm": 0.24036027491092682, + "learning_rate": 4.063640274846373e-05, + "loss": 1.707, + "step": 18675 + }, + { + "epoch": 5.732351135666053, + 
"grad_norm": 0.23091022670269012, + "learning_rate": 4.063152017348988e-05, + "loss": 1.7072, + "step": 18676 + }, + { + "epoch": 5.7326580724370775, + "grad_norm": 0.3142668306827545, + "learning_rate": 4.062663769110085e-05, + "loss": 1.7641, + "step": 18677 + }, + { + "epoch": 5.732965009208103, + "grad_norm": 0.2634848356246948, + "learning_rate": 4.0621755301344875e-05, + "loss": 1.7007, + "step": 18678 + }, + { + "epoch": 5.733271945979128, + "grad_norm": 0.21296904981136322, + "learning_rate": 4.061687300427022e-05, + "loss": 1.7201, + "step": 18679 + }, + { + "epoch": 5.7335788827501535, + "grad_norm": 0.24943144619464874, + "learning_rate": 4.0611990799925104e-05, + "loss": 1.7186, + "step": 18680 + }, + { + "epoch": 5.733885819521179, + "grad_norm": 0.2574152946472168, + "learning_rate": 4.060710868835781e-05, + "loss": 1.8671, + "step": 18681 + }, + { + "epoch": 5.734192756292204, + "grad_norm": 0.26023826003074646, + "learning_rate": 4.0602226669616564e-05, + "loss": 1.7618, + "step": 18682 + }, + { + "epoch": 5.734499693063229, + "grad_norm": 0.21078336238861084, + "learning_rate": 4.0597344743749645e-05, + "loss": 1.7548, + "step": 18683 + }, + { + "epoch": 5.734806629834254, + "grad_norm": 0.2195056676864624, + "learning_rate": 4.059246291080525e-05, + "loss": 1.6843, + "step": 18684 + }, + { + "epoch": 5.735113566605279, + "grad_norm": 0.20719893276691437, + "learning_rate": 4.058758117083168e-05, + "loss": 1.692, + "step": 18685 + }, + { + "epoch": 5.735420503376305, + "grad_norm": 0.23012077808380127, + "learning_rate": 4.058269952387713e-05, + "loss": 1.7072, + "step": 18686 + }, + { + "epoch": 5.73572744014733, + "grad_norm": 0.18598411977291107, + "learning_rate": 4.057781796998986e-05, + "loss": 1.6983, + "step": 18687 + }, + { + "epoch": 5.736034376918354, + "grad_norm": 0.20211926102638245, + "learning_rate": 4.057293650921813e-05, + "loss": 1.6818, + "step": 18688 + }, + { + "epoch": 5.73634131368938, + "grad_norm": 0.1957080215215683, + 
"learning_rate": 4.056805514161015e-05, + "loss": 1.7154, + "step": 18689 + }, + { + "epoch": 5.736648250460405, + "grad_norm": 0.23581798374652863, + "learning_rate": 4.0563173867214196e-05, + "loss": 1.7724, + "step": 18690 + }, + { + "epoch": 5.73695518723143, + "grad_norm": 0.22706671059131622, + "learning_rate": 4.055829268607847e-05, + "loss": 1.7387, + "step": 18691 + }, + { + "epoch": 5.737262124002456, + "grad_norm": 0.20050427317619324, + "learning_rate": 4.055341159825124e-05, + "loss": 1.7585, + "step": 18692 + }, + { + "epoch": 5.737569060773481, + "grad_norm": 0.18666231632232666, + "learning_rate": 4.054853060378072e-05, + "loss": 1.6996, + "step": 18693 + }, + { + "epoch": 5.7378759975445055, + "grad_norm": 0.23018911480903625, + "learning_rate": 4.0543649702715186e-05, + "loss": 1.7167, + "step": 18694 + }, + { + "epoch": 5.738182934315531, + "grad_norm": 0.21207039058208466, + "learning_rate": 4.053876889510282e-05, + "loss": 1.7539, + "step": 18695 + }, + { + "epoch": 5.738489871086556, + "grad_norm": 0.22042523324489594, + "learning_rate": 4.0533888180991915e-05, + "loss": 1.8145, + "step": 18696 + }, + { + "epoch": 5.7387968078575815, + "grad_norm": 0.20705139636993408, + "learning_rate": 4.0529007560430646e-05, + "loss": 1.7612, + "step": 18697 + }, + { + "epoch": 5.739103744628607, + "grad_norm": 0.20673857629299164, + "learning_rate": 4.052412703346729e-05, + "loss": 1.7338, + "step": 18698 + }, + { + "epoch": 5.739410681399631, + "grad_norm": 0.20742641389369965, + "learning_rate": 4.051924660015005e-05, + "loss": 1.7497, + "step": 18699 + }, + { + "epoch": 5.739717618170657, + "grad_norm": 0.22352617979049683, + "learning_rate": 4.05143662605272e-05, + "loss": 1.7568, + "step": 18700 + }, + { + "epoch": 5.740024554941682, + "grad_norm": 0.20306691527366638, + "learning_rate": 4.050948601464692e-05, + "loss": 1.7416, + "step": 18701 + }, + { + "epoch": 5.740331491712707, + "grad_norm": 0.22972522675991058, + "learning_rate": 
4.050460586255748e-05, + "loss": 1.7907, + "step": 18702 + }, + { + "epoch": 5.740638428483733, + "grad_norm": 0.2056068629026413, + "learning_rate": 4.0499725804307084e-05, + "loss": 1.7584, + "step": 18703 + }, + { + "epoch": 5.740945365254758, + "grad_norm": 0.2150508463382721, + "learning_rate": 4.049484583994395e-05, + "loss": 1.7695, + "step": 18704 + }, + { + "epoch": 5.741252302025782, + "grad_norm": 0.20274797081947327, + "learning_rate": 4.048996596951634e-05, + "loss": 1.7398, + "step": 18705 + }, + { + "epoch": 5.741559238796808, + "grad_norm": 0.20521290600299835, + "learning_rate": 4.0485086193072444e-05, + "loss": 1.7529, + "step": 18706 + }, + { + "epoch": 5.741866175567833, + "grad_norm": 0.22344307601451874, + "learning_rate": 4.0480206510660527e-05, + "loss": 1.6729, + "step": 18707 + }, + { + "epoch": 5.742173112338858, + "grad_norm": 0.20007841289043427, + "learning_rate": 4.047532692232876e-05, + "loss": 1.7004, + "step": 18708 + }, + { + "epoch": 5.742480049109884, + "grad_norm": 0.2455853819847107, + "learning_rate": 4.047044742812541e-05, + "loss": 1.7324, + "step": 18709 + }, + { + "epoch": 5.742786985880908, + "grad_norm": 0.29901546239852905, + "learning_rate": 4.046556802809867e-05, + "loss": 1.7138, + "step": 18710 + }, + { + "epoch": 5.7430939226519335, + "grad_norm": 0.19636842608451843, + "learning_rate": 4.04606887222968e-05, + "loss": 1.7098, + "step": 18711 + }, + { + "epoch": 5.743400859422959, + "grad_norm": 0.24916070699691772, + "learning_rate": 4.045580951076797e-05, + "loss": 1.7073, + "step": 18712 + }, + { + "epoch": 5.743707796193984, + "grad_norm": 0.2122841477394104, + "learning_rate": 4.0450930393560453e-05, + "loss": 1.7608, + "step": 18713 + }, + { + "epoch": 5.7440147329650095, + "grad_norm": 0.25119176506996155, + "learning_rate": 4.044605137072241e-05, + "loss": 1.7528, + "step": 18714 + }, + { + "epoch": 5.744321669736035, + "grad_norm": 0.2128097116947174, + "learning_rate": 4.0441172442302104e-05, + "loss": 
1.6834, + "step": 18715 + }, + { + "epoch": 5.744628606507059, + "grad_norm": 0.1771443784236908, + "learning_rate": 4.043629360834772e-05, + "loss": 1.6699, + "step": 18716 + }, + { + "epoch": 5.744935543278085, + "grad_norm": 0.2360549122095108, + "learning_rate": 4.043141486890751e-05, + "loss": 1.7704, + "step": 18717 + }, + { + "epoch": 5.74524248004911, + "grad_norm": 0.22453519701957703, + "learning_rate": 4.0426536224029645e-05, + "loss": 1.7305, + "step": 18718 + }, + { + "epoch": 5.745549416820135, + "grad_norm": 0.2170165628194809, + "learning_rate": 4.042165767376238e-05, + "loss": 1.7859, + "step": 18719 + }, + { + "epoch": 5.74585635359116, + "grad_norm": 0.233921617269516, + "learning_rate": 4.0416779218153896e-05, + "loss": 1.7622, + "step": 18720 + }, + { + "epoch": 5.746163290362185, + "grad_norm": 0.2698482871055603, + "learning_rate": 4.041190085725242e-05, + "loss": 1.7419, + "step": 18721 + }, + { + "epoch": 5.74647022713321, + "grad_norm": 0.28437280654907227, + "learning_rate": 4.0407022591106165e-05, + "loss": 1.7242, + "step": 18722 + }, + { + "epoch": 5.746777163904236, + "grad_norm": 0.2087356448173523, + "learning_rate": 4.040214441976332e-05, + "loss": 1.747, + "step": 18723 + }, + { + "epoch": 5.747084100675261, + "grad_norm": 0.2028181403875351, + "learning_rate": 4.039726634327213e-05, + "loss": 1.7843, + "step": 18724 + }, + { + "epoch": 5.747391037446286, + "grad_norm": 0.18513897061347961, + "learning_rate": 4.039238836168076e-05, + "loss": 1.692, + "step": 18725 + }, + { + "epoch": 5.747697974217311, + "grad_norm": 0.2308989316225052, + "learning_rate": 4.038751047503745e-05, + "loss": 1.6625, + "step": 18726 + }, + { + "epoch": 5.748004910988336, + "grad_norm": 0.23922030627727509, + "learning_rate": 4.0382632683390386e-05, + "loss": 1.7407, + "step": 18727 + }, + { + "epoch": 5.7483118477593615, + "grad_norm": 0.17225340008735657, + "learning_rate": 4.0377754986787806e-05, + "loss": 1.6888, + "step": 18728 + }, + { + "epoch": 
5.748618784530387, + "grad_norm": 0.1898551732301712, + "learning_rate": 4.037287738527786e-05, + "loss": 1.6931, + "step": 18729 + }, + { + "epoch": 5.748925721301412, + "grad_norm": 0.22900012135505676, + "learning_rate": 4.036799987890881e-05, + "loss": 1.751, + "step": 18730 + }, + { + "epoch": 5.749232658072437, + "grad_norm": 0.21106193959712982, + "learning_rate": 4.0363122467728815e-05, + "loss": 1.6919, + "step": 18731 + }, + { + "epoch": 5.749539594843462, + "grad_norm": 0.19944290816783905, + "learning_rate": 4.03582451517861e-05, + "loss": 1.7232, + "step": 18732 + }, + { + "epoch": 5.749846531614487, + "grad_norm": 0.1833256036043167, + "learning_rate": 4.035336793112885e-05, + "loss": 1.7199, + "step": 18733 + }, + { + "epoch": 5.750153468385513, + "grad_norm": 0.2596902847290039, + "learning_rate": 4.0348490805805287e-05, + "loss": 1.7386, + "step": 18734 + }, + { + "epoch": 5.750460405156538, + "grad_norm": 0.23708637058734894, + "learning_rate": 4.034361377586357e-05, + "loss": 1.7697, + "step": 18735 + }, + { + "epoch": 5.750767341927563, + "grad_norm": 0.20476554334163666, + "learning_rate": 4.033873684135195e-05, + "loss": 1.7804, + "step": 18736 + }, + { + "epoch": 5.751074278698588, + "grad_norm": 0.2625868320465088, + "learning_rate": 4.033386000231858e-05, + "loss": 1.7046, + "step": 18737 + }, + { + "epoch": 5.751381215469613, + "grad_norm": 0.23011820018291473, + "learning_rate": 4.032898325881166e-05, + "loss": 1.7758, + "step": 18738 + }, + { + "epoch": 5.7516881522406385, + "grad_norm": 0.23972748219966888, + "learning_rate": 4.032410661087943e-05, + "loss": 1.7165, + "step": 18739 + }, + { + "epoch": 5.751995089011664, + "grad_norm": 0.2241208404302597, + "learning_rate": 4.031923005857001e-05, + "loss": 1.713, + "step": 18740 + }, + { + "epoch": 5.752302025782689, + "grad_norm": 0.22316952049732208, + "learning_rate": 4.0314353601931665e-05, + "loss": 1.7655, + "step": 18741 + }, + { + "epoch": 5.752608962553714, + "grad_norm": 
0.2177707403898239, + "learning_rate": 4.030947724101253e-05, + "loss": 1.7517, + "step": 18742 + }, + { + "epoch": 5.752915899324739, + "grad_norm": 0.21731823682785034, + "learning_rate": 4.030460097586083e-05, + "loss": 1.718, + "step": 18743 + }, + { + "epoch": 5.753222836095764, + "grad_norm": 0.1700165718793869, + "learning_rate": 4.0299724806524744e-05, + "loss": 1.6536, + "step": 18744 + }, + { + "epoch": 5.75352977286679, + "grad_norm": 0.21920062601566315, + "learning_rate": 4.029484873305247e-05, + "loss": 1.7298, + "step": 18745 + }, + { + "epoch": 5.753836709637815, + "grad_norm": 0.22648905217647552, + "learning_rate": 4.028997275549218e-05, + "loss": 1.7878, + "step": 18746 + }, + { + "epoch": 5.75414364640884, + "grad_norm": 0.19443005323410034, + "learning_rate": 4.028509687389208e-05, + "loss": 1.7582, + "step": 18747 + }, + { + "epoch": 5.754450583179865, + "grad_norm": 0.21973860263824463, + "learning_rate": 4.028022108830034e-05, + "loss": 1.8215, + "step": 18748 + }, + { + "epoch": 5.75475751995089, + "grad_norm": 0.2215481847524643, + "learning_rate": 4.0275345398765155e-05, + "loss": 1.7092, + "step": 18749 + }, + { + "epoch": 5.755064456721915, + "grad_norm": 0.18789733946323395, + "learning_rate": 4.0270469805334696e-05, + "loss": 1.7089, + "step": 18750 + }, + { + "epoch": 5.755371393492941, + "grad_norm": 0.2423657774925232, + "learning_rate": 4.0265594308057175e-05, + "loss": 1.7412, + "step": 18751 + }, + { + "epoch": 5.755678330263965, + "grad_norm": 0.22020475566387177, + "learning_rate": 4.026071890698074e-05, + "loss": 1.7644, + "step": 18752 + }, + { + "epoch": 5.7559852670349905, + "grad_norm": 0.31772032380104065, + "learning_rate": 4.025584360215361e-05, + "loss": 1.7326, + "step": 18753 + }, + { + "epoch": 5.756292203806016, + "grad_norm": 0.23786257207393646, + "learning_rate": 4.025096839362393e-05, + "loss": 1.7652, + "step": 18754 + }, + { + "epoch": 5.756599140577041, + "grad_norm": 0.24288083612918854, + "learning_rate": 
4.024609328143989e-05, + "loss": 1.6797, + "step": 18755 + }, + { + "epoch": 5.7569060773480665, + "grad_norm": 0.30519670248031616, + "learning_rate": 4.024121826564969e-05, + "loss": 1.7442, + "step": 18756 + }, + { + "epoch": 5.757213014119092, + "grad_norm": 0.218281090259552, + "learning_rate": 4.023634334630147e-05, + "loss": 1.7498, + "step": 18757 + }, + { + "epoch": 5.757519950890116, + "grad_norm": 0.215846985578537, + "learning_rate": 4.023146852344345e-05, + "loss": 1.7728, + "step": 18758 + }, + { + "epoch": 5.757826887661142, + "grad_norm": 0.2883944511413574, + "learning_rate": 4.022659379712376e-05, + "loss": 1.8098, + "step": 18759 + }, + { + "epoch": 5.758133824432167, + "grad_norm": 0.25141629576683044, + "learning_rate": 4.022171916739062e-05, + "loss": 1.6574, + "step": 18760 + }, + { + "epoch": 5.758440761203192, + "grad_norm": 0.22118757665157318, + "learning_rate": 4.021684463429216e-05, + "loss": 1.7542, + "step": 18761 + }, + { + "epoch": 5.758747697974218, + "grad_norm": 0.2437646985054016, + "learning_rate": 4.02119701978766e-05, + "loss": 1.7182, + "step": 18762 + }, + { + "epoch": 5.759054634745242, + "grad_norm": 0.24247203767299652, + "learning_rate": 4.020709585819206e-05, + "loss": 1.7134, + "step": 18763 + }, + { + "epoch": 5.759361571516267, + "grad_norm": 0.208528533577919, + "learning_rate": 4.020222161528677e-05, + "loss": 1.6966, + "step": 18764 + }, + { + "epoch": 5.759668508287293, + "grad_norm": 0.19645826518535614, + "learning_rate": 4.0197347469208843e-05, + "loss": 1.7261, + "step": 18765 + }, + { + "epoch": 5.759975445058318, + "grad_norm": 0.20066291093826294, + "learning_rate": 4.019247342000648e-05, + "loss": 1.7197, + "step": 18766 + }, + { + "epoch": 5.760282381829343, + "grad_norm": 0.25344669818878174, + "learning_rate": 4.0187599467727845e-05, + "loss": 1.7957, + "step": 18767 + }, + { + "epoch": 5.760589318600369, + "grad_norm": 0.1917620301246643, + "learning_rate": 4.018272561242111e-05, + "loss": 1.6868, + 
"step": 18768 + }, + { + "epoch": 5.760896255371393, + "grad_norm": 0.21996566653251648, + "learning_rate": 4.0177851854134424e-05, + "loss": 1.7128, + "step": 18769 + }, + { + "epoch": 5.7612031921424185, + "grad_norm": 0.23226283490657806, + "learning_rate": 4.017297819291598e-05, + "loss": 1.7079, + "step": 18770 + }, + { + "epoch": 5.761510128913444, + "grad_norm": 0.30606213212013245, + "learning_rate": 4.016810462881391e-05, + "loss": 1.8087, + "step": 18771 + }, + { + "epoch": 5.761817065684469, + "grad_norm": 0.2171698361635208, + "learning_rate": 4.016323116187639e-05, + "loss": 1.7377, + "step": 18772 + }, + { + "epoch": 5.7621240024554945, + "grad_norm": 0.24234412610530853, + "learning_rate": 4.01583577921516e-05, + "loss": 1.734, + "step": 18773 + }, + { + "epoch": 5.762430939226519, + "grad_norm": 0.2648961544036865, + "learning_rate": 4.015348451968767e-05, + "loss": 1.7423, + "step": 18774 + }, + { + "epoch": 5.762737875997544, + "grad_norm": 0.18316571414470673, + "learning_rate": 4.01486113445328e-05, + "loss": 1.6708, + "step": 18775 + }, + { + "epoch": 5.76304481276857, + "grad_norm": 0.241583451628685, + "learning_rate": 4.0143738266735104e-05, + "loss": 1.708, + "step": 18776 + }, + { + "epoch": 5.763351749539595, + "grad_norm": 0.2268480360507965, + "learning_rate": 4.0138865286342775e-05, + "loss": 1.7106, + "step": 18777 + }, + { + "epoch": 5.76365868631062, + "grad_norm": 0.2038748860359192, + "learning_rate": 4.0133992403403944e-05, + "loss": 1.7349, + "step": 18778 + }, + { + "epoch": 5.763965623081646, + "grad_norm": 0.24422483146190643, + "learning_rate": 4.0129119617966805e-05, + "loss": 1.659, + "step": 18779 + }, + { + "epoch": 5.76427255985267, + "grad_norm": 0.19925715029239655, + "learning_rate": 4.0124246930079476e-05, + "loss": 1.6983, + "step": 18780 + }, + { + "epoch": 5.764579496623695, + "grad_norm": 0.29671359062194824, + "learning_rate": 4.0119374339790136e-05, + "loss": 1.7188, + "step": 18781 + }, + { + "epoch": 
5.764886433394721, + "grad_norm": 0.2752140760421753, + "learning_rate": 4.011450184714692e-05, + "loss": 1.738, + "step": 18782 + }, + { + "epoch": 5.765193370165746, + "grad_norm": 0.2112676352262497, + "learning_rate": 4.0109629452198e-05, + "loss": 1.7529, + "step": 18783 + }, + { + "epoch": 5.765500306936771, + "grad_norm": 0.2091330885887146, + "learning_rate": 4.010475715499151e-05, + "loss": 1.6771, + "step": 18784 + }, + { + "epoch": 5.765807243707796, + "grad_norm": 0.26556238532066345, + "learning_rate": 4.009988495557562e-05, + "loss": 1.7721, + "step": 18785 + }, + { + "epoch": 5.766114180478821, + "grad_norm": 0.20728638768196106, + "learning_rate": 4.009501285399846e-05, + "loss": 1.6893, + "step": 18786 + }, + { + "epoch": 5.7664211172498465, + "grad_norm": 0.213730126619339, + "learning_rate": 4.00901408503082e-05, + "loss": 1.704, + "step": 18787 + }, + { + "epoch": 5.766728054020872, + "grad_norm": 0.21422363817691803, + "learning_rate": 4.0085268944552975e-05, + "loss": 1.7571, + "step": 18788 + }, + { + "epoch": 5.767034990791897, + "grad_norm": 0.20936815440654755, + "learning_rate": 4.0080397136780915e-05, + "loss": 1.7423, + "step": 18789 + }, + { + "epoch": 5.7673419275629225, + "grad_norm": 0.26223674416542053, + "learning_rate": 4.007552542704021e-05, + "loss": 1.7687, + "step": 18790 + }, + { + "epoch": 5.767648864333947, + "grad_norm": 0.3524645268917084, + "learning_rate": 4.0070653815378954e-05, + "loss": 1.7754, + "step": 18791 + }, + { + "epoch": 5.767955801104972, + "grad_norm": 0.20238324999809265, + "learning_rate": 4.006578230184534e-05, + "loss": 1.7043, + "step": 18792 + }, + { + "epoch": 5.768262737875998, + "grad_norm": 0.2739984393119812, + "learning_rate": 4.006091088648747e-05, + "loss": 1.7596, + "step": 18793 + }, + { + "epoch": 5.768569674647023, + "grad_norm": 0.29209306836128235, + "learning_rate": 4.0056039569353515e-05, + "loss": 1.6857, + "step": 18794 + }, + { + "epoch": 5.768876611418047, + "grad_norm": 
0.21838447451591492, + "learning_rate": 4.005116835049161e-05, + "loss": 1.7531, + "step": 18795 + }, + { + "epoch": 5.769183548189073, + "grad_norm": 0.21940091252326965, + "learning_rate": 4.0046297229949884e-05, + "loss": 1.7363, + "step": 18796 + }, + { + "epoch": 5.769490484960098, + "grad_norm": 0.22679758071899414, + "learning_rate": 4.004142620777647e-05, + "loss": 1.7586, + "step": 18797 + }, + { + "epoch": 5.769797421731123, + "grad_norm": 0.23782022297382355, + "learning_rate": 4.003655528401954e-05, + "loss": 1.7154, + "step": 18798 + }, + { + "epoch": 5.770104358502149, + "grad_norm": 0.20452092587947845, + "learning_rate": 4.0031684458727194e-05, + "loss": 1.7078, + "step": 18799 + }, + { + "epoch": 5.770411295273174, + "grad_norm": 0.22733618319034576, + "learning_rate": 4.0026813731947594e-05, + "loss": 1.6989, + "step": 18800 + }, + { + "epoch": 5.7707182320441985, + "grad_norm": 0.2322154939174652, + "learning_rate": 4.002194310372886e-05, + "loss": 1.7508, + "step": 18801 + }, + { + "epoch": 5.771025168815224, + "grad_norm": 0.24573352932929993, + "learning_rate": 4.001707257411914e-05, + "loss": 1.7245, + "step": 18802 + }, + { + "epoch": 5.771332105586249, + "grad_norm": 0.19692079722881317, + "learning_rate": 4.001220214316655e-05, + "loss": 1.7116, + "step": 18803 + }, + { + "epoch": 5.7716390423572745, + "grad_norm": 0.20525199174880981, + "learning_rate": 4.000733181091925e-05, + "loss": 1.7503, + "step": 18804 + }, + { + "epoch": 5.7719459791283, + "grad_norm": 0.2097626030445099, + "learning_rate": 4.0002461577425344e-05, + "loss": 1.8204, + "step": 18805 + }, + { + "epoch": 5.772252915899324, + "grad_norm": 0.23059608042240143, + "learning_rate": 3.9997591442732975e-05, + "loss": 1.7747, + "step": 18806 + }, + { + "epoch": 5.77255985267035, + "grad_norm": 0.22085745632648468, + "learning_rate": 3.9992721406890265e-05, + "loss": 1.7579, + "step": 18807 + }, + { + "epoch": 5.772866789441375, + "grad_norm": 0.21529869735240936, + 
"learning_rate": 3.9987851469945334e-05, + "loss": 1.711, + "step": 18808 + }, + { + "epoch": 5.7731737262124, + "grad_norm": 0.20563572645187378, + "learning_rate": 3.998298163194636e-05, + "loss": 1.761, + "step": 18809 + }, + { + "epoch": 5.773480662983426, + "grad_norm": 0.2081122100353241, + "learning_rate": 3.9978111892941394e-05, + "loss": 1.7112, + "step": 18810 + }, + { + "epoch": 5.773787599754451, + "grad_norm": 0.2373751550912857, + "learning_rate": 3.9973242252978635e-05, + "loss": 1.7726, + "step": 18811 + }, + { + "epoch": 5.774094536525475, + "grad_norm": 0.2742944359779358, + "learning_rate": 3.996837271210615e-05, + "loss": 1.7743, + "step": 18812 + }, + { + "epoch": 5.774401473296501, + "grad_norm": 0.20724992454051971, + "learning_rate": 3.996350327037208e-05, + "loss": 1.7052, + "step": 18813 + }, + { + "epoch": 5.774708410067526, + "grad_norm": 0.22324968874454498, + "learning_rate": 3.995863392782456e-05, + "loss": 1.7865, + "step": 18814 + }, + { + "epoch": 5.7750153468385514, + "grad_norm": 0.22314245998859406, + "learning_rate": 3.995376468451172e-05, + "loss": 1.7705, + "step": 18815 + }, + { + "epoch": 5.775322283609577, + "grad_norm": 0.20793841779232025, + "learning_rate": 3.994889554048165e-05, + "loss": 1.739, + "step": 18816 + }, + { + "epoch": 5.775629220380601, + "grad_norm": 0.20117145776748657, + "learning_rate": 3.994402649578249e-05, + "loss": 1.7256, + "step": 18817 + }, + { + "epoch": 5.775936157151627, + "grad_norm": 0.24406170845031738, + "learning_rate": 3.993915755046235e-05, + "loss": 1.8015, + "step": 18818 + }, + { + "epoch": 5.776243093922652, + "grad_norm": 0.20912545919418335, + "learning_rate": 3.993428870456935e-05, + "loss": 1.7038, + "step": 18819 + }, + { + "epoch": 5.776550030693677, + "grad_norm": 0.2587272822856903, + "learning_rate": 3.992941995815162e-05, + "loss": 1.7918, + "step": 18820 + }, + { + "epoch": 5.776856967464703, + "grad_norm": 0.2996658980846405, + "learning_rate": 3.9924551311257266e-05, + 
"loss": 1.7513, + "step": 18821 + }, + { + "epoch": 5.777163904235728, + "grad_norm": 0.24603547155857086, + "learning_rate": 3.991968276393441e-05, + "loss": 1.7329, + "step": 18822 + }, + { + "epoch": 5.777470841006752, + "grad_norm": 0.2321038693189621, + "learning_rate": 3.991481431623113e-05, + "loss": 1.7406, + "step": 18823 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.3397100269794464, + "learning_rate": 3.990994596819558e-05, + "loss": 1.8129, + "step": 18824 + }, + { + "epoch": 5.778084714548803, + "grad_norm": 0.2807735800743103, + "learning_rate": 3.990507771987584e-05, + "loss": 1.7579, + "step": 18825 + }, + { + "epoch": 5.778391651319828, + "grad_norm": 0.1952899694442749, + "learning_rate": 3.990020957132007e-05, + "loss": 1.7153, + "step": 18826 + }, + { + "epoch": 5.778698588090853, + "grad_norm": 0.28998714685440063, + "learning_rate": 3.989534152257632e-05, + "loss": 1.7844, + "step": 18827 + }, + { + "epoch": 5.779005524861878, + "grad_norm": 0.20929136872291565, + "learning_rate": 3.989047357369275e-05, + "loss": 1.7499, + "step": 18828 + }, + { + "epoch": 5.7793124616329035, + "grad_norm": 0.31144043803215027, + "learning_rate": 3.9885605724717436e-05, + "loss": 1.7745, + "step": 18829 + }, + { + "epoch": 5.779619398403929, + "grad_norm": 0.22598792612552643, + "learning_rate": 3.988073797569849e-05, + "loss": 1.7226, + "step": 18830 + }, + { + "epoch": 5.779926335174954, + "grad_norm": 0.1971752643585205, + "learning_rate": 3.987587032668402e-05, + "loss": 1.7033, + "step": 18831 + }, + { + "epoch": 5.7802332719459795, + "grad_norm": 0.221087247133255, + "learning_rate": 3.9871002777722156e-05, + "loss": 1.7281, + "step": 18832 + }, + { + "epoch": 5.780540208717004, + "grad_norm": 0.21678583323955536, + "learning_rate": 3.986613532886095e-05, + "loss": 1.7207, + "step": 18833 + }, + { + "epoch": 5.780847145488029, + "grad_norm": 0.2511122226715088, + "learning_rate": 3.9861267980148566e-05, + "loss": 1.7091, + "step": 18834 + }, + 
{ + "epoch": 5.781154082259055, + "grad_norm": 0.2883855104446411, + "learning_rate": 3.985640073163304e-05, + "loss": 1.7963, + "step": 18835 + }, + { + "epoch": 5.78146101903008, + "grad_norm": 0.21786242723464966, + "learning_rate": 3.985153358336253e-05, + "loss": 1.6883, + "step": 18836 + }, + { + "epoch": 5.781767955801105, + "grad_norm": 0.18529155850410461, + "learning_rate": 3.98466665353851e-05, + "loss": 1.7194, + "step": 18837 + }, + { + "epoch": 5.78207489257213, + "grad_norm": 0.20535743236541748, + "learning_rate": 3.984179958774888e-05, + "loss": 1.6943, + "step": 18838 + }, + { + "epoch": 5.782381829343155, + "grad_norm": 0.19377392530441284, + "learning_rate": 3.983693274050195e-05, + "loss": 1.6732, + "step": 18839 + }, + { + "epoch": 5.78268876611418, + "grad_norm": 0.22373615205287933, + "learning_rate": 3.983206599369239e-05, + "loss": 1.7668, + "step": 18840 + }, + { + "epoch": 5.782995702885206, + "grad_norm": 0.2132388800382614, + "learning_rate": 3.982719934736832e-05, + "loss": 1.7155, + "step": 18841 + }, + { + "epoch": 5.783302639656231, + "grad_norm": 0.24871744215488434, + "learning_rate": 3.982233280157782e-05, + "loss": 1.7232, + "step": 18842 + }, + { + "epoch": 5.783609576427256, + "grad_norm": 0.1861848086118698, + "learning_rate": 3.981746635636902e-05, + "loss": 1.707, + "step": 18843 + }, + { + "epoch": 5.783916513198281, + "grad_norm": 0.21882779896259308, + "learning_rate": 3.981260001178995e-05, + "loss": 1.7165, + "step": 18844 + }, + { + "epoch": 5.784223449969306, + "grad_norm": 0.22144648432731628, + "learning_rate": 3.980773376788877e-05, + "loss": 1.7799, + "step": 18845 + }, + { + "epoch": 5.7845303867403315, + "grad_norm": 0.210894376039505, + "learning_rate": 3.980286762471351e-05, + "loss": 1.7539, + "step": 18846 + }, + { + "epoch": 5.784837323511357, + "grad_norm": 0.20435640215873718, + "learning_rate": 3.9798001582312305e-05, + "loss": 1.6736, + "step": 18847 + }, + { + "epoch": 5.785144260282382, + 
"grad_norm": 0.18998762965202332, + "learning_rate": 3.979313564073322e-05, + "loss": 1.7045, + "step": 18848 + }, + { + "epoch": 5.785451197053407, + "grad_norm": 0.19869361817836761, + "learning_rate": 3.978826980002437e-05, + "loss": 1.7444, + "step": 18849 + }, + { + "epoch": 5.785758133824432, + "grad_norm": 0.2175174504518509, + "learning_rate": 3.97834040602338e-05, + "loss": 1.7565, + "step": 18850 + }, + { + "epoch": 5.786065070595457, + "grad_norm": 0.22726793587207794, + "learning_rate": 3.977853842140964e-05, + "loss": 1.713, + "step": 18851 + }, + { + "epoch": 5.786372007366483, + "grad_norm": 0.26518720388412476, + "learning_rate": 3.9773672883599934e-05, + "loss": 1.6892, + "step": 18852 + }, + { + "epoch": 5.786678944137508, + "grad_norm": 0.20721858739852905, + "learning_rate": 3.97688074468528e-05, + "loss": 1.724, + "step": 18853 + }, + { + "epoch": 5.786985880908533, + "grad_norm": 0.22739483416080475, + "learning_rate": 3.976394211121629e-05, + "loss": 1.762, + "step": 18854 + }, + { + "epoch": 5.787292817679558, + "grad_norm": 0.21918894350528717, + "learning_rate": 3.975907687673853e-05, + "loss": 1.6812, + "step": 18855 + }, + { + "epoch": 5.787599754450583, + "grad_norm": 0.20931273698806763, + "learning_rate": 3.9754211743467574e-05, + "loss": 1.6874, + "step": 18856 + }, + { + "epoch": 5.787906691221608, + "grad_norm": 0.2015041708946228, + "learning_rate": 3.974934671145148e-05, + "loss": 1.7248, + "step": 18857 + }, + { + "epoch": 5.788213627992634, + "grad_norm": 0.21632663905620575, + "learning_rate": 3.974448178073836e-05, + "loss": 1.7313, + "step": 18858 + }, + { + "epoch": 5.788520564763659, + "grad_norm": 0.18995213508605957, + "learning_rate": 3.973961695137627e-05, + "loss": 1.6761, + "step": 18859 + }, + { + "epoch": 5.7888275015346835, + "grad_norm": 0.18678395450115204, + "learning_rate": 3.973475222341333e-05, + "loss": 1.7082, + "step": 18860 + }, + { + "epoch": 5.789134438305709, + "grad_norm": 0.1889343559741974, + 
"learning_rate": 3.972988759689756e-05, + "loss": 1.7296, + "step": 18861 + }, + { + "epoch": 5.789441375076734, + "grad_norm": 0.20196790993213654, + "learning_rate": 3.9725023071877074e-05, + "loss": 1.6876, + "step": 18862 + }, + { + "epoch": 5.7897483118477595, + "grad_norm": 0.198349729180336, + "learning_rate": 3.972015864839992e-05, + "loss": 1.6826, + "step": 18863 + }, + { + "epoch": 5.790055248618785, + "grad_norm": 0.21323837339878082, + "learning_rate": 3.9715294326514185e-05, + "loss": 1.7444, + "step": 18864 + }, + { + "epoch": 5.79036218538981, + "grad_norm": 0.18581731617450714, + "learning_rate": 3.9710430106267934e-05, + "loss": 1.7731, + "step": 18865 + }, + { + "epoch": 5.790669122160835, + "grad_norm": 0.21925146877765656, + "learning_rate": 3.970556598770927e-05, + "loss": 1.7505, + "step": 18866 + }, + { + "epoch": 5.79097605893186, + "grad_norm": 0.20773115754127502, + "learning_rate": 3.970070197088621e-05, + "loss": 1.7408, + "step": 18867 + }, + { + "epoch": 5.791282995702885, + "grad_norm": 0.1805189698934555, + "learning_rate": 3.9695838055846865e-05, + "loss": 1.6871, + "step": 18868 + }, + { + "epoch": 5.791589932473911, + "grad_norm": 0.24685314297676086, + "learning_rate": 3.969097424263928e-05, + "loss": 1.7186, + "step": 18869 + }, + { + "epoch": 5.791896869244935, + "grad_norm": 0.18801769614219666, + "learning_rate": 3.9686110531311526e-05, + "loss": 1.7196, + "step": 18870 + }, + { + "epoch": 5.79220380601596, + "grad_norm": 0.22717779874801636, + "learning_rate": 3.968124692191168e-05, + "loss": 1.7309, + "step": 18871 + }, + { + "epoch": 5.792510742786986, + "grad_norm": 0.23058642446994781, + "learning_rate": 3.9676383414487806e-05, + "loss": 1.6993, + "step": 18872 + }, + { + "epoch": 5.792817679558011, + "grad_norm": 0.24307532608509064, + "learning_rate": 3.967152000908796e-05, + "loss": 1.6986, + "step": 18873 + }, + { + "epoch": 5.793124616329036, + "grad_norm": 0.3032459318637848, + "learning_rate": 
3.9666656705760195e-05, + "loss": 1.677, + "step": 18874 + }, + { + "epoch": 5.793431553100062, + "grad_norm": 0.22669538855552673, + "learning_rate": 3.966179350455259e-05, + "loss": 1.7361, + "step": 18875 + }, + { + "epoch": 5.793738489871086, + "grad_norm": 0.27729150652885437, + "learning_rate": 3.96569304055132e-05, + "loss": 1.746, + "step": 18876 + }, + { + "epoch": 5.7940454266421115, + "grad_norm": 0.3422098755836487, + "learning_rate": 3.96520674086901e-05, + "loss": 1.783, + "step": 18877 + }, + { + "epoch": 5.794352363413137, + "grad_norm": 0.2114052176475525, + "learning_rate": 3.964720451413131e-05, + "loss": 1.7127, + "step": 18878 + }, + { + "epoch": 5.794659300184162, + "grad_norm": 0.22928549349308014, + "learning_rate": 3.964234172188494e-05, + "loss": 1.6579, + "step": 18879 + }, + { + "epoch": 5.7949662369551875, + "grad_norm": 0.24813635647296906, + "learning_rate": 3.9637479031999e-05, + "loss": 1.728, + "step": 18880 + }, + { + "epoch": 5.795273173726212, + "grad_norm": 0.19779744744300842, + "learning_rate": 3.963261644452158e-05, + "loss": 1.7338, + "step": 18881 + }, + { + "epoch": 5.795580110497237, + "grad_norm": 0.2424263060092926, + "learning_rate": 3.96277539595007e-05, + "loss": 1.7762, + "step": 18882 + }, + { + "epoch": 5.795887047268263, + "grad_norm": 0.24621224403381348, + "learning_rate": 3.9622891576984456e-05, + "loss": 1.7746, + "step": 18883 + }, + { + "epoch": 5.796193984039288, + "grad_norm": 0.1973372846841812, + "learning_rate": 3.961802929702086e-05, + "loss": 1.7243, + "step": 18884 + }, + { + "epoch": 5.796500920810313, + "grad_norm": 0.22170570492744446, + "learning_rate": 3.961316711965801e-05, + "loss": 1.764, + "step": 18885 + }, + { + "epoch": 5.796807857581339, + "grad_norm": 0.22319282591342926, + "learning_rate": 3.9608305044943906e-05, + "loss": 1.6795, + "step": 18886 + }, + { + "epoch": 5.797114794352363, + "grad_norm": 0.20000022649765015, + "learning_rate": 3.9603443072926635e-05, + "loss": 1.7587, + 
"step": 18887 + }, + { + "epoch": 5.797421731123388, + "grad_norm": 0.25041815638542175, + "learning_rate": 3.959858120365424e-05, + "loss": 1.7631, + "step": 18888 + }, + { + "epoch": 5.797728667894414, + "grad_norm": 0.23383729159832, + "learning_rate": 3.959371943717474e-05, + "loss": 1.741, + "step": 18889 + }, + { + "epoch": 5.798035604665439, + "grad_norm": 0.18609663844108582, + "learning_rate": 3.958885777353623e-05, + "loss": 1.6981, + "step": 18890 + }, + { + "epoch": 5.798342541436464, + "grad_norm": 0.29523593187332153, + "learning_rate": 3.9583996212786706e-05, + "loss": 1.8018, + "step": 18891 + }, + { + "epoch": 5.798649478207489, + "grad_norm": 0.20356589555740356, + "learning_rate": 3.9579134754974244e-05, + "loss": 1.7157, + "step": 18892 + }, + { + "epoch": 5.798956414978514, + "grad_norm": 0.2901862561702728, + "learning_rate": 3.957427340014688e-05, + "loss": 1.7249, + "step": 18893 + }, + { + "epoch": 5.7992633517495396, + "grad_norm": 0.24768278002738953, + "learning_rate": 3.956941214835267e-05, + "loss": 1.6894, + "step": 18894 + }, + { + "epoch": 5.799570288520565, + "grad_norm": 0.2417999804019928, + "learning_rate": 3.956455099963962e-05, + "loss": 1.7203, + "step": 18895 + }, + { + "epoch": 5.79987722529159, + "grad_norm": 0.2889639437198639, + "learning_rate": 3.9559689954055814e-05, + "loss": 1.7531, + "step": 18896 + }, + { + "epoch": 5.800184162062616, + "grad_norm": 0.21204611659049988, + "learning_rate": 3.955482901164926e-05, + "loss": 1.7521, + "step": 18897 + }, + { + "epoch": 5.80049109883364, + "grad_norm": 0.2961438298225403, + "learning_rate": 3.954996817246801e-05, + "loss": 1.8102, + "step": 18898 + }, + { + "epoch": 5.800798035604665, + "grad_norm": 0.36562761664390564, + "learning_rate": 3.9545107436560084e-05, + "loss": 1.6722, + "step": 18899 + }, + { + "epoch": 5.801104972375691, + "grad_norm": 0.22423696517944336, + "learning_rate": 3.954024680397357e-05, + "loss": 1.7101, + "step": 18900 + }, + { + "epoch": 
5.801411909146716, + "grad_norm": 0.3122335970401764, + "learning_rate": 3.953538627475644e-05, + "loss": 1.7314, + "step": 18901 + }, + { + "epoch": 5.8017188459177405, + "grad_norm": 0.39004257321357727, + "learning_rate": 3.953052584895677e-05, + "loss": 1.762, + "step": 18902 + }, + { + "epoch": 5.802025782688766, + "grad_norm": 0.1827487200498581, + "learning_rate": 3.952566552662256e-05, + "loss": 1.6935, + "step": 18903 + }, + { + "epoch": 5.802332719459791, + "grad_norm": 0.3025164306163788, + "learning_rate": 3.952080530780188e-05, + "loss": 1.7448, + "step": 18904 + }, + { + "epoch": 5.8026396562308165, + "grad_norm": 0.2313300520181656, + "learning_rate": 3.9515945192542754e-05, + "loss": 1.7686, + "step": 18905 + }, + { + "epoch": 5.802946593001842, + "grad_norm": 0.3501042425632477, + "learning_rate": 3.9511085180893184e-05, + "loss": 1.775, + "step": 18906 + }, + { + "epoch": 5.803253529772867, + "grad_norm": 0.4111124873161316, + "learning_rate": 3.950622527290123e-05, + "loss": 1.7561, + "step": 18907 + }, + { + "epoch": 5.803560466543892, + "grad_norm": 0.20877736806869507, + "learning_rate": 3.950136546861489e-05, + "loss": 1.7356, + "step": 18908 + }, + { + "epoch": 5.803867403314917, + "grad_norm": 0.33404025435447693, + "learning_rate": 3.949650576808222e-05, + "loss": 1.7289, + "step": 18909 + }, + { + "epoch": 5.804174340085942, + "grad_norm": 0.2183927446603775, + "learning_rate": 3.9491646171351234e-05, + "loss": 1.7136, + "step": 18910 + }, + { + "epoch": 5.804481276856968, + "grad_norm": 0.27149543166160583, + "learning_rate": 3.948678667846997e-05, + "loss": 1.7516, + "step": 18911 + }, + { + "epoch": 5.804788213627993, + "grad_norm": 0.2369886338710785, + "learning_rate": 3.948192728948643e-05, + "loss": 1.6767, + "step": 18912 + }, + { + "epoch": 5.805095150399017, + "grad_norm": 0.20671069622039795, + "learning_rate": 3.947706800444867e-05, + "loss": 1.7831, + "step": 18913 + }, + { + "epoch": 5.805402087170043, + "grad_norm": 
0.23622260987758636, + "learning_rate": 3.9472208823404665e-05, + "loss": 1.7121, + "step": 18914 + }, + { + "epoch": 5.805709023941068, + "grad_norm": 0.21099595725536346, + "learning_rate": 3.946734974640247e-05, + "loss": 1.7137, + "step": 18915 + }, + { + "epoch": 5.806015960712093, + "grad_norm": 0.2205580472946167, + "learning_rate": 3.9462490773490094e-05, + "loss": 1.713, + "step": 18916 + }, + { + "epoch": 5.806322897483119, + "grad_norm": 0.20183326303958893, + "learning_rate": 3.9457631904715584e-05, + "loss": 1.7316, + "step": 18917 + }, + { + "epoch": 5.806629834254144, + "grad_norm": 0.27381497621536255, + "learning_rate": 3.9452773140126906e-05, + "loss": 1.7577, + "step": 18918 + }, + { + "epoch": 5.8069367710251685, + "grad_norm": 0.29962384700775146, + "learning_rate": 3.944791447977214e-05, + "loss": 1.7579, + "step": 18919 + }, + { + "epoch": 5.807243707796194, + "grad_norm": 0.22385326027870178, + "learning_rate": 3.944305592369923e-05, + "loss": 1.7795, + "step": 18920 + }, + { + "epoch": 5.807550644567219, + "grad_norm": 0.2954902648925781, + "learning_rate": 3.943819747195625e-05, + "loss": 1.6655, + "step": 18921 + }, + { + "epoch": 5.8078575813382445, + "grad_norm": 0.18947024643421173, + "learning_rate": 3.94333391245912e-05, + "loss": 1.6803, + "step": 18922 + }, + { + "epoch": 5.80816451810927, + "grad_norm": 0.26797959208488464, + "learning_rate": 3.942848088165206e-05, + "loss": 1.7671, + "step": 18923 + }, + { + "epoch": 5.808471454880294, + "grad_norm": 0.23453201353549957, + "learning_rate": 3.94236227431869e-05, + "loss": 1.7472, + "step": 18924 + }, + { + "epoch": 5.80877839165132, + "grad_norm": 0.24471673369407654, + "learning_rate": 3.941876470924367e-05, + "loss": 1.7482, + "step": 18925 + }, + { + "epoch": 5.809085328422345, + "grad_norm": 0.22249098122119904, + "learning_rate": 3.9413906779870426e-05, + "loss": 1.6794, + "step": 18926 + }, + { + "epoch": 5.80939226519337, + "grad_norm": 0.1985001564025879, + 
"learning_rate": 3.9409048955115144e-05, + "loss": 1.7278, + "step": 18927 + }, + { + "epoch": 5.809699201964396, + "grad_norm": 0.22482000291347504, + "learning_rate": 3.940419123502587e-05, + "loss": 1.7658, + "step": 18928 + }, + { + "epoch": 5.810006138735421, + "grad_norm": 0.18513578176498413, + "learning_rate": 3.939933361965057e-05, + "loss": 1.7154, + "step": 18929 + }, + { + "epoch": 5.810313075506445, + "grad_norm": 0.1984710991382599, + "learning_rate": 3.939447610903729e-05, + "loss": 1.7324, + "step": 18930 + }, + { + "epoch": 5.810620012277471, + "grad_norm": 0.26089081168174744, + "learning_rate": 3.938961870323399e-05, + "loss": 1.774, + "step": 18931 + }, + { + "epoch": 5.810926949048496, + "grad_norm": 0.2059585452079773, + "learning_rate": 3.9384761402288706e-05, + "loss": 1.7059, + "step": 18932 + }, + { + "epoch": 5.811233885819521, + "grad_norm": 0.1887979656457901, + "learning_rate": 3.937990420624942e-05, + "loss": 1.6829, + "step": 18933 + }, + { + "epoch": 5.811540822590547, + "grad_norm": 0.2589145600795746, + "learning_rate": 3.937504711516417e-05, + "loss": 1.7301, + "step": 18934 + }, + { + "epoch": 5.811847759361571, + "grad_norm": 0.209516704082489, + "learning_rate": 3.9370190129080907e-05, + "loss": 1.7716, + "step": 18935 + }, + { + "epoch": 5.8121546961325965, + "grad_norm": 0.3321632146835327, + "learning_rate": 3.936533324804768e-05, + "loss": 1.7754, + "step": 18936 + }, + { + "epoch": 5.812461632903622, + "grad_norm": 0.236944317817688, + "learning_rate": 3.9360476472112446e-05, + "loss": 1.7546, + "step": 18937 + }, + { + "epoch": 5.812768569674647, + "grad_norm": 0.29667431116104126, + "learning_rate": 3.9355619801323226e-05, + "loss": 1.7712, + "step": 18938 + }, + { + "epoch": 5.8130755064456725, + "grad_norm": 0.3071129620075226, + "learning_rate": 3.935076323572802e-05, + "loss": 1.7351, + "step": 18939 + }, + { + "epoch": 5.813382443216698, + "grad_norm": 0.22747032344341278, + "learning_rate": 3.934590677537479e-05, 
+ "loss": 1.7788, + "step": 18940 + }, + { + "epoch": 5.813689379987722, + "grad_norm": 0.2575854957103729, + "learning_rate": 3.934105042031158e-05, + "loss": 1.705, + "step": 18941 + }, + { + "epoch": 5.813996316758748, + "grad_norm": 0.2561504542827606, + "learning_rate": 3.9336194170586325e-05, + "loss": 1.7309, + "step": 18942 + }, + { + "epoch": 5.814303253529773, + "grad_norm": 0.21570482850074768, + "learning_rate": 3.933133802624707e-05, + "loss": 1.7408, + "step": 18943 + }, + { + "epoch": 5.814610190300798, + "grad_norm": 0.29227179288864136, + "learning_rate": 3.932648198734177e-05, + "loss": 1.7415, + "step": 18944 + }, + { + "epoch": 5.814917127071823, + "grad_norm": 0.17847758531570435, + "learning_rate": 3.9321626053918456e-05, + "loss": 1.7926, + "step": 18945 + }, + { + "epoch": 5.815224063842848, + "grad_norm": 0.24604015052318573, + "learning_rate": 3.931677022602507e-05, + "loss": 1.7519, + "step": 18946 + }, + { + "epoch": 5.815531000613873, + "grad_norm": 0.23843185603618622, + "learning_rate": 3.931191450370965e-05, + "loss": 1.7206, + "step": 18947 + }, + { + "epoch": 5.815837937384899, + "grad_norm": 0.23431400954723358, + "learning_rate": 3.9307058887020126e-05, + "loss": 1.7743, + "step": 18948 + }, + { + "epoch": 5.816144874155924, + "grad_norm": 0.23685097694396973, + "learning_rate": 3.9302203376004525e-05, + "loss": 1.7485, + "step": 18949 + }, + { + "epoch": 5.816451810926949, + "grad_norm": 0.2129819542169571, + "learning_rate": 3.929734797071082e-05, + "loss": 1.6897, + "step": 18950 + }, + { + "epoch": 5.816758747697974, + "grad_norm": 0.24736030399799347, + "learning_rate": 3.9292492671187e-05, + "loss": 1.7292, + "step": 18951 + }, + { + "epoch": 5.817065684468999, + "grad_norm": 0.28659793734550476, + "learning_rate": 3.9287637477481025e-05, + "loss": 1.6772, + "step": 18952 + }, + { + "epoch": 5.8173726212400245, + "grad_norm": 0.22304075956344604, + "learning_rate": 3.928278238964092e-05, + "loss": 1.7991, + "step": 18953 + 
}, + { + "epoch": 5.81767955801105, + "grad_norm": 0.25354304909706116, + "learning_rate": 3.927792740771462e-05, + "loss": 1.7407, + "step": 18954 + }, + { + "epoch": 5.817986494782075, + "grad_norm": 0.3014552593231201, + "learning_rate": 3.927307253175014e-05, + "loss": 1.7714, + "step": 18955 + }, + { + "epoch": 5.8182934315531, + "grad_norm": 0.20537856221199036, + "learning_rate": 3.926821776179545e-05, + "loss": 1.6992, + "step": 18956 + }, + { + "epoch": 5.818600368324125, + "grad_norm": 0.29656440019607544, + "learning_rate": 3.92633630978985e-05, + "loss": 1.7476, + "step": 18957 + }, + { + "epoch": 5.81890730509515, + "grad_norm": 0.20956869423389435, + "learning_rate": 3.925850854010732e-05, + "loss": 1.808, + "step": 18958 + }, + { + "epoch": 5.819214241866176, + "grad_norm": 0.29395633935928345, + "learning_rate": 3.925365408846983e-05, + "loss": 1.7787, + "step": 18959 + }, + { + "epoch": 5.819521178637201, + "grad_norm": 0.31101030111312866, + "learning_rate": 3.9248799743034025e-05, + "loss": 1.7685, + "step": 18960 + }, + { + "epoch": 5.819828115408226, + "grad_norm": 0.2109794020652771, + "learning_rate": 3.9243945503847894e-05, + "loss": 1.7307, + "step": 18961 + }, + { + "epoch": 5.820135052179251, + "grad_norm": 0.2503393292427063, + "learning_rate": 3.9239091370959405e-05, + "loss": 1.763, + "step": 18962 + }, + { + "epoch": 5.820441988950276, + "grad_norm": 0.21757015585899353, + "learning_rate": 3.92342373444165e-05, + "loss": 1.7862, + "step": 18963 + }, + { + "epoch": 5.820748925721301, + "grad_norm": 0.22108088433742523, + "learning_rate": 3.9229383424267197e-05, + "loss": 1.6845, + "step": 18964 + }, + { + "epoch": 5.821055862492327, + "grad_norm": 0.20059655606746674, + "learning_rate": 3.922452961055941e-05, + "loss": 1.7523, + "step": 18965 + }, + { + "epoch": 5.821362799263352, + "grad_norm": 0.22009585797786713, + "learning_rate": 3.921967590334117e-05, + "loss": 1.7802, + "step": 18966 + }, + { + "epoch": 5.8216697360343765, + 
"grad_norm": 0.22554142773151398, + "learning_rate": 3.9214822302660386e-05, + "loss": 1.7911, + "step": 18967 + }, + { + "epoch": 5.821976672805402, + "grad_norm": 0.23434770107269287, + "learning_rate": 3.920996880856506e-05, + "loss": 1.6755, + "step": 18968 + }, + { + "epoch": 5.822283609576427, + "grad_norm": 0.2162926346063614, + "learning_rate": 3.920511542110314e-05, + "loss": 1.7145, + "step": 18969 + }, + { + "epoch": 5.8225905463474525, + "grad_norm": 0.18654806911945343, + "learning_rate": 3.9200262140322616e-05, + "loss": 1.7076, + "step": 18970 + }, + { + "epoch": 5.822897483118478, + "grad_norm": 0.22357499599456787, + "learning_rate": 3.9195408966271404e-05, + "loss": 1.791, + "step": 18971 + }, + { + "epoch": 5.823204419889503, + "grad_norm": 0.21073313057422638, + "learning_rate": 3.919055589899752e-05, + "loss": 1.7976, + "step": 18972 + }, + { + "epoch": 5.823511356660528, + "grad_norm": 0.21481956541538239, + "learning_rate": 3.9185702938548886e-05, + "loss": 1.7468, + "step": 18973 + }, + { + "epoch": 5.823818293431553, + "grad_norm": 0.22051872313022614, + "learning_rate": 3.9180850084973464e-05, + "loss": 1.7201, + "step": 18974 + }, + { + "epoch": 5.824125230202578, + "grad_norm": 0.24410493671894073, + "learning_rate": 3.917599733831924e-05, + "loss": 1.7774, + "step": 18975 + }, + { + "epoch": 5.824432166973604, + "grad_norm": 0.19711458683013916, + "learning_rate": 3.917114469863414e-05, + "loss": 1.7907, + "step": 18976 + }, + { + "epoch": 5.824739103744628, + "grad_norm": 0.2045203000307083, + "learning_rate": 3.9166292165966155e-05, + "loss": 1.7105, + "step": 18977 + }, + { + "epoch": 5.8250460405156534, + "grad_norm": 0.21570880711078644, + "learning_rate": 3.9161439740363196e-05, + "loss": 1.7312, + "step": 18978 + }, + { + "epoch": 5.825352977286679, + "grad_norm": 0.21203923225402832, + "learning_rate": 3.915658742187325e-05, + "loss": 1.7869, + "step": 18979 + }, + { + "epoch": 5.825659914057704, + "grad_norm": 
0.26233312487602234, + "learning_rate": 3.915173521054426e-05, + "loss": 1.7453, + "step": 18980 + }, + { + "epoch": 5.8259668508287294, + "grad_norm": 0.23792949318885803, + "learning_rate": 3.91468831064242e-05, + "loss": 1.6886, + "step": 18981 + }, + { + "epoch": 5.826273787599755, + "grad_norm": 0.20325250923633575, + "learning_rate": 3.914203110956098e-05, + "loss": 1.7538, + "step": 18982 + }, + { + "epoch": 5.82658072437078, + "grad_norm": 0.28146329522132874, + "learning_rate": 3.9137179220002596e-05, + "loss": 1.7674, + "step": 18983 + }, + { + "epoch": 5.826887661141805, + "grad_norm": 0.2319503277540207, + "learning_rate": 3.9132327437796946e-05, + "loss": 1.7864, + "step": 18984 + }, + { + "epoch": 5.82719459791283, + "grad_norm": 0.22653794288635254, + "learning_rate": 3.9127475762992025e-05, + "loss": 1.7424, + "step": 18985 + }, + { + "epoch": 5.827501534683855, + "grad_norm": 0.26855236291885376, + "learning_rate": 3.912262419563574e-05, + "loss": 1.762, + "step": 18986 + }, + { + "epoch": 5.827808471454881, + "grad_norm": 0.18356221914291382, + "learning_rate": 3.9117772735776095e-05, + "loss": 1.7199, + "step": 18987 + }, + { + "epoch": 5.828115408225905, + "grad_norm": 0.2802455425262451, + "learning_rate": 3.911292138346096e-05, + "loss": 1.7142, + "step": 18988 + }, + { + "epoch": 5.82842234499693, + "grad_norm": 0.2638777494430542, + "learning_rate": 3.910807013873835e-05, + "loss": 1.6759, + "step": 18989 + }, + { + "epoch": 5.828729281767956, + "grad_norm": 0.18397162854671478, + "learning_rate": 3.910321900165615e-05, + "loss": 1.693, + "step": 18990 + }, + { + "epoch": 5.829036218538981, + "grad_norm": 0.20967607200145721, + "learning_rate": 3.909836797226233e-05, + "loss": 1.6908, + "step": 18991 + }, + { + "epoch": 5.829343155310006, + "grad_norm": 0.21123014390468597, + "learning_rate": 3.909351705060485e-05, + "loss": 1.7875, + "step": 18992 + }, + { + "epoch": 5.829650092081032, + "grad_norm": 0.1988777220249176, + "learning_rate": 
3.90886662367316e-05, + "loss": 1.7254, + "step": 18993 + }, + { + "epoch": 5.829957028852056, + "grad_norm": 0.17793473601341248, + "learning_rate": 3.9083815530690564e-05, + "loss": 1.7233, + "step": 18994 + }, + { + "epoch": 5.8302639656230815, + "grad_norm": 0.2289644330739975, + "learning_rate": 3.9078964932529645e-05, + "loss": 1.7739, + "step": 18995 + }, + { + "epoch": 5.830570902394107, + "grad_norm": 0.18145552277565002, + "learning_rate": 3.9074114442296804e-05, + "loss": 1.6989, + "step": 18996 + }, + { + "epoch": 5.830877839165132, + "grad_norm": 0.1941588670015335, + "learning_rate": 3.9069264060039956e-05, + "loss": 1.6981, + "step": 18997 + }, + { + "epoch": 5.8311847759361575, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.9064413785807075e-05, + "loss": 1.7163, + "step": 18998 + }, + { + "epoch": 5.831491712707182, + "grad_norm": 0.19494447112083435, + "learning_rate": 3.905956361964604e-05, + "loss": 1.7481, + "step": 18999 + }, + { + "epoch": 5.831798649478207, + "grad_norm": 0.2127624899148941, + "learning_rate": 3.9054713561604826e-05, + "loss": 1.7494, + "step": 19000 + }, + { + "epoch": 5.832105586249233, + "grad_norm": 0.20107653737068176, + "learning_rate": 3.9049863611731334e-05, + "loss": 1.7483, + "step": 19001 + }, + { + "epoch": 5.832412523020258, + "grad_norm": 0.22574639320373535, + "learning_rate": 3.904501377007352e-05, + "loss": 1.8184, + "step": 19002 + }, + { + "epoch": 5.832719459791283, + "grad_norm": 0.20027579367160797, + "learning_rate": 3.9040164036679285e-05, + "loss": 1.6995, + "step": 19003 + }, + { + "epoch": 5.833026396562309, + "grad_norm": 0.21599887311458588, + "learning_rate": 3.90353144115966e-05, + "loss": 1.7487, + "step": 19004 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.21122781932353973, + "learning_rate": 3.9030464894873334e-05, + "loss": 1.7332, + "step": 19005 + }, + { + "epoch": 5.833640270104358, + "grad_norm": 0.19006453454494476, + "learning_rate": 3.902561548655747e-05, + 
"loss": 1.688, + "step": 19006 + }, + { + "epoch": 5.833947206875384, + "grad_norm": 0.22979344427585602, + "learning_rate": 3.9020766186696895e-05, + "loss": 1.7495, + "step": 19007 + }, + { + "epoch": 5.834254143646409, + "grad_norm": 0.18405365943908691, + "learning_rate": 3.901591699533953e-05, + "loss": 1.7395, + "step": 19008 + }, + { + "epoch": 5.834561080417434, + "grad_norm": 0.26198676228523254, + "learning_rate": 3.901106791253334e-05, + "loss": 1.8286, + "step": 19009 + }, + { + "epoch": 5.834868017188459, + "grad_norm": 0.2535797357559204, + "learning_rate": 3.900621893832619e-05, + "loss": 1.757, + "step": 19010 + }, + { + "epoch": 5.835174953959484, + "grad_norm": 0.24599581956863403, + "learning_rate": 3.900137007276605e-05, + "loss": 1.7266, + "step": 19011 + }, + { + "epoch": 5.8354818907305095, + "grad_norm": 0.25688427686691284, + "learning_rate": 3.8996521315900805e-05, + "loss": 1.7255, + "step": 19012 + }, + { + "epoch": 5.835788827501535, + "grad_norm": 0.24668128788471222, + "learning_rate": 3.8991672667778385e-05, + "loss": 1.737, + "step": 19013 + }, + { + "epoch": 5.83609576427256, + "grad_norm": 0.28365740180015564, + "learning_rate": 3.8986824128446695e-05, + "loss": 1.7129, + "step": 19014 + }, + { + "epoch": 5.8364027010435855, + "grad_norm": 0.2543952465057373, + "learning_rate": 3.89819756979537e-05, + "loss": 1.7249, + "step": 19015 + }, + { + "epoch": 5.83670963781461, + "grad_norm": 0.2868666350841522, + "learning_rate": 3.8977127376347245e-05, + "loss": 1.6985, + "step": 19016 + }, + { + "epoch": 5.837016574585635, + "grad_norm": 0.3818367123603821, + "learning_rate": 3.897227916367531e-05, + "loss": 1.6954, + "step": 19017 + }, + { + "epoch": 5.837323511356661, + "grad_norm": 0.20922113955020905, + "learning_rate": 3.896743105998574e-05, + "loss": 1.7571, + "step": 19018 + }, + { + "epoch": 5.837630448127686, + "grad_norm": 0.3669843375682831, + "learning_rate": 3.89625830653265e-05, + "loss": 1.8041, + "step": 19019 + }, + { 
+ "epoch": 5.83793738489871, + "grad_norm": 0.2889872193336487, + "learning_rate": 3.895773517974548e-05, + "loss": 1.7775, + "step": 19020 + }, + { + "epoch": 5.838244321669736, + "grad_norm": 0.22619491815567017, + "learning_rate": 3.89528874032906e-05, + "loss": 1.7019, + "step": 19021 + }, + { + "epoch": 5.838551258440761, + "grad_norm": 0.4169046878814697, + "learning_rate": 3.894803973600976e-05, + "loss": 1.8282, + "step": 19022 + }, + { + "epoch": 5.838858195211786, + "grad_norm": 0.2567043900489807, + "learning_rate": 3.894319217795087e-05, + "loss": 1.733, + "step": 19023 + }, + { + "epoch": 5.839165131982812, + "grad_norm": 0.2435060739517212, + "learning_rate": 3.8938344729161834e-05, + "loss": 1.7208, + "step": 19024 + }, + { + "epoch": 5.839472068753837, + "grad_norm": 0.2941838204860687, + "learning_rate": 3.893349738969055e-05, + "loss": 1.7202, + "step": 19025 + }, + { + "epoch": 5.8397790055248615, + "grad_norm": 0.23542317748069763, + "learning_rate": 3.892865015958495e-05, + "loss": 1.7571, + "step": 19026 + }, + { + "epoch": 5.840085942295887, + "grad_norm": 0.3248259723186493, + "learning_rate": 3.8923803038892897e-05, + "loss": 1.7118, + "step": 19027 + }, + { + "epoch": 5.840392879066912, + "grad_norm": 0.24359026551246643, + "learning_rate": 3.891895602766234e-05, + "loss": 1.8126, + "step": 19028 + }, + { + "epoch": 5.8406998158379375, + "grad_norm": 0.3053695559501648, + "learning_rate": 3.8914109125941126e-05, + "loss": 1.6632, + "step": 19029 + }, + { + "epoch": 5.841006752608963, + "grad_norm": 0.3194943368434906, + "learning_rate": 3.8909262333777195e-05, + "loss": 1.8432, + "step": 19030 + }, + { + "epoch": 5.841313689379987, + "grad_norm": 0.23532693088054657, + "learning_rate": 3.8904415651218426e-05, + "loss": 1.716, + "step": 19031 + }, + { + "epoch": 5.841620626151013, + "grad_norm": 0.2941347062587738, + "learning_rate": 3.889956907831275e-05, + "loss": 1.7737, + "step": 19032 + }, + { + "epoch": 5.841927562922038, + 
"grad_norm": 0.2265428602695465, + "learning_rate": 3.889472261510801e-05, + "loss": 1.7111, + "step": 19033 + }, + { + "epoch": 5.842234499693063, + "grad_norm": 0.3023710548877716, + "learning_rate": 3.888987626165216e-05, + "loss": 1.7845, + "step": 19034 + }, + { + "epoch": 5.842541436464089, + "grad_norm": 0.2855348289012909, + "learning_rate": 3.8885030017993026e-05, + "loss": 1.8009, + "step": 19035 + }, + { + "epoch": 5.842848373235114, + "grad_norm": 0.23046357929706573, + "learning_rate": 3.888018388417857e-05, + "loss": 1.8225, + "step": 19036 + }, + { + "epoch": 5.843155310006138, + "grad_norm": 0.23732341825962067, + "learning_rate": 3.8875337860256634e-05, + "loss": 1.7542, + "step": 19037 + }, + { + "epoch": 5.843462246777164, + "grad_norm": 0.18987004458904266, + "learning_rate": 3.887049194627516e-05, + "loss": 1.7327, + "step": 19038 + }, + { + "epoch": 5.843769183548189, + "grad_norm": 0.21539908647537231, + "learning_rate": 3.8865646142281974e-05, + "loss": 1.715, + "step": 19039 + }, + { + "epoch": 5.844076120319214, + "grad_norm": 0.2991954982280731, + "learning_rate": 3.8860800448325024e-05, + "loss": 1.7728, + "step": 19040 + }, + { + "epoch": 5.84438305709024, + "grad_norm": 0.19066409766674042, + "learning_rate": 3.885595486445216e-05, + "loss": 1.7128, + "step": 19041 + }, + { + "epoch": 5.844689993861264, + "grad_norm": 0.21643762290477753, + "learning_rate": 3.885110939071128e-05, + "loss": 1.7584, + "step": 19042 + }, + { + "epoch": 5.8449969306322895, + "grad_norm": 0.20227304100990295, + "learning_rate": 3.884626402715029e-05, + "loss": 1.7053, + "step": 19043 + }, + { + "epoch": 5.845303867403315, + "grad_norm": 0.20429107546806335, + "learning_rate": 3.884141877381703e-05, + "loss": 1.761, + "step": 19044 + }, + { + "epoch": 5.84561080417434, + "grad_norm": 0.1873873621225357, + "learning_rate": 3.8836573630759435e-05, + "loss": 1.7251, + "step": 19045 + }, + { + "epoch": 5.8459177409453655, + "grad_norm": 0.18025323748588562, + 
"learning_rate": 3.883172859802534e-05, + "loss": 1.6696, + "step": 19046 + }, + { + "epoch": 5.846224677716391, + "grad_norm": 0.22011777758598328, + "learning_rate": 3.8826883675662664e-05, + "loss": 1.7148, + "step": 19047 + }, + { + "epoch": 5.846531614487415, + "grad_norm": 0.17827673256397247, + "learning_rate": 3.882203886371925e-05, + "loss": 1.69, + "step": 19048 + }, + { + "epoch": 5.846838551258441, + "grad_norm": 0.200766459107399, + "learning_rate": 3.881719416224303e-05, + "loss": 1.7773, + "step": 19049 + }, + { + "epoch": 5.847145488029466, + "grad_norm": 0.22770950198173523, + "learning_rate": 3.8812349571281834e-05, + "loss": 1.7156, + "step": 19050 + }, + { + "epoch": 5.847452424800491, + "grad_norm": 0.19483895599842072, + "learning_rate": 3.880750509088357e-05, + "loss": 1.7304, + "step": 19051 + }, + { + "epoch": 5.847759361571516, + "grad_norm": 0.1988774836063385, + "learning_rate": 3.8802660721096086e-05, + "loss": 1.7428, + "step": 19052 + }, + { + "epoch": 5.848066298342541, + "grad_norm": 0.19881510734558105, + "learning_rate": 3.879781646196727e-05, + "loss": 1.7268, + "step": 19053 + }, + { + "epoch": 5.848373235113566, + "grad_norm": 0.21257543563842773, + "learning_rate": 3.8792972313545e-05, + "loss": 1.7532, + "step": 19054 + }, + { + "epoch": 5.848680171884592, + "grad_norm": 0.21000613272190094, + "learning_rate": 3.878812827587716e-05, + "loss": 1.7782, + "step": 19055 + }, + { + "epoch": 5.848987108655617, + "grad_norm": 0.2136746346950531, + "learning_rate": 3.878328434901159e-05, + "loss": 1.6875, + "step": 19056 + }, + { + "epoch": 5.849294045426642, + "grad_norm": 0.20291505753993988, + "learning_rate": 3.8778440532996204e-05, + "loss": 1.74, + "step": 19057 + }, + { + "epoch": 5.849600982197668, + "grad_norm": 0.22568103671073914, + "learning_rate": 3.877359682787883e-05, + "loss": 1.7074, + "step": 19058 + }, + { + "epoch": 5.849907918968692, + "grad_norm": 0.24398963153362274, + "learning_rate": 3.876875323370734e-05, + 
"loss": 1.6825, + "step": 19059 + }, + { + "epoch": 5.850214855739718, + "grad_norm": 0.19684453308582306, + "learning_rate": 3.876390975052964e-05, + "loss": 1.7143, + "step": 19060 + }, + { + "epoch": 5.850521792510743, + "grad_norm": 0.2786783277988434, + "learning_rate": 3.8759066378393544e-05, + "loss": 1.8339, + "step": 19061 + }, + { + "epoch": 5.850828729281768, + "grad_norm": 0.1977633833885193, + "learning_rate": 3.875422311734697e-05, + "loss": 1.742, + "step": 19062 + }, + { + "epoch": 5.851135666052793, + "grad_norm": 0.260643869638443, + "learning_rate": 3.874937996743772e-05, + "loss": 1.7728, + "step": 19063 + }, + { + "epoch": 5.851442602823818, + "grad_norm": 0.20998433232307434, + "learning_rate": 3.874453692871372e-05, + "loss": 1.768, + "step": 19064 + }, + { + "epoch": 5.851749539594843, + "grad_norm": 0.2603224217891693, + "learning_rate": 3.873969400122278e-05, + "loss": 1.8015, + "step": 19065 + }, + { + "epoch": 5.852056476365869, + "grad_norm": 0.24428118765354156, + "learning_rate": 3.87348511850128e-05, + "loss": 1.8133, + "step": 19066 + }, + { + "epoch": 5.852363413136894, + "grad_norm": 0.19380085170269012, + "learning_rate": 3.873000848013161e-05, + "loss": 1.7331, + "step": 19067 + }, + { + "epoch": 5.852670349907919, + "grad_norm": 0.20088011026382446, + "learning_rate": 3.87251658866271e-05, + "loss": 1.7501, + "step": 19068 + }, + { + "epoch": 5.852977286678944, + "grad_norm": 0.21920672059059143, + "learning_rate": 3.8720323404547095e-05, + "loss": 1.6848, + "step": 19069 + }, + { + "epoch": 5.853284223449969, + "grad_norm": 0.21692565083503723, + "learning_rate": 3.871548103393947e-05, + "loss": 1.7132, + "step": 19070 + }, + { + "epoch": 5.8535911602209945, + "grad_norm": 0.19463133811950684, + "learning_rate": 3.871063877485207e-05, + "loss": 1.7263, + "step": 19071 + }, + { + "epoch": 5.85389809699202, + "grad_norm": 0.21563300490379333, + "learning_rate": 3.870579662733277e-05, + "loss": 1.7271, + "step": 19072 + }, + { + 
"epoch": 5.854205033763045, + "grad_norm": 0.19901902973651886, + "learning_rate": 3.870095459142939e-05, + "loss": 1.7153, + "step": 19073 + }, + { + "epoch": 5.85451197053407, + "grad_norm": 0.2053879052400589, + "learning_rate": 3.869611266718982e-05, + "loss": 1.7769, + "step": 19074 + }, + { + "epoch": 5.854818907305095, + "grad_norm": 0.18877504765987396, + "learning_rate": 3.869127085466188e-05, + "loss": 1.7427, + "step": 19075 + }, + { + "epoch": 5.85512584407612, + "grad_norm": 0.2000892460346222, + "learning_rate": 3.8686429153893414e-05, + "loss": 1.7245, + "step": 19076 + }, + { + "epoch": 5.855432780847146, + "grad_norm": 0.23791030049324036, + "learning_rate": 3.868158756493231e-05, + "loss": 1.7128, + "step": 19077 + }, + { + "epoch": 5.855739717618171, + "grad_norm": 0.20807631313800812, + "learning_rate": 3.8676746087826374e-05, + "loss": 1.7235, + "step": 19078 + }, + { + "epoch": 5.856046654389196, + "grad_norm": 0.2603290379047394, + "learning_rate": 3.867190472262349e-05, + "loss": 1.7272, + "step": 19079 + }, + { + "epoch": 5.856353591160221, + "grad_norm": 0.25234153866767883, + "learning_rate": 3.8667063469371456e-05, + "loss": 1.7818, + "step": 19080 + }, + { + "epoch": 5.856660527931246, + "grad_norm": 0.20621159672737122, + "learning_rate": 3.866222232811816e-05, + "loss": 1.7318, + "step": 19081 + }, + { + "epoch": 5.856967464702271, + "grad_norm": 0.19565562903881073, + "learning_rate": 3.865738129891141e-05, + "loss": 1.6364, + "step": 19082 + }, + { + "epoch": 5.857274401473297, + "grad_norm": 0.2090953141450882, + "learning_rate": 3.86525403817991e-05, + "loss": 1.7763, + "step": 19083 + }, + { + "epoch": 5.857581338244322, + "grad_norm": 0.21286322176456451, + "learning_rate": 3.864769957682901e-05, + "loss": 1.7652, + "step": 19084 + }, + { + "epoch": 5.8578882750153465, + "grad_norm": 0.20606130361557007, + "learning_rate": 3.864285888404902e-05, + "loss": 1.7267, + "step": 19085 + }, + { + "epoch": 5.858195211786372, + 
"grad_norm": 0.18837152421474457, + "learning_rate": 3.863801830350694e-05, + "loss": 1.7013, + "step": 19086 + }, + { + "epoch": 5.858502148557397, + "grad_norm": 0.19374001026153564, + "learning_rate": 3.8633177835250636e-05, + "loss": 1.7462, + "step": 19087 + }, + { + "epoch": 5.8588090853284225, + "grad_norm": 0.19090552628040314, + "learning_rate": 3.8628337479327914e-05, + "loss": 1.7321, + "step": 19088 + }, + { + "epoch": 5.859116022099448, + "grad_norm": 0.19487829506397247, + "learning_rate": 3.8623497235786656e-05, + "loss": 1.7323, + "step": 19089 + }, + { + "epoch": 5.859422958870473, + "grad_norm": 0.23836077749729156, + "learning_rate": 3.861865710467464e-05, + "loss": 1.7277, + "step": 19090 + }, + { + "epoch": 5.859729895641498, + "grad_norm": 0.22283829748630524, + "learning_rate": 3.861381708603974e-05, + "loss": 1.7521, + "step": 19091 + }, + { + "epoch": 5.860036832412523, + "grad_norm": 0.2094828337430954, + "learning_rate": 3.8608977179929774e-05, + "loss": 1.763, + "step": 19092 + }, + { + "epoch": 5.860343769183548, + "grad_norm": 0.30857667326927185, + "learning_rate": 3.860413738639256e-05, + "loss": 1.7112, + "step": 19093 + }, + { + "epoch": 5.860650705954574, + "grad_norm": 0.22634989023208618, + "learning_rate": 3.8599297705475954e-05, + "loss": 1.7076, + "step": 19094 + }, + { + "epoch": 5.860957642725598, + "grad_norm": 0.20488132536411285, + "learning_rate": 3.8594458137227757e-05, + "loss": 1.6821, + "step": 19095 + }, + { + "epoch": 5.861264579496623, + "grad_norm": 0.22760719060897827, + "learning_rate": 3.8589618681695826e-05, + "loss": 1.6981, + "step": 19096 + }, + { + "epoch": 5.861571516267649, + "grad_norm": 0.21168997883796692, + "learning_rate": 3.858477933892795e-05, + "loss": 1.7396, + "step": 19097 + }, + { + "epoch": 5.861878453038674, + "grad_norm": 0.24725143611431122, + "learning_rate": 3.8579940108971984e-05, + "loss": 1.791, + "step": 19098 + }, + { + "epoch": 5.862185389809699, + "grad_norm": 
0.2245369702577591, + "learning_rate": 3.857510099187573e-05, + "loss": 1.7643, + "step": 19099 + }, + { + "epoch": 5.862492326580725, + "grad_norm": 0.20065639913082123, + "learning_rate": 3.8570261987687056e-05, + "loss": 1.715, + "step": 19100 + }, + { + "epoch": 5.862799263351749, + "grad_norm": 0.1857454925775528, + "learning_rate": 3.856542309645373e-05, + "loss": 1.6833, + "step": 19101 + }, + { + "epoch": 5.8631062001227745, + "grad_norm": 0.18816804885864258, + "learning_rate": 3.856058431822361e-05, + "loss": 1.7049, + "step": 19102 + }, + { + "epoch": 5.8634131368938, + "grad_norm": 0.2861626148223877, + "learning_rate": 3.855574565304448e-05, + "loss": 1.8275, + "step": 19103 + }, + { + "epoch": 5.863720073664825, + "grad_norm": 0.19937226176261902, + "learning_rate": 3.8550907100964196e-05, + "loss": 1.7137, + "step": 19104 + }, + { + "epoch": 5.8640270104358505, + "grad_norm": 0.2040586620569229, + "learning_rate": 3.854606866203055e-05, + "loss": 1.725, + "step": 19105 + }, + { + "epoch": 5.864333947206875, + "grad_norm": 0.21082650125026703, + "learning_rate": 3.854123033629137e-05, + "loss": 1.7143, + "step": 19106 + }, + { + "epoch": 5.8646408839779, + "grad_norm": 0.1977517306804657, + "learning_rate": 3.853639212379446e-05, + "loss": 1.7482, + "step": 19107 + }, + { + "epoch": 5.864947820748926, + "grad_norm": 0.2272191196680069, + "learning_rate": 3.8531554024587655e-05, + "loss": 1.7678, + "step": 19108 + }, + { + "epoch": 5.865254757519951, + "grad_norm": 0.22765736281871796, + "learning_rate": 3.852671603871876e-05, + "loss": 1.7721, + "step": 19109 + }, + { + "epoch": 5.865561694290976, + "grad_norm": 0.20707197487354279, + "learning_rate": 3.852187816623556e-05, + "loss": 1.7509, + "step": 19110 + }, + { + "epoch": 5.865868631062002, + "grad_norm": 0.2699931561946869, + "learning_rate": 3.851704040718591e-05, + "loss": 1.6845, + "step": 19111 + }, + { + "epoch": 5.866175567833026, + "grad_norm": 0.24394196271896362, + "learning_rate": 
3.8512202761617575e-05, + "loss": 1.6895, + "step": 19112 + }, + { + "epoch": 5.866482504604051, + "grad_norm": 0.21921835839748383, + "learning_rate": 3.850736522957841e-05, + "loss": 1.7739, + "step": 19113 + }, + { + "epoch": 5.866789441375077, + "grad_norm": 0.2268306314945221, + "learning_rate": 3.8502527811116175e-05, + "loss": 1.7773, + "step": 19114 + }, + { + "epoch": 5.867096378146102, + "grad_norm": 0.2165728509426117, + "learning_rate": 3.84976905062787e-05, + "loss": 1.7567, + "step": 19115 + }, + { + "epoch": 5.867403314917127, + "grad_norm": 0.188106968998909, + "learning_rate": 3.8492853315113804e-05, + "loss": 1.7209, + "step": 19116 + }, + { + "epoch": 5.867710251688152, + "grad_norm": 0.20750530064105988, + "learning_rate": 3.848801623766927e-05, + "loss": 1.6999, + "step": 19117 + }, + { + "epoch": 5.868017188459177, + "grad_norm": 0.2475438266992569, + "learning_rate": 3.84831792739929e-05, + "loss": 1.7535, + "step": 19118 + }, + { + "epoch": 5.8683241252302025, + "grad_norm": 0.23291872441768646, + "learning_rate": 3.847834242413252e-05, + "loss": 1.7137, + "step": 19119 + }, + { + "epoch": 5.868631062001228, + "grad_norm": 0.18381048738956451, + "learning_rate": 3.847350568813589e-05, + "loss": 1.7657, + "step": 19120 + }, + { + "epoch": 5.868937998772253, + "grad_norm": 0.19330385327339172, + "learning_rate": 3.8468669066050845e-05, + "loss": 1.7109, + "step": 19121 + }, + { + "epoch": 5.8692449355432785, + "grad_norm": 0.22503000497817993, + "learning_rate": 3.846383255792517e-05, + "loss": 1.7668, + "step": 19122 + }, + { + "epoch": 5.869551872314303, + "grad_norm": 0.2147306352853775, + "learning_rate": 3.845899616380667e-05, + "loss": 1.74, + "step": 19123 + }, + { + "epoch": 5.869858809085328, + "grad_norm": 0.18493011593818665, + "learning_rate": 3.845415988374312e-05, + "loss": 1.7066, + "step": 19124 + }, + { + "epoch": 5.870165745856354, + "grad_norm": 0.28276753425598145, + "learning_rate": 3.844932371778235e-05, + "loss": 1.7925, 
+ "step": 19125 + }, + { + "epoch": 5.870472682627379, + "grad_norm": 0.23486676812171936, + "learning_rate": 3.844448766597212e-05, + "loss": 1.8216, + "step": 19126 + }, + { + "epoch": 5.870779619398404, + "grad_norm": 0.24370723962783813, + "learning_rate": 3.843965172836024e-05, + "loss": 1.709, + "step": 19127 + }, + { + "epoch": 5.871086556169429, + "grad_norm": 0.22540852427482605, + "learning_rate": 3.843481590499449e-05, + "loss": 1.7608, + "step": 19128 + }, + { + "epoch": 5.871393492940454, + "grad_norm": 0.20578467845916748, + "learning_rate": 3.8429980195922666e-05, + "loss": 1.7288, + "step": 19129 + }, + { + "epoch": 5.871700429711479, + "grad_norm": 0.265325129032135, + "learning_rate": 3.842514460119258e-05, + "loss": 1.7711, + "step": 19130 + }, + { + "epoch": 5.872007366482505, + "grad_norm": 0.20076121389865875, + "learning_rate": 3.842030912085197e-05, + "loss": 1.6764, + "step": 19131 + }, + { + "epoch": 5.87231430325353, + "grad_norm": 0.23941899836063385, + "learning_rate": 3.841547375494868e-05, + "loss": 1.8157, + "step": 19132 + }, + { + "epoch": 5.872621240024555, + "grad_norm": 0.23184041678905487, + "learning_rate": 3.841063850353044e-05, + "loss": 1.6948, + "step": 19133 + }, + { + "epoch": 5.87292817679558, + "grad_norm": 0.20299546420574188, + "learning_rate": 3.840580336664508e-05, + "loss": 1.7812, + "step": 19134 + }, + { + "epoch": 5.873235113566605, + "grad_norm": 0.24654673039913177, + "learning_rate": 3.840096834434036e-05, + "loss": 1.7999, + "step": 19135 + }, + { + "epoch": 5.8735420503376305, + "grad_norm": 0.21144285798072815, + "learning_rate": 3.8396133436664085e-05, + "loss": 1.7033, + "step": 19136 + }, + { + "epoch": 5.873848987108656, + "grad_norm": 0.22186708450317383, + "learning_rate": 3.8391298643663997e-05, + "loss": 1.7292, + "step": 19137 + }, + { + "epoch": 5.87415592387968, + "grad_norm": 0.21017275750637054, + "learning_rate": 3.838646396538793e-05, + "loss": 1.6989, + "step": 19138 + }, + { + "epoch": 
5.874462860650706, + "grad_norm": 0.19430704414844513, + "learning_rate": 3.83816294018836e-05, + "loss": 1.7446, + "step": 19139 + }, + { + "epoch": 5.874769797421731, + "grad_norm": 0.25048547983169556, + "learning_rate": 3.8376794953198836e-05, + "loss": 1.7358, + "step": 19140 + }, + { + "epoch": 5.875076734192756, + "grad_norm": 0.21869583427906036, + "learning_rate": 3.8371960619381406e-05, + "loss": 1.7017, + "step": 19141 + }, + { + "epoch": 5.875383670963782, + "grad_norm": 0.2053002119064331, + "learning_rate": 3.836712640047905e-05, + "loss": 1.7077, + "step": 19142 + }, + { + "epoch": 5.875690607734807, + "grad_norm": 0.2222425490617752, + "learning_rate": 3.83622922965396e-05, + "loss": 1.7259, + "step": 19143 + }, + { + "epoch": 5.8759975445058314, + "grad_norm": 0.20682495832443237, + "learning_rate": 3.8357458307610774e-05, + "loss": 1.7597, + "step": 19144 + }, + { + "epoch": 5.876304481276857, + "grad_norm": 0.2001802772283554, + "learning_rate": 3.835262443374038e-05, + "loss": 1.7546, + "step": 19145 + }, + { + "epoch": 5.876611418047882, + "grad_norm": 0.20499882102012634, + "learning_rate": 3.8347790674976166e-05, + "loss": 1.6741, + "step": 19146 + }, + { + "epoch": 5.8769183548189075, + "grad_norm": 0.17830348014831543, + "learning_rate": 3.834295703136593e-05, + "loss": 1.7067, + "step": 19147 + }, + { + "epoch": 5.877225291589933, + "grad_norm": 0.25055429339408875, + "learning_rate": 3.833812350295741e-05, + "loss": 1.753, + "step": 19148 + }, + { + "epoch": 5.877532228360957, + "grad_norm": 0.19037213921546936, + "learning_rate": 3.8333290089798415e-05, + "loss": 1.7336, + "step": 19149 + }, + { + "epoch": 5.877839165131983, + "grad_norm": 0.18041233718395233, + "learning_rate": 3.8328456791936656e-05, + "loss": 1.7172, + "step": 19150 + }, + { + "epoch": 5.878146101903008, + "grad_norm": 0.21531802415847778, + "learning_rate": 3.832362360941994e-05, + "loss": 1.7328, + "step": 19151 + }, + { + "epoch": 5.878453038674033, + "grad_norm": 
0.23101283609867096, + "learning_rate": 3.831879054229601e-05, + "loss": 1.7548, + "step": 19152 + }, + { + "epoch": 5.878759975445059, + "grad_norm": 0.19029635190963745, + "learning_rate": 3.831395759061266e-05, + "loss": 1.6852, + "step": 19153 + }, + { + "epoch": 5.879066912216084, + "grad_norm": 0.20305602252483368, + "learning_rate": 3.830912475441761e-05, + "loss": 1.6982, + "step": 19154 + }, + { + "epoch": 5.879373848987108, + "grad_norm": 0.19752593338489532, + "learning_rate": 3.830429203375866e-05, + "loss": 1.7726, + "step": 19155 + }, + { + "epoch": 5.879680785758134, + "grad_norm": 0.2109406590461731, + "learning_rate": 3.8299459428683526e-05, + "loss": 1.7629, + "step": 19156 + }, + { + "epoch": 5.879987722529159, + "grad_norm": 0.19448740780353546, + "learning_rate": 3.829462693924001e-05, + "loss": 1.6981, + "step": 19157 + }, + { + "epoch": 5.880294659300184, + "grad_norm": 0.19344154000282288, + "learning_rate": 3.828979456547586e-05, + "loss": 1.6822, + "step": 19158 + }, + { + "epoch": 5.88060159607121, + "grad_norm": 0.24466145038604736, + "learning_rate": 3.82849623074388e-05, + "loss": 1.7575, + "step": 19159 + }, + { + "epoch": 5.880908532842234, + "grad_norm": 0.20174476504325867, + "learning_rate": 3.828013016517663e-05, + "loss": 1.7267, + "step": 19160 + }, + { + "epoch": 5.8812154696132595, + "grad_norm": 0.23560820519924164, + "learning_rate": 3.827529813873706e-05, + "loss": 1.7125, + "step": 19161 + }, + { + "epoch": 5.881522406384285, + "grad_norm": 0.18118280172348022, + "learning_rate": 3.827046622816789e-05, + "loss": 1.7436, + "step": 19162 + }, + { + "epoch": 5.88182934315531, + "grad_norm": 0.27250152826309204, + "learning_rate": 3.8265634433516824e-05, + "loss": 1.7249, + "step": 19163 + }, + { + "epoch": 5.8821362799263355, + "grad_norm": 0.23510734736919403, + "learning_rate": 3.826080275483166e-05, + "loss": 1.7502, + "step": 19164 + }, + { + "epoch": 5.882443216697361, + "grad_norm": 0.22708909213542938, + 
"learning_rate": 3.82559711921601e-05, + "loss": 1.7478, + "step": 19165 + }, + { + "epoch": 5.882750153468385, + "grad_norm": 0.292584627866745, + "learning_rate": 3.825113974554995e-05, + "loss": 1.6757, + "step": 19166 + }, + { + "epoch": 5.883057090239411, + "grad_norm": 0.22186334431171417, + "learning_rate": 3.8246308415048884e-05, + "loss": 1.7061, + "step": 19167 + }, + { + "epoch": 5.883364027010436, + "grad_norm": 0.23995520174503326, + "learning_rate": 3.8241477200704714e-05, + "loss": 1.6962, + "step": 19168 + }, + { + "epoch": 5.883670963781461, + "grad_norm": 0.25545260310173035, + "learning_rate": 3.823664610256513e-05, + "loss": 1.7582, + "step": 19169 + }, + { + "epoch": 5.883977900552486, + "grad_norm": 0.2209167629480362, + "learning_rate": 3.823181512067794e-05, + "loss": 1.7212, + "step": 19170 + }, + { + "epoch": 5.884284837323511, + "grad_norm": 0.24626508355140686, + "learning_rate": 3.8226984255090824e-05, + "loss": 1.7356, + "step": 19171 + }, + { + "epoch": 5.884591774094536, + "grad_norm": 0.22982320189476013, + "learning_rate": 3.822215350585157e-05, + "loss": 1.7516, + "step": 19172 + }, + { + "epoch": 5.884898710865562, + "grad_norm": 0.19458627700805664, + "learning_rate": 3.8217322873007874e-05, + "loss": 1.7097, + "step": 19173 + }, + { + "epoch": 5.885205647636587, + "grad_norm": 0.2030913233757019, + "learning_rate": 3.8212492356607524e-05, + "loss": 1.7273, + "step": 19174 + }, + { + "epoch": 5.885512584407612, + "grad_norm": 0.20174767076969147, + "learning_rate": 3.820766195669823e-05, + "loss": 1.7167, + "step": 19175 + }, + { + "epoch": 5.885819521178637, + "grad_norm": 0.22572553157806396, + "learning_rate": 3.820283167332772e-05, + "loss": 1.8034, + "step": 19176 + }, + { + "epoch": 5.886126457949662, + "grad_norm": 0.24423041939735413, + "learning_rate": 3.819800150654376e-05, + "loss": 1.7188, + "step": 19177 + }, + { + "epoch": 5.8864333947206875, + "grad_norm": 0.20805509388446808, + "learning_rate": 
3.819317145639404e-05, + "loss": 1.7252, + "step": 19178 + }, + { + "epoch": 5.886740331491713, + "grad_norm": 0.2731400728225708, + "learning_rate": 3.8188341522926334e-05, + "loss": 1.7778, + "step": 19179 + }, + { + "epoch": 5.887047268262738, + "grad_norm": 0.2604491412639618, + "learning_rate": 3.818351170618835e-05, + "loss": 1.7524, + "step": 19180 + }, + { + "epoch": 5.887354205033763, + "grad_norm": 0.20043112337589264, + "learning_rate": 3.817868200622785e-05, + "loss": 1.7176, + "step": 19181 + }, + { + "epoch": 5.887661141804788, + "grad_norm": 0.2224988341331482, + "learning_rate": 3.817385242309253e-05, + "loss": 1.7267, + "step": 19182 + }, + { + "epoch": 5.887968078575813, + "grad_norm": 0.24603894352912903, + "learning_rate": 3.8169022956830135e-05, + "loss": 1.716, + "step": 19183 + }, + { + "epoch": 5.888275015346839, + "grad_norm": 0.19959969818592072, + "learning_rate": 3.816419360748839e-05, + "loss": 1.7461, + "step": 19184 + }, + { + "epoch": 5.888581952117864, + "grad_norm": 0.21907947957515717, + "learning_rate": 3.815936437511501e-05, + "loss": 1.6982, + "step": 19185 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.1920289248228073, + "learning_rate": 3.8154535259757735e-05, + "loss": 1.7213, + "step": 19186 + }, + { + "epoch": 5.889195825659914, + "grad_norm": 0.21930737793445587, + "learning_rate": 3.81497062614643e-05, + "loss": 1.7389, + "step": 19187 + }, + { + "epoch": 5.889502762430939, + "grad_norm": 0.1972137838602066, + "learning_rate": 3.814487738028239e-05, + "loss": 1.7317, + "step": 19188 + }, + { + "epoch": 5.889809699201964, + "grad_norm": 0.20000529289245605, + "learning_rate": 3.8140048616259785e-05, + "loss": 1.7148, + "step": 19189 + }, + { + "epoch": 5.89011663597299, + "grad_norm": 0.18828663229942322, + "learning_rate": 3.8135219969444135e-05, + "loss": 1.725, + "step": 19190 + }, + { + "epoch": 5.890423572744015, + "grad_norm": 0.2237224131822586, + "learning_rate": 3.8130391439883216e-05, + "loss": 1.7252, 
+ "step": 19191 + }, + { + "epoch": 5.8907305095150395, + "grad_norm": 0.19954712688922882, + "learning_rate": 3.812556302762473e-05, + "loss": 1.7071, + "step": 19192 + }, + { + "epoch": 5.891037446286065, + "grad_norm": 0.23509685695171356, + "learning_rate": 3.812073473271637e-05, + "loss": 1.7603, + "step": 19193 + }, + { + "epoch": 5.89134438305709, + "grad_norm": 0.28477707505226135, + "learning_rate": 3.81159065552059e-05, + "loss": 1.8193, + "step": 19194 + }, + { + "epoch": 5.8916513198281155, + "grad_norm": 0.1936045140028, + "learning_rate": 3.811107849514098e-05, + "loss": 1.7438, + "step": 19195 + }, + { + "epoch": 5.891958256599141, + "grad_norm": 0.288253515958786, + "learning_rate": 3.810625055256936e-05, + "loss": 1.8042, + "step": 19196 + }, + { + "epoch": 5.892265193370166, + "grad_norm": 0.19256485998630524, + "learning_rate": 3.810142272753873e-05, + "loss": 1.6997, + "step": 19197 + }, + { + "epoch": 5.892572130141191, + "grad_norm": 0.2823546826839447, + "learning_rate": 3.809659502009684e-05, + "loss": 1.7133, + "step": 19198 + }, + { + "epoch": 5.892879066912216, + "grad_norm": 0.25116851925849915, + "learning_rate": 3.809176743029136e-05, + "loss": 1.7402, + "step": 19199 + }, + { + "epoch": 5.893186003683241, + "grad_norm": 0.19840675592422485, + "learning_rate": 3.808693995817003e-05, + "loss": 1.7009, + "step": 19200 + }, + { + "epoch": 5.893492940454267, + "grad_norm": 0.2703700363636017, + "learning_rate": 3.808211260378051e-05, + "loss": 1.741, + "step": 19201 + }, + { + "epoch": 5.893799877225292, + "grad_norm": 0.25683698058128357, + "learning_rate": 3.807728536717056e-05, + "loss": 1.7431, + "step": 19202 + }, + { + "epoch": 5.894106813996316, + "grad_norm": 0.19033822417259216, + "learning_rate": 3.8072458248387855e-05, + "loss": 1.7423, + "step": 19203 + }, + { + "epoch": 5.894413750767342, + "grad_norm": 0.2771024703979492, + "learning_rate": 3.806763124748012e-05, + "loss": 1.7376, + "step": 19204 + }, + { + "epoch": 
5.894720687538367, + "grad_norm": 0.30265524983406067, + "learning_rate": 3.806280436449504e-05, + "loss": 1.7124, + "step": 19205 + }, + { + "epoch": 5.895027624309392, + "grad_norm": 0.21838776767253876, + "learning_rate": 3.805797759948033e-05, + "loss": 1.7319, + "step": 19206 + }, + { + "epoch": 5.895334561080418, + "grad_norm": 0.22244395315647125, + "learning_rate": 3.805315095248368e-05, + "loss": 1.7034, + "step": 19207 + }, + { + "epoch": 5.895641497851443, + "grad_norm": 0.20621941983699799, + "learning_rate": 3.8048324423552786e-05, + "loss": 1.7231, + "step": 19208 + }, + { + "epoch": 5.8959484346224675, + "grad_norm": 0.23735111951828003, + "learning_rate": 3.804349801273538e-05, + "loss": 1.7484, + "step": 19209 + }, + { + "epoch": 5.896255371393493, + "grad_norm": 0.33221447467803955, + "learning_rate": 3.803867172007911e-05, + "loss": 1.7782, + "step": 19210 + }, + { + "epoch": 5.896562308164518, + "grad_norm": 0.20859810709953308, + "learning_rate": 3.803384554563172e-05, + "loss": 1.688, + "step": 19211 + }, + { + "epoch": 5.8968692449355435, + "grad_norm": 0.25731268525123596, + "learning_rate": 3.8029019489440855e-05, + "loss": 1.7463, + "step": 19212 + }, + { + "epoch": 5.897176181706568, + "grad_norm": 0.26556700468063354, + "learning_rate": 3.802419355155425e-05, + "loss": 1.7251, + "step": 19213 + }, + { + "epoch": 5.897483118477593, + "grad_norm": 0.20397205650806427, + "learning_rate": 3.801936773201957e-05, + "loss": 1.6785, + "step": 19214 + }, + { + "epoch": 5.897790055248619, + "grad_norm": 0.2198234349489212, + "learning_rate": 3.8014542030884544e-05, + "loss": 1.7608, + "step": 19215 + }, + { + "epoch": 5.898096992019644, + "grad_norm": 0.22619546949863434, + "learning_rate": 3.800971644819681e-05, + "loss": 1.8034, + "step": 19216 + }, + { + "epoch": 5.898403928790669, + "grad_norm": 0.22074444591999054, + "learning_rate": 3.800489098400412e-05, + "loss": 1.777, + "step": 19217 + }, + { + "epoch": 5.898710865561695, + "grad_norm": 
0.2555946707725525, + "learning_rate": 3.80000656383541e-05, + "loss": 1.7578, + "step": 19218 + }, + { + "epoch": 5.899017802332719, + "grad_norm": 0.2130863517522812, + "learning_rate": 3.7995240411294474e-05, + "loss": 1.7312, + "step": 19219 + }, + { + "epoch": 5.899324739103744, + "grad_norm": 0.2574099898338318, + "learning_rate": 3.799041530287291e-05, + "loss": 1.7509, + "step": 19220 + }, + { + "epoch": 5.89963167587477, + "grad_norm": 0.2556573152542114, + "learning_rate": 3.798559031313712e-05, + "loss": 1.7624, + "step": 19221 + }, + { + "epoch": 5.899938612645795, + "grad_norm": 0.19909335672855377, + "learning_rate": 3.798076544213475e-05, + "loss": 1.7466, + "step": 19222 + }, + { + "epoch": 5.9002455494168204, + "grad_norm": 0.19832594692707062, + "learning_rate": 3.7975940689913526e-05, + "loss": 1.6896, + "step": 19223 + }, + { + "epoch": 5.900552486187845, + "grad_norm": 0.18473665416240692, + "learning_rate": 3.7971116056521076e-05, + "loss": 1.7167, + "step": 19224 + }, + { + "epoch": 5.90085942295887, + "grad_norm": 0.21106892824172974, + "learning_rate": 3.796629154200512e-05, + "loss": 1.8071, + "step": 19225 + }, + { + "epoch": 5.901166359729896, + "grad_norm": 0.20903728902339935, + "learning_rate": 3.796146714641333e-05, + "loss": 1.6946, + "step": 19226 + }, + { + "epoch": 5.901473296500921, + "grad_norm": 0.21518728137016296, + "learning_rate": 3.795664286979336e-05, + "loss": 1.6899, + "step": 19227 + }, + { + "epoch": 5.901780233271946, + "grad_norm": 0.1948135644197464, + "learning_rate": 3.7951818712192926e-05, + "loss": 1.7568, + "step": 19228 + }, + { + "epoch": 5.902087170042972, + "grad_norm": 0.2222091257572174, + "learning_rate": 3.7946994673659667e-05, + "loss": 1.8118, + "step": 19229 + }, + { + "epoch": 5.902394106813996, + "grad_norm": 0.2173513025045395, + "learning_rate": 3.794217075424127e-05, + "loss": 1.7194, + "step": 19230 + }, + { + "epoch": 5.902701043585021, + "grad_norm": 0.2026323676109314, + "learning_rate": 
3.79373469539854e-05, + "loss": 1.6944, + "step": 19231 + }, + { + "epoch": 5.903007980356047, + "grad_norm": 0.22178098559379578, + "learning_rate": 3.7932523272939765e-05, + "loss": 1.7328, + "step": 19232 + }, + { + "epoch": 5.903314917127072, + "grad_norm": 0.22846719622612, + "learning_rate": 3.792769971115198e-05, + "loss": 1.8065, + "step": 19233 + }, + { + "epoch": 5.903621853898097, + "grad_norm": 0.2086053490638733, + "learning_rate": 3.792287626866977e-05, + "loss": 1.7511, + "step": 19234 + }, + { + "epoch": 5.903928790669122, + "grad_norm": 0.22444705665111542, + "learning_rate": 3.791805294554075e-05, + "loss": 1.742, + "step": 19235 + }, + { + "epoch": 5.904235727440147, + "grad_norm": 0.24630236625671387, + "learning_rate": 3.7913229741812625e-05, + "loss": 1.7531, + "step": 19236 + }, + { + "epoch": 5.9045426642111725, + "grad_norm": 0.2618274986743927, + "learning_rate": 3.7908406657533036e-05, + "loss": 1.7387, + "step": 19237 + }, + { + "epoch": 5.904849600982198, + "grad_norm": 0.25871509313583374, + "learning_rate": 3.790358369274968e-05, + "loss": 1.7822, + "step": 19238 + }, + { + "epoch": 5.905156537753223, + "grad_norm": 0.22675062716007233, + "learning_rate": 3.789876084751018e-05, + "loss": 1.7788, + "step": 19239 + }, + { + "epoch": 5.9054634745242485, + "grad_norm": 0.26623663306236267, + "learning_rate": 3.789393812186224e-05, + "loss": 1.7092, + "step": 19240 + }, + { + "epoch": 5.905770411295273, + "grad_norm": 0.19448868930339813, + "learning_rate": 3.788911551585348e-05, + "loss": 1.7164, + "step": 19241 + }, + { + "epoch": 5.906077348066298, + "grad_norm": 0.22451938688755035, + "learning_rate": 3.788429302953158e-05, + "loss": 1.667, + "step": 19242 + }, + { + "epoch": 5.906384284837324, + "grad_norm": 0.2323608547449112, + "learning_rate": 3.7879470662944214e-05, + "loss": 1.7992, + "step": 19243 + }, + { + "epoch": 5.906691221608349, + "grad_norm": 0.2508258819580078, + "learning_rate": 3.7874648416139e-05, + "loss": 1.7681, + 
"step": 19244 + }, + { + "epoch": 5.906998158379373, + "grad_norm": 0.22333547472953796, + "learning_rate": 3.786982628916364e-05, + "loss": 1.7006, + "step": 19245 + }, + { + "epoch": 5.907305095150399, + "grad_norm": 0.19816327095031738, + "learning_rate": 3.786500428206575e-05, + "loss": 1.7458, + "step": 19246 + }, + { + "epoch": 5.907612031921424, + "grad_norm": 0.2047683447599411, + "learning_rate": 3.7860182394893006e-05, + "loss": 1.7385, + "step": 19247 + }, + { + "epoch": 5.907918968692449, + "grad_norm": 0.2124621719121933, + "learning_rate": 3.785536062769304e-05, + "loss": 1.7373, + "step": 19248 + }, + { + "epoch": 5.908225905463475, + "grad_norm": 0.200453981757164, + "learning_rate": 3.785053898051355e-05, + "loss": 1.7754, + "step": 19249 + }, + { + "epoch": 5.9085328422345, + "grad_norm": 0.19543224573135376, + "learning_rate": 3.784571745340212e-05, + "loss": 1.724, + "step": 19250 + }, + { + "epoch": 5.9088397790055245, + "grad_norm": 0.17079658806324005, + "learning_rate": 3.784089604640647e-05, + "loss": 1.6843, + "step": 19251 + }, + { + "epoch": 5.90914671577655, + "grad_norm": 0.22792236506938934, + "learning_rate": 3.783607475957418e-05, + "loss": 1.7442, + "step": 19252 + }, + { + "epoch": 5.909453652547575, + "grad_norm": 0.20699752867221832, + "learning_rate": 3.783125359295294e-05, + "loss": 1.7868, + "step": 19253 + }, + { + "epoch": 5.9097605893186005, + "grad_norm": 0.2156144678592682, + "learning_rate": 3.782643254659038e-05, + "loss": 1.7443, + "step": 19254 + }, + { + "epoch": 5.910067526089626, + "grad_norm": 0.2021300345659256, + "learning_rate": 3.782161162053417e-05, + "loss": 1.7749, + "step": 19255 + }, + { + "epoch": 5.91037446286065, + "grad_norm": 0.17613129317760468, + "learning_rate": 3.7816790814831905e-05, + "loss": 1.7001, + "step": 19256 + }, + { + "epoch": 5.910681399631676, + "grad_norm": 0.18911564350128174, + "learning_rate": 3.781197012953128e-05, + "loss": 1.6817, + "step": 19257 + }, + { + "epoch": 
5.910988336402701, + "grad_norm": 0.18920689821243286, + "learning_rate": 3.780714956467989e-05, + "loss": 1.7554, + "step": 19258 + }, + { + "epoch": 5.911295273173726, + "grad_norm": 0.22030571103096008, + "learning_rate": 3.7802329120325396e-05, + "loss": 1.7554, + "step": 19259 + }, + { + "epoch": 5.911602209944752, + "grad_norm": 0.21164962649345398, + "learning_rate": 3.779750879651545e-05, + "loss": 1.74, + "step": 19260 + }, + { + "epoch": 5.911909146715777, + "grad_norm": 0.2205103188753128, + "learning_rate": 3.779268859329766e-05, + "loss": 1.7424, + "step": 19261 + }, + { + "epoch": 5.912216083486801, + "grad_norm": 0.19262658059597015, + "learning_rate": 3.7787868510719685e-05, + "loss": 1.7157, + "step": 19262 + }, + { + "epoch": 5.912523020257827, + "grad_norm": 0.19583287835121155, + "learning_rate": 3.778304854882914e-05, + "loss": 1.7343, + "step": 19263 + }, + { + "epoch": 5.912829957028852, + "grad_norm": 0.18275529146194458, + "learning_rate": 3.777822870767368e-05, + "loss": 1.6938, + "step": 19264 + }, + { + "epoch": 5.913136893799877, + "grad_norm": 0.21268916130065918, + "learning_rate": 3.7773408987300914e-05, + "loss": 1.7546, + "step": 19265 + }, + { + "epoch": 5.913443830570903, + "grad_norm": 0.20878887176513672, + "learning_rate": 3.77685893877585e-05, + "loss": 1.8109, + "step": 19266 + }, + { + "epoch": 5.913750767341927, + "grad_norm": 0.2326175421476364, + "learning_rate": 3.776376990909404e-05, + "loss": 1.7248, + "step": 19267 + }, + { + "epoch": 5.9140577041129525, + "grad_norm": 0.28189611434936523, + "learning_rate": 3.7758950551355204e-05, + "loss": 1.7796, + "step": 19268 + }, + { + "epoch": 5.914364640883978, + "grad_norm": 0.1922682821750641, + "learning_rate": 3.775413131458957e-05, + "loss": 1.7096, + "step": 19269 + }, + { + "epoch": 5.914671577655003, + "grad_norm": 0.2839193642139435, + "learning_rate": 3.774931219884479e-05, + "loss": 1.7341, + "step": 19270 + }, + { + "epoch": 5.9149785144260285, + "grad_norm": 
0.2075256109237671, + "learning_rate": 3.7744493204168495e-05, + "loss": 1.7565, + "step": 19271 + }, + { + "epoch": 5.915285451197054, + "grad_norm": 0.2780497372150421, + "learning_rate": 3.7739674330608306e-05, + "loss": 1.7186, + "step": 19272 + }, + { + "epoch": 5.915592387968078, + "grad_norm": 0.26129212975502014, + "learning_rate": 3.773485557821182e-05, + "loss": 1.8468, + "step": 19273 + }, + { + "epoch": 5.915899324739104, + "grad_norm": 0.3299194276332855, + "learning_rate": 3.773003694702671e-05, + "loss": 1.7705, + "step": 19274 + }, + { + "epoch": 5.916206261510129, + "grad_norm": 0.3011106848716736, + "learning_rate": 3.772521843710054e-05, + "loss": 1.748, + "step": 19275 + }, + { + "epoch": 5.916513198281154, + "grad_norm": 0.21370603144168854, + "learning_rate": 3.7720400048480966e-05, + "loss": 1.7709, + "step": 19276 + }, + { + "epoch": 5.91682013505218, + "grad_norm": 0.29374879598617554, + "learning_rate": 3.771558178121561e-05, + "loss": 1.6948, + "step": 19277 + }, + { + "epoch": 5.917127071823204, + "grad_norm": 0.2545807659626007, + "learning_rate": 3.771076363535205e-05, + "loss": 1.7974, + "step": 19278 + }, + { + "epoch": 5.917434008594229, + "grad_norm": 0.24210263788700104, + "learning_rate": 3.7705945610937954e-05, + "loss": 1.7438, + "step": 19279 + }, + { + "epoch": 5.917740945365255, + "grad_norm": 0.26224827766418457, + "learning_rate": 3.770112770802088e-05, + "loss": 1.7294, + "step": 19280 + }, + { + "epoch": 5.91804788213628, + "grad_norm": 0.23358991742134094, + "learning_rate": 3.7696309926648486e-05, + "loss": 1.7973, + "step": 19281 + }, + { + "epoch": 5.918354818907305, + "grad_norm": 0.3466563820838928, + "learning_rate": 3.769149226686837e-05, + "loss": 1.784, + "step": 19282 + }, + { + "epoch": 5.918661755678331, + "grad_norm": 0.2416994869709015, + "learning_rate": 3.768667472872814e-05, + "loss": 1.6957, + "step": 19283 + }, + { + "epoch": 5.918968692449355, + "grad_norm": 0.2285085767507553, + "learning_rate": 
3.768185731227539e-05, + "loss": 1.71, + "step": 19284 + }, + { + "epoch": 5.9192756292203805, + "grad_norm": 0.2566430866718292, + "learning_rate": 3.7677040017557775e-05, + "loss": 1.792, + "step": 19285 + }, + { + "epoch": 5.919582565991406, + "grad_norm": 0.21566689014434814, + "learning_rate": 3.767222284462285e-05, + "loss": 1.8085, + "step": 19286 + }, + { + "epoch": 5.919889502762431, + "grad_norm": 0.24078889191150665, + "learning_rate": 3.7667405793518264e-05, + "loss": 1.7221, + "step": 19287 + }, + { + "epoch": 5.920196439533456, + "grad_norm": 0.22127531468868256, + "learning_rate": 3.7662588864291584e-05, + "loss": 1.7173, + "step": 19288 + }, + { + "epoch": 5.920503376304481, + "grad_norm": 0.18165946006774902, + "learning_rate": 3.765777205699045e-05, + "loss": 1.7518, + "step": 19289 + }, + { + "epoch": 5.920810313075506, + "grad_norm": 0.2569290101528168, + "learning_rate": 3.765295537166242e-05, + "loss": 1.7716, + "step": 19290 + }, + { + "epoch": 5.921117249846532, + "grad_norm": 0.19010202586650848, + "learning_rate": 3.764813880835515e-05, + "loss": 1.7146, + "step": 19291 + }, + { + "epoch": 5.921424186617557, + "grad_norm": 0.2882116436958313, + "learning_rate": 3.7643322367116195e-05, + "loss": 1.7677, + "step": 19292 + }, + { + "epoch": 5.921731123388582, + "grad_norm": 0.30711185932159424, + "learning_rate": 3.763850604799319e-05, + "loss": 1.7506, + "step": 19293 + }, + { + "epoch": 5.922038060159607, + "grad_norm": 0.19295164942741394, + "learning_rate": 3.76336898510337e-05, + "loss": 1.715, + "step": 19294 + }, + { + "epoch": 5.922344996930632, + "grad_norm": 0.24849168956279755, + "learning_rate": 3.762887377628533e-05, + "loss": 1.6807, + "step": 19295 + }, + { + "epoch": 5.922651933701657, + "grad_norm": 0.23573634028434753, + "learning_rate": 3.7624057823795696e-05, + "loss": 1.7363, + "step": 19296 + }, + { + "epoch": 5.922958870472683, + "grad_norm": 0.24384267628192902, + "learning_rate": 3.761924199361235e-05, + "loss": 
1.726, + "step": 19297 + }, + { + "epoch": 5.923265807243708, + "grad_norm": 0.2589210271835327, + "learning_rate": 3.761442628578294e-05, + "loss": 1.7771, + "step": 19298 + }, + { + "epoch": 5.9235727440147325, + "grad_norm": 0.23527951538562775, + "learning_rate": 3.760961070035501e-05, + "loss": 1.6561, + "step": 19299 + }, + { + "epoch": 5.923879680785758, + "grad_norm": 0.20286870002746582, + "learning_rate": 3.7604795237376175e-05, + "loss": 1.7464, + "step": 19300 + }, + { + "epoch": 5.924186617556783, + "grad_norm": 0.22705033421516418, + "learning_rate": 3.759997989689401e-05, + "loss": 1.7814, + "step": 19301 + }, + { + "epoch": 5.9244935543278086, + "grad_norm": 0.21780981123447418, + "learning_rate": 3.7595164678956135e-05, + "loss": 1.7601, + "step": 19302 + }, + { + "epoch": 5.924800491098834, + "grad_norm": 0.2030021697282791, + "learning_rate": 3.759034958361009e-05, + "loss": 1.7222, + "step": 19303 + }, + { + "epoch": 5.925107427869859, + "grad_norm": 0.22956500947475433, + "learning_rate": 3.758553461090351e-05, + "loss": 1.674, + "step": 19304 + }, + { + "epoch": 5.925414364640884, + "grad_norm": 0.2368287444114685, + "learning_rate": 3.758071976088392e-05, + "loss": 1.7483, + "step": 19305 + }, + { + "epoch": 5.925721301411909, + "grad_norm": 0.22852632403373718, + "learning_rate": 3.757590503359896e-05, + "loss": 1.7561, + "step": 19306 + }, + { + "epoch": 5.926028238182934, + "grad_norm": 0.21657361090183258, + "learning_rate": 3.757109042909617e-05, + "loss": 1.7814, + "step": 19307 + }, + { + "epoch": 5.92633517495396, + "grad_norm": 0.21996551752090454, + "learning_rate": 3.756627594742317e-05, + "loss": 1.732, + "step": 19308 + }, + { + "epoch": 5.926642111724985, + "grad_norm": 0.23319712281227112, + "learning_rate": 3.75614615886275e-05, + "loss": 1.6807, + "step": 19309 + }, + { + "epoch": 5.9269490484960095, + "grad_norm": 0.17926698923110962, + "learning_rate": 3.755664735275677e-05, + "loss": 1.6925, + "step": 19310 + }, + { + 
"epoch": 5.927255985267035, + "grad_norm": 0.18986931443214417, + "learning_rate": 3.755183323985855e-05, + "loss": 1.7002, + "step": 19311 + }, + { + "epoch": 5.92756292203806, + "grad_norm": 0.18753086030483246, + "learning_rate": 3.7547019249980385e-05, + "loss": 1.695, + "step": 19312 + }, + { + "epoch": 5.9278698588090855, + "grad_norm": 0.21354973316192627, + "learning_rate": 3.7542205383169904e-05, + "loss": 1.6629, + "step": 19313 + }, + { + "epoch": 5.928176795580111, + "grad_norm": 0.19713245332241058, + "learning_rate": 3.753739163947463e-05, + "loss": 1.707, + "step": 19314 + }, + { + "epoch": 5.928483732351136, + "grad_norm": 0.2122458517551422, + "learning_rate": 3.753257801894217e-05, + "loss": 1.7309, + "step": 19315 + }, + { + "epoch": 5.928790669122161, + "grad_norm": 0.20360666513442993, + "learning_rate": 3.7527764521620065e-05, + "loss": 1.6861, + "step": 19316 + }, + { + "epoch": 5.929097605893186, + "grad_norm": 0.2652932405471802, + "learning_rate": 3.752295114755592e-05, + "loss": 1.7662, + "step": 19317 + }, + { + "epoch": 5.929404542664211, + "grad_norm": 0.18292152881622314, + "learning_rate": 3.751813789679726e-05, + "loss": 1.6691, + "step": 19318 + }, + { + "epoch": 5.929711479435237, + "grad_norm": 0.25630465149879456, + "learning_rate": 3.75133247693917e-05, + "loss": 1.7647, + "step": 19319 + }, + { + "epoch": 5.930018416206261, + "grad_norm": 0.2463291883468628, + "learning_rate": 3.750851176538677e-05, + "loss": 1.7252, + "step": 19320 + }, + { + "epoch": 5.930325352977286, + "grad_norm": 0.19977931678295135, + "learning_rate": 3.750369888483007e-05, + "loss": 1.7694, + "step": 19321 + }, + { + "epoch": 5.930632289748312, + "grad_norm": 0.19523118436336517, + "learning_rate": 3.7498886127769116e-05, + "loss": 1.7095, + "step": 19322 + }, + { + "epoch": 5.930939226519337, + "grad_norm": 0.19273912906646729, + "learning_rate": 3.749407349425151e-05, + "loss": 1.7009, + "step": 19323 + }, + { + "epoch": 5.931246163290362, + 
"grad_norm": 0.2419402152299881, + "learning_rate": 3.748926098432479e-05, + "loss": 1.7167, + "step": 19324 + }, + { + "epoch": 5.931553100061388, + "grad_norm": 0.22429771721363068, + "learning_rate": 3.7484448598036534e-05, + "loss": 1.6957, + "step": 19325 + }, + { + "epoch": 5.931860036832412, + "grad_norm": 0.23211807012557983, + "learning_rate": 3.747963633543429e-05, + "loss": 1.767, + "step": 19326 + }, + { + "epoch": 5.9321669736034375, + "grad_norm": 0.23204533755779266, + "learning_rate": 3.7474824196565625e-05, + "loss": 1.7405, + "step": 19327 + }, + { + "epoch": 5.932473910374463, + "grad_norm": 0.24068887531757355, + "learning_rate": 3.747001218147809e-05, + "loss": 1.7539, + "step": 19328 + }, + { + "epoch": 5.932780847145488, + "grad_norm": 0.18140049278736115, + "learning_rate": 3.746520029021922e-05, + "loss": 1.6956, + "step": 19329 + }, + { + "epoch": 5.9330877839165135, + "grad_norm": 0.28421929478645325, + "learning_rate": 3.746038852283661e-05, + "loss": 1.8539, + "step": 19330 + }, + { + "epoch": 5.933394720687538, + "grad_norm": 0.21984805166721344, + "learning_rate": 3.745557687937777e-05, + "loss": 1.7469, + "step": 19331 + }, + { + "epoch": 5.933701657458563, + "grad_norm": 0.2500358819961548, + "learning_rate": 3.7450765359890294e-05, + "loss": 1.7184, + "step": 19332 + }, + { + "epoch": 5.934008594229589, + "grad_norm": 0.2608816623687744, + "learning_rate": 3.744595396442169e-05, + "loss": 1.6825, + "step": 19333 + }, + { + "epoch": 5.934315531000614, + "grad_norm": 0.20359274744987488, + "learning_rate": 3.7441142693019526e-05, + "loss": 1.7535, + "step": 19334 + }, + { + "epoch": 5.934622467771639, + "grad_norm": 0.24795760214328766, + "learning_rate": 3.743633154573135e-05, + "loss": 1.7829, + "step": 19335 + }, + { + "epoch": 5.934929404542665, + "grad_norm": 0.20762503147125244, + "learning_rate": 3.7431520522604736e-05, + "loss": 1.7657, + "step": 19336 + }, + { + "epoch": 5.935236341313689, + "grad_norm": 0.24349527060985565, 
+ "learning_rate": 3.7426709623687174e-05, + "loss": 1.7037, + "step": 19337 + }, + { + "epoch": 5.935543278084714, + "grad_norm": 0.2138780951499939, + "learning_rate": 3.742189884902626e-05, + "loss": 1.7302, + "step": 19338 + }, + { + "epoch": 5.93585021485574, + "grad_norm": 0.24776574969291687, + "learning_rate": 3.741708819866949e-05, + "loss": 1.7293, + "step": 19339 + }, + { + "epoch": 5.936157151626765, + "grad_norm": 0.297888845205307, + "learning_rate": 3.7412277672664444e-05, + "loss": 1.8341, + "step": 19340 + }, + { + "epoch": 5.93646408839779, + "grad_norm": 0.2811104953289032, + "learning_rate": 3.740746727105864e-05, + "loss": 1.7188, + "step": 19341 + }, + { + "epoch": 5.936771025168815, + "grad_norm": 0.37908127903938293, + "learning_rate": 3.740265699389964e-05, + "loss": 1.765, + "step": 19342 + }, + { + "epoch": 5.93707796193984, + "grad_norm": 0.24403691291809082, + "learning_rate": 3.739784684123495e-05, + "loss": 1.6897, + "step": 19343 + }, + { + "epoch": 5.9373848987108655, + "grad_norm": 0.2393181174993515, + "learning_rate": 3.7393036813112135e-05, + "loss": 1.6843, + "step": 19344 + }, + { + "epoch": 5.937691835481891, + "grad_norm": 0.2927580177783966, + "learning_rate": 3.738822690957872e-05, + "loss": 1.6946, + "step": 19345 + }, + { + "epoch": 5.937998772252916, + "grad_norm": 0.23423373699188232, + "learning_rate": 3.738341713068223e-05, + "loss": 1.7409, + "step": 19346 + }, + { + "epoch": 5.9383057090239415, + "grad_norm": 0.2544272840023041, + "learning_rate": 3.7378607476470216e-05, + "loss": 1.698, + "step": 19347 + }, + { + "epoch": 5.938612645794966, + "grad_norm": 0.2120404839515686, + "learning_rate": 3.737379794699019e-05, + "loss": 1.7412, + "step": 19348 + }, + { + "epoch": 5.938919582565991, + "grad_norm": 0.2076033353805542, + "learning_rate": 3.736898854228971e-05, + "loss": 1.752, + "step": 19349 + }, + { + "epoch": 5.939226519337017, + "grad_norm": 0.20122376084327698, + "learning_rate": 3.736417926241627e-05, + 
"loss": 1.6741, + "step": 19350 + }, + { + "epoch": 5.939533456108042, + "grad_norm": 0.1856858730316162, + "learning_rate": 3.735937010741742e-05, + "loss": 1.6959, + "step": 19351 + }, + { + "epoch": 5.939840392879067, + "grad_norm": 0.22192558646202087, + "learning_rate": 3.7354561077340684e-05, + "loss": 1.7597, + "step": 19352 + }, + { + "epoch": 5.940147329650092, + "grad_norm": 0.2653545141220093, + "learning_rate": 3.73497521722336e-05, + "loss": 1.7324, + "step": 19353 + }, + { + "epoch": 5.940454266421117, + "grad_norm": 0.1975676715373993, + "learning_rate": 3.734494339214366e-05, + "loss": 1.6852, + "step": 19354 + }, + { + "epoch": 5.940761203192142, + "grad_norm": 0.26949796080589294, + "learning_rate": 3.734013473711843e-05, + "loss": 1.7695, + "step": 19355 + }, + { + "epoch": 5.941068139963168, + "grad_norm": 0.2272176742553711, + "learning_rate": 3.733532620720539e-05, + "loss": 1.745, + "step": 19356 + }, + { + "epoch": 5.941375076734193, + "grad_norm": 0.25740066170692444, + "learning_rate": 3.733051780245208e-05, + "loss": 1.7701, + "step": 19357 + }, + { + "epoch": 5.941682013505218, + "grad_norm": 0.1910635381937027, + "learning_rate": 3.732570952290602e-05, + "loss": 1.7276, + "step": 19358 + }, + { + "epoch": 5.941988950276243, + "grad_norm": 0.24896447360515594, + "learning_rate": 3.732090136861474e-05, + "loss": 1.7717, + "step": 19359 + }, + { + "epoch": 5.942295887047268, + "grad_norm": 0.20696721971035004, + "learning_rate": 3.731609333962572e-05, + "loss": 1.7053, + "step": 19360 + }, + { + "epoch": 5.9426028238182935, + "grad_norm": 0.18822510540485382, + "learning_rate": 3.731128543598653e-05, + "loss": 1.6869, + "step": 19361 + }, + { + "epoch": 5.942909760589319, + "grad_norm": 0.20757299661636353, + "learning_rate": 3.730647765774464e-05, + "loss": 1.7214, + "step": 19362 + }, + { + "epoch": 5.943216697360343, + "grad_norm": 0.21238471567630768, + "learning_rate": 3.7301670004947574e-05, + "loss": 1.6953, + "step": 19363 + }, + { 
+ "epoch": 5.943523634131369, + "grad_norm": 0.19326119124889374, + "learning_rate": 3.729686247764286e-05, + "loss": 1.7224, + "step": 19364 + }, + { + "epoch": 5.943830570902394, + "grad_norm": 0.17631326615810394, + "learning_rate": 3.729205507587798e-05, + "loss": 1.6471, + "step": 19365 + }, + { + "epoch": 5.944137507673419, + "grad_norm": 0.1741493195295334, + "learning_rate": 3.728724779970048e-05, + "loss": 1.7169, + "step": 19366 + }, + { + "epoch": 5.944444444444445, + "grad_norm": 0.18203428387641907, + "learning_rate": 3.728244064915782e-05, + "loss": 1.7301, + "step": 19367 + }, + { + "epoch": 5.94475138121547, + "grad_norm": 0.2063162475824356, + "learning_rate": 3.727763362429756e-05, + "loss": 1.7274, + "step": 19368 + }, + { + "epoch": 5.945058317986494, + "grad_norm": 0.17239537835121155, + "learning_rate": 3.7272826725167164e-05, + "loss": 1.7194, + "step": 19369 + }, + { + "epoch": 5.94536525475752, + "grad_norm": 0.1910972148180008, + "learning_rate": 3.726801995181418e-05, + "loss": 1.7017, + "step": 19370 + }, + { + "epoch": 5.945672191528545, + "grad_norm": 0.18822111189365387, + "learning_rate": 3.726321330428606e-05, + "loss": 1.723, + "step": 19371 + }, + { + "epoch": 5.94597912829957, + "grad_norm": 0.19680333137512207, + "learning_rate": 3.725840678263035e-05, + "loss": 1.685, + "step": 19372 + }, + { + "epoch": 5.946286065070596, + "grad_norm": 0.19016215205192566, + "learning_rate": 3.725360038689451e-05, + "loss": 1.7148, + "step": 19373 + }, + { + "epoch": 5.94659300184162, + "grad_norm": 0.1992037147283554, + "learning_rate": 3.7248794117126075e-05, + "loss": 1.7278, + "step": 19374 + }, + { + "epoch": 5.9468999386126455, + "grad_norm": 0.1892910748720169, + "learning_rate": 3.724398797337252e-05, + "loss": 1.7093, + "step": 19375 + }, + { + "epoch": 5.947206875383671, + "grad_norm": 0.23379561305046082, + "learning_rate": 3.723918195568137e-05, + "loss": 1.768, + "step": 19376 + }, + { + "epoch": 5.947513812154696, + "grad_norm": 
0.1986081600189209, + "learning_rate": 3.7234376064100104e-05, + "loss": 1.719, + "step": 19377 + }, + { + "epoch": 5.9478207489257215, + "grad_norm": 0.20901642739772797, + "learning_rate": 3.7229570298676195e-05, + "loss": 1.7066, + "step": 19378 + }, + { + "epoch": 5.948127685696747, + "grad_norm": 0.2102847546339035, + "learning_rate": 3.722476465945718e-05, + "loss": 1.7354, + "step": 19379 + }, + { + "epoch": 5.948434622467771, + "grad_norm": 0.1857316792011261, + "learning_rate": 3.72199591464905e-05, + "loss": 1.7159, + "step": 19380 + }, + { + "epoch": 5.948741559238797, + "grad_norm": 0.3045661151409149, + "learning_rate": 3.721515375982371e-05, + "loss": 1.8782, + "step": 19381 + }, + { + "epoch": 5.949048496009822, + "grad_norm": 0.24114711582660675, + "learning_rate": 3.7210348499504236e-05, + "loss": 1.6819, + "step": 19382 + }, + { + "epoch": 5.949355432780847, + "grad_norm": 0.20186996459960938, + "learning_rate": 3.720554336557961e-05, + "loss": 1.8028, + "step": 19383 + }, + { + "epoch": 5.949662369551873, + "grad_norm": 0.25385335087776184, + "learning_rate": 3.7200738358097295e-05, + "loss": 1.7278, + "step": 19384 + }, + { + "epoch": 5.949969306322897, + "grad_norm": 0.23390468955039978, + "learning_rate": 3.719593347710478e-05, + "loss": 1.7775, + "step": 19385 + }, + { + "epoch": 5.9502762430939224, + "grad_norm": 0.22577936947345734, + "learning_rate": 3.719112872264956e-05, + "loss": 1.7567, + "step": 19386 + }, + { + "epoch": 5.950583179864948, + "grad_norm": 0.2540932297706604, + "learning_rate": 3.718632409477912e-05, + "loss": 1.6749, + "step": 19387 + }, + { + "epoch": 5.950890116635973, + "grad_norm": 0.1994820535182953, + "learning_rate": 3.718151959354093e-05, + "loss": 1.6809, + "step": 19388 + }, + { + "epoch": 5.9511970534069984, + "grad_norm": 0.27669432759284973, + "learning_rate": 3.717671521898249e-05, + "loss": 1.7633, + "step": 19389 + }, + { + "epoch": 5.951503990178024, + "grad_norm": 0.2533062994480133, + 
"learning_rate": 3.717191097115125e-05, + "loss": 1.7536, + "step": 19390 + }, + { + "epoch": 5.951810926949048, + "grad_norm": 0.22249148786067963, + "learning_rate": 3.716710685009471e-05, + "loss": 1.7325, + "step": 19391 + }, + { + "epoch": 5.952117863720074, + "grad_norm": 0.3085922598838806, + "learning_rate": 3.716230285586033e-05, + "loss": 1.7046, + "step": 19392 + }, + { + "epoch": 5.952424800491099, + "grad_norm": 0.2591574192047119, + "learning_rate": 3.715749898849562e-05, + "loss": 1.7165, + "step": 19393 + }, + { + "epoch": 5.952731737262124, + "grad_norm": 0.24586348235607147, + "learning_rate": 3.715269524804803e-05, + "loss": 1.749, + "step": 19394 + }, + { + "epoch": 5.953038674033149, + "grad_norm": 0.3424640893936157, + "learning_rate": 3.714789163456502e-05, + "loss": 1.7143, + "step": 19395 + }, + { + "epoch": 5.953345610804174, + "grad_norm": 0.24856910109519958, + "learning_rate": 3.714308814809408e-05, + "loss": 1.868, + "step": 19396 + }, + { + "epoch": 5.953652547575199, + "grad_norm": 0.2758113145828247, + "learning_rate": 3.7138284788682676e-05, + "loss": 1.6722, + "step": 19397 + }, + { + "epoch": 5.953959484346225, + "grad_norm": 0.25981786847114563, + "learning_rate": 3.71334815563783e-05, + "loss": 1.764, + "step": 19398 + }, + { + "epoch": 5.95426642111725, + "grad_norm": 0.27885568141937256, + "learning_rate": 3.7128678451228385e-05, + "loss": 1.7422, + "step": 19399 + }, + { + "epoch": 5.954573357888275, + "grad_norm": 0.2909421920776367, + "learning_rate": 3.712387547328042e-05, + "loss": 1.7862, + "step": 19400 + }, + { + "epoch": 5.9548802946593, + "grad_norm": 0.2288074642419815, + "learning_rate": 3.711907262258185e-05, + "loss": 1.7054, + "step": 19401 + }, + { + "epoch": 5.955187231430325, + "grad_norm": 0.2986883819103241, + "learning_rate": 3.711426989918017e-05, + "loss": 1.7555, + "step": 19402 + }, + { + "epoch": 5.9554941682013505, + "grad_norm": 0.23201194405555725, + "learning_rate": 3.710946730312281e-05, + 
"loss": 1.8186, + "step": 19403 + }, + { + "epoch": 5.955801104972376, + "grad_norm": 0.2609403431415558, + "learning_rate": 3.710466483445728e-05, + "loss": 1.7743, + "step": 19404 + }, + { + "epoch": 5.956108041743401, + "grad_norm": 0.31131741404533386, + "learning_rate": 3.709986249323098e-05, + "loss": 1.7938, + "step": 19405 + }, + { + "epoch": 5.956414978514426, + "grad_norm": 0.20544753968715668, + "learning_rate": 3.7095060279491424e-05, + "loss": 1.7278, + "step": 19406 + }, + { + "epoch": 5.956721915285451, + "grad_norm": 0.3063479959964752, + "learning_rate": 3.709025819328602e-05, + "loss": 1.7544, + "step": 19407 + }, + { + "epoch": 5.957028852056476, + "grad_norm": 0.34868693351745605, + "learning_rate": 3.708545623466227e-05, + "loss": 1.7536, + "step": 19408 + }, + { + "epoch": 5.957335788827502, + "grad_norm": 0.20847822725772858, + "learning_rate": 3.70806544036676e-05, + "loss": 1.7003, + "step": 19409 + }, + { + "epoch": 5.957642725598527, + "grad_norm": 0.3250095844268799, + "learning_rate": 3.707585270034949e-05, + "loss": 1.6815, + "step": 19410 + }, + { + "epoch": 5.957949662369552, + "grad_norm": 0.24854284524917603, + "learning_rate": 3.707105112475539e-05, + "loss": 1.7665, + "step": 19411 + }, + { + "epoch": 5.958256599140577, + "grad_norm": 0.2921455502510071, + "learning_rate": 3.706624967693271e-05, + "loss": 1.7039, + "step": 19412 + }, + { + "epoch": 5.958563535911602, + "grad_norm": 0.2659071385860443, + "learning_rate": 3.706144835692894e-05, + "loss": 1.7641, + "step": 19413 + }, + { + "epoch": 5.958870472682627, + "grad_norm": 0.30329519510269165, + "learning_rate": 3.7056647164791516e-05, + "loss": 1.7962, + "step": 19414 + }, + { + "epoch": 5.959177409453653, + "grad_norm": 0.4023756682872772, + "learning_rate": 3.7051846100567906e-05, + "loss": 1.7624, + "step": 19415 + }, + { + "epoch": 5.959484346224678, + "grad_norm": 0.24528828263282776, + "learning_rate": 3.704704516430553e-05, + "loss": 1.8156, + "step": 19416 + }, + { 
+ "epoch": 5.9597912829957025, + "grad_norm": 0.46833130717277527, + "learning_rate": 3.704224435605186e-05, + "loss": 1.798, + "step": 19417 + }, + { + "epoch": 5.960098219766728, + "grad_norm": 0.26952674984931946, + "learning_rate": 3.70374436758543e-05, + "loss": 1.743, + "step": 19418 + }, + { + "epoch": 5.960405156537753, + "grad_norm": 0.3126155734062195, + "learning_rate": 3.703264312376034e-05, + "loss": 1.8003, + "step": 19419 + }, + { + "epoch": 5.9607120933087785, + "grad_norm": 0.2833348512649536, + "learning_rate": 3.702784269981738e-05, + "loss": 1.7524, + "step": 19420 + }, + { + "epoch": 5.961019030079804, + "grad_norm": 0.25425654649734497, + "learning_rate": 3.7023042404072916e-05, + "loss": 1.7241, + "step": 19421 + }, + { + "epoch": 5.961325966850829, + "grad_norm": 0.29460933804512024, + "learning_rate": 3.701824223657433e-05, + "loss": 1.676, + "step": 19422 + }, + { + "epoch": 5.961632903621854, + "grad_norm": 0.21040670573711395, + "learning_rate": 3.7013442197369094e-05, + "loss": 1.71, + "step": 19423 + }, + { + "epoch": 5.961939840392879, + "grad_norm": 0.3200007379055023, + "learning_rate": 3.7008642286504624e-05, + "loss": 1.7108, + "step": 19424 + }, + { + "epoch": 5.962246777163904, + "grad_norm": 0.20397430658340454, + "learning_rate": 3.7003842504028366e-05, + "loss": 1.7472, + "step": 19425 + }, + { + "epoch": 5.96255371393493, + "grad_norm": 0.24811354279518127, + "learning_rate": 3.699904284998776e-05, + "loss": 1.7116, + "step": 19426 + }, + { + "epoch": 5.962860650705955, + "grad_norm": 0.20980580151081085, + "learning_rate": 3.699424332443023e-05, + "loss": 1.786, + "step": 19427 + }, + { + "epoch": 5.963167587476979, + "grad_norm": 0.1967400163412094, + "learning_rate": 3.698944392740322e-05, + "loss": 1.7141, + "step": 19428 + }, + { + "epoch": 5.963474524248005, + "grad_norm": 0.21907822787761688, + "learning_rate": 3.698464465895414e-05, + "loss": 1.6983, + "step": 19429 + }, + { + "epoch": 5.96378146101903, + 
"grad_norm": 0.19938960671424866, + "learning_rate": 3.697984551913043e-05, + "loss": 1.6811, + "step": 19430 + }, + { + "epoch": 5.964088397790055, + "grad_norm": 0.22280220687389374, + "learning_rate": 3.6975046507979506e-05, + "loss": 1.6838, + "step": 19431 + }, + { + "epoch": 5.964395334561081, + "grad_norm": 0.2530672550201416, + "learning_rate": 3.697024762554883e-05, + "loss": 1.8116, + "step": 19432 + }, + { + "epoch": 5.964702271332106, + "grad_norm": 0.21853135526180267, + "learning_rate": 3.696544887188579e-05, + "loss": 1.692, + "step": 19433 + }, + { + "epoch": 5.9650092081031305, + "grad_norm": 0.18738535046577454, + "learning_rate": 3.696065024703783e-05, + "loss": 1.6971, + "step": 19434 + }, + { + "epoch": 5.965316144874156, + "grad_norm": 0.21199190616607666, + "learning_rate": 3.695585175105236e-05, + "loss": 1.7526, + "step": 19435 + }, + { + "epoch": 5.965623081645181, + "grad_norm": 0.22184251248836517, + "learning_rate": 3.695105338397681e-05, + "loss": 1.8075, + "step": 19436 + }, + { + "epoch": 5.9659300184162065, + "grad_norm": 0.20191644132137299, + "learning_rate": 3.6946255145858605e-05, + "loss": 1.7427, + "step": 19437 + }, + { + "epoch": 5.966236955187231, + "grad_norm": 0.2113640457391739, + "learning_rate": 3.694145703674515e-05, + "loss": 1.7556, + "step": 19438 + }, + { + "epoch": 5.966543891958256, + "grad_norm": 0.21834735572338104, + "learning_rate": 3.693665905668387e-05, + "loss": 1.7673, + "step": 19439 + }, + { + "epoch": 5.966850828729282, + "grad_norm": 0.2260274887084961, + "learning_rate": 3.6931861205722197e-05, + "loss": 1.8168, + "step": 19440 + }, + { + "epoch": 5.967157765500307, + "grad_norm": 0.24090524017810822, + "learning_rate": 3.692706348390751e-05, + "loss": 1.821, + "step": 19441 + }, + { + "epoch": 5.967464702271332, + "grad_norm": 0.27469882369041443, + "learning_rate": 3.6922265891287256e-05, + "loss": 1.7114, + "step": 19442 + }, + { + "epoch": 5.967771639042358, + "grad_norm": 0.23479801416397095, + 
"learning_rate": 3.6917468427908833e-05, + "loss": 1.7334, + "step": 19443 + }, + { + "epoch": 5.968078575813382, + "grad_norm": 0.21109704673290253, + "learning_rate": 3.6912671093819663e-05, + "loss": 1.7047, + "step": 19444 + }, + { + "epoch": 5.968385512584407, + "grad_norm": 0.21141986548900604, + "learning_rate": 3.690787388906715e-05, + "loss": 1.6868, + "step": 19445 + }, + { + "epoch": 5.968692449355433, + "grad_norm": 0.21836397051811218, + "learning_rate": 3.690307681369868e-05, + "loss": 1.6923, + "step": 19446 + }, + { + "epoch": 5.968999386126458, + "grad_norm": 0.21733662486076355, + "learning_rate": 3.6898279867761695e-05, + "loss": 1.7699, + "step": 19447 + }, + { + "epoch": 5.969306322897483, + "grad_norm": 0.19220437109470367, + "learning_rate": 3.689348305130359e-05, + "loss": 1.7002, + "step": 19448 + }, + { + "epoch": 5.969613259668508, + "grad_norm": 0.22644726932048798, + "learning_rate": 3.688868636437176e-05, + "loss": 1.7024, + "step": 19449 + }, + { + "epoch": 5.969920196439533, + "grad_norm": 0.1832779198884964, + "learning_rate": 3.688388980701361e-05, + "loss": 1.699, + "step": 19450 + }, + { + "epoch": 5.9702271332105585, + "grad_norm": 0.20793284475803375, + "learning_rate": 3.687909337927658e-05, + "loss": 1.7557, + "step": 19451 + }, + { + "epoch": 5.970534069981584, + "grad_norm": 0.19485175609588623, + "learning_rate": 3.6874297081207995e-05, + "loss": 1.7641, + "step": 19452 + }, + { + "epoch": 5.970841006752609, + "grad_norm": 0.20980949699878693, + "learning_rate": 3.686950091285534e-05, + "loss": 1.7542, + "step": 19453 + }, + { + "epoch": 5.9711479435236345, + "grad_norm": 0.24902600049972534, + "learning_rate": 3.686470487426594e-05, + "loss": 1.7342, + "step": 19454 + }, + { + "epoch": 5.971454880294659, + "grad_norm": 0.20191124081611633, + "learning_rate": 3.685990896548724e-05, + "loss": 1.6844, + "step": 19455 + }, + { + "epoch": 5.971761817065684, + "grad_norm": 0.23217806220054626, + "learning_rate": 
3.685511318656662e-05, + "loss": 1.7054, + "step": 19456 + }, + { + "epoch": 5.97206875383671, + "grad_norm": 0.23383383452892303, + "learning_rate": 3.6850317537551484e-05, + "loss": 1.6903, + "step": 19457 + }, + { + "epoch": 5.972375690607735, + "grad_norm": 0.2147756665945053, + "learning_rate": 3.6845522018489196e-05, + "loss": 1.736, + "step": 19458 + }, + { + "epoch": 5.97268262737876, + "grad_norm": 0.23864400386810303, + "learning_rate": 3.68407266294272e-05, + "loss": 1.7483, + "step": 19459 + }, + { + "epoch": 5.972989564149785, + "grad_norm": 0.18702742457389832, + "learning_rate": 3.6835931370412836e-05, + "loss": 1.6874, + "step": 19460 + }, + { + "epoch": 5.97329650092081, + "grad_norm": 0.2167401760816574, + "learning_rate": 3.683113624149351e-05, + "loss": 1.652, + "step": 19461 + }, + { + "epoch": 5.973603437691835, + "grad_norm": 0.17105139791965485, + "learning_rate": 3.6826341242716636e-05, + "loss": 1.7029, + "step": 19462 + }, + { + "epoch": 5.973910374462861, + "grad_norm": 0.2189798206090927, + "learning_rate": 3.682154637412956e-05, + "loss": 1.7203, + "step": 19463 + }, + { + "epoch": 5.974217311233886, + "grad_norm": 0.17864444851875305, + "learning_rate": 3.68167516357797e-05, + "loss": 1.7176, + "step": 19464 + }, + { + "epoch": 5.974524248004911, + "grad_norm": 0.22356030344963074, + "learning_rate": 3.681195702771442e-05, + "loss": 1.7492, + "step": 19465 + }, + { + "epoch": 5.974831184775936, + "grad_norm": 0.19020728766918182, + "learning_rate": 3.68071625499811e-05, + "loss": 1.6925, + "step": 19466 + }, + { + "epoch": 5.975138121546961, + "grad_norm": 0.19092151522636414, + "learning_rate": 3.680236820262714e-05, + "loss": 1.7253, + "step": 19467 + }, + { + "epoch": 5.975445058317987, + "grad_norm": 0.20842085778713226, + "learning_rate": 3.6797573985699926e-05, + "loss": 1.7251, + "step": 19468 + }, + { + "epoch": 5.975751995089012, + "grad_norm": 0.2245844155550003, + "learning_rate": 3.6792779899246796e-05, + "loss": 1.7351, + 
"step": 19469 + }, + { + "epoch": 5.976058931860036, + "grad_norm": 0.18867328763008118, + "learning_rate": 3.678798594331519e-05, + "loss": 1.6646, + "step": 19470 + }, + { + "epoch": 5.976365868631062, + "grad_norm": 0.2892500162124634, + "learning_rate": 3.678319211795242e-05, + "loss": 1.7146, + "step": 19471 + }, + { + "epoch": 5.976672805402087, + "grad_norm": 0.22490514814853668, + "learning_rate": 3.677839842320591e-05, + "loss": 1.7147, + "step": 19472 + }, + { + "epoch": 5.976979742173112, + "grad_norm": 0.296724796295166, + "learning_rate": 3.677360485912301e-05, + "loss": 1.7714, + "step": 19473 + }, + { + "epoch": 5.977286678944138, + "grad_norm": 0.2784444987773895, + "learning_rate": 3.676881142575111e-05, + "loss": 1.7198, + "step": 19474 + }, + { + "epoch": 5.977593615715163, + "grad_norm": 0.20270293951034546, + "learning_rate": 3.676401812313755e-05, + "loss": 1.7336, + "step": 19475 + }, + { + "epoch": 5.9779005524861875, + "grad_norm": 0.23352907598018646, + "learning_rate": 3.6759224951329745e-05, + "loss": 1.7428, + "step": 19476 + }, + { + "epoch": 5.978207489257213, + "grad_norm": 0.1892426460981369, + "learning_rate": 3.675443191037502e-05, + "loss": 1.6636, + "step": 19477 + }, + { + "epoch": 5.978514426028238, + "grad_norm": 0.22216783463954926, + "learning_rate": 3.6749639000320766e-05, + "loss": 1.7446, + "step": 19478 + }, + { + "epoch": 5.9788213627992635, + "grad_norm": 0.19465389847755432, + "learning_rate": 3.6744846221214364e-05, + "loss": 1.7403, + "step": 19479 + }, + { + "epoch": 5.979128299570289, + "grad_norm": 0.1918177455663681, + "learning_rate": 3.674005357310314e-05, + "loss": 1.6974, + "step": 19480 + }, + { + "epoch": 5.979435236341313, + "grad_norm": 0.19065791368484497, + "learning_rate": 3.673526105603449e-05, + "loss": 1.7299, + "step": 19481 + }, + { + "epoch": 5.979742173112339, + "grad_norm": 0.24036844074726105, + "learning_rate": 3.673046867005575e-05, + "loss": 1.7441, + "step": 19482 + }, + { + "epoch": 
5.980049109883364, + "grad_norm": 0.22352568805217743, + "learning_rate": 3.6725676415214305e-05, + "loss": 1.7556, + "step": 19483 + }, + { + "epoch": 5.980356046654389, + "grad_norm": 0.2492935210466385, + "learning_rate": 3.67208842915575e-05, + "loss": 1.6833, + "step": 19484 + }, + { + "epoch": 5.980662983425415, + "grad_norm": 0.2554415762424469, + "learning_rate": 3.671609229913272e-05, + "loss": 1.7426, + "step": 19485 + }, + { + "epoch": 5.98096992019644, + "grad_norm": 0.24076475203037262, + "learning_rate": 3.671130043798728e-05, + "loss": 1.7362, + "step": 19486 + }, + { + "epoch": 5.981276856967464, + "grad_norm": 0.24297118186950684, + "learning_rate": 3.670650870816858e-05, + "loss": 1.7493, + "step": 19487 + }, + { + "epoch": 5.98158379373849, + "grad_norm": 0.19533030688762665, + "learning_rate": 3.6701717109723924e-05, + "loss": 1.7397, + "step": 19488 + }, + { + "epoch": 5.981890730509515, + "grad_norm": 0.24731193482875824, + "learning_rate": 3.669692564270071e-05, + "loss": 1.7483, + "step": 19489 + }, + { + "epoch": 5.98219766728054, + "grad_norm": 0.23274390399456024, + "learning_rate": 3.669213430714626e-05, + "loss": 1.7677, + "step": 19490 + }, + { + "epoch": 5.982504604051566, + "grad_norm": 0.180234894156456, + "learning_rate": 3.668734310310796e-05, + "loss": 1.7065, + "step": 19491 + }, + { + "epoch": 5.98281154082259, + "grad_norm": 0.19045281410217285, + "learning_rate": 3.6682552030633125e-05, + "loss": 1.7089, + "step": 19492 + }, + { + "epoch": 5.9831184775936155, + "grad_norm": 0.17261318862438202, + "learning_rate": 3.667776108976914e-05, + "loss": 1.7227, + "step": 19493 + }, + { + "epoch": 5.983425414364641, + "grad_norm": 0.2156316339969635, + "learning_rate": 3.667297028056329e-05, + "loss": 1.7025, + "step": 19494 + }, + { + "epoch": 5.983732351135666, + "grad_norm": 0.22288112342357635, + "learning_rate": 3.666817960306298e-05, + "loss": 1.7123, + "step": 19495 + }, + { + "epoch": 5.9840392879066915, + "grad_norm": 
0.21983082592487335, + "learning_rate": 3.6663389057315543e-05, + "loss": 1.7688, + "step": 19496 + }, + { + "epoch": 5.984346224677717, + "grad_norm": 0.1804746687412262, + "learning_rate": 3.665859864336829e-05, + "loss": 1.759, + "step": 19497 + }, + { + "epoch": 5.984653161448741, + "grad_norm": 0.22762230038642883, + "learning_rate": 3.6653808361268605e-05, + "loss": 1.8128, + "step": 19498 + }, + { + "epoch": 5.984960098219767, + "grad_norm": 0.21779340505599976, + "learning_rate": 3.664901821106379e-05, + "loss": 1.7316, + "step": 19499 + }, + { + "epoch": 5.985267034990792, + "grad_norm": 0.18899449706077576, + "learning_rate": 3.664422819280121e-05, + "loss": 1.7535, + "step": 19500 + }, + { + "epoch": 5.985573971761817, + "grad_norm": 0.22799427807331085, + "learning_rate": 3.663943830652819e-05, + "loss": 1.7626, + "step": 19501 + }, + { + "epoch": 5.985880908532843, + "grad_norm": 0.19936929643154144, + "learning_rate": 3.6634648552292086e-05, + "loss": 1.6887, + "step": 19502 + }, + { + "epoch": 5.986187845303867, + "grad_norm": 0.22482532262802124, + "learning_rate": 3.6629858930140206e-05, + "loss": 1.6867, + "step": 19503 + }, + { + "epoch": 5.986494782074892, + "grad_norm": 0.23543842136859894, + "learning_rate": 3.662506944011991e-05, + "loss": 1.7715, + "step": 19504 + }, + { + "epoch": 5.986801718845918, + "grad_norm": 0.230603888630867, + "learning_rate": 3.6620280082278495e-05, + "loss": 1.7514, + "step": 19505 + }, + { + "epoch": 5.987108655616943, + "grad_norm": 0.26767033338546753, + "learning_rate": 3.6615490856663334e-05, + "loss": 1.6862, + "step": 19506 + }, + { + "epoch": 5.987415592387968, + "grad_norm": 0.18282492458820343, + "learning_rate": 3.661070176332172e-05, + "loss": 1.6569, + "step": 19507 + }, + { + "epoch": 5.987722529158994, + "grad_norm": 0.255426824092865, + "learning_rate": 3.6605912802301016e-05, + "loss": 1.7623, + "step": 19508 + }, + { + "epoch": 5.988029465930018, + "grad_norm": 0.25026118755340576, + 
"learning_rate": 3.6601123973648524e-05, + "loss": 1.6907, + "step": 19509 + }, + { + "epoch": 5.9883364027010435, + "grad_norm": 0.19193407893180847, + "learning_rate": 3.659633527741159e-05, + "loss": 1.7647, + "step": 19510 + }, + { + "epoch": 5.988643339472069, + "grad_norm": 0.25562727451324463, + "learning_rate": 3.6591546713637506e-05, + "loss": 1.6806, + "step": 19511 + }, + { + "epoch": 5.988950276243094, + "grad_norm": 0.2296016663312912, + "learning_rate": 3.6586758282373624e-05, + "loss": 1.7747, + "step": 19512 + }, + { + "epoch": 5.989257213014119, + "grad_norm": 0.22875753045082092, + "learning_rate": 3.6581969983667275e-05, + "loss": 1.7847, + "step": 19513 + }, + { + "epoch": 5.989564149785144, + "grad_norm": 0.24469317495822906, + "learning_rate": 3.6577181817565736e-05, + "loss": 1.6784, + "step": 19514 + }, + { + "epoch": 5.989871086556169, + "grad_norm": 0.22855928540229797, + "learning_rate": 3.657239378411638e-05, + "loss": 1.788, + "step": 19515 + }, + { + "epoch": 5.990178023327195, + "grad_norm": 0.28745612502098083, + "learning_rate": 3.656760588336647e-05, + "loss": 1.6836, + "step": 19516 + }, + { + "epoch": 5.99048496009822, + "grad_norm": 0.18221193552017212, + "learning_rate": 3.656281811536337e-05, + "loss": 1.6687, + "step": 19517 + }, + { + "epoch": 5.990791896869245, + "grad_norm": 0.2556660771369934, + "learning_rate": 3.655803048015437e-05, + "loss": 1.7351, + "step": 19518 + }, + { + "epoch": 5.99109883364027, + "grad_norm": 0.18791422247886658, + "learning_rate": 3.6553242977786803e-05, + "loss": 1.6749, + "step": 19519 + }, + { + "epoch": 5.991405770411295, + "grad_norm": 0.28149592876434326, + "learning_rate": 3.654845560830796e-05, + "loss": 1.7333, + "step": 19520 + }, + { + "epoch": 5.99171270718232, + "grad_norm": 0.24631322920322418, + "learning_rate": 3.654366837176517e-05, + "loss": 1.7672, + "step": 19521 + }, + { + "epoch": 5.992019643953346, + "grad_norm": 0.22054782509803772, + "learning_rate": 
3.653888126820573e-05, + "loss": 1.7499, + "step": 19522 + }, + { + "epoch": 5.992326580724371, + "grad_norm": 0.23334862291812897, + "learning_rate": 3.653409429767696e-05, + "loss": 1.7133, + "step": 19523 + }, + { + "epoch": 5.9926335174953955, + "grad_norm": 0.19809292256832123, + "learning_rate": 3.6529307460226145e-05, + "loss": 1.6965, + "step": 19524 + }, + { + "epoch": 5.992940454266421, + "grad_norm": 0.23769772052764893, + "learning_rate": 3.652452075590064e-05, + "loss": 1.699, + "step": 19525 + }, + { + "epoch": 5.993247391037446, + "grad_norm": 0.19045031070709229, + "learning_rate": 3.6519734184747686e-05, + "loss": 1.7043, + "step": 19526 + }, + { + "epoch": 5.9935543278084715, + "grad_norm": 0.20795129239559174, + "learning_rate": 3.651494774681465e-05, + "loss": 1.7159, + "step": 19527 + }, + { + "epoch": 5.993861264579497, + "grad_norm": 0.1933370679616928, + "learning_rate": 3.651016144214878e-05, + "loss": 1.6999, + "step": 19528 + }, + { + "epoch": 5.994168201350522, + "grad_norm": 0.18360544741153717, + "learning_rate": 3.650537527079742e-05, + "loss": 1.7525, + "step": 19529 + }, + { + "epoch": 5.994475138121547, + "grad_norm": 0.21080785989761353, + "learning_rate": 3.650058923280786e-05, + "loss": 1.6832, + "step": 19530 + }, + { + "epoch": 5.994782074892572, + "grad_norm": 0.19701606035232544, + "learning_rate": 3.649580332822736e-05, + "loss": 1.7104, + "step": 19531 + }, + { + "epoch": 5.995089011663597, + "grad_norm": 0.24208703637123108, + "learning_rate": 3.6491017557103266e-05, + "loss": 1.726, + "step": 19532 + }, + { + "epoch": 5.995395948434623, + "grad_norm": 0.25981345772743225, + "learning_rate": 3.648623191948284e-05, + "loss": 1.7644, + "step": 19533 + }, + { + "epoch": 5.995702885205648, + "grad_norm": 0.24137455224990845, + "learning_rate": 3.64814464154134e-05, + "loss": 1.7354, + "step": 19534 + }, + { + "epoch": 5.996009821976672, + "grad_norm": 0.2140759378671646, + "learning_rate": 3.647666104494222e-05, + "loss": 
1.7244, + "step": 19535 + }, + { + "epoch": 5.996316758747698, + "grad_norm": 0.2801622748374939, + "learning_rate": 3.647187580811663e-05, + "loss": 1.6996, + "step": 19536 + }, + { + "epoch": 5.996623695518723, + "grad_norm": 0.21048817038536072, + "learning_rate": 3.6467090704983856e-05, + "loss": 1.7378, + "step": 19537 + }, + { + "epoch": 5.996930632289748, + "grad_norm": 0.2935819625854492, + "learning_rate": 3.6462305735591254e-05, + "loss": 1.7066, + "step": 19538 + }, + { + "epoch": 5.997237569060774, + "grad_norm": 0.22473880648612976, + "learning_rate": 3.645752089998606e-05, + "loss": 1.7539, + "step": 19539 + }, + { + "epoch": 5.997544505831799, + "grad_norm": 0.20606113970279694, + "learning_rate": 3.6452736198215585e-05, + "loss": 1.7338, + "step": 19540 + }, + { + "epoch": 5.9978514426028235, + "grad_norm": 0.2702842950820923, + "learning_rate": 3.6447951630327116e-05, + "loss": 1.7171, + "step": 19541 + }, + { + "epoch": 5.998158379373849, + "grad_norm": 0.19971637427806854, + "learning_rate": 3.6443167196367946e-05, + "loss": 1.7132, + "step": 19542 + }, + { + "epoch": 5.998465316144874, + "grad_norm": 0.2352653592824936, + "learning_rate": 3.643838289638531e-05, + "loss": 1.787, + "step": 19543 + }, + { + "epoch": 5.9987722529158995, + "grad_norm": 0.2324669510126114, + "learning_rate": 3.643359873042656e-05, + "loss": 1.7039, + "step": 19544 + }, + { + "epoch": 5.999079189686924, + "grad_norm": 0.1935029774904251, + "learning_rate": 3.6428814698538914e-05, + "loss": 1.6846, + "step": 19545 + }, + { + "epoch": 5.999386126457949, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.642403080076968e-05, + "loss": 1.7018, + "step": 19546 + }, + { + "epoch": 5.999693063228975, + "grad_norm": 0.19364693760871887, + "learning_rate": 3.6419247037166146e-05, + "loss": 1.6901, + "step": 19547 + }, + { + "epoch": 6.0, + "grad_norm": 0.23718556761741638, + "learning_rate": 3.641446340777556e-05, + "loss": 1.7743, + "step": 19548 + }, + { + "epoch": 
6.000306936771025, + "grad_norm": 0.23907634615898132, + "learning_rate": 3.640967991264521e-05, + "loss": 1.8225, + "step": 19549 + }, + { + "epoch": 6.000613873542051, + "grad_norm": 0.18895737826824188, + "learning_rate": 3.6404896551822365e-05, + "loss": 1.7004, + "step": 19550 + }, + { + "epoch": 6.000920810313075, + "grad_norm": 0.20192188024520874, + "learning_rate": 3.64001133253543e-05, + "loss": 1.7304, + "step": 19551 + }, + { + "epoch": 6.0012277470841005, + "grad_norm": 0.1961488425731659, + "learning_rate": 3.6395330233288285e-05, + "loss": 1.6839, + "step": 19552 + }, + { + "epoch": 6.001534683855126, + "grad_norm": 0.271635502576828, + "learning_rate": 3.639054727567161e-05, + "loss": 1.8182, + "step": 19553 + }, + { + "epoch": 6.001841620626151, + "grad_norm": 0.20838679373264313, + "learning_rate": 3.63857644525515e-05, + "loss": 1.7688, + "step": 19554 + }, + { + "epoch": 6.0021485573971765, + "grad_norm": 0.23661796748638153, + "learning_rate": 3.6380981763975266e-05, + "loss": 1.6785, + "step": 19555 + }, + { + "epoch": 6.002455494168202, + "grad_norm": 0.1728433072566986, + "learning_rate": 3.637619920999013e-05, + "loss": 1.6648, + "step": 19556 + }, + { + "epoch": 6.002762430939226, + "grad_norm": 0.2845853269100189, + "learning_rate": 3.6371416790643395e-05, + "loss": 1.7592, + "step": 19557 + }, + { + "epoch": 6.003069367710252, + "grad_norm": 0.3246566951274872, + "learning_rate": 3.636663450598229e-05, + "loss": 1.7045, + "step": 19558 + }, + { + "epoch": 6.003376304481277, + "grad_norm": 0.21857120096683502, + "learning_rate": 3.636185235605412e-05, + "loss": 1.756, + "step": 19559 + }, + { + "epoch": 6.003683241252302, + "grad_norm": 0.3583754599094391, + "learning_rate": 3.63570703409061e-05, + "loss": 1.6828, + "step": 19560 + }, + { + "epoch": 6.003990178023328, + "grad_norm": 0.25527241826057434, + "learning_rate": 3.635228846058552e-05, + "loss": 1.7611, + "step": 19561 + }, + { + "epoch": 6.004297114794352, + "grad_norm": 
0.29662930965423584, + "learning_rate": 3.6347506715139604e-05, + "loss": 1.747, + "step": 19562 + }, + { + "epoch": 6.004604051565377, + "grad_norm": 0.2588978707790375, + "learning_rate": 3.634272510461564e-05, + "loss": 1.7153, + "step": 19563 + }, + { + "epoch": 6.004910988336403, + "grad_norm": 0.23874366283416748, + "learning_rate": 3.633794362906089e-05, + "loss": 1.7285, + "step": 19564 + }, + { + "epoch": 6.005217925107428, + "grad_norm": 0.2898634374141693, + "learning_rate": 3.633316228852256e-05, + "loss": 1.7539, + "step": 19565 + }, + { + "epoch": 6.005524861878453, + "grad_norm": 0.2578127682209015, + "learning_rate": 3.6328381083047946e-05, + "loss": 1.7504, + "step": 19566 + }, + { + "epoch": 6.005831798649478, + "grad_norm": 0.3094595968723297, + "learning_rate": 3.632360001268427e-05, + "loss": 1.7076, + "step": 19567 + }, + { + "epoch": 6.006138735420503, + "grad_norm": 0.27825623750686646, + "learning_rate": 3.63188190774788e-05, + "loss": 1.7651, + "step": 19568 + }, + { + "epoch": 6.0064456721915285, + "grad_norm": 0.27732032537460327, + "learning_rate": 3.631403827747878e-05, + "loss": 1.7209, + "step": 19569 + }, + { + "epoch": 6.006752608962554, + "grad_norm": 0.36446672677993774, + "learning_rate": 3.6309257612731475e-05, + "loss": 1.7191, + "step": 19570 + }, + { + "epoch": 6.007059545733579, + "grad_norm": 0.19071432948112488, + "learning_rate": 3.6304477083284076e-05, + "loss": 1.6981, + "step": 19571 + }, + { + "epoch": 6.0073664825046045, + "grad_norm": 0.40523234009742737, + "learning_rate": 3.6299696689183895e-05, + "loss": 1.7259, + "step": 19572 + }, + { + "epoch": 6.007673419275629, + "grad_norm": 0.30279576778411865, + "learning_rate": 3.6294916430478116e-05, + "loss": 1.8017, + "step": 19573 + }, + { + "epoch": 6.007980356046654, + "grad_norm": 0.2944689989089966, + "learning_rate": 3.629013630721402e-05, + "loss": 1.7347, + "step": 19574 + }, + { + "epoch": 6.00828729281768, + "grad_norm": 0.3557213246822357, + 
"learning_rate": 3.6285356319438814e-05, + "loss": 1.7308, + "step": 19575 + }, + { + "epoch": 6.008594229588705, + "grad_norm": 0.19888661801815033, + "learning_rate": 3.628057646719978e-05, + "loss": 1.7571, + "step": 19576 + }, + { + "epoch": 6.00890116635973, + "grad_norm": 0.34002986550331116, + "learning_rate": 3.627579675054411e-05, + "loss": 1.7417, + "step": 19577 + }, + { + "epoch": 6.009208103130755, + "grad_norm": 0.2756921350955963, + "learning_rate": 3.627101716951908e-05, + "loss": 1.7351, + "step": 19578 + }, + { + "epoch": 6.00951503990178, + "grad_norm": 0.3520946502685547, + "learning_rate": 3.6266237724171885e-05, + "loss": 1.7056, + "step": 19579 + }, + { + "epoch": 6.009821976672805, + "grad_norm": 0.3673728406429291, + "learning_rate": 3.6261458414549786e-05, + "loss": 1.6388, + "step": 19580 + }, + { + "epoch": 6.010128913443831, + "grad_norm": 0.2247757613658905, + "learning_rate": 3.625667924070003e-05, + "loss": 1.7772, + "step": 19581 + }, + { + "epoch": 6.010435850214856, + "grad_norm": 0.4387452006340027, + "learning_rate": 3.6251900202669795e-05, + "loss": 1.7629, + "step": 19582 + }, + { + "epoch": 6.0107427869858805, + "grad_norm": 0.23595796525478363, + "learning_rate": 3.624712130050636e-05, + "loss": 1.8044, + "step": 19583 + }, + { + "epoch": 6.011049723756906, + "grad_norm": 0.31198835372924805, + "learning_rate": 3.624234253425691e-05, + "loss": 1.7623, + "step": 19584 + }, + { + "epoch": 6.011356660527931, + "grad_norm": 0.25283896923065186, + "learning_rate": 3.6237563903968705e-05, + "loss": 1.7771, + "step": 19585 + }, + { + "epoch": 6.0116635972989565, + "grad_norm": 0.2595483064651489, + "learning_rate": 3.6232785409688954e-05, + "loss": 1.7405, + "step": 19586 + }, + { + "epoch": 6.011970534069982, + "grad_norm": 0.302273690700531, + "learning_rate": 3.622800705146491e-05, + "loss": 1.7236, + "step": 19587 + }, + { + "epoch": 6.012277470841007, + "grad_norm": 0.20444928109645844, + "learning_rate": 
3.622322882934375e-05, + "loss": 1.6863, + "step": 19588 + }, + { + "epoch": 6.012584407612032, + "grad_norm": 0.2682531774044037, + "learning_rate": 3.621845074337273e-05, + "loss": 1.752, + "step": 19589 + }, + { + "epoch": 6.012891344383057, + "grad_norm": 0.25617173314094543, + "learning_rate": 3.621367279359905e-05, + "loss": 1.7496, + "step": 19590 + }, + { + "epoch": 6.013198281154082, + "grad_norm": 0.24514207243919373, + "learning_rate": 3.620889498006994e-05, + "loss": 1.6568, + "step": 19591 + }, + { + "epoch": 6.013505217925108, + "grad_norm": 0.2799128293991089, + "learning_rate": 3.6204117302832616e-05, + "loss": 1.7284, + "step": 19592 + }, + { + "epoch": 6.013812154696133, + "grad_norm": 0.2025543451309204, + "learning_rate": 3.619933976193428e-05, + "loss": 1.7172, + "step": 19593 + }, + { + "epoch": 6.014119091467157, + "grad_norm": 0.24697700142860413, + "learning_rate": 3.619456235742216e-05, + "loss": 1.7316, + "step": 19594 + }, + { + "epoch": 6.014426028238183, + "grad_norm": 0.2518150210380554, + "learning_rate": 3.618978508934348e-05, + "loss": 1.8183, + "step": 19595 + }, + { + "epoch": 6.014732965009208, + "grad_norm": 0.165326327085495, + "learning_rate": 3.618500795774542e-05, + "loss": 1.665, + "step": 19596 + }, + { + "epoch": 6.015039901780233, + "grad_norm": 0.19158180058002472, + "learning_rate": 3.6180230962675216e-05, + "loss": 1.7232, + "step": 19597 + }, + { + "epoch": 6.015346838551259, + "grad_norm": 0.19456413388252258, + "learning_rate": 3.6175454104180086e-05, + "loss": 1.7153, + "step": 19598 + }, + { + "epoch": 6.015653775322283, + "grad_norm": 0.233373761177063, + "learning_rate": 3.6170677382307195e-05, + "loss": 1.7914, + "step": 19599 + }, + { + "epoch": 6.0159607120933085, + "grad_norm": 0.18567882478237152, + "learning_rate": 3.6165900797103796e-05, + "loss": 1.6793, + "step": 19600 + }, + { + "epoch": 6.016267648864334, + "grad_norm": 0.2119273990392685, + "learning_rate": 3.616112434861706e-05, + "loss": 1.689, + 
"step": 19601 + }, + { + "epoch": 6.016574585635359, + "grad_norm": 0.1915217787027359, + "learning_rate": 3.61563480368942e-05, + "loss": 1.6835, + "step": 19602 + }, + { + "epoch": 6.0168815224063845, + "grad_norm": 0.24824760854244232, + "learning_rate": 3.615157186198244e-05, + "loss": 1.8411, + "step": 19603 + }, + { + "epoch": 6.01718845917741, + "grad_norm": 0.2198900282382965, + "learning_rate": 3.6146795823928955e-05, + "loss": 1.7311, + "step": 19604 + }, + { + "epoch": 6.017495395948434, + "grad_norm": 0.22993668913841248, + "learning_rate": 3.614201992278095e-05, + "loss": 1.7249, + "step": 19605 + }, + { + "epoch": 6.01780233271946, + "grad_norm": 0.20677974820137024, + "learning_rate": 3.613724415858564e-05, + "loss": 1.7137, + "step": 19606 + }, + { + "epoch": 6.018109269490485, + "grad_norm": 0.1844938099384308, + "learning_rate": 3.6132468531390184e-05, + "loss": 1.6512, + "step": 19607 + }, + { + "epoch": 6.01841620626151, + "grad_norm": 0.224154993891716, + "learning_rate": 3.6127693041241815e-05, + "loss": 1.7116, + "step": 19608 + }, + { + "epoch": 6.018723143032536, + "grad_norm": 0.17322199046611786, + "learning_rate": 3.612291768818772e-05, + "loss": 1.6743, + "step": 19609 + }, + { + "epoch": 6.01903007980356, + "grad_norm": 0.24451903998851776, + "learning_rate": 3.611814247227508e-05, + "loss": 1.8332, + "step": 19610 + }, + { + "epoch": 6.019337016574585, + "grad_norm": 0.1911642849445343, + "learning_rate": 3.611336739355109e-05, + "loss": 1.707, + "step": 19611 + }, + { + "epoch": 6.019643953345611, + "grad_norm": 0.20917518436908722, + "learning_rate": 3.6108592452062954e-05, + "loss": 1.7328, + "step": 19612 + }, + { + "epoch": 6.019950890116636, + "grad_norm": 0.2314450889825821, + "learning_rate": 3.610381764785784e-05, + "loss": 1.7575, + "step": 19613 + }, + { + "epoch": 6.020257826887661, + "grad_norm": 0.20701734721660614, + "learning_rate": 3.609904298098296e-05, + "loss": 1.6958, + "step": 19614 + }, + { + "epoch": 
6.020564763658686, + "grad_norm": 0.2494465857744217, + "learning_rate": 3.609426845148547e-05, + "loss": 1.706, + "step": 19615 + }, + { + "epoch": 6.020871700429711, + "grad_norm": 0.25842729210853577, + "learning_rate": 3.608949405941256e-05, + "loss": 1.7667, + "step": 19616 + }, + { + "epoch": 6.0211786372007365, + "grad_norm": 0.19831863045692444, + "learning_rate": 3.608471980481145e-05, + "loss": 1.7135, + "step": 19617 + }, + { + "epoch": 6.021485573971762, + "grad_norm": 0.21611735224723816, + "learning_rate": 3.607994568772927e-05, + "loss": 1.7416, + "step": 19618 + }, + { + "epoch": 6.021792510742787, + "grad_norm": 0.2356715202331543, + "learning_rate": 3.607517170821324e-05, + "loss": 1.7696, + "step": 19619 + }, + { + "epoch": 6.0220994475138125, + "grad_norm": 0.24737675487995148, + "learning_rate": 3.6070397866310514e-05, + "loss": 1.7189, + "step": 19620 + }, + { + "epoch": 6.022406384284837, + "grad_norm": 0.19260701537132263, + "learning_rate": 3.6065624162068284e-05, + "loss": 1.7292, + "step": 19621 + }, + { + "epoch": 6.022713321055862, + "grad_norm": 0.29366952180862427, + "learning_rate": 3.6060850595533716e-05, + "loss": 1.7875, + "step": 19622 + }, + { + "epoch": 6.023020257826888, + "grad_norm": 0.2038174718618393, + "learning_rate": 3.605607716675401e-05, + "loss": 1.6777, + "step": 19623 + }, + { + "epoch": 6.023327194597913, + "grad_norm": 0.28923583030700684, + "learning_rate": 3.605130387577631e-05, + "loss": 1.7175, + "step": 19624 + }, + { + "epoch": 6.023634131368938, + "grad_norm": 0.3004317283630371, + "learning_rate": 3.6046530722647816e-05, + "loss": 1.8059, + "step": 19625 + }, + { + "epoch": 6.023941068139963, + "grad_norm": 0.19832390546798706, + "learning_rate": 3.6041757707415666e-05, + "loss": 1.7197, + "step": 19626 + }, + { + "epoch": 6.024248004910988, + "grad_norm": 0.2782927453517914, + "learning_rate": 3.6036984830127054e-05, + "loss": 1.6563, + "step": 19627 + }, + { + "epoch": 6.024554941682013, + "grad_norm": 
0.20395785570144653, + "learning_rate": 3.603221209082913e-05, + "loss": 1.6972, + "step": 19628 + }, + { + "epoch": 6.024861878453039, + "grad_norm": 0.26302096247673035, + "learning_rate": 3.60274394895691e-05, + "loss": 1.7348, + "step": 19629 + }, + { + "epoch": 6.025168815224064, + "grad_norm": 0.26376327872276306, + "learning_rate": 3.6022667026394095e-05, + "loss": 1.7183, + "step": 19630 + }, + { + "epoch": 6.0254757519950894, + "grad_norm": 0.20590877532958984, + "learning_rate": 3.601789470135127e-05, + "loss": 1.7114, + "step": 19631 + }, + { + "epoch": 6.025782688766114, + "grad_norm": 0.2873607277870178, + "learning_rate": 3.6013122514487815e-05, + "loss": 1.7598, + "step": 19632 + }, + { + "epoch": 6.026089625537139, + "grad_norm": 0.24324963986873627, + "learning_rate": 3.600835046585087e-05, + "loss": 1.8844, + "step": 19633 + }, + { + "epoch": 6.026396562308165, + "grad_norm": 0.27910730242729187, + "learning_rate": 3.6003578555487624e-05, + "loss": 1.8598, + "step": 19634 + }, + { + "epoch": 6.02670349907919, + "grad_norm": 0.22766844928264618, + "learning_rate": 3.59988067834452e-05, + "loss": 1.7281, + "step": 19635 + }, + { + "epoch": 6.027010435850215, + "grad_norm": 0.2390190064907074, + "learning_rate": 3.5994035149770804e-05, + "loss": 1.7355, + "step": 19636 + }, + { + "epoch": 6.02731737262124, + "grad_norm": 0.23422548174858093, + "learning_rate": 3.598926365451153e-05, + "loss": 1.7226, + "step": 19637 + }, + { + "epoch": 6.027624309392265, + "grad_norm": 0.20240288972854614, + "learning_rate": 3.598449229771458e-05, + "loss": 1.7523, + "step": 19638 + }, + { + "epoch": 6.02793124616329, + "grad_norm": 0.26388832926750183, + "learning_rate": 3.597972107942708e-05, + "loss": 1.7003, + "step": 19639 + }, + { + "epoch": 6.028238182934316, + "grad_norm": 0.19814053177833557, + "learning_rate": 3.597494999969622e-05, + "loss": 1.7087, + "step": 19640 + }, + { + "epoch": 6.028545119705341, + "grad_norm": 0.2779136896133423, + "learning_rate": 
3.5970179058569095e-05, + "loss": 1.7581, + "step": 19641 + }, + { + "epoch": 6.0288520564763655, + "grad_norm": 0.220394566655159, + "learning_rate": 3.5965408256092905e-05, + "loss": 1.7236, + "step": 19642 + }, + { + "epoch": 6.029158993247391, + "grad_norm": 0.28568828105926514, + "learning_rate": 3.596063759231476e-05, + "loss": 1.7933, + "step": 19643 + }, + { + "epoch": 6.029465930018416, + "grad_norm": 0.19509564340114594, + "learning_rate": 3.595586706728183e-05, + "loss": 1.6803, + "step": 19644 + }, + { + "epoch": 6.0297728667894415, + "grad_norm": 0.30855104327201843, + "learning_rate": 3.595109668104124e-05, + "loss": 1.7345, + "step": 19645 + }, + { + "epoch": 6.030079803560467, + "grad_norm": 0.24195496737957, + "learning_rate": 3.5946326433640174e-05, + "loss": 1.7493, + "step": 19646 + }, + { + "epoch": 6.030386740331492, + "grad_norm": 0.28324684500694275, + "learning_rate": 3.5941556325125744e-05, + "loss": 1.7959, + "step": 19647 + }, + { + "epoch": 6.030693677102517, + "grad_norm": 0.25351646542549133, + "learning_rate": 3.593678635554508e-05, + "loss": 1.7298, + "step": 19648 + }, + { + "epoch": 6.031000613873542, + "grad_norm": 0.2608177959918976, + "learning_rate": 3.593201652494534e-05, + "loss": 1.7072, + "step": 19649 + }, + { + "epoch": 6.031307550644567, + "grad_norm": 0.3182333707809448, + "learning_rate": 3.592724683337365e-05, + "loss": 1.6976, + "step": 19650 + }, + { + "epoch": 6.031614487415593, + "grad_norm": 0.19296859204769135, + "learning_rate": 3.592247728087717e-05, + "loss": 1.6879, + "step": 19651 + }, + { + "epoch": 6.031921424186618, + "grad_norm": 0.3927764594554901, + "learning_rate": 3.591770786750301e-05, + "loss": 1.6824, + "step": 19652 + }, + { + "epoch": 6.032228360957642, + "grad_norm": 0.23609496653079987, + "learning_rate": 3.591293859329833e-05, + "loss": 1.7224, + "step": 19653 + }, + { + "epoch": 6.032535297728668, + "grad_norm": 0.40787333250045776, + "learning_rate": 3.590816945831023e-05, + "loss": 
1.7206, + "step": 19654 + }, + { + "epoch": 6.032842234499693, + "grad_norm": 0.31101885437965393, + "learning_rate": 3.590340046258586e-05, + "loss": 1.7446, + "step": 19655 + }, + { + "epoch": 6.033149171270718, + "grad_norm": 0.19401656091213226, + "learning_rate": 3.589863160617235e-05, + "loss": 1.6778, + "step": 19656 + }, + { + "epoch": 6.033456108041744, + "grad_norm": 0.3309115469455719, + "learning_rate": 3.589386288911684e-05, + "loss": 1.7196, + "step": 19657 + }, + { + "epoch": 6.033763044812768, + "grad_norm": 0.22281408309936523, + "learning_rate": 3.588909431146643e-05, + "loss": 1.7122, + "step": 19658 + }, + { + "epoch": 6.0340699815837935, + "grad_norm": 0.2903781831264496, + "learning_rate": 3.5884325873268275e-05, + "loss": 1.7428, + "step": 19659 + }, + { + "epoch": 6.034376918354819, + "grad_norm": 0.2529856562614441, + "learning_rate": 3.587955757456947e-05, + "loss": 1.7075, + "step": 19660 + }, + { + "epoch": 6.034683855125844, + "grad_norm": 0.2445102334022522, + "learning_rate": 3.587478941541716e-05, + "loss": 1.6631, + "step": 19661 + }, + { + "epoch": 6.0349907918968695, + "grad_norm": 0.31834688782691956, + "learning_rate": 3.5870021395858454e-05, + "loss": 1.7009, + "step": 19662 + }, + { + "epoch": 6.035297728667895, + "grad_norm": 0.20666317641735077, + "learning_rate": 3.5865253515940496e-05, + "loss": 1.7252, + "step": 19663 + }, + { + "epoch": 6.035604665438919, + "grad_norm": 0.3070019483566284, + "learning_rate": 3.586048577571039e-05, + "loss": 1.7139, + "step": 19664 + }, + { + "epoch": 6.035911602209945, + "grad_norm": 0.22463096678256989, + "learning_rate": 3.585571817521522e-05, + "loss": 1.7574, + "step": 19665 + }, + { + "epoch": 6.03621853898097, + "grad_norm": 0.25405722856521606, + "learning_rate": 3.585095071450216e-05, + "loss": 1.7135, + "step": 19666 + }, + { + "epoch": 6.036525475751995, + "grad_norm": 0.24543432891368866, + "learning_rate": 3.584618339361828e-05, + "loss": 1.7312, + "step": 19667 + }, + { + 
"epoch": 6.036832412523021, + "grad_norm": 0.2454189658164978, + "learning_rate": 3.584141621261073e-05, + "loss": 1.7905, + "step": 19668 + }, + { + "epoch": 6.037139349294045, + "grad_norm": 0.2163272649049759, + "learning_rate": 3.583664917152658e-05, + "loss": 1.7042, + "step": 19669 + }, + { + "epoch": 6.03744628606507, + "grad_norm": 0.2088690549135208, + "learning_rate": 3.5831882270412994e-05, + "loss": 1.7905, + "step": 19670 + }, + { + "epoch": 6.037753222836096, + "grad_norm": 0.26145869493484497, + "learning_rate": 3.5827115509317024e-05, + "loss": 1.7487, + "step": 19671 + }, + { + "epoch": 6.038060159607121, + "grad_norm": 0.20306496322155, + "learning_rate": 3.582234888828582e-05, + "loss": 1.7103, + "step": 19672 + }, + { + "epoch": 6.038367096378146, + "grad_norm": 0.2504192292690277, + "learning_rate": 3.5817582407366454e-05, + "loss": 1.7397, + "step": 19673 + }, + { + "epoch": 6.038674033149171, + "grad_norm": 0.22803208231925964, + "learning_rate": 3.5812816066606084e-05, + "loss": 1.7105, + "step": 19674 + }, + { + "epoch": 6.038980969920196, + "grad_norm": 0.24963071942329407, + "learning_rate": 3.580804986605176e-05, + "loss": 1.734, + "step": 19675 + }, + { + "epoch": 6.0392879066912215, + "grad_norm": 0.2468494027853012, + "learning_rate": 3.580328380575062e-05, + "loss": 1.6866, + "step": 19676 + }, + { + "epoch": 6.039594843462247, + "grad_norm": 0.17628586292266846, + "learning_rate": 3.579851788574973e-05, + "loss": 1.7106, + "step": 19677 + }, + { + "epoch": 6.039901780233272, + "grad_norm": 0.23965299129486084, + "learning_rate": 3.579375210609622e-05, + "loss": 1.7675, + "step": 19678 + }, + { + "epoch": 6.0402087170042975, + "grad_norm": 0.19638453423976898, + "learning_rate": 3.5788986466837175e-05, + "loss": 1.7242, + "step": 19679 + }, + { + "epoch": 6.040515653775322, + "grad_norm": 0.2602851092815399, + "learning_rate": 3.578422096801971e-05, + "loss": 1.7287, + "step": 19680 + }, + { + "epoch": 6.040822590546347, + 
"grad_norm": 0.25868186354637146, + "learning_rate": 3.577945560969091e-05, + "loss": 1.7604, + "step": 19681 + }, + { + "epoch": 6.041129527317373, + "grad_norm": 0.1996527463197708, + "learning_rate": 3.577469039189784e-05, + "loss": 1.7469, + "step": 19682 + }, + { + "epoch": 6.041436464088398, + "grad_norm": 0.29909980297088623, + "learning_rate": 3.576992531468763e-05, + "loss": 1.682, + "step": 19683 + }, + { + "epoch": 6.041743400859423, + "grad_norm": 0.20064286887645721, + "learning_rate": 3.576516037810734e-05, + "loss": 1.7125, + "step": 19684 + }, + { + "epoch": 6.042050337630448, + "grad_norm": 0.2134515345096588, + "learning_rate": 3.576039558220411e-05, + "loss": 1.7371, + "step": 19685 + }, + { + "epoch": 6.042357274401473, + "grad_norm": 0.20365437865257263, + "learning_rate": 3.575563092702497e-05, + "loss": 1.7446, + "step": 19686 + }, + { + "epoch": 6.042664211172498, + "grad_norm": 0.24526065587997437, + "learning_rate": 3.5750866412617054e-05, + "loss": 1.759, + "step": 19687 + }, + { + "epoch": 6.042971147943524, + "grad_norm": 0.24521295726299286, + "learning_rate": 3.5746102039027414e-05, + "loss": 1.7589, + "step": 19688 + }, + { + "epoch": 6.043278084714549, + "grad_norm": 0.2151515632867813, + "learning_rate": 3.5741337806303155e-05, + "loss": 1.761, + "step": 19689 + }, + { + "epoch": 6.043585021485574, + "grad_norm": 0.25733521580696106, + "learning_rate": 3.573657371449134e-05, + "loss": 1.7171, + "step": 19690 + }, + { + "epoch": 6.043891958256599, + "grad_norm": 0.18520839512348175, + "learning_rate": 3.5731809763639084e-05, + "loss": 1.6691, + "step": 19691 + }, + { + "epoch": 6.044198895027624, + "grad_norm": 0.24617944657802582, + "learning_rate": 3.572704595379342e-05, + "loss": 1.7869, + "step": 19692 + }, + { + "epoch": 6.0445058317986495, + "grad_norm": 0.20246629416942596, + "learning_rate": 3.5722282285001493e-05, + "loss": 1.7667, + "step": 19693 + }, + { + "epoch": 6.044812768569675, + "grad_norm": 0.21190209686756134, + 
"learning_rate": 3.5717518757310305e-05, + "loss": 1.6839, + "step": 19694 + }, + { + "epoch": 6.0451197053407, + "grad_norm": 0.19021087884902954, + "learning_rate": 3.571275537076699e-05, + "loss": 1.7023, + "step": 19695 + }, + { + "epoch": 6.045426642111725, + "grad_norm": 0.1793040931224823, + "learning_rate": 3.570799212541858e-05, + "loss": 1.7022, + "step": 19696 + }, + { + "epoch": 6.04573357888275, + "grad_norm": 0.19105301797389984, + "learning_rate": 3.570322902131219e-05, + "loss": 1.7151, + "step": 19697 + }, + { + "epoch": 6.046040515653775, + "grad_norm": 0.22083842754364014, + "learning_rate": 3.569846605849487e-05, + "loss": 1.7097, + "step": 19698 + }, + { + "epoch": 6.046347452424801, + "grad_norm": 0.2607622444629669, + "learning_rate": 3.569370323701368e-05, + "loss": 1.7508, + "step": 19699 + }, + { + "epoch": 6.046654389195826, + "grad_norm": 0.22349929809570312, + "learning_rate": 3.56889405569157e-05, + "loss": 1.7131, + "step": 19700 + }, + { + "epoch": 6.04696132596685, + "grad_norm": 0.19442661106586456, + "learning_rate": 3.5684178018247996e-05, + "loss": 1.7476, + "step": 19701 + }, + { + "epoch": 6.047268262737876, + "grad_norm": 0.2002776861190796, + "learning_rate": 3.5679415621057646e-05, + "loss": 1.7982, + "step": 19702 + }, + { + "epoch": 6.047575199508901, + "grad_norm": 0.21558646857738495, + "learning_rate": 3.567465336539169e-05, + "loss": 1.7231, + "step": 19703 + }, + { + "epoch": 6.047882136279926, + "grad_norm": 0.20468449592590332, + "learning_rate": 3.5669891251297224e-05, + "loss": 1.6426, + "step": 19704 + }, + { + "epoch": 6.048189073050952, + "grad_norm": 0.23098553717136383, + "learning_rate": 3.566512927882127e-05, + "loss": 1.7763, + "step": 19705 + }, + { + "epoch": 6.048496009821977, + "grad_norm": 0.22959274053573608, + "learning_rate": 3.566036744801092e-05, + "loss": 1.7663, + "step": 19706 + }, + { + "epoch": 6.0488029465930016, + "grad_norm": 0.18519435822963715, + "learning_rate": 
3.5655605758913215e-05, + "loss": 1.6995, + "step": 19707 + }, + { + "epoch": 6.049109883364027, + "grad_norm": 0.2529381513595581, + "learning_rate": 3.565084421157524e-05, + "loss": 1.754, + "step": 19708 + }, + { + "epoch": 6.049416820135052, + "grad_norm": 0.2208617776632309, + "learning_rate": 3.5646082806044015e-05, + "loss": 1.6939, + "step": 19709 + }, + { + "epoch": 6.0497237569060776, + "grad_norm": 0.18433862924575806, + "learning_rate": 3.564132154236663e-05, + "loss": 1.7145, + "step": 19710 + }, + { + "epoch": 6.050030693677103, + "grad_norm": 0.1963127702474594, + "learning_rate": 3.563656042059011e-05, + "loss": 1.7101, + "step": 19711 + }, + { + "epoch": 6.050337630448127, + "grad_norm": 0.19860461354255676, + "learning_rate": 3.5631799440761526e-05, + "loss": 1.7218, + "step": 19712 + }, + { + "epoch": 6.050644567219153, + "grad_norm": 0.19304174184799194, + "learning_rate": 3.5627038602927905e-05, + "loss": 1.7575, + "step": 19713 + }, + { + "epoch": 6.050951503990178, + "grad_norm": 0.20402809977531433, + "learning_rate": 3.5622277907136335e-05, + "loss": 1.7438, + "step": 19714 + }, + { + "epoch": 6.051258440761203, + "grad_norm": 0.20821911096572876, + "learning_rate": 3.5617517353433844e-05, + "loss": 1.7381, + "step": 19715 + }, + { + "epoch": 6.051565377532229, + "grad_norm": 0.24375931918621063, + "learning_rate": 3.561275694186745e-05, + "loss": 1.8377, + "step": 19716 + }, + { + "epoch": 6.051872314303253, + "grad_norm": 0.19745339453220367, + "learning_rate": 3.560799667248424e-05, + "loss": 1.6839, + "step": 19717 + }, + { + "epoch": 6.0521792510742785, + "grad_norm": 0.2039431631565094, + "learning_rate": 3.560323654533124e-05, + "loss": 1.692, + "step": 19718 + }, + { + "epoch": 6.052486187845304, + "grad_norm": 0.23229047656059265, + "learning_rate": 3.559847656045551e-05, + "loss": 1.7408, + "step": 19719 + }, + { + "epoch": 6.052793124616329, + "grad_norm": 0.20387259125709534, + "learning_rate": 3.559371671790404e-05, + "loss": 
1.7215, + "step": 19720 + }, + { + "epoch": 6.0531000613873545, + "grad_norm": 0.23960062861442566, + "learning_rate": 3.5588957017723944e-05, + "loss": 1.8048, + "step": 19721 + }, + { + "epoch": 6.05340699815838, + "grad_norm": 0.1979944109916687, + "learning_rate": 3.5584197459962196e-05, + "loss": 1.7307, + "step": 19722 + }, + { + "epoch": 6.053713934929404, + "grad_norm": 0.21914203464984894, + "learning_rate": 3.557943804466586e-05, + "loss": 1.6999, + "step": 19723 + }, + { + "epoch": 6.05402087170043, + "grad_norm": 0.22338175773620605, + "learning_rate": 3.557467877188197e-05, + "loss": 1.6977, + "step": 19724 + }, + { + "epoch": 6.054327808471455, + "grad_norm": 0.2692863643169403, + "learning_rate": 3.5569919641657576e-05, + "loss": 1.7664, + "step": 19725 + }, + { + "epoch": 6.05463474524248, + "grad_norm": 0.2882823944091797, + "learning_rate": 3.5565160654039675e-05, + "loss": 1.6943, + "step": 19726 + }, + { + "epoch": 6.054941682013506, + "grad_norm": 0.2114996612071991, + "learning_rate": 3.5560401809075336e-05, + "loss": 1.7426, + "step": 19727 + }, + { + "epoch": 6.05524861878453, + "grad_norm": 0.19616106152534485, + "learning_rate": 3.5555643106811546e-05, + "loss": 1.6616, + "step": 19728 + }, + { + "epoch": 6.055555555555555, + "grad_norm": 0.241346076130867, + "learning_rate": 3.555088454729537e-05, + "loss": 1.7423, + "step": 19729 + }, + { + "epoch": 6.055862492326581, + "grad_norm": 0.24495846033096313, + "learning_rate": 3.554612613057381e-05, + "loss": 1.7699, + "step": 19730 + }, + { + "epoch": 6.056169429097606, + "grad_norm": 0.233306422829628, + "learning_rate": 3.554136785669393e-05, + "loss": 1.7201, + "step": 19731 + }, + { + "epoch": 6.056476365868631, + "grad_norm": 0.23820927739143372, + "learning_rate": 3.553660972570272e-05, + "loss": 1.7694, + "step": 19732 + }, + { + "epoch": 6.056783302639656, + "grad_norm": 0.20664167404174805, + "learning_rate": 3.553185173764719e-05, + "loss": 1.7151, + "step": 19733 + }, + { + 
"epoch": 6.057090239410681, + "grad_norm": 0.22572578489780426, + "learning_rate": 3.5527093892574394e-05, + "loss": 1.7715, + "step": 19734 + }, + { + "epoch": 6.0573971761817065, + "grad_norm": 0.18554186820983887, + "learning_rate": 3.552233619053133e-05, + "loss": 1.7481, + "step": 19735 + }, + { + "epoch": 6.057704112952732, + "grad_norm": 0.2434636950492859, + "learning_rate": 3.551757863156504e-05, + "loss": 1.7992, + "step": 19736 + }, + { + "epoch": 6.058011049723757, + "grad_norm": 0.1949392408132553, + "learning_rate": 3.5512821215722514e-05, + "loss": 1.7439, + "step": 19737 + }, + { + "epoch": 6.0583179864947825, + "grad_norm": 0.2696731686592102, + "learning_rate": 3.55080639430508e-05, + "loss": 1.7092, + "step": 19738 + }, + { + "epoch": 6.058624923265807, + "grad_norm": 0.1963263303041458, + "learning_rate": 3.550330681359686e-05, + "loss": 1.6726, + "step": 19739 + }, + { + "epoch": 6.058931860036832, + "grad_norm": 0.20115122199058533, + "learning_rate": 3.549854982740776e-05, + "loss": 1.7459, + "step": 19740 + }, + { + "epoch": 6.059238796807858, + "grad_norm": 0.21378284692764282, + "learning_rate": 3.549379298453048e-05, + "loss": 1.7028, + "step": 19741 + }, + { + "epoch": 6.059545733578883, + "grad_norm": 0.21954336762428284, + "learning_rate": 3.5489036285012055e-05, + "loss": 1.7209, + "step": 19742 + }, + { + "epoch": 6.059852670349908, + "grad_norm": 0.20117704570293427, + "learning_rate": 3.548427972889946e-05, + "loss": 1.7273, + "step": 19743 + }, + { + "epoch": 6.060159607120933, + "grad_norm": 0.23786263167858124, + "learning_rate": 3.5479523316239745e-05, + "loss": 1.7519, + "step": 19744 + }, + { + "epoch": 6.060466543891958, + "grad_norm": 0.17704391479492188, + "learning_rate": 3.5474767047079864e-05, + "loss": 1.6644, + "step": 19745 + }, + { + "epoch": 6.060773480662983, + "grad_norm": 0.1883699744939804, + "learning_rate": 3.547001092146687e-05, + "loss": 1.6586, + "step": 19746 + }, + { + "epoch": 6.061080417434009, + 
"grad_norm": 0.19101519882678986, + "learning_rate": 3.546525493944773e-05, + "loss": 1.7575, + "step": 19747 + }, + { + "epoch": 6.061387354205034, + "grad_norm": 0.1924263834953308, + "learning_rate": 3.546049910106947e-05, + "loss": 1.743, + "step": 19748 + }, + { + "epoch": 6.0616942909760585, + "grad_norm": 0.1853020042181015, + "learning_rate": 3.5455743406379084e-05, + "loss": 1.7466, + "step": 19749 + }, + { + "epoch": 6.062001227747084, + "grad_norm": 0.21322499215602875, + "learning_rate": 3.545098785542355e-05, + "loss": 1.7625, + "step": 19750 + }, + { + "epoch": 6.062308164518109, + "grad_norm": 0.1567271500825882, + "learning_rate": 3.544623244824989e-05, + "loss": 1.6531, + "step": 19751 + }, + { + "epoch": 6.0626151012891345, + "grad_norm": 0.2125476449728012, + "learning_rate": 3.544147718490508e-05, + "loss": 1.7547, + "step": 19752 + }, + { + "epoch": 6.06292203806016, + "grad_norm": 0.19470059871673584, + "learning_rate": 3.543672206543615e-05, + "loss": 1.7327, + "step": 19753 + }, + { + "epoch": 6.063228974831185, + "grad_norm": 0.1690339744091034, + "learning_rate": 3.543196708989004e-05, + "loss": 1.6621, + "step": 19754 + }, + { + "epoch": 6.06353591160221, + "grad_norm": 0.17322230339050293, + "learning_rate": 3.54272122583138e-05, + "loss": 1.7018, + "step": 19755 + }, + { + "epoch": 6.063842848373235, + "grad_norm": 0.22174575924873352, + "learning_rate": 3.5422457570754365e-05, + "loss": 1.724, + "step": 19756 + }, + { + "epoch": 6.06414978514426, + "grad_norm": 0.20233364403247833, + "learning_rate": 3.541770302725875e-05, + "loss": 1.6518, + "step": 19757 + }, + { + "epoch": 6.064456721915286, + "grad_norm": 0.1585279405117035, + "learning_rate": 3.541294862787395e-05, + "loss": 1.6985, + "step": 19758 + }, + { + "epoch": 6.064763658686311, + "grad_norm": 0.2180105745792389, + "learning_rate": 3.540819437264694e-05, + "loss": 1.6728, + "step": 19759 + }, + { + "epoch": 6.065070595457335, + "grad_norm": 0.2295975238084793, + 
"learning_rate": 3.5403440261624696e-05, + "loss": 1.7566, + "step": 19760 + }, + { + "epoch": 6.065377532228361, + "grad_norm": 0.17460396885871887, + "learning_rate": 3.5398686294854234e-05, + "loss": 1.6977, + "step": 19761 + }, + { + "epoch": 6.065684468999386, + "grad_norm": 0.20828662812709808, + "learning_rate": 3.539393247238249e-05, + "loss": 1.7789, + "step": 19762 + }, + { + "epoch": 6.065991405770411, + "grad_norm": 0.2273385375738144, + "learning_rate": 3.5389178794256476e-05, + "loss": 1.7316, + "step": 19763 + }, + { + "epoch": 6.066298342541437, + "grad_norm": 0.2332257330417633, + "learning_rate": 3.538442526052316e-05, + "loss": 1.7355, + "step": 19764 + }, + { + "epoch": 6.066605279312462, + "grad_norm": 0.17953866720199585, + "learning_rate": 3.537967187122952e-05, + "loss": 1.7107, + "step": 19765 + }, + { + "epoch": 6.0669122160834865, + "grad_norm": 0.2334052473306656, + "learning_rate": 3.537491862642254e-05, + "loss": 1.7572, + "step": 19766 + }, + { + "epoch": 6.067219152854512, + "grad_norm": 0.2427968829870224, + "learning_rate": 3.5370165526149165e-05, + "loss": 1.7254, + "step": 19767 + }, + { + "epoch": 6.067526089625537, + "grad_norm": 0.2701692283153534, + "learning_rate": 3.53654125704564e-05, + "loss": 1.7525, + "step": 19768 + }, + { + "epoch": 6.0678330263965625, + "grad_norm": 0.3775569796562195, + "learning_rate": 3.536065975939121e-05, + "loss": 1.7516, + "step": 19769 + }, + { + "epoch": 6.068139963167588, + "grad_norm": 0.18971984088420868, + "learning_rate": 3.535590709300056e-05, + "loss": 1.6777, + "step": 19770 + }, + { + "epoch": 6.068446899938612, + "grad_norm": 0.2710094749927521, + "learning_rate": 3.535115457133141e-05, + "loss": 1.7612, + "step": 19771 + }, + { + "epoch": 6.068753836709638, + "grad_norm": 0.19414621591567993, + "learning_rate": 3.534640219443075e-05, + "loss": 1.6795, + "step": 19772 + }, + { + "epoch": 6.069060773480663, + "grad_norm": 0.2384893298149109, + "learning_rate": 3.534164996234552e-05, 
+ "loss": 1.7869, + "step": 19773 + }, + { + "epoch": 6.069367710251688, + "grad_norm": 0.2206166833639145, + "learning_rate": 3.533689787512271e-05, + "loss": 1.7332, + "step": 19774 + }, + { + "epoch": 6.069674647022714, + "grad_norm": 0.19740800559520721, + "learning_rate": 3.533214593280926e-05, + "loss": 1.6744, + "step": 19775 + }, + { + "epoch": 6.069981583793738, + "grad_norm": 0.2098212093114853, + "learning_rate": 3.532739413545214e-05, + "loss": 1.731, + "step": 19776 + }, + { + "epoch": 6.070288520564763, + "grad_norm": 0.2508943974971771, + "learning_rate": 3.5322642483098304e-05, + "loss": 1.7682, + "step": 19777 + }, + { + "epoch": 6.070595457335789, + "grad_norm": 0.22202368080615997, + "learning_rate": 3.531789097579474e-05, + "loss": 1.6965, + "step": 19778 + }, + { + "epoch": 6.070902394106814, + "grad_norm": 0.19276803731918335, + "learning_rate": 3.5313139613588355e-05, + "loss": 1.6855, + "step": 19779 + }, + { + "epoch": 6.071209330877839, + "grad_norm": 0.23910140991210938, + "learning_rate": 3.530838839652616e-05, + "loss": 1.8099, + "step": 19780 + }, + { + "epoch": 6.071516267648865, + "grad_norm": 0.19440437853336334, + "learning_rate": 3.530363732465506e-05, + "loss": 1.67, + "step": 19781 + }, + { + "epoch": 6.071823204419889, + "grad_norm": 0.1954154074192047, + "learning_rate": 3.529888639802204e-05, + "loss": 1.7154, + "step": 19782 + }, + { + "epoch": 6.0721301411909145, + "grad_norm": 0.20836392045021057, + "learning_rate": 3.529413561667405e-05, + "loss": 1.7451, + "step": 19783 + }, + { + "epoch": 6.07243707796194, + "grad_norm": 0.20521731674671173, + "learning_rate": 3.5289384980658016e-05, + "loss": 1.7008, + "step": 19784 + }, + { + "epoch": 6.072744014732965, + "grad_norm": 0.22885540127754211, + "learning_rate": 3.528463449002092e-05, + "loss": 1.7605, + "step": 19785 + }, + { + "epoch": 6.0730509515039905, + "grad_norm": 0.27740219235420227, + "learning_rate": 3.5279884144809664e-05, + "loss": 1.7816, + "step": 19786 + }, 
+ { + "epoch": 6.073357888275015, + "grad_norm": 0.24747557938098907, + "learning_rate": 3.527513394507124e-05, + "loss": 1.7207, + "step": 19787 + }, + { + "epoch": 6.07366482504604, + "grad_norm": 0.20127782225608826, + "learning_rate": 3.527038389085256e-05, + "loss": 1.702, + "step": 19788 + }, + { + "epoch": 6.073971761817066, + "grad_norm": 0.20683316886425018, + "learning_rate": 3.5265633982200595e-05, + "loss": 1.7022, + "step": 19789 + }, + { + "epoch": 6.074278698588091, + "grad_norm": 0.17829765379428864, + "learning_rate": 3.5260884219162256e-05, + "loss": 1.7099, + "step": 19790 + }, + { + "epoch": 6.074585635359116, + "grad_norm": 0.256964772939682, + "learning_rate": 3.525613460178452e-05, + "loss": 1.7226, + "step": 19791 + }, + { + "epoch": 6.074892572130141, + "grad_norm": 0.22840122878551483, + "learning_rate": 3.525138513011428e-05, + "loss": 1.7738, + "step": 19792 + }, + { + "epoch": 6.075199508901166, + "grad_norm": 0.18988655507564545, + "learning_rate": 3.52466358041985e-05, + "loss": 1.6775, + "step": 19793 + }, + { + "epoch": 6.0755064456721914, + "grad_norm": 0.21857139468193054, + "learning_rate": 3.524188662408411e-05, + "loss": 1.7596, + "step": 19794 + }, + { + "epoch": 6.075813382443217, + "grad_norm": 0.22910535335540771, + "learning_rate": 3.523713758981807e-05, + "loss": 1.7969, + "step": 19795 + }, + { + "epoch": 6.076120319214242, + "grad_norm": 0.20885716378688812, + "learning_rate": 3.523238870144726e-05, + "loss": 1.7407, + "step": 19796 + }, + { + "epoch": 6.0764272559852675, + "grad_norm": 0.2056209295988083, + "learning_rate": 3.5227639959018666e-05, + "loss": 1.759, + "step": 19797 + }, + { + "epoch": 6.076734192756292, + "grad_norm": 0.17485356330871582, + "learning_rate": 3.522289136257917e-05, + "loss": 1.6988, + "step": 19798 + }, + { + "epoch": 6.077041129527317, + "grad_norm": 0.2103404402732849, + "learning_rate": 3.521814291217573e-05, + "loss": 1.766, + "step": 19799 + }, + { + "epoch": 6.077348066298343, + 
"grad_norm": 0.21852105855941772, + "learning_rate": 3.521339460785528e-05, + "loss": 1.7435, + "step": 19800 + }, + { + "epoch": 6.077655003069368, + "grad_norm": 0.21578362584114075, + "learning_rate": 3.520864644966471e-05, + "loss": 1.7281, + "step": 19801 + }, + { + "epoch": 6.077961939840393, + "grad_norm": 0.20405036211013794, + "learning_rate": 3.520389843765099e-05, + "loss": 1.7367, + "step": 19802 + }, + { + "epoch": 6.078268876611418, + "grad_norm": 0.2578286826610565, + "learning_rate": 3.5199150571860996e-05, + "loss": 1.7625, + "step": 19803 + }, + { + "epoch": 6.078575813382443, + "grad_norm": 0.240324467420578, + "learning_rate": 3.519440285234168e-05, + "loss": 1.6979, + "step": 19804 + }, + { + "epoch": 6.078882750153468, + "grad_norm": 0.220765620470047, + "learning_rate": 3.5189655279139935e-05, + "loss": 1.7679, + "step": 19805 + }, + { + "epoch": 6.079189686924494, + "grad_norm": 0.2731996774673462, + "learning_rate": 3.518490785230273e-05, + "loss": 1.6723, + "step": 19806 + }, + { + "epoch": 6.079496623695519, + "grad_norm": 0.2593478262424469, + "learning_rate": 3.518016057187692e-05, + "loss": 1.7232, + "step": 19807 + }, + { + "epoch": 6.0798035604665435, + "grad_norm": 0.34642404317855835, + "learning_rate": 3.517541343790947e-05, + "loss": 1.8265, + "step": 19808 + }, + { + "epoch": 6.080110497237569, + "grad_norm": 0.3187299370765686, + "learning_rate": 3.5170666450447255e-05, + "loss": 1.6847, + "step": 19809 + }, + { + "epoch": 6.080417434008594, + "grad_norm": 0.20413202047348022, + "learning_rate": 3.5165919609537215e-05, + "loss": 1.6533, + "step": 19810 + }, + { + "epoch": 6.0807243707796195, + "grad_norm": 0.2753545343875885, + "learning_rate": 3.516117291522625e-05, + "loss": 1.7491, + "step": 19811 + }, + { + "epoch": 6.081031307550645, + "grad_norm": 0.20174793899059296, + "learning_rate": 3.515642636756128e-05, + "loss": 1.6902, + "step": 19812 + }, + { + "epoch": 6.08133824432167, + "grad_norm": 0.22567492723464966, + 
"learning_rate": 3.515167996658919e-05, + "loss": 1.7165, + "step": 19813 + }, + { + "epoch": 6.081645181092695, + "grad_norm": 0.2115732729434967, + "learning_rate": 3.514693371235692e-05, + "loss": 1.6888, + "step": 19814 + }, + { + "epoch": 6.08195211786372, + "grad_norm": 0.2141808122396469, + "learning_rate": 3.514218760491134e-05, + "loss": 1.7152, + "step": 19815 + }, + { + "epoch": 6.082259054634745, + "grad_norm": 0.19767558574676514, + "learning_rate": 3.513744164429938e-05, + "loss": 1.6926, + "step": 19816 + }, + { + "epoch": 6.082565991405771, + "grad_norm": 0.20220023393630981, + "learning_rate": 3.5132695830567944e-05, + "loss": 1.6727, + "step": 19817 + }, + { + "epoch": 6.082872928176796, + "grad_norm": 0.19589759409427643, + "learning_rate": 3.5127950163763896e-05, + "loss": 1.7545, + "step": 19818 + }, + { + "epoch": 6.08317986494782, + "grad_norm": 0.21303611993789673, + "learning_rate": 3.512320464393418e-05, + "loss": 1.753, + "step": 19819 + }, + { + "epoch": 6.083486801718846, + "grad_norm": 0.19438377022743225, + "learning_rate": 3.511845927112566e-05, + "loss": 1.7022, + "step": 19820 + }, + { + "epoch": 6.083793738489871, + "grad_norm": 0.21282976865768433, + "learning_rate": 3.511371404538526e-05, + "loss": 1.7099, + "step": 19821 + }, + { + "epoch": 6.084100675260896, + "grad_norm": 0.1874496042728424, + "learning_rate": 3.5108968966759846e-05, + "loss": 1.7033, + "step": 19822 + }, + { + "epoch": 6.084407612031922, + "grad_norm": 0.21199075877666473, + "learning_rate": 3.510422403529636e-05, + "loss": 1.7088, + "step": 19823 + }, + { + "epoch": 6.084714548802946, + "grad_norm": 0.21847110986709595, + "learning_rate": 3.5099479251041634e-05, + "loss": 1.7395, + "step": 19824 + }, + { + "epoch": 6.0850214855739715, + "grad_norm": 0.201395645737648, + "learning_rate": 3.509473461404261e-05, + "loss": 1.7522, + "step": 19825 + }, + { + "epoch": 6.085328422344997, + "grad_norm": 0.19637656211853027, + "learning_rate": 
3.5089990124346135e-05, + "loss": 1.6774, + "step": 19826 + }, + { + "epoch": 6.085635359116022, + "grad_norm": 0.25918442010879517, + "learning_rate": 3.5085245781999124e-05, + "loss": 1.7704, + "step": 19827 + }, + { + "epoch": 6.0859422958870475, + "grad_norm": 0.21271947026252747, + "learning_rate": 3.508050158704844e-05, + "loss": 1.6902, + "step": 19828 + }, + { + "epoch": 6.086249232658073, + "grad_norm": 0.2065698802471161, + "learning_rate": 3.5075757539541024e-05, + "loss": 1.7945, + "step": 19829 + }, + { + "epoch": 6.086556169429097, + "grad_norm": 0.20247824490070343, + "learning_rate": 3.5071013639523684e-05, + "loss": 1.7532, + "step": 19830 + }, + { + "epoch": 6.086863106200123, + "grad_norm": 0.19705431163311005, + "learning_rate": 3.506626988704336e-05, + "loss": 1.6353, + "step": 19831 + }, + { + "epoch": 6.087170042971148, + "grad_norm": 0.20158523321151733, + "learning_rate": 3.5061526282146886e-05, + "loss": 1.6596, + "step": 19832 + }, + { + "epoch": 6.087476979742173, + "grad_norm": 0.19492848217487335, + "learning_rate": 3.505678282488118e-05, + "loss": 1.7107, + "step": 19833 + }, + { + "epoch": 6.087783916513199, + "grad_norm": 0.2403736114501953, + "learning_rate": 3.505203951529312e-05, + "loss": 1.7456, + "step": 19834 + }, + { + "epoch": 6.088090853284223, + "grad_norm": 0.25649771094322205, + "learning_rate": 3.504729635342954e-05, + "loss": 1.7513, + "step": 19835 + }, + { + "epoch": 6.088397790055248, + "grad_norm": 0.20172113180160522, + "learning_rate": 3.504255333933736e-05, + "loss": 1.7737, + "step": 19836 + }, + { + "epoch": 6.088704726826274, + "grad_norm": 0.2715936303138733, + "learning_rate": 3.5037810473063414e-05, + "loss": 1.759, + "step": 19837 + }, + { + "epoch": 6.089011663597299, + "grad_norm": 0.23145076632499695, + "learning_rate": 3.503306775465461e-05, + "loss": 1.7811, + "step": 19838 + }, + { + "epoch": 6.089318600368324, + "grad_norm": 0.1953691691160202, + "learning_rate": 3.502832518415778e-05, + "loss": 
1.752, + "step": 19839 + }, + { + "epoch": 6.08962553713935, + "grad_norm": 0.1927584707736969, + "learning_rate": 3.502358276161986e-05, + "loss": 1.6865, + "step": 19840 + }, + { + "epoch": 6.089932473910374, + "grad_norm": 0.19294732809066772, + "learning_rate": 3.501884048708763e-05, + "loss": 1.6838, + "step": 19841 + }, + { + "epoch": 6.0902394106813995, + "grad_norm": 0.23351021111011505, + "learning_rate": 3.501409836060803e-05, + "loss": 1.8029, + "step": 19842 + }, + { + "epoch": 6.090546347452425, + "grad_norm": 0.21615718305110931, + "learning_rate": 3.5009356382227877e-05, + "loss": 1.7441, + "step": 19843 + }, + { + "epoch": 6.09085328422345, + "grad_norm": 0.19091549515724182, + "learning_rate": 3.500461455199405e-05, + "loss": 1.7056, + "step": 19844 + }, + { + "epoch": 6.0911602209944755, + "grad_norm": 0.21189090609550476, + "learning_rate": 3.499987286995341e-05, + "loss": 1.6853, + "step": 19845 + }, + { + "epoch": 6.0914671577655, + "grad_norm": 0.22545887529850006, + "learning_rate": 3.499513133615283e-05, + "loss": 1.7854, + "step": 19846 + }, + { + "epoch": 6.091774094536525, + "grad_norm": 0.21960650384426117, + "learning_rate": 3.4990389950639144e-05, + "loss": 1.7558, + "step": 19847 + }, + { + "epoch": 6.092081031307551, + "grad_norm": 0.20825782418251038, + "learning_rate": 3.4985648713459244e-05, + "loss": 1.7103, + "step": 19848 + }, + { + "epoch": 6.092387968078576, + "grad_norm": 0.20886415243148804, + "learning_rate": 3.498090762465993e-05, + "loss": 1.6897, + "step": 19849 + }, + { + "epoch": 6.092694904849601, + "grad_norm": 0.19306892156600952, + "learning_rate": 3.4976166684288115e-05, + "loss": 1.7506, + "step": 19850 + }, + { + "epoch": 6.093001841620626, + "grad_norm": 0.2178204357624054, + "learning_rate": 3.497142589239063e-05, + "loss": 1.6774, + "step": 19851 + }, + { + "epoch": 6.093308778391651, + "grad_norm": 0.1914307177066803, + "learning_rate": 3.4966685249014294e-05, + "loss": 1.7182, + "step": 19852 + }, + { + 
"epoch": 6.093615715162676, + "grad_norm": 0.22006092965602875, + "learning_rate": 3.496194475420602e-05, + "loss": 1.7209, + "step": 19853 + }, + { + "epoch": 6.093922651933702, + "grad_norm": 0.20621439814567566, + "learning_rate": 3.49572044080126e-05, + "loss": 1.7403, + "step": 19854 + }, + { + "epoch": 6.094229588704727, + "grad_norm": 0.24079272150993347, + "learning_rate": 3.495246421048091e-05, + "loss": 1.7619, + "step": 19855 + }, + { + "epoch": 6.094536525475752, + "grad_norm": 0.19073884189128876, + "learning_rate": 3.494772416165777e-05, + "loss": 1.6677, + "step": 19856 + }, + { + "epoch": 6.094843462246777, + "grad_norm": 0.18217229843139648, + "learning_rate": 3.494298426159007e-05, + "loss": 1.7162, + "step": 19857 + }, + { + "epoch": 6.095150399017802, + "grad_norm": 0.21901506185531616, + "learning_rate": 3.493824451032461e-05, + "loss": 1.7173, + "step": 19858 + }, + { + "epoch": 6.0954573357888275, + "grad_norm": 0.22156217694282532, + "learning_rate": 3.493350490790826e-05, + "loss": 1.8029, + "step": 19859 + }, + { + "epoch": 6.095764272559853, + "grad_norm": 0.1663675606250763, + "learning_rate": 3.4928765454387824e-05, + "loss": 1.7306, + "step": 19860 + }, + { + "epoch": 6.096071209330878, + "grad_norm": 0.19684657454490662, + "learning_rate": 3.4924026149810175e-05, + "loss": 1.6944, + "step": 19861 + }, + { + "epoch": 6.096378146101903, + "grad_norm": 0.19163468480110168, + "learning_rate": 3.4919286994222125e-05, + "loss": 1.7331, + "step": 19862 + }, + { + "epoch": 6.096685082872928, + "grad_norm": 0.20134083926677704, + "learning_rate": 3.491454798767054e-05, + "loss": 1.7365, + "step": 19863 + }, + { + "epoch": 6.096992019643953, + "grad_norm": 0.23877696692943573, + "learning_rate": 3.490980913020221e-05, + "loss": 1.753, + "step": 19864 + }, + { + "epoch": 6.097298956414979, + "grad_norm": 0.207699254155159, + "learning_rate": 3.490507042186402e-05, + "loss": 1.6835, + "step": 19865 + }, + { + "epoch": 6.097605893186004, + 
"grad_norm": 0.20608612895011902, + "learning_rate": 3.490033186270274e-05, + "loss": 1.7379, + "step": 19866 + }, + { + "epoch": 6.097912829957028, + "grad_norm": 0.25086313486099243, + "learning_rate": 3.489559345276524e-05, + "loss": 1.7692, + "step": 19867 + }, + { + "epoch": 6.098219766728054, + "grad_norm": 0.22025549411773682, + "learning_rate": 3.489085519209836e-05, + "loss": 1.6579, + "step": 19868 + }, + { + "epoch": 6.098526703499079, + "grad_norm": 0.23805730044841766, + "learning_rate": 3.4886117080748875e-05, + "loss": 1.7695, + "step": 19869 + }, + { + "epoch": 6.098833640270104, + "grad_norm": 0.23271869122982025, + "learning_rate": 3.4881379118763666e-05, + "loss": 1.7268, + "step": 19870 + }, + { + "epoch": 6.09914057704113, + "grad_norm": 0.21795618534088135, + "learning_rate": 3.4876641306189505e-05, + "loss": 1.6996, + "step": 19871 + }, + { + "epoch": 6.099447513812155, + "grad_norm": 0.22064761817455292, + "learning_rate": 3.487190364307326e-05, + "loss": 1.7032, + "step": 19872 + }, + { + "epoch": 6.0997544505831796, + "grad_norm": 0.23834183812141418, + "learning_rate": 3.4867166129461706e-05, + "loss": 1.6942, + "step": 19873 + }, + { + "epoch": 6.100061387354205, + "grad_norm": 0.21143686771392822, + "learning_rate": 3.486242876540171e-05, + "loss": 1.6904, + "step": 19874 + }, + { + "epoch": 6.10036832412523, + "grad_norm": 0.18099969625473022, + "learning_rate": 3.485769155094004e-05, + "loss": 1.6669, + "step": 19875 + }, + { + "epoch": 6.100675260896256, + "grad_norm": 0.25324884057044983, + "learning_rate": 3.4852954486123566e-05, + "loss": 1.7878, + "step": 19876 + }, + { + "epoch": 6.100982197667281, + "grad_norm": 0.2252139449119568, + "learning_rate": 3.4848217570999055e-05, + "loss": 1.7674, + "step": 19877 + }, + { + "epoch": 6.101289134438305, + "grad_norm": 0.19629882276058197, + "learning_rate": 3.4843480805613346e-05, + "loss": 1.6898, + "step": 19878 + }, + { + "epoch": 6.101596071209331, + "grad_norm": 
0.1858786642551422, + "learning_rate": 3.483874419001323e-05, + "loss": 1.6856, + "step": 19879 + }, + { + "epoch": 6.101903007980356, + "grad_norm": 0.1842946857213974, + "learning_rate": 3.483400772424555e-05, + "loss": 1.7229, + "step": 19880 + }, + { + "epoch": 6.102209944751381, + "grad_norm": 0.18981511890888214, + "learning_rate": 3.482927140835708e-05, + "loss": 1.75, + "step": 19881 + }, + { + "epoch": 6.102516881522407, + "grad_norm": 0.19914525747299194, + "learning_rate": 3.482453524239466e-05, + "loss": 1.7702, + "step": 19882 + }, + { + "epoch": 6.102823818293431, + "grad_norm": 0.1960345208644867, + "learning_rate": 3.481979922640507e-05, + "loss": 1.7189, + "step": 19883 + }, + { + "epoch": 6.1031307550644565, + "grad_norm": 0.20309221744537354, + "learning_rate": 3.48150633604351e-05, + "loss": 1.7888, + "step": 19884 + }, + { + "epoch": 6.103437691835482, + "grad_norm": 0.20090891420841217, + "learning_rate": 3.48103276445316e-05, + "loss": 1.8017, + "step": 19885 + }, + { + "epoch": 6.103744628606507, + "grad_norm": 0.22500385344028473, + "learning_rate": 3.480559207874133e-05, + "loss": 1.7061, + "step": 19886 + }, + { + "epoch": 6.1040515653775325, + "grad_norm": 0.22594885528087616, + "learning_rate": 3.480085666311113e-05, + "loss": 1.7659, + "step": 19887 + }, + { + "epoch": 6.104358502148558, + "grad_norm": 0.2769651710987091, + "learning_rate": 3.479612139768774e-05, + "loss": 1.7668, + "step": 19888 + }, + { + "epoch": 6.104665438919582, + "grad_norm": 0.24251700937747955, + "learning_rate": 3.4791386282518e-05, + "loss": 1.8068, + "step": 19889 + }, + { + "epoch": 6.104972375690608, + "grad_norm": 0.23325790464878082, + "learning_rate": 3.478665131764869e-05, + "loss": 1.7116, + "step": 19890 + }, + { + "epoch": 6.105279312461633, + "grad_norm": 0.19998812675476074, + "learning_rate": 3.478191650312663e-05, + "loss": 1.7116, + "step": 19891 + }, + { + "epoch": 6.105586249232658, + "grad_norm": 0.20933640003204346, + "learning_rate": 
3.4777181838998566e-05, + "loss": 1.7138, + "step": 19892 + }, + { + "epoch": 6.105893186003684, + "grad_norm": 0.24344035983085632, + "learning_rate": 3.477244732531134e-05, + "loss": 1.784, + "step": 19893 + }, + { + "epoch": 6.106200122774708, + "grad_norm": 0.2220575362443924, + "learning_rate": 3.4767712962111686e-05, + "loss": 1.7479, + "step": 19894 + }, + { + "epoch": 6.106507059545733, + "grad_norm": 0.2222832590341568, + "learning_rate": 3.476297874944644e-05, + "loss": 1.7278, + "step": 19895 + }, + { + "epoch": 6.106813996316759, + "grad_norm": 0.222265362739563, + "learning_rate": 3.4758244687362353e-05, + "loss": 1.7321, + "step": 19896 + }, + { + "epoch": 6.107120933087784, + "grad_norm": 0.2921304702758789, + "learning_rate": 3.475351077590625e-05, + "loss": 1.7848, + "step": 19897 + }, + { + "epoch": 6.107427869858809, + "grad_norm": 0.21015208959579468, + "learning_rate": 3.4748777015124856e-05, + "loss": 1.7987, + "step": 19898 + }, + { + "epoch": 6.107734806629834, + "grad_norm": 0.19510969519615173, + "learning_rate": 3.474404340506502e-05, + "loss": 1.7317, + "step": 19899 + }, + { + "epoch": 6.108041743400859, + "grad_norm": 0.21978609263896942, + "learning_rate": 3.473930994577348e-05, + "loss": 1.6943, + "step": 19900 + }, + { + "epoch": 6.1083486801718845, + "grad_norm": 0.1793510913848877, + "learning_rate": 3.4734576637297004e-05, + "loss": 1.6659, + "step": 19901 + }, + { + "epoch": 6.10865561694291, + "grad_norm": 0.2029319554567337, + "learning_rate": 3.4729843479682414e-05, + "loss": 1.7127, + "step": 19902 + }, + { + "epoch": 6.108962553713935, + "grad_norm": 0.2001914530992508, + "learning_rate": 3.472511047297644e-05, + "loss": 1.691, + "step": 19903 + }, + { + "epoch": 6.1092694904849605, + "grad_norm": 0.2194693237543106, + "learning_rate": 3.47203776172259e-05, + "loss": 1.7181, + "step": 19904 + }, + { + "epoch": 6.109576427255985, + "grad_norm": 0.1865277737379074, + "learning_rate": 3.4715644912477515e-05, + "loss": 1.6786, 
+ "step": 19905 + }, + { + "epoch": 6.10988336402701, + "grad_norm": 0.20574906468391418, + "learning_rate": 3.471091235877811e-05, + "loss": 1.7681, + "step": 19906 + }, + { + "epoch": 6.110190300798036, + "grad_norm": 0.21072493493556976, + "learning_rate": 3.470617995617441e-05, + "loss": 1.7494, + "step": 19907 + }, + { + "epoch": 6.110497237569061, + "grad_norm": 0.2411658763885498, + "learning_rate": 3.470144770471323e-05, + "loss": 1.7183, + "step": 19908 + }, + { + "epoch": 6.110804174340086, + "grad_norm": 0.19782759249210358, + "learning_rate": 3.4696715604441285e-05, + "loss": 1.6823, + "step": 19909 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.315026193857193, + "learning_rate": 3.469198365540539e-05, + "loss": 1.691, + "step": 19910 + }, + { + "epoch": 6.111418047882136, + "grad_norm": 0.19840773940086365, + "learning_rate": 3.468725185765226e-05, + "loss": 1.7413, + "step": 19911 + }, + { + "epoch": 6.111724984653161, + "grad_norm": 0.1813160926103592, + "learning_rate": 3.46825202112287e-05, + "loss": 1.7095, + "step": 19912 + }, + { + "epoch": 6.112031921424187, + "grad_norm": 0.21025459468364716, + "learning_rate": 3.467778871618145e-05, + "loss": 1.7783, + "step": 19913 + }, + { + "epoch": 6.112338858195212, + "grad_norm": 0.20088298618793488, + "learning_rate": 3.4673057372557265e-05, + "loss": 1.7671, + "step": 19914 + }, + { + "epoch": 6.112645794966237, + "grad_norm": 0.21919472515583038, + "learning_rate": 3.466832618040291e-05, + "loss": 1.7052, + "step": 19915 + }, + { + "epoch": 6.112952731737262, + "grad_norm": 0.19135436415672302, + "learning_rate": 3.466359513976516e-05, + "loss": 1.7862, + "step": 19916 + }, + { + "epoch": 6.113259668508287, + "grad_norm": 0.19943594932556152, + "learning_rate": 3.465886425069074e-05, + "loss": 1.6926, + "step": 19917 + }, + { + "epoch": 6.1135666052793125, + "grad_norm": 0.19390980899333954, + "learning_rate": 3.46541335132264e-05, + "loss": 1.761, + "step": 19918 + }, + { + "epoch": 
6.113873542050338, + "grad_norm": 0.22745995223522186, + "learning_rate": 3.4649402927418935e-05, + "loss": 1.7147, + "step": 19919 + }, + { + "epoch": 6.114180478821363, + "grad_norm": 0.17792920768260956, + "learning_rate": 3.4644672493315045e-05, + "loss": 1.6946, + "step": 19920 + }, + { + "epoch": 6.114487415592388, + "grad_norm": 0.2009986788034439, + "learning_rate": 3.463994221096152e-05, + "loss": 1.6977, + "step": 19921 + }, + { + "epoch": 6.114794352363413, + "grad_norm": 0.2448386251926422, + "learning_rate": 3.4635212080405066e-05, + "loss": 1.7169, + "step": 19922 + }, + { + "epoch": 6.115101289134438, + "grad_norm": 0.21506112813949585, + "learning_rate": 3.463048210169247e-05, + "loss": 1.6632, + "step": 19923 + }, + { + "epoch": 6.115408225905464, + "grad_norm": 0.1805233359336853, + "learning_rate": 3.462575227487045e-05, + "loss": 1.6742, + "step": 19924 + }, + { + "epoch": 6.115715162676489, + "grad_norm": 0.20023848116397858, + "learning_rate": 3.4621022599985766e-05, + "loss": 1.7106, + "step": 19925 + }, + { + "epoch": 6.116022099447513, + "grad_norm": 0.20388077199459076, + "learning_rate": 3.461629307708513e-05, + "loss": 1.7065, + "step": 19926 + }, + { + "epoch": 6.116329036218539, + "grad_norm": 0.23886005580425262, + "learning_rate": 3.461156370621533e-05, + "loss": 1.7177, + "step": 19927 + }, + { + "epoch": 6.116635972989564, + "grad_norm": 0.2054048627614975, + "learning_rate": 3.460683448742306e-05, + "loss": 1.6773, + "step": 19928 + }, + { + "epoch": 6.116942909760589, + "grad_norm": 0.1909634917974472, + "learning_rate": 3.460210542075508e-05, + "loss": 1.7562, + "step": 19929 + }, + { + "epoch": 6.117249846531615, + "grad_norm": 0.20221595466136932, + "learning_rate": 3.459737650625812e-05, + "loss": 1.7948, + "step": 19930 + }, + { + "epoch": 6.11755678330264, + "grad_norm": 0.25445356965065, + "learning_rate": 3.459264774397891e-05, + "loss": 1.7964, + "step": 19931 + }, + { + "epoch": 6.1178637200736645, + "grad_norm": 
0.2227735072374344, + "learning_rate": 3.4587919133964176e-05, + "loss": 1.7833, + "step": 19932 + }, + { + "epoch": 6.11817065684469, + "grad_norm": 0.20591853559017181, + "learning_rate": 3.458319067626068e-05, + "loss": 1.7535, + "step": 19933 + }, + { + "epoch": 6.118477593615715, + "grad_norm": 0.22087402641773224, + "learning_rate": 3.4578462370915115e-05, + "loss": 1.7228, + "step": 19934 + }, + { + "epoch": 6.1187845303867405, + "grad_norm": 0.234156996011734, + "learning_rate": 3.457373421797423e-05, + "loss": 1.7167, + "step": 19935 + }, + { + "epoch": 6.119091467157766, + "grad_norm": 0.209685817360878, + "learning_rate": 3.4569006217484746e-05, + "loss": 1.6633, + "step": 19936 + }, + { + "epoch": 6.11939840392879, + "grad_norm": 0.18499237298965454, + "learning_rate": 3.4564278369493366e-05, + "loss": 1.6769, + "step": 19937 + }, + { + "epoch": 6.119705340699816, + "grad_norm": 0.2600767910480499, + "learning_rate": 3.455955067404686e-05, + "loss": 1.7788, + "step": 19938 + }, + { + "epoch": 6.120012277470841, + "grad_norm": 0.21499377489089966, + "learning_rate": 3.455482313119191e-05, + "loss": 1.789, + "step": 19939 + }, + { + "epoch": 6.120319214241866, + "grad_norm": 0.19618432223796844, + "learning_rate": 3.455009574097527e-05, + "loss": 1.7162, + "step": 19940 + }, + { + "epoch": 6.120626151012892, + "grad_norm": 0.23219916224479675, + "learning_rate": 3.4545368503443616e-05, + "loss": 1.7871, + "step": 19941 + }, + { + "epoch": 6.120933087783916, + "grad_norm": 0.22315794229507446, + "learning_rate": 3.45406414186437e-05, + "loss": 1.6944, + "step": 19942 + }, + { + "epoch": 6.121240024554941, + "grad_norm": 0.22536693513393402, + "learning_rate": 3.453591448662221e-05, + "loss": 1.7727, + "step": 19943 + }, + { + "epoch": 6.121546961325967, + "grad_norm": 0.21811100840568542, + "learning_rate": 3.45311877074259e-05, + "loss": 1.7037, + "step": 19944 + }, + { + "epoch": 6.121853898096992, + "grad_norm": 0.1957094967365265, + "learning_rate": 
3.452646108110145e-05, + "loss": 1.7734, + "step": 19945 + }, + { + "epoch": 6.122160834868017, + "grad_norm": 0.185706228017807, + "learning_rate": 3.452173460769559e-05, + "loss": 1.6715, + "step": 19946 + }, + { + "epoch": 6.122467771639043, + "grad_norm": 0.21081562340259552, + "learning_rate": 3.4517008287255005e-05, + "loss": 1.7798, + "step": 19947 + }, + { + "epoch": 6.122774708410067, + "grad_norm": 0.24175535142421722, + "learning_rate": 3.451228211982642e-05, + "loss": 1.7111, + "step": 19948 + }, + { + "epoch": 6.1230816451810925, + "grad_norm": 0.244124636054039, + "learning_rate": 3.450755610545654e-05, + "loss": 1.7263, + "step": 19949 + }, + { + "epoch": 6.123388581952118, + "grad_norm": 0.21109984815120697, + "learning_rate": 3.45028302441921e-05, + "loss": 1.7556, + "step": 19950 + }, + { + "epoch": 6.123695518723143, + "grad_norm": 0.21721722185611725, + "learning_rate": 3.449810453607976e-05, + "loss": 1.7416, + "step": 19951 + }, + { + "epoch": 6.1240024554941686, + "grad_norm": 0.18695317208766937, + "learning_rate": 3.4493378981166216e-05, + "loss": 1.7128, + "step": 19952 + }, + { + "epoch": 6.124309392265193, + "grad_norm": 0.19175554811954498, + "learning_rate": 3.4488653579498206e-05, + "loss": 1.7014, + "step": 19953 + }, + { + "epoch": 6.124616329036218, + "grad_norm": 0.22297006845474243, + "learning_rate": 3.4483928331122405e-05, + "loss": 1.7231, + "step": 19954 + }, + { + "epoch": 6.124923265807244, + "grad_norm": 0.2407974898815155, + "learning_rate": 3.447920323608553e-05, + "loss": 1.7354, + "step": 19955 + }, + { + "epoch": 6.125230202578269, + "grad_norm": 0.19767232239246368, + "learning_rate": 3.447447829443425e-05, + "loss": 1.7487, + "step": 19956 + }, + { + "epoch": 6.125537139349294, + "grad_norm": 0.20033477246761322, + "learning_rate": 3.446975350621529e-05, + "loss": 1.7232, + "step": 19957 + }, + { + "epoch": 6.12584407612032, + "grad_norm": 0.20310243964195251, + "learning_rate": 3.446502887147532e-05, + "loss": 
1.6946, + "step": 19958 + }, + { + "epoch": 6.126151012891344, + "grad_norm": 0.2322724461555481, + "learning_rate": 3.446030439026104e-05, + "loss": 1.7071, + "step": 19959 + }, + { + "epoch": 6.1264579496623695, + "grad_norm": 0.24134255945682526, + "learning_rate": 3.445558006261914e-05, + "loss": 1.7259, + "step": 19960 + }, + { + "epoch": 6.126764886433395, + "grad_norm": 0.22821731865406036, + "learning_rate": 3.445085588859632e-05, + "loss": 1.7488, + "step": 19961 + }, + { + "epoch": 6.12707182320442, + "grad_norm": 0.258241206407547, + "learning_rate": 3.444613186823924e-05, + "loss": 1.7403, + "step": 19962 + }, + { + "epoch": 6.1273787599754455, + "grad_norm": 0.18758481740951538, + "learning_rate": 3.4441408001594625e-05, + "loss": 1.7079, + "step": 19963 + }, + { + "epoch": 6.12768569674647, + "grad_norm": 0.24032682180404663, + "learning_rate": 3.443668428870911e-05, + "loss": 1.7377, + "step": 19964 + }, + { + "epoch": 6.127992633517495, + "grad_norm": 0.24468545615673065, + "learning_rate": 3.4431960729629406e-05, + "loss": 1.7724, + "step": 19965 + }, + { + "epoch": 6.128299570288521, + "grad_norm": 0.23840154707431793, + "learning_rate": 3.4427237324402197e-05, + "loss": 1.7813, + "step": 19966 + }, + { + "epoch": 6.128606507059546, + "grad_norm": 0.2476109117269516, + "learning_rate": 3.4422514073074165e-05, + "loss": 1.7578, + "step": 19967 + }, + { + "epoch": 6.128913443830571, + "grad_norm": 0.2109041064977646, + "learning_rate": 3.4417790975691974e-05, + "loss": 1.6917, + "step": 19968 + }, + { + "epoch": 6.129220380601596, + "grad_norm": 0.21841584146022797, + "learning_rate": 3.4413068032302296e-05, + "loss": 1.7511, + "step": 19969 + }, + { + "epoch": 6.129527317372621, + "grad_norm": 0.2111930102109909, + "learning_rate": 3.440834524295182e-05, + "loss": 1.7194, + "step": 19970 + }, + { + "epoch": 6.129834254143646, + "grad_norm": 0.21868006885051727, + "learning_rate": 3.440362260768721e-05, + "loss": 1.7933, + "step": 19971 + }, + { + 
"epoch": 6.130141190914672, + "grad_norm": 0.19846780598163605, + "learning_rate": 3.439890012655516e-05, + "loss": 1.6985, + "step": 19972 + }, + { + "epoch": 6.130448127685697, + "grad_norm": 0.218460813164711, + "learning_rate": 3.439417779960231e-05, + "loss": 1.7205, + "step": 19973 + }, + { + "epoch": 6.1307550644567215, + "grad_norm": 0.22504402697086334, + "learning_rate": 3.438945562687535e-05, + "loss": 1.7437, + "step": 19974 + }, + { + "epoch": 6.131062001227747, + "grad_norm": 0.35414671897888184, + "learning_rate": 3.438473360842093e-05, + "loss": 1.7641, + "step": 19975 + }, + { + "epoch": 6.131368937998772, + "grad_norm": 0.21090710163116455, + "learning_rate": 3.4380011744285726e-05, + "loss": 1.6817, + "step": 19976 + }, + { + "epoch": 6.1316758747697975, + "grad_norm": 0.19118748605251312, + "learning_rate": 3.437529003451639e-05, + "loss": 1.694, + "step": 19977 + }, + { + "epoch": 6.131982811540823, + "grad_norm": 0.2341139018535614, + "learning_rate": 3.437056847915962e-05, + "loss": 1.781, + "step": 19978 + }, + { + "epoch": 6.132289748311848, + "grad_norm": 0.19120962917804718, + "learning_rate": 3.4365847078262033e-05, + "loss": 1.6974, + "step": 19979 + }, + { + "epoch": 6.132596685082873, + "grad_norm": 0.1998066008090973, + "learning_rate": 3.436112583187033e-05, + "loss": 1.6933, + "step": 19980 + }, + { + "epoch": 6.132903621853898, + "grad_norm": 0.19839663803577423, + "learning_rate": 3.4356404740031123e-05, + "loss": 1.6867, + "step": 19981 + }, + { + "epoch": 6.133210558624923, + "grad_norm": 0.19892877340316772, + "learning_rate": 3.4351683802791114e-05, + "loss": 1.7349, + "step": 19982 + }, + { + "epoch": 6.133517495395949, + "grad_norm": 0.23215502500534058, + "learning_rate": 3.434696302019692e-05, + "loss": 1.7411, + "step": 19983 + }, + { + "epoch": 6.133824432166974, + "grad_norm": 0.21246971189975739, + "learning_rate": 3.4342242392295225e-05, + "loss": 1.6918, + "step": 19984 + }, + { + "epoch": 6.134131368937998, + 
"grad_norm": 0.18585935235023499, + "learning_rate": 3.4337521919132675e-05, + "loss": 1.71, + "step": 19985 + }, + { + "epoch": 6.134438305709024, + "grad_norm": 0.24194715917110443, + "learning_rate": 3.4332801600755896e-05, + "loss": 1.7314, + "step": 19986 + }, + { + "epoch": 6.134745242480049, + "grad_norm": 0.19925665855407715, + "learning_rate": 3.432808143721156e-05, + "loss": 1.7425, + "step": 19987 + }, + { + "epoch": 6.135052179251074, + "grad_norm": 0.22253449261188507, + "learning_rate": 3.43233614285463e-05, + "loss": 1.702, + "step": 19988 + }, + { + "epoch": 6.1353591160221, + "grad_norm": 0.22180478274822235, + "learning_rate": 3.4318641574806796e-05, + "loss": 1.6659, + "step": 19989 + }, + { + "epoch": 6.135666052793125, + "grad_norm": 0.19818264245986938, + "learning_rate": 3.431392187603964e-05, + "loss": 1.8057, + "step": 19990 + }, + { + "epoch": 6.1359729895641495, + "grad_norm": 0.34630170464515686, + "learning_rate": 3.4309202332291526e-05, + "loss": 1.7233, + "step": 19991 + }, + { + "epoch": 6.136279926335175, + "grad_norm": 0.2633006274700165, + "learning_rate": 3.430448294360905e-05, + "loss": 1.7421, + "step": 19992 + }, + { + "epoch": 6.1365868631062, + "grad_norm": 0.1976388394832611, + "learning_rate": 3.429976371003888e-05, + "loss": 1.7474, + "step": 19993 + }, + { + "epoch": 6.1368937998772255, + "grad_norm": 0.2386583834886551, + "learning_rate": 3.429504463162764e-05, + "loss": 1.7026, + "step": 19994 + }, + { + "epoch": 6.137200736648251, + "grad_norm": 0.20853812992572784, + "learning_rate": 3.4290325708422e-05, + "loss": 1.7846, + "step": 19995 + }, + { + "epoch": 6.137507673419275, + "grad_norm": 0.24667194485664368, + "learning_rate": 3.428560694046854e-05, + "loss": 1.6446, + "step": 19996 + }, + { + "epoch": 6.137814610190301, + "grad_norm": 0.24396342039108276, + "learning_rate": 3.428088832781394e-05, + "loss": 1.7368, + "step": 19997 + }, + { + "epoch": 6.138121546961326, + "grad_norm": 0.1958172619342804, + 
"learning_rate": 3.4276169870504804e-05, + "loss": 1.7197, + "step": 19998 + }, + { + "epoch": 6.138428483732351, + "grad_norm": 0.21487464010715485, + "learning_rate": 3.427145156858778e-05, + "loss": 1.7318, + "step": 19999 + }, + { + "epoch": 6.138735420503377, + "grad_norm": 0.2152775675058365, + "learning_rate": 3.4266733422109476e-05, + "loss": 1.7924, + "step": 20000 + }, + { + "epoch": 6.139042357274401, + "grad_norm": 0.17151346802711487, + "learning_rate": 3.426201543111656e-05, + "loss": 1.6915, + "step": 20001 + }, + { + "epoch": 6.139349294045426, + "grad_norm": 0.22197338938713074, + "learning_rate": 3.425729759565563e-05, + "loss": 1.8028, + "step": 20002 + }, + { + "epoch": 6.139656230816452, + "grad_norm": 0.23111973702907562, + "learning_rate": 3.42525799157733e-05, + "loss": 1.7515, + "step": 20003 + }, + { + "epoch": 6.139963167587477, + "grad_norm": 0.2829805314540863, + "learning_rate": 3.42478623915162e-05, + "loss": 1.8379, + "step": 20004 + }, + { + "epoch": 6.140270104358502, + "grad_norm": 0.23467600345611572, + "learning_rate": 3.424314502293096e-05, + "loss": 1.7755, + "step": 20005 + }, + { + "epoch": 6.140577041129528, + "grad_norm": 0.2047930657863617, + "learning_rate": 3.42384278100642e-05, + "loss": 1.7198, + "step": 20006 + }, + { + "epoch": 6.140883977900552, + "grad_norm": 0.1893673986196518, + "learning_rate": 3.423371075296253e-05, + "loss": 1.7318, + "step": 20007 + }, + { + "epoch": 6.1411909146715775, + "grad_norm": 0.21514710783958435, + "learning_rate": 3.422899385167259e-05, + "loss": 1.7499, + "step": 20008 + }, + { + "epoch": 6.141497851442603, + "grad_norm": 0.20030297338962555, + "learning_rate": 3.422427710624095e-05, + "loss": 1.7109, + "step": 20009 + }, + { + "epoch": 6.141804788213628, + "grad_norm": 0.23581266403198242, + "learning_rate": 3.421956051671426e-05, + "loss": 1.7834, + "step": 20010 + }, + { + "epoch": 6.1421117249846535, + "grad_norm": 0.22492484748363495, + "learning_rate": 3.421484408313911e-05, 
+ "loss": 1.785, + "step": 20011 + }, + { + "epoch": 6.142418661755678, + "grad_norm": 0.34137019515037537, + "learning_rate": 3.421012780556215e-05, + "loss": 1.8101, + "step": 20012 + }, + { + "epoch": 6.142725598526703, + "grad_norm": 0.28489169478416443, + "learning_rate": 3.420541168402994e-05, + "loss": 1.7945, + "step": 20013 + }, + { + "epoch": 6.143032535297729, + "grad_norm": 0.259362131357193, + "learning_rate": 3.420069571858913e-05, + "loss": 1.7011, + "step": 20014 + }, + { + "epoch": 6.143339472068754, + "grad_norm": 0.3628309667110443, + "learning_rate": 3.419597990928628e-05, + "loss": 1.8273, + "step": 20015 + }, + { + "epoch": 6.143646408839779, + "grad_norm": 0.22306841611862183, + "learning_rate": 3.419126425616803e-05, + "loss": 1.7447, + "step": 20016 + }, + { + "epoch": 6.143953345610804, + "grad_norm": 0.36336812376976013, + "learning_rate": 3.4186548759280964e-05, + "loss": 1.7076, + "step": 20017 + }, + { + "epoch": 6.144260282381829, + "grad_norm": 0.23167413473129272, + "learning_rate": 3.418183341867172e-05, + "loss": 1.6924, + "step": 20018 + }, + { + "epoch": 6.144567219152854, + "grad_norm": 0.2541113495826721, + "learning_rate": 3.417711823438686e-05, + "loss": 1.755, + "step": 20019 + }, + { + "epoch": 6.14487415592388, + "grad_norm": 0.3733784854412079, + "learning_rate": 3.4172403206472975e-05, + "loss": 1.7087, + "step": 20020 + }, + { + "epoch": 6.145181092694905, + "grad_norm": 0.1940508335828781, + "learning_rate": 3.416768833497669e-05, + "loss": 1.717, + "step": 20021 + }, + { + "epoch": 6.14548802946593, + "grad_norm": 0.2707524001598358, + "learning_rate": 3.416297361994457e-05, + "loss": 1.7422, + "step": 20022 + }, + { + "epoch": 6.145794966236955, + "grad_norm": 0.25535452365875244, + "learning_rate": 3.415825906142326e-05, + "loss": 1.6915, + "step": 20023 + }, + { + "epoch": 6.14610190300798, + "grad_norm": 0.24094220995903015, + "learning_rate": 3.415354465945929e-05, + "loss": 1.7192, + "step": 20024 + }, + { + 
"epoch": 6.1464088397790055, + "grad_norm": 0.28329676389694214, + "learning_rate": 3.4148830414099306e-05, + "loss": 1.7272, + "step": 20025 + }, + { + "epoch": 6.146715776550031, + "grad_norm": 0.217180535197258, + "learning_rate": 3.414411632538984e-05, + "loss": 1.7195, + "step": 20026 + }, + { + "epoch": 6.147022713321056, + "grad_norm": 0.22693867981433868, + "learning_rate": 3.413940239337753e-05, + "loss": 1.6889, + "step": 20027 + }, + { + "epoch": 6.147329650092081, + "grad_norm": 0.30376315116882324, + "learning_rate": 3.413468861810892e-05, + "loss": 1.7741, + "step": 20028 + }, + { + "epoch": 6.147636586863106, + "grad_norm": 0.1928185671567917, + "learning_rate": 3.412997499963065e-05, + "loss": 1.6986, + "step": 20029 + }, + { + "epoch": 6.147943523634131, + "grad_norm": 0.260929137468338, + "learning_rate": 3.412526153798924e-05, + "loss": 1.7044, + "step": 20030 + }, + { + "epoch": 6.148250460405157, + "grad_norm": 0.23274847865104675, + "learning_rate": 3.4120548233231326e-05, + "loss": 1.7626, + "step": 20031 + }, + { + "epoch": 6.148557397176182, + "grad_norm": 0.2389308512210846, + "learning_rate": 3.411583508540344e-05, + "loss": 1.71, + "step": 20032 + }, + { + "epoch": 6.148864333947207, + "grad_norm": 0.2745562195777893, + "learning_rate": 3.411112209455219e-05, + "loss": 1.7144, + "step": 20033 + }, + { + "epoch": 6.149171270718232, + "grad_norm": 0.2369096428155899, + "learning_rate": 3.4106409260724135e-05, + "loss": 1.7879, + "step": 20034 + }, + { + "epoch": 6.149478207489257, + "grad_norm": 0.3103141486644745, + "learning_rate": 3.4101696583965874e-05, + "loss": 1.7862, + "step": 20035 + }, + { + "epoch": 6.149785144260282, + "grad_norm": 0.18625277280807495, + "learning_rate": 3.409698406432397e-05, + "loss": 1.7717, + "step": 20036 + }, + { + "epoch": 6.150092081031308, + "grad_norm": 0.2539508640766144, + "learning_rate": 3.409227170184497e-05, + "loss": 1.7023, + "step": 20037 + }, + { + "epoch": 6.150399017802333, + "grad_norm": 
0.2185351699590683, + "learning_rate": 3.4087559496575474e-05, + "loss": 1.7283, + "step": 20038 + }, + { + "epoch": 6.150705954573358, + "grad_norm": 0.21225227415561676, + "learning_rate": 3.408284744856204e-05, + "loss": 1.7055, + "step": 20039 + }, + { + "epoch": 6.151012891344383, + "grad_norm": 0.23623189330101013, + "learning_rate": 3.407813555785125e-05, + "loss": 1.6862, + "step": 20040 + }, + { + "epoch": 6.151319828115408, + "grad_norm": 0.19061312079429626, + "learning_rate": 3.4073423824489634e-05, + "loss": 1.7501, + "step": 20041 + }, + { + "epoch": 6.151626764886434, + "grad_norm": 0.22176402807235718, + "learning_rate": 3.4068712248523804e-05, + "loss": 1.7417, + "step": 20042 + }, + { + "epoch": 6.151933701657459, + "grad_norm": 0.20093770325183868, + "learning_rate": 3.406400083000028e-05, + "loss": 1.7283, + "step": 20043 + }, + { + "epoch": 6.152240638428483, + "grad_norm": 0.21968910098075867, + "learning_rate": 3.4059289568965635e-05, + "loss": 1.7187, + "step": 20044 + }, + { + "epoch": 6.152547575199509, + "grad_norm": 0.19038841128349304, + "learning_rate": 3.4054578465466435e-05, + "loss": 1.7131, + "step": 20045 + }, + { + "epoch": 6.152854511970534, + "grad_norm": 0.2239457368850708, + "learning_rate": 3.404986751954925e-05, + "loss": 1.7643, + "step": 20046 + }, + { + "epoch": 6.153161448741559, + "grad_norm": 0.2357017546892166, + "learning_rate": 3.404515673126061e-05, + "loss": 1.7196, + "step": 20047 + }, + { + "epoch": 6.153468385512585, + "grad_norm": 0.2633310556411743, + "learning_rate": 3.4040446100647104e-05, + "loss": 1.7613, + "step": 20048 + }, + { + "epoch": 6.153775322283609, + "grad_norm": 0.28470975160598755, + "learning_rate": 3.403573562775524e-05, + "loss": 1.7564, + "step": 20049 + }, + { + "epoch": 6.1540822590546345, + "grad_norm": 0.37435805797576904, + "learning_rate": 3.40310253126316e-05, + "loss": 1.8365, + "step": 20050 + }, + { + "epoch": 6.15438919582566, + "grad_norm": 0.1706259697675705, + 
"learning_rate": 3.402631515532272e-05, + "loss": 1.7373, + "step": 20051 + }, + { + "epoch": 6.154696132596685, + "grad_norm": 0.30885928869247437, + "learning_rate": 3.402160515587518e-05, + "loss": 1.7152, + "step": 20052 + }, + { + "epoch": 6.1550030693677105, + "grad_norm": 0.21448500454425812, + "learning_rate": 3.40168953143355e-05, + "loss": 1.7463, + "step": 20053 + }, + { + "epoch": 6.155310006138736, + "grad_norm": 0.23774586617946625, + "learning_rate": 3.4012185630750204e-05, + "loss": 1.7268, + "step": 20054 + }, + { + "epoch": 6.15561694290976, + "grad_norm": 0.1943385899066925, + "learning_rate": 3.400747610516588e-05, + "loss": 1.6578, + "step": 20055 + }, + { + "epoch": 6.155923879680786, + "grad_norm": 0.27488210797309875, + "learning_rate": 3.400276673762903e-05, + "loss": 1.8204, + "step": 20056 + }, + { + "epoch": 6.156230816451811, + "grad_norm": 0.1871461570262909, + "learning_rate": 3.3998057528186244e-05, + "loss": 1.6775, + "step": 20057 + }, + { + "epoch": 6.156537753222836, + "grad_norm": 0.23566775023937225, + "learning_rate": 3.399334847688401e-05, + "loss": 1.7089, + "step": 20058 + }, + { + "epoch": 6.156844689993862, + "grad_norm": 0.26842471957206726, + "learning_rate": 3.398863958376891e-05, + "loss": 1.7554, + "step": 20059 + }, + { + "epoch": 6.157151626764886, + "grad_norm": 0.19267809391021729, + "learning_rate": 3.3983930848887435e-05, + "loss": 1.6709, + "step": 20060 + }, + { + "epoch": 6.157458563535911, + "grad_norm": 0.21130084991455078, + "learning_rate": 3.3979222272286156e-05, + "loss": 1.7312, + "step": 20061 + }, + { + "epoch": 6.157765500306937, + "grad_norm": 0.2322172224521637, + "learning_rate": 3.397451385401158e-05, + "loss": 1.8069, + "step": 20062 + }, + { + "epoch": 6.158072437077962, + "grad_norm": 0.21852418780326843, + "learning_rate": 3.396980559411027e-05, + "loss": 1.715, + "step": 20063 + }, + { + "epoch": 6.158379373848987, + "grad_norm": 0.21385829150676727, + "learning_rate": 
3.3965097492628714e-05, + "loss": 1.6804, + "step": 20064 + }, + { + "epoch": 6.158686310620013, + "grad_norm": 0.21639080345630646, + "learning_rate": 3.3960389549613494e-05, + "loss": 1.655, + "step": 20065 + }, + { + "epoch": 6.158993247391037, + "grad_norm": 0.19219942390918732, + "learning_rate": 3.395568176511107e-05, + "loss": 1.7325, + "step": 20066 + }, + { + "epoch": 6.1593001841620625, + "grad_norm": 0.21853557229042053, + "learning_rate": 3.3950974139168024e-05, + "loss": 1.7204, + "step": 20067 + }, + { + "epoch": 6.159607120933088, + "grad_norm": 0.24144381284713745, + "learning_rate": 3.3946266671830854e-05, + "loss": 1.754, + "step": 20068 + }, + { + "epoch": 6.159914057704113, + "grad_norm": 0.2014230340719223, + "learning_rate": 3.394155936314609e-05, + "loss": 1.6905, + "step": 20069 + }, + { + "epoch": 6.1602209944751385, + "grad_norm": 0.26940762996673584, + "learning_rate": 3.393685221316025e-05, + "loss": 1.729, + "step": 20070 + }, + { + "epoch": 6.160527931246163, + "grad_norm": 0.1937808394432068, + "learning_rate": 3.3932145221919843e-05, + "loss": 1.7492, + "step": 20071 + }, + { + "epoch": 6.160834868017188, + "grad_norm": 0.2586243450641632, + "learning_rate": 3.39274383894714e-05, + "loss": 1.7706, + "step": 20072 + }, + { + "epoch": 6.161141804788214, + "grad_norm": 0.21995361149311066, + "learning_rate": 3.3922731715861416e-05, + "loss": 1.7716, + "step": 20073 + }, + { + "epoch": 6.161448741559239, + "grad_norm": 0.22915497422218323, + "learning_rate": 3.391802520113645e-05, + "loss": 1.716, + "step": 20074 + }, + { + "epoch": 6.161755678330264, + "grad_norm": 0.24317315220832825, + "learning_rate": 3.3913318845342956e-05, + "loss": 1.7392, + "step": 20075 + }, + { + "epoch": 6.162062615101289, + "grad_norm": 0.20439307391643524, + "learning_rate": 3.390861264852749e-05, + "loss": 1.7076, + "step": 20076 + }, + { + "epoch": 6.162369551872314, + "grad_norm": 0.2197176069021225, + "learning_rate": 3.3903906610736534e-05, + "loss": 
1.7334, + "step": 20077 + }, + { + "epoch": 6.162676488643339, + "grad_norm": 0.21651993691921234, + "learning_rate": 3.389920073201662e-05, + "loss": 1.7651, + "step": 20078 + }, + { + "epoch": 6.162983425414365, + "grad_norm": 0.1999540627002716, + "learning_rate": 3.389449501241424e-05, + "loss": 1.7031, + "step": 20079 + }, + { + "epoch": 6.16329036218539, + "grad_norm": 0.21965044736862183, + "learning_rate": 3.38897894519759e-05, + "loss": 1.7243, + "step": 20080 + }, + { + "epoch": 6.163597298956415, + "grad_norm": 0.20127563178539276, + "learning_rate": 3.388508405074808e-05, + "loss": 1.693, + "step": 20081 + }, + { + "epoch": 6.16390423572744, + "grad_norm": 0.2143397182226181, + "learning_rate": 3.3880378808777336e-05, + "loss": 1.7304, + "step": 20082 + }, + { + "epoch": 6.164211172498465, + "grad_norm": 0.23116083443164825, + "learning_rate": 3.387567372611012e-05, + "loss": 1.7558, + "step": 20083 + }, + { + "epoch": 6.1645181092694905, + "grad_norm": 0.25513985753059387, + "learning_rate": 3.3870968802792946e-05, + "loss": 1.7169, + "step": 20084 + }, + { + "epoch": 6.164825046040516, + "grad_norm": 0.20549121499061584, + "learning_rate": 3.386626403887232e-05, + "loss": 1.7147, + "step": 20085 + }, + { + "epoch": 6.165131982811541, + "grad_norm": 0.2850625514984131, + "learning_rate": 3.386155943439473e-05, + "loss": 1.7865, + "step": 20086 + }, + { + "epoch": 6.165438919582566, + "grad_norm": 0.2689895033836365, + "learning_rate": 3.3856854989406675e-05, + "loss": 1.7576, + "step": 20087 + }, + { + "epoch": 6.165745856353591, + "grad_norm": 0.21677634119987488, + "learning_rate": 3.385215070395462e-05, + "loss": 1.7186, + "step": 20088 + }, + { + "epoch": 6.166052793124616, + "grad_norm": 0.19525155425071716, + "learning_rate": 3.384744657808509e-05, + "loss": 1.6713, + "step": 20089 + }, + { + "epoch": 6.166359729895642, + "grad_norm": 0.23097296059131622, + "learning_rate": 3.3842742611844555e-05, + "loss": 1.6975, + "step": 20090 + }, + { + 
"epoch": 6.166666666666667, + "grad_norm": 0.22210827469825745, + "learning_rate": 3.3838038805279516e-05, + "loss": 1.733, + "step": 20091 + }, + { + "epoch": 6.166973603437691, + "grad_norm": 0.3336607813835144, + "learning_rate": 3.383333515843643e-05, + "loss": 1.7441, + "step": 20092 + }, + { + "epoch": 6.167280540208717, + "grad_norm": 0.25274014472961426, + "learning_rate": 3.382863167136183e-05, + "loss": 1.7235, + "step": 20093 + }, + { + "epoch": 6.167587476979742, + "grad_norm": 0.3228790760040283, + "learning_rate": 3.3823928344102144e-05, + "loss": 1.8096, + "step": 20094 + }, + { + "epoch": 6.167894413750767, + "grad_norm": 0.34542208909988403, + "learning_rate": 3.381922517670389e-05, + "loss": 1.7431, + "step": 20095 + }, + { + "epoch": 6.168201350521793, + "grad_norm": 0.1921117901802063, + "learning_rate": 3.381452216921355e-05, + "loss": 1.787, + "step": 20096 + }, + { + "epoch": 6.168508287292818, + "grad_norm": 0.29019802808761597, + "learning_rate": 3.380981932167757e-05, + "loss": 1.7122, + "step": 20097 + }, + { + "epoch": 6.1688152240638425, + "grad_norm": 0.17999929189682007, + "learning_rate": 3.380511663414244e-05, + "loss": 1.7153, + "step": 20098 + }, + { + "epoch": 6.169122160834868, + "grad_norm": 0.2641841471195221, + "learning_rate": 3.380041410665466e-05, + "loss": 1.7317, + "step": 20099 + }, + { + "epoch": 6.169429097605893, + "grad_norm": 0.25492918491363525, + "learning_rate": 3.379571173926067e-05, + "loss": 1.6975, + "step": 20100 + }, + { + "epoch": 6.1697360343769185, + "grad_norm": 0.2554764151573181, + "learning_rate": 3.379100953200697e-05, + "loss": 1.7539, + "step": 20101 + }, + { + "epoch": 6.170042971147944, + "grad_norm": 0.2339072823524475, + "learning_rate": 3.378630748493999e-05, + "loss": 1.6871, + "step": 20102 + }, + { + "epoch": 6.170349907918968, + "grad_norm": 0.19663162529468536, + "learning_rate": 3.3781605598106236e-05, + "loss": 1.7419, + "step": 20103 + }, + { + "epoch": 6.170656844689994, + 
"grad_norm": 0.2479846328496933, + "learning_rate": 3.3776903871552166e-05, + "loss": 1.7849, + "step": 20104 + }, + { + "epoch": 6.170963781461019, + "grad_norm": 0.18630735576152802, + "learning_rate": 3.377220230532423e-05, + "loss": 1.7412, + "step": 20105 + }, + { + "epoch": 6.171270718232044, + "grad_norm": 0.2211095094680786, + "learning_rate": 3.376750089946892e-05, + "loss": 1.7445, + "step": 20106 + }, + { + "epoch": 6.17157765500307, + "grad_norm": 0.20783299207687378, + "learning_rate": 3.3762799654032653e-05, + "loss": 1.7346, + "step": 20107 + }, + { + "epoch": 6.171884591774095, + "grad_norm": 0.18022862076759338, + "learning_rate": 3.3758098569061934e-05, + "loss": 1.7083, + "step": 20108 + }, + { + "epoch": 6.172191528545119, + "grad_norm": 0.23707088828086853, + "learning_rate": 3.375339764460319e-05, + "loss": 1.8542, + "step": 20109 + }, + { + "epoch": 6.172498465316145, + "grad_norm": 0.2289234846830368, + "learning_rate": 3.3748696880702913e-05, + "loss": 1.7564, + "step": 20110 + }, + { + "epoch": 6.17280540208717, + "grad_norm": 0.28396767377853394, + "learning_rate": 3.374399627740752e-05, + "loss": 1.7349, + "step": 20111 + }, + { + "epoch": 6.173112338858195, + "grad_norm": 0.20154817402362823, + "learning_rate": 3.373929583476351e-05, + "loss": 1.7356, + "step": 20112 + }, + { + "epoch": 6.173419275629221, + "grad_norm": 0.22590605914592743, + "learning_rate": 3.373459555281728e-05, + "loss": 1.7291, + "step": 20113 + }, + { + "epoch": 6.173726212400245, + "grad_norm": 0.2145034223794937, + "learning_rate": 3.372989543161532e-05, + "loss": 1.7544, + "step": 20114 + }, + { + "epoch": 6.1740331491712706, + "grad_norm": 0.26797109842300415, + "learning_rate": 3.372519547120407e-05, + "loss": 1.743, + "step": 20115 + }, + { + "epoch": 6.174340085942296, + "grad_norm": 0.2795363664627075, + "learning_rate": 3.372049567162999e-05, + "loss": 1.7278, + "step": 20116 + }, + { + "epoch": 6.174647022713321, + "grad_norm": 0.21436716616153717, + 
"learning_rate": 3.3715796032939494e-05, + "loss": 1.7306, + "step": 20117 + }, + { + "epoch": 6.1749539594843466, + "grad_norm": 0.2593919336795807, + "learning_rate": 3.3711096555179064e-05, + "loss": 1.7323, + "step": 20118 + }, + { + "epoch": 6.175260896255371, + "grad_norm": 0.19639115035533905, + "learning_rate": 3.3706397238395124e-05, + "loss": 1.7444, + "step": 20119 + }, + { + "epoch": 6.175567833026396, + "grad_norm": 0.23408278822898865, + "learning_rate": 3.370169808263409e-05, + "loss": 1.7461, + "step": 20120 + }, + { + "epoch": 6.175874769797422, + "grad_norm": 0.21200022101402283, + "learning_rate": 3.369699908794246e-05, + "loss": 1.7588, + "step": 20121 + }, + { + "epoch": 6.176181706568447, + "grad_norm": 0.17609953880310059, + "learning_rate": 3.369230025436662e-05, + "loss": 1.6608, + "step": 20122 + }, + { + "epoch": 6.176488643339472, + "grad_norm": 0.19895964860916138, + "learning_rate": 3.3687601581953046e-05, + "loss": 1.729, + "step": 20123 + }, + { + "epoch": 6.176795580110497, + "grad_norm": 0.22833310067653656, + "learning_rate": 3.368290307074814e-05, + "loss": 1.7148, + "step": 20124 + }, + { + "epoch": 6.177102516881522, + "grad_norm": 0.1847219169139862, + "learning_rate": 3.367820472079835e-05, + "loss": 1.6894, + "step": 20125 + }, + { + "epoch": 6.1774094536525475, + "grad_norm": 0.20269884169101715, + "learning_rate": 3.36735065321501e-05, + "loss": 1.794, + "step": 20126 + }, + { + "epoch": 6.177716390423573, + "grad_norm": 0.19277122616767883, + "learning_rate": 3.3668808504849845e-05, + "loss": 1.6936, + "step": 20127 + }, + { + "epoch": 6.178023327194598, + "grad_norm": 0.23804394900798798, + "learning_rate": 3.3664110638943985e-05, + "loss": 1.746, + "step": 20128 + }, + { + "epoch": 6.1783302639656235, + "grad_norm": 0.20946018397808075, + "learning_rate": 3.365941293447897e-05, + "loss": 1.6952, + "step": 20129 + }, + { + "epoch": 6.178637200736648, + "grad_norm": 0.21680596470832825, + "learning_rate": 
3.36547153915012e-05, + "loss": 1.7709, + "step": 20130 + }, + { + "epoch": 6.178944137507673, + "grad_norm": 0.22549709677696228, + "learning_rate": 3.365001801005712e-05, + "loss": 1.6814, + "step": 20131 + }, + { + "epoch": 6.179251074278699, + "grad_norm": 0.20660072565078735, + "learning_rate": 3.3645320790193136e-05, + "loss": 1.6992, + "step": 20132 + }, + { + "epoch": 6.179558011049724, + "grad_norm": 0.23697195947170258, + "learning_rate": 3.36406237319557e-05, + "loss": 1.7325, + "step": 20133 + }, + { + "epoch": 6.179864947820749, + "grad_norm": 0.20847748219966888, + "learning_rate": 3.363592683539118e-05, + "loss": 1.7066, + "step": 20134 + }, + { + "epoch": 6.180171884591774, + "grad_norm": 0.24317312240600586, + "learning_rate": 3.363123010054605e-05, + "loss": 1.7259, + "step": 20135 + }, + { + "epoch": 6.180478821362799, + "grad_norm": 0.22137925028800964, + "learning_rate": 3.3626533527466686e-05, + "loss": 1.7492, + "step": 20136 + }, + { + "epoch": 6.180785758133824, + "grad_norm": 0.23857460916042328, + "learning_rate": 3.362183711619951e-05, + "loss": 1.6671, + "step": 20137 + }, + { + "epoch": 6.18109269490485, + "grad_norm": 0.20017468929290771, + "learning_rate": 3.361714086679095e-05, + "loss": 1.7151, + "step": 20138 + }, + { + "epoch": 6.181399631675875, + "grad_norm": 0.21566617488861084, + "learning_rate": 3.361244477928739e-05, + "loss": 1.7659, + "step": 20139 + }, + { + "epoch": 6.1817065684469, + "grad_norm": 0.21695555746555328, + "learning_rate": 3.360774885373528e-05, + "loss": 1.7463, + "step": 20140 + }, + { + "epoch": 6.182013505217925, + "grad_norm": 0.19326116144657135, + "learning_rate": 3.360305309018098e-05, + "loss": 1.7182, + "step": 20141 + }, + { + "epoch": 6.18232044198895, + "grad_norm": 0.2135429084300995, + "learning_rate": 3.359835748867093e-05, + "loss": 1.8001, + "step": 20142 + }, + { + "epoch": 6.1826273787599755, + "grad_norm": 0.20097343623638153, + "learning_rate": 3.359366204925151e-05, + "loss": 1.7442, 
+ "step": 20143 + }, + { + "epoch": 6.182934315531001, + "grad_norm": 0.212847501039505, + "learning_rate": 3.358896677196916e-05, + "loss": 1.7418, + "step": 20144 + }, + { + "epoch": 6.183241252302026, + "grad_norm": 0.18414677679538727, + "learning_rate": 3.358427165687024e-05, + "loss": 1.6813, + "step": 20145 + }, + { + "epoch": 6.183548189073051, + "grad_norm": 0.23170427978038788, + "learning_rate": 3.357957670400119e-05, + "loss": 1.7722, + "step": 20146 + }, + { + "epoch": 6.183855125844076, + "grad_norm": 0.28952550888061523, + "learning_rate": 3.357488191340837e-05, + "loss": 1.7785, + "step": 20147 + }, + { + "epoch": 6.184162062615101, + "grad_norm": 0.2126605361700058, + "learning_rate": 3.35701872851382e-05, + "loss": 1.7064, + "step": 20148 + }, + { + "epoch": 6.184468999386127, + "grad_norm": 0.2376919537782669, + "learning_rate": 3.356549281923706e-05, + "loss": 1.7322, + "step": 20149 + }, + { + "epoch": 6.184775936157152, + "grad_norm": 0.24168729782104492, + "learning_rate": 3.3560798515751375e-05, + "loss": 1.7296, + "step": 20150 + }, + { + "epoch": 6.185082872928176, + "grad_norm": 0.19746467471122742, + "learning_rate": 3.355610437472749e-05, + "loss": 1.7816, + "step": 20151 + }, + { + "epoch": 6.185389809699202, + "grad_norm": 0.2399774193763733, + "learning_rate": 3.3551410396211844e-05, + "loss": 1.7309, + "step": 20152 + }, + { + "epoch": 6.185696746470227, + "grad_norm": 0.20560777187347412, + "learning_rate": 3.3546716580250785e-05, + "loss": 1.7134, + "step": 20153 + }, + { + "epoch": 6.186003683241252, + "grad_norm": 0.22640523314476013, + "learning_rate": 3.354202292689072e-05, + "loss": 1.7572, + "step": 20154 + }, + { + "epoch": 6.186310620012278, + "grad_norm": 0.20796974003314972, + "learning_rate": 3.353732943617803e-05, + "loss": 1.6897, + "step": 20155 + }, + { + "epoch": 6.186617556783303, + "grad_norm": 0.19902797043323517, + "learning_rate": 3.35326361081591e-05, + "loss": 1.6836, + "step": 20156 + }, + { + "epoch": 
6.1869244935543275, + "grad_norm": 0.30999818444252014, + "learning_rate": 3.352794294288032e-05, + "loss": 1.7704, + "step": 20157 + }, + { + "epoch": 6.187231430325353, + "grad_norm": 0.20634675025939941, + "learning_rate": 3.3523249940388045e-05, + "loss": 1.7599, + "step": 20158 + }, + { + "epoch": 6.187538367096378, + "grad_norm": 0.25650453567504883, + "learning_rate": 3.3518557100728674e-05, + "loss": 1.7441, + "step": 20159 + }, + { + "epoch": 6.1878453038674035, + "grad_norm": 0.2400079369544983, + "learning_rate": 3.351386442394858e-05, + "loss": 1.6836, + "step": 20160 + }, + { + "epoch": 6.188152240638429, + "grad_norm": 0.23734217882156372, + "learning_rate": 3.350917191009416e-05, + "loss": 1.7, + "step": 20161 + }, + { + "epoch": 6.188459177409453, + "grad_norm": 0.29579323530197144, + "learning_rate": 3.3504479559211755e-05, + "loss": 1.71, + "step": 20162 + }, + { + "epoch": 6.188766114180479, + "grad_norm": 0.18999184668064117, + "learning_rate": 3.349978737134776e-05, + "loss": 1.7396, + "step": 20163 + }, + { + "epoch": 6.189073050951504, + "grad_norm": 0.26760223507881165, + "learning_rate": 3.3495095346548525e-05, + "loss": 1.7846, + "step": 20164 + }, + { + "epoch": 6.189379987722529, + "grad_norm": 0.18416397273540497, + "learning_rate": 3.349040348486044e-05, + "loss": 1.6911, + "step": 20165 + }, + { + "epoch": 6.189686924493555, + "grad_norm": 0.23761679232120514, + "learning_rate": 3.348571178632986e-05, + "loss": 1.6776, + "step": 20166 + }, + { + "epoch": 6.189993861264579, + "grad_norm": 0.2056473195552826, + "learning_rate": 3.348102025100316e-05, + "loss": 1.697, + "step": 20167 + }, + { + "epoch": 6.190300798035604, + "grad_norm": 0.23916250467300415, + "learning_rate": 3.3476328878926685e-05, + "loss": 1.7943, + "step": 20168 + }, + { + "epoch": 6.19060773480663, + "grad_norm": 0.2205415964126587, + "learning_rate": 3.347163767014684e-05, + "loss": 1.8037, + "step": 20169 + }, + { + "epoch": 6.190914671577655, + "grad_norm": 
0.28907346725463867, + "learning_rate": 3.346694662470995e-05, + "loss": 1.6875, + "step": 20170 + }, + { + "epoch": 6.19122160834868, + "grad_norm": 0.2382480502128601, + "learning_rate": 3.3462255742662364e-05, + "loss": 1.7116, + "step": 20171 + }, + { + "epoch": 6.191528545119706, + "grad_norm": 0.25309205055236816, + "learning_rate": 3.3457565024050485e-05, + "loss": 1.7584, + "step": 20172 + }, + { + "epoch": 6.19183548189073, + "grad_norm": 0.3959091901779175, + "learning_rate": 3.3452874468920626e-05, + "loss": 1.7054, + "step": 20173 + }, + { + "epoch": 6.1921424186617555, + "grad_norm": 0.22697016596794128, + "learning_rate": 3.344818407731918e-05, + "loss": 1.7373, + "step": 20174 + }, + { + "epoch": 6.192449355432781, + "grad_norm": 0.298178493976593, + "learning_rate": 3.3443493849292465e-05, + "loss": 1.7192, + "step": 20175 + }, + { + "epoch": 6.192756292203806, + "grad_norm": 0.2742854058742523, + "learning_rate": 3.343880378488685e-05, + "loss": 1.7538, + "step": 20176 + }, + { + "epoch": 6.1930632289748315, + "grad_norm": 0.23367546498775482, + "learning_rate": 3.343411388414867e-05, + "loss": 1.694, + "step": 20177 + }, + { + "epoch": 6.193370165745856, + "grad_norm": 0.2932305932044983, + "learning_rate": 3.342942414712431e-05, + "loss": 1.7291, + "step": 20178 + }, + { + "epoch": 6.193677102516881, + "grad_norm": 0.24306413531303406, + "learning_rate": 3.342473457386007e-05, + "loss": 1.6959, + "step": 20179 + }, + { + "epoch": 6.193984039287907, + "grad_norm": 0.30828577280044556, + "learning_rate": 3.3420045164402344e-05, + "loss": 1.6848, + "step": 20180 + }, + { + "epoch": 6.194290976058932, + "grad_norm": 0.18766994774341583, + "learning_rate": 3.341535591879743e-05, + "loss": 1.7261, + "step": 20181 + }, + { + "epoch": 6.194597912829957, + "grad_norm": 0.300778329372406, + "learning_rate": 3.3410666837091696e-05, + "loss": 1.7539, + "step": 20182 + }, + { + "epoch": 6.194904849600983, + "grad_norm": 0.20148977637290955, + "learning_rate": 
3.340597791933147e-05, + "loss": 1.7496, + "step": 20183 + }, + { + "epoch": 6.195211786372007, + "grad_norm": 0.2746329605579376, + "learning_rate": 3.340128916556311e-05, + "loss": 1.6458, + "step": 20184 + }, + { + "epoch": 6.195518723143032, + "grad_norm": 0.2715265452861786, + "learning_rate": 3.339660057583292e-05, + "loss": 1.7799, + "step": 20185 + }, + { + "epoch": 6.195825659914058, + "grad_norm": 0.2145555317401886, + "learning_rate": 3.339191215018728e-05, + "loss": 1.6854, + "step": 20186 + }, + { + "epoch": 6.196132596685083, + "grad_norm": 0.3018960654735565, + "learning_rate": 3.338722388867248e-05, + "loss": 1.7569, + "step": 20187 + }, + { + "epoch": 6.196439533456108, + "grad_norm": 0.24876931309700012, + "learning_rate": 3.338253579133487e-05, + "loss": 1.7434, + "step": 20188 + }, + { + "epoch": 6.196746470227133, + "grad_norm": 0.3609273433685303, + "learning_rate": 3.337784785822079e-05, + "loss": 1.737, + "step": 20189 + }, + { + "epoch": 6.197053406998158, + "grad_norm": 0.21586830914020538, + "learning_rate": 3.337316008937655e-05, + "loss": 1.7553, + "step": 20190 + }, + { + "epoch": 6.1973603437691835, + "grad_norm": 0.23542988300323486, + "learning_rate": 3.3368472484848504e-05, + "loss": 1.7174, + "step": 20191 + }, + { + "epoch": 6.197667280540209, + "grad_norm": 0.19861294329166412, + "learning_rate": 3.336378504468294e-05, + "loss": 1.7268, + "step": 20192 + }, + { + "epoch": 6.197974217311234, + "grad_norm": 0.26865682005882263, + "learning_rate": 3.335909776892622e-05, + "loss": 1.7656, + "step": 20193 + }, + { + "epoch": 6.198281154082259, + "grad_norm": 0.343078076839447, + "learning_rate": 3.3354410657624624e-05, + "loss": 1.734, + "step": 20194 + }, + { + "epoch": 6.198588090853284, + "grad_norm": 0.21613667905330658, + "learning_rate": 3.334972371082453e-05, + "loss": 1.7777, + "step": 20195 + }, + { + "epoch": 6.198895027624309, + "grad_norm": 0.22268854081630707, + "learning_rate": 3.3345036928572207e-05, + "loss": 1.667, + 
"step": 20196 + }, + { + "epoch": 6.199201964395335, + "grad_norm": 0.22870087623596191, + "learning_rate": 3.3340350310914e-05, + "loss": 1.7532, + "step": 20197 + }, + { + "epoch": 6.19950890116636, + "grad_norm": 0.1969831883907318, + "learning_rate": 3.3335663857896205e-05, + "loss": 1.7821, + "step": 20198 + }, + { + "epoch": 6.199815837937384, + "grad_norm": 0.20414133369922638, + "learning_rate": 3.3330977569565154e-05, + "loss": 1.7449, + "step": 20199 + }, + { + "epoch": 6.20012277470841, + "grad_norm": 0.21947748959064484, + "learning_rate": 3.332629144596714e-05, + "loss": 1.6888, + "step": 20200 + }, + { + "epoch": 6.200429711479435, + "grad_norm": 0.20943035185337067, + "learning_rate": 3.332160548714851e-05, + "loss": 1.7278, + "step": 20201 + }, + { + "epoch": 6.2007366482504604, + "grad_norm": 0.22410117089748383, + "learning_rate": 3.331691969315553e-05, + "loss": 1.721, + "step": 20202 + }, + { + "epoch": 6.201043585021486, + "grad_norm": 0.21422281861305237, + "learning_rate": 3.3312234064034555e-05, + "loss": 1.7199, + "step": 20203 + }, + { + "epoch": 6.201350521792511, + "grad_norm": 0.21021418273448944, + "learning_rate": 3.330754859983184e-05, + "loss": 1.7972, + "step": 20204 + }, + { + "epoch": 6.201657458563536, + "grad_norm": 0.21155185997486115, + "learning_rate": 3.330286330059371e-05, + "loss": 1.7463, + "step": 20205 + }, + { + "epoch": 6.201964395334561, + "grad_norm": 0.20241162180900574, + "learning_rate": 3.329817816636649e-05, + "loss": 1.7804, + "step": 20206 + }, + { + "epoch": 6.202271332105586, + "grad_norm": 0.19882376492023468, + "learning_rate": 3.329349319719644e-05, + "loss": 1.7564, + "step": 20207 + }, + { + "epoch": 6.202578268876612, + "grad_norm": 0.20528686046600342, + "learning_rate": 3.328880839312991e-05, + "loss": 1.751, + "step": 20208 + }, + { + "epoch": 6.202885205647637, + "grad_norm": 0.2708488404750824, + "learning_rate": 3.328412375421315e-05, + "loss": 1.8008, + "step": 20209 + }, + { + "epoch": 
6.203192142418661, + "grad_norm": 0.1986229121685028, + "learning_rate": 3.3279439280492486e-05, + "loss": 1.6833, + "step": 20210 + }, + { + "epoch": 6.203499079189687, + "grad_norm": 0.2700355350971222, + "learning_rate": 3.3274754972014186e-05, + "loss": 1.8071, + "step": 20211 + }, + { + "epoch": 6.203806015960712, + "grad_norm": 0.23060421645641327, + "learning_rate": 3.327007082882458e-05, + "loss": 1.6856, + "step": 20212 + }, + { + "epoch": 6.204112952731737, + "grad_norm": 0.20798510313034058, + "learning_rate": 3.3265386850969926e-05, + "loss": 1.7421, + "step": 20213 + }, + { + "epoch": 6.204419889502763, + "grad_norm": 0.21828265488147736, + "learning_rate": 3.3260703038496556e-05, + "loss": 1.7212, + "step": 20214 + }, + { + "epoch": 6.204726826273788, + "grad_norm": 0.1965378224849701, + "learning_rate": 3.325601939145069e-05, + "loss": 1.6987, + "step": 20215 + }, + { + "epoch": 6.2050337630448125, + "grad_norm": 0.23897121846675873, + "learning_rate": 3.325133590987868e-05, + "loss": 1.7501, + "step": 20216 + }, + { + "epoch": 6.205340699815838, + "grad_norm": 0.18647781014442444, + "learning_rate": 3.324665259382676e-05, + "loss": 1.688, + "step": 20217 + }, + { + "epoch": 6.205647636586863, + "grad_norm": 0.19906121492385864, + "learning_rate": 3.324196944334127e-05, + "loss": 1.749, + "step": 20218 + }, + { + "epoch": 6.2059545733578885, + "grad_norm": 0.2061154991388321, + "learning_rate": 3.3237286458468444e-05, + "loss": 1.757, + "step": 20219 + }, + { + "epoch": 6.206261510128914, + "grad_norm": 0.19410182535648346, + "learning_rate": 3.323260363925459e-05, + "loss": 1.6826, + "step": 20220 + }, + { + "epoch": 6.206568446899938, + "grad_norm": 0.2017979919910431, + "learning_rate": 3.322792098574597e-05, + "loss": 1.7568, + "step": 20221 + }, + { + "epoch": 6.206875383670964, + "grad_norm": 0.19491736590862274, + "learning_rate": 3.322323849798885e-05, + "loss": 1.7082, + "step": 20222 + }, + { + "epoch": 6.207182320441989, + "grad_norm": 
0.19826333224773407, + "learning_rate": 3.321855617602954e-05, + "loss": 1.7654, + "step": 20223 + }, + { + "epoch": 6.207489257213014, + "grad_norm": 0.18185383081436157, + "learning_rate": 3.321387401991428e-05, + "loss": 1.6826, + "step": 20224 + }, + { + "epoch": 6.20779619398404, + "grad_norm": 0.22402678430080414, + "learning_rate": 3.320919202968937e-05, + "loss": 1.795, + "step": 20225 + }, + { + "epoch": 6.208103130755064, + "grad_norm": 0.201541468501091, + "learning_rate": 3.320451020540105e-05, + "loss": 1.6838, + "step": 20226 + }, + { + "epoch": 6.208410067526089, + "grad_norm": 0.25479504466056824, + "learning_rate": 3.3199828547095616e-05, + "loss": 1.7881, + "step": 20227 + }, + { + "epoch": 6.208717004297115, + "grad_norm": 0.2057993859052658, + "learning_rate": 3.31951470548193e-05, + "loss": 1.737, + "step": 20228 + }, + { + "epoch": 6.20902394106814, + "grad_norm": 0.183469757437706, + "learning_rate": 3.319046572861842e-05, + "loss": 1.6989, + "step": 20229 + }, + { + "epoch": 6.209330877839165, + "grad_norm": 0.21723738312721252, + "learning_rate": 3.318578456853919e-05, + "loss": 1.7537, + "step": 20230 + }, + { + "epoch": 6.209637814610191, + "grad_norm": 0.21919457614421844, + "learning_rate": 3.318110357462791e-05, + "loss": 1.7444, + "step": 20231 + }, + { + "epoch": 6.209944751381215, + "grad_norm": 0.17009909451007843, + "learning_rate": 3.317642274693081e-05, + "loss": 1.6885, + "step": 20232 + }, + { + "epoch": 6.2102516881522405, + "grad_norm": 0.19625195860862732, + "learning_rate": 3.317174208549416e-05, + "loss": 1.7255, + "step": 20233 + }, + { + "epoch": 6.210558624923266, + "grad_norm": 0.2131364941596985, + "learning_rate": 3.316706159036422e-05, + "loss": 1.7047, + "step": 20234 + }, + { + "epoch": 6.210865561694291, + "grad_norm": 0.18454425036907196, + "learning_rate": 3.316238126158725e-05, + "loss": 1.7536, + "step": 20235 + }, + { + "epoch": 6.2111724984653165, + "grad_norm": 0.2124820202589035, + "learning_rate": 
3.3157701099209485e-05, + "loss": 1.7456, + "step": 20236 + }, + { + "epoch": 6.211479435236341, + "grad_norm": 0.1929594725370407, + "learning_rate": 3.3153021103277206e-05, + "loss": 1.7118, + "step": 20237 + }, + { + "epoch": 6.211786372007366, + "grad_norm": 0.19876480102539062, + "learning_rate": 3.314834127383664e-05, + "loss": 1.6855, + "step": 20238 + }, + { + "epoch": 6.212093308778392, + "grad_norm": 0.18902665376663208, + "learning_rate": 3.314366161093403e-05, + "loss": 1.7052, + "step": 20239 + }, + { + "epoch": 6.212400245549417, + "grad_norm": 0.1859758198261261, + "learning_rate": 3.313898211461566e-05, + "loss": 1.7277, + "step": 20240 + }, + { + "epoch": 6.212707182320442, + "grad_norm": 0.2160472422838211, + "learning_rate": 3.313430278492773e-05, + "loss": 1.6787, + "step": 20241 + }, + { + "epoch": 6.213014119091467, + "grad_norm": 0.24482262134552002, + "learning_rate": 3.312962362191652e-05, + "loss": 1.7439, + "step": 20242 + }, + { + "epoch": 6.213321055862492, + "grad_norm": 0.2343531847000122, + "learning_rate": 3.312494462562824e-05, + "loss": 1.7981, + "step": 20243 + }, + { + "epoch": 6.213627992633517, + "grad_norm": 0.2385960817337036, + "learning_rate": 3.3120265796109163e-05, + "loss": 1.7144, + "step": 20244 + }, + { + "epoch": 6.213934929404543, + "grad_norm": 0.21878042817115784, + "learning_rate": 3.3115587133405503e-05, + "loss": 1.7057, + "step": 20245 + }, + { + "epoch": 6.214241866175568, + "grad_norm": 0.23426075279712677, + "learning_rate": 3.311090863756351e-05, + "loss": 1.7372, + "step": 20246 + }, + { + "epoch": 6.214548802946593, + "grad_norm": 0.2369524985551834, + "learning_rate": 3.310623030862942e-05, + "loss": 1.7502, + "step": 20247 + }, + { + "epoch": 6.214855739717618, + "grad_norm": 0.31635788083076477, + "learning_rate": 3.3101552146649474e-05, + "loss": 1.7616, + "step": 20248 + }, + { + "epoch": 6.215162676488643, + "grad_norm": 0.2312999814748764, + "learning_rate": 3.309687415166986e-05, + "loss": 
1.6991, + "step": 20249 + }, + { + "epoch": 6.2154696132596685, + "grad_norm": 0.23423358798027039, + "learning_rate": 3.309219632373688e-05, + "loss": 1.7737, + "step": 20250 + }, + { + "epoch": 6.215776550030694, + "grad_norm": 0.28763437271118164, + "learning_rate": 3.308751866289671e-05, + "loss": 1.7822, + "step": 20251 + }, + { + "epoch": 6.216083486801719, + "grad_norm": 0.20754525065422058, + "learning_rate": 3.30828411691956e-05, + "loss": 1.7427, + "step": 20252 + }, + { + "epoch": 6.216390423572744, + "grad_norm": 0.31858858466148376, + "learning_rate": 3.307816384267975e-05, + "loss": 1.7384, + "step": 20253 + }, + { + "epoch": 6.216697360343769, + "grad_norm": 0.21968062222003937, + "learning_rate": 3.307348668339543e-05, + "loss": 1.6896, + "step": 20254 + }, + { + "epoch": 6.217004297114794, + "grad_norm": 0.21643556654453278, + "learning_rate": 3.306880969138882e-05, + "loss": 1.7353, + "step": 20255 + }, + { + "epoch": 6.21731123388582, + "grad_norm": 0.22141097486019135, + "learning_rate": 3.306413286670616e-05, + "loss": 1.7254, + "step": 20256 + }, + { + "epoch": 6.217618170656845, + "grad_norm": 0.17666983604431152, + "learning_rate": 3.305945620939367e-05, + "loss": 1.7198, + "step": 20257 + }, + { + "epoch": 6.21792510742787, + "grad_norm": 0.25182467699050903, + "learning_rate": 3.3054779719497544e-05, + "loss": 1.7562, + "step": 20258 + }, + { + "epoch": 6.218232044198895, + "grad_norm": 0.23481281101703644, + "learning_rate": 3.305010339706404e-05, + "loss": 1.8293, + "step": 20259 + }, + { + "epoch": 6.21853898096992, + "grad_norm": 0.23981143534183502, + "learning_rate": 3.304542724213933e-05, + "loss": 1.7619, + "step": 20260 + }, + { + "epoch": 6.218845917740945, + "grad_norm": 0.2388351708650589, + "learning_rate": 3.3040751254769665e-05, + "loss": 1.7471, + "step": 20261 + }, + { + "epoch": 6.219152854511971, + "grad_norm": 0.2039698362350464, + "learning_rate": 3.3036075435001216e-05, + "loss": 1.6893, + "step": 20262 + }, + { + 
"epoch": 6.219459791282996, + "grad_norm": 0.218357652425766, + "learning_rate": 3.3031399782880224e-05, + "loss": 1.753, + "step": 20263 + }, + { + "epoch": 6.2197667280540205, + "grad_norm": 0.25466734170913696, + "learning_rate": 3.302672429845288e-05, + "loss": 1.7496, + "step": 20264 + }, + { + "epoch": 6.220073664825046, + "grad_norm": 0.1853330284357071, + "learning_rate": 3.302204898176541e-05, + "loss": 1.7779, + "step": 20265 + }, + { + "epoch": 6.220380601596071, + "grad_norm": 0.24044091999530792, + "learning_rate": 3.3017373832863976e-05, + "loss": 1.8226, + "step": 20266 + }, + { + "epoch": 6.2206875383670965, + "grad_norm": 0.2209070324897766, + "learning_rate": 3.3012698851794835e-05, + "loss": 1.7069, + "step": 20267 + }, + { + "epoch": 6.220994475138122, + "grad_norm": 0.2775282561779022, + "learning_rate": 3.3008024038604135e-05, + "loss": 1.7048, + "step": 20268 + }, + { + "epoch": 6.221301411909146, + "grad_norm": 0.22873717546463013, + "learning_rate": 3.3003349393338116e-05, + "loss": 1.7956, + "step": 20269 + }, + { + "epoch": 6.221608348680172, + "grad_norm": 0.27883464097976685, + "learning_rate": 3.2998674916042946e-05, + "loss": 1.6955, + "step": 20270 + }, + { + "epoch": 6.221915285451197, + "grad_norm": 0.2383071482181549, + "learning_rate": 3.2994000606764865e-05, + "loss": 1.7645, + "step": 20271 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.26280200481414795, + "learning_rate": 3.298932646555003e-05, + "loss": 1.7854, + "step": 20272 + }, + { + "epoch": 6.222529158993248, + "grad_norm": 0.2387673407793045, + "learning_rate": 3.2984652492444625e-05, + "loss": 1.679, + "step": 20273 + }, + { + "epoch": 6.222836095764273, + "grad_norm": 0.2136983871459961, + "learning_rate": 3.297997868749486e-05, + "loss": 1.7313, + "step": 20274 + }, + { + "epoch": 6.223143032535297, + "grad_norm": 0.2629627585411072, + "learning_rate": 3.297530505074692e-05, + "loss": 1.7452, + "step": 20275 + }, + { + "epoch": 6.223449969306323, + 
"grad_norm": 0.22018705308437347, + "learning_rate": 3.2970631582247e-05, + "loss": 1.7368, + "step": 20276 + }, + { + "epoch": 6.223756906077348, + "grad_norm": 0.19277356564998627, + "learning_rate": 3.296595828204128e-05, + "loss": 1.7084, + "step": 20277 + }, + { + "epoch": 6.224063842848373, + "grad_norm": 0.18806682527065277, + "learning_rate": 3.2961285150175944e-05, + "loss": 1.6576, + "step": 20278 + }, + { + "epoch": 6.224370779619399, + "grad_norm": 0.2019709348678589, + "learning_rate": 3.295661218669717e-05, + "loss": 1.7594, + "step": 20279 + }, + { + "epoch": 6.224677716390423, + "grad_norm": 0.19662119448184967, + "learning_rate": 3.295193939165114e-05, + "loss": 1.6946, + "step": 20280 + }, + { + "epoch": 6.2249846531614486, + "grad_norm": 0.1880662590265274, + "learning_rate": 3.294726676508404e-05, + "loss": 1.7232, + "step": 20281 + }, + { + "epoch": 6.225291589932474, + "grad_norm": 0.23242273926734924, + "learning_rate": 3.294259430704206e-05, + "loss": 1.7331, + "step": 20282 + }, + { + "epoch": 6.225598526703499, + "grad_norm": 0.19915202260017395, + "learning_rate": 3.293792201757134e-05, + "loss": 1.7844, + "step": 20283 + }, + { + "epoch": 6.225905463474525, + "grad_norm": 0.1845373958349228, + "learning_rate": 3.2933249896718097e-05, + "loss": 1.6803, + "step": 20284 + }, + { + "epoch": 6.226212400245549, + "grad_norm": 0.19340910017490387, + "learning_rate": 3.292857794452846e-05, + "loss": 1.6929, + "step": 20285 + }, + { + "epoch": 6.226519337016574, + "grad_norm": 0.21429216861724854, + "learning_rate": 3.292390616104863e-05, + "loss": 1.6833, + "step": 20286 + }, + { + "epoch": 6.2268262737876, + "grad_norm": 0.2267037034034729, + "learning_rate": 3.291923454632476e-05, + "loss": 1.7271, + "step": 20287 + }, + { + "epoch": 6.227133210558625, + "grad_norm": 0.23121988773345947, + "learning_rate": 3.2914563100403054e-05, + "loss": 1.8443, + "step": 20288 + }, + { + "epoch": 6.22744014732965, + "grad_norm": 0.20980899035930634, + 
"learning_rate": 3.290989182332964e-05, + "loss": 1.6907, + "step": 20289 + }, + { + "epoch": 6.227747084100676, + "grad_norm": 0.28162500262260437, + "learning_rate": 3.290522071515067e-05, + "loss": 1.7497, + "step": 20290 + }, + { + "epoch": 6.2280540208717, + "grad_norm": 0.2163640707731247, + "learning_rate": 3.290054977591234e-05, + "loss": 1.736, + "step": 20291 + }, + { + "epoch": 6.2283609576427255, + "grad_norm": 0.19144479930400848, + "learning_rate": 3.289587900566079e-05, + "loss": 1.7222, + "step": 20292 + }, + { + "epoch": 6.228667894413751, + "grad_norm": 0.24952897429466248, + "learning_rate": 3.2891208404442216e-05, + "loss": 1.7095, + "step": 20293 + }, + { + "epoch": 6.228974831184776, + "grad_norm": 0.19421981275081635, + "learning_rate": 3.288653797230272e-05, + "loss": 1.7231, + "step": 20294 + }, + { + "epoch": 6.2292817679558015, + "grad_norm": 0.22837944328784943, + "learning_rate": 3.288186770928851e-05, + "loss": 1.7404, + "step": 20295 + }, + { + "epoch": 6.229588704726826, + "grad_norm": 0.2292151004076004, + "learning_rate": 3.2877197615445685e-05, + "loss": 1.6999, + "step": 20296 + }, + { + "epoch": 6.229895641497851, + "grad_norm": 0.18376365303993225, + "learning_rate": 3.2872527690820456e-05, + "loss": 1.681, + "step": 20297 + }, + { + "epoch": 6.230202578268877, + "grad_norm": 0.21331918239593506, + "learning_rate": 3.286785793545893e-05, + "loss": 1.7362, + "step": 20298 + }, + { + "epoch": 6.230509515039902, + "grad_norm": 0.21247150003910065, + "learning_rate": 3.286318834940729e-05, + "loss": 1.7816, + "step": 20299 + }, + { + "epoch": 6.230816451810927, + "grad_norm": 0.19166043400764465, + "learning_rate": 3.285851893271165e-05, + "loss": 1.7209, + "step": 20300 + }, + { + "epoch": 6.231123388581952, + "grad_norm": 0.2139919251203537, + "learning_rate": 3.2853849685418195e-05, + "loss": 1.6946, + "step": 20301 + }, + { + "epoch": 6.231430325352977, + "grad_norm": 0.20296575129032135, + "learning_rate": 
3.284918060757303e-05, + "loss": 1.6829, + "step": 20302 + }, + { + "epoch": 6.231737262124002, + "grad_norm": 0.2465996891260147, + "learning_rate": 3.2844511699222314e-05, + "loss": 1.751, + "step": 20303 + }, + { + "epoch": 6.232044198895028, + "grad_norm": 0.23327109217643738, + "learning_rate": 3.283984296041219e-05, + "loss": 1.736, + "step": 20304 + }, + { + "epoch": 6.232351135666053, + "grad_norm": 0.24316997826099396, + "learning_rate": 3.2835174391188806e-05, + "loss": 1.7187, + "step": 20305 + }, + { + "epoch": 6.232658072437078, + "grad_norm": 0.25280308723449707, + "learning_rate": 3.2830505991598294e-05, + "loss": 1.7087, + "step": 20306 + }, + { + "epoch": 6.232965009208103, + "grad_norm": 0.19143202900886536, + "learning_rate": 3.282583776168676e-05, + "loss": 1.674, + "step": 20307 + }, + { + "epoch": 6.233271945979128, + "grad_norm": 0.2667979598045349, + "learning_rate": 3.282116970150038e-05, + "loss": 1.7978, + "step": 20308 + }, + { + "epoch": 6.2335788827501535, + "grad_norm": 0.18397411704063416, + "learning_rate": 3.281650181108526e-05, + "loss": 1.7669, + "step": 20309 + }, + { + "epoch": 6.233885819521179, + "grad_norm": 0.2842588722705841, + "learning_rate": 3.281183409048756e-05, + "loss": 1.8238, + "step": 20310 + }, + { + "epoch": 6.234192756292204, + "grad_norm": 0.20290467143058777, + "learning_rate": 3.280716653975336e-05, + "loss": 1.7317, + "step": 20311 + }, + { + "epoch": 6.234499693063229, + "grad_norm": 0.224524587392807, + "learning_rate": 3.280249915892885e-05, + "loss": 1.8166, + "step": 20312 + }, + { + "epoch": 6.234806629834254, + "grad_norm": 0.28204405307769775, + "learning_rate": 3.2797831948060096e-05, + "loss": 1.7435, + "step": 20313 + }, + { + "epoch": 6.235113566605279, + "grad_norm": 0.2101798951625824, + "learning_rate": 3.2793164907193264e-05, + "loss": 1.6747, + "step": 20314 + }, + { + "epoch": 6.235420503376305, + "grad_norm": 0.1961289346218109, + "learning_rate": 3.278849803637445e-05, + "loss": 1.7131, 
+ "step": 20315 + }, + { + "epoch": 6.23572744014733, + "grad_norm": 0.30541354417800903, + "learning_rate": 3.27838313356498e-05, + "loss": 1.8036, + "step": 20316 + }, + { + "epoch": 6.236034376918354, + "grad_norm": 0.21517200767993927, + "learning_rate": 3.277916480506541e-05, + "loss": 1.7684, + "step": 20317 + }, + { + "epoch": 6.23634131368938, + "grad_norm": 0.22871750593185425, + "learning_rate": 3.2774498444667426e-05, + "loss": 1.7545, + "step": 20318 + }, + { + "epoch": 6.236648250460405, + "grad_norm": 0.24596424400806427, + "learning_rate": 3.276983225450192e-05, + "loss": 1.6705, + "step": 20319 + }, + { + "epoch": 6.23695518723143, + "grad_norm": 0.19123119115829468, + "learning_rate": 3.2765166234615044e-05, + "loss": 1.7402, + "step": 20320 + }, + { + "epoch": 6.237262124002456, + "grad_norm": 0.25287121534347534, + "learning_rate": 3.276050038505288e-05, + "loss": 1.741, + "step": 20321 + }, + { + "epoch": 6.237569060773481, + "grad_norm": 0.19741536676883698, + "learning_rate": 3.275583470586158e-05, + "loss": 1.736, + "step": 20322 + }, + { + "epoch": 6.2378759975445055, + "grad_norm": 0.24529922008514404, + "learning_rate": 3.275116919708723e-05, + "loss": 1.6696, + "step": 20323 + }, + { + "epoch": 6.238182934315531, + "grad_norm": 0.25428420305252075, + "learning_rate": 3.274650385877591e-05, + "loss": 1.696, + "step": 20324 + }, + { + "epoch": 6.238489871086556, + "grad_norm": 0.19502994418144226, + "learning_rate": 3.274183869097377e-05, + "loss": 1.6976, + "step": 20325 + }, + { + "epoch": 6.2387968078575815, + "grad_norm": 0.23710335791110992, + "learning_rate": 3.273717369372688e-05, + "loss": 1.7395, + "step": 20326 + }, + { + "epoch": 6.239103744628607, + "grad_norm": 0.20904341340065002, + "learning_rate": 3.273250886708138e-05, + "loss": 1.7455, + "step": 20327 + }, + { + "epoch": 6.239410681399631, + "grad_norm": 0.2112383097410202, + "learning_rate": 3.272784421108332e-05, + "loss": 1.7401, + "step": 20328 + }, + { + "epoch": 
6.239717618170657, + "grad_norm": 0.2310914695262909, + "learning_rate": 3.272317972577886e-05, + "loss": 1.8049, + "step": 20329 + }, + { + "epoch": 6.240024554941682, + "grad_norm": 0.18222108483314514, + "learning_rate": 3.271851541121404e-05, + "loss": 1.7119, + "step": 20330 + }, + { + "epoch": 6.240331491712707, + "grad_norm": 0.18739092350006104, + "learning_rate": 3.2713851267434984e-05, + "loss": 1.744, + "step": 20331 + }, + { + "epoch": 6.240638428483733, + "grad_norm": 0.17722012102603912, + "learning_rate": 3.2709187294487775e-05, + "loss": 1.7054, + "step": 20332 + }, + { + "epoch": 6.240945365254758, + "grad_norm": 0.18650192022323608, + "learning_rate": 3.270452349241854e-05, + "loss": 1.7272, + "step": 20333 + }, + { + "epoch": 6.241252302025782, + "grad_norm": 0.2004886120557785, + "learning_rate": 3.269985986127331e-05, + "loss": 1.6777, + "step": 20334 + }, + { + "epoch": 6.241559238796808, + "grad_norm": 0.1855446845293045, + "learning_rate": 3.269519640109823e-05, + "loss": 1.6823, + "step": 20335 + }, + { + "epoch": 6.241866175567833, + "grad_norm": 0.1950632780790329, + "learning_rate": 3.269053311193934e-05, + "loss": 1.7052, + "step": 20336 + }, + { + "epoch": 6.242173112338858, + "grad_norm": 0.19386698305606842, + "learning_rate": 3.268586999384276e-05, + "loss": 1.7431, + "step": 20337 + }, + { + "epoch": 6.242480049109884, + "grad_norm": 0.2266446053981781, + "learning_rate": 3.268120704685454e-05, + "loss": 1.735, + "step": 20338 + }, + { + "epoch": 6.242786985880908, + "grad_norm": 0.24133828282356262, + "learning_rate": 3.2676544271020814e-05, + "loss": 1.7707, + "step": 20339 + }, + { + "epoch": 6.2430939226519335, + "grad_norm": 0.22397162020206451, + "learning_rate": 3.267188166638763e-05, + "loss": 1.6943, + "step": 20340 + }, + { + "epoch": 6.243400859422959, + "grad_norm": 0.1614205688238144, + "learning_rate": 3.266721923300104e-05, + "loss": 1.6801, + "step": 20341 + }, + { + "epoch": 6.243707796193984, + "grad_norm": 
0.22376522421836853, + "learning_rate": 3.2662556970907166e-05, + "loss": 1.6933, + "step": 20342 + }, + { + "epoch": 6.2440147329650095, + "grad_norm": 0.18614265322685242, + "learning_rate": 3.265789488015205e-05, + "loss": 1.7396, + "step": 20343 + }, + { + "epoch": 6.244321669736034, + "grad_norm": 0.2385358214378357, + "learning_rate": 3.265323296078181e-05, + "loss": 1.7782, + "step": 20344 + }, + { + "epoch": 6.244628606507059, + "grad_norm": 0.24316444993019104, + "learning_rate": 3.264857121284246e-05, + "loss": 1.7443, + "step": 20345 + }, + { + "epoch": 6.244935543278085, + "grad_norm": 0.184532031416893, + "learning_rate": 3.264390963638012e-05, + "loss": 1.7603, + "step": 20346 + }, + { + "epoch": 6.24524248004911, + "grad_norm": 0.2018461376428604, + "learning_rate": 3.2639248231440825e-05, + "loss": 1.7289, + "step": 20347 + }, + { + "epoch": 6.245549416820135, + "grad_norm": 0.23732338845729828, + "learning_rate": 3.263458699807066e-05, + "loss": 1.7924, + "step": 20348 + }, + { + "epoch": 6.245856353591161, + "grad_norm": 0.19645710289478302, + "learning_rate": 3.2629925936315674e-05, + "loss": 1.6855, + "step": 20349 + }, + { + "epoch": 6.246163290362185, + "grad_norm": 0.20730608701705933, + "learning_rate": 3.262526504622196e-05, + "loss": 1.7238, + "step": 20350 + }, + { + "epoch": 6.24647022713321, + "grad_norm": 0.21139587461948395, + "learning_rate": 3.2620604327835545e-05, + "loss": 1.7173, + "step": 20351 + }, + { + "epoch": 6.246777163904236, + "grad_norm": 0.22644877433776855, + "learning_rate": 3.261594378120252e-05, + "loss": 1.7976, + "step": 20352 + }, + { + "epoch": 6.247084100675261, + "grad_norm": 0.23719535768032074, + "learning_rate": 3.2611283406368906e-05, + "loss": 1.7549, + "step": 20353 + }, + { + "epoch": 6.247391037446286, + "grad_norm": 0.2046387791633606, + "learning_rate": 3.2606623203380807e-05, + "loss": 1.7343, + "step": 20354 + }, + { + "epoch": 6.247697974217311, + "grad_norm": 0.19325366616249084, + 
"learning_rate": 3.260196317228422e-05, + "loss": 1.7352, + "step": 20355 + }, + { + "epoch": 6.248004910988336, + "grad_norm": 0.2315458059310913, + "learning_rate": 3.259730331312526e-05, + "loss": 1.7838, + "step": 20356 + }, + { + "epoch": 6.2483118477593615, + "grad_norm": 0.24549536406993866, + "learning_rate": 3.2592643625949956e-05, + "loss": 1.7418, + "step": 20357 + }, + { + "epoch": 6.248618784530387, + "grad_norm": 0.2702246606349945, + "learning_rate": 3.258798411080432e-05, + "loss": 1.7651, + "step": 20358 + }, + { + "epoch": 6.248925721301412, + "grad_norm": 0.20515258610248566, + "learning_rate": 3.2583324767734444e-05, + "loss": 1.6866, + "step": 20359 + }, + { + "epoch": 6.249232658072437, + "grad_norm": 0.2696690261363983, + "learning_rate": 3.257866559678635e-05, + "loss": 1.7446, + "step": 20360 + }, + { + "epoch": 6.249539594843462, + "grad_norm": 0.19707174599170685, + "learning_rate": 3.2574006598006114e-05, + "loss": 1.6835, + "step": 20361 + }, + { + "epoch": 6.249846531614487, + "grad_norm": 0.23478952050209045, + "learning_rate": 3.256934777143974e-05, + "loss": 1.7344, + "step": 20362 + }, + { + "epoch": 6.250153468385513, + "grad_norm": 0.24214082956314087, + "learning_rate": 3.2564689117133306e-05, + "loss": 1.722, + "step": 20363 + }, + { + "epoch": 6.250460405156538, + "grad_norm": 0.18361221253871918, + "learning_rate": 3.256003063513281e-05, + "loss": 1.7336, + "step": 20364 + }, + { + "epoch": 6.250767341927563, + "grad_norm": 0.18548928201198578, + "learning_rate": 3.255537232548433e-05, + "loss": 1.6586, + "step": 20365 + }, + { + "epoch": 6.251074278698588, + "grad_norm": 0.2121812105178833, + "learning_rate": 3.2550714188233874e-05, + "loss": 1.7273, + "step": 20366 + }, + { + "epoch": 6.251381215469613, + "grad_norm": 0.2351878583431244, + "learning_rate": 3.25460562234275e-05, + "loss": 1.7101, + "step": 20367 + }, + { + "epoch": 6.2516881522406385, + "grad_norm": 0.20723144710063934, + "learning_rate": 
3.2541398431111216e-05, + "loss": 1.7042, + "step": 20368 + }, + { + "epoch": 6.251995089011664, + "grad_norm": 0.19093643128871918, + "learning_rate": 3.2536740811331084e-05, + "loss": 1.7585, + "step": 20369 + }, + { + "epoch": 6.252302025782689, + "grad_norm": 0.27191361784935, + "learning_rate": 3.2532083364133094e-05, + "loss": 1.7734, + "step": 20370 + }, + { + "epoch": 6.252608962553714, + "grad_norm": 0.21019349992275238, + "learning_rate": 3.2527426089563306e-05, + "loss": 1.7015, + "step": 20371 + }, + { + "epoch": 6.252915899324739, + "grad_norm": 0.2300454080104828, + "learning_rate": 3.2522768987667744e-05, + "loss": 1.7311, + "step": 20372 + }, + { + "epoch": 6.253222836095764, + "grad_norm": 0.24723999202251434, + "learning_rate": 3.25181120584924e-05, + "loss": 1.674, + "step": 20373 + }, + { + "epoch": 6.25352977286679, + "grad_norm": 0.20302192866802216, + "learning_rate": 3.251345530208335e-05, + "loss": 1.6999, + "step": 20374 + }, + { + "epoch": 6.253836709637815, + "grad_norm": 0.25393861532211304, + "learning_rate": 3.250879871848655e-05, + "loss": 1.6761, + "step": 20375 + }, + { + "epoch": 6.25414364640884, + "grad_norm": 0.1879536211490631, + "learning_rate": 3.2504142307748064e-05, + "loss": 1.7233, + "step": 20376 + }, + { + "epoch": 6.254450583179865, + "grad_norm": 0.22197771072387695, + "learning_rate": 3.24994860699139e-05, + "loss": 1.6994, + "step": 20377 + }, + { + "epoch": 6.25475751995089, + "grad_norm": 0.24946242570877075, + "learning_rate": 3.249483000503008e-05, + "loss": 1.8488, + "step": 20378 + }, + { + "epoch": 6.255064456721915, + "grad_norm": 0.25218987464904785, + "learning_rate": 3.2490174113142594e-05, + "loss": 1.7947, + "step": 20379 + }, + { + "epoch": 6.255371393492941, + "grad_norm": 0.23970970511436462, + "learning_rate": 3.248551839429749e-05, + "loss": 1.785, + "step": 20380 + }, + { + "epoch": 6.255678330263966, + "grad_norm": 0.243649423122406, + "learning_rate": 3.248086284854074e-05, + "loss": 1.8089, + 
"step": 20381 + }, + { + "epoch": 6.2559852670349905, + "grad_norm": 0.18813125789165497, + "learning_rate": 3.247620747591838e-05, + "loss": 1.6892, + "step": 20382 + }, + { + "epoch": 6.256292203806016, + "grad_norm": 0.2495514154434204, + "learning_rate": 3.2471552276476404e-05, + "loss": 1.7573, + "step": 20383 + }, + { + "epoch": 6.256599140577041, + "grad_norm": 0.200107604265213, + "learning_rate": 3.2466897250260835e-05, + "loss": 1.7292, + "step": 20384 + }, + { + "epoch": 6.2569060773480665, + "grad_norm": 0.25782206654548645, + "learning_rate": 3.246224239731765e-05, + "loss": 1.8533, + "step": 20385 + }, + { + "epoch": 6.257213014119092, + "grad_norm": 0.1966158151626587, + "learning_rate": 3.245758771769288e-05, + "loss": 1.648, + "step": 20386 + }, + { + "epoch": 6.257519950890116, + "grad_norm": 0.23248116672039032, + "learning_rate": 3.245293321143249e-05, + "loss": 1.7277, + "step": 20387 + }, + { + "epoch": 6.257826887661142, + "grad_norm": 0.26347780227661133, + "learning_rate": 3.244827887858251e-05, + "loss": 1.7429, + "step": 20388 + }, + { + "epoch": 6.258133824432167, + "grad_norm": 0.20794285833835602, + "learning_rate": 3.244362471918894e-05, + "loss": 1.7358, + "step": 20389 + }, + { + "epoch": 6.258440761203192, + "grad_norm": 0.200898677110672, + "learning_rate": 3.243897073329774e-05, + "loss": 1.6661, + "step": 20390 + }, + { + "epoch": 6.258747697974218, + "grad_norm": 0.20945283770561218, + "learning_rate": 3.2434316920954935e-05, + "loss": 1.7036, + "step": 20391 + }, + { + "epoch": 6.259054634745242, + "grad_norm": 0.3154161274433136, + "learning_rate": 3.242966328220649e-05, + "loss": 1.8174, + "step": 20392 + }, + { + "epoch": 6.259361571516267, + "grad_norm": 0.19321799278259277, + "learning_rate": 3.242500981709843e-05, + "loss": 1.6823, + "step": 20393 + }, + { + "epoch": 6.259668508287293, + "grad_norm": 0.22610130906105042, + "learning_rate": 3.2420356525676696e-05, + "loss": 1.6865, + "step": 20394 + }, + { + "epoch": 
6.259975445058318, + "grad_norm": 0.19190505146980286, + "learning_rate": 3.241570340798734e-05, + "loss": 1.6663, + "step": 20395 + }, + { + "epoch": 6.260282381829343, + "grad_norm": 0.21956418454647064, + "learning_rate": 3.2411050464076276e-05, + "loss": 1.7279, + "step": 20396 + }, + { + "epoch": 6.260589318600369, + "grad_norm": 0.2448553591966629, + "learning_rate": 3.240639769398956e-05, + "loss": 1.7438, + "step": 20397 + }, + { + "epoch": 6.260896255371393, + "grad_norm": 0.19194214046001434, + "learning_rate": 3.2401745097773096e-05, + "loss": 1.7429, + "step": 20398 + }, + { + "epoch": 6.2612031921424185, + "grad_norm": 0.2567521333694458, + "learning_rate": 3.239709267547291e-05, + "loss": 1.7051, + "step": 20399 + }, + { + "epoch": 6.261510128913444, + "grad_norm": 0.18335886299610138, + "learning_rate": 3.239244042713498e-05, + "loss": 1.6828, + "step": 20400 + }, + { + "epoch": 6.261817065684469, + "grad_norm": 0.20112362504005432, + "learning_rate": 3.238778835280527e-05, + "loss": 1.6887, + "step": 20401 + }, + { + "epoch": 6.2621240024554945, + "grad_norm": 0.17095179855823517, + "learning_rate": 3.238313645252975e-05, + "loss": 1.7202, + "step": 20402 + }, + { + "epoch": 6.262430939226519, + "grad_norm": 0.24681979417800903, + "learning_rate": 3.237848472635442e-05, + "loss": 1.7196, + "step": 20403 + }, + { + "epoch": 6.262737875997544, + "grad_norm": 0.2022300660610199, + "learning_rate": 3.237383317432522e-05, + "loss": 1.7265, + "step": 20404 + }, + { + "epoch": 6.26304481276857, + "grad_norm": 0.2900621294975281, + "learning_rate": 3.236918179648813e-05, + "loss": 1.7051, + "step": 20405 + }, + { + "epoch": 6.263351749539595, + "grad_norm": 0.37675586342811584, + "learning_rate": 3.2364530592889135e-05, + "loss": 1.7747, + "step": 20406 + }, + { + "epoch": 6.26365868631062, + "grad_norm": 0.19033703207969666, + "learning_rate": 3.235987956357416e-05, + "loss": 1.7529, + "step": 20407 + }, + { + "epoch": 6.263965623081646, + "grad_norm": 
0.2877013385295868, + "learning_rate": 3.235522870858922e-05, + "loss": 1.6942, + "step": 20408 + }, + { + "epoch": 6.26427255985267, + "grad_norm": 0.22717125713825226, + "learning_rate": 3.235057802798023e-05, + "loss": 1.7302, + "step": 20409 + }, + { + "epoch": 6.264579496623695, + "grad_norm": 0.2571920156478882, + "learning_rate": 3.2345927521793185e-05, + "loss": 1.6782, + "step": 20410 + }, + { + "epoch": 6.264886433394721, + "grad_norm": 0.43085625767707825, + "learning_rate": 3.234127719007403e-05, + "loss": 1.7946, + "step": 20411 + }, + { + "epoch": 6.265193370165746, + "grad_norm": 0.19355928897857666, + "learning_rate": 3.2336627032868726e-05, + "loss": 1.7288, + "step": 20412 + }, + { + "epoch": 6.265500306936771, + "grad_norm": 0.24871474504470825, + "learning_rate": 3.233197705022322e-05, + "loss": 1.6862, + "step": 20413 + }, + { + "epoch": 6.265807243707796, + "grad_norm": 0.26919320225715637, + "learning_rate": 3.232732724218348e-05, + "loss": 1.8061, + "step": 20414 + }, + { + "epoch": 6.266114180478821, + "grad_norm": 0.21714363992214203, + "learning_rate": 3.2322677608795436e-05, + "loss": 1.7036, + "step": 20415 + }, + { + "epoch": 6.2664211172498465, + "grad_norm": 0.24496719241142273, + "learning_rate": 3.231802815010506e-05, + "loss": 1.7334, + "step": 20416 + }, + { + "epoch": 6.266728054020872, + "grad_norm": 0.22501519322395325, + "learning_rate": 3.231337886615831e-05, + "loss": 1.7545, + "step": 20417 + }, + { + "epoch": 6.267034990791897, + "grad_norm": 0.2683655917644501, + "learning_rate": 3.23087297570011e-05, + "loss": 1.7235, + "step": 20418 + }, + { + "epoch": 6.267341927562922, + "grad_norm": 0.23341359198093414, + "learning_rate": 3.230408082267938e-05, + "loss": 1.7389, + "step": 20419 + }, + { + "epoch": 6.267648864333947, + "grad_norm": 0.2914128601551056, + "learning_rate": 3.229943206323913e-05, + "loss": 1.7223, + "step": 20420 + }, + { + "epoch": 6.267955801104972, + "grad_norm": 0.2072528451681137, + "learning_rate": 
3.229478347872625e-05, + "loss": 1.7422, + "step": 20421 + }, + { + "epoch": 6.268262737875998, + "grad_norm": 0.22678662836551666, + "learning_rate": 3.229013506918671e-05, + "loss": 1.6973, + "step": 20422 + }, + { + "epoch": 6.268569674647023, + "grad_norm": 0.1928883194923401, + "learning_rate": 3.228548683466643e-05, + "loss": 1.7235, + "step": 20423 + }, + { + "epoch": 6.268876611418047, + "grad_norm": 0.2402963638305664, + "learning_rate": 3.2280838775211345e-05, + "loss": 1.7587, + "step": 20424 + }, + { + "epoch": 6.269183548189073, + "grad_norm": 0.20416294038295746, + "learning_rate": 3.227619089086742e-05, + "loss": 1.7591, + "step": 20425 + }, + { + "epoch": 6.269490484960098, + "grad_norm": 0.20308947563171387, + "learning_rate": 3.227154318168053e-05, + "loss": 1.7264, + "step": 20426 + }, + { + "epoch": 6.269797421731123, + "grad_norm": 0.18733863532543182, + "learning_rate": 3.226689564769667e-05, + "loss": 1.6943, + "step": 20427 + }, + { + "epoch": 6.270104358502149, + "grad_norm": 0.183793842792511, + "learning_rate": 3.226224828896173e-05, + "loss": 1.7082, + "step": 20428 + }, + { + "epoch": 6.270411295273174, + "grad_norm": 0.20471547544002533, + "learning_rate": 3.225760110552165e-05, + "loss": 1.7352, + "step": 20429 + }, + { + "epoch": 6.2707182320441985, + "grad_norm": 0.23386713862419128, + "learning_rate": 3.225295409742234e-05, + "loss": 1.7666, + "step": 20430 + }, + { + "epoch": 6.271025168815224, + "grad_norm": 0.2024994194507599, + "learning_rate": 3.224830726470976e-05, + "loss": 1.6573, + "step": 20431 + }, + { + "epoch": 6.271332105586249, + "grad_norm": 0.2352776825428009, + "learning_rate": 3.2243660607429805e-05, + "loss": 1.7884, + "step": 20432 + }, + { + "epoch": 6.2716390423572745, + "grad_norm": 0.19755585491657257, + "learning_rate": 3.223901412562841e-05, + "loss": 1.6964, + "step": 20433 + }, + { + "epoch": 6.2719459791283, + "grad_norm": 0.25833839178085327, + "learning_rate": 3.223436781935148e-05, + "loss": 1.715, 
+ "step": 20434 + }, + { + "epoch": 6.272252915899324, + "grad_norm": 0.2110220193862915, + "learning_rate": 3.222972168864493e-05, + "loss": 1.7617, + "step": 20435 + }, + { + "epoch": 6.27255985267035, + "grad_norm": 0.23262515664100647, + "learning_rate": 3.2225075733554685e-05, + "loss": 1.7616, + "step": 20436 + }, + { + "epoch": 6.272866789441375, + "grad_norm": 0.1926576942205429, + "learning_rate": 3.222042995412669e-05, + "loss": 1.6956, + "step": 20437 + }, + { + "epoch": 6.2731737262124, + "grad_norm": 0.20662757754325867, + "learning_rate": 3.22157843504068e-05, + "loss": 1.703, + "step": 20438 + }, + { + "epoch": 6.273480662983426, + "grad_norm": 0.22137406468391418, + "learning_rate": 3.2211138922440975e-05, + "loss": 1.6961, + "step": 20439 + }, + { + "epoch": 6.273787599754451, + "grad_norm": 0.25777003169059753, + "learning_rate": 3.2206493670275086e-05, + "loss": 1.704, + "step": 20440 + }, + { + "epoch": 6.274094536525475, + "grad_norm": 0.20540094375610352, + "learning_rate": 3.2201848593955046e-05, + "loss": 1.6759, + "step": 20441 + }, + { + "epoch": 6.274401473296501, + "grad_norm": 0.2447255402803421, + "learning_rate": 3.21972036935268e-05, + "loss": 1.7379, + "step": 20442 + }, + { + "epoch": 6.274708410067526, + "grad_norm": 0.2017194777727127, + "learning_rate": 3.219255896903619e-05, + "loss": 1.6518, + "step": 20443 + }, + { + "epoch": 6.2750153468385514, + "grad_norm": 0.22742003202438354, + "learning_rate": 3.2187914420529174e-05, + "loss": 1.7568, + "step": 20444 + }, + { + "epoch": 6.275322283609577, + "grad_norm": 0.2065356969833374, + "learning_rate": 3.218327004805161e-05, + "loss": 1.643, + "step": 20445 + }, + { + "epoch": 6.275629220380601, + "grad_norm": 0.18083053827285767, + "learning_rate": 3.217862585164942e-05, + "loss": 1.77, + "step": 20446 + }, + { + "epoch": 6.275936157151627, + "grad_norm": 0.2175968736410141, + "learning_rate": 3.2173981831368484e-05, + "loss": 1.738, + "step": 20447 + }, + { + "epoch": 
6.276243093922652, + "grad_norm": 0.17635080218315125, + "learning_rate": 3.216933798725473e-05, + "loss": 1.7109, + "step": 20448 + }, + { + "epoch": 6.276550030693677, + "grad_norm": 0.22289423644542694, + "learning_rate": 3.216469431935401e-05, + "loss": 1.7853, + "step": 20449 + }, + { + "epoch": 6.276856967464703, + "grad_norm": 0.21214549243450165, + "learning_rate": 3.216005082771225e-05, + "loss": 1.8196, + "step": 20450 + }, + { + "epoch": 6.277163904235728, + "grad_norm": 0.21992212533950806, + "learning_rate": 3.215540751237531e-05, + "loss": 1.7445, + "step": 20451 + }, + { + "epoch": 6.277470841006752, + "grad_norm": 0.16256563365459442, + "learning_rate": 3.2150764373389096e-05, + "loss": 1.6582, + "step": 20452 + }, + { + "epoch": 6.277777777777778, + "grad_norm": 0.1885976791381836, + "learning_rate": 3.214612141079949e-05, + "loss": 1.7491, + "step": 20453 + }, + { + "epoch": 6.278084714548803, + "grad_norm": 0.24101774394512177, + "learning_rate": 3.2141478624652386e-05, + "loss": 1.7476, + "step": 20454 + }, + { + "epoch": 6.278391651319828, + "grad_norm": 0.23378998041152954, + "learning_rate": 3.213683601499364e-05, + "loss": 1.7575, + "step": 20455 + }, + { + "epoch": 6.278698588090854, + "grad_norm": 0.2032867670059204, + "learning_rate": 3.213219358186917e-05, + "loss": 1.6999, + "step": 20456 + }, + { + "epoch": 6.279005524861878, + "grad_norm": 0.21332181990146637, + "learning_rate": 3.2127551325324836e-05, + "loss": 1.6634, + "step": 20457 + }, + { + "epoch": 6.2793124616329035, + "grad_norm": 0.23767098784446716, + "learning_rate": 3.2122909245406494e-05, + "loss": 1.8023, + "step": 20458 + }, + { + "epoch": 6.279619398403929, + "grad_norm": 0.19987638294696808, + "learning_rate": 3.211826734216007e-05, + "loss": 1.6848, + "step": 20459 + }, + { + "epoch": 6.279926335174954, + "grad_norm": 0.22169579565525055, + "learning_rate": 3.2113625615631385e-05, + "loss": 1.7599, + "step": 20460 + }, + { + "epoch": 6.2802332719459795, + 
"grad_norm": 0.1768191009759903, + "learning_rate": 3.210898406586634e-05, + "loss": 1.6894, + "step": 20461 + }, + { + "epoch": 6.280540208717004, + "grad_norm": 0.1923041045665741, + "learning_rate": 3.21043426929108e-05, + "loss": 1.7379, + "step": 20462 + }, + { + "epoch": 6.280847145488029, + "grad_norm": 0.1836252212524414, + "learning_rate": 3.2099701496810644e-05, + "loss": 1.6748, + "step": 20463 + }, + { + "epoch": 6.281154082259055, + "grad_norm": 0.2203192561864853, + "learning_rate": 3.2095060477611705e-05, + "loss": 1.6969, + "step": 20464 + }, + { + "epoch": 6.28146101903008, + "grad_norm": 0.25511759519577026, + "learning_rate": 3.20904196353599e-05, + "loss": 1.7806, + "step": 20465 + }, + { + "epoch": 6.281767955801105, + "grad_norm": 0.19464822113513947, + "learning_rate": 3.208577897010106e-05, + "loss": 1.6784, + "step": 20466 + }, + { + "epoch": 6.28207489257213, + "grad_norm": 0.1949714869260788, + "learning_rate": 3.208113848188105e-05, + "loss": 1.713, + "step": 20467 + }, + { + "epoch": 6.282381829343155, + "grad_norm": 0.22094127535820007, + "learning_rate": 3.207649817074572e-05, + "loss": 1.7397, + "step": 20468 + }, + { + "epoch": 6.28268876611418, + "grad_norm": 0.22343899309635162, + "learning_rate": 3.2071858036740954e-05, + "loss": 1.717, + "step": 20469 + }, + { + "epoch": 6.282995702885206, + "grad_norm": 0.20854893326759338, + "learning_rate": 3.2067218079912584e-05, + "loss": 1.7255, + "step": 20470 + }, + { + "epoch": 6.283302639656231, + "grad_norm": 0.21306286752223969, + "learning_rate": 3.206257830030649e-05, + "loss": 1.7251, + "step": 20471 + }, + { + "epoch": 6.283609576427256, + "grad_norm": 0.24995777010917664, + "learning_rate": 3.20579386979685e-05, + "loss": 1.7892, + "step": 20472 + }, + { + "epoch": 6.283916513198281, + "grad_norm": 0.23720023036003113, + "learning_rate": 3.2053299272944486e-05, + "loss": 1.7843, + "step": 20473 + }, + { + "epoch": 6.284223449969306, + "grad_norm": 0.2042113095521927, + 
"learning_rate": 3.204866002528029e-05, + "loss": 1.7318, + "step": 20474 + }, + { + "epoch": 6.2845303867403315, + "grad_norm": 0.22996367514133453, + "learning_rate": 3.2044020955021735e-05, + "loss": 1.6875, + "step": 20475 + }, + { + "epoch": 6.284837323511357, + "grad_norm": 0.187749981880188, + "learning_rate": 3.203938206221471e-05, + "loss": 1.7297, + "step": 20476 + }, + { + "epoch": 6.285144260282382, + "grad_norm": 0.18279509246349335, + "learning_rate": 3.2034743346905025e-05, + "loss": 1.6858, + "step": 20477 + }, + { + "epoch": 6.285451197053407, + "grad_norm": 0.1871512532234192, + "learning_rate": 3.203010480913855e-05, + "loss": 1.7224, + "step": 20478 + }, + { + "epoch": 6.285758133824432, + "grad_norm": 0.17732922732830048, + "learning_rate": 3.202546644896109e-05, + "loss": 1.6872, + "step": 20479 + }, + { + "epoch": 6.286065070595457, + "grad_norm": 0.21146097779273987, + "learning_rate": 3.2020828266418527e-05, + "loss": 1.797, + "step": 20480 + }, + { + "epoch": 6.286372007366483, + "grad_norm": 0.18914340436458588, + "learning_rate": 3.201619026155666e-05, + "loss": 1.7149, + "step": 20481 + }, + { + "epoch": 6.286678944137508, + "grad_norm": 0.20919133722782135, + "learning_rate": 3.2011552434421364e-05, + "loss": 1.7803, + "step": 20482 + }, + { + "epoch": 6.286985880908533, + "grad_norm": 0.17882505059242249, + "learning_rate": 3.200691478505843e-05, + "loss": 1.757, + "step": 20483 + }, + { + "epoch": 6.287292817679558, + "grad_norm": 0.1850014477968216, + "learning_rate": 3.200227731351373e-05, + "loss": 1.7006, + "step": 20484 + }, + { + "epoch": 6.287599754450583, + "grad_norm": 0.19999323785305023, + "learning_rate": 3.1997640019833056e-05, + "loss": 1.702, + "step": 20485 + }, + { + "epoch": 6.287906691221608, + "grad_norm": 0.20464713871479034, + "learning_rate": 3.1993002904062255e-05, + "loss": 1.7272, + "step": 20486 + }, + { + "epoch": 6.288213627992634, + "grad_norm": 0.2105564922094345, + "learning_rate": 
3.1988365966247154e-05, + "loss": 1.7062, + "step": 20487 + }, + { + "epoch": 6.288520564763659, + "grad_norm": 0.26322871446609497, + "learning_rate": 3.198372920643359e-05, + "loss": 1.7309, + "step": 20488 + }, + { + "epoch": 6.2888275015346835, + "grad_norm": 0.22787201404571533, + "learning_rate": 3.197909262466736e-05, + "loss": 1.7797, + "step": 20489 + }, + { + "epoch": 6.289134438305709, + "grad_norm": 0.21409621834754944, + "learning_rate": 3.1974456220994314e-05, + "loss": 1.8211, + "step": 20490 + }, + { + "epoch": 6.289441375076734, + "grad_norm": 0.2241450846195221, + "learning_rate": 3.196981999546025e-05, + "loss": 1.7255, + "step": 20491 + }, + { + "epoch": 6.2897483118477595, + "grad_norm": 0.23141883313655853, + "learning_rate": 3.1965183948110985e-05, + "loss": 1.7695, + "step": 20492 + }, + { + "epoch": 6.290055248618785, + "grad_norm": 0.209358349442482, + "learning_rate": 3.196054807899236e-05, + "loss": 1.6808, + "step": 20493 + }, + { + "epoch": 6.290362185389809, + "grad_norm": 0.20730538666248322, + "learning_rate": 3.195591238815015e-05, + "loss": 1.6847, + "step": 20494 + }, + { + "epoch": 6.290669122160835, + "grad_norm": 0.2568998634815216, + "learning_rate": 3.195127687563021e-05, + "loss": 1.664, + "step": 20495 + }, + { + "epoch": 6.29097605893186, + "grad_norm": 0.238932803273201, + "learning_rate": 3.1946641541478316e-05, + "loss": 1.7166, + "step": 20496 + }, + { + "epoch": 6.291282995702885, + "grad_norm": 0.235393688082695, + "learning_rate": 3.19420063857403e-05, + "loss": 1.6572, + "step": 20497 + }, + { + "epoch": 6.291589932473911, + "grad_norm": 0.2888807952404022, + "learning_rate": 3.1937371408461944e-05, + "loss": 1.7484, + "step": 20498 + }, + { + "epoch": 6.291896869244935, + "grad_norm": 0.18588709831237793, + "learning_rate": 3.1932736609689096e-05, + "loss": 1.7027, + "step": 20499 + }, + { + "epoch": 6.29220380601596, + "grad_norm": 0.3065604865550995, + "learning_rate": 3.1928101989467514e-05, + "loss": 1.8051, 
+ "step": 20500 + }, + { + "epoch": 6.292510742786986, + "grad_norm": 0.2480497658252716, + "learning_rate": 3.192346754784304e-05, + "loss": 1.7749, + "step": 20501 + }, + { + "epoch": 6.292817679558011, + "grad_norm": 0.268686443567276, + "learning_rate": 3.1918833284861436e-05, + "loss": 1.7062, + "step": 20502 + }, + { + "epoch": 6.293124616329036, + "grad_norm": 0.337510883808136, + "learning_rate": 3.191419920056853e-05, + "loss": 1.745, + "step": 20503 + }, + { + "epoch": 6.293431553100062, + "grad_norm": 0.18532821536064148, + "learning_rate": 3.190956529501009e-05, + "loss": 1.7098, + "step": 20504 + }, + { + "epoch": 6.293738489871086, + "grad_norm": 0.27805468440055847, + "learning_rate": 3.1904931568231956e-05, + "loss": 1.7252, + "step": 20505 + }, + { + "epoch": 6.2940454266421115, + "grad_norm": 0.22137443721294403, + "learning_rate": 3.190029802027987e-05, + "loss": 1.7595, + "step": 20506 + }, + { + "epoch": 6.294352363413137, + "grad_norm": 0.23159445822238922, + "learning_rate": 3.189566465119968e-05, + "loss": 1.7503, + "step": 20507 + }, + { + "epoch": 6.294659300184162, + "grad_norm": 0.2089100182056427, + "learning_rate": 3.189103146103712e-05, + "loss": 1.7021, + "step": 20508 + }, + { + "epoch": 6.2949662369551875, + "grad_norm": 0.1985119879245758, + "learning_rate": 3.1886398449838e-05, + "loss": 1.7468, + "step": 20509 + }, + { + "epoch": 6.295273173726212, + "grad_norm": 0.18612028658390045, + "learning_rate": 3.188176561764812e-05, + "loss": 1.6657, + "step": 20510 + }, + { + "epoch": 6.295580110497237, + "grad_norm": 0.22453728318214417, + "learning_rate": 3.1877132964513226e-05, + "loss": 1.7223, + "step": 20511 + }, + { + "epoch": 6.295887047268263, + "grad_norm": 0.270304799079895, + "learning_rate": 3.187250049047916e-05, + "loss": 1.7548, + "step": 20512 + }, + { + "epoch": 6.296193984039288, + "grad_norm": 0.19762152433395386, + "learning_rate": 3.1867868195591643e-05, + "loss": 1.6945, + "step": 20513 + }, + { + "epoch": 
6.296500920810313, + "grad_norm": 0.25173795223236084, + "learning_rate": 3.1863236079896486e-05, + "loss": 1.7303, + "step": 20514 + }, + { + "epoch": 6.296807857581339, + "grad_norm": 0.2073308676481247, + "learning_rate": 3.185860414343945e-05, + "loss": 1.7327, + "step": 20515 + }, + { + "epoch": 6.297114794352363, + "grad_norm": 0.24174070358276367, + "learning_rate": 3.185397238626635e-05, + "loss": 1.7577, + "step": 20516 + }, + { + "epoch": 6.297421731123388, + "grad_norm": 0.1950366348028183, + "learning_rate": 3.1849340808422905e-05, + "loss": 1.7137, + "step": 20517 + }, + { + "epoch": 6.297728667894414, + "grad_norm": 0.23416653275489807, + "learning_rate": 3.1844709409954936e-05, + "loss": 1.7547, + "step": 20518 + }, + { + "epoch": 6.298035604665439, + "grad_norm": 0.1939592808485031, + "learning_rate": 3.184007819090817e-05, + "loss": 1.7215, + "step": 20519 + }, + { + "epoch": 6.298342541436464, + "grad_norm": 0.21807245910167694, + "learning_rate": 3.1835447151328405e-05, + "loss": 1.7021, + "step": 20520 + }, + { + "epoch": 6.298649478207489, + "grad_norm": 0.21653762459754944, + "learning_rate": 3.183081629126138e-05, + "loss": 1.7426, + "step": 20521 + }, + { + "epoch": 6.298956414978514, + "grad_norm": 0.20749153196811676, + "learning_rate": 3.18261856107529e-05, + "loss": 1.7302, + "step": 20522 + }, + { + "epoch": 6.2992633517495396, + "grad_norm": 0.23450545966625214, + "learning_rate": 3.182155510984869e-05, + "loss": 1.7414, + "step": 20523 + }, + { + "epoch": 6.299570288520565, + "grad_norm": 0.17081578075885773, + "learning_rate": 3.181692478859455e-05, + "loss": 1.7017, + "step": 20524 + }, + { + "epoch": 6.29987722529159, + "grad_norm": 0.20244698226451874, + "learning_rate": 3.18122946470362e-05, + "loss": 1.6765, + "step": 20525 + }, + { + "epoch": 6.300184162062616, + "grad_norm": 0.20153406262397766, + "learning_rate": 3.180766468521941e-05, + "loss": 1.7437, + "step": 20526 + }, + { + "epoch": 6.30049109883364, + "grad_norm": 
0.21135647594928741, + "learning_rate": 3.180303490318996e-05, + "loss": 1.7202, + "step": 20527 + }, + { + "epoch": 6.300798035604665, + "grad_norm": 0.20342735946178436, + "learning_rate": 3.1798405300993555e-05, + "loss": 1.7268, + "step": 20528 + }, + { + "epoch": 6.301104972375691, + "grad_norm": 0.21153734624385834, + "learning_rate": 3.1793775878676e-05, + "loss": 1.7455, + "step": 20529 + }, + { + "epoch": 6.301411909146716, + "grad_norm": 0.2197744995355606, + "learning_rate": 3.1789146636283015e-05, + "loss": 1.7876, + "step": 20530 + }, + { + "epoch": 6.301718845917741, + "grad_norm": 0.2236124575138092, + "learning_rate": 3.1784517573860356e-05, + "loss": 1.7454, + "step": 20531 + }, + { + "epoch": 6.302025782688766, + "grad_norm": 0.22071333229541779, + "learning_rate": 3.177988869145376e-05, + "loss": 1.7197, + "step": 20532 + }, + { + "epoch": 6.302332719459791, + "grad_norm": 0.20137591660022736, + "learning_rate": 3.177525998910901e-05, + "loss": 1.7153, + "step": 20533 + }, + { + "epoch": 6.3026396562308165, + "grad_norm": 0.18981720507144928, + "learning_rate": 3.17706314668718e-05, + "loss": 1.6948, + "step": 20534 + }, + { + "epoch": 6.302946593001842, + "grad_norm": 0.20803335309028625, + "learning_rate": 3.176600312478791e-05, + "loss": 1.7454, + "step": 20535 + }, + { + "epoch": 6.303253529772867, + "grad_norm": 0.2224191278219223, + "learning_rate": 3.176137496290305e-05, + "loss": 1.708, + "step": 20536 + }, + { + "epoch": 6.303560466543892, + "grad_norm": 0.21110501885414124, + "learning_rate": 3.175674698126298e-05, + "loss": 1.6976, + "step": 20537 + }, + { + "epoch": 6.303867403314917, + "grad_norm": 0.19902437925338745, + "learning_rate": 3.175211917991342e-05, + "loss": 1.7246, + "step": 20538 + }, + { + "epoch": 6.304174340085942, + "grad_norm": 0.1930927336215973, + "learning_rate": 3.174749155890013e-05, + "loss": 1.7849, + "step": 20539 + }, + { + "epoch": 6.304481276856968, + "grad_norm": 0.19350691139698029, + "learning_rate": 
3.174286411826881e-05, + "loss": 1.7441, + "step": 20540 + }, + { + "epoch": 6.304788213627993, + "grad_norm": 0.18532924354076385, + "learning_rate": 3.173823685806523e-05, + "loss": 1.6675, + "step": 20541 + }, + { + "epoch": 6.305095150399017, + "grad_norm": 0.18890263140201569, + "learning_rate": 3.173360977833508e-05, + "loss": 1.7889, + "step": 20542 + }, + { + "epoch": 6.305402087170043, + "grad_norm": 0.20418904721736908, + "learning_rate": 3.17289828791241e-05, + "loss": 1.8298, + "step": 20543 + }, + { + "epoch": 6.305709023941068, + "grad_norm": 0.2298857718706131, + "learning_rate": 3.172435616047804e-05, + "loss": 1.7889, + "step": 20544 + }, + { + "epoch": 6.306015960712093, + "grad_norm": 0.20661889016628265, + "learning_rate": 3.171972962244258e-05, + "loss": 1.74, + "step": 20545 + }, + { + "epoch": 6.306322897483119, + "grad_norm": 0.17712774872779846, + "learning_rate": 3.1715103265063496e-05, + "loss": 1.72, + "step": 20546 + }, + { + "epoch": 6.306629834254144, + "grad_norm": 0.16776354610919952, + "learning_rate": 3.1710477088386456e-05, + "loss": 1.6715, + "step": 20547 + }, + { + "epoch": 6.3069367710251685, + "grad_norm": 0.21919682621955872, + "learning_rate": 3.170585109245721e-05, + "loss": 1.7232, + "step": 20548 + }, + { + "epoch": 6.307243707796194, + "grad_norm": 0.2026829719543457, + "learning_rate": 3.170122527732144e-05, + "loss": 1.7551, + "step": 20549 + }, + { + "epoch": 6.307550644567219, + "grad_norm": 0.18783780932426453, + "learning_rate": 3.169659964302493e-05, + "loss": 1.7024, + "step": 20550 + }, + { + "epoch": 6.3078575813382445, + "grad_norm": 0.2058420479297638, + "learning_rate": 3.1691974189613316e-05, + "loss": 1.7006, + "step": 20551 + }, + { + "epoch": 6.30816451810927, + "grad_norm": 0.21351832151412964, + "learning_rate": 3.168734891713237e-05, + "loss": 1.7586, + "step": 20552 + }, + { + "epoch": 6.308471454880294, + "grad_norm": 0.19816654920578003, + "learning_rate": 3.168272382562776e-05, + "loss": 1.7532, 
+ "step": 20553 + }, + { + "epoch": 6.30877839165132, + "grad_norm": 0.18253186345100403, + "learning_rate": 3.16780989151452e-05, + "loss": 1.7413, + "step": 20554 + }, + { + "epoch": 6.309085328422345, + "grad_norm": 0.23097483813762665, + "learning_rate": 3.167347418573042e-05, + "loss": 1.7355, + "step": 20555 + }, + { + "epoch": 6.30939226519337, + "grad_norm": 0.1984725296497345, + "learning_rate": 3.166884963742911e-05, + "loss": 1.6754, + "step": 20556 + }, + { + "epoch": 6.309699201964396, + "grad_norm": 0.2385166734457016, + "learning_rate": 3.166422527028696e-05, + "loss": 1.7322, + "step": 20557 + }, + { + "epoch": 6.310006138735421, + "grad_norm": 0.23216524720191956, + "learning_rate": 3.165960108434971e-05, + "loss": 1.7426, + "step": 20558 + }, + { + "epoch": 6.310313075506445, + "grad_norm": 0.22017790377140045, + "learning_rate": 3.165497707966301e-05, + "loss": 1.6977, + "step": 20559 + }, + { + "epoch": 6.310620012277471, + "grad_norm": 0.2934584617614746, + "learning_rate": 3.165035325627257e-05, + "loss": 1.7252, + "step": 20560 + }, + { + "epoch": 6.310926949048496, + "grad_norm": 0.21830198168754578, + "learning_rate": 3.1645729614224126e-05, + "loss": 1.781, + "step": 20561 + }, + { + "epoch": 6.311233885819521, + "grad_norm": 0.3082836866378784, + "learning_rate": 3.1641106153563306e-05, + "loss": 1.8015, + "step": 20562 + }, + { + "epoch": 6.311540822590547, + "grad_norm": 0.22441358864307404, + "learning_rate": 3.163648287433586e-05, + "loss": 1.8058, + "step": 20563 + }, + { + "epoch": 6.311847759361571, + "grad_norm": 0.36623889207839966, + "learning_rate": 3.163185977658744e-05, + "loss": 1.7092, + "step": 20564 + }, + { + "epoch": 6.3121546961325965, + "grad_norm": 0.22231145203113556, + "learning_rate": 3.1627236860363755e-05, + "loss": 1.6432, + "step": 20565 + }, + { + "epoch": 6.312461632903622, + "grad_norm": 0.25871971249580383, + "learning_rate": 3.162261412571047e-05, + "loss": 1.7156, + "step": 20566 + }, + { + "epoch": 
6.312768569674647, + "grad_norm": 0.24574241042137146, + "learning_rate": 3.16179915726733e-05, + "loss": 1.7977, + "step": 20567 + }, + { + "epoch": 6.3130755064456725, + "grad_norm": 0.197379007935524, + "learning_rate": 3.1613369201297895e-05, + "loss": 1.6966, + "step": 20568 + }, + { + "epoch": 6.313382443216697, + "grad_norm": 0.2149469256401062, + "learning_rate": 3.1608747011629975e-05, + "loss": 1.7385, + "step": 20569 + }, + { + "epoch": 6.313689379987722, + "grad_norm": 0.21942345798015594, + "learning_rate": 3.1604125003715174e-05, + "loss": 1.7369, + "step": 20570 + }, + { + "epoch": 6.313996316758748, + "grad_norm": 0.20977036654949188, + "learning_rate": 3.1599503177599197e-05, + "loss": 1.7429, + "step": 20571 + }, + { + "epoch": 6.314303253529773, + "grad_norm": 0.20113405585289001, + "learning_rate": 3.159488153332772e-05, + "loss": 1.7163, + "step": 20572 + }, + { + "epoch": 6.314610190300798, + "grad_norm": 0.22031868994235992, + "learning_rate": 3.1590260070946414e-05, + "loss": 1.7085, + "step": 20573 + }, + { + "epoch": 6.314917127071823, + "grad_norm": 0.24137777090072632, + "learning_rate": 3.158563879050094e-05, + "loss": 1.7169, + "step": 20574 + }, + { + "epoch": 6.315224063842848, + "grad_norm": 0.20265905559062958, + "learning_rate": 3.1581017692036985e-05, + "loss": 1.7466, + "step": 20575 + }, + { + "epoch": 6.315531000613873, + "grad_norm": 0.2997782528400421, + "learning_rate": 3.1576396775600206e-05, + "loss": 1.7287, + "step": 20576 + }, + { + "epoch": 6.315837937384899, + "grad_norm": 0.19672340154647827, + "learning_rate": 3.157177604123628e-05, + "loss": 1.7121, + "step": 20577 + }, + { + "epoch": 6.316144874155924, + "grad_norm": 0.26618507504463196, + "learning_rate": 3.156715548899085e-05, + "loss": 1.6958, + "step": 20578 + }, + { + "epoch": 6.316451810926949, + "grad_norm": 0.18854503333568573, + "learning_rate": 3.156253511890959e-05, + "loss": 1.7751, + "step": 20579 + }, + { + "epoch": 6.316758747697974, + "grad_norm": 
0.2306061089038849, + "learning_rate": 3.155791493103819e-05, + "loss": 1.6853, + "step": 20580 + }, + { + "epoch": 6.317065684468999, + "grad_norm": 0.20650778710842133, + "learning_rate": 3.1553294925422254e-05, + "loss": 1.7021, + "step": 20581 + }, + { + "epoch": 6.3173726212400245, + "grad_norm": 0.19474658370018005, + "learning_rate": 3.1548675102107494e-05, + "loss": 1.7146, + "step": 20582 + }, + { + "epoch": 6.31767955801105, + "grad_norm": 0.2150747925043106, + "learning_rate": 3.154405546113952e-05, + "loss": 1.7473, + "step": 20583 + }, + { + "epoch": 6.317986494782075, + "grad_norm": 0.19304975867271423, + "learning_rate": 3.153943600256402e-05, + "loss": 1.7209, + "step": 20584 + }, + { + "epoch": 6.3182934315531, + "grad_norm": 0.22610948979854584, + "learning_rate": 3.153481672642662e-05, + "loss": 1.717, + "step": 20585 + }, + { + "epoch": 6.318600368324125, + "grad_norm": 0.18705105781555176, + "learning_rate": 3.1530197632773006e-05, + "loss": 1.7326, + "step": 20586 + }, + { + "epoch": 6.31890730509515, + "grad_norm": 0.25632867217063904, + "learning_rate": 3.152557872164878e-05, + "loss": 1.7391, + "step": 20587 + }, + { + "epoch": 6.319214241866176, + "grad_norm": 0.18723119795322418, + "learning_rate": 3.152095999309964e-05, + "loss": 1.7193, + "step": 20588 + }, + { + "epoch": 6.319521178637201, + "grad_norm": 0.1759091317653656, + "learning_rate": 3.1516341447171184e-05, + "loss": 1.7024, + "step": 20589 + }, + { + "epoch": 6.319828115408226, + "grad_norm": 0.1838626265525818, + "learning_rate": 3.1511723083909084e-05, + "loss": 1.7027, + "step": 20590 + }, + { + "epoch": 6.320135052179251, + "grad_norm": 0.2615656554698944, + "learning_rate": 3.1507104903358964e-05, + "loss": 1.7798, + "step": 20591 + }, + { + "epoch": 6.320441988950276, + "grad_norm": 0.18816477060317993, + "learning_rate": 3.150248690556649e-05, + "loss": 1.6778, + "step": 20592 + }, + { + "epoch": 6.320748925721301, + "grad_norm": 0.20011866092681885, + "learning_rate": 
3.149786909057728e-05, + "loss": 1.6653, + "step": 20593 + }, + { + "epoch": 6.321055862492327, + "grad_norm": 0.26681140065193176, + "learning_rate": 3.149325145843696e-05, + "loss": 1.7523, + "step": 20594 + }, + { + "epoch": 6.321362799263352, + "grad_norm": 0.2062411904335022, + "learning_rate": 3.1488634009191177e-05, + "loss": 1.7584, + "step": 20595 + }, + { + "epoch": 6.3216697360343765, + "grad_norm": 0.22355243563652039, + "learning_rate": 3.148401674288556e-05, + "loss": 1.7106, + "step": 20596 + }, + { + "epoch": 6.321976672805402, + "grad_norm": 0.20189255475997925, + "learning_rate": 3.147939965956576e-05, + "loss": 1.6775, + "step": 20597 + }, + { + "epoch": 6.322283609576427, + "grad_norm": 0.23753875494003296, + "learning_rate": 3.147478275927736e-05, + "loss": 1.7661, + "step": 20598 + }, + { + "epoch": 6.3225905463474525, + "grad_norm": 0.18658648431301117, + "learning_rate": 3.147016604206604e-05, + "loss": 1.7562, + "step": 20599 + }, + { + "epoch": 6.322897483118478, + "grad_norm": 0.2610020637512207, + "learning_rate": 3.146554950797738e-05, + "loss": 1.7217, + "step": 20600 + }, + { + "epoch": 6.323204419889503, + "grad_norm": 0.18329289555549622, + "learning_rate": 3.146093315705704e-05, + "loss": 1.7206, + "step": 20601 + }, + { + "epoch": 6.323511356660528, + "grad_norm": 0.2393725961446762, + "learning_rate": 3.1456316989350606e-05, + "loss": 1.7646, + "step": 20602 + }, + { + "epoch": 6.323818293431553, + "grad_norm": 0.23535947501659393, + "learning_rate": 3.1451701004903736e-05, + "loss": 1.7718, + "step": 20603 + }, + { + "epoch": 6.324125230202578, + "grad_norm": 0.23179253935813904, + "learning_rate": 3.1447085203762014e-05, + "loss": 1.7311, + "step": 20604 + }, + { + "epoch": 6.324432166973604, + "grad_norm": 0.24929681420326233, + "learning_rate": 3.144246958597109e-05, + "loss": 1.7728, + "step": 20605 + }, + { + "epoch": 6.324739103744629, + "grad_norm": 0.22520960867404938, + "learning_rate": 3.1437854151576526e-05, + "loss": 
1.749, + "step": 20606 + }, + { + "epoch": 6.3250460405156534, + "grad_norm": 0.3005391061306, + "learning_rate": 3.1433238900623997e-05, + "loss": 1.7725, + "step": 20607 + }, + { + "epoch": 6.325352977286679, + "grad_norm": 0.22625432908535004, + "learning_rate": 3.142862383315908e-05, + "loss": 1.7083, + "step": 20608 + }, + { + "epoch": 6.325659914057704, + "grad_norm": 0.28015029430389404, + "learning_rate": 3.142400894922737e-05, + "loss": 1.6862, + "step": 20609 + }, + { + "epoch": 6.3259668508287294, + "grad_norm": 0.2520587146282196, + "learning_rate": 3.141939424887451e-05, + "loss": 1.7059, + "step": 20610 + }, + { + "epoch": 6.326273787599755, + "grad_norm": 0.24668551981449127, + "learning_rate": 3.141477973214607e-05, + "loss": 1.6858, + "step": 20611 + }, + { + "epoch": 6.326580724370779, + "grad_norm": 0.2524704337120056, + "learning_rate": 3.1410165399087675e-05, + "loss": 1.6884, + "step": 20612 + }, + { + "epoch": 6.326887661141805, + "grad_norm": 0.18849264085292816, + "learning_rate": 3.1405551249744916e-05, + "loss": 1.6984, + "step": 20613 + }, + { + "epoch": 6.32719459791283, + "grad_norm": 0.2411552518606186, + "learning_rate": 3.140093728416342e-05, + "loss": 1.7455, + "step": 20614 + }, + { + "epoch": 6.327501534683855, + "grad_norm": 0.2268913835287094, + "learning_rate": 3.139632350238874e-05, + "loss": 1.7124, + "step": 20615 + }, + { + "epoch": 6.327808471454881, + "grad_norm": 0.3118770718574524, + "learning_rate": 3.1391709904466515e-05, + "loss": 1.7322, + "step": 20616 + }, + { + "epoch": 6.328115408225905, + "grad_norm": 0.25166428089141846, + "learning_rate": 3.1387096490442294e-05, + "loss": 1.7136, + "step": 20617 + }, + { + "epoch": 6.32842234499693, + "grad_norm": 0.2733297049999237, + "learning_rate": 3.138248326036172e-05, + "loss": 1.7939, + "step": 20618 + }, + { + "epoch": 6.328729281767956, + "grad_norm": 0.24583236873149872, + "learning_rate": 3.1377870214270334e-05, + "loss": 1.7105, + "step": 20619 + }, + { + 
"epoch": 6.329036218538981, + "grad_norm": 0.2533528506755829, + "learning_rate": 3.137325735221377e-05, + "loss": 1.7828, + "step": 20620 + }, + { + "epoch": 6.329343155310006, + "grad_norm": 0.27662715315818787, + "learning_rate": 3.136864467423758e-05, + "loss": 1.6969, + "step": 20621 + }, + { + "epoch": 6.329650092081032, + "grad_norm": 0.20107655227184296, + "learning_rate": 3.136403218038738e-05, + "loss": 1.6659, + "step": 20622 + }, + { + "epoch": 6.329957028852056, + "grad_norm": 0.21126115322113037, + "learning_rate": 3.135941987070872e-05, + "loss": 1.7372, + "step": 20623 + }, + { + "epoch": 6.3302639656230815, + "grad_norm": 0.1840609908103943, + "learning_rate": 3.1354807745247206e-05, + "loss": 1.7219, + "step": 20624 + }, + { + "epoch": 6.330570902394107, + "grad_norm": 0.23623648285865784, + "learning_rate": 3.135019580404842e-05, + "loss": 1.8059, + "step": 20625 + }, + { + "epoch": 6.330877839165132, + "grad_norm": 0.19853124022483826, + "learning_rate": 3.134558404715792e-05, + "loss": 1.7336, + "step": 20626 + }, + { + "epoch": 6.3311847759361575, + "grad_norm": 0.2261304259300232, + "learning_rate": 3.13409724746213e-05, + "loss": 1.7508, + "step": 20627 + }, + { + "epoch": 6.331491712707182, + "grad_norm": 0.1797952800989151, + "learning_rate": 3.1336361086484104e-05, + "loss": 1.6569, + "step": 20628 + }, + { + "epoch": 6.331798649478207, + "grad_norm": 0.21610359847545624, + "learning_rate": 3.133174988279195e-05, + "loss": 1.7093, + "step": 20629 + }, + { + "epoch": 6.332105586249233, + "grad_norm": 0.1818271279335022, + "learning_rate": 3.1327138863590365e-05, + "loss": 1.6951, + "step": 20630 + }, + { + "epoch": 6.332412523020258, + "grad_norm": 0.20425963401794434, + "learning_rate": 3.1322528028924956e-05, + "loss": 1.7399, + "step": 20631 + }, + { + "epoch": 6.332719459791283, + "grad_norm": 0.20357854664325714, + "learning_rate": 3.131791737884126e-05, + "loss": 1.693, + "step": 20632 + }, + { + "epoch": 6.333026396562309, + 
"grad_norm": 0.25307130813598633, + "learning_rate": 3.1313306913384874e-05, + "loss": 1.674, + "step": 20633 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.21596084535121918, + "learning_rate": 3.130869663260132e-05, + "loss": 1.7521, + "step": 20634 + }, + { + "epoch": 6.333640270104358, + "grad_norm": 0.24110902845859528, + "learning_rate": 3.1304086536536194e-05, + "loss": 1.6723, + "step": 20635 + }, + { + "epoch": 6.333947206875384, + "grad_norm": 0.21365956962108612, + "learning_rate": 3.129947662523503e-05, + "loss": 1.7702, + "step": 20636 + }, + { + "epoch": 6.334254143646409, + "grad_norm": 0.21873877942562103, + "learning_rate": 3.129486689874341e-05, + "loss": 1.7176, + "step": 20637 + }, + { + "epoch": 6.334561080417434, + "grad_norm": 0.2543679475784302, + "learning_rate": 3.129025735710687e-05, + "loss": 1.7733, + "step": 20638 + }, + { + "epoch": 6.334868017188459, + "grad_norm": 0.24591630697250366, + "learning_rate": 3.1285648000370996e-05, + "loss": 1.7212, + "step": 20639 + }, + { + "epoch": 6.335174953959484, + "grad_norm": 0.2453039139509201, + "learning_rate": 3.128103882858129e-05, + "loss": 1.7316, + "step": 20640 + }, + { + "epoch": 6.3354818907305095, + "grad_norm": 0.239897683262825, + "learning_rate": 3.127642984178334e-05, + "loss": 1.7495, + "step": 20641 + }, + { + "epoch": 6.335788827501535, + "grad_norm": 0.20719192922115326, + "learning_rate": 3.12718210400227e-05, + "loss": 1.7242, + "step": 20642 + }, + { + "epoch": 6.33609576427256, + "grad_norm": 0.1813955008983612, + "learning_rate": 3.126721242334487e-05, + "loss": 1.672, + "step": 20643 + }, + { + "epoch": 6.336402701043585, + "grad_norm": 0.20045650005340576, + "learning_rate": 3.126260399179546e-05, + "loss": 1.7854, + "step": 20644 + }, + { + "epoch": 6.33670963781461, + "grad_norm": 0.23010976612567902, + "learning_rate": 3.125799574541995e-05, + "loss": 1.7508, + "step": 20645 + }, + { + "epoch": 6.337016574585635, + "grad_norm": 0.1854519248008728, + 
"learning_rate": 3.1253387684263924e-05, + "loss": 1.7049, + "step": 20646 + }, + { + "epoch": 6.337323511356661, + "grad_norm": 0.2062511295080185, + "learning_rate": 3.1248779808372894e-05, + "loss": 1.6894, + "step": 20647 + }, + { + "epoch": 6.337630448127686, + "grad_norm": 0.19851341843605042, + "learning_rate": 3.124417211779244e-05, + "loss": 1.7332, + "step": 20648 + }, + { + "epoch": 6.337937384898711, + "grad_norm": 0.2099175751209259, + "learning_rate": 3.1239564612568054e-05, + "loss": 1.7577, + "step": 20649 + }, + { + "epoch": 6.338244321669736, + "grad_norm": 0.2152891904115677, + "learning_rate": 3.123495729274529e-05, + "loss": 1.7691, + "step": 20650 + }, + { + "epoch": 6.338551258440761, + "grad_norm": 0.19431835412979126, + "learning_rate": 3.123035015836967e-05, + "loss": 1.7035, + "step": 20651 + }, + { + "epoch": 6.338858195211786, + "grad_norm": 0.20863930881023407, + "learning_rate": 3.122574320948674e-05, + "loss": 1.7166, + "step": 20652 + }, + { + "epoch": 6.339165131982812, + "grad_norm": 0.17948369681835175, + "learning_rate": 3.122113644614201e-05, + "loss": 1.732, + "step": 20653 + }, + { + "epoch": 6.339472068753837, + "grad_norm": 0.2329161912202835, + "learning_rate": 3.121652986838103e-05, + "loss": 1.6934, + "step": 20654 + }, + { + "epoch": 6.3397790055248615, + "grad_norm": 0.23563681542873383, + "learning_rate": 3.12119234762493e-05, + "loss": 1.7329, + "step": 20655 + }, + { + "epoch": 6.340085942295887, + "grad_norm": 0.22654885053634644, + "learning_rate": 3.120731726979236e-05, + "loss": 1.767, + "step": 20656 + }, + { + "epoch": 6.340392879066912, + "grad_norm": 0.2507181465625763, + "learning_rate": 3.1202711249055715e-05, + "loss": 1.7071, + "step": 20657 + }, + { + "epoch": 6.3406998158379375, + "grad_norm": 0.20573864877223969, + "learning_rate": 3.1198105414084906e-05, + "loss": 1.7566, + "step": 20658 + }, + { + "epoch": 6.341006752608963, + "grad_norm": 0.23311644792556763, + "learning_rate": 
3.119349976492545e-05, + "loss": 1.6778, + "step": 20659 + }, + { + "epoch": 6.341313689379987, + "grad_norm": 0.18166053295135498, + "learning_rate": 3.118889430162283e-05, + "loss": 1.7109, + "step": 20660 + }, + { + "epoch": 6.341620626151013, + "grad_norm": 0.21054090559482574, + "learning_rate": 3.11842890242226e-05, + "loss": 1.7255, + "step": 20661 + }, + { + "epoch": 6.341927562922038, + "grad_norm": 0.19898973405361176, + "learning_rate": 3.1179683932770235e-05, + "loss": 1.7017, + "step": 20662 + }, + { + "epoch": 6.342234499693063, + "grad_norm": 0.17782434821128845, + "learning_rate": 3.117507902731127e-05, + "loss": 1.6858, + "step": 20663 + }, + { + "epoch": 6.342541436464089, + "grad_norm": 0.19286927580833435, + "learning_rate": 3.117047430789121e-05, + "loss": 1.707, + "step": 20664 + }, + { + "epoch": 6.342848373235114, + "grad_norm": 0.18578651547431946, + "learning_rate": 3.1165869774555565e-05, + "loss": 1.7331, + "step": 20665 + }, + { + "epoch": 6.343155310006138, + "grad_norm": 0.19728249311447144, + "learning_rate": 3.1161265427349826e-05, + "loss": 1.7165, + "step": 20666 + }, + { + "epoch": 6.343462246777164, + "grad_norm": 0.18240176141262054, + "learning_rate": 3.115666126631952e-05, + "loss": 1.7167, + "step": 20667 + }, + { + "epoch": 6.343769183548189, + "grad_norm": 0.1928495317697525, + "learning_rate": 3.115205729151011e-05, + "loss": 1.7431, + "step": 20668 + }, + { + "epoch": 6.344076120319214, + "grad_norm": 0.19459952414035797, + "learning_rate": 3.1147453502967125e-05, + "loss": 1.7294, + "step": 20669 + }, + { + "epoch": 6.34438305709024, + "grad_norm": 0.18829894065856934, + "learning_rate": 3.1142849900736046e-05, + "loss": 1.7512, + "step": 20670 + }, + { + "epoch": 6.344689993861264, + "grad_norm": 0.19678451120853424, + "learning_rate": 3.11382464848624e-05, + "loss": 1.673, + "step": 20671 + }, + { + "epoch": 6.3449969306322895, + "grad_norm": 0.22256550192832947, + "learning_rate": 3.1133643255391635e-05, + "loss": 
1.7044, + "step": 20672 + }, + { + "epoch": 6.345303867403315, + "grad_norm": 0.24741628766059875, + "learning_rate": 3.112904021236929e-05, + "loss": 1.7904, + "step": 20673 + }, + { + "epoch": 6.34561080417434, + "grad_norm": 0.20286159217357635, + "learning_rate": 3.11244373558408e-05, + "loss": 1.6976, + "step": 20674 + }, + { + "epoch": 6.3459177409453655, + "grad_norm": 0.2005387842655182, + "learning_rate": 3.11198346858517e-05, + "loss": 1.7083, + "step": 20675 + }, + { + "epoch": 6.346224677716391, + "grad_norm": 0.22312256693840027, + "learning_rate": 3.111523220244747e-05, + "loss": 1.7575, + "step": 20676 + }, + { + "epoch": 6.346531614487415, + "grad_norm": 0.2968841791152954, + "learning_rate": 3.111062990567356e-05, + "loss": 1.7813, + "step": 20677 + }, + { + "epoch": 6.346838551258441, + "grad_norm": 0.22900697588920593, + "learning_rate": 3.1106027795575496e-05, + "loss": 1.6818, + "step": 20678 + }, + { + "epoch": 6.347145488029466, + "grad_norm": 0.1912240833044052, + "learning_rate": 3.110142587219873e-05, + "loss": 1.7174, + "step": 20679 + }, + { + "epoch": 6.347452424800491, + "grad_norm": 0.20461280643939972, + "learning_rate": 3.1096824135588754e-05, + "loss": 1.6945, + "step": 20680 + }, + { + "epoch": 6.347759361571517, + "grad_norm": 0.19344913959503174, + "learning_rate": 3.109222258579103e-05, + "loss": 1.7064, + "step": 20681 + }, + { + "epoch": 6.348066298342541, + "grad_norm": 0.1833983063697815, + "learning_rate": 3.108762122285106e-05, + "loss": 1.702, + "step": 20682 + }, + { + "epoch": 6.348373235113566, + "grad_norm": 0.20344893634319305, + "learning_rate": 3.108302004681429e-05, + "loss": 1.7323, + "step": 20683 + }, + { + "epoch": 6.348680171884592, + "grad_norm": 0.18629617989063263, + "learning_rate": 3.107841905772622e-05, + "loss": 1.6841, + "step": 20684 + }, + { + "epoch": 6.348987108655617, + "grad_norm": 0.19279471039772034, + "learning_rate": 3.107381825563228e-05, + "loss": 1.7581, + "step": 20685 + }, + { + 
"epoch": 6.349294045426642, + "grad_norm": 0.21727058291435242, + "learning_rate": 3.106921764057798e-05, + "loss": 1.7231, + "step": 20686 + }, + { + "epoch": 6.349600982197667, + "grad_norm": 0.20952723920345306, + "learning_rate": 3.1064617212608747e-05, + "loss": 1.713, + "step": 20687 + }, + { + "epoch": 6.349907918968692, + "grad_norm": 0.2358582466840744, + "learning_rate": 3.10600169717701e-05, + "loss": 1.7291, + "step": 20688 + }, + { + "epoch": 6.350214855739718, + "grad_norm": 0.21846619248390198, + "learning_rate": 3.105541691810743e-05, + "loss": 1.7365, + "step": 20689 + }, + { + "epoch": 6.350521792510743, + "grad_norm": 0.22137843072414398, + "learning_rate": 3.1050817051666256e-05, + "loss": 1.7404, + "step": 20690 + }, + { + "epoch": 6.350828729281768, + "grad_norm": 0.2301674485206604, + "learning_rate": 3.1046217372492e-05, + "loss": 1.7422, + "step": 20691 + }, + { + "epoch": 6.351135666052793, + "grad_norm": 0.18955166637897491, + "learning_rate": 3.104161788063015e-05, + "loss": 1.7063, + "step": 20692 + }, + { + "epoch": 6.351442602823818, + "grad_norm": 0.21172095835208893, + "learning_rate": 3.103701857612614e-05, + "loss": 1.6856, + "step": 20693 + }, + { + "epoch": 6.351749539594843, + "grad_norm": 0.20921260118484497, + "learning_rate": 3.103241945902541e-05, + "loss": 1.7384, + "step": 20694 + }, + { + "epoch": 6.352056476365869, + "grad_norm": 0.21005603671073914, + "learning_rate": 3.102782052937345e-05, + "loss": 1.7118, + "step": 20695 + }, + { + "epoch": 6.352363413136894, + "grad_norm": 0.20888659358024597, + "learning_rate": 3.102322178721567e-05, + "loss": 1.7172, + "step": 20696 + }, + { + "epoch": 6.352670349907919, + "grad_norm": 0.194463849067688, + "learning_rate": 3.101862323259754e-05, + "loss": 1.6909, + "step": 20697 + }, + { + "epoch": 6.352977286678944, + "grad_norm": 0.20848685503005981, + "learning_rate": 3.1014024865564494e-05, + "loss": 1.7846, + "step": 20698 + }, + { + "epoch": 6.353284223449969, + 
"grad_norm": 0.18669761717319489, + "learning_rate": 3.100942668616201e-05, + "loss": 1.7542, + "step": 20699 + }, + { + "epoch": 6.3535911602209945, + "grad_norm": 0.23618464171886444, + "learning_rate": 3.100482869443547e-05, + "loss": 1.7292, + "step": 20700 + }, + { + "epoch": 6.35389809699202, + "grad_norm": 0.19389905035495758, + "learning_rate": 3.100023089043037e-05, + "loss": 1.6847, + "step": 20701 + }, + { + "epoch": 6.354205033763045, + "grad_norm": 0.20346343517303467, + "learning_rate": 3.09956332741921e-05, + "loss": 1.7096, + "step": 20702 + }, + { + "epoch": 6.35451197053407, + "grad_norm": 0.20825842022895813, + "learning_rate": 3.099103584576614e-05, + "loss": 1.6974, + "step": 20703 + }, + { + "epoch": 6.354818907305095, + "grad_norm": 0.2093508094549179, + "learning_rate": 3.0986438605197895e-05, + "loss": 1.6849, + "step": 20704 + }, + { + "epoch": 6.35512584407612, + "grad_norm": 0.2576633393764496, + "learning_rate": 3.098184155253282e-05, + "loss": 1.7974, + "step": 20705 + }, + { + "epoch": 6.355432780847146, + "grad_norm": 0.18197253346443176, + "learning_rate": 3.097724468781632e-05, + "loss": 1.6723, + "step": 20706 + }, + { + "epoch": 6.355739717618171, + "grad_norm": 0.24809512495994568, + "learning_rate": 3.0972648011093855e-05, + "loss": 1.7378, + "step": 20707 + }, + { + "epoch": 6.356046654389196, + "grad_norm": 0.2046923190355301, + "learning_rate": 3.0968051522410814e-05, + "loss": 1.7502, + "step": 20708 + }, + { + "epoch": 6.356353591160221, + "grad_norm": 0.20443019270896912, + "learning_rate": 3.096345522181265e-05, + "loss": 1.7179, + "step": 20709 + }, + { + "epoch": 6.356660527931246, + "grad_norm": 0.1906277984380722, + "learning_rate": 3.09588591093448e-05, + "loss": 1.7167, + "step": 20710 + }, + { + "epoch": 6.356967464702271, + "grad_norm": 0.20729197561740875, + "learning_rate": 3.095426318505263e-05, + "loss": 1.7193, + "step": 20711 + }, + { + "epoch": 6.357274401473297, + "grad_norm": 0.23446644842624664, + 
"learning_rate": 3.094966744898162e-05, + "loss": 1.7341, + "step": 20712 + }, + { + "epoch": 6.357581338244322, + "grad_norm": 0.18882590532302856, + "learning_rate": 3.094507190117715e-05, + "loss": 1.7001, + "step": 20713 + }, + { + "epoch": 6.3578882750153465, + "grad_norm": 0.27240705490112305, + "learning_rate": 3.094047654168465e-05, + "loss": 1.7641, + "step": 20714 + }, + { + "epoch": 6.358195211786372, + "grad_norm": 0.19616954028606415, + "learning_rate": 3.093588137054952e-05, + "loss": 1.751, + "step": 20715 + }, + { + "epoch": 6.358502148557397, + "grad_norm": 0.23402562737464905, + "learning_rate": 3.093128638781721e-05, + "loss": 1.7274, + "step": 20716 + }, + { + "epoch": 6.3588090853284225, + "grad_norm": 0.18189528584480286, + "learning_rate": 3.092669159353309e-05, + "loss": 1.7079, + "step": 20717 + }, + { + "epoch": 6.359116022099448, + "grad_norm": 0.21583771705627441, + "learning_rate": 3.092209698774259e-05, + "loss": 1.6811, + "step": 20718 + }, + { + "epoch": 6.359422958870473, + "grad_norm": 0.2477681040763855, + "learning_rate": 3.091750257049109e-05, + "loss": 1.6963, + "step": 20719 + }, + { + "epoch": 6.359729895641498, + "grad_norm": 0.2883109152317047, + "learning_rate": 3.091290834182403e-05, + "loss": 1.8349, + "step": 20720 + }, + { + "epoch": 6.360036832412523, + "grad_norm": 0.23407170176506042, + "learning_rate": 3.09083143017868e-05, + "loss": 1.7271, + "step": 20721 + }, + { + "epoch": 6.360343769183548, + "grad_norm": 0.2818833589553833, + "learning_rate": 3.090372045042479e-05, + "loss": 1.7852, + "step": 20722 + }, + { + "epoch": 6.360650705954574, + "grad_norm": 0.24415317177772522, + "learning_rate": 3.089912678778341e-05, + "loss": 1.6826, + "step": 20723 + }, + { + "epoch": 6.360957642725599, + "grad_norm": 0.26786303520202637, + "learning_rate": 3.0894533313908056e-05, + "loss": 1.7616, + "step": 20724 + }, + { + "epoch": 6.361264579496623, + "grad_norm": 0.3235633969306946, + "learning_rate": 3.088994002884411e-05, 
+ "loss": 1.7637, + "step": 20725 + }, + { + "epoch": 6.361571516267649, + "grad_norm": 0.18675416707992554, + "learning_rate": 3.0885346932637e-05, + "loss": 1.7037, + "step": 20726 + }, + { + "epoch": 6.361878453038674, + "grad_norm": 0.295802503824234, + "learning_rate": 3.0880754025332084e-05, + "loss": 1.7435, + "step": 20727 + }, + { + "epoch": 6.362185389809699, + "grad_norm": 0.18665561079978943, + "learning_rate": 3.0876161306974756e-05, + "loss": 1.684, + "step": 20728 + }, + { + "epoch": 6.362492326580725, + "grad_norm": 0.2530463635921478, + "learning_rate": 3.087156877761043e-05, + "loss": 1.7934, + "step": 20729 + }, + { + "epoch": 6.362799263351749, + "grad_norm": 0.17860126495361328, + "learning_rate": 3.086697643728445e-05, + "loss": 1.6977, + "step": 20730 + }, + { + "epoch": 6.3631062001227745, + "grad_norm": 0.20118845999240875, + "learning_rate": 3.086238428604223e-05, + "loss": 1.7241, + "step": 20731 + }, + { + "epoch": 6.3634131368938, + "grad_norm": 0.18811924755573273, + "learning_rate": 3.085779232392915e-05, + "loss": 1.6918, + "step": 20732 + }, + { + "epoch": 6.363720073664825, + "grad_norm": 0.1841908097267151, + "learning_rate": 3.085320055099058e-05, + "loss": 1.735, + "step": 20733 + }, + { + "epoch": 6.3640270104358505, + "grad_norm": 0.1956033855676651, + "learning_rate": 3.08486089672719e-05, + "loss": 1.7203, + "step": 20734 + }, + { + "epoch": 6.364333947206875, + "grad_norm": 0.19844500720500946, + "learning_rate": 3.084401757281851e-05, + "loss": 1.6767, + "step": 20735 + }, + { + "epoch": 6.3646408839779, + "grad_norm": 0.2018919438123703, + "learning_rate": 3.083942636767575e-05, + "loss": 1.6912, + "step": 20736 + }, + { + "epoch": 6.364947820748926, + "grad_norm": 0.18929271399974823, + "learning_rate": 3.083483535188901e-05, + "loss": 1.6838, + "step": 20737 + }, + { + "epoch": 6.365254757519951, + "grad_norm": 0.19833499193191528, + "learning_rate": 3.0830244525503674e-05, + "loss": 1.7139, + "step": 20738 + }, + { + 
"epoch": 6.365561694290976, + "grad_norm": 0.17029902338981628, + "learning_rate": 3.082565388856509e-05, + "loss": 1.6665, + "step": 20739 + }, + { + "epoch": 6.365868631062002, + "grad_norm": 0.19526802003383636, + "learning_rate": 3.082106344111861e-05, + "loss": 1.7021, + "step": 20740 + }, + { + "epoch": 6.366175567833026, + "grad_norm": 0.19061279296875, + "learning_rate": 3.081647318320966e-05, + "loss": 1.7134, + "step": 20741 + }, + { + "epoch": 6.366482504604051, + "grad_norm": 0.17782293260097504, + "learning_rate": 3.081188311488354e-05, + "loss": 1.741, + "step": 20742 + }, + { + "epoch": 6.366789441375077, + "grad_norm": 0.20002372562885284, + "learning_rate": 3.080729323618565e-05, + "loss": 1.6943, + "step": 20743 + }, + { + "epoch": 6.367096378146102, + "grad_norm": 0.22873486578464508, + "learning_rate": 3.080270354716134e-05, + "loss": 1.7223, + "step": 20744 + }, + { + "epoch": 6.367403314917127, + "grad_norm": 0.191136434674263, + "learning_rate": 3.079811404785595e-05, + "loss": 1.6774, + "step": 20745 + }, + { + "epoch": 6.367710251688152, + "grad_norm": 0.20446795225143433, + "learning_rate": 3.0793524738314874e-05, + "loss": 1.7443, + "step": 20746 + }, + { + "epoch": 6.368017188459177, + "grad_norm": 0.20668596029281616, + "learning_rate": 3.078893561858341e-05, + "loss": 1.7553, + "step": 20747 + }, + { + "epoch": 6.3683241252302025, + "grad_norm": 0.18445394933223724, + "learning_rate": 3.078434668870698e-05, + "loss": 1.7365, + "step": 20748 + }, + { + "epoch": 6.368631062001228, + "grad_norm": 0.1824318915605545, + "learning_rate": 3.077975794873088e-05, + "loss": 1.7248, + "step": 20749 + }, + { + "epoch": 6.368937998772253, + "grad_norm": 0.18452249467372894, + "learning_rate": 3.077516939870047e-05, + "loss": 1.7095, + "step": 20750 + }, + { + "epoch": 6.3692449355432785, + "grad_norm": 0.17254458367824554, + "learning_rate": 3.077058103866112e-05, + "loss": 1.6937, + "step": 20751 + }, + { + "epoch": 6.369551872314303, + 
"grad_norm": 0.2022976130247116, + "learning_rate": 3.0765992868658154e-05, + "loss": 1.7593, + "step": 20752 + }, + { + "epoch": 6.369858809085328, + "grad_norm": 0.19274397194385529, + "learning_rate": 3.076140488873691e-05, + "loss": 1.7288, + "step": 20753 + }, + { + "epoch": 6.370165745856354, + "grad_norm": 0.18847523629665375, + "learning_rate": 3.075681709894276e-05, + "loss": 1.7293, + "step": 20754 + }, + { + "epoch": 6.370472682627379, + "grad_norm": 0.21054589748382568, + "learning_rate": 3.075222949932101e-05, + "loss": 1.7688, + "step": 20755 + }, + { + "epoch": 6.370779619398404, + "grad_norm": 0.16934558749198914, + "learning_rate": 3.0747642089917005e-05, + "loss": 1.7092, + "step": 20756 + }, + { + "epoch": 6.371086556169429, + "grad_norm": 0.19154684245586395, + "learning_rate": 3.0743054870776075e-05, + "loss": 1.6827, + "step": 20757 + }, + { + "epoch": 6.371393492940454, + "grad_norm": 0.2622900605201721, + "learning_rate": 3.0738467841943594e-05, + "loss": 1.748, + "step": 20758 + }, + { + "epoch": 6.371700429711479, + "grad_norm": 0.1767888218164444, + "learning_rate": 3.073388100346484e-05, + "loss": 1.717, + "step": 20759 + }, + { + "epoch": 6.372007366482505, + "grad_norm": 0.21692602336406708, + "learning_rate": 3.072929435538518e-05, + "loss": 1.7543, + "step": 20760 + }, + { + "epoch": 6.37231430325353, + "grad_norm": 0.19853977859020233, + "learning_rate": 3.0724707897749926e-05, + "loss": 1.7599, + "step": 20761 + }, + { + "epoch": 6.3726212400245545, + "grad_norm": 0.1904703676700592, + "learning_rate": 3.0720121630604396e-05, + "loss": 1.7094, + "step": 20762 + }, + { + "epoch": 6.37292817679558, + "grad_norm": 0.1961483359336853, + "learning_rate": 3.071553555399395e-05, + "loss": 1.7363, + "step": 20763 + }, + { + "epoch": 6.373235113566605, + "grad_norm": 0.16419392824172974, + "learning_rate": 3.071094966796385e-05, + "loss": 1.7073, + "step": 20764 + }, + { + "epoch": 6.3735420503376305, + "grad_norm": 0.1784946471452713, + 
"learning_rate": 3.0706363972559476e-05, + "loss": 1.699, + "step": 20765 + }, + { + "epoch": 6.373848987108656, + "grad_norm": 0.19472888112068176, + "learning_rate": 3.070177846782611e-05, + "loss": 1.7541, + "step": 20766 + }, + { + "epoch": 6.37415592387968, + "grad_norm": 0.2355004847049713, + "learning_rate": 3.0697193153809076e-05, + "loss": 1.7389, + "step": 20767 + }, + { + "epoch": 6.374462860650706, + "grad_norm": 0.1956906020641327, + "learning_rate": 3.069260803055369e-05, + "loss": 1.7197, + "step": 20768 + }, + { + "epoch": 6.374769797421731, + "grad_norm": 0.21212655305862427, + "learning_rate": 3.068802309810529e-05, + "loss": 1.7291, + "step": 20769 + }, + { + "epoch": 6.375076734192756, + "grad_norm": 0.22920182347297668, + "learning_rate": 3.068343835650914e-05, + "loss": 1.7397, + "step": 20770 + }, + { + "epoch": 6.375383670963782, + "grad_norm": 0.2143404483795166, + "learning_rate": 3.0678853805810605e-05, + "loss": 1.76, + "step": 20771 + }, + { + "epoch": 6.375690607734807, + "grad_norm": 0.1848321557044983, + "learning_rate": 3.067426944605492e-05, + "loss": 1.7127, + "step": 20772 + }, + { + "epoch": 6.3759975445058314, + "grad_norm": 0.23339331150054932, + "learning_rate": 3.0669685277287465e-05, + "loss": 1.7828, + "step": 20773 + }, + { + "epoch": 6.376304481276857, + "grad_norm": 0.19590741395950317, + "learning_rate": 3.066510129955349e-05, + "loss": 1.7224, + "step": 20774 + }, + { + "epoch": 6.376611418047882, + "grad_norm": 0.19986604154109955, + "learning_rate": 3.066051751289833e-05, + "loss": 1.7412, + "step": 20775 + }, + { + "epoch": 6.3769183548189075, + "grad_norm": 0.18629087507724762, + "learning_rate": 3.0655933917367266e-05, + "loss": 1.695, + "step": 20776 + }, + { + "epoch": 6.377225291589933, + "grad_norm": 0.2248111218214035, + "learning_rate": 3.0651350513005605e-05, + "loss": 1.7685, + "step": 20777 + }, + { + "epoch": 6.377532228360957, + "grad_norm": 0.1803683638572693, + "learning_rate": 3.064676729985864e-05, 
+ "loss": 1.7206, + "step": 20778 + }, + { + "epoch": 6.377839165131983, + "grad_norm": 0.23836754262447357, + "learning_rate": 3.064218427797165e-05, + "loss": 1.7428, + "step": 20779 + }, + { + "epoch": 6.378146101903008, + "grad_norm": 0.22549279034137726, + "learning_rate": 3.063760144738996e-05, + "loss": 1.7314, + "step": 20780 + }, + { + "epoch": 6.378453038674033, + "grad_norm": 0.20714345574378967, + "learning_rate": 3.063301880815882e-05, + "loss": 1.7179, + "step": 20781 + }, + { + "epoch": 6.378759975445059, + "grad_norm": 0.17024052143096924, + "learning_rate": 3.0628436360323565e-05, + "loss": 1.6602, + "step": 20782 + }, + { + "epoch": 6.379066912216084, + "grad_norm": 0.20378601551055908, + "learning_rate": 3.062385410392943e-05, + "loss": 1.7708, + "step": 20783 + }, + { + "epoch": 6.379373848987108, + "grad_norm": 0.1885673850774765, + "learning_rate": 3.0619272039021734e-05, + "loss": 1.7034, + "step": 20784 + }, + { + "epoch": 6.379680785758134, + "grad_norm": 0.18746556341648102, + "learning_rate": 3.0614690165645746e-05, + "loss": 1.6946, + "step": 20785 + }, + { + "epoch": 6.379987722529159, + "grad_norm": 0.19569392502307892, + "learning_rate": 3.061010848384677e-05, + "loss": 1.7298, + "step": 20786 + }, + { + "epoch": 6.380294659300184, + "grad_norm": 0.21114139258861542, + "learning_rate": 3.0605526993670046e-05, + "loss": 1.795, + "step": 20787 + }, + { + "epoch": 6.38060159607121, + "grad_norm": 0.20940302312374115, + "learning_rate": 3.06009456951609e-05, + "loss": 1.6747, + "step": 20788 + }, + { + "epoch": 6.380908532842234, + "grad_norm": 0.21008993685245514, + "learning_rate": 3.059636458836455e-05, + "loss": 1.7219, + "step": 20789 + }, + { + "epoch": 6.3812154696132595, + "grad_norm": 0.17642457783222198, + "learning_rate": 3.0591783673326304e-05, + "loss": 1.6555, + "step": 20790 + }, + { + "epoch": 6.381522406384285, + "grad_norm": 0.2786177396774292, + "learning_rate": 3.058720295009143e-05, + "loss": 1.8463, + "step": 20791 + 
}, + { + "epoch": 6.38182934315531, + "grad_norm": 0.21209503710269928, + "learning_rate": 3.058262241870521e-05, + "loss": 1.6848, + "step": 20792 + }, + { + "epoch": 6.3821362799263355, + "grad_norm": 0.1880561262369156, + "learning_rate": 3.057804207921287e-05, + "loss": 1.7401, + "step": 20793 + }, + { + "epoch": 6.382443216697361, + "grad_norm": 0.22108516097068787, + "learning_rate": 3.0573461931659726e-05, + "loss": 1.7482, + "step": 20794 + }, + { + "epoch": 6.382750153468385, + "grad_norm": 0.2161533385515213, + "learning_rate": 3.0568881976091006e-05, + "loss": 1.7425, + "step": 20795 + }, + { + "epoch": 6.383057090239411, + "grad_norm": 0.22933612763881683, + "learning_rate": 3.0564302212551975e-05, + "loss": 1.7424, + "step": 20796 + }, + { + "epoch": 6.383364027010436, + "grad_norm": 0.19572989642620087, + "learning_rate": 3.0559722641087916e-05, + "loss": 1.6763, + "step": 20797 + }, + { + "epoch": 6.383670963781461, + "grad_norm": 0.2181084007024765, + "learning_rate": 3.0555143261744056e-05, + "loss": 1.7164, + "step": 20798 + }, + { + "epoch": 6.383977900552487, + "grad_norm": 0.1927991509437561, + "learning_rate": 3.055056407456569e-05, + "loss": 1.6833, + "step": 20799 + }, + { + "epoch": 6.384284837323511, + "grad_norm": 0.20569704473018646, + "learning_rate": 3.0545985079598025e-05, + "loss": 1.7716, + "step": 20800 + }, + { + "epoch": 6.384591774094536, + "grad_norm": 0.1856541931629181, + "learning_rate": 3.054140627688635e-05, + "loss": 1.6939, + "step": 20801 + }, + { + "epoch": 6.384898710865562, + "grad_norm": 0.2450970858335495, + "learning_rate": 3.05368276664759e-05, + "loss": 1.8197, + "step": 20802 + }, + { + "epoch": 6.385205647636587, + "grad_norm": 0.23325784504413605, + "learning_rate": 3.053224924841194e-05, + "loss": 1.7195, + "step": 20803 + }, + { + "epoch": 6.385512584407612, + "grad_norm": 0.19614358246326447, + "learning_rate": 3.052767102273968e-05, + "loss": 1.6966, + "step": 20804 + }, + { + "epoch": 6.385819521178637, 
+ "grad_norm": 0.20615628361701965, + "learning_rate": 3.0523092989504415e-05, + "loss": 1.7429, + "step": 20805 + }, + { + "epoch": 6.386126457949662, + "grad_norm": 0.18418943881988525, + "learning_rate": 3.0518515148751336e-05, + "loss": 1.7612, + "step": 20806 + }, + { + "epoch": 6.3864333947206875, + "grad_norm": 0.17176245152950287, + "learning_rate": 3.0513937500525725e-05, + "loss": 1.6918, + "step": 20807 + }, + { + "epoch": 6.386740331491713, + "grad_norm": 0.22239255905151367, + "learning_rate": 3.0509360044872787e-05, + "loss": 1.8072, + "step": 20808 + }, + { + "epoch": 6.387047268262738, + "grad_norm": 0.20312704145908356, + "learning_rate": 3.0504782781837798e-05, + "loss": 1.7348, + "step": 20809 + }, + { + "epoch": 6.387354205033763, + "grad_norm": 0.23198208212852478, + "learning_rate": 3.0500205711465958e-05, + "loss": 1.7516, + "step": 20810 + }, + { + "epoch": 6.387661141804788, + "grad_norm": 0.2244081050157547, + "learning_rate": 3.0495628833802526e-05, + "loss": 1.731, + "step": 20811 + }, + { + "epoch": 6.387968078575813, + "grad_norm": 0.18282169103622437, + "learning_rate": 3.0491052148892717e-05, + "loss": 1.6743, + "step": 20812 + }, + { + "epoch": 6.388275015346839, + "grad_norm": 0.19108405709266663, + "learning_rate": 3.0486475656781753e-05, + "loss": 1.7485, + "step": 20813 + }, + { + "epoch": 6.388581952117864, + "grad_norm": 0.20574834942817688, + "learning_rate": 3.0481899357514898e-05, + "loss": 1.6979, + "step": 20814 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 0.21263298392295837, + "learning_rate": 3.047732325113733e-05, + "loss": 1.687, + "step": 20815 + }, + { + "epoch": 6.389195825659914, + "grad_norm": 0.22646664083003998, + "learning_rate": 3.047274733769432e-05, + "loss": 1.7593, + "step": 20816 + }, + { + "epoch": 6.389502762430939, + "grad_norm": 0.1846906542778015, + "learning_rate": 3.046817161723104e-05, + "loss": 1.7271, + "step": 20817 + }, + { + "epoch": 6.389809699201964, + "grad_norm": 
0.1965247541666031, + "learning_rate": 3.0463596089792746e-05, + "loss": 1.7121, + "step": 20818 + }, + { + "epoch": 6.39011663597299, + "grad_norm": 0.255577951669693, + "learning_rate": 3.045902075542464e-05, + "loss": 1.7311, + "step": 20819 + }, + { + "epoch": 6.390423572744015, + "grad_norm": 0.1837676465511322, + "learning_rate": 3.0454445614171966e-05, + "loss": 1.7177, + "step": 20820 + }, + { + "epoch": 6.3907305095150395, + "grad_norm": 0.24845893681049347, + "learning_rate": 3.0449870666079895e-05, + "loss": 1.6902, + "step": 20821 + }, + { + "epoch": 6.391037446286065, + "grad_norm": 0.28572577238082886, + "learning_rate": 3.0445295911193678e-05, + "loss": 1.7942, + "step": 20822 + }, + { + "epoch": 6.39134438305709, + "grad_norm": 0.20460839569568634, + "learning_rate": 3.044072134955849e-05, + "loss": 1.6747, + "step": 20823 + }, + { + "epoch": 6.3916513198281155, + "grad_norm": 0.3547010123729706, + "learning_rate": 3.0436146981219565e-05, + "loss": 1.7359, + "step": 20824 + }, + { + "epoch": 6.391958256599141, + "grad_norm": 0.20490451157093048, + "learning_rate": 3.04315728062221e-05, + "loss": 1.6863, + "step": 20825 + }, + { + "epoch": 6.392265193370166, + "grad_norm": 0.25874415040016174, + "learning_rate": 3.0426998824611307e-05, + "loss": 1.6798, + "step": 20826 + }, + { + "epoch": 6.392572130141191, + "grad_norm": 0.27858632802963257, + "learning_rate": 3.0422425036432378e-05, + "loss": 1.6943, + "step": 20827 + }, + { + "epoch": 6.392879066912216, + "grad_norm": 0.20951922237873077, + "learning_rate": 3.041785144173054e-05, + "loss": 1.7025, + "step": 20828 + }, + { + "epoch": 6.393186003683241, + "grad_norm": 0.3158397674560547, + "learning_rate": 3.0413278040550952e-05, + "loss": 1.7193, + "step": 20829 + }, + { + "epoch": 6.393492940454267, + "grad_norm": 0.18556484580039978, + "learning_rate": 3.0408704832938824e-05, + "loss": 1.7017, + "step": 20830 + }, + { + "epoch": 6.393799877225292, + "grad_norm": 0.31651169061660767, + 
"learning_rate": 3.0404131818939376e-05, + "loss": 1.7716, + "step": 20831 + }, + { + "epoch": 6.394106813996316, + "grad_norm": 0.2850388288497925, + "learning_rate": 3.0399558998597765e-05, + "loss": 1.7144, + "step": 20832 + }, + { + "epoch": 6.394413750767342, + "grad_norm": 0.19256308674812317, + "learning_rate": 3.0394986371959223e-05, + "loss": 1.6603, + "step": 20833 + }, + { + "epoch": 6.394720687538367, + "grad_norm": 0.2654922604560852, + "learning_rate": 3.0390413939068896e-05, + "loss": 1.6825, + "step": 20834 + }, + { + "epoch": 6.395027624309392, + "grad_norm": 0.19514231383800507, + "learning_rate": 3.0385841699971997e-05, + "loss": 1.7226, + "step": 20835 + }, + { + "epoch": 6.395334561080418, + "grad_norm": 0.27765151858329773, + "learning_rate": 3.0381269654713702e-05, + "loss": 1.7599, + "step": 20836 + }, + { + "epoch": 6.395641497851442, + "grad_norm": 0.2056504338979721, + "learning_rate": 3.0376697803339215e-05, + "loss": 1.7237, + "step": 20837 + }, + { + "epoch": 6.3959484346224675, + "grad_norm": 0.22516649961471558, + "learning_rate": 3.0372126145893688e-05, + "loss": 1.7566, + "step": 20838 + }, + { + "epoch": 6.396255371393493, + "grad_norm": 0.17632099986076355, + "learning_rate": 3.0367554682422327e-05, + "loss": 1.7014, + "step": 20839 + }, + { + "epoch": 6.396562308164518, + "grad_norm": 0.21872831881046295, + "learning_rate": 3.036298341297028e-05, + "loss": 1.6935, + "step": 20840 + }, + { + "epoch": 6.3968692449355435, + "grad_norm": 0.22132672369480133, + "learning_rate": 3.0358412337582752e-05, + "loss": 1.6735, + "step": 20841 + }, + { + "epoch": 6.397176181706568, + "grad_norm": 0.17865684628486633, + "learning_rate": 3.0353841456304895e-05, + "loss": 1.7097, + "step": 20842 + }, + { + "epoch": 6.397483118477593, + "grad_norm": 0.2069701999425888, + "learning_rate": 3.0349270769181914e-05, + "loss": 1.7592, + "step": 20843 + }, + { + "epoch": 6.397790055248619, + "grad_norm": 0.19800925254821777, + "learning_rate": 
3.034470027625893e-05, + "loss": 1.6943, + "step": 20844 + }, + { + "epoch": 6.398096992019644, + "grad_norm": 0.24116787314414978, + "learning_rate": 3.0340129977581165e-05, + "loss": 1.7126, + "step": 20845 + }, + { + "epoch": 6.398403928790669, + "grad_norm": 0.1995212435722351, + "learning_rate": 3.033555987319375e-05, + "loss": 1.75, + "step": 20846 + }, + { + "epoch": 6.398710865561695, + "grad_norm": 0.23717111349105835, + "learning_rate": 3.0330989963141843e-05, + "loss": 1.7338, + "step": 20847 + }, + { + "epoch": 6.399017802332719, + "grad_norm": 0.18372474610805511, + "learning_rate": 3.0326420247470643e-05, + "loss": 1.7034, + "step": 20848 + }, + { + "epoch": 6.399324739103744, + "grad_norm": 0.25953924655914307, + "learning_rate": 3.0321850726225265e-05, + "loss": 1.731, + "step": 20849 + }, + { + "epoch": 6.39963167587477, + "grad_norm": 0.24846702814102173, + "learning_rate": 3.031728139945092e-05, + "loss": 1.7559, + "step": 20850 + }, + { + "epoch": 6.399938612645795, + "grad_norm": 0.20783887803554535, + "learning_rate": 3.0312712267192713e-05, + "loss": 1.7229, + "step": 20851 + }, + { + "epoch": 6.4002455494168204, + "grad_norm": 0.1904737949371338, + "learning_rate": 3.030814332949583e-05, + "loss": 1.6986, + "step": 20852 + }, + { + "epoch": 6.400552486187845, + "grad_norm": 0.2275397777557373, + "learning_rate": 3.030357458640541e-05, + "loss": 1.708, + "step": 20853 + }, + { + "epoch": 6.40085942295887, + "grad_norm": 0.20119737088680267, + "learning_rate": 3.0299006037966628e-05, + "loss": 1.7727, + "step": 20854 + }, + { + "epoch": 6.401166359729896, + "grad_norm": 0.17214249074459076, + "learning_rate": 3.0294437684224596e-05, + "loss": 1.6674, + "step": 20855 + }, + { + "epoch": 6.401473296500921, + "grad_norm": 0.21268978714942932, + "learning_rate": 3.02898695252245e-05, + "loss": 1.7182, + "step": 20856 + }, + { + "epoch": 6.401780233271946, + "grad_norm": 0.19911682605743408, + "learning_rate": 3.0285301561011448e-05, + "loss": 
1.6861, + "step": 20857 + }, + { + "epoch": 6.402087170042972, + "grad_norm": 0.194064199924469, + "learning_rate": 3.0280733791630613e-05, + "loss": 1.6768, + "step": 20858 + }, + { + "epoch": 6.402394106813996, + "grad_norm": 0.17554323375225067, + "learning_rate": 3.027616621712711e-05, + "loss": 1.6987, + "step": 20859 + }, + { + "epoch": 6.402701043585021, + "grad_norm": 0.205257385969162, + "learning_rate": 3.027159883754611e-05, + "loss": 1.7951, + "step": 20860 + }, + { + "epoch": 6.403007980356047, + "grad_norm": 0.1766849011182785, + "learning_rate": 3.0267031652932743e-05, + "loss": 1.7157, + "step": 20861 + }, + { + "epoch": 6.403314917127072, + "grad_norm": 0.17106789350509644, + "learning_rate": 3.0262464663332106e-05, + "loss": 1.685, + "step": 20862 + }, + { + "epoch": 6.403621853898097, + "grad_norm": 0.17380768060684204, + "learning_rate": 3.0257897868789377e-05, + "loss": 1.708, + "step": 20863 + }, + { + "epoch": 6.403928790669122, + "grad_norm": 0.15817396342754364, + "learning_rate": 3.0253331269349662e-05, + "loss": 1.6629, + "step": 20864 + }, + { + "epoch": 6.404235727440147, + "grad_norm": 0.18253934383392334, + "learning_rate": 3.0248764865058122e-05, + "loss": 1.6877, + "step": 20865 + }, + { + "epoch": 6.4045426642111725, + "grad_norm": 0.20645618438720703, + "learning_rate": 3.0244198655959843e-05, + "loss": 1.7238, + "step": 20866 + }, + { + "epoch": 6.404849600982198, + "grad_norm": 0.2216680645942688, + "learning_rate": 3.0239632642099992e-05, + "loss": 1.7721, + "step": 20867 + }, + { + "epoch": 6.405156537753223, + "grad_norm": 0.21479755640029907, + "learning_rate": 3.023506682352365e-05, + "loss": 1.6686, + "step": 20868 + }, + { + "epoch": 6.4054634745242485, + "grad_norm": 0.21274925768375397, + "learning_rate": 3.0230501200275974e-05, + "loss": 1.7245, + "step": 20869 + }, + { + "epoch": 6.405770411295273, + "grad_norm": 0.19894039630889893, + "learning_rate": 3.0225935772402064e-05, + "loss": 1.6734, + "step": 20870 + }, + { 
+ "epoch": 6.406077348066298, + "grad_norm": 0.24450170993804932, + "learning_rate": 3.022137053994707e-05, + "loss": 1.7103, + "step": 20871 + }, + { + "epoch": 6.406384284837324, + "grad_norm": 0.18289846181869507, + "learning_rate": 3.0216805502956057e-05, + "loss": 1.7866, + "step": 20872 + }, + { + "epoch": 6.406691221608349, + "grad_norm": 0.2884466350078583, + "learning_rate": 3.021224066147419e-05, + "loss": 1.7817, + "step": 20873 + }, + { + "epoch": 6.406998158379374, + "grad_norm": 0.21871373057365417, + "learning_rate": 3.0207676015546537e-05, + "loss": 1.6871, + "step": 20874 + }, + { + "epoch": 6.407305095150399, + "grad_norm": 0.239889994263649, + "learning_rate": 3.0203111565218244e-05, + "loss": 1.6412, + "step": 20875 + }, + { + "epoch": 6.407612031921424, + "grad_norm": 0.26960206031799316, + "learning_rate": 3.019854731053441e-05, + "loss": 1.7537, + "step": 20876 + }, + { + "epoch": 6.407918968692449, + "grad_norm": 0.32872483134269714, + "learning_rate": 3.019398325154013e-05, + "loss": 1.7718, + "step": 20877 + }, + { + "epoch": 6.408225905463475, + "grad_norm": 0.27766308188438416, + "learning_rate": 3.018941938828053e-05, + "loss": 1.7537, + "step": 20878 + }, + { + "epoch": 6.4085328422345, + "grad_norm": 0.1989286094903946, + "learning_rate": 3.0184855720800674e-05, + "loss": 1.7373, + "step": 20879 + }, + { + "epoch": 6.4088397790055245, + "grad_norm": 0.19748768210411072, + "learning_rate": 3.0180292249145703e-05, + "loss": 1.6821, + "step": 20880 + }, + { + "epoch": 6.40914671577655, + "grad_norm": 0.20632879436016083, + "learning_rate": 3.0175728973360694e-05, + "loss": 1.7641, + "step": 20881 + }, + { + "epoch": 6.409453652547575, + "grad_norm": 0.23808124661445618, + "learning_rate": 3.017116589349076e-05, + "loss": 1.7434, + "step": 20882 + }, + { + "epoch": 6.4097605893186005, + "grad_norm": 0.265514612197876, + "learning_rate": 3.0166603009580974e-05, + "loss": 1.7877, + "step": 20883 + }, + { + "epoch": 6.410067526089626, + 
"grad_norm": 0.21031250059604645, + "learning_rate": 3.0162040321676465e-05, + "loss": 1.738, + "step": 20884 + }, + { + "epoch": 6.41037446286065, + "grad_norm": 0.3011578619480133, + "learning_rate": 3.015747782982228e-05, + "loss": 1.7063, + "step": 20885 + }, + { + "epoch": 6.410681399631676, + "grad_norm": 0.28601503372192383, + "learning_rate": 3.015291553406353e-05, + "loss": 1.7021, + "step": 20886 + }, + { + "epoch": 6.410988336402701, + "grad_norm": 0.2433992624282837, + "learning_rate": 3.014835343444531e-05, + "loss": 1.6887, + "step": 20887 + }, + { + "epoch": 6.411295273173726, + "grad_norm": 0.3342660963535309, + "learning_rate": 3.014379153101269e-05, + "loss": 1.7798, + "step": 20888 + }, + { + "epoch": 6.411602209944752, + "grad_norm": 0.2390800267457962, + "learning_rate": 3.0139229823810757e-05, + "loss": 1.774, + "step": 20889 + }, + { + "epoch": 6.411909146715777, + "grad_norm": 0.2659217417240143, + "learning_rate": 3.0134668312884613e-05, + "loss": 1.7396, + "step": 20890 + }, + { + "epoch": 6.412216083486801, + "grad_norm": 0.22885620594024658, + "learning_rate": 3.0130106998279294e-05, + "loss": 1.7303, + "step": 20891 + }, + { + "epoch": 6.412523020257827, + "grad_norm": 0.20651856064796448, + "learning_rate": 3.0125545880039925e-05, + "loss": 1.7796, + "step": 20892 + }, + { + "epoch": 6.412829957028852, + "grad_norm": 0.26611828804016113, + "learning_rate": 3.0120984958211552e-05, + "loss": 1.7019, + "step": 20893 + }, + { + "epoch": 6.413136893799877, + "grad_norm": 0.2526776194572449, + "learning_rate": 3.0116424232839258e-05, + "loss": 1.7062, + "step": 20894 + }, + { + "epoch": 6.413443830570903, + "grad_norm": 0.2087634801864624, + "learning_rate": 3.0111863703968128e-05, + "loss": 1.7011, + "step": 20895 + }, + { + "epoch": 6.413750767341927, + "grad_norm": 0.20656780898571014, + "learning_rate": 3.0107303371643197e-05, + "loss": 1.7637, + "step": 20896 + }, + { + "epoch": 6.4140577041129525, + "grad_norm": 0.2083009034395218, + 
"learning_rate": 3.010274323590956e-05, + "loss": 1.7213, + "step": 20897 + }, + { + "epoch": 6.414364640883978, + "grad_norm": 0.22496090829372406, + "learning_rate": 3.0098183296812277e-05, + "loss": 1.7793, + "step": 20898 + }, + { + "epoch": 6.414671577655003, + "grad_norm": 0.2601132392883301, + "learning_rate": 3.0093623554396416e-05, + "loss": 1.8358, + "step": 20899 + }, + { + "epoch": 6.4149785144260285, + "grad_norm": 0.2364497184753418, + "learning_rate": 3.0089064008707026e-05, + "loss": 1.7299, + "step": 20900 + }, + { + "epoch": 6.415285451197054, + "grad_norm": 0.2011861503124237, + "learning_rate": 3.0084504659789186e-05, + "loss": 1.7521, + "step": 20901 + }, + { + "epoch": 6.415592387968078, + "grad_norm": 0.20605513453483582, + "learning_rate": 3.007994550768793e-05, + "loss": 1.7099, + "step": 20902 + }, + { + "epoch": 6.415899324739104, + "grad_norm": 0.20890796184539795, + "learning_rate": 3.0075386552448337e-05, + "loss": 1.7383, + "step": 20903 + }, + { + "epoch": 6.416206261510129, + "grad_norm": 0.20005083084106445, + "learning_rate": 3.0070827794115452e-05, + "loss": 1.6999, + "step": 20904 + }, + { + "epoch": 6.416513198281154, + "grad_norm": 0.20547670125961304, + "learning_rate": 3.006626923273433e-05, + "loss": 1.7424, + "step": 20905 + }, + { + "epoch": 6.41682013505218, + "grad_norm": 0.20799006521701813, + "learning_rate": 3.0061710868350003e-05, + "loss": 1.7266, + "step": 20906 + }, + { + "epoch": 6.417127071823204, + "grad_norm": 0.22234687209129333, + "learning_rate": 3.0057152701007563e-05, + "loss": 1.7755, + "step": 20907 + }, + { + "epoch": 6.417434008594229, + "grad_norm": 0.21947267651557922, + "learning_rate": 3.0052594730752005e-05, + "loss": 1.826, + "step": 20908 + }, + { + "epoch": 6.417740945365255, + "grad_norm": 0.2183268964290619, + "learning_rate": 3.0048036957628416e-05, + "loss": 1.7772, + "step": 20909 + }, + { + "epoch": 6.41804788213628, + "grad_norm": 0.1967134177684784, + "learning_rate": 
3.0043479381681805e-05, + "loss": 1.6833, + "step": 20910 + }, + { + "epoch": 6.418354818907305, + "grad_norm": 0.2016787827014923, + "learning_rate": 3.003892200295723e-05, + "loss": 1.773, + "step": 20911 + }, + { + "epoch": 6.41866175567833, + "grad_norm": 0.2192344218492508, + "learning_rate": 3.0034364821499745e-05, + "loss": 1.7124, + "step": 20912 + }, + { + "epoch": 6.418968692449355, + "grad_norm": 0.24924327433109283, + "learning_rate": 3.002980783735434e-05, + "loss": 1.6882, + "step": 20913 + }, + { + "epoch": 6.4192756292203805, + "grad_norm": 0.2221844494342804, + "learning_rate": 3.0025251050566106e-05, + "loss": 1.8028, + "step": 20914 + }, + { + "epoch": 6.419582565991406, + "grad_norm": 0.27141162753105164, + "learning_rate": 3.0020694461180033e-05, + "loss": 1.698, + "step": 20915 + }, + { + "epoch": 6.419889502762431, + "grad_norm": 0.18856655061244965, + "learning_rate": 3.001613806924117e-05, + "loss": 1.7112, + "step": 20916 + }, + { + "epoch": 6.420196439533456, + "grad_norm": 0.2226688265800476, + "learning_rate": 3.0011581874794537e-05, + "loss": 1.6967, + "step": 20917 + }, + { + "epoch": 6.420503376304481, + "grad_norm": 0.2070344239473343, + "learning_rate": 3.000702587788518e-05, + "loss": 1.742, + "step": 20918 + }, + { + "epoch": 6.420810313075506, + "grad_norm": 0.22616387903690338, + "learning_rate": 3.00024700785581e-05, + "loss": 1.6865, + "step": 20919 + }, + { + "epoch": 6.421117249846532, + "grad_norm": 0.19745604693889618, + "learning_rate": 2.9997914476858348e-05, + "loss": 1.7328, + "step": 20920 + }, + { + "epoch": 6.421424186617557, + "grad_norm": 0.20654593408107758, + "learning_rate": 2.9993359072830906e-05, + "loss": 1.7811, + "step": 20921 + }, + { + "epoch": 6.421731123388582, + "grad_norm": 0.19188611209392548, + "learning_rate": 2.9988803866520832e-05, + "loss": 1.6808, + "step": 20922 + }, + { + "epoch": 6.422038060159607, + "grad_norm": 0.19907493889331818, + "learning_rate": 2.9984248857973118e-05, + "loss": 
1.7326, + "step": 20923 + }, + { + "epoch": 6.422344996930632, + "grad_norm": 0.17484794557094574, + "learning_rate": 2.9979694047232804e-05, + "loss": 1.7166, + "step": 20924 + }, + { + "epoch": 6.422651933701657, + "grad_norm": 0.21412795782089233, + "learning_rate": 2.997513943434487e-05, + "loss": 1.7926, + "step": 20925 + }, + { + "epoch": 6.422958870472683, + "grad_norm": 0.17554008960723877, + "learning_rate": 2.9970585019354357e-05, + "loss": 1.6931, + "step": 20926 + }, + { + "epoch": 6.423265807243708, + "grad_norm": 0.16687868535518646, + "learning_rate": 2.9966030802306256e-05, + "loss": 1.6911, + "step": 20927 + }, + { + "epoch": 6.4235727440147325, + "grad_norm": 0.1802106350660324, + "learning_rate": 2.9961476783245578e-05, + "loss": 1.6921, + "step": 20928 + }, + { + "epoch": 6.423879680785758, + "grad_norm": 0.1968134343624115, + "learning_rate": 2.9956922962217347e-05, + "loss": 1.7035, + "step": 20929 + }, + { + "epoch": 6.424186617556783, + "grad_norm": 0.17703908681869507, + "learning_rate": 2.9952369339266538e-05, + "loss": 1.7122, + "step": 20930 + }, + { + "epoch": 6.4244935543278086, + "grad_norm": 0.22176744043827057, + "learning_rate": 2.9947815914438175e-05, + "loss": 1.7189, + "step": 20931 + }, + { + "epoch": 6.424800491098834, + "grad_norm": 0.19128306210041046, + "learning_rate": 2.9943262687777236e-05, + "loss": 1.7208, + "step": 20932 + }, + { + "epoch": 6.425107427869859, + "grad_norm": 0.2285725623369217, + "learning_rate": 2.9938709659328735e-05, + "loss": 1.7859, + "step": 20933 + }, + { + "epoch": 6.425414364640884, + "grad_norm": 0.1998651921749115, + "learning_rate": 2.9934156829137653e-05, + "loss": 1.6912, + "step": 20934 + }, + { + "epoch": 6.425721301411909, + "grad_norm": 0.1879023313522339, + "learning_rate": 2.9929604197249016e-05, + "loss": 1.7164, + "step": 20935 + }, + { + "epoch": 6.426028238182934, + "grad_norm": 0.2675700783729553, + "learning_rate": 2.992505176370778e-05, + "loss": 1.7475, + "step": 20936 + }, 
+ { + "epoch": 6.42633517495396, + "grad_norm": 0.22345949709415436, + "learning_rate": 2.992049952855896e-05, + "loss": 1.6867, + "step": 20937 + }, + { + "epoch": 6.426642111724985, + "grad_norm": 0.17801997065544128, + "learning_rate": 2.9915947491847517e-05, + "loss": 1.736, + "step": 20938 + }, + { + "epoch": 6.4269490484960095, + "grad_norm": 0.22132502496242523, + "learning_rate": 2.991139565361846e-05, + "loss": 1.7244, + "step": 20939 + }, + { + "epoch": 6.427255985267035, + "grad_norm": 0.1899508535861969, + "learning_rate": 2.9906844013916758e-05, + "loss": 1.6781, + "step": 20940 + }, + { + "epoch": 6.42756292203806, + "grad_norm": 0.21948131918907166, + "learning_rate": 2.9902292572787414e-05, + "loss": 1.6911, + "step": 20941 + }, + { + "epoch": 6.4278698588090855, + "grad_norm": 0.16277503967285156, + "learning_rate": 2.9897741330275387e-05, + "loss": 1.702, + "step": 20942 + }, + { + "epoch": 6.428176795580111, + "grad_norm": 0.22303056716918945, + "learning_rate": 2.989319028642567e-05, + "loss": 1.7573, + "step": 20943 + }, + { + "epoch": 6.428483732351136, + "grad_norm": 0.21077899634838104, + "learning_rate": 2.9888639441283217e-05, + "loss": 1.7903, + "step": 20944 + }, + { + "epoch": 6.428790669122161, + "grad_norm": 0.23918256163597107, + "learning_rate": 2.988408879489303e-05, + "loss": 1.7112, + "step": 20945 + }, + { + "epoch": 6.429097605893186, + "grad_norm": 0.22226610779762268, + "learning_rate": 2.9879538347300074e-05, + "loss": 1.7039, + "step": 20946 + }, + { + "epoch": 6.429404542664211, + "grad_norm": 0.18605270981788635, + "learning_rate": 2.987498809854929e-05, + "loss": 1.7102, + "step": 20947 + }, + { + "epoch": 6.429711479435237, + "grad_norm": 0.24812746047973633, + "learning_rate": 2.987043804868569e-05, + "loss": 1.7112, + "step": 20948 + }, + { + "epoch": 6.430018416206262, + "grad_norm": 0.1869048923254013, + "learning_rate": 2.9865888197754206e-05, + "loss": 1.6946, + "step": 20949 + }, + { + "epoch": 6.430325352977286, 
+ "grad_norm": 0.30707576870918274, + "learning_rate": 2.986133854579982e-05, + "loss": 1.7596, + "step": 20950 + }, + { + "epoch": 6.430632289748312, + "grad_norm": 0.20475640892982483, + "learning_rate": 2.985678909286748e-05, + "loss": 1.7162, + "step": 20951 + }, + { + "epoch": 6.430939226519337, + "grad_norm": 0.24273128807544708, + "learning_rate": 2.9852239839002182e-05, + "loss": 1.6803, + "step": 20952 + }, + { + "epoch": 6.431246163290362, + "grad_norm": 0.27484890818595886, + "learning_rate": 2.9847690784248834e-05, + "loss": 1.7948, + "step": 20953 + }, + { + "epoch": 6.431553100061388, + "grad_norm": 0.2204331010580063, + "learning_rate": 2.984314192865244e-05, + "loss": 1.769, + "step": 20954 + }, + { + "epoch": 6.431860036832412, + "grad_norm": 0.262463241815567, + "learning_rate": 2.9838593272257907e-05, + "loss": 1.7483, + "step": 20955 + }, + { + "epoch": 6.4321669736034375, + "grad_norm": 0.225942924618721, + "learning_rate": 2.983404481511023e-05, + "loss": 1.7228, + "step": 20956 + }, + { + "epoch": 6.432473910374463, + "grad_norm": 0.22381044924259186, + "learning_rate": 2.982949655725432e-05, + "loss": 1.7579, + "step": 20957 + }, + { + "epoch": 6.432780847145488, + "grad_norm": 0.1937711238861084, + "learning_rate": 2.982494849873518e-05, + "loss": 1.6833, + "step": 20958 + }, + { + "epoch": 6.4330877839165135, + "grad_norm": 0.2609664499759674, + "learning_rate": 2.9820400639597702e-05, + "loss": 1.7524, + "step": 20959 + }, + { + "epoch": 6.433394720687538, + "grad_norm": 0.2891463041305542, + "learning_rate": 2.981585297988686e-05, + "loss": 1.7672, + "step": 20960 + }, + { + "epoch": 6.433701657458563, + "grad_norm": 0.19604064524173737, + "learning_rate": 2.9811305519647582e-05, + "loss": 1.6684, + "step": 20961 + }, + { + "epoch": 6.434008594229589, + "grad_norm": 0.23522239923477173, + "learning_rate": 2.9806758258924822e-05, + "loss": 1.7461, + "step": 20962 + }, + { + "epoch": 6.434315531000614, + "grad_norm": 0.24907514452934265, + 
"learning_rate": 2.9802211197763525e-05, + "loss": 1.7702, + "step": 20963 + }, + { + "epoch": 6.434622467771639, + "grad_norm": 0.21963126957416534, + "learning_rate": 2.9797664336208592e-05, + "loss": 1.7263, + "step": 20964 + }, + { + "epoch": 6.434929404542665, + "grad_norm": 0.23124000430107117, + "learning_rate": 2.9793117674305004e-05, + "loss": 1.7362, + "step": 20965 + }, + { + "epoch": 6.435236341313689, + "grad_norm": 0.1917882263660431, + "learning_rate": 2.978857121209765e-05, + "loss": 1.7505, + "step": 20966 + }, + { + "epoch": 6.435543278084714, + "grad_norm": 0.24407804012298584, + "learning_rate": 2.9784024949631484e-05, + "loss": 1.7898, + "step": 20967 + }, + { + "epoch": 6.43585021485574, + "grad_norm": 0.210384339094162, + "learning_rate": 2.977947888695143e-05, + "loss": 1.7515, + "step": 20968 + }, + { + "epoch": 6.436157151626765, + "grad_norm": 0.20764803886413574, + "learning_rate": 2.9774933024102436e-05, + "loss": 1.7628, + "step": 20969 + }, + { + "epoch": 6.43646408839779, + "grad_norm": 0.21542097628116608, + "learning_rate": 2.9770387361129387e-05, + "loss": 1.7882, + "step": 20970 + }, + { + "epoch": 6.436771025168815, + "grad_norm": 0.1768570989370346, + "learning_rate": 2.976584189807725e-05, + "loss": 1.7471, + "step": 20971 + }, + { + "epoch": 6.43707796193984, + "grad_norm": 0.2398732751607895, + "learning_rate": 2.97612966349909e-05, + "loss": 1.6676, + "step": 20972 + }, + { + "epoch": 6.4373848987108655, + "grad_norm": 0.18291664123535156, + "learning_rate": 2.9756751571915286e-05, + "loss": 1.6791, + "step": 20973 + }, + { + "epoch": 6.437691835481891, + "grad_norm": 0.2769327759742737, + "learning_rate": 2.9752206708895314e-05, + "loss": 1.7675, + "step": 20974 + }, + { + "epoch": 6.437998772252916, + "grad_norm": 0.24859526753425598, + "learning_rate": 2.974766204597592e-05, + "loss": 1.7661, + "step": 20975 + }, + { + "epoch": 6.4383057090239415, + "grad_norm": 0.20495273172855377, + "learning_rate": 
2.9743117583201984e-05, + "loss": 1.6774, + "step": 20976 + }, + { + "epoch": 6.438612645794966, + "grad_norm": 0.24650859832763672, + "learning_rate": 2.9738573320618447e-05, + "loss": 1.759, + "step": 20977 + }, + { + "epoch": 6.438919582565991, + "grad_norm": 0.21430176496505737, + "learning_rate": 2.973402925827019e-05, + "loss": 1.7273, + "step": 20978 + }, + { + "epoch": 6.439226519337017, + "grad_norm": 0.22392596304416656, + "learning_rate": 2.972948539620214e-05, + "loss": 1.7506, + "step": 20979 + }, + { + "epoch": 6.439533456108042, + "grad_norm": 0.24393923580646515, + "learning_rate": 2.9724941734459205e-05, + "loss": 1.7815, + "step": 20980 + }, + { + "epoch": 6.439840392879067, + "grad_norm": 0.2873772084712982, + "learning_rate": 2.9720398273086264e-05, + "loss": 1.7863, + "step": 20981 + }, + { + "epoch": 6.440147329650092, + "grad_norm": 0.218470498919487, + "learning_rate": 2.9715855012128246e-05, + "loss": 1.7347, + "step": 20982 + }, + { + "epoch": 6.440454266421117, + "grad_norm": 0.24520666897296906, + "learning_rate": 2.971131195163003e-05, + "loss": 1.6892, + "step": 20983 + }, + { + "epoch": 6.440761203192142, + "grad_norm": 0.2255270928144455, + "learning_rate": 2.970676909163652e-05, + "loss": 1.7179, + "step": 20984 + }, + { + "epoch": 6.441068139963168, + "grad_norm": 0.25171026587486267, + "learning_rate": 2.9702226432192604e-05, + "loss": 1.7087, + "step": 20985 + }, + { + "epoch": 6.441375076734193, + "grad_norm": 0.27045872807502747, + "learning_rate": 2.9697683973343204e-05, + "loss": 1.732, + "step": 20986 + }, + { + "epoch": 6.4416820135052175, + "grad_norm": 0.25374144315719604, + "learning_rate": 2.9693141715133177e-05, + "loss": 1.7688, + "step": 20987 + }, + { + "epoch": 6.441988950276243, + "grad_norm": 0.22694779932498932, + "learning_rate": 2.9688599657607442e-05, + "loss": 1.7105, + "step": 20988 + }, + { + "epoch": 6.442295887047268, + "grad_norm": 0.23455791175365448, + "learning_rate": 2.9684057800810845e-05, + 
"loss": 1.8007, + "step": 20989 + }, + { + "epoch": 6.4426028238182935, + "grad_norm": 0.23054158687591553, + "learning_rate": 2.9679516144788312e-05, + "loss": 1.6787, + "step": 20990 + }, + { + "epoch": 6.442909760589319, + "grad_norm": 0.22110030055046082, + "learning_rate": 2.9674974689584696e-05, + "loss": 1.8048, + "step": 20991 + }, + { + "epoch": 6.443216697360343, + "grad_norm": 0.22141657769680023, + "learning_rate": 2.9670433435244915e-05, + "loss": 1.7691, + "step": 20992 + }, + { + "epoch": 6.443523634131369, + "grad_norm": 0.18511974811553955, + "learning_rate": 2.9665892381813807e-05, + "loss": 1.6825, + "step": 20993 + }, + { + "epoch": 6.443830570902394, + "grad_norm": 0.21904997527599335, + "learning_rate": 2.966135152933629e-05, + "loss": 1.7711, + "step": 20994 + }, + { + "epoch": 6.444137507673419, + "grad_norm": 0.19334301352500916, + "learning_rate": 2.9656810877857196e-05, + "loss": 1.687, + "step": 20995 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.1766969859600067, + "learning_rate": 2.9652270427421426e-05, + "loss": 1.7211, + "step": 20996 + }, + { + "epoch": 6.44475138121547, + "grad_norm": 0.1821468323469162, + "learning_rate": 2.9647730178073864e-05, + "loss": 1.7086, + "step": 20997 + }, + { + "epoch": 6.445058317986494, + "grad_norm": 0.20812760293483734, + "learning_rate": 2.9643190129859333e-05, + "loss": 1.6844, + "step": 20998 + }, + { + "epoch": 6.44536525475752, + "grad_norm": 0.259042352437973, + "learning_rate": 2.9638650282822754e-05, + "loss": 1.7971, + "step": 20999 + }, + { + "epoch": 6.445672191528545, + "grad_norm": 0.2134076952934265, + "learning_rate": 2.9634110637008948e-05, + "loss": 1.7061, + "step": 21000 + }, + { + "epoch": 6.44597912829957, + "grad_norm": 0.21120613813400269, + "learning_rate": 2.962957119246281e-05, + "loss": 1.6708, + "step": 21001 + }, + { + "epoch": 6.446286065070596, + "grad_norm": 0.18577797710895538, + "learning_rate": 2.9625031949229176e-05, + "loss": 1.719, + "step": 21002 + 
}, + { + "epoch": 6.44659300184162, + "grad_norm": 0.21755708754062653, + "learning_rate": 2.962049290735294e-05, + "loss": 1.7203, + "step": 21003 + }, + { + "epoch": 6.4468999386126455, + "grad_norm": 0.2161538451910019, + "learning_rate": 2.961595406687891e-05, + "loss": 1.7254, + "step": 21004 + }, + { + "epoch": 6.447206875383671, + "grad_norm": 0.19979329407215118, + "learning_rate": 2.9611415427851995e-05, + "loss": 1.7203, + "step": 21005 + }, + { + "epoch": 6.447513812154696, + "grad_norm": 0.2103399932384491, + "learning_rate": 2.9606876990317e-05, + "loss": 1.7291, + "step": 21006 + }, + { + "epoch": 6.4478207489257215, + "grad_norm": 0.19513745605945587, + "learning_rate": 2.9602338754318815e-05, + "loss": 1.7574, + "step": 21007 + }, + { + "epoch": 6.448127685696747, + "grad_norm": 0.19819851219654083, + "learning_rate": 2.9597800719902256e-05, + "loss": 1.6913, + "step": 21008 + }, + { + "epoch": 6.448434622467771, + "grad_norm": 0.1847768872976303, + "learning_rate": 2.9593262887112215e-05, + "loss": 1.6987, + "step": 21009 + }, + { + "epoch": 6.448741559238797, + "grad_norm": 0.22399301826953888, + "learning_rate": 2.9588725255993487e-05, + "loss": 1.8328, + "step": 21010 + }, + { + "epoch": 6.449048496009822, + "grad_norm": 0.20540264248847961, + "learning_rate": 2.958418782659097e-05, + "loss": 1.765, + "step": 21011 + }, + { + "epoch": 6.449355432780847, + "grad_norm": 0.183661550283432, + "learning_rate": 2.9579650598949442e-05, + "loss": 1.7128, + "step": 21012 + }, + { + "epoch": 6.449662369551873, + "grad_norm": 0.1972927302122116, + "learning_rate": 2.9575113573113788e-05, + "loss": 1.717, + "step": 21013 + }, + { + "epoch": 6.449969306322897, + "grad_norm": 0.20188379287719727, + "learning_rate": 2.9570576749128846e-05, + "loss": 1.7603, + "step": 21014 + }, + { + "epoch": 6.4502762430939224, + "grad_norm": 0.20789781212806702, + "learning_rate": 2.9566040127039418e-05, + "loss": 1.7142, + "step": 21015 + }, + { + "epoch": 
6.450583179864948, + "grad_norm": 0.19319608807563782, + "learning_rate": 2.956150370689038e-05, + "loss": 1.7524, + "step": 21016 + }, + { + "epoch": 6.450890116635973, + "grad_norm": 0.2153816968202591, + "learning_rate": 2.9556967488726516e-05, + "loss": 1.7325, + "step": 21017 + }, + { + "epoch": 6.4511970534069984, + "grad_norm": 0.19134823977947235, + "learning_rate": 2.9552431472592702e-05, + "loss": 1.7547, + "step": 21018 + }, + { + "epoch": 6.451503990178024, + "grad_norm": 0.21069955825805664, + "learning_rate": 2.9547895658533725e-05, + "loss": 1.7038, + "step": 21019 + }, + { + "epoch": 6.451810926949048, + "grad_norm": 0.20742546021938324, + "learning_rate": 2.9543360046594455e-05, + "loss": 1.7151, + "step": 21020 + }, + { + "epoch": 6.452117863720074, + "grad_norm": 0.16917672753334045, + "learning_rate": 2.9538824636819666e-05, + "loss": 1.6957, + "step": 21021 + }, + { + "epoch": 6.452424800491099, + "grad_norm": 0.21134577691555023, + "learning_rate": 2.953428942925423e-05, + "loss": 1.711, + "step": 21022 + }, + { + "epoch": 6.452731737262124, + "grad_norm": 0.19403810799121857, + "learning_rate": 2.9529754423942918e-05, + "loss": 1.734, + "step": 21023 + }, + { + "epoch": 6.45303867403315, + "grad_norm": 0.18534770607948303, + "learning_rate": 2.9525219620930582e-05, + "loss": 1.6857, + "step": 21024 + }, + { + "epoch": 6.453345610804174, + "grad_norm": 0.24268858134746552, + "learning_rate": 2.9520685020262016e-05, + "loss": 1.7316, + "step": 21025 + }, + { + "epoch": 6.453652547575199, + "grad_norm": 0.17590615153312683, + "learning_rate": 2.9516150621982063e-05, + "loss": 1.6608, + "step": 21026 + }, + { + "epoch": 6.453959484346225, + "grad_norm": 0.1949763298034668, + "learning_rate": 2.9511616426135504e-05, + "loss": 1.7955, + "step": 21027 + }, + { + "epoch": 6.45426642111725, + "grad_norm": 0.2424435019493103, + "learning_rate": 2.950708243276717e-05, + "loss": 1.7334, + "step": 21028 + }, + { + "epoch": 6.454573357888275, + 
"grad_norm": 0.22753369808197021, + "learning_rate": 2.950254864192184e-05, + "loss": 1.733, + "step": 21029 + }, + { + "epoch": 6.4548802946593, + "grad_norm": 0.1706271469593048, + "learning_rate": 2.949801505364435e-05, + "loss": 1.7424, + "step": 21030 + }, + { + "epoch": 6.455187231430325, + "grad_norm": 0.21614442765712738, + "learning_rate": 2.9493481667979506e-05, + "loss": 1.7813, + "step": 21031 + }, + { + "epoch": 6.4554941682013505, + "grad_norm": 0.1793162226676941, + "learning_rate": 2.9488948484972068e-05, + "loss": 1.7076, + "step": 21032 + }, + { + "epoch": 6.455801104972376, + "grad_norm": 0.19251759350299835, + "learning_rate": 2.9484415504666885e-05, + "loss": 1.7487, + "step": 21033 + }, + { + "epoch": 6.456108041743401, + "grad_norm": 0.1817556619644165, + "learning_rate": 2.947988272710871e-05, + "loss": 1.6958, + "step": 21034 + }, + { + "epoch": 6.456414978514426, + "grad_norm": 0.24368418753147125, + "learning_rate": 2.9475350152342378e-05, + "loss": 1.7867, + "step": 21035 + }, + { + "epoch": 6.456721915285451, + "grad_norm": 0.2362157702445984, + "learning_rate": 2.9470817780412653e-05, + "loss": 1.7241, + "step": 21036 + }, + { + "epoch": 6.457028852056476, + "grad_norm": 0.21049003303050995, + "learning_rate": 2.9466285611364358e-05, + "loss": 1.7146, + "step": 21037 + }, + { + "epoch": 6.457335788827502, + "grad_norm": 0.2516530454158783, + "learning_rate": 2.9461753645242246e-05, + "loss": 1.7349, + "step": 21038 + }, + { + "epoch": 6.457642725598527, + "grad_norm": 0.23165179789066315, + "learning_rate": 2.945722188209114e-05, + "loss": 1.7285, + "step": 21039 + }, + { + "epoch": 6.457949662369552, + "grad_norm": 0.27345010638237, + "learning_rate": 2.945269032195579e-05, + "loss": 1.7266, + "step": 21040 + }, + { + "epoch": 6.458256599140577, + "grad_norm": 0.16312900185585022, + "learning_rate": 2.9448158964881e-05, + "loss": 1.6781, + "step": 21041 + }, + { + "epoch": 6.458563535911602, + "grad_norm": 0.238658607006073, + 
"learning_rate": 2.9443627810911557e-05, + "loss": 1.6819, + "step": 21042 + }, + { + "epoch": 6.458870472682627, + "grad_norm": 0.19861388206481934, + "learning_rate": 2.943909686009223e-05, + "loss": 1.7397, + "step": 21043 + }, + { + "epoch": 6.459177409453653, + "grad_norm": 0.22675637900829315, + "learning_rate": 2.9434566112467793e-05, + "loss": 1.7231, + "step": 21044 + }, + { + "epoch": 6.459484346224678, + "grad_norm": 0.22638066112995148, + "learning_rate": 2.9430035568083043e-05, + "loss": 1.7466, + "step": 21045 + }, + { + "epoch": 6.4597912829957025, + "grad_norm": 0.2237064391374588, + "learning_rate": 2.942550522698272e-05, + "loss": 1.7373, + "step": 21046 + }, + { + "epoch": 6.460098219766728, + "grad_norm": 0.2613731324672699, + "learning_rate": 2.942097508921162e-05, + "loss": 1.7567, + "step": 21047 + }, + { + "epoch": 6.460405156537753, + "grad_norm": 0.21602070331573486, + "learning_rate": 2.941644515481452e-05, + "loss": 1.7512, + "step": 21048 + }, + { + "epoch": 6.4607120933087785, + "grad_norm": 0.30129116773605347, + "learning_rate": 2.941191542383615e-05, + "loss": 1.761, + "step": 21049 + }, + { + "epoch": 6.461019030079804, + "grad_norm": 0.2303919792175293, + "learning_rate": 2.940738589632132e-05, + "loss": 1.742, + "step": 21050 + }, + { + "epoch": 6.461325966850829, + "grad_norm": 0.2195158153772354, + "learning_rate": 2.940285657231475e-05, + "loss": 1.7169, + "step": 21051 + }, + { + "epoch": 6.461632903621854, + "grad_norm": 0.19029918313026428, + "learning_rate": 2.9398327451861242e-05, + "loss": 1.6721, + "step": 21052 + }, + { + "epoch": 6.461939840392879, + "grad_norm": 0.2006317377090454, + "learning_rate": 2.939379853500553e-05, + "loss": 1.7393, + "step": 21053 + }, + { + "epoch": 6.462246777163904, + "grad_norm": 0.222677081823349, + "learning_rate": 2.9389269821792377e-05, + "loss": 1.7858, + "step": 21054 + }, + { + "epoch": 6.46255371393493, + "grad_norm": 0.20772451162338257, + "learning_rate": 2.938474131226654e-05, 
+ "loss": 1.735, + "step": 21055 + }, + { + "epoch": 6.462860650705955, + "grad_norm": 0.21006503701210022, + "learning_rate": 2.9380213006472778e-05, + "loss": 1.7197, + "step": 21056 + }, + { + "epoch": 6.463167587476979, + "grad_norm": 0.23545250296592712, + "learning_rate": 2.9375684904455825e-05, + "loss": 1.8278, + "step": 21057 + }, + { + "epoch": 6.463474524248005, + "grad_norm": 0.24590329825878143, + "learning_rate": 2.937115700626045e-05, + "loss": 1.6411, + "step": 21058 + }, + { + "epoch": 6.46378146101903, + "grad_norm": 0.22359445691108704, + "learning_rate": 2.9366629311931393e-05, + "loss": 1.7901, + "step": 21059 + }, + { + "epoch": 6.464088397790055, + "grad_norm": 0.22807523608207703, + "learning_rate": 2.93621018215134e-05, + "loss": 1.7472, + "step": 21060 + }, + { + "epoch": 6.464395334561081, + "grad_norm": 0.24183115363121033, + "learning_rate": 2.93575745350512e-05, + "loss": 1.7553, + "step": 21061 + }, + { + "epoch": 6.464702271332105, + "grad_norm": 0.23809055984020233, + "learning_rate": 2.935304745258958e-05, + "loss": 1.7451, + "step": 21062 + }, + { + "epoch": 6.4650092081031305, + "grad_norm": 0.28455644845962524, + "learning_rate": 2.934852057417321e-05, + "loss": 1.8112, + "step": 21063 + }, + { + "epoch": 6.465316144874156, + "grad_norm": 0.22193321585655212, + "learning_rate": 2.9343993899846888e-05, + "loss": 1.747, + "step": 21064 + }, + { + "epoch": 6.465623081645181, + "grad_norm": 0.30524322390556335, + "learning_rate": 2.933946742965532e-05, + "loss": 1.7117, + "step": 21065 + }, + { + "epoch": 6.4659300184162065, + "grad_norm": 0.19748717546463013, + "learning_rate": 2.9334941163643233e-05, + "loss": 1.6899, + "step": 21066 + }, + { + "epoch": 6.466236955187231, + "grad_norm": 0.25551193952560425, + "learning_rate": 2.933041510185539e-05, + "loss": 1.7264, + "step": 21067 + }, + { + "epoch": 6.466543891958256, + "grad_norm": 0.20016206800937653, + "learning_rate": 2.932588924433648e-05, + "loss": 1.6613, + "step": 21068 
+ }, + { + "epoch": 6.466850828729282, + "grad_norm": 0.31049394607543945, + "learning_rate": 2.932136359113127e-05, + "loss": 1.6575, + "step": 21069 + }, + { + "epoch": 6.467157765500307, + "grad_norm": 0.29408347606658936, + "learning_rate": 2.9316838142284436e-05, + "loss": 1.72, + "step": 21070 + }, + { + "epoch": 6.467464702271332, + "grad_norm": 0.18981193006038666, + "learning_rate": 2.9312312897840748e-05, + "loss": 1.6799, + "step": 21071 + }, + { + "epoch": 6.467771639042358, + "grad_norm": 0.26828575134277344, + "learning_rate": 2.9307787857844905e-05, + "loss": 1.6983, + "step": 21072 + }, + { + "epoch": 6.468078575813382, + "grad_norm": 0.2605530321598053, + "learning_rate": 2.9303263022341642e-05, + "loss": 1.7973, + "step": 21073 + }, + { + "epoch": 6.468385512584407, + "grad_norm": 0.389957070350647, + "learning_rate": 2.9298738391375648e-05, + "loss": 1.7288, + "step": 21074 + }, + { + "epoch": 6.468692449355433, + "grad_norm": 0.20525416731834412, + "learning_rate": 2.9294213964991667e-05, + "loss": 1.7526, + "step": 21075 + }, + { + "epoch": 6.468999386126458, + "grad_norm": 0.3628186285495758, + "learning_rate": 2.9289689743234387e-05, + "loss": 1.7055, + "step": 21076 + }, + { + "epoch": 6.469306322897483, + "grad_norm": 0.21661829948425293, + "learning_rate": 2.9285165726148545e-05, + "loss": 1.7806, + "step": 21077 + }, + { + "epoch": 6.469613259668508, + "grad_norm": 0.3815501034259796, + "learning_rate": 2.9280641913778816e-05, + "loss": 1.7257, + "step": 21078 + }, + { + "epoch": 6.469920196439533, + "grad_norm": 0.19470983743667603, + "learning_rate": 2.9276118306169957e-05, + "loss": 1.7055, + "step": 21079 + }, + { + "epoch": 6.4702271332105585, + "grad_norm": 0.36236056685447693, + "learning_rate": 2.927159490336662e-05, + "loss": 1.6748, + "step": 21080 + }, + { + "epoch": 6.470534069981584, + "grad_norm": 0.201282799243927, + "learning_rate": 2.9267071705413552e-05, + "loss": 1.6987, + "step": 21081 + }, + { + "epoch": 
6.470841006752609, + "grad_norm": 0.3806697130203247, + "learning_rate": 2.9262548712355425e-05, + "loss": 1.7386, + "step": 21082 + }, + { + "epoch": 6.4711479435236345, + "grad_norm": 0.3023025691509247, + "learning_rate": 2.9258025924236933e-05, + "loss": 1.7183, + "step": 21083 + }, + { + "epoch": 6.471454880294659, + "grad_norm": 0.2648932635784149, + "learning_rate": 2.9253503341102806e-05, + "loss": 1.6755, + "step": 21084 + }, + { + "epoch": 6.471761817065684, + "grad_norm": 0.2647169828414917, + "learning_rate": 2.9248980962997707e-05, + "loss": 1.7326, + "step": 21085 + }, + { + "epoch": 6.47206875383671, + "grad_norm": 0.23535950481891632, + "learning_rate": 2.9244458789966355e-05, + "loss": 1.7541, + "step": 21086 + }, + { + "epoch": 6.472375690607735, + "grad_norm": 0.2551584541797638, + "learning_rate": 2.9239936822053403e-05, + "loss": 1.6907, + "step": 21087 + }, + { + "epoch": 6.47268262737876, + "grad_norm": 0.23313823342323303, + "learning_rate": 2.923541505930357e-05, + "loss": 1.705, + "step": 21088 + }, + { + "epoch": 6.472989564149785, + "grad_norm": 0.2368597686290741, + "learning_rate": 2.9230893501761534e-05, + "loss": 1.6666, + "step": 21089 + }, + { + "epoch": 6.47329650092081, + "grad_norm": 0.17861969769001007, + "learning_rate": 2.9226372149472003e-05, + "loss": 1.6927, + "step": 21090 + }, + { + "epoch": 6.473603437691835, + "grad_norm": 0.2212727665901184, + "learning_rate": 2.9221851002479616e-05, + "loss": 1.6972, + "step": 21091 + }, + { + "epoch": 6.473910374462861, + "grad_norm": 0.19382402300834656, + "learning_rate": 2.9217330060829096e-05, + "loss": 1.7602, + "step": 21092 + }, + { + "epoch": 6.474217311233886, + "grad_norm": 0.2762092053890228, + "learning_rate": 2.9212809324565076e-05, + "loss": 1.7642, + "step": 21093 + }, + { + "epoch": 6.474524248004911, + "grad_norm": 0.22068747878074646, + "learning_rate": 2.9208288793732274e-05, + "loss": 1.7477, + "step": 21094 + }, + { + "epoch": 6.474831184775936, + "grad_norm": 
0.19979839026927948, + "learning_rate": 2.9203768468375337e-05, + "loss": 1.7266, + "step": 21095 + }, + { + "epoch": 6.475138121546961, + "grad_norm": 0.23038682341575623, + "learning_rate": 2.9199248348538965e-05, + "loss": 1.7428, + "step": 21096 + }, + { + "epoch": 6.475445058317987, + "grad_norm": 0.16841283440589905, + "learning_rate": 2.91947284342678e-05, + "loss": 1.6788, + "step": 21097 + }, + { + "epoch": 6.475751995089012, + "grad_norm": 0.22812627255916595, + "learning_rate": 2.9190208725606528e-05, + "loss": 1.7513, + "step": 21098 + }, + { + "epoch": 6.476058931860037, + "grad_norm": 0.18409393727779388, + "learning_rate": 2.9185689222599832e-05, + "loss": 1.6834, + "step": 21099 + }, + { + "epoch": 6.476365868631062, + "grad_norm": 0.26226910948753357, + "learning_rate": 2.9181169925292313e-05, + "loss": 1.7375, + "step": 21100 + }, + { + "epoch": 6.476672805402087, + "grad_norm": 0.1915685385465622, + "learning_rate": 2.9176650833728697e-05, + "loss": 1.7521, + "step": 21101 + }, + { + "epoch": 6.476979742173112, + "grad_norm": 0.22342176735401154, + "learning_rate": 2.917213194795362e-05, + "loss": 1.8018, + "step": 21102 + }, + { + "epoch": 6.477286678944138, + "grad_norm": 0.18338742852210999, + "learning_rate": 2.9167613268011745e-05, + "loss": 1.6817, + "step": 21103 + }, + { + "epoch": 6.477593615715163, + "grad_norm": 0.23008635640144348, + "learning_rate": 2.9163094793947728e-05, + "loss": 1.7037, + "step": 21104 + }, + { + "epoch": 6.4779005524861875, + "grad_norm": 0.20954197645187378, + "learning_rate": 2.9158576525806215e-05, + "loss": 1.7565, + "step": 21105 + }, + { + "epoch": 6.478207489257213, + "grad_norm": 0.21065562963485718, + "learning_rate": 2.9154058463631874e-05, + "loss": 1.6899, + "step": 21106 + }, + { + "epoch": 6.478514426028238, + "grad_norm": 0.20217828452587128, + "learning_rate": 2.9149540607469335e-05, + "loss": 1.7055, + "step": 21107 + }, + { + "epoch": 6.4788213627992635, + "grad_norm": 0.19058823585510254, + 
"learning_rate": 2.9145022957363244e-05, + "loss": 1.6794, + "step": 21108 + }, + { + "epoch": 6.479128299570289, + "grad_norm": 0.2308664619922638, + "learning_rate": 2.9140505513358297e-05, + "loss": 1.7322, + "step": 21109 + }, + { + "epoch": 6.479435236341313, + "grad_norm": 0.18911845982074738, + "learning_rate": 2.9135988275499056e-05, + "loss": 1.7255, + "step": 21110 + }, + { + "epoch": 6.479742173112339, + "grad_norm": 0.21459296345710754, + "learning_rate": 2.9131471243830256e-05, + "loss": 1.6599, + "step": 21111 + }, + { + "epoch": 6.480049109883364, + "grad_norm": 0.20521530508995056, + "learning_rate": 2.912695441839644e-05, + "loss": 1.7564, + "step": 21112 + }, + { + "epoch": 6.480356046654389, + "grad_norm": 0.21924994885921478, + "learning_rate": 2.912243779924232e-05, + "loss": 1.6922, + "step": 21113 + }, + { + "epoch": 6.480662983425415, + "grad_norm": 0.18219491839408875, + "learning_rate": 2.911792138641253e-05, + "loss": 1.6907, + "step": 21114 + }, + { + "epoch": 6.48096992019644, + "grad_norm": 0.23122453689575195, + "learning_rate": 2.9113405179951626e-05, + "loss": 1.7665, + "step": 21115 + }, + { + "epoch": 6.481276856967464, + "grad_norm": 0.18411210179328918, + "learning_rate": 2.9108889179904348e-05, + "loss": 1.7216, + "step": 21116 + }, + { + "epoch": 6.48158379373849, + "grad_norm": 0.2251562923192978, + "learning_rate": 2.9104373386315225e-05, + "loss": 1.7605, + "step": 21117 + }, + { + "epoch": 6.481890730509515, + "grad_norm": 0.2252185344696045, + "learning_rate": 2.9099857799228957e-05, + "loss": 1.7345, + "step": 21118 + }, + { + "epoch": 6.48219766728054, + "grad_norm": 0.20799386501312256, + "learning_rate": 2.909534241869014e-05, + "loss": 1.7497, + "step": 21119 + }, + { + "epoch": 6.482504604051566, + "grad_norm": 0.2059052586555481, + "learning_rate": 2.90908272447434e-05, + "loss": 1.7444, + "step": 21120 + }, + { + "epoch": 6.48281154082259, + "grad_norm": 0.17851221561431885, + "learning_rate": 
2.9086312277433362e-05, + "loss": 1.7208, + "step": 21121 + }, + { + "epoch": 6.4831184775936155, + "grad_norm": 0.20561498403549194, + "learning_rate": 2.908179751680465e-05, + "loss": 1.731, + "step": 21122 + }, + { + "epoch": 6.483425414364641, + "grad_norm": 0.2386128008365631, + "learning_rate": 2.9077282962901868e-05, + "loss": 1.7493, + "step": 21123 + }, + { + "epoch": 6.483732351135666, + "grad_norm": 0.21024827659130096, + "learning_rate": 2.9072768615769642e-05, + "loss": 1.7353, + "step": 21124 + }, + { + "epoch": 6.4840392879066915, + "grad_norm": 0.23443256318569183, + "learning_rate": 2.9068254475452582e-05, + "loss": 1.7419, + "step": 21125 + }, + { + "epoch": 6.484346224677717, + "grad_norm": 0.1849295198917389, + "learning_rate": 2.90637405419953e-05, + "loss": 1.7239, + "step": 21126 + }, + { + "epoch": 6.484653161448741, + "grad_norm": 0.1967659890651703, + "learning_rate": 2.9059226815442385e-05, + "loss": 1.7163, + "step": 21127 + }, + { + "epoch": 6.484960098219767, + "grad_norm": 0.20395416021347046, + "learning_rate": 2.9054713295838505e-05, + "loss": 1.7108, + "step": 21128 + }, + { + "epoch": 6.485267034990792, + "grad_norm": 0.24162746965885162, + "learning_rate": 2.9050199983228184e-05, + "loss": 1.7666, + "step": 21129 + }, + { + "epoch": 6.485573971761817, + "grad_norm": 0.18104900419712067, + "learning_rate": 2.9045686877656086e-05, + "loss": 1.6863, + "step": 21130 + }, + { + "epoch": 6.485880908532843, + "grad_norm": 0.18469318747520447, + "learning_rate": 2.9041173979166813e-05, + "loss": 1.7344, + "step": 21131 + }, + { + "epoch": 6.486187845303867, + "grad_norm": 0.18488821387290955, + "learning_rate": 2.90366612878049e-05, + "loss": 1.694, + "step": 21132 + }, + { + "epoch": 6.486494782074892, + "grad_norm": 0.2030600905418396, + "learning_rate": 2.903214880361503e-05, + "loss": 1.7079, + "step": 21133 + }, + { + "epoch": 6.486801718845918, + "grad_norm": 0.2222873419523239, + "learning_rate": 2.902763652664171e-05, + "loss": 
1.7193, + "step": 21134 + }, + { + "epoch": 6.487108655616943, + "grad_norm": 0.1936846524477005, + "learning_rate": 2.9023124456929608e-05, + "loss": 1.7152, + "step": 21135 + }, + { + "epoch": 6.487415592387968, + "grad_norm": 0.25259360671043396, + "learning_rate": 2.9018612594523274e-05, + "loss": 1.776, + "step": 21136 + }, + { + "epoch": 6.487722529158993, + "grad_norm": 0.22994543612003326, + "learning_rate": 2.9014100939467316e-05, + "loss": 1.7437, + "step": 21137 + }, + { + "epoch": 6.488029465930018, + "grad_norm": 0.2646990716457367, + "learning_rate": 2.900958949180631e-05, + "loss": 1.7535, + "step": 21138 + }, + { + "epoch": 6.4883364027010435, + "grad_norm": 0.22973869740962982, + "learning_rate": 2.9005078251584843e-05, + "loss": 1.6772, + "step": 21139 + }, + { + "epoch": 6.488643339472069, + "grad_norm": 0.21261750161647797, + "learning_rate": 2.9000567218847497e-05, + "loss": 1.6899, + "step": 21140 + }, + { + "epoch": 6.488950276243094, + "grad_norm": 0.24828271567821503, + "learning_rate": 2.8996056393638858e-05, + "loss": 1.7994, + "step": 21141 + }, + { + "epoch": 6.4892572130141195, + "grad_norm": 0.18308857083320618, + "learning_rate": 2.8991545776003497e-05, + "loss": 1.7847, + "step": 21142 + }, + { + "epoch": 6.489564149785144, + "grad_norm": 0.22744092345237732, + "learning_rate": 2.8987035365985994e-05, + "loss": 1.7789, + "step": 21143 + }, + { + "epoch": 6.489871086556169, + "grad_norm": 0.18573936820030212, + "learning_rate": 2.8982525163630903e-05, + "loss": 1.6649, + "step": 21144 + }, + { + "epoch": 6.490178023327195, + "grad_norm": 0.26056674122810364, + "learning_rate": 2.8978015168982863e-05, + "loss": 1.68, + "step": 21145 + }, + { + "epoch": 6.49048496009822, + "grad_norm": 0.1912553906440735, + "learning_rate": 2.897350538208635e-05, + "loss": 1.7011, + "step": 21146 + }, + { + "epoch": 6.490791896869245, + "grad_norm": 0.25937187671661377, + "learning_rate": 2.896899580298603e-05, + "loss": 1.7409, + "step": 21147 + }, + 
{ + "epoch": 6.49109883364027, + "grad_norm": 0.22148750722408295, + "learning_rate": 2.8964486431726397e-05, + "loss": 1.6921, + "step": 21148 + }, + { + "epoch": 6.491405770411295, + "grad_norm": 0.23678559064865112, + "learning_rate": 2.8959977268352012e-05, + "loss": 1.6833, + "step": 21149 + }, + { + "epoch": 6.49171270718232, + "grad_norm": 0.2942093312740326, + "learning_rate": 2.8955468312907506e-05, + "loss": 1.7119, + "step": 21150 + }, + { + "epoch": 6.492019643953346, + "grad_norm": 0.18726128339767456, + "learning_rate": 2.8950959565437365e-05, + "loss": 1.7067, + "step": 21151 + }, + { + "epoch": 6.492326580724371, + "grad_norm": 0.23851951956748962, + "learning_rate": 2.894645102598621e-05, + "loss": 1.73, + "step": 21152 + }, + { + "epoch": 6.4926335174953955, + "grad_norm": 0.18054445087909698, + "learning_rate": 2.8941942694598533e-05, + "loss": 1.7243, + "step": 21153 + }, + { + "epoch": 6.492940454266421, + "grad_norm": 0.21889349818229675, + "learning_rate": 2.8937434571318934e-05, + "loss": 1.7789, + "step": 21154 + }, + { + "epoch": 6.493247391037446, + "grad_norm": 0.18788981437683105, + "learning_rate": 2.893292665619195e-05, + "loss": 1.7496, + "step": 21155 + }, + { + "epoch": 6.4935543278084715, + "grad_norm": 0.1964103877544403, + "learning_rate": 2.8928418949262138e-05, + "loss": 1.6732, + "step": 21156 + }, + { + "epoch": 6.493861264579497, + "grad_norm": 0.21939502656459808, + "learning_rate": 2.8923911450574043e-05, + "loss": 1.7149, + "step": 21157 + }, + { + "epoch": 6.494168201350522, + "grad_norm": 0.16927817463874817, + "learning_rate": 2.8919404160172203e-05, + "loss": 1.7093, + "step": 21158 + }, + { + "epoch": 6.494475138121547, + "grad_norm": 0.19907668232917786, + "learning_rate": 2.8914897078101166e-05, + "loss": 1.718, + "step": 21159 + }, + { + "epoch": 6.494782074892572, + "grad_norm": 0.18071576952934265, + "learning_rate": 2.891039020440548e-05, + "loss": 1.7241, + "step": 21160 + }, + { + "epoch": 6.495089011663597, 
+ "grad_norm": 0.17780692875385284, + "learning_rate": 2.890588353912965e-05, + "loss": 1.7013, + "step": 21161 + }, + { + "epoch": 6.495395948434623, + "grad_norm": 0.20762500166893005, + "learning_rate": 2.8901377082318292e-05, + "loss": 1.8149, + "step": 21162 + }, + { + "epoch": 6.495702885205648, + "grad_norm": 0.21616768836975098, + "learning_rate": 2.889687083401585e-05, + "loss": 1.7467, + "step": 21163 + }, + { + "epoch": 6.496009821976672, + "grad_norm": 0.20075570046901703, + "learning_rate": 2.8892364794266935e-05, + "loss": 1.6643, + "step": 21164 + }, + { + "epoch": 6.496316758747698, + "grad_norm": 0.18893925845623016, + "learning_rate": 2.8887858963116028e-05, + "loss": 1.7362, + "step": 21165 + }, + { + "epoch": 6.496623695518723, + "grad_norm": 0.20031611621379852, + "learning_rate": 2.888335334060765e-05, + "loss": 1.6902, + "step": 21166 + }, + { + "epoch": 6.496930632289748, + "grad_norm": 0.2959407866001129, + "learning_rate": 2.887884792678639e-05, + "loss": 1.7874, + "step": 21167 + }, + { + "epoch": 6.497237569060774, + "grad_norm": 0.17434875667095184, + "learning_rate": 2.8874342721696697e-05, + "loss": 1.7353, + "step": 21168 + }, + { + "epoch": 6.497544505831799, + "grad_norm": 0.19451481103897095, + "learning_rate": 2.8869837725383163e-05, + "loss": 1.6942, + "step": 21169 + }, + { + "epoch": 6.4978514426028235, + "grad_norm": 0.17984920740127563, + "learning_rate": 2.886533293789025e-05, + "loss": 1.7461, + "step": 21170 + }, + { + "epoch": 6.498158379373849, + "grad_norm": 0.18166208267211914, + "learning_rate": 2.8860828359262516e-05, + "loss": 1.7202, + "step": 21171 + }, + { + "epoch": 6.498465316144874, + "grad_norm": 0.1849331557750702, + "learning_rate": 2.8856323989544472e-05, + "loss": 1.6862, + "step": 21172 + }, + { + "epoch": 6.4987722529158995, + "grad_norm": 0.17846204340457916, + "learning_rate": 2.8851819828780623e-05, + "loss": 1.7446, + "step": 21173 + }, + { + "epoch": 6.499079189686925, + "grad_norm": 
0.1963818222284317, + "learning_rate": 2.8847315877015486e-05, + "loss": 1.7366, + "step": 21174 + }, + { + "epoch": 6.499386126457949, + "grad_norm": 0.1917402446269989, + "learning_rate": 2.8842812134293574e-05, + "loss": 1.7362, + "step": 21175 + }, + { + "epoch": 6.499693063228975, + "grad_norm": 0.16559138894081116, + "learning_rate": 2.883830860065939e-05, + "loss": 1.6735, + "step": 21176 + }, + { + "epoch": 6.5, + "grad_norm": 0.1820032149553299, + "learning_rate": 2.8833805276157442e-05, + "loss": 1.7107, + "step": 21177 + }, + { + "epoch": 6.500306936771025, + "grad_norm": 0.23760980367660522, + "learning_rate": 2.882930216083222e-05, + "loss": 1.7024, + "step": 21178 + }, + { + "epoch": 6.500613873542051, + "grad_norm": 0.22314296662807465, + "learning_rate": 2.8824799254728285e-05, + "loss": 1.714, + "step": 21179 + }, + { + "epoch": 6.500920810313076, + "grad_norm": 0.21919335424900055, + "learning_rate": 2.8820296557890046e-05, + "loss": 1.7625, + "step": 21180 + }, + { + "epoch": 6.5012277470841005, + "grad_norm": 0.21632128953933716, + "learning_rate": 2.88157940703621e-05, + "loss": 1.6589, + "step": 21181 + }, + { + "epoch": 6.501534683855126, + "grad_norm": 0.17998506128787994, + "learning_rate": 2.8811291792188867e-05, + "loss": 1.7528, + "step": 21182 + }, + { + "epoch": 6.501841620626151, + "grad_norm": 0.19783075153827667, + "learning_rate": 2.880678972341485e-05, + "loss": 1.6908, + "step": 21183 + }, + { + "epoch": 6.5021485573971765, + "grad_norm": 0.20510388910770416, + "learning_rate": 2.88022878640846e-05, + "loss": 1.7342, + "step": 21184 + }, + { + "epoch": 6.502455494168201, + "grad_norm": 0.24218666553497314, + "learning_rate": 2.879778621424253e-05, + "loss": 1.8, + "step": 21185 + }, + { + "epoch": 6.502762430939226, + "grad_norm": 0.1901179403066635, + "learning_rate": 2.8793284773933195e-05, + "loss": 1.699, + "step": 21186 + }, + { + "epoch": 6.503069367710252, + "grad_norm": 0.2652232348918915, + "learning_rate": 
2.8788783543201007e-05, + "loss": 1.8394, + "step": 21187 + }, + { + "epoch": 6.503376304481277, + "grad_norm": 0.17701558768749237, + "learning_rate": 2.878428252209052e-05, + "loss": 1.6674, + "step": 21188 + }, + { + "epoch": 6.503683241252302, + "grad_norm": 0.17464707791805267, + "learning_rate": 2.8779781710646185e-05, + "loss": 1.6894, + "step": 21189 + }, + { + "epoch": 6.503990178023328, + "grad_norm": 0.19469478726387024, + "learning_rate": 2.877528110891249e-05, + "loss": 1.7487, + "step": 21190 + }, + { + "epoch": 6.504297114794352, + "grad_norm": 0.21656417846679688, + "learning_rate": 2.87707807169339e-05, + "loss": 1.641, + "step": 21191 + }, + { + "epoch": 6.504604051565377, + "grad_norm": 0.20374895632266998, + "learning_rate": 2.8766280534754896e-05, + "loss": 1.6692, + "step": 21192 + }, + { + "epoch": 6.504910988336403, + "grad_norm": 0.26638445258140564, + "learning_rate": 2.876178056241996e-05, + "loss": 1.7415, + "step": 21193 + }, + { + "epoch": 6.505217925107428, + "grad_norm": 0.1852893978357315, + "learning_rate": 2.8757280799973557e-05, + "loss": 1.6981, + "step": 21194 + }, + { + "epoch": 6.505524861878453, + "grad_norm": 0.20518383383750916, + "learning_rate": 2.875278124746013e-05, + "loss": 1.781, + "step": 21195 + }, + { + "epoch": 6.505831798649478, + "grad_norm": 0.19968904554843903, + "learning_rate": 2.874828190492422e-05, + "loss": 1.6813, + "step": 21196 + }, + { + "epoch": 6.506138735420503, + "grad_norm": 0.19164247810840607, + "learning_rate": 2.87437827724102e-05, + "loss": 1.6833, + "step": 21197 + }, + { + "epoch": 6.5064456721915285, + "grad_norm": 0.19305361807346344, + "learning_rate": 2.873928384996262e-05, + "loss": 1.7164, + "step": 21198 + }, + { + "epoch": 6.506752608962554, + "grad_norm": 0.1853758841753006, + "learning_rate": 2.873478513762587e-05, + "loss": 1.7481, + "step": 21199 + }, + { + "epoch": 6.507059545733579, + "grad_norm": 0.20187529921531677, + "learning_rate": 2.8730286635444425e-05, + "loss": 
1.7666, + "step": 21200 + }, + { + "epoch": 6.5073664825046045, + "grad_norm": 0.19769401848316193, + "learning_rate": 2.872578834346279e-05, + "loss": 1.798, + "step": 21201 + }, + { + "epoch": 6.507673419275629, + "grad_norm": 0.1936112940311432, + "learning_rate": 2.8721290261725342e-05, + "loss": 1.6992, + "step": 21202 + }, + { + "epoch": 6.507980356046654, + "grad_norm": 0.17090481519699097, + "learning_rate": 2.871679239027662e-05, + "loss": 1.6802, + "step": 21203 + }, + { + "epoch": 6.50828729281768, + "grad_norm": 0.19443605840206146, + "learning_rate": 2.8712294729160987e-05, + "loss": 1.736, + "step": 21204 + }, + { + "epoch": 6.508594229588705, + "grad_norm": 0.19216817617416382, + "learning_rate": 2.8707797278422954e-05, + "loss": 1.7109, + "step": 21205 + }, + { + "epoch": 6.50890116635973, + "grad_norm": 0.19900040328502655, + "learning_rate": 2.8703300038106952e-05, + "loss": 1.7158, + "step": 21206 + }, + { + "epoch": 6.509208103130755, + "grad_norm": 0.17810803651809692, + "learning_rate": 2.8698803008257425e-05, + "loss": 1.6886, + "step": 21207 + }, + { + "epoch": 6.50951503990178, + "grad_norm": 0.1890508532524109, + "learning_rate": 2.8694306188918807e-05, + "loss": 1.7447, + "step": 21208 + }, + { + "epoch": 6.509821976672805, + "grad_norm": 0.17456012964248657, + "learning_rate": 2.868980958013554e-05, + "loss": 1.7094, + "step": 21209 + }, + { + "epoch": 6.510128913443831, + "grad_norm": 0.17089629173278809, + "learning_rate": 2.8685313181952066e-05, + "loss": 1.6827, + "step": 21210 + }, + { + "epoch": 6.510435850214856, + "grad_norm": 0.22681273519992828, + "learning_rate": 2.8680816994412823e-05, + "loss": 1.7374, + "step": 21211 + }, + { + "epoch": 6.510742786985881, + "grad_norm": 0.20642207562923431, + "learning_rate": 2.8676321017562225e-05, + "loss": 1.7609, + "step": 21212 + }, + { + "epoch": 6.511049723756906, + "grad_norm": 0.2360219657421112, + "learning_rate": 2.867182525144475e-05, + "loss": 1.7577, + "step": 21213 + }, + { + 
"epoch": 6.511356660527931, + "grad_norm": 0.19686923921108246, + "learning_rate": 2.8667329696104766e-05, + "loss": 1.7459, + "step": 21214 + }, + { + "epoch": 6.5116635972989565, + "grad_norm": 0.21280834078788757, + "learning_rate": 2.8662834351586777e-05, + "loss": 1.7837, + "step": 21215 + }, + { + "epoch": 6.511970534069982, + "grad_norm": 0.19297273457050323, + "learning_rate": 2.8658339217935136e-05, + "loss": 1.734, + "step": 21216 + }, + { + "epoch": 6.512277470841006, + "grad_norm": 0.1937931329011917, + "learning_rate": 2.8653844295194283e-05, + "loss": 1.6631, + "step": 21217 + }, + { + "epoch": 6.512584407612032, + "grad_norm": 0.2061077207326889, + "learning_rate": 2.8649349583408692e-05, + "loss": 1.7324, + "step": 21218 + }, + { + "epoch": 6.512891344383057, + "grad_norm": 0.19711358845233917, + "learning_rate": 2.8644855082622695e-05, + "loss": 1.7024, + "step": 21219 + }, + { + "epoch": 6.513198281154082, + "grad_norm": 0.17352496087551117, + "learning_rate": 2.8640360792880804e-05, + "loss": 1.7261, + "step": 21220 + }, + { + "epoch": 6.513505217925108, + "grad_norm": 0.181448295712471, + "learning_rate": 2.8635866714227344e-05, + "loss": 1.7147, + "step": 21221 + }, + { + "epoch": 6.513812154696133, + "grad_norm": 0.1827932894229889, + "learning_rate": 2.8631372846706787e-05, + "loss": 1.7338, + "step": 21222 + }, + { + "epoch": 6.514119091467157, + "grad_norm": 0.20659075677394867, + "learning_rate": 2.862687919036353e-05, + "loss": 1.6611, + "step": 21223 + }, + { + "epoch": 6.514426028238183, + "grad_norm": 0.19185996055603027, + "learning_rate": 2.8622385745241987e-05, + "loss": 1.7834, + "step": 21224 + }, + { + "epoch": 6.514732965009208, + "grad_norm": 0.19825506210327148, + "learning_rate": 2.8617892511386558e-05, + "loss": 1.7608, + "step": 21225 + }, + { + "epoch": 6.515039901780233, + "grad_norm": 0.16927020251750946, + "learning_rate": 2.861339948884164e-05, + "loss": 1.6651, + "step": 21226 + }, + { + "epoch": 6.515346838551259, + 
"grad_norm": 0.19211016595363617, + "learning_rate": 2.8608906677651646e-05, + "loss": 1.6673, + "step": 21227 + }, + { + "epoch": 6.515653775322283, + "grad_norm": 0.20192545652389526, + "learning_rate": 2.8604414077860974e-05, + "loss": 1.7301, + "step": 21228 + }, + { + "epoch": 6.5159607120933085, + "grad_norm": 0.2075425237417221, + "learning_rate": 2.8599921689514002e-05, + "loss": 1.783, + "step": 21229 + }, + { + "epoch": 6.516267648864334, + "grad_norm": 0.21261392533779144, + "learning_rate": 2.8595429512655192e-05, + "loss": 1.7277, + "step": 21230 + }, + { + "epoch": 6.516574585635359, + "grad_norm": 0.21201452612876892, + "learning_rate": 2.8590937547328844e-05, + "loss": 1.6582, + "step": 21231 + }, + { + "epoch": 6.5168815224063845, + "grad_norm": 0.2071799635887146, + "learning_rate": 2.858644579357944e-05, + "loss": 1.7559, + "step": 21232 + }, + { + "epoch": 6.51718845917741, + "grad_norm": 0.20225903391838074, + "learning_rate": 2.858195425145132e-05, + "loss": 1.7507, + "step": 21233 + }, + { + "epoch": 6.517495395948434, + "grad_norm": 0.2738147974014282, + "learning_rate": 2.8577462920988852e-05, + "loss": 1.7073, + "step": 21234 + }, + { + "epoch": 6.51780233271946, + "grad_norm": 0.17878220975399017, + "learning_rate": 2.8572971802236498e-05, + "loss": 1.6598, + "step": 21235 + }, + { + "epoch": 6.518109269490485, + "grad_norm": 0.21365594863891602, + "learning_rate": 2.8568480895238552e-05, + "loss": 1.7404, + "step": 21236 + }, + { + "epoch": 6.51841620626151, + "grad_norm": 0.18392804265022278, + "learning_rate": 2.856399020003948e-05, + "loss": 1.706, + "step": 21237 + }, + { + "epoch": 6.518723143032536, + "grad_norm": 0.16268405318260193, + "learning_rate": 2.855949971668358e-05, + "loss": 1.6725, + "step": 21238 + }, + { + "epoch": 6.51903007980356, + "grad_norm": 0.19590096175670624, + "learning_rate": 2.855500944521529e-05, + "loss": 1.7269, + "step": 21239 + }, + { + "epoch": 6.519337016574585, + "grad_norm": 0.19443263113498688, + 
"learning_rate": 2.8550519385678965e-05, + "loss": 1.686, + "step": 21240 + }, + { + "epoch": 6.519643953345611, + "grad_norm": 0.2112705111503601, + "learning_rate": 2.8546029538118985e-05, + "loss": 1.6904, + "step": 21241 + }, + { + "epoch": 6.519950890116636, + "grad_norm": 0.21015888452529907, + "learning_rate": 2.8541539902579712e-05, + "loss": 1.6972, + "step": 21242 + }, + { + "epoch": 6.520257826887661, + "grad_norm": 0.2853320837020874, + "learning_rate": 2.853705047910552e-05, + "loss": 1.7415, + "step": 21243 + }, + { + "epoch": 6.520564763658687, + "grad_norm": 0.20927128195762634, + "learning_rate": 2.853256126774077e-05, + "loss": 1.6955, + "step": 21244 + }, + { + "epoch": 6.520871700429711, + "grad_norm": 0.27824920415878296, + "learning_rate": 2.8528072268529836e-05, + "loss": 1.7666, + "step": 21245 + }, + { + "epoch": 6.5211786372007365, + "grad_norm": 0.21164646744728088, + "learning_rate": 2.8523583481517057e-05, + "loss": 1.75, + "step": 21246 + }, + { + "epoch": 6.521485573971762, + "grad_norm": 0.249397411942482, + "learning_rate": 2.851909490674686e-05, + "loss": 1.6767, + "step": 21247 + }, + { + "epoch": 6.521792510742787, + "grad_norm": 0.2311551868915558, + "learning_rate": 2.8514606544263507e-05, + "loss": 1.8071, + "step": 21248 + }, + { + "epoch": 6.5220994475138125, + "grad_norm": 0.21878042817115784, + "learning_rate": 2.8510118394111453e-05, + "loss": 1.6881, + "step": 21249 + }, + { + "epoch": 6.522406384284837, + "grad_norm": 0.2095690816640854, + "learning_rate": 2.8505630456334974e-05, + "loss": 1.6526, + "step": 21250 + }, + { + "epoch": 6.522713321055862, + "grad_norm": 0.2303982526063919, + "learning_rate": 2.850114273097844e-05, + "loss": 1.7256, + "step": 21251 + }, + { + "epoch": 6.523020257826888, + "grad_norm": 0.22640225291252136, + "learning_rate": 2.8496655218086255e-05, + "loss": 1.7797, + "step": 21252 + }, + { + "epoch": 6.523327194597913, + "grad_norm": 0.24268805980682373, + "learning_rate": 
2.8492167917702683e-05, + "loss": 1.7673, + "step": 21253 + }, + { + "epoch": 6.523634131368938, + "grad_norm": 0.1988469958305359, + "learning_rate": 2.8487680829872158e-05, + "loss": 1.7126, + "step": 21254 + }, + { + "epoch": 6.523941068139964, + "grad_norm": 0.18385496735572815, + "learning_rate": 2.8483193954638942e-05, + "loss": 1.7113, + "step": 21255 + }, + { + "epoch": 6.524248004910988, + "grad_norm": 0.21865327656269073, + "learning_rate": 2.847870729204743e-05, + "loss": 1.6686, + "step": 21256 + }, + { + "epoch": 6.524554941682013, + "grad_norm": 0.16982951760292053, + "learning_rate": 2.8474220842141946e-05, + "loss": 1.6865, + "step": 21257 + }, + { + "epoch": 6.524861878453039, + "grad_norm": 0.23028478026390076, + "learning_rate": 2.8469734604966834e-05, + "loss": 1.7647, + "step": 21258 + }, + { + "epoch": 6.525168815224064, + "grad_norm": 0.1805485039949417, + "learning_rate": 2.8465248580566415e-05, + "loss": 1.7524, + "step": 21259 + }, + { + "epoch": 6.525475751995089, + "grad_norm": 0.18652063608169556, + "learning_rate": 2.8460762768985037e-05, + "loss": 1.7028, + "step": 21260 + }, + { + "epoch": 6.525782688766114, + "grad_norm": 0.22772997617721558, + "learning_rate": 2.845627717026703e-05, + "loss": 1.7866, + "step": 21261 + }, + { + "epoch": 6.526089625537139, + "grad_norm": 0.19889821112155914, + "learning_rate": 2.8451791784456718e-05, + "loss": 1.7076, + "step": 21262 + }, + { + "epoch": 6.526396562308165, + "grad_norm": 0.24747174978256226, + "learning_rate": 2.8447306611598402e-05, + "loss": 1.7615, + "step": 21263 + }, + { + "epoch": 6.52670349907919, + "grad_norm": 0.1988009363412857, + "learning_rate": 2.8442821651736473e-05, + "loss": 1.7853, + "step": 21264 + }, + { + "epoch": 6.527010435850215, + "grad_norm": 0.250032901763916, + "learning_rate": 2.8438336904915185e-05, + "loss": 1.6906, + "step": 21265 + }, + { + "epoch": 6.52731737262124, + "grad_norm": 0.15398284792900085, + "learning_rate": 2.8433852371178925e-05, + 
"loss": 1.6437, + "step": 21266 + }, + { + "epoch": 6.527624309392265, + "grad_norm": 0.33137503266334534, + "learning_rate": 2.8429368050571958e-05, + "loss": 1.8213, + "step": 21267 + }, + { + "epoch": 6.52793124616329, + "grad_norm": 0.23827852308750153, + "learning_rate": 2.8424883943138593e-05, + "loss": 1.7148, + "step": 21268 + }, + { + "epoch": 6.528238182934316, + "grad_norm": 0.21171489357948303, + "learning_rate": 2.8420400048923217e-05, + "loss": 1.7729, + "step": 21269 + }, + { + "epoch": 6.528545119705341, + "grad_norm": 0.21698513627052307, + "learning_rate": 2.8415916367970053e-05, + "loss": 1.7267, + "step": 21270 + }, + { + "epoch": 6.5288520564763655, + "grad_norm": 0.2217913120985031, + "learning_rate": 2.8411432900323498e-05, + "loss": 1.7259, + "step": 21271 + }, + { + "epoch": 6.529158993247391, + "grad_norm": 0.25518202781677246, + "learning_rate": 2.8406949646027768e-05, + "loss": 1.7754, + "step": 21272 + }, + { + "epoch": 6.529465930018416, + "grad_norm": 0.22206325829029083, + "learning_rate": 2.8402466605127247e-05, + "loss": 1.755, + "step": 21273 + }, + { + "epoch": 6.5297728667894415, + "grad_norm": 0.26918017864227295, + "learning_rate": 2.8397983777666206e-05, + "loss": 1.783, + "step": 21274 + }, + { + "epoch": 6.530079803560467, + "grad_norm": 0.19280646741390228, + "learning_rate": 2.8393501163688952e-05, + "loss": 1.6942, + "step": 21275 + }, + { + "epoch": 6.530386740331492, + "grad_norm": 0.24567140638828278, + "learning_rate": 2.8389018763239784e-05, + "loss": 1.7316, + "step": 21276 + }, + { + "epoch": 6.530693677102517, + "grad_norm": 0.21791695058345795, + "learning_rate": 2.8384536576362997e-05, + "loss": 1.7627, + "step": 21277 + }, + { + "epoch": 6.531000613873542, + "grad_norm": 0.2441660761833191, + "learning_rate": 2.8380054603102885e-05, + "loss": 1.7112, + "step": 21278 + }, + { + "epoch": 6.531307550644567, + "grad_norm": 0.1768653243780136, + "learning_rate": 2.837557284350375e-05, + "loss": 1.6906, + "step": 
21279 + }, + { + "epoch": 6.531614487415593, + "grad_norm": 0.21037769317626953, + "learning_rate": 2.8371091297609877e-05, + "loss": 1.7197, + "step": 21280 + }, + { + "epoch": 6.531921424186618, + "grad_norm": 0.23989829421043396, + "learning_rate": 2.8366609965465563e-05, + "loss": 1.7693, + "step": 21281 + }, + { + "epoch": 6.532228360957642, + "grad_norm": 0.18302181363105774, + "learning_rate": 2.836212884711506e-05, + "loss": 1.6643, + "step": 21282 + }, + { + "epoch": 6.532535297728668, + "grad_norm": 0.2068471908569336, + "learning_rate": 2.835764794260273e-05, + "loss": 1.7431, + "step": 21283 + }, + { + "epoch": 6.532842234499693, + "grad_norm": 0.18803778290748596, + "learning_rate": 2.8353167251972777e-05, + "loss": 1.7506, + "step": 21284 + }, + { + "epoch": 6.533149171270718, + "grad_norm": 0.20789632201194763, + "learning_rate": 2.8348686775269507e-05, + "loss": 1.7174, + "step": 21285 + }, + { + "epoch": 6.533456108041744, + "grad_norm": 0.18927012383937836, + "learning_rate": 2.834420651253723e-05, + "loss": 1.6723, + "step": 21286 + }, + { + "epoch": 6.533763044812769, + "grad_norm": 0.22616887092590332, + "learning_rate": 2.8339726463820172e-05, + "loss": 1.7045, + "step": 21287 + }, + { + "epoch": 6.5340699815837935, + "grad_norm": 0.23880253732204437, + "learning_rate": 2.8335246629162658e-05, + "loss": 1.7255, + "step": 21288 + }, + { + "epoch": 6.534376918354819, + "grad_norm": 0.24279431998729706, + "learning_rate": 2.8330767008608904e-05, + "loss": 1.7548, + "step": 21289 + }, + { + "epoch": 6.534683855125844, + "grad_norm": 0.20542044937610626, + "learning_rate": 2.832628760220323e-05, + "loss": 1.6851, + "step": 21290 + }, + { + "epoch": 6.5349907918968695, + "grad_norm": 0.19426794350147247, + "learning_rate": 2.832180840998988e-05, + "loss": 1.7528, + "step": 21291 + }, + { + "epoch": 6.535297728667894, + "grad_norm": 0.2744491398334503, + "learning_rate": 2.8317329432013136e-05, + "loss": 1.7821, + "step": 21292 + }, + { + "epoch": 
6.535604665438919, + "grad_norm": 0.2692170739173889, + "learning_rate": 2.8312850668317243e-05, + "loss": 1.6626, + "step": 21293 + }, + { + "epoch": 6.535911602209945, + "grad_norm": 0.24998809397220612, + "learning_rate": 2.830837211894647e-05, + "loss": 1.7031, + "step": 21294 + }, + { + "epoch": 6.53621853898097, + "grad_norm": 0.22888946533203125, + "learning_rate": 2.830389378394508e-05, + "loss": 1.7706, + "step": 21295 + }, + { + "epoch": 6.536525475751995, + "grad_norm": 0.21685005724430084, + "learning_rate": 2.8299415663357332e-05, + "loss": 1.681, + "step": 21296 + }, + { + "epoch": 6.536832412523021, + "grad_norm": 0.23309725522994995, + "learning_rate": 2.8294937757227475e-05, + "loss": 1.7781, + "step": 21297 + }, + { + "epoch": 6.537139349294045, + "grad_norm": 0.26712173223495483, + "learning_rate": 2.829046006559976e-05, + "loss": 1.6966, + "step": 21298 + }, + { + "epoch": 6.53744628606507, + "grad_norm": 0.1836499124765396, + "learning_rate": 2.8285982588518428e-05, + "loss": 1.7192, + "step": 21299 + }, + { + "epoch": 6.537753222836096, + "grad_norm": 0.24073021113872528, + "learning_rate": 2.828150532602778e-05, + "loss": 1.6997, + "step": 21300 + }, + { + "epoch": 6.538060159607121, + "grad_norm": 0.16308051347732544, + "learning_rate": 2.8277028278172014e-05, + "loss": 1.6901, + "step": 21301 + }, + { + "epoch": 6.538367096378146, + "grad_norm": 0.2330634444952011, + "learning_rate": 2.8272551444995376e-05, + "loss": 1.7426, + "step": 21302 + }, + { + "epoch": 6.538674033149171, + "grad_norm": 0.18600425124168396, + "learning_rate": 2.8268074826542123e-05, + "loss": 1.6906, + "step": 21303 + }, + { + "epoch": 6.538980969920196, + "grad_norm": 0.24717238545417786, + "learning_rate": 2.8263598422856475e-05, + "loss": 1.6962, + "step": 21304 + }, + { + "epoch": 6.5392879066912215, + "grad_norm": 0.1907368302345276, + "learning_rate": 2.8259122233982727e-05, + "loss": 1.7083, + "step": 21305 + }, + { + "epoch": 6.539594843462247, + "grad_norm": 
0.22698798775672913, + "learning_rate": 2.8254646259965035e-05, + "loss": 1.7377, + "step": 21306 + }, + { + "epoch": 6.539901780233272, + "grad_norm": 0.19169457256793976, + "learning_rate": 2.8250170500847696e-05, + "loss": 1.7416, + "step": 21307 + }, + { + "epoch": 6.5402087170042975, + "grad_norm": 0.18730394542217255, + "learning_rate": 2.8245694956674918e-05, + "loss": 1.7273, + "step": 21308 + }, + { + "epoch": 6.540515653775322, + "grad_norm": 0.19813422858715057, + "learning_rate": 2.8241219627490927e-05, + "loss": 1.7638, + "step": 21309 + }, + { + "epoch": 6.540822590546347, + "grad_norm": 0.20460368692874908, + "learning_rate": 2.8236744513339965e-05, + "loss": 1.7266, + "step": 21310 + }, + { + "epoch": 6.541129527317373, + "grad_norm": 0.20448380708694458, + "learning_rate": 2.823226961426625e-05, + "loss": 1.7335, + "step": 21311 + }, + { + "epoch": 6.541436464088398, + "grad_norm": 0.21458712220191956, + "learning_rate": 2.8227794930314e-05, + "loss": 1.7274, + "step": 21312 + }, + { + "epoch": 6.541743400859423, + "grad_norm": 0.1964675635099411, + "learning_rate": 2.8223320461527442e-05, + "loss": 1.7514, + "step": 21313 + }, + { + "epoch": 6.542050337630448, + "grad_norm": 0.18982458114624023, + "learning_rate": 2.82188462079508e-05, + "loss": 1.6858, + "step": 21314 + }, + { + "epoch": 6.542357274401473, + "grad_norm": 0.21377761662006378, + "learning_rate": 2.8214372169628277e-05, + "loss": 1.727, + "step": 21315 + }, + { + "epoch": 6.542664211172498, + "grad_norm": 0.19484922289848328, + "learning_rate": 2.8209898346604087e-05, + "loss": 1.7646, + "step": 21316 + }, + { + "epoch": 6.542971147943524, + "grad_norm": 0.20614980161190033, + "learning_rate": 2.8205424738922488e-05, + "loss": 1.6705, + "step": 21317 + }, + { + "epoch": 6.543278084714549, + "grad_norm": 0.1888885796070099, + "learning_rate": 2.8200951346627636e-05, + "loss": 1.7854, + "step": 21318 + }, + { + "epoch": 6.543585021485574, + "grad_norm": 0.20957863330841064, + 
"learning_rate": 2.8196478169763763e-05, + "loss": 1.6971, + "step": 21319 + }, + { + "epoch": 6.543891958256599, + "grad_norm": 0.20744509994983673, + "learning_rate": 2.8192005208375073e-05, + "loss": 1.7408, + "step": 21320 + }, + { + "epoch": 6.544198895027624, + "grad_norm": 0.20038767158985138, + "learning_rate": 2.818753246250574e-05, + "loss": 1.7355, + "step": 21321 + }, + { + "epoch": 6.5445058317986495, + "grad_norm": 0.18535862863063812, + "learning_rate": 2.818305993220004e-05, + "loss": 1.7229, + "step": 21322 + }, + { + "epoch": 6.544812768569675, + "grad_norm": 0.2191225290298462, + "learning_rate": 2.8178587617502095e-05, + "loss": 1.7364, + "step": 21323 + }, + { + "epoch": 6.5451197053407, + "grad_norm": 0.2055424451828003, + "learning_rate": 2.8174115518456175e-05, + "loss": 1.7488, + "step": 21324 + }, + { + "epoch": 6.545426642111725, + "grad_norm": 0.22267968952655792, + "learning_rate": 2.8169643635106398e-05, + "loss": 1.6936, + "step": 21325 + }, + { + "epoch": 6.54573357888275, + "grad_norm": 0.20295512676239014, + "learning_rate": 2.8165171967497018e-05, + "loss": 1.7651, + "step": 21326 + }, + { + "epoch": 6.546040515653775, + "grad_norm": 0.25859618186950684, + "learning_rate": 2.81607005156722e-05, + "loss": 1.7264, + "step": 21327 + }, + { + "epoch": 6.546347452424801, + "grad_norm": 0.22232379019260406, + "learning_rate": 2.8156229279676143e-05, + "loss": 1.7282, + "step": 21328 + }, + { + "epoch": 6.546654389195826, + "grad_norm": 0.2548457682132721, + "learning_rate": 2.8151758259553035e-05, + "loss": 1.7137, + "step": 21329 + }, + { + "epoch": 6.546961325966851, + "grad_norm": 0.22040672600269318, + "learning_rate": 2.8147287455347055e-05, + "loss": 1.7553, + "step": 21330 + }, + { + "epoch": 6.547268262737876, + "grad_norm": 0.19622360169887543, + "learning_rate": 2.8142816867102388e-05, + "loss": 1.6502, + "step": 21331 + }, + { + "epoch": 6.547575199508901, + "grad_norm": 0.20849336683750153, + "learning_rate": 
2.813834649486322e-05, + "loss": 1.6824, + "step": 21332 + }, + { + "epoch": 6.547882136279926, + "grad_norm": 0.18474788963794708, + "learning_rate": 2.8133876338673703e-05, + "loss": 1.7136, + "step": 21333 + }, + { + "epoch": 6.548189073050952, + "grad_norm": 0.2421834021806717, + "learning_rate": 2.8129406398578074e-05, + "loss": 1.7841, + "step": 21334 + }, + { + "epoch": 6.548496009821976, + "grad_norm": 0.18089748919010162, + "learning_rate": 2.812493667462045e-05, + "loss": 1.6918, + "step": 21335 + }, + { + "epoch": 6.5488029465930016, + "grad_norm": 0.18575069308280945, + "learning_rate": 2.8120467166845022e-05, + "loss": 1.7098, + "step": 21336 + }, + { + "epoch": 6.549109883364027, + "grad_norm": 0.20840388536453247, + "learning_rate": 2.811599787529596e-05, + "loss": 1.7405, + "step": 21337 + }, + { + "epoch": 6.549416820135052, + "grad_norm": 0.19018858671188354, + "learning_rate": 2.811152880001742e-05, + "loss": 1.7098, + "step": 21338 + }, + { + "epoch": 6.5497237569060776, + "grad_norm": 0.22326117753982544, + "learning_rate": 2.8107059941053627e-05, + "loss": 1.7452, + "step": 21339 + }, + { + "epoch": 6.550030693677103, + "grad_norm": 0.26071304082870483, + "learning_rate": 2.8102591298448643e-05, + "loss": 1.7685, + "step": 21340 + }, + { + "epoch": 6.550337630448127, + "grad_norm": 0.2253575623035431, + "learning_rate": 2.8098122872246734e-05, + "loss": 1.8025, + "step": 21341 + }, + { + "epoch": 6.550644567219153, + "grad_norm": 0.2503850758075714, + "learning_rate": 2.8093654662491975e-05, + "loss": 1.7453, + "step": 21342 + }, + { + "epoch": 6.550951503990178, + "grad_norm": 0.18953700363636017, + "learning_rate": 2.808918666922858e-05, + "loss": 1.7549, + "step": 21343 + }, + { + "epoch": 6.551258440761203, + "grad_norm": 0.21360619366168976, + "learning_rate": 2.8084718892500685e-05, + "loss": 1.7363, + "step": 21344 + }, + { + "epoch": 6.551565377532229, + "grad_norm": 0.24622702598571777, + "learning_rate": 2.8080251332352437e-05, + 
"loss": 1.7325, + "step": 21345 + }, + { + "epoch": 6.551872314303253, + "grad_norm": 0.20079167187213898, + "learning_rate": 2.8075783988827997e-05, + "loss": 1.7478, + "step": 21346 + }, + { + "epoch": 6.5521792510742785, + "grad_norm": 0.2337643951177597, + "learning_rate": 2.807131686197151e-05, + "loss": 1.6683, + "step": 21347 + }, + { + "epoch": 6.552486187845304, + "grad_norm": 0.20815308392047882, + "learning_rate": 2.8066849951827123e-05, + "loss": 1.7436, + "step": 21348 + }, + { + "epoch": 6.552793124616329, + "grad_norm": 0.2450367957353592, + "learning_rate": 2.8062383258438972e-05, + "loss": 1.7464, + "step": 21349 + }, + { + "epoch": 6.5531000613873545, + "grad_norm": 0.232087641954422, + "learning_rate": 2.8057916781851222e-05, + "loss": 1.7378, + "step": 21350 + }, + { + "epoch": 6.55340699815838, + "grad_norm": 0.2254600077867508, + "learning_rate": 2.8053450522107993e-05, + "loss": 1.7299, + "step": 21351 + }, + { + "epoch": 6.553713934929404, + "grad_norm": 0.23282572627067566, + "learning_rate": 2.8048984479253425e-05, + "loss": 1.7512, + "step": 21352 + }, + { + "epoch": 6.55402087170043, + "grad_norm": 0.21826763451099396, + "learning_rate": 2.8044518653331665e-05, + "loss": 1.706, + "step": 21353 + }, + { + "epoch": 6.554327808471455, + "grad_norm": 0.20807425677776337, + "learning_rate": 2.804005304438683e-05, + "loss": 1.7013, + "step": 21354 + }, + { + "epoch": 6.55463474524248, + "grad_norm": 0.21791879832744598, + "learning_rate": 2.8035587652463046e-05, + "loss": 1.7312, + "step": 21355 + }, + { + "epoch": 6.554941682013506, + "grad_norm": 0.23205329477787018, + "learning_rate": 2.8031122477604505e-05, + "loss": 1.7166, + "step": 21356 + }, + { + "epoch": 6.55524861878453, + "grad_norm": 0.1910320371389389, + "learning_rate": 2.802665751985525e-05, + "loss": 1.694, + "step": 21357 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.24150735139846802, + "learning_rate": 2.8022192779259472e-05, + "loss": 1.7934, + "step": 21358 + }, 
+ { + "epoch": 6.555862492326581, + "grad_norm": 0.18308573961257935, + "learning_rate": 2.801772825586123e-05, + "loss": 1.6851, + "step": 21359 + }, + { + "epoch": 6.556169429097606, + "grad_norm": 0.28410083055496216, + "learning_rate": 2.8013263949704705e-05, + "loss": 1.7687, + "step": 21360 + }, + { + "epoch": 6.556476365868631, + "grad_norm": 0.21073146164417267, + "learning_rate": 2.8008799860833996e-05, + "loss": 1.711, + "step": 21361 + }, + { + "epoch": 6.556783302639657, + "grad_norm": 0.22758159041404724, + "learning_rate": 2.8004335989293213e-05, + "loss": 1.7495, + "step": 21362 + }, + { + "epoch": 6.557090239410681, + "grad_norm": 0.2112412452697754, + "learning_rate": 2.799987233512647e-05, + "loss": 1.7125, + "step": 21363 + }, + { + "epoch": 6.5573971761817065, + "grad_norm": 0.1804153323173523, + "learning_rate": 2.7995408898377884e-05, + "loss": 1.689, + "step": 21364 + }, + { + "epoch": 6.557704112952732, + "grad_norm": 0.17632657289505005, + "learning_rate": 2.7990945679091572e-05, + "loss": 1.6868, + "step": 21365 + }, + { + "epoch": 6.558011049723757, + "grad_norm": 0.17942996323108673, + "learning_rate": 2.7986482677311632e-05, + "loss": 1.7082, + "step": 21366 + }, + { + "epoch": 6.558317986494782, + "grad_norm": 0.278486967086792, + "learning_rate": 2.7982019893082167e-05, + "loss": 1.7909, + "step": 21367 + }, + { + "epoch": 6.558624923265807, + "grad_norm": 0.208990678191185, + "learning_rate": 2.797755732644729e-05, + "loss": 1.7643, + "step": 21368 + }, + { + "epoch": 6.558931860036832, + "grad_norm": 0.20375309884548187, + "learning_rate": 2.7973094977451096e-05, + "loss": 1.6957, + "step": 21369 + }, + { + "epoch": 6.559238796807858, + "grad_norm": 0.24685338139533997, + "learning_rate": 2.7968632846137694e-05, + "loss": 1.7574, + "step": 21370 + }, + { + "epoch": 6.559545733578883, + "grad_norm": 0.2237502634525299, + "learning_rate": 2.796417093255117e-05, + "loss": 1.7422, + "step": 21371 + }, + { + "epoch": 6.559852670349908, + 
"grad_norm": 0.22731846570968628, + "learning_rate": 2.795970923673561e-05, + "loss": 1.7594, + "step": 21372 + }, + { + "epoch": 6.560159607120933, + "grad_norm": 0.2518742084503174, + "learning_rate": 2.7955247758735158e-05, + "loss": 1.6817, + "step": 21373 + }, + { + "epoch": 6.560466543891958, + "grad_norm": 0.21982096135616302, + "learning_rate": 2.7950786498593827e-05, + "loss": 1.7289, + "step": 21374 + }, + { + "epoch": 6.560773480662983, + "grad_norm": 0.19061018526554108, + "learning_rate": 2.7946325456355787e-05, + "loss": 1.6809, + "step": 21375 + }, + { + "epoch": 6.561080417434009, + "grad_norm": 0.2023245394229889, + "learning_rate": 2.794186463206505e-05, + "loss": 1.7053, + "step": 21376 + }, + { + "epoch": 6.561387354205034, + "grad_norm": 0.18003186583518982, + "learning_rate": 2.7937404025765752e-05, + "loss": 1.6447, + "step": 21377 + }, + { + "epoch": 6.5616942909760585, + "grad_norm": 0.19133709371089935, + "learning_rate": 2.7932943637501956e-05, + "loss": 1.7677, + "step": 21378 + }, + { + "epoch": 6.562001227747084, + "grad_norm": 0.18476714193820953, + "learning_rate": 2.7928483467317746e-05, + "loss": 1.685, + "step": 21379 + }, + { + "epoch": 6.562308164518109, + "grad_norm": 0.2065780758857727, + "learning_rate": 2.79240235152572e-05, + "loss": 1.6827, + "step": 21380 + }, + { + "epoch": 6.5626151012891345, + "grad_norm": 0.1885409951210022, + "learning_rate": 2.79195637813644e-05, + "loss": 1.6819, + "step": 21381 + }, + { + "epoch": 6.56292203806016, + "grad_norm": 0.18055391311645508, + "learning_rate": 2.79151042656834e-05, + "loss": 1.7007, + "step": 21382 + }, + { + "epoch": 6.563228974831185, + "grad_norm": 0.25148439407348633, + "learning_rate": 2.7910644968258294e-05, + "loss": 1.7723, + "step": 21383 + }, + { + "epoch": 6.56353591160221, + "grad_norm": 0.2308066487312317, + "learning_rate": 2.7906185889133134e-05, + "loss": 1.7525, + "step": 21384 + }, + { + "epoch": 6.563842848373235, + "grad_norm": 0.19580784440040588, + 
"learning_rate": 2.7901727028351997e-05, + "loss": 1.7197, + "step": 21385 + }, + { + "epoch": 6.56414978514426, + "grad_norm": 0.19686979055404663, + "learning_rate": 2.7897268385958952e-05, + "loss": 1.6873, + "step": 21386 + }, + { + "epoch": 6.564456721915286, + "grad_norm": 0.2657351493835449, + "learning_rate": 2.7892809961998045e-05, + "loss": 1.7005, + "step": 21387 + }, + { + "epoch": 6.564763658686311, + "grad_norm": 0.20131130516529083, + "learning_rate": 2.7888351756513353e-05, + "loss": 1.7211, + "step": 21388 + }, + { + "epoch": 6.565070595457335, + "grad_norm": 0.2524282932281494, + "learning_rate": 2.7883893769548908e-05, + "loss": 1.7038, + "step": 21389 + }, + { + "epoch": 6.565377532228361, + "grad_norm": 0.1601654291152954, + "learning_rate": 2.787943600114883e-05, + "loss": 1.691, + "step": 21390 + }, + { + "epoch": 6.565684468999386, + "grad_norm": 0.25074124336242676, + "learning_rate": 2.787497845135709e-05, + "loss": 1.688, + "step": 21391 + }, + { + "epoch": 6.565991405770411, + "grad_norm": 0.19491349160671234, + "learning_rate": 2.787052112021782e-05, + "loss": 1.7108, + "step": 21392 + }, + { + "epoch": 6.566298342541437, + "grad_norm": 0.23931637406349182, + "learning_rate": 2.786606400777499e-05, + "loss": 1.7315, + "step": 21393 + }, + { + "epoch": 6.566605279312462, + "grad_norm": 0.1643616110086441, + "learning_rate": 2.786160711407271e-05, + "loss": 1.6745, + "step": 21394 + }, + { + "epoch": 6.5669122160834865, + "grad_norm": 0.17805394530296326, + "learning_rate": 2.7857150439155e-05, + "loss": 1.6817, + "step": 21395 + }, + { + "epoch": 6.567219152854512, + "grad_norm": 0.20370139181613922, + "learning_rate": 2.7852693983065913e-05, + "loss": 1.7173, + "step": 21396 + }, + { + "epoch": 6.567526089625537, + "grad_norm": 0.1620296984910965, + "learning_rate": 2.784823774584948e-05, + "loss": 1.7135, + "step": 21397 + }, + { + "epoch": 6.5678330263965625, + "grad_norm": 0.19116036593914032, + "learning_rate": 
2.7843781727549752e-05, + "loss": 1.6815, + "step": 21398 + }, + { + "epoch": 6.568139963167588, + "grad_norm": 0.20118895173072815, + "learning_rate": 2.7839325928210757e-05, + "loss": 1.7336, + "step": 21399 + }, + { + "epoch": 6.568446899938612, + "grad_norm": 0.198282390832901, + "learning_rate": 2.7834870347876528e-05, + "loss": 1.7379, + "step": 21400 + }, + { + "epoch": 6.568753836709638, + "grad_norm": 0.19203920662403107, + "learning_rate": 2.7830414986591104e-05, + "loss": 1.6913, + "step": 21401 + }, + { + "epoch": 6.569060773480663, + "grad_norm": 0.24601610004901886, + "learning_rate": 2.7825959844398507e-05, + "loss": 1.7842, + "step": 21402 + }, + { + "epoch": 6.569367710251688, + "grad_norm": 0.19069935381412506, + "learning_rate": 2.7821504921342777e-05, + "loss": 1.706, + "step": 21403 + }, + { + "epoch": 6.569674647022714, + "grad_norm": 0.20221085846424103, + "learning_rate": 2.7817050217467945e-05, + "loss": 1.7223, + "step": 21404 + }, + { + "epoch": 6.569981583793739, + "grad_norm": 0.2129664123058319, + "learning_rate": 2.781259573281801e-05, + "loss": 1.7429, + "step": 21405 + }, + { + "epoch": 6.570288520564763, + "grad_norm": 0.20684000849723816, + "learning_rate": 2.7808141467436993e-05, + "loss": 1.7349, + "step": 21406 + }, + { + "epoch": 6.570595457335789, + "grad_norm": 0.2153804898262024, + "learning_rate": 2.7803687421368968e-05, + "loss": 1.7245, + "step": 21407 + }, + { + "epoch": 6.570902394106814, + "grad_norm": 0.245448499917984, + "learning_rate": 2.7799233594657875e-05, + "loss": 1.7102, + "step": 21408 + }, + { + "epoch": 6.571209330877839, + "grad_norm": 0.18146783113479614, + "learning_rate": 2.7794779987347807e-05, + "loss": 1.6777, + "step": 21409 + }, + { + "epoch": 6.571516267648864, + "grad_norm": 0.21388854086399078, + "learning_rate": 2.7790326599482698e-05, + "loss": 1.7263, + "step": 21410 + }, + { + "epoch": 6.571823204419889, + "grad_norm": 0.2242165058851242, + "learning_rate": 2.7785873431106625e-05, + 
"loss": 1.7624, + "step": 21411 + }, + { + "epoch": 6.5721301411909145, + "grad_norm": 0.23132537305355072, + "learning_rate": 2.7781420482263565e-05, + "loss": 1.7013, + "step": 21412 + }, + { + "epoch": 6.57243707796194, + "grad_norm": 0.21074987947940826, + "learning_rate": 2.777696775299753e-05, + "loss": 1.7111, + "step": 21413 + }, + { + "epoch": 6.572744014732965, + "grad_norm": 0.2933674156665802, + "learning_rate": 2.7772515243352525e-05, + "loss": 1.7515, + "step": 21414 + }, + { + "epoch": 6.5730509515039905, + "grad_norm": 0.2100256085395813, + "learning_rate": 2.7768062953372552e-05, + "loss": 1.7425, + "step": 21415 + }, + { + "epoch": 6.573357888275015, + "grad_norm": 0.21765680611133575, + "learning_rate": 2.776361088310161e-05, + "loss": 1.7064, + "step": 21416 + }, + { + "epoch": 6.57366482504604, + "grad_norm": 0.205422043800354, + "learning_rate": 2.7759159032583702e-05, + "loss": 1.7458, + "step": 21417 + }, + { + "epoch": 6.573971761817066, + "grad_norm": 0.2009960114955902, + "learning_rate": 2.775470740186282e-05, + "loss": 1.7111, + "step": 21418 + }, + { + "epoch": 6.574278698588091, + "grad_norm": 0.18974804878234863, + "learning_rate": 2.7750255990982955e-05, + "loss": 1.7385, + "step": 21419 + }, + { + "epoch": 6.574585635359116, + "grad_norm": 0.1784054934978485, + "learning_rate": 2.7745804799988106e-05, + "loss": 1.7129, + "step": 21420 + }, + { + "epoch": 6.574892572130141, + "grad_norm": 0.2047782689332962, + "learning_rate": 2.7741353828922258e-05, + "loss": 1.6972, + "step": 21421 + }, + { + "epoch": 6.575199508901166, + "grad_norm": 0.18886682391166687, + "learning_rate": 2.773690307782939e-05, + "loss": 1.6564, + "step": 21422 + }, + { + "epoch": 6.5755064456721914, + "grad_norm": 0.2088952213525772, + "learning_rate": 2.7732452546753484e-05, + "loss": 1.7309, + "step": 21423 + }, + { + "epoch": 6.575813382443217, + "grad_norm": 0.20526883006095886, + "learning_rate": 2.7728002235738565e-05, + "loss": 1.6811, + "step": 21424 + 
}, + { + "epoch": 6.576120319214242, + "grad_norm": 0.19648446142673492, + "learning_rate": 2.7723552144828545e-05, + "loss": 1.7237, + "step": 21425 + }, + { + "epoch": 6.5764272559852675, + "grad_norm": 0.22405673563480377, + "learning_rate": 2.7719102274067484e-05, + "loss": 1.7454, + "step": 21426 + }, + { + "epoch": 6.576734192756292, + "grad_norm": 0.24119171500205994, + "learning_rate": 2.7714652623499265e-05, + "loss": 1.7106, + "step": 21427 + }, + { + "epoch": 6.577041129527317, + "grad_norm": 0.2127196192741394, + "learning_rate": 2.771020319316794e-05, + "loss": 1.7895, + "step": 21428 + }, + { + "epoch": 6.577348066298343, + "grad_norm": 0.23805706202983856, + "learning_rate": 2.7705753983117443e-05, + "loss": 1.739, + "step": 21429 + }, + { + "epoch": 6.577655003069368, + "grad_norm": 0.24212954938411713, + "learning_rate": 2.7701304993391753e-05, + "loss": 1.683, + "step": 21430 + }, + { + "epoch": 6.577961939840393, + "grad_norm": 0.1946132481098175, + "learning_rate": 2.769685622403484e-05, + "loss": 1.6953, + "step": 21431 + }, + { + "epoch": 6.578268876611418, + "grad_norm": 0.2465951144695282, + "learning_rate": 2.769240767509067e-05, + "loss": 1.6594, + "step": 21432 + }, + { + "epoch": 6.578575813382443, + "grad_norm": 0.17029622197151184, + "learning_rate": 2.76879593466032e-05, + "loss": 1.6977, + "step": 21433 + }, + { + "epoch": 6.578882750153468, + "grad_norm": 0.23793117702007294, + "learning_rate": 2.7683511238616388e-05, + "loss": 1.6709, + "step": 21434 + }, + { + "epoch": 6.579189686924494, + "grad_norm": 0.20149341225624084, + "learning_rate": 2.76790633511742e-05, + "loss": 1.8074, + "step": 21435 + }, + { + "epoch": 6.579496623695519, + "grad_norm": 0.25029948353767395, + "learning_rate": 2.7674615684320593e-05, + "loss": 1.6649, + "step": 21436 + }, + { + "epoch": 6.579803560466544, + "grad_norm": 0.22212490439414978, + "learning_rate": 2.7670168238099515e-05, + "loss": 1.7322, + "step": 21437 + }, + { + "epoch": 
6.580110497237569, + "grad_norm": 0.26087918877601624, + "learning_rate": 2.7665721012554925e-05, + "loss": 1.7285, + "step": 21438 + }, + { + "epoch": 6.580417434008594, + "grad_norm": 0.19286726415157318, + "learning_rate": 2.7661274007730776e-05, + "loss": 1.6912, + "step": 21439 + }, + { + "epoch": 6.5807243707796195, + "grad_norm": 0.23935118317604065, + "learning_rate": 2.7656827223670982e-05, + "loss": 1.6929, + "step": 21440 + }, + { + "epoch": 6.581031307550645, + "grad_norm": 0.2263423204421997, + "learning_rate": 2.7652380660419563e-05, + "loss": 1.6786, + "step": 21441 + }, + { + "epoch": 6.581338244321669, + "grad_norm": 0.19788038730621338, + "learning_rate": 2.7647934318020373e-05, + "loss": 1.7906, + "step": 21442 + }, + { + "epoch": 6.581645181092695, + "grad_norm": 0.25891759991645813, + "learning_rate": 2.7643488196517435e-05, + "loss": 1.7691, + "step": 21443 + }, + { + "epoch": 6.58195211786372, + "grad_norm": 0.25175485014915466, + "learning_rate": 2.7639042295954615e-05, + "loss": 1.7329, + "step": 21444 + }, + { + "epoch": 6.582259054634745, + "grad_norm": 0.1860336810350418, + "learning_rate": 2.7634596616375908e-05, + "loss": 1.7348, + "step": 21445 + }, + { + "epoch": 6.582565991405771, + "grad_norm": 0.2704271972179413, + "learning_rate": 2.7630151157825218e-05, + "loss": 1.7199, + "step": 21446 + }, + { + "epoch": 6.582872928176796, + "grad_norm": 0.16306720674037933, + "learning_rate": 2.762570592034649e-05, + "loss": 1.7174, + "step": 21447 + }, + { + "epoch": 6.58317986494782, + "grad_norm": 0.2585636079311371, + "learning_rate": 2.7621260903983648e-05, + "loss": 1.7392, + "step": 21448 + }, + { + "epoch": 6.583486801718846, + "grad_norm": 0.2086072564125061, + "learning_rate": 2.7616816108780623e-05, + "loss": 1.7417, + "step": 21449 + }, + { + "epoch": 6.583793738489871, + "grad_norm": 0.1747613251209259, + "learning_rate": 2.7612371534781343e-05, + "loss": 1.6607, + "step": 21450 + }, + { + "epoch": 6.584100675260896, + 
"grad_norm": 0.21026404201984406, + "learning_rate": 2.7607927182029726e-05, + "loss": 1.7725, + "step": 21451 + }, + { + "epoch": 6.584407612031922, + "grad_norm": 0.17881789803504944, + "learning_rate": 2.76034830505697e-05, + "loss": 1.7502, + "step": 21452 + }, + { + "epoch": 6.584714548802946, + "grad_norm": 0.2503713369369507, + "learning_rate": 2.7599039140445182e-05, + "loss": 1.798, + "step": 21453 + }, + { + "epoch": 6.5850214855739715, + "grad_norm": 0.22163939476013184, + "learning_rate": 2.7594595451700083e-05, + "loss": 1.725, + "step": 21454 + }, + { + "epoch": 6.585328422344997, + "grad_norm": 0.2154664546251297, + "learning_rate": 2.759015198437833e-05, + "loss": 1.7917, + "step": 21455 + }, + { + "epoch": 6.585635359116022, + "grad_norm": 0.1814090609550476, + "learning_rate": 2.7585708738523823e-05, + "loss": 1.6562, + "step": 21456 + }, + { + "epoch": 6.5859422958870475, + "grad_norm": 0.18815121054649353, + "learning_rate": 2.758126571418049e-05, + "loss": 1.6833, + "step": 21457 + }, + { + "epoch": 6.586249232658073, + "grad_norm": 0.19383473694324493, + "learning_rate": 2.757682291139222e-05, + "loss": 1.6987, + "step": 21458 + }, + { + "epoch": 6.586556169429097, + "grad_norm": 0.19574831426143646, + "learning_rate": 2.7572380330202912e-05, + "loss": 1.7231, + "step": 21459 + }, + { + "epoch": 6.586863106200123, + "grad_norm": 0.17509032785892487, + "learning_rate": 2.7567937970656527e-05, + "loss": 1.6452, + "step": 21460 + }, + { + "epoch": 6.587170042971148, + "grad_norm": 0.19439785182476044, + "learning_rate": 2.7563495832796886e-05, + "loss": 1.7168, + "step": 21461 + }, + { + "epoch": 6.587476979742173, + "grad_norm": 0.17384520173072815, + "learning_rate": 2.7559053916667953e-05, + "loss": 1.7128, + "step": 21462 + }, + { + "epoch": 6.587783916513199, + "grad_norm": 0.18308506906032562, + "learning_rate": 2.7554612222313597e-05, + "loss": 1.7184, + "step": 21463 + }, + { + "epoch": 6.588090853284223, + "grad_norm": 
0.20052805542945862, + "learning_rate": 2.7550170749777726e-05, + "loss": 1.7239, + "step": 21464 + }, + { + "epoch": 6.588397790055248, + "grad_norm": 0.21892015635967255, + "learning_rate": 2.7545729499104215e-05, + "loss": 1.7297, + "step": 21465 + }, + { + "epoch": 6.588704726826274, + "grad_norm": 0.19819483160972595, + "learning_rate": 2.7541288470336973e-05, + "loss": 1.7303, + "step": 21466 + }, + { + "epoch": 6.589011663597299, + "grad_norm": 0.24296818673610687, + "learning_rate": 2.7536847663519884e-05, + "loss": 1.8525, + "step": 21467 + }, + { + "epoch": 6.589318600368324, + "grad_norm": 0.1971593201160431, + "learning_rate": 2.753240707869683e-05, + "loss": 1.7396, + "step": 21468 + }, + { + "epoch": 6.58962553713935, + "grad_norm": 0.24418935179710388, + "learning_rate": 2.7527966715911696e-05, + "loss": 1.7414, + "step": 21469 + }, + { + "epoch": 6.589932473910374, + "grad_norm": 0.2193990796804428, + "learning_rate": 2.7523526575208368e-05, + "loss": 1.7243, + "step": 21470 + }, + { + "epoch": 6.5902394106813995, + "grad_norm": 0.23612114787101746, + "learning_rate": 2.7519086656630722e-05, + "loss": 1.7072, + "step": 21471 + }, + { + "epoch": 6.590546347452425, + "grad_norm": 0.22282655537128448, + "learning_rate": 2.751464696022264e-05, + "loss": 1.7423, + "step": 21472 + }, + { + "epoch": 6.59085328422345, + "grad_norm": 0.21411976218223572, + "learning_rate": 2.7510207486027995e-05, + "loss": 1.7397, + "step": 21473 + }, + { + "epoch": 6.5911602209944755, + "grad_norm": 0.2244768589735031, + "learning_rate": 2.7505768234090663e-05, + "loss": 1.6964, + "step": 21474 + }, + { + "epoch": 6.5914671577655, + "grad_norm": 0.2250032275915146, + "learning_rate": 2.7501329204454512e-05, + "loss": 1.7307, + "step": 21475 + }, + { + "epoch": 6.591774094536525, + "grad_norm": 0.2643435299396515, + "learning_rate": 2.7496890397163395e-05, + "loss": 1.7298, + "step": 21476 + }, + { + "epoch": 6.592081031307551, + "grad_norm": 0.2204463928937912, + 
"learning_rate": 2.7492451812261232e-05, + "loss": 1.723, + "step": 21477 + }, + { + "epoch": 6.592387968078576, + "grad_norm": 0.2278377115726471, + "learning_rate": 2.7488013449791816e-05, + "loss": 1.7597, + "step": 21478 + }, + { + "epoch": 6.592694904849601, + "grad_norm": 0.18430690467357635, + "learning_rate": 2.7483575309799086e-05, + "loss": 1.6314, + "step": 21479 + }, + { + "epoch": 6.593001841620627, + "grad_norm": 0.26019781827926636, + "learning_rate": 2.7479137392326827e-05, + "loss": 1.7362, + "step": 21480 + }, + { + "epoch": 6.593308778391651, + "grad_norm": 0.2103995382785797, + "learning_rate": 2.7474699697418936e-05, + "loss": 1.7137, + "step": 21481 + }, + { + "epoch": 6.593615715162676, + "grad_norm": 0.220427006483078, + "learning_rate": 2.747026222511928e-05, + "loss": 1.7323, + "step": 21482 + }, + { + "epoch": 6.593922651933702, + "grad_norm": 0.21523109078407288, + "learning_rate": 2.7465824975471693e-05, + "loss": 1.7572, + "step": 21483 + }, + { + "epoch": 6.594229588704727, + "grad_norm": 0.21639512479305267, + "learning_rate": 2.7461387948520033e-05, + "loss": 1.7275, + "step": 21484 + }, + { + "epoch": 6.5945365254757515, + "grad_norm": 0.2043544203042984, + "learning_rate": 2.7456951144308147e-05, + "loss": 1.7454, + "step": 21485 + }, + { + "epoch": 6.594843462246777, + "grad_norm": 0.17847217619419098, + "learning_rate": 2.7452514562879882e-05, + "loss": 1.7356, + "step": 21486 + }, + { + "epoch": 6.595150399017802, + "grad_norm": 0.20756758749485016, + "learning_rate": 2.744807820427908e-05, + "loss": 1.7557, + "step": 21487 + }, + { + "epoch": 6.5954573357888275, + "grad_norm": 0.23579071462154388, + "learning_rate": 2.744364206854959e-05, + "loss": 1.7855, + "step": 21488 + }, + { + "epoch": 6.595764272559853, + "grad_norm": 0.1947307586669922, + "learning_rate": 2.7439206155735254e-05, + "loss": 1.7105, + "step": 21489 + }, + { + "epoch": 6.596071209330878, + "grad_norm": 0.1900642365217209, + "learning_rate": 
2.74347704658799e-05, + "loss": 1.6692, + "step": 21490 + }, + { + "epoch": 6.596378146101903, + "grad_norm": 0.16756244003772736, + "learning_rate": 2.7430334999027375e-05, + "loss": 1.7175, + "step": 21491 + }, + { + "epoch": 6.596685082872928, + "grad_norm": 0.18581146001815796, + "learning_rate": 2.7425899755221506e-05, + "loss": 1.72, + "step": 21492 + }, + { + "epoch": 6.596992019643953, + "grad_norm": 0.2384853959083557, + "learning_rate": 2.7421464734506107e-05, + "loss": 1.718, + "step": 21493 + }, + { + "epoch": 6.597298956414979, + "grad_norm": 0.16853606700897217, + "learning_rate": 2.7417029936925065e-05, + "loss": 1.6819, + "step": 21494 + }, + { + "epoch": 6.597605893186004, + "grad_norm": 0.2273230254650116, + "learning_rate": 2.741259536252213e-05, + "loss": 1.7158, + "step": 21495 + }, + { + "epoch": 6.597912829957028, + "grad_norm": 0.2291530966758728, + "learning_rate": 2.7408161011341205e-05, + "loss": 1.7804, + "step": 21496 + }, + { + "epoch": 6.598219766728054, + "grad_norm": 0.17676831781864166, + "learning_rate": 2.740372688342604e-05, + "loss": 1.6693, + "step": 21497 + }, + { + "epoch": 6.598526703499079, + "grad_norm": 0.2386767417192459, + "learning_rate": 2.7399292978820508e-05, + "loss": 1.6932, + "step": 21498 + }, + { + "epoch": 6.598833640270104, + "grad_norm": 0.21329782903194427, + "learning_rate": 2.739485929756841e-05, + "loss": 1.7811, + "step": 21499 + }, + { + "epoch": 6.59914057704113, + "grad_norm": 0.19382116198539734, + "learning_rate": 2.7390425839713556e-05, + "loss": 1.7152, + "step": 21500 + }, + { + "epoch": 6.599447513812155, + "grad_norm": 0.1819920688867569, + "learning_rate": 2.738599260529977e-05, + "loss": 1.6571, + "step": 21501 + }, + { + "epoch": 6.5997544505831796, + "grad_norm": 0.19947806000709534, + "learning_rate": 2.738155959437086e-05, + "loss": 1.7138, + "step": 21502 + }, + { + "epoch": 6.600061387354205, + "grad_norm": 0.1851014792919159, + "learning_rate": 2.7377126806970634e-05, + "loss": 
1.7109, + "step": 21503 + }, + { + "epoch": 6.60036832412523, + "grad_norm": 0.20365974307060242, + "learning_rate": 2.7372694243142905e-05, + "loss": 1.7145, + "step": 21504 + }, + { + "epoch": 6.600675260896256, + "grad_norm": 0.2070893943309784, + "learning_rate": 2.736826190293147e-05, + "loss": 1.7172, + "step": 21505 + }, + { + "epoch": 6.600982197667281, + "grad_norm": 0.19077777862548828, + "learning_rate": 2.7363829786380136e-05, + "loss": 1.7059, + "step": 21506 + }, + { + "epoch": 6.601289134438305, + "grad_norm": 0.21168744564056396, + "learning_rate": 2.73593978935327e-05, + "loss": 1.7483, + "step": 21507 + }, + { + "epoch": 6.601596071209331, + "grad_norm": 0.20746631920337677, + "learning_rate": 2.7354966224432965e-05, + "loss": 1.7165, + "step": 21508 + }, + { + "epoch": 6.601903007980356, + "grad_norm": 0.19440631568431854, + "learning_rate": 2.7350534779124732e-05, + "loss": 1.694, + "step": 21509 + }, + { + "epoch": 6.602209944751381, + "grad_norm": 0.20699405670166016, + "learning_rate": 2.7346103557651765e-05, + "loss": 1.7077, + "step": 21510 + }, + { + "epoch": 6.602516881522407, + "grad_norm": 0.19856512546539307, + "learning_rate": 2.7341672560057917e-05, + "loss": 1.77, + "step": 21511 + }, + { + "epoch": 6.602823818293432, + "grad_norm": 0.23978421092033386, + "learning_rate": 2.7337241786386915e-05, + "loss": 1.7531, + "step": 21512 + }, + { + "epoch": 6.6031307550644565, + "grad_norm": 0.1834867000579834, + "learning_rate": 2.73328112366826e-05, + "loss": 1.751, + "step": 21513 + }, + { + "epoch": 6.603437691835482, + "grad_norm": 0.2154606282711029, + "learning_rate": 2.7328380910988694e-05, + "loss": 1.737, + "step": 21514 + }, + { + "epoch": 6.603744628606507, + "grad_norm": 0.20554645359516144, + "learning_rate": 2.7323950809349035e-05, + "loss": 1.7629, + "step": 21515 + }, + { + "epoch": 6.6040515653775325, + "grad_norm": 0.20497548580169678, + "learning_rate": 2.7319520931807386e-05, + "loss": 1.7001, + "step": 21516 + }, + { + 
"epoch": 6.604358502148557, + "grad_norm": 0.18628253042697906, + "learning_rate": 2.7315091278407523e-05, + "loss": 1.7477, + "step": 21517 + }, + { + "epoch": 6.604665438919582, + "grad_norm": 0.20788705348968506, + "learning_rate": 2.731066184919323e-05, + "loss": 1.7185, + "step": 21518 + }, + { + "epoch": 6.604972375690608, + "grad_norm": 0.17834967374801636, + "learning_rate": 2.730623264420827e-05, + "loss": 1.67, + "step": 21519 + }, + { + "epoch": 6.605279312461633, + "grad_norm": 0.2183784693479538, + "learning_rate": 2.7301803663496417e-05, + "loss": 1.6983, + "step": 21520 + }, + { + "epoch": 6.605586249232658, + "grad_norm": 0.1735544204711914, + "learning_rate": 2.7297374907101447e-05, + "loss": 1.7352, + "step": 21521 + }, + { + "epoch": 6.605893186003684, + "grad_norm": 0.2504538893699646, + "learning_rate": 2.729294637506713e-05, + "loss": 1.7332, + "step": 21522 + }, + { + "epoch": 6.606200122774708, + "grad_norm": 0.1801074892282486, + "learning_rate": 2.728851806743722e-05, + "loss": 1.7251, + "step": 21523 + }, + { + "epoch": 6.606507059545733, + "grad_norm": 0.25701379776000977, + "learning_rate": 2.728408998425549e-05, + "loss": 1.732, + "step": 21524 + }, + { + "epoch": 6.606813996316759, + "grad_norm": 0.1801779717206955, + "learning_rate": 2.7279662125565697e-05, + "loss": 1.6793, + "step": 21525 + }, + { + "epoch": 6.607120933087784, + "grad_norm": 0.21244947612285614, + "learning_rate": 2.7275234491411595e-05, + "loss": 1.7493, + "step": 21526 + }, + { + "epoch": 6.607427869858809, + "grad_norm": 0.20944559574127197, + "learning_rate": 2.7270807081836924e-05, + "loss": 1.722, + "step": 21527 + }, + { + "epoch": 6.607734806629834, + "grad_norm": 0.2526783049106598, + "learning_rate": 2.7266379896885508e-05, + "loss": 1.7628, + "step": 21528 + }, + { + "epoch": 6.608041743400859, + "grad_norm": 0.19788937270641327, + "learning_rate": 2.7261952936601002e-05, + "loss": 1.6538, + "step": 21529 + }, + { + "epoch": 6.6083486801718845, + 
"grad_norm": 0.2623229920864105, + "learning_rate": 2.725752620102725e-05, + "loss": 1.7694, + "step": 21530 + }, + { + "epoch": 6.60865561694291, + "grad_norm": 0.21503256261348724, + "learning_rate": 2.7253099690207913e-05, + "loss": 1.7553, + "step": 21531 + }, + { + "epoch": 6.608962553713935, + "grad_norm": 0.2114928811788559, + "learning_rate": 2.724867340418679e-05, + "loss": 1.7067, + "step": 21532 + }, + { + "epoch": 6.6092694904849605, + "grad_norm": 0.17945198714733124, + "learning_rate": 2.7244247343007623e-05, + "loss": 1.7419, + "step": 21533 + }, + { + "epoch": 6.609576427255985, + "grad_norm": 0.19239214062690735, + "learning_rate": 2.7239821506714137e-05, + "loss": 1.7644, + "step": 21534 + }, + { + "epoch": 6.60988336402701, + "grad_norm": 0.22906997799873352, + "learning_rate": 2.7235395895350068e-05, + "loss": 1.8063, + "step": 21535 + }, + { + "epoch": 6.610190300798036, + "grad_norm": 0.1965717375278473, + "learning_rate": 2.7230970508959162e-05, + "loss": 1.7841, + "step": 21536 + }, + { + "epoch": 6.610497237569061, + "grad_norm": 0.19944418966770172, + "learning_rate": 2.7226545347585158e-05, + "loss": 1.7382, + "step": 21537 + }, + { + "epoch": 6.610804174340086, + "grad_norm": 0.17155805230140686, + "learning_rate": 2.722212041127178e-05, + "loss": 1.6621, + "step": 21538 + }, + { + "epoch": 6.611111111111111, + "grad_norm": 0.20459938049316406, + "learning_rate": 2.721769570006275e-05, + "loss": 1.7481, + "step": 21539 + }, + { + "epoch": 6.611418047882136, + "grad_norm": 0.1991354376077652, + "learning_rate": 2.7213271214001813e-05, + "loss": 1.7874, + "step": 21540 + }, + { + "epoch": 6.611724984653161, + "grad_norm": 0.25073128938674927, + "learning_rate": 2.7208846953132682e-05, + "loss": 1.7921, + "step": 21541 + }, + { + "epoch": 6.612031921424187, + "grad_norm": 0.24456258118152618, + "learning_rate": 2.7204422917499085e-05, + "loss": 1.7564, + "step": 21542 + }, + { + "epoch": 6.612338858195212, + "grad_norm": 
0.18416531383991241, + "learning_rate": 2.7199999107144736e-05, + "loss": 1.7247, + "step": 21543 + }, + { + "epoch": 6.612645794966237, + "grad_norm": 0.18439221382141113, + "learning_rate": 2.7195575522113347e-05, + "loss": 1.6607, + "step": 21544 + }, + { + "epoch": 6.612952731737262, + "grad_norm": 0.20334671437740326, + "learning_rate": 2.7191152162448685e-05, + "loss": 1.7487, + "step": 21545 + }, + { + "epoch": 6.613259668508287, + "grad_norm": 0.17871633172035217, + "learning_rate": 2.718672902819438e-05, + "loss": 1.7355, + "step": 21546 + }, + { + "epoch": 6.6135666052793125, + "grad_norm": 0.23006688058376312, + "learning_rate": 2.718230611939424e-05, + "loss": 1.6489, + "step": 21547 + }, + { + "epoch": 6.613873542050338, + "grad_norm": 0.19141538441181183, + "learning_rate": 2.7177883436091877e-05, + "loss": 1.6793, + "step": 21548 + }, + { + "epoch": 6.614180478821363, + "grad_norm": 0.20549756288528442, + "learning_rate": 2.7173460978331068e-05, + "loss": 1.8331, + "step": 21549 + }, + { + "epoch": 6.614487415592388, + "grad_norm": 0.19106455147266388, + "learning_rate": 2.7169038746155495e-05, + "loss": 1.7295, + "step": 21550 + }, + { + "epoch": 6.614794352363413, + "grad_norm": 0.20190143585205078, + "learning_rate": 2.7164616739608866e-05, + "loss": 1.7032, + "step": 21551 + }, + { + "epoch": 6.615101289134438, + "grad_norm": 0.1969708949327469, + "learning_rate": 2.716019495873488e-05, + "loss": 1.6935, + "step": 21552 + }, + { + "epoch": 6.615408225905464, + "grad_norm": 0.23748311400413513, + "learning_rate": 2.7155773403577235e-05, + "loss": 1.7942, + "step": 21553 + }, + { + "epoch": 6.615715162676489, + "grad_norm": 0.29168081283569336, + "learning_rate": 2.715135207417962e-05, + "loss": 1.7121, + "step": 21554 + }, + { + "epoch": 6.616022099447514, + "grad_norm": 0.2428344041109085, + "learning_rate": 2.7146930970585738e-05, + "loss": 1.7287, + "step": 21555 + }, + { + "epoch": 6.616329036218539, + "grad_norm": 0.2520657479763031, + 
"learning_rate": 2.714251009283928e-05, + "loss": 1.8462, + "step": 21556 + }, + { + "epoch": 6.616635972989564, + "grad_norm": 0.2426053285598755, + "learning_rate": 2.713808944098394e-05, + "loss": 1.7094, + "step": 21557 + }, + { + "epoch": 6.616942909760589, + "grad_norm": 0.17593255639076233, + "learning_rate": 2.713366901506339e-05, + "loss": 1.6891, + "step": 21558 + }, + { + "epoch": 6.617249846531615, + "grad_norm": 0.20620940625667572, + "learning_rate": 2.7129248815121332e-05, + "loss": 1.7277, + "step": 21559 + }, + { + "epoch": 6.617556783302639, + "grad_norm": 0.21467719972133636, + "learning_rate": 2.7124828841201445e-05, + "loss": 1.7543, + "step": 21560 + }, + { + "epoch": 6.6178637200736645, + "grad_norm": 0.21372607350349426, + "learning_rate": 2.7120409093347378e-05, + "loss": 1.7207, + "step": 21561 + }, + { + "epoch": 6.61817065684469, + "grad_norm": 0.2123684585094452, + "learning_rate": 2.7115989571602884e-05, + "loss": 1.71, + "step": 21562 + }, + { + "epoch": 6.618477593615715, + "grad_norm": 0.19155478477478027, + "learning_rate": 2.711157027601155e-05, + "loss": 1.7182, + "step": 21563 + }, + { + "epoch": 6.6187845303867405, + "grad_norm": 0.23053184151649475, + "learning_rate": 2.7107151206617136e-05, + "loss": 1.7147, + "step": 21564 + }, + { + "epoch": 6.619091467157766, + "grad_norm": 0.1635691374540329, + "learning_rate": 2.7102732363463235e-05, + "loss": 1.6913, + "step": 21565 + }, + { + "epoch": 6.61939840392879, + "grad_norm": 0.19415298104286194, + "learning_rate": 2.709831374659357e-05, + "loss": 1.6813, + "step": 21566 + }, + { + "epoch": 6.619705340699816, + "grad_norm": 0.19547943770885468, + "learning_rate": 2.709389535605179e-05, + "loss": 1.6988, + "step": 21567 + }, + { + "epoch": 6.620012277470841, + "grad_norm": 0.1921805888414383, + "learning_rate": 2.7089477191881564e-05, + "loss": 1.6931, + "step": 21568 + }, + { + "epoch": 6.620319214241866, + "grad_norm": 0.18463274836540222, + "learning_rate": 
2.7085059254126554e-05, + "loss": 1.7168, + "step": 21569 + }, + { + "epoch": 6.620626151012892, + "grad_norm": 0.2078532725572586, + "learning_rate": 2.7080641542830414e-05, + "loss": 1.7248, + "step": 21570 + }, + { + "epoch": 6.620933087783916, + "grad_norm": 0.18778283894062042, + "learning_rate": 2.7076224058036813e-05, + "loss": 1.6745, + "step": 21571 + }, + { + "epoch": 6.621240024554941, + "grad_norm": 0.26190707087516785, + "learning_rate": 2.70718067997894e-05, + "loss": 1.7317, + "step": 21572 + }, + { + "epoch": 6.621546961325967, + "grad_norm": 0.20449557900428772, + "learning_rate": 2.7067389768131836e-05, + "loss": 1.7167, + "step": 21573 + }, + { + "epoch": 6.621853898096992, + "grad_norm": 0.22722119092941284, + "learning_rate": 2.706297296310776e-05, + "loss": 1.7262, + "step": 21574 + }, + { + "epoch": 6.622160834868017, + "grad_norm": 0.24897173047065735, + "learning_rate": 2.7058556384760825e-05, + "loss": 1.7273, + "step": 21575 + }, + { + "epoch": 6.622467771639043, + "grad_norm": 0.19774340093135834, + "learning_rate": 2.705414003313469e-05, + "loss": 1.6765, + "step": 21576 + }, + { + "epoch": 6.622774708410067, + "grad_norm": 0.2661767303943634, + "learning_rate": 2.7049723908272995e-05, + "loss": 1.7046, + "step": 21577 + }, + { + "epoch": 6.6230816451810925, + "grad_norm": 0.2013266384601593, + "learning_rate": 2.7045308010219356e-05, + "loss": 1.7156, + "step": 21578 + }, + { + "epoch": 6.623388581952118, + "grad_norm": 0.22952915728092194, + "learning_rate": 2.7040892339017475e-05, + "loss": 1.7601, + "step": 21579 + }, + { + "epoch": 6.623695518723143, + "grad_norm": 0.18262411653995514, + "learning_rate": 2.7036476894710916e-05, + "loss": 1.7334, + "step": 21580 + }, + { + "epoch": 6.6240024554941686, + "grad_norm": 0.18907666206359863, + "learning_rate": 2.703206167734339e-05, + "loss": 1.7196, + "step": 21581 + }, + { + "epoch": 6.624309392265193, + "grad_norm": 0.2192571759223938, + "learning_rate": 2.7027646686958453e-05, + 
"loss": 1.7046, + "step": 21582 + }, + { + "epoch": 6.624616329036218, + "grad_norm": 0.165769562125206, + "learning_rate": 2.70232319235998e-05, + "loss": 1.7028, + "step": 21583 + }, + { + "epoch": 6.624923265807244, + "grad_norm": 0.19245828688144684, + "learning_rate": 2.701881738731103e-05, + "loss": 1.7153, + "step": 21584 + }, + { + "epoch": 6.625230202578269, + "grad_norm": 0.17638756334781647, + "learning_rate": 2.7014403078135776e-05, + "loss": 1.7071, + "step": 21585 + }, + { + "epoch": 6.625537139349294, + "grad_norm": 0.17205210030078888, + "learning_rate": 2.700998899611767e-05, + "loss": 1.6706, + "step": 21586 + }, + { + "epoch": 6.62584407612032, + "grad_norm": 0.24107681214809418, + "learning_rate": 2.700557514130032e-05, + "loss": 1.8013, + "step": 21587 + }, + { + "epoch": 6.626151012891344, + "grad_norm": 0.1839917004108429, + "learning_rate": 2.7001161513727358e-05, + "loss": 1.7381, + "step": 21588 + }, + { + "epoch": 6.6264579496623695, + "grad_norm": 0.24043352901935577, + "learning_rate": 2.6996748113442394e-05, + "loss": 1.7523, + "step": 21589 + }, + { + "epoch": 6.626764886433395, + "grad_norm": 0.23488068580627441, + "learning_rate": 2.6992334940489056e-05, + "loss": 1.7587, + "step": 21590 + }, + { + "epoch": 6.62707182320442, + "grad_norm": 0.18784530460834503, + "learning_rate": 2.698792199491094e-05, + "loss": 1.7053, + "step": 21591 + }, + { + "epoch": 6.627378759975445, + "grad_norm": 0.2758429944515228, + "learning_rate": 2.6983509276751673e-05, + "loss": 1.6927, + "step": 21592 + }, + { + "epoch": 6.62768569674647, + "grad_norm": 0.2731272280216217, + "learning_rate": 2.697909678605486e-05, + "loss": 1.7351, + "step": 21593 + }, + { + "epoch": 6.627992633517495, + "grad_norm": 0.24450576305389404, + "learning_rate": 2.6974684522864098e-05, + "loss": 1.7126, + "step": 21594 + }, + { + "epoch": 6.628299570288521, + "grad_norm": 0.21820391714572906, + "learning_rate": 2.6970272487222982e-05, + "loss": 1.7075, + "step": 21595 + }, 
+ { + "epoch": 6.628606507059546, + "grad_norm": 0.23647959530353546, + "learning_rate": 2.696586067917517e-05, + "loss": 1.7369, + "step": 21596 + }, + { + "epoch": 6.628913443830571, + "grad_norm": 0.2665121555328369, + "learning_rate": 2.696144909876419e-05, + "loss": 1.7575, + "step": 21597 + }, + { + "epoch": 6.629220380601596, + "grad_norm": 0.19871680438518524, + "learning_rate": 2.695703774603371e-05, + "loss": 1.7334, + "step": 21598 + }, + { + "epoch": 6.629527317372621, + "grad_norm": 0.2363109588623047, + "learning_rate": 2.6952626621027245e-05, + "loss": 1.6878, + "step": 21599 + }, + { + "epoch": 6.629834254143646, + "grad_norm": 0.21958591043949127, + "learning_rate": 2.694821572378845e-05, + "loss": 1.6828, + "step": 21600 + }, + { + "epoch": 6.630141190914672, + "grad_norm": 0.20437858998775482, + "learning_rate": 2.6943805054360906e-05, + "loss": 1.7138, + "step": 21601 + }, + { + "epoch": 6.630448127685697, + "grad_norm": 0.27741923928260803, + "learning_rate": 2.6939394612788193e-05, + "loss": 1.7506, + "step": 21602 + }, + { + "epoch": 6.6307550644567215, + "grad_norm": 0.1885133981704712, + "learning_rate": 2.6934984399113917e-05, + "loss": 1.7669, + "step": 21603 + }, + { + "epoch": 6.631062001227747, + "grad_norm": 0.19453810155391693, + "learning_rate": 2.6930574413381604e-05, + "loss": 1.6837, + "step": 21604 + }, + { + "epoch": 6.631368937998772, + "grad_norm": 0.1685735285282135, + "learning_rate": 2.6926164655634894e-05, + "loss": 1.7045, + "step": 21605 + }, + { + "epoch": 6.6316758747697975, + "grad_norm": 0.2507462203502655, + "learning_rate": 2.6921755125917347e-05, + "loss": 1.7754, + "step": 21606 + }, + { + "epoch": 6.631982811540823, + "grad_norm": 0.1725471317768097, + "learning_rate": 2.691734582427255e-05, + "loss": 1.7219, + "step": 21607 + }, + { + "epoch": 6.632289748311848, + "grad_norm": 0.2633528709411621, + "learning_rate": 2.6912936750744068e-05, + "loss": 1.7362, + "step": 21608 + }, + { + "epoch": 6.632596685082873, 
+ "grad_norm": 0.1808360069990158, + "learning_rate": 2.6908527905375474e-05, + "loss": 1.7338, + "step": 21609 + }, + { + "epoch": 6.632903621853898, + "grad_norm": 0.16186563670635223, + "learning_rate": 2.6904119288210344e-05, + "loss": 1.6752, + "step": 21610 + }, + { + "epoch": 6.633210558624923, + "grad_norm": 0.1954091340303421, + "learning_rate": 2.689971089929224e-05, + "loss": 1.714, + "step": 21611 + }, + { + "epoch": 6.633517495395949, + "grad_norm": 0.18954069912433624, + "learning_rate": 2.689530273866474e-05, + "loss": 1.7869, + "step": 21612 + }, + { + "epoch": 6.633824432166974, + "grad_norm": 0.182058185338974, + "learning_rate": 2.6890894806371392e-05, + "loss": 1.7708, + "step": 21613 + }, + { + "epoch": 6.634131368937998, + "grad_norm": 0.17313501238822937, + "learning_rate": 2.6886487102455755e-05, + "loss": 1.7064, + "step": 21614 + }, + { + "epoch": 6.634438305709024, + "grad_norm": 0.1732148379087448, + "learning_rate": 2.688207962696143e-05, + "loss": 1.7378, + "step": 21615 + }, + { + "epoch": 6.634745242480049, + "grad_norm": 0.17057274281978607, + "learning_rate": 2.687767237993191e-05, + "loss": 1.671, + "step": 21616 + }, + { + "epoch": 6.635052179251074, + "grad_norm": 0.17723220586776733, + "learning_rate": 2.6873265361410805e-05, + "loss": 1.7179, + "step": 21617 + }, + { + "epoch": 6.6353591160221, + "grad_norm": 0.18634437024593353, + "learning_rate": 2.6868858571441645e-05, + "loss": 1.7355, + "step": 21618 + }, + { + "epoch": 6.635666052793125, + "grad_norm": 0.205010786652565, + "learning_rate": 2.6864452010067985e-05, + "loss": 1.7399, + "step": 21619 + }, + { + "epoch": 6.6359729895641495, + "grad_norm": 0.2071879357099533, + "learning_rate": 2.6860045677333383e-05, + "loss": 1.7199, + "step": 21620 + }, + { + "epoch": 6.636279926335175, + "grad_norm": 0.17309685051441193, + "learning_rate": 2.685563957328134e-05, + "loss": 1.6595, + "step": 21621 + }, + { + "epoch": 6.6365868631062, + "grad_norm": 0.3505750000476837, + 
"learning_rate": 2.685123369795545e-05, + "loss": 1.7601, + "step": 21622 + }, + { + "epoch": 6.6368937998772255, + "grad_norm": 0.19184419512748718, + "learning_rate": 2.684682805139923e-05, + "loss": 1.7225, + "step": 21623 + }, + { + "epoch": 6.637200736648251, + "grad_norm": 0.20142409205436707, + "learning_rate": 2.6842422633656233e-05, + "loss": 1.7201, + "step": 21624 + }, + { + "epoch": 6.637507673419275, + "grad_norm": 0.18348537385463715, + "learning_rate": 2.6838017444769993e-05, + "loss": 1.6983, + "step": 21625 + }, + { + "epoch": 6.637814610190301, + "grad_norm": 0.19275228679180145, + "learning_rate": 2.6833612484784033e-05, + "loss": 1.7028, + "step": 21626 + }, + { + "epoch": 6.638121546961326, + "grad_norm": 0.21269574761390686, + "learning_rate": 2.682920775374189e-05, + "loss": 1.7888, + "step": 21627 + }, + { + "epoch": 6.638428483732351, + "grad_norm": 0.17470422387123108, + "learning_rate": 2.68248032516871e-05, + "loss": 1.7147, + "step": 21628 + }, + { + "epoch": 6.638735420503377, + "grad_norm": 0.15697288513183594, + "learning_rate": 2.6820398978663185e-05, + "loss": 1.6544, + "step": 21629 + }, + { + "epoch": 6.639042357274402, + "grad_norm": 0.18636487424373627, + "learning_rate": 2.6815994934713677e-05, + "loss": 1.721, + "step": 21630 + }, + { + "epoch": 6.639349294045426, + "grad_norm": 0.18091215193271637, + "learning_rate": 2.681159111988208e-05, + "loss": 1.6973, + "step": 21631 + }, + { + "epoch": 6.639656230816452, + "grad_norm": 0.21360217034816742, + "learning_rate": 2.6807187534211965e-05, + "loss": 1.7379, + "step": 21632 + }, + { + "epoch": 6.639963167587477, + "grad_norm": 0.20027592778205872, + "learning_rate": 2.6802784177746777e-05, + "loss": 1.7207, + "step": 21633 + }, + { + "epoch": 6.640270104358502, + "grad_norm": 0.21839644014835358, + "learning_rate": 2.679838105053011e-05, + "loss": 1.715, + "step": 21634 + }, + { + "epoch": 6.640577041129527, + "grad_norm": 0.19237302243709564, + "learning_rate": 
2.6793978152605404e-05, + "loss": 1.7415, + "step": 21635 + }, + { + "epoch": 6.640883977900552, + "grad_norm": 0.1979883313179016, + "learning_rate": 2.678957548401623e-05, + "loss": 1.7005, + "step": 21636 + }, + { + "epoch": 6.6411909146715775, + "grad_norm": 0.21867144107818604, + "learning_rate": 2.678517304480609e-05, + "loss": 1.8008, + "step": 21637 + }, + { + "epoch": 6.641497851442603, + "grad_norm": 0.17232954502105713, + "learning_rate": 2.6780770835018433e-05, + "loss": 1.6867, + "step": 21638 + }, + { + "epoch": 6.641804788213628, + "grad_norm": 0.21535196900367737, + "learning_rate": 2.6776368854696853e-05, + "loss": 1.7545, + "step": 21639 + }, + { + "epoch": 6.6421117249846535, + "grad_norm": 0.18891240656375885, + "learning_rate": 2.6771967103884766e-05, + "loss": 1.7164, + "step": 21640 + }, + { + "epoch": 6.642418661755678, + "grad_norm": 0.2558320462703705, + "learning_rate": 2.6767565582625743e-05, + "loss": 1.8125, + "step": 21641 + }, + { + "epoch": 6.642725598526703, + "grad_norm": 0.20400027930736542, + "learning_rate": 2.6763164290963244e-05, + "loss": 1.7335, + "step": 21642 + }, + { + "epoch": 6.643032535297729, + "grad_norm": 0.21388766169548035, + "learning_rate": 2.6758763228940775e-05, + "loss": 1.7788, + "step": 21643 + }, + { + "epoch": 6.643339472068754, + "grad_norm": 0.20607435703277588, + "learning_rate": 2.6754362396601834e-05, + "loss": 1.7481, + "step": 21644 + }, + { + "epoch": 6.643646408839779, + "grad_norm": 0.1608831286430359, + "learning_rate": 2.6749961793989907e-05, + "loss": 1.6577, + "step": 21645 + }, + { + "epoch": 6.643953345610804, + "grad_norm": 0.19074808061122894, + "learning_rate": 2.6745561421148485e-05, + "loss": 1.7335, + "step": 21646 + }, + { + "epoch": 6.644260282381829, + "grad_norm": 0.16517756879329681, + "learning_rate": 2.6741161278121053e-05, + "loss": 1.6663, + "step": 21647 + }, + { + "epoch": 6.644567219152854, + "grad_norm": 0.18976998329162598, + "learning_rate": 2.673676136495108e-05, + 
"loss": 1.7231, + "step": 21648 + }, + { + "epoch": 6.64487415592388, + "grad_norm": 0.20694875717163086, + "learning_rate": 2.6732361681682106e-05, + "loss": 1.7469, + "step": 21649 + }, + { + "epoch": 6.645181092694905, + "grad_norm": 0.1994311809539795, + "learning_rate": 2.6727962228357533e-05, + "loss": 1.6864, + "step": 21650 + }, + { + "epoch": 6.64548802946593, + "grad_norm": 0.18886511027812958, + "learning_rate": 2.672356300502091e-05, + "loss": 1.6874, + "step": 21651 + }, + { + "epoch": 6.645794966236955, + "grad_norm": 0.2152819186449051, + "learning_rate": 2.6719164011715653e-05, + "loss": 1.7327, + "step": 21652 + }, + { + "epoch": 6.64610190300798, + "grad_norm": 0.20525617897510529, + "learning_rate": 2.6714765248485275e-05, + "loss": 1.7409, + "step": 21653 + }, + { + "epoch": 6.6464088397790055, + "grad_norm": 0.21892790496349335, + "learning_rate": 2.6710366715373254e-05, + "loss": 1.7281, + "step": 21654 + }, + { + "epoch": 6.646715776550031, + "grad_norm": 0.20156462490558624, + "learning_rate": 2.6705968412423e-05, + "loss": 1.7211, + "step": 21655 + }, + { + "epoch": 6.647022713321056, + "grad_norm": 0.19993625581264496, + "learning_rate": 2.670157033967806e-05, + "loss": 1.8058, + "step": 21656 + }, + { + "epoch": 6.647329650092081, + "grad_norm": 0.1970909684896469, + "learning_rate": 2.669717249718182e-05, + "loss": 1.7707, + "step": 21657 + }, + { + "epoch": 6.647636586863106, + "grad_norm": 0.19287796318531036, + "learning_rate": 2.6692774884977796e-05, + "loss": 1.688, + "step": 21658 + }, + { + "epoch": 6.647943523634131, + "grad_norm": 0.17658226191997528, + "learning_rate": 2.668837750310943e-05, + "loss": 1.6936, + "step": 21659 + }, + { + "epoch": 6.648250460405157, + "grad_norm": 0.20234479010105133, + "learning_rate": 2.6683980351620184e-05, + "loss": 1.7069, + "step": 21660 + }, + { + "epoch": 6.648557397176182, + "grad_norm": 0.1957871913909912, + "learning_rate": 2.6679583430553513e-05, + "loss": 1.736, + "step": 21661 + }, + 
{ + "epoch": 6.648864333947207, + "grad_norm": 0.20084553956985474, + "learning_rate": 2.667518673995286e-05, + "loss": 1.7262, + "step": 21662 + }, + { + "epoch": 6.649171270718232, + "grad_norm": 0.18749211728572845, + "learning_rate": 2.667079027986169e-05, + "loss": 1.7127, + "step": 21663 + }, + { + "epoch": 6.649478207489257, + "grad_norm": 0.1747027188539505, + "learning_rate": 2.666639405032344e-05, + "loss": 1.6922, + "step": 21664 + }, + { + "epoch": 6.649785144260282, + "grad_norm": 0.3119397759437561, + "learning_rate": 2.666199805138154e-05, + "loss": 1.7373, + "step": 21665 + }, + { + "epoch": 6.650092081031308, + "grad_norm": 0.25986436009407043, + "learning_rate": 2.6657602283079498e-05, + "loss": 1.7521, + "step": 21666 + }, + { + "epoch": 6.650399017802332, + "grad_norm": 0.20535705983638763, + "learning_rate": 2.6653206745460663e-05, + "loss": 1.7144, + "step": 21667 + }, + { + "epoch": 6.650705954573358, + "grad_norm": 0.20804347097873688, + "learning_rate": 2.6648811438568566e-05, + "loss": 1.7186, + "step": 21668 + }, + { + "epoch": 6.651012891344383, + "grad_norm": 0.20753289759159088, + "learning_rate": 2.6644416362446566e-05, + "loss": 1.7098, + "step": 21669 + }, + { + "epoch": 6.651319828115408, + "grad_norm": 0.18725311756134033, + "learning_rate": 2.6640021517138148e-05, + "loss": 1.7331, + "step": 21670 + }, + { + "epoch": 6.651626764886434, + "grad_norm": 0.1907210648059845, + "learning_rate": 2.663562690268675e-05, + "loss": 1.6677, + "step": 21671 + }, + { + "epoch": 6.651933701657459, + "grad_norm": 0.19124922156333923, + "learning_rate": 2.6631232519135747e-05, + "loss": 1.7337, + "step": 21672 + }, + { + "epoch": 6.652240638428484, + "grad_norm": 0.21045447885990143, + "learning_rate": 2.6626838366528633e-05, + "loss": 1.7028, + "step": 21673 + }, + { + "epoch": 6.652547575199509, + "grad_norm": 0.1891855001449585, + "learning_rate": 2.6622444444908767e-05, + "loss": 1.7247, + "step": 21674 + }, + { + "epoch": 6.652854511970534, 
+ "grad_norm": 0.2236541211605072, + "learning_rate": 2.6618050754319623e-05, + "loss": 1.6986, + "step": 21675 + }, + { + "epoch": 6.653161448741559, + "grad_norm": 0.19088539481163025, + "learning_rate": 2.6613657294804604e-05, + "loss": 1.7118, + "step": 21676 + }, + { + "epoch": 6.653468385512585, + "grad_norm": 0.26210764050483704, + "learning_rate": 2.660926406640714e-05, + "loss": 1.7542, + "step": 21677 + }, + { + "epoch": 6.653775322283609, + "grad_norm": 0.2564029097557068, + "learning_rate": 2.6604871069170632e-05, + "loss": 1.7395, + "step": 21678 + }, + { + "epoch": 6.6540822590546345, + "grad_norm": 0.22974301874637604, + "learning_rate": 2.6600478303138503e-05, + "loss": 1.6905, + "step": 21679 + }, + { + "epoch": 6.65438919582566, + "grad_norm": 0.299772173166275, + "learning_rate": 2.659608576835416e-05, + "loss": 1.7875, + "step": 21680 + }, + { + "epoch": 6.654696132596685, + "grad_norm": 0.26459556818008423, + "learning_rate": 2.6591693464861018e-05, + "loss": 1.7185, + "step": 21681 + }, + { + "epoch": 6.6550030693677105, + "grad_norm": 0.24505311250686646, + "learning_rate": 2.6587301392702457e-05, + "loss": 1.7105, + "step": 21682 + }, + { + "epoch": 6.655310006138736, + "grad_norm": 0.1626308262348175, + "learning_rate": 2.6582909551921953e-05, + "loss": 1.6668, + "step": 21683 + }, + { + "epoch": 6.65561694290976, + "grad_norm": 0.20354291796684265, + "learning_rate": 2.6578517942562813e-05, + "loss": 1.7437, + "step": 21684 + }, + { + "epoch": 6.655923879680786, + "grad_norm": 0.18618443608283997, + "learning_rate": 2.6574126564668532e-05, + "loss": 1.6757, + "step": 21685 + }, + { + "epoch": 6.656230816451811, + "grad_norm": 0.1863735467195511, + "learning_rate": 2.656973541828242e-05, + "loss": 1.6549, + "step": 21686 + }, + { + "epoch": 6.656537753222836, + "grad_norm": 0.2118620127439499, + "learning_rate": 2.6565344503447935e-05, + "loss": 1.6927, + "step": 21687 + }, + { + "epoch": 6.656844689993862, + "grad_norm": 
0.24023136496543884, + "learning_rate": 2.6560953820208478e-05, + "loss": 1.6969, + "step": 21688 + }, + { + "epoch": 6.657151626764886, + "grad_norm": 0.21124204993247986, + "learning_rate": 2.6556563368607368e-05, + "loss": 1.6662, + "step": 21689 + }, + { + "epoch": 6.657458563535911, + "grad_norm": 0.16295355558395386, + "learning_rate": 2.6552173148688075e-05, + "loss": 1.7203, + "step": 21690 + }, + { + "epoch": 6.657765500306937, + "grad_norm": 0.18650858104228973, + "learning_rate": 2.6547783160493916e-05, + "loss": 1.7177, + "step": 21691 + }, + { + "epoch": 6.658072437077962, + "grad_norm": 0.20509213209152222, + "learning_rate": 2.6543393404068328e-05, + "loss": 1.723, + "step": 21692 + }, + { + "epoch": 6.658379373848987, + "grad_norm": 0.20985513925552368, + "learning_rate": 2.6539003879454678e-05, + "loss": 1.6679, + "step": 21693 + }, + { + "epoch": 6.658686310620013, + "grad_norm": 0.19907233119010925, + "learning_rate": 2.6534614586696338e-05, + "loss": 1.7028, + "step": 21694 + }, + { + "epoch": 6.658993247391037, + "grad_norm": 0.21793772280216217, + "learning_rate": 2.6530225525836692e-05, + "loss": 1.7706, + "step": 21695 + }, + { + "epoch": 6.6593001841620625, + "grad_norm": 0.24162191152572632, + "learning_rate": 2.6525836696919117e-05, + "loss": 1.806, + "step": 21696 + }, + { + "epoch": 6.659607120933088, + "grad_norm": 0.1735360324382782, + "learning_rate": 2.652144809998698e-05, + "loss": 1.7047, + "step": 21697 + }, + { + "epoch": 6.659914057704113, + "grad_norm": 0.18471799790859222, + "learning_rate": 2.651705973508365e-05, + "loss": 1.7306, + "step": 21698 + }, + { + "epoch": 6.6602209944751385, + "grad_norm": 0.17422814667224884, + "learning_rate": 2.6512671602252482e-05, + "loss": 1.6666, + "step": 21699 + }, + { + "epoch": 6.660527931246163, + "grad_norm": 0.19209833443164825, + "learning_rate": 2.6508283701536897e-05, + "loss": 1.6966, + "step": 21700 + }, + { + "epoch": 6.660834868017188, + "grad_norm": 0.1902640461921692, + 
"learning_rate": 2.650389603298019e-05, + "loss": 1.7887, + "step": 21701 + }, + { + "epoch": 6.661141804788214, + "grad_norm": 0.18551218509674072, + "learning_rate": 2.6499508596625787e-05, + "loss": 1.6851, + "step": 21702 + }, + { + "epoch": 6.661448741559239, + "grad_norm": 0.2165011614561081, + "learning_rate": 2.6495121392516976e-05, + "loss": 1.7465, + "step": 21703 + }, + { + "epoch": 6.661755678330264, + "grad_norm": 0.22871245443820953, + "learning_rate": 2.6490734420697172e-05, + "loss": 1.7487, + "step": 21704 + }, + { + "epoch": 6.66206261510129, + "grad_norm": 0.21275551617145538, + "learning_rate": 2.6486347681209723e-05, + "loss": 1.7782, + "step": 21705 + }, + { + "epoch": 6.662369551872314, + "grad_norm": 0.2926945984363556, + "learning_rate": 2.6481961174097937e-05, + "loss": 1.7413, + "step": 21706 + }, + { + "epoch": 6.662676488643339, + "grad_norm": 0.17143094539642334, + "learning_rate": 2.6477574899405233e-05, + "loss": 1.6639, + "step": 21707 + }, + { + "epoch": 6.662983425414365, + "grad_norm": 0.22194001078605652, + "learning_rate": 2.647318885717488e-05, + "loss": 1.7035, + "step": 21708 + }, + { + "epoch": 6.66329036218539, + "grad_norm": 0.18232671916484833, + "learning_rate": 2.6468803047450286e-05, + "loss": 1.6977, + "step": 21709 + }, + { + "epoch": 6.6635972989564145, + "grad_norm": 0.2626599371433258, + "learning_rate": 2.6464417470274773e-05, + "loss": 1.7422, + "step": 21710 + }, + { + "epoch": 6.66390423572744, + "grad_norm": 0.2034282237291336, + "learning_rate": 2.6460032125691668e-05, + "loss": 1.7531, + "step": 21711 + }, + { + "epoch": 6.664211172498465, + "grad_norm": 0.2308860868215561, + "learning_rate": 2.645564701374434e-05, + "loss": 1.7271, + "step": 21712 + }, + { + "epoch": 6.6645181092694905, + "grad_norm": 0.2163545936346054, + "learning_rate": 2.64512621344761e-05, + "loss": 1.7632, + "step": 21713 + }, + { + "epoch": 6.664825046040516, + "grad_norm": 0.2566233277320862, + "learning_rate": 
2.644687748793029e-05, + "loss": 1.7573, + "step": 21714 + }, + { + "epoch": 6.665131982811541, + "grad_norm": 0.21093623340129852, + "learning_rate": 2.6442493074150244e-05, + "loss": 1.6703, + "step": 21715 + }, + { + "epoch": 6.665438919582566, + "grad_norm": 0.2083086222410202, + "learning_rate": 2.643810889317927e-05, + "loss": 1.6672, + "step": 21716 + }, + { + "epoch": 6.665745856353591, + "grad_norm": 0.20711155235767365, + "learning_rate": 2.643372494506075e-05, + "loss": 1.7276, + "step": 21717 + }, + { + "epoch": 6.666052793124616, + "grad_norm": 0.18977457284927368, + "learning_rate": 2.6429341229837935e-05, + "loss": 1.7207, + "step": 21718 + }, + { + "epoch": 6.666359729895642, + "grad_norm": 0.28336507081985474, + "learning_rate": 2.6424957747554224e-05, + "loss": 1.7473, + "step": 21719 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.1761232167482376, + "learning_rate": 2.642057449825286e-05, + "loss": 1.7172, + "step": 21720 + }, + { + "epoch": 6.666973603437691, + "grad_norm": 0.21672405302524567, + "learning_rate": 2.6416191481977215e-05, + "loss": 1.6561, + "step": 21721 + }, + { + "epoch": 6.667280540208717, + "grad_norm": 0.226834237575531, + "learning_rate": 2.6411808698770613e-05, + "loss": 1.7315, + "step": 21722 + }, + { + "epoch": 6.667587476979742, + "grad_norm": 0.22553586959838867, + "learning_rate": 2.6407426148676307e-05, + "loss": 1.7301, + "step": 21723 + }, + { + "epoch": 6.667894413750767, + "grad_norm": 0.1913517564535141, + "learning_rate": 2.6403043831737672e-05, + "loss": 1.6739, + "step": 21724 + }, + { + "epoch": 6.668201350521793, + "grad_norm": 0.24560052156448364, + "learning_rate": 2.6398661747997955e-05, + "loss": 1.7347, + "step": 21725 + }, + { + "epoch": 6.668508287292818, + "grad_norm": 0.27361172437667847, + "learning_rate": 2.6394279897500517e-05, + "loss": 1.7713, + "step": 21726 + }, + { + "epoch": 6.6688152240638425, + "grad_norm": 0.21486583352088928, + "learning_rate": 2.6389898280288638e-05, + 
"loss": 1.7504, + "step": 21727 + }, + { + "epoch": 6.669122160834868, + "grad_norm": 0.19056405127048492, + "learning_rate": 2.6385516896405627e-05, + "loss": 1.7457, + "step": 21728 + }, + { + "epoch": 6.669429097605893, + "grad_norm": 0.19316376745700836, + "learning_rate": 2.638113574589478e-05, + "loss": 1.6969, + "step": 21729 + }, + { + "epoch": 6.6697360343769185, + "grad_norm": 0.21700869500637054, + "learning_rate": 2.637675482879939e-05, + "loss": 1.7055, + "step": 21730 + }, + { + "epoch": 6.670042971147944, + "grad_norm": 0.19720883667469025, + "learning_rate": 2.637237414516275e-05, + "loss": 1.7029, + "step": 21731 + }, + { + "epoch": 6.670349907918968, + "grad_norm": 0.16528408229351044, + "learning_rate": 2.6367993695028158e-05, + "loss": 1.6915, + "step": 21732 + }, + { + "epoch": 6.670656844689994, + "grad_norm": 0.19576294720172882, + "learning_rate": 2.636361347843889e-05, + "loss": 1.7034, + "step": 21733 + }, + { + "epoch": 6.670963781461019, + "grad_norm": 0.16859273612499237, + "learning_rate": 2.6359233495438285e-05, + "loss": 1.7114, + "step": 21734 + }, + { + "epoch": 6.671270718232044, + "grad_norm": 0.20480163395404816, + "learning_rate": 2.6354853746069553e-05, + "loss": 1.7304, + "step": 21735 + }, + { + "epoch": 6.67157765500307, + "grad_norm": 0.19104263186454773, + "learning_rate": 2.6350474230376048e-05, + "loss": 1.7026, + "step": 21736 + }, + { + "epoch": 6.671884591774095, + "grad_norm": 0.18243174254894257, + "learning_rate": 2.634609494840098e-05, + "loss": 1.6769, + "step": 21737 + }, + { + "epoch": 6.672191528545119, + "grad_norm": 0.20766063034534454, + "learning_rate": 2.634171590018769e-05, + "loss": 1.7436, + "step": 21738 + }, + { + "epoch": 6.672498465316145, + "grad_norm": 0.22035297751426697, + "learning_rate": 2.6337337085779444e-05, + "loss": 1.8211, + "step": 21739 + }, + { + "epoch": 6.67280540208717, + "grad_norm": 0.18965984880924225, + "learning_rate": 2.6332958505219475e-05, + "loss": 1.7067, + "step": 
21740 + }, + { + "epoch": 6.673112338858195, + "grad_norm": 0.21209993958473206, + "learning_rate": 2.632858015855111e-05, + "loss": 1.7743, + "step": 21741 + }, + { + "epoch": 6.67341927562922, + "grad_norm": 0.18409015238285065, + "learning_rate": 2.6324202045817547e-05, + "loss": 1.7494, + "step": 21742 + }, + { + "epoch": 6.673726212400245, + "grad_norm": 0.23252969980239868, + "learning_rate": 2.6319824167062125e-05, + "loss": 1.7459, + "step": 21743 + }, + { + "epoch": 6.6740331491712706, + "grad_norm": 0.16296416521072388, + "learning_rate": 2.631544652232808e-05, + "loss": 1.648, + "step": 21744 + }, + { + "epoch": 6.674340085942296, + "grad_norm": 0.2458602488040924, + "learning_rate": 2.631106911165867e-05, + "loss": 1.6847, + "step": 21745 + }, + { + "epoch": 6.674647022713321, + "grad_norm": 0.21203550696372986, + "learning_rate": 2.6306691935097162e-05, + "loss": 1.713, + "step": 21746 + }, + { + "epoch": 6.6749539594843466, + "grad_norm": 0.19969885051250458, + "learning_rate": 2.6302314992686804e-05, + "loss": 1.7445, + "step": 21747 + }, + { + "epoch": 6.675260896255372, + "grad_norm": 0.21001017093658447, + "learning_rate": 2.629793828447087e-05, + "loss": 1.703, + "step": 21748 + }, + { + "epoch": 6.675567833026396, + "grad_norm": 0.18607214093208313, + "learning_rate": 2.6293561810492595e-05, + "loss": 1.6765, + "step": 21749 + }, + { + "epoch": 6.675874769797422, + "grad_norm": 0.21806176006793976, + "learning_rate": 2.6289185570795223e-05, + "loss": 1.7099, + "step": 21750 + }, + { + "epoch": 6.676181706568447, + "grad_norm": 0.1861930787563324, + "learning_rate": 2.6284809565422052e-05, + "loss": 1.6978, + "step": 21751 + }, + { + "epoch": 6.676488643339472, + "grad_norm": 0.18779867887496948, + "learning_rate": 2.6280433794416254e-05, + "loss": 1.7132, + "step": 21752 + }, + { + "epoch": 6.676795580110497, + "grad_norm": 0.18255293369293213, + "learning_rate": 2.627605825782115e-05, + "loss": 1.7045, + "step": 21753 + }, + { + "epoch": 
6.677102516881522, + "grad_norm": 0.22258871793746948, + "learning_rate": 2.6271682955679904e-05, + "loss": 1.7159, + "step": 21754 + }, + { + "epoch": 6.6774094536525475, + "grad_norm": 0.17425768077373505, + "learning_rate": 2.626730788803582e-05, + "loss": 1.6571, + "step": 21755 + }, + { + "epoch": 6.677716390423573, + "grad_norm": 0.1921091377735138, + "learning_rate": 2.6262933054932122e-05, + "loss": 1.8178, + "step": 21756 + }, + { + "epoch": 6.678023327194598, + "grad_norm": 0.16262951493263245, + "learning_rate": 2.6258558456411996e-05, + "loss": 1.6586, + "step": 21757 + }, + { + "epoch": 6.6783302639656235, + "grad_norm": 0.1853780597448349, + "learning_rate": 2.6254184092518752e-05, + "loss": 1.7116, + "step": 21758 + }, + { + "epoch": 6.678637200736648, + "grad_norm": 0.17973974347114563, + "learning_rate": 2.6249809963295536e-05, + "loss": 1.7317, + "step": 21759 + }, + { + "epoch": 6.678944137507673, + "grad_norm": 0.21258050203323364, + "learning_rate": 2.6245436068785634e-05, + "loss": 1.7852, + "step": 21760 + }, + { + "epoch": 6.679251074278699, + "grad_norm": 0.18741287291049957, + "learning_rate": 2.6241062409032262e-05, + "loss": 1.7071, + "step": 21761 + }, + { + "epoch": 6.679558011049724, + "grad_norm": 0.20436155796051025, + "learning_rate": 2.623668898407864e-05, + "loss": 1.7683, + "step": 21762 + }, + { + "epoch": 6.679864947820749, + "grad_norm": 0.18840116262435913, + "learning_rate": 2.6232315793967977e-05, + "loss": 1.7335, + "step": 21763 + }, + { + "epoch": 6.680171884591774, + "grad_norm": 0.1968357264995575, + "learning_rate": 2.62279428387435e-05, + "loss": 1.6848, + "step": 21764 + }, + { + "epoch": 6.680478821362799, + "grad_norm": 0.1774388998746872, + "learning_rate": 2.622357011844844e-05, + "loss": 1.6943, + "step": 21765 + }, + { + "epoch": 6.680785758133824, + "grad_norm": 0.2424328327178955, + "learning_rate": 2.621919763312598e-05, + "loss": 1.7479, + "step": 21766 + }, + { + "epoch": 6.68109269490485, + "grad_norm": 
0.21220771968364716, + "learning_rate": 2.6214825382819353e-05, + "loss": 1.7384, + "step": 21767 + }, + { + "epoch": 6.681399631675875, + "grad_norm": 0.23322279751300812, + "learning_rate": 2.6210453367571764e-05, + "loss": 1.6625, + "step": 21768 + }, + { + "epoch": 6.6817065684469, + "grad_norm": 0.1726260483264923, + "learning_rate": 2.620608158742639e-05, + "loss": 1.7055, + "step": 21769 + }, + { + "epoch": 6.682013505217925, + "grad_norm": 0.25436410307884216, + "learning_rate": 2.6201710042426512e-05, + "loss": 1.7449, + "step": 21770 + }, + { + "epoch": 6.68232044198895, + "grad_norm": 0.20275171101093292, + "learning_rate": 2.619733873261524e-05, + "loss": 1.7575, + "step": 21771 + }, + { + "epoch": 6.6826273787599755, + "grad_norm": 0.24221903085708618, + "learning_rate": 2.6192967658035846e-05, + "loss": 1.7312, + "step": 21772 + }, + { + "epoch": 6.682934315531001, + "grad_norm": 0.30804362893104553, + "learning_rate": 2.6188596818731507e-05, + "loss": 1.7669, + "step": 21773 + }, + { + "epoch": 6.683241252302026, + "grad_norm": 0.1818273365497589, + "learning_rate": 2.6184226214745377e-05, + "loss": 1.7102, + "step": 21774 + }, + { + "epoch": 6.683548189073051, + "grad_norm": 0.28026455640792847, + "learning_rate": 2.6179855846120727e-05, + "loss": 1.7313, + "step": 21775 + }, + { + "epoch": 6.683855125844076, + "grad_norm": 0.26503586769104004, + "learning_rate": 2.6175485712900655e-05, + "loss": 1.7622, + "step": 21776 + }, + { + "epoch": 6.684162062615101, + "grad_norm": 0.19122248888015747, + "learning_rate": 2.6171115815128423e-05, + "loss": 1.7347, + "step": 21777 + }, + { + "epoch": 6.684468999386127, + "grad_norm": 0.18789063394069672, + "learning_rate": 2.6166746152847187e-05, + "loss": 1.7158, + "step": 21778 + }, + { + "epoch": 6.684775936157152, + "grad_norm": 0.17315362393856049, + "learning_rate": 2.6162376726100135e-05, + "loss": 1.6561, + "step": 21779 + }, + { + "epoch": 6.685082872928177, + "grad_norm": 0.20659680664539337, + 
"learning_rate": 2.615800753493045e-05, + "loss": 1.7063, + "step": 21780 + }, + { + "epoch": 6.685389809699202, + "grad_norm": 0.2051183432340622, + "learning_rate": 2.6153638579381307e-05, + "loss": 1.7213, + "step": 21781 + }, + { + "epoch": 6.685696746470227, + "grad_norm": 0.23349207639694214, + "learning_rate": 2.6149269859495884e-05, + "loss": 1.7453, + "step": 21782 + }, + { + "epoch": 6.686003683241252, + "grad_norm": 0.1979275941848755, + "learning_rate": 2.6144901375317355e-05, + "loss": 1.7482, + "step": 21783 + }, + { + "epoch": 6.686310620012278, + "grad_norm": 0.2742067873477936, + "learning_rate": 2.61405331268889e-05, + "loss": 1.7114, + "step": 21784 + }, + { + "epoch": 6.686617556783302, + "grad_norm": 0.18656300008296967, + "learning_rate": 2.6136165114253675e-05, + "loss": 1.7114, + "step": 21785 + }, + { + "epoch": 6.6869244935543275, + "grad_norm": 0.19345268607139587, + "learning_rate": 2.6131797337454834e-05, + "loss": 1.6818, + "step": 21786 + }, + { + "epoch": 6.687231430325353, + "grad_norm": 0.2194962054491043, + "learning_rate": 2.6127429796535597e-05, + "loss": 1.7519, + "step": 21787 + }, + { + "epoch": 6.687538367096378, + "grad_norm": 0.21714645624160767, + "learning_rate": 2.6123062491539054e-05, + "loss": 1.7334, + "step": 21788 + }, + { + "epoch": 6.6878453038674035, + "grad_norm": 0.1684521585702896, + "learning_rate": 2.6118695422508444e-05, + "loss": 1.6843, + "step": 21789 + }, + { + "epoch": 6.688152240638429, + "grad_norm": 0.16155442595481873, + "learning_rate": 2.6114328589486865e-05, + "loss": 1.6541, + "step": 21790 + }, + { + "epoch": 6.688459177409453, + "grad_norm": 0.18483634293079376, + "learning_rate": 2.6109961992517462e-05, + "loss": 1.688, + "step": 21791 + }, + { + "epoch": 6.688766114180479, + "grad_norm": 0.23146624863147736, + "learning_rate": 2.6105595631643466e-05, + "loss": 1.8006, + "step": 21792 + }, + { + "epoch": 6.689073050951504, + "grad_norm": 0.1852748543024063, + "learning_rate": 
2.6101229506907937e-05, + "loss": 1.6624, + "step": 21793 + }, + { + "epoch": 6.689379987722529, + "grad_norm": 0.23809482157230377, + "learning_rate": 2.6096863618354105e-05, + "loss": 1.7313, + "step": 21794 + }, + { + "epoch": 6.689686924493555, + "grad_norm": 0.17145361006259918, + "learning_rate": 2.609249796602503e-05, + "loss": 1.6966, + "step": 21795 + }, + { + "epoch": 6.689993861264579, + "grad_norm": 0.1842796355485916, + "learning_rate": 2.6088132549963933e-05, + "loss": 1.6871, + "step": 21796 + }, + { + "epoch": 6.690300798035604, + "grad_norm": 0.1810201108455658, + "learning_rate": 2.608376737021392e-05, + "loss": 1.7509, + "step": 21797 + }, + { + "epoch": 6.69060773480663, + "grad_norm": 0.20428195595741272, + "learning_rate": 2.607940242681814e-05, + "loss": 1.7102, + "step": 21798 + }, + { + "epoch": 6.690914671577655, + "grad_norm": 0.1659073680639267, + "learning_rate": 2.6075037719819716e-05, + "loss": 1.7053, + "step": 21799 + }, + { + "epoch": 6.69122160834868, + "grad_norm": 0.19351087510585785, + "learning_rate": 2.60706732492618e-05, + "loss": 1.6847, + "step": 21800 + }, + { + "epoch": 6.691528545119706, + "grad_norm": 0.1734616905450821, + "learning_rate": 2.6066309015187517e-05, + "loss": 1.6989, + "step": 21801 + }, + { + "epoch": 6.69183548189073, + "grad_norm": 0.1863887459039688, + "learning_rate": 2.6061945017639995e-05, + "loss": 1.665, + "step": 21802 + }, + { + "epoch": 6.6921424186617555, + "grad_norm": 0.20225204527378082, + "learning_rate": 2.6057581256662344e-05, + "loss": 1.718, + "step": 21803 + }, + { + "epoch": 6.692449355432781, + "grad_norm": 0.22148309648036957, + "learning_rate": 2.605321773229774e-05, + "loss": 1.7801, + "step": 21804 + }, + { + "epoch": 6.692756292203806, + "grad_norm": 0.1870507448911667, + "learning_rate": 2.6048854444589242e-05, + "loss": 1.6613, + "step": 21805 + }, + { + "epoch": 6.6930632289748315, + "grad_norm": 0.18597224354743958, + "learning_rate": 2.604449139358004e-05, + "loss": 
1.7284, + "step": 21806 + }, + { + "epoch": 6.693370165745856, + "grad_norm": 0.2082163542509079, + "learning_rate": 2.6040128579313193e-05, + "loss": 1.7456, + "step": 21807 + }, + { + "epoch": 6.693677102516881, + "grad_norm": 0.22506757080554962, + "learning_rate": 2.603576600183183e-05, + "loss": 1.7369, + "step": 21808 + }, + { + "epoch": 6.693984039287907, + "grad_norm": 0.20707464218139648, + "learning_rate": 2.60314036611791e-05, + "loss": 1.7176, + "step": 21809 + }, + { + "epoch": 6.694290976058932, + "grad_norm": 0.2306852787733078, + "learning_rate": 2.6027041557398053e-05, + "loss": 1.7582, + "step": 21810 + }, + { + "epoch": 6.694597912829957, + "grad_norm": 0.23120234906673431, + "learning_rate": 2.602267969053187e-05, + "loss": 1.7169, + "step": 21811 + }, + { + "epoch": 6.694904849600983, + "grad_norm": 0.24841509759426117, + "learning_rate": 2.6018318060623582e-05, + "loss": 1.7636, + "step": 21812 + }, + { + "epoch": 6.695211786372007, + "grad_norm": 0.22443681955337524, + "learning_rate": 2.601395666771635e-05, + "loss": 1.7465, + "step": 21813 + }, + { + "epoch": 6.695518723143032, + "grad_norm": 0.2905699908733368, + "learning_rate": 2.6009595511853257e-05, + "loss": 1.779, + "step": 21814 + }, + { + "epoch": 6.695825659914058, + "grad_norm": 0.18677717447280884, + "learning_rate": 2.60052345930774e-05, + "loss": 1.711, + "step": 21815 + }, + { + "epoch": 6.696132596685083, + "grad_norm": 0.2150946855545044, + "learning_rate": 2.6000873911431883e-05, + "loss": 1.7254, + "step": 21816 + }, + { + "epoch": 6.696439533456108, + "grad_norm": 0.20066408812999725, + "learning_rate": 2.5996513466959794e-05, + "loss": 1.7198, + "step": 21817 + }, + { + "epoch": 6.696746470227133, + "grad_norm": 0.23815886676311493, + "learning_rate": 2.5992153259704228e-05, + "loss": 1.749, + "step": 21818 + }, + { + "epoch": 6.697053406998158, + "grad_norm": 0.2067428082227707, + "learning_rate": 2.5987793289708273e-05, + "loss": 1.736, + "step": 21819 + }, + { + 
"epoch": 6.6973603437691835, + "grad_norm": 0.2126816362142563, + "learning_rate": 2.5983433557015e-05, + "loss": 1.6804, + "step": 21820 + }, + { + "epoch": 6.697667280540209, + "grad_norm": 0.2003033310174942, + "learning_rate": 2.597907406166756e-05, + "loss": 1.7303, + "step": 21821 + }, + { + "epoch": 6.697974217311234, + "grad_norm": 0.238821879029274, + "learning_rate": 2.5974714803708946e-05, + "loss": 1.7399, + "step": 21822 + }, + { + "epoch": 6.6982811540822595, + "grad_norm": 0.21327996253967285, + "learning_rate": 2.597035578318231e-05, + "loss": 1.766, + "step": 21823 + }, + { + "epoch": 6.698588090853284, + "grad_norm": 0.19689476490020752, + "learning_rate": 2.5965997000130694e-05, + "loss": 1.7621, + "step": 21824 + }, + { + "epoch": 6.698895027624309, + "grad_norm": 0.18349261581897736, + "learning_rate": 2.5961638454597158e-05, + "loss": 1.6339, + "step": 21825 + }, + { + "epoch": 6.699201964395335, + "grad_norm": 0.21475930511951447, + "learning_rate": 2.595728014662484e-05, + "loss": 1.6973, + "step": 21826 + }, + { + "epoch": 6.69950890116636, + "grad_norm": 0.2711705267429352, + "learning_rate": 2.5952922076256737e-05, + "loss": 1.7801, + "step": 21827 + }, + { + "epoch": 6.699815837937384, + "grad_norm": 0.2601792514324188, + "learning_rate": 2.5948564243535988e-05, + "loss": 1.7508, + "step": 21828 + }, + { + "epoch": 6.70012277470841, + "grad_norm": 0.206949844956398, + "learning_rate": 2.5944206648505586e-05, + "loss": 1.7853, + "step": 21829 + }, + { + "epoch": 6.700429711479435, + "grad_norm": 0.25003641843795776, + "learning_rate": 2.5939849291208653e-05, + "loss": 1.766, + "step": 21830 + }, + { + "epoch": 6.7007366482504604, + "grad_norm": 0.25864318013191223, + "learning_rate": 2.593549217168823e-05, + "loss": 1.7778, + "step": 21831 + }, + { + "epoch": 6.701043585021486, + "grad_norm": 0.20212729275226593, + "learning_rate": 2.593113528998738e-05, + "loss": 1.7249, + "step": 21832 + }, + { + "epoch": 6.701350521792511, + 
"grad_norm": 0.2518431842327118, + "learning_rate": 2.5926778646149154e-05, + "loss": 1.7466, + "step": 21833 + }, + { + "epoch": 6.701657458563536, + "grad_norm": 0.24284590780735016, + "learning_rate": 2.5922422240216614e-05, + "loss": 1.8309, + "step": 21834 + }, + { + "epoch": 6.701964395334561, + "grad_norm": 0.21829955279827118, + "learning_rate": 2.5918066072232817e-05, + "loss": 1.7458, + "step": 21835 + }, + { + "epoch": 6.702271332105586, + "grad_norm": 0.2842165231704712, + "learning_rate": 2.5913710142240792e-05, + "loss": 1.7379, + "step": 21836 + }, + { + "epoch": 6.702578268876612, + "grad_norm": 0.19648514688014984, + "learning_rate": 2.590935445028359e-05, + "loss": 1.7141, + "step": 21837 + }, + { + "epoch": 6.702885205647637, + "grad_norm": 0.24336646497249603, + "learning_rate": 2.5904998996404305e-05, + "loss": 1.6719, + "step": 21838 + }, + { + "epoch": 6.703192142418661, + "grad_norm": 0.17288628220558167, + "learning_rate": 2.5900643780645905e-05, + "loss": 1.6982, + "step": 21839 + }, + { + "epoch": 6.703499079189687, + "grad_norm": 0.24906334280967712, + "learning_rate": 2.5896288803051505e-05, + "loss": 1.6873, + "step": 21840 + }, + { + "epoch": 6.703806015960712, + "grad_norm": 0.2177029550075531, + "learning_rate": 2.5891934063664085e-05, + "loss": 1.6884, + "step": 21841 + }, + { + "epoch": 6.704112952731737, + "grad_norm": 0.20478956401348114, + "learning_rate": 2.5887579562526688e-05, + "loss": 1.7342, + "step": 21842 + }, + { + "epoch": 6.704419889502763, + "grad_norm": 0.26212164759635925, + "learning_rate": 2.58832252996824e-05, + "loss": 1.7304, + "step": 21843 + }, + { + "epoch": 6.704726826273788, + "grad_norm": 0.2049340009689331, + "learning_rate": 2.587887127517418e-05, + "loss": 1.7472, + "step": 21844 + }, + { + "epoch": 6.7050337630448125, + "grad_norm": 0.2453075796365738, + "learning_rate": 2.587451748904512e-05, + "loss": 1.7443, + "step": 21845 + }, + { + "epoch": 6.705340699815838, + "grad_norm": 
0.19545187056064606, + "learning_rate": 2.5870163941338188e-05, + "loss": 1.7328, + "step": 21846 + }, + { + "epoch": 6.705647636586863, + "grad_norm": 0.24424482882022858, + "learning_rate": 2.5865810632096456e-05, + "loss": 1.6876, + "step": 21847 + }, + { + "epoch": 6.7059545733578885, + "grad_norm": 0.2150830626487732, + "learning_rate": 2.5861457561362922e-05, + "loss": 1.7272, + "step": 21848 + }, + { + "epoch": 6.706261510128914, + "grad_norm": 0.2632520794868469, + "learning_rate": 2.5857104729180626e-05, + "loss": 1.7542, + "step": 21849 + }, + { + "epoch": 6.706568446899938, + "grad_norm": 0.21789421141147614, + "learning_rate": 2.5852752135592563e-05, + "loss": 1.6856, + "step": 21850 + }, + { + "epoch": 6.706875383670964, + "grad_norm": 0.2227005511522293, + "learning_rate": 2.5848399780641758e-05, + "loss": 1.7473, + "step": 21851 + }, + { + "epoch": 6.707182320441989, + "grad_norm": 0.23424866795539856, + "learning_rate": 2.5844047664371218e-05, + "loss": 1.7016, + "step": 21852 + }, + { + "epoch": 6.707489257213014, + "grad_norm": 0.2125028669834137, + "learning_rate": 2.5839695786823964e-05, + "loss": 1.8296, + "step": 21853 + }, + { + "epoch": 6.70779619398404, + "grad_norm": 0.2533423900604248, + "learning_rate": 2.5835344148042972e-05, + "loss": 1.7237, + "step": 21854 + }, + { + "epoch": 6.708103130755065, + "grad_norm": 0.1951744705438614, + "learning_rate": 2.583099274807132e-05, + "loss": 1.6685, + "step": 21855 + }, + { + "epoch": 6.708410067526089, + "grad_norm": 0.2564519941806793, + "learning_rate": 2.5826641586951938e-05, + "loss": 1.7542, + "step": 21856 + }, + { + "epoch": 6.708717004297115, + "grad_norm": 0.2586502134799957, + "learning_rate": 2.5822290664727856e-05, + "loss": 1.7477, + "step": 21857 + }, + { + "epoch": 6.70902394106814, + "grad_norm": 0.30357107520103455, + "learning_rate": 2.5817939981442062e-05, + "loss": 1.7454, + "step": 21858 + }, + { + "epoch": 6.709330877839165, + "grad_norm": 0.20547500252723694, + 
"learning_rate": 2.5813589537137544e-05, + "loss": 1.7517, + "step": 21859 + }, + { + "epoch": 6.70963781461019, + "grad_norm": 0.2961783707141876, + "learning_rate": 2.5809239331857348e-05, + "loss": 1.698, + "step": 21860 + }, + { + "epoch": 6.709944751381215, + "grad_norm": 0.2062019556760788, + "learning_rate": 2.580488936564439e-05, + "loss": 1.7358, + "step": 21861 + }, + { + "epoch": 6.7102516881522405, + "grad_norm": 0.22287480533123016, + "learning_rate": 2.580053963854173e-05, + "loss": 1.7099, + "step": 21862 + }, + { + "epoch": 6.710558624923266, + "grad_norm": 0.1853112131357193, + "learning_rate": 2.579619015059229e-05, + "loss": 1.7493, + "step": 21863 + }, + { + "epoch": 6.710865561694291, + "grad_norm": 0.24855247139930725, + "learning_rate": 2.5791840901839105e-05, + "loss": 1.7248, + "step": 21864 + }, + { + "epoch": 6.7111724984653165, + "grad_norm": 0.18156948685646057, + "learning_rate": 2.5787491892325126e-05, + "loss": 1.6744, + "step": 21865 + }, + { + "epoch": 6.711479435236341, + "grad_norm": 0.3272082209587097, + "learning_rate": 2.5783143122093357e-05, + "loss": 1.7546, + "step": 21866 + }, + { + "epoch": 6.711786372007366, + "grad_norm": 0.2875421643257141, + "learning_rate": 2.577879459118675e-05, + "loss": 1.6477, + "step": 21867 + }, + { + "epoch": 6.712093308778392, + "grad_norm": 0.19682031869888306, + "learning_rate": 2.5774446299648297e-05, + "loss": 1.7455, + "step": 21868 + }, + { + "epoch": 6.712400245549417, + "grad_norm": 0.32829195261001587, + "learning_rate": 2.5770098247520968e-05, + "loss": 1.7817, + "step": 21869 + }, + { + "epoch": 6.712707182320442, + "grad_norm": 0.26227760314941406, + "learning_rate": 2.5765750434847724e-05, + "loss": 1.763, + "step": 21870 + }, + { + "epoch": 6.713014119091467, + "grad_norm": 0.2902637720108032, + "learning_rate": 2.576140286167152e-05, + "loss": 1.7432, + "step": 21871 + }, + { + "epoch": 6.713321055862492, + "grad_norm": 0.2290763407945633, + "learning_rate": 
2.5757055528035377e-05, + "loss": 1.7149, + "step": 21872 + }, + { + "epoch": 6.713627992633517, + "grad_norm": 0.3445907533168793, + "learning_rate": 2.575270843398221e-05, + "loss": 1.7874, + "step": 21873 + }, + { + "epoch": 6.713934929404543, + "grad_norm": 0.1841191053390503, + "learning_rate": 2.574836157955498e-05, + "loss": 1.6954, + "step": 21874 + }, + { + "epoch": 6.714241866175568, + "grad_norm": 0.24168385565280914, + "learning_rate": 2.5744014964796657e-05, + "loss": 1.7153, + "step": 21875 + }, + { + "epoch": 6.714548802946593, + "grad_norm": 0.17855188250541687, + "learning_rate": 2.5739668589750175e-05, + "loss": 1.7329, + "step": 21876 + }, + { + "epoch": 6.714855739717618, + "grad_norm": 0.189789280295372, + "learning_rate": 2.5735322454458554e-05, + "loss": 1.6854, + "step": 21877 + }, + { + "epoch": 6.715162676488643, + "grad_norm": 0.1792519986629486, + "learning_rate": 2.5730976558964647e-05, + "loss": 1.7483, + "step": 21878 + }, + { + "epoch": 6.7154696132596685, + "grad_norm": 0.24460360407829285, + "learning_rate": 2.5726630903311504e-05, + "loss": 1.8337, + "step": 21879 + }, + { + "epoch": 6.715776550030694, + "grad_norm": 0.21612058579921722, + "learning_rate": 2.572228548754198e-05, + "loss": 1.7293, + "step": 21880 + }, + { + "epoch": 6.716083486801719, + "grad_norm": 0.22057892382144928, + "learning_rate": 2.5717940311699078e-05, + "loss": 1.7269, + "step": 21881 + }, + { + "epoch": 6.716390423572744, + "grad_norm": 0.19635777175426483, + "learning_rate": 2.571359537582572e-05, + "loss": 1.6744, + "step": 21882 + }, + { + "epoch": 6.716697360343769, + "grad_norm": 0.20406895875930786, + "learning_rate": 2.570925067996485e-05, + "loss": 1.6866, + "step": 21883 + }, + { + "epoch": 6.717004297114794, + "grad_norm": 0.1942419856786728, + "learning_rate": 2.5704906224159407e-05, + "loss": 1.724, + "step": 21884 + }, + { + "epoch": 6.71731123388582, + "grad_norm": 0.20423445105552673, + "learning_rate": 2.570056200845231e-05, + "loss": 
1.6709, + "step": 21885 + }, + { + "epoch": 6.717618170656845, + "grad_norm": 0.27171632647514343, + "learning_rate": 2.569621803288651e-05, + "loss": 1.7532, + "step": 21886 + }, + { + "epoch": 6.71792510742787, + "grad_norm": 0.22753871977329254, + "learning_rate": 2.5691874297504926e-05, + "loss": 1.7534, + "step": 21887 + }, + { + "epoch": 6.718232044198895, + "grad_norm": 0.1907290369272232, + "learning_rate": 2.5687530802350468e-05, + "loss": 1.6696, + "step": 21888 + }, + { + "epoch": 6.71853898096992, + "grad_norm": 0.2226637750864029, + "learning_rate": 2.568318754746612e-05, + "loss": 1.7194, + "step": 21889 + }, + { + "epoch": 6.718845917740945, + "grad_norm": 0.20878726243972778, + "learning_rate": 2.5678844532894742e-05, + "loss": 1.6878, + "step": 21890 + }, + { + "epoch": 6.719152854511971, + "grad_norm": 0.18087267875671387, + "learning_rate": 2.567450175867928e-05, + "loss": 1.7432, + "step": 21891 + }, + { + "epoch": 6.719459791282996, + "grad_norm": 0.19818328320980072, + "learning_rate": 2.567015922486265e-05, + "loss": 1.6959, + "step": 21892 + }, + { + "epoch": 6.7197667280540205, + "grad_norm": 0.19593466818332672, + "learning_rate": 2.566581693148775e-05, + "loss": 1.7357, + "step": 21893 + }, + { + "epoch": 6.720073664825046, + "grad_norm": 0.24518795311450958, + "learning_rate": 2.5661474878597546e-05, + "loss": 1.7948, + "step": 21894 + }, + { + "epoch": 6.720380601596071, + "grad_norm": 0.18471074104309082, + "learning_rate": 2.5657133066234872e-05, + "loss": 1.6983, + "step": 21895 + }, + { + "epoch": 6.7206875383670965, + "grad_norm": 0.20073382556438446, + "learning_rate": 2.5652791494442718e-05, + "loss": 1.7241, + "step": 21896 + }, + { + "epoch": 6.720994475138122, + "grad_norm": 0.21688152849674225, + "learning_rate": 2.5648450163263903e-05, + "loss": 1.7073, + "step": 21897 + }, + { + "epoch": 6.721301411909147, + "grad_norm": 0.17722688615322113, + "learning_rate": 2.5644109072741406e-05, + "loss": 1.7047, + "step": 21898 + }, + 
{ + "epoch": 6.721608348680172, + "grad_norm": 0.2060708999633789, + "learning_rate": 2.5639768222918093e-05, + "loss": 1.7246, + "step": 21899 + }, + { + "epoch": 6.721915285451197, + "grad_norm": 0.26590242981910706, + "learning_rate": 2.563542761383687e-05, + "loss": 1.8141, + "step": 21900 + }, + { + "epoch": 6.722222222222222, + "grad_norm": 0.22498780488967896, + "learning_rate": 2.5631087245540632e-05, + "loss": 1.7211, + "step": 21901 + }, + { + "epoch": 6.722529158993248, + "grad_norm": 0.20546968281269073, + "learning_rate": 2.562674711807227e-05, + "loss": 1.8001, + "step": 21902 + }, + { + "epoch": 6.722836095764272, + "grad_norm": 0.19668535888195038, + "learning_rate": 2.5622407231474683e-05, + "loss": 1.7443, + "step": 21903 + }, + { + "epoch": 6.723143032535297, + "grad_norm": 0.18932129442691803, + "learning_rate": 2.5618067585790752e-05, + "loss": 1.7307, + "step": 21904 + }, + { + "epoch": 6.723449969306323, + "grad_norm": 0.19501622021198273, + "learning_rate": 2.561372818106335e-05, + "loss": 1.7016, + "step": 21905 + }, + { + "epoch": 6.723756906077348, + "grad_norm": 0.21313562989234924, + "learning_rate": 2.5609389017335416e-05, + "loss": 1.8012, + "step": 21906 + }, + { + "epoch": 6.724063842848373, + "grad_norm": 0.174738347530365, + "learning_rate": 2.560505009464978e-05, + "loss": 1.6824, + "step": 21907 + }, + { + "epoch": 6.724370779619399, + "grad_norm": 0.20349650084972382, + "learning_rate": 2.560071141304934e-05, + "loss": 1.7813, + "step": 21908 + }, + { + "epoch": 6.724677716390423, + "grad_norm": 0.21878227591514587, + "learning_rate": 2.5596372972576967e-05, + "loss": 1.8166, + "step": 21909 + }, + { + "epoch": 6.7249846531614486, + "grad_norm": 0.2082633078098297, + "learning_rate": 2.559203477327552e-05, + "loss": 1.7197, + "step": 21910 + }, + { + "epoch": 6.725291589932474, + "grad_norm": 0.17738287150859833, + "learning_rate": 2.558769681518792e-05, + "loss": 1.7093, + "step": 21911 + }, + { + "epoch": 6.725598526703499, + 
"grad_norm": 0.1930074542760849, + "learning_rate": 2.5583359098356986e-05, + "loss": 1.7702, + "step": 21912 + }, + { + "epoch": 6.725905463474525, + "grad_norm": 0.17668531835079193, + "learning_rate": 2.5579021622825638e-05, + "loss": 1.7466, + "step": 21913 + }, + { + "epoch": 6.726212400245549, + "grad_norm": 0.1737186163663864, + "learning_rate": 2.5574684388636677e-05, + "loss": 1.6876, + "step": 21914 + }, + { + "epoch": 6.726519337016574, + "grad_norm": 0.18352502584457397, + "learning_rate": 2.5570347395833018e-05, + "loss": 1.6745, + "step": 21915 + }, + { + "epoch": 6.7268262737876, + "grad_norm": 0.19047673046588898, + "learning_rate": 2.5566010644457506e-05, + "loss": 1.7465, + "step": 21916 + }, + { + "epoch": 6.727133210558625, + "grad_norm": 0.1762397438287735, + "learning_rate": 2.5561674134553005e-05, + "loss": 1.6767, + "step": 21917 + }, + { + "epoch": 6.72744014732965, + "grad_norm": 0.22884784638881683, + "learning_rate": 2.5557337866162358e-05, + "loss": 1.7054, + "step": 21918 + }, + { + "epoch": 6.727747084100676, + "grad_norm": 0.17476098239421844, + "learning_rate": 2.5553001839328417e-05, + "loss": 1.721, + "step": 21919 + }, + { + "epoch": 6.7280540208717, + "grad_norm": 0.1827213317155838, + "learning_rate": 2.554866605409405e-05, + "loss": 1.78, + "step": 21920 + }, + { + "epoch": 6.7283609576427255, + "grad_norm": 0.21709343791007996, + "learning_rate": 2.554433051050209e-05, + "loss": 1.8064, + "step": 21921 + }, + { + "epoch": 6.728667894413751, + "grad_norm": 0.1972692310810089, + "learning_rate": 2.5539995208595398e-05, + "loss": 1.7231, + "step": 21922 + }, + { + "epoch": 6.728974831184776, + "grad_norm": 0.19464808702468872, + "learning_rate": 2.5535660148416802e-05, + "loss": 1.7931, + "step": 21923 + }, + { + "epoch": 6.7292817679558015, + "grad_norm": 0.19610099494457245, + "learning_rate": 2.5531325330009158e-05, + "loss": 1.7467, + "step": 21924 + }, + { + "epoch": 6.729588704726826, + "grad_norm": 0.21104763448238373, + 
"learning_rate": 2.5526990753415292e-05, + "loss": 1.7543, + "step": 21925 + }, + { + "epoch": 6.729895641497851, + "grad_norm": 0.1881588101387024, + "learning_rate": 2.5522656418678047e-05, + "loss": 1.7666, + "step": 21926 + }, + { + "epoch": 6.730202578268877, + "grad_norm": 0.2163291722536087, + "learning_rate": 2.551832232584025e-05, + "loss": 1.7321, + "step": 21927 + }, + { + "epoch": 6.730509515039902, + "grad_norm": 0.19252021610736847, + "learning_rate": 2.551398847494477e-05, + "loss": 1.7287, + "step": 21928 + }, + { + "epoch": 6.730816451810927, + "grad_norm": 0.22602233290672302, + "learning_rate": 2.550965486603437e-05, + "loss": 1.767, + "step": 21929 + }, + { + "epoch": 6.731123388581953, + "grad_norm": 0.21509617567062378, + "learning_rate": 2.5505321499151957e-05, + "loss": 1.7637, + "step": 21930 + }, + { + "epoch": 6.731430325352977, + "grad_norm": 0.24291658401489258, + "learning_rate": 2.5500988374340274e-05, + "loss": 1.7312, + "step": 21931 + }, + { + "epoch": 6.731737262124002, + "grad_norm": 0.26562216877937317, + "learning_rate": 2.5496655491642195e-05, + "loss": 1.7763, + "step": 21932 + }, + { + "epoch": 6.732044198895028, + "grad_norm": 0.19785790145397186, + "learning_rate": 2.5492322851100535e-05, + "loss": 1.6979, + "step": 21933 + }, + { + "epoch": 6.732351135666053, + "grad_norm": 0.20044486224651337, + "learning_rate": 2.5487990452758104e-05, + "loss": 1.7359, + "step": 21934 + }, + { + "epoch": 6.7326580724370775, + "grad_norm": 0.20468659698963165, + "learning_rate": 2.548365829665772e-05, + "loss": 1.6996, + "step": 21935 + }, + { + "epoch": 6.732965009208103, + "grad_norm": 0.16516120731830597, + "learning_rate": 2.5479326382842195e-05, + "loss": 1.717, + "step": 21936 + }, + { + "epoch": 6.733271945979128, + "grad_norm": 0.22404411435127258, + "learning_rate": 2.547499471135433e-05, + "loss": 1.7261, + "step": 21937 + }, + { + "epoch": 6.7335788827501535, + "grad_norm": 0.21485663950443268, + "learning_rate": 
2.547066328223695e-05, + "loss": 1.7463, + "step": 21938 + }, + { + "epoch": 6.733885819521179, + "grad_norm": 0.330018550157547, + "learning_rate": 2.5466332095532853e-05, + "loss": 1.854, + "step": 21939 + }, + { + "epoch": 6.734192756292204, + "grad_norm": 0.25225213170051575, + "learning_rate": 2.5462001151284842e-05, + "loss": 1.722, + "step": 21940 + }, + { + "epoch": 6.734499693063229, + "grad_norm": 0.2422008365392685, + "learning_rate": 2.5457670449535713e-05, + "loss": 1.6996, + "step": 21941 + }, + { + "epoch": 6.734806629834254, + "grad_norm": 0.2421465814113617, + "learning_rate": 2.5453339990328275e-05, + "loss": 1.7014, + "step": 21942 + }, + { + "epoch": 6.735113566605279, + "grad_norm": 0.2520611882209778, + "learning_rate": 2.5449009773705313e-05, + "loss": 1.7149, + "step": 21943 + }, + { + "epoch": 6.735420503376305, + "grad_norm": 0.24940338730812073, + "learning_rate": 2.5444679799709626e-05, + "loss": 1.7423, + "step": 21944 + }, + { + "epoch": 6.73572744014733, + "grad_norm": 0.2328663021326065, + "learning_rate": 2.544035006838401e-05, + "loss": 1.6893, + "step": 21945 + }, + { + "epoch": 6.736034376918354, + "grad_norm": 0.2190757393836975, + "learning_rate": 2.5436020579771226e-05, + "loss": 1.7375, + "step": 21946 + }, + { + "epoch": 6.73634131368938, + "grad_norm": 0.2204900085926056, + "learning_rate": 2.543169133391413e-05, + "loss": 1.6971, + "step": 21947 + }, + { + "epoch": 6.736648250460405, + "grad_norm": 0.29192328453063965, + "learning_rate": 2.5427362330855415e-05, + "loss": 1.7633, + "step": 21948 + }, + { + "epoch": 6.73695518723143, + "grad_norm": 0.19859355688095093, + "learning_rate": 2.542303357063793e-05, + "loss": 1.7515, + "step": 21949 + }, + { + "epoch": 6.737262124002456, + "grad_norm": 0.23010417819023132, + "learning_rate": 2.5418705053304425e-05, + "loss": 1.7282, + "step": 21950 + }, + { + "epoch": 6.737569060773481, + "grad_norm": 0.2168324589729309, + "learning_rate": 2.5414376778897698e-05, + "loss": 1.7347, 
+ "step": 21951 + }, + { + "epoch": 6.7378759975445055, + "grad_norm": 0.2190646231174469, + "learning_rate": 2.54100487474605e-05, + "loss": 1.7893, + "step": 21952 + }, + { + "epoch": 6.738182934315531, + "grad_norm": 0.23925794661045074, + "learning_rate": 2.5405720959035617e-05, + "loss": 1.7825, + "step": 21953 + }, + { + "epoch": 6.738489871086556, + "grad_norm": 0.17987917363643646, + "learning_rate": 2.5401393413665807e-05, + "loss": 1.724, + "step": 21954 + }, + { + "epoch": 6.7387968078575815, + "grad_norm": 0.2300983965396881, + "learning_rate": 2.5397066111393853e-05, + "loss": 1.7023, + "step": 21955 + }, + { + "epoch": 6.739103744628607, + "grad_norm": 0.2128167450428009, + "learning_rate": 2.539273905226251e-05, + "loss": 1.7218, + "step": 21956 + }, + { + "epoch": 6.739410681399631, + "grad_norm": 0.19105537235736847, + "learning_rate": 2.538841223631454e-05, + "loss": 1.7781, + "step": 21957 + }, + { + "epoch": 6.739717618170657, + "grad_norm": 0.22985289990901947, + "learning_rate": 2.5384085663592704e-05, + "loss": 1.7362, + "step": 21958 + }, + { + "epoch": 6.740024554941682, + "grad_norm": 0.18608705699443817, + "learning_rate": 2.5379759334139768e-05, + "loss": 1.7174, + "step": 21959 + }, + { + "epoch": 6.740331491712707, + "grad_norm": 0.2659450173377991, + "learning_rate": 2.5375433247998482e-05, + "loss": 1.8118, + "step": 21960 + }, + { + "epoch": 6.740638428483733, + "grad_norm": 0.1904401034116745, + "learning_rate": 2.537110740521159e-05, + "loss": 1.6789, + "step": 21961 + }, + { + "epoch": 6.740945365254758, + "grad_norm": 0.1826045662164688, + "learning_rate": 2.5366781805821847e-05, + "loss": 1.6906, + "step": 21962 + }, + { + "epoch": 6.741252302025782, + "grad_norm": 0.1919000893831253, + "learning_rate": 2.5362456449871995e-05, + "loss": 1.7412, + "step": 21963 + }, + { + "epoch": 6.741559238796808, + "grad_norm": 0.1921864151954651, + "learning_rate": 2.5358131337404822e-05, + "loss": 1.7023, + "step": 21964 + }, + { + "epoch": 
6.741866175567833, + "grad_norm": 0.1628783494234085, + "learning_rate": 2.5353806468463004e-05, + "loss": 1.6842, + "step": 21965 + }, + { + "epoch": 6.742173112338858, + "grad_norm": 0.19764694571495056, + "learning_rate": 2.534948184308935e-05, + "loss": 1.7238, + "step": 21966 + }, + { + "epoch": 6.742480049109884, + "grad_norm": 0.1845860630273819, + "learning_rate": 2.534515746132653e-05, + "loss": 1.728, + "step": 21967 + }, + { + "epoch": 6.742786985880908, + "grad_norm": 0.20269328355789185, + "learning_rate": 2.5340833323217327e-05, + "loss": 1.7541, + "step": 21968 + }, + { + "epoch": 6.7430939226519335, + "grad_norm": 0.16586242616176605, + "learning_rate": 2.5336509428804468e-05, + "loss": 1.7025, + "step": 21969 + }, + { + "epoch": 6.743400859422959, + "grad_norm": 0.1693086177110672, + "learning_rate": 2.533218577813068e-05, + "loss": 1.6975, + "step": 21970 + }, + { + "epoch": 6.743707796193984, + "grad_norm": 0.2206759750843048, + "learning_rate": 2.5327862371238686e-05, + "loss": 1.764, + "step": 21971 + }, + { + "epoch": 6.7440147329650095, + "grad_norm": 0.1915574073791504, + "learning_rate": 2.532353920817122e-05, + "loss": 1.7576, + "step": 21972 + }, + { + "epoch": 6.744321669736035, + "grad_norm": 0.1741783618927002, + "learning_rate": 2.5319216288971003e-05, + "loss": 1.7394, + "step": 21973 + }, + { + "epoch": 6.744628606507059, + "grad_norm": 0.21624934673309326, + "learning_rate": 2.5314893613680755e-05, + "loss": 1.7358, + "step": 21974 + }, + { + "epoch": 6.744935543278085, + "grad_norm": 0.2350481003522873, + "learning_rate": 2.5310571182343197e-05, + "loss": 1.7801, + "step": 21975 + }, + { + "epoch": 6.74524248004911, + "grad_norm": 0.18618559837341309, + "learning_rate": 2.5306248995001048e-05, + "loss": 1.7012, + "step": 21976 + }, + { + "epoch": 6.745549416820135, + "grad_norm": 0.18479639291763306, + "learning_rate": 2.5301927051697016e-05, + "loss": 1.7238, + "step": 21977 + }, + { + "epoch": 6.74585635359116, + "grad_norm": 
0.19978758692741394, + "learning_rate": 2.5297605352473818e-05, + "loss": 1.6636, + "step": 21978 + }, + { + "epoch": 6.746163290362185, + "grad_norm": 0.23122164607048035, + "learning_rate": 2.529328389737416e-05, + "loss": 1.7455, + "step": 21979 + }, + { + "epoch": 6.74647022713321, + "grad_norm": 0.20423240959644318, + "learning_rate": 2.5288962686440732e-05, + "loss": 1.7516, + "step": 21980 + }, + { + "epoch": 6.746777163904236, + "grad_norm": 0.18271920084953308, + "learning_rate": 2.52846417197163e-05, + "loss": 1.762, + "step": 21981 + }, + { + "epoch": 6.747084100675261, + "grad_norm": 0.19280247390270233, + "learning_rate": 2.528032099724349e-05, + "loss": 1.7298, + "step": 21982 + }, + { + "epoch": 6.747391037446286, + "grad_norm": 0.20908337831497192, + "learning_rate": 2.527600051906507e-05, + "loss": 1.7323, + "step": 21983 + }, + { + "epoch": 6.747697974217311, + "grad_norm": 0.18399856984615326, + "learning_rate": 2.5271680285223663e-05, + "loss": 1.6795, + "step": 21984 + }, + { + "epoch": 6.748004910988336, + "grad_norm": 0.2273191213607788, + "learning_rate": 2.5267360295762033e-05, + "loss": 1.6811, + "step": 21985 + }, + { + "epoch": 6.7483118477593615, + "grad_norm": 0.1844841092824936, + "learning_rate": 2.526304055072284e-05, + "loss": 1.7404, + "step": 21986 + }, + { + "epoch": 6.748618784530387, + "grad_norm": 0.25975871086120605, + "learning_rate": 2.5258721050148775e-05, + "loss": 1.6994, + "step": 21987 + }, + { + "epoch": 6.748925721301412, + "grad_norm": 0.1664818376302719, + "learning_rate": 2.5254401794082532e-05, + "loss": 1.6722, + "step": 21988 + }, + { + "epoch": 6.749232658072437, + "grad_norm": 0.2597639560699463, + "learning_rate": 2.5250082782566796e-05, + "loss": 1.7654, + "step": 21989 + }, + { + "epoch": 6.749539594843462, + "grad_norm": 0.19326356053352356, + "learning_rate": 2.5245764015644248e-05, + "loss": 1.668, + "step": 21990 + }, + { + "epoch": 6.749846531614487, + "grad_norm": 0.22924599051475525, + 
"learning_rate": 2.5241445493357574e-05, + "loss": 1.7522, + "step": 21991 + }, + { + "epoch": 6.750153468385513, + "grad_norm": 0.24588358402252197, + "learning_rate": 2.523712721574944e-05, + "loss": 1.7396, + "step": 21992 + }, + { + "epoch": 6.750460405156538, + "grad_norm": 0.1988971084356308, + "learning_rate": 2.5232809182862526e-05, + "loss": 1.7338, + "step": 21993 + }, + { + "epoch": 6.750767341927563, + "grad_norm": 0.18566425144672394, + "learning_rate": 2.5228491394739518e-05, + "loss": 1.7135, + "step": 21994 + }, + { + "epoch": 6.751074278698588, + "grad_norm": 0.22216622531414032, + "learning_rate": 2.5224173851423073e-05, + "loss": 1.744, + "step": 21995 + }, + { + "epoch": 6.751381215469613, + "grad_norm": 0.18695887923240662, + "learning_rate": 2.5219856552955863e-05, + "loss": 1.7324, + "step": 21996 + }, + { + "epoch": 6.7516881522406385, + "grad_norm": 0.1866987645626068, + "learning_rate": 2.5215539499380535e-05, + "loss": 1.6855, + "step": 21997 + }, + { + "epoch": 6.751995089011664, + "grad_norm": 0.1743573248386383, + "learning_rate": 2.521122269073981e-05, + "loss": 1.6833, + "step": 21998 + }, + { + "epoch": 6.752302025782689, + "grad_norm": 0.2173541784286499, + "learning_rate": 2.5206906127076274e-05, + "loss": 1.7434, + "step": 21999 + }, + { + "epoch": 6.752608962553714, + "grad_norm": 0.17558147013187408, + "learning_rate": 2.5202589808432665e-05, + "loss": 1.6884, + "step": 22000 + }, + { + "epoch": 6.752915899324739, + "grad_norm": 0.16630353033542633, + "learning_rate": 2.5198273734851553e-05, + "loss": 1.7005, + "step": 22001 + }, + { + "epoch": 6.753222836095764, + "grad_norm": 0.1834949105978012, + "learning_rate": 2.519395790637566e-05, + "loss": 1.7123, + "step": 22002 + }, + { + "epoch": 6.75352977286679, + "grad_norm": 0.1806751936674118, + "learning_rate": 2.5189642323047614e-05, + "loss": 1.7305, + "step": 22003 + }, + { + "epoch": 6.753836709637815, + "grad_norm": 0.2350265085697174, + "learning_rate": 
2.5185326984910062e-05, + "loss": 1.772, + "step": 22004 + }, + { + "epoch": 6.75414364640884, + "grad_norm": 0.18105818331241608, + "learning_rate": 2.518101189200566e-05, + "loss": 1.7487, + "step": 22005 + }, + { + "epoch": 6.754450583179865, + "grad_norm": 0.17640845477581024, + "learning_rate": 2.517669704437704e-05, + "loss": 1.7178, + "step": 22006 + }, + { + "epoch": 6.75475751995089, + "grad_norm": 0.21648885309696198, + "learning_rate": 2.5172382442066845e-05, + "loss": 1.7144, + "step": 22007 + }, + { + "epoch": 6.755064456721915, + "grad_norm": 0.2042703926563263, + "learning_rate": 2.5168068085117724e-05, + "loss": 1.7476, + "step": 22008 + }, + { + "epoch": 6.755371393492941, + "grad_norm": 0.24397306144237518, + "learning_rate": 2.5163753973572306e-05, + "loss": 1.7033, + "step": 22009 + }, + { + "epoch": 6.755678330263965, + "grad_norm": 0.2030377835035324, + "learning_rate": 2.5159440107473232e-05, + "loss": 1.7353, + "step": 22010 + }, + { + "epoch": 6.7559852670349905, + "grad_norm": 0.2493598908185959, + "learning_rate": 2.5155126486863127e-05, + "loss": 1.7346, + "step": 22011 + }, + { + "epoch": 6.756292203806016, + "grad_norm": 0.17272062599658966, + "learning_rate": 2.5150813111784627e-05, + "loss": 1.7095, + "step": 22012 + }, + { + "epoch": 6.756599140577041, + "grad_norm": 0.2417706698179245, + "learning_rate": 2.514649998228036e-05, + "loss": 1.7631, + "step": 22013 + }, + { + "epoch": 6.7569060773480665, + "grad_norm": 0.17753612995147705, + "learning_rate": 2.5142187098392915e-05, + "loss": 1.697, + "step": 22014 + }, + { + "epoch": 6.757213014119092, + "grad_norm": 0.2246367186307907, + "learning_rate": 2.5137874460164995e-05, + "loss": 1.7216, + "step": 22015 + }, + { + "epoch": 6.757519950890116, + "grad_norm": 0.24141135811805725, + "learning_rate": 2.5133562067639134e-05, + "loss": 1.7368, + "step": 22016 + }, + { + "epoch": 6.757826887661142, + "grad_norm": 0.21253570914268494, + "learning_rate": 2.5129249920858022e-05, + "loss": 
1.7029, + "step": 22017 + }, + { + "epoch": 6.758133824432167, + "grad_norm": 0.21176676452159882, + "learning_rate": 2.5124938019864198e-05, + "loss": 1.7472, + "step": 22018 + }, + { + "epoch": 6.758440761203192, + "grad_norm": 0.1990927904844284, + "learning_rate": 2.5120626364700338e-05, + "loss": 1.6686, + "step": 22019 + }, + { + "epoch": 6.758747697974218, + "grad_norm": 0.1736145317554474, + "learning_rate": 2.5116314955409038e-05, + "loss": 1.6984, + "step": 22020 + }, + { + "epoch": 6.759054634745242, + "grad_norm": 0.2618037462234497, + "learning_rate": 2.511200379203289e-05, + "loss": 1.7374, + "step": 22021 + }, + { + "epoch": 6.759361571516267, + "grad_norm": 0.25363266468048096, + "learning_rate": 2.5107692874614507e-05, + "loss": 1.7001, + "step": 22022 + }, + { + "epoch": 6.759668508287293, + "grad_norm": 0.20287153124809265, + "learning_rate": 2.51033822031965e-05, + "loss": 1.7704, + "step": 22023 + }, + { + "epoch": 6.759975445058318, + "grad_norm": 0.2401949167251587, + "learning_rate": 2.509907177782146e-05, + "loss": 1.7157, + "step": 22024 + }, + { + "epoch": 6.760282381829343, + "grad_norm": 0.177081897854805, + "learning_rate": 2.5094761598531985e-05, + "loss": 1.7572, + "step": 22025 + }, + { + "epoch": 6.760589318600369, + "grad_norm": 0.2641974687576294, + "learning_rate": 2.5090451665370674e-05, + "loss": 1.725, + "step": 22026 + }, + { + "epoch": 6.760896255371393, + "grad_norm": 0.20262297987937927, + "learning_rate": 2.5086141978380116e-05, + "loss": 1.6591, + "step": 22027 + }, + { + "epoch": 6.7612031921424185, + "grad_norm": 0.19107301533222198, + "learning_rate": 2.5081832537602913e-05, + "loss": 1.6914, + "step": 22028 + }, + { + "epoch": 6.761510128913444, + "grad_norm": 0.28122687339782715, + "learning_rate": 2.5077523343081643e-05, + "loss": 1.7759, + "step": 22029 + }, + { + "epoch": 6.761817065684469, + "grad_norm": 0.16575101017951965, + "learning_rate": 2.5073214394858897e-05, + "loss": 1.6994, + "step": 22030 + }, + { + 
"epoch": 6.7621240024554945, + "grad_norm": 0.26933449506759644, + "learning_rate": 2.506890569297723e-05, + "loss": 1.7565, + "step": 22031 + }, + { + "epoch": 6.762430939226519, + "grad_norm": 0.2452966868877411, + "learning_rate": 2.5064597237479292e-05, + "loss": 1.7442, + "step": 22032 + }, + { + "epoch": 6.762737875997544, + "grad_norm": 0.20781855285167694, + "learning_rate": 2.5060289028407585e-05, + "loss": 1.714, + "step": 22033 + }, + { + "epoch": 6.76304481276857, + "grad_norm": 0.1997823268175125, + "learning_rate": 2.5055981065804756e-05, + "loss": 1.7318, + "step": 22034 + }, + { + "epoch": 6.763351749539595, + "grad_norm": 0.2080194652080536, + "learning_rate": 2.50516733497133e-05, + "loss": 1.7466, + "step": 22035 + }, + { + "epoch": 6.76365868631062, + "grad_norm": 0.17558889091014862, + "learning_rate": 2.504736588017585e-05, + "loss": 1.7049, + "step": 22036 + }, + { + "epoch": 6.763965623081646, + "grad_norm": 0.1999572217464447, + "learning_rate": 2.5043058657234957e-05, + "loss": 1.7121, + "step": 22037 + }, + { + "epoch": 6.76427255985267, + "grad_norm": 0.16219176352024078, + "learning_rate": 2.5038751680933185e-05, + "loss": 1.698, + "step": 22038 + }, + { + "epoch": 6.764579496623695, + "grad_norm": 0.17965151369571686, + "learning_rate": 2.50344449513131e-05, + "loss": 1.7021, + "step": 22039 + }, + { + "epoch": 6.764886433394721, + "grad_norm": 0.18831093609333038, + "learning_rate": 2.5030138468417263e-05, + "loss": 1.7049, + "step": 22040 + }, + { + "epoch": 6.765193370165746, + "grad_norm": 0.20622828602790833, + "learning_rate": 2.5025832232288236e-05, + "loss": 1.7834, + "step": 22041 + }, + { + "epoch": 6.765500306936771, + "grad_norm": 0.22746746242046356, + "learning_rate": 2.5021526242968574e-05, + "loss": 1.7426, + "step": 22042 + }, + { + "epoch": 6.765807243707796, + "grad_norm": 0.2048977166414261, + "learning_rate": 2.5017220500500828e-05, + "loss": 1.7192, + "step": 22043 + }, + { + "epoch": 6.766114180478821, + 
"grad_norm": 0.19647538661956787, + "learning_rate": 2.5012915004927546e-05, + "loss": 1.6738, + "step": 22044 + }, + { + "epoch": 6.7664211172498465, + "grad_norm": 0.2133142054080963, + "learning_rate": 2.5008609756291284e-05, + "loss": 1.7482, + "step": 22045 + }, + { + "epoch": 6.766728054020872, + "grad_norm": 0.23578259348869324, + "learning_rate": 2.500430475463459e-05, + "loss": 1.696, + "step": 22046 + }, + { + "epoch": 6.767034990791897, + "grad_norm": 0.24862529337406158, + "learning_rate": 2.500000000000001e-05, + "loss": 1.7508, + "step": 22047 + }, + { + "epoch": 6.7673419275629225, + "grad_norm": 0.22704963386058807, + "learning_rate": 2.4995695492430066e-05, + "loss": 1.7739, + "step": 22048 + }, + { + "epoch": 6.767648864333947, + "grad_norm": 0.20216481387615204, + "learning_rate": 2.4991391231967347e-05, + "loss": 1.7406, + "step": 22049 + }, + { + "epoch": 6.767955801104972, + "grad_norm": 0.18778519332408905, + "learning_rate": 2.498708721865432e-05, + "loss": 1.683, + "step": 22050 + }, + { + "epoch": 6.768262737875998, + "grad_norm": 0.21680599451065063, + "learning_rate": 2.4982783452533597e-05, + "loss": 1.7652, + "step": 22051 + }, + { + "epoch": 6.768569674647023, + "grad_norm": 0.16952121257781982, + "learning_rate": 2.4978479933647637e-05, + "loss": 1.6551, + "step": 22052 + }, + { + "epoch": 6.768876611418047, + "grad_norm": 0.1979489028453827, + "learning_rate": 2.4974176662039017e-05, + "loss": 1.7399, + "step": 22053 + }, + { + "epoch": 6.769183548189073, + "grad_norm": 0.18934862315654755, + "learning_rate": 2.496987363775025e-05, + "loss": 1.7228, + "step": 22054 + }, + { + "epoch": 6.769490484960098, + "grad_norm": 0.17551462352275848, + "learning_rate": 2.496557086082387e-05, + "loss": 1.6725, + "step": 22055 + }, + { + "epoch": 6.769797421731123, + "grad_norm": 0.23561003804206848, + "learning_rate": 2.496126833130239e-05, + "loss": 1.7606, + "step": 22056 + }, + { + "epoch": 6.770104358502149, + "grad_norm": 
0.19105803966522217, + "learning_rate": 2.4956966049228324e-05, + "loss": 1.6975, + "step": 22057 + }, + { + "epoch": 6.770411295273174, + "grad_norm": 0.28581124544143677, + "learning_rate": 2.4952664014644204e-05, + "loss": 1.7408, + "step": 22058 + }, + { + "epoch": 6.7707182320441985, + "grad_norm": 0.20723536610603333, + "learning_rate": 2.494836222759254e-05, + "loss": 1.752, + "step": 22059 + }, + { + "epoch": 6.771025168815224, + "grad_norm": 0.2089354693889618, + "learning_rate": 2.4944060688115846e-05, + "loss": 1.6662, + "step": 22060 + }, + { + "epoch": 6.771332105586249, + "grad_norm": 0.2299557626247406, + "learning_rate": 2.4939759396256625e-05, + "loss": 1.7978, + "step": 22061 + }, + { + "epoch": 6.7716390423572745, + "grad_norm": 0.17900820076465607, + "learning_rate": 2.493545835205739e-05, + "loss": 1.6876, + "step": 22062 + }, + { + "epoch": 6.7719459791283, + "grad_norm": 0.21412713825702667, + "learning_rate": 2.4931157555560648e-05, + "loss": 1.7347, + "step": 22063 + }, + { + "epoch": 6.772252915899324, + "grad_norm": 0.24448172748088837, + "learning_rate": 2.49268570068089e-05, + "loss": 1.7611, + "step": 22064 + }, + { + "epoch": 6.77255985267035, + "grad_norm": 0.20153972506523132, + "learning_rate": 2.4922556705844624e-05, + "loss": 1.7347, + "step": 22065 + }, + { + "epoch": 6.772866789441375, + "grad_norm": 0.2142268568277359, + "learning_rate": 2.4918256652710387e-05, + "loss": 1.7548, + "step": 22066 + }, + { + "epoch": 6.7731737262124, + "grad_norm": 0.19735601544380188, + "learning_rate": 2.4913956847448595e-05, + "loss": 1.7138, + "step": 22067 + }, + { + "epoch": 6.773480662983426, + "grad_norm": 0.1847008913755417, + "learning_rate": 2.4909657290101824e-05, + "loss": 1.6812, + "step": 22068 + }, + { + "epoch": 6.773787599754451, + "grad_norm": 0.18406464159488678, + "learning_rate": 2.4905357980712486e-05, + "loss": 1.6992, + "step": 22069 + }, + { + "epoch": 6.774094536525475, + "grad_norm": 0.19595865905284882, + 
"learning_rate": 2.490105891932313e-05, + "loss": 1.7118, + "step": 22070 + }, + { + "epoch": 6.774401473296501, + "grad_norm": 0.1929878294467926, + "learning_rate": 2.4896760105976218e-05, + "loss": 1.7187, + "step": 22071 + }, + { + "epoch": 6.774708410067526, + "grad_norm": 0.23972687125205994, + "learning_rate": 2.4892461540714242e-05, + "loss": 1.7293, + "step": 22072 + }, + { + "epoch": 6.7750153468385514, + "grad_norm": 0.18744204938411713, + "learning_rate": 2.4888163223579675e-05, + "loss": 1.7102, + "step": 22073 + }, + { + "epoch": 6.775322283609577, + "grad_norm": 0.20168112218379974, + "learning_rate": 2.4883865154614994e-05, + "loss": 1.7655, + "step": 22074 + }, + { + "epoch": 6.775629220380601, + "grad_norm": 0.22825658321380615, + "learning_rate": 2.487956733386268e-05, + "loss": 1.7251, + "step": 22075 + }, + { + "epoch": 6.775936157151627, + "grad_norm": 0.19441691040992737, + "learning_rate": 2.4875269761365205e-05, + "loss": 1.7657, + "step": 22076 + }, + { + "epoch": 6.776243093922652, + "grad_norm": 0.22861605882644653, + "learning_rate": 2.487097243716504e-05, + "loss": 1.7132, + "step": 22077 + }, + { + "epoch": 6.776550030693677, + "grad_norm": 0.19157674908638, + "learning_rate": 2.486667536130466e-05, + "loss": 1.7448, + "step": 22078 + }, + { + "epoch": 6.776856967464703, + "grad_norm": 0.2203369438648224, + "learning_rate": 2.486237853382652e-05, + "loss": 1.7535, + "step": 22079 + }, + { + "epoch": 6.777163904235728, + "grad_norm": 0.16477027535438538, + "learning_rate": 2.4858081954773088e-05, + "loss": 1.706, + "step": 22080 + }, + { + "epoch": 6.777470841006752, + "grad_norm": 0.16536933183670044, + "learning_rate": 2.4853785624186827e-05, + "loss": 1.6725, + "step": 22081 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.18266050517559052, + "learning_rate": 2.4849489542110176e-05, + "loss": 1.6799, + "step": 22082 + }, + { + "epoch": 6.778084714548803, + "grad_norm": 0.21422190964221954, + "learning_rate": 
2.4845193708585647e-05, + "loss": 1.7275, + "step": 22083 + }, + { + "epoch": 6.778391651319828, + "grad_norm": 0.19356754422187805, + "learning_rate": 2.4840898123655622e-05, + "loss": 1.7172, + "step": 22084 + }, + { + "epoch": 6.778698588090853, + "grad_norm": 0.21090209484100342, + "learning_rate": 2.4836602787362628e-05, + "loss": 1.6581, + "step": 22085 + }, + { + "epoch": 6.779005524861878, + "grad_norm": 0.20072491466999054, + "learning_rate": 2.483230769974903e-05, + "loss": 1.7398, + "step": 22086 + }, + { + "epoch": 6.7793124616329035, + "grad_norm": 0.20642702281475067, + "learning_rate": 2.482801286085734e-05, + "loss": 1.7505, + "step": 22087 + }, + { + "epoch": 6.779619398403929, + "grad_norm": 0.20322991907596588, + "learning_rate": 2.4823718270729985e-05, + "loss": 1.6693, + "step": 22088 + }, + { + "epoch": 6.779926335174954, + "grad_norm": 0.17060843110084534, + "learning_rate": 2.4819423929409396e-05, + "loss": 1.6746, + "step": 22089 + }, + { + "epoch": 6.7802332719459795, + "grad_norm": 0.20697785913944244, + "learning_rate": 2.4815129836938024e-05, + "loss": 1.7413, + "step": 22090 + }, + { + "epoch": 6.780540208717004, + "grad_norm": 0.19845673441886902, + "learning_rate": 2.48108359933583e-05, + "loss": 1.694, + "step": 22091 + }, + { + "epoch": 6.780847145488029, + "grad_norm": 0.24547794461250305, + "learning_rate": 2.4806542398712657e-05, + "loss": 1.7316, + "step": 22092 + }, + { + "epoch": 6.781154082259055, + "grad_norm": 0.15587118268013, + "learning_rate": 2.4802249053043526e-05, + "loss": 1.667, + "step": 22093 + }, + { + "epoch": 6.78146101903008, + "grad_norm": 0.22754593193531036, + "learning_rate": 2.4797955956393336e-05, + "loss": 1.7504, + "step": 22094 + }, + { + "epoch": 6.781767955801105, + "grad_norm": 0.201420396566391, + "learning_rate": 2.4793663108804528e-05, + "loss": 1.749, + "step": 22095 + }, + { + "epoch": 6.78207489257213, + "grad_norm": 0.1952153891324997, + "learning_rate": 2.4789370510319504e-05, + "loss": 
1.7306, + "step": 22096 + }, + { + "epoch": 6.782381829343155, + "grad_norm": 0.16750730574131012, + "learning_rate": 2.4785078160980703e-05, + "loss": 1.6775, + "step": 22097 + }, + { + "epoch": 6.78268876611418, + "grad_norm": 0.19943620264530182, + "learning_rate": 2.4780786060830535e-05, + "loss": 1.7233, + "step": 22098 + }, + { + "epoch": 6.782995702885206, + "grad_norm": 0.21302999556064606, + "learning_rate": 2.4776494209911423e-05, + "loss": 1.798, + "step": 22099 + }, + { + "epoch": 6.783302639656231, + "grad_norm": 0.22949734330177307, + "learning_rate": 2.4772202608265776e-05, + "loss": 1.7678, + "step": 22100 + }, + { + "epoch": 6.783609576427256, + "grad_norm": 0.20945954322814941, + "learning_rate": 2.4767911255935993e-05, + "loss": 1.701, + "step": 22101 + }, + { + "epoch": 6.783916513198281, + "grad_norm": 0.189425989985466, + "learning_rate": 2.476362015296454e-05, + "loss": 1.7152, + "step": 22102 + }, + { + "epoch": 6.784223449969306, + "grad_norm": 0.18826924264431, + "learning_rate": 2.4759329299393747e-05, + "loss": 1.7004, + "step": 22103 + }, + { + "epoch": 6.7845303867403315, + "grad_norm": 0.20359934866428375, + "learning_rate": 2.475503869526607e-05, + "loss": 1.705, + "step": 22104 + }, + { + "epoch": 6.784837323511357, + "grad_norm": 0.22381560504436493, + "learning_rate": 2.4750748340623896e-05, + "loss": 1.7345, + "step": 22105 + }, + { + "epoch": 6.785144260282382, + "grad_norm": 0.1750476062297821, + "learning_rate": 2.474645823550963e-05, + "loss": 1.7084, + "step": 22106 + }, + { + "epoch": 6.785451197053407, + "grad_norm": 0.17943856120109558, + "learning_rate": 2.4742168379965662e-05, + "loss": 1.7417, + "step": 22107 + }, + { + "epoch": 6.785758133824432, + "grad_norm": 0.21809861063957214, + "learning_rate": 2.4737878774034397e-05, + "loss": 1.7197, + "step": 22108 + }, + { + "epoch": 6.786065070595457, + "grad_norm": 0.19761307537555695, + "learning_rate": 2.473358941775821e-05, + "loss": 1.6763, + "step": 22109 + }, + { + 
"epoch": 6.786372007366483, + "grad_norm": 0.19513878226280212, + "learning_rate": 2.472930031117951e-05, + "loss": 1.6859, + "step": 22110 + }, + { + "epoch": 6.786678944137508, + "grad_norm": 0.21796870231628418, + "learning_rate": 2.4725011454340675e-05, + "loss": 1.6957, + "step": 22111 + }, + { + "epoch": 6.786985880908533, + "grad_norm": 0.1885530948638916, + "learning_rate": 2.4720722847284088e-05, + "loss": 1.731, + "step": 22112 + }, + { + "epoch": 6.787292817679558, + "grad_norm": 0.2108110785484314, + "learning_rate": 2.4716434490052137e-05, + "loss": 1.7985, + "step": 22113 + }, + { + "epoch": 6.787599754450583, + "grad_norm": 0.23425176739692688, + "learning_rate": 2.4712146382687194e-05, + "loss": 1.7177, + "step": 22114 + }, + { + "epoch": 6.787906691221608, + "grad_norm": 0.17368707060813904, + "learning_rate": 2.4707858525231652e-05, + "loss": 1.7158, + "step": 22115 + }, + { + "epoch": 6.788213627992634, + "grad_norm": 0.22731448709964752, + "learning_rate": 2.470357091772787e-05, + "loss": 1.7037, + "step": 22116 + }, + { + "epoch": 6.788520564763659, + "grad_norm": 0.19142407178878784, + "learning_rate": 2.469928356021823e-05, + "loss": 1.7283, + "step": 22117 + }, + { + "epoch": 6.7888275015346835, + "grad_norm": 0.17515631020069122, + "learning_rate": 2.4694996452745072e-05, + "loss": 1.6812, + "step": 22118 + }, + { + "epoch": 6.789134438305709, + "grad_norm": 0.17932391166687012, + "learning_rate": 2.4690709595350838e-05, + "loss": 1.6832, + "step": 22119 + }, + { + "epoch": 6.789441375076734, + "grad_norm": 0.21177144348621368, + "learning_rate": 2.4686422988077802e-05, + "loss": 1.7443, + "step": 22120 + }, + { + "epoch": 6.7897483118477595, + "grad_norm": 0.17952793836593628, + "learning_rate": 2.4682136630968412e-05, + "loss": 1.6794, + "step": 22121 + }, + { + "epoch": 6.790055248618785, + "grad_norm": 0.18464395403862, + "learning_rate": 2.467785052406495e-05, + "loss": 1.6316, + "step": 22122 + }, + { + "epoch": 6.79036218538981, + 
"grad_norm": 0.1936565786600113, + "learning_rate": 2.4673564667409828e-05, + "loss": 1.6935, + "step": 22123 + }, + { + "epoch": 6.790669122160835, + "grad_norm": 0.21169735491275787, + "learning_rate": 2.4669279061045387e-05, + "loss": 1.7232, + "step": 22124 + }, + { + "epoch": 6.79097605893186, + "grad_norm": 0.199925035238266, + "learning_rate": 2.466499370501397e-05, + "loss": 1.8242, + "step": 22125 + }, + { + "epoch": 6.791282995702885, + "grad_norm": 0.19049705564975739, + "learning_rate": 2.4660708599357963e-05, + "loss": 1.7342, + "step": 22126 + }, + { + "epoch": 6.791589932473911, + "grad_norm": 0.16483616828918457, + "learning_rate": 2.465642374411964e-05, + "loss": 1.7144, + "step": 22127 + }, + { + "epoch": 6.791896869244935, + "grad_norm": 0.17355477809906006, + "learning_rate": 2.4652139139341413e-05, + "loss": 1.6715, + "step": 22128 + }, + { + "epoch": 6.79220380601596, + "grad_norm": 0.17448700964450836, + "learning_rate": 2.4647854785065605e-05, + "loss": 1.6669, + "step": 22129 + }, + { + "epoch": 6.792510742786986, + "grad_norm": 0.19858810305595398, + "learning_rate": 2.4643570681334553e-05, + "loss": 1.6781, + "step": 22130 + }, + { + "epoch": 6.792817679558011, + "grad_norm": 0.17350561916828156, + "learning_rate": 2.46392868281906e-05, + "loss": 1.7005, + "step": 22131 + }, + { + "epoch": 6.793124616329036, + "grad_norm": 0.17494787275791168, + "learning_rate": 2.4635003225676078e-05, + "loss": 1.7204, + "step": 22132 + }, + { + "epoch": 6.793431553100062, + "grad_norm": 0.1988590806722641, + "learning_rate": 2.463071987383332e-05, + "loss": 1.7314, + "step": 22133 + }, + { + "epoch": 6.793738489871086, + "grad_norm": 0.18046239018440247, + "learning_rate": 2.4626436772704658e-05, + "loss": 1.706, + "step": 22134 + }, + { + "epoch": 6.7940454266421115, + "grad_norm": 0.21060462296009064, + "learning_rate": 2.4622153922332402e-05, + "loss": 1.6967, + "step": 22135 + }, + { + "epoch": 6.794352363413137, + "grad_norm": 0.22328679263591766, 
+ "learning_rate": 2.4617871322758934e-05, + "loss": 1.7502, + "step": 22136 + }, + { + "epoch": 6.794659300184162, + "grad_norm": 0.18324224650859833, + "learning_rate": 2.46135889740265e-05, + "loss": 1.7183, + "step": 22137 + }, + { + "epoch": 6.7949662369551875, + "grad_norm": 0.2381133884191513, + "learning_rate": 2.4609306876177496e-05, + "loss": 1.739, + "step": 22138 + }, + { + "epoch": 6.795273173726212, + "grad_norm": 0.21471738815307617, + "learning_rate": 2.4605025029254164e-05, + "loss": 1.7466, + "step": 22139 + }, + { + "epoch": 6.795580110497237, + "grad_norm": 0.209581658244133, + "learning_rate": 2.4600743433298885e-05, + "loss": 1.7495, + "step": 22140 + }, + { + "epoch": 6.795887047268263, + "grad_norm": 0.1806897670030594, + "learning_rate": 2.459646208835394e-05, + "loss": 1.7137, + "step": 22141 + }, + { + "epoch": 6.796193984039288, + "grad_norm": 0.19036264717578888, + "learning_rate": 2.4592180994461644e-05, + "loss": 1.6993, + "step": 22142 + }, + { + "epoch": 6.796500920810313, + "grad_norm": 0.17937630414962769, + "learning_rate": 2.4587900151664335e-05, + "loss": 1.7102, + "step": 22143 + }, + { + "epoch": 6.796807857581339, + "grad_norm": 0.19278483092784882, + "learning_rate": 2.4583619560004244e-05, + "loss": 1.7058, + "step": 22144 + }, + { + "epoch": 6.797114794352363, + "grad_norm": 0.19507993757724762, + "learning_rate": 2.4579339219523744e-05, + "loss": 1.7137, + "step": 22145 + }, + { + "epoch": 6.797421731123388, + "grad_norm": 0.20417597889900208, + "learning_rate": 2.4575059130265115e-05, + "loss": 1.7156, + "step": 22146 + }, + { + "epoch": 6.797728667894414, + "grad_norm": 0.1898338943719864, + "learning_rate": 2.4570779292270658e-05, + "loss": 1.7501, + "step": 22147 + }, + { + "epoch": 6.798035604665439, + "grad_norm": 0.18777382373809814, + "learning_rate": 2.4566499705582656e-05, + "loss": 1.7192, + "step": 22148 + }, + { + "epoch": 6.798342541436464, + "grad_norm": 0.19526423513889313, + "learning_rate": 
2.4562220370243415e-05, + "loss": 1.6637, + "step": 22149 + }, + { + "epoch": 6.798649478207489, + "grad_norm": 0.23661594092845917, + "learning_rate": 2.455794128629522e-05, + "loss": 1.7557, + "step": 22150 + }, + { + "epoch": 6.798956414978514, + "grad_norm": 0.27043846249580383, + "learning_rate": 2.4553662453780362e-05, + "loss": 1.7712, + "step": 22151 + }, + { + "epoch": 6.7992633517495396, + "grad_norm": 0.17968088388442993, + "learning_rate": 2.454938387274111e-05, + "loss": 1.6721, + "step": 22152 + }, + { + "epoch": 6.799570288520565, + "grad_norm": 0.21456219255924225, + "learning_rate": 2.45451055432198e-05, + "loss": 1.7249, + "step": 22153 + }, + { + "epoch": 6.79987722529159, + "grad_norm": 0.22433941066265106, + "learning_rate": 2.4540827465258638e-05, + "loss": 1.7319, + "step": 22154 + }, + { + "epoch": 6.800184162062616, + "grad_norm": 0.2808871567249298, + "learning_rate": 2.4536549638899976e-05, + "loss": 1.7802, + "step": 22155 + }, + { + "epoch": 6.80049109883364, + "grad_norm": 0.28654494881629944, + "learning_rate": 2.4532272064186018e-05, + "loss": 1.7431, + "step": 22156 + }, + { + "epoch": 6.800798035604665, + "grad_norm": 0.19476976990699768, + "learning_rate": 2.45279947411591e-05, + "loss": 1.6792, + "step": 22157 + }, + { + "epoch": 6.801104972375691, + "grad_norm": 0.25114744901657104, + "learning_rate": 2.452371766986146e-05, + "loss": 1.7458, + "step": 22158 + }, + { + "epoch": 6.801411909146716, + "grad_norm": 0.18099439144134521, + "learning_rate": 2.451944085033538e-05, + "loss": 1.6952, + "step": 22159 + }, + { + "epoch": 6.8017188459177405, + "grad_norm": 0.21425777673721313, + "learning_rate": 2.4515164282623138e-05, + "loss": 1.7593, + "step": 22160 + }, + { + "epoch": 6.802025782688766, + "grad_norm": 0.19833709299564362, + "learning_rate": 2.4510887966766937e-05, + "loss": 1.6643, + "step": 22161 + }, + { + "epoch": 6.802332719459791, + "grad_norm": 0.20073090493679047, + "learning_rate": 2.45066119028091e-05, + "loss": 
1.7112, + "step": 22162 + }, + { + "epoch": 6.8026396562308165, + "grad_norm": 0.18599852919578552, + "learning_rate": 2.4502336090791872e-05, + "loss": 1.7121, + "step": 22163 + }, + { + "epoch": 6.802946593001842, + "grad_norm": 0.22036875784397125, + "learning_rate": 2.4498060530757498e-05, + "loss": 1.7944, + "step": 22164 + }, + { + "epoch": 6.803253529772867, + "grad_norm": 0.19521577656269073, + "learning_rate": 2.4493785222748243e-05, + "loss": 1.7463, + "step": 22165 + }, + { + "epoch": 6.803560466543892, + "grad_norm": 0.22010843455791473, + "learning_rate": 2.448951016680635e-05, + "loss": 1.6951, + "step": 22166 + }, + { + "epoch": 6.803867403314917, + "grad_norm": 0.20490090548992157, + "learning_rate": 2.448523536297407e-05, + "loss": 1.7723, + "step": 22167 + }, + { + "epoch": 6.804174340085942, + "grad_norm": 0.2298613339662552, + "learning_rate": 2.4480960811293648e-05, + "loss": 1.7644, + "step": 22168 + }, + { + "epoch": 6.804481276856968, + "grad_norm": 0.18560375273227692, + "learning_rate": 2.4476686511807306e-05, + "loss": 1.686, + "step": 22169 + }, + { + "epoch": 6.804788213627993, + "grad_norm": 0.24295780062675476, + "learning_rate": 2.4472412464557347e-05, + "loss": 1.7561, + "step": 22170 + }, + { + "epoch": 6.805095150399017, + "grad_norm": 0.1962144672870636, + "learning_rate": 2.4468138669585932e-05, + "loss": 1.7438, + "step": 22171 + }, + { + "epoch": 6.805402087170043, + "grad_norm": 0.21924439072608948, + "learning_rate": 2.4463865126935377e-05, + "loss": 1.7488, + "step": 22172 + }, + { + "epoch": 6.805709023941068, + "grad_norm": 0.1777856945991516, + "learning_rate": 2.4459591836647833e-05, + "loss": 1.6664, + "step": 22173 + }, + { + "epoch": 6.806015960712093, + "grad_norm": 0.24367454648017883, + "learning_rate": 2.4455318798765593e-05, + "loss": 1.7441, + "step": 22174 + }, + { + "epoch": 6.806322897483119, + "grad_norm": 0.2269427478313446, + "learning_rate": 2.4451046013330865e-05, + "loss": 1.7809, + "step": 22175 + }, 
+ { + "epoch": 6.806629834254144, + "grad_norm": 0.21986174583435059, + "learning_rate": 2.444677348038587e-05, + "loss": 1.7453, + "step": 22176 + }, + { + "epoch": 6.8069367710251685, + "grad_norm": 0.1773367077112198, + "learning_rate": 2.4442501199972862e-05, + "loss": 1.6927, + "step": 22177 + }, + { + "epoch": 6.807243707796194, + "grad_norm": 0.20545031130313873, + "learning_rate": 2.4438229172133997e-05, + "loss": 1.7782, + "step": 22178 + }, + { + "epoch": 6.807550644567219, + "grad_norm": 0.1997014880180359, + "learning_rate": 2.443395739691155e-05, + "loss": 1.7295, + "step": 22179 + }, + { + "epoch": 6.8078575813382445, + "grad_norm": 0.19634006917476654, + "learning_rate": 2.4429685874347723e-05, + "loss": 1.7017, + "step": 22180 + }, + { + "epoch": 6.80816451810927, + "grad_norm": 0.2007836550474167, + "learning_rate": 2.442541460448473e-05, + "loss": 1.7252, + "step": 22181 + }, + { + "epoch": 6.808471454880294, + "grad_norm": 0.22204343974590302, + "learning_rate": 2.4421143587364775e-05, + "loss": 1.7526, + "step": 22182 + }, + { + "epoch": 6.80877839165132, + "grad_norm": 0.1906677633523941, + "learning_rate": 2.4416872823030073e-05, + "loss": 1.7121, + "step": 22183 + }, + { + "epoch": 6.809085328422345, + "grad_norm": 0.17165397107601166, + "learning_rate": 2.441260231152283e-05, + "loss": 1.6942, + "step": 22184 + }, + { + "epoch": 6.80939226519337, + "grad_norm": 0.17022575438022614, + "learning_rate": 2.4408332052885246e-05, + "loss": 1.6973, + "step": 22185 + }, + { + "epoch": 6.809699201964396, + "grad_norm": 0.16693587601184845, + "learning_rate": 2.4404062047159503e-05, + "loss": 1.6996, + "step": 22186 + }, + { + "epoch": 6.810006138735421, + "grad_norm": 0.2251187264919281, + "learning_rate": 2.4399792294387864e-05, + "loss": 1.778, + "step": 22187 + }, + { + "epoch": 6.810313075506445, + "grad_norm": 0.20622244477272034, + "learning_rate": 2.439552279461244e-05, + "loss": 1.7273, + "step": 22188 + }, + { + "epoch": 6.810620012277471, + 
"grad_norm": 0.19736994802951813, + "learning_rate": 2.439125354787551e-05, + "loss": 1.7096, + "step": 22189 + }, + { + "epoch": 6.810926949048496, + "grad_norm": 0.22955237329006195, + "learning_rate": 2.4386984554219182e-05, + "loss": 1.7859, + "step": 22190 + }, + { + "epoch": 6.811233885819521, + "grad_norm": 0.2283364087343216, + "learning_rate": 2.43827158136857e-05, + "loss": 1.6999, + "step": 22191 + }, + { + "epoch": 6.811540822590547, + "grad_norm": 0.18393704295158386, + "learning_rate": 2.4378447326317243e-05, + "loss": 1.654, + "step": 22192 + }, + { + "epoch": 6.811847759361571, + "grad_norm": 0.2031537890434265, + "learning_rate": 2.4374179092155986e-05, + "loss": 1.7353, + "step": 22193 + }, + { + "epoch": 6.8121546961325965, + "grad_norm": 0.1849071979522705, + "learning_rate": 2.4369911111244125e-05, + "loss": 1.7157, + "step": 22194 + }, + { + "epoch": 6.812461632903622, + "grad_norm": 0.20584192872047424, + "learning_rate": 2.4365643383623787e-05, + "loss": 1.7529, + "step": 22195 + }, + { + "epoch": 6.812768569674647, + "grad_norm": 0.24152903258800507, + "learning_rate": 2.436137590933721e-05, + "loss": 1.7662, + "step": 22196 + }, + { + "epoch": 6.8130755064456725, + "grad_norm": 0.26625362038612366, + "learning_rate": 2.4357108688426532e-05, + "loss": 1.7624, + "step": 22197 + }, + { + "epoch": 6.813382443216698, + "grad_norm": 0.27122190594673157, + "learning_rate": 2.435284172093395e-05, + "loss": 1.747, + "step": 22198 + }, + { + "epoch": 6.813689379987722, + "grad_norm": 0.18996810913085938, + "learning_rate": 2.434857500690161e-05, + "loss": 1.7377, + "step": 22199 + }, + { + "epoch": 6.813996316758748, + "grad_norm": 0.22355122864246368, + "learning_rate": 2.4344308546371686e-05, + "loss": 1.6865, + "step": 22200 + }, + { + "epoch": 6.814303253529773, + "grad_norm": 0.18468965590000153, + "learning_rate": 2.4340042339386348e-05, + "loss": 1.7091, + "step": 22201 + }, + { + "epoch": 6.814610190300798, + "grad_norm": 
0.25356602668762207, + "learning_rate": 2.4335776385987747e-05, + "loss": 1.7482, + "step": 22202 + }, + { + "epoch": 6.814917127071823, + "grad_norm": 0.22462932765483856, + "learning_rate": 2.433151068621803e-05, + "loss": 1.6985, + "step": 22203 + }, + { + "epoch": 6.815224063842848, + "grad_norm": 0.2540687024593353, + "learning_rate": 2.43272452401194e-05, + "loss": 1.7878, + "step": 22204 + }, + { + "epoch": 6.815531000613873, + "grad_norm": 0.267811119556427, + "learning_rate": 2.432298004773395e-05, + "loss": 1.7862, + "step": 22205 + }, + { + "epoch": 6.815837937384899, + "grad_norm": 0.23089277744293213, + "learning_rate": 2.4318715109103894e-05, + "loss": 1.6892, + "step": 22206 + }, + { + "epoch": 6.816144874155924, + "grad_norm": 0.22740885615348816, + "learning_rate": 2.431445042427131e-05, + "loss": 1.6934, + "step": 22207 + }, + { + "epoch": 6.816451810926949, + "grad_norm": 0.18555034697055817, + "learning_rate": 2.4310185993278405e-05, + "loss": 1.6747, + "step": 22208 + }, + { + "epoch": 6.816758747697974, + "grad_norm": 0.23693101108074188, + "learning_rate": 2.430592181616729e-05, + "loss": 1.7212, + "step": 22209 + }, + { + "epoch": 6.817065684468999, + "grad_norm": 0.20551325380802155, + "learning_rate": 2.4301657892980128e-05, + "loss": 1.711, + "step": 22210 + }, + { + "epoch": 6.8173726212400245, + "grad_norm": 0.20047837495803833, + "learning_rate": 2.4297394223759056e-05, + "loss": 1.729, + "step": 22211 + }, + { + "epoch": 6.81767955801105, + "grad_norm": 0.22111602127552032, + "learning_rate": 2.4293130808546167e-05, + "loss": 1.706, + "step": 22212 + }, + { + "epoch": 6.817986494782075, + "grad_norm": 0.18199655413627625, + "learning_rate": 2.428886764738364e-05, + "loss": 1.7082, + "step": 22213 + }, + { + "epoch": 6.8182934315531, + "grad_norm": 0.18591821193695068, + "learning_rate": 2.4284604740313595e-05, + "loss": 1.6957, + "step": 22214 + }, + { + "epoch": 6.818600368324125, + "grad_norm": 0.19427789747714996, + 
"learning_rate": 2.4280342087378154e-05, + "loss": 1.7396, + "step": 22215 + }, + { + "epoch": 6.81890730509515, + "grad_norm": 0.233908548951149, + "learning_rate": 2.427607968861945e-05, + "loss": 1.741, + "step": 22216 + }, + { + "epoch": 6.819214241866176, + "grad_norm": 0.168926402926445, + "learning_rate": 2.4271817544079606e-05, + "loss": 1.7023, + "step": 22217 + }, + { + "epoch": 6.819521178637201, + "grad_norm": 0.34345322847366333, + "learning_rate": 2.426755565380074e-05, + "loss": 1.7201, + "step": 22218 + }, + { + "epoch": 6.819828115408226, + "grad_norm": 0.21531274914741516, + "learning_rate": 2.4263294017824974e-05, + "loss": 1.725, + "step": 22219 + }, + { + "epoch": 6.820135052179251, + "grad_norm": 0.25251755118370056, + "learning_rate": 2.4259032636194395e-05, + "loss": 1.6764, + "step": 22220 + }, + { + "epoch": 6.820441988950276, + "grad_norm": 0.246616929769516, + "learning_rate": 2.4254771508951186e-05, + "loss": 1.7971, + "step": 22221 + }, + { + "epoch": 6.820748925721301, + "grad_norm": 0.20998120307922363, + "learning_rate": 2.4250510636137375e-05, + "loss": 1.723, + "step": 22222 + }, + { + "epoch": 6.821055862492327, + "grad_norm": 0.28388240933418274, + "learning_rate": 2.4246250017795148e-05, + "loss": 1.7508, + "step": 22223 + }, + { + "epoch": 6.821362799263352, + "grad_norm": 0.18146218359470367, + "learning_rate": 2.4241989653966535e-05, + "loss": 1.7254, + "step": 22224 + }, + { + "epoch": 6.8216697360343765, + "grad_norm": 0.2384043037891388, + "learning_rate": 2.4237729544693694e-05, + "loss": 1.7624, + "step": 22225 + }, + { + "epoch": 6.821976672805402, + "grad_norm": 0.21908332407474518, + "learning_rate": 2.4233469690018714e-05, + "loss": 1.7595, + "step": 22226 + }, + { + "epoch": 6.822283609576427, + "grad_norm": 0.20963989198207855, + "learning_rate": 2.422921008998369e-05, + "loss": 1.6679, + "step": 22227 + }, + { + "epoch": 6.8225905463474525, + "grad_norm": 0.21045777201652527, + "learning_rate": 
2.4224950744630732e-05, + "loss": 1.657, + "step": 22228 + }, + { + "epoch": 6.822897483118478, + "grad_norm": 0.21567417681217194, + "learning_rate": 2.4220691654001883e-05, + "loss": 1.7788, + "step": 22229 + }, + { + "epoch": 6.823204419889503, + "grad_norm": 0.2908889055252075, + "learning_rate": 2.4216432818139283e-05, + "loss": 1.7633, + "step": 22230 + }, + { + "epoch": 6.823511356660528, + "grad_norm": 0.22683843970298767, + "learning_rate": 2.4212174237085007e-05, + "loss": 1.7974, + "step": 22231 + }, + { + "epoch": 6.823818293431553, + "grad_norm": 0.25254085659980774, + "learning_rate": 2.420791591088114e-05, + "loss": 1.6871, + "step": 22232 + }, + { + "epoch": 6.824125230202578, + "grad_norm": 0.1804734766483307, + "learning_rate": 2.420365783956977e-05, + "loss": 1.7331, + "step": 22233 + }, + { + "epoch": 6.824432166973604, + "grad_norm": 0.21634186804294586, + "learning_rate": 2.419940002319297e-05, + "loss": 1.6641, + "step": 22234 + }, + { + "epoch": 6.824739103744628, + "grad_norm": 0.1941644847393036, + "learning_rate": 2.4195142461792818e-05, + "loss": 1.7198, + "step": 22235 + }, + { + "epoch": 6.8250460405156534, + "grad_norm": 0.20209947228431702, + "learning_rate": 2.4190885155411398e-05, + "loss": 1.7137, + "step": 22236 + }, + { + "epoch": 6.825352977286679, + "grad_norm": 0.17161925137043, + "learning_rate": 2.4186628104090757e-05, + "loss": 1.7059, + "step": 22237 + }, + { + "epoch": 6.825659914057704, + "grad_norm": 0.19352135062217712, + "learning_rate": 2.4182371307873025e-05, + "loss": 1.6699, + "step": 22238 + }, + { + "epoch": 6.8259668508287294, + "grad_norm": 0.20384716987609863, + "learning_rate": 2.417811476680019e-05, + "loss": 1.7167, + "step": 22239 + }, + { + "epoch": 6.826273787599755, + "grad_norm": 0.22764970362186432, + "learning_rate": 2.4173858480914402e-05, + "loss": 1.7085, + "step": 22240 + }, + { + "epoch": 6.82658072437078, + "grad_norm": 0.1988842487335205, + "learning_rate": 2.4169602450257645e-05, + "loss": 
1.7458, + "step": 22241 + }, + { + "epoch": 6.826887661141805, + "grad_norm": 0.20511481165885925, + "learning_rate": 2.416534667487203e-05, + "loss": 1.7597, + "step": 22242 + }, + { + "epoch": 6.82719459791283, + "grad_norm": 0.20906902849674225, + "learning_rate": 2.4161091154799608e-05, + "loss": 1.7418, + "step": 22243 + }, + { + "epoch": 6.827501534683855, + "grad_norm": 0.22555884718894958, + "learning_rate": 2.4156835890082426e-05, + "loss": 1.8198, + "step": 22244 + }, + { + "epoch": 6.827808471454881, + "grad_norm": 0.25855058431625366, + "learning_rate": 2.4152580880762553e-05, + "loss": 1.7588, + "step": 22245 + }, + { + "epoch": 6.828115408225905, + "grad_norm": 0.16975226998329163, + "learning_rate": 2.4148326126881993e-05, + "loss": 1.6897, + "step": 22246 + }, + { + "epoch": 6.82842234499693, + "grad_norm": 0.2336781919002533, + "learning_rate": 2.414407162848284e-05, + "loss": 1.7412, + "step": 22247 + }, + { + "epoch": 6.828729281767956, + "grad_norm": 0.1660032868385315, + "learning_rate": 2.4139817385607126e-05, + "loss": 1.6221, + "step": 22248 + }, + { + "epoch": 6.829036218538981, + "grad_norm": 0.22926606237888336, + "learning_rate": 2.41355633982969e-05, + "loss": 1.7201, + "step": 22249 + }, + { + "epoch": 6.829343155310006, + "grad_norm": 0.1759374737739563, + "learning_rate": 2.4131309666594193e-05, + "loss": 1.6842, + "step": 22250 + }, + { + "epoch": 6.829650092081032, + "grad_norm": 0.23005764186382294, + "learning_rate": 2.4127056190541042e-05, + "loss": 1.7327, + "step": 22251 + }, + { + "epoch": 6.829957028852056, + "grad_norm": 0.2216579169034958, + "learning_rate": 2.412280297017949e-05, + "loss": 1.7856, + "step": 22252 + }, + { + "epoch": 6.8302639656230815, + "grad_norm": 0.22133000195026398, + "learning_rate": 2.4118550005551565e-05, + "loss": 1.7711, + "step": 22253 + }, + { + "epoch": 6.830570902394107, + "grad_norm": 0.21860742568969727, + "learning_rate": 2.41142972966993e-05, + "loss": 1.7276, + "step": 22254 + }, + { + 
"epoch": 6.830877839165132, + "grad_norm": 0.2484082579612732, + "learning_rate": 2.4110044843664726e-05, + "loss": 1.7038, + "step": 22255 + }, + { + "epoch": 6.8311847759361575, + "grad_norm": 0.22288921475410461, + "learning_rate": 2.410579264648984e-05, + "loss": 1.7149, + "step": 22256 + }, + { + "epoch": 6.831491712707182, + "grad_norm": 0.23635484278202057, + "learning_rate": 2.4101540705216724e-05, + "loss": 1.7296, + "step": 22257 + }, + { + "epoch": 6.831798649478207, + "grad_norm": 0.24334096908569336, + "learning_rate": 2.4097289019887324e-05, + "loss": 1.7458, + "step": 22258 + }, + { + "epoch": 6.832105586249233, + "grad_norm": 0.23019789159297943, + "learning_rate": 2.4093037590543716e-05, + "loss": 1.7296, + "step": 22259 + }, + { + "epoch": 6.832412523020258, + "grad_norm": 0.23739024996757507, + "learning_rate": 2.4088786417227895e-05, + "loss": 1.7844, + "step": 22260 + }, + { + "epoch": 6.832719459791283, + "grad_norm": 0.1969252973794937, + "learning_rate": 2.4084535499981873e-05, + "loss": 1.6692, + "step": 22261 + }, + { + "epoch": 6.833026396562309, + "grad_norm": 0.20111167430877686, + "learning_rate": 2.4080284838847682e-05, + "loss": 1.7813, + "step": 22262 + }, + { + "epoch": 6.833333333333333, + "grad_norm": 0.26112934947013855, + "learning_rate": 2.4076034433867268e-05, + "loss": 1.6852, + "step": 22263 + }, + { + "epoch": 6.833640270104358, + "grad_norm": 0.24244411289691925, + "learning_rate": 2.40717842850827e-05, + "loss": 1.7054, + "step": 22264 + }, + { + "epoch": 6.833947206875384, + "grad_norm": 0.22703053057193756, + "learning_rate": 2.406753439253595e-05, + "loss": 1.7655, + "step": 22265 + }, + { + "epoch": 6.834254143646409, + "grad_norm": 0.23935651779174805, + "learning_rate": 2.4063284756269027e-05, + "loss": 1.7462, + "step": 22266 + }, + { + "epoch": 6.834561080417434, + "grad_norm": 0.2169155478477478, + "learning_rate": 2.4059035376323928e-05, + "loss": 1.7059, + "step": 22267 + }, + { + "epoch": 6.834868017188459, + 
"grad_norm": 0.2045663446187973, + "learning_rate": 2.4054786252742645e-05, + "loss": 1.7166, + "step": 22268 + }, + { + "epoch": 6.835174953959484, + "grad_norm": 0.22796253859996796, + "learning_rate": 2.4050537385567172e-05, + "loss": 1.7361, + "step": 22269 + }, + { + "epoch": 6.8354818907305095, + "grad_norm": 0.20807915925979614, + "learning_rate": 2.4046288774839497e-05, + "loss": 1.7007, + "step": 22270 + }, + { + "epoch": 6.835788827501535, + "grad_norm": 0.22157903015613556, + "learning_rate": 2.4042040420601607e-05, + "loss": 1.7409, + "step": 22271 + }, + { + "epoch": 6.83609576427256, + "grad_norm": 0.21494148671627045, + "learning_rate": 2.4037792322895492e-05, + "loss": 1.7975, + "step": 22272 + }, + { + "epoch": 6.8364027010435855, + "grad_norm": 0.2275875061750412, + "learning_rate": 2.403354448176311e-05, + "loss": 1.6759, + "step": 22273 + }, + { + "epoch": 6.83670963781461, + "grad_norm": 0.21105073392391205, + "learning_rate": 2.4029296897246496e-05, + "loss": 1.7229, + "step": 22274 + }, + { + "epoch": 6.837016574585635, + "grad_norm": 0.21957579255104065, + "learning_rate": 2.4025049569387553e-05, + "loss": 1.737, + "step": 22275 + }, + { + "epoch": 6.837323511356661, + "grad_norm": 0.2291470617055893, + "learning_rate": 2.4020802498228335e-05, + "loss": 1.6731, + "step": 22276 + }, + { + "epoch": 6.837630448127686, + "grad_norm": 0.18196065723896027, + "learning_rate": 2.401655568381074e-05, + "loss": 1.6823, + "step": 22277 + }, + { + "epoch": 6.83793738489871, + "grad_norm": 0.20915214717388153, + "learning_rate": 2.401230912617678e-05, + "loss": 1.7038, + "step": 22278 + }, + { + "epoch": 6.838244321669736, + "grad_norm": 0.2060854732990265, + "learning_rate": 2.4008062825368437e-05, + "loss": 1.7514, + "step": 22279 + }, + { + "epoch": 6.838551258440761, + "grad_norm": 0.20858527719974518, + "learning_rate": 2.400381678142762e-05, + "loss": 1.7494, + "step": 22280 + }, + { + "epoch": 6.838858195211786, + "grad_norm": 0.19124718010425568, 
+ "learning_rate": 2.3999570994396352e-05, + "loss": 1.7641, + "step": 22281 + }, + { + "epoch": 6.839165131982812, + "grad_norm": 0.28222304582595825, + "learning_rate": 2.3995325464316525e-05, + "loss": 1.7204, + "step": 22282 + }, + { + "epoch": 6.839472068753837, + "grad_norm": 0.20047026872634888, + "learning_rate": 2.399108019123016e-05, + "loss": 1.7261, + "step": 22283 + }, + { + "epoch": 6.8397790055248615, + "grad_norm": 0.2758225202560425, + "learning_rate": 2.3986835175179178e-05, + "loss": 1.6903, + "step": 22284 + }, + { + "epoch": 6.840085942295887, + "grad_norm": 0.2719727158546448, + "learning_rate": 2.3982590416205535e-05, + "loss": 1.8716, + "step": 22285 + }, + { + "epoch": 6.840392879066912, + "grad_norm": 0.3524060845375061, + "learning_rate": 2.3978345914351193e-05, + "loss": 1.7778, + "step": 22286 + }, + { + "epoch": 6.8406998158379375, + "grad_norm": 0.2711596190929413, + "learning_rate": 2.397410166965808e-05, + "loss": 1.7111, + "step": 22287 + }, + { + "epoch": 6.841006752608963, + "grad_norm": 0.2818336486816406, + "learning_rate": 2.396985768216815e-05, + "loss": 1.7292, + "step": 22288 + }, + { + "epoch": 6.841313689379987, + "grad_norm": 0.19677700102329254, + "learning_rate": 2.3965613951923343e-05, + "loss": 1.6975, + "step": 22289 + }, + { + "epoch": 6.841620626151013, + "grad_norm": 0.300997257232666, + "learning_rate": 2.3961370478965583e-05, + "loss": 1.7014, + "step": 22290 + }, + { + "epoch": 6.841927562922038, + "grad_norm": 0.23549453914165497, + "learning_rate": 2.395712726333686e-05, + "loss": 1.7052, + "step": 22291 + }, + { + "epoch": 6.842234499693063, + "grad_norm": 0.29898303747177124, + "learning_rate": 2.3952884305079026e-05, + "loss": 1.7828, + "step": 22292 + }, + { + "epoch": 6.842541436464089, + "grad_norm": 0.26108843088150024, + "learning_rate": 2.3948641604234096e-05, + "loss": 1.7023, + "step": 22293 + }, + { + "epoch": 6.842848373235114, + "grad_norm": 0.18781059980392456, + "learning_rate": 
2.394439916084392e-05, + "loss": 1.6808, + "step": 22294 + }, + { + "epoch": 6.843155310006138, + "grad_norm": 0.22659730911254883, + "learning_rate": 2.3940156974950485e-05, + "loss": 1.7224, + "step": 22295 + }, + { + "epoch": 6.843462246777164, + "grad_norm": 0.17422057688236237, + "learning_rate": 2.3935915046595713e-05, + "loss": 1.668, + "step": 22296 + }, + { + "epoch": 6.843769183548189, + "grad_norm": 0.2008846402168274, + "learning_rate": 2.393167337582146e-05, + "loss": 1.7283, + "step": 22297 + }, + { + "epoch": 6.844076120319214, + "grad_norm": 0.20376072824001312, + "learning_rate": 2.392743196266973e-05, + "loss": 1.74, + "step": 22298 + }, + { + "epoch": 6.84438305709024, + "grad_norm": 0.16353756189346313, + "learning_rate": 2.3923190807182372e-05, + "loss": 1.717, + "step": 22299 + }, + { + "epoch": 6.844689993861264, + "grad_norm": 0.18436652421951294, + "learning_rate": 2.3918949909401335e-05, + "loss": 1.7257, + "step": 22300 + }, + { + "epoch": 6.8449969306322895, + "grad_norm": 0.2038460522890091, + "learning_rate": 2.3914709269368523e-05, + "loss": 1.7254, + "step": 22301 + }, + { + "epoch": 6.845303867403315, + "grad_norm": 0.17111587524414062, + "learning_rate": 2.3910468887125842e-05, + "loss": 1.6993, + "step": 22302 + }, + { + "epoch": 6.84561080417434, + "grad_norm": 0.20049406588077545, + "learning_rate": 2.3906228762715207e-05, + "loss": 1.7099, + "step": 22303 + }, + { + "epoch": 6.8459177409453655, + "grad_norm": 0.2168554663658142, + "learning_rate": 2.39019888961785e-05, + "loss": 1.725, + "step": 22304 + }, + { + "epoch": 6.846224677716391, + "grad_norm": 0.2228514850139618, + "learning_rate": 2.3897749287557647e-05, + "loss": 1.7348, + "step": 22305 + }, + { + "epoch": 6.846531614487415, + "grad_norm": 0.17166151106357574, + "learning_rate": 2.3893509936894532e-05, + "loss": 1.7451, + "step": 22306 + }, + { + "epoch": 6.846838551258441, + "grad_norm": 0.24896936118602753, + "learning_rate": 2.3889270844231026e-05, + "loss": 
1.7397, + "step": 22307 + }, + { + "epoch": 6.847145488029466, + "grad_norm": 0.1984332948923111, + "learning_rate": 2.3885032009609098e-05, + "loss": 1.7167, + "step": 22308 + }, + { + "epoch": 6.847452424800491, + "grad_norm": 0.20763449370861053, + "learning_rate": 2.388079343307055e-05, + "loss": 1.7154, + "step": 22309 + }, + { + "epoch": 6.847759361571516, + "grad_norm": 0.21818630397319794, + "learning_rate": 2.3876555114657346e-05, + "loss": 1.7364, + "step": 22310 + }, + { + "epoch": 6.848066298342541, + "grad_norm": 0.21220166981220245, + "learning_rate": 2.3872317054411298e-05, + "loss": 1.74, + "step": 22311 + }, + { + "epoch": 6.848373235113566, + "grad_norm": 0.17486892640590668, + "learning_rate": 2.3868079252374343e-05, + "loss": 1.68, + "step": 22312 + }, + { + "epoch": 6.848680171884592, + "grad_norm": 0.20809298753738403, + "learning_rate": 2.386384170858837e-05, + "loss": 1.8102, + "step": 22313 + }, + { + "epoch": 6.848987108655617, + "grad_norm": 0.19927671551704407, + "learning_rate": 2.385960442309519e-05, + "loss": 1.7742, + "step": 22314 + }, + { + "epoch": 6.849294045426642, + "grad_norm": 0.18705040216445923, + "learning_rate": 2.3855367395936757e-05, + "loss": 1.689, + "step": 22315 + }, + { + "epoch": 6.849600982197668, + "grad_norm": 0.22023466229438782, + "learning_rate": 2.385113062715487e-05, + "loss": 1.7819, + "step": 22316 + }, + { + "epoch": 6.849907918968692, + "grad_norm": 0.24443435668945312, + "learning_rate": 2.384689411679146e-05, + "loss": 1.6533, + "step": 22317 + }, + { + "epoch": 6.850214855739718, + "grad_norm": 0.20103834569454193, + "learning_rate": 2.3842657864888368e-05, + "loss": 1.7274, + "step": 22318 + }, + { + "epoch": 6.850521792510743, + "grad_norm": 0.2265254408121109, + "learning_rate": 2.3838421871487465e-05, + "loss": 1.7874, + "step": 22319 + }, + { + "epoch": 6.850828729281768, + "grad_norm": 0.2775460183620453, + "learning_rate": 2.383418613663061e-05, + "loss": 1.8038, + "step": 22320 + }, + { + 
"epoch": 6.851135666052793, + "grad_norm": 0.2001011073589325, + "learning_rate": 2.3829950660359663e-05, + "loss": 1.7135, + "step": 22321 + }, + { + "epoch": 6.851442602823818, + "grad_norm": 0.21427330374717712, + "learning_rate": 2.382571544271648e-05, + "loss": 1.7155, + "step": 22322 + }, + { + "epoch": 6.851749539594843, + "grad_norm": 0.18420884013175964, + "learning_rate": 2.382148048374292e-05, + "loss": 1.7178, + "step": 22323 + }, + { + "epoch": 6.852056476365869, + "grad_norm": 0.19436471164226532, + "learning_rate": 2.3817245783480813e-05, + "loss": 1.7396, + "step": 22324 + }, + { + "epoch": 6.852363413136894, + "grad_norm": 0.23191674053668976, + "learning_rate": 2.381301134197207e-05, + "loss": 1.7102, + "step": 22325 + }, + { + "epoch": 6.852670349907919, + "grad_norm": 0.20381706953048706, + "learning_rate": 2.3808777159258462e-05, + "loss": 1.7671, + "step": 22326 + }, + { + "epoch": 6.852977286678944, + "grad_norm": 0.20202197134494781, + "learning_rate": 2.3804543235381897e-05, + "loss": 1.6774, + "step": 22327 + }, + { + "epoch": 6.853284223449969, + "grad_norm": 0.23496322333812714, + "learning_rate": 2.380030957038416e-05, + "loss": 1.7745, + "step": 22328 + }, + { + "epoch": 6.8535911602209945, + "grad_norm": 0.22473813593387604, + "learning_rate": 2.379607616430714e-05, + "loss": 1.7319, + "step": 22329 + }, + { + "epoch": 6.85389809699202, + "grad_norm": 0.2149224430322647, + "learning_rate": 2.3791843017192667e-05, + "loss": 1.77, + "step": 22330 + }, + { + "epoch": 6.854205033763045, + "grad_norm": 0.21146108210086823, + "learning_rate": 2.378761012908253e-05, + "loss": 1.762, + "step": 22331 + }, + { + "epoch": 6.85451197053407, + "grad_norm": 0.2031458169221878, + "learning_rate": 2.3783377500018626e-05, + "loss": 1.7007, + "step": 22332 + }, + { + "epoch": 6.854818907305095, + "grad_norm": 0.19763319194316864, + "learning_rate": 2.377914513004272e-05, + "loss": 1.6899, + "step": 22333 + }, + { + "epoch": 6.85512584407612, + 
"grad_norm": 0.17337046563625336, + "learning_rate": 2.3774913019196688e-05, + "loss": 1.683, + "step": 22334 + }, + { + "epoch": 6.855432780847146, + "grad_norm": 0.1850815862417221, + "learning_rate": 2.3770681167522328e-05, + "loss": 1.7284, + "step": 22335 + }, + { + "epoch": 6.855739717618171, + "grad_norm": 0.19693362712860107, + "learning_rate": 2.3766449575061477e-05, + "loss": 1.7694, + "step": 22336 + }, + { + "epoch": 6.856046654389196, + "grad_norm": 0.1981547325849533, + "learning_rate": 2.376221824185595e-05, + "loss": 1.736, + "step": 22337 + }, + { + "epoch": 6.856353591160221, + "grad_norm": 0.17638558149337769, + "learning_rate": 2.375798716794756e-05, + "loss": 1.6979, + "step": 22338 + }, + { + "epoch": 6.856660527931246, + "grad_norm": 0.20189990103244781, + "learning_rate": 2.3753756353378116e-05, + "loss": 1.7876, + "step": 22339 + }, + { + "epoch": 6.856967464702271, + "grad_norm": 0.1880224347114563, + "learning_rate": 2.3749525798189438e-05, + "loss": 1.7134, + "step": 22340 + }, + { + "epoch": 6.857274401473297, + "grad_norm": 0.2464265078306198, + "learning_rate": 2.3745295502423316e-05, + "loss": 1.7782, + "step": 22341 + }, + { + "epoch": 6.857581338244322, + "grad_norm": 0.19218963384628296, + "learning_rate": 2.3741065466121604e-05, + "loss": 1.7027, + "step": 22342 + }, + { + "epoch": 6.8578882750153465, + "grad_norm": 0.27446448802948, + "learning_rate": 2.3736835689326043e-05, + "loss": 1.772, + "step": 22343 + }, + { + "epoch": 6.858195211786372, + "grad_norm": 0.19315828382968903, + "learning_rate": 2.3732606172078497e-05, + "loss": 1.6855, + "step": 22344 + }, + { + "epoch": 6.858502148557397, + "grad_norm": 0.2668892741203308, + "learning_rate": 2.372837691442072e-05, + "loss": 1.7703, + "step": 22345 + }, + { + "epoch": 6.8588090853284225, + "grad_norm": 0.23552054166793823, + "learning_rate": 2.3724147916394497e-05, + "loss": 1.7184, + "step": 22346 + }, + { + "epoch": 6.859116022099448, + "grad_norm": 0.3194984793663025, + 
"learning_rate": 2.3719919178041682e-05, + "loss": 1.7531, + "step": 22347 + }, + { + "epoch": 6.859422958870473, + "grad_norm": 0.19298717379570007, + "learning_rate": 2.371569069940399e-05, + "loss": 1.7064, + "step": 22348 + }, + { + "epoch": 6.859729895641498, + "grad_norm": 0.2990693151950836, + "learning_rate": 2.3711462480523293e-05, + "loss": 1.7434, + "step": 22349 + }, + { + "epoch": 6.860036832412523, + "grad_norm": 0.1976640820503235, + "learning_rate": 2.370723452144129e-05, + "loss": 1.6881, + "step": 22350 + }, + { + "epoch": 6.860343769183548, + "grad_norm": 0.24306917190551758, + "learning_rate": 2.3703006822199825e-05, + "loss": 1.7791, + "step": 22351 + }, + { + "epoch": 6.860650705954574, + "grad_norm": 0.20065687596797943, + "learning_rate": 2.3698779382840657e-05, + "loss": 1.7162, + "step": 22352 + }, + { + "epoch": 6.860957642725598, + "grad_norm": 0.21599936485290527, + "learning_rate": 2.3694552203405574e-05, + "loss": 1.7702, + "step": 22353 + }, + { + "epoch": 6.861264579496623, + "grad_norm": 0.16836890578269958, + "learning_rate": 2.3690325283936338e-05, + "loss": 1.6676, + "step": 22354 + }, + { + "epoch": 6.861571516267649, + "grad_norm": 0.1756831407546997, + "learning_rate": 2.368609862447473e-05, + "loss": 1.6934, + "step": 22355 + }, + { + "epoch": 6.861878453038674, + "grad_norm": 0.18676789104938507, + "learning_rate": 2.3681872225062517e-05, + "loss": 1.6879, + "step": 22356 + }, + { + "epoch": 6.862185389809699, + "grad_norm": 0.18018634617328644, + "learning_rate": 2.3677646085741473e-05, + "loss": 1.7143, + "step": 22357 + }, + { + "epoch": 6.862492326580725, + "grad_norm": 0.1789008378982544, + "learning_rate": 2.3673420206553332e-05, + "loss": 1.6914, + "step": 22358 + }, + { + "epoch": 6.862799263351749, + "grad_norm": 0.1869693398475647, + "learning_rate": 2.366919458753993e-05, + "loss": 1.7431, + "step": 22359 + }, + { + "epoch": 6.8631062001227745, + "grad_norm": 0.1958019733428955, + "learning_rate": 
2.3664969228742934e-05, + "loss": 1.7132, + "step": 22360 + }, + { + "epoch": 6.8634131368938, + "grad_norm": 0.199384868144989, + "learning_rate": 2.366074413020419e-05, + "loss": 1.7095, + "step": 22361 + }, + { + "epoch": 6.863720073664825, + "grad_norm": 0.2125246673822403, + "learning_rate": 2.365651929196539e-05, + "loss": 1.7125, + "step": 22362 + }, + { + "epoch": 6.8640270104358505, + "grad_norm": 0.1574707180261612, + "learning_rate": 2.3652294714068284e-05, + "loss": 1.6386, + "step": 22363 + }, + { + "epoch": 6.864333947206875, + "grad_norm": 0.30648529529571533, + "learning_rate": 2.364807039655469e-05, + "loss": 1.7665, + "step": 22364 + }, + { + "epoch": 6.8646408839779, + "grad_norm": 0.19746489822864532, + "learning_rate": 2.364384633946627e-05, + "loss": 1.6736, + "step": 22365 + }, + { + "epoch": 6.864947820748926, + "grad_norm": 0.25084391236305237, + "learning_rate": 2.3639622542844842e-05, + "loss": 1.7346, + "step": 22366 + }, + { + "epoch": 6.865254757519951, + "grad_norm": 0.1884133219718933, + "learning_rate": 2.3635399006732077e-05, + "loss": 1.6868, + "step": 22367 + }, + { + "epoch": 6.865561694290976, + "grad_norm": 0.21225856244564056, + "learning_rate": 2.3631175731169774e-05, + "loss": 1.7438, + "step": 22368 + }, + { + "epoch": 6.865868631062002, + "grad_norm": 0.1863771378993988, + "learning_rate": 2.3626952716199647e-05, + "loss": 1.7677, + "step": 22369 + }, + { + "epoch": 6.866175567833026, + "grad_norm": 0.1839088648557663, + "learning_rate": 2.362272996186343e-05, + "loss": 1.6902, + "step": 22370 + }, + { + "epoch": 6.866482504604051, + "grad_norm": 0.18304915726184845, + "learning_rate": 2.3618507468202856e-05, + "loss": 1.7142, + "step": 22371 + }, + { + "epoch": 6.866789441375077, + "grad_norm": 0.21228280663490295, + "learning_rate": 2.3614285235259655e-05, + "loss": 1.8277, + "step": 22372 + }, + { + "epoch": 6.867096378146102, + "grad_norm": 0.19515320658683777, + "learning_rate": 2.361006326307555e-05, + "loss": 
1.7029, + "step": 22373 + }, + { + "epoch": 6.867403314917127, + "grad_norm": 0.16277433931827545, + "learning_rate": 2.360584155169227e-05, + "loss": 1.672, + "step": 22374 + }, + { + "epoch": 6.867710251688152, + "grad_norm": 0.2180202454328537, + "learning_rate": 2.360162010115151e-05, + "loss": 1.7516, + "step": 22375 + }, + { + "epoch": 6.868017188459177, + "grad_norm": 0.17940378189086914, + "learning_rate": 2.3597398911495055e-05, + "loss": 1.6782, + "step": 22376 + }, + { + "epoch": 6.8683241252302025, + "grad_norm": 0.20751933753490448, + "learning_rate": 2.3593177982764543e-05, + "loss": 1.7954, + "step": 22377 + }, + { + "epoch": 6.868631062001228, + "grad_norm": 0.23098444938659668, + "learning_rate": 2.3588957315001758e-05, + "loss": 1.7472, + "step": 22378 + }, + { + "epoch": 6.868937998772253, + "grad_norm": 0.2351236343383789, + "learning_rate": 2.358473690824836e-05, + "loss": 1.7959, + "step": 22379 + }, + { + "epoch": 6.8692449355432785, + "grad_norm": 0.1890626847743988, + "learning_rate": 2.3580516762546055e-05, + "loss": 1.7015, + "step": 22380 + }, + { + "epoch": 6.869551872314303, + "grad_norm": 0.21120475232601166, + "learning_rate": 2.3576296877936604e-05, + "loss": 1.7998, + "step": 22381 + }, + { + "epoch": 6.869858809085328, + "grad_norm": 0.18141280114650726, + "learning_rate": 2.3572077254461638e-05, + "loss": 1.6973, + "step": 22382 + }, + { + "epoch": 6.870165745856354, + "grad_norm": 0.19084444642066956, + "learning_rate": 2.356785789216293e-05, + "loss": 1.6853, + "step": 22383 + }, + { + "epoch": 6.870472682627379, + "grad_norm": 0.18046700954437256, + "learning_rate": 2.356363879108211e-05, + "loss": 1.7476, + "step": 22384 + }, + { + "epoch": 6.870779619398404, + "grad_norm": 0.19875061511993408, + "learning_rate": 2.3559419951260926e-05, + "loss": 1.7223, + "step": 22385 + }, + { + "epoch": 6.871086556169429, + "grad_norm": 0.2377827763557434, + "learning_rate": 2.3555201372741047e-05, + "loss": 1.7976, + "step": 22386 + }, + 
{ + "epoch": 6.871393492940454, + "grad_norm": 0.17645993828773499, + "learning_rate": 2.3550983055564168e-05, + "loss": 1.6726, + "step": 22387 + }, + { + "epoch": 6.871700429711479, + "grad_norm": 0.19499735534191132, + "learning_rate": 2.3546764999771976e-05, + "loss": 1.67, + "step": 22388 + }, + { + "epoch": 6.872007366482505, + "grad_norm": 0.22010546922683716, + "learning_rate": 2.3542547205406163e-05, + "loss": 1.8461, + "step": 22389 + }, + { + "epoch": 6.87231430325353, + "grad_norm": 0.2101692259311676, + "learning_rate": 2.3538329672508396e-05, + "loss": 1.6922, + "step": 22390 + }, + { + "epoch": 6.872621240024555, + "grad_norm": 0.1926269382238388, + "learning_rate": 2.3534112401120372e-05, + "loss": 1.6934, + "step": 22391 + }, + { + "epoch": 6.87292817679558, + "grad_norm": 0.20662687718868256, + "learning_rate": 2.3529895391283742e-05, + "loss": 1.7284, + "step": 22392 + }, + { + "epoch": 6.873235113566605, + "grad_norm": 0.2392960786819458, + "learning_rate": 2.3525678643040235e-05, + "loss": 1.7207, + "step": 22393 + }, + { + "epoch": 6.8735420503376305, + "grad_norm": 0.2067870795726776, + "learning_rate": 2.3521462156431452e-05, + "loss": 1.7269, + "step": 22394 + }, + { + "epoch": 6.873848987108656, + "grad_norm": 0.2544265687465668, + "learning_rate": 2.351724593149914e-05, + "loss": 1.7358, + "step": 22395 + }, + { + "epoch": 6.87415592387968, + "grad_norm": 0.2243366837501526, + "learning_rate": 2.3513029968284907e-05, + "loss": 1.7625, + "step": 22396 + }, + { + "epoch": 6.874462860650706, + "grad_norm": 0.23003467917442322, + "learning_rate": 2.3508814266830414e-05, + "loss": 1.6943, + "step": 22397 + }, + { + "epoch": 6.874769797421731, + "grad_norm": 0.19257886707782745, + "learning_rate": 2.3504598827177383e-05, + "loss": 1.7393, + "step": 22398 + }, + { + "epoch": 6.875076734192756, + "grad_norm": 0.23782171308994293, + "learning_rate": 2.3500383649367404e-05, + "loss": 1.7758, + "step": 22399 + }, + { + "epoch": 6.875383670963782, + 
"grad_norm": 0.18137066066265106, + "learning_rate": 2.3496168733442197e-05, + "loss": 1.7083, + "step": 22400 + }, + { + "epoch": 6.875690607734807, + "grad_norm": 0.21970662474632263, + "learning_rate": 2.3491954079443344e-05, + "loss": 1.7552, + "step": 22401 + }, + { + "epoch": 6.8759975445058314, + "grad_norm": 0.2032134085893631, + "learning_rate": 2.3487739687412562e-05, + "loss": 1.7653, + "step": 22402 + }, + { + "epoch": 6.876304481276857, + "grad_norm": 0.22016118466854095, + "learning_rate": 2.348352555739148e-05, + "loss": 1.7277, + "step": 22403 + }, + { + "epoch": 6.876611418047882, + "grad_norm": 0.2250203788280487, + "learning_rate": 2.3479311689421736e-05, + "loss": 1.7451, + "step": 22404 + }, + { + "epoch": 6.8769183548189075, + "grad_norm": 0.19726359844207764, + "learning_rate": 2.3475098083544977e-05, + "loss": 1.728, + "step": 22405 + }, + { + "epoch": 6.877225291589933, + "grad_norm": 0.21295994520187378, + "learning_rate": 2.3470884739802844e-05, + "loss": 1.7438, + "step": 22406 + }, + { + "epoch": 6.877532228360957, + "grad_norm": 0.19653508067131042, + "learning_rate": 2.346667165823698e-05, + "loss": 1.7189, + "step": 22407 + }, + { + "epoch": 6.877839165131983, + "grad_norm": 0.21406517922878265, + "learning_rate": 2.3462458838889016e-05, + "loss": 1.7475, + "step": 22408 + }, + { + "epoch": 6.878146101903008, + "grad_norm": 0.20569753646850586, + "learning_rate": 2.3458246281800595e-05, + "loss": 1.7262, + "step": 22409 + }, + { + "epoch": 6.878453038674033, + "grad_norm": 0.19365517795085907, + "learning_rate": 2.3454033987013334e-05, + "loss": 1.6938, + "step": 22410 + }, + { + "epoch": 6.878759975445059, + "grad_norm": 0.20935405790805817, + "learning_rate": 2.344982195456885e-05, + "loss": 1.724, + "step": 22411 + }, + { + "epoch": 6.879066912216084, + "grad_norm": 0.2104228436946869, + "learning_rate": 2.3445610184508826e-05, + "loss": 1.7474, + "step": 22412 + }, + { + "epoch": 6.879373848987108, + "grad_norm": 
0.19795742630958557, + "learning_rate": 2.3441398676874826e-05, + "loss": 1.7572, + "step": 22413 + }, + { + "epoch": 6.879680785758134, + "grad_norm": 0.20640577375888824, + "learning_rate": 2.3437187431708472e-05, + "loss": 1.7258, + "step": 22414 + }, + { + "epoch": 6.879987722529159, + "grad_norm": 0.2092565894126892, + "learning_rate": 2.3432976449051442e-05, + "loss": 1.7437, + "step": 22415 + }, + { + "epoch": 6.880294659300184, + "grad_norm": 0.2083825170993805, + "learning_rate": 2.3428765728945275e-05, + "loss": 1.7127, + "step": 22416 + }, + { + "epoch": 6.88060159607121, + "grad_norm": 0.20619866251945496, + "learning_rate": 2.3424555271431647e-05, + "loss": 1.7729, + "step": 22417 + }, + { + "epoch": 6.880908532842234, + "grad_norm": 0.22689959406852722, + "learning_rate": 2.3420345076552107e-05, + "loss": 1.7142, + "step": 22418 + }, + { + "epoch": 6.8812154696132595, + "grad_norm": 0.16664449870586395, + "learning_rate": 2.3416135144348316e-05, + "loss": 1.6857, + "step": 22419 + }, + { + "epoch": 6.881522406384285, + "grad_norm": 0.1895827353000641, + "learning_rate": 2.3411925474861856e-05, + "loss": 1.7075, + "step": 22420 + }, + { + "epoch": 6.88182934315531, + "grad_norm": 0.2058400958776474, + "learning_rate": 2.3407716068134334e-05, + "loss": 1.7623, + "step": 22421 + }, + { + "epoch": 6.8821362799263355, + "grad_norm": 0.18390826880931854, + "learning_rate": 2.3403506924207346e-05, + "loss": 1.6686, + "step": 22422 + }, + { + "epoch": 6.882443216697361, + "grad_norm": 0.1742098331451416, + "learning_rate": 2.3399298043122497e-05, + "loss": 1.6846, + "step": 22423 + }, + { + "epoch": 6.882750153468385, + "grad_norm": 0.18958622217178345, + "learning_rate": 2.3395089424921368e-05, + "loss": 1.7603, + "step": 22424 + }, + { + "epoch": 6.883057090239411, + "grad_norm": 0.21827174723148346, + "learning_rate": 2.3390881069645564e-05, + "loss": 1.6706, + "step": 22425 + }, + { + "epoch": 6.883364027010436, + "grad_norm": 0.17859303951263428, + 
"learning_rate": 2.338667297733667e-05, + "loss": 1.7612, + "step": 22426 + }, + { + "epoch": 6.883670963781461, + "grad_norm": 0.22383756935596466, + "learning_rate": 2.338246514803627e-05, + "loss": 1.7507, + "step": 22427 + }, + { + "epoch": 6.883977900552486, + "grad_norm": 0.20317313075065613, + "learning_rate": 2.3378257581785934e-05, + "loss": 1.6912, + "step": 22428 + }, + { + "epoch": 6.884284837323511, + "grad_norm": 0.20238614082336426, + "learning_rate": 2.3374050278627297e-05, + "loss": 1.7336, + "step": 22429 + }, + { + "epoch": 6.884591774094536, + "grad_norm": 0.2134159654378891, + "learning_rate": 2.336984323860188e-05, + "loss": 1.7252, + "step": 22430 + }, + { + "epoch": 6.884898710865562, + "grad_norm": 0.17153076827526093, + "learning_rate": 2.3365636461751277e-05, + "loss": 1.6769, + "step": 22431 + }, + { + "epoch": 6.885205647636587, + "grad_norm": 0.19001254439353943, + "learning_rate": 2.3361429948117075e-05, + "loss": 1.7812, + "step": 22432 + }, + { + "epoch": 6.885512584407612, + "grad_norm": 0.2074522078037262, + "learning_rate": 2.335722369774081e-05, + "loss": 1.7433, + "step": 22433 + }, + { + "epoch": 6.885819521178637, + "grad_norm": 0.22863705456256866, + "learning_rate": 2.3353017710664117e-05, + "loss": 1.7476, + "step": 22434 + }, + { + "epoch": 6.886126457949662, + "grad_norm": 0.19350804388523102, + "learning_rate": 2.334881198692848e-05, + "loss": 1.7071, + "step": 22435 + }, + { + "epoch": 6.8864333947206875, + "grad_norm": 0.22915633022785187, + "learning_rate": 2.3344606526575524e-05, + "loss": 1.7283, + "step": 22436 + }, + { + "epoch": 6.886740331491713, + "grad_norm": 0.21576058864593506, + "learning_rate": 2.3340401329646795e-05, + "loss": 1.7062, + "step": 22437 + }, + { + "epoch": 6.887047268262738, + "grad_norm": 0.17844067513942719, + "learning_rate": 2.333619639618384e-05, + "loss": 1.6994, + "step": 22438 + }, + { + "epoch": 6.887354205033763, + "grad_norm": 0.21019738912582397, + "learning_rate": 
2.333199172622822e-05, + "loss": 1.6654, + "step": 22439 + }, + { + "epoch": 6.887661141804788, + "grad_norm": 0.1901654452085495, + "learning_rate": 2.3327787319821486e-05, + "loss": 1.7847, + "step": 22440 + }, + { + "epoch": 6.887968078575813, + "grad_norm": 0.21838930249214172, + "learning_rate": 2.3323583177005198e-05, + "loss": 1.6517, + "step": 22441 + }, + { + "epoch": 6.888275015346839, + "grad_norm": 0.16078172624111176, + "learning_rate": 2.3319379297820892e-05, + "loss": 1.7052, + "step": 22442 + }, + { + "epoch": 6.888581952117864, + "grad_norm": 0.19161897897720337, + "learning_rate": 2.331517568231012e-05, + "loss": 1.675, + "step": 22443 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 0.1874416172504425, + "learning_rate": 2.331097233051442e-05, + "loss": 1.7025, + "step": 22444 + }, + { + "epoch": 6.889195825659914, + "grad_norm": 0.1817546933889389, + "learning_rate": 2.3306769242475318e-05, + "loss": 1.7103, + "step": 22445 + }, + { + "epoch": 6.889502762430939, + "grad_norm": 0.18423372507095337, + "learning_rate": 2.3302566418234406e-05, + "loss": 1.6883, + "step": 22446 + }, + { + "epoch": 6.889809699201964, + "grad_norm": 0.1712140440940857, + "learning_rate": 2.3298363857833162e-05, + "loss": 1.7076, + "step": 22447 + }, + { + "epoch": 6.89011663597299, + "grad_norm": 0.15992864966392517, + "learning_rate": 2.3294161561313133e-05, + "loss": 1.6514, + "step": 22448 + }, + { + "epoch": 6.890423572744015, + "grad_norm": 0.24126072227954865, + "learning_rate": 2.3289959528715855e-05, + "loss": 1.7385, + "step": 22449 + }, + { + "epoch": 6.8907305095150395, + "grad_norm": 0.18130798637866974, + "learning_rate": 2.3285757760082832e-05, + "loss": 1.691, + "step": 22450 + }, + { + "epoch": 6.891037446286065, + "grad_norm": 0.20070049166679382, + "learning_rate": 2.3281556255455644e-05, + "loss": 1.7166, + "step": 22451 + }, + { + "epoch": 6.89134438305709, + "grad_norm": 0.20706996321678162, + "learning_rate": 2.327735501487574e-05, + "loss": 
1.6763, + "step": 22452 + }, + { + "epoch": 6.8916513198281155, + "grad_norm": 0.22404810786247253, + "learning_rate": 2.327315403838472e-05, + "loss": 1.761, + "step": 22453 + }, + { + "epoch": 6.891958256599141, + "grad_norm": 0.21240194141864777, + "learning_rate": 2.3268953326024013e-05, + "loss": 1.7038, + "step": 22454 + }, + { + "epoch": 6.892265193370166, + "grad_norm": 0.24251966178417206, + "learning_rate": 2.32647528778352e-05, + "loss": 1.7829, + "step": 22455 + }, + { + "epoch": 6.892572130141191, + "grad_norm": 0.21213467419147491, + "learning_rate": 2.3260552693859765e-05, + "loss": 1.7433, + "step": 22456 + }, + { + "epoch": 6.892879066912216, + "grad_norm": 0.18008530139923096, + "learning_rate": 2.325635277413922e-05, + "loss": 1.7238, + "step": 22457 + }, + { + "epoch": 6.893186003683241, + "grad_norm": 0.18252789974212646, + "learning_rate": 2.325215311871508e-05, + "loss": 1.7143, + "step": 22458 + }, + { + "epoch": 6.893492940454267, + "grad_norm": 0.17830567061901093, + "learning_rate": 2.3247953727628833e-05, + "loss": 1.687, + "step": 22459 + }, + { + "epoch": 6.893799877225292, + "grad_norm": 0.19980686902999878, + "learning_rate": 2.3243754600921992e-05, + "loss": 1.7096, + "step": 22460 + }, + { + "epoch": 6.894106813996316, + "grad_norm": 0.1713438183069229, + "learning_rate": 2.3239555738636044e-05, + "loss": 1.6791, + "step": 22461 + }, + { + "epoch": 6.894413750767342, + "grad_norm": 0.17678281664848328, + "learning_rate": 2.3235357140812475e-05, + "loss": 1.6689, + "step": 22462 + }, + { + "epoch": 6.894720687538367, + "grad_norm": 0.20409992337226868, + "learning_rate": 2.3231158807492837e-05, + "loss": 1.7746, + "step": 22463 + }, + { + "epoch": 6.895027624309392, + "grad_norm": 0.19227825105190277, + "learning_rate": 2.3226960738718552e-05, + "loss": 1.7101, + "step": 22464 + }, + { + "epoch": 6.895334561080418, + "grad_norm": 0.24029433727264404, + "learning_rate": 2.3222762934531132e-05, + "loss": 1.7842, + "step": 22465 + }, + 
{ + "epoch": 6.895641497851443, + "grad_norm": 0.21887856721878052, + "learning_rate": 2.321856539497207e-05, + "loss": 1.7032, + "step": 22466 + }, + { + "epoch": 6.8959484346224675, + "grad_norm": 0.17346082627773285, + "learning_rate": 2.321436812008282e-05, + "loss": 1.683, + "step": 22467 + }, + { + "epoch": 6.896255371393493, + "grad_norm": 0.18920177221298218, + "learning_rate": 2.3210171109904914e-05, + "loss": 1.7057, + "step": 22468 + }, + { + "epoch": 6.896562308164518, + "grad_norm": 0.21199388802051544, + "learning_rate": 2.320597436447977e-05, + "loss": 1.7534, + "step": 22469 + }, + { + "epoch": 6.8968692449355435, + "grad_norm": 0.1867530792951584, + "learning_rate": 2.320177788384893e-05, + "loss": 1.7185, + "step": 22470 + }, + { + "epoch": 6.897176181706568, + "grad_norm": 0.21009495854377747, + "learning_rate": 2.3197581668053785e-05, + "loss": 1.7379, + "step": 22471 + }, + { + "epoch": 6.897483118477593, + "grad_norm": 0.20078743994235992, + "learning_rate": 2.3193385717135874e-05, + "loss": 1.7226, + "step": 22472 + }, + { + "epoch": 6.897790055248619, + "grad_norm": 0.2135045975446701, + "learning_rate": 2.318919003113663e-05, + "loss": 1.7531, + "step": 22473 + }, + { + "epoch": 6.898096992019644, + "grad_norm": 0.18811136484146118, + "learning_rate": 2.3184994610097526e-05, + "loss": 1.6542, + "step": 22474 + }, + { + "epoch": 6.898403928790669, + "grad_norm": 0.2323937565088272, + "learning_rate": 2.3180799454060025e-05, + "loss": 1.7369, + "step": 22475 + }, + { + "epoch": 6.898710865561695, + "grad_norm": 0.19270992279052734, + "learning_rate": 2.317660456306558e-05, + "loss": 1.6818, + "step": 22476 + }, + { + "epoch": 6.899017802332719, + "grad_norm": 0.18951043486595154, + "learning_rate": 2.3172409937155654e-05, + "loss": 1.7183, + "step": 22477 + }, + { + "epoch": 6.899324739103744, + "grad_norm": 0.1758934110403061, + "learning_rate": 2.3168215576371694e-05, + "loss": 1.6826, + "step": 22478 + }, + { + "epoch": 6.89963167587477, + 
"grad_norm": 0.2048143893480301, + "learning_rate": 2.3164021480755133e-05, + "loss": 1.7769, + "step": 22479 + }, + { + "epoch": 6.899938612645795, + "grad_norm": 0.20538486540317535, + "learning_rate": 2.315982765034748e-05, + "loss": 1.7035, + "step": 22480 + }, + { + "epoch": 6.9002455494168204, + "grad_norm": 0.18417708575725555, + "learning_rate": 2.3155634085190124e-05, + "loss": 1.7533, + "step": 22481 + }, + { + "epoch": 6.900552486187845, + "grad_norm": 0.1978628784418106, + "learning_rate": 2.315144078532453e-05, + "loss": 1.691, + "step": 22482 + }, + { + "epoch": 6.90085942295887, + "grad_norm": 0.17665794491767883, + "learning_rate": 2.3147247750792128e-05, + "loss": 1.7018, + "step": 22483 + }, + { + "epoch": 6.901166359729896, + "grad_norm": 0.20218273997306824, + "learning_rate": 2.314305498163435e-05, + "loss": 1.7277, + "step": 22484 + }, + { + "epoch": 6.901473296500921, + "grad_norm": 0.18791642785072327, + "learning_rate": 2.3138862477892674e-05, + "loss": 1.7247, + "step": 22485 + }, + { + "epoch": 6.901780233271946, + "grad_norm": 0.1945842206478119, + "learning_rate": 2.313467023960847e-05, + "loss": 1.6648, + "step": 22486 + }, + { + "epoch": 6.902087170042972, + "grad_norm": 0.1871321201324463, + "learning_rate": 2.3130478266823237e-05, + "loss": 1.6978, + "step": 22487 + }, + { + "epoch": 6.902394106813996, + "grad_norm": 0.20094287395477295, + "learning_rate": 2.312628655957833e-05, + "loss": 1.7763, + "step": 22488 + }, + { + "epoch": 6.902701043585021, + "grad_norm": 0.1804366111755371, + "learning_rate": 2.3122095117915226e-05, + "loss": 1.689, + "step": 22489 + }, + { + "epoch": 6.903007980356047, + "grad_norm": 0.1846652776002884, + "learning_rate": 2.311790394187534e-05, + "loss": 1.7088, + "step": 22490 + }, + { + "epoch": 6.903314917127072, + "grad_norm": 0.18339675664901733, + "learning_rate": 2.311371303150008e-05, + "loss": 1.6974, + "step": 22491 + }, + { + "epoch": 6.903621853898097, + "grad_norm": 0.21333162486553192, + 
"learning_rate": 2.3109522386830863e-05, + "loss": 1.7614, + "step": 22492 + }, + { + "epoch": 6.903928790669122, + "grad_norm": 0.19845318794250488, + "learning_rate": 2.3105332007909104e-05, + "loss": 1.6895, + "step": 22493 + }, + { + "epoch": 6.904235727440147, + "grad_norm": 0.21082347631454468, + "learning_rate": 2.3101141894776224e-05, + "loss": 1.7397, + "step": 22494 + }, + { + "epoch": 6.9045426642111725, + "grad_norm": 0.16360893845558167, + "learning_rate": 2.3096952047473623e-05, + "loss": 1.6716, + "step": 22495 + }, + { + "epoch": 6.904849600982198, + "grad_norm": 0.2287478744983673, + "learning_rate": 2.3092762466042687e-05, + "loss": 1.7673, + "step": 22496 + }, + { + "epoch": 6.905156537753223, + "grad_norm": 0.17231078445911407, + "learning_rate": 2.308857315052489e-05, + "loss": 1.6744, + "step": 22497 + }, + { + "epoch": 6.9054634745242485, + "grad_norm": 0.2887173295021057, + "learning_rate": 2.3084384100961565e-05, + "loss": 1.7358, + "step": 22498 + }, + { + "epoch": 6.905770411295273, + "grad_norm": 0.1977192759513855, + "learning_rate": 2.3080195317394127e-05, + "loss": 1.7514, + "step": 22499 + }, + { + "epoch": 6.906077348066298, + "grad_norm": 0.24933035671710968, + "learning_rate": 2.307600679986398e-05, + "loss": 1.6845, + "step": 22500 + }, + { + "epoch": 6.906384284837324, + "grad_norm": 0.17288708686828613, + "learning_rate": 2.30718185484125e-05, + "loss": 1.7211, + "step": 22501 + }, + { + "epoch": 6.906691221608349, + "grad_norm": 0.22192007303237915, + "learning_rate": 2.306763056308112e-05, + "loss": 1.6924, + "step": 22502 + }, + { + "epoch": 6.906998158379373, + "grad_norm": 0.20500123500823975, + "learning_rate": 2.3063442843911172e-05, + "loss": 1.7412, + "step": 22503 + }, + { + "epoch": 6.907305095150399, + "grad_norm": 0.30658698081970215, + "learning_rate": 2.30592553909441e-05, + "loss": 1.7965, + "step": 22504 + }, + { + "epoch": 6.907612031921424, + "grad_norm": 0.177829772233963, + "learning_rate": 
2.3055068204221224e-05, + "loss": 1.6914, + "step": 22505 + }, + { + "epoch": 6.907918968692449, + "grad_norm": 0.20281876623630524, + "learning_rate": 2.3050881283783977e-05, + "loss": 1.6946, + "step": 22506 + }, + { + "epoch": 6.908225905463475, + "grad_norm": 0.16111700236797333, + "learning_rate": 2.3046694629673716e-05, + "loss": 1.7004, + "step": 22507 + }, + { + "epoch": 6.9085328422345, + "grad_norm": 0.1911575049161911, + "learning_rate": 2.3042508241931814e-05, + "loss": 1.7013, + "step": 22508 + }, + { + "epoch": 6.9088397790055245, + "grad_norm": 0.17862342298030853, + "learning_rate": 2.303832212059965e-05, + "loss": 1.7053, + "step": 22509 + }, + { + "epoch": 6.90914671577655, + "grad_norm": 0.2268948256969452, + "learning_rate": 2.303413626571858e-05, + "loss": 1.7241, + "step": 22510 + }, + { + "epoch": 6.909453652547575, + "grad_norm": 0.1997457593679428, + "learning_rate": 2.3029950677329992e-05, + "loss": 1.6927, + "step": 22511 + }, + { + "epoch": 6.9097605893186005, + "grad_norm": 0.22120819985866547, + "learning_rate": 2.3025765355475232e-05, + "loss": 1.7447, + "step": 22512 + }, + { + "epoch": 6.910067526089626, + "grad_norm": 0.22097964584827423, + "learning_rate": 2.302158030019565e-05, + "loss": 1.7399, + "step": 22513 + }, + { + "epoch": 6.91037446286065, + "grad_norm": 0.2171044498682022, + "learning_rate": 2.3017395511532664e-05, + "loss": 1.7252, + "step": 22514 + }, + { + "epoch": 6.910681399631676, + "grad_norm": 0.1987348347902298, + "learning_rate": 2.301321098952757e-05, + "loss": 1.7071, + "step": 22515 + }, + { + "epoch": 6.910988336402701, + "grad_norm": 0.2131081372499466, + "learning_rate": 2.3009026734221746e-05, + "loss": 1.7314, + "step": 22516 + }, + { + "epoch": 6.911295273173726, + "grad_norm": 0.18867900967597961, + "learning_rate": 2.3004842745656536e-05, + "loss": 1.7431, + "step": 22517 + }, + { + "epoch": 6.911602209944752, + "grad_norm": 0.22853058576583862, + "learning_rate": 2.3000659023873277e-05, + "loss": 
1.7234, + "step": 22518 + }, + { + "epoch": 6.911909146715777, + "grad_norm": 0.23441165685653687, + "learning_rate": 2.2996475568913366e-05, + "loss": 1.7535, + "step": 22519 + }, + { + "epoch": 6.912216083486801, + "grad_norm": 0.2376382052898407, + "learning_rate": 2.299229238081807e-05, + "loss": 1.7582, + "step": 22520 + }, + { + "epoch": 6.912523020257827, + "grad_norm": 0.2571510076522827, + "learning_rate": 2.2988109459628814e-05, + "loss": 1.722, + "step": 22521 + }, + { + "epoch": 6.912829957028852, + "grad_norm": 0.19782103598117828, + "learning_rate": 2.298392680538685e-05, + "loss": 1.7052, + "step": 22522 + }, + { + "epoch": 6.913136893799877, + "grad_norm": 0.24070625007152557, + "learning_rate": 2.297974441813358e-05, + "loss": 1.7306, + "step": 22523 + }, + { + "epoch": 6.913443830570903, + "grad_norm": 0.1783500611782074, + "learning_rate": 2.2975562297910307e-05, + "loss": 1.7077, + "step": 22524 + }, + { + "epoch": 6.913750767341927, + "grad_norm": 0.19469089806079865, + "learning_rate": 2.2971380444758373e-05, + "loss": 1.7275, + "step": 22525 + }, + { + "epoch": 6.9140577041129525, + "grad_norm": 0.21449480950832367, + "learning_rate": 2.2967198858719092e-05, + "loss": 1.7682, + "step": 22526 + }, + { + "epoch": 6.914364640883978, + "grad_norm": 0.21686261892318726, + "learning_rate": 2.2963017539833803e-05, + "loss": 1.6794, + "step": 22527 + }, + { + "epoch": 6.914671577655003, + "grad_norm": 0.2061273604631424, + "learning_rate": 2.2958836488143813e-05, + "loss": 1.7612, + "step": 22528 + }, + { + "epoch": 6.9149785144260285, + "grad_norm": 0.2708517611026764, + "learning_rate": 2.295465570369046e-05, + "loss": 1.7291, + "step": 22529 + }, + { + "epoch": 6.915285451197054, + "grad_norm": 0.17011860013008118, + "learning_rate": 2.295047518651503e-05, + "loss": 1.6541, + "step": 22530 + }, + { + "epoch": 6.915592387968078, + "grad_norm": 0.255305677652359, + "learning_rate": 2.294629493665889e-05, + "loss": 1.7063, + "step": 22531 + }, + { + 
"epoch": 6.915899324739104, + "grad_norm": 0.20172207057476044, + "learning_rate": 2.2942114954163306e-05, + "loss": 1.6678, + "step": 22532 + }, + { + "epoch": 6.916206261510129, + "grad_norm": 0.23726679384708405, + "learning_rate": 2.2937935239069603e-05, + "loss": 1.6762, + "step": 22533 + }, + { + "epoch": 6.916513198281154, + "grad_norm": 0.17716684937477112, + "learning_rate": 2.2933755791419082e-05, + "loss": 1.7302, + "step": 22534 + }, + { + "epoch": 6.91682013505218, + "grad_norm": 0.2513270974159241, + "learning_rate": 2.2929576611253035e-05, + "loss": 1.7371, + "step": 22535 + }, + { + "epoch": 6.917127071823204, + "grad_norm": 0.21994394063949585, + "learning_rate": 2.292539769861281e-05, + "loss": 1.7007, + "step": 22536 + }, + { + "epoch": 6.917434008594229, + "grad_norm": 0.2095540314912796, + "learning_rate": 2.292121905353964e-05, + "loss": 1.71, + "step": 22537 + }, + { + "epoch": 6.917740945365255, + "grad_norm": 0.24400855600833893, + "learning_rate": 2.2917040676074892e-05, + "loss": 1.7859, + "step": 22538 + }, + { + "epoch": 6.91804788213628, + "grad_norm": 0.23217935860157013, + "learning_rate": 2.2912862566259785e-05, + "loss": 1.8218, + "step": 22539 + }, + { + "epoch": 6.918354818907305, + "grad_norm": 0.23555497825145721, + "learning_rate": 2.2908684724135666e-05, + "loss": 1.7145, + "step": 22540 + }, + { + "epoch": 6.918661755678331, + "grad_norm": 0.17844347655773163, + "learning_rate": 2.2904507149743804e-05, + "loss": 1.6767, + "step": 22541 + }, + { + "epoch": 6.918968692449355, + "grad_norm": 0.20810428261756897, + "learning_rate": 2.290032984312548e-05, + "loss": 1.7359, + "step": 22542 + }, + { + "epoch": 6.9192756292203805, + "grad_norm": 0.20082542300224304, + "learning_rate": 2.289615280432198e-05, + "loss": 1.7623, + "step": 22543 + }, + { + "epoch": 6.919582565991406, + "grad_norm": 0.2005007117986679, + "learning_rate": 2.2891976033374584e-05, + "loss": 1.745, + "step": 22544 + }, + { + "epoch": 6.919889502762431, + 
"grad_norm": 0.18054969608783722, + "learning_rate": 2.2887799530324572e-05, + "loss": 1.6959, + "step": 22545 + }, + { + "epoch": 6.920196439533456, + "grad_norm": 0.18410442769527435, + "learning_rate": 2.2883623295213214e-05, + "loss": 1.7052, + "step": 22546 + }, + { + "epoch": 6.920503376304481, + "grad_norm": 0.17380426824092865, + "learning_rate": 2.2879447328081765e-05, + "loss": 1.6735, + "step": 22547 + }, + { + "epoch": 6.920810313075506, + "grad_norm": 0.19082246720790863, + "learning_rate": 2.2875271628971557e-05, + "loss": 1.7192, + "step": 22548 + }, + { + "epoch": 6.921117249846532, + "grad_norm": 0.17682792246341705, + "learning_rate": 2.2871096197923784e-05, + "loss": 1.649, + "step": 22549 + }, + { + "epoch": 6.921424186617557, + "grad_norm": 0.19127340614795685, + "learning_rate": 2.286692103497975e-05, + "loss": 1.7366, + "step": 22550 + }, + { + "epoch": 6.921731123388582, + "grad_norm": 0.1636040210723877, + "learning_rate": 2.2862746140180696e-05, + "loss": 1.6749, + "step": 22551 + }, + { + "epoch": 6.922038060159607, + "grad_norm": 0.2121013104915619, + "learning_rate": 2.285857151356788e-05, + "loss": 1.7342, + "step": 22552 + }, + { + "epoch": 6.922344996930632, + "grad_norm": 0.19183295965194702, + "learning_rate": 2.28543971551826e-05, + "loss": 1.7506, + "step": 22553 + }, + { + "epoch": 6.922651933701657, + "grad_norm": 0.23838891088962555, + "learning_rate": 2.285022306506604e-05, + "loss": 1.6875, + "step": 22554 + }, + { + "epoch": 6.922958870472683, + "grad_norm": 0.17147624492645264, + "learning_rate": 2.2846049243259526e-05, + "loss": 1.7074, + "step": 22555 + }, + { + "epoch": 6.923265807243708, + "grad_norm": 0.2254270762205124, + "learning_rate": 2.2841875689804236e-05, + "loss": 1.7589, + "step": 22556 + }, + { + "epoch": 6.9235727440147325, + "grad_norm": 0.249015673995018, + "learning_rate": 2.2837702404741462e-05, + "loss": 1.7708, + "step": 22557 + }, + { + "epoch": 6.923879680785758, + "grad_norm": 0.19401927292346954, 
+ "learning_rate": 2.283352938811244e-05, + "loss": 1.696, + "step": 22558 + }, + { + "epoch": 6.924186617556783, + "grad_norm": 0.21134993433952332, + "learning_rate": 2.2829356639958398e-05, + "loss": 1.7136, + "step": 22559 + }, + { + "epoch": 6.9244935543278086, + "grad_norm": 0.17600105702877045, + "learning_rate": 2.2825184160320578e-05, + "loss": 1.679, + "step": 22560 + }, + { + "epoch": 6.924800491098834, + "grad_norm": 0.2426912486553192, + "learning_rate": 2.282101194924022e-05, + "loss": 1.7011, + "step": 22561 + }, + { + "epoch": 6.925107427869859, + "grad_norm": 0.20040342211723328, + "learning_rate": 2.281684000675855e-05, + "loss": 1.6844, + "step": 22562 + }, + { + "epoch": 6.925414364640884, + "grad_norm": 0.23790770769119263, + "learning_rate": 2.2812668332916798e-05, + "loss": 1.7318, + "step": 22563 + }, + { + "epoch": 6.925721301411909, + "grad_norm": 0.21387948095798492, + "learning_rate": 2.2808496927756196e-05, + "loss": 1.6903, + "step": 22564 + }, + { + "epoch": 6.926028238182934, + "grad_norm": 0.20471405982971191, + "learning_rate": 2.280432579131796e-05, + "loss": 1.7231, + "step": 22565 + }, + { + "epoch": 6.92633517495396, + "grad_norm": 0.1953156590461731, + "learning_rate": 2.280015492364332e-05, + "loss": 1.7322, + "step": 22566 + }, + { + "epoch": 6.926642111724985, + "grad_norm": 0.3107415437698364, + "learning_rate": 2.279598432477349e-05, + "loss": 1.7833, + "step": 22567 + }, + { + "epoch": 6.9269490484960095, + "grad_norm": 0.2114095836877823, + "learning_rate": 2.279181399474969e-05, + "loss": 1.6923, + "step": 22568 + }, + { + "epoch": 6.927255985267035, + "grad_norm": 0.21373972296714783, + "learning_rate": 2.2787643933613107e-05, + "loss": 1.6897, + "step": 22569 + }, + { + "epoch": 6.92756292203806, + "grad_norm": 0.17955096065998077, + "learning_rate": 2.278347414140502e-05, + "loss": 1.7443, + "step": 22570 + }, + { + "epoch": 6.9278698588090855, + "grad_norm": 0.19275230169296265, + "learning_rate": 
2.2779304618166554e-05, + "loss": 1.7109, + "step": 22571 + }, + { + "epoch": 6.928176795580111, + "grad_norm": 0.16774436831474304, + "learning_rate": 2.277513536393899e-05, + "loss": 1.7059, + "step": 22572 + }, + { + "epoch": 6.928483732351136, + "grad_norm": 0.25093573331832886, + "learning_rate": 2.2770966378763457e-05, + "loss": 1.7501, + "step": 22573 + }, + { + "epoch": 6.928790669122161, + "grad_norm": 0.24859540164470673, + "learning_rate": 2.2766797662681216e-05, + "loss": 1.7315, + "step": 22574 + }, + { + "epoch": 6.929097605893186, + "grad_norm": 0.1736115962266922, + "learning_rate": 2.2762629215733438e-05, + "loss": 1.7422, + "step": 22575 + }, + { + "epoch": 6.929404542664211, + "grad_norm": 0.23705001175403595, + "learning_rate": 2.2758461037961326e-05, + "loss": 1.7818, + "step": 22576 + }, + { + "epoch": 6.929711479435237, + "grad_norm": 0.21123656630516052, + "learning_rate": 2.2754293129406073e-05, + "loss": 1.7652, + "step": 22577 + }, + { + "epoch": 6.930018416206261, + "grad_norm": 0.2195751667022705, + "learning_rate": 2.2750125490108858e-05, + "loss": 1.7103, + "step": 22578 + }, + { + "epoch": 6.930325352977286, + "grad_norm": 0.17324887216091156, + "learning_rate": 2.274595812011088e-05, + "loss": 1.7386, + "step": 22579 + }, + { + "epoch": 6.930632289748312, + "grad_norm": 0.3175726532936096, + "learning_rate": 2.2741791019453313e-05, + "loss": 1.7608, + "step": 22580 + }, + { + "epoch": 6.930939226519337, + "grad_norm": 0.26266980171203613, + "learning_rate": 2.273762418817734e-05, + "loss": 1.691, + "step": 22581 + }, + { + "epoch": 6.931246163290362, + "grad_norm": 0.21905983984470367, + "learning_rate": 2.273345762632415e-05, + "loss": 1.6886, + "step": 22582 + }, + { + "epoch": 6.931553100061388, + "grad_norm": 0.2201247364282608, + "learning_rate": 2.2729291333934914e-05, + "loss": 1.7313, + "step": 22583 + }, + { + "epoch": 6.931860036832412, + "grad_norm": 0.2844204306602478, + "learning_rate": 2.2725125311050805e-05, + "loss": 
1.6918, + "step": 22584 + }, + { + "epoch": 6.9321669736034375, + "grad_norm": 0.22451715171337128, + "learning_rate": 2.272095955771299e-05, + "loss": 1.699, + "step": 22585 + }, + { + "epoch": 6.932473910374463, + "grad_norm": 0.27357545495033264, + "learning_rate": 2.2716794073962645e-05, + "loss": 1.7709, + "step": 22586 + }, + { + "epoch": 6.932780847145488, + "grad_norm": 0.2605188190937042, + "learning_rate": 2.271262885984093e-05, + "loss": 1.7812, + "step": 22587 + }, + { + "epoch": 6.9330877839165135, + "grad_norm": 0.1866278201341629, + "learning_rate": 2.270846391538899e-05, + "loss": 1.7204, + "step": 22588 + }, + { + "epoch": 6.933394720687538, + "grad_norm": 0.24624690413475037, + "learning_rate": 2.2704299240648043e-05, + "loss": 1.7345, + "step": 22589 + }, + { + "epoch": 6.933701657458563, + "grad_norm": 0.18003861606121063, + "learning_rate": 2.2700134835659175e-05, + "loss": 1.73, + "step": 22590 + }, + { + "epoch": 6.934008594229589, + "grad_norm": 0.2330949604511261, + "learning_rate": 2.269597070046359e-05, + "loss": 1.7614, + "step": 22591 + }, + { + "epoch": 6.934315531000614, + "grad_norm": 0.18806515634059906, + "learning_rate": 2.269180683510243e-05, + "loss": 1.7364, + "step": 22592 + }, + { + "epoch": 6.934622467771639, + "grad_norm": 0.23998546600341797, + "learning_rate": 2.268764323961684e-05, + "loss": 1.6858, + "step": 22593 + }, + { + "epoch": 6.934929404542665, + "grad_norm": 0.1707296371459961, + "learning_rate": 2.268347991404797e-05, + "loss": 1.6703, + "step": 22594 + }, + { + "epoch": 6.935236341313689, + "grad_norm": 0.19724871218204498, + "learning_rate": 2.267931685843696e-05, + "loss": 1.7338, + "step": 22595 + }, + { + "epoch": 6.935543278084714, + "grad_norm": 0.20384611189365387, + "learning_rate": 2.2675154072824955e-05, + "loss": 1.7224, + "step": 22596 + }, + { + "epoch": 6.93585021485574, + "grad_norm": 0.18632391095161438, + "learning_rate": 2.2670991557253092e-05, + "loss": 1.7006, + "step": 22597 + }, + { + 
"epoch": 6.936157151626765, + "grad_norm": 0.22928105294704437, + "learning_rate": 2.2666829311762505e-05, + "loss": 1.7462, + "step": 22598 + }, + { + "epoch": 6.93646408839779, + "grad_norm": 0.1905689388513565, + "learning_rate": 2.266266733639434e-05, + "loss": 1.7071, + "step": 22599 + }, + { + "epoch": 6.936771025168815, + "grad_norm": 0.2051437795162201, + "learning_rate": 2.2658505631189708e-05, + "loss": 1.6872, + "step": 22600 + }, + { + "epoch": 6.93707796193984, + "grad_norm": 0.178196981549263, + "learning_rate": 2.265434419618976e-05, + "loss": 1.7044, + "step": 22601 + }, + { + "epoch": 6.9373848987108655, + "grad_norm": 0.21399027109146118, + "learning_rate": 2.26501830314356e-05, + "loss": 1.7529, + "step": 22602 + }, + { + "epoch": 6.937691835481891, + "grad_norm": 0.21747443079948425, + "learning_rate": 2.264602213696837e-05, + "loss": 1.7662, + "step": 22603 + }, + { + "epoch": 6.937998772252916, + "grad_norm": 0.1939898282289505, + "learning_rate": 2.2641861512829177e-05, + "loss": 1.7194, + "step": 22604 + }, + { + "epoch": 6.9383057090239415, + "grad_norm": 0.2183499038219452, + "learning_rate": 2.2637701159059128e-05, + "loss": 1.6659, + "step": 22605 + }, + { + "epoch": 6.938612645794966, + "grad_norm": 0.21971984207630157, + "learning_rate": 2.2633541075699387e-05, + "loss": 1.7729, + "step": 22606 + }, + { + "epoch": 6.938919582565991, + "grad_norm": 0.2611743211746216, + "learning_rate": 2.2629381262790998e-05, + "loss": 1.8, + "step": 22607 + }, + { + "epoch": 6.939226519337017, + "grad_norm": 0.22962158918380737, + "learning_rate": 2.2625221720375144e-05, + "loss": 1.7244, + "step": 22608 + }, + { + "epoch": 6.939533456108042, + "grad_norm": 0.20961032807826996, + "learning_rate": 2.2621062448492858e-05, + "loss": 1.7107, + "step": 22609 + }, + { + "epoch": 6.939840392879067, + "grad_norm": 0.2370155155658722, + "learning_rate": 2.2616903447185293e-05, + "loss": 1.7185, + "step": 22610 + }, + { + "epoch": 6.940147329650092, + 
"grad_norm": 0.19033893942832947, + "learning_rate": 2.2612744716493544e-05, + "loss": 1.7034, + "step": 22611 + }, + { + "epoch": 6.940454266421117, + "grad_norm": 0.22657649219036102, + "learning_rate": 2.2608586256458704e-05, + "loss": 1.6987, + "step": 22612 + }, + { + "epoch": 6.940761203192142, + "grad_norm": 0.17767953872680664, + "learning_rate": 2.2604428067121862e-05, + "loss": 1.6934, + "step": 22613 + }, + { + "epoch": 6.941068139963168, + "grad_norm": 0.209768146276474, + "learning_rate": 2.2600270148524123e-05, + "loss": 1.7148, + "step": 22614 + }, + { + "epoch": 6.941375076734193, + "grad_norm": 0.21234147250652313, + "learning_rate": 2.2596112500706574e-05, + "loss": 1.7147, + "step": 22615 + }, + { + "epoch": 6.941682013505218, + "grad_norm": 0.17608872056007385, + "learning_rate": 2.2591955123710307e-05, + "loss": 1.6873, + "step": 22616 + }, + { + "epoch": 6.941988950276243, + "grad_norm": 0.1743561178445816, + "learning_rate": 2.25877980175764e-05, + "loss": 1.7273, + "step": 22617 + }, + { + "epoch": 6.942295887047268, + "grad_norm": 0.22064091265201569, + "learning_rate": 2.258364118234594e-05, + "loss": 1.7785, + "step": 22618 + }, + { + "epoch": 6.9426028238182935, + "grad_norm": 0.20353585481643677, + "learning_rate": 2.2579484618060005e-05, + "loss": 1.7518, + "step": 22619 + }, + { + "epoch": 6.942909760589319, + "grad_norm": 0.23978710174560547, + "learning_rate": 2.2575328324759676e-05, + "loss": 1.7576, + "step": 22620 + }, + { + "epoch": 6.943216697360343, + "grad_norm": 0.24991966784000397, + "learning_rate": 2.257117230248603e-05, + "loss": 1.7383, + "step": 22621 + }, + { + "epoch": 6.943523634131369, + "grad_norm": 0.20734381675720215, + "learning_rate": 2.256701655128011e-05, + "loss": 1.7063, + "step": 22622 + }, + { + "epoch": 6.943830570902394, + "grad_norm": 0.20097215473651886, + "learning_rate": 2.2562861071183057e-05, + "loss": 1.7647, + "step": 22623 + }, + { + "epoch": 6.944137507673419, + "grad_norm": 
0.20144836604595184, + "learning_rate": 2.2558705862235852e-05, + "loss": 1.7165, + "step": 22624 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.20394138991832733, + "learning_rate": 2.255455092447964e-05, + "loss": 1.7048, + "step": 22625 + }, + { + "epoch": 6.94475138121547, + "grad_norm": 0.21430160105228424, + "learning_rate": 2.2550396257955396e-05, + "loss": 1.7233, + "step": 22626 + }, + { + "epoch": 6.945058317986494, + "grad_norm": 0.19071494042873383, + "learning_rate": 2.254624186270425e-05, + "loss": 1.7407, + "step": 22627 + }, + { + "epoch": 6.94536525475752, + "grad_norm": 0.19658641517162323, + "learning_rate": 2.2542087738767232e-05, + "loss": 1.6371, + "step": 22628 + }, + { + "epoch": 6.945672191528545, + "grad_norm": 0.19009098410606384, + "learning_rate": 2.25379338861854e-05, + "loss": 1.7515, + "step": 22629 + }, + { + "epoch": 6.94597912829957, + "grad_norm": 0.21250933408737183, + "learning_rate": 2.2533780304999796e-05, + "loss": 1.7308, + "step": 22630 + }, + { + "epoch": 6.946286065070596, + "grad_norm": 0.22148491442203522, + "learning_rate": 2.2529626995251475e-05, + "loss": 1.705, + "step": 22631 + }, + { + "epoch": 6.94659300184162, + "grad_norm": 0.190248504281044, + "learning_rate": 2.252547395698148e-05, + "loss": 1.7507, + "step": 22632 + }, + { + "epoch": 6.9468999386126455, + "grad_norm": 0.20005743205547333, + "learning_rate": 2.2521321190230855e-05, + "loss": 1.7622, + "step": 22633 + }, + { + "epoch": 6.947206875383671, + "grad_norm": 0.24233438074588776, + "learning_rate": 2.251716869504064e-05, + "loss": 1.7119, + "step": 22634 + }, + { + "epoch": 6.947513812154696, + "grad_norm": 0.20823299884796143, + "learning_rate": 2.2513016471451874e-05, + "loss": 1.69, + "step": 22635 + }, + { + "epoch": 6.9478207489257215, + "grad_norm": 0.21486341953277588, + "learning_rate": 2.250886451950559e-05, + "loss": 1.6528, + "step": 22636 + }, + { + "epoch": 6.948127685696747, + "grad_norm": 0.22201848030090332, + 
"learning_rate": 2.2504712839242813e-05, + "loss": 1.7454, + "step": 22637 + }, + { + "epoch": 6.948434622467771, + "grad_norm": 0.25179341435432434, + "learning_rate": 2.2500561430704588e-05, + "loss": 1.7226, + "step": 22638 + }, + { + "epoch": 6.948741559238797, + "grad_norm": 0.2510581910610199, + "learning_rate": 2.2496410293931913e-05, + "loss": 1.7048, + "step": 22639 + }, + { + "epoch": 6.949048496009822, + "grad_norm": 0.2406487911939621, + "learning_rate": 2.2492259428965866e-05, + "loss": 1.6751, + "step": 22640 + }, + { + "epoch": 6.949355432780847, + "grad_norm": 0.2555276155471802, + "learning_rate": 2.24881088358474e-05, + "loss": 1.7369, + "step": 22641 + }, + { + "epoch": 6.949662369551873, + "grad_norm": 0.19703364372253418, + "learning_rate": 2.2483958514617597e-05, + "loss": 1.7196, + "step": 22642 + }, + { + "epoch": 6.949969306322897, + "grad_norm": 0.18491938710212708, + "learning_rate": 2.2479808465317414e-05, + "loss": 1.6923, + "step": 22643 + }, + { + "epoch": 6.9502762430939224, + "grad_norm": 0.21588458120822906, + "learning_rate": 2.247565868798791e-05, + "loss": 1.6797, + "step": 22644 + }, + { + "epoch": 6.950583179864948, + "grad_norm": 0.18480601906776428, + "learning_rate": 2.247150918267008e-05, + "loss": 1.6672, + "step": 22645 + }, + { + "epoch": 6.950890116635973, + "grad_norm": 0.261846125125885, + "learning_rate": 2.246735994940493e-05, + "loss": 1.7594, + "step": 22646 + }, + { + "epoch": 6.9511970534069984, + "grad_norm": 0.24510261416435242, + "learning_rate": 2.2463210988233468e-05, + "loss": 1.7712, + "step": 22647 + }, + { + "epoch": 6.951503990178024, + "grad_norm": 0.25896379351615906, + "learning_rate": 2.24590622991967e-05, + "loss": 1.6811, + "step": 22648 + }, + { + "epoch": 6.951810926949048, + "grad_norm": 0.26284709572792053, + "learning_rate": 2.245491388233561e-05, + "loss": 1.7269, + "step": 22649 + }, + { + "epoch": 6.952117863720074, + "grad_norm": 0.1613062471151352, + "learning_rate": 
2.245076573769121e-05, + "loss": 1.6162, + "step": 22650 + }, + { + "epoch": 6.952424800491099, + "grad_norm": 0.203482523560524, + "learning_rate": 2.244661786530449e-05, + "loss": 1.7124, + "step": 22651 + }, + { + "epoch": 6.952731737262124, + "grad_norm": 0.18294258415699005, + "learning_rate": 2.2442470265216446e-05, + "loss": 1.7101, + "step": 22652 + }, + { + "epoch": 6.953038674033149, + "grad_norm": 0.1841319352388382, + "learning_rate": 2.2438322937468058e-05, + "loss": 1.723, + "step": 22653 + }, + { + "epoch": 6.953345610804174, + "grad_norm": 0.1600010097026825, + "learning_rate": 2.2434175882100322e-05, + "loss": 1.6867, + "step": 22654 + }, + { + "epoch": 6.953652547575199, + "grad_norm": 0.16904005408287048, + "learning_rate": 2.243002909915421e-05, + "loss": 1.6993, + "step": 22655 + }, + { + "epoch": 6.953959484346225, + "grad_norm": 0.20069406926631927, + "learning_rate": 2.2425882588670692e-05, + "loss": 1.6995, + "step": 22656 + }, + { + "epoch": 6.95426642111725, + "grad_norm": 0.170061394572258, + "learning_rate": 2.2421736350690808e-05, + "loss": 1.7217, + "step": 22657 + }, + { + "epoch": 6.954573357888275, + "grad_norm": 0.20549608767032623, + "learning_rate": 2.241759038525545e-05, + "loss": 1.7229, + "step": 22658 + }, + { + "epoch": 6.9548802946593, + "grad_norm": 0.20916205644607544, + "learning_rate": 2.241344469240566e-05, + "loss": 1.7499, + "step": 22659 + }, + { + "epoch": 6.955187231430325, + "grad_norm": 0.156641885638237, + "learning_rate": 2.2409299272182348e-05, + "loss": 1.6827, + "step": 22660 + }, + { + "epoch": 6.9554941682013505, + "grad_norm": 0.17876049876213074, + "learning_rate": 2.240515412462653e-05, + "loss": 1.6745, + "step": 22661 + }, + { + "epoch": 6.955801104972376, + "grad_norm": 0.17265759408473969, + "learning_rate": 2.2401009249779153e-05, + "loss": 1.7687, + "step": 22662 + }, + { + "epoch": 6.956108041743401, + "grad_norm": 0.18822525441646576, + "learning_rate": 2.2396864647681175e-05, + "loss": 
1.6974, + "step": 22663 + }, + { + "epoch": 6.956414978514426, + "grad_norm": 0.18686626851558685, + "learning_rate": 2.2392720318373567e-05, + "loss": 1.7522, + "step": 22664 + }, + { + "epoch": 6.956721915285451, + "grad_norm": 0.1668211668729782, + "learning_rate": 2.238857626189727e-05, + "loss": 1.7198, + "step": 22665 + }, + { + "epoch": 6.957028852056476, + "grad_norm": 0.23307017982006073, + "learning_rate": 2.238443247829325e-05, + "loss": 1.7377, + "step": 22666 + }, + { + "epoch": 6.957335788827502, + "grad_norm": 0.1771896481513977, + "learning_rate": 2.2380288967602453e-05, + "loss": 1.7626, + "step": 22667 + }, + { + "epoch": 6.957642725598527, + "grad_norm": 0.185984805226326, + "learning_rate": 2.237614572986583e-05, + "loss": 1.7328, + "step": 22668 + }, + { + "epoch": 6.957949662369552, + "grad_norm": 0.3076271414756775, + "learning_rate": 2.2372002765124327e-05, + "loss": 1.7081, + "step": 22669 + }, + { + "epoch": 6.958256599140577, + "grad_norm": 0.17874667048454285, + "learning_rate": 2.2367860073418885e-05, + "loss": 1.6752, + "step": 22670 + }, + { + "epoch": 6.958563535911602, + "grad_norm": 0.2044304609298706, + "learning_rate": 2.2363717654790445e-05, + "loss": 1.7325, + "step": 22671 + }, + { + "epoch": 6.958870472682627, + "grad_norm": 0.19335824251174927, + "learning_rate": 2.2359575509279945e-05, + "loss": 1.7192, + "step": 22672 + }, + { + "epoch": 6.959177409453653, + "grad_norm": 0.19514116644859314, + "learning_rate": 2.23554336369283e-05, + "loss": 1.7186, + "step": 22673 + }, + { + "epoch": 6.959484346224678, + "grad_norm": 0.2779110372066498, + "learning_rate": 2.23512920377765e-05, + "loss": 1.7391, + "step": 22674 + }, + { + "epoch": 6.9597912829957025, + "grad_norm": 0.17390480637550354, + "learning_rate": 2.2347150711865406e-05, + "loss": 1.6538, + "step": 22675 + }, + { + "epoch": 6.960098219766728, + "grad_norm": 0.1640262007713318, + "learning_rate": 2.234300965923601e-05, + "loss": 1.6534, + "step": 22676 + }, + { + 
"epoch": 6.960405156537753, + "grad_norm": 0.17519034445285797, + "learning_rate": 2.2338868879929165e-05, + "loss": 1.6931, + "step": 22677 + }, + { + "epoch": 6.9607120933087785, + "grad_norm": 0.16885873675346375, + "learning_rate": 2.2334728373985847e-05, + "loss": 1.7204, + "step": 22678 + }, + { + "epoch": 6.961019030079804, + "grad_norm": 0.16997110843658447, + "learning_rate": 2.2330588141446963e-05, + "loss": 1.7063, + "step": 22679 + }, + { + "epoch": 6.961325966850829, + "grad_norm": 0.17793773114681244, + "learning_rate": 2.2326448182353422e-05, + "loss": 1.7382, + "step": 22680 + }, + { + "epoch": 6.961632903621854, + "grad_norm": 0.1809101551771164, + "learning_rate": 2.2322308496746134e-05, + "loss": 1.6874, + "step": 22681 + }, + { + "epoch": 6.961939840392879, + "grad_norm": 0.19095295667648315, + "learning_rate": 2.2318169084666023e-05, + "loss": 1.7122, + "step": 22682 + }, + { + "epoch": 6.962246777163904, + "grad_norm": 0.19206218421459198, + "learning_rate": 2.2314029946153992e-05, + "loss": 1.6733, + "step": 22683 + }, + { + "epoch": 6.96255371393493, + "grad_norm": 0.21243152022361755, + "learning_rate": 2.2309891081250938e-05, + "loss": 1.7026, + "step": 22684 + }, + { + "epoch": 6.962860650705955, + "grad_norm": 0.17602933943271637, + "learning_rate": 2.2305752489997777e-05, + "loss": 1.7073, + "step": 22685 + }, + { + "epoch": 6.963167587476979, + "grad_norm": 0.21810807287693024, + "learning_rate": 2.2301614172435398e-05, + "loss": 1.7323, + "step": 22686 + }, + { + "epoch": 6.963474524248005, + "grad_norm": 0.20711791515350342, + "learning_rate": 2.2297476128604706e-05, + "loss": 1.7228, + "step": 22687 + }, + { + "epoch": 6.96378146101903, + "grad_norm": 0.20376695692539215, + "learning_rate": 2.2293338358546583e-05, + "loss": 1.715, + "step": 22688 + }, + { + "epoch": 6.964088397790055, + "grad_norm": 0.20096196234226227, + "learning_rate": 2.228920086230194e-05, + "loss": 1.7239, + "step": 22689 + }, + { + "epoch": 6.964395334561081, 
+ "grad_norm": 0.24215486645698547, + "learning_rate": 2.228506363991163e-05, + "loss": 1.7879, + "step": 22690 + }, + { + "epoch": 6.964702271332106, + "grad_norm": 0.1917567104101181, + "learning_rate": 2.2280926691416603e-05, + "loss": 1.6903, + "step": 22691 + }, + { + "epoch": 6.9650092081031305, + "grad_norm": 0.19827421009540558, + "learning_rate": 2.2276790016857673e-05, + "loss": 1.7654, + "step": 22692 + }, + { + "epoch": 6.965316144874156, + "grad_norm": 0.20852476358413696, + "learning_rate": 2.2272653616275784e-05, + "loss": 1.7452, + "step": 22693 + }, + { + "epoch": 6.965623081645181, + "grad_norm": 0.21223776042461395, + "learning_rate": 2.2268517489711755e-05, + "loss": 1.6973, + "step": 22694 + }, + { + "epoch": 6.9659300184162065, + "grad_norm": 0.1903543621301651, + "learning_rate": 2.22643816372065e-05, + "loss": 1.7398, + "step": 22695 + }, + { + "epoch": 6.966236955187231, + "grad_norm": 0.21726597845554352, + "learning_rate": 2.2260246058800888e-05, + "loss": 1.7813, + "step": 22696 + }, + { + "epoch": 6.966543891958256, + "grad_norm": 0.1710241734981537, + "learning_rate": 2.225611075453578e-05, + "loss": 1.6647, + "step": 22697 + }, + { + "epoch": 6.966850828729282, + "grad_norm": 0.199532151222229, + "learning_rate": 2.2251975724452045e-05, + "loss": 1.7503, + "step": 22698 + }, + { + "epoch": 6.967157765500307, + "grad_norm": 0.18966728448867798, + "learning_rate": 2.224784096859055e-05, + "loss": 1.8113, + "step": 22699 + }, + { + "epoch": 6.967464702271332, + "grad_norm": 0.1977413445711136, + "learning_rate": 2.2243706486992162e-05, + "loss": 1.7036, + "step": 22700 + }, + { + "epoch": 6.967771639042358, + "grad_norm": 0.1794840395450592, + "learning_rate": 2.223957227969773e-05, + "loss": 1.714, + "step": 22701 + }, + { + "epoch": 6.968078575813382, + "grad_norm": 0.1811632663011551, + "learning_rate": 2.2235438346748117e-05, + "loss": 1.6845, + "step": 22702 + }, + { + "epoch": 6.968385512584407, + "grad_norm": 0.17478540539741516, 
+ "learning_rate": 2.2231304688184172e-05, + "loss": 1.7078, + "step": 22703 + }, + { + "epoch": 6.968692449355433, + "grad_norm": 0.22631226480007172, + "learning_rate": 2.2227171304046756e-05, + "loss": 1.7576, + "step": 22704 + }, + { + "epoch": 6.968999386126458, + "grad_norm": 0.20498304069042206, + "learning_rate": 2.2223038194376712e-05, + "loss": 1.7342, + "step": 22705 + }, + { + "epoch": 6.969306322897483, + "grad_norm": 0.18556833267211914, + "learning_rate": 2.221890535921488e-05, + "loss": 1.6583, + "step": 22706 + }, + { + "epoch": 6.969613259668508, + "grad_norm": 0.19878216087818146, + "learning_rate": 2.221477279860209e-05, + "loss": 1.7536, + "step": 22707 + }, + { + "epoch": 6.969920196439533, + "grad_norm": 0.20304621756076813, + "learning_rate": 2.221064051257924e-05, + "loss": 1.7263, + "step": 22708 + }, + { + "epoch": 6.9702271332105585, + "grad_norm": 0.18725872039794922, + "learning_rate": 2.220650850118709e-05, + "loss": 1.7174, + "step": 22709 + }, + { + "epoch": 6.970534069981584, + "grad_norm": 0.28994759917259216, + "learning_rate": 2.2202376764466554e-05, + "loss": 1.7401, + "step": 22710 + }, + { + "epoch": 6.970841006752609, + "grad_norm": 0.19320951402187347, + "learning_rate": 2.2198245302458383e-05, + "loss": 1.7204, + "step": 22711 + }, + { + "epoch": 6.9711479435236345, + "grad_norm": 0.24737104773521423, + "learning_rate": 2.2194114115203464e-05, + "loss": 1.7418, + "step": 22712 + }, + { + "epoch": 6.971454880294659, + "grad_norm": 0.18811406195163727, + "learning_rate": 2.218998320274261e-05, + "loss": 1.6999, + "step": 22713 + }, + { + "epoch": 6.971761817065684, + "grad_norm": 0.20729362964630127, + "learning_rate": 2.2185852565116638e-05, + "loss": 1.6833, + "step": 22714 + }, + { + "epoch": 6.97206875383671, + "grad_norm": 0.1862284392118454, + "learning_rate": 2.2181722202366378e-05, + "loss": 1.7232, + "step": 22715 + }, + { + "epoch": 6.972375690607735, + "grad_norm": 0.24128347635269165, + "learning_rate": 
2.217759211453264e-05, + "loss": 1.7081, + "step": 22716 + }, + { + "epoch": 6.97268262737876, + "grad_norm": 0.2007059007883072, + "learning_rate": 2.217346230165625e-05, + "loss": 1.7383, + "step": 22717 + }, + { + "epoch": 6.972989564149785, + "grad_norm": 0.2177598625421524, + "learning_rate": 2.216933276377801e-05, + "loss": 1.7494, + "step": 22718 + }, + { + "epoch": 6.97329650092081, + "grad_norm": 0.20965704321861267, + "learning_rate": 2.2165203500938735e-05, + "loss": 1.7326, + "step": 22719 + }, + { + "epoch": 6.973603437691835, + "grad_norm": 0.17255879938602448, + "learning_rate": 2.2161074513179237e-05, + "loss": 1.6713, + "step": 22720 + }, + { + "epoch": 6.973910374462861, + "grad_norm": 0.21480637788772583, + "learning_rate": 2.215694580054032e-05, + "loss": 1.7248, + "step": 22721 + }, + { + "epoch": 6.974217311233886, + "grad_norm": 0.15835267305374146, + "learning_rate": 2.215281736306278e-05, + "loss": 1.7086, + "step": 22722 + }, + { + "epoch": 6.974524248004911, + "grad_norm": 0.20524290204048157, + "learning_rate": 2.2148689200787415e-05, + "loss": 1.7472, + "step": 22723 + }, + { + "epoch": 6.974831184775936, + "grad_norm": 0.16152524948120117, + "learning_rate": 2.214456131375502e-05, + "loss": 1.6373, + "step": 22724 + }, + { + "epoch": 6.975138121546961, + "grad_norm": 0.1995699107646942, + "learning_rate": 2.2140433702006425e-05, + "loss": 1.6949, + "step": 22725 + }, + { + "epoch": 6.975445058317987, + "grad_norm": 0.19927829504013062, + "learning_rate": 2.213630636558236e-05, + "loss": 1.7875, + "step": 22726 + }, + { + "epoch": 6.975751995089012, + "grad_norm": 0.19159351289272308, + "learning_rate": 2.213217930452368e-05, + "loss": 1.7067, + "step": 22727 + }, + { + "epoch": 6.976058931860036, + "grad_norm": 0.21832366287708282, + "learning_rate": 2.2128052518871107e-05, + "loss": 1.6952, + "step": 22728 + }, + { + "epoch": 6.976365868631062, + "grad_norm": 0.2433125376701355, + "learning_rate": 2.212392600866547e-05, + "loss": 
1.7503, + "step": 22729 + }, + { + "epoch": 6.976672805402087, + "grad_norm": 0.25504401326179504, + "learning_rate": 2.2119799773947535e-05, + "loss": 1.7289, + "step": 22730 + }, + { + "epoch": 6.976979742173112, + "grad_norm": 0.20463863015174866, + "learning_rate": 2.211567381475808e-05, + "loss": 1.7442, + "step": 22731 + }, + { + "epoch": 6.977286678944138, + "grad_norm": 0.21862375736236572, + "learning_rate": 2.2111548131137883e-05, + "loss": 1.7266, + "step": 22732 + }, + { + "epoch": 6.977593615715163, + "grad_norm": 0.2124018520116806, + "learning_rate": 2.210742272312771e-05, + "loss": 1.7555, + "step": 22733 + }, + { + "epoch": 6.9779005524861875, + "grad_norm": 0.2911135256290436, + "learning_rate": 2.2103297590768334e-05, + "loss": 1.711, + "step": 22734 + }, + { + "epoch": 6.978207489257213, + "grad_norm": 0.2172393649816513, + "learning_rate": 2.2099172734100525e-05, + "loss": 1.7054, + "step": 22735 + }, + { + "epoch": 6.978514426028238, + "grad_norm": 0.28964513540267944, + "learning_rate": 2.2095048153165043e-05, + "loss": 1.7231, + "step": 22736 + }, + { + "epoch": 6.9788213627992635, + "grad_norm": 0.2557905316352844, + "learning_rate": 2.209092384800265e-05, + "loss": 1.7219, + "step": 22737 + }, + { + "epoch": 6.979128299570289, + "grad_norm": 0.23358628153800964, + "learning_rate": 2.2086799818654102e-05, + "loss": 1.7627, + "step": 22738 + }, + { + "epoch": 6.979435236341313, + "grad_norm": 0.18856312334537506, + "learning_rate": 2.2082676065160163e-05, + "loss": 1.6577, + "step": 22739 + }, + { + "epoch": 6.979742173112339, + "grad_norm": 0.18412479758262634, + "learning_rate": 2.207855258756158e-05, + "loss": 1.6661, + "step": 22740 + }, + { + "epoch": 6.980049109883364, + "grad_norm": 0.20592401921749115, + "learning_rate": 2.207442938589911e-05, + "loss": 1.6737, + "step": 22741 + }, + { + "epoch": 6.980356046654389, + "grad_norm": 0.2015630006790161, + "learning_rate": 2.2070306460213493e-05, + "loss": 1.73, + "step": 22742 + }, + { + 
"epoch": 6.980662983425415, + "grad_norm": 0.23446126282215118, + "learning_rate": 2.2066183810545454e-05, + "loss": 1.7391, + "step": 22743 + }, + { + "epoch": 6.98096992019644, + "grad_norm": 0.1810954511165619, + "learning_rate": 2.2062061436935803e-05, + "loss": 1.689, + "step": 22744 + }, + { + "epoch": 6.981276856967464, + "grad_norm": 0.25031471252441406, + "learning_rate": 2.20579393394252e-05, + "loss": 1.8161, + "step": 22745 + }, + { + "epoch": 6.98158379373849, + "grad_norm": 0.183212012052536, + "learning_rate": 2.2053817518054433e-05, + "loss": 1.6494, + "step": 22746 + }, + { + "epoch": 6.981890730509515, + "grad_norm": 0.2115766555070877, + "learning_rate": 2.204969597286422e-05, + "loss": 1.6912, + "step": 22747 + }, + { + "epoch": 6.98219766728054, + "grad_norm": 0.19966226816177368, + "learning_rate": 2.2045574703895296e-05, + "loss": 1.7002, + "step": 22748 + }, + { + "epoch": 6.982504604051566, + "grad_norm": 0.20601172745227814, + "learning_rate": 2.2041453711188385e-05, + "loss": 1.7839, + "step": 22749 + }, + { + "epoch": 6.98281154082259, + "grad_norm": 0.2174808531999588, + "learning_rate": 2.2037332994784222e-05, + "loss": 1.7169, + "step": 22750 + }, + { + "epoch": 6.9831184775936155, + "grad_norm": 0.1921808421611786, + "learning_rate": 2.2033212554723514e-05, + "loss": 1.6754, + "step": 22751 + }, + { + "epoch": 6.983425414364641, + "grad_norm": 0.1977350264787674, + "learning_rate": 2.2029092391046997e-05, + "loss": 1.7408, + "step": 22752 + }, + { + "epoch": 6.983732351135666, + "grad_norm": 0.18366695940494537, + "learning_rate": 2.2024972503795383e-05, + "loss": 1.6818, + "step": 22753 + }, + { + "epoch": 6.9840392879066915, + "grad_norm": 0.18127809464931488, + "learning_rate": 2.2020852893009387e-05, + "loss": 1.7392, + "step": 22754 + }, + { + "epoch": 6.984346224677717, + "grad_norm": 0.1973503679037094, + "learning_rate": 2.2016733558729718e-05, + "loss": 1.7416, + "step": 22755 + }, + { + "epoch": 6.984653161448741, + 
"grad_norm": 0.1971634328365326, + "learning_rate": 2.2012614500997096e-05, + "loss": 1.7545, + "step": 22756 + }, + { + "epoch": 6.984960098219767, + "grad_norm": 0.17244087159633636, + "learning_rate": 2.2008495719852218e-05, + "loss": 1.7348, + "step": 22757 + }, + { + "epoch": 6.985267034990792, + "grad_norm": 0.19024424254894257, + "learning_rate": 2.200437721533579e-05, + "loss": 1.6647, + "step": 22758 + }, + { + "epoch": 6.985573971761817, + "grad_norm": 0.18455122411251068, + "learning_rate": 2.200025898748852e-05, + "loss": 1.7528, + "step": 22759 + }, + { + "epoch": 6.985880908532843, + "grad_norm": 0.24437187612056732, + "learning_rate": 2.199614103635108e-05, + "loss": 1.7101, + "step": 22760 + }, + { + "epoch": 6.986187845303867, + "grad_norm": 0.18844331800937653, + "learning_rate": 2.1992023361964224e-05, + "loss": 1.6864, + "step": 22761 + }, + { + "epoch": 6.986494782074892, + "grad_norm": 0.18768003582954407, + "learning_rate": 2.1987905964368576e-05, + "loss": 1.6482, + "step": 22762 + }, + { + "epoch": 6.986801718845918, + "grad_norm": 0.19491778314113617, + "learning_rate": 2.1983788843604898e-05, + "loss": 1.7106, + "step": 22763 + }, + { + "epoch": 6.987108655616943, + "grad_norm": 0.23565757274627686, + "learning_rate": 2.1979671999713797e-05, + "loss": 1.7362, + "step": 22764 + }, + { + "epoch": 6.987415592387968, + "grad_norm": 0.2097240835428238, + "learning_rate": 2.1975555432736018e-05, + "loss": 1.7305, + "step": 22765 + }, + { + "epoch": 6.987722529158994, + "grad_norm": 0.2171555608510971, + "learning_rate": 2.197143914271223e-05, + "loss": 1.7213, + "step": 22766 + }, + { + "epoch": 6.988029465930018, + "grad_norm": 0.1993926763534546, + "learning_rate": 2.196732312968311e-05, + "loss": 1.6901, + "step": 22767 + }, + { + "epoch": 6.9883364027010435, + "grad_norm": 0.2345978319644928, + "learning_rate": 2.1963207393689346e-05, + "loss": 1.7456, + "step": 22768 + }, + { + "epoch": 6.988643339472069, + "grad_norm": 
0.20831161737442017, + "learning_rate": 2.1959091934771564e-05, + "loss": 1.764, + "step": 22769 + }, + { + "epoch": 6.988950276243094, + "grad_norm": 0.24944809079170227, + "learning_rate": 2.195497675297049e-05, + "loss": 1.7398, + "step": 22770 + }, + { + "epoch": 6.989257213014119, + "grad_norm": 0.25463199615478516, + "learning_rate": 2.1950861848326777e-05, + "loss": 1.7002, + "step": 22771 + }, + { + "epoch": 6.989564149785144, + "grad_norm": 0.2298898696899414, + "learning_rate": 2.194674722088108e-05, + "loss": 1.755, + "step": 22772 + }, + { + "epoch": 6.989871086556169, + "grad_norm": 0.21839721500873566, + "learning_rate": 2.194263287067408e-05, + "loss": 1.6667, + "step": 22773 + }, + { + "epoch": 6.990178023327195, + "grad_norm": 0.2197437435388565, + "learning_rate": 2.1938518797746417e-05, + "loss": 1.6774, + "step": 22774 + }, + { + "epoch": 6.99048496009822, + "grad_norm": 0.23588024079799652, + "learning_rate": 2.1934405002138763e-05, + "loss": 1.6916, + "step": 22775 + }, + { + "epoch": 6.990791896869245, + "grad_norm": 0.20632316172122955, + "learning_rate": 2.1930291483891767e-05, + "loss": 1.6682, + "step": 22776 + }, + { + "epoch": 6.99109883364027, + "grad_norm": 0.22786293923854828, + "learning_rate": 2.192617824304607e-05, + "loss": 1.7138, + "step": 22777 + }, + { + "epoch": 6.991405770411295, + "grad_norm": 0.3235599994659424, + "learning_rate": 2.1922065279642363e-05, + "loss": 1.7545, + "step": 22778 + }, + { + "epoch": 6.99171270718232, + "grad_norm": 0.1919393390417099, + "learning_rate": 2.191795259372123e-05, + "loss": 1.7422, + "step": 22779 + }, + { + "epoch": 6.992019643953346, + "grad_norm": 0.16472585499286652, + "learning_rate": 2.1913840185323385e-05, + "loss": 1.6824, + "step": 22780 + }, + { + "epoch": 6.992326580724371, + "grad_norm": 0.21422579884529114, + "learning_rate": 2.1909728054489397e-05, + "loss": 1.696, + "step": 22781 + }, + { + "epoch": 6.9926335174953955, + "grad_norm": 0.18965782225131989, + 
"learning_rate": 2.190561620125996e-05, + "loss": 1.7026, + "step": 22782 + }, + { + "epoch": 6.992940454266421, + "grad_norm": 0.184856116771698, + "learning_rate": 2.190150462567569e-05, + "loss": 1.7202, + "step": 22783 + }, + { + "epoch": 6.993247391037446, + "grad_norm": 0.18382076919078827, + "learning_rate": 2.1897393327777223e-05, + "loss": 1.7525, + "step": 22784 + }, + { + "epoch": 6.9935543278084715, + "grad_norm": 0.17239750921726227, + "learning_rate": 2.1893282307605202e-05, + "loss": 1.7297, + "step": 22785 + }, + { + "epoch": 6.993861264579497, + "grad_norm": 0.18522322177886963, + "learning_rate": 2.18891715652002e-05, + "loss": 1.6952, + "step": 22786 + }, + { + "epoch": 6.994168201350522, + "grad_norm": 0.1946135014295578, + "learning_rate": 2.18850611006029e-05, + "loss": 1.6879, + "step": 22787 + }, + { + "epoch": 6.994475138121547, + "grad_norm": 0.2028069645166397, + "learning_rate": 2.188095091385391e-05, + "loss": 1.7412, + "step": 22788 + }, + { + "epoch": 6.994782074892572, + "grad_norm": 0.18794523179531097, + "learning_rate": 2.1876841004993838e-05, + "loss": 1.6936, + "step": 22789 + }, + { + "epoch": 6.995089011663597, + "grad_norm": 0.1912194788455963, + "learning_rate": 2.187273137406331e-05, + "loss": 1.7051, + "step": 22790 + }, + { + "epoch": 6.995395948434623, + "grad_norm": 0.1528688222169876, + "learning_rate": 2.1868622021102934e-05, + "loss": 1.6816, + "step": 22791 + }, + { + "epoch": 6.995702885205648, + "grad_norm": 0.2108357548713684, + "learning_rate": 2.1864512946153325e-05, + "loss": 1.7018, + "step": 22792 + }, + { + "epoch": 6.996009821976672, + "grad_norm": 0.16667310893535614, + "learning_rate": 2.1860404149255092e-05, + "loss": 1.7235, + "step": 22793 + }, + { + "epoch": 6.996316758747698, + "grad_norm": 0.16995872557163239, + "learning_rate": 2.185629563044882e-05, + "loss": 1.7086, + "step": 22794 + }, + { + "epoch": 6.996623695518723, + "grad_norm": 0.1962304711341858, + "learning_rate": 
2.1852187389775165e-05, + "loss": 1.7523, + "step": 22795 + }, + { + "epoch": 6.996930632289748, + "grad_norm": 0.17774102091789246, + "learning_rate": 2.1848079427274655e-05, + "loss": 1.6649, + "step": 22796 + }, + { + "epoch": 6.997237569060774, + "grad_norm": 0.18844567239284515, + "learning_rate": 2.184397174298796e-05, + "loss": 1.7281, + "step": 22797 + }, + { + "epoch": 6.997544505831799, + "grad_norm": 0.15324150025844574, + "learning_rate": 2.1839864336955607e-05, + "loss": 1.6496, + "step": 22798 + }, + { + "epoch": 6.9978514426028235, + "grad_norm": 0.25148099660873413, + "learning_rate": 2.1835757209218233e-05, + "loss": 1.7889, + "step": 22799 + }, + { + "epoch": 6.998158379373849, + "grad_norm": 0.22258763015270233, + "learning_rate": 2.1831650359816414e-05, + "loss": 1.7303, + "step": 22800 + }, + { + "epoch": 6.998465316144874, + "grad_norm": 0.21465472877025604, + "learning_rate": 2.182754378879074e-05, + "loss": 1.733, + "step": 22801 + }, + { + "epoch": 6.9987722529158995, + "grad_norm": 0.1894017904996872, + "learning_rate": 2.182343749618181e-05, + "loss": 1.7104, + "step": 22802 + }, + { + "epoch": 6.999079189686924, + "grad_norm": 0.19616369903087616, + "learning_rate": 2.181933148203014e-05, + "loss": 1.7015, + "step": 22803 + }, + { + "epoch": 6.999386126457949, + "grad_norm": 0.1720295250415802, + "learning_rate": 2.181522574637638e-05, + "loss": 1.6609, + "step": 22804 + }, + { + "epoch": 6.999693063228975, + "grad_norm": 0.2508579194545746, + "learning_rate": 2.1811120289261077e-05, + "loss": 1.7485, + "step": 22805 + }, + { + "epoch": 7.0, + "grad_norm": 0.1701229363679886, + "learning_rate": 2.1807015110724805e-05, + "loss": 1.6822, + "step": 22806 + }, + { + "epoch": 7.000306936771025, + "grad_norm": 0.17413921654224396, + "learning_rate": 2.1802910210808135e-05, + "loss": 1.6944, + "step": 22807 + }, + { + "epoch": 7.000613873542051, + "grad_norm": 0.22573722898960114, + "learning_rate": 2.179880558955163e-05, + "loss": 1.7499, + 
"step": 22808 + }, + { + "epoch": 7.000920810313075, + "grad_norm": 0.2477746456861496, + "learning_rate": 2.1794701246995857e-05, + "loss": 1.7663, + "step": 22809 + }, + { + "epoch": 7.0012277470841005, + "grad_norm": 0.15338411927223206, + "learning_rate": 2.1790597183181384e-05, + "loss": 1.6425, + "step": 22810 + }, + { + "epoch": 7.001534683855126, + "grad_norm": 0.2119540572166443, + "learning_rate": 2.1786493398148738e-05, + "loss": 1.6695, + "step": 22811 + }, + { + "epoch": 7.001841620626151, + "grad_norm": 0.283037930727005, + "learning_rate": 2.178238989193854e-05, + "loss": 1.7479, + "step": 22812 + }, + { + "epoch": 7.0021485573971765, + "grad_norm": 0.2939838767051697, + "learning_rate": 2.1778286664591276e-05, + "loss": 1.733, + "step": 22813 + }, + { + "epoch": 7.002455494168202, + "grad_norm": 0.21681749820709229, + "learning_rate": 2.1774183716147552e-05, + "loss": 1.6804, + "step": 22814 + }, + { + "epoch": 7.002762430939226, + "grad_norm": 0.29066696763038635, + "learning_rate": 2.177008104664785e-05, + "loss": 1.7435, + "step": 22815 + }, + { + "epoch": 7.003069367710252, + "grad_norm": 0.17104873061180115, + "learning_rate": 2.1765978656132773e-05, + "loss": 1.6637, + "step": 22816 + }, + { + "epoch": 7.003376304481277, + "grad_norm": 0.29808685183525085, + "learning_rate": 2.1761876544642846e-05, + "loss": 1.7342, + "step": 22817 + }, + { + "epoch": 7.003683241252302, + "grad_norm": 0.20467214286327362, + "learning_rate": 2.1757774712218603e-05, + "loss": 1.7638, + "step": 22818 + }, + { + "epoch": 7.003990178023328, + "grad_norm": 0.23166583478450775, + "learning_rate": 2.1753673158900607e-05, + "loss": 1.6972, + "step": 22819 + }, + { + "epoch": 7.004297114794352, + "grad_norm": 0.20098255574703217, + "learning_rate": 2.1749571884729332e-05, + "loss": 1.6973, + "step": 22820 + }, + { + "epoch": 7.004604051565377, + "grad_norm": 0.212421715259552, + "learning_rate": 2.1745470889745358e-05, + "loss": 1.7183, + "step": 22821 + }, + { + 
"epoch": 7.004910988336403, + "grad_norm": 0.2496720403432846, + "learning_rate": 2.17413701739892e-05, + "loss": 1.7928, + "step": 22822 + }, + { + "epoch": 7.005217925107428, + "grad_norm": 0.21050602197647095, + "learning_rate": 2.1737269737501394e-05, + "loss": 1.7379, + "step": 22823 + }, + { + "epoch": 7.005524861878453, + "grad_norm": 0.18321558833122253, + "learning_rate": 2.1733169580322448e-05, + "loss": 1.733, + "step": 22824 + }, + { + "epoch": 7.005831798649478, + "grad_norm": 0.19890302419662476, + "learning_rate": 2.1729069702492887e-05, + "loss": 1.6799, + "step": 22825 + }, + { + "epoch": 7.006138735420503, + "grad_norm": 0.19961030781269073, + "learning_rate": 2.172497010405323e-05, + "loss": 1.6754, + "step": 22826 + }, + { + "epoch": 7.0064456721915285, + "grad_norm": 0.19672131538391113, + "learning_rate": 2.1720870785043988e-05, + "loss": 1.7099, + "step": 22827 + }, + { + "epoch": 7.006752608962554, + "grad_norm": 0.16798892617225647, + "learning_rate": 2.1716771745505666e-05, + "loss": 1.7096, + "step": 22828 + }, + { + "epoch": 7.007059545733579, + "grad_norm": 0.2276654690504074, + "learning_rate": 2.1712672985478815e-05, + "loss": 1.7627, + "step": 22829 + }, + { + "epoch": 7.0073664825046045, + "grad_norm": 0.17108316719532013, + "learning_rate": 2.1708574505003872e-05, + "loss": 1.6941, + "step": 22830 + }, + { + "epoch": 7.007673419275629, + "grad_norm": 0.2094760239124298, + "learning_rate": 2.1704476304121413e-05, + "loss": 1.7152, + "step": 22831 + }, + { + "epoch": 7.007980356046654, + "grad_norm": 0.17183393239974976, + "learning_rate": 2.1700378382871872e-05, + "loss": 1.6668, + "step": 22832 + }, + { + "epoch": 7.00828729281768, + "grad_norm": 0.2075900435447693, + "learning_rate": 2.1696280741295795e-05, + "loss": 1.7732, + "step": 22833 + }, + { + "epoch": 7.008594229588705, + "grad_norm": 0.20075511932373047, + "learning_rate": 2.169218337943368e-05, + "loss": 1.7228, + "step": 22834 + }, + { + "epoch": 7.00890116635973, + 
"grad_norm": 0.19461359083652496, + "learning_rate": 2.168808629732596e-05, + "loss": 1.6942, + "step": 22835 + }, + { + "epoch": 7.009208103130755, + "grad_norm": 0.18972480297088623, + "learning_rate": 2.16839894950132e-05, + "loss": 1.7087, + "step": 22836 + }, + { + "epoch": 7.00951503990178, + "grad_norm": 0.19522632658481598, + "learning_rate": 2.167989297253582e-05, + "loss": 1.7427, + "step": 22837 + }, + { + "epoch": 7.009821976672805, + "grad_norm": 0.2088990956544876, + "learning_rate": 2.1675796729934355e-05, + "loss": 1.786, + "step": 22838 + }, + { + "epoch": 7.010128913443831, + "grad_norm": 0.2052021473646164, + "learning_rate": 2.167170076724927e-05, + "loss": 1.765, + "step": 22839 + }, + { + "epoch": 7.010435850214856, + "grad_norm": 0.19566771388053894, + "learning_rate": 2.1667605084521043e-05, + "loss": 1.703, + "step": 22840 + }, + { + "epoch": 7.0107427869858805, + "grad_norm": 0.24589677155017853, + "learning_rate": 2.166350968179014e-05, + "loss": 1.7544, + "step": 22841 + }, + { + "epoch": 7.011049723756906, + "grad_norm": 0.28059569001197815, + "learning_rate": 2.1659414559097053e-05, + "loss": 1.7081, + "step": 22842 + }, + { + "epoch": 7.011356660527931, + "grad_norm": 0.20781446993350983, + "learning_rate": 2.1655319716482237e-05, + "loss": 1.6968, + "step": 22843 + }, + { + "epoch": 7.0116635972989565, + "grad_norm": 0.31703317165374756, + "learning_rate": 2.1651225153986167e-05, + "loss": 1.704, + "step": 22844 + }, + { + "epoch": 7.011970534069982, + "grad_norm": 0.19668029248714447, + "learning_rate": 2.1647130871649283e-05, + "loss": 1.738, + "step": 22845 + }, + { + "epoch": 7.012277470841007, + "grad_norm": 0.3768141567707062, + "learning_rate": 2.1643036869512105e-05, + "loss": 1.7407, + "step": 22846 + }, + { + "epoch": 7.012584407612032, + "grad_norm": 0.22228674590587616, + "learning_rate": 2.1638943147615032e-05, + "loss": 1.7162, + "step": 22847 + }, + { + "epoch": 7.012891344383057, + "grad_norm": 0.26087433099746704, + 
"learning_rate": 2.1634849705998572e-05, + "loss": 1.6916, + "step": 22848 + }, + { + "epoch": 7.013198281154082, + "grad_norm": 0.19660449028015137, + "learning_rate": 2.1630756544703117e-05, + "loss": 1.7024, + "step": 22849 + }, + { + "epoch": 7.013505217925108, + "grad_norm": 0.2287406474351883, + "learning_rate": 2.1626663663769176e-05, + "loss": 1.6761, + "step": 22850 + }, + { + "epoch": 7.013812154696133, + "grad_norm": 0.18974192440509796, + "learning_rate": 2.162257106323719e-05, + "loss": 1.6721, + "step": 22851 + }, + { + "epoch": 7.014119091467157, + "grad_norm": 0.25081944465637207, + "learning_rate": 2.1618478743147558e-05, + "loss": 1.7042, + "step": 22852 + }, + { + "epoch": 7.014426028238183, + "grad_norm": 0.187479630112648, + "learning_rate": 2.1614386703540785e-05, + "loss": 1.7057, + "step": 22853 + }, + { + "epoch": 7.014732965009208, + "grad_norm": 0.24785932898521423, + "learning_rate": 2.1610294944457243e-05, + "loss": 1.8033, + "step": 22854 + }, + { + "epoch": 7.015039901780233, + "grad_norm": 0.21570228040218353, + "learning_rate": 2.160620346593743e-05, + "loss": 1.7129, + "step": 22855 + }, + { + "epoch": 7.015346838551259, + "grad_norm": 0.19304436445236206, + "learning_rate": 2.160211226802175e-05, + "loss": 1.7384, + "step": 22856 + }, + { + "epoch": 7.015653775322283, + "grad_norm": 0.18901783227920532, + "learning_rate": 2.1598021350750648e-05, + "loss": 1.6851, + "step": 22857 + }, + { + "epoch": 7.0159607120933085, + "grad_norm": 0.21754276752471924, + "learning_rate": 2.159393071416454e-05, + "loss": 1.7242, + "step": 22858 + }, + { + "epoch": 7.016267648864334, + "grad_norm": 0.18334844708442688, + "learning_rate": 2.1589840358303858e-05, + "loss": 1.66, + "step": 22859 + }, + { + "epoch": 7.016574585635359, + "grad_norm": 0.17688371241092682, + "learning_rate": 2.1585750283209026e-05, + "loss": 1.6693, + "step": 22860 + }, + { + "epoch": 7.0168815224063845, + "grad_norm": 0.17173215746879578, + "learning_rate": 
2.158166048892047e-05, + "loss": 1.675, + "step": 22861 + }, + { + "epoch": 7.01718845917741, + "grad_norm": 0.2144075632095337, + "learning_rate": 2.157757097547857e-05, + "loss": 1.7843, + "step": 22862 + }, + { + "epoch": 7.017495395948434, + "grad_norm": 0.18811818957328796, + "learning_rate": 2.1573481742923824e-05, + "loss": 1.6932, + "step": 22863 + }, + { + "epoch": 7.01780233271946, + "grad_norm": 0.19978533685207367, + "learning_rate": 2.1569392791296548e-05, + "loss": 1.7426, + "step": 22864 + }, + { + "epoch": 7.018109269490485, + "grad_norm": 0.19639068841934204, + "learning_rate": 2.1565304120637237e-05, + "loss": 1.7479, + "step": 22865 + }, + { + "epoch": 7.01841620626151, + "grad_norm": 0.2269967794418335, + "learning_rate": 2.1561215730986212e-05, + "loss": 1.7507, + "step": 22866 + }, + { + "epoch": 7.018723143032536, + "grad_norm": 0.19511014223098755, + "learning_rate": 2.1557127622383948e-05, + "loss": 1.7317, + "step": 22867 + }, + { + "epoch": 7.01903007980356, + "grad_norm": 0.23975026607513428, + "learning_rate": 2.1553039794870834e-05, + "loss": 1.7901, + "step": 22868 + }, + { + "epoch": 7.019337016574585, + "grad_norm": 0.20757955312728882, + "learning_rate": 2.154895224848722e-05, + "loss": 1.7823, + "step": 22869 + }, + { + "epoch": 7.019643953345611, + "grad_norm": 0.1893112063407898, + "learning_rate": 2.154486498327357e-05, + "loss": 1.6939, + "step": 22870 + }, + { + "epoch": 7.019950890116636, + "grad_norm": 0.23006685078144073, + "learning_rate": 2.1540777999270205e-05, + "loss": 1.8061, + "step": 22871 + }, + { + "epoch": 7.020257826887661, + "grad_norm": 0.25516194105148315, + "learning_rate": 2.1536691296517573e-05, + "loss": 1.6801, + "step": 22872 + }, + { + "epoch": 7.020564763658686, + "grad_norm": 0.2138557732105255, + "learning_rate": 2.153260487505604e-05, + "loss": 1.7689, + "step": 22873 + }, + { + "epoch": 7.020871700429711, + "grad_norm": 0.2618521749973297, + "learning_rate": 2.152851873492599e-05, + "loss": 
1.712, + "step": 22874 + }, + { + "epoch": 7.0211786372007365, + "grad_norm": 0.19639171659946442, + "learning_rate": 2.1524432876167812e-05, + "loss": 1.6883, + "step": 22875 + }, + { + "epoch": 7.021485573971762, + "grad_norm": 0.20283572375774384, + "learning_rate": 2.152034729882187e-05, + "loss": 1.7259, + "step": 22876 + }, + { + "epoch": 7.021792510742787, + "grad_norm": 0.247970849275589, + "learning_rate": 2.151626200292855e-05, + "loss": 1.6714, + "step": 22877 + }, + { + "epoch": 7.0220994475138125, + "grad_norm": 0.20877771079540253, + "learning_rate": 2.1512176988528227e-05, + "loss": 1.7378, + "step": 22878 + }, + { + "epoch": 7.022406384284837, + "grad_norm": 0.2515791356563568, + "learning_rate": 2.1508092255661245e-05, + "loss": 1.743, + "step": 22879 + }, + { + "epoch": 7.022713321055862, + "grad_norm": 0.21451319754123688, + "learning_rate": 2.150400780436804e-05, + "loss": 1.7102, + "step": 22880 + }, + { + "epoch": 7.023020257826888, + "grad_norm": 0.23944756388664246, + "learning_rate": 2.1499923634688886e-05, + "loss": 1.7739, + "step": 22881 + }, + { + "epoch": 7.023327194597913, + "grad_norm": 0.22423309087753296, + "learning_rate": 2.149583974666423e-05, + "loss": 1.7598, + "step": 22882 + }, + { + "epoch": 7.023634131368938, + "grad_norm": 0.31337371468544006, + "learning_rate": 2.1491756140334358e-05, + "loss": 1.7417, + "step": 22883 + }, + { + "epoch": 7.023941068139963, + "grad_norm": 0.22430868446826935, + "learning_rate": 2.148767281573968e-05, + "loss": 1.712, + "step": 22884 + }, + { + "epoch": 7.024248004910988, + "grad_norm": 0.26083487272262573, + "learning_rate": 2.148358977292054e-05, + "loss": 1.6816, + "step": 22885 + }, + { + "epoch": 7.024554941682013, + "grad_norm": 0.2283557504415512, + "learning_rate": 2.1479507011917255e-05, + "loss": 1.7539, + "step": 22886 + }, + { + "epoch": 7.024861878453039, + "grad_norm": 0.22732317447662354, + "learning_rate": 2.1475424532770232e-05, + "loss": 1.697, + "step": 22887 + }, + { + 
"epoch": 7.025168815224064, + "grad_norm": 0.19614318013191223, + "learning_rate": 2.1471342335519746e-05, + "loss": 1.7267, + "step": 22888 + }, + { + "epoch": 7.0254757519950894, + "grad_norm": 0.23076513409614563, + "learning_rate": 2.1467260420206192e-05, + "loss": 1.7749, + "step": 22889 + }, + { + "epoch": 7.025782688766114, + "grad_norm": 0.1969364732503891, + "learning_rate": 2.1463178786869892e-05, + "loss": 1.6975, + "step": 22890 + }, + { + "epoch": 7.026089625537139, + "grad_norm": 0.2126578837633133, + "learning_rate": 2.145909743555119e-05, + "loss": 1.6815, + "step": 22891 + }, + { + "epoch": 7.026396562308165, + "grad_norm": 0.20841559767723083, + "learning_rate": 2.1455016366290414e-05, + "loss": 1.727, + "step": 22892 + }, + { + "epoch": 7.02670349907919, + "grad_norm": 0.2523893713951111, + "learning_rate": 2.1450935579127896e-05, + "loss": 1.7213, + "step": 22893 + }, + { + "epoch": 7.027010435850215, + "grad_norm": 0.16219666600227356, + "learning_rate": 2.1446855074103968e-05, + "loss": 1.6406, + "step": 22894 + }, + { + "epoch": 7.02731737262124, + "grad_norm": 0.28709226846694946, + "learning_rate": 2.144277485125895e-05, + "loss": 1.7021, + "step": 22895 + }, + { + "epoch": 7.027624309392265, + "grad_norm": 0.23238243162631989, + "learning_rate": 2.1438694910633174e-05, + "loss": 1.7347, + "step": 22896 + }, + { + "epoch": 7.02793124616329, + "grad_norm": 0.2692428231239319, + "learning_rate": 2.1434615252266948e-05, + "loss": 1.7192, + "step": 22897 + }, + { + "epoch": 7.028238182934316, + "grad_norm": 0.21163232624530792, + "learning_rate": 2.1430535876200584e-05, + "loss": 1.7437, + "step": 22898 + }, + { + "epoch": 7.028545119705341, + "grad_norm": 0.23896420001983643, + "learning_rate": 2.1426456782474446e-05, + "loss": 1.6773, + "step": 22899 + }, + { + "epoch": 7.0288520564763655, + "grad_norm": 0.19021281599998474, + "learning_rate": 2.142237797112877e-05, + "loss": 1.7084, + "step": 22900 + }, + { + "epoch": 7.029158993247391, + 
"grad_norm": 0.23483091592788696, + "learning_rate": 2.1418299442203926e-05, + "loss": 1.7678, + "step": 22901 + }, + { + "epoch": 7.029465930018416, + "grad_norm": 0.20831161737442017, + "learning_rate": 2.1414221195740213e-05, + "loss": 1.7454, + "step": 22902 + }, + { + "epoch": 7.0297728667894415, + "grad_norm": 0.1961016207933426, + "learning_rate": 2.141014323177789e-05, + "loss": 1.7231, + "step": 22903 + }, + { + "epoch": 7.030079803560467, + "grad_norm": 0.1877545267343521, + "learning_rate": 2.1406065550357322e-05, + "loss": 1.6925, + "step": 22904 + }, + { + "epoch": 7.030386740331492, + "grad_norm": 0.20815789699554443, + "learning_rate": 2.1401988151518738e-05, + "loss": 1.7762, + "step": 22905 + }, + { + "epoch": 7.030693677102517, + "grad_norm": 0.1902543157339096, + "learning_rate": 2.1397911035302487e-05, + "loss": 1.7663, + "step": 22906 + }, + { + "epoch": 7.031000613873542, + "grad_norm": 0.20552431046962738, + "learning_rate": 2.1393834201748846e-05, + "loss": 1.7048, + "step": 22907 + }, + { + "epoch": 7.031307550644567, + "grad_norm": 0.2380477488040924, + "learning_rate": 2.13897576508981e-05, + "loss": 1.7685, + "step": 22908 + }, + { + "epoch": 7.031614487415593, + "grad_norm": 0.18351083993911743, + "learning_rate": 2.1385681382790536e-05, + "loss": 1.7058, + "step": 22909 + }, + { + "epoch": 7.031921424186618, + "grad_norm": 0.21992792189121246, + "learning_rate": 2.1381605397466442e-05, + "loss": 1.7608, + "step": 22910 + }, + { + "epoch": 7.032228360957642, + "grad_norm": 0.24412932991981506, + "learning_rate": 2.1377529694966097e-05, + "loss": 1.7205, + "step": 22911 + }, + { + "epoch": 7.032535297728668, + "grad_norm": 0.20398534834384918, + "learning_rate": 2.137345427532978e-05, + "loss": 1.7318, + "step": 22912 + }, + { + "epoch": 7.032842234499693, + "grad_norm": 0.2346884161233902, + "learning_rate": 2.136937913859776e-05, + "loss": 1.7159, + "step": 22913 + }, + { + "epoch": 7.033149171270718, + "grad_norm": 
0.19422392547130585, + "learning_rate": 2.1365304284810327e-05, + "loss": 1.7229, + "step": 22914 + }, + { + "epoch": 7.033456108041744, + "grad_norm": 0.24088126420974731, + "learning_rate": 2.1361229714007714e-05, + "loss": 1.77, + "step": 22915 + }, + { + "epoch": 7.033763044812768, + "grad_norm": 0.18886598944664001, + "learning_rate": 2.135715542623026e-05, + "loss": 1.7724, + "step": 22916 + }, + { + "epoch": 7.0340699815837935, + "grad_norm": 0.18816733360290527, + "learning_rate": 2.135308142151814e-05, + "loss": 1.7174, + "step": 22917 + }, + { + "epoch": 7.034376918354819, + "grad_norm": 0.184849813580513, + "learning_rate": 2.1349007699911694e-05, + "loss": 1.7026, + "step": 22918 + }, + { + "epoch": 7.034683855125844, + "grad_norm": 0.1638055443763733, + "learning_rate": 2.134493426145113e-05, + "loss": 1.683, + "step": 22919 + }, + { + "epoch": 7.0349907918968695, + "grad_norm": 0.18030275404453278, + "learning_rate": 2.1340861106176713e-05, + "loss": 1.6963, + "step": 22920 + }, + { + "epoch": 7.035297728667895, + "grad_norm": 0.221226304769516, + "learning_rate": 2.133678823412873e-05, + "loss": 1.7851, + "step": 22921 + }, + { + "epoch": 7.035604665438919, + "grad_norm": 0.18877451121807098, + "learning_rate": 2.1332715645347373e-05, + "loss": 1.7111, + "step": 22922 + }, + { + "epoch": 7.035911602209945, + "grad_norm": 0.17179232835769653, + "learning_rate": 2.1328643339872938e-05, + "loss": 1.6737, + "step": 22923 + }, + { + "epoch": 7.03621853898097, + "grad_norm": 0.17912441492080688, + "learning_rate": 2.1324571317745657e-05, + "loss": 1.7798, + "step": 22924 + }, + { + "epoch": 7.036525475751995, + "grad_norm": 0.2120780050754547, + "learning_rate": 2.132049957900577e-05, + "loss": 1.7353, + "step": 22925 + }, + { + "epoch": 7.036832412523021, + "grad_norm": 0.17286419868469238, + "learning_rate": 2.1316428123693517e-05, + "loss": 1.667, + "step": 22926 + }, + { + "epoch": 7.037139349294045, + "grad_norm": 0.1824301779270172, + 
"learning_rate": 2.1312356951849126e-05, + "loss": 1.6925, + "step": 22927 + }, + { + "epoch": 7.03744628606507, + "grad_norm": 0.16392327845096588, + "learning_rate": 2.1308286063512843e-05, + "loss": 1.7145, + "step": 22928 + }, + { + "epoch": 7.037753222836096, + "grad_norm": 0.18268297612667084, + "learning_rate": 2.1304215458724895e-05, + "loss": 1.7251, + "step": 22929 + }, + { + "epoch": 7.038060159607121, + "grad_norm": 0.19878868758678436, + "learning_rate": 2.1300145137525505e-05, + "loss": 1.7192, + "step": 22930 + }, + { + "epoch": 7.038367096378146, + "grad_norm": 0.18570293486118317, + "learning_rate": 2.1296075099954908e-05, + "loss": 1.718, + "step": 22931 + }, + { + "epoch": 7.038674033149171, + "grad_norm": 0.16497015953063965, + "learning_rate": 2.12920053460533e-05, + "loss": 1.6914, + "step": 22932 + }, + { + "epoch": 7.038980969920196, + "grad_norm": 0.20224586129188538, + "learning_rate": 2.128793587586096e-05, + "loss": 1.6941, + "step": 22933 + }, + { + "epoch": 7.0392879066912215, + "grad_norm": 0.22124920785427094, + "learning_rate": 2.1283866689418024e-05, + "loss": 1.7921, + "step": 22934 + }, + { + "epoch": 7.039594843462247, + "grad_norm": 0.20548123121261597, + "learning_rate": 2.127979778676479e-05, + "loss": 1.7488, + "step": 22935 + }, + { + "epoch": 7.039901780233272, + "grad_norm": 0.17604656517505646, + "learning_rate": 2.1275729167941405e-05, + "loss": 1.7145, + "step": 22936 + }, + { + "epoch": 7.0402087170042975, + "grad_norm": 0.17899781465530396, + "learning_rate": 2.127166083298809e-05, + "loss": 1.6703, + "step": 22937 + }, + { + "epoch": 7.040515653775322, + "grad_norm": 0.16101998090744019, + "learning_rate": 2.126759278194509e-05, + "loss": 1.715, + "step": 22938 + }, + { + "epoch": 7.040822590546347, + "grad_norm": 0.22807051241397858, + "learning_rate": 2.1263525014852542e-05, + "loss": 1.7409, + "step": 22939 + }, + { + "epoch": 7.041129527317373, + "grad_norm": 0.19442932307720184, + "learning_rate": 
2.125945753175072e-05, + "loss": 1.6953, + "step": 22940 + }, + { + "epoch": 7.041436464088398, + "grad_norm": 0.24816946685314178, + "learning_rate": 2.1255390332679755e-05, + "loss": 1.7527, + "step": 22941 + }, + { + "epoch": 7.041743400859423, + "grad_norm": 0.26748740673065186, + "learning_rate": 2.1251323417679882e-05, + "loss": 1.7703, + "step": 22942 + }, + { + "epoch": 7.042050337630448, + "grad_norm": 0.19965825974941254, + "learning_rate": 2.124725678679128e-05, + "loss": 1.7303, + "step": 22943 + }, + { + "epoch": 7.042357274401473, + "grad_norm": 0.2442217618227005, + "learning_rate": 2.124319044005414e-05, + "loss": 1.7183, + "step": 22944 + }, + { + "epoch": 7.042664211172498, + "grad_norm": 0.21421664953231812, + "learning_rate": 2.1239124377508646e-05, + "loss": 1.7348, + "step": 22945 + }, + { + "epoch": 7.042971147943524, + "grad_norm": 0.26072144508361816, + "learning_rate": 2.1235058599194984e-05, + "loss": 1.7396, + "step": 22946 + }, + { + "epoch": 7.043278084714549, + "grad_norm": 0.20694412291049957, + "learning_rate": 2.1230993105153335e-05, + "loss": 1.7871, + "step": 22947 + }, + { + "epoch": 7.043585021485574, + "grad_norm": 0.298551082611084, + "learning_rate": 2.122692789542387e-05, + "loss": 1.7051, + "step": 22948 + }, + { + "epoch": 7.043891958256599, + "grad_norm": 0.22547855973243713, + "learning_rate": 2.1222862970046752e-05, + "loss": 1.7392, + "step": 22949 + }, + { + "epoch": 7.044198895027624, + "grad_norm": 0.3150571882724762, + "learning_rate": 2.1218798329062205e-05, + "loss": 1.6705, + "step": 22950 + }, + { + "epoch": 7.0445058317986495, + "grad_norm": 0.2025378942489624, + "learning_rate": 2.1214733972510327e-05, + "loss": 1.7114, + "step": 22951 + }, + { + "epoch": 7.044812768569675, + "grad_norm": 0.29046711325645447, + "learning_rate": 2.1210669900431353e-05, + "loss": 1.7745, + "step": 22952 + }, + { + "epoch": 7.0451197053407, + "grad_norm": 0.23395368456840515, + "learning_rate": 2.1206606112865396e-05, + "loss": 
1.7829, + "step": 22953 + }, + { + "epoch": 7.045426642111725, + "grad_norm": 0.21395133435726166, + "learning_rate": 2.1202542609852616e-05, + "loss": 1.7211, + "step": 22954 + }, + { + "epoch": 7.04573357888275, + "grad_norm": 0.18077452480793, + "learning_rate": 2.1198479391433223e-05, + "loss": 1.7584, + "step": 22955 + }, + { + "epoch": 7.046040515653775, + "grad_norm": 0.17318682372570038, + "learning_rate": 2.1194416457647302e-05, + "loss": 1.7525, + "step": 22956 + }, + { + "epoch": 7.046347452424801, + "grad_norm": 0.18798092007637024, + "learning_rate": 2.119035380853508e-05, + "loss": 1.7525, + "step": 22957 + }, + { + "epoch": 7.046654389195826, + "grad_norm": 0.18679840862751007, + "learning_rate": 2.118629144413663e-05, + "loss": 1.7729, + "step": 22958 + }, + { + "epoch": 7.04696132596685, + "grad_norm": 0.17846907675266266, + "learning_rate": 2.1182229364492156e-05, + "loss": 1.7354, + "step": 22959 + }, + { + "epoch": 7.047268262737876, + "grad_norm": 0.22771520912647247, + "learning_rate": 2.1178167569641783e-05, + "loss": 1.7086, + "step": 22960 + }, + { + "epoch": 7.047575199508901, + "grad_norm": 0.1541738212108612, + "learning_rate": 2.1174106059625642e-05, + "loss": 1.67, + "step": 22961 + }, + { + "epoch": 7.047882136279926, + "grad_norm": 0.17698390781879425, + "learning_rate": 2.117004483448389e-05, + "loss": 1.68, + "step": 22962 + }, + { + "epoch": 7.048189073050952, + "grad_norm": 0.2220597118139267, + "learning_rate": 2.1165983894256647e-05, + "loss": 1.7783, + "step": 22963 + }, + { + "epoch": 7.048496009821977, + "grad_norm": 0.20971544086933136, + "learning_rate": 2.1161923238984055e-05, + "loss": 1.7318, + "step": 22964 + }, + { + "epoch": 7.0488029465930016, + "grad_norm": 0.2032100409269333, + "learning_rate": 2.1157862868706242e-05, + "loss": 1.6736, + "step": 22965 + }, + { + "epoch": 7.049109883364027, + "grad_norm": 0.19177256524562836, + "learning_rate": 2.115380278346331e-05, + "loss": 1.74, + "step": 22966 + }, + { + 
"epoch": 7.049416820135052, + "grad_norm": 0.1956746131181717, + "learning_rate": 2.1149742983295446e-05, + "loss": 1.7251, + "step": 22967 + }, + { + "epoch": 7.0497237569060776, + "grad_norm": 0.16200929880142212, + "learning_rate": 2.114568346824269e-05, + "loss": 1.6735, + "step": 22968 + }, + { + "epoch": 7.050030693677103, + "grad_norm": 0.19551095366477966, + "learning_rate": 2.1141624238345242e-05, + "loss": 1.7185, + "step": 22969 + }, + { + "epoch": 7.050337630448127, + "grad_norm": 0.17967839539051056, + "learning_rate": 2.1137565293643158e-05, + "loss": 1.7262, + "step": 22970 + }, + { + "epoch": 7.050644567219153, + "grad_norm": 0.15093082189559937, + "learning_rate": 2.1133506634176552e-05, + "loss": 1.6695, + "step": 22971 + }, + { + "epoch": 7.050951503990178, + "grad_norm": 0.20207351446151733, + "learning_rate": 2.1129448259985595e-05, + "loss": 1.7448, + "step": 22972 + }, + { + "epoch": 7.051258440761203, + "grad_norm": 0.20243801176548004, + "learning_rate": 2.112539017111031e-05, + "loss": 1.7496, + "step": 22973 + }, + { + "epoch": 7.051565377532229, + "grad_norm": 0.1967451572418213, + "learning_rate": 2.112133236759088e-05, + "loss": 1.718, + "step": 22974 + }, + { + "epoch": 7.051872314303253, + "grad_norm": 0.17668583989143372, + "learning_rate": 2.1117274849467334e-05, + "loss": 1.7295, + "step": 22975 + }, + { + "epoch": 7.0521792510742785, + "grad_norm": 0.17461778223514557, + "learning_rate": 2.1113217616779824e-05, + "loss": 1.7166, + "step": 22976 + }, + { + "epoch": 7.052486187845304, + "grad_norm": 0.18184112012386322, + "learning_rate": 2.110916066956843e-05, + "loss": 1.7092, + "step": 22977 + }, + { + "epoch": 7.052793124616329, + "grad_norm": 0.18001540005207062, + "learning_rate": 2.1105104007873246e-05, + "loss": 1.7129, + "step": 22978 + }, + { + "epoch": 7.0531000613873545, + "grad_norm": 0.15966519713401794, + "learning_rate": 2.1101047631734355e-05, + "loss": 1.7121, + "step": 22979 + }, + { + "epoch": 7.05340699815838, 
+ "grad_norm": 0.20201170444488525, + "learning_rate": 2.109699154119185e-05, + "loss": 1.7266, + "step": 22980 + }, + { + "epoch": 7.053713934929404, + "grad_norm": 0.19559438526630402, + "learning_rate": 2.1092935736285817e-05, + "loss": 1.7492, + "step": 22981 + }, + { + "epoch": 7.05402087170043, + "grad_norm": 0.17783302068710327, + "learning_rate": 2.108888021705634e-05, + "loss": 1.6901, + "step": 22982 + }, + { + "epoch": 7.054327808471455, + "grad_norm": 0.22052957117557526, + "learning_rate": 2.108482498354347e-05, + "loss": 1.6771, + "step": 22983 + }, + { + "epoch": 7.05463474524248, + "grad_norm": 0.1899181455373764, + "learning_rate": 2.1080770035787346e-05, + "loss": 1.7011, + "step": 22984 + }, + { + "epoch": 7.054941682013506, + "grad_norm": 0.19773316383361816, + "learning_rate": 2.1076715373827964e-05, + "loss": 1.7535, + "step": 22985 + }, + { + "epoch": 7.05524861878453, + "grad_norm": 0.2244229018688202, + "learning_rate": 2.1072660997705475e-05, + "loss": 1.7938, + "step": 22986 + }, + { + "epoch": 7.055555555555555, + "grad_norm": 0.18881015479564667, + "learning_rate": 2.106860690745988e-05, + "loss": 1.6753, + "step": 22987 + }, + { + "epoch": 7.055862492326581, + "grad_norm": 0.19642052054405212, + "learning_rate": 2.106455310313126e-05, + "loss": 1.735, + "step": 22988 + }, + { + "epoch": 7.056169429097606, + "grad_norm": 0.23549412190914154, + "learning_rate": 2.106049958475971e-05, + "loss": 1.7705, + "step": 22989 + }, + { + "epoch": 7.056476365868631, + "grad_norm": 0.21001911163330078, + "learning_rate": 2.1056446352385235e-05, + "loss": 1.6802, + "step": 22990 + }, + { + "epoch": 7.056783302639656, + "grad_norm": 0.1821003556251526, + "learning_rate": 2.1052393406047953e-05, + "loss": 1.7144, + "step": 22991 + }, + { + "epoch": 7.057090239410681, + "grad_norm": 0.1979309767484665, + "learning_rate": 2.104834074578786e-05, + "loss": 1.6983, + "step": 22992 + }, + { + "epoch": 7.0573971761817065, + "grad_norm": 0.18264134228229523, + 
"learning_rate": 2.1044288371645045e-05, + "loss": 1.7001, + "step": 22993 + }, + { + "epoch": 7.057704112952732, + "grad_norm": 0.17276059091091156, + "learning_rate": 2.104023628365954e-05, + "loss": 1.6976, + "step": 22994 + }, + { + "epoch": 7.058011049723757, + "grad_norm": 0.18879400193691254, + "learning_rate": 2.1036184481871402e-05, + "loss": 1.6954, + "step": 22995 + }, + { + "epoch": 7.0583179864947825, + "grad_norm": 0.1956210434436798, + "learning_rate": 2.103213296632066e-05, + "loss": 1.7329, + "step": 22996 + }, + { + "epoch": 7.058624923265807, + "grad_norm": 0.21108154952526093, + "learning_rate": 2.1028081737047356e-05, + "loss": 1.7299, + "step": 22997 + }, + { + "epoch": 7.058931860036832, + "grad_norm": 0.17981186509132385, + "learning_rate": 2.1024030794091537e-05, + "loss": 1.7162, + "step": 22998 + }, + { + "epoch": 7.059238796807858, + "grad_norm": 0.1699269711971283, + "learning_rate": 2.101998013749322e-05, + "loss": 1.6842, + "step": 22999 + }, + { + "epoch": 7.059545733578883, + "grad_norm": 0.17033198475837708, + "learning_rate": 2.1015929767292435e-05, + "loss": 1.6735, + "step": 23000 + }, + { + "epoch": 7.059852670349908, + "grad_norm": 0.18620076775550842, + "learning_rate": 2.101187968352925e-05, + "loss": 1.7328, + "step": 23001 + }, + { + "epoch": 7.060159607120933, + "grad_norm": 0.17528964579105377, + "learning_rate": 2.100782988624363e-05, + "loss": 1.6567, + "step": 23002 + }, + { + "epoch": 7.060466543891958, + "grad_norm": 0.1946999728679657, + "learning_rate": 2.100378037547566e-05, + "loss": 1.7349, + "step": 23003 + }, + { + "epoch": 7.060773480662983, + "grad_norm": 0.23345647752285004, + "learning_rate": 2.0999731151265312e-05, + "loss": 1.7185, + "step": 23004 + }, + { + "epoch": 7.061080417434009, + "grad_norm": 0.20169813930988312, + "learning_rate": 2.0995682213652603e-05, + "loss": 1.7223, + "step": 23005 + }, + { + "epoch": 7.061387354205034, + "grad_norm": 0.2397730052471161, + "learning_rate": 
2.0991633562677594e-05, + "loss": 1.7542, + "step": 23006 + }, + { + "epoch": 7.0616942909760585, + "grad_norm": 0.20421954989433289, + "learning_rate": 2.0987585198380227e-05, + "loss": 1.6888, + "step": 23007 + }, + { + "epoch": 7.062001227747084, + "grad_norm": 0.21555101871490479, + "learning_rate": 2.0983537120800584e-05, + "loss": 1.6796, + "step": 23008 + }, + { + "epoch": 7.062308164518109, + "grad_norm": 0.17311134934425354, + "learning_rate": 2.0979489329978603e-05, + "loss": 1.7199, + "step": 23009 + }, + { + "epoch": 7.0626151012891345, + "grad_norm": 0.25064393877983093, + "learning_rate": 2.0975441825954334e-05, + "loss": 1.6947, + "step": 23010 + }, + { + "epoch": 7.06292203806016, + "grad_norm": 0.19135847687721252, + "learning_rate": 2.0971394608767757e-05, + "loss": 1.702, + "step": 23011 + }, + { + "epoch": 7.063228974831185, + "grad_norm": 0.22994364798069, + "learning_rate": 2.0967347678458876e-05, + "loss": 1.6814, + "step": 23012 + }, + { + "epoch": 7.06353591160221, + "grad_norm": 0.21897611021995544, + "learning_rate": 2.0963301035067685e-05, + "loss": 1.7063, + "step": 23013 + }, + { + "epoch": 7.063842848373235, + "grad_norm": 0.23615150153636932, + "learning_rate": 2.0959254678634166e-05, + "loss": 1.7299, + "step": 23014 + }, + { + "epoch": 7.06414978514426, + "grad_norm": 0.1837770640850067, + "learning_rate": 2.0955208609198314e-05, + "loss": 1.7236, + "step": 23015 + }, + { + "epoch": 7.064456721915286, + "grad_norm": 0.16823385655879974, + "learning_rate": 2.0951162826800118e-05, + "loss": 1.6687, + "step": 23016 + }, + { + "epoch": 7.064763658686311, + "grad_norm": 0.17042338848114014, + "learning_rate": 2.094711733147954e-05, + "loss": 1.6907, + "step": 23017 + }, + { + "epoch": 7.065070595457335, + "grad_norm": 0.1753006875514984, + "learning_rate": 2.094307212327661e-05, + "loss": 1.7313, + "step": 23018 + }, + { + "epoch": 7.065377532228361, + "grad_norm": 0.19618375599384308, + "learning_rate": 2.093902720223123e-05, + "loss": 
1.7147, + "step": 23019 + }, + { + "epoch": 7.065684468999386, + "grad_norm": 0.20214296877384186, + "learning_rate": 2.093498256838346e-05, + "loss": 1.7056, + "step": 23020 + }, + { + "epoch": 7.065991405770411, + "grad_norm": 0.20230883359909058, + "learning_rate": 2.093093822177321e-05, + "loss": 1.6628, + "step": 23021 + }, + { + "epoch": 7.066298342541437, + "grad_norm": 0.19913128018379211, + "learning_rate": 2.0926894162440446e-05, + "loss": 1.7286, + "step": 23022 + }, + { + "epoch": 7.066605279312462, + "grad_norm": 0.19535091519355774, + "learning_rate": 2.0922850390425193e-05, + "loss": 1.745, + "step": 23023 + }, + { + "epoch": 7.0669122160834865, + "grad_norm": 0.19679825007915497, + "learning_rate": 2.0918806905767337e-05, + "loss": 1.694, + "step": 23024 + }, + { + "epoch": 7.067219152854512, + "grad_norm": 0.1821403056383133, + "learning_rate": 2.0914763708506913e-05, + "loss": 1.7163, + "step": 23025 + }, + { + "epoch": 7.067526089625537, + "grad_norm": 0.17138415575027466, + "learning_rate": 2.0910720798683803e-05, + "loss": 1.6946, + "step": 23026 + }, + { + "epoch": 7.0678330263965625, + "grad_norm": 0.20219111442565918, + "learning_rate": 2.0906678176338017e-05, + "loss": 1.7437, + "step": 23027 + }, + { + "epoch": 7.068139963167588, + "grad_norm": 0.1985882669687271, + "learning_rate": 2.0902635841509494e-05, + "loss": 1.6762, + "step": 23028 + }, + { + "epoch": 7.068446899938612, + "grad_norm": 0.18586322665214539, + "learning_rate": 2.0898593794238174e-05, + "loss": 1.7296, + "step": 23029 + }, + { + "epoch": 7.068753836709638, + "grad_norm": 0.19222751259803772, + "learning_rate": 2.0894552034564013e-05, + "loss": 1.7186, + "step": 23030 + }, + { + "epoch": 7.069060773480663, + "grad_norm": 0.16107569634914398, + "learning_rate": 2.0890510562526944e-05, + "loss": 1.6898, + "step": 23031 + }, + { + "epoch": 7.069367710251688, + "grad_norm": 0.23859064280986786, + "learning_rate": 2.088646937816691e-05, + "loss": 1.7992, + "step": 23032 + }, 
+ { + "epoch": 7.069674647022714, + "grad_norm": 0.22927051782608032, + "learning_rate": 2.0882428481523853e-05, + "loss": 1.7162, + "step": 23033 + }, + { + "epoch": 7.069981583793738, + "grad_norm": 0.18094350397586823, + "learning_rate": 2.0878387872637684e-05, + "loss": 1.7297, + "step": 23034 + }, + { + "epoch": 7.070288520564763, + "grad_norm": 0.20562811195850372, + "learning_rate": 2.087434755154839e-05, + "loss": 1.7475, + "step": 23035 + }, + { + "epoch": 7.070595457335789, + "grad_norm": 0.18405984342098236, + "learning_rate": 2.087030751829583e-05, + "loss": 1.6954, + "step": 23036 + }, + { + "epoch": 7.070902394106814, + "grad_norm": 0.26286160945892334, + "learning_rate": 2.0866267772919994e-05, + "loss": 1.7406, + "step": 23037 + }, + { + "epoch": 7.071209330877839, + "grad_norm": 0.1688467413187027, + "learning_rate": 2.086222831546077e-05, + "loss": 1.7375, + "step": 23038 + }, + { + "epoch": 7.071516267648865, + "grad_norm": 0.25445011258125305, + "learning_rate": 2.0858189145958057e-05, + "loss": 1.7479, + "step": 23039 + }, + { + "epoch": 7.071823204419889, + "grad_norm": 0.20637978613376617, + "learning_rate": 2.085415026445184e-05, + "loss": 1.7653, + "step": 23040 + }, + { + "epoch": 7.0721301411909145, + "grad_norm": 0.21693937480449677, + "learning_rate": 2.0850111670981952e-05, + "loss": 1.7392, + "step": 23041 + }, + { + "epoch": 7.07243707796194, + "grad_norm": 0.1999017745256424, + "learning_rate": 2.0846073365588388e-05, + "loss": 1.753, + "step": 23042 + }, + { + "epoch": 7.072744014732965, + "grad_norm": 0.2271260917186737, + "learning_rate": 2.0842035348310973e-05, + "loss": 1.7136, + "step": 23043 + }, + { + "epoch": 7.0730509515039905, + "grad_norm": 0.1915169358253479, + "learning_rate": 2.0837997619189675e-05, + "loss": 1.7142, + "step": 23044 + }, + { + "epoch": 7.073357888275015, + "grad_norm": 0.2250204086303711, + "learning_rate": 2.0833960178264377e-05, + "loss": 1.8039, + "step": 23045 + }, + { + "epoch": 7.07366482504604, 
+ "grad_norm": 0.20920081436634064, + "learning_rate": 2.0829923025574976e-05, + "loss": 1.767, + "step": 23046 + }, + { + "epoch": 7.073971761817066, + "grad_norm": 0.16039173305034637, + "learning_rate": 2.082588616116138e-05, + "loss": 1.6895, + "step": 23047 + }, + { + "epoch": 7.074278698588091, + "grad_norm": 0.1849806159734726, + "learning_rate": 2.082184958506347e-05, + "loss": 1.7323, + "step": 23048 + }, + { + "epoch": 7.074585635359116, + "grad_norm": 0.22370420396327972, + "learning_rate": 2.081781329732115e-05, + "loss": 1.7478, + "step": 23049 + }, + { + "epoch": 7.074892572130141, + "grad_norm": 0.1600474864244461, + "learning_rate": 2.0813777297974296e-05, + "loss": 1.6754, + "step": 23050 + }, + { + "epoch": 7.075199508901166, + "grad_norm": 0.18357187509536743, + "learning_rate": 2.080974158706281e-05, + "loss": 1.694, + "step": 23051 + }, + { + "epoch": 7.0755064456721914, + "grad_norm": 0.17667005956172943, + "learning_rate": 2.080570616462656e-05, + "loss": 1.7053, + "step": 23052 + }, + { + "epoch": 7.075813382443217, + "grad_norm": 0.19393591582775116, + "learning_rate": 2.0801671030705417e-05, + "loss": 1.7917, + "step": 23053 + }, + { + "epoch": 7.076120319214242, + "grad_norm": 0.19432564079761505, + "learning_rate": 2.0797636185339307e-05, + "loss": 1.7276, + "step": 23054 + }, + { + "epoch": 7.0764272559852675, + "grad_norm": 0.17960594594478607, + "learning_rate": 2.079360162856806e-05, + "loss": 1.6988, + "step": 23055 + }, + { + "epoch": 7.076734192756292, + "grad_norm": 0.183505579829216, + "learning_rate": 2.0789567360431538e-05, + "loss": 1.7106, + "step": 23056 + }, + { + "epoch": 7.077041129527317, + "grad_norm": 0.27859750390052795, + "learning_rate": 2.0785533380969673e-05, + "loss": 1.779, + "step": 23057 + }, + { + "epoch": 7.077348066298343, + "grad_norm": 0.1903255134820938, + "learning_rate": 2.078149969022225e-05, + "loss": 1.7334, + "step": 23058 + }, + { + "epoch": 7.077655003069368, + "grad_norm": 0.2221076786518097, + 
"learning_rate": 2.0777466288229207e-05, + "loss": 1.6863, + "step": 23059 + }, + { + "epoch": 7.077961939840393, + "grad_norm": 0.15516065061092377, + "learning_rate": 2.0773433175030336e-05, + "loss": 1.6633, + "step": 23060 + }, + { + "epoch": 7.078268876611418, + "grad_norm": 0.20073910057544708, + "learning_rate": 2.0769400350665553e-05, + "loss": 1.7057, + "step": 23061 + }, + { + "epoch": 7.078575813382443, + "grad_norm": 0.1680205762386322, + "learning_rate": 2.076536781517468e-05, + "loss": 1.6659, + "step": 23062 + }, + { + "epoch": 7.078882750153468, + "grad_norm": 0.20825456082820892, + "learning_rate": 2.0761335568597584e-05, + "loss": 1.751, + "step": 23063 + }, + { + "epoch": 7.079189686924494, + "grad_norm": 0.17365674674510956, + "learning_rate": 2.0757303610974098e-05, + "loss": 1.6591, + "step": 23064 + }, + { + "epoch": 7.079496623695519, + "grad_norm": 0.21712929010391235, + "learning_rate": 2.0753271942344087e-05, + "loss": 1.7357, + "step": 23065 + }, + { + "epoch": 7.0798035604665435, + "grad_norm": 0.1841089278459549, + "learning_rate": 2.074924056274738e-05, + "loss": 1.6818, + "step": 23066 + }, + { + "epoch": 7.080110497237569, + "grad_norm": 0.20433486998081207, + "learning_rate": 2.074520947222382e-05, + "loss": 1.76, + "step": 23067 + }, + { + "epoch": 7.080417434008594, + "grad_norm": 0.1712963879108429, + "learning_rate": 2.074117867081325e-05, + "loss": 1.6426, + "step": 23068 + }, + { + "epoch": 7.0807243707796195, + "grad_norm": 0.19894109666347504, + "learning_rate": 2.0737148158555504e-05, + "loss": 1.7529, + "step": 23069 + }, + { + "epoch": 7.081031307550645, + "grad_norm": 0.19338269531726837, + "learning_rate": 2.0733117935490386e-05, + "loss": 1.8274, + "step": 23070 + }, + { + "epoch": 7.08133824432167, + "grad_norm": 0.20883139967918396, + "learning_rate": 2.0729088001657794e-05, + "loss": 1.7275, + "step": 23071 + }, + { + "epoch": 7.081645181092695, + "grad_norm": 0.18498694896697998, + "learning_rate": 
2.0725058357097487e-05, + "loss": 1.6648, + "step": 23072 + }, + { + "epoch": 7.08195211786372, + "grad_norm": 0.1727421134710312, + "learning_rate": 2.0721029001849313e-05, + "loss": 1.7709, + "step": 23073 + }, + { + "epoch": 7.082259054634745, + "grad_norm": 0.16965949535369873, + "learning_rate": 2.0716999935953096e-05, + "loss": 1.6876, + "step": 23074 + }, + { + "epoch": 7.082565991405771, + "grad_norm": 0.16905519366264343, + "learning_rate": 2.0712971159448623e-05, + "loss": 1.6576, + "step": 23075 + }, + { + "epoch": 7.082872928176796, + "grad_norm": 0.2863580882549286, + "learning_rate": 2.0708942672375776e-05, + "loss": 1.7631, + "step": 23076 + }, + { + "epoch": 7.08317986494782, + "grad_norm": 0.26248931884765625, + "learning_rate": 2.070491447477429e-05, + "loss": 1.7692, + "step": 23077 + }, + { + "epoch": 7.083486801718846, + "grad_norm": 0.17670878767967224, + "learning_rate": 2.0700886566684024e-05, + "loss": 1.6725, + "step": 23078 + }, + { + "epoch": 7.083793738489871, + "grad_norm": 0.19245800375938416, + "learning_rate": 2.0696858948144775e-05, + "loss": 1.7249, + "step": 23079 + }, + { + "epoch": 7.084100675260896, + "grad_norm": 0.18651939928531647, + "learning_rate": 2.0692831619196335e-05, + "loss": 1.7616, + "step": 23080 + }, + { + "epoch": 7.084407612031922, + "grad_norm": 0.21432510018348694, + "learning_rate": 2.0688804579878514e-05, + "loss": 1.743, + "step": 23081 + }, + { + "epoch": 7.084714548802946, + "grad_norm": 0.18530069291591644, + "learning_rate": 2.0684777830231106e-05, + "loss": 1.7257, + "step": 23082 + }, + { + "epoch": 7.0850214855739715, + "grad_norm": 0.1974172443151474, + "learning_rate": 2.0680751370293903e-05, + "loss": 1.6918, + "step": 23083 + }, + { + "epoch": 7.085328422344997, + "grad_norm": 0.19517268240451813, + "learning_rate": 2.0676725200106706e-05, + "loss": 1.7421, + "step": 23084 + }, + { + "epoch": 7.085635359116022, + "grad_norm": 0.28572699427604675, + "learning_rate": 2.067269931970929e-05, + 
"loss": 1.7575, + "step": 23085 + }, + { + "epoch": 7.0859422958870475, + "grad_norm": 0.2062397003173828, + "learning_rate": 2.0668673729141452e-05, + "loss": 1.7085, + "step": 23086 + }, + { + "epoch": 7.086249232658073, + "grad_norm": 0.21619725227355957, + "learning_rate": 2.0664648428442973e-05, + "loss": 1.7783, + "step": 23087 + }, + { + "epoch": 7.086556169429097, + "grad_norm": 0.2732481360435486, + "learning_rate": 2.066062341765363e-05, + "loss": 1.7089, + "step": 23088 + }, + { + "epoch": 7.086863106200123, + "grad_norm": 0.19897356629371643, + "learning_rate": 2.06565986968132e-05, + "loss": 1.6487, + "step": 23089 + }, + { + "epoch": 7.087170042971148, + "grad_norm": 0.2578796148300171, + "learning_rate": 2.0652574265961466e-05, + "loss": 1.7385, + "step": 23090 + }, + { + "epoch": 7.087476979742173, + "grad_norm": 0.18980316817760468, + "learning_rate": 2.0648550125138195e-05, + "loss": 1.6651, + "step": 23091 + }, + { + "epoch": 7.087783916513199, + "grad_norm": 0.279580682516098, + "learning_rate": 2.064452627438313e-05, + "loss": 1.7189, + "step": 23092 + }, + { + "epoch": 7.088090853284223, + "grad_norm": 0.18652775883674622, + "learning_rate": 2.0640502713736103e-05, + "loss": 1.7085, + "step": 23093 + }, + { + "epoch": 7.088397790055248, + "grad_norm": 0.2729358673095703, + "learning_rate": 2.06364794432368e-05, + "loss": 1.6812, + "step": 23094 + }, + { + "epoch": 7.088704726826274, + "grad_norm": 0.1756472885608673, + "learning_rate": 2.0632456462925053e-05, + "loss": 1.6835, + "step": 23095 + }, + { + "epoch": 7.089011663597299, + "grad_norm": 0.2352994978427887, + "learning_rate": 2.062843377284055e-05, + "loss": 1.6898, + "step": 23096 + }, + { + "epoch": 7.089318600368324, + "grad_norm": 0.20231495797634125, + "learning_rate": 2.0624411373023093e-05, + "loss": 1.7294, + "step": 23097 + }, + { + "epoch": 7.08962553713935, + "grad_norm": 0.276114821434021, + "learning_rate": 2.0620389263512424e-05, + "loss": 1.6864, + "step": 23098 + }, + { 
+ "epoch": 7.089932473910374, + "grad_norm": 0.2178632766008377, + "learning_rate": 2.0616367444348288e-05, + "loss": 1.7353, + "step": 23099 + }, + { + "epoch": 7.0902394106813995, + "grad_norm": 0.20966552197933197, + "learning_rate": 2.061234591557043e-05, + "loss": 1.6579, + "step": 23100 + }, + { + "epoch": 7.090546347452425, + "grad_norm": 0.16496559977531433, + "learning_rate": 2.0608324677218592e-05, + "loss": 1.7137, + "step": 23101 + }, + { + "epoch": 7.09085328422345, + "grad_norm": 0.19176827371120453, + "learning_rate": 2.0604303729332525e-05, + "loss": 1.6996, + "step": 23102 + }, + { + "epoch": 7.0911602209944755, + "grad_norm": 0.20933480560779572, + "learning_rate": 2.060028307195195e-05, + "loss": 1.7887, + "step": 23103 + }, + { + "epoch": 7.0914671577655, + "grad_norm": 0.1925809681415558, + "learning_rate": 2.0596262705116613e-05, + "loss": 1.6974, + "step": 23104 + }, + { + "epoch": 7.091774094536525, + "grad_norm": 0.1582585573196411, + "learning_rate": 2.0592242628866236e-05, + "loss": 1.6731, + "step": 23105 + }, + { + "epoch": 7.092081031307551, + "grad_norm": 0.20380592346191406, + "learning_rate": 2.058822284324056e-05, + "loss": 1.6911, + "step": 23106 + }, + { + "epoch": 7.092387968078576, + "grad_norm": 0.17984862625598907, + "learning_rate": 2.0584203348279307e-05, + "loss": 1.7218, + "step": 23107 + }, + { + "epoch": 7.092694904849601, + "grad_norm": 0.22097790241241455, + "learning_rate": 2.058018414402219e-05, + "loss": 1.7223, + "step": 23108 + }, + { + "epoch": 7.093001841620626, + "grad_norm": 0.20519912242889404, + "learning_rate": 2.0576165230508926e-05, + "loss": 1.7197, + "step": 23109 + }, + { + "epoch": 7.093308778391651, + "grad_norm": 0.2156807780265808, + "learning_rate": 2.0572146607779274e-05, + "loss": 1.7079, + "step": 23110 + }, + { + "epoch": 7.093615715162676, + "grad_norm": 0.21810726821422577, + "learning_rate": 2.056812827587288e-05, + "loss": 1.7456, + "step": 23111 + }, + { + "epoch": 7.093922651933702, + 
"grad_norm": 0.2288726568222046, + "learning_rate": 2.0564110234829536e-05, + "loss": 1.8113, + "step": 23112 + }, + { + "epoch": 7.094229588704727, + "grad_norm": 0.21279199421405792, + "learning_rate": 2.056009248468887e-05, + "loss": 1.7554, + "step": 23113 + }, + { + "epoch": 7.094536525475752, + "grad_norm": 0.18577606976032257, + "learning_rate": 2.055607502549064e-05, + "loss": 1.661, + "step": 23114 + }, + { + "epoch": 7.094843462246777, + "grad_norm": 0.17938728630542755, + "learning_rate": 2.0552057857274536e-05, + "loss": 1.6998, + "step": 23115 + }, + { + "epoch": 7.095150399017802, + "grad_norm": 0.1946432888507843, + "learning_rate": 2.0548040980080258e-05, + "loss": 1.7146, + "step": 23116 + }, + { + "epoch": 7.0954573357888275, + "grad_norm": 0.21220463514328003, + "learning_rate": 2.0544024393947496e-05, + "loss": 1.7345, + "step": 23117 + }, + { + "epoch": 7.095764272559853, + "grad_norm": 0.2006370723247528, + "learning_rate": 2.0540008098915954e-05, + "loss": 1.7636, + "step": 23118 + }, + { + "epoch": 7.096071209330878, + "grad_norm": 0.17251192033290863, + "learning_rate": 2.0535992095025312e-05, + "loss": 1.7103, + "step": 23119 + }, + { + "epoch": 7.096378146101903, + "grad_norm": 0.2393570840358734, + "learning_rate": 2.0531976382315277e-05, + "loss": 1.7636, + "step": 23120 + }, + { + "epoch": 7.096685082872928, + "grad_norm": 0.16999265551567078, + "learning_rate": 2.0527960960825516e-05, + "loss": 1.6571, + "step": 23121 + }, + { + "epoch": 7.096992019643953, + "grad_norm": 0.17626826465129852, + "learning_rate": 2.052394583059572e-05, + "loss": 1.713, + "step": 23122 + }, + { + "epoch": 7.097298956414979, + "grad_norm": 0.18373346328735352, + "learning_rate": 2.051993099166557e-05, + "loss": 1.7102, + "step": 23123 + }, + { + "epoch": 7.097605893186004, + "grad_norm": 0.1913219541311264, + "learning_rate": 2.0515916444074734e-05, + "loss": 1.7441, + "step": 23124 + }, + { + "epoch": 7.097912829957028, + "grad_norm": 0.19664399325847626, 
+ "learning_rate": 2.0511902187862903e-05, + "loss": 1.6866, + "step": 23125 + }, + { + "epoch": 7.098219766728054, + "grad_norm": 0.16524936258792877, + "learning_rate": 2.050788822306971e-05, + "loss": 1.6709, + "step": 23126 + }, + { + "epoch": 7.098526703499079, + "grad_norm": 0.19291190803050995, + "learning_rate": 2.050387454973489e-05, + "loss": 1.7033, + "step": 23127 + }, + { + "epoch": 7.098833640270104, + "grad_norm": 0.19915525615215302, + "learning_rate": 2.0499861167898037e-05, + "loss": 1.7425, + "step": 23128 + }, + { + "epoch": 7.09914057704113, + "grad_norm": 0.21295227110385895, + "learning_rate": 2.0495848077598883e-05, + "loss": 1.7516, + "step": 23129 + }, + { + "epoch": 7.099447513812155, + "grad_norm": 0.21469831466674805, + "learning_rate": 2.0491835278877014e-05, + "loss": 1.7129, + "step": 23130 + }, + { + "epoch": 7.0997544505831796, + "grad_norm": 0.16860374808311462, + "learning_rate": 2.0487822771772143e-05, + "loss": 1.7172, + "step": 23131 + }, + { + "epoch": 7.100061387354205, + "grad_norm": 0.22386015951633453, + "learning_rate": 2.04838105563239e-05, + "loss": 1.7829, + "step": 23132 + }, + { + "epoch": 7.10036832412523, + "grad_norm": 0.22635474801063538, + "learning_rate": 2.047979863257195e-05, + "loss": 1.6956, + "step": 23133 + }, + { + "epoch": 7.100675260896256, + "grad_norm": 0.20508790016174316, + "learning_rate": 2.0475787000555924e-05, + "loss": 1.7404, + "step": 23134 + }, + { + "epoch": 7.100982197667281, + "grad_norm": 0.2055993378162384, + "learning_rate": 2.047177566031548e-05, + "loss": 1.7064, + "step": 23135 + }, + { + "epoch": 7.101289134438305, + "grad_norm": 0.19258326292037964, + "learning_rate": 2.0467764611890254e-05, + "loss": 1.7078, + "step": 23136 + }, + { + "epoch": 7.101596071209331, + "grad_norm": 0.20766718685626984, + "learning_rate": 2.046375385531989e-05, + "loss": 1.6854, + "step": 23137 + }, + { + "epoch": 7.101903007980356, + "grad_norm": 0.17945602536201477, + "learning_rate": 
2.045974339064402e-05, + "loss": 1.6986, + "step": 23138 + }, + { + "epoch": 7.102209944751381, + "grad_norm": 0.17283397912979126, + "learning_rate": 2.045573321790228e-05, + "loss": 1.7296, + "step": 23139 + }, + { + "epoch": 7.102516881522407, + "grad_norm": 0.19000805914402008, + "learning_rate": 2.0451723337134298e-05, + "loss": 1.7005, + "step": 23140 + }, + { + "epoch": 7.102823818293431, + "grad_norm": 0.1966131180524826, + "learning_rate": 2.044771374837971e-05, + "loss": 1.7574, + "step": 23141 + }, + { + "epoch": 7.1031307550644565, + "grad_norm": 0.2411719709634781, + "learning_rate": 2.0443704451678137e-05, + "loss": 1.7599, + "step": 23142 + }, + { + "epoch": 7.103437691835482, + "grad_norm": 0.23902751505374908, + "learning_rate": 2.0439695447069173e-05, + "loss": 1.6805, + "step": 23143 + }, + { + "epoch": 7.103744628606507, + "grad_norm": 0.19117529690265656, + "learning_rate": 2.0435686734592508e-05, + "loss": 1.7482, + "step": 23144 + }, + { + "epoch": 7.1040515653775325, + "grad_norm": 0.18491674959659576, + "learning_rate": 2.0431678314287678e-05, + "loss": 1.6764, + "step": 23145 + }, + { + "epoch": 7.104358502148558, + "grad_norm": 0.21000699698925018, + "learning_rate": 2.042767018619437e-05, + "loss": 1.7185, + "step": 23146 + }, + { + "epoch": 7.104665438919582, + "grad_norm": 0.17373491823673248, + "learning_rate": 2.0423662350352117e-05, + "loss": 1.6945, + "step": 23147 + }, + { + "epoch": 7.104972375690608, + "grad_norm": 0.18387937545776367, + "learning_rate": 2.041965480680059e-05, + "loss": 1.766, + "step": 23148 + }, + { + "epoch": 7.105279312461633, + "grad_norm": 0.15976013243198395, + "learning_rate": 2.0415647555579376e-05, + "loss": 1.6446, + "step": 23149 + }, + { + "epoch": 7.105586249232658, + "grad_norm": 0.19251346588134766, + "learning_rate": 2.0411640596728066e-05, + "loss": 1.7122, + "step": 23150 + }, + { + "epoch": 7.105893186003684, + "grad_norm": 0.1640147864818573, + "learning_rate": 2.040763393028627e-05, + 
"loss": 1.7057, + "step": 23151 + }, + { + "epoch": 7.106200122774708, + "grad_norm": 0.20366166532039642, + "learning_rate": 2.0403627556293577e-05, + "loss": 1.7173, + "step": 23152 + }, + { + "epoch": 7.106507059545733, + "grad_norm": 0.18549348413944244, + "learning_rate": 2.039962147478958e-05, + "loss": 1.7215, + "step": 23153 + }, + { + "epoch": 7.106813996316759, + "grad_norm": 0.16964925825595856, + "learning_rate": 2.039561568581388e-05, + "loss": 1.6931, + "step": 23154 + }, + { + "epoch": 7.107120933087784, + "grad_norm": 0.16923274099826813, + "learning_rate": 2.0391610189406058e-05, + "loss": 1.6976, + "step": 23155 + }, + { + "epoch": 7.107427869858809, + "grad_norm": 0.17707234621047974, + "learning_rate": 2.038760498560569e-05, + "loss": 1.7102, + "step": 23156 + }, + { + "epoch": 7.107734806629834, + "grad_norm": 0.2048260122537613, + "learning_rate": 2.0383600074452376e-05, + "loss": 1.7116, + "step": 23157 + }, + { + "epoch": 7.108041743400859, + "grad_norm": 0.17328095436096191, + "learning_rate": 2.037959545598568e-05, + "loss": 1.6683, + "step": 23158 + }, + { + "epoch": 7.1083486801718845, + "grad_norm": 0.15829013288021088, + "learning_rate": 2.037559113024518e-05, + "loss": 1.6617, + "step": 23159 + }, + { + "epoch": 7.10865561694291, + "grad_norm": 0.21150968968868256, + "learning_rate": 2.037158709727044e-05, + "loss": 1.7057, + "step": 23160 + }, + { + "epoch": 7.108962553713935, + "grad_norm": 0.20321892201900482, + "learning_rate": 2.0367583357101072e-05, + "loss": 1.6811, + "step": 23161 + }, + { + "epoch": 7.1092694904849605, + "grad_norm": 0.19491781294345856, + "learning_rate": 2.0363579909776583e-05, + "loss": 1.6794, + "step": 23162 + }, + { + "epoch": 7.109576427255985, + "grad_norm": 0.155877947807312, + "learning_rate": 2.0359576755336594e-05, + "loss": 1.7434, + "step": 23163 + }, + { + "epoch": 7.10988336402701, + "grad_norm": 0.17822639644145966, + "learning_rate": 2.0355573893820613e-05, + "loss": 1.7029, + "step": 23164 
+ }, + { + "epoch": 7.110190300798036, + "grad_norm": 0.18152910470962524, + "learning_rate": 2.0351571325268242e-05, + "loss": 1.7277, + "step": 23165 + }, + { + "epoch": 7.110497237569061, + "grad_norm": 0.19928498566150665, + "learning_rate": 2.034756904971902e-05, + "loss": 1.7852, + "step": 23166 + }, + { + "epoch": 7.110804174340086, + "grad_norm": 0.19099318981170654, + "learning_rate": 2.0343567067212504e-05, + "loss": 1.7258, + "step": 23167 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.19800841808319092, + "learning_rate": 2.033956537778824e-05, + "loss": 1.7647, + "step": 23168 + }, + { + "epoch": 7.111418047882136, + "grad_norm": 0.20110327005386353, + "learning_rate": 2.0335563981485768e-05, + "loss": 1.7111, + "step": 23169 + }, + { + "epoch": 7.111724984653161, + "grad_norm": 0.1875200718641281, + "learning_rate": 2.0331562878344645e-05, + "loss": 1.7145, + "step": 23170 + }, + { + "epoch": 7.112031921424187, + "grad_norm": 0.17586658895015717, + "learning_rate": 2.032756206840441e-05, + "loss": 1.663, + "step": 23171 + }, + { + "epoch": 7.112338858195212, + "grad_norm": 0.1783432811498642, + "learning_rate": 2.032356155170459e-05, + "loss": 1.7146, + "step": 23172 + }, + { + "epoch": 7.112645794966237, + "grad_norm": 0.16075368225574493, + "learning_rate": 2.0319561328284737e-05, + "loss": 1.6414, + "step": 23173 + }, + { + "epoch": 7.112952731737262, + "grad_norm": 0.22822627425193787, + "learning_rate": 2.0315561398184367e-05, + "loss": 1.7363, + "step": 23174 + }, + { + "epoch": 7.113259668508287, + "grad_norm": 0.1882331818342209, + "learning_rate": 2.0311561761443026e-05, + "loss": 1.7384, + "step": 23175 + }, + { + "epoch": 7.1135666052793125, + "grad_norm": 0.21478623151779175, + "learning_rate": 2.0307562418100228e-05, + "loss": 1.7314, + "step": 23176 + }, + { + "epoch": 7.113873542050338, + "grad_norm": 0.18545235693454742, + "learning_rate": 2.0303563368195483e-05, + "loss": 1.7046, + "step": 23177 + }, + { + "epoch": 
7.114180478821363, + "grad_norm": 0.1965286284685135, + "learning_rate": 2.0299564611768367e-05, + "loss": 1.7423, + "step": 23178 + }, + { + "epoch": 7.114487415592388, + "grad_norm": 0.1679733693599701, + "learning_rate": 2.0295566148858332e-05, + "loss": 1.6861, + "step": 23179 + }, + { + "epoch": 7.114794352363413, + "grad_norm": 0.18930186331272125, + "learning_rate": 2.029156797950495e-05, + "loss": 1.6609, + "step": 23180 + }, + { + "epoch": 7.115101289134438, + "grad_norm": 0.20774266123771667, + "learning_rate": 2.0287570103747672e-05, + "loss": 1.6919, + "step": 23181 + }, + { + "epoch": 7.115408225905464, + "grad_norm": 0.1866706907749176, + "learning_rate": 2.028357252162606e-05, + "loss": 1.7385, + "step": 23182 + }, + { + "epoch": 7.115715162676489, + "grad_norm": 0.21728016436100006, + "learning_rate": 2.0279575233179605e-05, + "loss": 1.7574, + "step": 23183 + }, + { + "epoch": 7.116022099447513, + "grad_norm": 0.16665934026241302, + "learning_rate": 2.02755782384478e-05, + "loss": 1.7046, + "step": 23184 + }, + { + "epoch": 7.116329036218539, + "grad_norm": 0.17275744676589966, + "learning_rate": 2.027158153747016e-05, + "loss": 1.6914, + "step": 23185 + }, + { + "epoch": 7.116635972989564, + "grad_norm": 0.15803802013397217, + "learning_rate": 2.026758513028617e-05, + "loss": 1.6932, + "step": 23186 + }, + { + "epoch": 7.116942909760589, + "grad_norm": 0.17434535920619965, + "learning_rate": 2.0263589016935336e-05, + "loss": 1.6714, + "step": 23187 + }, + { + "epoch": 7.117249846531615, + "grad_norm": 0.18005578219890594, + "learning_rate": 2.025959319745714e-05, + "loss": 1.6728, + "step": 23188 + }, + { + "epoch": 7.11755678330264, + "grad_norm": 0.19545695185661316, + "learning_rate": 2.025559767189108e-05, + "loss": 1.7475, + "step": 23189 + }, + { + "epoch": 7.1178637200736645, + "grad_norm": 0.19226810336112976, + "learning_rate": 2.025160244027663e-05, + "loss": 1.7447, + "step": 23190 + }, + { + "epoch": 7.11817065684469, + "grad_norm": 
0.1682211458683014, + "learning_rate": 2.0247607502653286e-05, + "loss": 1.687, + "step": 23191 + }, + { + "epoch": 7.118477593615715, + "grad_norm": 0.1883849948644638, + "learning_rate": 2.0243612859060524e-05, + "loss": 1.7556, + "step": 23192 + }, + { + "epoch": 7.1187845303867405, + "grad_norm": 0.16668641567230225, + "learning_rate": 2.0239618509537817e-05, + "loss": 1.6683, + "step": 23193 + }, + { + "epoch": 7.119091467157766, + "grad_norm": 0.21448664367198944, + "learning_rate": 2.023562445412463e-05, + "loss": 1.709, + "step": 23194 + }, + { + "epoch": 7.11939840392879, + "grad_norm": 0.24347564578056335, + "learning_rate": 2.0231630692860476e-05, + "loss": 1.7775, + "step": 23195 + }, + { + "epoch": 7.119705340699816, + "grad_norm": 0.20289309322834015, + "learning_rate": 2.0227637225784767e-05, + "loss": 1.8258, + "step": 23196 + }, + { + "epoch": 7.120012277470841, + "grad_norm": 0.20075447857379913, + "learning_rate": 2.022364405293703e-05, + "loss": 1.686, + "step": 23197 + }, + { + "epoch": 7.120319214241866, + "grad_norm": 0.17129302024841309, + "learning_rate": 2.021965117435666e-05, + "loss": 1.6937, + "step": 23198 + }, + { + "epoch": 7.120626151012892, + "grad_norm": 0.222218856215477, + "learning_rate": 2.0215658590083164e-05, + "loss": 1.6812, + "step": 23199 + }, + { + "epoch": 7.120933087783916, + "grad_norm": 0.1955309957265854, + "learning_rate": 2.0211666300155996e-05, + "loss": 1.7652, + "step": 23200 + }, + { + "epoch": 7.121240024554941, + "grad_norm": 0.20479047298431396, + "learning_rate": 2.0207674304614595e-05, + "loss": 1.7393, + "step": 23201 + }, + { + "epoch": 7.121546961325967, + "grad_norm": 0.14726878702640533, + "learning_rate": 2.020368260349842e-05, + "loss": 1.6766, + "step": 23202 + }, + { + "epoch": 7.121853898096992, + "grad_norm": 0.19149260222911835, + "learning_rate": 2.0199691196846914e-05, + "loss": 1.7176, + "step": 23203 + }, + { + "epoch": 7.122160834868017, + "grad_norm": 0.17182055115699768, + 
"learning_rate": 2.019570008469953e-05, + "loss": 1.6828, + "step": 23204 + }, + { + "epoch": 7.122467771639043, + "grad_norm": 0.16044408082962036, + "learning_rate": 2.019170926709571e-05, + "loss": 1.6595, + "step": 23205 + }, + { + "epoch": 7.122774708410067, + "grad_norm": 0.21787980198860168, + "learning_rate": 2.0187718744074885e-05, + "loss": 1.7114, + "step": 23206 + }, + { + "epoch": 7.1230816451810925, + "grad_norm": 0.16959737241268158, + "learning_rate": 2.01837285156765e-05, + "loss": 1.7128, + "step": 23207 + }, + { + "epoch": 7.123388581952118, + "grad_norm": 0.28120318055152893, + "learning_rate": 2.0179738581939983e-05, + "loss": 1.8386, + "step": 23208 + }, + { + "epoch": 7.123695518723143, + "grad_norm": 0.19752691686153412, + "learning_rate": 2.017574894290477e-05, + "loss": 1.7123, + "step": 23209 + }, + { + "epoch": 7.1240024554941686, + "grad_norm": 0.19860398769378662, + "learning_rate": 2.0171759598610286e-05, + "loss": 1.7041, + "step": 23210 + }, + { + "epoch": 7.124309392265193, + "grad_norm": 0.17429523169994354, + "learning_rate": 2.0167770549095937e-05, + "loss": 1.6963, + "step": 23211 + }, + { + "epoch": 7.124616329036218, + "grad_norm": 0.27635815739631653, + "learning_rate": 2.01637817944012e-05, + "loss": 1.8261, + "step": 23212 + }, + { + "epoch": 7.124923265807244, + "grad_norm": 0.17512556910514832, + "learning_rate": 2.0159793334565424e-05, + "loss": 1.7311, + "step": 23213 + }, + { + "epoch": 7.125230202578269, + "grad_norm": 0.1964988112449646, + "learning_rate": 2.01558051696281e-05, + "loss": 1.6829, + "step": 23214 + }, + { + "epoch": 7.125537139349294, + "grad_norm": 0.20796819031238556, + "learning_rate": 2.0151817299628563e-05, + "loss": 1.7084, + "step": 23215 + }, + { + "epoch": 7.12584407612032, + "grad_norm": 0.19875051081180573, + "learning_rate": 2.0147829724606278e-05, + "loss": 1.7197, + "step": 23216 + }, + { + "epoch": 7.126151012891344, + "grad_norm": 0.22590650618076324, + "learning_rate": 
2.0143842444600635e-05, + "loss": 1.7923, + "step": 23217 + }, + { + "epoch": 7.1264579496623695, + "grad_norm": 0.19106422364711761, + "learning_rate": 2.0139855459651042e-05, + "loss": 1.7096, + "step": 23218 + }, + { + "epoch": 7.126764886433395, + "grad_norm": 0.2105991542339325, + "learning_rate": 2.01358687697969e-05, + "loss": 1.6836, + "step": 23219 + }, + { + "epoch": 7.12707182320442, + "grad_norm": 0.18826960027217865, + "learning_rate": 2.013188237507761e-05, + "loss": 1.7347, + "step": 23220 + }, + { + "epoch": 7.1273787599754455, + "grad_norm": 0.1865578591823578, + "learning_rate": 2.012789627553256e-05, + "loss": 1.7115, + "step": 23221 + }, + { + "epoch": 7.12768569674647, + "grad_norm": 0.18389549851417542, + "learning_rate": 2.0123910471201145e-05, + "loss": 1.6817, + "step": 23222 + }, + { + "epoch": 7.127992633517495, + "grad_norm": 0.18351595103740692, + "learning_rate": 2.0119924962122766e-05, + "loss": 1.6898, + "step": 23223 + }, + { + "epoch": 7.128299570288521, + "grad_norm": 0.1913219839334488, + "learning_rate": 2.01159397483368e-05, + "loss": 1.7536, + "step": 23224 + }, + { + "epoch": 7.128606507059546, + "grad_norm": 0.17707225680351257, + "learning_rate": 2.0111954829882628e-05, + "loss": 1.6894, + "step": 23225 + }, + { + "epoch": 7.128913443830571, + "grad_norm": 0.17774651944637299, + "learning_rate": 2.0107970206799637e-05, + "loss": 1.6599, + "step": 23226 + }, + { + "epoch": 7.129220380601596, + "grad_norm": 0.14530350267887115, + "learning_rate": 2.0103985879127207e-05, + "loss": 1.6264, + "step": 23227 + }, + { + "epoch": 7.129527317372621, + "grad_norm": 0.15673531591892242, + "learning_rate": 2.010000184690471e-05, + "loss": 1.6577, + "step": 23228 + }, + { + "epoch": 7.129834254143646, + "grad_norm": 0.20691752433776855, + "learning_rate": 2.009601811017152e-05, + "loss": 1.7129, + "step": 23229 + }, + { + "epoch": 7.130141190914672, + "grad_norm": 0.16686022281646729, + "learning_rate": 2.0092034668966987e-05, + "loss": 
1.6738, + "step": 23230 + }, + { + "epoch": 7.130448127685697, + "grad_norm": 0.17799030244350433, + "learning_rate": 2.0088051523330536e-05, + "loss": 1.7312, + "step": 23231 + }, + { + "epoch": 7.1307550644567215, + "grad_norm": 0.16749511659145355, + "learning_rate": 2.0084068673301454e-05, + "loss": 1.6616, + "step": 23232 + }, + { + "epoch": 7.131062001227747, + "grad_norm": 0.18347670137882233, + "learning_rate": 2.0080086118919156e-05, + "loss": 1.6622, + "step": 23233 + }, + { + "epoch": 7.131368937998772, + "grad_norm": 0.19747060537338257, + "learning_rate": 2.007610386022299e-05, + "loss": 1.7341, + "step": 23234 + }, + { + "epoch": 7.1316758747697975, + "grad_norm": 0.21067634224891663, + "learning_rate": 2.0072121897252295e-05, + "loss": 1.7252, + "step": 23235 + }, + { + "epoch": 7.131982811540823, + "grad_norm": 0.2095600962638855, + "learning_rate": 2.006814023004644e-05, + "loss": 1.7769, + "step": 23236 + }, + { + "epoch": 7.132289748311848, + "grad_norm": 0.23090791702270508, + "learning_rate": 2.0064158858644765e-05, + "loss": 1.7734, + "step": 23237 + }, + { + "epoch": 7.132596685082873, + "grad_norm": 0.19060610234737396, + "learning_rate": 2.0060177783086614e-05, + "loss": 1.7209, + "step": 23238 + }, + { + "epoch": 7.132903621853898, + "grad_norm": 0.18050087988376617, + "learning_rate": 2.0056197003411342e-05, + "loss": 1.6882, + "step": 23239 + }, + { + "epoch": 7.133210558624923, + "grad_norm": 0.1504158228635788, + "learning_rate": 2.005221651965828e-05, + "loss": 1.687, + "step": 23240 + }, + { + "epoch": 7.133517495395949, + "grad_norm": 0.22980810701847076, + "learning_rate": 2.004823633186676e-05, + "loss": 1.7254, + "step": 23241 + }, + { + "epoch": 7.133824432166974, + "grad_norm": 0.20092199742794037, + "learning_rate": 2.004425644007613e-05, + "loss": 1.7234, + "step": 23242 + }, + { + "epoch": 7.134131368937998, + "grad_norm": 0.21002927422523499, + "learning_rate": 2.0040276844325718e-05, + "loss": 1.7272, + "step": 23243 + }, 
+ { + "epoch": 7.134438305709024, + "grad_norm": 0.18524625897407532, + "learning_rate": 2.003629754465484e-05, + "loss": 1.7189, + "step": 23244 + }, + { + "epoch": 7.134745242480049, + "grad_norm": 0.21095192432403564, + "learning_rate": 2.0032318541102845e-05, + "loss": 1.7177, + "step": 23245 + }, + { + "epoch": 7.135052179251074, + "grad_norm": 0.1700662076473236, + "learning_rate": 2.0028339833709037e-05, + "loss": 1.6925, + "step": 23246 + }, + { + "epoch": 7.1353591160221, + "grad_norm": 0.2123938947916031, + "learning_rate": 2.002436142251272e-05, + "loss": 1.7623, + "step": 23247 + }, + { + "epoch": 7.135666052793125, + "grad_norm": 0.194299578666687, + "learning_rate": 2.0020383307553275e-05, + "loss": 1.6898, + "step": 23248 + }, + { + "epoch": 7.1359729895641495, + "grad_norm": 0.18740688264369965, + "learning_rate": 2.001640548886993e-05, + "loss": 1.6519, + "step": 23249 + }, + { + "epoch": 7.136279926335175, + "grad_norm": 0.18891027569770813, + "learning_rate": 2.0012427966502085e-05, + "loss": 1.6895, + "step": 23250 + }, + { + "epoch": 7.1365868631062, + "grad_norm": 0.21313735842704773, + "learning_rate": 2.000845074048896e-05, + "loss": 1.6829, + "step": 23251 + }, + { + "epoch": 7.1368937998772255, + "grad_norm": 0.2438332885503769, + "learning_rate": 2.0004473810869923e-05, + "loss": 1.7723, + "step": 23252 + }, + { + "epoch": 7.137200736648251, + "grad_norm": 0.24475115537643433, + "learning_rate": 2.0000497177684257e-05, + "loss": 1.7192, + "step": 23253 + }, + { + "epoch": 7.137507673419275, + "grad_norm": 0.1936563402414322, + "learning_rate": 1.9996520840971267e-05, + "loss": 1.7462, + "step": 23254 + }, + { + "epoch": 7.137814610190301, + "grad_norm": 0.22365616261959076, + "learning_rate": 1.9992544800770236e-05, + "loss": 1.7405, + "step": 23255 + }, + { + "epoch": 7.138121546961326, + "grad_norm": 0.191316619515419, + "learning_rate": 1.9988569057120472e-05, + "loss": 1.6466, + "step": 23256 + }, + { + "epoch": 7.138428483732351, + 
"grad_norm": 0.24758055806159973, + "learning_rate": 1.9984593610061253e-05, + "loss": 1.7689, + "step": 23257 + }, + { + "epoch": 7.138735420503377, + "grad_norm": 0.2144414782524109, + "learning_rate": 1.9980618459631874e-05, + "loss": 1.7158, + "step": 23258 + }, + { + "epoch": 7.139042357274401, + "grad_norm": 0.24254034459590912, + "learning_rate": 1.9976643605871614e-05, + "loss": 1.7998, + "step": 23259 + }, + { + "epoch": 7.139349294045426, + "grad_norm": 0.21013480424880981, + "learning_rate": 1.9972669048819765e-05, + "loss": 1.7231, + "step": 23260 + }, + { + "epoch": 7.139656230816452, + "grad_norm": 0.2169421911239624, + "learning_rate": 1.9968694788515603e-05, + "loss": 1.7182, + "step": 23261 + }, + { + "epoch": 7.139963167587477, + "grad_norm": 0.19591476023197174, + "learning_rate": 1.9964720824998395e-05, + "loss": 1.7114, + "step": 23262 + }, + { + "epoch": 7.140270104358502, + "grad_norm": 0.1775221824645996, + "learning_rate": 1.9960747158307417e-05, + "loss": 1.6754, + "step": 23263 + }, + { + "epoch": 7.140577041129528, + "grad_norm": 0.19318300485610962, + "learning_rate": 1.995677378848193e-05, + "loss": 1.6794, + "step": 23264 + }, + { + "epoch": 7.140883977900552, + "grad_norm": 0.19659662246704102, + "learning_rate": 1.995280071556125e-05, + "loss": 1.703, + "step": 23265 + }, + { + "epoch": 7.1411909146715775, + "grad_norm": 0.22100697457790375, + "learning_rate": 1.994882793958457e-05, + "loss": 1.6821, + "step": 23266 + }, + { + "epoch": 7.141497851442603, + "grad_norm": 0.20475365221500397, + "learning_rate": 1.9944855460591217e-05, + "loss": 1.727, + "step": 23267 + }, + { + "epoch": 7.141804788213628, + "grad_norm": 0.2202025055885315, + "learning_rate": 1.9940883278620383e-05, + "loss": 1.7248, + "step": 23268 + }, + { + "epoch": 7.1421117249846535, + "grad_norm": 0.1800462007522583, + "learning_rate": 1.993691139371138e-05, + "loss": 1.7276, + "step": 23269 + }, + { + "epoch": 7.142418661755678, + "grad_norm": 0.2896895110607147, 
+ "learning_rate": 1.9932939805903433e-05, + "loss": 1.7275, + "step": 23270 + }, + { + "epoch": 7.142725598526703, + "grad_norm": 0.21308782696723938, + "learning_rate": 1.99289685152358e-05, + "loss": 1.6645, + "step": 23271 + }, + { + "epoch": 7.143032535297729, + "grad_norm": 0.20210005342960358, + "learning_rate": 1.992499752174773e-05, + "loss": 1.6899, + "step": 23272 + }, + { + "epoch": 7.143339472068754, + "grad_norm": 0.18419797718524933, + "learning_rate": 1.9921026825478455e-05, + "loss": 1.7088, + "step": 23273 + }, + { + "epoch": 7.143646408839779, + "grad_norm": 0.19155149161815643, + "learning_rate": 1.9917056426467227e-05, + "loss": 1.719, + "step": 23274 + }, + { + "epoch": 7.143953345610804, + "grad_norm": 0.17220313847064972, + "learning_rate": 1.9913086324753278e-05, + "loss": 1.7408, + "step": 23275 + }, + { + "epoch": 7.144260282381829, + "grad_norm": 0.18474969267845154, + "learning_rate": 1.990911652037585e-05, + "loss": 1.7189, + "step": 23276 + }, + { + "epoch": 7.144567219152854, + "grad_norm": 0.18529154360294342, + "learning_rate": 1.9905147013374165e-05, + "loss": 1.7075, + "step": 23277 + }, + { + "epoch": 7.14487415592388, + "grad_norm": 0.18569569289684296, + "learning_rate": 1.9901177803787452e-05, + "loss": 1.7116, + "step": 23278 + }, + { + "epoch": 7.145181092694905, + "grad_norm": 0.17149175703525543, + "learning_rate": 1.9897208891654946e-05, + "loss": 1.6873, + "step": 23279 + }, + { + "epoch": 7.14548802946593, + "grad_norm": 0.18012240529060364, + "learning_rate": 1.9893240277015868e-05, + "loss": 1.709, + "step": 23280 + }, + { + "epoch": 7.145794966236955, + "grad_norm": 0.18372172117233276, + "learning_rate": 1.9889271959909412e-05, + "loss": 1.7134, + "step": 23281 + }, + { + "epoch": 7.14610190300798, + "grad_norm": 0.20667128264904022, + "learning_rate": 1.9885303940374856e-05, + "loss": 1.7452, + "step": 23282 + }, + { + "epoch": 7.1464088397790055, + "grad_norm": 0.18145184218883514, + "learning_rate": 
1.9881336218451346e-05, + "loss": 1.7358, + "step": 23283 + }, + { + "epoch": 7.146715776550031, + "grad_norm": 0.179911807179451, + "learning_rate": 1.987736879417816e-05, + "loss": 1.6698, + "step": 23284 + }, + { + "epoch": 7.147022713321056, + "grad_norm": 0.18944865465164185, + "learning_rate": 1.9873401667594426e-05, + "loss": 1.7725, + "step": 23285 + }, + { + "epoch": 7.147329650092081, + "grad_norm": 0.1926117241382599, + "learning_rate": 1.986943483873942e-05, + "loss": 1.7829, + "step": 23286 + }, + { + "epoch": 7.147636586863106, + "grad_norm": 0.330503910779953, + "learning_rate": 1.9865468307652318e-05, + "loss": 1.7408, + "step": 23287 + }, + { + "epoch": 7.147943523634131, + "grad_norm": 0.22677597403526306, + "learning_rate": 1.9861502074372324e-05, + "loss": 1.7013, + "step": 23288 + }, + { + "epoch": 7.148250460405157, + "grad_norm": 0.1859201192855835, + "learning_rate": 1.9857536138938627e-05, + "loss": 1.7215, + "step": 23289 + }, + { + "epoch": 7.148557397176182, + "grad_norm": 0.22151269018650055, + "learning_rate": 1.9853570501390427e-05, + "loss": 1.6781, + "step": 23290 + }, + { + "epoch": 7.148864333947207, + "grad_norm": 0.16455405950546265, + "learning_rate": 1.984960516176691e-05, + "loss": 1.6518, + "step": 23291 + }, + { + "epoch": 7.149171270718232, + "grad_norm": 0.19687162339687347, + "learning_rate": 1.9845640120107267e-05, + "loss": 1.7375, + "step": 23292 + }, + { + "epoch": 7.149478207489257, + "grad_norm": 0.19174890220165253, + "learning_rate": 1.9841675376450686e-05, + "loss": 1.7017, + "step": 23293 + }, + { + "epoch": 7.149785144260282, + "grad_norm": 0.18458877503871918, + "learning_rate": 1.983771093083634e-05, + "loss": 1.7256, + "step": 23294 + }, + { + "epoch": 7.150092081031308, + "grad_norm": 0.212035670876503, + "learning_rate": 1.983374678330342e-05, + "loss": 1.698, + "step": 23295 + }, + { + "epoch": 7.150399017802333, + "grad_norm": 0.1793123185634613, + "learning_rate": 1.982978293389109e-05, + "loss": 
1.7012, + "step": 23296 + }, + { + "epoch": 7.150705954573358, + "grad_norm": 0.2359405905008316, + "learning_rate": 1.9825819382638526e-05, + "loss": 1.7423, + "step": 23297 + }, + { + "epoch": 7.151012891344383, + "grad_norm": 0.17125526070594788, + "learning_rate": 1.9821856129584888e-05, + "loss": 1.6825, + "step": 23298 + }, + { + "epoch": 7.151319828115408, + "grad_norm": 0.2084828019142151, + "learning_rate": 1.9817893174769392e-05, + "loss": 1.6991, + "step": 23299 + }, + { + "epoch": 7.151626764886434, + "grad_norm": 0.27647483348846436, + "learning_rate": 1.9813930518231127e-05, + "loss": 1.7425, + "step": 23300 + }, + { + "epoch": 7.151933701657459, + "grad_norm": 0.23517926037311554, + "learning_rate": 1.980996816000933e-05, + "loss": 1.8411, + "step": 23301 + }, + { + "epoch": 7.152240638428483, + "grad_norm": 0.19960010051727295, + "learning_rate": 1.980600610014309e-05, + "loss": 1.7302, + "step": 23302 + }, + { + "epoch": 7.152547575199509, + "grad_norm": 0.18953165411949158, + "learning_rate": 1.9802044338671604e-05, + "loss": 1.7252, + "step": 23303 + }, + { + "epoch": 7.152854511970534, + "grad_norm": 0.1718905121088028, + "learning_rate": 1.979808287563402e-05, + "loss": 1.656, + "step": 23304 + }, + { + "epoch": 7.153161448741559, + "grad_norm": 0.17233465611934662, + "learning_rate": 1.9794121711069487e-05, + "loss": 1.6732, + "step": 23305 + }, + { + "epoch": 7.153468385512585, + "grad_norm": 0.17677003145217896, + "learning_rate": 1.979016084501714e-05, + "loss": 1.7266, + "step": 23306 + }, + { + "epoch": 7.153775322283609, + "grad_norm": 0.1815326064825058, + "learning_rate": 1.9786200277516136e-05, + "loss": 1.7029, + "step": 23307 + }, + { + "epoch": 7.1540822590546345, + "grad_norm": 0.20937341451644897, + "learning_rate": 1.978224000860561e-05, + "loss": 1.711, + "step": 23308 + }, + { + "epoch": 7.15438919582566, + "grad_norm": 0.2045155018568039, + "learning_rate": 1.97782800383247e-05, + "loss": 1.7557, + "step": 23309 + }, + { + 
"epoch": 7.154696132596685, + "grad_norm": 0.16426041722297668, + "learning_rate": 1.9774320366712533e-05, + "loss": 1.7373, + "step": 23310 + }, + { + "epoch": 7.1550030693677105, + "grad_norm": 0.18058224022388458, + "learning_rate": 1.977036099380825e-05, + "loss": 1.6957, + "step": 23311 + }, + { + "epoch": 7.155310006138736, + "grad_norm": 0.23552078008651733, + "learning_rate": 1.9766401919650983e-05, + "loss": 1.8032, + "step": 23312 + }, + { + "epoch": 7.15561694290976, + "grad_norm": 0.19097596406936646, + "learning_rate": 1.9762443144279852e-05, + "loss": 1.7447, + "step": 23313 + }, + { + "epoch": 7.155923879680786, + "grad_norm": 0.17892403900623322, + "learning_rate": 1.975848466773398e-05, + "loss": 1.7117, + "step": 23314 + }, + { + "epoch": 7.156230816451811, + "grad_norm": 0.18331217765808105, + "learning_rate": 1.9754526490052467e-05, + "loss": 1.6669, + "step": 23315 + }, + { + "epoch": 7.156537753222836, + "grad_norm": 0.19914311170578003, + "learning_rate": 1.975056861127449e-05, + "loss": 1.6731, + "step": 23316 + }, + { + "epoch": 7.156844689993862, + "grad_norm": 0.21710485219955444, + "learning_rate": 1.9746611031439083e-05, + "loss": 1.7214, + "step": 23317 + }, + { + "epoch": 7.157151626764886, + "grad_norm": 0.19703111052513123, + "learning_rate": 1.9742653750585437e-05, + "loss": 1.7185, + "step": 23318 + }, + { + "epoch": 7.157458563535911, + "grad_norm": 0.18581365048885345, + "learning_rate": 1.9738696768752585e-05, + "loss": 1.7113, + "step": 23319 + }, + { + "epoch": 7.157765500306937, + "grad_norm": 0.1703677624464035, + "learning_rate": 1.9734740085979687e-05, + "loss": 1.6755, + "step": 23320 + }, + { + "epoch": 7.158072437077962, + "grad_norm": 0.16760937869548798, + "learning_rate": 1.9730783702305826e-05, + "loss": 1.7082, + "step": 23321 + }, + { + "epoch": 7.158379373848987, + "grad_norm": 0.20183983445167542, + "learning_rate": 1.97268276177701e-05, + "loss": 1.7503, + "step": 23322 + }, + { + "epoch": 7.158686310620013, + 
"grad_norm": 0.18407952785491943, + "learning_rate": 1.972287183241163e-05, + "loss": 1.6807, + "step": 23323 + }, + { + "epoch": 7.158993247391037, + "grad_norm": 0.20135276019573212, + "learning_rate": 1.9718916346269446e-05, + "loss": 1.8001, + "step": 23324 + }, + { + "epoch": 7.1593001841620625, + "grad_norm": 0.1781267672777176, + "learning_rate": 1.9714961159382693e-05, + "loss": 1.683, + "step": 23325 + }, + { + "epoch": 7.159607120933088, + "grad_norm": 0.24990373849868774, + "learning_rate": 1.971100627179045e-05, + "loss": 1.7235, + "step": 23326 + }, + { + "epoch": 7.159914057704113, + "grad_norm": 0.19463174045085907, + "learning_rate": 1.9707051683531796e-05, + "loss": 1.735, + "step": 23327 + }, + { + "epoch": 7.1602209944751385, + "grad_norm": 0.1988895982503891, + "learning_rate": 1.9703097394645813e-05, + "loss": 1.7495, + "step": 23328 + }, + { + "epoch": 7.160527931246163, + "grad_norm": 0.1760931760072708, + "learning_rate": 1.9699143405171576e-05, + "loss": 1.6914, + "step": 23329 + }, + { + "epoch": 7.160834868017188, + "grad_norm": 0.18537557125091553, + "learning_rate": 1.9695189715148166e-05, + "loss": 1.7601, + "step": 23330 + }, + { + "epoch": 7.161141804788214, + "grad_norm": 0.2476375252008438, + "learning_rate": 1.9691236324614654e-05, + "loss": 1.8218, + "step": 23331 + }, + { + "epoch": 7.161448741559239, + "grad_norm": 0.17736093699932098, + "learning_rate": 1.968728323361009e-05, + "loss": 1.6872, + "step": 23332 + }, + { + "epoch": 7.161755678330264, + "grad_norm": 0.1851162612438202, + "learning_rate": 1.9683330442173598e-05, + "loss": 1.712, + "step": 23333 + }, + { + "epoch": 7.162062615101289, + "grad_norm": 0.20326650142669678, + "learning_rate": 1.967937795034417e-05, + "loss": 1.7668, + "step": 23334 + }, + { + "epoch": 7.162369551872314, + "grad_norm": 0.21020451188087463, + "learning_rate": 1.9675425758160925e-05, + "loss": 1.7135, + "step": 23335 + }, + { + "epoch": 7.162676488643339, + "grad_norm": 0.21629111468791962, 
+ "learning_rate": 1.967147386566287e-05, + "loss": 1.7181, + "step": 23336 + }, + { + "epoch": 7.162983425414365, + "grad_norm": 0.18086732923984528, + "learning_rate": 1.9667522272889104e-05, + "loss": 1.7107, + "step": 23337 + }, + { + "epoch": 7.16329036218539, + "grad_norm": 0.16542381048202515, + "learning_rate": 1.9663570979878658e-05, + "loss": 1.7156, + "step": 23338 + }, + { + "epoch": 7.163597298956415, + "grad_norm": 0.18775032460689545, + "learning_rate": 1.9659619986670587e-05, + "loss": 1.6955, + "step": 23339 + }, + { + "epoch": 7.16390423572744, + "grad_norm": 0.19227592647075653, + "learning_rate": 1.9655669293303953e-05, + "loss": 1.7545, + "step": 23340 + }, + { + "epoch": 7.164211172498465, + "grad_norm": 0.1935085654258728, + "learning_rate": 1.9651718899817746e-05, + "loss": 1.7183, + "step": 23341 + }, + { + "epoch": 7.1645181092694905, + "grad_norm": 0.17873792350292206, + "learning_rate": 1.9647768806251056e-05, + "loss": 1.6644, + "step": 23342 + }, + { + "epoch": 7.164825046040516, + "grad_norm": 0.25024256110191345, + "learning_rate": 1.96438190126429e-05, + "loss": 1.7621, + "step": 23343 + }, + { + "epoch": 7.165131982811541, + "grad_norm": 0.15957331657409668, + "learning_rate": 1.9639869519032323e-05, + "loss": 1.6525, + "step": 23344 + }, + { + "epoch": 7.165438919582566, + "grad_norm": 0.19967027008533478, + "learning_rate": 1.9635920325458347e-05, + "loss": 1.7533, + "step": 23345 + }, + { + "epoch": 7.165745856353591, + "grad_norm": 0.17413713037967682, + "learning_rate": 1.9631971431960005e-05, + "loss": 1.6962, + "step": 23346 + }, + { + "epoch": 7.166052793124616, + "grad_norm": 0.19787384569644928, + "learning_rate": 1.9628022838576315e-05, + "loss": 1.7369, + "step": 23347 + }, + { + "epoch": 7.166359729895642, + "grad_norm": 0.1726577877998352, + "learning_rate": 1.962407454534631e-05, + "loss": 1.7004, + "step": 23348 + }, + { + "epoch": 7.166666666666667, + "grad_norm": 0.2136315256357193, + "learning_rate": 
1.962012655230899e-05, + "loss": 1.7411, + "step": 23349 + }, + { + "epoch": 7.166973603437691, + "grad_norm": 0.18257126212120056, + "learning_rate": 1.9616178859503414e-05, + "loss": 1.7155, + "step": 23350 + }, + { + "epoch": 7.167280540208717, + "grad_norm": 0.18696577847003937, + "learning_rate": 1.961223146696854e-05, + "loss": 1.7272, + "step": 23351 + }, + { + "epoch": 7.167587476979742, + "grad_norm": 0.16375793516635895, + "learning_rate": 1.9608284374743435e-05, + "loss": 1.6706, + "step": 23352 + }, + { + "epoch": 7.167894413750767, + "grad_norm": 0.19589200615882874, + "learning_rate": 1.960433758286704e-05, + "loss": 1.7018, + "step": 23353 + }, + { + "epoch": 7.168201350521793, + "grad_norm": 0.18434208631515503, + "learning_rate": 1.9600391091378417e-05, + "loss": 1.6776, + "step": 23354 + }, + { + "epoch": 7.168508287292818, + "grad_norm": 0.23839476704597473, + "learning_rate": 1.9596444900316545e-05, + "loss": 1.7501, + "step": 23355 + }, + { + "epoch": 7.1688152240638425, + "grad_norm": 0.20229686796665192, + "learning_rate": 1.9592499009720428e-05, + "loss": 1.7249, + "step": 23356 + }, + { + "epoch": 7.169122160834868, + "grad_norm": 0.2422642856836319, + "learning_rate": 1.9588553419629076e-05, + "loss": 1.7621, + "step": 23357 + }, + { + "epoch": 7.169429097605893, + "grad_norm": 0.21856555342674255, + "learning_rate": 1.9584608130081422e-05, + "loss": 1.7362, + "step": 23358 + }, + { + "epoch": 7.1697360343769185, + "grad_norm": 0.19434040784835815, + "learning_rate": 1.958066314111652e-05, + "loss": 1.6888, + "step": 23359 + }, + { + "epoch": 7.170042971147944, + "grad_norm": 0.19806630909442902, + "learning_rate": 1.9576718452773335e-05, + "loss": 1.7461, + "step": 23360 + }, + { + "epoch": 7.170349907918968, + "grad_norm": 0.19190531969070435, + "learning_rate": 1.957277406509085e-05, + "loss": 1.6992, + "step": 23361 + }, + { + "epoch": 7.170656844689994, + "grad_norm": 0.20990152657032013, + "learning_rate": 1.9568829978108044e-05, + 
"loss": 1.7095, + "step": 23362 + }, + { + "epoch": 7.170963781461019, + "grad_norm": 0.18638263642787933, + "learning_rate": 1.9564886191863897e-05, + "loss": 1.7024, + "step": 23363 + }, + { + "epoch": 7.171270718232044, + "grad_norm": 0.1974666863679886, + "learning_rate": 1.9560942706397383e-05, + "loss": 1.6901, + "step": 23364 + }, + { + "epoch": 7.17157765500307, + "grad_norm": 0.171469047665596, + "learning_rate": 1.955699952174747e-05, + "loss": 1.717, + "step": 23365 + }, + { + "epoch": 7.171884591774095, + "grad_norm": 0.17386725544929504, + "learning_rate": 1.955305663795312e-05, + "loss": 1.7069, + "step": 23366 + }, + { + "epoch": 7.172191528545119, + "grad_norm": 0.1869814246892929, + "learning_rate": 1.954911405505334e-05, + "loss": 1.7478, + "step": 23367 + }, + { + "epoch": 7.172498465316145, + "grad_norm": 0.19253556430339813, + "learning_rate": 1.9545171773087033e-05, + "loss": 1.7129, + "step": 23368 + }, + { + "epoch": 7.17280540208717, + "grad_norm": 0.1625998616218567, + "learning_rate": 1.954122979209322e-05, + "loss": 1.7055, + "step": 23369 + }, + { + "epoch": 7.173112338858195, + "grad_norm": 0.172325998544693, + "learning_rate": 1.953728811211079e-05, + "loss": 1.71, + "step": 23370 + }, + { + "epoch": 7.173419275629221, + "grad_norm": 0.22542965412139893, + "learning_rate": 1.9533346733178753e-05, + "loss": 1.7548, + "step": 23371 + }, + { + "epoch": 7.173726212400245, + "grad_norm": 0.1547299474477768, + "learning_rate": 1.9529405655336042e-05, + "loss": 1.6509, + "step": 23372 + }, + { + "epoch": 7.1740331491712706, + "grad_norm": 0.21720515191555023, + "learning_rate": 1.95254648786216e-05, + "loss": 1.7427, + "step": 23373 + }, + { + "epoch": 7.174340085942296, + "grad_norm": 0.18855944275856018, + "learning_rate": 1.95215244030744e-05, + "loss": 1.7471, + "step": 23374 + }, + { + "epoch": 7.174647022713321, + "grad_norm": 0.21088628470897675, + "learning_rate": 1.951758422873332e-05, + "loss": 1.7457, + "step": 23375 + }, + { + 
"epoch": 7.1749539594843466, + "grad_norm": 0.20596840977668762, + "learning_rate": 1.951364435563736e-05, + "loss": 1.7098, + "step": 23376 + }, + { + "epoch": 7.175260896255371, + "grad_norm": 0.20098064839839935, + "learning_rate": 1.9509704783825433e-05, + "loss": 1.7225, + "step": 23377 + }, + { + "epoch": 7.175567833026396, + "grad_norm": 0.20860125124454498, + "learning_rate": 1.950576551333647e-05, + "loss": 1.7071, + "step": 23378 + }, + { + "epoch": 7.175874769797422, + "grad_norm": 0.1914912760257721, + "learning_rate": 1.950182654420941e-05, + "loss": 1.7262, + "step": 23379 + }, + { + "epoch": 7.176181706568447, + "grad_norm": 0.21109424531459808, + "learning_rate": 1.9497887876483178e-05, + "loss": 1.6601, + "step": 23380 + }, + { + "epoch": 7.176488643339472, + "grad_norm": 0.20514877140522003, + "learning_rate": 1.949394951019669e-05, + "loss": 1.7612, + "step": 23381 + }, + { + "epoch": 7.176795580110497, + "grad_norm": 0.20280246436595917, + "learning_rate": 1.949001144538888e-05, + "loss": 1.6754, + "step": 23382 + }, + { + "epoch": 7.177102516881522, + "grad_norm": 0.1724841594696045, + "learning_rate": 1.9486073682098654e-05, + "loss": 1.7252, + "step": 23383 + }, + { + "epoch": 7.1774094536525475, + "grad_norm": 0.16961625218391418, + "learning_rate": 1.948213622036493e-05, + "loss": 1.6835, + "step": 23384 + }, + { + "epoch": 7.177716390423573, + "grad_norm": 0.17938925325870514, + "learning_rate": 1.947819906022661e-05, + "loss": 1.6909, + "step": 23385 + }, + { + "epoch": 7.178023327194598, + "grad_norm": 0.19711901247501373, + "learning_rate": 1.9474262201722655e-05, + "loss": 1.7275, + "step": 23386 + }, + { + "epoch": 7.1783302639656235, + "grad_norm": 0.19549165666103363, + "learning_rate": 1.947032564489189e-05, + "loss": 1.7609, + "step": 23387 + }, + { + "epoch": 7.178637200736648, + "grad_norm": 0.20358525216579437, + "learning_rate": 1.9466389389773284e-05, + "loss": 1.7127, + "step": 23388 + }, + { + "epoch": 7.178944137507673, + 
"grad_norm": 0.18345355987548828, + "learning_rate": 1.946245343640571e-05, + "loss": 1.6807, + "step": 23389 + }, + { + "epoch": 7.179251074278699, + "grad_norm": 0.20261847972869873, + "learning_rate": 1.9458517784828074e-05, + "loss": 1.717, + "step": 23390 + }, + { + "epoch": 7.179558011049724, + "grad_norm": 0.18042106926441193, + "learning_rate": 1.9454582435079275e-05, + "loss": 1.7415, + "step": 23391 + }, + { + "epoch": 7.179864947820749, + "grad_norm": 0.1731836199760437, + "learning_rate": 1.945064738719817e-05, + "loss": 1.6661, + "step": 23392 + }, + { + "epoch": 7.180171884591774, + "grad_norm": 0.1971052885055542, + "learning_rate": 1.9446712641223685e-05, + "loss": 1.753, + "step": 23393 + }, + { + "epoch": 7.180478821362799, + "grad_norm": 0.22370313107967377, + "learning_rate": 1.94427781971947e-05, + "loss": 1.7118, + "step": 23394 + }, + { + "epoch": 7.180785758133824, + "grad_norm": 0.23129026591777802, + "learning_rate": 1.9438844055150086e-05, + "loss": 1.8087, + "step": 23395 + }, + { + "epoch": 7.18109269490485, + "grad_norm": 0.26353758573532104, + "learning_rate": 1.9434910215128727e-05, + "loss": 1.7147, + "step": 23396 + }, + { + "epoch": 7.181399631675875, + "grad_norm": 0.22333624958992004, + "learning_rate": 1.9430976677169504e-05, + "loss": 1.7403, + "step": 23397 + }, + { + "epoch": 7.1817065684469, + "grad_norm": 0.22191296517848969, + "learning_rate": 1.9427043441311284e-05, + "loss": 1.7125, + "step": 23398 + }, + { + "epoch": 7.182013505217925, + "grad_norm": 0.19174177944660187, + "learning_rate": 1.942311050759294e-05, + "loss": 1.7026, + "step": 23399 + }, + { + "epoch": 7.18232044198895, + "grad_norm": 0.2175525426864624, + "learning_rate": 1.9419177876053342e-05, + "loss": 1.6947, + "step": 23400 + }, + { + "epoch": 7.1826273787599755, + "grad_norm": 0.19419047236442566, + "learning_rate": 1.9415245546731348e-05, + "loss": 1.7309, + "step": 23401 + }, + { + "epoch": 7.182934315531001, + "grad_norm": 0.22568467259407043, + 
"learning_rate": 1.9411313519665806e-05, + "loss": 1.7177, + "step": 23402 + }, + { + "epoch": 7.183241252302026, + "grad_norm": 0.26983609795570374, + "learning_rate": 1.9407381794895635e-05, + "loss": 1.6779, + "step": 23403 + }, + { + "epoch": 7.183548189073051, + "grad_norm": 0.1651962548494339, + "learning_rate": 1.9403450372459602e-05, + "loss": 1.6718, + "step": 23404 + }, + { + "epoch": 7.183855125844076, + "grad_norm": 0.2337920367717743, + "learning_rate": 1.9399519252396653e-05, + "loss": 1.7271, + "step": 23405 + }, + { + "epoch": 7.184162062615101, + "grad_norm": 0.20093166828155518, + "learning_rate": 1.9395588434745547e-05, + "loss": 1.7274, + "step": 23406 + }, + { + "epoch": 7.184468999386127, + "grad_norm": 0.22497716546058655, + "learning_rate": 1.9391657919545193e-05, + "loss": 1.7419, + "step": 23407 + }, + { + "epoch": 7.184775936157152, + "grad_norm": 0.22474822402000427, + "learning_rate": 1.938772770683443e-05, + "loss": 1.8317, + "step": 23408 + }, + { + "epoch": 7.185082872928176, + "grad_norm": 0.18015392124652863, + "learning_rate": 1.9383797796652052e-05, + "loss": 1.6568, + "step": 23409 + }, + { + "epoch": 7.185389809699202, + "grad_norm": 0.18696026504039764, + "learning_rate": 1.9379868189036947e-05, + "loss": 1.6722, + "step": 23410 + }, + { + "epoch": 7.185696746470227, + "grad_norm": 0.1828698217868805, + "learning_rate": 1.9375938884027934e-05, + "loss": 1.7477, + "step": 23411 + }, + { + "epoch": 7.186003683241252, + "grad_norm": 0.20442047715187073, + "learning_rate": 1.937200988166384e-05, + "loss": 1.7269, + "step": 23412 + }, + { + "epoch": 7.186310620012278, + "grad_norm": 0.17201031744480133, + "learning_rate": 1.9368081181983494e-05, + "loss": 1.6893, + "step": 23413 + }, + { + "epoch": 7.186617556783303, + "grad_norm": 0.21501687169075012, + "learning_rate": 1.9364152785025723e-05, + "loss": 1.771, + "step": 23414 + }, + { + "epoch": 7.1869244935543275, + "grad_norm": 0.18059030175209045, + "learning_rate": 
1.936022469082936e-05, + "loss": 1.7088, + "step": 23415 + }, + { + "epoch": 7.187231430325353, + "grad_norm": 0.18079128861427307, + "learning_rate": 1.9356296899433206e-05, + "loss": 1.764, + "step": 23416 + }, + { + "epoch": 7.187538367096378, + "grad_norm": 0.1960453987121582, + "learning_rate": 1.9352369410876086e-05, + "loss": 1.7302, + "step": 23417 + }, + { + "epoch": 7.1878453038674035, + "grad_norm": 0.19896337389945984, + "learning_rate": 1.9348442225196815e-05, + "loss": 1.7228, + "step": 23418 + }, + { + "epoch": 7.188152240638429, + "grad_norm": 0.19272227585315704, + "learning_rate": 1.9344515342434192e-05, + "loss": 1.7164, + "step": 23419 + }, + { + "epoch": 7.188459177409453, + "grad_norm": 0.16746973991394043, + "learning_rate": 1.9340588762627066e-05, + "loss": 1.696, + "step": 23420 + }, + { + "epoch": 7.188766114180479, + "grad_norm": 0.2421095222234726, + "learning_rate": 1.9336662485814178e-05, + "loss": 1.766, + "step": 23421 + }, + { + "epoch": 7.189073050951504, + "grad_norm": 0.17857256531715393, + "learning_rate": 1.93327365120344e-05, + "loss": 1.7216, + "step": 23422 + }, + { + "epoch": 7.189379987722529, + "grad_norm": 0.19336672127246857, + "learning_rate": 1.932881084132646e-05, + "loss": 1.7124, + "step": 23423 + }, + { + "epoch": 7.189686924493555, + "grad_norm": 0.1555519700050354, + "learning_rate": 1.9324885473729204e-05, + "loss": 1.6491, + "step": 23424 + }, + { + "epoch": 7.189993861264579, + "grad_norm": 0.17879530787467957, + "learning_rate": 1.9320960409281425e-05, + "loss": 1.697, + "step": 23425 + }, + { + "epoch": 7.190300798035604, + "grad_norm": 0.17966939508914948, + "learning_rate": 1.9317035648021862e-05, + "loss": 1.6786, + "step": 23426 + }, + { + "epoch": 7.19060773480663, + "grad_norm": 0.21742603182792664, + "learning_rate": 1.9313111189989375e-05, + "loss": 1.734, + "step": 23427 + }, + { + "epoch": 7.190914671577655, + "grad_norm": 0.22135521471500397, + "learning_rate": 1.9309187035222675e-05, + "loss": 
1.7154, + "step": 23428 + }, + { + "epoch": 7.19122160834868, + "grad_norm": 0.17866137623786926, + "learning_rate": 1.930526318376059e-05, + "loss": 1.6723, + "step": 23429 + }, + { + "epoch": 7.191528545119706, + "grad_norm": 0.26034823060035706, + "learning_rate": 1.9301339635641887e-05, + "loss": 1.6975, + "step": 23430 + }, + { + "epoch": 7.19183548189073, + "grad_norm": 0.21550825238227844, + "learning_rate": 1.929741639090534e-05, + "loss": 1.7401, + "step": 23431 + }, + { + "epoch": 7.1921424186617555, + "grad_norm": 0.19205132126808167, + "learning_rate": 1.9293493449589718e-05, + "loss": 1.6543, + "step": 23432 + }, + { + "epoch": 7.192449355432781, + "grad_norm": 0.18724635243415833, + "learning_rate": 1.928957081173379e-05, + "loss": 1.7752, + "step": 23433 + }, + { + "epoch": 7.192756292203806, + "grad_norm": 0.2392650544643402, + "learning_rate": 1.928564847737633e-05, + "loss": 1.7008, + "step": 23434 + }, + { + "epoch": 7.1930632289748315, + "grad_norm": 0.18950903415679932, + "learning_rate": 1.9281726446556088e-05, + "loss": 1.7193, + "step": 23435 + }, + { + "epoch": 7.193370165745856, + "grad_norm": 0.2542276978492737, + "learning_rate": 1.9277804719311808e-05, + "loss": 1.7192, + "step": 23436 + }, + { + "epoch": 7.193677102516881, + "grad_norm": 0.1987142711877823, + "learning_rate": 1.927388329568231e-05, + "loss": 1.6943, + "step": 23437 + }, + { + "epoch": 7.193984039287907, + "grad_norm": 0.18837273120880127, + "learning_rate": 1.9269962175706275e-05, + "loss": 1.7443, + "step": 23438 + }, + { + "epoch": 7.194290976058932, + "grad_norm": 0.20432044565677643, + "learning_rate": 1.9266041359422514e-05, + "loss": 1.741, + "step": 23439 + }, + { + "epoch": 7.194597912829957, + "grad_norm": 0.17763052880764008, + "learning_rate": 1.9262120846869715e-05, + "loss": 1.6696, + "step": 23440 + }, + { + "epoch": 7.194904849600983, + "grad_norm": 0.1747766137123108, + "learning_rate": 1.9258200638086665e-05, + "loss": 1.6727, + "step": 23441 + }, + { 
+ "epoch": 7.195211786372007, + "grad_norm": 0.22058527171611786, + "learning_rate": 1.9254280733112117e-05, + "loss": 1.7387, + "step": 23442 + }, + { + "epoch": 7.195518723143032, + "grad_norm": 0.2247757911682129, + "learning_rate": 1.925036113198475e-05, + "loss": 1.7828, + "step": 23443 + }, + { + "epoch": 7.195825659914058, + "grad_norm": 0.16923101246356964, + "learning_rate": 1.924644183474337e-05, + "loss": 1.6655, + "step": 23444 + }, + { + "epoch": 7.196132596685083, + "grad_norm": 0.1599757820367813, + "learning_rate": 1.924252284142665e-05, + "loss": 1.7002, + "step": 23445 + }, + { + "epoch": 7.196439533456108, + "grad_norm": 0.1916438341140747, + "learning_rate": 1.9238604152073358e-05, + "loss": 1.71, + "step": 23446 + }, + { + "epoch": 7.196746470227133, + "grad_norm": 0.18037991225719452, + "learning_rate": 1.9234685766722216e-05, + "loss": 1.6786, + "step": 23447 + }, + { + "epoch": 7.197053406998158, + "grad_norm": 0.20671263337135315, + "learning_rate": 1.9230767685411938e-05, + "loss": 1.7228, + "step": 23448 + }, + { + "epoch": 7.1973603437691835, + "grad_norm": 0.18949514627456665, + "learning_rate": 1.9226849908181243e-05, + "loss": 1.7794, + "step": 23449 + }, + { + "epoch": 7.197667280540209, + "grad_norm": 0.19457660615444183, + "learning_rate": 1.9222932435068857e-05, + "loss": 1.7153, + "step": 23450 + }, + { + "epoch": 7.197974217311234, + "grad_norm": 0.16834792494773865, + "learning_rate": 1.9219015266113494e-05, + "loss": 1.646, + "step": 23451 + }, + { + "epoch": 7.198281154082259, + "grad_norm": 0.21668508648872375, + "learning_rate": 1.9215098401353866e-05, + "loss": 1.7232, + "step": 23452 + }, + { + "epoch": 7.198588090853284, + "grad_norm": 0.1675579994916916, + "learning_rate": 1.9211181840828656e-05, + "loss": 1.6963, + "step": 23453 + }, + { + "epoch": 7.198895027624309, + "grad_norm": 0.19915352761745453, + "learning_rate": 1.9207265584576627e-05, + "loss": 1.7043, + "step": 23454 + }, + { + "epoch": 7.199201964395335, + 
"grad_norm": 0.23872216045856476, + "learning_rate": 1.920334963263642e-05, + "loss": 1.7784, + "step": 23455 + }, + { + "epoch": 7.19950890116636, + "grad_norm": 0.261321485042572, + "learning_rate": 1.919943398504679e-05, + "loss": 1.8024, + "step": 23456 + }, + { + "epoch": 7.199815837937384, + "grad_norm": 0.17026741802692413, + "learning_rate": 1.9195518641846377e-05, + "loss": 1.7451, + "step": 23457 + }, + { + "epoch": 7.20012277470841, + "grad_norm": 0.20935678482055664, + "learning_rate": 1.9191603603073915e-05, + "loss": 1.752, + "step": 23458 + }, + { + "epoch": 7.200429711479435, + "grad_norm": 0.1756788194179535, + "learning_rate": 1.9187688868768107e-05, + "loss": 1.7008, + "step": 23459 + }, + { + "epoch": 7.2007366482504604, + "grad_norm": 0.23286345601081848, + "learning_rate": 1.9183774438967577e-05, + "loss": 1.7603, + "step": 23460 + }, + { + "epoch": 7.201043585021486, + "grad_norm": 0.17519986629486084, + "learning_rate": 1.917986031371109e-05, + "loss": 1.7127, + "step": 23461 + }, + { + "epoch": 7.201350521792511, + "grad_norm": 0.2603212893009186, + "learning_rate": 1.917594649303725e-05, + "loss": 1.7169, + "step": 23462 + }, + { + "epoch": 7.201657458563536, + "grad_norm": 0.2664981484413147, + "learning_rate": 1.9172032976984792e-05, + "loss": 1.7349, + "step": 23463 + }, + { + "epoch": 7.201964395334561, + "grad_norm": 0.15484265983104706, + "learning_rate": 1.9168119765592375e-05, + "loss": 1.6753, + "step": 23464 + }, + { + "epoch": 7.202271332105586, + "grad_norm": 0.22310250997543335, + "learning_rate": 1.9164206858898664e-05, + "loss": 1.6994, + "step": 23465 + }, + { + "epoch": 7.202578268876612, + "grad_norm": 0.1998710036277771, + "learning_rate": 1.9160294256942336e-05, + "loss": 1.7556, + "step": 23466 + }, + { + "epoch": 7.202885205647637, + "grad_norm": 0.2092670500278473, + "learning_rate": 1.9156381959762058e-05, + "loss": 1.6883, + "step": 23467 + }, + { + "epoch": 7.203192142418661, + "grad_norm": 0.20657336711883545, + 
"learning_rate": 1.915246996739649e-05, + "loss": 1.8035, + "step": 23468 + }, + { + "epoch": 7.203499079189687, + "grad_norm": 0.2175077497959137, + "learning_rate": 1.9148558279884294e-05, + "loss": 1.7173, + "step": 23469 + }, + { + "epoch": 7.203806015960712, + "grad_norm": 0.16851630806922913, + "learning_rate": 1.9144646897264114e-05, + "loss": 1.6874, + "step": 23470 + }, + { + "epoch": 7.204112952731737, + "grad_norm": 0.23194117844104767, + "learning_rate": 1.9140735819574647e-05, + "loss": 1.7156, + "step": 23471 + }, + { + "epoch": 7.204419889502763, + "grad_norm": 0.17139053344726562, + "learning_rate": 1.9136825046854483e-05, + "loss": 1.6997, + "step": 23472 + }, + { + "epoch": 7.204726826273788, + "grad_norm": 0.18561725318431854, + "learning_rate": 1.913291457914234e-05, + "loss": 1.6575, + "step": 23473 + }, + { + "epoch": 7.2050337630448125, + "grad_norm": 0.2333156019449234, + "learning_rate": 1.9129004416476793e-05, + "loss": 1.7453, + "step": 23474 + }, + { + "epoch": 7.205340699815838, + "grad_norm": 0.2594338655471802, + "learning_rate": 1.9125094558896534e-05, + "loss": 1.7087, + "step": 23475 + }, + { + "epoch": 7.205647636586863, + "grad_norm": 0.16303664445877075, + "learning_rate": 1.91211850064402e-05, + "loss": 1.6985, + "step": 23476 + }, + { + "epoch": 7.2059545733578885, + "grad_norm": 0.2592144012451172, + "learning_rate": 1.9117275759146387e-05, + "loss": 1.7196, + "step": 23477 + }, + { + "epoch": 7.206261510128914, + "grad_norm": 0.1643611341714859, + "learning_rate": 1.9113366817053784e-05, + "loss": 1.686, + "step": 23478 + }, + { + "epoch": 7.206568446899938, + "grad_norm": 0.19730710983276367, + "learning_rate": 1.9109458180200966e-05, + "loss": 1.6883, + "step": 23479 + }, + { + "epoch": 7.206875383670964, + "grad_norm": 0.16942749917507172, + "learning_rate": 1.9105549848626602e-05, + "loss": 1.7272, + "step": 23480 + }, + { + "epoch": 7.207182320441989, + "grad_norm": 0.21967467665672302, + "learning_rate": 
1.91016418223693e-05, + "loss": 1.7501, + "step": 23481 + }, + { + "epoch": 7.207489257213014, + "grad_norm": 0.17037035524845123, + "learning_rate": 1.9097734101467684e-05, + "loss": 1.72, + "step": 23482 + }, + { + "epoch": 7.20779619398404, + "grad_norm": 0.21497979760169983, + "learning_rate": 1.9093826685960374e-05, + "loss": 1.6993, + "step": 23483 + }, + { + "epoch": 7.208103130755064, + "grad_norm": 0.1462371051311493, + "learning_rate": 1.9089919575885985e-05, + "loss": 1.6249, + "step": 23484 + }, + { + "epoch": 7.208410067526089, + "grad_norm": 0.1863165646791458, + "learning_rate": 1.9086012771283122e-05, + "loss": 1.6343, + "step": 23485 + }, + { + "epoch": 7.208717004297115, + "grad_norm": 0.1705196648836136, + "learning_rate": 1.9082106272190403e-05, + "loss": 1.7115, + "step": 23486 + }, + { + "epoch": 7.20902394106814, + "grad_norm": 0.20928895473480225, + "learning_rate": 1.9078200078646413e-05, + "loss": 1.6953, + "step": 23487 + }, + { + "epoch": 7.209330877839165, + "grad_norm": 0.2172931581735611, + "learning_rate": 1.9074294190689812e-05, + "loss": 1.7436, + "step": 23488 + }, + { + "epoch": 7.209637814610191, + "grad_norm": 0.1760822981595993, + "learning_rate": 1.9070388608359124e-05, + "loss": 1.6898, + "step": 23489 + }, + { + "epoch": 7.209944751381215, + "grad_norm": 0.28154727816581726, + "learning_rate": 1.9066483331693018e-05, + "loss": 1.7583, + "step": 23490 + }, + { + "epoch": 7.2102516881522405, + "grad_norm": 0.28375890851020813, + "learning_rate": 1.9062578360730027e-05, + "loss": 1.7428, + "step": 23491 + }, + { + "epoch": 7.210558624923266, + "grad_norm": 0.2173614352941513, + "learning_rate": 1.905867369550878e-05, + "loss": 1.6902, + "step": 23492 + }, + { + "epoch": 7.210865561694291, + "grad_norm": 0.2525392174720764, + "learning_rate": 1.9054769336067875e-05, + "loss": 1.7205, + "step": 23493 + }, + { + "epoch": 7.2111724984653165, + "grad_norm": 0.22913219034671783, + "learning_rate": 1.905086528244584e-05, + "loss": 
1.7269, + "step": 23494 + }, + { + "epoch": 7.211479435236341, + "grad_norm": 0.2174263298511505, + "learning_rate": 1.9046961534681327e-05, + "loss": 1.7058, + "step": 23495 + }, + { + "epoch": 7.211786372007366, + "grad_norm": 0.2277042120695114, + "learning_rate": 1.9043058092812848e-05, + "loss": 1.7048, + "step": 23496 + }, + { + "epoch": 7.212093308778392, + "grad_norm": 0.17835062742233276, + "learning_rate": 1.9039154956879036e-05, + "loss": 1.7258, + "step": 23497 + }, + { + "epoch": 7.212400245549417, + "grad_norm": 0.22751156985759735, + "learning_rate": 1.903525212691844e-05, + "loss": 1.708, + "step": 23498 + }, + { + "epoch": 7.212707182320442, + "grad_norm": 0.21247950196266174, + "learning_rate": 1.903134960296963e-05, + "loss": 1.7142, + "step": 23499 + }, + { + "epoch": 7.213014119091467, + "grad_norm": 0.2256091684103012, + "learning_rate": 1.9027447385071175e-05, + "loss": 1.6826, + "step": 23500 + }, + { + "epoch": 7.213321055862492, + "grad_norm": 0.16704921424388885, + "learning_rate": 1.902354547326164e-05, + "loss": 1.6639, + "step": 23501 + }, + { + "epoch": 7.213627992633517, + "grad_norm": 0.20211774110794067, + "learning_rate": 1.901964386757958e-05, + "loss": 1.7448, + "step": 23502 + }, + { + "epoch": 7.213934929404543, + "grad_norm": 0.2090187519788742, + "learning_rate": 1.901574256806356e-05, + "loss": 1.7425, + "step": 23503 + }, + { + "epoch": 7.214241866175568, + "grad_norm": 0.1942494809627533, + "learning_rate": 1.9011841574752114e-05, + "loss": 1.721, + "step": 23504 + }, + { + "epoch": 7.214548802946593, + "grad_norm": 0.1842714548110962, + "learning_rate": 1.900794088768385e-05, + "loss": 1.7092, + "step": 23505 + }, + { + "epoch": 7.214855739717618, + "grad_norm": 0.16807401180267334, + "learning_rate": 1.900404050689724e-05, + "loss": 1.6788, + "step": 23506 + }, + { + "epoch": 7.215162676488643, + "grad_norm": 0.16467349231243134, + "learning_rate": 1.9000140432430907e-05, + "loss": 1.6544, + "step": 23507 + }, + { + 
"epoch": 7.2154696132596685, + "grad_norm": 0.1806645542383194, + "learning_rate": 1.899624066432332e-05, + "loss": 1.6871, + "step": 23508 + }, + { + "epoch": 7.215776550030694, + "grad_norm": 0.16891708970069885, + "learning_rate": 1.8992341202613073e-05, + "loss": 1.6912, + "step": 23509 + }, + { + "epoch": 7.216083486801719, + "grad_norm": 0.21191391348838806, + "learning_rate": 1.89884420473387e-05, + "loss": 1.7843, + "step": 23510 + }, + { + "epoch": 7.216390423572744, + "grad_norm": 0.18484020233154297, + "learning_rate": 1.8984543198538684e-05, + "loss": 1.699, + "step": 23511 + }, + { + "epoch": 7.216697360343769, + "grad_norm": 0.2106105536222458, + "learning_rate": 1.8980644656251627e-05, + "loss": 1.7239, + "step": 23512 + }, + { + "epoch": 7.217004297114794, + "grad_norm": 0.19923320412635803, + "learning_rate": 1.8976746420515988e-05, + "loss": 1.7989, + "step": 23513 + }, + { + "epoch": 7.21731123388582, + "grad_norm": 0.21371988952159882, + "learning_rate": 1.897284849137034e-05, + "loss": 1.7071, + "step": 23514 + }, + { + "epoch": 7.217618170656845, + "grad_norm": 0.20450851321220398, + "learning_rate": 1.8968950868853184e-05, + "loss": 1.7051, + "step": 23515 + }, + { + "epoch": 7.21792510742787, + "grad_norm": 0.22700995206832886, + "learning_rate": 1.8965053553003055e-05, + "loss": 1.7556, + "step": 23516 + }, + { + "epoch": 7.218232044198895, + "grad_norm": 0.26295945048332214, + "learning_rate": 1.896115654385845e-05, + "loss": 1.7893, + "step": 23517 + }, + { + "epoch": 7.21853898096992, + "grad_norm": 0.17091867327690125, + "learning_rate": 1.8957259841457885e-05, + "loss": 1.7289, + "step": 23518 + }, + { + "epoch": 7.218845917740945, + "grad_norm": 0.24840304255485535, + "learning_rate": 1.8953363445839877e-05, + "loss": 1.6958, + "step": 23519 + }, + { + "epoch": 7.219152854511971, + "grad_norm": 0.20042046904563904, + "learning_rate": 1.8949467357042926e-05, + "loss": 1.743, + "step": 23520 + }, + { + "epoch": 7.219459791282996, + 
"grad_norm": 0.18286047875881195, + "learning_rate": 1.894557157510552e-05, + "loss": 1.7065, + "step": 23521 + }, + { + "epoch": 7.2197667280540205, + "grad_norm": 0.18324656784534454, + "learning_rate": 1.894167610006622e-05, + "loss": 1.7083, + "step": 23522 + }, + { + "epoch": 7.220073664825046, + "grad_norm": 0.17110426723957062, + "learning_rate": 1.8937780931963432e-05, + "loss": 1.7016, + "step": 23523 + }, + { + "epoch": 7.220380601596071, + "grad_norm": 0.19164881110191345, + "learning_rate": 1.8933886070835743e-05, + "loss": 1.7011, + "step": 23524 + }, + { + "epoch": 7.2206875383670965, + "grad_norm": 0.16899923980236053, + "learning_rate": 1.892999151672157e-05, + "loss": 1.7227, + "step": 23525 + }, + { + "epoch": 7.220994475138122, + "grad_norm": 0.18763495981693268, + "learning_rate": 1.8926097269659437e-05, + "loss": 1.6956, + "step": 23526 + }, + { + "epoch": 7.221301411909146, + "grad_norm": 0.1665162295103073, + "learning_rate": 1.8922203329687847e-05, + "loss": 1.7039, + "step": 23527 + }, + { + "epoch": 7.221608348680172, + "grad_norm": 0.20766250789165497, + "learning_rate": 1.8918309696845226e-05, + "loss": 1.7703, + "step": 23528 + }, + { + "epoch": 7.221915285451197, + "grad_norm": 0.1813010275363922, + "learning_rate": 1.891441637117012e-05, + "loss": 1.6709, + "step": 23529 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 0.15327073633670807, + "learning_rate": 1.891052335270094e-05, + "loss": 1.6518, + "step": 23530 + }, + { + "epoch": 7.222529158993248, + "grad_norm": 0.17191094160079956, + "learning_rate": 1.8906630641476203e-05, + "loss": 1.7193, + "step": 23531 + }, + { + "epoch": 7.222836095764273, + "grad_norm": 0.17976176738739014, + "learning_rate": 1.8902738237534363e-05, + "loss": 1.7162, + "step": 23532 + }, + { + "epoch": 7.223143032535297, + "grad_norm": 0.1828993558883667, + "learning_rate": 1.8898846140913894e-05, + "loss": 1.7163, + "step": 23533 + }, + { + "epoch": 7.223449969306323, + "grad_norm": 
0.15828034281730652, + "learning_rate": 1.889495435165326e-05, + "loss": 1.6734, + "step": 23534 + }, + { + "epoch": 7.223756906077348, + "grad_norm": 0.2171369194984436, + "learning_rate": 1.8891062869790915e-05, + "loss": 1.7508, + "step": 23535 + }, + { + "epoch": 7.224063842848373, + "grad_norm": 0.18747110664844513, + "learning_rate": 1.888717169536532e-05, + "loss": 1.7162, + "step": 23536 + }, + { + "epoch": 7.224370779619399, + "grad_norm": 0.19177328050136566, + "learning_rate": 1.8883280828414927e-05, + "loss": 1.7044, + "step": 23537 + }, + { + "epoch": 7.224677716390423, + "grad_norm": 0.175906702876091, + "learning_rate": 1.88793902689782e-05, + "loss": 1.7126, + "step": 23538 + }, + { + "epoch": 7.2249846531614486, + "grad_norm": 0.17842896282672882, + "learning_rate": 1.887550001709357e-05, + "loss": 1.7469, + "step": 23539 + }, + { + "epoch": 7.225291589932474, + "grad_norm": 0.23797607421875, + "learning_rate": 1.8871610072799478e-05, + "loss": 1.7343, + "step": 23540 + }, + { + "epoch": 7.225598526703499, + "grad_norm": 0.2297922819852829, + "learning_rate": 1.8867720436134412e-05, + "loss": 1.7453, + "step": 23541 + }, + { + "epoch": 7.225905463474525, + "grad_norm": 0.19950568675994873, + "learning_rate": 1.8863831107136748e-05, + "loss": 1.6984, + "step": 23542 + }, + { + "epoch": 7.226212400245549, + "grad_norm": 0.2809087038040161, + "learning_rate": 1.8859942085844974e-05, + "loss": 1.7815, + "step": 23543 + }, + { + "epoch": 7.226519337016574, + "grad_norm": 0.20534642040729523, + "learning_rate": 1.8856053372297515e-05, + "loss": 1.7455, + "step": 23544 + }, + { + "epoch": 7.2268262737876, + "grad_norm": 0.20052307844161987, + "learning_rate": 1.885216496653276e-05, + "loss": 1.6655, + "step": 23545 + }, + { + "epoch": 7.227133210558625, + "grad_norm": 0.1948573738336563, + "learning_rate": 1.8848276868589205e-05, + "loss": 1.7036, + "step": 23546 + }, + { + "epoch": 7.22744014732965, + "grad_norm": 0.16764269769191742, + "learning_rate": 
1.8844389078505197e-05, + "loss": 1.6605, + "step": 23547 + }, + { + "epoch": 7.227747084100676, + "grad_norm": 0.17951633036136627, + "learning_rate": 1.8840501596319214e-05, + "loss": 1.6948, + "step": 23548 + }, + { + "epoch": 7.2280540208717, + "grad_norm": 0.1906418353319168, + "learning_rate": 1.883661442206966e-05, + "loss": 1.7122, + "step": 23549 + }, + { + "epoch": 7.2283609576427255, + "grad_norm": 0.19535204768180847, + "learning_rate": 1.8832727555794943e-05, + "loss": 1.7089, + "step": 23550 + }, + { + "epoch": 7.228667894413751, + "grad_norm": 0.20654071867465973, + "learning_rate": 1.8828840997533488e-05, + "loss": 1.7113, + "step": 23551 + }, + { + "epoch": 7.228974831184776, + "grad_norm": 0.18860456347465515, + "learning_rate": 1.8824954747323692e-05, + "loss": 1.7475, + "step": 23552 + }, + { + "epoch": 7.2292817679558015, + "grad_norm": 0.21949729323387146, + "learning_rate": 1.882106880520396e-05, + "loss": 1.7819, + "step": 23553 + }, + { + "epoch": 7.229588704726826, + "grad_norm": 0.2177286595106125, + "learning_rate": 1.881718317121271e-05, + "loss": 1.7554, + "step": 23554 + }, + { + "epoch": 7.229895641497851, + "grad_norm": 0.21143296360969543, + "learning_rate": 1.8813297845388328e-05, + "loss": 1.7811, + "step": 23555 + }, + { + "epoch": 7.230202578268877, + "grad_norm": 0.24787208437919617, + "learning_rate": 1.880941282776922e-05, + "loss": 1.707, + "step": 23556 + }, + { + "epoch": 7.230509515039902, + "grad_norm": 0.18048164248466492, + "learning_rate": 1.880552811839375e-05, + "loss": 1.6841, + "step": 23557 + }, + { + "epoch": 7.230816451810927, + "grad_norm": 0.24056772887706757, + "learning_rate": 1.8801643717300375e-05, + "loss": 1.7868, + "step": 23558 + }, + { + "epoch": 7.231123388581952, + "grad_norm": 0.18564146757125854, + "learning_rate": 1.879775962452741e-05, + "loss": 1.7506, + "step": 23559 + }, + { + "epoch": 7.231430325352977, + "grad_norm": 0.25965458154678345, + "learning_rate": 1.87938758401133e-05, + "loss": 
1.7307, + "step": 23560 + }, + { + "epoch": 7.231737262124002, + "grad_norm": 0.17774315178394318, + "learning_rate": 1.8789992364096394e-05, + "loss": 1.7089, + "step": 23561 + }, + { + "epoch": 7.232044198895028, + "grad_norm": 0.2488560527563095, + "learning_rate": 1.878610919651505e-05, + "loss": 1.6811, + "step": 23562 + }, + { + "epoch": 7.232351135666053, + "grad_norm": 0.1963108628988266, + "learning_rate": 1.8782226337407703e-05, + "loss": 1.6512, + "step": 23563 + }, + { + "epoch": 7.232658072437078, + "grad_norm": 0.25702449679374695, + "learning_rate": 1.8778343786812663e-05, + "loss": 1.7697, + "step": 23564 + }, + { + "epoch": 7.232965009208103, + "grad_norm": 0.18145591020584106, + "learning_rate": 1.8774461544768347e-05, + "loss": 1.6842, + "step": 23565 + }, + { + "epoch": 7.233271945979128, + "grad_norm": 0.2482728213071823, + "learning_rate": 1.87705796113131e-05, + "loss": 1.7028, + "step": 23566 + }, + { + "epoch": 7.2335788827501535, + "grad_norm": 0.16365976631641388, + "learning_rate": 1.8766697986485293e-05, + "loss": 1.7266, + "step": 23567 + }, + { + "epoch": 7.233885819521179, + "grad_norm": 0.1877463459968567, + "learning_rate": 1.876281667032328e-05, + "loss": 1.6909, + "step": 23568 + }, + { + "epoch": 7.234192756292204, + "grad_norm": 0.19121702015399933, + "learning_rate": 1.8758935662865423e-05, + "loss": 1.7303, + "step": 23569 + }, + { + "epoch": 7.234499693063229, + "grad_norm": 0.1783505082130432, + "learning_rate": 1.8755054964150072e-05, + "loss": 1.7209, + "step": 23570 + }, + { + "epoch": 7.234806629834254, + "grad_norm": 0.172771617770195, + "learning_rate": 1.8751174574215585e-05, + "loss": 1.6824, + "step": 23571 + }, + { + "epoch": 7.235113566605279, + "grad_norm": 0.1675102859735489, + "learning_rate": 1.8747294493100304e-05, + "loss": 1.6664, + "step": 23572 + }, + { + "epoch": 7.235420503376305, + "grad_norm": 0.18213391304016113, + "learning_rate": 1.8743414720842578e-05, + "loss": 1.6725, + "step": 23573 + }, + { + 
"epoch": 7.23572744014733, + "grad_norm": 0.2204304337501526, + "learning_rate": 1.8739535257480728e-05, + "loss": 1.7662, + "step": 23574 + }, + { + "epoch": 7.236034376918354, + "grad_norm": 0.22732098400592804, + "learning_rate": 1.873565610305315e-05, + "loss": 1.7808, + "step": 23575 + }, + { + "epoch": 7.23634131368938, + "grad_norm": 0.17859263718128204, + "learning_rate": 1.8731777257598128e-05, + "loss": 1.6767, + "step": 23576 + }, + { + "epoch": 7.236648250460405, + "grad_norm": 0.16690675914287567, + "learning_rate": 1.8727898721154007e-05, + "loss": 1.6523, + "step": 23577 + }, + { + "epoch": 7.23695518723143, + "grad_norm": 0.17576774954795837, + "learning_rate": 1.872402049375912e-05, + "loss": 1.6951, + "step": 23578 + }, + { + "epoch": 7.237262124002456, + "grad_norm": 0.20455172657966614, + "learning_rate": 1.8720142575451777e-05, + "loss": 1.7402, + "step": 23579 + }, + { + "epoch": 7.237569060773481, + "grad_norm": 0.2122879922389984, + "learning_rate": 1.8716264966270352e-05, + "loss": 1.7571, + "step": 23580 + }, + { + "epoch": 7.2378759975445055, + "grad_norm": 0.17752611637115479, + "learning_rate": 1.87123876662531e-05, + "loss": 1.7185, + "step": 23581 + }, + { + "epoch": 7.238182934315531, + "grad_norm": 0.21253602206707, + "learning_rate": 1.87085106754384e-05, + "loss": 1.7281, + "step": 23582 + }, + { + "epoch": 7.238489871086556, + "grad_norm": 0.19470329582691193, + "learning_rate": 1.8704633993864514e-05, + "loss": 1.6772, + "step": 23583 + }, + { + "epoch": 7.2387968078575815, + "grad_norm": 0.19556869566440582, + "learning_rate": 1.8700757621569786e-05, + "loss": 1.6888, + "step": 23584 + }, + { + "epoch": 7.239103744628607, + "grad_norm": 0.20525780320167542, + "learning_rate": 1.869688155859252e-05, + "loss": 1.7517, + "step": 23585 + }, + { + "epoch": 7.239410681399631, + "grad_norm": 0.23367032408714294, + "learning_rate": 1.869300580497102e-05, + "loss": 1.781, + "step": 23586 + }, + { + "epoch": 7.239717618170657, + 
"grad_norm": 0.1893240362405777, + "learning_rate": 1.8689130360743583e-05, + "loss": 1.7265, + "step": 23587 + }, + { + "epoch": 7.240024554941682, + "grad_norm": 0.17136700451374054, + "learning_rate": 1.868525522594851e-05, + "loss": 1.6631, + "step": 23588 + }, + { + "epoch": 7.240331491712707, + "grad_norm": 0.1984632909297943, + "learning_rate": 1.8681380400624103e-05, + "loss": 1.7337, + "step": 23589 + }, + { + "epoch": 7.240638428483733, + "grad_norm": 0.19046886265277863, + "learning_rate": 1.867750588480865e-05, + "loss": 1.7094, + "step": 23590 + }, + { + "epoch": 7.240945365254758, + "grad_norm": 0.18242189288139343, + "learning_rate": 1.8673631678540427e-05, + "loss": 1.692, + "step": 23591 + }, + { + "epoch": 7.241252302025782, + "grad_norm": 0.1741522252559662, + "learning_rate": 1.8669757781857768e-05, + "loss": 1.6975, + "step": 23592 + }, + { + "epoch": 7.241559238796808, + "grad_norm": 0.1778191328048706, + "learning_rate": 1.866588419479891e-05, + "loss": 1.7092, + "step": 23593 + }, + { + "epoch": 7.241866175567833, + "grad_norm": 0.17402158677577972, + "learning_rate": 1.866201091740215e-05, + "loss": 1.7072, + "step": 23594 + }, + { + "epoch": 7.242173112338858, + "grad_norm": 0.22215119004249573, + "learning_rate": 1.8658137949705763e-05, + "loss": 1.7205, + "step": 23595 + }, + { + "epoch": 7.242480049109884, + "grad_norm": 0.15291182696819305, + "learning_rate": 1.8654265291748013e-05, + "loss": 1.7341, + "step": 23596 + }, + { + "epoch": 7.242786985880908, + "grad_norm": 0.18226875364780426, + "learning_rate": 1.8650392943567217e-05, + "loss": 1.6731, + "step": 23597 + }, + { + "epoch": 7.2430939226519335, + "grad_norm": 0.19169047474861145, + "learning_rate": 1.864652090520158e-05, + "loss": 1.777, + "step": 23598 + }, + { + "epoch": 7.243400859422959, + "grad_norm": 0.2063349187374115, + "learning_rate": 1.8642649176689437e-05, + "loss": 1.7258, + "step": 23599 + }, + { + "epoch": 7.243707796193984, + "grad_norm": 0.18550212681293488, 
+ "learning_rate": 1.863877775806898e-05, + "loss": 1.7041, + "step": 23600 + }, + { + "epoch": 7.2440147329650095, + "grad_norm": 0.21196649968624115, + "learning_rate": 1.8634906649378514e-05, + "loss": 1.6672, + "step": 23601 + }, + { + "epoch": 7.244321669736034, + "grad_norm": 0.26801541447639465, + "learning_rate": 1.863103585065629e-05, + "loss": 1.6981, + "step": 23602 + }, + { + "epoch": 7.244628606507059, + "grad_norm": 0.1854090690612793, + "learning_rate": 1.862716536194055e-05, + "loss": 1.7406, + "step": 23603 + }, + { + "epoch": 7.244935543278085, + "grad_norm": 0.15906888246536255, + "learning_rate": 1.8623295183269556e-05, + "loss": 1.6721, + "step": 23604 + }, + { + "epoch": 7.24524248004911, + "grad_norm": 0.2210245132446289, + "learning_rate": 1.8619425314681547e-05, + "loss": 1.7717, + "step": 23605 + }, + { + "epoch": 7.245549416820135, + "grad_norm": 0.17654140293598175, + "learning_rate": 1.861555575621477e-05, + "loss": 1.7428, + "step": 23606 + }, + { + "epoch": 7.245856353591161, + "grad_norm": 0.1582319736480713, + "learning_rate": 1.8611686507907466e-05, + "loss": 1.6814, + "step": 23607 + }, + { + "epoch": 7.246163290362185, + "grad_norm": 0.18817248940467834, + "learning_rate": 1.8607817569797852e-05, + "loss": 1.74, + "step": 23608 + }, + { + "epoch": 7.24647022713321, + "grad_norm": 0.26141074299812317, + "learning_rate": 1.8603948941924227e-05, + "loss": 1.6966, + "step": 23609 + }, + { + "epoch": 7.246777163904236, + "grad_norm": 0.16877111792564392, + "learning_rate": 1.8600080624324757e-05, + "loss": 1.6849, + "step": 23610 + }, + { + "epoch": 7.247084100675261, + "grad_norm": 0.16188141703605652, + "learning_rate": 1.8596212617037694e-05, + "loss": 1.6342, + "step": 23611 + }, + { + "epoch": 7.247391037446286, + "grad_norm": 0.19506491720676422, + "learning_rate": 1.8592344920101267e-05, + "loss": 1.6874, + "step": 23612 + }, + { + "epoch": 7.247697974217311, + "grad_norm": 0.1865006536245346, + "learning_rate": 
1.8588477533553677e-05, + "loss": 1.7365, + "step": 23613 + }, + { + "epoch": 7.248004910988336, + "grad_norm": 0.16737428307533264, + "learning_rate": 1.85846104574332e-05, + "loss": 1.6971, + "step": 23614 + }, + { + "epoch": 7.2483118477593615, + "grad_norm": 0.1754695028066635, + "learning_rate": 1.858074369177798e-05, + "loss": 1.7133, + "step": 23615 + }, + { + "epoch": 7.248618784530387, + "grad_norm": 0.21066173911094666, + "learning_rate": 1.85768772366263e-05, + "loss": 1.7737, + "step": 23616 + }, + { + "epoch": 7.248925721301412, + "grad_norm": 0.2530418932437897, + "learning_rate": 1.8573011092016303e-05, + "loss": 1.7962, + "step": 23617 + }, + { + "epoch": 7.249232658072437, + "grad_norm": 0.17780029773712158, + "learning_rate": 1.8569145257986247e-05, + "loss": 1.6691, + "step": 23618 + }, + { + "epoch": 7.249539594843462, + "grad_norm": 0.2105826437473297, + "learning_rate": 1.856527973457432e-05, + "loss": 1.6943, + "step": 23619 + }, + { + "epoch": 7.249846531614487, + "grad_norm": 0.20929837226867676, + "learning_rate": 1.856141452181872e-05, + "loss": 1.7223, + "step": 23620 + }, + { + "epoch": 7.250153468385513, + "grad_norm": 0.17105531692504883, + "learning_rate": 1.8557549619757653e-05, + "loss": 1.6956, + "step": 23621 + }, + { + "epoch": 7.250460405156538, + "grad_norm": 0.21282736957073212, + "learning_rate": 1.8553685028429306e-05, + "loss": 1.7299, + "step": 23622 + }, + { + "epoch": 7.250767341927563, + "grad_norm": 0.1673511266708374, + "learning_rate": 1.8549820747871882e-05, + "loss": 1.7184, + "step": 23623 + }, + { + "epoch": 7.251074278698588, + "grad_norm": 0.1877487152814865, + "learning_rate": 1.854595677812356e-05, + "loss": 1.6989, + "step": 23624 + }, + { + "epoch": 7.251381215469613, + "grad_norm": 0.1709173619747162, + "learning_rate": 1.8542093119222504e-05, + "loss": 1.6994, + "step": 23625 + }, + { + "epoch": 7.2516881522406385, + "grad_norm": 0.18894633650779724, + "learning_rate": 1.8538229771206962e-05, + "loss": 
1.665, + "step": 23626 + }, + { + "epoch": 7.251995089011664, + "grad_norm": 0.17623448371887207, + "learning_rate": 1.8534366734115056e-05, + "loss": 1.6999, + "step": 23627 + }, + { + "epoch": 7.252302025782689, + "grad_norm": 0.20008981227874756, + "learning_rate": 1.8530504007984982e-05, + "loss": 1.7147, + "step": 23628 + }, + { + "epoch": 7.252608962553714, + "grad_norm": 0.2506260573863983, + "learning_rate": 1.852664159285491e-05, + "loss": 1.7485, + "step": 23629 + }, + { + "epoch": 7.252915899324739, + "grad_norm": 0.17746438086032867, + "learning_rate": 1.8522779488763e-05, + "loss": 1.7534, + "step": 23630 + }, + { + "epoch": 7.253222836095764, + "grad_norm": 0.1910836547613144, + "learning_rate": 1.8518917695747462e-05, + "loss": 1.7167, + "step": 23631 + }, + { + "epoch": 7.25352977286679, + "grad_norm": 0.18009543418884277, + "learning_rate": 1.8515056213846398e-05, + "loss": 1.6849, + "step": 23632 + }, + { + "epoch": 7.253836709637815, + "grad_norm": 0.18150615692138672, + "learning_rate": 1.851119504309804e-05, + "loss": 1.7077, + "step": 23633 + }, + { + "epoch": 7.25414364640884, + "grad_norm": 0.1874052882194519, + "learning_rate": 1.850733418354047e-05, + "loss": 1.7398, + "step": 23634 + }, + { + "epoch": 7.254450583179865, + "grad_norm": 0.18285217881202698, + "learning_rate": 1.8503473635211897e-05, + "loss": 1.7433, + "step": 23635 + }, + { + "epoch": 7.25475751995089, + "grad_norm": 0.19326861202716827, + "learning_rate": 1.8499613398150463e-05, + "loss": 1.7095, + "step": 23636 + }, + { + "epoch": 7.255064456721915, + "grad_norm": 0.21128259599208832, + "learning_rate": 1.849575347239431e-05, + "loss": 1.7352, + "step": 23637 + }, + { + "epoch": 7.255371393492941, + "grad_norm": 0.19309113919734955, + "learning_rate": 1.849189385798159e-05, + "loss": 1.7098, + "step": 23638 + }, + { + "epoch": 7.255678330263966, + "grad_norm": 0.1877751648426056, + "learning_rate": 1.848803455495044e-05, + "loss": 1.7279, + "step": 23639 + }, + { + 
"epoch": 7.2559852670349905, + "grad_norm": 0.18840502202510834, + "learning_rate": 1.8484175563339e-05, + "loss": 1.7174, + "step": 23640 + }, + { + "epoch": 7.256292203806016, + "grad_norm": 0.1912582963705063, + "learning_rate": 1.848031688318541e-05, + "loss": 1.6964, + "step": 23641 + }, + { + "epoch": 7.256599140577041, + "grad_norm": 0.188243106007576, + "learning_rate": 1.847645851452779e-05, + "loss": 1.7296, + "step": 23642 + }, + { + "epoch": 7.2569060773480665, + "grad_norm": 0.15838554501533508, + "learning_rate": 1.8472600457404317e-05, + "loss": 1.6276, + "step": 23643 + }, + { + "epoch": 7.257213014119092, + "grad_norm": 0.1605941653251648, + "learning_rate": 1.8468742711853065e-05, + "loss": 1.7015, + "step": 23644 + }, + { + "epoch": 7.257519950890116, + "grad_norm": 0.23647825419902802, + "learning_rate": 1.846488527791218e-05, + "loss": 1.775, + "step": 23645 + }, + { + "epoch": 7.257826887661142, + "grad_norm": 0.2414257973432541, + "learning_rate": 1.846102815561978e-05, + "loss": 1.7456, + "step": 23646 + }, + { + "epoch": 7.258133824432167, + "grad_norm": 0.221851646900177, + "learning_rate": 1.845717134501397e-05, + "loss": 1.6875, + "step": 23647 + }, + { + "epoch": 7.258440761203192, + "grad_norm": 0.20732705295085907, + "learning_rate": 1.8453314846132914e-05, + "loss": 1.6619, + "step": 23648 + }, + { + "epoch": 7.258747697974218, + "grad_norm": 0.18818728625774384, + "learning_rate": 1.8449458659014657e-05, + "loss": 1.6961, + "step": 23649 + }, + { + "epoch": 7.259054634745242, + "grad_norm": 0.19335074722766876, + "learning_rate": 1.8445602783697374e-05, + "loss": 1.6816, + "step": 23650 + }, + { + "epoch": 7.259361571516267, + "grad_norm": 0.27334100008010864, + "learning_rate": 1.844174722021911e-05, + "loss": 1.7435, + "step": 23651 + }, + { + "epoch": 7.259668508287293, + "grad_norm": 0.18763858079910278, + "learning_rate": 1.843789196861801e-05, + "loss": 1.713, + "step": 23652 + }, + { + "epoch": 7.259975445058318, + 
"grad_norm": 0.2585131525993347, + "learning_rate": 1.843403702893216e-05, + "loss": 1.7151, + "step": 23653 + }, + { + "epoch": 7.260282381829343, + "grad_norm": 0.182148277759552, + "learning_rate": 1.843018240119966e-05, + "loss": 1.7018, + "step": 23654 + }, + { + "epoch": 7.260589318600369, + "grad_norm": 0.31881436705589294, + "learning_rate": 1.84263280854586e-05, + "loss": 1.7428, + "step": 23655 + }, + { + "epoch": 7.260896255371393, + "grad_norm": 0.20997895300388336, + "learning_rate": 1.8422474081747073e-05, + "loss": 1.724, + "step": 23656 + }, + { + "epoch": 7.2612031921424185, + "grad_norm": 0.25038522481918335, + "learning_rate": 1.8418620390103163e-05, + "loss": 1.739, + "step": 23657 + }, + { + "epoch": 7.261510128913444, + "grad_norm": 0.22313323616981506, + "learning_rate": 1.841476701056496e-05, + "loss": 1.7493, + "step": 23658 + }, + { + "epoch": 7.261817065684469, + "grad_norm": 0.22516389191150665, + "learning_rate": 1.8410913943170522e-05, + "loss": 1.79, + "step": 23659 + }, + { + "epoch": 7.2621240024554945, + "grad_norm": 0.1966279298067093, + "learning_rate": 1.8407061187957982e-05, + "loss": 1.7418, + "step": 23660 + }, + { + "epoch": 7.262430939226519, + "grad_norm": 0.18697889149188995, + "learning_rate": 1.840320874496536e-05, + "loss": 1.7347, + "step": 23661 + }, + { + "epoch": 7.262737875997544, + "grad_norm": 0.18226566910743713, + "learning_rate": 1.8399356614230755e-05, + "loss": 1.6979, + "step": 23662 + }, + { + "epoch": 7.26304481276857, + "grad_norm": 0.18880577385425568, + "learning_rate": 1.839550479579223e-05, + "loss": 1.6612, + "step": 23663 + }, + { + "epoch": 7.263351749539595, + "grad_norm": 0.2048085480928421, + "learning_rate": 1.8391653289687826e-05, + "loss": 1.7313, + "step": 23664 + }, + { + "epoch": 7.26365868631062, + "grad_norm": 0.238912895321846, + "learning_rate": 1.838780209595567e-05, + "loss": 1.7522, + "step": 23665 + }, + { + "epoch": 7.263965623081646, + "grad_norm": 0.1656452864408493, + 
"learning_rate": 1.838395121463375e-05, + "loss": 1.6742, + "step": 23666 + }, + { + "epoch": 7.26427255985267, + "grad_norm": 0.2209266573190689, + "learning_rate": 1.8380100645760186e-05, + "loss": 1.6592, + "step": 23667 + }, + { + "epoch": 7.264579496623695, + "grad_norm": 0.19701217114925385, + "learning_rate": 1.8376250389372967e-05, + "loss": 1.7211, + "step": 23668 + }, + { + "epoch": 7.264886433394721, + "grad_norm": 0.229326069355011, + "learning_rate": 1.837240044551019e-05, + "loss": 1.7044, + "step": 23669 + }, + { + "epoch": 7.265193370165746, + "grad_norm": 0.18499960005283356, + "learning_rate": 1.8368550814209894e-05, + "loss": 1.705, + "step": 23670 + }, + { + "epoch": 7.265500306936771, + "grad_norm": 0.25504955649375916, + "learning_rate": 1.8364701495510117e-05, + "loss": 1.7246, + "step": 23671 + }, + { + "epoch": 7.265807243707796, + "grad_norm": 0.25998997688293457, + "learning_rate": 1.8360852489448903e-05, + "loss": 1.8311, + "step": 23672 + }, + { + "epoch": 7.266114180478821, + "grad_norm": 0.2437162697315216, + "learning_rate": 1.8357003796064294e-05, + "loss": 1.6467, + "step": 23673 + }, + { + "epoch": 7.2664211172498465, + "grad_norm": 0.20784614980220795, + "learning_rate": 1.8353155415394315e-05, + "loss": 1.7361, + "step": 23674 + }, + { + "epoch": 7.266728054020872, + "grad_norm": 0.22633932530879974, + "learning_rate": 1.8349307347476998e-05, + "loss": 1.6518, + "step": 23675 + }, + { + "epoch": 7.267034990791897, + "grad_norm": 0.19307547807693481, + "learning_rate": 1.8345459592350367e-05, + "loss": 1.7469, + "step": 23676 + }, + { + "epoch": 7.267341927562922, + "grad_norm": 0.20418168604373932, + "learning_rate": 1.8341612150052483e-05, + "loss": 1.6892, + "step": 23677 + }, + { + "epoch": 7.267648864333947, + "grad_norm": 0.1574825942516327, + "learning_rate": 1.8337765020621332e-05, + "loss": 1.6682, + "step": 23678 + }, + { + "epoch": 7.267955801104972, + "grad_norm": 0.31023111939430237, + "learning_rate": 
1.8333918204094947e-05, + "loss": 1.7382, + "step": 23679 + }, + { + "epoch": 7.268262737875998, + "grad_norm": 0.18148623406887054, + "learning_rate": 1.833007170051134e-05, + "loss": 1.726, + "step": 23680 + }, + { + "epoch": 7.268569674647023, + "grad_norm": 0.19278696179389954, + "learning_rate": 1.832622550990851e-05, + "loss": 1.7176, + "step": 23681 + }, + { + "epoch": 7.268876611418047, + "grad_norm": 0.18298377096652985, + "learning_rate": 1.832237963232452e-05, + "loss": 1.6703, + "step": 23682 + }, + { + "epoch": 7.269183548189073, + "grad_norm": 0.2019357681274414, + "learning_rate": 1.8318534067797304e-05, + "loss": 1.7771, + "step": 23683 + }, + { + "epoch": 7.269490484960098, + "grad_norm": 0.21978864073753357, + "learning_rate": 1.8314688816364944e-05, + "loss": 1.7938, + "step": 23684 + }, + { + "epoch": 7.269797421731123, + "grad_norm": 0.20009377598762512, + "learning_rate": 1.831084387806536e-05, + "loss": 1.7312, + "step": 23685 + }, + { + "epoch": 7.270104358502149, + "grad_norm": 0.16587263345718384, + "learning_rate": 1.8306999252936608e-05, + "loss": 1.7098, + "step": 23686 + }, + { + "epoch": 7.270411295273174, + "grad_norm": 0.20567362010478973, + "learning_rate": 1.8303154941016666e-05, + "loss": 1.6893, + "step": 23687 + }, + { + "epoch": 7.2707182320441985, + "grad_norm": 0.1916830986738205, + "learning_rate": 1.8299310942343527e-05, + "loss": 1.7995, + "step": 23688 + }, + { + "epoch": 7.271025168815224, + "grad_norm": 0.18361486494541168, + "learning_rate": 1.8295467256955174e-05, + "loss": 1.6708, + "step": 23689 + }, + { + "epoch": 7.271332105586249, + "grad_norm": 0.20620734989643097, + "learning_rate": 1.8291623884889597e-05, + "loss": 1.7314, + "step": 23690 + }, + { + "epoch": 7.2716390423572745, + "grad_norm": 0.22560660541057587, + "learning_rate": 1.828778082618478e-05, + "loss": 1.7418, + "step": 23691 + }, + { + "epoch": 7.2719459791283, + "grad_norm": 0.2113492786884308, + "learning_rate": 1.8283938080878697e-05, + 
"loss": 1.724, + "step": 23692 + }, + { + "epoch": 7.272252915899324, + "grad_norm": 0.26234012842178345, + "learning_rate": 1.8280095649009334e-05, + "loss": 1.7723, + "step": 23693 + }, + { + "epoch": 7.27255985267035, + "grad_norm": 0.1675095111131668, + "learning_rate": 1.827625353061465e-05, + "loss": 1.7473, + "step": 23694 + }, + { + "epoch": 7.272866789441375, + "grad_norm": 0.17751236259937286, + "learning_rate": 1.8272411725732623e-05, + "loss": 1.7374, + "step": 23695 + }, + { + "epoch": 7.2731737262124, + "grad_norm": 0.23158904910087585, + "learning_rate": 1.826857023440122e-05, + "loss": 1.8111, + "step": 23696 + }, + { + "epoch": 7.273480662983426, + "grad_norm": 0.17262183129787445, + "learning_rate": 1.8264729056658407e-05, + "loss": 1.7546, + "step": 23697 + }, + { + "epoch": 7.273787599754451, + "grad_norm": 0.20811094343662262, + "learning_rate": 1.8260888192542126e-05, + "loss": 1.8059, + "step": 23698 + }, + { + "epoch": 7.274094536525475, + "grad_norm": 0.17156411707401276, + "learning_rate": 1.825704764209038e-05, + "loss": 1.7261, + "step": 23699 + }, + { + "epoch": 7.274401473296501, + "grad_norm": 0.18523572385311127, + "learning_rate": 1.8253207405341067e-05, + "loss": 1.7139, + "step": 23700 + }, + { + "epoch": 7.274708410067526, + "grad_norm": 0.20626066625118256, + "learning_rate": 1.824936748233219e-05, + "loss": 1.7269, + "step": 23701 + }, + { + "epoch": 7.2750153468385514, + "grad_norm": 0.1717548966407776, + "learning_rate": 1.8245527873101647e-05, + "loss": 1.7168, + "step": 23702 + }, + { + "epoch": 7.275322283609577, + "grad_norm": 0.16322405636310577, + "learning_rate": 1.8241688577687426e-05, + "loss": 1.7392, + "step": 23703 + }, + { + "epoch": 7.275629220380601, + "grad_norm": 0.19775766134262085, + "learning_rate": 1.8237849596127447e-05, + "loss": 1.7055, + "step": 23704 + }, + { + "epoch": 7.275936157151627, + "grad_norm": 0.1969427913427353, + "learning_rate": 1.823401092845966e-05, + "loss": 1.7418, + "step": 23705 + 
}, + { + "epoch": 7.276243093922652, + "grad_norm": 0.1791812628507614, + "learning_rate": 1.8230172574721992e-05, + "loss": 1.6512, + "step": 23706 + }, + { + "epoch": 7.276550030693677, + "grad_norm": 0.18583156168460846, + "learning_rate": 1.8226334534952384e-05, + "loss": 1.7357, + "step": 23707 + }, + { + "epoch": 7.276856967464703, + "grad_norm": 0.20729652047157288, + "learning_rate": 1.822249680918876e-05, + "loss": 1.7323, + "step": 23708 + }, + { + "epoch": 7.277163904235728, + "grad_norm": 0.20089028775691986, + "learning_rate": 1.8218659397469045e-05, + "loss": 1.6835, + "step": 23709 + }, + { + "epoch": 7.277470841006752, + "grad_norm": 0.16569854319095612, + "learning_rate": 1.8214822299831168e-05, + "loss": 1.7486, + "step": 23710 + }, + { + "epoch": 7.277777777777778, + "grad_norm": 0.19979944825172424, + "learning_rate": 1.8210985516313044e-05, + "loss": 1.7338, + "step": 23711 + }, + { + "epoch": 7.278084714548803, + "grad_norm": 0.23528912663459778, + "learning_rate": 1.82071490469526e-05, + "loss": 1.8086, + "step": 23712 + }, + { + "epoch": 7.278391651319828, + "grad_norm": 0.18231599032878876, + "learning_rate": 1.8203312891787737e-05, + "loss": 1.744, + "step": 23713 + }, + { + "epoch": 7.278698588090854, + "grad_norm": 0.2208651602268219, + "learning_rate": 1.8199477050856374e-05, + "loss": 1.7592, + "step": 23714 + }, + { + "epoch": 7.279005524861878, + "grad_norm": 0.22329792380332947, + "learning_rate": 1.8195641524196417e-05, + "loss": 1.7242, + "step": 23715 + }, + { + "epoch": 7.2793124616329035, + "grad_norm": 0.17745757102966309, + "learning_rate": 1.8191806311845778e-05, + "loss": 1.7162, + "step": 23716 + }, + { + "epoch": 7.279619398403929, + "grad_norm": 0.19536735117435455, + "learning_rate": 1.8187971413842324e-05, + "loss": 1.6814, + "step": 23717 + }, + { + "epoch": 7.279926335174954, + "grad_norm": 0.21853455901145935, + "learning_rate": 1.8184136830224025e-05, + "loss": 1.7049, + "step": 23718 + }, + { + "epoch": 
7.2802332719459795, + "grad_norm": 0.1701575070619583, + "learning_rate": 1.8180302561028696e-05, + "loss": 1.6879, + "step": 23719 + }, + { + "epoch": 7.280540208717004, + "grad_norm": 0.18729525804519653, + "learning_rate": 1.8176468606294288e-05, + "loss": 1.6944, + "step": 23720 + }, + { + "epoch": 7.280847145488029, + "grad_norm": 0.20020832121372223, + "learning_rate": 1.8172634966058667e-05, + "loss": 1.7415, + "step": 23721 + }, + { + "epoch": 7.281154082259055, + "grad_norm": 0.1983461081981659, + "learning_rate": 1.8168801640359724e-05, + "loss": 1.7198, + "step": 23722 + }, + { + "epoch": 7.28146101903008, + "grad_norm": 0.17578791081905365, + "learning_rate": 1.8164968629235334e-05, + "loss": 1.7155, + "step": 23723 + }, + { + "epoch": 7.281767955801105, + "grad_norm": 0.1944401115179062, + "learning_rate": 1.8161135932723388e-05, + "loss": 1.7579, + "step": 23724 + }, + { + "epoch": 7.28207489257213, + "grad_norm": 0.20413067936897278, + "learning_rate": 1.8157303550861753e-05, + "loss": 1.7105, + "step": 23725 + }, + { + "epoch": 7.282381829343155, + "grad_norm": 0.17515964806079865, + "learning_rate": 1.8153471483688318e-05, + "loss": 1.7448, + "step": 23726 + }, + { + "epoch": 7.28268876611418, + "grad_norm": 0.2039034515619278, + "learning_rate": 1.8149639731240938e-05, + "loss": 1.691, + "step": 23727 + }, + { + "epoch": 7.282995702885206, + "grad_norm": 0.2136354148387909, + "learning_rate": 1.8145808293557483e-05, + "loss": 1.656, + "step": 23728 + }, + { + "epoch": 7.283302639656231, + "grad_norm": 0.23029537498950958, + "learning_rate": 1.814197717067582e-05, + "loss": 1.7588, + "step": 23729 + }, + { + "epoch": 7.283609576427256, + "grad_norm": 0.371910035610199, + "learning_rate": 1.8138146362633816e-05, + "loss": 1.8138, + "step": 23730 + }, + { + "epoch": 7.283916513198281, + "grad_norm": 0.2273472249507904, + "learning_rate": 1.8134315869469327e-05, + "loss": 1.6985, + "step": 23731 + }, + { + "epoch": 7.284223449969306, + "grad_norm": 
0.33206698298454285, + "learning_rate": 1.81304856912202e-05, + "loss": 1.7015, + "step": 23732 + }, + { + "epoch": 7.2845303867403315, + "grad_norm": 0.20799405872821808, + "learning_rate": 1.8126655827924295e-05, + "loss": 1.6932, + "step": 23733 + }, + { + "epoch": 7.284837323511357, + "grad_norm": 0.28721246123313904, + "learning_rate": 1.8122826279619437e-05, + "loss": 1.7726, + "step": 23734 + }, + { + "epoch": 7.285144260282382, + "grad_norm": 0.2365201711654663, + "learning_rate": 1.8118997046343533e-05, + "loss": 1.7609, + "step": 23735 + }, + { + "epoch": 7.285451197053407, + "grad_norm": 0.24772630631923676, + "learning_rate": 1.811516812813435e-05, + "loss": 1.7057, + "step": 23736 + }, + { + "epoch": 7.285758133824432, + "grad_norm": 0.19344007968902588, + "learning_rate": 1.8111339525029802e-05, + "loss": 1.7526, + "step": 23737 + }, + { + "epoch": 7.286065070595457, + "grad_norm": 0.2454877346754074, + "learning_rate": 1.8107511237067648e-05, + "loss": 1.6474, + "step": 23738 + }, + { + "epoch": 7.286372007366483, + "grad_norm": 0.18084865808486938, + "learning_rate": 1.810368326428578e-05, + "loss": 1.7381, + "step": 23739 + }, + { + "epoch": 7.286678944137508, + "grad_norm": 0.26264744997024536, + "learning_rate": 1.8099855606722012e-05, + "loss": 1.6585, + "step": 23740 + }, + { + "epoch": 7.286985880908533, + "grad_norm": 0.20219333469867706, + "learning_rate": 1.809602826441416e-05, + "loss": 1.7552, + "step": 23741 + }, + { + "epoch": 7.287292817679558, + "grad_norm": 0.23982326686382294, + "learning_rate": 1.8092201237400064e-05, + "loss": 1.6784, + "step": 23742 + }, + { + "epoch": 7.287599754450583, + "grad_norm": 0.22838538885116577, + "learning_rate": 1.8088374525717534e-05, + "loss": 1.6976, + "step": 23743 + }, + { + "epoch": 7.287906691221608, + "grad_norm": 0.22077307105064392, + "learning_rate": 1.8084548129404395e-05, + "loss": 1.721, + "step": 23744 + }, + { + "epoch": 7.288213627992634, + "grad_norm": 0.19811047613620758, + 
"learning_rate": 1.8080722048498448e-05, + "loss": 1.7317, + "step": 23745 + }, + { + "epoch": 7.288520564763659, + "grad_norm": 0.25160667300224304, + "learning_rate": 1.8076896283037525e-05, + "loss": 1.7725, + "step": 23746 + }, + { + "epoch": 7.2888275015346835, + "grad_norm": 0.19819392263889313, + "learning_rate": 1.807307083305942e-05, + "loss": 1.7243, + "step": 23747 + }, + { + "epoch": 7.289134438305709, + "grad_norm": 0.21769097447395325, + "learning_rate": 1.806924569860194e-05, + "loss": 1.74, + "step": 23748 + }, + { + "epoch": 7.289441375076734, + "grad_norm": 0.23126530647277832, + "learning_rate": 1.806542087970289e-05, + "loss": 1.7479, + "step": 23749 + }, + { + "epoch": 7.2897483118477595, + "grad_norm": 0.21002748608589172, + "learning_rate": 1.8061596376400065e-05, + "loss": 1.6547, + "step": 23750 + }, + { + "epoch": 7.290055248618785, + "grad_norm": 0.242569699883461, + "learning_rate": 1.8057772188731255e-05, + "loss": 1.7587, + "step": 23751 + }, + { + "epoch": 7.290362185389809, + "grad_norm": 0.19619157910346985, + "learning_rate": 1.8053948316734287e-05, + "loss": 1.6619, + "step": 23752 + }, + { + "epoch": 7.290669122160835, + "grad_norm": 0.2086232304573059, + "learning_rate": 1.8050124760446896e-05, + "loss": 1.6535, + "step": 23753 + }, + { + "epoch": 7.29097605893186, + "grad_norm": 0.1955464631319046, + "learning_rate": 1.8046301519906932e-05, + "loss": 1.6814, + "step": 23754 + }, + { + "epoch": 7.291282995702885, + "grad_norm": 0.20373155176639557, + "learning_rate": 1.8042478595152117e-05, + "loss": 1.7006, + "step": 23755 + }, + { + "epoch": 7.291589932473911, + "grad_norm": 0.20233015716075897, + "learning_rate": 1.8038655986220272e-05, + "loss": 1.7478, + "step": 23756 + }, + { + "epoch": 7.291896869244935, + "grad_norm": 0.18800894916057587, + "learning_rate": 1.803483369314916e-05, + "loss": 1.747, + "step": 23757 + }, + { + "epoch": 7.29220380601596, + "grad_norm": 0.1838926076889038, + "learning_rate": 
1.8031011715976558e-05, + "loss": 1.7086, + "step": 23758 + }, + { + "epoch": 7.292510742786986, + "grad_norm": 0.1806635707616806, + "learning_rate": 1.8027190054740234e-05, + "loss": 1.6682, + "step": 23759 + }, + { + "epoch": 7.292817679558011, + "grad_norm": 0.19762687385082245, + "learning_rate": 1.802336870947796e-05, + "loss": 1.7514, + "step": 23760 + }, + { + "epoch": 7.293124616329036, + "grad_norm": 0.1739082932472229, + "learning_rate": 1.80195476802275e-05, + "loss": 1.7031, + "step": 23761 + }, + { + "epoch": 7.293431553100062, + "grad_norm": 0.18887469172477722, + "learning_rate": 1.8015726967026615e-05, + "loss": 1.7199, + "step": 23762 + }, + { + "epoch": 7.293738489871086, + "grad_norm": 0.17344269156455994, + "learning_rate": 1.8011906569913056e-05, + "loss": 1.693, + "step": 23763 + }, + { + "epoch": 7.2940454266421115, + "grad_norm": 0.16480129957199097, + "learning_rate": 1.800808648892459e-05, + "loss": 1.722, + "step": 23764 + }, + { + "epoch": 7.294352363413137, + "grad_norm": 0.17336638271808624, + "learning_rate": 1.8004266724098963e-05, + "loss": 1.6635, + "step": 23765 + }, + { + "epoch": 7.294659300184162, + "grad_norm": 0.16539151966571808, + "learning_rate": 1.8000447275473925e-05, + "loss": 1.7709, + "step": 23766 + }, + { + "epoch": 7.2949662369551875, + "grad_norm": 0.20660065114498138, + "learning_rate": 1.7996628143087226e-05, + "loss": 1.7262, + "step": 23767 + }, + { + "epoch": 7.295273173726212, + "grad_norm": 0.2292039543390274, + "learning_rate": 1.7992809326976584e-05, + "loss": 1.7444, + "step": 23768 + }, + { + "epoch": 7.295580110497237, + "grad_norm": 0.20323103666305542, + "learning_rate": 1.7988990827179795e-05, + "loss": 1.7456, + "step": 23769 + }, + { + "epoch": 7.295887047268263, + "grad_norm": 0.16919885575771332, + "learning_rate": 1.7985172643734532e-05, + "loss": 1.7304, + "step": 23770 + }, + { + "epoch": 7.296193984039288, + "grad_norm": 0.19135236740112305, + "learning_rate": 1.798135477667859e-05, + 
"loss": 1.7067, + "step": 23771 + }, + { + "epoch": 7.296500920810313, + "grad_norm": 0.19812993705272675, + "learning_rate": 1.7977537226049627e-05, + "loss": 1.7701, + "step": 23772 + }, + { + "epoch": 7.296807857581339, + "grad_norm": 0.22823916375637054, + "learning_rate": 1.797371999188543e-05, + "loss": 1.737, + "step": 23773 + }, + { + "epoch": 7.297114794352363, + "grad_norm": 0.1862197369337082, + "learning_rate": 1.7969903074223705e-05, + "loss": 1.675, + "step": 23774 + }, + { + "epoch": 7.297421731123388, + "grad_norm": 0.18780425190925598, + "learning_rate": 1.7966086473102168e-05, + "loss": 1.7237, + "step": 23775 + }, + { + "epoch": 7.297728667894414, + "grad_norm": 0.174093559384346, + "learning_rate": 1.7962270188558543e-05, + "loss": 1.7129, + "step": 23776 + }, + { + "epoch": 7.298035604665439, + "grad_norm": 0.22659943997859955, + "learning_rate": 1.7958454220630543e-05, + "loss": 1.7257, + "step": 23777 + }, + { + "epoch": 7.298342541436464, + "grad_norm": 0.18077917397022247, + "learning_rate": 1.7954638569355875e-05, + "loss": 1.6972, + "step": 23778 + }, + { + "epoch": 7.298649478207489, + "grad_norm": 0.18380658328533173, + "learning_rate": 1.795082323477225e-05, + "loss": 1.6577, + "step": 23779 + }, + { + "epoch": 7.298956414978514, + "grad_norm": 0.17016704380512238, + "learning_rate": 1.7947008216917384e-05, + "loss": 1.7222, + "step": 23780 + }, + { + "epoch": 7.2992633517495396, + "grad_norm": 0.2016153484582901, + "learning_rate": 1.794319351582896e-05, + "loss": 1.6833, + "step": 23781 + }, + { + "epoch": 7.299570288520565, + "grad_norm": 0.26723918318748474, + "learning_rate": 1.7939379131544687e-05, + "loss": 1.7417, + "step": 23782 + }, + { + "epoch": 7.29987722529159, + "grad_norm": 0.2555576264858246, + "learning_rate": 1.7935565064102267e-05, + "loss": 1.7373, + "step": 23783 + }, + { + "epoch": 7.300184162062616, + "grad_norm": 0.2036418914794922, + "learning_rate": 1.793175131353938e-05, + "loss": 1.7052, + "step": 23784 + 
}, + { + "epoch": 7.30049109883364, + "grad_norm": 0.1789570152759552, + "learning_rate": 1.792793787989371e-05, + "loss": 1.6327, + "step": 23785 + }, + { + "epoch": 7.300798035604665, + "grad_norm": 0.2353249490261078, + "learning_rate": 1.7924124763202987e-05, + "loss": 1.7771, + "step": 23786 + }, + { + "epoch": 7.301104972375691, + "grad_norm": 0.19072949886322021, + "learning_rate": 1.792031196350483e-05, + "loss": 1.7095, + "step": 23787 + }, + { + "epoch": 7.301411909146716, + "grad_norm": 0.24063248932361603, + "learning_rate": 1.791649948083699e-05, + "loss": 1.7247, + "step": 23788 + }, + { + "epoch": 7.301718845917741, + "grad_norm": 0.1916036456823349, + "learning_rate": 1.791268731523707e-05, + "loss": 1.6844, + "step": 23789 + }, + { + "epoch": 7.302025782688766, + "grad_norm": 0.2606290876865387, + "learning_rate": 1.7908875466742797e-05, + "loss": 1.771, + "step": 23790 + }, + { + "epoch": 7.302332719459791, + "grad_norm": 0.23444804549217224, + "learning_rate": 1.7905063935391824e-05, + "loss": 1.747, + "step": 23791 + }, + { + "epoch": 7.3026396562308165, + "grad_norm": 0.28058725595474243, + "learning_rate": 1.7901252721221822e-05, + "loss": 1.7284, + "step": 23792 + }, + { + "epoch": 7.302946593001842, + "grad_norm": 0.23268578946590424, + "learning_rate": 1.7897441824270456e-05, + "loss": 1.7222, + "step": 23793 + }, + { + "epoch": 7.303253529772867, + "grad_norm": 0.275336354970932, + "learning_rate": 1.789363124457539e-05, + "loss": 1.7495, + "step": 23794 + }, + { + "epoch": 7.303560466543892, + "grad_norm": 0.21838977932929993, + "learning_rate": 1.788982098217427e-05, + "loss": 1.725, + "step": 23795 + }, + { + "epoch": 7.303867403314917, + "grad_norm": 0.24108058214187622, + "learning_rate": 1.7886011037104767e-05, + "loss": 1.7804, + "step": 23796 + }, + { + "epoch": 7.304174340085942, + "grad_norm": 0.23003144562244415, + "learning_rate": 1.788220140940452e-05, + "loss": 1.8189, + "step": 23797 + }, + { + "epoch": 7.304481276856968, + 
"grad_norm": 0.20129653811454773, + "learning_rate": 1.7878392099111186e-05, + "loss": 1.6603, + "step": 23798 + }, + { + "epoch": 7.304788213627993, + "grad_norm": 0.26172930002212524, + "learning_rate": 1.7874583106262404e-05, + "loss": 1.7095, + "step": 23799 + }, + { + "epoch": 7.305095150399017, + "grad_norm": 0.212156742811203, + "learning_rate": 1.7870774430895825e-05, + "loss": 1.7272, + "step": 23800 + }, + { + "epoch": 7.305402087170043, + "grad_norm": 0.2775247097015381, + "learning_rate": 1.7866966073049084e-05, + "loss": 1.773, + "step": 23801 + }, + { + "epoch": 7.305709023941068, + "grad_norm": 0.23456308245658875, + "learning_rate": 1.7863158032759803e-05, + "loss": 1.7173, + "step": 23802 + }, + { + "epoch": 7.306015960712093, + "grad_norm": 0.23986588418483734, + "learning_rate": 1.785935031006566e-05, + "loss": 1.6924, + "step": 23803 + }, + { + "epoch": 7.306322897483119, + "grad_norm": 0.1909915804862976, + "learning_rate": 1.7855542905004225e-05, + "loss": 1.7047, + "step": 23804 + }, + { + "epoch": 7.306629834254144, + "grad_norm": 0.20676325261592865, + "learning_rate": 1.7851735817613192e-05, + "loss": 1.6606, + "step": 23805 + }, + { + "epoch": 7.3069367710251685, + "grad_norm": 0.1910121887922287, + "learning_rate": 1.7847929047930106e-05, + "loss": 1.7555, + "step": 23806 + }, + { + "epoch": 7.307243707796194, + "grad_norm": 0.22737936675548553, + "learning_rate": 1.784412259599265e-05, + "loss": 1.7346, + "step": 23807 + }, + { + "epoch": 7.307550644567219, + "grad_norm": 0.1553424894809723, + "learning_rate": 1.7840316461838426e-05, + "loss": 1.6755, + "step": 23808 + }, + { + "epoch": 7.3078575813382445, + "grad_norm": 0.17937089502811432, + "learning_rate": 1.7836510645505044e-05, + "loss": 1.684, + "step": 23809 + }, + { + "epoch": 7.30816451810927, + "grad_norm": 0.20183639228343964, + "learning_rate": 1.783270514703011e-05, + "loss": 1.7617, + "step": 23810 + }, + { + "epoch": 7.308471454880294, + "grad_norm": 0.21359068155288696, 
+ "learning_rate": 1.782889996645124e-05, + "loss": 1.6897, + "step": 23811 + }, + { + "epoch": 7.30877839165132, + "grad_norm": 0.19640007615089417, + "learning_rate": 1.782509510380604e-05, + "loss": 1.7029, + "step": 23812 + }, + { + "epoch": 7.309085328422345, + "grad_norm": 0.22678261995315552, + "learning_rate": 1.7821290559132104e-05, + "loss": 1.7241, + "step": 23813 + }, + { + "epoch": 7.30939226519337, + "grad_norm": 0.1797642707824707, + "learning_rate": 1.7817486332467037e-05, + "loss": 1.7127, + "step": 23814 + }, + { + "epoch": 7.309699201964396, + "grad_norm": 0.18758134543895721, + "learning_rate": 1.7813682423848432e-05, + "loss": 1.7394, + "step": 23815 + }, + { + "epoch": 7.310006138735421, + "grad_norm": 0.2064354121685028, + "learning_rate": 1.7809878833313887e-05, + "loss": 1.7477, + "step": 23816 + }, + { + "epoch": 7.310313075506445, + "grad_norm": 0.30564701557159424, + "learning_rate": 1.780607556090098e-05, + "loss": 1.7006, + "step": 23817 + }, + { + "epoch": 7.310620012277471, + "grad_norm": 0.23694200813770294, + "learning_rate": 1.7802272606647308e-05, + "loss": 1.7821, + "step": 23818 + }, + { + "epoch": 7.310926949048496, + "grad_norm": 0.20436422526836395, + "learning_rate": 1.779846997059043e-05, + "loss": 1.6681, + "step": 23819 + }, + { + "epoch": 7.311233885819521, + "grad_norm": 0.21899428963661194, + "learning_rate": 1.779466765276798e-05, + "loss": 1.7416, + "step": 23820 + }, + { + "epoch": 7.311540822590547, + "grad_norm": 0.24186378717422485, + "learning_rate": 1.779086565321747e-05, + "loss": 1.7258, + "step": 23821 + }, + { + "epoch": 7.311847759361571, + "grad_norm": 0.22940407693386078, + "learning_rate": 1.778706397197653e-05, + "loss": 1.7211, + "step": 23822 + }, + { + "epoch": 7.3121546961325965, + "grad_norm": 0.18643233180046082, + "learning_rate": 1.778326260908268e-05, + "loss": 1.6778, + "step": 23823 + }, + { + "epoch": 7.312461632903622, + "grad_norm": 0.25372037291526794, + "learning_rate": 
1.7779461564573526e-05, + "loss": 1.7252, + "step": 23824 + }, + { + "epoch": 7.312768569674647, + "grad_norm": 0.21126380562782288, + "learning_rate": 1.7775660838486612e-05, + "loss": 1.6655, + "step": 23825 + }, + { + "epoch": 7.3130755064456725, + "grad_norm": 0.19614748656749725, + "learning_rate": 1.777186043085951e-05, + "loss": 1.7223, + "step": 23826 + }, + { + "epoch": 7.313382443216697, + "grad_norm": 0.2111951857805252, + "learning_rate": 1.7768060341729768e-05, + "loss": 1.708, + "step": 23827 + }, + { + "epoch": 7.313689379987722, + "grad_norm": 0.2675856053829193, + "learning_rate": 1.7764260571134956e-05, + "loss": 1.7387, + "step": 23828 + }, + { + "epoch": 7.313996316758748, + "grad_norm": 0.19827900826931, + "learning_rate": 1.7760461119112603e-05, + "loss": 1.6809, + "step": 23829 + }, + { + "epoch": 7.314303253529773, + "grad_norm": 0.24213160574436188, + "learning_rate": 1.775666198570028e-05, + "loss": 1.7064, + "step": 23830 + }, + { + "epoch": 7.314610190300798, + "grad_norm": 0.20035916566848755, + "learning_rate": 1.7752863170935514e-05, + "loss": 1.6874, + "step": 23831 + }, + { + "epoch": 7.314917127071823, + "grad_norm": 0.23662878572940826, + "learning_rate": 1.774906467485586e-05, + "loss": 1.7651, + "step": 23832 + }, + { + "epoch": 7.315224063842848, + "grad_norm": 0.18523871898651123, + "learning_rate": 1.7745266497498847e-05, + "loss": 1.7003, + "step": 23833 + }, + { + "epoch": 7.315531000613873, + "grad_norm": 0.21452756226062775, + "learning_rate": 1.7741468638902016e-05, + "loss": 1.7012, + "step": 23834 + }, + { + "epoch": 7.315837937384899, + "grad_norm": 0.17513468861579895, + "learning_rate": 1.7737671099102904e-05, + "loss": 1.6965, + "step": 23835 + }, + { + "epoch": 7.316144874155924, + "grad_norm": 0.29025998711586, + "learning_rate": 1.7733873878139012e-05, + "loss": 1.7347, + "step": 23836 + }, + { + "epoch": 7.316451810926949, + "grad_norm": 0.14812500774860382, + "learning_rate": 1.7730076976047926e-05, + "loss": 
1.6469, + "step": 23837 + }, + { + "epoch": 7.316758747697974, + "grad_norm": 0.23575027287006378, + "learning_rate": 1.77262803928671e-05, + "loss": 1.7267, + "step": 23838 + }, + { + "epoch": 7.317065684468999, + "grad_norm": 0.17986448109149933, + "learning_rate": 1.7722484128634125e-05, + "loss": 1.7206, + "step": 23839 + }, + { + "epoch": 7.3173726212400245, + "grad_norm": 0.22515927255153656, + "learning_rate": 1.7718688183386446e-05, + "loss": 1.7216, + "step": 23840 + }, + { + "epoch": 7.31767955801105, + "grad_norm": 0.1903398036956787, + "learning_rate": 1.7714892557161624e-05, + "loss": 1.7108, + "step": 23841 + }, + { + "epoch": 7.317986494782075, + "grad_norm": 0.23623183369636536, + "learning_rate": 1.7711097249997162e-05, + "loss": 1.6866, + "step": 23842 + }, + { + "epoch": 7.3182934315531, + "grad_norm": 0.18501855432987213, + "learning_rate": 1.7707302261930554e-05, + "loss": 1.6643, + "step": 23843 + }, + { + "epoch": 7.318600368324125, + "grad_norm": 0.21865275502204895, + "learning_rate": 1.770350759299932e-05, + "loss": 1.6932, + "step": 23844 + }, + { + "epoch": 7.31890730509515, + "grad_norm": 0.22363261878490448, + "learning_rate": 1.7699713243240945e-05, + "loss": 1.721, + "step": 23845 + }, + { + "epoch": 7.319214241866176, + "grad_norm": 0.25587835907936096, + "learning_rate": 1.769591921269294e-05, + "loss": 1.7375, + "step": 23846 + }, + { + "epoch": 7.319521178637201, + "grad_norm": 0.22086483240127563, + "learning_rate": 1.76921255013928e-05, + "loss": 1.6957, + "step": 23847 + }, + { + "epoch": 7.319828115408226, + "grad_norm": 0.21197499334812164, + "learning_rate": 1.7688332109378007e-05, + "loss": 1.6993, + "step": 23848 + }, + { + "epoch": 7.320135052179251, + "grad_norm": 0.21211451292037964, + "learning_rate": 1.7684539036686054e-05, + "loss": 1.7329, + "step": 23849 + }, + { + "epoch": 7.320441988950276, + "grad_norm": 0.16938872635364532, + "learning_rate": 1.7680746283354433e-05, + "loss": 1.6895, + "step": 23850 + }, + { + 
"epoch": 7.320748925721301, + "grad_norm": 0.21465681493282318, + "learning_rate": 1.7676953849420613e-05, + "loss": 1.7156, + "step": 23851 + }, + { + "epoch": 7.321055862492327, + "grad_norm": 0.16188180446624756, + "learning_rate": 1.7673161734922084e-05, + "loss": 1.6307, + "step": 23852 + }, + { + "epoch": 7.321362799263352, + "grad_norm": 0.2152155190706253, + "learning_rate": 1.7669369939896302e-05, + "loss": 1.7135, + "step": 23853 + }, + { + "epoch": 7.3216697360343765, + "grad_norm": 0.15789814293384552, + "learning_rate": 1.7665578464380788e-05, + "loss": 1.7269, + "step": 23854 + }, + { + "epoch": 7.321976672805402, + "grad_norm": 0.17263127863407135, + "learning_rate": 1.7661787308412948e-05, + "loss": 1.6624, + "step": 23855 + }, + { + "epoch": 7.322283609576427, + "grad_norm": 0.19711650907993317, + "learning_rate": 1.7657996472030308e-05, + "loss": 1.7837, + "step": 23856 + }, + { + "epoch": 7.3225905463474525, + "grad_norm": 0.1847725212574005, + "learning_rate": 1.765420595527027e-05, + "loss": 1.707, + "step": 23857 + }, + { + "epoch": 7.322897483118478, + "grad_norm": 0.21316368877887726, + "learning_rate": 1.7650415758170345e-05, + "loss": 1.715, + "step": 23858 + }, + { + "epoch": 7.323204419889503, + "grad_norm": 0.1912030428647995, + "learning_rate": 1.7646625880767976e-05, + "loss": 1.7465, + "step": 23859 + }, + { + "epoch": 7.323511356660528, + "grad_norm": 0.16245616972446442, + "learning_rate": 1.7642836323100614e-05, + "loss": 1.7365, + "step": 23860 + }, + { + "epoch": 7.323818293431553, + "grad_norm": 0.20665429532527924, + "learning_rate": 1.76390470852057e-05, + "loss": 1.7435, + "step": 23861 + }, + { + "epoch": 7.324125230202578, + "grad_norm": 0.17079970240592957, + "learning_rate": 1.76352581671207e-05, + "loss": 1.7094, + "step": 23862 + }, + { + "epoch": 7.324432166973604, + "grad_norm": 0.17388395965099335, + "learning_rate": 1.7631469568883042e-05, + "loss": 1.7275, + "step": 23863 + }, + { + "epoch": 7.324739103744629, + 
"grad_norm": 0.20209765434265137, + "learning_rate": 1.7627681290530175e-05, + "loss": 1.7755, + "step": 23864 + }, + { + "epoch": 7.3250460405156534, + "grad_norm": 0.16459977626800537, + "learning_rate": 1.7623893332099538e-05, + "loss": 1.6765, + "step": 23865 + }, + { + "epoch": 7.325352977286679, + "grad_norm": 0.18313255906105042, + "learning_rate": 1.7620105693628556e-05, + "loss": 1.6792, + "step": 23866 + }, + { + "epoch": 7.325659914057704, + "grad_norm": 0.1651672124862671, + "learning_rate": 1.761631837515468e-05, + "loss": 1.6999, + "step": 23867 + }, + { + "epoch": 7.3259668508287294, + "grad_norm": 0.17414255440235138, + "learning_rate": 1.7612531376715317e-05, + "loss": 1.69, + "step": 23868 + }, + { + "epoch": 7.326273787599755, + "grad_norm": 0.1824718415737152, + "learning_rate": 1.7608744698347908e-05, + "loss": 1.6822, + "step": 23869 + }, + { + "epoch": 7.326580724370779, + "grad_norm": 0.19557121396064758, + "learning_rate": 1.760495834008986e-05, + "loss": 1.6852, + "step": 23870 + }, + { + "epoch": 7.326887661141805, + "grad_norm": 0.17803436517715454, + "learning_rate": 1.7601172301978606e-05, + "loss": 1.7523, + "step": 23871 + }, + { + "epoch": 7.32719459791283, + "grad_norm": 0.24077050387859344, + "learning_rate": 1.7597386584051545e-05, + "loss": 1.8044, + "step": 23872 + }, + { + "epoch": 7.327501534683855, + "grad_norm": 0.20061948895454407, + "learning_rate": 1.7593601186346127e-05, + "loss": 1.7298, + "step": 23873 + }, + { + "epoch": 7.327808471454881, + "grad_norm": 0.17362944781780243, + "learning_rate": 1.758981610889971e-05, + "loss": 1.7116, + "step": 23874 + }, + { + "epoch": 7.328115408225905, + "grad_norm": 0.20858663320541382, + "learning_rate": 1.758603135174974e-05, + "loss": 1.6765, + "step": 23875 + }, + { + "epoch": 7.32842234499693, + "grad_norm": 0.1805036962032318, + "learning_rate": 1.7582246914933604e-05, + "loss": 1.694, + "step": 23876 + }, + { + "epoch": 7.328729281767956, + "grad_norm": 0.26010429859161377, 
+ "learning_rate": 1.7578462798488704e-05, + "loss": 1.7373, + "step": 23877 + }, + { + "epoch": 7.329036218538981, + "grad_norm": 0.19902443885803223, + "learning_rate": 1.7574679002452444e-05, + "loss": 1.72, + "step": 23878 + }, + { + "epoch": 7.329343155310006, + "grad_norm": 0.21231114864349365, + "learning_rate": 1.7570895526862202e-05, + "loss": 1.7526, + "step": 23879 + }, + { + "epoch": 7.329650092081032, + "grad_norm": 0.2075740098953247, + "learning_rate": 1.7567112371755384e-05, + "loss": 1.773, + "step": 23880 + }, + { + "epoch": 7.329957028852056, + "grad_norm": 0.21381771564483643, + "learning_rate": 1.756332953716937e-05, + "loss": 1.733, + "step": 23881 + }, + { + "epoch": 7.3302639656230815, + "grad_norm": 0.21689461171627045, + "learning_rate": 1.755954702314155e-05, + "loss": 1.7234, + "step": 23882 + }, + { + "epoch": 7.330570902394107, + "grad_norm": 0.21094383299350739, + "learning_rate": 1.755576482970929e-05, + "loss": 1.7074, + "step": 23883 + }, + { + "epoch": 7.330877839165132, + "grad_norm": 0.18460774421691895, + "learning_rate": 1.7551982956909985e-05, + "loss": 1.6706, + "step": 23884 + }, + { + "epoch": 7.3311847759361575, + "grad_norm": 0.18868015706539154, + "learning_rate": 1.7548201404781e-05, + "loss": 1.6371, + "step": 23885 + }, + { + "epoch": 7.331491712707182, + "grad_norm": 0.18036094307899475, + "learning_rate": 1.7544420173359715e-05, + "loss": 1.7115, + "step": 23886 + }, + { + "epoch": 7.331798649478207, + "grad_norm": 0.17143553495407104, + "learning_rate": 1.754063926268349e-05, + "loss": 1.668, + "step": 23887 + }, + { + "epoch": 7.332105586249233, + "grad_norm": 0.1700706034898758, + "learning_rate": 1.7536858672789684e-05, + "loss": 1.7244, + "step": 23888 + }, + { + "epoch": 7.332412523020258, + "grad_norm": 0.1740385890007019, + "learning_rate": 1.7533078403715665e-05, + "loss": 1.7163, + "step": 23889 + }, + { + "epoch": 7.332719459791283, + "grad_norm": 0.206922248005867, + "learning_rate": 
1.752929845549882e-05, + "loss": 1.7572, + "step": 23890 + }, + { + "epoch": 7.333026396562309, + "grad_norm": 0.22770223021507263, + "learning_rate": 1.7525518828176445e-05, + "loss": 1.7391, + "step": 23891 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.203486829996109, + "learning_rate": 1.7521739521785962e-05, + "loss": 1.7664, + "step": 23892 + }, + { + "epoch": 7.333640270104358, + "grad_norm": 0.15539827942848206, + "learning_rate": 1.7517960536364652e-05, + "loss": 1.675, + "step": 23893 + }, + { + "epoch": 7.333947206875384, + "grad_norm": 0.18226636946201324, + "learning_rate": 1.7514181871949913e-05, + "loss": 1.7097, + "step": 23894 + }, + { + "epoch": 7.334254143646409, + "grad_norm": 0.1522573083639145, + "learning_rate": 1.751040352857907e-05, + "loss": 1.6783, + "step": 23895 + }, + { + "epoch": 7.334561080417434, + "grad_norm": 0.18082024157047272, + "learning_rate": 1.750662550628946e-05, + "loss": 1.752, + "step": 23896 + }, + { + "epoch": 7.334868017188459, + "grad_norm": 0.1968161165714264, + "learning_rate": 1.750284780511844e-05, + "loss": 1.7773, + "step": 23897 + }, + { + "epoch": 7.335174953959484, + "grad_norm": 0.17520470917224884, + "learning_rate": 1.7499070425103286e-05, + "loss": 1.7244, + "step": 23898 + }, + { + "epoch": 7.3354818907305095, + "grad_norm": 0.32224342226982117, + "learning_rate": 1.749529336628139e-05, + "loss": 1.8087, + "step": 23899 + }, + { + "epoch": 7.335788827501535, + "grad_norm": 0.25473707914352417, + "learning_rate": 1.7491516628690053e-05, + "loss": 1.7677, + "step": 23900 + }, + { + "epoch": 7.33609576427256, + "grad_norm": 0.20730654895305634, + "learning_rate": 1.7487740212366604e-05, + "loss": 1.7261, + "step": 23901 + }, + { + "epoch": 7.336402701043585, + "grad_norm": 0.22070205211639404, + "learning_rate": 1.748396411734836e-05, + "loss": 1.8024, + "step": 23902 + }, + { + "epoch": 7.33670963781461, + "grad_norm": 0.16921460628509521, + "learning_rate": 1.7480188343672647e-05, + "loss": 
1.6823, + "step": 23903 + }, + { + "epoch": 7.337016574585635, + "grad_norm": 0.16576658189296722, + "learning_rate": 1.747641289137677e-05, + "loss": 1.6563, + "step": 23904 + }, + { + "epoch": 7.337323511356661, + "grad_norm": 0.19541388750076294, + "learning_rate": 1.7472637760498046e-05, + "loss": 1.8023, + "step": 23905 + }, + { + "epoch": 7.337630448127686, + "grad_norm": 0.19848179817199707, + "learning_rate": 1.7468862951073754e-05, + "loss": 1.7395, + "step": 23906 + }, + { + "epoch": 7.337937384898711, + "grad_norm": 0.1627921313047409, + "learning_rate": 1.746508846314127e-05, + "loss": 1.6569, + "step": 23907 + }, + { + "epoch": 7.338244321669736, + "grad_norm": 0.1798046976327896, + "learning_rate": 1.7461314296737813e-05, + "loss": 1.6927, + "step": 23908 + }, + { + "epoch": 7.338551258440761, + "grad_norm": 0.17935742437839508, + "learning_rate": 1.7457540451900757e-05, + "loss": 1.701, + "step": 23909 + }, + { + "epoch": 7.338858195211786, + "grad_norm": 0.16761814057826996, + "learning_rate": 1.745376692866732e-05, + "loss": 1.6701, + "step": 23910 + }, + { + "epoch": 7.339165131982812, + "grad_norm": 0.1733570694923401, + "learning_rate": 1.7449993727074855e-05, + "loss": 1.705, + "step": 23911 + }, + { + "epoch": 7.339472068753837, + "grad_norm": 0.21162372827529907, + "learning_rate": 1.7446220847160626e-05, + "loss": 1.7703, + "step": 23912 + }, + { + "epoch": 7.3397790055248615, + "grad_norm": 0.18743988871574402, + "learning_rate": 1.7442448288961928e-05, + "loss": 1.6899, + "step": 23913 + }, + { + "epoch": 7.340085942295887, + "grad_norm": 0.19185546040534973, + "learning_rate": 1.743867605251605e-05, + "loss": 1.7483, + "step": 23914 + }, + { + "epoch": 7.340392879066912, + "grad_norm": 0.23066233098506927, + "learning_rate": 1.7434904137860232e-05, + "loss": 1.7564, + "step": 23915 + }, + { + "epoch": 7.3406998158379375, + "grad_norm": 0.18159757554531097, + "learning_rate": 1.743113254503179e-05, + "loss": 1.7136, + "step": 23916 + }, + 
{ + "epoch": 7.341006752608963, + "grad_norm": 0.22666020691394806, + "learning_rate": 1.7427361274067995e-05, + "loss": 1.7589, + "step": 23917 + }, + { + "epoch": 7.341313689379987, + "grad_norm": 0.18986108899116516, + "learning_rate": 1.74235903250061e-05, + "loss": 1.7429, + "step": 23918 + }, + { + "epoch": 7.341620626151013, + "grad_norm": 0.17987726628780365, + "learning_rate": 1.741981969788338e-05, + "loss": 1.7457, + "step": 23919 + }, + { + "epoch": 7.341927562922038, + "grad_norm": 0.2370992749929428, + "learning_rate": 1.7416049392737093e-05, + "loss": 1.7594, + "step": 23920 + }, + { + "epoch": 7.342234499693063, + "grad_norm": 0.18698690831661224, + "learning_rate": 1.7412279409604508e-05, + "loss": 1.7555, + "step": 23921 + }, + { + "epoch": 7.342541436464089, + "grad_norm": 0.18401117622852325, + "learning_rate": 1.7408509748522882e-05, + "loss": 1.7355, + "step": 23922 + }, + { + "epoch": 7.342848373235114, + "grad_norm": 0.22045543789863586, + "learning_rate": 1.7404740409529448e-05, + "loss": 1.7227, + "step": 23923 + }, + { + "epoch": 7.343155310006138, + "grad_norm": 0.24414709210395813, + "learning_rate": 1.7400971392661502e-05, + "loss": 1.7551, + "step": 23924 + }, + { + "epoch": 7.343462246777164, + "grad_norm": 0.1906892955303192, + "learning_rate": 1.739720269795623e-05, + "loss": 1.7204, + "step": 23925 + }, + { + "epoch": 7.343769183548189, + "grad_norm": 0.1840149164199829, + "learning_rate": 1.7393434325450948e-05, + "loss": 1.74, + "step": 23926 + }, + { + "epoch": 7.344076120319214, + "grad_norm": 0.21434549987316132, + "learning_rate": 1.7389666275182825e-05, + "loss": 1.6961, + "step": 23927 + }, + { + "epoch": 7.34438305709024, + "grad_norm": 0.19110503792762756, + "learning_rate": 1.7385898547189146e-05, + "loss": 1.7731, + "step": 23928 + }, + { + "epoch": 7.344689993861264, + "grad_norm": 0.18905460834503174, + "learning_rate": 1.7382131141507136e-05, + "loss": 1.6925, + "step": 23929 + }, + { + "epoch": 7.3449969306322895, 
+ "grad_norm": 0.16336308419704437, + "learning_rate": 1.7378364058174024e-05, + "loss": 1.7073, + "step": 23930 + }, + { + "epoch": 7.345303867403315, + "grad_norm": 0.16707782447338104, + "learning_rate": 1.7374597297227056e-05, + "loss": 1.7036, + "step": 23931 + }, + { + "epoch": 7.34561080417434, + "grad_norm": 0.19958938658237457, + "learning_rate": 1.7370830858703406e-05, + "loss": 1.7035, + "step": 23932 + }, + { + "epoch": 7.3459177409453655, + "grad_norm": 0.18446899950504303, + "learning_rate": 1.7367064742640348e-05, + "loss": 1.754, + "step": 23933 + }, + { + "epoch": 7.346224677716391, + "grad_norm": 0.19238999485969543, + "learning_rate": 1.736329894907508e-05, + "loss": 1.6903, + "step": 23934 + }, + { + "epoch": 7.346531614487415, + "grad_norm": 0.1985396146774292, + "learning_rate": 1.7359533478044825e-05, + "loss": 1.7342, + "step": 23935 + }, + { + "epoch": 7.346838551258441, + "grad_norm": 0.19200150668621063, + "learning_rate": 1.7355768329586784e-05, + "loss": 1.6915, + "step": 23936 + }, + { + "epoch": 7.347145488029466, + "grad_norm": 0.19772231578826904, + "learning_rate": 1.7352003503738186e-05, + "loss": 1.7341, + "step": 23937 + }, + { + "epoch": 7.347452424800491, + "grad_norm": 0.1961035579442978, + "learning_rate": 1.7348239000536214e-05, + "loss": 1.7395, + "step": 23938 + }, + { + "epoch": 7.347759361571517, + "grad_norm": 0.15188434720039368, + "learning_rate": 1.7344474820018087e-05, + "loss": 1.635, + "step": 23939 + }, + { + "epoch": 7.348066298342541, + "grad_norm": 0.18748410046100616, + "learning_rate": 1.734071096222098e-05, + "loss": 1.6878, + "step": 23940 + }, + { + "epoch": 7.348373235113566, + "grad_norm": 0.19337952136993408, + "learning_rate": 1.7336947427182143e-05, + "loss": 1.7532, + "step": 23941 + }, + { + "epoch": 7.348680171884592, + "grad_norm": 0.14804427325725555, + "learning_rate": 1.73331842149387e-05, + "loss": 1.683, + "step": 23942 + }, + { + "epoch": 7.348987108655617, + "grad_norm": 
0.18310968577861786, + "learning_rate": 1.7329421325527916e-05, + "loss": 1.718, + "step": 23943 + }, + { + "epoch": 7.349294045426642, + "grad_norm": 0.18589583039283752, + "learning_rate": 1.7325658758986906e-05, + "loss": 1.7115, + "step": 23944 + }, + { + "epoch": 7.349600982197667, + "grad_norm": 0.1618955284357071, + "learning_rate": 1.7321896515352904e-05, + "loss": 1.6757, + "step": 23945 + }, + { + "epoch": 7.349907918968692, + "grad_norm": 0.20092655718326569, + "learning_rate": 1.731813459466307e-05, + "loss": 1.7537, + "step": 23946 + }, + { + "epoch": 7.350214855739718, + "grad_norm": 0.17287038266658783, + "learning_rate": 1.7314372996954592e-05, + "loss": 1.6744, + "step": 23947 + }, + { + "epoch": 7.350521792510743, + "grad_norm": 0.19176220893859863, + "learning_rate": 1.731061172226465e-05, + "loss": 1.7279, + "step": 23948 + }, + { + "epoch": 7.350828729281768, + "grad_norm": 0.2060871571302414, + "learning_rate": 1.7306850770630367e-05, + "loss": 1.7802, + "step": 23949 + }, + { + "epoch": 7.351135666052793, + "grad_norm": 0.27185341715812683, + "learning_rate": 1.7303090142088967e-05, + "loss": 1.7234, + "step": 23950 + }, + { + "epoch": 7.351442602823818, + "grad_norm": 0.19845733046531677, + "learning_rate": 1.729932983667759e-05, + "loss": 1.7503, + "step": 23951 + }, + { + "epoch": 7.351749539594843, + "grad_norm": 0.19455648958683014, + "learning_rate": 1.729556985443341e-05, + "loss": 1.8096, + "step": 23952 + }, + { + "epoch": 7.352056476365869, + "grad_norm": 0.19090545177459717, + "learning_rate": 1.729181019539357e-05, + "loss": 1.6776, + "step": 23953 + }, + { + "epoch": 7.352363413136894, + "grad_norm": 0.16086700558662415, + "learning_rate": 1.728805085959524e-05, + "loss": 1.6829, + "step": 23954 + }, + { + "epoch": 7.352670349907919, + "grad_norm": 0.2156524360179901, + "learning_rate": 1.7284291847075555e-05, + "loss": 1.7147, + "step": 23955 + }, + { + "epoch": 7.352977286678944, + "grad_norm": 0.20258861780166626, + 
"learning_rate": 1.728053315787168e-05, + "loss": 1.7085, + "step": 23956 + }, + { + "epoch": 7.353284223449969, + "grad_norm": 0.1877330094575882, + "learning_rate": 1.7276774792020735e-05, + "loss": 1.7311, + "step": 23957 + }, + { + "epoch": 7.3535911602209945, + "grad_norm": 0.22096484899520874, + "learning_rate": 1.727301674955992e-05, + "loss": 1.6712, + "step": 23958 + }, + { + "epoch": 7.35389809699202, + "grad_norm": 0.21456706523895264, + "learning_rate": 1.726925903052629e-05, + "loss": 1.7773, + "step": 23959 + }, + { + "epoch": 7.354205033763045, + "grad_norm": 0.2114667296409607, + "learning_rate": 1.7265501634957072e-05, + "loss": 1.669, + "step": 23960 + }, + { + "epoch": 7.35451197053407, + "grad_norm": 0.1676410287618637, + "learning_rate": 1.726174456288931e-05, + "loss": 1.6673, + "step": 23961 + }, + { + "epoch": 7.354818907305095, + "grad_norm": 0.19883838295936584, + "learning_rate": 1.72579878143602e-05, + "loss": 1.6821, + "step": 23962 + }, + { + "epoch": 7.35512584407612, + "grad_norm": 0.19240599870681763, + "learning_rate": 1.725423138940684e-05, + "loss": 1.741, + "step": 23963 + }, + { + "epoch": 7.355432780847146, + "grad_norm": 0.230613574385643, + "learning_rate": 1.7250475288066363e-05, + "loss": 1.6937, + "step": 23964 + }, + { + "epoch": 7.355739717618171, + "grad_norm": 0.17126981914043427, + "learning_rate": 1.7246719510375898e-05, + "loss": 1.6791, + "step": 23965 + }, + { + "epoch": 7.356046654389196, + "grad_norm": 0.1852734386920929, + "learning_rate": 1.7242964056372518e-05, + "loss": 1.7196, + "step": 23966 + }, + { + "epoch": 7.356353591160221, + "grad_norm": 0.1922985464334488, + "learning_rate": 1.723920892609338e-05, + "loss": 1.794, + "step": 23967 + }, + { + "epoch": 7.356660527931246, + "grad_norm": 0.1918993592262268, + "learning_rate": 1.7235454119575582e-05, + "loss": 1.7725, + "step": 23968 + }, + { + "epoch": 7.356967464702271, + "grad_norm": 0.21787014603614807, + "learning_rate": 1.723169963685623e-05, + 
"loss": 1.7382, + "step": 23969 + }, + { + "epoch": 7.357274401473297, + "grad_norm": 0.23753544688224792, + "learning_rate": 1.722794547797243e-05, + "loss": 1.7924, + "step": 23970 + }, + { + "epoch": 7.357581338244322, + "grad_norm": 0.2251000851392746, + "learning_rate": 1.722419164296128e-05, + "loss": 1.6794, + "step": 23971 + }, + { + "epoch": 7.3578882750153465, + "grad_norm": 0.21573983132839203, + "learning_rate": 1.7220438131859878e-05, + "loss": 1.796, + "step": 23972 + }, + { + "epoch": 7.358195211786372, + "grad_norm": 0.217384472489357, + "learning_rate": 1.721668494470532e-05, + "loss": 1.7305, + "step": 23973 + }, + { + "epoch": 7.358502148557397, + "grad_norm": 0.21815331280231476, + "learning_rate": 1.7212932081534677e-05, + "loss": 1.7348, + "step": 23974 + }, + { + "epoch": 7.3588090853284225, + "grad_norm": 0.19974499940872192, + "learning_rate": 1.7209179542385097e-05, + "loss": 1.7383, + "step": 23975 + }, + { + "epoch": 7.359116022099448, + "grad_norm": 0.20518191158771515, + "learning_rate": 1.7205427327293582e-05, + "loss": 1.7087, + "step": 23976 + }, + { + "epoch": 7.359422958870473, + "grad_norm": 0.17104744911193848, + "learning_rate": 1.7201675436297293e-05, + "loss": 1.718, + "step": 23977 + }, + { + "epoch": 7.359729895641498, + "grad_norm": 0.2165975421667099, + "learning_rate": 1.7197923869433235e-05, + "loss": 1.7907, + "step": 23978 + }, + { + "epoch": 7.360036832412523, + "grad_norm": 0.1784742921590805, + "learning_rate": 1.719417262673854e-05, + "loss": 1.6354, + "step": 23979 + }, + { + "epoch": 7.360343769183548, + "grad_norm": 0.1867162138223648, + "learning_rate": 1.719042170825026e-05, + "loss": 1.7264, + "step": 23980 + }, + { + "epoch": 7.360650705954574, + "grad_norm": 0.19704937934875488, + "learning_rate": 1.7186671114005458e-05, + "loss": 1.72, + "step": 23981 + }, + { + "epoch": 7.360957642725599, + "grad_norm": 0.20316866040229797, + "learning_rate": 1.718292084404123e-05, + "loss": 1.759, + "step": 23982 + }, + 
{ + "epoch": 7.361264579496623, + "grad_norm": 0.20339833199977875, + "learning_rate": 1.717917089839457e-05, + "loss": 1.7537, + "step": 23983 + }, + { + "epoch": 7.361571516267649, + "grad_norm": 0.18114012479782104, + "learning_rate": 1.71754212771026e-05, + "loss": 1.7207, + "step": 23984 + }, + { + "epoch": 7.361878453038674, + "grad_norm": 0.16071686148643494, + "learning_rate": 1.7171671980202353e-05, + "loss": 1.6534, + "step": 23985 + }, + { + "epoch": 7.362185389809699, + "grad_norm": 0.15212370455265045, + "learning_rate": 1.7167923007730892e-05, + "loss": 1.6638, + "step": 23986 + }, + { + "epoch": 7.362492326580725, + "grad_norm": 0.16284595429897308, + "learning_rate": 1.7164174359725253e-05, + "loss": 1.7442, + "step": 23987 + }, + { + "epoch": 7.362799263351749, + "grad_norm": 0.18302884697914124, + "learning_rate": 1.7160426036222494e-05, + "loss": 1.7087, + "step": 23988 + }, + { + "epoch": 7.3631062001227745, + "grad_norm": 0.18764640390872955, + "learning_rate": 1.715667803725965e-05, + "loss": 1.702, + "step": 23989 + }, + { + "epoch": 7.3634131368938, + "grad_norm": 0.16912522912025452, + "learning_rate": 1.7152930362873758e-05, + "loss": 1.742, + "step": 23990 + }, + { + "epoch": 7.363720073664825, + "grad_norm": 0.21137015521526337, + "learning_rate": 1.714918301310185e-05, + "loss": 1.7074, + "step": 23991 + }, + { + "epoch": 7.3640270104358505, + "grad_norm": 0.17562401294708252, + "learning_rate": 1.7145435987981008e-05, + "loss": 1.69, + "step": 23992 + }, + { + "epoch": 7.364333947206875, + "grad_norm": 0.15575642883777618, + "learning_rate": 1.714168928754818e-05, + "loss": 1.6986, + "step": 23993 + }, + { + "epoch": 7.3646408839779, + "grad_norm": 0.18057680130004883, + "learning_rate": 1.7137942911840477e-05, + "loss": 1.7661, + "step": 23994 + }, + { + "epoch": 7.364947820748926, + "grad_norm": 0.18899883329868317, + "learning_rate": 1.7134196860894853e-05, + "loss": 1.6841, + "step": 23995 + }, + { + "epoch": 7.365254757519951, + 
"grad_norm": 0.15350781381130219, + "learning_rate": 1.7130451134748367e-05, + "loss": 1.7005, + "step": 23996 + }, + { + "epoch": 7.365561694290976, + "grad_norm": 0.20394811034202576, + "learning_rate": 1.7126705733438037e-05, + "loss": 1.7342, + "step": 23997 + }, + { + "epoch": 7.365868631062002, + "grad_norm": 0.1881636083126068, + "learning_rate": 1.7122960657000864e-05, + "loss": 1.6985, + "step": 23998 + }, + { + "epoch": 7.366175567833026, + "grad_norm": 0.1619534194469452, + "learning_rate": 1.711921590547388e-05, + "loss": 1.6579, + "step": 23999 + }, + { + "epoch": 7.366482504604051, + "grad_norm": 0.16795861721038818, + "learning_rate": 1.711547147889404e-05, + "loss": 1.717, + "step": 24000 + }, + { + "epoch": 7.366789441375077, + "grad_norm": 0.1452684998512268, + "learning_rate": 1.711172737729841e-05, + "loss": 1.6792, + "step": 24001 + }, + { + "epoch": 7.367096378146102, + "grad_norm": 0.14940062165260315, + "learning_rate": 1.710798360072396e-05, + "loss": 1.6731, + "step": 24002 + }, + { + "epoch": 7.367403314917127, + "grad_norm": 0.21277321875095367, + "learning_rate": 1.7104240149207694e-05, + "loss": 1.7145, + "step": 24003 + }, + { + "epoch": 7.367710251688152, + "grad_norm": 0.17097726464271545, + "learning_rate": 1.710049702278661e-05, + "loss": 1.7052, + "step": 24004 + }, + { + "epoch": 7.368017188459177, + "grad_norm": 0.15970511734485626, + "learning_rate": 1.7096754221497702e-05, + "loss": 1.6586, + "step": 24005 + }, + { + "epoch": 7.3683241252302025, + "grad_norm": 0.198451429605484, + "learning_rate": 1.7093011745377945e-05, + "loss": 1.7449, + "step": 24006 + }, + { + "epoch": 7.368631062001228, + "grad_norm": 0.19554266333580017, + "learning_rate": 1.7089269594464342e-05, + "loss": 1.7455, + "step": 24007 + }, + { + "epoch": 7.368937998772253, + "grad_norm": 0.1854190230369568, + "learning_rate": 1.7085527768793847e-05, + "loss": 1.7355, + "step": 24008 + }, + { + "epoch": 7.3692449355432785, + "grad_norm": 0.17093004286289215, 
+ "learning_rate": 1.708178626840349e-05, + "loss": 1.6813, + "step": 24009 + }, + { + "epoch": 7.369551872314303, + "grad_norm": 0.15385115146636963, + "learning_rate": 1.707804509333018e-05, + "loss": 1.664, + "step": 24010 + }, + { + "epoch": 7.369858809085328, + "grad_norm": 0.18747489154338837, + "learning_rate": 1.7074304243610963e-05, + "loss": 1.787, + "step": 24011 + }, + { + "epoch": 7.370165745856354, + "grad_norm": 0.21749509871006012, + "learning_rate": 1.7070563719282734e-05, + "loss": 1.723, + "step": 24012 + }, + { + "epoch": 7.370472682627379, + "grad_norm": 0.18973985314369202, + "learning_rate": 1.7066823520382508e-05, + "loss": 1.7415, + "step": 24013 + }, + { + "epoch": 7.370779619398404, + "grad_norm": 0.24844922125339508, + "learning_rate": 1.706308364694724e-05, + "loss": 1.7617, + "step": 24014 + }, + { + "epoch": 7.371086556169429, + "grad_norm": 0.16565518081188202, + "learning_rate": 1.705934409901388e-05, + "loss": 1.6781, + "step": 24015 + }, + { + "epoch": 7.371393492940454, + "grad_norm": 0.22595234215259552, + "learning_rate": 1.705560487661941e-05, + "loss": 1.7706, + "step": 24016 + }, + { + "epoch": 7.371700429711479, + "grad_norm": 0.2452661544084549, + "learning_rate": 1.7051865979800723e-05, + "loss": 1.8227, + "step": 24017 + }, + { + "epoch": 7.372007366482505, + "grad_norm": 0.2285550981760025, + "learning_rate": 1.7048127408594834e-05, + "loss": 1.7554, + "step": 24018 + }, + { + "epoch": 7.37231430325353, + "grad_norm": 0.22723950445652008, + "learning_rate": 1.7044389163038656e-05, + "loss": 1.7152, + "step": 24019 + }, + { + "epoch": 7.3726212400245545, + "grad_norm": 0.20335997641086578, + "learning_rate": 1.7040651243169143e-05, + "loss": 1.6661, + "step": 24020 + }, + { + "epoch": 7.37292817679558, + "grad_norm": 0.27618682384490967, + "learning_rate": 1.703691364902323e-05, + "loss": 1.8375, + "step": 24021 + }, + { + "epoch": 7.373235113566605, + "grad_norm": 0.24076996743679047, + "learning_rate": 
1.7033176380637856e-05, + "loss": 1.7581, + "step": 24022 + }, + { + "epoch": 7.3735420503376305, + "grad_norm": 0.21615716814994812, + "learning_rate": 1.702943943804996e-05, + "loss": 1.7047, + "step": 24023 + }, + { + "epoch": 7.373848987108656, + "grad_norm": 0.23503927886486053, + "learning_rate": 1.7025702821296462e-05, + "loss": 1.7926, + "step": 24024 + }, + { + "epoch": 7.37415592387968, + "grad_norm": 0.2344675064086914, + "learning_rate": 1.7021966530414303e-05, + "loss": 1.747, + "step": 24025 + }, + { + "epoch": 7.374462860650706, + "grad_norm": 0.20946700870990753, + "learning_rate": 1.701823056544039e-05, + "loss": 1.746, + "step": 24026 + }, + { + "epoch": 7.374769797421731, + "grad_norm": 0.26749730110168457, + "learning_rate": 1.7014494926411645e-05, + "loss": 1.7375, + "step": 24027 + }, + { + "epoch": 7.375076734192756, + "grad_norm": 0.19716335833072662, + "learning_rate": 1.701075961336503e-05, + "loss": 1.6677, + "step": 24028 + }, + { + "epoch": 7.375383670963782, + "grad_norm": 0.1999496966600418, + "learning_rate": 1.7007024626337382e-05, + "loss": 1.6665, + "step": 24029 + }, + { + "epoch": 7.375690607734807, + "grad_norm": 0.188812255859375, + "learning_rate": 1.7003289965365676e-05, + "loss": 1.7344, + "step": 24030 + }, + { + "epoch": 7.3759975445058314, + "grad_norm": 0.20171904563903809, + "learning_rate": 1.6999555630486795e-05, + "loss": 1.7452, + "step": 24031 + }, + { + "epoch": 7.376304481276857, + "grad_norm": 0.21260966360569, + "learning_rate": 1.6995821621737655e-05, + "loss": 1.7759, + "step": 24032 + }, + { + "epoch": 7.376611418047882, + "grad_norm": 0.1913561075925827, + "learning_rate": 1.699208793915516e-05, + "loss": 1.7342, + "step": 24033 + }, + { + "epoch": 7.3769183548189075, + "grad_norm": 0.1907757967710495, + "learning_rate": 1.6988354582776166e-05, + "loss": 1.6511, + "step": 24034 + }, + { + "epoch": 7.377225291589933, + "grad_norm": 0.15012076497077942, + "learning_rate": 1.6984621552637625e-05, + "loss": 
1.6638, + "step": 24035 + }, + { + "epoch": 7.377532228360957, + "grad_norm": 0.17761732637882233, + "learning_rate": 1.6980888848776394e-05, + "loss": 1.7035, + "step": 24036 + }, + { + "epoch": 7.377839165131983, + "grad_norm": 0.15940140187740326, + "learning_rate": 1.6977156471229376e-05, + "loss": 1.6532, + "step": 24037 + }, + { + "epoch": 7.378146101903008, + "grad_norm": 0.19022013247013092, + "learning_rate": 1.6973424420033455e-05, + "loss": 1.7545, + "step": 24038 + }, + { + "epoch": 7.378453038674033, + "grad_norm": 0.1900233030319214, + "learning_rate": 1.6969692695225513e-05, + "loss": 1.7051, + "step": 24039 + }, + { + "epoch": 7.378759975445059, + "grad_norm": 0.17687582969665527, + "learning_rate": 1.6965961296842425e-05, + "loss": 1.6819, + "step": 24040 + }, + { + "epoch": 7.379066912216084, + "grad_norm": 0.16323260962963104, + "learning_rate": 1.696223022492107e-05, + "loss": 1.6642, + "step": 24041 + }, + { + "epoch": 7.379373848987108, + "grad_norm": 0.21163886785507202, + "learning_rate": 1.695849947949832e-05, + "loss": 1.6973, + "step": 24042 + }, + { + "epoch": 7.379680785758134, + "grad_norm": 0.1713307648897171, + "learning_rate": 1.6954769060611043e-05, + "loss": 1.677, + "step": 24043 + }, + { + "epoch": 7.379987722529159, + "grad_norm": 0.19575951993465424, + "learning_rate": 1.695103896829609e-05, + "loss": 1.7305, + "step": 24044 + }, + { + "epoch": 7.380294659300184, + "grad_norm": 0.16087177395820618, + "learning_rate": 1.6947309202590377e-05, + "loss": 1.6435, + "step": 24045 + }, + { + "epoch": 7.38060159607121, + "grad_norm": 0.2088652402162552, + "learning_rate": 1.6943579763530692e-05, + "loss": 1.7136, + "step": 24046 + }, + { + "epoch": 7.380908532842234, + "grad_norm": 0.18253973126411438, + "learning_rate": 1.693985065115396e-05, + "loss": 1.7461, + "step": 24047 + }, + { + "epoch": 7.3812154696132595, + "grad_norm": 0.272062212228775, + "learning_rate": 1.6936121865496967e-05, + "loss": 1.7455, + "step": 24048 + }, + { 
+ "epoch": 7.381522406384285, + "grad_norm": 0.1884320080280304, + "learning_rate": 1.6932393406596613e-05, + "loss": 1.7242, + "step": 24049 + }, + { + "epoch": 7.38182934315531, + "grad_norm": 0.22986121475696564, + "learning_rate": 1.6928665274489748e-05, + "loss": 1.7461, + "step": 24050 + }, + { + "epoch": 7.3821362799263355, + "grad_norm": 0.19400665163993835, + "learning_rate": 1.6924937469213158e-05, + "loss": 1.7468, + "step": 24051 + }, + { + "epoch": 7.382443216697361, + "grad_norm": 0.1990167796611786, + "learning_rate": 1.6921209990803744e-05, + "loss": 1.7253, + "step": 24052 + }, + { + "epoch": 7.382750153468385, + "grad_norm": 0.16667480766773224, + "learning_rate": 1.691748283929832e-05, + "loss": 1.6763, + "step": 24053 + }, + { + "epoch": 7.383057090239411, + "grad_norm": 0.20539991557598114, + "learning_rate": 1.691375601473372e-05, + "loss": 1.7408, + "step": 24054 + }, + { + "epoch": 7.383364027010436, + "grad_norm": 0.18021859228610992, + "learning_rate": 1.6910029517146776e-05, + "loss": 1.7075, + "step": 24055 + }, + { + "epoch": 7.383670963781461, + "grad_norm": 0.17450939118862152, + "learning_rate": 1.6906303346574314e-05, + "loss": 1.7074, + "step": 24056 + }, + { + "epoch": 7.383977900552487, + "grad_norm": 0.1690986454486847, + "learning_rate": 1.690257750305316e-05, + "loss": 1.6911, + "step": 24057 + }, + { + "epoch": 7.384284837323511, + "grad_norm": 0.19716380536556244, + "learning_rate": 1.6898851986620136e-05, + "loss": 1.7075, + "step": 24058 + }, + { + "epoch": 7.384591774094536, + "grad_norm": 0.20165397226810455, + "learning_rate": 1.6895126797312054e-05, + "loss": 1.7201, + "step": 24059 + }, + { + "epoch": 7.384898710865562, + "grad_norm": 0.22149543464183807, + "learning_rate": 1.6891401935165734e-05, + "loss": 1.7407, + "step": 24060 + }, + { + "epoch": 7.385205647636587, + "grad_norm": 0.1575438529253006, + "learning_rate": 1.6887677400217966e-05, + "loss": 1.6451, + "step": 24061 + }, + { + "epoch": 7.385512584407612, 
+ "grad_norm": 0.18075503408908844, + "learning_rate": 1.688395319250562e-05, + "loss": 1.7084, + "step": 24062 + }, + { + "epoch": 7.385819521178637, + "grad_norm": 0.16428421437740326, + "learning_rate": 1.6880229312065414e-05, + "loss": 1.7047, + "step": 24063 + }, + { + "epoch": 7.386126457949662, + "grad_norm": 0.18372805416584015, + "learning_rate": 1.6876505758934237e-05, + "loss": 1.6726, + "step": 24064 + }, + { + "epoch": 7.3864333947206875, + "grad_norm": 0.199292853474617, + "learning_rate": 1.687278253314882e-05, + "loss": 1.7472, + "step": 24065 + }, + { + "epoch": 7.386740331491713, + "grad_norm": 0.20381483435630798, + "learning_rate": 1.686905963474597e-05, + "loss": 1.7128, + "step": 24066 + }, + { + "epoch": 7.387047268262738, + "grad_norm": 0.18497546017169952, + "learning_rate": 1.6865337063762527e-05, + "loss": 1.736, + "step": 24067 + }, + { + "epoch": 7.387354205033763, + "grad_norm": 0.21320439875125885, + "learning_rate": 1.6861614820235206e-05, + "loss": 1.7391, + "step": 24068 + }, + { + "epoch": 7.387661141804788, + "grad_norm": 0.22324618697166443, + "learning_rate": 1.6857892904200863e-05, + "loss": 1.7384, + "step": 24069 + }, + { + "epoch": 7.387968078575813, + "grad_norm": 0.18035978078842163, + "learning_rate": 1.6854171315696216e-05, + "loss": 1.7029, + "step": 24070 + }, + { + "epoch": 7.388275015346839, + "grad_norm": 0.1727912276983261, + "learning_rate": 1.6850450054758092e-05, + "loss": 1.6649, + "step": 24071 + }, + { + "epoch": 7.388581952117864, + "grad_norm": 0.19713124632835388, + "learning_rate": 1.6846729121423256e-05, + "loss": 1.7508, + "step": 24072 + }, + { + "epoch": 7.388888888888889, + "grad_norm": 0.19403581321239471, + "learning_rate": 1.6843008515728464e-05, + "loss": 1.7807, + "step": 24073 + }, + { + "epoch": 7.389195825659914, + "grad_norm": 0.20204444229602814, + "learning_rate": 1.6839288237710503e-05, + "loss": 1.778, + "step": 24074 + }, + { + "epoch": 7.389502762430939, + "grad_norm": 
0.20021478831768036, + "learning_rate": 1.6835568287406127e-05, + "loss": 1.7544, + "step": 24075 + }, + { + "epoch": 7.389809699201964, + "grad_norm": 0.2247730791568756, + "learning_rate": 1.6831848664852107e-05, + "loss": 1.7422, + "step": 24076 + }, + { + "epoch": 7.39011663597299, + "grad_norm": 0.21600402891635895, + "learning_rate": 1.68281293700852e-05, + "loss": 1.7491, + "step": 24077 + }, + { + "epoch": 7.390423572744015, + "grad_norm": 0.1854497194290161, + "learning_rate": 1.6824410403142145e-05, + "loss": 1.7292, + "step": 24078 + }, + { + "epoch": 7.3907305095150395, + "grad_norm": 0.21738949418067932, + "learning_rate": 1.6820691764059736e-05, + "loss": 1.6996, + "step": 24079 + }, + { + "epoch": 7.391037446286065, + "grad_norm": 0.20114775002002716, + "learning_rate": 1.6816973452874674e-05, + "loss": 1.7299, + "step": 24080 + }, + { + "epoch": 7.39134438305709, + "grad_norm": 0.17267082631587982, + "learning_rate": 1.681325546962376e-05, + "loss": 1.7181, + "step": 24081 + }, + { + "epoch": 7.3916513198281155, + "grad_norm": 0.1681009829044342, + "learning_rate": 1.680953781434369e-05, + "loss": 1.6826, + "step": 24082 + }, + { + "epoch": 7.391958256599141, + "grad_norm": 0.18807077407836914, + "learning_rate": 1.6805820487071205e-05, + "loss": 1.6934, + "step": 24083 + }, + { + "epoch": 7.392265193370166, + "grad_norm": 0.1859835982322693, + "learning_rate": 1.680210348784309e-05, + "loss": 1.7065, + "step": 24084 + }, + { + "epoch": 7.392572130141191, + "grad_norm": 0.20433956384658813, + "learning_rate": 1.679838681669601e-05, + "loss": 1.7934, + "step": 24085 + }, + { + "epoch": 7.392879066912216, + "grad_norm": 0.2428809553384781, + "learning_rate": 1.679467047366677e-05, + "loss": 1.7619, + "step": 24086 + }, + { + "epoch": 7.393186003683241, + "grad_norm": 0.25117191672325134, + "learning_rate": 1.6790954458792025e-05, + "loss": 1.7254, + "step": 24087 + }, + { + "epoch": 7.393492940454267, + "grad_norm": 0.19429172575473785, + 
"learning_rate": 1.6787238772108544e-05, + "loss": 1.6946, + "step": 24088 + }, + { + "epoch": 7.393799877225292, + "grad_norm": 0.18574993312358856, + "learning_rate": 1.678352341365304e-05, + "loss": 1.6953, + "step": 24089 + }, + { + "epoch": 7.394106813996316, + "grad_norm": 0.21022208034992218, + "learning_rate": 1.6779808383462227e-05, + "loss": 1.7866, + "step": 24090 + }, + { + "epoch": 7.394413750767342, + "grad_norm": 0.16711890697479248, + "learning_rate": 1.6776093681572818e-05, + "loss": 1.6988, + "step": 24091 + }, + { + "epoch": 7.394720687538367, + "grad_norm": 0.23661695420742035, + "learning_rate": 1.6772379308021524e-05, + "loss": 1.7152, + "step": 24092 + }, + { + "epoch": 7.395027624309392, + "grad_norm": 0.18410098552703857, + "learning_rate": 1.6768665262845052e-05, + "loss": 1.6643, + "step": 24093 + }, + { + "epoch": 7.395334561080418, + "grad_norm": 0.19566760957241058, + "learning_rate": 1.676495154608011e-05, + "loss": 1.7371, + "step": 24094 + }, + { + "epoch": 7.395641497851442, + "grad_norm": 0.18130381405353546, + "learning_rate": 1.6761238157763375e-05, + "loss": 1.6934, + "step": 24095 + }, + { + "epoch": 7.3959484346224675, + "grad_norm": 0.16141927242279053, + "learning_rate": 1.6757525097931603e-05, + "loss": 1.6629, + "step": 24096 + }, + { + "epoch": 7.396255371393493, + "grad_norm": 0.18370656669139862, + "learning_rate": 1.6753812366621418e-05, + "loss": 1.6931, + "step": 24097 + }, + { + "epoch": 7.396562308164518, + "grad_norm": 0.17368416488170624, + "learning_rate": 1.675009996386958e-05, + "loss": 1.7028, + "step": 24098 + }, + { + "epoch": 7.3968692449355435, + "grad_norm": 0.1704222410917282, + "learning_rate": 1.6746387889712722e-05, + "loss": 1.7241, + "step": 24099 + }, + { + "epoch": 7.397176181706568, + "grad_norm": 0.19127961993217468, + "learning_rate": 1.674267614418754e-05, + "loss": 1.6606, + "step": 24100 + }, + { + "epoch": 7.397483118477593, + "grad_norm": 0.20173178613185883, + "learning_rate": 
1.673896472733075e-05, + "loss": 1.7293, + "step": 24101 + }, + { + "epoch": 7.397790055248619, + "grad_norm": 0.194651797413826, + "learning_rate": 1.6735253639178977e-05, + "loss": 1.6889, + "step": 24102 + }, + { + "epoch": 7.398096992019644, + "grad_norm": 0.16184480488300323, + "learning_rate": 1.6731542879768957e-05, + "loss": 1.6929, + "step": 24103 + }, + { + "epoch": 7.398403928790669, + "grad_norm": 0.21806742250919342, + "learning_rate": 1.67278324491373e-05, + "loss": 1.6944, + "step": 24104 + }, + { + "epoch": 7.398710865561695, + "grad_norm": 0.1599469929933548, + "learning_rate": 1.6724122347320715e-05, + "loss": 1.7107, + "step": 24105 + }, + { + "epoch": 7.399017802332719, + "grad_norm": 0.18621234595775604, + "learning_rate": 1.672041257435586e-05, + "loss": 1.6856, + "step": 24106 + }, + { + "epoch": 7.399324739103744, + "grad_norm": 0.20682603120803833, + "learning_rate": 1.6716703130279393e-05, + "loss": 1.7699, + "step": 24107 + }, + { + "epoch": 7.39963167587477, + "grad_norm": 0.19649554789066315, + "learning_rate": 1.6712994015127976e-05, + "loss": 1.7049, + "step": 24108 + }, + { + "epoch": 7.399938612645795, + "grad_norm": 0.15894706547260284, + "learning_rate": 1.6709285228938255e-05, + "loss": 1.7352, + "step": 24109 + }, + { + "epoch": 7.4002455494168204, + "grad_norm": 0.22186337411403656, + "learning_rate": 1.6705576771746896e-05, + "loss": 1.7353, + "step": 24110 + }, + { + "epoch": 7.400552486187845, + "grad_norm": 0.14689651131629944, + "learning_rate": 1.670186864359054e-05, + "loss": 1.7155, + "step": 24111 + }, + { + "epoch": 7.40085942295887, + "grad_norm": 0.2055603563785553, + "learning_rate": 1.6698160844505817e-05, + "loss": 1.6897, + "step": 24112 + }, + { + "epoch": 7.401166359729896, + "grad_norm": 0.1641531139612198, + "learning_rate": 1.6694453374529423e-05, + "loss": 1.67, + "step": 24113 + }, + { + "epoch": 7.401473296500921, + "grad_norm": 0.21150687336921692, + "learning_rate": 1.6690746233697923e-05, + "loss": 
1.7507, + "step": 24114 + }, + { + "epoch": 7.401780233271946, + "grad_norm": 0.1844765543937683, + "learning_rate": 1.6687039422048035e-05, + "loss": 1.702, + "step": 24115 + }, + { + "epoch": 7.402087170042972, + "grad_norm": 0.1695966124534607, + "learning_rate": 1.6683332939616326e-05, + "loss": 1.6683, + "step": 24116 + }, + { + "epoch": 7.402394106813996, + "grad_norm": 0.17938567698001862, + "learning_rate": 1.667962678643943e-05, + "loss": 1.6947, + "step": 24117 + }, + { + "epoch": 7.402701043585021, + "grad_norm": 0.16420964896678925, + "learning_rate": 1.6675920962554027e-05, + "loss": 1.755, + "step": 24118 + }, + { + "epoch": 7.403007980356047, + "grad_norm": 0.16095438599586487, + "learning_rate": 1.667221546799667e-05, + "loss": 1.6855, + "step": 24119 + }, + { + "epoch": 7.403314917127072, + "grad_norm": 0.2089291363954544, + "learning_rate": 1.6668510302804052e-05, + "loss": 1.7213, + "step": 24120 + }, + { + "epoch": 7.403621853898097, + "grad_norm": 0.18369436264038086, + "learning_rate": 1.6664805467012717e-05, + "loss": 1.6913, + "step": 24121 + }, + { + "epoch": 7.403928790669122, + "grad_norm": 0.16405323147773743, + "learning_rate": 1.6661100960659326e-05, + "loss": 1.6529, + "step": 24122 + }, + { + "epoch": 7.404235727440147, + "grad_norm": 0.20792648196220398, + "learning_rate": 1.6657396783780477e-05, + "loss": 1.6855, + "step": 24123 + }, + { + "epoch": 7.4045426642111725, + "grad_norm": 0.17733097076416016, + "learning_rate": 1.6653692936412773e-05, + "loss": 1.727, + "step": 24124 + }, + { + "epoch": 7.404849600982198, + "grad_norm": 0.16196851432323456, + "learning_rate": 1.6649989418592825e-05, + "loss": 1.7376, + "step": 24125 + }, + { + "epoch": 7.405156537753223, + "grad_norm": 0.17193716764450073, + "learning_rate": 1.664628623035723e-05, + "loss": 1.6802, + "step": 24126 + }, + { + "epoch": 7.4054634745242485, + "grad_norm": 0.22076182067394257, + "learning_rate": 1.6642583371742576e-05, + "loss": 1.7512, + "step": 24127 + }, + 
{ + "epoch": 7.405770411295273, + "grad_norm": 0.20766951143741608, + "learning_rate": 1.663888084278547e-05, + "loss": 1.7457, + "step": 24128 + }, + { + "epoch": 7.406077348066298, + "grad_norm": 0.16815492510795593, + "learning_rate": 1.663517864352248e-05, + "loss": 1.6867, + "step": 24129 + }, + { + "epoch": 7.406384284837324, + "grad_norm": 0.19644804298877716, + "learning_rate": 1.6631476773990246e-05, + "loss": 1.6996, + "step": 24130 + }, + { + "epoch": 7.406691221608349, + "grad_norm": 0.18717117607593536, + "learning_rate": 1.662777523422528e-05, + "loss": 1.7745, + "step": 24131 + }, + { + "epoch": 7.406998158379374, + "grad_norm": 0.1679331511259079, + "learning_rate": 1.662407402426423e-05, + "loss": 1.7213, + "step": 24132 + }, + { + "epoch": 7.407305095150399, + "grad_norm": 0.1721929907798767, + "learning_rate": 1.662037314414363e-05, + "loss": 1.6759, + "step": 24133 + }, + { + "epoch": 7.407612031921424, + "grad_norm": 0.15507890284061432, + "learning_rate": 1.661667259390005e-05, + "loss": 1.6658, + "step": 24134 + }, + { + "epoch": 7.407918968692449, + "grad_norm": 0.20528049767017365, + "learning_rate": 1.6612972373570114e-05, + "loss": 1.7508, + "step": 24135 + }, + { + "epoch": 7.408225905463475, + "grad_norm": 0.20593658089637756, + "learning_rate": 1.6609272483190315e-05, + "loss": 1.8078, + "step": 24136 + }, + { + "epoch": 7.4085328422345, + "grad_norm": 0.19905441999435425, + "learning_rate": 1.6605572922797292e-05, + "loss": 1.7933, + "step": 24137 + }, + { + "epoch": 7.4088397790055245, + "grad_norm": 0.17571881413459778, + "learning_rate": 1.6601873692427537e-05, + "loss": 1.6908, + "step": 24138 + }, + { + "epoch": 7.40914671577655, + "grad_norm": 0.2244982272386551, + "learning_rate": 1.6598174792117655e-05, + "loss": 1.6998, + "step": 24139 + }, + { + "epoch": 7.409453652547575, + "grad_norm": 0.15267951786518097, + "learning_rate": 1.6594476221904193e-05, + "loss": 1.6399, + "step": 24140 + }, + { + "epoch": 7.4097605893186005, + 
"grad_norm": 0.24161390960216522, + "learning_rate": 1.659077798182369e-05, + "loss": 1.6776, + "step": 24141 + }, + { + "epoch": 7.410067526089626, + "grad_norm": 0.17184343934059143, + "learning_rate": 1.658708007191271e-05, + "loss": 1.7169, + "step": 24142 + }, + { + "epoch": 7.41037446286065, + "grad_norm": 0.1589801162481308, + "learning_rate": 1.6583382492207778e-05, + "loss": 1.6727, + "step": 24143 + }, + { + "epoch": 7.410681399631676, + "grad_norm": 0.18666890263557434, + "learning_rate": 1.6579685242745452e-05, + "loss": 1.7429, + "step": 24144 + }, + { + "epoch": 7.410988336402701, + "grad_norm": 0.22418901324272156, + "learning_rate": 1.6575988323562265e-05, + "loss": 1.7834, + "step": 24145 + }, + { + "epoch": 7.411295273173726, + "grad_norm": 0.1897875964641571, + "learning_rate": 1.6572291734694734e-05, + "loss": 1.7271, + "step": 24146 + }, + { + "epoch": 7.411602209944752, + "grad_norm": 0.18204644322395325, + "learning_rate": 1.6568595476179445e-05, + "loss": 1.7003, + "step": 24147 + }, + { + "epoch": 7.411909146715777, + "grad_norm": 0.19130240380764008, + "learning_rate": 1.6564899548052853e-05, + "loss": 1.6803, + "step": 24148 + }, + { + "epoch": 7.412216083486801, + "grad_norm": 0.19467706978321075, + "learning_rate": 1.6561203950351554e-05, + "loss": 1.7529, + "step": 24149 + }, + { + "epoch": 7.412523020257827, + "grad_norm": 0.20290352404117584, + "learning_rate": 1.655750868311202e-05, + "loss": 1.7742, + "step": 24150 + }, + { + "epoch": 7.412829957028852, + "grad_norm": 0.18538729846477509, + "learning_rate": 1.6553813746370772e-05, + "loss": 1.68, + "step": 24151 + }, + { + "epoch": 7.413136893799877, + "grad_norm": 0.23339742422103882, + "learning_rate": 1.655011914016437e-05, + "loss": 1.7499, + "step": 24152 + }, + { + "epoch": 7.413443830570903, + "grad_norm": 0.21964092552661896, + "learning_rate": 1.654642486452927e-05, + "loss": 1.7394, + "step": 24153 + }, + { + "epoch": 7.413750767341927, + "grad_norm": 0.2131531536579132, 
+ "learning_rate": 1.6542730919502032e-05, + "loss": 1.6928, + "step": 24154 + }, + { + "epoch": 7.4140577041129525, + "grad_norm": 0.20840130746364594, + "learning_rate": 1.653903730511911e-05, + "loss": 1.6785, + "step": 24155 + }, + { + "epoch": 7.414364640883978, + "grad_norm": 0.1519836038351059, + "learning_rate": 1.653534402141705e-05, + "loss": 1.6882, + "step": 24156 + }, + { + "epoch": 7.414671577655003, + "grad_norm": 0.21539351344108582, + "learning_rate": 1.653165106843233e-05, + "loss": 1.7041, + "step": 24157 + }, + { + "epoch": 7.4149785144260285, + "grad_norm": 0.2050703912973404, + "learning_rate": 1.6527958446201453e-05, + "loss": 1.7854, + "step": 24158 + }, + { + "epoch": 7.415285451197054, + "grad_norm": 0.21595771610736847, + "learning_rate": 1.652426615476091e-05, + "loss": 1.7305, + "step": 24159 + }, + { + "epoch": 7.415592387968078, + "grad_norm": 0.19248713552951813, + "learning_rate": 1.6520574194147186e-05, + "loss": 1.6834, + "step": 24160 + }, + { + "epoch": 7.415899324739104, + "grad_norm": 0.178158700466156, + "learning_rate": 1.6516882564396774e-05, + "loss": 1.7312, + "step": 24161 + }, + { + "epoch": 7.416206261510129, + "grad_norm": 0.18686197698116302, + "learning_rate": 1.6513191265546152e-05, + "loss": 1.7025, + "step": 24162 + }, + { + "epoch": 7.416513198281154, + "grad_norm": 0.1544325053691864, + "learning_rate": 1.6509500297631787e-05, + "loss": 1.6773, + "step": 24163 + }, + { + "epoch": 7.41682013505218, + "grad_norm": 0.1787567138671875, + "learning_rate": 1.6505809660690197e-05, + "loss": 1.6941, + "step": 24164 + }, + { + "epoch": 7.417127071823204, + "grad_norm": 0.16545183956623077, + "learning_rate": 1.65021193547578e-05, + "loss": 1.6618, + "step": 24165 + }, + { + "epoch": 7.417434008594229, + "grad_norm": 0.23889821767807007, + "learning_rate": 1.6498429379871126e-05, + "loss": 1.7651, + "step": 24166 + }, + { + "epoch": 7.417740945365255, + "grad_norm": 0.2012832909822464, + "learning_rate": 
1.649473973606659e-05, + "loss": 1.7477, + "step": 24167 + }, + { + "epoch": 7.41804788213628, + "grad_norm": 0.18035975098609924, + "learning_rate": 1.6491050423380662e-05, + "loss": 1.6747, + "step": 24168 + }, + { + "epoch": 7.418354818907305, + "grad_norm": 0.14925292134284973, + "learning_rate": 1.6487361441849842e-05, + "loss": 1.6817, + "step": 24169 + }, + { + "epoch": 7.41866175567833, + "grad_norm": 0.19253355264663696, + "learning_rate": 1.6483672791510523e-05, + "loss": 1.6943, + "step": 24170 + }, + { + "epoch": 7.418968692449355, + "grad_norm": 0.17203082144260406, + "learning_rate": 1.6479984472399234e-05, + "loss": 1.692, + "step": 24171 + }, + { + "epoch": 7.4192756292203805, + "grad_norm": 0.19132022559642792, + "learning_rate": 1.647629648455235e-05, + "loss": 1.7029, + "step": 24172 + }, + { + "epoch": 7.419582565991406, + "grad_norm": 0.17949101328849792, + "learning_rate": 1.647260882800637e-05, + "loss": 1.6944, + "step": 24173 + }, + { + "epoch": 7.419889502762431, + "grad_norm": 0.17752930521965027, + "learning_rate": 1.646892150279772e-05, + "loss": 1.6875, + "step": 24174 + }, + { + "epoch": 7.420196439533456, + "grad_norm": 0.19464492797851562, + "learning_rate": 1.6465234508962836e-05, + "loss": 1.6988, + "step": 24175 + }, + { + "epoch": 7.420503376304481, + "grad_norm": 0.20154574513435364, + "learning_rate": 1.6461547846538168e-05, + "loss": 1.7305, + "step": 24176 + }, + { + "epoch": 7.420810313075506, + "grad_norm": 0.20944970846176147, + "learning_rate": 1.6457861515560136e-05, + "loss": 1.7699, + "step": 24177 + }, + { + "epoch": 7.421117249846532, + "grad_norm": 0.22422203421592712, + "learning_rate": 1.6454175516065175e-05, + "loss": 1.6607, + "step": 24178 + }, + { + "epoch": 7.421424186617557, + "grad_norm": 0.16106431186199188, + "learning_rate": 1.6450489848089717e-05, + "loss": 1.7204, + "step": 24179 + }, + { + "epoch": 7.421731123388582, + "grad_norm": 0.24394269287586212, + "learning_rate": 1.644680451167018e-05, + 
"loss": 1.7161, + "step": 24180 + }, + { + "epoch": 7.422038060159607, + "grad_norm": 0.1999186873435974, + "learning_rate": 1.644311950684299e-05, + "loss": 1.7486, + "step": 24181 + }, + { + "epoch": 7.422344996930632, + "grad_norm": 0.1865876019001007, + "learning_rate": 1.6439434833644545e-05, + "loss": 1.737, + "step": 24182 + }, + { + "epoch": 7.422651933701657, + "grad_norm": 0.18088236451148987, + "learning_rate": 1.643575049211131e-05, + "loss": 1.6821, + "step": 24183 + }, + { + "epoch": 7.422958870472683, + "grad_norm": 0.17456914484500885, + "learning_rate": 1.643206648227964e-05, + "loss": 1.7379, + "step": 24184 + }, + { + "epoch": 7.423265807243708, + "grad_norm": 0.18160004913806915, + "learning_rate": 1.642838280418595e-05, + "loss": 1.7364, + "step": 24185 + }, + { + "epoch": 7.4235727440147325, + "grad_norm": 0.18081973493099213, + "learning_rate": 1.6424699457866688e-05, + "loss": 1.7591, + "step": 24186 + }, + { + "epoch": 7.423879680785758, + "grad_norm": 0.20753513276576996, + "learning_rate": 1.6421016443358195e-05, + "loss": 1.7299, + "step": 24187 + }, + { + "epoch": 7.424186617556783, + "grad_norm": 0.2102874517440796, + "learning_rate": 1.641733376069693e-05, + "loss": 1.7876, + "step": 24188 + }, + { + "epoch": 7.4244935543278086, + "grad_norm": 0.19360920786857605, + "learning_rate": 1.6413651409919224e-05, + "loss": 1.7578, + "step": 24189 + }, + { + "epoch": 7.424800491098834, + "grad_norm": 0.1954938918352127, + "learning_rate": 1.6409969391061514e-05, + "loss": 1.7074, + "step": 24190 + }, + { + "epoch": 7.425107427869859, + "grad_norm": 0.2228705734014511, + "learning_rate": 1.6406287704160177e-05, + "loss": 1.7261, + "step": 24191 + }, + { + "epoch": 7.425414364640884, + "grad_norm": 0.18695802986621857, + "learning_rate": 1.6402606349251597e-05, + "loss": 1.7074, + "step": 24192 + }, + { + "epoch": 7.425721301411909, + "grad_norm": 0.19026046991348267, + "learning_rate": 1.639892532637215e-05, + "loss": 1.7546, + "step": 24193 + 
}, + { + "epoch": 7.426028238182934, + "grad_norm": 0.2086167335510254, + "learning_rate": 1.639524463555822e-05, + "loss": 1.7551, + "step": 24194 + }, + { + "epoch": 7.42633517495396, + "grad_norm": 0.201420396566391, + "learning_rate": 1.639156427684618e-05, + "loss": 1.6961, + "step": 24195 + }, + { + "epoch": 7.426642111724985, + "grad_norm": 0.1735599786043167, + "learning_rate": 1.6387884250272394e-05, + "loss": 1.7461, + "step": 24196 + }, + { + "epoch": 7.4269490484960095, + "grad_norm": 0.23944853246212006, + "learning_rate": 1.6384204555873238e-05, + "loss": 1.7001, + "step": 24197 + }, + { + "epoch": 7.427255985267035, + "grad_norm": 0.15605413913726807, + "learning_rate": 1.638052519368508e-05, + "loss": 1.7105, + "step": 24198 + }, + { + "epoch": 7.42756292203806, + "grad_norm": 0.21450987458229065, + "learning_rate": 1.6376846163744257e-05, + "loss": 1.7309, + "step": 24199 + }, + { + "epoch": 7.4278698588090855, + "grad_norm": 0.20542307198047638, + "learning_rate": 1.637316746608718e-05, + "loss": 1.72, + "step": 24200 + }, + { + "epoch": 7.428176795580111, + "grad_norm": 0.18612053990364075, + "learning_rate": 1.6369489100750157e-05, + "loss": 1.6714, + "step": 24201 + }, + { + "epoch": 7.428483732351136, + "grad_norm": 0.16587957739830017, + "learning_rate": 1.6365811067769553e-05, + "loss": 1.7494, + "step": 24202 + }, + { + "epoch": 7.428790669122161, + "grad_norm": 0.247777059674263, + "learning_rate": 1.636213336718172e-05, + "loss": 1.7048, + "step": 24203 + }, + { + "epoch": 7.429097605893186, + "grad_norm": 0.2000289410352707, + "learning_rate": 1.635845599902298e-05, + "loss": 1.7568, + "step": 24204 + }, + { + "epoch": 7.429404542664211, + "grad_norm": 0.21887128055095673, + "learning_rate": 1.6354778963329732e-05, + "loss": 1.6708, + "step": 24205 + }, + { + "epoch": 7.429711479435237, + "grad_norm": 0.18932145833969116, + "learning_rate": 1.6351102260138247e-05, + "loss": 1.7184, + "step": 24206 + }, + { + "epoch": 7.430018416206262, + 
"grad_norm": 0.20103856921195984, + "learning_rate": 1.63474258894849e-05, + "loss": 1.7031, + "step": 24207 + }, + { + "epoch": 7.430325352977286, + "grad_norm": 0.22598737478256226, + "learning_rate": 1.634374985140602e-05, + "loss": 1.7803, + "step": 24208 + }, + { + "epoch": 7.430632289748312, + "grad_norm": 0.22468316555023193, + "learning_rate": 1.6340074145937934e-05, + "loss": 1.7635, + "step": 24209 + }, + { + "epoch": 7.430939226519337, + "grad_norm": 0.16173744201660156, + "learning_rate": 1.6336398773116962e-05, + "loss": 1.6877, + "step": 24210 + }, + { + "epoch": 7.431246163290362, + "grad_norm": 0.17869406938552856, + "learning_rate": 1.6332723732979426e-05, + "loss": 1.6436, + "step": 24211 + }, + { + "epoch": 7.431553100061388, + "grad_norm": 0.1828129142522812, + "learning_rate": 1.6329049025561648e-05, + "loss": 1.7191, + "step": 24212 + }, + { + "epoch": 7.431860036832412, + "grad_norm": 0.19169248640537262, + "learning_rate": 1.6325374650899944e-05, + "loss": 1.7607, + "step": 24213 + }, + { + "epoch": 7.4321669736034375, + "grad_norm": 0.1680343598127365, + "learning_rate": 1.632170060903062e-05, + "loss": 1.6736, + "step": 24214 + }, + { + "epoch": 7.432473910374463, + "grad_norm": 0.20647180080413818, + "learning_rate": 1.6318026899989996e-05, + "loss": 1.7875, + "step": 24215 + }, + { + "epoch": 7.432780847145488, + "grad_norm": 0.29225587844848633, + "learning_rate": 1.6314353523814352e-05, + "loss": 1.8164, + "step": 24216 + }, + { + "epoch": 7.4330877839165135, + "grad_norm": 0.1633446216583252, + "learning_rate": 1.6310680480540048e-05, + "loss": 1.6529, + "step": 24217 + }, + { + "epoch": 7.433394720687538, + "grad_norm": 0.21215081214904785, + "learning_rate": 1.6307007770203326e-05, + "loss": 1.6323, + "step": 24218 + }, + { + "epoch": 7.433701657458563, + "grad_norm": 0.1934979110956192, + "learning_rate": 1.63033353928405e-05, + "loss": 1.7299, + "step": 24219 + }, + { + "epoch": 7.434008594229589, + "grad_norm": 
0.2581390142440796, + "learning_rate": 1.6299663348487865e-05, + "loss": 1.7308, + "step": 24220 + }, + { + "epoch": 7.434315531000614, + "grad_norm": 0.2711075246334076, + "learning_rate": 1.629599163718169e-05, + "loss": 1.8736, + "step": 24221 + }, + { + "epoch": 7.434622467771639, + "grad_norm": 0.2620790898799896, + "learning_rate": 1.6292320258958316e-05, + "loss": 1.7326, + "step": 24222 + }, + { + "epoch": 7.434929404542665, + "grad_norm": 0.16254334151744843, + "learning_rate": 1.6288649213853958e-05, + "loss": 1.6996, + "step": 24223 + }, + { + "epoch": 7.435236341313689, + "grad_norm": 0.22968515753746033, + "learning_rate": 1.628497850190496e-05, + "loss": 1.694, + "step": 24224 + }, + { + "epoch": 7.435543278084714, + "grad_norm": 0.20458953082561493, + "learning_rate": 1.6281308123147533e-05, + "loss": 1.7558, + "step": 24225 + }, + { + "epoch": 7.43585021485574, + "grad_norm": 0.2327413409948349, + "learning_rate": 1.6277638077617995e-05, + "loss": 1.7581, + "step": 24226 + }, + { + "epoch": 7.436157151626765, + "grad_norm": 0.18312111496925354, + "learning_rate": 1.6273968365352604e-05, + "loss": 1.6713, + "step": 24227 + }, + { + "epoch": 7.43646408839779, + "grad_norm": 0.15935418009757996, + "learning_rate": 1.6270298986387628e-05, + "loss": 1.6996, + "step": 24228 + }, + { + "epoch": 7.436771025168815, + "grad_norm": 0.17424416542053223, + "learning_rate": 1.6266629940759322e-05, + "loss": 1.6826, + "step": 24229 + }, + { + "epoch": 7.43707796193984, + "grad_norm": 0.18982923030853271, + "learning_rate": 1.6262961228503953e-05, + "loss": 1.741, + "step": 24230 + }, + { + "epoch": 7.4373848987108655, + "grad_norm": 0.16608789563179016, + "learning_rate": 1.6259292849657777e-05, + "loss": 1.7205, + "step": 24231 + }, + { + "epoch": 7.437691835481891, + "grad_norm": 0.19830825924873352, + "learning_rate": 1.625562480425704e-05, + "loss": 1.7159, + "step": 24232 + }, + { + "epoch": 7.437998772252916, + "grad_norm": 0.1889072209596634, + 
"learning_rate": 1.6251957092337988e-05, + "loss": 1.7427, + "step": 24233 + }, + { + "epoch": 7.4383057090239415, + "grad_norm": 0.18454046547412872, + "learning_rate": 1.6248289713936903e-05, + "loss": 1.6962, + "step": 24234 + }, + { + "epoch": 7.438612645794966, + "grad_norm": 0.20041033625602722, + "learning_rate": 1.6244622669089987e-05, + "loss": 1.7763, + "step": 24235 + }, + { + "epoch": 7.438919582565991, + "grad_norm": 0.17226676642894745, + "learning_rate": 1.62409559578335e-05, + "loss": 1.6783, + "step": 24236 + }, + { + "epoch": 7.439226519337017, + "grad_norm": 0.1761687994003296, + "learning_rate": 1.6237289580203662e-05, + "loss": 1.6761, + "step": 24237 + }, + { + "epoch": 7.439533456108042, + "grad_norm": 0.24213027954101562, + "learning_rate": 1.6233623536236707e-05, + "loss": 1.724, + "step": 24238 + }, + { + "epoch": 7.439840392879067, + "grad_norm": 0.15541739761829376, + "learning_rate": 1.6229957825968913e-05, + "loss": 1.6594, + "step": 24239 + }, + { + "epoch": 7.440147329650092, + "grad_norm": 0.20755749940872192, + "learning_rate": 1.622629244943643e-05, + "loss": 1.7229, + "step": 24240 + }, + { + "epoch": 7.440454266421117, + "grad_norm": 0.20716612040996552, + "learning_rate": 1.6222627406675555e-05, + "loss": 1.699, + "step": 24241 + }, + { + "epoch": 7.440761203192142, + "grad_norm": 0.17423541843891144, + "learning_rate": 1.621896269772244e-05, + "loss": 1.7175, + "step": 24242 + }, + { + "epoch": 7.441068139963168, + "grad_norm": 0.17913730442523956, + "learning_rate": 1.6215298322613347e-05, + "loss": 1.7287, + "step": 24243 + }, + { + "epoch": 7.441375076734193, + "grad_norm": 0.21801607310771942, + "learning_rate": 1.6211634281384486e-05, + "loss": 1.8157, + "step": 24244 + }, + { + "epoch": 7.4416820135052175, + "grad_norm": 0.23132582008838654, + "learning_rate": 1.6207970574072056e-05, + "loss": 1.7921, + "step": 24245 + }, + { + "epoch": 7.441988950276243, + "grad_norm": 0.18289685249328613, + "learning_rate": 
1.6204307200712266e-05, + "loss": 1.7222, + "step": 24246 + }, + { + "epoch": 7.442295887047268, + "grad_norm": 0.15289388597011566, + "learning_rate": 1.620064416134132e-05, + "loss": 1.6409, + "step": 24247 + }, + { + "epoch": 7.4426028238182935, + "grad_norm": 0.1684839129447937, + "learning_rate": 1.619698145599542e-05, + "loss": 1.7362, + "step": 24248 + }, + { + "epoch": 7.442909760589319, + "grad_norm": 0.16812102496623993, + "learning_rate": 1.619331908471076e-05, + "loss": 1.6849, + "step": 24249 + }, + { + "epoch": 7.443216697360343, + "grad_norm": 0.16095775365829468, + "learning_rate": 1.6189657047523526e-05, + "loss": 1.7032, + "step": 24250 + }, + { + "epoch": 7.443523634131369, + "grad_norm": 0.167144313454628, + "learning_rate": 1.6185995344469946e-05, + "loss": 1.6539, + "step": 24251 + }, + { + "epoch": 7.443830570902394, + "grad_norm": 0.18129989504814148, + "learning_rate": 1.618233397558616e-05, + "loss": 1.7057, + "step": 24252 + }, + { + "epoch": 7.444137507673419, + "grad_norm": 0.17299556732177734, + "learning_rate": 1.6178672940908374e-05, + "loss": 1.6965, + "step": 24253 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 0.14944438636302948, + "learning_rate": 1.6175012240472765e-05, + "loss": 1.6666, + "step": 24254 + }, + { + "epoch": 7.44475138121547, + "grad_norm": 0.20333626866340637, + "learning_rate": 1.6171351874315494e-05, + "loss": 1.748, + "step": 24255 + }, + { + "epoch": 7.445058317986494, + "grad_norm": 0.2233068197965622, + "learning_rate": 1.6167691842472783e-05, + "loss": 1.7662, + "step": 24256 + }, + { + "epoch": 7.44536525475752, + "grad_norm": 0.22628507018089294, + "learning_rate": 1.6164032144980738e-05, + "loss": 1.747, + "step": 24257 + }, + { + "epoch": 7.445672191528545, + "grad_norm": 0.18167820572853088, + "learning_rate": 1.6160372781875594e-05, + "loss": 1.7311, + "step": 24258 + }, + { + "epoch": 7.44597912829957, + "grad_norm": 0.1975218504667282, + "learning_rate": 1.6156713753193446e-05, + "loss": 
1.7334, + "step": 24259 + }, + { + "epoch": 7.446286065070596, + "grad_norm": 0.18606813251972198, + "learning_rate": 1.6153055058970508e-05, + "loss": 1.7118, + "step": 24260 + }, + { + "epoch": 7.44659300184162, + "grad_norm": 0.14817847311496735, + "learning_rate": 1.6149396699242914e-05, + "loss": 1.6385, + "step": 24261 + }, + { + "epoch": 7.4468999386126455, + "grad_norm": 0.19018684327602386, + "learning_rate": 1.6145738674046825e-05, + "loss": 1.7511, + "step": 24262 + }, + { + "epoch": 7.447206875383671, + "grad_norm": 0.17089374363422394, + "learning_rate": 1.6142080983418385e-05, + "loss": 1.7523, + "step": 24263 + }, + { + "epoch": 7.447513812154696, + "grad_norm": 0.16370832920074463, + "learning_rate": 1.613842362739375e-05, + "loss": 1.6636, + "step": 24264 + }, + { + "epoch": 7.4478207489257215, + "grad_norm": 0.16432829201221466, + "learning_rate": 1.6134766606009055e-05, + "loss": 1.7355, + "step": 24265 + }, + { + "epoch": 7.448127685696747, + "grad_norm": 0.15270906686782837, + "learning_rate": 1.6131109919300453e-05, + "loss": 1.7169, + "step": 24266 + }, + { + "epoch": 7.448434622467771, + "grad_norm": 0.14986950159072876, + "learning_rate": 1.6127453567304053e-05, + "loss": 1.7021, + "step": 24267 + }, + { + "epoch": 7.448741559238797, + "grad_norm": 0.17727383971214294, + "learning_rate": 1.6123797550056042e-05, + "loss": 1.7144, + "step": 24268 + }, + { + "epoch": 7.449048496009822, + "grad_norm": 0.1471523940563202, + "learning_rate": 1.6120141867592504e-05, + "loss": 1.694, + "step": 24269 + }, + { + "epoch": 7.449355432780847, + "grad_norm": 0.15561319887638092, + "learning_rate": 1.611648651994958e-05, + "loss": 1.6672, + "step": 24270 + }, + { + "epoch": 7.449662369551873, + "grad_norm": 0.19121745228767395, + "learning_rate": 1.61128315071634e-05, + "loss": 1.7317, + "step": 24271 + }, + { + "epoch": 7.449969306322897, + "grad_norm": 0.27333202958106995, + "learning_rate": 1.6109176829270062e-05, + "loss": 1.7943, + "step": 24272 + }, 
+ { + "epoch": 7.4502762430939224, + "grad_norm": 0.16996058821678162, + "learning_rate": 1.6105522486305736e-05, + "loss": 1.6883, + "step": 24273 + }, + { + "epoch": 7.450583179864948, + "grad_norm": 0.17687207460403442, + "learning_rate": 1.610186847830647e-05, + "loss": 1.6967, + "step": 24274 + }, + { + "epoch": 7.450890116635973, + "grad_norm": 0.2191249281167984, + "learning_rate": 1.6098214805308436e-05, + "loss": 1.7644, + "step": 24275 + }, + { + "epoch": 7.4511970534069984, + "grad_norm": 0.17267808318138123, + "learning_rate": 1.6094561467347684e-05, + "loss": 1.6963, + "step": 24276 + }, + { + "epoch": 7.451503990178024, + "grad_norm": 0.16276031732559204, + "learning_rate": 1.609090846446037e-05, + "loss": 1.6795, + "step": 24277 + }, + { + "epoch": 7.451810926949048, + "grad_norm": 0.16677677631378174, + "learning_rate": 1.6087255796682572e-05, + "loss": 1.699, + "step": 24278 + }, + { + "epoch": 7.452117863720074, + "grad_norm": 0.17163679003715515, + "learning_rate": 1.6083603464050383e-05, + "loss": 1.6906, + "step": 24279 + }, + { + "epoch": 7.452424800491099, + "grad_norm": 0.16087757050991058, + "learning_rate": 1.6079951466599908e-05, + "loss": 1.7173, + "step": 24280 + }, + { + "epoch": 7.452731737262124, + "grad_norm": 0.19389556348323822, + "learning_rate": 1.6076299804367228e-05, + "loss": 1.6985, + "step": 24281 + }, + { + "epoch": 7.45303867403315, + "grad_norm": 0.20400559902191162, + "learning_rate": 1.6072648477388447e-05, + "loss": 1.7336, + "step": 24282 + }, + { + "epoch": 7.453345610804174, + "grad_norm": 0.16443994641304016, + "learning_rate": 1.6068997485699632e-05, + "loss": 1.6909, + "step": 24283 + }, + { + "epoch": 7.453652547575199, + "grad_norm": 0.18333028256893158, + "learning_rate": 1.606534682933686e-05, + "loss": 1.6749, + "step": 24284 + }, + { + "epoch": 7.453959484346225, + "grad_norm": 0.21596840023994446, + "learning_rate": 1.6061696508336244e-05, + "loss": 1.7856, + "step": 24285 + }, + { + "epoch": 
7.45426642111725, + "grad_norm": 0.18656609952449799, + "learning_rate": 1.6058046522733827e-05, + "loss": 1.6892, + "step": 24286 + }, + { + "epoch": 7.454573357888275, + "grad_norm": 0.18110665678977966, + "learning_rate": 1.6054396872565687e-05, + "loss": 1.7063, + "step": 24287 + }, + { + "epoch": 7.4548802946593, + "grad_norm": 0.19452248513698578, + "learning_rate": 1.605074755786789e-05, + "loss": 1.7637, + "step": 24288 + }, + { + "epoch": 7.455187231430325, + "grad_norm": 0.18945640325546265, + "learning_rate": 1.604709857867649e-05, + "loss": 1.7498, + "step": 24289 + }, + { + "epoch": 7.4554941682013505, + "grad_norm": 0.1847696155309677, + "learning_rate": 1.6043449935027592e-05, + "loss": 1.702, + "step": 24290 + }, + { + "epoch": 7.455801104972376, + "grad_norm": 0.18882444500923157, + "learning_rate": 1.6039801626957197e-05, + "loss": 1.728, + "step": 24291 + }, + { + "epoch": 7.456108041743401, + "grad_norm": 0.1981150358915329, + "learning_rate": 1.603615365450142e-05, + "loss": 1.7114, + "step": 24292 + }, + { + "epoch": 7.456414978514426, + "grad_norm": 0.2305375188589096, + "learning_rate": 1.6032506017696242e-05, + "loss": 1.7234, + "step": 24293 + }, + { + "epoch": 7.456721915285451, + "grad_norm": 0.17539730668067932, + "learning_rate": 1.6028858716577764e-05, + "loss": 1.6305, + "step": 24294 + }, + { + "epoch": 7.457028852056476, + "grad_norm": 0.19684432446956635, + "learning_rate": 1.602521175118202e-05, + "loss": 1.6958, + "step": 24295 + }, + { + "epoch": 7.457335788827502, + "grad_norm": 0.20957234501838684, + "learning_rate": 1.602156512154504e-05, + "loss": 1.6984, + "step": 24296 + }, + { + "epoch": 7.457642725598527, + "grad_norm": 0.18523702025413513, + "learning_rate": 1.6017918827702877e-05, + "loss": 1.7817, + "step": 24297 + }, + { + "epoch": 7.457949662369552, + "grad_norm": 0.1964758187532425, + "learning_rate": 1.601427286969155e-05, + "loss": 1.7597, + "step": 24298 + }, + { + "epoch": 7.458256599140577, + "grad_norm": 
0.199961856007576, + "learning_rate": 1.6010627247547106e-05, + "loss": 1.6988, + "step": 24299 + }, + { + "epoch": 7.458563535911602, + "grad_norm": 0.16149461269378662, + "learning_rate": 1.6006981961305555e-05, + "loss": 1.6673, + "step": 24300 + }, + { + "epoch": 7.458870472682627, + "grad_norm": 0.2198258489370346, + "learning_rate": 1.600333701100293e-05, + "loss": 1.7159, + "step": 24301 + }, + { + "epoch": 7.459177409453653, + "grad_norm": 0.157994344830513, + "learning_rate": 1.5999692396675277e-05, + "loss": 1.7118, + "step": 24302 + }, + { + "epoch": 7.459484346224678, + "grad_norm": 0.21911758184432983, + "learning_rate": 1.5996048118358575e-05, + "loss": 1.7209, + "step": 24303 + }, + { + "epoch": 7.4597912829957025, + "grad_norm": 0.20648738741874695, + "learning_rate": 1.599240417608886e-05, + "loss": 1.7844, + "step": 24304 + }, + { + "epoch": 7.460098219766728, + "grad_norm": 0.18746837973594666, + "learning_rate": 1.598876056990214e-05, + "loss": 1.7079, + "step": 24305 + }, + { + "epoch": 7.460405156537753, + "grad_norm": 0.17767341434955597, + "learning_rate": 1.5985117299834407e-05, + "loss": 1.7579, + "step": 24306 + }, + { + "epoch": 7.4607120933087785, + "grad_norm": 0.18997585773468018, + "learning_rate": 1.598147436592171e-05, + "loss": 1.7556, + "step": 24307 + }, + { + "epoch": 7.461019030079804, + "grad_norm": 0.19356711208820343, + "learning_rate": 1.597783176819999e-05, + "loss": 1.7315, + "step": 24308 + }, + { + "epoch": 7.461325966850829, + "grad_norm": 0.23354102671146393, + "learning_rate": 1.597418950670531e-05, + "loss": 1.7622, + "step": 24309 + }, + { + "epoch": 7.461632903621854, + "grad_norm": 0.18773409724235535, + "learning_rate": 1.5970547581473604e-05, + "loss": 1.6582, + "step": 24310 + }, + { + "epoch": 7.461939840392879, + "grad_norm": 0.23704196512699127, + "learning_rate": 1.596690599254091e-05, + "loss": 1.7207, + "step": 24311 + }, + { + "epoch": 7.462246777163904, + "grad_norm": 0.1943788379430771, + 
"learning_rate": 1.596326473994319e-05, + "loss": 1.696, + "step": 24312 + }, + { + "epoch": 7.46255371393493, + "grad_norm": 0.22303985059261322, + "learning_rate": 1.595962382371644e-05, + "loss": 1.6963, + "step": 24313 + }, + { + "epoch": 7.462860650705955, + "grad_norm": 0.20158524811267853, + "learning_rate": 1.5955983243896643e-05, + "loss": 1.7017, + "step": 24314 + }, + { + "epoch": 7.463167587476979, + "grad_norm": 0.18768194317817688, + "learning_rate": 1.595234300051977e-05, + "loss": 1.6743, + "step": 24315 + }, + { + "epoch": 7.463474524248005, + "grad_norm": 0.27407020330429077, + "learning_rate": 1.5948703093621803e-05, + "loss": 1.7522, + "step": 24316 + }, + { + "epoch": 7.46378146101903, + "grad_norm": 0.2027997523546219, + "learning_rate": 1.5945063523238706e-05, + "loss": 1.7515, + "step": 24317 + }, + { + "epoch": 7.464088397790055, + "grad_norm": 0.2728271782398224, + "learning_rate": 1.5941424289406454e-05, + "loss": 1.7611, + "step": 24318 + }, + { + "epoch": 7.464395334561081, + "grad_norm": 0.1704578548669815, + "learning_rate": 1.593778539216101e-05, + "loss": 1.6602, + "step": 24319 + }, + { + "epoch": 7.464702271332105, + "grad_norm": 0.19684311747550964, + "learning_rate": 1.5934146831538332e-05, + "loss": 1.6824, + "step": 24320 + }, + { + "epoch": 7.4650092081031305, + "grad_norm": 0.196905255317688, + "learning_rate": 1.5930508607574386e-05, + "loss": 1.691, + "step": 24321 + }, + { + "epoch": 7.465316144874156, + "grad_norm": 0.18543855845928192, + "learning_rate": 1.5926870720305122e-05, + "loss": 1.6936, + "step": 24322 + }, + { + "epoch": 7.465623081645181, + "grad_norm": 0.24634000658988953, + "learning_rate": 1.592323316976647e-05, + "loss": 1.6857, + "step": 24323 + }, + { + "epoch": 7.4659300184162065, + "grad_norm": 0.1976090669631958, + "learning_rate": 1.5919595955994444e-05, + "loss": 1.7248, + "step": 24324 + }, + { + "epoch": 7.466236955187231, + "grad_norm": 0.21902409195899963, + "learning_rate": 
1.5915959079024907e-05, + "loss": 1.7184, + "step": 24325 + }, + { + "epoch": 7.466543891958256, + "grad_norm": 0.14501455426216125, + "learning_rate": 1.591232253889387e-05, + "loss": 1.6351, + "step": 24326 + }, + { + "epoch": 7.466850828729282, + "grad_norm": 0.20591090619564056, + "learning_rate": 1.5908686335637213e-05, + "loss": 1.7188, + "step": 24327 + }, + { + "epoch": 7.467157765500307, + "grad_norm": 0.17669445276260376, + "learning_rate": 1.590505046929091e-05, + "loss": 1.6735, + "step": 24328 + }, + { + "epoch": 7.467464702271332, + "grad_norm": 0.19642697274684906, + "learning_rate": 1.590141493989089e-05, + "loss": 1.6599, + "step": 24329 + }, + { + "epoch": 7.467771639042358, + "grad_norm": 0.2049490511417389, + "learning_rate": 1.589777974747307e-05, + "loss": 1.77, + "step": 24330 + }, + { + "epoch": 7.468078575813382, + "grad_norm": 0.1877276450395584, + "learning_rate": 1.5894144892073377e-05, + "loss": 1.6774, + "step": 24331 + }, + { + "epoch": 7.468385512584407, + "grad_norm": 0.18437768518924713, + "learning_rate": 1.5890510373727735e-05, + "loss": 1.7054, + "step": 24332 + }, + { + "epoch": 7.468692449355433, + "grad_norm": 0.1850978136062622, + "learning_rate": 1.5886876192472062e-05, + "loss": 1.6664, + "step": 24333 + }, + { + "epoch": 7.468999386126458, + "grad_norm": 0.16257111728191376, + "learning_rate": 1.588324234834227e-05, + "loss": 1.7438, + "step": 24334 + }, + { + "epoch": 7.469306322897483, + "grad_norm": 0.1776656061410904, + "learning_rate": 1.5879608841374277e-05, + "loss": 1.6913, + "step": 24335 + }, + { + "epoch": 7.469613259668508, + "grad_norm": 0.183144673705101, + "learning_rate": 1.587597567160398e-05, + "loss": 1.6737, + "step": 24336 + }, + { + "epoch": 7.469920196439533, + "grad_norm": 0.15030701458454132, + "learning_rate": 1.5872342839067306e-05, + "loss": 1.6776, + "step": 24337 + }, + { + "epoch": 7.4702271332105585, + "grad_norm": 0.1987701952457428, + "learning_rate": 1.586871034380013e-05, + "loss": 
1.7119, + "step": 24338 + }, + { + "epoch": 7.470534069981584, + "grad_norm": 0.20000997185707092, + "learning_rate": 1.5865078185838373e-05, + "loss": 1.6794, + "step": 24339 + }, + { + "epoch": 7.470841006752609, + "grad_norm": 0.1674201786518097, + "learning_rate": 1.5861446365217902e-05, + "loss": 1.6826, + "step": 24340 + }, + { + "epoch": 7.4711479435236345, + "grad_norm": 0.22385969758033752, + "learning_rate": 1.585781488197466e-05, + "loss": 1.7012, + "step": 24341 + }, + { + "epoch": 7.471454880294659, + "grad_norm": 0.18635201454162598, + "learning_rate": 1.585418373614446e-05, + "loss": 1.7086, + "step": 24342 + }, + { + "epoch": 7.471761817065684, + "grad_norm": 0.17345300316810608, + "learning_rate": 1.5850552927763274e-05, + "loss": 1.7068, + "step": 24343 + }, + { + "epoch": 7.47206875383671, + "grad_norm": 0.1777433305978775, + "learning_rate": 1.5846922456866904e-05, + "loss": 1.6618, + "step": 24344 + }, + { + "epoch": 7.472375690607735, + "grad_norm": 0.1821276843547821, + "learning_rate": 1.584329232349128e-05, + "loss": 1.7451, + "step": 24345 + }, + { + "epoch": 7.47268262737876, + "grad_norm": 0.1714404970407486, + "learning_rate": 1.5839662527672262e-05, + "loss": 1.7289, + "step": 24346 + }, + { + "epoch": 7.472989564149785, + "grad_norm": 0.159423828125, + "learning_rate": 1.583603306944572e-05, + "loss": 1.667, + "step": 24347 + }, + { + "epoch": 7.47329650092081, + "grad_norm": 0.22563552856445312, + "learning_rate": 1.5832403948847523e-05, + "loss": 1.7755, + "step": 24348 + }, + { + "epoch": 7.473603437691835, + "grad_norm": 0.17239433526992798, + "learning_rate": 1.582877516591354e-05, + "loss": 1.6577, + "step": 24349 + }, + { + "epoch": 7.473910374462861, + "grad_norm": 0.1671951860189438, + "learning_rate": 1.5825146720679624e-05, + "loss": 1.7438, + "step": 24350 + }, + { + "epoch": 7.474217311233886, + "grad_norm": 0.1802397519350052, + "learning_rate": 1.582151861318164e-05, + "loss": 1.686, + "step": 24351 + }, + { + "epoch": 
7.474524248004911, + "grad_norm": 0.21424922347068787, + "learning_rate": 1.5817890843455442e-05, + "loss": 1.7871, + "step": 24352 + }, + { + "epoch": 7.474831184775936, + "grad_norm": 0.2275305986404419, + "learning_rate": 1.5814263411536884e-05, + "loss": 1.7461, + "step": 24353 + }, + { + "epoch": 7.475138121546961, + "grad_norm": 0.1682458072900772, + "learning_rate": 1.581063631746181e-05, + "loss": 1.6362, + "step": 24354 + }, + { + "epoch": 7.475445058317987, + "grad_norm": 0.165358304977417, + "learning_rate": 1.5807009561266068e-05, + "loss": 1.7057, + "step": 24355 + }, + { + "epoch": 7.475751995089012, + "grad_norm": 0.18032164871692657, + "learning_rate": 1.5803383142985496e-05, + "loss": 1.7645, + "step": 24356 + }, + { + "epoch": 7.476058931860037, + "grad_norm": 0.1694670170545578, + "learning_rate": 1.5799757062655935e-05, + "loss": 1.6848, + "step": 24357 + }, + { + "epoch": 7.476365868631062, + "grad_norm": 0.17879679799079895, + "learning_rate": 1.5796131320313225e-05, + "loss": 1.7425, + "step": 24358 + }, + { + "epoch": 7.476672805402087, + "grad_norm": 0.16042493283748627, + "learning_rate": 1.579250591599317e-05, + "loss": 1.6389, + "step": 24359 + }, + { + "epoch": 7.476979742173112, + "grad_norm": 0.19134685397148132, + "learning_rate": 1.5788880849731658e-05, + "loss": 1.7504, + "step": 24360 + }, + { + "epoch": 7.477286678944138, + "grad_norm": 0.16545429825782776, + "learning_rate": 1.578525612156444e-05, + "loss": 1.7184, + "step": 24361 + }, + { + "epoch": 7.477593615715163, + "grad_norm": 0.18139231204986572, + "learning_rate": 1.5781631731527397e-05, + "loss": 1.6794, + "step": 24362 + }, + { + "epoch": 7.4779005524861875, + "grad_norm": 0.19043901562690735, + "learning_rate": 1.5778007679656326e-05, + "loss": 1.7184, + "step": 24363 + }, + { + "epoch": 7.478207489257213, + "grad_norm": 0.19410157203674316, + "learning_rate": 1.577438396598703e-05, + "loss": 1.7599, + "step": 24364 + }, + { + "epoch": 7.478514426028238, + 
"grad_norm": 0.18464741110801697, + "learning_rate": 1.5770760590555344e-05, + "loss": 1.652, + "step": 24365 + }, + { + "epoch": 7.4788213627992635, + "grad_norm": 0.19959059357643127, + "learning_rate": 1.576713755339706e-05, + "loss": 1.7509, + "step": 24366 + }, + { + "epoch": 7.479128299570289, + "grad_norm": 0.20312312245368958, + "learning_rate": 1.576351485454799e-05, + "loss": 1.758, + "step": 24367 + }, + { + "epoch": 7.479435236341313, + "grad_norm": 0.23994365334510803, + "learning_rate": 1.5759892494043933e-05, + "loss": 1.7124, + "step": 24368 + }, + { + "epoch": 7.479742173112339, + "grad_norm": 0.22661323845386505, + "learning_rate": 1.575627047192068e-05, + "loss": 1.7251, + "step": 24369 + }, + { + "epoch": 7.480049109883364, + "grad_norm": 0.2599529027938843, + "learning_rate": 1.5752648788214038e-05, + "loss": 1.7351, + "step": 24370 + }, + { + "epoch": 7.480356046654389, + "grad_norm": 0.17298145592212677, + "learning_rate": 1.5749027442959795e-05, + "loss": 1.681, + "step": 24371 + }, + { + "epoch": 7.480662983425415, + "grad_norm": 0.18189257383346558, + "learning_rate": 1.574540643619373e-05, + "loss": 1.6938, + "step": 24372 + }, + { + "epoch": 7.48096992019644, + "grad_norm": 0.2658606767654419, + "learning_rate": 1.5741785767951645e-05, + "loss": 1.7043, + "step": 24373 + }, + { + "epoch": 7.481276856967464, + "grad_norm": 0.17898595333099365, + "learning_rate": 1.573816543826931e-05, + "loss": 1.7299, + "step": 24374 + }, + { + "epoch": 7.48158379373849, + "grad_norm": 0.2529693841934204, + "learning_rate": 1.573454544718251e-05, + "loss": 1.6378, + "step": 24375 + }, + { + "epoch": 7.481890730509515, + "grad_norm": 0.1542833298444748, + "learning_rate": 1.5730925794726993e-05, + "loss": 1.6847, + "step": 24376 + }, + { + "epoch": 7.48219766728054, + "grad_norm": 0.24731594324111938, + "learning_rate": 1.5727306480938586e-05, + "loss": 1.7028, + "step": 24377 + }, + { + "epoch": 7.482504604051566, + "grad_norm": 0.21095556020736694, + 
"learning_rate": 1.572368750585299e-05, + "loss": 1.7371, + "step": 24378 + }, + { + "epoch": 7.48281154082259, + "grad_norm": 0.24208855628967285, + "learning_rate": 1.5720068869506037e-05, + "loss": 1.7982, + "step": 24379 + }, + { + "epoch": 7.4831184775936155, + "grad_norm": 0.23290614783763885, + "learning_rate": 1.571645057193343e-05, + "loss": 1.7443, + "step": 24380 + }, + { + "epoch": 7.483425414364641, + "grad_norm": 0.2146376222372055, + "learning_rate": 1.5712832613170963e-05, + "loss": 1.7258, + "step": 24381 + }, + { + "epoch": 7.483732351135666, + "grad_norm": 0.20540264248847961, + "learning_rate": 1.5709214993254385e-05, + "loss": 1.6495, + "step": 24382 + }, + { + "epoch": 7.4840392879066915, + "grad_norm": 0.16472755372524261, + "learning_rate": 1.570559771221944e-05, + "loss": 1.7118, + "step": 24383 + }, + { + "epoch": 7.484346224677717, + "grad_norm": 0.194668248295784, + "learning_rate": 1.5701980770101876e-05, + "loss": 1.6948, + "step": 24384 + }, + { + "epoch": 7.484653161448741, + "grad_norm": 0.19188909232616425, + "learning_rate": 1.569836416693744e-05, + "loss": 1.7376, + "step": 24385 + }, + { + "epoch": 7.484960098219767, + "grad_norm": 0.1935901939868927, + "learning_rate": 1.569474790276188e-05, + "loss": 1.7009, + "step": 24386 + }, + { + "epoch": 7.485267034990792, + "grad_norm": 0.18449221551418304, + "learning_rate": 1.5691131977610924e-05, + "loss": 1.7542, + "step": 24387 + }, + { + "epoch": 7.485573971761817, + "grad_norm": 0.18543820083141327, + "learning_rate": 1.568751639152031e-05, + "loss": 1.7125, + "step": 24388 + }, + { + "epoch": 7.485880908532843, + "grad_norm": 0.17343461513519287, + "learning_rate": 1.5683901144525776e-05, + "loss": 1.7189, + "step": 24389 + }, + { + "epoch": 7.486187845303867, + "grad_norm": 0.16813276708126068, + "learning_rate": 1.568028623666304e-05, + "loss": 1.6416, + "step": 24390 + }, + { + "epoch": 7.486494782074892, + "grad_norm": 0.16296882927417755, + "learning_rate": 
1.567667166796783e-05, + "loss": 1.6971, + "step": 24391 + }, + { + "epoch": 7.486801718845918, + "grad_norm": 0.206793412566185, + "learning_rate": 1.5673057438475875e-05, + "loss": 1.8139, + "step": 24392 + }, + { + "epoch": 7.487108655616943, + "grad_norm": 0.1937340795993805, + "learning_rate": 1.566944354822286e-05, + "loss": 1.7606, + "step": 24393 + }, + { + "epoch": 7.487415592387968, + "grad_norm": 0.19251857697963715, + "learning_rate": 1.566582999724456e-05, + "loss": 1.7225, + "step": 24394 + }, + { + "epoch": 7.487722529158993, + "grad_norm": 0.1551857739686966, + "learning_rate": 1.566221678557663e-05, + "loss": 1.6546, + "step": 24395 + }, + { + "epoch": 7.488029465930018, + "grad_norm": 0.19435563683509827, + "learning_rate": 1.565860391325482e-05, + "loss": 1.7444, + "step": 24396 + }, + { + "epoch": 7.4883364027010435, + "grad_norm": 0.21196971833705902, + "learning_rate": 1.565499138031479e-05, + "loss": 1.7124, + "step": 24397 + }, + { + "epoch": 7.488643339472069, + "grad_norm": 0.2145242542028427, + "learning_rate": 1.5651379186792276e-05, + "loss": 1.7571, + "step": 24398 + }, + { + "epoch": 7.488950276243094, + "grad_norm": 0.17056338489055634, + "learning_rate": 1.5647767332722964e-05, + "loss": 1.6514, + "step": 24399 + }, + { + "epoch": 7.4892572130141195, + "grad_norm": 0.17161786556243896, + "learning_rate": 1.5644155818142553e-05, + "loss": 1.675, + "step": 24400 + }, + { + "epoch": 7.489564149785144, + "grad_norm": 0.18978877365589142, + "learning_rate": 1.564054464308673e-05, + "loss": 1.7123, + "step": 24401 + }, + { + "epoch": 7.489871086556169, + "grad_norm": 0.16004881262779236, + "learning_rate": 1.5636933807591186e-05, + "loss": 1.6555, + "step": 24402 + }, + { + "epoch": 7.490178023327195, + "grad_norm": 0.19739225506782532, + "learning_rate": 1.56333233116916e-05, + "loss": 1.7441, + "step": 24403 + }, + { + "epoch": 7.49048496009822, + "grad_norm": 0.20770032703876495, + "learning_rate": 1.5629713155423657e-05, + "loss": 
1.6704, + "step": 24404 + }, + { + "epoch": 7.490791896869245, + "grad_norm": 0.17897675931453705, + "learning_rate": 1.5626103338823033e-05, + "loss": 1.7281, + "step": 24405 + }, + { + "epoch": 7.49109883364027, + "grad_norm": 0.20801669359207153, + "learning_rate": 1.5622493861925402e-05, + "loss": 1.7008, + "step": 24406 + }, + { + "epoch": 7.491405770411295, + "grad_norm": 0.2027266025543213, + "learning_rate": 1.5618884724766442e-05, + "loss": 1.7619, + "step": 24407 + }, + { + "epoch": 7.49171270718232, + "grad_norm": 0.19207318127155304, + "learning_rate": 1.5615275927381806e-05, + "loss": 1.6985, + "step": 24408 + }, + { + "epoch": 7.492019643953346, + "grad_norm": 0.19694732129573822, + "learning_rate": 1.5611667469807175e-05, + "loss": 1.7455, + "step": 24409 + }, + { + "epoch": 7.492326580724371, + "grad_norm": 0.170238196849823, + "learning_rate": 1.560805935207818e-05, + "loss": 1.7179, + "step": 24410 + }, + { + "epoch": 7.4926335174953955, + "grad_norm": 0.16890759766101837, + "learning_rate": 1.5604451574230532e-05, + "loss": 1.7323, + "step": 24411 + }, + { + "epoch": 7.492940454266421, + "grad_norm": 0.18043142557144165, + "learning_rate": 1.5600844136299824e-05, + "loss": 1.6958, + "step": 24412 + }, + { + "epoch": 7.493247391037446, + "grad_norm": 0.23966364562511444, + "learning_rate": 1.5597237038321764e-05, + "loss": 1.754, + "step": 24413 + }, + { + "epoch": 7.4935543278084715, + "grad_norm": 0.23342584073543549, + "learning_rate": 1.5593630280331945e-05, + "loss": 1.8008, + "step": 24414 + }, + { + "epoch": 7.493861264579497, + "grad_norm": 0.17365418374538422, + "learning_rate": 1.5590023862366054e-05, + "loss": 1.7166, + "step": 24415 + }, + { + "epoch": 7.494168201350522, + "grad_norm": 0.1934911608695984, + "learning_rate": 1.558641778445971e-05, + "loss": 1.7113, + "step": 24416 + }, + { + "epoch": 7.494475138121547, + "grad_norm": 0.1935805231332779, + "learning_rate": 1.558281204664856e-05, + "loss": 1.7549, + "step": 24417 + }, + { 
+ "epoch": 7.494782074892572, + "grad_norm": 0.18467992544174194, + "learning_rate": 1.5579206648968236e-05, + "loss": 1.6889, + "step": 24418 + }, + { + "epoch": 7.495089011663597, + "grad_norm": 0.17173317074775696, + "learning_rate": 1.5575601591454365e-05, + "loss": 1.686, + "step": 24419 + }, + { + "epoch": 7.495395948434623, + "grad_norm": 0.1706855744123459, + "learning_rate": 1.5571996874142574e-05, + "loss": 1.6747, + "step": 24420 + }, + { + "epoch": 7.495702885205648, + "grad_norm": 0.2233184576034546, + "learning_rate": 1.556839249706849e-05, + "loss": 1.7855, + "step": 24421 + }, + { + "epoch": 7.496009821976672, + "grad_norm": 0.22118456661701202, + "learning_rate": 1.5564788460267733e-05, + "loss": 1.7487, + "step": 24422 + }, + { + "epoch": 7.496316758747698, + "grad_norm": 0.21284142136573792, + "learning_rate": 1.5561184763775916e-05, + "loss": 1.7367, + "step": 24423 + }, + { + "epoch": 7.496623695518723, + "grad_norm": 0.17366403341293335, + "learning_rate": 1.5557581407628656e-05, + "loss": 1.655, + "step": 24424 + }, + { + "epoch": 7.496930632289748, + "grad_norm": 0.19864381849765778, + "learning_rate": 1.555397839186157e-05, + "loss": 1.6691, + "step": 24425 + }, + { + "epoch": 7.497237569060774, + "grad_norm": 0.1787605881690979, + "learning_rate": 1.555037571651025e-05, + "loss": 1.7063, + "step": 24426 + }, + { + "epoch": 7.497544505831799, + "grad_norm": 0.19520068168640137, + "learning_rate": 1.5546773381610302e-05, + "loss": 1.7044, + "step": 24427 + }, + { + "epoch": 7.4978514426028235, + "grad_norm": 0.18771123886108398, + "learning_rate": 1.5543171387197362e-05, + "loss": 1.6959, + "step": 24428 + }, + { + "epoch": 7.498158379373849, + "grad_norm": 0.21876849234104156, + "learning_rate": 1.5539569733306964e-05, + "loss": 1.7486, + "step": 24429 + }, + { + "epoch": 7.498465316144874, + "grad_norm": 0.21685563027858734, + "learning_rate": 1.5535968419974772e-05, + "loss": 1.7541, + "step": 24430 + }, + { + "epoch": 7.4987722529158995, 
+ "grad_norm": 0.19595225155353546, + "learning_rate": 1.5532367447236307e-05, + "loss": 1.6882, + "step": 24431 + }, + { + "epoch": 7.499079189686925, + "grad_norm": 0.18359199166297913, + "learning_rate": 1.5528766815127198e-05, + "loss": 1.687, + "step": 24432 + }, + { + "epoch": 7.499386126457949, + "grad_norm": 0.17955231666564941, + "learning_rate": 1.5525166523683028e-05, + "loss": 1.6759, + "step": 24433 + }, + { + "epoch": 7.499693063228975, + "grad_norm": 0.18786758184432983, + "learning_rate": 1.5521566572939368e-05, + "loss": 1.7118, + "step": 24434 + }, + { + "epoch": 7.5, + "grad_norm": 0.16672605276107788, + "learning_rate": 1.551796696293179e-05, + "loss": 1.6618, + "step": 24435 + }, + { + "epoch": 7.500306936771025, + "grad_norm": 0.17066839337348938, + "learning_rate": 1.5514367693695875e-05, + "loss": 1.6974, + "step": 24436 + }, + { + "epoch": 7.500613873542051, + "grad_norm": 0.17299650609493256, + "learning_rate": 1.5510768765267193e-05, + "loss": 1.7074, + "step": 24437 + }, + { + "epoch": 7.500920810313076, + "grad_norm": 0.17507639527320862, + "learning_rate": 1.5507170177681306e-05, + "loss": 1.7295, + "step": 24438 + }, + { + "epoch": 7.5012277470841005, + "grad_norm": 0.1909082531929016, + "learning_rate": 1.5503571930973786e-05, + "loss": 1.7153, + "step": 24439 + }, + { + "epoch": 7.501534683855126, + "grad_norm": 0.2334289401769638, + "learning_rate": 1.5499974025180185e-05, + "loss": 1.713, + "step": 24440 + }, + { + "epoch": 7.501841620626151, + "grad_norm": 0.18382340669631958, + "learning_rate": 1.5496376460336058e-05, + "loss": 1.6706, + "step": 24441 + }, + { + "epoch": 7.5021485573971765, + "grad_norm": 0.1901310533285141, + "learning_rate": 1.5492779236476967e-05, + "loss": 1.7106, + "step": 24442 + }, + { + "epoch": 7.502455494168201, + "grad_norm": 0.17336180806159973, + "learning_rate": 1.5489182353638452e-05, + "loss": 1.7467, + "step": 24443 + }, + { + "epoch": 7.502762430939226, + "grad_norm": 0.18670998513698578, + 
"learning_rate": 1.548558581185605e-05, + "loss": 1.7101, + "step": 24444 + }, + { + "epoch": 7.503069367710252, + "grad_norm": 0.18341238796710968, + "learning_rate": 1.5481989611165353e-05, + "loss": 1.719, + "step": 24445 + }, + { + "epoch": 7.503376304481277, + "grad_norm": 0.21832694113254547, + "learning_rate": 1.5478393751601833e-05, + "loss": 1.7143, + "step": 24446 + }, + { + "epoch": 7.503683241252302, + "grad_norm": 0.1715303659439087, + "learning_rate": 1.5474798233201094e-05, + "loss": 1.6962, + "step": 24447 + }, + { + "epoch": 7.503990178023328, + "grad_norm": 0.26411953568458557, + "learning_rate": 1.5471203055998595e-05, + "loss": 1.7182, + "step": 24448 + }, + { + "epoch": 7.504297114794352, + "grad_norm": 0.1646965742111206, + "learning_rate": 1.5467608220029926e-05, + "loss": 1.6979, + "step": 24449 + }, + { + "epoch": 7.504604051565377, + "grad_norm": 0.1664915233850479, + "learning_rate": 1.5464013725330595e-05, + "loss": 1.6809, + "step": 24450 + }, + { + "epoch": 7.504910988336403, + "grad_norm": 0.1711970716714859, + "learning_rate": 1.5460419571936125e-05, + "loss": 1.6975, + "step": 24451 + }, + { + "epoch": 7.505217925107428, + "grad_norm": 0.19235998392105103, + "learning_rate": 1.5456825759882028e-05, + "loss": 1.7515, + "step": 24452 + }, + { + "epoch": 7.505524861878453, + "grad_norm": 0.2137441486120224, + "learning_rate": 1.5453232289203822e-05, + "loss": 1.7575, + "step": 24453 + }, + { + "epoch": 7.505831798649478, + "grad_norm": 0.19337041676044464, + "learning_rate": 1.544963915993703e-05, + "loss": 1.776, + "step": 24454 + }, + { + "epoch": 7.506138735420503, + "grad_norm": 0.227366104722023, + "learning_rate": 1.5446046372117152e-05, + "loss": 1.7736, + "step": 24455 + }, + { + "epoch": 7.5064456721915285, + "grad_norm": 0.1712712198495865, + "learning_rate": 1.5442453925779694e-05, + "loss": 1.6663, + "step": 24456 + }, + { + "epoch": 7.506752608962554, + "grad_norm": 0.19359993934631348, + "learning_rate": 
1.5438861820960164e-05, + "loss": 1.6826, + "step": 24457 + }, + { + "epoch": 7.507059545733579, + "grad_norm": 0.22883851826190948, + "learning_rate": 1.5435270057694056e-05, + "loss": 1.7782, + "step": 24458 + }, + { + "epoch": 7.5073664825046045, + "grad_norm": 0.17109328508377075, + "learning_rate": 1.543167863601687e-05, + "loss": 1.7435, + "step": 24459 + }, + { + "epoch": 7.507673419275629, + "grad_norm": 0.21545098721981049, + "learning_rate": 1.54280875559641e-05, + "loss": 1.7277, + "step": 24460 + }, + { + "epoch": 7.507980356046654, + "grad_norm": 0.18345774710178375, + "learning_rate": 1.542449681757121e-05, + "loss": 1.7255, + "step": 24461 + }, + { + "epoch": 7.50828729281768, + "grad_norm": 0.15472757816314697, + "learning_rate": 1.5420906420873744e-05, + "loss": 1.6615, + "step": 24462 + }, + { + "epoch": 7.508594229588705, + "grad_norm": 0.2084251195192337, + "learning_rate": 1.5417316365907113e-05, + "loss": 1.6747, + "step": 24463 + }, + { + "epoch": 7.50890116635973, + "grad_norm": 0.19010984897613525, + "learning_rate": 1.5413726652706868e-05, + "loss": 1.7188, + "step": 24464 + }, + { + "epoch": 7.509208103130755, + "grad_norm": 0.22481444478034973, + "learning_rate": 1.5410137281308408e-05, + "loss": 1.8028, + "step": 24465 + }, + { + "epoch": 7.50951503990178, + "grad_norm": 0.22309516370296478, + "learning_rate": 1.5406548251747266e-05, + "loss": 1.7806, + "step": 24466 + }, + { + "epoch": 7.509821976672805, + "grad_norm": 0.19050204753875732, + "learning_rate": 1.540295956405889e-05, + "loss": 1.7188, + "step": 24467 + }, + { + "epoch": 7.510128913443831, + "grad_norm": 0.1956445276737213, + "learning_rate": 1.5399371218278745e-05, + "loss": 1.7468, + "step": 24468 + }, + { + "epoch": 7.510435850214856, + "grad_norm": 0.3492142856121063, + "learning_rate": 1.5395783214442294e-05, + "loss": 1.7502, + "step": 24469 + }, + { + "epoch": 7.510742786985881, + "grad_norm": 0.15318654477596283, + "learning_rate": 1.5392195552584997e-05, + "loss": 
1.6782, + "step": 24470 + }, + { + "epoch": 7.511049723756906, + "grad_norm": 0.18576723337173462, + "learning_rate": 1.5388608232742308e-05, + "loss": 1.7455, + "step": 24471 + }, + { + "epoch": 7.511356660527931, + "grad_norm": 0.14923253655433655, + "learning_rate": 1.5385021254949677e-05, + "loss": 1.687, + "step": 24472 + }, + { + "epoch": 7.5116635972989565, + "grad_norm": 0.17453742027282715, + "learning_rate": 1.5381434619242553e-05, + "loss": 1.7072, + "step": 24473 + }, + { + "epoch": 7.511970534069982, + "grad_norm": 0.18869875371456146, + "learning_rate": 1.5377848325656384e-05, + "loss": 1.7681, + "step": 24474 + }, + { + "epoch": 7.512277470841006, + "grad_norm": 0.22205953299999237, + "learning_rate": 1.5374262374226612e-05, + "loss": 1.7526, + "step": 24475 + }, + { + "epoch": 7.512584407612032, + "grad_norm": 0.1634155809879303, + "learning_rate": 1.537067676498867e-05, + "loss": 1.704, + "step": 24476 + }, + { + "epoch": 7.512891344383057, + "grad_norm": 0.19530873000621796, + "learning_rate": 1.5367091497978004e-05, + "loss": 1.7469, + "step": 24477 + }, + { + "epoch": 7.513198281154082, + "grad_norm": 0.17038139700889587, + "learning_rate": 1.5363506573230017e-05, + "loss": 1.6363, + "step": 24478 + }, + { + "epoch": 7.513505217925108, + "grad_norm": 0.17695361375808716, + "learning_rate": 1.535992199078019e-05, + "loss": 1.7191, + "step": 24479 + }, + { + "epoch": 7.513812154696133, + "grad_norm": 0.2216692715883255, + "learning_rate": 1.535633775066389e-05, + "loss": 1.8042, + "step": 24480 + }, + { + "epoch": 7.514119091467157, + "grad_norm": 0.16862058639526367, + "learning_rate": 1.5352753852916595e-05, + "loss": 1.697, + "step": 24481 + }, + { + "epoch": 7.514426028238183, + "grad_norm": 0.20376496016979218, + "learning_rate": 1.5349170297573662e-05, + "loss": 1.7274, + "step": 24482 + }, + { + "epoch": 7.514732965009208, + "grad_norm": 0.16290763020515442, + "learning_rate": 1.5345587084670554e-05, + "loss": 1.6929, + "step": 24483 + }, + 
{ + "epoch": 7.515039901780233, + "grad_norm": 0.21416328847408295, + "learning_rate": 1.5342004214242667e-05, + "loss": 1.756, + "step": 24484 + }, + { + "epoch": 7.515346838551259, + "grad_norm": 0.14708222448825836, + "learning_rate": 1.533842168632541e-05, + "loss": 1.6816, + "step": 24485 + }, + { + "epoch": 7.515653775322283, + "grad_norm": 0.1860494166612625, + "learning_rate": 1.5334839500954178e-05, + "loss": 1.7114, + "step": 24486 + }, + { + "epoch": 7.5159607120933085, + "grad_norm": 0.16551998257637024, + "learning_rate": 1.533125765816439e-05, + "loss": 1.6564, + "step": 24487 + }, + { + "epoch": 7.516267648864334, + "grad_norm": 0.16971731185913086, + "learning_rate": 1.5327676157991428e-05, + "loss": 1.6722, + "step": 24488 + }, + { + "epoch": 7.516574585635359, + "grad_norm": 0.17433905601501465, + "learning_rate": 1.532409500047069e-05, + "loss": 1.6944, + "step": 24489 + }, + { + "epoch": 7.5168815224063845, + "grad_norm": 0.15625490248203278, + "learning_rate": 1.5320514185637575e-05, + "loss": 1.6997, + "step": 24490 + }, + { + "epoch": 7.51718845917741, + "grad_norm": 0.19038623571395874, + "learning_rate": 1.531693371352746e-05, + "loss": 1.6999, + "step": 24491 + }, + { + "epoch": 7.517495395948434, + "grad_norm": 0.16037517786026, + "learning_rate": 1.5313353584175736e-05, + "loss": 1.6568, + "step": 24492 + }, + { + "epoch": 7.51780233271946, + "grad_norm": 0.1515430361032486, + "learning_rate": 1.5309773797617787e-05, + "loss": 1.693, + "step": 24493 + }, + { + "epoch": 7.518109269490485, + "grad_norm": 0.1792028695344925, + "learning_rate": 1.530619435388898e-05, + "loss": 1.7034, + "step": 24494 + }, + { + "epoch": 7.51841620626151, + "grad_norm": 0.18456964194774628, + "learning_rate": 1.530261525302468e-05, + "loss": 1.7565, + "step": 24495 + }, + { + "epoch": 7.518723143032536, + "grad_norm": 0.17504090070724487, + "learning_rate": 1.529903649506031e-05, + "loss": 1.7121, + "step": 24496 + }, + { + "epoch": 7.51903007980356, + 
"grad_norm": 0.19688715040683746, + "learning_rate": 1.529545808003116e-05, + "loss": 1.7507, + "step": 24497 + }, + { + "epoch": 7.519337016574585, + "grad_norm": 0.21039338409900665, + "learning_rate": 1.529188000797267e-05, + "loss": 1.709, + "step": 24498 + }, + { + "epoch": 7.519643953345611, + "grad_norm": 0.18255522847175598, + "learning_rate": 1.5288302278920136e-05, + "loss": 1.7497, + "step": 24499 + }, + { + "epoch": 7.519950890116636, + "grad_norm": 0.19913412630558014, + "learning_rate": 1.5284724892908958e-05, + "loss": 1.7244, + "step": 24500 + }, + { + "epoch": 7.520257826887661, + "grad_norm": 0.15792223811149597, + "learning_rate": 1.5281147849974476e-05, + "loss": 1.6916, + "step": 24501 + }, + { + "epoch": 7.520564763658687, + "grad_norm": 0.2078406661748886, + "learning_rate": 1.5277571150152038e-05, + "loss": 1.6959, + "step": 24502 + }, + { + "epoch": 7.520871700429711, + "grad_norm": 0.15596020221710205, + "learning_rate": 1.5273994793477e-05, + "loss": 1.7217, + "step": 24503 + }, + { + "epoch": 7.5211786372007365, + "grad_norm": 0.18951189517974854, + "learning_rate": 1.527041877998469e-05, + "loss": 1.7322, + "step": 24504 + }, + { + "epoch": 7.521485573971762, + "grad_norm": 0.16445964574813843, + "learning_rate": 1.526684310971046e-05, + "loss": 1.6668, + "step": 24505 + }, + { + "epoch": 7.521792510742787, + "grad_norm": 0.19513604044914246, + "learning_rate": 1.5263267782689644e-05, + "loss": 1.7464, + "step": 24506 + }, + { + "epoch": 7.5220994475138125, + "grad_norm": 0.20289716124534607, + "learning_rate": 1.525969279895758e-05, + "loss": 1.7472, + "step": 24507 + }, + { + "epoch": 7.522406384284837, + "grad_norm": 0.1716226041316986, + "learning_rate": 1.5256118158549588e-05, + "loss": 1.6872, + "step": 24508 + }, + { + "epoch": 7.522713321055862, + "grad_norm": 0.18939872086048126, + "learning_rate": 1.5252543861501006e-05, + "loss": 1.7365, + "step": 24509 + }, + { + "epoch": 7.523020257826888, + "grad_norm": 
0.21382616460323334, + "learning_rate": 1.524896990784715e-05, + "loss": 1.7129, + "step": 24510 + }, + { + "epoch": 7.523327194597913, + "grad_norm": 0.18226614594459534, + "learning_rate": 1.5245396297623338e-05, + "loss": 1.7426, + "step": 24511 + }, + { + "epoch": 7.523634131368938, + "grad_norm": 0.15880146622657776, + "learning_rate": 1.5241823030864893e-05, + "loss": 1.6848, + "step": 24512 + }, + { + "epoch": 7.523941068139964, + "grad_norm": 0.1782255917787552, + "learning_rate": 1.5238250107607121e-05, + "loss": 1.7263, + "step": 24513 + }, + { + "epoch": 7.524248004910988, + "grad_norm": 0.20365844666957855, + "learning_rate": 1.5234677527885328e-05, + "loss": 1.7035, + "step": 24514 + }, + { + "epoch": 7.524554941682013, + "grad_norm": 0.1776183694601059, + "learning_rate": 1.5231105291734855e-05, + "loss": 1.6837, + "step": 24515 + }, + { + "epoch": 7.524861878453039, + "grad_norm": 0.14594987034797668, + "learning_rate": 1.5227533399190946e-05, + "loss": 1.6428, + "step": 24516 + }, + { + "epoch": 7.525168815224064, + "grad_norm": 0.19371397793293, + "learning_rate": 1.5223961850288947e-05, + "loss": 1.7108, + "step": 24517 + }, + { + "epoch": 7.525475751995089, + "grad_norm": 0.1695355474948883, + "learning_rate": 1.5220390645064148e-05, + "loss": 1.6777, + "step": 24518 + }, + { + "epoch": 7.525782688766114, + "grad_norm": 0.14815635979175568, + "learning_rate": 1.5216819783551828e-05, + "loss": 1.6967, + "step": 24519 + }, + { + "epoch": 7.526089625537139, + "grad_norm": 0.19655495882034302, + "learning_rate": 1.5213249265787283e-05, + "loss": 1.7358, + "step": 24520 + }, + { + "epoch": 7.526396562308165, + "grad_norm": 0.1817864030599594, + "learning_rate": 1.5209679091805795e-05, + "loss": 1.7132, + "step": 24521 + }, + { + "epoch": 7.52670349907919, + "grad_norm": 0.209315687417984, + "learning_rate": 1.5206109261642654e-05, + "loss": 1.7161, + "step": 24522 + }, + { + "epoch": 7.527010435850215, + "grad_norm": 0.18493252992630005, + 
"learning_rate": 1.520253977533313e-05, + "loss": 1.7136, + "step": 24523 + }, + { + "epoch": 7.52731737262124, + "grad_norm": 0.21916678547859192, + "learning_rate": 1.5198970632912508e-05, + "loss": 1.7464, + "step": 24524 + }, + { + "epoch": 7.527624309392265, + "grad_norm": 0.14470849931240082, + "learning_rate": 1.519540183441605e-05, + "loss": 1.6676, + "step": 24525 + }, + { + "epoch": 7.52793124616329, + "grad_norm": 0.20077016949653625, + "learning_rate": 1.5191833379879033e-05, + "loss": 1.7052, + "step": 24526 + }, + { + "epoch": 7.528238182934316, + "grad_norm": 0.17593151330947876, + "learning_rate": 1.5188265269336722e-05, + "loss": 1.7309, + "step": 24527 + }, + { + "epoch": 7.528545119705341, + "grad_norm": 0.20170791447162628, + "learning_rate": 1.518469750282438e-05, + "loss": 1.7335, + "step": 24528 + }, + { + "epoch": 7.5288520564763655, + "grad_norm": 0.1703701615333557, + "learning_rate": 1.518113008037726e-05, + "loss": 1.7141, + "step": 24529 + }, + { + "epoch": 7.529158993247391, + "grad_norm": 0.1897478848695755, + "learning_rate": 1.517756300203062e-05, + "loss": 1.7059, + "step": 24530 + }, + { + "epoch": 7.529465930018416, + "grad_norm": 0.17487141489982605, + "learning_rate": 1.5173996267819695e-05, + "loss": 1.7559, + "step": 24531 + }, + { + "epoch": 7.5297728667894415, + "grad_norm": 0.19167299568653107, + "learning_rate": 1.5170429877779785e-05, + "loss": 1.7287, + "step": 24532 + }, + { + "epoch": 7.530079803560467, + "grad_norm": 0.19433172047138214, + "learning_rate": 1.5166863831946072e-05, + "loss": 1.7182, + "step": 24533 + }, + { + "epoch": 7.530386740331492, + "grad_norm": 0.293734073638916, + "learning_rate": 1.5163298130353853e-05, + "loss": 1.7362, + "step": 24534 + }, + { + "epoch": 7.530693677102517, + "grad_norm": 0.18647685647010803, + "learning_rate": 1.515973277303831e-05, + "loss": 1.7271, + "step": 24535 + }, + { + "epoch": 7.531000613873542, + "grad_norm": 0.20918485522270203, + "learning_rate": 
1.5156167760034729e-05, + "loss": 1.7225, + "step": 24536 + }, + { + "epoch": 7.531307550644567, + "grad_norm": 0.22056303918361664, + "learning_rate": 1.5152603091378315e-05, + "loss": 1.6524, + "step": 24537 + }, + { + "epoch": 7.531614487415593, + "grad_norm": 0.13695760071277618, + "learning_rate": 1.5149038767104307e-05, + "loss": 1.6639, + "step": 24538 + }, + { + "epoch": 7.531921424186618, + "grad_norm": 0.25396111607551575, + "learning_rate": 1.514547478724792e-05, + "loss": 1.7025, + "step": 24539 + }, + { + "epoch": 7.532228360957642, + "grad_norm": 0.18192961812019348, + "learning_rate": 1.5141911151844384e-05, + "loss": 1.7288, + "step": 24540 + }, + { + "epoch": 7.532535297728668, + "grad_norm": 0.24748951196670532, + "learning_rate": 1.5138347860928908e-05, + "loss": 1.7379, + "step": 24541 + }, + { + "epoch": 7.532842234499693, + "grad_norm": 0.1841045767068863, + "learning_rate": 1.5134784914536715e-05, + "loss": 1.7876, + "step": 24542 + }, + { + "epoch": 7.533149171270718, + "grad_norm": 0.21867021918296814, + "learning_rate": 1.5131222312703014e-05, + "loss": 1.7608, + "step": 24543 + }, + { + "epoch": 7.533456108041744, + "grad_norm": 0.1972149908542633, + "learning_rate": 1.512766005546301e-05, + "loss": 1.6927, + "step": 24544 + }, + { + "epoch": 7.533763044812769, + "grad_norm": 0.1728486567735672, + "learning_rate": 1.5124098142851906e-05, + "loss": 1.7656, + "step": 24545 + }, + { + "epoch": 7.5340699815837935, + "grad_norm": 0.2591659724712372, + "learning_rate": 1.512053657490491e-05, + "loss": 1.6844, + "step": 24546 + }, + { + "epoch": 7.534376918354819, + "grad_norm": 0.17187906801700592, + "learning_rate": 1.5116975351657215e-05, + "loss": 1.707, + "step": 24547 + }, + { + "epoch": 7.534683855125844, + "grad_norm": 0.26111504435539246, + "learning_rate": 1.5113414473143993e-05, + "loss": 1.7273, + "step": 24548 + }, + { + "epoch": 7.5349907918968695, + "grad_norm": 0.2153446227312088, + "learning_rate": 1.5109853939400498e-05, + 
"loss": 1.7458, + "step": 24549 + }, + { + "epoch": 7.535297728667894, + "grad_norm": 0.20768530666828156, + "learning_rate": 1.5106293750461835e-05, + "loss": 1.749, + "step": 24550 + }, + { + "epoch": 7.535604665438919, + "grad_norm": 0.2211574763059616, + "learning_rate": 1.5102733906363264e-05, + "loss": 1.7236, + "step": 24551 + }, + { + "epoch": 7.535911602209945, + "grad_norm": 0.15983305871486664, + "learning_rate": 1.5099174407139905e-05, + "loss": 1.6682, + "step": 24552 + }, + { + "epoch": 7.53621853898097, + "grad_norm": 0.23821383714675903, + "learning_rate": 1.5095615252826967e-05, + "loss": 1.7173, + "step": 24553 + }, + { + "epoch": 7.536525475751995, + "grad_norm": 0.1726350039243698, + "learning_rate": 1.5092056443459624e-05, + "loss": 1.7566, + "step": 24554 + }, + { + "epoch": 7.536832412523021, + "grad_norm": 0.19859814643859863, + "learning_rate": 1.5088497979073035e-05, + "loss": 1.7005, + "step": 24555 + }, + { + "epoch": 7.537139349294045, + "grad_norm": 0.14776331186294556, + "learning_rate": 1.508493985970239e-05, + "loss": 1.68, + "step": 24556 + }, + { + "epoch": 7.53744628606507, + "grad_norm": 0.20928993821144104, + "learning_rate": 1.50813820853828e-05, + "loss": 1.7536, + "step": 24557 + }, + { + "epoch": 7.537753222836096, + "grad_norm": 0.18914662301540375, + "learning_rate": 1.5077824656149475e-05, + "loss": 1.7476, + "step": 24558 + }, + { + "epoch": 7.538060159607121, + "grad_norm": 0.24415937066078186, + "learning_rate": 1.5074267572037554e-05, + "loss": 1.7225, + "step": 24559 + }, + { + "epoch": 7.538367096378146, + "grad_norm": 0.18504458665847778, + "learning_rate": 1.5070710833082196e-05, + "loss": 1.7028, + "step": 24560 + }, + { + "epoch": 7.538674033149171, + "grad_norm": 0.1846696138381958, + "learning_rate": 1.5067154439318542e-05, + "loss": 1.7204, + "step": 24561 + }, + { + "epoch": 7.538980969920196, + "grad_norm": 0.20846717059612274, + "learning_rate": 1.5063598390781747e-05, + "loss": 1.73, + "step": 24562 + }, 
+ { + "epoch": 7.5392879066912215, + "grad_norm": 0.1950647234916687, + "learning_rate": 1.5060042687506943e-05, + "loss": 1.7008, + "step": 24563 + }, + { + "epoch": 7.539594843462247, + "grad_norm": 0.1880638748407364, + "learning_rate": 1.5056487329529278e-05, + "loss": 1.6965, + "step": 24564 + }, + { + "epoch": 7.539901780233272, + "grad_norm": 0.24405652284622192, + "learning_rate": 1.5052932316883872e-05, + "loss": 1.7407, + "step": 24565 + }, + { + "epoch": 7.5402087170042975, + "grad_norm": 0.15719062089920044, + "learning_rate": 1.5049377649605906e-05, + "loss": 1.6613, + "step": 24566 + }, + { + "epoch": 7.540515653775322, + "grad_norm": 0.20888090133666992, + "learning_rate": 1.5045823327730441e-05, + "loss": 1.7805, + "step": 24567 + }, + { + "epoch": 7.540822590546347, + "grad_norm": 0.1656443029642105, + "learning_rate": 1.504226935129267e-05, + "loss": 1.7047, + "step": 24568 + }, + { + "epoch": 7.541129527317373, + "grad_norm": 0.28847959637641907, + "learning_rate": 1.503871572032765e-05, + "loss": 1.8711, + "step": 24569 + }, + { + "epoch": 7.541436464088398, + "grad_norm": 0.1724858433008194, + "learning_rate": 1.5035162434870548e-05, + "loss": 1.6734, + "step": 24570 + }, + { + "epoch": 7.541743400859423, + "grad_norm": 0.2064351737499237, + "learning_rate": 1.5031609494956484e-05, + "loss": 1.7032, + "step": 24571 + }, + { + "epoch": 7.542050337630448, + "grad_norm": 0.175388365983963, + "learning_rate": 1.5028056900620513e-05, + "loss": 1.6606, + "step": 24572 + }, + { + "epoch": 7.542357274401473, + "grad_norm": 0.20802471041679382, + "learning_rate": 1.5024504651897814e-05, + "loss": 1.7324, + "step": 24573 + }, + { + "epoch": 7.542664211172498, + "grad_norm": 0.187152698636055, + "learning_rate": 1.502095274882343e-05, + "loss": 1.7222, + "step": 24574 + }, + { + "epoch": 7.542971147943524, + "grad_norm": 0.20112092792987823, + "learning_rate": 1.5017401191432511e-05, + "loss": 1.6959, + "step": 24575 + }, + { + "epoch": 7.543278084714549, 
+ "grad_norm": 0.17968857288360596, + "learning_rate": 1.5013849979760136e-05, + "loss": 1.6957, + "step": 24576 + }, + { + "epoch": 7.543585021485574, + "grad_norm": 0.20532584190368652, + "learning_rate": 1.5010299113841397e-05, + "loss": 1.7471, + "step": 24577 + }, + { + "epoch": 7.543891958256599, + "grad_norm": 0.16475969552993774, + "learning_rate": 1.5006748593711394e-05, + "loss": 1.7665, + "step": 24578 + }, + { + "epoch": 7.544198895027624, + "grad_norm": 0.17632076144218445, + "learning_rate": 1.5003198419405213e-05, + "loss": 1.7317, + "step": 24579 + }, + { + "epoch": 7.5445058317986495, + "grad_norm": 0.18197286128997803, + "learning_rate": 1.4999648590957937e-05, + "loss": 1.7278, + "step": 24580 + }, + { + "epoch": 7.544812768569675, + "grad_norm": 0.18043744564056396, + "learning_rate": 1.4996099108404648e-05, + "loss": 1.7335, + "step": 24581 + }, + { + "epoch": 7.5451197053407, + "grad_norm": 0.17072297632694244, + "learning_rate": 1.4992549971780407e-05, + "loss": 1.7236, + "step": 24582 + }, + { + "epoch": 7.545426642111725, + "grad_norm": 0.17413046956062317, + "learning_rate": 1.4989001181120338e-05, + "loss": 1.6794, + "step": 24583 + }, + { + "epoch": 7.54573357888275, + "grad_norm": 0.1684887856245041, + "learning_rate": 1.4985452736459443e-05, + "loss": 1.718, + "step": 24584 + }, + { + "epoch": 7.546040515653775, + "grad_norm": 0.19497069716453552, + "learning_rate": 1.4981904637832866e-05, + "loss": 1.7323, + "step": 24585 + }, + { + "epoch": 7.546347452424801, + "grad_norm": 0.24838820099830627, + "learning_rate": 1.4978356885275596e-05, + "loss": 1.7584, + "step": 24586 + }, + { + "epoch": 7.546654389195826, + "grad_norm": 0.20870071649551392, + "learning_rate": 1.4974809478822749e-05, + "loss": 1.738, + "step": 24587 + }, + { + "epoch": 7.546961325966851, + "grad_norm": 0.21980242431163788, + "learning_rate": 1.497126241850938e-05, + "loss": 1.763, + "step": 24588 + }, + { + "epoch": 7.547268262737876, + "grad_norm": 
0.2156188189983368, + "learning_rate": 1.4967715704370488e-05, + "loss": 1.7357, + "step": 24589 + }, + { + "epoch": 7.547575199508901, + "grad_norm": 0.1864207684993744, + "learning_rate": 1.4964169336441202e-05, + "loss": 1.676, + "step": 24590 + }, + { + "epoch": 7.547882136279926, + "grad_norm": 0.18940003216266632, + "learning_rate": 1.4960623314756494e-05, + "loss": 1.7614, + "step": 24591 + }, + { + "epoch": 7.548189073050952, + "grad_norm": 0.19220350682735443, + "learning_rate": 1.4957077639351463e-05, + "loss": 1.7266, + "step": 24592 + }, + { + "epoch": 7.548496009821976, + "grad_norm": 0.15492811799049377, + "learning_rate": 1.4953532310261126e-05, + "loss": 1.7359, + "step": 24593 + }, + { + "epoch": 7.5488029465930016, + "grad_norm": 0.25591567158699036, + "learning_rate": 1.4949987327520526e-05, + "loss": 1.7, + "step": 24594 + }, + { + "epoch": 7.549109883364027, + "grad_norm": 0.18157868087291718, + "learning_rate": 1.4946442691164697e-05, + "loss": 1.7204, + "step": 24595 + }, + { + "epoch": 7.549416820135052, + "grad_norm": 0.17679910361766815, + "learning_rate": 1.4942898401228662e-05, + "loss": 1.6871, + "step": 24596 + }, + { + "epoch": 7.5497237569060776, + "grad_norm": 0.2000853717327118, + "learning_rate": 1.4939354457747456e-05, + "loss": 1.7186, + "step": 24597 + }, + { + "epoch": 7.550030693677103, + "grad_norm": 0.19947710633277893, + "learning_rate": 1.49358108607561e-05, + "loss": 1.6853, + "step": 24598 + }, + { + "epoch": 7.550337630448127, + "grad_norm": 0.16325148940086365, + "learning_rate": 1.4932267610289596e-05, + "loss": 1.7027, + "step": 24599 + }, + { + "epoch": 7.550644567219153, + "grad_norm": 0.22839638590812683, + "learning_rate": 1.4928724706383007e-05, + "loss": 1.7887, + "step": 24600 + }, + { + "epoch": 7.550951503990178, + "grad_norm": 0.16242358088493347, + "learning_rate": 1.4925182149071286e-05, + "loss": 1.6617, + "step": 24601 + }, + { + "epoch": 7.551258440761203, + "grad_norm": 0.1674090027809143, + 
"learning_rate": 1.4921639938389504e-05, + "loss": 1.656, + "step": 24602 + }, + { + "epoch": 7.551565377532229, + "grad_norm": 0.1628156453371048, + "learning_rate": 1.4918098074372605e-05, + "loss": 1.683, + "step": 24603 + }, + { + "epoch": 7.551872314303253, + "grad_norm": 0.19156567752361298, + "learning_rate": 1.4914556557055637e-05, + "loss": 1.7174, + "step": 24604 + }, + { + "epoch": 7.5521792510742785, + "grad_norm": 0.19634003937244415, + "learning_rate": 1.4911015386473603e-05, + "loss": 1.6605, + "step": 24605 + }, + { + "epoch": 7.552486187845304, + "grad_norm": 0.19273599982261658, + "learning_rate": 1.490747456266145e-05, + "loss": 1.7092, + "step": 24606 + }, + { + "epoch": 7.552793124616329, + "grad_norm": 0.23641756176948547, + "learning_rate": 1.4903934085654231e-05, + "loss": 1.7524, + "step": 24607 + }, + { + "epoch": 7.5531000613873545, + "grad_norm": 0.19623206555843353, + "learning_rate": 1.490039395548688e-05, + "loss": 1.7281, + "step": 24608 + }, + { + "epoch": 7.55340699815838, + "grad_norm": 0.1978278011083603, + "learning_rate": 1.489685417219442e-05, + "loss": 1.7099, + "step": 24609 + }, + { + "epoch": 7.553713934929404, + "grad_norm": 0.19635866582393646, + "learning_rate": 1.489331473581182e-05, + "loss": 1.7146, + "step": 24610 + }, + { + "epoch": 7.55402087170043, + "grad_norm": 0.2121066302061081, + "learning_rate": 1.4889775646374065e-05, + "loss": 1.7598, + "step": 24611 + }, + { + "epoch": 7.554327808471455, + "grad_norm": 0.17944596707820892, + "learning_rate": 1.4886236903916122e-05, + "loss": 1.6778, + "step": 24612 + }, + { + "epoch": 7.55463474524248, + "grad_norm": 0.15834666788578033, + "learning_rate": 1.488269850847297e-05, + "loss": 1.6498, + "step": 24613 + }, + { + "epoch": 7.554941682013506, + "grad_norm": 0.18597754836082458, + "learning_rate": 1.4879160460079573e-05, + "loss": 1.7145, + "step": 24614 + }, + { + "epoch": 7.55524861878453, + "grad_norm": 0.18300876021385193, + "learning_rate": 
1.4875622758770897e-05, + "loss": 1.7253, + "step": 24615 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 0.17805244028568268, + "learning_rate": 1.4872085404581887e-05, + "loss": 1.7152, + "step": 24616 + }, + { + "epoch": 7.555862492326581, + "grad_norm": 0.1987949162721634, + "learning_rate": 1.486854839754755e-05, + "loss": 1.7501, + "step": 24617 + }, + { + "epoch": 7.556169429097606, + "grad_norm": 0.17301858961582184, + "learning_rate": 1.4865011737702777e-05, + "loss": 1.7122, + "step": 24618 + }, + { + "epoch": 7.556476365868631, + "grad_norm": 0.180507093667984, + "learning_rate": 1.4861475425082583e-05, + "loss": 1.7192, + "step": 24619 + }, + { + "epoch": 7.556783302639657, + "grad_norm": 0.16658489406108856, + "learning_rate": 1.4857939459721854e-05, + "loss": 1.6879, + "step": 24620 + }, + { + "epoch": 7.557090239410681, + "grad_norm": 0.19498902559280396, + "learning_rate": 1.4854403841655578e-05, + "loss": 1.7395, + "step": 24621 + }, + { + "epoch": 7.5573971761817065, + "grad_norm": 0.1737620085477829, + "learning_rate": 1.4850868570918702e-05, + "loss": 1.7029, + "step": 24622 + }, + { + "epoch": 7.557704112952732, + "grad_norm": 0.1600165218114853, + "learning_rate": 1.4847333647546113e-05, + "loss": 1.7194, + "step": 24623 + }, + { + "epoch": 7.558011049723757, + "grad_norm": 0.18392407894134521, + "learning_rate": 1.4843799071572806e-05, + "loss": 1.6838, + "step": 24624 + }, + { + "epoch": 7.558317986494782, + "grad_norm": 0.19074605405330658, + "learning_rate": 1.4840264843033651e-05, + "loss": 1.7069, + "step": 24625 + }, + { + "epoch": 7.558624923265807, + "grad_norm": 0.18156903982162476, + "learning_rate": 1.4836730961963619e-05, + "loss": 1.6494, + "step": 24626 + }, + { + "epoch": 7.558931860036832, + "grad_norm": 0.16716471314430237, + "learning_rate": 1.4833197428397627e-05, + "loss": 1.7516, + "step": 24627 + }, + { + "epoch": 7.559238796807858, + "grad_norm": 0.18882833421230316, + "learning_rate": 1.4829664242370588e-05, + 
"loss": 1.7117, + "step": 24628 + }, + { + "epoch": 7.559545733578883, + "grad_norm": 0.19933676719665527, + "learning_rate": 1.482613140391742e-05, + "loss": 1.6928, + "step": 24629 + }, + { + "epoch": 7.559852670349908, + "grad_norm": 0.15574946999549866, + "learning_rate": 1.4822598913073039e-05, + "loss": 1.702, + "step": 24630 + }, + { + "epoch": 7.560159607120933, + "grad_norm": 0.1953001618385315, + "learning_rate": 1.4819066769872353e-05, + "loss": 1.75, + "step": 24631 + }, + { + "epoch": 7.560466543891958, + "grad_norm": 0.18364208936691284, + "learning_rate": 1.481553497435027e-05, + "loss": 1.6697, + "step": 24632 + }, + { + "epoch": 7.560773480662983, + "grad_norm": 0.16670002043247223, + "learning_rate": 1.4812003526541673e-05, + "loss": 1.6919, + "step": 24633 + }, + { + "epoch": 7.561080417434009, + "grad_norm": 0.19388388097286224, + "learning_rate": 1.4808472426481518e-05, + "loss": 1.7412, + "step": 24634 + }, + { + "epoch": 7.561387354205034, + "grad_norm": 0.19203592836856842, + "learning_rate": 1.4804941674204631e-05, + "loss": 1.7128, + "step": 24635 + }, + { + "epoch": 7.5616942909760585, + "grad_norm": 0.18893340229988098, + "learning_rate": 1.4801411269745974e-05, + "loss": 1.7018, + "step": 24636 + }, + { + "epoch": 7.562001227747084, + "grad_norm": 0.1825447529554367, + "learning_rate": 1.4797881213140363e-05, + "loss": 1.7216, + "step": 24637 + }, + { + "epoch": 7.562308164518109, + "grad_norm": 0.19031697511672974, + "learning_rate": 1.4794351504422743e-05, + "loss": 1.7479, + "step": 24638 + }, + { + "epoch": 7.5626151012891345, + "grad_norm": 0.18328487873077393, + "learning_rate": 1.4790822143627991e-05, + "loss": 1.7222, + "step": 24639 + }, + { + "epoch": 7.56292203806016, + "grad_norm": 0.17531271278858185, + "learning_rate": 1.4787293130790941e-05, + "loss": 1.7197, + "step": 24640 + }, + { + "epoch": 7.563228974831185, + "grad_norm": 0.17078469693660736, + "learning_rate": 1.4783764465946526e-05, + "loss": 1.7715, + "step": 
24641 + }, + { + "epoch": 7.56353591160221, + "grad_norm": 0.1859765648841858, + "learning_rate": 1.4780236149129567e-05, + "loss": 1.698, + "step": 24642 + }, + { + "epoch": 7.563842848373235, + "grad_norm": 0.18488194048404694, + "learning_rate": 1.4776708180374965e-05, + "loss": 1.6943, + "step": 24643 + }, + { + "epoch": 7.56414978514426, + "grad_norm": 0.1741705685853958, + "learning_rate": 1.4773180559717586e-05, + "loss": 1.6966, + "step": 24644 + }, + { + "epoch": 7.564456721915286, + "grad_norm": 0.20310313999652863, + "learning_rate": 1.476965328719228e-05, + "loss": 1.7572, + "step": 24645 + }, + { + "epoch": 7.564763658686311, + "grad_norm": 0.20557743310928345, + "learning_rate": 1.476612636283391e-05, + "loss": 1.7419, + "step": 24646 + }, + { + "epoch": 7.565070595457335, + "grad_norm": 0.20597940683364868, + "learning_rate": 1.4762599786677329e-05, + "loss": 1.7147, + "step": 24647 + }, + { + "epoch": 7.565377532228361, + "grad_norm": 0.21609526872634888, + "learning_rate": 1.4759073558757391e-05, + "loss": 1.7678, + "step": 24648 + }, + { + "epoch": 7.565684468999386, + "grad_norm": 0.2233472615480423, + "learning_rate": 1.4755547679108945e-05, + "loss": 1.7381, + "step": 24649 + }, + { + "epoch": 7.565991405770411, + "grad_norm": 0.19561493396759033, + "learning_rate": 1.4752022147766814e-05, + "loss": 1.7254, + "step": 24650 + }, + { + "epoch": 7.566298342541437, + "grad_norm": 0.16491469740867615, + "learning_rate": 1.4748496964765896e-05, + "loss": 1.6834, + "step": 24651 + }, + { + "epoch": 7.566605279312462, + "grad_norm": 0.16946618258953094, + "learning_rate": 1.4744972130140955e-05, + "loss": 1.7154, + "step": 24652 + }, + { + "epoch": 7.5669122160834865, + "grad_norm": 0.1625654697418213, + "learning_rate": 1.4741447643926904e-05, + "loss": 1.6941, + "step": 24653 + }, + { + "epoch": 7.567219152854512, + "grad_norm": 0.16875535249710083, + "learning_rate": 1.4737923506158491e-05, + "loss": 1.6875, + "step": 24654 + }, + { + "epoch": 
7.567526089625537, + "grad_norm": 0.1625872105360031, + "learning_rate": 1.4734399716870607e-05, + "loss": 1.6558, + "step": 24655 + }, + { + "epoch": 7.5678330263965625, + "grad_norm": 0.17323140799999237, + "learning_rate": 1.4730876276098071e-05, + "loss": 1.7468, + "step": 24656 + }, + { + "epoch": 7.568139963167588, + "grad_norm": 0.18788693845272064, + "learning_rate": 1.472735318387566e-05, + "loss": 1.7345, + "step": 24657 + }, + { + "epoch": 7.568446899938612, + "grad_norm": 0.18096889555454254, + "learning_rate": 1.472383044023824e-05, + "loss": 1.725, + "step": 24658 + }, + { + "epoch": 7.568753836709638, + "grad_norm": 0.2327791154384613, + "learning_rate": 1.4720308045220577e-05, + "loss": 1.7367, + "step": 24659 + }, + { + "epoch": 7.569060773480663, + "grad_norm": 0.187728151679039, + "learning_rate": 1.4716785998857525e-05, + "loss": 1.6967, + "step": 24660 + }, + { + "epoch": 7.569367710251688, + "grad_norm": 0.18520617485046387, + "learning_rate": 1.4713264301183876e-05, + "loss": 1.6576, + "step": 24661 + }, + { + "epoch": 7.569674647022714, + "grad_norm": 0.20537808537483215, + "learning_rate": 1.4709742952234428e-05, + "loss": 1.6911, + "step": 24662 + }, + { + "epoch": 7.569981583793739, + "grad_norm": 0.18872039020061493, + "learning_rate": 1.4706221952043986e-05, + "loss": 1.745, + "step": 24663 + }, + { + "epoch": 7.570288520564763, + "grad_norm": 0.16083933413028717, + "learning_rate": 1.4702701300647343e-05, + "loss": 1.6875, + "step": 24664 + }, + { + "epoch": 7.570595457335789, + "grad_norm": 0.19390366971492767, + "learning_rate": 1.4699180998079293e-05, + "loss": 1.6996, + "step": 24665 + }, + { + "epoch": 7.570902394106814, + "grad_norm": 0.20478816330432892, + "learning_rate": 1.4695661044374632e-05, + "loss": 1.7359, + "step": 24666 + }, + { + "epoch": 7.571209330877839, + "grad_norm": 0.17485570907592773, + "learning_rate": 1.4692141439568136e-05, + "loss": 1.696, + "step": 24667 + }, + { + "epoch": 7.571516267648864, + 
"grad_norm": 0.18266968429088593, + "learning_rate": 1.4688622183694594e-05, + "loss": 1.713, + "step": 24668 + }, + { + "epoch": 7.571823204419889, + "grad_norm": 0.14412200450897217, + "learning_rate": 1.468510327678877e-05, + "loss": 1.6938, + "step": 24669 + }, + { + "epoch": 7.5721301411909145, + "grad_norm": 0.18144819140434265, + "learning_rate": 1.4681584718885488e-05, + "loss": 1.7523, + "step": 24670 + }, + { + "epoch": 7.57243707796194, + "grad_norm": 0.32198768854141235, + "learning_rate": 1.467806651001945e-05, + "loss": 1.71, + "step": 24671 + }, + { + "epoch": 7.572744014732965, + "grad_norm": 0.1535005122423172, + "learning_rate": 1.4674548650225483e-05, + "loss": 1.6912, + "step": 24672 + }, + { + "epoch": 7.5730509515039905, + "grad_norm": 0.17982423305511475, + "learning_rate": 1.4671031139538343e-05, + "loss": 1.6928, + "step": 24673 + }, + { + "epoch": 7.573357888275015, + "grad_norm": 0.16811783611774445, + "learning_rate": 1.4667513977992747e-05, + "loss": 1.6954, + "step": 24674 + }, + { + "epoch": 7.57366482504604, + "grad_norm": 0.18918997049331665, + "learning_rate": 1.4663997165623522e-05, + "loss": 1.6967, + "step": 24675 + }, + { + "epoch": 7.573971761817066, + "grad_norm": 0.16559816896915436, + "learning_rate": 1.4660480702465357e-05, + "loss": 1.7097, + "step": 24676 + }, + { + "epoch": 7.574278698588091, + "grad_norm": 0.20471042394638062, + "learning_rate": 1.4656964588553046e-05, + "loss": 1.7032, + "step": 24677 + }, + { + "epoch": 7.574585635359116, + "grad_norm": 0.16387851536273956, + "learning_rate": 1.4653448823921329e-05, + "loss": 1.7066, + "step": 24678 + }, + { + "epoch": 7.574892572130141, + "grad_norm": 0.19144418835639954, + "learning_rate": 1.4649933408604949e-05, + "loss": 1.7272, + "step": 24679 + }, + { + "epoch": 7.575199508901166, + "grad_norm": 0.17270216345787048, + "learning_rate": 1.4646418342638646e-05, + "loss": 1.7456, + "step": 24680 + }, + { + "epoch": 7.5755064456721914, + "grad_norm": 
0.1937440037727356, + "learning_rate": 1.4642903626057159e-05, + "loss": 1.6973, + "step": 24681 + }, + { + "epoch": 7.575813382443217, + "grad_norm": 0.18958482146263123, + "learning_rate": 1.463938925889522e-05, + "loss": 1.7549, + "step": 24682 + }, + { + "epoch": 7.576120319214242, + "grad_norm": 0.20584101974964142, + "learning_rate": 1.4635875241187558e-05, + "loss": 1.7013, + "step": 24683 + }, + { + "epoch": 7.5764272559852675, + "grad_norm": 0.22839057445526123, + "learning_rate": 1.463236157296891e-05, + "loss": 1.7282, + "step": 24684 + }, + { + "epoch": 7.576734192756292, + "grad_norm": 0.19894570112228394, + "learning_rate": 1.4628848254273996e-05, + "loss": 1.7115, + "step": 24685 + }, + { + "epoch": 7.577041129527317, + "grad_norm": 0.1880837082862854, + "learning_rate": 1.4625335285137515e-05, + "loss": 1.6526, + "step": 24686 + }, + { + "epoch": 7.577348066298343, + "grad_norm": 0.21545001864433289, + "learning_rate": 1.4621822665594238e-05, + "loss": 1.6709, + "step": 24687 + }, + { + "epoch": 7.577655003069368, + "grad_norm": 0.2091502994298935, + "learning_rate": 1.4618310395678813e-05, + "loss": 1.6792, + "step": 24688 + }, + { + "epoch": 7.577961939840393, + "grad_norm": 0.2100556343793869, + "learning_rate": 1.4614798475426018e-05, + "loss": 1.7112, + "step": 24689 + }, + { + "epoch": 7.578268876611418, + "grad_norm": 0.17702727019786835, + "learning_rate": 1.4611286904870502e-05, + "loss": 1.6353, + "step": 24690 + }, + { + "epoch": 7.578575813382443, + "grad_norm": 0.1935967355966568, + "learning_rate": 1.4607775684046975e-05, + "loss": 1.6638, + "step": 24691 + }, + { + "epoch": 7.578882750153468, + "grad_norm": 0.13495506346225739, + "learning_rate": 1.4604264812990193e-05, + "loss": 1.6526, + "step": 24692 + }, + { + "epoch": 7.579189686924494, + "grad_norm": 0.20418134331703186, + "learning_rate": 1.4600754291734774e-05, + "loss": 1.731, + "step": 24693 + }, + { + "epoch": 7.579496623695519, + "grad_norm": 0.1541702151298523, + 
"learning_rate": 1.4597244120315467e-05, + "loss": 1.7047, + "step": 24694 + }, + { + "epoch": 7.579803560466544, + "grad_norm": 0.2106262892484665, + "learning_rate": 1.4593734298766942e-05, + "loss": 1.696, + "step": 24695 + }, + { + "epoch": 7.580110497237569, + "grad_norm": 0.15727077424526215, + "learning_rate": 1.4590224827123889e-05, + "loss": 1.6782, + "step": 24696 + }, + { + "epoch": 7.580417434008594, + "grad_norm": 0.19231721758842468, + "learning_rate": 1.4586715705420983e-05, + "loss": 1.7832, + "step": 24697 + }, + { + "epoch": 7.5807243707796195, + "grad_norm": 0.18290117383003235, + "learning_rate": 1.4583206933692916e-05, + "loss": 1.6715, + "step": 24698 + }, + { + "epoch": 7.581031307550645, + "grad_norm": 0.21551427245140076, + "learning_rate": 1.4579698511974355e-05, + "loss": 1.7326, + "step": 24699 + }, + { + "epoch": 7.581338244321669, + "grad_norm": 0.21561767160892487, + "learning_rate": 1.457619044029997e-05, + "loss": 1.6682, + "step": 24700 + }, + { + "epoch": 7.581645181092695, + "grad_norm": 0.15537963807582855, + "learning_rate": 1.457268271870444e-05, + "loss": 1.719, + "step": 24701 + }, + { + "epoch": 7.58195211786372, + "grad_norm": 0.18738612532615662, + "learning_rate": 1.456917534722242e-05, + "loss": 1.7415, + "step": 24702 + }, + { + "epoch": 7.582259054634745, + "grad_norm": 0.15522584319114685, + "learning_rate": 1.456566832588856e-05, + "loss": 1.6931, + "step": 24703 + }, + { + "epoch": 7.582565991405771, + "grad_norm": 0.192890003323555, + "learning_rate": 1.4562161654737567e-05, + "loss": 1.7726, + "step": 24704 + }, + { + "epoch": 7.582872928176796, + "grad_norm": 0.2163987159729004, + "learning_rate": 1.4558655333804028e-05, + "loss": 1.7459, + "step": 24705 + }, + { + "epoch": 7.58317986494782, + "grad_norm": 0.1635672152042389, + "learning_rate": 1.4555149363122667e-05, + "loss": 1.7407, + "step": 24706 + }, + { + "epoch": 7.583486801718846, + "grad_norm": 0.1858159899711609, + "learning_rate": 
1.4551643742728072e-05, + "loss": 1.7175, + "step": 24707 + }, + { + "epoch": 7.583793738489871, + "grad_norm": 0.23077011108398438, + "learning_rate": 1.4548138472654904e-05, + "loss": 1.7739, + "step": 24708 + }, + { + "epoch": 7.584100675260896, + "grad_norm": 0.22413180768489838, + "learning_rate": 1.4544633552937836e-05, + "loss": 1.7208, + "step": 24709 + }, + { + "epoch": 7.584407612031922, + "grad_norm": 0.16147246956825256, + "learning_rate": 1.4541128983611445e-05, + "loss": 1.7021, + "step": 24710 + }, + { + "epoch": 7.584714548802946, + "grad_norm": 0.17363815009593964, + "learning_rate": 1.4537624764710439e-05, + "loss": 1.6863, + "step": 24711 + }, + { + "epoch": 7.5850214855739715, + "grad_norm": 0.14971798658370972, + "learning_rate": 1.4534120896269377e-05, + "loss": 1.655, + "step": 24712 + }, + { + "epoch": 7.585328422344997, + "grad_norm": 0.15934213995933533, + "learning_rate": 1.4530617378322937e-05, + "loss": 1.6771, + "step": 24713 + }, + { + "epoch": 7.585635359116022, + "grad_norm": 0.17807291448116302, + "learning_rate": 1.4527114210905724e-05, + "loss": 1.7419, + "step": 24714 + }, + { + "epoch": 7.5859422958870475, + "grad_norm": 0.1727002114057541, + "learning_rate": 1.4523611394052356e-05, + "loss": 1.7232, + "step": 24715 + }, + { + "epoch": 7.586249232658073, + "grad_norm": 0.1625738888978958, + "learning_rate": 1.452010892779746e-05, + "loss": 1.6967, + "step": 24716 + }, + { + "epoch": 7.586556169429097, + "grad_norm": 0.2153816670179367, + "learning_rate": 1.4516606812175636e-05, + "loss": 1.7339, + "step": 24717 + }, + { + "epoch": 7.586863106200123, + "grad_norm": 0.19343912601470947, + "learning_rate": 1.451310504722151e-05, + "loss": 1.7059, + "step": 24718 + }, + { + "epoch": 7.587170042971148, + "grad_norm": 0.16220279037952423, + "learning_rate": 1.450960363296967e-05, + "loss": 1.6825, + "step": 24719 + }, + { + "epoch": 7.587476979742173, + "grad_norm": 0.1678459346294403, + "learning_rate": 1.4506102569454716e-05, + 
"loss": 1.728, + "step": 24720 + }, + { + "epoch": 7.587783916513199, + "grad_norm": 0.19833502173423767, + "learning_rate": 1.4502601856711295e-05, + "loss": 1.7733, + "step": 24721 + }, + { + "epoch": 7.588090853284223, + "grad_norm": 0.1593111902475357, + "learning_rate": 1.4499101494773931e-05, + "loss": 1.7017, + "step": 24722 + }, + { + "epoch": 7.588397790055248, + "grad_norm": 0.2083328664302826, + "learning_rate": 1.449560148367729e-05, + "loss": 1.7661, + "step": 24723 + }, + { + "epoch": 7.588704726826274, + "grad_norm": 0.19797182083129883, + "learning_rate": 1.4492101823455906e-05, + "loss": 1.788, + "step": 24724 + }, + { + "epoch": 7.589011663597299, + "grad_norm": 0.15613096952438354, + "learning_rate": 1.4488602514144373e-05, + "loss": 1.7295, + "step": 24725 + }, + { + "epoch": 7.589318600368324, + "grad_norm": 0.18078529834747314, + "learning_rate": 1.4485103555777307e-05, + "loss": 1.7165, + "step": 24726 + }, + { + "epoch": 7.58962553713935, + "grad_norm": 0.14951148629188538, + "learning_rate": 1.4481604948389238e-05, + "loss": 1.6431, + "step": 24727 + }, + { + "epoch": 7.589932473910374, + "grad_norm": 0.19518490135669708, + "learning_rate": 1.4478106692014797e-05, + "loss": 1.7332, + "step": 24728 + }, + { + "epoch": 7.5902394106813995, + "grad_norm": 0.17438004910945892, + "learning_rate": 1.4474608786688493e-05, + "loss": 1.6677, + "step": 24729 + }, + { + "epoch": 7.590546347452425, + "grad_norm": 0.2767544090747833, + "learning_rate": 1.4471111232444944e-05, + "loss": 1.7649, + "step": 24730 + }, + { + "epoch": 7.59085328422345, + "grad_norm": 0.21649987995624542, + "learning_rate": 1.4467614029318699e-05, + "loss": 1.7349, + "step": 24731 + }, + { + "epoch": 7.5911602209944755, + "grad_norm": 0.26566463708877563, + "learning_rate": 1.4464117177344316e-05, + "loss": 1.7474, + "step": 24732 + }, + { + "epoch": 7.5914671577655, + "grad_norm": 0.19050925970077515, + "learning_rate": 1.4460620676556358e-05, + "loss": 1.7066, + "step": 24733 
+ }, + { + "epoch": 7.591774094536525, + "grad_norm": 0.20030665397644043, + "learning_rate": 1.4457124526989375e-05, + "loss": 1.6589, + "step": 24734 + }, + { + "epoch": 7.592081031307551, + "grad_norm": 0.18715742230415344, + "learning_rate": 1.4453628728677921e-05, + "loss": 1.7186, + "step": 24735 + }, + { + "epoch": 7.592387968078576, + "grad_norm": 0.241498664021492, + "learning_rate": 1.4450133281656542e-05, + "loss": 1.6686, + "step": 24736 + }, + { + "epoch": 7.592694904849601, + "grad_norm": 0.20305299758911133, + "learning_rate": 1.4446638185959765e-05, + "loss": 1.7351, + "step": 24737 + }, + { + "epoch": 7.593001841620627, + "grad_norm": 0.177521750330925, + "learning_rate": 1.444314344162218e-05, + "loss": 1.6383, + "step": 24738 + }, + { + "epoch": 7.593308778391651, + "grad_norm": 0.19877439737319946, + "learning_rate": 1.443964904867826e-05, + "loss": 1.7335, + "step": 24739 + }, + { + "epoch": 7.593615715162676, + "grad_norm": 0.16544201970100403, + "learning_rate": 1.4436155007162605e-05, + "loss": 1.6952, + "step": 24740 + }, + { + "epoch": 7.593922651933702, + "grad_norm": 0.20925499498844147, + "learning_rate": 1.443266131710969e-05, + "loss": 1.7042, + "step": 24741 + }, + { + "epoch": 7.594229588704727, + "grad_norm": 0.16688574850559235, + "learning_rate": 1.4429167978554054e-05, + "loss": 1.6797, + "step": 24742 + }, + { + "epoch": 7.5945365254757515, + "grad_norm": 0.2231293022632599, + "learning_rate": 1.4425674991530258e-05, + "loss": 1.8697, + "step": 24743 + }, + { + "epoch": 7.594843462246777, + "grad_norm": 0.2114260196685791, + "learning_rate": 1.442218235607276e-05, + "loss": 1.7404, + "step": 24744 + }, + { + "epoch": 7.595150399017802, + "grad_norm": 0.1842830628156662, + "learning_rate": 1.441869007221614e-05, + "loss": 1.7687, + "step": 24745 + }, + { + "epoch": 7.5954573357888275, + "grad_norm": 0.17780441045761108, + "learning_rate": 1.4415198139994846e-05, + "loss": 1.7492, + "step": 24746 + }, + { + "epoch": 
7.595764272559853, + "grad_norm": 0.18805068731307983, + "learning_rate": 1.4411706559443438e-05, + "loss": 1.757, + "step": 24747 + }, + { + "epoch": 7.596071209330878, + "grad_norm": 0.18918974697589874, + "learning_rate": 1.4408215330596403e-05, + "loss": 1.7006, + "step": 24748 + }, + { + "epoch": 7.596378146101903, + "grad_norm": 0.17850689589977264, + "learning_rate": 1.440472445348825e-05, + "loss": 1.6565, + "step": 24749 + }, + { + "epoch": 7.596685082872928, + "grad_norm": 0.20043544471263885, + "learning_rate": 1.4401233928153468e-05, + "loss": 1.7314, + "step": 24750 + }, + { + "epoch": 7.596992019643953, + "grad_norm": 0.1963229477405548, + "learning_rate": 1.4397743754626564e-05, + "loss": 1.6946, + "step": 24751 + }, + { + "epoch": 7.597298956414979, + "grad_norm": 0.2203695923089981, + "learning_rate": 1.4394253932942014e-05, + "loss": 1.7128, + "step": 24752 + }, + { + "epoch": 7.597605893186004, + "grad_norm": 0.19254128634929657, + "learning_rate": 1.4390764463134322e-05, + "loss": 1.6748, + "step": 24753 + }, + { + "epoch": 7.597912829957028, + "grad_norm": 0.19880495965480804, + "learning_rate": 1.438727534523795e-05, + "loss": 1.7155, + "step": 24754 + }, + { + "epoch": 7.598219766728054, + "grad_norm": 0.17486177384853363, + "learning_rate": 1.4383786579287428e-05, + "loss": 1.7484, + "step": 24755 + }, + { + "epoch": 7.598526703499079, + "grad_norm": 0.17247791588306427, + "learning_rate": 1.4380298165317168e-05, + "loss": 1.7225, + "step": 24756 + }, + { + "epoch": 7.598833640270104, + "grad_norm": 0.1802847534418106, + "learning_rate": 1.4376810103361714e-05, + "loss": 1.7009, + "step": 24757 + }, + { + "epoch": 7.59914057704113, + "grad_norm": 0.1934153437614441, + "learning_rate": 1.4373322393455485e-05, + "loss": 1.6957, + "step": 24758 + }, + { + "epoch": 7.599447513812155, + "grad_norm": 0.1508229374885559, + "learning_rate": 1.436983503563295e-05, + "loss": 1.6677, + "step": 24759 + }, + { + "epoch": 7.5997544505831796, + 
"grad_norm": 0.16684283316135406, + "learning_rate": 1.4366348029928623e-05, + "loss": 1.7394, + "step": 24760 + }, + { + "epoch": 7.600061387354205, + "grad_norm": 0.22492031753063202, + "learning_rate": 1.4362861376376896e-05, + "loss": 1.7302, + "step": 24761 + }, + { + "epoch": 7.60036832412523, + "grad_norm": 0.1654716283082962, + "learning_rate": 1.4359375075012294e-05, + "loss": 1.6487, + "step": 24762 + }, + { + "epoch": 7.600675260896256, + "grad_norm": 0.17514392733573914, + "learning_rate": 1.4355889125869198e-05, + "loss": 1.6952, + "step": 24763 + }, + { + "epoch": 7.600982197667281, + "grad_norm": 0.21000738441944122, + "learning_rate": 1.4352403528982123e-05, + "loss": 1.714, + "step": 24764 + }, + { + "epoch": 7.601289134438305, + "grad_norm": 0.18791960179805756, + "learning_rate": 1.4348918284385481e-05, + "loss": 1.7334, + "step": 24765 + }, + { + "epoch": 7.601596071209331, + "grad_norm": 0.267089307308197, + "learning_rate": 1.4345433392113734e-05, + "loss": 1.7567, + "step": 24766 + }, + { + "epoch": 7.601903007980356, + "grad_norm": 0.1814621239900589, + "learning_rate": 1.4341948852201304e-05, + "loss": 1.7031, + "step": 24767 + }, + { + "epoch": 7.602209944751381, + "grad_norm": 0.16144737601280212, + "learning_rate": 1.4338464664682639e-05, + "loss": 1.6844, + "step": 24768 + }, + { + "epoch": 7.602516881522407, + "grad_norm": 0.14824162423610687, + "learning_rate": 1.433498082959217e-05, + "loss": 1.6854, + "step": 24769 + }, + { + "epoch": 7.602823818293432, + "grad_norm": 0.1837405115365982, + "learning_rate": 1.4331497346964318e-05, + "loss": 1.7087, + "step": 24770 + }, + { + "epoch": 7.6031307550644565, + "grad_norm": 0.20706148445606232, + "learning_rate": 1.4328014216833508e-05, + "loss": 1.7816, + "step": 24771 + }, + { + "epoch": 7.603437691835482, + "grad_norm": 0.16134382784366608, + "learning_rate": 1.4324531439234196e-05, + "loss": 1.7095, + "step": 24772 + }, + { + "epoch": 7.603744628606507, + "grad_norm": 
0.15924426913261414, + "learning_rate": 1.4321049014200737e-05, + "loss": 1.7115, + "step": 24773 + }, + { + "epoch": 7.6040515653775325, + "grad_norm": 0.14942041039466858, + "learning_rate": 1.4317566941767625e-05, + "loss": 1.6872, + "step": 24774 + }, + { + "epoch": 7.604358502148557, + "grad_norm": 0.1646505445241928, + "learning_rate": 1.4314085221969209e-05, + "loss": 1.663, + "step": 24775 + }, + { + "epoch": 7.604665438919582, + "grad_norm": 0.17342600226402283, + "learning_rate": 1.4310603854839904e-05, + "loss": 1.7702, + "step": 24776 + }, + { + "epoch": 7.604972375690608, + "grad_norm": 0.17148490250110626, + "learning_rate": 1.4307122840414167e-05, + "loss": 1.7392, + "step": 24777 + }, + { + "epoch": 7.605279312461633, + "grad_norm": 0.22112305462360382, + "learning_rate": 1.4303642178726328e-05, + "loss": 1.6784, + "step": 24778 + }, + { + "epoch": 7.605586249232658, + "grad_norm": 0.22548529505729675, + "learning_rate": 1.4300161869810846e-05, + "loss": 1.7405, + "step": 24779 + }, + { + "epoch": 7.605893186003684, + "grad_norm": 0.179958313703537, + "learning_rate": 1.4296681913702065e-05, + "loss": 1.6848, + "step": 24780 + }, + { + "epoch": 7.606200122774708, + "grad_norm": 0.16872282326221466, + "learning_rate": 1.4293202310434407e-05, + "loss": 1.6973, + "step": 24781 + }, + { + "epoch": 7.606507059545733, + "grad_norm": 0.20554648339748383, + "learning_rate": 1.428972306004226e-05, + "loss": 1.7111, + "step": 24782 + }, + { + "epoch": 7.606813996316759, + "grad_norm": 0.1803034543991089, + "learning_rate": 1.4286244162559993e-05, + "loss": 1.6895, + "step": 24783 + }, + { + "epoch": 7.607120933087784, + "grad_norm": 0.18902915716171265, + "learning_rate": 1.4282765618021999e-05, + "loss": 1.766, + "step": 24784 + }, + { + "epoch": 7.607427869858809, + "grad_norm": 0.16692081093788147, + "learning_rate": 1.4279287426462646e-05, + "loss": 1.688, + "step": 24785 + }, + { + "epoch": 7.607734806629834, + "grad_norm": 0.1538083851337433, + 
"learning_rate": 1.4275809587916317e-05, + "loss": 1.6611, + "step": 24786 + }, + { + "epoch": 7.608041743400859, + "grad_norm": 0.1921710968017578, + "learning_rate": 1.4272332102417369e-05, + "loss": 1.7338, + "step": 24787 + }, + { + "epoch": 7.6083486801718845, + "grad_norm": 0.1812380999326706, + "learning_rate": 1.4268854970000167e-05, + "loss": 1.7613, + "step": 24788 + }, + { + "epoch": 7.60865561694291, + "grad_norm": 0.1762949675321579, + "learning_rate": 1.4265378190699108e-05, + "loss": 1.6796, + "step": 24789 + }, + { + "epoch": 7.608962553713935, + "grad_norm": 0.17698180675506592, + "learning_rate": 1.4261901764548497e-05, + "loss": 1.7065, + "step": 24790 + }, + { + "epoch": 7.6092694904849605, + "grad_norm": 0.18398644030094147, + "learning_rate": 1.4258425691582756e-05, + "loss": 1.7322, + "step": 24791 + }, + { + "epoch": 7.609576427255985, + "grad_norm": 0.18370044231414795, + "learning_rate": 1.425494997183618e-05, + "loss": 1.7565, + "step": 24792 + }, + { + "epoch": 7.60988336402701, + "grad_norm": 0.19615988433361053, + "learning_rate": 1.4251474605343124e-05, + "loss": 1.7507, + "step": 24793 + }, + { + "epoch": 7.610190300798036, + "grad_norm": 0.17218533158302307, + "learning_rate": 1.4247999592137979e-05, + "loss": 1.6692, + "step": 24794 + }, + { + "epoch": 7.610497237569061, + "grad_norm": 0.19105172157287598, + "learning_rate": 1.4244524932255027e-05, + "loss": 1.7421, + "step": 24795 + }, + { + "epoch": 7.610804174340086, + "grad_norm": 0.21565218269824982, + "learning_rate": 1.424105062572867e-05, + "loss": 1.7143, + "step": 24796 + }, + { + "epoch": 7.611111111111111, + "grad_norm": 0.17394152283668518, + "learning_rate": 1.4237576672593178e-05, + "loss": 1.7202, + "step": 24797 + }, + { + "epoch": 7.611418047882136, + "grad_norm": 0.18680404126644135, + "learning_rate": 1.4234103072882926e-05, + "loss": 1.7155, + "step": 24798 + }, + { + "epoch": 7.611724984653161, + "grad_norm": 0.16173312067985535, + "learning_rate": 
1.4230629826632237e-05, + "loss": 1.6549, + "step": 24799 + }, + { + "epoch": 7.612031921424187, + "grad_norm": 0.2055300772190094, + "learning_rate": 1.4227156933875423e-05, + "loss": 1.7382, + "step": 24800 + }, + { + "epoch": 7.612338858195212, + "grad_norm": 0.17331050336360931, + "learning_rate": 1.4223684394646813e-05, + "loss": 1.719, + "step": 24801 + }, + { + "epoch": 7.612645794966237, + "grad_norm": 0.23106786608695984, + "learning_rate": 1.4220212208980727e-05, + "loss": 1.7083, + "step": 24802 + }, + { + "epoch": 7.612952731737262, + "grad_norm": 0.21011751890182495, + "learning_rate": 1.4216740376911469e-05, + "loss": 1.7629, + "step": 24803 + }, + { + "epoch": 7.613259668508287, + "grad_norm": 0.15120279788970947, + "learning_rate": 1.4213268898473359e-05, + "loss": 1.673, + "step": 24804 + }, + { + "epoch": 7.6135666052793125, + "grad_norm": 0.17431862652301788, + "learning_rate": 1.4209797773700684e-05, + "loss": 1.672, + "step": 24805 + }, + { + "epoch": 7.613873542050338, + "grad_norm": 0.1592133790254593, + "learning_rate": 1.42063270026278e-05, + "loss": 1.7102, + "step": 24806 + }, + { + "epoch": 7.614180478821363, + "grad_norm": 0.22535641491413116, + "learning_rate": 1.4202856585288954e-05, + "loss": 1.7177, + "step": 24807 + }, + { + "epoch": 7.614487415592388, + "grad_norm": 0.2111314982175827, + "learning_rate": 1.4199386521718455e-05, + "loss": 1.7399, + "step": 24808 + }, + { + "epoch": 7.614794352363413, + "grad_norm": 0.18377532064914703, + "learning_rate": 1.419591681195061e-05, + "loss": 1.6713, + "step": 24809 + }, + { + "epoch": 7.615101289134438, + "grad_norm": 0.19743949174880981, + "learning_rate": 1.4192447456019681e-05, + "loss": 1.7761, + "step": 24810 + }, + { + "epoch": 7.615408225905464, + "grad_norm": 0.17827409505844116, + "learning_rate": 1.4188978453960006e-05, + "loss": 1.7091, + "step": 24811 + }, + { + "epoch": 7.615715162676489, + "grad_norm": 0.18304505944252014, + "learning_rate": 1.4185509805805802e-05, + 
"loss": 1.7496, + "step": 24812 + }, + { + "epoch": 7.616022099447514, + "grad_norm": 0.19510503113269806, + "learning_rate": 1.4182041511591415e-05, + "loss": 1.7436, + "step": 24813 + }, + { + "epoch": 7.616329036218539, + "grad_norm": 0.17127136886119843, + "learning_rate": 1.4178573571351056e-05, + "loss": 1.6598, + "step": 24814 + }, + { + "epoch": 7.616635972989564, + "grad_norm": 0.20133370161056519, + "learning_rate": 1.4175105985119041e-05, + "loss": 1.7802, + "step": 24815 + }, + { + "epoch": 7.616942909760589, + "grad_norm": 0.17706145346164703, + "learning_rate": 1.4171638752929634e-05, + "loss": 1.7105, + "step": 24816 + }, + { + "epoch": 7.617249846531615, + "grad_norm": 0.179647758603096, + "learning_rate": 1.4168171874817088e-05, + "loss": 1.732, + "step": 24817 + }, + { + "epoch": 7.617556783302639, + "grad_norm": 0.16380085051059723, + "learning_rate": 1.4164705350815665e-05, + "loss": 1.6671, + "step": 24818 + }, + { + "epoch": 7.6178637200736645, + "grad_norm": 0.19407404959201813, + "learning_rate": 1.4161239180959635e-05, + "loss": 1.7261, + "step": 24819 + }, + { + "epoch": 7.61817065684469, + "grad_norm": 0.1647375524044037, + "learning_rate": 1.415777336528324e-05, + "loss": 1.7438, + "step": 24820 + }, + { + "epoch": 7.618477593615715, + "grad_norm": 0.21532754600048065, + "learning_rate": 1.4154307903820735e-05, + "loss": 1.7674, + "step": 24821 + }, + { + "epoch": 7.6187845303867405, + "grad_norm": 0.1834939867258072, + "learning_rate": 1.4150842796606372e-05, + "loss": 1.7027, + "step": 24822 + }, + { + "epoch": 7.619091467157766, + "grad_norm": 0.15102218091487885, + "learning_rate": 1.4147378043674397e-05, + "loss": 1.6858, + "step": 24823 + }, + { + "epoch": 7.61939840392879, + "grad_norm": 0.161713644862175, + "learning_rate": 1.4143913645059038e-05, + "loss": 1.7149, + "step": 24824 + }, + { + "epoch": 7.619705340699816, + "grad_norm": 0.15568867325782776, + "learning_rate": 1.4140449600794547e-05, + "loss": 1.6642, + "step": 24825 
+ }, + { + "epoch": 7.620012277470841, + "grad_norm": 0.15993504226207733, + "learning_rate": 1.4136985910915147e-05, + "loss": 1.6497, + "step": 24826 + }, + { + "epoch": 7.620319214241866, + "grad_norm": 0.16981028020381927, + "learning_rate": 1.4133522575455055e-05, + "loss": 1.7347, + "step": 24827 + }, + { + "epoch": 7.620626151012892, + "grad_norm": 0.16143053770065308, + "learning_rate": 1.4130059594448547e-05, + "loss": 1.7166, + "step": 24828 + }, + { + "epoch": 7.620933087783916, + "grad_norm": 0.16914571821689606, + "learning_rate": 1.4126596967929789e-05, + "loss": 1.7008, + "step": 24829 + }, + { + "epoch": 7.621240024554941, + "grad_norm": 0.20040032267570496, + "learning_rate": 1.4123134695933049e-05, + "loss": 1.7099, + "step": 24830 + }, + { + "epoch": 7.621546961325967, + "grad_norm": 0.17086143791675568, + "learning_rate": 1.4119672778492493e-05, + "loss": 1.6913, + "step": 24831 + }, + { + "epoch": 7.621853898096992, + "grad_norm": 0.16268399357795715, + "learning_rate": 1.4116211215642378e-05, + "loss": 1.6919, + "step": 24832 + }, + { + "epoch": 7.622160834868017, + "grad_norm": 0.21211197972297668, + "learning_rate": 1.4112750007416891e-05, + "loss": 1.7493, + "step": 24833 + }, + { + "epoch": 7.622467771639043, + "grad_norm": 0.16767694056034088, + "learning_rate": 1.4109289153850247e-05, + "loss": 1.6863, + "step": 24834 + }, + { + "epoch": 7.622774708410067, + "grad_norm": 0.1769869178533554, + "learning_rate": 1.4105828654976639e-05, + "loss": 1.7303, + "step": 24835 + }, + { + "epoch": 7.6230816451810925, + "grad_norm": 0.2202748954296112, + "learning_rate": 1.4102368510830278e-05, + "loss": 1.7648, + "step": 24836 + }, + { + "epoch": 7.623388581952118, + "grad_norm": 0.18347454071044922, + "learning_rate": 1.4098908721445342e-05, + "loss": 1.7615, + "step": 24837 + }, + { + "epoch": 7.623695518723143, + "grad_norm": 0.17966698110103607, + "learning_rate": 1.4095449286856039e-05, + "loss": 1.7031, + "step": 24838 + }, + { + "epoch": 
7.6240024554941686, + "grad_norm": 0.1794397532939911, + "learning_rate": 1.409199020709655e-05, + "loss": 1.7129, + "step": 24839 + }, + { + "epoch": 7.624309392265193, + "grad_norm": 0.1838780641555786, + "learning_rate": 1.4088531482201056e-05, + "loss": 1.6936, + "step": 24840 + }, + { + "epoch": 7.624616329036218, + "grad_norm": 0.1940378099679947, + "learning_rate": 1.4085073112203745e-05, + "loss": 1.71, + "step": 24841 + }, + { + "epoch": 7.624923265807244, + "grad_norm": 0.17340345680713654, + "learning_rate": 1.4081615097138796e-05, + "loss": 1.711, + "step": 24842 + }, + { + "epoch": 7.625230202578269, + "grad_norm": 0.23193266987800598, + "learning_rate": 1.4078157437040374e-05, + "loss": 1.7366, + "step": 24843 + }, + { + "epoch": 7.625537139349294, + "grad_norm": 0.1742531955242157, + "learning_rate": 1.4074700131942653e-05, + "loss": 1.7179, + "step": 24844 + }, + { + "epoch": 7.62584407612032, + "grad_norm": 0.22453147172927856, + "learning_rate": 1.4071243181879806e-05, + "loss": 1.708, + "step": 24845 + }, + { + "epoch": 7.626151012891344, + "grad_norm": 0.16176854074001312, + "learning_rate": 1.4067786586885977e-05, + "loss": 1.7012, + "step": 24846 + }, + { + "epoch": 7.6264579496623695, + "grad_norm": 0.16796015202999115, + "learning_rate": 1.4064330346995369e-05, + "loss": 1.6918, + "step": 24847 + }, + { + "epoch": 7.626764886433395, + "grad_norm": 0.1737142950296402, + "learning_rate": 1.4060874462242085e-05, + "loss": 1.6908, + "step": 24848 + }, + { + "epoch": 7.62707182320442, + "grad_norm": 0.1697089523077011, + "learning_rate": 1.4057418932660315e-05, + "loss": 1.6811, + "step": 24849 + }, + { + "epoch": 7.627378759975445, + "grad_norm": 0.19860011339187622, + "learning_rate": 1.40539637582842e-05, + "loss": 1.7803, + "step": 24850 + }, + { + "epoch": 7.62768569674647, + "grad_norm": 0.16383512318134308, + "learning_rate": 1.4050508939147883e-05, + "loss": 1.7004, + "step": 24851 + }, + { + "epoch": 7.627992633517495, + "grad_norm": 
0.18878768384456635, + "learning_rate": 1.404705447528551e-05, + "loss": 1.6916, + "step": 24852 + }, + { + "epoch": 7.628299570288521, + "grad_norm": 0.1417449563741684, + "learning_rate": 1.4043600366731213e-05, + "loss": 1.6908, + "step": 24853 + }, + { + "epoch": 7.628606507059546, + "grad_norm": 0.19786077737808228, + "learning_rate": 1.4040146613519134e-05, + "loss": 1.7307, + "step": 24854 + }, + { + "epoch": 7.628913443830571, + "grad_norm": 0.17295710742473602, + "learning_rate": 1.40366932156834e-05, + "loss": 1.7111, + "step": 24855 + }, + { + "epoch": 7.629220380601596, + "grad_norm": 0.2160167098045349, + "learning_rate": 1.4033240173258144e-05, + "loss": 1.71, + "step": 24856 + }, + { + "epoch": 7.629527317372621, + "grad_norm": 0.1741226315498352, + "learning_rate": 1.402978748627749e-05, + "loss": 1.7024, + "step": 24857 + }, + { + "epoch": 7.629834254143646, + "grad_norm": 0.18043182790279388, + "learning_rate": 1.4026335154775561e-05, + "loss": 1.7046, + "step": 24858 + }, + { + "epoch": 7.630141190914672, + "grad_norm": 0.1592903584241867, + "learning_rate": 1.4022883178786472e-05, + "loss": 1.6913, + "step": 24859 + }, + { + "epoch": 7.630448127685697, + "grad_norm": 0.25504007935523987, + "learning_rate": 1.4019431558344337e-05, + "loss": 1.7221, + "step": 24860 + }, + { + "epoch": 7.6307550644567215, + "grad_norm": 0.15307627618312836, + "learning_rate": 1.4015980293483272e-05, + "loss": 1.6725, + "step": 24861 + }, + { + "epoch": 7.631062001227747, + "grad_norm": 0.2595232129096985, + "learning_rate": 1.4012529384237372e-05, + "loss": 1.7309, + "step": 24862 + }, + { + "epoch": 7.631368937998772, + "grad_norm": 0.19494156539440155, + "learning_rate": 1.4009078830640743e-05, + "loss": 1.737, + "step": 24863 + }, + { + "epoch": 7.6316758747697975, + "grad_norm": 0.19264118373394012, + "learning_rate": 1.4005628632727518e-05, + "loss": 1.7337, + "step": 24864 + }, + { + "epoch": 7.631982811540823, + "grad_norm": 0.18758688867092133, + 
"learning_rate": 1.400217879053174e-05, + "loss": 1.684, + "step": 24865 + }, + { + "epoch": 7.632289748311848, + "grad_norm": 0.17094476521015167, + "learning_rate": 1.399872930408756e-05, + "loss": 1.6724, + "step": 24866 + }, + { + "epoch": 7.632596685082873, + "grad_norm": 0.18967430293560028, + "learning_rate": 1.3995280173429003e-05, + "loss": 1.6852, + "step": 24867 + }, + { + "epoch": 7.632903621853898, + "grad_norm": 0.1686837375164032, + "learning_rate": 1.399183139859021e-05, + "loss": 1.6673, + "step": 24868 + }, + { + "epoch": 7.633210558624923, + "grad_norm": 0.19091126322746277, + "learning_rate": 1.398838297960524e-05, + "loss": 1.7423, + "step": 24869 + }, + { + "epoch": 7.633517495395949, + "grad_norm": 0.20197629928588867, + "learning_rate": 1.3984934916508186e-05, + "loss": 1.7217, + "step": 24870 + }, + { + "epoch": 7.633824432166974, + "grad_norm": 0.1490679830312729, + "learning_rate": 1.3981487209333105e-05, + "loss": 1.6367, + "step": 24871 + }, + { + "epoch": 7.634131368937998, + "grad_norm": 0.14664824306964874, + "learning_rate": 1.3978039858114084e-05, + "loss": 1.68, + "step": 24872 + }, + { + "epoch": 7.634438305709024, + "grad_norm": 0.19181138277053833, + "learning_rate": 1.3974592862885182e-05, + "loss": 1.766, + "step": 24873 + }, + { + "epoch": 7.634745242480049, + "grad_norm": 0.17716391384601593, + "learning_rate": 1.397114622368047e-05, + "loss": 1.7479, + "step": 24874 + }, + { + "epoch": 7.635052179251074, + "grad_norm": 0.16603589057922363, + "learning_rate": 1.3967699940534006e-05, + "loss": 1.6455, + "step": 24875 + }, + { + "epoch": 7.6353591160221, + "grad_norm": 0.19060885906219482, + "learning_rate": 1.3964254013479855e-05, + "loss": 1.7367, + "step": 24876 + }, + { + "epoch": 7.635666052793125, + "grad_norm": 0.18182092905044556, + "learning_rate": 1.3960808442552064e-05, + "loss": 1.7235, + "step": 24877 + }, + { + "epoch": 7.6359729895641495, + "grad_norm": 0.22578656673431396, + "learning_rate": 
1.3957363227784691e-05, + "loss": 1.7229, + "step": 24878 + }, + { + "epoch": 7.636279926335175, + "grad_norm": 0.25397053360939026, + "learning_rate": 1.3953918369211776e-05, + "loss": 1.7094, + "step": 24879 + }, + { + "epoch": 7.6365868631062, + "grad_norm": 0.164917454123497, + "learning_rate": 1.3950473866867353e-05, + "loss": 1.695, + "step": 24880 + }, + { + "epoch": 7.6368937998772255, + "grad_norm": 0.18737520277500153, + "learning_rate": 1.3947029720785503e-05, + "loss": 1.6719, + "step": 24881 + }, + { + "epoch": 7.637200736648251, + "grad_norm": 0.1839492917060852, + "learning_rate": 1.3943585931000213e-05, + "loss": 1.7136, + "step": 24882 + }, + { + "epoch": 7.637507673419275, + "grad_norm": 0.17182856798171997, + "learning_rate": 1.3940142497545566e-05, + "loss": 1.678, + "step": 24883 + }, + { + "epoch": 7.637814610190301, + "grad_norm": 0.20733827352523804, + "learning_rate": 1.393669942045554e-05, + "loss": 1.6398, + "step": 24884 + }, + { + "epoch": 7.638121546961326, + "grad_norm": 0.19326196610927582, + "learning_rate": 1.3933256699764196e-05, + "loss": 1.7351, + "step": 24885 + }, + { + "epoch": 7.638428483732351, + "grad_norm": 0.2368818074464798, + "learning_rate": 1.3929814335505552e-05, + "loss": 1.7567, + "step": 24886 + }, + { + "epoch": 7.638735420503377, + "grad_norm": 0.16702532768249512, + "learning_rate": 1.3926372327713626e-05, + "loss": 1.6791, + "step": 24887 + }, + { + "epoch": 7.639042357274402, + "grad_norm": 0.18634511530399323, + "learning_rate": 1.3922930676422435e-05, + "loss": 1.691, + "step": 24888 + }, + { + "epoch": 7.639349294045426, + "grad_norm": 0.19349521398544312, + "learning_rate": 1.3919489381665985e-05, + "loss": 1.7037, + "step": 24889 + }, + { + "epoch": 7.639656230816452, + "grad_norm": 0.16760465502738953, + "learning_rate": 1.3916048443478286e-05, + "loss": 1.6871, + "step": 24890 + }, + { + "epoch": 7.639963167587477, + "grad_norm": 0.25489017367362976, + "learning_rate": 1.3912607861893351e-05, + 
"loss": 1.6914, + "step": 24891 + }, + { + "epoch": 7.640270104358502, + "grad_norm": 0.17488406598567963, + "learning_rate": 1.390916763694517e-05, + "loss": 1.6826, + "step": 24892 + }, + { + "epoch": 7.640577041129527, + "grad_norm": 0.2128411829471588, + "learning_rate": 1.3905727768667753e-05, + "loss": 1.711, + "step": 24893 + }, + { + "epoch": 7.640883977900552, + "grad_norm": 0.17478415369987488, + "learning_rate": 1.3902288257095087e-05, + "loss": 1.7174, + "step": 24894 + }, + { + "epoch": 7.6411909146715775, + "grad_norm": 0.20493042469024658, + "learning_rate": 1.3898849102261168e-05, + "loss": 1.7649, + "step": 24895 + }, + { + "epoch": 7.641497851442603, + "grad_norm": 0.16712170839309692, + "learning_rate": 1.3895410304199979e-05, + "loss": 1.6785, + "step": 24896 + }, + { + "epoch": 7.641804788213628, + "grad_norm": 0.18580594658851624, + "learning_rate": 1.3891971862945497e-05, + "loss": 1.7001, + "step": 24897 + }, + { + "epoch": 7.6421117249846535, + "grad_norm": 0.19040817022323608, + "learning_rate": 1.3888533778531737e-05, + "loss": 1.709, + "step": 24898 + }, + { + "epoch": 7.642418661755678, + "grad_norm": 0.17573465406894684, + "learning_rate": 1.3885096050992624e-05, + "loss": 1.7205, + "step": 24899 + }, + { + "epoch": 7.642725598526703, + "grad_norm": 0.19123490154743195, + "learning_rate": 1.3881658680362186e-05, + "loss": 1.6882, + "step": 24900 + }, + { + "epoch": 7.643032535297729, + "grad_norm": 0.18465565145015717, + "learning_rate": 1.387822166667434e-05, + "loss": 1.7294, + "step": 24901 + }, + { + "epoch": 7.643339472068754, + "grad_norm": 0.17927341163158417, + "learning_rate": 1.3874785009963098e-05, + "loss": 1.7625, + "step": 24902 + }, + { + "epoch": 7.643646408839779, + "grad_norm": 0.15983298420906067, + "learning_rate": 1.38713487102624e-05, + "loss": 1.6939, + "step": 24903 + }, + { + "epoch": 7.643953345610804, + "grad_norm": 0.20288127660751343, + "learning_rate": 1.3867912767606211e-05, + "loss": 1.7461, + "step": 
24904 + }, + { + "epoch": 7.644260282381829, + "grad_norm": 0.18587160110473633, + "learning_rate": 1.3864477182028484e-05, + "loss": 1.7389, + "step": 24905 + }, + { + "epoch": 7.644567219152854, + "grad_norm": 0.17089903354644775, + "learning_rate": 1.3861041953563175e-05, + "loss": 1.6697, + "step": 24906 + }, + { + "epoch": 7.64487415592388, + "grad_norm": 0.20302993059158325, + "learning_rate": 1.3857607082244228e-05, + "loss": 1.7199, + "step": 24907 + }, + { + "epoch": 7.645181092694905, + "grad_norm": 0.14781002700328827, + "learning_rate": 1.3854172568105594e-05, + "loss": 1.687, + "step": 24908 + }, + { + "epoch": 7.64548802946593, + "grad_norm": 0.17847368121147156, + "learning_rate": 1.3850738411181214e-05, + "loss": 1.6511, + "step": 24909 + }, + { + "epoch": 7.645794966236955, + "grad_norm": 0.1448936015367508, + "learning_rate": 1.3847304611505019e-05, + "loss": 1.6601, + "step": 24910 + }, + { + "epoch": 7.64610190300798, + "grad_norm": 0.19413447380065918, + "learning_rate": 1.3843871169110955e-05, + "loss": 1.6901, + "step": 24911 + }, + { + "epoch": 7.6464088397790055, + "grad_norm": 0.18118292093276978, + "learning_rate": 1.3840438084032947e-05, + "loss": 1.7574, + "step": 24912 + }, + { + "epoch": 7.646715776550031, + "grad_norm": 0.16136041283607483, + "learning_rate": 1.3837005356304921e-05, + "loss": 1.6826, + "step": 24913 + }, + { + "epoch": 7.647022713321056, + "grad_norm": 0.1773926019668579, + "learning_rate": 1.3833572985960792e-05, + "loss": 1.7136, + "step": 24914 + }, + { + "epoch": 7.647329650092081, + "grad_norm": 0.15100078284740448, + "learning_rate": 1.3830140973034522e-05, + "loss": 1.7331, + "step": 24915 + }, + { + "epoch": 7.647636586863106, + "grad_norm": 0.16588352620601654, + "learning_rate": 1.3826709317559966e-05, + "loss": 1.6883, + "step": 24916 + }, + { + "epoch": 7.647943523634131, + "grad_norm": 0.14271478354930878, + "learning_rate": 1.3823278019571106e-05, + "loss": 1.6566, + "step": 24917 + }, + { + "epoch": 
7.648250460405157, + "grad_norm": 0.18383146822452545, + "learning_rate": 1.3819847079101782e-05, + "loss": 1.7006, + "step": 24918 + }, + { + "epoch": 7.648557397176182, + "grad_norm": 0.20069970190525055, + "learning_rate": 1.3816416496185952e-05, + "loss": 1.696, + "step": 24919 + }, + { + "epoch": 7.648864333947207, + "grad_norm": 0.15686273574829102, + "learning_rate": 1.3812986270857497e-05, + "loss": 1.6998, + "step": 24920 + }, + { + "epoch": 7.649171270718232, + "grad_norm": 0.14733602106571198, + "learning_rate": 1.3809556403150326e-05, + "loss": 1.6692, + "step": 24921 + }, + { + "epoch": 7.649478207489257, + "grad_norm": 0.16720153391361237, + "learning_rate": 1.3806126893098332e-05, + "loss": 1.6841, + "step": 24922 + }, + { + "epoch": 7.649785144260282, + "grad_norm": 0.1548861712217331, + "learning_rate": 1.3802697740735404e-05, + "loss": 1.6914, + "step": 24923 + }, + { + "epoch": 7.650092081031308, + "grad_norm": 0.1591617912054062, + "learning_rate": 1.3799268946095433e-05, + "loss": 1.7121, + "step": 24924 + }, + { + "epoch": 7.650399017802332, + "grad_norm": 0.19735665619373322, + "learning_rate": 1.3795840509212305e-05, + "loss": 1.741, + "step": 24925 + }, + { + "epoch": 7.650705954573358, + "grad_norm": 0.16886921226978302, + "learning_rate": 1.37924124301199e-05, + "loss": 1.7166, + "step": 24926 + }, + { + "epoch": 7.651012891344383, + "grad_norm": 0.2084806114435196, + "learning_rate": 1.3788984708852098e-05, + "loss": 1.7525, + "step": 24927 + }, + { + "epoch": 7.651319828115408, + "grad_norm": 0.15286533534526825, + "learning_rate": 1.3785557345442773e-05, + "loss": 1.6754, + "step": 24928 + }, + { + "epoch": 7.651626764886434, + "grad_norm": 0.19647163152694702, + "learning_rate": 1.3782130339925792e-05, + "loss": 1.7114, + "step": 24929 + }, + { + "epoch": 7.651933701657459, + "grad_norm": 0.18526645004749298, + "learning_rate": 1.3778703692335031e-05, + "loss": 1.7258, + "step": 24930 + }, + { + "epoch": 7.652240638428484, + 
"grad_norm": 0.19880451261997223, + "learning_rate": 1.3775277402704334e-05, + "loss": 1.7065, + "step": 24931 + }, + { + "epoch": 7.652547575199509, + "grad_norm": 0.18702107667922974, + "learning_rate": 1.377185147106761e-05, + "loss": 1.7171, + "step": 24932 + }, + { + "epoch": 7.652854511970534, + "grad_norm": 0.1455291509628296, + "learning_rate": 1.3768425897458654e-05, + "loss": 1.6824, + "step": 24933 + }, + { + "epoch": 7.653161448741559, + "grad_norm": 0.16770213842391968, + "learning_rate": 1.3765000681911377e-05, + "loss": 1.6544, + "step": 24934 + }, + { + "epoch": 7.653468385512585, + "grad_norm": 0.18496285378932953, + "learning_rate": 1.3761575824459572e-05, + "loss": 1.7206, + "step": 24935 + }, + { + "epoch": 7.653775322283609, + "grad_norm": 0.1832813024520874, + "learning_rate": 1.3758151325137131e-05, + "loss": 1.7673, + "step": 24936 + }, + { + "epoch": 7.6540822590546345, + "grad_norm": 0.20916350185871124, + "learning_rate": 1.3754727183977878e-05, + "loss": 1.7224, + "step": 24937 + }, + { + "epoch": 7.65438919582566, + "grad_norm": 0.1878765970468521, + "learning_rate": 1.3751303401015653e-05, + "loss": 1.6966, + "step": 24938 + }, + { + "epoch": 7.654696132596685, + "grad_norm": 0.17944355309009552, + "learning_rate": 1.37478799762843e-05, + "loss": 1.6752, + "step": 24939 + }, + { + "epoch": 7.6550030693677105, + "grad_norm": 0.20930083096027374, + "learning_rate": 1.3744456909817638e-05, + "loss": 1.7632, + "step": 24940 + }, + { + "epoch": 7.655310006138736, + "grad_norm": 0.19838237762451172, + "learning_rate": 1.3741034201649511e-05, + "loss": 1.7039, + "step": 24941 + }, + { + "epoch": 7.65561694290976, + "grad_norm": 0.233023539185524, + "learning_rate": 1.373761185181373e-05, + "loss": 1.7117, + "step": 24942 + }, + { + "epoch": 7.655923879680786, + "grad_norm": 0.16270874440670013, + "learning_rate": 1.3734189860344127e-05, + "loss": 1.6603, + "step": 24943 + }, + { + "epoch": 7.656230816451811, + "grad_norm": 
0.18456563353538513, + "learning_rate": 1.373076822727451e-05, + "loss": 1.6891, + "step": 24944 + }, + { + "epoch": 7.656537753222836, + "grad_norm": 0.17064985632896423, + "learning_rate": 1.3727346952638703e-05, + "loss": 1.6788, + "step": 24945 + }, + { + "epoch": 7.656844689993862, + "grad_norm": 0.17548689246177673, + "learning_rate": 1.3723926036470513e-05, + "loss": 1.6699, + "step": 24946 + }, + { + "epoch": 7.657151626764886, + "grad_norm": 0.1660275012254715, + "learning_rate": 1.3720505478803753e-05, + "loss": 1.6706, + "step": 24947 + }, + { + "epoch": 7.657458563535911, + "grad_norm": 0.2977990508079529, + "learning_rate": 1.3717085279672199e-05, + "loss": 1.7463, + "step": 24948 + }, + { + "epoch": 7.657765500306937, + "grad_norm": 0.24440810084342957, + "learning_rate": 1.3713665439109708e-05, + "loss": 1.7528, + "step": 24949 + }, + { + "epoch": 7.658072437077962, + "grad_norm": 0.1579941064119339, + "learning_rate": 1.3710245957150015e-05, + "loss": 1.6902, + "step": 24950 + }, + { + "epoch": 7.658379373848987, + "grad_norm": 0.197731152176857, + "learning_rate": 1.3706826833826968e-05, + "loss": 1.7377, + "step": 24951 + }, + { + "epoch": 7.658686310620013, + "grad_norm": 0.16704770922660828, + "learning_rate": 1.3703408069174301e-05, + "loss": 1.7057, + "step": 24952 + }, + { + "epoch": 7.658993247391037, + "grad_norm": 0.2167888730764389, + "learning_rate": 1.3699989663225848e-05, + "loss": 1.7668, + "step": 24953 + }, + { + "epoch": 7.6593001841620625, + "grad_norm": 0.16870343685150146, + "learning_rate": 1.369657161601537e-05, + "loss": 1.6781, + "step": 24954 + }, + { + "epoch": 7.659607120933088, + "grad_norm": 0.22422032058238983, + "learning_rate": 1.3693153927576646e-05, + "loss": 1.7034, + "step": 24955 + }, + { + "epoch": 7.659914057704113, + "grad_norm": 0.20777738094329834, + "learning_rate": 1.3689736597943465e-05, + "loss": 1.7401, + "step": 24956 + }, + { + "epoch": 7.6602209944751385, + "grad_norm": 0.17802980542182922, + 
"learning_rate": 1.3686319627149579e-05, + "loss": 1.7067, + "step": 24957 + }, + { + "epoch": 7.660527931246163, + "grad_norm": 0.21444065868854523, + "learning_rate": 1.368290301522877e-05, + "loss": 1.6731, + "step": 24958 + }, + { + "epoch": 7.660834868017188, + "grad_norm": 0.17638131976127625, + "learning_rate": 1.3679486762214805e-05, + "loss": 1.738, + "step": 24959 + }, + { + "epoch": 7.661141804788214, + "grad_norm": 0.1900044083595276, + "learning_rate": 1.3676070868141432e-05, + "loss": 1.7673, + "step": 24960 + }, + { + "epoch": 7.661448741559239, + "grad_norm": 0.20749469101428986, + "learning_rate": 1.3672655333042422e-05, + "loss": 1.7341, + "step": 24961 + }, + { + "epoch": 7.661755678330264, + "grad_norm": 0.21292604506015778, + "learning_rate": 1.3669240156951518e-05, + "loss": 1.7114, + "step": 24962 + }, + { + "epoch": 7.66206261510129, + "grad_norm": 0.21506401896476746, + "learning_rate": 1.3665825339902482e-05, + "loss": 1.7412, + "step": 24963 + }, + { + "epoch": 7.662369551872314, + "grad_norm": 0.21838976442813873, + "learning_rate": 1.3662410881929055e-05, + "loss": 1.7178, + "step": 24964 + }, + { + "epoch": 7.662676488643339, + "grad_norm": 0.18973253667354584, + "learning_rate": 1.365899678306497e-05, + "loss": 1.7161, + "step": 24965 + }, + { + "epoch": 7.662983425414365, + "grad_norm": 0.19278603792190552, + "learning_rate": 1.3655583043344006e-05, + "loss": 1.6952, + "step": 24966 + }, + { + "epoch": 7.66329036218539, + "grad_norm": 0.2025471180677414, + "learning_rate": 1.365216966279984e-05, + "loss": 1.6893, + "step": 24967 + }, + { + "epoch": 7.6635972989564145, + "grad_norm": 0.14461325109004974, + "learning_rate": 1.364875664146627e-05, + "loss": 1.6762, + "step": 24968 + }, + { + "epoch": 7.66390423572744, + "grad_norm": 0.22851425409317017, + "learning_rate": 1.3645343979376962e-05, + "loss": 1.7743, + "step": 24969 + }, + { + "epoch": 7.664211172498465, + "grad_norm": 0.16862350702285767, + "learning_rate": 
1.3641931676565688e-05, + "loss": 1.6385, + "step": 24970 + }, + { + "epoch": 7.6645181092694905, + "grad_norm": 0.20482461154460907, + "learning_rate": 1.3638519733066157e-05, + "loss": 1.7824, + "step": 24971 + }, + { + "epoch": 7.664825046040516, + "grad_norm": 0.18505734205245972, + "learning_rate": 1.3635108148912085e-05, + "loss": 1.6845, + "step": 24972 + }, + { + "epoch": 7.665131982811541, + "grad_norm": 0.18774990737438202, + "learning_rate": 1.3631696924137189e-05, + "loss": 1.7091, + "step": 24973 + }, + { + "epoch": 7.665438919582566, + "grad_norm": 0.1967296153306961, + "learning_rate": 1.362828605877518e-05, + "loss": 1.6953, + "step": 24974 + }, + { + "epoch": 7.665745856353591, + "grad_norm": 0.16951262950897217, + "learning_rate": 1.3624875552859767e-05, + "loss": 1.7302, + "step": 24975 + }, + { + "epoch": 7.666052793124616, + "grad_norm": 0.21003109216690063, + "learning_rate": 1.3621465406424656e-05, + "loss": 1.7567, + "step": 24976 + }, + { + "epoch": 7.666359729895642, + "grad_norm": 0.19087877869606018, + "learning_rate": 1.361805561950354e-05, + "loss": 1.7373, + "step": 24977 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 0.17799946665763855, + "learning_rate": 1.3614646192130126e-05, + "loss": 1.7121, + "step": 24978 + }, + { + "epoch": 7.666973603437691, + "grad_norm": 0.15956062078475952, + "learning_rate": 1.3611237124338105e-05, + "loss": 1.6654, + "step": 24979 + }, + { + "epoch": 7.667280540208717, + "grad_norm": 0.1963697075843811, + "learning_rate": 1.3607828416161167e-05, + "loss": 1.7902, + "step": 24980 + }, + { + "epoch": 7.667587476979742, + "grad_norm": 0.22204460203647614, + "learning_rate": 1.3604420067632995e-05, + "loss": 1.8199, + "step": 24981 + }, + { + "epoch": 7.667894413750767, + "grad_norm": 0.20523740351200104, + "learning_rate": 1.3601012078787268e-05, + "loss": 1.7253, + "step": 24982 + }, + { + "epoch": 7.668201350521793, + "grad_norm": 0.18693773448467255, + "learning_rate": 1.3597604449657697e-05, + 
"loss": 1.7032, + "step": 24983 + }, + { + "epoch": 7.668508287292818, + "grad_norm": 0.17661312222480774, + "learning_rate": 1.3594197180277906e-05, + "loss": 1.6648, + "step": 24984 + }, + { + "epoch": 7.6688152240638425, + "grad_norm": 0.19099490344524384, + "learning_rate": 1.3590790270681631e-05, + "loss": 1.7107, + "step": 24985 + }, + { + "epoch": 7.669122160834868, + "grad_norm": 0.1854488104581833, + "learning_rate": 1.3587383720902469e-05, + "loss": 1.7241, + "step": 24986 + }, + { + "epoch": 7.669429097605893, + "grad_norm": 0.18763068318367004, + "learning_rate": 1.3583977530974146e-05, + "loss": 1.7207, + "step": 24987 + }, + { + "epoch": 7.6697360343769185, + "grad_norm": 0.15608854591846466, + "learning_rate": 1.3580571700930295e-05, + "loss": 1.6835, + "step": 24988 + }, + { + "epoch": 7.670042971147944, + "grad_norm": 0.1587948501110077, + "learning_rate": 1.3577166230804584e-05, + "loss": 1.6801, + "step": 24989 + }, + { + "epoch": 7.670349907918968, + "grad_norm": 0.21106089651584625, + "learning_rate": 1.3573761120630668e-05, + "loss": 1.7411, + "step": 24990 + }, + { + "epoch": 7.670656844689994, + "grad_norm": 0.17361705005168915, + "learning_rate": 1.3570356370442188e-05, + "loss": 1.7123, + "step": 24991 + }, + { + "epoch": 7.670963781461019, + "grad_norm": 0.16272610425949097, + "learning_rate": 1.3566951980272802e-05, + "loss": 1.7002, + "step": 24992 + }, + { + "epoch": 7.671270718232044, + "grad_norm": 0.18787643313407898, + "learning_rate": 1.3563547950156147e-05, + "loss": 1.7364, + "step": 24993 + }, + { + "epoch": 7.67157765500307, + "grad_norm": 0.18257403373718262, + "learning_rate": 1.3560144280125869e-05, + "loss": 1.6783, + "step": 24994 + }, + { + "epoch": 7.671884591774095, + "grad_norm": 0.21298269927501678, + "learning_rate": 1.3556740970215608e-05, + "loss": 1.815, + "step": 24995 + }, + { + "epoch": 7.672191528545119, + "grad_norm": 0.1805877983570099, + "learning_rate": 1.3553338020458988e-05, + "loss": 1.719, + "step": 
24996 + }, + { + "epoch": 7.672498465316145, + "grad_norm": 0.210116446018219, + "learning_rate": 1.3549935430889643e-05, + "loss": 1.7603, + "step": 24997 + }, + { + "epoch": 7.67280540208717, + "grad_norm": 0.18893682956695557, + "learning_rate": 1.35465332015412e-05, + "loss": 1.6681, + "step": 24998 + }, + { + "epoch": 7.673112338858195, + "grad_norm": 0.17718489468097687, + "learning_rate": 1.354313133244729e-05, + "loss": 1.6799, + "step": 24999 + }, + { + "epoch": 7.67341927562922, + "grad_norm": 0.20092631876468658, + "learning_rate": 1.3539729823641517e-05, + "loss": 1.7273, + "step": 25000 + }, + { + "epoch": 7.673726212400245, + "grad_norm": 0.20800542831420898, + "learning_rate": 1.353632867515749e-05, + "loss": 1.7214, + "step": 25001 + }, + { + "epoch": 7.6740331491712706, + "grad_norm": 0.2119656354188919, + "learning_rate": 1.3532927887028861e-05, + "loss": 1.6701, + "step": 25002 + }, + { + "epoch": 7.674340085942296, + "grad_norm": 0.1645115315914154, + "learning_rate": 1.3529527459289188e-05, + "loss": 1.7199, + "step": 25003 + }, + { + "epoch": 7.674647022713321, + "grad_norm": 0.24434153735637665, + "learning_rate": 1.3526127391972116e-05, + "loss": 1.7295, + "step": 25004 + }, + { + "epoch": 7.6749539594843466, + "grad_norm": 0.20978261530399323, + "learning_rate": 1.3522727685111231e-05, + "loss": 1.8069, + "step": 25005 + }, + { + "epoch": 7.675260896255372, + "grad_norm": 0.19354932010173798, + "learning_rate": 1.3519328338740128e-05, + "loss": 1.7601, + "step": 25006 + }, + { + "epoch": 7.675567833026396, + "grad_norm": 0.19636447727680206, + "learning_rate": 1.3515929352892403e-05, + "loss": 1.7871, + "step": 25007 + }, + { + "epoch": 7.675874769797422, + "grad_norm": 0.18915504217147827, + "learning_rate": 1.3512530727601653e-05, + "loss": 1.6926, + "step": 25008 + }, + { + "epoch": 7.676181706568447, + "grad_norm": 0.18168985843658447, + "learning_rate": 1.3509132462901458e-05, + "loss": 1.7272, + "step": 25009 + }, + { + "epoch": 
7.676488643339472, + "grad_norm": 0.17246222496032715, + "learning_rate": 1.3505734558825406e-05, + "loss": 1.7186, + "step": 25010 + }, + { + "epoch": 7.676795580110497, + "grad_norm": 0.2694617211818695, + "learning_rate": 1.3502337015407074e-05, + "loss": 1.8334, + "step": 25011 + }, + { + "epoch": 7.677102516881522, + "grad_norm": 0.1549377590417862, + "learning_rate": 1.3498939832680035e-05, + "loss": 1.7003, + "step": 25012 + }, + { + "epoch": 7.6774094536525475, + "grad_norm": 0.1559179425239563, + "learning_rate": 1.349554301067787e-05, + "loss": 1.7028, + "step": 25013 + }, + { + "epoch": 7.677716390423573, + "grad_norm": 0.17349909245967865, + "learning_rate": 1.3492146549434149e-05, + "loss": 1.6749, + "step": 25014 + }, + { + "epoch": 7.678023327194598, + "grad_norm": 0.19697749614715576, + "learning_rate": 1.348875044898243e-05, + "loss": 1.8291, + "step": 25015 + }, + { + "epoch": 7.6783302639656235, + "grad_norm": 0.17260968685150146, + "learning_rate": 1.3485354709356279e-05, + "loss": 1.6686, + "step": 25016 + }, + { + "epoch": 7.678637200736648, + "grad_norm": 0.16892582178115845, + "learning_rate": 1.3481959330589255e-05, + "loss": 1.755, + "step": 25017 + }, + { + "epoch": 7.678944137507673, + "grad_norm": 0.17961645126342773, + "learning_rate": 1.3478564312714898e-05, + "loss": 1.6937, + "step": 25018 + }, + { + "epoch": 7.679251074278699, + "grad_norm": 0.20795513689517975, + "learning_rate": 1.34751696557668e-05, + "loss": 1.799, + "step": 25019 + }, + { + "epoch": 7.679558011049724, + "grad_norm": 0.16439545154571533, + "learning_rate": 1.3471775359778461e-05, + "loss": 1.6942, + "step": 25020 + }, + { + "epoch": 7.679864947820749, + "grad_norm": 0.19526144862174988, + "learning_rate": 1.3468381424783472e-05, + "loss": 1.7255, + "step": 25021 + }, + { + "epoch": 7.680171884591774, + "grad_norm": 0.18183457851409912, + "learning_rate": 1.3464987850815319e-05, + "loss": 1.7027, + "step": 25022 + }, + { + "epoch": 7.680478821362799, + 
"grad_norm": 0.18443404138088226, + "learning_rate": 1.3461594637907587e-05, + "loss": 1.6973, + "step": 25023 + }, + { + "epoch": 7.680785758133824, + "grad_norm": 0.18545331060886383, + "learning_rate": 1.3458201786093794e-05, + "loss": 1.7479, + "step": 25024 + }, + { + "epoch": 7.68109269490485, + "grad_norm": 0.18329958617687225, + "learning_rate": 1.3454809295407467e-05, + "loss": 1.7301, + "step": 25025 + }, + { + "epoch": 7.681399631675875, + "grad_norm": 0.19131959974765778, + "learning_rate": 1.3451417165882136e-05, + "loss": 1.7402, + "step": 25026 + }, + { + "epoch": 7.6817065684469, + "grad_norm": 0.1782912164926529, + "learning_rate": 1.3448025397551323e-05, + "loss": 1.6771, + "step": 25027 + }, + { + "epoch": 7.682013505217925, + "grad_norm": 0.1757265031337738, + "learning_rate": 1.3444633990448546e-05, + "loss": 1.7336, + "step": 25028 + }, + { + "epoch": 7.68232044198895, + "grad_norm": 0.16550128161907196, + "learning_rate": 1.3441242944607318e-05, + "loss": 1.6335, + "step": 25029 + }, + { + "epoch": 7.6826273787599755, + "grad_norm": 0.18069832026958466, + "learning_rate": 1.3437852260061162e-05, + "loss": 1.7172, + "step": 25030 + }, + { + "epoch": 7.682934315531001, + "grad_norm": 0.21195535361766815, + "learning_rate": 1.3434461936843573e-05, + "loss": 1.7248, + "step": 25031 + }, + { + "epoch": 7.683241252302026, + "grad_norm": 0.17209839820861816, + "learning_rate": 1.3431071974988068e-05, + "loss": 1.666, + "step": 25032 + }, + { + "epoch": 7.683548189073051, + "grad_norm": 0.20565249025821686, + "learning_rate": 1.342768237452814e-05, + "loss": 1.7839, + "step": 25033 + }, + { + "epoch": 7.683855125844076, + "grad_norm": 0.2549617290496826, + "learning_rate": 1.342429313549729e-05, + "loss": 1.714, + "step": 25034 + }, + { + "epoch": 7.684162062615101, + "grad_norm": 0.1980191171169281, + "learning_rate": 1.3420904257929001e-05, + "loss": 1.7267, + "step": 25035 + }, + { + "epoch": 7.684468999386127, + "grad_norm": 0.1763298362493515, + 
"learning_rate": 1.3417515741856806e-05, + "loss": 1.6754, + "step": 25036 + }, + { + "epoch": 7.684775936157152, + "grad_norm": 0.15831413865089417, + "learning_rate": 1.341412758731413e-05, + "loss": 1.6885, + "step": 25037 + }, + { + "epoch": 7.685082872928177, + "grad_norm": 0.15696564316749573, + "learning_rate": 1.341073979433452e-05, + "loss": 1.7032, + "step": 25038 + }, + { + "epoch": 7.685389809699202, + "grad_norm": 0.19193214178085327, + "learning_rate": 1.3407352362951392e-05, + "loss": 1.7708, + "step": 25039 + }, + { + "epoch": 7.685696746470227, + "grad_norm": 0.1886630803346634, + "learning_rate": 1.3403965293198273e-05, + "loss": 1.7323, + "step": 25040 + }, + { + "epoch": 7.686003683241252, + "grad_norm": 0.16137991845607758, + "learning_rate": 1.340057858510862e-05, + "loss": 1.703, + "step": 25041 + }, + { + "epoch": 7.686310620012278, + "grad_norm": 0.21111373603343964, + "learning_rate": 1.33971922387159e-05, + "loss": 1.7428, + "step": 25042 + }, + { + "epoch": 7.686617556783302, + "grad_norm": 0.20256482064723969, + "learning_rate": 1.3393806254053582e-05, + "loss": 1.7651, + "step": 25043 + }, + { + "epoch": 7.6869244935543275, + "grad_norm": 0.19125118851661682, + "learning_rate": 1.3390420631155121e-05, + "loss": 1.7253, + "step": 25044 + }, + { + "epoch": 7.687231430325353, + "grad_norm": 0.22446562349796295, + "learning_rate": 1.3387035370053985e-05, + "loss": 1.7363, + "step": 25045 + }, + { + "epoch": 7.687538367096378, + "grad_norm": 0.17356424033641815, + "learning_rate": 1.3383650470783621e-05, + "loss": 1.7384, + "step": 25046 + }, + { + "epoch": 7.6878453038674035, + "grad_norm": 0.27287909388542175, + "learning_rate": 1.3380265933377489e-05, + "loss": 1.6754, + "step": 25047 + }, + { + "epoch": 7.688152240638429, + "grad_norm": 0.14978452026844025, + "learning_rate": 1.3376881757869032e-05, + "loss": 1.6693, + "step": 25048 + }, + { + "epoch": 7.688459177409453, + "grad_norm": 0.1746874898672104, + "learning_rate": 
1.3373497944291691e-05, + "loss": 1.6878, + "step": 25049 + }, + { + "epoch": 7.688766114180479, + "grad_norm": 0.18032371997833252, + "learning_rate": 1.3370114492678915e-05, + "loss": 1.7153, + "step": 25050 + }, + { + "epoch": 7.689073050951504, + "grad_norm": 0.23111680150032043, + "learning_rate": 1.3366731403064131e-05, + "loss": 1.7132, + "step": 25051 + }, + { + "epoch": 7.689379987722529, + "grad_norm": 0.1587868630886078, + "learning_rate": 1.3363348675480768e-05, + "loss": 1.6692, + "step": 25052 + }, + { + "epoch": 7.689686924493555, + "grad_norm": 0.14336444437503815, + "learning_rate": 1.3359966309962301e-05, + "loss": 1.6648, + "step": 25053 + }, + { + "epoch": 7.689993861264579, + "grad_norm": 0.3048984408378601, + "learning_rate": 1.3356584306542086e-05, + "loss": 1.8109, + "step": 25054 + }, + { + "epoch": 7.690300798035604, + "grad_norm": 0.19389018416404724, + "learning_rate": 1.3353202665253617e-05, + "loss": 1.6725, + "step": 25055 + }, + { + "epoch": 7.69060773480663, + "grad_norm": 0.19246982038021088, + "learning_rate": 1.3349821386130246e-05, + "loss": 1.726, + "step": 25056 + }, + { + "epoch": 7.690914671577655, + "grad_norm": 0.19062727689743042, + "learning_rate": 1.3346440469205435e-05, + "loss": 1.7685, + "step": 25057 + }, + { + "epoch": 7.69122160834868, + "grad_norm": 0.16987577080726624, + "learning_rate": 1.3343059914512585e-05, + "loss": 1.7032, + "step": 25058 + }, + { + "epoch": 7.691528545119706, + "grad_norm": 0.17328599095344543, + "learning_rate": 1.3339679722085103e-05, + "loss": 1.7271, + "step": 25059 + }, + { + "epoch": 7.69183548189073, + "grad_norm": 0.2677443325519562, + "learning_rate": 1.3336299891956405e-05, + "loss": 1.8, + "step": 25060 + }, + { + "epoch": 7.6921424186617555, + "grad_norm": 0.18369975686073303, + "learning_rate": 1.333292042415985e-05, + "loss": 1.7483, + "step": 25061 + }, + { + "epoch": 7.692449355432781, + "grad_norm": 0.17269635200500488, + "learning_rate": 1.3329541318728883e-05, + "loss": 
1.7016, + "step": 25062 + }, + { + "epoch": 7.692756292203806, + "grad_norm": 0.17280563712120056, + "learning_rate": 1.3326162575696889e-05, + "loss": 1.742, + "step": 25063 + }, + { + "epoch": 7.6930632289748315, + "grad_norm": 0.2000025361776352, + "learning_rate": 1.3322784195097243e-05, + "loss": 1.6947, + "step": 25064 + }, + { + "epoch": 7.693370165745856, + "grad_norm": 0.17853626608848572, + "learning_rate": 1.3319406176963344e-05, + "loss": 1.7075, + "step": 25065 + }, + { + "epoch": 7.693677102516881, + "grad_norm": 0.18445543944835663, + "learning_rate": 1.3316028521328571e-05, + "loss": 1.7138, + "step": 25066 + }, + { + "epoch": 7.693984039287907, + "grad_norm": 0.1965894103050232, + "learning_rate": 1.3312651228226302e-05, + "loss": 1.6904, + "step": 25067 + }, + { + "epoch": 7.694290976058932, + "grad_norm": 0.1890837699174881, + "learning_rate": 1.3309274297689923e-05, + "loss": 1.7307, + "step": 25068 + }, + { + "epoch": 7.694597912829957, + "grad_norm": 0.2157326638698578, + "learning_rate": 1.3305897729752787e-05, + "loss": 1.7466, + "step": 25069 + }, + { + "epoch": 7.694904849600983, + "grad_norm": 0.19773493707180023, + "learning_rate": 1.3302521524448302e-05, + "loss": 1.7265, + "step": 25070 + }, + { + "epoch": 7.695211786372007, + "grad_norm": 0.16688357293605804, + "learning_rate": 1.3299145681809776e-05, + "loss": 1.7049, + "step": 25071 + }, + { + "epoch": 7.695518723143032, + "grad_norm": 0.24347764253616333, + "learning_rate": 1.3295770201870639e-05, + "loss": 1.7706, + "step": 25072 + }, + { + "epoch": 7.695825659914058, + "grad_norm": 0.16198144853115082, + "learning_rate": 1.3292395084664183e-05, + "loss": 1.6873, + "step": 25073 + }, + { + "epoch": 7.696132596685083, + "grad_norm": 0.17321841418743134, + "learning_rate": 1.3289020330223806e-05, + "loss": 1.7463, + "step": 25074 + }, + { + "epoch": 7.696439533456108, + "grad_norm": 0.2611647844314575, + "learning_rate": 1.3285645938582847e-05, + "loss": 1.811, + "step": 25075 + }, 
+ { + "epoch": 7.696746470227133, + "grad_norm": 0.18129383027553558, + "learning_rate": 1.3282271909774657e-05, + "loss": 1.7257, + "step": 25076 + }, + { + "epoch": 7.697053406998158, + "grad_norm": 0.19985437393188477, + "learning_rate": 1.3278898243832588e-05, + "loss": 1.7311, + "step": 25077 + }, + { + "epoch": 7.6973603437691835, + "grad_norm": 0.21517722308635712, + "learning_rate": 1.3275524940789941e-05, + "loss": 1.7582, + "step": 25078 + }, + { + "epoch": 7.697667280540209, + "grad_norm": 0.2302769422531128, + "learning_rate": 1.32721520006801e-05, + "loss": 1.7192, + "step": 25079 + }, + { + "epoch": 7.697974217311234, + "grad_norm": 0.18356913328170776, + "learning_rate": 1.3268779423536375e-05, + "loss": 1.6916, + "step": 25080 + }, + { + "epoch": 7.6982811540822595, + "grad_norm": 0.19134142994880676, + "learning_rate": 1.3265407209392105e-05, + "loss": 1.7309, + "step": 25081 + }, + { + "epoch": 7.698588090853284, + "grad_norm": 0.17634150385856628, + "learning_rate": 1.3262035358280605e-05, + "loss": 1.7537, + "step": 25082 + }, + { + "epoch": 7.698895027624309, + "grad_norm": 0.1921558827161789, + "learning_rate": 1.325866387023521e-05, + "loss": 1.7102, + "step": 25083 + }, + { + "epoch": 7.699201964395335, + "grad_norm": 0.15972480177879333, + "learning_rate": 1.3255292745289233e-05, + "loss": 1.6759, + "step": 25084 + }, + { + "epoch": 7.69950890116636, + "grad_norm": 0.15172120928764343, + "learning_rate": 1.325192198347599e-05, + "loss": 1.6766, + "step": 25085 + }, + { + "epoch": 7.699815837937384, + "grad_norm": 0.17827558517456055, + "learning_rate": 1.3248551584828777e-05, + "loss": 1.7421, + "step": 25086 + }, + { + "epoch": 7.70012277470841, + "grad_norm": 0.1675274819135666, + "learning_rate": 1.3245181549380948e-05, + "loss": 1.701, + "step": 25087 + }, + { + "epoch": 7.700429711479435, + "grad_norm": 0.17937950789928436, + "learning_rate": 1.3241811877165744e-05, + "loss": 1.7284, + "step": 25088 + }, + { + "epoch": 
7.7007366482504604, + "grad_norm": 0.16373637318611145, + "learning_rate": 1.3238442568216535e-05, + "loss": 1.6834, + "step": 25089 + }, + { + "epoch": 7.701043585021486, + "grad_norm": 0.16055652499198914, + "learning_rate": 1.3235073622566552e-05, + "loss": 1.7087, + "step": 25090 + }, + { + "epoch": 7.701350521792511, + "grad_norm": 0.15083225071430206, + "learning_rate": 1.3231705040249131e-05, + "loss": 1.7313, + "step": 25091 + }, + { + "epoch": 7.701657458563536, + "grad_norm": 0.21110820770263672, + "learning_rate": 1.322833682129756e-05, + "loss": 1.6758, + "step": 25092 + }, + { + "epoch": 7.701964395334561, + "grad_norm": 0.18439972400665283, + "learning_rate": 1.322496896574511e-05, + "loss": 1.737, + "step": 25093 + }, + { + "epoch": 7.702271332105586, + "grad_norm": 0.18655124306678772, + "learning_rate": 1.322160147362509e-05, + "loss": 1.7268, + "step": 25094 + }, + { + "epoch": 7.702578268876612, + "grad_norm": 0.17620640993118286, + "learning_rate": 1.3218234344970725e-05, + "loss": 1.6829, + "step": 25095 + }, + { + "epoch": 7.702885205647637, + "grad_norm": 0.19085893034934998, + "learning_rate": 1.3214867579815343e-05, + "loss": 1.7382, + "step": 25096 + }, + { + "epoch": 7.703192142418661, + "grad_norm": 0.2206689864397049, + "learning_rate": 1.3211501178192203e-05, + "loss": 1.7666, + "step": 25097 + }, + { + "epoch": 7.703499079189687, + "grad_norm": 0.2047509402036667, + "learning_rate": 1.320813514013457e-05, + "loss": 1.7209, + "step": 25098 + }, + { + "epoch": 7.703806015960712, + "grad_norm": 0.22249147295951843, + "learning_rate": 1.3204769465675709e-05, + "loss": 1.8067, + "step": 25099 + }, + { + "epoch": 7.704112952731737, + "grad_norm": 0.16225707530975342, + "learning_rate": 1.3201404154848885e-05, + "loss": 1.6715, + "step": 25100 + }, + { + "epoch": 7.704419889502763, + "grad_norm": 0.19165070354938507, + "learning_rate": 1.3198039207687352e-05, + "loss": 1.7233, + "step": 25101 + }, + { + "epoch": 7.704726826273788, + 
"grad_norm": 0.18720564246177673, + "learning_rate": 1.3194674624224368e-05, + "loss": 1.7129, + "step": 25102 + }, + { + "epoch": 7.7050337630448125, + "grad_norm": 0.16703814268112183, + "learning_rate": 1.3191310404493163e-05, + "loss": 1.7314, + "step": 25103 + }, + { + "epoch": 7.705340699815838, + "grad_norm": 0.20206168293952942, + "learning_rate": 1.3187946548527036e-05, + "loss": 1.7278, + "step": 25104 + }, + { + "epoch": 7.705647636586863, + "grad_norm": 0.1774030476808548, + "learning_rate": 1.3184583056359163e-05, + "loss": 1.6986, + "step": 25105 + }, + { + "epoch": 7.7059545733578885, + "grad_norm": 0.1729336827993393, + "learning_rate": 1.3181219928022853e-05, + "loss": 1.7251, + "step": 25106 + }, + { + "epoch": 7.706261510128914, + "grad_norm": 0.23351258039474487, + "learning_rate": 1.3177857163551276e-05, + "loss": 1.7311, + "step": 25107 + }, + { + "epoch": 7.706568446899938, + "grad_norm": 0.2041054517030716, + "learning_rate": 1.3174494762977713e-05, + "loss": 1.7122, + "step": 25108 + }, + { + "epoch": 7.706875383670964, + "grad_norm": 0.178013876080513, + "learning_rate": 1.3171132726335373e-05, + "loss": 1.7255, + "step": 25109 + }, + { + "epoch": 7.707182320441989, + "grad_norm": 0.19265221059322357, + "learning_rate": 1.3167771053657491e-05, + "loss": 1.6747, + "step": 25110 + }, + { + "epoch": 7.707489257213014, + "grad_norm": 0.18968601524829865, + "learning_rate": 1.3164409744977297e-05, + "loss": 1.71, + "step": 25111 + }, + { + "epoch": 7.70779619398404, + "grad_norm": 0.17041562497615814, + "learning_rate": 1.3161048800327963e-05, + "loss": 1.7202, + "step": 25112 + }, + { + "epoch": 7.708103130755065, + "grad_norm": 0.20094618201255798, + "learning_rate": 1.3157688219742754e-05, + "loss": 1.7375, + "step": 25113 + }, + { + "epoch": 7.708410067526089, + "grad_norm": 0.14012686908245087, + "learning_rate": 1.3154328003254862e-05, + "loss": 1.6426, + "step": 25114 + }, + { + "epoch": 7.708717004297115, + "grad_norm": 
0.18826791644096375, + "learning_rate": 1.3150968150897497e-05, + "loss": 1.7114, + "step": 25115 + }, + { + "epoch": 7.70902394106814, + "grad_norm": 0.15521864593029022, + "learning_rate": 1.3147608662703864e-05, + "loss": 1.7031, + "step": 25116 + }, + { + "epoch": 7.709330877839165, + "grad_norm": 0.19424815475940704, + "learning_rate": 1.314424953870716e-05, + "loss": 1.6815, + "step": 25117 + }, + { + "epoch": 7.70963781461019, + "grad_norm": 0.30089494585990906, + "learning_rate": 1.3140890778940584e-05, + "loss": 1.7444, + "step": 25118 + }, + { + "epoch": 7.709944751381215, + "grad_norm": 0.1784239560365677, + "learning_rate": 1.3137532383437334e-05, + "loss": 1.6659, + "step": 25119 + }, + { + "epoch": 7.7102516881522405, + "grad_norm": 0.18670935928821564, + "learning_rate": 1.3134174352230571e-05, + "loss": 1.7007, + "step": 25120 + }, + { + "epoch": 7.710558624923266, + "grad_norm": 0.21140475571155548, + "learning_rate": 1.3130816685353541e-05, + "loss": 1.7716, + "step": 25121 + }, + { + "epoch": 7.710865561694291, + "grad_norm": 0.20546187460422516, + "learning_rate": 1.3127459382839363e-05, + "loss": 1.6434, + "step": 25122 + }, + { + "epoch": 7.7111724984653165, + "grad_norm": 0.15188902616500854, + "learning_rate": 1.312410244472127e-05, + "loss": 1.6843, + "step": 25123 + }, + { + "epoch": 7.711479435236341, + "grad_norm": 0.2020019143819809, + "learning_rate": 1.3120745871032375e-05, + "loss": 1.6846, + "step": 25124 + }, + { + "epoch": 7.711786372007366, + "grad_norm": 0.19839881360530853, + "learning_rate": 1.3117389661805907e-05, + "loss": 1.7026, + "step": 25125 + }, + { + "epoch": 7.712093308778392, + "grad_norm": 0.19400818645954132, + "learning_rate": 1.311403381707501e-05, + "loss": 1.705, + "step": 25126 + }, + { + "epoch": 7.712400245549417, + "grad_norm": 0.21366959810256958, + "learning_rate": 1.311067833687285e-05, + "loss": 1.7184, + "step": 25127 + }, + { + "epoch": 7.712707182320442, + "grad_norm": 0.17402227222919464, + 
"learning_rate": 1.3107323221232604e-05, + "loss": 1.6613, + "step": 25128 + }, + { + "epoch": 7.713014119091467, + "grad_norm": 0.24356254935264587, + "learning_rate": 1.3103968470187384e-05, + "loss": 1.7343, + "step": 25129 + }, + { + "epoch": 7.713321055862492, + "grad_norm": 0.18612951040267944, + "learning_rate": 1.3100614083770386e-05, + "loss": 1.7298, + "step": 25130 + }, + { + "epoch": 7.713627992633517, + "grad_norm": 0.27073535323143005, + "learning_rate": 1.3097260062014743e-05, + "loss": 1.7554, + "step": 25131 + }, + { + "epoch": 7.713934929404543, + "grad_norm": 0.1498921662569046, + "learning_rate": 1.309390640495361e-05, + "loss": 1.6506, + "step": 25132 + }, + { + "epoch": 7.714241866175568, + "grad_norm": 0.2159748524427414, + "learning_rate": 1.309055311262013e-05, + "loss": 1.6549, + "step": 25133 + }, + { + "epoch": 7.714548802946593, + "grad_norm": 0.2060365229845047, + "learning_rate": 1.3087200185047433e-05, + "loss": 1.7224, + "step": 25134 + }, + { + "epoch": 7.714855739717618, + "grad_norm": 0.22525639832019806, + "learning_rate": 1.3083847622268659e-05, + "loss": 1.7508, + "step": 25135 + }, + { + "epoch": 7.715162676488643, + "grad_norm": 0.20023567974567413, + "learning_rate": 1.3080495424316936e-05, + "loss": 1.7277, + "step": 25136 + }, + { + "epoch": 7.7154696132596685, + "grad_norm": 0.19702760875225067, + "learning_rate": 1.3077143591225389e-05, + "loss": 1.7291, + "step": 25137 + }, + { + "epoch": 7.715776550030694, + "grad_norm": 0.1713123917579651, + "learning_rate": 1.3073792123027173e-05, + "loss": 1.689, + "step": 25138 + }, + { + "epoch": 7.716083486801719, + "grad_norm": 0.17696695029735565, + "learning_rate": 1.3070441019755358e-05, + "loss": 1.6816, + "step": 25139 + }, + { + "epoch": 7.716390423572744, + "grad_norm": 0.1802004724740982, + "learning_rate": 1.3067090281443122e-05, + "loss": 1.754, + "step": 25140 + }, + { + "epoch": 7.716697360343769, + "grad_norm": 0.1829070895910263, + "learning_rate": 
1.3063739908123518e-05, + "loss": 1.7389, + "step": 25141 + }, + { + "epoch": 7.717004297114794, + "grad_norm": 0.16842049360275269, + "learning_rate": 1.30603898998297e-05, + "loss": 1.7257, + "step": 25142 + }, + { + "epoch": 7.71731123388582, + "grad_norm": 0.18215791881084442, + "learning_rate": 1.305704025659476e-05, + "loss": 1.6765, + "step": 25143 + }, + { + "epoch": 7.717618170656845, + "grad_norm": 0.16992273926734924, + "learning_rate": 1.3053690978451799e-05, + "loss": 1.6729, + "step": 25144 + }, + { + "epoch": 7.71792510742787, + "grad_norm": 0.1847899854183197, + "learning_rate": 1.3050342065433935e-05, + "loss": 1.6972, + "step": 25145 + }, + { + "epoch": 7.718232044198895, + "grad_norm": 0.18730273842811584, + "learning_rate": 1.3046993517574219e-05, + "loss": 1.6996, + "step": 25146 + }, + { + "epoch": 7.71853898096992, + "grad_norm": 0.1695355772972107, + "learning_rate": 1.304364533490578e-05, + "loss": 1.7581, + "step": 25147 + }, + { + "epoch": 7.718845917740945, + "grad_norm": 0.17106328904628754, + "learning_rate": 1.3040297517461709e-05, + "loss": 1.6479, + "step": 25148 + }, + { + "epoch": 7.719152854511971, + "grad_norm": 0.1726374626159668, + "learning_rate": 1.3036950065275072e-05, + "loss": 1.7078, + "step": 25149 + }, + { + "epoch": 7.719459791282996, + "grad_norm": 0.21725010871887207, + "learning_rate": 1.3033602978378962e-05, + "loss": 1.8195, + "step": 25150 + }, + { + "epoch": 7.7197667280540205, + "grad_norm": 0.24786241352558136, + "learning_rate": 1.3030256256806455e-05, + "loss": 1.7439, + "step": 25151 + }, + { + "epoch": 7.720073664825046, + "grad_norm": 0.16550323367118835, + "learning_rate": 1.3026909900590622e-05, + "loss": 1.7267, + "step": 25152 + }, + { + "epoch": 7.720380601596071, + "grad_norm": 0.1833605021238327, + "learning_rate": 1.3023563909764542e-05, + "loss": 1.6675, + "step": 25153 + }, + { + "epoch": 7.7206875383670965, + "grad_norm": 0.16360491514205933, + "learning_rate": 1.3020218284361268e-05, + 
"loss": 1.684, + "step": 25154 + }, + { + "epoch": 7.720994475138122, + "grad_norm": 0.20423299074172974, + "learning_rate": 1.3016873024413878e-05, + "loss": 1.708, + "step": 25155 + }, + { + "epoch": 7.721301411909147, + "grad_norm": 0.1743123084306717, + "learning_rate": 1.301352812995541e-05, + "loss": 1.7497, + "step": 25156 + }, + { + "epoch": 7.721608348680172, + "grad_norm": 0.237883523106575, + "learning_rate": 1.301018360101896e-05, + "loss": 1.6859, + "step": 25157 + }, + { + "epoch": 7.721915285451197, + "grad_norm": 0.17953886091709137, + "learning_rate": 1.300683943763753e-05, + "loss": 1.6948, + "step": 25158 + }, + { + "epoch": 7.722222222222222, + "grad_norm": 0.19036953151226044, + "learning_rate": 1.3003495639844209e-05, + "loss": 1.7207, + "step": 25159 + }, + { + "epoch": 7.722529158993248, + "grad_norm": 0.17385275661945343, + "learning_rate": 1.3000152207672028e-05, + "loss": 1.7088, + "step": 25160 + }, + { + "epoch": 7.722836095764272, + "grad_norm": 0.1848379373550415, + "learning_rate": 1.2996809141154031e-05, + "loss": 1.7351, + "step": 25161 + }, + { + "epoch": 7.723143032535297, + "grad_norm": 0.1964390128850937, + "learning_rate": 1.2993466440323271e-05, + "loss": 1.7243, + "step": 25162 + }, + { + "epoch": 7.723449969306323, + "grad_norm": 0.23729266226291656, + "learning_rate": 1.299012410521273e-05, + "loss": 1.7588, + "step": 25163 + }, + { + "epoch": 7.723756906077348, + "grad_norm": 0.16980098187923431, + "learning_rate": 1.2986782135855496e-05, + "loss": 1.7092, + "step": 25164 + }, + { + "epoch": 7.724063842848373, + "grad_norm": 0.1993054747581482, + "learning_rate": 1.2983440532284568e-05, + "loss": 1.7245, + "step": 25165 + }, + { + "epoch": 7.724370779619399, + "grad_norm": 0.18817138671875, + "learning_rate": 1.2980099294532982e-05, + "loss": 1.7019, + "step": 25166 + }, + { + "epoch": 7.724677716390423, + "grad_norm": 0.20675966143608093, + "learning_rate": 1.297675842263375e-05, + "loss": 1.6949, + "step": 25167 + }, + 
{ + "epoch": 7.7249846531614486, + "grad_norm": 0.21214626729488373, + "learning_rate": 1.2973417916619895e-05, + "loss": 1.7056, + "step": 25168 + }, + { + "epoch": 7.725291589932474, + "grad_norm": 0.1676976978778839, + "learning_rate": 1.2970077776524426e-05, + "loss": 1.7183, + "step": 25169 + }, + { + "epoch": 7.725598526703499, + "grad_norm": 0.2368413507938385, + "learning_rate": 1.2966738002380347e-05, + "loss": 1.7868, + "step": 25170 + }, + { + "epoch": 7.725905463474525, + "grad_norm": 0.22054153680801392, + "learning_rate": 1.2963398594220672e-05, + "loss": 1.7214, + "step": 25171 + }, + { + "epoch": 7.726212400245549, + "grad_norm": 0.20026426017284393, + "learning_rate": 1.2960059552078402e-05, + "loss": 1.7703, + "step": 25172 + }, + { + "epoch": 7.726519337016574, + "grad_norm": 0.1900193840265274, + "learning_rate": 1.2956720875986516e-05, + "loss": 1.7513, + "step": 25173 + }, + { + "epoch": 7.7268262737876, + "grad_norm": 0.17151880264282227, + "learning_rate": 1.2953382565978057e-05, + "loss": 1.7382, + "step": 25174 + }, + { + "epoch": 7.727133210558625, + "grad_norm": 0.2654723525047302, + "learning_rate": 1.2950044622085955e-05, + "loss": 1.7526, + "step": 25175 + }, + { + "epoch": 7.72744014732965, + "grad_norm": 0.19927532970905304, + "learning_rate": 1.2946707044343259e-05, + "loss": 1.7208, + "step": 25176 + }, + { + "epoch": 7.727747084100676, + "grad_norm": 0.3037160038948059, + "learning_rate": 1.2943369832782887e-05, + "loss": 1.8081, + "step": 25177 + }, + { + "epoch": 7.7280540208717, + "grad_norm": 0.20067723095417023, + "learning_rate": 1.2940032987437873e-05, + "loss": 1.685, + "step": 25178 + }, + { + "epoch": 7.7283609576427255, + "grad_norm": 0.16820429265499115, + "learning_rate": 1.2936696508341189e-05, + "loss": 1.7328, + "step": 25179 + }, + { + "epoch": 7.728667894413751, + "grad_norm": 0.15474672615528107, + "learning_rate": 1.2933360395525763e-05, + "loss": 1.708, + "step": 25180 + }, + { + "epoch": 7.728974831184776, + 
"grad_norm": 0.17825615406036377, + "learning_rate": 1.2930024649024609e-05, + "loss": 1.7416, + "step": 25181 + }, + { + "epoch": 7.7292817679558015, + "grad_norm": 0.20498061180114746, + "learning_rate": 1.292668926887068e-05, + "loss": 1.736, + "step": 25182 + }, + { + "epoch": 7.729588704726826, + "grad_norm": 0.22965869307518005, + "learning_rate": 1.2923354255096937e-05, + "loss": 1.7167, + "step": 25183 + }, + { + "epoch": 7.729895641497851, + "grad_norm": 0.1687164008617401, + "learning_rate": 1.2920019607736338e-05, + "loss": 1.6988, + "step": 25184 + }, + { + "epoch": 7.730202578268877, + "grad_norm": 0.18255390226840973, + "learning_rate": 1.2916685326821842e-05, + "loss": 1.6891, + "step": 25185 + }, + { + "epoch": 7.730509515039902, + "grad_norm": 0.1519697606563568, + "learning_rate": 1.2913351412386393e-05, + "loss": 1.6553, + "step": 25186 + }, + { + "epoch": 7.730816451810927, + "grad_norm": 0.19137845933437347, + "learning_rate": 1.2910017864462942e-05, + "loss": 1.7246, + "step": 25187 + }, + { + "epoch": 7.731123388581953, + "grad_norm": 0.19998718798160553, + "learning_rate": 1.2906684683084436e-05, + "loss": 1.7324, + "step": 25188 + }, + { + "epoch": 7.731430325352977, + "grad_norm": 0.18066956102848053, + "learning_rate": 1.2903351868283808e-05, + "loss": 1.7299, + "step": 25189 + }, + { + "epoch": 7.731737262124002, + "grad_norm": 0.18489640951156616, + "learning_rate": 1.290001942009399e-05, + "loss": 1.7249, + "step": 25190 + }, + { + "epoch": 7.732044198895028, + "grad_norm": 0.14994095265865326, + "learning_rate": 1.2896687338547958e-05, + "loss": 1.6466, + "step": 25191 + }, + { + "epoch": 7.732351135666053, + "grad_norm": 0.19937917590141296, + "learning_rate": 1.2893355623678571e-05, + "loss": 1.7298, + "step": 25192 + }, + { + "epoch": 7.7326580724370775, + "grad_norm": 0.1435725837945938, + "learning_rate": 1.2890024275518826e-05, + "loss": 1.7384, + "step": 25193 + }, + { + "epoch": 7.732965009208103, + "grad_norm": 
0.23283594846725464, + "learning_rate": 1.2886693294101582e-05, + "loss": 1.7765, + "step": 25194 + }, + { + "epoch": 7.733271945979128, + "grad_norm": 0.15489891171455383, + "learning_rate": 1.2883362679459803e-05, + "loss": 1.6911, + "step": 25195 + }, + { + "epoch": 7.7335788827501535, + "grad_norm": 0.17880970239639282, + "learning_rate": 1.2880032431626404e-05, + "loss": 1.6557, + "step": 25196 + }, + { + "epoch": 7.733885819521179, + "grad_norm": 0.1717783808708191, + "learning_rate": 1.287670255063425e-05, + "loss": 1.7112, + "step": 25197 + }, + { + "epoch": 7.734192756292204, + "grad_norm": 0.17371709644794464, + "learning_rate": 1.2873373036516313e-05, + "loss": 1.7591, + "step": 25198 + }, + { + "epoch": 7.734499693063229, + "grad_norm": 0.15894445776939392, + "learning_rate": 1.2870043889305432e-05, + "loss": 1.6615, + "step": 25199 + }, + { + "epoch": 7.734806629834254, + "grad_norm": 0.17047199606895447, + "learning_rate": 1.2866715109034554e-05, + "loss": 1.7376, + "step": 25200 + }, + { + "epoch": 7.735113566605279, + "grad_norm": 0.17434459924697876, + "learning_rate": 1.2863386695736562e-05, + "loss": 1.6871, + "step": 25201 + }, + { + "epoch": 7.735420503376305, + "grad_norm": 0.18515460193157196, + "learning_rate": 1.2860058649444351e-05, + "loss": 1.7475, + "step": 25202 + }, + { + "epoch": 7.73572744014733, + "grad_norm": 0.1510036140680313, + "learning_rate": 1.2856730970190806e-05, + "loss": 1.7101, + "step": 25203 + }, + { + "epoch": 7.736034376918354, + "grad_norm": 0.1886061728000641, + "learning_rate": 1.2853403658008817e-05, + "loss": 1.7253, + "step": 25204 + }, + { + "epoch": 7.73634131368938, + "grad_norm": 0.15830372273921967, + "learning_rate": 1.2850076712931269e-05, + "loss": 1.7024, + "step": 25205 + }, + { + "epoch": 7.736648250460405, + "grad_norm": 0.3030432462692261, + "learning_rate": 1.2846750134991031e-05, + "loss": 1.7702, + "step": 25206 + }, + { + "epoch": 7.73695518723143, + "grad_norm": 0.1946970373392105, + 
"learning_rate": 1.2843423924220977e-05, + "loss": 1.7199, + "step": 25207 + }, + { + "epoch": 7.737262124002456, + "grad_norm": 0.19842801988124847, + "learning_rate": 1.2840098080654012e-05, + "loss": 1.7435, + "step": 25208 + }, + { + "epoch": 7.737569060773481, + "grad_norm": 0.17269715666770935, + "learning_rate": 1.2836772604322945e-05, + "loss": 1.6837, + "step": 25209 + }, + { + "epoch": 7.7378759975445055, + "grad_norm": 0.14366893470287323, + "learning_rate": 1.2833447495260703e-05, + "loss": 1.6453, + "step": 25210 + }, + { + "epoch": 7.738182934315531, + "grad_norm": 0.2189856618642807, + "learning_rate": 1.283012275350009e-05, + "loss": 1.7341, + "step": 25211 + }, + { + "epoch": 7.738489871086556, + "grad_norm": 0.14334678649902344, + "learning_rate": 1.2826798379074007e-05, + "loss": 1.6505, + "step": 25212 + }, + { + "epoch": 7.7387968078575815, + "grad_norm": 0.2020469605922699, + "learning_rate": 1.2823474372015304e-05, + "loss": 1.7915, + "step": 25213 + }, + { + "epoch": 7.739103744628607, + "grad_norm": 0.14702250063419342, + "learning_rate": 1.2820150732356783e-05, + "loss": 1.6682, + "step": 25214 + }, + { + "epoch": 7.739410681399631, + "grad_norm": 0.2310563623905182, + "learning_rate": 1.281682746013136e-05, + "loss": 1.7447, + "step": 25215 + }, + { + "epoch": 7.739717618170657, + "grad_norm": 0.16534216701984406, + "learning_rate": 1.2813504555371808e-05, + "loss": 1.6641, + "step": 25216 + }, + { + "epoch": 7.740024554941682, + "grad_norm": 0.1390565037727356, + "learning_rate": 1.2810182018111012e-05, + "loss": 1.6912, + "step": 25217 + }, + { + "epoch": 7.740331491712707, + "grad_norm": 0.16568928956985474, + "learning_rate": 1.2806859848381797e-05, + "loss": 1.7375, + "step": 25218 + }, + { + "epoch": 7.740638428483733, + "grad_norm": 0.18870174884796143, + "learning_rate": 1.2803538046216995e-05, + "loss": 1.7158, + "step": 25219 + }, + { + "epoch": 7.740945365254758, + "grad_norm": 0.18347607553005219, + "learning_rate": 
1.2800216611649429e-05, + "loss": 1.7766, + "step": 25220 + }, + { + "epoch": 7.741252302025782, + "grad_norm": 0.21285377442836761, + "learning_rate": 1.2796895544711929e-05, + "loss": 1.6876, + "step": 25221 + }, + { + "epoch": 7.741559238796808, + "grad_norm": 0.26524603366851807, + "learning_rate": 1.2793574845437311e-05, + "loss": 1.6679, + "step": 25222 + }, + { + "epoch": 7.741866175567833, + "grad_norm": 0.1671147346496582, + "learning_rate": 1.2790254513858397e-05, + "loss": 1.6853, + "step": 25223 + }, + { + "epoch": 7.742173112338858, + "grad_norm": 0.21713866293430328, + "learning_rate": 1.2786934550007979e-05, + "loss": 1.8124, + "step": 25224 + }, + { + "epoch": 7.742480049109884, + "grad_norm": 0.17161360383033752, + "learning_rate": 1.2783614953918916e-05, + "loss": 1.6862, + "step": 25225 + }, + { + "epoch": 7.742786985880908, + "grad_norm": 0.1513087898492813, + "learning_rate": 1.2780295725623947e-05, + "loss": 1.6644, + "step": 25226 + }, + { + "epoch": 7.7430939226519335, + "grad_norm": 0.13013005256652832, + "learning_rate": 1.2776976865155948e-05, + "loss": 1.6612, + "step": 25227 + }, + { + "epoch": 7.743400859422959, + "grad_norm": 0.15204063057899475, + "learning_rate": 1.2773658372547648e-05, + "loss": 1.6391, + "step": 25228 + }, + { + "epoch": 7.743707796193984, + "grad_norm": 0.15421196818351746, + "learning_rate": 1.2770340247831891e-05, + "loss": 1.7005, + "step": 25229 + }, + { + "epoch": 7.7440147329650095, + "grad_norm": 0.14045587182044983, + "learning_rate": 1.276702249104147e-05, + "loss": 1.6448, + "step": 25230 + }, + { + "epoch": 7.744321669736035, + "grad_norm": 0.17244049906730652, + "learning_rate": 1.2763705102209123e-05, + "loss": 1.6737, + "step": 25231 + }, + { + "epoch": 7.744628606507059, + "grad_norm": 0.16891124844551086, + "learning_rate": 1.2760388081367697e-05, + "loss": 1.6625, + "step": 25232 + }, + { + "epoch": 7.744935543278085, + "grad_norm": 0.18271134793758392, + "learning_rate": 1.275707142854991e-05, + 
"loss": 1.6963, + "step": 25233 + }, + { + "epoch": 7.74524248004911, + "grad_norm": 0.18582625687122345, + "learning_rate": 1.2753755143788593e-05, + "loss": 1.6731, + "step": 25234 + }, + { + "epoch": 7.745549416820135, + "grad_norm": 0.17610707879066467, + "learning_rate": 1.2750439227116495e-05, + "loss": 1.6976, + "step": 25235 + }, + { + "epoch": 7.74585635359116, + "grad_norm": 0.20406337082386017, + "learning_rate": 1.2747123678566391e-05, + "loss": 1.7287, + "step": 25236 + }, + { + "epoch": 7.746163290362185, + "grad_norm": 0.16879913210868835, + "learning_rate": 1.2743808498171046e-05, + "loss": 1.6594, + "step": 25237 + }, + { + "epoch": 7.74647022713321, + "grad_norm": 0.1405191272497177, + "learning_rate": 1.2740493685963217e-05, + "loss": 1.6565, + "step": 25238 + }, + { + "epoch": 7.746777163904236, + "grad_norm": 0.1460784375667572, + "learning_rate": 1.2737179241975671e-05, + "loss": 1.6336, + "step": 25239 + }, + { + "epoch": 7.747084100675261, + "grad_norm": 0.16206084191799164, + "learning_rate": 1.273386516624116e-05, + "loss": 1.7501, + "step": 25240 + }, + { + "epoch": 7.747391037446286, + "grad_norm": 0.17040394246578217, + "learning_rate": 1.2730551458792422e-05, + "loss": 1.7532, + "step": 25241 + }, + { + "epoch": 7.747697974217311, + "grad_norm": 0.15487439930438995, + "learning_rate": 1.2727238119662243e-05, + "loss": 1.6757, + "step": 25242 + }, + { + "epoch": 7.748004910988336, + "grad_norm": 0.139495387673378, + "learning_rate": 1.272392514888332e-05, + "loss": 1.6431, + "step": 25243 + }, + { + "epoch": 7.7483118477593615, + "grad_norm": 0.16329489648342133, + "learning_rate": 1.2720612546488447e-05, + "loss": 1.7353, + "step": 25244 + }, + { + "epoch": 7.748618784530387, + "grad_norm": 0.14997398853302002, + "learning_rate": 1.27173003125103e-05, + "loss": 1.6977, + "step": 25245 + }, + { + "epoch": 7.748925721301412, + "grad_norm": 0.2005717009305954, + "learning_rate": 1.2713988446981656e-05, + "loss": 1.757, + "step": 25246 + 
}, + { + "epoch": 7.749232658072437, + "grad_norm": 0.2027040272951126, + "learning_rate": 1.2710676949935246e-05, + "loss": 1.7506, + "step": 25247 + }, + { + "epoch": 7.749539594843462, + "grad_norm": 0.18176981806755066, + "learning_rate": 1.2707365821403755e-05, + "loss": 1.7132, + "step": 25248 + }, + { + "epoch": 7.749846531614487, + "grad_norm": 0.18690772354602814, + "learning_rate": 1.2704055061419961e-05, + "loss": 1.7725, + "step": 25249 + }, + { + "epoch": 7.750153468385513, + "grad_norm": 0.18360945582389832, + "learning_rate": 1.270074467001653e-05, + "loss": 1.6779, + "step": 25250 + }, + { + "epoch": 7.750460405156538, + "grad_norm": 0.18498149514198303, + "learning_rate": 1.269743464722621e-05, + "loss": 1.7105, + "step": 25251 + }, + { + "epoch": 7.750767341927563, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.2694124993081707e-05, + "loss": 1.7273, + "step": 25252 + }, + { + "epoch": 7.751074278698588, + "grad_norm": 0.17312094569206238, + "learning_rate": 1.2690815707615727e-05, + "loss": 1.7532, + "step": 25253 + }, + { + "epoch": 7.751381215469613, + "grad_norm": 0.18758632242679596, + "learning_rate": 1.2687506790860976e-05, + "loss": 1.7394, + "step": 25254 + }, + { + "epoch": 7.7516881522406385, + "grad_norm": 0.1642044633626938, + "learning_rate": 1.2684198242850149e-05, + "loss": 1.6699, + "step": 25255 + }, + { + "epoch": 7.751995089011664, + "grad_norm": 0.34566664695739746, + "learning_rate": 1.2680890063615947e-05, + "loss": 1.7048, + "step": 25256 + }, + { + "epoch": 7.752302025782689, + "grad_norm": 0.15046556293964386, + "learning_rate": 1.2677582253191066e-05, + "loss": 1.659, + "step": 25257 + }, + { + "epoch": 7.752608962553714, + "grad_norm": 0.1504966914653778, + "learning_rate": 1.2674274811608171e-05, + "loss": 1.6841, + "step": 25258 + }, + { + "epoch": 7.752915899324739, + "grad_norm": 0.2226656973361969, + "learning_rate": 1.2670967738900009e-05, + "loss": 1.7139, + "step": 25259 + }, + { + "epoch": 
7.753222836095764, + "grad_norm": 0.18797673285007477, + "learning_rate": 1.2667661035099188e-05, + "loss": 1.7726, + "step": 25260 + }, + { + "epoch": 7.75352977286679, + "grad_norm": 0.15428531169891357, + "learning_rate": 1.266435470023845e-05, + "loss": 1.6831, + "step": 25261 + }, + { + "epoch": 7.753836709637815, + "grad_norm": 0.20027057826519012, + "learning_rate": 1.2661048734350412e-05, + "loss": 1.741, + "step": 25262 + }, + { + "epoch": 7.75414364640884, + "grad_norm": 0.14779487252235413, + "learning_rate": 1.2657743137467793e-05, + "loss": 1.6974, + "step": 25263 + }, + { + "epoch": 7.754450583179865, + "grad_norm": 0.17618241906166077, + "learning_rate": 1.2654437909623258e-05, + "loss": 1.7374, + "step": 25264 + }, + { + "epoch": 7.75475751995089, + "grad_norm": 0.18769881129264832, + "learning_rate": 1.2651133050849423e-05, + "loss": 1.7241, + "step": 25265 + }, + { + "epoch": 7.755064456721915, + "grad_norm": 0.18645870685577393, + "learning_rate": 1.2647828561179015e-05, + "loss": 1.7176, + "step": 25266 + }, + { + "epoch": 7.755371393492941, + "grad_norm": 0.17507290840148926, + "learning_rate": 1.2644524440644628e-05, + "loss": 1.6994, + "step": 25267 + }, + { + "epoch": 7.755678330263965, + "grad_norm": 0.15264524519443512, + "learning_rate": 1.264122068927896e-05, + "loss": 1.6993, + "step": 25268 + }, + { + "epoch": 7.7559852670349905, + "grad_norm": 0.1749732941389084, + "learning_rate": 1.263791730711465e-05, + "loss": 1.7265, + "step": 25269 + }, + { + "epoch": 7.756292203806016, + "grad_norm": 0.15777049958705902, + "learning_rate": 1.2634614294184332e-05, + "loss": 1.6219, + "step": 25270 + }, + { + "epoch": 7.756599140577041, + "grad_norm": 0.17740310728549957, + "learning_rate": 1.263131165052066e-05, + "loss": 1.7373, + "step": 25271 + }, + { + "epoch": 7.7569060773480665, + "grad_norm": 0.22577044367790222, + "learning_rate": 1.262800937615627e-05, + "loss": 1.7492, + "step": 25272 + }, + { + "epoch": 7.757213014119092, + 
"grad_norm": 0.155413419008255, + "learning_rate": 1.2624707471123791e-05, + "loss": 1.7037, + "step": 25273 + }, + { + "epoch": 7.757519950890116, + "grad_norm": 0.1755802482366562, + "learning_rate": 1.2621405935455866e-05, + "loss": 1.7057, + "step": 25274 + }, + { + "epoch": 7.757826887661142, + "grad_norm": 0.15870101749897003, + "learning_rate": 1.2618104769185096e-05, + "loss": 1.6951, + "step": 25275 + }, + { + "epoch": 7.758133824432167, + "grad_norm": 0.18285419046878815, + "learning_rate": 1.2614803972344158e-05, + "loss": 1.7443, + "step": 25276 + }, + { + "epoch": 7.758440761203192, + "grad_norm": 0.1669059544801712, + "learning_rate": 1.2611503544965609e-05, + "loss": 1.6442, + "step": 25277 + }, + { + "epoch": 7.758747697974218, + "grad_norm": 0.17830590903759003, + "learning_rate": 1.2608203487082121e-05, + "loss": 1.7432, + "step": 25278 + }, + { + "epoch": 7.759054634745242, + "grad_norm": 0.18318989872932434, + "learning_rate": 1.2604903798726259e-05, + "loss": 1.7128, + "step": 25279 + }, + { + "epoch": 7.759361571516267, + "grad_norm": 0.17735294997692108, + "learning_rate": 1.2601604479930663e-05, + "loss": 1.6719, + "step": 25280 + }, + { + "epoch": 7.759668508287293, + "grad_norm": 0.14324752986431122, + "learning_rate": 1.2598305530727949e-05, + "loss": 1.688, + "step": 25281 + }, + { + "epoch": 7.759975445058318, + "grad_norm": 0.17677859961986542, + "learning_rate": 1.2595006951150678e-05, + "loss": 1.7016, + "step": 25282 + }, + { + "epoch": 7.760282381829343, + "grad_norm": 0.16832831501960754, + "learning_rate": 1.2591708741231495e-05, + "loss": 1.6669, + "step": 25283 + }, + { + "epoch": 7.760589318600369, + "grad_norm": 0.20717547833919525, + "learning_rate": 1.2588410901002944e-05, + "loss": 1.7275, + "step": 25284 + }, + { + "epoch": 7.760896255371393, + "grad_norm": 0.2471853792667389, + "learning_rate": 1.2585113430497658e-05, + "loss": 1.779, + "step": 25285 + }, + { + "epoch": 7.7612031921424185, + "grad_norm": 
0.2646878957748413, + "learning_rate": 1.2581816329748214e-05, + "loss": 1.8003, + "step": 25286 + }, + { + "epoch": 7.761510128913444, + "grad_norm": 0.2102949321269989, + "learning_rate": 1.2578519598787191e-05, + "loss": 1.764, + "step": 25287 + }, + { + "epoch": 7.761817065684469, + "grad_norm": 0.16151423752307892, + "learning_rate": 1.2575223237647171e-05, + "loss": 1.7233, + "step": 25288 + }, + { + "epoch": 7.7621240024554945, + "grad_norm": 0.22221817076206207, + "learning_rate": 1.2571927246360727e-05, + "loss": 1.7485, + "step": 25289 + }, + { + "epoch": 7.762430939226519, + "grad_norm": 0.16470851004123688, + "learning_rate": 1.2568631624960441e-05, + "loss": 1.6844, + "step": 25290 + }, + { + "epoch": 7.762737875997544, + "grad_norm": 0.17529261112213135, + "learning_rate": 1.256533637347887e-05, + "loss": 1.7409, + "step": 25291 + }, + { + "epoch": 7.76304481276857, + "grad_norm": 0.19055718183517456, + "learning_rate": 1.2562041491948579e-05, + "loss": 1.6861, + "step": 25292 + }, + { + "epoch": 7.763351749539595, + "grad_norm": 0.19183041155338287, + "learning_rate": 1.2558746980402159e-05, + "loss": 1.7493, + "step": 25293 + }, + { + "epoch": 7.76365868631062, + "grad_norm": 0.20031596720218658, + "learning_rate": 1.2555452838872123e-05, + "loss": 1.705, + "step": 25294 + }, + { + "epoch": 7.763965623081646, + "grad_norm": 0.16234149038791656, + "learning_rate": 1.2552159067391072e-05, + "loss": 1.7407, + "step": 25295 + }, + { + "epoch": 7.76427255985267, + "grad_norm": 0.15412569046020508, + "learning_rate": 1.254886566599151e-05, + "loss": 1.6599, + "step": 25296 + }, + { + "epoch": 7.764579496623695, + "grad_norm": 0.17393885552883148, + "learning_rate": 1.2545572634706022e-05, + "loss": 1.7372, + "step": 25297 + }, + { + "epoch": 7.764886433394721, + "grad_norm": 0.18662036955356598, + "learning_rate": 1.254227997356715e-05, + "loss": 1.7681, + "step": 25298 + }, + { + "epoch": 7.765193370165746, + "grad_norm": 0.16661690175533295, + 
"learning_rate": 1.2538987682607395e-05, + "loss": 1.754, + "step": 25299 + }, + { + "epoch": 7.765500306936771, + "grad_norm": 0.21453191339969635, + "learning_rate": 1.253569576185935e-05, + "loss": 1.7802, + "step": 25300 + }, + { + "epoch": 7.765807243707796, + "grad_norm": 0.14639903604984283, + "learning_rate": 1.2532404211355486e-05, + "loss": 1.6478, + "step": 25301 + }, + { + "epoch": 7.766114180478821, + "grad_norm": 0.17430682480335236, + "learning_rate": 1.2529113031128382e-05, + "loss": 1.687, + "step": 25302 + }, + { + "epoch": 7.7664211172498465, + "grad_norm": 0.21582552790641785, + "learning_rate": 1.2525822221210543e-05, + "loss": 1.7723, + "step": 25303 + }, + { + "epoch": 7.766728054020872, + "grad_norm": 0.21142803132534027, + "learning_rate": 1.2522531781634495e-05, + "loss": 1.7986, + "step": 25304 + }, + { + "epoch": 7.767034990791897, + "grad_norm": 0.1637791097164154, + "learning_rate": 1.251924171243275e-05, + "loss": 1.6884, + "step": 25305 + }, + { + "epoch": 7.7673419275629225, + "grad_norm": 0.19218359887599945, + "learning_rate": 1.2515952013637832e-05, + "loss": 1.7972, + "step": 25306 + }, + { + "epoch": 7.767648864333947, + "grad_norm": 0.14534975588321686, + "learning_rate": 1.2512662685282245e-05, + "loss": 1.6602, + "step": 25307 + }, + { + "epoch": 7.767955801104972, + "grad_norm": 0.2955080568790436, + "learning_rate": 1.2509373727398494e-05, + "loss": 1.763, + "step": 25308 + }, + { + "epoch": 7.768262737875998, + "grad_norm": 0.17220059037208557, + "learning_rate": 1.2506085140019086e-05, + "loss": 1.672, + "step": 25309 + }, + { + "epoch": 7.768569674647023, + "grad_norm": 0.17092043161392212, + "learning_rate": 1.2502796923176524e-05, + "loss": 1.7014, + "step": 25310 + }, + { + "epoch": 7.768876611418047, + "grad_norm": 0.2363509237766266, + "learning_rate": 1.2499509076903288e-05, + "loss": 1.7489, + "step": 25311 + }, + { + "epoch": 7.769183548189073, + "grad_norm": 0.19223156571388245, + "learning_rate": 
1.2496221601231906e-05, + "loss": 1.7194, + "step": 25312 + }, + { + "epoch": 7.769490484960098, + "grad_norm": 0.18292652070522308, + "learning_rate": 1.249293449619483e-05, + "loss": 1.7422, + "step": 25313 + }, + { + "epoch": 7.769797421731123, + "grad_norm": 0.17120866477489471, + "learning_rate": 1.2489647761824547e-05, + "loss": 1.7367, + "step": 25314 + }, + { + "epoch": 7.770104358502149, + "grad_norm": 0.22178049385547638, + "learning_rate": 1.248636139815358e-05, + "loss": 1.7451, + "step": 25315 + }, + { + "epoch": 7.770411295273174, + "grad_norm": 0.15707750618457794, + "learning_rate": 1.2483075405214346e-05, + "loss": 1.6748, + "step": 25316 + }, + { + "epoch": 7.7707182320441985, + "grad_norm": 0.1570693850517273, + "learning_rate": 1.2479789783039381e-05, + "loss": 1.6895, + "step": 25317 + }, + { + "epoch": 7.771025168815224, + "grad_norm": 0.1687897890806198, + "learning_rate": 1.2476504531661093e-05, + "loss": 1.7145, + "step": 25318 + }, + { + "epoch": 7.771332105586249, + "grad_norm": 0.16047275066375732, + "learning_rate": 1.2473219651112e-05, + "loss": 1.6675, + "step": 25319 + }, + { + "epoch": 7.7716390423572745, + "grad_norm": 0.16817785799503326, + "learning_rate": 1.2469935141424544e-05, + "loss": 1.6678, + "step": 25320 + }, + { + "epoch": 7.7719459791283, + "grad_norm": 0.1511528342962265, + "learning_rate": 1.246665100263118e-05, + "loss": 1.7054, + "step": 25321 + }, + { + "epoch": 7.772252915899324, + "grad_norm": 0.145367830991745, + "learning_rate": 1.2463367234764373e-05, + "loss": 1.7037, + "step": 25322 + }, + { + "epoch": 7.77255985267035, + "grad_norm": 0.1794048696756363, + "learning_rate": 1.2460083837856573e-05, + "loss": 1.7372, + "step": 25323 + }, + { + "epoch": 7.772866789441375, + "grad_norm": 0.21238376200199127, + "learning_rate": 1.2456800811940227e-05, + "loss": 1.7796, + "step": 25324 + }, + { + "epoch": 7.7731737262124, + "grad_norm": 0.23305723071098328, + "learning_rate": 1.2453518157047784e-05, + "loss": 
1.7124, + "step": 25325 + }, + { + "epoch": 7.773480662983426, + "grad_norm": 0.18229269981384277, + "learning_rate": 1.2450235873211673e-05, + "loss": 1.7202, + "step": 25326 + }, + { + "epoch": 7.773787599754451, + "grad_norm": 0.19145874679088593, + "learning_rate": 1.2446953960464346e-05, + "loss": 1.6701, + "step": 25327 + }, + { + "epoch": 7.774094536525475, + "grad_norm": 0.26310765743255615, + "learning_rate": 1.2443672418838215e-05, + "loss": 1.7674, + "step": 25328 + }, + { + "epoch": 7.774401473296501, + "grad_norm": 0.18370535969734192, + "learning_rate": 1.2440391248365756e-05, + "loss": 1.7027, + "step": 25329 + }, + { + "epoch": 7.774708410067526, + "grad_norm": 0.24704128503799438, + "learning_rate": 1.2437110449079348e-05, + "loss": 1.7238, + "step": 25330 + }, + { + "epoch": 7.7750153468385514, + "grad_norm": 0.194215789437294, + "learning_rate": 1.2433830021011433e-05, + "loss": 1.735, + "step": 25331 + }, + { + "epoch": 7.775322283609577, + "grad_norm": 0.24099037051200867, + "learning_rate": 1.2430549964194427e-05, + "loss": 1.7335, + "step": 25332 + }, + { + "epoch": 7.775629220380601, + "grad_norm": 0.1665026843547821, + "learning_rate": 1.242727027866073e-05, + "loss": 1.7245, + "step": 25333 + }, + { + "epoch": 7.775936157151627, + "grad_norm": 0.18005968630313873, + "learning_rate": 1.24239909644428e-05, + "loss": 1.6227, + "step": 25334 + }, + { + "epoch": 7.776243093922652, + "grad_norm": 0.2306728959083557, + "learning_rate": 1.2420712021572983e-05, + "loss": 1.7136, + "step": 25335 + }, + { + "epoch": 7.776550030693677, + "grad_norm": 0.1916062831878662, + "learning_rate": 1.2417433450083738e-05, + "loss": 1.7912, + "step": 25336 + }, + { + "epoch": 7.776856967464703, + "grad_norm": 0.1999555081129074, + "learning_rate": 1.2414155250007437e-05, + "loss": 1.7685, + "step": 25337 + }, + { + "epoch": 7.777163904235728, + "grad_norm": 0.18222710490226746, + "learning_rate": 1.2410877421376488e-05, + "loss": 1.7024, + "step": 25338 + }, + { 
+ "epoch": 7.777470841006752, + "grad_norm": 0.22534650564193726, + "learning_rate": 1.2407599964223276e-05, + "loss": 1.7263, + "step": 25339 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.3313053250312805, + "learning_rate": 1.2404322878580199e-05, + "loss": 1.6988, + "step": 25340 + }, + { + "epoch": 7.778084714548803, + "grad_norm": 0.23691575229167938, + "learning_rate": 1.2401046164479635e-05, + "loss": 1.7771, + "step": 25341 + }, + { + "epoch": 7.778391651319828, + "grad_norm": 0.2119995355606079, + "learning_rate": 1.2397769821953976e-05, + "loss": 1.709, + "step": 25342 + }, + { + "epoch": 7.778698588090853, + "grad_norm": 0.20468266308307648, + "learning_rate": 1.2394493851035588e-05, + "loss": 1.7914, + "step": 25343 + }, + { + "epoch": 7.779005524861878, + "grad_norm": 0.19825033843517303, + "learning_rate": 1.2391218251756854e-05, + "loss": 1.727, + "step": 25344 + }, + { + "epoch": 7.7793124616329035, + "grad_norm": 0.19072072207927704, + "learning_rate": 1.2387943024150134e-05, + "loss": 1.7498, + "step": 25345 + }, + { + "epoch": 7.779619398403929, + "grad_norm": 0.15986371040344238, + "learning_rate": 1.2384668168247832e-05, + "loss": 1.6807, + "step": 25346 + }, + { + "epoch": 7.779926335174954, + "grad_norm": 0.1731162816286087, + "learning_rate": 1.238139368408227e-05, + "loss": 1.7, + "step": 25347 + }, + { + "epoch": 7.7802332719459795, + "grad_norm": 0.1496593952178955, + "learning_rate": 1.237811957168583e-05, + "loss": 1.6558, + "step": 25348 + }, + { + "epoch": 7.780540208717004, + "grad_norm": 0.1982542872428894, + "learning_rate": 1.2374845831090859e-05, + "loss": 1.7888, + "step": 25349 + }, + { + "epoch": 7.780847145488029, + "grad_norm": 0.1517801433801651, + "learning_rate": 1.2371572462329706e-05, + "loss": 1.6743, + "step": 25350 + }, + { + "epoch": 7.781154082259055, + "grad_norm": 0.23794496059417725, + "learning_rate": 1.2368299465434752e-05, + "loss": 1.7332, + "step": 25351 + }, + { + "epoch": 7.78146101903008, + 
"grad_norm": 0.20220822095870972, + "learning_rate": 1.2365026840438288e-05, + "loss": 1.7444, + "step": 25352 + }, + { + "epoch": 7.781767955801105, + "grad_norm": 0.18997377157211304, + "learning_rate": 1.236175458737272e-05, + "loss": 1.771, + "step": 25353 + }, + { + "epoch": 7.78207489257213, + "grad_norm": 0.15465202927589417, + "learning_rate": 1.2358482706270325e-05, + "loss": 1.7072, + "step": 25354 + }, + { + "epoch": 7.782381829343155, + "grad_norm": 0.1759808510541916, + "learning_rate": 1.235521119716348e-05, + "loss": 1.6761, + "step": 25355 + }, + { + "epoch": 7.78268876611418, + "grad_norm": 0.17520606517791748, + "learning_rate": 1.2351940060084505e-05, + "loss": 1.6702, + "step": 25356 + }, + { + "epoch": 7.782995702885206, + "grad_norm": 0.20305509865283966, + "learning_rate": 1.2348669295065717e-05, + "loss": 1.746, + "step": 25357 + }, + { + "epoch": 7.783302639656231, + "grad_norm": 0.14459536969661713, + "learning_rate": 1.2345398902139454e-05, + "loss": 1.6907, + "step": 25358 + }, + { + "epoch": 7.783609576427256, + "grad_norm": 0.18058347702026367, + "learning_rate": 1.2342128881338027e-05, + "loss": 1.796, + "step": 25359 + }, + { + "epoch": 7.783916513198281, + "grad_norm": 0.1778976023197174, + "learning_rate": 1.2338859232693756e-05, + "loss": 1.715, + "step": 25360 + }, + { + "epoch": 7.784223449969306, + "grad_norm": 0.1644120067358017, + "learning_rate": 1.2335589956238953e-05, + "loss": 1.6786, + "step": 25361 + }, + { + "epoch": 7.7845303867403315, + "grad_norm": 0.15315432846546173, + "learning_rate": 1.2332321052005907e-05, + "loss": 1.6503, + "step": 25362 + }, + { + "epoch": 7.784837323511357, + "grad_norm": 0.19160087406635284, + "learning_rate": 1.2329052520026973e-05, + "loss": 1.7131, + "step": 25363 + }, + { + "epoch": 7.785144260282382, + "grad_norm": 0.1778041124343872, + "learning_rate": 1.2325784360334408e-05, + "loss": 1.754, + "step": 25364 + }, + { + "epoch": 7.785451197053407, + "grad_norm": 0.17478828132152557, + 
"learning_rate": 1.2322516572960519e-05, + "loss": 1.7122, + "step": 25365 + }, + { + "epoch": 7.785758133824432, + "grad_norm": 0.2239549458026886, + "learning_rate": 1.2319249157937612e-05, + "loss": 1.7589, + "step": 25366 + }, + { + "epoch": 7.786065070595457, + "grad_norm": 0.21565821766853333, + "learning_rate": 1.2315982115297953e-05, + "loss": 1.7468, + "step": 25367 + }, + { + "epoch": 7.786372007366483, + "grad_norm": 0.1859208643436432, + "learning_rate": 1.231271544507387e-05, + "loss": 1.7289, + "step": 25368 + }, + { + "epoch": 7.786678944137508, + "grad_norm": 0.14813102781772614, + "learning_rate": 1.2309449147297596e-05, + "loss": 1.6543, + "step": 25369 + }, + { + "epoch": 7.786985880908533, + "grad_norm": 0.14101989567279816, + "learning_rate": 1.2306183222001472e-05, + "loss": 1.6775, + "step": 25370 + }, + { + "epoch": 7.787292817679558, + "grad_norm": 0.2041245847940445, + "learning_rate": 1.2302917669217701e-05, + "loss": 1.6874, + "step": 25371 + }, + { + "epoch": 7.787599754450583, + "grad_norm": 0.17343124747276306, + "learning_rate": 1.2299652488978614e-05, + "loss": 1.7005, + "step": 25372 + }, + { + "epoch": 7.787906691221608, + "grad_norm": 0.20174655318260193, + "learning_rate": 1.2296387681316451e-05, + "loss": 1.8073, + "step": 25373 + }, + { + "epoch": 7.788213627992634, + "grad_norm": 0.21615192294120789, + "learning_rate": 1.2293123246263488e-05, + "loss": 1.7045, + "step": 25374 + }, + { + "epoch": 7.788520564763659, + "grad_norm": 0.18587705492973328, + "learning_rate": 1.2289859183851981e-05, + "loss": 1.7497, + "step": 25375 + }, + { + "epoch": 7.7888275015346835, + "grad_norm": 0.16649113595485687, + "learning_rate": 1.228659549411419e-05, + "loss": 1.6695, + "step": 25376 + }, + { + "epoch": 7.789134438305709, + "grad_norm": 0.16547587513923645, + "learning_rate": 1.2283332177082362e-05, + "loss": 1.7119, + "step": 25377 + }, + { + "epoch": 7.789441375076734, + "grad_norm": 0.17672663927078247, + "learning_rate": 
1.2280069232788755e-05, + "loss": 1.7458, + "step": 25378 + }, + { + "epoch": 7.7897483118477595, + "grad_norm": 0.15436655282974243, + "learning_rate": 1.22768066612656e-05, + "loss": 1.723, + "step": 25379 + }, + { + "epoch": 7.790055248618785, + "grad_norm": 0.1699141561985016, + "learning_rate": 1.2273544462545178e-05, + "loss": 1.7083, + "step": 25380 + }, + { + "epoch": 7.79036218538981, + "grad_norm": 0.18014399707317352, + "learning_rate": 1.2270282636659686e-05, + "loss": 1.7512, + "step": 25381 + }, + { + "epoch": 7.790669122160835, + "grad_norm": 0.1807268261909485, + "learning_rate": 1.2267021183641375e-05, + "loss": 1.7404, + "step": 25382 + }, + { + "epoch": 7.79097605893186, + "grad_norm": 0.16704204678535461, + "learning_rate": 1.2263760103522481e-05, + "loss": 1.6723, + "step": 25383 + }, + { + "epoch": 7.791282995702885, + "grad_norm": 0.1551518738269806, + "learning_rate": 1.2260499396335206e-05, + "loss": 1.7, + "step": 25384 + }, + { + "epoch": 7.791589932473911, + "grad_norm": 0.16270415484905243, + "learning_rate": 1.225723906211183e-05, + "loss": 1.7238, + "step": 25385 + }, + { + "epoch": 7.791896869244935, + "grad_norm": 0.19548700749874115, + "learning_rate": 1.225397910088451e-05, + "loss": 1.7192, + "step": 25386 + }, + { + "epoch": 7.79220380601596, + "grad_norm": 0.19115851819515228, + "learning_rate": 1.225071951268552e-05, + "loss": 1.753, + "step": 25387 + }, + { + "epoch": 7.792510742786986, + "grad_norm": 0.1557070016860962, + "learning_rate": 1.224746029754702e-05, + "loss": 1.6791, + "step": 25388 + }, + { + "epoch": 7.792817679558011, + "grad_norm": 0.16580358147621155, + "learning_rate": 1.2244201455501252e-05, + "loss": 1.6799, + "step": 25389 + }, + { + "epoch": 7.793124616329036, + "grad_norm": 0.18099573254585266, + "learning_rate": 1.2240942986580422e-05, + "loss": 1.7546, + "step": 25390 + }, + { + "epoch": 7.793431553100062, + "grad_norm": 0.2411479502916336, + "learning_rate": 1.223768489081672e-05, + "loss": 1.7315, 
+ "step": 25391 + }, + { + "epoch": 7.793738489871086, + "grad_norm": 0.14678087830543518, + "learning_rate": 1.2234427168242351e-05, + "loss": 1.6733, + "step": 25392 + }, + { + "epoch": 7.7940454266421115, + "grad_norm": 0.17501497268676758, + "learning_rate": 1.223116981888951e-05, + "loss": 1.7416, + "step": 25393 + }, + { + "epoch": 7.794352363413137, + "grad_norm": 0.25460878014564514, + "learning_rate": 1.2227912842790384e-05, + "loss": 1.7873, + "step": 25394 + }, + { + "epoch": 7.794659300184162, + "grad_norm": 0.1701650321483612, + "learning_rate": 1.2224656239977161e-05, + "loss": 1.686, + "step": 25395 + }, + { + "epoch": 7.7949662369551875, + "grad_norm": 0.15684448182582855, + "learning_rate": 1.2221400010482009e-05, + "loss": 1.6768, + "step": 25396 + }, + { + "epoch": 7.795273173726212, + "grad_norm": 0.19048964977264404, + "learning_rate": 1.2218144154337158e-05, + "loss": 1.744, + "step": 25397 + }, + { + "epoch": 7.795580110497237, + "grad_norm": 0.20939184725284576, + "learning_rate": 1.2214888671574737e-05, + "loss": 1.818, + "step": 25398 + }, + { + "epoch": 7.795887047268263, + "grad_norm": 0.18450765311717987, + "learning_rate": 1.2211633562226932e-05, + "loss": 1.6972, + "step": 25399 + }, + { + "epoch": 7.796193984039288, + "grad_norm": 0.20349545776844025, + "learning_rate": 1.2208378826325912e-05, + "loss": 1.7784, + "step": 25400 + }, + { + "epoch": 7.796500920810313, + "grad_norm": 0.17835615575313568, + "learning_rate": 1.2205124463903828e-05, + "loss": 1.7203, + "step": 25401 + }, + { + "epoch": 7.796807857581339, + "grad_norm": 0.1525154411792755, + "learning_rate": 1.2201870474992882e-05, + "loss": 1.7194, + "step": 25402 + }, + { + "epoch": 7.797114794352363, + "grad_norm": 0.15197598934173584, + "learning_rate": 1.2198616859625184e-05, + "loss": 1.6787, + "step": 25403 + }, + { + "epoch": 7.797421731123388, + "grad_norm": 0.1602524071931839, + "learning_rate": 1.2195363617832934e-05, + "loss": 1.6919, + "step": 25404 + }, + { + 
"epoch": 7.797728667894414, + "grad_norm": 0.15638625621795654, + "learning_rate": 1.2192110749648233e-05, + "loss": 1.6945, + "step": 25405 + }, + { + "epoch": 7.798035604665439, + "grad_norm": 0.15247012674808502, + "learning_rate": 1.2188858255103264e-05, + "loss": 1.673, + "step": 25406 + }, + { + "epoch": 7.798342541436464, + "grad_norm": 0.16753807663917542, + "learning_rate": 1.218560613423016e-05, + "loss": 1.7088, + "step": 25407 + }, + { + "epoch": 7.798649478207489, + "grad_norm": 0.17434635758399963, + "learning_rate": 1.2182354387061063e-05, + "loss": 1.7279, + "step": 25408 + }, + { + "epoch": 7.798956414978514, + "grad_norm": 0.21984371542930603, + "learning_rate": 1.2179103013628108e-05, + "loss": 1.7203, + "step": 25409 + }, + { + "epoch": 7.7992633517495396, + "grad_norm": 0.18304525315761566, + "learning_rate": 1.2175852013963418e-05, + "loss": 1.6937, + "step": 25410 + }, + { + "epoch": 7.799570288520565, + "grad_norm": 0.20372866094112396, + "learning_rate": 1.2172601388099131e-05, + "loss": 1.6911, + "step": 25411 + }, + { + "epoch": 7.79987722529159, + "grad_norm": 0.2012174129486084, + "learning_rate": 1.216935113606737e-05, + "loss": 1.7365, + "step": 25412 + }, + { + "epoch": 7.800184162062616, + "grad_norm": 0.2146923542022705, + "learning_rate": 1.2166101257900236e-05, + "loss": 1.711, + "step": 25413 + }, + { + "epoch": 7.80049109883364, + "grad_norm": 0.202762633562088, + "learning_rate": 1.2162851753629895e-05, + "loss": 1.7459, + "step": 25414 + }, + { + "epoch": 7.800798035604665, + "grad_norm": 0.19161204993724823, + "learning_rate": 1.2159602623288418e-05, + "loss": 1.687, + "step": 25415 + }, + { + "epoch": 7.801104972375691, + "grad_norm": 0.2027188539505005, + "learning_rate": 1.2156353866907927e-05, + "loss": 1.7482, + "step": 25416 + }, + { + "epoch": 7.801411909146716, + "grad_norm": 0.17790403962135315, + "learning_rate": 1.2153105484520521e-05, + "loss": 1.7047, + "step": 25417 + }, + { + "epoch": 7.8017188459177405, + 
"grad_norm": 0.18325060606002808, + "learning_rate": 1.21498574761583e-05, + "loss": 1.693, + "step": 25418 + }, + { + "epoch": 7.802025782688766, + "grad_norm": 0.14223991334438324, + "learning_rate": 1.2146609841853401e-05, + "loss": 1.7168, + "step": 25419 + }, + { + "epoch": 7.802332719459791, + "grad_norm": 0.18397340178489685, + "learning_rate": 1.2143362581637863e-05, + "loss": 1.7234, + "step": 25420 + }, + { + "epoch": 7.8026396562308165, + "grad_norm": 0.16903668642044067, + "learning_rate": 1.214011569554383e-05, + "loss": 1.6884, + "step": 25421 + }, + { + "epoch": 7.802946593001842, + "grad_norm": 0.15086103975772858, + "learning_rate": 1.2136869183603339e-05, + "loss": 1.6712, + "step": 25422 + }, + { + "epoch": 7.803253529772867, + "grad_norm": 0.1743185818195343, + "learning_rate": 1.2133623045848507e-05, + "loss": 1.7167, + "step": 25423 + }, + { + "epoch": 7.803560466543892, + "grad_norm": 0.160976842045784, + "learning_rate": 1.2130377282311411e-05, + "loss": 1.7749, + "step": 25424 + }, + { + "epoch": 7.803867403314917, + "grad_norm": 0.2554323971271515, + "learning_rate": 1.2127131893024123e-05, + "loss": 1.7156, + "step": 25425 + }, + { + "epoch": 7.804174340085942, + "grad_norm": 0.1582731157541275, + "learning_rate": 1.2123886878018714e-05, + "loss": 1.7088, + "step": 25426 + }, + { + "epoch": 7.804481276856968, + "grad_norm": 0.18008622527122498, + "learning_rate": 1.2120642237327257e-05, + "loss": 1.6928, + "step": 25427 + }, + { + "epoch": 7.804788213627993, + "grad_norm": 0.29349491000175476, + "learning_rate": 1.2117397970981815e-05, + "loss": 1.7596, + "step": 25428 + }, + { + "epoch": 7.805095150399017, + "grad_norm": 0.20927627384662628, + "learning_rate": 1.211415407901445e-05, + "loss": 1.7113, + "step": 25429 + }, + { + "epoch": 7.805402087170043, + "grad_norm": 0.2126142680644989, + "learning_rate": 1.21109105614572e-05, + "loss": 1.7125, + "step": 25430 + }, + { + "epoch": 7.805709023941068, + "grad_norm": 0.20456665754318237, + 
"learning_rate": 1.2107667418342172e-05, + "loss": 1.7619, + "step": 25431 + }, + { + "epoch": 7.806015960712093, + "grad_norm": 0.17268066108226776, + "learning_rate": 1.2104424649701373e-05, + "loss": 1.6462, + "step": 25432 + }, + { + "epoch": 7.806322897483119, + "grad_norm": 0.16213946044445038, + "learning_rate": 1.2101182255566856e-05, + "loss": 1.6787, + "step": 25433 + }, + { + "epoch": 7.806629834254144, + "grad_norm": 0.17202046513557434, + "learning_rate": 1.2097940235970673e-05, + "loss": 1.7081, + "step": 25434 + }, + { + "epoch": 7.8069367710251685, + "grad_norm": 0.2076229751110077, + "learning_rate": 1.2094698590944842e-05, + "loss": 1.6832, + "step": 25435 + }, + { + "epoch": 7.807243707796194, + "grad_norm": 0.17209482192993164, + "learning_rate": 1.2091457320521448e-05, + "loss": 1.7722, + "step": 25436 + }, + { + "epoch": 7.807550644567219, + "grad_norm": 0.2185208946466446, + "learning_rate": 1.2088216424732463e-05, + "loss": 1.7536, + "step": 25437 + }, + { + "epoch": 7.8078575813382445, + "grad_norm": 0.1812329739332199, + "learning_rate": 1.2084975903609968e-05, + "loss": 1.7275, + "step": 25438 + }, + { + "epoch": 7.80816451810927, + "grad_norm": 0.20143690705299377, + "learning_rate": 1.208173575718594e-05, + "loss": 1.7533, + "step": 25439 + }, + { + "epoch": 7.808471454880294, + "grad_norm": 0.18351776897907257, + "learning_rate": 1.2078495985492433e-05, + "loss": 1.6831, + "step": 25440 + }, + { + "epoch": 7.80877839165132, + "grad_norm": 0.15470999479293823, + "learning_rate": 1.2075256588561462e-05, + "loss": 1.6862, + "step": 25441 + }, + { + "epoch": 7.809085328422345, + "grad_norm": 0.1751607209444046, + "learning_rate": 1.2072017566425032e-05, + "loss": 1.7182, + "step": 25442 + }, + { + "epoch": 7.80939226519337, + "grad_norm": 0.16465237736701965, + "learning_rate": 1.2068778919115153e-05, + "loss": 1.7055, + "step": 25443 + }, + { + "epoch": 7.809699201964396, + "grad_norm": 0.13899528980255127, + "learning_rate": 
1.2065540646663832e-05, + "loss": 1.634, + "step": 25444 + }, + { + "epoch": 7.810006138735421, + "grad_norm": 0.21526047587394714, + "learning_rate": 1.2062302749103072e-05, + "loss": 1.759, + "step": 25445 + }, + { + "epoch": 7.810313075506445, + "grad_norm": 0.1628599315881729, + "learning_rate": 1.2059065226464872e-05, + "loss": 1.6782, + "step": 25446 + }, + { + "epoch": 7.810620012277471, + "grad_norm": 0.16853751242160797, + "learning_rate": 1.2055828078781217e-05, + "loss": 1.7123, + "step": 25447 + }, + { + "epoch": 7.810926949048496, + "grad_norm": 0.17399325966835022, + "learning_rate": 1.2052591306084138e-05, + "loss": 1.7394, + "step": 25448 + }, + { + "epoch": 7.811233885819521, + "grad_norm": 0.16147997975349426, + "learning_rate": 1.2049354908405574e-05, + "loss": 1.66, + "step": 25449 + }, + { + "epoch": 7.811540822590547, + "grad_norm": 0.1806066632270813, + "learning_rate": 1.204611888577753e-05, + "loss": 1.7193, + "step": 25450 + }, + { + "epoch": 7.811847759361571, + "grad_norm": 0.14491340517997742, + "learning_rate": 1.2042883238231984e-05, + "loss": 1.6996, + "step": 25451 + }, + { + "epoch": 7.8121546961325965, + "grad_norm": 0.24257591366767883, + "learning_rate": 1.2039647965800905e-05, + "loss": 1.734, + "step": 25452 + }, + { + "epoch": 7.812461632903622, + "grad_norm": 0.17281031608581543, + "learning_rate": 1.2036413068516295e-05, + "loss": 1.7469, + "step": 25453 + }, + { + "epoch": 7.812768569674647, + "grad_norm": 0.16350387036800385, + "learning_rate": 1.2033178546410073e-05, + "loss": 1.6755, + "step": 25454 + }, + { + "epoch": 7.8130755064456725, + "grad_norm": 0.21092571318149567, + "learning_rate": 1.202994439951427e-05, + "loss": 1.7538, + "step": 25455 + }, + { + "epoch": 7.813382443216698, + "grad_norm": 0.13705989718437195, + "learning_rate": 1.2026710627860777e-05, + "loss": 1.6563, + "step": 25456 + }, + { + "epoch": 7.813689379987722, + "grad_norm": 0.2368711531162262, + "learning_rate": 1.20234772314816e-05, + "loss": 
1.7685, + "step": 25457 + }, + { + "epoch": 7.813996316758748, + "grad_norm": 0.19303718209266663, + "learning_rate": 1.2020244210408682e-05, + "loss": 1.7286, + "step": 25458 + }, + { + "epoch": 7.814303253529773, + "grad_norm": 0.17113862931728363, + "learning_rate": 1.2017011564673974e-05, + "loss": 1.6336, + "step": 25459 + }, + { + "epoch": 7.814610190300798, + "grad_norm": 0.2151467204093933, + "learning_rate": 1.2013779294309418e-05, + "loss": 1.7585, + "step": 25460 + }, + { + "epoch": 7.814917127071823, + "grad_norm": 0.21620413661003113, + "learning_rate": 1.2010547399346961e-05, + "loss": 1.7058, + "step": 25461 + }, + { + "epoch": 7.815224063842848, + "grad_norm": 0.20134735107421875, + "learning_rate": 1.2007315879818537e-05, + "loss": 1.7833, + "step": 25462 + }, + { + "epoch": 7.815531000613873, + "grad_norm": 0.16653650999069214, + "learning_rate": 1.2004084735756088e-05, + "loss": 1.7022, + "step": 25463 + }, + { + "epoch": 7.815837937384899, + "grad_norm": 0.2135760486125946, + "learning_rate": 1.2000853967191527e-05, + "loss": 1.7502, + "step": 25464 + }, + { + "epoch": 7.816144874155924, + "grad_norm": 0.19773945212364197, + "learning_rate": 1.199762357415683e-05, + "loss": 1.7369, + "step": 25465 + }, + { + "epoch": 7.816451810926949, + "grad_norm": 0.1873825341463089, + "learning_rate": 1.1994393556683876e-05, + "loss": 1.6921, + "step": 25466 + }, + { + "epoch": 7.816758747697974, + "grad_norm": 0.19304445385932922, + "learning_rate": 1.1991163914804604e-05, + "loss": 1.6934, + "step": 25467 + }, + { + "epoch": 7.817065684468999, + "grad_norm": 0.16338905692100525, + "learning_rate": 1.1987934648550924e-05, + "loss": 1.6523, + "step": 25468 + }, + { + "epoch": 7.8173726212400245, + "grad_norm": 0.16972069442272186, + "learning_rate": 1.198470575795474e-05, + "loss": 1.6907, + "step": 25469 + }, + { + "epoch": 7.81767955801105, + "grad_norm": 0.17251834273338318, + "learning_rate": 1.1981477243048e-05, + "loss": 1.7336, + "step": 25470 + }, + 
{ + "epoch": 7.817986494782075, + "grad_norm": 0.17767611145973206, + "learning_rate": 1.197824910386256e-05, + "loss": 1.6809, + "step": 25471 + }, + { + "epoch": 7.8182934315531, + "grad_norm": 0.1854296773672104, + "learning_rate": 1.197502134043038e-05, + "loss": 1.6938, + "step": 25472 + }, + { + "epoch": 7.818600368324125, + "grad_norm": 0.15811395645141602, + "learning_rate": 1.1971793952783295e-05, + "loss": 1.6346, + "step": 25473 + }, + { + "epoch": 7.81890730509515, + "grad_norm": 0.1668241322040558, + "learning_rate": 1.196856694095324e-05, + "loss": 1.7014, + "step": 25474 + }, + { + "epoch": 7.819214241866176, + "grad_norm": 0.16705112159252167, + "learning_rate": 1.1965340304972105e-05, + "loss": 1.7509, + "step": 25475 + }, + { + "epoch": 7.819521178637201, + "grad_norm": 0.1737189143896103, + "learning_rate": 1.1962114044871764e-05, + "loss": 1.6934, + "step": 25476 + }, + { + "epoch": 7.819828115408226, + "grad_norm": 0.21887148916721344, + "learning_rate": 1.1958888160684112e-05, + "loss": 1.7163, + "step": 25477 + }, + { + "epoch": 7.820135052179251, + "grad_norm": 0.19267810881137848, + "learning_rate": 1.1955662652441018e-05, + "loss": 1.6941, + "step": 25478 + }, + { + "epoch": 7.820441988950276, + "grad_norm": 0.19797572493553162, + "learning_rate": 1.195243752017437e-05, + "loss": 1.7067, + "step": 25479 + }, + { + "epoch": 7.820748925721301, + "grad_norm": 0.20177066326141357, + "learning_rate": 1.1949212763916035e-05, + "loss": 1.7186, + "step": 25480 + }, + { + "epoch": 7.821055862492327, + "grad_norm": 0.1789240539073944, + "learning_rate": 1.1945988383697876e-05, + "loss": 1.7533, + "step": 25481 + }, + { + "epoch": 7.821362799263352, + "grad_norm": 0.2210909128189087, + "learning_rate": 1.1942764379551769e-05, + "loss": 1.7255, + "step": 25482 + }, + { + "epoch": 7.8216697360343765, + "grad_norm": 0.17705149948596954, + "learning_rate": 1.193954075150957e-05, + "loss": 1.6797, + "step": 25483 + }, + { + "epoch": 7.821976672805402, + 
"grad_norm": 0.17962488532066345, + "learning_rate": 1.1936317499603134e-05, + "loss": 1.7134, + "step": 25484 + }, + { + "epoch": 7.822283609576427, + "grad_norm": 0.2144375741481781, + "learning_rate": 1.193309462386432e-05, + "loss": 1.6837, + "step": 25485 + }, + { + "epoch": 7.8225905463474525, + "grad_norm": 0.19018805027008057, + "learning_rate": 1.1929872124324976e-05, + "loss": 1.7377, + "step": 25486 + }, + { + "epoch": 7.822897483118478, + "grad_norm": 0.2281246781349182, + "learning_rate": 1.1926650001016953e-05, + "loss": 1.755, + "step": 25487 + }, + { + "epoch": 7.823204419889503, + "grad_norm": 0.17724375426769257, + "learning_rate": 1.1923428253972069e-05, + "loss": 1.7018, + "step": 25488 + }, + { + "epoch": 7.823511356660528, + "grad_norm": 0.19313837587833405, + "learning_rate": 1.1920206883222218e-05, + "loss": 1.705, + "step": 25489 + }, + { + "epoch": 7.823818293431553, + "grad_norm": 0.1883455514907837, + "learning_rate": 1.191698588879917e-05, + "loss": 1.66, + "step": 25490 + }, + { + "epoch": 7.824125230202578, + "grad_norm": 0.20110155642032623, + "learning_rate": 1.1913765270734805e-05, + "loss": 1.7456, + "step": 25491 + }, + { + "epoch": 7.824432166973604, + "grad_norm": 0.23234841227531433, + "learning_rate": 1.1910545029060938e-05, + "loss": 1.6987, + "step": 25492 + }, + { + "epoch": 7.824739103744628, + "grad_norm": 0.208989679813385, + "learning_rate": 1.1907325163809386e-05, + "loss": 1.7753, + "step": 25493 + }, + { + "epoch": 7.8250460405156534, + "grad_norm": 0.19063059985637665, + "learning_rate": 1.1904105675011972e-05, + "loss": 1.6664, + "step": 25494 + }, + { + "epoch": 7.825352977286679, + "grad_norm": 0.16878041625022888, + "learning_rate": 1.1900886562700519e-05, + "loss": 1.6886, + "step": 25495 + }, + { + "epoch": 7.825659914057704, + "grad_norm": 0.19139298796653748, + "learning_rate": 1.1897667826906834e-05, + "loss": 1.7195, + "step": 25496 + }, + { + "epoch": 7.8259668508287294, + "grad_norm": 0.255795419216156, 
+ "learning_rate": 1.1894449467662728e-05, + "loss": 1.7835, + "step": 25497 + }, + { + "epoch": 7.826273787599755, + "grad_norm": 0.17967084050178528, + "learning_rate": 1.1891231485000004e-05, + "loss": 1.6959, + "step": 25498 + }, + { + "epoch": 7.82658072437078, + "grad_norm": 0.23582984507083893, + "learning_rate": 1.1888013878950471e-05, + "loss": 1.7252, + "step": 25499 + }, + { + "epoch": 7.826887661141805, + "grad_norm": 0.189914271235466, + "learning_rate": 1.188479664954592e-05, + "loss": 1.7216, + "step": 25500 + }, + { + "epoch": 7.82719459791283, + "grad_norm": 0.19840605556964874, + "learning_rate": 1.1881579796818148e-05, + "loss": 1.714, + "step": 25501 + }, + { + "epoch": 7.827501534683855, + "grad_norm": 0.25255537033081055, + "learning_rate": 1.1878363320798946e-05, + "loss": 1.7008, + "step": 25502 + }, + { + "epoch": 7.827808471454881, + "grad_norm": 0.1863456666469574, + "learning_rate": 1.1875147221520105e-05, + "loss": 1.7804, + "step": 25503 + }, + { + "epoch": 7.828115408225905, + "grad_norm": 0.2700684368610382, + "learning_rate": 1.1871931499013405e-05, + "loss": 1.6756, + "step": 25504 + }, + { + "epoch": 7.82842234499693, + "grad_norm": 0.19838537275791168, + "learning_rate": 1.1868716153310604e-05, + "loss": 1.6828, + "step": 25505 + }, + { + "epoch": 7.828729281767956, + "grad_norm": 0.1896767020225525, + "learning_rate": 1.1865501184443533e-05, + "loss": 1.7014, + "step": 25506 + }, + { + "epoch": 7.829036218538981, + "grad_norm": 0.2330249398946762, + "learning_rate": 1.1862286592443905e-05, + "loss": 1.7509, + "step": 25507 + }, + { + "epoch": 7.829343155310006, + "grad_norm": 0.17078560590744019, + "learning_rate": 1.1859072377343539e-05, + "loss": 1.6742, + "step": 25508 + }, + { + "epoch": 7.829650092081032, + "grad_norm": 0.2834900915622711, + "learning_rate": 1.1855858539174146e-05, + "loss": 1.7676, + "step": 25509 + }, + { + "epoch": 7.829957028852056, + "grad_norm": 0.18936461210250854, + "learning_rate": 
1.1852645077967533e-05, + "loss": 1.7374, + "step": 25510 + }, + { + "epoch": 7.8302639656230815, + "grad_norm": 0.2720448970794678, + "learning_rate": 1.1849431993755439e-05, + "loss": 1.7001, + "step": 25511 + }, + { + "epoch": 7.830570902394107, + "grad_norm": 0.18198262155056, + "learning_rate": 1.184621928656962e-05, + "loss": 1.6679, + "step": 25512 + }, + { + "epoch": 7.830877839165132, + "grad_norm": 0.16957701742649078, + "learning_rate": 1.1843006956441821e-05, + "loss": 1.7064, + "step": 25513 + }, + { + "epoch": 7.8311847759361575, + "grad_norm": 0.18632464110851288, + "learning_rate": 1.1839795003403798e-05, + "loss": 1.6857, + "step": 25514 + }, + { + "epoch": 7.831491712707182, + "grad_norm": 0.15639352798461914, + "learning_rate": 1.183658342748728e-05, + "loss": 1.695, + "step": 25515 + }, + { + "epoch": 7.831798649478207, + "grad_norm": 0.17000986635684967, + "learning_rate": 1.1833372228724016e-05, + "loss": 1.696, + "step": 25516 + }, + { + "epoch": 7.832105586249233, + "grad_norm": 0.23334810137748718, + "learning_rate": 1.1830161407145735e-05, + "loss": 1.7574, + "step": 25517 + }, + { + "epoch": 7.832412523020258, + "grad_norm": 0.16260294616222382, + "learning_rate": 1.1826950962784177e-05, + "loss": 1.667, + "step": 25518 + }, + { + "epoch": 7.832719459791283, + "grad_norm": 0.18244150280952454, + "learning_rate": 1.1823740895671059e-05, + "loss": 1.6836, + "step": 25519 + }, + { + "epoch": 7.833026396562309, + "grad_norm": 0.18404243886470795, + "learning_rate": 1.182053120583811e-05, + "loss": 1.6922, + "step": 25520 + }, + { + "epoch": 7.833333333333333, + "grad_norm": 0.22713635861873627, + "learning_rate": 1.1817321893317052e-05, + "loss": 1.8055, + "step": 25521 + }, + { + "epoch": 7.833640270104358, + "grad_norm": 0.14314736425876617, + "learning_rate": 1.1814112958139577e-05, + "loss": 1.6624, + "step": 25522 + }, + { + "epoch": 7.833947206875384, + "grad_norm": 0.1947709321975708, + "learning_rate": 1.1810904400337458e-05, + 
"loss": 1.8108, + "step": 25523 + }, + { + "epoch": 7.834254143646409, + "grad_norm": 0.1811491698026657, + "learning_rate": 1.1807696219942326e-05, + "loss": 1.7258, + "step": 25524 + }, + { + "epoch": 7.834561080417434, + "grad_norm": 0.16776522994041443, + "learning_rate": 1.1804488416985966e-05, + "loss": 1.6834, + "step": 25525 + }, + { + "epoch": 7.834868017188459, + "grad_norm": 0.1590484231710434, + "learning_rate": 1.1801280991500002e-05, + "loss": 1.6797, + "step": 25526 + }, + { + "epoch": 7.835174953959484, + "grad_norm": 0.1564435064792633, + "learning_rate": 1.179807394351618e-05, + "loss": 1.7035, + "step": 25527 + }, + { + "epoch": 7.8354818907305095, + "grad_norm": 0.17740637063980103, + "learning_rate": 1.1794867273066184e-05, + "loss": 1.6844, + "step": 25528 + }, + { + "epoch": 7.835788827501535, + "grad_norm": 0.17152990400791168, + "learning_rate": 1.1791660980181707e-05, + "loss": 1.6745, + "step": 25529 + }, + { + "epoch": 7.83609576427256, + "grad_norm": 0.17763324081897736, + "learning_rate": 1.1788455064894427e-05, + "loss": 1.6941, + "step": 25530 + }, + { + "epoch": 7.8364027010435855, + "grad_norm": 0.16168560087680817, + "learning_rate": 1.178524952723603e-05, + "loss": 1.6955, + "step": 25531 + }, + { + "epoch": 7.83670963781461, + "grad_norm": 0.1819266527891159, + "learning_rate": 1.1782044367238199e-05, + "loss": 1.6838, + "step": 25532 + }, + { + "epoch": 7.837016574585635, + "grad_norm": 0.16239593923091888, + "learning_rate": 1.1778839584932605e-05, + "loss": 1.7045, + "step": 25533 + }, + { + "epoch": 7.837323511356661, + "grad_norm": 0.18346372246742249, + "learning_rate": 1.177563518035092e-05, + "loss": 1.7418, + "step": 25534 + }, + { + "epoch": 7.837630448127686, + "grad_norm": 0.18437781929969788, + "learning_rate": 1.177243115352481e-05, + "loss": 1.7138, + "step": 25535 + }, + { + "epoch": 7.83793738489871, + "grad_norm": 0.16199420392513275, + "learning_rate": 1.1769227504485942e-05, + "loss": 1.7115, + "step": 25536 
+ }, + { + "epoch": 7.838244321669736, + "grad_norm": 0.174173504114151, + "learning_rate": 1.1766024233265977e-05, + "loss": 1.7115, + "step": 25537 + }, + { + "epoch": 7.838551258440761, + "grad_norm": 0.1924828737974167, + "learning_rate": 1.1762821339896567e-05, + "loss": 1.7343, + "step": 25538 + }, + { + "epoch": 7.838858195211786, + "grad_norm": 0.20509763062000275, + "learning_rate": 1.1759618824409357e-05, + "loss": 1.7296, + "step": 25539 + }, + { + "epoch": 7.839165131982812, + "grad_norm": 0.1762499213218689, + "learning_rate": 1.1756416686836035e-05, + "loss": 1.6721, + "step": 25540 + }, + { + "epoch": 7.839472068753837, + "grad_norm": 0.17260326445102692, + "learning_rate": 1.175321492720819e-05, + "loss": 1.7238, + "step": 25541 + }, + { + "epoch": 7.8397790055248615, + "grad_norm": 0.21378587186336517, + "learning_rate": 1.175001354555752e-05, + "loss": 1.7442, + "step": 25542 + }, + { + "epoch": 7.840085942295887, + "grad_norm": 0.20900048315525055, + "learning_rate": 1.1746812541915608e-05, + "loss": 1.7426, + "step": 25543 + }, + { + "epoch": 7.840392879066912, + "grad_norm": 0.2082734853029251, + "learning_rate": 1.1743611916314129e-05, + "loss": 1.7209, + "step": 25544 + }, + { + "epoch": 7.8406998158379375, + "grad_norm": 0.1696191281080246, + "learning_rate": 1.1740411668784701e-05, + "loss": 1.7039, + "step": 25545 + }, + { + "epoch": 7.841006752608963, + "grad_norm": 0.18812915682792664, + "learning_rate": 1.173721179935895e-05, + "loss": 1.6873, + "step": 25546 + }, + { + "epoch": 7.841313689379987, + "grad_norm": 0.19983457028865814, + "learning_rate": 1.1734012308068493e-05, + "loss": 1.701, + "step": 25547 + }, + { + "epoch": 7.841620626151013, + "grad_norm": 0.18811485171318054, + "learning_rate": 1.1730813194944962e-05, + "loss": 1.7466, + "step": 25548 + }, + { + "epoch": 7.841927562922038, + "grad_norm": 0.16648226976394653, + "learning_rate": 1.172761446001996e-05, + "loss": 1.7449, + "step": 25549 + }, + { + "epoch": 
7.842234499693063, + "grad_norm": 0.17902494966983795, + "learning_rate": 1.1724416103325104e-05, + "loss": 1.7395, + "step": 25550 + }, + { + "epoch": 7.842541436464089, + "grad_norm": 0.2420952469110489, + "learning_rate": 1.1721218124892003e-05, + "loss": 1.728, + "step": 25551 + }, + { + "epoch": 7.842848373235114, + "grad_norm": 0.16240666806697845, + "learning_rate": 1.1718020524752266e-05, + "loss": 1.6368, + "step": 25552 + }, + { + "epoch": 7.843155310006138, + "grad_norm": 0.17968396842479706, + "learning_rate": 1.1714823302937483e-05, + "loss": 1.729, + "step": 25553 + }, + { + "epoch": 7.843462246777164, + "grad_norm": 0.17617417871952057, + "learning_rate": 1.1711626459479252e-05, + "loss": 1.6975, + "step": 25554 + }, + { + "epoch": 7.843769183548189, + "grad_norm": 0.1679859161376953, + "learning_rate": 1.1708429994409176e-05, + "loss": 1.6955, + "step": 25555 + }, + { + "epoch": 7.844076120319214, + "grad_norm": 0.1653962880373001, + "learning_rate": 1.1705233907758823e-05, + "loss": 1.7107, + "step": 25556 + }, + { + "epoch": 7.84438305709024, + "grad_norm": 0.190699502825737, + "learning_rate": 1.1702038199559817e-05, + "loss": 1.75, + "step": 25557 + }, + { + "epoch": 7.844689993861264, + "grad_norm": 0.17185768485069275, + "learning_rate": 1.1698842869843696e-05, + "loss": 1.7087, + "step": 25558 + }, + { + "epoch": 7.8449969306322895, + "grad_norm": 0.17880931496620178, + "learning_rate": 1.1695647918642084e-05, + "loss": 1.7082, + "step": 25559 + }, + { + "epoch": 7.845303867403315, + "grad_norm": 0.15360671281814575, + "learning_rate": 1.1692453345986498e-05, + "loss": 1.7028, + "step": 25560 + }, + { + "epoch": 7.84561080417434, + "grad_norm": 0.16576705873012543, + "learning_rate": 1.168925915190856e-05, + "loss": 1.7147, + "step": 25561 + }, + { + "epoch": 7.8459177409453655, + "grad_norm": 0.14623773097991943, + "learning_rate": 1.1686065336439817e-05, + "loss": 1.682, + "step": 25562 + }, + { + "epoch": 7.846224677716391, + "grad_norm": 
0.16677425801753998, + "learning_rate": 1.168287189961183e-05, + "loss": 1.7089, + "step": 25563 + }, + { + "epoch": 7.846531614487415, + "grad_norm": 0.160381019115448, + "learning_rate": 1.1679678841456164e-05, + "loss": 1.6929, + "step": 25564 + }, + { + "epoch": 7.846838551258441, + "grad_norm": 0.1775302290916443, + "learning_rate": 1.1676486162004374e-05, + "loss": 1.6947, + "step": 25565 + }, + { + "epoch": 7.847145488029466, + "grad_norm": 0.1681419014930725, + "learning_rate": 1.1673293861288003e-05, + "loss": 1.7173, + "step": 25566 + }, + { + "epoch": 7.847452424800491, + "grad_norm": 0.18374401330947876, + "learning_rate": 1.1670101939338613e-05, + "loss": 1.7175, + "step": 25567 + }, + { + "epoch": 7.847759361571516, + "grad_norm": 0.19383086264133453, + "learning_rate": 1.1666910396187736e-05, + "loss": 1.6962, + "step": 25568 + }, + { + "epoch": 7.848066298342541, + "grad_norm": 0.16849574446678162, + "learning_rate": 1.1663719231866921e-05, + "loss": 1.6717, + "step": 25569 + }, + { + "epoch": 7.848373235113566, + "grad_norm": 0.2510664165019989, + "learning_rate": 1.1660528446407703e-05, + "loss": 1.7983, + "step": 25570 + }, + { + "epoch": 7.848680171884592, + "grad_norm": 0.21037714183330536, + "learning_rate": 1.1657338039841614e-05, + "loss": 1.7287, + "step": 25571 + }, + { + "epoch": 7.848987108655617, + "grad_norm": 0.15170596539974213, + "learning_rate": 1.1654148012200184e-05, + "loss": 1.7076, + "step": 25572 + }, + { + "epoch": 7.849294045426642, + "grad_norm": 0.2093864530324936, + "learning_rate": 1.1650958363514919e-05, + "loss": 1.7469, + "step": 25573 + }, + { + "epoch": 7.849600982197668, + "grad_norm": 0.15684813261032104, + "learning_rate": 1.1647769093817395e-05, + "loss": 1.6731, + "step": 25574 + }, + { + "epoch": 7.849907918968692, + "grad_norm": 0.1600468009710312, + "learning_rate": 1.1644580203139066e-05, + "loss": 1.6394, + "step": 25575 + }, + { + "epoch": 7.850214855739718, + "grad_norm": 0.1863955557346344, + 
"learning_rate": 1.1641391691511505e-05, + "loss": 1.7025, + "step": 25576 + }, + { + "epoch": 7.850521792510743, + "grad_norm": 0.189132422208786, + "learning_rate": 1.1638203558966166e-05, + "loss": 1.7095, + "step": 25577 + }, + { + "epoch": 7.850828729281768, + "grad_norm": 0.166460782289505, + "learning_rate": 1.1635015805534593e-05, + "loss": 1.6756, + "step": 25578 + }, + { + "epoch": 7.851135666052793, + "grad_norm": 0.15910424292087555, + "learning_rate": 1.1631828431248288e-05, + "loss": 1.6664, + "step": 25579 + }, + { + "epoch": 7.851442602823818, + "grad_norm": 0.14848501980304718, + "learning_rate": 1.1628641436138738e-05, + "loss": 1.6434, + "step": 25580 + }, + { + "epoch": 7.851749539594843, + "grad_norm": 0.1700928956270218, + "learning_rate": 1.1625454820237446e-05, + "loss": 1.7039, + "step": 25581 + }, + { + "epoch": 7.852056476365869, + "grad_norm": 0.17468976974487305, + "learning_rate": 1.1622268583575902e-05, + "loss": 1.7073, + "step": 25582 + }, + { + "epoch": 7.852363413136894, + "grad_norm": 0.18980912864208221, + "learning_rate": 1.1619082726185587e-05, + "loss": 1.6939, + "step": 25583 + }, + { + "epoch": 7.852670349907919, + "grad_norm": 0.1658385694026947, + "learning_rate": 1.1615897248098e-05, + "loss": 1.6892, + "step": 25584 + }, + { + "epoch": 7.852977286678944, + "grad_norm": 0.18137763440608978, + "learning_rate": 1.1612712149344612e-05, + "loss": 1.6608, + "step": 25585 + }, + { + "epoch": 7.853284223449969, + "grad_norm": 0.1642989218235016, + "learning_rate": 1.16095274299569e-05, + "loss": 1.6527, + "step": 25586 + }, + { + "epoch": 7.8535911602209945, + "grad_norm": 0.17476631700992584, + "learning_rate": 1.1606343089966343e-05, + "loss": 1.6622, + "step": 25587 + }, + { + "epoch": 7.85389809699202, + "grad_norm": 0.14995649456977844, + "learning_rate": 1.16031591294044e-05, + "loss": 1.6382, + "step": 25588 + }, + { + "epoch": 7.854205033763045, + "grad_norm": 0.16073103249073029, + "learning_rate": 
1.1599975548302549e-05, + "loss": 1.6888, + "step": 25589 + }, + { + "epoch": 7.85451197053407, + "grad_norm": 0.1630357801914215, + "learning_rate": 1.159679234669223e-05, + "loss": 1.6717, + "step": 25590 + }, + { + "epoch": 7.854818907305095, + "grad_norm": 0.1537420153617859, + "learning_rate": 1.1593609524604948e-05, + "loss": 1.6836, + "step": 25591 + }, + { + "epoch": 7.85512584407612, + "grad_norm": 0.16389401257038116, + "learning_rate": 1.1590427082072103e-05, + "loss": 1.6941, + "step": 25592 + }, + { + "epoch": 7.855432780847146, + "grad_norm": 0.24554979801177979, + "learning_rate": 1.1587245019125192e-05, + "loss": 1.8018, + "step": 25593 + }, + { + "epoch": 7.855739717618171, + "grad_norm": 0.15020978450775146, + "learning_rate": 1.1584063335795614e-05, + "loss": 1.6815, + "step": 25594 + }, + { + "epoch": 7.856046654389196, + "grad_norm": 0.1830887496471405, + "learning_rate": 1.1580882032114853e-05, + "loss": 1.7134, + "step": 25595 + }, + { + "epoch": 7.856353591160221, + "grad_norm": 0.2381841540336609, + "learning_rate": 1.157770110811433e-05, + "loss": 1.7505, + "step": 25596 + }, + { + "epoch": 7.856660527931246, + "grad_norm": 0.210253044962883, + "learning_rate": 1.1574520563825491e-05, + "loss": 1.8048, + "step": 25597 + }, + { + "epoch": 7.856967464702271, + "grad_norm": 0.15428896248340607, + "learning_rate": 1.1571340399279756e-05, + "loss": 1.6624, + "step": 25598 + }, + { + "epoch": 7.857274401473297, + "grad_norm": 0.2932582199573517, + "learning_rate": 1.1568160614508567e-05, + "loss": 1.7192, + "step": 25599 + }, + { + "epoch": 7.857581338244322, + "grad_norm": 0.19450223445892334, + "learning_rate": 1.156498120954333e-05, + "loss": 1.753, + "step": 25600 + }, + { + "epoch": 7.8578882750153465, + "grad_norm": 0.16950540244579315, + "learning_rate": 1.1561802184415482e-05, + "loss": 1.7107, + "step": 25601 + }, + { + "epoch": 7.858195211786372, + "grad_norm": 0.18616287410259247, + "learning_rate": 1.1558623539156433e-05, + "loss": 
1.6747, + "step": 25602 + }, + { + "epoch": 7.858502148557397, + "grad_norm": 0.20991890132427216, + "learning_rate": 1.1555445273797599e-05, + "loss": 1.6635, + "step": 25603 + }, + { + "epoch": 7.8588090853284225, + "grad_norm": 0.18592311441898346, + "learning_rate": 1.1552267388370386e-05, + "loss": 1.7327, + "step": 25604 + }, + { + "epoch": 7.859116022099448, + "grad_norm": 0.16478584706783295, + "learning_rate": 1.1549089882906206e-05, + "loss": 1.6523, + "step": 25605 + }, + { + "epoch": 7.859422958870473, + "grad_norm": 0.17281852662563324, + "learning_rate": 1.154591275743645e-05, + "loss": 1.7282, + "step": 25606 + }, + { + "epoch": 7.859729895641498, + "grad_norm": 0.17098689079284668, + "learning_rate": 1.1542736011992512e-05, + "loss": 1.7533, + "step": 25607 + }, + { + "epoch": 7.860036832412523, + "grad_norm": 0.1766287386417389, + "learning_rate": 1.1539559646605824e-05, + "loss": 1.6338, + "step": 25608 + }, + { + "epoch": 7.860343769183548, + "grad_norm": 0.15519756078720093, + "learning_rate": 1.1536383661307726e-05, + "loss": 1.6908, + "step": 25609 + }, + { + "epoch": 7.860650705954574, + "grad_norm": 0.18422503769397736, + "learning_rate": 1.1533208056129651e-05, + "loss": 1.6983, + "step": 25610 + }, + { + "epoch": 7.860957642725598, + "grad_norm": 0.1900123953819275, + "learning_rate": 1.1530032831102933e-05, + "loss": 1.7082, + "step": 25611 + }, + { + "epoch": 7.861264579496623, + "grad_norm": 0.15542784333229065, + "learning_rate": 1.1526857986259e-05, + "loss": 1.6979, + "step": 25612 + }, + { + "epoch": 7.861571516267649, + "grad_norm": 0.17173884809017181, + "learning_rate": 1.1523683521629197e-05, + "loss": 1.7329, + "step": 25613 + }, + { + "epoch": 7.861878453038674, + "grad_norm": 0.2399773746728897, + "learning_rate": 1.1520509437244908e-05, + "loss": 1.7224, + "step": 25614 + }, + { + "epoch": 7.862185389809699, + "grad_norm": 0.14101925492286682, + "learning_rate": 1.1517335733137502e-05, + "loss": 1.6676, + "step": 25615 + }, 
+ { + "epoch": 7.862492326580725, + "grad_norm": 0.18625333905220032, + "learning_rate": 1.1514162409338336e-05, + "loss": 1.7269, + "step": 25616 + }, + { + "epoch": 7.862799263351749, + "grad_norm": 0.18385125696659088, + "learning_rate": 1.1510989465878774e-05, + "loss": 1.7197, + "step": 25617 + }, + { + "epoch": 7.8631062001227745, + "grad_norm": 0.16189569234848022, + "learning_rate": 1.1507816902790176e-05, + "loss": 1.662, + "step": 25618 + }, + { + "epoch": 7.8634131368938, + "grad_norm": 0.18526791036128998, + "learning_rate": 1.1504644720103885e-05, + "loss": 1.7521, + "step": 25619 + }, + { + "epoch": 7.863720073664825, + "grad_norm": 0.16588367521762848, + "learning_rate": 1.1501472917851263e-05, + "loss": 1.7238, + "step": 25620 + }, + { + "epoch": 7.8640270104358505, + "grad_norm": 0.15427199006080627, + "learning_rate": 1.1498301496063652e-05, + "loss": 1.6566, + "step": 25621 + }, + { + "epoch": 7.864333947206875, + "grad_norm": 0.1694655865430832, + "learning_rate": 1.149513045477239e-05, + "loss": 1.7446, + "step": 25622 + }, + { + "epoch": 7.8646408839779, + "grad_norm": 0.18305882811546326, + "learning_rate": 1.1491959794008823e-05, + "loss": 1.7093, + "step": 25623 + }, + { + "epoch": 7.864947820748926, + "grad_norm": 0.15975148975849152, + "learning_rate": 1.148878951380426e-05, + "loss": 1.6911, + "step": 25624 + }, + { + "epoch": 7.865254757519951, + "grad_norm": 0.18298782408237457, + "learning_rate": 1.148561961419008e-05, + "loss": 1.7188, + "step": 25625 + }, + { + "epoch": 7.865561694290976, + "grad_norm": 0.16258102655410767, + "learning_rate": 1.148245009519755e-05, + "loss": 1.6901, + "step": 25626 + }, + { + "epoch": 7.865868631062002, + "grad_norm": 0.19591568410396576, + "learning_rate": 1.1479280956858057e-05, + "loss": 1.7521, + "step": 25627 + }, + { + "epoch": 7.866175567833026, + "grad_norm": 0.15821373462677002, + "learning_rate": 1.1476112199202853e-05, + "loss": 1.6503, + "step": 25628 + }, + { + "epoch": 
7.866482504604051, + "grad_norm": 0.1531122773885727, + "learning_rate": 1.147294382226331e-05, + "loss": 1.6802, + "step": 25629 + }, + { + "epoch": 7.866789441375077, + "grad_norm": 0.2105177342891693, + "learning_rate": 1.1469775826070711e-05, + "loss": 1.7705, + "step": 25630 + }, + { + "epoch": 7.867096378146102, + "grad_norm": 0.22782234847545624, + "learning_rate": 1.1466608210656377e-05, + "loss": 1.6813, + "step": 25631 + }, + { + "epoch": 7.867403314917127, + "grad_norm": 0.1824047863483429, + "learning_rate": 1.1463440976051598e-05, + "loss": 1.7149, + "step": 25632 + }, + { + "epoch": 7.867710251688152, + "grad_norm": 0.19195812940597534, + "learning_rate": 1.1460274122287685e-05, + "loss": 1.6912, + "step": 25633 + }, + { + "epoch": 7.868017188459177, + "grad_norm": 0.22274719178676605, + "learning_rate": 1.1457107649395937e-05, + "loss": 1.8499, + "step": 25634 + }, + { + "epoch": 7.8683241252302025, + "grad_norm": 0.21217535436153412, + "learning_rate": 1.1453941557407638e-05, + "loss": 1.7345, + "step": 25635 + }, + { + "epoch": 7.868631062001228, + "grad_norm": 0.20042434334754944, + "learning_rate": 1.1450775846354078e-05, + "loss": 1.6902, + "step": 25636 + }, + { + "epoch": 7.868937998772253, + "grad_norm": 0.17045147716999054, + "learning_rate": 1.1447610516266548e-05, + "loss": 1.6641, + "step": 25637 + }, + { + "epoch": 7.8692449355432785, + "grad_norm": 0.18817269802093506, + "learning_rate": 1.1444445567176326e-05, + "loss": 1.7063, + "step": 25638 + }, + { + "epoch": 7.869551872314303, + "grad_norm": 0.1746743619441986, + "learning_rate": 1.1441280999114694e-05, + "loss": 1.6838, + "step": 25639 + }, + { + "epoch": 7.869858809085328, + "grad_norm": 0.1734321415424347, + "learning_rate": 1.1438116812112925e-05, + "loss": 1.6939, + "step": 25640 + }, + { + "epoch": 7.870165745856354, + "grad_norm": 0.1745334416627884, + "learning_rate": 1.1434953006202281e-05, + "loss": 1.71, + "step": 25641 + }, + { + "epoch": 7.870472682627379, + 
"grad_norm": 0.20883594453334808, + "learning_rate": 1.1431789581414043e-05, + "loss": 1.6941, + "step": 25642 + }, + { + "epoch": 7.870779619398404, + "grad_norm": 0.1664251685142517, + "learning_rate": 1.1428626537779447e-05, + "loss": 1.6995, + "step": 25643 + }, + { + "epoch": 7.871086556169429, + "grad_norm": 0.16561046242713928, + "learning_rate": 1.1425463875329795e-05, + "loss": 1.7093, + "step": 25644 + }, + { + "epoch": 7.871393492940454, + "grad_norm": 0.21409009397029877, + "learning_rate": 1.1422301594096297e-05, + "loss": 1.6919, + "step": 25645 + }, + { + "epoch": 7.871700429711479, + "grad_norm": 0.19574479758739471, + "learning_rate": 1.1419139694110236e-05, + "loss": 1.777, + "step": 25646 + }, + { + "epoch": 7.872007366482505, + "grad_norm": 0.15032227337360382, + "learning_rate": 1.1415978175402853e-05, + "loss": 1.6759, + "step": 25647 + }, + { + "epoch": 7.87231430325353, + "grad_norm": 0.18372420966625214, + "learning_rate": 1.1412817038005386e-05, + "loss": 1.7304, + "step": 25648 + }, + { + "epoch": 7.872621240024555, + "grad_norm": 0.16073383390903473, + "learning_rate": 1.1409656281949077e-05, + "loss": 1.6784, + "step": 25649 + }, + { + "epoch": 7.87292817679558, + "grad_norm": 0.15698374807834625, + "learning_rate": 1.1406495907265163e-05, + "loss": 1.6877, + "step": 25650 + }, + { + "epoch": 7.873235113566605, + "grad_norm": 0.18749327957630157, + "learning_rate": 1.140333591398488e-05, + "loss": 1.708, + "step": 25651 + }, + { + "epoch": 7.8735420503376305, + "grad_norm": 0.15412451326847076, + "learning_rate": 1.1400176302139448e-05, + "loss": 1.6661, + "step": 25652 + }, + { + "epoch": 7.873848987108656, + "grad_norm": 0.22467148303985596, + "learning_rate": 1.1397017071760102e-05, + "loss": 1.8204, + "step": 25653 + }, + { + "epoch": 7.87415592387968, + "grad_norm": 0.14625288546085358, + "learning_rate": 1.1393858222878063e-05, + "loss": 1.7008, + "step": 25654 + }, + { + "epoch": 7.874462860650706, + "grad_norm": 
0.14440159499645233, + "learning_rate": 1.1390699755524537e-05, + "loss": 1.652, + "step": 25655 + }, + { + "epoch": 7.874769797421731, + "grad_norm": 0.14738808572292328, + "learning_rate": 1.138754166973075e-05, + "loss": 1.6305, + "step": 25656 + }, + { + "epoch": 7.875076734192756, + "grad_norm": 0.17714212834835052, + "learning_rate": 1.1384383965527906e-05, + "loss": 1.7011, + "step": 25657 + }, + { + "epoch": 7.875383670963782, + "grad_norm": 0.17601121962070465, + "learning_rate": 1.1381226642947213e-05, + "loss": 1.7425, + "step": 25658 + }, + { + "epoch": 7.875690607734807, + "grad_norm": 0.1893182396888733, + "learning_rate": 1.1378069702019877e-05, + "loss": 1.7215, + "step": 25659 + }, + { + "epoch": 7.8759975445058314, + "grad_norm": 0.20073552429676056, + "learning_rate": 1.1374913142777077e-05, + "loss": 1.7025, + "step": 25660 + }, + { + "epoch": 7.876304481276857, + "grad_norm": 0.17025165259838104, + "learning_rate": 1.1371756965250052e-05, + "loss": 1.7046, + "step": 25661 + }, + { + "epoch": 7.876611418047882, + "grad_norm": 0.17612501978874207, + "learning_rate": 1.1368601169469933e-05, + "loss": 1.7452, + "step": 25662 + }, + { + "epoch": 7.8769183548189075, + "grad_norm": 0.2542072534561157, + "learning_rate": 1.1365445755467974e-05, + "loss": 1.765, + "step": 25663 + }, + { + "epoch": 7.877225291589933, + "grad_norm": 0.25291866064071655, + "learning_rate": 1.1362290723275293e-05, + "loss": 1.7477, + "step": 25664 + }, + { + "epoch": 7.877532228360957, + "grad_norm": 0.1848495602607727, + "learning_rate": 1.1359136072923121e-05, + "loss": 1.7278, + "step": 25665 + }, + { + "epoch": 7.877839165131983, + "grad_norm": 0.18354780972003937, + "learning_rate": 1.1355981804442605e-05, + "loss": 1.7469, + "step": 25666 + }, + { + "epoch": 7.878146101903008, + "grad_norm": 0.1843772530555725, + "learning_rate": 1.1352827917864934e-05, + "loss": 1.7654, + "step": 25667 + }, + { + "epoch": 7.878453038674033, + "grad_norm": 0.144758403301239, + 
"learning_rate": 1.1349674413221267e-05, + "loss": 1.6649, + "step": 25668 + }, + { + "epoch": 7.878759975445059, + "grad_norm": 0.15747511386871338, + "learning_rate": 1.1346521290542772e-05, + "loss": 1.6386, + "step": 25669 + }, + { + "epoch": 7.879066912216084, + "grad_norm": 0.17898736894130707, + "learning_rate": 1.134336854986061e-05, + "loss": 1.7, + "step": 25670 + }, + { + "epoch": 7.879373848987108, + "grad_norm": 0.19453589618206024, + "learning_rate": 1.1340216191205939e-05, + "loss": 1.7108, + "step": 25671 + }, + { + "epoch": 7.879680785758134, + "grad_norm": 0.17470498383045197, + "learning_rate": 1.1337064214609905e-05, + "loss": 1.7705, + "step": 25672 + }, + { + "epoch": 7.879987722529159, + "grad_norm": 0.1897793561220169, + "learning_rate": 1.1333912620103665e-05, + "loss": 1.7358, + "step": 25673 + }, + { + "epoch": 7.880294659300184, + "grad_norm": 0.1659744381904602, + "learning_rate": 1.1330761407718366e-05, + "loss": 1.724, + "step": 25674 + }, + { + "epoch": 7.88060159607121, + "grad_norm": 0.15303891897201538, + "learning_rate": 1.1327610577485148e-05, + "loss": 1.6878, + "step": 25675 + }, + { + "epoch": 7.880908532842234, + "grad_norm": 0.16346490383148193, + "learning_rate": 1.1324460129435144e-05, + "loss": 1.6544, + "step": 25676 + }, + { + "epoch": 7.8812154696132595, + "grad_norm": 0.19887791574001312, + "learning_rate": 1.1321310063599483e-05, + "loss": 1.7169, + "step": 25677 + }, + { + "epoch": 7.881522406384285, + "grad_norm": 0.1658533811569214, + "learning_rate": 1.1318160380009334e-05, + "loss": 1.6902, + "step": 25678 + }, + { + "epoch": 7.88182934315531, + "grad_norm": 0.16859948635101318, + "learning_rate": 1.131501107869577e-05, + "loss": 1.7015, + "step": 25679 + }, + { + "epoch": 7.8821362799263355, + "grad_norm": 0.20775821805000305, + "learning_rate": 1.1311862159689968e-05, + "loss": 1.7519, + "step": 25680 + }, + { + "epoch": 7.882443216697361, + "grad_norm": 0.18174295127391815, + "learning_rate": 
1.1308713623022987e-05, + "loss": 1.7161, + "step": 25681 + }, + { + "epoch": 7.882750153468385, + "grad_norm": 0.1843954473733902, + "learning_rate": 1.1305565468725993e-05, + "loss": 1.6753, + "step": 25682 + }, + { + "epoch": 7.883057090239411, + "grad_norm": 0.1856461614370346, + "learning_rate": 1.130241769683008e-05, + "loss": 1.7139, + "step": 25683 + }, + { + "epoch": 7.883364027010436, + "grad_norm": 0.15803632140159607, + "learning_rate": 1.129927030736636e-05, + "loss": 1.6705, + "step": 25684 + }, + { + "epoch": 7.883670963781461, + "grad_norm": 0.1680101901292801, + "learning_rate": 1.1296123300365947e-05, + "loss": 1.6757, + "step": 25685 + }, + { + "epoch": 7.883977900552486, + "grad_norm": 0.157195046544075, + "learning_rate": 1.1292976675859895e-05, + "loss": 1.6922, + "step": 25686 + }, + { + "epoch": 7.884284837323511, + "grad_norm": 0.17270046472549438, + "learning_rate": 1.1289830433879356e-05, + "loss": 1.6909, + "step": 25687 + }, + { + "epoch": 7.884591774094536, + "grad_norm": 0.1880030781030655, + "learning_rate": 1.1286684574455398e-05, + "loss": 1.7139, + "step": 25688 + }, + { + "epoch": 7.884898710865562, + "grad_norm": 0.1882653832435608, + "learning_rate": 1.1283539097619112e-05, + "loss": 1.7464, + "step": 25689 + }, + { + "epoch": 7.885205647636587, + "grad_norm": 0.2060890644788742, + "learning_rate": 1.128039400340159e-05, + "loss": 1.6749, + "step": 25690 + }, + { + "epoch": 7.885512584407612, + "grad_norm": 0.20780493319034576, + "learning_rate": 1.1277249291833903e-05, + "loss": 1.7581, + "step": 25691 + }, + { + "epoch": 7.885819521178637, + "grad_norm": 0.1929686814546585, + "learning_rate": 1.1274104962947135e-05, + "loss": 1.6962, + "step": 25692 + }, + { + "epoch": 7.886126457949662, + "grad_norm": 0.21474432945251465, + "learning_rate": 1.1270961016772363e-05, + "loss": 1.6984, + "step": 25693 + }, + { + "epoch": 7.8864333947206875, + "grad_norm": 0.17453257739543915, + "learning_rate": 1.126781745334064e-05, + "loss": 
1.679, + "step": 25694 + }, + { + "epoch": 7.886740331491713, + "grad_norm": 0.21506772935390472, + "learning_rate": 1.1264674272683073e-05, + "loss": 1.7209, + "step": 25695 + }, + { + "epoch": 7.887047268262738, + "grad_norm": 0.2470129430294037, + "learning_rate": 1.1261531474830672e-05, + "loss": 1.7183, + "step": 25696 + }, + { + "epoch": 7.887354205033763, + "grad_norm": 0.2026570737361908, + "learning_rate": 1.1258389059814545e-05, + "loss": 1.6579, + "step": 25697 + }, + { + "epoch": 7.887661141804788, + "grad_norm": 0.18859948217868805, + "learning_rate": 1.1255247027665699e-05, + "loss": 1.6831, + "step": 25698 + }, + { + "epoch": 7.887968078575813, + "grad_norm": 0.2106257677078247, + "learning_rate": 1.1252105378415229e-05, + "loss": 1.724, + "step": 25699 + }, + { + "epoch": 7.888275015346839, + "grad_norm": 0.17260697484016418, + "learning_rate": 1.1248964112094162e-05, + "loss": 1.6875, + "step": 25700 + }, + { + "epoch": 7.888581952117864, + "grad_norm": 0.20596550405025482, + "learning_rate": 1.1245823228733542e-05, + "loss": 1.7569, + "step": 25701 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 0.1724967509508133, + "learning_rate": 1.1242682728364428e-05, + "loss": 1.7063, + "step": 25702 + }, + { + "epoch": 7.889195825659914, + "grad_norm": 0.2189379185438156, + "learning_rate": 1.123954261101781e-05, + "loss": 1.789, + "step": 25703 + }, + { + "epoch": 7.889502762430939, + "grad_norm": 0.1539442539215088, + "learning_rate": 1.1236402876724766e-05, + "loss": 1.6573, + "step": 25704 + }, + { + "epoch": 7.889809699201964, + "grad_norm": 0.2854970693588257, + "learning_rate": 1.1233263525516313e-05, + "loss": 1.7683, + "step": 25705 + }, + { + "epoch": 7.89011663597299, + "grad_norm": 0.18263237178325653, + "learning_rate": 1.1230124557423465e-05, + "loss": 1.6911, + "step": 25706 + }, + { + "epoch": 7.890423572744015, + "grad_norm": 0.2098342627286911, + "learning_rate": 1.122698597247725e-05, + "loss": 1.7306, + "step": 25707 + }, + { + 
"epoch": 7.8907305095150395, + "grad_norm": 0.20822781324386597, + "learning_rate": 1.122384777070869e-05, + "loss": 1.7777, + "step": 25708 + }, + { + "epoch": 7.891037446286065, + "grad_norm": 0.24466483294963837, + "learning_rate": 1.122070995214879e-05, + "loss": 1.6966, + "step": 25709 + }, + { + "epoch": 7.89134438305709, + "grad_norm": 0.1500372439622879, + "learning_rate": 1.1217572516828561e-05, + "loss": 1.6787, + "step": 25710 + }, + { + "epoch": 7.8916513198281155, + "grad_norm": 0.2238166481256485, + "learning_rate": 1.1214435464779006e-05, + "loss": 1.7957, + "step": 25711 + }, + { + "epoch": 7.891958256599141, + "grad_norm": 0.22993433475494385, + "learning_rate": 1.1211298796031156e-05, + "loss": 1.7142, + "step": 25712 + }, + { + "epoch": 7.892265193370166, + "grad_norm": 0.15912945568561554, + "learning_rate": 1.1208162510615955e-05, + "loss": 1.7188, + "step": 25713 + }, + { + "epoch": 7.892572130141191, + "grad_norm": 0.2096986174583435, + "learning_rate": 1.1205026608564461e-05, + "loss": 1.7409, + "step": 25714 + }, + { + "epoch": 7.892879066912216, + "grad_norm": 0.18928684294223785, + "learning_rate": 1.1201891089907601e-05, + "loss": 1.6703, + "step": 25715 + }, + { + "epoch": 7.893186003683241, + "grad_norm": 0.19096077978610992, + "learning_rate": 1.119875595467641e-05, + "loss": 1.7393, + "step": 25716 + }, + { + "epoch": 7.893492940454267, + "grad_norm": 0.2286420315504074, + "learning_rate": 1.1195621202901851e-05, + "loss": 1.6995, + "step": 25717 + }, + { + "epoch": 7.893799877225292, + "grad_norm": 0.16288414597511292, + "learning_rate": 1.1192486834614912e-05, + "loss": 1.7334, + "step": 25718 + }, + { + "epoch": 7.894106813996316, + "grad_norm": 0.17358547449111938, + "learning_rate": 1.118935284984658e-05, + "loss": 1.7114, + "step": 25719 + }, + { + "epoch": 7.894413750767342, + "grad_norm": 0.16833151876926422, + "learning_rate": 1.1186219248627777e-05, + "loss": 1.6998, + "step": 25720 + }, + { + "epoch": 7.894720687538367, + 
"grad_norm": 0.14409767091274261, + "learning_rate": 1.118308603098952e-05, + "loss": 1.713, + "step": 25721 + }, + { + "epoch": 7.895027624309392, + "grad_norm": 0.18832024931907654, + "learning_rate": 1.1179953196962761e-05, + "loss": 1.6862, + "step": 25722 + }, + { + "epoch": 7.895334561080418, + "grad_norm": 0.1837761402130127, + "learning_rate": 1.1176820746578454e-05, + "loss": 1.6674, + "step": 25723 + }, + { + "epoch": 7.895641497851443, + "grad_norm": 0.14717474579811096, + "learning_rate": 1.1173688679867561e-05, + "loss": 1.6619, + "step": 25724 + }, + { + "epoch": 7.8959484346224675, + "grad_norm": 0.13512545824050903, + "learning_rate": 1.1170556996861032e-05, + "loss": 1.664, + "step": 25725 + }, + { + "epoch": 7.896255371393493, + "grad_norm": 0.21533837914466858, + "learning_rate": 1.1167425697589817e-05, + "loss": 1.7205, + "step": 25726 + }, + { + "epoch": 7.896562308164518, + "grad_norm": 0.15241803228855133, + "learning_rate": 1.1164294782084866e-05, + "loss": 1.6838, + "step": 25727 + }, + { + "epoch": 7.8968692449355435, + "grad_norm": 0.14889933168888092, + "learning_rate": 1.1161164250377099e-05, + "loss": 1.7197, + "step": 25728 + }, + { + "epoch": 7.897176181706568, + "grad_norm": 0.15948614478111267, + "learning_rate": 1.11580341024975e-05, + "loss": 1.6948, + "step": 25729 + }, + { + "epoch": 7.897483118477593, + "grad_norm": 0.17862235009670258, + "learning_rate": 1.1154904338476946e-05, + "loss": 1.743, + "step": 25730 + }, + { + "epoch": 7.897790055248619, + "grad_norm": 0.18168844282627106, + "learning_rate": 1.1151774958346422e-05, + "loss": 1.7291, + "step": 25731 + }, + { + "epoch": 7.898096992019644, + "grad_norm": 0.17636772990226746, + "learning_rate": 1.11486459621368e-05, + "loss": 1.7428, + "step": 25732 + }, + { + "epoch": 7.898403928790669, + "grad_norm": 0.1677904576063156, + "learning_rate": 1.1145517349879048e-05, + "loss": 1.7026, + "step": 25733 + }, + { + "epoch": 7.898710865561695, + "grad_norm": 
0.1851150244474411, + "learning_rate": 1.1142389121604063e-05, + "loss": 1.7743, + "step": 25734 + }, + { + "epoch": 7.899017802332719, + "grad_norm": 0.19713786244392395, + "learning_rate": 1.1139261277342767e-05, + "loss": 1.7287, + "step": 25735 + }, + { + "epoch": 7.899324739103744, + "grad_norm": 0.2060006707906723, + "learning_rate": 1.1136133817126076e-05, + "loss": 1.7377, + "step": 25736 + }, + { + "epoch": 7.89963167587477, + "grad_norm": 0.18026013672351837, + "learning_rate": 1.1133006740984864e-05, + "loss": 1.7322, + "step": 25737 + }, + { + "epoch": 7.899938612645795, + "grad_norm": 0.1787644922733307, + "learning_rate": 1.1129880048950075e-05, + "loss": 1.7457, + "step": 25738 + }, + { + "epoch": 7.9002455494168204, + "grad_norm": 0.16092467308044434, + "learning_rate": 1.1126753741052593e-05, + "loss": 1.7451, + "step": 25739 + }, + { + "epoch": 7.900552486187845, + "grad_norm": 0.15322941541671753, + "learning_rate": 1.1123627817323318e-05, + "loss": 1.667, + "step": 25740 + }, + { + "epoch": 7.90085942295887, + "grad_norm": 0.1488087922334671, + "learning_rate": 1.1120502277793137e-05, + "loss": 1.684, + "step": 25741 + }, + { + "epoch": 7.901166359729896, + "grad_norm": 0.15332907438278198, + "learning_rate": 1.111737712249294e-05, + "loss": 1.6646, + "step": 25742 + }, + { + "epoch": 7.901473296500921, + "grad_norm": 0.19801980257034302, + "learning_rate": 1.1114252351453614e-05, + "loss": 1.7469, + "step": 25743 + }, + { + "epoch": 7.901780233271946, + "grad_norm": 0.17123407125473022, + "learning_rate": 1.1111127964706035e-05, + "loss": 1.7319, + "step": 25744 + }, + { + "epoch": 7.902087170042972, + "grad_norm": 0.1753319650888443, + "learning_rate": 1.1108003962281066e-05, + "loss": 1.7212, + "step": 25745 + }, + { + "epoch": 7.902394106813996, + "grad_norm": 0.1598043441772461, + "learning_rate": 1.1104880344209634e-05, + "loss": 1.6823, + "step": 25746 + }, + { + "epoch": 7.902701043585021, + "grad_norm": 0.14227038621902466, + 
"learning_rate": 1.1101757110522538e-05, + "loss": 1.6665, + "step": 25747 + }, + { + "epoch": 7.903007980356047, + "grad_norm": 0.1531791388988495, + "learning_rate": 1.1098634261250706e-05, + "loss": 1.717, + "step": 25748 + }, + { + "epoch": 7.903314917127072, + "grad_norm": 0.18077540397644043, + "learning_rate": 1.109551179642494e-05, + "loss": 1.7237, + "step": 25749 + }, + { + "epoch": 7.903621853898097, + "grad_norm": 0.22373250126838684, + "learning_rate": 1.1092389716076145e-05, + "loss": 1.7678, + "step": 25750 + }, + { + "epoch": 7.903928790669122, + "grad_norm": 0.16022193431854248, + "learning_rate": 1.1089268020235166e-05, + "loss": 1.6985, + "step": 25751 + }, + { + "epoch": 7.904235727440147, + "grad_norm": 0.17306078970432281, + "learning_rate": 1.1086146708932837e-05, + "loss": 1.6653, + "step": 25752 + }, + { + "epoch": 7.9045426642111725, + "grad_norm": 0.16284874081611633, + "learning_rate": 1.1083025782200035e-05, + "loss": 1.6762, + "step": 25753 + }, + { + "epoch": 7.904849600982198, + "grad_norm": 0.17309556901454926, + "learning_rate": 1.107990524006755e-05, + "loss": 1.7103, + "step": 25754 + }, + { + "epoch": 7.905156537753223, + "grad_norm": 0.1508374810218811, + "learning_rate": 1.107678508256627e-05, + "loss": 1.6932, + "step": 25755 + }, + { + "epoch": 7.9054634745242485, + "grad_norm": 0.1941400021314621, + "learning_rate": 1.1073665309727016e-05, + "loss": 1.7922, + "step": 25756 + }, + { + "epoch": 7.905770411295273, + "grad_norm": 0.1890190988779068, + "learning_rate": 1.107054592158061e-05, + "loss": 1.6765, + "step": 25757 + }, + { + "epoch": 7.906077348066298, + "grad_norm": 0.19425363838672638, + "learning_rate": 1.1067426918157892e-05, + "loss": 1.7284, + "step": 25758 + }, + { + "epoch": 7.906384284837324, + "grad_norm": 0.18147888779640198, + "learning_rate": 1.1064308299489678e-05, + "loss": 1.7099, + "step": 25759 + }, + { + "epoch": 7.906691221608349, + "grad_norm": 0.19644278287887573, + "learning_rate": 
1.106119006560679e-05, + "loss": 1.7691, + "step": 25760 + }, + { + "epoch": 7.906998158379373, + "grad_norm": 0.14809735119342804, + "learning_rate": 1.1058072216540045e-05, + "loss": 1.6735, + "step": 25761 + }, + { + "epoch": 7.907305095150399, + "grad_norm": 0.17835088074207306, + "learning_rate": 1.105495475232024e-05, + "loss": 1.6928, + "step": 25762 + }, + { + "epoch": 7.907612031921424, + "grad_norm": 0.18341144919395447, + "learning_rate": 1.1051837672978227e-05, + "loss": 1.7393, + "step": 25763 + }, + { + "epoch": 7.907918968692449, + "grad_norm": 0.2026391327381134, + "learning_rate": 1.1048720978544753e-05, + "loss": 1.7037, + "step": 25764 + }, + { + "epoch": 7.908225905463475, + "grad_norm": 0.19855152070522308, + "learning_rate": 1.104560466905068e-05, + "loss": 1.7341, + "step": 25765 + }, + { + "epoch": 7.9085328422345, + "grad_norm": 0.18974080681800842, + "learning_rate": 1.1042488744526741e-05, + "loss": 1.6717, + "step": 25766 + }, + { + "epoch": 7.9088397790055245, + "grad_norm": 0.1727920025587082, + "learning_rate": 1.1039373205003784e-05, + "loss": 1.6994, + "step": 25767 + }, + { + "epoch": 7.90914671577655, + "grad_norm": 0.20549818873405457, + "learning_rate": 1.1036258050512566e-05, + "loss": 1.7055, + "step": 25768 + }, + { + "epoch": 7.909453652547575, + "grad_norm": 0.15696507692337036, + "learning_rate": 1.1033143281083891e-05, + "loss": 1.678, + "step": 25769 + }, + { + "epoch": 7.9097605893186005, + "grad_norm": 0.1568988859653473, + "learning_rate": 1.1030028896748546e-05, + "loss": 1.6855, + "step": 25770 + }, + { + "epoch": 7.910067526089626, + "grad_norm": 0.17795592546463013, + "learning_rate": 1.1026914897537266e-05, + "loss": 1.7306, + "step": 25771 + }, + { + "epoch": 7.91037446286065, + "grad_norm": 0.19906511902809143, + "learning_rate": 1.1023801283480872e-05, + "loss": 1.7125, + "step": 25772 + }, + { + "epoch": 7.910681399631676, + "grad_norm": 0.16972185671329498, + "learning_rate": 1.1020688054610118e-05, + 
"loss": 1.714, + "step": 25773 + }, + { + "epoch": 7.910988336402701, + "grad_norm": 0.20585502684116364, + "learning_rate": 1.1017575210955772e-05, + "loss": 1.7342, + "step": 25774 + }, + { + "epoch": 7.911295273173726, + "grad_norm": 0.1772177368402481, + "learning_rate": 1.1014462752548592e-05, + "loss": 1.7091, + "step": 25775 + }, + { + "epoch": 7.911602209944752, + "grad_norm": 0.1818380057811737, + "learning_rate": 1.1011350679419341e-05, + "loss": 1.7131, + "step": 25776 + }, + { + "epoch": 7.911909146715777, + "grad_norm": 0.17451459169387817, + "learning_rate": 1.1008238991598779e-05, + "loss": 1.6633, + "step": 25777 + }, + { + "epoch": 7.912216083486801, + "grad_norm": 0.18837687373161316, + "learning_rate": 1.100512768911765e-05, + "loss": 1.7132, + "step": 25778 + }, + { + "epoch": 7.912523020257827, + "grad_norm": 0.15283817052841187, + "learning_rate": 1.1002016772006695e-05, + "loss": 1.6833, + "step": 25779 + }, + { + "epoch": 7.912829957028852, + "grad_norm": 0.15264299511909485, + "learning_rate": 1.0998906240296692e-05, + "loss": 1.7098, + "step": 25780 + }, + { + "epoch": 7.913136893799877, + "grad_norm": 0.18866822123527527, + "learning_rate": 1.099579609401833e-05, + "loss": 1.7173, + "step": 25781 + }, + { + "epoch": 7.913443830570903, + "grad_norm": 0.19261083006858826, + "learning_rate": 1.0992686333202401e-05, + "loss": 1.7269, + "step": 25782 + }, + { + "epoch": 7.913750767341927, + "grad_norm": 0.19681799411773682, + "learning_rate": 1.0989576957879577e-05, + "loss": 1.6594, + "step": 25783 + }, + { + "epoch": 7.9140577041129525, + "grad_norm": 0.21298938989639282, + "learning_rate": 1.0986467968080639e-05, + "loss": 1.8509, + "step": 25784 + }, + { + "epoch": 7.914364640883978, + "grad_norm": 0.17769277095794678, + "learning_rate": 1.0983359363836287e-05, + "loss": 1.7177, + "step": 25785 + }, + { + "epoch": 7.914671577655003, + "grad_norm": 0.19831274449825287, + "learning_rate": 1.0980251145177246e-05, + "loss": 1.7107, + "step": 
25786 + }, + { + "epoch": 7.9149785144260285, + "grad_norm": 0.16204139590263367, + "learning_rate": 1.0977143312134248e-05, + "loss": 1.7052, + "step": 25787 + }, + { + "epoch": 7.915285451197054, + "grad_norm": 0.1709459275007248, + "learning_rate": 1.0974035864737958e-05, + "loss": 1.6944, + "step": 25788 + }, + { + "epoch": 7.915592387968078, + "grad_norm": 0.17710284888744354, + "learning_rate": 1.0970928803019142e-05, + "loss": 1.7253, + "step": 25789 + }, + { + "epoch": 7.915899324739104, + "grad_norm": 0.17316623032093048, + "learning_rate": 1.0967822127008481e-05, + "loss": 1.6458, + "step": 25790 + }, + { + "epoch": 7.916206261510129, + "grad_norm": 0.15644441545009613, + "learning_rate": 1.0964715836736677e-05, + "loss": 1.6749, + "step": 25791 + }, + { + "epoch": 7.916513198281154, + "grad_norm": 0.1425870954990387, + "learning_rate": 1.096160993223443e-05, + "loss": 1.7283, + "step": 25792 + }, + { + "epoch": 7.91682013505218, + "grad_norm": 0.1724596619606018, + "learning_rate": 1.0958504413532438e-05, + "loss": 1.7152, + "step": 25793 + }, + { + "epoch": 7.917127071823204, + "grad_norm": 0.20472319424152374, + "learning_rate": 1.0955399280661383e-05, + "loss": 1.7818, + "step": 25794 + }, + { + "epoch": 7.917434008594229, + "grad_norm": 0.18012158572673798, + "learning_rate": 1.0952294533651963e-05, + "loss": 1.6995, + "step": 25795 + }, + { + "epoch": 7.917740945365255, + "grad_norm": 0.1460564136505127, + "learning_rate": 1.0949190172534851e-05, + "loss": 1.6752, + "step": 25796 + }, + { + "epoch": 7.91804788213628, + "grad_norm": 0.16467545926570892, + "learning_rate": 1.0946086197340733e-05, + "loss": 1.7, + "step": 25797 + }, + { + "epoch": 7.918354818907305, + "grad_norm": 0.20123273134231567, + "learning_rate": 1.0942982608100266e-05, + "loss": 1.7423, + "step": 25798 + }, + { + "epoch": 7.918661755678331, + "grad_norm": 0.160671204328537, + "learning_rate": 1.0939879404844167e-05, + "loss": 1.6992, + "step": 25799 + }, + { + "epoch": 
7.918968692449355, + "grad_norm": 0.18679293990135193, + "learning_rate": 1.0936776587603043e-05, + "loss": 1.7789, + "step": 25800 + }, + { + "epoch": 7.9192756292203805, + "grad_norm": 0.1598452925682068, + "learning_rate": 1.0933674156407602e-05, + "loss": 1.6961, + "step": 25801 + }, + { + "epoch": 7.919582565991406, + "grad_norm": 0.13918142020702362, + "learning_rate": 1.0930572111288506e-05, + "loss": 1.6727, + "step": 25802 + }, + { + "epoch": 7.919889502762431, + "grad_norm": 0.16652320325374603, + "learning_rate": 1.0927470452276367e-05, + "loss": 1.7135, + "step": 25803 + }, + { + "epoch": 7.920196439533456, + "grad_norm": 0.1637706309556961, + "learning_rate": 1.0924369179401893e-05, + "loss": 1.7078, + "step": 25804 + }, + { + "epoch": 7.920503376304481, + "grad_norm": 0.19709086418151855, + "learning_rate": 1.092126829269568e-05, + "loss": 1.7425, + "step": 25805 + }, + { + "epoch": 7.920810313075506, + "grad_norm": 0.13402192294597626, + "learning_rate": 1.091816779218841e-05, + "loss": 1.663, + "step": 25806 + }, + { + "epoch": 7.921117249846532, + "grad_norm": 0.18932323157787323, + "learning_rate": 1.0915067677910718e-05, + "loss": 1.7651, + "step": 25807 + }, + { + "epoch": 7.921424186617557, + "grad_norm": 0.1586374193429947, + "learning_rate": 1.0911967949893231e-05, + "loss": 1.6709, + "step": 25808 + }, + { + "epoch": 7.921731123388582, + "grad_norm": 0.1570933312177658, + "learning_rate": 1.0908868608166589e-05, + "loss": 1.7166, + "step": 25809 + }, + { + "epoch": 7.922038060159607, + "grad_norm": 0.19786952435970306, + "learning_rate": 1.0905769652761416e-05, + "loss": 1.7347, + "step": 25810 + }, + { + "epoch": 7.922344996930632, + "grad_norm": 0.14969857037067413, + "learning_rate": 1.0902671083708343e-05, + "loss": 1.6471, + "step": 25811 + }, + { + "epoch": 7.922651933701657, + "grad_norm": 0.17460933327674866, + "learning_rate": 1.089957290103799e-05, + "loss": 1.7594, + "step": 25812 + }, + { + "epoch": 7.922958870472683, + 
"grad_norm": 0.17380566895008087, + "learning_rate": 1.0896475104780974e-05, + "loss": 1.6721, + "step": 25813 + }, + { + "epoch": 7.923265807243708, + "grad_norm": 0.1599249392747879, + "learning_rate": 1.0893377694967916e-05, + "loss": 1.6842, + "step": 25814 + }, + { + "epoch": 7.9235727440147325, + "grad_norm": 0.15319927036762238, + "learning_rate": 1.0890280671629398e-05, + "loss": 1.6529, + "step": 25815 + }, + { + "epoch": 7.923879680785758, + "grad_norm": 0.20122043788433075, + "learning_rate": 1.0887184034796082e-05, + "loss": 1.8009, + "step": 25816 + }, + { + "epoch": 7.924186617556783, + "grad_norm": 0.1726430058479309, + "learning_rate": 1.0884087784498515e-05, + "loss": 1.7595, + "step": 25817 + }, + { + "epoch": 7.9244935543278086, + "grad_norm": 0.1657346487045288, + "learning_rate": 1.0880991920767336e-05, + "loss": 1.7051, + "step": 25818 + }, + { + "epoch": 7.924800491098834, + "grad_norm": 0.19500960409641266, + "learning_rate": 1.0877896443633117e-05, + "loss": 1.6809, + "step": 25819 + }, + { + "epoch": 7.925107427869859, + "grad_norm": 0.18751180171966553, + "learning_rate": 1.087480135312644e-05, + "loss": 1.7613, + "step": 25820 + }, + { + "epoch": 7.925414364640884, + "grad_norm": 0.20735877752304077, + "learning_rate": 1.0871706649277935e-05, + "loss": 1.7515, + "step": 25821 + }, + { + "epoch": 7.925721301411909, + "grad_norm": 0.19349408149719238, + "learning_rate": 1.0868612332118133e-05, + "loss": 1.7053, + "step": 25822 + }, + { + "epoch": 7.926028238182934, + "grad_norm": 0.15639854967594147, + "learning_rate": 1.0865518401677649e-05, + "loss": 1.6907, + "step": 25823 + }, + { + "epoch": 7.92633517495396, + "grad_norm": 0.18366692960262299, + "learning_rate": 1.0862424857987059e-05, + "loss": 1.6791, + "step": 25824 + }, + { + "epoch": 7.926642111724985, + "grad_norm": 0.1648077666759491, + "learning_rate": 1.0859331701076913e-05, + "loss": 1.6671, + "step": 25825 + }, + { + "epoch": 7.9269490484960095, + "grad_norm": 
0.17894984781742096, + "learning_rate": 1.0856238930977802e-05, + "loss": 1.736, + "step": 25826 + }, + { + "epoch": 7.927255985267035, + "grad_norm": 0.13542817533016205, + "learning_rate": 1.0853146547720278e-05, + "loss": 1.6613, + "step": 25827 + }, + { + "epoch": 7.92756292203806, + "grad_norm": 0.1598762571811676, + "learning_rate": 1.0850054551334905e-05, + "loss": 1.6828, + "step": 25828 + }, + { + "epoch": 7.9278698588090855, + "grad_norm": 0.19212616980075836, + "learning_rate": 1.0846962941852235e-05, + "loss": 1.8198, + "step": 25829 + }, + { + "epoch": 7.928176795580111, + "grad_norm": 0.19344113767147064, + "learning_rate": 1.0843871719302829e-05, + "loss": 1.7804, + "step": 25830 + }, + { + "epoch": 7.928483732351136, + "grad_norm": 0.15460920333862305, + "learning_rate": 1.0840780883717233e-05, + "loss": 1.7372, + "step": 25831 + }, + { + "epoch": 7.928790669122161, + "grad_norm": 0.19987867772579193, + "learning_rate": 1.083769043512598e-05, + "loss": 1.6923, + "step": 25832 + }, + { + "epoch": 7.929097605893186, + "grad_norm": 0.15390315651893616, + "learning_rate": 1.083460037355965e-05, + "loss": 1.6864, + "step": 25833 + }, + { + "epoch": 7.929404542664211, + "grad_norm": 0.18596698343753815, + "learning_rate": 1.0831510699048724e-05, + "loss": 1.7135, + "step": 25834 + }, + { + "epoch": 7.929711479435237, + "grad_norm": 0.172935351729393, + "learning_rate": 1.0828421411623796e-05, + "loss": 1.7426, + "step": 25835 + }, + { + "epoch": 7.930018416206261, + "grad_norm": 0.2046828418970108, + "learning_rate": 1.0825332511315356e-05, + "loss": 1.7178, + "step": 25836 + }, + { + "epoch": 7.930325352977286, + "grad_norm": 0.1382901519536972, + "learning_rate": 1.0822243998153925e-05, + "loss": 1.6811, + "step": 25837 + }, + { + "epoch": 7.930632289748312, + "grad_norm": 0.1675405353307724, + "learning_rate": 1.0819155872170068e-05, + "loss": 1.7278, + "step": 25838 + }, + { + "epoch": 7.930939226519337, + "grad_norm": 0.16732639074325562, + 
"learning_rate": 1.0816068133394252e-05, + "loss": 1.6847, + "step": 25839 + }, + { + "epoch": 7.931246163290362, + "grad_norm": 0.17154982686042786, + "learning_rate": 1.0812980781857047e-05, + "loss": 1.7411, + "step": 25840 + }, + { + "epoch": 7.931553100061388, + "grad_norm": 0.16475310921669006, + "learning_rate": 1.08098938175889e-05, + "loss": 1.7222, + "step": 25841 + }, + { + "epoch": 7.931860036832412, + "grad_norm": 0.1613023579120636, + "learning_rate": 1.080680724062037e-05, + "loss": 1.718, + "step": 25842 + }, + { + "epoch": 7.9321669736034375, + "grad_norm": 0.16330939531326294, + "learning_rate": 1.0803721050981941e-05, + "loss": 1.7087, + "step": 25843 + }, + { + "epoch": 7.932473910374463, + "grad_norm": 0.15881259739398956, + "learning_rate": 1.0800635248704117e-05, + "loss": 1.7309, + "step": 25844 + }, + { + "epoch": 7.932780847145488, + "grad_norm": 0.19191724061965942, + "learning_rate": 1.0797549833817389e-05, + "loss": 1.7131, + "step": 25845 + }, + { + "epoch": 7.9330877839165135, + "grad_norm": 0.17083698511123657, + "learning_rate": 1.079446480635225e-05, + "loss": 1.7117, + "step": 25846 + }, + { + "epoch": 7.933394720687538, + "grad_norm": 0.18097929656505585, + "learning_rate": 1.0791380166339193e-05, + "loss": 1.7017, + "step": 25847 + }, + { + "epoch": 7.933701657458563, + "grad_norm": 0.1556827276945114, + "learning_rate": 1.0788295913808694e-05, + "loss": 1.7589, + "step": 25848 + }, + { + "epoch": 7.934008594229589, + "grad_norm": 0.1667819619178772, + "learning_rate": 1.0785212048791226e-05, + "loss": 1.6735, + "step": 25849 + }, + { + "epoch": 7.934315531000614, + "grad_norm": 0.18772241473197937, + "learning_rate": 1.0782128571317302e-05, + "loss": 1.6984, + "step": 25850 + }, + { + "epoch": 7.934622467771639, + "grad_norm": 0.1752445250749588, + "learning_rate": 1.0779045481417343e-05, + "loss": 1.6662, + "step": 25851 + }, + { + "epoch": 7.934929404542665, + "grad_norm": 0.16619165241718292, + "learning_rate": 
1.0775962779121873e-05, + "loss": 1.765, + "step": 25852 + }, + { + "epoch": 7.935236341313689, + "grad_norm": 0.1685585081577301, + "learning_rate": 1.0772880464461316e-05, + "loss": 1.6692, + "step": 25853 + }, + { + "epoch": 7.935543278084714, + "grad_norm": 0.16806848347187042, + "learning_rate": 1.076979853746613e-05, + "loss": 1.7081, + "step": 25854 + }, + { + "epoch": 7.93585021485574, + "grad_norm": 0.14273032546043396, + "learning_rate": 1.076671699816682e-05, + "loss": 1.6668, + "step": 25855 + }, + { + "epoch": 7.936157151626765, + "grad_norm": 0.24727863073349, + "learning_rate": 1.0763635846593778e-05, + "loss": 1.7624, + "step": 25856 + }, + { + "epoch": 7.93646408839779, + "grad_norm": 0.15679748356342316, + "learning_rate": 1.0760555082777506e-05, + "loss": 1.6851, + "step": 25857 + }, + { + "epoch": 7.936771025168815, + "grad_norm": 0.23388828337192535, + "learning_rate": 1.075747470674841e-05, + "loss": 1.7557, + "step": 25858 + }, + { + "epoch": 7.93707796193984, + "grad_norm": 0.15266747772693634, + "learning_rate": 1.0754394718536958e-05, + "loss": 1.6559, + "step": 25859 + }, + { + "epoch": 7.9373848987108655, + "grad_norm": 0.1945476084947586, + "learning_rate": 1.0751315118173577e-05, + "loss": 1.745, + "step": 25860 + }, + { + "epoch": 7.937691835481891, + "grad_norm": 0.18018878996372223, + "learning_rate": 1.0748235905688709e-05, + "loss": 1.7016, + "step": 25861 + }, + { + "epoch": 7.937998772252916, + "grad_norm": 0.1748870611190796, + "learning_rate": 1.0745157081112777e-05, + "loss": 1.6989, + "step": 25862 + }, + { + "epoch": 7.9383057090239415, + "grad_norm": 0.18253664672374725, + "learning_rate": 1.0742078644476217e-05, + "loss": 1.7554, + "step": 25863 + }, + { + "epoch": 7.938612645794966, + "grad_norm": 0.17009632289409637, + "learning_rate": 1.073900059580944e-05, + "loss": 1.7244, + "step": 25864 + }, + { + "epoch": 7.938919582565991, + "grad_norm": 0.17612707614898682, + "learning_rate": 1.0735922935142873e-05, + "loss": 
1.6939, + "step": 25865 + }, + { + "epoch": 7.939226519337017, + "grad_norm": 0.21207575500011444, + "learning_rate": 1.0732845662506913e-05, + "loss": 1.7097, + "step": 25866 + }, + { + "epoch": 7.939533456108042, + "grad_norm": 0.2073012739419937, + "learning_rate": 1.0729768777932014e-05, + "loss": 1.7658, + "step": 25867 + }, + { + "epoch": 7.939840392879067, + "grad_norm": 0.18888477981090546, + "learning_rate": 1.072669228144853e-05, + "loss": 1.7496, + "step": 25868 + }, + { + "epoch": 7.940147329650092, + "grad_norm": 0.1822361946105957, + "learning_rate": 1.0723616173086926e-05, + "loss": 1.7344, + "step": 25869 + }, + { + "epoch": 7.940454266421117, + "grad_norm": 0.18642890453338623, + "learning_rate": 1.0720540452877547e-05, + "loss": 1.7135, + "step": 25870 + }, + { + "epoch": 7.940761203192142, + "grad_norm": 0.19198815524578094, + "learning_rate": 1.0717465120850795e-05, + "loss": 1.7128, + "step": 25871 + }, + { + "epoch": 7.941068139963168, + "grad_norm": 0.1886969953775406, + "learning_rate": 1.0714390177037109e-05, + "loss": 1.7161, + "step": 25872 + }, + { + "epoch": 7.941375076734193, + "grad_norm": 0.19693820178508759, + "learning_rate": 1.0711315621466816e-05, + "loss": 1.7086, + "step": 25873 + }, + { + "epoch": 7.941682013505218, + "grad_norm": 0.19052870571613312, + "learning_rate": 1.0708241454170353e-05, + "loss": 1.7274, + "step": 25874 + }, + { + "epoch": 7.941988950276243, + "grad_norm": 0.23586300015449524, + "learning_rate": 1.0705167675178057e-05, + "loss": 1.7169, + "step": 25875 + }, + { + "epoch": 7.942295887047268, + "grad_norm": 0.2077670842409134, + "learning_rate": 1.0702094284520336e-05, + "loss": 1.7573, + "step": 25876 + }, + { + "epoch": 7.9426028238182935, + "grad_norm": 0.20345431566238403, + "learning_rate": 1.069902128222755e-05, + "loss": 1.6821, + "step": 25877 + }, + { + "epoch": 7.942909760589319, + "grad_norm": 0.1869240552186966, + "learning_rate": 1.0695948668330075e-05, + "loss": 1.6978, + "step": 25878 + }, 
+ { + "epoch": 7.943216697360343, + "grad_norm": 0.17814506590366364, + "learning_rate": 1.0692876442858274e-05, + "loss": 1.7027, + "step": 25879 + }, + { + "epoch": 7.943523634131369, + "grad_norm": 0.19093535840511322, + "learning_rate": 1.0689804605842502e-05, + "loss": 1.7863, + "step": 25880 + }, + { + "epoch": 7.943830570902394, + "grad_norm": 0.17859873175621033, + "learning_rate": 1.0686733157313123e-05, + "loss": 1.7431, + "step": 25881 + }, + { + "epoch": 7.944137507673419, + "grad_norm": 0.16613568365573883, + "learning_rate": 1.0683662097300484e-05, + "loss": 1.7517, + "step": 25882 + }, + { + "epoch": 7.944444444444445, + "grad_norm": 0.1588357836008072, + "learning_rate": 1.0680591425834934e-05, + "loss": 1.7017, + "step": 25883 + }, + { + "epoch": 7.94475138121547, + "grad_norm": 0.1667826622724533, + "learning_rate": 1.067752114294685e-05, + "loss": 1.6965, + "step": 25884 + }, + { + "epoch": 7.945058317986494, + "grad_norm": 0.2015296071767807, + "learning_rate": 1.0674451248666522e-05, + "loss": 1.7625, + "step": 25885 + }, + { + "epoch": 7.94536525475752, + "grad_norm": 0.17073483765125275, + "learning_rate": 1.0671381743024344e-05, + "loss": 1.7194, + "step": 25886 + }, + { + "epoch": 7.945672191528545, + "grad_norm": 0.16649815440177917, + "learning_rate": 1.0668312626050608e-05, + "loss": 1.7233, + "step": 25887 + }, + { + "epoch": 7.94597912829957, + "grad_norm": 0.14395855367183685, + "learning_rate": 1.0665243897775645e-05, + "loss": 1.6859, + "step": 25888 + }, + { + "epoch": 7.946286065070596, + "grad_norm": 0.18934515118598938, + "learning_rate": 1.0662175558229826e-05, + "loss": 1.6832, + "step": 25889 + }, + { + "epoch": 7.94659300184162, + "grad_norm": 0.16819562017917633, + "learning_rate": 1.0659107607443419e-05, + "loss": 1.7592, + "step": 25890 + }, + { + "epoch": 7.9468999386126455, + "grad_norm": 0.1701207458972931, + "learning_rate": 1.0656040045446798e-05, + "loss": 1.6909, + "step": 25891 + }, + { + "epoch": 
7.947206875383671, + "grad_norm": 0.18011561036109924, + "learning_rate": 1.0652972872270217e-05, + "loss": 1.7687, + "step": 25892 + }, + { + "epoch": 7.947513812154696, + "grad_norm": 0.15422853827476501, + "learning_rate": 1.0649906087944034e-05, + "loss": 1.6957, + "step": 25893 + }, + { + "epoch": 7.9478207489257215, + "grad_norm": 0.17223568260669708, + "learning_rate": 1.0646839692498545e-05, + "loss": 1.7368, + "step": 25894 + }, + { + "epoch": 7.948127685696747, + "grad_norm": 0.16706988215446472, + "learning_rate": 1.0643773685964053e-05, + "loss": 1.6981, + "step": 25895 + }, + { + "epoch": 7.948434622467771, + "grad_norm": 0.15490150451660156, + "learning_rate": 1.0640708068370853e-05, + "loss": 1.705, + "step": 25896 + }, + { + "epoch": 7.948741559238797, + "grad_norm": 0.16119123995304108, + "learning_rate": 1.0637642839749246e-05, + "loss": 1.7519, + "step": 25897 + }, + { + "epoch": 7.949048496009822, + "grad_norm": 0.1669061779975891, + "learning_rate": 1.0634578000129524e-05, + "loss": 1.7228, + "step": 25898 + }, + { + "epoch": 7.949355432780847, + "grad_norm": 0.1974606215953827, + "learning_rate": 1.0631513549541976e-05, + "loss": 1.7188, + "step": 25899 + }, + { + "epoch": 7.949662369551873, + "grad_norm": 0.204077810049057, + "learning_rate": 1.0628449488016873e-05, + "loss": 1.7397, + "step": 25900 + }, + { + "epoch": 7.949969306322897, + "grad_norm": 0.13561539351940155, + "learning_rate": 1.0625385815584537e-05, + "loss": 1.6457, + "step": 25901 + }, + { + "epoch": 7.9502762430939224, + "grad_norm": 0.1736447811126709, + "learning_rate": 1.0622322532275186e-05, + "loss": 1.7278, + "step": 25902 + }, + { + "epoch": 7.950583179864948, + "grad_norm": 0.1712762862443924, + "learning_rate": 1.061925963811915e-05, + "loss": 1.7208, + "step": 25903 + }, + { + "epoch": 7.950890116635973, + "grad_norm": 0.15313011407852173, + "learning_rate": 1.0616197133146661e-05, + "loss": 1.671, + "step": 25904 + }, + { + "epoch": 7.9511970534069984, + 
"grad_norm": 0.15110735595226288, + "learning_rate": 1.0613135017387981e-05, + "loss": 1.6568, + "step": 25905 + }, + { + "epoch": 7.951503990178024, + "grad_norm": 0.22678901255130768, + "learning_rate": 1.0610073290873413e-05, + "loss": 1.7415, + "step": 25906 + }, + { + "epoch": 7.951810926949048, + "grad_norm": 0.16936101019382477, + "learning_rate": 1.0607011953633162e-05, + "loss": 1.6983, + "step": 25907 + }, + { + "epoch": 7.952117863720074, + "grad_norm": 0.18443427979946136, + "learning_rate": 1.0603951005697533e-05, + "loss": 1.7334, + "step": 25908 + }, + { + "epoch": 7.952424800491099, + "grad_norm": 0.2290949672460556, + "learning_rate": 1.0600890447096729e-05, + "loss": 1.7219, + "step": 25909 + }, + { + "epoch": 7.952731737262124, + "grad_norm": 0.19244399666786194, + "learning_rate": 1.0597830277861026e-05, + "loss": 1.7047, + "step": 25910 + }, + { + "epoch": 7.953038674033149, + "grad_norm": 0.15806549787521362, + "learning_rate": 1.0594770498020657e-05, + "loss": 1.667, + "step": 25911 + }, + { + "epoch": 7.953345610804174, + "grad_norm": 0.23782655596733093, + "learning_rate": 1.0591711107605867e-05, + "loss": 1.7271, + "step": 25912 + }, + { + "epoch": 7.953652547575199, + "grad_norm": 0.18427079916000366, + "learning_rate": 1.0588652106646885e-05, + "loss": 1.7644, + "step": 25913 + }, + { + "epoch": 7.953959484346225, + "grad_norm": 0.18687991797924042, + "learning_rate": 1.058559349517394e-05, + "loss": 1.7045, + "step": 25914 + }, + { + "epoch": 7.95426642111725, + "grad_norm": 0.17435906827449799, + "learning_rate": 1.0582535273217265e-05, + "loss": 1.6681, + "step": 25915 + }, + { + "epoch": 7.954573357888275, + "grad_norm": 0.17601260542869568, + "learning_rate": 1.0579477440807079e-05, + "loss": 1.7141, + "step": 25916 + }, + { + "epoch": 7.9548802946593, + "grad_norm": 0.19225506484508514, + "learning_rate": 1.0576419997973586e-05, + "loss": 1.7224, + "step": 25917 + }, + { + "epoch": 7.955187231430325, + "grad_norm": 
0.18801991641521454, + "learning_rate": 1.0573362944747045e-05, + "loss": 1.715, + "step": 25918 + }, + { + "epoch": 7.9554941682013505, + "grad_norm": 0.21490465104579926, + "learning_rate": 1.0570306281157616e-05, + "loss": 1.7931, + "step": 25919 + }, + { + "epoch": 7.955801104972376, + "grad_norm": 0.1877163052558899, + "learning_rate": 1.0567250007235557e-05, + "loss": 1.7365, + "step": 25920 + }, + { + "epoch": 7.956108041743401, + "grad_norm": 0.18460121750831604, + "learning_rate": 1.0564194123011029e-05, + "loss": 1.7092, + "step": 25921 + }, + { + "epoch": 7.956414978514426, + "grad_norm": 0.1663859337568283, + "learning_rate": 1.0561138628514239e-05, + "loss": 1.6847, + "step": 25922 + }, + { + "epoch": 7.956721915285451, + "grad_norm": 0.1676093488931656, + "learning_rate": 1.0558083523775413e-05, + "loss": 1.6788, + "step": 25923 + }, + { + "epoch": 7.957028852056476, + "grad_norm": 0.17470842599868774, + "learning_rate": 1.0555028808824702e-05, + "loss": 1.7658, + "step": 25924 + }, + { + "epoch": 7.957335788827502, + "grad_norm": 0.17770788073539734, + "learning_rate": 1.0551974483692346e-05, + "loss": 1.6875, + "step": 25925 + }, + { + "epoch": 7.957642725598527, + "grad_norm": 0.17924711108207703, + "learning_rate": 1.054892054840847e-05, + "loss": 1.7024, + "step": 25926 + }, + { + "epoch": 7.957949662369552, + "grad_norm": 0.19387175142765045, + "learning_rate": 1.0545867003003296e-05, + "loss": 1.7806, + "step": 25927 + }, + { + "epoch": 7.958256599140577, + "grad_norm": 0.176667258143425, + "learning_rate": 1.0542813847506988e-05, + "loss": 1.7187, + "step": 25928 + }, + { + "epoch": 7.958563535911602, + "grad_norm": 0.1730370670557022, + "learning_rate": 1.0539761081949723e-05, + "loss": 1.6912, + "step": 25929 + }, + { + "epoch": 7.958870472682627, + "grad_norm": 0.1836516112089157, + "learning_rate": 1.0536708706361665e-05, + "loss": 1.684, + "step": 25930 + }, + { + "epoch": 7.959177409453653, + "grad_norm": 0.17236517369747162, + 
"learning_rate": 1.0533656720772983e-05, + "loss": 1.6799, + "step": 25931 + }, + { + "epoch": 7.959484346224678, + "grad_norm": 0.1655581295490265, + "learning_rate": 1.0530605125213832e-05, + "loss": 1.755, + "step": 25932 + }, + { + "epoch": 7.9597912829957025, + "grad_norm": 0.1801871806383133, + "learning_rate": 1.0527553919714383e-05, + "loss": 1.6998, + "step": 25933 + }, + { + "epoch": 7.960098219766728, + "grad_norm": 0.20504651963710785, + "learning_rate": 1.052450310430476e-05, + "loss": 1.7793, + "step": 25934 + }, + { + "epoch": 7.960405156537753, + "grad_norm": 0.2522159516811371, + "learning_rate": 1.052145267901517e-05, + "loss": 1.754, + "step": 25935 + }, + { + "epoch": 7.9607120933087785, + "grad_norm": 0.18074269592761993, + "learning_rate": 1.0518402643875691e-05, + "loss": 1.717, + "step": 25936 + }, + { + "epoch": 7.961019030079804, + "grad_norm": 0.16463595628738403, + "learning_rate": 1.0515352998916527e-05, + "loss": 1.6994, + "step": 25937 + }, + { + "epoch": 7.961325966850829, + "grad_norm": 0.17102178931236267, + "learning_rate": 1.0512303744167778e-05, + "loss": 1.6571, + "step": 25938 + }, + { + "epoch": 7.961632903621854, + "grad_norm": 0.14453014731407166, + "learning_rate": 1.0509254879659569e-05, + "loss": 1.6725, + "step": 25939 + }, + { + "epoch": 7.961939840392879, + "grad_norm": 0.1980808526277542, + "learning_rate": 1.050620640542208e-05, + "loss": 1.6847, + "step": 25940 + }, + { + "epoch": 7.962246777163904, + "grad_norm": 0.15021857619285583, + "learning_rate": 1.0503158321485378e-05, + "loss": 1.6896, + "step": 25941 + }, + { + "epoch": 7.96255371393493, + "grad_norm": 0.2223394513130188, + "learning_rate": 1.0500110627879639e-05, + "loss": 1.7167, + "step": 25942 + }, + { + "epoch": 7.962860650705955, + "grad_norm": 0.17636358737945557, + "learning_rate": 1.0497063324634937e-05, + "loss": 1.6625, + "step": 25943 + }, + { + "epoch": 7.963167587476979, + "grad_norm": 0.1823662370443344, + "learning_rate": 
1.049401641178142e-05, + "loss": 1.7139, + "step": 25944 + }, + { + "epoch": 7.963474524248005, + "grad_norm": 0.1740594059228897, + "learning_rate": 1.0490969889349189e-05, + "loss": 1.7447, + "step": 25945 + }, + { + "epoch": 7.96378146101903, + "grad_norm": 0.15838129818439484, + "learning_rate": 1.0487923757368351e-05, + "loss": 1.7051, + "step": 25946 + }, + { + "epoch": 7.964088397790055, + "grad_norm": 0.4309011399745941, + "learning_rate": 1.0484878015869005e-05, + "loss": 1.7442, + "step": 25947 + }, + { + "epoch": 7.964395334561081, + "grad_norm": 0.17090202867984772, + "learning_rate": 1.0481832664881257e-05, + "loss": 1.652, + "step": 25948 + }, + { + "epoch": 7.964702271332106, + "grad_norm": 0.16977159678936005, + "learning_rate": 1.0478787704435206e-05, + "loss": 1.6894, + "step": 25949 + }, + { + "epoch": 7.9650092081031305, + "grad_norm": 0.20473513007164001, + "learning_rate": 1.0475743134560934e-05, + "loss": 1.8141, + "step": 25950 + }, + { + "epoch": 7.965316144874156, + "grad_norm": 0.1775660663843155, + "learning_rate": 1.0472698955288535e-05, + "loss": 1.7204, + "step": 25951 + }, + { + "epoch": 7.965623081645181, + "grad_norm": 0.21351923048496246, + "learning_rate": 1.046965516664809e-05, + "loss": 1.7364, + "step": 25952 + }, + { + "epoch": 7.9659300184162065, + "grad_norm": 0.2034255862236023, + "learning_rate": 1.0466611768669671e-05, + "loss": 1.7096, + "step": 25953 + }, + { + "epoch": 7.966236955187231, + "grad_norm": 0.17075900733470917, + "learning_rate": 1.0463568761383396e-05, + "loss": 1.6928, + "step": 25954 + }, + { + "epoch": 7.966543891958256, + "grad_norm": 0.18142712116241455, + "learning_rate": 1.0460526144819288e-05, + "loss": 1.7146, + "step": 25955 + }, + { + "epoch": 7.966850828729282, + "grad_norm": 0.14901846647262573, + "learning_rate": 1.0457483919007427e-05, + "loss": 1.6841, + "step": 25956 + }, + { + "epoch": 7.967157765500307, + "grad_norm": 0.17380031943321228, + "learning_rate": 1.0454442083977912e-05, + 
"loss": 1.6911, + "step": 25957 + }, + { + "epoch": 7.967464702271332, + "grad_norm": 0.15983760356903076, + "learning_rate": 1.045140063976075e-05, + "loss": 1.6866, + "step": 25958 + }, + { + "epoch": 7.967771639042358, + "grad_norm": 0.1559101641178131, + "learning_rate": 1.0448359586386058e-05, + "loss": 1.6793, + "step": 25959 + }, + { + "epoch": 7.968078575813382, + "grad_norm": 0.14843949675559998, + "learning_rate": 1.0445318923883829e-05, + "loss": 1.6835, + "step": 25960 + }, + { + "epoch": 7.968385512584407, + "grad_norm": 0.16452330350875854, + "learning_rate": 1.0442278652284155e-05, + "loss": 1.7304, + "step": 25961 + }, + { + "epoch": 7.968692449355433, + "grad_norm": 0.18997763097286224, + "learning_rate": 1.0439238771617066e-05, + "loss": 1.7425, + "step": 25962 + }, + { + "epoch": 7.968999386126458, + "grad_norm": 0.1654025912284851, + "learning_rate": 1.0436199281912611e-05, + "loss": 1.6909, + "step": 25963 + }, + { + "epoch": 7.969306322897483, + "grad_norm": 0.1313011646270752, + "learning_rate": 1.0433160183200823e-05, + "loss": 1.6572, + "step": 25964 + }, + { + "epoch": 7.969613259668508, + "grad_norm": 0.1584165096282959, + "learning_rate": 1.043012147551174e-05, + "loss": 1.7257, + "step": 25965 + }, + { + "epoch": 7.969920196439533, + "grad_norm": 0.17830775678157806, + "learning_rate": 1.0427083158875384e-05, + "loss": 1.7382, + "step": 25966 + }, + { + "epoch": 7.9702271332105585, + "grad_norm": 0.19006042182445526, + "learning_rate": 1.0424045233321788e-05, + "loss": 1.7366, + "step": 25967 + }, + { + "epoch": 7.970534069981584, + "grad_norm": 0.15366297960281372, + "learning_rate": 1.0421007698880974e-05, + "loss": 1.7235, + "step": 25968 + }, + { + "epoch": 7.970841006752609, + "grad_norm": 0.14415831863880157, + "learning_rate": 1.0417970555582963e-05, + "loss": 1.6945, + "step": 25969 + }, + { + "epoch": 7.9711479435236345, + "grad_norm": 0.16916446387767792, + "learning_rate": 1.041493380345775e-05, + "loss": 1.7099, + "step": 
25970 + }, + { + "epoch": 7.971454880294659, + "grad_norm": 0.1456119269132614, + "learning_rate": 1.041189744253539e-05, + "loss": 1.6544, + "step": 25971 + }, + { + "epoch": 7.971761817065684, + "grad_norm": 0.20085962116718292, + "learning_rate": 1.040886147284585e-05, + "loss": 1.699, + "step": 25972 + }, + { + "epoch": 7.97206875383671, + "grad_norm": 0.1815454363822937, + "learning_rate": 1.0405825894419141e-05, + "loss": 1.7503, + "step": 25973 + }, + { + "epoch": 7.972375690607735, + "grad_norm": 0.2010805308818817, + "learning_rate": 1.040279070728527e-05, + "loss": 1.7061, + "step": 25974 + }, + { + "epoch": 7.97268262737876, + "grad_norm": 0.22105813026428223, + "learning_rate": 1.0399755911474218e-05, + "loss": 1.7262, + "step": 25975 + }, + { + "epoch": 7.972989564149785, + "grad_norm": 0.16186046600341797, + "learning_rate": 1.0396721507016017e-05, + "loss": 1.7229, + "step": 25976 + }, + { + "epoch": 7.97329650092081, + "grad_norm": 0.19990484416484833, + "learning_rate": 1.0393687493940597e-05, + "loss": 1.7006, + "step": 25977 + }, + { + "epoch": 7.973603437691835, + "grad_norm": 0.2377716600894928, + "learning_rate": 1.0390653872277983e-05, + "loss": 1.7302, + "step": 25978 + }, + { + "epoch": 7.973910374462861, + "grad_norm": 0.14087189733982086, + "learning_rate": 1.0387620642058148e-05, + "loss": 1.6563, + "step": 25979 + }, + { + "epoch": 7.974217311233886, + "grad_norm": 0.246252179145813, + "learning_rate": 1.0384587803311063e-05, + "loss": 1.6661, + "step": 25980 + }, + { + "epoch": 7.974524248004911, + "grad_norm": 0.18734396994113922, + "learning_rate": 1.0381555356066697e-05, + "loss": 1.7566, + "step": 25981 + }, + { + "epoch": 7.974831184775936, + "grad_norm": 0.1621570736169815, + "learning_rate": 1.0378523300355025e-05, + "loss": 1.6863, + "step": 25982 + }, + { + "epoch": 7.975138121546961, + "grad_norm": 0.2571845054626465, + "learning_rate": 1.0375491636206002e-05, + "loss": 1.7589, + "step": 25983 + }, + { + "epoch": 
7.975445058317987, + "grad_norm": 0.1880367249250412, + "learning_rate": 1.0372460363649606e-05, + "loss": 1.6999, + "step": 25984 + }, + { + "epoch": 7.975751995089012, + "grad_norm": 0.20473778247833252, + "learning_rate": 1.0369429482715776e-05, + "loss": 1.749, + "step": 25985 + }, + { + "epoch": 7.976058931860036, + "grad_norm": 0.19917427003383636, + "learning_rate": 1.0366398993434473e-05, + "loss": 1.701, + "step": 25986 + }, + { + "epoch": 7.976365868631062, + "grad_norm": 0.1758740097284317, + "learning_rate": 1.0363368895835635e-05, + "loss": 1.6774, + "step": 25987 + }, + { + "epoch": 7.976672805402087, + "grad_norm": 0.26412737369537354, + "learning_rate": 1.0360339189949242e-05, + "loss": 1.6778, + "step": 25988 + }, + { + "epoch": 7.976979742173112, + "grad_norm": 0.19599425792694092, + "learning_rate": 1.0357309875805194e-05, + "loss": 1.777, + "step": 25989 + }, + { + "epoch": 7.977286678944138, + "grad_norm": 0.2095821648836136, + "learning_rate": 1.0354280953433449e-05, + "loss": 1.7106, + "step": 25990 + }, + { + "epoch": 7.977593615715163, + "grad_norm": 0.1743748039007187, + "learning_rate": 1.0351252422863934e-05, + "loss": 1.6891, + "step": 25991 + }, + { + "epoch": 7.9779005524861875, + "grad_norm": 0.17273737490177155, + "learning_rate": 1.0348224284126573e-05, + "loss": 1.7254, + "step": 25992 + }, + { + "epoch": 7.978207489257213, + "grad_norm": 0.2032385915517807, + "learning_rate": 1.0345196537251322e-05, + "loss": 1.707, + "step": 25993 + }, + { + "epoch": 7.978514426028238, + "grad_norm": 0.17978399991989136, + "learning_rate": 1.0342169182268057e-05, + "loss": 1.695, + "step": 25994 + }, + { + "epoch": 7.9788213627992635, + "grad_norm": 0.20567134022712708, + "learning_rate": 1.0339142219206744e-05, + "loss": 1.6726, + "step": 25995 + }, + { + "epoch": 7.979128299570289, + "grad_norm": 0.19649706780910492, + "learning_rate": 1.033611564809725e-05, + "loss": 1.737, + "step": 25996 + }, + { + "epoch": 7.979435236341313, + "grad_norm": 
0.1640859991312027, + "learning_rate": 1.033308946896952e-05, + "loss": 1.6993, + "step": 25997 + }, + { + "epoch": 7.979742173112339, + "grad_norm": 0.21497343480587006, + "learning_rate": 1.0330063681853452e-05, + "loss": 1.7387, + "step": 25998 + }, + { + "epoch": 7.980049109883364, + "grad_norm": 0.14995479583740234, + "learning_rate": 1.0327038286778946e-05, + "loss": 1.6671, + "step": 25999 + }, + { + "epoch": 7.980356046654389, + "grad_norm": 0.1836833655834198, + "learning_rate": 1.0324013283775895e-05, + "loss": 1.7279, + "step": 26000 + }, + { + "epoch": 7.980662983425415, + "grad_norm": 0.14769285917282104, + "learning_rate": 1.032098867287421e-05, + "loss": 1.707, + "step": 26001 + }, + { + "epoch": 7.98096992019644, + "grad_norm": 0.24206426739692688, + "learning_rate": 1.0317964454103762e-05, + "loss": 1.8122, + "step": 26002 + }, + { + "epoch": 7.981276856967464, + "grad_norm": 0.16573204100131989, + "learning_rate": 1.0314940627494451e-05, + "loss": 1.7079, + "step": 26003 + }, + { + "epoch": 7.98158379373849, + "grad_norm": 0.1825968325138092, + "learning_rate": 1.0311917193076143e-05, + "loss": 1.6795, + "step": 26004 + }, + { + "epoch": 7.981890730509515, + "grad_norm": 0.14462140202522278, + "learning_rate": 1.0308894150878761e-05, + "loss": 1.7152, + "step": 26005 + }, + { + "epoch": 7.98219766728054, + "grad_norm": 0.15220513939857483, + "learning_rate": 1.0305871500932135e-05, + "loss": 1.6657, + "step": 26006 + }, + { + "epoch": 7.982504604051566, + "grad_norm": 0.17780731618404388, + "learning_rate": 1.030284924326615e-05, + "loss": 1.6852, + "step": 26007 + }, + { + "epoch": 7.98281154082259, + "grad_norm": 0.13492488861083984, + "learning_rate": 1.0299827377910681e-05, + "loss": 1.6331, + "step": 26008 + }, + { + "epoch": 7.9831184775936155, + "grad_norm": 0.1566525399684906, + "learning_rate": 1.0296805904895568e-05, + "loss": 1.6918, + "step": 26009 + }, + { + "epoch": 7.983425414364641, + "grad_norm": 0.17075398564338684, + 
"learning_rate": 1.0293784824250725e-05, + "loss": 1.7107, + "step": 26010 + }, + { + "epoch": 7.983732351135666, + "grad_norm": 0.16693715751171112, + "learning_rate": 1.0290764136005937e-05, + "loss": 1.6773, + "step": 26011 + }, + { + "epoch": 7.9840392879066915, + "grad_norm": 0.23020583391189575, + "learning_rate": 1.0287743840191122e-05, + "loss": 1.7389, + "step": 26012 + }, + { + "epoch": 7.984346224677717, + "grad_norm": 0.2185986489057541, + "learning_rate": 1.0284723936836071e-05, + "loss": 1.7039, + "step": 26013 + }, + { + "epoch": 7.984653161448741, + "grad_norm": 0.1527925282716751, + "learning_rate": 1.0281704425970673e-05, + "loss": 1.6981, + "step": 26014 + }, + { + "epoch": 7.984960098219767, + "grad_norm": 0.23389141261577606, + "learning_rate": 1.0278685307624747e-05, + "loss": 1.7511, + "step": 26015 + }, + { + "epoch": 7.985267034990792, + "grad_norm": 0.1481025218963623, + "learning_rate": 1.0275666581828137e-05, + "loss": 1.6551, + "step": 26016 + }, + { + "epoch": 7.985573971761817, + "grad_norm": 0.18131811916828156, + "learning_rate": 1.0272648248610672e-05, + "loss": 1.7024, + "step": 26017 + }, + { + "epoch": 7.985880908532843, + "grad_norm": 0.15969321131706238, + "learning_rate": 1.0269630308002182e-05, + "loss": 1.7269, + "step": 26018 + }, + { + "epoch": 7.986187845303867, + "grad_norm": 0.16655376553535461, + "learning_rate": 1.026661276003249e-05, + "loss": 1.6649, + "step": 26019 + }, + { + "epoch": 7.986494782074892, + "grad_norm": 0.16438528895378113, + "learning_rate": 1.0263595604731425e-05, + "loss": 1.6901, + "step": 26020 + }, + { + "epoch": 7.986801718845918, + "grad_norm": 0.23586809635162354, + "learning_rate": 1.0260578842128782e-05, + "loss": 1.7983, + "step": 26021 + }, + { + "epoch": 7.987108655616943, + "grad_norm": 0.15142324566841125, + "learning_rate": 1.0257562472254417e-05, + "loss": 1.6327, + "step": 26022 + }, + { + "epoch": 7.987415592387968, + "grad_norm": 0.17198510468006134, + "learning_rate": 
1.0254546495138096e-05, + "loss": 1.7119, + "step": 26023 + }, + { + "epoch": 7.987722529158994, + "grad_norm": 0.1675531417131424, + "learning_rate": 1.0251530910809648e-05, + "loss": 1.695, + "step": 26024 + }, + { + "epoch": 7.988029465930018, + "grad_norm": 0.17403315007686615, + "learning_rate": 1.0248515719298867e-05, + "loss": 1.7216, + "step": 26025 + }, + { + "epoch": 7.9883364027010435, + "grad_norm": 0.16039720177650452, + "learning_rate": 1.0245500920635537e-05, + "loss": 1.7315, + "step": 26026 + }, + { + "epoch": 7.988643339472069, + "grad_norm": 0.19715416431427002, + "learning_rate": 1.0242486514849498e-05, + "loss": 1.7308, + "step": 26027 + }, + { + "epoch": 7.988950276243094, + "grad_norm": 0.14576783776283264, + "learning_rate": 1.0239472501970482e-05, + "loss": 1.6589, + "step": 26028 + }, + { + "epoch": 7.989257213014119, + "grad_norm": 0.1631615310907364, + "learning_rate": 1.0236458882028333e-05, + "loss": 1.7494, + "step": 26029 + }, + { + "epoch": 7.989564149785144, + "grad_norm": 0.19368192553520203, + "learning_rate": 1.023344565505277e-05, + "loss": 1.735, + "step": 26030 + }, + { + "epoch": 7.989871086556169, + "grad_norm": 0.1902317851781845, + "learning_rate": 1.023043282107362e-05, + "loss": 1.7573, + "step": 26031 + }, + { + "epoch": 7.990178023327195, + "grad_norm": 0.18496233224868774, + "learning_rate": 1.0227420380120651e-05, + "loss": 1.7368, + "step": 26032 + }, + { + "epoch": 7.99048496009822, + "grad_norm": 0.172613263130188, + "learning_rate": 1.0224408332223617e-05, + "loss": 1.6943, + "step": 26033 + }, + { + "epoch": 7.990791896869245, + "grad_norm": 0.19840112328529358, + "learning_rate": 1.0221396677412293e-05, + "loss": 1.7562, + "step": 26034 + }, + { + "epoch": 7.99109883364027, + "grad_norm": 0.18129339814186096, + "learning_rate": 1.0218385415716441e-05, + "loss": 1.6746, + "step": 26035 + }, + { + "epoch": 7.991405770411295, + "grad_norm": 0.17933470010757446, + "learning_rate": 1.021537454716583e-05, + "loss": 
1.7324, + "step": 26036 + }, + { + "epoch": 7.99171270718232, + "grad_norm": 0.14947326481342316, + "learning_rate": 1.0212364071790198e-05, + "loss": 1.632, + "step": 26037 + }, + { + "epoch": 7.992019643953346, + "grad_norm": 0.18452878296375275, + "learning_rate": 1.0209353989619291e-05, + "loss": 1.6737, + "step": 26038 + }, + { + "epoch": 7.992326580724371, + "grad_norm": 0.18882198631763458, + "learning_rate": 1.0206344300682901e-05, + "loss": 1.7529, + "step": 26039 + }, + { + "epoch": 7.9926335174953955, + "grad_norm": 0.1855655312538147, + "learning_rate": 1.0203335005010722e-05, + "loss": 1.7347, + "step": 26040 + }, + { + "epoch": 7.992940454266421, + "grad_norm": 0.16447728872299194, + "learning_rate": 1.0200326102632518e-05, + "loss": 1.6659, + "step": 26041 + }, + { + "epoch": 7.993247391037446, + "grad_norm": 0.17379891872406006, + "learning_rate": 1.0197317593578016e-05, + "loss": 1.6962, + "step": 26042 + }, + { + "epoch": 7.9935543278084715, + "grad_norm": 0.16298875212669373, + "learning_rate": 1.0194309477876934e-05, + "loss": 1.6815, + "step": 26043 + }, + { + "epoch": 7.993861264579497, + "grad_norm": 0.1883227378129959, + "learning_rate": 1.0191301755559047e-05, + "loss": 1.7053, + "step": 26044 + }, + { + "epoch": 7.994168201350522, + "grad_norm": 0.20746919512748718, + "learning_rate": 1.0188294426654021e-05, + "loss": 1.7476, + "step": 26045 + }, + { + "epoch": 7.994475138121547, + "grad_norm": 0.1882137805223465, + "learning_rate": 1.0185287491191631e-05, + "loss": 1.7078, + "step": 26046 + }, + { + "epoch": 7.994782074892572, + "grad_norm": 0.21140792965888977, + "learning_rate": 1.0182280949201539e-05, + "loss": 1.7729, + "step": 26047 + }, + { + "epoch": 7.995089011663597, + "grad_norm": 0.18779736757278442, + "learning_rate": 1.0179274800713501e-05, + "loss": 1.7413, + "step": 26048 + }, + { + "epoch": 7.995395948434623, + "grad_norm": 0.1841782033443451, + "learning_rate": 1.0176269045757202e-05, + "loss": 1.7058, + "step": 26049 + 
}, + { + "epoch": 7.995702885205648, + "grad_norm": 0.19872064888477325, + "learning_rate": 1.017326368436236e-05, + "loss": 1.7522, + "step": 26050 + }, + { + "epoch": 7.996009821976672, + "grad_norm": 0.1763429492712021, + "learning_rate": 1.0170258716558667e-05, + "loss": 1.7178, + "step": 26051 + }, + { + "epoch": 7.996316758747698, + "grad_norm": 0.20209169387817383, + "learning_rate": 1.0167254142375826e-05, + "loss": 1.723, + "step": 26052 + }, + { + "epoch": 7.996623695518723, + "grad_norm": 0.15985172986984253, + "learning_rate": 1.0164249961843519e-05, + "loss": 1.6985, + "step": 26053 + }, + { + "epoch": 7.996930632289748, + "grad_norm": 0.1985132247209549, + "learning_rate": 1.0161246174991451e-05, + "loss": 1.7982, + "step": 26054 + }, + { + "epoch": 7.997237569060774, + "grad_norm": 0.17600803077220917, + "learning_rate": 1.0158242781849292e-05, + "loss": 1.7009, + "step": 26055 + }, + { + "epoch": 7.997544505831799, + "grad_norm": 0.15485480427742004, + "learning_rate": 1.015523978244673e-05, + "loss": 1.675, + "step": 26056 + }, + { + "epoch": 7.9978514426028235, + "grad_norm": 0.18465322256088257, + "learning_rate": 1.0152237176813446e-05, + "loss": 1.7156, + "step": 26057 + }, + { + "epoch": 7.998158379373849, + "grad_norm": 0.2183876633644104, + "learning_rate": 1.014923496497911e-05, + "loss": 1.7805, + "step": 26058 + }, + { + "epoch": 7.998465316144874, + "grad_norm": 0.18724960088729858, + "learning_rate": 1.014623314697339e-05, + "loss": 1.7047, + "step": 26059 + }, + { + "epoch": 7.9987722529158995, + "grad_norm": 0.15459159016609192, + "learning_rate": 1.0143231722825936e-05, + "loss": 1.6595, + "step": 26060 + }, + { + "epoch": 7.999079189686924, + "grad_norm": 0.16338171064853668, + "learning_rate": 1.0140230692566454e-05, + "loss": 1.6907, + "step": 26061 + }, + { + "epoch": 7.999386126457949, + "grad_norm": 0.16223935782909393, + "learning_rate": 1.013723005622455e-05, + "loss": 1.6866, + "step": 26062 + }, + { + "epoch": 
7.999693063228975, + "grad_norm": 0.18934771418571472, + "learning_rate": 1.0134229813829931e-05, + "loss": 1.706, + "step": 26063 + }, + { + "epoch": 8.0, + "grad_norm": 0.19117574393749237, + "learning_rate": 1.0131229965412191e-05, + "loss": 1.7392, + "step": 26064 + }, + { + "epoch": 8.000306936771025, + "grad_norm": 0.20491363108158112, + "learning_rate": 1.0128230511001019e-05, + "loss": 1.7488, + "step": 26065 + }, + { + "epoch": 8.00061387354205, + "grad_norm": 0.16383573412895203, + "learning_rate": 1.0125231450626043e-05, + "loss": 1.6958, + "step": 26066 + }, + { + "epoch": 8.000920810313076, + "grad_norm": 0.17405575513839722, + "learning_rate": 1.0122232784316898e-05, + "loss": 1.701, + "step": 26067 + }, + { + "epoch": 8.001227747084101, + "grad_norm": 0.1504749059677124, + "learning_rate": 1.0119234512103226e-05, + "loss": 1.6588, + "step": 26068 + }, + { + "epoch": 8.001534683855127, + "grad_norm": 0.15705156326293945, + "learning_rate": 1.0116236634014647e-05, + "loss": 1.6746, + "step": 26069 + }, + { + "epoch": 8.00184162062615, + "grad_norm": 0.18729639053344727, + "learning_rate": 1.01132391500808e-05, + "loss": 1.7634, + "step": 26070 + }, + { + "epoch": 8.002148557397176, + "grad_norm": 0.1855447143316269, + "learning_rate": 1.0110242060331304e-05, + "loss": 1.7588, + "step": 26071 + }, + { + "epoch": 8.002455494168201, + "grad_norm": 0.16488726437091827, + "learning_rate": 1.010724536479577e-05, + "loss": 1.7406, + "step": 26072 + }, + { + "epoch": 8.002762430939226, + "grad_norm": 0.17228275537490845, + "learning_rate": 1.0104249063503823e-05, + "loss": 1.7323, + "step": 26073 + }, + { + "epoch": 8.003069367710252, + "grad_norm": 0.1483743041753769, + "learning_rate": 1.0101253156485069e-05, + "loss": 1.7033, + "step": 26074 + }, + { + "epoch": 8.003376304481277, + "grad_norm": 0.2499883621931076, + "learning_rate": 1.0098257643769116e-05, + "loss": 1.7127, + "step": 26075 + }, + { + "epoch": 8.003683241252302, + "grad_norm": 
0.22971376776695251, + "learning_rate": 1.0095262525385568e-05, + "loss": 1.7582, + "step": 26076 + }, + { + "epoch": 8.003990178023328, + "grad_norm": 0.18424302339553833, + "learning_rate": 1.0092267801364014e-05, + "loss": 1.6948, + "step": 26077 + }, + { + "epoch": 8.004297114794353, + "grad_norm": 0.20067891478538513, + "learning_rate": 1.0089273471734085e-05, + "loss": 1.7259, + "step": 26078 + }, + { + "epoch": 8.004604051565378, + "grad_norm": 0.2022552639245987, + "learning_rate": 1.0086279536525322e-05, + "loss": 1.7332, + "step": 26079 + }, + { + "epoch": 8.004910988336404, + "grad_norm": 0.1658320426940918, + "learning_rate": 1.0083285995767362e-05, + "loss": 1.7424, + "step": 26080 + }, + { + "epoch": 8.005217925107427, + "grad_norm": 0.16180957853794098, + "learning_rate": 1.0080292849489741e-05, + "loss": 1.6797, + "step": 26081 + }, + { + "epoch": 8.005524861878452, + "grad_norm": 0.18383777141571045, + "learning_rate": 1.007730009772208e-05, + "loss": 1.7597, + "step": 26082 + }, + { + "epoch": 8.005831798649478, + "grad_norm": 0.17468489706516266, + "learning_rate": 1.0074307740493938e-05, + "loss": 1.7266, + "step": 26083 + }, + { + "epoch": 8.006138735420503, + "grad_norm": 0.1647786945104599, + "learning_rate": 1.0071315777834883e-05, + "loss": 1.6742, + "step": 26084 + }, + { + "epoch": 8.006445672191528, + "grad_norm": 0.23006537556648254, + "learning_rate": 1.0068324209774493e-05, + "loss": 1.6649, + "step": 26085 + }, + { + "epoch": 8.006752608962554, + "grad_norm": 0.19266989827156067, + "learning_rate": 1.0065333036342328e-05, + "loss": 1.7484, + "step": 26086 + }, + { + "epoch": 8.00705954573358, + "grad_norm": 0.1709250807762146, + "learning_rate": 1.0062342257567947e-05, + "loss": 1.6569, + "step": 26087 + }, + { + "epoch": 8.007366482504604, + "grad_norm": 0.15847361087799072, + "learning_rate": 1.005935187348091e-05, + "loss": 1.6907, + "step": 26088 + }, + { + "epoch": 8.00767341927563, + "grad_norm": 0.14707811176776886, + 
"learning_rate": 1.0056361884110765e-05, + "loss": 1.7121, + "step": 26089 + }, + { + "epoch": 8.007980356046655, + "grad_norm": 0.1740313321352005, + "learning_rate": 1.0053372289487067e-05, + "loss": 1.6978, + "step": 26090 + }, + { + "epoch": 8.008287292817679, + "grad_norm": 0.17271417379379272, + "learning_rate": 1.0050383089639354e-05, + "loss": 1.7673, + "step": 26091 + }, + { + "epoch": 8.008594229588704, + "grad_norm": 0.179611936211586, + "learning_rate": 1.0047394284597173e-05, + "loss": 1.7291, + "step": 26092 + }, + { + "epoch": 8.00890116635973, + "grad_norm": 0.1823183298110962, + "learning_rate": 1.0044405874390057e-05, + "loss": 1.7215, + "step": 26093 + }, + { + "epoch": 8.009208103130755, + "grad_norm": 0.2914387881755829, + "learning_rate": 1.004141785904753e-05, + "loss": 1.8169, + "step": 26094 + }, + { + "epoch": 8.00951503990178, + "grad_norm": 0.21860483288764954, + "learning_rate": 1.0038430238599156e-05, + "loss": 1.8372, + "step": 26095 + }, + { + "epoch": 8.009821976672805, + "grad_norm": 0.2060404270887375, + "learning_rate": 1.0035443013074407e-05, + "loss": 1.7224, + "step": 26096 + }, + { + "epoch": 8.01012891344383, + "grad_norm": 0.21953152120113373, + "learning_rate": 1.003245618250287e-05, + "loss": 1.7571, + "step": 26097 + }, + { + "epoch": 8.010435850214856, + "grad_norm": 0.16731835901737213, + "learning_rate": 1.0029469746913995e-05, + "loss": 1.7222, + "step": 26098 + }, + { + "epoch": 8.010742786985881, + "grad_norm": 0.19284974038600922, + "learning_rate": 1.0026483706337336e-05, + "loss": 1.6582, + "step": 26099 + }, + { + "epoch": 8.011049723756907, + "grad_norm": 0.14466765522956848, + "learning_rate": 1.00234980608024e-05, + "loss": 1.6772, + "step": 26100 + }, + { + "epoch": 8.011356660527932, + "grad_norm": 0.19553600251674652, + "learning_rate": 1.0020512810338688e-05, + "loss": 1.6841, + "step": 26101 + }, + { + "epoch": 8.011663597298956, + "grad_norm": 0.19986452162265778, + "learning_rate": 
1.0017527954975698e-05, + "loss": 1.7025, + "step": 26102 + }, + { + "epoch": 8.011970534069981, + "grad_norm": 0.17204077541828156, + "learning_rate": 1.0014543494742933e-05, + "loss": 1.7508, + "step": 26103 + }, + { + "epoch": 8.012277470841006, + "grad_norm": 0.19889704883098602, + "learning_rate": 1.0011559429669887e-05, + "loss": 1.6973, + "step": 26104 + }, + { + "epoch": 8.012584407612032, + "grad_norm": 0.16140232980251312, + "learning_rate": 1.0008575759786042e-05, + "loss": 1.7932, + "step": 26105 + }, + { + "epoch": 8.012891344383057, + "grad_norm": 0.21359173953533173, + "learning_rate": 1.0005592485120896e-05, + "loss": 1.6986, + "step": 26106 + }, + { + "epoch": 8.013198281154082, + "grad_norm": 0.1766652911901474, + "learning_rate": 1.0002609605703927e-05, + "loss": 1.7275, + "step": 26107 + }, + { + "epoch": 8.013505217925108, + "grad_norm": 0.176233172416687, + "learning_rate": 9.999627121564614e-06, + "loss": 1.6787, + "step": 26108 + }, + { + "epoch": 8.013812154696133, + "grad_norm": 0.15688678622245789, + "learning_rate": 9.996645032732426e-06, + "loss": 1.6917, + "step": 26109 + }, + { + "epoch": 8.014119091467158, + "grad_norm": 0.1363043189048767, + "learning_rate": 9.993663339236842e-06, + "loss": 1.6621, + "step": 26110 + }, + { + "epoch": 8.014426028238184, + "grad_norm": 0.1586332768201828, + "learning_rate": 9.990682041107313e-06, + "loss": 1.7161, + "step": 26111 + }, + { + "epoch": 8.014732965009209, + "grad_norm": 0.19763816893100739, + "learning_rate": 9.987701138373334e-06, + "loss": 1.736, + "step": 26112 + }, + { + "epoch": 8.015039901780233, + "grad_norm": 0.15302304923534393, + "learning_rate": 9.984720631064326e-06, + "loss": 1.6814, + "step": 26113 + }, + { + "epoch": 8.015346838551258, + "grad_norm": 0.1768827736377716, + "learning_rate": 9.981740519209786e-06, + "loss": 1.7006, + "step": 26114 + }, + { + "epoch": 8.015653775322283, + "grad_norm": 0.14857567846775055, + "learning_rate": 9.978760802839116e-06, + "loss": 
1.6891, + "step": 26115 + }, + { + "epoch": 8.015960712093309, + "grad_norm": 0.20578980445861816, + "learning_rate": 9.9757814819818e-06, + "loss": 1.7798, + "step": 26116 + }, + { + "epoch": 8.016267648864334, + "grad_norm": 0.16164197027683258, + "learning_rate": 9.97280255666727e-06, + "loss": 1.6855, + "step": 26117 + }, + { + "epoch": 8.01657458563536, + "grad_norm": 0.2176574170589447, + "learning_rate": 9.969824026924968e-06, + "loss": 1.8144, + "step": 26118 + }, + { + "epoch": 8.016881522406385, + "grad_norm": 0.16946040093898773, + "learning_rate": 9.966845892784326e-06, + "loss": 1.7029, + "step": 26119 + }, + { + "epoch": 8.01718845917741, + "grad_norm": 0.17593413591384888, + "learning_rate": 9.96386815427478e-06, + "loss": 1.6993, + "step": 26120 + }, + { + "epoch": 8.017495395948435, + "grad_norm": 0.16679200530052185, + "learning_rate": 9.96089081142575e-06, + "loss": 1.6993, + "step": 26121 + }, + { + "epoch": 8.01780233271946, + "grad_norm": 0.19294987618923187, + "learning_rate": 9.957913864266667e-06, + "loss": 1.7417, + "step": 26122 + }, + { + "epoch": 8.018109269490484, + "grad_norm": 0.17427025735378265, + "learning_rate": 9.954937312826951e-06, + "loss": 1.6957, + "step": 26123 + }, + { + "epoch": 8.01841620626151, + "grad_norm": 0.1996718794107437, + "learning_rate": 9.951961157136013e-06, + "loss": 1.7348, + "step": 26124 + }, + { + "epoch": 8.018723143032535, + "grad_norm": 0.19701123237609863, + "learning_rate": 9.948985397223271e-06, + "loss": 1.7336, + "step": 26125 + }, + { + "epoch": 8.01903007980356, + "grad_norm": 0.15205782651901245, + "learning_rate": 9.946010033118124e-06, + "loss": 1.6971, + "step": 26126 + }, + { + "epoch": 8.019337016574585, + "grad_norm": 0.16516798734664917, + "learning_rate": 9.943035064849986e-06, + "loss": 1.7176, + "step": 26127 + }, + { + "epoch": 8.01964395334561, + "grad_norm": 0.18073998391628265, + "learning_rate": 9.94006049244825e-06, + "loss": 1.7344, + "step": 26128 + }, + { + "epoch": 
8.019950890116636, + "grad_norm": 0.15453651547431946, + "learning_rate": 9.937086315942324e-06, + "loss": 1.7268, + "step": 26129 + }, + { + "epoch": 8.020257826887661, + "grad_norm": 0.17114359140396118, + "learning_rate": 9.934112535361574e-06, + "loss": 1.6708, + "step": 26130 + }, + { + "epoch": 8.020564763658687, + "grad_norm": 0.15452778339385986, + "learning_rate": 9.931139150735431e-06, + "loss": 1.697, + "step": 26131 + }, + { + "epoch": 8.020871700429712, + "grad_norm": 0.18605299293994904, + "learning_rate": 9.928166162093234e-06, + "loss": 1.7463, + "step": 26132 + }, + { + "epoch": 8.021178637200737, + "grad_norm": 0.14081695675849915, + "learning_rate": 9.925193569464398e-06, + "loss": 1.678, + "step": 26133 + }, + { + "epoch": 8.021485573971761, + "grad_norm": 0.15573516488075256, + "learning_rate": 9.922221372878288e-06, + "loss": 1.7125, + "step": 26134 + }, + { + "epoch": 8.021792510742786, + "grad_norm": 0.1690043956041336, + "learning_rate": 9.919249572364275e-06, + "loss": 1.7067, + "step": 26135 + }, + { + "epoch": 8.022099447513812, + "grad_norm": 0.1895153820514679, + "learning_rate": 9.91627816795173e-06, + "loss": 1.7098, + "step": 26136 + }, + { + "epoch": 8.022406384284837, + "grad_norm": 0.1467704176902771, + "learning_rate": 9.913307159670022e-06, + "loss": 1.666, + "step": 26137 + }, + { + "epoch": 8.022713321055862, + "grad_norm": 0.17272399365901947, + "learning_rate": 9.910336547548505e-06, + "loss": 1.7017, + "step": 26138 + }, + { + "epoch": 8.023020257826888, + "grad_norm": 0.16714219748973846, + "learning_rate": 9.907366331616541e-06, + "loss": 1.7096, + "step": 26139 + }, + { + "epoch": 8.023327194597913, + "grad_norm": 0.1545754224061966, + "learning_rate": 9.90439651190348e-06, + "loss": 1.6768, + "step": 26140 + }, + { + "epoch": 8.023634131368938, + "grad_norm": 0.17502975463867188, + "learning_rate": 9.901427088438675e-06, + "loss": 1.6879, + "step": 26141 + }, + { + "epoch": 8.023941068139964, + "grad_norm": 
0.15835684537887573, + "learning_rate": 9.898458061251465e-06, + "loss": 1.6908, + "step": 26142 + }, + { + "epoch": 8.024248004910989, + "grad_norm": 0.19534549117088318, + "learning_rate": 9.895489430371202e-06, + "loss": 1.7235, + "step": 26143 + }, + { + "epoch": 8.024554941682014, + "grad_norm": 0.18291355669498444, + "learning_rate": 9.89252119582722e-06, + "loss": 1.7618, + "step": 26144 + }, + { + "epoch": 8.024861878453038, + "grad_norm": 0.1474599689245224, + "learning_rate": 9.889553357648844e-06, + "loss": 1.7011, + "step": 26145 + }, + { + "epoch": 8.025168815224063, + "grad_norm": 0.1801324188709259, + "learning_rate": 9.886585915865421e-06, + "loss": 1.7386, + "step": 26146 + }, + { + "epoch": 8.025475751995089, + "grad_norm": 0.16178105771541595, + "learning_rate": 9.883618870506245e-06, + "loss": 1.6903, + "step": 26147 + }, + { + "epoch": 8.025782688766114, + "grad_norm": 0.15138550102710724, + "learning_rate": 9.880652221600694e-06, + "loss": 1.7064, + "step": 26148 + }, + { + "epoch": 8.02608962553714, + "grad_norm": 0.22056828439235687, + "learning_rate": 9.877685969178018e-06, + "loss": 1.7879, + "step": 26149 + }, + { + "epoch": 8.026396562308165, + "grad_norm": 0.15810613334178925, + "learning_rate": 9.874720113267599e-06, + "loss": 1.6895, + "step": 26150 + }, + { + "epoch": 8.02670349907919, + "grad_norm": 0.15241321921348572, + "learning_rate": 9.871754653898685e-06, + "loss": 1.7103, + "step": 26151 + }, + { + "epoch": 8.027010435850215, + "grad_norm": 0.1609175056219101, + "learning_rate": 9.868789591100625e-06, + "loss": 1.6845, + "step": 26152 + }, + { + "epoch": 8.02731737262124, + "grad_norm": 0.16068117320537567, + "learning_rate": 9.865824924902706e-06, + "loss": 1.6688, + "step": 26153 + }, + { + "epoch": 8.027624309392266, + "grad_norm": 0.14036257565021515, + "learning_rate": 9.862860655334233e-06, + "loss": 1.6881, + "step": 26154 + }, + { + "epoch": 8.027931246163291, + "grad_norm": 0.16418461501598358, + "learning_rate": 
9.859896782424494e-06, + "loss": 1.7265, + "step": 26155 + }, + { + "epoch": 8.028238182934315, + "grad_norm": 0.19456401467323303, + "learning_rate": 9.856933306202782e-06, + "loss": 1.7152, + "step": 26156 + }, + { + "epoch": 8.02854511970534, + "grad_norm": 0.14537569880485535, + "learning_rate": 9.853970226698384e-06, + "loss": 1.6918, + "step": 26157 + }, + { + "epoch": 8.028852056476365, + "grad_norm": 0.18725928664207458, + "learning_rate": 9.851007543940578e-06, + "loss": 1.6815, + "step": 26158 + }, + { + "epoch": 8.02915899324739, + "grad_norm": 0.17676733434200287, + "learning_rate": 9.848045257958649e-06, + "loss": 1.7741, + "step": 26159 + }, + { + "epoch": 8.029465930018416, + "grad_norm": 0.1890053004026413, + "learning_rate": 9.845083368781877e-06, + "loss": 1.7433, + "step": 26160 + }, + { + "epoch": 8.029772866789441, + "grad_norm": 0.16931703686714172, + "learning_rate": 9.84212187643952e-06, + "loss": 1.7474, + "step": 26161 + }, + { + "epoch": 8.030079803560467, + "grad_norm": 0.17416565120220184, + "learning_rate": 9.839160780960855e-06, + "loss": 1.7259, + "step": 26162 + }, + { + "epoch": 8.030386740331492, + "grad_norm": 0.17702054977416992, + "learning_rate": 9.83620008237514e-06, + "loss": 1.7166, + "step": 26163 + }, + { + "epoch": 8.030693677102517, + "grad_norm": 0.1579936146736145, + "learning_rate": 9.833239780711622e-06, + "loss": 1.6593, + "step": 26164 + }, + { + "epoch": 8.031000613873543, + "grad_norm": 0.2263452112674713, + "learning_rate": 9.830279875999604e-06, + "loss": 1.7735, + "step": 26165 + }, + { + "epoch": 8.031307550644566, + "grad_norm": 0.160926952958107, + "learning_rate": 9.827320368268273e-06, + "loss": 1.7, + "step": 26166 + }, + { + "epoch": 8.031614487415592, + "grad_norm": 0.21756359934806824, + "learning_rate": 9.824361257546938e-06, + "loss": 1.736, + "step": 26167 + }, + { + "epoch": 8.031921424186617, + "grad_norm": 0.20553551614284515, + "learning_rate": 9.821402543864783e-06, + "loss": 1.7254, + 
"step": 26168 + }, + { + "epoch": 8.032228360957642, + "grad_norm": 0.14283208549022675, + "learning_rate": 9.818444227251089e-06, + "loss": 1.6532, + "step": 26169 + }, + { + "epoch": 8.032535297728668, + "grad_norm": 0.22624479234218597, + "learning_rate": 9.815486307735084e-06, + "loss": 1.7933, + "step": 26170 + }, + { + "epoch": 8.032842234499693, + "grad_norm": 0.15582896769046783, + "learning_rate": 9.812528785345999e-06, + "loss": 1.6959, + "step": 26171 + }, + { + "epoch": 8.033149171270718, + "grad_norm": 0.19829398393630981, + "learning_rate": 9.809571660113055e-06, + "loss": 1.7431, + "step": 26172 + }, + { + "epoch": 8.033456108041744, + "grad_norm": 0.1469334214925766, + "learning_rate": 9.806614932065477e-06, + "loss": 1.7441, + "step": 26173 + }, + { + "epoch": 8.033763044812769, + "grad_norm": 0.17737391591072083, + "learning_rate": 9.803658601232491e-06, + "loss": 1.719, + "step": 26174 + }, + { + "epoch": 8.034069981583794, + "grad_norm": 0.16895830631256104, + "learning_rate": 9.800702667643314e-06, + "loss": 1.7169, + "step": 26175 + }, + { + "epoch": 8.03437691835482, + "grad_norm": 0.17256470024585724, + "learning_rate": 9.79774713132715e-06, + "loss": 1.712, + "step": 26176 + }, + { + "epoch": 8.034683855125843, + "grad_norm": 0.1516820341348648, + "learning_rate": 9.794791992313213e-06, + "loss": 1.6345, + "step": 26177 + }, + { + "epoch": 8.034990791896869, + "grad_norm": 0.20021840929985046, + "learning_rate": 9.79183725063071e-06, + "loss": 1.6962, + "step": 26178 + }, + { + "epoch": 8.035297728667894, + "grad_norm": 0.19088859856128693, + "learning_rate": 9.788882906308832e-06, + "loss": 1.7719, + "step": 26179 + }, + { + "epoch": 8.03560466543892, + "grad_norm": 0.16831208765506744, + "learning_rate": 9.78592895937679e-06, + "loss": 1.7101, + "step": 26180 + }, + { + "epoch": 8.035911602209945, + "grad_norm": 0.15665093064308167, + "learning_rate": 9.782975409863749e-06, + "loss": 1.7328, + "step": 26181 + }, + { + "epoch": 
8.03621853898097, + "grad_norm": 0.20523908734321594, + "learning_rate": 9.780022257798943e-06, + "loss": 1.7338, + "step": 26182 + }, + { + "epoch": 8.036525475751995, + "grad_norm": 0.15819329023361206, + "learning_rate": 9.777069503211505e-06, + "loss": 1.7116, + "step": 26183 + }, + { + "epoch": 8.03683241252302, + "grad_norm": 0.14828373491764069, + "learning_rate": 9.774117146130673e-06, + "loss": 1.6671, + "step": 26184 + }, + { + "epoch": 8.037139349294046, + "grad_norm": 0.17743347585201263, + "learning_rate": 9.771165186585563e-06, + "loss": 1.7474, + "step": 26185 + }, + { + "epoch": 8.037446286065071, + "grad_norm": 0.14112113416194916, + "learning_rate": 9.768213624605388e-06, + "loss": 1.6324, + "step": 26186 + }, + { + "epoch": 8.037753222836097, + "grad_norm": 0.14532047510147095, + "learning_rate": 9.76526246021931e-06, + "loss": 1.6814, + "step": 26187 + }, + { + "epoch": 8.03806015960712, + "grad_norm": 0.16272012889385223, + "learning_rate": 9.762311693456489e-06, + "loss": 1.6556, + "step": 26188 + }, + { + "epoch": 8.038367096378146, + "grad_norm": 0.17599201202392578, + "learning_rate": 9.759361324346088e-06, + "loss": 1.7186, + "step": 26189 + }, + { + "epoch": 8.03867403314917, + "grad_norm": 0.20449498295783997, + "learning_rate": 9.75641135291726e-06, + "loss": 1.7324, + "step": 26190 + }, + { + "epoch": 8.038980969920196, + "grad_norm": 0.1787404716014862, + "learning_rate": 9.753461779199168e-06, + "loss": 1.7038, + "step": 26191 + }, + { + "epoch": 8.039287906691222, + "grad_norm": 0.15954211354255676, + "learning_rate": 9.750512603220956e-06, + "loss": 1.6926, + "step": 26192 + }, + { + "epoch": 8.039594843462247, + "grad_norm": 0.21806633472442627, + "learning_rate": 9.747563825011768e-06, + "loss": 1.7317, + "step": 26193 + }, + { + "epoch": 8.039901780233272, + "grad_norm": 0.14846986532211304, + "learning_rate": 9.744615444600746e-06, + "loss": 1.655, + "step": 26194 + }, + { + "epoch": 8.040208717004298, + "grad_norm": 
0.17799098789691925, + "learning_rate": 9.74166746201703e-06, + "loss": 1.6899, + "step": 26195 + }, + { + "epoch": 8.040515653775323, + "grad_norm": 0.1648644655942917, + "learning_rate": 9.738719877289754e-06, + "loss": 1.7181, + "step": 26196 + }, + { + "epoch": 8.040822590546348, + "grad_norm": 0.17811881005764008, + "learning_rate": 9.735772690448042e-06, + "loss": 1.7257, + "step": 26197 + }, + { + "epoch": 8.041129527317372, + "grad_norm": 0.19059741497039795, + "learning_rate": 9.732825901521014e-06, + "loss": 1.7306, + "step": 26198 + }, + { + "epoch": 8.041436464088397, + "grad_norm": 0.17326456308364868, + "learning_rate": 9.729879510537825e-06, + "loss": 1.6922, + "step": 26199 + }, + { + "epoch": 8.041743400859422, + "grad_norm": 0.1428811252117157, + "learning_rate": 9.726933517527548e-06, + "loss": 1.6495, + "step": 26200 + }, + { + "epoch": 8.042050337630448, + "grad_norm": 0.1494823843240738, + "learning_rate": 9.72398792251934e-06, + "loss": 1.6779, + "step": 26201 + }, + { + "epoch": 8.042357274401473, + "grad_norm": 0.19112205505371094, + "learning_rate": 9.721042725542267e-06, + "loss": 1.7794, + "step": 26202 + }, + { + "epoch": 8.042664211172498, + "grad_norm": 0.15820644795894623, + "learning_rate": 9.718097926625468e-06, + "loss": 1.6834, + "step": 26203 + }, + { + "epoch": 8.042971147943524, + "grad_norm": 0.17020943760871887, + "learning_rate": 9.715153525798043e-06, + "loss": 1.6852, + "step": 26204 + }, + { + "epoch": 8.043278084714549, + "grad_norm": 0.18933680653572083, + "learning_rate": 9.712209523089072e-06, + "loss": 1.7412, + "step": 26205 + }, + { + "epoch": 8.043585021485574, + "grad_norm": 0.16407641768455505, + "learning_rate": 9.709265918527666e-06, + "loss": 1.7209, + "step": 26206 + }, + { + "epoch": 8.0438919582566, + "grad_norm": 0.19043506681919098, + "learning_rate": 9.706322712142912e-06, + "loss": 1.7351, + "step": 26207 + }, + { + "epoch": 8.044198895027625, + "grad_norm": 0.14904475212097168, + "learning_rate": 
9.703379903963889e-06, + "loss": 1.7484, + "step": 26208 + }, + { + "epoch": 8.044505831798649, + "grad_norm": 0.14778849482536316, + "learning_rate": 9.700437494019682e-06, + "loss": 1.7231, + "step": 26209 + }, + { + "epoch": 8.044812768569674, + "grad_norm": 0.186212420463562, + "learning_rate": 9.697495482339374e-06, + "loss": 1.7153, + "step": 26210 + }, + { + "epoch": 8.0451197053407, + "grad_norm": 0.13795694708824158, + "learning_rate": 9.694553868952044e-06, + "loss": 1.693, + "step": 26211 + }, + { + "epoch": 8.045426642111725, + "grad_norm": 0.16083405911922455, + "learning_rate": 9.69161265388675e-06, + "loss": 1.669, + "step": 26212 + }, + { + "epoch": 8.04573357888275, + "grad_norm": 0.15548262000083923, + "learning_rate": 9.688671837172569e-06, + "loss": 1.7265, + "step": 26213 + }, + { + "epoch": 8.046040515653775, + "grad_norm": 0.14771351218223572, + "learning_rate": 9.685731418838556e-06, + "loss": 1.6978, + "step": 26214 + }, + { + "epoch": 8.0463474524248, + "grad_norm": 0.1525130569934845, + "learning_rate": 9.682791398913765e-06, + "loss": 1.731, + "step": 26215 + }, + { + "epoch": 8.046654389195826, + "grad_norm": 0.16103293001651764, + "learning_rate": 9.679851777427284e-06, + "loss": 1.7015, + "step": 26216 + }, + { + "epoch": 8.046961325966851, + "grad_norm": 0.16990229487419128, + "learning_rate": 9.676912554408112e-06, + "loss": 1.6995, + "step": 26217 + }, + { + "epoch": 8.047268262737877, + "grad_norm": 0.14605717360973358, + "learning_rate": 9.673973729885355e-06, + "loss": 1.7085, + "step": 26218 + }, + { + "epoch": 8.047575199508902, + "grad_norm": 0.19646432995796204, + "learning_rate": 9.671035303887993e-06, + "loss": 1.8441, + "step": 26219 + }, + { + "epoch": 8.047882136279926, + "grad_norm": 0.2000361531972885, + "learning_rate": 9.668097276445115e-06, + "loss": 1.7126, + "step": 26220 + }, + { + "epoch": 8.04818907305095, + "grad_norm": 0.2262575775384903, + "learning_rate": 9.665159647585736e-06, + "loss": 1.7721, + "step": 
26221 + }, + { + "epoch": 8.048496009821976, + "grad_norm": 0.1880655288696289, + "learning_rate": 9.662222417338895e-06, + "loss": 1.7151, + "step": 26222 + }, + { + "epoch": 8.048802946593002, + "grad_norm": 0.1746743619441986, + "learning_rate": 9.659285585733613e-06, + "loss": 1.6745, + "step": 26223 + }, + { + "epoch": 8.049109883364027, + "grad_norm": 0.14917364716529846, + "learning_rate": 9.656349152798916e-06, + "loss": 1.6541, + "step": 26224 + }, + { + "epoch": 8.049416820135052, + "grad_norm": 0.18189994990825653, + "learning_rate": 9.65341311856382e-06, + "loss": 1.7361, + "step": 26225 + }, + { + "epoch": 8.049723756906078, + "grad_norm": 0.16237786412239075, + "learning_rate": 9.650477483057346e-06, + "loss": 1.7446, + "step": 26226 + }, + { + "epoch": 8.050030693677103, + "grad_norm": 0.1651264876127243, + "learning_rate": 9.647542246308506e-06, + "loss": 1.7604, + "step": 26227 + }, + { + "epoch": 8.050337630448128, + "grad_norm": 0.1673632264137268, + "learning_rate": 9.644607408346296e-06, + "loss": 1.678, + "step": 26228 + }, + { + "epoch": 8.050644567219154, + "grad_norm": 0.20457343757152557, + "learning_rate": 9.641672969199738e-06, + "loss": 1.6963, + "step": 26229 + }, + { + "epoch": 8.050951503990179, + "grad_norm": 0.15247805416584015, + "learning_rate": 9.638738928897816e-06, + "loss": 1.7036, + "step": 26230 + }, + { + "epoch": 8.051258440761202, + "grad_norm": 0.21655996143817902, + "learning_rate": 9.635805287469535e-06, + "loss": 1.7422, + "step": 26231 + }, + { + "epoch": 8.051565377532228, + "grad_norm": 0.1631101369857788, + "learning_rate": 9.632872044943869e-06, + "loss": 1.6681, + "step": 26232 + }, + { + "epoch": 8.051872314303253, + "grad_norm": 0.18587349355220795, + "learning_rate": 9.629939201349853e-06, + "loss": 1.7036, + "step": 26233 + }, + { + "epoch": 8.052179251074278, + "grad_norm": 0.272533655166626, + "learning_rate": 9.627006756716405e-06, + "loss": 1.818, + "step": 26234 + }, + { + "epoch": 8.052486187845304, + 
"grad_norm": 0.1740235984325409, + "learning_rate": 9.624074711072572e-06, + "loss": 1.7074, + "step": 26235 + }, + { + "epoch": 8.05279312461633, + "grad_norm": 0.21405693888664246, + "learning_rate": 9.621143064447274e-06, + "loss": 1.7473, + "step": 26236 + }, + { + "epoch": 8.053100061387354, + "grad_norm": 0.172579824924469, + "learning_rate": 9.618211816869515e-06, + "loss": 1.7154, + "step": 26237 + }, + { + "epoch": 8.05340699815838, + "grad_norm": 0.19767756760120392, + "learning_rate": 9.615280968368257e-06, + "loss": 1.7011, + "step": 26238 + }, + { + "epoch": 8.053713934929405, + "grad_norm": 0.18467654287815094, + "learning_rate": 9.612350518972463e-06, + "loss": 1.6922, + "step": 26239 + }, + { + "epoch": 8.05402087170043, + "grad_norm": 0.1530679613351822, + "learning_rate": 9.609420468711088e-06, + "loss": 1.6633, + "step": 26240 + }, + { + "epoch": 8.054327808471454, + "grad_norm": 0.3850557804107666, + "learning_rate": 9.6064908176131e-06, + "loss": 1.7637, + "step": 26241 + }, + { + "epoch": 8.05463474524248, + "grad_norm": 0.1556573212146759, + "learning_rate": 9.603561565707441e-06, + "loss": 1.6853, + "step": 26242 + }, + { + "epoch": 8.054941682013505, + "grad_norm": 0.2009180188179016, + "learning_rate": 9.600632713023067e-06, + "loss": 1.7172, + "step": 26243 + }, + { + "epoch": 8.05524861878453, + "grad_norm": 0.18538115918636322, + "learning_rate": 9.597704259588919e-06, + "loss": 1.7517, + "step": 26244 + }, + { + "epoch": 8.055555555555555, + "grad_norm": 0.1626463681459427, + "learning_rate": 9.594776205433936e-06, + "loss": 1.697, + "step": 26245 + }, + { + "epoch": 8.05586249232658, + "grad_norm": 0.15908029675483704, + "learning_rate": 9.591848550587062e-06, + "loss": 1.7355, + "step": 26246 + }, + { + "epoch": 8.056169429097606, + "grad_norm": 0.1679108589887619, + "learning_rate": 9.588921295077219e-06, + "loss": 1.6732, + "step": 26247 + }, + { + "epoch": 8.056476365868631, + "grad_norm": 0.17123237252235413, + "learning_rate": 
9.585994438933344e-06, + "loss": 1.7627, + "step": 26248 + }, + { + "epoch": 8.056783302639657, + "grad_norm": 0.2438436597585678, + "learning_rate": 9.583067982184346e-06, + "loss": 1.7475, + "step": 26249 + }, + { + "epoch": 8.057090239410682, + "grad_norm": 0.18769577145576477, + "learning_rate": 9.580141924859182e-06, + "loss": 1.7165, + "step": 26250 + }, + { + "epoch": 8.057397176181707, + "grad_norm": 0.18146662414073944, + "learning_rate": 9.577216266986727e-06, + "loss": 1.7601, + "step": 26251 + }, + { + "epoch": 8.057704112952731, + "grad_norm": 0.20209676027297974, + "learning_rate": 9.574291008595932e-06, + "loss": 1.7635, + "step": 26252 + }, + { + "epoch": 8.058011049723756, + "grad_norm": 0.16949260234832764, + "learning_rate": 9.571366149715665e-06, + "loss": 1.7437, + "step": 26253 + }, + { + "epoch": 8.058317986494782, + "grad_norm": 0.14449356496334076, + "learning_rate": 9.568441690374868e-06, + "loss": 1.6906, + "step": 26254 + }, + { + "epoch": 8.058624923265807, + "grad_norm": 0.21796976029872894, + "learning_rate": 9.565517630602428e-06, + "loss": 1.7986, + "step": 26255 + }, + { + "epoch": 8.058931860036832, + "grad_norm": 0.15194009244441986, + "learning_rate": 9.562593970427241e-06, + "loss": 1.6838, + "step": 26256 + }, + { + "epoch": 8.059238796807858, + "grad_norm": 0.19820080697536469, + "learning_rate": 9.559670709878198e-06, + "loss": 1.7327, + "step": 26257 + }, + { + "epoch": 8.059545733578883, + "grad_norm": 0.1478637307882309, + "learning_rate": 9.5567478489842e-06, + "loss": 1.6814, + "step": 26258 + }, + { + "epoch": 8.059852670349908, + "grad_norm": 0.147980734705925, + "learning_rate": 9.553825387774118e-06, + "loss": 1.693, + "step": 26259 + }, + { + "epoch": 8.060159607120934, + "grad_norm": 0.16274768114089966, + "learning_rate": 9.550903326276839e-06, + "loss": 1.7275, + "step": 26260 + }, + { + "epoch": 8.060466543891959, + "grad_norm": 0.16221144795417786, + "learning_rate": 9.547981664521244e-06, + "loss": 1.7071, + 
"step": 26261 + }, + { + "epoch": 8.060773480662984, + "grad_norm": 0.18921487033367157, + "learning_rate": 9.545060402536204e-06, + "loss": 1.6771, + "step": 26262 + }, + { + "epoch": 8.061080417434008, + "grad_norm": 0.19136327505111694, + "learning_rate": 9.542139540350586e-06, + "loss": 1.7235, + "step": 26263 + }, + { + "epoch": 8.061387354205033, + "grad_norm": 0.18764656782150269, + "learning_rate": 9.539219077993261e-06, + "loss": 1.7374, + "step": 26264 + }, + { + "epoch": 8.061694290976058, + "grad_norm": 0.16516967117786407, + "learning_rate": 9.53629901549309e-06, + "loss": 1.7124, + "step": 26265 + }, + { + "epoch": 8.062001227747084, + "grad_norm": 0.1457880437374115, + "learning_rate": 9.533379352878907e-06, + "loss": 1.6471, + "step": 26266 + }, + { + "epoch": 8.06230816451811, + "grad_norm": 0.1898411363363266, + "learning_rate": 9.530460090179622e-06, + "loss": 1.7745, + "step": 26267 + }, + { + "epoch": 8.062615101289135, + "grad_norm": 0.18252579867839813, + "learning_rate": 9.52754122742402e-06, + "loss": 1.7165, + "step": 26268 + }, + { + "epoch": 8.06292203806016, + "grad_norm": 0.1838676929473877, + "learning_rate": 9.524622764641006e-06, + "loss": 1.7169, + "step": 26269 + }, + { + "epoch": 8.063228974831185, + "grad_norm": 0.1684531718492508, + "learning_rate": 9.521704701859362e-06, + "loss": 1.6831, + "step": 26270 + }, + { + "epoch": 8.06353591160221, + "grad_norm": 0.18296435475349426, + "learning_rate": 9.51878703910798e-06, + "loss": 1.6952, + "step": 26271 + }, + { + "epoch": 8.063842848373236, + "grad_norm": 0.20634715259075165, + "learning_rate": 9.515869776415665e-06, + "loss": 1.6899, + "step": 26272 + }, + { + "epoch": 8.06414978514426, + "grad_norm": 0.18681001663208008, + "learning_rate": 9.512952913811252e-06, + "loss": 1.6648, + "step": 26273 + }, + { + "epoch": 8.064456721915285, + "grad_norm": 0.19397646188735962, + "learning_rate": 9.510036451323568e-06, + "loss": 1.7309, + "step": 26274 + }, + { + "epoch": 
8.06476365868631, + "grad_norm": 0.17254865169525146, + "learning_rate": 9.507120388981438e-06, + "loss": 1.6671, + "step": 26275 + }, + { + "epoch": 8.065070595457335, + "grad_norm": 0.16224531829357147, + "learning_rate": 9.504204726813682e-06, + "loss": 1.6881, + "step": 26276 + }, + { + "epoch": 8.06537753222836, + "grad_norm": 0.16534289717674255, + "learning_rate": 9.501289464849106e-06, + "loss": 1.7372, + "step": 26277 + }, + { + "epoch": 8.065684468999386, + "grad_norm": 0.20247776806354523, + "learning_rate": 9.498374603116523e-06, + "loss": 1.7108, + "step": 26278 + }, + { + "epoch": 8.065991405770411, + "grad_norm": 0.1420232504606247, + "learning_rate": 9.49546014164474e-06, + "loss": 1.6403, + "step": 26279 + }, + { + "epoch": 8.066298342541437, + "grad_norm": 0.139396533370018, + "learning_rate": 9.492546080462567e-06, + "loss": 1.6578, + "step": 26280 + }, + { + "epoch": 8.066605279312462, + "grad_norm": 0.17437872290611267, + "learning_rate": 9.489632419598788e-06, + "loss": 1.7094, + "step": 26281 + }, + { + "epoch": 8.066912216083487, + "grad_norm": 0.29614368081092834, + "learning_rate": 9.486719159082209e-06, + "loss": 1.773, + "step": 26282 + }, + { + "epoch": 8.067219152854513, + "grad_norm": 0.20771834254264832, + "learning_rate": 9.483806298941617e-06, + "loss": 1.7421, + "step": 26283 + }, + { + "epoch": 8.067526089625536, + "grad_norm": 0.20772570371627808, + "learning_rate": 9.4808938392058e-06, + "loss": 1.7437, + "step": 26284 + }, + { + "epoch": 8.067833026396562, + "grad_norm": 0.1837359070777893, + "learning_rate": 9.477981779903522e-06, + "loss": 1.7142, + "step": 26285 + }, + { + "epoch": 8.068139963167587, + "grad_norm": 0.18425285816192627, + "learning_rate": 9.475070121063607e-06, + "loss": 1.6804, + "step": 26286 + }, + { + "epoch": 8.068446899938612, + "grad_norm": 0.16501453518867493, + "learning_rate": 9.472158862714775e-06, + "loss": 1.7466, + "step": 26287 + }, + { + "epoch": 8.068753836709638, + "grad_norm": 
0.17685455083847046, + "learning_rate": 9.469248004885839e-06, + "loss": 1.6839, + "step": 26288 + }, + { + "epoch": 8.069060773480663, + "grad_norm": 0.18923965096473694, + "learning_rate": 9.466337547605547e-06, + "loss": 1.6774, + "step": 26289 + }, + { + "epoch": 8.069367710251688, + "grad_norm": 0.17584268748760223, + "learning_rate": 9.463427490902665e-06, + "loss": 1.6904, + "step": 26290 + }, + { + "epoch": 8.069674647022714, + "grad_norm": 0.25477278232574463, + "learning_rate": 9.460517834805966e-06, + "loss": 1.7898, + "step": 26291 + }, + { + "epoch": 8.069981583793739, + "grad_norm": 0.23453976213932037, + "learning_rate": 9.457608579344169e-06, + "loss": 1.7456, + "step": 26292 + }, + { + "epoch": 8.070288520564764, + "grad_norm": 0.20332537591457367, + "learning_rate": 9.45469972454605e-06, + "loss": 1.76, + "step": 26293 + }, + { + "epoch": 8.07059545733579, + "grad_norm": 0.1937316656112671, + "learning_rate": 9.451791270440358e-06, + "loss": 1.698, + "step": 26294 + }, + { + "epoch": 8.070902394106813, + "grad_norm": 0.19909465312957764, + "learning_rate": 9.448883217055832e-06, + "loss": 1.7373, + "step": 26295 + }, + { + "epoch": 8.071209330877839, + "grad_norm": 0.16824916005134583, + "learning_rate": 9.445975564421206e-06, + "loss": 1.6619, + "step": 26296 + }, + { + "epoch": 8.071516267648864, + "grad_norm": 0.17873473465442657, + "learning_rate": 9.443068312565222e-06, + "loss": 1.7438, + "step": 26297 + }, + { + "epoch": 8.07182320441989, + "grad_norm": 0.152094304561615, + "learning_rate": 9.440161461516606e-06, + "loss": 1.6513, + "step": 26298 + }, + { + "epoch": 8.072130141190915, + "grad_norm": 0.14592084288597107, + "learning_rate": 9.43725501130409e-06, + "loss": 1.6503, + "step": 26299 + }, + { + "epoch": 8.07243707796194, + "grad_norm": 0.16904598474502563, + "learning_rate": 9.434348961956396e-06, + "loss": 1.6929, + "step": 26300 + }, + { + "epoch": 8.072744014732965, + "grad_norm": 0.15297052264213562, + "learning_rate": 
9.431443313502235e-06, + "loss": 1.6871, + "step": 26301 + }, + { + "epoch": 8.07305095150399, + "grad_norm": 0.20306609570980072, + "learning_rate": 9.428538065970321e-06, + "loss": 1.7779, + "step": 26302 + }, + { + "epoch": 8.073357888275016, + "grad_norm": 0.177826926112175, + "learning_rate": 9.425633219389401e-06, + "loss": 1.7021, + "step": 26303 + }, + { + "epoch": 8.073664825046041, + "grad_norm": 0.22192324697971344, + "learning_rate": 9.422728773788125e-06, + "loss": 1.7713, + "step": 26304 + }, + { + "epoch": 8.073971761817067, + "grad_norm": 0.16998204588890076, + "learning_rate": 9.419824729195253e-06, + "loss": 1.6994, + "step": 26305 + }, + { + "epoch": 8.07427869858809, + "grad_norm": 0.1606592983007431, + "learning_rate": 9.416921085639436e-06, + "loss": 1.7274, + "step": 26306 + }, + { + "epoch": 8.074585635359115, + "grad_norm": 0.17434780299663544, + "learning_rate": 9.414017843149398e-06, + "loss": 1.714, + "step": 26307 + }, + { + "epoch": 8.07489257213014, + "grad_norm": 0.16548825800418854, + "learning_rate": 9.411115001753839e-06, + "loss": 1.7361, + "step": 26308 + }, + { + "epoch": 8.075199508901166, + "grad_norm": 0.23958922922611237, + "learning_rate": 9.408212561481405e-06, + "loss": 1.7286, + "step": 26309 + }, + { + "epoch": 8.075506445672191, + "grad_norm": 0.1900513619184494, + "learning_rate": 9.405310522360821e-06, + "loss": 1.7309, + "step": 26310 + }, + { + "epoch": 8.075813382443217, + "grad_norm": 0.1576761156320572, + "learning_rate": 9.402408884420755e-06, + "loss": 1.7039, + "step": 26311 + }, + { + "epoch": 8.076120319214242, + "grad_norm": 0.17078427970409393, + "learning_rate": 9.399507647689875e-06, + "loss": 1.737, + "step": 26312 + }, + { + "epoch": 8.076427255985267, + "grad_norm": 0.138477623462677, + "learning_rate": 9.396606812196856e-06, + "loss": 1.6673, + "step": 26313 + }, + { + "epoch": 8.076734192756293, + "grad_norm": 0.1546505093574524, + "learning_rate": 9.393706377970368e-06, + "loss": 1.7146, + 
"step": 26314 + }, + { + "epoch": 8.077041129527318, + "grad_norm": 0.14440344274044037, + "learning_rate": 9.390806345039077e-06, + "loss": 1.7044, + "step": 26315 + }, + { + "epoch": 8.077348066298342, + "grad_norm": 0.1944594532251358, + "learning_rate": 9.387906713431632e-06, + "loss": 1.7685, + "step": 26316 + }, + { + "epoch": 8.077655003069367, + "grad_norm": 0.17758207023143768, + "learning_rate": 9.385007483176706e-06, + "loss": 1.7068, + "step": 26317 + }, + { + "epoch": 8.077961939840392, + "grad_norm": 0.20713698863983154, + "learning_rate": 9.382108654302934e-06, + "loss": 1.6488, + "step": 26318 + }, + { + "epoch": 8.078268876611418, + "grad_norm": 0.14699894189834595, + "learning_rate": 9.379210226838958e-06, + "loss": 1.6746, + "step": 26319 + }, + { + "epoch": 8.078575813382443, + "grad_norm": 0.15119978785514832, + "learning_rate": 9.376312200813465e-06, + "loss": 1.6919, + "step": 26320 + }, + { + "epoch": 8.078882750153468, + "grad_norm": 0.14071249961853027, + "learning_rate": 9.373414576255041e-06, + "loss": 1.6755, + "step": 26321 + }, + { + "epoch": 8.079189686924494, + "grad_norm": 0.22004422545433044, + "learning_rate": 9.370517353192365e-06, + "loss": 1.7808, + "step": 26322 + }, + { + "epoch": 8.079496623695519, + "grad_norm": 0.15764497220516205, + "learning_rate": 9.36762053165403e-06, + "loss": 1.7108, + "step": 26323 + }, + { + "epoch": 8.079803560466544, + "grad_norm": 0.17802847921848297, + "learning_rate": 9.364724111668693e-06, + "loss": 1.7274, + "step": 26324 + }, + { + "epoch": 8.08011049723757, + "grad_norm": 0.16950444877147675, + "learning_rate": 9.361828093264984e-06, + "loss": 1.7196, + "step": 26325 + }, + { + "epoch": 8.080417434008595, + "grad_norm": 0.16647809743881226, + "learning_rate": 9.358932476471488e-06, + "loss": 1.7027, + "step": 26326 + }, + { + "epoch": 8.080724370779619, + "grad_norm": 0.20012708008289337, + "learning_rate": 9.356037261316863e-06, + "loss": 1.7101, + "step": 26327 + }, + { + "epoch": 
8.081031307550644, + "grad_norm": 0.19795066118240356, + "learning_rate": 9.353142447829672e-06, + "loss": 1.7142, + "step": 26328 + }, + { + "epoch": 8.08133824432167, + "grad_norm": 0.1786295473575592, + "learning_rate": 9.350248036038567e-06, + "loss": 1.6646, + "step": 26329 + }, + { + "epoch": 8.081645181092695, + "grad_norm": 0.17646436393260956, + "learning_rate": 9.347354025972138e-06, + "loss": 1.7044, + "step": 26330 + }, + { + "epoch": 8.08195211786372, + "grad_norm": 0.24095231294631958, + "learning_rate": 9.344460417658979e-06, + "loss": 1.823, + "step": 26331 + }, + { + "epoch": 8.082259054634745, + "grad_norm": 0.16094247996807098, + "learning_rate": 9.341567211127694e-06, + "loss": 1.6933, + "step": 26332 + }, + { + "epoch": 8.08256599140577, + "grad_norm": 0.22386589646339417, + "learning_rate": 9.338674406406872e-06, + "loss": 1.7219, + "step": 26333 + }, + { + "epoch": 8.082872928176796, + "grad_norm": 0.2110683023929596, + "learning_rate": 9.3357820035251e-06, + "loss": 1.6951, + "step": 26334 + }, + { + "epoch": 8.083179864947821, + "grad_norm": 0.2240242063999176, + "learning_rate": 9.33289000251097e-06, + "loss": 1.756, + "step": 26335 + }, + { + "epoch": 8.083486801718847, + "grad_norm": 0.19035838544368744, + "learning_rate": 9.329998403393036e-06, + "loss": 1.7657, + "step": 26336 + }, + { + "epoch": 8.083793738489872, + "grad_norm": 0.20213502645492554, + "learning_rate": 9.327107206199925e-06, + "loss": 1.6938, + "step": 26337 + }, + { + "epoch": 8.084100675260895, + "grad_norm": 0.20297139883041382, + "learning_rate": 9.324216410960157e-06, + "loss": 1.7476, + "step": 26338 + }, + { + "epoch": 8.08440761203192, + "grad_norm": 0.23968154191970825, + "learning_rate": 9.321326017702348e-06, + "loss": 1.7418, + "step": 26339 + }, + { + "epoch": 8.084714548802946, + "grad_norm": 0.19853347539901733, + "learning_rate": 9.318436026455008e-06, + "loss": 1.6943, + "step": 26340 + }, + { + "epoch": 8.085021485573971, + "grad_norm": 
0.1835598647594452, + "learning_rate": 9.315546437246742e-06, + "loss": 1.7071, + "step": 26341 + }, + { + "epoch": 8.085328422344997, + "grad_norm": 0.22876964509487152, + "learning_rate": 9.312657250106106e-06, + "loss": 1.7717, + "step": 26342 + }, + { + "epoch": 8.085635359116022, + "grad_norm": 0.1632407158613205, + "learning_rate": 9.309768465061613e-06, + "loss": 1.6506, + "step": 26343 + }, + { + "epoch": 8.085942295887047, + "grad_norm": 0.1812858134508133, + "learning_rate": 9.306880082141861e-06, + "loss": 1.6826, + "step": 26344 + }, + { + "epoch": 8.086249232658073, + "grad_norm": 0.24607063829898834, + "learning_rate": 9.303992101375347e-06, + "loss": 1.7109, + "step": 26345 + }, + { + "epoch": 8.086556169429098, + "grad_norm": 0.1401972472667694, + "learning_rate": 9.301104522790648e-06, + "loss": 1.6612, + "step": 26346 + }, + { + "epoch": 8.086863106200123, + "grad_norm": 0.22876517474651337, + "learning_rate": 9.298217346416287e-06, + "loss": 1.6857, + "step": 26347 + }, + { + "epoch": 8.087170042971149, + "grad_norm": 0.22353915870189667, + "learning_rate": 9.295330572280803e-06, + "loss": 1.7071, + "step": 26348 + }, + { + "epoch": 8.087476979742172, + "grad_norm": 0.22349561750888824, + "learning_rate": 9.292444200412715e-06, + "loss": 1.7098, + "step": 26349 + }, + { + "epoch": 8.087783916513198, + "grad_norm": 0.17078392207622528, + "learning_rate": 9.289558230840556e-06, + "loss": 1.6732, + "step": 26350 + }, + { + "epoch": 8.088090853284223, + "grad_norm": 0.19569413363933563, + "learning_rate": 9.286672663592843e-06, + "loss": 1.7489, + "step": 26351 + }, + { + "epoch": 8.088397790055248, + "grad_norm": 0.1565880924463272, + "learning_rate": 9.283787498698093e-06, + "loss": 1.6984, + "step": 26352 + }, + { + "epoch": 8.088704726826274, + "grad_norm": 0.21362969279289246, + "learning_rate": 9.28090273618481e-06, + "loss": 1.7157, + "step": 26353 + }, + { + "epoch": 8.089011663597299, + "grad_norm": 0.15077799558639526, + "learning_rate": 
9.278018376081532e-06, + "loss": 1.707, + "step": 26354 + }, + { + "epoch": 8.089318600368324, + "grad_norm": 0.19006888568401337, + "learning_rate": 9.27513441841672e-06, + "loss": 1.7379, + "step": 26355 + }, + { + "epoch": 8.08962553713935, + "grad_norm": 0.17935799062252045, + "learning_rate": 9.272250863218928e-06, + "loss": 1.7529, + "step": 26356 + }, + { + "epoch": 8.089932473910375, + "grad_norm": 0.1539749801158905, + "learning_rate": 9.269367710516596e-06, + "loss": 1.6717, + "step": 26357 + }, + { + "epoch": 8.0902394106814, + "grad_norm": 0.20954270660877228, + "learning_rate": 9.266484960338262e-06, + "loss": 1.7511, + "step": 26358 + }, + { + "epoch": 8.090546347452424, + "grad_norm": 0.1744573712348938, + "learning_rate": 9.263602612712408e-06, + "loss": 1.747, + "step": 26359 + }, + { + "epoch": 8.09085328422345, + "grad_norm": 0.198909193277359, + "learning_rate": 9.260720667667482e-06, + "loss": 1.6854, + "step": 26360 + }, + { + "epoch": 8.091160220994475, + "grad_norm": 0.16504423320293427, + "learning_rate": 9.25783912523202e-06, + "loss": 1.7346, + "step": 26361 + }, + { + "epoch": 8.0914671577655, + "grad_norm": 0.16309323906898499, + "learning_rate": 9.254957985434449e-06, + "loss": 1.695, + "step": 26362 + }, + { + "epoch": 8.091774094536525, + "grad_norm": 0.178558811545372, + "learning_rate": 9.25207724830327e-06, + "loss": 1.7091, + "step": 26363 + }, + { + "epoch": 8.09208103130755, + "grad_norm": 0.1758749783039093, + "learning_rate": 9.249196913866954e-06, + "loss": 1.732, + "step": 26364 + }, + { + "epoch": 8.092387968078576, + "grad_norm": 0.16251471638679504, + "learning_rate": 9.246316982153957e-06, + "loss": 1.6783, + "step": 26365 + }, + { + "epoch": 8.092694904849601, + "grad_norm": 0.1818319857120514, + "learning_rate": 9.243437453192739e-06, + "loss": 1.7208, + "step": 26366 + }, + { + "epoch": 8.093001841620627, + "grad_norm": 0.2009693682193756, + "learning_rate": 9.240558327011761e-06, + "loss": 1.7345, + "step": 26367 + 
}, + { + "epoch": 8.093308778391652, + "grad_norm": 0.19003108143806458, + "learning_rate": 9.237679603639477e-06, + "loss": 1.7141, + "step": 26368 + }, + { + "epoch": 8.093615715162677, + "grad_norm": 0.19530169665813446, + "learning_rate": 9.234801283104338e-06, + "loss": 1.6945, + "step": 26369 + }, + { + "epoch": 8.0939226519337, + "grad_norm": 0.14184506237506866, + "learning_rate": 9.231923365434769e-06, + "loss": 1.6484, + "step": 26370 + }, + { + "epoch": 8.094229588704726, + "grad_norm": 0.14682452380657196, + "learning_rate": 9.229045850659252e-06, + "loss": 1.6534, + "step": 26371 + }, + { + "epoch": 8.094536525475752, + "grad_norm": 0.21143727004528046, + "learning_rate": 9.22616873880618e-06, + "loss": 1.7439, + "step": 26372 + }, + { + "epoch": 8.094843462246777, + "grad_norm": 0.1664114147424698, + "learning_rate": 9.223292029904029e-06, + "loss": 1.7568, + "step": 26373 + }, + { + "epoch": 8.095150399017802, + "grad_norm": 0.17671625316143036, + "learning_rate": 9.22041572398118e-06, + "loss": 1.6594, + "step": 26374 + }, + { + "epoch": 8.095457335788828, + "grad_norm": 0.1968437135219574, + "learning_rate": 9.217539821066101e-06, + "loss": 1.734, + "step": 26375 + }, + { + "epoch": 8.095764272559853, + "grad_norm": 0.18740740418434143, + "learning_rate": 9.214664321187206e-06, + "loss": 1.7223, + "step": 26376 + }, + { + "epoch": 8.096071209330878, + "grad_norm": 0.16954728960990906, + "learning_rate": 9.21178922437288e-06, + "loss": 1.7282, + "step": 26377 + }, + { + "epoch": 8.096378146101904, + "grad_norm": 0.1979333609342575, + "learning_rate": 9.20891453065158e-06, + "loss": 1.7254, + "step": 26378 + }, + { + "epoch": 8.096685082872929, + "grad_norm": 0.1495361626148224, + "learning_rate": 9.206040240051677e-06, + "loss": 1.6936, + "step": 26379 + }, + { + "epoch": 8.096992019643954, + "grad_norm": 0.159287691116333, + "learning_rate": 9.203166352601605e-06, + "loss": 1.6658, + "step": 26380 + }, + { + "epoch": 8.097298956414978, + 
"grad_norm": 0.175196573138237, + "learning_rate": 9.200292868329751e-06, + "loss": 1.7779, + "step": 26381 + }, + { + "epoch": 8.097605893186003, + "grad_norm": 0.17131435871124268, + "learning_rate": 9.197419787264522e-06, + "loss": 1.7435, + "step": 26382 + }, + { + "epoch": 8.097912829957028, + "grad_norm": 0.14529173076152802, + "learning_rate": 9.194547109434299e-06, + "loss": 1.7083, + "step": 26383 + }, + { + "epoch": 8.098219766728054, + "grad_norm": 0.1824452430009842, + "learning_rate": 9.191674834867482e-06, + "loss": 1.7134, + "step": 26384 + }, + { + "epoch": 8.098526703499079, + "grad_norm": 0.18507611751556396, + "learning_rate": 9.188802963592453e-06, + "loss": 1.673, + "step": 26385 + }, + { + "epoch": 8.098833640270104, + "grad_norm": 0.19102542102336884, + "learning_rate": 9.185931495637595e-06, + "loss": 1.7058, + "step": 26386 + }, + { + "epoch": 8.09914057704113, + "grad_norm": 0.17001433670520782, + "learning_rate": 9.183060431031271e-06, + "loss": 1.6827, + "step": 26387 + }, + { + "epoch": 8.099447513812155, + "grad_norm": 0.1718425452709198, + "learning_rate": 9.18018976980189e-06, + "loss": 1.7375, + "step": 26388 + }, + { + "epoch": 8.09975445058318, + "grad_norm": 0.15681782364845276, + "learning_rate": 9.177319511977772e-06, + "loss": 1.6989, + "step": 26389 + }, + { + "epoch": 8.100061387354206, + "grad_norm": 0.156332865357399, + "learning_rate": 9.174449657587341e-06, + "loss": 1.7229, + "step": 26390 + }, + { + "epoch": 8.10036832412523, + "grad_norm": 0.2014407366514206, + "learning_rate": 9.171580206658898e-06, + "loss": 1.7589, + "step": 26391 + }, + { + "epoch": 8.100675260896255, + "grad_norm": 0.16946980357170105, + "learning_rate": 9.168711159220845e-06, + "loss": 1.7053, + "step": 26392 + }, + { + "epoch": 8.10098219766728, + "grad_norm": 0.1604216992855072, + "learning_rate": 9.165842515301526e-06, + "loss": 1.7338, + "step": 26393 + }, + { + "epoch": 8.101289134438305, + "grad_norm": 0.19191038608551025, + 
"learning_rate": 9.162974274929265e-06, + "loss": 1.721, + "step": 26394 + }, + { + "epoch": 8.10159607120933, + "grad_norm": 0.17082683742046356, + "learning_rate": 9.160106438132454e-06, + "loss": 1.707, + "step": 26395 + }, + { + "epoch": 8.101903007980356, + "grad_norm": 0.15988127887248993, + "learning_rate": 9.157239004939377e-06, + "loss": 1.6787, + "step": 26396 + }, + { + "epoch": 8.102209944751381, + "grad_norm": 0.21586796641349792, + "learning_rate": 9.154371975378423e-06, + "loss": 1.7105, + "step": 26397 + }, + { + "epoch": 8.102516881522407, + "grad_norm": 0.17289277911186218, + "learning_rate": 9.151505349477902e-06, + "loss": 1.7165, + "step": 26398 + }, + { + "epoch": 8.102823818293432, + "grad_norm": 0.16819556057453156, + "learning_rate": 9.148639127266145e-06, + "loss": 1.6965, + "step": 26399 + }, + { + "epoch": 8.103130755064457, + "grad_norm": 0.2234455943107605, + "learning_rate": 9.145773308771483e-06, + "loss": 1.8059, + "step": 26400 + }, + { + "epoch": 8.103437691835483, + "grad_norm": 0.15835164487361908, + "learning_rate": 9.142907894022235e-06, + "loss": 1.6851, + "step": 26401 + }, + { + "epoch": 8.103744628606506, + "grad_norm": 0.18604053556919098, + "learning_rate": 9.140042883046718e-06, + "loss": 1.7105, + "step": 26402 + }, + { + "epoch": 8.104051565377532, + "grad_norm": 0.1927308589220047, + "learning_rate": 9.137178275873243e-06, + "loss": 1.7236, + "step": 26403 + }, + { + "epoch": 8.104358502148557, + "grad_norm": 0.16214077174663544, + "learning_rate": 9.134314072530115e-06, + "loss": 1.7394, + "step": 26404 + }, + { + "epoch": 8.104665438919582, + "grad_norm": 0.2051863819360733, + "learning_rate": 9.131450273045667e-06, + "loss": 1.701, + "step": 26405 + }, + { + "epoch": 8.104972375690608, + "grad_norm": 0.1917528212070465, + "learning_rate": 9.128586877448158e-06, + "loss": 1.6984, + "step": 26406 + }, + { + "epoch": 8.105279312461633, + "grad_norm": 0.19591490924358368, + "learning_rate": 9.125723885765935e-06, + 
"loss": 1.7678, + "step": 26407 + }, + { + "epoch": 8.105586249232658, + "grad_norm": 0.22388321161270142, + "learning_rate": 9.122861298027242e-06, + "loss": 1.7398, + "step": 26408 + }, + { + "epoch": 8.105893186003684, + "grad_norm": 0.13983963429927826, + "learning_rate": 9.119999114260402e-06, + "loss": 1.6868, + "step": 26409 + }, + { + "epoch": 8.106200122774709, + "grad_norm": 0.16611455380916595, + "learning_rate": 9.117137334493708e-06, + "loss": 1.7029, + "step": 26410 + }, + { + "epoch": 8.106507059545734, + "grad_norm": 0.22045908868312836, + "learning_rate": 9.114275958755397e-06, + "loss": 1.7598, + "step": 26411 + }, + { + "epoch": 8.10681399631676, + "grad_norm": 0.1717766672372818, + "learning_rate": 9.111414987073801e-06, + "loss": 1.7197, + "step": 26412 + }, + { + "epoch": 8.107120933087783, + "grad_norm": 0.1627349704504013, + "learning_rate": 9.108554419477138e-06, + "loss": 1.6514, + "step": 26413 + }, + { + "epoch": 8.107427869858808, + "grad_norm": 0.16213741898536682, + "learning_rate": 9.105694255993725e-06, + "loss": 1.6873, + "step": 26414 + }, + { + "epoch": 8.107734806629834, + "grad_norm": 0.15004312992095947, + "learning_rate": 9.102834496651812e-06, + "loss": 1.7057, + "step": 26415 + }, + { + "epoch": 8.10804174340086, + "grad_norm": 0.16030706465244293, + "learning_rate": 9.099975141479655e-06, + "loss": 1.7006, + "step": 26416 + }, + { + "epoch": 8.108348680171884, + "grad_norm": 0.18823765218257904, + "learning_rate": 9.097116190505516e-06, + "loss": 1.6734, + "step": 26417 + }, + { + "epoch": 8.10865561694291, + "grad_norm": 0.19617006182670593, + "learning_rate": 9.094257643757653e-06, + "loss": 1.7135, + "step": 26418 + }, + { + "epoch": 8.108962553713935, + "grad_norm": 0.2009502351284027, + "learning_rate": 9.091399501264308e-06, + "loss": 1.7573, + "step": 26419 + }, + { + "epoch": 8.10926949048496, + "grad_norm": 0.1545785665512085, + "learning_rate": 9.088541763053732e-06, + "loss": 1.7154, + "step": 26420 + }, + { + 
"epoch": 8.109576427255986, + "grad_norm": 0.19506138563156128, + "learning_rate": 9.085684429154152e-06, + "loss": 1.7116, + "step": 26421 + }, + { + "epoch": 8.109883364027011, + "grad_norm": 0.15998101234436035, + "learning_rate": 9.082827499593843e-06, + "loss": 1.7107, + "step": 26422 + }, + { + "epoch": 8.110190300798035, + "grad_norm": 0.16210505366325378, + "learning_rate": 9.079970974400992e-06, + "loss": 1.6625, + "step": 26423 + }, + { + "epoch": 8.11049723756906, + "grad_norm": 0.14739912748336792, + "learning_rate": 9.077114853603875e-06, + "loss": 1.6993, + "step": 26424 + }, + { + "epoch": 8.110804174340085, + "grad_norm": 0.16882890462875366, + "learning_rate": 9.074259137230667e-06, + "loss": 1.7666, + "step": 26425 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 0.1667594611644745, + "learning_rate": 9.071403825309633e-06, + "loss": 1.6876, + "step": 26426 + }, + { + "epoch": 8.111418047882136, + "grad_norm": 0.14678725600242615, + "learning_rate": 9.06854891786899e-06, + "loss": 1.6458, + "step": 26427 + }, + { + "epoch": 8.111724984653161, + "grad_norm": 0.15207096934318542, + "learning_rate": 9.06569441493691e-06, + "loss": 1.6551, + "step": 26428 + }, + { + "epoch": 8.112031921424187, + "grad_norm": 0.2019769251346588, + "learning_rate": 9.062840316541654e-06, + "loss": 1.7812, + "step": 26429 + }, + { + "epoch": 8.112338858195212, + "grad_norm": 0.12371024489402771, + "learning_rate": 9.05998662271138e-06, + "loss": 1.6389, + "step": 26430 + }, + { + "epoch": 8.112645794966237, + "grad_norm": 0.21813201904296875, + "learning_rate": 9.057133333474332e-06, + "loss": 1.6922, + "step": 26431 + }, + { + "epoch": 8.112952731737263, + "grad_norm": 0.15330322086811066, + "learning_rate": 9.054280448858682e-06, + "loss": 1.6975, + "step": 26432 + }, + { + "epoch": 8.113259668508288, + "grad_norm": 0.17849069833755493, + "learning_rate": 9.051427968892635e-06, + "loss": 1.7239, + "step": 26433 + }, + { + "epoch": 8.113566605279312, + "grad_norm": 
0.13501322269439697, + "learning_rate": 9.048575893604377e-06, + "loss": 1.66, + "step": 26434 + }, + { + "epoch": 8.113873542050337, + "grad_norm": 0.1584496796131134, + "learning_rate": 9.045724223022096e-06, + "loss": 1.6864, + "step": 26435 + }, + { + "epoch": 8.114180478821362, + "grad_norm": 0.1788417398929596, + "learning_rate": 9.04287295717397e-06, + "loss": 1.7785, + "step": 26436 + }, + { + "epoch": 8.114487415592388, + "grad_norm": 0.16028213500976562, + "learning_rate": 9.04002209608818e-06, + "loss": 1.6908, + "step": 26437 + }, + { + "epoch": 8.114794352363413, + "grad_norm": 0.19472184777259827, + "learning_rate": 9.037171639792895e-06, + "loss": 1.7963, + "step": 26438 + }, + { + "epoch": 8.115101289134438, + "grad_norm": 0.155779629945755, + "learning_rate": 9.034321588316297e-06, + "loss": 1.6975, + "step": 26439 + }, + { + "epoch": 8.115408225905464, + "grad_norm": 0.191580668091774, + "learning_rate": 9.031471941686525e-06, + "loss": 1.6926, + "step": 26440 + }, + { + "epoch": 8.115715162676489, + "grad_norm": 0.13917100429534912, + "learning_rate": 9.028622699931788e-06, + "loss": 1.6735, + "step": 26441 + }, + { + "epoch": 8.116022099447514, + "grad_norm": 0.13983212411403656, + "learning_rate": 9.025773863080188e-06, + "loss": 1.6995, + "step": 26442 + }, + { + "epoch": 8.11632903621854, + "grad_norm": 0.1471131443977356, + "learning_rate": 9.022925431159922e-06, + "loss": 1.7002, + "step": 26443 + }, + { + "epoch": 8.116635972989565, + "grad_norm": 0.16679814457893372, + "learning_rate": 9.020077404199134e-06, + "loss": 1.7124, + "step": 26444 + }, + { + "epoch": 8.116942909760589, + "grad_norm": 0.1366356909275055, + "learning_rate": 9.017229782225938e-06, + "loss": 1.663, + "step": 26445 + }, + { + "epoch": 8.117249846531614, + "grad_norm": 0.1389543116092682, + "learning_rate": 9.01438256526852e-06, + "loss": 1.6991, + "step": 26446 + }, + { + "epoch": 8.11755678330264, + "grad_norm": 0.1784060299396515, + "learning_rate": 
9.011535753354972e-06, + "loss": 1.769, + "step": 26447 + }, + { + "epoch": 8.117863720073665, + "grad_norm": 0.17633236944675446, + "learning_rate": 9.008689346513466e-06, + "loss": 1.7466, + "step": 26448 + }, + { + "epoch": 8.11817065684469, + "grad_norm": 0.15887171030044556, + "learning_rate": 9.005843344772119e-06, + "loss": 1.7395, + "step": 26449 + }, + { + "epoch": 8.118477593615715, + "grad_norm": 0.20275244116783142, + "learning_rate": 9.002997748159054e-06, + "loss": 1.6971, + "step": 26450 + }, + { + "epoch": 8.11878453038674, + "grad_norm": 0.18063177168369293, + "learning_rate": 9.00015255670239e-06, + "loss": 1.7438, + "step": 26451 + }, + { + "epoch": 8.119091467157766, + "grad_norm": 0.14861668646335602, + "learning_rate": 8.997307770430252e-06, + "loss": 1.645, + "step": 26452 + }, + { + "epoch": 8.119398403928791, + "grad_norm": 0.20455077290534973, + "learning_rate": 8.99446338937075e-06, + "loss": 1.6791, + "step": 26453 + }, + { + "epoch": 8.119705340699817, + "grad_norm": 0.15492217242717743, + "learning_rate": 8.991619413551999e-06, + "loss": 1.6897, + "step": 26454 + }, + { + "epoch": 8.120012277470842, + "grad_norm": 0.1854604184627533, + "learning_rate": 8.988775843002095e-06, + "loss": 1.7379, + "step": 26455 + }, + { + "epoch": 8.120319214241865, + "grad_norm": 0.16705256700515747, + "learning_rate": 8.985932677749155e-06, + "loss": 1.7181, + "step": 26456 + }, + { + "epoch": 8.12062615101289, + "grad_norm": 0.1571042388677597, + "learning_rate": 8.983089917821246e-06, + "loss": 1.6962, + "step": 26457 + }, + { + "epoch": 8.120933087783916, + "grad_norm": 0.1818968802690506, + "learning_rate": 8.980247563246508e-06, + "loss": 1.6954, + "step": 26458 + }, + { + "epoch": 8.121240024554941, + "grad_norm": 0.1823234111070633, + "learning_rate": 8.977405614052986e-06, + "loss": 1.6936, + "step": 26459 + }, + { + "epoch": 8.121546961325967, + "grad_norm": 0.1767190843820572, + "learning_rate": 8.97456407026881e-06, + "loss": 1.7147, + 
"step": 26460 + }, + { + "epoch": 8.121853898096992, + "grad_norm": 0.17461732029914856, + "learning_rate": 8.971722931922023e-06, + "loss": 1.7039, + "step": 26461 + }, + { + "epoch": 8.122160834868017, + "grad_norm": 0.13968271017074585, + "learning_rate": 8.968882199040702e-06, + "loss": 1.655, + "step": 26462 + }, + { + "epoch": 8.122467771639043, + "grad_norm": 0.16950756311416626, + "learning_rate": 8.966041871652969e-06, + "loss": 1.689, + "step": 26463 + }, + { + "epoch": 8.122774708410068, + "grad_norm": 0.148970365524292, + "learning_rate": 8.963201949786831e-06, + "loss": 1.6998, + "step": 26464 + }, + { + "epoch": 8.123081645181093, + "grad_norm": 0.2081855684518814, + "learning_rate": 8.960362433470392e-06, + "loss": 1.7287, + "step": 26465 + }, + { + "epoch": 8.123388581952117, + "grad_norm": 0.14865393936634064, + "learning_rate": 8.957523322731714e-06, + "loss": 1.6789, + "step": 26466 + }, + { + "epoch": 8.123695518723142, + "grad_norm": 0.19252106547355652, + "learning_rate": 8.954684617598841e-06, + "loss": 1.7475, + "step": 26467 + }, + { + "epoch": 8.124002455494168, + "grad_norm": 0.1915684938430786, + "learning_rate": 8.951846318099837e-06, + "loss": 1.6937, + "step": 26468 + }, + { + "epoch": 8.124309392265193, + "grad_norm": 0.15057072043418884, + "learning_rate": 8.949008424262744e-06, + "loss": 1.6748, + "step": 26469 + }, + { + "epoch": 8.124616329036218, + "grad_norm": 0.1801072657108307, + "learning_rate": 8.946170936115611e-06, + "loss": 1.7411, + "step": 26470 + }, + { + "epoch": 8.124923265807244, + "grad_norm": 0.1449461281299591, + "learning_rate": 8.943333853686476e-06, + "loss": 1.6751, + "step": 26471 + }, + { + "epoch": 8.125230202578269, + "grad_norm": 0.19249948859214783, + "learning_rate": 8.940497177003383e-06, + "loss": 1.6876, + "step": 26472 + }, + { + "epoch": 8.125537139349294, + "grad_norm": 0.19512195885181427, + "learning_rate": 8.937660906094359e-06, + "loss": 1.7275, + "step": 26473 + }, + { + "epoch": 
8.12584407612032, + "grad_norm": 0.15998144447803497, + "learning_rate": 8.934825040987433e-06, + "loss": 1.7151, + "step": 26474 + }, + { + "epoch": 8.126151012891345, + "grad_norm": 0.17573381960391998, + "learning_rate": 8.931989581710654e-06, + "loss": 1.713, + "step": 26475 + }, + { + "epoch": 8.12645794966237, + "grad_norm": 0.16745707392692566, + "learning_rate": 8.929154528292e-06, + "loss": 1.7758, + "step": 26476 + }, + { + "epoch": 8.126764886433394, + "grad_norm": 0.14445005357265472, + "learning_rate": 8.926319880759538e-06, + "loss": 1.6821, + "step": 26477 + }, + { + "epoch": 8.12707182320442, + "grad_norm": 0.20462681353092194, + "learning_rate": 8.923485639141244e-06, + "loss": 1.7083, + "step": 26478 + }, + { + "epoch": 8.127378759975445, + "grad_norm": 0.16262570023536682, + "learning_rate": 8.92065180346513e-06, + "loss": 1.7031, + "step": 26479 + }, + { + "epoch": 8.12768569674647, + "grad_norm": 0.14214366674423218, + "learning_rate": 8.917818373759235e-06, + "loss": 1.6752, + "step": 26480 + }, + { + "epoch": 8.127992633517495, + "grad_norm": 0.18373169004917145, + "learning_rate": 8.914985350051513e-06, + "loss": 1.7211, + "step": 26481 + }, + { + "epoch": 8.12829957028852, + "grad_norm": 0.1702071875333786, + "learning_rate": 8.912152732370015e-06, + "loss": 1.7513, + "step": 26482 + }, + { + "epoch": 8.128606507059546, + "grad_norm": 0.16515198349952698, + "learning_rate": 8.90932052074268e-06, + "loss": 1.7379, + "step": 26483 + }, + { + "epoch": 8.128913443830571, + "grad_norm": 0.17008109390735626, + "learning_rate": 8.906488715197537e-06, + "loss": 1.7243, + "step": 26484 + }, + { + "epoch": 8.129220380601597, + "grad_norm": 0.15695080161094666, + "learning_rate": 8.903657315762554e-06, + "loss": 1.6951, + "step": 26485 + }, + { + "epoch": 8.129527317372622, + "grad_norm": 0.16403819620609283, + "learning_rate": 8.900826322465716e-06, + "loss": 1.7755, + "step": 26486 + }, + { + "epoch": 8.129834254143647, + "grad_norm": 
0.21355034410953522, + "learning_rate": 8.897995735335007e-06, + "loss": 1.7505, + "step": 26487 + }, + { + "epoch": 8.13014119091467, + "grad_norm": 0.15604349970817566, + "learning_rate": 8.895165554398394e-06, + "loss": 1.7452, + "step": 26488 + }, + { + "epoch": 8.130448127685696, + "grad_norm": 0.18299458920955658, + "learning_rate": 8.892335779683842e-06, + "loss": 1.6737, + "step": 26489 + }, + { + "epoch": 8.130755064456721, + "grad_norm": 0.1939994990825653, + "learning_rate": 8.889506411219329e-06, + "loss": 1.7219, + "step": 26490 + }, + { + "epoch": 8.131062001227747, + "grad_norm": 0.17785221338272095, + "learning_rate": 8.886677449032794e-06, + "loss": 1.7007, + "step": 26491 + }, + { + "epoch": 8.131368937998772, + "grad_norm": 0.2067573517560959, + "learning_rate": 8.88384889315223e-06, + "loss": 1.7918, + "step": 26492 + }, + { + "epoch": 8.131675874769797, + "grad_norm": 0.18033906817436218, + "learning_rate": 8.88102074360555e-06, + "loss": 1.7, + "step": 26493 + }, + { + "epoch": 8.131982811540823, + "grad_norm": 0.17076243460178375, + "learning_rate": 8.878193000420748e-06, + "loss": 1.6883, + "step": 26494 + }, + { + "epoch": 8.132289748311848, + "grad_norm": 0.19102394580841064, + "learning_rate": 8.875365663625729e-06, + "loss": 1.7387, + "step": 26495 + }, + { + "epoch": 8.132596685082873, + "grad_norm": 0.22587478160858154, + "learning_rate": 8.872538733248442e-06, + "loss": 1.7852, + "step": 26496 + }, + { + "epoch": 8.132903621853899, + "grad_norm": 0.17067384719848633, + "learning_rate": 8.869712209316861e-06, + "loss": 1.6813, + "step": 26497 + }, + { + "epoch": 8.133210558624924, + "grad_norm": 0.19232873618602753, + "learning_rate": 8.866886091858856e-06, + "loss": 1.6644, + "step": 26498 + }, + { + "epoch": 8.133517495395948, + "grad_norm": 0.18685118854045868, + "learning_rate": 8.864060380902423e-06, + "loss": 1.6766, + "step": 26499 + }, + { + "epoch": 8.133824432166973, + "grad_norm": 0.18342606723308563, + "learning_rate": 
8.861235076475433e-06, + "loss": 1.6694, + "step": 26500 + }, + { + "epoch": 8.134131368937998, + "grad_norm": 0.15469637513160706, + "learning_rate": 8.858410178605842e-06, + "loss": 1.6882, + "step": 26501 + }, + { + "epoch": 8.134438305709024, + "grad_norm": 0.19094935059547424, + "learning_rate": 8.855585687321549e-06, + "loss": 1.6662, + "step": 26502 + }, + { + "epoch": 8.134745242480049, + "grad_norm": 0.19613660871982574, + "learning_rate": 8.852761602650479e-06, + "loss": 1.6518, + "step": 26503 + }, + { + "epoch": 8.135052179251074, + "grad_norm": 0.1342541128396988, + "learning_rate": 8.849937924620538e-06, + "loss": 1.6728, + "step": 26504 + }, + { + "epoch": 8.1353591160221, + "grad_norm": 0.19099827110767365, + "learning_rate": 8.847114653259624e-06, + "loss": 1.714, + "step": 26505 + }, + { + "epoch": 8.135666052793125, + "grad_norm": 0.18886728584766388, + "learning_rate": 8.84429178859565e-06, + "loss": 1.7222, + "step": 26506 + }, + { + "epoch": 8.13597298956415, + "grad_norm": 0.16177545487880707, + "learning_rate": 8.841469330656499e-06, + "loss": 1.754, + "step": 26507 + }, + { + "epoch": 8.136279926335176, + "grad_norm": 0.1589137762784958, + "learning_rate": 8.838647279470063e-06, + "loss": 1.6889, + "step": 26508 + }, + { + "epoch": 8.1365868631062, + "grad_norm": 0.16074521839618683, + "learning_rate": 8.835825635064266e-06, + "loss": 1.6882, + "step": 26509 + }, + { + "epoch": 8.136893799877225, + "grad_norm": 0.15532740950584412, + "learning_rate": 8.833004397466937e-06, + "loss": 1.6786, + "step": 26510 + }, + { + "epoch": 8.13720073664825, + "grad_norm": 0.18151862919330597, + "learning_rate": 8.830183566706019e-06, + "loss": 1.7075, + "step": 26511 + }, + { + "epoch": 8.137507673419275, + "grad_norm": 0.15345066785812378, + "learning_rate": 8.827363142809342e-06, + "loss": 1.6895, + "step": 26512 + }, + { + "epoch": 8.1378146101903, + "grad_norm": 0.16954976320266724, + "learning_rate": 8.824543125804785e-06, + "loss": 1.727, + "step": 
26513 + }, + { + "epoch": 8.138121546961326, + "grad_norm": 0.1679479032754898, + "learning_rate": 8.821723515720249e-06, + "loss": 1.7391, + "step": 26514 + }, + { + "epoch": 8.138428483732351, + "grad_norm": 0.15377631783485413, + "learning_rate": 8.818904312583547e-06, + "loss": 1.6954, + "step": 26515 + }, + { + "epoch": 8.138735420503377, + "grad_norm": 0.20345479249954224, + "learning_rate": 8.8160855164226e-06, + "loss": 1.7424, + "step": 26516 + }, + { + "epoch": 8.139042357274402, + "grad_norm": 0.18770255148410797, + "learning_rate": 8.813267127265207e-06, + "loss": 1.67, + "step": 26517 + }, + { + "epoch": 8.139349294045427, + "grad_norm": 0.16253206133842468, + "learning_rate": 8.810449145139265e-06, + "loss": 1.7004, + "step": 26518 + }, + { + "epoch": 8.139656230816453, + "grad_norm": 0.18429701030254364, + "learning_rate": 8.807631570072606e-06, + "loss": 1.7289, + "step": 26519 + }, + { + "epoch": 8.139963167587476, + "grad_norm": 0.18926598131656647, + "learning_rate": 8.80481440209307e-06, + "loss": 1.7907, + "step": 26520 + }, + { + "epoch": 8.140270104358502, + "grad_norm": 0.17855983972549438, + "learning_rate": 8.80199764122851e-06, + "loss": 1.7008, + "step": 26521 + }, + { + "epoch": 8.140577041129527, + "grad_norm": 0.20559640228748322, + "learning_rate": 8.799181287506752e-06, + "loss": 1.724, + "step": 26522 + }, + { + "epoch": 8.140883977900552, + "grad_norm": 0.1707194298505783, + "learning_rate": 8.79636534095563e-06, + "loss": 1.7274, + "step": 26523 + }, + { + "epoch": 8.141190914671578, + "grad_norm": 0.1882070004940033, + "learning_rate": 8.793549801602984e-06, + "loss": 1.7503, + "step": 26524 + }, + { + "epoch": 8.141497851442603, + "grad_norm": 0.24269217252731323, + "learning_rate": 8.790734669476613e-06, + "loss": 1.7459, + "step": 26525 + }, + { + "epoch": 8.141804788213628, + "grad_norm": 0.20310194790363312, + "learning_rate": 8.787919944604383e-06, + "loss": 1.7158, + "step": 26526 + }, + { + "epoch": 8.142111724984654, + 
"grad_norm": 0.18653319776058197, + "learning_rate": 8.785105627014056e-06, + "loss": 1.7135, + "step": 26527 + }, + { + "epoch": 8.142418661755679, + "grad_norm": 0.1896388828754425, + "learning_rate": 8.782291716733499e-06, + "loss": 1.7407, + "step": 26528 + }, + { + "epoch": 8.142725598526704, + "grad_norm": 0.17392487823963165, + "learning_rate": 8.779478213790482e-06, + "loss": 1.6863, + "step": 26529 + }, + { + "epoch": 8.14303253529773, + "grad_norm": 0.2389729917049408, + "learning_rate": 8.776665118212807e-06, + "loss": 1.7565, + "step": 26530 + }, + { + "epoch": 8.143339472068753, + "grad_norm": 0.1907578408718109, + "learning_rate": 8.773852430028312e-06, + "loss": 1.7135, + "step": 26531 + }, + { + "epoch": 8.143646408839778, + "grad_norm": 0.1867230087518692, + "learning_rate": 8.771040149264748e-06, + "loss": 1.657, + "step": 26532 + }, + { + "epoch": 8.143953345610804, + "grad_norm": 0.16111065447330475, + "learning_rate": 8.768228275949953e-06, + "loss": 1.6849, + "step": 26533 + }, + { + "epoch": 8.144260282381829, + "grad_norm": 0.24071912467479706, + "learning_rate": 8.76541681011167e-06, + "loss": 1.7563, + "step": 26534 + }, + { + "epoch": 8.144567219152854, + "grad_norm": 0.18996769189834595, + "learning_rate": 8.76260575177772e-06, + "loss": 1.7099, + "step": 26535 + }, + { + "epoch": 8.14487415592388, + "grad_norm": 0.17230607569217682, + "learning_rate": 8.75979510097587e-06, + "loss": 1.6848, + "step": 26536 + }, + { + "epoch": 8.145181092694905, + "grad_norm": 0.19319802522659302, + "learning_rate": 8.756984857733896e-06, + "loss": 1.7806, + "step": 26537 + }, + { + "epoch": 8.14548802946593, + "grad_norm": 0.16848497092723846, + "learning_rate": 8.754175022079569e-06, + "loss": 1.7099, + "step": 26538 + }, + { + "epoch": 8.145794966236956, + "grad_norm": 0.16230639815330505, + "learning_rate": 8.751365594040662e-06, + "loss": 1.6618, + "step": 26539 + }, + { + "epoch": 8.146101903007981, + "grad_norm": 0.15458232164382935, + 
"learning_rate": 8.748556573644935e-06, + "loss": 1.6975, + "step": 26540 + }, + { + "epoch": 8.146408839779005, + "grad_norm": 0.15948891639709473, + "learning_rate": 8.745747960920153e-06, + "loss": 1.6977, + "step": 26541 + }, + { + "epoch": 8.14671577655003, + "grad_norm": 0.17533692717552185, + "learning_rate": 8.742939755894053e-06, + "loss": 1.7314, + "step": 26542 + }, + { + "epoch": 8.147022713321055, + "grad_norm": 0.13606345653533936, + "learning_rate": 8.740131958594433e-06, + "loss": 1.6245, + "step": 26543 + }, + { + "epoch": 8.14732965009208, + "grad_norm": 0.1749604493379593, + "learning_rate": 8.737324569048993e-06, + "loss": 1.6881, + "step": 26544 + }, + { + "epoch": 8.147636586863106, + "grad_norm": 0.15416191518306732, + "learning_rate": 8.7345175872855e-06, + "loss": 1.6755, + "step": 26545 + }, + { + "epoch": 8.147943523634131, + "grad_norm": 0.19732356071472168, + "learning_rate": 8.731711013331695e-06, + "loss": 1.7068, + "step": 26546 + }, + { + "epoch": 8.148250460405157, + "grad_norm": 0.19295896589756012, + "learning_rate": 8.728904847215291e-06, + "loss": 1.7282, + "step": 26547 + }, + { + "epoch": 8.148557397176182, + "grad_norm": 0.18414302170276642, + "learning_rate": 8.726099088964069e-06, + "loss": 1.7059, + "step": 26548 + }, + { + "epoch": 8.148864333947207, + "grad_norm": 0.17527544498443604, + "learning_rate": 8.723293738605697e-06, + "loss": 1.6947, + "step": 26549 + }, + { + "epoch": 8.149171270718233, + "grad_norm": 0.1913319230079651, + "learning_rate": 8.720488796167958e-06, + "loss": 1.6988, + "step": 26550 + }, + { + "epoch": 8.149478207489258, + "grad_norm": 0.1604306846857071, + "learning_rate": 8.71768426167852e-06, + "loss": 1.6937, + "step": 26551 + }, + { + "epoch": 8.149785144260282, + "grad_norm": 0.1562403291463852, + "learning_rate": 8.714880135165132e-06, + "loss": 1.6633, + "step": 26552 + }, + { + "epoch": 8.150092081031307, + "grad_norm": 0.16940948367118835, + "learning_rate": 8.712076416655495e-06, + 
"loss": 1.6774, + "step": 26553 + }, + { + "epoch": 8.150399017802332, + "grad_norm": 0.14607203006744385, + "learning_rate": 8.709273106177324e-06, + "loss": 1.6912, + "step": 26554 + }, + { + "epoch": 8.150705954573358, + "grad_norm": 0.1811707615852356, + "learning_rate": 8.706470203758316e-06, + "loss": 1.7291, + "step": 26555 + }, + { + "epoch": 8.151012891344383, + "grad_norm": 0.18188659846782684, + "learning_rate": 8.703667709426166e-06, + "loss": 1.6994, + "step": 26556 + }, + { + "epoch": 8.151319828115408, + "grad_norm": 0.16499698162078857, + "learning_rate": 8.700865623208581e-06, + "loss": 1.7065, + "step": 26557 + }, + { + "epoch": 8.151626764886434, + "grad_norm": 0.17506305873394012, + "learning_rate": 8.69806394513325e-06, + "loss": 1.75, + "step": 26558 + }, + { + "epoch": 8.151933701657459, + "grad_norm": 0.14843741059303284, + "learning_rate": 8.695262675227844e-06, + "loss": 1.6645, + "step": 26559 + }, + { + "epoch": 8.152240638428484, + "grad_norm": 0.15281017124652863, + "learning_rate": 8.692461813520087e-06, + "loss": 1.7166, + "step": 26560 + }, + { + "epoch": 8.15254757519951, + "grad_norm": 0.17245371639728546, + "learning_rate": 8.689661360037621e-06, + "loss": 1.7418, + "step": 26561 + }, + { + "epoch": 8.152854511970535, + "grad_norm": 0.17387856543064117, + "learning_rate": 8.686861314808131e-06, + "loss": 1.6865, + "step": 26562 + }, + { + "epoch": 8.153161448741558, + "grad_norm": 0.1463180035352707, + "learning_rate": 8.684061677859296e-06, + "loss": 1.6867, + "step": 26563 + }, + { + "epoch": 8.153468385512584, + "grad_norm": 0.16704687476158142, + "learning_rate": 8.681262449218769e-06, + "loss": 1.6985, + "step": 26564 + }, + { + "epoch": 8.15377532228361, + "grad_norm": 0.17754648625850677, + "learning_rate": 8.678463628914246e-06, + "loss": 1.7067, + "step": 26565 + }, + { + "epoch": 8.154082259054634, + "grad_norm": 0.12470053881406784, + "learning_rate": 8.675665216973339e-06, + "loss": 1.6468, + "step": 26566 + }, + { + 
"epoch": 8.15438919582566, + "grad_norm": 0.17551906406879425, + "learning_rate": 8.672867213423757e-06, + "loss": 1.76, + "step": 26567 + }, + { + "epoch": 8.154696132596685, + "grad_norm": 0.13165321946144104, + "learning_rate": 8.670069618293098e-06, + "loss": 1.6672, + "step": 26568 + }, + { + "epoch": 8.15500306936771, + "grad_norm": 0.1410796046257019, + "learning_rate": 8.667272431609041e-06, + "loss": 1.649, + "step": 26569 + }, + { + "epoch": 8.155310006138736, + "grad_norm": 0.17227822542190552, + "learning_rate": 8.664475653399235e-06, + "loss": 1.7028, + "step": 26570 + }, + { + "epoch": 8.155616942909761, + "grad_norm": 0.15770387649536133, + "learning_rate": 8.661679283691298e-06, + "loss": 1.7608, + "step": 26571 + }, + { + "epoch": 8.155923879680786, + "grad_norm": 0.1425134390592575, + "learning_rate": 8.658883322512885e-06, + "loss": 1.6821, + "step": 26572 + }, + { + "epoch": 8.15623081645181, + "grad_norm": 0.19647212326526642, + "learning_rate": 8.656087769891608e-06, + "loss": 1.7787, + "step": 26573 + }, + { + "epoch": 8.156537753222835, + "grad_norm": 0.15315282344818115, + "learning_rate": 8.653292625855108e-06, + "loss": 1.6464, + "step": 26574 + }, + { + "epoch": 8.15684468999386, + "grad_norm": 0.1664622575044632, + "learning_rate": 8.650497890431009e-06, + "loss": 1.7189, + "step": 26575 + }, + { + "epoch": 8.157151626764886, + "grad_norm": 0.19525103271007538, + "learning_rate": 8.647703563646908e-06, + "loss": 1.71, + "step": 26576 + }, + { + "epoch": 8.157458563535911, + "grad_norm": 0.2435453087091446, + "learning_rate": 8.644909645530464e-06, + "loss": 1.7312, + "step": 26577 + }, + { + "epoch": 8.157765500306937, + "grad_norm": 0.20554441213607788, + "learning_rate": 8.642116136109252e-06, + "loss": 1.7102, + "step": 26578 + }, + { + "epoch": 8.158072437077962, + "grad_norm": 0.21100008487701416, + "learning_rate": 8.639323035410885e-06, + "loss": 1.6513, + "step": 26579 + }, + { + "epoch": 8.158379373848987, + "grad_norm": 
0.20069560408592224, + "learning_rate": 8.636530343462973e-06, + "loss": 1.7457, + "step": 26580 + }, + { + "epoch": 8.158686310620013, + "grad_norm": 0.19240780174732208, + "learning_rate": 8.633738060293095e-06, + "loss": 1.6761, + "step": 26581 + }, + { + "epoch": 8.158993247391038, + "grad_norm": 0.17970497906208038, + "learning_rate": 8.63094618592889e-06, + "loss": 1.7571, + "step": 26582 + }, + { + "epoch": 8.159300184162063, + "grad_norm": 0.19709791243076324, + "learning_rate": 8.628154720397902e-06, + "loss": 1.7826, + "step": 26583 + }, + { + "epoch": 8.159607120933087, + "grad_norm": 0.2084866315126419, + "learning_rate": 8.62536366372776e-06, + "loss": 1.7113, + "step": 26584 + }, + { + "epoch": 8.159914057704112, + "grad_norm": 0.18584266304969788, + "learning_rate": 8.622573015945995e-06, + "loss": 1.675, + "step": 26585 + }, + { + "epoch": 8.160220994475138, + "grad_norm": 0.21233049035072327, + "learning_rate": 8.619782777080232e-06, + "loss": 1.7438, + "step": 26586 + }, + { + "epoch": 8.160527931246163, + "grad_norm": 0.180323526263237, + "learning_rate": 8.61699294715803e-06, + "loss": 1.6923, + "step": 26587 + }, + { + "epoch": 8.160834868017188, + "grad_norm": 0.182667076587677, + "learning_rate": 8.614203526206955e-06, + "loss": 1.7302, + "step": 26588 + }, + { + "epoch": 8.161141804788214, + "grad_norm": 0.19673213362693787, + "learning_rate": 8.611414514254584e-06, + "loss": 1.7282, + "step": 26589 + }, + { + "epoch": 8.161448741559239, + "grad_norm": 0.14357072114944458, + "learning_rate": 8.608625911328466e-06, + "loss": 1.6964, + "step": 26590 + }, + { + "epoch": 8.161755678330264, + "grad_norm": 0.25598716735839844, + "learning_rate": 8.605837717456172e-06, + "loss": 1.788, + "step": 26591 + }, + { + "epoch": 8.16206261510129, + "grad_norm": 0.16914238035678864, + "learning_rate": 8.603049932665252e-06, + "loss": 1.6069, + "step": 26592 + }, + { + "epoch": 8.162369551872315, + "grad_norm": 0.1468336582183838, + "learning_rate": 
8.60026255698324e-06, + "loss": 1.7009, + "step": 26593 + }, + { + "epoch": 8.16267648864334, + "grad_norm": 0.20125585794448853, + "learning_rate": 8.597475590437726e-06, + "loss": 1.7166, + "step": 26594 + }, + { + "epoch": 8.162983425414364, + "grad_norm": 0.12715741991996765, + "learning_rate": 8.594689033056214e-06, + "loss": 1.6488, + "step": 26595 + }, + { + "epoch": 8.16329036218539, + "grad_norm": 0.2659800350666046, + "learning_rate": 8.591902884866254e-06, + "loss": 1.7325, + "step": 26596 + }, + { + "epoch": 8.163597298956415, + "grad_norm": 0.1939239799976349, + "learning_rate": 8.589117145895376e-06, + "loss": 1.6882, + "step": 26597 + }, + { + "epoch": 8.16390423572744, + "grad_norm": 0.18982990086078644, + "learning_rate": 8.586331816171101e-06, + "loss": 1.7222, + "step": 26598 + }, + { + "epoch": 8.164211172498465, + "grad_norm": 0.16025054454803467, + "learning_rate": 8.583546895720995e-06, + "loss": 1.6672, + "step": 26599 + }, + { + "epoch": 8.16451810926949, + "grad_norm": 0.1923390030860901, + "learning_rate": 8.580762384572533e-06, + "loss": 1.7261, + "step": 26600 + }, + { + "epoch": 8.164825046040516, + "grad_norm": 0.1467374712228775, + "learning_rate": 8.577978282753274e-06, + "loss": 1.6969, + "step": 26601 + }, + { + "epoch": 8.165131982811541, + "grad_norm": 0.2210266888141632, + "learning_rate": 8.575194590290685e-06, + "loss": 1.74, + "step": 26602 + }, + { + "epoch": 8.165438919582567, + "grad_norm": 0.1852598935365677, + "learning_rate": 8.572411307212319e-06, + "loss": 1.7522, + "step": 26603 + }, + { + "epoch": 8.165745856353592, + "grad_norm": 0.19316701591014862, + "learning_rate": 8.569628433545662e-06, + "loss": 1.7389, + "step": 26604 + }, + { + "epoch": 8.166052793124617, + "grad_norm": 0.2102174311876297, + "learning_rate": 8.566845969318227e-06, + "loss": 1.7134, + "step": 26605 + }, + { + "epoch": 8.16635972989564, + "grad_norm": 0.1948329359292984, + "learning_rate": 8.564063914557496e-06, + "loss": 1.7368, + "step": 
26606 + }, + { + "epoch": 8.166666666666666, + "grad_norm": 0.14721956849098206, + "learning_rate": 8.561282269290977e-06, + "loss": 1.6526, + "step": 26607 + }, + { + "epoch": 8.166973603437691, + "grad_norm": 0.17424573004245758, + "learning_rate": 8.558501033546158e-06, + "loss": 1.6954, + "step": 26608 + }, + { + "epoch": 8.167280540208717, + "grad_norm": 0.14784085750579834, + "learning_rate": 8.555720207350514e-06, + "loss": 1.7166, + "step": 26609 + }, + { + "epoch": 8.167587476979742, + "grad_norm": 0.1619582176208496, + "learning_rate": 8.55293979073154e-06, + "loss": 1.716, + "step": 26610 + }, + { + "epoch": 8.167894413750767, + "grad_norm": 0.2342625856399536, + "learning_rate": 8.550159783716705e-06, + "loss": 1.7399, + "step": 26611 + }, + { + "epoch": 8.168201350521793, + "grad_norm": 0.16116589307785034, + "learning_rate": 8.547380186333482e-06, + "loss": 1.6727, + "step": 26612 + }, + { + "epoch": 8.168508287292818, + "grad_norm": 0.20995540916919708, + "learning_rate": 8.544600998609349e-06, + "loss": 1.703, + "step": 26613 + }, + { + "epoch": 8.168815224063843, + "grad_norm": 0.18031500279903412, + "learning_rate": 8.541822220571766e-06, + "loss": 1.6953, + "step": 26614 + }, + { + "epoch": 8.169122160834869, + "grad_norm": 0.1851302981376648, + "learning_rate": 8.539043852248197e-06, + "loss": 1.6931, + "step": 26615 + }, + { + "epoch": 8.169429097605892, + "grad_norm": 0.2262948453426361, + "learning_rate": 8.536265893666096e-06, + "loss": 1.7167, + "step": 26616 + }, + { + "epoch": 8.169736034376918, + "grad_norm": 0.1456020325422287, + "learning_rate": 8.533488344852903e-06, + "loss": 1.6686, + "step": 26617 + }, + { + "epoch": 8.170042971147943, + "grad_norm": 0.17165613174438477, + "learning_rate": 8.530711205836112e-06, + "loss": 1.6641, + "step": 26618 + }, + { + "epoch": 8.170349907918968, + "grad_norm": 0.18926110863685608, + "learning_rate": 8.527934476643112e-06, + "loss": 1.7155, + "step": 26619 + }, + { + "epoch": 8.170656844689994, 
+ "grad_norm": 0.1722220927476883, + "learning_rate": 8.525158157301383e-06, + "loss": 1.7188, + "step": 26620 + }, + { + "epoch": 8.170963781461019, + "grad_norm": 0.1791582554578781, + "learning_rate": 8.522382247838351e-06, + "loss": 1.7195, + "step": 26621 + }, + { + "epoch": 8.171270718232044, + "grad_norm": 0.18020455539226532, + "learning_rate": 8.519606748281445e-06, + "loss": 1.7068, + "step": 26622 + }, + { + "epoch": 8.17157765500307, + "grad_norm": 0.17394676804542542, + "learning_rate": 8.516831658658098e-06, + "loss": 1.6977, + "step": 26623 + }, + { + "epoch": 8.171884591774095, + "grad_norm": 0.24079330265522003, + "learning_rate": 8.514056978995739e-06, + "loss": 1.7152, + "step": 26624 + }, + { + "epoch": 8.17219152854512, + "grad_norm": 0.16567498445510864, + "learning_rate": 8.511282709321784e-06, + "loss": 1.7048, + "step": 26625 + }, + { + "epoch": 8.172498465316146, + "grad_norm": 0.21935853362083435, + "learning_rate": 8.508508849663649e-06, + "loss": 1.7445, + "step": 26626 + }, + { + "epoch": 8.17280540208717, + "grad_norm": 0.18325531482696533, + "learning_rate": 8.505735400048748e-06, + "loss": 1.7343, + "step": 26627 + }, + { + "epoch": 8.173112338858195, + "grad_norm": 0.16334550082683563, + "learning_rate": 8.50296236050449e-06, + "loss": 1.727, + "step": 26628 + }, + { + "epoch": 8.17341927562922, + "grad_norm": 0.23685503005981445, + "learning_rate": 8.500189731058284e-06, + "loss": 1.6718, + "step": 26629 + }, + { + "epoch": 8.173726212400245, + "grad_norm": 0.17057496309280396, + "learning_rate": 8.49741751173752e-06, + "loss": 1.7083, + "step": 26630 + }, + { + "epoch": 8.17403314917127, + "grad_norm": 0.19941039383411407, + "learning_rate": 8.49464570256961e-06, + "loss": 1.6496, + "step": 26631 + }, + { + "epoch": 8.174340085942296, + "grad_norm": 0.1887839138507843, + "learning_rate": 8.49187430358193e-06, + "loss": 1.7896, + "step": 26632 + }, + { + "epoch": 8.174647022713321, + "grad_norm": 0.16285917162895203, + 
"learning_rate": 8.489103314801883e-06, + "loss": 1.6923, + "step": 26633 + }, + { + "epoch": 8.174953959484347, + "grad_norm": 0.1405196487903595, + "learning_rate": 8.48633273625683e-06, + "loss": 1.6907, + "step": 26634 + }, + { + "epoch": 8.175260896255372, + "grad_norm": 0.17885157465934753, + "learning_rate": 8.483562567974196e-06, + "loss": 1.7036, + "step": 26635 + }, + { + "epoch": 8.175567833026397, + "grad_norm": 0.1427285224199295, + "learning_rate": 8.480792809981309e-06, + "loss": 1.6997, + "step": 26636 + }, + { + "epoch": 8.175874769797423, + "grad_norm": 0.15711882710456848, + "learning_rate": 8.478023462305579e-06, + "loss": 1.6874, + "step": 26637 + }, + { + "epoch": 8.176181706568446, + "grad_norm": 0.19080850481987, + "learning_rate": 8.47525452497434e-06, + "loss": 1.7078, + "step": 26638 + }, + { + "epoch": 8.176488643339471, + "grad_norm": 0.17063139379024506, + "learning_rate": 8.472485998014984e-06, + "loss": 1.7147, + "step": 26639 + }, + { + "epoch": 8.176795580110497, + "grad_norm": 0.151056706905365, + "learning_rate": 8.469717881454865e-06, + "loss": 1.685, + "step": 26640 + }, + { + "epoch": 8.177102516881522, + "grad_norm": 0.16712957620620728, + "learning_rate": 8.466950175321331e-06, + "loss": 1.7142, + "step": 26641 + }, + { + "epoch": 8.177409453652547, + "grad_norm": 0.13982228934764862, + "learning_rate": 8.46418287964174e-06, + "loss": 1.6707, + "step": 26642 + }, + { + "epoch": 8.177716390423573, + "grad_norm": 0.14738497138023376, + "learning_rate": 8.461415994443439e-06, + "loss": 1.7381, + "step": 26643 + }, + { + "epoch": 8.178023327194598, + "grad_norm": 0.1691005975008011, + "learning_rate": 8.45864951975377e-06, + "loss": 1.6956, + "step": 26644 + }, + { + "epoch": 8.178330263965623, + "grad_norm": 0.1477413773536682, + "learning_rate": 8.455883455600078e-06, + "loss": 1.6646, + "step": 26645 + }, + { + "epoch": 8.178637200736649, + "grad_norm": 0.15620499849319458, + "learning_rate": 8.453117802009697e-06, + "loss": 
1.7031, + "step": 26646 + }, + { + "epoch": 8.178944137507674, + "grad_norm": 0.1572941690683365, + "learning_rate": 8.45035255900995e-06, + "loss": 1.6509, + "step": 26647 + }, + { + "epoch": 8.1792510742787, + "grad_norm": 0.20386455953121185, + "learning_rate": 8.447587726628176e-06, + "loss": 1.7166, + "step": 26648 + }, + { + "epoch": 8.179558011049723, + "grad_norm": 0.2131095975637436, + "learning_rate": 8.444823304891697e-06, + "loss": 1.6934, + "step": 26649 + }, + { + "epoch": 8.179864947820748, + "grad_norm": 0.15402472019195557, + "learning_rate": 8.442059293827826e-06, + "loss": 1.7538, + "step": 26650 + }, + { + "epoch": 8.180171884591774, + "grad_norm": 0.17687393724918365, + "learning_rate": 8.439295693463872e-06, + "loss": 1.7374, + "step": 26651 + }, + { + "epoch": 8.180478821362799, + "grad_norm": 0.16971834003925323, + "learning_rate": 8.436532503827188e-06, + "loss": 1.7142, + "step": 26652 + }, + { + "epoch": 8.180785758133824, + "grad_norm": 0.17651747167110443, + "learning_rate": 8.433769724945017e-06, + "loss": 1.7109, + "step": 26653 + }, + { + "epoch": 8.18109269490485, + "grad_norm": 0.18742668628692627, + "learning_rate": 8.431007356844728e-06, + "loss": 1.7024, + "step": 26654 + }, + { + "epoch": 8.181399631675875, + "grad_norm": 0.1686297208070755, + "learning_rate": 8.428245399553559e-06, + "loss": 1.7669, + "step": 26655 + }, + { + "epoch": 8.1817065684469, + "grad_norm": 0.1667923480272293, + "learning_rate": 8.425483853098848e-06, + "loss": 1.6928, + "step": 26656 + }, + { + "epoch": 8.182013505217926, + "grad_norm": 0.16002421081066132, + "learning_rate": 8.422722717507874e-06, + "loss": 1.7058, + "step": 26657 + }, + { + "epoch": 8.182320441988951, + "grad_norm": 0.1531311571598053, + "learning_rate": 8.419961992807928e-06, + "loss": 1.7096, + "step": 26658 + }, + { + "epoch": 8.182627378759975, + "grad_norm": 0.16212326288223267, + "learning_rate": 8.417201679026282e-06, + "loss": 1.6849, + "step": 26659 + }, + { + "epoch": 
8.182934315531, + "grad_norm": 0.17276698350906372, + "learning_rate": 8.414441776190224e-06, + "loss": 1.6697, + "step": 26660 + }, + { + "epoch": 8.183241252302025, + "grad_norm": 0.15050961077213287, + "learning_rate": 8.411682284327028e-06, + "loss": 1.6972, + "step": 26661 + }, + { + "epoch": 8.18354818907305, + "grad_norm": 0.14593006670475006, + "learning_rate": 8.40892320346396e-06, + "loss": 1.7005, + "step": 26662 + }, + { + "epoch": 8.183855125844076, + "grad_norm": 0.18584349751472473, + "learning_rate": 8.406164533628291e-06, + "loss": 1.7366, + "step": 26663 + }, + { + "epoch": 8.184162062615101, + "grad_norm": 0.18662385642528534, + "learning_rate": 8.403406274847287e-06, + "loss": 1.77, + "step": 26664 + }, + { + "epoch": 8.184468999386127, + "grad_norm": 0.1735418438911438, + "learning_rate": 8.4006484271482e-06, + "loss": 1.692, + "step": 26665 + }, + { + "epoch": 8.184775936157152, + "grad_norm": 0.22115837037563324, + "learning_rate": 8.397890990558283e-06, + "loss": 1.7321, + "step": 26666 + }, + { + "epoch": 8.185082872928177, + "grad_norm": 0.1662493795156479, + "learning_rate": 8.395133965104796e-06, + "loss": 1.7016, + "step": 26667 + }, + { + "epoch": 8.185389809699203, + "grad_norm": 0.20966672897338867, + "learning_rate": 8.392377350814967e-06, + "loss": 1.6703, + "step": 26668 + }, + { + "epoch": 8.185696746470228, + "grad_norm": 0.16722753643989563, + "learning_rate": 8.389621147716076e-06, + "loss": 1.7429, + "step": 26669 + }, + { + "epoch": 8.186003683241251, + "grad_norm": 0.20280788838863373, + "learning_rate": 8.386865355835316e-06, + "loss": 1.7155, + "step": 26670 + }, + { + "epoch": 8.186310620012277, + "grad_norm": 0.20596744120121002, + "learning_rate": 8.384109975199967e-06, + "loss": 1.7266, + "step": 26671 + }, + { + "epoch": 8.186617556783302, + "grad_norm": 0.1525292545557022, + "learning_rate": 8.381355005837205e-06, + "loss": 1.6692, + "step": 26672 + }, + { + "epoch": 8.186924493554327, + "grad_norm": 
0.21745061874389648, + "learning_rate": 8.378600447774304e-06, + "loss": 1.7048, + "step": 26673 + }, + { + "epoch": 8.187231430325353, + "grad_norm": 0.2355356216430664, + "learning_rate": 8.375846301038465e-06, + "loss": 1.7842, + "step": 26674 + }, + { + "epoch": 8.187538367096378, + "grad_norm": 0.18660607933998108, + "learning_rate": 8.37309256565691e-06, + "loss": 1.698, + "step": 26675 + }, + { + "epoch": 8.187845303867404, + "grad_norm": 0.1690683364868164, + "learning_rate": 8.370339241656855e-06, + "loss": 1.6967, + "step": 26676 + }, + { + "epoch": 8.188152240638429, + "grad_norm": 0.16226762533187866, + "learning_rate": 8.367586329065508e-06, + "loss": 1.6849, + "step": 26677 + }, + { + "epoch": 8.188459177409454, + "grad_norm": 0.192795068025589, + "learning_rate": 8.364833827910074e-06, + "loss": 1.7037, + "step": 26678 + }, + { + "epoch": 8.18876611418048, + "grad_norm": 0.13591274619102478, + "learning_rate": 8.362081738217752e-06, + "loss": 1.6517, + "step": 26679 + }, + { + "epoch": 8.189073050951505, + "grad_norm": 0.16879263520240784, + "learning_rate": 8.359330060015747e-06, + "loss": 1.6751, + "step": 26680 + }, + { + "epoch": 8.189379987722528, + "grad_norm": 0.16385328769683838, + "learning_rate": 8.356578793331243e-06, + "loss": 1.7151, + "step": 26681 + }, + { + "epoch": 8.189686924493554, + "grad_norm": 0.14804807305335999, + "learning_rate": 8.353827938191438e-06, + "loss": 1.6601, + "step": 26682 + }, + { + "epoch": 8.189993861264579, + "grad_norm": 0.1534065157175064, + "learning_rate": 8.351077494623516e-06, + "loss": 1.7664, + "step": 26683 + }, + { + "epoch": 8.190300798035604, + "grad_norm": 0.16167859733104706, + "learning_rate": 8.348327462654659e-06, + "loss": 1.6573, + "step": 26684 + }, + { + "epoch": 8.19060773480663, + "grad_norm": 0.1433487832546234, + "learning_rate": 8.34557784231203e-06, + "loss": 1.6768, + "step": 26685 + }, + { + "epoch": 8.190914671577655, + "grad_norm": 0.1636372059583664, + "learning_rate": 
8.342828633622834e-06, + "loss": 1.6648, + "step": 26686 + }, + { + "epoch": 8.19122160834868, + "grad_norm": 0.13938350975513458, + "learning_rate": 8.340079836614206e-06, + "loss": 1.6511, + "step": 26687 + }, + { + "epoch": 8.191528545119706, + "grad_norm": 0.19098511338233948, + "learning_rate": 8.337331451313346e-06, + "loss": 1.7305, + "step": 26688 + }, + { + "epoch": 8.191835481890731, + "grad_norm": 0.15734615921974182, + "learning_rate": 8.33458347774737e-06, + "loss": 1.6777, + "step": 26689 + }, + { + "epoch": 8.192142418661756, + "grad_norm": 0.1523539125919342, + "learning_rate": 8.331835915943475e-06, + "loss": 1.7173, + "step": 26690 + }, + { + "epoch": 8.192449355432782, + "grad_norm": 0.17726896703243256, + "learning_rate": 8.329088765928799e-06, + "loss": 1.6904, + "step": 26691 + }, + { + "epoch": 8.192756292203805, + "grad_norm": 0.18954375386238098, + "learning_rate": 8.326342027730493e-06, + "loss": 1.7062, + "step": 26692 + }, + { + "epoch": 8.19306322897483, + "grad_norm": 0.21199224889278412, + "learning_rate": 8.323595701375702e-06, + "loss": 1.7747, + "step": 26693 + }, + { + "epoch": 8.193370165745856, + "grad_norm": 0.15305975079536438, + "learning_rate": 8.320849786891566e-06, + "loss": 1.6829, + "step": 26694 + }, + { + "epoch": 8.193677102516881, + "grad_norm": 0.1407271921634674, + "learning_rate": 8.318104284305216e-06, + "loss": 1.6774, + "step": 26695 + }, + { + "epoch": 8.193984039287907, + "grad_norm": 0.15379782021045685, + "learning_rate": 8.315359193643796e-06, + "loss": 1.7037, + "step": 26696 + }, + { + "epoch": 8.194290976058932, + "grad_norm": 0.21377405524253845, + "learning_rate": 8.31261451493443e-06, + "loss": 1.7258, + "step": 26697 + }, + { + "epoch": 8.194597912829957, + "grad_norm": 0.1975884586572647, + "learning_rate": 8.309870248204238e-06, + "loss": 1.718, + "step": 26698 + }, + { + "epoch": 8.194904849600983, + "grad_norm": 0.1985187530517578, + "learning_rate": 8.307126393480341e-06, + "loss": 1.7199, + 
"step": 26699 + }, + { + "epoch": 8.195211786372008, + "grad_norm": 0.17664451897144318, + "learning_rate": 8.304382950789857e-06, + "loss": 1.744, + "step": 26700 + }, + { + "epoch": 8.195518723143033, + "grad_norm": 0.16517753899097443, + "learning_rate": 8.301639920159904e-06, + "loss": 1.7289, + "step": 26701 + }, + { + "epoch": 8.195825659914057, + "grad_norm": 0.15431733429431915, + "learning_rate": 8.29889730161757e-06, + "loss": 1.6854, + "step": 26702 + }, + { + "epoch": 8.196132596685082, + "grad_norm": 0.14390075206756592, + "learning_rate": 8.296155095190005e-06, + "loss": 1.6806, + "step": 26703 + }, + { + "epoch": 8.196439533456108, + "grad_norm": 0.1450011432170868, + "learning_rate": 8.293413300904246e-06, + "loss": 1.6579, + "step": 26704 + }, + { + "epoch": 8.196746470227133, + "grad_norm": 0.20312175154685974, + "learning_rate": 8.290671918787452e-06, + "loss": 1.7053, + "step": 26705 + }, + { + "epoch": 8.197053406998158, + "grad_norm": 0.13979235291481018, + "learning_rate": 8.287930948866656e-06, + "loss": 1.6751, + "step": 26706 + }, + { + "epoch": 8.197360343769184, + "grad_norm": 0.1665562391281128, + "learning_rate": 8.28519039116899e-06, + "loss": 1.7523, + "step": 26707 + }, + { + "epoch": 8.197667280540209, + "grad_norm": 0.15326659381389618, + "learning_rate": 8.282450245721524e-06, + "loss": 1.6788, + "step": 26708 + }, + { + "epoch": 8.197974217311234, + "grad_norm": 0.14121493697166443, + "learning_rate": 8.279710512551331e-06, + "loss": 1.6351, + "step": 26709 + }, + { + "epoch": 8.19828115408226, + "grad_norm": 0.16965799033641815, + "learning_rate": 8.276971191685495e-06, + "loss": 1.7694, + "step": 26710 + }, + { + "epoch": 8.198588090853285, + "grad_norm": 0.21316587924957275, + "learning_rate": 8.274232283151085e-06, + "loss": 1.6922, + "step": 26711 + }, + { + "epoch": 8.19889502762431, + "grad_norm": 0.1613110601902008, + "learning_rate": 8.271493786975165e-06, + "loss": 1.7221, + "step": 26712 + }, + { + "epoch": 
8.199201964395334, + "grad_norm": 0.19140063226222992, + "learning_rate": 8.268755703184804e-06, + "loss": 1.7457, + "step": 26713 + }, + { + "epoch": 8.199508901166359, + "grad_norm": 0.1680840253829956, + "learning_rate": 8.26601803180706e-06, + "loss": 1.6948, + "step": 26714 + }, + { + "epoch": 8.199815837937384, + "grad_norm": 0.17642726004123688, + "learning_rate": 8.263280772868982e-06, + "loss": 1.6996, + "step": 26715 + }, + { + "epoch": 8.20012277470841, + "grad_norm": 0.21370023488998413, + "learning_rate": 8.26054392639763e-06, + "loss": 1.7585, + "step": 26716 + }, + { + "epoch": 8.200429711479435, + "grad_norm": 0.20721369981765747, + "learning_rate": 8.257807492420044e-06, + "loss": 1.7127, + "step": 26717 + }, + { + "epoch": 8.20073664825046, + "grad_norm": 0.14441120624542236, + "learning_rate": 8.255071470963272e-06, + "loss": 1.6627, + "step": 26718 + }, + { + "epoch": 8.201043585021486, + "grad_norm": 0.17547503113746643, + "learning_rate": 8.25233586205434e-06, + "loss": 1.7764, + "step": 26719 + }, + { + "epoch": 8.201350521792511, + "grad_norm": 0.1724909394979477, + "learning_rate": 8.24960066572032e-06, + "loss": 1.6978, + "step": 26720 + }, + { + "epoch": 8.201657458563536, + "grad_norm": 0.16465766727924347, + "learning_rate": 8.246865881988186e-06, + "loss": 1.7302, + "step": 26721 + }, + { + "epoch": 8.201964395334562, + "grad_norm": 0.18594282865524292, + "learning_rate": 8.244131510885023e-06, + "loss": 1.7354, + "step": 26722 + }, + { + "epoch": 8.202271332105587, + "grad_norm": 0.163459911942482, + "learning_rate": 8.241397552437803e-06, + "loss": 1.7069, + "step": 26723 + }, + { + "epoch": 8.20257826887661, + "grad_norm": 0.1712186485528946, + "learning_rate": 8.23866400667358e-06, + "loss": 1.7029, + "step": 26724 + }, + { + "epoch": 8.202885205647636, + "grad_norm": 0.155457004904747, + "learning_rate": 8.235930873619357e-06, + "loss": 1.6806, + "step": 26725 + }, + { + "epoch": 8.203192142418661, + "grad_norm": 
0.19597770273685455, + "learning_rate": 8.233198153302146e-06, + "loss": 1.7271, + "step": 26726 + }, + { + "epoch": 8.203499079189687, + "grad_norm": 0.17909370362758636, + "learning_rate": 8.230465845748946e-06, + "loss": 1.7334, + "step": 26727 + }, + { + "epoch": 8.203806015960712, + "grad_norm": 0.1566748470067978, + "learning_rate": 8.227733950986766e-06, + "loss": 1.7965, + "step": 26728 + }, + { + "epoch": 8.204112952731737, + "grad_norm": 0.23624123632907867, + "learning_rate": 8.225002469042603e-06, + "loss": 1.7154, + "step": 26729 + }, + { + "epoch": 8.204419889502763, + "grad_norm": 0.17100931704044342, + "learning_rate": 8.222271399943448e-06, + "loss": 1.6745, + "step": 26730 + }, + { + "epoch": 8.204726826273788, + "grad_norm": 0.1762385219335556, + "learning_rate": 8.219540743716298e-06, + "loss": 1.7199, + "step": 26731 + }, + { + "epoch": 8.205033763044813, + "grad_norm": 0.19741147756576538, + "learning_rate": 8.216810500388134e-06, + "loss": 1.7582, + "step": 26732 + }, + { + "epoch": 8.205340699815839, + "grad_norm": 0.14669859409332275, + "learning_rate": 8.214080669985941e-06, + "loss": 1.6859, + "step": 26733 + }, + { + "epoch": 8.205647636586862, + "grad_norm": 0.16434574127197266, + "learning_rate": 8.211351252536692e-06, + "loss": 1.7129, + "step": 26734 + }, + { + "epoch": 8.205954573357888, + "grad_norm": 0.17041419446468353, + "learning_rate": 8.208622248067361e-06, + "loss": 1.7145, + "step": 26735 + }, + { + "epoch": 8.206261510128913, + "grad_norm": 0.16507895290851593, + "learning_rate": 8.205893656604907e-06, + "loss": 1.7486, + "step": 26736 + }, + { + "epoch": 8.206568446899938, + "grad_norm": 0.19548171758651733, + "learning_rate": 8.203165478176334e-06, + "loss": 1.7135, + "step": 26737 + }, + { + "epoch": 8.206875383670964, + "grad_norm": 0.16964592039585114, + "learning_rate": 8.200437712808556e-06, + "loss": 1.703, + "step": 26738 + }, + { + "epoch": 8.207182320441989, + "grad_norm": 0.1599748432636261, + "learning_rate": 
8.197710360528571e-06, + "loss": 1.7065, + "step": 26739 + }, + { + "epoch": 8.207489257213014, + "grad_norm": 0.1665380746126175, + "learning_rate": 8.194983421363294e-06, + "loss": 1.6927, + "step": 26740 + }, + { + "epoch": 8.20779619398404, + "grad_norm": 0.13410761952400208, + "learning_rate": 8.192256895339701e-06, + "loss": 1.6373, + "step": 26741 + }, + { + "epoch": 8.208103130755065, + "grad_norm": 0.17461349070072174, + "learning_rate": 8.189530782484733e-06, + "loss": 1.7058, + "step": 26742 + }, + { + "epoch": 8.20841006752609, + "grad_norm": 0.15213793516159058, + "learning_rate": 8.186805082825327e-06, + "loss": 1.6664, + "step": 26743 + }, + { + "epoch": 8.208717004297116, + "grad_norm": 0.17611466348171234, + "learning_rate": 8.184079796388421e-06, + "loss": 1.7029, + "step": 26744 + }, + { + "epoch": 8.20902394106814, + "grad_norm": 0.16301874816417694, + "learning_rate": 8.181354923200945e-06, + "loss": 1.7024, + "step": 26745 + }, + { + "epoch": 8.209330877839164, + "grad_norm": 0.12992535531520844, + "learning_rate": 8.178630463289833e-06, + "loss": 1.6471, + "step": 26746 + }, + { + "epoch": 8.20963781461019, + "grad_norm": 0.1948312669992447, + "learning_rate": 8.175906416682006e-06, + "loss": 1.7359, + "step": 26747 + }, + { + "epoch": 8.209944751381215, + "grad_norm": 0.16086861491203308, + "learning_rate": 8.173182783404387e-06, + "loss": 1.7312, + "step": 26748 + }, + { + "epoch": 8.21025168815224, + "grad_norm": 0.20091786980628967, + "learning_rate": 8.17045956348389e-06, + "loss": 1.7038, + "step": 26749 + }, + { + "epoch": 8.210558624923266, + "grad_norm": 0.18929384648799896, + "learning_rate": 8.16773675694743e-06, + "loss": 1.7129, + "step": 26750 + }, + { + "epoch": 8.210865561694291, + "grad_norm": 0.1536511927843094, + "learning_rate": 8.16501436382191e-06, + "loss": 1.7031, + "step": 26751 + }, + { + "epoch": 8.211172498465316, + "grad_norm": 0.15490883588790894, + "learning_rate": 8.162292384134245e-06, + "loss": 1.6625, + 
"step": 26752 + }, + { + "epoch": 8.211479435236342, + "grad_norm": 0.18852801620960236, + "learning_rate": 8.159570817911311e-06, + "loss": 1.7691, + "step": 26753 + }, + { + "epoch": 8.211786372007367, + "grad_norm": 0.21555860340595245, + "learning_rate": 8.15684966518005e-06, + "loss": 1.7919, + "step": 26754 + }, + { + "epoch": 8.212093308778392, + "grad_norm": 0.19634628295898438, + "learning_rate": 8.154128925967297e-06, + "loss": 1.7174, + "step": 26755 + }, + { + "epoch": 8.212400245549416, + "grad_norm": 0.15788821876049042, + "learning_rate": 8.151408600299998e-06, + "loss": 1.6956, + "step": 26756 + }, + { + "epoch": 8.212707182320441, + "grad_norm": 0.17314517498016357, + "learning_rate": 8.148688688204975e-06, + "loss": 1.75, + "step": 26757 + }, + { + "epoch": 8.213014119091467, + "grad_norm": 0.15606027841567993, + "learning_rate": 8.145969189709158e-06, + "loss": 1.6696, + "step": 26758 + }, + { + "epoch": 8.213321055862492, + "grad_norm": 0.17407195270061493, + "learning_rate": 8.143250104839406e-06, + "loss": 1.7279, + "step": 26759 + }, + { + "epoch": 8.213627992633517, + "grad_norm": 0.1557784378528595, + "learning_rate": 8.140531433622589e-06, + "loss": 1.7221, + "step": 26760 + }, + { + "epoch": 8.213934929404543, + "grad_norm": 0.1544533222913742, + "learning_rate": 8.137813176085574e-06, + "loss": 1.6805, + "step": 26761 + }, + { + "epoch": 8.214241866175568, + "grad_norm": 0.1605178564786911, + "learning_rate": 8.135095332255222e-06, + "loss": 1.7783, + "step": 26762 + }, + { + "epoch": 8.214548802946593, + "grad_norm": 0.14513778686523438, + "learning_rate": 8.1323779021584e-06, + "loss": 1.6933, + "step": 26763 + }, + { + "epoch": 8.214855739717619, + "grad_norm": 0.1282239407300949, + "learning_rate": 8.12966088582196e-06, + "loss": 1.6598, + "step": 26764 + }, + { + "epoch": 8.215162676488644, + "grad_norm": 0.1373436003923416, + "learning_rate": 8.126944283272748e-06, + "loss": 1.6227, + "step": 26765 + }, + { + "epoch": 
8.215469613259668, + "grad_norm": 0.1634049266576767, + "learning_rate": 8.124228094537617e-06, + "loss": 1.7346, + "step": 26766 + }, + { + "epoch": 8.215776550030693, + "grad_norm": 0.16928012669086456, + "learning_rate": 8.12151231964341e-06, + "loss": 1.6958, + "step": 26767 + }, + { + "epoch": 8.216083486801718, + "grad_norm": 0.15764811635017395, + "learning_rate": 8.11879695861696e-06, + "loss": 1.6965, + "step": 26768 + }, + { + "epoch": 8.216390423572744, + "grad_norm": 0.1514546275138855, + "learning_rate": 8.11608201148511e-06, + "loss": 1.6804, + "step": 26769 + }, + { + "epoch": 8.216697360343769, + "grad_norm": 0.17304199934005737, + "learning_rate": 8.113367478274686e-06, + "loss": 1.7869, + "step": 26770 + }, + { + "epoch": 8.217004297114794, + "grad_norm": 0.19664239883422852, + "learning_rate": 8.11065335901251e-06, + "loss": 1.7082, + "step": 26771 + }, + { + "epoch": 8.21731123388582, + "grad_norm": 0.13926036655902863, + "learning_rate": 8.107939653725405e-06, + "loss": 1.6758, + "step": 26772 + }, + { + "epoch": 8.217618170656845, + "grad_norm": 0.14624418318271637, + "learning_rate": 8.10522636244021e-06, + "loss": 1.6716, + "step": 26773 + }, + { + "epoch": 8.21792510742787, + "grad_norm": 0.15462076663970947, + "learning_rate": 8.102513485183704e-06, + "loss": 1.6953, + "step": 26774 + }, + { + "epoch": 8.218232044198896, + "grad_norm": 0.21293844282627106, + "learning_rate": 8.099801021982729e-06, + "loss": 1.69, + "step": 26775 + }, + { + "epoch": 8.218538980969921, + "grad_norm": 0.16696035861968994, + "learning_rate": 8.09708897286408e-06, + "loss": 1.721, + "step": 26776 + }, + { + "epoch": 8.218845917740945, + "grad_norm": 0.1741570085287094, + "learning_rate": 8.094377337854553e-06, + "loss": 1.69, + "step": 26777 + }, + { + "epoch": 8.21915285451197, + "grad_norm": 0.17061090469360352, + "learning_rate": 8.091666116980957e-06, + "loss": 1.6886, + "step": 26778 + }, + { + "epoch": 8.219459791282995, + "grad_norm": 
0.16761218011379242, + "learning_rate": 8.088955310270075e-06, + "loss": 1.6951, + "step": 26779 + }, + { + "epoch": 8.21976672805402, + "grad_norm": 0.21173669397830963, + "learning_rate": 8.086244917748703e-06, + "loss": 1.7714, + "step": 26780 + }, + { + "epoch": 8.220073664825046, + "grad_norm": 0.1629040539264679, + "learning_rate": 8.083534939443626e-06, + "loss": 1.6712, + "step": 26781 + }, + { + "epoch": 8.220380601596071, + "grad_norm": 0.14620709419250488, + "learning_rate": 8.080825375381623e-06, + "loss": 1.6638, + "step": 26782 + }, + { + "epoch": 8.220687538367097, + "grad_norm": 0.16511180996894836, + "learning_rate": 8.078116225589477e-06, + "loss": 1.6739, + "step": 26783 + }, + { + "epoch": 8.220994475138122, + "grad_norm": 0.155776247382164, + "learning_rate": 8.075407490093951e-06, + "loss": 1.7098, + "step": 26784 + }, + { + "epoch": 8.221301411909147, + "grad_norm": 0.18273292481899261, + "learning_rate": 8.072699168921826e-06, + "loss": 1.7595, + "step": 26785 + }, + { + "epoch": 8.221608348680173, + "grad_norm": 0.20691648125648499, + "learning_rate": 8.069991262099862e-06, + "loss": 1.7044, + "step": 26786 + }, + { + "epoch": 8.221915285451198, + "grad_norm": 0.13940884172916412, + "learning_rate": 8.06728376965482e-06, + "loss": 1.6651, + "step": 26787 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.1676037758588791, + "learning_rate": 8.064576691613457e-06, + "loss": 1.7215, + "step": 26788 + }, + { + "epoch": 8.222529158993247, + "grad_norm": 0.18815284967422485, + "learning_rate": 8.06187002800251e-06, + "loss": 1.771, + "step": 26789 + }, + { + "epoch": 8.222836095764272, + "grad_norm": 0.16505572199821472, + "learning_rate": 8.059163778848771e-06, + "loss": 1.7072, + "step": 26790 + }, + { + "epoch": 8.223143032535297, + "grad_norm": 0.15086548030376434, + "learning_rate": 8.056457944178936e-06, + "loss": 1.6874, + "step": 26791 + }, + { + "epoch": 8.223449969306323, + "grad_norm": 0.13147135078907013, + "learning_rate": 
8.053752524019792e-06, + "loss": 1.6604, + "step": 26792 + }, + { + "epoch": 8.223756906077348, + "grad_norm": 0.13695500791072845, + "learning_rate": 8.051047518398024e-06, + "loss": 1.6498, + "step": 26793 + }, + { + "epoch": 8.224063842848373, + "grad_norm": 0.16654162108898163, + "learning_rate": 8.048342927340407e-06, + "loss": 1.6993, + "step": 26794 + }, + { + "epoch": 8.224370779619399, + "grad_norm": 0.15318933129310608, + "learning_rate": 8.045638750873652e-06, + "loss": 1.716, + "step": 26795 + }, + { + "epoch": 8.224677716390424, + "grad_norm": 0.17502783238887787, + "learning_rate": 8.04293498902448e-06, + "loss": 1.6953, + "step": 26796 + }, + { + "epoch": 8.22498465316145, + "grad_norm": 0.17295950651168823, + "learning_rate": 8.040231641819623e-06, + "loss": 1.6794, + "step": 26797 + }, + { + "epoch": 8.225291589932475, + "grad_norm": 0.14702807366847992, + "learning_rate": 8.03752870928579e-06, + "loss": 1.6389, + "step": 26798 + }, + { + "epoch": 8.225598526703498, + "grad_norm": 0.21157263219356537, + "learning_rate": 8.034826191449691e-06, + "loss": 1.6817, + "step": 26799 + }, + { + "epoch": 8.225905463474524, + "grad_norm": 0.1675570011138916, + "learning_rate": 8.03212408833804e-06, + "loss": 1.7636, + "step": 26800 + }, + { + "epoch": 8.226212400245549, + "grad_norm": 0.24485285580158234, + "learning_rate": 8.029422399977531e-06, + "loss": 1.7017, + "step": 26801 + }, + { + "epoch": 8.226519337016574, + "grad_norm": 0.15588007867336273, + "learning_rate": 8.026721126394871e-06, + "loss": 1.6781, + "step": 26802 + }, + { + "epoch": 8.2268262737876, + "grad_norm": 0.16810667514801025, + "learning_rate": 8.024020267616756e-06, + "loss": 1.7046, + "step": 26803 + }, + { + "epoch": 8.227133210558625, + "grad_norm": 0.2029539942741394, + "learning_rate": 8.021319823669875e-06, + "loss": 1.6735, + "step": 26804 + }, + { + "epoch": 8.22744014732965, + "grad_norm": 0.18706166744232178, + "learning_rate": 8.018619794580917e-06, + "loss": 1.6818, + 
"step": 26805 + }, + { + "epoch": 8.227747084100676, + "grad_norm": 0.18221300840377808, + "learning_rate": 8.01592018037655e-06, + "loss": 1.7349, + "step": 26806 + }, + { + "epoch": 8.228054020871701, + "grad_norm": 0.20281676948070526, + "learning_rate": 8.013220981083492e-06, + "loss": 1.6942, + "step": 26807 + }, + { + "epoch": 8.228360957642726, + "grad_norm": 0.16217820346355438, + "learning_rate": 8.01052219672837e-06, + "loss": 1.6693, + "step": 26808 + }, + { + "epoch": 8.22866789441375, + "grad_norm": 0.19438619911670685, + "learning_rate": 8.007823827337901e-06, + "loss": 1.7195, + "step": 26809 + }, + { + "epoch": 8.228974831184775, + "grad_norm": 0.229817733168602, + "learning_rate": 8.005125872938707e-06, + "loss": 1.7621, + "step": 26810 + }, + { + "epoch": 8.2292817679558, + "grad_norm": 0.20305906236171722, + "learning_rate": 8.002428333557488e-06, + "loss": 1.7132, + "step": 26811 + }, + { + "epoch": 8.229588704726826, + "grad_norm": 0.16244050860404968, + "learning_rate": 7.999731209220884e-06, + "loss": 1.729, + "step": 26812 + }, + { + "epoch": 8.229895641497851, + "grad_norm": 0.18119513988494873, + "learning_rate": 7.997034499955552e-06, + "loss": 1.7431, + "step": 26813 + }, + { + "epoch": 8.230202578268877, + "grad_norm": 0.1475009173154831, + "learning_rate": 7.99433820578816e-06, + "loss": 1.7229, + "step": 26814 + }, + { + "epoch": 8.230509515039902, + "grad_norm": 0.16200442612171173, + "learning_rate": 7.991642326745314e-06, + "loss": 1.7491, + "step": 26815 + }, + { + "epoch": 8.230816451810927, + "grad_norm": 0.17432551085948944, + "learning_rate": 7.988946862853686e-06, + "loss": 1.6997, + "step": 26816 + }, + { + "epoch": 8.231123388581953, + "grad_norm": 0.2010595202445984, + "learning_rate": 7.986251814139916e-06, + "loss": 1.795, + "step": 26817 + }, + { + "epoch": 8.231430325352978, + "grad_norm": 0.15220746397972107, + "learning_rate": 7.983557180630625e-06, + "loss": 1.6912, + "step": 26818 + }, + { + "epoch": 
8.231737262124003, + "grad_norm": 0.1524961143732071, + "learning_rate": 7.980862962352454e-06, + "loss": 1.6924, + "step": 26819 + }, + { + "epoch": 8.232044198895027, + "grad_norm": 0.16850624978542328, + "learning_rate": 7.978169159332016e-06, + "loss": 1.7111, + "step": 26820 + }, + { + "epoch": 8.232351135666052, + "grad_norm": 0.19621838629245758, + "learning_rate": 7.975475771595947e-06, + "loss": 1.7237, + "step": 26821 + }, + { + "epoch": 8.232658072437077, + "grad_norm": 0.23287613689899445, + "learning_rate": 7.972782799170858e-06, + "loss": 1.7222, + "step": 26822 + }, + { + "epoch": 8.232965009208103, + "grad_norm": 0.15631796419620514, + "learning_rate": 7.970090242083344e-06, + "loss": 1.7252, + "step": 26823 + }, + { + "epoch": 8.233271945979128, + "grad_norm": 0.17921209335327148, + "learning_rate": 7.967398100360062e-06, + "loss": 1.7018, + "step": 26824 + }, + { + "epoch": 8.233578882750153, + "grad_norm": 0.16767734289169312, + "learning_rate": 7.964706374027564e-06, + "loss": 1.7457, + "step": 26825 + }, + { + "epoch": 8.233885819521179, + "grad_norm": 0.15360240638256073, + "learning_rate": 7.9620150631125e-06, + "loss": 1.6886, + "step": 26826 + }, + { + "epoch": 8.234192756292204, + "grad_norm": 0.17534345388412476, + "learning_rate": 7.959324167641413e-06, + "loss": 1.7167, + "step": 26827 + }, + { + "epoch": 8.23449969306323, + "grad_norm": 0.17453409731388092, + "learning_rate": 7.956633687640941e-06, + "loss": 1.7468, + "step": 26828 + }, + { + "epoch": 8.234806629834255, + "grad_norm": 0.1416994333267212, + "learning_rate": 7.953943623137654e-06, + "loss": 1.6991, + "step": 26829 + }, + { + "epoch": 8.23511356660528, + "grad_norm": 0.14629559218883514, + "learning_rate": 7.951253974158147e-06, + "loss": 1.6891, + "step": 26830 + }, + { + "epoch": 8.235420503376304, + "grad_norm": 0.15972918272018433, + "learning_rate": 7.948564740728998e-06, + "loss": 1.711, + "step": 26831 + }, + { + "epoch": 8.235727440147329, + "grad_norm": 
0.184038445353508, + "learning_rate": 7.945875922876761e-06, + "loss": 1.7481, + "step": 26832 + }, + { + "epoch": 8.236034376918354, + "grad_norm": 0.1788245588541031, + "learning_rate": 7.943187520628037e-06, + "loss": 1.7744, + "step": 26833 + }, + { + "epoch": 8.23634131368938, + "grad_norm": 0.18042324483394623, + "learning_rate": 7.940499534009382e-06, + "loss": 1.6905, + "step": 26834 + }, + { + "epoch": 8.236648250460405, + "grad_norm": 0.16115914285182953, + "learning_rate": 7.937811963047364e-06, + "loss": 1.6923, + "step": 26835 + }, + { + "epoch": 8.23695518723143, + "grad_norm": 0.18805812299251556, + "learning_rate": 7.935124807768546e-06, + "loss": 1.7636, + "step": 26836 + }, + { + "epoch": 8.237262124002456, + "grad_norm": 0.14013023674488068, + "learning_rate": 7.932438068199477e-06, + "loss": 1.657, + "step": 26837 + }, + { + "epoch": 8.237569060773481, + "grad_norm": 0.17245794832706451, + "learning_rate": 7.929751744366709e-06, + "loss": 1.7162, + "step": 26838 + }, + { + "epoch": 8.237875997544506, + "grad_norm": 0.20234355330467224, + "learning_rate": 7.927065836296793e-06, + "loss": 1.741, + "step": 26839 + }, + { + "epoch": 8.238182934315532, + "grad_norm": 0.1728539764881134, + "learning_rate": 7.924380344016264e-06, + "loss": 1.7037, + "step": 26840 + }, + { + "epoch": 8.238489871086557, + "grad_norm": 0.20881959795951843, + "learning_rate": 7.921695267551688e-06, + "loss": 1.7446, + "step": 26841 + }, + { + "epoch": 8.23879680785758, + "grad_norm": 0.15921615064144135, + "learning_rate": 7.919010606929562e-06, + "loss": 1.6777, + "step": 26842 + }, + { + "epoch": 8.239103744628606, + "grad_norm": 0.15142741799354553, + "learning_rate": 7.916326362176462e-06, + "loss": 1.6647, + "step": 26843 + }, + { + "epoch": 8.239410681399631, + "grad_norm": 0.14777293801307678, + "learning_rate": 7.913642533318865e-06, + "loss": 1.7008, + "step": 26844 + }, + { + "epoch": 8.239717618170657, + "grad_norm": 0.14506451785564423, + "learning_rate": 
7.910959120383332e-06, + "loss": 1.7156, + "step": 26845 + }, + { + "epoch": 8.240024554941682, + "grad_norm": 0.17617642879486084, + "learning_rate": 7.908276123396369e-06, + "loss": 1.707, + "step": 26846 + }, + { + "epoch": 8.240331491712707, + "grad_norm": 0.1640050709247589, + "learning_rate": 7.905593542384493e-06, + "loss": 1.6965, + "step": 26847 + }, + { + "epoch": 8.240638428483733, + "grad_norm": 0.2035178244113922, + "learning_rate": 7.902911377374229e-06, + "loss": 1.7679, + "step": 26848 + }, + { + "epoch": 8.240945365254758, + "grad_norm": 0.16591937839984894, + "learning_rate": 7.900229628392041e-06, + "loss": 1.705, + "step": 26849 + }, + { + "epoch": 8.241252302025783, + "grad_norm": 0.1770060807466507, + "learning_rate": 7.897548295464474e-06, + "loss": 1.6812, + "step": 26850 + }, + { + "epoch": 8.241559238796809, + "grad_norm": 0.1637604683637619, + "learning_rate": 7.89486737861801e-06, + "loss": 1.718, + "step": 26851 + }, + { + "epoch": 8.241866175567832, + "grad_norm": 0.1458534151315689, + "learning_rate": 7.892186877879148e-06, + "loss": 1.6834, + "step": 26852 + }, + { + "epoch": 8.242173112338858, + "grad_norm": 0.14899462461471558, + "learning_rate": 7.889506793274371e-06, + "loss": 1.6815, + "step": 26853 + }, + { + "epoch": 8.242480049109883, + "grad_norm": 0.16069386899471283, + "learning_rate": 7.88682712483017e-06, + "loss": 1.7522, + "step": 26854 + }, + { + "epoch": 8.242786985880908, + "grad_norm": 0.17499712109565735, + "learning_rate": 7.884147872573034e-06, + "loss": 1.6805, + "step": 26855 + }, + { + "epoch": 8.243093922651934, + "grad_norm": 0.1455364227294922, + "learning_rate": 7.881469036529427e-06, + "loss": 1.6797, + "step": 26856 + }, + { + "epoch": 8.243400859422959, + "grad_norm": 0.2292124629020691, + "learning_rate": 7.878790616725818e-06, + "loss": 1.6923, + "step": 26857 + }, + { + "epoch": 8.243707796193984, + "grad_norm": 0.17365983128547668, + "learning_rate": 7.876112613188713e-06, + "loss": 1.713, + 
"step": 26858 + }, + { + "epoch": 8.24401473296501, + "grad_norm": 0.17498542368412018, + "learning_rate": 7.873435025944525e-06, + "loss": 1.6834, + "step": 26859 + }, + { + "epoch": 8.244321669736035, + "grad_norm": 0.19340896606445312, + "learning_rate": 7.870757855019772e-06, + "loss": 1.7246, + "step": 26860 + }, + { + "epoch": 8.24462860650706, + "grad_norm": 0.16443613171577454, + "learning_rate": 7.868081100440855e-06, + "loss": 1.7217, + "step": 26861 + }, + { + "epoch": 8.244935543278086, + "grad_norm": 0.1470339596271515, + "learning_rate": 7.865404762234268e-06, + "loss": 1.6504, + "step": 26862 + }, + { + "epoch": 8.245242480049109, + "grad_norm": 0.14689552783966064, + "learning_rate": 7.862728840426453e-06, + "loss": 1.7231, + "step": 26863 + }, + { + "epoch": 8.245549416820134, + "grad_norm": 0.25354984402656555, + "learning_rate": 7.860053335043843e-06, + "loss": 1.7951, + "step": 26864 + }, + { + "epoch": 8.24585635359116, + "grad_norm": 0.1774766445159912, + "learning_rate": 7.857378246112896e-06, + "loss": 1.6702, + "step": 26865 + }, + { + "epoch": 8.246163290362185, + "grad_norm": 0.16365554928779602, + "learning_rate": 7.854703573660015e-06, + "loss": 1.6945, + "step": 26866 + }, + { + "epoch": 8.24647022713321, + "grad_norm": 0.15043000876903534, + "learning_rate": 7.852029317711669e-06, + "loss": 1.6341, + "step": 26867 + }, + { + "epoch": 8.246777163904236, + "grad_norm": 0.18268270790576935, + "learning_rate": 7.849355478294274e-06, + "loss": 1.7246, + "step": 26868 + }, + { + "epoch": 8.247084100675261, + "grad_norm": 0.2022860199213028, + "learning_rate": 7.84668205543425e-06, + "loss": 1.7527, + "step": 26869 + }, + { + "epoch": 8.247391037446286, + "grad_norm": 0.15406467020511627, + "learning_rate": 7.844009049158024e-06, + "loss": 1.6678, + "step": 26870 + }, + { + "epoch": 8.247697974217312, + "grad_norm": 0.168084055185318, + "learning_rate": 7.841336459492005e-06, + "loss": 1.7018, + "step": 26871 + }, + { + "epoch": 
8.248004910988337, + "grad_norm": 0.15184715390205383, + "learning_rate": 7.83866428646261e-06, + "loss": 1.6636, + "step": 26872 + }, + { + "epoch": 8.248311847759362, + "grad_norm": 0.18516378104686737, + "learning_rate": 7.835992530096248e-06, + "loss": 1.7746, + "step": 26873 + }, + { + "epoch": 8.248618784530386, + "grad_norm": 0.22552374005317688, + "learning_rate": 7.833321190419313e-06, + "loss": 1.7307, + "step": 26874 + }, + { + "epoch": 8.248925721301411, + "grad_norm": 0.14845159649848938, + "learning_rate": 7.830650267458228e-06, + "loss": 1.6831, + "step": 26875 + }, + { + "epoch": 8.249232658072437, + "grad_norm": 0.17764155566692352, + "learning_rate": 7.827979761239356e-06, + "loss": 1.7569, + "step": 26876 + }, + { + "epoch": 8.249539594843462, + "grad_norm": 0.13525958359241486, + "learning_rate": 7.825309671789128e-06, + "loss": 1.6447, + "step": 26877 + }, + { + "epoch": 8.249846531614487, + "grad_norm": 0.1541098952293396, + "learning_rate": 7.822639999133885e-06, + "loss": 1.7054, + "step": 26878 + }, + { + "epoch": 8.250153468385513, + "grad_norm": 0.1462734043598175, + "learning_rate": 7.819970743300042e-06, + "loss": 1.6801, + "step": 26879 + }, + { + "epoch": 8.250460405156538, + "grad_norm": 0.16271938383579254, + "learning_rate": 7.817301904313979e-06, + "loss": 1.7342, + "step": 26880 + }, + { + "epoch": 8.250767341927563, + "grad_norm": 0.18730363249778748, + "learning_rate": 7.814633482202055e-06, + "loss": 1.7656, + "step": 26881 + }, + { + "epoch": 8.251074278698589, + "grad_norm": 0.1343161165714264, + "learning_rate": 7.811965476990663e-06, + "loss": 1.6738, + "step": 26882 + }, + { + "epoch": 8.251381215469614, + "grad_norm": 0.18782657384872437, + "learning_rate": 7.809297888706135e-06, + "loss": 1.6946, + "step": 26883 + }, + { + "epoch": 8.25168815224064, + "grad_norm": 0.16619306802749634, + "learning_rate": 7.806630717374862e-06, + "loss": 1.7024, + "step": 26884 + }, + { + "epoch": 8.251995089011663, + "grad_norm": 
0.18570290505886078, + "learning_rate": 7.803963963023192e-06, + "loss": 1.7602, + "step": 26885 + }, + { + "epoch": 8.252302025782688, + "grad_norm": 0.19790740311145782, + "learning_rate": 7.80129762567749e-06, + "loss": 1.6965, + "step": 26886 + }, + { + "epoch": 8.252608962553714, + "grad_norm": 0.17269279062747955, + "learning_rate": 7.79863170536409e-06, + "loss": 1.7585, + "step": 26887 + }, + { + "epoch": 8.252915899324739, + "grad_norm": 0.17961835861206055, + "learning_rate": 7.79596620210935e-06, + "loss": 1.6992, + "step": 26888 + }, + { + "epoch": 8.253222836095764, + "grad_norm": 0.15848924219608307, + "learning_rate": 7.793301115939611e-06, + "loss": 1.6849, + "step": 26889 + }, + { + "epoch": 8.25352977286679, + "grad_norm": 0.16328901052474976, + "learning_rate": 7.790636446881205e-06, + "loss": 1.7049, + "step": 26890 + }, + { + "epoch": 8.253836709637815, + "grad_norm": 0.15410196781158447, + "learning_rate": 7.787972194960463e-06, + "loss": 1.6764, + "step": 26891 + }, + { + "epoch": 8.25414364640884, + "grad_norm": 0.15541456639766693, + "learning_rate": 7.78530836020374e-06, + "loss": 1.6692, + "step": 26892 + }, + { + "epoch": 8.254450583179866, + "grad_norm": 0.1663745492696762, + "learning_rate": 7.782644942637318e-06, + "loss": 1.708, + "step": 26893 + }, + { + "epoch": 8.254757519950891, + "grad_norm": 0.2212733030319214, + "learning_rate": 7.779981942287567e-06, + "loss": 1.7978, + "step": 26894 + }, + { + "epoch": 8.255064456721914, + "grad_norm": 0.15269914269447327, + "learning_rate": 7.777319359180756e-06, + "loss": 1.6688, + "step": 26895 + }, + { + "epoch": 8.25537139349294, + "grad_norm": 0.18167565762996674, + "learning_rate": 7.774657193343238e-06, + "loss": 1.7394, + "step": 26896 + }, + { + "epoch": 8.255678330263965, + "grad_norm": 0.18649235367774963, + "learning_rate": 7.771995444801306e-06, + "loss": 1.7438, + "step": 26897 + }, + { + "epoch": 8.25598526703499, + "grad_norm": 0.14753280580043793, + "learning_rate": 
7.769334113581267e-06, + "loss": 1.6624, + "step": 26898 + }, + { + "epoch": 8.256292203806016, + "grad_norm": 0.1815260797739029, + "learning_rate": 7.76667319970943e-06, + "loss": 1.7091, + "step": 26899 + }, + { + "epoch": 8.256599140577041, + "grad_norm": 0.18099220097064972, + "learning_rate": 7.764012703212059e-06, + "loss": 1.7285, + "step": 26900 + }, + { + "epoch": 8.256906077348066, + "grad_norm": 0.15976406633853912, + "learning_rate": 7.76135262411548e-06, + "loss": 1.7038, + "step": 26901 + }, + { + "epoch": 8.257213014119092, + "grad_norm": 0.20424988865852356, + "learning_rate": 7.758692962445974e-06, + "loss": 1.7398, + "step": 26902 + }, + { + "epoch": 8.257519950890117, + "grad_norm": 0.17021317780017853, + "learning_rate": 7.756033718229816e-06, + "loss": 1.7422, + "step": 26903 + }, + { + "epoch": 8.257826887661142, + "grad_norm": 0.2599583566188812, + "learning_rate": 7.753374891493298e-06, + "loss": 1.6943, + "step": 26904 + }, + { + "epoch": 8.258133824432168, + "grad_norm": 0.16305646300315857, + "learning_rate": 7.750716482262693e-06, + "loss": 1.7129, + "step": 26905 + }, + { + "epoch": 8.258440761203191, + "grad_norm": 0.136509507894516, + "learning_rate": 7.74805849056427e-06, + "loss": 1.666, + "step": 26906 + }, + { + "epoch": 8.258747697974217, + "grad_norm": 0.14928071200847626, + "learning_rate": 7.745400916424294e-06, + "loss": 1.6842, + "step": 26907 + }, + { + "epoch": 8.259054634745242, + "grad_norm": 0.20410865545272827, + "learning_rate": 7.74274375986902e-06, + "loss": 1.7376, + "step": 26908 + }, + { + "epoch": 8.259361571516267, + "grad_norm": 0.16844697296619415, + "learning_rate": 7.740087020924746e-06, + "loss": 1.7125, + "step": 26909 + }, + { + "epoch": 8.259668508287293, + "grad_norm": 0.1874905675649643, + "learning_rate": 7.737430699617681e-06, + "loss": 1.7534, + "step": 26910 + }, + { + "epoch": 8.259975445058318, + "grad_norm": 0.15867100656032562, + "learning_rate": 7.734774795974114e-06, + "loss": 1.7329, + 
"step": 26911 + }, + { + "epoch": 8.260282381829343, + "grad_norm": 0.14987660944461823, + "learning_rate": 7.732119310020258e-06, + "loss": 1.7038, + "step": 26912 + }, + { + "epoch": 8.260589318600369, + "grad_norm": 0.259883314371109, + "learning_rate": 7.729464241782381e-06, + "loss": 1.7677, + "step": 26913 + }, + { + "epoch": 8.260896255371394, + "grad_norm": 0.2080366462469101, + "learning_rate": 7.726809591286716e-06, + "loss": 1.7662, + "step": 26914 + }, + { + "epoch": 8.26120319214242, + "grad_norm": 0.1707276701927185, + "learning_rate": 7.724155358559492e-06, + "loss": 1.671, + "step": 26915 + }, + { + "epoch": 8.261510128913443, + "grad_norm": 0.17241668701171875, + "learning_rate": 7.721501543626958e-06, + "loss": 1.7227, + "step": 26916 + }, + { + "epoch": 8.261817065684468, + "grad_norm": 0.18578803539276123, + "learning_rate": 7.718848146515301e-06, + "loss": 1.6962, + "step": 26917 + }, + { + "epoch": 8.262124002455494, + "grad_norm": 0.16692428290843964, + "learning_rate": 7.716195167250778e-06, + "loss": 1.6918, + "step": 26918 + }, + { + "epoch": 8.262430939226519, + "grad_norm": 0.18908677995204926, + "learning_rate": 7.713542605859602e-06, + "loss": 1.7271, + "step": 26919 + }, + { + "epoch": 8.262737875997544, + "grad_norm": 0.2003175914287567, + "learning_rate": 7.710890462367981e-06, + "loss": 1.729, + "step": 26920 + }, + { + "epoch": 8.26304481276857, + "grad_norm": 0.16058455407619476, + "learning_rate": 7.708238736802125e-06, + "loss": 1.671, + "step": 26921 + }, + { + "epoch": 8.263351749539595, + "grad_norm": 0.1803000271320343, + "learning_rate": 7.705587429188244e-06, + "loss": 1.7582, + "step": 26922 + }, + { + "epoch": 8.26365868631062, + "grad_norm": 0.218659445643425, + "learning_rate": 7.70293653955254e-06, + "loss": 1.7431, + "step": 26923 + }, + { + "epoch": 8.263965623081646, + "grad_norm": 0.13701553642749786, + "learning_rate": 7.700286067921204e-06, + "loss": 1.6806, + "step": 26924 + }, + { + "epoch": 
8.264272559852671, + "grad_norm": 0.15342164039611816, + "learning_rate": 7.697636014320436e-06, + "loss": 1.6501, + "step": 26925 + }, + { + "epoch": 8.264579496623696, + "grad_norm": 0.18738442659378052, + "learning_rate": 7.69498637877642e-06, + "loss": 1.7032, + "step": 26926 + }, + { + "epoch": 8.26488643339472, + "grad_norm": 0.14805950224399567, + "learning_rate": 7.692337161315338e-06, + "loss": 1.6641, + "step": 26927 + }, + { + "epoch": 8.265193370165745, + "grad_norm": 0.18155299127101898, + "learning_rate": 7.689688361963398e-06, + "loss": 1.6967, + "step": 26928 + }, + { + "epoch": 8.26550030693677, + "grad_norm": 0.13954955339431763, + "learning_rate": 7.68703998074673e-06, + "loss": 1.6865, + "step": 26929 + }, + { + "epoch": 8.265807243707796, + "grad_norm": 0.1464248150587082, + "learning_rate": 7.684392017691549e-06, + "loss": 1.6702, + "step": 26930 + }, + { + "epoch": 8.266114180478821, + "grad_norm": 0.16407039761543274, + "learning_rate": 7.68174447282401e-06, + "loss": 1.7265, + "step": 26931 + }, + { + "epoch": 8.266421117249847, + "grad_norm": 0.13243085145950317, + "learning_rate": 7.679097346170272e-06, + "loss": 1.67, + "step": 26932 + }, + { + "epoch": 8.266728054020872, + "grad_norm": 0.18284925818443298, + "learning_rate": 7.67645063775651e-06, + "loss": 1.7524, + "step": 26933 + }, + { + "epoch": 8.267034990791897, + "grad_norm": 0.16042175889015198, + "learning_rate": 7.673804347608849e-06, + "loss": 1.7244, + "step": 26934 + }, + { + "epoch": 8.267341927562923, + "grad_norm": 0.18213023245334625, + "learning_rate": 7.67115847575347e-06, + "loss": 1.7241, + "step": 26935 + }, + { + "epoch": 8.267648864333948, + "grad_norm": 0.1590288132429123, + "learning_rate": 7.668513022216517e-06, + "loss": 1.7056, + "step": 26936 + }, + { + "epoch": 8.267955801104973, + "grad_norm": 0.17236095666885376, + "learning_rate": 7.665867987024122e-06, + "loss": 1.7251, + "step": 26937 + }, + { + "epoch": 8.268262737875997, + "grad_norm": 
0.14264018833637238, + "learning_rate": 7.663223370202439e-06, + "loss": 1.6672, + "step": 26938 + }, + { + "epoch": 8.268569674647022, + "grad_norm": 0.15768232941627502, + "learning_rate": 7.660579171777599e-06, + "loss": 1.6846, + "step": 26939 + }, + { + "epoch": 8.268876611418047, + "grad_norm": 0.12978656589984894, + "learning_rate": 7.657935391775727e-06, + "loss": 1.6615, + "step": 26940 + }, + { + "epoch": 8.269183548189073, + "grad_norm": 0.18869580328464508, + "learning_rate": 7.655292030222955e-06, + "loss": 1.7056, + "step": 26941 + }, + { + "epoch": 8.269490484960098, + "grad_norm": 0.16662544012069702, + "learning_rate": 7.652649087145409e-06, + "loss": 1.7559, + "step": 26942 + }, + { + "epoch": 8.269797421731123, + "grad_norm": 0.20138496160507202, + "learning_rate": 7.650006562569201e-06, + "loss": 1.7428, + "step": 26943 + }, + { + "epoch": 8.270104358502149, + "grad_norm": 0.16201090812683105, + "learning_rate": 7.647364456520439e-06, + "loss": 1.7456, + "step": 26944 + }, + { + "epoch": 8.270411295273174, + "grad_norm": 0.16562269628047943, + "learning_rate": 7.644722769025275e-06, + "loss": 1.7282, + "step": 26945 + }, + { + "epoch": 8.2707182320442, + "grad_norm": 0.1434047371149063, + "learning_rate": 7.642081500109754e-06, + "loss": 1.6959, + "step": 26946 + }, + { + "epoch": 8.271025168815225, + "grad_norm": 0.1424918919801712, + "learning_rate": 7.63944064980004e-06, + "loss": 1.7133, + "step": 26947 + }, + { + "epoch": 8.27133210558625, + "grad_norm": 0.23540155589580536, + "learning_rate": 7.636800218122176e-06, + "loss": 1.7156, + "step": 26948 + }, + { + "epoch": 8.271639042357274, + "grad_norm": 0.1890154927968979, + "learning_rate": 7.634160205102292e-06, + "loss": 1.7452, + "step": 26949 + }, + { + "epoch": 8.271945979128299, + "grad_norm": 0.1555023491382599, + "learning_rate": 7.631520610766486e-06, + "loss": 1.7096, + "step": 26950 + }, + { + "epoch": 8.272252915899324, + "grad_norm": 0.16713875532150269, + "learning_rate": 
7.628881435140794e-06, + "loss": 1.6832, + "step": 26951 + }, + { + "epoch": 8.27255985267035, + "grad_norm": 0.18925394117832184, + "learning_rate": 7.626242678251349e-06, + "loss": 1.7755, + "step": 26952 + }, + { + "epoch": 8.272866789441375, + "grad_norm": 0.19905491173267365, + "learning_rate": 7.6236043401242074e-06, + "loss": 1.6915, + "step": 26953 + }, + { + "epoch": 8.2731737262124, + "grad_norm": 0.13694030046463013, + "learning_rate": 7.620966420785447e-06, + "loss": 1.6935, + "step": 26954 + }, + { + "epoch": 8.273480662983426, + "grad_norm": 0.1292782723903656, + "learning_rate": 7.61832892026113e-06, + "loss": 1.6823, + "step": 26955 + }, + { + "epoch": 8.273787599754451, + "grad_norm": 0.15123988687992096, + "learning_rate": 7.615691838577333e-06, + "loss": 1.6807, + "step": 26956 + }, + { + "epoch": 8.274094536525476, + "grad_norm": 0.14225423336029053, + "learning_rate": 7.6130551757601084e-06, + "loss": 1.6616, + "step": 26957 + }, + { + "epoch": 8.274401473296502, + "grad_norm": 0.15328221023082733, + "learning_rate": 7.610418931835517e-06, + "loss": 1.7211, + "step": 26958 + }, + { + "epoch": 8.274708410067525, + "grad_norm": 0.168446883559227, + "learning_rate": 7.6077831068296134e-06, + "loss": 1.7211, + "step": 26959 + }, + { + "epoch": 8.27501534683855, + "grad_norm": 0.1877220869064331, + "learning_rate": 7.6051477007684444e-06, + "loss": 1.7139, + "step": 26960 + }, + { + "epoch": 8.275322283609576, + "grad_norm": 0.14273744821548462, + "learning_rate": 7.602512713678039e-06, + "loss": 1.6996, + "step": 26961 + }, + { + "epoch": 8.275629220380601, + "grad_norm": 0.1611991822719574, + "learning_rate": 7.599878145584477e-06, + "loss": 1.6837, + "step": 26962 + }, + { + "epoch": 8.275936157151627, + "grad_norm": 0.13847516477108002, + "learning_rate": 7.597243996513747e-06, + "loss": 1.6449, + "step": 26963 + }, + { + "epoch": 8.276243093922652, + "grad_norm": 0.16816900670528412, + "learning_rate": 7.59461026649193e-06, + "loss": 1.747, + 
"step": 26964 + }, + { + "epoch": 8.276550030693677, + "grad_norm": 0.15942460298538208, + "learning_rate": 7.5919769555450046e-06, + "loss": 1.7461, + "step": 26965 + }, + { + "epoch": 8.276856967464703, + "grad_norm": 0.16706149280071259, + "learning_rate": 7.589344063699033e-06, + "loss": 1.7136, + "step": 26966 + }, + { + "epoch": 8.277163904235728, + "grad_norm": 0.16727334260940552, + "learning_rate": 7.586711590980028e-06, + "loss": 1.7186, + "step": 26967 + }, + { + "epoch": 8.277470841006753, + "grad_norm": 0.1510261744260788, + "learning_rate": 7.5840795374139795e-06, + "loss": 1.6795, + "step": 26968 + }, + { + "epoch": 8.277777777777779, + "grad_norm": 0.1705521196126938, + "learning_rate": 7.581447903026939e-06, + "loss": 1.6903, + "step": 26969 + }, + { + "epoch": 8.278084714548802, + "grad_norm": 0.15767472982406616, + "learning_rate": 7.57881668784487e-06, + "loss": 1.7264, + "step": 26970 + }, + { + "epoch": 8.278391651319827, + "grad_norm": 0.15771441161632538, + "learning_rate": 7.576185891893805e-06, + "loss": 1.7091, + "step": 26971 + }, + { + "epoch": 8.278698588090853, + "grad_norm": 0.22973434627056122, + "learning_rate": 7.5735555151997425e-06, + "loss": 1.7357, + "step": 26972 + }, + { + "epoch": 8.279005524861878, + "grad_norm": 0.15931910276412964, + "learning_rate": 7.570925557788672e-06, + "loss": 1.7026, + "step": 26973 + }, + { + "epoch": 8.279312461632903, + "grad_norm": 0.1451634019613266, + "learning_rate": 7.568296019686583e-06, + "loss": 1.6824, + "step": 26974 + }, + { + "epoch": 8.279619398403929, + "grad_norm": 0.14617015421390533, + "learning_rate": 7.56566690091946e-06, + "loss": 1.677, + "step": 26975 + }, + { + "epoch": 8.279926335174954, + "grad_norm": 0.14465895295143127, + "learning_rate": 7.5630382015132895e-06, + "loss": 1.7193, + "step": 26976 + }, + { + "epoch": 8.28023327194598, + "grad_norm": 0.1751926839351654, + "learning_rate": 7.560409921494044e-06, + "loss": 1.7366, + "step": 26977 + }, + { + "epoch": 
8.280540208717005, + "grad_norm": 0.1478777974843979, + "learning_rate": 7.557782060887697e-06, + "loss": 1.6948, + "step": 26978 + }, + { + "epoch": 8.28084714548803, + "grad_norm": 0.25690537691116333, + "learning_rate": 7.555154619720245e-06, + "loss": 1.7284, + "step": 26979 + }, + { + "epoch": 8.281154082259055, + "grad_norm": 0.1380864977836609, + "learning_rate": 7.552527598017611e-06, + "loss": 1.6753, + "step": 26980 + }, + { + "epoch": 8.281461019030079, + "grad_norm": 0.21658651530742645, + "learning_rate": 7.5499009958057975e-06, + "loss": 1.8076, + "step": 26981 + }, + { + "epoch": 8.281767955801104, + "grad_norm": 0.16225802898406982, + "learning_rate": 7.547274813110727e-06, + "loss": 1.6716, + "step": 26982 + }, + { + "epoch": 8.28207489257213, + "grad_norm": 0.18264736235141754, + "learning_rate": 7.544649049958375e-06, + "loss": 1.7241, + "step": 26983 + }, + { + "epoch": 8.282381829343155, + "grad_norm": 0.17512252926826477, + "learning_rate": 7.542023706374695e-06, + "loss": 1.6709, + "step": 26984 + }, + { + "epoch": 8.28268876611418, + "grad_norm": 0.16799452900886536, + "learning_rate": 7.5393987823856035e-06, + "loss": 1.7333, + "step": 26985 + }, + { + "epoch": 8.282995702885206, + "grad_norm": 0.1569952517747879, + "learning_rate": 7.5367742780170835e-06, + "loss": 1.6701, + "step": 26986 + }, + { + "epoch": 8.283302639656231, + "grad_norm": 0.17452387511730194, + "learning_rate": 7.534150193295026e-06, + "loss": 1.6843, + "step": 26987 + }, + { + "epoch": 8.283609576427256, + "grad_norm": 0.1564214676618576, + "learning_rate": 7.531526528245392e-06, + "loss": 1.7154, + "step": 26988 + }, + { + "epoch": 8.283916513198282, + "grad_norm": 0.14093104004859924, + "learning_rate": 7.528903282894107e-06, + "loss": 1.6448, + "step": 26989 + }, + { + "epoch": 8.284223449969307, + "grad_norm": 0.2950015664100647, + "learning_rate": 7.526280457267093e-06, + "loss": 1.7657, + "step": 26990 + }, + { + "epoch": 8.284530386740332, + "grad_norm": 
0.1342417150735855, + "learning_rate": 7.5236580513902756e-06, + "loss": 1.6761, + "step": 26991 + }, + { + "epoch": 8.284837323511356, + "grad_norm": 0.16559085249900818, + "learning_rate": 7.52103606528956e-06, + "loss": 1.7029, + "step": 26992 + }, + { + "epoch": 8.285144260282381, + "grad_norm": 0.14937730133533478, + "learning_rate": 7.5184144989908665e-06, + "loss": 1.6848, + "step": 26993 + }, + { + "epoch": 8.285451197053407, + "grad_norm": 0.14847339689731598, + "learning_rate": 7.515793352520095e-06, + "loss": 1.6735, + "step": 26994 + }, + { + "epoch": 8.285758133824432, + "grad_norm": 0.1866399198770523, + "learning_rate": 7.513172625903148e-06, + "loss": 1.6553, + "step": 26995 + }, + { + "epoch": 8.286065070595457, + "grad_norm": 0.15781863033771515, + "learning_rate": 7.510552319165953e-06, + "loss": 1.699, + "step": 26996 + }, + { + "epoch": 8.286372007366483, + "grad_norm": 0.1402381956577301, + "learning_rate": 7.507932432334358e-06, + "loss": 1.6778, + "step": 26997 + }, + { + "epoch": 8.286678944137508, + "grad_norm": 0.16515657305717468, + "learning_rate": 7.505312965434308e-06, + "loss": 1.6834, + "step": 26998 + }, + { + "epoch": 8.286985880908533, + "grad_norm": 0.16752316057682037, + "learning_rate": 7.502693918491638e-06, + "loss": 1.7714, + "step": 26999 + }, + { + "epoch": 8.287292817679559, + "grad_norm": 0.17935164272785187, + "learning_rate": 7.500075291532266e-06, + "loss": 1.6858, + "step": 27000 + }, + { + "epoch": 8.287599754450584, + "grad_norm": 0.1805913746356964, + "learning_rate": 7.497457084582065e-06, + "loss": 1.7451, + "step": 27001 + }, + { + "epoch": 8.287906691221608, + "grad_norm": 0.15834343433380127, + "learning_rate": 7.494839297666889e-06, + "loss": 1.6675, + "step": 27002 + }, + { + "epoch": 8.288213627992633, + "grad_norm": 0.18627049028873444, + "learning_rate": 7.492221930812648e-06, + "loss": 1.7207, + "step": 27003 + }, + { + "epoch": 8.288520564763658, + "grad_norm": 0.15027324855327606, + "learning_rate": 
7.489604984045157e-06, + "loss": 1.686, + "step": 27004 + }, + { + "epoch": 8.288827501534684, + "grad_norm": 0.14771342277526855, + "learning_rate": 7.48698845739032e-06, + "loss": 1.6647, + "step": 27005 + }, + { + "epoch": 8.289134438305709, + "grad_norm": 0.14141151309013367, + "learning_rate": 7.48437235087398e-06, + "loss": 1.7005, + "step": 27006 + }, + { + "epoch": 8.289441375076734, + "grad_norm": 0.14843317866325378, + "learning_rate": 7.481756664521994e-06, + "loss": 1.6768, + "step": 27007 + }, + { + "epoch": 8.28974831184776, + "grad_norm": 0.21505968272686005, + "learning_rate": 7.479141398360206e-06, + "loss": 1.764, + "step": 27008 + }, + { + "epoch": 8.290055248618785, + "grad_norm": 0.1906919926404953, + "learning_rate": 7.476526552414464e-06, + "loss": 1.7079, + "step": 27009 + }, + { + "epoch": 8.29036218538981, + "grad_norm": 0.15975503623485565, + "learning_rate": 7.473912126710614e-06, + "loss": 1.7035, + "step": 27010 + }, + { + "epoch": 8.290669122160836, + "grad_norm": 0.16221746802330017, + "learning_rate": 7.471298121274489e-06, + "loss": 1.6707, + "step": 27011 + }, + { + "epoch": 8.29097605893186, + "grad_norm": 0.17168673872947693, + "learning_rate": 7.468684536131909e-06, + "loss": 1.7119, + "step": 27012 + }, + { + "epoch": 8.291282995702884, + "grad_norm": 0.15114913880825043, + "learning_rate": 7.466071371308742e-06, + "loss": 1.6867, + "step": 27013 + }, + { + "epoch": 8.29158993247391, + "grad_norm": 0.20300740003585815, + "learning_rate": 7.463458626830766e-06, + "loss": 1.7578, + "step": 27014 + }, + { + "epoch": 8.291896869244935, + "grad_norm": 0.1570715457201004, + "learning_rate": 7.460846302723845e-06, + "loss": 1.6588, + "step": 27015 + }, + { + "epoch": 8.29220380601596, + "grad_norm": 0.21273213624954224, + "learning_rate": 7.458234399013747e-06, + "loss": 1.7467, + "step": 27016 + }, + { + "epoch": 8.292510742786986, + "grad_norm": 0.16550743579864502, + "learning_rate": 7.455622915726324e-06, + "loss": 1.699, + 
"step": 27017 + }, + { + "epoch": 8.292817679558011, + "grad_norm": 0.20360049605369568, + "learning_rate": 7.453011852887387e-06, + "loss": 1.7572, + "step": 27018 + }, + { + "epoch": 8.293124616329036, + "grad_norm": 0.2043008953332901, + "learning_rate": 7.4504012105227004e-06, + "loss": 1.7181, + "step": 27019 + }, + { + "epoch": 8.293431553100062, + "grad_norm": 0.18581026792526245, + "learning_rate": 7.44779098865811e-06, + "loss": 1.742, + "step": 27020 + }, + { + "epoch": 8.293738489871087, + "grad_norm": 0.18011118471622467, + "learning_rate": 7.445181187319367e-06, + "loss": 1.7329, + "step": 27021 + }, + { + "epoch": 8.294045426642112, + "grad_norm": 0.18868795037269592, + "learning_rate": 7.442571806532295e-06, + "loss": 1.7289, + "step": 27022 + }, + { + "epoch": 8.294352363413138, + "grad_norm": 0.15835118293762207, + "learning_rate": 7.439962846322673e-06, + "loss": 1.6878, + "step": 27023 + }, + { + "epoch": 8.294659300184161, + "grad_norm": 0.23331916332244873, + "learning_rate": 7.437354306716282e-06, + "loss": 1.7144, + "step": 27024 + }, + { + "epoch": 8.294966236955187, + "grad_norm": 0.18101559579372406, + "learning_rate": 7.434746187738906e-06, + "loss": 1.7452, + "step": 27025 + }, + { + "epoch": 8.295273173726212, + "grad_norm": 0.16906292736530304, + "learning_rate": 7.432138489416318e-06, + "loss": 1.6772, + "step": 27026 + }, + { + "epoch": 8.295580110497237, + "grad_norm": 0.20603033900260925, + "learning_rate": 7.429531211774282e-06, + "loss": 1.7622, + "step": 27027 + }, + { + "epoch": 8.295887047268263, + "grad_norm": 0.19412389397621155, + "learning_rate": 7.426924354838571e-06, + "loss": 1.6973, + "step": 27028 + }, + { + "epoch": 8.296193984039288, + "grad_norm": 0.1702510118484497, + "learning_rate": 7.424317918634938e-06, + "loss": 1.7119, + "step": 27029 + }, + { + "epoch": 8.296500920810313, + "grad_norm": 0.1476033478975296, + "learning_rate": 7.421711903189171e-06, + "loss": 1.6961, + "step": 27030 + }, + { + "epoch": 
8.296807857581339, + "grad_norm": 0.16404536366462708, + "learning_rate": 7.419106308526979e-06, + "loss": 1.6928, + "step": 27031 + }, + { + "epoch": 8.297114794352364, + "grad_norm": 0.15021127462387085, + "learning_rate": 7.416501134674159e-06, + "loss": 1.642, + "step": 27032 + }, + { + "epoch": 8.29742173112339, + "grad_norm": 0.20728830993175507, + "learning_rate": 7.4138963816564266e-06, + "loss": 1.7142, + "step": 27033 + }, + { + "epoch": 8.297728667894415, + "grad_norm": 0.16802074015140533, + "learning_rate": 7.411292049499513e-06, + "loss": 1.6983, + "step": 27034 + }, + { + "epoch": 8.298035604665438, + "grad_norm": 0.15957842767238617, + "learning_rate": 7.408688138229198e-06, + "loss": 1.6535, + "step": 27035 + }, + { + "epoch": 8.298342541436464, + "grad_norm": 0.17618007957935333, + "learning_rate": 7.40608464787117e-06, + "loss": 1.7024, + "step": 27036 + }, + { + "epoch": 8.298649478207489, + "grad_norm": 0.14615842700004578, + "learning_rate": 7.4034815784511994e-06, + "loss": 1.7188, + "step": 27037 + }, + { + "epoch": 8.298956414978514, + "grad_norm": 0.16748850047588348, + "learning_rate": 7.40087892999497e-06, + "loss": 1.6763, + "step": 27038 + }, + { + "epoch": 8.29926335174954, + "grad_norm": 0.15271888673305511, + "learning_rate": 7.398276702528229e-06, + "loss": 1.6766, + "step": 27039 + }, + { + "epoch": 8.299570288520565, + "grad_norm": 0.21336700022220612, + "learning_rate": 7.395674896076693e-06, + "loss": 1.7113, + "step": 27040 + }, + { + "epoch": 8.29987722529159, + "grad_norm": 0.15377891063690186, + "learning_rate": 7.3930735106660655e-06, + "loss": 1.7083, + "step": 27041 + }, + { + "epoch": 8.300184162062616, + "grad_norm": 0.1341678500175476, + "learning_rate": 7.390472546322058e-06, + "loss": 1.6411, + "step": 27042 + }, + { + "epoch": 8.300491098833641, + "grad_norm": 0.1506323516368866, + "learning_rate": 7.3878720030703785e-06, + "loss": 1.6784, + "step": 27043 + }, + { + "epoch": 8.300798035604666, + "grad_norm": 
0.20630323886871338, + "learning_rate": 7.385271880936723e-06, + "loss": 1.7296, + "step": 27044 + }, + { + "epoch": 8.30110497237569, + "grad_norm": 0.1514928787946701, + "learning_rate": 7.382672179946787e-06, + "loss": 1.631, + "step": 27045 + }, + { + "epoch": 8.301411909146715, + "grad_norm": 0.21939171850681305, + "learning_rate": 7.3800729001262505e-06, + "loss": 1.7484, + "step": 27046 + }, + { + "epoch": 8.30171884591774, + "grad_norm": 0.13756778836250305, + "learning_rate": 7.377474041500837e-06, + "loss": 1.71, + "step": 27047 + }, + { + "epoch": 8.302025782688766, + "grad_norm": 0.23617541790008545, + "learning_rate": 7.374875604096188e-06, + "loss": 1.7366, + "step": 27048 + }, + { + "epoch": 8.302332719459791, + "grad_norm": 0.236005499958992, + "learning_rate": 7.37227758793802e-06, + "loss": 1.7263, + "step": 27049 + }, + { + "epoch": 8.302639656230816, + "grad_norm": 0.28162217140197754, + "learning_rate": 7.369679993051981e-06, + "loss": 1.7159, + "step": 27050 + }, + { + "epoch": 8.302946593001842, + "grad_norm": 0.18274159729480743, + "learning_rate": 7.3670828194637385e-06, + "loss": 1.695, + "step": 27051 + }, + { + "epoch": 8.303253529772867, + "grad_norm": 0.14628291130065918, + "learning_rate": 7.364486067198994e-06, + "loss": 1.712, + "step": 27052 + }, + { + "epoch": 8.303560466543892, + "grad_norm": 0.16443926095962524, + "learning_rate": 7.361889736283362e-06, + "loss": 1.7003, + "step": 27053 + }, + { + "epoch": 8.303867403314918, + "grad_norm": 0.24396912753582, + "learning_rate": 7.3592938267425525e-06, + "loss": 1.7882, + "step": 27054 + }, + { + "epoch": 8.304174340085943, + "grad_norm": 0.16564849019050598, + "learning_rate": 7.356698338602169e-06, + "loss": 1.7095, + "step": 27055 + }, + { + "epoch": 8.304481276856967, + "grad_norm": 0.17034487426280975, + "learning_rate": 7.3541032718879024e-06, + "loss": 1.7198, + "step": 27056 + }, + { + "epoch": 8.304788213627992, + "grad_norm": 0.15630117058753967, + "learning_rate": 
7.351508626625381e-06, + "loss": 1.6642, + "step": 27057 + }, + { + "epoch": 8.305095150399017, + "grad_norm": 0.17507393658161163, + "learning_rate": 7.348914402840246e-06, + "loss": 1.7295, + "step": 27058 + }, + { + "epoch": 8.305402087170043, + "grad_norm": 0.13145345449447632, + "learning_rate": 7.346320600558138e-06, + "loss": 1.6654, + "step": 27059 + }, + { + "epoch": 8.305709023941068, + "grad_norm": 0.17676126956939697, + "learning_rate": 7.343727219804692e-06, + "loss": 1.7347, + "step": 27060 + }, + { + "epoch": 8.306015960712093, + "grad_norm": 0.16341568529605865, + "learning_rate": 7.341134260605536e-06, + "loss": 1.6905, + "step": 27061 + }, + { + "epoch": 8.306322897483119, + "grad_norm": 0.18549038469791412, + "learning_rate": 7.338541722986292e-06, + "loss": 1.7508, + "step": 27062 + }, + { + "epoch": 8.306629834254144, + "grad_norm": 0.15528292953968048, + "learning_rate": 7.335949606972575e-06, + "loss": 1.7261, + "step": 27063 + }, + { + "epoch": 8.30693677102517, + "grad_norm": 0.14363928139209747, + "learning_rate": 7.333357912590028e-06, + "loss": 1.6494, + "step": 27064 + }, + { + "epoch": 8.307243707796195, + "grad_norm": 0.33007505536079407, + "learning_rate": 7.3307666398642285e-06, + "loss": 1.7844, + "step": 27065 + }, + { + "epoch": 8.307550644567218, + "grad_norm": 0.18550951778888702, + "learning_rate": 7.328175788820818e-06, + "loss": 1.7699, + "step": 27066 + }, + { + "epoch": 8.307857581338244, + "grad_norm": 0.1789010763168335, + "learning_rate": 7.325585359485382e-06, + "loss": 1.6903, + "step": 27067 + }, + { + "epoch": 8.308164518109269, + "grad_norm": 0.17079691588878632, + "learning_rate": 7.322995351883505e-06, + "loss": 1.6704, + "step": 27068 + }, + { + "epoch": 8.308471454880294, + "grad_norm": 0.17510086297988892, + "learning_rate": 7.320405766040828e-06, + "loss": 1.7222, + "step": 27069 + }, + { + "epoch": 8.30877839165132, + "grad_norm": 0.1619461178779602, + "learning_rate": 7.317816601982896e-06, + "loss": 
1.6573, + "step": 27070 + }, + { + "epoch": 8.309085328422345, + "grad_norm": 0.15886032581329346, + "learning_rate": 7.315227859735335e-06, + "loss": 1.7281, + "step": 27071 + }, + { + "epoch": 8.30939226519337, + "grad_norm": 0.1636921614408493, + "learning_rate": 7.31263953932369e-06, + "loss": 1.7061, + "step": 27072 + }, + { + "epoch": 8.309699201964396, + "grad_norm": 0.16119423508644104, + "learning_rate": 7.3100516407735745e-06, + "loss": 1.7102, + "step": 27073 + }, + { + "epoch": 8.310006138735421, + "grad_norm": 0.2373964637517929, + "learning_rate": 7.3074641641105445e-06, + "loss": 1.7585, + "step": 27074 + }, + { + "epoch": 8.310313075506446, + "grad_norm": 0.17123030126094818, + "learning_rate": 7.304877109360181e-06, + "loss": 1.737, + "step": 27075 + }, + { + "epoch": 8.310620012277472, + "grad_norm": 0.14955085515975952, + "learning_rate": 7.302290476548046e-06, + "loss": 1.6676, + "step": 27076 + }, + { + "epoch": 8.310926949048495, + "grad_norm": 0.19933636486530304, + "learning_rate": 7.299704265699703e-06, + "loss": 1.6926, + "step": 27077 + }, + { + "epoch": 8.31123388581952, + "grad_norm": 0.15449854731559753, + "learning_rate": 7.297118476840709e-06, + "loss": 1.6826, + "step": 27078 + }, + { + "epoch": 8.311540822590546, + "grad_norm": 0.16641317307949066, + "learning_rate": 7.294533109996621e-06, + "loss": 1.7117, + "step": 27079 + }, + { + "epoch": 8.311847759361571, + "grad_norm": 0.18311664462089539, + "learning_rate": 7.291948165192974e-06, + "loss": 1.7376, + "step": 27080 + }, + { + "epoch": 8.312154696132596, + "grad_norm": 0.17437715828418732, + "learning_rate": 7.289363642455349e-06, + "loss": 1.7373, + "step": 27081 + }, + { + "epoch": 8.312461632903622, + "grad_norm": 0.16356121003627777, + "learning_rate": 7.286779541809241e-06, + "loss": 1.6847, + "step": 27082 + }, + { + "epoch": 8.312768569674647, + "grad_norm": 0.182320237159729, + "learning_rate": 7.284195863280241e-06, + "loss": 1.6853, + "step": 27083 + }, + { + 
"epoch": 8.313075506445673, + "grad_norm": 0.1541421264410019, + "learning_rate": 7.281612606893839e-06, + "loss": 1.7121, + "step": 27084 + }, + { + "epoch": 8.313382443216698, + "grad_norm": 0.16640879213809967, + "learning_rate": 7.2790297726755716e-06, + "loss": 1.6914, + "step": 27085 + }, + { + "epoch": 8.313689379987723, + "grad_norm": 0.18245746195316315, + "learning_rate": 7.27644736065099e-06, + "loss": 1.7544, + "step": 27086 + }, + { + "epoch": 8.313996316758749, + "grad_norm": 0.13833735883235931, + "learning_rate": 7.273865370845573e-06, + "loss": 1.6519, + "step": 27087 + }, + { + "epoch": 8.314303253529772, + "grad_norm": 0.19455993175506592, + "learning_rate": 7.271283803284889e-06, + "loss": 1.7017, + "step": 27088 + }, + { + "epoch": 8.314610190300797, + "grad_norm": 0.16859467327594757, + "learning_rate": 7.268702657994397e-06, + "loss": 1.7173, + "step": 27089 + }, + { + "epoch": 8.314917127071823, + "grad_norm": 0.1667163074016571, + "learning_rate": 7.266121934999642e-06, + "loss": 1.731, + "step": 27090 + }, + { + "epoch": 8.315224063842848, + "grad_norm": 0.161153182387352, + "learning_rate": 7.263541634326115e-06, + "loss": 1.7223, + "step": 27091 + }, + { + "epoch": 8.315531000613873, + "grad_norm": 0.17027638852596283, + "learning_rate": 7.2609617559993234e-06, + "loss": 1.6741, + "step": 27092 + }, + { + "epoch": 8.315837937384899, + "grad_norm": 0.1516280472278595, + "learning_rate": 7.2583823000447526e-06, + "loss": 1.6974, + "step": 27093 + }, + { + "epoch": 8.316144874155924, + "grad_norm": 0.18429140746593475, + "learning_rate": 7.2558032664879035e-06, + "loss": 1.7003, + "step": 27094 + }, + { + "epoch": 8.31645181092695, + "grad_norm": 0.13946834206581116, + "learning_rate": 7.253224655354257e-06, + "loss": 1.7349, + "step": 27095 + }, + { + "epoch": 8.316758747697975, + "grad_norm": 0.17642852663993835, + "learning_rate": 7.250646466669303e-06, + "loss": 1.7131, + "step": 27096 + }, + { + "epoch": 8.317065684469, + "grad_norm": 
0.1700926125049591, + "learning_rate": 7.2480687004585155e-06, + "loss": 1.7496, + "step": 27097 + }, + { + "epoch": 8.317372621240025, + "grad_norm": 0.19472727179527283, + "learning_rate": 7.245491356747369e-06, + "loss": 1.73, + "step": 27098 + }, + { + "epoch": 8.317679558011049, + "grad_norm": 0.16857488453388214, + "learning_rate": 7.242914435561327e-06, + "loss": 1.7275, + "step": 27099 + }, + { + "epoch": 8.317986494782074, + "grad_norm": 0.18735560774803162, + "learning_rate": 7.240337936925884e-06, + "loss": 1.7236, + "step": 27100 + }, + { + "epoch": 8.3182934315531, + "grad_norm": 0.2252741903066635, + "learning_rate": 7.237761860866476e-06, + "loss": 1.7347, + "step": 27101 + }, + { + "epoch": 8.318600368324125, + "grad_norm": 0.16848546266555786, + "learning_rate": 7.2351862074085674e-06, + "loss": 1.6956, + "step": 27102 + }, + { + "epoch": 8.31890730509515, + "grad_norm": 0.13781076669692993, + "learning_rate": 7.232610976577614e-06, + "loss": 1.7018, + "step": 27103 + }, + { + "epoch": 8.319214241866176, + "grad_norm": 0.13122199475765228, + "learning_rate": 7.230036168399052e-06, + "loss": 1.652, + "step": 27104 + }, + { + "epoch": 8.319521178637201, + "grad_norm": 0.16110749542713165, + "learning_rate": 7.22746178289837e-06, + "loss": 1.6778, + "step": 27105 + }, + { + "epoch": 8.319828115408226, + "grad_norm": 0.19378480315208435, + "learning_rate": 7.224887820100951e-06, + "loss": 1.7753, + "step": 27106 + }, + { + "epoch": 8.320135052179252, + "grad_norm": 0.18464957177639008, + "learning_rate": 7.2223142800322775e-06, + "loss": 1.7455, + "step": 27107 + }, + { + "epoch": 8.320441988950277, + "grad_norm": 0.16992080211639404, + "learning_rate": 7.2197411627177636e-06, + "loss": 1.731, + "step": 27108 + }, + { + "epoch": 8.3207489257213, + "grad_norm": 0.16602276265621185, + "learning_rate": 7.2171684681828444e-06, + "loss": 1.7236, + "step": 27109 + }, + { + "epoch": 8.321055862492326, + "grad_norm": 0.16713769733905792, + "learning_rate": 
7.214596196452944e-06, + "loss": 1.6636, + "step": 27110 + }, + { + "epoch": 8.321362799263351, + "grad_norm": 0.14015473425388336, + "learning_rate": 7.212024347553475e-06, + "loss": 1.6785, + "step": 27111 + }, + { + "epoch": 8.321669736034377, + "grad_norm": 0.25452539324760437, + "learning_rate": 7.209452921509868e-06, + "loss": 1.7434, + "step": 27112 + }, + { + "epoch": 8.321976672805402, + "grad_norm": 0.14998821914196014, + "learning_rate": 7.206881918347524e-06, + "loss": 1.6973, + "step": 27113 + }, + { + "epoch": 8.322283609576427, + "grad_norm": 0.16751673817634583, + "learning_rate": 7.2043113380918515e-06, + "loss": 1.7364, + "step": 27114 + }, + { + "epoch": 8.322590546347453, + "grad_norm": 0.14287763833999634, + "learning_rate": 7.201741180768262e-06, + "loss": 1.6576, + "step": 27115 + }, + { + "epoch": 8.322897483118478, + "grad_norm": 0.14396314322948456, + "learning_rate": 7.199171446402136e-06, + "loss": 1.6541, + "step": 27116 + }, + { + "epoch": 8.323204419889503, + "grad_norm": 0.1835038661956787, + "learning_rate": 7.196602135018915e-06, + "loss": 1.6925, + "step": 27117 + }, + { + "epoch": 8.323511356660529, + "grad_norm": 0.15047648549079895, + "learning_rate": 7.194033246643939e-06, + "loss": 1.7234, + "step": 27118 + }, + { + "epoch": 8.323818293431554, + "grad_norm": 0.1479605883359909, + "learning_rate": 7.19146478130262e-06, + "loss": 1.6702, + "step": 27119 + }, + { + "epoch": 8.324125230202577, + "grad_norm": 0.15971851348876953, + "learning_rate": 7.188896739020335e-06, + "loss": 1.7189, + "step": 27120 + }, + { + "epoch": 8.324432166973603, + "grad_norm": 0.1598353087902069, + "learning_rate": 7.186329119822455e-06, + "loss": 1.7015, + "step": 27121 + }, + { + "epoch": 8.324739103744628, + "grad_norm": 0.18845009803771973, + "learning_rate": 7.183761923734389e-06, + "loss": 1.6771, + "step": 27122 + }, + { + "epoch": 8.325046040515653, + "grad_norm": 0.15288181602954865, + "learning_rate": 7.181195150781456e-06, + "loss": 1.69, 
+ "step": 27123 + }, + { + "epoch": 8.325352977286679, + "grad_norm": 0.16455978155136108, + "learning_rate": 7.178628800989073e-06, + "loss": 1.74, + "step": 27124 + }, + { + "epoch": 8.325659914057704, + "grad_norm": 0.23335149884223938, + "learning_rate": 7.176062874382561e-06, + "loss": 1.7591, + "step": 27125 + }, + { + "epoch": 8.32596685082873, + "grad_norm": 0.16988953948020935, + "learning_rate": 7.173497370987303e-06, + "loss": 1.744, + "step": 27126 + }, + { + "epoch": 8.326273787599755, + "grad_norm": 0.16113093495368958, + "learning_rate": 7.170932290828647e-06, + "loss": 1.6717, + "step": 27127 + }, + { + "epoch": 8.32658072437078, + "grad_norm": 0.16654139757156372, + "learning_rate": 7.168367633931938e-06, + "loss": 1.6797, + "step": 27128 + }, + { + "epoch": 8.326887661141805, + "grad_norm": 0.16671477258205414, + "learning_rate": 7.165803400322524e-06, + "loss": 1.7299, + "step": 27129 + }, + { + "epoch": 8.32719459791283, + "grad_norm": 0.18269041180610657, + "learning_rate": 7.16323959002575e-06, + "loss": 1.7371, + "step": 27130 + }, + { + "epoch": 8.327501534683854, + "grad_norm": 0.17919829487800598, + "learning_rate": 7.160676203066946e-06, + "loss": 1.7158, + "step": 27131 + }, + { + "epoch": 8.32780847145488, + "grad_norm": 0.17928342521190643, + "learning_rate": 7.158113239471453e-06, + "loss": 1.6964, + "step": 27132 + }, + { + "epoch": 8.328115408225905, + "grad_norm": 0.19797661900520325, + "learning_rate": 7.155550699264585e-06, + "loss": 1.7244, + "step": 27133 + }, + { + "epoch": 8.32842234499693, + "grad_norm": 0.15853050351142883, + "learning_rate": 7.1529885824716926e-06, + "loss": 1.6674, + "step": 27134 + }, + { + "epoch": 8.328729281767956, + "grad_norm": 0.20006918907165527, + "learning_rate": 7.150426889118078e-06, + "loss": 1.7601, + "step": 27135 + }, + { + "epoch": 8.329036218538981, + "grad_norm": 0.18851491808891296, + "learning_rate": 7.147865619229055e-06, + "loss": 1.7139, + "step": 27136 + }, + { + "epoch": 
8.329343155310006, + "grad_norm": 0.2384614497423172, + "learning_rate": 7.145304772829936e-06, + "loss": 1.7343, + "step": 27137 + }, + { + "epoch": 8.329650092081032, + "grad_norm": 0.15243887901306152, + "learning_rate": 7.142744349946029e-06, + "loss": 1.7071, + "step": 27138 + }, + { + "epoch": 8.329957028852057, + "grad_norm": 0.20257025957107544, + "learning_rate": 7.140184350602663e-06, + "loss": 1.7255, + "step": 27139 + }, + { + "epoch": 8.330263965623082, + "grad_norm": 0.18863585591316223, + "learning_rate": 7.137624774825091e-06, + "loss": 1.6798, + "step": 27140 + }, + { + "epoch": 8.330570902394108, + "grad_norm": 0.19403952360153198, + "learning_rate": 7.135065622638659e-06, + "loss": 1.7354, + "step": 27141 + }, + { + "epoch": 8.330877839165131, + "grad_norm": 0.17294439673423767, + "learning_rate": 7.132506894068608e-06, + "loss": 1.6935, + "step": 27142 + }, + { + "epoch": 8.331184775936157, + "grad_norm": 0.20410899817943573, + "learning_rate": 7.129948589140262e-06, + "loss": 1.7625, + "step": 27143 + }, + { + "epoch": 8.331491712707182, + "grad_norm": 0.1795405000448227, + "learning_rate": 7.127390707878889e-06, + "loss": 1.6756, + "step": 27144 + }, + { + "epoch": 8.331798649478207, + "grad_norm": 0.1823110431432724, + "learning_rate": 7.12483325030977e-06, + "loss": 1.6844, + "step": 27145 + }, + { + "epoch": 8.332105586249233, + "grad_norm": 0.18655838072299957, + "learning_rate": 7.122276216458179e-06, + "loss": 1.7289, + "step": 27146 + }, + { + "epoch": 8.332412523020258, + "grad_norm": 0.16892722249031067, + "learning_rate": 7.119719606349384e-06, + "loss": 1.7003, + "step": 27147 + }, + { + "epoch": 8.332719459791283, + "grad_norm": 0.17768113315105438, + "learning_rate": 7.117163420008654e-06, + "loss": 1.6859, + "step": 27148 + }, + { + "epoch": 8.333026396562309, + "grad_norm": 0.14221824705600739, + "learning_rate": 7.114607657461253e-06, + "loss": 1.6752, + "step": 27149 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 
0.17095401883125305, + "learning_rate": 7.112052318732421e-06, + "loss": 1.7354, + "step": 27150 + }, + { + "epoch": 8.33364027010436, + "grad_norm": 0.1910656839609146, + "learning_rate": 7.109497403847448e-06, + "loss": 1.7124, + "step": 27151 + }, + { + "epoch": 8.333947206875383, + "grad_norm": 0.1857171505689621, + "learning_rate": 7.106942912831549e-06, + "loss": 1.7716, + "step": 27152 + }, + { + "epoch": 8.334254143646408, + "grad_norm": 0.16951163113117218, + "learning_rate": 7.104388845709981e-06, + "loss": 1.7508, + "step": 27153 + }, + { + "epoch": 8.334561080417433, + "grad_norm": 0.18096883594989777, + "learning_rate": 7.101835202507983e-06, + "loss": 1.7064, + "step": 27154 + }, + { + "epoch": 8.334868017188459, + "grad_norm": 0.19499589502811432, + "learning_rate": 7.099281983250783e-06, + "loss": 1.712, + "step": 27155 + }, + { + "epoch": 8.335174953959484, + "grad_norm": 0.23200182616710663, + "learning_rate": 7.096729187963647e-06, + "loss": 1.8253, + "step": 27156 + }, + { + "epoch": 8.33548189073051, + "grad_norm": 0.3447387218475342, + "learning_rate": 7.094176816671755e-06, + "loss": 1.7531, + "step": 27157 + }, + { + "epoch": 8.335788827501535, + "grad_norm": 0.14633947610855103, + "learning_rate": 7.091624869400376e-06, + "loss": 1.6866, + "step": 27158 + }, + { + "epoch": 8.33609576427256, + "grad_norm": 0.19512905180454254, + "learning_rate": 7.0890733461746905e-06, + "loss": 1.6853, + "step": 27159 + }, + { + "epoch": 8.336402701043585, + "grad_norm": 0.20525458455085754, + "learning_rate": 7.086522247019944e-06, + "loss": 1.69, + "step": 27160 + }, + { + "epoch": 8.33670963781461, + "grad_norm": 0.15972889959812164, + "learning_rate": 7.08397157196134e-06, + "loss": 1.6949, + "step": 27161 + }, + { + "epoch": 8.337016574585636, + "grad_norm": 0.18894724547863007, + "learning_rate": 7.081421321024079e-06, + "loss": 1.7254, + "step": 27162 + }, + { + "epoch": 8.33732351135666, + "grad_norm": 0.17392434179782867, + "learning_rate": 
7.078871494233364e-06, + "loss": 1.7449, + "step": 27163 + }, + { + "epoch": 8.337630448127685, + "grad_norm": 0.16262824833393097, + "learning_rate": 7.076322091614401e-06, + "loss": 1.734, + "step": 27164 + }, + { + "epoch": 8.33793738489871, + "grad_norm": 0.1960107982158661, + "learning_rate": 7.073773113192383e-06, + "loss": 1.6464, + "step": 27165 + }, + { + "epoch": 8.338244321669736, + "grad_norm": 0.1750497817993164, + "learning_rate": 7.071224558992501e-06, + "loss": 1.7187, + "step": 27166 + }, + { + "epoch": 8.338551258440761, + "grad_norm": 0.2179764360189438, + "learning_rate": 7.068676429039928e-06, + "loss": 1.7207, + "step": 27167 + }, + { + "epoch": 8.338858195211786, + "grad_norm": 0.17758040130138397, + "learning_rate": 7.066128723359877e-06, + "loss": 1.7248, + "step": 27168 + }, + { + "epoch": 8.339165131982812, + "grad_norm": 0.16506128013134003, + "learning_rate": 7.063581441977496e-06, + "loss": 1.7788, + "step": 27169 + }, + { + "epoch": 8.339472068753837, + "grad_norm": 0.18444709479808807, + "learning_rate": 7.061034584917963e-06, + "loss": 1.6958, + "step": 27170 + }, + { + "epoch": 8.339779005524862, + "grad_norm": 0.19419504702091217, + "learning_rate": 7.0584881522064605e-06, + "loss": 1.7459, + "step": 27171 + }, + { + "epoch": 8.340085942295888, + "grad_norm": 0.19482584297657013, + "learning_rate": 7.055942143868133e-06, + "loss": 1.7043, + "step": 27172 + }, + { + "epoch": 8.340392879066913, + "grad_norm": 0.20925387740135193, + "learning_rate": 7.053396559928183e-06, + "loss": 1.7817, + "step": 27173 + }, + { + "epoch": 8.340699815837937, + "grad_norm": 0.2067698836326599, + "learning_rate": 7.050851400411712e-06, + "loss": 1.729, + "step": 27174 + }, + { + "epoch": 8.341006752608962, + "grad_norm": 0.1617327481508255, + "learning_rate": 7.048306665343923e-06, + "loss": 1.6888, + "step": 27175 + }, + { + "epoch": 8.341313689379987, + "grad_norm": 0.16514994204044342, + "learning_rate": 7.045762354749924e-06, + "loss": 1.7152, + 
"step": 27176 + }, + { + "epoch": 8.341620626151013, + "grad_norm": 0.17930150032043457, + "learning_rate": 7.043218468654889e-06, + "loss": 1.8112, + "step": 27177 + }, + { + "epoch": 8.341927562922038, + "grad_norm": 0.17400570213794708, + "learning_rate": 7.040675007083941e-06, + "loss": 1.7071, + "step": 27178 + }, + { + "epoch": 8.342234499693063, + "grad_norm": 0.18226927518844604, + "learning_rate": 7.038131970062228e-06, + "loss": 1.7786, + "step": 27179 + }, + { + "epoch": 8.342541436464089, + "grad_norm": 0.15586300194263458, + "learning_rate": 7.035589357614869e-06, + "loss": 1.7414, + "step": 27180 + }, + { + "epoch": 8.342848373235114, + "grad_norm": 0.18447721004486084, + "learning_rate": 7.033047169767004e-06, + "loss": 1.7123, + "step": 27181 + }, + { + "epoch": 8.34315531000614, + "grad_norm": 0.16714699566364288, + "learning_rate": 7.030505406543747e-06, + "loss": 1.728, + "step": 27182 + }, + { + "epoch": 8.343462246777165, + "grad_norm": 0.15295952558517456, + "learning_rate": 7.027964067970228e-06, + "loss": 1.6926, + "step": 27183 + }, + { + "epoch": 8.34376918354819, + "grad_norm": 0.14499974250793457, + "learning_rate": 7.025423154071537e-06, + "loss": 1.6841, + "step": 27184 + }, + { + "epoch": 8.344076120319214, + "grad_norm": 0.15066829323768616, + "learning_rate": 7.022882664872827e-06, + "loss": 1.6593, + "step": 27185 + }, + { + "epoch": 8.344383057090239, + "grad_norm": 0.17318779230117798, + "learning_rate": 7.020342600399166e-06, + "loss": 1.698, + "step": 27186 + }, + { + "epoch": 8.344689993861264, + "grad_norm": 0.19946762919425964, + "learning_rate": 7.017802960675674e-06, + "loss": 1.7257, + "step": 27187 + }, + { + "epoch": 8.34499693063229, + "grad_norm": 0.17052631080150604, + "learning_rate": 7.015263745727441e-06, + "loss": 1.7299, + "step": 27188 + }, + { + "epoch": 8.345303867403315, + "grad_norm": 0.16269686818122864, + "learning_rate": 7.012724955579558e-06, + "loss": 1.7385, + "step": 27189 + }, + { + "epoch": 
8.34561080417434, + "grad_norm": 0.19195757806301117, + "learning_rate": 7.010186590257145e-06, + "loss": 1.7264, + "step": 27190 + }, + { + "epoch": 8.345917740945366, + "grad_norm": 0.14985592663288116, + "learning_rate": 7.007648649785248e-06, + "loss": 1.7135, + "step": 27191 + }, + { + "epoch": 8.34622467771639, + "grad_norm": 0.16438701748847961, + "learning_rate": 7.00511113418898e-06, + "loss": 1.6876, + "step": 27192 + }, + { + "epoch": 8.346531614487416, + "grad_norm": 0.241184800863266, + "learning_rate": 7.002574043493387e-06, + "loss": 1.8587, + "step": 27193 + }, + { + "epoch": 8.346838551258442, + "grad_norm": 0.17353931069374084, + "learning_rate": 7.000037377723567e-06, + "loss": 1.7465, + "step": 27194 + }, + { + "epoch": 8.347145488029465, + "grad_norm": 0.1923576444387436, + "learning_rate": 6.997501136904583e-06, + "loss": 1.7859, + "step": 27195 + }, + { + "epoch": 8.34745242480049, + "grad_norm": 0.1997295618057251, + "learning_rate": 6.994965321061492e-06, + "loss": 1.7612, + "step": 27196 + }, + { + "epoch": 8.347759361571516, + "grad_norm": 0.184821218252182, + "learning_rate": 6.992429930219363e-06, + "loss": 1.6761, + "step": 27197 + }, + { + "epoch": 8.348066298342541, + "grad_norm": 0.14091727137565613, + "learning_rate": 6.989894964403248e-06, + "loss": 1.6541, + "step": 27198 + }, + { + "epoch": 8.348373235113566, + "grad_norm": 0.13829854130744934, + "learning_rate": 6.987360423638206e-06, + "loss": 1.6814, + "step": 27199 + }, + { + "epoch": 8.348680171884592, + "grad_norm": 0.12685348093509674, + "learning_rate": 6.984826307949272e-06, + "loss": 1.6498, + "step": 27200 + }, + { + "epoch": 8.348987108655617, + "grad_norm": 0.17062726616859436, + "learning_rate": 6.9822926173614856e-06, + "loss": 1.7138, + "step": 27201 + }, + { + "epoch": 8.349294045426642, + "grad_norm": 0.15178726613521576, + "learning_rate": 6.979759351899923e-06, + "loss": 1.756, + "step": 27202 + }, + { + "epoch": 8.349600982197668, + "grad_norm": 
0.1897916942834854, + "learning_rate": 6.97722651158958e-06, + "loss": 1.7317, + "step": 27203 + }, + { + "epoch": 8.349907918968693, + "grad_norm": 0.13750115036964417, + "learning_rate": 6.974694096455503e-06, + "loss": 1.6853, + "step": 27204 + }, + { + "epoch": 8.350214855739718, + "grad_norm": 0.17380347847938538, + "learning_rate": 6.972162106522717e-06, + "loss": 1.728, + "step": 27205 + }, + { + "epoch": 8.350521792510742, + "grad_norm": 0.1593543291091919, + "learning_rate": 6.96963054181623e-06, + "loss": 1.6904, + "step": 27206 + }, + { + "epoch": 8.350828729281767, + "grad_norm": 0.1569581925868988, + "learning_rate": 6.967099402361099e-06, + "loss": 1.6995, + "step": 27207 + }, + { + "epoch": 8.351135666052793, + "grad_norm": 0.180283784866333, + "learning_rate": 6.9645686881822935e-06, + "loss": 1.6755, + "step": 27208 + }, + { + "epoch": 8.351442602823818, + "grad_norm": 0.2145276516675949, + "learning_rate": 6.9620383993048654e-06, + "loss": 1.7705, + "step": 27209 + }, + { + "epoch": 8.351749539594843, + "grad_norm": 0.15903061628341675, + "learning_rate": 6.959508535753772e-06, + "loss": 1.702, + "step": 27210 + }, + { + "epoch": 8.352056476365869, + "grad_norm": 0.16429775953292847, + "learning_rate": 6.9569790975540565e-06, + "loss": 1.6656, + "step": 27211 + }, + { + "epoch": 8.352363413136894, + "grad_norm": 0.1546638011932373, + "learning_rate": 6.954450084730707e-06, + "loss": 1.681, + "step": 27212 + }, + { + "epoch": 8.35267034990792, + "grad_norm": 0.17022907733917236, + "learning_rate": 6.951921497308705e-06, + "loss": 1.7094, + "step": 27213 + }, + { + "epoch": 8.352977286678945, + "grad_norm": 0.18317057192325592, + "learning_rate": 6.949393335313048e-06, + "loss": 1.7395, + "step": 27214 + }, + { + "epoch": 8.35328422344997, + "grad_norm": 0.1707061231136322, + "learning_rate": 6.94686559876872e-06, + "loss": 1.6918, + "step": 27215 + }, + { + "epoch": 8.353591160220994, + "grad_norm": 0.171799436211586, + "learning_rate": 
6.944338287700697e-06, + "loss": 1.7173, + "step": 27216 + }, + { + "epoch": 8.353898096992019, + "grad_norm": 0.14982536435127258, + "learning_rate": 6.941811402133963e-06, + "loss": 1.7244, + "step": 27217 + }, + { + "epoch": 8.354205033763044, + "grad_norm": 0.1584668904542923, + "learning_rate": 6.939284942093471e-06, + "loss": 1.7023, + "step": 27218 + }, + { + "epoch": 8.35451197053407, + "grad_norm": 0.18367518484592438, + "learning_rate": 6.93675890760423e-06, + "loss": 1.6977, + "step": 27219 + }, + { + "epoch": 8.354818907305095, + "grad_norm": 0.2665458619594574, + "learning_rate": 6.934233298691167e-06, + "loss": 1.7711, + "step": 27220 + }, + { + "epoch": 8.35512584407612, + "grad_norm": 0.1657658815383911, + "learning_rate": 6.931708115379249e-06, + "loss": 1.6957, + "step": 27221 + }, + { + "epoch": 8.355432780847146, + "grad_norm": 0.17687681317329407, + "learning_rate": 6.929183357693436e-06, + "loss": 1.7163, + "step": 27222 + }, + { + "epoch": 8.355739717618171, + "grad_norm": 0.1775265783071518, + "learning_rate": 6.926659025658666e-06, + "loss": 1.7595, + "step": 27223 + }, + { + "epoch": 8.356046654389196, + "grad_norm": 0.1962285041809082, + "learning_rate": 6.924135119299919e-06, + "loss": 1.7852, + "step": 27224 + }, + { + "epoch": 8.356353591160222, + "grad_norm": 0.17352642118930817, + "learning_rate": 6.921611638642095e-06, + "loss": 1.748, + "step": 27225 + }, + { + "epoch": 8.356660527931247, + "grad_norm": 0.19602125883102417, + "learning_rate": 6.919088583710176e-06, + "loss": 1.685, + "step": 27226 + }, + { + "epoch": 8.35696746470227, + "grad_norm": 0.15199948847293854, + "learning_rate": 6.9165659545290525e-06, + "loss": 1.6641, + "step": 27227 + }, + { + "epoch": 8.357274401473296, + "grad_norm": 0.15671736001968384, + "learning_rate": 6.914043751123683e-06, + "loss": 1.6915, + "step": 27228 + }, + { + "epoch": 8.357581338244321, + "grad_norm": 0.19513672590255737, + "learning_rate": 6.911521973518992e-06, + "loss": 1.7526, + 
"step": 27229 + }, + { + "epoch": 8.357888275015346, + "grad_norm": 0.15108506381511688, + "learning_rate": 6.9090006217398975e-06, + "loss": 1.7167, + "step": 27230 + }, + { + "epoch": 8.358195211786372, + "grad_norm": 0.19638952612876892, + "learning_rate": 6.906479695811307e-06, + "loss": 1.6937, + "step": 27231 + }, + { + "epoch": 8.358502148557397, + "grad_norm": 0.14345301687717438, + "learning_rate": 6.903959195758148e-06, + "loss": 1.7295, + "step": 27232 + }, + { + "epoch": 8.358809085328422, + "grad_norm": 0.1557627171278, + "learning_rate": 6.901439121605324e-06, + "loss": 1.7146, + "step": 27233 + }, + { + "epoch": 8.359116022099448, + "grad_norm": 0.15030202269554138, + "learning_rate": 6.898919473377741e-06, + "loss": 1.6974, + "step": 27234 + }, + { + "epoch": 8.359422958870473, + "grad_norm": 0.24213968217372894, + "learning_rate": 6.896400251100283e-06, + "loss": 1.8179, + "step": 27235 + }, + { + "epoch": 8.359729895641498, + "grad_norm": 0.1646348387002945, + "learning_rate": 6.893881454797885e-06, + "loss": 1.7001, + "step": 27236 + }, + { + "epoch": 8.360036832412524, + "grad_norm": 0.18399927020072937, + "learning_rate": 6.891363084495406e-06, + "loss": 1.746, + "step": 27237 + }, + { + "epoch": 8.360343769183547, + "grad_norm": 0.19470340013504028, + "learning_rate": 6.8888451402177365e-06, + "loss": 1.7442, + "step": 27238 + }, + { + "epoch": 8.360650705954573, + "grad_norm": 0.1420234590768814, + "learning_rate": 6.886327621989775e-06, + "loss": 1.6481, + "step": 27239 + }, + { + "epoch": 8.360957642725598, + "grad_norm": 0.1827881634235382, + "learning_rate": 6.883810529836382e-06, + "loss": 1.6842, + "step": 27240 + }, + { + "epoch": 8.361264579496623, + "grad_norm": 0.19096913933753967, + "learning_rate": 6.881293863782468e-06, + "loss": 1.7061, + "step": 27241 + }, + { + "epoch": 8.361571516267649, + "grad_norm": 0.1871458888053894, + "learning_rate": 6.878777623852855e-06, + "loss": 1.7607, + "step": 27242 + }, + { + "epoch": 
8.361878453038674, + "grad_norm": 0.13643455505371094, + "learning_rate": 6.876261810072459e-06, + "loss": 1.6747, + "step": 27243 + }, + { + "epoch": 8.3621853898097, + "grad_norm": 0.16990543901920319, + "learning_rate": 6.8737464224660985e-06, + "loss": 1.7318, + "step": 27244 + }, + { + "epoch": 8.362492326580725, + "grad_norm": 0.16357167065143585, + "learning_rate": 6.871231461058658e-06, + "loss": 1.6609, + "step": 27245 + }, + { + "epoch": 8.36279926335175, + "grad_norm": 0.20114652812480927, + "learning_rate": 6.868716925874996e-06, + "loss": 1.7647, + "step": 27246 + }, + { + "epoch": 8.363106200122775, + "grad_norm": 0.18387655913829803, + "learning_rate": 6.866202816939949e-06, + "loss": 1.7213, + "step": 27247 + }, + { + "epoch": 8.3634131368938, + "grad_norm": 0.18712659180164337, + "learning_rate": 6.863689134278367e-06, + "loss": 1.7144, + "step": 27248 + }, + { + "epoch": 8.363720073664824, + "grad_norm": 0.19831795990467072, + "learning_rate": 6.861175877915088e-06, + "loss": 1.7396, + "step": 27249 + }, + { + "epoch": 8.36402701043585, + "grad_norm": 0.2181798815727234, + "learning_rate": 6.858663047874958e-06, + "loss": 1.7523, + "step": 27250 + }, + { + "epoch": 8.364333947206875, + "grad_norm": 0.17912371456623077, + "learning_rate": 6.856150644182807e-06, + "loss": 1.7617, + "step": 27251 + }, + { + "epoch": 8.3646408839779, + "grad_norm": 0.16200366616249084, + "learning_rate": 6.85363866686346e-06, + "loss": 1.6886, + "step": 27252 + }, + { + "epoch": 8.364947820748926, + "grad_norm": 0.18456755578517914, + "learning_rate": 6.851127115941747e-06, + "loss": 1.6873, + "step": 27253 + }, + { + "epoch": 8.365254757519951, + "grad_norm": 0.1649440973997116, + "learning_rate": 6.848615991442487e-06, + "loss": 1.7024, + "step": 27254 + }, + { + "epoch": 8.365561694290976, + "grad_norm": 0.17722025513648987, + "learning_rate": 6.846105293390492e-06, + "loss": 1.7401, + "step": 27255 + }, + { + "epoch": 8.365868631062002, + "grad_norm": 
0.18342679738998413, + "learning_rate": 6.843595021810578e-06, + "loss": 1.7285, + "step": 27256 + }, + { + "epoch": 8.366175567833027, + "grad_norm": 0.13590754568576813, + "learning_rate": 6.841085176727557e-06, + "loss": 1.6704, + "step": 27257 + }, + { + "epoch": 8.366482504604052, + "grad_norm": 0.16721662878990173, + "learning_rate": 6.838575758166221e-06, + "loss": 1.7371, + "step": 27258 + }, + { + "epoch": 8.366789441375076, + "grad_norm": 0.15011465549468994, + "learning_rate": 6.836066766151372e-06, + "loss": 1.668, + "step": 27259 + }, + { + "epoch": 8.367096378146101, + "grad_norm": 0.15394380688667297, + "learning_rate": 6.833558200707835e-06, + "loss": 1.7402, + "step": 27260 + }, + { + "epoch": 8.367403314917127, + "grad_norm": 0.2134244441986084, + "learning_rate": 6.83105006186035e-06, + "loss": 1.6979, + "step": 27261 + }, + { + "epoch": 8.367710251688152, + "grad_norm": 0.2169496864080429, + "learning_rate": 6.8285423496337375e-06, + "loss": 1.7821, + "step": 27262 + }, + { + "epoch": 8.368017188459177, + "grad_norm": 0.16033586859703064, + "learning_rate": 6.8260350640527774e-06, + "loss": 1.6976, + "step": 27263 + }, + { + "epoch": 8.368324125230203, + "grad_norm": 0.2089877724647522, + "learning_rate": 6.823528205142244e-06, + "loss": 1.7532, + "step": 27264 + }, + { + "epoch": 8.368631062001228, + "grad_norm": 0.12897463142871857, + "learning_rate": 6.821021772926911e-06, + "loss": 1.6445, + "step": 27265 + }, + { + "epoch": 8.368937998772253, + "grad_norm": 0.18726956844329834, + "learning_rate": 6.818515767431549e-06, + "loss": 1.7296, + "step": 27266 + }, + { + "epoch": 8.369244935543279, + "grad_norm": 0.1857292354106903, + "learning_rate": 6.816010188680927e-06, + "loss": 1.7747, + "step": 27267 + }, + { + "epoch": 8.369551872314304, + "grad_norm": 0.24680334329605103, + "learning_rate": 6.813505036699802e-06, + "loss": 1.7877, + "step": 27268 + }, + { + "epoch": 8.36985880908533, + "grad_norm": 0.1404808908700943, + "learning_rate": 
6.811000311512927e-06, + "loss": 1.6769, + "step": 27269 + }, + { + "epoch": 8.370165745856353, + "grad_norm": 0.18543009459972382, + "learning_rate": 6.808496013145066e-06, + "loss": 1.7325, + "step": 27270 + }, + { + "epoch": 8.370472682627378, + "grad_norm": 0.13881617784500122, + "learning_rate": 6.805992141620959e-06, + "loss": 1.7022, + "step": 27271 + }, + { + "epoch": 8.370779619398403, + "grad_norm": 0.18534715473651886, + "learning_rate": 6.80348869696536e-06, + "loss": 1.7609, + "step": 27272 + }, + { + "epoch": 8.371086556169429, + "grad_norm": 0.20225360989570618, + "learning_rate": 6.800985679202998e-06, + "loss": 1.7159, + "step": 27273 + }, + { + "epoch": 8.371393492940454, + "grad_norm": 0.1462840884923935, + "learning_rate": 6.79848308835862e-06, + "loss": 1.6607, + "step": 27274 + }, + { + "epoch": 8.37170042971148, + "grad_norm": 0.17453989386558533, + "learning_rate": 6.795980924456952e-06, + "loss": 1.7705, + "step": 27275 + }, + { + "epoch": 8.372007366482505, + "grad_norm": 0.15709565579891205, + "learning_rate": 6.793479187522711e-06, + "loss": 1.6961, + "step": 27276 + }, + { + "epoch": 8.37231430325353, + "grad_norm": 0.14979243278503418, + "learning_rate": 6.790977877580656e-06, + "loss": 1.6817, + "step": 27277 + }, + { + "epoch": 8.372621240024555, + "grad_norm": 0.16452275216579437, + "learning_rate": 6.7884769946554575e-06, + "loss": 1.693, + "step": 27278 + }, + { + "epoch": 8.37292817679558, + "grad_norm": 0.18353265523910522, + "learning_rate": 6.785976538771882e-06, + "loss": 1.7003, + "step": 27279 + }, + { + "epoch": 8.373235113566606, + "grad_norm": 0.15123683214187622, + "learning_rate": 6.783476509954595e-06, + "loss": 1.6611, + "step": 27280 + }, + { + "epoch": 8.37354205033763, + "grad_norm": 0.19939517974853516, + "learning_rate": 6.780976908228332e-06, + "loss": 1.7969, + "step": 27281 + }, + { + "epoch": 8.373848987108655, + "grad_norm": 0.2997080981731415, + "learning_rate": 6.778477733617783e-06, + "loss": 1.7822, + 
"step": 27282 + }, + { + "epoch": 8.37415592387968, + "grad_norm": 0.13474299013614655, + "learning_rate": 6.775978986147657e-06, + "loss": 1.7155, + "step": 27283 + }, + { + "epoch": 8.374462860650706, + "grad_norm": 0.15992368757724762, + "learning_rate": 6.773480665842635e-06, + "loss": 1.6985, + "step": 27284 + }, + { + "epoch": 8.374769797421731, + "grad_norm": 0.15250587463378906, + "learning_rate": 6.770982772727413e-06, + "loss": 1.7007, + "step": 27285 + }, + { + "epoch": 8.375076734192756, + "grad_norm": 0.1373993456363678, + "learning_rate": 6.768485306826683e-06, + "loss": 1.6852, + "step": 27286 + }, + { + "epoch": 8.375383670963782, + "grad_norm": 0.15772612392902374, + "learning_rate": 6.765988268165113e-06, + "loss": 1.6881, + "step": 27287 + }, + { + "epoch": 8.375690607734807, + "grad_norm": 0.13689690828323364, + "learning_rate": 6.76349165676739e-06, + "loss": 1.6747, + "step": 27288 + }, + { + "epoch": 8.375997544505832, + "grad_norm": 0.18657375872135162, + "learning_rate": 6.7609954726581825e-06, + "loss": 1.7324, + "step": 27289 + }, + { + "epoch": 8.376304481276858, + "grad_norm": 0.16617898643016815, + "learning_rate": 6.758499715862166e-06, + "loss": 1.6832, + "step": 27290 + }, + { + "epoch": 8.376611418047883, + "grad_norm": 0.16960306465625763, + "learning_rate": 6.756004386403996e-06, + "loss": 1.7353, + "step": 27291 + }, + { + "epoch": 8.376918354818907, + "grad_norm": 0.17030803859233856, + "learning_rate": 6.753509484308334e-06, + "loss": 1.7079, + "step": 27292 + }, + { + "epoch": 8.377225291589932, + "grad_norm": 0.16151085495948792, + "learning_rate": 6.751015009599831e-06, + "loss": 1.6706, + "step": 27293 + }, + { + "epoch": 8.377532228360957, + "grad_norm": 0.1715710461139679, + "learning_rate": 6.748520962303173e-06, + "loss": 1.7116, + "step": 27294 + }, + { + "epoch": 8.377839165131983, + "grad_norm": 0.20747625827789307, + "learning_rate": 6.746027342442951e-06, + "loss": 1.731, + "step": 27295 + }, + { + "epoch": 
8.378146101903008, + "grad_norm": 0.1645912081003189, + "learning_rate": 6.743534150043867e-06, + "loss": 1.7076, + "step": 27296 + }, + { + "epoch": 8.378453038674033, + "grad_norm": 0.16044393181800842, + "learning_rate": 6.741041385130509e-06, + "loss": 1.7105, + "step": 27297 + }, + { + "epoch": 8.378759975445059, + "grad_norm": 0.18224483728408813, + "learning_rate": 6.738549047727543e-06, + "loss": 1.7258, + "step": 27298 + }, + { + "epoch": 8.379066912216084, + "grad_norm": 0.17351657152175903, + "learning_rate": 6.7360571378595915e-06, + "loss": 1.7369, + "step": 27299 + }, + { + "epoch": 8.37937384898711, + "grad_norm": 0.18293599784374237, + "learning_rate": 6.733565655551283e-06, + "loss": 1.7151, + "step": 27300 + }, + { + "epoch": 8.379680785758135, + "grad_norm": 0.1593983918428421, + "learning_rate": 6.731074600827242e-06, + "loss": 1.6544, + "step": 27301 + }, + { + "epoch": 8.379987722529158, + "grad_norm": 0.16315947473049164, + "learning_rate": 6.728583973712077e-06, + "loss": 1.7442, + "step": 27302 + }, + { + "epoch": 8.380294659300183, + "grad_norm": 0.13841219246387482, + "learning_rate": 6.726093774230408e-06, + "loss": 1.6639, + "step": 27303 + }, + { + "epoch": 8.380601596071209, + "grad_norm": 0.14162768423557281, + "learning_rate": 6.723604002406847e-06, + "loss": 1.6713, + "step": 27304 + }, + { + "epoch": 8.380908532842234, + "grad_norm": 0.1737380474805832, + "learning_rate": 6.721114658265992e-06, + "loss": 1.7197, + "step": 27305 + }, + { + "epoch": 8.38121546961326, + "grad_norm": 0.15531061589717865, + "learning_rate": 6.718625741832452e-06, + "loss": 1.7337, + "step": 27306 + }, + { + "epoch": 8.381522406384285, + "grad_norm": 0.1833781898021698, + "learning_rate": 6.716137253130816e-06, + "loss": 1.7838, + "step": 27307 + }, + { + "epoch": 8.38182934315531, + "grad_norm": 0.23010820150375366, + "learning_rate": 6.713649192185683e-06, + "loss": 1.7023, + "step": 27308 + }, + { + "epoch": 8.382136279926335, + "grad_norm": 
0.14409376680850983, + "learning_rate": 6.7111615590216445e-06, + "loss": 1.6968, + "step": 27309 + }, + { + "epoch": 8.38244321669736, + "grad_norm": 0.19448643922805786, + "learning_rate": 6.7086743536632635e-06, + "loss": 1.7117, + "step": 27310 + }, + { + "epoch": 8.382750153468386, + "grad_norm": 0.18580564856529236, + "learning_rate": 6.706187576135159e-06, + "loss": 1.8183, + "step": 27311 + }, + { + "epoch": 8.383057090239411, + "grad_norm": 0.20270103216171265, + "learning_rate": 6.7037012264618675e-06, + "loss": 1.7666, + "step": 27312 + }, + { + "epoch": 8.383364027010435, + "grad_norm": 0.16575069725513458, + "learning_rate": 6.7012153046679904e-06, + "loss": 1.7542, + "step": 27313 + }, + { + "epoch": 8.38367096378146, + "grad_norm": 0.16375242173671722, + "learning_rate": 6.698729810778065e-06, + "loss": 1.7117, + "step": 27314 + }, + { + "epoch": 8.383977900552486, + "grad_norm": 0.2082248479127884, + "learning_rate": 6.696244744816682e-06, + "loss": 1.7687, + "step": 27315 + }, + { + "epoch": 8.384284837323511, + "grad_norm": 0.1562620848417282, + "learning_rate": 6.693760106808389e-06, + "loss": 1.6782, + "step": 27316 + }, + { + "epoch": 8.384591774094536, + "grad_norm": 0.1883714199066162, + "learning_rate": 6.6912758967777435e-06, + "loss": 1.7023, + "step": 27317 + }, + { + "epoch": 8.384898710865562, + "grad_norm": 0.17445886135101318, + "learning_rate": 6.688792114749292e-06, + "loss": 1.7019, + "step": 27318 + }, + { + "epoch": 8.385205647636587, + "grad_norm": 0.20479950308799744, + "learning_rate": 6.686308760747584e-06, + "loss": 1.7514, + "step": 27319 + }, + { + "epoch": 8.385512584407612, + "grad_norm": 0.21790143847465515, + "learning_rate": 6.683825834797153e-06, + "loss": 1.7243, + "step": 27320 + }, + { + "epoch": 8.385819521178638, + "grad_norm": 0.1784016340970993, + "learning_rate": 6.681343336922552e-06, + "loss": 1.7301, + "step": 27321 + }, + { + "epoch": 8.386126457949663, + "grad_norm": 0.22286179661750793, + 
"learning_rate": 6.678861267148301e-06, + "loss": 1.7231, + "step": 27322 + }, + { + "epoch": 8.386433394720688, + "grad_norm": 0.17854957282543182, + "learning_rate": 6.676379625498935e-06, + "loss": 1.7216, + "step": 27323 + }, + { + "epoch": 8.386740331491712, + "grad_norm": 0.1750447154045105, + "learning_rate": 6.67389841199898e-06, + "loss": 1.7603, + "step": 27324 + }, + { + "epoch": 8.387047268262737, + "grad_norm": 0.17893844842910767, + "learning_rate": 6.6714176266729545e-06, + "loss": 1.7229, + "step": 27325 + }, + { + "epoch": 8.387354205033763, + "grad_norm": 0.18705782294273376, + "learning_rate": 6.6689372695453725e-06, + "loss": 1.7021, + "step": 27326 + }, + { + "epoch": 8.387661141804788, + "grad_norm": 0.18719066679477692, + "learning_rate": 6.666457340640742e-06, + "loss": 1.7216, + "step": 27327 + }, + { + "epoch": 8.387968078575813, + "grad_norm": 0.16408847272396088, + "learning_rate": 6.663977839983604e-06, + "loss": 1.6937, + "step": 27328 + }, + { + "epoch": 8.388275015346839, + "grad_norm": 0.1739223599433899, + "learning_rate": 6.661498767598407e-06, + "loss": 1.6533, + "step": 27329 + }, + { + "epoch": 8.388581952117864, + "grad_norm": 0.19943352043628693, + "learning_rate": 6.6590201235097075e-06, + "loss": 1.753, + "step": 27330 + }, + { + "epoch": 8.38888888888889, + "grad_norm": 0.1412268429994583, + "learning_rate": 6.656541907741954e-06, + "loss": 1.6669, + "step": 27331 + }, + { + "epoch": 8.389195825659915, + "grad_norm": 0.17952445149421692, + "learning_rate": 6.654064120319664e-06, + "loss": 1.6921, + "step": 27332 + }, + { + "epoch": 8.38950276243094, + "grad_norm": 0.22117477655410767, + "learning_rate": 6.65158676126732e-06, + "loss": 1.7677, + "step": 27333 + }, + { + "epoch": 8.389809699201965, + "grad_norm": 0.1926339566707611, + "learning_rate": 6.649109830609401e-06, + "loss": 1.7237, + "step": 27334 + }, + { + "epoch": 8.390116635972989, + "grad_norm": 0.3306657671928406, + "learning_rate": 6.646633328370394e-06, + 
"loss": 1.7735, + "step": 27335 + }, + { + "epoch": 8.390423572744014, + "grad_norm": 0.14908578991889954, + "learning_rate": 6.644157254574762e-06, + "loss": 1.7109, + "step": 27336 + }, + { + "epoch": 8.39073050951504, + "grad_norm": 0.20824603736400604, + "learning_rate": 6.64168160924698e-06, + "loss": 1.7177, + "step": 27337 + }, + { + "epoch": 8.391037446286065, + "grad_norm": 0.22669748961925507, + "learning_rate": 6.6392063924115125e-06, + "loss": 1.7842, + "step": 27338 + }, + { + "epoch": 8.39134438305709, + "grad_norm": 0.16690780222415924, + "learning_rate": 6.6367316040928215e-06, + "loss": 1.739, + "step": 27339 + }, + { + "epoch": 8.391651319828116, + "grad_norm": 0.17900501191616058, + "learning_rate": 6.634257244315367e-06, + "loss": 1.705, + "step": 27340 + }, + { + "epoch": 8.39195825659914, + "grad_norm": 0.18606948852539062, + "learning_rate": 6.631783313103595e-06, + "loss": 1.7324, + "step": 27341 + }, + { + "epoch": 8.392265193370166, + "grad_norm": 0.15370480716228485, + "learning_rate": 6.629309810481965e-06, + "loss": 1.6834, + "step": 27342 + }, + { + "epoch": 8.392572130141192, + "grad_norm": 0.13654825091362, + "learning_rate": 6.626836736474917e-06, + "loss": 1.6729, + "step": 27343 + }, + { + "epoch": 8.392879066912217, + "grad_norm": 0.21128645539283752, + "learning_rate": 6.624364091106877e-06, + "loss": 1.7494, + "step": 27344 + }, + { + "epoch": 8.39318600368324, + "grad_norm": 0.1608622819185257, + "learning_rate": 6.621891874402314e-06, + "loss": 1.6951, + "step": 27345 + }, + { + "epoch": 8.393492940454266, + "grad_norm": 0.20148086547851562, + "learning_rate": 6.619420086385619e-06, + "loss": 1.7616, + "step": 27346 + }, + { + "epoch": 8.393799877225291, + "grad_norm": 0.1927247792482376, + "learning_rate": 6.616948727081262e-06, + "loss": 1.7088, + "step": 27347 + }, + { + "epoch": 8.394106813996316, + "grad_norm": 0.18318399786949158, + "learning_rate": 6.614477796513629e-06, + "loss": 1.7176, + "step": 27348 + }, + { + 
"epoch": 8.394413750767342, + "grad_norm": 0.20923443138599396, + "learning_rate": 6.612007294707162e-06, + "loss": 1.758, + "step": 27349 + }, + { + "epoch": 8.394720687538367, + "grad_norm": 0.20041905343532562, + "learning_rate": 6.609537221686268e-06, + "loss": 1.6843, + "step": 27350 + }, + { + "epoch": 8.395027624309392, + "grad_norm": 0.13480354845523834, + "learning_rate": 6.607067577475362e-06, + "loss": 1.6766, + "step": 27351 + }, + { + "epoch": 8.395334561080418, + "grad_norm": 0.2022085338830948, + "learning_rate": 6.604598362098846e-06, + "loss": 1.7448, + "step": 27352 + }, + { + "epoch": 8.395641497851443, + "grad_norm": 0.21842770278453827, + "learning_rate": 6.602129575581123e-06, + "loss": 1.7202, + "step": 27353 + }, + { + "epoch": 8.395948434622468, + "grad_norm": 0.16519947350025177, + "learning_rate": 6.599661217946596e-06, + "loss": 1.7036, + "step": 27354 + }, + { + "epoch": 8.396255371393494, + "grad_norm": 0.14931483566761017, + "learning_rate": 6.59719328921965e-06, + "loss": 1.7244, + "step": 27355 + }, + { + "epoch": 8.396562308164517, + "grad_norm": 0.22807423770427704, + "learning_rate": 6.594725789424683e-06, + "loss": 1.7758, + "step": 27356 + }, + { + "epoch": 8.396869244935543, + "grad_norm": 0.15723249316215515, + "learning_rate": 6.592258718586075e-06, + "loss": 1.7033, + "step": 27357 + }, + { + "epoch": 8.397176181706568, + "grad_norm": 0.1934487521648407, + "learning_rate": 6.589792076728207e-06, + "loss": 1.7767, + "step": 27358 + }, + { + "epoch": 8.397483118477593, + "grad_norm": 0.16923396289348602, + "learning_rate": 6.587325863875454e-06, + "loss": 1.7125, + "step": 27359 + }, + { + "epoch": 8.397790055248619, + "grad_norm": 0.1533476561307907, + "learning_rate": 6.584860080052196e-06, + "loss": 1.7245, + "step": 27360 + }, + { + "epoch": 8.398096992019644, + "grad_norm": 0.1610613465309143, + "learning_rate": 6.582394725282786e-06, + "loss": 1.6974, + "step": 27361 + }, + { + "epoch": 8.39840392879067, + "grad_norm": 
0.19170965254306793, + "learning_rate": 6.579929799591622e-06, + "loss": 1.6956, + "step": 27362 + }, + { + "epoch": 8.398710865561695, + "grad_norm": 0.17479272186756134, + "learning_rate": 6.5774653030030164e-06, + "loss": 1.699, + "step": 27363 + }, + { + "epoch": 8.39901780233272, + "grad_norm": 0.15651267766952515, + "learning_rate": 6.575001235541378e-06, + "loss": 1.655, + "step": 27364 + }, + { + "epoch": 8.399324739103745, + "grad_norm": 0.13939335942268372, + "learning_rate": 6.572537597230999e-06, + "loss": 1.6963, + "step": 27365 + }, + { + "epoch": 8.399631675874769, + "grad_norm": 0.16157624125480652, + "learning_rate": 6.570074388096275e-06, + "loss": 1.6811, + "step": 27366 + }, + { + "epoch": 8.399938612645794, + "grad_norm": 0.16065873205661774, + "learning_rate": 6.567611608161528e-06, + "loss": 1.7104, + "step": 27367 + }, + { + "epoch": 8.40024554941682, + "grad_norm": 0.1657525599002838, + "learning_rate": 6.565149257451098e-06, + "loss": 1.6884, + "step": 27368 + }, + { + "epoch": 8.400552486187845, + "grad_norm": 0.1757468432188034, + "learning_rate": 6.56268733598932e-06, + "loss": 1.7112, + "step": 27369 + }, + { + "epoch": 8.40085942295887, + "grad_norm": 0.16591452062129974, + "learning_rate": 6.560225843800527e-06, + "loss": 1.7227, + "step": 27370 + }, + { + "epoch": 8.401166359729896, + "grad_norm": 0.12153175473213196, + "learning_rate": 6.557764780909048e-06, + "loss": 1.6843, + "step": 27371 + }, + { + "epoch": 8.401473296500921, + "grad_norm": 0.13953842222690582, + "learning_rate": 6.5553041473391914e-06, + "loss": 1.6518, + "step": 27372 + }, + { + "epoch": 8.401780233271946, + "grad_norm": 0.22707831859588623, + "learning_rate": 6.552843943115289e-06, + "loss": 1.7594, + "step": 27373 + }, + { + "epoch": 8.402087170042972, + "grad_norm": 0.18743011355400085, + "learning_rate": 6.550384168261647e-06, + "loss": 1.705, + "step": 27374 + }, + { + "epoch": 8.402394106813997, + "grad_norm": 0.1784582883119583, + "learning_rate": 
6.547924822802576e-06, + "loss": 1.7861, + "step": 27375 + }, + { + "epoch": 8.402701043585022, + "grad_norm": 0.18942677974700928, + "learning_rate": 6.545465906762377e-06, + "loss": 1.7489, + "step": 27376 + }, + { + "epoch": 8.403007980356048, + "grad_norm": 0.1783999502658844, + "learning_rate": 6.543007420165354e-06, + "loss": 1.7533, + "step": 27377 + }, + { + "epoch": 8.403314917127071, + "grad_norm": 0.1497674137353897, + "learning_rate": 6.540549363035791e-06, + "loss": 1.6768, + "step": 27378 + }, + { + "epoch": 8.403621853898096, + "grad_norm": 0.15912608802318573, + "learning_rate": 6.538091735398016e-06, + "loss": 1.7656, + "step": 27379 + }, + { + "epoch": 8.403928790669122, + "grad_norm": 0.1886531114578247, + "learning_rate": 6.535634537276269e-06, + "loss": 1.7368, + "step": 27380 + }, + { + "epoch": 8.404235727440147, + "grad_norm": 0.1976786106824875, + "learning_rate": 6.5331777686948756e-06, + "loss": 1.7627, + "step": 27381 + }, + { + "epoch": 8.404542664211172, + "grad_norm": 0.1442447006702423, + "learning_rate": 6.5307214296780775e-06, + "loss": 1.6787, + "step": 27382 + }, + { + "epoch": 8.404849600982198, + "grad_norm": 0.21066388487815857, + "learning_rate": 6.528265520250182e-06, + "loss": 1.741, + "step": 27383 + }, + { + "epoch": 8.405156537753223, + "grad_norm": 0.19657589495182037, + "learning_rate": 6.525810040435443e-06, + "loss": 1.74, + "step": 27384 + }, + { + "epoch": 8.405463474524248, + "grad_norm": 0.20377841591835022, + "learning_rate": 6.5233549902581296e-06, + "loss": 1.7086, + "step": 27385 + }, + { + "epoch": 8.405770411295274, + "grad_norm": 0.16641706228256226, + "learning_rate": 6.520900369742505e-06, + "loss": 1.6897, + "step": 27386 + }, + { + "epoch": 8.4060773480663, + "grad_norm": 0.177897647023201, + "learning_rate": 6.518446178912829e-06, + "loss": 1.7781, + "step": 27387 + }, + { + "epoch": 8.406384284837323, + "grad_norm": 0.2529480755329132, + "learning_rate": 6.515992417793354e-06, + "loss": 1.7227, + 
"step": 27388 + }, + { + "epoch": 8.406691221608348, + "grad_norm": 0.17020392417907715, + "learning_rate": 6.513539086408327e-06, + "loss": 1.6836, + "step": 27389 + }, + { + "epoch": 8.406998158379373, + "grad_norm": 0.1621706336736679, + "learning_rate": 6.5110861847819944e-06, + "loss": 1.7263, + "step": 27390 + }, + { + "epoch": 8.407305095150399, + "grad_norm": 0.15788327157497406, + "learning_rate": 6.508633712938594e-06, + "loss": 1.7155, + "step": 27391 + }, + { + "epoch": 8.407612031921424, + "grad_norm": 0.1595151722431183, + "learning_rate": 6.5061816709023724e-06, + "loss": 1.7051, + "step": 27392 + }, + { + "epoch": 8.40791896869245, + "grad_norm": 0.2065821886062622, + "learning_rate": 6.503730058697555e-06, + "loss": 1.7435, + "step": 27393 + }, + { + "epoch": 8.408225905463475, + "grad_norm": 0.18513742089271545, + "learning_rate": 6.501278876348371e-06, + "loss": 1.7976, + "step": 27394 + }, + { + "epoch": 8.4085328422345, + "grad_norm": 0.1819298416376114, + "learning_rate": 6.4988281238790305e-06, + "loss": 1.7656, + "step": 27395 + }, + { + "epoch": 8.408839779005525, + "grad_norm": 0.17593856155872345, + "learning_rate": 6.496377801313791e-06, + "loss": 1.7436, + "step": 27396 + }, + { + "epoch": 8.40914671577655, + "grad_norm": 0.1425786167383194, + "learning_rate": 6.493927908676822e-06, + "loss": 1.7365, + "step": 27397 + }, + { + "epoch": 8.409453652547576, + "grad_norm": 0.1689717322587967, + "learning_rate": 6.491478445992383e-06, + "loss": 1.7116, + "step": 27398 + }, + { + "epoch": 8.4097605893186, + "grad_norm": 0.1530478596687317, + "learning_rate": 6.489029413284631e-06, + "loss": 1.7232, + "step": 27399 + }, + { + "epoch": 8.410067526089625, + "grad_norm": 0.16928789019584656, + "learning_rate": 6.486580810577802e-06, + "loss": 1.713, + "step": 27400 + }, + { + "epoch": 8.41037446286065, + "grad_norm": 0.19086188077926636, + "learning_rate": 6.484132637896085e-06, + "loss": 1.7495, + "step": 27401 + }, + { + "epoch": 
8.410681399631676, + "grad_norm": 0.18510590493679047, + "learning_rate": 6.481684895263679e-06, + "loss": 1.7445, + "step": 27402 + }, + { + "epoch": 8.410988336402701, + "grad_norm": 0.144667387008667, + "learning_rate": 6.479237582704767e-06, + "loss": 1.6994, + "step": 27403 + }, + { + "epoch": 8.411295273173726, + "grad_norm": 0.15467962622642517, + "learning_rate": 6.476790700243535e-06, + "loss": 1.6807, + "step": 27404 + }, + { + "epoch": 8.411602209944752, + "grad_norm": 0.13533028960227966, + "learning_rate": 6.474344247904168e-06, + "loss": 1.6746, + "step": 27405 + }, + { + "epoch": 8.411909146715777, + "grad_norm": 0.13948698341846466, + "learning_rate": 6.471898225710843e-06, + "loss": 1.7072, + "step": 27406 + }, + { + "epoch": 8.412216083486802, + "grad_norm": 0.1758929044008255, + "learning_rate": 6.469452633687734e-06, + "loss": 1.6993, + "step": 27407 + }, + { + "epoch": 8.412523020257828, + "grad_norm": 0.20594100654125214, + "learning_rate": 6.46700747185901e-06, + "loss": 1.7468, + "step": 27408 + }, + { + "epoch": 8.412829957028851, + "grad_norm": 0.18665185570716858, + "learning_rate": 6.464562740248831e-06, + "loss": 1.6829, + "step": 27409 + }, + { + "epoch": 8.413136893799877, + "grad_norm": 0.1637166142463684, + "learning_rate": 6.4621184388813595e-06, + "loss": 1.7118, + "step": 27410 + }, + { + "epoch": 8.413443830570902, + "grad_norm": 0.1653725504875183, + "learning_rate": 6.459674567780749e-06, + "loss": 1.6986, + "step": 27411 + }, + { + "epoch": 8.413750767341927, + "grad_norm": 0.16381777822971344, + "learning_rate": 6.457231126971158e-06, + "loss": 1.7389, + "step": 27412 + }, + { + "epoch": 8.414057704112953, + "grad_norm": 0.14706309139728546, + "learning_rate": 6.454788116476734e-06, + "loss": 1.6629, + "step": 27413 + }, + { + "epoch": 8.414364640883978, + "grad_norm": 0.17818714678287506, + "learning_rate": 6.4523455363215964e-06, + "loss": 1.761, + "step": 27414 + }, + { + "epoch": 8.414671577655003, + "grad_norm": 
0.18425707519054413, + "learning_rate": 6.449903386529932e-06, + "loss": 1.7169, + "step": 27415 + }, + { + "epoch": 8.414978514426029, + "grad_norm": 0.182805597782135, + "learning_rate": 6.4474616671258255e-06, + "loss": 1.6916, + "step": 27416 + }, + { + "epoch": 8.415285451197054, + "grad_norm": 0.1802895963191986, + "learning_rate": 6.4450203781334426e-06, + "loss": 1.7786, + "step": 27417 + }, + { + "epoch": 8.41559238796808, + "grad_norm": 0.18067243695259094, + "learning_rate": 6.442579519576891e-06, + "loss": 1.7489, + "step": 27418 + }, + { + "epoch": 8.415899324739105, + "grad_norm": 0.20373223721981049, + "learning_rate": 6.4401390914803075e-06, + "loss": 1.7519, + "step": 27419 + }, + { + "epoch": 8.416206261510128, + "grad_norm": 0.1414610594511032, + "learning_rate": 6.437699093867794e-06, + "loss": 1.6656, + "step": 27420 + }, + { + "epoch": 8.416513198281153, + "grad_norm": 0.14516517519950867, + "learning_rate": 6.4352595267634706e-06, + "loss": 1.6599, + "step": 27421 + }, + { + "epoch": 8.416820135052179, + "grad_norm": 0.16276796162128448, + "learning_rate": 6.4328203901914465e-06, + "loss": 1.7026, + "step": 27422 + }, + { + "epoch": 8.417127071823204, + "grad_norm": 0.15957671403884888, + "learning_rate": 6.430381684175829e-06, + "loss": 1.7185, + "step": 27423 + }, + { + "epoch": 8.41743400859423, + "grad_norm": 0.1594170182943344, + "learning_rate": 6.4279434087407166e-06, + "loss": 1.7144, + "step": 27424 + }, + { + "epoch": 8.417740945365255, + "grad_norm": 0.14235691726207733, + "learning_rate": 6.425505563910206e-06, + "loss": 1.6487, + "step": 27425 + }, + { + "epoch": 8.41804788213628, + "grad_norm": 0.17203880846500397, + "learning_rate": 6.423068149708389e-06, + "loss": 1.7252, + "step": 27426 + }, + { + "epoch": 8.418354818907305, + "grad_norm": 0.15193019807338715, + "learning_rate": 6.420631166159352e-06, + "loss": 1.7346, + "step": 27427 + }, + { + "epoch": 8.41866175567833, + "grad_norm": 0.17005006968975067, + "learning_rate": 
6.418194613287182e-06, + "loss": 1.7679, + "step": 27428 + }, + { + "epoch": 8.418968692449356, + "grad_norm": 0.15492422878742218, + "learning_rate": 6.415758491115953e-06, + "loss": 1.6962, + "step": 27429 + }, + { + "epoch": 8.419275629220381, + "grad_norm": 0.13465845584869385, + "learning_rate": 6.413322799669752e-06, + "loss": 1.676, + "step": 27430 + }, + { + "epoch": 8.419582565991405, + "grad_norm": 0.20086030662059784, + "learning_rate": 6.410887538972626e-06, + "loss": 1.7341, + "step": 27431 + }, + { + "epoch": 8.41988950276243, + "grad_norm": 0.12862804532051086, + "learning_rate": 6.408452709048679e-06, + "loss": 1.6456, + "step": 27432 + }, + { + "epoch": 8.420196439533456, + "grad_norm": 0.1520070731639862, + "learning_rate": 6.40601830992193e-06, + "loss": 1.7169, + "step": 27433 + }, + { + "epoch": 8.420503376304481, + "grad_norm": 0.15394441783428192, + "learning_rate": 6.4035843416164865e-06, + "loss": 1.6876, + "step": 27434 + }, + { + "epoch": 8.420810313075506, + "grad_norm": 0.15149196982383728, + "learning_rate": 6.4011508041563475e-06, + "loss": 1.7126, + "step": 27435 + }, + { + "epoch": 8.421117249846532, + "grad_norm": 0.14014703035354614, + "learning_rate": 6.398717697565604e-06, + "loss": 1.6554, + "step": 27436 + }, + { + "epoch": 8.421424186617557, + "grad_norm": 0.1493537575006485, + "learning_rate": 6.3962850218682865e-06, + "loss": 1.6915, + "step": 27437 + }, + { + "epoch": 8.421731123388582, + "grad_norm": 0.16197362542152405, + "learning_rate": 6.393852777088438e-06, + "loss": 1.7108, + "step": 27438 + }, + { + "epoch": 8.422038060159608, + "grad_norm": 0.2058446705341339, + "learning_rate": 6.391420963250094e-06, + "loss": 1.806, + "step": 27439 + }, + { + "epoch": 8.422344996930633, + "grad_norm": 0.16983431577682495, + "learning_rate": 6.388989580377291e-06, + "loss": 1.7265, + "step": 27440 + }, + { + "epoch": 8.422651933701658, + "grad_norm": 0.15896758437156677, + "learning_rate": 6.386558628494049e-06, + "loss": 1.7081, 
+ "step": 27441 + }, + { + "epoch": 8.422958870472682, + "grad_norm": 0.15534810721874237, + "learning_rate": 6.384128107624399e-06, + "loss": 1.7218, + "step": 27442 + }, + { + "epoch": 8.423265807243707, + "grad_norm": 0.20577791333198547, + "learning_rate": 6.381698017792365e-06, + "loss": 1.7799, + "step": 27443 + }, + { + "epoch": 8.423572744014733, + "grad_norm": 0.183476984500885, + "learning_rate": 6.37926835902195e-06, + "loss": 1.7432, + "step": 27444 + }, + { + "epoch": 8.423879680785758, + "grad_norm": 0.1834617555141449, + "learning_rate": 6.376839131337175e-06, + "loss": 1.7333, + "step": 27445 + }, + { + "epoch": 8.424186617556783, + "grad_norm": 0.15556102991104126, + "learning_rate": 6.374410334762043e-06, + "loss": 1.7119, + "step": 27446 + }, + { + "epoch": 8.424493554327809, + "grad_norm": 0.14469701051712036, + "learning_rate": 6.3719819693205565e-06, + "loss": 1.6883, + "step": 27447 + }, + { + "epoch": 8.424800491098834, + "grad_norm": 0.1339770257472992, + "learning_rate": 6.369554035036706e-06, + "loss": 1.692, + "step": 27448 + }, + { + "epoch": 8.42510742786986, + "grad_norm": 0.18144701421260834, + "learning_rate": 6.367126531934514e-06, + "loss": 1.7192, + "step": 27449 + }, + { + "epoch": 8.425414364640885, + "grad_norm": 0.20075814425945282, + "learning_rate": 6.364699460037931e-06, + "loss": 1.6681, + "step": 27450 + }, + { + "epoch": 8.42572130141191, + "grad_norm": 0.14828181266784668, + "learning_rate": 6.36227281937099e-06, + "loss": 1.6955, + "step": 27451 + }, + { + "epoch": 8.426028238182933, + "grad_norm": 0.1502649486064911, + "learning_rate": 6.35984660995762e-06, + "loss": 1.6695, + "step": 27452 + }, + { + "epoch": 8.426335174953959, + "grad_norm": 0.16594241559505463, + "learning_rate": 6.3574208318218364e-06, + "loss": 1.7092, + "step": 27453 + }, + { + "epoch": 8.426642111724984, + "grad_norm": 0.2585645020008087, + "learning_rate": 6.354995484987597e-06, + "loss": 1.7358, + "step": 27454 + }, + { + "epoch": 
8.42694904849601, + "grad_norm": 0.1694081574678421, + "learning_rate": 6.352570569478877e-06, + "loss": 1.7421, + "step": 27455 + }, + { + "epoch": 8.427255985267035, + "grad_norm": 0.178135946393013, + "learning_rate": 6.350146085319647e-06, + "loss": 1.7157, + "step": 27456 + }, + { + "epoch": 8.42756292203806, + "grad_norm": 0.19647614657878876, + "learning_rate": 6.347722032533837e-06, + "loss": 1.7843, + "step": 27457 + }, + { + "epoch": 8.427869858809085, + "grad_norm": 0.1510474979877472, + "learning_rate": 6.345298411145434e-06, + "loss": 1.688, + "step": 27458 + }, + { + "epoch": 8.42817679558011, + "grad_norm": 0.2130916565656662, + "learning_rate": 6.342875221178374e-06, + "loss": 1.7817, + "step": 27459 + }, + { + "epoch": 8.428483732351136, + "grad_norm": 0.1456206738948822, + "learning_rate": 6.340452462656615e-06, + "loss": 1.6839, + "step": 27460 + }, + { + "epoch": 8.428790669122161, + "grad_norm": 0.16592659056186676, + "learning_rate": 6.338030135604089e-06, + "loss": 1.7395, + "step": 27461 + }, + { + "epoch": 8.429097605893187, + "grad_norm": 0.15017202496528625, + "learning_rate": 6.335608240044744e-06, + "loss": 1.6815, + "step": 27462 + }, + { + "epoch": 8.42940454266421, + "grad_norm": 0.14279332756996155, + "learning_rate": 6.333186776002514e-06, + "loss": 1.6845, + "step": 27463 + }, + { + "epoch": 8.429711479435236, + "grad_norm": 0.15117228031158447, + "learning_rate": 6.330765743501321e-06, + "loss": 1.7421, + "step": 27464 + }, + { + "epoch": 8.430018416206261, + "grad_norm": 0.19822575151920319, + "learning_rate": 6.328345142565084e-06, + "loss": 1.7297, + "step": 27465 + }, + { + "epoch": 8.430325352977286, + "grad_norm": 0.1589222550392151, + "learning_rate": 6.325924973217762e-06, + "loss": 1.7151, + "step": 27466 + }, + { + "epoch": 8.430632289748312, + "grad_norm": 0.19120970368385315, + "learning_rate": 6.323505235483229e-06, + "loss": 1.7373, + "step": 27467 + }, + { + "epoch": 8.430939226519337, + "grad_norm": 
0.1859981119632721, + "learning_rate": 6.321085929385434e-06, + "loss": 1.6912, + "step": 27468 + }, + { + "epoch": 8.431246163290362, + "grad_norm": 0.1745872050523758, + "learning_rate": 6.318667054948246e-06, + "loss": 1.6773, + "step": 27469 + }, + { + "epoch": 8.431553100061388, + "grad_norm": 0.13402412831783295, + "learning_rate": 6.316248612195607e-06, + "loss": 1.6905, + "step": 27470 + }, + { + "epoch": 8.431860036832413, + "grad_norm": 0.22629496455192566, + "learning_rate": 6.3138306011514045e-06, + "loss": 1.7012, + "step": 27471 + }, + { + "epoch": 8.432166973603438, + "grad_norm": 0.18746718764305115, + "learning_rate": 6.31141302183953e-06, + "loss": 1.7573, + "step": 27472 + }, + { + "epoch": 8.432473910374464, + "grad_norm": 0.18313723802566528, + "learning_rate": 6.308995874283891e-06, + "loss": 1.7358, + "step": 27473 + }, + { + "epoch": 8.432780847145487, + "grad_norm": 0.19075456261634827, + "learning_rate": 6.306579158508341e-06, + "loss": 1.7091, + "step": 27474 + }, + { + "epoch": 8.433087783916513, + "grad_norm": 0.18092980980873108, + "learning_rate": 6.304162874536796e-06, + "loss": 1.6739, + "step": 27475 + }, + { + "epoch": 8.433394720687538, + "grad_norm": 0.15624219179153442, + "learning_rate": 6.301747022393123e-06, + "loss": 1.6637, + "step": 27476 + }, + { + "epoch": 8.433701657458563, + "grad_norm": 0.14825348556041718, + "learning_rate": 6.299331602101199e-06, + "loss": 1.6865, + "step": 27477 + }, + { + "epoch": 8.434008594229589, + "grad_norm": 0.2204820215702057, + "learning_rate": 6.2969166136848946e-06, + "loss": 1.7842, + "step": 27478 + }, + { + "epoch": 8.434315531000614, + "grad_norm": 0.15570053458213806, + "learning_rate": 6.294502057168072e-06, + "loss": 1.69, + "step": 27479 + }, + { + "epoch": 8.43462246777164, + "grad_norm": 0.1686720848083496, + "learning_rate": 6.292087932574603e-06, + "loss": 1.6787, + "step": 27480 + }, + { + "epoch": 8.434929404542665, + "grad_norm": 0.2100359946489334, + "learning_rate": 
6.289674239928334e-06, + "loss": 1.7374, + "step": 27481 + }, + { + "epoch": 8.43523634131369, + "grad_norm": 0.1607038378715515, + "learning_rate": 6.287260979253112e-06, + "loss": 1.7067, + "step": 27482 + }, + { + "epoch": 8.435543278084715, + "grad_norm": 0.153702512383461, + "learning_rate": 6.2848481505728254e-06, + "loss": 1.6762, + "step": 27483 + }, + { + "epoch": 8.43585021485574, + "grad_norm": 0.15967734158039093, + "learning_rate": 6.282435753911264e-06, + "loss": 1.6543, + "step": 27484 + }, + { + "epoch": 8.436157151626764, + "grad_norm": 0.18866287171840668, + "learning_rate": 6.280023789292322e-06, + "loss": 1.7481, + "step": 27485 + }, + { + "epoch": 8.43646408839779, + "grad_norm": 0.13347187638282776, + "learning_rate": 6.277612256739784e-06, + "loss": 1.6398, + "step": 27486 + }, + { + "epoch": 8.436771025168815, + "grad_norm": 0.1626890003681183, + "learning_rate": 6.275201156277521e-06, + "loss": 1.7258, + "step": 27487 + }, + { + "epoch": 8.43707796193984, + "grad_norm": 0.21519014239311218, + "learning_rate": 6.272790487929353e-06, + "loss": 1.7762, + "step": 27488 + }, + { + "epoch": 8.437384898710865, + "grad_norm": 0.1610138863325119, + "learning_rate": 6.2703802517190935e-06, + "loss": 1.6999, + "step": 27489 + }, + { + "epoch": 8.43769183548189, + "grad_norm": 0.20251847803592682, + "learning_rate": 6.267970447670579e-06, + "loss": 1.6953, + "step": 27490 + }, + { + "epoch": 8.437998772252916, + "grad_norm": 0.15717832744121552, + "learning_rate": 6.265561075807591e-06, + "loss": 1.623, + "step": 27491 + }, + { + "epoch": 8.438305709023942, + "grad_norm": 0.1399519294500351, + "learning_rate": 6.2631521361539716e-06, + "loss": 1.693, + "step": 27492 + }, + { + "epoch": 8.438612645794967, + "grad_norm": 0.17747904360294342, + "learning_rate": 6.260743628733517e-06, + "loss": 1.7019, + "step": 27493 + }, + { + "epoch": 8.438919582565992, + "grad_norm": 0.1724942922592163, + "learning_rate": 6.258335553570032e-06, + "loss": 1.6647, + 
"step": 27494 + }, + { + "epoch": 8.439226519337016, + "grad_norm": 0.15294337272644043, + "learning_rate": 6.255927910687315e-06, + "loss": 1.7492, + "step": 27495 + }, + { + "epoch": 8.439533456108041, + "grad_norm": 0.16880661249160767, + "learning_rate": 6.253520700109156e-06, + "loss": 1.731, + "step": 27496 + }, + { + "epoch": 8.439840392879066, + "grad_norm": 0.16098125278949738, + "learning_rate": 6.251113921859347e-06, + "loss": 1.6668, + "step": 27497 + }, + { + "epoch": 8.440147329650092, + "grad_norm": 0.17218537628650665, + "learning_rate": 6.248707575961671e-06, + "loss": 1.6943, + "step": 27498 + }, + { + "epoch": 8.440454266421117, + "grad_norm": 0.19593006372451782, + "learning_rate": 6.2463016624398965e-06, + "loss": 1.7213, + "step": 27499 + }, + { + "epoch": 8.440761203192142, + "grad_norm": 0.15833450853824615, + "learning_rate": 6.243896181317837e-06, + "loss": 1.6787, + "step": 27500 + }, + { + "epoch": 8.441068139963168, + "grad_norm": 0.1378611922264099, + "learning_rate": 6.241491132619226e-06, + "loss": 1.6777, + "step": 27501 + }, + { + "epoch": 8.441375076734193, + "grad_norm": 0.25010615587234497, + "learning_rate": 6.239086516367865e-06, + "loss": 1.7474, + "step": 27502 + }, + { + "epoch": 8.441682013505218, + "grad_norm": 0.1281466782093048, + "learning_rate": 6.236682332587474e-06, + "loss": 1.6946, + "step": 27503 + }, + { + "epoch": 8.441988950276244, + "grad_norm": 0.19045543670654297, + "learning_rate": 6.234278581301855e-06, + "loss": 1.7198, + "step": 27504 + }, + { + "epoch": 8.442295887047269, + "grad_norm": 0.17753495275974274, + "learning_rate": 6.231875262534748e-06, + "loss": 1.7324, + "step": 27505 + }, + { + "epoch": 8.442602823818293, + "grad_norm": 0.14088352024555206, + "learning_rate": 6.229472376309897e-06, + "loss": 1.6683, + "step": 27506 + }, + { + "epoch": 8.442909760589318, + "grad_norm": 0.16781100630760193, + "learning_rate": 6.2270699226510685e-06, + "loss": 1.7271, + "step": 27507 + }, + { + "epoch": 
8.443216697360343, + "grad_norm": 0.1857508271932602, + "learning_rate": 6.224667901581971e-06, + "loss": 1.7596, + "step": 27508 + }, + { + "epoch": 8.443523634131369, + "grad_norm": 0.18411888182163239, + "learning_rate": 6.222266313126374e-06, + "loss": 1.8193, + "step": 27509 + }, + { + "epoch": 8.443830570902394, + "grad_norm": 0.1530957967042923, + "learning_rate": 6.2198651573079965e-06, + "loss": 1.6958, + "step": 27510 + }, + { + "epoch": 8.44413750767342, + "grad_norm": 0.19102713465690613, + "learning_rate": 6.217464434150572e-06, + "loss": 1.7172, + "step": 27511 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 0.16886062920093536, + "learning_rate": 6.215064143677829e-06, + "loss": 1.6811, + "step": 27512 + }, + { + "epoch": 8.44475138121547, + "grad_norm": 0.15974819660186768, + "learning_rate": 6.212664285913483e-06, + "loss": 1.694, + "step": 27513 + }, + { + "epoch": 8.445058317986495, + "grad_norm": 0.19709718227386475, + "learning_rate": 6.2102648608812544e-06, + "loss": 1.7647, + "step": 27514 + }, + { + "epoch": 8.44536525475752, + "grad_norm": 0.15339697897434235, + "learning_rate": 6.207865868604857e-06, + "loss": 1.7169, + "step": 27515 + }, + { + "epoch": 8.445672191528546, + "grad_norm": 0.14088544249534607, + "learning_rate": 6.2054673091079815e-06, + "loss": 1.6902, + "step": 27516 + }, + { + "epoch": 8.44597912829957, + "grad_norm": 0.17412640154361725, + "learning_rate": 6.203069182414367e-06, + "loss": 1.7205, + "step": 27517 + }, + { + "epoch": 8.446286065070595, + "grad_norm": 0.18837641179561615, + "learning_rate": 6.200671488547677e-06, + "loss": 1.7756, + "step": 27518 + }, + { + "epoch": 8.44659300184162, + "grad_norm": 0.18904593586921692, + "learning_rate": 6.198274227531642e-06, + "loss": 1.732, + "step": 27519 + }, + { + "epoch": 8.446899938612646, + "grad_norm": 0.13136132061481476, + "learning_rate": 6.19587739938991e-06, + "loss": 1.6844, + "step": 27520 + }, + { + "epoch": 8.44720687538367, + "grad_norm": 
0.15678717195987701, + "learning_rate": 6.1934810041462066e-06, + "loss": 1.7029, + "step": 27521 + }, + { + "epoch": 8.447513812154696, + "grad_norm": 0.1661362200975418, + "learning_rate": 6.191085041824207e-06, + "loss": 1.6656, + "step": 27522 + }, + { + "epoch": 8.447820748925722, + "grad_norm": 0.1749318689107895, + "learning_rate": 6.188689512447565e-06, + "loss": 1.7412, + "step": 27523 + }, + { + "epoch": 8.448127685696747, + "grad_norm": 0.17242331802845, + "learning_rate": 6.18629441603999e-06, + "loss": 1.7037, + "step": 27524 + }, + { + "epoch": 8.448434622467772, + "grad_norm": 0.16092433035373688, + "learning_rate": 6.183899752625116e-06, + "loss": 1.6817, + "step": 27525 + }, + { + "epoch": 8.448741559238798, + "grad_norm": 0.16177381575107574, + "learning_rate": 6.1815055222266325e-06, + "loss": 1.6678, + "step": 27526 + }, + { + "epoch": 8.449048496009823, + "grad_norm": 0.1489405483007431, + "learning_rate": 6.179111724868197e-06, + "loss": 1.6839, + "step": 27527 + }, + { + "epoch": 8.449355432780846, + "grad_norm": 0.15873265266418457, + "learning_rate": 6.176718360573458e-06, + "loss": 1.6749, + "step": 27528 + }, + { + "epoch": 8.449662369551872, + "grad_norm": 0.17511235177516937, + "learning_rate": 6.174325429366079e-06, + "loss": 1.6962, + "step": 27529 + }, + { + "epoch": 8.449969306322897, + "grad_norm": 0.1452886015176773, + "learning_rate": 6.171932931269702e-06, + "loss": 1.7141, + "step": 27530 + }, + { + "epoch": 8.450276243093922, + "grad_norm": 0.20559509098529816, + "learning_rate": 6.169540866307977e-06, + "loss": 1.7116, + "step": 27531 + }, + { + "epoch": 8.450583179864948, + "grad_norm": 0.17642420530319214, + "learning_rate": 6.167149234504532e-06, + "loss": 1.7209, + "step": 27532 + }, + { + "epoch": 8.450890116635973, + "grad_norm": 0.13833492994308472, + "learning_rate": 6.164758035883001e-06, + "loss": 1.6522, + "step": 27533 + }, + { + "epoch": 8.451197053406998, + "grad_norm": 0.18079428374767303, + "learning_rate": 
6.162367270467045e-06, + "loss": 1.7348, + "step": 27534 + }, + { + "epoch": 8.451503990178024, + "grad_norm": 0.19325628876686096, + "learning_rate": 6.159976938280249e-06, + "loss": 1.6947, + "step": 27535 + }, + { + "epoch": 8.45181092694905, + "grad_norm": 0.17844507098197937, + "learning_rate": 6.15758703934628e-06, + "loss": 1.7206, + "step": 27536 + }, + { + "epoch": 8.452117863720074, + "grad_norm": 0.186324343085289, + "learning_rate": 6.155197573688703e-06, + "loss": 1.743, + "step": 27537 + }, + { + "epoch": 8.452424800491098, + "grad_norm": 0.15700562298297882, + "learning_rate": 6.152808541331184e-06, + "loss": 1.7109, + "step": 27538 + }, + { + "epoch": 8.452731737262123, + "grad_norm": 0.13879023492336273, + "learning_rate": 6.150419942297314e-06, + "loss": 1.6737, + "step": 27539 + }, + { + "epoch": 8.453038674033149, + "grad_norm": 0.14589501917362213, + "learning_rate": 6.148031776610675e-06, + "loss": 1.6884, + "step": 27540 + }, + { + "epoch": 8.453345610804174, + "grad_norm": 0.14402590692043304, + "learning_rate": 6.1456440442949125e-06, + "loss": 1.6949, + "step": 27541 + }, + { + "epoch": 8.4536525475752, + "grad_norm": 0.16506166756153107, + "learning_rate": 6.143256745373571e-06, + "loss": 1.725, + "step": 27542 + }, + { + "epoch": 8.453959484346225, + "grad_norm": 0.15663643181324005, + "learning_rate": 6.140869879870287e-06, + "loss": 1.7069, + "step": 27543 + }, + { + "epoch": 8.45426642111725, + "grad_norm": 0.16058720648288727, + "learning_rate": 6.138483447808635e-06, + "loss": 1.7264, + "step": 27544 + }, + { + "epoch": 8.454573357888275, + "grad_norm": 0.23160551488399506, + "learning_rate": 6.136097449212197e-06, + "loss": 1.7573, + "step": 27545 + }, + { + "epoch": 8.4548802946593, + "grad_norm": 0.15130533277988434, + "learning_rate": 6.133711884104554e-06, + "loss": 1.705, + "step": 27546 + }, + { + "epoch": 8.455187231430326, + "grad_norm": 0.16825515031814575, + "learning_rate": 6.131326752509281e-06, + "loss": 1.7405, + 
"step": 27547 + }, + { + "epoch": 8.455494168201351, + "grad_norm": 0.19265486299991608, + "learning_rate": 6.128942054449943e-06, + "loss": 1.7026, + "step": 27548 + }, + { + "epoch": 8.455801104972375, + "grad_norm": 0.18873640894889832, + "learning_rate": 6.126557789950121e-06, + "loss": 1.6825, + "step": 27549 + }, + { + "epoch": 8.4561080417434, + "grad_norm": 0.13833044469356537, + "learning_rate": 6.124173959033358e-06, + "loss": 1.6589, + "step": 27550 + }, + { + "epoch": 8.456414978514426, + "grad_norm": 0.16894219815731049, + "learning_rate": 6.1217905617232394e-06, + "loss": 1.7781, + "step": 27551 + }, + { + "epoch": 8.456721915285451, + "grad_norm": 0.18338344991207123, + "learning_rate": 6.119407598043292e-06, + "loss": 1.7348, + "step": 27552 + }, + { + "epoch": 8.457028852056476, + "grad_norm": 0.17766039073467255, + "learning_rate": 6.117025068017096e-06, + "loss": 1.7126, + "step": 27553 + }, + { + "epoch": 8.457335788827502, + "grad_norm": 0.18717309832572937, + "learning_rate": 6.114642971668155e-06, + "loss": 1.7193, + "step": 27554 + }, + { + "epoch": 8.457642725598527, + "grad_norm": 0.15229196846485138, + "learning_rate": 6.112261309020045e-06, + "loss": 1.665, + "step": 27555 + }, + { + "epoch": 8.457949662369552, + "grad_norm": 0.15391093492507935, + "learning_rate": 6.109880080096303e-06, + "loss": 1.6813, + "step": 27556 + }, + { + "epoch": 8.458256599140578, + "grad_norm": 0.1363036334514618, + "learning_rate": 6.107499284920432e-06, + "loss": 1.6912, + "step": 27557 + }, + { + "epoch": 8.458563535911603, + "grad_norm": 0.15193909406661987, + "learning_rate": 6.105118923516001e-06, + "loss": 1.7219, + "step": 27558 + }, + { + "epoch": 8.458870472682626, + "grad_norm": 0.1312003880739212, + "learning_rate": 6.102738995906487e-06, + "loss": 1.7317, + "step": 27559 + }, + { + "epoch": 8.459177409453652, + "grad_norm": 0.12835659086704254, + "learning_rate": 6.100359502115449e-06, + "loss": 1.6556, + "step": 27560 + }, + { + "epoch": 
8.459484346224677, + "grad_norm": 0.17296236753463745, + "learning_rate": 6.09798044216639e-06, + "loss": 1.7331, + "step": 27561 + }, + { + "epoch": 8.459791282995702, + "grad_norm": 0.1607210338115692, + "learning_rate": 6.095601816082819e-06, + "loss": 1.7297, + "step": 27562 + }, + { + "epoch": 8.460098219766728, + "grad_norm": 0.1841181367635727, + "learning_rate": 6.093223623888245e-06, + "loss": 1.7382, + "step": 27563 + }, + { + "epoch": 8.460405156537753, + "grad_norm": 0.15751226246356964, + "learning_rate": 6.090845865606165e-06, + "loss": 1.6952, + "step": 27564 + }, + { + "epoch": 8.460712093308778, + "grad_norm": 0.15703023970127106, + "learning_rate": 6.0884685412600835e-06, + "loss": 1.7476, + "step": 27565 + }, + { + "epoch": 8.461019030079804, + "grad_norm": 0.17819096148014069, + "learning_rate": 6.0860916508734985e-06, + "loss": 1.7761, + "step": 27566 + }, + { + "epoch": 8.46132596685083, + "grad_norm": 0.168768510222435, + "learning_rate": 6.08371519446988e-06, + "loss": 1.7534, + "step": 27567 + }, + { + "epoch": 8.461632903621854, + "grad_norm": 0.1577196717262268, + "learning_rate": 6.081339172072747e-06, + "loss": 1.6533, + "step": 27568 + }, + { + "epoch": 8.46193984039288, + "grad_norm": 0.19285355508327484, + "learning_rate": 6.078963583705544e-06, + "loss": 1.7127, + "step": 27569 + }, + { + "epoch": 8.462246777163903, + "grad_norm": 0.15905390679836273, + "learning_rate": 6.076588429391788e-06, + "loss": 1.6851, + "step": 27570 + }, + { + "epoch": 8.462553713934929, + "grad_norm": 0.14860354363918304, + "learning_rate": 6.074213709154908e-06, + "loss": 1.7016, + "step": 27571 + }, + { + "epoch": 8.462860650705954, + "grad_norm": 0.2003553956747055, + "learning_rate": 6.0718394230184e-06, + "loss": 1.819, + "step": 27572 + }, + { + "epoch": 8.46316758747698, + "grad_norm": 0.1739475131034851, + "learning_rate": 6.069465571005733e-06, + "loss": 1.7539, + "step": 27573 + }, + { + "epoch": 8.463474524248005, + "grad_norm": 
0.20145776867866516, + "learning_rate": 6.067092153140341e-06, + "loss": 1.7472, + "step": 27574 + }, + { + "epoch": 8.46378146101903, + "grad_norm": 0.2065812349319458, + "learning_rate": 6.06471916944571e-06, + "loss": 1.7871, + "step": 27575 + }, + { + "epoch": 8.464088397790055, + "grad_norm": 0.16987882554531097, + "learning_rate": 6.0623466199452585e-06, + "loss": 1.7299, + "step": 27576 + }, + { + "epoch": 8.46439533456108, + "grad_norm": 0.1477213054895401, + "learning_rate": 6.059974504662458e-06, + "loss": 1.6829, + "step": 27577 + }, + { + "epoch": 8.464702271332106, + "grad_norm": 0.16443482041358948, + "learning_rate": 6.05760282362074e-06, + "loss": 1.7352, + "step": 27578 + }, + { + "epoch": 8.465009208103131, + "grad_norm": 0.15927115082740784, + "learning_rate": 6.055231576843551e-06, + "loss": 1.7175, + "step": 27579 + }, + { + "epoch": 8.465316144874157, + "grad_norm": 0.17477387189865112, + "learning_rate": 6.052860764354318e-06, + "loss": 1.6609, + "step": 27580 + }, + { + "epoch": 8.46562308164518, + "grad_norm": 0.22039631009101868, + "learning_rate": 6.050490386176477e-06, + "loss": 1.7664, + "step": 27581 + }, + { + "epoch": 8.465930018416206, + "grad_norm": 0.1699618101119995, + "learning_rate": 6.048120442333449e-06, + "loss": 1.7231, + "step": 27582 + }, + { + "epoch": 8.466236955187231, + "grad_norm": 0.1548585742712021, + "learning_rate": 6.045750932848654e-06, + "loss": 1.7503, + "step": 27583 + }, + { + "epoch": 8.466543891958256, + "grad_norm": 0.17046836018562317, + "learning_rate": 6.043381857745506e-06, + "loss": 1.6993, + "step": 27584 + }, + { + "epoch": 8.466850828729282, + "grad_norm": 0.1857844740152359, + "learning_rate": 6.041013217047431e-06, + "loss": 1.7132, + "step": 27585 + }, + { + "epoch": 8.467157765500307, + "grad_norm": 0.15656128525733948, + "learning_rate": 6.0386450107778105e-06, + "loss": 1.6713, + "step": 27586 + }, + { + "epoch": 8.467464702271332, + "grad_norm": 0.20369650423526764, + "learning_rate": 
6.036277238960092e-06, + "loss": 1.7296, + "step": 27587 + }, + { + "epoch": 8.467771639042358, + "grad_norm": 0.15926989912986755, + "learning_rate": 6.0339099016176295e-06, + "loss": 1.6766, + "step": 27588 + }, + { + "epoch": 8.468078575813383, + "grad_norm": 0.16353332996368408, + "learning_rate": 6.0315429987738596e-06, + "loss": 1.7084, + "step": 27589 + }, + { + "epoch": 8.468385512584408, + "grad_norm": 0.16328907012939453, + "learning_rate": 6.029176530452141e-06, + "loss": 1.715, + "step": 27590 + }, + { + "epoch": 8.468692449355434, + "grad_norm": 0.20153367519378662, + "learning_rate": 6.026810496675861e-06, + "loss": 1.7363, + "step": 27591 + }, + { + "epoch": 8.468999386126457, + "grad_norm": 0.1374381184577942, + "learning_rate": 6.024444897468435e-06, + "loss": 1.6633, + "step": 27592 + }, + { + "epoch": 8.469306322897483, + "grad_norm": 0.20331406593322754, + "learning_rate": 6.022079732853198e-06, + "loss": 1.7544, + "step": 27593 + }, + { + "epoch": 8.469613259668508, + "grad_norm": 0.18052712082862854, + "learning_rate": 6.019715002853554e-06, + "loss": 1.7032, + "step": 27594 + }, + { + "epoch": 8.469920196439533, + "grad_norm": 0.18305034935474396, + "learning_rate": 6.017350707492863e-06, + "loss": 1.7249, + "step": 27595 + }, + { + "epoch": 8.470227133210559, + "grad_norm": 0.1608239710330963, + "learning_rate": 6.014986846794496e-06, + "loss": 1.7049, + "step": 27596 + }, + { + "epoch": 8.470534069981584, + "grad_norm": 0.16582928597927094, + "learning_rate": 6.012623420781804e-06, + "loss": 1.6777, + "step": 27597 + }, + { + "epoch": 8.47084100675261, + "grad_norm": 0.18023556470870972, + "learning_rate": 6.010260429478154e-06, + "loss": 1.6996, + "step": 27598 + }, + { + "epoch": 8.471147943523635, + "grad_norm": 0.1994815319776535, + "learning_rate": 6.007897872906892e-06, + "loss": 1.7455, + "step": 27599 + }, + { + "epoch": 8.47145488029466, + "grad_norm": 0.17772625386714935, + "learning_rate": 6.005535751091368e-06, + "loss": 1.7431, 
+ "step": 27600 + }, + { + "epoch": 8.471761817065685, + "grad_norm": 0.17297807335853577, + "learning_rate": 6.003174064054929e-06, + "loss": 1.7087, + "step": 27601 + }, + { + "epoch": 8.472068753836709, + "grad_norm": 0.14986321330070496, + "learning_rate": 6.000812811820905e-06, + "loss": 1.681, + "step": 27602 + }, + { + "epoch": 8.472375690607734, + "grad_norm": 0.17512932419776917, + "learning_rate": 5.998451994412629e-06, + "loss": 1.7669, + "step": 27603 + }, + { + "epoch": 8.47268262737876, + "grad_norm": 0.18424493074417114, + "learning_rate": 5.996091611853466e-06, + "loss": 1.7296, + "step": 27604 + }, + { + "epoch": 8.472989564149785, + "grad_norm": 0.1246834322810173, + "learning_rate": 5.9937316641666906e-06, + "loss": 1.6747, + "step": 27605 + }, + { + "epoch": 8.47329650092081, + "grad_norm": 0.14435335993766785, + "learning_rate": 5.991372151375674e-06, + "loss": 1.6225, + "step": 27606 + }, + { + "epoch": 8.473603437691835, + "grad_norm": 0.16726957261562347, + "learning_rate": 5.989013073503702e-06, + "loss": 1.7052, + "step": 27607 + }, + { + "epoch": 8.47391037446286, + "grad_norm": 0.15307356417179108, + "learning_rate": 5.98665443057409e-06, + "loss": 1.7199, + "step": 27608 + }, + { + "epoch": 8.474217311233886, + "grad_norm": 0.14373189210891724, + "learning_rate": 5.984296222610175e-06, + "loss": 1.6808, + "step": 27609 + }, + { + "epoch": 8.474524248004911, + "grad_norm": 0.13142740726470947, + "learning_rate": 5.981938449635222e-06, + "loss": 1.6868, + "step": 27610 + }, + { + "epoch": 8.474831184775937, + "grad_norm": 0.13838545978069305, + "learning_rate": 5.979581111672572e-06, + "loss": 1.6723, + "step": 27611 + }, + { + "epoch": 8.475138121546962, + "grad_norm": 0.15346096456050873, + "learning_rate": 5.977224208745485e-06, + "loss": 1.7066, + "step": 27612 + }, + { + "epoch": 8.475445058317986, + "grad_norm": 0.127261221408844, + "learning_rate": 5.974867740877283e-06, + "loss": 1.6285, + "step": 27613 + }, + { + "epoch": 
8.475751995089011, + "grad_norm": 0.12636838853359222, + "learning_rate": 5.972511708091239e-06, + "loss": 1.6707, + "step": 27614 + }, + { + "epoch": 8.476058931860036, + "grad_norm": 0.22297553718090057, + "learning_rate": 5.970156110410641e-06, + "loss": 1.693, + "step": 27615 + }, + { + "epoch": 8.476365868631062, + "grad_norm": 0.21933813393115997, + "learning_rate": 5.967800947858765e-06, + "loss": 1.7622, + "step": 27616 + }, + { + "epoch": 8.476672805402087, + "grad_norm": 0.19202767312526703, + "learning_rate": 5.965446220458887e-06, + "loss": 1.723, + "step": 27617 + }, + { + "epoch": 8.476979742173112, + "grad_norm": 0.13845433294773102, + "learning_rate": 5.963091928234283e-06, + "loss": 1.6824, + "step": 27618 + }, + { + "epoch": 8.477286678944138, + "grad_norm": 0.1829427033662796, + "learning_rate": 5.960738071208211e-06, + "loss": 1.7441, + "step": 27619 + }, + { + "epoch": 8.477593615715163, + "grad_norm": 0.17720428109169006, + "learning_rate": 5.958384649403931e-06, + "loss": 1.7108, + "step": 27620 + }, + { + "epoch": 8.477900552486188, + "grad_norm": 0.12632785737514496, + "learning_rate": 5.95603166284473e-06, + "loss": 1.6762, + "step": 27621 + }, + { + "epoch": 8.478207489257214, + "grad_norm": 0.15774594247341156, + "learning_rate": 5.953679111553812e-06, + "loss": 1.7076, + "step": 27622 + }, + { + "epoch": 8.478514426028239, + "grad_norm": 0.16115643084049225, + "learning_rate": 5.9513269955544795e-06, + "loss": 1.757, + "step": 27623 + }, + { + "epoch": 8.478821362799263, + "grad_norm": 0.13887029886245728, + "learning_rate": 5.948975314869937e-06, + "loss": 1.7462, + "step": 27624 + }, + { + "epoch": 8.479128299570288, + "grad_norm": 0.1517426073551178, + "learning_rate": 5.946624069523432e-06, + "loss": 1.6912, + "step": 27625 + }, + { + "epoch": 8.479435236341313, + "grad_norm": 0.15509237349033356, + "learning_rate": 5.94427325953823e-06, + "loss": 1.7022, + "step": 27626 + }, + { + "epoch": 8.479742173112339, + "grad_norm": 
0.1656811237335205, + "learning_rate": 5.9419228849375175e-06, + "loss": 1.713, + "step": 27627 + }, + { + "epoch": 8.480049109883364, + "grad_norm": 0.2257215678691864, + "learning_rate": 5.93957294574457e-06, + "loss": 1.7452, + "step": 27628 + }, + { + "epoch": 8.48035604665439, + "grad_norm": 0.15382499992847443, + "learning_rate": 5.9372234419825645e-06, + "loss": 1.7056, + "step": 27629 + }, + { + "epoch": 8.480662983425415, + "grad_norm": 0.1773097813129425, + "learning_rate": 5.934874373674754e-06, + "loss": 1.7161, + "step": 27630 + }, + { + "epoch": 8.48096992019644, + "grad_norm": 0.16455380618572235, + "learning_rate": 5.932525740844341e-06, + "loss": 1.7454, + "step": 27631 + }, + { + "epoch": 8.481276856967465, + "grad_norm": 0.15213815867900848, + "learning_rate": 5.930177543514542e-06, + "loss": 1.7049, + "step": 27632 + }, + { + "epoch": 8.48158379373849, + "grad_norm": 0.17395392060279846, + "learning_rate": 5.927829781708555e-06, + "loss": 1.7026, + "step": 27633 + }, + { + "epoch": 8.481890730509516, + "grad_norm": 0.18553678691387177, + "learning_rate": 5.925482455449588e-06, + "loss": 1.7437, + "step": 27634 + }, + { + "epoch": 8.48219766728054, + "grad_norm": 0.15735404193401337, + "learning_rate": 5.9231355647608346e-06, + "loss": 1.7171, + "step": 27635 + }, + { + "epoch": 8.482504604051565, + "grad_norm": 0.14466318488121033, + "learning_rate": 5.920789109665487e-06, + "loss": 1.6698, + "step": 27636 + }, + { + "epoch": 8.48281154082259, + "grad_norm": 0.159750834107399, + "learning_rate": 5.918443090186732e-06, + "loss": 1.7045, + "step": 27637 + }, + { + "epoch": 8.483118477593615, + "grad_norm": 0.14026959240436554, + "learning_rate": 5.916097506347773e-06, + "loss": 1.6751, + "step": 27638 + }, + { + "epoch": 8.48342541436464, + "grad_norm": 0.18119752407073975, + "learning_rate": 5.913752358171765e-06, + "loss": 1.7768, + "step": 27639 + }, + { + "epoch": 8.483732351135666, + "grad_norm": 0.20957626402378082, + "learning_rate": 
5.91140764568191e-06, + "loss": 1.72, + "step": 27640 + }, + { + "epoch": 8.484039287906691, + "grad_norm": 0.1649177372455597, + "learning_rate": 5.909063368901357e-06, + "loss": 1.6938, + "step": 27641 + }, + { + "epoch": 8.484346224677717, + "grad_norm": 0.17464084923267365, + "learning_rate": 5.906719527853271e-06, + "loss": 1.7369, + "step": 27642 + }, + { + "epoch": 8.484653161448742, + "grad_norm": 0.14213840663433075, + "learning_rate": 5.90437612256085e-06, + "loss": 1.6985, + "step": 27643 + }, + { + "epoch": 8.484960098219767, + "grad_norm": 0.2008642852306366, + "learning_rate": 5.902033153047209e-06, + "loss": 1.7394, + "step": 27644 + }, + { + "epoch": 8.485267034990791, + "grad_norm": 0.15051651000976562, + "learning_rate": 5.899690619335541e-06, + "loss": 1.6729, + "step": 27645 + }, + { + "epoch": 8.485573971761816, + "grad_norm": 0.17977653443813324, + "learning_rate": 5.897348521448958e-06, + "loss": 1.7501, + "step": 27646 + }, + { + "epoch": 8.485880908532842, + "grad_norm": 0.2593468427658081, + "learning_rate": 5.89500685941064e-06, + "loss": 1.7174, + "step": 27647 + }, + { + "epoch": 8.486187845303867, + "grad_norm": 0.23924550414085388, + "learning_rate": 5.8926656332437105e-06, + "loss": 1.7383, + "step": 27648 + }, + { + "epoch": 8.486494782074892, + "grad_norm": 0.1751977503299713, + "learning_rate": 5.8903248429713124e-06, + "loss": 1.7024, + "step": 27649 + }, + { + "epoch": 8.486801718845918, + "grad_norm": 0.21737132966518402, + "learning_rate": 5.887984488616582e-06, + "loss": 1.7214, + "step": 27650 + }, + { + "epoch": 8.487108655616943, + "grad_norm": 0.2042747437953949, + "learning_rate": 5.885644570202636e-06, + "loss": 1.7126, + "step": 27651 + }, + { + "epoch": 8.487415592387968, + "grad_norm": 0.14556188881397247, + "learning_rate": 5.883305087752611e-06, + "loss": 1.6919, + "step": 27652 + }, + { + "epoch": 8.487722529158994, + "grad_norm": 0.210098534822464, + "learning_rate": 5.880966041289626e-06, + "loss": 1.6728, + 
"step": 27653 + }, + { + "epoch": 8.488029465930019, + "grad_norm": 0.26891016960144043, + "learning_rate": 5.878627430836781e-06, + "loss": 1.7356, + "step": 27654 + }, + { + "epoch": 8.488336402701044, + "grad_norm": 0.13008984923362732, + "learning_rate": 5.876289256417217e-06, + "loss": 1.6685, + "step": 27655 + }, + { + "epoch": 8.488643339472068, + "grad_norm": 0.2077993005514145, + "learning_rate": 5.873951518054005e-06, + "loss": 1.6983, + "step": 27656 + }, + { + "epoch": 8.488950276243093, + "grad_norm": 0.19198927283287048, + "learning_rate": 5.871614215770294e-06, + "loss": 1.6703, + "step": 27657 + }, + { + "epoch": 8.489257213014119, + "grad_norm": 0.18122628331184387, + "learning_rate": 5.869277349589137e-06, + "loss": 1.8012, + "step": 27658 + }, + { + "epoch": 8.489564149785144, + "grad_norm": 0.2359529435634613, + "learning_rate": 5.866940919533642e-06, + "loss": 1.7194, + "step": 27659 + }, + { + "epoch": 8.48987108655617, + "grad_norm": 0.15916365385055542, + "learning_rate": 5.864604925626921e-06, + "loss": 1.6929, + "step": 27660 + }, + { + "epoch": 8.490178023327195, + "grad_norm": 0.16607709228992462, + "learning_rate": 5.862269367892026e-06, + "loss": 1.7001, + "step": 27661 + }, + { + "epoch": 8.49048496009822, + "grad_norm": 0.17609505355358124, + "learning_rate": 5.859934246352072e-06, + "loss": 1.736, + "step": 27662 + }, + { + "epoch": 8.490791896869245, + "grad_norm": 0.17898498475551605, + "learning_rate": 5.857599561030103e-06, + "loss": 1.7397, + "step": 27663 + }, + { + "epoch": 8.49109883364027, + "grad_norm": 0.17502975463867188, + "learning_rate": 5.855265311949215e-06, + "loss": 1.6874, + "step": 27664 + }, + { + "epoch": 8.491405770411296, + "grad_norm": 0.16041016578674316, + "learning_rate": 5.852931499132469e-06, + "loss": 1.7494, + "step": 27665 + }, + { + "epoch": 8.491712707182321, + "grad_norm": 0.12939618527889252, + "learning_rate": 5.850598122602929e-06, + "loss": 1.6397, + "step": 27666 + }, + { + "epoch": 
8.492019643953345, + "grad_norm": 0.1685323715209961, + "learning_rate": 5.848265182383656e-06, + "loss": 1.7465, + "step": 27667 + }, + { + "epoch": 8.49232658072437, + "grad_norm": 0.14007940888404846, + "learning_rate": 5.845932678497707e-06, + "loss": 1.6718, + "step": 27668 + }, + { + "epoch": 8.492633517495396, + "grad_norm": 0.14807704091072083, + "learning_rate": 5.843600610968125e-06, + "loss": 1.6858, + "step": 27669 + }, + { + "epoch": 8.49294045426642, + "grad_norm": 0.14770758152008057, + "learning_rate": 5.841268979817965e-06, + "loss": 1.6655, + "step": 27670 + }, + { + "epoch": 8.493247391037446, + "grad_norm": 0.13218273222446442, + "learning_rate": 5.838937785070258e-06, + "loss": 1.7132, + "step": 27671 + }, + { + "epoch": 8.493554327808472, + "grad_norm": 0.1349583864212036, + "learning_rate": 5.836607026748076e-06, + "loss": 1.6704, + "step": 27672 + }, + { + "epoch": 8.493861264579497, + "grad_norm": 0.22880202531814575, + "learning_rate": 5.834276704874403e-06, + "loss": 1.7297, + "step": 27673 + }, + { + "epoch": 8.494168201350522, + "grad_norm": 0.17375829815864563, + "learning_rate": 5.831946819472317e-06, + "loss": 1.6857, + "step": 27674 + }, + { + "epoch": 8.494475138121548, + "grad_norm": 0.15201902389526367, + "learning_rate": 5.829617370564805e-06, + "loss": 1.7148, + "step": 27675 + }, + { + "epoch": 8.494782074892573, + "grad_norm": 0.1489444226026535, + "learning_rate": 5.827288358174898e-06, + "loss": 1.7477, + "step": 27676 + }, + { + "epoch": 8.495089011663598, + "grad_norm": 0.1331137716770172, + "learning_rate": 5.824959782325634e-06, + "loss": 1.7282, + "step": 27677 + }, + { + "epoch": 8.495395948434622, + "grad_norm": 0.1779918074607849, + "learning_rate": 5.822631643039994e-06, + "loss": 1.6677, + "step": 27678 + }, + { + "epoch": 8.495702885205647, + "grad_norm": 0.17707432806491852, + "learning_rate": 5.820303940341021e-06, + "loss": 1.7627, + "step": 27679 + }, + { + "epoch": 8.496009821976672, + "grad_norm": 
0.19686660170555115, + "learning_rate": 5.817976674251674e-06, + "loss": 1.8057, + "step": 27680 + }, + { + "epoch": 8.496316758747698, + "grad_norm": 0.17378473281860352, + "learning_rate": 5.81564984479499e-06, + "loss": 1.763, + "step": 27681 + }, + { + "epoch": 8.496623695518723, + "grad_norm": 0.13753214478492737, + "learning_rate": 5.813323451993952e-06, + "loss": 1.6567, + "step": 27682 + }, + { + "epoch": 8.496930632289748, + "grad_norm": 0.19319739937782288, + "learning_rate": 5.810997495871551e-06, + "loss": 1.7447, + "step": 27683 + }, + { + "epoch": 8.497237569060774, + "grad_norm": 0.1459372490644455, + "learning_rate": 5.808671976450775e-06, + "loss": 1.6978, + "step": 27684 + }, + { + "epoch": 8.497544505831799, + "grad_norm": 0.1829099804162979, + "learning_rate": 5.806346893754599e-06, + "loss": 1.7399, + "step": 27685 + }, + { + "epoch": 8.497851442602824, + "grad_norm": 0.14952246844768524, + "learning_rate": 5.804022247806007e-06, + "loss": 1.683, + "step": 27686 + }, + { + "epoch": 8.49815837937385, + "grad_norm": 0.14325882494449615, + "learning_rate": 5.801698038627973e-06, + "loss": 1.689, + "step": 27687 + }, + { + "epoch": 8.498465316144873, + "grad_norm": 0.17999286949634552, + "learning_rate": 5.799374266243451e-06, + "loss": 1.7358, + "step": 27688 + }, + { + "epoch": 8.498772252915899, + "grad_norm": 0.17262579500675201, + "learning_rate": 5.797050930675441e-06, + "loss": 1.7249, + "step": 27689 + }, + { + "epoch": 8.499079189686924, + "grad_norm": 0.17032817006111145, + "learning_rate": 5.794728031946861e-06, + "loss": 1.7124, + "step": 27690 + }, + { + "epoch": 8.49938612645795, + "grad_norm": 0.16629208624362946, + "learning_rate": 5.7924055700807115e-06, + "loss": 1.6981, + "step": 27691 + }, + { + "epoch": 8.499693063228975, + "grad_norm": 0.19601507484912872, + "learning_rate": 5.7900835450999115e-06, + "loss": 1.6582, + "step": 27692 + }, + { + "epoch": 8.5, + "grad_norm": 0.2122369408607483, + "learning_rate": 
5.787761957027405e-06, + "loss": 1.7509, + "step": 27693 + }, + { + "epoch": 8.500306936771025, + "grad_norm": 0.16086016595363617, + "learning_rate": 5.785440805886166e-06, + "loss": 1.7011, + "step": 27694 + }, + { + "epoch": 8.50061387354205, + "grad_norm": 0.15793873369693756, + "learning_rate": 5.783120091699101e-06, + "loss": 1.6879, + "step": 27695 + }, + { + "epoch": 8.500920810313076, + "grad_norm": 0.15392783284187317, + "learning_rate": 5.7807998144891735e-06, + "loss": 1.6973, + "step": 27696 + }, + { + "epoch": 8.501227747084101, + "grad_norm": 0.17782802879810333, + "learning_rate": 5.778479974279288e-06, + "loss": 1.7319, + "step": 27697 + }, + { + "epoch": 8.501534683855127, + "grad_norm": 0.139020636677742, + "learning_rate": 5.776160571092387e-06, + "loss": 1.6655, + "step": 27698 + }, + { + "epoch": 8.50184162062615, + "grad_norm": 0.1582586020231247, + "learning_rate": 5.773841604951391e-06, + "loss": 1.7134, + "step": 27699 + }, + { + "epoch": 8.502148557397176, + "grad_norm": 0.1685703545808792, + "learning_rate": 5.77152307587921e-06, + "loss": 1.7504, + "step": 27700 + }, + { + "epoch": 8.502455494168201, + "grad_norm": 0.15043340623378754, + "learning_rate": 5.769204983898763e-06, + "loss": 1.6837, + "step": 27701 + }, + { + "epoch": 8.502762430939226, + "grad_norm": 0.18134978413581848, + "learning_rate": 5.7668873290329605e-06, + "loss": 1.7698, + "step": 27702 + }, + { + "epoch": 8.503069367710252, + "grad_norm": 0.18589314818382263, + "learning_rate": 5.764570111304696e-06, + "loss": 1.7565, + "step": 27703 + }, + { + "epoch": 8.503376304481277, + "grad_norm": 0.17075087130069733, + "learning_rate": 5.762253330736883e-06, + "loss": 1.6888, + "step": 27704 + }, + { + "epoch": 8.503683241252302, + "grad_norm": 0.13238663971424103, + "learning_rate": 5.759936987352399e-06, + "loss": 1.6708, + "step": 27705 + }, + { + "epoch": 8.503990178023328, + "grad_norm": 0.1714777648448944, + "learning_rate": 5.75762108117417e-06, + "loss": 1.6934, + 
"step": 27706 + }, + { + "epoch": 8.504297114794353, + "grad_norm": 0.13476133346557617, + "learning_rate": 5.755305612225037e-06, + "loss": 1.707, + "step": 27707 + }, + { + "epoch": 8.504604051565378, + "grad_norm": 0.1355150043964386, + "learning_rate": 5.7529905805279285e-06, + "loss": 1.695, + "step": 27708 + }, + { + "epoch": 8.504910988336402, + "grad_norm": 0.15239351987838745, + "learning_rate": 5.750675986105686e-06, + "loss": 1.7146, + "step": 27709 + }, + { + "epoch": 8.505217925107427, + "grad_norm": 0.1348891258239746, + "learning_rate": 5.748361828981197e-06, + "loss": 1.7087, + "step": 27710 + }, + { + "epoch": 8.505524861878452, + "grad_norm": 0.1657278686761856, + "learning_rate": 5.746048109177349e-06, + "loss": 1.7222, + "step": 27711 + }, + { + "epoch": 8.505831798649478, + "grad_norm": 0.17044055461883545, + "learning_rate": 5.743734826716967e-06, + "loss": 1.7917, + "step": 27712 + }, + { + "epoch": 8.506138735420503, + "grad_norm": 0.13258327543735504, + "learning_rate": 5.741421981622963e-06, + "loss": 1.6859, + "step": 27713 + }, + { + "epoch": 8.506445672191528, + "grad_norm": 0.13243085145950317, + "learning_rate": 5.7391095739181495e-06, + "loss": 1.6832, + "step": 27714 + }, + { + "epoch": 8.506752608962554, + "grad_norm": 0.14863869547843933, + "learning_rate": 5.736797603625405e-06, + "loss": 1.6961, + "step": 27715 + }, + { + "epoch": 8.50705954573358, + "grad_norm": 0.13942895829677582, + "learning_rate": 5.73448607076757e-06, + "loss": 1.6847, + "step": 27716 + }, + { + "epoch": 8.507366482504604, + "grad_norm": 0.13684460520744324, + "learning_rate": 5.732174975367482e-06, + "loss": 1.6888, + "step": 27717 + }, + { + "epoch": 8.50767341927563, + "grad_norm": 0.1887209117412567, + "learning_rate": 5.7298643174479974e-06, + "loss": 1.7091, + "step": 27718 + }, + { + "epoch": 8.507980356046655, + "grad_norm": 0.17502547800540924, + "learning_rate": 5.727554097031934e-06, + "loss": 1.7103, + "step": 27719 + }, + { + "epoch": 
8.50828729281768, + "grad_norm": 0.17275308072566986, + "learning_rate": 5.725244314142137e-06, + "loss": 1.7392, + "step": 27720 + }, + { + "epoch": 8.508594229588704, + "grad_norm": 0.13890086114406586, + "learning_rate": 5.722934968801419e-06, + "loss": 1.6711, + "step": 27721 + }, + { + "epoch": 8.50890116635973, + "grad_norm": 0.16987508535385132, + "learning_rate": 5.720626061032603e-06, + "loss": 1.6784, + "step": 27722 + }, + { + "epoch": 8.509208103130755, + "grad_norm": 0.12734577059745789, + "learning_rate": 5.718317590858529e-06, + "loss": 1.668, + "step": 27723 + }, + { + "epoch": 8.50951503990178, + "grad_norm": 0.17097610235214233, + "learning_rate": 5.716009558301977e-06, + "loss": 1.7419, + "step": 27724 + }, + { + "epoch": 8.509821976672805, + "grad_norm": 0.15415556728839874, + "learning_rate": 5.713701963385798e-06, + "loss": 1.6794, + "step": 27725 + }, + { + "epoch": 8.51012891344383, + "grad_norm": 0.115156389772892, + "learning_rate": 5.711394806132758e-06, + "loss": 1.6364, + "step": 27726 + }, + { + "epoch": 8.510435850214856, + "grad_norm": 0.1583303064107895, + "learning_rate": 5.709088086565667e-06, + "loss": 1.7185, + "step": 27727 + }, + { + "epoch": 8.510742786985881, + "grad_norm": 0.17150144279003143, + "learning_rate": 5.706781804707345e-06, + "loss": 1.7122, + "step": 27728 + }, + { + "epoch": 8.511049723756907, + "grad_norm": 0.14469772577285767, + "learning_rate": 5.7044759605805464e-06, + "loss": 1.6806, + "step": 27729 + }, + { + "epoch": 8.511356660527932, + "grad_norm": 0.1671745926141739, + "learning_rate": 5.702170554208102e-06, + "loss": 1.7051, + "step": 27730 + }, + { + "epoch": 8.511663597298956, + "grad_norm": 0.14769956469535828, + "learning_rate": 5.699865585612746e-06, + "loss": 1.7052, + "step": 27731 + }, + { + "epoch": 8.511970534069981, + "grad_norm": 0.17527055740356445, + "learning_rate": 5.697561054817296e-06, + "loss": 1.7397, + "step": 27732 + }, + { + "epoch": 8.512277470841006, + "grad_norm": 
0.16712914407253265, + "learning_rate": 5.695256961844519e-06, + "loss": 1.7025, + "step": 27733 + }, + { + "epoch": 8.512584407612032, + "grad_norm": 0.14546720683574677, + "learning_rate": 5.6929533067171745e-06, + "loss": 1.667, + "step": 27734 + }, + { + "epoch": 8.512891344383057, + "grad_norm": 0.1326368749141693, + "learning_rate": 5.690650089458038e-06, + "loss": 1.7109, + "step": 27735 + }, + { + "epoch": 8.513198281154082, + "grad_norm": 0.14168506860733032, + "learning_rate": 5.688347310089864e-06, + "loss": 1.6497, + "step": 27736 + }, + { + "epoch": 8.513505217925108, + "grad_norm": 0.18198592960834503, + "learning_rate": 5.686044968635418e-06, + "loss": 1.7167, + "step": 27737 + }, + { + "epoch": 8.513812154696133, + "grad_norm": 0.14291147887706757, + "learning_rate": 5.683743065117447e-06, + "loss": 1.6855, + "step": 27738 + }, + { + "epoch": 8.514119091467158, + "grad_norm": 0.17336830496788025, + "learning_rate": 5.681441599558701e-06, + "loss": 1.738, + "step": 27739 + }, + { + "epoch": 8.514426028238184, + "grad_norm": 0.1447203904390335, + "learning_rate": 5.679140571981922e-06, + "loss": 1.7217, + "step": 27740 + }, + { + "epoch": 8.514732965009209, + "grad_norm": 0.19665221869945526, + "learning_rate": 5.676839982409849e-06, + "loss": 1.7395, + "step": 27741 + }, + { + "epoch": 8.515039901780233, + "grad_norm": 0.1405279040336609, + "learning_rate": 5.6745398308652386e-06, + "loss": 1.6559, + "step": 27742 + }, + { + "epoch": 8.515346838551258, + "grad_norm": 0.15195727348327637, + "learning_rate": 5.672240117370797e-06, + "loss": 1.6977, + "step": 27743 + }, + { + "epoch": 8.515653775322283, + "grad_norm": 0.11381472647190094, + "learning_rate": 5.669940841949261e-06, + "loss": 1.6594, + "step": 27744 + }, + { + "epoch": 8.515960712093309, + "grad_norm": 0.17271532118320465, + "learning_rate": 5.667642004623347e-06, + "loss": 1.7323, + "step": 27745 + }, + { + "epoch": 8.516267648864334, + "grad_norm": 0.15365839004516602, + "learning_rate": 
5.665343605415774e-06, + "loss": 1.7257, + "step": 27746 + }, + { + "epoch": 8.51657458563536, + "grad_norm": 0.22701260447502136, + "learning_rate": 5.66304564434928e-06, + "loss": 1.6939, + "step": 27747 + }, + { + "epoch": 8.516881522406385, + "grad_norm": 0.14642612636089325, + "learning_rate": 5.660748121446535e-06, + "loss": 1.6985, + "step": 27748 + }, + { + "epoch": 8.51718845917741, + "grad_norm": 0.1659226268529892, + "learning_rate": 5.658451036730272e-06, + "loss": 1.7439, + "step": 27749 + }, + { + "epoch": 8.517495395948435, + "grad_norm": 0.14763525128364563, + "learning_rate": 5.65615439022319e-06, + "loss": 1.6714, + "step": 27750 + }, + { + "epoch": 8.51780233271946, + "grad_norm": 0.17457270622253418, + "learning_rate": 5.65385818194798e-06, + "loss": 1.7214, + "step": 27751 + }, + { + "epoch": 8.518109269490484, + "grad_norm": 0.15170279145240784, + "learning_rate": 5.651562411927335e-06, + "loss": 1.7121, + "step": 27752 + }, + { + "epoch": 8.51841620626151, + "grad_norm": 0.16129034757614136, + "learning_rate": 5.649267080183945e-06, + "loss": 1.6916, + "step": 27753 + }, + { + "epoch": 8.518723143032535, + "grad_norm": 0.20800361037254333, + "learning_rate": 5.64697218674049e-06, + "loss": 1.7482, + "step": 27754 + }, + { + "epoch": 8.51903007980356, + "grad_norm": 0.16350114345550537, + "learning_rate": 5.644677731619652e-06, + "loss": 1.6705, + "step": 27755 + }, + { + "epoch": 8.519337016574585, + "grad_norm": 0.15720658004283905, + "learning_rate": 5.642383714844107e-06, + "loss": 1.6871, + "step": 27756 + }, + { + "epoch": 8.51964395334561, + "grad_norm": 0.21885983645915985, + "learning_rate": 5.640090136436526e-06, + "loss": 1.7057, + "step": 27757 + }, + { + "epoch": 8.519950890116636, + "grad_norm": 0.1411464810371399, + "learning_rate": 5.637796996419564e-06, + "loss": 1.7103, + "step": 27758 + }, + { + "epoch": 8.520257826887661, + "grad_norm": 0.14518170058727264, + "learning_rate": 5.635504294815913e-06, + "loss": 1.7184, + 
"step": 27759 + }, + { + "epoch": 8.520564763658687, + "grad_norm": 0.17998449504375458, + "learning_rate": 5.633212031648199e-06, + "loss": 1.6822, + "step": 27760 + }, + { + "epoch": 8.520871700429712, + "grad_norm": 0.1301501840353012, + "learning_rate": 5.630920206939094e-06, + "loss": 1.6878, + "step": 27761 + }, + { + "epoch": 8.521178637200737, + "grad_norm": 0.16201011836528778, + "learning_rate": 5.628628820711235e-06, + "loss": 1.7581, + "step": 27762 + }, + { + "epoch": 8.521485573971761, + "grad_norm": 0.20399747788906097, + "learning_rate": 5.626337872987269e-06, + "loss": 1.7281, + "step": 27763 + }, + { + "epoch": 8.521792510742786, + "grad_norm": 0.18675439059734344, + "learning_rate": 5.624047363789858e-06, + "loss": 1.7445, + "step": 27764 + }, + { + "epoch": 8.522099447513812, + "grad_norm": 0.1858585625886917, + "learning_rate": 5.621757293141594e-06, + "loss": 1.729, + "step": 27765 + }, + { + "epoch": 8.522406384284837, + "grad_norm": 0.1731054186820984, + "learning_rate": 5.619467661065164e-06, + "loss": 1.6709, + "step": 27766 + }, + { + "epoch": 8.522713321055862, + "grad_norm": 0.2048177868127823, + "learning_rate": 5.617178467583145e-06, + "loss": 1.8187, + "step": 27767 + }, + { + "epoch": 8.523020257826888, + "grad_norm": 0.1944245547056198, + "learning_rate": 5.614889712718191e-06, + "loss": 1.7238, + "step": 27768 + }, + { + "epoch": 8.523327194597913, + "grad_norm": 0.16106872260570526, + "learning_rate": 5.612601396492906e-06, + "loss": 1.7089, + "step": 27769 + }, + { + "epoch": 8.523634131368938, + "grad_norm": 0.1933506578207016, + "learning_rate": 5.610313518929916e-06, + "loss": 1.6702, + "step": 27770 + }, + { + "epoch": 8.523941068139964, + "grad_norm": 0.14211905002593994, + "learning_rate": 5.608026080051826e-06, + "loss": 1.686, + "step": 27771 + }, + { + "epoch": 8.524248004910989, + "grad_norm": 0.1588355004787445, + "learning_rate": 5.605739079881239e-06, + "loss": 1.691, + "step": 27772 + }, + { + "epoch": 
8.524554941682014, + "grad_norm": 0.2026119977235794, + "learning_rate": 5.60345251844076e-06, + "loss": 1.7024, + "step": 27773 + }, + { + "epoch": 8.524861878453038, + "grad_norm": 0.19816550612449646, + "learning_rate": 5.601166395752988e-06, + "loss": 1.7793, + "step": 27774 + }, + { + "epoch": 8.525168815224063, + "grad_norm": 0.1687595695257187, + "learning_rate": 5.59888071184051e-06, + "loss": 1.7066, + "step": 27775 + }, + { + "epoch": 8.525475751995089, + "grad_norm": 0.1844881922006607, + "learning_rate": 5.5965954667259125e-06, + "loss": 1.7091, + "step": 27776 + }, + { + "epoch": 8.525782688766114, + "grad_norm": 0.13911494612693787, + "learning_rate": 5.5943106604317895e-06, + "loss": 1.6611, + "step": 27777 + }, + { + "epoch": 8.52608962553714, + "grad_norm": 0.215097114443779, + "learning_rate": 5.592026292980718e-06, + "loss": 1.7436, + "step": 27778 + }, + { + "epoch": 8.526396562308165, + "grad_norm": 0.19177651405334473, + "learning_rate": 5.589742364395267e-06, + "loss": 1.7198, + "step": 27779 + }, + { + "epoch": 8.52670349907919, + "grad_norm": 0.16470259428024292, + "learning_rate": 5.587458874697998e-06, + "loss": 1.7405, + "step": 27780 + }, + { + "epoch": 8.527010435850215, + "grad_norm": 0.13213464617729187, + "learning_rate": 5.585175823911515e-06, + "loss": 1.6651, + "step": 27781 + }, + { + "epoch": 8.52731737262124, + "grad_norm": 0.18105588853359222, + "learning_rate": 5.582893212058338e-06, + "loss": 1.7169, + "step": 27782 + }, + { + "epoch": 8.527624309392266, + "grad_norm": 0.19358783960342407, + "learning_rate": 5.580611039161065e-06, + "loss": 1.7165, + "step": 27783 + }, + { + "epoch": 8.527931246163291, + "grad_norm": 0.13674969971179962, + "learning_rate": 5.578329305242208e-06, + "loss": 1.7086, + "step": 27784 + }, + { + "epoch": 8.528238182934315, + "grad_norm": 0.1365654170513153, + "learning_rate": 5.5760480103243475e-06, + "loss": 1.7031, + "step": 27785 + }, + { + "epoch": 8.52854511970534, + "grad_norm": 
0.17749033868312836, + "learning_rate": 5.573767154430015e-06, + "loss": 1.7717, + "step": 27786 + }, + { + "epoch": 8.528852056476365, + "grad_norm": 0.16521626710891724, + "learning_rate": 5.5714867375817545e-06, + "loss": 1.6859, + "step": 27787 + }, + { + "epoch": 8.52915899324739, + "grad_norm": 0.14327271282672882, + "learning_rate": 5.569206759802103e-06, + "loss": 1.6996, + "step": 27788 + }, + { + "epoch": 8.529465930018416, + "grad_norm": 0.1895138919353485, + "learning_rate": 5.5669272211135934e-06, + "loss": 1.7127, + "step": 27789 + }, + { + "epoch": 8.529772866789441, + "grad_norm": 0.16256090998649597, + "learning_rate": 5.564648121538757e-06, + "loss": 1.7083, + "step": 27790 + }, + { + "epoch": 8.530079803560467, + "grad_norm": 0.18591371178627014, + "learning_rate": 5.562369461100103e-06, + "loss": 1.7852, + "step": 27791 + }, + { + "epoch": 8.530386740331492, + "grad_norm": 0.15933659672737122, + "learning_rate": 5.560091239820165e-06, + "loss": 1.69, + "step": 27792 + }, + { + "epoch": 8.530693677102517, + "grad_norm": 0.15374226868152618, + "learning_rate": 5.5578134577214505e-06, + "loss": 1.7397, + "step": 27793 + }, + { + "epoch": 8.531000613873543, + "grad_norm": 0.1786707490682602, + "learning_rate": 5.555536114826476e-06, + "loss": 1.7456, + "step": 27794 + }, + { + "epoch": 8.531307550644566, + "grad_norm": 0.16859668493270874, + "learning_rate": 5.553259211157741e-06, + "loss": 1.724, + "step": 27795 + }, + { + "epoch": 8.531614487415592, + "grad_norm": 0.21200759708881378, + "learning_rate": 5.5509827467377485e-06, + "loss": 1.7326, + "step": 27796 + }, + { + "epoch": 8.531921424186617, + "grad_norm": 0.16948217153549194, + "learning_rate": 5.548706721588986e-06, + "loss": 1.7082, + "step": 27797 + }, + { + "epoch": 8.532228360957642, + "grad_norm": 0.17014150321483612, + "learning_rate": 5.546431135733976e-06, + "loss": 1.7344, + "step": 27798 + }, + { + "epoch": 8.532535297728668, + "grad_norm": 0.20479294657707214, + 
"learning_rate": 5.544155989195171e-06, + "loss": 1.8121, + "step": 27799 + }, + { + "epoch": 8.532842234499693, + "grad_norm": 0.16958604753017426, + "learning_rate": 5.541881281995093e-06, + "loss": 1.773, + "step": 27800 + }, + { + "epoch": 8.533149171270718, + "grad_norm": 0.17606206238269806, + "learning_rate": 5.539607014156184e-06, + "loss": 1.6937, + "step": 27801 + }, + { + "epoch": 8.533456108041744, + "grad_norm": 0.1357482373714447, + "learning_rate": 5.537333185700943e-06, + "loss": 1.7234, + "step": 27802 + }, + { + "epoch": 8.533763044812769, + "grad_norm": 0.17217469215393066, + "learning_rate": 5.535059796651837e-06, + "loss": 1.722, + "step": 27803 + }, + { + "epoch": 8.534069981583794, + "grad_norm": 0.14100955426692963, + "learning_rate": 5.532786847031335e-06, + "loss": 1.6574, + "step": 27804 + }, + { + "epoch": 8.53437691835482, + "grad_norm": 0.1515544354915619, + "learning_rate": 5.530514336861897e-06, + "loss": 1.7489, + "step": 27805 + }, + { + "epoch": 8.534683855125843, + "grad_norm": 0.15518932044506073, + "learning_rate": 5.528242266165978e-06, + "loss": 1.7338, + "step": 27806 + }, + { + "epoch": 8.534990791896869, + "grad_norm": 0.15764978528022766, + "learning_rate": 5.525970634966033e-06, + "loss": 1.6971, + "step": 27807 + }, + { + "epoch": 8.535297728667894, + "grad_norm": 0.13838590681552887, + "learning_rate": 5.523699443284513e-06, + "loss": 1.723, + "step": 27808 + }, + { + "epoch": 8.53560466543892, + "grad_norm": 0.17713284492492676, + "learning_rate": 5.521428691143865e-06, + "loss": 1.7227, + "step": 27809 + }, + { + "epoch": 8.535911602209945, + "grad_norm": 0.19389420747756958, + "learning_rate": 5.51915837856653e-06, + "loss": 1.703, + "step": 27810 + }, + { + "epoch": 8.53621853898097, + "grad_norm": 0.13955099880695343, + "learning_rate": 5.516888505574941e-06, + "loss": 1.7093, + "step": 27811 + }, + { + "epoch": 8.536525475751995, + "grad_norm": 0.1319018006324768, + "learning_rate": 5.514619072191535e-06, + 
"loss": 1.7093, + "step": 27812 + }, + { + "epoch": 8.53683241252302, + "grad_norm": 0.14604489505290985, + "learning_rate": 5.512350078438733e-06, + "loss": 1.7113, + "step": 27813 + }, + { + "epoch": 8.537139349294046, + "grad_norm": 0.14439311623573303, + "learning_rate": 5.510081524338956e-06, + "loss": 1.7164, + "step": 27814 + }, + { + "epoch": 8.537446286065071, + "grad_norm": 0.17546533048152924, + "learning_rate": 5.507813409914647e-06, + "loss": 1.7432, + "step": 27815 + }, + { + "epoch": 8.537753222836095, + "grad_norm": 0.15710201859474182, + "learning_rate": 5.505545735188189e-06, + "loss": 1.7353, + "step": 27816 + }, + { + "epoch": 8.53806015960712, + "grad_norm": 0.19635994732379913, + "learning_rate": 5.503278500182019e-06, + "loss": 1.7042, + "step": 27817 + }, + { + "epoch": 8.538367096378146, + "grad_norm": 0.17653462290763855, + "learning_rate": 5.501011704918519e-06, + "loss": 1.7007, + "step": 27818 + }, + { + "epoch": 8.53867403314917, + "grad_norm": 0.1532578021287918, + "learning_rate": 5.498745349420109e-06, + "loss": 1.7111, + "step": 27819 + }, + { + "epoch": 8.538980969920196, + "grad_norm": 0.15368299186229706, + "learning_rate": 5.496479433709178e-06, + "loss": 1.7073, + "step": 27820 + }, + { + "epoch": 8.539287906691222, + "grad_norm": 0.19518911838531494, + "learning_rate": 5.494213957808126e-06, + "loss": 1.756, + "step": 27821 + }, + { + "epoch": 8.539594843462247, + "grad_norm": 0.13748668134212494, + "learning_rate": 5.4919489217393376e-06, + "loss": 1.6636, + "step": 27822 + }, + { + "epoch": 8.539901780233272, + "grad_norm": 0.2104724794626236, + "learning_rate": 5.489684325525191e-06, + "loss": 1.7734, + "step": 27823 + }, + { + "epoch": 8.540208717004298, + "grad_norm": 0.15495489537715912, + "learning_rate": 5.4874201691880786e-06, + "loss": 1.6858, + "step": 27824 + }, + { + "epoch": 8.540515653775323, + "grad_norm": 0.16447420418262482, + "learning_rate": 5.4851564527503674e-06, + "loss": 1.7053, + "step": 27825 + }, + 
{ + "epoch": 8.540822590546348, + "grad_norm": 0.1427844911813736, + "learning_rate": 5.482893176234433e-06, + "loss": 1.6885, + "step": 27826 + }, + { + "epoch": 8.541129527317374, + "grad_norm": 0.14386583864688873, + "learning_rate": 5.4806303396626344e-06, + "loss": 1.6762, + "step": 27827 + }, + { + "epoch": 8.541436464088397, + "grad_norm": 0.15933938324451447, + "learning_rate": 5.478367943057344e-06, + "loss": 1.6945, + "step": 27828 + }, + { + "epoch": 8.541743400859422, + "grad_norm": 0.3127610385417938, + "learning_rate": 5.476105986440922e-06, + "loss": 1.772, + "step": 27829 + }, + { + "epoch": 8.542050337630448, + "grad_norm": 0.168161079287529, + "learning_rate": 5.473844469835709e-06, + "loss": 1.7398, + "step": 27830 + }, + { + "epoch": 8.542357274401473, + "grad_norm": 0.17208287119865417, + "learning_rate": 5.471583393264057e-06, + "loss": 1.7345, + "step": 27831 + }, + { + "epoch": 8.542664211172498, + "grad_norm": 0.18009017407894135, + "learning_rate": 5.469322756748335e-06, + "loss": 1.7785, + "step": 27832 + }, + { + "epoch": 8.542971147943524, + "grad_norm": 0.17091695964336395, + "learning_rate": 5.467062560310843e-06, + "loss": 1.689, + "step": 27833 + }, + { + "epoch": 8.543278084714549, + "grad_norm": 0.1495637446641922, + "learning_rate": 5.4648028039739675e-06, + "loss": 1.7409, + "step": 27834 + }, + { + "epoch": 8.543585021485574, + "grad_norm": 0.19924791157245636, + "learning_rate": 5.462543487759986e-06, + "loss": 1.7136, + "step": 27835 + }, + { + "epoch": 8.5438919582566, + "grad_norm": 0.19490383565425873, + "learning_rate": 5.460284611691269e-06, + "loss": 1.7371, + "step": 27836 + }, + { + "epoch": 8.544198895027625, + "grad_norm": 0.20383320748806, + "learning_rate": 5.458026175790127e-06, + "loss": 1.7268, + "step": 27837 + }, + { + "epoch": 8.544505831798649, + "grad_norm": 0.20110821723937988, + "learning_rate": 5.455768180078869e-06, + "loss": 1.7069, + "step": 27838 + }, + { + "epoch": 8.544812768569674, + "grad_norm": 
0.16181184351444244, + "learning_rate": 5.453510624579827e-06, + "loss": 1.7158, + "step": 27839 + }, + { + "epoch": 8.5451197053407, + "grad_norm": 0.17110773921012878, + "learning_rate": 5.451253509315296e-06, + "loss": 1.6925, + "step": 27840 + }, + { + "epoch": 8.545426642111725, + "grad_norm": 0.16039033234119415, + "learning_rate": 5.448996834307591e-06, + "loss": 1.7281, + "step": 27841 + }, + { + "epoch": 8.54573357888275, + "grad_norm": 0.12631241977214813, + "learning_rate": 5.446740599579014e-06, + "loss": 1.6816, + "step": 27842 + }, + { + "epoch": 8.546040515653775, + "grad_norm": 0.20419110357761383, + "learning_rate": 5.444484805151856e-06, + "loss": 1.7594, + "step": 27843 + }, + { + "epoch": 8.5463474524248, + "grad_norm": 0.25453490018844604, + "learning_rate": 5.442229451048414e-06, + "loss": 1.7423, + "step": 27844 + }, + { + "epoch": 8.546654389195826, + "grad_norm": 0.15445558726787567, + "learning_rate": 5.439974537290982e-06, + "loss": 1.729, + "step": 27845 + }, + { + "epoch": 8.546961325966851, + "grad_norm": 0.16175805032253265, + "learning_rate": 5.43772006390183e-06, + "loss": 1.7515, + "step": 27846 + }, + { + "epoch": 8.547268262737877, + "grad_norm": 0.1958928406238556, + "learning_rate": 5.435466030903253e-06, + "loss": 1.7203, + "step": 27847 + }, + { + "epoch": 8.547575199508902, + "grad_norm": 0.17533376812934875, + "learning_rate": 5.433212438317514e-06, + "loss": 1.7393, + "step": 27848 + }, + { + "epoch": 8.547882136279926, + "grad_norm": 0.16437608003616333, + "learning_rate": 5.430959286166904e-06, + "loss": 1.7284, + "step": 27849 + }, + { + "epoch": 8.54818907305095, + "grad_norm": 0.16348768770694733, + "learning_rate": 5.428706574473663e-06, + "loss": 1.7284, + "step": 27850 + }, + { + "epoch": 8.548496009821976, + "grad_norm": 0.136602982878685, + "learning_rate": 5.426454303260081e-06, + "loss": 1.6606, + "step": 27851 + }, + { + "epoch": 8.548802946593002, + "grad_norm": 0.1359151154756546, + "learning_rate": 
5.42420247254839e-06, + "loss": 1.6989, + "step": 27852 + }, + { + "epoch": 8.549109883364027, + "grad_norm": 0.17593000829219818, + "learning_rate": 5.421951082360866e-06, + "loss": 1.7483, + "step": 27853 + }, + { + "epoch": 8.549416820135052, + "grad_norm": 0.1791890412569046, + "learning_rate": 5.419700132719746e-06, + "loss": 1.7032, + "step": 27854 + }, + { + "epoch": 8.549723756906078, + "grad_norm": 0.15925002098083496, + "learning_rate": 5.417449623647281e-06, + "loss": 1.7055, + "step": 27855 + }, + { + "epoch": 8.550030693677103, + "grad_norm": 0.16391295194625854, + "learning_rate": 5.415199555165706e-06, + "loss": 1.6555, + "step": 27856 + }, + { + "epoch": 8.550337630448128, + "grad_norm": 0.18588928878307343, + "learning_rate": 5.412949927297262e-06, + "loss": 1.6723, + "step": 27857 + }, + { + "epoch": 8.550644567219154, + "grad_norm": 0.15956605970859528, + "learning_rate": 5.410700740064184e-06, + "loss": 1.7148, + "step": 27858 + }, + { + "epoch": 8.550951503990177, + "grad_norm": 0.14419449865818024, + "learning_rate": 5.408451993488689e-06, + "loss": 1.6997, + "step": 27859 + }, + { + "epoch": 8.551258440761202, + "grad_norm": 0.18104690313339233, + "learning_rate": 5.406203687593014e-06, + "loss": 1.7121, + "step": 27860 + }, + { + "epoch": 8.551565377532228, + "grad_norm": 0.15283553302288055, + "learning_rate": 5.40395582239937e-06, + "loss": 1.6536, + "step": 27861 + }, + { + "epoch": 8.551872314303253, + "grad_norm": 0.14498579502105713, + "learning_rate": 5.401708397929972e-06, + "loss": 1.6649, + "step": 27862 + }, + { + "epoch": 8.552179251074278, + "grad_norm": 0.1828843504190445, + "learning_rate": 5.39946141420703e-06, + "loss": 1.718, + "step": 27863 + }, + { + "epoch": 8.552486187845304, + "grad_norm": 0.20626986026763916, + "learning_rate": 5.397214871252754e-06, + "loss": 1.7561, + "step": 27864 + }, + { + "epoch": 8.55279312461633, + "grad_norm": 0.16986799240112305, + "learning_rate": 5.394968769089331e-06, + "loss": 1.7386, + 
"step": 27865 + }, + { + "epoch": 8.553100061387354, + "grad_norm": 0.16921544075012207, + "learning_rate": 5.392723107738995e-06, + "loss": 1.6939, + "step": 27866 + }, + { + "epoch": 8.55340699815838, + "grad_norm": 0.19882866740226746, + "learning_rate": 5.390477887223888e-06, + "loss": 1.7376, + "step": 27867 + }, + { + "epoch": 8.553713934929405, + "grad_norm": 0.17440463602542877, + "learning_rate": 5.3882331075662486e-06, + "loss": 1.7142, + "step": 27868 + }, + { + "epoch": 8.55402087170043, + "grad_norm": 0.1494864523410797, + "learning_rate": 5.38598876878822e-06, + "loss": 1.6953, + "step": 27869 + }, + { + "epoch": 8.554327808471456, + "grad_norm": 0.18791508674621582, + "learning_rate": 5.383744870912006e-06, + "loss": 1.7863, + "step": 27870 + }, + { + "epoch": 8.55463474524248, + "grad_norm": 0.19124576449394226, + "learning_rate": 5.381501413959777e-06, + "loss": 1.6668, + "step": 27871 + }, + { + "epoch": 8.554941682013505, + "grad_norm": 0.17011114954948425, + "learning_rate": 5.3792583979537016e-06, + "loss": 1.7356, + "step": 27872 + }, + { + "epoch": 8.55524861878453, + "grad_norm": 0.1780267208814621, + "learning_rate": 5.377015822915949e-06, + "loss": 1.7428, + "step": 27873 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 0.18539096415042877, + "learning_rate": 5.374773688868678e-06, + "loss": 1.7534, + "step": 27874 + }, + { + "epoch": 8.55586249232658, + "grad_norm": 0.1668393909931183, + "learning_rate": 5.372531995834051e-06, + "loss": 1.6884, + "step": 27875 + }, + { + "epoch": 8.556169429097606, + "grad_norm": 0.15957699716091156, + "learning_rate": 5.3702907438342165e-06, + "loss": 1.6739, + "step": 27876 + }, + { + "epoch": 8.556476365868631, + "grad_norm": 0.17210347950458527, + "learning_rate": 5.368049932891334e-06, + "loss": 1.7062, + "step": 27877 + }, + { + "epoch": 8.556783302639657, + "grad_norm": 0.1614166796207428, + "learning_rate": 5.365809563027535e-06, + "loss": 1.675, + "step": 27878 + }, + { + "epoch": 
8.557090239410682, + "grad_norm": 0.17495310306549072, + "learning_rate": 5.36356963426497e-06, + "loss": 1.7694, + "step": 27879 + }, + { + "epoch": 8.557397176181707, + "grad_norm": 0.1660371571779251, + "learning_rate": 5.361330146625771e-06, + "loss": 1.6573, + "step": 27880 + }, + { + "epoch": 8.557704112952731, + "grad_norm": 0.1997743546962738, + "learning_rate": 5.359091100132074e-06, + "loss": 1.7006, + "step": 27881 + }, + { + "epoch": 8.558011049723756, + "grad_norm": 0.21383358538150787, + "learning_rate": 5.356852494805992e-06, + "loss": 1.7677, + "step": 27882 + }, + { + "epoch": 8.558317986494782, + "grad_norm": 0.15339766442775726, + "learning_rate": 5.354614330669677e-06, + "loss": 1.6852, + "step": 27883 + }, + { + "epoch": 8.558624923265807, + "grad_norm": 0.16808396577835083, + "learning_rate": 5.352376607745213e-06, + "loss": 1.7046, + "step": 27884 + }, + { + "epoch": 8.558931860036832, + "grad_norm": 0.19627085328102112, + "learning_rate": 5.350139326054748e-06, + "loss": 1.7255, + "step": 27885 + }, + { + "epoch": 8.559238796807858, + "grad_norm": 0.16882671415805817, + "learning_rate": 5.347902485620365e-06, + "loss": 1.6823, + "step": 27886 + }, + { + "epoch": 8.559545733578883, + "grad_norm": 0.19045037031173706, + "learning_rate": 5.3456660864641846e-06, + "loss": 1.7901, + "step": 27887 + }, + { + "epoch": 8.559852670349908, + "grad_norm": 0.16998142004013062, + "learning_rate": 5.3434301286083064e-06, + "loss": 1.7226, + "step": 27888 + }, + { + "epoch": 8.560159607120934, + "grad_norm": 0.16370677947998047, + "learning_rate": 5.341194612074824e-06, + "loss": 1.7151, + "step": 27889 + }, + { + "epoch": 8.560466543891959, + "grad_norm": 0.16379667818546295, + "learning_rate": 5.3389595368858345e-06, + "loss": 1.6742, + "step": 27890 + }, + { + "epoch": 8.560773480662984, + "grad_norm": 0.1741562932729721, + "learning_rate": 5.336724903063423e-06, + "loss": 1.7162, + "step": 27891 + }, + { + "epoch": 8.561080417434008, + "grad_norm": 
0.17712807655334473, + "learning_rate": 5.334490710629675e-06, + "loss": 1.71, + "step": 27892 + }, + { + "epoch": 8.561387354205033, + "grad_norm": 0.16719931364059448, + "learning_rate": 5.332256959606669e-06, + "loss": 1.7299, + "step": 27893 + }, + { + "epoch": 8.561694290976058, + "grad_norm": 0.3024488389492035, + "learning_rate": 5.330023650016475e-06, + "loss": 1.7435, + "step": 27894 + }, + { + "epoch": 8.562001227747084, + "grad_norm": 0.13923676311969757, + "learning_rate": 5.3277907818811755e-06, + "loss": 1.6856, + "step": 27895 + }, + { + "epoch": 8.56230816451811, + "grad_norm": 0.1582731008529663, + "learning_rate": 5.325558355222826e-06, + "loss": 1.7057, + "step": 27896 + }, + { + "epoch": 8.562615101289135, + "grad_norm": 0.17576326429843903, + "learning_rate": 5.323326370063497e-06, + "loss": 1.7439, + "step": 27897 + }, + { + "epoch": 8.56292203806016, + "grad_norm": 0.16990134119987488, + "learning_rate": 5.321094826425238e-06, + "loss": 1.7366, + "step": 27898 + }, + { + "epoch": 8.563228974831185, + "grad_norm": 0.14154621958732605, + "learning_rate": 5.318863724330114e-06, + "loss": 1.6824, + "step": 27899 + }, + { + "epoch": 8.56353591160221, + "grad_norm": 0.1460665911436081, + "learning_rate": 5.3166330638001635e-06, + "loss": 1.729, + "step": 27900 + }, + { + "epoch": 8.563842848373236, + "grad_norm": 0.14366431534290314, + "learning_rate": 5.314402844857424e-06, + "loss": 1.704, + "step": 27901 + }, + { + "epoch": 8.56414978514426, + "grad_norm": 0.15405386686325073, + "learning_rate": 5.312173067523968e-06, + "loss": 1.7357, + "step": 27902 + }, + { + "epoch": 8.564456721915285, + "grad_norm": 0.12789638340473175, + "learning_rate": 5.309943731821787e-06, + "loss": 1.634, + "step": 27903 + }, + { + "epoch": 8.56476365868631, + "grad_norm": 0.17007184028625488, + "learning_rate": 5.307714837772948e-06, + "loss": 1.7065, + "step": 27904 + }, + { + "epoch": 8.565070595457335, + "grad_norm": 0.1982787400484085, + "learning_rate": 
5.305486385399466e-06, + "loss": 1.7459, + "step": 27905 + }, + { + "epoch": 8.56537753222836, + "grad_norm": 0.18433566391468048, + "learning_rate": 5.303258374723363e-06, + "loss": 1.7414, + "step": 27906 + }, + { + "epoch": 8.565684468999386, + "grad_norm": 0.13842104375362396, + "learning_rate": 5.30103080576666e-06, + "loss": 1.6988, + "step": 27907 + }, + { + "epoch": 8.565991405770411, + "grad_norm": 0.14736461639404297, + "learning_rate": 5.298803678551373e-06, + "loss": 1.6828, + "step": 27908 + }, + { + "epoch": 8.566298342541437, + "grad_norm": 0.14953723549842834, + "learning_rate": 5.2965769930995e-06, + "loss": 1.6896, + "step": 27909 + }, + { + "epoch": 8.566605279312462, + "grad_norm": 0.15445443987846375, + "learning_rate": 5.294350749433058e-06, + "loss": 1.7096, + "step": 27910 + }, + { + "epoch": 8.566912216083487, + "grad_norm": 0.180703803896904, + "learning_rate": 5.292124947574045e-06, + "loss": 1.7191, + "step": 27911 + }, + { + "epoch": 8.567219152854513, + "grad_norm": 0.13825593888759613, + "learning_rate": 5.289899587544461e-06, + "loss": 1.6928, + "step": 27912 + }, + { + "epoch": 8.567526089625538, + "grad_norm": 0.15663209557533264, + "learning_rate": 5.287674669366294e-06, + "loss": 1.7004, + "step": 27913 + }, + { + "epoch": 8.567833026396562, + "grad_norm": 0.14148147404193878, + "learning_rate": 5.285450193061526e-06, + "loss": 1.6961, + "step": 27914 + }, + { + "epoch": 8.568139963167587, + "grad_norm": 0.12393147498369217, + "learning_rate": 5.283226158652155e-06, + "loss": 1.6515, + "step": 27915 + }, + { + "epoch": 8.568446899938612, + "grad_norm": 0.1855689138174057, + "learning_rate": 5.281002566160148e-06, + "loss": 1.8017, + "step": 27916 + }, + { + "epoch": 8.568753836709638, + "grad_norm": 0.1665579080581665, + "learning_rate": 5.2787794156074824e-06, + "loss": 1.6935, + "step": 27917 + }, + { + "epoch": 8.569060773480663, + "grad_norm": 0.1853685826063156, + "learning_rate": 5.276556707016123e-06, + "loss": 1.7504, + 
"step": 27918 + }, + { + "epoch": 8.569367710251688, + "grad_norm": 0.16065651178359985, + "learning_rate": 5.274334440408063e-06, + "loss": 1.7549, + "step": 27919 + }, + { + "epoch": 8.569674647022714, + "grad_norm": 0.1630239635705948, + "learning_rate": 5.272112615805225e-06, + "loss": 1.7404, + "step": 27920 + }, + { + "epoch": 8.569981583793739, + "grad_norm": 0.1681451052427292, + "learning_rate": 5.269891233229607e-06, + "loss": 1.704, + "step": 27921 + }, + { + "epoch": 8.570288520564764, + "grad_norm": 0.14546994864940643, + "learning_rate": 5.267670292703119e-06, + "loss": 1.6656, + "step": 27922 + }, + { + "epoch": 8.57059545733579, + "grad_norm": 0.1499837189912796, + "learning_rate": 5.265449794247746e-06, + "loss": 1.6908, + "step": 27923 + }, + { + "epoch": 8.570902394106813, + "grad_norm": 0.14691168069839478, + "learning_rate": 5.263229737885417e-06, + "loss": 1.6887, + "step": 27924 + }, + { + "epoch": 8.571209330877839, + "grad_norm": 0.16261856257915497, + "learning_rate": 5.261010123638066e-06, + "loss": 1.6981, + "step": 27925 + }, + { + "epoch": 8.571516267648864, + "grad_norm": 0.1549815535545349, + "learning_rate": 5.2587909515276425e-06, + "loss": 1.6971, + "step": 27926 + }, + { + "epoch": 8.57182320441989, + "grad_norm": 0.15067234635353088, + "learning_rate": 5.256572221576067e-06, + "loss": 1.7101, + "step": 27927 + }, + { + "epoch": 8.572130141190915, + "grad_norm": 0.13761483132839203, + "learning_rate": 5.254353933805273e-06, + "loss": 1.6657, + "step": 27928 + }, + { + "epoch": 8.57243707796194, + "grad_norm": 0.1590275913476944, + "learning_rate": 5.252136088237175e-06, + "loss": 1.6776, + "step": 27929 + }, + { + "epoch": 8.572744014732965, + "grad_norm": 0.1633618026971817, + "learning_rate": 5.249918684893695e-06, + "loss": 1.724, + "step": 27930 + }, + { + "epoch": 8.57305095150399, + "grad_norm": 0.2603756785392761, + "learning_rate": 5.247701723796755e-06, + "loss": 1.7071, + "step": 27931 + }, + { + "epoch": 
8.573357888275016, + "grad_norm": 0.21079567074775696, + "learning_rate": 5.245485204968248e-06, + "loss": 1.7983, + "step": 27932 + }, + { + "epoch": 8.573664825046041, + "grad_norm": 0.15369223058223724, + "learning_rate": 5.243269128430095e-06, + "loss": 1.7566, + "step": 27933 + }, + { + "epoch": 8.573971761817067, + "grad_norm": 0.19392070174217224, + "learning_rate": 5.241053494204185e-06, + "loss": 1.7287, + "step": 27934 + }, + { + "epoch": 8.57427869858809, + "grad_norm": 0.16017836332321167, + "learning_rate": 5.23883830231241e-06, + "loss": 1.6909, + "step": 27935 + }, + { + "epoch": 8.574585635359115, + "grad_norm": 0.1943294107913971, + "learning_rate": 5.2366235527766876e-06, + "loss": 1.7844, + "step": 27936 + }, + { + "epoch": 8.57489257213014, + "grad_norm": 0.17875424027442932, + "learning_rate": 5.234409245618871e-06, + "loss": 1.7385, + "step": 27937 + }, + { + "epoch": 8.575199508901166, + "grad_norm": 0.1900254637002945, + "learning_rate": 5.232195380860877e-06, + "loss": 1.7303, + "step": 27938 + }, + { + "epoch": 8.575506445672191, + "grad_norm": 0.13633303344249725, + "learning_rate": 5.229981958524549e-06, + "loss": 1.6949, + "step": 27939 + }, + { + "epoch": 8.575813382443217, + "grad_norm": 0.18683885037899017, + "learning_rate": 5.227768978631792e-06, + "loss": 1.7366, + "step": 27940 + }, + { + "epoch": 8.576120319214242, + "grad_norm": 0.15012286603450775, + "learning_rate": 5.2255564412044656e-06, + "loss": 1.71, + "step": 27941 + }, + { + "epoch": 8.576427255985267, + "grad_norm": 0.14521601796150208, + "learning_rate": 5.22334434626443e-06, + "loss": 1.724, + "step": 27942 + }, + { + "epoch": 8.576734192756293, + "grad_norm": 0.1809433549642563, + "learning_rate": 5.221132693833547e-06, + "loss": 1.7851, + "step": 27943 + }, + { + "epoch": 8.577041129527318, + "grad_norm": 0.1676371693611145, + "learning_rate": 5.218921483933681e-06, + "loss": 1.7542, + "step": 27944 + }, + { + "epoch": 8.577348066298342, + "grad_norm": 
0.16963952779769897, + "learning_rate": 5.216710716586676e-06, + "loss": 1.767, + "step": 27945 + }, + { + "epoch": 8.577655003069367, + "grad_norm": 0.18276773393154144, + "learning_rate": 5.214500391814387e-06, + "loss": 1.662, + "step": 27946 + }, + { + "epoch": 8.577961939840392, + "grad_norm": 0.16285058856010437, + "learning_rate": 5.212290509638656e-06, + "loss": 1.6853, + "step": 27947 + }, + { + "epoch": 8.578268876611418, + "grad_norm": 0.18186792731285095, + "learning_rate": 5.210081070081318e-06, + "loss": 1.7408, + "step": 27948 + }, + { + "epoch": 8.578575813382443, + "grad_norm": 0.15637101233005524, + "learning_rate": 5.207872073164216e-06, + "loss": 1.7026, + "step": 27949 + }, + { + "epoch": 8.578882750153468, + "grad_norm": 0.16442300379276276, + "learning_rate": 5.2056635189091704e-06, + "loss": 1.7136, + "step": 27950 + }, + { + "epoch": 8.579189686924494, + "grad_norm": 0.18907669186592102, + "learning_rate": 5.203455407338015e-06, + "loss": 1.7706, + "step": 27951 + }, + { + "epoch": 8.579496623695519, + "grad_norm": 0.17700283229351044, + "learning_rate": 5.201247738472559e-06, + "loss": 1.7104, + "step": 27952 + }, + { + "epoch": 8.579803560466544, + "grad_norm": 0.19882333278656006, + "learning_rate": 5.199040512334647e-06, + "loss": 1.7692, + "step": 27953 + }, + { + "epoch": 8.58011049723757, + "grad_norm": 0.14343376457691193, + "learning_rate": 5.19683372894606e-06, + "loss": 1.6775, + "step": 27954 + }, + { + "epoch": 8.580417434008595, + "grad_norm": 0.13688595592975616, + "learning_rate": 5.194627388328638e-06, + "loss": 1.6787, + "step": 27955 + }, + { + "epoch": 8.580724370779619, + "grad_norm": 0.15786845982074738, + "learning_rate": 5.192421490504157e-06, + "loss": 1.7218, + "step": 27956 + }, + { + "epoch": 8.581031307550644, + "grad_norm": 0.3297908902168274, + "learning_rate": 5.190216035494433e-06, + "loss": 1.7533, + "step": 27957 + }, + { + "epoch": 8.58133824432167, + "grad_norm": 0.16763067245483398, + "learning_rate": 
5.18801102332126e-06, + "loss": 1.7278, + "step": 27958 + }, + { + "epoch": 8.581645181092695, + "grad_norm": 0.18505536019802094, + "learning_rate": 5.185806454006426e-06, + "loss": 1.7291, + "step": 27959 + }, + { + "epoch": 8.58195211786372, + "grad_norm": 0.1536751091480255, + "learning_rate": 5.183602327571718e-06, + "loss": 1.7014, + "step": 27960 + }, + { + "epoch": 8.582259054634745, + "grad_norm": 0.2561737596988678, + "learning_rate": 5.181398644038921e-06, + "loss": 1.8127, + "step": 27961 + }, + { + "epoch": 8.58256599140577, + "grad_norm": 0.15304888784885406, + "learning_rate": 5.17919540342981e-06, + "loss": 1.7001, + "step": 27962 + }, + { + "epoch": 8.582872928176796, + "grad_norm": 0.16688644886016846, + "learning_rate": 5.176992605766162e-06, + "loss": 1.7398, + "step": 27963 + }, + { + "epoch": 8.583179864947821, + "grad_norm": 0.1351930946111679, + "learning_rate": 5.174790251069744e-06, + "loss": 1.6947, + "step": 27964 + }, + { + "epoch": 8.583486801718847, + "grad_norm": 0.23985813558101654, + "learning_rate": 5.172588339362322e-06, + "loss": 1.7495, + "step": 27965 + }, + { + "epoch": 8.58379373848987, + "grad_norm": 0.17094407975673676, + "learning_rate": 5.170386870665656e-06, + "loss": 1.74, + "step": 27966 + }, + { + "epoch": 8.584100675260895, + "grad_norm": 0.17786560952663422, + "learning_rate": 5.168185845001505e-06, + "loss": 1.7438, + "step": 27967 + }, + { + "epoch": 8.58440761203192, + "grad_norm": 0.16682226955890656, + "learning_rate": 5.165985262391615e-06, + "loss": 1.7193, + "step": 27968 + }, + { + "epoch": 8.584714548802946, + "grad_norm": 0.17371125519275665, + "learning_rate": 5.163785122857728e-06, + "loss": 1.677, + "step": 27969 + }, + { + "epoch": 8.585021485573971, + "grad_norm": 0.16753411293029785, + "learning_rate": 5.161585426421617e-06, + "loss": 1.6558, + "step": 27970 + }, + { + "epoch": 8.585328422344997, + "grad_norm": 0.14469672739505768, + "learning_rate": 5.159386173104979e-06, + "loss": 1.7, + "step": 
27971 + }, + { + "epoch": 8.585635359116022, + "grad_norm": 0.14450986683368683, + "learning_rate": 5.157187362929583e-06, + "loss": 1.6843, + "step": 27972 + }, + { + "epoch": 8.585942295887047, + "grad_norm": 0.15462568402290344, + "learning_rate": 5.1549889959171315e-06, + "loss": 1.7028, + "step": 27973 + }, + { + "epoch": 8.586249232658073, + "grad_norm": 0.19757840037345886, + "learning_rate": 5.1527910720893694e-06, + "loss": 1.7578, + "step": 27974 + }, + { + "epoch": 8.586556169429098, + "grad_norm": 0.16309098899364471, + "learning_rate": 5.150593591468017e-06, + "loss": 1.6736, + "step": 27975 + }, + { + "epoch": 8.586863106200123, + "grad_norm": 0.20989231765270233, + "learning_rate": 5.14839655407478e-06, + "loss": 1.7418, + "step": 27976 + }, + { + "epoch": 8.587170042971149, + "grad_norm": 0.14988306164741516, + "learning_rate": 5.14619995993138e-06, + "loss": 1.6834, + "step": 27977 + }, + { + "epoch": 8.587476979742172, + "grad_norm": 0.1826607882976532, + "learning_rate": 5.144003809059522e-06, + "loss": 1.7598, + "step": 27978 + }, + { + "epoch": 8.587783916513198, + "grad_norm": 0.16675019264221191, + "learning_rate": 5.141808101480905e-06, + "loss": 1.7388, + "step": 27979 + }, + { + "epoch": 8.588090853284223, + "grad_norm": 0.17474086582660675, + "learning_rate": 5.139612837217233e-06, + "loss": 1.6897, + "step": 27980 + }, + { + "epoch": 8.588397790055248, + "grad_norm": 0.15096940100193024, + "learning_rate": 5.137418016290207e-06, + "loss": 1.6959, + "step": 27981 + }, + { + "epoch": 8.588704726826274, + "grad_norm": 0.13225309550762177, + "learning_rate": 5.1352236387215035e-06, + "loss": 1.6946, + "step": 27982 + }, + { + "epoch": 8.589011663597299, + "grad_norm": 0.13731913268566132, + "learning_rate": 5.133029704532821e-06, + "loss": 1.7076, + "step": 27983 + }, + { + "epoch": 8.589318600368324, + "grad_norm": 0.1227266862988472, + "learning_rate": 5.130836213745832e-06, + "loss": 1.6966, + "step": 27984 + }, + { + "epoch": 
8.58962553713935, + "grad_norm": 0.16979724168777466, + "learning_rate": 5.128643166382224e-06, + "loss": 1.7365, + "step": 27985 + }, + { + "epoch": 8.589932473910375, + "grad_norm": 0.13253070414066315, + "learning_rate": 5.126450562463653e-06, + "loss": 1.6748, + "step": 27986 + }, + { + "epoch": 8.5902394106814, + "grad_norm": 0.13287228345870972, + "learning_rate": 5.124258402011817e-06, + "loss": 1.666, + "step": 27987 + }, + { + "epoch": 8.590546347452424, + "grad_norm": 0.1884436458349228, + "learning_rate": 5.122066685048338e-06, + "loss": 1.6974, + "step": 27988 + }, + { + "epoch": 8.59085328422345, + "grad_norm": 0.17336542904376984, + "learning_rate": 5.119875411594927e-06, + "loss": 1.6884, + "step": 27989 + }, + { + "epoch": 8.591160220994475, + "grad_norm": 0.19136151671409607, + "learning_rate": 5.117684581673188e-06, + "loss": 1.6976, + "step": 27990 + }, + { + "epoch": 8.5914671577655, + "grad_norm": 0.18627271056175232, + "learning_rate": 5.115494195304804e-06, + "loss": 1.7255, + "step": 27991 + }, + { + "epoch": 8.591774094536525, + "grad_norm": 0.1341535747051239, + "learning_rate": 5.1133042525114194e-06, + "loss": 1.661, + "step": 27992 + }, + { + "epoch": 8.59208103130755, + "grad_norm": 0.172500878572464, + "learning_rate": 5.1111147533146665e-06, + "loss": 1.7408, + "step": 27993 + }, + { + "epoch": 8.592387968078576, + "grad_norm": 0.14429397881031036, + "learning_rate": 5.108925697736188e-06, + "loss": 1.7025, + "step": 27994 + }, + { + "epoch": 8.592694904849601, + "grad_norm": 0.16930191218852997, + "learning_rate": 5.106737085797625e-06, + "loss": 1.7451, + "step": 27995 + }, + { + "epoch": 8.593001841620627, + "grad_norm": 0.17311960458755493, + "learning_rate": 5.104548917520591e-06, + "loss": 1.7077, + "step": 27996 + }, + { + "epoch": 8.593308778391652, + "grad_norm": 0.17147377133369446, + "learning_rate": 5.102361192926719e-06, + "loss": 1.701, + "step": 27997 + }, + { + "epoch": 8.593615715162677, + "grad_norm": 
0.16215240955352783, + "learning_rate": 5.100173912037631e-06, + "loss": 1.6896, + "step": 27998 + }, + { + "epoch": 8.5939226519337, + "grad_norm": 0.1764577031135559, + "learning_rate": 5.097987074874944e-06, + "loss": 1.6895, + "step": 27999 + }, + { + "epoch": 8.594229588704726, + "grad_norm": 0.1574433147907257, + "learning_rate": 5.095800681460261e-06, + "loss": 1.7219, + "step": 28000 + }, + { + "epoch": 8.594536525475752, + "grad_norm": 0.1465912163257599, + "learning_rate": 5.0936147318152e-06, + "loss": 1.7077, + "step": 28001 + }, + { + "epoch": 8.594843462246777, + "grad_norm": 0.2024395614862442, + "learning_rate": 5.0914292259613524e-06, + "loss": 1.7956, + "step": 28002 + }, + { + "epoch": 8.595150399017802, + "grad_norm": 0.16168762743473053, + "learning_rate": 5.0892441639203205e-06, + "loss": 1.7311, + "step": 28003 + }, + { + "epoch": 8.595457335788828, + "grad_norm": 0.1713251769542694, + "learning_rate": 5.0870595457137185e-06, + "loss": 1.7123, + "step": 28004 + }, + { + "epoch": 8.595764272559853, + "grad_norm": 0.22206412255764008, + "learning_rate": 5.084875371363096e-06, + "loss": 1.7057, + "step": 28005 + }, + { + "epoch": 8.596071209330878, + "grad_norm": 0.14937512576580048, + "learning_rate": 5.082691640890081e-06, + "loss": 1.7231, + "step": 28006 + }, + { + "epoch": 8.596378146101904, + "grad_norm": 0.22501800954341888, + "learning_rate": 5.0805083543162155e-06, + "loss": 1.7729, + "step": 28007 + }, + { + "epoch": 8.596685082872929, + "grad_norm": 0.150779128074646, + "learning_rate": 5.0783255116631015e-06, + "loss": 1.6887, + "step": 28008 + }, + { + "epoch": 8.596992019643952, + "grad_norm": 0.1489362120628357, + "learning_rate": 5.076143112952308e-06, + "loss": 1.6774, + "step": 28009 + }, + { + "epoch": 8.597298956414978, + "grad_norm": 0.17022615671157837, + "learning_rate": 5.073961158205398e-06, + "loss": 1.6974, + "step": 28010 + }, + { + "epoch": 8.597605893186003, + "grad_norm": 0.16300532221794128, + "learning_rate": 
5.071779647443931e-06, + "loss": 1.7194, + "step": 28011 + }, + { + "epoch": 8.597912829957028, + "grad_norm": 0.14973211288452148, + "learning_rate": 5.069598580689477e-06, + "loss": 1.7238, + "step": 28012 + }, + { + "epoch": 8.598219766728054, + "grad_norm": 0.1345965713262558, + "learning_rate": 5.067417957963583e-06, + "loss": 1.6372, + "step": 28013 + }, + { + "epoch": 8.598526703499079, + "grad_norm": 0.18125082552433014, + "learning_rate": 5.065237779287802e-06, + "loss": 1.7174, + "step": 28014 + }, + { + "epoch": 8.598833640270104, + "grad_norm": 0.1619734913110733, + "learning_rate": 5.063058044683671e-06, + "loss": 1.6951, + "step": 28015 + }, + { + "epoch": 8.59914057704113, + "grad_norm": 0.14732249081134796, + "learning_rate": 5.060878754172749e-06, + "loss": 1.7291, + "step": 28016 + }, + { + "epoch": 8.599447513812155, + "grad_norm": 0.14982318878173828, + "learning_rate": 5.058699907776554e-06, + "loss": 1.6962, + "step": 28017 + }, + { + "epoch": 8.59975445058318, + "grad_norm": 0.15376806259155273, + "learning_rate": 5.056521505516632e-06, + "loss": 1.6867, + "step": 28018 + }, + { + "epoch": 8.600061387354206, + "grad_norm": 0.1546332985162735, + "learning_rate": 5.054343547414509e-06, + "loss": 1.7219, + "step": 28019 + }, + { + "epoch": 8.600368324125231, + "grad_norm": 0.17485050857067108, + "learning_rate": 5.0521660334916895e-06, + "loss": 1.7266, + "step": 28020 + }, + { + "epoch": 8.600675260896255, + "grad_norm": 0.15625739097595215, + "learning_rate": 5.049988963769736e-06, + "loss": 1.7328, + "step": 28021 + }, + { + "epoch": 8.60098219766728, + "grad_norm": 0.26432421803474426, + "learning_rate": 5.0478123382701136e-06, + "loss": 1.7452, + "step": 28022 + }, + { + "epoch": 8.601289134438305, + "grad_norm": 0.16437242925167084, + "learning_rate": 5.045636157014377e-06, + "loss": 1.6945, + "step": 28023 + }, + { + "epoch": 8.60159607120933, + "grad_norm": 0.17274139821529388, + "learning_rate": 5.043460420023999e-06, + "loss": 1.6952, 
+ "step": 28024 + }, + { + "epoch": 8.601903007980356, + "grad_norm": 0.2380651980638504, + "learning_rate": 5.0412851273205e-06, + "loss": 1.7412, + "step": 28025 + }, + { + "epoch": 8.602209944751381, + "grad_norm": 0.1543026566505432, + "learning_rate": 5.039110278925374e-06, + "loss": 1.7063, + "step": 28026 + }, + { + "epoch": 8.602516881522407, + "grad_norm": 0.15819939970970154, + "learning_rate": 5.036935874860111e-06, + "loss": 1.703, + "step": 28027 + }, + { + "epoch": 8.602823818293432, + "grad_norm": 0.20054341852664948, + "learning_rate": 5.034761915146208e-06, + "loss": 1.741, + "step": 28028 + }, + { + "epoch": 8.603130755064457, + "grad_norm": 0.1404278427362442, + "learning_rate": 5.032588399805127e-06, + "loss": 1.6822, + "step": 28029 + }, + { + "epoch": 8.603437691835483, + "grad_norm": 0.1339765340089798, + "learning_rate": 5.030415328858374e-06, + "loss": 1.6741, + "step": 28030 + }, + { + "epoch": 8.603744628606506, + "grad_norm": 0.17520250380039215, + "learning_rate": 5.028242702327413e-06, + "loss": 1.7655, + "step": 28031 + }, + { + "epoch": 8.604051565377532, + "grad_norm": 0.1701551079750061, + "learning_rate": 5.0260705202337165e-06, + "loss": 1.7219, + "step": 28032 + }, + { + "epoch": 8.604358502148557, + "grad_norm": 0.1882735937833786, + "learning_rate": 5.023898782598752e-06, + "loss": 1.7482, + "step": 28033 + }, + { + "epoch": 8.604665438919582, + "grad_norm": 0.1356845200061798, + "learning_rate": 5.021727489443984e-06, + "loss": 1.6647, + "step": 28034 + }, + { + "epoch": 8.604972375690608, + "grad_norm": 0.1686328649520874, + "learning_rate": 5.019556640790862e-06, + "loss": 1.7454, + "step": 28035 + }, + { + "epoch": 8.605279312461633, + "grad_norm": 0.16747170686721802, + "learning_rate": 5.017386236660848e-06, + "loss": 1.6747, + "step": 28036 + }, + { + "epoch": 8.605586249232658, + "grad_norm": 0.18954692780971527, + "learning_rate": 5.0152162770753795e-06, + "loss": 1.7351, + "step": 28037 + }, + { + "epoch": 
8.605893186003684, + "grad_norm": 0.19075840711593628, + "learning_rate": 5.013046762055929e-06, + "loss": 1.8257, + "step": 28038 + }, + { + "epoch": 8.606200122774709, + "grad_norm": 0.22513258457183838, + "learning_rate": 5.010877691623894e-06, + "loss": 1.7548, + "step": 28039 + }, + { + "epoch": 8.606507059545734, + "grad_norm": 0.15815886855125427, + "learning_rate": 5.00870906580076e-06, + "loss": 1.6793, + "step": 28040 + }, + { + "epoch": 8.60681399631676, + "grad_norm": 0.15267199277877808, + "learning_rate": 5.006540884607913e-06, + "loss": 1.6703, + "step": 28041 + }, + { + "epoch": 8.607120933087783, + "grad_norm": 0.14877180755138397, + "learning_rate": 5.00437314806681e-06, + "loss": 1.6859, + "step": 28042 + }, + { + "epoch": 8.607427869858808, + "grad_norm": 0.18780232965946198, + "learning_rate": 5.002205856198861e-06, + "loss": 1.7205, + "step": 28043 + }, + { + "epoch": 8.607734806629834, + "grad_norm": 0.1645117998123169, + "learning_rate": 5.000039009025492e-06, + "loss": 1.7726, + "step": 28044 + }, + { + "epoch": 8.60804174340086, + "grad_norm": 0.1449744552373886, + "learning_rate": 4.997872606568116e-06, + "loss": 1.6704, + "step": 28045 + }, + { + "epoch": 8.608348680171884, + "grad_norm": 0.15839919447898865, + "learning_rate": 4.9957066488481255e-06, + "loss": 1.6844, + "step": 28046 + }, + { + "epoch": 8.60865561694291, + "grad_norm": 0.16456182301044464, + "learning_rate": 4.993541135886948e-06, + "loss": 1.7141, + "step": 28047 + }, + { + "epoch": 8.608962553713935, + "grad_norm": 0.154433935880661, + "learning_rate": 4.991376067705977e-06, + "loss": 1.7077, + "step": 28048 + }, + { + "epoch": 8.60926949048496, + "grad_norm": 0.13631665706634521, + "learning_rate": 4.989211444326608e-06, + "loss": 1.6819, + "step": 28049 + }, + { + "epoch": 8.609576427255986, + "grad_norm": 0.13026617467403412, + "learning_rate": 4.987047265770234e-06, + "loss": 1.6929, + "step": 28050 + }, + { + "epoch": 8.609883364027011, + "grad_norm": 
0.1359538435935974, + "learning_rate": 4.984883532058243e-06, + "loss": 1.6534, + "step": 28051 + }, + { + "epoch": 8.610190300798035, + "grad_norm": 0.13192327320575714, + "learning_rate": 4.982720243212014e-06, + "loss": 1.694, + "step": 28052 + }, + { + "epoch": 8.61049723756906, + "grad_norm": 0.17191945016384125, + "learning_rate": 4.980557399252928e-06, + "loss": 1.7402, + "step": 28053 + }, + { + "epoch": 8.610804174340085, + "grad_norm": 0.12728241086006165, + "learning_rate": 4.978395000202363e-06, + "loss": 1.7231, + "step": 28054 + }, + { + "epoch": 8.61111111111111, + "grad_norm": 0.15232713520526886, + "learning_rate": 4.976233046081685e-06, + "loss": 1.6805, + "step": 28055 + }, + { + "epoch": 8.611418047882136, + "grad_norm": 0.13869190216064453, + "learning_rate": 4.974071536912256e-06, + "loss": 1.6771, + "step": 28056 + }, + { + "epoch": 8.611724984653161, + "grad_norm": 0.16099198162555695, + "learning_rate": 4.971910472715458e-06, + "loss": 1.6853, + "step": 28057 + }, + { + "epoch": 8.612031921424187, + "grad_norm": 0.147923544049263, + "learning_rate": 4.969749853512612e-06, + "loss": 1.7173, + "step": 28058 + }, + { + "epoch": 8.612338858195212, + "grad_norm": 0.16606341302394867, + "learning_rate": 4.967589679325102e-06, + "loss": 1.7262, + "step": 28059 + }, + { + "epoch": 8.612645794966237, + "grad_norm": 0.12743404507637024, + "learning_rate": 4.965429950174266e-06, + "loss": 1.6612, + "step": 28060 + }, + { + "epoch": 8.612952731737263, + "grad_norm": 0.12468522787094116, + "learning_rate": 4.9632706660814436e-06, + "loss": 1.6835, + "step": 28061 + }, + { + "epoch": 8.613259668508288, + "grad_norm": 0.16881446540355682, + "learning_rate": 4.9611118270679935e-06, + "loss": 1.7433, + "step": 28062 + }, + { + "epoch": 8.613566605279313, + "grad_norm": 0.2030627429485321, + "learning_rate": 4.958953433155211e-06, + "loss": 1.7739, + "step": 28063 + }, + { + "epoch": 8.613873542050337, + "grad_norm": 0.18076404929161072, + "learning_rate": 
4.956795484364457e-06, + "loss": 1.7316, + "step": 28064 + }, + { + "epoch": 8.614180478821362, + "grad_norm": 0.12519899010658264, + "learning_rate": 4.954637980717058e-06, + "loss": 1.6686, + "step": 28065 + }, + { + "epoch": 8.614487415592388, + "grad_norm": 0.16320455074310303, + "learning_rate": 4.95248092223432e-06, + "loss": 1.744, + "step": 28066 + }, + { + "epoch": 8.614794352363413, + "grad_norm": 0.18789352476596832, + "learning_rate": 4.950324308937576e-06, + "loss": 1.7619, + "step": 28067 + }, + { + "epoch": 8.615101289134438, + "grad_norm": 0.13703711330890656, + "learning_rate": 4.948168140848125e-06, + "loss": 1.6652, + "step": 28068 + }, + { + "epoch": 8.615408225905464, + "grad_norm": 0.16874989867210388, + "learning_rate": 4.946012417987289e-06, + "loss": 1.6783, + "step": 28069 + }, + { + "epoch": 8.615715162676489, + "grad_norm": 0.1780901849269867, + "learning_rate": 4.943857140376362e-06, + "loss": 1.7224, + "step": 28070 + }, + { + "epoch": 8.616022099447514, + "grad_norm": 0.19460240006446838, + "learning_rate": 4.941702308036644e-06, + "loss": 1.7314, + "step": 28071 + }, + { + "epoch": 8.61632903621854, + "grad_norm": 0.14954718947410583, + "learning_rate": 4.9395479209894404e-06, + "loss": 1.708, + "step": 28072 + }, + { + "epoch": 8.616635972989565, + "grad_norm": 0.17461352050304413, + "learning_rate": 4.937393979256016e-06, + "loss": 1.7458, + "step": 28073 + }, + { + "epoch": 8.616942909760589, + "grad_norm": 0.17088642716407776, + "learning_rate": 4.935240482857706e-06, + "loss": 1.7315, + "step": 28074 + }, + { + "epoch": 8.617249846531614, + "grad_norm": 0.1478833556175232, + "learning_rate": 4.933087431815736e-06, + "loss": 1.6646, + "step": 28075 + }, + { + "epoch": 8.61755678330264, + "grad_norm": 0.1860690414905548, + "learning_rate": 4.930934826151435e-06, + "loss": 1.6472, + "step": 28076 + }, + { + "epoch": 8.617863720073665, + "grad_norm": 0.23674537241458893, + "learning_rate": 4.928782665886028e-06, + "loss": 1.7677, + 
"step": 28077 + }, + { + "epoch": 8.61817065684469, + "grad_norm": 0.1638643592596054, + "learning_rate": 4.926630951040817e-06, + "loss": 1.7438, + "step": 28078 + }, + { + "epoch": 8.618477593615715, + "grad_norm": 0.1631689965724945, + "learning_rate": 4.924479681637067e-06, + "loss": 1.7167, + "step": 28079 + }, + { + "epoch": 8.61878453038674, + "grad_norm": 0.1493348926305771, + "learning_rate": 4.922328857696012e-06, + "loss": 1.6929, + "step": 28080 + }, + { + "epoch": 8.619091467157766, + "grad_norm": 0.1545657068490982, + "learning_rate": 4.920178479238935e-06, + "loss": 1.7048, + "step": 28081 + }, + { + "epoch": 8.619398403928791, + "grad_norm": 0.20011793076992035, + "learning_rate": 4.918028546287073e-06, + "loss": 1.726, + "step": 28082 + }, + { + "epoch": 8.619705340699817, + "grad_norm": 0.1705177128314972, + "learning_rate": 4.915879058861678e-06, + "loss": 1.7774, + "step": 28083 + }, + { + "epoch": 8.620012277470842, + "grad_norm": 0.15467505156993866, + "learning_rate": 4.913730016983992e-06, + "loss": 1.6933, + "step": 28084 + }, + { + "epoch": 8.620319214241865, + "grad_norm": 0.1319204419851303, + "learning_rate": 4.911581420675248e-06, + "loss": 1.7309, + "step": 28085 + }, + { + "epoch": 8.62062615101289, + "grad_norm": 0.163784459233284, + "learning_rate": 4.909433269956687e-06, + "loss": 1.7221, + "step": 28086 + }, + { + "epoch": 8.620933087783916, + "grad_norm": 0.15852972865104675, + "learning_rate": 4.907285564849534e-06, + "loss": 1.7018, + "step": 28087 + }, + { + "epoch": 8.621240024554941, + "grad_norm": 0.14603203535079956, + "learning_rate": 4.905138305375018e-06, + "loss": 1.6786, + "step": 28088 + }, + { + "epoch": 8.621546961325967, + "grad_norm": 0.14899590611457825, + "learning_rate": 4.902991491554348e-06, + "loss": 1.7039, + "step": 28089 + }, + { + "epoch": 8.621853898096992, + "grad_norm": 0.13559244573116302, + "learning_rate": 4.9008451234087426e-06, + "loss": 1.6831, + "step": 28090 + }, + { + "epoch": 
8.622160834868017, + "grad_norm": 0.1433703601360321, + "learning_rate": 4.898699200959439e-06, + "loss": 1.6567, + "step": 28091 + }, + { + "epoch": 8.622467771639043, + "grad_norm": 0.12275373190641403, + "learning_rate": 4.89655372422761e-06, + "loss": 1.6897, + "step": 28092 + }, + { + "epoch": 8.622774708410068, + "grad_norm": 0.12706153094768524, + "learning_rate": 4.894408693234487e-06, + "loss": 1.6287, + "step": 28093 + }, + { + "epoch": 8.623081645181093, + "grad_norm": 0.18988971412181854, + "learning_rate": 4.892264108001232e-06, + "loss": 1.7021, + "step": 28094 + }, + { + "epoch": 8.623388581952117, + "grad_norm": 0.17477858066558838, + "learning_rate": 4.8901199685490785e-06, + "loss": 1.7289, + "step": 28095 + }, + { + "epoch": 8.623695518723142, + "grad_norm": 0.16172516345977783, + "learning_rate": 4.887976274899203e-06, + "loss": 1.7265, + "step": 28096 + }, + { + "epoch": 8.624002455494168, + "grad_norm": 0.14414304494857788, + "learning_rate": 4.885833027072772e-06, + "loss": 1.6795, + "step": 28097 + }, + { + "epoch": 8.624309392265193, + "grad_norm": 0.17894591391086578, + "learning_rate": 4.8836902250909975e-06, + "loss": 1.7564, + "step": 28098 + }, + { + "epoch": 8.624616329036218, + "grad_norm": 0.141717329621315, + "learning_rate": 4.881547868975022e-06, + "loss": 1.7047, + "step": 28099 + }, + { + "epoch": 8.624923265807244, + "grad_norm": 0.2184356302022934, + "learning_rate": 4.879405958746047e-06, + "loss": 1.7447, + "step": 28100 + }, + { + "epoch": 8.625230202578269, + "grad_norm": 0.1739104986190796, + "learning_rate": 4.877264494425227e-06, + "loss": 1.7003, + "step": 28101 + }, + { + "epoch": 8.625537139349294, + "grad_norm": 0.17033645510673523, + "learning_rate": 4.875123476033721e-06, + "loss": 1.7019, + "step": 28102 + }, + { + "epoch": 8.62584407612032, + "grad_norm": 0.1620563268661499, + "learning_rate": 4.872982903592699e-06, + "loss": 1.6955, + "step": 28103 + }, + { + "epoch": 8.626151012891345, + "grad_norm": 
0.16582414507865906, + "learning_rate": 4.870842777123308e-06, + "loss": 1.6687, + "step": 28104 + }, + { + "epoch": 8.62645794966237, + "grad_norm": 0.1620030403137207, + "learning_rate": 4.8687030966466985e-06, + "loss": 1.6762, + "step": 28105 + }, + { + "epoch": 8.626764886433394, + "grad_norm": 0.16777098178863525, + "learning_rate": 4.86656386218402e-06, + "loss": 1.7117, + "step": 28106 + }, + { + "epoch": 8.62707182320442, + "grad_norm": 0.16074253618717194, + "learning_rate": 4.8644250737564014e-06, + "loss": 1.7205, + "step": 28107 + }, + { + "epoch": 8.627378759975445, + "grad_norm": 0.1414494514465332, + "learning_rate": 4.862286731385007e-06, + "loss": 1.6936, + "step": 28108 + }, + { + "epoch": 8.62768569674647, + "grad_norm": 0.206336110830307, + "learning_rate": 4.860148835090933e-06, + "loss": 1.7443, + "step": 28109 + }, + { + "epoch": 8.627992633517495, + "grad_norm": 0.16304929554462433, + "learning_rate": 4.858011384895345e-06, + "loss": 1.7525, + "step": 28110 + }, + { + "epoch": 8.62829957028852, + "grad_norm": 0.16839462518692017, + "learning_rate": 4.855874380819325e-06, + "loss": 1.7462, + "step": 28111 + }, + { + "epoch": 8.628606507059546, + "grad_norm": 0.16088010370731354, + "learning_rate": 4.8537378228840246e-06, + "loss": 1.7662, + "step": 28112 + }, + { + "epoch": 8.628913443830571, + "grad_norm": 0.1818089783191681, + "learning_rate": 4.851601711110559e-06, + "loss": 1.752, + "step": 28113 + }, + { + "epoch": 8.629220380601597, + "grad_norm": 0.19034543633460999, + "learning_rate": 4.8494660455200065e-06, + "loss": 1.8474, + "step": 28114 + }, + { + "epoch": 8.629527317372622, + "grad_norm": 0.15762893855571747, + "learning_rate": 4.847330826133517e-06, + "loss": 1.7615, + "step": 28115 + }, + { + "epoch": 8.629834254143645, + "grad_norm": 0.14152835309505463, + "learning_rate": 4.845196052972145e-06, + "loss": 1.702, + "step": 28116 + }, + { + "epoch": 8.63014119091467, + "grad_norm": 0.14755114912986755, + "learning_rate": 
4.8430617260570245e-06, + "loss": 1.7044, + "step": 28117 + }, + { + "epoch": 8.630448127685696, + "grad_norm": 0.1483534872531891, + "learning_rate": 4.840927845409238e-06, + "loss": 1.6798, + "step": 28118 + }, + { + "epoch": 8.630755064456721, + "grad_norm": 0.15526263415813446, + "learning_rate": 4.8387944110498685e-06, + "loss": 1.7316, + "step": 28119 + }, + { + "epoch": 8.631062001227747, + "grad_norm": 0.21519999206066132, + "learning_rate": 4.836661422999999e-06, + "loss": 1.763, + "step": 28120 + }, + { + "epoch": 8.631368937998772, + "grad_norm": 0.14445212483406067, + "learning_rate": 4.8345288812807144e-06, + "loss": 1.6894, + "step": 28121 + }, + { + "epoch": 8.631675874769797, + "grad_norm": 0.1482388973236084, + "learning_rate": 4.832396785913091e-06, + "loss": 1.6629, + "step": 28122 + }, + { + "epoch": 8.631982811540823, + "grad_norm": 0.17132261395454407, + "learning_rate": 4.830265136918194e-06, + "loss": 1.7254, + "step": 28123 + }, + { + "epoch": 8.632289748311848, + "grad_norm": 0.1567879170179367, + "learning_rate": 4.828133934317081e-06, + "loss": 1.711, + "step": 28124 + }, + { + "epoch": 8.632596685082873, + "grad_norm": 0.18352550268173218, + "learning_rate": 4.826003178130845e-06, + "loss": 1.6853, + "step": 28125 + }, + { + "epoch": 8.632903621853899, + "grad_norm": 0.17370788753032684, + "learning_rate": 4.823872868380502e-06, + "loss": 1.7716, + "step": 28126 + }, + { + "epoch": 8.633210558624924, + "grad_norm": 0.14186492562294006, + "learning_rate": 4.821743005087148e-06, + "loss": 1.7003, + "step": 28127 + }, + { + "epoch": 8.633517495395948, + "grad_norm": 0.1501329094171524, + "learning_rate": 4.819613588271788e-06, + "loss": 1.7249, + "step": 28128 + }, + { + "epoch": 8.633824432166973, + "grad_norm": 0.13921687006950378, + "learning_rate": 4.817484617955498e-06, + "loss": 1.6646, + "step": 28129 + }, + { + "epoch": 8.634131368937998, + "grad_norm": 0.14346352219581604, + "learning_rate": 4.815356094159318e-06, + "loss": 
1.6784, + "step": 28130 + }, + { + "epoch": 8.634438305709024, + "grad_norm": 0.1550782024860382, + "learning_rate": 4.813228016904247e-06, + "loss": 1.7052, + "step": 28131 + }, + { + "epoch": 8.634745242480049, + "grad_norm": 0.13514211773872375, + "learning_rate": 4.81110038621137e-06, + "loss": 1.7095, + "step": 28132 + }, + { + "epoch": 8.635052179251074, + "grad_norm": 0.14162956178188324, + "learning_rate": 4.8089732021016575e-06, + "loss": 1.7001, + "step": 28133 + }, + { + "epoch": 8.6353591160221, + "grad_norm": 0.14066293835639954, + "learning_rate": 4.806846464596177e-06, + "loss": 1.7037, + "step": 28134 + }, + { + "epoch": 8.635666052793125, + "grad_norm": 0.1918545961380005, + "learning_rate": 4.804720173715921e-06, + "loss": 1.7334, + "step": 28135 + }, + { + "epoch": 8.63597298956415, + "grad_norm": 0.13358080387115479, + "learning_rate": 4.802594329481913e-06, + "loss": 1.7063, + "step": 28136 + }, + { + "epoch": 8.636279926335176, + "grad_norm": 0.14988988637924194, + "learning_rate": 4.800468931915158e-06, + "loss": 1.6871, + "step": 28137 + }, + { + "epoch": 8.6365868631062, + "grad_norm": 0.1423332244157791, + "learning_rate": 4.798343981036663e-06, + "loss": 1.7133, + "step": 28138 + }, + { + "epoch": 8.636893799877225, + "grad_norm": 0.1372760534286499, + "learning_rate": 4.796219476867425e-06, + "loss": 1.6522, + "step": 28139 + }, + { + "epoch": 8.63720073664825, + "grad_norm": 0.14779186248779297, + "learning_rate": 4.794095419428446e-06, + "loss": 1.669, + "step": 28140 + }, + { + "epoch": 8.637507673419275, + "grad_norm": 0.1412673145532608, + "learning_rate": 4.7919718087406975e-06, + "loss": 1.6767, + "step": 28141 + }, + { + "epoch": 8.6378146101903, + "grad_norm": 0.13006745278835297, + "learning_rate": 4.789848644825201e-06, + "loss": 1.6804, + "step": 28142 + }, + { + "epoch": 8.638121546961326, + "grad_norm": 0.15673677623271942, + "learning_rate": 4.787725927702896e-06, + "loss": 1.7053, + "step": 28143 + }, + { + "epoch": 
8.638428483732351, + "grad_norm": 0.17693878710269928, + "learning_rate": 4.785603657394805e-06, + "loss": 1.7207, + "step": 28144 + }, + { + "epoch": 8.638735420503377, + "grad_norm": 0.15449829399585724, + "learning_rate": 4.7834818339218654e-06, + "loss": 1.7433, + "step": 28145 + }, + { + "epoch": 8.639042357274402, + "grad_norm": 0.14260755479335785, + "learning_rate": 4.781360457305062e-06, + "loss": 1.6707, + "step": 28146 + }, + { + "epoch": 8.639349294045427, + "grad_norm": 0.13936764001846313, + "learning_rate": 4.7792395275653715e-06, + "loss": 1.6749, + "step": 28147 + }, + { + "epoch": 8.639656230816453, + "grad_norm": 0.14369705319404602, + "learning_rate": 4.7771190447237215e-06, + "loss": 1.6943, + "step": 28148 + }, + { + "epoch": 8.639963167587476, + "grad_norm": 0.18439368903636932, + "learning_rate": 4.774999008801107e-06, + "loss": 1.7714, + "step": 28149 + }, + { + "epoch": 8.640270104358502, + "grad_norm": 0.15348297357559204, + "learning_rate": 4.772879419818438e-06, + "loss": 1.7315, + "step": 28150 + }, + { + "epoch": 8.640577041129527, + "grad_norm": 0.16643862426280975, + "learning_rate": 4.770760277796693e-06, + "loss": 1.7196, + "step": 28151 + }, + { + "epoch": 8.640883977900552, + "grad_norm": 0.16105540096759796, + "learning_rate": 4.768641582756811e-06, + "loss": 1.7504, + "step": 28152 + }, + { + "epoch": 8.641190914671578, + "grad_norm": 0.135291188955307, + "learning_rate": 4.766523334719714e-06, + "loss": 1.663, + "step": 28153 + }, + { + "epoch": 8.641497851442603, + "grad_norm": 0.15021322667598724, + "learning_rate": 4.764405533706351e-06, + "loss": 1.7318, + "step": 28154 + }, + { + "epoch": 8.641804788213628, + "grad_norm": 0.13949114084243774, + "learning_rate": 4.762288179737645e-06, + "loss": 1.6909, + "step": 28155 + }, + { + "epoch": 8.642111724984654, + "grad_norm": 0.17211735248565674, + "learning_rate": 4.760171272834524e-06, + "loss": 1.7539, + "step": 28156 + }, + { + "epoch": 8.642418661755679, + "grad_norm": 
0.12576675415039062, + "learning_rate": 4.7580548130179034e-06, + "loss": 1.6816, + "step": 28157 + }, + { + "epoch": 8.642725598526704, + "grad_norm": 0.18624669313430786, + "learning_rate": 4.755938800308696e-06, + "loss": 1.7976, + "step": 28158 + }, + { + "epoch": 8.643032535297728, + "grad_norm": 0.20610935986042023, + "learning_rate": 4.753823234727834e-06, + "loss": 1.7192, + "step": 28159 + }, + { + "epoch": 8.643339472068753, + "grad_norm": 0.15127690136432648, + "learning_rate": 4.751708116296194e-06, + "loss": 1.6918, + "step": 28160 + }, + { + "epoch": 8.643646408839778, + "grad_norm": 0.14993508160114288, + "learning_rate": 4.7495934450347115e-06, + "loss": 1.7075, + "step": 28161 + }, + { + "epoch": 8.643953345610804, + "grad_norm": 0.16896332800388336, + "learning_rate": 4.747479220964252e-06, + "loss": 1.6971, + "step": 28162 + }, + { + "epoch": 8.644260282381829, + "grad_norm": 0.20022685825824738, + "learning_rate": 4.745365444105737e-06, + "loss": 1.7479, + "step": 28163 + }, + { + "epoch": 8.644567219152854, + "grad_norm": 0.1731337308883667, + "learning_rate": 4.7432521144800565e-06, + "loss": 1.7384, + "step": 28164 + }, + { + "epoch": 8.64487415592388, + "grad_norm": 0.13517920672893524, + "learning_rate": 4.7411392321080605e-06, + "loss": 1.6611, + "step": 28165 + }, + { + "epoch": 8.645181092694905, + "grad_norm": 0.177021324634552, + "learning_rate": 4.739026797010676e-06, + "loss": 1.7779, + "step": 28166 + }, + { + "epoch": 8.64548802946593, + "grad_norm": 0.14956676959991455, + "learning_rate": 4.736914809208737e-06, + "loss": 1.6933, + "step": 28167 + }, + { + "epoch": 8.645794966236956, + "grad_norm": 0.15683145821094513, + "learning_rate": 4.734803268723143e-06, + "loss": 1.7067, + "step": 28168 + }, + { + "epoch": 8.646101903007981, + "grad_norm": 0.198720321059227, + "learning_rate": 4.732692175574755e-06, + "loss": 1.6567, + "step": 28169 + }, + { + "epoch": 8.646408839779006, + "grad_norm": 0.18899580836296082, + "learning_rate": 
4.730581529784439e-06, + "loss": 1.7069, + "step": 28170 + }, + { + "epoch": 8.64671577655003, + "grad_norm": 0.17795316874980927, + "learning_rate": 4.728471331373041e-06, + "loss": 1.6803, + "step": 28171 + }, + { + "epoch": 8.647022713321055, + "grad_norm": 0.18296107649803162, + "learning_rate": 4.7263615803614325e-06, + "loss": 1.7774, + "step": 28172 + }, + { + "epoch": 8.64732965009208, + "grad_norm": 0.13994812965393066, + "learning_rate": 4.724252276770453e-06, + "loss": 1.6826, + "step": 28173 + }, + { + "epoch": 8.647636586863106, + "grad_norm": 0.14969824254512787, + "learning_rate": 4.722143420620945e-06, + "loss": 1.6529, + "step": 28174 + }, + { + "epoch": 8.647943523634131, + "grad_norm": 0.14949028193950653, + "learning_rate": 4.7200350119337485e-06, + "loss": 1.7007, + "step": 28175 + }, + { + "epoch": 8.648250460405157, + "grad_norm": 0.14786000549793243, + "learning_rate": 4.71792705072972e-06, + "loss": 1.6999, + "step": 28176 + }, + { + "epoch": 8.648557397176182, + "grad_norm": 0.12665456533432007, + "learning_rate": 4.715819537029659e-06, + "loss": 1.6414, + "step": 28177 + }, + { + "epoch": 8.648864333947207, + "grad_norm": 0.19015786051750183, + "learning_rate": 4.713712470854437e-06, + "loss": 1.7328, + "step": 28178 + }, + { + "epoch": 8.649171270718233, + "grad_norm": 0.20775510370731354, + "learning_rate": 4.711605852224827e-06, + "loss": 1.7735, + "step": 28179 + }, + { + "epoch": 8.649478207489258, + "grad_norm": 0.13774684071540833, + "learning_rate": 4.709499681161678e-06, + "loss": 1.7139, + "step": 28180 + }, + { + "epoch": 8.649785144260282, + "grad_norm": 0.17355668544769287, + "learning_rate": 4.707393957685813e-06, + "loss": 1.7046, + "step": 28181 + }, + { + "epoch": 8.650092081031307, + "grad_norm": 0.21687985956668854, + "learning_rate": 4.70528868181801e-06, + "loss": 1.6736, + "step": 28182 + }, + { + "epoch": 8.650399017802332, + "grad_norm": 0.13978178799152374, + "learning_rate": 4.703183853579107e-06, + "loss": 
1.6841, + "step": 28183 + }, + { + "epoch": 8.650705954573358, + "grad_norm": 0.1476740539073944, + "learning_rate": 4.701079472989878e-06, + "loss": 1.6633, + "step": 28184 + }, + { + "epoch": 8.651012891344383, + "grad_norm": 0.17175909876823425, + "learning_rate": 4.698975540071138e-06, + "loss": 1.7059, + "step": 28185 + }, + { + "epoch": 8.651319828115408, + "grad_norm": 0.16164059937000275, + "learning_rate": 4.696872054843671e-06, + "loss": 1.7038, + "step": 28186 + }, + { + "epoch": 8.651626764886434, + "grad_norm": 0.1541287899017334, + "learning_rate": 4.694769017328271e-06, + "loss": 1.6583, + "step": 28187 + }, + { + "epoch": 8.651933701657459, + "grad_norm": 0.19379135966300964, + "learning_rate": 4.6926664275457165e-06, + "loss": 1.7375, + "step": 28188 + }, + { + "epoch": 8.652240638428484, + "grad_norm": 0.12427667528390884, + "learning_rate": 4.690564285516785e-06, + "loss": 1.6434, + "step": 28189 + }, + { + "epoch": 8.65254757519951, + "grad_norm": 0.15416522324085236, + "learning_rate": 4.6884625912622605e-06, + "loss": 1.7551, + "step": 28190 + }, + { + "epoch": 8.652854511970535, + "grad_norm": 0.1467018723487854, + "learning_rate": 4.6863613448029035e-06, + "loss": 1.704, + "step": 28191 + }, + { + "epoch": 8.653161448741558, + "grad_norm": 0.15078933537006378, + "learning_rate": 4.684260546159469e-06, + "loss": 1.7382, + "step": 28192 + }, + { + "epoch": 8.653468385512584, + "grad_norm": 0.13681283593177795, + "learning_rate": 4.682160195352758e-06, + "loss": 1.6732, + "step": 28193 + }, + { + "epoch": 8.65377532228361, + "grad_norm": 0.16412119567394257, + "learning_rate": 4.680060292403476e-06, + "loss": 1.7394, + "step": 28194 + }, + { + "epoch": 8.654082259054634, + "grad_norm": 0.14504186809062958, + "learning_rate": 4.677960837332423e-06, + "loss": 1.6602, + "step": 28195 + }, + { + "epoch": 8.65438919582566, + "grad_norm": 0.15267091989517212, + "learning_rate": 4.6758618301603105e-06, + "loss": 1.7041, + "step": 28196 + }, + { + 
"epoch": 8.654696132596685, + "grad_norm": 0.1807365119457245, + "learning_rate": 4.673763270907899e-06, + "loss": 1.7556, + "step": 28197 + }, + { + "epoch": 8.65500306936771, + "grad_norm": 0.16227813065052032, + "learning_rate": 4.671665159595939e-06, + "loss": 1.6976, + "step": 28198 + }, + { + "epoch": 8.655310006138736, + "grad_norm": 0.16095015406608582, + "learning_rate": 4.6695674962451305e-06, + "loss": 1.7078, + "step": 28199 + }, + { + "epoch": 8.655616942909761, + "grad_norm": 0.1518808901309967, + "learning_rate": 4.667470280876246e-06, + "loss": 1.6999, + "step": 28200 + }, + { + "epoch": 8.655923879680786, + "grad_norm": 0.13343939185142517, + "learning_rate": 4.665373513509974e-06, + "loss": 1.7186, + "step": 28201 + }, + { + "epoch": 8.65623081645181, + "grad_norm": 0.1545572429895401, + "learning_rate": 4.6632771941670535e-06, + "loss": 1.7281, + "step": 28202 + }, + { + "epoch": 8.656537753222835, + "grad_norm": 0.13296550512313843, + "learning_rate": 4.661181322868208e-06, + "loss": 1.6632, + "step": 28203 + }, + { + "epoch": 8.65684468999386, + "grad_norm": 0.15362371504306793, + "learning_rate": 4.659085899634141e-06, + "loss": 1.7415, + "step": 28204 + }, + { + "epoch": 8.657151626764886, + "grad_norm": 0.14498870074748993, + "learning_rate": 4.65699092448556e-06, + "loss": 1.7342, + "step": 28205 + }, + { + "epoch": 8.657458563535911, + "grad_norm": 0.19409331679344177, + "learning_rate": 4.654896397443176e-06, + "loss": 1.7562, + "step": 28206 + }, + { + "epoch": 8.657765500306937, + "grad_norm": 0.15481562912464142, + "learning_rate": 4.652802318527677e-06, + "loss": 1.6905, + "step": 28207 + }, + { + "epoch": 8.658072437077962, + "grad_norm": 0.17566657066345215, + "learning_rate": 4.650708687759769e-06, + "loss": 1.6902, + "step": 28208 + }, + { + "epoch": 8.658379373848987, + "grad_norm": 0.13994581997394562, + "learning_rate": 4.648615505160125e-06, + "loss": 1.672, + "step": 28209 + }, + { + "epoch": 8.658686310620013, + "grad_norm": 
0.34969639778137207, + "learning_rate": 4.646522770749467e-06, + "loss": 1.6959, + "step": 28210 + }, + { + "epoch": 8.658993247391038, + "grad_norm": 0.16637352108955383, + "learning_rate": 4.644430484548428e-06, + "loss": 1.7119, + "step": 28211 + }, + { + "epoch": 8.659300184162063, + "grad_norm": 0.16540484130382538, + "learning_rate": 4.642338646577738e-06, + "loss": 1.7541, + "step": 28212 + }, + { + "epoch": 8.659607120933089, + "grad_norm": 0.13890287280082703, + "learning_rate": 4.640247256858016e-06, + "loss": 1.7117, + "step": 28213 + }, + { + "epoch": 8.659914057704112, + "grad_norm": 0.1403251439332962, + "learning_rate": 4.63815631540997e-06, + "loss": 1.697, + "step": 28214 + }, + { + "epoch": 8.660220994475138, + "grad_norm": 0.13313040137290955, + "learning_rate": 4.63606582225426e-06, + "loss": 1.6587, + "step": 28215 + }, + { + "epoch": 8.660527931246163, + "grad_norm": 0.12887243926525116, + "learning_rate": 4.63397577741152e-06, + "loss": 1.6441, + "step": 28216 + }, + { + "epoch": 8.660834868017188, + "grad_norm": 0.15074272453784943, + "learning_rate": 4.631886180902434e-06, + "loss": 1.7176, + "step": 28217 + }, + { + "epoch": 8.661141804788214, + "grad_norm": 0.12572859227657318, + "learning_rate": 4.629797032747624e-06, + "loss": 1.6779, + "step": 28218 + }, + { + "epoch": 8.661448741559239, + "grad_norm": 0.1607646495103836, + "learning_rate": 4.627708332967762e-06, + "loss": 1.747, + "step": 28219 + }, + { + "epoch": 8.661755678330264, + "grad_norm": 0.14080339670181274, + "learning_rate": 4.625620081583482e-06, + "loss": 1.7063, + "step": 28220 + }, + { + "epoch": 8.66206261510129, + "grad_norm": 0.17140309512615204, + "learning_rate": 4.623532278615411e-06, + "loss": 1.7265, + "step": 28221 + }, + { + "epoch": 8.662369551872315, + "grad_norm": 0.1564357578754425, + "learning_rate": 4.621444924084195e-06, + "loss": 1.7265, + "step": 28222 + }, + { + "epoch": 8.66267648864334, + "grad_norm": 0.20058012008666992, + "learning_rate": 
4.619358018010461e-06, + "loss": 1.7824, + "step": 28223 + }, + { + "epoch": 8.662983425414364, + "grad_norm": 0.16060246527194977, + "learning_rate": 4.617271560414827e-06, + "loss": 1.7329, + "step": 28224 + }, + { + "epoch": 8.66329036218539, + "grad_norm": 0.1967579573392868, + "learning_rate": 4.6151855513179136e-06, + "loss": 1.7386, + "step": 28225 + }, + { + "epoch": 8.663597298956415, + "grad_norm": 0.14853200316429138, + "learning_rate": 4.613099990740338e-06, + "loss": 1.6727, + "step": 28226 + }, + { + "epoch": 8.66390423572744, + "grad_norm": 0.1625850945711136, + "learning_rate": 4.611014878702713e-06, + "loss": 1.7074, + "step": 28227 + }, + { + "epoch": 8.664211172498465, + "grad_norm": 0.15605251491069794, + "learning_rate": 4.608930215225627e-06, + "loss": 1.7092, + "step": 28228 + }, + { + "epoch": 8.66451810926949, + "grad_norm": 0.14355498552322388, + "learning_rate": 4.606846000329723e-06, + "loss": 1.6819, + "step": 28229 + }, + { + "epoch": 8.664825046040516, + "grad_norm": 0.16151221096515656, + "learning_rate": 4.604762234035548e-06, + "loss": 1.7251, + "step": 28230 + }, + { + "epoch": 8.665131982811541, + "grad_norm": 0.1165589988231659, + "learning_rate": 4.60267891636374e-06, + "loss": 1.644, + "step": 28231 + }, + { + "epoch": 8.665438919582567, + "grad_norm": 0.13766367733478546, + "learning_rate": 4.6005960473348594e-06, + "loss": 1.6526, + "step": 28232 + }, + { + "epoch": 8.665745856353592, + "grad_norm": 0.15400783717632294, + "learning_rate": 4.598513626969486e-06, + "loss": 1.7356, + "step": 28233 + }, + { + "epoch": 8.666052793124617, + "grad_norm": 0.1635274887084961, + "learning_rate": 4.596431655288236e-06, + "loss": 1.6846, + "step": 28234 + }, + { + "epoch": 8.66635972989564, + "grad_norm": 0.17310741543769836, + "learning_rate": 4.5943501323116365e-06, + "loss": 1.7321, + "step": 28235 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.14390932023525238, + "learning_rate": 4.592269058060295e-06, + "loss": 1.6606, + 
"step": 28236 + }, + { + "epoch": 8.666973603437691, + "grad_norm": 0.15254996716976166, + "learning_rate": 4.590188432554759e-06, + "loss": 1.6796, + "step": 28237 + }, + { + "epoch": 8.667280540208717, + "grad_norm": 0.16224564611911774, + "learning_rate": 4.588108255815599e-06, + "loss": 1.7139, + "step": 28238 + }, + { + "epoch": 8.667587476979742, + "grad_norm": 0.14472807943820953, + "learning_rate": 4.586028527863373e-06, + "loss": 1.681, + "step": 28239 + }, + { + "epoch": 8.667894413750767, + "grad_norm": 0.17748364806175232, + "learning_rate": 4.583949248718627e-06, + "loss": 1.7205, + "step": 28240 + }, + { + "epoch": 8.668201350521793, + "grad_norm": 0.16917170584201813, + "learning_rate": 4.581870418401918e-06, + "loss": 1.7475, + "step": 28241 + }, + { + "epoch": 8.668508287292818, + "grad_norm": 0.15715333819389343, + "learning_rate": 4.579792036933784e-06, + "loss": 1.6988, + "step": 28242 + }, + { + "epoch": 8.668815224063843, + "grad_norm": 0.18384969234466553, + "learning_rate": 4.577714104334768e-06, + "loss": 1.715, + "step": 28243 + }, + { + "epoch": 8.669122160834869, + "grad_norm": 0.20845188200473785, + "learning_rate": 4.575636620625401e-06, + "loss": 1.784, + "step": 28244 + }, + { + "epoch": 8.669429097605892, + "grad_norm": 0.16388222575187683, + "learning_rate": 4.5735595858262095e-06, + "loss": 1.7091, + "step": 28245 + }, + { + "epoch": 8.669736034376918, + "grad_norm": 0.27372440695762634, + "learning_rate": 4.571482999957744e-06, + "loss": 1.6903, + "step": 28246 + }, + { + "epoch": 8.670042971147943, + "grad_norm": 0.14129513502120972, + "learning_rate": 4.569406863040493e-06, + "loss": 1.692, + "step": 28247 + }, + { + "epoch": 8.670349907918968, + "grad_norm": 0.1707242876291275, + "learning_rate": 4.567331175095013e-06, + "loss": 1.7542, + "step": 28248 + }, + { + "epoch": 8.670656844689994, + "grad_norm": 0.16061219573020935, + "learning_rate": 4.565255936141783e-06, + "loss": 1.7086, + "step": 28249 + }, + { + "epoch": 
8.670963781461019, + "grad_norm": 0.186256542801857, + "learning_rate": 4.5631811462013116e-06, + "loss": 1.7298, + "step": 28250 + }, + { + "epoch": 8.671270718232044, + "grad_norm": 0.19365312159061432, + "learning_rate": 4.561106805294141e-06, + "loss": 1.7714, + "step": 28251 + }, + { + "epoch": 8.67157765500307, + "grad_norm": 0.12306032329797745, + "learning_rate": 4.55903291344072e-06, + "loss": 1.7148, + "step": 28252 + }, + { + "epoch": 8.671884591774095, + "grad_norm": 0.14681962132453918, + "learning_rate": 4.556959470661592e-06, + "loss": 1.6909, + "step": 28253 + }, + { + "epoch": 8.67219152854512, + "grad_norm": 0.22181211411952972, + "learning_rate": 4.554886476977205e-06, + "loss": 1.7747, + "step": 28254 + }, + { + "epoch": 8.672498465316146, + "grad_norm": 0.15168124437332153, + "learning_rate": 4.5528139324080784e-06, + "loss": 1.7206, + "step": 28255 + }, + { + "epoch": 8.67280540208717, + "grad_norm": 0.15613441169261932, + "learning_rate": 4.550741836974676e-06, + "loss": 1.7062, + "step": 28256 + }, + { + "epoch": 8.673112338858195, + "grad_norm": 0.1939506232738495, + "learning_rate": 4.548670190697485e-06, + "loss": 1.747, + "step": 28257 + }, + { + "epoch": 8.67341927562922, + "grad_norm": 0.15883082151412964, + "learning_rate": 4.5465989935969785e-06, + "loss": 1.7169, + "step": 28258 + }, + { + "epoch": 8.673726212400245, + "grad_norm": 0.14583253860473633, + "learning_rate": 4.5445282456936185e-06, + "loss": 1.6918, + "step": 28259 + }, + { + "epoch": 8.67403314917127, + "grad_norm": 0.12797339260578156, + "learning_rate": 4.5424579470078725e-06, + "loss": 1.6791, + "step": 28260 + }, + { + "epoch": 8.674340085942296, + "grad_norm": 0.18248072266578674, + "learning_rate": 4.5403880975602e-06, + "loss": 1.7338, + "step": 28261 + }, + { + "epoch": 8.674647022713321, + "grad_norm": 0.1547573208808899, + "learning_rate": 4.538318697371047e-06, + "loss": 1.6259, + "step": 28262 + }, + { + "epoch": 8.674953959484347, + "grad_norm": 
0.18609635531902313, + "learning_rate": 4.536249746460897e-06, + "loss": 1.6943, + "step": 28263 + }, + { + "epoch": 8.675260896255372, + "grad_norm": 0.15615214407444, + "learning_rate": 4.534181244850161e-06, + "loss": 1.6851, + "step": 28264 + }, + { + "epoch": 8.675567833026397, + "grad_norm": 0.17061203718185425, + "learning_rate": 4.532113192559296e-06, + "loss": 1.7612, + "step": 28265 + }, + { + "epoch": 8.675874769797423, + "grad_norm": 0.17611360549926758, + "learning_rate": 4.530045589608739e-06, + "loss": 1.7109, + "step": 28266 + }, + { + "epoch": 8.676181706568446, + "grad_norm": 0.14381951093673706, + "learning_rate": 4.527978436018915e-06, + "loss": 1.6914, + "step": 28267 + }, + { + "epoch": 8.676488643339471, + "grad_norm": 0.18309952318668365, + "learning_rate": 4.525911731810273e-06, + "loss": 1.8044, + "step": 28268 + }, + { + "epoch": 8.676795580110497, + "grad_norm": 0.16398122906684875, + "learning_rate": 4.523845477003208e-06, + "loss": 1.7002, + "step": 28269 + }, + { + "epoch": 8.677102516881522, + "grad_norm": 0.12263865768909454, + "learning_rate": 4.521779671618176e-06, + "loss": 1.6777, + "step": 28270 + }, + { + "epoch": 8.677409453652547, + "grad_norm": 0.17702268064022064, + "learning_rate": 4.519714315675555e-06, + "loss": 1.697, + "step": 28271 + }, + { + "epoch": 8.677716390423573, + "grad_norm": 0.1558506339788437, + "learning_rate": 4.517649409195779e-06, + "loss": 1.7151, + "step": 28272 + }, + { + "epoch": 8.678023327194598, + "grad_norm": 0.19969215989112854, + "learning_rate": 4.5155849521992536e-06, + "loss": 1.7952, + "step": 28273 + }, + { + "epoch": 8.678330263965623, + "grad_norm": 0.14770828187465668, + "learning_rate": 4.513520944706379e-06, + "loss": 1.6846, + "step": 28274 + }, + { + "epoch": 8.678637200736649, + "grad_norm": 0.22692953050136566, + "learning_rate": 4.511457386737544e-06, + "loss": 1.7599, + "step": 28275 + }, + { + "epoch": 8.678944137507674, + "grad_norm": 0.1689091920852661, + "learning_rate": 
4.509394278313156e-06, + "loss": 1.67, + "step": 28276 + }, + { + "epoch": 8.6792510742787, + "grad_norm": 0.12909743189811707, + "learning_rate": 4.507331619453592e-06, + "loss": 1.7062, + "step": 28277 + }, + { + "epoch": 8.679558011049723, + "grad_norm": 0.15877538919448853, + "learning_rate": 4.505269410179241e-06, + "loss": 1.688, + "step": 28278 + }, + { + "epoch": 8.679864947820748, + "grad_norm": 0.13565565645694733, + "learning_rate": 4.503207650510477e-06, + "loss": 1.6742, + "step": 28279 + }, + { + "epoch": 8.680171884591774, + "grad_norm": 0.1718231737613678, + "learning_rate": 4.501146340467699e-06, + "loss": 1.71, + "step": 28280 + }, + { + "epoch": 8.680478821362799, + "grad_norm": 0.14713016152381897, + "learning_rate": 4.499085480071252e-06, + "loss": 1.698, + "step": 28281 + }, + { + "epoch": 8.680785758133824, + "grad_norm": 0.15546689927577972, + "learning_rate": 4.49702506934151e-06, + "loss": 1.6863, + "step": 28282 + }, + { + "epoch": 8.68109269490485, + "grad_norm": 0.1528242826461792, + "learning_rate": 4.494965108298832e-06, + "loss": 1.7236, + "step": 28283 + }, + { + "epoch": 8.681399631675875, + "grad_norm": 0.14601372182369232, + "learning_rate": 4.4929055969635755e-06, + "loss": 1.7008, + "step": 28284 + }, + { + "epoch": 8.6817065684469, + "grad_norm": 0.18398553133010864, + "learning_rate": 4.490846535356119e-06, + "loss": 1.7117, + "step": 28285 + }, + { + "epoch": 8.682013505217926, + "grad_norm": 0.16242702305316925, + "learning_rate": 4.4887879234967675e-06, + "loss": 1.7204, + "step": 28286 + }, + { + "epoch": 8.682320441988951, + "grad_norm": 0.11883296817541122, + "learning_rate": 4.486729761405911e-06, + "loss": 1.665, + "step": 28287 + }, + { + "epoch": 8.682627378759975, + "grad_norm": 0.157135009765625, + "learning_rate": 4.484672049103844e-06, + "loss": 1.7438, + "step": 28288 + }, + { + "epoch": 8.682934315531, + "grad_norm": 0.17938226461410522, + "learning_rate": 4.482614786610939e-06, + "loss": 1.7022, + "step": 
28289 + }, + { + "epoch": 8.683241252302025, + "grad_norm": 0.20547567307949066, + "learning_rate": 4.480557973947514e-06, + "loss": 1.7818, + "step": 28290 + }, + { + "epoch": 8.68354818907305, + "grad_norm": 0.2329530566930771, + "learning_rate": 4.478501611133889e-06, + "loss": 1.7702, + "step": 28291 + }, + { + "epoch": 8.683855125844076, + "grad_norm": 0.1893717646598816, + "learning_rate": 4.476445698190396e-06, + "loss": 1.7614, + "step": 28292 + }, + { + "epoch": 8.684162062615101, + "grad_norm": 0.17520616948604584, + "learning_rate": 4.474390235137349e-06, + "loss": 1.7585, + "step": 28293 + }, + { + "epoch": 8.684468999386127, + "grad_norm": 0.14743252098560333, + "learning_rate": 4.4723352219950605e-06, + "loss": 1.7008, + "step": 28294 + }, + { + "epoch": 8.684775936157152, + "grad_norm": 0.1734410971403122, + "learning_rate": 4.470280658783843e-06, + "loss": 1.6979, + "step": 28295 + }, + { + "epoch": 8.685082872928177, + "grad_norm": 0.1811109185218811, + "learning_rate": 4.468226545523985e-06, + "loss": 1.7124, + "step": 28296 + }, + { + "epoch": 8.685389809699203, + "grad_norm": 0.12056677043437958, + "learning_rate": 4.466172882235819e-06, + "loss": 1.6642, + "step": 28297 + }, + { + "epoch": 8.685696746470228, + "grad_norm": 0.159573495388031, + "learning_rate": 4.464119668939609e-06, + "loss": 1.7055, + "step": 28298 + }, + { + "epoch": 8.686003683241251, + "grad_norm": 0.17341920733451843, + "learning_rate": 4.46206690565566e-06, + "loss": 1.7036, + "step": 28299 + }, + { + "epoch": 8.686310620012277, + "grad_norm": 0.1660631000995636, + "learning_rate": 4.46001459240426e-06, + "loss": 1.7303, + "step": 28300 + }, + { + "epoch": 8.686617556783302, + "grad_norm": 0.18377192318439484, + "learning_rate": 4.4579627292056724e-06, + "loss": 1.7301, + "step": 28301 + }, + { + "epoch": 8.686924493554327, + "grad_norm": 0.13730384409427643, + "learning_rate": 4.455911316080213e-06, + "loss": 1.6399, + "step": 28302 + }, + { + "epoch": 8.687231430325353, 
+ "grad_norm": 0.25353705883026123, + "learning_rate": 4.453860353048112e-06, + "loss": 1.7682, + "step": 28303 + }, + { + "epoch": 8.687538367096378, + "grad_norm": 0.15051604807376862, + "learning_rate": 4.451809840129673e-06, + "loss": 1.7268, + "step": 28304 + }, + { + "epoch": 8.687845303867404, + "grad_norm": 0.2090475857257843, + "learning_rate": 4.449759777345131e-06, + "loss": 1.7697, + "step": 28305 + }, + { + "epoch": 8.688152240638429, + "grad_norm": 0.13042283058166504, + "learning_rate": 4.4477101647147745e-06, + "loss": 1.667, + "step": 28306 + }, + { + "epoch": 8.688459177409454, + "grad_norm": 0.1518186628818512, + "learning_rate": 4.445661002258838e-06, + "loss": 1.7095, + "step": 28307 + }, + { + "epoch": 8.68876611418048, + "grad_norm": 0.13992765545845032, + "learning_rate": 4.443612289997584e-06, + "loss": 1.6761, + "step": 28308 + }, + { + "epoch": 8.689073050951503, + "grad_norm": 0.17726075649261475, + "learning_rate": 4.44156402795125e-06, + "loss": 1.7444, + "step": 28309 + }, + { + "epoch": 8.689379987722528, + "grad_norm": 0.15143834054470062, + "learning_rate": 4.439516216140088e-06, + "loss": 1.7078, + "step": 28310 + }, + { + "epoch": 8.689686924493554, + "grad_norm": 0.17791767418384552, + "learning_rate": 4.437468854584326e-06, + "loss": 1.7402, + "step": 28311 + }, + { + "epoch": 8.689993861264579, + "grad_norm": 0.19582994282245636, + "learning_rate": 4.435421943304208e-06, + "loss": 1.757, + "step": 28312 + }, + { + "epoch": 8.690300798035604, + "grad_norm": 0.19730351865291595, + "learning_rate": 4.43337548231994e-06, + "loss": 1.6982, + "step": 28313 + }, + { + "epoch": 8.69060773480663, + "grad_norm": 0.16093717515468597, + "learning_rate": 4.43132947165179e-06, + "loss": 1.7116, + "step": 28314 + }, + { + "epoch": 8.690914671577655, + "grad_norm": 0.16639035940170288, + "learning_rate": 4.429283911319937e-06, + "loss": 1.7166, + "step": 28315 + }, + { + "epoch": 8.69122160834868, + "grad_norm": 0.13834281265735626, + 
"learning_rate": 4.427238801344608e-06, + "loss": 1.7058, + "step": 28316 + }, + { + "epoch": 8.691528545119706, + "grad_norm": 0.1761016994714737, + "learning_rate": 4.4251941417460194e-06, + "loss": 1.7155, + "step": 28317 + }, + { + "epoch": 8.691835481890731, + "grad_norm": 0.17754366993904114, + "learning_rate": 4.423149932544363e-06, + "loss": 1.768, + "step": 28318 + }, + { + "epoch": 8.692142418661756, + "grad_norm": 0.1563618779182434, + "learning_rate": 4.42110617375987e-06, + "loss": 1.706, + "step": 28319 + }, + { + "epoch": 8.692449355432782, + "grad_norm": 0.16851158440113068, + "learning_rate": 4.419062865412704e-06, + "loss": 1.7084, + "step": 28320 + }, + { + "epoch": 8.692756292203805, + "grad_norm": 0.16056731343269348, + "learning_rate": 4.4170200075230925e-06, + "loss": 1.6771, + "step": 28321 + }, + { + "epoch": 8.69306322897483, + "grad_norm": 0.17098097503185272, + "learning_rate": 4.414977600111192e-06, + "loss": 1.712, + "step": 28322 + }, + { + "epoch": 8.693370165745856, + "grad_norm": 0.17442475259304047, + "learning_rate": 4.412935643197208e-06, + "loss": 1.7725, + "step": 28323 + }, + { + "epoch": 8.693677102516881, + "grad_norm": 0.16090531647205353, + "learning_rate": 4.410894136801308e-06, + "loss": 1.6996, + "step": 28324 + }, + { + "epoch": 8.693984039287907, + "grad_norm": 0.17448033392429352, + "learning_rate": 4.408853080943681e-06, + "loss": 1.6934, + "step": 28325 + }, + { + "epoch": 8.694290976058932, + "grad_norm": 0.15201367437839508, + "learning_rate": 4.406812475644484e-06, + "loss": 1.6671, + "step": 28326 + }, + { + "epoch": 8.694597912829957, + "grad_norm": 0.15211759507656097, + "learning_rate": 4.404772320923889e-06, + "loss": 1.7281, + "step": 28327 + }, + { + "epoch": 8.694904849600983, + "grad_norm": 0.1757364720106125, + "learning_rate": 4.402732616802063e-06, + "loss": 1.7085, + "step": 28328 + }, + { + "epoch": 8.695211786372008, + "grad_norm": 0.17995139956474304, + "learning_rate": 4.400693363299152e-06, + 
"loss": 1.7335, + "step": 28329 + }, + { + "epoch": 8.695518723143033, + "grad_norm": 0.1404990553855896, + "learning_rate": 4.398654560435312e-06, + "loss": 1.7102, + "step": 28330 + }, + { + "epoch": 8.695825659914057, + "grad_norm": 0.17141692340373993, + "learning_rate": 4.396616208230708e-06, + "loss": 1.7195, + "step": 28331 + }, + { + "epoch": 8.696132596685082, + "grad_norm": 0.17162097990512848, + "learning_rate": 4.394578306705471e-06, + "loss": 1.7075, + "step": 28332 + }, + { + "epoch": 8.696439533456108, + "grad_norm": 0.18884550034999847, + "learning_rate": 4.392540855879734e-06, + "loss": 1.72, + "step": 28333 + }, + { + "epoch": 8.696746470227133, + "grad_norm": 0.21365602314472198, + "learning_rate": 4.3905038557736425e-06, + "loss": 1.8024, + "step": 28334 + }, + { + "epoch": 8.697053406998158, + "grad_norm": 0.1939813494682312, + "learning_rate": 4.388467306407318e-06, + "loss": 1.6694, + "step": 28335 + }, + { + "epoch": 8.697360343769184, + "grad_norm": 0.20518864691257477, + "learning_rate": 4.386431207800906e-06, + "loss": 1.7708, + "step": 28336 + }, + { + "epoch": 8.697667280540209, + "grad_norm": 0.16070924699306488, + "learning_rate": 4.3843955599745025e-06, + "loss": 1.7496, + "step": 28337 + }, + { + "epoch": 8.697974217311234, + "grad_norm": 0.17010091245174408, + "learning_rate": 4.3823603629482514e-06, + "loss": 1.6996, + "step": 28338 + }, + { + "epoch": 8.69828115408226, + "grad_norm": 0.14453141391277313, + "learning_rate": 4.380325616742237e-06, + "loss": 1.7032, + "step": 28339 + }, + { + "epoch": 8.698588090853285, + "grad_norm": 0.1959836632013321, + "learning_rate": 4.378291321376593e-06, + "loss": 1.7861, + "step": 28340 + }, + { + "epoch": 8.69889502762431, + "grad_norm": 0.12473960220813751, + "learning_rate": 4.376257476871415e-06, + "loss": 1.6465, + "step": 28341 + }, + { + "epoch": 8.699201964395334, + "grad_norm": 0.17088855803012848, + "learning_rate": 4.374224083246797e-06, + "loss": 1.7701, + "step": 28342 + }, + { 
+ "epoch": 8.699508901166359, + "grad_norm": 0.17513783276081085, + "learning_rate": 4.372191140522846e-06, + "loss": 1.7107, + "step": 28343 + }, + { + "epoch": 8.699815837937384, + "grad_norm": 0.15522748231887817, + "learning_rate": 4.370158648719641e-06, + "loss": 1.6961, + "step": 28344 + }, + { + "epoch": 8.70012277470841, + "grad_norm": 0.1434583216905594, + "learning_rate": 4.36812660785727e-06, + "loss": 1.6927, + "step": 28345 + }, + { + "epoch": 8.700429711479435, + "grad_norm": 0.1571590155363083, + "learning_rate": 4.366095017955824e-06, + "loss": 1.6747, + "step": 28346 + }, + { + "epoch": 8.70073664825046, + "grad_norm": 0.15448859333992004, + "learning_rate": 4.364063879035357e-06, + "loss": 1.7052, + "step": 28347 + }, + { + "epoch": 8.701043585021486, + "grad_norm": 0.18512596189975739, + "learning_rate": 4.362033191115983e-06, + "loss": 1.7516, + "step": 28348 + }, + { + "epoch": 8.701350521792511, + "grad_norm": 0.14646342396736145, + "learning_rate": 4.360002954217734e-06, + "loss": 1.7152, + "step": 28349 + }, + { + "epoch": 8.701657458563536, + "grad_norm": 0.15107101202011108, + "learning_rate": 4.357973168360691e-06, + "loss": 1.6659, + "step": 28350 + }, + { + "epoch": 8.701964395334562, + "grad_norm": 0.1887415051460266, + "learning_rate": 4.355943833564908e-06, + "loss": 1.7506, + "step": 28351 + }, + { + "epoch": 8.702271332105585, + "grad_norm": 0.17195916175842285, + "learning_rate": 4.353914949850424e-06, + "loss": 1.7571, + "step": 28352 + }, + { + "epoch": 8.70257826887661, + "grad_norm": 0.1679403930902481, + "learning_rate": 4.35188651723733e-06, + "loss": 1.7321, + "step": 28353 + }, + { + "epoch": 8.702885205647636, + "grad_norm": 0.1917678713798523, + "learning_rate": 4.349858535745633e-06, + "loss": 1.7387, + "step": 28354 + }, + { + "epoch": 8.703192142418661, + "grad_norm": 0.1321115791797638, + "learning_rate": 4.347831005395408e-06, + "loss": 1.7221, + "step": 28355 + }, + { + "epoch": 8.703499079189687, + "grad_norm": 
0.14510731399059296, + "learning_rate": 4.345803926206654e-06, + "loss": 1.6905, + "step": 28356 + }, + { + "epoch": 8.703806015960712, + "grad_norm": 0.158061221241951, + "learning_rate": 4.343777298199431e-06, + "loss": 1.6605, + "step": 28357 + }, + { + "epoch": 8.704112952731737, + "grad_norm": 0.15366631746292114, + "learning_rate": 4.341751121393767e-06, + "loss": 1.7069, + "step": 28358 + }, + { + "epoch": 8.704419889502763, + "grad_norm": 0.20126941800117493, + "learning_rate": 4.339725395809674e-06, + "loss": 1.7704, + "step": 28359 + }, + { + "epoch": 8.704726826273788, + "grad_norm": 0.14276063442230225, + "learning_rate": 4.337700121467181e-06, + "loss": 1.6704, + "step": 28360 + }, + { + "epoch": 8.705033763044813, + "grad_norm": 0.15362146496772766, + "learning_rate": 4.335675298386293e-06, + "loss": 1.6486, + "step": 28361 + }, + { + "epoch": 8.705340699815839, + "grad_norm": 0.16178739070892334, + "learning_rate": 4.333650926587035e-06, + "loss": 1.703, + "step": 28362 + }, + { + "epoch": 8.705647636586864, + "grad_norm": 0.16188332438468933, + "learning_rate": 4.331627006089395e-06, + "loss": 1.6912, + "step": 28363 + }, + { + "epoch": 8.705954573357888, + "grad_norm": 0.1567341834306717, + "learning_rate": 4.3296035369133846e-06, + "loss": 1.6767, + "step": 28364 + }, + { + "epoch": 8.706261510128913, + "grad_norm": 0.16202545166015625, + "learning_rate": 4.327580519079011e-06, + "loss": 1.6836, + "step": 28365 + }, + { + "epoch": 8.706568446899938, + "grad_norm": 0.17161825299263, + "learning_rate": 4.325557952606252e-06, + "loss": 1.7271, + "step": 28366 + }, + { + "epoch": 8.706875383670964, + "grad_norm": 0.14774417877197266, + "learning_rate": 4.323535837515097e-06, + "loss": 1.6815, + "step": 28367 + }, + { + "epoch": 8.707182320441989, + "grad_norm": 0.19654276967048645, + "learning_rate": 4.321514173825531e-06, + "loss": 1.6633, + "step": 28368 + }, + { + "epoch": 8.707489257213014, + "grad_norm": 0.18064813315868378, + "learning_rate": 
4.319492961557531e-06, + "loss": 1.7222, + "step": 28369 + }, + { + "epoch": 8.70779619398404, + "grad_norm": 0.14830774068832397, + "learning_rate": 4.317472200731087e-06, + "loss": 1.6921, + "step": 28370 + }, + { + "epoch": 8.708103130755065, + "grad_norm": 0.17077864706516266, + "learning_rate": 4.315451891366146e-06, + "loss": 1.6785, + "step": 28371 + }, + { + "epoch": 8.70841006752609, + "grad_norm": 0.1815696656703949, + "learning_rate": 4.313432033482701e-06, + "loss": 1.6865, + "step": 28372 + }, + { + "epoch": 8.708717004297116, + "grad_norm": 0.17936676740646362, + "learning_rate": 4.311412627100686e-06, + "loss": 1.7477, + "step": 28373 + }, + { + "epoch": 8.70902394106814, + "grad_norm": 0.16955824196338654, + "learning_rate": 4.30939367224007e-06, + "loss": 1.6906, + "step": 28374 + }, + { + "epoch": 8.709330877839164, + "grad_norm": 0.14489254355430603, + "learning_rate": 4.307375168920813e-06, + "loss": 1.6777, + "step": 28375 + }, + { + "epoch": 8.70963781461019, + "grad_norm": 0.18070191144943237, + "learning_rate": 4.305357117162856e-06, + "loss": 1.6955, + "step": 28376 + }, + { + "epoch": 8.709944751381215, + "grad_norm": 0.18469898402690887, + "learning_rate": 4.3033395169861375e-06, + "loss": 1.7364, + "step": 28377 + }, + { + "epoch": 8.71025168815224, + "grad_norm": 0.13740944862365723, + "learning_rate": 4.301322368410604e-06, + "loss": 1.6781, + "step": 28378 + }, + { + "epoch": 8.710558624923266, + "grad_norm": 0.16305440664291382, + "learning_rate": 4.299305671456189e-06, + "loss": 1.7277, + "step": 28379 + }, + { + "epoch": 8.710865561694291, + "grad_norm": 0.15460261702537537, + "learning_rate": 4.29728942614282e-06, + "loss": 1.7536, + "step": 28380 + }, + { + "epoch": 8.711172498465316, + "grad_norm": 0.13714177906513214, + "learning_rate": 4.2952736324904205e-06, + "loss": 1.7417, + "step": 28381 + }, + { + "epoch": 8.711479435236342, + "grad_norm": 0.22590506076812744, + "learning_rate": 4.29325829051892e-06, + "loss": 1.6888, + 
"step": 28382 + }, + { + "epoch": 8.711786372007367, + "grad_norm": 0.17581406235694885, + "learning_rate": 4.291243400248229e-06, + "loss": 1.7781, + "step": 28383 + }, + { + "epoch": 8.712093308778392, + "grad_norm": 0.15321393311023712, + "learning_rate": 4.289228961698266e-06, + "loss": 1.6613, + "step": 28384 + }, + { + "epoch": 8.712400245549416, + "grad_norm": 0.1657101809978485, + "learning_rate": 4.287214974888931e-06, + "loss": 1.7152, + "step": 28385 + }, + { + "epoch": 8.712707182320441, + "grad_norm": 0.18134190142154694, + "learning_rate": 4.28520143984013e-06, + "loss": 1.7265, + "step": 28386 + }, + { + "epoch": 8.713014119091467, + "grad_norm": 0.1232382282614708, + "learning_rate": 4.28318835657176e-06, + "loss": 1.6457, + "step": 28387 + }, + { + "epoch": 8.713321055862492, + "grad_norm": 0.1339728981256485, + "learning_rate": 4.281175725103715e-06, + "loss": 1.6516, + "step": 28388 + }, + { + "epoch": 8.713627992633517, + "grad_norm": 0.15603719651699066, + "learning_rate": 4.2791635454559e-06, + "loss": 1.717, + "step": 28389 + }, + { + "epoch": 8.713934929404543, + "grad_norm": 0.17226538062095642, + "learning_rate": 4.277151817648179e-06, + "loss": 1.7088, + "step": 28390 + }, + { + "epoch": 8.714241866175568, + "grad_norm": 0.17237617075443268, + "learning_rate": 4.275140541700445e-06, + "loss": 1.7467, + "step": 28391 + }, + { + "epoch": 8.714548802946593, + "grad_norm": 0.1798042505979538, + "learning_rate": 4.2731297176325734e-06, + "loss": 1.7157, + "step": 28392 + }, + { + "epoch": 8.714855739717619, + "grad_norm": 0.1701999455690384, + "learning_rate": 4.271119345464436e-06, + "loss": 1.7575, + "step": 28393 + }, + { + "epoch": 8.715162676488644, + "grad_norm": 0.13981005549430847, + "learning_rate": 4.2691094252159e-06, + "loss": 1.7315, + "step": 28394 + }, + { + "epoch": 8.715469613259668, + "grad_norm": 0.19189679622650146, + "learning_rate": 4.267099956906828e-06, + "loss": 1.7338, + "step": 28395 + }, + { + "epoch": 
8.715776550030693, + "grad_norm": 0.14194947481155396, + "learning_rate": 4.265090940557076e-06, + "loss": 1.6999, + "step": 28396 + }, + { + "epoch": 8.716083486801718, + "grad_norm": 0.15809695422649384, + "learning_rate": 4.263082376186506e-06, + "loss": 1.6643, + "step": 28397 + }, + { + "epoch": 8.716390423572744, + "grad_norm": 0.12897074222564697, + "learning_rate": 4.261074263814963e-06, + "loss": 1.7096, + "step": 28398 + }, + { + "epoch": 8.716697360343769, + "grad_norm": 0.1517125964164734, + "learning_rate": 4.259066603462292e-06, + "loss": 1.7101, + "step": 28399 + }, + { + "epoch": 8.717004297114794, + "grad_norm": 0.1489602029323578, + "learning_rate": 4.257059395148333e-06, + "loss": 1.7097, + "step": 28400 + }, + { + "epoch": 8.71731123388582, + "grad_norm": 0.15182913839817047, + "learning_rate": 4.255052638892926e-06, + "loss": 1.7161, + "step": 28401 + }, + { + "epoch": 8.717618170656845, + "grad_norm": 0.1973588615655899, + "learning_rate": 4.253046334715899e-06, + "loss": 1.7452, + "step": 28402 + }, + { + "epoch": 8.71792510742787, + "grad_norm": 0.17291557788848877, + "learning_rate": 4.251040482637081e-06, + "loss": 1.7671, + "step": 28403 + }, + { + "epoch": 8.718232044198896, + "grad_norm": 0.1525208055973053, + "learning_rate": 4.249035082676295e-06, + "loss": 1.6891, + "step": 28404 + }, + { + "epoch": 8.718538980969921, + "grad_norm": 0.1681409627199173, + "learning_rate": 4.247030134853352e-06, + "loss": 1.728, + "step": 28405 + }, + { + "epoch": 8.718845917740946, + "grad_norm": 0.18142938613891602, + "learning_rate": 4.245025639188094e-06, + "loss": 1.6952, + "step": 28406 + }, + { + "epoch": 8.71915285451197, + "grad_norm": 0.17891576886177063, + "learning_rate": 4.243021595700286e-06, + "loss": 1.7304, + "step": 28407 + }, + { + "epoch": 8.719459791282995, + "grad_norm": 0.1676199585199356, + "learning_rate": 4.24101800440978e-06, + "loss": 1.6756, + "step": 28408 + }, + { + "epoch": 8.71976672805402, + "grad_norm": 
0.16762350499629974, + "learning_rate": 4.239014865336339e-06, + "loss": 1.6899, + "step": 28409 + }, + { + "epoch": 8.720073664825046, + "grad_norm": 0.14751142263412476, + "learning_rate": 4.2370121784997776e-06, + "loss": 1.677, + "step": 28410 + }, + { + "epoch": 8.720380601596071, + "grad_norm": 0.16818544268608093, + "learning_rate": 4.235009943919887e-06, + "loss": 1.7132, + "step": 28411 + }, + { + "epoch": 8.720687538367097, + "grad_norm": 0.14754259586334229, + "learning_rate": 4.233008161616453e-06, + "loss": 1.6744, + "step": 28412 + }, + { + "epoch": 8.720994475138122, + "grad_norm": 0.1303185522556305, + "learning_rate": 4.231006831609258e-06, + "loss": 1.6783, + "step": 28413 + }, + { + "epoch": 8.721301411909147, + "grad_norm": 0.14147131145000458, + "learning_rate": 4.229005953918075e-06, + "loss": 1.6911, + "step": 28414 + }, + { + "epoch": 8.721608348680173, + "grad_norm": 0.19011028110980988, + "learning_rate": 4.227005528562688e-06, + "loss": 1.7245, + "step": 28415 + }, + { + "epoch": 8.721915285451198, + "grad_norm": 0.1327231526374817, + "learning_rate": 4.225005555562855e-06, + "loss": 1.6676, + "step": 28416 + }, + { + "epoch": 8.722222222222221, + "grad_norm": 0.13436436653137207, + "learning_rate": 4.223006034938354e-06, + "loss": 1.6926, + "step": 28417 + }, + { + "epoch": 8.722529158993247, + "grad_norm": 0.18722930550575256, + "learning_rate": 4.221006966708929e-06, + "loss": 1.7759, + "step": 28418 + }, + { + "epoch": 8.722836095764272, + "grad_norm": 0.18999920785427094, + "learning_rate": 4.219008350894355e-06, + "loss": 1.7385, + "step": 28419 + }, + { + "epoch": 8.723143032535297, + "grad_norm": 0.14250624179840088, + "learning_rate": 4.217010187514364e-06, + "loss": 1.7263, + "step": 28420 + }, + { + "epoch": 8.723449969306323, + "grad_norm": 0.1577407717704773, + "learning_rate": 4.21501247658872e-06, + "loss": 1.8055, + "step": 28421 + }, + { + "epoch": 8.723756906077348, + "grad_norm": 0.120110422372818, + "learning_rate": 
4.213015218137145e-06, + "loss": 1.6519, + "step": 28422 + }, + { + "epoch": 8.724063842848373, + "grad_norm": 0.17998605966567993, + "learning_rate": 4.211018412179407e-06, + "loss": 1.6827, + "step": 28423 + }, + { + "epoch": 8.724370779619399, + "grad_norm": 0.14941653609275818, + "learning_rate": 4.209022058735213e-06, + "loss": 1.7089, + "step": 28424 + }, + { + "epoch": 8.724677716390424, + "grad_norm": 0.13641475141048431, + "learning_rate": 4.207026157824312e-06, + "loss": 1.6825, + "step": 28425 + }, + { + "epoch": 8.72498465316145, + "grad_norm": 0.1666809320449829, + "learning_rate": 4.205030709466401e-06, + "loss": 1.6958, + "step": 28426 + }, + { + "epoch": 8.725291589932475, + "grad_norm": 0.1236952468752861, + "learning_rate": 4.20303571368123e-06, + "loss": 1.6417, + "step": 28427 + }, + { + "epoch": 8.725598526703498, + "grad_norm": 0.1483321338891983, + "learning_rate": 4.201041170488501e-06, + "loss": 1.7082, + "step": 28428 + }, + { + "epoch": 8.725905463474524, + "grad_norm": 0.17827022075653076, + "learning_rate": 4.1990470799079255e-06, + "loss": 1.6506, + "step": 28429 + }, + { + "epoch": 8.726212400245549, + "grad_norm": 0.17171478271484375, + "learning_rate": 4.197053441959215e-06, + "loss": 1.7403, + "step": 28430 + }, + { + "epoch": 8.726519337016574, + "grad_norm": 0.18554572761058807, + "learning_rate": 4.195060256662064e-06, + "loss": 1.6899, + "step": 28431 + }, + { + "epoch": 8.7268262737876, + "grad_norm": 0.30604809522628784, + "learning_rate": 4.193067524036176e-06, + "loss": 1.7656, + "step": 28432 + }, + { + "epoch": 8.727133210558625, + "grad_norm": 0.1759488433599472, + "learning_rate": 4.191075244101245e-06, + "loss": 1.7167, + "step": 28433 + }, + { + "epoch": 8.72744014732965, + "grad_norm": 0.15285685658454895, + "learning_rate": 4.18908341687696e-06, + "loss": 1.6576, + "step": 28434 + }, + { + "epoch": 8.727747084100676, + "grad_norm": 0.17283809185028076, + "learning_rate": 4.187092042382995e-06, + "loss": 1.719, + 
"step": 28435 + }, + { + "epoch": 8.728054020871701, + "grad_norm": 0.1511228382587433, + "learning_rate": 4.1851011206390455e-06, + "loss": 1.6499, + "step": 28436 + }, + { + "epoch": 8.728360957642726, + "grad_norm": 0.13646523654460907, + "learning_rate": 4.183110651664779e-06, + "loss": 1.703, + "step": 28437 + }, + { + "epoch": 8.72866789441375, + "grad_norm": 0.16112352907657623, + "learning_rate": 4.181120635479863e-06, + "loss": 1.6963, + "step": 28438 + }, + { + "epoch": 8.728974831184775, + "grad_norm": 0.23064331710338593, + "learning_rate": 4.179131072103964e-06, + "loss": 1.7347, + "step": 28439 + }, + { + "epoch": 8.7292817679558, + "grad_norm": 0.17859068512916565, + "learning_rate": 4.177141961556763e-06, + "loss": 1.7963, + "step": 28440 + }, + { + "epoch": 8.729588704726826, + "grad_norm": 0.16455049812793732, + "learning_rate": 4.175153303857887e-06, + "loss": 1.6893, + "step": 28441 + }, + { + "epoch": 8.729895641497851, + "grad_norm": 0.1353607475757599, + "learning_rate": 4.173165099027021e-06, + "loss": 1.7165, + "step": 28442 + }, + { + "epoch": 8.730202578268877, + "grad_norm": 0.20421212911605835, + "learning_rate": 4.171177347083783e-06, + "loss": 1.7256, + "step": 28443 + }, + { + "epoch": 8.730509515039902, + "grad_norm": 0.17925186455249786, + "learning_rate": 4.169190048047833e-06, + "loss": 1.6819, + "step": 28444 + }, + { + "epoch": 8.730816451810927, + "grad_norm": 0.17959848046302795, + "learning_rate": 4.167203201938819e-06, + "loss": 1.7275, + "step": 28445 + }, + { + "epoch": 8.731123388581953, + "grad_norm": 0.13794639706611633, + "learning_rate": 4.165216808776357e-06, + "loss": 1.6694, + "step": 28446 + }, + { + "epoch": 8.731430325352978, + "grad_norm": 0.15895675122737885, + "learning_rate": 4.163230868580092e-06, + "loss": 1.7159, + "step": 28447 + }, + { + "epoch": 8.731737262124003, + "grad_norm": 0.16645625233650208, + "learning_rate": 4.161245381369644e-06, + "loss": 1.7068, + "step": 28448 + }, + { + "epoch": 
8.732044198895027, + "grad_norm": 0.17593564093112946, + "learning_rate": 4.15926034716464e-06, + "loss": 1.7013, + "step": 28449 + }, + { + "epoch": 8.732351135666052, + "grad_norm": 0.1613699495792389, + "learning_rate": 4.157275765984692e-06, + "loss": 1.6925, + "step": 28450 + }, + { + "epoch": 8.732658072437077, + "grad_norm": 0.21205542981624603, + "learning_rate": 4.155291637849412e-06, + "loss": 1.8401, + "step": 28451 + }, + { + "epoch": 8.732965009208103, + "grad_norm": 0.16209860146045685, + "learning_rate": 4.153307962778408e-06, + "loss": 1.7068, + "step": 28452 + }, + { + "epoch": 8.733271945979128, + "grad_norm": 0.17571625113487244, + "learning_rate": 4.1513247407912905e-06, + "loss": 1.7245, + "step": 28453 + }, + { + "epoch": 8.733578882750153, + "grad_norm": 0.12565423548221588, + "learning_rate": 4.149341971907655e-06, + "loss": 1.6714, + "step": 28454 + }, + { + "epoch": 8.733885819521179, + "grad_norm": 0.14843232929706573, + "learning_rate": 4.147359656147093e-06, + "loss": 1.6685, + "step": 28455 + }, + { + "epoch": 8.734192756292204, + "grad_norm": 0.1699068695306778, + "learning_rate": 4.145377793529193e-06, + "loss": 1.6808, + "step": 28456 + }, + { + "epoch": 8.73449969306323, + "grad_norm": 0.18543531000614166, + "learning_rate": 4.143396384073556e-06, + "loss": 1.7721, + "step": 28457 + }, + { + "epoch": 8.734806629834255, + "grad_norm": 0.15792638063430786, + "learning_rate": 4.141415427799744e-06, + "loss": 1.6804, + "step": 28458 + }, + { + "epoch": 8.735113566605278, + "grad_norm": 0.19353818893432617, + "learning_rate": 4.139434924727359e-06, + "loss": 1.7062, + "step": 28459 + }, + { + "epoch": 8.735420503376304, + "grad_norm": 0.14087705314159393, + "learning_rate": 4.137454874875935e-06, + "loss": 1.6287, + "step": 28460 + }, + { + "epoch": 8.735727440147329, + "grad_norm": 0.14594002068042755, + "learning_rate": 4.135475278265077e-06, + "loss": 1.6741, + "step": 28461 + }, + { + "epoch": 8.736034376918354, + "grad_norm": 
0.13943135738372803, + "learning_rate": 4.133496134914333e-06, + "loss": 1.7261, + "step": 28462 + }, + { + "epoch": 8.73634131368938, + "grad_norm": 0.20119191706180573, + "learning_rate": 4.131517444843264e-06, + "loss": 1.7719, + "step": 28463 + }, + { + "epoch": 8.736648250460405, + "grad_norm": 0.15612776577472687, + "learning_rate": 4.12953920807142e-06, + "loss": 1.6694, + "step": 28464 + }, + { + "epoch": 8.73695518723143, + "grad_norm": 0.15517298877239227, + "learning_rate": 4.127561424618359e-06, + "loss": 1.7225, + "step": 28465 + }, + { + "epoch": 8.737262124002456, + "grad_norm": 0.18650169670581818, + "learning_rate": 4.125584094503626e-06, + "loss": 1.7589, + "step": 28466 + }, + { + "epoch": 8.737569060773481, + "grad_norm": 0.19337934255599976, + "learning_rate": 4.123607217746755e-06, + "loss": 1.6754, + "step": 28467 + }, + { + "epoch": 8.737875997544506, + "grad_norm": 0.15818046033382416, + "learning_rate": 4.121630794367287e-06, + "loss": 1.7176, + "step": 28468 + }, + { + "epoch": 8.738182934315532, + "grad_norm": 0.14257800579071045, + "learning_rate": 4.11965482438475e-06, + "loss": 1.6961, + "step": 28469 + }, + { + "epoch": 8.738489871086557, + "grad_norm": 0.15100477635860443, + "learning_rate": 4.1176793078186785e-06, + "loss": 1.7161, + "step": 28470 + }, + { + "epoch": 8.73879680785758, + "grad_norm": 0.14171260595321655, + "learning_rate": 4.115704244688595e-06, + "loss": 1.6812, + "step": 28471 + }, + { + "epoch": 8.739103744628606, + "grad_norm": 0.13742563128471375, + "learning_rate": 4.1137296350140134e-06, + "loss": 1.6968, + "step": 28472 + }, + { + "epoch": 8.739410681399631, + "grad_norm": 0.131202831864357, + "learning_rate": 4.111755478814439e-06, + "loss": 1.6859, + "step": 28473 + }, + { + "epoch": 8.739717618170657, + "grad_norm": 0.14671406149864197, + "learning_rate": 4.109781776109411e-06, + "loss": 1.7227, + "step": 28474 + }, + { + "epoch": 8.740024554941682, + "grad_norm": 0.17391672730445862, + "learning_rate": 
4.107808526918405e-06, + "loss": 1.6926, + "step": 28475 + }, + { + "epoch": 8.740331491712707, + "grad_norm": 0.16088297963142395, + "learning_rate": 4.105835731260943e-06, + "loss": 1.7296, + "step": 28476 + }, + { + "epoch": 8.740638428483733, + "grad_norm": 0.15273302793502808, + "learning_rate": 4.1038633891564985e-06, + "loss": 1.6888, + "step": 28477 + }, + { + "epoch": 8.740945365254758, + "grad_norm": 0.16602970659732819, + "learning_rate": 4.101891500624588e-06, + "loss": 1.6924, + "step": 28478 + }, + { + "epoch": 8.741252302025783, + "grad_norm": 0.13952100276947021, + "learning_rate": 4.099920065684681e-06, + "loss": 1.6972, + "step": 28479 + }, + { + "epoch": 8.741559238796809, + "grad_norm": 0.18140468001365662, + "learning_rate": 4.097949084356273e-06, + "loss": 1.7417, + "step": 28480 + }, + { + "epoch": 8.741866175567832, + "grad_norm": 0.19571609795093536, + "learning_rate": 4.095978556658831e-06, + "loss": 1.7261, + "step": 28481 + }, + { + "epoch": 8.742173112338858, + "grad_norm": 0.1748526245355606, + "learning_rate": 4.094008482611838e-06, + "loss": 1.7975, + "step": 28482 + }, + { + "epoch": 8.742480049109883, + "grad_norm": 0.1984734982252121, + "learning_rate": 4.092038862234759e-06, + "loss": 1.7941, + "step": 28483 + }, + { + "epoch": 8.742786985880908, + "grad_norm": 0.1336900144815445, + "learning_rate": 4.090069695547055e-06, + "loss": 1.6612, + "step": 28484 + }, + { + "epoch": 8.743093922651934, + "grad_norm": 0.1755249798297882, + "learning_rate": 4.088100982568193e-06, + "loss": 1.679, + "step": 28485 + }, + { + "epoch": 8.743400859422959, + "grad_norm": 0.17111645638942719, + "learning_rate": 4.086132723317631e-06, + "loss": 1.739, + "step": 28486 + }, + { + "epoch": 8.743707796193984, + "grad_norm": 0.18933364748954773, + "learning_rate": 4.084164917814815e-06, + "loss": 1.7469, + "step": 28487 + }, + { + "epoch": 8.74401473296501, + "grad_norm": 0.15212221443653107, + "learning_rate": 4.082197566079188e-06, + "loss": 1.7137, + 
"step": 28488 + }, + { + "epoch": 8.744321669736035, + "grad_norm": 0.1428573727607727, + "learning_rate": 4.080230668130203e-06, + "loss": 1.67, + "step": 28489 + }, + { + "epoch": 8.74462860650706, + "grad_norm": 0.1688205450773239, + "learning_rate": 4.078264223987283e-06, + "loss": 1.7149, + "step": 28490 + }, + { + "epoch": 8.744935543278086, + "grad_norm": 0.23390214145183563, + "learning_rate": 4.07629823366989e-06, + "loss": 1.7647, + "step": 28491 + }, + { + "epoch": 8.745242480049109, + "grad_norm": 0.163333460688591, + "learning_rate": 4.074332697197419e-06, + "loss": 1.7047, + "step": 28492 + }, + { + "epoch": 8.745549416820134, + "grad_norm": 0.14970998466014862, + "learning_rate": 4.072367614589323e-06, + "loss": 1.6921, + "step": 28493 + }, + { + "epoch": 8.74585635359116, + "grad_norm": 0.18369705975055695, + "learning_rate": 4.070402985864996e-06, + "loss": 1.7266, + "step": 28494 + }, + { + "epoch": 8.746163290362185, + "grad_norm": 0.17579036951065063, + "learning_rate": 4.068438811043873e-06, + "loss": 1.742, + "step": 28495 + }, + { + "epoch": 8.74647022713321, + "grad_norm": 0.1286322921514511, + "learning_rate": 4.066475090145355e-06, + "loss": 1.6656, + "step": 28496 + }, + { + "epoch": 8.746777163904236, + "grad_norm": 0.1595929116010666, + "learning_rate": 4.06451182318886e-06, + "loss": 1.7079, + "step": 28497 + }, + { + "epoch": 8.747084100675261, + "grad_norm": 0.14556388556957245, + "learning_rate": 4.062549010193778e-06, + "loss": 1.6948, + "step": 28498 + }, + { + "epoch": 8.747391037446286, + "grad_norm": 0.19447384774684906, + "learning_rate": 4.060586651179516e-06, + "loss": 1.7648, + "step": 28499 + }, + { + "epoch": 8.747697974217312, + "grad_norm": 0.147284135222435, + "learning_rate": 4.058624746165457e-06, + "loss": 1.713, + "step": 28500 + }, + { + "epoch": 8.748004910988337, + "grad_norm": 0.17068512737751007, + "learning_rate": 4.056663295170998e-06, + "loss": 1.708, + "step": 28501 + }, + { + "epoch": 8.74831184775936, + 
"grad_norm": 0.15625207126140594, + "learning_rate": 4.054702298215523e-06, + "loss": 1.7152, + "step": 28502 + }, + { + "epoch": 8.748618784530386, + "grad_norm": 0.14633874595165253, + "learning_rate": 4.052741755318407e-06, + "loss": 1.7221, + "step": 28503 + }, + { + "epoch": 8.748925721301411, + "grad_norm": 0.15166686475276947, + "learning_rate": 4.0507816664990265e-06, + "loss": 1.7179, + "step": 28504 + }, + { + "epoch": 8.749232658072437, + "grad_norm": 0.12509481608867645, + "learning_rate": 4.0488220317767555e-06, + "loss": 1.6743, + "step": 28505 + }, + { + "epoch": 8.749539594843462, + "grad_norm": 0.20686158537864685, + "learning_rate": 4.046862851170957e-06, + "loss": 1.6925, + "step": 28506 + }, + { + "epoch": 8.749846531614487, + "grad_norm": 0.12619495391845703, + "learning_rate": 4.044904124700983e-06, + "loss": 1.6932, + "step": 28507 + }, + { + "epoch": 8.750153468385513, + "grad_norm": 0.1770995706319809, + "learning_rate": 4.0429458523862205e-06, + "loss": 1.7948, + "step": 28508 + }, + { + "epoch": 8.750460405156538, + "grad_norm": 0.22418050467967987, + "learning_rate": 4.040988034245991e-06, + "loss": 1.7008, + "step": 28509 + }, + { + "epoch": 8.750767341927563, + "grad_norm": 0.14798377454280853, + "learning_rate": 4.039030670299665e-06, + "loss": 1.6673, + "step": 28510 + }, + { + "epoch": 8.751074278698589, + "grad_norm": 0.182883620262146, + "learning_rate": 4.037073760566562e-06, + "loss": 1.7223, + "step": 28511 + }, + { + "epoch": 8.751381215469614, + "grad_norm": 0.14968620240688324, + "learning_rate": 4.035117305066044e-06, + "loss": 1.6656, + "step": 28512 + }, + { + "epoch": 8.75168815224064, + "grad_norm": 0.19700272381305695, + "learning_rate": 4.03316130381744e-06, + "loss": 1.7207, + "step": 28513 + }, + { + "epoch": 8.751995089011663, + "grad_norm": 0.17926210165023804, + "learning_rate": 4.031205756840073e-06, + "loss": 1.7131, + "step": 28514 + }, + { + "epoch": 8.752302025782688, + "grad_norm": 0.1471911519765854, + 
"learning_rate": 4.029250664153278e-06, + "loss": 1.6731, + "step": 28515 + }, + { + "epoch": 8.752608962553714, + "grad_norm": 0.18923047184944153, + "learning_rate": 4.0272960257763725e-06, + "loss": 1.7795, + "step": 28516 + }, + { + "epoch": 8.752915899324739, + "grad_norm": 0.14930424094200134, + "learning_rate": 4.025341841728675e-06, + "loss": 1.7201, + "step": 28517 + }, + { + "epoch": 8.753222836095764, + "grad_norm": 0.17335213720798492, + "learning_rate": 4.0233881120294915e-06, + "loss": 1.7297, + "step": 28518 + }, + { + "epoch": 8.75352977286679, + "grad_norm": 0.14489638805389404, + "learning_rate": 4.021434836698135e-06, + "loss": 1.7314, + "step": 28519 + }, + { + "epoch": 8.753836709637815, + "grad_norm": 0.16861389577388763, + "learning_rate": 4.019482015753912e-06, + "loss": 1.7362, + "step": 28520 + }, + { + "epoch": 8.75414364640884, + "grad_norm": 0.1467277705669403, + "learning_rate": 4.0175296492161115e-06, + "loss": 1.6607, + "step": 28521 + }, + { + "epoch": 8.754450583179866, + "grad_norm": 0.1556902825832367, + "learning_rate": 4.015577737104037e-06, + "loss": 1.747, + "step": 28522 + }, + { + "epoch": 8.754757519950891, + "grad_norm": 0.13337039947509766, + "learning_rate": 4.013626279436977e-06, + "loss": 1.7271, + "step": 28523 + }, + { + "epoch": 8.755064456721914, + "grad_norm": 0.1599043607711792, + "learning_rate": 4.011675276234206e-06, + "loss": 1.6859, + "step": 28524 + }, + { + "epoch": 8.75537139349294, + "grad_norm": 0.11567290872335434, + "learning_rate": 4.009724727515035e-06, + "loss": 1.6577, + "step": 28525 + }, + { + "epoch": 8.755678330263965, + "grad_norm": 0.16317762434482574, + "learning_rate": 4.0077746332987e-06, + "loss": 1.7041, + "step": 28526 + }, + { + "epoch": 8.75598526703499, + "grad_norm": 0.13116325438022614, + "learning_rate": 4.005824993604506e-06, + "loss": 1.6847, + "step": 28527 + }, + { + "epoch": 8.756292203806016, + "grad_norm": 0.14927831292152405, + "learning_rate": 4.003875808451696e-06, + 
"loss": 1.6312, + "step": 28528 + }, + { + "epoch": 8.756599140577041, + "grad_norm": 0.15273495018482208, + "learning_rate": 4.001927077859552e-06, + "loss": 1.7027, + "step": 28529 + }, + { + "epoch": 8.756906077348066, + "grad_norm": 0.17557594180107117, + "learning_rate": 3.999978801847326e-06, + "loss": 1.7294, + "step": 28530 + }, + { + "epoch": 8.757213014119092, + "grad_norm": 0.16061940789222717, + "learning_rate": 3.998030980434269e-06, + "loss": 1.7179, + "step": 28531 + }, + { + "epoch": 8.757519950890117, + "grad_norm": 0.1431310772895813, + "learning_rate": 3.996083613639634e-06, + "loss": 1.6811, + "step": 28532 + }, + { + "epoch": 8.757826887661142, + "grad_norm": 0.16931994259357452, + "learning_rate": 3.994136701482659e-06, + "loss": 1.7246, + "step": 28533 + }, + { + "epoch": 8.758133824432168, + "grad_norm": 0.13671527802944183, + "learning_rate": 3.992190243982596e-06, + "loss": 1.6877, + "step": 28534 + }, + { + "epoch": 8.758440761203191, + "grad_norm": 0.11943815648555756, + "learning_rate": 3.990244241158675e-06, + "loss": 1.6476, + "step": 28535 + }, + { + "epoch": 8.758747697974217, + "grad_norm": 0.17011673748493195, + "learning_rate": 3.988298693030124e-06, + "loss": 1.7105, + "step": 28536 + }, + { + "epoch": 8.759054634745242, + "grad_norm": 0.1379362791776657, + "learning_rate": 3.986353599616177e-06, + "loss": 1.6691, + "step": 28537 + }, + { + "epoch": 8.759361571516267, + "grad_norm": 0.13264621794223785, + "learning_rate": 3.984408960936048e-06, + "loss": 1.6766, + "step": 28538 + }, + { + "epoch": 8.759668508287293, + "grad_norm": 0.16023825109004974, + "learning_rate": 3.982464777008965e-06, + "loss": 1.6906, + "step": 28539 + }, + { + "epoch": 8.759975445058318, + "grad_norm": 0.1602984219789505, + "learning_rate": 3.980521047854135e-06, + "loss": 1.7094, + "step": 28540 + }, + { + "epoch": 8.760282381829343, + "grad_norm": 0.15421636402606964, + "learning_rate": 3.978577773490772e-06, + "loss": 1.7467, + "step": 28541 + }, + 
{ + "epoch": 8.760589318600369, + "grad_norm": 0.1427018642425537, + "learning_rate": 3.976634953938074e-06, + "loss": 1.7093, + "step": 28542 + }, + { + "epoch": 8.760896255371394, + "grad_norm": 0.143124058842659, + "learning_rate": 3.97469258921524e-06, + "loss": 1.6795, + "step": 28543 + }, + { + "epoch": 8.76120319214242, + "grad_norm": 0.14654754102230072, + "learning_rate": 3.97275067934148e-06, + "loss": 1.7246, + "step": 28544 + }, + { + "epoch": 8.761510128913443, + "grad_norm": 0.17374441027641296, + "learning_rate": 3.970809224335964e-06, + "loss": 1.6828, + "step": 28545 + }, + { + "epoch": 8.761817065684468, + "grad_norm": 0.1596260517835617, + "learning_rate": 3.968868224217898e-06, + "loss": 1.7816, + "step": 28546 + }, + { + "epoch": 8.762124002455494, + "grad_norm": 0.1467326581478119, + "learning_rate": 3.966927679006455e-06, + "loss": 1.6933, + "step": 28547 + }, + { + "epoch": 8.762430939226519, + "grad_norm": 0.12959735095500946, + "learning_rate": 3.9649875887208085e-06, + "loss": 1.6839, + "step": 28548 + }, + { + "epoch": 8.762737875997544, + "grad_norm": 0.13395267724990845, + "learning_rate": 3.963047953380145e-06, + "loss": 1.6968, + "step": 28549 + }, + { + "epoch": 8.76304481276857, + "grad_norm": 0.1369883418083191, + "learning_rate": 3.961108773003619e-06, + "loss": 1.6849, + "step": 28550 + }, + { + "epoch": 8.763351749539595, + "grad_norm": 0.19795149564743042, + "learning_rate": 3.959170047610405e-06, + "loss": 1.7593, + "step": 28551 + }, + { + "epoch": 8.76365868631062, + "grad_norm": 0.14946505427360535, + "learning_rate": 3.9572317772196555e-06, + "loss": 1.7309, + "step": 28552 + }, + { + "epoch": 8.763965623081646, + "grad_norm": 0.14034941792488098, + "learning_rate": 3.955293961850526e-06, + "loss": 1.6906, + "step": 28553 + }, + { + "epoch": 8.764272559852671, + "grad_norm": 0.1528625339269638, + "learning_rate": 3.9533566015221735e-06, + "loss": 1.7318, + "step": 28554 + }, + { + "epoch": 8.764579496623696, + 
"grad_norm": 0.15130504965782166, + "learning_rate": 3.951419696253733e-06, + "loss": 1.7147, + "step": 28555 + }, + { + "epoch": 8.764886433394722, + "grad_norm": 0.12917234003543854, + "learning_rate": 3.949483246064361e-06, + "loss": 1.687, + "step": 28556 + }, + { + "epoch": 8.765193370165745, + "grad_norm": 0.1918531060218811, + "learning_rate": 3.947547250973182e-06, + "loss": 1.7411, + "step": 28557 + }, + { + "epoch": 8.76550030693677, + "grad_norm": 0.16794945299625397, + "learning_rate": 3.9456117109993366e-06, + "loss": 1.762, + "step": 28558 + }, + { + "epoch": 8.765807243707796, + "grad_norm": 0.18833400309085846, + "learning_rate": 3.9436766261619465e-06, + "loss": 1.7641, + "step": 28559 + }, + { + "epoch": 8.766114180478821, + "grad_norm": 0.1939263939857483, + "learning_rate": 3.941741996480131e-06, + "loss": 1.7633, + "step": 28560 + }, + { + "epoch": 8.766421117249847, + "grad_norm": 0.15766844153404236, + "learning_rate": 3.939807821973029e-06, + "loss": 1.6989, + "step": 28561 + }, + { + "epoch": 8.766728054020872, + "grad_norm": 0.14704185724258423, + "learning_rate": 3.937874102659733e-06, + "loss": 1.7006, + "step": 28562 + }, + { + "epoch": 8.767034990791897, + "grad_norm": 0.1752765029668808, + "learning_rate": 3.935940838559376e-06, + "loss": 1.6738, + "step": 28563 + }, + { + "epoch": 8.767341927562923, + "grad_norm": 0.1801508069038391, + "learning_rate": 3.934008029691033e-06, + "loss": 1.7578, + "step": 28564 + }, + { + "epoch": 8.767648864333948, + "grad_norm": 0.17966793477535248, + "learning_rate": 3.932075676073838e-06, + "loss": 1.7347, + "step": 28565 + }, + { + "epoch": 8.767955801104973, + "grad_norm": 0.1435980200767517, + "learning_rate": 3.930143777726863e-06, + "loss": 1.6907, + "step": 28566 + }, + { + "epoch": 8.768262737875997, + "grad_norm": 0.1439833641052246, + "learning_rate": 3.928212334669218e-06, + "loss": 1.6804, + "step": 28567 + }, + { + "epoch": 8.768569674647022, + "grad_norm": 0.18037080764770508, + 
"learning_rate": 3.92628134691998e-06, + "loss": 1.7287, + "step": 28568 + }, + { + "epoch": 8.768876611418047, + "grad_norm": 0.1484454721212387, + "learning_rate": 3.924350814498229e-06, + "loss": 1.7128, + "step": 28569 + }, + { + "epoch": 8.769183548189073, + "grad_norm": 0.1302090734243393, + "learning_rate": 3.922420737423055e-06, + "loss": 1.647, + "step": 28570 + }, + { + "epoch": 8.769490484960098, + "grad_norm": 0.16756890714168549, + "learning_rate": 3.920491115713526e-06, + "loss": 1.7613, + "step": 28571 + }, + { + "epoch": 8.769797421731123, + "grad_norm": 0.17668041586875916, + "learning_rate": 3.918561949388705e-06, + "loss": 1.6957, + "step": 28572 + }, + { + "epoch": 8.770104358502149, + "grad_norm": 0.14288358390331268, + "learning_rate": 3.916633238467671e-06, + "loss": 1.6879, + "step": 28573 + }, + { + "epoch": 8.770411295273174, + "grad_norm": 0.16978147625923157, + "learning_rate": 3.9147049829694746e-06, + "loss": 1.7456, + "step": 28574 + }, + { + "epoch": 8.7707182320442, + "grad_norm": 0.13802385330200195, + "learning_rate": 3.91277718291318e-06, + "loss": 1.6799, + "step": 28575 + }, + { + "epoch": 8.771025168815225, + "grad_norm": 0.16819354891777039, + "learning_rate": 3.910849838317826e-06, + "loss": 1.7277, + "step": 28576 + }, + { + "epoch": 8.77133210558625, + "grad_norm": 0.16395528614521027, + "learning_rate": 3.908922949202465e-06, + "loss": 1.6976, + "step": 28577 + }, + { + "epoch": 8.771639042357274, + "grad_norm": 0.14518797397613525, + "learning_rate": 3.906996515586159e-06, + "loss": 1.6962, + "step": 28578 + }, + { + "epoch": 8.771945979128299, + "grad_norm": 0.17786560952663422, + "learning_rate": 3.905070537487909e-06, + "loss": 1.6593, + "step": 28579 + }, + { + "epoch": 8.772252915899324, + "grad_norm": 0.1793101727962494, + "learning_rate": 3.9031450149267845e-06, + "loss": 1.7699, + "step": 28580 + }, + { + "epoch": 8.77255985267035, + "grad_norm": 0.2498319298028946, + "learning_rate": 3.901219947921786e-06, + 
"loss": 1.745, + "step": 28581 + }, + { + "epoch": 8.772866789441375, + "grad_norm": 0.14886927604675293, + "learning_rate": 3.899295336491959e-06, + "loss": 1.6886, + "step": 28582 + }, + { + "epoch": 8.7731737262124, + "grad_norm": 0.1918812394142151, + "learning_rate": 3.897371180656317e-06, + "loss": 1.7717, + "step": 28583 + }, + { + "epoch": 8.773480662983426, + "grad_norm": 0.15470977127552032, + "learning_rate": 3.895447480433873e-06, + "loss": 1.6747, + "step": 28584 + }, + { + "epoch": 8.773787599754451, + "grad_norm": 0.15075071156024933, + "learning_rate": 3.893524235843648e-06, + "loss": 1.6753, + "step": 28585 + }, + { + "epoch": 8.774094536525476, + "grad_norm": 0.14186562597751617, + "learning_rate": 3.891601446904625e-06, + "loss": 1.6535, + "step": 28586 + }, + { + "epoch": 8.774401473296502, + "grad_norm": 0.16147254407405853, + "learning_rate": 3.8896791136358305e-06, + "loss": 1.6939, + "step": 28587 + }, + { + "epoch": 8.774708410067525, + "grad_norm": 0.1621028035879135, + "learning_rate": 3.8877572360562554e-06, + "loss": 1.7311, + "step": 28588 + }, + { + "epoch": 8.77501534683855, + "grad_norm": 0.1451268047094345, + "learning_rate": 3.885835814184885e-06, + "loss": 1.7029, + "step": 28589 + }, + { + "epoch": 8.775322283609576, + "grad_norm": 0.1404246985912323, + "learning_rate": 3.883914848040715e-06, + "loss": 1.7338, + "step": 28590 + }, + { + "epoch": 8.775629220380601, + "grad_norm": 0.15817701816558838, + "learning_rate": 3.881994337642731e-06, + "loss": 1.6944, + "step": 28591 + }, + { + "epoch": 8.775936157151627, + "grad_norm": 0.15462549030780792, + "learning_rate": 3.880074283009905e-06, + "loss": 1.7406, + "step": 28592 + }, + { + "epoch": 8.776243093922652, + "grad_norm": 0.1545121818780899, + "learning_rate": 3.878154684161217e-06, + "loss": 1.7009, + "step": 28593 + }, + { + "epoch": 8.776550030693677, + "grad_norm": 0.13072805106639862, + "learning_rate": 3.8762355411156305e-06, + "loss": 1.6798, + "step": 28594 + }, + { + 
"epoch": 8.776856967464703, + "grad_norm": 0.16369932889938354, + "learning_rate": 3.8743168538921344e-06, + "loss": 1.7046, + "step": 28595 + }, + { + "epoch": 8.777163904235728, + "grad_norm": 0.151187926530838, + "learning_rate": 3.8723986225096596e-06, + "loss": 1.7383, + "step": 28596 + }, + { + "epoch": 8.777470841006753, + "grad_norm": 0.16651193797588348, + "learning_rate": 3.8704808469871955e-06, + "loss": 1.7178, + "step": 28597 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 0.1387864351272583, + "learning_rate": 3.868563527343655e-06, + "loss": 1.6644, + "step": 28598 + }, + { + "epoch": 8.778084714548802, + "grad_norm": 0.14454610645771027, + "learning_rate": 3.866646663598022e-06, + "loss": 1.6699, + "step": 28599 + }, + { + "epoch": 8.778391651319827, + "grad_norm": 0.1706279069185257, + "learning_rate": 3.864730255769223e-06, + "loss": 1.7251, + "step": 28600 + }, + { + "epoch": 8.778698588090853, + "grad_norm": 0.14636628329753876, + "learning_rate": 3.8628143038762e-06, + "loss": 1.6774, + "step": 28601 + }, + { + "epoch": 8.779005524861878, + "grad_norm": 0.17533506453037262, + "learning_rate": 3.860898807937902e-06, + "loss": 1.7587, + "step": 28602 + }, + { + "epoch": 8.779312461632903, + "grad_norm": 0.2628023326396942, + "learning_rate": 3.858983767973223e-06, + "loss": 1.7571, + "step": 28603 + }, + { + "epoch": 8.779619398403929, + "grad_norm": 0.1412924826145172, + "learning_rate": 3.857069184001116e-06, + "loss": 1.699, + "step": 28604 + }, + { + "epoch": 8.779926335174954, + "grad_norm": 0.16076254844665527, + "learning_rate": 3.855155056040505e-06, + "loss": 1.7327, + "step": 28605 + }, + { + "epoch": 8.78023327194598, + "grad_norm": 0.1440654993057251, + "learning_rate": 3.85324138411029e-06, + "loss": 1.6941, + "step": 28606 + }, + { + "epoch": 8.780540208717005, + "grad_norm": 0.1956651359796524, + "learning_rate": 3.8513281682293956e-06, + "loss": 1.728, + "step": 28607 + }, + { + "epoch": 8.78084714548803, + "grad_norm": 
0.14176496863365173, + "learning_rate": 3.849415408416723e-06, + "loss": 1.7139, + "step": 28608 + }, + { + "epoch": 8.781154082259054, + "grad_norm": 0.18848197162151337, + "learning_rate": 3.84750310469118e-06, + "loss": 1.7092, + "step": 28609 + }, + { + "epoch": 8.781461019030079, + "grad_norm": 0.1622554361820221, + "learning_rate": 3.8455912570716565e-06, + "loss": 1.7137, + "step": 28610 + }, + { + "epoch": 8.781767955801104, + "grad_norm": 0.14255301654338837, + "learning_rate": 3.843679865577049e-06, + "loss": 1.6759, + "step": 28611 + }, + { + "epoch": 8.78207489257213, + "grad_norm": 0.15052112936973572, + "learning_rate": 3.841768930226264e-06, + "loss": 1.6749, + "step": 28612 + }, + { + "epoch": 8.782381829343155, + "grad_norm": 0.19591687619686127, + "learning_rate": 3.8398584510381584e-06, + "loss": 1.7263, + "step": 28613 + }, + { + "epoch": 8.78268876611418, + "grad_norm": 0.1651594340801239, + "learning_rate": 3.83794842803164e-06, + "loss": 1.763, + "step": 28614 + }, + { + "epoch": 8.782995702885206, + "grad_norm": 0.15854987502098083, + "learning_rate": 3.83603886122556e-06, + "loss": 1.7128, + "step": 28615 + }, + { + "epoch": 8.783302639656231, + "grad_norm": 0.14012815058231354, + "learning_rate": 3.834129750638804e-06, + "loss": 1.6711, + "step": 28616 + }, + { + "epoch": 8.783609576427256, + "grad_norm": 0.19335302710533142, + "learning_rate": 3.832221096290245e-06, + "loss": 1.7082, + "step": 28617 + }, + { + "epoch": 8.783916513198282, + "grad_norm": 0.13030263781547546, + "learning_rate": 3.830312898198729e-06, + "loss": 1.6831, + "step": 28618 + }, + { + "epoch": 8.784223449969307, + "grad_norm": 0.14048850536346436, + "learning_rate": 3.82840515638313e-06, + "loss": 1.7419, + "step": 28619 + }, + { + "epoch": 8.784530386740332, + "grad_norm": 0.1761157363653183, + "learning_rate": 3.826497870862284e-06, + "loss": 1.7285, + "step": 28620 + }, + { + "epoch": 8.784837323511356, + "grad_norm": 0.16928929090499878, + "learning_rate": 
3.824591041655051e-06, + "loss": 1.7597, + "step": 28621 + }, + { + "epoch": 8.785144260282381, + "grad_norm": 0.12604424357414246, + "learning_rate": 3.822684668780275e-06, + "loss": 1.6895, + "step": 28622 + }, + { + "epoch": 8.785451197053407, + "grad_norm": 0.1835777759552002, + "learning_rate": 3.820778752256793e-06, + "loss": 1.7131, + "step": 28623 + }, + { + "epoch": 8.785758133824432, + "grad_norm": 0.1577402502298355, + "learning_rate": 3.818873292103447e-06, + "loss": 1.7159, + "step": 28624 + }, + { + "epoch": 8.786065070595457, + "grad_norm": 0.14781227707862854, + "learning_rate": 3.8169682883390565e-06, + "loss": 1.7179, + "step": 28625 + }, + { + "epoch": 8.786372007366483, + "grad_norm": 0.19881610572338104, + "learning_rate": 3.815063740982461e-06, + "loss": 1.7586, + "step": 28626 + }, + { + "epoch": 8.786678944137508, + "grad_norm": 0.16822806000709534, + "learning_rate": 3.813159650052467e-06, + "loss": 1.7628, + "step": 28627 + }, + { + "epoch": 8.786985880908533, + "grad_norm": 0.14510734379291534, + "learning_rate": 3.811256015567899e-06, + "loss": 1.654, + "step": 28628 + }, + { + "epoch": 8.787292817679559, + "grad_norm": 0.1547134667634964, + "learning_rate": 3.8093528375475863e-06, + "loss": 1.7204, + "step": 28629 + }, + { + "epoch": 8.787599754450584, + "grad_norm": 0.19592107832431793, + "learning_rate": 3.8074501160103027e-06, + "loss": 1.7084, + "step": 28630 + }, + { + "epoch": 8.787906691221608, + "grad_norm": 0.1543792486190796, + "learning_rate": 3.8055478509748887e-06, + "loss": 1.7322, + "step": 28631 + }, + { + "epoch": 8.788213627992633, + "grad_norm": 0.17076534032821655, + "learning_rate": 3.8036460424601128e-06, + "loss": 1.7004, + "step": 28632 + }, + { + "epoch": 8.788520564763658, + "grad_norm": 0.13622300326824188, + "learning_rate": 3.8017446904847875e-06, + "loss": 1.6867, + "step": 28633 + }, + { + "epoch": 8.788827501534684, + "grad_norm": 0.3221909999847412, + "learning_rate": 3.7998437950677035e-06, + "loss": 
1.7559, + "step": 28634 + }, + { + "epoch": 8.789134438305709, + "grad_norm": 0.1811852902173996, + "learning_rate": 3.79794335622764e-06, + "loss": 1.7439, + "step": 28635 + }, + { + "epoch": 8.789441375076734, + "grad_norm": 0.1573752760887146, + "learning_rate": 3.7960433739833877e-06, + "loss": 1.7129, + "step": 28636 + }, + { + "epoch": 8.78974831184776, + "grad_norm": 0.13165032863616943, + "learning_rate": 3.7941438483536986e-06, + "loss": 1.6926, + "step": 28637 + }, + { + "epoch": 8.790055248618785, + "grad_norm": 0.14245405793190002, + "learning_rate": 3.792244779357368e-06, + "loss": 1.7072, + "step": 28638 + }, + { + "epoch": 8.79036218538981, + "grad_norm": 0.16790303587913513, + "learning_rate": 3.790346167013159e-06, + "loss": 1.6979, + "step": 28639 + }, + { + "epoch": 8.790669122160836, + "grad_norm": 0.15134595334529877, + "learning_rate": 3.7884480113398345e-06, + "loss": 1.7035, + "step": 28640 + }, + { + "epoch": 8.79097605893186, + "grad_norm": 0.1418851763010025, + "learning_rate": 3.7865503123561575e-06, + "loss": 1.6462, + "step": 28641 + }, + { + "epoch": 8.791282995702884, + "grad_norm": 0.13052044808864594, + "learning_rate": 3.784653070080868e-06, + "loss": 1.6559, + "step": 28642 + }, + { + "epoch": 8.79158993247391, + "grad_norm": 0.14758886396884918, + "learning_rate": 3.782756284532729e-06, + "loss": 1.6948, + "step": 28643 + }, + { + "epoch": 8.791896869244935, + "grad_norm": 0.1561112254858017, + "learning_rate": 3.7808599557304814e-06, + "loss": 1.6465, + "step": 28644 + }, + { + "epoch": 8.79220380601596, + "grad_norm": 0.17403864860534668, + "learning_rate": 3.77896408369286e-06, + "loss": 1.7397, + "step": 28645 + }, + { + "epoch": 8.792510742786986, + "grad_norm": 0.147226944565773, + "learning_rate": 3.7770686684386158e-06, + "loss": 1.6707, + "step": 28646 + }, + { + "epoch": 8.792817679558011, + "grad_norm": 0.1681959182024002, + "learning_rate": 3.7751737099864627e-06, + "loss": 1.6786, + "step": 28647 + }, + { + "epoch": 
8.793124616329036, + "grad_norm": 0.15970535576343536, + "learning_rate": 3.773279208355146e-06, + "loss": 1.6652, + "step": 28648 + }, + { + "epoch": 8.793431553100062, + "grad_norm": 0.18252034485340118, + "learning_rate": 3.771385163563368e-06, + "loss": 1.7478, + "step": 28649 + }, + { + "epoch": 8.793738489871087, + "grad_norm": 0.22270283102989197, + "learning_rate": 3.7694915756298576e-06, + "loss": 1.7683, + "step": 28650 + }, + { + "epoch": 8.794045426642112, + "grad_norm": 0.13913489878177643, + "learning_rate": 3.7675984445733337e-06, + "loss": 1.7275, + "step": 28651 + }, + { + "epoch": 8.794352363413136, + "grad_norm": 0.16266898810863495, + "learning_rate": 3.7657057704124976e-06, + "loss": 1.7145, + "step": 28652 + }, + { + "epoch": 8.794659300184161, + "grad_norm": 0.18106494843959808, + "learning_rate": 3.763813553166068e-06, + "loss": 1.6936, + "step": 28653 + }, + { + "epoch": 8.794966236955187, + "grad_norm": 0.17213653028011322, + "learning_rate": 3.761921792852713e-06, + "loss": 1.7223, + "step": 28654 + }, + { + "epoch": 8.795273173726212, + "grad_norm": 0.14013275504112244, + "learning_rate": 3.7600304894911562e-06, + "loss": 1.7082, + "step": 28655 + }, + { + "epoch": 8.795580110497237, + "grad_norm": 0.1625421643257141, + "learning_rate": 3.758139643100078e-06, + "loss": 1.719, + "step": 28656 + }, + { + "epoch": 8.795887047268263, + "grad_norm": 0.15947094559669495, + "learning_rate": 3.756249253698174e-06, + "loss": 1.7448, + "step": 28657 + }, + { + "epoch": 8.796193984039288, + "grad_norm": 0.16739755868911743, + "learning_rate": 3.754359321304113e-06, + "loss": 1.7048, + "step": 28658 + }, + { + "epoch": 8.796500920810313, + "grad_norm": 0.17619092762470245, + "learning_rate": 3.7524698459365794e-06, + "loss": 1.7247, + "step": 28659 + }, + { + "epoch": 8.796807857581339, + "grad_norm": 0.19410766661167145, + "learning_rate": 3.7505808276142473e-06, + "loss": 1.6918, + "step": 28660 + }, + { + "epoch": 8.797114794352364, + 
"grad_norm": 0.13881324231624603, + "learning_rate": 3.74869226635578e-06, + "loss": 1.6997, + "step": 28661 + }, + { + "epoch": 8.79742173112339, + "grad_norm": 0.16185659170150757, + "learning_rate": 3.74680416217984e-06, + "loss": 1.6951, + "step": 28662 + }, + { + "epoch": 8.797728667894415, + "grad_norm": 0.4652320444583893, + "learning_rate": 3.744916515105107e-06, + "loss": 1.7521, + "step": 28663 + }, + { + "epoch": 8.798035604665438, + "grad_norm": 0.1286199539899826, + "learning_rate": 3.7430293251501992e-06, + "loss": 1.7106, + "step": 28664 + }, + { + "epoch": 8.798342541436464, + "grad_norm": 0.18184927105903625, + "learning_rate": 3.741142592333807e-06, + "loss": 1.7297, + "step": 28665 + }, + { + "epoch": 8.798649478207489, + "grad_norm": 0.1292438805103302, + "learning_rate": 3.7392563166745443e-06, + "loss": 1.6701, + "step": 28666 + }, + { + "epoch": 8.798956414978514, + "grad_norm": 0.16631865501403809, + "learning_rate": 3.7373704981910673e-06, + "loss": 1.7572, + "step": 28667 + }, + { + "epoch": 8.79926335174954, + "grad_norm": 0.13093185424804688, + "learning_rate": 3.7354851369020117e-06, + "loss": 1.6912, + "step": 28668 + }, + { + "epoch": 8.799570288520565, + "grad_norm": 0.16165922582149506, + "learning_rate": 3.7336002328260123e-06, + "loss": 1.668, + "step": 28669 + }, + { + "epoch": 8.79987722529159, + "grad_norm": 0.1431419402360916, + "learning_rate": 3.7317157859816987e-06, + "loss": 1.6499, + "step": 28670 + }, + { + "epoch": 8.800184162062616, + "grad_norm": 0.16933713853359222, + "learning_rate": 3.729831796387667e-06, + "loss": 1.7081, + "step": 28671 + }, + { + "epoch": 8.800491098833641, + "grad_norm": 0.15956951677799225, + "learning_rate": 3.727948264062575e-06, + "loss": 1.6981, + "step": 28672 + }, + { + "epoch": 8.800798035604666, + "grad_norm": 0.17684711515903473, + "learning_rate": 3.726065189025013e-06, + "loss": 1.7254, + "step": 28673 + }, + { + "epoch": 8.80110497237569, + "grad_norm": 0.20180673897266388, + 
"learning_rate": 3.7241825712935997e-06, + "loss": 1.764, + "step": 28674 + }, + { + "epoch": 8.801411909146715, + "grad_norm": 0.165853351354599, + "learning_rate": 3.7223004108869307e-06, + "loss": 1.7275, + "step": 28675 + }, + { + "epoch": 8.80171884591774, + "grad_norm": 0.25295981764793396, + "learning_rate": 3.72041870782362e-06, + "loss": 1.8427, + "step": 28676 + }, + { + "epoch": 8.802025782688766, + "grad_norm": 0.14879196882247925, + "learning_rate": 3.7185374621222567e-06, + "loss": 1.6921, + "step": 28677 + }, + { + "epoch": 8.802332719459791, + "grad_norm": 0.159479022026062, + "learning_rate": 3.716656673801433e-06, + "loss": 1.699, + "step": 28678 + }, + { + "epoch": 8.802639656230816, + "grad_norm": 0.1288701742887497, + "learning_rate": 3.714776342879722e-06, + "loss": 1.6872, + "step": 28679 + }, + { + "epoch": 8.802946593001842, + "grad_norm": 0.15079650282859802, + "learning_rate": 3.712896469375743e-06, + "loss": 1.6873, + "step": 28680 + }, + { + "epoch": 8.803253529772867, + "grad_norm": 0.1662154346704483, + "learning_rate": 3.7110170533080304e-06, + "loss": 1.7451, + "step": 28681 + }, + { + "epoch": 8.803560466543892, + "grad_norm": 0.1374291628599167, + "learning_rate": 3.709138094695197e-06, + "loss": 1.6698, + "step": 28682 + }, + { + "epoch": 8.803867403314918, + "grad_norm": 0.13723774254322052, + "learning_rate": 3.707259593555773e-06, + "loss": 1.734, + "step": 28683 + }, + { + "epoch": 8.804174340085943, + "grad_norm": 0.15156403183937073, + "learning_rate": 3.7053815499083543e-06, + "loss": 1.7228, + "step": 28684 + }, + { + "epoch": 8.804481276856967, + "grad_norm": 0.15390744805335999, + "learning_rate": 3.7035039637714876e-06, + "loss": 1.7659, + "step": 28685 + }, + { + "epoch": 8.804788213627992, + "grad_norm": 0.13234136998653412, + "learning_rate": 3.7016268351637297e-06, + "loss": 1.684, + "step": 28686 + }, + { + "epoch": 8.805095150399017, + "grad_norm": 0.20412379503250122, + "learning_rate": 3.699750164103638e-06, + 
"loss": 1.7228, + "step": 28687 + }, + { + "epoch": 8.805402087170043, + "grad_norm": 0.15076974034309387, + "learning_rate": 3.697873950609737e-06, + "loss": 1.7029, + "step": 28688 + }, + { + "epoch": 8.805709023941068, + "grad_norm": 0.13920028507709503, + "learning_rate": 3.6959981947005952e-06, + "loss": 1.6905, + "step": 28689 + }, + { + "epoch": 8.806015960712093, + "grad_norm": 0.13444112241268158, + "learning_rate": 3.694122896394736e-06, + "loss": 1.6483, + "step": 28690 + }, + { + "epoch": 8.806322897483119, + "grad_norm": 0.18719401955604553, + "learning_rate": 3.692248055710701e-06, + "loss": 1.7326, + "step": 28691 + }, + { + "epoch": 8.806629834254144, + "grad_norm": 0.2103775292634964, + "learning_rate": 3.690373672667008e-06, + "loss": 1.8134, + "step": 28692 + }, + { + "epoch": 8.80693677102517, + "grad_norm": 0.14053337275981903, + "learning_rate": 3.6884997472821814e-06, + "loss": 1.713, + "step": 28693 + }, + { + "epoch": 8.807243707796195, + "grad_norm": 0.21146062016487122, + "learning_rate": 3.686626279574751e-06, + "loss": 1.767, + "step": 28694 + }, + { + "epoch": 8.807550644567218, + "grad_norm": 0.1462959349155426, + "learning_rate": 3.6847532695632236e-06, + "loss": 1.7002, + "step": 28695 + }, + { + "epoch": 8.807857581338244, + "grad_norm": 0.13064992427825928, + "learning_rate": 3.682880717266102e-06, + "loss": 1.6927, + "step": 28696 + }, + { + "epoch": 8.808164518109269, + "grad_norm": 0.11652515083551407, + "learning_rate": 3.6810086227019147e-06, + "loss": 1.6717, + "step": 28697 + }, + { + "epoch": 8.808471454880294, + "grad_norm": 0.14266341924667358, + "learning_rate": 3.679136985889131e-06, + "loss": 1.6843, + "step": 28698 + }, + { + "epoch": 8.80877839165132, + "grad_norm": 0.15322953462600708, + "learning_rate": 3.677265806846286e-06, + "loss": 1.6947, + "step": 28699 + }, + { + "epoch": 8.809085328422345, + "grad_norm": 0.1330055147409439, + "learning_rate": 3.675395085591832e-06, + "loss": 1.7386, + "step": 28700 + }, + 
{ + "epoch": 8.80939226519337, + "grad_norm": 0.14793124794960022, + "learning_rate": 3.6735248221442807e-06, + "loss": 1.6841, + "step": 28701 + }, + { + "epoch": 8.809699201964396, + "grad_norm": 0.13912439346313477, + "learning_rate": 3.6716550165221185e-06, + "loss": 1.697, + "step": 28702 + }, + { + "epoch": 8.810006138735421, + "grad_norm": 0.17170770466327667, + "learning_rate": 3.669785668743808e-06, + "loss": 1.7158, + "step": 28703 + }, + { + "epoch": 8.810313075506446, + "grad_norm": 0.14432193338871002, + "learning_rate": 3.66791677882784e-06, + "loss": 1.6617, + "step": 28704 + }, + { + "epoch": 8.810620012277472, + "grad_norm": 0.14610548317432404, + "learning_rate": 3.666048346792661e-06, + "loss": 1.6677, + "step": 28705 + }, + { + "epoch": 8.810926949048497, + "grad_norm": 0.15598154067993164, + "learning_rate": 3.664180372656756e-06, + "loss": 1.6847, + "step": 28706 + }, + { + "epoch": 8.81123388581952, + "grad_norm": 0.11805412918329239, + "learning_rate": 3.662312856438577e-06, + "loss": 1.668, + "step": 28707 + }, + { + "epoch": 8.811540822590546, + "grad_norm": 0.16846078634262085, + "learning_rate": 3.660445798156581e-06, + "loss": 1.7295, + "step": 28708 + }, + { + "epoch": 8.811847759361571, + "grad_norm": 0.11984262615442276, + "learning_rate": 3.658579197829226e-06, + "loss": 1.6711, + "step": 28709 + }, + { + "epoch": 8.812154696132596, + "grad_norm": 0.13624878227710724, + "learning_rate": 3.6567130554749476e-06, + "loss": 1.665, + "step": 28710 + }, + { + "epoch": 8.812461632903622, + "grad_norm": 0.19053621590137482, + "learning_rate": 3.654847371112197e-06, + "loss": 1.7301, + "step": 28711 + }, + { + "epoch": 8.812768569674647, + "grad_norm": 0.12689290940761566, + "learning_rate": 3.6529821447594036e-06, + "loss": 1.6683, + "step": 28712 + }, + { + "epoch": 8.813075506445673, + "grad_norm": 0.20414969325065613, + "learning_rate": 3.6511173764350094e-06, + "loss": 1.7787, + "step": 28713 + }, + { + "epoch": 8.813382443216698, + 
"grad_norm": 0.1935388743877411, + "learning_rate": 3.6492530661574377e-06, + "loss": 1.7021, + "step": 28714 + }, + { + "epoch": 8.813689379987723, + "grad_norm": 0.15490898489952087, + "learning_rate": 3.6473892139451072e-06, + "loss": 1.7155, + "step": 28715 + }, + { + "epoch": 8.813996316758749, + "grad_norm": 0.2282942682504654, + "learning_rate": 3.6455258198164587e-06, + "loss": 1.6895, + "step": 28716 + }, + { + "epoch": 8.814303253529772, + "grad_norm": 0.12892891466617584, + "learning_rate": 3.643662883789878e-06, + "loss": 1.6478, + "step": 28717 + }, + { + "epoch": 8.814610190300797, + "grad_norm": 0.12005404382944107, + "learning_rate": 3.641800405883811e-06, + "loss": 1.6955, + "step": 28718 + }, + { + "epoch": 8.814917127071823, + "grad_norm": 0.15036113560199738, + "learning_rate": 3.639938386116626e-06, + "loss": 1.7104, + "step": 28719 + }, + { + "epoch": 8.815224063842848, + "grad_norm": 0.13082142174243927, + "learning_rate": 3.6380768245067478e-06, + "loss": 1.6797, + "step": 28720 + }, + { + "epoch": 8.815531000613873, + "grad_norm": 0.12086073309183121, + "learning_rate": 3.6362157210725778e-06, + "loss": 1.6478, + "step": 28721 + }, + { + "epoch": 8.815837937384899, + "grad_norm": 0.15807145833969116, + "learning_rate": 3.6343550758324797e-06, + "loss": 1.6987, + "step": 28722 + }, + { + "epoch": 8.816144874155924, + "grad_norm": 0.1517954170703888, + "learning_rate": 3.6324948888048715e-06, + "loss": 1.7048, + "step": 28723 + }, + { + "epoch": 8.81645181092695, + "grad_norm": 0.12381365150213242, + "learning_rate": 3.6306351600081223e-06, + "loss": 1.6788, + "step": 28724 + }, + { + "epoch": 8.816758747697975, + "grad_norm": 0.14769119024276733, + "learning_rate": 3.6287758894606173e-06, + "loss": 1.6961, + "step": 28725 + }, + { + "epoch": 8.817065684469, + "grad_norm": 0.13606438040733337, + "learning_rate": 3.6269170771807305e-06, + "loss": 1.6603, + "step": 28726 + }, + { + "epoch": 8.817372621240025, + "grad_norm": 0.1724759191274643, 
+ "learning_rate": 3.625058723186825e-06, + "loss": 1.7054, + "step": 28727 + }, + { + "epoch": 8.817679558011049, + "grad_norm": 0.1703757792711258, + "learning_rate": 3.6232008274972753e-06, + "loss": 1.7539, + "step": 28728 + }, + { + "epoch": 8.817986494782074, + "grad_norm": 0.17725473642349243, + "learning_rate": 3.621343390130433e-06, + "loss": 1.7774, + "step": 28729 + }, + { + "epoch": 8.8182934315531, + "grad_norm": 0.12104978412389755, + "learning_rate": 3.6194864111046558e-06, + "loss": 1.6966, + "step": 28730 + }, + { + "epoch": 8.818600368324125, + "grad_norm": 0.15737809240818024, + "learning_rate": 3.6176298904383066e-06, + "loss": 1.7527, + "step": 28731 + }, + { + "epoch": 8.81890730509515, + "grad_norm": 0.2053712159395218, + "learning_rate": 3.61577382814971e-06, + "loss": 1.695, + "step": 28732 + }, + { + "epoch": 8.819214241866176, + "grad_norm": 0.17244333028793335, + "learning_rate": 3.61391822425724e-06, + "loss": 1.7748, + "step": 28733 + }, + { + "epoch": 8.819521178637201, + "grad_norm": 0.10550814867019653, + "learning_rate": 3.612063078779204e-06, + "loss": 1.6216, + "step": 28734 + }, + { + "epoch": 8.819828115408226, + "grad_norm": 0.12428541481494904, + "learning_rate": 3.6102083917339657e-06, + "loss": 1.6863, + "step": 28735 + }, + { + "epoch": 8.820135052179252, + "grad_norm": 0.1403985470533371, + "learning_rate": 3.608354163139821e-06, + "loss": 1.7582, + "step": 28736 + }, + { + "epoch": 8.820441988950277, + "grad_norm": 0.14146897196769714, + "learning_rate": 3.6065003930151163e-06, + "loss": 1.6711, + "step": 28737 + }, + { + "epoch": 8.8207489257213, + "grad_norm": 0.1309487670660019, + "learning_rate": 3.6046470813781763e-06, + "loss": 1.6553, + "step": 28738 + }, + { + "epoch": 8.821055862492326, + "grad_norm": 0.16398943960666656, + "learning_rate": 3.602794228247297e-06, + "loss": 1.7097, + "step": 28739 + }, + { + "epoch": 8.821362799263351, + "grad_norm": 0.13138768076896667, + "learning_rate": 3.6009418336408085e-06, 
+ "loss": 1.6641, + "step": 28740 + }, + { + "epoch": 8.821669736034377, + "grad_norm": 0.14470353722572327, + "learning_rate": 3.599089897576996e-06, + "loss": 1.6626, + "step": 28741 + }, + { + "epoch": 8.821976672805402, + "grad_norm": 0.17124676704406738, + "learning_rate": 3.597238420074178e-06, + "loss": 1.7347, + "step": 28742 + }, + { + "epoch": 8.822283609576427, + "grad_norm": 0.19663479924201965, + "learning_rate": 3.595387401150652e-06, + "loss": 1.7267, + "step": 28743 + }, + { + "epoch": 8.822590546347453, + "grad_norm": 0.14935022592544556, + "learning_rate": 3.5935368408247016e-06, + "loss": 1.7001, + "step": 28744 + }, + { + "epoch": 8.822897483118478, + "grad_norm": 0.13796019554138184, + "learning_rate": 3.591686739114625e-06, + "loss": 1.6774, + "step": 28745 + }, + { + "epoch": 8.823204419889503, + "grad_norm": 0.19741731882095337, + "learning_rate": 3.5898370960386952e-06, + "loss": 1.6887, + "step": 28746 + }, + { + "epoch": 8.823511356660529, + "grad_norm": 0.17089900374412537, + "learning_rate": 3.5879879116151984e-06, + "loss": 1.6869, + "step": 28747 + }, + { + "epoch": 8.823818293431554, + "grad_norm": 0.13532526791095734, + "learning_rate": 3.5861391858624083e-06, + "loss": 1.6525, + "step": 28748 + }, + { + "epoch": 8.824125230202577, + "grad_norm": 0.15727277100086212, + "learning_rate": 3.5842909187985886e-06, + "loss": 1.725, + "step": 28749 + }, + { + "epoch": 8.824432166973603, + "grad_norm": 0.14250576496124268, + "learning_rate": 3.5824431104420298e-06, + "loss": 1.6728, + "step": 28750 + }, + { + "epoch": 8.824739103744628, + "grad_norm": 0.1596658080816269, + "learning_rate": 3.580595760810951e-06, + "loss": 1.6933, + "step": 28751 + }, + { + "epoch": 8.825046040515653, + "grad_norm": 0.2319880872964859, + "learning_rate": 3.5787488699236537e-06, + "loss": 1.744, + "step": 28752 + }, + { + "epoch": 8.825352977286679, + "grad_norm": 0.12813101708889008, + "learning_rate": 3.5769024377983517e-06, + "loss": 1.7022, + "step": 
28753 + }, + { + "epoch": 8.825659914057704, + "grad_norm": 0.1346128284931183, + "learning_rate": 3.5750564644533137e-06, + "loss": 1.6755, + "step": 28754 + }, + { + "epoch": 8.82596685082873, + "grad_norm": 0.1405024230480194, + "learning_rate": 3.5732109499067913e-06, + "loss": 1.6662, + "step": 28755 + }, + { + "epoch": 8.826273787599755, + "grad_norm": 0.16663044691085815, + "learning_rate": 3.571365894176992e-06, + "loss": 1.7237, + "step": 28756 + }, + { + "epoch": 8.82658072437078, + "grad_norm": 0.19339314103126526, + "learning_rate": 3.56952129728218e-06, + "loss": 1.729, + "step": 28757 + }, + { + "epoch": 8.826887661141805, + "grad_norm": 0.18851202726364136, + "learning_rate": 3.5676771592405624e-06, + "loss": 1.6923, + "step": 28758 + }, + { + "epoch": 8.82719459791283, + "grad_norm": 0.15386530756950378, + "learning_rate": 3.5658334800703797e-06, + "loss": 1.695, + "step": 28759 + }, + { + "epoch": 8.827501534683854, + "grad_norm": 0.17883063852787018, + "learning_rate": 3.5639902597898455e-06, + "loss": 1.746, + "step": 28760 + }, + { + "epoch": 8.82780847145488, + "grad_norm": 0.15690109133720398, + "learning_rate": 3.5621474984171733e-06, + "loss": 1.6937, + "step": 28761 + }, + { + "epoch": 8.828115408225905, + "grad_norm": 0.19555453956127167, + "learning_rate": 3.5603051959705815e-06, + "loss": 1.7524, + "step": 28762 + }, + { + "epoch": 8.82842234499693, + "grad_norm": 0.13835586607456207, + "learning_rate": 3.558463352468272e-06, + "loss": 1.6975, + "step": 28763 + }, + { + "epoch": 8.828729281767956, + "grad_norm": 0.13608703017234802, + "learning_rate": 3.556621967928453e-06, + "loss": 1.6588, + "step": 28764 + }, + { + "epoch": 8.829036218538981, + "grad_norm": 0.1849900633096695, + "learning_rate": 3.5547810423693096e-06, + "loss": 1.7236, + "step": 28765 + }, + { + "epoch": 8.829343155310006, + "grad_norm": 0.13603585958480835, + "learning_rate": 3.5529405758090382e-06, + "loss": 1.69, + "step": 28766 + }, + { + "epoch": 
8.829650092081032, + "grad_norm": 0.12596213817596436, + "learning_rate": 3.5511005682658473e-06, + "loss": 1.7069, + "step": 28767 + }, + { + "epoch": 8.829957028852057, + "grad_norm": 0.17949149012565613, + "learning_rate": 3.549261019757888e-06, + "loss": 1.7836, + "step": 28768 + }, + { + "epoch": 8.830263965623082, + "grad_norm": 0.17237712442874908, + "learning_rate": 3.547421930303374e-06, + "loss": 1.6978, + "step": 28769 + }, + { + "epoch": 8.830570902394108, + "grad_norm": 0.16467876732349396, + "learning_rate": 3.5455832999204517e-06, + "loss": 1.7526, + "step": 28770 + }, + { + "epoch": 8.830877839165131, + "grad_norm": 0.1549120396375656, + "learning_rate": 3.5437451286273014e-06, + "loss": 1.6955, + "step": 28771 + }, + { + "epoch": 8.831184775936157, + "grad_norm": 0.24028703570365906, + "learning_rate": 3.541907416442103e-06, + "loss": 1.7547, + "step": 28772 + }, + { + "epoch": 8.831491712707182, + "grad_norm": 0.17325441539287567, + "learning_rate": 3.5400701633829856e-06, + "loss": 1.7041, + "step": 28773 + }, + { + "epoch": 8.831798649478207, + "grad_norm": 0.15597397089004517, + "learning_rate": 3.5382333694681467e-06, + "loss": 1.6997, + "step": 28774 + }, + { + "epoch": 8.832105586249233, + "grad_norm": 0.14938347041606903, + "learning_rate": 3.5363970347156994e-06, + "loss": 1.7271, + "step": 28775 + }, + { + "epoch": 8.832412523020258, + "grad_norm": 0.17745234072208405, + "learning_rate": 3.534561159143823e-06, + "loss": 1.714, + "step": 28776 + }, + { + "epoch": 8.832719459791283, + "grad_norm": 0.15323567390441895, + "learning_rate": 3.532725742770643e-06, + "loss": 1.7079, + "step": 28777 + }, + { + "epoch": 8.833026396562309, + "grad_norm": 0.15351314842700958, + "learning_rate": 3.5308907856143046e-06, + "loss": 1.733, + "step": 28778 + }, + { + "epoch": 8.833333333333334, + "grad_norm": 0.19209100306034088, + "learning_rate": 3.5290562876929388e-06, + "loss": 1.7362, + "step": 28779 + }, + { + "epoch": 8.83364027010436, + 
"grad_norm": 0.2092818021774292, + "learning_rate": 3.5272222490246753e-06, + "loss": 1.7682, + "step": 28780 + }, + { + "epoch": 8.833947206875383, + "grad_norm": 0.21600767970085144, + "learning_rate": 3.5253886696276383e-06, + "loss": 1.8015, + "step": 28781 + }, + { + "epoch": 8.834254143646408, + "grad_norm": 0.11457479000091553, + "learning_rate": 3.5235555495199525e-06, + "loss": 1.6582, + "step": 28782 + }, + { + "epoch": 8.834561080417433, + "grad_norm": 0.1698341816663742, + "learning_rate": 3.5217228887197253e-06, + "loss": 1.7348, + "step": 28783 + }, + { + "epoch": 8.834868017188459, + "grad_norm": 0.1234394982457161, + "learning_rate": 3.5198906872450866e-06, + "loss": 1.6819, + "step": 28784 + }, + { + "epoch": 8.835174953959484, + "grad_norm": 0.15412946045398712, + "learning_rate": 3.518058945114117e-06, + "loss": 1.6972, + "step": 28785 + }, + { + "epoch": 8.83548189073051, + "grad_norm": 0.16202808916568756, + "learning_rate": 3.516227662344951e-06, + "loss": 1.7439, + "step": 28786 + }, + { + "epoch": 8.835788827501535, + "grad_norm": 0.1599927842617035, + "learning_rate": 3.514396838955658e-06, + "loss": 1.7012, + "step": 28787 + }, + { + "epoch": 8.83609576427256, + "grad_norm": 0.1487586498260498, + "learning_rate": 3.512566474964335e-06, + "loss": 1.6844, + "step": 28788 + }, + { + "epoch": 8.836402701043585, + "grad_norm": 0.18033012747764587, + "learning_rate": 3.5107365703890892e-06, + "loss": 1.7855, + "step": 28789 + }, + { + "epoch": 8.83670963781461, + "grad_norm": 0.18171031773090363, + "learning_rate": 3.508907125247979e-06, + "loss": 1.703, + "step": 28790 + }, + { + "epoch": 8.837016574585636, + "grad_norm": 0.14102062582969666, + "learning_rate": 3.507078139559117e-06, + "loss": 1.6627, + "step": 28791 + }, + { + "epoch": 8.83732351135666, + "grad_norm": 0.16365323960781097, + "learning_rate": 3.505249613340539e-06, + "loss": 1.7317, + "step": 28792 + }, + { + "epoch": 8.837630448127685, + "grad_norm": 0.1492282748222351, + 
"learning_rate": 3.5034215466103417e-06, + "loss": 1.6633, + "step": 28793 + }, + { + "epoch": 8.83793738489871, + "grad_norm": 0.18670693039894104, + "learning_rate": 3.5015939393865937e-06, + "loss": 1.7233, + "step": 28794 + }, + { + "epoch": 8.838244321669736, + "grad_norm": 0.16062071919441223, + "learning_rate": 3.499766791687342e-06, + "loss": 1.7238, + "step": 28795 + }, + { + "epoch": 8.838551258440761, + "grad_norm": 0.158021941781044, + "learning_rate": 3.4979401035306504e-06, + "loss": 1.705, + "step": 28796 + }, + { + "epoch": 8.838858195211786, + "grad_norm": 0.14865651726722717, + "learning_rate": 3.49611387493457e-06, + "loss": 1.6777, + "step": 28797 + }, + { + "epoch": 8.839165131982812, + "grad_norm": 0.12111876904964447, + "learning_rate": 3.4942881059171483e-06, + "loss": 1.6273, + "step": 28798 + }, + { + "epoch": 8.839472068753837, + "grad_norm": 0.12468799948692322, + "learning_rate": 3.4924627964964318e-06, + "loss": 1.6626, + "step": 28799 + }, + { + "epoch": 8.839779005524862, + "grad_norm": 0.12292506545782089, + "learning_rate": 3.490637946690445e-06, + "loss": 1.6448, + "step": 28800 + }, + { + "epoch": 8.840085942295888, + "grad_norm": 0.16731779277324677, + "learning_rate": 3.4888135565172563e-06, + "loss": 1.7541, + "step": 28801 + }, + { + "epoch": 8.840392879066911, + "grad_norm": 0.16351507604122162, + "learning_rate": 3.486989625994852e-06, + "loss": 1.699, + "step": 28802 + }, + { + "epoch": 8.840699815837937, + "grad_norm": 0.12385114282369614, + "learning_rate": 3.485166155141295e-06, + "loss": 1.6852, + "step": 28803 + }, + { + "epoch": 8.841006752608962, + "grad_norm": 0.20780152082443237, + "learning_rate": 3.4833431439745822e-06, + "loss": 1.7179, + "step": 28804 + }, + { + "epoch": 8.841313689379987, + "grad_norm": 0.16182561218738556, + "learning_rate": 3.481520592512727e-06, + "loss": 1.7457, + "step": 28805 + }, + { + "epoch": 8.841620626151013, + "grad_norm": 0.1332414746284485, + "learning_rate": 
3.4796985007737705e-06, + "loss": 1.7272, + "step": 28806 + }, + { + "epoch": 8.841927562922038, + "grad_norm": 0.14266319572925568, + "learning_rate": 3.477876868775681e-06, + "loss": 1.7207, + "step": 28807 + }, + { + "epoch": 8.842234499693063, + "grad_norm": 0.162164106965065, + "learning_rate": 3.4760556965364953e-06, + "loss": 1.6948, + "step": 28808 + }, + { + "epoch": 8.842541436464089, + "grad_norm": 0.14134974777698517, + "learning_rate": 3.474234984074182e-06, + "loss": 1.676, + "step": 28809 + }, + { + "epoch": 8.842848373235114, + "grad_norm": 0.16302376985549927, + "learning_rate": 3.4724147314067534e-06, + "loss": 1.7279, + "step": 28810 + }, + { + "epoch": 8.84315531000614, + "grad_norm": 0.1352432370185852, + "learning_rate": 3.4705949385521964e-06, + "loss": 1.7065, + "step": 28811 + }, + { + "epoch": 8.843462246777165, + "grad_norm": 0.13483819365501404, + "learning_rate": 3.46877560552849e-06, + "loss": 1.7275, + "step": 28812 + }, + { + "epoch": 8.84376918354819, + "grad_norm": 0.12226319313049316, + "learning_rate": 3.4669567323536157e-06, + "loss": 1.6965, + "step": 28813 + }, + { + "epoch": 8.844076120319214, + "grad_norm": 0.1687331646680832, + "learning_rate": 3.465138319045552e-06, + "loss": 1.6949, + "step": 28814 + }, + { + "epoch": 8.844383057090239, + "grad_norm": 0.17721997201442719, + "learning_rate": 3.4633203656222635e-06, + "loss": 1.6981, + "step": 28815 + }, + { + "epoch": 8.844689993861264, + "grad_norm": 0.14818120002746582, + "learning_rate": 3.4615028721017186e-06, + "loss": 1.687, + "step": 28816 + }, + { + "epoch": 8.84499693063229, + "grad_norm": 0.15871183574199677, + "learning_rate": 3.459685838501875e-06, + "loss": 1.7403, + "step": 28817 + }, + { + "epoch": 8.845303867403315, + "grad_norm": 0.16533036530017853, + "learning_rate": 3.4578692648407076e-06, + "loss": 1.7879, + "step": 28818 + }, + { + "epoch": 8.84561080417434, + "grad_norm": 0.18678778409957886, + "learning_rate": 3.456053151136135e-06, + "loss": 
1.7474, + "step": 28819 + }, + { + "epoch": 8.845917740945366, + "grad_norm": 0.12712402641773224, + "learning_rate": 3.4542374974061488e-06, + "loss": 1.6635, + "step": 28820 + }, + { + "epoch": 8.84622467771639, + "grad_norm": 0.15502063930034637, + "learning_rate": 3.4524223036686566e-06, + "loss": 1.7133, + "step": 28821 + }, + { + "epoch": 8.846531614487416, + "grad_norm": 0.17015717923641205, + "learning_rate": 3.4506075699416e-06, + "loss": 1.7514, + "step": 28822 + }, + { + "epoch": 8.846838551258442, + "grad_norm": 0.15805409848690033, + "learning_rate": 3.4487932962429415e-06, + "loss": 1.7253, + "step": 28823 + }, + { + "epoch": 8.847145488029465, + "grad_norm": 0.14090047776699066, + "learning_rate": 3.446979482590579e-06, + "loss": 1.6763, + "step": 28824 + }, + { + "epoch": 8.84745242480049, + "grad_norm": 0.18115323781967163, + "learning_rate": 3.445166129002464e-06, + "loss": 1.7575, + "step": 28825 + }, + { + "epoch": 8.847759361571516, + "grad_norm": 0.18050703406333923, + "learning_rate": 3.443353235496488e-06, + "loss": 1.7688, + "step": 28826 + }, + { + "epoch": 8.848066298342541, + "grad_norm": 0.13750851154327393, + "learning_rate": 3.441540802090587e-06, + "loss": 1.7416, + "step": 28827 + }, + { + "epoch": 8.848373235113566, + "grad_norm": 0.14183515310287476, + "learning_rate": 3.439728828802674e-06, + "loss": 1.6924, + "step": 28828 + }, + { + "epoch": 8.848680171884592, + "grad_norm": 0.16401416063308716, + "learning_rate": 3.4379173156506517e-06, + "loss": 1.7041, + "step": 28829 + }, + { + "epoch": 8.848987108655617, + "grad_norm": 0.1347450613975525, + "learning_rate": 3.4361062626524166e-06, + "loss": 1.7331, + "step": 28830 + }, + { + "epoch": 8.849294045426642, + "grad_norm": 0.16579827666282654, + "learning_rate": 3.4342956698258768e-06, + "loss": 1.7628, + "step": 28831 + }, + { + "epoch": 8.849600982197668, + "grad_norm": 0.18201382458209991, + "learning_rate": 3.4324855371889177e-06, + "loss": 1.7054, + "step": 28832 + }, + { + 
"epoch": 8.849907918968693, + "grad_norm": 0.1637437641620636, + "learning_rate": 3.430675864759425e-06, + "loss": 1.7393, + "step": 28833 + }, + { + "epoch": 8.850214855739718, + "grad_norm": 0.1596134454011917, + "learning_rate": 3.4288666525552848e-06, + "loss": 1.7102, + "step": 28834 + }, + { + "epoch": 8.850521792510742, + "grad_norm": 0.1999501883983612, + "learning_rate": 3.4270579005943994e-06, + "loss": 1.7547, + "step": 28835 + }, + { + "epoch": 8.850828729281767, + "grad_norm": 0.15011270344257355, + "learning_rate": 3.4252496088946097e-06, + "loss": 1.6387, + "step": 28836 + }, + { + "epoch": 8.851135666052793, + "grad_norm": 0.12606796622276306, + "learning_rate": 3.4234417774738124e-06, + "loss": 1.6633, + "step": 28837 + }, + { + "epoch": 8.851442602823818, + "grad_norm": 0.19459915161132812, + "learning_rate": 3.421634406349855e-06, + "loss": 1.7424, + "step": 28838 + }, + { + "epoch": 8.851749539594843, + "grad_norm": 0.1512998342514038, + "learning_rate": 3.4198274955406062e-06, + "loss": 1.7007, + "step": 28839 + }, + { + "epoch": 8.852056476365869, + "grad_norm": 0.19419771432876587, + "learning_rate": 3.4180210450639295e-06, + "loss": 1.7223, + "step": 28840 + }, + { + "epoch": 8.852363413136894, + "grad_norm": 0.17737379670143127, + "learning_rate": 3.41621505493766e-06, + "loss": 1.7309, + "step": 28841 + }, + { + "epoch": 8.85267034990792, + "grad_norm": 0.14393949508666992, + "learning_rate": 3.414409525179674e-06, + "loss": 1.7213, + "step": 28842 + }, + { + "epoch": 8.852977286678945, + "grad_norm": 0.11586382240056992, + "learning_rate": 3.412604455807783e-06, + "loss": 1.6675, + "step": 28843 + }, + { + "epoch": 8.85328422344997, + "grad_norm": 0.18049278855323792, + "learning_rate": 3.410799846839846e-06, + "loss": 1.7558, + "step": 28844 + }, + { + "epoch": 8.853591160220994, + "grad_norm": 0.20962421596050262, + "learning_rate": 3.408995698293693e-06, + "loss": 1.7222, + "step": 28845 + }, + { + "epoch": 8.853898096992019, + 
"grad_norm": 0.12382032722234726, + "learning_rate": 3.4071920101871547e-06, + "loss": 1.7149, + "step": 28846 + }, + { + "epoch": 8.854205033763044, + "grad_norm": 0.15395772457122803, + "learning_rate": 3.405388782538049e-06, + "loss": 1.6986, + "step": 28847 + }, + { + "epoch": 8.85451197053407, + "grad_norm": 0.1579637974500656, + "learning_rate": 3.403586015364202e-06, + "loss": 1.7208, + "step": 28848 + }, + { + "epoch": 8.854818907305095, + "grad_norm": 0.18486931920051575, + "learning_rate": 3.4017837086834315e-06, + "loss": 1.7554, + "step": 28849 + }, + { + "epoch": 8.85512584407612, + "grad_norm": 0.1619080752134323, + "learning_rate": 3.399981862513546e-06, + "loss": 1.7581, + "step": 28850 + }, + { + "epoch": 8.855432780847146, + "grad_norm": 0.14540675282478333, + "learning_rate": 3.3981804768723425e-06, + "loss": 1.7391, + "step": 28851 + }, + { + "epoch": 8.855739717618171, + "grad_norm": 0.17640653252601624, + "learning_rate": 3.396379551777651e-06, + "loss": 1.807, + "step": 28852 + }, + { + "epoch": 8.856046654389196, + "grad_norm": 0.18279080092906952, + "learning_rate": 3.394579087247235e-06, + "loss": 1.7195, + "step": 28853 + }, + { + "epoch": 8.856353591160222, + "grad_norm": 0.17531390488147736, + "learning_rate": 3.3927790832989247e-06, + "loss": 1.7253, + "step": 28854 + }, + { + "epoch": 8.856660527931247, + "grad_norm": 0.14441180229187012, + "learning_rate": 3.3909795399504783e-06, + "loss": 1.7078, + "step": 28855 + }, + { + "epoch": 8.856967464702272, + "grad_norm": 0.16991926729679108, + "learning_rate": 3.3891804572196816e-06, + "loss": 1.6953, + "step": 28856 + }, + { + "epoch": 8.857274401473296, + "grad_norm": 0.17067831754684448, + "learning_rate": 3.3873818351243426e-06, + "loss": 1.7294, + "step": 28857 + }, + { + "epoch": 8.857581338244321, + "grad_norm": 0.14316415786743164, + "learning_rate": 3.3855836736821967e-06, + "loss": 1.7152, + "step": 28858 + }, + { + "epoch": 8.857888275015346, + "grad_norm": 0.13260309398174286, 
+ "learning_rate": 3.383785972911052e-06, + "loss": 1.6761, + "step": 28859 + }, + { + "epoch": 8.858195211786372, + "grad_norm": 0.12228702753782272, + "learning_rate": 3.3819887328286394e-06, + "loss": 1.6802, + "step": 28860 + }, + { + "epoch": 8.858502148557397, + "grad_norm": 0.18033485114574432, + "learning_rate": 3.3801919534527495e-06, + "loss": 1.7828, + "step": 28861 + }, + { + "epoch": 8.858809085328422, + "grad_norm": 0.1613384336233139, + "learning_rate": 3.3783956348011235e-06, + "loss": 1.7068, + "step": 28862 + }, + { + "epoch": 8.859116022099448, + "grad_norm": 0.19849342107772827, + "learning_rate": 3.3765997768915204e-06, + "loss": 1.7139, + "step": 28863 + }, + { + "epoch": 8.859422958870473, + "grad_norm": 0.1470731794834137, + "learning_rate": 3.3748043797416804e-06, + "loss": 1.7104, + "step": 28864 + }, + { + "epoch": 8.859729895641498, + "grad_norm": 0.15868861973285675, + "learning_rate": 3.373009443369357e-06, + "loss": 1.7662, + "step": 28865 + }, + { + "epoch": 8.860036832412524, + "grad_norm": 0.17230434715747833, + "learning_rate": 3.37121496779228e-06, + "loss": 1.6877, + "step": 28866 + }, + { + "epoch": 8.860343769183547, + "grad_norm": 0.1297665536403656, + "learning_rate": 3.3694209530281905e-06, + "loss": 1.6687, + "step": 28867 + }, + { + "epoch": 8.860650705954573, + "grad_norm": 0.13699746131896973, + "learning_rate": 3.3676273990948136e-06, + "loss": 1.6773, + "step": 28868 + }, + { + "epoch": 8.860957642725598, + "grad_norm": 0.12981395423412323, + "learning_rate": 3.3658343060098685e-06, + "loss": 1.6752, + "step": 28869 + }, + { + "epoch": 8.861264579496623, + "grad_norm": 0.15934717655181885, + "learning_rate": 3.3640416737910794e-06, + "loss": 1.7449, + "step": 28870 + }, + { + "epoch": 8.861571516267649, + "grad_norm": 0.13023978471755981, + "learning_rate": 3.3622495024561827e-06, + "loss": 1.698, + "step": 28871 + }, + { + "epoch": 8.861878453038674, + "grad_norm": 0.14700792729854584, + "learning_rate": 
3.3604577920228585e-06, + "loss": 1.732, + "step": 28872 + }, + { + "epoch": 8.8621853898097, + "grad_norm": 0.1421707421541214, + "learning_rate": 3.3586665425088314e-06, + "loss": 1.7032, + "step": 28873 + }, + { + "epoch": 8.862492326580725, + "grad_norm": 0.1941523402929306, + "learning_rate": 3.356875753931793e-06, + "loss": 1.7407, + "step": 28874 + }, + { + "epoch": 8.86279926335175, + "grad_norm": 0.15837855637073517, + "learning_rate": 3.3550854263094454e-06, + "loss": 1.755, + "step": 28875 + }, + { + "epoch": 8.863106200122775, + "grad_norm": 0.1624121218919754, + "learning_rate": 3.3532955596594916e-06, + "loss": 1.738, + "step": 28876 + }, + { + "epoch": 8.8634131368938, + "grad_norm": 0.15944771468639374, + "learning_rate": 3.3515061539996007e-06, + "loss": 1.6955, + "step": 28877 + }, + { + "epoch": 8.863720073664824, + "grad_norm": 0.17303216457366943, + "learning_rate": 3.349717209347475e-06, + "loss": 1.7012, + "step": 28878 + }, + { + "epoch": 8.86402701043585, + "grad_norm": 0.14601273834705353, + "learning_rate": 3.347928725720789e-06, + "loss": 1.696, + "step": 28879 + }, + { + "epoch": 8.864333947206875, + "grad_norm": 0.1746055781841278, + "learning_rate": 3.3461407031372125e-06, + "loss": 1.6991, + "step": 28880 + }, + { + "epoch": 8.8646408839779, + "grad_norm": 0.12818776071071625, + "learning_rate": 3.3443531416144147e-06, + "loss": 1.6828, + "step": 28881 + }, + { + "epoch": 8.864947820748926, + "grad_norm": 0.12297061085700989, + "learning_rate": 3.3425660411700697e-06, + "loss": 1.6483, + "step": 28882 + }, + { + "epoch": 8.865254757519951, + "grad_norm": 0.1359318494796753, + "learning_rate": 3.3407794018218307e-06, + "loss": 1.7182, + "step": 28883 + }, + { + "epoch": 8.865561694290976, + "grad_norm": 0.11981796473264694, + "learning_rate": 3.3389932235873612e-06, + "loss": 1.6935, + "step": 28884 + }, + { + "epoch": 8.865868631062002, + "grad_norm": 0.1271422654390335, + "learning_rate": 3.337207506484308e-06, + "loss": 1.6776, + 
"step": 28885 + }, + { + "epoch": 8.866175567833027, + "grad_norm": 0.1494673788547516, + "learning_rate": 3.335422250530318e-06, + "loss": 1.7041, + "step": 28886 + }, + { + "epoch": 8.866482504604052, + "grad_norm": 0.15046460926532745, + "learning_rate": 3.3336374557430272e-06, + "loss": 1.6714, + "step": 28887 + }, + { + "epoch": 8.866789441375076, + "grad_norm": 0.17862144112586975, + "learning_rate": 3.331853122140105e-06, + "loss": 1.7805, + "step": 28888 + }, + { + "epoch": 8.867096378146101, + "grad_norm": 0.13172993063926697, + "learning_rate": 3.3300692497391483e-06, + "loss": 1.6841, + "step": 28889 + }, + { + "epoch": 8.867403314917127, + "grad_norm": 0.20627157390117645, + "learning_rate": 3.3282858385578098e-06, + "loss": 1.8127, + "step": 28890 + }, + { + "epoch": 8.867710251688152, + "grad_norm": 0.22035779058933258, + "learning_rate": 3.326502888613697e-06, + "loss": 1.7813, + "step": 28891 + }, + { + "epoch": 8.868017188459177, + "grad_norm": 0.15250372886657715, + "learning_rate": 3.3247203999244358e-06, + "loss": 1.7192, + "step": 28892 + }, + { + "epoch": 8.868324125230203, + "grad_norm": 0.1745261251926422, + "learning_rate": 3.3229383725076614e-06, + "loss": 1.72, + "step": 28893 + }, + { + "epoch": 8.868631062001228, + "grad_norm": 0.1768372803926468, + "learning_rate": 3.3211568063809483e-06, + "loss": 1.7582, + "step": 28894 + }, + { + "epoch": 8.868937998772253, + "grad_norm": 0.14829827845096588, + "learning_rate": 3.3193757015619443e-06, + "loss": 1.6749, + "step": 28895 + }, + { + "epoch": 8.869244935543279, + "grad_norm": 0.13321566581726074, + "learning_rate": 3.3175950580682123e-06, + "loss": 1.6854, + "step": 28896 + }, + { + "epoch": 8.869551872314304, + "grad_norm": 0.12003330886363983, + "learning_rate": 3.315814875917372e-06, + "loss": 1.6611, + "step": 28897 + }, + { + "epoch": 8.86985880908533, + "grad_norm": 0.1468251645565033, + "learning_rate": 3.3140351551270157e-06, + "loss": 1.6674, + "step": 28898 + }, + { + "epoch": 
8.870165745856355, + "grad_norm": 0.2222270667552948, + "learning_rate": 3.312255895714722e-06, + "loss": 1.6472, + "step": 28899 + }, + { + "epoch": 8.870472682627378, + "grad_norm": 0.14377200603485107, + "learning_rate": 3.3104770976980836e-06, + "loss": 1.6835, + "step": 28900 + }, + { + "epoch": 8.870779619398403, + "grad_norm": 0.19064709544181824, + "learning_rate": 3.3086987610946807e-06, + "loss": 1.7172, + "step": 28901 + }, + { + "epoch": 8.871086556169429, + "grad_norm": 0.21035094559192657, + "learning_rate": 3.306920885922077e-06, + "loss": 1.7199, + "step": 28902 + }, + { + "epoch": 8.871393492940454, + "grad_norm": 0.1529282182455063, + "learning_rate": 3.3051434721978526e-06, + "loss": 1.672, + "step": 28903 + }, + { + "epoch": 8.87170042971148, + "grad_norm": 0.13990004360675812, + "learning_rate": 3.3033665199395546e-06, + "loss": 1.7204, + "step": 28904 + }, + { + "epoch": 8.872007366482505, + "grad_norm": 0.20450010895729065, + "learning_rate": 3.3015900291647805e-06, + "loss": 1.7619, + "step": 28905 + }, + { + "epoch": 8.87231430325353, + "grad_norm": 0.13215813040733337, + "learning_rate": 3.2998139998910547e-06, + "loss": 1.6999, + "step": 28906 + }, + { + "epoch": 8.872621240024555, + "grad_norm": 0.12693628668785095, + "learning_rate": 3.2980384321359413e-06, + "loss": 1.7075, + "step": 28907 + }, + { + "epoch": 8.87292817679558, + "grad_norm": 0.1447865515947342, + "learning_rate": 3.2962633259169817e-06, + "loss": 1.697, + "step": 28908 + }, + { + "epoch": 8.873235113566606, + "grad_norm": 0.16820397973060608, + "learning_rate": 3.2944886812517173e-06, + "loss": 1.7087, + "step": 28909 + }, + { + "epoch": 8.87354205033763, + "grad_norm": 0.12102416902780533, + "learning_rate": 3.2927144981577007e-06, + "loss": 1.6655, + "step": 28910 + }, + { + "epoch": 8.873848987108655, + "grad_norm": 0.17087550461292267, + "learning_rate": 3.290940776652446e-06, + "loss": 1.7518, + "step": 28911 + }, + { + "epoch": 8.87415592387968, + "grad_norm": 
0.15695004165172577, + "learning_rate": 3.2891675167535054e-06, + "loss": 1.6848, + "step": 28912 + }, + { + "epoch": 8.874462860650706, + "grad_norm": 0.16303250193595886, + "learning_rate": 3.2873947184783705e-06, + "loss": 1.7705, + "step": 28913 + }, + { + "epoch": 8.874769797421731, + "grad_norm": 0.1679360568523407, + "learning_rate": 3.2856223818445885e-06, + "loss": 1.6923, + "step": 28914 + }, + { + "epoch": 8.875076734192756, + "grad_norm": 0.1721598356962204, + "learning_rate": 3.283850506869668e-06, + "loss": 1.7164, + "step": 28915 + }, + { + "epoch": 8.875383670963782, + "grad_norm": 0.14126230776309967, + "learning_rate": 3.2820790935711223e-06, + "loss": 1.6794, + "step": 28916 + }, + { + "epoch": 8.875690607734807, + "grad_norm": 0.14232057332992554, + "learning_rate": 3.2803081419664484e-06, + "loss": 1.6844, + "step": 28917 + }, + { + "epoch": 8.875997544505832, + "grad_norm": 0.15812624990940094, + "learning_rate": 3.278537652073149e-06, + "loss": 1.6951, + "step": 28918 + }, + { + "epoch": 8.876304481276858, + "grad_norm": 0.15904119610786438, + "learning_rate": 3.276767623908733e-06, + "loss": 1.6761, + "step": 28919 + }, + { + "epoch": 8.876611418047883, + "grad_norm": 0.18227824568748474, + "learning_rate": 3.2749980574906803e-06, + "loss": 1.7714, + "step": 28920 + }, + { + "epoch": 8.876918354818907, + "grad_norm": 0.1715840995311737, + "learning_rate": 3.2732289528364766e-06, + "loss": 1.7491, + "step": 28921 + }, + { + "epoch": 8.877225291589932, + "grad_norm": 0.15899239480495453, + "learning_rate": 3.2714603099636256e-06, + "loss": 1.7188, + "step": 28922 + }, + { + "epoch": 8.877532228360957, + "grad_norm": 0.14183032512664795, + "learning_rate": 3.269692128889584e-06, + "loss": 1.71, + "step": 28923 + }, + { + "epoch": 8.877839165131983, + "grad_norm": 0.145817831158638, + "learning_rate": 3.2679244096318396e-06, + "loss": 1.7475, + "step": 28924 + }, + { + "epoch": 8.878146101903008, + "grad_norm": 0.20818611979484558, + 
"learning_rate": 3.2661571522078493e-06, + "loss": 1.7292, + "step": 28925 + }, + { + "epoch": 8.878453038674033, + "grad_norm": 0.18658684194087982, + "learning_rate": 3.264390356635083e-06, + "loss": 1.7588, + "step": 28926 + }, + { + "epoch": 8.878759975445059, + "grad_norm": 0.14851678907871246, + "learning_rate": 3.2626240229310214e-06, + "loss": 1.7177, + "step": 28927 + }, + { + "epoch": 8.879066912216084, + "grad_norm": 0.14433394372463226, + "learning_rate": 3.260858151113083e-06, + "loss": 1.7033, + "step": 28928 + }, + { + "epoch": 8.87937384898711, + "grad_norm": 0.18791940808296204, + "learning_rate": 3.2590927411987547e-06, + "loss": 1.7142, + "step": 28929 + }, + { + "epoch": 8.879680785758135, + "grad_norm": 0.15765266120433807, + "learning_rate": 3.2573277932054504e-06, + "loss": 1.7294, + "step": 28930 + }, + { + "epoch": 8.879987722529158, + "grad_norm": 0.17016790807247162, + "learning_rate": 3.255563307150644e-06, + "loss": 1.7263, + "step": 28931 + }, + { + "epoch": 8.880294659300183, + "grad_norm": 0.18677684664726257, + "learning_rate": 3.2537992830517505e-06, + "loss": 1.708, + "step": 28932 + }, + { + "epoch": 8.880601596071209, + "grad_norm": 0.13736851513385773, + "learning_rate": 3.2520357209262165e-06, + "loss": 1.6971, + "step": 28933 + }, + { + "epoch": 8.880908532842234, + "grad_norm": 0.15366335213184357, + "learning_rate": 3.250272620791467e-06, + "loss": 1.7093, + "step": 28934 + }, + { + "epoch": 8.88121546961326, + "grad_norm": 0.15538384020328522, + "learning_rate": 3.248509982664921e-06, + "loss": 1.7036, + "step": 28935 + }, + { + "epoch": 8.881522406384285, + "grad_norm": 0.137898787856102, + "learning_rate": 3.2467478065639988e-06, + "loss": 1.6654, + "step": 28936 + }, + { + "epoch": 8.88182934315531, + "grad_norm": 0.15095695853233337, + "learning_rate": 3.244986092506125e-06, + "loss": 1.736, + "step": 28937 + }, + { + "epoch": 8.882136279926335, + "grad_norm": 0.15554696321487427, + "learning_rate": 
3.2432248405086908e-06, + "loss": 1.7172, + "step": 28938 + }, + { + "epoch": 8.88244321669736, + "grad_norm": 0.18302778899669647, + "learning_rate": 3.241464050589127e-06, + "loss": 1.7441, + "step": 28939 + }, + { + "epoch": 8.882750153468386, + "grad_norm": 0.18259480595588684, + "learning_rate": 3.2397037227648142e-06, + "loss": 1.6983, + "step": 28940 + }, + { + "epoch": 8.883057090239411, + "grad_norm": 0.14723163843154907, + "learning_rate": 3.2379438570531608e-06, + "loss": 1.7007, + "step": 28941 + }, + { + "epoch": 8.883364027010435, + "grad_norm": 0.1403069794178009, + "learning_rate": 3.2361844534715524e-06, + "loss": 1.6545, + "step": 28942 + }, + { + "epoch": 8.88367096378146, + "grad_norm": 0.1433728039264679, + "learning_rate": 3.2344255120373644e-06, + "loss": 1.6977, + "step": 28943 + }, + { + "epoch": 8.883977900552486, + "grad_norm": 0.18680740892887115, + "learning_rate": 3.2326670327680165e-06, + "loss": 1.756, + "step": 28944 + }, + { + "epoch": 8.884284837323511, + "grad_norm": 0.13080160319805145, + "learning_rate": 3.2309090156808498e-06, + "loss": 1.703, + "step": 28945 + }, + { + "epoch": 8.884591774094536, + "grad_norm": 0.126779243350029, + "learning_rate": 3.2291514607932616e-06, + "loss": 1.6717, + "step": 28946 + }, + { + "epoch": 8.884898710865562, + "grad_norm": 0.15787595510482788, + "learning_rate": 3.2273943681225992e-06, + "loss": 1.7005, + "step": 28947 + }, + { + "epoch": 8.885205647636587, + "grad_norm": 0.13189679384231567, + "learning_rate": 3.225637737686249e-06, + "loss": 1.6599, + "step": 28948 + }, + { + "epoch": 8.885512584407612, + "grad_norm": 0.13954944908618927, + "learning_rate": 3.2238815695015635e-06, + "loss": 1.7261, + "step": 28949 + }, + { + "epoch": 8.885819521178638, + "grad_norm": 0.2115267813205719, + "learning_rate": 3.2221258635858897e-06, + "loss": 1.7459, + "step": 28950 + }, + { + "epoch": 8.886126457949663, + "grad_norm": 0.15017318725585938, + "learning_rate": 3.220370619956592e-06, + "loss": 
1.6929, + "step": 28951 + }, + { + "epoch": 8.886433394720687, + "grad_norm": 0.16980741918087006, + "learning_rate": 3.218615838631006e-06, + "loss": 1.802, + "step": 28952 + }, + { + "epoch": 8.886740331491712, + "grad_norm": 0.1366024613380432, + "learning_rate": 3.216861519626485e-06, + "loss": 1.6886, + "step": 28953 + }, + { + "epoch": 8.887047268262737, + "grad_norm": 0.16248583793640137, + "learning_rate": 3.2151076629603537e-06, + "loss": 1.6992, + "step": 28954 + }, + { + "epoch": 8.887354205033763, + "grad_norm": 0.1727447360754013, + "learning_rate": 3.213354268649943e-06, + "loss": 1.7412, + "step": 28955 + }, + { + "epoch": 8.887661141804788, + "grad_norm": 0.12872622907161713, + "learning_rate": 3.2116013367125996e-06, + "loss": 1.641, + "step": 28956 + }, + { + "epoch": 8.887968078575813, + "grad_norm": 0.12361441552639008, + "learning_rate": 3.2098488671656323e-06, + "loss": 1.6764, + "step": 28957 + }, + { + "epoch": 8.888275015346839, + "grad_norm": 0.1612539142370224, + "learning_rate": 3.2080968600263604e-06, + "loss": 1.6646, + "step": 28958 + }, + { + "epoch": 8.888581952117864, + "grad_norm": 0.15859587490558624, + "learning_rate": 3.2063453153121035e-06, + "loss": 1.6981, + "step": 28959 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.12860243022441864, + "learning_rate": 3.204594233040159e-06, + "loss": 1.6645, + "step": 28960 + }, + { + "epoch": 8.889195825659915, + "grad_norm": 0.232563316822052, + "learning_rate": 3.202843613227857e-06, + "loss": 1.6965, + "step": 28961 + }, + { + "epoch": 8.88950276243094, + "grad_norm": 0.15783043205738068, + "learning_rate": 3.2010934558924676e-06, + "loss": 1.7294, + "step": 28962 + }, + { + "epoch": 8.889809699201965, + "grad_norm": 0.13369722664356232, + "learning_rate": 3.199343761051321e-06, + "loss": 1.6778, + "step": 28963 + }, + { + "epoch": 8.890116635972989, + "grad_norm": 0.14463269710540771, + "learning_rate": 3.1975945287216756e-06, + "loss": 1.7211, + "step": 28964 + }, + { + 
"epoch": 8.890423572744014, + "grad_norm": 0.22744107246398926, + "learning_rate": 3.1958457589208346e-06, + "loss": 1.7234, + "step": 28965 + }, + { + "epoch": 8.89073050951504, + "grad_norm": 0.17402450740337372, + "learning_rate": 3.1940974516660836e-06, + "loss": 1.7355, + "step": 28966 + }, + { + "epoch": 8.891037446286065, + "grad_norm": 0.14022772014141083, + "learning_rate": 3.1923496069746927e-06, + "loss": 1.7029, + "step": 28967 + }, + { + "epoch": 8.89134438305709, + "grad_norm": 0.18977795541286469, + "learning_rate": 3.1906022248639368e-06, + "loss": 1.7213, + "step": 28968 + }, + { + "epoch": 8.891651319828116, + "grad_norm": 0.11371618509292603, + "learning_rate": 3.1888553053510905e-06, + "loss": 1.6521, + "step": 28969 + }, + { + "epoch": 8.89195825659914, + "grad_norm": 0.16720212996006012, + "learning_rate": 3.1871088484534073e-06, + "loss": 1.7186, + "step": 28970 + }, + { + "epoch": 8.892265193370166, + "grad_norm": 0.1317000538110733, + "learning_rate": 3.1853628541881563e-06, + "loss": 1.6905, + "step": 28971 + }, + { + "epoch": 8.892572130141192, + "grad_norm": 0.15759915113449097, + "learning_rate": 3.1836173225725797e-06, + "loss": 1.7293, + "step": 28972 + }, + { + "epoch": 8.892879066912217, + "grad_norm": 0.1597949117422104, + "learning_rate": 3.181872253623952e-06, + "loss": 1.6696, + "step": 28973 + }, + { + "epoch": 8.89318600368324, + "grad_norm": 0.12234945595264435, + "learning_rate": 3.1801276473594934e-06, + "loss": 1.7154, + "step": 28974 + }, + { + "epoch": 8.893492940454266, + "grad_norm": 0.12929682433605194, + "learning_rate": 3.1783835037964616e-06, + "loss": 1.7071, + "step": 28975 + }, + { + "epoch": 8.893799877225291, + "grad_norm": 0.1875714361667633, + "learning_rate": 3.176639822952082e-06, + "loss": 1.7708, + "step": 28976 + }, + { + "epoch": 8.894106813996316, + "grad_norm": 0.13817653059959412, + "learning_rate": 3.1748966048435858e-06, + "loss": 1.6894, + "step": 28977 + }, + { + "epoch": 8.894413750767342, + 
"grad_norm": 0.16731882095336914, + "learning_rate": 3.1731538494882198e-06, + "loss": 1.7706, + "step": 28978 + }, + { + "epoch": 8.894720687538367, + "grad_norm": 0.16811375319957733, + "learning_rate": 3.171411556903181e-06, + "loss": 1.7372, + "step": 28979 + }, + { + "epoch": 8.895027624309392, + "grad_norm": 0.11702638864517212, + "learning_rate": 3.1696697271057117e-06, + "loss": 1.6523, + "step": 28980 + }, + { + "epoch": 8.895334561080418, + "grad_norm": 0.12287343293428421, + "learning_rate": 3.1679283601130037e-06, + "loss": 1.6938, + "step": 28981 + }, + { + "epoch": 8.895641497851443, + "grad_norm": 0.10473133623600006, + "learning_rate": 3.166187455942282e-06, + "loss": 1.6731, + "step": 28982 + }, + { + "epoch": 8.895948434622468, + "grad_norm": 0.13022342324256897, + "learning_rate": 3.164447014610744e-06, + "loss": 1.679, + "step": 28983 + }, + { + "epoch": 8.896255371393494, + "grad_norm": 0.16077135503292084, + "learning_rate": 3.1627070361355925e-06, + "loss": 1.7466, + "step": 28984 + }, + { + "epoch": 8.896562308164517, + "grad_norm": 0.14103242754936218, + "learning_rate": 3.160967520534025e-06, + "loss": 1.6936, + "step": 28985 + }, + { + "epoch": 8.896869244935543, + "grad_norm": 0.12953349947929382, + "learning_rate": 3.1592284678232277e-06, + "loss": 1.7125, + "step": 28986 + }, + { + "epoch": 8.897176181706568, + "grad_norm": 0.11083797365427017, + "learning_rate": 3.157489878020392e-06, + "loss": 1.6455, + "step": 28987 + }, + { + "epoch": 8.897483118477593, + "grad_norm": 0.12037435173988342, + "learning_rate": 3.1557517511426936e-06, + "loss": 1.6569, + "step": 28988 + }, + { + "epoch": 8.897790055248619, + "grad_norm": 0.17309941351413727, + "learning_rate": 3.154014087207302e-06, + "loss": 1.7142, + "step": 28989 + }, + { + "epoch": 8.898096992019644, + "grad_norm": 0.15349642932415009, + "learning_rate": 3.15227688623142e-06, + "loss": 1.7375, + "step": 28990 + }, + { + "epoch": 8.89840392879067, + "grad_norm": 0.175978422164917, + 
"learning_rate": 3.1505401482321896e-06, + "loss": 1.7023, + "step": 28991 + }, + { + "epoch": 8.898710865561695, + "grad_norm": 0.13710327446460724, + "learning_rate": 3.14880387322678e-06, + "loss": 1.6462, + "step": 28992 + }, + { + "epoch": 8.89901780233272, + "grad_norm": 0.11777636408805847, + "learning_rate": 3.14706806123235e-06, + "loss": 1.6187, + "step": 28993 + }, + { + "epoch": 8.899324739103745, + "grad_norm": 0.1707836240530014, + "learning_rate": 3.145332712266047e-06, + "loss": 1.7314, + "step": 28994 + }, + { + "epoch": 8.899631675874769, + "grad_norm": 0.15286721289157867, + "learning_rate": 3.143597826345046e-06, + "loss": 1.6874, + "step": 28995 + }, + { + "epoch": 8.899938612645794, + "grad_norm": 0.1401689052581787, + "learning_rate": 3.141863403486456e-06, + "loss": 1.6795, + "step": 28996 + }, + { + "epoch": 8.90024554941682, + "grad_norm": 0.13194917142391205, + "learning_rate": 3.1401294437074512e-06, + "loss": 1.6967, + "step": 28997 + }, + { + "epoch": 8.900552486187845, + "grad_norm": 0.1518833339214325, + "learning_rate": 3.1383959470251413e-06, + "loss": 1.6914, + "step": 28998 + }, + { + "epoch": 8.90085942295887, + "grad_norm": 0.12354082614183426, + "learning_rate": 3.1366629134566727e-06, + "loss": 1.6809, + "step": 28999 + }, + { + "epoch": 8.901166359729896, + "grad_norm": 0.2156827449798584, + "learning_rate": 3.1349303430191712e-06, + "loss": 1.7617, + "step": 29000 + }, + { + "epoch": 8.901473296500921, + "grad_norm": 0.15934047102928162, + "learning_rate": 3.133198235729756e-06, + "loss": 1.7443, + "step": 29001 + }, + { + "epoch": 8.901780233271946, + "grad_norm": 0.13422276079654694, + "learning_rate": 3.1314665916055473e-06, + "loss": 1.7238, + "step": 29002 + }, + { + "epoch": 8.902087170042972, + "grad_norm": 0.1727958619594574, + "learning_rate": 3.1297354106636535e-06, + "loss": 1.7208, + "step": 29003 + }, + { + "epoch": 8.902394106813997, + "grad_norm": 0.14110971987247467, + "learning_rate": 
3.1280046929211827e-06, + "loss": 1.6586, + "step": 29004 + }, + { + "epoch": 8.902701043585022, + "grad_norm": 0.1527067869901657, + "learning_rate": 3.126274438395249e-06, + "loss": 1.6908, + "step": 29005 + }, + { + "epoch": 8.903007980356048, + "grad_norm": 0.1663844734430313, + "learning_rate": 3.1245446471029392e-06, + "loss": 1.7263, + "step": 29006 + }, + { + "epoch": 8.903314917127071, + "grad_norm": 0.23200902342796326, + "learning_rate": 3.1228153190613563e-06, + "loss": 1.7564, + "step": 29007 + }, + { + "epoch": 8.903621853898096, + "grad_norm": 0.1557004153728485, + "learning_rate": 3.1210864542875917e-06, + "loss": 1.721, + "step": 29008 + }, + { + "epoch": 8.903928790669122, + "grad_norm": 0.1682535856962204, + "learning_rate": 3.1193580527987208e-06, + "loss": 1.7244, + "step": 29009 + }, + { + "epoch": 8.904235727440147, + "grad_norm": 0.17813025414943695, + "learning_rate": 3.117630114611836e-06, + "loss": 1.6873, + "step": 29010 + }, + { + "epoch": 8.904542664211172, + "grad_norm": 0.16720467805862427, + "learning_rate": 3.1159026397440007e-06, + "loss": 1.7588, + "step": 29011 + }, + { + "epoch": 8.904849600982198, + "grad_norm": 0.12350224703550339, + "learning_rate": 3.114175628212307e-06, + "loss": 1.6641, + "step": 29012 + }, + { + "epoch": 8.905156537753223, + "grad_norm": 0.16594655811786652, + "learning_rate": 3.112449080033797e-06, + "loss": 1.6896, + "step": 29013 + }, + { + "epoch": 8.905463474524248, + "grad_norm": 0.11925587058067322, + "learning_rate": 3.110722995225562e-06, + "loss": 1.6751, + "step": 29014 + }, + { + "epoch": 8.905770411295274, + "grad_norm": 0.15165284276008606, + "learning_rate": 3.108997373804634e-06, + "loss": 1.6983, + "step": 29015 + }, + { + "epoch": 8.9060773480663, + "grad_norm": 0.1934432089328766, + "learning_rate": 3.107272215788082e-06, + "loss": 1.6972, + "step": 29016 + }, + { + "epoch": 8.906384284837323, + "grad_norm": 0.1574355512857437, + "learning_rate": 3.1055475211929474e-06, + "loss": 
1.751, + "step": 29017 + }, + { + "epoch": 8.906691221608348, + "grad_norm": 0.17686793208122253, + "learning_rate": 3.1038232900362787e-06, + "loss": 1.7705, + "step": 29018 + }, + { + "epoch": 8.906998158379373, + "grad_norm": 0.20089837908744812, + "learning_rate": 3.102099522335117e-06, + "loss": 1.8083, + "step": 29019 + }, + { + "epoch": 8.907305095150399, + "grad_norm": 0.1398555189371109, + "learning_rate": 3.1003762181064986e-06, + "loss": 1.7181, + "step": 29020 + }, + { + "epoch": 8.907612031921424, + "grad_norm": 0.14177222549915314, + "learning_rate": 3.09865337736745e-06, + "loss": 1.671, + "step": 29021 + }, + { + "epoch": 8.90791896869245, + "grad_norm": 0.17582249641418457, + "learning_rate": 3.0969310001349948e-06, + "loss": 1.7112, + "step": 29022 + }, + { + "epoch": 8.908225905463475, + "grad_norm": 0.16887766122817993, + "learning_rate": 3.0952090864261594e-06, + "loss": 1.7281, + "step": 29023 + }, + { + "epoch": 8.9085328422345, + "grad_norm": 0.1768682301044464, + "learning_rate": 3.093487636257958e-06, + "loss": 1.6584, + "step": 29024 + }, + { + "epoch": 8.908839779005525, + "grad_norm": 0.15997330844402313, + "learning_rate": 3.0917666496474095e-06, + "loss": 1.7051, + "step": 29025 + }, + { + "epoch": 8.90914671577655, + "grad_norm": 0.16596661508083344, + "learning_rate": 3.0900461266115124e-06, + "loss": 1.6899, + "step": 29026 + }, + { + "epoch": 8.909453652547576, + "grad_norm": 0.1477203071117401, + "learning_rate": 3.088326067167274e-06, + "loss": 1.6982, + "step": 29027 + }, + { + "epoch": 8.9097605893186, + "grad_norm": 0.170956552028656, + "learning_rate": 3.086606471331699e-06, + "loss": 1.6561, + "step": 29028 + }, + { + "epoch": 8.910067526089625, + "grad_norm": 0.1777859330177307, + "learning_rate": 3.0848873391217727e-06, + "loss": 1.7638, + "step": 29029 + }, + { + "epoch": 8.91037446286065, + "grad_norm": 0.20077209174633026, + "learning_rate": 3.083168670554476e-06, + "loss": 1.7588, + "step": 29030 + }, + { + "epoch": 
8.910681399631676, + "grad_norm": 0.15471714735031128, + "learning_rate": 3.0814504656468234e-06, + "loss": 1.682, + "step": 29031 + }, + { + "epoch": 8.910988336402701, + "grad_norm": 0.1711329072713852, + "learning_rate": 3.0797327244157624e-06, + "loss": 1.6883, + "step": 29032 + }, + { + "epoch": 8.911295273173726, + "grad_norm": 0.11440590023994446, + "learning_rate": 3.0780154468782905e-06, + "loss": 1.6861, + "step": 29033 + }, + { + "epoch": 8.911602209944752, + "grad_norm": 0.15305832028388977, + "learning_rate": 3.0762986330513722e-06, + "loss": 1.7208, + "step": 29034 + }, + { + "epoch": 8.911909146715777, + "grad_norm": 0.13767275214195251, + "learning_rate": 3.0745822829519766e-06, + "loss": 1.7319, + "step": 29035 + }, + { + "epoch": 8.912216083486802, + "grad_norm": 0.15172621607780457, + "learning_rate": 3.0728663965970573e-06, + "loss": 1.7003, + "step": 29036 + }, + { + "epoch": 8.912523020257828, + "grad_norm": 0.16932672262191772, + "learning_rate": 3.071150974003578e-06, + "loss": 1.709, + "step": 29037 + }, + { + "epoch": 8.912829957028851, + "grad_norm": 0.13176152110099792, + "learning_rate": 3.069436015188493e-06, + "loss": 1.6714, + "step": 29038 + }, + { + "epoch": 8.913136893799877, + "grad_norm": 0.17337891459465027, + "learning_rate": 3.067721520168748e-06, + "loss": 1.7786, + "step": 29039 + }, + { + "epoch": 8.913443830570902, + "grad_norm": 0.12546442449092865, + "learning_rate": 3.0660074889612867e-06, + "loss": 1.7219, + "step": 29040 + }, + { + "epoch": 8.913750767341927, + "grad_norm": 0.21087953448295593, + "learning_rate": 3.0642939215830444e-06, + "loss": 1.7541, + "step": 29041 + }, + { + "epoch": 8.914057704112953, + "grad_norm": 0.16880549490451813, + "learning_rate": 3.062580818050964e-06, + "loss": 1.7299, + "step": 29042 + }, + { + "epoch": 8.914364640883978, + "grad_norm": 0.15600517392158508, + "learning_rate": 3.0608681783819705e-06, + "loss": 1.6801, + "step": 29043 + }, + { + "epoch": 8.914671577655003, + 
"grad_norm": 0.11458457261323929, + "learning_rate": 3.059156002592989e-06, + "loss": 1.6393, + "step": 29044 + }, + { + "epoch": 8.914978514426029, + "grad_norm": 0.15529881417751312, + "learning_rate": 3.0574442907009393e-06, + "loss": 1.7288, + "step": 29045 + }, + { + "epoch": 8.915285451197054, + "grad_norm": 0.15211673080921173, + "learning_rate": 3.0557330427227415e-06, + "loss": 1.6784, + "step": 29046 + }, + { + "epoch": 8.91559238796808, + "grad_norm": 0.13714905083179474, + "learning_rate": 3.054022258675293e-06, + "loss": 1.7047, + "step": 29047 + }, + { + "epoch": 8.915899324739105, + "grad_norm": 0.1595524698495865, + "learning_rate": 3.0523119385755304e-06, + "loss": 1.722, + "step": 29048 + }, + { + "epoch": 8.91620626151013, + "grad_norm": 0.16744185984134674, + "learning_rate": 3.0506020824403235e-06, + "loss": 1.6754, + "step": 29049 + }, + { + "epoch": 8.916513198281153, + "grad_norm": 0.13333237171173096, + "learning_rate": 3.048892690286598e-06, + "loss": 1.7332, + "step": 29050 + }, + { + "epoch": 8.916820135052179, + "grad_norm": 0.19067470729351044, + "learning_rate": 3.0471837621312228e-06, + "loss": 1.7034, + "step": 29051 + }, + { + "epoch": 8.917127071823204, + "grad_norm": 0.1292569637298584, + "learning_rate": 3.0454752979911018e-06, + "loss": 1.652, + "step": 29052 + }, + { + "epoch": 8.91743400859423, + "grad_norm": 0.15452222526073456, + "learning_rate": 3.0437672978831155e-06, + "loss": 1.7183, + "step": 29053 + }, + { + "epoch": 8.917740945365255, + "grad_norm": 0.16528162360191345, + "learning_rate": 3.04205976182414e-06, + "loss": 1.7099, + "step": 29054 + }, + { + "epoch": 8.91804788213628, + "grad_norm": 0.22729776799678802, + "learning_rate": 3.0403526898310553e-06, + "loss": 1.7353, + "step": 29055 + }, + { + "epoch": 8.918354818907305, + "grad_norm": 0.134805828332901, + "learning_rate": 3.038646081920732e-06, + "loss": 1.6975, + "step": 29056 + }, + { + "epoch": 8.91866175567833, + "grad_norm": 0.15781652927398682, + 
"learning_rate": 3.0369399381100282e-06, + "loss": 1.7197, + "step": 29057 + }, + { + "epoch": 8.918968692449356, + "grad_norm": 0.19794493913650513, + "learning_rate": 3.0352342584158146e-06, + "loss": 1.6894, + "step": 29058 + }, + { + "epoch": 8.919275629220381, + "grad_norm": 0.14306722581386566, + "learning_rate": 3.033529042854938e-06, + "loss": 1.6885, + "step": 29059 + }, + { + "epoch": 8.919582565991405, + "grad_norm": 0.1341150999069214, + "learning_rate": 3.0318242914442574e-06, + "loss": 1.7154, + "step": 29060 + }, + { + "epoch": 8.91988950276243, + "grad_norm": 0.2001344859600067, + "learning_rate": 3.0301200042006208e-06, + "loss": 1.7537, + "step": 29061 + }, + { + "epoch": 8.920196439533456, + "grad_norm": 0.22544899582862854, + "learning_rate": 3.028416181140864e-06, + "loss": 1.7656, + "step": 29062 + }, + { + "epoch": 8.920503376304481, + "grad_norm": 0.13061828911304474, + "learning_rate": 3.0267128222818298e-06, + "loss": 1.6929, + "step": 29063 + }, + { + "epoch": 8.920810313075506, + "grad_norm": 0.19021448493003845, + "learning_rate": 3.025009927640349e-06, + "loss": 1.7858, + "step": 29064 + }, + { + "epoch": 8.921117249846532, + "grad_norm": 0.15748682618141174, + "learning_rate": 3.023307497233263e-06, + "loss": 1.6983, + "step": 29065 + }, + { + "epoch": 8.921424186617557, + "grad_norm": 0.20138932764530182, + "learning_rate": 3.0216055310773704e-06, + "loss": 1.7891, + "step": 29066 + }, + { + "epoch": 8.921731123388582, + "grad_norm": 0.11930065602064133, + "learning_rate": 3.0199040291895242e-06, + "loss": 1.6733, + "step": 29067 + }, + { + "epoch": 8.922038060159608, + "grad_norm": 0.17451462149620056, + "learning_rate": 3.0182029915865107e-06, + "loss": 1.717, + "step": 29068 + }, + { + "epoch": 8.922344996930633, + "grad_norm": 0.13890404999256134, + "learning_rate": 3.0165024182851553e-06, + "loss": 1.6821, + "step": 29069 + }, + { + "epoch": 8.922651933701658, + "grad_norm": 0.15502439439296722, + "learning_rate": 
3.0148023093022613e-06, + "loss": 1.6746, + "step": 29070 + }, + { + "epoch": 8.922958870472682, + "grad_norm": 0.14066965878009796, + "learning_rate": 3.013102664654627e-06, + "loss": 1.6979, + "step": 29071 + }, + { + "epoch": 8.923265807243707, + "grad_norm": 0.15466643869876862, + "learning_rate": 3.01140348435906e-06, + "loss": 1.7306, + "step": 29072 + }, + { + "epoch": 8.923572744014733, + "grad_norm": 0.15576320886611938, + "learning_rate": 3.0097047684323363e-06, + "loss": 1.7241, + "step": 29073 + }, + { + "epoch": 8.923879680785758, + "grad_norm": 0.15748077630996704, + "learning_rate": 3.008006516891254e-06, + "loss": 1.7053, + "step": 29074 + }, + { + "epoch": 8.924186617556783, + "grad_norm": 0.19139769673347473, + "learning_rate": 3.0063087297525995e-06, + "loss": 1.7361, + "step": 29075 + }, + { + "epoch": 8.924493554327809, + "grad_norm": 0.12561291456222534, + "learning_rate": 3.0046114070331423e-06, + "loss": 1.6982, + "step": 29076 + }, + { + "epoch": 8.924800491098834, + "grad_norm": 0.140936940908432, + "learning_rate": 3.002914548749658e-06, + "loss": 1.66, + "step": 29077 + }, + { + "epoch": 8.92510742786986, + "grad_norm": 0.19634532928466797, + "learning_rate": 3.001218154918922e-06, + "loss": 1.6947, + "step": 29078 + }, + { + "epoch": 8.925414364640885, + "grad_norm": 0.1971811205148697, + "learning_rate": 2.999522225557694e-06, + "loss": 1.7133, + "step": 29079 + }, + { + "epoch": 8.92572130141191, + "grad_norm": 0.15782490372657776, + "learning_rate": 2.9978267606827314e-06, + "loss": 1.6724, + "step": 29080 + }, + { + "epoch": 8.926028238182933, + "grad_norm": 0.1563064008951187, + "learning_rate": 2.9961317603107887e-06, + "loss": 1.7942, + "step": 29081 + }, + { + "epoch": 8.926335174953959, + "grad_norm": 0.1192200556397438, + "learning_rate": 2.994437224458635e-06, + "loss": 1.6736, + "step": 29082 + }, + { + "epoch": 8.926642111724984, + "grad_norm": 0.14355097711086273, + "learning_rate": 2.9927431531429905e-06, + "loss": 
1.6968, + "step": 29083 + }, + { + "epoch": 8.92694904849601, + "grad_norm": 0.17257769405841827, + "learning_rate": 2.9910495463806255e-06, + "loss": 1.7353, + "step": 29084 + }, + { + "epoch": 8.927255985267035, + "grad_norm": 0.16805051267147064, + "learning_rate": 2.9893564041882484e-06, + "loss": 1.7711, + "step": 29085 + }, + { + "epoch": 8.92756292203806, + "grad_norm": 0.123812235891819, + "learning_rate": 2.9876637265826123e-06, + "loss": 1.6197, + "step": 29086 + }, + { + "epoch": 8.927869858809085, + "grad_norm": 0.38423335552215576, + "learning_rate": 2.985971513580432e-06, + "loss": 1.726, + "step": 29087 + }, + { + "epoch": 8.92817679558011, + "grad_norm": 0.14887484908103943, + "learning_rate": 2.9842797651984443e-06, + "loss": 1.7067, + "step": 29088 + }, + { + "epoch": 8.928483732351136, + "grad_norm": 0.17092695832252502, + "learning_rate": 2.982588481453358e-06, + "loss": 1.6883, + "step": 29089 + }, + { + "epoch": 8.928790669122161, + "grad_norm": 0.1591298133134842, + "learning_rate": 2.9808976623618867e-06, + "loss": 1.7219, + "step": 29090 + }, + { + "epoch": 8.929097605893187, + "grad_norm": 0.17864398658275604, + "learning_rate": 2.979207307940746e-06, + "loss": 1.7378, + "step": 29091 + }, + { + "epoch": 8.92940454266421, + "grad_norm": 0.15053904056549072, + "learning_rate": 2.977517418206638e-06, + "loss": 1.679, + "step": 29092 + }, + { + "epoch": 8.929711479435236, + "grad_norm": 0.15586422383785248, + "learning_rate": 2.975827993176267e-06, + "loss": 1.7276, + "step": 29093 + }, + { + "epoch": 8.930018416206261, + "grad_norm": 0.13955895602703094, + "learning_rate": 2.9741390328663243e-06, + "loss": 1.6727, + "step": 29094 + }, + { + "epoch": 8.930325352977286, + "grad_norm": 0.15469470620155334, + "learning_rate": 2.9724505372934973e-06, + "loss": 1.6993, + "step": 29095 + }, + { + "epoch": 8.930632289748312, + "grad_norm": 0.13510502874851227, + "learning_rate": 2.970762506474484e-06, + "loss": 1.6991, + "step": 29096 + }, + { + 
"epoch": 8.930939226519337, + "grad_norm": 0.13071557879447937, + "learning_rate": 2.9690749404259587e-06, + "loss": 1.6787, + "step": 29097 + }, + { + "epoch": 8.931246163290362, + "grad_norm": 0.13370119035243988, + "learning_rate": 2.9673878391645927e-06, + "loss": 1.6966, + "step": 29098 + }, + { + "epoch": 8.931553100061388, + "grad_norm": 0.21600082516670227, + "learning_rate": 2.9657012027070774e-06, + "loss": 1.7137, + "step": 29099 + }, + { + "epoch": 8.931860036832413, + "grad_norm": 0.17746025323867798, + "learning_rate": 2.964015031070061e-06, + "loss": 1.7406, + "step": 29100 + }, + { + "epoch": 8.932166973603438, + "grad_norm": 0.1861608922481537, + "learning_rate": 2.96232932427023e-06, + "loss": 1.7615, + "step": 29101 + }, + { + "epoch": 8.932473910374462, + "grad_norm": 0.128297820687294, + "learning_rate": 2.9606440823242155e-06, + "loss": 1.6525, + "step": 29102 + }, + { + "epoch": 8.932780847145487, + "grad_norm": 0.1617307960987091, + "learning_rate": 2.958959305248693e-06, + "loss": 1.6735, + "step": 29103 + }, + { + "epoch": 8.933087783916513, + "grad_norm": 0.1898767054080963, + "learning_rate": 2.9572749930603107e-06, + "loss": 1.7426, + "step": 29104 + }, + { + "epoch": 8.933394720687538, + "grad_norm": 0.14279016852378845, + "learning_rate": 2.955591145775705e-06, + "loss": 1.6855, + "step": 29105 + }, + { + "epoch": 8.933701657458563, + "grad_norm": 0.15879136323928833, + "learning_rate": 2.953907763411523e-06, + "loss": 1.6833, + "step": 29106 + }, + { + "epoch": 8.934008594229589, + "grad_norm": 0.14285622537136078, + "learning_rate": 2.9522248459843972e-06, + "loss": 1.6821, + "step": 29107 + }, + { + "epoch": 8.934315531000614, + "grad_norm": 0.1237918958067894, + "learning_rate": 2.950542393510963e-06, + "loss": 1.6676, + "step": 29108 + }, + { + "epoch": 8.93462246777164, + "grad_norm": 0.16011624038219452, + "learning_rate": 2.9488604060078473e-06, + "loss": 1.6881, + "step": 29109 + }, + { + "epoch": 8.934929404542665, + 
"grad_norm": 0.19365482032299042, + "learning_rate": 2.9471788834916692e-06, + "loss": 1.6895, + "step": 29110 + }, + { + "epoch": 8.93523634131369, + "grad_norm": 0.1855025440454483, + "learning_rate": 2.9454978259790435e-06, + "loss": 1.7745, + "step": 29111 + }, + { + "epoch": 8.935543278084715, + "grad_norm": 0.1319892704486847, + "learning_rate": 2.9438172334865898e-06, + "loss": 1.6836, + "step": 29112 + }, + { + "epoch": 8.93585021485574, + "grad_norm": 0.19831378757953644, + "learning_rate": 2.942137106030918e-06, + "loss": 1.7398, + "step": 29113 + }, + { + "epoch": 8.936157151626764, + "grad_norm": 0.16073055565357208, + "learning_rate": 2.9404574436286246e-06, + "loss": 1.6617, + "step": 29114 + }, + { + "epoch": 8.93646408839779, + "grad_norm": 0.19067524373531342, + "learning_rate": 2.938778246296309e-06, + "loss": 1.7244, + "step": 29115 + }, + { + "epoch": 8.936771025168815, + "grad_norm": 0.13316050171852112, + "learning_rate": 2.9370995140505843e-06, + "loss": 1.6371, + "step": 29116 + }, + { + "epoch": 8.93707796193984, + "grad_norm": 0.19948840141296387, + "learning_rate": 2.9354212469080156e-06, + "loss": 1.7279, + "step": 29117 + }, + { + "epoch": 8.937384898710865, + "grad_norm": 0.15221990644931793, + "learning_rate": 2.933743444885206e-06, + "loss": 1.7516, + "step": 29118 + }, + { + "epoch": 8.93769183548189, + "grad_norm": 0.15257437527179718, + "learning_rate": 2.932066107998721e-06, + "loss": 1.7471, + "step": 29119 + }, + { + "epoch": 8.937998772252916, + "grad_norm": 0.1491934210062027, + "learning_rate": 2.930389236265152e-06, + "loss": 1.6896, + "step": 29120 + }, + { + "epoch": 8.938305709023942, + "grad_norm": 0.12303795665502548, + "learning_rate": 2.928712829701069e-06, + "loss": 1.6793, + "step": 29121 + }, + { + "epoch": 8.938612645794967, + "grad_norm": 0.09865713864564896, + "learning_rate": 2.9270368883230313e-06, + "loss": 1.6063, + "step": 29122 + }, + { + "epoch": 8.938919582565992, + "grad_norm": 0.1656254678964615, + 
"learning_rate": 2.9253614121476037e-06, + "loss": 1.7507, + "step": 29123 + }, + { + "epoch": 8.939226519337016, + "grad_norm": 0.11997068673372269, + "learning_rate": 2.9236864011913445e-06, + "loss": 1.6393, + "step": 29124 + }, + { + "epoch": 8.939533456108041, + "grad_norm": 0.16391901671886444, + "learning_rate": 2.922011855470813e-06, + "loss": 1.6926, + "step": 29125 + }, + { + "epoch": 8.939840392879066, + "grad_norm": 0.1461794674396515, + "learning_rate": 2.920337775002552e-06, + "loss": 1.7243, + "step": 29126 + }, + { + "epoch": 8.940147329650092, + "grad_norm": 0.12928323447704315, + "learning_rate": 2.918664159803108e-06, + "loss": 1.6457, + "step": 29127 + }, + { + "epoch": 8.940454266421117, + "grad_norm": 0.16596664488315582, + "learning_rate": 2.9169910098890196e-06, + "loss": 1.6878, + "step": 29128 + }, + { + "epoch": 8.940761203192142, + "grad_norm": 0.1567634493112564, + "learning_rate": 2.9153183252768224e-06, + "loss": 1.6947, + "step": 29129 + }, + { + "epoch": 8.941068139963168, + "grad_norm": 0.1472834199666977, + "learning_rate": 2.9136461059830476e-06, + "loss": 1.6707, + "step": 29130 + }, + { + "epoch": 8.941375076734193, + "grad_norm": 0.1658584028482437, + "learning_rate": 2.9119743520242217e-06, + "loss": 1.7321, + "step": 29131 + }, + { + "epoch": 8.941682013505218, + "grad_norm": 0.20524124801158905, + "learning_rate": 2.9103030634168525e-06, + "loss": 1.7065, + "step": 29132 + }, + { + "epoch": 8.941988950276244, + "grad_norm": 0.16881074011325836, + "learning_rate": 2.908632240177489e-06, + "loss": 1.7052, + "step": 29133 + }, + { + "epoch": 8.942295887047269, + "grad_norm": 0.15819382667541504, + "learning_rate": 2.906961882322601e-06, + "loss": 1.7388, + "step": 29134 + }, + { + "epoch": 8.942602823818293, + "grad_norm": 0.13994456827640533, + "learning_rate": 2.905291989868736e-06, + "loss": 1.6932, + "step": 29135 + }, + { + "epoch": 8.942909760589318, + "grad_norm": 0.18177597224712372, + "learning_rate": 
2.9036225628323644e-06, + "loss": 1.707, + "step": 29136 + }, + { + "epoch": 8.943216697360343, + "grad_norm": 0.14273816347122192, + "learning_rate": 2.9019536012300063e-06, + "loss": 1.6902, + "step": 29137 + }, + { + "epoch": 8.943523634131369, + "grad_norm": 0.2221340835094452, + "learning_rate": 2.9002851050781486e-06, + "loss": 1.7369, + "step": 29138 + }, + { + "epoch": 8.943830570902394, + "grad_norm": 0.14513340592384338, + "learning_rate": 2.8986170743932782e-06, + "loss": 1.7307, + "step": 29139 + }, + { + "epoch": 8.94413750767342, + "grad_norm": 0.16813357174396515, + "learning_rate": 2.8969495091918763e-06, + "loss": 1.769, + "step": 29140 + }, + { + "epoch": 8.944444444444445, + "grad_norm": 0.15906141698360443, + "learning_rate": 2.895282409490435e-06, + "loss": 1.6929, + "step": 29141 + }, + { + "epoch": 8.94475138121547, + "grad_norm": 0.16236159205436707, + "learning_rate": 2.893615775305419e-06, + "loss": 1.7309, + "step": 29142 + }, + { + "epoch": 8.945058317986495, + "grad_norm": 0.12328501045703888, + "learning_rate": 2.891949606653299e-06, + "loss": 1.7063, + "step": 29143 + }, + { + "epoch": 8.94536525475752, + "grad_norm": 0.15831345319747925, + "learning_rate": 2.89028390355055e-06, + "loss": 1.6602, + "step": 29144 + }, + { + "epoch": 8.945672191528544, + "grad_norm": 0.12445748597383499, + "learning_rate": 2.8886186660136206e-06, + "loss": 1.6565, + "step": 29145 + }, + { + "epoch": 8.94597912829957, + "grad_norm": 0.12890103459358215, + "learning_rate": 2.88695389405898e-06, + "loss": 1.7209, + "step": 29146 + }, + { + "epoch": 8.946286065070595, + "grad_norm": 0.14477044343948364, + "learning_rate": 2.885289587703072e-06, + "loss": 1.6782, + "step": 29147 + }, + { + "epoch": 8.94659300184162, + "grad_norm": 0.12625789642333984, + "learning_rate": 2.8836257469623482e-06, + "loss": 1.6538, + "step": 29148 + }, + { + "epoch": 8.946899938612646, + "grad_norm": 0.16041505336761475, + "learning_rate": 2.8819623718532418e-06, + "loss": 
1.7327, + "step": 29149 + }, + { + "epoch": 8.94720687538367, + "grad_norm": 0.16730013489723206, + "learning_rate": 2.880299462392216e-06, + "loss": 1.7036, + "step": 29150 + }, + { + "epoch": 8.947513812154696, + "grad_norm": 0.1525142341852188, + "learning_rate": 2.87863701859567e-06, + "loss": 1.7013, + "step": 29151 + }, + { + "epoch": 8.947820748925722, + "grad_norm": 0.10877451300621033, + "learning_rate": 2.876975040480073e-06, + "loss": 1.6294, + "step": 29152 + }, + { + "epoch": 8.948127685696747, + "grad_norm": 0.11804116517305374, + "learning_rate": 2.875313528061807e-06, + "loss": 1.6885, + "step": 29153 + }, + { + "epoch": 8.948434622467772, + "grad_norm": 0.1718084067106247, + "learning_rate": 2.873652481357325e-06, + "loss": 1.682, + "step": 29154 + }, + { + "epoch": 8.948741559238798, + "grad_norm": 0.1881963163614273, + "learning_rate": 2.871991900383031e-06, + "loss": 1.7851, + "step": 29155 + }, + { + "epoch": 8.949048496009823, + "grad_norm": 0.14475038647651672, + "learning_rate": 2.8703317851553334e-06, + "loss": 1.6933, + "step": 29156 + }, + { + "epoch": 8.949355432780846, + "grad_norm": 0.15759755671024323, + "learning_rate": 2.8686721356906423e-06, + "loss": 1.7322, + "step": 29157 + }, + { + "epoch": 8.949662369551872, + "grad_norm": 0.13722626864910126, + "learning_rate": 2.8670129520053547e-06, + "loss": 1.7027, + "step": 29158 + }, + { + "epoch": 8.949969306322897, + "grad_norm": 0.14574597775936127, + "learning_rate": 2.8653542341158744e-06, + "loss": 1.6934, + "step": 29159 + }, + { + "epoch": 8.950276243093922, + "grad_norm": 0.1554742455482483, + "learning_rate": 2.863695982038589e-06, + "loss": 1.7272, + "step": 29160 + }, + { + "epoch": 8.950583179864948, + "grad_norm": 0.17200839519500732, + "learning_rate": 2.8620381957898845e-06, + "loss": 1.7501, + "step": 29161 + }, + { + "epoch": 8.950890116635973, + "grad_norm": 0.18733108043670654, + "learning_rate": 2.860380875386154e-06, + "loss": 1.8017, + "step": 29162 + }, + { + 
"epoch": 8.951197053406998, + "grad_norm": 0.13730700314044952, + "learning_rate": 2.8587240208437614e-06, + "loss": 1.6831, + "step": 29163 + }, + { + "epoch": 8.951503990178024, + "grad_norm": 0.1442563533782959, + "learning_rate": 2.8570676321790946e-06, + "loss": 1.7231, + "step": 29164 + }, + { + "epoch": 8.95181092694905, + "grad_norm": 0.14817926287651062, + "learning_rate": 2.855411709408512e-06, + "loss": 1.7043, + "step": 29165 + }, + { + "epoch": 8.952117863720074, + "grad_norm": 0.14757658541202545, + "learning_rate": 2.8537562525483787e-06, + "loss": 1.6519, + "step": 29166 + }, + { + "epoch": 8.952424800491098, + "grad_norm": 0.17929381132125854, + "learning_rate": 2.85210126161507e-06, + "loss": 1.7523, + "step": 29167 + }, + { + "epoch": 8.952731737262123, + "grad_norm": 0.13454876840114594, + "learning_rate": 2.850446736624923e-06, + "loss": 1.6921, + "step": 29168 + }, + { + "epoch": 8.953038674033149, + "grad_norm": 0.17734326422214508, + "learning_rate": 2.8487926775943085e-06, + "loss": 1.7082, + "step": 29169 + }, + { + "epoch": 8.953345610804174, + "grad_norm": 0.15544986724853516, + "learning_rate": 2.8471390845395406e-06, + "loss": 1.7067, + "step": 29170 + }, + { + "epoch": 8.9536525475752, + "grad_norm": 0.1256217509508133, + "learning_rate": 2.8454859574769955e-06, + "loss": 1.6546, + "step": 29171 + }, + { + "epoch": 8.953959484346225, + "grad_norm": 0.17201638221740723, + "learning_rate": 2.843833296422993e-06, + "loss": 1.7554, + "step": 29172 + }, + { + "epoch": 8.95426642111725, + "grad_norm": 0.1437663435935974, + "learning_rate": 2.8421811013938703e-06, + "loss": 1.6985, + "step": 29173 + }, + { + "epoch": 8.954573357888275, + "grad_norm": 0.11889111250638962, + "learning_rate": 2.8405293724059532e-06, + "loss": 1.7046, + "step": 29174 + }, + { + "epoch": 8.9548802946593, + "grad_norm": 0.21805889904499054, + "learning_rate": 2.838878109475568e-06, + "loss": 1.7835, + "step": 29175 + }, + { + "epoch": 8.955187231430326, + 
"grad_norm": 0.17459547519683838, + "learning_rate": 2.8372273126190342e-06, + "loss": 1.6986, + "step": 29176 + }, + { + "epoch": 8.955494168201351, + "grad_norm": 0.16686071455478668, + "learning_rate": 2.835576981852656e-06, + "loss": 1.6858, + "step": 29177 + }, + { + "epoch": 8.955801104972375, + "grad_norm": 0.19014745950698853, + "learning_rate": 2.833927117192753e-06, + "loss": 1.742, + "step": 29178 + }, + { + "epoch": 8.9561080417434, + "grad_norm": 0.10640473663806915, + "learning_rate": 2.832277718655629e-06, + "loss": 1.6363, + "step": 29179 + }, + { + "epoch": 8.956414978514426, + "grad_norm": 0.12378805875778198, + "learning_rate": 2.8306287862575777e-06, + "loss": 1.6359, + "step": 29180 + }, + { + "epoch": 8.956721915285451, + "grad_norm": 0.1519845575094223, + "learning_rate": 2.828980320014901e-06, + "loss": 1.7112, + "step": 29181 + }, + { + "epoch": 8.957028852056476, + "grad_norm": 0.1550975888967514, + "learning_rate": 2.827332319943893e-06, + "loss": 1.7417, + "step": 29182 + }, + { + "epoch": 8.957335788827502, + "grad_norm": 0.1387033611536026, + "learning_rate": 2.8256847860608224e-06, + "loss": 1.6567, + "step": 29183 + }, + { + "epoch": 8.957642725598527, + "grad_norm": 0.14006295800209045, + "learning_rate": 2.8240377183820053e-06, + "loss": 1.7156, + "step": 29184 + }, + { + "epoch": 8.957949662369552, + "grad_norm": 0.13202004134655, + "learning_rate": 2.8223911169236782e-06, + "loss": 1.6567, + "step": 29185 + }, + { + "epoch": 8.958256599140578, + "grad_norm": 0.12789477407932281, + "learning_rate": 2.8207449817021505e-06, + "loss": 1.7102, + "step": 29186 + }, + { + "epoch": 8.958563535911603, + "grad_norm": 0.1773017793893814, + "learning_rate": 2.8190993127336583e-06, + "loss": 1.7004, + "step": 29187 + }, + { + "epoch": 8.958870472682626, + "grad_norm": 0.17584890127182007, + "learning_rate": 2.81745411003449e-06, + "loss": 1.7513, + "step": 29188 + }, + { + "epoch": 8.959177409453652, + "grad_norm": 0.1679183840751648, + 
"learning_rate": 2.8158093736208923e-06, + "loss": 1.7319, + "step": 29189 + }, + { + "epoch": 8.959484346224677, + "grad_norm": 0.14683100581169128, + "learning_rate": 2.8141651035091255e-06, + "loss": 1.6594, + "step": 29190 + }, + { + "epoch": 8.959791282995702, + "grad_norm": 0.17727963626384735, + "learning_rate": 2.8125212997154316e-06, + "loss": 1.7577, + "step": 29191 + }, + { + "epoch": 8.960098219766728, + "grad_norm": 0.12865738570690155, + "learning_rate": 2.810877962256059e-06, + "loss": 1.656, + "step": 29192 + }, + { + "epoch": 8.960405156537753, + "grad_norm": 0.15322017669677734, + "learning_rate": 2.80923509114725e-06, + "loss": 1.6994, + "step": 29193 + }, + { + "epoch": 8.960712093308778, + "grad_norm": 0.11874222010374069, + "learning_rate": 2.8075926864052417e-06, + "loss": 1.6514, + "step": 29194 + }, + { + "epoch": 8.961019030079804, + "grad_norm": 0.13674114644527435, + "learning_rate": 2.80595074804626e-06, + "loss": 1.6781, + "step": 29195 + }, + { + "epoch": 8.96132596685083, + "grad_norm": 0.13738766312599182, + "learning_rate": 2.8043092760865364e-06, + "loss": 1.7214, + "step": 29196 + }, + { + "epoch": 8.961632903621854, + "grad_norm": 0.15917620062828064, + "learning_rate": 2.8026682705422914e-06, + "loss": 1.7561, + "step": 29197 + }, + { + "epoch": 8.96193984039288, + "grad_norm": 0.18082000315189362, + "learning_rate": 2.8010277314297395e-06, + "loss": 1.7021, + "step": 29198 + }, + { + "epoch": 8.962246777163905, + "grad_norm": 0.1440226435661316, + "learning_rate": 2.799387658765096e-06, + "loss": 1.6829, + "step": 29199 + }, + { + "epoch": 8.962553713934929, + "grad_norm": 0.18358100950717926, + "learning_rate": 2.7977480525645692e-06, + "loss": 1.7207, + "step": 29200 + }, + { + "epoch": 8.962860650705954, + "grad_norm": 0.12614849209785461, + "learning_rate": 2.796108912844364e-06, + "loss": 1.705, + "step": 29201 + }, + { + "epoch": 8.96316758747698, + "grad_norm": 0.11331766098737717, + "learning_rate": 
2.7944702396206666e-06, + "loss": 1.6343, + "step": 29202 + }, + { + "epoch": 8.963474524248005, + "grad_norm": 0.17110171914100647, + "learning_rate": 2.792832032909698e-06, + "loss": 1.8129, + "step": 29203 + }, + { + "epoch": 8.96378146101903, + "grad_norm": 0.19446058571338654, + "learning_rate": 2.791194292727617e-06, + "loss": 1.7015, + "step": 29204 + }, + { + "epoch": 8.964088397790055, + "grad_norm": 0.17975226044654846, + "learning_rate": 2.789557019090644e-06, + "loss": 1.7408, + "step": 29205 + }, + { + "epoch": 8.96439533456108, + "grad_norm": 0.15492287278175354, + "learning_rate": 2.787920212014922e-06, + "loss": 1.7307, + "step": 29206 + }, + { + "epoch": 8.964702271332106, + "grad_norm": 0.14430275559425354, + "learning_rate": 2.7862838715166485e-06, + "loss": 1.7112, + "step": 29207 + }, + { + "epoch": 8.965009208103131, + "grad_norm": 0.13850049674510956, + "learning_rate": 2.7846479976119944e-06, + "loss": 1.7177, + "step": 29208 + }, + { + "epoch": 8.965316144874157, + "grad_norm": 0.17376014590263367, + "learning_rate": 2.783012590317119e-06, + "loss": 1.7612, + "step": 29209 + }, + { + "epoch": 8.96562308164518, + "grad_norm": 0.13757693767547607, + "learning_rate": 2.7813776496481868e-06, + "loss": 1.7246, + "step": 29210 + }, + { + "epoch": 8.965930018416206, + "grad_norm": 0.17782050371170044, + "learning_rate": 2.7797431756213633e-06, + "loss": 1.7196, + "step": 29211 + }, + { + "epoch": 8.966236955187231, + "grad_norm": 0.14082394540309906, + "learning_rate": 2.7781091682527906e-06, + "loss": 1.7074, + "step": 29212 + }, + { + "epoch": 8.966543891958256, + "grad_norm": 0.2748696506023407, + "learning_rate": 2.7764756275586168e-06, + "loss": 1.819, + "step": 29213 + }, + { + "epoch": 8.966850828729282, + "grad_norm": 0.134973406791687, + "learning_rate": 2.774842553554996e-06, + "loss": 1.6725, + "step": 29214 + }, + { + "epoch": 8.967157765500307, + "grad_norm": 0.15217997133731842, + "learning_rate": 2.7732099462580594e-06, + "loss": 
1.6953, + "step": 29215 + }, + { + "epoch": 8.967464702271332, + "grad_norm": 0.15674369037151337, + "learning_rate": 2.771577805683939e-06, + "loss": 1.7108, + "step": 29216 + }, + { + "epoch": 8.967771639042358, + "grad_norm": 0.13885504007339478, + "learning_rate": 2.769946131848772e-06, + "loss": 1.7106, + "step": 29217 + }, + { + "epoch": 8.968078575813383, + "grad_norm": 0.13795867562294006, + "learning_rate": 2.768314924768678e-06, + "loss": 1.6831, + "step": 29218 + }, + { + "epoch": 8.968385512584408, + "grad_norm": 0.15533487498760223, + "learning_rate": 2.7666841844597724e-06, + "loss": 1.7278, + "step": 29219 + }, + { + "epoch": 8.968692449355434, + "grad_norm": 0.13686540722846985, + "learning_rate": 2.7650539109381867e-06, + "loss": 1.6854, + "step": 29220 + }, + { + "epoch": 8.968999386126457, + "grad_norm": 0.1479746252298355, + "learning_rate": 2.763424104220019e-06, + "loss": 1.7119, + "step": 29221 + }, + { + "epoch": 8.969306322897483, + "grad_norm": 0.12035561352968216, + "learning_rate": 2.7617947643213906e-06, + "loss": 1.6295, + "step": 29222 + }, + { + "epoch": 8.969613259668508, + "grad_norm": 0.12784910202026367, + "learning_rate": 2.7601658912583763e-06, + "loss": 1.6952, + "step": 29223 + }, + { + "epoch": 8.969920196439533, + "grad_norm": 0.14596527814865112, + "learning_rate": 2.7585374850471025e-06, + "loss": 1.7003, + "step": 29224 + }, + { + "epoch": 8.970227133210559, + "grad_norm": 0.17561540007591248, + "learning_rate": 2.7569095457036455e-06, + "loss": 1.7687, + "step": 29225 + }, + { + "epoch": 8.970534069981584, + "grad_norm": 0.17456963658332825, + "learning_rate": 2.7552820732441032e-06, + "loss": 1.6927, + "step": 29226 + }, + { + "epoch": 8.97084100675261, + "grad_norm": 0.15346206724643707, + "learning_rate": 2.7536550676845574e-06, + "loss": 1.7057, + "step": 29227 + }, + { + "epoch": 8.971147943523635, + "grad_norm": 0.113531194627285, + "learning_rate": 2.752028529041073e-06, + "loss": 1.6844, + "step": 29228 + }, + { 
+ "epoch": 8.97145488029466, + "grad_norm": 0.18523596227169037, + "learning_rate": 2.7504024573297426e-06, + "loss": 1.7468, + "step": 29229 + }, + { + "epoch": 8.971761817065685, + "grad_norm": 0.14123110473155975, + "learning_rate": 2.7487768525666313e-06, + "loss": 1.699, + "step": 29230 + }, + { + "epoch": 8.972068753836709, + "grad_norm": 0.17675861716270447, + "learning_rate": 2.747151714767798e-06, + "loss": 1.745, + "step": 29231 + }, + { + "epoch": 8.972375690607734, + "grad_norm": 0.1529264897108078, + "learning_rate": 2.7455270439493085e-06, + "loss": 1.686, + "step": 29232 + }, + { + "epoch": 8.97268262737876, + "grad_norm": 0.14173699915409088, + "learning_rate": 2.743902840127216e-06, + "loss": 1.6717, + "step": 29233 + }, + { + "epoch": 8.972989564149785, + "grad_norm": 0.15535210072994232, + "learning_rate": 2.7422791033175743e-06, + "loss": 1.7433, + "step": 29234 + }, + { + "epoch": 8.97329650092081, + "grad_norm": 0.12831814587116241, + "learning_rate": 2.740655833536432e-06, + "loss": 1.7548, + "step": 29235 + }, + { + "epoch": 8.973603437691835, + "grad_norm": 0.19681085646152496, + "learning_rate": 2.739033030799815e-06, + "loss": 1.7841, + "step": 29236 + }, + { + "epoch": 8.97391037446286, + "grad_norm": 0.1496504247188568, + "learning_rate": 2.737410695123793e-06, + "loss": 1.6646, + "step": 29237 + }, + { + "epoch": 8.974217311233886, + "grad_norm": 0.15000486373901367, + "learning_rate": 2.735788826524366e-06, + "loss": 1.6938, + "step": 29238 + }, + { + "epoch": 8.974524248004911, + "grad_norm": 0.11816641688346863, + "learning_rate": 2.734167425017592e-06, + "loss": 1.6738, + "step": 29239 + }, + { + "epoch": 8.974831184775937, + "grad_norm": 0.12041781097650528, + "learning_rate": 2.7325464906194585e-06, + "loss": 1.6798, + "step": 29240 + }, + { + "epoch": 8.975138121546962, + "grad_norm": 0.1780797690153122, + "learning_rate": 2.7309260233460143e-06, + "loss": 1.7608, + "step": 29241 + }, + { + "epoch": 8.975445058317986, + 
"grad_norm": 0.19122804701328278, + "learning_rate": 2.7293060232132683e-06, + "loss": 1.7706, + "step": 29242 + }, + { + "epoch": 8.975751995089011, + "grad_norm": 0.16770713031291962, + "learning_rate": 2.7276864902372244e-06, + "loss": 1.736, + "step": 29243 + }, + { + "epoch": 8.976058931860036, + "grad_norm": 0.17613980174064636, + "learning_rate": 2.7260674244338922e-06, + "loss": 1.7674, + "step": 29244 + }, + { + "epoch": 8.976365868631062, + "grad_norm": 0.17744678258895874, + "learning_rate": 2.7244488258192648e-06, + "loss": 1.7564, + "step": 29245 + }, + { + "epoch": 8.976672805402087, + "grad_norm": 0.15087327361106873, + "learning_rate": 2.7228306944093394e-06, + "loss": 1.7245, + "step": 29246 + }, + { + "epoch": 8.976979742173112, + "grad_norm": 0.16417519748210907, + "learning_rate": 2.721213030220121e-06, + "loss": 1.7329, + "step": 29247 + }, + { + "epoch": 8.977286678944138, + "grad_norm": 0.15511249005794525, + "learning_rate": 2.7195958332675796e-06, + "loss": 1.6803, + "step": 29248 + }, + { + "epoch": 8.977593615715163, + "grad_norm": 0.18222862482070923, + "learning_rate": 2.7179791035677083e-06, + "loss": 1.7186, + "step": 29249 + }, + { + "epoch": 8.977900552486188, + "grad_norm": 0.16677385568618774, + "learning_rate": 2.716362841136477e-06, + "loss": 1.688, + "step": 29250 + }, + { + "epoch": 8.978207489257214, + "grad_norm": 0.1820213794708252, + "learning_rate": 2.714747045989863e-06, + "loss": 1.7801, + "step": 29251 + }, + { + "epoch": 8.978514426028239, + "grad_norm": 0.1464485377073288, + "learning_rate": 2.7131317181438355e-06, + "loss": 1.6667, + "step": 29252 + }, + { + "epoch": 8.978821362799263, + "grad_norm": 0.13353987038135529, + "learning_rate": 2.711516857614349e-06, + "loss": 1.6492, + "step": 29253 + }, + { + "epoch": 8.979128299570288, + "grad_norm": 0.14857034385204315, + "learning_rate": 2.70990246441738e-06, + "loss": 1.6902, + "step": 29254 + }, + { + "epoch": 8.979435236341313, + "grad_norm": 0.1581316888332367, 
+ "learning_rate": 2.708288538568865e-06, + "loss": 1.7188, + "step": 29255 + }, + { + "epoch": 8.979742173112339, + "grad_norm": 0.1437988132238388, + "learning_rate": 2.7066750800847695e-06, + "loss": 1.6982, + "step": 29256 + }, + { + "epoch": 8.980049109883364, + "grad_norm": 0.15172283351421356, + "learning_rate": 2.705062088981014e-06, + "loss": 1.6898, + "step": 29257 + }, + { + "epoch": 8.98035604665439, + "grad_norm": 0.2507859170436859, + "learning_rate": 2.703449565273569e-06, + "loss": 1.7433, + "step": 29258 + }, + { + "epoch": 8.980662983425415, + "grad_norm": 0.19917117059230804, + "learning_rate": 2.701837508978361e-06, + "loss": 1.7411, + "step": 29259 + }, + { + "epoch": 8.98096992019644, + "grad_norm": 0.17466393113136292, + "learning_rate": 2.7002259201113044e-06, + "loss": 1.712, + "step": 29260 + }, + { + "epoch": 8.981276856967465, + "grad_norm": 0.1595284342765808, + "learning_rate": 2.698614798688348e-06, + "loss": 1.768, + "step": 29261 + }, + { + "epoch": 8.98158379373849, + "grad_norm": 0.1435062289237976, + "learning_rate": 2.6970041447253956e-06, + "loss": 1.6715, + "step": 29262 + }, + { + "epoch": 8.981890730509516, + "grad_norm": 0.16341650485992432, + "learning_rate": 2.695393958238379e-06, + "loss": 1.7563, + "step": 29263 + }, + { + "epoch": 8.98219766728054, + "grad_norm": 0.1981598138809204, + "learning_rate": 2.6937842392432023e-06, + "loss": 1.744, + "step": 29264 + }, + { + "epoch": 8.982504604051565, + "grad_norm": 0.1611155867576599, + "learning_rate": 2.6921749877557802e-06, + "loss": 1.6874, + "step": 29265 + }, + { + "epoch": 8.98281154082259, + "grad_norm": 0.17430151998996735, + "learning_rate": 2.690566203792011e-06, + "loss": 1.7338, + "step": 29266 + }, + { + "epoch": 8.983118477593615, + "grad_norm": 0.13210003077983856, + "learning_rate": 2.688957887367799e-06, + "loss": 1.7221, + "step": 29267 + }, + { + "epoch": 8.98342541436464, + "grad_norm": 0.167892724275589, + "learning_rate": 2.6873500384990313e-06, + 
"loss": 1.6985, + "step": 29268 + }, + { + "epoch": 8.983732351135666, + "grad_norm": 0.1600649207830429, + "learning_rate": 2.685742657201601e-06, + "loss": 1.7309, + "step": 29269 + }, + { + "epoch": 8.984039287906691, + "grad_norm": 0.1755276322364807, + "learning_rate": 2.6841357434913892e-06, + "loss": 1.7173, + "step": 29270 + }, + { + "epoch": 8.984346224677717, + "grad_norm": 0.14754937589168549, + "learning_rate": 2.682529297384295e-06, + "loss": 1.6948, + "step": 29271 + }, + { + "epoch": 8.984653161448742, + "grad_norm": 0.1670856773853302, + "learning_rate": 2.6809233188961614e-06, + "loss": 1.7302, + "step": 29272 + }, + { + "epoch": 8.984960098219767, + "grad_norm": 0.18906234204769135, + "learning_rate": 2.6793178080428973e-06, + "loss": 1.7336, + "step": 29273 + }, + { + "epoch": 8.985267034990791, + "grad_norm": 0.17759168148040771, + "learning_rate": 2.6777127648403345e-06, + "loss": 1.762, + "step": 29274 + }, + { + "epoch": 8.985573971761816, + "grad_norm": 0.12218867987394333, + "learning_rate": 2.676108189304355e-06, + "loss": 1.6987, + "step": 29275 + }, + { + "epoch": 8.985880908532842, + "grad_norm": 0.1504579335451126, + "learning_rate": 2.674504081450824e-06, + "loss": 1.6683, + "step": 29276 + }, + { + "epoch": 8.986187845303867, + "grad_norm": 0.15826797485351562, + "learning_rate": 2.6729004412955616e-06, + "loss": 1.7131, + "step": 29277 + }, + { + "epoch": 8.986494782074892, + "grad_norm": 0.12599892914295197, + "learning_rate": 2.671297268854456e-06, + "loss": 1.6603, + "step": 29278 + }, + { + "epoch": 8.986801718845918, + "grad_norm": 0.17663413286209106, + "learning_rate": 2.6696945641433157e-06, + "loss": 1.7231, + "step": 29279 + }, + { + "epoch": 8.987108655616943, + "grad_norm": 0.16194280982017517, + "learning_rate": 2.668092327178001e-06, + "loss": 1.695, + "step": 29280 + }, + { + "epoch": 8.987415592387968, + "grad_norm": 0.1310044527053833, + "learning_rate": 2.6664905579743384e-06, + "loss": 1.6997, + "step": 29281 + }, 
+ { + "epoch": 8.987722529158994, + "grad_norm": 0.18553194403648376, + "learning_rate": 2.6648892565481587e-06, + "loss": 1.7594, + "step": 29282 + }, + { + "epoch": 8.988029465930019, + "grad_norm": 0.17653048038482666, + "learning_rate": 2.6632884229152887e-06, + "loss": 1.7687, + "step": 29283 + }, + { + "epoch": 8.988336402701044, + "grad_norm": 0.14085285365581512, + "learning_rate": 2.661688057091549e-06, + "loss": 1.6875, + "step": 29284 + }, + { + "epoch": 8.988643339472068, + "grad_norm": 0.14821402728557587, + "learning_rate": 2.6600881590927553e-06, + "loss": 1.7579, + "step": 29285 + }, + { + "epoch": 8.988950276243093, + "grad_norm": 0.16718199849128723, + "learning_rate": 2.658488728934716e-06, + "loss": 1.7093, + "step": 29286 + }, + { + "epoch": 8.989257213014119, + "grad_norm": 0.16012485325336456, + "learning_rate": 2.6568897666332303e-06, + "loss": 1.6937, + "step": 29287 + }, + { + "epoch": 8.989564149785144, + "grad_norm": 0.186227485537529, + "learning_rate": 2.655291272204119e-06, + "loss": 1.6682, + "step": 29288 + }, + { + "epoch": 8.98987108655617, + "grad_norm": 0.15328755974769592, + "learning_rate": 2.653693245663158e-06, + "loss": 1.7221, + "step": 29289 + }, + { + "epoch": 8.990178023327195, + "grad_norm": 0.11358486860990524, + "learning_rate": 2.6520956870261684e-06, + "loss": 1.6721, + "step": 29290 + }, + { + "epoch": 8.99048496009822, + "grad_norm": 0.16672687232494354, + "learning_rate": 2.6504985963089035e-06, + "loss": 1.7192, + "step": 29291 + }, + { + "epoch": 8.990791896869245, + "grad_norm": 0.13929708302021027, + "learning_rate": 2.6489019735271734e-06, + "loss": 1.69, + "step": 29292 + }, + { + "epoch": 8.99109883364027, + "grad_norm": 0.1592891961336136, + "learning_rate": 2.647305818696749e-06, + "loss": 1.6943, + "step": 29293 + }, + { + "epoch": 8.991405770411296, + "grad_norm": 0.1534394770860672, + "learning_rate": 2.6457101318333957e-06, + "loss": 1.6993, + "step": 29294 + }, + { + "epoch": 8.99171270718232, + 
"grad_norm": 0.17096973955631256, + "learning_rate": 2.6441149129529e-06, + "loss": 1.7627, + "step": 29295 + }, + { + "epoch": 8.992019643953345, + "grad_norm": 0.13695703446865082, + "learning_rate": 2.642520162071005e-06, + "loss": 1.7047, + "step": 29296 + }, + { + "epoch": 8.99232658072437, + "grad_norm": 0.13649116456508636, + "learning_rate": 2.6409258792034873e-06, + "loss": 1.6666, + "step": 29297 + }, + { + "epoch": 8.992633517495396, + "grad_norm": 0.13003148138523102, + "learning_rate": 2.639332064366096e-06, + "loss": 1.6862, + "step": 29298 + }, + { + "epoch": 8.99294045426642, + "grad_norm": 0.1290612667798996, + "learning_rate": 2.6377387175745894e-06, + "loss": 1.703, + "step": 29299 + }, + { + "epoch": 8.993247391037446, + "grad_norm": 0.14106552302837372, + "learning_rate": 2.636145838844706e-06, + "loss": 1.6771, + "step": 29300 + }, + { + "epoch": 8.993554327808472, + "grad_norm": 0.13510754704475403, + "learning_rate": 2.6345534281921937e-06, + "loss": 1.6569, + "step": 29301 + }, + { + "epoch": 8.993861264579497, + "grad_norm": 0.11940879374742508, + "learning_rate": 2.632961485632779e-06, + "loss": 1.6719, + "step": 29302 + }, + { + "epoch": 8.994168201350522, + "grad_norm": 0.22212430834770203, + "learning_rate": 2.6313700111822104e-06, + "loss": 1.7285, + "step": 29303 + }, + { + "epoch": 8.994475138121548, + "grad_norm": 0.144329234957695, + "learning_rate": 2.629779004856192e-06, + "loss": 1.6928, + "step": 29304 + }, + { + "epoch": 8.994782074892573, + "grad_norm": 0.14428433775901794, + "learning_rate": 2.6281884666704837e-06, + "loss": 1.7371, + "step": 29305 + }, + { + "epoch": 8.995089011663598, + "grad_norm": 0.12600816786289215, + "learning_rate": 2.6265983966407615e-06, + "loss": 1.6803, + "step": 29306 + }, + { + "epoch": 8.995395948434622, + "grad_norm": 0.14739328622817993, + "learning_rate": 2.6250087947827793e-06, + "loss": 1.7135, + "step": 29307 + }, + { + "epoch": 8.995702885205647, + "grad_norm": 0.14694075286388397, + 
"learning_rate": 2.623419661112209e-06, + "loss": 1.7161, + "step": 29308 + }, + { + "epoch": 8.996009821976672, + "grad_norm": 0.1703605204820633, + "learning_rate": 2.6218309956447864e-06, + "loss": 1.7415, + "step": 29309 + }, + { + "epoch": 8.996316758747698, + "grad_norm": 0.1334623247385025, + "learning_rate": 2.6202427983961996e-06, + "loss": 1.7227, + "step": 29310 + }, + { + "epoch": 8.996623695518723, + "grad_norm": 0.16613437235355377, + "learning_rate": 2.6186550693821364e-06, + "loss": 1.6925, + "step": 29311 + }, + { + "epoch": 8.996930632289748, + "grad_norm": 0.12817926704883575, + "learning_rate": 2.617067808618301e-06, + "loss": 1.6296, + "step": 29312 + }, + { + "epoch": 8.997237569060774, + "grad_norm": 0.13783088326454163, + "learning_rate": 2.6154810161203693e-06, + "loss": 1.6801, + "step": 29313 + }, + { + "epoch": 8.997544505831799, + "grad_norm": 0.19866502285003662, + "learning_rate": 2.6138946919040285e-06, + "loss": 1.7817, + "step": 29314 + }, + { + "epoch": 8.997851442602824, + "grad_norm": 0.12466265261173248, + "learning_rate": 2.61230883598495e-06, + "loss": 1.7001, + "step": 29315 + }, + { + "epoch": 8.99815837937385, + "grad_norm": 0.13250842690467834, + "learning_rate": 2.6107234483788158e-06, + "loss": 1.6932, + "step": 29316 + }, + { + "epoch": 8.998465316144873, + "grad_norm": 0.13475441932678223, + "learning_rate": 2.6091385291012904e-06, + "loss": 1.6906, + "step": 29317 + }, + { + "epoch": 8.998772252915899, + "grad_norm": 0.14250501990318298, + "learning_rate": 2.6075540781680284e-06, + "loss": 1.7032, + "step": 29318 + }, + { + "epoch": 8.999079189686924, + "grad_norm": 0.11724159866571426, + "learning_rate": 2.6059700955947007e-06, + "loss": 1.6319, + "step": 29319 + }, + { + "epoch": 8.99938612645795, + "grad_norm": 0.15192265808582306, + "learning_rate": 2.6043865813969505e-06, + "loss": 1.699, + "step": 29320 + }, + { + "epoch": 8.999693063228975, + "grad_norm": 0.14814937114715576, + "learning_rate": 
2.6028035355904257e-06, + "loss": 1.7313, + "step": 29321 + }, + { + "epoch": 9.0, + "grad_norm": 0.20881028473377228, + "learning_rate": 2.6012209581907922e-06, + "loss": 1.8009, + "step": 29322 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0919321470413958e+21, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-29322/training_args.bin b/checkpoint-29322/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..b6af1535a0e2165d1952d5e6c3ab94ece29b3d2e --- /dev/null +++ b/checkpoint-29322/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c7985017961b4019da284385f3d0f4b95f69682383f9cf12749177ae67aa87 +size 7288 diff --git a/checkpoint-29322/zero_to_fp32.py b/checkpoint-29322/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-29322/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-3258/config.json b/checkpoint-3258/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-3258/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-3258/generation_config.json b/checkpoint-3258/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-3258/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-3258/latest b/checkpoint-3258/latest new file mode 100644 index 0000000000000000000000000000000000000000..814e3a3a52e84f61a04817734a1aa11078fd4850 --- /dev/null +++ b/checkpoint-3258/latest @@ -0,0 +1 @@ +global_step3258 \ No newline at end of file diff --git a/checkpoint-3258/model-00001-of-00003.safetensors b/checkpoint-3258/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..09c4f247ada6e402e513ec3a63958b284b782388 --- /dev/null +++ b/checkpoint-3258/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f389cf4740c032fe4e526435ea12b05a159853d5a93dfebe57681d4e8905c56 +size 4955415870 diff --git a/checkpoint-3258/model-00002-of-00003.safetensors b/checkpoint-3258/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-3258/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d +size 
4971563008 diff --git a/checkpoint-3258/model-00003-of-00003.safetensors b/checkpoint-3258/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b5158a4d021bdee42ea01f9a84b219a5eb23bad --- /dev/null +++ b/checkpoint-3258/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa18dab168e0cf382c842c34ce4f00fab62f403acbf86a537a41dd15c6808fab +size 4180840856 diff --git a/checkpoint-3258/model.safetensors.index.json b/checkpoint-3258/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-3258/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-3258/rng_state_0.pth b/checkpoint-3258/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a212218f2987b607dc9750dd231bb0237312cf1 --- /dev/null +++ b/checkpoint-3258/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55cd7bf9a6c9aef4aa6855ef3d981b77405ceaddc2328cf5c4c1993c08776fd8 +size 15984 diff --git a/checkpoint-3258/rng_state_1.pth b/checkpoint-3258/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ebf86d7d58d02218ecf2aab42300f06a5960da4 --- /dev/null +++ b/checkpoint-3258/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b37d12fb8d08477723f6520aefe182d99f9be19d5be74326ccd1df2b5526865 +size 15984 diff --git a/checkpoint-3258/rng_state_10.pth b/checkpoint-3258/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..15448d406d7ab50049dd5cfc92ebd47297546f91 --- /dev/null +++ b/checkpoint-3258/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:661ce124e7e24ef7544cc49adf510557d147af1f24c00640fa604bfad0a193e8 +size 15997 diff --git a/checkpoint-3258/rng_state_11.pth b/checkpoint-3258/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..4b3a885079f66d9be09663344c6eacc9297dc57f --- /dev/null +++ b/checkpoint-3258/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e2ed6f4f72f3d41836830f7bf578b8dbb53a9f3a3081538c64224451a550594 +size 15997 diff --git a/checkpoint-3258/rng_state_12.pth b/checkpoint-3258/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..56a783f3184804c60004df2c7f3e6425e8a25dba --- /dev/null +++ b/checkpoint-3258/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db8f6f2505713cbb0a928d0af76cd86c742ab67e7f505b9a61c7c606eb851fb7 +size 15997 diff --git a/checkpoint-3258/rng_state_13.pth b/checkpoint-3258/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..673e2ce2266e61e3713156c1868d2b19e51f3915 --- /dev/null +++ b/checkpoint-3258/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f56fe82e760d14e83f3e52f2a4d8f65d0e6e9a6de8f5602baef319e15ee222cc +size 15997 diff --git a/checkpoint-3258/rng_state_14.pth b/checkpoint-3258/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef1906c679b353b97f2cd956276609744e85d287 --- /dev/null +++ b/checkpoint-3258/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33a715463de7bcc3aa08872035f18f2dfa509934e8a99a9c4303101d23e31df9 +size 15997 diff --git a/checkpoint-3258/rng_state_15.pth b/checkpoint-3258/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..77200b3cc48dc792f19a7a0e6e7eb248707fc32d --- /dev/null +++ b/checkpoint-3258/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b3211bd8e996e7e1d00ba6616aab373430819cf28dd513182829c99541c95637 +size 15997 diff --git a/checkpoint-3258/rng_state_16.pth b/checkpoint-3258/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..aad5bc29705ff131a3ade979b44a376a2a06a4df --- /dev/null +++ b/checkpoint-3258/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f21e030a8605858cfd9f4d33ce3cc2a61e199c64595c710213b1f4162d17da7 +size 15997 diff --git a/checkpoint-3258/rng_state_17.pth b/checkpoint-3258/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..2298514b2f76a71e6d93de570e5846957a360790 --- /dev/null +++ b/checkpoint-3258/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeeeea82f1ad2b2b85a8f5a4d4b7d293cddb4e4454d2c733284413b3d49e08f6 +size 15997 diff --git a/checkpoint-3258/rng_state_18.pth b/checkpoint-3258/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..72ce554c5c848252b8a97a87cb173593bcbf84ae --- /dev/null +++ b/checkpoint-3258/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36fa54a9799646d6e522b4569afb4d911eff4a419332ab4f75c4edcb289ecf1d +size 15997 diff --git a/checkpoint-3258/rng_state_19.pth b/checkpoint-3258/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..1440d4c90a2064029c1fe0dd511070d472468980 --- /dev/null +++ b/checkpoint-3258/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:543dae054f9852cea22015ce7f1d762c70e4bf9af508afd087f8264e39e31e33 +size 15997 diff --git a/checkpoint-3258/rng_state_2.pth b/checkpoint-3258/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7cb11f473ade0e6471a1bd6180d5794a9be4f8e --- /dev/null +++ b/checkpoint-3258/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8c85cb4a49bf743f08f5f725268b55cfea72643fed30c51710ce5b7a82ce8546 +size 15984 diff --git a/checkpoint-3258/rng_state_20.pth b/checkpoint-3258/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..db8fba7622961fad96ce5f8cbc6a1eeb5e6393bb --- /dev/null +++ b/checkpoint-3258/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26d945af7bf8709b2a1c4a2c5fbfddd489f019cc1c40bd62fc635d68552f0025 +size 15997 diff --git a/checkpoint-3258/rng_state_21.pth b/checkpoint-3258/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..13dd5e5767409bd585ef2ae4c4657838ed9c4f91 --- /dev/null +++ b/checkpoint-3258/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f82f2d996c5b8c7efd3980d23092f7e620ab34bf4da53e1dec91a38c7087691 +size 15997 diff --git a/checkpoint-3258/rng_state_22.pth b/checkpoint-3258/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a6b8b360cb0ffb63f25b30ac91ba26ec66fc944 --- /dev/null +++ b/checkpoint-3258/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56181a7e86230ce096cb4684280d0dcd04a62aaf8cc7bac4141055a8c7c417db +size 15997 diff --git a/checkpoint-3258/rng_state_23.pth b/checkpoint-3258/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec84188087b5a9b9914c63a4d3a76388b60490dd --- /dev/null +++ b/checkpoint-3258/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe4eb8204dbbeff7607fce3cce4ae80183b8c33582867be2ee875f243ebc97d5 +size 15997 diff --git a/checkpoint-3258/rng_state_24.pth b/checkpoint-3258/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..730fe0fef3b977a991db8f2ea1ca956be7200447 --- /dev/null +++ b/checkpoint-3258/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4b40c32846be2592cd55346786b020be1d11a586f01af2814a43f9743109067d +size 15997 diff --git a/checkpoint-3258/rng_state_25.pth b/checkpoint-3258/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..839456e1f944595038c70b7a365b90b1a4ed973f --- /dev/null +++ b/checkpoint-3258/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b2684e2520efbf5b65f4862d4b482fc4423b0092486324aaaaf4f35ed6f63cf +size 15997 diff --git a/checkpoint-3258/rng_state_26.pth b/checkpoint-3258/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..b1ee9ec9bd3d7fcdfac1d76b63b41e7a39a91ab6 --- /dev/null +++ b/checkpoint-3258/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7b8e0db9cecf695d7e938afd4cafbc3be9c08bf9cab94779f723ce0b61f27f +size 15997 diff --git a/checkpoint-3258/rng_state_27.pth b/checkpoint-3258/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..abbca3aecb9d6dcc6ac60c2d48f44fdd87287d00 --- /dev/null +++ b/checkpoint-3258/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cab8efd50fe373e1a8964405e1c158fd3db5aeb1838b5135d2db1dab2eda245f +size 15997 diff --git a/checkpoint-3258/rng_state_28.pth b/checkpoint-3258/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..fca3157eb84e28ffa6a5729d91733097161a31a7 --- /dev/null +++ b/checkpoint-3258/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:199562f404fdb785573aba497f259cfc97d41ebba3a4d6b09188d85f51621342 +size 15997 diff --git a/checkpoint-3258/rng_state_29.pth b/checkpoint-3258/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..6713a21f1d2a88b1ddde7f7e4667ab7b1633fa67 --- /dev/null +++ b/checkpoint-3258/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:871ce9c8984bb4112b0866578c8cffc2c535ae199dff498292e50f540db4f89f +size 15997 diff --git a/checkpoint-3258/rng_state_3.pth b/checkpoint-3258/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4c03967a06818a86039a8971592ae2222ef6b9b --- /dev/null +++ b/checkpoint-3258/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c33611df037ba90e4ede4a70cca0ce3abd927d894035fc88be2f021c8b44594 +size 15984 diff --git a/checkpoint-3258/rng_state_30.pth b/checkpoint-3258/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..42fb53a72923294a5861de569ff3ee6aa6563686 --- /dev/null +++ b/checkpoint-3258/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eb93f4adee988933140b0a42b71c044bbef7c5e3bcf32fd9b39cf38f873269b +size 15997 diff --git a/checkpoint-3258/rng_state_31.pth b/checkpoint-3258/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc7a821bfab41848c655be812b5e77f926148cee --- /dev/null +++ b/checkpoint-3258/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8cbb6678317b2ed5995373a59a82005e55e6ecc5024a877ec8b52096dc4e049 +size 15997 diff --git a/checkpoint-3258/rng_state_32.pth b/checkpoint-3258/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..de7cd69a9cbac8a11f2fa90b5984de3e8ec37f32 --- /dev/null +++ b/checkpoint-3258/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:696ba89a9485b4c0872e66e9ab37a7e82ea43a2ef4ae1ddf4d89fea06f505961 +size 15997 diff --git a/checkpoint-3258/rng_state_33.pth b/checkpoint-3258/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..03570d6c203ada5573239a36b83d6177dfe385c6 --- /dev/null +++ b/checkpoint-3258/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b393130f0ca558c304fdac74dceff820f81b78e690efcd7ba23d68777feca95a +size 15997 diff --git a/checkpoint-3258/rng_state_34.pth b/checkpoint-3258/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..676a70af4af49e80a290014ff89c0d8b8c23e4b4 --- /dev/null +++ b/checkpoint-3258/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:324eeee49003b9933194d1bfb0cd74ec5a57a483ae5a4b83a78431f522fc2f2b +size 15997 diff --git a/checkpoint-3258/rng_state_35.pth b/checkpoint-3258/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..627b5ddf85b0ca70f65774ce29694ee5ad42e5a2 --- /dev/null +++ b/checkpoint-3258/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0ffefec8f8a70c7b058d7a78c0a75d6731e15204eff3a85f50e1e221d185bd2 +size 15997 diff --git a/checkpoint-3258/rng_state_36.pth b/checkpoint-3258/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..879c3be772d882232b6ecd7925fe8a5834aa0eda --- /dev/null +++ b/checkpoint-3258/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24fa96b0c6d5cda35192fcbfaa80f28e796da4e1bb34e63b72bca3ac54b15fa1 +size 15997 diff --git a/checkpoint-3258/rng_state_37.pth b/checkpoint-3258/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..746718927ba0627a5cb92a96d87d4ce8b21d9d2a --- /dev/null +++ b/checkpoint-3258/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d27446754666167760ed1341ca6c162c3afdfa9f5c9bb2a66271df770034406 +size 15997 diff --git a/checkpoint-3258/rng_state_38.pth b/checkpoint-3258/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..c3365d425a68f5267c1de60e51360dc493e90ac1 --- /dev/null +++ b/checkpoint-3258/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:383a76f7099918af5f84b63bf8747fd3e3822aa74fb33a4bae8870daa43ea30e +size 15997 diff --git a/checkpoint-3258/rng_state_39.pth b/checkpoint-3258/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..879d1a1b9e674623bb0c96458e133bfe5fa600a5 --- /dev/null +++ b/checkpoint-3258/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05014be3d5b2187c8478cba2f39558185e610e581cff9d9d5e7c853195cb1d96 +size 15997 diff --git a/checkpoint-3258/rng_state_4.pth b/checkpoint-3258/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ca94cdd530f51b8560e7a71c243a761d14a46f6 --- /dev/null +++ b/checkpoint-3258/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:855f375180daf5e1faa4731b7360fd6cc7ef9e0a7b478ac2c1662c54abbc3d6e +size 15984 diff --git a/checkpoint-3258/rng_state_40.pth b/checkpoint-3258/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..1cc015739d412fed1e6311819e8e206ca66ed42b --- /dev/null +++ b/checkpoint-3258/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a6f21f250b52fadea3d29b9e1e23a34b931d85a379ea1921aced3c60f65b5ed +size 15997 diff --git a/checkpoint-3258/rng_state_41.pth b/checkpoint-3258/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9186def431e33cff269e601fc36352c4aabccf5 --- /dev/null +++ b/checkpoint-3258/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdc80b7fcf65decf6a0a3879fb64eade654b4a6ced1983aa9755ca6ad660491a +size 15997 diff --git a/checkpoint-3258/rng_state_42.pth b/checkpoint-3258/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c1da12d5ff73cab321fa604f1f9efbf1aea6ef9 --- /dev/null +++ b/checkpoint-3258/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:aaf8bb2c7bb4065784441a1e57ff63ed48f8ba37e5e5a6b0262baad7ebb13fcc +size 15997 diff --git a/checkpoint-3258/rng_state_43.pth b/checkpoint-3258/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..971b133ad89c2bb5f01cbe15e7fcde95a5dcf89f --- /dev/null +++ b/checkpoint-3258/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca899930bdef7f04f7f78394e8bee7ae44b3b4e04a8036e1814fbc5a6348d9d0 +size 15997 diff --git a/checkpoint-3258/rng_state_44.pth b/checkpoint-3258/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..cdfeeef0d3800dc18cbe395d8f5d2fef3780f8d6 --- /dev/null +++ b/checkpoint-3258/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17fc119bbf286e898054336039fe363f170e39f1109c9d9cf050028bf73adae +size 15997 diff --git a/checkpoint-3258/rng_state_45.pth b/checkpoint-3258/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..20c812917daed01ea1f377e298fbbd2677c347e0 --- /dev/null +++ b/checkpoint-3258/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bd204a06ef2fb5dbe0a3e619af00359b4397bba62e08e7571cec496a8681dd0 +size 15997 diff --git a/checkpoint-3258/rng_state_46.pth b/checkpoint-3258/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fbe8370b4a50d25c1bb3a5db3977985a7203c12 --- /dev/null +++ b/checkpoint-3258/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7171bfeb8b9ba0435a00ea8259a3c0c766931ce8e48722dce44e8d7f3124aa51 +size 15997 diff --git a/checkpoint-3258/rng_state_47.pth b/checkpoint-3258/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..344596fa613e46a7dd208f695291fdea05efe3b5 --- /dev/null +++ b/checkpoint-3258/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a3bf0afa06b4d0a39ea441f46ff09078651d9e595df99b2a8ebce1108d49746f +size 15997 diff --git a/checkpoint-3258/rng_state_48.pth b/checkpoint-3258/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e891081c0542157a1bd2405e7635165d7242599 --- /dev/null +++ b/checkpoint-3258/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31178375ad075cbca93a8c5283b1e710841d30bbbffd3742b49e409d582bbba3 +size 15997 diff --git a/checkpoint-3258/rng_state_49.pth b/checkpoint-3258/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..f59dbc91b7d6c0c6c796415d7ad113043a9dff3b --- /dev/null +++ b/checkpoint-3258/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdf2c68ff1daeab0a31f86d74e084b357f1868b411435c20ad3a61eb407022c0 +size 15997 diff --git a/checkpoint-3258/rng_state_5.pth b/checkpoint-3258/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d7b5a50909029520709e3dc13fb21389275080fb --- /dev/null +++ b/checkpoint-3258/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc6aca3db484d1b58df0ddf3ceb26a412a49782ac1e0f63cd8c8ecc63d5edb1f +size 15984 diff --git a/checkpoint-3258/rng_state_50.pth b/checkpoint-3258/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..08898fb0a1ae0d4886bfb9b6f71e3623c22b30e9 --- /dev/null +++ b/checkpoint-3258/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f12c01ed65a481b2a5189aa349b8ffe26ca4279ecf8e14ebc5c13f529ce0594 +size 15997 diff --git a/checkpoint-3258/rng_state_51.pth b/checkpoint-3258/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e01224458a898f76f02f2d4e9ab897319fb4237 --- /dev/null +++ b/checkpoint-3258/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:58163b35c565ddad0745cb3c1eed54705036137c957489010198cee49aff2598 +size 15997 diff --git a/checkpoint-3258/rng_state_52.pth b/checkpoint-3258/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a3bacece8b61a150472b459f7d0b85de38e9550 --- /dev/null +++ b/checkpoint-3258/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a02e92889c9ed0f64b4becf16476622315d0ee03afc3b313e647455f4b48ba3 +size 15997 diff --git a/checkpoint-3258/rng_state_53.pth b/checkpoint-3258/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..2547ae45cfa090915fbcf0b6e534d43aa482fcbe --- /dev/null +++ b/checkpoint-3258/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b237d10728e2103d592c7112ec2989b5c92b4de000b6edc697c635a0dc7660c +size 15997 diff --git a/checkpoint-3258/rng_state_54.pth b/checkpoint-3258/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..ffc0ca97cee5f30ea766679c0f02460ba8f2f767 --- /dev/null +++ b/checkpoint-3258/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21f30dd3fda8a280701517df99ec871a7ac0b5fd18d8a5ca57e042b58f3787e0 +size 15997 diff --git a/checkpoint-3258/rng_state_55.pth b/checkpoint-3258/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..09c2010a44daaba90f0bbc744d0236d930a5e16e --- /dev/null +++ b/checkpoint-3258/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:393db576d6306baee1c15c48d536a87a6073eea333f03eee2bde6a8cef4b78ad +size 15997 diff --git a/checkpoint-3258/rng_state_56.pth b/checkpoint-3258/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d54bf0f292609fe2328922b190572d3c50341cb --- /dev/null +++ b/checkpoint-3258/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:23c69d05bf14c63b937bbc3632cbedf488c16a4981bf720268b729f0d481e66c +size 15997 diff --git a/checkpoint-3258/rng_state_57.pth b/checkpoint-3258/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..05bebf22252dc75c5577b74f7d589213e1e5a0b8 --- /dev/null +++ b/checkpoint-3258/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:929d28b6a729826d9957fd7b55c67d9639c52035a3a37c8faacb49e6bbf7a610 +size 15997 diff --git a/checkpoint-3258/rng_state_58.pth b/checkpoint-3258/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..134587ce27957bc0adfdf27b8f004c393fdd4dfc --- /dev/null +++ b/checkpoint-3258/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c00e42c6d53f3038c5a8bbf1b04b97d1fbaafb6b8070b3df365d3c5fde7033 +size 15997 diff --git a/checkpoint-3258/rng_state_59.pth b/checkpoint-3258/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..c014bf5062f42741607c6e8a29ff731f428c0f88 --- /dev/null +++ b/checkpoint-3258/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efe7a621bfc48cb55e883996f102f34b8fca4d4b5c7f23656df2f7425e846462 +size 15997 diff --git a/checkpoint-3258/rng_state_6.pth b/checkpoint-3258/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..b1ecd86f4f742f2e1b49811df53ea8e96a32d49c --- /dev/null +++ b/checkpoint-3258/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705f26b8ff84eeb9f6559adbe496d2121179efba08e5fbbe1b5bdba9e88c2276 +size 15984 diff --git a/checkpoint-3258/rng_state_60.pth b/checkpoint-3258/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..426864d3605d7e8590d977d2b8dcec678833e224 --- /dev/null +++ b/checkpoint-3258/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:311cf715949f75ca92132d949d75e88fe37d5e46453d757b162574b46135742f +size 15997 diff --git a/checkpoint-3258/rng_state_61.pth b/checkpoint-3258/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..828a91c1c1f8f16fe5770e022e0658500a97f1f6 --- /dev/null +++ b/checkpoint-3258/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:595db66a814c727da3ce1a3f729ebe614550e53cf4608f4dc5a9e78edc0e1d14 +size 15997 diff --git a/checkpoint-3258/rng_state_62.pth b/checkpoint-3258/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dadca6da5328cde639b3943ad074c2f3dec966d --- /dev/null +++ b/checkpoint-3258/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af926ad2585d0e492c96939c3412d525a55b7159244849eb3bada8e26f87d24e +size 15997 diff --git a/checkpoint-3258/rng_state_63.pth b/checkpoint-3258/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb81d0f1fea6a0dcd09cd4de3af1f8e62834054b --- /dev/null +++ b/checkpoint-3258/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:852cfd820c9599182c4858c85891055f90cf8b23123923616d4c45c14085096e +size 15997 diff --git a/checkpoint-3258/rng_state_7.pth b/checkpoint-3258/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee653dc8bbdf0c8c910285382e38807d733f73f6 --- /dev/null +++ b/checkpoint-3258/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86098c2fa38ae7c832679cfcb9eb58ed1c8c7b8b18f3fb9aece16600e2f0e6d5 +size 15984 diff --git a/checkpoint-3258/rng_state_8.pth b/checkpoint-3258/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..53e172e472362ff6078f59b7730afb430feb618b --- /dev/null +++ b/checkpoint-3258/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6f35b1651b3cc2b7565c4814cf486fb5fe1a407a1d4728132eebad2f24f65511 +size 15984 diff --git a/checkpoint-3258/rng_state_9.pth b/checkpoint-3258/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9323d7c2d97851a4afd886ee811de803c68739b --- /dev/null +++ b/checkpoint-3258/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f563102eecd3a4af2679bbff6041d65c05accdb5b9e6c627fb76c997dc608e91 +size 15984 diff --git a/checkpoint-3258/scheduler.pt b/checkpoint-3258/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..13cf2a1fd6723cc0b4e1b59e62a68f04628a5db1 --- /dev/null +++ b/checkpoint-3258/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0581610e23713e54c036eb38e40211287a39422855dcb6c4cd537970c4a7f93 +size 1064 diff --git a/checkpoint-3258/special_tokens_map.json b/checkpoint-3258/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-3258/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-3258/tokenizer.json b/checkpoint-3258/tokenizer.json new 
file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-3258/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-3258/tokenizer_config.json b/checkpoint-3258/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-3258/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": 
"<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": 
"<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "[IMG]", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-3258/trainer_state.json b/checkpoint-3258/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c3b6e6a9decacb1ecb53bf5297a7ab2c6f2a50ba --- /dev/null +++ b/checkpoint-3258/trainer_state.json @@ -0,0 +1,22840 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + 
"epoch": 1.0, + "eval_steps": 500, + "global_step": 3258, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 0.003990178023327195, + 
"grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 1.2865383625030518, + 
"learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 3.885480572597138e-06, + 
"loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + }, + { + "epoch": 
0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 0.4801484942436218, 
+ "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 7.975460122699386e-06, + 
"loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 92 + }, + { + "epoch": 
0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + "grad_norm": 
0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 0.5069209933280945, + 
"learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 1.3394683026584867e-05, + 
"loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + "step": 145 + }, + { + 
"epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 0.04880294659300184, + 
"grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 1.3880311250686646, + 
"learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2132790998935378e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3258/training_args.bin b/checkpoint-3258/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db7ad91da5423a229826113feb3e9db3ef40c31 --- /dev/null +++ b/checkpoint-3258/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682b697e933b6e2693e5f9af9a0654effab1ca392c8500bf8af0eb089116a263 +size 7288 diff --git a/checkpoint-3258/zero_to_fp32.py b/checkpoint-3258/zero_to_fp32.py new file mode 100644 index 
0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-3258/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-32580/config.json b/checkpoint-32580/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-32580/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-32580/generation_config.json b/checkpoint-32580/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-32580/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-32580/latest b/checkpoint-32580/latest new file mode 100644 index 0000000000000000000000000000000000000000..b0acb80827947565809146811d5e93bc21627d05 --- /dev/null +++ b/checkpoint-32580/latest @@ -0,0 +1 @@ +global_step32580 \ No newline at end of file diff --git a/checkpoint-32580/model-00001-of-00003.safetensors b/checkpoint-32580/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..40e286fc97157dac65404b04dbb26baec8c2e3c2 --- /dev/null +++ b/checkpoint-32580/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9449c8f5827ef7e6fe120565d7012681fa3d7ed957714ca215f7fddbb30ee9 +size 4955415870 diff --git a/checkpoint-32580/model-00002-of-00003.safetensors b/checkpoint-32580/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-32580/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d 
+size 4971563008 diff --git a/checkpoint-32580/model-00003-of-00003.safetensors b/checkpoint-32580/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc65a5dfde6d99312e188a257227571ac71c350f --- /dev/null +++ b/checkpoint-32580/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30b6ea9ccdee0d6276c4086d831ffff162f36df75bd46ab1fe988a95097285b8 +size 4180840856 diff --git a/checkpoint-32580/model.safetensors.index.json b/checkpoint-32580/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-32580/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-32580/rng_state_0.pth b/checkpoint-32580/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2058b856a99c4a27a84743ad08fad7b6d118ffc --- /dev/null +++ b/checkpoint-32580/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:650dae34f734712dda14e9af8569295c405fa8995f92cd8cf8bd8af8324994f6 +size 15984 diff --git a/checkpoint-32580/rng_state_1.pth b/checkpoint-32580/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4af14b9200f31e697990ed58a6b880d63a12feb8 --- /dev/null +++ b/checkpoint-32580/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:437007933276e176436feac6cfe817fdec509c6b0bbe602afc9b34e43ea495a8 +size 15984 diff --git a/checkpoint-32580/rng_state_10.pth b/checkpoint-32580/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6f8202f0364da9564169fa05bcbe87f5761acd1 --- /dev/null +++ b/checkpoint-32580/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:56fb135866da8e05a29939a7907d4ab5f093cc8cb34082225408fe64927148f4 +size 15997 diff --git a/checkpoint-32580/rng_state_11.pth b/checkpoint-32580/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..cca3156a893e2e182832609ac1c1e39796fad9da --- /dev/null +++ b/checkpoint-32580/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:380004e88b79f7ccdcfd4129c48b923fe9b8629b5ca65e56951e33e93acedc7f +size 15997 diff --git a/checkpoint-32580/rng_state_12.pth b/checkpoint-32580/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1bcacc733d30cb5cd045effcede67026352378d --- /dev/null +++ b/checkpoint-32580/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57352df285324db69047c8b32393115e998c8d57677e02d63adb30a701a673f1 +size 15997 diff --git a/checkpoint-32580/rng_state_13.pth b/checkpoint-32580/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..49c77f88378b1acdb6bd8078f61e98bfeb807515 --- /dev/null +++ b/checkpoint-32580/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68d6ea85ac9f0c8481da0004661cfcfb63bc966159d119b47eb15ad42ddd077a +size 15997 diff --git a/checkpoint-32580/rng_state_14.pth b/checkpoint-32580/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..dca0d2350bae9024770397728e01851ca8c7b431 --- /dev/null +++ b/checkpoint-32580/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b35bd074191eba74fbdfca20de9fef7e72e73fbe2ef995cf1468f167bd46b7d +size 15997 diff --git a/checkpoint-32580/rng_state_15.pth b/checkpoint-32580/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..5881b9592b66092433fee6e9d00e6fefa635a581 --- /dev/null +++ b/checkpoint-32580/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:23904256cccb2a394d04dba40003ada5f4532057110f3caab54405414557ecab +size 15997 diff --git a/checkpoint-32580/rng_state_16.pth b/checkpoint-32580/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..6324a3c9b71ad435b94ad7e42150132dcb767d1e --- /dev/null +++ b/checkpoint-32580/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0861d6b6d84e0c514ead2f769fc30df1b5197d6241f809c2b72b245d83876d3 +size 15997 diff --git a/checkpoint-32580/rng_state_17.pth b/checkpoint-32580/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..8cd28dc6fdc7acb289e13b40f27b348a1043cbec --- /dev/null +++ b/checkpoint-32580/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b07b44bc47a5818ed43a5e83c17362294279cd68781b6588fcfdecf948a1cbea +size 15997 diff --git a/checkpoint-32580/rng_state_18.pth b/checkpoint-32580/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..99a26c356bc20398bc00fd9a8c82a1fdcf314382 --- /dev/null +++ b/checkpoint-32580/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3e272bb790c8f3895fb5f6899868a3cdada4df4306cb7d0ebcf1318ee267b22 +size 15997 diff --git a/checkpoint-32580/rng_state_19.pth b/checkpoint-32580/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..544c7c25f88139713ba7e66cd00a004a2193b20e --- /dev/null +++ b/checkpoint-32580/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ae1059ce5b19c349db20c95445b316dd349d052a6336ff0f1fd6482916bfdc4 +size 15997 diff --git a/checkpoint-32580/rng_state_2.pth b/checkpoint-32580/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..94da9e40d15d7911c5bee94dd40aafb0a71b9e89 --- /dev/null +++ b/checkpoint-32580/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f2f0dc5f18c8cea7386ed79a5b515416ce4a479ae5b0094612b331c346b651ee +size 15984 diff --git a/checkpoint-32580/rng_state_20.pth b/checkpoint-32580/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..e1a6d7cc67b2c0104258b0de7a4bd20b784e702a --- /dev/null +++ b/checkpoint-32580/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26efb4f03a3173d7ae4f9dbe21bc9a7d0b37e1f5cf0449f011f269ec7130d1db +size 15997 diff --git a/checkpoint-32580/rng_state_21.pth b/checkpoint-32580/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..1de12d92bf6d14d1fcd8eb0affab92845f820332 --- /dev/null +++ b/checkpoint-32580/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b975e21cb5681f5d4cf6948ecbf96b53dd03724ff9bb5ac500b2863fe80f0f5 +size 15997 diff --git a/checkpoint-32580/rng_state_22.pth b/checkpoint-32580/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..a394316e52937b617c8ba80c4752c40dc050d4b1 --- /dev/null +++ b/checkpoint-32580/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18f986aaba150dc47edfd61c2b44a847f4ca9218b4d5fdc11b86fa7543d3e17 +size 15997 diff --git a/checkpoint-32580/rng_state_23.pth b/checkpoint-32580/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..c304d001257d279d0e9cca682c98777715b09003 --- /dev/null +++ b/checkpoint-32580/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4c6cee344127aeecbc5199f5a5d340a79e5c684364bc874e150b01dba4fea0 +size 15997 diff --git a/checkpoint-32580/rng_state_24.pth b/checkpoint-32580/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..1274a346e75f1d7f95da37ad13d8e75d5875abbd --- /dev/null +++ b/checkpoint-32580/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8bfd1812771e5073a3aef2d5929be1bb9d8c4a8874abb6d6e2587c651de9d9fb +size 15997 diff --git a/checkpoint-32580/rng_state_25.pth b/checkpoint-32580/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f5780bd84e6cb3da68434a3678b14c02f34ce23 --- /dev/null +++ b/checkpoint-32580/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7432699a73fad43df02ddf2ce5e0978de25cf11adfd8fd8dc5fef85c28849183 +size 15997 diff --git a/checkpoint-32580/rng_state_26.pth b/checkpoint-32580/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..34cf03881883f65282d8f39b1988b70691a19f0f --- /dev/null +++ b/checkpoint-32580/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f9986920c4f6657049b86196e1930e5a9f60eb6cc21c57cca69d34bb435af0 +size 15997 diff --git a/checkpoint-32580/rng_state_27.pth b/checkpoint-32580/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d0a68d23f1660c03c02b510809d612370ceb955 --- /dev/null +++ b/checkpoint-32580/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4c3047646c33b3a79c4ae3edbf7158208c6206fa5ac4aee4d3b093ac75e098 +size 15997 diff --git a/checkpoint-32580/rng_state_28.pth b/checkpoint-32580/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..053ea61cbdb2aa303d8ab842cd5fc8e5ba1cc650 --- /dev/null +++ b/checkpoint-32580/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3058758f23c6616e654f827b985cf6f9e1401016bb8eafac73cc637146b5d3c8 +size 15997 diff --git a/checkpoint-32580/rng_state_29.pth b/checkpoint-32580/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..6cef51e3554a194ab31297c8b5d19a007ceaed61 --- /dev/null +++ b/checkpoint-32580/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0b253bcec4a685b6fd58d5b3f40d7748646e0c9da0121691d0581d20cf74a46a +size 15997 diff --git a/checkpoint-32580/rng_state_3.pth b/checkpoint-32580/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3353bc18e02cd5c787bdf332829a8fc174c247cb --- /dev/null +++ b/checkpoint-32580/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e123fa74d83f40079f9a62db048ca9a5d51d41be5cf96ae6e5b6a1a51909adb +size 15984 diff --git a/checkpoint-32580/rng_state_30.pth b/checkpoint-32580/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..93217900ecafc56c84dc02d47e7c4d0384774c3c --- /dev/null +++ b/checkpoint-32580/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4080cfbf06e6305ba49e53844dba51649b6ddf51a6ff3e3707dcc5368f33936e +size 15997 diff --git a/checkpoint-32580/rng_state_31.pth b/checkpoint-32580/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d6175a42cb774971abc12fbc3f53e4131956ebe --- /dev/null +++ b/checkpoint-32580/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3588219fbf3ec52f1cd29821fd7aa935df6be1042a6ca5e386cfd60eb4c795e8 +size 15997 diff --git a/checkpoint-32580/rng_state_32.pth b/checkpoint-32580/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..123453255c36e8011357022d7405434ba6d833fa --- /dev/null +++ b/checkpoint-32580/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49caa27d674f13f8f97041802a98663682aa5724e250e3abcc31f91b6b0448b2 +size 15997 diff --git a/checkpoint-32580/rng_state_33.pth b/checkpoint-32580/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..857b780dbda2b24e5d752113b4015cc89ab3e09c --- /dev/null +++ b/checkpoint-32580/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5519d8413af18de4456b13580c6a8eb1fcd440aa793ce7a514138b281668845f +size 15997 diff --git a/checkpoint-32580/rng_state_34.pth b/checkpoint-32580/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3df1fe1921096c64536ce48e153e20edbafa2f0 --- /dev/null +++ b/checkpoint-32580/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:291e88f25c6bf84b3fa05836013025e5abd75d21133c972461ed82ecb5cfba90 +size 15997 diff --git a/checkpoint-32580/rng_state_35.pth b/checkpoint-32580/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..66c1200bee53fa706ff5c0ba72a55f828e21b0e4 --- /dev/null +++ b/checkpoint-32580/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17aa80d9e6cf41d222b4ee043e9af861a01e0b958f57a7398adfd3575fc14c58 +size 15997 diff --git a/checkpoint-32580/rng_state_36.pth b/checkpoint-32580/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..86cd3996029a7fae6fddb4c682d34c979b69999a --- /dev/null +++ b/checkpoint-32580/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f24cedff1e4fb8125494bbb38685848d708d02b4c06b85c30075046a6851877 +size 15997 diff --git a/checkpoint-32580/rng_state_37.pth b/checkpoint-32580/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..5bd175e70305c9c8923f84e58b00fc9a728ac267 --- /dev/null +++ b/checkpoint-32580/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebff462cd515e335854bd249ebb344e15af234e93266afca974fcf7890500e76 +size 15997 diff --git a/checkpoint-32580/rng_state_38.pth b/checkpoint-32580/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..dbba1b8c17ae4b4c0e67c98572bb7ef3004db32f --- /dev/null +++ b/checkpoint-32580/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c40ac754c94e26f091b6c292df40d16ea4479ad6916593d5621e1a215be43908 +size 15997 diff --git a/checkpoint-32580/rng_state_39.pth b/checkpoint-32580/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..be73811b6676fa2c1ae8da8414b69f7951802009 --- /dev/null +++ b/checkpoint-32580/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78c528543f8cfb84bc1e133f5a6c336fbe714da43d940320f0a757b1531ac841 +size 15997 diff --git a/checkpoint-32580/rng_state_4.pth b/checkpoint-32580/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0a12a2e64d391d34e5b37cb75d8d35f606f5c9cd --- /dev/null +++ b/checkpoint-32580/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d01206027859825f6b1ec22f1c39816cd74d8a6707e4febb86578f08016f7ad +size 15984 diff --git a/checkpoint-32580/rng_state_40.pth b/checkpoint-32580/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f96d8cad66e49aed3a159380a0a8349cd09fd8a --- /dev/null +++ b/checkpoint-32580/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151cb583ae00853b7308458b3759720f1b5fdf1a204f148e09817e7e481e3a30 +size 15997 diff --git a/checkpoint-32580/rng_state_41.pth b/checkpoint-32580/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..d401b7c3e668bff2e1c2e0d6ed3cc0c9f05d3331 --- /dev/null +++ b/checkpoint-32580/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f881eed23b2f4158903133d85057a8e3cfca481a8a25776ce3a4bb9c162857f3 +size 15997 diff --git a/checkpoint-32580/rng_state_42.pth b/checkpoint-32580/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..cb99de61233402e4c74e71f47f3893f61eff5599 --- /dev/null +++ b/checkpoint-32580/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a2ad0cd5183279094d816bad077fdee222c034a296c2628e00e2f4c7160ac95d +size 15997 diff --git a/checkpoint-32580/rng_state_43.pth b/checkpoint-32580/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..58376d0cf0aa3d4cf23efe3e0915b5ee0876f130 --- /dev/null +++ b/checkpoint-32580/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a33e3f5ef6f9ed47df45338ddcb321e4a8f3c76b301ad26133801f3533acfd8 +size 15997 diff --git a/checkpoint-32580/rng_state_44.pth b/checkpoint-32580/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..6fb14d700fbcc64044df524af705d16ab58862e3 --- /dev/null +++ b/checkpoint-32580/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6d599dbd616f540abe7d24814f7b7d6904715fae0de1214f9efdbabcc62047 +size 15997 diff --git a/checkpoint-32580/rng_state_45.pth b/checkpoint-32580/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..697068e58359b0ae9253902d8c500ef83e6f5cd9 --- /dev/null +++ b/checkpoint-32580/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c913f35bc5948b030e4b3d48a01b98356689691d3ad959f5ecd6f033b8e3cf8 +size 15997 diff --git a/checkpoint-32580/rng_state_46.pth b/checkpoint-32580/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..3aa75a7e6ca8826da0f5255b86a74ae952a5c795 --- /dev/null +++ b/checkpoint-32580/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b4a7b16c28ba2911f9d458d2e202116f3cce930d7f7de55bc8fb80d2ce27128 +size 15997 diff --git a/checkpoint-32580/rng_state_47.pth b/checkpoint-32580/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..c3171f856ccdad39ca408def4206122095aa71ed --- /dev/null +++ b/checkpoint-32580/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ed711f9009ba9063a631a660c298264aa7a0d3f92f00909657e9adba39c7f9bd +size 15997 diff --git a/checkpoint-32580/rng_state_48.pth b/checkpoint-32580/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..12e93aef6bb5a69f160eaf13e20521c283445cd2 --- /dev/null +++ b/checkpoint-32580/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af50c0e1b47ce13c8daf84f6859657552b5e01045bfa79654131437fbb3b7fee +size 15997 diff --git a/checkpoint-32580/rng_state_49.pth b/checkpoint-32580/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4a5cf8d2e3e20e30f19a4a6518fad731d402784 --- /dev/null +++ b/checkpoint-32580/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a11c5504e55909e1b76900b650d12f17e0b8c883a7c5730114d13ae20d8f5616 +size 15997 diff --git a/checkpoint-32580/rng_state_5.pth b/checkpoint-32580/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..a5d2f394ccff0c2f768e557e642b77a13fa2e545 --- /dev/null +++ b/checkpoint-32580/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4106bcb50b77907bb13211f6eb0ae7497adbd0793b88731beeaa9d0eabfd278 +size 15984 diff --git a/checkpoint-32580/rng_state_50.pth b/checkpoint-32580/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..013a5acdd3ef5f76315cef33431c7725ffa1e378 --- /dev/null +++ b/checkpoint-32580/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95719ab527ef50afec70656b7a4c8862a795325c27abd2181bb4028a741a7881 +size 15997 diff --git a/checkpoint-32580/rng_state_51.pth b/checkpoint-32580/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..8bb1c68815143195fd558821210368d7c72468df --- /dev/null +++ b/checkpoint-32580/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f408bfb9b4bada1086c9c381981be999d6141a99c6897ac482ada01ee14fa878 +size 15997 diff --git a/checkpoint-32580/rng_state_52.pth b/checkpoint-32580/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..98147bb5f632940116bd283d62320a0c4d49cd44 --- /dev/null +++ b/checkpoint-32580/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d09153b0136b42432e4baefa282d4b7da282971089f6e8df7e7d483e9ee5ca +size 15997 diff --git a/checkpoint-32580/rng_state_53.pth b/checkpoint-32580/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..4b77ed09b2e47bf8be4ee390c548591704b18ee3 --- /dev/null +++ b/checkpoint-32580/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb5a3ba161a783752ddd2791ca0f3da5ab989e2c30e833b4ecd558bb37600480 +size 15997 diff --git a/checkpoint-32580/rng_state_54.pth b/checkpoint-32580/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..7c4df9d1e1e2df5d9e208e46d723e96f263f0841 --- /dev/null +++ b/checkpoint-32580/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd909838c0ccc84d7d1a3ce707a307e7b2aa0e3d7d6e6f51c27b74ddcdf225a2 +size 15997 diff --git a/checkpoint-32580/rng_state_55.pth b/checkpoint-32580/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..28646f4adc43eaaf615dd2debd51f5d23810c965 --- /dev/null +++ b/checkpoint-32580/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d629e6e1af48caf09c06ef49ede190ff9273d8e00b326a1085c4315eab1c706 +size 15997 diff --git a/checkpoint-32580/rng_state_56.pth b/checkpoint-32580/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3bb89ffde54928ef85cf37b226380256b699bb7 --- /dev/null +++ b/checkpoint-32580/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7578b073caaa642e11940babc82d85189cc52ae213b76a75a4f7704e4ef67e37 +size 15997 diff --git a/checkpoint-32580/rng_state_57.pth b/checkpoint-32580/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..98118c0ce072740ff92fe734f54cd82138f8eb78 --- /dev/null +++ b/checkpoint-32580/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e211ab98444154a163231fbc48007314b950342e8c452fe9b4bb41f13dea276 +size 15997 diff --git a/checkpoint-32580/rng_state_58.pth b/checkpoint-32580/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..1fa3e240232eb2bb73a13f8f86722c51acce2cdb --- /dev/null +++ b/checkpoint-32580/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b1997ba7e2222f1447891f3cbb61ce047df0ae0b4e357e992b71ca89a7a974 +size 15997 diff --git a/checkpoint-32580/rng_state_59.pth b/checkpoint-32580/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..eda16a87e870c07bc0af9b6f169077424285c5c8 --- /dev/null +++ b/checkpoint-32580/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab86a073ebda9890291c78e18a5ee167cfddb95bf8afddbd5429e965eb48a995 +size 15997 diff --git a/checkpoint-32580/rng_state_6.pth b/checkpoint-32580/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..79cd09014fe3ee76468b92192ef8257942b37023 --- /dev/null +++ b/checkpoint-32580/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b65ef18a871126e4f0b75660139bd1364eda828803125b1e876d8d114b82d75b +size 15984 diff --git a/checkpoint-32580/rng_state_60.pth b/checkpoint-32580/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3f799a78c8ad293e8f0a0660e083b4fcba68e2c --- /dev/null +++ b/checkpoint-32580/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f3b2f6d3ab169e34952db1395aa10ea2907c33cf4abdad913598d8d6051683fd +size 15997 diff --git a/checkpoint-32580/rng_state_61.pth b/checkpoint-32580/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..3146836de0de6831273207ac619730f5bd2512fb --- /dev/null +++ b/checkpoint-32580/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51f5b2d05c1026bbf2423f268a8bcf2ef93f7a41ddab9af8739a96c96c712054 +size 15997 diff --git a/checkpoint-32580/rng_state_62.pth b/checkpoint-32580/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..204c8a73c2a08d6a8aeda2116ae3d3dbe6a8152d --- /dev/null +++ b/checkpoint-32580/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a351d5ec3fd6fdabf8f88c42c4e572a2a8b3155255f3375cb983c7f8473e9296 +size 15997 diff --git a/checkpoint-32580/rng_state_63.pth b/checkpoint-32580/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c925b989c8215a0bff2d62afef376758e1b870f --- /dev/null +++ b/checkpoint-32580/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e318f3f9ab0b661d2439ed3505f8ea38a7f75542a974f45d9c707aab0393cde2 +size 15997 diff --git a/checkpoint-32580/rng_state_7.pth b/checkpoint-32580/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..93cf5b8000101cbe8ee87f6535cc081e33f32ccd --- /dev/null +++ b/checkpoint-32580/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:745384aa24910c40873f62b870a641d1812815438fa620e13bf9aed45890c0e5 +size 15984 diff --git a/checkpoint-32580/rng_state_8.pth b/checkpoint-32580/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d97f62c4027e7b6eb93ff786adb4f60863bc45e --- /dev/null +++ b/checkpoint-32580/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5570ce7d835fda9ea90674bc33a362dbb77868ef28eab45e24cf74d9adc8844c +size 15984 diff --git a/checkpoint-32580/rng_state_9.pth b/checkpoint-32580/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..23dda87a08ac23bc61109c364c4c05234f207599 --- /dev/null +++ b/checkpoint-32580/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d4e145e65559ae7dcc1df0517e7d55afef354b27c755408dce524e1a98fe500 +size 15984 diff --git a/checkpoint-32580/scheduler.pt b/checkpoint-32580/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e39d199833c6ea39245b4a35e55b77566d53a68 --- /dev/null +++ b/checkpoint-32580/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea6a23bc2a8b491cae02aa96c7eeb19c6ccc01afce3ffa6c698eb293b41caf52 +size 1064 diff --git a/checkpoint-32580/special_tokens_map.json b/checkpoint-32580/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-32580/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-32580/tokenizer.json 
b/checkpoint-32580/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-32580/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-32580/tokenizer_config.json b/checkpoint-32580/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-32580/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": 
"<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": 
"<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": 
"<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": 
"<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128257": { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-32580/trainer_state.json b/checkpoint-32580/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7b9446f0a7c930e8eaeabf10ffb1b02795f1cc7e --- /dev/null +++ b/checkpoint-32580/trainer_state.json @@ -0,0 +1,228094 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": 
null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 32580, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 
0.003990178023327195, + "grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 
1.2865383625030518, + "learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 
3.885480572597138e-06, + "loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + 
}, + { + "epoch": 0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 
0.4801484942436218, + "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 
7.975460122699386e-06, + "loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 
92 + }, + { + "epoch": 0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + 
"grad_norm": 0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 
0.5069209933280945, + "learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 
1.3394683026584867e-05, + "loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + 
"step": 145 + }, + { + "epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 
0.04880294659300184, + "grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 
1.3880311250686646, + "learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + }, + { + "epoch": 3.0003069367710253, + "grad_norm": 0.43365660309791565, + "learning_rate": 8.20720621201063e-05, + "loss": 1.8074, + "step": 9775 + }, + { + "epoch": 3.0006138735420502, + "grad_norm": 0.461374968290329, + "learning_rate": 8.206824868646064e-05, + "loss": 1.9089, + "step": 9776 + }, + { + "epoch": 3.0009208103130756, + "grad_norm": 0.3747929632663727, + "learning_rate": 8.206443493589776e-05, + "loss": 1.8358, + "step": 9777 + }, + { + "epoch": 3.001227747084101, + "grad_norm": 0.28436774015426636, + "learning_rate": 8.206062086845532e-05, + "loss": 1.8527, + "step": 9778 + }, + { + "epoch": 3.001534683855126, + "grad_norm": 0.33642131090164185, + "learning_rate": 
8.205680648417106e-05, + "loss": 1.8142, + "step": 9779 + }, + { + "epoch": 3.001841620626151, + "grad_norm": 0.4283481240272522, + "learning_rate": 8.205299178308263e-05, + "loss": 1.9006, + "step": 9780 + }, + { + "epoch": 3.002148557397176, + "grad_norm": 0.34405630826950073, + "learning_rate": 8.204917676522777e-05, + "loss": 1.7988, + "step": 9781 + }, + { + "epoch": 3.0024554941682013, + "grad_norm": 0.3161070942878723, + "learning_rate": 8.204536143064414e-05, + "loss": 1.8271, + "step": 9782 + }, + { + "epoch": 3.0027624309392267, + "grad_norm": 0.42518749833106995, + "learning_rate": 8.204154577936946e-05, + "loss": 1.864, + "step": 9783 + }, + { + "epoch": 3.0030693677102516, + "grad_norm": 0.3760852813720703, + "learning_rate": 8.203772981144146e-05, + "loss": 1.8543, + "step": 9784 + }, + { + "epoch": 3.003376304481277, + "grad_norm": 0.32794755697250366, + "learning_rate": 8.203391352689784e-05, + "loss": 1.8776, + "step": 9785 + }, + { + "epoch": 3.0036832412523022, + "grad_norm": 0.3053889274597168, + "learning_rate": 8.20300969257763e-05, + "loss": 1.8064, + "step": 9786 + }, + { + "epoch": 3.003990178023327, + "grad_norm": 0.40283143520355225, + "learning_rate": 8.202628000811456e-05, + "loss": 1.8083, + "step": 9787 + }, + { + "epoch": 3.0042971147943525, + "grad_norm": 0.49270665645599365, + "learning_rate": 8.202246277395038e-05, + "loss": 1.802, + "step": 9788 + }, + { + "epoch": 3.0046040515653774, + "grad_norm": 0.4373023211956024, + "learning_rate": 8.201864522332143e-05, + "loss": 1.8429, + "step": 9789 + }, + { + "epoch": 3.0049109883364027, + "grad_norm": 0.3136310875415802, + "learning_rate": 8.201482735626547e-05, + "loss": 1.8224, + "step": 9790 + }, + { + "epoch": 3.005217925107428, + "grad_norm": 0.3306807279586792, + "learning_rate": 8.201100917282023e-05, + "loss": 1.8463, + "step": 9791 + }, + { + "epoch": 3.005524861878453, + "grad_norm": 0.45082196593284607, + "learning_rate": 8.200719067302342e-05, + "loss": 1.7587, + "step": 
9792 + }, + { + "epoch": 3.0058317986494782, + "grad_norm": 0.49246448278427124, + "learning_rate": 8.20033718569128e-05, + "loss": 1.8245, + "step": 9793 + }, + { + "epoch": 3.0061387354205036, + "grad_norm": 0.3040246367454529, + "learning_rate": 8.199955272452609e-05, + "loss": 1.8309, + "step": 9794 + }, + { + "epoch": 3.0064456721915285, + "grad_norm": 0.3909318149089813, + "learning_rate": 8.199573327590105e-05, + "loss": 1.8187, + "step": 9795 + }, + { + "epoch": 3.006752608962554, + "grad_norm": 0.5753183960914612, + "learning_rate": 8.199191351107543e-05, + "loss": 1.826, + "step": 9796 + }, + { + "epoch": 3.0070595457335787, + "grad_norm": 0.48908689618110657, + "learning_rate": 8.198809343008695e-05, + "loss": 1.8475, + "step": 9797 + }, + { + "epoch": 3.007366482504604, + "grad_norm": 0.31570208072662354, + "learning_rate": 8.198427303297341e-05, + "loss": 1.8046, + "step": 9798 + }, + { + "epoch": 3.0076734192756294, + "grad_norm": 0.39205440878868103, + "learning_rate": 8.198045231977251e-05, + "loss": 1.8413, + "step": 9799 + }, + { + "epoch": 3.0079803560466543, + "grad_norm": 0.5117597579956055, + "learning_rate": 8.197663129052204e-05, + "loss": 1.8184, + "step": 9800 + }, + { + "epoch": 3.0082872928176796, + "grad_norm": 0.3623514175415039, + "learning_rate": 8.197280994525978e-05, + "loss": 1.8292, + "step": 9801 + }, + { + "epoch": 3.008594229588705, + "grad_norm": 0.2826726734638214, + "learning_rate": 8.196898828402344e-05, + "loss": 1.8216, + "step": 9802 + }, + { + "epoch": 3.00890116635973, + "grad_norm": 0.38658398389816284, + "learning_rate": 8.196516630685085e-05, + "loss": 1.867, + "step": 9803 + }, + { + "epoch": 3.009208103130755, + "grad_norm": 0.3371698260307312, + "learning_rate": 8.196134401377973e-05, + "loss": 1.8077, + "step": 9804 + }, + { + "epoch": 3.00951503990178, + "grad_norm": 0.24108785390853882, + "learning_rate": 8.195752140484789e-05, + "loss": 1.7858, + "step": 9805 + }, + { + "epoch": 3.0098219766728054, + 
"grad_norm": 0.34410104155540466, + "learning_rate": 8.195369848009309e-05, + "loss": 1.801, + "step": 9806 + }, + { + "epoch": 3.0101289134438307, + "grad_norm": 0.3412116467952728, + "learning_rate": 8.194987523955311e-05, + "loss": 1.7905, + "step": 9807 + }, + { + "epoch": 3.0104358502148556, + "grad_norm": 0.2473030537366867, + "learning_rate": 8.194605168326573e-05, + "loss": 1.7765, + "step": 9808 + }, + { + "epoch": 3.010742786985881, + "grad_norm": 0.28590065240859985, + "learning_rate": 8.194222781126875e-05, + "loss": 1.7897, + "step": 9809 + }, + { + "epoch": 3.0110497237569063, + "grad_norm": 0.2994272708892822, + "learning_rate": 8.193840362359994e-05, + "loss": 1.7976, + "step": 9810 + }, + { + "epoch": 3.011356660527931, + "grad_norm": 0.2971307635307312, + "learning_rate": 8.193457912029713e-05, + "loss": 1.829, + "step": 9811 + }, + { + "epoch": 3.0116635972989565, + "grad_norm": 0.25149810314178467, + "learning_rate": 8.193075430139809e-05, + "loss": 1.7709, + "step": 9812 + }, + { + "epoch": 3.0119705340699814, + "grad_norm": 0.2561332583427429, + "learning_rate": 8.19269291669406e-05, + "loss": 1.7689, + "step": 9813 + }, + { + "epoch": 3.0122774708410067, + "grad_norm": 0.2658882141113281, + "learning_rate": 8.192310371696249e-05, + "loss": 1.8497, + "step": 9814 + }, + { + "epoch": 3.012584407612032, + "grad_norm": 0.2873780429363251, + "learning_rate": 8.191927795150156e-05, + "loss": 1.8217, + "step": 9815 + }, + { + "epoch": 3.012891344383057, + "grad_norm": 0.2181183248758316, + "learning_rate": 8.191545187059562e-05, + "loss": 1.7261, + "step": 9816 + }, + { + "epoch": 3.0131982811540823, + "grad_norm": 0.2414858490228653, + "learning_rate": 8.191162547428248e-05, + "loss": 1.8035, + "step": 9817 + }, + { + "epoch": 3.0135052179251076, + "grad_norm": 0.2799840271472931, + "learning_rate": 8.190779876259995e-05, + "loss": 1.8279, + "step": 9818 + }, + { + "epoch": 3.0138121546961325, + "grad_norm": 0.2669760584831238, + "learning_rate": 
8.190397173558584e-05, + "loss": 1.8155, + "step": 9819 + }, + { + "epoch": 3.014119091467158, + "grad_norm": 0.28857991099357605, + "learning_rate": 8.1900144393278e-05, + "loss": 1.8479, + "step": 9820 + }, + { + "epoch": 3.0144260282381827, + "grad_norm": 0.30534693598747253, + "learning_rate": 8.189631673571422e-05, + "loss": 1.8609, + "step": 9821 + }, + { + "epoch": 3.014732965009208, + "grad_norm": 0.3238218128681183, + "learning_rate": 8.189248876293236e-05, + "loss": 1.9292, + "step": 9822 + }, + { + "epoch": 3.0150399017802334, + "grad_norm": 0.3000536561012268, + "learning_rate": 8.188866047497022e-05, + "loss": 1.8214, + "step": 9823 + }, + { + "epoch": 3.0153468385512583, + "grad_norm": 0.2960065007209778, + "learning_rate": 8.188483187186565e-05, + "loss": 1.8316, + "step": 9824 + }, + { + "epoch": 3.0156537753222836, + "grad_norm": 0.28609779477119446, + "learning_rate": 8.188100295365648e-05, + "loss": 1.8002, + "step": 9825 + }, + { + "epoch": 3.015960712093309, + "grad_norm": 0.31390634179115295, + "learning_rate": 8.187717372038057e-05, + "loss": 1.8134, + "step": 9826 + }, + { + "epoch": 3.016267648864334, + "grad_norm": 0.28550946712493896, + "learning_rate": 8.187334417207573e-05, + "loss": 1.8359, + "step": 9827 + }, + { + "epoch": 3.016574585635359, + "grad_norm": 0.3085210621356964, + "learning_rate": 8.186951430877982e-05, + "loss": 1.813, + "step": 9828 + }, + { + "epoch": 3.016881522406384, + "grad_norm": 0.3043847978115082, + "learning_rate": 8.18656841305307e-05, + "loss": 1.8222, + "step": 9829 + }, + { + "epoch": 3.0171884591774094, + "grad_norm": 0.32524731755256653, + "learning_rate": 8.18618536373662e-05, + "loss": 1.8258, + "step": 9830 + }, + { + "epoch": 3.0174953959484347, + "grad_norm": 0.2690991461277008, + "learning_rate": 8.18580228293242e-05, + "loss": 1.8492, + "step": 9831 + }, + { + "epoch": 3.0178023327194596, + "grad_norm": 0.34936225414276123, + "learning_rate": 8.185419170644253e-05, + "loss": 1.8363, + "step": 
9832 + }, + { + "epoch": 3.018109269490485, + "grad_norm": 0.3274296820163727, + "learning_rate": 8.185036026875908e-05, + "loss": 1.7789, + "step": 9833 + }, + { + "epoch": 3.0184162062615103, + "grad_norm": 0.2729836106300354, + "learning_rate": 8.184652851631169e-05, + "loss": 1.8264, + "step": 9834 + }, + { + "epoch": 3.018723143032535, + "grad_norm": 0.28682780265808105, + "learning_rate": 8.184269644913826e-05, + "loss": 1.8399, + "step": 9835 + }, + { + "epoch": 3.0190300798035605, + "grad_norm": 0.3224826455116272, + "learning_rate": 8.183886406727662e-05, + "loss": 1.8338, + "step": 9836 + }, + { + "epoch": 3.0193370165745854, + "grad_norm": 0.30945318937301636, + "learning_rate": 8.183503137076467e-05, + "loss": 1.8248, + "step": 9837 + }, + { + "epoch": 3.0196439533456108, + "grad_norm": 0.27580398321151733, + "learning_rate": 8.183119835964029e-05, + "loss": 1.8096, + "step": 9838 + }, + { + "epoch": 3.019950890116636, + "grad_norm": 0.28927183151245117, + "learning_rate": 8.182736503394132e-05, + "loss": 1.825, + "step": 9839 + }, + { + "epoch": 3.020257826887661, + "grad_norm": 0.253000408411026, + "learning_rate": 8.182353139370571e-05, + "loss": 1.7678, + "step": 9840 + }, + { + "epoch": 3.0205647636586863, + "grad_norm": 0.2882022559642792, + "learning_rate": 8.18196974389713e-05, + "loss": 1.8895, + "step": 9841 + }, + { + "epoch": 3.0208717004297116, + "grad_norm": 0.26864609122276306, + "learning_rate": 8.1815863169776e-05, + "loss": 1.7674, + "step": 9842 + }, + { + "epoch": 3.0211786372007365, + "grad_norm": 0.27344849705696106, + "learning_rate": 8.181202858615769e-05, + "loss": 1.8146, + "step": 9843 + }, + { + "epoch": 3.021485573971762, + "grad_norm": 0.31659772992134094, + "learning_rate": 8.180819368815425e-05, + "loss": 1.8485, + "step": 9844 + }, + { + "epoch": 3.021792510742787, + "grad_norm": 0.3163176476955414, + "learning_rate": 8.18043584758036e-05, + "loss": 1.8994, + "step": 9845 + }, + { + "epoch": 3.022099447513812, + 
"grad_norm": 0.2583829462528229, + "learning_rate": 8.180052294914365e-05, + "loss": 1.764, + "step": 9846 + }, + { + "epoch": 3.0224063842848374, + "grad_norm": 0.3006649315357208, + "learning_rate": 8.179668710821227e-05, + "loss": 1.9232, + "step": 9847 + }, + { + "epoch": 3.0227133210558623, + "grad_norm": 0.35702988505363464, + "learning_rate": 8.179285095304741e-05, + "loss": 1.8403, + "step": 9848 + }, + { + "epoch": 3.0230202578268877, + "grad_norm": 0.29699379205703735, + "learning_rate": 8.178901448368697e-05, + "loss": 1.8412, + "step": 9849 + }, + { + "epoch": 3.023327194597913, + "grad_norm": 0.3022700548171997, + "learning_rate": 8.178517770016885e-05, + "loss": 1.8197, + "step": 9850 + }, + { + "epoch": 3.023634131368938, + "grad_norm": 0.2943836748600006, + "learning_rate": 8.178134060253097e-05, + "loss": 1.8127, + "step": 9851 + }, + { + "epoch": 3.023941068139963, + "grad_norm": 0.31290489435195923, + "learning_rate": 8.177750319081126e-05, + "loss": 1.821, + "step": 9852 + }, + { + "epoch": 3.0242480049109886, + "grad_norm": 0.30308374762535095, + "learning_rate": 8.177366546504763e-05, + "loss": 1.8522, + "step": 9853 + }, + { + "epoch": 3.0245549416820134, + "grad_norm": 0.301559716463089, + "learning_rate": 8.176982742527802e-05, + "loss": 1.8758, + "step": 9854 + }, + { + "epoch": 3.0248618784530388, + "grad_norm": 0.33314836025238037, + "learning_rate": 8.176598907154034e-05, + "loss": 1.8178, + "step": 9855 + }, + { + "epoch": 3.0251688152240637, + "grad_norm": 0.3567935526371002, + "learning_rate": 8.176215040387255e-05, + "loss": 1.7847, + "step": 9856 + }, + { + "epoch": 3.025475751995089, + "grad_norm": 0.27716195583343506, + "learning_rate": 8.175831142231258e-05, + "loss": 1.772, + "step": 9857 + }, + { + "epoch": 3.0257826887661143, + "grad_norm": 0.24568212032318115, + "learning_rate": 8.175447212689836e-05, + "loss": 1.8171, + "step": 9858 + }, + { + "epoch": 3.0260896255371392, + "grad_norm": 0.25368261337280273, + 
"learning_rate": 8.175063251766784e-05, + "loss": 1.852, + "step": 9859 + }, + { + "epoch": 3.0263965623081646, + "grad_norm": 0.2509497404098511, + "learning_rate": 8.174679259465894e-05, + "loss": 1.7737, + "step": 9860 + }, + { + "epoch": 3.02670349907919, + "grad_norm": 0.3539343774318695, + "learning_rate": 8.174295235790963e-05, + "loss": 1.8663, + "step": 9861 + }, + { + "epoch": 3.027010435850215, + "grad_norm": 0.36450034379959106, + "learning_rate": 8.173911180745788e-05, + "loss": 1.8179, + "step": 9862 + }, + { + "epoch": 3.02731737262124, + "grad_norm": 0.3550017178058624, + "learning_rate": 8.173527094334162e-05, + "loss": 1.8256, + "step": 9863 + }, + { + "epoch": 3.027624309392265, + "grad_norm": 0.33518701791763306, + "learning_rate": 8.17314297655988e-05, + "loss": 1.7842, + "step": 9864 + }, + { + "epoch": 3.0279312461632903, + "grad_norm": 0.2522886097431183, + "learning_rate": 8.172758827426739e-05, + "loss": 1.7688, + "step": 9865 + }, + { + "epoch": 3.0282381829343157, + "grad_norm": 0.26222914457321167, + "learning_rate": 8.172374646938536e-05, + "loss": 1.8517, + "step": 9866 + }, + { + "epoch": 3.0285451197053406, + "grad_norm": 0.3355788588523865, + "learning_rate": 8.171990435099068e-05, + "loss": 1.9002, + "step": 9867 + }, + { + "epoch": 3.028852056476366, + "grad_norm": 0.32907500863075256, + "learning_rate": 8.171606191912131e-05, + "loss": 1.7801, + "step": 9868 + }, + { + "epoch": 3.0291589932473912, + "grad_norm": 0.29234179854393005, + "learning_rate": 8.171221917381523e-05, + "loss": 1.8055, + "step": 9869 + }, + { + "epoch": 3.029465930018416, + "grad_norm": 0.26374876499176025, + "learning_rate": 8.170837611511041e-05, + "loss": 1.781, + "step": 9870 + }, + { + "epoch": 3.0297728667894415, + "grad_norm": 0.311282217502594, + "learning_rate": 8.170453274304483e-05, + "loss": 1.839, + "step": 9871 + }, + { + "epoch": 3.0300798035604664, + "grad_norm": 0.24225831031799316, + "learning_rate": 8.170068905765648e-05, + "loss": 
1.804, + "step": 9872 + }, + { + "epoch": 3.0303867403314917, + "grad_norm": 0.29383334517478943, + "learning_rate": 8.169684505898335e-05, + "loss": 1.7817, + "step": 9873 + }, + { + "epoch": 3.030693677102517, + "grad_norm": 0.2607928514480591, + "learning_rate": 8.169300074706339e-05, + "loss": 1.8379, + "step": 9874 + }, + { + "epoch": 3.031000613873542, + "grad_norm": 0.283028244972229, + "learning_rate": 8.168915612193464e-05, + "loss": 1.7797, + "step": 9875 + }, + { + "epoch": 3.0313075506445673, + "grad_norm": 0.27675309777259827, + "learning_rate": 8.168531118363508e-05, + "loss": 1.8355, + "step": 9876 + }, + { + "epoch": 3.0316144874155926, + "grad_norm": 0.2598227262496948, + "learning_rate": 8.16814659322027e-05, + "loss": 1.7898, + "step": 9877 + }, + { + "epoch": 3.0319214241866175, + "grad_norm": 0.24715003371238708, + "learning_rate": 8.16776203676755e-05, + "loss": 1.7791, + "step": 9878 + }, + { + "epoch": 3.032228360957643, + "grad_norm": 0.2749374210834503, + "learning_rate": 8.167377449009149e-05, + "loss": 1.8303, + "step": 9879 + }, + { + "epoch": 3.0325352977286677, + "grad_norm": 0.26150834560394287, + "learning_rate": 8.166992829948868e-05, + "loss": 1.8462, + "step": 9880 + }, + { + "epoch": 3.032842234499693, + "grad_norm": 0.3044755160808563, + "learning_rate": 8.166608179590506e-05, + "loss": 1.806, + "step": 9881 + }, + { + "epoch": 3.0331491712707184, + "grad_norm": 0.2949555516242981, + "learning_rate": 8.166223497937868e-05, + "loss": 1.8785, + "step": 9882 + }, + { + "epoch": 3.0334561080417433, + "grad_norm": 0.33206698298454285, + "learning_rate": 8.165838784994752e-05, + "loss": 1.8476, + "step": 9883 + }, + { + "epoch": 3.0337630448127686, + "grad_norm": 0.2720400094985962, + "learning_rate": 8.165454040764962e-05, + "loss": 1.843, + "step": 9884 + }, + { + "epoch": 3.034069981583794, + "grad_norm": 0.29340869188308716, + "learning_rate": 8.1650692652523e-05, + "loss": 1.7761, + "step": 9885 + }, + { + "epoch": 
3.034376918354819, + "grad_norm": 0.35155293345451355, + "learning_rate": 8.16468445846057e-05, + "loss": 1.8887, + "step": 9886 + }, + { + "epoch": 3.034683855125844, + "grad_norm": 0.2688990831375122, + "learning_rate": 8.164299620393571e-05, + "loss": 1.8001, + "step": 9887 + }, + { + "epoch": 3.034990791896869, + "grad_norm": 0.2921253442764282, + "learning_rate": 8.16391475105511e-05, + "loss": 1.7951, + "step": 9888 + }, + { + "epoch": 3.0352977286678944, + "grad_norm": 0.28100699186325073, + "learning_rate": 8.163529850448988e-05, + "loss": 1.8041, + "step": 9889 + }, + { + "epoch": 3.0356046654389197, + "grad_norm": 0.3155081868171692, + "learning_rate": 8.16314491857901e-05, + "loss": 1.8026, + "step": 9890 + }, + { + "epoch": 3.0359116022099446, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.16275995544898e-05, + "loss": 1.8502, + "step": 9891 + }, + { + "epoch": 3.03621853898097, + "grad_norm": 0.2732076644897461, + "learning_rate": 8.162374961062704e-05, + "loss": 1.8424, + "step": 9892 + }, + { + "epoch": 3.0365254757519953, + "grad_norm": 0.2943679690361023, + "learning_rate": 8.161989935423984e-05, + "loss": 1.7635, + "step": 9893 + }, + { + "epoch": 3.03683241252302, + "grad_norm": 0.28894683718681335, + "learning_rate": 8.161604878536626e-05, + "loss": 1.78, + "step": 9894 + }, + { + "epoch": 3.0371393492940455, + "grad_norm": 0.2718082666397095, + "learning_rate": 8.161219790404435e-05, + "loss": 1.7664, + "step": 9895 + }, + { + "epoch": 3.0374462860650704, + "grad_norm": 0.29092124104499817, + "learning_rate": 8.160834671031216e-05, + "loss": 1.8621, + "step": 9896 + }, + { + "epoch": 3.0377532228360957, + "grad_norm": 0.284665584564209, + "learning_rate": 8.160449520420779e-05, + "loss": 1.8607, + "step": 9897 + }, + { + "epoch": 3.038060159607121, + "grad_norm": 0.23676982522010803, + "learning_rate": 8.160064338576925e-05, + "loss": 1.7137, + "step": 9898 + }, + { + "epoch": 3.038367096378146, + "grad_norm": 0.2666932940483093, + 
"learning_rate": 8.159679125503466e-05, + "loss": 1.8038, + "step": 9899 + }, + { + "epoch": 3.0386740331491713, + "grad_norm": 0.36214375495910645, + "learning_rate": 8.159293881204204e-05, + "loss": 1.8902, + "step": 9900 + }, + { + "epoch": 3.0389809699201966, + "grad_norm": 0.30301332473754883, + "learning_rate": 8.158908605682948e-05, + "loss": 1.8456, + "step": 9901 + }, + { + "epoch": 3.0392879066912215, + "grad_norm": 0.32190418243408203, + "learning_rate": 8.158523298943506e-05, + "loss": 1.8246, + "step": 9902 + }, + { + "epoch": 3.039594843462247, + "grad_norm": 0.2938043475151062, + "learning_rate": 8.158137960989685e-05, + "loss": 1.8324, + "step": 9903 + }, + { + "epoch": 3.0399017802332717, + "grad_norm": 0.29493969678878784, + "learning_rate": 8.157752591825294e-05, + "loss": 1.8458, + "step": 9904 + }, + { + "epoch": 3.040208717004297, + "grad_norm": 0.2681889832019806, + "learning_rate": 8.157367191454141e-05, + "loss": 1.889, + "step": 9905 + }, + { + "epoch": 3.0405156537753224, + "grad_norm": 0.3111969232559204, + "learning_rate": 8.156981759880035e-05, + "loss": 1.8966, + "step": 9906 + }, + { + "epoch": 3.0408225905463473, + "grad_norm": 0.345262736082077, + "learning_rate": 8.156596297106784e-05, + "loss": 1.8174, + "step": 9907 + }, + { + "epoch": 3.0411295273173726, + "grad_norm": 0.30156534910202026, + "learning_rate": 8.156210803138199e-05, + "loss": 1.766, + "step": 9908 + }, + { + "epoch": 3.041436464088398, + "grad_norm": 0.28691565990448, + "learning_rate": 8.15582527797809e-05, + "loss": 1.8436, + "step": 9909 + }, + { + "epoch": 3.041743400859423, + "grad_norm": 0.33418282866477966, + "learning_rate": 8.155439721630264e-05, + "loss": 1.8939, + "step": 9910 + }, + { + "epoch": 3.042050337630448, + "grad_norm": 0.25496938824653625, + "learning_rate": 8.155054134098535e-05, + "loss": 1.8368, + "step": 9911 + }, + { + "epoch": 3.042357274401473, + "grad_norm": 0.3806788921356201, + "learning_rate": 8.154668515386711e-05, + "loss": 
1.8635, + "step": 9912 + }, + { + "epoch": 3.0426642111724984, + "grad_norm": 0.42668119072914124, + "learning_rate": 8.154282865498603e-05, + "loss": 1.76, + "step": 9913 + }, + { + "epoch": 3.0429711479435237, + "grad_norm": 0.35945314168930054, + "learning_rate": 8.153897184438024e-05, + "loss": 1.8275, + "step": 9914 + }, + { + "epoch": 3.0432780847145486, + "grad_norm": 0.3225449323654175, + "learning_rate": 8.153511472208784e-05, + "loss": 1.7901, + "step": 9915 + }, + { + "epoch": 3.043585021485574, + "grad_norm": 0.2905425727367401, + "learning_rate": 8.153125728814694e-05, + "loss": 1.8021, + "step": 9916 + }, + { + "epoch": 3.0438919582565993, + "grad_norm": 0.3315529525279999, + "learning_rate": 8.15273995425957e-05, + "loss": 1.8003, + "step": 9917 + }, + { + "epoch": 3.044198895027624, + "grad_norm": 0.30256444215774536, + "learning_rate": 8.152354148547221e-05, + "loss": 1.8243, + "step": 9918 + }, + { + "epoch": 3.0445058317986495, + "grad_norm": 0.2563035190105438, + "learning_rate": 8.15196831168146e-05, + "loss": 1.7877, + "step": 9919 + }, + { + "epoch": 3.044812768569675, + "grad_norm": 0.25705814361572266, + "learning_rate": 8.151582443666101e-05, + "loss": 1.813, + "step": 9920 + }, + { + "epoch": 3.0451197053406998, + "grad_norm": 0.3649071455001831, + "learning_rate": 8.151196544504957e-05, + "loss": 1.8114, + "step": 9921 + }, + { + "epoch": 3.045426642111725, + "grad_norm": 0.4076193571090698, + "learning_rate": 8.150810614201841e-05, + "loss": 1.7869, + "step": 9922 + }, + { + "epoch": 3.04573357888275, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.150424652760569e-05, + "loss": 1.7878, + "step": 9923 + }, + { + "epoch": 3.0460405156537753, + "grad_norm": 0.2243243157863617, + "learning_rate": 8.150038660184955e-05, + "loss": 1.8224, + "step": 9924 + }, + { + "epoch": 3.0463474524248007, + "grad_norm": 0.3295031487941742, + "learning_rate": 8.149652636478811e-05, + "loss": 1.8685, + "step": 9925 + }, + { + "epoch": 
3.0466543891958255, + "grad_norm": 0.2973531186580658, + "learning_rate": 8.149266581645954e-05, + "loss": 1.8082, + "step": 9926 + }, + { + "epoch": 3.046961325966851, + "grad_norm": 0.25648918747901917, + "learning_rate": 8.148880495690199e-05, + "loss": 1.8089, + "step": 9927 + }, + { + "epoch": 3.047268262737876, + "grad_norm": 0.2845752537250519, + "learning_rate": 8.148494378615361e-05, + "loss": 1.8726, + "step": 9928 + }, + { + "epoch": 3.047575199508901, + "grad_norm": 0.2917105555534363, + "learning_rate": 8.148108230425255e-05, + "loss": 1.8035, + "step": 9929 + }, + { + "epoch": 3.0478821362799264, + "grad_norm": 0.2775834798812866, + "learning_rate": 8.1477220511237e-05, + "loss": 1.8545, + "step": 9930 + }, + { + "epoch": 3.0481890730509513, + "grad_norm": 0.3522767424583435, + "learning_rate": 8.14733584071451e-05, + "loss": 1.8261, + "step": 9931 + }, + { + "epoch": 3.0484960098219767, + "grad_norm": 0.3759000599384308, + "learning_rate": 8.146949599201503e-05, + "loss": 1.8405, + "step": 9932 + }, + { + "epoch": 3.048802946593002, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.146563326588496e-05, + "loss": 1.7762, + "step": 9933 + }, + { + "epoch": 3.049109883364027, + "grad_norm": 0.263810932636261, + "learning_rate": 8.146177022879304e-05, + "loss": 1.7546, + "step": 9934 + }, + { + "epoch": 3.049416820135052, + "grad_norm": 0.24064256250858307, + "learning_rate": 8.14579068807775e-05, + "loss": 1.7903, + "step": 9935 + }, + { + "epoch": 3.0497237569060776, + "grad_norm": 0.3144194781780243, + "learning_rate": 8.145404322187645e-05, + "loss": 1.8011, + "step": 9936 + }, + { + "epoch": 3.0500306936771024, + "grad_norm": 0.3362879455089569, + "learning_rate": 8.145017925212812e-05, + "loss": 1.8224, + "step": 9937 + }, + { + "epoch": 3.050337630448128, + "grad_norm": 0.33979395031929016, + "learning_rate": 8.144631497157071e-05, + "loss": 1.8415, + "step": 9938 + }, + { + "epoch": 3.0506445672191527, + "grad_norm": 0.33391237258911133, + 
"learning_rate": 8.144245038024235e-05, + "loss": 1.7983, + "step": 9939 + }, + { + "epoch": 3.050951503990178, + "grad_norm": 0.34034964442253113, + "learning_rate": 8.143858547818128e-05, + "loss": 1.8635, + "step": 9940 + }, + { + "epoch": 3.0512584407612033, + "grad_norm": 0.3472529947757721, + "learning_rate": 8.143472026542569e-05, + "loss": 1.8067, + "step": 9941 + }, + { + "epoch": 3.0515653775322282, + "grad_norm": 0.3369109630584717, + "learning_rate": 8.143085474201376e-05, + "loss": 1.7933, + "step": 9942 + }, + { + "epoch": 3.0518723143032536, + "grad_norm": 0.3055182993412018, + "learning_rate": 8.14269889079837e-05, + "loss": 1.7358, + "step": 9943 + }, + { + "epoch": 3.052179251074279, + "grad_norm": 0.26729708909988403, + "learning_rate": 8.142312276337372e-05, + "loss": 1.8315, + "step": 9944 + }, + { + "epoch": 3.052486187845304, + "grad_norm": 0.3626720607280731, + "learning_rate": 8.141925630822203e-05, + "loss": 1.7593, + "step": 9945 + }, + { + "epoch": 3.052793124616329, + "grad_norm": 0.3673512637615204, + "learning_rate": 8.141538954256683e-05, + "loss": 1.8414, + "step": 9946 + }, + { + "epoch": 3.053100061387354, + "grad_norm": 0.30554768443107605, + "learning_rate": 8.141152246644632e-05, + "loss": 1.7504, + "step": 9947 + }, + { + "epoch": 3.0534069981583793, + "grad_norm": 0.41163405776023865, + "learning_rate": 8.140765507989875e-05, + "loss": 1.8794, + "step": 9948 + }, + { + "epoch": 3.0537139349294047, + "grad_norm": 0.592751145362854, + "learning_rate": 8.140378738296233e-05, + "loss": 1.8538, + "step": 9949 + }, + { + "epoch": 3.0540208717004296, + "grad_norm": 0.483828604221344, + "learning_rate": 8.139991937567527e-05, + "loss": 1.7952, + "step": 9950 + }, + { + "epoch": 3.054327808471455, + "grad_norm": 0.26665306091308594, + "learning_rate": 8.13960510580758e-05, + "loss": 1.8268, + "step": 9951 + }, + { + "epoch": 3.0546347452424802, + "grad_norm": 0.42917072772979736, + "learning_rate": 8.139218243020215e-05, + "loss": 
1.843, + "step": 9952 + }, + { + "epoch": 3.054941682013505, + "grad_norm": 0.47911396622657776, + "learning_rate": 8.138831349209256e-05, + "loss": 1.8223, + "step": 9953 + }, + { + "epoch": 3.0552486187845305, + "grad_norm": 0.4540431797504425, + "learning_rate": 8.138444424378524e-05, + "loss": 1.9198, + "step": 9954 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.29719051718711853, + "learning_rate": 8.138057468531845e-05, + "loss": 1.7873, + "step": 9955 + }, + { + "epoch": 3.0558624923265807, + "grad_norm": 0.35133618116378784, + "learning_rate": 8.137670481673045e-05, + "loss": 1.8459, + "step": 9956 + }, + { + "epoch": 3.056169429097606, + "grad_norm": 0.42896488308906555, + "learning_rate": 8.137283463805945e-05, + "loss": 1.7814, + "step": 9957 + }, + { + "epoch": 3.056476365868631, + "grad_norm": 0.38993972539901733, + "learning_rate": 8.136896414934372e-05, + "loss": 1.7636, + "step": 9958 + }, + { + "epoch": 3.0567833026396563, + "grad_norm": 0.31362372636795044, + "learning_rate": 8.13650933506215e-05, + "loss": 1.8021, + "step": 9959 + }, + { + "epoch": 3.0570902394106816, + "grad_norm": 0.27980196475982666, + "learning_rate": 8.136122224193103e-05, + "loss": 1.8445, + "step": 9960 + }, + { + "epoch": 3.0573971761817065, + "grad_norm": 0.2721461057662964, + "learning_rate": 8.135735082331059e-05, + "loss": 1.7614, + "step": 9961 + }, + { + "epoch": 3.057704112952732, + "grad_norm": 0.25157424807548523, + "learning_rate": 8.135347909479843e-05, + "loss": 1.7598, + "step": 9962 + }, + { + "epoch": 3.0580110497237567, + "grad_norm": 0.25798025727272034, + "learning_rate": 8.13496070564328e-05, + "loss": 1.7823, + "step": 9963 + }, + { + "epoch": 3.058317986494782, + "grad_norm": 0.30775198340415955, + "learning_rate": 8.134573470825199e-05, + "loss": 1.7755, + "step": 9964 + }, + { + "epoch": 3.0586249232658074, + "grad_norm": 0.28916797041893005, + "learning_rate": 8.134186205029426e-05, + "loss": 1.8189, + "step": 9965 + }, + { + "epoch": 
3.0589318600368323, + "grad_norm": 0.2829149067401886, + "learning_rate": 8.133798908259787e-05, + "loss": 1.8546, + "step": 9966 + }, + { + "epoch": 3.0592387968078576, + "grad_norm": 0.2884117662906647, + "learning_rate": 8.13341158052011e-05, + "loss": 1.7705, + "step": 9967 + }, + { + "epoch": 3.059545733578883, + "grad_norm": 0.28311973810195923, + "learning_rate": 8.133024221814225e-05, + "loss": 1.8147, + "step": 9968 + }, + { + "epoch": 3.059852670349908, + "grad_norm": 0.25405213236808777, + "learning_rate": 8.132636832145957e-05, + "loss": 1.7813, + "step": 9969 + }, + { + "epoch": 3.060159607120933, + "grad_norm": 0.3082229793071747, + "learning_rate": 8.132249411519137e-05, + "loss": 1.8536, + "step": 9970 + }, + { + "epoch": 3.060466543891958, + "grad_norm": 0.29918181896209717, + "learning_rate": 8.13186195993759e-05, + "loss": 1.8181, + "step": 9971 + }, + { + "epoch": 3.0607734806629834, + "grad_norm": 0.3025238811969757, + "learning_rate": 8.13147447740515e-05, + "loss": 1.7785, + "step": 9972 + }, + { + "epoch": 3.0610804174340087, + "grad_norm": 0.2798222303390503, + "learning_rate": 8.131086963925643e-05, + "loss": 1.7873, + "step": 9973 + }, + { + "epoch": 3.0613873542050336, + "grad_norm": 0.32636210322380066, + "learning_rate": 8.130699419502898e-05, + "loss": 1.882, + "step": 9974 + }, + { + "epoch": 3.061694290976059, + "grad_norm": 0.27722054719924927, + "learning_rate": 8.130311844140748e-05, + "loss": 1.7788, + "step": 9975 + }, + { + "epoch": 3.0620012277470843, + "grad_norm": 0.289156436920166, + "learning_rate": 8.129924237843023e-05, + "loss": 1.8591, + "step": 9976 + }, + { + "epoch": 3.062308164518109, + "grad_norm": 0.2839665412902832, + "learning_rate": 8.12953660061355e-05, + "loss": 1.8255, + "step": 9977 + }, + { + "epoch": 3.0626151012891345, + "grad_norm": 0.2650148272514343, + "learning_rate": 8.129148932456161e-05, + "loss": 1.8353, + "step": 9978 + }, + { + "epoch": 3.06292203806016, + "grad_norm": 0.2884560227394104, + 
"learning_rate": 8.128761233374691e-05, + "loss": 1.8099, + "step": 9979 + }, + { + "epoch": 3.0632289748311847, + "grad_norm": 0.2610029876232147, + "learning_rate": 8.128373503372967e-05, + "loss": 1.8173, + "step": 9980 + }, + { + "epoch": 3.06353591160221, + "grad_norm": 0.32512393593788147, + "learning_rate": 8.127985742454822e-05, + "loss": 1.8619, + "step": 9981 + }, + { + "epoch": 3.063842848373235, + "grad_norm": 0.3382968604564667, + "learning_rate": 8.127597950624091e-05, + "loss": 1.831, + "step": 9982 + }, + { + "epoch": 3.0641497851442603, + "grad_norm": 0.33773133158683777, + "learning_rate": 8.127210127884602e-05, + "loss": 1.8194, + "step": 9983 + }, + { + "epoch": 3.0644567219152856, + "grad_norm": 0.31642746925354004, + "learning_rate": 8.126822274240188e-05, + "loss": 1.8782, + "step": 9984 + }, + { + "epoch": 3.0647636586863105, + "grad_norm": 0.2476506233215332, + "learning_rate": 8.126434389694686e-05, + "loss": 1.7866, + "step": 9985 + }, + { + "epoch": 3.065070595457336, + "grad_norm": 0.27296319603919983, + "learning_rate": 8.126046474251927e-05, + "loss": 1.8276, + "step": 9986 + }, + { + "epoch": 3.0653775322283607, + "grad_norm": 0.353865385055542, + "learning_rate": 8.125658527915744e-05, + "loss": 1.9525, + "step": 9987 + }, + { + "epoch": 3.065684468999386, + "grad_norm": 0.370256632566452, + "learning_rate": 8.12527055068997e-05, + "loss": 1.8514, + "step": 9988 + }, + { + "epoch": 3.0659914057704114, + "grad_norm": 0.30738842487335205, + "learning_rate": 8.124882542578442e-05, + "loss": 1.8125, + "step": 9989 + }, + { + "epoch": 3.0662983425414363, + "grad_norm": 0.3151233494281769, + "learning_rate": 8.124494503584995e-05, + "loss": 1.8165, + "step": 9990 + }, + { + "epoch": 3.0666052793124616, + "grad_norm": 0.29071590304374695, + "learning_rate": 8.124106433713458e-05, + "loss": 1.7617, + "step": 9991 + }, + { + "epoch": 3.066912216083487, + "grad_norm": 0.2898697853088379, + "learning_rate": 8.123718332967672e-05, + "loss": 
1.7779, + "step": 9992 + }, + { + "epoch": 3.067219152854512, + "grad_norm": 0.26601701974868774, + "learning_rate": 8.123330201351471e-05, + "loss": 1.8307, + "step": 9993 + }, + { + "epoch": 3.067526089625537, + "grad_norm": 0.2622119188308716, + "learning_rate": 8.12294203886869e-05, + "loss": 1.7958, + "step": 9994 + }, + { + "epoch": 3.0678330263965625, + "grad_norm": 0.29709386825561523, + "learning_rate": 8.122553845523166e-05, + "loss": 1.7799, + "step": 9995 + }, + { + "epoch": 3.0681399631675874, + "grad_norm": 0.31267789006233215, + "learning_rate": 8.122165621318733e-05, + "loss": 1.8149, + "step": 9996 + }, + { + "epoch": 3.0684468999386127, + "grad_norm": 0.3076523244380951, + "learning_rate": 8.121777366259232e-05, + "loss": 1.7701, + "step": 9997 + }, + { + "epoch": 3.0687538367096376, + "grad_norm": 0.30096009373664856, + "learning_rate": 8.121389080348496e-05, + "loss": 1.8323, + "step": 9998 + }, + { + "epoch": 3.069060773480663, + "grad_norm": 0.25739142298698425, + "learning_rate": 8.121000763590363e-05, + "loss": 1.8105, + "step": 9999 + }, + { + "epoch": 3.0693677102516883, + "grad_norm": 0.2780844271183014, + "learning_rate": 8.120612415988671e-05, + "loss": 1.8502, + "step": 10000 + }, + { + "epoch": 3.069674647022713, + "grad_norm": 0.3316378593444824, + "learning_rate": 8.120224037547259e-05, + "loss": 1.8244, + "step": 10001 + }, + { + "epoch": 3.0699815837937385, + "grad_norm": 0.261129766702652, + "learning_rate": 8.119835628269964e-05, + "loss": 1.7769, + "step": 10002 + }, + { + "epoch": 3.070288520564764, + "grad_norm": 0.29213985800743103, + "learning_rate": 8.119447188160625e-05, + "loss": 1.7717, + "step": 10003 + }, + { + "epoch": 3.0705954573357888, + "grad_norm": 0.38545623421669006, + "learning_rate": 8.11905871722308e-05, + "loss": 1.8433, + "step": 10004 + }, + { + "epoch": 3.070902394106814, + "grad_norm": 0.3617223799228668, + "learning_rate": 8.118670215461168e-05, + "loss": 1.8172, + "step": 10005 + }, + { + "epoch": 
3.071209330877839, + "grad_norm": 0.3241543769836426, + "learning_rate": 8.11828168287873e-05, + "loss": 1.8325, + "step": 10006 + }, + { + "epoch": 3.0715162676488643, + "grad_norm": 0.3538578152656555, + "learning_rate": 8.117893119479605e-05, + "loss": 1.8188, + "step": 10007 + }, + { + "epoch": 3.0718232044198897, + "grad_norm": 0.3861970603466034, + "learning_rate": 8.117504525267632e-05, + "loss": 1.8518, + "step": 10008 + }, + { + "epoch": 3.0721301411909145, + "grad_norm": 0.35433146357536316, + "learning_rate": 8.117115900246652e-05, + "loss": 1.8601, + "step": 10009 + }, + { + "epoch": 3.07243707796194, + "grad_norm": 0.29796987771987915, + "learning_rate": 8.116727244420507e-05, + "loss": 1.7934, + "step": 10010 + }, + { + "epoch": 3.072744014732965, + "grad_norm": 0.3091779947280884, + "learning_rate": 8.116338557793035e-05, + "loss": 1.8111, + "step": 10011 + }, + { + "epoch": 3.07305095150399, + "grad_norm": 0.2741319537162781, + "learning_rate": 8.11594984036808e-05, + "loss": 1.8079, + "step": 10012 + }, + { + "epoch": 3.0733578882750154, + "grad_norm": 0.28905320167541504, + "learning_rate": 8.115561092149482e-05, + "loss": 1.8475, + "step": 10013 + }, + { + "epoch": 3.0736648250460403, + "grad_norm": 0.2897081673145294, + "learning_rate": 8.115172313141081e-05, + "loss": 1.838, + "step": 10014 + }, + { + "epoch": 3.0739717618170657, + "grad_norm": 0.2620783746242523, + "learning_rate": 8.114783503346725e-05, + "loss": 1.8024, + "step": 10015 + }, + { + "epoch": 3.074278698588091, + "grad_norm": 0.26478636264801025, + "learning_rate": 8.11439466277025e-05, + "loss": 1.8137, + "step": 10016 + }, + { + "epoch": 3.074585635359116, + "grad_norm": 0.2796174883842468, + "learning_rate": 8.114005791415502e-05, + "loss": 1.7976, + "step": 10017 + }, + { + "epoch": 3.074892572130141, + "grad_norm": 0.26813286542892456, + "learning_rate": 8.113616889286325e-05, + "loss": 1.7945, + "step": 10018 + }, + { + "epoch": 3.0751995089011666, + "grad_norm": 
0.2443828582763672, + "learning_rate": 8.11322795638656e-05, + "loss": 1.7829, + "step": 10019 + }, + { + "epoch": 3.0755064456721914, + "grad_norm": 0.2981395423412323, + "learning_rate": 8.112838992720053e-05, + "loss": 1.7928, + "step": 10020 + }, + { + "epoch": 3.075813382443217, + "grad_norm": 0.25605037808418274, + "learning_rate": 8.112449998290644e-05, + "loss": 1.8129, + "step": 10021 + }, + { + "epoch": 3.0761203192142417, + "grad_norm": 0.31180307269096375, + "learning_rate": 8.112060973102181e-05, + "loss": 1.7393, + "step": 10022 + }, + { + "epoch": 3.076427255985267, + "grad_norm": 0.3230421543121338, + "learning_rate": 8.111671917158508e-05, + "loss": 1.818, + "step": 10023 + }, + { + "epoch": 3.0767341927562923, + "grad_norm": 0.3158549964427948, + "learning_rate": 8.111282830463468e-05, + "loss": 1.7582, + "step": 10024 + }, + { + "epoch": 3.0770411295273172, + "grad_norm": 0.24524325132369995, + "learning_rate": 8.110893713020908e-05, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 3.0773480662983426, + "grad_norm": 0.2793932259082794, + "learning_rate": 8.110504564834675e-05, + "loss": 1.8551, + "step": 10026 + }, + { + "epoch": 3.077655003069368, + "grad_norm": 0.29629403352737427, + "learning_rate": 8.110115385908612e-05, + "loss": 1.8019, + "step": 10027 + }, + { + "epoch": 3.077961939840393, + "grad_norm": 0.3138490915298462, + "learning_rate": 8.109726176246564e-05, + "loss": 1.8436, + "step": 10028 + }, + { + "epoch": 3.078268876611418, + "grad_norm": 0.29802024364471436, + "learning_rate": 8.10933693585238e-05, + "loss": 1.8158, + "step": 10029 + }, + { + "epoch": 3.078575813382443, + "grad_norm": 0.30785220861434937, + "learning_rate": 8.108947664729907e-05, + "loss": 1.8674, + "step": 10030 + }, + { + "epoch": 3.0788827501534684, + "grad_norm": 0.277662992477417, + "learning_rate": 8.10855836288299e-05, + "loss": 1.8253, + "step": 10031 + }, + { + "epoch": 3.0791896869244937, + "grad_norm": 0.27399590611457825, + "learning_rate": 
8.108169030315477e-05, + "loss": 1.8587, + "step": 10032 + }, + { + "epoch": 3.0794966236955186, + "grad_norm": 0.28398239612579346, + "learning_rate": 8.107779667031217e-05, + "loss": 1.8326, + "step": 10033 + }, + { + "epoch": 3.079803560466544, + "grad_norm": 0.2882741093635559, + "learning_rate": 8.107390273034057e-05, + "loss": 1.785, + "step": 10034 + }, + { + "epoch": 3.0801104972375692, + "grad_norm": 0.271043598651886, + "learning_rate": 8.107000848327843e-05, + "loss": 1.765, + "step": 10035 + }, + { + "epoch": 3.080417434008594, + "grad_norm": 0.2589638829231262, + "learning_rate": 8.106611392916427e-05, + "loss": 1.8136, + "step": 10036 + }, + { + "epoch": 3.0807243707796195, + "grad_norm": 0.3068227469921112, + "learning_rate": 8.106221906803656e-05, + "loss": 1.8034, + "step": 10037 + }, + { + "epoch": 3.0810313075506444, + "grad_norm": 0.2714168131351471, + "learning_rate": 8.105832389993379e-05, + "loss": 1.8007, + "step": 10038 + }, + { + "epoch": 3.0813382443216697, + "grad_norm": 0.2747504711151123, + "learning_rate": 8.105442842489447e-05, + "loss": 1.8135, + "step": 10039 + }, + { + "epoch": 3.081645181092695, + "grad_norm": 0.2719285488128662, + "learning_rate": 8.105053264295708e-05, + "loss": 1.7629, + "step": 10040 + }, + { + "epoch": 3.08195211786372, + "grad_norm": 0.3119582235813141, + "learning_rate": 8.104663655416014e-05, + "loss": 1.7887, + "step": 10041 + }, + { + "epoch": 3.0822590546347453, + "grad_norm": 0.35965192317962646, + "learning_rate": 8.104274015854212e-05, + "loss": 1.8484, + "step": 10042 + }, + { + "epoch": 3.0825659914057706, + "grad_norm": 0.3045980632305145, + "learning_rate": 8.103884345614157e-05, + "loss": 1.8625, + "step": 10043 + }, + { + "epoch": 3.0828729281767955, + "grad_norm": 0.2925138473510742, + "learning_rate": 8.103494644699696e-05, + "loss": 1.9306, + "step": 10044 + }, + { + "epoch": 3.083179864947821, + "grad_norm": 0.2894277274608612, + "learning_rate": 8.103104913114681e-05, + "loss": 1.7796, + 
"step": 10045 + }, + { + "epoch": 3.0834868017188457, + "grad_norm": 0.2776826322078705, + "learning_rate": 8.102715150862967e-05, + "loss": 1.8169, + "step": 10046 + }, + { + "epoch": 3.083793738489871, + "grad_norm": 0.3315230906009674, + "learning_rate": 8.102325357948402e-05, + "loss": 1.8139, + "step": 10047 + }, + { + "epoch": 3.0841006752608964, + "grad_norm": 0.2906761169433594, + "learning_rate": 8.10193553437484e-05, + "loss": 1.8162, + "step": 10048 + }, + { + "epoch": 3.0844076120319213, + "grad_norm": 0.32681339979171753, + "learning_rate": 8.101545680146132e-05, + "loss": 1.8245, + "step": 10049 + }, + { + "epoch": 3.0847145488029466, + "grad_norm": 0.32525795698165894, + "learning_rate": 8.101155795266131e-05, + "loss": 1.8605, + "step": 10050 + }, + { + "epoch": 3.085021485573972, + "grad_norm": 0.31705379486083984, + "learning_rate": 8.100765879738692e-05, + "loss": 1.8214, + "step": 10051 + }, + { + "epoch": 3.085328422344997, + "grad_norm": 0.27772918343544006, + "learning_rate": 8.100375933567668e-05, + "loss": 1.7822, + "step": 10052 + }, + { + "epoch": 3.085635359116022, + "grad_norm": 0.2877809405326843, + "learning_rate": 8.09998595675691e-05, + "loss": 1.7935, + "step": 10053 + }, + { + "epoch": 3.0859422958870475, + "grad_norm": 0.29759806394577026, + "learning_rate": 8.099595949310276e-05, + "loss": 1.8041, + "step": 10054 + }, + { + "epoch": 3.0862492326580724, + "grad_norm": 0.2715320289134979, + "learning_rate": 8.099205911231617e-05, + "loss": 1.7923, + "step": 10055 + }, + { + "epoch": 3.0865561694290977, + "grad_norm": 0.33566340804100037, + "learning_rate": 8.098815842524789e-05, + "loss": 1.7953, + "step": 10056 + }, + { + "epoch": 3.0868631062001226, + "grad_norm": 0.3360871970653534, + "learning_rate": 8.098425743193645e-05, + "loss": 1.8275, + "step": 10057 + }, + { + "epoch": 3.087170042971148, + "grad_norm": 0.2797739803791046, + "learning_rate": 8.098035613242043e-05, + "loss": 1.7597, + "step": 10058 + }, + { + "epoch": 
3.0874769797421733, + "grad_norm": 0.25500187277793884, + "learning_rate": 8.097645452673837e-05, + "loss": 1.8059, + "step": 10059 + }, + { + "epoch": 3.087783916513198, + "grad_norm": 0.28042587637901306, + "learning_rate": 8.097255261492884e-05, + "loss": 1.7954, + "step": 10060 + }, + { + "epoch": 3.0880908532842235, + "grad_norm": 0.3616262376308441, + "learning_rate": 8.096865039703038e-05, + "loss": 1.8605, + "step": 10061 + }, + { + "epoch": 3.0883977900552484, + "grad_norm": 0.3453714847564697, + "learning_rate": 8.096474787308157e-05, + "loss": 1.7643, + "step": 10062 + }, + { + "epoch": 3.0887047268262737, + "grad_norm": 0.3192278742790222, + "learning_rate": 8.096084504312098e-05, + "loss": 1.8415, + "step": 10063 + }, + { + "epoch": 3.089011663597299, + "grad_norm": 0.2714482545852661, + "learning_rate": 8.095694190718715e-05, + "loss": 1.8204, + "step": 10064 + }, + { + "epoch": 3.089318600368324, + "grad_norm": 0.26562005281448364, + "learning_rate": 8.09530384653187e-05, + "loss": 1.7322, + "step": 10065 + }, + { + "epoch": 3.0896255371393493, + "grad_norm": 0.33727800846099854, + "learning_rate": 8.094913471755417e-05, + "loss": 1.8221, + "step": 10066 + }, + { + "epoch": 3.0899324739103746, + "grad_norm": 0.3561044931411743, + "learning_rate": 8.094523066393215e-05, + "loss": 1.8879, + "step": 10067 + }, + { + "epoch": 3.0902394106813995, + "grad_norm": 0.2568742334842682, + "learning_rate": 8.094132630449122e-05, + "loss": 1.8178, + "step": 10068 + }, + { + "epoch": 3.090546347452425, + "grad_norm": 0.4025525450706482, + "learning_rate": 8.093742163926998e-05, + "loss": 1.8186, + "step": 10069 + }, + { + "epoch": 3.09085328422345, + "grad_norm": 0.43863433599472046, + "learning_rate": 8.0933516668307e-05, + "loss": 1.8371, + "step": 10070 + }, + { + "epoch": 3.091160220994475, + "grad_norm": 0.34873950481414795, + "learning_rate": 8.092961139164087e-05, + "loss": 1.8083, + "step": 10071 + }, + { + "epoch": 3.0914671577655004, + "grad_norm": 
0.31433534622192383, + "learning_rate": 8.092570580931021e-05, + "loss": 1.8154, + "step": 10072 + }, + { + "epoch": 3.0917740945365253, + "grad_norm": 0.25523966550827026, + "learning_rate": 8.092179992135358e-05, + "loss": 1.8158, + "step": 10073 + }, + { + "epoch": 3.0920810313075506, + "grad_norm": 0.348469078540802, + "learning_rate": 8.09178937278096e-05, + "loss": 1.8358, + "step": 10074 + }, + { + "epoch": 3.092387968078576, + "grad_norm": 0.33455297350883484, + "learning_rate": 8.091398722871688e-05, + "loss": 1.7779, + "step": 10075 + }, + { + "epoch": 3.092694904849601, + "grad_norm": 0.36544880270957947, + "learning_rate": 8.091008042411403e-05, + "loss": 1.9186, + "step": 10076 + }, + { + "epoch": 3.093001841620626, + "grad_norm": 0.29165831208229065, + "learning_rate": 8.090617331403965e-05, + "loss": 1.8964, + "step": 10077 + }, + { + "epoch": 3.0933087783916515, + "grad_norm": 0.31011059880256653, + "learning_rate": 8.090226589853234e-05, + "loss": 1.8453, + "step": 10078 + }, + { + "epoch": 3.0936157151626764, + "grad_norm": 0.2835703492164612, + "learning_rate": 8.089835817763071e-05, + "loss": 1.7718, + "step": 10079 + }, + { + "epoch": 3.0939226519337018, + "grad_norm": 0.2910583019256592, + "learning_rate": 8.08944501513734e-05, + "loss": 1.7881, + "step": 10080 + }, + { + "epoch": 3.0942295887047266, + "grad_norm": 0.391303688287735, + "learning_rate": 8.089054181979905e-05, + "loss": 1.7915, + "step": 10081 + }, + { + "epoch": 3.094536525475752, + "grad_norm": 0.4119330048561096, + "learning_rate": 8.088663318294623e-05, + "loss": 1.7975, + "step": 10082 + }, + { + "epoch": 3.0948434622467773, + "grad_norm": 0.2980102002620697, + "learning_rate": 8.088272424085361e-05, + "loss": 1.805, + "step": 10083 + }, + { + "epoch": 3.095150399017802, + "grad_norm": 0.3089980483055115, + "learning_rate": 8.087881499355983e-05, + "loss": 1.8265, + "step": 10084 + }, + { + "epoch": 3.0954573357888275, + "grad_norm": 0.3851003348827362, + "learning_rate": 
8.087490544110348e-05, + "loss": 1.8174, + "step": 10085 + }, + { + "epoch": 3.095764272559853, + "grad_norm": 0.42357420921325684, + "learning_rate": 8.08709955835232e-05, + "loss": 1.8083, + "step": 10086 + }, + { + "epoch": 3.0960712093308778, + "grad_norm": 0.291777640581131, + "learning_rate": 8.086708542085768e-05, + "loss": 1.7713, + "step": 10087 + }, + { + "epoch": 3.096378146101903, + "grad_norm": 0.2563805878162384, + "learning_rate": 8.086317495314552e-05, + "loss": 1.7691, + "step": 10088 + }, + { + "epoch": 3.096685082872928, + "grad_norm": 0.3418877422809601, + "learning_rate": 8.085926418042536e-05, + "loss": 1.8547, + "step": 10089 + }, + { + "epoch": 3.0969920196439533, + "grad_norm": 0.3859385550022125, + "learning_rate": 8.085535310273589e-05, + "loss": 1.8226, + "step": 10090 + }, + { + "epoch": 3.0972989564149787, + "grad_norm": 0.3427267372608185, + "learning_rate": 8.085144172011571e-05, + "loss": 1.837, + "step": 10091 + }, + { + "epoch": 3.0976058931860035, + "grad_norm": 0.29290953278541565, + "learning_rate": 8.084753003260352e-05, + "loss": 1.8392, + "step": 10092 + }, + { + "epoch": 3.097912829957029, + "grad_norm": 0.33282020688056946, + "learning_rate": 8.084361804023795e-05, + "loss": 1.8351, + "step": 10093 + }, + { + "epoch": 3.098219766728054, + "grad_norm": 0.3802134394645691, + "learning_rate": 8.083970574305768e-05, + "loss": 1.7467, + "step": 10094 + }, + { + "epoch": 3.098526703499079, + "grad_norm": 0.3142111897468567, + "learning_rate": 8.083579314110135e-05, + "loss": 1.7966, + "step": 10095 + }, + { + "epoch": 3.0988336402701044, + "grad_norm": 0.2956278324127197, + "learning_rate": 8.083188023440765e-05, + "loss": 1.8724, + "step": 10096 + }, + { + "epoch": 3.0991405770411293, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.082796702301522e-05, + "loss": 1.8448, + "step": 10097 + }, + { + "epoch": 3.0994475138121547, + "grad_norm": 0.29358017444610596, + "learning_rate": 8.082405350696276e-05, + "loss": 1.8679, 
+ "step": 10098 + }, + { + "epoch": 3.09975445058318, + "grad_norm": 0.36439722776412964, + "learning_rate": 8.082013968628893e-05, + "loss": 1.8801, + "step": 10099 + }, + { + "epoch": 3.100061387354205, + "grad_norm": 0.3565322458744049, + "learning_rate": 8.081622556103244e-05, + "loss": 1.794, + "step": 10100 + }, + { + "epoch": 3.1003683241252302, + "grad_norm": 0.2841760814189911, + "learning_rate": 8.081231113123191e-05, + "loss": 1.7593, + "step": 10101 + }, + { + "epoch": 3.1006752608962556, + "grad_norm": 0.28589630126953125, + "learning_rate": 8.080839639692608e-05, + "loss": 1.864, + "step": 10102 + }, + { + "epoch": 3.1009821976672804, + "grad_norm": 0.3595057427883148, + "learning_rate": 8.080448135815362e-05, + "loss": 1.8067, + "step": 10103 + }, + { + "epoch": 3.101289134438306, + "grad_norm": 0.3909708261489868, + "learning_rate": 8.080056601495322e-05, + "loss": 1.8601, + "step": 10104 + }, + { + "epoch": 3.1015960712093307, + "grad_norm": 0.35180148482322693, + "learning_rate": 8.079665036736358e-05, + "loss": 1.8328, + "step": 10105 + }, + { + "epoch": 3.101903007980356, + "grad_norm": 0.3065175712108612, + "learning_rate": 8.079273441542338e-05, + "loss": 1.8449, + "step": 10106 + }, + { + "epoch": 3.1022099447513813, + "grad_norm": 0.31358617544174194, + "learning_rate": 8.078881815917134e-05, + "loss": 1.8325, + "step": 10107 + }, + { + "epoch": 3.1025168815224062, + "grad_norm": 0.4737118184566498, + "learning_rate": 8.078490159864614e-05, + "loss": 1.8232, + "step": 10108 + }, + { + "epoch": 3.1028238182934316, + "grad_norm": 0.435148686170578, + "learning_rate": 8.078098473388651e-05, + "loss": 1.8227, + "step": 10109 + }, + { + "epoch": 3.103130755064457, + "grad_norm": 0.3080987334251404, + "learning_rate": 8.077706756493115e-05, + "loss": 1.8072, + "step": 10110 + }, + { + "epoch": 3.103437691835482, + "grad_norm": 0.3225170075893402, + "learning_rate": 8.077315009181876e-05, + "loss": 1.7716, + "step": 10111 + }, + { + "epoch": 
3.103744628606507, + "grad_norm": 0.46642443537712097, + "learning_rate": 8.076923231458808e-05, + "loss": 1.8295, + "step": 10112 + }, + { + "epoch": 3.104051565377532, + "grad_norm": 0.42561766505241394, + "learning_rate": 8.07653142332778e-05, + "loss": 1.8553, + "step": 10113 + }, + { + "epoch": 3.1043585021485574, + "grad_norm": 0.27187541127204895, + "learning_rate": 8.076139584792664e-05, + "loss": 1.7937, + "step": 10114 + }, + { + "epoch": 3.1046654389195827, + "grad_norm": 0.27822238206863403, + "learning_rate": 8.075747715857335e-05, + "loss": 1.8151, + "step": 10115 + }, + { + "epoch": 3.1049723756906076, + "grad_norm": 0.40106478333473206, + "learning_rate": 8.075355816525665e-05, + "loss": 1.8637, + "step": 10116 + }, + { + "epoch": 3.105279312461633, + "grad_norm": 0.33455124497413635, + "learning_rate": 8.074963886801525e-05, + "loss": 1.8543, + "step": 10117 + }, + { + "epoch": 3.1055862492326582, + "grad_norm": 0.32246437668800354, + "learning_rate": 8.07457192668879e-05, + "loss": 1.7907, + "step": 10118 + }, + { + "epoch": 3.105893186003683, + "grad_norm": 0.45360109210014343, + "learning_rate": 8.074179936191332e-05, + "loss": 1.7404, + "step": 10119 + }, + { + "epoch": 3.1062001227747085, + "grad_norm": 0.445916086435318, + "learning_rate": 8.07378791531303e-05, + "loss": 1.778, + "step": 10120 + }, + { + "epoch": 3.1065070595457334, + "grad_norm": 0.28561538457870483, + "learning_rate": 8.073395864057751e-05, + "loss": 1.8723, + "step": 10121 + }, + { + "epoch": 3.1068139963167587, + "grad_norm": 0.3258218467235565, + "learning_rate": 8.073003782429373e-05, + "loss": 1.8106, + "step": 10122 + }, + { + "epoch": 3.107120933087784, + "grad_norm": 0.5459560751914978, + "learning_rate": 8.07261167043177e-05, + "loss": 1.8022, + "step": 10123 + }, + { + "epoch": 3.107427869858809, + "grad_norm": 0.4828549921512604, + "learning_rate": 8.072219528068819e-05, + "loss": 1.7556, + "step": 10124 + }, + { + "epoch": 3.1077348066298343, + "grad_norm": 
0.24075324833393097, + "learning_rate": 8.071827355344393e-05, + "loss": 1.7901, + "step": 10125 + }, + { + "epoch": 3.1080417434008596, + "grad_norm": 0.44677188992500305, + "learning_rate": 8.071435152262367e-05, + "loss": 1.7858, + "step": 10126 + }, + { + "epoch": 3.1083486801718845, + "grad_norm": 0.49862590432167053, + "learning_rate": 8.071042918826622e-05, + "loss": 1.805, + "step": 10127 + }, + { + "epoch": 3.10865561694291, + "grad_norm": 0.30883491039276123, + "learning_rate": 8.07065065504103e-05, + "loss": 1.7693, + "step": 10128 + }, + { + "epoch": 3.108962553713935, + "grad_norm": 0.29583030939102173, + "learning_rate": 8.070258360909467e-05, + "loss": 1.8141, + "step": 10129 + }, + { + "epoch": 3.10926949048496, + "grad_norm": 0.3595346510410309, + "learning_rate": 8.069866036435812e-05, + "loss": 1.8286, + "step": 10130 + }, + { + "epoch": 3.1095764272559854, + "grad_norm": 0.3215504288673401, + "learning_rate": 8.069473681623942e-05, + "loss": 1.8557, + "step": 10131 + }, + { + "epoch": 3.1098833640270103, + "grad_norm": 0.29734939336776733, + "learning_rate": 8.069081296477734e-05, + "loss": 1.7996, + "step": 10132 + }, + { + "epoch": 3.1101903007980356, + "grad_norm": 0.33546003699302673, + "learning_rate": 8.068688881001065e-05, + "loss": 1.8307, + "step": 10133 + }, + { + "epoch": 3.110497237569061, + "grad_norm": 0.3886832296848297, + "learning_rate": 8.068296435197814e-05, + "loss": 1.751, + "step": 10134 + }, + { + "epoch": 3.110804174340086, + "grad_norm": 0.34505394101142883, + "learning_rate": 8.06790395907186e-05, + "loss": 1.7543, + "step": 10135 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.27018141746520996, + "learning_rate": 8.06751145262708e-05, + "loss": 1.8109, + "step": 10136 + }, + { + "epoch": 3.1114180478821365, + "grad_norm": 0.3367149531841278, + "learning_rate": 8.067118915867355e-05, + "loss": 1.8025, + "step": 10137 + }, + { + "epoch": 3.1117249846531614, + "grad_norm": 0.40811091661453247, + "learning_rate": 
8.066726348796562e-05, + "loss": 1.7327, + "step": 10138 + }, + { + "epoch": 3.1120319214241867, + "grad_norm": 0.3511471152305603, + "learning_rate": 8.066333751418583e-05, + "loss": 1.8711, + "step": 10139 + }, + { + "epoch": 3.1123388581952116, + "grad_norm": 0.3112446367740631, + "learning_rate": 8.065941123737295e-05, + "loss": 1.8621, + "step": 10140 + }, + { + "epoch": 3.112645794966237, + "grad_norm": 0.3424238860607147, + "learning_rate": 8.065548465756581e-05, + "loss": 1.8383, + "step": 10141 + }, + { + "epoch": 3.1129527317372623, + "grad_norm": 0.380013108253479, + "learning_rate": 8.06515577748032e-05, + "loss": 1.8121, + "step": 10142 + }, + { + "epoch": 3.113259668508287, + "grad_norm": 0.2650558650493622, + "learning_rate": 8.064763058912393e-05, + "loss": 1.866, + "step": 10143 + }, + { + "epoch": 3.1135666052793125, + "grad_norm": 0.30580762028694153, + "learning_rate": 8.06437031005668e-05, + "loss": 1.7769, + "step": 10144 + }, + { + "epoch": 3.113873542050338, + "grad_norm": 0.29927194118499756, + "learning_rate": 8.063977530917066e-05, + "loss": 1.7897, + "step": 10145 + }, + { + "epoch": 3.1141804788213627, + "grad_norm": 0.24322012066841125, + "learning_rate": 8.063584721497429e-05, + "loss": 1.7968, + "step": 10146 + }, + { + "epoch": 3.114487415592388, + "grad_norm": 0.3082945644855499, + "learning_rate": 8.063191881801651e-05, + "loss": 1.8456, + "step": 10147 + }, + { + "epoch": 3.114794352363413, + "grad_norm": 0.3247329890727997, + "learning_rate": 8.062799011833617e-05, + "loss": 1.7436, + "step": 10148 + }, + { + "epoch": 3.1151012891344383, + "grad_norm": 0.27591946721076965, + "learning_rate": 8.062406111597207e-05, + "loss": 1.7976, + "step": 10149 + }, + { + "epoch": 3.1154082259054636, + "grad_norm": 0.2752058804035187, + "learning_rate": 8.062013181096306e-05, + "loss": 1.7814, + "step": 10150 + }, + { + "epoch": 3.1157151626764885, + "grad_norm": 0.3207196891307831, + "learning_rate": 8.061620220334795e-05, + "loss": 1.7767, 
+ "step": 10151 + }, + { + "epoch": 3.116022099447514, + "grad_norm": 0.2895309627056122, + "learning_rate": 8.061227229316559e-05, + "loss": 1.8588, + "step": 10152 + }, + { + "epoch": 3.116329036218539, + "grad_norm": 0.333843469619751, + "learning_rate": 8.060834208045481e-05, + "loss": 1.7871, + "step": 10153 + }, + { + "epoch": 3.116635972989564, + "grad_norm": 0.43877774477005005, + "learning_rate": 8.060441156525445e-05, + "loss": 1.8165, + "step": 10154 + }, + { + "epoch": 3.1169429097605894, + "grad_norm": 0.35700589418411255, + "learning_rate": 8.060048074760337e-05, + "loss": 1.777, + "step": 10155 + }, + { + "epoch": 3.1172498465316143, + "grad_norm": 0.26124534010887146, + "learning_rate": 8.059654962754039e-05, + "loss": 1.8343, + "step": 10156 + }, + { + "epoch": 3.1175567833026396, + "grad_norm": 0.331444650888443, + "learning_rate": 8.059261820510438e-05, + "loss": 1.9437, + "step": 10157 + }, + { + "epoch": 3.117863720073665, + "grad_norm": 0.31657731533050537, + "learning_rate": 8.058868648033419e-05, + "loss": 1.7621, + "step": 10158 + }, + { + "epoch": 3.11817065684469, + "grad_norm": 0.2785957455635071, + "learning_rate": 8.058475445326867e-05, + "loss": 1.9049, + "step": 10159 + }, + { + "epoch": 3.118477593615715, + "grad_norm": 0.2605743408203125, + "learning_rate": 8.058082212394667e-05, + "loss": 1.7895, + "step": 10160 + }, + { + "epoch": 3.1187845303867405, + "grad_norm": 0.2981378138065338, + "learning_rate": 8.057688949240707e-05, + "loss": 1.8373, + "step": 10161 + }, + { + "epoch": 3.1190914671577654, + "grad_norm": 0.2944273054599762, + "learning_rate": 8.057295655868873e-05, + "loss": 1.8373, + "step": 10162 + }, + { + "epoch": 3.1193984039287908, + "grad_norm": 0.2696721851825714, + "learning_rate": 8.056902332283052e-05, + "loss": 1.8023, + "step": 10163 + }, + { + "epoch": 3.1197053406998156, + "grad_norm": 0.27659857273101807, + "learning_rate": 8.056508978487128e-05, + "loss": 1.8453, + "step": 10164 + }, + { + "epoch": 
3.120012277470841, + "grad_norm": 0.2982441186904907, + "learning_rate": 8.056115594484992e-05, + "loss": 1.9072, + "step": 10165 + }, + { + "epoch": 3.1203192142418663, + "grad_norm": 0.3136404752731323, + "learning_rate": 8.055722180280531e-05, + "loss": 1.8585, + "step": 10166 + }, + { + "epoch": 3.120626151012891, + "grad_norm": 0.2979940176010132, + "learning_rate": 8.055328735877631e-05, + "loss": 1.8699, + "step": 10167 + }, + { + "epoch": 3.1209330877839165, + "grad_norm": 0.2585618793964386, + "learning_rate": 8.054935261280184e-05, + "loss": 1.8323, + "step": 10168 + }, + { + "epoch": 3.121240024554942, + "grad_norm": 0.28734859824180603, + "learning_rate": 8.054541756492075e-05, + "loss": 1.8694, + "step": 10169 + }, + { + "epoch": 3.1215469613259668, + "grad_norm": 0.30582788586616516, + "learning_rate": 8.054148221517193e-05, + "loss": 1.856, + "step": 10170 + }, + { + "epoch": 3.121853898096992, + "grad_norm": 0.3128255009651184, + "learning_rate": 8.053754656359429e-05, + "loss": 1.8329, + "step": 10171 + }, + { + "epoch": 3.122160834868017, + "grad_norm": 0.2845318615436554, + "learning_rate": 8.053361061022671e-05, + "loss": 1.8111, + "step": 10172 + }, + { + "epoch": 3.1224677716390423, + "grad_norm": 0.2994609773159027, + "learning_rate": 8.05296743551081e-05, + "loss": 1.8157, + "step": 10173 + }, + { + "epoch": 3.1227747084100677, + "grad_norm": 0.26397961378097534, + "learning_rate": 8.052573779827737e-05, + "loss": 1.8572, + "step": 10174 + }, + { + "epoch": 3.1230816451810925, + "grad_norm": 0.2911500334739685, + "learning_rate": 8.052180093977339e-05, + "loss": 1.8312, + "step": 10175 + }, + { + "epoch": 3.123388581952118, + "grad_norm": 0.33455008268356323, + "learning_rate": 8.051786377963509e-05, + "loss": 1.8748, + "step": 10176 + }, + { + "epoch": 3.123695518723143, + "grad_norm": 0.3127586841583252, + "learning_rate": 8.051392631790135e-05, + "loss": 1.8224, + "step": 10177 + }, + { + "epoch": 3.124002455494168, + "grad_norm": 
0.2910686433315277, + "learning_rate": 8.050998855461113e-05, + "loss": 1.8557, + "step": 10178 + }, + { + "epoch": 3.1243093922651934, + "grad_norm": 0.2849208414554596, + "learning_rate": 8.050605048980333e-05, + "loss": 1.82, + "step": 10179 + }, + { + "epoch": 3.1246163290362183, + "grad_norm": 0.35189691185951233, + "learning_rate": 8.050211212351683e-05, + "loss": 1.7884, + "step": 10180 + }, + { + "epoch": 3.1249232658072437, + "grad_norm": 0.3641110360622406, + "learning_rate": 8.04981734557906e-05, + "loss": 1.7984, + "step": 10181 + }, + { + "epoch": 3.125230202578269, + "grad_norm": 0.3111717700958252, + "learning_rate": 8.049423448666353e-05, + "loss": 1.8134, + "step": 10182 + }, + { + "epoch": 3.125537139349294, + "grad_norm": 0.2608453631401062, + "learning_rate": 8.049029521617457e-05, + "loss": 1.765, + "step": 10183 + }, + { + "epoch": 3.1258440761203192, + "grad_norm": 0.28779423236846924, + "learning_rate": 8.048635564436265e-05, + "loss": 1.8355, + "step": 10184 + }, + { + "epoch": 3.1261510128913446, + "grad_norm": 0.38227665424346924, + "learning_rate": 8.048241577126668e-05, + "loss": 1.8487, + "step": 10185 + }, + { + "epoch": 3.1264579496623695, + "grad_norm": 0.3603171706199646, + "learning_rate": 8.047847559692562e-05, + "loss": 1.8035, + "step": 10186 + }, + { + "epoch": 3.126764886433395, + "grad_norm": 0.21950066089630127, + "learning_rate": 8.04745351213784e-05, + "loss": 1.7399, + "step": 10187 + }, + { + "epoch": 3.12707182320442, + "grad_norm": 0.2796075642108917, + "learning_rate": 8.047059434466395e-05, + "loss": 1.8229, + "step": 10188 + }, + { + "epoch": 3.127378759975445, + "grad_norm": 0.3382907807826996, + "learning_rate": 8.046665326682125e-05, + "loss": 1.7713, + "step": 10189 + }, + { + "epoch": 3.1276856967464703, + "grad_norm": 0.36472463607788086, + "learning_rate": 8.04627118878892e-05, + "loss": 1.8129, + "step": 10190 + }, + { + "epoch": 3.1279926335174952, + "grad_norm": 0.2971884310245514, + "learning_rate": 
8.045877020790679e-05, + "loss": 1.7894, + "step": 10191 + }, + { + "epoch": 3.1282995702885206, + "grad_norm": 0.2292303442955017, + "learning_rate": 8.045482822691297e-05, + "loss": 1.7637, + "step": 10192 + }, + { + "epoch": 3.128606507059546, + "grad_norm": 0.300750732421875, + "learning_rate": 8.045088594494668e-05, + "loss": 1.7678, + "step": 10193 + }, + { + "epoch": 3.128913443830571, + "grad_norm": 0.3121531009674072, + "learning_rate": 8.044694336204688e-05, + "loss": 1.8651, + "step": 10194 + }, + { + "epoch": 3.129220380601596, + "grad_norm": 0.2456093430519104, + "learning_rate": 8.044300047825254e-05, + "loss": 1.7769, + "step": 10195 + }, + { + "epoch": 3.129527317372621, + "grad_norm": 0.25085800886154175, + "learning_rate": 8.043905729360264e-05, + "loss": 1.7723, + "step": 10196 + }, + { + "epoch": 3.1298342541436464, + "grad_norm": 0.2505287826061249, + "learning_rate": 8.043511380813612e-05, + "loss": 1.7943, + "step": 10197 + }, + { + "epoch": 3.1301411909146717, + "grad_norm": 0.27144530415534973, + "learning_rate": 8.043117002189198e-05, + "loss": 1.8119, + "step": 10198 + }, + { + "epoch": 3.1304481276856966, + "grad_norm": 0.2702989876270294, + "learning_rate": 8.042722593490916e-05, + "loss": 1.8517, + "step": 10199 + }, + { + "epoch": 3.130755064456722, + "grad_norm": 0.2585136890411377, + "learning_rate": 8.042328154722667e-05, + "loss": 1.8382, + "step": 10200 + }, + { + "epoch": 3.1310620012277472, + "grad_norm": 0.26306065917015076, + "learning_rate": 8.041933685888348e-05, + "loss": 1.8211, + "step": 10201 + }, + { + "epoch": 3.131368937998772, + "grad_norm": 0.2208927720785141, + "learning_rate": 8.041539186991858e-05, + "loss": 1.7765, + "step": 10202 + }, + { + "epoch": 3.1316758747697975, + "grad_norm": 0.2756440043449402, + "learning_rate": 8.041144658037095e-05, + "loss": 1.898, + "step": 10203 + }, + { + "epoch": 3.131982811540823, + "grad_norm": 0.29718101024627686, + "learning_rate": 8.040750099027958e-05, + "loss": 1.8226, 
+ "step": 10204 + }, + { + "epoch": 3.1322897483118477, + "grad_norm": 0.3166738748550415, + "learning_rate": 8.040355509968345e-05, + "loss": 1.8129, + "step": 10205 + }, + { + "epoch": 3.132596685082873, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.039960890862158e-05, + "loss": 1.8915, + "step": 10206 + }, + { + "epoch": 3.132903621853898, + "grad_norm": 0.3015006184577942, + "learning_rate": 8.039566241713297e-05, + "loss": 1.8389, + "step": 10207 + }, + { + "epoch": 3.1332105586249233, + "grad_norm": 0.35226619243621826, + "learning_rate": 8.039171562525659e-05, + "loss": 1.7287, + "step": 10208 + }, + { + "epoch": 3.1335174953959486, + "grad_norm": 0.4290136694908142, + "learning_rate": 8.038776853303146e-05, + "loss": 1.8768, + "step": 10209 + }, + { + "epoch": 3.1338244321669735, + "grad_norm": 0.2828960418701172, + "learning_rate": 8.03838211404966e-05, + "loss": 1.7552, + "step": 10210 + }, + { + "epoch": 3.134131368937999, + "grad_norm": 0.3781953752040863, + "learning_rate": 8.0379873447691e-05, + "loss": 1.7812, + "step": 10211 + }, + { + "epoch": 3.1344383057090237, + "grad_norm": 0.4282926023006439, + "learning_rate": 8.037592545465371e-05, + "loss": 1.84, + "step": 10212 + }, + { + "epoch": 3.134745242480049, + "grad_norm": 0.2622411251068115, + "learning_rate": 8.03719771614237e-05, + "loss": 1.8114, + "step": 10213 + }, + { + "epoch": 3.1350521792510744, + "grad_norm": 0.34881457686424255, + "learning_rate": 8.036802856804001e-05, + "loss": 1.7694, + "step": 10214 + }, + { + "epoch": 3.1353591160220993, + "grad_norm": 0.40797632932662964, + "learning_rate": 8.036407967454167e-05, + "loss": 1.7595, + "step": 10215 + }, + { + "epoch": 3.1356660527931246, + "grad_norm": 0.24902814626693726, + "learning_rate": 8.036013048096769e-05, + "loss": 1.8068, + "step": 10216 + }, + { + "epoch": 3.13597298956415, + "grad_norm": 0.3682909607887268, + "learning_rate": 8.035618098735711e-05, + "loss": 1.8519, + "step": 10217 + }, + { + "epoch": 
3.136279926335175, + "grad_norm": 0.6111233234405518, + "learning_rate": 8.035223119374895e-05, + "loss": 1.9254, + "step": 10218 + }, + { + "epoch": 3.1365868631062, + "grad_norm": 0.4793062210083008, + "learning_rate": 8.034828110018227e-05, + "loss": 1.786, + "step": 10219 + }, + { + "epoch": 3.1368937998772255, + "grad_norm": 0.3074932396411896, + "learning_rate": 8.034433070669607e-05, + "loss": 1.8495, + "step": 10220 + }, + { + "epoch": 3.1372007366482504, + "grad_norm": 0.4366479218006134, + "learning_rate": 8.034038001332942e-05, + "loss": 1.8501, + "step": 10221 + }, + { + "epoch": 3.1375076734192757, + "grad_norm": 0.4660070538520813, + "learning_rate": 8.033642902012135e-05, + "loss": 1.8317, + "step": 10222 + }, + { + "epoch": 3.1378146101903006, + "grad_norm": 0.3452899158000946, + "learning_rate": 8.03324777271109e-05, + "loss": 1.8702, + "step": 10223 + }, + { + "epoch": 3.138121546961326, + "grad_norm": 0.3658824563026428, + "learning_rate": 8.032852613433713e-05, + "loss": 1.8754, + "step": 10224 + }, + { + "epoch": 3.1384284837323513, + "grad_norm": 0.3777768909931183, + "learning_rate": 8.03245742418391e-05, + "loss": 1.8613, + "step": 10225 + }, + { + "epoch": 3.138735420503376, + "grad_norm": 0.3873192071914673, + "learning_rate": 8.032062204965582e-05, + "loss": 1.8438, + "step": 10226 + }, + { + "epoch": 3.1390423572744015, + "grad_norm": 0.30686715245246887, + "learning_rate": 8.031666955782641e-05, + "loss": 1.811, + "step": 10227 + }, + { + "epoch": 3.139349294045427, + "grad_norm": 0.2738516330718994, + "learning_rate": 8.03127167663899e-05, + "loss": 1.757, + "step": 10228 + }, + { + "epoch": 3.1396562308164517, + "grad_norm": 0.3093133270740509, + "learning_rate": 8.030876367538536e-05, + "loss": 1.8181, + "step": 10229 + }, + { + "epoch": 3.139963167587477, + "grad_norm": 0.3247159719467163, + "learning_rate": 8.030481028485185e-05, + "loss": 1.7798, + "step": 10230 + }, + { + "epoch": 3.140270104358502, + "grad_norm": 
0.2855088412761688, + "learning_rate": 8.030085659482845e-05, + "loss": 1.825, + "step": 10231 + }, + { + "epoch": 3.1405770411295273, + "grad_norm": 0.2818242907524109, + "learning_rate": 8.02969026053542e-05, + "loss": 1.7737, + "step": 10232 + }, + { + "epoch": 3.1408839779005526, + "grad_norm": 0.27074751257896423, + "learning_rate": 8.029294831646822e-05, + "loss": 1.8306, + "step": 10233 + }, + { + "epoch": 3.1411909146715775, + "grad_norm": 0.29740920662879944, + "learning_rate": 8.028899372820954e-05, + "loss": 1.8157, + "step": 10234 + }, + { + "epoch": 3.141497851442603, + "grad_norm": 0.30743202567100525, + "learning_rate": 8.028503884061731e-05, + "loss": 1.7626, + "step": 10235 + }, + { + "epoch": 3.141804788213628, + "grad_norm": 0.27812567353248596, + "learning_rate": 8.028108365373058e-05, + "loss": 1.7604, + "step": 10236 + }, + { + "epoch": 3.142111724984653, + "grad_norm": 0.26212629675865173, + "learning_rate": 8.027712816758839e-05, + "loss": 1.8161, + "step": 10237 + }, + { + "epoch": 3.1424186617556784, + "grad_norm": 0.3611658811569214, + "learning_rate": 8.02731723822299e-05, + "loss": 1.8283, + "step": 10238 + }, + { + "epoch": 3.1427255985267033, + "grad_norm": 0.31705498695373535, + "learning_rate": 8.026921629769418e-05, + "loss": 1.7986, + "step": 10239 + }, + { + "epoch": 3.1430325352977286, + "grad_norm": 0.25905972719192505, + "learning_rate": 8.026525991402032e-05, + "loss": 1.7926, + "step": 10240 + }, + { + "epoch": 3.143339472068754, + "grad_norm": 0.42376595735549927, + "learning_rate": 8.026130323124741e-05, + "loss": 1.8275, + "step": 10241 + }, + { + "epoch": 3.143646408839779, + "grad_norm": 0.415556401014328, + "learning_rate": 8.025734624941458e-05, + "loss": 1.7938, + "step": 10242 + }, + { + "epoch": 3.143953345610804, + "grad_norm": 0.3558904528617859, + "learning_rate": 8.025338896856091e-05, + "loss": 1.836, + "step": 10243 + }, + { + "epoch": 3.1442602823818295, + "grad_norm": 0.3091062307357788, + "learning_rate": 
8.024943138872553e-05, + "loss": 1.8285, + "step": 10244 + }, + { + "epoch": 3.1445672191528544, + "grad_norm": 0.2620905041694641, + "learning_rate": 8.024547350994753e-05, + "loss": 1.7115, + "step": 10245 + }, + { + "epoch": 3.1448741559238798, + "grad_norm": 0.25716835260391235, + "learning_rate": 8.024151533226604e-05, + "loss": 1.7702, + "step": 10246 + }, + { + "epoch": 3.1451810926949046, + "grad_norm": 0.250844269990921, + "learning_rate": 8.023755685572017e-05, + "loss": 1.7617, + "step": 10247 + }, + { + "epoch": 3.14548802946593, + "grad_norm": 0.23898956179618835, + "learning_rate": 8.023359808034903e-05, + "loss": 1.7872, + "step": 10248 + }, + { + "epoch": 3.1457949662369553, + "grad_norm": 0.2335387021303177, + "learning_rate": 8.022963900619176e-05, + "loss": 1.7656, + "step": 10249 + }, + { + "epoch": 3.14610190300798, + "grad_norm": 0.21822704374790192, + "learning_rate": 8.022567963328749e-05, + "loss": 1.7706, + "step": 10250 + }, + { + "epoch": 3.1464088397790055, + "grad_norm": 0.2627898156642914, + "learning_rate": 8.022171996167531e-05, + "loss": 1.8559, + "step": 10251 + }, + { + "epoch": 3.146715776550031, + "grad_norm": 0.2530064582824707, + "learning_rate": 8.021775999139441e-05, + "loss": 1.788, + "step": 10252 + }, + { + "epoch": 3.1470227133210558, + "grad_norm": 0.2293635457754135, + "learning_rate": 8.021379972248387e-05, + "loss": 1.8129, + "step": 10253 + }, + { + "epoch": 3.147329650092081, + "grad_norm": 0.27753588557243347, + "learning_rate": 8.020983915498286e-05, + "loss": 1.7957, + "step": 10254 + }, + { + "epoch": 3.147636586863106, + "grad_norm": 0.24507668614387512, + "learning_rate": 8.020587828893051e-05, + "loss": 1.7969, + "step": 10255 + }, + { + "epoch": 3.1479435236341313, + "grad_norm": 0.24818891286849976, + "learning_rate": 8.020191712436598e-05, + "loss": 1.8412, + "step": 10256 + }, + { + "epoch": 3.1482504604051567, + "grad_norm": 0.2463149130344391, + "learning_rate": 8.01979556613284e-05, + "loss": 1.8097, 
+ "step": 10257 + }, + { + "epoch": 3.1485573971761815, + "grad_norm": 0.26742151379585266, + "learning_rate": 8.019399389985692e-05, + "loss": 1.8487, + "step": 10258 + }, + { + "epoch": 3.148864333947207, + "grad_norm": 0.3078254461288452, + "learning_rate": 8.01900318399907e-05, + "loss": 1.8189, + "step": 10259 + }, + { + "epoch": 3.149171270718232, + "grad_norm": 0.3819321393966675, + "learning_rate": 8.018606948176887e-05, + "loss": 1.8019, + "step": 10260 + }, + { + "epoch": 3.149478207489257, + "grad_norm": 0.3932126462459564, + "learning_rate": 8.018210682523061e-05, + "loss": 1.787, + "step": 10261 + }, + { + "epoch": 3.1497851442602824, + "grad_norm": 0.2696186900138855, + "learning_rate": 8.017814387041511e-05, + "loss": 1.8345, + "step": 10262 + }, + { + "epoch": 3.150092081031308, + "grad_norm": 0.32631832361221313, + "learning_rate": 8.017418061736149e-05, + "loss": 1.7724, + "step": 10263 + }, + { + "epoch": 3.1503990178023327, + "grad_norm": 0.36187833547592163, + "learning_rate": 8.017021706610893e-05, + "loss": 1.7829, + "step": 10264 + }, + { + "epoch": 3.150705954573358, + "grad_norm": 0.29678142070770264, + "learning_rate": 8.01662532166966e-05, + "loss": 1.7896, + "step": 10265 + }, + { + "epoch": 3.151012891344383, + "grad_norm": 0.2997078001499176, + "learning_rate": 8.016228906916368e-05, + "loss": 1.8401, + "step": 10266 + }, + { + "epoch": 3.1513198281154082, + "grad_norm": 0.4688792824745178, + "learning_rate": 8.015832462354933e-05, + "loss": 1.8263, + "step": 10267 + }, + { + "epoch": 3.1516267648864336, + "grad_norm": 0.42710503935813904, + "learning_rate": 8.015435987989275e-05, + "loss": 1.8233, + "step": 10268 + }, + { + "epoch": 3.1519337016574585, + "grad_norm": 0.2490987628698349, + "learning_rate": 8.01503948382331e-05, + "loss": 1.7792, + "step": 10269 + }, + { + "epoch": 3.152240638428484, + "grad_norm": 0.400836706161499, + "learning_rate": 8.014642949860957e-05, + "loss": 1.8113, + "step": 10270 + }, + { + "epoch": 
3.1525475751995087, + "grad_norm": 0.47995972633361816, + "learning_rate": 8.014246386106138e-05, + "loss": 1.8754, + "step": 10271 + }, + { + "epoch": 3.152854511970534, + "grad_norm": 0.39069879055023193, + "learning_rate": 8.013849792562769e-05, + "loss": 1.8541, + "step": 10272 + }, + { + "epoch": 3.1531614487415593, + "grad_norm": 0.27174463868141174, + "learning_rate": 8.013453169234768e-05, + "loss": 1.8018, + "step": 10273 + }, + { + "epoch": 3.1534683855125842, + "grad_norm": 0.37808045744895935, + "learning_rate": 8.013056516126058e-05, + "loss": 1.8346, + "step": 10274 + }, + { + "epoch": 3.1537753222836096, + "grad_norm": 0.43864908814430237, + "learning_rate": 8.012659833240557e-05, + "loss": 1.7626, + "step": 10275 + }, + { + "epoch": 3.154082259054635, + "grad_norm": 0.3592168688774109, + "learning_rate": 8.012263120582187e-05, + "loss": 1.8261, + "step": 10276 + }, + { + "epoch": 3.15438919582566, + "grad_norm": 0.3056562542915344, + "learning_rate": 8.011866378154866e-05, + "loss": 1.903, + "step": 10277 + }, + { + "epoch": 3.154696132596685, + "grad_norm": 0.2898549735546112, + "learning_rate": 8.011469605962517e-05, + "loss": 1.7781, + "step": 10278 + }, + { + "epoch": 3.1550030693677105, + "grad_norm": 0.3498871624469757, + "learning_rate": 8.011072804009059e-05, + "loss": 1.7571, + "step": 10279 + }, + { + "epoch": 3.1553100061387354, + "grad_norm": 0.3330932557582855, + "learning_rate": 8.010675972298416e-05, + "loss": 1.8298, + "step": 10280 + }, + { + "epoch": 3.1556169429097607, + "grad_norm": 0.2540839910507202, + "learning_rate": 8.010279110834507e-05, + "loss": 1.8327, + "step": 10281 + }, + { + "epoch": 3.1559238796807856, + "grad_norm": 0.3557111322879791, + "learning_rate": 8.009882219621257e-05, + "loss": 1.7611, + "step": 10282 + }, + { + "epoch": 3.156230816451811, + "grad_norm": 0.28293952345848083, + "learning_rate": 8.009485298662584e-05, + "loss": 1.7761, + "step": 10283 + }, + { + "epoch": 3.1565377532228363, + "grad_norm": 
0.27089303731918335, + "learning_rate": 8.009088347962416e-05, + "loss": 1.8081, + "step": 10284 + }, + { + "epoch": 3.156844689993861, + "grad_norm": 0.2689332664012909, + "learning_rate": 8.008691367524673e-05, + "loss": 1.7458, + "step": 10285 + }, + { + "epoch": 3.1571516267648865, + "grad_norm": 0.2495841234922409, + "learning_rate": 8.008294357353278e-05, + "loss": 1.8307, + "step": 10286 + }, + { + "epoch": 3.1574585635359114, + "grad_norm": 0.29242852330207825, + "learning_rate": 8.007897317452156e-05, + "loss": 1.9216, + "step": 10287 + }, + { + "epoch": 3.1577655003069367, + "grad_norm": 0.26574134826660156, + "learning_rate": 8.007500247825229e-05, + "loss": 1.8392, + "step": 10288 + }, + { + "epoch": 3.158072437077962, + "grad_norm": 0.2503872811794281, + "learning_rate": 8.00710314847642e-05, + "loss": 1.7742, + "step": 10289 + }, + { + "epoch": 3.158379373848987, + "grad_norm": 0.25614771246910095, + "learning_rate": 8.006706019409658e-05, + "loss": 1.828, + "step": 10290 + }, + { + "epoch": 3.1586863106200123, + "grad_norm": 0.259369820356369, + "learning_rate": 8.006308860628863e-05, + "loss": 1.8328, + "step": 10291 + }, + { + "epoch": 3.1589932473910376, + "grad_norm": 0.28183647990226746, + "learning_rate": 8.005911672137962e-05, + "loss": 1.8269, + "step": 10292 + }, + { + "epoch": 3.1593001841620625, + "grad_norm": 0.2926514446735382, + "learning_rate": 8.005514453940881e-05, + "loss": 1.8334, + "step": 10293 + }, + { + "epoch": 3.159607120933088, + "grad_norm": 0.34313449263572693, + "learning_rate": 8.005117206041543e-05, + "loss": 1.7866, + "step": 10294 + }, + { + "epoch": 3.159914057704113, + "grad_norm": 0.30971628427505493, + "learning_rate": 8.004719928443875e-05, + "loss": 1.7827, + "step": 10295 + }, + { + "epoch": 3.160220994475138, + "grad_norm": 0.23955371975898743, + "learning_rate": 8.004322621151807e-05, + "loss": 1.7619, + "step": 10296 + }, + { + "epoch": 3.1605279312461634, + "grad_norm": 0.31311795115470886, + 
"learning_rate": 8.003925284169261e-05, + "loss": 1.8247, + "step": 10297 + }, + { + "epoch": 3.1608348680171883, + "grad_norm": 0.3408358097076416, + "learning_rate": 8.003527917500163e-05, + "loss": 1.8146, + "step": 10298 + }, + { + "epoch": 3.1611418047882136, + "grad_norm": 0.3030858337879181, + "learning_rate": 8.003130521148442e-05, + "loss": 1.857, + "step": 10299 + }, + { + "epoch": 3.161448741559239, + "grad_norm": 0.25168511271476746, + "learning_rate": 8.002733095118025e-05, + "loss": 1.8404, + "step": 10300 + }, + { + "epoch": 3.161755678330264, + "grad_norm": 0.2956216335296631, + "learning_rate": 8.002335639412839e-05, + "loss": 1.7352, + "step": 10301 + }, + { + "epoch": 3.162062615101289, + "grad_norm": 0.27791857719421387, + "learning_rate": 8.001938154036814e-05, + "loss": 1.7797, + "step": 10302 + }, + { + "epoch": 3.1623695518723145, + "grad_norm": 0.3106420040130615, + "learning_rate": 8.001540638993876e-05, + "loss": 1.8434, + "step": 10303 + }, + { + "epoch": 3.1626764886433394, + "grad_norm": 0.2940445840358734, + "learning_rate": 8.001143094287954e-05, + "loss": 1.8459, + "step": 10304 + }, + { + "epoch": 3.1629834254143647, + "grad_norm": 0.3857429325580597, + "learning_rate": 8.000745519922977e-05, + "loss": 1.7853, + "step": 10305 + }, + { + "epoch": 3.1632903621853896, + "grad_norm": 0.3585071861743927, + "learning_rate": 8.000347915902874e-05, + "loss": 1.8905, + "step": 10306 + }, + { + "epoch": 3.163597298956415, + "grad_norm": 0.320003867149353, + "learning_rate": 7.999950282231574e-05, + "loss": 1.8397, + "step": 10307 + }, + { + "epoch": 3.1639042357274403, + "grad_norm": 0.24986252188682556, + "learning_rate": 7.999552618913009e-05, + "loss": 1.7916, + "step": 10308 + }, + { + "epoch": 3.164211172498465, + "grad_norm": 0.33077237010002136, + "learning_rate": 7.999154925951104e-05, + "loss": 1.8334, + "step": 10309 + }, + { + "epoch": 3.1645181092694905, + "grad_norm": 0.35700327157974243, + "learning_rate": 
7.998757203349794e-05, + "loss": 1.7773, + "step": 10310 + }, + { + "epoch": 3.164825046040516, + "grad_norm": 0.3095493018627167, + "learning_rate": 7.998359451113007e-05, + "loss": 1.8156, + "step": 10311 + }, + { + "epoch": 3.1651319828115407, + "grad_norm": 0.3004748225212097, + "learning_rate": 7.997961669244673e-05, + "loss": 1.7862, + "step": 10312 + }, + { + "epoch": 3.165438919582566, + "grad_norm": 0.39382806420326233, + "learning_rate": 7.99756385774873e-05, + "loss": 1.764, + "step": 10313 + }, + { + "epoch": 3.165745856353591, + "grad_norm": 0.3109463155269623, + "learning_rate": 7.997166016629099e-05, + "loss": 1.8006, + "step": 10314 + }, + { + "epoch": 3.1660527931246163, + "grad_norm": 0.2896469235420227, + "learning_rate": 7.996768145889717e-05, + "loss": 1.8373, + "step": 10315 + }, + { + "epoch": 3.1663597298956416, + "grad_norm": 0.35024940967559814, + "learning_rate": 7.996370245534517e-05, + "loss": 1.797, + "step": 10316 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.3228827714920044, + "learning_rate": 7.995972315567431e-05, + "loss": 1.7757, + "step": 10317 + }, + { + "epoch": 3.166973603437692, + "grad_norm": 0.27102410793304443, + "learning_rate": 7.995574355992388e-05, + "loss": 1.7786, + "step": 10318 + }, + { + "epoch": 3.167280540208717, + "grad_norm": 0.2556116580963135, + "learning_rate": 7.995176366813325e-05, + "loss": 1.7621, + "step": 10319 + }, + { + "epoch": 3.167587476979742, + "grad_norm": 0.28279444575309753, + "learning_rate": 7.994778348034173e-05, + "loss": 1.7954, + "step": 10320 + }, + { + "epoch": 3.1678944137507674, + "grad_norm": 0.31778639554977417, + "learning_rate": 7.994380299658867e-05, + "loss": 1.7657, + "step": 10321 + }, + { + "epoch": 3.1682013505217923, + "grad_norm": 0.27935469150543213, + "learning_rate": 7.993982221691339e-05, + "loss": 1.7502, + "step": 10322 + }, + { + "epoch": 3.1685082872928176, + "grad_norm": 0.29012617468833923, + "learning_rate": 7.993584114135524e-05, + "loss": 
1.8497, + "step": 10323 + }, + { + "epoch": 3.168815224063843, + "grad_norm": 0.2674056887626648, + "learning_rate": 7.993185976995356e-05, + "loss": 1.7875, + "step": 10324 + }, + { + "epoch": 3.169122160834868, + "grad_norm": 0.2667328417301178, + "learning_rate": 7.992787810274771e-05, + "loss": 1.771, + "step": 10325 + }, + { + "epoch": 3.169429097605893, + "grad_norm": 0.25807151198387146, + "learning_rate": 7.992389613977702e-05, + "loss": 1.7638, + "step": 10326 + }, + { + "epoch": 3.1697360343769185, + "grad_norm": 0.2572930157184601, + "learning_rate": 7.991991388108084e-05, + "loss": 1.8218, + "step": 10327 + }, + { + "epoch": 3.1700429711479434, + "grad_norm": 0.3955067992210388, + "learning_rate": 7.991593132669855e-05, + "loss": 1.8458, + "step": 10328 + }, + { + "epoch": 3.1703499079189688, + "grad_norm": 0.2813466489315033, + "learning_rate": 7.991194847666948e-05, + "loss": 1.8042, + "step": 10329 + }, + { + "epoch": 3.1706568446899936, + "grad_norm": 0.2645012140274048, + "learning_rate": 7.990796533103302e-05, + "loss": 1.8241, + "step": 10330 + }, + { + "epoch": 3.170963781461019, + "grad_norm": 0.28462091088294983, + "learning_rate": 7.99039818898285e-05, + "loss": 1.8853, + "step": 10331 + }, + { + "epoch": 3.1712707182320443, + "grad_norm": 0.2727372944355011, + "learning_rate": 7.98999981530953e-05, + "loss": 1.7564, + "step": 10332 + }, + { + "epoch": 3.171577655003069, + "grad_norm": 0.2658170759677887, + "learning_rate": 7.989601412087281e-05, + "loss": 1.8344, + "step": 10333 + }, + { + "epoch": 3.1718845917740945, + "grad_norm": 0.29713502526283264, + "learning_rate": 7.989202979320039e-05, + "loss": 1.8721, + "step": 10334 + }, + { + "epoch": 3.17219152854512, + "grad_norm": 0.26609495282173157, + "learning_rate": 7.98880451701174e-05, + "loss": 1.7991, + "step": 10335 + }, + { + "epoch": 3.1724984653161448, + "grad_norm": 0.29779741168022156, + "learning_rate": 7.988406025166322e-05, + "loss": 1.8182, + "step": 10336 + }, + { + 
"epoch": 3.17280540208717, + "grad_norm": 0.2771340012550354, + "learning_rate": 7.988007503787724e-05, + "loss": 1.8034, + "step": 10337 + }, + { + "epoch": 3.1731123388581954, + "grad_norm": 0.30510422587394714, + "learning_rate": 7.987608952879886e-05, + "loss": 1.8477, + "step": 10338 + }, + { + "epoch": 3.1734192756292203, + "grad_norm": 0.3097476363182068, + "learning_rate": 7.987210372446745e-05, + "loss": 1.7572, + "step": 10339 + }, + { + "epoch": 3.1737262124002457, + "grad_norm": 0.2553942799568176, + "learning_rate": 7.986811762492239e-05, + "loss": 1.7837, + "step": 10340 + }, + { + "epoch": 3.1740331491712706, + "grad_norm": 0.26546719670295715, + "learning_rate": 7.986413123020312e-05, + "loss": 1.7893, + "step": 10341 + }, + { + "epoch": 3.174340085942296, + "grad_norm": 0.37721553444862366, + "learning_rate": 7.986014454034895e-05, + "loss": 1.8475, + "step": 10342 + }, + { + "epoch": 3.174647022713321, + "grad_norm": 0.3215494453907013, + "learning_rate": 7.985615755539937e-05, + "loss": 1.7806, + "step": 10343 + }, + { + "epoch": 3.174953959484346, + "grad_norm": 0.2662442922592163, + "learning_rate": 7.985217027539373e-05, + "loss": 1.8116, + "step": 10344 + }, + { + "epoch": 3.1752608962553714, + "grad_norm": 0.23334236443042755, + "learning_rate": 7.984818270037145e-05, + "loss": 1.7929, + "step": 10345 + }, + { + "epoch": 3.1755678330263963, + "grad_norm": 0.2873367667198181, + "learning_rate": 7.98441948303719e-05, + "loss": 1.7808, + "step": 10346 + }, + { + "epoch": 3.1758747697974217, + "grad_norm": 0.3623826801776886, + "learning_rate": 7.984020666543458e-05, + "loss": 1.8817, + "step": 10347 + }, + { + "epoch": 3.176181706568447, + "grad_norm": 0.3060589134693146, + "learning_rate": 7.983621820559881e-05, + "loss": 1.796, + "step": 10348 + }, + { + "epoch": 3.176488643339472, + "grad_norm": 0.2396882325410843, + "learning_rate": 7.983222945090407e-05, + "loss": 1.7455, + "step": 10349 + }, + { + "epoch": 3.1767955801104972, + 
"grad_norm": 0.24811476469039917, + "learning_rate": 7.982824040138974e-05, + "loss": 1.7907, + "step": 10350 + }, + { + "epoch": 3.1771025168815226, + "grad_norm": 0.32749706506729126, + "learning_rate": 7.982425105709524e-05, + "loss": 1.8553, + "step": 10351 + }, + { + "epoch": 3.1774094536525475, + "grad_norm": 0.3648095726966858, + "learning_rate": 7.982026141806003e-05, + "loss": 1.8387, + "step": 10352 + }, + { + "epoch": 3.177716390423573, + "grad_norm": 0.2749348282814026, + "learning_rate": 7.981627148432352e-05, + "loss": 1.7676, + "step": 10353 + }, + { + "epoch": 3.178023327194598, + "grad_norm": 0.2735142409801483, + "learning_rate": 7.981228125592513e-05, + "loss": 1.822, + "step": 10354 + }, + { + "epoch": 3.178330263965623, + "grad_norm": 0.28759655356407166, + "learning_rate": 7.98082907329043e-05, + "loss": 1.8113, + "step": 10355 + }, + { + "epoch": 3.1786372007366483, + "grad_norm": 0.33661654591560364, + "learning_rate": 7.980429991530048e-05, + "loss": 1.8036, + "step": 10356 + }, + { + "epoch": 3.1789441375076732, + "grad_norm": 0.2634892761707306, + "learning_rate": 7.98003088031531e-05, + "loss": 1.8323, + "step": 10357 + }, + { + "epoch": 3.1792510742786986, + "grad_norm": 0.25864094495773315, + "learning_rate": 7.979631739650158e-05, + "loss": 1.8199, + "step": 10358 + }, + { + "epoch": 3.179558011049724, + "grad_norm": 0.27368444204330444, + "learning_rate": 7.979232569538541e-05, + "loss": 1.7673, + "step": 10359 + }, + { + "epoch": 3.179864947820749, + "grad_norm": 0.2506616413593292, + "learning_rate": 7.9788333699844e-05, + "loss": 1.7912, + "step": 10360 + }, + { + "epoch": 3.180171884591774, + "grad_norm": 0.2539178133010864, + "learning_rate": 7.978434140991684e-05, + "loss": 1.7934, + "step": 10361 + }, + { + "epoch": 3.1804788213627995, + "grad_norm": 0.2605626881122589, + "learning_rate": 7.978034882564334e-05, + "loss": 1.8031, + "step": 10362 + }, + { + "epoch": 3.1807857581338244, + "grad_norm": 0.2610207796096802, + 
"learning_rate": 7.977635594706299e-05, + "loss": 1.8664, + "step": 10363 + }, + { + "epoch": 3.1810926949048497, + "grad_norm": 0.26164132356643677, + "learning_rate": 7.977236277421523e-05, + "loss": 1.7758, + "step": 10364 + }, + { + "epoch": 3.1813996316758746, + "grad_norm": 0.3122340142726898, + "learning_rate": 7.976836930713953e-05, + "loss": 1.9033, + "step": 10365 + }, + { + "epoch": 3.1817065684469, + "grad_norm": 0.3317202031612396, + "learning_rate": 7.976437554587537e-05, + "loss": 1.7899, + "step": 10366 + }, + { + "epoch": 3.1820135052179253, + "grad_norm": 0.28612568974494934, + "learning_rate": 7.97603814904622e-05, + "loss": 1.8145, + "step": 10367 + }, + { + "epoch": 3.18232044198895, + "grad_norm": 0.349917471408844, + "learning_rate": 7.975638714093949e-05, + "loss": 1.877, + "step": 10368 + }, + { + "epoch": 3.1826273787599755, + "grad_norm": 0.3737771809101105, + "learning_rate": 7.975239249734672e-05, + "loss": 1.8204, + "step": 10369 + }, + { + "epoch": 3.182934315531001, + "grad_norm": 0.3688446879386902, + "learning_rate": 7.974839755972339e-05, + "loss": 1.8487, + "step": 10370 + }, + { + "epoch": 3.1832412523020257, + "grad_norm": 0.2934897541999817, + "learning_rate": 7.974440232810894e-05, + "loss": 1.8243, + "step": 10371 + }, + { + "epoch": 3.183548189073051, + "grad_norm": 0.2596173882484436, + "learning_rate": 7.974040680254287e-05, + "loss": 1.7887, + "step": 10372 + }, + { + "epoch": 3.183855125844076, + "grad_norm": 0.35686594247817993, + "learning_rate": 7.973641098306468e-05, + "loss": 1.8653, + "step": 10373 + }, + { + "epoch": 3.1841620626151013, + "grad_norm": 0.3187713921070099, + "learning_rate": 7.973241486971383e-05, + "loss": 1.8767, + "step": 10374 + }, + { + "epoch": 3.1844689993861266, + "grad_norm": 0.2596273124217987, + "learning_rate": 7.972841846252985e-05, + "loss": 1.8028, + "step": 10375 + }, + { + "epoch": 3.1847759361571515, + "grad_norm": 0.2637474834918976, + "learning_rate": 7.972442176155221e-05, + 
"loss": 1.802, + "step": 10376 + }, + { + "epoch": 3.185082872928177, + "grad_norm": 0.2641126215457916, + "learning_rate": 7.97204247668204e-05, + "loss": 1.7931, + "step": 10377 + }, + { + "epoch": 3.185389809699202, + "grad_norm": 0.25594159960746765, + "learning_rate": 7.971642747837393e-05, + "loss": 1.818, + "step": 10378 + }, + { + "epoch": 3.185696746470227, + "grad_norm": 0.26567938923835754, + "learning_rate": 7.971242989625233e-05, + "loss": 1.8174, + "step": 10379 + }, + { + "epoch": 3.1860036832412524, + "grad_norm": 0.29580214619636536, + "learning_rate": 7.970843202049508e-05, + "loss": 1.869, + "step": 10380 + }, + { + "epoch": 3.1863106200122773, + "grad_norm": 0.2657530605792999, + "learning_rate": 7.970443385114168e-05, + "loss": 1.8352, + "step": 10381 + }, + { + "epoch": 3.1866175567833026, + "grad_norm": 0.2468358278274536, + "learning_rate": 7.970043538823165e-05, + "loss": 1.7851, + "step": 10382 + }, + { + "epoch": 3.186924493554328, + "grad_norm": 0.26464715600013733, + "learning_rate": 7.969643663180451e-05, + "loss": 1.8208, + "step": 10383 + }, + { + "epoch": 3.187231430325353, + "grad_norm": 0.26035723090171814, + "learning_rate": 7.969243758189979e-05, + "loss": 1.8089, + "step": 10384 + }, + { + "epoch": 3.187538367096378, + "grad_norm": 0.2644619941711426, + "learning_rate": 7.968843823855699e-05, + "loss": 1.8379, + "step": 10385 + }, + { + "epoch": 3.1878453038674035, + "grad_norm": 0.25576624274253845, + "learning_rate": 7.968443860181565e-05, + "loss": 1.7932, + "step": 10386 + }, + { + "epoch": 3.1881522406384284, + "grad_norm": 0.24276074767112732, + "learning_rate": 7.968043867171528e-05, + "loss": 1.8037, + "step": 10387 + }, + { + "epoch": 3.1884591774094537, + "grad_norm": 0.27156540751457214, + "learning_rate": 7.967643844829543e-05, + "loss": 1.7998, + "step": 10388 + }, + { + "epoch": 3.1887661141804786, + "grad_norm": 0.2555428743362427, + "learning_rate": 7.96724379315956e-05, + "loss": 1.7612, + "step": 10389 + }, + 
{ + "epoch": 3.189073050951504, + "grad_norm": 0.3358438014984131, + "learning_rate": 7.966843712165537e-05, + "loss": 1.8543, + "step": 10390 + }, + { + "epoch": 3.1893799877225293, + "grad_norm": 0.2799586355686188, + "learning_rate": 7.966443601851424e-05, + "loss": 1.819, + "step": 10391 + }, + { + "epoch": 3.189686924493554, + "grad_norm": 0.2364189177751541, + "learning_rate": 7.966043462221178e-05, + "loss": 1.8537, + "step": 10392 + }, + { + "epoch": 3.1899938612645795, + "grad_norm": 0.23849403858184814, + "learning_rate": 7.96564329327875e-05, + "loss": 1.8125, + "step": 10393 + }, + { + "epoch": 3.190300798035605, + "grad_norm": 0.2371583878993988, + "learning_rate": 7.965243095028098e-05, + "loss": 1.7352, + "step": 10394 + }, + { + "epoch": 3.1906077348066297, + "grad_norm": 0.2584737539291382, + "learning_rate": 7.964842867473176e-05, + "loss": 1.8801, + "step": 10395 + }, + { + "epoch": 3.190914671577655, + "grad_norm": 0.27768051624298096, + "learning_rate": 7.964442610617939e-05, + "loss": 1.8221, + "step": 10396 + }, + { + "epoch": 3.1912216083486804, + "grad_norm": 0.2680891752243042, + "learning_rate": 7.964042324466341e-05, + "loss": 1.8371, + "step": 10397 + }, + { + "epoch": 3.1915285451197053, + "grad_norm": 0.25301921367645264, + "learning_rate": 7.963642009022343e-05, + "loss": 1.7972, + "step": 10398 + }, + { + "epoch": 3.1918354818907306, + "grad_norm": 0.2589731216430664, + "learning_rate": 7.963241664289896e-05, + "loss": 1.8145, + "step": 10399 + }, + { + "epoch": 3.1921424186617555, + "grad_norm": 0.2611297369003296, + "learning_rate": 7.962841290272956e-05, + "loss": 1.8736, + "step": 10400 + }, + { + "epoch": 3.192449355432781, + "grad_norm": 0.2812272906303406, + "learning_rate": 7.962440886975483e-05, + "loss": 1.8116, + "step": 10401 + }, + { + "epoch": 3.192756292203806, + "grad_norm": 0.3261657655239105, + "learning_rate": 7.962040454401434e-05, + "loss": 1.7935, + "step": 10402 + }, + { + "epoch": 3.193063228974831, + 
"grad_norm": 0.3355373442173004, + "learning_rate": 7.961639992554764e-05, + "loss": 1.7957, + "step": 10403 + }, + { + "epoch": 3.1933701657458564, + "grad_norm": 0.2811843156814575, + "learning_rate": 7.961239501439432e-05, + "loss": 1.797, + "step": 10404 + }, + { + "epoch": 3.1936771025168813, + "grad_norm": 0.24933238327503204, + "learning_rate": 7.960838981059395e-05, + "loss": 1.7594, + "step": 10405 + }, + { + "epoch": 3.1939840392879066, + "grad_norm": 0.29110121726989746, + "learning_rate": 7.960438431418613e-05, + "loss": 1.8268, + "step": 10406 + }, + { + "epoch": 3.194290976058932, + "grad_norm": 0.3702283799648285, + "learning_rate": 7.960037852521043e-05, + "loss": 1.7629, + "step": 10407 + }, + { + "epoch": 3.194597912829957, + "grad_norm": 0.33275437355041504, + "learning_rate": 7.959637244370644e-05, + "loss": 1.8507, + "step": 10408 + }, + { + "epoch": 3.194904849600982, + "grad_norm": 0.2691981792449951, + "learning_rate": 7.959236606971375e-05, + "loss": 1.8084, + "step": 10409 + }, + { + "epoch": 3.1952117863720075, + "grad_norm": 0.30108413100242615, + "learning_rate": 7.958835940327194e-05, + "loss": 1.8525, + "step": 10410 + }, + { + "epoch": 3.1955187231430324, + "grad_norm": 0.32112306356430054, + "learning_rate": 7.958435244442064e-05, + "loss": 1.7431, + "step": 10411 + }, + { + "epoch": 3.1958256599140578, + "grad_norm": 0.2795291543006897, + "learning_rate": 7.958034519319942e-05, + "loss": 1.7985, + "step": 10412 + }, + { + "epoch": 3.196132596685083, + "grad_norm": 0.2485792338848114, + "learning_rate": 7.957633764964788e-05, + "loss": 1.7363, + "step": 10413 + }, + { + "epoch": 3.196439533456108, + "grad_norm": 0.3552432358264923, + "learning_rate": 7.957232981380565e-05, + "loss": 1.8174, + "step": 10414 + }, + { + "epoch": 3.1967464702271333, + "grad_norm": 0.3829655051231384, + "learning_rate": 7.956832168571234e-05, + "loss": 1.9249, + "step": 10415 + }, + { + "epoch": 3.197053406998158, + "grad_norm": 0.2498074769973755, + 
"learning_rate": 7.956431326540752e-05, + "loss": 1.8104, + "step": 10416 + }, + { + "epoch": 3.1973603437691835, + "grad_norm": 0.24596504867076874, + "learning_rate": 7.956030455293082e-05, + "loss": 1.8007, + "step": 10417 + }, + { + "epoch": 3.197667280540209, + "grad_norm": 0.2795363664627075, + "learning_rate": 7.95562955483219e-05, + "loss": 1.775, + "step": 10418 + }, + { + "epoch": 3.1979742173112338, + "grad_norm": 0.3581138253211975, + "learning_rate": 7.95522862516203e-05, + "loss": 1.8567, + "step": 10419 + }, + { + "epoch": 3.198281154082259, + "grad_norm": 0.36102500557899475, + "learning_rate": 7.95482766628657e-05, + "loss": 1.8509, + "step": 10420 + }, + { + "epoch": 3.198588090853284, + "grad_norm": 0.4717029929161072, + "learning_rate": 7.954426678209774e-05, + "loss": 1.8218, + "step": 10421 + }, + { + "epoch": 3.1988950276243093, + "grad_norm": 0.3211984932422638, + "learning_rate": 7.9540256609356e-05, + "loss": 1.8696, + "step": 10422 + }, + { + "epoch": 3.1992019643953347, + "grad_norm": 0.30094626545906067, + "learning_rate": 7.953624614468011e-05, + "loss": 1.8714, + "step": 10423 + }, + { + "epoch": 3.1995089011663596, + "grad_norm": 0.267578125, + "learning_rate": 7.953223538810976e-05, + "loss": 1.7903, + "step": 10424 + }, + { + "epoch": 3.199815837937385, + "grad_norm": 0.35577845573425293, + "learning_rate": 7.952822433968453e-05, + "loss": 1.7808, + "step": 10425 + }, + { + "epoch": 3.2001227747084102, + "grad_norm": 0.4117741882801056, + "learning_rate": 7.952421299944408e-05, + "loss": 1.7856, + "step": 10426 + }, + { + "epoch": 3.200429711479435, + "grad_norm": 0.35202035307884216, + "learning_rate": 7.952020136742806e-05, + "loss": 1.8112, + "step": 10427 + }, + { + "epoch": 3.2007366482504604, + "grad_norm": 0.26514917612075806, + "learning_rate": 7.951618944367611e-05, + "loss": 1.828, + "step": 10428 + }, + { + "epoch": 3.201043585021486, + "grad_norm": 0.29219159483909607, + "learning_rate": 7.951217722822786e-05, + "loss": 
1.9366, + "step": 10429 + }, + { + "epoch": 3.2013505217925107, + "grad_norm": 0.2929961383342743, + "learning_rate": 7.950816472112298e-05, + "loss": 1.8006, + "step": 10430 + }, + { + "epoch": 3.201657458563536, + "grad_norm": 0.28339722752571106, + "learning_rate": 7.950415192240114e-05, + "loss": 1.7411, + "step": 10431 + }, + { + "epoch": 3.201964395334561, + "grad_norm": 0.258884996175766, + "learning_rate": 7.950013883210196e-05, + "loss": 1.8153, + "step": 10432 + }, + { + "epoch": 3.2022713321055862, + "grad_norm": 0.3065929114818573, + "learning_rate": 7.949612545026512e-05, + "loss": 1.7918, + "step": 10433 + }, + { + "epoch": 3.2025782688766116, + "grad_norm": 0.289874404668808, + "learning_rate": 7.949211177693029e-05, + "loss": 1.7975, + "step": 10434 + }, + { + "epoch": 3.2028852056476365, + "grad_norm": 0.27025631070137024, + "learning_rate": 7.948809781213711e-05, + "loss": 1.8129, + "step": 10435 + }, + { + "epoch": 3.203192142418662, + "grad_norm": 0.2501074969768524, + "learning_rate": 7.948408355592528e-05, + "loss": 1.7653, + "step": 10436 + }, + { + "epoch": 3.203499079189687, + "grad_norm": 0.30402958393096924, + "learning_rate": 7.948006900833445e-05, + "loss": 1.8311, + "step": 10437 + }, + { + "epoch": 3.203806015960712, + "grad_norm": 0.28783223032951355, + "learning_rate": 7.94760541694043e-05, + "loss": 1.82, + "step": 10438 + }, + { + "epoch": 3.2041129527317374, + "grad_norm": 0.30428317189216614, + "learning_rate": 7.947203903917451e-05, + "loss": 1.8673, + "step": 10439 + }, + { + "epoch": 3.2044198895027622, + "grad_norm": 0.2860367000102997, + "learning_rate": 7.946802361768473e-05, + "loss": 1.824, + "step": 10440 + }, + { + "epoch": 3.2047268262737876, + "grad_norm": 0.2995273172855377, + "learning_rate": 7.946400790497469e-05, + "loss": 1.7342, + "step": 10441 + }, + { + "epoch": 3.205033763044813, + "grad_norm": 0.4374088943004608, + "learning_rate": 7.945999190108407e-05, + "loss": 1.8522, + "step": 10442 + }, + { + "epoch": 
3.205340699815838, + "grad_norm": 0.37659478187561035, + "learning_rate": 7.945597560605252e-05, + "loss": 1.7518, + "step": 10443 + }, + { + "epoch": 3.205647636586863, + "grad_norm": 0.24257932603359222, + "learning_rate": 7.945195901991975e-05, + "loss": 1.7892, + "step": 10444 + }, + { + "epoch": 3.2059545733578885, + "grad_norm": 0.3682694435119629, + "learning_rate": 7.944794214272546e-05, + "loss": 1.7757, + "step": 10445 + }, + { + "epoch": 3.2062615101289134, + "grad_norm": 0.434692919254303, + "learning_rate": 7.944392497450936e-05, + "loss": 1.8207, + "step": 10446 + }, + { + "epoch": 3.2065684468999387, + "grad_norm": 0.3982211947441101, + "learning_rate": 7.943990751531113e-05, + "loss": 1.8303, + "step": 10447 + }, + { + "epoch": 3.2068753836709636, + "grad_norm": 0.2877334654331207, + "learning_rate": 7.943588976517049e-05, + "loss": 1.8495, + "step": 10448 + }, + { + "epoch": 3.207182320441989, + "grad_norm": 0.34589654207229614, + "learning_rate": 7.943187172412712e-05, + "loss": 1.7773, + "step": 10449 + }, + { + "epoch": 3.2074892572130143, + "grad_norm": 0.4727517366409302, + "learning_rate": 7.942785339222074e-05, + "loss": 1.8702, + "step": 10450 + }, + { + "epoch": 3.207796193984039, + "grad_norm": 0.4019354581832886, + "learning_rate": 7.942383476949107e-05, + "loss": 1.8095, + "step": 10451 + }, + { + "epoch": 3.2081031307550645, + "grad_norm": 0.2726243734359741, + "learning_rate": 7.941981585597782e-05, + "loss": 1.7273, + "step": 10452 + }, + { + "epoch": 3.20841006752609, + "grad_norm": 0.2944760024547577, + "learning_rate": 7.941579665172072e-05, + "loss": 1.7507, + "step": 10453 + }, + { + "epoch": 3.2087170042971147, + "grad_norm": 0.3530777096748352, + "learning_rate": 7.941177715675945e-05, + "loss": 1.8434, + "step": 10454 + }, + { + "epoch": 3.20902394106814, + "grad_norm": 0.28612539172172546, + "learning_rate": 7.940775737113378e-05, + "loss": 1.8094, + "step": 10455 + }, + { + "epoch": 3.209330877839165, + "grad_norm": 
0.27006468176841736, + "learning_rate": 7.94037372948834e-05, + "loss": 1.7854, + "step": 10456 + }, + { + "epoch": 3.2096378146101903, + "grad_norm": 0.3027147054672241, + "learning_rate": 7.939971692804806e-05, + "loss": 1.7596, + "step": 10457 + }, + { + "epoch": 3.2099447513812156, + "grad_norm": 0.31999528408050537, + "learning_rate": 7.939569627066749e-05, + "loss": 1.8836, + "step": 10458 + }, + { + "epoch": 3.2102516881522405, + "grad_norm": 0.267600417137146, + "learning_rate": 7.939167532278142e-05, + "loss": 1.8508, + "step": 10459 + }, + { + "epoch": 3.210558624923266, + "grad_norm": 0.3171706795692444, + "learning_rate": 7.938765408442958e-05, + "loss": 1.7507, + "step": 10460 + }, + { + "epoch": 3.210865561694291, + "grad_norm": 0.2955280840396881, + "learning_rate": 7.938363255565171e-05, + "loss": 1.733, + "step": 10461 + }, + { + "epoch": 3.211172498465316, + "grad_norm": 0.3427969217300415, + "learning_rate": 7.937961073648759e-05, + "loss": 1.9208, + "step": 10462 + }, + { + "epoch": 3.2114794352363414, + "grad_norm": 0.28788647055625916, + "learning_rate": 7.937558862697692e-05, + "loss": 1.7723, + "step": 10463 + }, + { + "epoch": 3.2117863720073663, + "grad_norm": 0.26093682646751404, + "learning_rate": 7.937156622715945e-05, + "loss": 1.803, + "step": 10464 + }, + { + "epoch": 3.2120933087783916, + "grad_norm": 0.2791301906108856, + "learning_rate": 7.936754353707497e-05, + "loss": 1.7601, + "step": 10465 + }, + { + "epoch": 3.212400245549417, + "grad_norm": 0.3039831519126892, + "learning_rate": 7.93635205567632e-05, + "loss": 1.7864, + "step": 10466 + }, + { + "epoch": 3.212707182320442, + "grad_norm": 0.28498128056526184, + "learning_rate": 7.935949728626392e-05, + "loss": 1.7745, + "step": 10467 + }, + { + "epoch": 3.213014119091467, + "grad_norm": 0.2908780872821808, + "learning_rate": 7.935547372561687e-05, + "loss": 1.8281, + "step": 10468 + }, + { + "epoch": 3.2133210558624925, + "grad_norm": 0.26148509979248047, + "learning_rate": 
7.935144987486183e-05, + "loss": 1.8545, + "step": 10469 + }, + { + "epoch": 3.2136279926335174, + "grad_norm": 0.2853962481021881, + "learning_rate": 7.934742573403856e-05, + "loss": 1.7765, + "step": 10470 + }, + { + "epoch": 3.2139349294045427, + "grad_norm": 0.26497501134872437, + "learning_rate": 7.934340130318681e-05, + "loss": 1.7472, + "step": 10471 + }, + { + "epoch": 3.214241866175568, + "grad_norm": 0.2806912660598755, + "learning_rate": 7.933937658234638e-05, + "loss": 1.7879, + "step": 10472 + }, + { + "epoch": 3.214548802946593, + "grad_norm": 0.2699974477291107, + "learning_rate": 7.933535157155705e-05, + "loss": 1.7539, + "step": 10473 + }, + { + "epoch": 3.2148557397176183, + "grad_norm": 0.22714731097221375, + "learning_rate": 7.933132627085856e-05, + "loss": 1.7861, + "step": 10474 + }, + { + "epoch": 3.215162676488643, + "grad_norm": 0.291340708732605, + "learning_rate": 7.932730068029072e-05, + "loss": 1.8381, + "step": 10475 + }, + { + "epoch": 3.2154696132596685, + "grad_norm": 0.3257324695587158, + "learning_rate": 7.93232747998933e-05, + "loss": 1.8293, + "step": 10476 + }, + { + "epoch": 3.215776550030694, + "grad_norm": 0.3518911600112915, + "learning_rate": 7.93192486297061e-05, + "loss": 1.853, + "step": 10477 + }, + { + "epoch": 3.2160834868017187, + "grad_norm": 0.27663540840148926, + "learning_rate": 7.93152221697689e-05, + "loss": 1.7831, + "step": 10478 + }, + { + "epoch": 3.216390423572744, + "grad_norm": 0.3153248429298401, + "learning_rate": 7.931119542012149e-05, + "loss": 1.7443, + "step": 10479 + }, + { + "epoch": 3.216697360343769, + "grad_norm": 0.2919597029685974, + "learning_rate": 7.930716838080368e-05, + "loss": 1.8108, + "step": 10480 + }, + { + "epoch": 3.2170042971147943, + "grad_norm": 0.26892516016960144, + "learning_rate": 7.930314105185524e-05, + "loss": 1.7791, + "step": 10481 + }, + { + "epoch": 3.2173112338858196, + "grad_norm": 0.2486005276441574, + "learning_rate": 7.929911343331599e-05, + "loss": 1.8184, + 
"step": 10482 + }, + { + "epoch": 3.2176181706568445, + "grad_norm": 0.260728120803833, + "learning_rate": 7.929508552522571e-05, + "loss": 1.7933, + "step": 10483 + }, + { + "epoch": 3.21792510742787, + "grad_norm": 0.3081948757171631, + "learning_rate": 7.929105732762425e-05, + "loss": 1.7732, + "step": 10484 + }, + { + "epoch": 3.218232044198895, + "grad_norm": 0.3807671368122101, + "learning_rate": 7.928702884055138e-05, + "loss": 1.7652, + "step": 10485 + }, + { + "epoch": 3.21853898096992, + "grad_norm": 0.31637755036354065, + "learning_rate": 7.928300006404692e-05, + "loss": 1.7605, + "step": 10486 + }, + { + "epoch": 3.2188459177409454, + "grad_norm": 0.2812853455543518, + "learning_rate": 7.927897099815071e-05, + "loss": 1.7925, + "step": 10487 + }, + { + "epoch": 3.2191528545119708, + "grad_norm": 0.3472350239753723, + "learning_rate": 7.927494164290253e-05, + "loss": 1.8252, + "step": 10488 + }, + { + "epoch": 3.2194597912829956, + "grad_norm": 0.4202714264392853, + "learning_rate": 7.927091199834222e-05, + "loss": 1.7993, + "step": 10489 + }, + { + "epoch": 3.219766728054021, + "grad_norm": 0.44552353024482727, + "learning_rate": 7.92668820645096e-05, + "loss": 1.8609, + "step": 10490 + }, + { + "epoch": 3.220073664825046, + "grad_norm": 0.38964664936065674, + "learning_rate": 7.926285184144451e-05, + "loss": 1.864, + "step": 10491 + }, + { + "epoch": 3.220380601596071, + "grad_norm": 0.2978462278842926, + "learning_rate": 7.925882132918676e-05, + "loss": 1.7892, + "step": 10492 + }, + { + "epoch": 3.2206875383670965, + "grad_norm": 0.2520316243171692, + "learning_rate": 7.925479052777619e-05, + "loss": 1.7702, + "step": 10493 + }, + { + "epoch": 3.2209944751381214, + "grad_norm": 0.28151068091392517, + "learning_rate": 7.925075943725263e-05, + "loss": 1.7613, + "step": 10494 + }, + { + "epoch": 3.2213014119091468, + "grad_norm": 0.3346099555492401, + "learning_rate": 7.924672805765592e-05, + "loss": 1.894, + "step": 10495 + }, + { + "epoch": 
3.2216083486801717, + "grad_norm": 0.2981362044811249, + "learning_rate": 7.924269638902591e-05, + "loss": 1.8157, + "step": 10496 + }, + { + "epoch": 3.221915285451197, + "grad_norm": 0.2561499774456024, + "learning_rate": 7.923866443140242e-05, + "loss": 1.8259, + "step": 10497 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.26480481028556824, + "learning_rate": 7.923463218482532e-05, + "loss": 1.7856, + "step": 10498 + }, + { + "epoch": 3.222529158993247, + "grad_norm": 0.24103692173957825, + "learning_rate": 7.923059964933446e-05, + "loss": 1.7765, + "step": 10499 + }, + { + "epoch": 3.2228360957642725, + "grad_norm": 0.2399173080921173, + "learning_rate": 7.922656682496967e-05, + "loss": 1.8216, + "step": 10500 + }, + { + "epoch": 3.223143032535298, + "grad_norm": 0.24530018866062164, + "learning_rate": 7.922253371177082e-05, + "loss": 1.8155, + "step": 10501 + }, + { + "epoch": 3.2234499693063228, + "grad_norm": 0.23298653960227966, + "learning_rate": 7.921850030977775e-05, + "loss": 1.7843, + "step": 10502 + }, + { + "epoch": 3.223756906077348, + "grad_norm": 0.3053973317146301, + "learning_rate": 7.921446661903035e-05, + "loss": 1.8113, + "step": 10503 + }, + { + "epoch": 3.2240638428483734, + "grad_norm": 0.261336088180542, + "learning_rate": 7.921043263956847e-05, + "loss": 1.8073, + "step": 10504 + }, + { + "epoch": 3.2243707796193983, + "grad_norm": 0.24877268075942993, + "learning_rate": 7.920639837143195e-05, + "loss": 1.8344, + "step": 10505 + }, + { + "epoch": 3.2246777163904237, + "grad_norm": 0.26784422993659973, + "learning_rate": 7.920236381466071e-05, + "loss": 1.7757, + "step": 10506 + }, + { + "epoch": 3.2249846531614486, + "grad_norm": 0.2672121226787567, + "learning_rate": 7.919832896929458e-05, + "loss": 1.8384, + "step": 10507 + }, + { + "epoch": 3.225291589932474, + "grad_norm": 0.27254921197891235, + "learning_rate": 7.919429383537346e-05, + "loss": 1.8056, + "step": 10508 + }, + { + "epoch": 3.2255985267034992, + "grad_norm": 
0.24467822909355164, + "learning_rate": 7.91902584129372e-05, + "loss": 1.8109, + "step": 10509 + }, + { + "epoch": 3.225905463474524, + "grad_norm": 0.25966358184814453, + "learning_rate": 7.918622270202571e-05, + "loss": 1.82, + "step": 10510 + }, + { + "epoch": 3.2262124002455494, + "grad_norm": 0.28601330518722534, + "learning_rate": 7.918218670267886e-05, + "loss": 1.7266, + "step": 10511 + }, + { + "epoch": 3.226519337016575, + "grad_norm": 0.4017516076564789, + "learning_rate": 7.917815041493653e-05, + "loss": 1.8408, + "step": 10512 + }, + { + "epoch": 3.2268262737875997, + "grad_norm": 0.3995787501335144, + "learning_rate": 7.917411383883862e-05, + "loss": 1.8441, + "step": 10513 + }, + { + "epoch": 3.227133210558625, + "grad_norm": 0.26997458934783936, + "learning_rate": 7.917007697442502e-05, + "loss": 1.8078, + "step": 10514 + }, + { + "epoch": 3.22744014732965, + "grad_norm": 0.34353014826774597, + "learning_rate": 7.916603982173562e-05, + "loss": 1.7523, + "step": 10515 + }, + { + "epoch": 3.2277470841006752, + "grad_norm": 0.39522337913513184, + "learning_rate": 7.916200238081032e-05, + "loss": 1.7532, + "step": 10516 + }, + { + "epoch": 3.2280540208717006, + "grad_norm": 0.4176923334598541, + "learning_rate": 7.915796465168903e-05, + "loss": 1.8895, + "step": 10517 + }, + { + "epoch": 3.2283609576427255, + "grad_norm": 0.30232906341552734, + "learning_rate": 7.915392663441164e-05, + "loss": 1.8223, + "step": 10518 + }, + { + "epoch": 3.228667894413751, + "grad_norm": 0.230951726436615, + "learning_rate": 7.914988832901805e-05, + "loss": 1.7265, + "step": 10519 + }, + { + "epoch": 3.228974831184776, + "grad_norm": 0.26381877064704895, + "learning_rate": 7.914584973554819e-05, + "loss": 1.7858, + "step": 10520 + }, + { + "epoch": 3.229281767955801, + "grad_norm": 0.2500905394554138, + "learning_rate": 7.914181085404194e-05, + "loss": 1.7606, + "step": 10521 + }, + { + "epoch": 3.2295887047268264, + "grad_norm": 0.2585415840148926, + "learning_rate": 
7.913777168453925e-05, + "loss": 1.787, + "step": 10522 + }, + { + "epoch": 3.2298956414978512, + "grad_norm": 0.24236604571342468, + "learning_rate": 7.913373222708001e-05, + "loss": 1.7623, + "step": 10523 + }, + { + "epoch": 3.2302025782688766, + "grad_norm": 0.3113093078136444, + "learning_rate": 7.912969248170416e-05, + "loss": 1.7736, + "step": 10524 + }, + { + "epoch": 3.230509515039902, + "grad_norm": 0.3341342806816101, + "learning_rate": 7.912565244845163e-05, + "loss": 1.8583, + "step": 10525 + }, + { + "epoch": 3.230816451810927, + "grad_norm": 0.2644478678703308, + "learning_rate": 7.912161212736231e-05, + "loss": 1.7891, + "step": 10526 + }, + { + "epoch": 3.231123388581952, + "grad_norm": 0.22916561365127563, + "learning_rate": 7.911757151847616e-05, + "loss": 1.7642, + "step": 10527 + }, + { + "epoch": 3.2314303253529775, + "grad_norm": 0.24204877018928528, + "learning_rate": 7.911353062183309e-05, + "loss": 1.8522, + "step": 10528 + }, + { + "epoch": 3.2317372621240024, + "grad_norm": 0.25339365005493164, + "learning_rate": 7.910948943747307e-05, + "loss": 1.8391, + "step": 10529 + }, + { + "epoch": 3.2320441988950277, + "grad_norm": 0.2652709186077118, + "learning_rate": 7.9105447965436e-05, + "loss": 1.7735, + "step": 10530 + }, + { + "epoch": 3.2323511356660526, + "grad_norm": 0.2711019217967987, + "learning_rate": 7.910140620576183e-05, + "loss": 1.8491, + "step": 10531 + }, + { + "epoch": 3.232658072437078, + "grad_norm": 0.2598389685153961, + "learning_rate": 7.909736415849052e-05, + "loss": 1.8417, + "step": 10532 + }, + { + "epoch": 3.2329650092081033, + "grad_norm": 0.278037428855896, + "learning_rate": 7.9093321823662e-05, + "loss": 1.8774, + "step": 10533 + }, + { + "epoch": 3.233271945979128, + "grad_norm": 0.32015568017959595, + "learning_rate": 7.90892792013162e-05, + "loss": 1.8873, + "step": 10534 + }, + { + "epoch": 3.2335788827501535, + "grad_norm": 0.3098098635673523, + "learning_rate": 7.908523629149312e-05, + "loss": 1.8141, + 
"step": 10535 + }, + { + "epoch": 3.233885819521179, + "grad_norm": 0.3127266764640808, + "learning_rate": 7.908119309423267e-05, + "loss": 1.8587, + "step": 10536 + }, + { + "epoch": 3.2341927562922037, + "grad_norm": 0.3085545301437378, + "learning_rate": 7.907714960957483e-05, + "loss": 1.8544, + "step": 10537 + }, + { + "epoch": 3.234499693063229, + "grad_norm": 0.3051004409790039, + "learning_rate": 7.907310583755956e-05, + "loss": 1.8144, + "step": 10538 + }, + { + "epoch": 3.234806629834254, + "grad_norm": 0.3458186686038971, + "learning_rate": 7.906906177822682e-05, + "loss": 1.8388, + "step": 10539 + }, + { + "epoch": 3.2351135666052793, + "grad_norm": 0.37064439058303833, + "learning_rate": 7.906501743161656e-05, + "loss": 1.7574, + "step": 10540 + }, + { + "epoch": 3.2354205033763046, + "grad_norm": 0.3382316827774048, + "learning_rate": 7.906097279776876e-05, + "loss": 1.8785, + "step": 10541 + }, + { + "epoch": 3.2357274401473295, + "grad_norm": 0.254802942276001, + "learning_rate": 7.905692787672341e-05, + "loss": 1.8276, + "step": 10542 + }, + { + "epoch": 3.236034376918355, + "grad_norm": 0.3362341523170471, + "learning_rate": 7.905288266852047e-05, + "loss": 1.8057, + "step": 10543 + }, + { + "epoch": 3.23634131368938, + "grad_norm": 0.38821661472320557, + "learning_rate": 7.904883717319988e-05, + "loss": 1.7841, + "step": 10544 + }, + { + "epoch": 3.236648250460405, + "grad_norm": 0.33889076113700867, + "learning_rate": 7.90447913908017e-05, + "loss": 1.7892, + "step": 10545 + }, + { + "epoch": 3.2369551872314304, + "grad_norm": 0.2741014361381531, + "learning_rate": 7.904074532136585e-05, + "loss": 1.7611, + "step": 10546 + }, + { + "epoch": 3.2372621240024557, + "grad_norm": 0.28950995206832886, + "learning_rate": 7.903669896493233e-05, + "loss": 1.7963, + "step": 10547 + }, + { + "epoch": 3.2375690607734806, + "grad_norm": 0.30647143721580505, + "learning_rate": 7.903265232154113e-05, + "loss": 1.7522, + "step": 10548 + }, + { + "epoch": 
3.237875997544506, + "grad_norm": 0.30428263545036316, + "learning_rate": 7.902860539123225e-05, + "loss": 1.7383, + "step": 10549 + }, + { + "epoch": 3.238182934315531, + "grad_norm": 0.2357146292924881, + "learning_rate": 7.902455817404569e-05, + "loss": 1.7243, + "step": 10550 + }, + { + "epoch": 3.238489871086556, + "grad_norm": 0.3125104606151581, + "learning_rate": 7.90205106700214e-05, + "loss": 1.8542, + "step": 10551 + }, + { + "epoch": 3.2387968078575815, + "grad_norm": 0.25797244906425476, + "learning_rate": 7.901646287919944e-05, + "loss": 1.8374, + "step": 10552 + }, + { + "epoch": 3.2391037446286064, + "grad_norm": 0.3127591907978058, + "learning_rate": 7.901241480161978e-05, + "loss": 1.9457, + "step": 10553 + }, + { + "epoch": 3.2394106813996317, + "grad_norm": 0.2971835434436798, + "learning_rate": 7.900836643732243e-05, + "loss": 1.7933, + "step": 10554 + }, + { + "epoch": 3.2397176181706566, + "grad_norm": 0.28931814432144165, + "learning_rate": 7.90043177863474e-05, + "loss": 1.8201, + "step": 10555 + }, + { + "epoch": 3.240024554941682, + "grad_norm": 0.3348724842071533, + "learning_rate": 7.90002688487347e-05, + "loss": 1.8718, + "step": 10556 + }, + { + "epoch": 3.2403314917127073, + "grad_norm": 0.28566426038742065, + "learning_rate": 7.899621962452436e-05, + "loss": 1.805, + "step": 10557 + }, + { + "epoch": 3.240638428483732, + "grad_norm": 0.27074119448661804, + "learning_rate": 7.899217011375637e-05, + "loss": 1.842, + "step": 10558 + }, + { + "epoch": 3.2409453652547575, + "grad_norm": 0.27014291286468506, + "learning_rate": 7.898812031647076e-05, + "loss": 1.8156, + "step": 10559 + }, + { + "epoch": 3.241252302025783, + "grad_norm": 0.28087863326072693, + "learning_rate": 7.898407023270756e-05, + "loss": 1.8399, + "step": 10560 + }, + { + "epoch": 3.2415592387968077, + "grad_norm": 0.2641037404537201, + "learning_rate": 7.898001986250679e-05, + "loss": 1.7977, + "step": 10561 + }, + { + "epoch": 3.241866175567833, + "grad_norm": 
0.2843858301639557, + "learning_rate": 7.897596920590848e-05, + "loss": 1.834, + "step": 10562 + }, + { + "epoch": 3.2421731123388584, + "grad_norm": 0.2724611163139343, + "learning_rate": 7.897191826295266e-05, + "loss": 1.7547, + "step": 10563 + }, + { + "epoch": 3.2424800491098833, + "grad_norm": 0.2583858370780945, + "learning_rate": 7.896786703367935e-05, + "loss": 1.7658, + "step": 10564 + }, + { + "epoch": 3.2427869858809086, + "grad_norm": 0.2666650712490082, + "learning_rate": 7.896381551812861e-05, + "loss": 1.8017, + "step": 10565 + }, + { + "epoch": 3.2430939226519335, + "grad_norm": 0.23269347846508026, + "learning_rate": 7.895976371634047e-05, + "loss": 1.8267, + "step": 10566 + }, + { + "epoch": 3.243400859422959, + "grad_norm": 0.27865225076675415, + "learning_rate": 7.895571162835496e-05, + "loss": 1.8093, + "step": 10567 + }, + { + "epoch": 3.243707796193984, + "grad_norm": 0.29445022344589233, + "learning_rate": 7.895165925421216e-05, + "loss": 1.7999, + "step": 10568 + }, + { + "epoch": 3.244014732965009, + "grad_norm": 0.32135528326034546, + "learning_rate": 7.894760659395206e-05, + "loss": 1.8405, + "step": 10569 + }, + { + "epoch": 3.2443216697360344, + "grad_norm": 0.3409091532230377, + "learning_rate": 7.894355364761477e-05, + "loss": 1.7861, + "step": 10570 + }, + { + "epoch": 3.2446286065070598, + "grad_norm": 0.3379025459289551, + "learning_rate": 7.893950041524032e-05, + "loss": 1.8495, + "step": 10571 + }, + { + "epoch": 3.2449355432780846, + "grad_norm": 0.2843063473701477, + "learning_rate": 7.893544689686874e-05, + "loss": 1.7888, + "step": 10572 + }, + { + "epoch": 3.24524248004911, + "grad_norm": 0.2914074957370758, + "learning_rate": 7.893139309254013e-05, + "loss": 1.7866, + "step": 10573 + }, + { + "epoch": 3.245549416820135, + "grad_norm": 0.39855021238327026, + "learning_rate": 7.892733900229454e-05, + "loss": 1.7865, + "step": 10574 + }, + { + "epoch": 3.24585635359116, + "grad_norm": 0.4232102632522583, + "learning_rate": 
7.892328462617203e-05, + "loss": 1.8443, + "step": 10575 + }, + { + "epoch": 3.2461632903621855, + "grad_norm": 0.390794962644577, + "learning_rate": 7.891922996421267e-05, + "loss": 1.8735, + "step": 10576 + }, + { + "epoch": 3.2464702271332104, + "grad_norm": 0.3051595687866211, + "learning_rate": 7.891517501645653e-05, + "loss": 1.8654, + "step": 10577 + }, + { + "epoch": 3.2467771639042358, + "grad_norm": 0.25363096594810486, + "learning_rate": 7.891111978294367e-05, + "loss": 1.7602, + "step": 10578 + }, + { + "epoch": 3.247084100675261, + "grad_norm": 0.29785794019699097, + "learning_rate": 7.890706426371419e-05, + "loss": 1.8242, + "step": 10579 + }, + { + "epoch": 3.247391037446286, + "grad_norm": 0.346162885427475, + "learning_rate": 7.890300845880816e-05, + "loss": 1.8551, + "step": 10580 + }, + { + "epoch": 3.2476979742173113, + "grad_norm": 0.33906155824661255, + "learning_rate": 7.889895236826566e-05, + "loss": 1.765, + "step": 10581 + }, + { + "epoch": 3.248004910988336, + "grad_norm": 0.26083165407180786, + "learning_rate": 7.889489599212676e-05, + "loss": 1.8246, + "step": 10582 + }, + { + "epoch": 3.2483118477593615, + "grad_norm": 0.3042019009590149, + "learning_rate": 7.889083933043157e-05, + "loss": 1.9017, + "step": 10583 + }, + { + "epoch": 3.248618784530387, + "grad_norm": 0.34833577275276184, + "learning_rate": 7.888678238322018e-05, + "loss": 1.7863, + "step": 10584 + }, + { + "epoch": 3.2489257213014118, + "grad_norm": 0.34436655044555664, + "learning_rate": 7.888272515053267e-05, + "loss": 1.7937, + "step": 10585 + }, + { + "epoch": 3.249232658072437, + "grad_norm": 0.2550172507762909, + "learning_rate": 7.887866763240914e-05, + "loss": 1.7615, + "step": 10586 + }, + { + "epoch": 3.2495395948434624, + "grad_norm": 0.3334405720233917, + "learning_rate": 7.88746098288897e-05, + "loss": 1.7465, + "step": 10587 + }, + { + "epoch": 3.2498465316144873, + "grad_norm": 0.4668157696723938, + "learning_rate": 7.887055174001443e-05, + "loss": 
1.7836, + "step": 10588 + }, + { + "epoch": 3.2501534683855127, + "grad_norm": 0.524680495262146, + "learning_rate": 7.886649336582344e-05, + "loss": 1.844, + "step": 10589 + }, + { + "epoch": 3.250460405156538, + "grad_norm": 0.36859074234962463, + "learning_rate": 7.886243470635685e-05, + "loss": 1.8072, + "step": 10590 + }, + { + "epoch": 3.250767341927563, + "grad_norm": 0.32370296120643616, + "learning_rate": 7.885837576165478e-05, + "loss": 1.802, + "step": 10591 + }, + { + "epoch": 3.2510742786985882, + "grad_norm": 0.3506374955177307, + "learning_rate": 7.88543165317573e-05, + "loss": 1.7965, + "step": 10592 + }, + { + "epoch": 3.251381215469613, + "grad_norm": 0.39058688282966614, + "learning_rate": 7.885025701670457e-05, + "loss": 1.7987, + "step": 10593 + }, + { + "epoch": 3.2516881522406385, + "grad_norm": 0.3042154014110565, + "learning_rate": 7.884619721653669e-05, + "loss": 1.8345, + "step": 10594 + }, + { + "epoch": 3.251995089011664, + "grad_norm": 0.2249498963356018, + "learning_rate": 7.884213713129378e-05, + "loss": 1.7796, + "step": 10595 + }, + { + "epoch": 3.2523020257826887, + "grad_norm": 0.2701997458934784, + "learning_rate": 7.883807676101595e-05, + "loss": 1.8027, + "step": 10596 + }, + { + "epoch": 3.252608962553714, + "grad_norm": 0.2574785053730011, + "learning_rate": 7.883401610574336e-05, + "loss": 1.7878, + "step": 10597 + }, + { + "epoch": 3.252915899324739, + "grad_norm": 0.24964739382266998, + "learning_rate": 7.882995516551613e-05, + "loss": 1.7612, + "step": 10598 + }, + { + "epoch": 3.2532228360957642, + "grad_norm": 0.2519865930080414, + "learning_rate": 7.882589394037437e-05, + "loss": 1.7583, + "step": 10599 + }, + { + "epoch": 3.2535297728667896, + "grad_norm": 0.23174463212490082, + "learning_rate": 7.882183243035823e-05, + "loss": 1.7607, + "step": 10600 + }, + { + "epoch": 3.2538367096378145, + "grad_norm": 0.28103554248809814, + "learning_rate": 7.881777063550786e-05, + "loss": 1.904, + "step": 10601 + }, + { + 
"epoch": 3.25414364640884, + "grad_norm": 0.265677809715271, + "learning_rate": 7.881370855586339e-05, + "loss": 1.8169, + "step": 10602 + }, + { + "epoch": 3.254450583179865, + "grad_norm": 0.2539603114128113, + "learning_rate": 7.880964619146493e-05, + "loss": 1.8439, + "step": 10603 + }, + { + "epoch": 3.25475751995089, + "grad_norm": 0.2741886377334595, + "learning_rate": 7.88055835423527e-05, + "loss": 1.8737, + "step": 10604 + }, + { + "epoch": 3.2550644567219154, + "grad_norm": 0.27548348903656006, + "learning_rate": 7.88015206085668e-05, + "loss": 1.8385, + "step": 10605 + }, + { + "epoch": 3.2553713934929407, + "grad_norm": 0.2958502769470215, + "learning_rate": 7.879745739014739e-05, + "loss": 1.8603, + "step": 10606 + }, + { + "epoch": 3.2556783302639656, + "grad_norm": 0.2728644907474518, + "learning_rate": 7.879339388713462e-05, + "loss": 1.8, + "step": 10607 + }, + { + "epoch": 3.255985267034991, + "grad_norm": 0.28718289732933044, + "learning_rate": 7.878933009956866e-05, + "loss": 1.7803, + "step": 10608 + }, + { + "epoch": 3.256292203806016, + "grad_norm": 0.2989691197872162, + "learning_rate": 7.878526602748967e-05, + "loss": 1.8155, + "step": 10609 + }, + { + "epoch": 3.256599140577041, + "grad_norm": 0.24515527486801147, + "learning_rate": 7.87812016709378e-05, + "loss": 1.7623, + "step": 10610 + }, + { + "epoch": 3.2569060773480665, + "grad_norm": 0.29946041107177734, + "learning_rate": 7.877713702995324e-05, + "loss": 1.8097, + "step": 10611 + }, + { + "epoch": 3.2572130141190914, + "grad_norm": 0.2854483723640442, + "learning_rate": 7.877307210457613e-05, + "loss": 1.8088, + "step": 10612 + }, + { + "epoch": 3.2575199508901167, + "grad_norm": 0.27812930941581726, + "learning_rate": 7.876900689484668e-05, + "loss": 1.8151, + "step": 10613 + }, + { + "epoch": 3.2578268876611416, + "grad_norm": 0.2658015787601471, + "learning_rate": 7.876494140080503e-05, + "loss": 1.8314, + "step": 10614 + }, + { + "epoch": 3.258133824432167, + "grad_norm": 
0.28935661911964417, + "learning_rate": 7.876087562249137e-05, + "loss": 1.7948, + "step": 10615 + }, + { + "epoch": 3.2584407612031923, + "grad_norm": 0.27497121691703796, + "learning_rate": 7.875680955994587e-05, + "loss": 1.7964, + "step": 10616 + }, + { + "epoch": 3.258747697974217, + "grad_norm": 0.3313405513763428, + "learning_rate": 7.875274321320873e-05, + "loss": 1.8143, + "step": 10617 + }, + { + "epoch": 3.2590546347452425, + "grad_norm": 0.3217218816280365, + "learning_rate": 7.874867658232013e-05, + "loss": 1.7749, + "step": 10618 + }, + { + "epoch": 3.259361571516268, + "grad_norm": 0.25105544924736023, + "learning_rate": 7.874460966732025e-05, + "loss": 1.7834, + "step": 10619 + }, + { + "epoch": 3.2596685082872927, + "grad_norm": 0.2931382358074188, + "learning_rate": 7.874054246824931e-05, + "loss": 1.8252, + "step": 10620 + }, + { + "epoch": 3.259975445058318, + "grad_norm": 0.2803363502025604, + "learning_rate": 7.873647498514747e-05, + "loss": 1.7527, + "step": 10621 + }, + { + "epoch": 3.2602823818293434, + "grad_norm": 0.29857927560806274, + "learning_rate": 7.873240721805492e-05, + "loss": 1.8085, + "step": 10622 + }, + { + "epoch": 3.2605893186003683, + "grad_norm": 0.24864110350608826, + "learning_rate": 7.872833916701192e-05, + "loss": 1.7509, + "step": 10623 + }, + { + "epoch": 3.2608962553713936, + "grad_norm": 0.24105949699878693, + "learning_rate": 7.872427083205862e-05, + "loss": 1.7871, + "step": 10624 + }, + { + "epoch": 3.2612031921424185, + "grad_norm": 0.2429245114326477, + "learning_rate": 7.872020221323523e-05, + "loss": 1.777, + "step": 10625 + }, + { + "epoch": 3.261510128913444, + "grad_norm": 0.234287828207016, + "learning_rate": 7.871613331058197e-05, + "loss": 1.8001, + "step": 10626 + }, + { + "epoch": 3.261817065684469, + "grad_norm": 0.3463406264781952, + "learning_rate": 7.871206412413905e-05, + "loss": 1.8925, + "step": 10627 + }, + { + "epoch": 3.262124002455494, + "grad_norm": 0.26798921823501587, + 
"learning_rate": 7.87079946539467e-05, + "loss": 1.7963, + "step": 10628 + }, + { + "epoch": 3.2624309392265194, + "grad_norm": 0.28603312373161316, + "learning_rate": 7.87039249000451e-05, + "loss": 1.8308, + "step": 10629 + }, + { + "epoch": 3.2627378759975443, + "grad_norm": 0.2717527747154236, + "learning_rate": 7.86998548624745e-05, + "loss": 1.8246, + "step": 10630 + }, + { + "epoch": 3.2630448127685696, + "grad_norm": 0.32215580344200134, + "learning_rate": 7.86957845412751e-05, + "loss": 1.7278, + "step": 10631 + }, + { + "epoch": 3.263351749539595, + "grad_norm": 0.3578735589981079, + "learning_rate": 7.869171393648717e-05, + "loss": 1.7288, + "step": 10632 + }, + { + "epoch": 3.26365868631062, + "grad_norm": 0.3120707869529724, + "learning_rate": 7.868764304815089e-05, + "loss": 1.7971, + "step": 10633 + }, + { + "epoch": 3.263965623081645, + "grad_norm": 0.27419236302375793, + "learning_rate": 7.86835718763065e-05, + "loss": 1.8529, + "step": 10634 + }, + { + "epoch": 3.2642725598526705, + "grad_norm": 0.3200531601905823, + "learning_rate": 7.867950042099423e-05, + "loss": 1.7892, + "step": 10635 + }, + { + "epoch": 3.2645794966236954, + "grad_norm": 0.325706422328949, + "learning_rate": 7.867542868225435e-05, + "loss": 1.8236, + "step": 10636 + }, + { + "epoch": 3.2648864333947207, + "grad_norm": 0.2950136065483093, + "learning_rate": 7.867135666012707e-05, + "loss": 1.8163, + "step": 10637 + }, + { + "epoch": 3.265193370165746, + "grad_norm": 0.2772117257118225, + "learning_rate": 7.866728435465263e-05, + "loss": 1.8373, + "step": 10638 + }, + { + "epoch": 3.265500306936771, + "grad_norm": 0.2887401580810547, + "learning_rate": 7.866321176587129e-05, + "loss": 1.7756, + "step": 10639 + }, + { + "epoch": 3.2658072437077963, + "grad_norm": 0.3474489152431488, + "learning_rate": 7.865913889382329e-05, + "loss": 1.7539, + "step": 10640 + }, + { + "epoch": 3.266114180478821, + "grad_norm": 0.3433493971824646, + "learning_rate": 7.865506573854888e-05, + 
"loss": 1.7987, + "step": 10641 + }, + { + "epoch": 3.2664211172498465, + "grad_norm": 0.3075394630432129, + "learning_rate": 7.865099230008832e-05, + "loss": 1.7907, + "step": 10642 + }, + { + "epoch": 3.266728054020872, + "grad_norm": 0.24817697703838348, + "learning_rate": 7.864691857848187e-05, + "loss": 1.7941, + "step": 10643 + }, + { + "epoch": 3.2670349907918967, + "grad_norm": 0.290147602558136, + "learning_rate": 7.864284457376976e-05, + "loss": 1.9125, + "step": 10644 + }, + { + "epoch": 3.267341927562922, + "grad_norm": 0.253684937953949, + "learning_rate": 7.863877028599229e-05, + "loss": 1.8084, + "step": 10645 + }, + { + "epoch": 3.267648864333947, + "grad_norm": 0.26349252462387085, + "learning_rate": 7.863469571518969e-05, + "loss": 1.7548, + "step": 10646 + }, + { + "epoch": 3.2679558011049723, + "grad_norm": 0.30568864941596985, + "learning_rate": 7.863062086140224e-05, + "loss": 1.8551, + "step": 10647 + }, + { + "epoch": 3.2682627378759976, + "grad_norm": 0.2866690456867218, + "learning_rate": 7.862654572467024e-05, + "loss": 1.8145, + "step": 10648 + }, + { + "epoch": 3.2685696746470225, + "grad_norm": 0.32022854685783386, + "learning_rate": 7.862247030503391e-05, + "loss": 1.896, + "step": 10649 + }, + { + "epoch": 3.268876611418048, + "grad_norm": 0.25260284543037415, + "learning_rate": 7.861839460253356e-05, + "loss": 1.814, + "step": 10650 + }, + { + "epoch": 3.269183548189073, + "grad_norm": 0.26776066422462463, + "learning_rate": 7.861431861720947e-05, + "loss": 1.7755, + "step": 10651 + }, + { + "epoch": 3.269490484960098, + "grad_norm": 0.26514193415641785, + "learning_rate": 7.861024234910191e-05, + "loss": 1.7606, + "step": 10652 + }, + { + "epoch": 3.2697974217311234, + "grad_norm": 0.27213940024375916, + "learning_rate": 7.860616579825116e-05, + "loss": 1.8074, + "step": 10653 + }, + { + "epoch": 3.2701043585021488, + "grad_norm": 0.29192888736724854, + "learning_rate": 7.860208896469752e-05, + "loss": 1.8436, + "step": 10654 + }, 
+ { + "epoch": 3.2704112952731736, + "grad_norm": 0.3772370219230652, + "learning_rate": 7.859801184848127e-05, + "loss": 1.8096, + "step": 10655 + }, + { + "epoch": 3.270718232044199, + "grad_norm": 0.4574970006942749, + "learning_rate": 7.859393444964269e-05, + "loss": 1.7612, + "step": 10656 + }, + { + "epoch": 3.271025168815224, + "grad_norm": 0.4614393413066864, + "learning_rate": 7.858985676822211e-05, + "loss": 1.8529, + "step": 10657 + }, + { + "epoch": 3.271332105586249, + "grad_norm": 0.33567267656326294, + "learning_rate": 7.85857788042598e-05, + "loss": 1.8391, + "step": 10658 + }, + { + "epoch": 3.2716390423572745, + "grad_norm": 0.2564064860343933, + "learning_rate": 7.858170055779609e-05, + "loss": 1.7621, + "step": 10659 + }, + { + "epoch": 3.2719459791282994, + "grad_norm": 0.26769882440567017, + "learning_rate": 7.857762202887122e-05, + "loss": 1.8145, + "step": 10660 + }, + { + "epoch": 3.2722529158993248, + "grad_norm": 0.262008935213089, + "learning_rate": 7.857354321752558e-05, + "loss": 1.7513, + "step": 10661 + }, + { + "epoch": 3.27255985267035, + "grad_norm": 0.26494377851486206, + "learning_rate": 7.856946412379942e-05, + "loss": 1.8071, + "step": 10662 + }, + { + "epoch": 3.272866789441375, + "grad_norm": 0.25613999366760254, + "learning_rate": 7.856538474773307e-05, + "loss": 1.8775, + "step": 10663 + }, + { + "epoch": 3.2731737262124003, + "grad_norm": 0.24789929389953613, + "learning_rate": 7.856130508936684e-05, + "loss": 1.8055, + "step": 10664 + }, + { + "epoch": 3.2734806629834257, + "grad_norm": 0.29111939668655396, + "learning_rate": 7.855722514874107e-05, + "loss": 1.8114, + "step": 10665 + }, + { + "epoch": 3.2737875997544506, + "grad_norm": 0.30511030554771423, + "learning_rate": 7.855314492589605e-05, + "loss": 1.8131, + "step": 10666 + }, + { + "epoch": 3.274094536525476, + "grad_norm": 0.2545989453792572, + "learning_rate": 7.854906442087212e-05, + "loss": 1.7933, + "step": 10667 + }, + { + "epoch": 3.2744014732965008, + 
"grad_norm": 0.26684823632240295, + "learning_rate": 7.85449836337096e-05, + "loss": 1.7604, + "step": 10668 + }, + { + "epoch": 3.274708410067526, + "grad_norm": 0.5097808837890625, + "learning_rate": 7.854090256444881e-05, + "loss": 1.777, + "step": 10669 + }, + { + "epoch": 3.2750153468385514, + "grad_norm": 0.27828142046928406, + "learning_rate": 7.853682121313011e-05, + "loss": 1.7885, + "step": 10670 + }, + { + "epoch": 3.2753222836095763, + "grad_norm": 0.2925552725791931, + "learning_rate": 7.853273957979381e-05, + "loss": 1.7962, + "step": 10671 + }, + { + "epoch": 3.2756292203806017, + "grad_norm": 0.284574955701828, + "learning_rate": 7.852865766448025e-05, + "loss": 1.8645, + "step": 10672 + }, + { + "epoch": 3.2759361571516266, + "grad_norm": 0.23407664895057678, + "learning_rate": 7.85245754672298e-05, + "loss": 1.7106, + "step": 10673 + }, + { + "epoch": 3.276243093922652, + "grad_norm": 0.2555919885635376, + "learning_rate": 7.852049298808274e-05, + "loss": 1.8237, + "step": 10674 + }, + { + "epoch": 3.2765500306936772, + "grad_norm": 0.26703694462776184, + "learning_rate": 7.851641022707947e-05, + "loss": 1.7844, + "step": 10675 + }, + { + "epoch": 3.276856967464702, + "grad_norm": 0.24889135360717773, + "learning_rate": 7.851232718426033e-05, + "loss": 1.7783, + "step": 10676 + }, + { + "epoch": 3.2771639042357275, + "grad_norm": 0.25770726799964905, + "learning_rate": 7.850824385966564e-05, + "loss": 1.8007, + "step": 10677 + }, + { + "epoch": 3.277470841006753, + "grad_norm": 0.31806984543800354, + "learning_rate": 7.850416025333578e-05, + "loss": 1.8623, + "step": 10678 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.2906930148601532, + "learning_rate": 7.850007636531111e-05, + "loss": 1.8315, + "step": 10679 + }, + { + "epoch": 3.278084714548803, + "grad_norm": 0.2802525460720062, + "learning_rate": 7.849599219563197e-05, + "loss": 1.8488, + "step": 10680 + }, + { + "epoch": 3.2783916513198283, + "grad_norm": 0.26150405406951904, + 
"learning_rate": 7.849190774433874e-05, + "loss": 1.7967, + "step": 10681 + }, + { + "epoch": 3.2786985880908532, + "grad_norm": 0.25863370299339294, + "learning_rate": 7.848782301147178e-05, + "loss": 1.864, + "step": 10682 + }, + { + "epoch": 3.2790055248618786, + "grad_norm": 0.25381043553352356, + "learning_rate": 7.848373799707145e-05, + "loss": 1.8239, + "step": 10683 + }, + { + "epoch": 3.2793124616329035, + "grad_norm": 0.2583387792110443, + "learning_rate": 7.847965270117814e-05, + "loss": 1.8449, + "step": 10684 + }, + { + "epoch": 3.279619398403929, + "grad_norm": 0.30759841203689575, + "learning_rate": 7.84755671238322e-05, + "loss": 1.7992, + "step": 10685 + }, + { + "epoch": 3.279926335174954, + "grad_norm": 0.4316023588180542, + "learning_rate": 7.847148126507402e-05, + "loss": 1.7912, + "step": 10686 + }, + { + "epoch": 3.280233271945979, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.846739512494396e-05, + "loss": 1.8831, + "step": 10687 + }, + { + "epoch": 3.2805402087170044, + "grad_norm": 0.318934828042984, + "learning_rate": 7.846330870348244e-05, + "loss": 1.8411, + "step": 10688 + }, + { + "epoch": 3.2808471454880292, + "grad_norm": 0.27755632996559143, + "learning_rate": 7.84592220007298e-05, + "loss": 1.8763, + "step": 10689 + }, + { + "epoch": 3.2811540822590546, + "grad_norm": 0.33544883131980896, + "learning_rate": 7.845513501672646e-05, + "loss": 1.731, + "step": 10690 + }, + { + "epoch": 3.28146101903008, + "grad_norm": 0.28299057483673096, + "learning_rate": 7.845104775151278e-05, + "loss": 1.813, + "step": 10691 + }, + { + "epoch": 3.281767955801105, + "grad_norm": 0.2761382460594177, + "learning_rate": 7.844696020512918e-05, + "loss": 1.8018, + "step": 10692 + }, + { + "epoch": 3.28207489257213, + "grad_norm": 0.2919033169746399, + "learning_rate": 7.844287237761605e-05, + "loss": 1.793, + "step": 10693 + }, + { + "epoch": 3.2823818293431555, + "grad_norm": 0.32922014594078064, + "learning_rate": 7.843878426901378e-05, + 
"loss": 1.8186, + "step": 10694 + }, + { + "epoch": 3.2826887661141804, + "grad_norm": 0.2818562090396881, + "learning_rate": 7.843469587936279e-05, + "loss": 1.7794, + "step": 10695 + }, + { + "epoch": 3.2829957028852057, + "grad_norm": 0.26414254307746887, + "learning_rate": 7.843060720870345e-05, + "loss": 1.7854, + "step": 10696 + }, + { + "epoch": 3.283302639656231, + "grad_norm": 0.28345760703086853, + "learning_rate": 7.842651825707618e-05, + "loss": 1.7659, + "step": 10697 + }, + { + "epoch": 3.283609576427256, + "grad_norm": 0.3522340655326843, + "learning_rate": 7.842242902452141e-05, + "loss": 1.8427, + "step": 10698 + }, + { + "epoch": 3.2839165131982813, + "grad_norm": 0.2861590087413788, + "learning_rate": 7.841833951107954e-05, + "loss": 1.7539, + "step": 10699 + }, + { + "epoch": 3.284223449969306, + "grad_norm": 0.2596624493598938, + "learning_rate": 7.841424971679099e-05, + "loss": 1.8407, + "step": 10700 + }, + { + "epoch": 3.2845303867403315, + "grad_norm": 0.2847718298435211, + "learning_rate": 7.841015964169616e-05, + "loss": 1.8085, + "step": 10701 + }, + { + "epoch": 3.284837323511357, + "grad_norm": 0.29566115140914917, + "learning_rate": 7.840606928583547e-05, + "loss": 1.7873, + "step": 10702 + }, + { + "epoch": 3.2851442602823817, + "grad_norm": 0.2752111256122589, + "learning_rate": 7.840197864924936e-05, + "loss": 1.8186, + "step": 10703 + }, + { + "epoch": 3.285451197053407, + "grad_norm": 0.2907958924770355, + "learning_rate": 7.839788773197826e-05, + "loss": 1.8081, + "step": 10704 + }, + { + "epoch": 3.285758133824432, + "grad_norm": 0.25808724761009216, + "learning_rate": 7.839379653406258e-05, + "loss": 1.7635, + "step": 10705 + }, + { + "epoch": 3.2860650705954573, + "grad_norm": 0.2732730507850647, + "learning_rate": 7.838970505554277e-05, + "loss": 1.8061, + "step": 10706 + }, + { + "epoch": 3.2863720073664826, + "grad_norm": 0.23820067942142487, + "learning_rate": 7.838561329645923e-05, + "loss": 1.8091, + "step": 10707 + }, 
+ { + "epoch": 3.2866789441375075, + "grad_norm": 0.24179396033287048, + "learning_rate": 7.838152125685245e-05, + "loss": 1.7513, + "step": 10708 + }, + { + "epoch": 3.286985880908533, + "grad_norm": 0.2627546787261963, + "learning_rate": 7.837742893676283e-05, + "loss": 1.8741, + "step": 10709 + }, + { + "epoch": 3.287292817679558, + "grad_norm": 0.2827817499637604, + "learning_rate": 7.837333633623083e-05, + "loss": 1.8387, + "step": 10710 + }, + { + "epoch": 3.287599754450583, + "grad_norm": 0.2666749060153961, + "learning_rate": 7.836924345529688e-05, + "loss": 1.8319, + "step": 10711 + }, + { + "epoch": 3.2879066912216084, + "grad_norm": 0.3403390944004059, + "learning_rate": 7.836515029400145e-05, + "loss": 1.7827, + "step": 10712 + }, + { + "epoch": 3.2882136279926337, + "grad_norm": 0.30646705627441406, + "learning_rate": 7.836105685238497e-05, + "loss": 1.8612, + "step": 10713 + }, + { + "epoch": 3.2885205647636586, + "grad_norm": 0.2580253481864929, + "learning_rate": 7.83569631304879e-05, + "loss": 1.7332, + "step": 10714 + }, + { + "epoch": 3.288827501534684, + "grad_norm": 0.23734542727470398, + "learning_rate": 7.835286912835071e-05, + "loss": 1.7899, + "step": 10715 + }, + { + "epoch": 3.289134438305709, + "grad_norm": 0.2457810491323471, + "learning_rate": 7.834877484601384e-05, + "loss": 1.8059, + "step": 10716 + }, + { + "epoch": 3.289441375076734, + "grad_norm": 0.2558443248271942, + "learning_rate": 7.834468028351778e-05, + "loss": 1.8689, + "step": 10717 + }, + { + "epoch": 3.2897483118477595, + "grad_norm": 0.26596710085868835, + "learning_rate": 7.834058544090298e-05, + "loss": 1.816, + "step": 10718 + }, + { + "epoch": 3.2900552486187844, + "grad_norm": 0.25424903631210327, + "learning_rate": 7.833649031820987e-05, + "loss": 1.7907, + "step": 10719 + }, + { + "epoch": 3.2903621853898097, + "grad_norm": 0.23873139917850494, + "learning_rate": 7.833239491547896e-05, + "loss": 1.7666, + "step": 10720 + }, + { + "epoch": 3.2906691221608346, + 
"grad_norm": 0.23292972147464752, + "learning_rate": 7.832829923275073e-05, + "loss": 1.7674, + "step": 10721 + }, + { + "epoch": 3.29097605893186, + "grad_norm": 0.30133312940597534, + "learning_rate": 7.832420327006566e-05, + "loss": 1.8229, + "step": 10722 + }, + { + "epoch": 3.2912829957028853, + "grad_norm": 0.2882522642612457, + "learning_rate": 7.83201070274642e-05, + "loss": 1.7855, + "step": 10723 + }, + { + "epoch": 3.29158993247391, + "grad_norm": 0.2578088045120239, + "learning_rate": 7.831601050498683e-05, + "loss": 1.7276, + "step": 10724 + }, + { + "epoch": 3.2918968692449355, + "grad_norm": 0.29511600732803345, + "learning_rate": 7.831191370267406e-05, + "loss": 1.8085, + "step": 10725 + }, + { + "epoch": 3.292203806015961, + "grad_norm": 0.29557499289512634, + "learning_rate": 7.830781662056634e-05, + "loss": 1.815, + "step": 10726 + }, + { + "epoch": 3.2925107427869857, + "grad_norm": 0.32722121477127075, + "learning_rate": 7.830371925870422e-05, + "loss": 1.7889, + "step": 10727 + }, + { + "epoch": 3.292817679558011, + "grad_norm": 0.3124488592147827, + "learning_rate": 7.829962161712814e-05, + "loss": 1.8063, + "step": 10728 + }, + { + "epoch": 3.2931246163290364, + "grad_norm": 0.311334490776062, + "learning_rate": 7.829552369587861e-05, + "loss": 1.8852, + "step": 10729 + }, + { + "epoch": 3.2934315531000613, + "grad_norm": 0.28010860085487366, + "learning_rate": 7.829142549499613e-05, + "loss": 1.8274, + "step": 10730 + }, + { + "epoch": 3.2937384898710866, + "grad_norm": 0.3453529477119446, + "learning_rate": 7.828732701452119e-05, + "loss": 1.8618, + "step": 10731 + }, + { + "epoch": 3.2940454266421115, + "grad_norm": 0.2946802079677582, + "learning_rate": 7.828322825449432e-05, + "loss": 1.7123, + "step": 10732 + }, + { + "epoch": 3.294352363413137, + "grad_norm": 0.2467648684978485, + "learning_rate": 7.827912921495601e-05, + "loss": 1.7786, + "step": 10733 + }, + { + "epoch": 3.294659300184162, + "grad_norm": 0.2957034707069397, + 
"learning_rate": 7.827502989594677e-05, + "loss": 1.7817, + "step": 10734 + }, + { + "epoch": 3.294966236955187, + "grad_norm": 0.300905704498291, + "learning_rate": 7.827093029750713e-05, + "loss": 1.7582, + "step": 10735 + }, + { + "epoch": 3.2952731737262124, + "grad_norm": 0.28935131430625916, + "learning_rate": 7.826683041967757e-05, + "loss": 1.7766, + "step": 10736 + }, + { + "epoch": 3.2955801104972378, + "grad_norm": 0.26046010851860046, + "learning_rate": 7.826273026249861e-05, + "loss": 1.8152, + "step": 10737 + }, + { + "epoch": 3.2958870472682626, + "grad_norm": 0.24247924983501434, + "learning_rate": 7.82586298260108e-05, + "loss": 1.8679, + "step": 10738 + }, + { + "epoch": 3.296193984039288, + "grad_norm": 0.25977620482444763, + "learning_rate": 7.825452911025466e-05, + "loss": 1.8108, + "step": 10739 + }, + { + "epoch": 3.2965009208103133, + "grad_norm": 0.2732592821121216, + "learning_rate": 7.825042811527068e-05, + "loss": 1.7355, + "step": 10740 + }, + { + "epoch": 3.296807857581338, + "grad_norm": 0.38407859206199646, + "learning_rate": 7.824632684109941e-05, + "loss": 1.8418, + "step": 10741 + }, + { + "epoch": 3.2971147943523635, + "grad_norm": 0.4239252805709839, + "learning_rate": 7.82422252877814e-05, + "loss": 1.7655, + "step": 10742 + }, + { + "epoch": 3.2974217311233884, + "grad_norm": 0.3810526132583618, + "learning_rate": 7.823812345535716e-05, + "loss": 1.8804, + "step": 10743 + }, + { + "epoch": 3.2977286678944138, + "grad_norm": 0.29939520359039307, + "learning_rate": 7.823402134386722e-05, + "loss": 1.8207, + "step": 10744 + }, + { + "epoch": 3.298035604665439, + "grad_norm": 0.4053972065448761, + "learning_rate": 7.822991895335215e-05, + "loss": 1.7901, + "step": 10745 + }, + { + "epoch": 3.298342541436464, + "grad_norm": 0.4975005090236664, + "learning_rate": 7.822581628385247e-05, + "loss": 1.8344, + "step": 10746 + }, + { + "epoch": 3.2986494782074893, + "grad_norm": 0.4100436270236969, + "learning_rate": 
7.822171333540874e-05, + "loss": 1.7891, + "step": 10747 + }, + { + "epoch": 3.298956414978514, + "grad_norm": 0.2817644476890564, + "learning_rate": 7.821761010806147e-05, + "loss": 1.7895, + "step": 10748 + }, + { + "epoch": 3.2992633517495396, + "grad_norm": 0.332660973072052, + "learning_rate": 7.821350660185125e-05, + "loss": 1.7281, + "step": 10749 + }, + { + "epoch": 3.299570288520565, + "grad_norm": 0.42652732133865356, + "learning_rate": 7.820940281681863e-05, + "loss": 1.7855, + "step": 10750 + }, + { + "epoch": 3.2998772252915898, + "grad_norm": 0.35700714588165283, + "learning_rate": 7.820529875300415e-05, + "loss": 1.8722, + "step": 10751 + }, + { + "epoch": 3.300184162062615, + "grad_norm": 0.25305211544036865, + "learning_rate": 7.820119441044838e-05, + "loss": 1.7696, + "step": 10752 + }, + { + "epoch": 3.3004910988336404, + "grad_norm": 0.280205637216568, + "learning_rate": 7.819708978919188e-05, + "loss": 1.756, + "step": 10753 + }, + { + "epoch": 3.3007980356046653, + "grad_norm": 0.4176226854324341, + "learning_rate": 7.819298488927521e-05, + "loss": 1.7731, + "step": 10754 + }, + { + "epoch": 3.3011049723756907, + "grad_norm": 0.4264865517616272, + "learning_rate": 7.818887971073894e-05, + "loss": 1.7851, + "step": 10755 + }, + { + "epoch": 3.301411909146716, + "grad_norm": 0.2901221215724945, + "learning_rate": 7.818477425362363e-05, + "loss": 1.7356, + "step": 10756 + }, + { + "epoch": 3.301718845917741, + "grad_norm": 0.29583361744880676, + "learning_rate": 7.818066851796986e-05, + "loss": 1.8269, + "step": 10757 + }, + { + "epoch": 3.3020257826887662, + "grad_norm": 0.38592997193336487, + "learning_rate": 7.817656250381821e-05, + "loss": 1.7515, + "step": 10758 + }, + { + "epoch": 3.302332719459791, + "grad_norm": 0.29301533102989197, + "learning_rate": 7.817245621120927e-05, + "loss": 1.7955, + "step": 10759 + }, + { + "epoch": 3.3026396562308165, + "grad_norm": 0.2770880162715912, + "learning_rate": 7.816834964018359e-05, + "loss": 
1.7899, + "step": 10760 + }, + { + "epoch": 3.302946593001842, + "grad_norm": 0.32566413283348083, + "learning_rate": 7.816424279078176e-05, + "loss": 1.74, + "step": 10761 + }, + { + "epoch": 3.3032535297728667, + "grad_norm": 0.3077750504016876, + "learning_rate": 7.81601356630444e-05, + "loss": 1.8123, + "step": 10762 + }, + { + "epoch": 3.303560466543892, + "grad_norm": 0.2826370298862457, + "learning_rate": 7.815602825701206e-05, + "loss": 1.865, + "step": 10763 + }, + { + "epoch": 3.303867403314917, + "grad_norm": 0.31700822710990906, + "learning_rate": 7.815192057272534e-05, + "loss": 1.8021, + "step": 10764 + }, + { + "epoch": 3.3041743400859422, + "grad_norm": 0.33182790875434875, + "learning_rate": 7.814781261022486e-05, + "loss": 1.818, + "step": 10765 + }, + { + "epoch": 3.3044812768569676, + "grad_norm": 0.2720039486885071, + "learning_rate": 7.814370436955118e-05, + "loss": 1.8369, + "step": 10766 + }, + { + "epoch": 3.3047882136279925, + "grad_norm": 0.28134068846702576, + "learning_rate": 7.813959585074493e-05, + "loss": 1.8391, + "step": 10767 + }, + { + "epoch": 3.305095150399018, + "grad_norm": 0.25748828053474426, + "learning_rate": 7.813548705384667e-05, + "loss": 1.7987, + "step": 10768 + }, + { + "epoch": 3.305402087170043, + "grad_norm": 0.26187625527381897, + "learning_rate": 7.813137797889708e-05, + "loss": 1.7645, + "step": 10769 + }, + { + "epoch": 3.305709023941068, + "grad_norm": 0.297262579202652, + "learning_rate": 7.812726862593671e-05, + "loss": 1.771, + "step": 10770 + }, + { + "epoch": 3.3060159607120934, + "grad_norm": 0.2987872064113617, + "learning_rate": 7.812315899500618e-05, + "loss": 1.8115, + "step": 10771 + }, + { + "epoch": 3.3063228974831187, + "grad_norm": 0.31963878870010376, + "learning_rate": 7.81190490861461e-05, + "loss": 1.7685, + "step": 10772 + }, + { + "epoch": 3.3066298342541436, + "grad_norm": 0.27007177472114563, + "learning_rate": 7.81149388993971e-05, + "loss": 1.8272, + "step": 10773 + }, + { + "epoch": 
3.306936771025169, + "grad_norm": 0.26818498969078064, + "learning_rate": 7.811082843479981e-05, + "loss": 1.7894, + "step": 10774 + }, + { + "epoch": 3.307243707796194, + "grad_norm": 0.28857091069221497, + "learning_rate": 7.810671769239483e-05, + "loss": 1.8769, + "step": 10775 + }, + { + "epoch": 3.307550644567219, + "grad_norm": 0.26983144879341125, + "learning_rate": 7.810260667222277e-05, + "loss": 1.796, + "step": 10776 + }, + { + "epoch": 3.3078575813382445, + "grad_norm": 0.2566467225551605, + "learning_rate": 7.809849537432432e-05, + "loss": 1.848, + "step": 10777 + }, + { + "epoch": 3.3081645181092694, + "grad_norm": 0.25607848167419434, + "learning_rate": 7.809438379874005e-05, + "loss": 1.8072, + "step": 10778 + }, + { + "epoch": 3.3084714548802947, + "grad_norm": 0.29158470034599304, + "learning_rate": 7.809027194551059e-05, + "loss": 1.7772, + "step": 10779 + }, + { + "epoch": 3.3087783916513196, + "grad_norm": 0.360897421836853, + "learning_rate": 7.808615981467664e-05, + "loss": 1.8404, + "step": 10780 + }, + { + "epoch": 3.309085328422345, + "grad_norm": 0.31121253967285156, + "learning_rate": 7.808204740627877e-05, + "loss": 1.8137, + "step": 10781 + }, + { + "epoch": 3.3093922651933703, + "grad_norm": 0.2846451699733734, + "learning_rate": 7.807793472035765e-05, + "loss": 1.8367, + "step": 10782 + }, + { + "epoch": 3.309699201964395, + "grad_norm": 0.2711004316806793, + "learning_rate": 7.807382175695393e-05, + "loss": 1.7728, + "step": 10783 + }, + { + "epoch": 3.3100061387354205, + "grad_norm": 0.2693859338760376, + "learning_rate": 7.806970851610824e-05, + "loss": 1.7026, + "step": 10784 + }, + { + "epoch": 3.310313075506446, + "grad_norm": 0.3050517439842224, + "learning_rate": 7.806559499786125e-05, + "loss": 1.8041, + "step": 10785 + }, + { + "epoch": 3.3106200122774707, + "grad_norm": 0.27304747700691223, + "learning_rate": 7.80614812022536e-05, + "loss": 1.8182, + "step": 10786 + }, + { + "epoch": 3.310926949048496, + "grad_norm": 
0.28378555178642273, + "learning_rate": 7.805736712932594e-05, + "loss": 1.8519, + "step": 10787 + }, + { + "epoch": 3.3112338858195214, + "grad_norm": 0.30620133876800537, + "learning_rate": 7.805325277911892e-05, + "loss": 1.8594, + "step": 10788 + }, + { + "epoch": 3.3115408225905463, + "grad_norm": 0.2580169141292572, + "learning_rate": 7.804913815167325e-05, + "loss": 1.7897, + "step": 10789 + }, + { + "epoch": 3.3118477593615716, + "grad_norm": 0.28937023878097534, + "learning_rate": 7.804502324702951e-05, + "loss": 1.8362, + "step": 10790 + }, + { + "epoch": 3.3121546961325965, + "grad_norm": 0.28032705187797546, + "learning_rate": 7.804090806522844e-05, + "loss": 1.8168, + "step": 10791 + }, + { + "epoch": 3.312461632903622, + "grad_norm": 0.33712559938430786, + "learning_rate": 7.803679260631069e-05, + "loss": 1.7489, + "step": 10792 + }, + { + "epoch": 3.312768569674647, + "grad_norm": 0.40536820888519287, + "learning_rate": 7.80326768703169e-05, + "loss": 1.8413, + "step": 10793 + }, + { + "epoch": 3.313075506445672, + "grad_norm": 0.34967559576034546, + "learning_rate": 7.802856085728778e-05, + "loss": 1.8076, + "step": 10794 + }, + { + "epoch": 3.3133824432166974, + "grad_norm": 0.2429870367050171, + "learning_rate": 7.8024444567264e-05, + "loss": 1.8002, + "step": 10795 + }, + { + "epoch": 3.3136893799877223, + "grad_norm": 0.40956684947013855, + "learning_rate": 7.802032800028621e-05, + "loss": 1.8151, + "step": 10796 + }, + { + "epoch": 3.3139963167587476, + "grad_norm": 0.4908781945705414, + "learning_rate": 7.801621115639512e-05, + "loss": 1.8124, + "step": 10797 + }, + { + "epoch": 3.314303253529773, + "grad_norm": 0.3922197222709656, + "learning_rate": 7.801209403563143e-05, + "loss": 1.7911, + "step": 10798 + }, + { + "epoch": 3.314610190300798, + "grad_norm": 0.29467105865478516, + "learning_rate": 7.800797663803578e-05, + "loss": 1.8472, + "step": 10799 + }, + { + "epoch": 3.314917127071823, + "grad_norm": 0.384974867105484, + 
"learning_rate": 7.800385896364891e-05, + "loss": 1.8139, + "step": 10800 + }, + { + "epoch": 3.3152240638428485, + "grad_norm": 0.4605129063129425, + "learning_rate": 7.79997410125115e-05, + "loss": 1.7982, + "step": 10801 + }, + { + "epoch": 3.3155310006138734, + "grad_norm": 0.2982464134693146, + "learning_rate": 7.799562278466423e-05, + "loss": 1.8496, + "step": 10802 + }, + { + "epoch": 3.3158379373848987, + "grad_norm": 0.3101392984390259, + "learning_rate": 7.79915042801478e-05, + "loss": 1.8172, + "step": 10803 + }, + { + "epoch": 3.316144874155924, + "grad_norm": 0.3651282489299774, + "learning_rate": 7.798738549900292e-05, + "loss": 1.7497, + "step": 10804 + }, + { + "epoch": 3.316451810926949, + "grad_norm": 0.28504419326782227, + "learning_rate": 7.79832664412703e-05, + "loss": 1.8027, + "step": 10805 + }, + { + "epoch": 3.3167587476979743, + "grad_norm": 0.28333309292793274, + "learning_rate": 7.797914710699063e-05, + "loss": 1.8121, + "step": 10806 + }, + { + "epoch": 3.317065684468999, + "grad_norm": 0.37549784779548645, + "learning_rate": 7.797502749620462e-05, + "loss": 1.817, + "step": 10807 + }, + { + "epoch": 3.3173726212400245, + "grad_norm": 0.3864210844039917, + "learning_rate": 7.797090760895301e-05, + "loss": 1.852, + "step": 10808 + }, + { + "epoch": 3.31767955801105, + "grad_norm": 0.2422102987766266, + "learning_rate": 7.79667874452765e-05, + "loss": 1.7523, + "step": 10809 + }, + { + "epoch": 3.3179864947820747, + "grad_norm": 0.307892382144928, + "learning_rate": 7.79626670052158e-05, + "loss": 1.7436, + "step": 10810 + }, + { + "epoch": 3.3182934315531, + "grad_norm": 0.29607462882995605, + "learning_rate": 7.795854628881162e-05, + "loss": 1.768, + "step": 10811 + }, + { + "epoch": 3.3186003683241254, + "grad_norm": 0.23334427177906036, + "learning_rate": 7.795442529610471e-05, + "loss": 1.7687, + "step": 10812 + }, + { + "epoch": 3.3189073050951503, + "grad_norm": 0.26257455348968506, + "learning_rate": 7.795030402713578e-05, + 
"loss": 1.8266, + "step": 10813 + }, + { + "epoch": 3.3192142418661756, + "grad_norm": 0.3252788782119751, + "learning_rate": 7.794618248194556e-05, + "loss": 1.8645, + "step": 10814 + }, + { + "epoch": 3.319521178637201, + "grad_norm": 0.3807232975959778, + "learning_rate": 7.79420606605748e-05, + "loss": 1.8154, + "step": 10815 + }, + { + "epoch": 3.319828115408226, + "grad_norm": 0.3395625948905945, + "learning_rate": 7.793793856306422e-05, + "loss": 1.8002, + "step": 10816 + }, + { + "epoch": 3.320135052179251, + "grad_norm": 0.2896415889263153, + "learning_rate": 7.793381618945455e-05, + "loss": 1.8077, + "step": 10817 + }, + { + "epoch": 3.320441988950276, + "grad_norm": 0.27733489871025085, + "learning_rate": 7.792969353978652e-05, + "loss": 1.7976, + "step": 10818 + }, + { + "epoch": 3.3207489257213014, + "grad_norm": 0.36985141038894653, + "learning_rate": 7.79255706141009e-05, + "loss": 1.8724, + "step": 10819 + }, + { + "epoch": 3.3210558624923268, + "grad_norm": 0.37886983156204224, + "learning_rate": 7.792144741243843e-05, + "loss": 1.8249, + "step": 10820 + }, + { + "epoch": 3.3213627992633517, + "grad_norm": 0.3030721843242645, + "learning_rate": 7.791732393483986e-05, + "loss": 1.7975, + "step": 10821 + }, + { + "epoch": 3.321669736034377, + "grad_norm": 0.2637709081172943, + "learning_rate": 7.791320018134592e-05, + "loss": 1.7205, + "step": 10822 + }, + { + "epoch": 3.321976672805402, + "grad_norm": 0.35307520627975464, + "learning_rate": 7.790907615199736e-05, + "loss": 1.8786, + "step": 10823 + }, + { + "epoch": 3.322283609576427, + "grad_norm": 0.3333272635936737, + "learning_rate": 7.790495184683497e-05, + "loss": 1.7715, + "step": 10824 + }, + { + "epoch": 3.3225905463474525, + "grad_norm": 0.2597469091415405, + "learning_rate": 7.790082726589948e-05, + "loss": 1.8379, + "step": 10825 + }, + { + "epoch": 3.3228974831184774, + "grad_norm": 0.34176257252693176, + "learning_rate": 7.789670240923168e-05, + "loss": 1.8305, + "step": 10826 + }, + { 
+ "epoch": 3.3232044198895028, + "grad_norm": 0.37954533100128174, + "learning_rate": 7.789257727687229e-05, + "loss": 1.7728, + "step": 10827 + }, + { + "epoch": 3.323511356660528, + "grad_norm": 0.2840248644351959, + "learning_rate": 7.788845186886212e-05, + "loss": 1.8059, + "step": 10828 + }, + { + "epoch": 3.323818293431553, + "grad_norm": 0.3650275766849518, + "learning_rate": 7.788432618524193e-05, + "loss": 1.8127, + "step": 10829 + }, + { + "epoch": 3.3241252302025783, + "grad_norm": 0.4869692623615265, + "learning_rate": 7.788020022605247e-05, + "loss": 1.833, + "step": 10830 + }, + { + "epoch": 3.3244321669736037, + "grad_norm": 0.3419482707977295, + "learning_rate": 7.787607399133453e-05, + "loss": 1.7812, + "step": 10831 + }, + { + "epoch": 3.3247391037446286, + "grad_norm": 0.27625617384910583, + "learning_rate": 7.787194748112889e-05, + "loss": 1.8513, + "step": 10832 + }, + { + "epoch": 3.325046040515654, + "grad_norm": 0.4287806749343872, + "learning_rate": 7.786782069547633e-05, + "loss": 1.836, + "step": 10833 + }, + { + "epoch": 3.325352977286679, + "grad_norm": 0.4345545172691345, + "learning_rate": 7.786369363441763e-05, + "loss": 1.8027, + "step": 10834 + }, + { + "epoch": 3.325659914057704, + "grad_norm": 0.32976534962654114, + "learning_rate": 7.78595662979936e-05, + "loss": 1.7987, + "step": 10835 + }, + { + "epoch": 3.3259668508287294, + "grad_norm": 0.2677469849586487, + "learning_rate": 7.785543868624498e-05, + "loss": 1.8312, + "step": 10836 + }, + { + "epoch": 3.3262737875997543, + "grad_norm": 0.2547740638256073, + "learning_rate": 7.785131079921259e-05, + "loss": 1.7844, + "step": 10837 + }, + { + "epoch": 3.3265807243707797, + "grad_norm": 0.26755592226982117, + "learning_rate": 7.784718263693725e-05, + "loss": 1.8263, + "step": 10838 + }, + { + "epoch": 3.3268876611418046, + "grad_norm": 0.23884403705596924, + "learning_rate": 7.784305419945969e-05, + "loss": 1.7862, + "step": 10839 + }, + { + "epoch": 3.32719459791283, + 
"grad_norm": 0.2896903157234192, + "learning_rate": 7.783892548682077e-05, + "loss": 1.9138, + "step": 10840 + }, + { + "epoch": 3.3275015346838552, + "grad_norm": 0.3201359510421753, + "learning_rate": 7.783479649906127e-05, + "loss": 1.8382, + "step": 10841 + }, + { + "epoch": 3.32780847145488, + "grad_norm": 0.39285311102867126, + "learning_rate": 7.7830667236222e-05, + "loss": 1.7763, + "step": 10842 + }, + { + "epoch": 3.3281154082259055, + "grad_norm": 0.435007244348526, + "learning_rate": 7.782653769834376e-05, + "loss": 1.8415, + "step": 10843 + }, + { + "epoch": 3.328422344996931, + "grad_norm": 0.34605318307876587, + "learning_rate": 7.782240788546736e-05, + "loss": 1.757, + "step": 10844 + }, + { + "epoch": 3.3287292817679557, + "grad_norm": 0.26830604672431946, + "learning_rate": 7.781827779763362e-05, + "loss": 1.7779, + "step": 10845 + }, + { + "epoch": 3.329036218538981, + "grad_norm": 0.41851529479026794, + "learning_rate": 7.781414743488336e-05, + "loss": 1.8609, + "step": 10846 + }, + { + "epoch": 3.3293431553100064, + "grad_norm": 0.5058079361915588, + "learning_rate": 7.78100167972574e-05, + "loss": 1.8146, + "step": 10847 + }, + { + "epoch": 3.3296500920810312, + "grad_norm": 0.34394967555999756, + "learning_rate": 7.780588588479654e-05, + "loss": 1.8079, + "step": 10848 + }, + { + "epoch": 3.3299570288520566, + "grad_norm": 0.3033885061740875, + "learning_rate": 7.780175469754161e-05, + "loss": 1.8223, + "step": 10849 + }, + { + "epoch": 3.3302639656230815, + "grad_norm": 0.4431045651435852, + "learning_rate": 7.779762323553347e-05, + "loss": 1.8841, + "step": 10850 + }, + { + "epoch": 3.330570902394107, + "grad_norm": 0.3451448976993561, + "learning_rate": 7.77934914988129e-05, + "loss": 1.8092, + "step": 10851 + }, + { + "epoch": 3.330877839165132, + "grad_norm": 0.26580891013145447, + "learning_rate": 7.778935948742077e-05, + "loss": 1.8244, + "step": 10852 + }, + { + "epoch": 3.331184775936157, + "grad_norm": 0.32079070806503296, + 
"learning_rate": 7.778522720139792e-05, + "loss": 1.7816, + "step": 10853 + }, + { + "epoch": 3.3314917127071824, + "grad_norm": 0.35789042711257935, + "learning_rate": 7.778109464078514e-05, + "loss": 1.8211, + "step": 10854 + }, + { + "epoch": 3.3317986494782073, + "grad_norm": 0.2808612585067749, + "learning_rate": 7.77769618056233e-05, + "loss": 1.8387, + "step": 10855 + }, + { + "epoch": 3.3321055862492326, + "grad_norm": 0.24760548770427704, + "learning_rate": 7.777282869595326e-05, + "loss": 1.7795, + "step": 10856 + }, + { + "epoch": 3.332412523020258, + "grad_norm": 0.2840912640094757, + "learning_rate": 7.776869531181583e-05, + "loss": 1.7492, + "step": 10857 + }, + { + "epoch": 3.332719459791283, + "grad_norm": 0.2881413698196411, + "learning_rate": 7.77645616532519e-05, + "loss": 1.8157, + "step": 10858 + }, + { + "epoch": 3.333026396562308, + "grad_norm": 0.2508779764175415, + "learning_rate": 7.776042772030228e-05, + "loss": 1.8196, + "step": 10859 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3307822048664093, + "learning_rate": 7.775629351300785e-05, + "loss": 1.8195, + "step": 10860 + }, + { + "epoch": 3.3336402701043584, + "grad_norm": 0.34392043948173523, + "learning_rate": 7.775215903140946e-05, + "loss": 1.7775, + "step": 10861 + }, + { + "epoch": 3.3339472068753837, + "grad_norm": 0.2594252824783325, + "learning_rate": 7.774802427554796e-05, + "loss": 1.7687, + "step": 10862 + }, + { + "epoch": 3.334254143646409, + "grad_norm": 0.3109053075313568, + "learning_rate": 7.774388924546423e-05, + "loss": 1.7908, + "step": 10863 + }, + { + "epoch": 3.334561080417434, + "grad_norm": 0.4801923930644989, + "learning_rate": 7.773975394119913e-05, + "loss": 1.8316, + "step": 10864 + }, + { + "epoch": 3.3348680171884593, + "grad_norm": 0.4754973351955414, + "learning_rate": 7.77356183627935e-05, + "loss": 1.8015, + "step": 10865 + }, + { + "epoch": 3.335174953959484, + "grad_norm": 0.29624658823013306, + "learning_rate": 7.773148251028825e-05, + 
"loss": 1.8179, + "step": 10866 + }, + { + "epoch": 3.3354818907305095, + "grad_norm": 0.32207581400871277, + "learning_rate": 7.772734638372423e-05, + "loss": 1.799, + "step": 10867 + }, + { + "epoch": 3.335788827501535, + "grad_norm": 0.5227517485618591, + "learning_rate": 7.772320998314233e-05, + "loss": 1.8452, + "step": 10868 + }, + { + "epoch": 3.3360957642725597, + "grad_norm": 0.4081100523471832, + "learning_rate": 7.771907330858341e-05, + "loss": 1.8182, + "step": 10869 + }, + { + "epoch": 3.336402701043585, + "grad_norm": 0.23786653578281403, + "learning_rate": 7.771493636008838e-05, + "loss": 1.7392, + "step": 10870 + }, + { + "epoch": 3.33670963781461, + "grad_norm": 0.37913820147514343, + "learning_rate": 7.771079913769807e-05, + "loss": 1.7559, + "step": 10871 + }, + { + "epoch": 3.3370165745856353, + "grad_norm": 0.4939163625240326, + "learning_rate": 7.770666164145344e-05, + "loss": 1.8076, + "step": 10872 + }, + { + "epoch": 3.3373235113566606, + "grad_norm": 0.3322528302669525, + "learning_rate": 7.770252387139532e-05, + "loss": 1.8045, + "step": 10873 + }, + { + "epoch": 3.337630448127686, + "grad_norm": 0.3685782849788666, + "learning_rate": 7.769838582756461e-05, + "loss": 1.7703, + "step": 10874 + }, + { + "epoch": 3.337937384898711, + "grad_norm": 0.5564271807670593, + "learning_rate": 7.769424751000224e-05, + "loss": 1.7697, + "step": 10875 + }, + { + "epoch": 3.338244321669736, + "grad_norm": 0.38610726594924927, + "learning_rate": 7.769010891874906e-05, + "loss": 1.7944, + "step": 10876 + }, + { + "epoch": 3.338551258440761, + "grad_norm": 0.23838558793067932, + "learning_rate": 7.768597005384602e-05, + "loss": 1.765, + "step": 10877 + }, + { + "epoch": 3.3388581952117864, + "grad_norm": 0.4334571063518524, + "learning_rate": 7.768183091533399e-05, + "loss": 1.7854, + "step": 10878 + }, + { + "epoch": 3.3391651319828117, + "grad_norm": 0.44844719767570496, + "learning_rate": 7.767769150325386e-05, + "loss": 1.7955, + "step": 10879 + }, + { 
+ "epoch": 3.3394720687538366, + "grad_norm": 0.26543378829956055, + "learning_rate": 7.767355181764659e-05, + "loss": 1.8311, + "step": 10880 + }, + { + "epoch": 3.339779005524862, + "grad_norm": 0.39401358366012573, + "learning_rate": 7.766941185855304e-05, + "loss": 1.8264, + "step": 10881 + }, + { + "epoch": 3.340085942295887, + "grad_norm": 0.5476824045181274, + "learning_rate": 7.766527162601416e-05, + "loss": 1.8051, + "step": 10882 + }, + { + "epoch": 3.340392879066912, + "grad_norm": 0.4021138548851013, + "learning_rate": 7.766113112007084e-05, + "loss": 1.7941, + "step": 10883 + }, + { + "epoch": 3.3406998158379375, + "grad_norm": 0.3262040317058563, + "learning_rate": 7.765699034076402e-05, + "loss": 1.8317, + "step": 10884 + }, + { + "epoch": 3.3410067526089624, + "grad_norm": 0.5461146831512451, + "learning_rate": 7.765284928813459e-05, + "loss": 1.833, + "step": 10885 + }, + { + "epoch": 3.3413136893799877, + "grad_norm": 0.5067405700683594, + "learning_rate": 7.764870796222351e-05, + "loss": 1.7862, + "step": 10886 + }, + { + "epoch": 3.341620626151013, + "grad_norm": 0.2731069028377533, + "learning_rate": 7.76445663630717e-05, + "loss": 1.8173, + "step": 10887 + }, + { + "epoch": 3.341927562922038, + "grad_norm": 0.48928195238113403, + "learning_rate": 7.764042449072008e-05, + "loss": 1.7992, + "step": 10888 + }, + { + "epoch": 3.3422344996930633, + "grad_norm": 0.5338504910469055, + "learning_rate": 7.763628234520958e-05, + "loss": 1.7891, + "step": 10889 + }, + { + "epoch": 3.3425414364640886, + "grad_norm": 0.3136523365974426, + "learning_rate": 7.763213992658114e-05, + "loss": 1.8623, + "step": 10890 + }, + { + "epoch": 3.3428483732351135, + "grad_norm": 0.36551395058631897, + "learning_rate": 7.762799723487568e-05, + "loss": 1.8474, + "step": 10891 + }, + { + "epoch": 3.343155310006139, + "grad_norm": 0.35772353410720825, + "learning_rate": 7.762385427013419e-05, + "loss": 1.8625, + "step": 10892 + }, + { + "epoch": 3.3434622467771637, + 
"grad_norm": 0.29944708943367004, + "learning_rate": 7.761971103239755e-05, + "loss": 1.8181, + "step": 10893 + }, + { + "epoch": 3.343769183548189, + "grad_norm": 0.3395330309867859, + "learning_rate": 7.761556752170676e-05, + "loss": 1.7943, + "step": 10894 + }, + { + "epoch": 3.3440761203192144, + "grad_norm": 0.3624265193939209, + "learning_rate": 7.761142373810274e-05, + "loss": 1.8234, + "step": 10895 + }, + { + "epoch": 3.3443830570902393, + "grad_norm": 0.25409621000289917, + "learning_rate": 7.760727968162644e-05, + "loss": 1.7532, + "step": 10896 + }, + { + "epoch": 3.3446899938612646, + "grad_norm": 0.321437805891037, + "learning_rate": 7.760313535231883e-05, + "loss": 1.8808, + "step": 10897 + }, + { + "epoch": 3.3449969306322895, + "grad_norm": 0.2919142544269562, + "learning_rate": 7.759899075022086e-05, + "loss": 1.7677, + "step": 10898 + }, + { + "epoch": 3.345303867403315, + "grad_norm": 0.26515716314315796, + "learning_rate": 7.759484587537346e-05, + "loss": 1.8118, + "step": 10899 + }, + { + "epoch": 3.34561080417434, + "grad_norm": 0.2963240146636963, + "learning_rate": 7.759070072781764e-05, + "loss": 1.8329, + "step": 10900 + }, + { + "epoch": 3.345917740945365, + "grad_norm": 0.3186480700969696, + "learning_rate": 7.758655530759435e-05, + "loss": 1.8013, + "step": 10901 + }, + { + "epoch": 3.3462246777163904, + "grad_norm": 0.256145715713501, + "learning_rate": 7.758240961474454e-05, + "loss": 1.7865, + "step": 10902 + }, + { + "epoch": 3.3465316144874158, + "grad_norm": 0.28951629996299744, + "learning_rate": 7.757826364930921e-05, + "loss": 1.8091, + "step": 10903 + }, + { + "epoch": 3.3468385512584407, + "grad_norm": 0.2692483365535736, + "learning_rate": 7.75741174113293e-05, + "loss": 1.8308, + "step": 10904 + }, + { + "epoch": 3.347145488029466, + "grad_norm": 0.27615389227867126, + "learning_rate": 7.75699709008458e-05, + "loss": 1.7888, + "step": 10905 + }, + { + "epoch": 3.3474524248004913, + "grad_norm": 0.2819034457206726, + 
"learning_rate": 7.75658241178997e-05, + "loss": 1.7624, + "step": 10906 + }, + { + "epoch": 3.347759361571516, + "grad_norm": 0.2627592086791992, + "learning_rate": 7.756167706253196e-05, + "loss": 1.7696, + "step": 10907 + }, + { + "epoch": 3.3480662983425415, + "grad_norm": 0.3528621196746826, + "learning_rate": 7.755752973478356e-05, + "loss": 1.7725, + "step": 10908 + }, + { + "epoch": 3.3483732351135664, + "grad_norm": 0.35949698090553284, + "learning_rate": 7.755338213469552e-05, + "loss": 1.8163, + "step": 10909 + }, + { + "epoch": 3.3486801718845918, + "grad_norm": 0.25142577290534973, + "learning_rate": 7.75492342623088e-05, + "loss": 1.7879, + "step": 10910 + }, + { + "epoch": 3.348987108655617, + "grad_norm": 0.25766023993492126, + "learning_rate": 7.75450861176644e-05, + "loss": 1.8143, + "step": 10911 + }, + { + "epoch": 3.349294045426642, + "grad_norm": 0.2736956477165222, + "learning_rate": 7.754093770080331e-05, + "loss": 1.8907, + "step": 10912 + }, + { + "epoch": 3.3496009821976673, + "grad_norm": 0.23700755834579468, + "learning_rate": 7.753678901176654e-05, + "loss": 1.813, + "step": 10913 + }, + { + "epoch": 3.349907918968692, + "grad_norm": 0.245509073138237, + "learning_rate": 7.753264005059507e-05, + "loss": 1.8019, + "step": 10914 + }, + { + "epoch": 3.3502148557397176, + "grad_norm": 0.232910618185997, + "learning_rate": 7.752849081732993e-05, + "loss": 1.784, + "step": 10915 + }, + { + "epoch": 3.350521792510743, + "grad_norm": 0.22989360988140106, + "learning_rate": 7.75243413120121e-05, + "loss": 1.7597, + "step": 10916 + }, + { + "epoch": 3.350828729281768, + "grad_norm": 0.2093925178050995, + "learning_rate": 7.752019153468258e-05, + "loss": 1.7698, + "step": 10917 + }, + { + "epoch": 3.351135666052793, + "grad_norm": 0.25539630651474, + "learning_rate": 7.751604148538241e-05, + "loss": 1.8287, + "step": 10918 + }, + { + "epoch": 3.3514426028238185, + "grad_norm": 0.2731820046901703, + "learning_rate": 7.75118911641526e-05, + "loss": 
1.8862, + "step": 10919 + }, + { + "epoch": 3.3517495395948433, + "grad_norm": 0.2464541345834732, + "learning_rate": 7.750774057103416e-05, + "loss": 1.8165, + "step": 10920 + }, + { + "epoch": 3.3520564763658687, + "grad_norm": 0.26380276679992676, + "learning_rate": 7.75035897060681e-05, + "loss": 1.78, + "step": 10921 + }, + { + "epoch": 3.352363413136894, + "grad_norm": 0.3080748915672302, + "learning_rate": 7.749943856929542e-05, + "loss": 1.7925, + "step": 10922 + }, + { + "epoch": 3.352670349907919, + "grad_norm": 0.317754864692688, + "learning_rate": 7.74952871607572e-05, + "loss": 1.8248, + "step": 10923 + }, + { + "epoch": 3.3529772866789442, + "grad_norm": 0.2525196373462677, + "learning_rate": 7.749113548049442e-05, + "loss": 1.762, + "step": 10924 + }, + { + "epoch": 3.353284223449969, + "grad_norm": 0.3149549961090088, + "learning_rate": 7.748698352854814e-05, + "loss": 1.8289, + "step": 10925 + }, + { + "epoch": 3.3535911602209945, + "grad_norm": 0.35744383931159973, + "learning_rate": 7.748283130495937e-05, + "loss": 1.8132, + "step": 10926 + }, + { + "epoch": 3.35389809699202, + "grad_norm": 0.28599128127098083, + "learning_rate": 7.747867880976916e-05, + "loss": 1.7351, + "step": 10927 + }, + { + "epoch": 3.3542050337630447, + "grad_norm": 0.24428869783878326, + "learning_rate": 7.747452604301852e-05, + "loss": 1.794, + "step": 10928 + }, + { + "epoch": 3.35451197053407, + "grad_norm": 0.29067808389663696, + "learning_rate": 7.747037300474854e-05, + "loss": 1.8181, + "step": 10929 + }, + { + "epoch": 3.354818907305095, + "grad_norm": 0.32417505979537964, + "learning_rate": 7.746621969500021e-05, + "loss": 1.8338, + "step": 10930 + }, + { + "epoch": 3.3551258440761202, + "grad_norm": 0.29536551237106323, + "learning_rate": 7.746206611381462e-05, + "loss": 1.8732, + "step": 10931 + }, + { + "epoch": 3.3554327808471456, + "grad_norm": 0.3169345259666443, + "learning_rate": 7.745791226123278e-05, + "loss": 1.876, + "step": 10932 + }, + { + "epoch": 
3.3557397176181705, + "grad_norm": 0.2680271565914154, + "learning_rate": 7.745375813729576e-05, + "loss": 1.7347, + "step": 10933 + }, + { + "epoch": 3.356046654389196, + "grad_norm": 0.28339266777038574, + "learning_rate": 7.74496037420446e-05, + "loss": 1.8507, + "step": 10934 + }, + { + "epoch": 3.356353591160221, + "grad_norm": 0.2567409574985504, + "learning_rate": 7.744544907552038e-05, + "loss": 1.8244, + "step": 10935 + }, + { + "epoch": 3.356660527931246, + "grad_norm": 0.266063928604126, + "learning_rate": 7.744129413776416e-05, + "loss": 1.7864, + "step": 10936 + }, + { + "epoch": 3.3569674647022714, + "grad_norm": 0.2490999698638916, + "learning_rate": 7.743713892881696e-05, + "loss": 1.7637, + "step": 10937 + }, + { + "epoch": 3.3572744014732967, + "grad_norm": 0.25857025384902954, + "learning_rate": 7.743298344871988e-05, + "loss": 1.8101, + "step": 10938 + }, + { + "epoch": 3.3575813382443216, + "grad_norm": 0.2549006938934326, + "learning_rate": 7.742882769751398e-05, + "loss": 1.7782, + "step": 10939 + }, + { + "epoch": 3.357888275015347, + "grad_norm": 0.23915350437164307, + "learning_rate": 7.742467167524035e-05, + "loss": 1.7822, + "step": 10940 + }, + { + "epoch": 3.358195211786372, + "grad_norm": 0.25501590967178345, + "learning_rate": 7.742051538194e-05, + "loss": 1.798, + "step": 10941 + }, + { + "epoch": 3.358502148557397, + "grad_norm": 0.29332005977630615, + "learning_rate": 7.741635881765408e-05, + "loss": 1.8334, + "step": 10942 + }, + { + "epoch": 3.3588090853284225, + "grad_norm": 0.28878241777420044, + "learning_rate": 7.741220198242362e-05, + "loss": 1.8266, + "step": 10943 + }, + { + "epoch": 3.3591160220994474, + "grad_norm": 0.3068650960922241, + "learning_rate": 7.740804487628971e-05, + "loss": 1.8562, + "step": 10944 + }, + { + "epoch": 3.3594229588704727, + "grad_norm": 0.2522405683994293, + "learning_rate": 7.740388749929343e-05, + "loss": 1.8001, + "step": 10945 + }, + { + "epoch": 3.359729895641498, + "grad_norm": 
0.3073521554470062, + "learning_rate": 7.739972985147588e-05, + "loss": 1.7454, + "step": 10946 + }, + { + "epoch": 3.360036832412523, + "grad_norm": 0.3018052577972412, + "learning_rate": 7.739557193287815e-05, + "loss": 1.7888, + "step": 10947 + }, + { + "epoch": 3.3603437691835483, + "grad_norm": 0.2738604247570038, + "learning_rate": 7.73914137435413e-05, + "loss": 1.7208, + "step": 10948 + }, + { + "epoch": 3.3606507059545736, + "grad_norm": 0.37699586153030396, + "learning_rate": 7.738725528350646e-05, + "loss": 1.8175, + "step": 10949 + }, + { + "epoch": 3.3609576427255985, + "grad_norm": 0.3479778468608856, + "learning_rate": 7.738309655281471e-05, + "loss": 1.818, + "step": 10950 + }, + { + "epoch": 3.361264579496624, + "grad_norm": 0.24871166050434113, + "learning_rate": 7.737893755150715e-05, + "loss": 1.7046, + "step": 10951 + }, + { + "epoch": 3.3615715162676487, + "grad_norm": 0.45015642046928406, + "learning_rate": 7.737477827962488e-05, + "loss": 1.8517, + "step": 10952 + }, + { + "epoch": 3.361878453038674, + "grad_norm": 0.4149077534675598, + "learning_rate": 7.7370618737209e-05, + "loss": 1.7403, + "step": 10953 + }, + { + "epoch": 3.3621853898096994, + "grad_norm": 0.2556059658527374, + "learning_rate": 7.736645892430064e-05, + "loss": 1.8167, + "step": 10954 + }, + { + "epoch": 3.3624923265807243, + "grad_norm": 0.3153657615184784, + "learning_rate": 7.736229884094088e-05, + "loss": 1.8471, + "step": 10955 + }, + { + "epoch": 3.3627992633517496, + "grad_norm": 0.27943772077560425, + "learning_rate": 7.735813848717084e-05, + "loss": 1.7742, + "step": 10956 + }, + { + "epoch": 3.3631062001227745, + "grad_norm": 0.28270283341407776, + "learning_rate": 7.735397786303164e-05, + "loss": 1.8418, + "step": 10957 + }, + { + "epoch": 3.3634131368938, + "grad_norm": 0.3596261441707611, + "learning_rate": 7.734981696856442e-05, + "loss": 1.8213, + "step": 10958 + }, + { + "epoch": 3.363720073664825, + "grad_norm": 0.3678492307662964, + "learning_rate": 
7.734565580381026e-05, + "loss": 1.806, + "step": 10959 + }, + { + "epoch": 3.36402701043585, + "grad_norm": 0.27758681774139404, + "learning_rate": 7.734149436881031e-05, + "loss": 1.7832, + "step": 10960 + }, + { + "epoch": 3.3643339472068754, + "grad_norm": 0.2821379005908966, + "learning_rate": 7.733733266360568e-05, + "loss": 1.8888, + "step": 10961 + }, + { + "epoch": 3.3646408839779007, + "grad_norm": 0.33676958084106445, + "learning_rate": 7.733317068823751e-05, + "loss": 1.902, + "step": 10962 + }, + { + "epoch": 3.3649478207489256, + "grad_norm": 0.3116114139556885, + "learning_rate": 7.732900844274691e-05, + "loss": 1.8228, + "step": 10963 + }, + { + "epoch": 3.365254757519951, + "grad_norm": 0.3286324143409729, + "learning_rate": 7.732484592717506e-05, + "loss": 1.8707, + "step": 10964 + }, + { + "epoch": 3.3655616942909763, + "grad_norm": 0.2732192873954773, + "learning_rate": 7.732068314156304e-05, + "loss": 1.773, + "step": 10965 + }, + { + "epoch": 3.365868631062001, + "grad_norm": 0.26663896441459656, + "learning_rate": 7.731652008595204e-05, + "loss": 1.7837, + "step": 10966 + }, + { + "epoch": 3.3661755678330265, + "grad_norm": 0.27447745203971863, + "learning_rate": 7.731235676038317e-05, + "loss": 1.9103, + "step": 10967 + }, + { + "epoch": 3.3664825046040514, + "grad_norm": 0.30832916498184204, + "learning_rate": 7.730819316489757e-05, + "loss": 1.7552, + "step": 10968 + }, + { + "epoch": 3.3667894413750767, + "grad_norm": 0.29657161235809326, + "learning_rate": 7.73040292995364e-05, + "loss": 1.7654, + "step": 10969 + }, + { + "epoch": 3.367096378146102, + "grad_norm": 0.30434274673461914, + "learning_rate": 7.729986516434082e-05, + "loss": 1.8646, + "step": 10970 + }, + { + "epoch": 3.367403314917127, + "grad_norm": 0.25926661491394043, + "learning_rate": 7.729570075935198e-05, + "loss": 1.7555, + "step": 10971 + }, + { + "epoch": 3.3677102516881523, + "grad_norm": 0.2775980532169342, + "learning_rate": 7.729153608461102e-05, + "loss": 
1.8427, + "step": 10972 + }, + { + "epoch": 3.368017188459177, + "grad_norm": 0.23915666341781616, + "learning_rate": 7.72873711401591e-05, + "loss": 1.7902, + "step": 10973 + }, + { + "epoch": 3.3683241252302025, + "grad_norm": 0.2603691518306732, + "learning_rate": 7.728320592603737e-05, + "loss": 1.8587, + "step": 10974 + }, + { + "epoch": 3.368631062001228, + "grad_norm": 0.2579508125782013, + "learning_rate": 7.727904044228703e-05, + "loss": 1.7617, + "step": 10975 + }, + { + "epoch": 3.3689379987722528, + "grad_norm": 0.3384297788143158, + "learning_rate": 7.72748746889492e-05, + "loss": 1.8499, + "step": 10976 + }, + { + "epoch": 3.369244935543278, + "grad_norm": 0.36756646633148193, + "learning_rate": 7.727070866606509e-05, + "loss": 1.808, + "step": 10977 + }, + { + "epoch": 3.3695518723143034, + "grad_norm": 0.3212372958660126, + "learning_rate": 7.726654237367587e-05, + "loss": 1.8245, + "step": 10978 + }, + { + "epoch": 3.3698588090853283, + "grad_norm": 0.23782415688037872, + "learning_rate": 7.726237581182267e-05, + "loss": 1.7629, + "step": 10979 + }, + { + "epoch": 3.3701657458563536, + "grad_norm": 0.2782919108867645, + "learning_rate": 7.725820898054669e-05, + "loss": 1.8, + "step": 10980 + }, + { + "epoch": 3.370472682627379, + "grad_norm": 0.2973455488681793, + "learning_rate": 7.725404187988914e-05, + "loss": 1.7949, + "step": 10981 + }, + { + "epoch": 3.370779619398404, + "grad_norm": 0.2875392735004425, + "learning_rate": 7.724987450989114e-05, + "loss": 1.8019, + "step": 10982 + }, + { + "epoch": 3.371086556169429, + "grad_norm": 0.26133236289024353, + "learning_rate": 7.724570687059394e-05, + "loss": 1.7984, + "step": 10983 + }, + { + "epoch": 3.371393492940454, + "grad_norm": 0.2760173976421356, + "learning_rate": 7.724153896203867e-05, + "loss": 1.8082, + "step": 10984 + }, + { + "epoch": 3.3717004297114794, + "grad_norm": 0.26373061537742615, + "learning_rate": 7.723737078426656e-05, + "loss": 1.8408, + "step": 10985 + }, + { + "epoch": 
3.3720073664825048, + "grad_norm": 0.29425618052482605, + "learning_rate": 7.723320233731879e-05, + "loss": 1.7992, + "step": 10986 + }, + { + "epoch": 3.3723143032535297, + "grad_norm": 0.29822099208831787, + "learning_rate": 7.722903362123655e-05, + "loss": 1.8204, + "step": 10987 + }, + { + "epoch": 3.372621240024555, + "grad_norm": 0.25945618748664856, + "learning_rate": 7.722486463606104e-05, + "loss": 1.7376, + "step": 10988 + }, + { + "epoch": 3.37292817679558, + "grad_norm": 0.26367196440696716, + "learning_rate": 7.722069538183345e-05, + "loss": 1.814, + "step": 10989 + }, + { + "epoch": 3.373235113566605, + "grad_norm": 0.25015249848365784, + "learning_rate": 7.7216525858595e-05, + "loss": 1.8199, + "step": 10990 + }, + { + "epoch": 3.3735420503376305, + "grad_norm": 0.3035781681537628, + "learning_rate": 7.72123560663869e-05, + "loss": 1.739, + "step": 10991 + }, + { + "epoch": 3.3738489871086554, + "grad_norm": 0.2847912013530731, + "learning_rate": 7.720818600525033e-05, + "loss": 1.8754, + "step": 10992 + }, + { + "epoch": 3.3741559238796808, + "grad_norm": 0.2533976435661316, + "learning_rate": 7.720401567522653e-05, + "loss": 1.7616, + "step": 10993 + }, + { + "epoch": 3.374462860650706, + "grad_norm": 0.250828355550766, + "learning_rate": 7.719984507635669e-05, + "loss": 1.7973, + "step": 10994 + }, + { + "epoch": 3.374769797421731, + "grad_norm": 0.3019898235797882, + "learning_rate": 7.719567420868206e-05, + "loss": 1.7563, + "step": 10995 + }, + { + "epoch": 3.3750767341927563, + "grad_norm": 0.2703310549259186, + "learning_rate": 7.719150307224382e-05, + "loss": 1.8183, + "step": 10996 + }, + { + "epoch": 3.3753836709637817, + "grad_norm": 0.2434745579957962, + "learning_rate": 7.718733166708321e-05, + "loss": 1.7913, + "step": 10997 + }, + { + "epoch": 3.3756906077348066, + "grad_norm": 0.28036773204803467, + "learning_rate": 7.718315999324146e-05, + "loss": 1.7884, + "step": 10998 + }, + { + "epoch": 3.375997544505832, + "grad_norm": 
0.25123077630996704, + "learning_rate": 7.717898805075978e-05, + "loss": 1.7394, + "step": 10999 + }, + { + "epoch": 3.376304481276857, + "grad_norm": 0.2313947230577469, + "learning_rate": 7.717481583967943e-05, + "loss": 1.7537, + "step": 11000 + }, + { + "epoch": 3.376611418047882, + "grad_norm": 0.27152860164642334, + "learning_rate": 7.71706433600416e-05, + "loss": 1.8596, + "step": 11001 + }, + { + "epoch": 3.3769183548189075, + "grad_norm": 0.32866382598876953, + "learning_rate": 7.716647061188757e-05, + "loss": 1.9007, + "step": 11002 + }, + { + "epoch": 3.3772252915899323, + "grad_norm": 0.2842368185520172, + "learning_rate": 7.716229759525854e-05, + "loss": 1.7781, + "step": 11003 + }, + { + "epoch": 3.3775322283609577, + "grad_norm": 0.30411216616630554, + "learning_rate": 7.715812431019576e-05, + "loss": 1.7403, + "step": 11004 + }, + { + "epoch": 3.3778391651319826, + "grad_norm": 0.31848132610321045, + "learning_rate": 7.71539507567405e-05, + "loss": 1.817, + "step": 11005 + }, + { + "epoch": 3.378146101903008, + "grad_norm": 0.24206148087978363, + "learning_rate": 7.714977693493397e-05, + "loss": 1.7796, + "step": 11006 + }, + { + "epoch": 3.3784530386740332, + "grad_norm": 0.2982998490333557, + "learning_rate": 7.714560284481742e-05, + "loss": 1.7883, + "step": 11007 + }, + { + "epoch": 3.378759975445058, + "grad_norm": 0.24857483804225922, + "learning_rate": 7.714142848643213e-05, + "loss": 1.7447, + "step": 11008 + }, + { + "epoch": 3.3790669122160835, + "grad_norm": 0.2509039044380188, + "learning_rate": 7.713725385981932e-05, + "loss": 1.8362, + "step": 11009 + }, + { + "epoch": 3.379373848987109, + "grad_norm": 0.2759779095649719, + "learning_rate": 7.713307896502027e-05, + "loss": 1.8655, + "step": 11010 + }, + { + "epoch": 3.3796807857581337, + "grad_norm": 0.264776349067688, + "learning_rate": 7.712890380207623e-05, + "loss": 1.8221, + "step": 11011 + }, + { + "epoch": 3.379987722529159, + "grad_norm": 0.2771971821784973, + "learning_rate": 
7.712472837102846e-05, + "loss": 1.6992, + "step": 11012 + }, + { + "epoch": 3.3802946593001844, + "grad_norm": 0.2749316096305847, + "learning_rate": 7.712055267191822e-05, + "loss": 1.8128, + "step": 11013 + }, + { + "epoch": 3.3806015960712092, + "grad_norm": 0.256656289100647, + "learning_rate": 7.71163767047868e-05, + "loss": 1.8382, + "step": 11014 + }, + { + "epoch": 3.3809085328422346, + "grad_norm": 0.27646976709365845, + "learning_rate": 7.711220046967545e-05, + "loss": 1.8321, + "step": 11015 + }, + { + "epoch": 3.3812154696132595, + "grad_norm": 0.3083149194717407, + "learning_rate": 7.710802396662542e-05, + "loss": 1.904, + "step": 11016 + }, + { + "epoch": 3.381522406384285, + "grad_norm": 0.2750856280326843, + "learning_rate": 7.710384719567803e-05, + "loss": 1.7596, + "step": 11017 + }, + { + "epoch": 3.38182934315531, + "grad_norm": 0.3029455244541168, + "learning_rate": 7.709967015687452e-05, + "loss": 1.8542, + "step": 11018 + }, + { + "epoch": 3.382136279926335, + "grad_norm": 0.3144093453884125, + "learning_rate": 7.709549285025622e-05, + "loss": 1.7489, + "step": 11019 + }, + { + "epoch": 3.3824432166973604, + "grad_norm": 0.2675442099571228, + "learning_rate": 7.709131527586433e-05, + "loss": 1.7324, + "step": 11020 + }, + { + "epoch": 3.3827501534683857, + "grad_norm": 0.2906095087528229, + "learning_rate": 7.708713743374021e-05, + "loss": 1.7848, + "step": 11021 + }, + { + "epoch": 3.3830570902394106, + "grad_norm": 0.25141623616218567, + "learning_rate": 7.708295932392513e-05, + "loss": 1.7423, + "step": 11022 + }, + { + "epoch": 3.383364027010436, + "grad_norm": 0.25832003355026245, + "learning_rate": 7.707878094646037e-05, + "loss": 1.7792, + "step": 11023 + }, + { + "epoch": 3.3836709637814613, + "grad_norm": 0.23710070550441742, + "learning_rate": 7.70746023013872e-05, + "loss": 1.7916, + "step": 11024 + }, + { + "epoch": 3.383977900552486, + "grad_norm": 0.286735862493515, + "learning_rate": 7.707042338874697e-05, + "loss": 1.8272, + 
"step": 11025 + }, + { + "epoch": 3.3842848373235115, + "grad_norm": 0.2536577582359314, + "learning_rate": 7.706624420858094e-05, + "loss": 1.7839, + "step": 11026 + }, + { + "epoch": 3.3845917740945364, + "grad_norm": 0.5564702749252319, + "learning_rate": 7.706206476093043e-05, + "loss": 1.7832, + "step": 11027 + }, + { + "epoch": 3.3848987108655617, + "grad_norm": 0.34694772958755493, + "learning_rate": 7.705788504583671e-05, + "loss": 1.8668, + "step": 11028 + }, + { + "epoch": 3.385205647636587, + "grad_norm": 0.30388176441192627, + "learning_rate": 7.705370506334113e-05, + "loss": 1.8244, + "step": 11029 + }, + { + "epoch": 3.385512584407612, + "grad_norm": 0.2998919188976288, + "learning_rate": 7.704952481348497e-05, + "loss": 1.7927, + "step": 11030 + }, + { + "epoch": 3.3858195211786373, + "grad_norm": 0.2714936435222626, + "learning_rate": 7.704534429630955e-05, + "loss": 1.8757, + "step": 11031 + }, + { + "epoch": 3.386126457949662, + "grad_norm": 0.26670241355895996, + "learning_rate": 7.704116351185619e-05, + "loss": 1.8146, + "step": 11032 + }, + { + "epoch": 3.3864333947206875, + "grad_norm": 0.2500552833080292, + "learning_rate": 7.703698246016621e-05, + "loss": 1.7984, + "step": 11033 + }, + { + "epoch": 3.386740331491713, + "grad_norm": 0.2494918406009674, + "learning_rate": 7.703280114128091e-05, + "loss": 1.7433, + "step": 11034 + }, + { + "epoch": 3.3870472682627377, + "grad_norm": 0.25658491253852844, + "learning_rate": 7.702861955524163e-05, + "loss": 1.8487, + "step": 11035 + }, + { + "epoch": 3.387354205033763, + "grad_norm": 0.2871410548686981, + "learning_rate": 7.702443770208969e-05, + "loss": 1.7919, + "step": 11036 + }, + { + "epoch": 3.3876611418047884, + "grad_norm": 0.3347938060760498, + "learning_rate": 7.702025558186643e-05, + "loss": 1.8091, + "step": 11037 + }, + { + "epoch": 3.3879680785758133, + "grad_norm": 0.39016643166542053, + "learning_rate": 7.701607319461315e-05, + "loss": 1.7816, + "step": 11038 + }, + { + "epoch": 
3.3882750153468386, + "grad_norm": 0.3423028290271759, + "learning_rate": 7.701189054037121e-05, + "loss": 1.8454, + "step": 11039 + }, + { + "epoch": 3.388581952117864, + "grad_norm": 0.27592089772224426, + "learning_rate": 7.700770761918192e-05, + "loss": 1.8431, + "step": 11040 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.46047264337539673, + "learning_rate": 7.700352443108665e-05, + "loss": 1.8412, + "step": 11041 + }, + { + "epoch": 3.389195825659914, + "grad_norm": 0.49226754903793335, + "learning_rate": 7.699934097612673e-05, + "loss": 1.8212, + "step": 11042 + }, + { + "epoch": 3.389502762430939, + "grad_norm": 0.3958778381347656, + "learning_rate": 7.699515725434348e-05, + "loss": 1.747, + "step": 11043 + }, + { + "epoch": 3.3898096992019644, + "grad_norm": 0.26097169518470764, + "learning_rate": 7.699097326577827e-05, + "loss": 1.7631, + "step": 11044 + }, + { + "epoch": 3.3901166359729897, + "grad_norm": 0.2922612130641937, + "learning_rate": 7.698678901047245e-05, + "loss": 1.7891, + "step": 11045 + }, + { + "epoch": 3.3904235727440146, + "grad_norm": 0.4195055365562439, + "learning_rate": 7.698260448846734e-05, + "loss": 1.7765, + "step": 11046 + }, + { + "epoch": 3.39073050951504, + "grad_norm": 0.4572988450527191, + "learning_rate": 7.697841969980434e-05, + "loss": 1.8085, + "step": 11047 + }, + { + "epoch": 3.391037446286065, + "grad_norm": 0.38819587230682373, + "learning_rate": 7.697423464452478e-05, + "loss": 1.8854, + "step": 11048 + }, + { + "epoch": 3.39134438305709, + "grad_norm": 0.27421653270721436, + "learning_rate": 7.697004932267003e-05, + "loss": 1.8327, + "step": 11049 + }, + { + "epoch": 3.3916513198281155, + "grad_norm": 0.33559146523475647, + "learning_rate": 7.696586373428142e-05, + "loss": 1.8109, + "step": 11050 + }, + { + "epoch": 3.3919582565991404, + "grad_norm": 0.39438655972480774, + "learning_rate": 7.696167787940037e-05, + "loss": 1.7909, + "step": 11051 + }, + { + "epoch": 3.3922651933701657, + "grad_norm": 
0.3425842523574829, + "learning_rate": 7.695749175806819e-05, + "loss": 1.8571, + "step": 11052 + }, + { + "epoch": 3.392572130141191, + "grad_norm": 0.2860080301761627, + "learning_rate": 7.695330537032628e-05, + "loss": 1.8546, + "step": 11053 + }, + { + "epoch": 3.392879066912216, + "grad_norm": 0.35894665122032166, + "learning_rate": 7.694911871621601e-05, + "loss": 1.7895, + "step": 11054 + }, + { + "epoch": 3.3931860036832413, + "grad_norm": 0.351193904876709, + "learning_rate": 7.694493179577879e-05, + "loss": 1.7453, + "step": 11055 + }, + { + "epoch": 3.3934929404542666, + "grad_norm": 0.24812865257263184, + "learning_rate": 7.694074460905592e-05, + "loss": 1.8131, + "step": 11056 + }, + { + "epoch": 3.3937998772252915, + "grad_norm": 0.38620972633361816, + "learning_rate": 7.693655715608883e-05, + "loss": 1.8346, + "step": 11057 + }, + { + "epoch": 3.394106813996317, + "grad_norm": 0.5005692839622498, + "learning_rate": 7.69323694369189e-05, + "loss": 1.9031, + "step": 11058 + }, + { + "epoch": 3.3944137507673418, + "grad_norm": 0.4321887791156769, + "learning_rate": 7.692818145158751e-05, + "loss": 1.8783, + "step": 11059 + }, + { + "epoch": 3.394720687538367, + "grad_norm": 0.269307017326355, + "learning_rate": 7.692399320013603e-05, + "loss": 1.8075, + "step": 11060 + }, + { + "epoch": 3.3950276243093924, + "grad_norm": 0.2945556342601776, + "learning_rate": 7.69198046826059e-05, + "loss": 1.8366, + "step": 11061 + }, + { + "epoch": 3.3953345610804173, + "grad_norm": 0.30531853437423706, + "learning_rate": 7.691561589903847e-05, + "loss": 1.7665, + "step": 11062 + }, + { + "epoch": 3.3956414978514426, + "grad_norm": 0.25105199217796326, + "learning_rate": 7.691142684947513e-05, + "loss": 1.782, + "step": 11063 + }, + { + "epoch": 3.3959484346224675, + "grad_norm": 0.3373202085494995, + "learning_rate": 7.69072375339573e-05, + "loss": 1.8148, + "step": 11064 + }, + { + "epoch": 3.396255371393493, + "grad_norm": 0.34207093715667725, + "learning_rate": 
7.690304795252638e-05, + "loss": 1.8287, + "step": 11065 + }, + { + "epoch": 3.396562308164518, + "grad_norm": 0.26281681656837463, + "learning_rate": 7.68988581052238e-05, + "loss": 1.8551, + "step": 11066 + }, + { + "epoch": 3.396869244935543, + "grad_norm": 0.3091152608394623, + "learning_rate": 7.689466799209091e-05, + "loss": 1.7689, + "step": 11067 + }, + { + "epoch": 3.3971761817065684, + "grad_norm": 0.37421298027038574, + "learning_rate": 7.689047761316914e-05, + "loss": 1.7908, + "step": 11068 + }, + { + "epoch": 3.3974831184775938, + "grad_norm": 0.3745511770248413, + "learning_rate": 7.688628696849993e-05, + "loss": 1.8408, + "step": 11069 + }, + { + "epoch": 3.3977900552486187, + "grad_norm": 0.3003663122653961, + "learning_rate": 7.688209605812467e-05, + "loss": 1.9109, + "step": 11070 + }, + { + "epoch": 3.398096992019644, + "grad_norm": 0.3437681496143341, + "learning_rate": 7.687790488208478e-05, + "loss": 1.811, + "step": 11071 + }, + { + "epoch": 3.3984039287906693, + "grad_norm": 0.3480641841888428, + "learning_rate": 7.687371344042168e-05, + "loss": 1.8114, + "step": 11072 + }, + { + "epoch": 3.398710865561694, + "grad_norm": 0.24670913815498352, + "learning_rate": 7.686952173317679e-05, + "loss": 1.7959, + "step": 11073 + }, + { + "epoch": 3.3990178023327196, + "grad_norm": 0.2939499020576477, + "learning_rate": 7.686532976039154e-05, + "loss": 1.7518, + "step": 11074 + }, + { + "epoch": 3.3993247391037444, + "grad_norm": 0.3332279622554779, + "learning_rate": 7.686113752210736e-05, + "loss": 1.843, + "step": 11075 + }, + { + "epoch": 3.3996316758747698, + "grad_norm": 0.22967280447483063, + "learning_rate": 7.685694501836566e-05, + "loss": 1.7408, + "step": 11076 + }, + { + "epoch": 3.399938612645795, + "grad_norm": 0.3443470001220703, + "learning_rate": 7.685275224920789e-05, + "loss": 1.8004, + "step": 11077 + }, + { + "epoch": 3.40024554941682, + "grad_norm": 0.3725457489490509, + "learning_rate": 7.684855921467548e-05, + "loss": 1.833, + 
"step": 11078 + }, + { + "epoch": 3.4005524861878453, + "grad_norm": 0.3178638219833374, + "learning_rate": 7.68443659148099e-05, + "loss": 1.8055, + "step": 11079 + }, + { + "epoch": 3.4008594229588702, + "grad_norm": 0.2609167695045471, + "learning_rate": 7.684017234965254e-05, + "loss": 1.7881, + "step": 11080 + }, + { + "epoch": 3.4011663597298956, + "grad_norm": 0.26975762844085693, + "learning_rate": 7.683597851924486e-05, + "loss": 1.8424, + "step": 11081 + }, + { + "epoch": 3.401473296500921, + "grad_norm": 0.266661673784256, + "learning_rate": 7.683178442362832e-05, + "loss": 1.7785, + "step": 11082 + }, + { + "epoch": 3.401780233271946, + "grad_norm": 0.27915671467781067, + "learning_rate": 7.682759006284436e-05, + "loss": 1.8241, + "step": 11083 + }, + { + "epoch": 3.402087170042971, + "grad_norm": 0.25167274475097656, + "learning_rate": 7.682339543693444e-05, + "loss": 1.7637, + "step": 11084 + }, + { + "epoch": 3.4023941068139965, + "grad_norm": 0.2439529299736023, + "learning_rate": 7.681920054593999e-05, + "loss": 1.7796, + "step": 11085 + }, + { + "epoch": 3.4027010435850213, + "grad_norm": 0.26224252581596375, + "learning_rate": 7.681500538990249e-05, + "loss": 1.8018, + "step": 11086 + }, + { + "epoch": 3.4030079803560467, + "grad_norm": 0.25093868374824524, + "learning_rate": 7.681080996886336e-05, + "loss": 1.7664, + "step": 11087 + }, + { + "epoch": 3.403314917127072, + "grad_norm": 0.26393210887908936, + "learning_rate": 7.680661428286413e-05, + "loss": 1.8389, + "step": 11088 + }, + { + "epoch": 3.403621853898097, + "grad_norm": 0.24750283360481262, + "learning_rate": 7.680241833194622e-05, + "loss": 1.8358, + "step": 11089 + }, + { + "epoch": 3.4039287906691222, + "grad_norm": 0.21568982303142548, + "learning_rate": 7.67982221161511e-05, + "loss": 1.7874, + "step": 11090 + }, + { + "epoch": 3.404235727440147, + "grad_norm": 0.24407126009464264, + "learning_rate": 7.679402563552023e-05, + "loss": 1.7753, + "step": 11091 + }, + { + "epoch": 
3.4045426642111725, + "grad_norm": 0.23288260400295258, + "learning_rate": 7.67898288900951e-05, + "loss": 1.8046, + "step": 11092 + }, + { + "epoch": 3.404849600982198, + "grad_norm": 0.2548544108867645, + "learning_rate": 7.678563187991718e-05, + "loss": 1.8778, + "step": 11093 + }, + { + "epoch": 3.4051565377532227, + "grad_norm": 0.24008090794086456, + "learning_rate": 7.678143460502796e-05, + "loss": 1.7912, + "step": 11094 + }, + { + "epoch": 3.405463474524248, + "grad_norm": 0.26085031032562256, + "learning_rate": 7.677723706546889e-05, + "loss": 1.849, + "step": 11095 + }, + { + "epoch": 3.4057704112952734, + "grad_norm": 0.2830932140350342, + "learning_rate": 7.677303926128147e-05, + "loss": 1.8265, + "step": 11096 + }, + { + "epoch": 3.4060773480662982, + "grad_norm": 0.27593597769737244, + "learning_rate": 7.676884119250718e-05, + "loss": 1.8555, + "step": 11097 + }, + { + "epoch": 3.4063842848373236, + "grad_norm": 0.2403372824192047, + "learning_rate": 7.676464285918751e-05, + "loss": 1.7243, + "step": 11098 + }, + { + "epoch": 3.406691221608349, + "grad_norm": 0.28830090165138245, + "learning_rate": 7.676044426136397e-05, + "loss": 1.8108, + "step": 11099 + }, + { + "epoch": 3.406998158379374, + "grad_norm": 0.2918153405189514, + "learning_rate": 7.675624539907802e-05, + "loss": 1.7875, + "step": 11100 + }, + { + "epoch": 3.407305095150399, + "grad_norm": 0.2609013020992279, + "learning_rate": 7.675204627237117e-05, + "loss": 1.778, + "step": 11101 + }, + { + "epoch": 3.407612031921424, + "grad_norm": 0.2714763283729553, + "learning_rate": 7.674784688128494e-05, + "loss": 1.8472, + "step": 11102 + }, + { + "epoch": 3.4079189686924494, + "grad_norm": 0.25857117772102356, + "learning_rate": 7.674364722586078e-05, + "loss": 1.7495, + "step": 11103 + }, + { + "epoch": 3.4082259054634747, + "grad_norm": 0.25485143065452576, + "learning_rate": 7.673944730614023e-05, + "loss": 1.7817, + "step": 11104 + }, + { + "epoch": 3.4085328422344996, + "grad_norm": 
0.2735857665538788, + "learning_rate": 7.67352471221648e-05, + "loss": 1.7522, + "step": 11105 + }, + { + "epoch": 3.408839779005525, + "grad_norm": 0.25079572200775146, + "learning_rate": 7.6731046673976e-05, + "loss": 1.765, + "step": 11106 + }, + { + "epoch": 3.40914671577655, + "grad_norm": 0.3080148696899414, + "learning_rate": 7.672684596161532e-05, + "loss": 1.8305, + "step": 11107 + }, + { + "epoch": 3.409453652547575, + "grad_norm": 0.23771968483924866, + "learning_rate": 7.672264498512427e-05, + "loss": 1.7837, + "step": 11108 + }, + { + "epoch": 3.4097605893186005, + "grad_norm": 0.29941999912261963, + "learning_rate": 7.671844374454437e-05, + "loss": 1.8013, + "step": 11109 + }, + { + "epoch": 3.4100675260896254, + "grad_norm": 0.27871644496917725, + "learning_rate": 7.671424223991717e-05, + "loss": 1.8598, + "step": 11110 + }, + { + "epoch": 3.4103744628606507, + "grad_norm": 0.2751443684101105, + "learning_rate": 7.671004047128416e-05, + "loss": 1.8341, + "step": 11111 + }, + { + "epoch": 3.410681399631676, + "grad_norm": 0.27227312326431274, + "learning_rate": 7.670583843868688e-05, + "loss": 1.81, + "step": 11112 + }, + { + "epoch": 3.410988336402701, + "grad_norm": 0.29617756605148315, + "learning_rate": 7.670163614216685e-05, + "loss": 1.8795, + "step": 11113 + }, + { + "epoch": 3.4112952731737263, + "grad_norm": 0.268920361995697, + "learning_rate": 7.669743358176563e-05, + "loss": 1.7659, + "step": 11114 + }, + { + "epoch": 3.4116022099447516, + "grad_norm": 0.2875109314918518, + "learning_rate": 7.669323075752467e-05, + "loss": 1.8263, + "step": 11115 + }, + { + "epoch": 3.4119091467157765, + "grad_norm": 0.34703585505485535, + "learning_rate": 7.668902766948558e-05, + "loss": 1.7622, + "step": 11116 + }, + { + "epoch": 3.412216083486802, + "grad_norm": 0.3090265393257141, + "learning_rate": 7.668482431768989e-05, + "loss": 1.7381, + "step": 11117 + }, + { + "epoch": 3.4125230202578267, + "grad_norm": 0.2619737684726715, + "learning_rate": 
7.668062070217911e-05, + "loss": 1.8004, + "step": 11118 + }, + { + "epoch": 3.412829957028852, + "grad_norm": 0.289815217256546, + "learning_rate": 7.667641682299482e-05, + "loss": 1.7946, + "step": 11119 + }, + { + "epoch": 3.4131368937998774, + "grad_norm": 0.28732073307037354, + "learning_rate": 7.667221268017852e-05, + "loss": 1.8746, + "step": 11120 + }, + { + "epoch": 3.4134438305709023, + "grad_norm": 0.23232576251029968, + "learning_rate": 7.666800827377178e-05, + "loss": 1.7403, + "step": 11121 + }, + { + "epoch": 3.4137507673419276, + "grad_norm": 0.22903507947921753, + "learning_rate": 7.666380360381616e-05, + "loss": 1.7785, + "step": 11122 + }, + { + "epoch": 3.4140577041129525, + "grad_norm": 0.25023025274276733, + "learning_rate": 7.665959867035321e-05, + "loss": 1.7881, + "step": 11123 + }, + { + "epoch": 3.414364640883978, + "grad_norm": 0.2199166864156723, + "learning_rate": 7.665539347342449e-05, + "loss": 1.7522, + "step": 11124 + }, + { + "epoch": 3.414671577655003, + "grad_norm": 0.2539862394332886, + "learning_rate": 7.665118801307152e-05, + "loss": 1.7964, + "step": 11125 + }, + { + "epoch": 3.414978514426028, + "grad_norm": 0.22670161724090576, + "learning_rate": 7.664698228933591e-05, + "loss": 1.7071, + "step": 11126 + }, + { + "epoch": 3.4152854511970534, + "grad_norm": 0.24827396869659424, + "learning_rate": 7.664277630225919e-05, + "loss": 1.7897, + "step": 11127 + }, + { + "epoch": 3.4155923879680787, + "grad_norm": 0.29391366243362427, + "learning_rate": 7.663857005188296e-05, + "loss": 1.7967, + "step": 11128 + }, + { + "epoch": 3.4158993247391036, + "grad_norm": 0.3201812505722046, + "learning_rate": 7.663436353824874e-05, + "loss": 1.7681, + "step": 11129 + }, + { + "epoch": 3.416206261510129, + "grad_norm": 0.2274552583694458, + "learning_rate": 7.663015676139814e-05, + "loss": 1.7535, + "step": 11130 + }, + { + "epoch": 3.4165131982811543, + "grad_norm": 0.3955044150352478, + "learning_rate": 7.662594972137273e-05, + "loss": 
1.8175, + "step": 11131 + }, + { + "epoch": 3.416820135052179, + "grad_norm": 0.46493569016456604, + "learning_rate": 7.662174241821406e-05, + "loss": 1.7806, + "step": 11132 + }, + { + "epoch": 3.4171270718232045, + "grad_norm": 0.37731611728668213, + "learning_rate": 7.661753485196375e-05, + "loss": 1.7555, + "step": 11133 + }, + { + "epoch": 3.4174340085942294, + "grad_norm": 0.23983556032180786, + "learning_rate": 7.661332702266334e-05, + "loss": 1.7662, + "step": 11134 + }, + { + "epoch": 3.4177409453652547, + "grad_norm": 0.34964314103126526, + "learning_rate": 7.660911893035445e-05, + "loss": 1.7786, + "step": 11135 + }, + { + "epoch": 3.41804788213628, + "grad_norm": 0.44820764660835266, + "learning_rate": 7.660491057507864e-05, + "loss": 1.778, + "step": 11136 + }, + { + "epoch": 3.418354818907305, + "grad_norm": 0.32936233282089233, + "learning_rate": 7.660070195687752e-05, + "loss": 1.8181, + "step": 11137 + }, + { + "epoch": 3.4186617556783303, + "grad_norm": 0.2874850332736969, + "learning_rate": 7.659649307579266e-05, + "loss": 1.8733, + "step": 11138 + }, + { + "epoch": 3.418968692449355, + "grad_norm": 0.46269866824150085, + "learning_rate": 7.659228393186566e-05, + "loss": 1.8566, + "step": 11139 + }, + { + "epoch": 3.4192756292203805, + "grad_norm": 0.5873839855194092, + "learning_rate": 7.658807452513816e-05, + "loss": 1.8317, + "step": 11140 + }, + { + "epoch": 3.419582565991406, + "grad_norm": 0.43150341510772705, + "learning_rate": 7.65838648556517e-05, + "loss": 1.7702, + "step": 11141 + }, + { + "epoch": 3.4198895027624308, + "grad_norm": 0.2803891599178314, + "learning_rate": 7.65796549234479e-05, + "loss": 1.8043, + "step": 11142 + }, + { + "epoch": 3.420196439533456, + "grad_norm": 0.37295013666152954, + "learning_rate": 7.657544472856838e-05, + "loss": 1.7923, + "step": 11143 + }, + { + "epoch": 3.4205033763044814, + "grad_norm": 0.3922573924064636, + "learning_rate": 7.657123427105473e-05, + "loss": 1.8231, + "step": 11144 + }, + { + 
"epoch": 3.4208103130755063, + "grad_norm": 0.27254152297973633, + "learning_rate": 7.656702355094859e-05, + "loss": 1.8168, + "step": 11145 + }, + { + "epoch": 3.4211172498465316, + "grad_norm": 0.28005337715148926, + "learning_rate": 7.656281256829152e-05, + "loss": 1.8047, + "step": 11146 + }, + { + "epoch": 3.421424186617557, + "grad_norm": 0.4369073808193207, + "learning_rate": 7.655860132312519e-05, + "loss": 1.7243, + "step": 11147 + }, + { + "epoch": 3.421731123388582, + "grad_norm": 0.4127553701400757, + "learning_rate": 7.655438981549119e-05, + "loss": 1.8148, + "step": 11148 + }, + { + "epoch": 3.422038060159607, + "grad_norm": 0.3131798207759857, + "learning_rate": 7.655017804543114e-05, + "loss": 1.789, + "step": 11149 + }, + { + "epoch": 3.422344996930632, + "grad_norm": 0.2947194576263428, + "learning_rate": 7.654596601298666e-05, + "loss": 1.8221, + "step": 11150 + }, + { + "epoch": 3.4226519337016574, + "grad_norm": 0.3072497546672821, + "learning_rate": 7.654175371819941e-05, + "loss": 1.7747, + "step": 11151 + }, + { + "epoch": 3.4229588704726828, + "grad_norm": 0.29408320784568787, + "learning_rate": 7.653754116111099e-05, + "loss": 1.9009, + "step": 11152 + }, + { + "epoch": 3.4232658072437077, + "grad_norm": 0.2629215717315674, + "learning_rate": 7.653332834176303e-05, + "loss": 1.7354, + "step": 11153 + }, + { + "epoch": 3.423572744014733, + "grad_norm": 0.2850257456302643, + "learning_rate": 7.652911526019716e-05, + "loss": 1.8422, + "step": 11154 + }, + { + "epoch": 3.423879680785758, + "grad_norm": 0.29787111282348633, + "learning_rate": 7.652490191645503e-05, + "loss": 1.8122, + "step": 11155 + }, + { + "epoch": 3.424186617556783, + "grad_norm": 0.2670947015285492, + "learning_rate": 7.652068831057826e-05, + "loss": 1.7734, + "step": 11156 + }, + { + "epoch": 3.4244935543278086, + "grad_norm": 0.26415133476257324, + "learning_rate": 7.651647444260853e-05, + "loss": 1.7661, + "step": 11157 + }, + { + "epoch": 3.424800491098834, + 
"grad_norm": 0.2614886164665222, + "learning_rate": 7.651226031258745e-05, + "loss": 1.6918, + "step": 11158 + }, + { + "epoch": 3.425107427869859, + "grad_norm": 0.28485649824142456, + "learning_rate": 7.650804592055667e-05, + "loss": 1.7771, + "step": 11159 + }, + { + "epoch": 3.425414364640884, + "grad_norm": 0.26080289483070374, + "learning_rate": 7.650383126655784e-05, + "loss": 1.7637, + "step": 11160 + }, + { + "epoch": 3.425721301411909, + "grad_norm": 0.2503695487976074, + "learning_rate": 7.649961635063261e-05, + "loss": 1.7864, + "step": 11161 + }, + { + "epoch": 3.4260282381829343, + "grad_norm": 0.3165570795536041, + "learning_rate": 7.649540117282263e-05, + "loss": 1.8107, + "step": 11162 + }, + { + "epoch": 3.4263351749539597, + "grad_norm": 0.28411731123924255, + "learning_rate": 7.649118573316959e-05, + "loss": 1.7557, + "step": 11163 + }, + { + "epoch": 3.4266421117249846, + "grad_norm": 0.24469570815563202, + "learning_rate": 7.648697003171512e-05, + "loss": 1.7597, + "step": 11164 + }, + { + "epoch": 3.42694904849601, + "grad_norm": 0.31968292593955994, + "learning_rate": 7.648275406850087e-05, + "loss": 1.7796, + "step": 11165 + }, + { + "epoch": 3.427255985267035, + "grad_norm": 0.24520765244960785, + "learning_rate": 7.647853784356856e-05, + "loss": 1.7931, + "step": 11166 + }, + { + "epoch": 3.42756292203806, + "grad_norm": 0.23946821689605713, + "learning_rate": 7.647432135695977e-05, + "loss": 1.7143, + "step": 11167 + }, + { + "epoch": 3.4278698588090855, + "grad_norm": 0.321455180644989, + "learning_rate": 7.647010460871624e-05, + "loss": 1.8682, + "step": 11168 + }, + { + "epoch": 3.4281767955801103, + "grad_norm": 0.2803197503089905, + "learning_rate": 7.646588759887964e-05, + "loss": 1.8, + "step": 11169 + }, + { + "epoch": 3.4284837323511357, + "grad_norm": 0.2597559988498688, + "learning_rate": 7.64616703274916e-05, + "loss": 1.8027, + "step": 11170 + }, + { + "epoch": 3.428790669122161, + "grad_norm": 0.25055503845214844, + 
"learning_rate": 7.645745279459384e-05, + "loss": 1.7659, + "step": 11171 + }, + { + "epoch": 3.429097605893186, + "grad_norm": 0.34582629799842834, + "learning_rate": 7.645323500022803e-05, + "loss": 1.7868, + "step": 11172 + }, + { + "epoch": 3.4294045426642112, + "grad_norm": 0.32845041155815125, + "learning_rate": 7.644901694443584e-05, + "loss": 1.8247, + "step": 11173 + }, + { + "epoch": 3.4297114794352366, + "grad_norm": 0.2570398449897766, + "learning_rate": 7.644479862725896e-05, + "loss": 1.7802, + "step": 11174 + }, + { + "epoch": 3.4300184162062615, + "grad_norm": 0.23117294907569885, + "learning_rate": 7.644058004873908e-05, + "loss": 1.7575, + "step": 11175 + }, + { + "epoch": 3.430325352977287, + "grad_norm": 0.2417830377817154, + "learning_rate": 7.64363612089179e-05, + "loss": 1.7954, + "step": 11176 + }, + { + "epoch": 3.4306322897483117, + "grad_norm": 0.249378964304924, + "learning_rate": 7.643214210783708e-05, + "loss": 1.8161, + "step": 11177 + }, + { + "epoch": 3.430939226519337, + "grad_norm": 0.24494746327400208, + "learning_rate": 7.642792274553836e-05, + "loss": 1.825, + "step": 11178 + }, + { + "epoch": 3.4312461632903624, + "grad_norm": 0.2663760185241699, + "learning_rate": 7.642370312206342e-05, + "loss": 1.7589, + "step": 11179 + }, + { + "epoch": 3.4315531000613873, + "grad_norm": 0.2819322645664215, + "learning_rate": 7.641948323745395e-05, + "loss": 1.8097, + "step": 11180 + }, + { + "epoch": 3.4318600368324126, + "grad_norm": 0.26917630434036255, + "learning_rate": 7.641526309175166e-05, + "loss": 1.7934, + "step": 11181 + }, + { + "epoch": 3.4321669736034375, + "grad_norm": 0.31618112325668335, + "learning_rate": 7.641104268499826e-05, + "loss": 1.8522, + "step": 11182 + }, + { + "epoch": 3.432473910374463, + "grad_norm": 0.29209139943122864, + "learning_rate": 7.640682201723546e-05, + "loss": 1.7499, + "step": 11183 + }, + { + "epoch": 3.432780847145488, + "grad_norm": 0.24831914901733398, + "learning_rate": 
7.640260108850496e-05, + "loss": 1.7897, + "step": 11184 + }, + { + "epoch": 3.433087783916513, + "grad_norm": 0.2459818720817566, + "learning_rate": 7.639837989884849e-05, + "loss": 1.7604, + "step": 11185 + }, + { + "epoch": 3.4333947206875384, + "grad_norm": 0.27157485485076904, + "learning_rate": 7.639415844830774e-05, + "loss": 1.7776, + "step": 11186 + }, + { + "epoch": 3.4337016574585637, + "grad_norm": 0.3021515905857086, + "learning_rate": 7.638993673692445e-05, + "loss": 1.7771, + "step": 11187 + }, + { + "epoch": 3.4340085942295886, + "grad_norm": 0.2591722309589386, + "learning_rate": 7.638571476474036e-05, + "loss": 1.8333, + "step": 11188 + }, + { + "epoch": 3.434315531000614, + "grad_norm": 0.2255258709192276, + "learning_rate": 7.638149253179717e-05, + "loss": 1.7647, + "step": 11189 + }, + { + "epoch": 3.4346224677716393, + "grad_norm": 0.2585793733596802, + "learning_rate": 7.637727003813658e-05, + "loss": 1.786, + "step": 11190 + }, + { + "epoch": 3.434929404542664, + "grad_norm": 0.23649543523788452, + "learning_rate": 7.637304728380036e-05, + "loss": 1.822, + "step": 11191 + }, + { + "epoch": 3.4352363413136895, + "grad_norm": 0.2610832452774048, + "learning_rate": 7.636882426883023e-05, + "loss": 1.7925, + "step": 11192 + }, + { + "epoch": 3.4355432780847144, + "grad_norm": 0.26230642199516296, + "learning_rate": 7.636460099326793e-05, + "loss": 1.8169, + "step": 11193 + }, + { + "epoch": 3.4358502148557397, + "grad_norm": 0.2800561189651489, + "learning_rate": 7.636037745715518e-05, + "loss": 1.845, + "step": 11194 + }, + { + "epoch": 3.436157151626765, + "grad_norm": 0.27790409326553345, + "learning_rate": 7.635615366053372e-05, + "loss": 1.8141, + "step": 11195 + }, + { + "epoch": 3.43646408839779, + "grad_norm": 0.2894865870475769, + "learning_rate": 7.635192960344533e-05, + "loss": 1.7916, + "step": 11196 + }, + { + "epoch": 3.4367710251688153, + "grad_norm": 0.22310738265514374, + "learning_rate": 7.634770528593171e-05, + "loss": 1.79, + 
"step": 11197 + }, + { + "epoch": 3.43707796193984, + "grad_norm": 0.2837755084037781, + "learning_rate": 7.634348070803463e-05, + "loss": 1.8763, + "step": 11198 + }, + { + "epoch": 3.4373848987108655, + "grad_norm": 0.32488104701042175, + "learning_rate": 7.633925586979583e-05, + "loss": 1.8331, + "step": 11199 + }, + { + "epoch": 3.437691835481891, + "grad_norm": 0.2708779573440552, + "learning_rate": 7.633503077125706e-05, + "loss": 1.761, + "step": 11200 + }, + { + "epoch": 3.4379987722529157, + "grad_norm": 0.23929642140865326, + "learning_rate": 7.633080541246008e-05, + "loss": 1.8217, + "step": 11201 + }, + { + "epoch": 3.438305709023941, + "grad_norm": 0.3213331997394562, + "learning_rate": 7.632657979344667e-05, + "loss": 1.8375, + "step": 11202 + }, + { + "epoch": 3.4386126457949664, + "grad_norm": 0.38420629501342773, + "learning_rate": 7.632235391425854e-05, + "loss": 1.765, + "step": 11203 + }, + { + "epoch": 3.4389195825659913, + "grad_norm": 0.40466073155403137, + "learning_rate": 7.631812777493749e-05, + "loss": 1.8262, + "step": 11204 + }, + { + "epoch": 3.4392265193370166, + "grad_norm": 0.35904639959335327, + "learning_rate": 7.631390137552527e-05, + "loss": 1.894, + "step": 11205 + }, + { + "epoch": 3.439533456108042, + "grad_norm": 0.28880515694618225, + "learning_rate": 7.630967471606368e-05, + "loss": 1.87, + "step": 11206 + }, + { + "epoch": 3.439840392879067, + "grad_norm": 0.2878882884979248, + "learning_rate": 7.630544779659444e-05, + "loss": 1.7841, + "step": 11207 + }, + { + "epoch": 3.440147329650092, + "grad_norm": 0.36002418398857117, + "learning_rate": 7.630122061715935e-05, + "loss": 1.7318, + "step": 11208 + }, + { + "epoch": 3.440454266421117, + "grad_norm": 0.3304644227027893, + "learning_rate": 7.629699317780019e-05, + "loss": 1.8581, + "step": 11209 + }, + { + "epoch": 3.4407612031921424, + "grad_norm": 0.23396331071853638, + "learning_rate": 7.629276547855872e-05, + "loss": 1.7897, + "step": 11210 + }, + { + "epoch": 
3.4410681399631677, + "grad_norm": 0.34914183616638184, + "learning_rate": 7.628853751947674e-05, + "loss": 1.8531, + "step": 11211 + }, + { + "epoch": 3.4413750767341926, + "grad_norm": 0.3700502812862396, + "learning_rate": 7.6284309300596e-05, + "loss": 1.7884, + "step": 11212 + }, + { + "epoch": 3.441682013505218, + "grad_norm": 0.24606801569461823, + "learning_rate": 7.628008082195835e-05, + "loss": 1.7292, + "step": 11213 + }, + { + "epoch": 3.441988950276243, + "grad_norm": 0.26344993710517883, + "learning_rate": 7.627585208360551e-05, + "loss": 1.7832, + "step": 11214 + }, + { + "epoch": 3.442295887047268, + "grad_norm": 0.4034743010997772, + "learning_rate": 7.62716230855793e-05, + "loss": 1.8164, + "step": 11215 + }, + { + "epoch": 3.4426028238182935, + "grad_norm": 0.4508039355278015, + "learning_rate": 7.626739382792152e-05, + "loss": 1.7855, + "step": 11216 + }, + { + "epoch": 3.4429097605893184, + "grad_norm": 0.2963111400604248, + "learning_rate": 7.626316431067395e-05, + "loss": 1.7995, + "step": 11217 + }, + { + "epoch": 3.4432166973603437, + "grad_norm": 0.35248515009880066, + "learning_rate": 7.625893453387841e-05, + "loss": 1.8761, + "step": 11218 + }, + { + "epoch": 3.443523634131369, + "grad_norm": 0.4032224416732788, + "learning_rate": 7.625470449757668e-05, + "loss": 1.7746, + "step": 11219 + }, + { + "epoch": 3.443830570902394, + "grad_norm": 0.3505195081233978, + "learning_rate": 7.625047420181057e-05, + "loss": 1.851, + "step": 11220 + }, + { + "epoch": 3.4441375076734193, + "grad_norm": 0.288968563079834, + "learning_rate": 7.62462436466219e-05, + "loss": 1.8055, + "step": 11221 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.43141910433769226, + "learning_rate": 7.624201283205246e-05, + "loss": 1.816, + "step": 11222 + }, + { + "epoch": 3.4447513812154695, + "grad_norm": 0.46902137994766235, + "learning_rate": 7.623778175814407e-05, + "loss": 1.8478, + "step": 11223 + }, + { + "epoch": 3.445058317986495, + "grad_norm": 
0.3333328366279602, + "learning_rate": 7.623355042493854e-05, + "loss": 1.7949, + "step": 11224 + }, + { + "epoch": 3.4453652547575198, + "grad_norm": 0.2625340521335602, + "learning_rate": 7.622931883247768e-05, + "loss": 1.745, + "step": 11225 + }, + { + "epoch": 3.445672191528545, + "grad_norm": 0.4565848410129547, + "learning_rate": 7.622508698080333e-05, + "loss": 1.796, + "step": 11226 + }, + { + "epoch": 3.4459791282995704, + "grad_norm": 0.4676518738269806, + "learning_rate": 7.622085486995729e-05, + "loss": 1.8115, + "step": 11227 + }, + { + "epoch": 3.4462860650705953, + "grad_norm": 0.3828938603401184, + "learning_rate": 7.62166224999814e-05, + "loss": 1.8758, + "step": 11228 + }, + { + "epoch": 3.4465930018416207, + "grad_norm": 0.2786383628845215, + "learning_rate": 7.621238987091747e-05, + "loss": 1.7616, + "step": 11229 + }, + { + "epoch": 3.446899938612646, + "grad_norm": 0.4442835748195648, + "learning_rate": 7.620815698280734e-05, + "loss": 1.8342, + "step": 11230 + }, + { + "epoch": 3.447206875383671, + "grad_norm": 0.45760586857795715, + "learning_rate": 7.620392383569286e-05, + "loss": 1.8159, + "step": 11231 + }, + { + "epoch": 3.447513812154696, + "grad_norm": 0.2567009925842285, + "learning_rate": 7.619969042961583e-05, + "loss": 1.774, + "step": 11232 + }, + { + "epoch": 3.4478207489257215, + "grad_norm": 0.3720102310180664, + "learning_rate": 7.619545676461812e-05, + "loss": 1.8366, + "step": 11233 + }, + { + "epoch": 3.4481276856967464, + "grad_norm": 0.36436137557029724, + "learning_rate": 7.619122284074154e-05, + "loss": 1.832, + "step": 11234 + }, + { + "epoch": 3.4484346224677718, + "grad_norm": 0.310310959815979, + "learning_rate": 7.618698865802795e-05, + "loss": 1.9023, + "step": 11235 + }, + { + "epoch": 3.4487415592387967, + "grad_norm": 0.2693026661872864, + "learning_rate": 7.618275421651916e-05, + "loss": 1.7696, + "step": 11236 + }, + { + "epoch": 3.449048496009822, + "grad_norm": 0.2942425608634949, + "learning_rate": 
7.61785195162571e-05, + "loss": 1.822, + "step": 11237 + }, + { + "epoch": 3.4493554327808473, + "grad_norm": 0.22454749047756195, + "learning_rate": 7.617428455728353e-05, + "loss": 1.7011, + "step": 11238 + }, + { + "epoch": 3.449662369551872, + "grad_norm": 0.23345038294792175, + "learning_rate": 7.617004933964035e-05, + "loss": 1.7563, + "step": 11239 + }, + { + "epoch": 3.4499693063228976, + "grad_norm": 0.24990662932395935, + "learning_rate": 7.616581386336941e-05, + "loss": 1.8031, + "step": 11240 + }, + { + "epoch": 3.4502762430939224, + "grad_norm": 0.2919348478317261, + "learning_rate": 7.616157812851254e-05, + "loss": 1.7355, + "step": 11241 + }, + { + "epoch": 3.450583179864948, + "grad_norm": 0.2926909327507019, + "learning_rate": 7.615734213511165e-05, + "loss": 1.8341, + "step": 11242 + }, + { + "epoch": 3.450890116635973, + "grad_norm": 0.24316683411598206, + "learning_rate": 7.615310588320855e-05, + "loss": 1.8154, + "step": 11243 + }, + { + "epoch": 3.451197053406998, + "grad_norm": 0.23154498636722565, + "learning_rate": 7.614886937284513e-05, + "loss": 1.7904, + "step": 11244 + }, + { + "epoch": 3.4515039901780233, + "grad_norm": 0.25973939895629883, + "learning_rate": 7.614463260406327e-05, + "loss": 1.7598, + "step": 11245 + }, + { + "epoch": 3.4518109269490487, + "grad_norm": 0.22110119462013245, + "learning_rate": 7.614039557690482e-05, + "loss": 1.7903, + "step": 11246 + }, + { + "epoch": 3.4521178637200736, + "grad_norm": 0.26184993982315063, + "learning_rate": 7.613615829141165e-05, + "loss": 1.748, + "step": 11247 + }, + { + "epoch": 3.452424800491099, + "grad_norm": 0.26128727197647095, + "learning_rate": 7.613192074762565e-05, + "loss": 1.7786, + "step": 11248 + }, + { + "epoch": 3.4527317372621242, + "grad_norm": 0.23230813443660736, + "learning_rate": 7.612768294558871e-05, + "loss": 1.8114, + "step": 11249 + }, + { + "epoch": 3.453038674033149, + "grad_norm": 0.2686540186405182, + "learning_rate": 7.612344488534268e-05, + "loss": 
1.7311, + "step": 11250 + }, + { + "epoch": 3.4533456108041745, + "grad_norm": 0.25553348660469055, + "learning_rate": 7.611920656692946e-05, + "loss": 1.8468, + "step": 11251 + }, + { + "epoch": 3.4536525475751993, + "grad_norm": 0.2639308273792267, + "learning_rate": 7.611496799039092e-05, + "loss": 1.8292, + "step": 11252 + }, + { + "epoch": 3.4539594843462247, + "grad_norm": 0.2468358874320984, + "learning_rate": 7.611072915576895e-05, + "loss": 1.8173, + "step": 11253 + }, + { + "epoch": 3.45426642111725, + "grad_norm": 0.27236035466194153, + "learning_rate": 7.610649006310549e-05, + "loss": 1.8082, + "step": 11254 + }, + { + "epoch": 3.454573357888275, + "grad_norm": 0.2277914434671402, + "learning_rate": 7.610225071244237e-05, + "loss": 1.7483, + "step": 11255 + }, + { + "epoch": 3.4548802946593002, + "grad_norm": 0.2292868196964264, + "learning_rate": 7.60980111038215e-05, + "loss": 1.7716, + "step": 11256 + }, + { + "epoch": 3.455187231430325, + "grad_norm": 0.22116152942180634, + "learning_rate": 7.60937712372848e-05, + "loss": 1.773, + "step": 11257 + }, + { + "epoch": 3.4554941682013505, + "grad_norm": 0.23238304257392883, + "learning_rate": 7.608953111287416e-05, + "loss": 1.7602, + "step": 11258 + }, + { + "epoch": 3.455801104972376, + "grad_norm": 0.2810615003108978, + "learning_rate": 7.608529073063149e-05, + "loss": 1.8781, + "step": 11259 + }, + { + "epoch": 3.4561080417434007, + "grad_norm": 0.2516821324825287, + "learning_rate": 7.608105009059867e-05, + "loss": 1.835, + "step": 11260 + }, + { + "epoch": 3.456414978514426, + "grad_norm": 0.25698330998420715, + "learning_rate": 7.607680919281763e-05, + "loss": 1.7859, + "step": 11261 + }, + { + "epoch": 3.4567219152854514, + "grad_norm": 0.2597602903842926, + "learning_rate": 7.60725680373303e-05, + "loss": 1.8287, + "step": 11262 + }, + { + "epoch": 3.4570288520564763, + "grad_norm": 0.2564091980457306, + "learning_rate": 7.606832662417855e-05, + "loss": 1.8003, + "step": 11263 + }, + { + 
"epoch": 3.4573357888275016, + "grad_norm": 0.2872684597969055, + "learning_rate": 7.606408495340432e-05, + "loss": 1.8242, + "step": 11264 + }, + { + "epoch": 3.457642725598527, + "grad_norm": 0.27513590455055237, + "learning_rate": 7.605984302504952e-05, + "loss": 1.8605, + "step": 11265 + }, + { + "epoch": 3.457949662369552, + "grad_norm": 0.27768459916114807, + "learning_rate": 7.605560083915609e-05, + "loss": 1.7948, + "step": 11266 + }, + { + "epoch": 3.458256599140577, + "grad_norm": 0.23911382257938385, + "learning_rate": 7.605135839576593e-05, + "loss": 1.7575, + "step": 11267 + }, + { + "epoch": 3.458563535911602, + "grad_norm": 0.26773568987846375, + "learning_rate": 7.604711569492098e-05, + "loss": 1.752, + "step": 11268 + }, + { + "epoch": 3.4588704726826274, + "grad_norm": 0.30079394578933716, + "learning_rate": 7.604287273666316e-05, + "loss": 1.8022, + "step": 11269 + }, + { + "epoch": 3.4591774094536527, + "grad_norm": 0.27393853664398193, + "learning_rate": 7.603862952103441e-05, + "loss": 1.8054, + "step": 11270 + }, + { + "epoch": 3.4594843462246776, + "grad_norm": 0.2794870436191559, + "learning_rate": 7.603438604807667e-05, + "loss": 1.808, + "step": 11271 + }, + { + "epoch": 3.459791282995703, + "grad_norm": 0.26482146978378296, + "learning_rate": 7.603014231783185e-05, + "loss": 1.8696, + "step": 11272 + }, + { + "epoch": 3.460098219766728, + "grad_norm": 0.2755354344844818, + "learning_rate": 7.602589833034192e-05, + "loss": 1.8412, + "step": 11273 + }, + { + "epoch": 3.460405156537753, + "grad_norm": 0.2666642367839813, + "learning_rate": 7.602165408564883e-05, + "loss": 1.8333, + "step": 11274 + }, + { + "epoch": 3.4607120933087785, + "grad_norm": 0.26958519220352173, + "learning_rate": 7.601740958379448e-05, + "loss": 1.7943, + "step": 11275 + }, + { + "epoch": 3.4610190300798034, + "grad_norm": 0.2915789783000946, + "learning_rate": 7.601316482482084e-05, + "loss": 1.7519, + "step": 11276 + }, + { + "epoch": 3.4613259668508287, + 
"grad_norm": 0.2456950694322586, + "learning_rate": 7.600891980876985e-05, + "loss": 1.8064, + "step": 11277 + }, + { + "epoch": 3.461632903621854, + "grad_norm": 0.2517867088317871, + "learning_rate": 7.600467453568348e-05, + "loss": 1.7766, + "step": 11278 + }, + { + "epoch": 3.461939840392879, + "grad_norm": 0.24567969143390656, + "learning_rate": 7.600042900560368e-05, + "loss": 1.7331, + "step": 11279 + }, + { + "epoch": 3.4622467771639043, + "grad_norm": 0.23986820876598358, + "learning_rate": 7.599618321857239e-05, + "loss": 1.7477, + "step": 11280 + }, + { + "epoch": 3.4625537139349296, + "grad_norm": 0.2555375397205353, + "learning_rate": 7.599193717463158e-05, + "loss": 1.8154, + "step": 11281 + }, + { + "epoch": 3.4628606507059545, + "grad_norm": 0.2522781193256378, + "learning_rate": 7.598769087382323e-05, + "loss": 1.7821, + "step": 11282 + }, + { + "epoch": 3.46316758747698, + "grad_norm": 0.25631004571914673, + "learning_rate": 7.598344431618926e-05, + "loss": 1.8043, + "step": 11283 + }, + { + "epoch": 3.4634745242480047, + "grad_norm": 0.2611328661441803, + "learning_rate": 7.597919750177168e-05, + "loss": 1.8036, + "step": 11284 + }, + { + "epoch": 3.46378146101903, + "grad_norm": 0.255670428276062, + "learning_rate": 7.597495043061244e-05, + "loss": 1.7375, + "step": 11285 + }, + { + "epoch": 3.4640883977900554, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.597070310275353e-05, + "loss": 1.7496, + "step": 11286 + }, + { + "epoch": 3.4643953345610803, + "grad_norm": 0.2643752992153168, + "learning_rate": 7.596645551823688e-05, + "loss": 1.8444, + "step": 11287 + }, + { + "epoch": 3.4647022713321056, + "grad_norm": 0.2564511299133301, + "learning_rate": 7.596220767710452e-05, + "loss": 1.7557, + "step": 11288 + }, + { + "epoch": 3.4650092081031305, + "grad_norm": 0.2510208487510681, + "learning_rate": 7.59579595793984e-05, + "loss": 1.7234, + "step": 11289 + }, + { + "epoch": 3.465316144874156, + "grad_norm": 0.2765158712863922, + 
"learning_rate": 7.595371122516051e-05, + "loss": 1.8215, + "step": 11290 + }, + { + "epoch": 3.465623081645181, + "grad_norm": 0.28233039379119873, + "learning_rate": 7.594946261443286e-05, + "loss": 1.7752, + "step": 11291 + }, + { + "epoch": 3.465930018416206, + "grad_norm": 0.26971468329429626, + "learning_rate": 7.594521374725735e-05, + "loss": 1.7924, + "step": 11292 + }, + { + "epoch": 3.4662369551872314, + "grad_norm": 0.29425930976867676, + "learning_rate": 7.594096462367608e-05, + "loss": 1.8144, + "step": 11293 + }, + { + "epoch": 3.4665438919582567, + "grad_norm": 0.233150452375412, + "learning_rate": 7.593671524373098e-05, + "loss": 1.7741, + "step": 11294 + }, + { + "epoch": 3.4668508287292816, + "grad_norm": 0.2947762608528137, + "learning_rate": 7.593246560746406e-05, + "loss": 1.8031, + "step": 11295 + }, + { + "epoch": 3.467157765500307, + "grad_norm": 0.250552773475647, + "learning_rate": 7.59282157149173e-05, + "loss": 1.7501, + "step": 11296 + }, + { + "epoch": 3.4674647022713323, + "grad_norm": 0.26091331243515015, + "learning_rate": 7.592396556613274e-05, + "loss": 1.836, + "step": 11297 + }, + { + "epoch": 3.467771639042357, + "grad_norm": 0.28625619411468506, + "learning_rate": 7.591971516115233e-05, + "loss": 1.7555, + "step": 11298 + }, + { + "epoch": 3.4680785758133825, + "grad_norm": 0.2723398804664612, + "learning_rate": 7.591546450001811e-05, + "loss": 1.825, + "step": 11299 + }, + { + "epoch": 3.4683855125844074, + "grad_norm": 0.24289946258068085, + "learning_rate": 7.591121358277211e-05, + "loss": 1.7441, + "step": 11300 + }, + { + "epoch": 3.4686924493554327, + "grad_norm": 0.2706952691078186, + "learning_rate": 7.590696240945629e-05, + "loss": 1.8651, + "step": 11301 + }, + { + "epoch": 3.468999386126458, + "grad_norm": 0.24632862210273743, + "learning_rate": 7.590271098011268e-05, + "loss": 1.8229, + "step": 11302 + }, + { + "epoch": 3.469306322897483, + "grad_norm": 0.29275211691856384, + "learning_rate": 7.58984592947833e-05, 
+ "loss": 1.7591, + "step": 11303 + }, + { + "epoch": 3.4696132596685083, + "grad_norm": 0.29228144884109497, + "learning_rate": 7.589420735351016e-05, + "loss": 1.8395, + "step": 11304 + }, + { + "epoch": 3.4699201964395336, + "grad_norm": 0.28339114785194397, + "learning_rate": 7.588995515633528e-05, + "loss": 1.8543, + "step": 11305 + }, + { + "epoch": 3.4702271332105585, + "grad_norm": 0.2834693193435669, + "learning_rate": 7.588570270330071e-05, + "loss": 1.826, + "step": 11306 + }, + { + "epoch": 3.470534069981584, + "grad_norm": 0.26130759716033936, + "learning_rate": 7.588144999444844e-05, + "loss": 1.7887, + "step": 11307 + }, + { + "epoch": 3.470841006752609, + "grad_norm": 0.29554685950279236, + "learning_rate": 7.587719702982052e-05, + "loss": 1.819, + "step": 11308 + }, + { + "epoch": 3.471147943523634, + "grad_norm": 0.2687968611717224, + "learning_rate": 7.587294380945898e-05, + "loss": 1.7354, + "step": 11309 + }, + { + "epoch": 3.4714548802946594, + "grad_norm": 0.28795287013053894, + "learning_rate": 7.586869033340582e-05, + "loss": 1.8267, + "step": 11310 + }, + { + "epoch": 3.4717618170656843, + "grad_norm": 0.33244553208351135, + "learning_rate": 7.58644366017031e-05, + "loss": 1.86, + "step": 11311 + }, + { + "epoch": 3.4720687538367097, + "grad_norm": 0.2878025472164154, + "learning_rate": 7.586018261439288e-05, + "loss": 1.7587, + "step": 11312 + }, + { + "epoch": 3.472375690607735, + "grad_norm": 0.26856711506843567, + "learning_rate": 7.585592837151716e-05, + "loss": 1.7351, + "step": 11313 + }, + { + "epoch": 3.47268262737876, + "grad_norm": 0.2554367780685425, + "learning_rate": 7.585167387311802e-05, + "loss": 1.7664, + "step": 11314 + }, + { + "epoch": 3.472989564149785, + "grad_norm": 0.3193204700946808, + "learning_rate": 7.584741911923748e-05, + "loss": 1.7487, + "step": 11315 + }, + { + "epoch": 3.47329650092081, + "grad_norm": 0.3227958679199219, + "learning_rate": 7.584316410991759e-05, + "loss": 1.8107, + "step": 11316 + }, + { 
+ "epoch": 3.4736034376918354, + "grad_norm": 0.33891916275024414, + "learning_rate": 7.58389088452004e-05, + "loss": 1.8466, + "step": 11317 + }, + { + "epoch": 3.4739103744628608, + "grad_norm": 0.27050724625587463, + "learning_rate": 7.583465332512797e-05, + "loss": 1.7877, + "step": 11318 + }, + { + "epoch": 3.4742173112338857, + "grad_norm": 0.2935837209224701, + "learning_rate": 7.583039754974235e-05, + "loss": 1.7932, + "step": 11319 + }, + { + "epoch": 3.474524248004911, + "grad_norm": 0.27780550718307495, + "learning_rate": 7.582614151908561e-05, + "loss": 1.8374, + "step": 11320 + }, + { + "epoch": 3.4748311847759363, + "grad_norm": 0.2579033076763153, + "learning_rate": 7.58218852331998e-05, + "loss": 1.7305, + "step": 11321 + }, + { + "epoch": 3.4751381215469612, + "grad_norm": 0.2531716227531433, + "learning_rate": 7.581762869212699e-05, + "loss": 1.8136, + "step": 11322 + }, + { + "epoch": 3.4754450583179866, + "grad_norm": 0.25504544377326965, + "learning_rate": 7.581337189590924e-05, + "loss": 1.787, + "step": 11323 + }, + { + "epoch": 3.475751995089012, + "grad_norm": 0.23659855127334595, + "learning_rate": 7.580911484458861e-05, + "loss": 1.77, + "step": 11324 + }, + { + "epoch": 3.476058931860037, + "grad_norm": 0.22556856274604797, + "learning_rate": 7.580485753820721e-05, + "loss": 1.7808, + "step": 11325 + }, + { + "epoch": 3.476365868631062, + "grad_norm": 0.2860291600227356, + "learning_rate": 7.580059997680705e-05, + "loss": 1.8224, + "step": 11326 + }, + { + "epoch": 3.476672805402087, + "grad_norm": 0.3134596645832062, + "learning_rate": 7.579634216043023e-05, + "loss": 1.8278, + "step": 11327 + }, + { + "epoch": 3.4769797421731123, + "grad_norm": 0.2883087992668152, + "learning_rate": 7.579208408911887e-05, + "loss": 1.7917, + "step": 11328 + }, + { + "epoch": 3.4772866789441377, + "grad_norm": 0.2743333578109741, + "learning_rate": 7.578782576291501e-05, + "loss": 1.8228, + "step": 11329 + }, + { + "epoch": 3.4775936157151626, + 
"grad_norm": 0.25026053190231323, + "learning_rate": 7.578356718186073e-05, + "loss": 1.7717, + "step": 11330 + }, + { + "epoch": 3.477900552486188, + "grad_norm": 0.246905118227005, + "learning_rate": 7.577930834599813e-05, + "loss": 1.7979, + "step": 11331 + }, + { + "epoch": 3.478207489257213, + "grad_norm": 0.24709418416023254, + "learning_rate": 7.577504925536929e-05, + "loss": 1.8111, + "step": 11332 + }, + { + "epoch": 3.478514426028238, + "grad_norm": 0.25685814023017883, + "learning_rate": 7.577078991001632e-05, + "loss": 1.8255, + "step": 11333 + }, + { + "epoch": 3.4788213627992635, + "grad_norm": 0.23937836289405823, + "learning_rate": 7.576653030998129e-05, + "loss": 1.7254, + "step": 11334 + }, + { + "epoch": 3.4791282995702884, + "grad_norm": 0.22638650238513947, + "learning_rate": 7.57622704553063e-05, + "loss": 1.7847, + "step": 11335 + }, + { + "epoch": 3.4794352363413137, + "grad_norm": 0.26083993911743164, + "learning_rate": 7.575801034603347e-05, + "loss": 1.7947, + "step": 11336 + }, + { + "epoch": 3.479742173112339, + "grad_norm": 0.2715466022491455, + "learning_rate": 7.575374998220488e-05, + "loss": 1.848, + "step": 11337 + }, + { + "epoch": 3.480049109883364, + "grad_norm": 0.25554224848747253, + "learning_rate": 7.574948936386262e-05, + "loss": 1.7811, + "step": 11338 + }, + { + "epoch": 3.4803560466543892, + "grad_norm": 0.2689397931098938, + "learning_rate": 7.574522849104882e-05, + "loss": 1.82, + "step": 11339 + }, + { + "epoch": 3.4806629834254146, + "grad_norm": 0.25027474761009216, + "learning_rate": 7.57409673638056e-05, + "loss": 1.775, + "step": 11340 + }, + { + "epoch": 3.4809699201964395, + "grad_norm": 0.2545457184314728, + "learning_rate": 7.573670598217504e-05, + "loss": 1.8056, + "step": 11341 + }, + { + "epoch": 3.481276856967465, + "grad_norm": 0.28404027223587036, + "learning_rate": 7.573244434619928e-05, + "loss": 1.8372, + "step": 11342 + }, + { + "epoch": 3.4815837937384897, + "grad_norm": 0.28046950697898865, + 
"learning_rate": 7.572818245592041e-05, + "loss": 1.7851, + "step": 11343 + }, + { + "epoch": 3.481890730509515, + "grad_norm": 0.23005759716033936, + "learning_rate": 7.572392031138056e-05, + "loss": 1.7059, + "step": 11344 + }, + { + "epoch": 3.4821976672805404, + "grad_norm": 0.2931719124317169, + "learning_rate": 7.571965791262185e-05, + "loss": 1.84, + "step": 11345 + }, + { + "epoch": 3.4825046040515653, + "grad_norm": 0.4399266242980957, + "learning_rate": 7.571539525968642e-05, + "loss": 1.7465, + "step": 11346 + }, + { + "epoch": 3.4828115408225906, + "grad_norm": 0.48957565426826477, + "learning_rate": 7.571113235261638e-05, + "loss": 1.8494, + "step": 11347 + }, + { + "epoch": 3.4831184775936155, + "grad_norm": 0.37828895449638367, + "learning_rate": 7.570686919145385e-05, + "loss": 1.7598, + "step": 11348 + }, + { + "epoch": 3.483425414364641, + "grad_norm": 0.22943973541259766, + "learning_rate": 7.570260577624098e-05, + "loss": 1.7443, + "step": 11349 + }, + { + "epoch": 3.483732351135666, + "grad_norm": 0.3245384991168976, + "learning_rate": 7.569834210701987e-05, + "loss": 1.7232, + "step": 11350 + }, + { + "epoch": 3.484039287906691, + "grad_norm": 0.4419693648815155, + "learning_rate": 7.569407818383271e-05, + "loss": 1.841, + "step": 11351 + }, + { + "epoch": 3.4843462246777164, + "grad_norm": 0.4061864912509918, + "learning_rate": 7.568981400672159e-05, + "loss": 1.8274, + "step": 11352 + }, + { + "epoch": 3.4846531614487417, + "grad_norm": 0.2609417736530304, + "learning_rate": 7.56855495757287e-05, + "loss": 1.8631, + "step": 11353 + }, + { + "epoch": 3.4849600982197666, + "grad_norm": 0.28758567571640015, + "learning_rate": 7.568128489089612e-05, + "loss": 1.8169, + "step": 11354 + }, + { + "epoch": 3.485267034990792, + "grad_norm": 0.40643060207366943, + "learning_rate": 7.567701995226606e-05, + "loss": 1.809, + "step": 11355 + }, + { + "epoch": 3.4855739717618173, + "grad_norm": 0.37649446725845337, + "learning_rate": 7.56727547598806e-05, 
+ "loss": 1.7661, + "step": 11356 + }, + { + "epoch": 3.485880908532842, + "grad_norm": 0.22863779962062836, + "learning_rate": 7.566848931378197e-05, + "loss": 1.808, + "step": 11357 + }, + { + "epoch": 3.4861878453038675, + "grad_norm": 0.4487019181251526, + "learning_rate": 7.566422361401226e-05, + "loss": 1.7627, + "step": 11358 + }, + { + "epoch": 3.4864947820748924, + "grad_norm": 0.4583640694618225, + "learning_rate": 7.565995766061367e-05, + "loss": 1.8186, + "step": 11359 + }, + { + "epoch": 3.4868017188459177, + "grad_norm": 0.27231526374816895, + "learning_rate": 7.565569145362833e-05, + "loss": 1.8465, + "step": 11360 + }, + { + "epoch": 3.487108655616943, + "grad_norm": 0.3877887725830078, + "learning_rate": 7.565142499309841e-05, + "loss": 1.7668, + "step": 11361 + }, + { + "epoch": 3.487415592387968, + "grad_norm": 0.5511242747306824, + "learning_rate": 7.564715827906606e-05, + "loss": 1.8417, + "step": 11362 + }, + { + "epoch": 3.4877225291589933, + "grad_norm": 0.5112231373786926, + "learning_rate": 7.564289131157348e-05, + "loss": 1.8038, + "step": 11363 + }, + { + "epoch": 3.488029465930018, + "grad_norm": 0.279502809047699, + "learning_rate": 7.56386240906628e-05, + "loss": 1.7545, + "step": 11364 + }, + { + "epoch": 3.4883364027010435, + "grad_norm": 0.30080464482307434, + "learning_rate": 7.563435661637623e-05, + "loss": 1.8136, + "step": 11365 + }, + { + "epoch": 3.488643339472069, + "grad_norm": 0.4424717128276825, + "learning_rate": 7.563008888875591e-05, + "loss": 1.7542, + "step": 11366 + }, + { + "epoch": 3.4889502762430937, + "grad_norm": 0.42144715785980225, + "learning_rate": 7.562582090784403e-05, + "loss": 1.8245, + "step": 11367 + }, + { + "epoch": 3.489257213014119, + "grad_norm": 0.2533668875694275, + "learning_rate": 7.562155267368277e-05, + "loss": 1.8654, + "step": 11368 + }, + { + "epoch": 3.4895641497851444, + "grad_norm": 0.3327534794807434, + "learning_rate": 7.56172841863143e-05, + "loss": 1.7882, + "step": 11369 + }, + { 
+ "epoch": 3.4898710865561693, + "grad_norm": 0.44001486897468567, + "learning_rate": 7.561301544578081e-05, + "loss": 1.8397, + "step": 11370 + }, + { + "epoch": 3.4901780233271946, + "grad_norm": 0.2779090106487274, + "learning_rate": 7.56087464521245e-05, + "loss": 1.7398, + "step": 11371 + }, + { + "epoch": 3.49048496009822, + "grad_norm": 0.3018067479133606, + "learning_rate": 7.560447720538755e-05, + "loss": 1.8076, + "step": 11372 + }, + { + "epoch": 3.490791896869245, + "grad_norm": 0.4370935261249542, + "learning_rate": 7.560020770561216e-05, + "loss": 1.8057, + "step": 11373 + }, + { + "epoch": 3.49109883364027, + "grad_norm": 0.2936978042125702, + "learning_rate": 7.559593795284047e-05, + "loss": 1.7726, + "step": 11374 + }, + { + "epoch": 3.491405770411295, + "grad_norm": 0.28825095295906067, + "learning_rate": 7.559166794711476e-05, + "loss": 1.8039, + "step": 11375 + }, + { + "epoch": 3.4917127071823204, + "grad_norm": 0.39334073662757874, + "learning_rate": 7.55873976884772e-05, + "loss": 1.8388, + "step": 11376 + }, + { + "epoch": 3.4920196439533457, + "grad_norm": 0.33880460262298584, + "learning_rate": 7.558312717696995e-05, + "loss": 1.7791, + "step": 11377 + }, + { + "epoch": 3.4923265807243706, + "grad_norm": 0.4433762729167938, + "learning_rate": 7.557885641263524e-05, + "loss": 1.7786, + "step": 11378 + }, + { + "epoch": 3.492633517495396, + "grad_norm": 0.4710264205932617, + "learning_rate": 7.557458539551527e-05, + "loss": 1.7193, + "step": 11379 + }, + { + "epoch": 3.4929404542664213, + "grad_norm": 0.27514326572418213, + "learning_rate": 7.557031412565228e-05, + "loss": 1.823, + "step": 11380 + }, + { + "epoch": 3.493247391037446, + "grad_norm": 0.4681413471698761, + "learning_rate": 7.556604260308846e-05, + "loss": 1.7598, + "step": 11381 + }, + { + "epoch": 3.4935543278084715, + "grad_norm": 0.5032503604888916, + "learning_rate": 7.556177082786602e-05, + "loss": 1.741, + "step": 11382 + }, + { + "epoch": 3.493861264579497, + 
"grad_norm": 0.2677086889743805, + "learning_rate": 7.555749880002716e-05, + "loss": 1.8528, + "step": 11383 + }, + { + "epoch": 3.4941682013505218, + "grad_norm": 0.43870940804481506, + "learning_rate": 7.555322651961414e-05, + "loss": 1.7632, + "step": 11384 + }, + { + "epoch": 3.494475138121547, + "grad_norm": 0.5403209924697876, + "learning_rate": 7.554895398666914e-05, + "loss": 1.8181, + "step": 11385 + }, + { + "epoch": 3.494782074892572, + "grad_norm": 0.2714318335056305, + "learning_rate": 7.554468120123441e-05, + "loss": 1.8151, + "step": 11386 + }, + { + "epoch": 3.4950890116635973, + "grad_norm": 0.49661698937416077, + "learning_rate": 7.554040816335217e-05, + "loss": 1.8116, + "step": 11387 + }, + { + "epoch": 3.4953959484346226, + "grad_norm": 0.49954715371131897, + "learning_rate": 7.553613487306465e-05, + "loss": 1.8841, + "step": 11388 + }, + { + "epoch": 3.4957028852056475, + "grad_norm": 0.28189441561698914, + "learning_rate": 7.553186133041406e-05, + "loss": 1.7834, + "step": 11389 + }, + { + "epoch": 3.496009821976673, + "grad_norm": 0.36029115319252014, + "learning_rate": 7.552758753544267e-05, + "loss": 1.7796, + "step": 11390 + }, + { + "epoch": 3.4963167587476978, + "grad_norm": 0.45023465156555176, + "learning_rate": 7.552331348819268e-05, + "loss": 1.8773, + "step": 11391 + }, + { + "epoch": 3.496623695518723, + "grad_norm": 0.3235788643360138, + "learning_rate": 7.551903918870636e-05, + "loss": 1.7984, + "step": 11392 + }, + { + "epoch": 3.4969306322897484, + "grad_norm": 0.25656190514564514, + "learning_rate": 7.551476463702596e-05, + "loss": 1.8403, + "step": 11393 + }, + { + "epoch": 3.4972375690607733, + "grad_norm": 0.2866458594799042, + "learning_rate": 7.551048983319366e-05, + "loss": 1.7428, + "step": 11394 + }, + { + "epoch": 3.4975445058317987, + "grad_norm": 0.2713877856731415, + "learning_rate": 7.550621477725177e-05, + "loss": 1.8508, + "step": 11395 + }, + { + "epoch": 3.497851442602824, + "grad_norm": 0.27978867292404175, 
+ "learning_rate": 7.55019394692425e-05, + "loss": 1.8049, + "step": 11396 + }, + { + "epoch": 3.498158379373849, + "grad_norm": 0.3275020122528076, + "learning_rate": 7.549766390920814e-05, + "loss": 1.8553, + "step": 11397 + }, + { + "epoch": 3.498465316144874, + "grad_norm": 0.29947492480278015, + "learning_rate": 7.54933880971909e-05, + "loss": 1.7614, + "step": 11398 + }, + { + "epoch": 3.4987722529158995, + "grad_norm": 0.25790849328041077, + "learning_rate": 7.548911203323308e-05, + "loss": 1.8223, + "step": 11399 + }, + { + "epoch": 3.4990791896869244, + "grad_norm": 0.3145451545715332, + "learning_rate": 7.54848357173769e-05, + "loss": 1.7642, + "step": 11400 + }, + { + "epoch": 3.4993861264579498, + "grad_norm": 0.29052913188934326, + "learning_rate": 7.548055914966463e-05, + "loss": 1.7728, + "step": 11401 + }, + { + "epoch": 3.4996930632289747, + "grad_norm": 0.2741037905216217, + "learning_rate": 7.547628233013854e-05, + "loss": 1.7382, + "step": 11402 + }, + { + "epoch": 3.5, + "grad_norm": 0.2562723755836487, + "learning_rate": 7.54720052588409e-05, + "loss": 1.7455, + "step": 11403 + }, + { + "epoch": 3.5003069367710253, + "grad_norm": 0.27649983763694763, + "learning_rate": 7.546772793581398e-05, + "loss": 1.7194, + "step": 11404 + }, + { + "epoch": 3.5006138735420502, + "grad_norm": 0.27290579676628113, + "learning_rate": 7.546345036110004e-05, + "loss": 1.87, + "step": 11405 + }, + { + "epoch": 3.5009208103130756, + "grad_norm": 0.33585605025291443, + "learning_rate": 7.545917253474136e-05, + "loss": 1.7703, + "step": 11406 + }, + { + "epoch": 3.5012277470841005, + "grad_norm": 0.2592691481113434, + "learning_rate": 7.545489445678022e-05, + "loss": 1.7657, + "step": 11407 + }, + { + "epoch": 3.501534683855126, + "grad_norm": 0.3081367015838623, + "learning_rate": 7.545061612725888e-05, + "loss": 1.8067, + "step": 11408 + }, + { + "epoch": 3.501841620626151, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.544633754621965e-05, + "loss": 
1.8009, + "step": 11409 + }, + { + "epoch": 3.5021485573971765, + "grad_norm": 0.28232479095458984, + "learning_rate": 7.54420587137048e-05, + "loss": 1.8124, + "step": 11410 + }, + { + "epoch": 3.5024554941682013, + "grad_norm": 0.24079222977161407, + "learning_rate": 7.54377796297566e-05, + "loss": 1.789, + "step": 11411 + }, + { + "epoch": 3.5027624309392267, + "grad_norm": 0.27347204089164734, + "learning_rate": 7.543350029441737e-05, + "loss": 1.7704, + "step": 11412 + }, + { + "epoch": 3.5030693677102516, + "grad_norm": 0.25545811653137207, + "learning_rate": 7.542922070772935e-05, + "loss": 1.7871, + "step": 11413 + }, + { + "epoch": 3.503376304481277, + "grad_norm": 0.2507263123989105, + "learning_rate": 7.54249408697349e-05, + "loss": 1.8424, + "step": 11414 + }, + { + "epoch": 3.5036832412523022, + "grad_norm": 0.2776084244251251, + "learning_rate": 7.542066078047627e-05, + "loss": 1.8246, + "step": 11415 + }, + { + "epoch": 3.503990178023327, + "grad_norm": 0.32833749055862427, + "learning_rate": 7.541638043999577e-05, + "loss": 1.7785, + "step": 11416 + }, + { + "epoch": 3.5042971147943525, + "grad_norm": 0.258486270904541, + "learning_rate": 7.541209984833571e-05, + "loss": 1.7543, + "step": 11417 + }, + { + "epoch": 3.5046040515653774, + "grad_norm": 0.25825178623199463, + "learning_rate": 7.540781900553837e-05, + "loss": 1.7939, + "step": 11418 + }, + { + "epoch": 3.5049109883364027, + "grad_norm": 0.26980888843536377, + "learning_rate": 7.540353791164606e-05, + "loss": 1.7777, + "step": 11419 + }, + { + "epoch": 3.505217925107428, + "grad_norm": 0.24103333055973053, + "learning_rate": 7.539925656670111e-05, + "loss": 1.7565, + "step": 11420 + }, + { + "epoch": 3.505524861878453, + "grad_norm": 0.25192007422447205, + "learning_rate": 7.539497497074584e-05, + "loss": 1.7696, + "step": 11421 + }, + { + "epoch": 3.5058317986494782, + "grad_norm": 0.218489870429039, + "learning_rate": 7.539069312382252e-05, + "loss": 1.761, + "step": 11422 + }, + { + 
"epoch": 3.506138735420503, + "grad_norm": 0.27533552050590515, + "learning_rate": 7.53864110259735e-05, + "loss": 1.7374, + "step": 11423 + }, + { + "epoch": 3.5064456721915285, + "grad_norm": 0.2603490650653839, + "learning_rate": 7.538212867724108e-05, + "loss": 1.8342, + "step": 11424 + }, + { + "epoch": 3.506752608962554, + "grad_norm": 0.27340635657310486, + "learning_rate": 7.537784607766758e-05, + "loss": 1.8099, + "step": 11425 + }, + { + "epoch": 3.507059545733579, + "grad_norm": 0.25342679023742676, + "learning_rate": 7.537356322729537e-05, + "loss": 1.7949, + "step": 11426 + }, + { + "epoch": 3.507366482504604, + "grad_norm": 0.292819082736969, + "learning_rate": 7.536928012616669e-05, + "loss": 1.9049, + "step": 11427 + }, + { + "epoch": 3.5076734192756294, + "grad_norm": 0.28256532549858093, + "learning_rate": 7.536499677432393e-05, + "loss": 1.8464, + "step": 11428 + }, + { + "epoch": 3.5079803560466543, + "grad_norm": 0.2672989070415497, + "learning_rate": 7.536071317180942e-05, + "loss": 1.8301, + "step": 11429 + }, + { + "epoch": 3.5082872928176796, + "grad_norm": 0.2525518238544464, + "learning_rate": 7.535642931866546e-05, + "loss": 1.8054, + "step": 11430 + }, + { + "epoch": 3.508594229588705, + "grad_norm": 0.2622447609901428, + "learning_rate": 7.535214521493442e-05, + "loss": 1.8293, + "step": 11431 + }, + { + "epoch": 3.50890116635973, + "grad_norm": 0.27057385444641113, + "learning_rate": 7.534786086065859e-05, + "loss": 1.7426, + "step": 11432 + }, + { + "epoch": 3.509208103130755, + "grad_norm": 0.27363866567611694, + "learning_rate": 7.534357625588038e-05, + "loss": 1.7138, + "step": 11433 + }, + { + "epoch": 3.50951503990178, + "grad_norm": 0.3029060363769531, + "learning_rate": 7.533929140064207e-05, + "loss": 1.864, + "step": 11434 + }, + { + "epoch": 3.5098219766728054, + "grad_norm": 0.3144821524620056, + "learning_rate": 7.533500629498604e-05, + "loss": 1.7846, + "step": 11435 + }, + { + "epoch": 3.5101289134438307, + "grad_norm": 
0.44535213708877563, + "learning_rate": 7.533072093895461e-05, + "loss": 1.799, + "step": 11436 + }, + { + "epoch": 3.5104358502148556, + "grad_norm": 0.25344160199165344, + "learning_rate": 7.532643533259017e-05, + "loss": 1.7391, + "step": 11437 + }, + { + "epoch": 3.510742786985881, + "grad_norm": 0.286026269197464, + "learning_rate": 7.532214947593506e-05, + "loss": 1.8436, + "step": 11438 + }, + { + "epoch": 3.511049723756906, + "grad_norm": 0.3317352533340454, + "learning_rate": 7.53178633690316e-05, + "loss": 1.8507, + "step": 11439 + }, + { + "epoch": 3.511356660527931, + "grad_norm": 0.2547265589237213, + "learning_rate": 7.53135770119222e-05, + "loss": 1.7483, + "step": 11440 + }, + { + "epoch": 3.5116635972989565, + "grad_norm": 0.24281835556030273, + "learning_rate": 7.530929040464917e-05, + "loss": 1.759, + "step": 11441 + }, + { + "epoch": 3.511970534069982, + "grad_norm": 0.2935381829738617, + "learning_rate": 7.530500354725491e-05, + "loss": 1.8235, + "step": 11442 + }, + { + "epoch": 3.5122774708410067, + "grad_norm": 0.26642969250679016, + "learning_rate": 7.53007164397818e-05, + "loss": 1.8324, + "step": 11443 + }, + { + "epoch": 3.512584407612032, + "grad_norm": 0.24830882251262665, + "learning_rate": 7.529642908227215e-05, + "loss": 1.8132, + "step": 11444 + }, + { + "epoch": 3.512891344383057, + "grad_norm": 0.3100191056728363, + "learning_rate": 7.529214147476838e-05, + "loss": 1.8453, + "step": 11445 + }, + { + "epoch": 3.5131982811540823, + "grad_norm": 0.27948811650276184, + "learning_rate": 7.528785361731282e-05, + "loss": 1.7792, + "step": 11446 + }, + { + "epoch": 3.5135052179251076, + "grad_norm": 0.26978832483291626, + "learning_rate": 7.528356550994787e-05, + "loss": 1.7857, + "step": 11447 + }, + { + "epoch": 3.5138121546961325, + "grad_norm": 0.30527836084365845, + "learning_rate": 7.527927715271592e-05, + "loss": 1.807, + "step": 11448 + }, + { + "epoch": 3.514119091467158, + "grad_norm": 0.2915664315223694, + "learning_rate": 
7.527498854565934e-05, + "loss": 1.8414, + "step": 11449 + }, + { + "epoch": 3.5144260282381827, + "grad_norm": 0.2854034900665283, + "learning_rate": 7.52706996888205e-05, + "loss": 1.793, + "step": 11450 + }, + { + "epoch": 3.514732965009208, + "grad_norm": 0.30281978845596313, + "learning_rate": 7.52664105822418e-05, + "loss": 1.7896, + "step": 11451 + }, + { + "epoch": 3.5150399017802334, + "grad_norm": 0.3317166566848755, + "learning_rate": 7.526212122596561e-05, + "loss": 1.7776, + "step": 11452 + }, + { + "epoch": 3.5153468385512583, + "grad_norm": 0.3400021195411682, + "learning_rate": 7.525783162003434e-05, + "loss": 1.8411, + "step": 11453 + }, + { + "epoch": 3.5156537753222836, + "grad_norm": 0.25169485807418823, + "learning_rate": 7.525354176449037e-05, + "loss": 1.7871, + "step": 11454 + }, + { + "epoch": 3.5159607120933085, + "grad_norm": 0.3442455530166626, + "learning_rate": 7.52492516593761e-05, + "loss": 1.7644, + "step": 11455 + }, + { + "epoch": 3.516267648864334, + "grad_norm": 0.35644033551216125, + "learning_rate": 7.524496130473394e-05, + "loss": 1.801, + "step": 11456 + }, + { + "epoch": 3.516574585635359, + "grad_norm": 0.3180185854434967, + "learning_rate": 7.524067070060625e-05, + "loss": 1.7897, + "step": 11457 + }, + { + "epoch": 3.5168815224063845, + "grad_norm": 0.2417978048324585, + "learning_rate": 7.523637984703548e-05, + "loss": 1.8527, + "step": 11458 + }, + { + "epoch": 3.5171884591774094, + "grad_norm": 0.29661375284194946, + "learning_rate": 7.5232088744064e-05, + "loss": 1.8276, + "step": 11459 + }, + { + "epoch": 3.5174953959484347, + "grad_norm": 0.2467545121908188, + "learning_rate": 7.522779739173424e-05, + "loss": 1.7819, + "step": 11460 + }, + { + "epoch": 3.5178023327194596, + "grad_norm": 0.26177898049354553, + "learning_rate": 7.522350579008859e-05, + "loss": 1.8017, + "step": 11461 + }, + { + "epoch": 3.518109269490485, + "grad_norm": 0.28740498423576355, + "learning_rate": 7.521921393916948e-05, + "loss": 1.7863, 
+ "step": 11462 + }, + { + "epoch": 3.5184162062615103, + "grad_norm": 0.28685200214385986, + "learning_rate": 7.521492183901932e-05, + "loss": 1.8069, + "step": 11463 + }, + { + "epoch": 3.518723143032535, + "grad_norm": 0.24174338579177856, + "learning_rate": 7.521062948968051e-05, + "loss": 1.7523, + "step": 11464 + }, + { + "epoch": 3.5190300798035605, + "grad_norm": 0.23273243010044098, + "learning_rate": 7.520633689119548e-05, + "loss": 1.7827, + "step": 11465 + }, + { + "epoch": 3.5193370165745854, + "grad_norm": 0.22708217799663544, + "learning_rate": 7.520204404360667e-05, + "loss": 1.7377, + "step": 11466 + }, + { + "epoch": 3.5196439533456108, + "grad_norm": 0.24725353717803955, + "learning_rate": 7.519775094695649e-05, + "loss": 1.7828, + "step": 11467 + }, + { + "epoch": 3.519950890116636, + "grad_norm": 0.23046265542507172, + "learning_rate": 7.519345760128736e-05, + "loss": 1.7427, + "step": 11468 + }, + { + "epoch": 3.520257826887661, + "grad_norm": 0.2618728280067444, + "learning_rate": 7.518916400664171e-05, + "loss": 1.8133, + "step": 11469 + }, + { + "epoch": 3.5205647636586863, + "grad_norm": 0.23232363164424896, + "learning_rate": 7.5184870163062e-05, + "loss": 1.7468, + "step": 11470 + }, + { + "epoch": 3.520871700429711, + "grad_norm": 0.21993626654148102, + "learning_rate": 7.51805760705906e-05, + "loss": 1.7565, + "step": 11471 + }, + { + "epoch": 3.5211786372007365, + "grad_norm": 0.23563124239444733, + "learning_rate": 7.517628172927001e-05, + "loss": 1.7795, + "step": 11472 + }, + { + "epoch": 3.521485573971762, + "grad_norm": 0.24502862989902496, + "learning_rate": 7.517198713914266e-05, + "loss": 1.813, + "step": 11473 + }, + { + "epoch": 3.521792510742787, + "grad_norm": 0.24745969474315643, + "learning_rate": 7.516769230025097e-05, + "loss": 1.7601, + "step": 11474 + }, + { + "epoch": 3.522099447513812, + "grad_norm": 0.27686986327171326, + "learning_rate": 7.516339721263739e-05, + "loss": 1.8121, + "step": 11475 + }, + { + "epoch": 
3.5224063842848374, + "grad_norm": 0.3110332787036896, + "learning_rate": 7.515910187634439e-05, + "loss": 1.7978, + "step": 11476 + }, + { + "epoch": 3.5227133210558623, + "grad_norm": 0.3394792377948761, + "learning_rate": 7.515480629141436e-05, + "loss": 1.8427, + "step": 11477 + }, + { + "epoch": 3.5230202578268877, + "grad_norm": 0.2802537679672241, + "learning_rate": 7.515051045788984e-05, + "loss": 1.7343, + "step": 11478 + }, + { + "epoch": 3.523327194597913, + "grad_norm": 0.23687711358070374, + "learning_rate": 7.514621437581319e-05, + "loss": 1.7786, + "step": 11479 + }, + { + "epoch": 3.523634131368938, + "grad_norm": 0.31114310026168823, + "learning_rate": 7.514191804522693e-05, + "loss": 1.8137, + "step": 11480 + }, + { + "epoch": 3.523941068139963, + "grad_norm": 0.3257891833782196, + "learning_rate": 7.513762146617351e-05, + "loss": 1.8015, + "step": 11481 + }, + { + "epoch": 3.524248004910988, + "grad_norm": 0.24353443086147308, + "learning_rate": 7.513332463869536e-05, + "loss": 1.7485, + "step": 11482 + }, + { + "epoch": 3.5245549416820134, + "grad_norm": 0.29861485958099365, + "learning_rate": 7.512902756283498e-05, + "loss": 1.7993, + "step": 11483 + }, + { + "epoch": 3.5248618784530388, + "grad_norm": 0.40380924940109253, + "learning_rate": 7.51247302386348e-05, + "loss": 1.7664, + "step": 11484 + }, + { + "epoch": 3.525168815224064, + "grad_norm": 0.3365862965583801, + "learning_rate": 7.512043266613733e-05, + "loss": 1.7512, + "step": 11485 + }, + { + "epoch": 3.525475751995089, + "grad_norm": 0.2502824068069458, + "learning_rate": 7.511613484538502e-05, + "loss": 1.8414, + "step": 11486 + }, + { + "epoch": 3.5257826887661143, + "grad_norm": 0.2598603069782257, + "learning_rate": 7.511183677642034e-05, + "loss": 1.7358, + "step": 11487 + }, + { + "epoch": 3.5260896255371392, + "grad_norm": 0.30246880650520325, + "learning_rate": 7.510753845928576e-05, + "loss": 1.791, + "step": 11488 + }, + { + "epoch": 3.5263965623081646, + "grad_norm": 
0.25170832872390747, + "learning_rate": 7.510323989402378e-05, + "loss": 1.7498, + "step": 11489 + }, + { + "epoch": 3.52670349907919, + "grad_norm": 0.2925282418727875, + "learning_rate": 7.509894108067688e-05, + "loss": 1.8413, + "step": 11490 + }, + { + "epoch": 3.527010435850215, + "grad_norm": 0.2643601596355438, + "learning_rate": 7.509464201928752e-05, + "loss": 1.8052, + "step": 11491 + }, + { + "epoch": 3.52731737262124, + "grad_norm": 0.2938917279243469, + "learning_rate": 7.50903427098982e-05, + "loss": 1.7308, + "step": 11492 + }, + { + "epoch": 3.527624309392265, + "grad_norm": 0.2978343367576599, + "learning_rate": 7.508604315255142e-05, + "loss": 1.8147, + "step": 11493 + }, + { + "epoch": 3.5279312461632903, + "grad_norm": 0.2507816255092621, + "learning_rate": 7.508174334728963e-05, + "loss": 1.774, + "step": 11494 + }, + { + "epoch": 3.5282381829343157, + "grad_norm": 0.32971861958503723, + "learning_rate": 7.507744329415538e-05, + "loss": 1.7634, + "step": 11495 + }, + { + "epoch": 3.5285451197053406, + "grad_norm": 0.3149639964103699, + "learning_rate": 7.507314299319113e-05, + "loss": 1.8032, + "step": 11496 + }, + { + "epoch": 3.528852056476366, + "grad_norm": 0.2721364498138428, + "learning_rate": 7.506884244443937e-05, + "loss": 1.7702, + "step": 11497 + }, + { + "epoch": 3.529158993247391, + "grad_norm": 0.29375985264778137, + "learning_rate": 7.506454164794263e-05, + "loss": 1.8673, + "step": 11498 + }, + { + "epoch": 3.529465930018416, + "grad_norm": 0.379944384098053, + "learning_rate": 7.50602406037434e-05, + "loss": 1.883, + "step": 11499 + }, + { + "epoch": 3.5297728667894415, + "grad_norm": 0.4041840136051178, + "learning_rate": 7.505593931188417e-05, + "loss": 1.7998, + "step": 11500 + }, + { + "epoch": 3.530079803560467, + "grad_norm": 0.30013784766197205, + "learning_rate": 7.505163777240747e-05, + "loss": 1.775, + "step": 11501 + }, + { + "epoch": 3.5303867403314917, + "grad_norm": 0.25161153078079224, + "learning_rate": 
7.50473359853558e-05, + "loss": 1.8609, + "step": 11502 + }, + { + "epoch": 3.530693677102517, + "grad_norm": 0.2803831100463867, + "learning_rate": 7.504303395077168e-05, + "loss": 1.8397, + "step": 11503 + }, + { + "epoch": 3.531000613873542, + "grad_norm": 0.26678118109703064, + "learning_rate": 7.503873166869762e-05, + "loss": 1.7877, + "step": 11504 + }, + { + "epoch": 3.5313075506445673, + "grad_norm": 0.24280449748039246, + "learning_rate": 7.503442913917613e-05, + "loss": 1.7891, + "step": 11505 + }, + { + "epoch": 3.5316144874155926, + "grad_norm": 0.26461485028266907, + "learning_rate": 7.503012636224976e-05, + "loss": 1.7993, + "step": 11506 + }, + { + "epoch": 3.5319214241866175, + "grad_norm": 0.27001824975013733, + "learning_rate": 7.502582333796098e-05, + "loss": 1.7719, + "step": 11507 + }, + { + "epoch": 3.532228360957643, + "grad_norm": 0.27585846185684204, + "learning_rate": 7.502152006635237e-05, + "loss": 1.7412, + "step": 11508 + }, + { + "epoch": 3.5325352977286677, + "grad_norm": 0.24896648526191711, + "learning_rate": 7.501721654746643e-05, + "loss": 1.7459, + "step": 11509 + }, + { + "epoch": 3.532842234499693, + "grad_norm": 0.2308502197265625, + "learning_rate": 7.501291278134569e-05, + "loss": 1.7717, + "step": 11510 + }, + { + "epoch": 3.5331491712707184, + "grad_norm": 0.3026069104671478, + "learning_rate": 7.500860876803267e-05, + "loss": 1.8578, + "step": 11511 + }, + { + "epoch": 3.5334561080417433, + "grad_norm": 0.30242082476615906, + "learning_rate": 7.500430450756995e-05, + "loss": 1.7793, + "step": 11512 + }, + { + "epoch": 3.5337630448127686, + "grad_norm": 0.2583339214324951, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8388, + "step": 11513 + }, + { + "epoch": 3.5340699815837935, + "grad_norm": 0.29673871397972107, + "learning_rate": 7.499569524536542e-05, + "loss": 1.7749, + "step": 11514 + }, + { + "epoch": 3.534376918354819, + "grad_norm": 0.35199788212776184, + "learning_rate": 7.499139024370874e-05, + "loss": 
1.7863, + "step": 11515 + }, + { + "epoch": 3.534683855125844, + "grad_norm": 0.25776436924934387, + "learning_rate": 7.498708499507247e-05, + "loss": 1.7568, + "step": 11516 + }, + { + "epoch": 3.5349907918968695, + "grad_norm": 0.26081520318984985, + "learning_rate": 7.498277949949919e-05, + "loss": 1.807, + "step": 11517 + }, + { + "epoch": 3.5352977286678944, + "grad_norm": 0.29247912764549255, + "learning_rate": 7.497847375703145e-05, + "loss": 1.7568, + "step": 11518 + }, + { + "epoch": 3.5356046654389197, + "grad_norm": 0.20964498817920685, + "learning_rate": 7.497416776771178e-05, + "loss": 1.7601, + "step": 11519 + }, + { + "epoch": 3.5359116022099446, + "grad_norm": 0.28739818930625916, + "learning_rate": 7.496986153158273e-05, + "loss": 1.7915, + "step": 11520 + }, + { + "epoch": 3.53621853898097, + "grad_norm": 0.3109932839870453, + "learning_rate": 7.496555504868691e-05, + "loss": 1.8046, + "step": 11521 + }, + { + "epoch": 3.5365254757519953, + "grad_norm": 0.259284108877182, + "learning_rate": 7.496124831906681e-05, + "loss": 1.7595, + "step": 11522 + }, + { + "epoch": 3.53683241252302, + "grad_norm": 0.265909343957901, + "learning_rate": 7.495694134276504e-05, + "loss": 1.8249, + "step": 11523 + }, + { + "epoch": 3.5371393492940455, + "grad_norm": 0.2478799819946289, + "learning_rate": 7.495263411982415e-05, + "loss": 1.8531, + "step": 11524 + }, + { + "epoch": 3.5374462860650704, + "grad_norm": 0.2636432945728302, + "learning_rate": 7.494832665028671e-05, + "loss": 1.8114, + "step": 11525 + }, + { + "epoch": 3.5377532228360957, + "grad_norm": 0.25323864817619324, + "learning_rate": 7.494401893419527e-05, + "loss": 1.8271, + "step": 11526 + }, + { + "epoch": 3.538060159607121, + "grad_norm": 0.2352467179298401, + "learning_rate": 7.493971097159241e-05, + "loss": 1.7524, + "step": 11527 + }, + { + "epoch": 3.538367096378146, + "grad_norm": 0.2788623869419098, + "learning_rate": 7.493540276252072e-05, + "loss": 1.8238, + "step": 11528 + }, + { + 
"epoch": 3.5386740331491713, + "grad_norm": 0.3506326377391815, + "learning_rate": 7.493109430702277e-05, + "loss": 1.8525, + "step": 11529 + }, + { + "epoch": 3.538980969920196, + "grad_norm": 0.3685263395309448, + "learning_rate": 7.492678560514113e-05, + "loss": 1.8497, + "step": 11530 + }, + { + "epoch": 3.5392879066912215, + "grad_norm": 0.32200056314468384, + "learning_rate": 7.492247665691837e-05, + "loss": 1.7587, + "step": 11531 + }, + { + "epoch": 3.539594843462247, + "grad_norm": 0.2800062894821167, + "learning_rate": 7.49181674623971e-05, + "loss": 1.8188, + "step": 11532 + }, + { + "epoch": 3.539901780233272, + "grad_norm": 0.24137580394744873, + "learning_rate": 7.491385802161989e-05, + "loss": 1.7947, + "step": 11533 + }, + { + "epoch": 3.540208717004297, + "grad_norm": 0.21900027990341187, + "learning_rate": 7.490954833462933e-05, + "loss": 1.7722, + "step": 11534 + }, + { + "epoch": 3.5405156537753224, + "grad_norm": 0.25009945034980774, + "learning_rate": 7.490523840146803e-05, + "loss": 1.8173, + "step": 11535 + }, + { + "epoch": 3.5408225905463473, + "grad_norm": 0.2778431475162506, + "learning_rate": 7.490092822217855e-05, + "loss": 1.8368, + "step": 11536 + }, + { + "epoch": 3.5411295273173726, + "grad_norm": 0.2845982611179352, + "learning_rate": 7.48966177968035e-05, + "loss": 1.7539, + "step": 11537 + }, + { + "epoch": 3.541436464088398, + "grad_norm": 0.27480921149253845, + "learning_rate": 7.48923071253855e-05, + "loss": 1.8494, + "step": 11538 + }, + { + "epoch": 3.541743400859423, + "grad_norm": 0.2722087502479553, + "learning_rate": 7.488799620796711e-05, + "loss": 1.8422, + "step": 11539 + }, + { + "epoch": 3.542050337630448, + "grad_norm": 0.2984340190887451, + "learning_rate": 7.488368504459097e-05, + "loss": 1.8042, + "step": 11540 + }, + { + "epoch": 3.542357274401473, + "grad_norm": 0.2405850738286972, + "learning_rate": 7.487937363529966e-05, + "loss": 1.749, + "step": 11541 + }, + { + "epoch": 3.5426642111724984, + "grad_norm": 
0.24816973507404327, + "learning_rate": 7.487506198013579e-05, + "loss": 1.8671, + "step": 11542 + }, + { + "epoch": 3.5429711479435237, + "grad_norm": 0.2796473503112793, + "learning_rate": 7.487075007914199e-05, + "loss": 1.8023, + "step": 11543 + }, + { + "epoch": 3.5432780847145486, + "grad_norm": 0.2600162625312805, + "learning_rate": 7.486643793236086e-05, + "loss": 1.7997, + "step": 11544 + }, + { + "epoch": 3.543585021485574, + "grad_norm": 0.2746226489543915, + "learning_rate": 7.486212553983503e-05, + "loss": 1.7773, + "step": 11545 + }, + { + "epoch": 3.5438919582565993, + "grad_norm": 0.24142079055309296, + "learning_rate": 7.485781290160708e-05, + "loss": 1.791, + "step": 11546 + }, + { + "epoch": 3.544198895027624, + "grad_norm": 0.2472934126853943, + "learning_rate": 7.485350001771966e-05, + "loss": 1.8183, + "step": 11547 + }, + { + "epoch": 3.5445058317986495, + "grad_norm": 0.26891404390335083, + "learning_rate": 7.48491868882154e-05, + "loss": 1.7421, + "step": 11548 + }, + { + "epoch": 3.544812768569675, + "grad_norm": 0.24820464849472046, + "learning_rate": 7.48448735131369e-05, + "loss": 1.7372, + "step": 11549 + }, + { + "epoch": 3.5451197053406998, + "grad_norm": 0.2456594705581665, + "learning_rate": 7.484055989252679e-05, + "loss": 1.7883, + "step": 11550 + }, + { + "epoch": 3.545426642111725, + "grad_norm": 0.32420551776885986, + "learning_rate": 7.48362460264277e-05, + "loss": 1.8363, + "step": 11551 + }, + { + "epoch": 3.54573357888275, + "grad_norm": 0.3187662661075592, + "learning_rate": 7.483193191488229e-05, + "loss": 1.7957, + "step": 11552 + }, + { + "epoch": 3.5460405156537753, + "grad_norm": 0.2845410108566284, + "learning_rate": 7.482761755793316e-05, + "loss": 1.8288, + "step": 11553 + }, + { + "epoch": 3.5463474524248007, + "grad_norm": 0.2816021740436554, + "learning_rate": 7.482330295562298e-05, + "loss": 1.7562, + "step": 11554 + }, + { + "epoch": 3.5466543891958255, + "grad_norm": 0.28938058018684387, + "learning_rate": 
7.481898810799435e-05, + "loss": 1.8139, + "step": 11555 + }, + { + "epoch": 3.546961325966851, + "grad_norm": 0.3305707573890686, + "learning_rate": 7.481467301508995e-05, + "loss": 1.8956, + "step": 11556 + }, + { + "epoch": 3.5472682627378758, + "grad_norm": 0.3890376091003418, + "learning_rate": 7.48103576769524e-05, + "loss": 1.8552, + "step": 11557 + }, + { + "epoch": 3.547575199508901, + "grad_norm": 0.3900652825832367, + "learning_rate": 7.480604209362434e-05, + "loss": 1.7748, + "step": 11558 + }, + { + "epoch": 3.5478821362799264, + "grad_norm": 0.3297326862812042, + "learning_rate": 7.480172626514845e-05, + "loss": 1.8201, + "step": 11559 + }, + { + "epoch": 3.5481890730509518, + "grad_norm": 0.28797218203544617, + "learning_rate": 7.479741019156737e-05, + "loss": 1.7652, + "step": 11560 + }, + { + "epoch": 3.5484960098219767, + "grad_norm": 0.2764691114425659, + "learning_rate": 7.479309387292373e-05, + "loss": 1.7534, + "step": 11561 + }, + { + "epoch": 3.548802946593002, + "grad_norm": 0.25067585706710815, + "learning_rate": 7.47887773092602e-05, + "loss": 1.7849, + "step": 11562 + }, + { + "epoch": 3.549109883364027, + "grad_norm": 0.29966798424720764, + "learning_rate": 7.478446050061947e-05, + "loss": 1.8299, + "step": 11563 + }, + { + "epoch": 3.549416820135052, + "grad_norm": 0.24068406224250793, + "learning_rate": 7.478014344704416e-05, + "loss": 1.8366, + "step": 11564 + }, + { + "epoch": 3.5497237569060776, + "grad_norm": 0.2559303641319275, + "learning_rate": 7.477582614857695e-05, + "loss": 1.7665, + "step": 11565 + }, + { + "epoch": 3.5500306936771024, + "grad_norm": 0.24617858231067657, + "learning_rate": 7.47715086052605e-05, + "loss": 1.8334, + "step": 11566 + }, + { + "epoch": 3.550337630448128, + "grad_norm": 0.2433501034975052, + "learning_rate": 7.476719081713749e-05, + "loss": 1.7963, + "step": 11567 + }, + { + "epoch": 3.5506445672191527, + "grad_norm": 0.2583518326282501, + "learning_rate": 7.476287278425057e-05, + "loss": 1.8311, 
+ "step": 11568 + }, + { + "epoch": 3.550951503990178, + "grad_norm": 0.3232485055923462, + "learning_rate": 7.475855450664244e-05, + "loss": 1.9162, + "step": 11569 + }, + { + "epoch": 3.5512584407612033, + "grad_norm": 0.28247153759002686, + "learning_rate": 7.475423598435576e-05, + "loss": 1.8027, + "step": 11570 + }, + { + "epoch": 3.5515653775322282, + "grad_norm": 0.27201834321022034, + "learning_rate": 7.47499172174332e-05, + "loss": 1.7822, + "step": 11571 + }, + { + "epoch": 3.5518723143032536, + "grad_norm": 0.2408471554517746, + "learning_rate": 7.474559820591748e-05, + "loss": 1.7735, + "step": 11572 + }, + { + "epoch": 3.5521792510742785, + "grad_norm": 0.24187393486499786, + "learning_rate": 7.474127894985124e-05, + "loss": 1.7931, + "step": 11573 + }, + { + "epoch": 3.552486187845304, + "grad_norm": 0.2759699523448944, + "learning_rate": 7.473695944927717e-05, + "loss": 1.8407, + "step": 11574 + }, + { + "epoch": 3.552793124616329, + "grad_norm": 0.2503111958503723, + "learning_rate": 7.473263970423797e-05, + "loss": 1.7613, + "step": 11575 + }, + { + "epoch": 3.5531000613873545, + "grad_norm": 0.24795177578926086, + "learning_rate": 7.472831971477633e-05, + "loss": 1.8221, + "step": 11576 + }, + { + "epoch": 3.5534069981583793, + "grad_norm": 0.23190177977085114, + "learning_rate": 7.472399948093494e-05, + "loss": 1.7541, + "step": 11577 + }, + { + "epoch": 3.5537139349294047, + "grad_norm": 0.24650825560092926, + "learning_rate": 7.471967900275653e-05, + "loss": 1.8002, + "step": 11578 + }, + { + "epoch": 3.5540208717004296, + "grad_norm": 0.256598562002182, + "learning_rate": 7.471535828028372e-05, + "loss": 1.7052, + "step": 11579 + }, + { + "epoch": 3.554327808471455, + "grad_norm": 0.2715381681919098, + "learning_rate": 7.471103731355926e-05, + "loss": 1.7701, + "step": 11580 + }, + { + "epoch": 3.5546347452424802, + "grad_norm": 0.29806044697761536, + "learning_rate": 7.470671610262586e-05, + "loss": 1.7614, + "step": 11581 + }, + { + "epoch": 
3.554941682013505, + "grad_norm": 0.26364314556121826, + "learning_rate": 7.470239464752621e-05, + "loss": 1.7957, + "step": 11582 + }, + { + "epoch": 3.5552486187845305, + "grad_norm": 0.29270800948143005, + "learning_rate": 7.4698072948303e-05, + "loss": 1.8263, + "step": 11583 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.25941839814186096, + "learning_rate": 7.469375100499898e-05, + "loss": 1.8517, + "step": 11584 + }, + { + "epoch": 3.5558624923265807, + "grad_norm": 0.29509237408638, + "learning_rate": 7.468942881765681e-05, + "loss": 1.8643, + "step": 11585 + }, + { + "epoch": 3.556169429097606, + "grad_norm": 0.23090367019176483, + "learning_rate": 7.468510638631926e-05, + "loss": 1.7239, + "step": 11586 + }, + { + "epoch": 3.556476365868631, + "grad_norm": 0.2696724236011505, + "learning_rate": 7.468078371102901e-05, + "loss": 1.848, + "step": 11587 + }, + { + "epoch": 3.5567833026396563, + "grad_norm": 0.2691192626953125, + "learning_rate": 7.46764607918288e-05, + "loss": 1.8194, + "step": 11588 + }, + { + "epoch": 3.557090239410681, + "grad_norm": 0.26616501808166504, + "learning_rate": 7.467213762876131e-05, + "loss": 1.8382, + "step": 11589 + }, + { + "epoch": 3.5573971761817065, + "grad_norm": 0.30629831552505493, + "learning_rate": 7.466781422186933e-05, + "loss": 1.8417, + "step": 11590 + }, + { + "epoch": 3.557704112952732, + "grad_norm": 0.27212417125701904, + "learning_rate": 7.466349057119552e-05, + "loss": 1.7612, + "step": 11591 + }, + { + "epoch": 3.558011049723757, + "grad_norm": 0.2872084379196167, + "learning_rate": 7.465916667678266e-05, + "loss": 1.7998, + "step": 11592 + }, + { + "epoch": 3.558317986494782, + "grad_norm": 0.3017117977142334, + "learning_rate": 7.465484253867348e-05, + "loss": 1.7996, + "step": 11593 + }, + { + "epoch": 3.5586249232658074, + "grad_norm": 0.2707957327365875, + "learning_rate": 7.465051815691066e-05, + "loss": 1.7678, + "step": 11594 + }, + { + "epoch": 3.5589318600368323, + "grad_norm": 
0.28932711482048035, + "learning_rate": 7.464619353153702e-05, + "loss": 1.8576, + "step": 11595 + }, + { + "epoch": 3.5592387968078576, + "grad_norm": 0.2585125267505646, + "learning_rate": 7.464186866259519e-05, + "loss": 1.8678, + "step": 11596 + }, + { + "epoch": 3.559545733578883, + "grad_norm": 0.24386851489543915, + "learning_rate": 7.4637543550128e-05, + "loss": 1.7778, + "step": 11597 + }, + { + "epoch": 3.559852670349908, + "grad_norm": 0.2375860959291458, + "learning_rate": 7.463321819417817e-05, + "loss": 1.8096, + "step": 11598 + }, + { + "epoch": 3.560159607120933, + "grad_norm": 0.2341299206018448, + "learning_rate": 7.462889259478842e-05, + "loss": 1.7191, + "step": 11599 + }, + { + "epoch": 3.560466543891958, + "grad_norm": 0.2510595917701721, + "learning_rate": 7.462456675200154e-05, + "loss": 1.7763, + "step": 11600 + }, + { + "epoch": 3.5607734806629834, + "grad_norm": 0.2554674744606018, + "learning_rate": 7.462024066586025e-05, + "loss": 1.7578, + "step": 11601 + }, + { + "epoch": 3.5610804174340087, + "grad_norm": 0.25040730834007263, + "learning_rate": 7.46159143364073e-05, + "loss": 1.8194, + "step": 11602 + }, + { + "epoch": 3.5613873542050336, + "grad_norm": 0.24294932186603546, + "learning_rate": 7.461158776368547e-05, + "loss": 1.8063, + "step": 11603 + }, + { + "epoch": 3.561694290976059, + "grad_norm": 0.2388325333595276, + "learning_rate": 7.46072609477375e-05, + "loss": 1.7942, + "step": 11604 + }, + { + "epoch": 3.562001227747084, + "grad_norm": 0.2569502890110016, + "learning_rate": 7.460293388860615e-05, + "loss": 1.7824, + "step": 11605 + }, + { + "epoch": 3.562308164518109, + "grad_norm": 0.24004346132278442, + "learning_rate": 7.45986065863342e-05, + "loss": 1.8676, + "step": 11606 + }, + { + "epoch": 3.5626151012891345, + "grad_norm": 0.25446319580078125, + "learning_rate": 7.45942790409644e-05, + "loss": 1.7726, + "step": 11607 + }, + { + "epoch": 3.56292203806016, + "grad_norm": 0.26257482171058655, + "learning_rate": 
7.458995125253951e-05, + "loss": 1.779, + "step": 11608 + }, + { + "epoch": 3.5632289748311847, + "grad_norm": 0.27703070640563965, + "learning_rate": 7.458562322110231e-05, + "loss": 1.8247, + "step": 11609 + }, + { + "epoch": 3.56353591160221, + "grad_norm": 0.25478535890579224, + "learning_rate": 7.458129494669556e-05, + "loss": 1.7794, + "step": 11610 + }, + { + "epoch": 3.563842848373235, + "grad_norm": 0.26173365116119385, + "learning_rate": 7.457696642936207e-05, + "loss": 1.758, + "step": 11611 + }, + { + "epoch": 3.5641497851442603, + "grad_norm": 0.25077274441719055, + "learning_rate": 7.45726376691446e-05, + "loss": 1.8234, + "step": 11612 + }, + { + "epoch": 3.5644567219152856, + "grad_norm": 0.2591109275817871, + "learning_rate": 7.456830866608589e-05, + "loss": 1.7723, + "step": 11613 + }, + { + "epoch": 3.5647636586863105, + "grad_norm": 0.2653447091579437, + "learning_rate": 7.456397942022877e-05, + "loss": 1.7839, + "step": 11614 + }, + { + "epoch": 3.565070595457336, + "grad_norm": 0.3203454911708832, + "learning_rate": 7.455964993161601e-05, + "loss": 1.8548, + "step": 11615 + }, + { + "epoch": 3.5653775322283607, + "grad_norm": 0.3041793704032898, + "learning_rate": 7.455532020029039e-05, + "loss": 1.7925, + "step": 11616 + }, + { + "epoch": 3.565684468999386, + "grad_norm": 0.26066139340400696, + "learning_rate": 7.45509902262947e-05, + "loss": 1.7905, + "step": 11617 + }, + { + "epoch": 3.5659914057704114, + "grad_norm": 0.2483314871788025, + "learning_rate": 7.454666000967174e-05, + "loss": 1.7658, + "step": 11618 + }, + { + "epoch": 3.5662983425414367, + "grad_norm": 0.24285900592803955, + "learning_rate": 7.45423295504643e-05, + "loss": 1.7575, + "step": 11619 + }, + { + "epoch": 3.5666052793124616, + "grad_norm": 0.27231669425964355, + "learning_rate": 7.453799884871517e-05, + "loss": 1.8389, + "step": 11620 + }, + { + "epoch": 3.566912216083487, + "grad_norm": 0.24324406683444977, + "learning_rate": 7.453366790446717e-05, + "loss": 
1.7775, + "step": 11621 + }, + { + "epoch": 3.567219152854512, + "grad_norm": 0.2724440097808838, + "learning_rate": 7.452933671776305e-05, + "loss": 1.8135, + "step": 11622 + }, + { + "epoch": 3.567526089625537, + "grad_norm": 0.22207655012607574, + "learning_rate": 7.452500528864568e-05, + "loss": 1.722, + "step": 11623 + }, + { + "epoch": 3.5678330263965625, + "grad_norm": 0.25650298595428467, + "learning_rate": 7.452067361715782e-05, + "loss": 1.7813, + "step": 11624 + }, + { + "epoch": 3.5681399631675874, + "grad_norm": 0.2582200765609741, + "learning_rate": 7.45163417033423e-05, + "loss": 1.8253, + "step": 11625 + }, + { + "epoch": 3.5684468999386127, + "grad_norm": 0.29545384645462036, + "learning_rate": 7.451200954724188e-05, + "loss": 1.8108, + "step": 11626 + }, + { + "epoch": 3.5687538367096376, + "grad_norm": 0.30457428097724915, + "learning_rate": 7.450767714889946e-05, + "loss": 1.8257, + "step": 11627 + }, + { + "epoch": 3.569060773480663, + "grad_norm": 0.2955166697502136, + "learning_rate": 7.450334450835781e-05, + "loss": 1.8172, + "step": 11628 + }, + { + "epoch": 3.5693677102516883, + "grad_norm": 0.2793857753276825, + "learning_rate": 7.449901162565974e-05, + "loss": 1.8493, + "step": 11629 + }, + { + "epoch": 3.569674647022713, + "grad_norm": 0.27154335379600525, + "learning_rate": 7.449467850084808e-05, + "loss": 1.8306, + "step": 11630 + }, + { + "epoch": 3.5699815837937385, + "grad_norm": 0.22336189448833466, + "learning_rate": 7.449034513396564e-05, + "loss": 1.7435, + "step": 11631 + }, + { + "epoch": 3.5702885205647634, + "grad_norm": 0.22799183428287506, + "learning_rate": 7.448601152505526e-05, + "loss": 1.7818, + "step": 11632 + }, + { + "epoch": 3.5705954573357888, + "grad_norm": 0.26670658588409424, + "learning_rate": 7.448167767415976e-05, + "loss": 1.7777, + "step": 11633 + }, + { + "epoch": 3.570902394106814, + "grad_norm": 0.2848666310310364, + "learning_rate": 7.447734358132196e-05, + "loss": 1.7572, + "step": 11634 + }, + { + 
"epoch": 3.5712093308778394, + "grad_norm": 0.26843544840812683, + "learning_rate": 7.447300924658473e-05, + "loss": 1.7642, + "step": 11635 + }, + { + "epoch": 3.5715162676488643, + "grad_norm": 0.24666404724121094, + "learning_rate": 7.446867466999087e-05, + "loss": 1.7533, + "step": 11636 + }, + { + "epoch": 3.5718232044198897, + "grad_norm": 0.31111210584640503, + "learning_rate": 7.44643398515832e-05, + "loss": 1.7875, + "step": 11637 + }, + { + "epoch": 3.5721301411909145, + "grad_norm": 0.3157108724117279, + "learning_rate": 7.446000479140462e-05, + "loss": 1.7879, + "step": 11638 + }, + { + "epoch": 3.57243707796194, + "grad_norm": 0.2935558259487152, + "learning_rate": 7.445566948949792e-05, + "loss": 1.7819, + "step": 11639 + }, + { + "epoch": 3.572744014732965, + "grad_norm": 0.2265472710132599, + "learning_rate": 7.445133394590597e-05, + "loss": 1.7518, + "step": 11640 + }, + { + "epoch": 3.57305095150399, + "grad_norm": 0.2564176023006439, + "learning_rate": 7.444699816067159e-05, + "loss": 1.7281, + "step": 11641 + }, + { + "epoch": 3.5733578882750154, + "grad_norm": 0.27933555841445923, + "learning_rate": 7.444266213383766e-05, + "loss": 1.7852, + "step": 11642 + }, + { + "epoch": 3.5736648250460403, + "grad_norm": 0.29105356335639954, + "learning_rate": 7.4438325865447e-05, + "loss": 1.8056, + "step": 11643 + }, + { + "epoch": 3.5739717618170657, + "grad_norm": 0.27665549516677856, + "learning_rate": 7.443398935554249e-05, + "loss": 1.7249, + "step": 11644 + }, + { + "epoch": 3.574278698588091, + "grad_norm": 0.21899232268333435, + "learning_rate": 7.442965260416698e-05, + "loss": 1.7689, + "step": 11645 + }, + { + "epoch": 3.574585635359116, + "grad_norm": 0.3250672221183777, + "learning_rate": 7.442531561136333e-05, + "loss": 1.8058, + "step": 11646 + }, + { + "epoch": 3.574892572130141, + "grad_norm": 0.42442524433135986, + "learning_rate": 7.442097837717438e-05, + "loss": 1.7887, + "step": 11647 + }, + { + "epoch": 3.575199508901166, + 
"grad_norm": 0.33108964562416077, + "learning_rate": 7.441664090164302e-05, + "loss": 1.7628, + "step": 11648 + }, + { + "epoch": 3.5755064456721914, + "grad_norm": 0.23050357401371002, + "learning_rate": 7.44123031848121e-05, + "loss": 1.8121, + "step": 11649 + }, + { + "epoch": 3.575813382443217, + "grad_norm": 0.29251593351364136, + "learning_rate": 7.440796522672448e-05, + "loss": 1.8051, + "step": 11650 + }, + { + "epoch": 3.576120319214242, + "grad_norm": 0.3764750063419342, + "learning_rate": 7.440362702742305e-05, + "loss": 1.9002, + "step": 11651 + }, + { + "epoch": 3.576427255985267, + "grad_norm": 0.3751949071884155, + "learning_rate": 7.439928858695069e-05, + "loss": 1.821, + "step": 11652 + }, + { + "epoch": 3.5767341927562923, + "grad_norm": 0.268476665019989, + "learning_rate": 7.439494990535024e-05, + "loss": 1.8241, + "step": 11653 + }, + { + "epoch": 3.5770411295273172, + "grad_norm": 0.3072795271873474, + "learning_rate": 7.439061098266459e-05, + "loss": 1.8169, + "step": 11654 + }, + { + "epoch": 3.5773480662983426, + "grad_norm": 0.4948901832103729, + "learning_rate": 7.438627181893664e-05, + "loss": 1.7706, + "step": 11655 + }, + { + "epoch": 3.577655003069368, + "grad_norm": 0.5892601013183594, + "learning_rate": 7.438193241420926e-05, + "loss": 1.7631, + "step": 11656 + }, + { + "epoch": 3.577961939840393, + "grad_norm": 0.4599401652812958, + "learning_rate": 7.437759276852533e-05, + "loss": 1.7471, + "step": 11657 + }, + { + "epoch": 3.578268876611418, + "grad_norm": 0.2545170783996582, + "learning_rate": 7.437325288192773e-05, + "loss": 1.7945, + "step": 11658 + }, + { + "epoch": 3.578575813382443, + "grad_norm": 0.3136496841907501, + "learning_rate": 7.436891275445938e-05, + "loss": 1.828, + "step": 11659 + }, + { + "epoch": 3.5788827501534684, + "grad_norm": 0.3631688058376312, + "learning_rate": 7.436457238616313e-05, + "loss": 1.8302, + "step": 11660 + }, + { + "epoch": 3.5791896869244937, + "grad_norm": 0.3097386658191681, + 
"learning_rate": 7.436023177708192e-05, + "loss": 1.8397, + "step": 11661 + }, + { + "epoch": 3.5794966236955186, + "grad_norm": 0.20948798954486847, + "learning_rate": 7.43558909272586e-05, + "loss": 1.7844, + "step": 11662 + }, + { + "epoch": 3.579803560466544, + "grad_norm": 0.24327392876148224, + "learning_rate": 7.43515498367361e-05, + "loss": 1.7827, + "step": 11663 + }, + { + "epoch": 3.580110497237569, + "grad_norm": 0.25268325209617615, + "learning_rate": 7.434720850555731e-05, + "loss": 1.8224, + "step": 11664 + }, + { + "epoch": 3.580417434008594, + "grad_norm": 0.24883607029914856, + "learning_rate": 7.434286693376513e-05, + "loss": 1.8189, + "step": 11665 + }, + { + "epoch": 3.5807243707796195, + "grad_norm": 0.2942518889904022, + "learning_rate": 7.433852512140248e-05, + "loss": 1.8325, + "step": 11666 + }, + { + "epoch": 3.581031307550645, + "grad_norm": 0.3556186556816101, + "learning_rate": 7.433418306851225e-05, + "loss": 1.7511, + "step": 11667 + }, + { + "epoch": 3.5813382443216697, + "grad_norm": 0.421220600605011, + "learning_rate": 7.432984077513738e-05, + "loss": 1.8081, + "step": 11668 + }, + { + "epoch": 3.581645181092695, + "grad_norm": 0.3338243067264557, + "learning_rate": 7.432549824132074e-05, + "loss": 1.8274, + "step": 11669 + }, + { + "epoch": 3.58195211786372, + "grad_norm": 0.25091543793678284, + "learning_rate": 7.432115546710528e-05, + "loss": 1.7637, + "step": 11670 + }, + { + "epoch": 3.5822590546347453, + "grad_norm": 0.29870370030403137, + "learning_rate": 7.431681245253389e-05, + "loss": 1.8036, + "step": 11671 + }, + { + "epoch": 3.5825659914057706, + "grad_norm": 0.2682137191295624, + "learning_rate": 7.431246919764953e-05, + "loss": 1.8252, + "step": 11672 + }, + { + "epoch": 3.5828729281767955, + "grad_norm": 0.28790801763534546, + "learning_rate": 7.430812570249508e-05, + "loss": 1.7713, + "step": 11673 + }, + { + "epoch": 3.583179864947821, + "grad_norm": 0.26357609033584595, + "learning_rate": 7.43037819671135e-05, 
+ "loss": 1.8388, + "step": 11674 + }, + { + "epoch": 3.5834868017188457, + "grad_norm": 0.2505483031272888, + "learning_rate": 7.42994379915477e-05, + "loss": 1.7722, + "step": 11675 + }, + { + "epoch": 3.583793738489871, + "grad_norm": 0.2535844147205353, + "learning_rate": 7.42950937758406e-05, + "loss": 1.756, + "step": 11676 + }, + { + "epoch": 3.5841006752608964, + "grad_norm": 0.23045027256011963, + "learning_rate": 7.429074932003515e-05, + "loss": 1.791, + "step": 11677 + }, + { + "epoch": 3.5844076120319213, + "grad_norm": 0.22525762021541595, + "learning_rate": 7.428640462417428e-05, + "loss": 1.7234, + "step": 11678 + }, + { + "epoch": 3.5847145488029466, + "grad_norm": 0.2402270883321762, + "learning_rate": 7.428205968830094e-05, + "loss": 1.845, + "step": 11679 + }, + { + "epoch": 3.5850214855739715, + "grad_norm": 0.24909646809101105, + "learning_rate": 7.427771451245802e-05, + "loss": 1.8537, + "step": 11680 + }, + { + "epoch": 3.585328422344997, + "grad_norm": 0.25813063979148865, + "learning_rate": 7.427336909668853e-05, + "loss": 1.7353, + "step": 11681 + }, + { + "epoch": 3.585635359116022, + "grad_norm": 0.26073768734931946, + "learning_rate": 7.426902344103534e-05, + "loss": 1.8142, + "step": 11682 + }, + { + "epoch": 3.5859422958870475, + "grad_norm": 0.2498280256986618, + "learning_rate": 7.426467754554147e-05, + "loss": 1.7996, + "step": 11683 + }, + { + "epoch": 3.5862492326580724, + "grad_norm": 0.3131188154220581, + "learning_rate": 7.426033141024981e-05, + "loss": 1.7793, + "step": 11684 + }, + { + "epoch": 3.5865561694290977, + "grad_norm": 0.24118199944496155, + "learning_rate": 7.425598503520337e-05, + "loss": 1.8249, + "step": 11685 + }, + { + "epoch": 3.5868631062001226, + "grad_norm": 0.2791197597980499, + "learning_rate": 7.425163842044504e-05, + "loss": 1.7966, + "step": 11686 + }, + { + "epoch": 3.587170042971148, + "grad_norm": 0.2298576384782791, + "learning_rate": 7.424729156601781e-05, + "loss": 1.7224, + "step": 11687 + }, 
+ { + "epoch": 3.5874769797421733, + "grad_norm": 0.23113438487052917, + "learning_rate": 7.424294447196462e-05, + "loss": 1.7641, + "step": 11688 + }, + { + "epoch": 3.587783916513198, + "grad_norm": 0.3064495027065277, + "learning_rate": 7.423859713832847e-05, + "loss": 1.8688, + "step": 11689 + }, + { + "epoch": 3.5880908532842235, + "grad_norm": 0.22847676277160645, + "learning_rate": 7.423424956515228e-05, + "loss": 1.7513, + "step": 11690 + }, + { + "epoch": 3.5883977900552484, + "grad_norm": 0.2797350585460663, + "learning_rate": 7.422990175247905e-05, + "loss": 1.8268, + "step": 11691 + }, + { + "epoch": 3.5887047268262737, + "grad_norm": 0.2753821313381195, + "learning_rate": 7.422555370035171e-05, + "loss": 1.7313, + "step": 11692 + }, + { + "epoch": 3.589011663597299, + "grad_norm": 0.2981179654598236, + "learning_rate": 7.422120540881326e-05, + "loss": 1.8455, + "step": 11693 + }, + { + "epoch": 3.5893186003683244, + "grad_norm": 0.33028867840766907, + "learning_rate": 7.421685687790667e-05, + "loss": 1.8397, + "step": 11694 + }, + { + "epoch": 3.5896255371393493, + "grad_norm": 0.409173846244812, + "learning_rate": 7.421250810767487e-05, + "loss": 1.8088, + "step": 11695 + }, + { + "epoch": 3.5899324739103746, + "grad_norm": 0.4118194878101349, + "learning_rate": 7.42081590981609e-05, + "loss": 1.7719, + "step": 11696 + }, + { + "epoch": 3.5902394106813995, + "grad_norm": 0.34716179966926575, + "learning_rate": 7.420380984940773e-05, + "loss": 1.8063, + "step": 11697 + }, + { + "epoch": 3.590546347452425, + "grad_norm": 0.27763083577156067, + "learning_rate": 7.419946036145829e-05, + "loss": 1.7777, + "step": 11698 + }, + { + "epoch": 3.59085328422345, + "grad_norm": 0.3175280690193176, + "learning_rate": 7.419511063435562e-05, + "loss": 1.697, + "step": 11699 + }, + { + "epoch": 3.591160220994475, + "grad_norm": 0.3151503801345825, + "learning_rate": 7.419076066814268e-05, + "loss": 1.8067, + "step": 11700 + }, + { + "epoch": 3.5914671577655004, + 
"grad_norm": 0.26914867758750916, + "learning_rate": 7.418641046286245e-05, + "loss": 1.7797, + "step": 11701 + }, + { + "epoch": 3.5917740945365253, + "grad_norm": 0.27231964468955994, + "learning_rate": 7.418206001855797e-05, + "loss": 1.7931, + "step": 11702 + }, + { + "epoch": 3.5920810313075506, + "grad_norm": 0.3352177143096924, + "learning_rate": 7.417770933527217e-05, + "loss": 1.9187, + "step": 11703 + }, + { + "epoch": 3.592387968078576, + "grad_norm": 0.3510081470012665, + "learning_rate": 7.417335841304808e-05, + "loss": 1.7889, + "step": 11704 + }, + { + "epoch": 3.592694904849601, + "grad_norm": 0.24949313700199127, + "learning_rate": 7.41690072519287e-05, + "loss": 1.7683, + "step": 11705 + }, + { + "epoch": 3.593001841620626, + "grad_norm": 0.28442221879959106, + "learning_rate": 7.416465585195702e-05, + "loss": 1.7889, + "step": 11706 + }, + { + "epoch": 3.593308778391651, + "grad_norm": 0.3355824649333954, + "learning_rate": 7.416030421317605e-05, + "loss": 1.7637, + "step": 11707 + }, + { + "epoch": 3.5936157151626764, + "grad_norm": 0.33569446206092834, + "learning_rate": 7.415595233562878e-05, + "loss": 1.919, + "step": 11708 + }, + { + "epoch": 3.5939226519337018, + "grad_norm": 0.2488354742527008, + "learning_rate": 7.415160021935825e-05, + "loss": 1.8424, + "step": 11709 + }, + { + "epoch": 3.594229588704727, + "grad_norm": 0.2701130509376526, + "learning_rate": 7.414724786440746e-05, + "loss": 1.7586, + "step": 11710 + }, + { + "epoch": 3.594536525475752, + "grad_norm": 0.26289790868759155, + "learning_rate": 7.414289527081939e-05, + "loss": 1.7975, + "step": 11711 + }, + { + "epoch": 3.5948434622467773, + "grad_norm": 0.25382301211357117, + "learning_rate": 7.413854243863707e-05, + "loss": 1.7393, + "step": 11712 + }, + { + "epoch": 3.595150399017802, + "grad_norm": 0.28282979130744934, + "learning_rate": 7.413418936790357e-05, + "loss": 1.8048, + "step": 11713 + }, + { + "epoch": 3.5954573357888275, + "grad_norm": 0.28001347184181213, + 
"learning_rate": 7.412983605866183e-05, + "loss": 1.7864, + "step": 11714 + }, + { + "epoch": 3.595764272559853, + "grad_norm": 0.26107707619667053, + "learning_rate": 7.412548251095491e-05, + "loss": 1.8016, + "step": 11715 + }, + { + "epoch": 3.5960712093308778, + "grad_norm": 0.2518761456012726, + "learning_rate": 7.412112872482583e-05, + "loss": 1.7565, + "step": 11716 + }, + { + "epoch": 3.596378146101903, + "grad_norm": 0.25911152362823486, + "learning_rate": 7.411677470031762e-05, + "loss": 1.8333, + "step": 11717 + }, + { + "epoch": 3.596685082872928, + "grad_norm": 0.3411506414413452, + "learning_rate": 7.41124204374733e-05, + "loss": 1.8027, + "step": 11718 + }, + { + "epoch": 3.5969920196439533, + "grad_norm": 0.28535547852516174, + "learning_rate": 7.410806593633593e-05, + "loss": 1.7596, + "step": 11719 + }, + { + "epoch": 3.5972989564149787, + "grad_norm": 0.24665530025959015, + "learning_rate": 7.410371119694852e-05, + "loss": 1.7777, + "step": 11720 + }, + { + "epoch": 3.5976058931860035, + "grad_norm": 0.29162275791168213, + "learning_rate": 7.40993562193541e-05, + "loss": 1.795, + "step": 11721 + }, + { + "epoch": 3.597912829957029, + "grad_norm": 0.2712220549583435, + "learning_rate": 7.409500100359573e-05, + "loss": 1.824, + "step": 11722 + }, + { + "epoch": 3.5982197667280538, + "grad_norm": 0.239755779504776, + "learning_rate": 7.40906455497164e-05, + "loss": 1.7534, + "step": 11723 + }, + { + "epoch": 3.598526703499079, + "grad_norm": 0.26056957244873047, + "learning_rate": 7.408628985775922e-05, + "loss": 1.757, + "step": 11724 + }, + { + "epoch": 3.5988336402701044, + "grad_norm": 0.3230258822441101, + "learning_rate": 7.40819339277672e-05, + "loss": 1.8684, + "step": 11725 + }, + { + "epoch": 3.5991405770411298, + "grad_norm": 0.26070696115493774, + "learning_rate": 7.407757775978339e-05, + "loss": 1.7868, + "step": 11726 + }, + { + "epoch": 3.5994475138121547, + "grad_norm": 0.24940893054008484, + "learning_rate": 7.407322135385085e-05, + 
"loss": 1.8391, + "step": 11727 + }, + { + "epoch": 3.59975445058318, + "grad_norm": 0.2717723250389099, + "learning_rate": 7.406886471001263e-05, + "loss": 1.7567, + "step": 11728 + }, + { + "epoch": 3.600061387354205, + "grad_norm": 0.2328445315361023, + "learning_rate": 7.406450782831177e-05, + "loss": 1.7761, + "step": 11729 + }, + { + "epoch": 3.6003683241252302, + "grad_norm": 0.2740287184715271, + "learning_rate": 7.406015070879136e-05, + "loss": 1.8599, + "step": 11730 + }, + { + "epoch": 3.6006752608962556, + "grad_norm": 0.2930558919906616, + "learning_rate": 7.405579335149441e-05, + "loss": 1.852, + "step": 11731 + }, + { + "epoch": 3.6009821976672804, + "grad_norm": 0.30175161361694336, + "learning_rate": 7.405143575646403e-05, + "loss": 1.8861, + "step": 11732 + }, + { + "epoch": 3.601289134438306, + "grad_norm": 0.2617531418800354, + "learning_rate": 7.404707792374328e-05, + "loss": 1.7598, + "step": 11733 + }, + { + "epoch": 3.6015960712093307, + "grad_norm": 0.25384122133255005, + "learning_rate": 7.404271985337517e-05, + "loss": 1.7634, + "step": 11734 + }, + { + "epoch": 3.601903007980356, + "grad_norm": 0.31706711649894714, + "learning_rate": 7.403836154540284e-05, + "loss": 1.8125, + "step": 11735 + }, + { + "epoch": 3.6022099447513813, + "grad_norm": 0.299662709236145, + "learning_rate": 7.403400299986932e-05, + "loss": 1.748, + "step": 11736 + }, + { + "epoch": 3.6025168815224062, + "grad_norm": 0.23828944563865662, + "learning_rate": 7.40296442168177e-05, + "loss": 1.7473, + "step": 11737 + }, + { + "epoch": 3.6028238182934316, + "grad_norm": 0.22611604630947113, + "learning_rate": 7.402528519629106e-05, + "loss": 1.7519, + "step": 11738 + }, + { + "epoch": 3.6031307550644565, + "grad_norm": 0.28498536348342896, + "learning_rate": 7.402092593833246e-05, + "loss": 1.7792, + "step": 11739 + }, + { + "epoch": 3.603437691835482, + "grad_norm": 0.2404283881187439, + "learning_rate": 7.4016566442985e-05, + "loss": 1.7434, + "step": 11740 + }, + { + 
"epoch": 3.603744628606507, + "grad_norm": 0.2291589230298996, + "learning_rate": 7.401220671029173e-05, + "loss": 1.7623, + "step": 11741 + }, + { + "epoch": 3.6040515653775325, + "grad_norm": 0.23962698876857758, + "learning_rate": 7.400784674029578e-05, + "loss": 1.7232, + "step": 11742 + }, + { + "epoch": 3.6043585021485574, + "grad_norm": 0.3015185594558716, + "learning_rate": 7.400348653304022e-05, + "loss": 1.7808, + "step": 11743 + }, + { + "epoch": 3.6046654389195827, + "grad_norm": 0.30623099207878113, + "learning_rate": 7.399912608856813e-05, + "loss": 1.8518, + "step": 11744 + }, + { + "epoch": 3.6049723756906076, + "grad_norm": 0.2698235511779785, + "learning_rate": 7.39947654069226e-05, + "loss": 1.7829, + "step": 11745 + }, + { + "epoch": 3.605279312461633, + "grad_norm": 0.2195274829864502, + "learning_rate": 7.399040448814674e-05, + "loss": 1.7709, + "step": 11746 + }, + { + "epoch": 3.6055862492326582, + "grad_norm": 0.22962357103824615, + "learning_rate": 7.398604333228366e-05, + "loss": 1.7482, + "step": 11747 + }, + { + "epoch": 3.605893186003683, + "grad_norm": 0.2403932511806488, + "learning_rate": 7.398168193937642e-05, + "loss": 1.8063, + "step": 11748 + }, + { + "epoch": 3.6062001227747085, + "grad_norm": 0.23542718589305878, + "learning_rate": 7.397732030946816e-05, + "loss": 1.7599, + "step": 11749 + }, + { + "epoch": 3.6065070595457334, + "grad_norm": 0.2462490350008011, + "learning_rate": 7.397295844260195e-05, + "loss": 1.8183, + "step": 11750 + }, + { + "epoch": 3.6068139963167587, + "grad_norm": 0.21428349614143372, + "learning_rate": 7.396859633882091e-05, + "loss": 1.6944, + "step": 11751 + }, + { + "epoch": 3.607120933087784, + "grad_norm": 0.21240907907485962, + "learning_rate": 7.396423399816817e-05, + "loss": 1.7795, + "step": 11752 + }, + { + "epoch": 3.607427869858809, + "grad_norm": 0.23413677513599396, + "learning_rate": 7.395987142068682e-05, + "loss": 1.8015, + "step": 11753 + }, + { + "epoch": 3.6077348066298343, + 
"grad_norm": 0.26724907755851746, + "learning_rate": 7.395550860641998e-05, + "loss": 1.8174, + "step": 11754 + }, + { + "epoch": 3.608041743400859, + "grad_norm": 0.22077679634094238, + "learning_rate": 7.395114555541077e-05, + "loss": 1.7929, + "step": 11755 + }, + { + "epoch": 3.6083486801718845, + "grad_norm": 0.2475263774394989, + "learning_rate": 7.394678226770228e-05, + "loss": 1.7744, + "step": 11756 + }, + { + "epoch": 3.60865561694291, + "grad_norm": 0.22579342126846313, + "learning_rate": 7.394241874333764e-05, + "loss": 1.79, + "step": 11757 + }, + { + "epoch": 3.608962553713935, + "grad_norm": 0.26798152923583984, + "learning_rate": 7.393805498236001e-05, + "loss": 1.8087, + "step": 11758 + }, + { + "epoch": 3.60926949048496, + "grad_norm": 0.2755621373653412, + "learning_rate": 7.393369098481248e-05, + "loss": 1.7834, + "step": 11759 + }, + { + "epoch": 3.6095764272559854, + "grad_norm": 0.2741812467575073, + "learning_rate": 7.39293267507382e-05, + "loss": 1.7948, + "step": 11760 + }, + { + "epoch": 3.6098833640270103, + "grad_norm": 0.2378924936056137, + "learning_rate": 7.392496228018028e-05, + "loss": 1.8317, + "step": 11761 + }, + { + "epoch": 3.6101903007980356, + "grad_norm": 0.2628132700920105, + "learning_rate": 7.392059757318187e-05, + "loss": 1.8123, + "step": 11762 + }, + { + "epoch": 3.610497237569061, + "grad_norm": 0.2613002359867096, + "learning_rate": 7.391623262978607e-05, + "loss": 1.795, + "step": 11763 + }, + { + "epoch": 3.610804174340086, + "grad_norm": 0.27272161841392517, + "learning_rate": 7.391186745003608e-05, + "loss": 1.7808, + "step": 11764 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.21366162598133087, + "learning_rate": 7.390750203397497e-05, + "loss": 1.77, + "step": 11765 + }, + { + "epoch": 3.611418047882136, + "grad_norm": 0.25559261441230774, + "learning_rate": 7.390313638164593e-05, + "loss": 1.8442, + "step": 11766 + }, + { + "epoch": 3.6117249846531614, + "grad_norm": 0.23794838786125183, + 
"learning_rate": 7.389877049309207e-05, + "loss": 1.8237, + "step": 11767 + }, + { + "epoch": 3.6120319214241867, + "grad_norm": 0.2690154016017914, + "learning_rate": 7.389440436835656e-05, + "loss": 1.8194, + "step": 11768 + }, + { + "epoch": 3.612338858195212, + "grad_norm": 0.26148009300231934, + "learning_rate": 7.389003800748254e-05, + "loss": 1.7862, + "step": 11769 + }, + { + "epoch": 3.612645794966237, + "grad_norm": 0.26414936780929565, + "learning_rate": 7.388567141051315e-05, + "loss": 1.7815, + "step": 11770 + }, + { + "epoch": 3.6129527317372623, + "grad_norm": 0.24473857879638672, + "learning_rate": 7.388130457749157e-05, + "loss": 1.801, + "step": 11771 + }, + { + "epoch": 3.613259668508287, + "grad_norm": 0.24356001615524292, + "learning_rate": 7.387693750846094e-05, + "loss": 1.8031, + "step": 11772 + }, + { + "epoch": 3.6135666052793125, + "grad_norm": 0.26716411113739014, + "learning_rate": 7.387257020346441e-05, + "loss": 1.7999, + "step": 11773 + }, + { + "epoch": 3.613873542050338, + "grad_norm": 0.2730760872364044, + "learning_rate": 7.386820266254516e-05, + "loss": 1.8079, + "step": 11774 + }, + { + "epoch": 3.6141804788213627, + "grad_norm": 0.2570728361606598, + "learning_rate": 7.386383488574635e-05, + "loss": 1.7374, + "step": 11775 + }, + { + "epoch": 3.614487415592388, + "grad_norm": 0.24992883205413818, + "learning_rate": 7.385946687311112e-05, + "loss": 1.8432, + "step": 11776 + }, + { + "epoch": 3.614794352363413, + "grad_norm": 0.28632259368896484, + "learning_rate": 7.385509862468266e-05, + "loss": 1.8014, + "step": 11777 + }, + { + "epoch": 3.6151012891344383, + "grad_norm": 0.257303923368454, + "learning_rate": 7.385073014050412e-05, + "loss": 1.8166, + "step": 11778 + }, + { + "epoch": 3.6154082259054636, + "grad_norm": 0.2791872024536133, + "learning_rate": 7.38463614206187e-05, + "loss": 1.7865, + "step": 11779 + }, + { + "epoch": 3.6157151626764885, + "grad_norm": 0.25708603858947754, + "learning_rate": 
7.384199246506956e-05, + "loss": 1.807, + "step": 11780 + }, + { + "epoch": 3.616022099447514, + "grad_norm": 0.28693172335624695, + "learning_rate": 7.383762327389988e-05, + "loss": 1.8049, + "step": 11781 + }, + { + "epoch": 3.6163290362185387, + "grad_norm": 0.2731167674064636, + "learning_rate": 7.383325384715283e-05, + "loss": 1.8937, + "step": 11782 + }, + { + "epoch": 3.616635972989564, + "grad_norm": 0.26151663064956665, + "learning_rate": 7.38288841848716e-05, + "loss": 1.8288, + "step": 11783 + }, + { + "epoch": 3.6169429097605894, + "grad_norm": 0.2732257843017578, + "learning_rate": 7.382451428709936e-05, + "loss": 1.7668, + "step": 11784 + }, + { + "epoch": 3.6172498465316147, + "grad_norm": 0.2747575640678406, + "learning_rate": 7.38201441538793e-05, + "loss": 1.7991, + "step": 11785 + }, + { + "epoch": 3.6175567833026396, + "grad_norm": 0.2884783446788788, + "learning_rate": 7.381577378525462e-05, + "loss": 1.7798, + "step": 11786 + }, + { + "epoch": 3.617863720073665, + "grad_norm": 0.2716344892978668, + "learning_rate": 7.381140318126851e-05, + "loss": 1.7923, + "step": 11787 + }, + { + "epoch": 3.61817065684469, + "grad_norm": 0.3007747232913971, + "learning_rate": 7.380703234196416e-05, + "loss": 1.8397, + "step": 11788 + }, + { + "epoch": 3.618477593615715, + "grad_norm": 0.39218056201934814, + "learning_rate": 7.380266126738476e-05, + "loss": 1.8517, + "step": 11789 + }, + { + "epoch": 3.6187845303867405, + "grad_norm": 0.43425866961479187, + "learning_rate": 7.379828995757351e-05, + "loss": 1.7518, + "step": 11790 + }, + { + "epoch": 3.6190914671577654, + "grad_norm": 0.34399518370628357, + "learning_rate": 7.37939184125736e-05, + "loss": 1.7607, + "step": 11791 + }, + { + "epoch": 3.6193984039287908, + "grad_norm": 0.23124302923679352, + "learning_rate": 7.378954663242825e-05, + "loss": 1.7898, + "step": 11792 + }, + { + "epoch": 3.6197053406998156, + "grad_norm": 0.32839757204055786, + "learning_rate": 7.378517461718066e-05, + "loss": 
1.7472, + "step": 11793 + }, + { + "epoch": 3.620012277470841, + "grad_norm": 0.38583460450172424, + "learning_rate": 7.378080236687403e-05, + "loss": 1.7947, + "step": 11794 + }, + { + "epoch": 3.6203192142418663, + "grad_norm": 0.4622896909713745, + "learning_rate": 7.377642988155157e-05, + "loss": 1.9023, + "step": 11795 + }, + { + "epoch": 3.620626151012891, + "grad_norm": 0.3783189058303833, + "learning_rate": 7.37720571612565e-05, + "loss": 1.7813, + "step": 11796 + }, + { + "epoch": 3.6209330877839165, + "grad_norm": 0.3468814790248871, + "learning_rate": 7.376768420603204e-05, + "loss": 1.7509, + "step": 11797 + }, + { + "epoch": 3.6212400245549414, + "grad_norm": 0.2602507174015045, + "learning_rate": 7.376331101592138e-05, + "loss": 1.8158, + "step": 11798 + }, + { + "epoch": 3.6215469613259668, + "grad_norm": 0.28337883949279785, + "learning_rate": 7.375893759096775e-05, + "loss": 1.7755, + "step": 11799 + }, + { + "epoch": 3.621853898096992, + "grad_norm": 0.3644609749317169, + "learning_rate": 7.375456393121437e-05, + "loss": 1.8193, + "step": 11800 + }, + { + "epoch": 3.6221608348680174, + "grad_norm": 0.338211327791214, + "learning_rate": 7.375019003670448e-05, + "loss": 1.821, + "step": 11801 + }, + { + "epoch": 3.6224677716390423, + "grad_norm": 0.23850654065608978, + "learning_rate": 7.374581590748129e-05, + "loss": 1.7317, + "step": 11802 + }, + { + "epoch": 3.6227747084100677, + "grad_norm": 0.3496716618537903, + "learning_rate": 7.374144154358801e-05, + "loss": 1.8361, + "step": 11803 + }, + { + "epoch": 3.6230816451810925, + "grad_norm": 0.5585216283798218, + "learning_rate": 7.37370669450679e-05, + "loss": 1.7667, + "step": 11804 + }, + { + "epoch": 3.623388581952118, + "grad_norm": 0.4578089714050293, + "learning_rate": 7.373269211196418e-05, + "loss": 1.8051, + "step": 11805 + }, + { + "epoch": 3.623695518723143, + "grad_norm": 0.28195759654045105, + "learning_rate": 7.37283170443201e-05, + "loss": 1.7823, + "step": 11806 + }, + { + 
"epoch": 3.624002455494168, + "grad_norm": 0.4066108465194702, + "learning_rate": 7.372394174217887e-05, + "loss": 1.7819, + "step": 11807 + }, + { + "epoch": 3.6243093922651934, + "grad_norm": 0.5368703007698059, + "learning_rate": 7.371956620558375e-05, + "loss": 1.8121, + "step": 11808 + }, + { + "epoch": 3.6246163290362183, + "grad_norm": 0.36627063155174255, + "learning_rate": 7.371519043457795e-05, + "loss": 1.7944, + "step": 11809 + }, + { + "epoch": 3.6249232658072437, + "grad_norm": 0.3100780248641968, + "learning_rate": 7.371081442920476e-05, + "loss": 1.783, + "step": 11810 + }, + { + "epoch": 3.625230202578269, + "grad_norm": 0.3277178704738617, + "learning_rate": 7.370643818950741e-05, + "loss": 1.8105, + "step": 11811 + }, + { + "epoch": 3.625537139349294, + "grad_norm": 0.3887772560119629, + "learning_rate": 7.370206171552914e-05, + "loss": 1.8136, + "step": 11812 + }, + { + "epoch": 3.6258440761203192, + "grad_norm": 0.2770824134349823, + "learning_rate": 7.36976850073132e-05, + "loss": 1.7852, + "step": 11813 + }, + { + "epoch": 3.626151012891344, + "grad_norm": 0.26357728242874146, + "learning_rate": 7.369330806490284e-05, + "loss": 1.7621, + "step": 11814 + }, + { + "epoch": 3.6264579496623695, + "grad_norm": 0.3387344181537628, + "learning_rate": 7.368893088834135e-05, + "loss": 1.7785, + "step": 11815 + }, + { + "epoch": 3.626764886433395, + "grad_norm": 0.35155174136161804, + "learning_rate": 7.368455347767193e-05, + "loss": 1.8081, + "step": 11816 + }, + { + "epoch": 3.62707182320442, + "grad_norm": 0.2855289876461029, + "learning_rate": 7.368017583293788e-05, + "loss": 1.8245, + "step": 11817 + }, + { + "epoch": 3.627378759975445, + "grad_norm": 0.28462162613868713, + "learning_rate": 7.367579795418245e-05, + "loss": 1.8066, + "step": 11818 + }, + { + "epoch": 3.6276856967464703, + "grad_norm": 0.40696555376052856, + "learning_rate": 7.367141984144891e-05, + "loss": 1.8897, + "step": 11819 + }, + { + "epoch": 3.6279926335174952, + 
"grad_norm": 0.472782701253891, + "learning_rate": 7.366704149478054e-05, + "loss": 1.8071, + "step": 11820 + }, + { + "epoch": 3.6282995702885206, + "grad_norm": 0.27022916078567505, + "learning_rate": 7.366266291422057e-05, + "loss": 1.8574, + "step": 11821 + }, + { + "epoch": 3.628606507059546, + "grad_norm": 0.4207148253917694, + "learning_rate": 7.365828409981231e-05, + "loss": 1.7759, + "step": 11822 + }, + { + "epoch": 3.628913443830571, + "grad_norm": 0.42866072058677673, + "learning_rate": 7.365390505159902e-05, + "loss": 1.7366, + "step": 11823 + }, + { + "epoch": 3.629220380601596, + "grad_norm": 0.28288859128952026, + "learning_rate": 7.364952576962398e-05, + "loss": 1.8591, + "step": 11824 + }, + { + "epoch": 3.629527317372621, + "grad_norm": 0.30544906854629517, + "learning_rate": 7.364514625393045e-05, + "loss": 1.7965, + "step": 11825 + }, + { + "epoch": 3.6298342541436464, + "grad_norm": 0.3251616954803467, + "learning_rate": 7.364076650456173e-05, + "loss": 1.8197, + "step": 11826 + }, + { + "epoch": 3.6301411909146717, + "grad_norm": 0.3133888840675354, + "learning_rate": 7.363638652156109e-05, + "loss": 1.7978, + "step": 11827 + }, + { + "epoch": 3.630448127685697, + "grad_norm": 0.29004594683647156, + "learning_rate": 7.363200630497185e-05, + "loss": 1.8035, + "step": 11828 + }, + { + "epoch": 3.630755064456722, + "grad_norm": 0.2781279683113098, + "learning_rate": 7.362762585483725e-05, + "loss": 1.8462, + "step": 11829 + }, + { + "epoch": 3.6310620012277472, + "grad_norm": 0.29003822803497314, + "learning_rate": 7.362324517120063e-05, + "loss": 1.7952, + "step": 11830 + }, + { + "epoch": 3.631368937998772, + "grad_norm": 0.2510940134525299, + "learning_rate": 7.361886425410524e-05, + "loss": 1.7645, + "step": 11831 + }, + { + "epoch": 3.6316758747697975, + "grad_norm": 0.23798540234565735, + "learning_rate": 7.361448310359438e-05, + "loss": 1.7329, + "step": 11832 + }, + { + "epoch": 3.631982811540823, + "grad_norm": 0.2711278796195984, + 
"learning_rate": 7.361010171971137e-05, + "loss": 1.8245, + "step": 11833 + }, + { + "epoch": 3.6322897483118477, + "grad_norm": 0.2895669639110565, + "learning_rate": 7.360572010249949e-05, + "loss": 1.7668, + "step": 11834 + }, + { + "epoch": 3.632596685082873, + "grad_norm": 0.2216273844242096, + "learning_rate": 7.360133825200205e-05, + "loss": 1.8164, + "step": 11835 + }, + { + "epoch": 3.632903621853898, + "grad_norm": 0.3075082302093506, + "learning_rate": 7.359695616826236e-05, + "loss": 1.8159, + "step": 11836 + }, + { + "epoch": 3.6332105586249233, + "grad_norm": 0.3208801746368408, + "learning_rate": 7.35925738513237e-05, + "loss": 1.8385, + "step": 11837 + }, + { + "epoch": 3.6335174953959486, + "grad_norm": 0.272517591714859, + "learning_rate": 7.35881913012294e-05, + "loss": 1.7653, + "step": 11838 + }, + { + "epoch": 3.6338244321669735, + "grad_norm": 0.23105360567569733, + "learning_rate": 7.358380851802277e-05, + "loss": 1.7697, + "step": 11839 + }, + { + "epoch": 3.634131368937999, + "grad_norm": 0.2643153667449951, + "learning_rate": 7.357942550174714e-05, + "loss": 1.7885, + "step": 11840 + }, + { + "epoch": 3.6344383057090237, + "grad_norm": 0.22643202543258667, + "learning_rate": 7.357504225244579e-05, + "loss": 1.746, + "step": 11841 + }, + { + "epoch": 3.634745242480049, + "grad_norm": 0.27782970666885376, + "learning_rate": 7.357065877016207e-05, + "loss": 1.794, + "step": 11842 + }, + { + "epoch": 3.6350521792510744, + "grad_norm": 0.3035561740398407, + "learning_rate": 7.356627505493925e-05, + "loss": 1.7892, + "step": 11843 + }, + { + "epoch": 3.6353591160220997, + "grad_norm": 0.31859731674194336, + "learning_rate": 7.356189110682072e-05, + "loss": 1.7636, + "step": 11844 + }, + { + "epoch": 3.6356660527931246, + "grad_norm": 0.2960890233516693, + "learning_rate": 7.355750692584977e-05, + "loss": 1.8294, + "step": 11845 + }, + { + "epoch": 3.63597298956415, + "grad_norm": 0.2544194459915161, + "learning_rate": 7.355312251206972e-05, + 
"loss": 1.7603, + "step": 11846 + }, + { + "epoch": 3.636279926335175, + "grad_norm": 0.27864789962768555, + "learning_rate": 7.354873786552391e-05, + "loss": 1.7917, + "step": 11847 + }, + { + "epoch": 3.6365868631062, + "grad_norm": 0.32552552223205566, + "learning_rate": 7.354435298625568e-05, + "loss": 1.7769, + "step": 11848 + }, + { + "epoch": 3.6368937998772255, + "grad_norm": 0.25094640254974365, + "learning_rate": 7.353996787430833e-05, + "loss": 1.8371, + "step": 11849 + }, + { + "epoch": 3.6372007366482504, + "grad_norm": 0.26656433939933777, + "learning_rate": 7.353558252972524e-05, + "loss": 1.7686, + "step": 11850 + }, + { + "epoch": 3.6375076734192757, + "grad_norm": 0.3023635745048523, + "learning_rate": 7.353119695254973e-05, + "loss": 1.7892, + "step": 11851 + }, + { + "epoch": 3.6378146101903006, + "grad_norm": 0.2822463810443878, + "learning_rate": 7.352681114282514e-05, + "loss": 1.8221, + "step": 11852 + }, + { + "epoch": 3.638121546961326, + "grad_norm": 0.31159496307373047, + "learning_rate": 7.35224251005948e-05, + "loss": 1.803, + "step": 11853 + }, + { + "epoch": 3.6384284837323513, + "grad_norm": 0.3133087158203125, + "learning_rate": 7.351803882590207e-05, + "loss": 1.744, + "step": 11854 + }, + { + "epoch": 3.638735420503376, + "grad_norm": 0.3050002455711365, + "learning_rate": 7.351365231879029e-05, + "loss": 1.7522, + "step": 11855 + }, + { + "epoch": 3.6390423572744015, + "grad_norm": 0.2729037404060364, + "learning_rate": 7.350926557930283e-05, + "loss": 1.7629, + "step": 11856 + }, + { + "epoch": 3.6393492940454264, + "grad_norm": 0.3181995153427124, + "learning_rate": 7.350487860748303e-05, + "loss": 1.7603, + "step": 11857 + }, + { + "epoch": 3.6396562308164517, + "grad_norm": 0.352651447057724, + "learning_rate": 7.350049140337423e-05, + "loss": 1.8177, + "step": 11858 + }, + { + "epoch": 3.639963167587477, + "grad_norm": 0.22935177385807037, + "learning_rate": 7.349610396701981e-05, + "loss": 1.7421, + "step": 11859 + }, + { 
+ "epoch": 3.6402701043585024, + "grad_norm": 0.26442599296569824, + "learning_rate": 7.349171629846312e-05, + "loss": 1.8026, + "step": 11860 + }, + { + "epoch": 3.6405770411295273, + "grad_norm": 0.25357648730278015, + "learning_rate": 7.348732839774751e-05, + "loss": 1.788, + "step": 11861 + }, + { + "epoch": 3.6408839779005526, + "grad_norm": 0.26959577202796936, + "learning_rate": 7.348294026491635e-05, + "loss": 1.884, + "step": 11862 + }, + { + "epoch": 3.6411909146715775, + "grad_norm": 0.2243001013994217, + "learning_rate": 7.347855190001304e-05, + "loss": 1.7765, + "step": 11863 + }, + { + "epoch": 3.641497851442603, + "grad_norm": 0.2480708807706833, + "learning_rate": 7.34741633030809e-05, + "loss": 1.7597, + "step": 11864 + }, + { + "epoch": 3.641804788213628, + "grad_norm": 0.22512994706630707, + "learning_rate": 7.346977447416332e-05, + "loss": 1.7647, + "step": 11865 + }, + { + "epoch": 3.642111724984653, + "grad_norm": 0.24961981177330017, + "learning_rate": 7.346538541330368e-05, + "loss": 1.8178, + "step": 11866 + }, + { + "epoch": 3.6424186617556784, + "grad_norm": 0.320896714925766, + "learning_rate": 7.346099612054533e-05, + "loss": 1.85, + "step": 11867 + }, + { + "epoch": 3.6427255985267033, + "grad_norm": 0.3420880436897278, + "learning_rate": 7.345660659593167e-05, + "loss": 1.8661, + "step": 11868 + }, + { + "epoch": 3.6430325352977286, + "grad_norm": 0.2675844132900238, + "learning_rate": 7.34522168395061e-05, + "loss": 1.8177, + "step": 11869 + }, + { + "epoch": 3.643339472068754, + "grad_norm": 0.23993943631649017, + "learning_rate": 7.344782685131195e-05, + "loss": 1.7365, + "step": 11870 + }, + { + "epoch": 3.643646408839779, + "grad_norm": 0.21805813908576965, + "learning_rate": 7.344343663139264e-05, + "loss": 1.7813, + "step": 11871 + }, + { + "epoch": 3.643953345610804, + "grad_norm": 0.24334421753883362, + "learning_rate": 7.343904617979154e-05, + "loss": 1.7763, + "step": 11872 + }, + { + "epoch": 3.644260282381829, + 
"grad_norm": 0.22768431901931763, + "learning_rate": 7.343465549655206e-05, + "loss": 1.7817, + "step": 11873 + }, + { + "epoch": 3.6445672191528544, + "grad_norm": 0.23828962445259094, + "learning_rate": 7.343026458171757e-05, + "loss": 1.8391, + "step": 11874 + }, + { + "epoch": 3.6448741559238798, + "grad_norm": 0.24838197231292725, + "learning_rate": 7.342587343533149e-05, + "loss": 1.759, + "step": 11875 + }, + { + "epoch": 3.645181092694905, + "grad_norm": 0.22732019424438477, + "learning_rate": 7.342148205743718e-05, + "loss": 1.7348, + "step": 11876 + }, + { + "epoch": 3.64548802946593, + "grad_norm": 0.25106775760650635, + "learning_rate": 7.341709044807807e-05, + "loss": 1.8121, + "step": 11877 + }, + { + "epoch": 3.6457949662369553, + "grad_norm": 0.28532838821411133, + "learning_rate": 7.341269860729753e-05, + "loss": 1.7147, + "step": 11878 + }, + { + "epoch": 3.64610190300798, + "grad_norm": 0.3041890859603882, + "learning_rate": 7.340830653513899e-05, + "loss": 1.7666, + "step": 11879 + }, + { + "epoch": 3.6464088397790055, + "grad_norm": 0.3142147958278656, + "learning_rate": 7.340391423164585e-05, + "loss": 1.8707, + "step": 11880 + }, + { + "epoch": 3.646715776550031, + "grad_norm": 0.28531381487846375, + "learning_rate": 7.339952169686151e-05, + "loss": 1.7961, + "step": 11881 + }, + { + "epoch": 3.6470227133210558, + "grad_norm": 0.33779671788215637, + "learning_rate": 7.339512893082938e-05, + "loss": 1.7428, + "step": 11882 + }, + { + "epoch": 3.647329650092081, + "grad_norm": 0.29611849784851074, + "learning_rate": 7.339073593359287e-05, + "loss": 1.8803, + "step": 11883 + }, + { + "epoch": 3.647636586863106, + "grad_norm": 0.31248557567596436, + "learning_rate": 7.33863427051954e-05, + "loss": 1.7868, + "step": 11884 + }, + { + "epoch": 3.6479435236341313, + "grad_norm": 0.42829564213752747, + "learning_rate": 7.338194924568039e-05, + "loss": 1.8558, + "step": 11885 + }, + { + "epoch": 3.6482504604051567, + "grad_norm": 0.431023508310318, + 
"learning_rate": 7.337755555509126e-05, + "loss": 1.7565, + "step": 11886 + }, + { + "epoch": 3.6485573971761815, + "grad_norm": 0.2917975187301636, + "learning_rate": 7.33731616334714e-05, + "loss": 1.8067, + "step": 11887 + }, + { + "epoch": 3.648864333947207, + "grad_norm": 0.3072175085544586, + "learning_rate": 7.336876748086427e-05, + "loss": 1.782, + "step": 11888 + }, + { + "epoch": 3.6491712707182318, + "grad_norm": 0.33658862113952637, + "learning_rate": 7.336437309731327e-05, + "loss": 1.8007, + "step": 11889 + }, + { + "epoch": 3.649478207489257, + "grad_norm": 0.23774033784866333, + "learning_rate": 7.335997848286185e-05, + "loss": 1.7606, + "step": 11890 + }, + { + "epoch": 3.6497851442602824, + "grad_norm": 0.3373236358165741, + "learning_rate": 7.335558363755344e-05, + "loss": 1.7335, + "step": 11891 + }, + { + "epoch": 3.650092081031308, + "grad_norm": 0.3906517028808594, + "learning_rate": 7.335118856143145e-05, + "loss": 1.7974, + "step": 11892 + }, + { + "epoch": 3.6503990178023327, + "grad_norm": 0.37715303897857666, + "learning_rate": 7.334679325453934e-05, + "loss": 1.8875, + "step": 11893 + }, + { + "epoch": 3.650705954573358, + "grad_norm": 0.278540700674057, + "learning_rate": 7.334239771692053e-05, + "loss": 1.8165, + "step": 11894 + }, + { + "epoch": 3.651012891344383, + "grad_norm": 0.24434895813465118, + "learning_rate": 7.333800194861845e-05, + "loss": 1.7756, + "step": 11895 + }, + { + "epoch": 3.6513198281154082, + "grad_norm": 0.25057271122932434, + "learning_rate": 7.333360594967658e-05, + "loss": 1.7932, + "step": 11896 + }, + { + "epoch": 3.6516267648864336, + "grad_norm": 0.3277342617511749, + "learning_rate": 7.332920972013833e-05, + "loss": 1.7781, + "step": 11897 + }, + { + "epoch": 3.6519337016574585, + "grad_norm": 0.2754829525947571, + "learning_rate": 7.332481326004715e-05, + "loss": 1.7916, + "step": 11898 + }, + { + "epoch": 3.652240638428484, + "grad_norm": 0.24490588903427124, + "learning_rate": 7.332041656944651e-05, 
+ "loss": 1.7904, + "step": 11899 + }, + { + "epoch": 3.6525475751995087, + "grad_norm": 0.3176959455013275, + "learning_rate": 7.331601964837982e-05, + "loss": 1.7379, + "step": 11900 + }, + { + "epoch": 3.652854511970534, + "grad_norm": 0.3435784876346588, + "learning_rate": 7.331162249689057e-05, + "loss": 1.7635, + "step": 11901 + }, + { + "epoch": 3.6531614487415593, + "grad_norm": 0.335697740316391, + "learning_rate": 7.330722511502221e-05, + "loss": 1.7903, + "step": 11902 + }, + { + "epoch": 3.6534683855125847, + "grad_norm": 0.2748894691467285, + "learning_rate": 7.330282750281819e-05, + "loss": 1.8259, + "step": 11903 + }, + { + "epoch": 3.6537753222836096, + "grad_norm": 0.36754751205444336, + "learning_rate": 7.329842966032197e-05, + "loss": 1.7728, + "step": 11904 + }, + { + "epoch": 3.654082259054635, + "grad_norm": 0.4355713129043579, + "learning_rate": 7.3294031587577e-05, + "loss": 1.7447, + "step": 11905 + }, + { + "epoch": 3.65438919582566, + "grad_norm": 0.3967476487159729, + "learning_rate": 7.328963328462677e-05, + "loss": 1.8299, + "step": 11906 + }, + { + "epoch": 3.654696132596685, + "grad_norm": 0.23805755376815796, + "learning_rate": 7.328523475151472e-05, + "loss": 1.7631, + "step": 11907 + }, + { + "epoch": 3.6550030693677105, + "grad_norm": 0.40350377559661865, + "learning_rate": 7.328083598828435e-05, + "loss": 1.8693, + "step": 11908 + }, + { + "epoch": 3.6553100061387354, + "grad_norm": 0.4743673801422119, + "learning_rate": 7.32764369949791e-05, + "loss": 1.7887, + "step": 11909 + }, + { + "epoch": 3.6556169429097607, + "grad_norm": 0.33830127120018005, + "learning_rate": 7.327203777164246e-05, + "loss": 1.7527, + "step": 11910 + }, + { + "epoch": 3.6559238796807856, + "grad_norm": 0.2465003877878189, + "learning_rate": 7.326763831831791e-05, + "loss": 1.7898, + "step": 11911 + }, + { + "epoch": 3.656230816451811, + "grad_norm": 0.31647852063179016, + "learning_rate": 7.326323863504892e-05, + "loss": 1.8056, + "step": 11912 + }, + 
{ + "epoch": 3.6565377532228363, + "grad_norm": 0.31436124444007874, + "learning_rate": 7.325883872187896e-05, + "loss": 1.7972, + "step": 11913 + }, + { + "epoch": 3.656844689993861, + "grad_norm": 0.260405957698822, + "learning_rate": 7.325443857885153e-05, + "loss": 1.8109, + "step": 11914 + }, + { + "epoch": 3.6571516267648865, + "grad_norm": 0.29312583804130554, + "learning_rate": 7.325003820601011e-05, + "loss": 1.8947, + "step": 11915 + }, + { + "epoch": 3.6574585635359114, + "grad_norm": 0.2641582190990448, + "learning_rate": 7.324563760339819e-05, + "loss": 1.7737, + "step": 11916 + }, + { + "epoch": 3.6577655003069367, + "grad_norm": 0.2338121086359024, + "learning_rate": 7.324123677105923e-05, + "loss": 1.7462, + "step": 11917 + }, + { + "epoch": 3.658072437077962, + "grad_norm": 0.27877378463745117, + "learning_rate": 7.323683570903676e-05, + "loss": 1.8371, + "step": 11918 + }, + { + "epoch": 3.6583793738489874, + "grad_norm": 0.24238766729831696, + "learning_rate": 7.323243441737427e-05, + "loss": 1.7304, + "step": 11919 + }, + { + "epoch": 3.6586863106200123, + "grad_norm": 0.2349759042263031, + "learning_rate": 7.322803289611525e-05, + "loss": 1.7422, + "step": 11920 + }, + { + "epoch": 3.6589932473910376, + "grad_norm": 0.2254217565059662, + "learning_rate": 7.322363114530318e-05, + "loss": 1.7296, + "step": 11921 + }, + { + "epoch": 3.6593001841620625, + "grad_norm": 0.24533270299434662, + "learning_rate": 7.321922916498158e-05, + "loss": 1.7834, + "step": 11922 + }, + { + "epoch": 3.659607120933088, + "grad_norm": 0.24993161857128143, + "learning_rate": 7.321482695519393e-05, + "loss": 1.8502, + "step": 11923 + }, + { + "epoch": 3.659914057704113, + "grad_norm": 0.2540178894996643, + "learning_rate": 7.321042451598378e-05, + "loss": 1.8372, + "step": 11924 + }, + { + "epoch": 3.660220994475138, + "grad_norm": 0.2241390198469162, + "learning_rate": 7.32060218473946e-05, + "loss": 1.7619, + "step": 11925 + }, + { + "epoch": 3.6605279312461634, + 
"grad_norm": 0.2137840837240219, + "learning_rate": 7.32016189494699e-05, + "loss": 1.751, + "step": 11926 + }, + { + "epoch": 3.6608348680171883, + "grad_norm": 0.2596585154533386, + "learning_rate": 7.319721582225323e-05, + "loss": 1.7773, + "step": 11927 + }, + { + "epoch": 3.6611418047882136, + "grad_norm": 0.24898354709148407, + "learning_rate": 7.319281246578806e-05, + "loss": 1.7347, + "step": 11928 + }, + { + "epoch": 3.661448741559239, + "grad_norm": 0.26553863286972046, + "learning_rate": 7.31884088801179e-05, + "loss": 1.7812, + "step": 11929 + }, + { + "epoch": 3.661755678330264, + "grad_norm": 0.2494438737630844, + "learning_rate": 7.318400506528633e-05, + "loss": 1.7554, + "step": 11930 + }, + { + "epoch": 3.662062615101289, + "grad_norm": 0.2794995903968811, + "learning_rate": 7.317960102133682e-05, + "loss": 1.7495, + "step": 11931 + }, + { + "epoch": 3.662369551872314, + "grad_norm": 0.2843860983848572, + "learning_rate": 7.317519674831293e-05, + "loss": 1.7734, + "step": 11932 + }, + { + "epoch": 3.6626764886433394, + "grad_norm": 0.28261128067970276, + "learning_rate": 7.317079224625813e-05, + "loss": 1.7794, + "step": 11933 + }, + { + "epoch": 3.6629834254143647, + "grad_norm": 0.2552426755428314, + "learning_rate": 7.316638751521599e-05, + "loss": 1.8397, + "step": 11934 + }, + { + "epoch": 3.66329036218539, + "grad_norm": 0.4140608608722687, + "learning_rate": 7.316198255523002e-05, + "loss": 1.848, + "step": 11935 + }, + { + "epoch": 3.663597298956415, + "grad_norm": 0.3709854483604431, + "learning_rate": 7.315757736634377e-05, + "loss": 1.8489, + "step": 11936 + }, + { + "epoch": 3.6639042357274403, + "grad_norm": 0.23637300729751587, + "learning_rate": 7.315317194860078e-05, + "loss": 1.7549, + "step": 11937 + }, + { + "epoch": 3.664211172498465, + "grad_norm": 0.32884421944618225, + "learning_rate": 7.314876630204456e-05, + "loss": 1.8061, + "step": 11938 + }, + { + "epoch": 3.6645181092694905, + "grad_norm": 0.33354130387306213, + 
"learning_rate": 7.314436042671867e-05, + "loss": 1.8346, + "step": 11939 + }, + { + "epoch": 3.664825046040516, + "grad_norm": 0.25776317715644836, + "learning_rate": 7.313995432266663e-05, + "loss": 1.8598, + "step": 11940 + }, + { + "epoch": 3.6651319828115407, + "grad_norm": 0.2910402715206146, + "learning_rate": 7.313554798993202e-05, + "loss": 1.7613, + "step": 11941 + }, + { + "epoch": 3.665438919582566, + "grad_norm": 0.3487538695335388, + "learning_rate": 7.313114142855836e-05, + "loss": 1.8105, + "step": 11942 + }, + { + "epoch": 3.665745856353591, + "grad_norm": 0.27271291613578796, + "learning_rate": 7.312673463858918e-05, + "loss": 1.8107, + "step": 11943 + }, + { + "epoch": 3.6660527931246163, + "grad_norm": 0.2613036632537842, + "learning_rate": 7.312232762006809e-05, + "loss": 1.7871, + "step": 11944 + }, + { + "epoch": 3.6663597298956416, + "grad_norm": 0.30594903230667114, + "learning_rate": 7.311792037303859e-05, + "loss": 1.8043, + "step": 11945 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.3960847854614258, + "learning_rate": 7.311351289754425e-05, + "loss": 1.8434, + "step": 11946 + }, + { + "epoch": 3.666973603437692, + "grad_norm": 0.33369311690330505, + "learning_rate": 7.310910519362861e-05, + "loss": 1.7496, + "step": 11947 + }, + { + "epoch": 3.6672805402087167, + "grad_norm": 0.29852384328842163, + "learning_rate": 7.310469726133528e-05, + "loss": 1.858, + "step": 11948 + }, + { + "epoch": 3.667587476979742, + "grad_norm": 0.2610527276992798, + "learning_rate": 7.310028910070777e-05, + "loss": 1.7642, + "step": 11949 + }, + { + "epoch": 3.6678944137507674, + "grad_norm": 0.3606704771518707, + "learning_rate": 7.309588071178967e-05, + "loss": 1.845, + "step": 11950 + }, + { + "epoch": 3.6682013505217927, + "grad_norm": 0.3157273828983307, + "learning_rate": 7.309147209462454e-05, + "loss": 1.7864, + "step": 11951 + }, + { + "epoch": 3.6685082872928176, + "grad_norm": 0.23907925188541412, + "learning_rate": 
7.308706324925594e-05, + "loss": 1.8363, + "step": 11952 + }, + { + "epoch": 3.668815224063843, + "grad_norm": 0.3365088999271393, + "learning_rate": 7.308265417572747e-05, + "loss": 1.8755, + "step": 11953 + }, + { + "epoch": 3.669122160834868, + "grad_norm": 0.29404979944229126, + "learning_rate": 7.307824487408266e-05, + "loss": 1.8128, + "step": 11954 + }, + { + "epoch": 3.669429097605893, + "grad_norm": 0.2689574658870697, + "learning_rate": 7.307383534436511e-05, + "loss": 1.8072, + "step": 11955 + }, + { + "epoch": 3.6697360343769185, + "grad_norm": 0.28394198417663574, + "learning_rate": 7.306942558661841e-05, + "loss": 1.7919, + "step": 11956 + }, + { + "epoch": 3.6700429711479434, + "grad_norm": 0.2594783902168274, + "learning_rate": 7.306501560088612e-05, + "loss": 1.7467, + "step": 11957 + }, + { + "epoch": 3.6703499079189688, + "grad_norm": 0.24765191972255707, + "learning_rate": 7.30606053872118e-05, + "loss": 1.7876, + "step": 11958 + }, + { + "epoch": 3.6706568446899936, + "grad_norm": 0.22157172858715057, + "learning_rate": 7.305619494563909e-05, + "loss": 1.7802, + "step": 11959 + }, + { + "epoch": 3.670963781461019, + "grad_norm": 0.270151287317276, + "learning_rate": 7.305178427621155e-05, + "loss": 1.7723, + "step": 11960 + }, + { + "epoch": 3.6712707182320443, + "grad_norm": 0.3163939118385315, + "learning_rate": 7.304737337897277e-05, + "loss": 1.8488, + "step": 11961 + }, + { + "epoch": 3.671577655003069, + "grad_norm": 0.2605706453323364, + "learning_rate": 7.304296225396632e-05, + "loss": 1.7442, + "step": 11962 + }, + { + "epoch": 3.6718845917740945, + "grad_norm": 0.31179291009902954, + "learning_rate": 7.303855090123582e-05, + "loss": 1.831, + "step": 11963 + }, + { + "epoch": 3.6721915285451194, + "grad_norm": 0.33365359902381897, + "learning_rate": 7.303413932082483e-05, + "loss": 1.8376, + "step": 11964 + }, + { + "epoch": 3.6724984653161448, + "grad_norm": 0.2952130138874054, + "learning_rate": 7.302972751277701e-05, + "loss": 
1.7733, + "step": 11965 + }, + { + "epoch": 3.67280540208717, + "grad_norm": 0.24270877242088318, + "learning_rate": 7.302531547713592e-05, + "loss": 1.8367, + "step": 11966 + }, + { + "epoch": 3.6731123388581954, + "grad_norm": 0.34315919876098633, + "learning_rate": 7.302090321394517e-05, + "loss": 1.7901, + "step": 11967 + }, + { + "epoch": 3.6734192756292203, + "grad_norm": 0.33511418104171753, + "learning_rate": 7.301649072324834e-05, + "loss": 1.7929, + "step": 11968 + }, + { + "epoch": 3.6737262124002457, + "grad_norm": 0.22397933900356293, + "learning_rate": 7.301207800508907e-05, + "loss": 1.7533, + "step": 11969 + }, + { + "epoch": 3.6740331491712706, + "grad_norm": 0.2882738411426544, + "learning_rate": 7.300766505951095e-05, + "loss": 1.8071, + "step": 11970 + }, + { + "epoch": 3.674340085942296, + "grad_norm": 0.242112398147583, + "learning_rate": 7.300325188655761e-05, + "loss": 1.7739, + "step": 11971 + }, + { + "epoch": 3.674647022713321, + "grad_norm": 0.27754491567611694, + "learning_rate": 7.299883848627265e-05, + "loss": 1.8295, + "step": 11972 + }, + { + "epoch": 3.674953959484346, + "grad_norm": 0.2787899076938629, + "learning_rate": 7.29944248586997e-05, + "loss": 1.7682, + "step": 11973 + }, + { + "epoch": 3.6752608962553714, + "grad_norm": 0.24448934197425842, + "learning_rate": 7.299001100388234e-05, + "loss": 1.7826, + "step": 11974 + }, + { + "epoch": 3.6755678330263963, + "grad_norm": 0.37869495153427124, + "learning_rate": 7.298559692186421e-05, + "loss": 1.8582, + "step": 11975 + }, + { + "epoch": 3.6758747697974217, + "grad_norm": 0.3299996256828308, + "learning_rate": 7.298118261268897e-05, + "loss": 1.7716, + "step": 11976 + }, + { + "epoch": 3.676181706568447, + "grad_norm": 0.278891384601593, + "learning_rate": 7.29767680764002e-05, + "loss": 1.879, + "step": 11977 + }, + { + "epoch": 3.6764886433394723, + "grad_norm": 0.29326459765434265, + "learning_rate": 7.297235331304155e-05, + "loss": 1.804, + "step": 11978 + }, + { + 
"epoch": 3.6767955801104972, + "grad_norm": 0.2697092592716217, + "learning_rate": 7.296793832265663e-05, + "loss": 1.7842, + "step": 11979 + }, + { + "epoch": 3.6771025168815226, + "grad_norm": 0.3045118749141693, + "learning_rate": 7.296352310528909e-05, + "loss": 1.7959, + "step": 11980 + }, + { + "epoch": 3.6774094536525475, + "grad_norm": 0.278647780418396, + "learning_rate": 7.295910766098252e-05, + "loss": 1.7907, + "step": 11981 + }, + { + "epoch": 3.677716390423573, + "grad_norm": 0.2370275855064392, + "learning_rate": 7.295469198978063e-05, + "loss": 1.757, + "step": 11982 + }, + { + "epoch": 3.678023327194598, + "grad_norm": 0.3061021566390991, + "learning_rate": 7.295027609172702e-05, + "loss": 1.7927, + "step": 11983 + }, + { + "epoch": 3.678330263965623, + "grad_norm": 0.2844544053077698, + "learning_rate": 7.294585996686532e-05, + "loss": 1.7705, + "step": 11984 + }, + { + "epoch": 3.6786372007366483, + "grad_norm": 0.31121113896369934, + "learning_rate": 7.29414436152392e-05, + "loss": 1.783, + "step": 11985 + }, + { + "epoch": 3.6789441375076732, + "grad_norm": 0.2566785514354706, + "learning_rate": 7.293702703689225e-05, + "loss": 1.7781, + "step": 11986 + }, + { + "epoch": 3.6792510742786986, + "grad_norm": 0.22176961600780487, + "learning_rate": 7.293261023186818e-05, + "loss": 1.7302, + "step": 11987 + }, + { + "epoch": 3.679558011049724, + "grad_norm": 0.21547441184520721, + "learning_rate": 7.292819320021062e-05, + "loss": 1.7666, + "step": 11988 + }, + { + "epoch": 3.679864947820749, + "grad_norm": 0.26309674978256226, + "learning_rate": 7.29237759419632e-05, + "loss": 1.7817, + "step": 11989 + }, + { + "epoch": 3.680171884591774, + "grad_norm": 0.2558063864707947, + "learning_rate": 7.29193584571696e-05, + "loss": 1.8257, + "step": 11990 + }, + { + "epoch": 3.680478821362799, + "grad_norm": 0.24516844749450684, + "learning_rate": 7.291494074587347e-05, + "loss": 1.7803, + "step": 11991 + }, + { + "epoch": 3.6807857581338244, + "grad_norm": 
0.22891047596931458, + "learning_rate": 7.291052280811843e-05, + "loss": 1.7977, + "step": 11992 + }, + { + "epoch": 3.6810926949048497, + "grad_norm": 0.2776026129722595, + "learning_rate": 7.290610464394822e-05, + "loss": 1.8486, + "step": 11993 + }, + { + "epoch": 3.681399631675875, + "grad_norm": 0.31472426652908325, + "learning_rate": 7.290168625340644e-05, + "loss": 1.7841, + "step": 11994 + }, + { + "epoch": 3.6817065684469, + "grad_norm": 0.3459274470806122, + "learning_rate": 7.289726763653677e-05, + "loss": 1.7458, + "step": 11995 + }, + { + "epoch": 3.6820135052179253, + "grad_norm": 0.23645849525928497, + "learning_rate": 7.289284879338289e-05, + "loss": 1.781, + "step": 11996 + }, + { + "epoch": 3.68232044198895, + "grad_norm": 0.3257114291191101, + "learning_rate": 7.288842972398845e-05, + "loss": 1.8269, + "step": 11997 + }, + { + "epoch": 3.6826273787599755, + "grad_norm": 0.5450126528739929, + "learning_rate": 7.288401042839713e-05, + "loss": 1.8342, + "step": 11998 + }, + { + "epoch": 3.682934315531001, + "grad_norm": 0.5080512762069702, + "learning_rate": 7.287959090665262e-05, + "loss": 1.8097, + "step": 11999 + }, + { + "epoch": 3.6832412523020257, + "grad_norm": 0.3005252480506897, + "learning_rate": 7.287517115879858e-05, + "loss": 1.8271, + "step": 12000 + }, + { + "epoch": 3.683548189073051, + "grad_norm": 0.2760924994945526, + "learning_rate": 7.287075118487869e-05, + "loss": 1.8267, + "step": 12001 + }, + { + "epoch": 3.683855125844076, + "grad_norm": 0.3475865423679352, + "learning_rate": 7.286633098493663e-05, + "loss": 1.785, + "step": 12002 + }, + { + "epoch": 3.6841620626151013, + "grad_norm": 0.2905690670013428, + "learning_rate": 7.286191055901608e-05, + "loss": 1.8283, + "step": 12003 + }, + { + "epoch": 3.6844689993861266, + "grad_norm": 0.23666246235370636, + "learning_rate": 7.285748990716072e-05, + "loss": 1.7665, + "step": 12004 + }, + { + "epoch": 3.6847759361571515, + "grad_norm": 0.32329514622688293, + "learning_rate": 
7.285306902941427e-05, + "loss": 1.7267, + "step": 12005 + }, + { + "epoch": 3.685082872928177, + "grad_norm": 0.32345879077911377, + "learning_rate": 7.28486479258204e-05, + "loss": 1.7529, + "step": 12006 + }, + { + "epoch": 3.6853898096992017, + "grad_norm": 0.2727855443954468, + "learning_rate": 7.284422659642279e-05, + "loss": 1.8279, + "step": 12007 + }, + { + "epoch": 3.685696746470227, + "grad_norm": 0.37847277522087097, + "learning_rate": 7.283980504126513e-05, + "loss": 1.7809, + "step": 12008 + }, + { + "epoch": 3.6860036832412524, + "grad_norm": 0.44694215059280396, + "learning_rate": 7.283538326039113e-05, + "loss": 1.8184, + "step": 12009 + }, + { + "epoch": 3.6863106200122777, + "grad_norm": 0.2868261933326721, + "learning_rate": 7.28309612538445e-05, + "loss": 1.7461, + "step": 12010 + }, + { + "epoch": 3.6866175567833026, + "grad_norm": 0.2601351737976074, + "learning_rate": 7.282653902166894e-05, + "loss": 1.8011, + "step": 12011 + }, + { + "epoch": 3.686924493554328, + "grad_norm": 0.328185498714447, + "learning_rate": 7.282211656390813e-05, + "loss": 1.7934, + "step": 12012 + }, + { + "epoch": 3.687231430325353, + "grad_norm": 0.2712559103965759, + "learning_rate": 7.281769388060578e-05, + "loss": 1.7566, + "step": 12013 + }, + { + "epoch": 3.687538367096378, + "grad_norm": 0.2725805938243866, + "learning_rate": 7.281327097180562e-05, + "loss": 1.8024, + "step": 12014 + }, + { + "epoch": 3.6878453038674035, + "grad_norm": 0.37282630801200867, + "learning_rate": 7.280884783755133e-05, + "loss": 1.7624, + "step": 12015 + }, + { + "epoch": 3.6881522406384284, + "grad_norm": 0.36519256234169006, + "learning_rate": 7.280442447788664e-05, + "loss": 1.8691, + "step": 12016 + }, + { + "epoch": 3.6884591774094537, + "grad_norm": 0.21699345111846924, + "learning_rate": 7.280000089285528e-05, + "loss": 1.7308, + "step": 12017 + }, + { + "epoch": 3.6887661141804786, + "grad_norm": 0.3159945011138916, + "learning_rate": 7.279557708250094e-05, + "loss": 
1.8144, + "step": 12018 + }, + { + "epoch": 3.689073050951504, + "grad_norm": 0.2927449643611908, + "learning_rate": 7.279115304686735e-05, + "loss": 1.7746, + "step": 12019 + }, + { + "epoch": 3.6893799877225293, + "grad_norm": 0.279208242893219, + "learning_rate": 7.278672878599819e-05, + "loss": 1.7678, + "step": 12020 + }, + { + "epoch": 3.689686924493554, + "grad_norm": 0.40005648136138916, + "learning_rate": 7.278230429993725e-05, + "loss": 1.7876, + "step": 12021 + }, + { + "epoch": 3.6899938612645795, + "grad_norm": 0.3444392681121826, + "learning_rate": 7.277787958872824e-05, + "loss": 1.7591, + "step": 12022 + }, + { + "epoch": 3.6903007980356044, + "grad_norm": 0.21841467916965485, + "learning_rate": 7.277345465241485e-05, + "loss": 1.785, + "step": 12023 + }, + { + "epoch": 3.6906077348066297, + "grad_norm": 0.32463181018829346, + "learning_rate": 7.276902949104084e-05, + "loss": 1.8164, + "step": 12024 + }, + { + "epoch": 3.690914671577655, + "grad_norm": 0.36221247911453247, + "learning_rate": 7.276460410464994e-05, + "loss": 1.7529, + "step": 12025 + }, + { + "epoch": 3.6912216083486804, + "grad_norm": 0.24451927840709686, + "learning_rate": 7.276017849328588e-05, + "loss": 1.8031, + "step": 12026 + }, + { + "epoch": 3.6915285451197053, + "grad_norm": 0.3055694103240967, + "learning_rate": 7.275575265699239e-05, + "loss": 1.8158, + "step": 12027 + }, + { + "epoch": 3.6918354818907306, + "grad_norm": 0.4315083622932434, + "learning_rate": 7.27513265958132e-05, + "loss": 1.8322, + "step": 12028 + }, + { + "epoch": 3.6921424186617555, + "grad_norm": 0.3391095697879791, + "learning_rate": 7.274690030979209e-05, + "loss": 1.8214, + "step": 12029 + }, + { + "epoch": 3.692449355432781, + "grad_norm": 0.22714883089065552, + "learning_rate": 7.274247379897277e-05, + "loss": 1.7312, + "step": 12030 + }, + { + "epoch": 3.692756292203806, + "grad_norm": 0.24982765316963196, + "learning_rate": 7.273804706339899e-05, + "loss": 1.738, + "step": 12031 + }, + { + 
"epoch": 3.693063228974831, + "grad_norm": 0.32509860396385193, + "learning_rate": 7.273362010311451e-05, + "loss": 1.7773, + "step": 12032 + }, + { + "epoch": 3.6933701657458564, + "grad_norm": 0.2643086612224579, + "learning_rate": 7.272919291816307e-05, + "loss": 1.7545, + "step": 12033 + }, + { + "epoch": 3.6936771025168813, + "grad_norm": 0.2568800747394562, + "learning_rate": 7.272476550858842e-05, + "loss": 1.8055, + "step": 12034 + }, + { + "epoch": 3.6939840392879066, + "grad_norm": 0.27418240904808044, + "learning_rate": 7.272033787443433e-05, + "loss": 1.7769, + "step": 12035 + }, + { + "epoch": 3.694290976058932, + "grad_norm": 0.2459677755832672, + "learning_rate": 7.271591001574453e-05, + "loss": 1.7971, + "step": 12036 + }, + { + "epoch": 3.694597912829957, + "grad_norm": 0.22349393367767334, + "learning_rate": 7.27114819325628e-05, + "loss": 1.7791, + "step": 12037 + }, + { + "epoch": 3.694904849600982, + "grad_norm": 0.25321197509765625, + "learning_rate": 7.270705362493288e-05, + "loss": 1.7475, + "step": 12038 + }, + { + "epoch": 3.695211786372007, + "grad_norm": 0.2585916519165039, + "learning_rate": 7.270262509289855e-05, + "loss": 1.7801, + "step": 12039 + }, + { + "epoch": 3.6955187231430324, + "grad_norm": 0.2673574686050415, + "learning_rate": 7.269819633650359e-05, + "loss": 1.7578, + "step": 12040 + }, + { + "epoch": 3.6958256599140578, + "grad_norm": 0.2509469985961914, + "learning_rate": 7.269376735579175e-05, + "loss": 1.7994, + "step": 12041 + }, + { + "epoch": 3.696132596685083, + "grad_norm": 0.28527703881263733, + "learning_rate": 7.268933815080679e-05, + "loss": 1.7752, + "step": 12042 + }, + { + "epoch": 3.696439533456108, + "grad_norm": 0.22716578841209412, + "learning_rate": 7.268490872159248e-05, + "loss": 1.7186, + "step": 12043 + }, + { + "epoch": 3.6967464702271333, + "grad_norm": 0.24888403713703156, + "learning_rate": 7.268047906819262e-05, + "loss": 1.7882, + "step": 12044 + }, + { + "epoch": 3.697053406998158, + 
"grad_norm": 0.28976112604141235, + "learning_rate": 7.267604919065096e-05, + "loss": 1.7655, + "step": 12045 + }, + { + "epoch": 3.6973603437691835, + "grad_norm": 0.24668502807617188, + "learning_rate": 7.267161908901131e-05, + "loss": 1.8051, + "step": 12046 + }, + { + "epoch": 3.697667280540209, + "grad_norm": 0.2464776188135147, + "learning_rate": 7.266718876331742e-05, + "loss": 1.809, + "step": 12047 + }, + { + "epoch": 3.6979742173112338, + "grad_norm": 0.27648577094078064, + "learning_rate": 7.266275821361309e-05, + "loss": 1.7869, + "step": 12048 + }, + { + "epoch": 3.698281154082259, + "grad_norm": 0.26427242159843445, + "learning_rate": 7.26583274399421e-05, + "loss": 1.7681, + "step": 12049 + }, + { + "epoch": 3.698588090853284, + "grad_norm": 0.24595285952091217, + "learning_rate": 7.265389644234823e-05, + "loss": 1.7209, + "step": 12050 + }, + { + "epoch": 3.6988950276243093, + "grad_norm": 0.32514405250549316, + "learning_rate": 7.26494652208753e-05, + "loss": 1.8702, + "step": 12051 + }, + { + "epoch": 3.6992019643953347, + "grad_norm": 0.24512936174869537, + "learning_rate": 7.264503377556705e-05, + "loss": 1.784, + "step": 12052 + }, + { + "epoch": 3.69950890116636, + "grad_norm": 0.28698310256004333, + "learning_rate": 7.264060210646733e-05, + "loss": 1.905, + "step": 12053 + }, + { + "epoch": 3.699815837937385, + "grad_norm": 0.2995007336139679, + "learning_rate": 7.263617021361989e-05, + "loss": 1.7822, + "step": 12054 + }, + { + "epoch": 3.7001227747084102, + "grad_norm": 0.25869423151016235, + "learning_rate": 7.263173809706855e-05, + "loss": 1.7988, + "step": 12055 + }, + { + "epoch": 3.700429711479435, + "grad_norm": 0.350918710231781, + "learning_rate": 7.262730575685711e-05, + "loss": 1.9504, + "step": 12056 + }, + { + "epoch": 3.7007366482504604, + "grad_norm": 0.3407665491104126, + "learning_rate": 7.262287319302937e-05, + "loss": 1.8506, + "step": 12057 + }, + { + "epoch": 3.701043585021486, + "grad_norm": 0.3039441704750061, + 
"learning_rate": 7.261844040562915e-05, + "loss": 1.7841, + "step": 12058 + }, + { + "epoch": 3.7013505217925107, + "grad_norm": 0.23483428359031677, + "learning_rate": 7.261400739470023e-05, + "loss": 1.7899, + "step": 12059 + }, + { + "epoch": 3.701657458563536, + "grad_norm": 0.30779507756233215, + "learning_rate": 7.260957416028645e-05, + "loss": 1.8131, + "step": 12060 + }, + { + "epoch": 3.701964395334561, + "grad_norm": 0.29901376366615295, + "learning_rate": 7.26051407024316e-05, + "loss": 1.7861, + "step": 12061 + }, + { + "epoch": 3.7022713321055862, + "grad_norm": 0.30058762431144714, + "learning_rate": 7.260070702117949e-05, + "loss": 1.7485, + "step": 12062 + }, + { + "epoch": 3.7025782688766116, + "grad_norm": 0.24523651599884033, + "learning_rate": 7.259627311657396e-05, + "loss": 1.772, + "step": 12063 + }, + { + "epoch": 3.7028852056476365, + "grad_norm": 0.24375474452972412, + "learning_rate": 7.259183898865882e-05, + "loss": 1.7848, + "step": 12064 + }, + { + "epoch": 3.703192142418662, + "grad_norm": 0.2562403380870819, + "learning_rate": 7.258740463747788e-05, + "loss": 1.7447, + "step": 12065 + }, + { + "epoch": 3.7034990791896867, + "grad_norm": 0.265229195356369, + "learning_rate": 7.258297006307496e-05, + "loss": 1.8111, + "step": 12066 + }, + { + "epoch": 3.703806015960712, + "grad_norm": 0.2836552858352661, + "learning_rate": 7.25785352654939e-05, + "loss": 1.7952, + "step": 12067 + }, + { + "epoch": 3.7041129527317374, + "grad_norm": 0.3269572854042053, + "learning_rate": 7.257410024477852e-05, + "loss": 1.8604, + "step": 12068 + }, + { + "epoch": 3.7044198895027627, + "grad_norm": 0.2391490638256073, + "learning_rate": 7.256966500097264e-05, + "loss": 1.7417, + "step": 12069 + }, + { + "epoch": 3.7047268262737876, + "grad_norm": 0.2610675096511841, + "learning_rate": 7.256522953412011e-05, + "loss": 1.7712, + "step": 12070 + }, + { + "epoch": 3.705033763044813, + "grad_norm": 0.24954774975776672, + "learning_rate": 
7.256079384426477e-05, + "loss": 1.7506, + "step": 12071 + }, + { + "epoch": 3.705340699815838, + "grad_norm": 0.2603892385959625, + "learning_rate": 7.255635793145042e-05, + "loss": 1.8105, + "step": 12072 + }, + { + "epoch": 3.705647636586863, + "grad_norm": 0.32728591561317444, + "learning_rate": 7.255192179572092e-05, + "loss": 1.8448, + "step": 12073 + }, + { + "epoch": 3.7059545733578885, + "grad_norm": 0.4559340178966522, + "learning_rate": 7.254748543712013e-05, + "loss": 1.7232, + "step": 12074 + }, + { + "epoch": 3.7062615101289134, + "grad_norm": 0.36526206135749817, + "learning_rate": 7.254304885569186e-05, + "loss": 1.7874, + "step": 12075 + }, + { + "epoch": 3.7065684468999387, + "grad_norm": 0.21606837213039398, + "learning_rate": 7.253861205147998e-05, + "loss": 1.7266, + "step": 12076 + }, + { + "epoch": 3.7068753836709636, + "grad_norm": 0.3629585802555084, + "learning_rate": 7.253417502452831e-05, + "loss": 1.7722, + "step": 12077 + }, + { + "epoch": 3.707182320441989, + "grad_norm": 0.4224923551082611, + "learning_rate": 7.252973777488072e-05, + "loss": 1.7369, + "step": 12078 + }, + { + "epoch": 3.7074892572130143, + "grad_norm": 0.32245784997940063, + "learning_rate": 7.252530030258106e-05, + "loss": 1.7836, + "step": 12079 + }, + { + "epoch": 3.707796193984039, + "grad_norm": 0.29909494519233704, + "learning_rate": 7.252086260767317e-05, + "loss": 1.8718, + "step": 12080 + }, + { + "epoch": 3.7081031307550645, + "grad_norm": 0.21995799243450165, + "learning_rate": 7.251642469020093e-05, + "loss": 1.7103, + "step": 12081 + }, + { + "epoch": 3.7084100675260894, + "grad_norm": 0.2737572193145752, + "learning_rate": 7.251198655020818e-05, + "loss": 1.7787, + "step": 12082 + }, + { + "epoch": 3.7087170042971147, + "grad_norm": 0.22417058050632477, + "learning_rate": 7.250754818773879e-05, + "loss": 1.7782, + "step": 12083 + }, + { + "epoch": 3.70902394106814, + "grad_norm": 0.3350662887096405, + "learning_rate": 7.25031096028366e-05, + "loss": 
1.8193, + "step": 12084 + }, + { + "epoch": 3.7093308778391654, + "grad_norm": 0.3199101686477661, + "learning_rate": 7.24986707955455e-05, + "loss": 1.831, + "step": 12085 + }, + { + "epoch": 3.7096378146101903, + "grad_norm": 0.2513977289199829, + "learning_rate": 7.249423176590936e-05, + "loss": 1.8288, + "step": 12086 + }, + { + "epoch": 3.7099447513812156, + "grad_norm": 0.30411866307258606, + "learning_rate": 7.248979251397203e-05, + "loss": 1.7837, + "step": 12087 + }, + { + "epoch": 3.7102516881522405, + "grad_norm": 0.30755332112312317, + "learning_rate": 7.248535303977738e-05, + "loss": 1.8016, + "step": 12088 + }, + { + "epoch": 3.710558624923266, + "grad_norm": 0.25746986269950867, + "learning_rate": 7.248091334336929e-05, + "loss": 1.8014, + "step": 12089 + }, + { + "epoch": 3.710865561694291, + "grad_norm": 0.3327447772026062, + "learning_rate": 7.247647342479164e-05, + "loss": 1.752, + "step": 12090 + }, + { + "epoch": 3.711172498465316, + "grad_norm": 0.3101816475391388, + "learning_rate": 7.247203328408832e-05, + "loss": 1.7867, + "step": 12091 + }, + { + "epoch": 3.7114794352363414, + "grad_norm": 0.2168906182050705, + "learning_rate": 7.246759292130318e-05, + "loss": 1.7452, + "step": 12092 + }, + { + "epoch": 3.7117863720073663, + "grad_norm": 0.34260258078575134, + "learning_rate": 7.246315233648013e-05, + "loss": 1.8156, + "step": 12093 + }, + { + "epoch": 3.7120933087783916, + "grad_norm": 0.2730714976787567, + "learning_rate": 7.245871152966303e-05, + "loss": 1.7429, + "step": 12094 + }, + { + "epoch": 3.712400245549417, + "grad_norm": 0.2560936212539673, + "learning_rate": 7.245427050089578e-05, + "loss": 1.7969, + "step": 12095 + }, + { + "epoch": 3.712707182320442, + "grad_norm": 0.27510303258895874, + "learning_rate": 7.244982925022228e-05, + "loss": 1.7981, + "step": 12096 + }, + { + "epoch": 3.713014119091467, + "grad_norm": 0.29171642661094666, + "learning_rate": 7.24453877776864e-05, + "loss": 1.7913, + "step": 12097 + }, + { + 
"epoch": 3.713321055862492, + "grad_norm": 0.26431843638420105, + "learning_rate": 7.244094608333206e-05, + "loss": 1.8262, + "step": 12098 + }, + { + "epoch": 3.7136279926335174, + "grad_norm": 0.30747905373573303, + "learning_rate": 7.243650416720311e-05, + "loss": 1.7951, + "step": 12099 + }, + { + "epoch": 3.7139349294045427, + "grad_norm": 0.346443772315979, + "learning_rate": 7.24320620293435e-05, + "loss": 1.7677, + "step": 12100 + }, + { + "epoch": 3.714241866175568, + "grad_norm": 0.2910652458667755, + "learning_rate": 7.242761966979709e-05, + "loss": 1.7887, + "step": 12101 + }, + { + "epoch": 3.714548802946593, + "grad_norm": 0.22342006862163544, + "learning_rate": 7.24231770886078e-05, + "loss": 1.7678, + "step": 12102 + }, + { + "epoch": 3.7148557397176183, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.241873428581954e-05, + "loss": 1.7436, + "step": 12103 + }, + { + "epoch": 3.715162676488643, + "grad_norm": 0.23542635142803192, + "learning_rate": 7.24142912614762e-05, + "loss": 1.7942, + "step": 12104 + }, + { + "epoch": 3.7154696132596685, + "grad_norm": 0.22476384043693542, + "learning_rate": 7.240984801562169e-05, + "loss": 1.8235, + "step": 12105 + }, + { + "epoch": 3.715776550030694, + "grad_norm": 0.25123465061187744, + "learning_rate": 7.240540454829992e-05, + "loss": 1.8112, + "step": 12106 + }, + { + "epoch": 3.7160834868017187, + "grad_norm": 0.27230000495910645, + "learning_rate": 7.240096085955483e-05, + "loss": 1.8312, + "step": 12107 + }, + { + "epoch": 3.716390423572744, + "grad_norm": 0.2722976803779602, + "learning_rate": 7.239651694943031e-05, + "loss": 1.8368, + "step": 12108 + }, + { + "epoch": 3.716697360343769, + "grad_norm": 0.264138400554657, + "learning_rate": 7.239207281797028e-05, + "loss": 1.8206, + "step": 12109 + }, + { + "epoch": 3.7170042971147943, + "grad_norm": 0.28813931345939636, + "learning_rate": 7.238762846521866e-05, + "loss": 1.7391, + "step": 12110 + }, + { + "epoch": 3.7173112338858196, + 
"grad_norm": 0.2319631576538086, + "learning_rate": 7.238318389121939e-05, + "loss": 1.7574, + "step": 12111 + }, + { + "epoch": 3.717618170656845, + "grad_norm": 0.2507809102535248, + "learning_rate": 7.237873909601635e-05, + "loss": 1.7359, + "step": 12112 + }, + { + "epoch": 3.71792510742787, + "grad_norm": 0.2717304825782776, + "learning_rate": 7.237429407965351e-05, + "loss": 1.774, + "step": 12113 + }, + { + "epoch": 3.718232044198895, + "grad_norm": 0.2619280517101288, + "learning_rate": 7.236984884217478e-05, + "loss": 1.8083, + "step": 12114 + }, + { + "epoch": 3.71853898096992, + "grad_norm": 0.22268806397914886, + "learning_rate": 7.23654033836241e-05, + "loss": 1.7436, + "step": 12115 + }, + { + "epoch": 3.7188459177409454, + "grad_norm": 0.2341407984495163, + "learning_rate": 7.236095770404539e-05, + "loss": 1.7807, + "step": 12116 + }, + { + "epoch": 3.7191528545119708, + "grad_norm": 0.23519712686538696, + "learning_rate": 7.235651180348258e-05, + "loss": 1.8051, + "step": 12117 + }, + { + "epoch": 3.7194597912829956, + "grad_norm": 0.2391074150800705, + "learning_rate": 7.235206568197963e-05, + "loss": 1.8377, + "step": 12118 + }, + { + "epoch": 3.719766728054021, + "grad_norm": 0.26821592450141907, + "learning_rate": 7.234761933958045e-05, + "loss": 1.8586, + "step": 12119 + }, + { + "epoch": 3.720073664825046, + "grad_norm": 0.24971134960651398, + "learning_rate": 7.234317277632902e-05, + "loss": 1.8404, + "step": 12120 + }, + { + "epoch": 3.720380601596071, + "grad_norm": 0.20817919075489044, + "learning_rate": 7.233872599226926e-05, + "loss": 1.7204, + "step": 12121 + }, + { + "epoch": 3.7206875383670965, + "grad_norm": 0.29301291704177856, + "learning_rate": 7.233427898744509e-05, + "loss": 1.8528, + "step": 12122 + }, + { + "epoch": 3.7209944751381214, + "grad_norm": 0.22214651107788086, + "learning_rate": 7.23298317619005e-05, + "loss": 1.748, + "step": 12123 + }, + { + "epoch": 3.7213014119091468, + "grad_norm": 0.2511044442653656, + 
"learning_rate": 7.232538431567941e-05, + "loss": 1.8146, + "step": 12124 + }, + { + "epoch": 3.7216083486801717, + "grad_norm": 0.26976367831230164, + "learning_rate": 7.232093664882581e-05, + "loss": 1.8483, + "step": 12125 + }, + { + "epoch": 3.721915285451197, + "grad_norm": 0.2538089156150818, + "learning_rate": 7.231648876138361e-05, + "loss": 1.8097, + "step": 12126 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.2353016883134842, + "learning_rate": 7.231204065339682e-05, + "loss": 1.737, + "step": 12127 + }, + { + "epoch": 3.7225291589932477, + "grad_norm": 0.3205147981643677, + "learning_rate": 7.230759232490935e-05, + "loss": 1.8116, + "step": 12128 + }, + { + "epoch": 3.7228360957642725, + "grad_norm": 0.39056599140167236, + "learning_rate": 7.230314377596516e-05, + "loss": 1.7785, + "step": 12129 + }, + { + "epoch": 3.723143032535298, + "grad_norm": 0.3846863806247711, + "learning_rate": 7.229869500660825e-05, + "loss": 1.738, + "step": 12130 + }, + { + "epoch": 3.7234499693063228, + "grad_norm": 0.24412120878696442, + "learning_rate": 7.229424601688256e-05, + "loss": 1.7351, + "step": 12131 + }, + { + "epoch": 3.723756906077348, + "grad_norm": 0.2978009581565857, + "learning_rate": 7.228979680683206e-05, + "loss": 1.8037, + "step": 12132 + }, + { + "epoch": 3.7240638428483734, + "grad_norm": 0.33787262439727783, + "learning_rate": 7.228534737650074e-05, + "loss": 1.8421, + "step": 12133 + }, + { + "epoch": 3.7243707796193983, + "grad_norm": 0.2536921203136444, + "learning_rate": 7.228089772593254e-05, + "loss": 1.7472, + "step": 12134 + }, + { + "epoch": 3.7246777163904237, + "grad_norm": 0.24103601276874542, + "learning_rate": 7.227644785517144e-05, + "loss": 1.8011, + "step": 12135 + }, + { + "epoch": 3.7249846531614486, + "grad_norm": 0.3653033375740051, + "learning_rate": 7.227199776426146e-05, + "loss": 1.8018, + "step": 12136 + }, + { + "epoch": 3.725291589932474, + "grad_norm": 0.35728752613067627, + "learning_rate": 
7.226754745324652e-05, + "loss": 1.7684, + "step": 12137 + }, + { + "epoch": 3.7255985267034992, + "grad_norm": 0.262018620967865, + "learning_rate": 7.226309692217063e-05, + "loss": 1.8124, + "step": 12138 + }, + { + "epoch": 3.725905463474524, + "grad_norm": 0.3467118442058563, + "learning_rate": 7.225864617107776e-05, + "loss": 1.8761, + "step": 12139 + }, + { + "epoch": 3.7262124002455494, + "grad_norm": 0.4365626871585846, + "learning_rate": 7.22541952000119e-05, + "loss": 1.7159, + "step": 12140 + }, + { + "epoch": 3.7265193370165743, + "grad_norm": 0.2819811999797821, + "learning_rate": 7.224974400901705e-05, + "loss": 1.8051, + "step": 12141 + }, + { + "epoch": 3.7268262737875997, + "grad_norm": 0.39062437415122986, + "learning_rate": 7.224529259813719e-05, + "loss": 1.8517, + "step": 12142 + }, + { + "epoch": 3.727133210558625, + "grad_norm": 0.4383927285671234, + "learning_rate": 7.22408409674163e-05, + "loss": 1.8295, + "step": 12143 + }, + { + "epoch": 3.7274401473296503, + "grad_norm": 0.3043094575405121, + "learning_rate": 7.223638911689839e-05, + "loss": 1.7653, + "step": 12144 + }, + { + "epoch": 3.7277470841006752, + "grad_norm": 0.25198984146118164, + "learning_rate": 7.223193704662746e-05, + "loss": 1.7561, + "step": 12145 + }, + { + "epoch": 3.7280540208717006, + "grad_norm": 0.353565514087677, + "learning_rate": 7.222748475664749e-05, + "loss": 1.8077, + "step": 12146 + }, + { + "epoch": 3.7283609576427255, + "grad_norm": 0.39757224917411804, + "learning_rate": 7.222303224700248e-05, + "loss": 1.7622, + "step": 12147 + }, + { + "epoch": 3.728667894413751, + "grad_norm": 0.35595703125, + "learning_rate": 7.221857951773644e-05, + "loss": 1.8436, + "step": 12148 + }, + { + "epoch": 3.728974831184776, + "grad_norm": 0.2469715029001236, + "learning_rate": 7.221412656889338e-05, + "loss": 1.8531, + "step": 12149 + }, + { + "epoch": 3.729281767955801, + "grad_norm": 0.35324424505233765, + "learning_rate": 7.22096734005173e-05, + "loss": 1.7361, + 
"step": 12150 + }, + { + "epoch": 3.7295887047268264, + "grad_norm": 0.3783365488052368, + "learning_rate": 7.220522001265223e-05, + "loss": 1.7459, + "step": 12151 + }, + { + "epoch": 3.7298956414978512, + "grad_norm": 0.27526360750198364, + "learning_rate": 7.220076640534212e-05, + "loss": 1.8867, + "step": 12152 + }, + { + "epoch": 3.7302025782688766, + "grad_norm": 0.30863118171691895, + "learning_rate": 7.219631257863105e-05, + "loss": 1.7363, + "step": 12153 + }, + { + "epoch": 3.730509515039902, + "grad_norm": 0.38505107164382935, + "learning_rate": 7.219185853256301e-05, + "loss": 1.764, + "step": 12154 + }, + { + "epoch": 3.730816451810927, + "grad_norm": 0.2925978899002075, + "learning_rate": 7.218740426718202e-05, + "loss": 1.7693, + "step": 12155 + }, + { + "epoch": 3.731123388581952, + "grad_norm": 0.24510078132152557, + "learning_rate": 7.218294978253209e-05, + "loss": 1.8089, + "step": 12156 + }, + { + "epoch": 3.731430325352977, + "grad_norm": 0.33029109239578247, + "learning_rate": 7.217849507865724e-05, + "loss": 1.6885, + "step": 12157 + }, + { + "epoch": 3.7317372621240024, + "grad_norm": 0.333970308303833, + "learning_rate": 7.217404015560149e-05, + "loss": 1.8132, + "step": 12158 + }, + { + "epoch": 3.7320441988950277, + "grad_norm": 0.2467660754919052, + "learning_rate": 7.216958501340891e-05, + "loss": 1.8021, + "step": 12159 + }, + { + "epoch": 3.732351135666053, + "grad_norm": 0.2701449990272522, + "learning_rate": 7.216512965212348e-05, + "loss": 1.7006, + "step": 12160 + }, + { + "epoch": 3.732658072437078, + "grad_norm": 0.2784138023853302, + "learning_rate": 7.216067407178926e-05, + "loss": 1.7616, + "step": 12161 + }, + { + "epoch": 3.7329650092081033, + "grad_norm": 0.2082870900630951, + "learning_rate": 7.215621827245026e-05, + "loss": 1.7391, + "step": 12162 + }, + { + "epoch": 3.733271945979128, + "grad_norm": 0.2477869987487793, + "learning_rate": 7.215176225415053e-05, + "loss": 1.7761, + "step": 12163 + }, + { + "epoch": 
3.7335788827501535, + "grad_norm": 0.28395572304725647, + "learning_rate": 7.21473060169341e-05, + "loss": 1.8181, + "step": 12164 + }, + { + "epoch": 3.733885819521179, + "grad_norm": 0.20430058240890503, + "learning_rate": 7.2142849560845e-05, + "loss": 1.7035, + "step": 12165 + }, + { + "epoch": 3.7341927562922037, + "grad_norm": 0.30061420798301697, + "learning_rate": 7.21383928859273e-05, + "loss": 1.7703, + "step": 12166 + }, + { + "epoch": 3.734499693063229, + "grad_norm": 0.33865803480148315, + "learning_rate": 7.2133935992225e-05, + "loss": 1.8204, + "step": 12167 + }, + { + "epoch": 3.734806629834254, + "grad_norm": 0.29172980785369873, + "learning_rate": 7.212947887978221e-05, + "loss": 1.739, + "step": 12168 + }, + { + "epoch": 3.7351135666052793, + "grad_norm": 0.2799396812915802, + "learning_rate": 7.212502154864291e-05, + "loss": 1.8503, + "step": 12169 + }, + { + "epoch": 3.7354205033763046, + "grad_norm": 0.2945539355278015, + "learning_rate": 7.212056399885118e-05, + "loss": 1.7523, + "step": 12170 + }, + { + "epoch": 3.7357274401473295, + "grad_norm": 0.2395290732383728, + "learning_rate": 7.211610623045108e-05, + "loss": 1.7728, + "step": 12171 + }, + { + "epoch": 3.736034376918355, + "grad_norm": 0.24369286000728607, + "learning_rate": 7.211164824348667e-05, + "loss": 1.7725, + "step": 12172 + }, + { + "epoch": 3.7363413136893797, + "grad_norm": 0.3272435963153839, + "learning_rate": 7.210719003800197e-05, + "loss": 1.8531, + "step": 12173 + }, + { + "epoch": 3.736648250460405, + "grad_norm": 0.23954182863235474, + "learning_rate": 7.210273161404107e-05, + "loss": 1.7807, + "step": 12174 + }, + { + "epoch": 3.7369551872314304, + "grad_norm": 0.24547603726387024, + "learning_rate": 7.209827297164801e-05, + "loss": 1.8481, + "step": 12175 + }, + { + "epoch": 3.7372621240024557, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.209381411086687e-05, + "loss": 1.7496, + "step": 12176 + }, + { + "epoch": 3.7375690607734806, + "grad_norm": 
0.22948235273361206, + "learning_rate": 7.208935503174172e-05, + "loss": 1.7681, + "step": 12177 + }, + { + "epoch": 3.737875997544506, + "grad_norm": 0.2697654664516449, + "learning_rate": 7.20848957343166e-05, + "loss": 1.789, + "step": 12178 + }, + { + "epoch": 3.738182934315531, + "grad_norm": 0.235344797372818, + "learning_rate": 7.208043621863562e-05, + "loss": 1.8309, + "step": 12179 + }, + { + "epoch": 3.738489871086556, + "grad_norm": 0.2688879072666168, + "learning_rate": 7.20759764847428e-05, + "loss": 1.7898, + "step": 12180 + }, + { + "epoch": 3.7387968078575815, + "grad_norm": 0.26818978786468506, + "learning_rate": 7.207151653268226e-05, + "loss": 1.7882, + "step": 12181 + }, + { + "epoch": 3.7391037446286064, + "grad_norm": 0.2612875998020172, + "learning_rate": 7.206705636249804e-05, + "loss": 1.7352, + "step": 12182 + }, + { + "epoch": 3.7394106813996317, + "grad_norm": 0.22547565400600433, + "learning_rate": 7.206259597423425e-05, + "loss": 1.733, + "step": 12183 + }, + { + "epoch": 3.7397176181706566, + "grad_norm": 0.24645474553108215, + "learning_rate": 7.205813536793495e-05, + "loss": 1.8064, + "step": 12184 + }, + { + "epoch": 3.740024554941682, + "grad_norm": 0.25879329442977905, + "learning_rate": 7.205367454364424e-05, + "loss": 1.8134, + "step": 12185 + }, + { + "epoch": 3.7403314917127073, + "grad_norm": 0.22420097887516022, + "learning_rate": 7.204921350140617e-05, + "loss": 1.7819, + "step": 12186 + }, + { + "epoch": 3.7406384284837326, + "grad_norm": 0.2569858431816101, + "learning_rate": 7.204475224126487e-05, + "loss": 1.784, + "step": 12187 + }, + { + "epoch": 3.7409453652547575, + "grad_norm": 0.23769912123680115, + "learning_rate": 7.20402907632644e-05, + "loss": 1.7853, + "step": 12188 + }, + { + "epoch": 3.741252302025783, + "grad_norm": 0.26935988664627075, + "learning_rate": 7.203582906744885e-05, + "loss": 1.806, + "step": 12189 + }, + { + "epoch": 3.7415592387968077, + "grad_norm": 0.2544274628162384, + "learning_rate": 
7.203136715386233e-05, + "loss": 1.7988, + "step": 12190 + }, + { + "epoch": 3.741866175567833, + "grad_norm": 0.22665882110595703, + "learning_rate": 7.202690502254892e-05, + "loss": 1.7798, + "step": 12191 + }, + { + "epoch": 3.7421731123388584, + "grad_norm": 0.24512888491153717, + "learning_rate": 7.202244267355273e-05, + "loss": 1.816, + "step": 12192 + }, + { + "epoch": 3.7424800491098833, + "grad_norm": 0.2408553808927536, + "learning_rate": 7.201798010691785e-05, + "loss": 1.7417, + "step": 12193 + }, + { + "epoch": 3.7427869858809086, + "grad_norm": 0.23142600059509277, + "learning_rate": 7.201351732268838e-05, + "loss": 1.7771, + "step": 12194 + }, + { + "epoch": 3.7430939226519335, + "grad_norm": 0.245071142911911, + "learning_rate": 7.200905432090844e-05, + "loss": 1.7556, + "step": 12195 + }, + { + "epoch": 3.743400859422959, + "grad_norm": 0.2623934745788574, + "learning_rate": 7.200459110162211e-05, + "loss": 1.8042, + "step": 12196 + }, + { + "epoch": 3.743707796193984, + "grad_norm": 0.2531217038631439, + "learning_rate": 7.200012766487353e-05, + "loss": 1.7709, + "step": 12197 + }, + { + "epoch": 3.744014732965009, + "grad_norm": 0.23839864134788513, + "learning_rate": 7.19956640107068e-05, + "loss": 1.8202, + "step": 12198 + }, + { + "epoch": 3.7443216697360344, + "grad_norm": 0.2342260777950287, + "learning_rate": 7.1991200139166e-05, + "loss": 1.827, + "step": 12199 + }, + { + "epoch": 3.7446286065070593, + "grad_norm": 0.25511276721954346, + "learning_rate": 7.198673605029528e-05, + "loss": 1.7766, + "step": 12200 + }, + { + "epoch": 3.7449355432780846, + "grad_norm": 0.27601274847984314, + "learning_rate": 7.198227174413876e-05, + "loss": 1.7716, + "step": 12201 + }, + { + "epoch": 3.74524248004911, + "grad_norm": 0.3027385175228119, + "learning_rate": 7.197780722074056e-05, + "loss": 1.8007, + "step": 12202 + }, + { + "epoch": 3.7455494168201353, + "grad_norm": 0.31242382526397705, + "learning_rate": 7.197334248014477e-05, + "loss": 1.8089, 
+ "step": 12203 + }, + { + "epoch": 3.74585635359116, + "grad_norm": 0.3673859238624573, + "learning_rate": 7.196887752239551e-05, + "loss": 1.8017, + "step": 12204 + }, + { + "epoch": 3.7461632903621855, + "grad_norm": 0.3152726888656616, + "learning_rate": 7.196441234753695e-05, + "loss": 1.7108, + "step": 12205 + }, + { + "epoch": 3.7464702271332104, + "grad_norm": 0.2606927156448364, + "learning_rate": 7.195994695561319e-05, + "loss": 1.8066, + "step": 12206 + }, + { + "epoch": 3.7467771639042358, + "grad_norm": 0.37624871730804443, + "learning_rate": 7.195548134666836e-05, + "loss": 1.725, + "step": 12207 + }, + { + "epoch": 3.747084100675261, + "grad_norm": 0.4138187766075134, + "learning_rate": 7.195101552074658e-05, + "loss": 1.7838, + "step": 12208 + }, + { + "epoch": 3.747391037446286, + "grad_norm": 0.3668459951877594, + "learning_rate": 7.194654947789204e-05, + "loss": 1.7575, + "step": 12209 + }, + { + "epoch": 3.7476979742173113, + "grad_norm": 0.27947792410850525, + "learning_rate": 7.19420832181488e-05, + "loss": 1.792, + "step": 12210 + }, + { + "epoch": 3.748004910988336, + "grad_norm": 0.2507692873477936, + "learning_rate": 7.193761674156103e-05, + "loss": 1.7752, + "step": 12211 + }, + { + "epoch": 3.7483118477593615, + "grad_norm": 0.3209949731826782, + "learning_rate": 7.193315004817289e-05, + "loss": 1.8491, + "step": 12212 + }, + { + "epoch": 3.748618784530387, + "grad_norm": 0.32883042097091675, + "learning_rate": 7.192868313802849e-05, + "loss": 1.8135, + "step": 12213 + }, + { + "epoch": 3.7489257213014118, + "grad_norm": 0.2450616955757141, + "learning_rate": 7.192421601117201e-05, + "loss": 1.7722, + "step": 12214 + }, + { + "epoch": 3.749232658072437, + "grad_norm": 0.2545110285282135, + "learning_rate": 7.191974866764757e-05, + "loss": 1.7866, + "step": 12215 + }, + { + "epoch": 3.749539594843462, + "grad_norm": 0.264017790555954, + "learning_rate": 7.191528110749932e-05, + "loss": 1.778, + "step": 12216 + }, + { + "epoch": 
3.7498465316144873, + "grad_norm": 0.3156309425830841, + "learning_rate": 7.191081333077142e-05, + "loss": 1.7917, + "step": 12217 + }, + { + "epoch": 3.7501534683855127, + "grad_norm": 0.3578774631023407, + "learning_rate": 7.190634533750802e-05, + "loss": 1.8468, + "step": 12218 + }, + { + "epoch": 3.750460405156538, + "grad_norm": 0.30735981464385986, + "learning_rate": 7.19018771277533e-05, + "loss": 1.7502, + "step": 12219 + }, + { + "epoch": 3.750767341927563, + "grad_norm": 0.22870220243930817, + "learning_rate": 7.189740870155135e-05, + "loss": 1.7686, + "step": 12220 + }, + { + "epoch": 3.7510742786985882, + "grad_norm": 0.30297720432281494, + "learning_rate": 7.18929400589464e-05, + "loss": 1.826, + "step": 12221 + }, + { + "epoch": 3.751381215469613, + "grad_norm": 0.2735389173030853, + "learning_rate": 7.188847119998257e-05, + "loss": 1.8142, + "step": 12222 + }, + { + "epoch": 3.7516881522406385, + "grad_norm": 0.2823885679244995, + "learning_rate": 7.188400212470405e-05, + "loss": 1.8028, + "step": 12223 + }, + { + "epoch": 3.751995089011664, + "grad_norm": 0.4184139370918274, + "learning_rate": 7.187953283315499e-05, + "loss": 1.8467, + "step": 12224 + }, + { + "epoch": 3.7523020257826887, + "grad_norm": 0.3559226095676422, + "learning_rate": 7.187506332537957e-05, + "loss": 1.7416, + "step": 12225 + }, + { + "epoch": 3.752608962553714, + "grad_norm": 0.26055800914764404, + "learning_rate": 7.187059360142194e-05, + "loss": 1.8309, + "step": 12226 + }, + { + "epoch": 3.752915899324739, + "grad_norm": 0.28032660484313965, + "learning_rate": 7.186612366132629e-05, + "loss": 1.7926, + "step": 12227 + }, + { + "epoch": 3.7532228360957642, + "grad_norm": 0.26229965686798096, + "learning_rate": 7.18616535051368e-05, + "loss": 1.7368, + "step": 12228 + }, + { + "epoch": 3.7535297728667896, + "grad_norm": 0.2779417634010315, + "learning_rate": 7.185718313289763e-05, + "loss": 1.8418, + "step": 12229 + }, + { + "epoch": 3.7538367096378145, + "grad_norm": 
0.26164770126342773, + "learning_rate": 7.185271254465295e-05, + "loss": 1.7511, + "step": 12230 + }, + { + "epoch": 3.75414364640884, + "grad_norm": 0.30725157260894775, + "learning_rate": 7.184824174044698e-05, + "loss": 1.7661, + "step": 12231 + }, + { + "epoch": 3.7544505831798647, + "grad_norm": 0.33111417293548584, + "learning_rate": 7.184377072032386e-05, + "loss": 1.7341, + "step": 12232 + }, + { + "epoch": 3.75475751995089, + "grad_norm": 0.23978343605995178, + "learning_rate": 7.183929948432779e-05, + "loss": 1.7151, + "step": 12233 + }, + { + "epoch": 3.7550644567219154, + "grad_norm": 0.3057664632797241, + "learning_rate": 7.183482803250299e-05, + "loss": 1.8446, + "step": 12234 + }, + { + "epoch": 3.7553713934929407, + "grad_norm": 0.2629055678844452, + "learning_rate": 7.18303563648936e-05, + "loss": 1.7415, + "step": 12235 + }, + { + "epoch": 3.7556783302639656, + "grad_norm": 0.22703498601913452, + "learning_rate": 7.182588448154386e-05, + "loss": 1.8188, + "step": 12236 + }, + { + "epoch": 3.755985267034991, + "grad_norm": 0.3014034032821655, + "learning_rate": 7.182141238249792e-05, + "loss": 1.8634, + "step": 12237 + }, + { + "epoch": 3.756292203806016, + "grad_norm": 0.28859084844589233, + "learning_rate": 7.181694006779998e-05, + "loss": 1.7509, + "step": 12238 + }, + { + "epoch": 3.756599140577041, + "grad_norm": 0.293720543384552, + "learning_rate": 7.181246753749426e-05, + "loss": 1.777, + "step": 12239 + }, + { + "epoch": 3.7569060773480665, + "grad_norm": 0.2374580055475235, + "learning_rate": 7.180799479162496e-05, + "loss": 1.7492, + "step": 12240 + }, + { + "epoch": 3.7572130141190914, + "grad_norm": 0.30106452107429504, + "learning_rate": 7.180352183023627e-05, + "loss": 1.7538, + "step": 12241 + }, + { + "epoch": 3.7575199508901167, + "grad_norm": 0.3504682183265686, + "learning_rate": 7.179904865337238e-05, + "loss": 1.7477, + "step": 12242 + }, + { + "epoch": 3.7578268876611416, + "grad_norm": 0.2901679575443268, + "learning_rate": 
7.179457526107754e-05, + "loss": 1.9412, + "step": 12243 + }, + { + "epoch": 3.758133824432167, + "grad_norm": 0.37690606713294983, + "learning_rate": 7.179010165339591e-05, + "loss": 1.8222, + "step": 12244 + }, + { + "epoch": 3.7584407612031923, + "grad_norm": 0.45126965641975403, + "learning_rate": 7.178562783037172e-05, + "loss": 1.8563, + "step": 12245 + }, + { + "epoch": 3.758747697974217, + "grad_norm": 0.2747548818588257, + "learning_rate": 7.178115379204921e-05, + "loss": 1.7179, + "step": 12246 + }, + { + "epoch": 3.7590546347452425, + "grad_norm": 0.43243977427482605, + "learning_rate": 7.177667953847257e-05, + "loss": 1.8157, + "step": 12247 + }, + { + "epoch": 3.7593615715162674, + "grad_norm": 0.529448390007019, + "learning_rate": 7.177220506968602e-05, + "loss": 1.8113, + "step": 12248 + }, + { + "epoch": 3.7596685082872927, + "grad_norm": 0.3099314868450165, + "learning_rate": 7.176773038573377e-05, + "loss": 1.7833, + "step": 12249 + }, + { + "epoch": 3.759975445058318, + "grad_norm": 0.3111872375011444, + "learning_rate": 7.176325548666004e-05, + "loss": 1.7965, + "step": 12250 + }, + { + "epoch": 3.7602823818293434, + "grad_norm": 0.38437551259994507, + "learning_rate": 7.175878037250907e-05, + "loss": 1.7822, + "step": 12251 + }, + { + "epoch": 3.7605893186003683, + "grad_norm": 0.33643704652786255, + "learning_rate": 7.175430504332509e-05, + "loss": 1.7839, + "step": 12252 + }, + { + "epoch": 3.7608962553713936, + "grad_norm": 0.24705304205417633, + "learning_rate": 7.174982949915232e-05, + "loss": 1.8302, + "step": 12253 + }, + { + "epoch": 3.7612031921424185, + "grad_norm": 0.3615458309650421, + "learning_rate": 7.174535374003497e-05, + "loss": 1.7963, + "step": 12254 + }, + { + "epoch": 3.761510128913444, + "grad_norm": 0.36486589908599854, + "learning_rate": 7.17408777660173e-05, + "loss": 1.7933, + "step": 12255 + }, + { + "epoch": 3.761817065684469, + "grad_norm": 0.2566867172718048, + "learning_rate": 7.173640157714352e-05, + "loss": 
1.7254, + "step": 12256 + }, + { + "epoch": 3.762124002455494, + "grad_norm": 0.2602523863315582, + "learning_rate": 7.17319251734579e-05, + "loss": 1.7357, + "step": 12257 + }, + { + "epoch": 3.7624309392265194, + "grad_norm": 0.3626105785369873, + "learning_rate": 7.172744855500464e-05, + "loss": 1.7971, + "step": 12258 + }, + { + "epoch": 3.7627378759975443, + "grad_norm": 0.36327603459358215, + "learning_rate": 7.172297172182802e-05, + "loss": 1.7819, + "step": 12259 + }, + { + "epoch": 3.7630448127685696, + "grad_norm": 0.25935736298561096, + "learning_rate": 7.171849467397224e-05, + "loss": 1.8112, + "step": 12260 + }, + { + "epoch": 3.763351749539595, + "grad_norm": 0.2779700756072998, + "learning_rate": 7.171401741148156e-05, + "loss": 1.786, + "step": 12261 + }, + { + "epoch": 3.7636586863106203, + "grad_norm": 0.3089013695716858, + "learning_rate": 7.170953993440025e-05, + "loss": 1.7808, + "step": 12262 + }, + { + "epoch": 3.763965623081645, + "grad_norm": 0.2562308609485626, + "learning_rate": 7.170506224277253e-05, + "loss": 1.8207, + "step": 12263 + }, + { + "epoch": 3.7642725598526705, + "grad_norm": 0.2907634973526001, + "learning_rate": 7.170058433664268e-05, + "loss": 1.7638, + "step": 12264 + }, + { + "epoch": 3.7645794966236954, + "grad_norm": 0.30341312289237976, + "learning_rate": 7.169610621605493e-05, + "loss": 1.7827, + "step": 12265 + }, + { + "epoch": 3.7648864333947207, + "grad_norm": 0.27091866731643677, + "learning_rate": 7.169162788105353e-05, + "loss": 1.786, + "step": 12266 + }, + { + "epoch": 3.765193370165746, + "grad_norm": 0.234042689204216, + "learning_rate": 7.168714933168277e-05, + "loss": 1.7638, + "step": 12267 + }, + { + "epoch": 3.765500306936771, + "grad_norm": 0.2477465271949768, + "learning_rate": 7.168267056798686e-05, + "loss": 1.7275, + "step": 12268 + }, + { + "epoch": 3.7658072437077963, + "grad_norm": 0.25578543543815613, + "learning_rate": 7.167819159001012e-05, + "loss": 1.7831, + "step": 12269 + }, + { + 
"epoch": 3.766114180478821, + "grad_norm": 0.26629674434661865, + "learning_rate": 7.167371239779678e-05, + "loss": 1.7866, + "step": 12270 + }, + { + "epoch": 3.7664211172498465, + "grad_norm": 0.31350967288017273, + "learning_rate": 7.16692329913911e-05, + "loss": 1.7755, + "step": 12271 + }, + { + "epoch": 3.766728054020872, + "grad_norm": 0.2670116126537323, + "learning_rate": 7.166475337083735e-05, + "loss": 1.7524, + "step": 12272 + }, + { + "epoch": 3.7670349907918967, + "grad_norm": 0.26503682136535645, + "learning_rate": 7.166027353617983e-05, + "loss": 1.7867, + "step": 12273 + }, + { + "epoch": 3.767341927562922, + "grad_norm": 0.3674192428588867, + "learning_rate": 7.165579348746278e-05, + "loss": 1.7604, + "step": 12274 + }, + { + "epoch": 3.767648864333947, + "grad_norm": 0.4120824337005615, + "learning_rate": 7.16513132247305e-05, + "loss": 1.7905, + "step": 12275 + }, + { + "epoch": 3.7679558011049723, + "grad_norm": 0.29074826836586, + "learning_rate": 7.164683274802723e-05, + "loss": 1.7539, + "step": 12276 + }, + { + "epoch": 3.7682627378759976, + "grad_norm": 0.22223204374313354, + "learning_rate": 7.164235205739729e-05, + "loss": 1.755, + "step": 12277 + }, + { + "epoch": 3.768569674647023, + "grad_norm": 0.23997461795806885, + "learning_rate": 7.163787115288494e-05, + "loss": 1.8024, + "step": 12278 + }, + { + "epoch": 3.768876611418048, + "grad_norm": 0.2556418776512146, + "learning_rate": 7.163339003453445e-05, + "loss": 1.7717, + "step": 12279 + }, + { + "epoch": 3.769183548189073, + "grad_norm": 0.3107141852378845, + "learning_rate": 7.162890870239013e-05, + "loss": 1.8257, + "step": 12280 + }, + { + "epoch": 3.769490484960098, + "grad_norm": 0.35293644666671753, + "learning_rate": 7.162442715649627e-05, + "loss": 1.7855, + "step": 12281 + }, + { + "epoch": 3.7697974217311234, + "grad_norm": 0.25989311933517456, + "learning_rate": 7.161994539689713e-05, + "loss": 1.7816, + "step": 12282 + }, + { + "epoch": 3.7701043585021488, + 
"grad_norm": 0.25615137815475464, + "learning_rate": 7.161546342363701e-05, + "loss": 1.7738, + "step": 12283 + }, + { + "epoch": 3.7704112952731736, + "grad_norm": 0.29345229268074036, + "learning_rate": 7.161098123676023e-05, + "loss": 1.8496, + "step": 12284 + }, + { + "epoch": 3.770718232044199, + "grad_norm": 0.2975969612598419, + "learning_rate": 7.160649883631105e-05, + "loss": 1.7342, + "step": 12285 + }, + { + "epoch": 3.771025168815224, + "grad_norm": 0.28458064794540405, + "learning_rate": 7.16020162223338e-05, + "loss": 1.8253, + "step": 12286 + }, + { + "epoch": 3.771332105586249, + "grad_norm": 0.2798703908920288, + "learning_rate": 7.159753339487276e-05, + "loss": 1.746, + "step": 12287 + }, + { + "epoch": 3.7716390423572745, + "grad_norm": 0.380044549703598, + "learning_rate": 7.159305035397223e-05, + "loss": 1.769, + "step": 12288 + }, + { + "epoch": 3.7719459791282994, + "grad_norm": 0.28760263323783875, + "learning_rate": 7.158856709967654e-05, + "loss": 1.7466, + "step": 12289 + }, + { + "epoch": 3.7722529158993248, + "grad_norm": 0.23314130306243896, + "learning_rate": 7.158408363202996e-05, + "loss": 1.7545, + "step": 12290 + }, + { + "epoch": 3.7725598526703497, + "grad_norm": 0.2864209711551666, + "learning_rate": 7.15795999510768e-05, + "loss": 1.7549, + "step": 12291 + }, + { + "epoch": 3.772866789441375, + "grad_norm": 0.2605510354042053, + "learning_rate": 7.15751160568614e-05, + "loss": 1.7684, + "step": 12292 + }, + { + "epoch": 3.7731737262124003, + "grad_norm": 0.2475409358739853, + "learning_rate": 7.157063194942806e-05, + "loss": 1.7841, + "step": 12293 + }, + { + "epoch": 3.7734806629834257, + "grad_norm": 0.22479289770126343, + "learning_rate": 7.15661476288211e-05, + "loss": 1.7592, + "step": 12294 + }, + { + "epoch": 3.7737875997544506, + "grad_norm": 0.22076937556266785, + "learning_rate": 7.156166309508482e-05, + "loss": 1.7853, + "step": 12295 + }, + { + "epoch": 3.774094536525476, + "grad_norm": 0.26082465052604675, + 
"learning_rate": 7.155717834826353e-05, + "loss": 1.7828, + "step": 12296 + }, + { + "epoch": 3.7744014732965008, + "grad_norm": 0.24771755933761597, + "learning_rate": 7.15526933884016e-05, + "loss": 1.758, + "step": 12297 + }, + { + "epoch": 3.774708410067526, + "grad_norm": 0.23806311190128326, + "learning_rate": 7.15482082155433e-05, + "loss": 1.7237, + "step": 12298 + }, + { + "epoch": 3.7750153468385514, + "grad_norm": 0.24822844564914703, + "learning_rate": 7.154372282973299e-05, + "loss": 1.7828, + "step": 12299 + }, + { + "epoch": 3.7753222836095763, + "grad_norm": 0.24423740804195404, + "learning_rate": 7.153923723101496e-05, + "loss": 1.8014, + "step": 12300 + }, + { + "epoch": 3.7756292203806017, + "grad_norm": 0.24966634809970856, + "learning_rate": 7.15347514194336e-05, + "loss": 1.8005, + "step": 12301 + }, + { + "epoch": 3.7759361571516266, + "grad_norm": 0.2549348473548889, + "learning_rate": 7.153026539503317e-05, + "loss": 1.8473, + "step": 12302 + }, + { + "epoch": 3.776243093922652, + "grad_norm": 0.23709465563297272, + "learning_rate": 7.152577915785807e-05, + "loss": 1.8031, + "step": 12303 + }, + { + "epoch": 3.7765500306936772, + "grad_norm": 0.28554168343544006, + "learning_rate": 7.152129270795258e-05, + "loss": 1.7836, + "step": 12304 + }, + { + "epoch": 3.776856967464702, + "grad_norm": 0.2568756639957428, + "learning_rate": 7.151680604536107e-05, + "loss": 1.7345, + "step": 12305 + }, + { + "epoch": 3.7771639042357275, + "grad_norm": 0.23883797228336334, + "learning_rate": 7.151231917012787e-05, + "loss": 1.7342, + "step": 12306 + }, + { + "epoch": 3.7774708410067523, + "grad_norm": 0.24026677012443542, + "learning_rate": 7.150783208229732e-05, + "loss": 1.8156, + "step": 12307 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.25756222009658813, + "learning_rate": 7.150334478191376e-05, + "loss": 1.8204, + "step": 12308 + }, + { + "epoch": 3.778084714548803, + "grad_norm": 0.24917428195476532, + "learning_rate": 
7.149885726902156e-05, + "loss": 1.7867, + "step": 12309 + }, + { + "epoch": 3.7783916513198283, + "grad_norm": 0.26269277930259705, + "learning_rate": 7.149436954366504e-05, + "loss": 1.8233, + "step": 12310 + }, + { + "epoch": 3.7786985880908532, + "grad_norm": 0.2502293586730957, + "learning_rate": 7.148988160588857e-05, + "loss": 1.8329, + "step": 12311 + }, + { + "epoch": 3.7790055248618786, + "grad_norm": 0.24845796823501587, + "learning_rate": 7.14853934557365e-05, + "loss": 1.7936, + "step": 12312 + }, + { + "epoch": 3.7793124616329035, + "grad_norm": 0.2453537881374359, + "learning_rate": 7.148090509325315e-05, + "loss": 1.8149, + "step": 12313 + }, + { + "epoch": 3.779619398403929, + "grad_norm": 0.2336922138929367, + "learning_rate": 7.147641651848293e-05, + "loss": 1.7826, + "step": 12314 + }, + { + "epoch": 3.779926335174954, + "grad_norm": 0.25542667508125305, + "learning_rate": 7.147192773147017e-05, + "loss": 1.801, + "step": 12315 + }, + { + "epoch": 3.780233271945979, + "grad_norm": 0.2301866114139557, + "learning_rate": 7.146743873225923e-05, + "loss": 1.7302, + "step": 12316 + }, + { + "epoch": 3.7805402087170044, + "grad_norm": 0.25821468234062195, + "learning_rate": 7.14629495208945e-05, + "loss": 1.7704, + "step": 12317 + }, + { + "epoch": 3.7808471454880292, + "grad_norm": 0.22537970542907715, + "learning_rate": 7.145846009742029e-05, + "loss": 1.7281, + "step": 12318 + }, + { + "epoch": 3.7811540822590546, + "grad_norm": 0.2565869688987732, + "learning_rate": 7.145397046188102e-05, + "loss": 1.8077, + "step": 12319 + }, + { + "epoch": 3.78146101903008, + "grad_norm": 0.2588396966457367, + "learning_rate": 7.144948061432105e-05, + "loss": 1.7438, + "step": 12320 + }, + { + "epoch": 3.781767955801105, + "grad_norm": 0.2538135349750519, + "learning_rate": 7.144499055478472e-05, + "loss": 1.8253, + "step": 12321 + }, + { + "epoch": 3.78207489257213, + "grad_norm": 0.2272680401802063, + "learning_rate": 7.144050028331644e-05, + "loss": 1.7408, + 
"step": 12322 + }, + { + "epoch": 3.782381829343155, + "grad_norm": 0.25010406970977783, + "learning_rate": 7.143600979996055e-05, + "loss": 1.8219, + "step": 12323 + }, + { + "epoch": 3.7826887661141804, + "grad_norm": 0.2560291290283203, + "learning_rate": 7.143151910476144e-05, + "loss": 1.7734, + "step": 12324 + }, + { + "epoch": 3.7829957028852057, + "grad_norm": 0.24927431344985962, + "learning_rate": 7.142702819776352e-05, + "loss": 1.7682, + "step": 12325 + }, + { + "epoch": 3.783302639656231, + "grad_norm": 0.2501368224620819, + "learning_rate": 7.142253707901114e-05, + "loss": 1.818, + "step": 12326 + }, + { + "epoch": 3.783609576427256, + "grad_norm": 0.3132917284965515, + "learning_rate": 7.141804574854871e-05, + "loss": 1.7793, + "step": 12327 + }, + { + "epoch": 3.7839165131982813, + "grad_norm": 0.24229925870895386, + "learning_rate": 7.141355420642057e-05, + "loss": 1.7585, + "step": 12328 + }, + { + "epoch": 3.784223449969306, + "grad_norm": 0.22612906992435455, + "learning_rate": 7.140906245267116e-05, + "loss": 1.7374, + "step": 12329 + }, + { + "epoch": 3.7845303867403315, + "grad_norm": 0.26354333758354187, + "learning_rate": 7.140457048734482e-05, + "loss": 1.7751, + "step": 12330 + }, + { + "epoch": 3.784837323511357, + "grad_norm": 0.21500451862812042, + "learning_rate": 7.140007831048599e-05, + "loss": 1.7827, + "step": 12331 + }, + { + "epoch": 3.7851442602823817, + "grad_norm": 0.2826332151889801, + "learning_rate": 7.139558592213904e-05, + "loss": 1.7522, + "step": 12332 + }, + { + "epoch": 3.785451197053407, + "grad_norm": 0.3217725455760956, + "learning_rate": 7.139109332234837e-05, + "loss": 1.8758, + "step": 12333 + }, + { + "epoch": 3.785758133824432, + "grad_norm": 0.26934614777565, + "learning_rate": 7.138660051115837e-05, + "loss": 1.8322, + "step": 12334 + }, + { + "epoch": 3.7860650705954573, + "grad_norm": 0.2653827667236328, + "learning_rate": 7.138210748861346e-05, + "loss": 1.7651, + "step": 12335 + }, + { + "epoch": 
3.7863720073664826, + "grad_norm": 0.30470311641693115, + "learning_rate": 7.137761425475802e-05, + "loss": 1.855, + "step": 12336 + }, + { + "epoch": 3.786678944137508, + "grad_norm": 0.2558726370334625, + "learning_rate": 7.137312080963647e-05, + "loss": 1.7174, + "step": 12337 + }, + { + "epoch": 3.786985880908533, + "grad_norm": 0.24025602638721466, + "learning_rate": 7.136862715329322e-05, + "loss": 1.7565, + "step": 12338 + }, + { + "epoch": 3.787292817679558, + "grad_norm": 0.34205392003059387, + "learning_rate": 7.136413328577267e-05, + "loss": 1.8116, + "step": 12339 + }, + { + "epoch": 3.787599754450583, + "grad_norm": 0.4069152772426605, + "learning_rate": 7.135963920711923e-05, + "loss": 1.7662, + "step": 12340 + }, + { + "epoch": 3.7879066912216084, + "grad_norm": 0.3915627598762512, + "learning_rate": 7.13551449173773e-05, + "loss": 1.81, + "step": 12341 + }, + { + "epoch": 3.7882136279926337, + "grad_norm": 0.27136507630348206, + "learning_rate": 7.135065041659134e-05, + "loss": 1.7845, + "step": 12342 + }, + { + "epoch": 3.7885205647636586, + "grad_norm": 0.2924078106880188, + "learning_rate": 7.134615570480572e-05, + "loss": 1.8606, + "step": 12343 + }, + { + "epoch": 3.788827501534684, + "grad_norm": 0.35581526160240173, + "learning_rate": 7.134166078206488e-05, + "loss": 1.7785, + "step": 12344 + }, + { + "epoch": 3.789134438305709, + "grad_norm": 0.3003756105899811, + "learning_rate": 7.133716564841324e-05, + "loss": 1.7321, + "step": 12345 + }, + { + "epoch": 3.789441375076734, + "grad_norm": 0.2586000859737396, + "learning_rate": 7.133267030389524e-05, + "loss": 1.7889, + "step": 12346 + }, + { + "epoch": 3.7897483118477595, + "grad_norm": 0.28053075075149536, + "learning_rate": 7.132817474855527e-05, + "loss": 1.8216, + "step": 12347 + }, + { + "epoch": 3.7900552486187844, + "grad_norm": 0.3064870834350586, + "learning_rate": 7.132367898243777e-05, + "loss": 1.7528, + "step": 12348 + }, + { + "epoch": 3.7903621853898097, + "grad_norm": 
0.3045158386230469, + "learning_rate": 7.131918300558719e-05, + "loss": 1.8251, + "step": 12349 + }, + { + "epoch": 3.7906691221608346, + "grad_norm": 0.2438485324382782, + "learning_rate": 7.131468681804794e-05, + "loss": 1.7505, + "step": 12350 + }, + { + "epoch": 3.79097605893186, + "grad_norm": 0.24239958822727203, + "learning_rate": 7.131019041986447e-05, + "loss": 1.7544, + "step": 12351 + }, + { + "epoch": 3.7912829957028853, + "grad_norm": 0.24632441997528076, + "learning_rate": 7.130569381108121e-05, + "loss": 1.7485, + "step": 12352 + }, + { + "epoch": 3.7915899324739106, + "grad_norm": 0.22553624212741852, + "learning_rate": 7.13011969917426e-05, + "loss": 1.803, + "step": 12353 + }, + { + "epoch": 3.7918968692449355, + "grad_norm": 0.2164420485496521, + "learning_rate": 7.129669996189306e-05, + "loss": 1.7307, + "step": 12354 + }, + { + "epoch": 3.792203806015961, + "grad_norm": 0.25104281306266785, + "learning_rate": 7.129220272157705e-05, + "loss": 1.8154, + "step": 12355 + }, + { + "epoch": 3.7925107427869857, + "grad_norm": 0.25533202290534973, + "learning_rate": 7.128770527083903e-05, + "loss": 1.8046, + "step": 12356 + }, + { + "epoch": 3.792817679558011, + "grad_norm": 0.24428130686283112, + "learning_rate": 7.128320760972341e-05, + "loss": 1.7984, + "step": 12357 + }, + { + "epoch": 3.7931246163290364, + "grad_norm": 0.2366408109664917, + "learning_rate": 7.127870973827467e-05, + "loss": 1.7781, + "step": 12358 + }, + { + "epoch": 3.7934315531000613, + "grad_norm": 0.2558888792991638, + "learning_rate": 7.127421165653722e-05, + "loss": 1.7858, + "step": 12359 + }, + { + "epoch": 3.7937384898710866, + "grad_norm": 0.25825443863868713, + "learning_rate": 7.126971336455558e-05, + "loss": 1.8292, + "step": 12360 + }, + { + "epoch": 3.7940454266421115, + "grad_norm": 0.2554624080657959, + "learning_rate": 7.126521486237415e-05, + "loss": 1.822, + "step": 12361 + }, + { + "epoch": 3.794352363413137, + "grad_norm": 0.3030763268470764, + 
"learning_rate": 7.126071615003742e-05, + "loss": 1.8261, + "step": 12362 + }, + { + "epoch": 3.794659300184162, + "grad_norm": 0.3047907054424286, + "learning_rate": 7.125621722758981e-05, + "loss": 1.8419, + "step": 12363 + }, + { + "epoch": 3.794966236955187, + "grad_norm": 0.27782654762268066, + "learning_rate": 7.12517180950758e-05, + "loss": 1.7959, + "step": 12364 + }, + { + "epoch": 3.7952731737262124, + "grad_norm": 0.24526572227478027, + "learning_rate": 7.124721875253986e-05, + "loss": 1.7313, + "step": 12365 + }, + { + "epoch": 3.7955801104972373, + "grad_norm": 0.23718179762363434, + "learning_rate": 7.124271920002646e-05, + "loss": 1.7479, + "step": 12366 + }, + { + "epoch": 3.7958870472682626, + "grad_norm": 0.2880019247531891, + "learning_rate": 7.123821943758004e-05, + "loss": 1.7792, + "step": 12367 + }, + { + "epoch": 3.796193984039288, + "grad_norm": 0.28923723101615906, + "learning_rate": 7.123371946524511e-05, + "loss": 1.7474, + "step": 12368 + }, + { + "epoch": 3.7965009208103133, + "grad_norm": 0.2281525880098343, + "learning_rate": 7.122921928306612e-05, + "loss": 1.8106, + "step": 12369 + }, + { + "epoch": 3.796807857581338, + "grad_norm": 0.34825438261032104, + "learning_rate": 7.122471889108752e-05, + "loss": 1.8076, + "step": 12370 + }, + { + "epoch": 3.7971147943523635, + "grad_norm": 0.41145995259284973, + "learning_rate": 7.122021828935382e-05, + "loss": 1.7692, + "step": 12371 + }, + { + "epoch": 3.7974217311233884, + "grad_norm": 0.31711262464523315, + "learning_rate": 7.12157174779095e-05, + "loss": 1.8101, + "step": 12372 + }, + { + "epoch": 3.7977286678944138, + "grad_norm": 0.3044308125972748, + "learning_rate": 7.1211216456799e-05, + "loss": 1.8238, + "step": 12373 + }, + { + "epoch": 3.798035604665439, + "grad_norm": 0.3750055134296417, + "learning_rate": 7.120671522606683e-05, + "loss": 1.7323, + "step": 12374 + }, + { + "epoch": 3.798342541436464, + "grad_norm": 0.38852599263191223, + "learning_rate": 
7.120221378575749e-05, + "loss": 1.8402, + "step": 12375 + }, + { + "epoch": 3.7986494782074893, + "grad_norm": 0.3430371582508087, + "learning_rate": 7.119771213591541e-05, + "loss": 1.8369, + "step": 12376 + }, + { + "epoch": 3.798956414978514, + "grad_norm": 0.4787428677082062, + "learning_rate": 7.119321027658515e-05, + "loss": 1.7977, + "step": 12377 + }, + { + "epoch": 3.7992633517495396, + "grad_norm": 0.4263977110385895, + "learning_rate": 7.118870820781114e-05, + "loss": 1.8208, + "step": 12378 + }, + { + "epoch": 3.799570288520565, + "grad_norm": 0.28649669885635376, + "learning_rate": 7.118420592963793e-05, + "loss": 1.773, + "step": 12379 + }, + { + "epoch": 3.7998772252915898, + "grad_norm": 0.26070261001586914, + "learning_rate": 7.117970344210996e-05, + "loss": 1.6866, + "step": 12380 + }, + { + "epoch": 3.800184162062615, + "grad_norm": 0.30127593874931335, + "learning_rate": 7.117520074527173e-05, + "loss": 1.7208, + "step": 12381 + }, + { + "epoch": 3.80049109883364, + "grad_norm": 0.23639258742332458, + "learning_rate": 7.117069783916777e-05, + "loss": 1.7504, + "step": 12382 + }, + { + "epoch": 3.8007980356046653, + "grad_norm": 0.2852858901023865, + "learning_rate": 7.116619472384256e-05, + "loss": 1.7954, + "step": 12383 + }, + { + "epoch": 3.8011049723756907, + "grad_norm": 0.2673225998878479, + "learning_rate": 7.116169139934063e-05, + "loss": 1.7562, + "step": 12384 + }, + { + "epoch": 3.801411909146716, + "grad_norm": 0.21615394949913025, + "learning_rate": 7.115718786570644e-05, + "loss": 1.7126, + "step": 12385 + }, + { + "epoch": 3.801718845917741, + "grad_norm": 0.2165435254573822, + "learning_rate": 7.115268412298453e-05, + "loss": 1.7171, + "step": 12386 + }, + { + "epoch": 3.8020257826887662, + "grad_norm": 0.280564546585083, + "learning_rate": 7.114818017121939e-05, + "loss": 1.7711, + "step": 12387 + }, + { + "epoch": 3.802332719459791, + "grad_norm": 0.3023521304130554, + "learning_rate": 7.114367601045555e-05, + "loss": 1.7538, 
+ "step": 12388 + }, + { + "epoch": 3.8026396562308165, + "grad_norm": 0.27252480387687683, + "learning_rate": 7.11391716407375e-05, + "loss": 1.7604, + "step": 12389 + }, + { + "epoch": 3.802946593001842, + "grad_norm": 0.2122909128665924, + "learning_rate": 7.113466706210976e-05, + "loss": 1.716, + "step": 12390 + }, + { + "epoch": 3.8032535297728667, + "grad_norm": 0.30141574144363403, + "learning_rate": 7.113016227461686e-05, + "loss": 1.7636, + "step": 12391 + }, + { + "epoch": 3.803560466543892, + "grad_norm": 0.33359697461128235, + "learning_rate": 7.112565727830331e-05, + "loss": 1.7805, + "step": 12392 + }, + { + "epoch": 3.803867403314917, + "grad_norm": 0.3161376714706421, + "learning_rate": 7.112115207321364e-05, + "loss": 1.7974, + "step": 12393 + }, + { + "epoch": 3.8041743400859422, + "grad_norm": 0.29028698801994324, + "learning_rate": 7.111664665939235e-05, + "loss": 1.83, + "step": 12394 + }, + { + "epoch": 3.8044812768569676, + "grad_norm": 0.38829556107521057, + "learning_rate": 7.1112141036884e-05, + "loss": 1.8684, + "step": 12395 + }, + { + "epoch": 3.804788213627993, + "grad_norm": 0.4118283987045288, + "learning_rate": 7.110763520573309e-05, + "loss": 1.7812, + "step": 12396 + }, + { + "epoch": 3.805095150399018, + "grad_norm": 0.3907717168331146, + "learning_rate": 7.110312916598416e-05, + "loss": 1.7789, + "step": 12397 + }, + { + "epoch": 3.805402087170043, + "grad_norm": 0.2768644690513611, + "learning_rate": 7.109862291768173e-05, + "loss": 1.8575, + "step": 12398 + }, + { + "epoch": 3.805709023941068, + "grad_norm": 0.3234006464481354, + "learning_rate": 7.109411646087035e-05, + "loss": 1.7485, + "step": 12399 + }, + { + "epoch": 3.8060159607120934, + "grad_norm": 0.415475994348526, + "learning_rate": 7.108960979559454e-05, + "loss": 1.7363, + "step": 12400 + }, + { + "epoch": 3.8063228974831187, + "grad_norm": 0.38654613494873047, + "learning_rate": 7.108510292189884e-05, + "loss": 1.7907, + "step": 12401 + }, + { + "epoch": 
3.8066298342541436, + "grad_norm": 0.2541481852531433, + "learning_rate": 7.10805958398278e-05, + "loss": 1.8458, + "step": 12402 + }, + { + "epoch": 3.806936771025169, + "grad_norm": 0.32562851905822754, + "learning_rate": 7.107608854942597e-05, + "loss": 1.7989, + "step": 12403 + }, + { + "epoch": 3.807243707796194, + "grad_norm": 0.3628395199775696, + "learning_rate": 7.107158105073786e-05, + "loss": 1.8044, + "step": 12404 + }, + { + "epoch": 3.807550644567219, + "grad_norm": 0.3363969027996063, + "learning_rate": 7.106707334380805e-05, + "loss": 1.8078, + "step": 12405 + }, + { + "epoch": 3.8078575813382445, + "grad_norm": 0.2853989601135254, + "learning_rate": 7.106256542868108e-05, + "loss": 1.7913, + "step": 12406 + }, + { + "epoch": 3.8081645181092694, + "grad_norm": 0.33455806970596313, + "learning_rate": 7.105805730540148e-05, + "loss": 1.7252, + "step": 12407 + }, + { + "epoch": 3.8084714548802947, + "grad_norm": 0.28103405237197876, + "learning_rate": 7.105354897401382e-05, + "loss": 1.6942, + "step": 12408 + }, + { + "epoch": 3.8087783916513196, + "grad_norm": 0.23230718076229095, + "learning_rate": 7.104904043456264e-05, + "loss": 1.7723, + "step": 12409 + }, + { + "epoch": 3.809085328422345, + "grad_norm": 0.2883053421974182, + "learning_rate": 7.104453168709251e-05, + "loss": 1.8015, + "step": 12410 + }, + { + "epoch": 3.8093922651933703, + "grad_norm": 0.28462252020835876, + "learning_rate": 7.104002273164798e-05, + "loss": 1.791, + "step": 12411 + }, + { + "epoch": 3.8096992019643956, + "grad_norm": 0.3004699647426605, + "learning_rate": 7.103551356827363e-05, + "loss": 1.8401, + "step": 12412 + }, + { + "epoch": 3.8100061387354205, + "grad_norm": 0.2546156048774719, + "learning_rate": 7.1031004197014e-05, + "loss": 1.7645, + "step": 12413 + }, + { + "epoch": 3.810313075506446, + "grad_norm": 0.24532915651798248, + "learning_rate": 7.102649461791364e-05, + "loss": 1.8, + "step": 12414 + }, + { + "epoch": 3.8106200122774707, + "grad_norm": 
0.2432405799627304, + "learning_rate": 7.102198483101716e-05, + "loss": 1.7957, + "step": 12415 + }, + { + "epoch": 3.810926949048496, + "grad_norm": 0.24405215680599213, + "learning_rate": 7.101747483636908e-05, + "loss": 1.79, + "step": 12416 + }, + { + "epoch": 3.8112338858195214, + "grad_norm": 0.29519838094711304, + "learning_rate": 7.101296463401401e-05, + "loss": 1.8087, + "step": 12417 + }, + { + "epoch": 3.8115408225905463, + "grad_norm": 0.28205612301826477, + "learning_rate": 7.100845422399652e-05, + "loss": 1.7897, + "step": 12418 + }, + { + "epoch": 3.8118477593615716, + "grad_norm": 0.25014567375183105, + "learning_rate": 7.100394360636115e-05, + "loss": 1.7574, + "step": 12419 + }, + { + "epoch": 3.8121546961325965, + "grad_norm": 0.3133499026298523, + "learning_rate": 7.099943278115251e-05, + "loss": 1.7957, + "step": 12420 + }, + { + "epoch": 3.812461632903622, + "grad_norm": 0.3706473708152771, + "learning_rate": 7.099492174841516e-05, + "loss": 1.8519, + "step": 12421 + }, + { + "epoch": 3.812768569674647, + "grad_norm": 0.30085715651512146, + "learning_rate": 7.09904105081937e-05, + "loss": 1.778, + "step": 12422 + }, + { + "epoch": 3.813075506445672, + "grad_norm": 0.23897981643676758, + "learning_rate": 7.09858990605327e-05, + "loss": 1.7289, + "step": 12423 + }, + { + "epoch": 3.8133824432166974, + "grad_norm": 0.30046290159225464, + "learning_rate": 7.098138740547673e-05, + "loss": 1.8838, + "step": 12424 + }, + { + "epoch": 3.8136893799877223, + "grad_norm": 0.32126328349113464, + "learning_rate": 7.097687554307041e-05, + "loss": 1.7916, + "step": 12425 + }, + { + "epoch": 3.8139963167587476, + "grad_norm": 0.2922256886959076, + "learning_rate": 7.097236347335829e-05, + "loss": 1.8305, + "step": 12426 + }, + { + "epoch": 3.814303253529773, + "grad_norm": 0.2772706151008606, + "learning_rate": 7.0967851196385e-05, + "loss": 1.7694, + "step": 12427 + }, + { + "epoch": 3.8146101903007983, + "grad_norm": 0.25763455033302307, + "learning_rate": 
7.096333871219511e-05, + "loss": 1.8716, + "step": 12428 + }, + { + "epoch": 3.814917127071823, + "grad_norm": 0.2631739377975464, + "learning_rate": 7.095882602083322e-05, + "loss": 1.7771, + "step": 12429 + }, + { + "epoch": 3.8152240638428485, + "grad_norm": 0.29229632019996643, + "learning_rate": 7.095431312234392e-05, + "loss": 1.7865, + "step": 12430 + }, + { + "epoch": 3.8155310006138734, + "grad_norm": 0.2672729790210724, + "learning_rate": 7.094980001677181e-05, + "loss": 1.7848, + "step": 12431 + }, + { + "epoch": 3.8158379373848987, + "grad_norm": 0.2388373166322708, + "learning_rate": 7.094528670416152e-05, + "loss": 1.75, + "step": 12432 + }, + { + "epoch": 3.816144874155924, + "grad_norm": 0.2385305017232895, + "learning_rate": 7.094077318455762e-05, + "loss": 1.748, + "step": 12433 + }, + { + "epoch": 3.816451810926949, + "grad_norm": 0.25421401858329773, + "learning_rate": 7.093625945800471e-05, + "loss": 1.779, + "step": 12434 + }, + { + "epoch": 3.8167587476979743, + "grad_norm": 0.2785158157348633, + "learning_rate": 7.093174552454743e-05, + "loss": 1.8295, + "step": 12435 + }, + { + "epoch": 3.817065684468999, + "grad_norm": 0.2907472252845764, + "learning_rate": 7.092723138423036e-05, + "loss": 1.8216, + "step": 12436 + }, + { + "epoch": 3.8173726212400245, + "grad_norm": 0.253955215215683, + "learning_rate": 7.092271703709814e-05, + "loss": 1.8394, + "step": 12437 + }, + { + "epoch": 3.81767955801105, + "grad_norm": 0.32139912247657776, + "learning_rate": 7.091820248319537e-05, + "loss": 1.8634, + "step": 12438 + }, + { + "epoch": 3.8179864947820747, + "grad_norm": 0.25890466570854187, + "learning_rate": 7.091368772256664e-05, + "loss": 1.7336, + "step": 12439 + }, + { + "epoch": 3.8182934315531, + "grad_norm": 0.2823775112628937, + "learning_rate": 7.090917275525661e-05, + "loss": 1.7927, + "step": 12440 + }, + { + "epoch": 3.818600368324125, + "grad_norm": 0.28739333152770996, + "learning_rate": 7.090465758130988e-05, + "loss": 1.7807, + 
"step": 12441 + }, + { + "epoch": 3.8189073050951503, + "grad_norm": 0.36823949217796326, + "learning_rate": 7.090014220077106e-05, + "loss": 1.7288, + "step": 12442 + }, + { + "epoch": 3.8192142418661756, + "grad_norm": 0.3061312735080719, + "learning_rate": 7.089562661368479e-05, + "loss": 1.8039, + "step": 12443 + }, + { + "epoch": 3.819521178637201, + "grad_norm": 0.25867924094200134, + "learning_rate": 7.089111082009569e-05, + "loss": 1.7678, + "step": 12444 + }, + { + "epoch": 3.819828115408226, + "grad_norm": 0.26834985613822937, + "learning_rate": 7.088659482004837e-05, + "loss": 1.7592, + "step": 12445 + }, + { + "epoch": 3.820135052179251, + "grad_norm": 0.25608211755752563, + "learning_rate": 7.08820786135875e-05, + "loss": 1.7622, + "step": 12446 + }, + { + "epoch": 3.820441988950276, + "grad_norm": 0.2512456774711609, + "learning_rate": 7.087756220075769e-05, + "loss": 1.7648, + "step": 12447 + }, + { + "epoch": 3.8207489257213014, + "grad_norm": 0.2434878647327423, + "learning_rate": 7.087304558160355e-05, + "loss": 1.7435, + "step": 12448 + }, + { + "epoch": 3.8210558624923268, + "grad_norm": 0.26456570625305176, + "learning_rate": 7.086852875616978e-05, + "loss": 1.7342, + "step": 12449 + }, + { + "epoch": 3.8213627992633517, + "grad_norm": 0.2958984971046448, + "learning_rate": 7.086401172450095e-05, + "loss": 1.8532, + "step": 12450 + }, + { + "epoch": 3.821669736034377, + "grad_norm": 0.25939157605171204, + "learning_rate": 7.085949448664172e-05, + "loss": 1.7746, + "step": 12451 + }, + { + "epoch": 3.821976672805402, + "grad_norm": 0.2210223525762558, + "learning_rate": 7.085497704263675e-05, + "loss": 1.7745, + "step": 12452 + }, + { + "epoch": 3.822283609576427, + "grad_norm": 0.2409319430589676, + "learning_rate": 7.085045939253068e-05, + "loss": 1.7981, + "step": 12453 + }, + { + "epoch": 3.8225905463474525, + "grad_norm": 0.26331812143325806, + "learning_rate": 7.084594153636815e-05, + "loss": 1.8163, + "step": 12454 + }, + { + "epoch": 
3.8228974831184774, + "grad_norm": 0.2613828480243683, + "learning_rate": 7.08414234741938e-05, + "loss": 1.8362, + "step": 12455 + }, + { + "epoch": 3.8232044198895028, + "grad_norm": 0.3139529228210449, + "learning_rate": 7.083690520605228e-05, + "loss": 1.8247, + "step": 12456 + }, + { + "epoch": 3.8235113566605277, + "grad_norm": 0.2958570718765259, + "learning_rate": 7.083238673198826e-05, + "loss": 1.8011, + "step": 12457 + }, + { + "epoch": 3.823818293431553, + "grad_norm": 0.2517626881599426, + "learning_rate": 7.082786805204639e-05, + "loss": 1.7353, + "step": 12458 + }, + { + "epoch": 3.8241252302025783, + "grad_norm": 0.2443888783454895, + "learning_rate": 7.082334916627132e-05, + "loss": 1.7916, + "step": 12459 + }, + { + "epoch": 3.8244321669736037, + "grad_norm": 0.283514142036438, + "learning_rate": 7.08188300747077e-05, + "loss": 1.8048, + "step": 12460 + }, + { + "epoch": 3.8247391037446286, + "grad_norm": 0.24775351583957672, + "learning_rate": 7.08143107774002e-05, + "loss": 1.8145, + "step": 12461 + }, + { + "epoch": 3.825046040515654, + "grad_norm": 0.27904003858566284, + "learning_rate": 7.080979127439347e-05, + "loss": 1.8003, + "step": 12462 + }, + { + "epoch": 3.825352977286679, + "grad_norm": 0.24997512996196747, + "learning_rate": 7.08052715657322e-05, + "loss": 1.7962, + "step": 12463 + }, + { + "epoch": 3.825659914057704, + "grad_norm": 0.25874343514442444, + "learning_rate": 7.080075165146104e-05, + "loss": 1.7861, + "step": 12464 + }, + { + "epoch": 3.8259668508287294, + "grad_norm": 0.2964434027671814, + "learning_rate": 7.079623153162467e-05, + "loss": 1.7618, + "step": 12465 + }, + { + "epoch": 3.8262737875997543, + "grad_norm": 0.26403337717056274, + "learning_rate": 7.079171120626774e-05, + "loss": 1.8016, + "step": 12466 + }, + { + "epoch": 3.8265807243707797, + "grad_norm": 0.28369295597076416, + "learning_rate": 7.078719067543494e-05, + "loss": 1.7517, + "step": 12467 + }, + { + "epoch": 3.8268876611418046, + "grad_norm": 
0.254312127828598, + "learning_rate": 7.078266993917093e-05, + "loss": 1.8085, + "step": 12468 + }, + { + "epoch": 3.82719459791283, + "grad_norm": 0.24992622435092926, + "learning_rate": 7.077814899752038e-05, + "loss": 1.7657, + "step": 12469 + }, + { + "epoch": 3.8275015346838552, + "grad_norm": 0.26485762000083923, + "learning_rate": 7.077362785052802e-05, + "loss": 1.7303, + "step": 12470 + }, + { + "epoch": 3.8278084714548806, + "grad_norm": 0.29864901304244995, + "learning_rate": 7.076910649823846e-05, + "loss": 1.7734, + "step": 12471 + }, + { + "epoch": 3.8281154082259055, + "grad_norm": 0.2973599433898926, + "learning_rate": 7.076458494069644e-05, + "loss": 1.8055, + "step": 12472 + }, + { + "epoch": 3.828422344996931, + "grad_norm": 0.2150362730026245, + "learning_rate": 7.07600631779466e-05, + "loss": 1.7377, + "step": 12473 + }, + { + "epoch": 3.8287292817679557, + "grad_norm": 0.26443010568618774, + "learning_rate": 7.075554121003367e-05, + "loss": 1.837, + "step": 12474 + }, + { + "epoch": 3.829036218538981, + "grad_norm": 0.27365007996559143, + "learning_rate": 7.075101903700231e-05, + "loss": 1.7784, + "step": 12475 + }, + { + "epoch": 3.8293431553100064, + "grad_norm": 0.22037263214588165, + "learning_rate": 7.074649665889721e-05, + "loss": 1.8182, + "step": 12476 + }, + { + "epoch": 3.8296500920810312, + "grad_norm": 0.29614946246147156, + "learning_rate": 7.074197407576308e-05, + "loss": 1.7993, + "step": 12477 + }, + { + "epoch": 3.8299570288520566, + "grad_norm": 0.25135520100593567, + "learning_rate": 7.07374512876446e-05, + "loss": 1.8211, + "step": 12478 + }, + { + "epoch": 3.8302639656230815, + "grad_norm": 0.2711503207683563, + "learning_rate": 7.073292829458645e-05, + "loss": 1.8274, + "step": 12479 + }, + { + "epoch": 3.830570902394107, + "grad_norm": 0.38659265637397766, + "learning_rate": 7.072840509663338e-05, + "loss": 1.796, + "step": 12480 + }, + { + "epoch": 3.830877839165132, + "grad_norm": 0.39382728934288025, + 
"learning_rate": 7.072388169383005e-05, + "loss": 1.8439, + "step": 12481 + }, + { + "epoch": 3.831184775936157, + "grad_norm": 0.27570033073425293, + "learning_rate": 7.071935808622118e-05, + "loss": 1.8155, + "step": 12482 + }, + { + "epoch": 3.8314917127071824, + "grad_norm": 0.29054465889930725, + "learning_rate": 7.071483427385147e-05, + "loss": 1.754, + "step": 12483 + }, + { + "epoch": 3.8317986494782073, + "grad_norm": 0.4138031303882599, + "learning_rate": 7.071031025676562e-05, + "loss": 1.7686, + "step": 12484 + }, + { + "epoch": 3.8321055862492326, + "grad_norm": 0.3447251617908478, + "learning_rate": 7.070578603500833e-05, + "loss": 1.8135, + "step": 12485 + }, + { + "epoch": 3.832412523020258, + "grad_norm": 0.265115886926651, + "learning_rate": 7.070126160862436e-05, + "loss": 1.803, + "step": 12486 + }, + { + "epoch": 3.8327194597912833, + "grad_norm": 0.4288817346096039, + "learning_rate": 7.069673697765837e-05, + "loss": 1.7814, + "step": 12487 + }, + { + "epoch": 3.833026396562308, + "grad_norm": 0.4890103340148926, + "learning_rate": 7.06922121421551e-05, + "loss": 1.8318, + "step": 12488 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.3676142990589142, + "learning_rate": 7.068768710215928e-05, + "loss": 1.7792, + "step": 12489 + }, + { + "epoch": 3.8336402701043584, + "grad_norm": 0.23254090547561646, + "learning_rate": 7.068316185771557e-05, + "loss": 1.7154, + "step": 12490 + }, + { + "epoch": 3.8339472068753837, + "grad_norm": 0.35014036297798157, + "learning_rate": 7.067863640886876e-05, + "loss": 1.7031, + "step": 12491 + }, + { + "epoch": 3.834254143646409, + "grad_norm": 0.32155317068099976, + "learning_rate": 7.067411075566353e-05, + "loss": 1.7692, + "step": 12492 + }, + { + "epoch": 3.834561080417434, + "grad_norm": 0.260772705078125, + "learning_rate": 7.066958489814463e-05, + "loss": 1.7488, + "step": 12493 + }, + { + "epoch": 3.8348680171884593, + "grad_norm": 0.2624910771846771, + "learning_rate": 7.066505883635678e-05, + 
"loss": 1.7436, + "step": 12494 + }, + { + "epoch": 3.835174953959484, + "grad_norm": 0.2782299220561981, + "learning_rate": 7.066053257034471e-05, + "loss": 1.8219, + "step": 12495 + }, + { + "epoch": 3.8354818907305095, + "grad_norm": 0.2749497890472412, + "learning_rate": 7.065600610015312e-05, + "loss": 1.8068, + "step": 12496 + }, + { + "epoch": 3.835788827501535, + "grad_norm": 0.2730359733104706, + "learning_rate": 7.06514794258268e-05, + "loss": 1.7588, + "step": 12497 + }, + { + "epoch": 3.8360957642725597, + "grad_norm": 0.3606291711330414, + "learning_rate": 7.064695254741044e-05, + "loss": 1.8509, + "step": 12498 + }, + { + "epoch": 3.836402701043585, + "grad_norm": 0.23282989859580994, + "learning_rate": 7.064242546494879e-05, + "loss": 1.7444, + "step": 12499 + }, + { + "epoch": 3.83670963781461, + "grad_norm": 0.2554507255554199, + "learning_rate": 7.06378981784866e-05, + "loss": 1.7486, + "step": 12500 + }, + { + "epoch": 3.8370165745856353, + "grad_norm": 0.2916143834590912, + "learning_rate": 7.06333706880686e-05, + "loss": 1.8035, + "step": 12501 + }, + { + "epoch": 3.8373235113566606, + "grad_norm": 0.23719090223312378, + "learning_rate": 7.062884299373955e-05, + "loss": 1.7896, + "step": 12502 + }, + { + "epoch": 3.837630448127686, + "grad_norm": 0.2596152126789093, + "learning_rate": 7.062431509554417e-05, + "loss": 1.7944, + "step": 12503 + }, + { + "epoch": 3.837937384898711, + "grad_norm": 0.29140764474868774, + "learning_rate": 7.061978699352723e-05, + "loss": 1.7988, + "step": 12504 + }, + { + "epoch": 3.838244321669736, + "grad_norm": 0.3421068489551544, + "learning_rate": 7.061525868773347e-05, + "loss": 1.751, + "step": 12505 + }, + { + "epoch": 3.838551258440761, + "grad_norm": 0.2705349624156952, + "learning_rate": 7.061073017820764e-05, + "loss": 1.7578, + "step": 12506 + }, + { + "epoch": 3.8388581952117864, + "grad_norm": 0.2403286248445511, + "learning_rate": 7.060620146499448e-05, + "loss": 1.8422, + "step": 12507 + }, + { + 
"epoch": 3.8391651319828117, + "grad_norm": 0.3860442042350769, + "learning_rate": 7.060167254813876e-05, + "loss": 1.8168, + "step": 12508 + }, + { + "epoch": 3.8394720687538366, + "grad_norm": 0.4729512631893158, + "learning_rate": 7.059714342768526e-05, + "loss": 1.7786, + "step": 12509 + }, + { + "epoch": 3.839779005524862, + "grad_norm": 0.3522968888282776, + "learning_rate": 7.059261410367871e-05, + "loss": 1.8749, + "step": 12510 + }, + { + "epoch": 3.840085942295887, + "grad_norm": 0.28071436285972595, + "learning_rate": 7.058808457616386e-05, + "loss": 1.7959, + "step": 12511 + }, + { + "epoch": 3.840392879066912, + "grad_norm": 0.4356439411640167, + "learning_rate": 7.05835548451855e-05, + "loss": 1.8045, + "step": 12512 + }, + { + "epoch": 3.8406998158379375, + "grad_norm": 0.4051562249660492, + "learning_rate": 7.057902491078839e-05, + "loss": 1.7909, + "step": 12513 + }, + { + "epoch": 3.8410067526089624, + "grad_norm": 0.2817205488681793, + "learning_rate": 7.057449477301728e-05, + "loss": 1.8736, + "step": 12514 + }, + { + "epoch": 3.8413136893799877, + "grad_norm": 0.33369559049606323, + "learning_rate": 7.056996443191697e-05, + "loss": 1.7799, + "step": 12515 + }, + { + "epoch": 3.8416206261510126, + "grad_norm": 0.369954913854599, + "learning_rate": 7.056543388753221e-05, + "loss": 1.795, + "step": 12516 + }, + { + "epoch": 3.841927562922038, + "grad_norm": 0.289474755525589, + "learning_rate": 7.056090313990778e-05, + "loss": 1.786, + "step": 12517 + }, + { + "epoch": 3.8422344996930633, + "grad_norm": 0.2431849092245102, + "learning_rate": 7.055637218908845e-05, + "loss": 1.7363, + "step": 12518 + }, + { + "epoch": 3.8425414364640886, + "grad_norm": 0.3736060857772827, + "learning_rate": 7.0551841035119e-05, + "loss": 1.8234, + "step": 12519 + }, + { + "epoch": 3.8428483732351135, + "grad_norm": 0.34008854627609253, + "learning_rate": 7.054730967804422e-05, + "loss": 1.8001, + "step": 12520 + }, + { + "epoch": 3.843155310006139, + "grad_norm": 
0.24852876365184784, + "learning_rate": 7.054277811790887e-05, + "loss": 1.8298, + "step": 12521 + }, + { + "epoch": 3.8434622467771637, + "grad_norm": 0.3491046726703644, + "learning_rate": 7.053824635475777e-05, + "loss": 1.7336, + "step": 12522 + }, + { + "epoch": 3.843769183548189, + "grad_norm": 0.38757824897766113, + "learning_rate": 7.053371438863566e-05, + "loss": 1.8241, + "step": 12523 + }, + { + "epoch": 3.8440761203192144, + "grad_norm": 0.2607647180557251, + "learning_rate": 7.052918221958735e-05, + "loss": 1.7813, + "step": 12524 + }, + { + "epoch": 3.8443830570902393, + "grad_norm": 0.25634410977363586, + "learning_rate": 7.052464984765764e-05, + "loss": 1.7836, + "step": 12525 + }, + { + "epoch": 3.8446899938612646, + "grad_norm": 0.3113503158092499, + "learning_rate": 7.052011727289129e-05, + "loss": 1.8477, + "step": 12526 + }, + { + "epoch": 3.8449969306322895, + "grad_norm": 0.2852596044540405, + "learning_rate": 7.051558449533313e-05, + "loss": 1.7607, + "step": 12527 + }, + { + "epoch": 3.845303867403315, + "grad_norm": 0.24841541051864624, + "learning_rate": 7.051105151502795e-05, + "loss": 1.8109, + "step": 12528 + }, + { + "epoch": 3.84561080417434, + "grad_norm": 0.2231549620628357, + "learning_rate": 7.050651833202053e-05, + "loss": 1.7245, + "step": 12529 + }, + { + "epoch": 3.845917740945365, + "grad_norm": 0.21975892782211304, + "learning_rate": 7.050198494635566e-05, + "loss": 1.7512, + "step": 12530 + }, + { + "epoch": 3.8462246777163904, + "grad_norm": 0.2546280324459076, + "learning_rate": 7.049745135807816e-05, + "loss": 1.8003, + "step": 12531 + }, + { + "epoch": 3.8465316144874153, + "grad_norm": 0.21507929265499115, + "learning_rate": 7.049291756723284e-05, + "loss": 1.7616, + "step": 12532 + }, + { + "epoch": 3.8468385512584407, + "grad_norm": 0.24927987158298492, + "learning_rate": 7.04883835738645e-05, + "loss": 1.7519, + "step": 12533 + }, + { + "epoch": 3.847145488029466, + "grad_norm": 0.24988602101802826, + 
"learning_rate": 7.048384937801793e-05, + "loss": 1.7966, + "step": 12534 + }, + { + "epoch": 3.8474524248004913, + "grad_norm": 0.24039845168590546, + "learning_rate": 7.047931497973798e-05, + "loss": 1.7834, + "step": 12535 + }, + { + "epoch": 3.847759361571516, + "grad_norm": 0.22826696932315826, + "learning_rate": 7.047478037906943e-05, + "loss": 1.7334, + "step": 12536 + }, + { + "epoch": 3.8480662983425415, + "grad_norm": 0.22260744869709015, + "learning_rate": 7.047024557605708e-05, + "loss": 1.787, + "step": 12537 + }, + { + "epoch": 3.8483732351135664, + "grad_norm": 0.2457917332649231, + "learning_rate": 7.046571057074578e-05, + "loss": 1.7865, + "step": 12538 + }, + { + "epoch": 3.8486801718845918, + "grad_norm": 0.23952928185462952, + "learning_rate": 7.046117536318035e-05, + "loss": 1.7764, + "step": 12539 + }, + { + "epoch": 3.848987108655617, + "grad_norm": 0.22186748683452606, + "learning_rate": 7.045663995340557e-05, + "loss": 1.7917, + "step": 12540 + }, + { + "epoch": 3.849294045426642, + "grad_norm": 0.24234962463378906, + "learning_rate": 7.045210434146629e-05, + "loss": 1.7697, + "step": 12541 + }, + { + "epoch": 3.8496009821976673, + "grad_norm": 0.2510770857334137, + "learning_rate": 7.044756852740732e-05, + "loss": 1.8012, + "step": 12542 + }, + { + "epoch": 3.849907918968692, + "grad_norm": 0.24910703301429749, + "learning_rate": 7.044303251127349e-05, + "loss": 1.831, + "step": 12543 + }, + { + "epoch": 3.8502148557397176, + "grad_norm": 0.3159966468811035, + "learning_rate": 7.043849629310964e-05, + "loss": 1.8029, + "step": 12544 + }, + { + "epoch": 3.850521792510743, + "grad_norm": 0.3155403733253479, + "learning_rate": 7.04339598729606e-05, + "loss": 1.7429, + "step": 12545 + }, + { + "epoch": 3.8508287292817682, + "grad_norm": 0.3037515878677368, + "learning_rate": 7.042942325087117e-05, + "loss": 1.8186, + "step": 12546 + }, + { + "epoch": 3.851135666052793, + "grad_norm": 0.2319766730070114, + "learning_rate": 
7.042488642688621e-05, + "loss": 1.7853, + "step": 12547 + }, + { + "epoch": 3.8514426028238185, + "grad_norm": 0.23911969363689423, + "learning_rate": 7.042034940105055e-05, + "loss": 1.8314, + "step": 12548 + }, + { + "epoch": 3.8517495395948433, + "grad_norm": 0.2541846036911011, + "learning_rate": 7.041581217340905e-05, + "loss": 1.8289, + "step": 12549 + }, + { + "epoch": 3.8520564763658687, + "grad_norm": 0.22234943509101868, + "learning_rate": 7.04112747440065e-05, + "loss": 1.7847, + "step": 12550 + }, + { + "epoch": 3.852363413136894, + "grad_norm": 0.2747870981693268, + "learning_rate": 7.04067371128878e-05, + "loss": 1.7875, + "step": 12551 + }, + { + "epoch": 3.852670349907919, + "grad_norm": 0.28589147329330444, + "learning_rate": 7.040219928009775e-05, + "loss": 1.7289, + "step": 12552 + }, + { + "epoch": 3.8529772866789442, + "grad_norm": 0.21180351078510284, + "learning_rate": 7.039766124568119e-05, + "loss": 1.7611, + "step": 12553 + }, + { + "epoch": 3.853284223449969, + "grad_norm": 0.27751782536506653, + "learning_rate": 7.0393123009683e-05, + "loss": 1.7481, + "step": 12554 + }, + { + "epoch": 3.8535911602209945, + "grad_norm": 0.32883307337760925, + "learning_rate": 7.038858457214802e-05, + "loss": 1.7271, + "step": 12555 + }, + { + "epoch": 3.85389809699202, + "grad_norm": 0.30965641140937805, + "learning_rate": 7.03840459331211e-05, + "loss": 1.81, + "step": 12556 + }, + { + "epoch": 3.8542050337630447, + "grad_norm": 0.25184348225593567, + "learning_rate": 7.037950709264709e-05, + "loss": 1.7642, + "step": 12557 + }, + { + "epoch": 3.85451197053407, + "grad_norm": 0.2376822829246521, + "learning_rate": 7.037496805077084e-05, + "loss": 1.7774, + "step": 12558 + }, + { + "epoch": 3.854818907305095, + "grad_norm": 0.2395993024110794, + "learning_rate": 7.03704288075372e-05, + "loss": 1.8397, + "step": 12559 + }, + { + "epoch": 3.8551258440761202, + "grad_norm": 0.26460394263267517, + "learning_rate": 7.036588936299107e-05, + "loss": 1.7472, + 
"step": 12560 + }, + { + "epoch": 3.8554327808471456, + "grad_norm": 0.34742459654808044, + "learning_rate": 7.036134971717725e-05, + "loss": 1.8003, + "step": 12561 + }, + { + "epoch": 3.855739717618171, + "grad_norm": 0.2829316556453705, + "learning_rate": 7.035680987014068e-05, + "loss": 1.7765, + "step": 12562 + }, + { + "epoch": 3.856046654389196, + "grad_norm": 0.3087223172187805, + "learning_rate": 7.035226982192615e-05, + "loss": 1.8462, + "step": 12563 + }, + { + "epoch": 3.856353591160221, + "grad_norm": 0.2806380093097687, + "learning_rate": 7.034772957257858e-05, + "loss": 1.7704, + "step": 12564 + }, + { + "epoch": 3.856660527931246, + "grad_norm": 0.25598087906837463, + "learning_rate": 7.03431891221428e-05, + "loss": 1.7843, + "step": 12565 + }, + { + "epoch": 3.8569674647022714, + "grad_norm": 0.30833700299263, + "learning_rate": 7.033864847066373e-05, + "loss": 1.8404, + "step": 12566 + }, + { + "epoch": 3.8572744014732967, + "grad_norm": 0.29562532901763916, + "learning_rate": 7.03341076181862e-05, + "loss": 1.8044, + "step": 12567 + }, + { + "epoch": 3.8575813382443216, + "grad_norm": 0.2901719808578491, + "learning_rate": 7.03295665647551e-05, + "loss": 1.7789, + "step": 12568 + }, + { + "epoch": 3.857888275015347, + "grad_norm": 0.25453686714172363, + "learning_rate": 7.03250253104153e-05, + "loss": 1.6792, + "step": 12569 + }, + { + "epoch": 3.858195211786372, + "grad_norm": 0.26009416580200195, + "learning_rate": 7.03204838552117e-05, + "loss": 1.7835, + "step": 12570 + }, + { + "epoch": 3.858502148557397, + "grad_norm": 0.28074127435684204, + "learning_rate": 7.031594219918916e-05, + "loss": 1.7932, + "step": 12571 + }, + { + "epoch": 3.8588090853284225, + "grad_norm": 0.3341725170612335, + "learning_rate": 7.031140034239258e-05, + "loss": 1.7439, + "step": 12572 + }, + { + "epoch": 3.8591160220994474, + "grad_norm": 0.28142449259757996, + "learning_rate": 7.030685828486684e-05, + "loss": 1.8263, + "step": 12573 + }, + { + "epoch": 
3.8594229588704727, + "grad_norm": 0.2571438252925873, + "learning_rate": 7.030231602665681e-05, + "loss": 1.7628, + "step": 12574 + }, + { + "epoch": 3.8597298956414976, + "grad_norm": 0.3079041838645935, + "learning_rate": 7.029777356780741e-05, + "loss": 1.7879, + "step": 12575 + }, + { + "epoch": 3.860036832412523, + "grad_norm": 0.2605433464050293, + "learning_rate": 7.029323090836349e-05, + "loss": 1.7841, + "step": 12576 + }, + { + "epoch": 3.8603437691835483, + "grad_norm": 0.24069640040397644, + "learning_rate": 7.028868804836999e-05, + "loss": 1.7939, + "step": 12577 + }, + { + "epoch": 3.8606507059545736, + "grad_norm": 0.26801639795303345, + "learning_rate": 7.028414498787177e-05, + "loss": 1.8082, + "step": 12578 + }, + { + "epoch": 3.8609576427255985, + "grad_norm": 0.28828585147857666, + "learning_rate": 7.027960172691375e-05, + "loss": 1.8094, + "step": 12579 + }, + { + "epoch": 3.861264579496624, + "grad_norm": 0.22927051782608032, + "learning_rate": 7.027505826554082e-05, + "loss": 1.7758, + "step": 12580 + }, + { + "epoch": 3.8615715162676487, + "grad_norm": 0.25755998492240906, + "learning_rate": 7.027051460379788e-05, + "loss": 1.8429, + "step": 12581 + }, + { + "epoch": 3.861878453038674, + "grad_norm": 0.23636581003665924, + "learning_rate": 7.026597074172982e-05, + "loss": 1.7662, + "step": 12582 + }, + { + "epoch": 3.8621853898096994, + "grad_norm": 0.22599349915981293, + "learning_rate": 7.026142667938156e-05, + "loss": 1.7199, + "step": 12583 + }, + { + "epoch": 3.8624923265807243, + "grad_norm": 0.2504875659942627, + "learning_rate": 7.025688241679802e-05, + "loss": 1.8473, + "step": 12584 + }, + { + "epoch": 3.8627992633517496, + "grad_norm": 0.3012976348400116, + "learning_rate": 7.025233795402408e-05, + "loss": 1.8715, + "step": 12585 + }, + { + "epoch": 3.8631062001227745, + "grad_norm": 0.31703677773475647, + "learning_rate": 7.024779329110469e-05, + "loss": 1.8143, + "step": 12586 + }, + { + "epoch": 3.8634131368938, + "grad_norm": 
0.27287593483924866, + "learning_rate": 7.024324842808472e-05, + "loss": 1.7227, + "step": 12587 + }, + { + "epoch": 3.863720073664825, + "grad_norm": 0.24663801491260529, + "learning_rate": 7.02387033650091e-05, + "loss": 1.7529, + "step": 12588 + }, + { + "epoch": 3.86402701043585, + "grad_norm": 0.26127147674560547, + "learning_rate": 7.023415810192277e-05, + "loss": 1.7629, + "step": 12589 + }, + { + "epoch": 3.8643339472068754, + "grad_norm": 0.3457142114639282, + "learning_rate": 7.022961263887062e-05, + "loss": 1.8212, + "step": 12590 + }, + { + "epoch": 3.8646408839779003, + "grad_norm": 0.3296070694923401, + "learning_rate": 7.022506697589759e-05, + "loss": 1.7907, + "step": 12591 + }, + { + "epoch": 3.8649478207489256, + "grad_norm": 0.29474303126335144, + "learning_rate": 7.022052111304858e-05, + "loss": 1.7866, + "step": 12592 + }, + { + "epoch": 3.865254757519951, + "grad_norm": 0.2535403072834015, + "learning_rate": 7.021597505036852e-05, + "loss": 1.7607, + "step": 12593 + }, + { + "epoch": 3.8655616942909763, + "grad_norm": 0.26691222190856934, + "learning_rate": 7.021142878790237e-05, + "loss": 1.8063, + "step": 12594 + }, + { + "epoch": 3.865868631062001, + "grad_norm": 0.2784755229949951, + "learning_rate": 7.020688232569502e-05, + "loss": 1.8065, + "step": 12595 + }, + { + "epoch": 3.8661755678330265, + "grad_norm": 0.23714317381381989, + "learning_rate": 7.020233566379142e-05, + "loss": 1.8317, + "step": 12596 + }, + { + "epoch": 3.8664825046040514, + "grad_norm": 0.25010553002357483, + "learning_rate": 7.019778880223649e-05, + "loss": 1.8493, + "step": 12597 + }, + { + "epoch": 3.8667894413750767, + "grad_norm": 0.2798489034175873, + "learning_rate": 7.01932417410752e-05, + "loss": 1.8134, + "step": 12598 + }, + { + "epoch": 3.867096378146102, + "grad_norm": 0.26199260354042053, + "learning_rate": 7.018869448035243e-05, + "loss": 1.6931, + "step": 12599 + }, + { + "epoch": 3.867403314917127, + "grad_norm": 0.24582891166210175, + 
"learning_rate": 7.018414702011314e-05, + "loss": 1.8076, + "step": 12600 + }, + { + "epoch": 3.8677102516881523, + "grad_norm": 0.25493237376213074, + "learning_rate": 7.01795993604023e-05, + "loss": 1.7851, + "step": 12601 + }, + { + "epoch": 3.868017188459177, + "grad_norm": 0.2607674300670624, + "learning_rate": 7.017505150126483e-05, + "loss": 1.7285, + "step": 12602 + }, + { + "epoch": 3.8683241252302025, + "grad_norm": 0.23629581928253174, + "learning_rate": 7.017050344274568e-05, + "loss": 1.8254, + "step": 12603 + }, + { + "epoch": 3.868631062001228, + "grad_norm": 0.3129318058490753, + "learning_rate": 7.016595518488979e-05, + "loss": 1.7914, + "step": 12604 + }, + { + "epoch": 3.8689379987722528, + "grad_norm": 0.3178271949291229, + "learning_rate": 7.01614067277421e-05, + "loss": 1.8139, + "step": 12605 + }, + { + "epoch": 3.869244935543278, + "grad_norm": 0.3230711817741394, + "learning_rate": 7.015685807134757e-05, + "loss": 1.8203, + "step": 12606 + }, + { + "epoch": 3.869551872314303, + "grad_norm": 0.26339825987815857, + "learning_rate": 7.015230921575118e-05, + "loss": 1.8022, + "step": 12607 + }, + { + "epoch": 3.8698588090853283, + "grad_norm": 0.25337356328964233, + "learning_rate": 7.014776016099785e-05, + "loss": 1.7779, + "step": 12608 + }, + { + "epoch": 3.8701657458563536, + "grad_norm": 0.2506195306777954, + "learning_rate": 7.014321090713253e-05, + "loss": 1.7858, + "step": 12609 + }, + { + "epoch": 3.870472682627379, + "grad_norm": 0.26249951124191284, + "learning_rate": 7.013866145420021e-05, + "loss": 1.8051, + "step": 12610 + }, + { + "epoch": 3.870779619398404, + "grad_norm": 0.25666534900665283, + "learning_rate": 7.013411180224581e-05, + "loss": 1.7945, + "step": 12611 + }, + { + "epoch": 3.871086556169429, + "grad_norm": 0.23901648819446564, + "learning_rate": 7.012956195131433e-05, + "loss": 1.7844, + "step": 12612 + }, + { + "epoch": 3.871393492940454, + "grad_norm": 0.26814451813697815, + "learning_rate": 
7.012501190145071e-05, + "loss": 1.7713, + "step": 12613 + }, + { + "epoch": 3.8717004297114794, + "grad_norm": 0.28377315402030945, + "learning_rate": 7.012046165269995e-05, + "loss": 1.7866, + "step": 12614 + }, + { + "epoch": 3.8720073664825048, + "grad_norm": 0.2751680612564087, + "learning_rate": 7.011591120510699e-05, + "loss": 1.7215, + "step": 12615 + }, + { + "epoch": 3.8723143032535297, + "grad_norm": 0.21988113224506378, + "learning_rate": 7.011136055871679e-05, + "loss": 1.8009, + "step": 12616 + }, + { + "epoch": 3.872621240024555, + "grad_norm": 0.26462143659591675, + "learning_rate": 7.010680971357434e-05, + "loss": 1.7618, + "step": 12617 + }, + { + "epoch": 3.87292817679558, + "grad_norm": 0.29054632782936096, + "learning_rate": 7.010225866972462e-05, + "loss": 1.7549, + "step": 12618 + }, + { + "epoch": 3.873235113566605, + "grad_norm": 0.31341224908828735, + "learning_rate": 7.00977074272126e-05, + "loss": 1.8827, + "step": 12619 + }, + { + "epoch": 3.8735420503376305, + "grad_norm": 0.24252115190029144, + "learning_rate": 7.009315598608324e-05, + "loss": 1.7544, + "step": 12620 + }, + { + "epoch": 3.873848987108656, + "grad_norm": 0.30036893486976624, + "learning_rate": 7.008860434638154e-05, + "loss": 1.7465, + "step": 12621 + }, + { + "epoch": 3.8741559238796808, + "grad_norm": 0.3217438757419586, + "learning_rate": 7.00840525081525e-05, + "loss": 1.72, + "step": 12622 + }, + { + "epoch": 3.874462860650706, + "grad_norm": 0.22507290542125702, + "learning_rate": 7.007950047144105e-05, + "loss": 1.7177, + "step": 12623 + }, + { + "epoch": 3.874769797421731, + "grad_norm": 0.3014441728591919, + "learning_rate": 7.007494823629224e-05, + "loss": 1.7502, + "step": 12624 + }, + { + "epoch": 3.8750767341927563, + "grad_norm": 0.3836904466152191, + "learning_rate": 7.0070395802751e-05, + "loss": 1.7971, + "step": 12625 + }, + { + "epoch": 3.8753836709637817, + "grad_norm": 0.33565691113471985, + "learning_rate": 7.006584317086235e-05, + "loss": 1.7439, 
+ "step": 12626 + }, + { + "epoch": 3.8756906077348066, + "grad_norm": 0.2292134314775467, + "learning_rate": 7.006129034067128e-05, + "loss": 1.7998, + "step": 12627 + }, + { + "epoch": 3.875997544505832, + "grad_norm": 0.26385873556137085, + "learning_rate": 7.005673731222277e-05, + "loss": 1.7914, + "step": 12628 + }, + { + "epoch": 3.876304481276857, + "grad_norm": 0.2854950428009033, + "learning_rate": 7.005218408556184e-05, + "loss": 1.7761, + "step": 12629 + }, + { + "epoch": 3.876611418047882, + "grad_norm": 0.34260645508766174, + "learning_rate": 7.004763066073348e-05, + "loss": 1.8015, + "step": 12630 + }, + { + "epoch": 3.8769183548189075, + "grad_norm": 0.3223683834075928, + "learning_rate": 7.004307703778267e-05, + "loss": 1.7453, + "step": 12631 + }, + { + "epoch": 3.8772252915899323, + "grad_norm": 0.24715089797973633, + "learning_rate": 7.003852321675442e-05, + "loss": 1.7813, + "step": 12632 + }, + { + "epoch": 3.8775322283609577, + "grad_norm": 0.22822390496730804, + "learning_rate": 7.003396919769377e-05, + "loss": 1.7982, + "step": 12633 + }, + { + "epoch": 3.8778391651319826, + "grad_norm": 0.24125081300735474, + "learning_rate": 7.002941498064565e-05, + "loss": 1.8606, + "step": 12634 + }, + { + "epoch": 3.878146101903008, + "grad_norm": 0.23512506484985352, + "learning_rate": 7.002486056565513e-05, + "loss": 1.7469, + "step": 12635 + }, + { + "epoch": 3.8784530386740332, + "grad_norm": 0.2908322215080261, + "learning_rate": 7.00203059527672e-05, + "loss": 1.796, + "step": 12636 + }, + { + "epoch": 3.8787599754450586, + "grad_norm": 0.22931252419948578, + "learning_rate": 7.001575114202689e-05, + "loss": 1.7482, + "step": 12637 + }, + { + "epoch": 3.8790669122160835, + "grad_norm": 0.22574284672737122, + "learning_rate": 7.001119613347917e-05, + "loss": 1.7698, + "step": 12638 + }, + { + "epoch": 3.879373848987109, + "grad_norm": 0.23129726946353912, + "learning_rate": 7.000664092716909e-05, + "loss": 1.776, + "step": 12639 + }, + { + "epoch": 
3.8796807857581337, + "grad_norm": 0.2763366401195526, + "learning_rate": 7.000208552314165e-05, + "loss": 1.7814, + "step": 12640 + }, + { + "epoch": 3.879987722529159, + "grad_norm": 0.29870158433914185, + "learning_rate": 6.99975299214419e-05, + "loss": 1.7467, + "step": 12641 + }, + { + "epoch": 3.8802946593001844, + "grad_norm": 0.33574381470680237, + "learning_rate": 6.999297412211484e-05, + "loss": 1.8159, + "step": 12642 + }, + { + "epoch": 3.8806015960712092, + "grad_norm": 0.30309897661209106, + "learning_rate": 6.998841812520547e-05, + "loss": 1.8454, + "step": 12643 + }, + { + "epoch": 3.8809085328422346, + "grad_norm": 0.27399247884750366, + "learning_rate": 6.998386193075886e-05, + "loss": 1.7956, + "step": 12644 + }, + { + "epoch": 3.8812154696132595, + "grad_norm": 0.28649580478668213, + "learning_rate": 6.997930553881998e-05, + "loss": 1.8308, + "step": 12645 + }, + { + "epoch": 3.881522406384285, + "grad_norm": 0.2716052532196045, + "learning_rate": 6.997474894943392e-05, + "loss": 1.7698, + "step": 12646 + }, + { + "epoch": 3.88182934315531, + "grad_norm": 0.21380536258220673, + "learning_rate": 6.997019216264567e-05, + "loss": 1.7028, + "step": 12647 + }, + { + "epoch": 3.882136279926335, + "grad_norm": 0.25262731313705444, + "learning_rate": 6.996563517850028e-05, + "loss": 1.8236, + "step": 12648 + }, + { + "epoch": 3.8824432166973604, + "grad_norm": 0.21150052547454834, + "learning_rate": 6.996107799704277e-05, + "loss": 1.7437, + "step": 12649 + }, + { + "epoch": 3.8827501534683853, + "grad_norm": 0.2614554464817047, + "learning_rate": 6.995652061831821e-05, + "loss": 1.7575, + "step": 12650 + }, + { + "epoch": 3.8830570902394106, + "grad_norm": 0.214684396982193, + "learning_rate": 6.995196304237159e-05, + "loss": 1.8195, + "step": 12651 + }, + { + "epoch": 3.883364027010436, + "grad_norm": 0.2226872444152832, + "learning_rate": 6.994740526924798e-05, + "loss": 1.7556, + "step": 12652 + }, + { + "epoch": 3.8836709637814613, + "grad_norm": 
0.22270764410495758, + "learning_rate": 6.994284729899246e-05, + "loss": 1.7536, + "step": 12653 + }, + { + "epoch": 3.883977900552486, + "grad_norm": 0.20683564245700836, + "learning_rate": 6.993828913165e-05, + "loss": 1.7728, + "step": 12654 + }, + { + "epoch": 3.8842848373235115, + "grad_norm": 0.23667018115520477, + "learning_rate": 6.993373076726568e-05, + "loss": 1.7819, + "step": 12655 + }, + { + "epoch": 3.8845917740945364, + "grad_norm": 0.2265234887599945, + "learning_rate": 6.992917220588455e-05, + "loss": 1.7502, + "step": 12656 + }, + { + "epoch": 3.8848987108655617, + "grad_norm": 0.24490754306316376, + "learning_rate": 6.992461344755168e-05, + "loss": 1.7513, + "step": 12657 + }, + { + "epoch": 3.885205647636587, + "grad_norm": 0.23001348972320557, + "learning_rate": 6.992005449231208e-05, + "loss": 1.733, + "step": 12658 + }, + { + "epoch": 3.885512584407612, + "grad_norm": 0.25424695014953613, + "learning_rate": 6.991549534021084e-05, + "loss": 1.7621, + "step": 12659 + }, + { + "epoch": 3.8858195211786373, + "grad_norm": 0.25552862882614136, + "learning_rate": 6.991093599129299e-05, + "loss": 1.7974, + "step": 12660 + }, + { + "epoch": 3.886126457949662, + "grad_norm": 0.26876959204673767, + "learning_rate": 6.99063764456036e-05, + "loss": 1.7924, + "step": 12661 + }, + { + "epoch": 3.8864333947206875, + "grad_norm": 0.2754429578781128, + "learning_rate": 6.990181670318772e-05, + "loss": 1.7981, + "step": 12662 + }, + { + "epoch": 3.886740331491713, + "grad_norm": 0.281818687915802, + "learning_rate": 6.989725676409044e-05, + "loss": 1.7328, + "step": 12663 + }, + { + "epoch": 3.8870472682627377, + "grad_norm": 0.21676552295684814, + "learning_rate": 6.989269662835681e-05, + "loss": 1.7376, + "step": 12664 + }, + { + "epoch": 3.887354205033763, + "grad_norm": 0.276115745306015, + "learning_rate": 6.98881362960319e-05, + "loss": 1.7784, + "step": 12665 + }, + { + "epoch": 3.887661141804788, + "grad_norm": 0.2806364893913269, + "learning_rate": 
6.988357576716075e-05, + "loss": 1.8078, + "step": 12666 + }, + { + "epoch": 3.8879680785758133, + "grad_norm": 0.27620184421539307, + "learning_rate": 6.987901504178845e-05, + "loss": 1.8115, + "step": 12667 + }, + { + "epoch": 3.8882750153468386, + "grad_norm": 0.23845402896404266, + "learning_rate": 6.987445411996009e-05, + "loss": 1.7485, + "step": 12668 + }, + { + "epoch": 3.888581952117864, + "grad_norm": 0.25063586235046387, + "learning_rate": 6.986989300172071e-05, + "loss": 1.7663, + "step": 12669 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2417975515127182, + "learning_rate": 6.98653316871154e-05, + "loss": 1.7562, + "step": 12670 + }, + { + "epoch": 3.889195825659914, + "grad_norm": 0.24952733516693115, + "learning_rate": 6.986077017618923e-05, + "loss": 1.8063, + "step": 12671 + }, + { + "epoch": 3.889502762430939, + "grad_norm": 0.25847554206848145, + "learning_rate": 6.985620846898732e-05, + "loss": 1.7722, + "step": 12672 + }, + { + "epoch": 3.8898096992019644, + "grad_norm": 0.23762650787830353, + "learning_rate": 6.985164656555471e-05, + "loss": 1.8368, + "step": 12673 + }, + { + "epoch": 3.8901166359729897, + "grad_norm": 0.25346314907073975, + "learning_rate": 6.984708446593648e-05, + "loss": 1.7957, + "step": 12674 + }, + { + "epoch": 3.8904235727440146, + "grad_norm": 0.2466745674610138, + "learning_rate": 6.984252217017774e-05, + "loss": 1.8286, + "step": 12675 + }, + { + "epoch": 3.89073050951504, + "grad_norm": 0.25413215160369873, + "learning_rate": 6.983795967832356e-05, + "loss": 1.7711, + "step": 12676 + }, + { + "epoch": 3.891037446286065, + "grad_norm": 0.2315925806760788, + "learning_rate": 6.983339699041903e-05, + "loss": 1.7546, + "step": 12677 + }, + { + "epoch": 3.89134438305709, + "grad_norm": 0.26473405957221985, + "learning_rate": 6.982883410650925e-05, + "loss": 1.7563, + "step": 12678 + }, + { + "epoch": 3.8916513198281155, + "grad_norm": 0.24176491796970367, + "learning_rate": 6.982427102663932e-05, + "loss": 
1.7734, + "step": 12679 + }, + { + "epoch": 3.891958256599141, + "grad_norm": 0.25444844365119934, + "learning_rate": 6.98197077508543e-05, + "loss": 1.803, + "step": 12680 + }, + { + "epoch": 3.8922651933701657, + "grad_norm": 0.25234144926071167, + "learning_rate": 6.981514427919933e-05, + "loss": 1.8099, + "step": 12681 + }, + { + "epoch": 3.892572130141191, + "grad_norm": 0.2571142315864563, + "learning_rate": 6.98105806117195e-05, + "loss": 1.8618, + "step": 12682 + }, + { + "epoch": 3.892879066912216, + "grad_norm": 0.21235275268554688, + "learning_rate": 6.980601674845988e-05, + "loss": 1.7121, + "step": 12683 + }, + { + "epoch": 3.8931860036832413, + "grad_norm": 0.27078527212142944, + "learning_rate": 6.98014526894656e-05, + "loss": 1.8103, + "step": 12684 + }, + { + "epoch": 3.8934929404542666, + "grad_norm": 0.3198096454143524, + "learning_rate": 6.979688843478176e-05, + "loss": 1.7529, + "step": 12685 + }, + { + "epoch": 3.8937998772252915, + "grad_norm": 0.3170493245124817, + "learning_rate": 6.979232398445345e-05, + "loss": 1.7629, + "step": 12686 + }, + { + "epoch": 3.894106813996317, + "grad_norm": 0.2495265007019043, + "learning_rate": 6.978775933852582e-05, + "loss": 1.7407, + "step": 12687 + }, + { + "epoch": 3.8944137507673418, + "grad_norm": 0.24570141732692719, + "learning_rate": 6.978319449704395e-05, + "loss": 1.7688, + "step": 12688 + }, + { + "epoch": 3.894720687538367, + "grad_norm": 0.23956388235092163, + "learning_rate": 6.977862946005295e-05, + "loss": 1.7115, + "step": 12689 + }, + { + "epoch": 3.8950276243093924, + "grad_norm": 0.21548940241336823, + "learning_rate": 6.977406422759793e-05, + "loss": 1.7611, + "step": 12690 + }, + { + "epoch": 3.8953345610804173, + "grad_norm": 0.25797295570373535, + "learning_rate": 6.976949879972403e-05, + "loss": 1.7688, + "step": 12691 + }, + { + "epoch": 3.8956414978514426, + "grad_norm": 0.28257784247398376, + "learning_rate": 6.976493317647636e-05, + "loss": 1.7517, + "step": 12692 + }, + { + 
"epoch": 3.8959484346224675, + "grad_norm": 0.23828580975532532, + "learning_rate": 6.976036735790004e-05, + "loss": 1.7877, + "step": 12693 + }, + { + "epoch": 3.896255371393493, + "grad_norm": 0.22915001213550568, + "learning_rate": 6.975580134404017e-05, + "loss": 1.7741, + "step": 12694 + }, + { + "epoch": 3.896562308164518, + "grad_norm": 0.22975030541419983, + "learning_rate": 6.97512351349419e-05, + "loss": 1.772, + "step": 12695 + }, + { + "epoch": 3.8968692449355435, + "grad_norm": 0.29515185952186584, + "learning_rate": 6.974666873065034e-05, + "loss": 1.8001, + "step": 12696 + }, + { + "epoch": 3.8971761817065684, + "grad_norm": 0.26904794573783875, + "learning_rate": 6.974210213121064e-05, + "loss": 1.7069, + "step": 12697 + }, + { + "epoch": 3.8974831184775938, + "grad_norm": 0.2549479603767395, + "learning_rate": 6.97375353366679e-05, + "loss": 1.7419, + "step": 12698 + }, + { + "epoch": 3.8977900552486187, + "grad_norm": 0.23750101029872894, + "learning_rate": 6.973296834706729e-05, + "loss": 1.7815, + "step": 12699 + }, + { + "epoch": 3.898096992019644, + "grad_norm": 0.23529762029647827, + "learning_rate": 6.972840116245389e-05, + "loss": 1.8139, + "step": 12700 + }, + { + "epoch": 3.8984039287906693, + "grad_norm": 0.3212098777294159, + "learning_rate": 6.97238337828729e-05, + "loss": 1.7507, + "step": 12701 + }, + { + "epoch": 3.898710865561694, + "grad_norm": 0.3167687952518463, + "learning_rate": 6.971926620836941e-05, + "loss": 1.8062, + "step": 12702 + }, + { + "epoch": 3.8990178023327196, + "grad_norm": 0.31298309564590454, + "learning_rate": 6.971469843898855e-05, + "loss": 1.8127, + "step": 12703 + }, + { + "epoch": 3.8993247391037444, + "grad_norm": 0.2537378668785095, + "learning_rate": 6.971013047477551e-05, + "loss": 1.7675, + "step": 12704 + }, + { + "epoch": 3.8996316758747698, + "grad_norm": 0.24292805790901184, + "learning_rate": 6.97055623157754e-05, + "loss": 1.8004, + "step": 12705 + }, + { + "epoch": 3.899938612645795, + 
"grad_norm": 0.2929537296295166, + "learning_rate": 6.970099396203338e-05, + "loss": 1.7963, + "step": 12706 + }, + { + "epoch": 3.90024554941682, + "grad_norm": 0.30531612038612366, + "learning_rate": 6.969642541359459e-05, + "loss": 1.7347, + "step": 12707 + }, + { + "epoch": 3.9005524861878453, + "grad_norm": 0.3138202726840973, + "learning_rate": 6.969185667050417e-05, + "loss": 1.7987, + "step": 12708 + }, + { + "epoch": 3.9008594229588702, + "grad_norm": 0.2366247922182083, + "learning_rate": 6.96872877328073e-05, + "loss": 1.7671, + "step": 12709 + }, + { + "epoch": 3.9011663597298956, + "grad_norm": 0.26251721382141113, + "learning_rate": 6.96827186005491e-05, + "loss": 1.7657, + "step": 12710 + }, + { + "epoch": 3.901473296500921, + "grad_norm": 0.32497119903564453, + "learning_rate": 6.967814927377474e-05, + "loss": 1.7873, + "step": 12711 + }, + { + "epoch": 3.9017802332719462, + "grad_norm": 0.3290228843688965, + "learning_rate": 6.967357975252939e-05, + "loss": 1.8076, + "step": 12712 + }, + { + "epoch": 3.902087170042971, + "grad_norm": 0.2737300992012024, + "learning_rate": 6.966901003685817e-05, + "loss": 1.7405, + "step": 12713 + }, + { + "epoch": 3.9023941068139965, + "grad_norm": 0.25465309619903564, + "learning_rate": 6.966444012680626e-05, + "loss": 1.8063, + "step": 12714 + }, + { + "epoch": 3.9027010435850213, + "grad_norm": 0.2397255003452301, + "learning_rate": 6.965987002241885e-05, + "loss": 1.8079, + "step": 12715 + }, + { + "epoch": 3.9030079803560467, + "grad_norm": 0.23115718364715576, + "learning_rate": 6.965529972374108e-05, + "loss": 1.8032, + "step": 12716 + }, + { + "epoch": 3.903314917127072, + "grad_norm": 0.2536461055278778, + "learning_rate": 6.96507292308181e-05, + "loss": 1.7477, + "step": 12717 + }, + { + "epoch": 3.903621853898097, + "grad_norm": 0.27151185274124146, + "learning_rate": 6.96461585436951e-05, + "loss": 1.75, + "step": 12718 + }, + { + "epoch": 3.9039287906691222, + "grad_norm": 0.26894113421440125, + 
"learning_rate": 6.964158766241726e-05, + "loss": 1.7816, + "step": 12719 + }, + { + "epoch": 3.904235727440147, + "grad_norm": 0.23541375994682312, + "learning_rate": 6.963701658702972e-05, + "loss": 1.7991, + "step": 12720 + }, + { + "epoch": 3.9045426642111725, + "grad_norm": 0.22142915427684784, + "learning_rate": 6.96324453175777e-05, + "loss": 1.7245, + "step": 12721 + }, + { + "epoch": 3.904849600982198, + "grad_norm": 0.32864269614219666, + "learning_rate": 6.962787385410632e-05, + "loss": 1.7631, + "step": 12722 + }, + { + "epoch": 3.9051565377532227, + "grad_norm": 0.23657776415348053, + "learning_rate": 6.96233021966608e-05, + "loss": 1.8081, + "step": 12723 + }, + { + "epoch": 3.905463474524248, + "grad_norm": 0.24790632724761963, + "learning_rate": 6.961873034528629e-05, + "loss": 1.7193, + "step": 12724 + }, + { + "epoch": 3.905770411295273, + "grad_norm": 0.2517886459827423, + "learning_rate": 6.961415830002801e-05, + "loss": 1.7785, + "step": 12725 + }, + { + "epoch": 3.9060773480662982, + "grad_norm": 0.2340923547744751, + "learning_rate": 6.960958606093113e-05, + "loss": 1.7632, + "step": 12726 + }, + { + "epoch": 3.9063842848373236, + "grad_norm": 0.23260441422462463, + "learning_rate": 6.960501362804079e-05, + "loss": 1.7865, + "step": 12727 + }, + { + "epoch": 3.906691221608349, + "grad_norm": 0.22616329789161682, + "learning_rate": 6.960044100140224e-05, + "loss": 1.7851, + "step": 12728 + }, + { + "epoch": 3.906998158379374, + "grad_norm": 0.2849951982498169, + "learning_rate": 6.959586818106064e-05, + "loss": 1.8618, + "step": 12729 + }, + { + "epoch": 3.907305095150399, + "grad_norm": 0.3279374837875366, + "learning_rate": 6.95912951670612e-05, + "loss": 1.8563, + "step": 12730 + }, + { + "epoch": 3.907612031921424, + "grad_norm": 0.24359555542469025, + "learning_rate": 6.958672195944906e-05, + "loss": 1.7604, + "step": 12731 + }, + { + "epoch": 3.9079189686924494, + "grad_norm": 0.30881935358047485, + "learning_rate": 
6.958214855826947e-05, + "loss": 1.8463, + "step": 12732 + }, + { + "epoch": 3.9082259054634747, + "grad_norm": 0.25361543893814087, + "learning_rate": 6.957757496356763e-05, + "loss": 1.7831, + "step": 12733 + }, + { + "epoch": 3.9085328422344996, + "grad_norm": 0.26763513684272766, + "learning_rate": 6.957300117538869e-05, + "loss": 1.8383, + "step": 12734 + }, + { + "epoch": 3.908839779005525, + "grad_norm": 0.2238057255744934, + "learning_rate": 6.95684271937779e-05, + "loss": 1.7702, + "step": 12735 + }, + { + "epoch": 3.90914671577655, + "grad_norm": 0.22110232710838318, + "learning_rate": 6.956385301878045e-05, + "loss": 1.7931, + "step": 12736 + }, + { + "epoch": 3.909453652547575, + "grad_norm": 0.23765070736408234, + "learning_rate": 6.955927865044152e-05, + "loss": 1.7212, + "step": 12737 + }, + { + "epoch": 3.9097605893186005, + "grad_norm": 0.22324508428573608, + "learning_rate": 6.955470408880633e-05, + "loss": 1.7161, + "step": 12738 + }, + { + "epoch": 3.9100675260896254, + "grad_norm": 0.22485347092151642, + "learning_rate": 6.955012933392012e-05, + "loss": 1.7374, + "step": 12739 + }, + { + "epoch": 3.9103744628606507, + "grad_norm": 0.28046715259552, + "learning_rate": 6.954555438582806e-05, + "loss": 1.9264, + "step": 12740 + }, + { + "epoch": 3.9106813996316756, + "grad_norm": 0.26391276717185974, + "learning_rate": 6.954097924457536e-05, + "loss": 1.7343, + "step": 12741 + }, + { + "epoch": 3.910988336402701, + "grad_norm": 0.29596614837646484, + "learning_rate": 6.953640391020726e-05, + "loss": 1.8111, + "step": 12742 + }, + { + "epoch": 3.9112952731737263, + "grad_norm": 0.2709808051586151, + "learning_rate": 6.953182838276896e-05, + "loss": 1.7776, + "step": 12743 + }, + { + "epoch": 3.9116022099447516, + "grad_norm": 0.2585100531578064, + "learning_rate": 6.952725266230571e-05, + "loss": 1.7774, + "step": 12744 + }, + { + "epoch": 3.9119091467157765, + "grad_norm": 0.26490530371665955, + "learning_rate": 6.952267674886268e-05, + "loss": 
1.78, + "step": 12745 + }, + { + "epoch": 3.912216083486802, + "grad_norm": 0.23654767870903015, + "learning_rate": 6.951810064248512e-05, + "loss": 1.8263, + "step": 12746 + }, + { + "epoch": 3.9125230202578267, + "grad_norm": 0.2495296597480774, + "learning_rate": 6.951352434321826e-05, + "loss": 1.787, + "step": 12747 + }, + { + "epoch": 3.912829957028852, + "grad_norm": 0.24038313329219818, + "learning_rate": 6.950894785110728e-05, + "loss": 1.774, + "step": 12748 + }, + { + "epoch": 3.9131368937998774, + "grad_norm": 0.23738732933998108, + "learning_rate": 6.950437116619749e-05, + "loss": 1.7401, + "step": 12749 + }, + { + "epoch": 3.9134438305709023, + "grad_norm": 0.28192025423049927, + "learning_rate": 6.949979428853405e-05, + "loss": 1.8416, + "step": 12750 + }, + { + "epoch": 3.9137507673419276, + "grad_norm": 0.30579057335853577, + "learning_rate": 6.949521721816221e-05, + "loss": 1.7404, + "step": 12751 + }, + { + "epoch": 3.9140577041129525, + "grad_norm": 0.23972894251346588, + "learning_rate": 6.949063995512721e-05, + "loss": 1.7543, + "step": 12752 + }, + { + "epoch": 3.914364640883978, + "grad_norm": 0.2837793231010437, + "learning_rate": 6.94860624994743e-05, + "loss": 1.7779, + "step": 12753 + }, + { + "epoch": 3.914671577655003, + "grad_norm": 0.3344916105270386, + "learning_rate": 6.948148485124868e-05, + "loss": 1.7803, + "step": 12754 + }, + { + "epoch": 3.9149785144260285, + "grad_norm": 0.24271291494369507, + "learning_rate": 6.94769070104956e-05, + "loss": 1.7362, + "step": 12755 + }, + { + "epoch": 3.9152854511970534, + "grad_norm": 0.25299304723739624, + "learning_rate": 6.947232897726031e-05, + "loss": 1.7685, + "step": 12756 + }, + { + "epoch": 3.9155923879680787, + "grad_norm": 0.24766205251216888, + "learning_rate": 6.946775075158807e-05, + "loss": 1.829, + "step": 12757 + }, + { + "epoch": 3.9158993247391036, + "grad_norm": 0.2508428692817688, + "learning_rate": 6.94631723335241e-05, + "loss": 1.809, + "step": 12758 + }, + { + 
"epoch": 3.916206261510129, + "grad_norm": 0.2172096222639084, + "learning_rate": 6.945859372311365e-05, + "loss": 1.7376, + "step": 12759 + }, + { + "epoch": 3.9165131982811543, + "grad_norm": 0.28976425528526306, + "learning_rate": 6.945401492040198e-05, + "loss": 1.8229, + "step": 12760 + }, + { + "epoch": 3.916820135052179, + "grad_norm": 0.3528063893318176, + "learning_rate": 6.944943592543432e-05, + "loss": 1.7559, + "step": 12761 + }, + { + "epoch": 3.9171270718232045, + "grad_norm": 0.46312370896339417, + "learning_rate": 6.944485673825595e-05, + "loss": 1.7664, + "step": 12762 + }, + { + "epoch": 3.9174340085942294, + "grad_norm": 0.4466164708137512, + "learning_rate": 6.94402773589121e-05, + "loss": 1.7833, + "step": 12763 + }, + { + "epoch": 3.9177409453652547, + "grad_norm": 0.2637740969657898, + "learning_rate": 6.943569778744804e-05, + "loss": 1.818, + "step": 12764 + }, + { + "epoch": 3.91804788213628, + "grad_norm": 0.37515267729759216, + "learning_rate": 6.943111802390901e-05, + "loss": 1.7898, + "step": 12765 + }, + { + "epoch": 3.918354818907305, + "grad_norm": 0.45146289467811584, + "learning_rate": 6.942653806834029e-05, + "loss": 1.7797, + "step": 12766 + }, + { + "epoch": 3.9186617556783303, + "grad_norm": 0.2809859812259674, + "learning_rate": 6.942195792078712e-05, + "loss": 1.7836, + "step": 12767 + }, + { + "epoch": 3.918968692449355, + "grad_norm": 0.3606306314468384, + "learning_rate": 6.94173775812948e-05, + "loss": 1.7657, + "step": 12768 + }, + { + "epoch": 3.9192756292203805, + "grad_norm": 0.49528738856315613, + "learning_rate": 6.941279704990857e-05, + "loss": 1.7628, + "step": 12769 + }, + { + "epoch": 3.919582565991406, + "grad_norm": 0.3484322428703308, + "learning_rate": 6.940821632667371e-05, + "loss": 1.7939, + "step": 12770 + }, + { + "epoch": 3.919889502762431, + "grad_norm": 0.2479606419801712, + "learning_rate": 6.940363541163546e-05, + "loss": 1.813, + "step": 12771 + }, + { + "epoch": 3.920196439533456, + "grad_norm": 
0.3491765558719635, + "learning_rate": 6.939905430483911e-05, + "loss": 1.7338, + "step": 12772 + }, + { + "epoch": 3.9205033763044814, + "grad_norm": 0.291810005903244, + "learning_rate": 6.939447300632995e-05, + "loss": 1.7445, + "step": 12773 + }, + { + "epoch": 3.9208103130755063, + "grad_norm": 0.2467527985572815, + "learning_rate": 6.938989151615324e-05, + "loss": 1.8462, + "step": 12774 + }, + { + "epoch": 3.9211172498465316, + "grad_norm": 0.35656824707984924, + "learning_rate": 6.938530983435426e-05, + "loss": 1.7751, + "step": 12775 + }, + { + "epoch": 3.921424186617557, + "grad_norm": 0.31269776821136475, + "learning_rate": 6.938072796097828e-05, + "loss": 1.7714, + "step": 12776 + }, + { + "epoch": 3.921731123388582, + "grad_norm": 0.2082831859588623, + "learning_rate": 6.937614589607058e-05, + "loss": 1.7263, + "step": 12777 + }, + { + "epoch": 3.922038060159607, + "grad_norm": 0.27583765983581543, + "learning_rate": 6.937156363967646e-05, + "loss": 1.6822, + "step": 12778 + }, + { + "epoch": 3.922344996930632, + "grad_norm": 0.32773876190185547, + "learning_rate": 6.93669811918412e-05, + "loss": 1.7792, + "step": 12779 + }, + { + "epoch": 3.9226519337016574, + "grad_norm": 0.2583121657371521, + "learning_rate": 6.936239855261007e-05, + "loss": 1.7812, + "step": 12780 + }, + { + "epoch": 3.9229588704726828, + "grad_norm": 0.245570570230484, + "learning_rate": 6.935781572202836e-05, + "loss": 1.7252, + "step": 12781 + }, + { + "epoch": 3.9232658072437077, + "grad_norm": 0.2379419505596161, + "learning_rate": 6.935323270014138e-05, + "loss": 1.7485, + "step": 12782 + }, + { + "epoch": 3.923572744014733, + "grad_norm": 0.2239784598350525, + "learning_rate": 6.934864948699439e-05, + "loss": 1.7444, + "step": 12783 + }, + { + "epoch": 3.923879680785758, + "grad_norm": 0.2366618812084198, + "learning_rate": 6.934406608263274e-05, + "loss": 1.777, + "step": 12784 + }, + { + "epoch": 3.924186617556783, + "grad_norm": 0.22583791613578796, + "learning_rate": 
6.933948248710169e-05, + "loss": 1.7291, + "step": 12785 + }, + { + "epoch": 3.9244935543278086, + "grad_norm": 0.24141047894954681, + "learning_rate": 6.933489870044651e-05, + "loss": 1.7748, + "step": 12786 + }, + { + "epoch": 3.924800491098834, + "grad_norm": 0.2389962524175644, + "learning_rate": 6.933031472271255e-05, + "loss": 1.7957, + "step": 12787 + }, + { + "epoch": 3.925107427869859, + "grad_norm": 0.25230300426483154, + "learning_rate": 6.932573055394509e-05, + "loss": 1.7621, + "step": 12788 + }, + { + "epoch": 3.925414364640884, + "grad_norm": 0.23894043266773224, + "learning_rate": 6.932114619418941e-05, + "loss": 1.7285, + "step": 12789 + }, + { + "epoch": 3.925721301411909, + "grad_norm": 0.2650291919708252, + "learning_rate": 6.931656164349086e-05, + "loss": 1.7613, + "step": 12790 + }, + { + "epoch": 3.9260282381829343, + "grad_norm": 0.20616789162158966, + "learning_rate": 6.931197690189472e-05, + "loss": 1.7505, + "step": 12791 + }, + { + "epoch": 3.9263351749539597, + "grad_norm": 0.23915675282478333, + "learning_rate": 6.930739196944633e-05, + "loss": 1.7477, + "step": 12792 + }, + { + "epoch": 3.9266421117249846, + "grad_norm": 0.2522687613964081, + "learning_rate": 6.930280684619094e-05, + "loss": 1.8, + "step": 12793 + }, + { + "epoch": 3.92694904849601, + "grad_norm": 0.264167845249176, + "learning_rate": 6.929822153217391e-05, + "loss": 1.7516, + "step": 12794 + }, + { + "epoch": 3.927255985267035, + "grad_norm": 0.21358054876327515, + "learning_rate": 6.929363602744054e-05, + "loss": 1.7207, + "step": 12795 + }, + { + "epoch": 3.92756292203806, + "grad_norm": 0.25632721185684204, + "learning_rate": 6.928905033203617e-05, + "loss": 1.7446, + "step": 12796 + }, + { + "epoch": 3.9278698588090855, + "grad_norm": 0.2717185318470001, + "learning_rate": 6.928446444600608e-05, + "loss": 1.8555, + "step": 12797 + }, + { + "epoch": 3.9281767955801103, + "grad_norm": 0.2871767282485962, + "learning_rate": 6.927987836939561e-05, + "loss": 1.7861, + 
"step": 12798 + }, + { + "epoch": 3.9284837323511357, + "grad_norm": 0.282507061958313, + "learning_rate": 6.927529210225009e-05, + "loss": 1.7683, + "step": 12799 + }, + { + "epoch": 3.9287906691221606, + "grad_norm": 0.24870644509792328, + "learning_rate": 6.927070564461482e-05, + "loss": 1.7355, + "step": 12800 + }, + { + "epoch": 3.929097605893186, + "grad_norm": 0.2093631625175476, + "learning_rate": 6.926611899653516e-05, + "loss": 1.7691, + "step": 12801 + }, + { + "epoch": 3.9294045426642112, + "grad_norm": 0.34258076548576355, + "learning_rate": 6.926153215805642e-05, + "loss": 1.8398, + "step": 12802 + }, + { + "epoch": 3.9297114794352366, + "grad_norm": 0.39179500937461853, + "learning_rate": 6.925694512922391e-05, + "loss": 1.8229, + "step": 12803 + }, + { + "epoch": 3.9300184162062615, + "grad_norm": 0.36814743280410767, + "learning_rate": 6.9252357910083e-05, + "loss": 1.7759, + "step": 12804 + }, + { + "epoch": 3.930325352977287, + "grad_norm": 0.2659403085708618, + "learning_rate": 6.924777050067902e-05, + "loss": 1.7553, + "step": 12805 + }, + { + "epoch": 3.9306322897483117, + "grad_norm": 0.20617491006851196, + "learning_rate": 6.924318290105724e-05, + "loss": 1.7398, + "step": 12806 + }, + { + "epoch": 3.930939226519337, + "grad_norm": 0.23730522394180298, + "learning_rate": 6.923859511126309e-05, + "loss": 1.699, + "step": 12807 + }, + { + "epoch": 3.9312461632903624, + "grad_norm": 0.24865423142910004, + "learning_rate": 6.923400713134184e-05, + "loss": 1.7801, + "step": 12808 + }, + { + "epoch": 3.9315531000613873, + "grad_norm": 0.2495356798171997, + "learning_rate": 6.92294189613389e-05, + "loss": 1.803, + "step": 12809 + }, + { + "epoch": 3.9318600368324126, + "grad_norm": 0.24223244190216064, + "learning_rate": 6.922483060129955e-05, + "loss": 1.751, + "step": 12810 + }, + { + "epoch": 3.9321669736034375, + "grad_norm": 0.2541450262069702, + "learning_rate": 6.922024205126913e-05, + "loss": 1.7721, + "step": 12811 + }, + { + "epoch": 
3.932473910374463, + "grad_norm": 0.24528831243515015, + "learning_rate": 6.921565331129304e-05, + "loss": 1.792, + "step": 12812 + }, + { + "epoch": 3.932780847145488, + "grad_norm": 0.22789500653743744, + "learning_rate": 6.921106438141659e-05, + "loss": 1.8455, + "step": 12813 + }, + { + "epoch": 3.933087783916513, + "grad_norm": 0.26267170906066895, + "learning_rate": 6.920647526168515e-05, + "loss": 1.7254, + "step": 12814 + }, + { + "epoch": 3.9333947206875384, + "grad_norm": 0.23044808208942413, + "learning_rate": 6.920188595214406e-05, + "loss": 1.7217, + "step": 12815 + }, + { + "epoch": 3.9337016574585633, + "grad_norm": 0.2304011732339859, + "learning_rate": 6.919729645283867e-05, + "loss": 1.8121, + "step": 12816 + }, + { + "epoch": 3.9340085942295886, + "grad_norm": 0.21516792476177216, + "learning_rate": 6.919270676381435e-05, + "loss": 1.7305, + "step": 12817 + }, + { + "epoch": 3.934315531000614, + "grad_norm": 0.24698840081691742, + "learning_rate": 6.918811688511646e-05, + "loss": 1.7967, + "step": 12818 + }, + { + "epoch": 3.9346224677716393, + "grad_norm": 0.23132537305355072, + "learning_rate": 6.918352681679035e-05, + "loss": 1.7439, + "step": 12819 + }, + { + "epoch": 3.934929404542664, + "grad_norm": 0.2597793936729431, + "learning_rate": 6.917893655888139e-05, + "loss": 1.7882, + "step": 12820 + }, + { + "epoch": 3.9352363413136895, + "grad_norm": 0.23946607112884521, + "learning_rate": 6.917434611143493e-05, + "loss": 1.7991, + "step": 12821 + }, + { + "epoch": 3.9355432780847144, + "grad_norm": 0.25808244943618774, + "learning_rate": 6.916975547449634e-05, + "loss": 1.845, + "step": 12822 + }, + { + "epoch": 3.9358502148557397, + "grad_norm": 0.26082557439804077, + "learning_rate": 6.9165164648111e-05, + "loss": 1.7562, + "step": 12823 + }, + { + "epoch": 3.936157151626765, + "grad_norm": 0.24810053408145905, + "learning_rate": 6.916057363232425e-05, + "loss": 1.778, + "step": 12824 + }, + { + "epoch": 3.93646408839779, + "grad_norm": 
0.24168157577514648, + "learning_rate": 6.91559824271815e-05, + "loss": 1.7628, + "step": 12825 + }, + { + "epoch": 3.9367710251688153, + "grad_norm": 0.23800434172153473, + "learning_rate": 6.91513910327281e-05, + "loss": 1.8063, + "step": 12826 + }, + { + "epoch": 3.93707796193984, + "grad_norm": 0.23055073618888855, + "learning_rate": 6.914679944900944e-05, + "loss": 1.749, + "step": 12827 + }, + { + "epoch": 3.9373848987108655, + "grad_norm": 0.22455987334251404, + "learning_rate": 6.914220767607088e-05, + "loss": 1.7471, + "step": 12828 + }, + { + "epoch": 3.937691835481891, + "grad_norm": 0.21808378398418427, + "learning_rate": 6.913761571395778e-05, + "loss": 1.7503, + "step": 12829 + }, + { + "epoch": 3.937998772252916, + "grad_norm": 0.23136213421821594, + "learning_rate": 6.913302356271556e-05, + "loss": 1.752, + "step": 12830 + }, + { + "epoch": 3.938305709023941, + "grad_norm": 0.29579970240592957, + "learning_rate": 6.912843122238959e-05, + "loss": 1.8028, + "step": 12831 + }, + { + "epoch": 3.9386126457949664, + "grad_norm": 0.28578072786331177, + "learning_rate": 6.912383869302526e-05, + "loss": 1.8183, + "step": 12832 + }, + { + "epoch": 3.9389195825659913, + "grad_norm": 0.2616737186908722, + "learning_rate": 6.911924597466793e-05, + "loss": 1.8366, + "step": 12833 + }, + { + "epoch": 3.9392265193370166, + "grad_norm": 0.29275768995285034, + "learning_rate": 6.911465306736302e-05, + "loss": 1.731, + "step": 12834 + }, + { + "epoch": 3.939533456108042, + "grad_norm": 0.3300873041152954, + "learning_rate": 6.91100599711559e-05, + "loss": 1.8713, + "step": 12835 + }, + { + "epoch": 3.939840392879067, + "grad_norm": 0.2744643986225128, + "learning_rate": 6.910546668609195e-05, + "loss": 1.8479, + "step": 12836 + }, + { + "epoch": 3.940147329650092, + "grad_norm": 0.25248417258262634, + "learning_rate": 6.91008732122166e-05, + "loss": 1.7962, + "step": 12837 + }, + { + "epoch": 3.940454266421117, + "grad_norm": 0.3068546652793884, + "learning_rate": 
6.909627954957521e-05, + "loss": 1.759, + "step": 12838 + }, + { + "epoch": 3.9407612031921424, + "grad_norm": 0.3273559808731079, + "learning_rate": 6.909168569821321e-05, + "loss": 1.814, + "step": 12839 + }, + { + "epoch": 3.9410681399631677, + "grad_norm": 0.31192758679389954, + "learning_rate": 6.908709165817597e-05, + "loss": 1.7906, + "step": 12840 + }, + { + "epoch": 3.9413750767341926, + "grad_norm": 0.24487090110778809, + "learning_rate": 6.90824974295089e-05, + "loss": 1.8238, + "step": 12841 + }, + { + "epoch": 3.941682013505218, + "grad_norm": 0.24863721430301666, + "learning_rate": 6.907790301225743e-05, + "loss": 1.7651, + "step": 12842 + }, + { + "epoch": 3.941988950276243, + "grad_norm": 0.26555630564689636, + "learning_rate": 6.907330840646693e-05, + "loss": 1.8268, + "step": 12843 + }, + { + "epoch": 3.942295887047268, + "grad_norm": 0.2439817190170288, + "learning_rate": 6.906871361218281e-05, + "loss": 1.7291, + "step": 12844 + }, + { + "epoch": 3.9426028238182935, + "grad_norm": 0.2410304993391037, + "learning_rate": 6.906411862945048e-05, + "loss": 1.712, + "step": 12845 + }, + { + "epoch": 3.942909760589319, + "grad_norm": 0.28575149178504944, + "learning_rate": 6.905952345831537e-05, + "loss": 1.7269, + "step": 12846 + }, + { + "epoch": 3.9432166973603437, + "grad_norm": 0.3055815100669861, + "learning_rate": 6.905492809882286e-05, + "loss": 1.7234, + "step": 12847 + }, + { + "epoch": 3.943523634131369, + "grad_norm": 0.2762533724308014, + "learning_rate": 6.905033255101839e-05, + "loss": 1.7768, + "step": 12848 + }, + { + "epoch": 3.943830570902394, + "grad_norm": 0.22819125652313232, + "learning_rate": 6.904573681494738e-05, + "loss": 1.7416, + "step": 12849 + }, + { + "epoch": 3.9441375076734193, + "grad_norm": 0.21664194762706757, + "learning_rate": 6.904114089065523e-05, + "loss": 1.7506, + "step": 12850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.21935151517391205, + "learning_rate": 6.903654477818735e-05, + "loss": 
1.7522, + "step": 12851 + }, + { + "epoch": 3.9447513812154695, + "grad_norm": 0.2204175442457199, + "learning_rate": 6.903194847758918e-05, + "loss": 1.7753, + "step": 12852 + }, + { + "epoch": 3.945058317986495, + "grad_norm": 0.23130151629447937, + "learning_rate": 6.902735198890615e-05, + "loss": 1.7743, + "step": 12853 + }, + { + "epoch": 3.9453652547575198, + "grad_norm": 0.2548399567604065, + "learning_rate": 6.902275531218368e-05, + "loss": 1.8373, + "step": 12854 + }, + { + "epoch": 3.945672191528545, + "grad_norm": 0.2905479371547699, + "learning_rate": 6.901815844746718e-05, + "loss": 1.8336, + "step": 12855 + }, + { + "epoch": 3.9459791282995704, + "grad_norm": 0.2698945105075836, + "learning_rate": 6.90135613948021e-05, + "loss": 1.7498, + "step": 12856 + }, + { + "epoch": 3.9462860650705953, + "grad_norm": 0.24966828525066376, + "learning_rate": 6.900896415423387e-05, + "loss": 1.7664, + "step": 12857 + }, + { + "epoch": 3.9465930018416207, + "grad_norm": 0.23272784054279327, + "learning_rate": 6.90043667258079e-05, + "loss": 1.7742, + "step": 12858 + }, + { + "epoch": 3.9468999386126455, + "grad_norm": 0.2277698516845703, + "learning_rate": 6.899976910956965e-05, + "loss": 1.7465, + "step": 12859 + }, + { + "epoch": 3.947206875383671, + "grad_norm": 0.2376442402601242, + "learning_rate": 6.899517130556454e-05, + "loss": 1.7995, + "step": 12860 + }, + { + "epoch": 3.947513812154696, + "grad_norm": 0.25591593980789185, + "learning_rate": 6.899057331383802e-05, + "loss": 1.8017, + "step": 12861 + }, + { + "epoch": 3.9478207489257215, + "grad_norm": 0.2715262472629547, + "learning_rate": 6.898597513443551e-05, + "loss": 1.7967, + "step": 12862 + }, + { + "epoch": 3.9481276856967464, + "grad_norm": 0.20916256308555603, + "learning_rate": 6.898137676740246e-05, + "loss": 1.7711, + "step": 12863 + }, + { + "epoch": 3.9484346224677718, + "grad_norm": 0.2570229768753052, + "learning_rate": 6.897677821278435e-05, + "loss": 1.833, + "step": 12864 + }, + { + 
"epoch": 3.9487415592387967, + "grad_norm": 0.26343438029289246, + "learning_rate": 6.897217947062657e-05, + "loss": 1.7625, + "step": 12865 + }, + { + "epoch": 3.949048496009822, + "grad_norm": 0.23407024145126343, + "learning_rate": 6.896758054097459e-05, + "loss": 1.7211, + "step": 12866 + }, + { + "epoch": 3.9493554327808473, + "grad_norm": 0.2554715573787689, + "learning_rate": 6.896298142387387e-05, + "loss": 1.8548, + "step": 12867 + }, + { + "epoch": 3.949662369551872, + "grad_norm": 0.24143370985984802, + "learning_rate": 6.895838211936986e-05, + "loss": 1.7635, + "step": 12868 + }, + { + "epoch": 3.9499693063228976, + "grad_norm": 0.24634715914726257, + "learning_rate": 6.8953782627508e-05, + "loss": 1.8012, + "step": 12869 + }, + { + "epoch": 3.9502762430939224, + "grad_norm": 0.22740426659584045, + "learning_rate": 6.894918294833375e-05, + "loss": 1.7294, + "step": 12870 + }, + { + "epoch": 3.950583179864948, + "grad_norm": 0.2651631832122803, + "learning_rate": 6.894458308189257e-05, + "loss": 1.8289, + "step": 12871 + }, + { + "epoch": 3.950890116635973, + "grad_norm": 0.28693267703056335, + "learning_rate": 6.893998302822991e-05, + "loss": 1.8462, + "step": 12872 + }, + { + "epoch": 3.951197053406998, + "grad_norm": 0.26584213972091675, + "learning_rate": 6.893538278739125e-05, + "loss": 1.7621, + "step": 12873 + }, + { + "epoch": 3.9515039901780233, + "grad_norm": 0.29970669746398926, + "learning_rate": 6.893078235942203e-05, + "loss": 1.7659, + "step": 12874 + }, + { + "epoch": 3.9518109269490482, + "grad_norm": 0.2271152138710022, + "learning_rate": 6.892618174436771e-05, + "loss": 1.7151, + "step": 12875 + }, + { + "epoch": 3.9521178637200736, + "grad_norm": 0.24783682823181152, + "learning_rate": 6.892158094227379e-05, + "loss": 1.761, + "step": 12876 + }, + { + "epoch": 3.952424800491099, + "grad_norm": 0.2371140718460083, + "learning_rate": 6.891697995318573e-05, + "loss": 1.7557, + "step": 12877 + }, + { + "epoch": 3.9527317372621242, + 
"grad_norm": 0.29708394408226013, + "learning_rate": 6.891237877714896e-05, + "loss": 1.8629, + "step": 12878 + }, + { + "epoch": 3.953038674033149, + "grad_norm": 0.2724219262599945, + "learning_rate": 6.890777741420899e-05, + "loss": 1.7378, + "step": 12879 + }, + { + "epoch": 3.9533456108041745, + "grad_norm": 0.2227276861667633, + "learning_rate": 6.890317586441126e-05, + "loss": 1.6989, + "step": 12880 + }, + { + "epoch": 3.9536525475751993, + "grad_norm": 0.2546161413192749, + "learning_rate": 6.889857412780128e-05, + "loss": 1.8688, + "step": 12881 + }, + { + "epoch": 3.9539594843462247, + "grad_norm": 0.24882884323596954, + "learning_rate": 6.889397220442452e-05, + "loss": 1.8137, + "step": 12882 + }, + { + "epoch": 3.95426642111725, + "grad_norm": 0.2549113929271698, + "learning_rate": 6.888937009432644e-05, + "loss": 1.8366, + "step": 12883 + }, + { + "epoch": 3.954573357888275, + "grad_norm": 0.30032673478126526, + "learning_rate": 6.888476779755255e-05, + "loss": 1.8267, + "step": 12884 + }, + { + "epoch": 3.9548802946593002, + "grad_norm": 0.2887294292449951, + "learning_rate": 6.888016531414832e-05, + "loss": 1.8295, + "step": 12885 + }, + { + "epoch": 3.955187231430325, + "grad_norm": 0.2947406470775604, + "learning_rate": 6.88755626441592e-05, + "loss": 1.7713, + "step": 12886 + }, + { + "epoch": 3.9554941682013505, + "grad_norm": 0.2967108190059662, + "learning_rate": 6.887095978763072e-05, + "loss": 1.7636, + "step": 12887 + }, + { + "epoch": 3.955801104972376, + "grad_norm": 0.2495311200618744, + "learning_rate": 6.886635674460836e-05, + "loss": 1.8148, + "step": 12888 + }, + { + "epoch": 3.9561080417434007, + "grad_norm": 0.23367099463939667, + "learning_rate": 6.88617535151376e-05, + "loss": 1.7353, + "step": 12889 + }, + { + "epoch": 3.956414978514426, + "grad_norm": 0.36790570616722107, + "learning_rate": 6.885715009926395e-05, + "loss": 1.7853, + "step": 12890 + }, + { + "epoch": 3.9567219152854514, + "grad_norm": 0.5013020038604736, + 
"learning_rate": 6.885254649703287e-05, + "loss": 1.7923, + "step": 12891 + }, + { + "epoch": 3.9570288520564763, + "grad_norm": 0.4446276128292084, + "learning_rate": 6.884794270848988e-05, + "loss": 1.7504, + "step": 12892 + }, + { + "epoch": 3.9573357888275016, + "grad_norm": 0.2478526383638382, + "learning_rate": 6.88433387336805e-05, + "loss": 1.7629, + "step": 12893 + }, + { + "epoch": 3.957642725598527, + "grad_norm": 0.30111798644065857, + "learning_rate": 6.883873457265019e-05, + "loss": 1.8291, + "step": 12894 + }, + { + "epoch": 3.957949662369552, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.883413022544445e-05, + "loss": 1.7919, + "step": 12895 + }, + { + "epoch": 3.958256599140577, + "grad_norm": 0.2895318269729614, + "learning_rate": 6.882952569210881e-05, + "loss": 1.7467, + "step": 12896 + }, + { + "epoch": 3.958563535911602, + "grad_norm": 0.30391454696655273, + "learning_rate": 6.882492097268873e-05, + "loss": 1.8145, + "step": 12897 + }, + { + "epoch": 3.9588704726826274, + "grad_norm": 0.5033623576164246, + "learning_rate": 6.882031606722977e-05, + "loss": 1.8231, + "step": 12898 + }, + { + "epoch": 3.9591774094536527, + "grad_norm": 0.5351777672767639, + "learning_rate": 6.881571097577742e-05, + "loss": 1.807, + "step": 12899 + }, + { + "epoch": 3.9594843462246776, + "grad_norm": 0.35540491342544556, + "learning_rate": 6.881110569837719e-05, + "loss": 1.7626, + "step": 12900 + }, + { + "epoch": 3.959791282995703, + "grad_norm": 0.22447600960731506, + "learning_rate": 6.880650023507457e-05, + "loss": 1.7392, + "step": 12901 + }, + { + "epoch": 3.960098219766728, + "grad_norm": 0.44619202613830566, + "learning_rate": 6.88018945859151e-05, + "loss": 1.8138, + "step": 12902 + }, + { + "epoch": 3.960405156537753, + "grad_norm": 0.41381633281707764, + "learning_rate": 6.879728875094428e-05, + "loss": 1.7676, + "step": 12903 + }, + { + "epoch": 3.9607120933087785, + "grad_norm": 0.2601528465747833, + "learning_rate": 6.879268273020764e-05, 
+ "loss": 1.8406, + "step": 12904 + }, + { + "epoch": 3.961019030079804, + "grad_norm": 0.3309035003185272, + "learning_rate": 6.878807652375071e-05, + "loss": 1.7673, + "step": 12905 + }, + { + "epoch": 3.9613259668508287, + "grad_norm": 0.5281669497489929, + "learning_rate": 6.878347013161899e-05, + "loss": 1.7686, + "step": 12906 + }, + { + "epoch": 3.961632903621854, + "grad_norm": 0.5397645831108093, + "learning_rate": 6.8778863553858e-05, + "loss": 1.8575, + "step": 12907 + }, + { + "epoch": 3.961939840392879, + "grad_norm": 0.329485684633255, + "learning_rate": 6.877425679051327e-05, + "loss": 1.8185, + "step": 12908 + }, + { + "epoch": 3.9622467771639043, + "grad_norm": 0.3012789487838745, + "learning_rate": 6.876964984163034e-05, + "loss": 1.7962, + "step": 12909 + }, + { + "epoch": 3.9625537139349296, + "grad_norm": 0.5596817135810852, + "learning_rate": 6.876504270725472e-05, + "loss": 1.7972, + "step": 12910 + }, + { + "epoch": 3.9628606507059545, + "grad_norm": 0.5374729633331299, + "learning_rate": 6.876043538743197e-05, + "loss": 1.7863, + "step": 12911 + }, + { + "epoch": 3.96316758747698, + "grad_norm": 0.24617290496826172, + "learning_rate": 6.875582788220757e-05, + "loss": 1.7555, + "step": 12912 + }, + { + "epoch": 3.9634745242480047, + "grad_norm": 0.3493972420692444, + "learning_rate": 6.875122019162712e-05, + "loss": 1.8595, + "step": 12913 + }, + { + "epoch": 3.96378146101903, + "grad_norm": 0.4293089807033539, + "learning_rate": 6.874661231573609e-05, + "loss": 1.7647, + "step": 12914 + }, + { + "epoch": 3.9640883977900554, + "grad_norm": 0.30602574348449707, + "learning_rate": 6.874200425458006e-05, + "loss": 1.7122, + "step": 12915 + }, + { + "epoch": 3.9643953345610803, + "grad_norm": 0.22776013612747192, + "learning_rate": 6.873739600820457e-05, + "loss": 1.7136, + "step": 12916 + }, + { + "epoch": 3.9647022713321056, + "grad_norm": 0.3727327585220337, + "learning_rate": 6.873278757665513e-05, + "loss": 1.8314, + "step": 12917 + }, + { 
+ "epoch": 3.9650092081031305, + "grad_norm": 0.35110536217689514, + "learning_rate": 6.872817895997733e-05, + "loss": 1.7506, + "step": 12918 + }, + { + "epoch": 3.965316144874156, + "grad_norm": 0.275560587644577, + "learning_rate": 6.872357015821666e-05, + "loss": 1.7865, + "step": 12919 + }, + { + "epoch": 3.965623081645181, + "grad_norm": 0.2686980366706848, + "learning_rate": 6.871896117141873e-05, + "loss": 1.8431, + "step": 12920 + }, + { + "epoch": 3.9659300184162065, + "grad_norm": 0.3299664556980133, + "learning_rate": 6.871435199962901e-05, + "loss": 1.7988, + "step": 12921 + }, + { + "epoch": 3.9662369551872314, + "grad_norm": 0.2833637297153473, + "learning_rate": 6.870974264289313e-05, + "loss": 1.6993, + "step": 12922 + }, + { + "epoch": 3.9665438919582567, + "grad_norm": 0.25062620639801025, + "learning_rate": 6.870513310125659e-05, + "loss": 1.7814, + "step": 12923 + }, + { + "epoch": 3.9668508287292816, + "grad_norm": 0.26609909534454346, + "learning_rate": 6.870052337476498e-05, + "loss": 1.7871, + "step": 12924 + }, + { + "epoch": 3.967157765500307, + "grad_norm": 0.22760890424251556, + "learning_rate": 6.869591346346382e-05, + "loss": 1.7941, + "step": 12925 + }, + { + "epoch": 3.9674647022713323, + "grad_norm": 0.2845582067966461, + "learning_rate": 6.869130336739869e-05, + "loss": 1.8215, + "step": 12926 + }, + { + "epoch": 3.967771639042357, + "grad_norm": 0.254948228597641, + "learning_rate": 6.868669308661514e-05, + "loss": 1.7515, + "step": 12927 + }, + { + "epoch": 3.9680785758133825, + "grad_norm": 0.2372167855501175, + "learning_rate": 6.868208262115875e-05, + "loss": 1.7524, + "step": 12928 + }, + { + "epoch": 3.9683855125844074, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.867747197107506e-05, + "loss": 1.8139, + "step": 12929 + }, + { + "epoch": 3.9686924493554327, + "grad_norm": 0.2617839276790619, + "learning_rate": 6.867286113640965e-05, + "loss": 1.7388, + "step": 12930 + }, + { + "epoch": 3.968999386126458, + 
"grad_norm": 0.22749558091163635, + "learning_rate": 6.866825011720807e-05, + "loss": 1.7421, + "step": 12931 + }, + { + "epoch": 3.969306322897483, + "grad_norm": 0.27737462520599365, + "learning_rate": 6.86636389135159e-05, + "loss": 1.7977, + "step": 12932 + }, + { + "epoch": 3.9696132596685083, + "grad_norm": 0.3331063985824585, + "learning_rate": 6.865902752537871e-05, + "loss": 1.7925, + "step": 12933 + }, + { + "epoch": 3.969920196439533, + "grad_norm": 0.24229519069194794, + "learning_rate": 6.86544159528421e-05, + "loss": 1.7782, + "step": 12934 + }, + { + "epoch": 3.9702271332105585, + "grad_norm": 0.29494860768318176, + "learning_rate": 6.86498041959516e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 3.970534069981584, + "grad_norm": 0.26064008474349976, + "learning_rate": 6.86451922547528e-05, + "loss": 1.7161, + "step": 12936 + }, + { + "epoch": 3.970841006752609, + "grad_norm": 0.2656785547733307, + "learning_rate": 6.864058012929129e-05, + "loss": 1.8154, + "step": 12937 + }, + { + "epoch": 3.971147943523634, + "grad_norm": 0.21170997619628906, + "learning_rate": 6.863596781961263e-05, + "loss": 1.7614, + "step": 12938 + }, + { + "epoch": 3.9714548802946594, + "grad_norm": 0.21709072589874268, + "learning_rate": 6.863135532576241e-05, + "loss": 1.7896, + "step": 12939 + }, + { + "epoch": 3.9717618170656843, + "grad_norm": 0.2361367791891098, + "learning_rate": 6.862674264778623e-05, + "loss": 1.7775, + "step": 12940 + }, + { + "epoch": 3.9720687538367097, + "grad_norm": 0.22042550146579742, + "learning_rate": 6.862212978572967e-05, + "loss": 1.7781, + "step": 12941 + }, + { + "epoch": 3.972375690607735, + "grad_norm": 0.2535422146320343, + "learning_rate": 6.86175167396383e-05, + "loss": 1.7665, + "step": 12942 + }, + { + "epoch": 3.97268262737876, + "grad_norm": 0.23741906881332397, + "learning_rate": 6.861290350955771e-05, + "loss": 1.7829, + "step": 12943 + }, + { + "epoch": 3.972989564149785, + "grad_norm": 0.23789910972118378, + 
"learning_rate": 6.860829009553351e-05, + "loss": 1.7745, + "step": 12944 + }, + { + "epoch": 3.97329650092081, + "grad_norm": 0.26867765188217163, + "learning_rate": 6.860367649761127e-05, + "loss": 1.7239, + "step": 12945 + }, + { + "epoch": 3.9736034376918354, + "grad_norm": 0.3211663067340851, + "learning_rate": 6.85990627158366e-05, + "loss": 1.7976, + "step": 12946 + }, + { + "epoch": 3.9739103744628608, + "grad_norm": 0.26177310943603516, + "learning_rate": 6.85944487502551e-05, + "loss": 1.7446, + "step": 12947 + }, + { + "epoch": 3.9742173112338857, + "grad_norm": 0.23622745275497437, + "learning_rate": 6.858983460091234e-05, + "loss": 1.7824, + "step": 12948 + }, + { + "epoch": 3.974524248004911, + "grad_norm": 0.24372988939285278, + "learning_rate": 6.858522026785395e-05, + "loss": 1.8014, + "step": 12949 + }, + { + "epoch": 3.974831184775936, + "grad_norm": 0.2566998600959778, + "learning_rate": 6.85806057511255e-05, + "loss": 1.742, + "step": 12950 + }, + { + "epoch": 3.9751381215469612, + "grad_norm": 0.24418365955352783, + "learning_rate": 6.857599105077264e-05, + "loss": 1.7331, + "step": 12951 + }, + { + "epoch": 3.9754450583179866, + "grad_norm": 0.2260327935218811, + "learning_rate": 6.857137616684094e-05, + "loss": 1.7173, + "step": 12952 + }, + { + "epoch": 3.975751995089012, + "grad_norm": 0.277044415473938, + "learning_rate": 6.856676109937602e-05, + "loss": 1.7255, + "step": 12953 + }, + { + "epoch": 3.976058931860037, + "grad_norm": 0.228300079703331, + "learning_rate": 6.856214584842348e-05, + "loss": 1.7796, + "step": 12954 + }, + { + "epoch": 3.976365868631062, + "grad_norm": 0.2246638983488083, + "learning_rate": 6.855753041402893e-05, + "loss": 1.7458, + "step": 12955 + }, + { + "epoch": 3.976672805402087, + "grad_norm": 0.22235621511936188, + "learning_rate": 6.855291479623799e-05, + "loss": 1.7585, + "step": 12956 + }, + { + "epoch": 3.9769797421731123, + "grad_norm": 0.23710694909095764, + "learning_rate": 6.854829899509627e-05, + 
"loss": 1.767, + "step": 12957 + }, + { + "epoch": 3.9772866789441377, + "grad_norm": 0.2527346611022949, + "learning_rate": 6.854368301064939e-05, + "loss": 1.828, + "step": 12958 + }, + { + "epoch": 3.9775936157151626, + "grad_norm": 0.25032514333724976, + "learning_rate": 6.853906684294298e-05, + "loss": 1.8533, + "step": 12959 + }, + { + "epoch": 3.977900552486188, + "grad_norm": 0.2346320003271103, + "learning_rate": 6.853445049202262e-05, + "loss": 1.8046, + "step": 12960 + }, + { + "epoch": 3.978207489257213, + "grad_norm": 0.22576460242271423, + "learning_rate": 6.852983395793398e-05, + "loss": 1.7502, + "step": 12961 + }, + { + "epoch": 3.978514426028238, + "grad_norm": 0.2230147123336792, + "learning_rate": 6.852521724072266e-05, + "loss": 1.7362, + "step": 12962 + }, + { + "epoch": 3.9788213627992635, + "grad_norm": 0.2339705526828766, + "learning_rate": 6.852060034043425e-05, + "loss": 1.763, + "step": 12963 + }, + { + "epoch": 3.979128299570289, + "grad_norm": 0.24511271715164185, + "learning_rate": 6.851598325711446e-05, + "loss": 1.7988, + "step": 12964 + }, + { + "epoch": 3.9794352363413137, + "grad_norm": 0.2927285134792328, + "learning_rate": 6.851136599080885e-05, + "loss": 1.8346, + "step": 12965 + }, + { + "epoch": 3.979742173112339, + "grad_norm": 0.2593212425708771, + "learning_rate": 6.850674854156305e-05, + "loss": 1.7368, + "step": 12966 + }, + { + "epoch": 3.980049109883364, + "grad_norm": 0.3013291656970978, + "learning_rate": 6.850213090942275e-05, + "loss": 1.7911, + "step": 12967 + }, + { + "epoch": 3.9803560466543892, + "grad_norm": 0.3420047163963318, + "learning_rate": 6.849751309443352e-05, + "loss": 1.7899, + "step": 12968 + }, + { + "epoch": 3.9806629834254146, + "grad_norm": 0.2901746928691864, + "learning_rate": 6.849289509664105e-05, + "loss": 1.8244, + "step": 12969 + }, + { + "epoch": 3.9809699201964395, + "grad_norm": 0.2389298677444458, + "learning_rate": 6.848827691609093e-05, + "loss": 1.7116, + "step": 12970 + }, + { + 
"epoch": 3.981276856967465, + "grad_norm": 0.3153960704803467, + "learning_rate": 6.848365855282882e-05, + "loss": 1.7665, + "step": 12971 + }, + { + "epoch": 3.9815837937384897, + "grad_norm": 0.3162175118923187, + "learning_rate": 6.847904000690036e-05, + "loss": 1.7722, + "step": 12972 + }, + { + "epoch": 3.981890730509515, + "grad_norm": 0.27458643913269043, + "learning_rate": 6.847442127835122e-05, + "loss": 1.8095, + "step": 12973 + }, + { + "epoch": 3.9821976672805404, + "grad_norm": 0.22330710291862488, + "learning_rate": 6.846980236722699e-05, + "loss": 1.7179, + "step": 12974 + }, + { + "epoch": 3.9825046040515653, + "grad_norm": 0.2940923869609833, + "learning_rate": 6.846518327357339e-05, + "loss": 1.7363, + "step": 12975 + }, + { + "epoch": 3.9828115408225906, + "grad_norm": 0.26479849219322205, + "learning_rate": 6.846056399743599e-05, + "loss": 1.7788, + "step": 12976 + }, + { + "epoch": 3.9831184775936155, + "grad_norm": 0.24145057797431946, + "learning_rate": 6.845594453886048e-05, + "loss": 1.7825, + "step": 12977 + }, + { + "epoch": 3.983425414364641, + "grad_norm": 0.2795869708061218, + "learning_rate": 6.845132489789252e-05, + "loss": 1.7705, + "step": 12978 + }, + { + "epoch": 3.983732351135666, + "grad_norm": 0.3117202818393707, + "learning_rate": 6.844670507457776e-05, + "loss": 1.8183, + "step": 12979 + }, + { + "epoch": 3.9840392879066915, + "grad_norm": 0.2666899263858795, + "learning_rate": 6.844208506896184e-05, + "loss": 1.7434, + "step": 12980 + }, + { + "epoch": 3.9843462246777164, + "grad_norm": 0.24682332575321198, + "learning_rate": 6.843746488109042e-05, + "loss": 1.751, + "step": 12981 + }, + { + "epoch": 3.9846531614487417, + "grad_norm": 0.2558208703994751, + "learning_rate": 6.843284451100916e-05, + "loss": 1.7983, + "step": 12982 + }, + { + "epoch": 3.9849600982197666, + "grad_norm": 0.4236481189727783, + "learning_rate": 6.842822395876374e-05, + "loss": 1.8584, + "step": 12983 + }, + { + "epoch": 3.985267034990792, + 
"grad_norm": 0.4931485950946808, + "learning_rate": 6.84236032243998e-05, + "loss": 1.7617, + "step": 12984 + }, + { + "epoch": 3.9855739717618173, + "grad_norm": 0.37793654203414917, + "learning_rate": 6.841898230796302e-05, + "loss": 1.7411, + "step": 12985 + }, + { + "epoch": 3.985880908532842, + "grad_norm": 0.2093842774629593, + "learning_rate": 6.841436120949906e-05, + "loss": 1.772, + "step": 12986 + }, + { + "epoch": 3.9861878453038675, + "grad_norm": 0.4065552055835724, + "learning_rate": 6.840973992905359e-05, + "loss": 1.7675, + "step": 12987 + }, + { + "epoch": 3.9864947820748924, + "grad_norm": 0.5334183573722839, + "learning_rate": 6.840511846667228e-05, + "loss": 1.7872, + "step": 12988 + }, + { + "epoch": 3.9868017188459177, + "grad_norm": 0.378974974155426, + "learning_rate": 6.84004968224008e-05, + "loss": 1.8288, + "step": 12989 + }, + { + "epoch": 3.987108655616943, + "grad_norm": 0.22518309950828552, + "learning_rate": 6.839587499628483e-05, + "loss": 1.7715, + "step": 12990 + }, + { + "epoch": 3.987415592387968, + "grad_norm": 0.4270850718021393, + "learning_rate": 6.839125298837003e-05, + "loss": 1.7797, + "step": 12991 + }, + { + "epoch": 3.9877225291589933, + "grad_norm": 0.4629896879196167, + "learning_rate": 6.838663079870211e-05, + "loss": 1.7936, + "step": 12992 + }, + { + "epoch": 3.988029465930018, + "grad_norm": 0.29273948073387146, + "learning_rate": 6.838200842732672e-05, + "loss": 1.8264, + "step": 12993 + }, + { + "epoch": 3.9883364027010435, + "grad_norm": 0.31575852632522583, + "learning_rate": 6.837738587428954e-05, + "loss": 1.8043, + "step": 12994 + }, + { + "epoch": 3.988643339472069, + "grad_norm": 0.40602433681488037, + "learning_rate": 6.837276313963627e-05, + "loss": 1.7409, + "step": 12995 + }, + { + "epoch": 3.988950276243094, + "grad_norm": 0.23413142561912537, + "learning_rate": 6.836814022341259e-05, + "loss": 1.8585, + "step": 12996 + }, + { + "epoch": 3.989257213014119, + "grad_norm": 0.3518814444541931, + 
"learning_rate": 6.836351712566416e-05, + "loss": 1.7768, + "step": 12997 + }, + { + "epoch": 3.9895641497851444, + "grad_norm": 0.3811505436897278, + "learning_rate": 6.83588938464367e-05, + "loss": 1.7738, + "step": 12998 + }, + { + "epoch": 3.9898710865561693, + "grad_norm": 0.2516780197620392, + "learning_rate": 6.835427038577589e-05, + "loss": 1.7351, + "step": 12999 + }, + { + "epoch": 3.9901780233271946, + "grad_norm": 0.23704510927200317, + "learning_rate": 6.834964674372744e-05, + "loss": 1.7907, + "step": 13000 + }, + { + "epoch": 3.99048496009822, + "grad_norm": 0.2890201807022095, + "learning_rate": 6.8345022920337e-05, + "loss": 1.9546, + "step": 13001 + }, + { + "epoch": 3.990791896869245, + "grad_norm": 0.2678101360797882, + "learning_rate": 6.834039891565031e-05, + "loss": 1.7338, + "step": 13002 + }, + { + "epoch": 3.99109883364027, + "grad_norm": 0.31726256012916565, + "learning_rate": 6.833577472971304e-05, + "loss": 1.8464, + "step": 13003 + }, + { + "epoch": 3.991405770411295, + "grad_norm": 0.28112682700157166, + "learning_rate": 6.83311503625709e-05, + "loss": 1.7427, + "step": 13004 + }, + { + "epoch": 3.9917127071823204, + "grad_norm": 0.2651563584804535, + "learning_rate": 6.832652581426958e-05, + "loss": 1.8117, + "step": 13005 + }, + { + "epoch": 3.9920196439533457, + "grad_norm": 0.3095388114452362, + "learning_rate": 6.83219010848548e-05, + "loss": 1.8286, + "step": 13006 + }, + { + "epoch": 3.9923265807243706, + "grad_norm": 0.24704942107200623, + "learning_rate": 6.831727617437225e-05, + "loss": 1.77, + "step": 13007 + }, + { + "epoch": 3.992633517495396, + "grad_norm": 0.24868519604206085, + "learning_rate": 6.831265108286764e-05, + "loss": 1.8129, + "step": 13008 + }, + { + "epoch": 3.992940454266421, + "grad_norm": 0.26511049270629883, + "learning_rate": 6.830802581038669e-05, + "loss": 1.7539, + "step": 13009 + }, + { + "epoch": 3.993247391037446, + "grad_norm": 0.2823421061038971, + "learning_rate": 6.830340035697508e-05, + 
"loss": 1.8068, + "step": 13010 + }, + { + "epoch": 3.9935543278084715, + "grad_norm": 0.28526121377944946, + "learning_rate": 6.829877472267856e-05, + "loss": 1.764, + "step": 13011 + }, + { + "epoch": 3.993861264579497, + "grad_norm": 0.2576456069946289, + "learning_rate": 6.829414890754281e-05, + "loss": 1.728, + "step": 13012 + }, + { + "epoch": 3.9941682013505218, + "grad_norm": 0.27154842019081116, + "learning_rate": 6.828952291161356e-05, + "loss": 1.797, + "step": 13013 + }, + { + "epoch": 3.994475138121547, + "grad_norm": 0.3129710555076599, + "learning_rate": 6.828489673493652e-05, + "loss": 1.769, + "step": 13014 + }, + { + "epoch": 3.994782074892572, + "grad_norm": 0.40118902921676636, + "learning_rate": 6.828027037755742e-05, + "loss": 1.8029, + "step": 13015 + }, + { + "epoch": 3.9950890116635973, + "grad_norm": 0.33228442072868347, + "learning_rate": 6.827564383952197e-05, + "loss": 1.7295, + "step": 13016 + }, + { + "epoch": 3.9953959484346226, + "grad_norm": 0.218771830201149, + "learning_rate": 6.827101712087591e-05, + "loss": 1.7693, + "step": 13017 + }, + { + "epoch": 3.9957028852056475, + "grad_norm": 0.31354373693466187, + "learning_rate": 6.826639022166492e-05, + "loss": 1.743, + "step": 13018 + }, + { + "epoch": 3.996009821976673, + "grad_norm": 0.3584701418876648, + "learning_rate": 6.826176314193478e-05, + "loss": 1.7597, + "step": 13019 + }, + { + "epoch": 3.9963167587476978, + "grad_norm": 0.2692064344882965, + "learning_rate": 6.82571358817312e-05, + "loss": 1.7871, + "step": 13020 + }, + { + "epoch": 3.996623695518723, + "grad_norm": 0.3064020276069641, + "learning_rate": 6.825250844109987e-05, + "loss": 1.7858, + "step": 13021 + }, + { + "epoch": 3.9969306322897484, + "grad_norm": 0.29913413524627686, + "learning_rate": 6.824788082008657e-05, + "loss": 1.7773, + "step": 13022 + }, + { + "epoch": 3.9972375690607733, + "grad_norm": 0.2682165801525116, + "learning_rate": 6.824325301873703e-05, + "loss": 1.8321, + "step": 13023 + }, + { + 
"epoch": 3.9975445058317987, + "grad_norm": 0.3274376690387726, + "learning_rate": 6.823862503709694e-05, + "loss": 1.8514, + "step": 13024 + }, + { + "epoch": 3.9978514426028235, + "grad_norm": 0.29828041791915894, + "learning_rate": 6.823399687521211e-05, + "loss": 1.7923, + "step": 13025 + }, + { + "epoch": 3.998158379373849, + "grad_norm": 0.22339288890361786, + "learning_rate": 6.82293685331282e-05, + "loss": 1.756, + "step": 13026 + }, + { + "epoch": 3.998465316144874, + "grad_norm": 0.2254658192396164, + "learning_rate": 6.8224740010891e-05, + "loss": 1.7392, + "step": 13027 + }, + { + "epoch": 3.9987722529158995, + "grad_norm": 0.24932752549648285, + "learning_rate": 6.822011130854624e-05, + "loss": 1.7538, + "step": 13028 + }, + { + "epoch": 3.9990791896869244, + "grad_norm": 0.21429690718650818, + "learning_rate": 6.821548242613966e-05, + "loss": 1.7746, + "step": 13029 + }, + { + "epoch": 3.9993861264579498, + "grad_norm": 0.25503116846084595, + "learning_rate": 6.8210853363717e-05, + "loss": 1.814, + "step": 13030 + }, + { + "epoch": 3.9996930632289747, + "grad_norm": 0.23168155550956726, + "learning_rate": 6.820622412132402e-05, + "loss": 1.769, + "step": 13031 + }, + { + "epoch": 4.0, + "grad_norm": 0.2252223789691925, + "learning_rate": 6.820159469900645e-05, + "loss": 1.7782, + "step": 13032 + }, + { + "epoch": 4.000306936771025, + "grad_norm": 0.1996588408946991, + "learning_rate": 6.819696509681007e-05, + "loss": 1.6839, + "step": 13033 + }, + { + "epoch": 4.000613873542051, + "grad_norm": 0.22297053039073944, + "learning_rate": 6.81923353147806e-05, + "loss": 1.7767, + "step": 13034 + }, + { + "epoch": 4.000920810313075, + "grad_norm": 0.25867611169815063, + "learning_rate": 6.818770535296381e-05, + "loss": 1.8623, + "step": 13035 + }, + { + "epoch": 4.0012277470841005, + "grad_norm": 0.2173648178577423, + "learning_rate": 6.818307521140547e-05, + "loss": 1.8034, + "step": 13036 + }, + { + "epoch": 4.001534683855126, + "grad_norm": 
0.23634609580039978, + "learning_rate": 6.81784448901513e-05, + "loss": 1.7503, + "step": 13037 + }, + { + "epoch": 4.001841620626151, + "grad_norm": 0.2626810073852539, + "learning_rate": 6.81738143892471e-05, + "loss": 1.8116, + "step": 13038 + }, + { + "epoch": 4.0021485573971765, + "grad_norm": 0.27888983488082886, + "learning_rate": 6.816918370873861e-05, + "loss": 1.8032, + "step": 13039 + }, + { + "epoch": 4.002455494168202, + "grad_norm": 0.275038480758667, + "learning_rate": 6.816455284867162e-05, + "loss": 1.7445, + "step": 13040 + }, + { + "epoch": 4.002762430939226, + "grad_norm": 0.3475828170776367, + "learning_rate": 6.815992180909184e-05, + "loss": 1.7404, + "step": 13041 + }, + { + "epoch": 4.003069367710252, + "grad_norm": 0.27314287424087524, + "learning_rate": 6.815529059004507e-05, + "loss": 1.8333, + "step": 13042 + }, + { + "epoch": 4.003376304481277, + "grad_norm": 0.34846973419189453, + "learning_rate": 6.815065919157709e-05, + "loss": 1.7921, + "step": 13043 + }, + { + "epoch": 4.003683241252302, + "grad_norm": 0.4191788136959076, + "learning_rate": 6.814602761373365e-05, + "loss": 1.8018, + "step": 13044 + }, + { + "epoch": 4.003990178023328, + "grad_norm": 0.2655608057975769, + "learning_rate": 6.814139585656055e-05, + "loss": 1.7638, + "step": 13045 + }, + { + "epoch": 4.004297114794352, + "grad_norm": 0.25938618183135986, + "learning_rate": 6.813676392010353e-05, + "loss": 1.794, + "step": 13046 + }, + { + "epoch": 4.004604051565377, + "grad_norm": 0.3464813828468323, + "learning_rate": 6.813213180440837e-05, + "loss": 1.8662, + "step": 13047 + }, + { + "epoch": 4.004910988336403, + "grad_norm": 0.30185338854789734, + "learning_rate": 6.812749950952087e-05, + "loss": 1.8029, + "step": 13048 + }, + { + "epoch": 4.005217925107428, + "grad_norm": 0.23291908204555511, + "learning_rate": 6.812286703548678e-05, + "loss": 1.7365, + "step": 13049 + }, + { + "epoch": 4.005524861878453, + "grad_norm": 0.3542841374874115, + "learning_rate": 
6.811823438235189e-05, + "loss": 1.8674, + "step": 13050 + }, + { + "epoch": 4.005831798649478, + "grad_norm": 0.2914685606956482, + "learning_rate": 6.811360155016202e-05, + "loss": 1.8306, + "step": 13051 + }, + { + "epoch": 4.006138735420503, + "grad_norm": 0.24888737499713898, + "learning_rate": 6.810896853896289e-05, + "loss": 1.7767, + "step": 13052 + }, + { + "epoch": 4.0064456721915285, + "grad_norm": 0.2977537512779236, + "learning_rate": 6.810433534880033e-05, + "loss": 1.8227, + "step": 13053 + }, + { + "epoch": 4.006752608962554, + "grad_norm": 0.3367510735988617, + "learning_rate": 6.809970197972013e-05, + "loss": 1.734, + "step": 13054 + }, + { + "epoch": 4.007059545733579, + "grad_norm": 0.28098800778388977, + "learning_rate": 6.809506843176806e-05, + "loss": 1.7032, + "step": 13055 + }, + { + "epoch": 4.0073664825046045, + "grad_norm": 0.24016784131526947, + "learning_rate": 6.809043470498991e-05, + "loss": 1.7863, + "step": 13056 + }, + { + "epoch": 4.007673419275629, + "grad_norm": 0.2883957624435425, + "learning_rate": 6.808580079943148e-05, + "loss": 1.7342, + "step": 13057 + }, + { + "epoch": 4.007980356046654, + "grad_norm": 0.3069116473197937, + "learning_rate": 6.808116671513856e-05, + "loss": 1.8544, + "step": 13058 + }, + { + "epoch": 4.00828729281768, + "grad_norm": 0.24113236367702484, + "learning_rate": 6.807653245215697e-05, + "loss": 1.7692, + "step": 13059 + }, + { + "epoch": 4.008594229588705, + "grad_norm": 0.2651619017124176, + "learning_rate": 6.807189801053249e-05, + "loss": 1.8096, + "step": 13060 + }, + { + "epoch": 4.00890116635973, + "grad_norm": 0.2636481523513794, + "learning_rate": 6.806726339031092e-05, + "loss": 1.8062, + "step": 13061 + }, + { + "epoch": 4.009208103130755, + "grad_norm": 0.22691169381141663, + "learning_rate": 6.806262859153807e-05, + "loss": 1.7001, + "step": 13062 + }, + { + "epoch": 4.00951503990178, + "grad_norm": 0.23288170993328094, + "learning_rate": 6.805799361425972e-05, + "loss": 1.7508, + 
"step": 13063 + }, + { + "epoch": 4.009821976672805, + "grad_norm": 0.243272602558136, + "learning_rate": 6.80533584585217e-05, + "loss": 1.7797, + "step": 13064 + }, + { + "epoch": 4.010128913443831, + "grad_norm": 0.24594646692276, + "learning_rate": 6.80487231243698e-05, + "loss": 1.7894, + "step": 13065 + }, + { + "epoch": 4.010435850214856, + "grad_norm": 0.21726086735725403, + "learning_rate": 6.804408761184986e-05, + "loss": 1.7472, + "step": 13066 + }, + { + "epoch": 4.0107427869858805, + "grad_norm": 0.2262321561574936, + "learning_rate": 6.803945192100767e-05, + "loss": 1.7563, + "step": 13067 + }, + { + "epoch": 4.011049723756906, + "grad_norm": 0.2449522763490677, + "learning_rate": 6.803481605188903e-05, + "loss": 1.7282, + "step": 13068 + }, + { + "epoch": 4.011356660527931, + "grad_norm": 0.2281760573387146, + "learning_rate": 6.803018000453975e-05, + "loss": 1.8191, + "step": 13069 + }, + { + "epoch": 4.0116635972989565, + "grad_norm": 0.3039850890636444, + "learning_rate": 6.80255437790057e-05, + "loss": 1.8258, + "step": 13070 + }, + { + "epoch": 4.011970534069982, + "grad_norm": 0.3978467881679535, + "learning_rate": 6.802090737533264e-05, + "loss": 1.7338, + "step": 13071 + }, + { + "epoch": 4.012277470841007, + "grad_norm": 0.29175812005996704, + "learning_rate": 6.801627079356641e-05, + "loss": 1.7754, + "step": 13072 + }, + { + "epoch": 4.012584407612032, + "grad_norm": 0.24228449165821075, + "learning_rate": 6.801163403375285e-05, + "loss": 1.7624, + "step": 13073 + }, + { + "epoch": 4.012891344383057, + "grad_norm": 0.34527531266212463, + "learning_rate": 6.800699709593776e-05, + "loss": 1.87, + "step": 13074 + }, + { + "epoch": 4.013198281154082, + "grad_norm": 0.1995161920785904, + "learning_rate": 6.800235998016696e-05, + "loss": 1.7253, + "step": 13075 + }, + { + "epoch": 4.013505217925108, + "grad_norm": 0.3509151339530945, + "learning_rate": 6.799772268648628e-05, + "loss": 1.8013, + "step": 13076 + }, + { + "epoch": 
4.013812154696133, + "grad_norm": 0.38569679856300354, + "learning_rate": 6.799308521494156e-05, + "loss": 1.7761, + "step": 13077 + }, + { + "epoch": 4.014119091467157, + "grad_norm": 0.2636256814002991, + "learning_rate": 6.798844756557865e-05, + "loss": 1.8101, + "step": 13078 + }, + { + "epoch": 4.014426028238183, + "grad_norm": 0.2570696473121643, + "learning_rate": 6.798380973844335e-05, + "loss": 1.7561, + "step": 13079 + }, + { + "epoch": 4.014732965009208, + "grad_norm": 0.38540002703666687, + "learning_rate": 6.797917173358148e-05, + "loss": 1.7893, + "step": 13080 + }, + { + "epoch": 4.015039901780233, + "grad_norm": 0.2974525988101959, + "learning_rate": 6.79745335510389e-05, + "loss": 1.8331, + "step": 13081 + }, + { + "epoch": 4.015346838551259, + "grad_norm": 0.2563362419605255, + "learning_rate": 6.796989519086146e-05, + "loss": 1.7784, + "step": 13082 + }, + { + "epoch": 4.015653775322283, + "grad_norm": 0.37037795782089233, + "learning_rate": 6.7965256653095e-05, + "loss": 1.7947, + "step": 13083 + }, + { + "epoch": 4.0159607120933085, + "grad_norm": 0.4145336449146271, + "learning_rate": 6.796061793778531e-05, + "loss": 1.7633, + "step": 13084 + }, + { + "epoch": 4.016267648864334, + "grad_norm": 0.32278406620025635, + "learning_rate": 6.795597904497828e-05, + "loss": 1.7827, + "step": 13085 + }, + { + "epoch": 4.016574585635359, + "grad_norm": 0.26466837525367737, + "learning_rate": 6.795133997471974e-05, + "loss": 1.7441, + "step": 13086 + }, + { + "epoch": 4.0168815224063845, + "grad_norm": 0.3212043344974518, + "learning_rate": 6.794670072705553e-05, + "loss": 1.7602, + "step": 13087 + }, + { + "epoch": 4.01718845917741, + "grad_norm": 0.3054736852645874, + "learning_rate": 6.79420613020315e-05, + "loss": 1.7417, + "step": 13088 + }, + { + "epoch": 4.017495395948434, + "grad_norm": 0.22281476855278015, + "learning_rate": 6.793742169969351e-05, + "loss": 1.7675, + "step": 13089 + }, + { + "epoch": 4.01780233271946, + "grad_norm": 
0.32630839943885803, + "learning_rate": 6.793278192008742e-05, + "loss": 1.8409, + "step": 13090 + }, + { + "epoch": 4.018109269490485, + "grad_norm": 0.2658778429031372, + "learning_rate": 6.792814196325905e-05, + "loss": 1.7718, + "step": 13091 + }, + { + "epoch": 4.01841620626151, + "grad_norm": 0.24016901850700378, + "learning_rate": 6.792350182925429e-05, + "loss": 1.8393, + "step": 13092 + }, + { + "epoch": 4.018723143032536, + "grad_norm": 0.2882223427295685, + "learning_rate": 6.791886151811897e-05, + "loss": 1.7497, + "step": 13093 + }, + { + "epoch": 4.01903007980356, + "grad_norm": 0.24340751767158508, + "learning_rate": 6.791422102989895e-05, + "loss": 1.72, + "step": 13094 + }, + { + "epoch": 4.019337016574585, + "grad_norm": 0.235665962100029, + "learning_rate": 6.79095803646401e-05, + "loss": 1.7269, + "step": 13095 + }, + { + "epoch": 4.019643953345611, + "grad_norm": 0.32772955298423767, + "learning_rate": 6.79049395223883e-05, + "loss": 1.7916, + "step": 13096 + }, + { + "epoch": 4.019950890116636, + "grad_norm": 0.3189625144004822, + "learning_rate": 6.790029850318938e-05, + "loss": 1.7571, + "step": 13097 + }, + { + "epoch": 4.020257826887661, + "grad_norm": 0.2211185097694397, + "learning_rate": 6.789565730708921e-05, + "loss": 1.793, + "step": 13098 + }, + { + "epoch": 4.020564763658686, + "grad_norm": 0.2840392291545868, + "learning_rate": 6.789101593413367e-05, + "loss": 1.7434, + "step": 13099 + }, + { + "epoch": 4.020871700429711, + "grad_norm": 0.27857357263565063, + "learning_rate": 6.788637438436863e-05, + "loss": 1.742, + "step": 13100 + }, + { + "epoch": 4.0211786372007365, + "grad_norm": 0.314628005027771, + "learning_rate": 6.788173265783996e-05, + "loss": 1.7881, + "step": 13101 + }, + { + "epoch": 4.021485573971762, + "grad_norm": 0.2994774580001831, + "learning_rate": 6.787709075459352e-05, + "loss": 1.7741, + "step": 13102 + }, + { + "epoch": 4.021792510742787, + "grad_norm": 0.3256312310695648, + "learning_rate": 
6.787244867467519e-05, + "loss": 1.7758, + "step": 13103 + }, + { + "epoch": 4.0220994475138125, + "grad_norm": 0.2332412451505661, + "learning_rate": 6.786780641813083e-05, + "loss": 1.7654, + "step": 13104 + }, + { + "epoch": 4.022406384284837, + "grad_norm": 0.23226258158683777, + "learning_rate": 6.786316398500636e-05, + "loss": 1.7605, + "step": 13105 + }, + { + "epoch": 4.022713321055862, + "grad_norm": 0.24631965160369873, + "learning_rate": 6.785852137534763e-05, + "loss": 1.7469, + "step": 13106 + }, + { + "epoch": 4.023020257826888, + "grad_norm": 0.1969226449728012, + "learning_rate": 6.785387858920051e-05, + "loss": 1.8151, + "step": 13107 + }, + { + "epoch": 4.023327194597913, + "grad_norm": 0.22769485414028168, + "learning_rate": 6.784923562661091e-05, + "loss": 1.7024, + "step": 13108 + }, + { + "epoch": 4.023634131368938, + "grad_norm": 0.2174670249223709, + "learning_rate": 6.78445924876247e-05, + "loss": 1.8094, + "step": 13109 + }, + { + "epoch": 4.023941068139963, + "grad_norm": 0.2606858015060425, + "learning_rate": 6.783994917228775e-05, + "loss": 1.8043, + "step": 13110 + }, + { + "epoch": 4.024248004910988, + "grad_norm": 0.24721349775791168, + "learning_rate": 6.783530568064599e-05, + "loss": 1.842, + "step": 13111 + }, + { + "epoch": 4.024554941682013, + "grad_norm": 0.2353603094816208, + "learning_rate": 6.783066201274529e-05, + "loss": 1.76, + "step": 13112 + }, + { + "epoch": 4.024861878453039, + "grad_norm": 0.22285830974578857, + "learning_rate": 6.782601816863153e-05, + "loss": 1.8014, + "step": 13113 + }, + { + "epoch": 4.025168815224064, + "grad_norm": 0.2482440173625946, + "learning_rate": 6.782137414835061e-05, + "loss": 1.7552, + "step": 13114 + }, + { + "epoch": 4.0254757519950894, + "grad_norm": 0.19926191866397858, + "learning_rate": 6.781672995194842e-05, + "loss": 1.7549, + "step": 13115 + }, + { + "epoch": 4.025782688766114, + "grad_norm": 0.2342877984046936, + "learning_rate": 6.781208557947086e-05, + "loss": 1.8622, + 
"step": 13116 + }, + { + "epoch": 4.026089625537139, + "grad_norm": 0.24096547067165375, + "learning_rate": 6.780744103096382e-05, + "loss": 1.7795, + "step": 13117 + }, + { + "epoch": 4.026396562308165, + "grad_norm": 0.23714657127857208, + "learning_rate": 6.780279630647322e-05, + "loss": 1.799, + "step": 13118 + }, + { + "epoch": 4.02670349907919, + "grad_norm": 0.28252026438713074, + "learning_rate": 6.779815140604496e-05, + "loss": 1.7573, + "step": 13119 + }, + { + "epoch": 4.027010435850215, + "grad_norm": 0.28028404712677, + "learning_rate": 6.779350632972493e-05, + "loss": 1.8103, + "step": 13120 + }, + { + "epoch": 4.02731737262124, + "grad_norm": 0.21088312566280365, + "learning_rate": 6.778886107755904e-05, + "loss": 1.7169, + "step": 13121 + }, + { + "epoch": 4.027624309392265, + "grad_norm": 0.22282038629055023, + "learning_rate": 6.77842156495932e-05, + "loss": 1.7206, + "step": 13122 + }, + { + "epoch": 4.02793124616329, + "grad_norm": 0.3281327784061432, + "learning_rate": 6.777957004587331e-05, + "loss": 1.8664, + "step": 13123 + }, + { + "epoch": 4.028238182934316, + "grad_norm": 0.29496827721595764, + "learning_rate": 6.77749242664453e-05, + "loss": 1.7532, + "step": 13124 + }, + { + "epoch": 4.028545119705341, + "grad_norm": 0.25299328565597534, + "learning_rate": 6.777027831135508e-05, + "loss": 1.7836, + "step": 13125 + }, + { + "epoch": 4.0288520564763655, + "grad_norm": 0.3000280559062958, + "learning_rate": 6.776563218064854e-05, + "loss": 1.8079, + "step": 13126 + }, + { + "epoch": 4.029158993247391, + "grad_norm": 0.3613673448562622, + "learning_rate": 6.77609858743716e-05, + "loss": 1.7931, + "step": 13127 + }, + { + "epoch": 4.029465930018416, + "grad_norm": 0.25613468885421753, + "learning_rate": 6.77563393925702e-05, + "loss": 1.7522, + "step": 13128 + }, + { + "epoch": 4.0297728667894415, + "grad_norm": 0.24391578137874603, + "learning_rate": 6.775169273529026e-05, + "loss": 1.818, + "step": 13129 + }, + { + "epoch": 
4.030079803560467, + "grad_norm": 0.2806173264980316, + "learning_rate": 6.774704590257768e-05, + "loss": 1.7349, + "step": 13130 + }, + { + "epoch": 4.030386740331492, + "grad_norm": 0.22214172780513763, + "learning_rate": 6.774239889447838e-05, + "loss": 1.759, + "step": 13131 + }, + { + "epoch": 4.030693677102517, + "grad_norm": 0.27285513281822205, + "learning_rate": 6.773775171103828e-05, + "loss": 1.742, + "step": 13132 + }, + { + "epoch": 4.031000613873542, + "grad_norm": 0.22302402555942535, + "learning_rate": 6.773310435230334e-05, + "loss": 1.7277, + "step": 13133 + }, + { + "epoch": 4.031307550644567, + "grad_norm": 0.2350187450647354, + "learning_rate": 6.772845681831947e-05, + "loss": 1.8648, + "step": 13134 + }, + { + "epoch": 4.031614487415593, + "grad_norm": 0.2665547728538513, + "learning_rate": 6.772380910913261e-05, + "loss": 1.776, + "step": 13135 + }, + { + "epoch": 4.031921424186618, + "grad_norm": 0.30652403831481934, + "learning_rate": 6.771916122478867e-05, + "loss": 1.7884, + "step": 13136 + }, + { + "epoch": 4.032228360957642, + "grad_norm": 0.29372814297676086, + "learning_rate": 6.771451316533359e-05, + "loss": 1.8203, + "step": 13137 + }, + { + "epoch": 4.032535297728668, + "grad_norm": 0.2244873046875, + "learning_rate": 6.770986493081329e-05, + "loss": 1.7869, + "step": 13138 + }, + { + "epoch": 4.032842234499693, + "grad_norm": 0.25075265765190125, + "learning_rate": 6.770521652127375e-05, + "loss": 1.772, + "step": 13139 + }, + { + "epoch": 4.033149171270718, + "grad_norm": 0.28118211030960083, + "learning_rate": 6.770056793676087e-05, + "loss": 1.7922, + "step": 13140 + }, + { + "epoch": 4.033456108041744, + "grad_norm": 0.25199100375175476, + "learning_rate": 6.769591917732062e-05, + "loss": 1.7526, + "step": 13141 + }, + { + "epoch": 4.033763044812768, + "grad_norm": 0.2920379638671875, + "learning_rate": 6.769127024299892e-05, + "loss": 1.8365, + "step": 13142 + }, + { + "epoch": 4.0340699815837935, + "grad_norm": 
0.23018018901348114, + "learning_rate": 6.768662113384171e-05, + "loss": 1.7411, + "step": 13143 + }, + { + "epoch": 4.034376918354819, + "grad_norm": 0.23253841698169708, + "learning_rate": 6.768197184989494e-05, + "loss": 1.7921, + "step": 13144 + }, + { + "epoch": 4.034683855125844, + "grad_norm": 0.22618864476680756, + "learning_rate": 6.767732239120456e-05, + "loss": 1.7421, + "step": 13145 + }, + { + "epoch": 4.0349907918968695, + "grad_norm": 0.24552187323570251, + "learning_rate": 6.767267275781655e-05, + "loss": 1.7299, + "step": 13146 + }, + { + "epoch": 4.035297728667895, + "grad_norm": 0.22562766075134277, + "learning_rate": 6.76680229497768e-05, + "loss": 1.766, + "step": 13147 + }, + { + "epoch": 4.035604665438919, + "grad_norm": 0.28718629479408264, + "learning_rate": 6.76633729671313e-05, + "loss": 1.7366, + "step": 13148 + }, + { + "epoch": 4.035911602209945, + "grad_norm": 0.38769885897636414, + "learning_rate": 6.765872280992598e-05, + "loss": 1.8244, + "step": 13149 + }, + { + "epoch": 4.03621853898097, + "grad_norm": 0.4232725501060486, + "learning_rate": 6.765407247820683e-05, + "loss": 1.8244, + "step": 13150 + }, + { + "epoch": 4.036525475751995, + "grad_norm": 0.2771088778972626, + "learning_rate": 6.764942197201977e-05, + "loss": 1.7863, + "step": 13151 + }, + { + "epoch": 4.036832412523021, + "grad_norm": 0.2917862832546234, + "learning_rate": 6.76447712914108e-05, + "loss": 1.791, + "step": 13152 + }, + { + "epoch": 4.037139349294045, + "grad_norm": 0.37355467677116394, + "learning_rate": 6.764012043642584e-05, + "loss": 1.74, + "step": 13153 + }, + { + "epoch": 4.03744628606507, + "grad_norm": 0.35664018988609314, + "learning_rate": 6.763546940711089e-05, + "loss": 1.7734, + "step": 13154 + }, + { + "epoch": 4.037753222836096, + "grad_norm": 0.2335754930973053, + "learning_rate": 6.763081820351188e-05, + "loss": 1.7765, + "step": 13155 + }, + { + "epoch": 4.038060159607121, + "grad_norm": 0.2825562357902527, + "learning_rate": 
6.762616682567478e-05, + "loss": 1.7867, + "step": 13156 + }, + { + "epoch": 4.038367096378146, + "grad_norm": 0.3103202283382416, + "learning_rate": 6.762151527364559e-05, + "loss": 1.7331, + "step": 13157 + }, + { + "epoch": 4.038674033149171, + "grad_norm": 0.2897353172302246, + "learning_rate": 6.761686354747025e-05, + "loss": 1.7638, + "step": 13158 + }, + { + "epoch": 4.038980969920196, + "grad_norm": 0.21260851621627808, + "learning_rate": 6.761221164719474e-05, + "loss": 1.7302, + "step": 13159 + }, + { + "epoch": 4.0392879066912215, + "grad_norm": 0.2878021001815796, + "learning_rate": 6.760755957286503e-05, + "loss": 1.7368, + "step": 13160 + }, + { + "epoch": 4.039594843462247, + "grad_norm": 0.2785978317260742, + "learning_rate": 6.76029073245271e-05, + "loss": 1.7258, + "step": 13161 + }, + { + "epoch": 4.039901780233272, + "grad_norm": 0.1963953971862793, + "learning_rate": 6.759825490222692e-05, + "loss": 1.755, + "step": 13162 + }, + { + "epoch": 4.0402087170042975, + "grad_norm": 0.26776790618896484, + "learning_rate": 6.759360230601047e-05, + "loss": 1.7676, + "step": 13163 + }, + { + "epoch": 4.040515653775322, + "grad_norm": 0.2751332223415375, + "learning_rate": 6.758894953592373e-05, + "loss": 1.7313, + "step": 13164 + }, + { + "epoch": 4.040822590546347, + "grad_norm": 0.2339213341474533, + "learning_rate": 6.758429659201269e-05, + "loss": 1.714, + "step": 13165 + }, + { + "epoch": 4.041129527317373, + "grad_norm": 0.2624664008617401, + "learning_rate": 6.75796434743233e-05, + "loss": 1.8296, + "step": 13166 + }, + { + "epoch": 4.041436464088398, + "grad_norm": 0.40156883001327515, + "learning_rate": 6.757499018290159e-05, + "loss": 1.8228, + "step": 13167 + }, + { + "epoch": 4.041743400859423, + "grad_norm": 0.32976576685905457, + "learning_rate": 6.757033671779352e-05, + "loss": 1.7403, + "step": 13168 + }, + { + "epoch": 4.042050337630448, + "grad_norm": 0.2343887835741043, + "learning_rate": 6.756568307904508e-05, + "loss": 1.7837, + 
"step": 13169 + }, + { + "epoch": 4.042357274401473, + "grad_norm": 0.36174145340919495, + "learning_rate": 6.756102926670227e-05, + "loss": 1.7291, + "step": 13170 + }, + { + "epoch": 4.042664211172498, + "grad_norm": 0.3324793577194214, + "learning_rate": 6.755637528081108e-05, + "loss": 1.7414, + "step": 13171 + }, + { + "epoch": 4.042971147943524, + "grad_norm": 0.21945348381996155, + "learning_rate": 6.75517211214175e-05, + "loss": 1.7762, + "step": 13172 + }, + { + "epoch": 4.043278084714549, + "grad_norm": 0.31069812178611755, + "learning_rate": 6.75470667885675e-05, + "loss": 1.7666, + "step": 13173 + }, + { + "epoch": 4.043585021485574, + "grad_norm": 0.3931153118610382, + "learning_rate": 6.754241228230713e-05, + "loss": 1.7871, + "step": 13174 + }, + { + "epoch": 4.043891958256599, + "grad_norm": 0.25559595227241516, + "learning_rate": 6.753775760268234e-05, + "loss": 1.7916, + "step": 13175 + }, + { + "epoch": 4.044198895027624, + "grad_norm": 0.3686937391757965, + "learning_rate": 6.753310274973917e-05, + "loss": 1.7642, + "step": 13176 + }, + { + "epoch": 4.0445058317986495, + "grad_norm": 0.4793247580528259, + "learning_rate": 6.75284477235236e-05, + "loss": 1.739, + "step": 13177 + }, + { + "epoch": 4.044812768569675, + "grad_norm": 0.36179354786872864, + "learning_rate": 6.752379252408164e-05, + "loss": 1.7993, + "step": 13178 + }, + { + "epoch": 4.0451197053407, + "grad_norm": 0.22559234499931335, + "learning_rate": 6.751913715145926e-05, + "loss": 1.7401, + "step": 13179 + }, + { + "epoch": 4.045426642111725, + "grad_norm": 0.29058873653411865, + "learning_rate": 6.751448160570253e-05, + "loss": 1.8089, + "step": 13180 + }, + { + "epoch": 4.04573357888275, + "grad_norm": 0.3069808781147003, + "learning_rate": 6.750982588685742e-05, + "loss": 1.7587, + "step": 13181 + }, + { + "epoch": 4.046040515653775, + "grad_norm": 0.2292155921459198, + "learning_rate": 6.750516999496994e-05, + "loss": 1.7429, + "step": 13182 + }, + { + "epoch": 
4.046347452424801, + "grad_norm": 0.2520677149295807, + "learning_rate": 6.750051393008612e-05, + "loss": 1.7842, + "step": 13183 + }, + { + "epoch": 4.046654389195826, + "grad_norm": 0.32546502351760864, + "learning_rate": 6.749585769225194e-05, + "loss": 1.8057, + "step": 13184 + }, + { + "epoch": 4.04696132596685, + "grad_norm": 0.27634644508361816, + "learning_rate": 6.749120128151346e-05, + "loss": 1.7708, + "step": 13185 + }, + { + "epoch": 4.047268262737876, + "grad_norm": 0.2546750009059906, + "learning_rate": 6.748654469791668e-05, + "loss": 1.8744, + "step": 13186 + }, + { + "epoch": 4.047575199508901, + "grad_norm": 0.43873605132102966, + "learning_rate": 6.748188794150761e-05, + "loss": 1.8573, + "step": 13187 + }, + { + "epoch": 4.047882136279926, + "grad_norm": 0.45526960492134094, + "learning_rate": 6.747723101233227e-05, + "loss": 1.7761, + "step": 13188 + }, + { + "epoch": 4.048189073050952, + "grad_norm": 0.24995557963848114, + "learning_rate": 6.74725739104367e-05, + "loss": 1.7679, + "step": 13189 + }, + { + "epoch": 4.048496009821977, + "grad_norm": 0.3203068971633911, + "learning_rate": 6.74679166358669e-05, + "loss": 1.7772, + "step": 13190 + }, + { + "epoch": 4.0488029465930016, + "grad_norm": 0.37020671367645264, + "learning_rate": 6.746325918866893e-05, + "loss": 1.8002, + "step": 13191 + }, + { + "epoch": 4.049109883364027, + "grad_norm": 0.2543959319591522, + "learning_rate": 6.745860156888878e-05, + "loss": 1.8057, + "step": 13192 + }, + { + "epoch": 4.049416820135052, + "grad_norm": 0.2566509246826172, + "learning_rate": 6.74539437765725e-05, + "loss": 1.7853, + "step": 13193 + }, + { + "epoch": 4.0497237569060776, + "grad_norm": 0.2545804977416992, + "learning_rate": 6.744928581176612e-05, + "loss": 1.8136, + "step": 13194 + }, + { + "epoch": 4.050030693677103, + "grad_norm": 0.24307197332382202, + "learning_rate": 6.744462767451568e-05, + "loss": 1.7919, + "step": 13195 + }, + { + "epoch": 4.050337630448127, + "grad_norm": 
0.24427616596221924, + "learning_rate": 6.743996936486719e-05, + "loss": 1.8037, + "step": 13196 + }, + { + "epoch": 4.050644567219153, + "grad_norm": 0.2154439389705658, + "learning_rate": 6.743531088286673e-05, + "loss": 1.7088, + "step": 13197 + }, + { + "epoch": 4.050951503990178, + "grad_norm": 0.22251558303833008, + "learning_rate": 6.743065222856027e-05, + "loss": 1.7512, + "step": 13198 + }, + { + "epoch": 4.051258440761203, + "grad_norm": 0.2373272329568863, + "learning_rate": 6.74259934019939e-05, + "loss": 1.8056, + "step": 13199 + }, + { + "epoch": 4.051565377532229, + "grad_norm": 0.23308727145195007, + "learning_rate": 6.742133440321366e-05, + "loss": 1.731, + "step": 13200 + }, + { + "epoch": 4.051872314303253, + "grad_norm": 0.2438805252313614, + "learning_rate": 6.741667523226557e-05, + "loss": 1.7938, + "step": 13201 + }, + { + "epoch": 4.0521792510742785, + "grad_norm": 0.22354702651500702, + "learning_rate": 6.741201588919569e-05, + "loss": 1.762, + "step": 13202 + }, + { + "epoch": 4.052486187845304, + "grad_norm": 0.2505488097667694, + "learning_rate": 6.740735637405006e-05, + "loss": 1.7627, + "step": 13203 + }, + { + "epoch": 4.052793124616329, + "grad_norm": 0.21378709375858307, + "learning_rate": 6.740269668687474e-05, + "loss": 1.7598, + "step": 13204 + }, + { + "epoch": 4.0531000613873545, + "grad_norm": 0.24863660335540771, + "learning_rate": 6.739803682771577e-05, + "loss": 1.7665, + "step": 13205 + }, + { + "epoch": 4.05340699815838, + "grad_norm": 0.3041808605194092, + "learning_rate": 6.739337679661921e-05, + "loss": 1.7909, + "step": 13206 + }, + { + "epoch": 4.053713934929404, + "grad_norm": 0.2745797634124756, + "learning_rate": 6.738871659363109e-05, + "loss": 1.7547, + "step": 13207 + }, + { + "epoch": 4.05402087170043, + "grad_norm": 0.2610073387622833, + "learning_rate": 6.738405621879748e-05, + "loss": 1.7723, + "step": 13208 + }, + { + "epoch": 4.054327808471455, + "grad_norm": 0.22728075087070465, + "learning_rate": 
6.737939567216446e-05, + "loss": 1.7865, + "step": 13209 + }, + { + "epoch": 4.05463474524248, + "grad_norm": 0.2877669930458069, + "learning_rate": 6.737473495377804e-05, + "loss": 1.8352, + "step": 13210 + }, + { + "epoch": 4.054941682013506, + "grad_norm": 0.35316282510757446, + "learning_rate": 6.737007406368432e-05, + "loss": 1.8202, + "step": 13211 + }, + { + "epoch": 4.05524861878453, + "grad_norm": 0.34625691175460815, + "learning_rate": 6.736541300192936e-05, + "loss": 1.8456, + "step": 13212 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.2432134598493576, + "learning_rate": 6.736075176855917e-05, + "loss": 1.8237, + "step": 13213 + }, + { + "epoch": 4.055862492326581, + "grad_norm": 0.27446529269218445, + "learning_rate": 6.735609036361989e-05, + "loss": 1.71, + "step": 13214 + }, + { + "epoch": 4.056169429097606, + "grad_norm": 0.2870408892631531, + "learning_rate": 6.735142878715754e-05, + "loss": 1.7473, + "step": 13215 + }, + { + "epoch": 4.056476365868631, + "grad_norm": 0.22249078750610352, + "learning_rate": 6.734676703921822e-05, + "loss": 1.7462, + "step": 13216 + }, + { + "epoch": 4.056783302639656, + "grad_norm": 0.25519105792045593, + "learning_rate": 6.734210511984796e-05, + "loss": 1.7022, + "step": 13217 + }, + { + "epoch": 4.057090239410681, + "grad_norm": 0.3366561830043793, + "learning_rate": 6.733744302909285e-05, + "loss": 1.787, + "step": 13218 + }, + { + "epoch": 4.0573971761817065, + "grad_norm": 0.2443208247423172, + "learning_rate": 6.733278076699897e-05, + "loss": 1.8048, + "step": 13219 + }, + { + "epoch": 4.057704112952732, + "grad_norm": 0.2893153131008148, + "learning_rate": 6.73281183336124e-05, + "loss": 1.7805, + "step": 13220 + }, + { + "epoch": 4.058011049723757, + "grad_norm": 0.3178043067455292, + "learning_rate": 6.73234557289792e-05, + "loss": 1.8264, + "step": 13221 + }, + { + "epoch": 4.0583179864947825, + "grad_norm": 0.27355703711509705, + "learning_rate": 6.731879295314546e-05, + "loss": 1.8427, + 
"step": 13222 + }, + { + "epoch": 4.058624923265807, + "grad_norm": 0.32180166244506836, + "learning_rate": 6.731413000615726e-05, + "loss": 1.7332, + "step": 13223 + }, + { + "epoch": 4.058931860036832, + "grad_norm": 0.3736574351787567, + "learning_rate": 6.730946688806067e-05, + "loss": 1.7447, + "step": 13224 + }, + { + "epoch": 4.059238796807858, + "grad_norm": 0.2526068687438965, + "learning_rate": 6.73048035989018e-05, + "loss": 1.8104, + "step": 13225 + }, + { + "epoch": 4.059545733578883, + "grad_norm": 0.29076167941093445, + "learning_rate": 6.73001401387267e-05, + "loss": 1.7977, + "step": 13226 + }, + { + "epoch": 4.059852670349908, + "grad_norm": 0.37963762879371643, + "learning_rate": 6.729547650758148e-05, + "loss": 1.8336, + "step": 13227 + }, + { + "epoch": 4.060159607120933, + "grad_norm": 0.31584078073501587, + "learning_rate": 6.729081270551222e-05, + "loss": 1.7843, + "step": 13228 + }, + { + "epoch": 4.060466543891958, + "grad_norm": 0.22793468832969666, + "learning_rate": 6.728614873256502e-05, + "loss": 1.7444, + "step": 13229 + }, + { + "epoch": 4.060773480662983, + "grad_norm": 0.3114435076713562, + "learning_rate": 6.728148458878596e-05, + "loss": 1.8012, + "step": 13230 + }, + { + "epoch": 4.061080417434009, + "grad_norm": 0.29843854904174805, + "learning_rate": 6.727682027422116e-05, + "loss": 1.8014, + "step": 13231 + }, + { + "epoch": 4.061387354205034, + "grad_norm": 0.22745616734027863, + "learning_rate": 6.727215578891668e-05, + "loss": 1.7303, + "step": 13232 + }, + { + "epoch": 4.0616942909760585, + "grad_norm": 0.2701241970062256, + "learning_rate": 6.726749113291864e-05, + "loss": 1.7665, + "step": 13233 + }, + { + "epoch": 4.062001227747084, + "grad_norm": 0.29304635524749756, + "learning_rate": 6.726282630627313e-05, + "loss": 1.875, + "step": 13234 + }, + { + "epoch": 4.062308164518109, + "grad_norm": 0.21467708051204681, + "learning_rate": 6.725816130902625e-05, + "loss": 1.7442, + "step": 13235 + }, + { + "epoch": 
4.0626151012891345, + "grad_norm": 0.23517470061779022, + "learning_rate": 6.72534961412241e-05, + "loss": 1.7154, + "step": 13236 + }, + { + "epoch": 4.06292203806016, + "grad_norm": 0.21483808755874634, + "learning_rate": 6.724883080291278e-05, + "loss": 1.7162, + "step": 13237 + }, + { + "epoch": 4.063228974831185, + "grad_norm": 0.2274744212627411, + "learning_rate": 6.724416529413843e-05, + "loss": 1.8066, + "step": 13238 + }, + { + "epoch": 4.06353591160221, + "grad_norm": 0.24682378768920898, + "learning_rate": 6.723949961494712e-05, + "loss": 1.7905, + "step": 13239 + }, + { + "epoch": 4.063842848373235, + "grad_norm": 0.2516227066516876, + "learning_rate": 6.723483376538498e-05, + "loss": 1.7693, + "step": 13240 + }, + { + "epoch": 4.06414978514426, + "grad_norm": 0.22076398134231567, + "learning_rate": 6.723016774549808e-05, + "loss": 1.7357, + "step": 13241 + }, + { + "epoch": 4.064456721915286, + "grad_norm": 0.20741026103496552, + "learning_rate": 6.722550155533258e-05, + "loss": 1.8082, + "step": 13242 + }, + { + "epoch": 4.064763658686311, + "grad_norm": 0.2074010819196701, + "learning_rate": 6.722083519493458e-05, + "loss": 1.71, + "step": 13243 + }, + { + "epoch": 4.065070595457335, + "grad_norm": 0.2661527991294861, + "learning_rate": 6.72161686643502e-05, + "loss": 1.7448, + "step": 13244 + }, + { + "epoch": 4.065377532228361, + "grad_norm": 0.2877216935157776, + "learning_rate": 6.721150196362555e-05, + "loss": 1.7574, + "step": 13245 + }, + { + "epoch": 4.065684468999386, + "grad_norm": 0.2520955801010132, + "learning_rate": 6.720683509280675e-05, + "loss": 1.7717, + "step": 13246 + }, + { + "epoch": 4.065991405770411, + "grad_norm": 0.2219560444355011, + "learning_rate": 6.72021680519399e-05, + "loss": 1.7355, + "step": 13247 + }, + { + "epoch": 4.066298342541437, + "grad_norm": 0.24671706557273865, + "learning_rate": 6.719750084107117e-05, + "loss": 1.8204, + "step": 13248 + }, + { + "epoch": 4.066605279312462, + "grad_norm": 
0.24512135982513428, + "learning_rate": 6.719283346024664e-05, + "loss": 1.826, + "step": 13249 + }, + { + "epoch": 4.0669122160834865, + "grad_norm": 0.24370841681957245, + "learning_rate": 6.718816590951247e-05, + "loss": 1.8322, + "step": 13250 + }, + { + "epoch": 4.067219152854512, + "grad_norm": 0.2312363088130951, + "learning_rate": 6.718349818891475e-05, + "loss": 1.7621, + "step": 13251 + }, + { + "epoch": 4.067526089625537, + "grad_norm": 0.2500494420528412, + "learning_rate": 6.717883029849965e-05, + "loss": 1.829, + "step": 13252 + }, + { + "epoch": 4.0678330263965625, + "grad_norm": 0.29882633686065674, + "learning_rate": 6.717416223831324e-05, + "loss": 1.799, + "step": 13253 + }, + { + "epoch": 4.068139963167588, + "grad_norm": 0.21962928771972656, + "learning_rate": 6.716949400840172e-05, + "loss": 1.7714, + "step": 13254 + }, + { + "epoch": 4.068446899938612, + "grad_norm": 0.25544899702072144, + "learning_rate": 6.716482560881121e-05, + "loss": 1.7911, + "step": 13255 + }, + { + "epoch": 4.068753836709638, + "grad_norm": 0.24865686893463135, + "learning_rate": 6.716015703958781e-05, + "loss": 1.7107, + "step": 13256 + }, + { + "epoch": 4.069060773480663, + "grad_norm": 0.22669239342212677, + "learning_rate": 6.715548830077769e-05, + "loss": 1.8503, + "step": 13257 + }, + { + "epoch": 4.069367710251688, + "grad_norm": 0.2973819077014923, + "learning_rate": 6.715081939242698e-05, + "loss": 1.7859, + "step": 13258 + }, + { + "epoch": 4.069674647022714, + "grad_norm": 0.3178746700286865, + "learning_rate": 6.714615031458181e-05, + "loss": 1.7705, + "step": 13259 + }, + { + "epoch": 4.069981583793738, + "grad_norm": 0.20452535152435303, + "learning_rate": 6.714148106728835e-05, + "loss": 1.7386, + "step": 13260 + }, + { + "epoch": 4.070288520564763, + "grad_norm": 0.30288320779800415, + "learning_rate": 6.713681165059271e-05, + "loss": 1.7823, + "step": 13261 + }, + { + "epoch": 4.070595457335789, + "grad_norm": 0.30014416575431824, + "learning_rate": 
6.713214206454107e-05, + "loss": 1.7626, + "step": 13262 + }, + { + "epoch": 4.070902394106814, + "grad_norm": 0.25144243240356445, + "learning_rate": 6.712747230917956e-05, + "loss": 1.8359, + "step": 13263 + }, + { + "epoch": 4.071209330877839, + "grad_norm": 0.308148592710495, + "learning_rate": 6.712280238455432e-05, + "loss": 1.7226, + "step": 13264 + }, + { + "epoch": 4.071516267648865, + "grad_norm": 0.2704198658466339, + "learning_rate": 6.711813229071151e-05, + "loss": 1.7982, + "step": 13265 + }, + { + "epoch": 4.071823204419889, + "grad_norm": 0.3928656280040741, + "learning_rate": 6.711346202769729e-05, + "loss": 1.7987, + "step": 13266 + }, + { + "epoch": 4.0721301411909145, + "grad_norm": 0.3603350520133972, + "learning_rate": 6.71087915955578e-05, + "loss": 1.7963, + "step": 13267 + }, + { + "epoch": 4.07243707796194, + "grad_norm": 0.2673214077949524, + "learning_rate": 6.710412099433921e-05, + "loss": 1.8011, + "step": 13268 + }, + { + "epoch": 4.072744014732965, + "grad_norm": 0.2523653209209442, + "learning_rate": 6.709945022408768e-05, + "loss": 1.755, + "step": 13269 + }, + { + "epoch": 4.0730509515039905, + "grad_norm": 0.3818903863430023, + "learning_rate": 6.709477928484934e-05, + "loss": 1.7968, + "step": 13270 + }, + { + "epoch": 4.073357888275015, + "grad_norm": 0.31509929895401, + "learning_rate": 6.709010817667039e-05, + "loss": 1.744, + "step": 13271 + }, + { + "epoch": 4.07366482504604, + "grad_norm": 0.21875518560409546, + "learning_rate": 6.708543689959697e-05, + "loss": 1.7511, + "step": 13272 + }, + { + "epoch": 4.073971761817066, + "grad_norm": 0.25381338596343994, + "learning_rate": 6.708076545367523e-05, + "loss": 1.7523, + "step": 13273 + }, + { + "epoch": 4.074278698588091, + "grad_norm": 0.24193842709064484, + "learning_rate": 6.707609383895137e-05, + "loss": 1.7713, + "step": 13274 + }, + { + "epoch": 4.074585635359116, + "grad_norm": 0.21972359716892242, + "learning_rate": 6.707142205547154e-05, + "loss": 1.7329, + "step": 
13275 + }, + { + "epoch": 4.074892572130141, + "grad_norm": 0.22188499569892883, + "learning_rate": 6.706675010328192e-05, + "loss": 1.7507, + "step": 13276 + }, + { + "epoch": 4.075199508901166, + "grad_norm": 0.23344436287879944, + "learning_rate": 6.706207798242865e-05, + "loss": 1.771, + "step": 13277 + }, + { + "epoch": 4.0755064456721914, + "grad_norm": 0.3008805513381958, + "learning_rate": 6.705740569295795e-05, + "loss": 1.775, + "step": 13278 + }, + { + "epoch": 4.075813382443217, + "grad_norm": 0.31407982110977173, + "learning_rate": 6.705273323491595e-05, + "loss": 1.7625, + "step": 13279 + }, + { + "epoch": 4.076120319214242, + "grad_norm": 0.2430381178855896, + "learning_rate": 6.704806060834886e-05, + "loss": 1.7706, + "step": 13280 + }, + { + "epoch": 4.0764272559852675, + "grad_norm": 0.23250171542167664, + "learning_rate": 6.704338781330284e-05, + "loss": 1.7977, + "step": 13281 + }, + { + "epoch": 4.076734192756292, + "grad_norm": 0.22073723375797272, + "learning_rate": 6.703871484982407e-05, + "loss": 1.7686, + "step": 13282 + }, + { + "epoch": 4.077041129527317, + "grad_norm": 0.24987035989761353, + "learning_rate": 6.703404171795874e-05, + "loss": 1.736, + "step": 13283 + }, + { + "epoch": 4.077348066298343, + "grad_norm": 0.2697623670101166, + "learning_rate": 6.702936841775301e-05, + "loss": 1.8367, + "step": 13284 + }, + { + "epoch": 4.077655003069368, + "grad_norm": 0.21592749655246735, + "learning_rate": 6.702469494925309e-05, + "loss": 1.7467, + "step": 13285 + }, + { + "epoch": 4.077961939840393, + "grad_norm": 0.2612052261829376, + "learning_rate": 6.702002131250515e-05, + "loss": 1.7689, + "step": 13286 + }, + { + "epoch": 4.078268876611418, + "grad_norm": 0.3004797697067261, + "learning_rate": 6.701534750755539e-05, + "loss": 1.7586, + "step": 13287 + }, + { + "epoch": 4.078575813382443, + "grad_norm": 0.24615366756916046, + "learning_rate": 6.701067353444998e-05, + "loss": 1.7636, + "step": 13288 + }, + { + "epoch": 
4.078882750153468, + "grad_norm": 0.23401159048080444, + "learning_rate": 6.700599939323515e-05, + "loss": 1.8015, + "step": 13289 + }, + { + "epoch": 4.079189686924494, + "grad_norm": 0.24546295404434204, + "learning_rate": 6.700132508395705e-05, + "loss": 1.7606, + "step": 13290 + }, + { + "epoch": 4.079496623695519, + "grad_norm": 0.24664412438869476, + "learning_rate": 6.69966506066619e-05, + "loss": 1.7994, + "step": 13291 + }, + { + "epoch": 4.0798035604665435, + "grad_norm": 0.2780163288116455, + "learning_rate": 6.699197596139587e-05, + "loss": 1.7972, + "step": 13292 + }, + { + "epoch": 4.080110497237569, + "grad_norm": 0.2554188668727875, + "learning_rate": 6.698730114820517e-05, + "loss": 1.7928, + "step": 13293 + }, + { + "epoch": 4.080417434008594, + "grad_norm": 0.2471141666173935, + "learning_rate": 6.698262616713602e-05, + "loss": 1.7948, + "step": 13294 + }, + { + "epoch": 4.0807243707796195, + "grad_norm": 0.2556581199169159, + "learning_rate": 6.697795101823461e-05, + "loss": 1.7942, + "step": 13295 + }, + { + "epoch": 4.081031307550645, + "grad_norm": 0.24462421238422394, + "learning_rate": 6.697327570154712e-05, + "loss": 1.7336, + "step": 13296 + }, + { + "epoch": 4.08133824432167, + "grad_norm": 0.22378689050674438, + "learning_rate": 6.696860021711978e-05, + "loss": 1.7703, + "step": 13297 + }, + { + "epoch": 4.081645181092695, + "grad_norm": 0.23949933052062988, + "learning_rate": 6.69639245649988e-05, + "loss": 1.7651, + "step": 13298 + }, + { + "epoch": 4.08195211786372, + "grad_norm": 0.27751216292381287, + "learning_rate": 6.695924874523035e-05, + "loss": 1.7866, + "step": 13299 + }, + { + "epoch": 4.082259054634745, + "grad_norm": 0.22700226306915283, + "learning_rate": 6.695457275786068e-05, + "loss": 1.79, + "step": 13300 + }, + { + "epoch": 4.082565991405771, + "grad_norm": 0.2138090431690216, + "learning_rate": 6.694989660293598e-05, + "loss": 1.7882, + "step": 13301 + }, + { + "epoch": 4.082872928176796, + "grad_norm": 
0.2963469326496124, + "learning_rate": 6.694522028050246e-05, + "loss": 1.8779, + "step": 13302 + }, + { + "epoch": 4.08317986494782, + "grad_norm": 0.31833669543266296, + "learning_rate": 6.694054379060634e-05, + "loss": 1.7923, + "step": 13303 + }, + { + "epoch": 4.083486801718846, + "grad_norm": 0.27751585841178894, + "learning_rate": 6.693586713329385e-05, + "loss": 1.7557, + "step": 13304 + }, + { + "epoch": 4.083793738489871, + "grad_norm": 0.23790816962718964, + "learning_rate": 6.69311903086112e-05, + "loss": 1.7587, + "step": 13305 + }, + { + "epoch": 4.084100675260896, + "grad_norm": 0.24153777956962585, + "learning_rate": 6.692651331660458e-05, + "loss": 1.7573, + "step": 13306 + }, + { + "epoch": 4.084407612031922, + "grad_norm": 0.26607179641723633, + "learning_rate": 6.692183615732025e-05, + "loss": 1.7823, + "step": 13307 + }, + { + "epoch": 4.084714548802946, + "grad_norm": 0.26670268177986145, + "learning_rate": 6.691715883080442e-05, + "loss": 1.784, + "step": 13308 + }, + { + "epoch": 4.0850214855739715, + "grad_norm": 0.25980666279792786, + "learning_rate": 6.69124813371033e-05, + "loss": 1.797, + "step": 13309 + }, + { + "epoch": 4.085328422344997, + "grad_norm": 0.2805597484111786, + "learning_rate": 6.690780367626314e-05, + "loss": 1.8298, + "step": 13310 + }, + { + "epoch": 4.085635359116022, + "grad_norm": 0.27198413014411926, + "learning_rate": 6.690312584833012e-05, + "loss": 1.8104, + "step": 13311 + }, + { + "epoch": 4.0859422958870475, + "grad_norm": 0.2619116008281708, + "learning_rate": 6.689844785335054e-05, + "loss": 1.771, + "step": 13312 + }, + { + "epoch": 4.086249232658073, + "grad_norm": 0.22647863626480103, + "learning_rate": 6.689376969137057e-05, + "loss": 1.8114, + "step": 13313 + }, + { + "epoch": 4.086556169429097, + "grad_norm": 1.469475507736206, + "learning_rate": 6.68890913624365e-05, + "loss": 1.8796, + "step": 13314 + }, + { + "epoch": 4.086863106200123, + "grad_norm": 0.4577515423297882, + "learning_rate": 
6.68844128665945e-05, + "loss": 1.716, + "step": 13315 + }, + { + "epoch": 4.087170042971148, + "grad_norm": 0.5830543637275696, + "learning_rate": 6.687973420389085e-05, + "loss": 1.7692, + "step": 13316 + }, + { + "epoch": 4.087476979742173, + "grad_norm": 0.4404197037220001, + "learning_rate": 6.687505537437178e-05, + "loss": 1.7909, + "step": 13317 + }, + { + "epoch": 4.087783916513199, + "grad_norm": 0.31379908323287964, + "learning_rate": 6.68703763780835e-05, + "loss": 1.7957, + "step": 13318 + }, + { + "epoch": 4.088090853284223, + "grad_norm": 0.49588730931282043, + "learning_rate": 6.686569721507229e-05, + "loss": 1.7126, + "step": 13319 + }, + { + "epoch": 4.088397790055248, + "grad_norm": 0.3690234124660492, + "learning_rate": 6.686101788538437e-05, + "loss": 1.8233, + "step": 13320 + }, + { + "epoch": 4.088704726826274, + "grad_norm": 0.337310254573822, + "learning_rate": 6.685633838906598e-05, + "loss": 1.6886, + "step": 13321 + }, + { + "epoch": 4.089011663597299, + "grad_norm": 0.5164821147918701, + "learning_rate": 6.685165872616337e-05, + "loss": 1.7967, + "step": 13322 + }, + { + "epoch": 4.089318600368324, + "grad_norm": 0.36501309275627136, + "learning_rate": 6.68469788967228e-05, + "loss": 1.755, + "step": 13323 + }, + { + "epoch": 4.08962553713935, + "grad_norm": 0.35017216205596924, + "learning_rate": 6.684229890079052e-05, + "loss": 1.7595, + "step": 13324 + }, + { + "epoch": 4.089932473910374, + "grad_norm": 0.5622650980949402, + "learning_rate": 6.683761873841277e-05, + "loss": 1.7841, + "step": 13325 + }, + { + "epoch": 4.0902394106813995, + "grad_norm": 0.47010260820388794, + "learning_rate": 6.683293840963578e-05, + "loss": 1.7537, + "step": 13326 + }, + { + "epoch": 4.090546347452425, + "grad_norm": 0.25515374541282654, + "learning_rate": 6.682825791450584e-05, + "loss": 1.7692, + "step": 13327 + }, + { + "epoch": 4.09085328422345, + "grad_norm": 0.5063003897666931, + "learning_rate": 6.682357725306919e-05, + "loss": 1.7454, + "step": 
13328 + }, + { + "epoch": 4.0911602209944755, + "grad_norm": 0.4197622835636139, + "learning_rate": 6.681889642537209e-05, + "loss": 1.7792, + "step": 13329 + }, + { + "epoch": 4.0914671577655, + "grad_norm": 0.24038295447826385, + "learning_rate": 6.68142154314608e-05, + "loss": 1.7631, + "step": 13330 + }, + { + "epoch": 4.091774094536525, + "grad_norm": 0.42108532786369324, + "learning_rate": 6.680953427138159e-05, + "loss": 1.7784, + "step": 13331 + }, + { + "epoch": 4.092081031307551, + "grad_norm": 0.33729633688926697, + "learning_rate": 6.68048529451807e-05, + "loss": 1.8057, + "step": 13332 + }, + { + "epoch": 4.092387968078576, + "grad_norm": 0.31847241520881653, + "learning_rate": 6.68001714529044e-05, + "loss": 1.7375, + "step": 13333 + }, + { + "epoch": 4.092694904849601, + "grad_norm": 0.45276644825935364, + "learning_rate": 6.679548979459896e-05, + "loss": 1.7507, + "step": 13334 + }, + { + "epoch": 4.093001841620626, + "grad_norm": 0.3781665861606598, + "learning_rate": 6.679080797031065e-05, + "loss": 1.7718, + "step": 13335 + }, + { + "epoch": 4.093308778391651, + "grad_norm": 0.25868359208106995, + "learning_rate": 6.678612598008573e-05, + "loss": 1.8105, + "step": 13336 + }, + { + "epoch": 4.093615715162676, + "grad_norm": 0.32834702730178833, + "learning_rate": 6.678144382397048e-05, + "loss": 1.7883, + "step": 13337 + }, + { + "epoch": 4.093922651933702, + "grad_norm": 0.2830568253993988, + "learning_rate": 6.677676150201116e-05, + "loss": 1.7994, + "step": 13338 + }, + { + "epoch": 4.094229588704727, + "grad_norm": 0.219541534781456, + "learning_rate": 6.677207901425405e-05, + "loss": 1.7344, + "step": 13339 + }, + { + "epoch": 4.094536525475752, + "grad_norm": 0.2557326555252075, + "learning_rate": 6.676739636074542e-05, + "loss": 1.7734, + "step": 13340 + }, + { + "epoch": 4.094843462246777, + "grad_norm": 0.2741365432739258, + "learning_rate": 6.676271354153156e-05, + "loss": 1.7912, + "step": 13341 + }, + { + "epoch": 4.095150399017802, + 
"grad_norm": 0.31258970499038696, + "learning_rate": 6.675803055665874e-05, + "loss": 1.7798, + "step": 13342 + }, + { + "epoch": 4.0954573357888275, + "grad_norm": 0.30181947350502014, + "learning_rate": 6.675334740617322e-05, + "loss": 1.7746, + "step": 13343 + }, + { + "epoch": 4.095764272559853, + "grad_norm": 0.3000102937221527, + "learning_rate": 6.674866409012133e-05, + "loss": 1.7842, + "step": 13344 + }, + { + "epoch": 4.096071209330878, + "grad_norm": 0.22871005535125732, + "learning_rate": 6.674398060854931e-05, + "loss": 1.7473, + "step": 13345 + }, + { + "epoch": 4.096378146101903, + "grad_norm": 0.2700810432434082, + "learning_rate": 6.673929696150346e-05, + "loss": 1.7862, + "step": 13346 + }, + { + "epoch": 4.096685082872928, + "grad_norm": 0.27537551522254944, + "learning_rate": 6.673461314903007e-05, + "loss": 1.7843, + "step": 13347 + }, + { + "epoch": 4.096992019643953, + "grad_norm": 0.23700574040412903, + "learning_rate": 6.672992917117542e-05, + "loss": 1.765, + "step": 13348 + }, + { + "epoch": 4.097298956414979, + "grad_norm": 0.23331589996814728, + "learning_rate": 6.672524502798583e-05, + "loss": 1.7894, + "step": 13349 + }, + { + "epoch": 4.097605893186004, + "grad_norm": 0.28591978549957275, + "learning_rate": 6.672056071950753e-05, + "loss": 1.7736, + "step": 13350 + }, + { + "epoch": 4.097912829957028, + "grad_norm": 0.3000452518463135, + "learning_rate": 6.671587624578685e-05, + "loss": 1.7635, + "step": 13351 + }, + { + "epoch": 4.098219766728054, + "grad_norm": 0.21877998113632202, + "learning_rate": 6.67111916068701e-05, + "loss": 1.7225, + "step": 13352 + }, + { + "epoch": 4.098526703499079, + "grad_norm": 0.2598817050457001, + "learning_rate": 6.670650680280358e-05, + "loss": 1.6874, + "step": 13353 + }, + { + "epoch": 4.098833640270104, + "grad_norm": 0.3063203692436218, + "learning_rate": 6.670182183363353e-05, + "loss": 1.7821, + "step": 13354 + }, + { + "epoch": 4.09914057704113, + "grad_norm": 0.2328508347272873, + 
"learning_rate": 6.66971366994063e-05, + "loss": 1.788, + "step": 13355 + }, + { + "epoch": 4.099447513812155, + "grad_norm": 0.33936765789985657, + "learning_rate": 6.669245140016817e-05, + "loss": 1.8159, + "step": 13356 + }, + { + "epoch": 4.0997544505831796, + "grad_norm": 0.27464553713798523, + "learning_rate": 6.668776593596546e-05, + "loss": 1.7371, + "step": 13357 + }, + { + "epoch": 4.100061387354205, + "grad_norm": 0.24255812168121338, + "learning_rate": 6.668308030684447e-05, + "loss": 1.7993, + "step": 13358 + }, + { + "epoch": 4.10036832412523, + "grad_norm": 0.27203628420829773, + "learning_rate": 6.667839451285149e-05, + "loss": 1.8253, + "step": 13359 + }, + { + "epoch": 4.100675260896256, + "grad_norm": 0.2503862679004669, + "learning_rate": 6.667370855403286e-05, + "loss": 1.7927, + "step": 13360 + }, + { + "epoch": 4.100982197667281, + "grad_norm": 0.2616904377937317, + "learning_rate": 6.666902243043486e-05, + "loss": 1.8226, + "step": 13361 + }, + { + "epoch": 4.101289134438305, + "grad_norm": 0.26707521080970764, + "learning_rate": 6.666433614210379e-05, + "loss": 1.8485, + "step": 13362 + }, + { + "epoch": 4.101596071209331, + "grad_norm": 0.2427528202533722, + "learning_rate": 6.6659649689086e-05, + "loss": 1.7387, + "step": 13363 + }, + { + "epoch": 4.101903007980356, + "grad_norm": 0.2319549173116684, + "learning_rate": 6.66549630714278e-05, + "loss": 1.7396, + "step": 13364 + }, + { + "epoch": 4.102209944751381, + "grad_norm": 0.2248002141714096, + "learning_rate": 6.665027628917548e-05, + "loss": 1.7817, + "step": 13365 + }, + { + "epoch": 4.102516881522407, + "grad_norm": 0.21929535269737244, + "learning_rate": 6.664558934237538e-05, + "loss": 1.7478, + "step": 13366 + }, + { + "epoch": 4.102823818293431, + "grad_norm": 0.21144583821296692, + "learning_rate": 6.66409022310738e-05, + "loss": 1.7602, + "step": 13367 + }, + { + "epoch": 4.1031307550644565, + "grad_norm": 0.21984660625457764, + "learning_rate": 6.663621495531707e-05, + 
"loss": 1.7541, + "step": 13368 + }, + { + "epoch": 4.103437691835482, + "grad_norm": 0.2075357735157013, + "learning_rate": 6.663152751515152e-05, + "loss": 1.7362, + "step": 13369 + }, + { + "epoch": 4.103744628606507, + "grad_norm": 0.23316961526870728, + "learning_rate": 6.662683991062347e-05, + "loss": 1.8273, + "step": 13370 + }, + { + "epoch": 4.1040515653775325, + "grad_norm": 0.23142337799072266, + "learning_rate": 6.662215214177922e-05, + "loss": 1.7543, + "step": 13371 + }, + { + "epoch": 4.104358502148558, + "grad_norm": 0.24335260689258575, + "learning_rate": 6.661746420866515e-05, + "loss": 1.8328, + "step": 13372 + }, + { + "epoch": 4.104665438919582, + "grad_norm": 0.2440192997455597, + "learning_rate": 6.661277611132753e-05, + "loss": 1.8114, + "step": 13373 + }, + { + "epoch": 4.104972375690608, + "grad_norm": 0.252808541059494, + "learning_rate": 6.660808784981273e-05, + "loss": 1.8556, + "step": 13374 + }, + { + "epoch": 4.105279312461633, + "grad_norm": 0.24564477801322937, + "learning_rate": 6.660339942416708e-05, + "loss": 1.8231, + "step": 13375 + }, + { + "epoch": 4.105586249232658, + "grad_norm": 0.2371874898672104, + "learning_rate": 6.65987108344369e-05, + "loss": 1.7763, + "step": 13376 + }, + { + "epoch": 4.105893186003684, + "grad_norm": 0.22882802784442902, + "learning_rate": 6.659402208066854e-05, + "loss": 1.7388, + "step": 13377 + }, + { + "epoch": 4.106200122774708, + "grad_norm": 0.24857540428638458, + "learning_rate": 6.658933316290832e-05, + "loss": 1.7735, + "step": 13378 + }, + { + "epoch": 4.106507059545733, + "grad_norm": 0.22574029862880707, + "learning_rate": 6.658464408120257e-05, + "loss": 1.7403, + "step": 13379 + }, + { + "epoch": 4.106813996316759, + "grad_norm": 0.24944272637367249, + "learning_rate": 6.657995483559767e-05, + "loss": 1.7827, + "step": 13380 + }, + { + "epoch": 4.107120933087784, + "grad_norm": 0.27386224269866943, + "learning_rate": 6.657526542613992e-05, + "loss": 1.7673, + "step": 13381 + }, + { 
+ "epoch": 4.107427869858809, + "grad_norm": 0.29222097992897034, + "learning_rate": 6.65705758528757e-05, + "loss": 1.7958, + "step": 13382 + }, + { + "epoch": 4.107734806629834, + "grad_norm": 0.2471150904893875, + "learning_rate": 6.656588611585133e-05, + "loss": 1.7706, + "step": 13383 + }, + { + "epoch": 4.108041743400859, + "grad_norm": 0.289316862821579, + "learning_rate": 6.656119621511317e-05, + "loss": 1.7828, + "step": 13384 + }, + { + "epoch": 4.1083486801718845, + "grad_norm": 0.36710497736930847, + "learning_rate": 6.655650615070756e-05, + "loss": 1.712, + "step": 13385 + }, + { + "epoch": 4.10865561694291, + "grad_norm": 0.2999880611896515, + "learning_rate": 6.655181592268084e-05, + "loss": 1.7711, + "step": 13386 + }, + { + "epoch": 4.108962553713935, + "grad_norm": 0.332011342048645, + "learning_rate": 6.654712553107939e-05, + "loss": 1.907, + "step": 13387 + }, + { + "epoch": 4.1092694904849605, + "grad_norm": 0.43125995993614197, + "learning_rate": 6.654243497594953e-05, + "loss": 1.7819, + "step": 13388 + }, + { + "epoch": 4.109576427255985, + "grad_norm": 0.33719149231910706, + "learning_rate": 6.653774425733765e-05, + "loss": 1.797, + "step": 13389 + }, + { + "epoch": 4.10988336402701, + "grad_norm": 0.23091599345207214, + "learning_rate": 6.653305337529006e-05, + "loss": 1.7384, + "step": 13390 + }, + { + "epoch": 4.110190300798036, + "grad_norm": 0.4283982515335083, + "learning_rate": 6.652836232985317e-05, + "loss": 1.8284, + "step": 13391 + }, + { + "epoch": 4.110497237569061, + "grad_norm": 0.43575870990753174, + "learning_rate": 6.652367112107332e-05, + "loss": 1.7235, + "step": 13392 + }, + { + "epoch": 4.110804174340086, + "grad_norm": 0.246877059340477, + "learning_rate": 6.651897974899685e-05, + "loss": 1.7174, + "step": 13393 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.36063629388809204, + "learning_rate": 6.651428821367015e-05, + "loss": 1.8064, + "step": 13394 + }, + { + "epoch": 4.111418047882136, + "grad_norm": 
0.4454420804977417, + "learning_rate": 6.650959651513957e-05, + "loss": 1.7575, + "step": 13395 + }, + { + "epoch": 4.111724984653161, + "grad_norm": 0.2788856327533722, + "learning_rate": 6.650490465345149e-05, + "loss": 1.7696, + "step": 13396 + }, + { + "epoch": 4.112031921424187, + "grad_norm": 0.40281879901885986, + "learning_rate": 6.650021262865225e-05, + "loss": 1.8368, + "step": 13397 + }, + { + "epoch": 4.112338858195212, + "grad_norm": 0.5151103138923645, + "learning_rate": 6.649552044078825e-05, + "loss": 1.8224, + "step": 13398 + }, + { + "epoch": 4.112645794966237, + "grad_norm": 0.29390639066696167, + "learning_rate": 6.649082808990586e-05, + "loss": 1.7846, + "step": 13399 + }, + { + "epoch": 4.112952731737262, + "grad_norm": 0.3061942458152771, + "learning_rate": 6.648613557605142e-05, + "loss": 1.7954, + "step": 13400 + }, + { + "epoch": 4.113259668508287, + "grad_norm": 0.47628748416900635, + "learning_rate": 6.648144289927132e-05, + "loss": 1.7782, + "step": 13401 + }, + { + "epoch": 4.1135666052793125, + "grad_norm": 0.4299588203430176, + "learning_rate": 6.647675005961197e-05, + "loss": 1.7459, + "step": 13402 + }, + { + "epoch": 4.113873542050338, + "grad_norm": 0.24556589126586914, + "learning_rate": 6.64720570571197e-05, + "loss": 1.753, + "step": 13403 + }, + { + "epoch": 4.114180478821363, + "grad_norm": 0.29620522260665894, + "learning_rate": 6.646736389184092e-05, + "loss": 1.773, + "step": 13404 + }, + { + "epoch": 4.114487415592388, + "grad_norm": 0.37710070610046387, + "learning_rate": 6.646267056382199e-05, + "loss": 1.8389, + "step": 13405 + }, + { + "epoch": 4.114794352363413, + "grad_norm": 0.2562984824180603, + "learning_rate": 6.64579770731093e-05, + "loss": 1.7905, + "step": 13406 + }, + { + "epoch": 4.115101289134438, + "grad_norm": 0.3999946713447571, + "learning_rate": 6.645328341974924e-05, + "loss": 1.7734, + "step": 13407 + }, + { + "epoch": 4.115408225905464, + "grad_norm": 0.36087217926979065, + "learning_rate": 
6.644858960378817e-05, + "loss": 1.801, + "step": 13408 + }, + { + "epoch": 4.115715162676489, + "grad_norm": 0.2520254850387573, + "learning_rate": 6.644389562527251e-05, + "loss": 1.7394, + "step": 13409 + }, + { + "epoch": 4.116022099447513, + "grad_norm": 0.4321835935115814, + "learning_rate": 6.643920148424864e-05, + "loss": 1.8091, + "step": 13410 + }, + { + "epoch": 4.116329036218539, + "grad_norm": 0.40900173783302307, + "learning_rate": 6.643450718076294e-05, + "loss": 1.8198, + "step": 13411 + }, + { + "epoch": 4.116635972989564, + "grad_norm": 0.23693956434726715, + "learning_rate": 6.642981271486182e-05, + "loss": 1.6807, + "step": 13412 + }, + { + "epoch": 4.116942909760589, + "grad_norm": 0.33526891469955444, + "learning_rate": 6.642511808659164e-05, + "loss": 1.8673, + "step": 13413 + }, + { + "epoch": 4.117249846531615, + "grad_norm": 0.4037325382232666, + "learning_rate": 6.642042329599883e-05, + "loss": 1.743, + "step": 13414 + }, + { + "epoch": 4.11755678330264, + "grad_norm": 0.25629740953445435, + "learning_rate": 6.641572834312975e-05, + "loss": 1.6904, + "step": 13415 + }, + { + "epoch": 4.1178637200736645, + "grad_norm": 0.29203253984451294, + "learning_rate": 6.641103322803087e-05, + "loss": 1.7811, + "step": 13416 + }, + { + "epoch": 4.11817065684469, + "grad_norm": 0.423926442861557, + "learning_rate": 6.64063379507485e-05, + "loss": 1.7341, + "step": 13417 + }, + { + "epoch": 4.118477593615715, + "grad_norm": 0.29561251401901245, + "learning_rate": 6.64016425113291e-05, + "loss": 1.7915, + "step": 13418 + }, + { + "epoch": 4.1187845303867405, + "grad_norm": 0.2536832094192505, + "learning_rate": 6.639694690981903e-05, + "loss": 1.7628, + "step": 13419 + }, + { + "epoch": 4.119091467157766, + "grad_norm": 0.2931392192840576, + "learning_rate": 6.639225114626475e-05, + "loss": 1.7877, + "step": 13420 + }, + { + "epoch": 4.11939840392879, + "grad_norm": 0.2219499796628952, + "learning_rate": 6.638755522071263e-05, + "loss": 1.7183, + 
"step": 13421 + }, + { + "epoch": 4.119705340699816, + "grad_norm": 0.2951931953430176, + "learning_rate": 6.638285913320908e-05, + "loss": 1.7983, + "step": 13422 + }, + { + "epoch": 4.120012277470841, + "grad_norm": 0.3495960533618927, + "learning_rate": 6.63781628838005e-05, + "loss": 1.7531, + "step": 13423 + }, + { + "epoch": 4.120319214241866, + "grad_norm": 0.2389262616634369, + "learning_rate": 6.637346647253333e-05, + "loss": 1.7454, + "step": 13424 + }, + { + "epoch": 4.120626151012892, + "grad_norm": 0.28729167580604553, + "learning_rate": 6.636876989945395e-05, + "loss": 1.8105, + "step": 13425 + }, + { + "epoch": 4.120933087783916, + "grad_norm": 0.2620082199573517, + "learning_rate": 6.636407316460882e-05, + "loss": 1.7948, + "step": 13426 + }, + { + "epoch": 4.121240024554941, + "grad_norm": 0.2694189250469208, + "learning_rate": 6.635937626804432e-05, + "loss": 1.809, + "step": 13427 + }, + { + "epoch": 4.121546961325967, + "grad_norm": 0.2660866379737854, + "learning_rate": 6.635467920980687e-05, + "loss": 1.7431, + "step": 13428 + }, + { + "epoch": 4.121853898096992, + "grad_norm": 0.2579907774925232, + "learning_rate": 6.634998198994289e-05, + "loss": 1.7941, + "step": 13429 + }, + { + "epoch": 4.122160834868017, + "grad_norm": 0.28349989652633667, + "learning_rate": 6.634528460849881e-05, + "loss": 1.8142, + "step": 13430 + }, + { + "epoch": 4.122467771639043, + "grad_norm": 0.28716522455215454, + "learning_rate": 6.634058706552104e-05, + "loss": 1.7496, + "step": 13431 + }, + { + "epoch": 4.122774708410067, + "grad_norm": 0.23228077590465546, + "learning_rate": 6.633588936105601e-05, + "loss": 1.7399, + "step": 13432 + }, + { + "epoch": 4.1230816451810925, + "grad_norm": 0.3649841248989105, + "learning_rate": 6.633119149515017e-05, + "loss": 1.7696, + "step": 13433 + }, + { + "epoch": 4.123388581952118, + "grad_norm": 0.2757830321788788, + "learning_rate": 6.632649346784992e-05, + "loss": 1.8329, + "step": 13434 + }, + { + "epoch": 
4.123695518723143, + "grad_norm": 0.28163692355155945, + "learning_rate": 6.632179527920167e-05, + "loss": 1.7761, + "step": 13435 + }, + { + "epoch": 4.1240024554941686, + "grad_norm": 0.3453187048435211, + "learning_rate": 6.631709692925188e-05, + "loss": 1.7843, + "step": 13436 + }, + { + "epoch": 4.124309392265193, + "grad_norm": 0.2792697250843048, + "learning_rate": 6.631239841804698e-05, + "loss": 1.7889, + "step": 13437 + }, + { + "epoch": 4.124616329036218, + "grad_norm": 0.21881693601608276, + "learning_rate": 6.630769974563339e-05, + "loss": 1.8015, + "step": 13438 + }, + { + "epoch": 4.124923265807244, + "grad_norm": 0.4464910328388214, + "learning_rate": 6.630300091205756e-05, + "loss": 1.7851, + "step": 13439 + }, + { + "epoch": 4.125230202578269, + "grad_norm": 0.40191107988357544, + "learning_rate": 6.629830191736591e-05, + "loss": 1.8608, + "step": 13440 + }, + { + "epoch": 4.125537139349294, + "grad_norm": 0.2809060513973236, + "learning_rate": 6.62936027616049e-05, + "loss": 1.7374, + "step": 13441 + }, + { + "epoch": 4.12584407612032, + "grad_norm": 0.24980643391609192, + "learning_rate": 6.628890344482095e-05, + "loss": 1.8152, + "step": 13442 + }, + { + "epoch": 4.126151012891344, + "grad_norm": 0.24538342654705048, + "learning_rate": 6.62842039670605e-05, + "loss": 1.7687, + "step": 13443 + }, + { + "epoch": 4.1264579496623695, + "grad_norm": 0.24684634804725647, + "learning_rate": 6.627950432837002e-05, + "loss": 1.787, + "step": 13444 + }, + { + "epoch": 4.126764886433395, + "grad_norm": 0.22724607586860657, + "learning_rate": 6.627480452879593e-05, + "loss": 1.7871, + "step": 13445 + }, + { + "epoch": 4.12707182320442, + "grad_norm": 0.24724406003952026, + "learning_rate": 6.627010456838469e-05, + "loss": 1.7524, + "step": 13446 + }, + { + "epoch": 4.1273787599754455, + "grad_norm": 0.24219536781311035, + "learning_rate": 6.626540444718274e-05, + "loss": 1.7754, + "step": 13447 + }, + { + "epoch": 4.12768569674647, + "grad_norm": 
0.24857915937900543, + "learning_rate": 6.626070416523652e-05, + "loss": 1.7839, + "step": 13448 + }, + { + "epoch": 4.127992633517495, + "grad_norm": 0.2639105021953583, + "learning_rate": 6.625600372259248e-05, + "loss": 1.7546, + "step": 13449 + }, + { + "epoch": 4.128299570288521, + "grad_norm": 0.23598137497901917, + "learning_rate": 6.62513031192971e-05, + "loss": 1.7957, + "step": 13450 + }, + { + "epoch": 4.128606507059546, + "grad_norm": 0.3038909137248993, + "learning_rate": 6.624660235539682e-05, + "loss": 1.8117, + "step": 13451 + }, + { + "epoch": 4.128913443830571, + "grad_norm": 0.27671241760253906, + "learning_rate": 6.624190143093809e-05, + "loss": 1.729, + "step": 13452 + }, + { + "epoch": 4.129220380601596, + "grad_norm": 0.24638360738754272, + "learning_rate": 6.623720034596735e-05, + "loss": 1.7414, + "step": 13453 + }, + { + "epoch": 4.129527317372621, + "grad_norm": 0.24073924124240875, + "learning_rate": 6.623249910053111e-05, + "loss": 1.8046, + "step": 13454 + }, + { + "epoch": 4.129834254143646, + "grad_norm": 0.29734376072883606, + "learning_rate": 6.622779769467578e-05, + "loss": 1.8336, + "step": 13455 + }, + { + "epoch": 4.130141190914672, + "grad_norm": 0.23182810842990875, + "learning_rate": 6.622309612844785e-05, + "loss": 1.7742, + "step": 13456 + }, + { + "epoch": 4.130448127685697, + "grad_norm": 0.2179390788078308, + "learning_rate": 6.621839440189378e-05, + "loss": 1.7656, + "step": 13457 + }, + { + "epoch": 4.1307550644567215, + "grad_norm": 0.21389013528823853, + "learning_rate": 6.621369251506002e-05, + "loss": 1.7504, + "step": 13458 + }, + { + "epoch": 4.131062001227747, + "grad_norm": 0.22306203842163086, + "learning_rate": 6.620899046799305e-05, + "loss": 1.7573, + "step": 13459 + }, + { + "epoch": 4.131368937998772, + "grad_norm": 0.2699708938598633, + "learning_rate": 6.620428826073934e-05, + "loss": 1.7419, + "step": 13460 + }, + { + "epoch": 4.1316758747697975, + "grad_norm": 0.34087565541267395, + "learning_rate": 
6.619958589334534e-05, + "loss": 1.7545, + "step": 13461 + }, + { + "epoch": 4.131982811540823, + "grad_norm": 0.2934977412223816, + "learning_rate": 6.619488336585755e-05, + "loss": 1.7611, + "step": 13462 + }, + { + "epoch": 4.132289748311848, + "grad_norm": 0.22545567154884338, + "learning_rate": 6.619018067832243e-05, + "loss": 1.7562, + "step": 13463 + }, + { + "epoch": 4.132596685082873, + "grad_norm": 0.23334743082523346, + "learning_rate": 6.618547783078647e-05, + "loss": 1.7784, + "step": 13464 + }, + { + "epoch": 4.132903621853898, + "grad_norm": 0.22466403245925903, + "learning_rate": 6.618077482329612e-05, + "loss": 1.7277, + "step": 13465 + }, + { + "epoch": 4.133210558624923, + "grad_norm": 0.23504197597503662, + "learning_rate": 6.617607165589785e-05, + "loss": 1.7983, + "step": 13466 + }, + { + "epoch": 4.133517495395949, + "grad_norm": 0.2500833570957184, + "learning_rate": 6.617136832863819e-05, + "loss": 1.7826, + "step": 13467 + }, + { + "epoch": 4.133824432166974, + "grad_norm": 0.22398658096790314, + "learning_rate": 6.616666484156357e-05, + "loss": 1.7281, + "step": 13468 + }, + { + "epoch": 4.134131368937998, + "grad_norm": 0.2537873089313507, + "learning_rate": 6.616196119472052e-05, + "loss": 1.7598, + "step": 13469 + }, + { + "epoch": 4.134438305709024, + "grad_norm": 0.26881173253059387, + "learning_rate": 6.615725738815546e-05, + "loss": 1.8161, + "step": 13470 + }, + { + "epoch": 4.134745242480049, + "grad_norm": 0.3311346471309662, + "learning_rate": 6.615255342191492e-05, + "loss": 1.7954, + "step": 13471 + }, + { + "epoch": 4.135052179251074, + "grad_norm": 0.2562953233718872, + "learning_rate": 6.614784929604539e-05, + "loss": 1.7284, + "step": 13472 + }, + { + "epoch": 4.1353591160221, + "grad_norm": 0.2563154101371765, + "learning_rate": 6.614314501059334e-05, + "loss": 1.7995, + "step": 13473 + }, + { + "epoch": 4.135666052793125, + "grad_norm": 0.24861161410808563, + "learning_rate": 6.613844056560527e-05, + "loss": 1.7589, + 
"step": 13474 + }, + { + "epoch": 4.1359729895641495, + "grad_norm": 0.23815487325191498, + "learning_rate": 6.613373596112769e-05, + "loss": 1.6906, + "step": 13475 + }, + { + "epoch": 4.136279926335175, + "grad_norm": 0.25394049286842346, + "learning_rate": 6.612903119720705e-05, + "loss": 1.781, + "step": 13476 + }, + { + "epoch": 4.1365868631062, + "grad_norm": 0.24501466751098633, + "learning_rate": 6.612432627388988e-05, + "loss": 1.797, + "step": 13477 + }, + { + "epoch": 4.1368937998772255, + "grad_norm": 0.24909707903862, + "learning_rate": 6.611962119122267e-05, + "loss": 1.7643, + "step": 13478 + }, + { + "epoch": 4.137200736648251, + "grad_norm": 0.24954476952552795, + "learning_rate": 6.611491594925192e-05, + "loss": 1.8219, + "step": 13479 + }, + { + "epoch": 4.137507673419275, + "grad_norm": 0.30572372674942017, + "learning_rate": 6.611021054802411e-05, + "loss": 1.8039, + "step": 13480 + }, + { + "epoch": 4.137814610190301, + "grad_norm": 0.27466365694999695, + "learning_rate": 6.610550498758577e-05, + "loss": 1.6945, + "step": 13481 + }, + { + "epoch": 4.138121546961326, + "grad_norm": 0.2614271640777588, + "learning_rate": 6.610079926798339e-05, + "loss": 1.8648, + "step": 13482 + }, + { + "epoch": 4.138428483732351, + "grad_norm": 0.23645827174186707, + "learning_rate": 6.609609338926346e-05, + "loss": 1.7424, + "step": 13483 + }, + { + "epoch": 4.138735420503377, + "grad_norm": 0.24473626911640167, + "learning_rate": 6.609138735147253e-05, + "loss": 1.8036, + "step": 13484 + }, + { + "epoch": 4.139042357274401, + "grad_norm": 0.2472417950630188, + "learning_rate": 6.608668115465706e-05, + "loss": 1.794, + "step": 13485 + }, + { + "epoch": 4.139349294045426, + "grad_norm": 0.25330284237861633, + "learning_rate": 6.608197479886358e-05, + "loss": 1.8052, + "step": 13486 + }, + { + "epoch": 4.139656230816452, + "grad_norm": 0.24279309809207916, + "learning_rate": 6.60772682841386e-05, + "loss": 1.7375, + "step": 13487 + }, + { + "epoch": 
4.139963167587477, + "grad_norm": 0.22319461405277252, + "learning_rate": 6.607256161052862e-05, + "loss": 1.7696, + "step": 13488 + }, + { + "epoch": 4.140270104358502, + "grad_norm": 0.25261563062667847, + "learning_rate": 6.606785477808017e-05, + "loss": 1.7646, + "step": 13489 + }, + { + "epoch": 4.140577041129528, + "grad_norm": 0.3127744793891907, + "learning_rate": 6.606314778683977e-05, + "loss": 1.7899, + "step": 13490 + }, + { + "epoch": 4.140883977900552, + "grad_norm": 0.3550816774368286, + "learning_rate": 6.605844063685392e-05, + "loss": 1.7971, + "step": 13491 + }, + { + "epoch": 4.1411909146715775, + "grad_norm": 0.20977813005447388, + "learning_rate": 6.605373332816916e-05, + "loss": 1.7416, + "step": 13492 + }, + { + "epoch": 4.141497851442603, + "grad_norm": 0.26593849062919617, + "learning_rate": 6.6049025860832e-05, + "loss": 1.7586, + "step": 13493 + }, + { + "epoch": 4.141804788213628, + "grad_norm": 0.2452937364578247, + "learning_rate": 6.604431823488893e-05, + "loss": 1.757, + "step": 13494 + }, + { + "epoch": 4.1421117249846535, + "grad_norm": 0.21029168367385864, + "learning_rate": 6.603961045038652e-05, + "loss": 1.7665, + "step": 13495 + }, + { + "epoch": 4.142418661755678, + "grad_norm": 0.2396312952041626, + "learning_rate": 6.603490250737128e-05, + "loss": 1.7609, + "step": 13496 + }, + { + "epoch": 4.142725598526703, + "grad_norm": 0.23266808688640594, + "learning_rate": 6.603019440588975e-05, + "loss": 1.7893, + "step": 13497 + }, + { + "epoch": 4.143032535297729, + "grad_norm": 0.25235217809677124, + "learning_rate": 6.602548614598842e-05, + "loss": 1.7465, + "step": 13498 + }, + { + "epoch": 4.143339472068754, + "grad_norm": 0.22944024205207825, + "learning_rate": 6.602077772771386e-05, + "loss": 1.7052, + "step": 13499 + }, + { + "epoch": 4.143646408839779, + "grad_norm": 0.2116660475730896, + "learning_rate": 6.601606915111257e-05, + "loss": 1.7042, + "step": 13500 + }, + { + "epoch": 4.143953345610804, + "grad_norm": 
0.21777184307575226, + "learning_rate": 6.601136041623111e-05, + "loss": 1.7938, + "step": 13501 + }, + { + "epoch": 4.144260282381829, + "grad_norm": 0.23663075268268585, + "learning_rate": 6.600665152311601e-05, + "loss": 1.7475, + "step": 13502 + }, + { + "epoch": 4.144567219152854, + "grad_norm": 0.20644642412662506, + "learning_rate": 6.600194247181377e-05, + "loss": 1.7992, + "step": 13503 + }, + { + "epoch": 4.14487415592388, + "grad_norm": 0.21479010581970215, + "learning_rate": 6.599723326237098e-05, + "loss": 1.7877, + "step": 13504 + }, + { + "epoch": 4.145181092694905, + "grad_norm": 0.2266562283039093, + "learning_rate": 6.599252389483413e-05, + "loss": 1.8097, + "step": 13505 + }, + { + "epoch": 4.14548802946593, + "grad_norm": 0.2053738683462143, + "learning_rate": 6.59878143692498e-05, + "loss": 1.6878, + "step": 13506 + }, + { + "epoch": 4.145794966236955, + "grad_norm": 0.19583995640277863, + "learning_rate": 6.598310468566452e-05, + "loss": 1.7547, + "step": 13507 + }, + { + "epoch": 4.14610190300798, + "grad_norm": 0.23421542346477509, + "learning_rate": 6.597839484412484e-05, + "loss": 1.7926, + "step": 13508 + }, + { + "epoch": 4.1464088397790055, + "grad_norm": 0.24575260281562805, + "learning_rate": 6.597368484467728e-05, + "loss": 1.7311, + "step": 13509 + }, + { + "epoch": 4.146715776550031, + "grad_norm": 0.27519574761390686, + "learning_rate": 6.596897468736842e-05, + "loss": 1.7858, + "step": 13510 + }, + { + "epoch": 4.147022713321056, + "grad_norm": 0.26434022188186646, + "learning_rate": 6.596426437224477e-05, + "loss": 1.7387, + "step": 13511 + }, + { + "epoch": 4.147329650092081, + "grad_norm": 0.2192772775888443, + "learning_rate": 6.595955389935291e-05, + "loss": 1.7565, + "step": 13512 + }, + { + "epoch": 4.147636586863106, + "grad_norm": 0.21047350764274597, + "learning_rate": 6.595484326873938e-05, + "loss": 1.7234, + "step": 13513 + }, + { + "epoch": 4.147943523634131, + "grad_norm": 0.22838951647281647, + "learning_rate": 
6.595013248045075e-05, + "loss": 1.8205, + "step": 13514 + }, + { + "epoch": 4.148250460405157, + "grad_norm": 0.3467923402786255, + "learning_rate": 6.594542153453356e-05, + "loss": 1.7973, + "step": 13515 + }, + { + "epoch": 4.148557397176182, + "grad_norm": 0.241237074136734, + "learning_rate": 6.594071043103438e-05, + "loss": 1.7764, + "step": 13516 + }, + { + "epoch": 4.148864333947207, + "grad_norm": 0.22543516755104065, + "learning_rate": 6.593599916999973e-05, + "loss": 1.7528, + "step": 13517 + }, + { + "epoch": 4.149171270718232, + "grad_norm": 0.24590276181697845, + "learning_rate": 6.593128775147623e-05, + "loss": 1.7422, + "step": 13518 + }, + { + "epoch": 4.149478207489257, + "grad_norm": 0.2434391975402832, + "learning_rate": 6.592657617551038e-05, + "loss": 1.7523, + "step": 13519 + }, + { + "epoch": 4.149785144260282, + "grad_norm": 0.23169009387493134, + "learning_rate": 6.592186444214877e-05, + "loss": 1.8158, + "step": 13520 + }, + { + "epoch": 4.150092081031308, + "grad_norm": 0.2217840999364853, + "learning_rate": 6.591715255143798e-05, + "loss": 1.7487, + "step": 13521 + }, + { + "epoch": 4.150399017802333, + "grad_norm": 0.2405092418193817, + "learning_rate": 6.591244050342454e-05, + "loss": 1.7726, + "step": 13522 + }, + { + "epoch": 4.150705954573358, + "grad_norm": 0.29432612657546997, + "learning_rate": 6.590772829815504e-05, + "loss": 1.7841, + "step": 13523 + }, + { + "epoch": 4.151012891344383, + "grad_norm": 0.2708737850189209, + "learning_rate": 6.590301593567605e-05, + "loss": 1.8551, + "step": 13524 + }, + { + "epoch": 4.151319828115408, + "grad_norm": 0.26643216609954834, + "learning_rate": 6.589830341603413e-05, + "loss": 1.7697, + "step": 13525 + }, + { + "epoch": 4.151626764886434, + "grad_norm": 0.3672652840614319, + "learning_rate": 6.589359073927587e-05, + "loss": 1.8292, + "step": 13526 + }, + { + "epoch": 4.151933701657459, + "grad_norm": 0.2413325160741806, + "learning_rate": 6.588887790544782e-05, + "loss": 1.7514, + 
"step": 13527 + }, + { + "epoch": 4.152240638428483, + "grad_norm": 0.3248155117034912, + "learning_rate": 6.588416491459657e-05, + "loss": 1.7437, + "step": 13528 + }, + { + "epoch": 4.152547575199509, + "grad_norm": 0.40951836109161377, + "learning_rate": 6.587945176676869e-05, + "loss": 1.7779, + "step": 13529 + }, + { + "epoch": 4.152854511970534, + "grad_norm": 0.23874351382255554, + "learning_rate": 6.587473846201075e-05, + "loss": 1.8343, + "step": 13530 + }, + { + "epoch": 4.153161448741559, + "grad_norm": 0.4535207450389862, + "learning_rate": 6.587002500036936e-05, + "loss": 1.8301, + "step": 13531 + }, + { + "epoch": 4.153468385512585, + "grad_norm": 0.458003968000412, + "learning_rate": 6.586531138189108e-05, + "loss": 1.7053, + "step": 13532 + }, + { + "epoch": 4.153775322283609, + "grad_norm": 0.24350887537002563, + "learning_rate": 6.586059760662248e-05, + "loss": 1.7642, + "step": 13533 + }, + { + "epoch": 4.1540822590546345, + "grad_norm": 0.46951553225517273, + "learning_rate": 6.585588367461017e-05, + "loss": 1.7345, + "step": 13534 + }, + { + "epoch": 4.15438919582566, + "grad_norm": 0.5524527430534363, + "learning_rate": 6.585116958590072e-05, + "loss": 1.7677, + "step": 13535 + }, + { + "epoch": 4.154696132596685, + "grad_norm": 0.2887112498283386, + "learning_rate": 6.584645534054072e-05, + "loss": 1.7704, + "step": 13536 + }, + { + "epoch": 4.1550030693677105, + "grad_norm": 0.36243724822998047, + "learning_rate": 6.584174093857675e-05, + "loss": 1.8133, + "step": 13537 + }, + { + "epoch": 4.155310006138736, + "grad_norm": 0.3869550824165344, + "learning_rate": 6.583702638005543e-05, + "loss": 1.7253, + "step": 13538 + }, + { + "epoch": 4.15561694290976, + "grad_norm": 0.25859662890434265, + "learning_rate": 6.583231166502333e-05, + "loss": 1.7683, + "step": 13539 + }, + { + "epoch": 4.155923879680786, + "grad_norm": 0.3011144995689392, + "learning_rate": 6.582759679352704e-05, + "loss": 1.7139, + "step": 13540 + }, + { + "epoch": 
4.156230816451811, + "grad_norm": 0.38033372163772583, + "learning_rate": 6.582288176561316e-05, + "loss": 1.8182, + "step": 13541 + }, + { + "epoch": 4.156537753222836, + "grad_norm": 0.2224060595035553, + "learning_rate": 6.581816658132829e-05, + "loss": 1.7527, + "step": 13542 + }, + { + "epoch": 4.156844689993862, + "grad_norm": 0.4147234261035919, + "learning_rate": 6.581345124071903e-05, + "loss": 1.7339, + "step": 13543 + }, + { + "epoch": 4.157151626764886, + "grad_norm": 0.45334625244140625, + "learning_rate": 6.580873574383198e-05, + "loss": 1.8166, + "step": 13544 + }, + { + "epoch": 4.157458563535911, + "grad_norm": 0.3050530254840851, + "learning_rate": 6.580402009071372e-05, + "loss": 1.7967, + "step": 13545 + }, + { + "epoch": 4.157765500306937, + "grad_norm": 0.25901293754577637, + "learning_rate": 6.579930428141088e-05, + "loss": 1.7806, + "step": 13546 + }, + { + "epoch": 4.158072437077962, + "grad_norm": 0.3142934739589691, + "learning_rate": 6.579458831597006e-05, + "loss": 1.7724, + "step": 13547 + }, + { + "epoch": 4.158379373848987, + "grad_norm": 0.23943179845809937, + "learning_rate": 6.578987219443787e-05, + "loss": 1.7515, + "step": 13548 + }, + { + "epoch": 4.158686310620013, + "grad_norm": 0.2838635742664337, + "learning_rate": 6.578515591686089e-05, + "loss": 1.7707, + "step": 13549 + }, + { + "epoch": 4.158993247391037, + "grad_norm": 0.3064457178115845, + "learning_rate": 6.578043948328575e-05, + "loss": 1.7839, + "step": 13550 + }, + { + "epoch": 4.1593001841620625, + "grad_norm": 0.2311718463897705, + "learning_rate": 6.577572289375907e-05, + "loss": 1.8298, + "step": 13551 + }, + { + "epoch": 4.159607120933088, + "grad_norm": 0.35726481676101685, + "learning_rate": 6.577100614832743e-05, + "loss": 1.811, + "step": 13552 + }, + { + "epoch": 4.159914057704113, + "grad_norm": 0.3176140785217285, + "learning_rate": 6.576628924703749e-05, + "loss": 1.732, + "step": 13553 + }, + { + "epoch": 4.1602209944751385, + "grad_norm": 
0.2325647473335266, + "learning_rate": 6.576157218993582e-05, + "loss": 1.827, + "step": 13554 + }, + { + "epoch": 4.160527931246163, + "grad_norm": 0.32260453701019287, + "learning_rate": 6.575685497706905e-05, + "loss": 1.8218, + "step": 13555 + }, + { + "epoch": 4.160834868017188, + "grad_norm": 0.2638537287712097, + "learning_rate": 6.575213760848382e-05, + "loss": 1.7091, + "step": 13556 + }, + { + "epoch": 4.161141804788214, + "grad_norm": 0.2501799762248993, + "learning_rate": 6.574742008422671e-05, + "loss": 1.7707, + "step": 13557 + }, + { + "epoch": 4.161448741559239, + "grad_norm": 0.3212645649909973, + "learning_rate": 6.574270240434439e-05, + "loss": 1.7541, + "step": 13558 + }, + { + "epoch": 4.161755678330264, + "grad_norm": 0.25915586948394775, + "learning_rate": 6.573798456888345e-05, + "loss": 1.7597, + "step": 13559 + }, + { + "epoch": 4.162062615101289, + "grad_norm": 0.2538192868232727, + "learning_rate": 6.573326657789052e-05, + "loss": 1.8507, + "step": 13560 + }, + { + "epoch": 4.162369551872314, + "grad_norm": 0.2542131543159485, + "learning_rate": 6.572854843141223e-05, + "loss": 1.782, + "step": 13561 + }, + { + "epoch": 4.162676488643339, + "grad_norm": 0.26163414120674133, + "learning_rate": 6.572383012949521e-05, + "loss": 1.8482, + "step": 13562 + }, + { + "epoch": 4.162983425414365, + "grad_norm": 0.2566238343715668, + "learning_rate": 6.571911167218608e-05, + "loss": 1.7284, + "step": 13563 + }, + { + "epoch": 4.16329036218539, + "grad_norm": 0.28413113951683044, + "learning_rate": 6.571439305953147e-05, + "loss": 1.7473, + "step": 13564 + }, + { + "epoch": 4.163597298956415, + "grad_norm": 0.20399242639541626, + "learning_rate": 6.570967429157802e-05, + "loss": 1.6942, + "step": 13565 + }, + { + "epoch": 4.16390423572744, + "grad_norm": 0.256104439496994, + "learning_rate": 6.570495536837235e-05, + "loss": 1.7346, + "step": 13566 + }, + { + "epoch": 4.164211172498465, + "grad_norm": 0.350909560918808, + "learning_rate": 
6.570023628996112e-05, + "loss": 1.8284, + "step": 13567 + }, + { + "epoch": 4.1645181092694905, + "grad_norm": 0.23500367999076843, + "learning_rate": 6.569551705639096e-05, + "loss": 1.7504, + "step": 13568 + }, + { + "epoch": 4.164825046040516, + "grad_norm": 0.26683783531188965, + "learning_rate": 6.569079766770849e-05, + "loss": 1.7293, + "step": 13569 + }, + { + "epoch": 4.165131982811541, + "grad_norm": 0.3145855963230133, + "learning_rate": 6.568607812396037e-05, + "loss": 1.8171, + "step": 13570 + }, + { + "epoch": 4.165438919582566, + "grad_norm": 0.2354860156774521, + "learning_rate": 6.568135842519324e-05, + "loss": 1.7555, + "step": 13571 + }, + { + "epoch": 4.165745856353591, + "grad_norm": 0.2893243730068207, + "learning_rate": 6.56766385714537e-05, + "loss": 1.7636, + "step": 13572 + }, + { + "epoch": 4.166052793124616, + "grad_norm": 0.20707663893699646, + "learning_rate": 6.567191856278846e-05, + "loss": 1.7239, + "step": 13573 + }, + { + "epoch": 4.166359729895642, + "grad_norm": 0.34200331568717957, + "learning_rate": 6.566719839924412e-05, + "loss": 1.7848, + "step": 13574 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.23326615989208221, + "learning_rate": 6.566247808086734e-05, + "loss": 1.7447, + "step": 13575 + }, + { + "epoch": 4.166973603437691, + "grad_norm": 0.22375629842281342, + "learning_rate": 6.565775760770479e-05, + "loss": 1.7429, + "step": 13576 + }, + { + "epoch": 4.167280540208717, + "grad_norm": 0.2412862777709961, + "learning_rate": 6.565303697980308e-05, + "loss": 1.7671, + "step": 13577 + }, + { + "epoch": 4.167587476979742, + "grad_norm": 0.2482215315103531, + "learning_rate": 6.56483161972089e-05, + "loss": 1.812, + "step": 13578 + }, + { + "epoch": 4.167894413750767, + "grad_norm": 0.2252974659204483, + "learning_rate": 6.564359525996889e-05, + "loss": 1.8173, + "step": 13579 + }, + { + "epoch": 4.168201350521793, + "grad_norm": 0.23497292399406433, + "learning_rate": 6.563887416812969e-05, + "loss": 1.7945, + 
"step": 13580 + }, + { + "epoch": 4.168508287292818, + "grad_norm": 0.24911245703697205, + "learning_rate": 6.563415292173796e-05, + "loss": 1.7516, + "step": 13581 + }, + { + "epoch": 4.1688152240638425, + "grad_norm": 0.20920930802822113, + "learning_rate": 6.562943152084039e-05, + "loss": 1.765, + "step": 13582 + }, + { + "epoch": 4.169122160834868, + "grad_norm": 0.26001816987991333, + "learning_rate": 6.562470996548361e-05, + "loss": 1.7504, + "step": 13583 + }, + { + "epoch": 4.169429097605893, + "grad_norm": 0.2504529058933258, + "learning_rate": 6.561998825571429e-05, + "loss": 1.7689, + "step": 13584 + }, + { + "epoch": 4.1697360343769185, + "grad_norm": 0.2210187464952469, + "learning_rate": 6.561526639157908e-05, + "loss": 1.752, + "step": 13585 + }, + { + "epoch": 4.170042971147944, + "grad_norm": 0.26323240995407104, + "learning_rate": 6.561054437312467e-05, + "loss": 1.8104, + "step": 13586 + }, + { + "epoch": 4.170349907918968, + "grad_norm": 0.20436744391918182, + "learning_rate": 6.560582220039771e-05, + "loss": 1.7281, + "step": 13587 + }, + { + "epoch": 4.170656844689994, + "grad_norm": 0.2053878903388977, + "learning_rate": 6.560109987344487e-05, + "loss": 1.7192, + "step": 13588 + }, + { + "epoch": 4.170963781461019, + "grad_norm": 0.2416568547487259, + "learning_rate": 6.559637739231281e-05, + "loss": 1.7679, + "step": 13589 + }, + { + "epoch": 4.171270718232044, + "grad_norm": 0.23847989737987518, + "learning_rate": 6.55916547570482e-05, + "loss": 1.7182, + "step": 13590 + }, + { + "epoch": 4.17157765500307, + "grad_norm": 0.2057785540819168, + "learning_rate": 6.558693196769772e-05, + "loss": 1.816, + "step": 13591 + }, + { + "epoch": 4.171884591774095, + "grad_norm": 0.2270805537700653, + "learning_rate": 6.558220902430804e-05, + "loss": 1.7091, + "step": 13592 + }, + { + "epoch": 4.172191528545119, + "grad_norm": 0.22143644094467163, + "learning_rate": 6.557748592692585e-05, + "loss": 1.7446, + "step": 13593 + }, + { + "epoch": 
4.172498465316145, + "grad_norm": 0.2032770961523056, + "learning_rate": 6.557276267559781e-05, + "loss": 1.7501, + "step": 13594 + }, + { + "epoch": 4.17280540208717, + "grad_norm": 0.20851244032382965, + "learning_rate": 6.55680392703706e-05, + "loss": 1.8283, + "step": 13595 + }, + { + "epoch": 4.173112338858195, + "grad_norm": 0.2603934109210968, + "learning_rate": 6.55633157112909e-05, + "loss": 1.8523, + "step": 13596 + }, + { + "epoch": 4.173419275629221, + "grad_norm": 0.2232515811920166, + "learning_rate": 6.55585919984054e-05, + "loss": 1.7803, + "step": 13597 + }, + { + "epoch": 4.173726212400245, + "grad_norm": 0.2541115880012512, + "learning_rate": 6.555386813176075e-05, + "loss": 1.7407, + "step": 13598 + }, + { + "epoch": 4.1740331491712706, + "grad_norm": 0.3044603765010834, + "learning_rate": 6.55491441114037e-05, + "loss": 1.8257, + "step": 13599 + }, + { + "epoch": 4.174340085942296, + "grad_norm": 0.29227301478385925, + "learning_rate": 6.554441993738086e-05, + "loss": 1.7998, + "step": 13600 + }, + { + "epoch": 4.174647022713321, + "grad_norm": 0.25166594982147217, + "learning_rate": 6.553969560973896e-05, + "loss": 1.8258, + "step": 13601 + }, + { + "epoch": 4.1749539594843466, + "grad_norm": 0.22973991930484772, + "learning_rate": 6.55349711285247e-05, + "loss": 1.7871, + "step": 13602 + }, + { + "epoch": 4.175260896255371, + "grad_norm": 0.2615009844303131, + "learning_rate": 6.553024649378473e-05, + "loss": 1.7572, + "step": 13603 + }, + { + "epoch": 4.175567833026396, + "grad_norm": 0.24145473539829254, + "learning_rate": 6.552552170556576e-05, + "loss": 1.7546, + "step": 13604 + }, + { + "epoch": 4.175874769797422, + "grad_norm": 0.21989156305789948, + "learning_rate": 6.55207967639145e-05, + "loss": 1.6939, + "step": 13605 + }, + { + "epoch": 4.176181706568447, + "grad_norm": 0.206025168299675, + "learning_rate": 6.551607166887761e-05, + "loss": 1.7531, + "step": 13606 + }, + { + "epoch": 4.176488643339472, + "grad_norm": 
0.2175903469324112, + "learning_rate": 6.551134642050181e-05, + "loss": 1.7631, + "step": 13607 + }, + { + "epoch": 4.176795580110497, + "grad_norm": 0.23259282112121582, + "learning_rate": 6.550662101883379e-05, + "loss": 1.7773, + "step": 13608 + }, + { + "epoch": 4.177102516881522, + "grad_norm": 0.23955227434635162, + "learning_rate": 6.550189546392025e-05, + "loss": 1.7321, + "step": 13609 + }, + { + "epoch": 4.1774094536525475, + "grad_norm": 0.23614998161792755, + "learning_rate": 6.549716975580792e-05, + "loss": 1.7855, + "step": 13610 + }, + { + "epoch": 4.177716390423573, + "grad_norm": 0.2274426817893982, + "learning_rate": 6.549244389454345e-05, + "loss": 1.7778, + "step": 13611 + }, + { + "epoch": 4.178023327194598, + "grad_norm": 0.2204308807849884, + "learning_rate": 6.548771788017358e-05, + "loss": 1.7175, + "step": 13612 + }, + { + "epoch": 4.1783302639656235, + "grad_norm": 0.2283930778503418, + "learning_rate": 6.548299171274501e-05, + "loss": 1.8081, + "step": 13613 + }, + { + "epoch": 4.178637200736648, + "grad_norm": 0.25433486700057983, + "learning_rate": 6.547826539230442e-05, + "loss": 1.8009, + "step": 13614 + }, + { + "epoch": 4.178944137507673, + "grad_norm": 0.24452579021453857, + "learning_rate": 6.547353891889856e-05, + "loss": 1.7244, + "step": 13615 + }, + { + "epoch": 4.179251074278699, + "grad_norm": 0.20611275732517242, + "learning_rate": 6.546881229257411e-05, + "loss": 1.7566, + "step": 13616 + }, + { + "epoch": 4.179558011049724, + "grad_norm": 0.24557232856750488, + "learning_rate": 6.546408551337779e-05, + "loss": 1.7638, + "step": 13617 + }, + { + "epoch": 4.179864947820749, + "grad_norm": 0.2158801257610321, + "learning_rate": 6.545935858135631e-05, + "loss": 1.7659, + "step": 13618 + }, + { + "epoch": 4.180171884591774, + "grad_norm": 0.23800688982009888, + "learning_rate": 6.54546314965564e-05, + "loss": 1.7468, + "step": 13619 + }, + { + "epoch": 4.180478821362799, + "grad_norm": 0.2504122853279114, + "learning_rate": 
6.544990425902476e-05, + "loss": 1.7682, + "step": 13620 + }, + { + "epoch": 4.180785758133824, + "grad_norm": 0.21556814014911652, + "learning_rate": 6.54451768688081e-05, + "loss": 1.772, + "step": 13621 + }, + { + "epoch": 4.18109269490485, + "grad_norm": 0.23404552042484283, + "learning_rate": 6.544044932595315e-05, + "loss": 1.7844, + "step": 13622 + }, + { + "epoch": 4.181399631675875, + "grad_norm": 0.22129055857658386, + "learning_rate": 6.543572163050664e-05, + "loss": 1.7725, + "step": 13623 + }, + { + "epoch": 4.1817065684469, + "grad_norm": 0.2533521354198456, + "learning_rate": 6.543099378251528e-05, + "loss": 1.7908, + "step": 13624 + }, + { + "epoch": 4.182013505217925, + "grad_norm": 0.2905815541744232, + "learning_rate": 6.542626578202579e-05, + "loss": 1.7913, + "step": 13625 + }, + { + "epoch": 4.18232044198895, + "grad_norm": 0.3330783247947693, + "learning_rate": 6.54215376290849e-05, + "loss": 1.8374, + "step": 13626 + }, + { + "epoch": 4.1826273787599755, + "grad_norm": 0.29268717765808105, + "learning_rate": 6.541680932373933e-05, + "loss": 1.8714, + "step": 13627 + }, + { + "epoch": 4.182934315531001, + "grad_norm": 0.2820781171321869, + "learning_rate": 6.541208086603584e-05, + "loss": 1.8089, + "step": 13628 + }, + { + "epoch": 4.183241252302026, + "grad_norm": 0.3062323033809662, + "learning_rate": 6.54073522560211e-05, + "loss": 1.7307, + "step": 13629 + }, + { + "epoch": 4.183548189073051, + "grad_norm": 0.3010510504245758, + "learning_rate": 6.54026234937419e-05, + "loss": 1.7523, + "step": 13630 + }, + { + "epoch": 4.183855125844076, + "grad_norm": 0.21932095289230347, + "learning_rate": 6.539789457924493e-05, + "loss": 1.737, + "step": 13631 + }, + { + "epoch": 4.184162062615101, + "grad_norm": 0.2710212469100952, + "learning_rate": 6.539316551257695e-05, + "loss": 1.7228, + "step": 13632 + }, + { + "epoch": 4.184468999386127, + "grad_norm": 0.2885816991329193, + "learning_rate": 6.538843629378469e-05, + "loss": 1.8734, + "step": 
13633 + }, + { + "epoch": 4.184775936157152, + "grad_norm": 0.2621026635169983, + "learning_rate": 6.538370692291487e-05, + "loss": 1.7884, + "step": 13634 + }, + { + "epoch": 4.185082872928176, + "grad_norm": 0.30503126978874207, + "learning_rate": 6.537897740001426e-05, + "loss": 1.7833, + "step": 13635 + }, + { + "epoch": 4.185389809699202, + "grad_norm": 0.29491373896598816, + "learning_rate": 6.537424772512955e-05, + "loss": 1.7894, + "step": 13636 + }, + { + "epoch": 4.185696746470227, + "grad_norm": 0.24423296749591827, + "learning_rate": 6.536951789830754e-05, + "loss": 1.7409, + "step": 13637 + }, + { + "epoch": 4.186003683241252, + "grad_norm": 0.2184748351573944, + "learning_rate": 6.536478791959495e-05, + "loss": 1.747, + "step": 13638 + }, + { + "epoch": 4.186310620012278, + "grad_norm": 0.2348455935716629, + "learning_rate": 6.53600577890385e-05, + "loss": 1.7422, + "step": 13639 + }, + { + "epoch": 4.186617556783303, + "grad_norm": 0.2554566264152527, + "learning_rate": 6.535532750668497e-05, + "loss": 1.7623, + "step": 13640 + }, + { + "epoch": 4.1869244935543275, + "grad_norm": 0.26424553990364075, + "learning_rate": 6.535059707258109e-05, + "loss": 1.8408, + "step": 13641 + }, + { + "epoch": 4.187231430325353, + "grad_norm": 0.35363274812698364, + "learning_rate": 6.534586648677361e-05, + "loss": 1.7435, + "step": 13642 + }, + { + "epoch": 4.187538367096378, + "grad_norm": 0.3225265443325043, + "learning_rate": 6.534113574930926e-05, + "loss": 1.7181, + "step": 13643 + }, + { + "epoch": 4.1878453038674035, + "grad_norm": 0.23529650270938873, + "learning_rate": 6.533640486023485e-05, + "loss": 1.7712, + "step": 13644 + }, + { + "epoch": 4.188152240638429, + "grad_norm": 0.3490132987499237, + "learning_rate": 6.53316738195971e-05, + "loss": 1.7329, + "step": 13645 + }, + { + "epoch": 4.188459177409453, + "grad_norm": 0.3759285509586334, + "learning_rate": 6.532694262744274e-05, + "loss": 1.802, + "step": 13646 + }, + { + "epoch": 4.188766114180479, 
+ "grad_norm": 0.27383577823638916, + "learning_rate": 6.532221128381858e-05, + "loss": 1.801, + "step": 13647 + }, + { + "epoch": 4.189073050951504, + "grad_norm": 0.23240652680397034, + "learning_rate": 6.531747978877132e-05, + "loss": 1.8415, + "step": 13648 + }, + { + "epoch": 4.189379987722529, + "grad_norm": 0.3302704989910126, + "learning_rate": 6.531274814234773e-05, + "loss": 1.7765, + "step": 13649 + }, + { + "epoch": 4.189686924493555, + "grad_norm": 0.3209368586540222, + "learning_rate": 6.530801634459463e-05, + "loss": 1.6935, + "step": 13650 + }, + { + "epoch": 4.189993861264579, + "grad_norm": 0.26643648743629456, + "learning_rate": 6.530328439555872e-05, + "loss": 1.8159, + "step": 13651 + }, + { + "epoch": 4.190300798035604, + "grad_norm": 0.22594431042671204, + "learning_rate": 6.529855229528679e-05, + "loss": 1.7764, + "step": 13652 + }, + { + "epoch": 4.19060773480663, + "grad_norm": 0.3288109302520752, + "learning_rate": 6.529382004382561e-05, + "loss": 1.7963, + "step": 13653 + }, + { + "epoch": 4.190914671577655, + "grad_norm": 0.3067106604576111, + "learning_rate": 6.528908764122191e-05, + "loss": 1.7564, + "step": 13654 + }, + { + "epoch": 4.19122160834868, + "grad_norm": 0.23437078297138214, + "learning_rate": 6.528435508752249e-05, + "loss": 1.759, + "step": 13655 + }, + { + "epoch": 4.191528545119706, + "grad_norm": 0.30662333965301514, + "learning_rate": 6.527962238277413e-05, + "loss": 1.7549, + "step": 13656 + }, + { + "epoch": 4.19183548189073, + "grad_norm": 0.3545009195804596, + "learning_rate": 6.527488952702356e-05, + "loss": 1.7761, + "step": 13657 + }, + { + "epoch": 4.1921424186617555, + "grad_norm": 0.2509438991546631, + "learning_rate": 6.52701565203176e-05, + "loss": 1.7162, + "step": 13658 + }, + { + "epoch": 4.192449355432781, + "grad_norm": 0.24423806369304657, + "learning_rate": 6.5265423362703e-05, + "loss": 1.735, + "step": 13659 + }, + { + "epoch": 4.192756292203806, + "grad_norm": 0.37365156412124634, + 
"learning_rate": 6.526069005422654e-05, + "loss": 1.7697, + "step": 13660 + }, + { + "epoch": 4.1930632289748315, + "grad_norm": 0.4025731682777405, + "learning_rate": 6.525595659493499e-05, + "loss": 1.7931, + "step": 13661 + }, + { + "epoch": 4.193370165745856, + "grad_norm": 0.31360915303230286, + "learning_rate": 6.525122298487514e-05, + "loss": 1.8014, + "step": 13662 + }, + { + "epoch": 4.193677102516881, + "grad_norm": 0.2480524778366089, + "learning_rate": 6.524648922409376e-05, + "loss": 1.7753, + "step": 13663 + }, + { + "epoch": 4.193984039287907, + "grad_norm": 0.33740919828414917, + "learning_rate": 6.524175531263765e-05, + "loss": 1.7296, + "step": 13664 + }, + { + "epoch": 4.194290976058932, + "grad_norm": 0.26871639490127563, + "learning_rate": 6.523702125055358e-05, + "loss": 1.7113, + "step": 13665 + }, + { + "epoch": 4.194597912829957, + "grad_norm": 0.2687455415725708, + "learning_rate": 6.52322870378883e-05, + "loss": 1.7645, + "step": 13666 + }, + { + "epoch": 4.194904849600983, + "grad_norm": 0.4207400679588318, + "learning_rate": 6.522755267468868e-05, + "loss": 1.7758, + "step": 13667 + }, + { + "epoch": 4.195211786372007, + "grad_norm": 0.36043494939804077, + "learning_rate": 6.522281816100142e-05, + "loss": 1.7433, + "step": 13668 + }, + { + "epoch": 4.195518723143032, + "grad_norm": 0.2515890598297119, + "learning_rate": 6.52180834968734e-05, + "loss": 1.7646, + "step": 13669 + }, + { + "epoch": 4.195825659914058, + "grad_norm": 0.2871458828449249, + "learning_rate": 6.521334868235132e-05, + "loss": 1.8147, + "step": 13670 + }, + { + "epoch": 4.196132596685083, + "grad_norm": 0.28454354405403137, + "learning_rate": 6.5208613717482e-05, + "loss": 1.8576, + "step": 13671 + }, + { + "epoch": 4.196439533456108, + "grad_norm": 0.2520541548728943, + "learning_rate": 6.520387860231227e-05, + "loss": 1.7513, + "step": 13672 + }, + { + "epoch": 4.196746470227133, + "grad_norm": 0.22782307863235474, + "learning_rate": 6.51991433368889e-05, + 
"loss": 1.7737, + "step": 13673 + }, + { + "epoch": 4.197053406998158, + "grad_norm": 0.2451259195804596, + "learning_rate": 6.519440792125869e-05, + "loss": 1.7483, + "step": 13674 + }, + { + "epoch": 4.1973603437691835, + "grad_norm": 0.21915963292121887, + "learning_rate": 6.518967235546841e-05, + "loss": 1.718, + "step": 13675 + }, + { + "epoch": 4.197667280540209, + "grad_norm": 0.23005805909633636, + "learning_rate": 6.51849366395649e-05, + "loss": 1.7786, + "step": 13676 + }, + { + "epoch": 4.197974217311234, + "grad_norm": 0.25039517879486084, + "learning_rate": 6.518020077359494e-05, + "loss": 1.7785, + "step": 13677 + }, + { + "epoch": 4.198281154082259, + "grad_norm": 0.26631081104278564, + "learning_rate": 6.517546475760535e-05, + "loss": 1.7921, + "step": 13678 + }, + { + "epoch": 4.198588090853284, + "grad_norm": 0.2220793515443802, + "learning_rate": 6.517072859164292e-05, + "loss": 1.7696, + "step": 13679 + }, + { + "epoch": 4.198895027624309, + "grad_norm": 0.24681030213832855, + "learning_rate": 6.516599227575446e-05, + "loss": 1.7702, + "step": 13680 + }, + { + "epoch": 4.199201964395335, + "grad_norm": 0.2421828955411911, + "learning_rate": 6.516125580998678e-05, + "loss": 1.8058, + "step": 13681 + }, + { + "epoch": 4.19950890116636, + "grad_norm": 0.2170087695121765, + "learning_rate": 6.515651919438667e-05, + "loss": 1.7271, + "step": 13682 + }, + { + "epoch": 4.199815837937384, + "grad_norm": 0.23383566737174988, + "learning_rate": 6.515178242900096e-05, + "loss": 1.7515, + "step": 13683 + }, + { + "epoch": 4.20012277470841, + "grad_norm": 0.2522997558116913, + "learning_rate": 6.514704551387645e-05, + "loss": 1.7619, + "step": 13684 + }, + { + "epoch": 4.200429711479435, + "grad_norm": 0.20973703265190125, + "learning_rate": 6.514230844905995e-05, + "loss": 1.7326, + "step": 13685 + }, + { + "epoch": 4.2007366482504604, + "grad_norm": 0.2308073341846466, + "learning_rate": 6.513757123459832e-05, + "loss": 1.811, + "step": 13686 + }, + { + 
"epoch": 4.201043585021486, + "grad_norm": 0.21751229465007782, + "learning_rate": 6.51328338705383e-05, + "loss": 1.7795, + "step": 13687 + }, + { + "epoch": 4.201350521792511, + "grad_norm": 0.2357407957315445, + "learning_rate": 6.512809635692675e-05, + "loss": 1.8069, + "step": 13688 + }, + { + "epoch": 4.201657458563536, + "grad_norm": 0.32245033979415894, + "learning_rate": 6.51233586938105e-05, + "loss": 1.8179, + "step": 13689 + }, + { + "epoch": 4.201964395334561, + "grad_norm": 0.22740167379379272, + "learning_rate": 6.511862088123635e-05, + "loss": 1.7482, + "step": 13690 + }, + { + "epoch": 4.202271332105586, + "grad_norm": 0.26880496740341187, + "learning_rate": 6.511388291925114e-05, + "loss": 1.7919, + "step": 13691 + }, + { + "epoch": 4.202578268876612, + "grad_norm": 0.2261822521686554, + "learning_rate": 6.510914480790166e-05, + "loss": 1.7543, + "step": 13692 + }, + { + "epoch": 4.202885205647637, + "grad_norm": 0.2635782063007355, + "learning_rate": 6.510440654723477e-05, + "loss": 1.7874, + "step": 13693 + }, + { + "epoch": 4.203192142418661, + "grad_norm": 0.2505982518196106, + "learning_rate": 6.509966813729726e-05, + "loss": 1.8016, + "step": 13694 + }, + { + "epoch": 4.203499079189687, + "grad_norm": 0.23177236318588257, + "learning_rate": 6.5094929578136e-05, + "loss": 1.7582, + "step": 13695 + }, + { + "epoch": 4.203806015960712, + "grad_norm": 0.2315056324005127, + "learning_rate": 6.509019086979779e-05, + "loss": 1.7418, + "step": 13696 + }, + { + "epoch": 4.204112952731737, + "grad_norm": 0.25565484166145325, + "learning_rate": 6.508545201232947e-05, + "loss": 1.7476, + "step": 13697 + }, + { + "epoch": 4.204419889502763, + "grad_norm": 0.29210081696510315, + "learning_rate": 6.508071300577787e-05, + "loss": 1.8397, + "step": 13698 + }, + { + "epoch": 4.204726826273788, + "grad_norm": 0.2830582559108734, + "learning_rate": 6.507597385018984e-05, + "loss": 1.834, + "step": 13699 + }, + { + "epoch": 4.2050337630448125, + "grad_norm": 
0.23013398051261902, + "learning_rate": 6.507123454561217e-05, + "loss": 1.7593, + "step": 13700 + }, + { + "epoch": 4.205340699815838, + "grad_norm": 0.21970276534557343, + "learning_rate": 6.506649509209174e-05, + "loss": 1.754, + "step": 13701 + }, + { + "epoch": 4.205647636586863, + "grad_norm": 0.32052233815193176, + "learning_rate": 6.50617554896754e-05, + "loss": 1.7531, + "step": 13702 + }, + { + "epoch": 4.2059545733578885, + "grad_norm": 0.2597332000732422, + "learning_rate": 6.505701573840995e-05, + "loss": 1.7836, + "step": 13703 + }, + { + "epoch": 4.206261510128914, + "grad_norm": 0.22070355713367462, + "learning_rate": 6.505227583834224e-05, + "loss": 1.7225, + "step": 13704 + }, + { + "epoch": 4.206568446899938, + "grad_norm": 0.27219358086586, + "learning_rate": 6.50475357895191e-05, + "loss": 1.8215, + "step": 13705 + }, + { + "epoch": 4.206875383670964, + "grad_norm": 0.32541659474372864, + "learning_rate": 6.504279559198741e-05, + "loss": 1.7786, + "step": 13706 + }, + { + "epoch": 4.207182320441989, + "grad_norm": 0.25871729850769043, + "learning_rate": 6.5038055245794e-05, + "loss": 1.7621, + "step": 13707 + }, + { + "epoch": 4.207489257213014, + "grad_norm": 0.2190464735031128, + "learning_rate": 6.50333147509857e-05, + "loss": 1.7612, + "step": 13708 + }, + { + "epoch": 4.20779619398404, + "grad_norm": 0.19565832614898682, + "learning_rate": 6.50285741076094e-05, + "loss": 1.7581, + "step": 13709 + }, + { + "epoch": 4.208103130755064, + "grad_norm": 0.1889251321554184, + "learning_rate": 6.50238333157119e-05, + "loss": 1.7611, + "step": 13710 + }, + { + "epoch": 4.208410067526089, + "grad_norm": 0.2013053596019745, + "learning_rate": 6.501909237534008e-05, + "loss": 1.7393, + "step": 13711 + }, + { + "epoch": 4.208717004297115, + "grad_norm": 0.1899433434009552, + "learning_rate": 6.501435128654077e-05, + "loss": 1.7122, + "step": 13712 + }, + { + "epoch": 4.20902394106814, + "grad_norm": 0.19337882101535797, + "learning_rate": 
6.500961004936085e-05, + "loss": 1.7538, + "step": 13713 + }, + { + "epoch": 4.209330877839165, + "grad_norm": 0.20419920980930328, + "learning_rate": 6.500486866384718e-05, + "loss": 1.728, + "step": 13714 + }, + { + "epoch": 4.209637814610191, + "grad_norm": 0.20615679025650024, + "learning_rate": 6.50001271300466e-05, + "loss": 1.7843, + "step": 13715 + }, + { + "epoch": 4.209944751381215, + "grad_norm": 0.22178977727890015, + "learning_rate": 6.499538544800596e-05, + "loss": 1.7751, + "step": 13716 + }, + { + "epoch": 4.2102516881522405, + "grad_norm": 0.23703891038894653, + "learning_rate": 6.499064361777214e-05, + "loss": 1.7304, + "step": 13717 + }, + { + "epoch": 4.210558624923266, + "grad_norm": 0.2785723805427551, + "learning_rate": 6.498590163939198e-05, + "loss": 1.802, + "step": 13718 + }, + { + "epoch": 4.210865561694291, + "grad_norm": 0.23277060687541962, + "learning_rate": 6.498115951291237e-05, + "loss": 1.7316, + "step": 13719 + }, + { + "epoch": 4.2111724984653165, + "grad_norm": 0.22289474308490753, + "learning_rate": 6.497641723838017e-05, + "loss": 1.8469, + "step": 13720 + }, + { + "epoch": 4.211479435236341, + "grad_norm": 0.2715846002101898, + "learning_rate": 6.497167481584221e-05, + "loss": 1.7919, + "step": 13721 + }, + { + "epoch": 4.211786372007366, + "grad_norm": 0.29262226819992065, + "learning_rate": 6.49669322453454e-05, + "loss": 1.8379, + "step": 13722 + }, + { + "epoch": 4.212093308778392, + "grad_norm": 0.29136186838150024, + "learning_rate": 6.49621895269366e-05, + "loss": 1.789, + "step": 13723 + }, + { + "epoch": 4.212400245549417, + "grad_norm": 0.25110194087028503, + "learning_rate": 6.495744666066266e-05, + "loss": 1.7574, + "step": 13724 + }, + { + "epoch": 4.212707182320442, + "grad_norm": 0.2301366776227951, + "learning_rate": 6.495270364657048e-05, + "loss": 1.7637, + "step": 13725 + }, + { + "epoch": 4.213014119091467, + "grad_norm": 0.2556478977203369, + "learning_rate": 6.49479604847069e-05, + "loss": 1.7975, + 
"step": 13726 + }, + { + "epoch": 4.213321055862492, + "grad_norm": 0.2645667493343353, + "learning_rate": 6.494321717511884e-05, + "loss": 1.7594, + "step": 13727 + }, + { + "epoch": 4.213627992633517, + "grad_norm": 0.23664188385009766, + "learning_rate": 6.493847371785312e-05, + "loss": 1.7963, + "step": 13728 + }, + { + "epoch": 4.213934929404543, + "grad_norm": 0.2947930693626404, + "learning_rate": 6.493373011295665e-05, + "loss": 1.7477, + "step": 13729 + }, + { + "epoch": 4.214241866175568, + "grad_norm": 0.34598737955093384, + "learning_rate": 6.492898636047631e-05, + "loss": 1.7014, + "step": 13730 + }, + { + "epoch": 4.214548802946593, + "grad_norm": 0.24406935274600983, + "learning_rate": 6.4924242460459e-05, + "loss": 1.7436, + "step": 13731 + }, + { + "epoch": 4.214855739717618, + "grad_norm": 0.27176225185394287, + "learning_rate": 6.491949841295156e-05, + "loss": 1.8429, + "step": 13732 + }, + { + "epoch": 4.215162676488643, + "grad_norm": 0.2506968080997467, + "learning_rate": 6.491475421800089e-05, + "loss": 1.7519, + "step": 13733 + }, + { + "epoch": 4.2154696132596685, + "grad_norm": 0.2240980863571167, + "learning_rate": 6.491000987565387e-05, + "loss": 1.7595, + "step": 13734 + }, + { + "epoch": 4.215776550030694, + "grad_norm": 0.23201732337474823, + "learning_rate": 6.490526538595741e-05, + "loss": 1.7466, + "step": 13735 + }, + { + "epoch": 4.216083486801719, + "grad_norm": 0.24624750018119812, + "learning_rate": 6.490052074895836e-05, + "loss": 1.7364, + "step": 13736 + }, + { + "epoch": 4.216390423572744, + "grad_norm": 0.22936980426311493, + "learning_rate": 6.489577596470366e-05, + "loss": 1.7095, + "step": 13737 + }, + { + "epoch": 4.216697360343769, + "grad_norm": 0.2106638103723526, + "learning_rate": 6.489103103324016e-05, + "loss": 1.7387, + "step": 13738 + }, + { + "epoch": 4.217004297114794, + "grad_norm": 0.2936140298843384, + "learning_rate": 6.488628595461477e-05, + "loss": 1.9129, + "step": 13739 + }, + { + "epoch": 
4.21731123388582, + "grad_norm": 0.21871696412563324, + "learning_rate": 6.488154072887435e-05, + "loss": 1.7489, + "step": 13740 + }, + { + "epoch": 4.217618170656845, + "grad_norm": 0.25941070914268494, + "learning_rate": 6.487679535606583e-05, + "loss": 1.7788, + "step": 13741 + }, + { + "epoch": 4.21792510742787, + "grad_norm": 0.2540862560272217, + "learning_rate": 6.487204983623612e-05, + "loss": 1.8074, + "step": 13742 + }, + { + "epoch": 4.218232044198895, + "grad_norm": 0.25180327892303467, + "learning_rate": 6.486730416943207e-05, + "loss": 1.7503, + "step": 13743 + }, + { + "epoch": 4.21853898096992, + "grad_norm": 0.26625585556030273, + "learning_rate": 6.486255835570063e-05, + "loss": 1.8149, + "step": 13744 + }, + { + "epoch": 4.218845917740945, + "grad_norm": 0.3023914396762848, + "learning_rate": 6.485781239508867e-05, + "loss": 1.8599, + "step": 13745 + }, + { + "epoch": 4.219152854511971, + "grad_norm": 0.2683780789375305, + "learning_rate": 6.48530662876431e-05, + "loss": 1.7911, + "step": 13746 + }, + { + "epoch": 4.219459791282996, + "grad_norm": 0.20747442543506622, + "learning_rate": 6.484832003341081e-05, + "loss": 1.7343, + "step": 13747 + }, + { + "epoch": 4.2197667280540205, + "grad_norm": 0.29284465312957764, + "learning_rate": 6.484357363243873e-05, + "loss": 1.7917, + "step": 13748 + }, + { + "epoch": 4.220073664825046, + "grad_norm": 0.24303840100765228, + "learning_rate": 6.483882708477376e-05, + "loss": 1.7921, + "step": 13749 + }, + { + "epoch": 4.220380601596071, + "grad_norm": 0.26253026723861694, + "learning_rate": 6.48340803904628e-05, + "loss": 1.7971, + "step": 13750 + }, + { + "epoch": 4.2206875383670965, + "grad_norm": 0.23888511955738068, + "learning_rate": 6.482933354955275e-05, + "loss": 1.7967, + "step": 13751 + }, + { + "epoch": 4.220994475138122, + "grad_norm": 0.24966883659362793, + "learning_rate": 6.482458656209054e-05, + "loss": 1.7924, + "step": 13752 + }, + { + "epoch": 4.221301411909146, + "grad_norm": 
0.26556864380836487, + "learning_rate": 6.481983942812309e-05, + "loss": 1.8608, + "step": 13753 + }, + { + "epoch": 4.221608348680172, + "grad_norm": 0.29064711928367615, + "learning_rate": 6.48150921476973e-05, + "loss": 1.7785, + "step": 13754 + }, + { + "epoch": 4.221915285451197, + "grad_norm": 0.30876123905181885, + "learning_rate": 6.481034472086008e-05, + "loss": 1.8287, + "step": 13755 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.2622467875480652, + "learning_rate": 6.480559714765835e-05, + "loss": 1.8336, + "step": 13756 + }, + { + "epoch": 4.222529158993248, + "grad_norm": 0.2502644956111908, + "learning_rate": 6.480084942813902e-05, + "loss": 1.7803, + "step": 13757 + }, + { + "epoch": 4.222836095764273, + "grad_norm": 0.2879922688007355, + "learning_rate": 6.479610156234903e-05, + "loss": 1.7544, + "step": 13758 + }, + { + "epoch": 4.223143032535297, + "grad_norm": 0.2831384241580963, + "learning_rate": 6.47913535503353e-05, + "loss": 1.887, + "step": 13759 + }, + { + "epoch": 4.223449969306323, + "grad_norm": 0.3221064805984497, + "learning_rate": 6.478660539214474e-05, + "loss": 1.7455, + "step": 13760 + }, + { + "epoch": 4.223756906077348, + "grad_norm": 0.4231930673122406, + "learning_rate": 6.478185708782427e-05, + "loss": 1.8209, + "step": 13761 + }, + { + "epoch": 4.224063842848373, + "grad_norm": 0.34327802062034607, + "learning_rate": 6.477710863742083e-05, + "loss": 1.7754, + "step": 13762 + }, + { + "epoch": 4.224370779619399, + "grad_norm": 0.21713349223136902, + "learning_rate": 6.477236004098135e-05, + "loss": 1.7576, + "step": 13763 + }, + { + "epoch": 4.224677716390423, + "grad_norm": 0.3262602388858795, + "learning_rate": 6.476761129855275e-05, + "loss": 1.7772, + "step": 13764 + }, + { + "epoch": 4.2249846531614486, + "grad_norm": 0.3231413662433624, + "learning_rate": 6.476286241018195e-05, + "loss": 1.7821, + "step": 13765 + }, + { + "epoch": 4.225291589932474, + "grad_norm": 0.2440098226070404, + "learning_rate": 
6.475811337591588e-05, + "loss": 1.7684, + "step": 13766 + }, + { + "epoch": 4.225598526703499, + "grad_norm": 0.329949289560318, + "learning_rate": 6.475336419580151e-05, + "loss": 1.8564, + "step": 13767 + }, + { + "epoch": 4.225905463474525, + "grad_norm": 0.3567483425140381, + "learning_rate": 6.474861486988574e-05, + "loss": 1.7625, + "step": 13768 + }, + { + "epoch": 4.226212400245549, + "grad_norm": 0.25257283449172974, + "learning_rate": 6.47438653982155e-05, + "loss": 1.823, + "step": 13769 + }, + { + "epoch": 4.226519337016574, + "grad_norm": 0.31542617082595825, + "learning_rate": 6.473911578083776e-05, + "loss": 1.7817, + "step": 13770 + }, + { + "epoch": 4.2268262737876, + "grad_norm": 0.29670149087905884, + "learning_rate": 6.473436601779944e-05, + "loss": 1.7493, + "step": 13771 + }, + { + "epoch": 4.227133210558625, + "grad_norm": 0.2635453939437866, + "learning_rate": 6.472961610914745e-05, + "loss": 1.792, + "step": 13772 + }, + { + "epoch": 4.22744014732965, + "grad_norm": 0.25017979741096497, + "learning_rate": 6.472486605492878e-05, + "loss": 1.7183, + "step": 13773 + }, + { + "epoch": 4.227747084100676, + "grad_norm": 0.3766646087169647, + "learning_rate": 6.472011585519034e-05, + "loss": 1.8039, + "step": 13774 + }, + { + "epoch": 4.2280540208717, + "grad_norm": 0.29860204458236694, + "learning_rate": 6.47153655099791e-05, + "loss": 1.8016, + "step": 13775 + }, + { + "epoch": 4.2283609576427255, + "grad_norm": 0.2540898323059082, + "learning_rate": 6.4710615019342e-05, + "loss": 1.8481, + "step": 13776 + }, + { + "epoch": 4.228667894413751, + "grad_norm": 0.3677786886692047, + "learning_rate": 6.470586438332597e-05, + "loss": 1.7663, + "step": 13777 + }, + { + "epoch": 4.228974831184776, + "grad_norm": 0.35693466663360596, + "learning_rate": 6.470111360197797e-05, + "loss": 1.7733, + "step": 13778 + }, + { + "epoch": 4.2292817679558015, + "grad_norm": 0.23747926950454712, + "learning_rate": 6.469636267534496e-05, + "loss": 1.7938, + "step": 
13779 + }, + { + "epoch": 4.229588704726826, + "grad_norm": 0.32890695333480835, + "learning_rate": 6.469161160347386e-05, + "loss": 1.7233, + "step": 13780 + }, + { + "epoch": 4.229895641497851, + "grad_norm": 0.3437706530094147, + "learning_rate": 6.468686038641164e-05, + "loss": 1.7716, + "step": 13781 + }, + { + "epoch": 4.230202578268877, + "grad_norm": 0.23452162742614746, + "learning_rate": 6.468210902420527e-05, + "loss": 1.764, + "step": 13782 + }, + { + "epoch": 4.230509515039902, + "grad_norm": 0.3205265402793884, + "learning_rate": 6.46773575169017e-05, + "loss": 1.7464, + "step": 13783 + }, + { + "epoch": 4.230816451810927, + "grad_norm": 0.4234732985496521, + "learning_rate": 6.467260586454787e-05, + "loss": 1.7786, + "step": 13784 + }, + { + "epoch": 4.231123388581952, + "grad_norm": 0.2484128773212433, + "learning_rate": 6.466785406719076e-05, + "loss": 1.8125, + "step": 13785 + }, + { + "epoch": 4.231430325352977, + "grad_norm": 0.3696556091308594, + "learning_rate": 6.46631021248773e-05, + "loss": 1.7974, + "step": 13786 + }, + { + "epoch": 4.231737262124002, + "grad_norm": 0.4251437485218048, + "learning_rate": 6.465835003765449e-05, + "loss": 1.7486, + "step": 13787 + }, + { + "epoch": 4.232044198895028, + "grad_norm": 0.2507621943950653, + "learning_rate": 6.465359780556927e-05, + "loss": 1.829, + "step": 13788 + }, + { + "epoch": 4.232351135666053, + "grad_norm": 0.2911818325519562, + "learning_rate": 6.464884542866861e-05, + "loss": 1.7401, + "step": 13789 + }, + { + "epoch": 4.232658072437078, + "grad_norm": 0.35354506969451904, + "learning_rate": 6.464409290699946e-05, + "loss": 1.7848, + "step": 13790 + }, + { + "epoch": 4.232965009208103, + "grad_norm": 0.2659081518650055, + "learning_rate": 6.46393402406088e-05, + "loss": 1.7408, + "step": 13791 + }, + { + "epoch": 4.233271945979128, + "grad_norm": 0.22676481306552887, + "learning_rate": 6.46345874295436e-05, + "loss": 1.7542, + "step": 13792 + }, + { + "epoch": 4.2335788827501535, + 
"grad_norm": 0.2549789845943451, + "learning_rate": 6.462983447385085e-05, + "loss": 1.8095, + "step": 13793 + }, + { + "epoch": 4.233885819521179, + "grad_norm": 0.2157238870859146, + "learning_rate": 6.462508137357748e-05, + "loss": 1.7529, + "step": 13794 + }, + { + "epoch": 4.234192756292204, + "grad_norm": 0.2494724988937378, + "learning_rate": 6.46203281287705e-05, + "loss": 1.7839, + "step": 13795 + }, + { + "epoch": 4.234499693063229, + "grad_norm": 0.29560065269470215, + "learning_rate": 6.461557473947685e-05, + "loss": 1.7239, + "step": 13796 + }, + { + "epoch": 4.234806629834254, + "grad_norm": 0.23693916201591492, + "learning_rate": 6.461082120574354e-05, + "loss": 1.8074, + "step": 13797 + }, + { + "epoch": 4.235113566605279, + "grad_norm": 0.2538869082927704, + "learning_rate": 6.460606752761752e-05, + "loss": 1.8319, + "step": 13798 + }, + { + "epoch": 4.235420503376305, + "grad_norm": 0.3186401426792145, + "learning_rate": 6.460131370514578e-05, + "loss": 1.7877, + "step": 13799 + }, + { + "epoch": 4.23572744014733, + "grad_norm": 0.2473619133234024, + "learning_rate": 6.45965597383753e-05, + "loss": 1.8323, + "step": 13800 + }, + { + "epoch": 4.236034376918354, + "grad_norm": 0.32806503772735596, + "learning_rate": 6.459180562735307e-05, + "loss": 1.744, + "step": 13801 + }, + { + "epoch": 4.23634131368938, + "grad_norm": 0.3975784480571747, + "learning_rate": 6.458705137212606e-05, + "loss": 1.7216, + "step": 13802 + }, + { + "epoch": 4.236648250460405, + "grad_norm": 0.2946135997772217, + "learning_rate": 6.458229697274125e-05, + "loss": 1.8781, + "step": 13803 + }, + { + "epoch": 4.23695518723143, + "grad_norm": 0.25109192728996277, + "learning_rate": 6.457754242924565e-05, + "loss": 1.7458, + "step": 13804 + }, + { + "epoch": 4.237262124002456, + "grad_norm": 0.2763883173465729, + "learning_rate": 6.457278774168623e-05, + "loss": 1.7612, + "step": 13805 + }, + { + "epoch": 4.237569060773481, + "grad_norm": 0.22427856922149658, + 
"learning_rate": 6.456803291010996e-05, + "loss": 1.8049, + "step": 13806 + }, + { + "epoch": 4.2378759975445055, + "grad_norm": 0.28295788168907166, + "learning_rate": 6.456327793456387e-05, + "loss": 1.7608, + "step": 13807 + }, + { + "epoch": 4.238182934315531, + "grad_norm": 0.27857527136802673, + "learning_rate": 6.455852281509493e-05, + "loss": 1.7281, + "step": 13808 + }, + { + "epoch": 4.238489871086556, + "grad_norm": 0.24014849960803986, + "learning_rate": 6.455376755175012e-05, + "loss": 1.7247, + "step": 13809 + }, + { + "epoch": 4.2387968078575815, + "grad_norm": 0.25149038434028625, + "learning_rate": 6.454901214457646e-05, + "loss": 1.8575, + "step": 13810 + }, + { + "epoch": 4.239103744628607, + "grad_norm": 0.32072681188583374, + "learning_rate": 6.454425659362093e-05, + "loss": 1.7421, + "step": 13811 + }, + { + "epoch": 4.239410681399631, + "grad_norm": 0.28418242931365967, + "learning_rate": 6.453950089893054e-05, + "loss": 1.7031, + "step": 13812 + }, + { + "epoch": 4.239717618170657, + "grad_norm": 0.23725132644176483, + "learning_rate": 6.453474506055228e-05, + "loss": 1.7901, + "step": 13813 + }, + { + "epoch": 4.240024554941682, + "grad_norm": 0.3056317865848541, + "learning_rate": 6.452998907853315e-05, + "loss": 1.7414, + "step": 13814 + }, + { + "epoch": 4.240331491712707, + "grad_norm": 0.3111891448497772, + "learning_rate": 6.452523295292013e-05, + "loss": 1.7532, + "step": 13815 + }, + { + "epoch": 4.240638428483733, + "grad_norm": 0.2126779705286026, + "learning_rate": 6.452047668376027e-05, + "loss": 1.6779, + "step": 13816 + }, + { + "epoch": 4.240945365254758, + "grad_norm": 0.26660779118537903, + "learning_rate": 6.451572027110054e-05, + "loss": 1.7162, + "step": 13817 + }, + { + "epoch": 4.241252302025782, + "grad_norm": 0.25901922583580017, + "learning_rate": 6.451096371498794e-05, + "loss": 1.7784, + "step": 13818 + }, + { + "epoch": 4.241559238796808, + "grad_norm": 0.24091807007789612, + "learning_rate": 
6.450620701546953e-05, + "loss": 1.7928, + "step": 13819 + }, + { + "epoch": 4.241866175567833, + "grad_norm": 0.25097009539604187, + "learning_rate": 6.450145017259225e-05, + "loss": 1.761, + "step": 13820 + }, + { + "epoch": 4.242173112338858, + "grad_norm": 0.22978942096233368, + "learning_rate": 6.449669318640315e-05, + "loss": 1.7891, + "step": 13821 + }, + { + "epoch": 4.242480049109884, + "grad_norm": 0.27255937457084656, + "learning_rate": 6.449193605694923e-05, + "loss": 1.7964, + "step": 13822 + }, + { + "epoch": 4.242786985880908, + "grad_norm": 0.2210773378610611, + "learning_rate": 6.44871787842775e-05, + "loss": 1.7628, + "step": 13823 + }, + { + "epoch": 4.2430939226519335, + "grad_norm": 0.25784751772880554, + "learning_rate": 6.448242136843497e-05, + "loss": 1.7596, + "step": 13824 + }, + { + "epoch": 4.243400859422959, + "grad_norm": 0.23475486040115356, + "learning_rate": 6.447766380946868e-05, + "loss": 1.8174, + "step": 13825 + }, + { + "epoch": 4.243707796193984, + "grad_norm": 0.2567705512046814, + "learning_rate": 6.447290610742561e-05, + "loss": 1.737, + "step": 13826 + }, + { + "epoch": 4.2440147329650095, + "grad_norm": 0.23973144590854645, + "learning_rate": 6.446814826235281e-05, + "loss": 1.7881, + "step": 13827 + }, + { + "epoch": 4.244321669736034, + "grad_norm": 0.25584739446640015, + "learning_rate": 6.446339027429729e-05, + "loss": 1.7673, + "step": 13828 + }, + { + "epoch": 4.244628606507059, + "grad_norm": 0.2653748393058777, + "learning_rate": 6.445863214330608e-05, + "loss": 1.7443, + "step": 13829 + }, + { + "epoch": 4.244935543278085, + "grad_norm": 0.2492038607597351, + "learning_rate": 6.445387386942619e-05, + "loss": 1.7223, + "step": 13830 + }, + { + "epoch": 4.24524248004911, + "grad_norm": 0.2282228320837021, + "learning_rate": 6.444911545270464e-05, + "loss": 1.7577, + "step": 13831 + }, + { + "epoch": 4.245549416820135, + "grad_norm": 0.2411092072725296, + "learning_rate": 6.444435689318845e-05, + "loss": 1.7324, + 
"step": 13832 + }, + { + "epoch": 4.245856353591161, + "grad_norm": 0.21557089686393738, + "learning_rate": 6.443959819092468e-05, + "loss": 1.7355, + "step": 13833 + }, + { + "epoch": 4.246163290362185, + "grad_norm": 0.2500394880771637, + "learning_rate": 6.443483934596033e-05, + "loss": 1.775, + "step": 13834 + }, + { + "epoch": 4.24647022713321, + "grad_norm": 0.24135248363018036, + "learning_rate": 6.443008035834244e-05, + "loss": 1.7885, + "step": 13835 + }, + { + "epoch": 4.246777163904236, + "grad_norm": 0.22860904037952423, + "learning_rate": 6.442532122811803e-05, + "loss": 1.7891, + "step": 13836 + }, + { + "epoch": 4.247084100675261, + "grad_norm": 0.2277665138244629, + "learning_rate": 6.442056195533415e-05, + "loss": 1.7583, + "step": 13837 + }, + { + "epoch": 4.247391037446286, + "grad_norm": 0.22822454571723938, + "learning_rate": 6.441580254003782e-05, + "loss": 1.7777, + "step": 13838 + }, + { + "epoch": 4.247697974217311, + "grad_norm": 0.24274896085262299, + "learning_rate": 6.441104298227608e-05, + "loss": 1.7537, + "step": 13839 + }, + { + "epoch": 4.248004910988336, + "grad_norm": 0.25080999732017517, + "learning_rate": 6.440628328209598e-05, + "loss": 1.7537, + "step": 13840 + }, + { + "epoch": 4.2483118477593615, + "grad_norm": 0.22409579157829285, + "learning_rate": 6.440152343954453e-05, + "loss": 1.7652, + "step": 13841 + }, + { + "epoch": 4.248618784530387, + "grad_norm": 0.24028798937797546, + "learning_rate": 6.439676345466877e-05, + "loss": 1.7512, + "step": 13842 + }, + { + "epoch": 4.248925721301412, + "grad_norm": 0.28739503026008606, + "learning_rate": 6.439200332751576e-05, + "loss": 1.8034, + "step": 13843 + }, + { + "epoch": 4.249232658072437, + "grad_norm": 0.2244807928800583, + "learning_rate": 6.438724305813255e-05, + "loss": 1.7243, + "step": 13844 + }, + { + "epoch": 4.249539594843462, + "grad_norm": 0.24478118121623993, + "learning_rate": 6.438248264656618e-05, + "loss": 1.7754, + "step": 13845 + }, + { + "epoch": 
4.249846531614487, + "grad_norm": 0.25554370880126953, + "learning_rate": 6.437772209286368e-05, + "loss": 1.7845, + "step": 13846 + }, + { + "epoch": 4.250153468385513, + "grad_norm": 0.24478472769260406, + "learning_rate": 6.43729613970721e-05, + "loss": 1.7954, + "step": 13847 + }, + { + "epoch": 4.250460405156538, + "grad_norm": 0.22287282347679138, + "learning_rate": 6.436820055923849e-05, + "loss": 1.7379, + "step": 13848 + }, + { + "epoch": 4.250767341927563, + "grad_norm": 0.2810569703578949, + "learning_rate": 6.43634395794099e-05, + "loss": 1.8492, + "step": 13849 + }, + { + "epoch": 4.251074278698588, + "grad_norm": 0.2544163465499878, + "learning_rate": 6.435867845763337e-05, + "loss": 1.7846, + "step": 13850 + }, + { + "epoch": 4.251381215469613, + "grad_norm": 0.27879175543785095, + "learning_rate": 6.435391719395598e-05, + "loss": 1.767, + "step": 13851 + }, + { + "epoch": 4.2516881522406385, + "grad_norm": 0.2876715362071991, + "learning_rate": 6.434915578842477e-05, + "loss": 1.8048, + "step": 13852 + }, + { + "epoch": 4.251995089011664, + "grad_norm": 0.27844297885894775, + "learning_rate": 6.434439424108678e-05, + "loss": 1.7472, + "step": 13853 + }, + { + "epoch": 4.252302025782689, + "grad_norm": 0.2417020946741104, + "learning_rate": 6.43396325519891e-05, + "loss": 1.8481, + "step": 13854 + }, + { + "epoch": 4.252608962553714, + "grad_norm": 0.23828522861003876, + "learning_rate": 6.433487072117874e-05, + "loss": 1.7536, + "step": 13855 + }, + { + "epoch": 4.252915899324739, + "grad_norm": 0.22304333746433258, + "learning_rate": 6.43301087487028e-05, + "loss": 1.741, + "step": 13856 + }, + { + "epoch": 4.253222836095764, + "grad_norm": 0.27089163661003113, + "learning_rate": 6.432534663460832e-05, + "loss": 1.7974, + "step": 13857 + }, + { + "epoch": 4.25352977286679, + "grad_norm": 0.2439592182636261, + "learning_rate": 6.432058437894237e-05, + "loss": 1.7713, + "step": 13858 + }, + { + "epoch": 4.253836709637815, + "grad_norm": 
0.2368553727865219, + "learning_rate": 6.431582198175203e-05, + "loss": 1.6915, + "step": 13859 + }, + { + "epoch": 4.25414364640884, + "grad_norm": 0.25248441100120544, + "learning_rate": 6.431105944308431e-05, + "loss": 1.7286, + "step": 13860 + }, + { + "epoch": 4.254450583179865, + "grad_norm": 0.20928484201431274, + "learning_rate": 6.430629676298634e-05, + "loss": 1.79, + "step": 13861 + }, + { + "epoch": 4.25475751995089, + "grad_norm": 0.25262540578842163, + "learning_rate": 6.430153394150514e-05, + "loss": 1.7443, + "step": 13862 + }, + { + "epoch": 4.255064456721915, + "grad_norm": 0.27508237957954407, + "learning_rate": 6.429677097868783e-05, + "loss": 1.8207, + "step": 13863 + }, + { + "epoch": 4.255371393492941, + "grad_norm": 0.28129303455352783, + "learning_rate": 6.429200787458141e-05, + "loss": 1.7589, + "step": 13864 + }, + { + "epoch": 4.255678330263966, + "grad_norm": 0.3205658495426178, + "learning_rate": 6.428724462923302e-05, + "loss": 1.8037, + "step": 13865 + }, + { + "epoch": 4.2559852670349905, + "grad_norm": 0.24048078060150146, + "learning_rate": 6.428248124268969e-05, + "loss": 1.7303, + "step": 13866 + }, + { + "epoch": 4.256292203806016, + "grad_norm": 0.24742475152015686, + "learning_rate": 6.427771771499852e-05, + "loss": 1.7753, + "step": 13867 + }, + { + "epoch": 4.256599140577041, + "grad_norm": 0.3082354962825775, + "learning_rate": 6.427295404620656e-05, + "loss": 1.7275, + "step": 13868 + }, + { + "epoch": 4.2569060773480665, + "grad_norm": 0.23319822549819946, + "learning_rate": 6.426819023636093e-05, + "loss": 1.7562, + "step": 13869 + }, + { + "epoch": 4.257213014119092, + "grad_norm": 0.2611405551433563, + "learning_rate": 6.426342628550866e-05, + "loss": 1.7417, + "step": 13870 + }, + { + "epoch": 4.257519950890116, + "grad_norm": 0.2577543258666992, + "learning_rate": 6.425866219369686e-05, + "loss": 1.6906, + "step": 13871 + }, + { + "epoch": 4.257826887661142, + "grad_norm": 0.31353357434272766, + "learning_rate": 
6.42538979609726e-05, + "loss": 1.7155, + "step": 13872 + }, + { + "epoch": 4.258133824432167, + "grad_norm": 0.23280073702335358, + "learning_rate": 6.424913358738296e-05, + "loss": 1.7576, + "step": 13873 + }, + { + "epoch": 4.258440761203192, + "grad_norm": 0.24087542295455933, + "learning_rate": 6.424436907297504e-05, + "loss": 1.7622, + "step": 13874 + }, + { + "epoch": 4.258747697974218, + "grad_norm": 0.3146509826183319, + "learning_rate": 6.42396044177959e-05, + "loss": 1.769, + "step": 13875 + }, + { + "epoch": 4.259054634745242, + "grad_norm": 0.2645811438560486, + "learning_rate": 6.423483962189268e-05, + "loss": 1.7713, + "step": 13876 + }, + { + "epoch": 4.259361571516267, + "grad_norm": 0.2166455090045929, + "learning_rate": 6.423007468531238e-05, + "loss": 1.7705, + "step": 13877 + }, + { + "epoch": 4.259668508287293, + "grad_norm": 0.29142528772354126, + "learning_rate": 6.422530960810217e-05, + "loss": 1.7725, + "step": 13878 + }, + { + "epoch": 4.259975445058318, + "grad_norm": 0.28777652978897095, + "learning_rate": 6.422054439030911e-05, + "loss": 1.7853, + "step": 13879 + }, + { + "epoch": 4.260282381829343, + "grad_norm": 0.2285117357969284, + "learning_rate": 6.42157790319803e-05, + "loss": 1.7034, + "step": 13880 + }, + { + "epoch": 4.260589318600369, + "grad_norm": 0.32407644391059875, + "learning_rate": 6.421101353316282e-05, + "loss": 1.7858, + "step": 13881 + }, + { + "epoch": 4.260896255371393, + "grad_norm": 0.4803469777107239, + "learning_rate": 6.420624789390378e-05, + "loss": 1.7337, + "step": 13882 + }, + { + "epoch": 4.2612031921424185, + "grad_norm": 0.4245823919773102, + "learning_rate": 6.420148211425027e-05, + "loss": 1.8024, + "step": 13883 + }, + { + "epoch": 4.261510128913444, + "grad_norm": 0.22298674285411835, + "learning_rate": 6.419671619424938e-05, + "loss": 1.7129, + "step": 13884 + }, + { + "epoch": 4.261817065684469, + "grad_norm": 0.46955862641334534, + "learning_rate": 6.419195013394824e-05, + "loss": 1.7151, + 
"step": 13885 + }, + { + "epoch": 4.2621240024554945, + "grad_norm": 0.4809224009513855, + "learning_rate": 6.418718393339392e-05, + "loss": 1.7697, + "step": 13886 + }, + { + "epoch": 4.262430939226519, + "grad_norm": 0.2741130292415619, + "learning_rate": 6.418241759263353e-05, + "loss": 1.8133, + "step": 13887 + }, + { + "epoch": 4.262737875997544, + "grad_norm": 0.3673117756843567, + "learning_rate": 6.417765111171419e-05, + "loss": 1.7424, + "step": 13888 + }, + { + "epoch": 4.26304481276857, + "grad_norm": 0.4609327018260956, + "learning_rate": 6.417288449068299e-05, + "loss": 1.741, + "step": 13889 + }, + { + "epoch": 4.263351749539595, + "grad_norm": 0.2929460406303406, + "learning_rate": 6.416811772958702e-05, + "loss": 1.8385, + "step": 13890 + }, + { + "epoch": 4.26365868631062, + "grad_norm": 0.2727305293083191, + "learning_rate": 6.416335082847342e-05, + "loss": 1.794, + "step": 13891 + }, + { + "epoch": 4.263965623081646, + "grad_norm": 0.26089411973953247, + "learning_rate": 6.41585837873893e-05, + "loss": 1.7907, + "step": 13892 + }, + { + "epoch": 4.26427255985267, + "grad_norm": 0.24655573070049286, + "learning_rate": 6.415381660638174e-05, + "loss": 1.7481, + "step": 13893 + }, + { + "epoch": 4.264579496623695, + "grad_norm": 0.4186919629573822, + "learning_rate": 6.414904928549787e-05, + "loss": 1.8021, + "step": 13894 + }, + { + "epoch": 4.264886433394721, + "grad_norm": 0.38188236951828003, + "learning_rate": 6.414428182478478e-05, + "loss": 1.75, + "step": 13895 + }, + { + "epoch": 4.265193370165746, + "grad_norm": 0.23686440289020538, + "learning_rate": 6.413951422428963e-05, + "loss": 1.7882, + "step": 13896 + }, + { + "epoch": 4.265500306936771, + "grad_norm": 0.35963737964630127, + "learning_rate": 6.413474648405952e-05, + "loss": 1.7427, + "step": 13897 + }, + { + "epoch": 4.265807243707796, + "grad_norm": 0.38558289408683777, + "learning_rate": 6.412997860414155e-05, + "loss": 1.7622, + "step": 13898 + }, + { + "epoch": 
4.266114180478821, + "grad_norm": 0.2311459481716156, + "learning_rate": 6.412521058458285e-05, + "loss": 1.7894, + "step": 13899 + }, + { + "epoch": 4.2664211172498465, + "grad_norm": 0.2647818624973297, + "learning_rate": 6.412044242543054e-05, + "loss": 1.7399, + "step": 13900 + }, + { + "epoch": 4.266728054020872, + "grad_norm": 0.3174133002758026, + "learning_rate": 6.411567412673174e-05, + "loss": 1.7552, + "step": 13901 + }, + { + "epoch": 4.267034990791897, + "grad_norm": 0.25207316875457764, + "learning_rate": 6.411090568853358e-05, + "loss": 1.7876, + "step": 13902 + }, + { + "epoch": 4.267341927562922, + "grad_norm": 0.24549202620983124, + "learning_rate": 6.410613711088317e-05, + "loss": 1.8554, + "step": 13903 + }, + { + "epoch": 4.267648864333947, + "grad_norm": 0.26293641328811646, + "learning_rate": 6.410136839382765e-05, + "loss": 1.8553, + "step": 13904 + }, + { + "epoch": 4.267955801104972, + "grad_norm": 0.20258362591266632, + "learning_rate": 6.409659953741416e-05, + "loss": 1.7205, + "step": 13905 + }, + { + "epoch": 4.268262737875998, + "grad_norm": 0.24885907769203186, + "learning_rate": 6.409183054168979e-05, + "loss": 1.7718, + "step": 13906 + }, + { + "epoch": 4.268569674647023, + "grad_norm": 0.22737209498882294, + "learning_rate": 6.408706140670169e-05, + "loss": 1.7228, + "step": 13907 + }, + { + "epoch": 4.268876611418047, + "grad_norm": 0.2201235145330429, + "learning_rate": 6.4082292132497e-05, + "loss": 1.7451, + "step": 13908 + }, + { + "epoch": 4.269183548189073, + "grad_norm": 0.24108454585075378, + "learning_rate": 6.407752271912285e-05, + "loss": 1.7531, + "step": 13909 + }, + { + "epoch": 4.269490484960098, + "grad_norm": 0.21723641455173492, + "learning_rate": 6.407275316662636e-05, + "loss": 1.7139, + "step": 13910 + }, + { + "epoch": 4.269797421731123, + "grad_norm": 0.22557848691940308, + "learning_rate": 6.406798347505469e-05, + "loss": 1.7633, + "step": 13911 + }, + { + "epoch": 4.270104358502149, + "grad_norm": 
0.24664700031280518, + "learning_rate": 6.406321364445494e-05, + "loss": 1.7854, + "step": 13912 + }, + { + "epoch": 4.270411295273174, + "grad_norm": 0.2599056661128998, + "learning_rate": 6.405844367487428e-05, + "loss": 1.7662, + "step": 13913 + }, + { + "epoch": 4.2707182320441985, + "grad_norm": 0.2378663718700409, + "learning_rate": 6.405367356635982e-05, + "loss": 1.7477, + "step": 13914 + }, + { + "epoch": 4.271025168815224, + "grad_norm": 0.27158626914024353, + "learning_rate": 6.404890331895876e-05, + "loss": 1.7426, + "step": 13915 + }, + { + "epoch": 4.271332105586249, + "grad_norm": 0.28585317730903625, + "learning_rate": 6.404413293271818e-05, + "loss": 1.7492, + "step": 13916 + }, + { + "epoch": 4.2716390423572745, + "grad_norm": 0.2321750968694687, + "learning_rate": 6.403936240768526e-05, + "loss": 1.8594, + "step": 13917 + }, + { + "epoch": 4.2719459791283, + "grad_norm": 0.25824111700057983, + "learning_rate": 6.40345917439071e-05, + "loss": 1.7622, + "step": 13918 + }, + { + "epoch": 4.272252915899324, + "grad_norm": 0.24641194939613342, + "learning_rate": 6.40298209414309e-05, + "loss": 1.7519, + "step": 13919 + }, + { + "epoch": 4.27255985267035, + "grad_norm": 0.2132398933172226, + "learning_rate": 6.40250500003038e-05, + "loss": 1.7339, + "step": 13920 + }, + { + "epoch": 4.272866789441375, + "grad_norm": 0.22630736231803894, + "learning_rate": 6.402027892057292e-05, + "loss": 1.7396, + "step": 13921 + }, + { + "epoch": 4.2731737262124, + "grad_norm": 0.295163631439209, + "learning_rate": 6.401550770228543e-05, + "loss": 1.8063, + "step": 13922 + }, + { + "epoch": 4.273480662983426, + "grad_norm": 0.2722746729850769, + "learning_rate": 6.401073634548848e-05, + "loss": 1.7775, + "step": 13923 + }, + { + "epoch": 4.273787599754451, + "grad_norm": 0.23201976716518402, + "learning_rate": 6.400596485022922e-05, + "loss": 1.7755, + "step": 13924 + }, + { + "epoch": 4.274094536525475, + "grad_norm": 0.23880761861801147, + "learning_rate": 
6.40011932165548e-05, + "loss": 1.778, + "step": 13925 + }, + { + "epoch": 4.274401473296501, + "grad_norm": 0.22305625677108765, + "learning_rate": 6.399642144451239e-05, + "loss": 1.761, + "step": 13926 + }, + { + "epoch": 4.274708410067526, + "grad_norm": 0.21874886751174927, + "learning_rate": 6.399164953414914e-05, + "loss": 1.7148, + "step": 13927 + }, + { + "epoch": 4.2750153468385514, + "grad_norm": 0.2003604918718338, + "learning_rate": 6.398687748551221e-05, + "loss": 1.8049, + "step": 13928 + }, + { + "epoch": 4.275322283609577, + "grad_norm": 0.2443511188030243, + "learning_rate": 6.398210529864875e-05, + "loss": 1.782, + "step": 13929 + }, + { + "epoch": 4.275629220380601, + "grad_norm": 0.2297198623418808, + "learning_rate": 6.397733297360594e-05, + "loss": 1.7682, + "step": 13930 + }, + { + "epoch": 4.275936157151627, + "grad_norm": 0.23474562168121338, + "learning_rate": 6.39725605104309e-05, + "loss": 1.7809, + "step": 13931 + }, + { + "epoch": 4.276243093922652, + "grad_norm": 0.25908544659614563, + "learning_rate": 6.396778790917087e-05, + "loss": 1.7343, + "step": 13932 + }, + { + "epoch": 4.276550030693677, + "grad_norm": 0.2440379112958908, + "learning_rate": 6.396301516987295e-05, + "loss": 1.786, + "step": 13933 + }, + { + "epoch": 4.276856967464703, + "grad_norm": 0.26185858249664307, + "learning_rate": 6.395824229258435e-05, + "loss": 1.7863, + "step": 13934 + }, + { + "epoch": 4.277163904235728, + "grad_norm": 0.24470919370651245, + "learning_rate": 6.39534692773522e-05, + "loss": 1.7774, + "step": 13935 + }, + { + "epoch": 4.277470841006752, + "grad_norm": 0.2612632215023041, + "learning_rate": 6.39486961242237e-05, + "loss": 1.7536, + "step": 13936 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.26870301365852356, + "learning_rate": 6.3943922833246e-05, + "loss": 1.8177, + "step": 13937 + }, + { + "epoch": 4.278084714548803, + "grad_norm": 0.24445784091949463, + "learning_rate": 6.393914940446628e-05, + "loss": 1.7539, + "step": 
13938 + }, + { + "epoch": 4.278391651319828, + "grad_norm": 0.2622319757938385, + "learning_rate": 6.393437583793174e-05, + "loss": 1.8252, + "step": 13939 + }, + { + "epoch": 4.278698588090854, + "grad_norm": 0.2586652636528015, + "learning_rate": 6.39296021336895e-05, + "loss": 1.7975, + "step": 13940 + }, + { + "epoch": 4.279005524861878, + "grad_norm": 0.19488228857517242, + "learning_rate": 6.392482829178678e-05, + "loss": 1.7678, + "step": 13941 + }, + { + "epoch": 4.2793124616329035, + "grad_norm": 0.23956604301929474, + "learning_rate": 6.392005431227074e-05, + "loss": 1.7444, + "step": 13942 + }, + { + "epoch": 4.279619398403929, + "grad_norm": 0.24195842444896698, + "learning_rate": 6.391528019518857e-05, + "loss": 1.8116, + "step": 13943 + }, + { + "epoch": 4.279926335174954, + "grad_norm": 0.21479523181915283, + "learning_rate": 6.391050594058746e-05, + "loss": 1.7351, + "step": 13944 + }, + { + "epoch": 4.2802332719459795, + "grad_norm": 0.2309941202402115, + "learning_rate": 6.390573154851456e-05, + "loss": 1.8245, + "step": 13945 + }, + { + "epoch": 4.280540208717004, + "grad_norm": 0.2375536412000656, + "learning_rate": 6.390095701901706e-05, + "loss": 1.7921, + "step": 13946 + }, + { + "epoch": 4.280847145488029, + "grad_norm": 0.25518664717674255, + "learning_rate": 6.389618235214216e-05, + "loss": 1.7549, + "step": 13947 + }, + { + "epoch": 4.281154082259055, + "grad_norm": 0.2579016089439392, + "learning_rate": 6.389140754793705e-05, + "loss": 1.7637, + "step": 13948 + }, + { + "epoch": 4.28146101903008, + "grad_norm": 0.25350916385650635, + "learning_rate": 6.388663260644892e-05, + "loss": 1.746, + "step": 13949 + }, + { + "epoch": 4.281767955801105, + "grad_norm": 0.2994026839733124, + "learning_rate": 6.388185752772493e-05, + "loss": 1.8196, + "step": 13950 + }, + { + "epoch": 4.28207489257213, + "grad_norm": 0.29938533902168274, + "learning_rate": 6.387708231181229e-05, + "loss": 1.7187, + "step": 13951 + }, + { + "epoch": 4.282381829343155, 
+ "grad_norm": 0.23865137994289398, + "learning_rate": 6.387230695875819e-05, + "loss": 1.7317, + "step": 13952 + }, + { + "epoch": 4.28268876611418, + "grad_norm": 0.23812857270240784, + "learning_rate": 6.386753146860982e-05, + "loss": 1.7536, + "step": 13953 + }, + { + "epoch": 4.282995702885206, + "grad_norm": 0.3395650088787079, + "learning_rate": 6.386275584141438e-05, + "loss": 1.7932, + "step": 13954 + }, + { + "epoch": 4.283302639656231, + "grad_norm": 0.38207507133483887, + "learning_rate": 6.385798007721906e-05, + "loss": 1.8196, + "step": 13955 + }, + { + "epoch": 4.283609576427256, + "grad_norm": 0.32960978150367737, + "learning_rate": 6.385320417607107e-05, + "loss": 1.7898, + "step": 13956 + }, + { + "epoch": 4.283916513198281, + "grad_norm": 0.22978928685188293, + "learning_rate": 6.384842813801757e-05, + "loss": 1.7835, + "step": 13957 + }, + { + "epoch": 4.284223449969306, + "grad_norm": 0.24607588350772858, + "learning_rate": 6.38436519631058e-05, + "loss": 1.7829, + "step": 13958 + }, + { + "epoch": 4.2845303867403315, + "grad_norm": 0.2770270109176636, + "learning_rate": 6.383887565138295e-05, + "loss": 1.7294, + "step": 13959 + }, + { + "epoch": 4.284837323511357, + "grad_norm": 0.27644863724708557, + "learning_rate": 6.383409920289622e-05, + "loss": 1.829, + "step": 13960 + }, + { + "epoch": 4.285144260282382, + "grad_norm": 0.3870919942855835, + "learning_rate": 6.382932261769282e-05, + "loss": 1.8146, + "step": 13961 + }, + { + "epoch": 4.285451197053407, + "grad_norm": 0.3562348186969757, + "learning_rate": 6.382454589581994e-05, + "loss": 1.8225, + "step": 13962 + }, + { + "epoch": 4.285758133824432, + "grad_norm": 0.28444886207580566, + "learning_rate": 6.38197690373248e-05, + "loss": 1.7734, + "step": 13963 + }, + { + "epoch": 4.286065070595457, + "grad_norm": 0.27935758233070374, + "learning_rate": 6.381499204225459e-05, + "loss": 1.7402, + "step": 13964 + }, + { + "epoch": 4.286372007366483, + "grad_norm": 0.34188997745513916, + 
"learning_rate": 6.381021491065653e-05, + "loss": 1.7661, + "step": 13965 + }, + { + "epoch": 4.286678944137508, + "grad_norm": 0.28648918867111206, + "learning_rate": 6.380543764257785e-05, + "loss": 1.8312, + "step": 13966 + }, + { + "epoch": 4.286985880908533, + "grad_norm": 0.2733290493488312, + "learning_rate": 6.380066023806572e-05, + "loss": 1.7505, + "step": 13967 + }, + { + "epoch": 4.287292817679558, + "grad_norm": 0.3344273865222931, + "learning_rate": 6.37958826971674e-05, + "loss": 1.8392, + "step": 13968 + }, + { + "epoch": 4.287599754450583, + "grad_norm": 0.2655799090862274, + "learning_rate": 6.379110501993006e-05, + "loss": 1.7575, + "step": 13969 + }, + { + "epoch": 4.287906691221608, + "grad_norm": 0.2569151818752289, + "learning_rate": 6.378632720640095e-05, + "loss": 1.6619, + "step": 13970 + }, + { + "epoch": 4.288213627992634, + "grad_norm": 0.2477198988199234, + "learning_rate": 6.378154925662727e-05, + "loss": 1.7532, + "step": 13971 + }, + { + "epoch": 4.288520564763659, + "grad_norm": 0.2867630422115326, + "learning_rate": 6.377677117065624e-05, + "loss": 1.7725, + "step": 13972 + }, + { + "epoch": 4.2888275015346835, + "grad_norm": 0.28316137194633484, + "learning_rate": 6.37719929485351e-05, + "loss": 1.7628, + "step": 13973 + }, + { + "epoch": 4.289134438305709, + "grad_norm": 0.2934304475784302, + "learning_rate": 6.376721459031106e-05, + "loss": 1.7346, + "step": 13974 + }, + { + "epoch": 4.289441375076734, + "grad_norm": 0.22847147285938263, + "learning_rate": 6.376243609603129e-05, + "loss": 1.7409, + "step": 13975 + }, + { + "epoch": 4.2897483118477595, + "grad_norm": 0.360441118478775, + "learning_rate": 6.375765746574311e-05, + "loss": 1.808, + "step": 13976 + }, + { + "epoch": 4.290055248618785, + "grad_norm": 0.2750907242298126, + "learning_rate": 6.375287869949367e-05, + "loss": 1.8046, + "step": 13977 + }, + { + "epoch": 4.290362185389809, + "grad_norm": 0.26193201541900635, + "learning_rate": 6.374809979733022e-05, + 
"loss": 1.7097, + "step": 13978 + }, + { + "epoch": 4.290669122160835, + "grad_norm": 0.3282175064086914, + "learning_rate": 6.37433207593e-05, + "loss": 1.7924, + "step": 13979 + }, + { + "epoch": 4.29097605893186, + "grad_norm": 0.2845167815685272, + "learning_rate": 6.373854158545021e-05, + "loss": 1.7663, + "step": 13980 + }, + { + "epoch": 4.291282995702885, + "grad_norm": 0.21816621720790863, + "learning_rate": 6.37337622758281e-05, + "loss": 1.7368, + "step": 13981 + }, + { + "epoch": 4.291589932473911, + "grad_norm": 0.264272540807724, + "learning_rate": 6.372898283048094e-05, + "loss": 1.7377, + "step": 13982 + }, + { + "epoch": 4.291896869244935, + "grad_norm": 0.2182006686925888, + "learning_rate": 6.37242032494559e-05, + "loss": 1.8107, + "step": 13983 + }, + { + "epoch": 4.29220380601596, + "grad_norm": 0.26856422424316406, + "learning_rate": 6.371942353280023e-05, + "loss": 1.7708, + "step": 13984 + }, + { + "epoch": 4.292510742786986, + "grad_norm": 0.3025323748588562, + "learning_rate": 6.37146436805612e-05, + "loss": 1.7768, + "step": 13985 + }, + { + "epoch": 4.292817679558011, + "grad_norm": 0.2949144244194031, + "learning_rate": 6.3709863692786e-05, + "loss": 1.7848, + "step": 13986 + }, + { + "epoch": 4.293124616329036, + "grad_norm": 0.20670418441295624, + "learning_rate": 6.370508356952188e-05, + "loss": 1.7367, + "step": 13987 + }, + { + "epoch": 4.293431553100062, + "grad_norm": 0.2453860342502594, + "learning_rate": 6.370030331081611e-05, + "loss": 1.7246, + "step": 13988 + }, + { + "epoch": 4.293738489871086, + "grad_norm": 0.3413507044315338, + "learning_rate": 6.369552291671592e-05, + "loss": 1.7829, + "step": 13989 + }, + { + "epoch": 4.2940454266421115, + "grad_norm": 0.28352782130241394, + "learning_rate": 6.369074238726856e-05, + "loss": 1.7755, + "step": 13990 + }, + { + "epoch": 4.294352363413137, + "grad_norm": 0.21408751606941223, + "learning_rate": 6.368596172252124e-05, + "loss": 1.7292, + "step": 13991 + }, + { + "epoch": 
4.294659300184162, + "grad_norm": 0.28372085094451904, + "learning_rate": 6.36811809225212e-05, + "loss": 1.8197, + "step": 13992 + }, + { + "epoch": 4.2949662369551875, + "grad_norm": 0.2400829792022705, + "learning_rate": 6.367639998731573e-05, + "loss": 1.7559, + "step": 13993 + }, + { + "epoch": 4.295273173726212, + "grad_norm": 0.22853593528270721, + "learning_rate": 6.367161891695207e-05, + "loss": 1.8116, + "step": 13994 + }, + { + "epoch": 4.295580110497237, + "grad_norm": 0.22098208963871002, + "learning_rate": 6.366683771147745e-05, + "loss": 1.7269, + "step": 13995 + }, + { + "epoch": 4.295887047268263, + "grad_norm": 0.22293934226036072, + "learning_rate": 6.366205637093914e-05, + "loss": 1.7944, + "step": 13996 + }, + { + "epoch": 4.296193984039288, + "grad_norm": 0.26120004057884216, + "learning_rate": 6.365727489538437e-05, + "loss": 1.7581, + "step": 13997 + }, + { + "epoch": 4.296500920810313, + "grad_norm": 0.2568937838077545, + "learning_rate": 6.365249328486041e-05, + "loss": 1.7356, + "step": 13998 + }, + { + "epoch": 4.296807857581339, + "grad_norm": 0.2419043630361557, + "learning_rate": 6.364771153941449e-05, + "loss": 1.8127, + "step": 13999 + }, + { + "epoch": 4.297114794352363, + "grad_norm": 0.2521972060203552, + "learning_rate": 6.364292965909391e-05, + "loss": 1.7445, + "step": 14000 + }, + { + "epoch": 4.297421731123388, + "grad_norm": 0.3269292414188385, + "learning_rate": 6.363814764394589e-05, + "loss": 1.7835, + "step": 14001 + }, + { + "epoch": 4.297728667894414, + "grad_norm": 0.258405864238739, + "learning_rate": 6.36333654940177e-05, + "loss": 1.7407, + "step": 14002 + }, + { + "epoch": 4.298035604665439, + "grad_norm": 0.21527236700057983, + "learning_rate": 6.362858320935662e-05, + "loss": 1.7729, + "step": 14003 + }, + { + "epoch": 4.298342541436464, + "grad_norm": 0.25343602895736694, + "learning_rate": 6.362380079000988e-05, + "loss": 1.8087, + "step": 14004 + }, + { + "epoch": 4.298649478207489, + "grad_norm": 
0.26110637187957764, + "learning_rate": 6.361901823602474e-05, + "loss": 1.813, + "step": 14005 + }, + { + "epoch": 4.298956414978514, + "grad_norm": 0.26749926805496216, + "learning_rate": 6.361423554744851e-05, + "loss": 1.8193, + "step": 14006 + }, + { + "epoch": 4.2992633517495396, + "grad_norm": 0.22357676923274994, + "learning_rate": 6.360945272432841e-05, + "loss": 1.7498, + "step": 14007 + }, + { + "epoch": 4.299570288520565, + "grad_norm": 0.2367832362651825, + "learning_rate": 6.360466976671172e-05, + "loss": 1.7843, + "step": 14008 + }, + { + "epoch": 4.29987722529159, + "grad_norm": 0.23594366014003754, + "learning_rate": 6.35998866746457e-05, + "loss": 1.7442, + "step": 14009 + }, + { + "epoch": 4.300184162062616, + "grad_norm": 0.2660543918609619, + "learning_rate": 6.359510344817765e-05, + "loss": 1.7557, + "step": 14010 + }, + { + "epoch": 4.30049109883364, + "grad_norm": 0.191593199968338, + "learning_rate": 6.359032008735481e-05, + "loss": 1.7988, + "step": 14011 + }, + { + "epoch": 4.300798035604665, + "grad_norm": 0.2755490243434906, + "learning_rate": 6.358553659222447e-05, + "loss": 1.7551, + "step": 14012 + }, + { + "epoch": 4.301104972375691, + "grad_norm": 0.2900530993938446, + "learning_rate": 6.358075296283387e-05, + "loss": 1.7523, + "step": 14013 + }, + { + "epoch": 4.301411909146716, + "grad_norm": 0.22242774069309235, + "learning_rate": 6.357596919923033e-05, + "loss": 1.7626, + "step": 14014 + }, + { + "epoch": 4.301718845917741, + "grad_norm": 0.26636210083961487, + "learning_rate": 6.357118530146108e-05, + "loss": 1.7855, + "step": 14015 + }, + { + "epoch": 4.302025782688766, + "grad_norm": 0.3055269718170166, + "learning_rate": 6.356640126957344e-05, + "loss": 1.7528, + "step": 14016 + }, + { + "epoch": 4.302332719459791, + "grad_norm": 0.29695719480514526, + "learning_rate": 6.356161710361468e-05, + "loss": 1.7482, + "step": 14017 + }, + { + "epoch": 4.3026396562308165, + "grad_norm": 0.2369711697101593, + "learning_rate": 
6.355683280363207e-05, + "loss": 1.7635, + "step": 14018 + }, + { + "epoch": 4.302946593001842, + "grad_norm": 0.26681363582611084, + "learning_rate": 6.35520483696729e-05, + "loss": 1.8814, + "step": 14019 + }, + { + "epoch": 4.303253529772867, + "grad_norm": 0.2623308598995209, + "learning_rate": 6.354726380178442e-05, + "loss": 1.8645, + "step": 14020 + }, + { + "epoch": 4.303560466543892, + "grad_norm": 0.23326413333415985, + "learning_rate": 6.354247910001394e-05, + "loss": 1.8093, + "step": 14021 + }, + { + "epoch": 4.303867403314917, + "grad_norm": 0.3037295639514923, + "learning_rate": 6.353769426440875e-05, + "loss": 1.8556, + "step": 14022 + }, + { + "epoch": 4.304174340085942, + "grad_norm": 0.23624882102012634, + "learning_rate": 6.353290929501616e-05, + "loss": 1.803, + "step": 14023 + }, + { + "epoch": 4.304481276856968, + "grad_norm": 0.22106927633285522, + "learning_rate": 6.35281241918834e-05, + "loss": 1.7133, + "step": 14024 + }, + { + "epoch": 4.304788213627993, + "grad_norm": 0.2374040186405182, + "learning_rate": 6.352333895505778e-05, + "loss": 1.8127, + "step": 14025 + }, + { + "epoch": 4.305095150399017, + "grad_norm": 0.2782450318336487, + "learning_rate": 6.35185535845866e-05, + "loss": 1.8613, + "step": 14026 + }, + { + "epoch": 4.305402087170043, + "grad_norm": 0.2527763843536377, + "learning_rate": 6.351376808051717e-05, + "loss": 1.7533, + "step": 14027 + }, + { + "epoch": 4.305709023941068, + "grad_norm": 0.2462318390607834, + "learning_rate": 6.350898244289675e-05, + "loss": 1.8075, + "step": 14028 + }, + { + "epoch": 4.306015960712093, + "grad_norm": 0.2646189332008362, + "learning_rate": 6.350419667177265e-05, + "loss": 1.8261, + "step": 14029 + }, + { + "epoch": 4.306322897483119, + "grad_norm": 0.24918611347675323, + "learning_rate": 6.349941076719218e-05, + "loss": 1.7542, + "step": 14030 + }, + { + "epoch": 4.306629834254144, + "grad_norm": 0.22440841794013977, + "learning_rate": 6.349462472920259e-05, + "loss": 1.7897, + 
"step": 14031 + }, + { + "epoch": 4.3069367710251685, + "grad_norm": 0.28614330291748047, + "learning_rate": 6.348983855785121e-05, + "loss": 1.88, + "step": 14032 + }, + { + "epoch": 4.307243707796194, + "grad_norm": 0.25015848875045776, + "learning_rate": 6.348505225318535e-05, + "loss": 1.8008, + "step": 14033 + }, + { + "epoch": 4.307550644567219, + "grad_norm": 0.2468707263469696, + "learning_rate": 6.34802658152523e-05, + "loss": 1.8025, + "step": 14034 + }, + { + "epoch": 4.3078575813382445, + "grad_norm": 0.30504748225212097, + "learning_rate": 6.347547924409937e-05, + "loss": 1.8765, + "step": 14035 + }, + { + "epoch": 4.30816451810927, + "grad_norm": 0.35419392585754395, + "learning_rate": 6.347069253977385e-05, + "loss": 1.7807, + "step": 14036 + }, + { + "epoch": 4.308471454880294, + "grad_norm": 0.33683931827545166, + "learning_rate": 6.346590570232305e-05, + "loss": 1.7244, + "step": 14037 + }, + { + "epoch": 4.30877839165132, + "grad_norm": 0.3339467942714691, + "learning_rate": 6.346111873179427e-05, + "loss": 1.7642, + "step": 14038 + }, + { + "epoch": 4.309085328422345, + "grad_norm": 0.2369392216205597, + "learning_rate": 6.345633162823484e-05, + "loss": 1.7127, + "step": 14039 + }, + { + "epoch": 4.30939226519337, + "grad_norm": 0.26469686627388, + "learning_rate": 6.345154439169206e-05, + "loss": 1.7235, + "step": 14040 + }, + { + "epoch": 4.309699201964396, + "grad_norm": 0.2737344205379486, + "learning_rate": 6.344675702221321e-05, + "loss": 1.783, + "step": 14041 + }, + { + "epoch": 4.310006138735421, + "grad_norm": 0.2381773442029953, + "learning_rate": 6.344196951984565e-05, + "loss": 1.7172, + "step": 14042 + }, + { + "epoch": 4.310313075506445, + "grad_norm": 0.28199076652526855, + "learning_rate": 6.343718188463663e-05, + "loss": 1.8315, + "step": 14043 + }, + { + "epoch": 4.310620012277471, + "grad_norm": 0.24378590285778046, + "learning_rate": 6.343239411663353e-05, + "loss": 1.7828, + "step": 14044 + }, + { + "epoch": 
4.310926949048496, + "grad_norm": 0.26343944668769836, + "learning_rate": 6.342760621588365e-05, + "loss": 1.7679, + "step": 14045 + }, + { + "epoch": 4.311233885819521, + "grad_norm": 0.23703521490097046, + "learning_rate": 6.342281818243427e-05, + "loss": 1.7885, + "step": 14046 + }, + { + "epoch": 4.311540822590547, + "grad_norm": 0.2230173498392105, + "learning_rate": 6.341803001633276e-05, + "loss": 1.767, + "step": 14047 + }, + { + "epoch": 4.311847759361571, + "grad_norm": 0.249002143740654, + "learning_rate": 6.34132417176264e-05, + "loss": 1.8032, + "step": 14048 + }, + { + "epoch": 4.3121546961325965, + "grad_norm": 0.2383791208267212, + "learning_rate": 6.34084532863625e-05, + "loss": 1.7558, + "step": 14049 + }, + { + "epoch": 4.312461632903622, + "grad_norm": 0.2783047556877136, + "learning_rate": 6.340366472258843e-05, + "loss": 1.8389, + "step": 14050 + }, + { + "epoch": 4.312768569674647, + "grad_norm": 0.2654891312122345, + "learning_rate": 6.339887602635148e-05, + "loss": 1.7989, + "step": 14051 + }, + { + "epoch": 4.3130755064456725, + "grad_norm": 0.2638411521911621, + "learning_rate": 6.3394087197699e-05, + "loss": 1.8707, + "step": 14052 + }, + { + "epoch": 4.313382443216697, + "grad_norm": 0.3026179075241089, + "learning_rate": 6.338929823667829e-05, + "loss": 1.7892, + "step": 14053 + }, + { + "epoch": 4.313689379987722, + "grad_norm": 0.27496880292892456, + "learning_rate": 6.338450914333668e-05, + "loss": 1.7398, + "step": 14054 + }, + { + "epoch": 4.313996316758748, + "grad_norm": 0.2601073086261749, + "learning_rate": 6.337971991772151e-05, + "loss": 1.7646, + "step": 14055 + }, + { + "epoch": 4.314303253529773, + "grad_norm": 0.2061719298362732, + "learning_rate": 6.337493055988011e-05, + "loss": 1.7372, + "step": 14056 + }, + { + "epoch": 4.314610190300798, + "grad_norm": 0.23722340166568756, + "learning_rate": 6.337014106985981e-05, + "loss": 1.7457, + "step": 14057 + }, + { + "epoch": 4.314917127071823, + "grad_norm": 
0.2729428708553314, + "learning_rate": 6.336535144770793e-05, + "loss": 1.8423, + "step": 14058 + }, + { + "epoch": 4.315224063842848, + "grad_norm": 0.23520450294017792, + "learning_rate": 6.336056169347182e-05, + "loss": 1.8124, + "step": 14059 + }, + { + "epoch": 4.315531000613873, + "grad_norm": 0.25142738223075867, + "learning_rate": 6.33557718071988e-05, + "loss": 1.7285, + "step": 14060 + }, + { + "epoch": 4.315837937384899, + "grad_norm": 0.24833035469055176, + "learning_rate": 6.335098178893621e-05, + "loss": 1.766, + "step": 14061 + }, + { + "epoch": 4.316144874155924, + "grad_norm": 0.2406177669763565, + "learning_rate": 6.334619163873141e-05, + "loss": 1.8824, + "step": 14062 + }, + { + "epoch": 4.316451810926949, + "grad_norm": 0.23077574372291565, + "learning_rate": 6.334140135663172e-05, + "loss": 1.7589, + "step": 14063 + }, + { + "epoch": 4.316758747697974, + "grad_norm": 0.20476560294628143, + "learning_rate": 6.333661094268448e-05, + "loss": 1.7331, + "step": 14064 + }, + { + "epoch": 4.317065684468999, + "grad_norm": 0.207991823554039, + "learning_rate": 6.333182039693704e-05, + "loss": 1.6876, + "step": 14065 + }, + { + "epoch": 4.3173726212400245, + "grad_norm": 0.20813052356243134, + "learning_rate": 6.332702971943671e-05, + "loss": 1.775, + "step": 14066 + }, + { + "epoch": 4.31767955801105, + "grad_norm": 0.2470991462469101, + "learning_rate": 6.332223891023087e-05, + "loss": 1.7673, + "step": 14067 + }, + { + "epoch": 4.317986494782075, + "grad_norm": 0.23855723440647125, + "learning_rate": 6.331744796936687e-05, + "loss": 1.7842, + "step": 14068 + }, + { + "epoch": 4.3182934315531, + "grad_norm": 0.21852652728557587, + "learning_rate": 6.331265689689204e-05, + "loss": 1.7727, + "step": 14069 + }, + { + "epoch": 4.318600368324125, + "grad_norm": 0.284496545791626, + "learning_rate": 6.330786569285374e-05, + "loss": 1.8248, + "step": 14070 + }, + { + "epoch": 4.31890730509515, + "grad_norm": 0.21709981560707092, + "learning_rate": 
6.33030743572993e-05, + "loss": 1.7547, + "step": 14071 + }, + { + "epoch": 4.319214241866176, + "grad_norm": 0.24209457635879517, + "learning_rate": 6.329828289027608e-05, + "loss": 1.7695, + "step": 14072 + }, + { + "epoch": 4.319521178637201, + "grad_norm": 0.24869373440742493, + "learning_rate": 6.329349129183144e-05, + "loss": 1.8204, + "step": 14073 + }, + { + "epoch": 4.319828115408226, + "grad_norm": 0.21702703833580017, + "learning_rate": 6.328869956201274e-05, + "loss": 1.779, + "step": 14074 + }, + { + "epoch": 4.320135052179251, + "grad_norm": 0.22993850708007812, + "learning_rate": 6.328390770086731e-05, + "loss": 1.7935, + "step": 14075 + }, + { + "epoch": 4.320441988950276, + "grad_norm": 0.23491734266281128, + "learning_rate": 6.327911570844252e-05, + "loss": 1.7261, + "step": 14076 + }, + { + "epoch": 4.320748925721301, + "grad_norm": 0.2479303777217865, + "learning_rate": 6.327432358478571e-05, + "loss": 1.7683, + "step": 14077 + }, + { + "epoch": 4.321055862492327, + "grad_norm": 0.24261580407619476, + "learning_rate": 6.326953132994427e-05, + "loss": 1.7147, + "step": 14078 + }, + { + "epoch": 4.321362799263352, + "grad_norm": 0.24627646803855896, + "learning_rate": 6.326473894396553e-05, + "loss": 1.7976, + "step": 14079 + }, + { + "epoch": 4.3216697360343765, + "grad_norm": 0.269149512052536, + "learning_rate": 6.325994642689688e-05, + "loss": 1.7247, + "step": 14080 + }, + { + "epoch": 4.321976672805402, + "grad_norm": 0.4162158966064453, + "learning_rate": 6.325515377878566e-05, + "loss": 1.7485, + "step": 14081 + }, + { + "epoch": 4.322283609576427, + "grad_norm": 0.366459459066391, + "learning_rate": 6.325036099967925e-05, + "loss": 1.7286, + "step": 14082 + }, + { + "epoch": 4.3225905463474525, + "grad_norm": 0.2465270757675171, + "learning_rate": 6.324556808962499e-05, + "loss": 1.8097, + "step": 14083 + }, + { + "epoch": 4.322897483118478, + "grad_norm": 0.2911076843738556, + "learning_rate": 6.324077504867026e-05, + "loss": 1.7979, + 
"step": 14084 + }, + { + "epoch": 4.323204419889503, + "grad_norm": 0.33455169200897217, + "learning_rate": 6.323598187686245e-05, + "loss": 1.7988, + "step": 14085 + }, + { + "epoch": 4.323511356660528, + "grad_norm": 0.25020337104797363, + "learning_rate": 6.32311885742489e-05, + "loss": 1.7184, + "step": 14086 + }, + { + "epoch": 4.323818293431553, + "grad_norm": 0.23941513895988464, + "learning_rate": 6.322639514087699e-05, + "loss": 1.7672, + "step": 14087 + }, + { + "epoch": 4.324125230202578, + "grad_norm": 0.35258981585502625, + "learning_rate": 6.32216015767941e-05, + "loss": 1.7571, + "step": 14088 + }, + { + "epoch": 4.324432166973604, + "grad_norm": 0.2854993939399719, + "learning_rate": 6.321680788204758e-05, + "loss": 1.8096, + "step": 14089 + }, + { + "epoch": 4.324739103744629, + "grad_norm": 0.24422863125801086, + "learning_rate": 6.321201405668482e-05, + "loss": 1.778, + "step": 14090 + }, + { + "epoch": 4.3250460405156534, + "grad_norm": 0.36629122495651245, + "learning_rate": 6.320722010075321e-05, + "loss": 1.716, + "step": 14091 + }, + { + "epoch": 4.325352977286679, + "grad_norm": 0.37115517258644104, + "learning_rate": 6.32024260143001e-05, + "loss": 1.77, + "step": 14092 + }, + { + "epoch": 4.325659914057704, + "grad_norm": 0.21540327370166779, + "learning_rate": 6.319763179737288e-05, + "loss": 1.7529, + "step": 14093 + }, + { + "epoch": 4.3259668508287294, + "grad_norm": 0.2573898732662201, + "learning_rate": 6.319283745001892e-05, + "loss": 1.8101, + "step": 14094 + }, + { + "epoch": 4.326273787599755, + "grad_norm": 0.29481247067451477, + "learning_rate": 6.31880429722856e-05, + "loss": 1.7459, + "step": 14095 + }, + { + "epoch": 4.326580724370779, + "grad_norm": 0.23474647104740143, + "learning_rate": 6.318324836422031e-05, + "loss": 1.786, + "step": 14096 + }, + { + "epoch": 4.326887661141805, + "grad_norm": 0.2884673476219177, + "learning_rate": 6.317845362587045e-05, + "loss": 1.8123, + "step": 14097 + }, + { + "epoch": 
4.32719459791283, + "grad_norm": 0.39008447527885437, + "learning_rate": 6.317365875728338e-05, + "loss": 1.7729, + "step": 14098 + }, + { + "epoch": 4.327501534683855, + "grad_norm": 0.30568063259124756, + "learning_rate": 6.316886375850651e-05, + "loss": 1.7088, + "step": 14099 + }, + { + "epoch": 4.327808471454881, + "grad_norm": 0.2538018524646759, + "learning_rate": 6.316406862958718e-05, + "loss": 1.8028, + "step": 14100 + }, + { + "epoch": 4.328115408225905, + "grad_norm": 0.3815068006515503, + "learning_rate": 6.315927337057281e-05, + "loss": 1.7143, + "step": 14101 + }, + { + "epoch": 4.32842234499693, + "grad_norm": 0.3813243508338928, + "learning_rate": 6.31544779815108e-05, + "loss": 1.7072, + "step": 14102 + }, + { + "epoch": 4.328729281767956, + "grad_norm": 0.22438868880271912, + "learning_rate": 6.314968246244852e-05, + "loss": 1.7445, + "step": 14103 + }, + { + "epoch": 4.329036218538981, + "grad_norm": 0.3818886876106262, + "learning_rate": 6.314488681343337e-05, + "loss": 1.8292, + "step": 14104 + }, + { + "epoch": 4.329343155310006, + "grad_norm": 0.4376567006111145, + "learning_rate": 6.314009103451277e-05, + "loss": 1.8224, + "step": 14105 + }, + { + "epoch": 4.329650092081032, + "grad_norm": 0.2741515636444092, + "learning_rate": 6.313529512573406e-05, + "loss": 1.8078, + "step": 14106 + }, + { + "epoch": 4.329957028852056, + "grad_norm": 0.264343798160553, + "learning_rate": 6.313049908714467e-05, + "loss": 1.7314, + "step": 14107 + }, + { + "epoch": 4.3302639656230815, + "grad_norm": 0.3601943552494049, + "learning_rate": 6.312570291879201e-05, + "loss": 1.7351, + "step": 14108 + }, + { + "epoch": 4.330570902394107, + "grad_norm": 0.2931751012802124, + "learning_rate": 6.312090662072345e-05, + "loss": 1.8117, + "step": 14109 + }, + { + "epoch": 4.330877839165132, + "grad_norm": 0.27670225501060486, + "learning_rate": 6.31161101929864e-05, + "loss": 1.7707, + "step": 14110 + }, + { + "epoch": 4.3311847759361575, + "grad_norm": 
0.33669596910476685, + "learning_rate": 6.311131363562825e-05, + "loss": 1.7337, + "step": 14111 + }, + { + "epoch": 4.331491712707182, + "grad_norm": 0.232634037733078, + "learning_rate": 6.310651694869643e-05, + "loss": 1.7372, + "step": 14112 + }, + { + "epoch": 4.331798649478207, + "grad_norm": 0.28611311316490173, + "learning_rate": 6.310172013223832e-05, + "loss": 1.6977, + "step": 14113 + }, + { + "epoch": 4.332105586249233, + "grad_norm": 0.30207201838493347, + "learning_rate": 6.309692318630132e-05, + "loss": 1.7765, + "step": 14114 + }, + { + "epoch": 4.332412523020258, + "grad_norm": 0.20757484436035156, + "learning_rate": 6.309212611093287e-05, + "loss": 1.697, + "step": 14115 + }, + { + "epoch": 4.332719459791283, + "grad_norm": 0.31472963094711304, + "learning_rate": 6.308732890618034e-05, + "loss": 1.7757, + "step": 14116 + }, + { + "epoch": 4.333026396562309, + "grad_norm": 0.37042325735092163, + "learning_rate": 6.308253157209117e-05, + "loss": 1.7745, + "step": 14117 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.25001442432403564, + "learning_rate": 6.307773410871275e-05, + "loss": 1.7461, + "step": 14118 + }, + { + "epoch": 4.333640270104358, + "grad_norm": 0.2691943347454071, + "learning_rate": 6.307293651609248e-05, + "loss": 1.7539, + "step": 14119 + }, + { + "epoch": 4.333947206875384, + "grad_norm": 0.30845868587493896, + "learning_rate": 6.306813879427782e-05, + "loss": 1.7559, + "step": 14120 + }, + { + "epoch": 4.334254143646409, + "grad_norm": 0.2244730293750763, + "learning_rate": 6.306334094331613e-05, + "loss": 1.7609, + "step": 14121 + }, + { + "epoch": 4.334561080417434, + "grad_norm": 0.32132062315940857, + "learning_rate": 6.305854296325485e-05, + "loss": 1.7837, + "step": 14122 + }, + { + "epoch": 4.334868017188459, + "grad_norm": 0.3762948513031006, + "learning_rate": 6.30537448541414e-05, + "loss": 1.7631, + "step": 14123 + }, + { + "epoch": 4.335174953959484, + "grad_norm": 0.24174273014068604, + "learning_rate": 
6.30489466160232e-05, + "loss": 1.7532, + "step": 14124 + }, + { + "epoch": 4.3354818907305095, + "grad_norm": 0.23468497395515442, + "learning_rate": 6.304414824894765e-05, + "loss": 1.7731, + "step": 14125 + }, + { + "epoch": 4.335788827501535, + "grad_norm": 0.29086077213287354, + "learning_rate": 6.303934975296218e-05, + "loss": 1.7668, + "step": 14126 + }, + { + "epoch": 4.33609576427256, + "grad_norm": 0.2889879643917084, + "learning_rate": 6.303455112811422e-05, + "loss": 1.8188, + "step": 14127 + }, + { + "epoch": 4.336402701043585, + "grad_norm": 0.2335619181394577, + "learning_rate": 6.302975237445119e-05, + "loss": 1.7944, + "step": 14128 + }, + { + "epoch": 4.33670963781461, + "grad_norm": 0.29027310013771057, + "learning_rate": 6.302495349202051e-05, + "loss": 1.7771, + "step": 14129 + }, + { + "epoch": 4.337016574585635, + "grad_norm": 0.31961241364479065, + "learning_rate": 6.302015448086959e-05, + "loss": 1.8187, + "step": 14130 + }, + { + "epoch": 4.337323511356661, + "grad_norm": 0.26015788316726685, + "learning_rate": 6.301535534104587e-05, + "loss": 1.7819, + "step": 14131 + }, + { + "epoch": 4.337630448127686, + "grad_norm": 0.2440631091594696, + "learning_rate": 6.30105560725968e-05, + "loss": 1.7127, + "step": 14132 + }, + { + "epoch": 4.337937384898711, + "grad_norm": 0.304441899061203, + "learning_rate": 6.300575667556979e-05, + "loss": 1.7619, + "step": 14133 + }, + { + "epoch": 4.338244321669736, + "grad_norm": 0.3085228204727173, + "learning_rate": 6.300095715001226e-05, + "loss": 1.8287, + "step": 14134 + }, + { + "epoch": 4.338551258440761, + "grad_norm": 0.2863372564315796, + "learning_rate": 6.299615749597165e-05, + "loss": 1.8068, + "step": 14135 + }, + { + "epoch": 4.338858195211786, + "grad_norm": 0.25255265831947327, + "learning_rate": 6.299135771349537e-05, + "loss": 1.7506, + "step": 14136 + }, + { + "epoch": 4.339165131982812, + "grad_norm": 0.30224961042404175, + "learning_rate": 6.298655780263092e-05, + "loss": 1.7292, + 
"step": 14137 + }, + { + "epoch": 4.339472068753837, + "grad_norm": 0.24222104251384735, + "learning_rate": 6.298175776342567e-05, + "loss": 1.7616, + "step": 14138 + }, + { + "epoch": 4.3397790055248615, + "grad_norm": 0.3236368000507355, + "learning_rate": 6.29769575959271e-05, + "loss": 1.787, + "step": 14139 + }, + { + "epoch": 4.340085942295887, + "grad_norm": 0.26049408316612244, + "learning_rate": 6.297215730018261e-05, + "loss": 1.7108, + "step": 14140 + }, + { + "epoch": 4.340392879066912, + "grad_norm": 0.22833532094955444, + "learning_rate": 6.296735687623967e-05, + "loss": 1.7661, + "step": 14141 + }, + { + "epoch": 4.3406998158379375, + "grad_norm": 0.28397905826568604, + "learning_rate": 6.296255632414571e-05, + "loss": 1.7163, + "step": 14142 + }, + { + "epoch": 4.341006752608963, + "grad_norm": 0.3072611093521118, + "learning_rate": 6.295775564394817e-05, + "loss": 1.857, + "step": 14143 + }, + { + "epoch": 4.341313689379987, + "grad_norm": 0.22901058197021484, + "learning_rate": 6.295295483569448e-05, + "loss": 1.7325, + "step": 14144 + }, + { + "epoch": 4.341620626151013, + "grad_norm": 0.27433091402053833, + "learning_rate": 6.294815389943212e-05, + "loss": 1.8229, + "step": 14145 + }, + { + "epoch": 4.341927562922038, + "grad_norm": 0.2635616958141327, + "learning_rate": 6.29433528352085e-05, + "loss": 1.7585, + "step": 14146 + }, + { + "epoch": 4.342234499693063, + "grad_norm": 0.29129260778427124, + "learning_rate": 6.293855164307108e-05, + "loss": 1.8294, + "step": 14147 + }, + { + "epoch": 4.342541436464089, + "grad_norm": 0.3429001569747925, + "learning_rate": 6.293375032306731e-05, + "loss": 1.7725, + "step": 14148 + }, + { + "epoch": 4.342848373235114, + "grad_norm": 0.22407259047031403, + "learning_rate": 6.292894887524464e-05, + "loss": 1.7018, + "step": 14149 + }, + { + "epoch": 4.343155310006138, + "grad_norm": 0.3319321274757385, + "learning_rate": 6.292414729965053e-05, + "loss": 1.8472, + "step": 14150 + }, + { + "epoch": 
4.343462246777164, + "grad_norm": 0.42744341492652893, + "learning_rate": 6.291934559633241e-05, + "loss": 1.8118, + "step": 14151 + }, + { + "epoch": 4.343769183548189, + "grad_norm": 0.24572840332984924, + "learning_rate": 6.291454376533774e-05, + "loss": 1.7184, + "step": 14152 + }, + { + "epoch": 4.344076120319214, + "grad_norm": 0.2485980987548828, + "learning_rate": 6.290974180671397e-05, + "loss": 1.7649, + "step": 14153 + }, + { + "epoch": 4.34438305709024, + "grad_norm": 0.3911706209182739, + "learning_rate": 6.29049397205086e-05, + "loss": 1.8105, + "step": 14154 + }, + { + "epoch": 4.344689993861264, + "grad_norm": 0.3008342981338501, + "learning_rate": 6.290013750676902e-05, + "loss": 1.7671, + "step": 14155 + }, + { + "epoch": 4.3449969306322895, + "grad_norm": 0.2072051614522934, + "learning_rate": 6.289533516554274e-05, + "loss": 1.7406, + "step": 14156 + }, + { + "epoch": 4.345303867403315, + "grad_norm": 0.3047312796115875, + "learning_rate": 6.289053269687719e-05, + "loss": 1.8133, + "step": 14157 + }, + { + "epoch": 4.34561080417434, + "grad_norm": 0.28260552883148193, + "learning_rate": 6.288573010081984e-05, + "loss": 1.7253, + "step": 14158 + }, + { + "epoch": 4.3459177409453655, + "grad_norm": 0.2474137246608734, + "learning_rate": 6.288092737741815e-05, + "loss": 1.822, + "step": 14159 + }, + { + "epoch": 4.346224677716391, + "grad_norm": 0.23717878758907318, + "learning_rate": 6.287612452671961e-05, + "loss": 1.7826, + "step": 14160 + }, + { + "epoch": 4.346531614487415, + "grad_norm": 0.2646107077598572, + "learning_rate": 6.287132154877163e-05, + "loss": 1.8118, + "step": 14161 + }, + { + "epoch": 4.346838551258441, + "grad_norm": 0.22026480734348297, + "learning_rate": 6.286651844362172e-05, + "loss": 1.7767, + "step": 14162 + }, + { + "epoch": 4.347145488029466, + "grad_norm": 0.2692350447177887, + "learning_rate": 6.286171521131733e-05, + "loss": 1.8718, + "step": 14163 + }, + { + "epoch": 4.347452424800491, + "grad_norm": 
0.2749998867511749, + "learning_rate": 6.285691185190592e-05, + "loss": 1.7689, + "step": 14164 + }, + { + "epoch": 4.347759361571517, + "grad_norm": 0.24552448093891144, + "learning_rate": 6.2852108365435e-05, + "loss": 1.8049, + "step": 14165 + }, + { + "epoch": 4.348066298342541, + "grad_norm": 0.20530807971954346, + "learning_rate": 6.2847304751952e-05, + "loss": 1.7606, + "step": 14166 + }, + { + "epoch": 4.348373235113566, + "grad_norm": 0.23396088182926178, + "learning_rate": 6.28425010115044e-05, + "loss": 1.7482, + "step": 14167 + }, + { + "epoch": 4.348680171884592, + "grad_norm": 0.20512452721595764, + "learning_rate": 6.283769714413968e-05, + "loss": 1.6976, + "step": 14168 + }, + { + "epoch": 4.348987108655617, + "grad_norm": 0.20287172496318817, + "learning_rate": 6.283289314990531e-05, + "loss": 1.7439, + "step": 14169 + }, + { + "epoch": 4.349294045426642, + "grad_norm": 0.2193746268749237, + "learning_rate": 6.282808902884876e-05, + "loss": 1.763, + "step": 14170 + }, + { + "epoch": 4.349600982197667, + "grad_norm": 0.20415273308753967, + "learning_rate": 6.282328478101753e-05, + "loss": 1.7025, + "step": 14171 + }, + { + "epoch": 4.349907918968692, + "grad_norm": 0.19286803901195526, + "learning_rate": 6.281848040645907e-05, + "loss": 1.7529, + "step": 14172 + }, + { + "epoch": 4.350214855739718, + "grad_norm": 0.20908218622207642, + "learning_rate": 6.281367590522088e-05, + "loss": 1.7896, + "step": 14173 + }, + { + "epoch": 4.350521792510743, + "grad_norm": 0.2599989175796509, + "learning_rate": 6.280887127735045e-05, + "loss": 1.764, + "step": 14174 + }, + { + "epoch": 4.350828729281768, + "grad_norm": 0.23955710232257843, + "learning_rate": 6.280406652289523e-05, + "loss": 1.7321, + "step": 14175 + }, + { + "epoch": 4.351135666052793, + "grad_norm": 0.2311990112066269, + "learning_rate": 6.279926164190272e-05, + "loss": 1.7338, + "step": 14176 + }, + { + "epoch": 4.351442602823818, + "grad_norm": 0.2599658966064453, + "learning_rate": 
6.27944566344204e-05, + "loss": 1.7444, + "step": 14177 + }, + { + "epoch": 4.351749539594843, + "grad_norm": 0.23079386353492737, + "learning_rate": 6.278965150049579e-05, + "loss": 1.7011, + "step": 14178 + }, + { + "epoch": 4.352056476365869, + "grad_norm": 0.24844171106815338, + "learning_rate": 6.278484624017631e-05, + "loss": 1.7298, + "step": 14179 + }, + { + "epoch": 4.352363413136894, + "grad_norm": 0.24839860200881958, + "learning_rate": 6.27800408535095e-05, + "loss": 1.7717, + "step": 14180 + }, + { + "epoch": 4.352670349907919, + "grad_norm": 0.2652966380119324, + "learning_rate": 6.277523534054284e-05, + "loss": 1.7759, + "step": 14181 + }, + { + "epoch": 4.352977286678944, + "grad_norm": 0.2787603735923767, + "learning_rate": 6.277042970132381e-05, + "loss": 1.8981, + "step": 14182 + }, + { + "epoch": 4.353284223449969, + "grad_norm": 0.2535475194454193, + "learning_rate": 6.276562393589991e-05, + "loss": 1.7538, + "step": 14183 + }, + { + "epoch": 4.3535911602209945, + "grad_norm": 0.3210967183113098, + "learning_rate": 6.276081804431863e-05, + "loss": 1.7087, + "step": 14184 + }, + { + "epoch": 4.35389809699202, + "grad_norm": 0.29936519265174866, + "learning_rate": 6.275601202662749e-05, + "loss": 1.7647, + "step": 14185 + }, + { + "epoch": 4.354205033763045, + "grad_norm": 0.21980762481689453, + "learning_rate": 6.275120588287394e-05, + "loss": 1.7759, + "step": 14186 + }, + { + "epoch": 4.35451197053407, + "grad_norm": 0.26833051443099976, + "learning_rate": 6.274639961310549e-05, + "loss": 1.7648, + "step": 14187 + }, + { + "epoch": 4.354818907305095, + "grad_norm": 0.27998095750808716, + "learning_rate": 6.274159321736966e-05, + "loss": 1.746, + "step": 14188 + }, + { + "epoch": 4.35512584407612, + "grad_norm": 0.21354494988918304, + "learning_rate": 6.273678669571395e-05, + "loss": 1.7417, + "step": 14189 + }, + { + "epoch": 4.355432780847146, + "grad_norm": 0.2295297235250473, + "learning_rate": 6.273198004818583e-05, + "loss": 1.7805, + 
"step": 14190 + }, + { + "epoch": 4.355739717618171, + "grad_norm": 0.2416422963142395, + "learning_rate": 6.272717327483283e-05, + "loss": 1.73, + "step": 14191 + }, + { + "epoch": 4.356046654389196, + "grad_norm": 0.2685304880142212, + "learning_rate": 6.272236637570244e-05, + "loss": 1.7936, + "step": 14192 + }, + { + "epoch": 4.356353591160221, + "grad_norm": 0.32481294870376587, + "learning_rate": 6.271755935084218e-05, + "loss": 1.7192, + "step": 14193 + }, + { + "epoch": 4.356660527931246, + "grad_norm": 0.2428581267595291, + "learning_rate": 6.271275220029954e-05, + "loss": 1.7428, + "step": 14194 + }, + { + "epoch": 4.356967464702271, + "grad_norm": 0.2266654521226883, + "learning_rate": 6.270794492412203e-05, + "loss": 1.7266, + "step": 14195 + }, + { + "epoch": 4.357274401473297, + "grad_norm": 0.25062093138694763, + "learning_rate": 6.270313752235716e-05, + "loss": 1.7476, + "step": 14196 + }, + { + "epoch": 4.357581338244322, + "grad_norm": 0.24085770547389984, + "learning_rate": 6.269832999505244e-05, + "loss": 1.7981, + "step": 14197 + }, + { + "epoch": 4.3578882750153465, + "grad_norm": 0.27035796642303467, + "learning_rate": 6.269352234225536e-05, + "loss": 1.8867, + "step": 14198 + }, + { + "epoch": 4.358195211786372, + "grad_norm": 0.22464458644390106, + "learning_rate": 6.268871456401348e-05, + "loss": 1.7514, + "step": 14199 + }, + { + "epoch": 4.358502148557397, + "grad_norm": 0.22485734522342682, + "learning_rate": 6.268390666037427e-05, + "loss": 1.7558, + "step": 14200 + }, + { + "epoch": 4.3588090853284225, + "grad_norm": 0.2052135169506073, + "learning_rate": 6.267909863138527e-05, + "loss": 1.7453, + "step": 14201 + }, + { + "epoch": 4.359116022099448, + "grad_norm": 0.2130763679742813, + "learning_rate": 6.267429047709397e-05, + "loss": 1.7712, + "step": 14202 + }, + { + "epoch": 4.359422958870473, + "grad_norm": 0.23146997392177582, + "learning_rate": 6.266948219754793e-05, + "loss": 1.6978, + "step": 14203 + }, + { + "epoch": 
4.359729895641498, + "grad_norm": 0.21657225489616394, + "learning_rate": 6.266467379279463e-05, + "loss": 1.7641, + "step": 14204 + }, + { + "epoch": 4.360036832412523, + "grad_norm": 0.2598700523376465, + "learning_rate": 6.265986526288158e-05, + "loss": 1.7956, + "step": 14205 + }, + { + "epoch": 4.360343769183548, + "grad_norm": 0.23497453331947327, + "learning_rate": 6.265505660785633e-05, + "loss": 1.7835, + "step": 14206 + }, + { + "epoch": 4.360650705954574, + "grad_norm": 0.2491760104894638, + "learning_rate": 6.265024782776641e-05, + "loss": 1.8454, + "step": 14207 + }, + { + "epoch": 4.360957642725599, + "grad_norm": 0.224884033203125, + "learning_rate": 6.264543892265932e-05, + "loss": 1.8383, + "step": 14208 + }, + { + "epoch": 4.361264579496623, + "grad_norm": 0.24057646095752716, + "learning_rate": 6.264062989258259e-05, + "loss": 1.7437, + "step": 14209 + }, + { + "epoch": 4.361571516267649, + "grad_norm": 0.24661841988563538, + "learning_rate": 6.263582073758374e-05, + "loss": 1.8151, + "step": 14210 + }, + { + "epoch": 4.361878453038674, + "grad_norm": 0.24618980288505554, + "learning_rate": 6.263101145771031e-05, + "loss": 1.7955, + "step": 14211 + }, + { + "epoch": 4.362185389809699, + "grad_norm": 0.2615448236465454, + "learning_rate": 6.262620205300981e-05, + "loss": 1.7819, + "step": 14212 + }, + { + "epoch": 4.362492326580725, + "grad_norm": 0.3528309464454651, + "learning_rate": 6.26213925235298e-05, + "loss": 1.7723, + "step": 14213 + }, + { + "epoch": 4.362799263351749, + "grad_norm": 0.3099561035633087, + "learning_rate": 6.261658286931779e-05, + "loss": 1.7361, + "step": 14214 + }, + { + "epoch": 4.3631062001227745, + "grad_norm": 0.23693235218524933, + "learning_rate": 6.26117730904213e-05, + "loss": 1.8117, + "step": 14215 + }, + { + "epoch": 4.3634131368938, + "grad_norm": 0.4164150655269623, + "learning_rate": 6.260696318688786e-05, + "loss": 1.7908, + "step": 14216 + }, + { + "epoch": 4.363720073664825, + "grad_norm": 
0.39376336336135864, + "learning_rate": 6.260215315876506e-05, + "loss": 1.7832, + "step": 14217 + }, + { + "epoch": 4.3640270104358505, + "grad_norm": 0.24071799218654633, + "learning_rate": 6.259734300610037e-05, + "loss": 1.7569, + "step": 14218 + }, + { + "epoch": 4.364333947206875, + "grad_norm": 0.4305122494697571, + "learning_rate": 6.259253272894136e-05, + "loss": 1.7974, + "step": 14219 + }, + { + "epoch": 4.3646408839779, + "grad_norm": 0.3023197054862976, + "learning_rate": 6.258772232733556e-05, + "loss": 1.7589, + "step": 14220 + }, + { + "epoch": 4.364947820748926, + "grad_norm": 0.23253366351127625, + "learning_rate": 6.258291180133052e-05, + "loss": 1.7138, + "step": 14221 + }, + { + "epoch": 4.365254757519951, + "grad_norm": 0.41141277551651, + "learning_rate": 6.257810115097376e-05, + "loss": 1.7608, + "step": 14222 + }, + { + "epoch": 4.365561694290976, + "grad_norm": 0.3308235704898834, + "learning_rate": 6.257329037631284e-05, + "loss": 1.8006, + "step": 14223 + }, + { + "epoch": 4.365868631062002, + "grad_norm": 0.2635105848312378, + "learning_rate": 6.256847947739528e-05, + "loss": 1.7275, + "step": 14224 + }, + { + "epoch": 4.366175567833026, + "grad_norm": 0.45886602997779846, + "learning_rate": 6.256366845426864e-05, + "loss": 1.7701, + "step": 14225 + }, + { + "epoch": 4.366482504604051, + "grad_norm": 0.48503565788269043, + "learning_rate": 6.255885730698049e-05, + "loss": 1.7409, + "step": 14226 + }, + { + "epoch": 4.366789441375077, + "grad_norm": 0.26727184653282166, + "learning_rate": 6.255404603557833e-05, + "loss": 1.7288, + "step": 14227 + }, + { + "epoch": 4.367096378146102, + "grad_norm": 0.3343912363052368, + "learning_rate": 6.254923464010974e-05, + "loss": 1.764, + "step": 14228 + }, + { + "epoch": 4.367403314917127, + "grad_norm": 0.40050622820854187, + "learning_rate": 6.254442312062224e-05, + "loss": 1.7653, + "step": 14229 + }, + { + "epoch": 4.367710251688152, + "grad_norm": 0.23941144347190857, + "learning_rate": 
6.253961147716341e-05, + "loss": 1.6886, + "step": 14230 + }, + { + "epoch": 4.368017188459177, + "grad_norm": 0.25737255811691284, + "learning_rate": 6.253479970978079e-05, + "loss": 1.8047, + "step": 14231 + }, + { + "epoch": 4.3683241252302025, + "grad_norm": 0.28780993819236755, + "learning_rate": 6.252998781852192e-05, + "loss": 1.7453, + "step": 14232 + }, + { + "epoch": 4.368631062001228, + "grad_norm": 0.2362327128648758, + "learning_rate": 6.252517580343438e-05, + "loss": 1.7963, + "step": 14233 + }, + { + "epoch": 4.368937998772253, + "grad_norm": 0.263013631105423, + "learning_rate": 6.252036366456571e-05, + "loss": 1.7837, + "step": 14234 + }, + { + "epoch": 4.3692449355432785, + "grad_norm": 0.27674412727355957, + "learning_rate": 6.251555140196347e-05, + "loss": 1.767, + "step": 14235 + }, + { + "epoch": 4.369551872314303, + "grad_norm": 0.2360621690750122, + "learning_rate": 6.251073901567522e-05, + "loss": 1.7806, + "step": 14236 + }, + { + "epoch": 4.369858809085328, + "grad_norm": 0.2568018138408661, + "learning_rate": 6.25059265057485e-05, + "loss": 1.7672, + "step": 14237 + }, + { + "epoch": 4.370165745856354, + "grad_norm": 0.2512381374835968, + "learning_rate": 6.25011138722309e-05, + "loss": 1.7506, + "step": 14238 + }, + { + "epoch": 4.370472682627379, + "grad_norm": 0.21587291359901428, + "learning_rate": 6.249630111516994e-05, + "loss": 1.7336, + "step": 14239 + }, + { + "epoch": 4.370779619398404, + "grad_norm": 0.21791933476924896, + "learning_rate": 6.249148823461323e-05, + "loss": 1.7588, + "step": 14240 + }, + { + "epoch": 4.371086556169429, + "grad_norm": 0.23061512410640717, + "learning_rate": 6.248667523060831e-05, + "loss": 1.742, + "step": 14241 + }, + { + "epoch": 4.371393492940454, + "grad_norm": 0.2007007598876953, + "learning_rate": 6.248186210320274e-05, + "loss": 1.7227, + "step": 14242 + }, + { + "epoch": 4.371700429711479, + "grad_norm": 0.2564350366592407, + "learning_rate": 6.247704885244411e-05, + "loss": 1.7529, + 
"step": 14243 + }, + { + "epoch": 4.372007366482505, + "grad_norm": 0.21880537271499634, + "learning_rate": 6.247223547837995e-05, + "loss": 1.7828, + "step": 14244 + }, + { + "epoch": 4.37231430325353, + "grad_norm": 0.26154282689094543, + "learning_rate": 6.246742198105785e-05, + "loss": 1.7895, + "step": 14245 + }, + { + "epoch": 4.3726212400245545, + "grad_norm": 0.2652645707130432, + "learning_rate": 6.24626083605254e-05, + "loss": 1.8038, + "step": 14246 + }, + { + "epoch": 4.37292817679558, + "grad_norm": 0.21463751792907715, + "learning_rate": 6.245779461683013e-05, + "loss": 1.7139, + "step": 14247 + }, + { + "epoch": 4.373235113566605, + "grad_norm": 0.21285851299762726, + "learning_rate": 6.245298075001961e-05, + "loss": 1.7686, + "step": 14248 + }, + { + "epoch": 4.3735420503376305, + "grad_norm": 0.258602499961853, + "learning_rate": 6.244816676014149e-05, + "loss": 1.8518, + "step": 14249 + }, + { + "epoch": 4.373848987108656, + "grad_norm": 0.25747501850128174, + "learning_rate": 6.244335264724323e-05, + "loss": 1.8019, + "step": 14250 + }, + { + "epoch": 4.37415592387968, + "grad_norm": 0.24678784608840942, + "learning_rate": 6.243853841137251e-05, + "loss": 1.7846, + "step": 14251 + }, + { + "epoch": 4.374462860650706, + "grad_norm": 0.31382107734680176, + "learning_rate": 6.243372405257685e-05, + "loss": 1.8389, + "step": 14252 + }, + { + "epoch": 4.374769797421731, + "grad_norm": 0.30522868037223816, + "learning_rate": 6.242890957090383e-05, + "loss": 1.8057, + "step": 14253 + }, + { + "epoch": 4.375076734192756, + "grad_norm": 0.2449347972869873, + "learning_rate": 6.242409496640106e-05, + "loss": 1.7144, + "step": 14254 + }, + { + "epoch": 4.375383670963782, + "grad_norm": 0.3193594217300415, + "learning_rate": 6.241928023911609e-05, + "loss": 1.7404, + "step": 14255 + }, + { + "epoch": 4.375690607734807, + "grad_norm": 0.23948179185390472, + "learning_rate": 6.241446538909651e-05, + "loss": 1.7338, + "step": 14256 + }, + { + "epoch": 
4.3759975445058314, + "grad_norm": 0.35325706005096436, + "learning_rate": 6.240965041638991e-05, + "loss": 1.7673, + "step": 14257 + }, + { + "epoch": 4.376304481276857, + "grad_norm": 0.38753262162208557, + "learning_rate": 6.240483532104387e-05, + "loss": 1.769, + "step": 14258 + }, + { + "epoch": 4.376611418047882, + "grad_norm": 0.2749052941799164, + "learning_rate": 6.2400020103106e-05, + "loss": 1.8086, + "step": 14259 + }, + { + "epoch": 4.3769183548189075, + "grad_norm": 0.2553126811981201, + "learning_rate": 6.239520476262384e-05, + "loss": 1.7733, + "step": 14260 + }, + { + "epoch": 4.377225291589933, + "grad_norm": 0.2854517698287964, + "learning_rate": 6.2390389299645e-05, + "loss": 1.7926, + "step": 14261 + }, + { + "epoch": 4.377532228360957, + "grad_norm": 0.24617259204387665, + "learning_rate": 6.238557371421708e-05, + "loss": 1.7297, + "step": 14262 + }, + { + "epoch": 4.377839165131983, + "grad_norm": 0.2555331289768219, + "learning_rate": 6.238075800638765e-05, + "loss": 1.7566, + "step": 14263 + }, + { + "epoch": 4.378146101903008, + "grad_norm": 0.31666773557662964, + "learning_rate": 6.237594217620432e-05, + "loss": 1.8003, + "step": 14264 + }, + { + "epoch": 4.378453038674033, + "grad_norm": 0.24166476726531982, + "learning_rate": 6.237112622371468e-05, + "loss": 1.7425, + "step": 14265 + }, + { + "epoch": 4.378759975445059, + "grad_norm": 0.21237102150917053, + "learning_rate": 6.236631014896633e-05, + "loss": 1.73, + "step": 14266 + }, + { + "epoch": 4.379066912216084, + "grad_norm": 0.2739151120185852, + "learning_rate": 6.236149395200683e-05, + "loss": 1.7113, + "step": 14267 + }, + { + "epoch": 4.379373848987108, + "grad_norm": 0.23700746893882751, + "learning_rate": 6.23566776328838e-05, + "loss": 1.7256, + "step": 14268 + }, + { + "epoch": 4.379680785758134, + "grad_norm": 0.22366748750209808, + "learning_rate": 6.235186119164485e-05, + "loss": 1.7981, + "step": 14269 + }, + { + "epoch": 4.379987722529159, + "grad_norm": 
0.28440114855766296, + "learning_rate": 6.234704462833758e-05, + "loss": 1.8087, + "step": 14270 + }, + { + "epoch": 4.380294659300184, + "grad_norm": 0.2706616520881653, + "learning_rate": 6.234222794300957e-05, + "loss": 1.7502, + "step": 14271 + }, + { + "epoch": 4.38060159607121, + "grad_norm": 0.21666266024112701, + "learning_rate": 6.233741113570843e-05, + "loss": 1.7639, + "step": 14272 + }, + { + "epoch": 4.380908532842234, + "grad_norm": 0.26790255308151245, + "learning_rate": 6.233259420648175e-05, + "loss": 1.796, + "step": 14273 + }, + { + "epoch": 4.3812154696132595, + "grad_norm": 0.22233673930168152, + "learning_rate": 6.232777715537715e-05, + "loss": 1.7661, + "step": 14274 + }, + { + "epoch": 4.381522406384285, + "grad_norm": 0.3277546763420105, + "learning_rate": 6.232295998244223e-05, + "loss": 1.7932, + "step": 14275 + }, + { + "epoch": 4.38182934315531, + "grad_norm": 0.2907596826553345, + "learning_rate": 6.231814268772463e-05, + "loss": 1.7103, + "step": 14276 + }, + { + "epoch": 4.3821362799263355, + "grad_norm": 0.2318384349346161, + "learning_rate": 6.231332527127188e-05, + "loss": 1.7351, + "step": 14277 + }, + { + "epoch": 4.382443216697361, + "grad_norm": 0.32904061675071716, + "learning_rate": 6.230850773313163e-05, + "loss": 1.7967, + "step": 14278 + }, + { + "epoch": 4.382750153468385, + "grad_norm": 0.2455490082502365, + "learning_rate": 6.230369007335153e-05, + "loss": 1.7474, + "step": 14279 + }, + { + "epoch": 4.383057090239411, + "grad_norm": 0.23648180067539215, + "learning_rate": 6.229887229197913e-05, + "loss": 1.7106, + "step": 14280 + }, + { + "epoch": 4.383364027010436, + "grad_norm": 0.29552599787712097, + "learning_rate": 6.229405438906207e-05, + "loss": 1.7765, + "step": 14281 + }, + { + "epoch": 4.383670963781461, + "grad_norm": 0.2094641923904419, + "learning_rate": 6.228923636464796e-05, + "loss": 1.7105, + "step": 14282 + }, + { + "epoch": 4.383977900552487, + "grad_norm": 0.24632154405117035, + "learning_rate": 
6.228441821878441e-05, + "loss": 1.7913, + "step": 14283 + }, + { + "epoch": 4.384284837323511, + "grad_norm": 0.28114691376686096, + "learning_rate": 6.227959995151904e-05, + "loss": 1.7456, + "step": 14284 + }, + { + "epoch": 4.384591774094536, + "grad_norm": 0.24226875603199005, + "learning_rate": 6.227478156289946e-05, + "loss": 1.797, + "step": 14285 + }, + { + "epoch": 4.384898710865562, + "grad_norm": 0.2526854872703552, + "learning_rate": 6.22699630529733e-05, + "loss": 1.7155, + "step": 14286 + }, + { + "epoch": 4.385205647636587, + "grad_norm": 0.312916100025177, + "learning_rate": 6.226514442178818e-05, + "loss": 1.7808, + "step": 14287 + }, + { + "epoch": 4.385512584407612, + "grad_norm": 0.23087100684642792, + "learning_rate": 6.22603256693917e-05, + "loss": 1.7543, + "step": 14288 + }, + { + "epoch": 4.385819521178637, + "grad_norm": 0.3042476177215576, + "learning_rate": 6.22555067958315e-05, + "loss": 1.747, + "step": 14289 + }, + { + "epoch": 4.386126457949662, + "grad_norm": 0.2604007422924042, + "learning_rate": 6.225068780115522e-05, + "loss": 1.7262, + "step": 14290 + }, + { + "epoch": 4.3864333947206875, + "grad_norm": 0.2200118750333786, + "learning_rate": 6.224586868541044e-05, + "loss": 1.75, + "step": 14291 + }, + { + "epoch": 4.386740331491713, + "grad_norm": 0.3452017307281494, + "learning_rate": 6.224104944864481e-05, + "loss": 1.7598, + "step": 14292 + }, + { + "epoch": 4.387047268262738, + "grad_norm": 0.3169453740119934, + "learning_rate": 6.223623009090597e-05, + "loss": 1.7939, + "step": 14293 + }, + { + "epoch": 4.387354205033763, + "grad_norm": 0.23640502989292145, + "learning_rate": 6.223141061224151e-05, + "loss": 1.8005, + "step": 14294 + }, + { + "epoch": 4.387661141804788, + "grad_norm": 0.26212456822395325, + "learning_rate": 6.22265910126991e-05, + "loss": 1.7951, + "step": 14295 + }, + { + "epoch": 4.387968078575813, + "grad_norm": 0.2687644362449646, + "learning_rate": 6.222177129232634e-05, + "loss": 1.7674, + "step": 
14296 + }, + { + "epoch": 4.388275015346839, + "grad_norm": 0.2553202211856842, + "learning_rate": 6.221695145117086e-05, + "loss": 1.8142, + "step": 14297 + }, + { + "epoch": 4.388581952117864, + "grad_norm": 0.3317619264125824, + "learning_rate": 6.221213148928034e-05, + "loss": 1.7884, + "step": 14298 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.3059331476688385, + "learning_rate": 6.220731140670235e-05, + "loss": 1.7377, + "step": 14299 + }, + { + "epoch": 4.389195825659914, + "grad_norm": 0.21544015407562256, + "learning_rate": 6.220249120348457e-05, + "loss": 1.6818, + "step": 14300 + }, + { + "epoch": 4.389502762430939, + "grad_norm": 0.3112640380859375, + "learning_rate": 6.219767087967461e-05, + "loss": 1.72, + "step": 14301 + }, + { + "epoch": 4.389809699201964, + "grad_norm": 0.2572654187679291, + "learning_rate": 6.219285043532011e-05, + "loss": 1.793, + "step": 14302 + }, + { + "epoch": 4.39011663597299, + "grad_norm": 0.2621476948261261, + "learning_rate": 6.218802987046874e-05, + "loss": 1.8301, + "step": 14303 + }, + { + "epoch": 4.390423572744015, + "grad_norm": 0.2592658996582031, + "learning_rate": 6.218320918516809e-05, + "loss": 1.7219, + "step": 14304 + }, + { + "epoch": 4.3907305095150395, + "grad_norm": 0.25503265857696533, + "learning_rate": 6.217838837946584e-05, + "loss": 1.8149, + "step": 14305 + }, + { + "epoch": 4.391037446286065, + "grad_norm": 0.21944166719913483, + "learning_rate": 6.217356745340962e-05, + "loss": 1.7174, + "step": 14306 + }, + { + "epoch": 4.39134438305709, + "grad_norm": 0.2937396466732025, + "learning_rate": 6.216874640704707e-05, + "loss": 1.8562, + "step": 14307 + }, + { + "epoch": 4.3916513198281155, + "grad_norm": 0.22520211338996887, + "learning_rate": 6.216392524042581e-05, + "loss": 1.7701, + "step": 14308 + }, + { + "epoch": 4.391958256599141, + "grad_norm": 0.24397830665111542, + "learning_rate": 6.215910395359355e-05, + "loss": 1.7794, + "step": 14309 + }, + { + "epoch": 4.392265193370166, + 
"grad_norm": 0.2867623567581177, + "learning_rate": 6.215428254659788e-05, + "loss": 1.7275, + "step": 14310 + }, + { + "epoch": 4.392572130141191, + "grad_norm": 0.2632426917552948, + "learning_rate": 6.214946101948648e-05, + "loss": 1.7919, + "step": 14311 + }, + { + "epoch": 4.392879066912216, + "grad_norm": 0.23146092891693115, + "learning_rate": 6.214463937230696e-05, + "loss": 1.744, + "step": 14312 + }, + { + "epoch": 4.393186003683241, + "grad_norm": 0.21877676248550415, + "learning_rate": 6.213981760510701e-05, + "loss": 1.7577, + "step": 14313 + }, + { + "epoch": 4.393492940454267, + "grad_norm": 0.2320399284362793, + "learning_rate": 6.213499571793426e-05, + "loss": 1.7864, + "step": 14314 + }, + { + "epoch": 4.393799877225292, + "grad_norm": 0.2951548993587494, + "learning_rate": 6.213017371083638e-05, + "loss": 1.8257, + "step": 14315 + }, + { + "epoch": 4.394106813996316, + "grad_norm": 0.26062941551208496, + "learning_rate": 6.212535158386102e-05, + "loss": 1.7448, + "step": 14316 + }, + { + "epoch": 4.394413750767342, + "grad_norm": 0.24760986864566803, + "learning_rate": 6.21205293370558e-05, + "loss": 1.7902, + "step": 14317 + }, + { + "epoch": 4.394720687538367, + "grad_norm": 0.2686399221420288, + "learning_rate": 6.211570697046844e-05, + "loss": 1.8209, + "step": 14318 + }, + { + "epoch": 4.395027624309392, + "grad_norm": 0.2599134147167206, + "learning_rate": 6.211088448414653e-05, + "loss": 1.8231, + "step": 14319 + }, + { + "epoch": 4.395334561080418, + "grad_norm": 0.254044771194458, + "learning_rate": 6.210606187813778e-05, + "loss": 1.806, + "step": 14320 + }, + { + "epoch": 4.395641497851442, + "grad_norm": 0.262229323387146, + "learning_rate": 6.210123915248982e-05, + "loss": 1.7857, + "step": 14321 + }, + { + "epoch": 4.3959484346224675, + "grad_norm": 0.2849259078502655, + "learning_rate": 6.209641630725033e-05, + "loss": 1.8005, + "step": 14322 + }, + { + "epoch": 4.396255371393493, + "grad_norm": 0.35480254888534546, + 
"learning_rate": 6.209159334246697e-05, + "loss": 1.8189, + "step": 14323 + }, + { + "epoch": 4.396562308164518, + "grad_norm": 0.2599184215068817, + "learning_rate": 6.20867702581874e-05, + "loss": 1.7384, + "step": 14324 + }, + { + "epoch": 4.3968692449355435, + "grad_norm": 0.23994222283363342, + "learning_rate": 6.208194705445926e-05, + "loss": 1.7566, + "step": 14325 + }, + { + "epoch": 4.397176181706568, + "grad_norm": 0.24361753463745117, + "learning_rate": 6.207712373133024e-05, + "loss": 1.6965, + "step": 14326 + }, + { + "epoch": 4.397483118477593, + "grad_norm": 0.23925161361694336, + "learning_rate": 6.207230028884803e-05, + "loss": 1.7596, + "step": 14327 + }, + { + "epoch": 4.397790055248619, + "grad_norm": 0.24365897476673126, + "learning_rate": 6.206747672706025e-05, + "loss": 1.7951, + "step": 14328 + }, + { + "epoch": 4.398096992019644, + "grad_norm": 0.25245413184165955, + "learning_rate": 6.206265304601461e-05, + "loss": 1.8086, + "step": 14329 + }, + { + "epoch": 4.398403928790669, + "grad_norm": 0.24272513389587402, + "learning_rate": 6.205782924575874e-05, + "loss": 1.8148, + "step": 14330 + }, + { + "epoch": 4.398710865561695, + "grad_norm": 0.21299590170383453, + "learning_rate": 6.205300532634036e-05, + "loss": 1.7666, + "step": 14331 + }, + { + "epoch": 4.399017802332719, + "grad_norm": 0.23543189465999603, + "learning_rate": 6.20481812878071e-05, + "loss": 1.7629, + "step": 14332 + }, + { + "epoch": 4.399324739103744, + "grad_norm": 0.2284495085477829, + "learning_rate": 6.204335713020665e-05, + "loss": 1.768, + "step": 14333 + }, + { + "epoch": 4.39963167587477, + "grad_norm": 0.23158542811870575, + "learning_rate": 6.20385328535867e-05, + "loss": 1.7761, + "step": 14334 + }, + { + "epoch": 4.399938612645795, + "grad_norm": 0.2378150224685669, + "learning_rate": 6.20337084579949e-05, + "loss": 1.8483, + "step": 14335 + }, + { + "epoch": 4.4002455494168204, + "grad_norm": 0.2407436966896057, + "learning_rate": 6.202888394347892e-05, + 
"loss": 1.7364, + "step": 14336 + }, + { + "epoch": 4.400552486187845, + "grad_norm": 0.256259560585022, + "learning_rate": 6.202405931008649e-05, + "loss": 1.7376, + "step": 14337 + }, + { + "epoch": 4.40085942295887, + "grad_norm": 0.29293057322502136, + "learning_rate": 6.201923455786524e-05, + "loss": 1.7493, + "step": 14338 + }, + { + "epoch": 4.401166359729896, + "grad_norm": 0.24025334417819977, + "learning_rate": 6.201440968686288e-05, + "loss": 1.7522, + "step": 14339 + }, + { + "epoch": 4.401473296500921, + "grad_norm": 0.3215656280517578, + "learning_rate": 6.200958469712708e-05, + "loss": 1.7748, + "step": 14340 + }, + { + "epoch": 4.401780233271946, + "grad_norm": 0.43553170561790466, + "learning_rate": 6.200475958870553e-05, + "loss": 1.771, + "step": 14341 + }, + { + "epoch": 4.402087170042972, + "grad_norm": 0.3112131953239441, + "learning_rate": 6.19999343616459e-05, + "loss": 1.7655, + "step": 14342 + }, + { + "epoch": 4.402394106813996, + "grad_norm": 0.25197842717170715, + "learning_rate": 6.199510901599589e-05, + "loss": 1.7214, + "step": 14343 + }, + { + "epoch": 4.402701043585021, + "grad_norm": 0.33227142691612244, + "learning_rate": 6.19902835518032e-05, + "loss": 1.7332, + "step": 14344 + }, + { + "epoch": 4.403007980356047, + "grad_norm": 0.27962982654571533, + "learning_rate": 6.198545796911548e-05, + "loss": 1.6943, + "step": 14345 + }, + { + "epoch": 4.403314917127072, + "grad_norm": 0.24374182522296906, + "learning_rate": 6.198063226798044e-05, + "loss": 1.7222, + "step": 14346 + }, + { + "epoch": 4.403621853898097, + "grad_norm": 0.3101944625377655, + "learning_rate": 6.197580644844576e-05, + "loss": 1.7113, + "step": 14347 + }, + { + "epoch": 4.403928790669122, + "grad_norm": 0.25919321179389954, + "learning_rate": 6.197098051055916e-05, + "loss": 1.71, + "step": 14348 + }, + { + "epoch": 4.404235727440147, + "grad_norm": 0.23140330612659454, + "learning_rate": 6.19661544543683e-05, + "loss": 1.7472, + "step": 14349 + }, + { + 
"epoch": 4.4045426642111725, + "grad_norm": 0.3274286687374115, + "learning_rate": 6.19613282799209e-05, + "loss": 1.7093, + "step": 14350 + }, + { + "epoch": 4.404849600982198, + "grad_norm": 0.3187442123889923, + "learning_rate": 6.195650198726464e-05, + "loss": 1.7488, + "step": 14351 + }, + { + "epoch": 4.405156537753223, + "grad_norm": 0.20547433197498322, + "learning_rate": 6.195167557644722e-05, + "loss": 1.7295, + "step": 14352 + }, + { + "epoch": 4.4054634745242485, + "grad_norm": 0.2623414993286133, + "learning_rate": 6.194684904751633e-05, + "loss": 1.8258, + "step": 14353 + }, + { + "epoch": 4.405770411295273, + "grad_norm": 0.2468457818031311, + "learning_rate": 6.194202240051967e-05, + "loss": 1.6957, + "step": 14354 + }, + { + "epoch": 4.406077348066298, + "grad_norm": 0.2082364559173584, + "learning_rate": 6.193719563550496e-05, + "loss": 1.7596, + "step": 14355 + }, + { + "epoch": 4.406384284837324, + "grad_norm": 0.27072983980178833, + "learning_rate": 6.193236875251988e-05, + "loss": 1.7341, + "step": 14356 + }, + { + "epoch": 4.406691221608349, + "grad_norm": 0.2630362808704376, + "learning_rate": 6.192754175161215e-05, + "loss": 1.7664, + "step": 14357 + }, + { + "epoch": 4.406998158379374, + "grad_norm": 0.25400006771087646, + "learning_rate": 6.192271463282944e-05, + "loss": 1.7582, + "step": 14358 + }, + { + "epoch": 4.407305095150399, + "grad_norm": 0.22256311774253845, + "learning_rate": 6.191788739621949e-05, + "loss": 1.7389, + "step": 14359 + }, + { + "epoch": 4.407612031921424, + "grad_norm": 0.2160387486219406, + "learning_rate": 6.191306004182999e-05, + "loss": 1.7051, + "step": 14360 + }, + { + "epoch": 4.407918968692449, + "grad_norm": 0.20665684342384338, + "learning_rate": 6.190823256970865e-05, + "loss": 1.7606, + "step": 14361 + }, + { + "epoch": 4.408225905463475, + "grad_norm": 0.2173188328742981, + "learning_rate": 6.190340497990318e-05, + "loss": 1.7944, + "step": 14362 + }, + { + "epoch": 4.4085328422345, + "grad_norm": 
0.189287930727005, + "learning_rate": 6.189857727246127e-05, + "loss": 1.7283, + "step": 14363 + }, + { + "epoch": 4.4088397790055245, + "grad_norm": 0.2531645596027374, + "learning_rate": 6.189374944743065e-05, + "loss": 1.7554, + "step": 14364 + }, + { + "epoch": 4.40914671577655, + "grad_norm": 0.25439125299453735, + "learning_rate": 6.188892150485903e-05, + "loss": 1.8032, + "step": 14365 + }, + { + "epoch": 4.409453652547575, + "grad_norm": 0.20938685536384583, + "learning_rate": 6.188409344479412e-05, + "loss": 1.7385, + "step": 14366 + }, + { + "epoch": 4.4097605893186005, + "grad_norm": 0.20471477508544922, + "learning_rate": 6.187926526728364e-05, + "loss": 1.7487, + "step": 14367 + }, + { + "epoch": 4.410067526089626, + "grad_norm": 0.2381851226091385, + "learning_rate": 6.187443697237529e-05, + "loss": 1.7443, + "step": 14368 + }, + { + "epoch": 4.41037446286065, + "grad_norm": 0.21584098041057587, + "learning_rate": 6.18696085601168e-05, + "loss": 1.7818, + "step": 14369 + }, + { + "epoch": 4.410681399631676, + "grad_norm": 0.2575368583202362, + "learning_rate": 6.186478003055587e-05, + "loss": 1.8204, + "step": 14370 + }, + { + "epoch": 4.410988336402701, + "grad_norm": 0.21133238077163696, + "learning_rate": 6.185995138374024e-05, + "loss": 1.7274, + "step": 14371 + }, + { + "epoch": 4.411295273173726, + "grad_norm": 0.24918322265148163, + "learning_rate": 6.18551226197176e-05, + "loss": 1.8021, + "step": 14372 + }, + { + "epoch": 4.411602209944752, + "grad_norm": 0.2253655642271042, + "learning_rate": 6.185029373853572e-05, + "loss": 1.7308, + "step": 14373 + }, + { + "epoch": 4.411909146715777, + "grad_norm": 0.20098713040351868, + "learning_rate": 6.184546474024226e-05, + "loss": 1.7549, + "step": 14374 + }, + { + "epoch": 4.412216083486801, + "grad_norm": 0.25612789392471313, + "learning_rate": 6.1840635624885e-05, + "loss": 1.8305, + "step": 14375 + }, + { + "epoch": 4.412523020257827, + "grad_norm": 0.24287539720535278, + "learning_rate": 
6.183580639251164e-05, + "loss": 1.7339, + "step": 14376 + }, + { + "epoch": 4.412829957028852, + "grad_norm": 0.2304944545030594, + "learning_rate": 6.183097704316988e-05, + "loss": 1.7023, + "step": 14377 + }, + { + "epoch": 4.413136893799877, + "grad_norm": 0.21911773085594177, + "learning_rate": 6.18261475769075e-05, + "loss": 1.7305, + "step": 14378 + }, + { + "epoch": 4.413443830570903, + "grad_norm": 0.24207864701747894, + "learning_rate": 6.182131799377217e-05, + "loss": 1.7318, + "step": 14379 + }, + { + "epoch": 4.413750767341927, + "grad_norm": 0.2551634609699249, + "learning_rate": 6.181648829381165e-05, + "loss": 1.8101, + "step": 14380 + }, + { + "epoch": 4.4140577041129525, + "grad_norm": 0.4114011526107788, + "learning_rate": 6.181165847707368e-05, + "loss": 1.772, + "step": 14381 + }, + { + "epoch": 4.414364640883978, + "grad_norm": 0.4592796862125397, + "learning_rate": 6.180682854360598e-05, + "loss": 1.7359, + "step": 14382 + }, + { + "epoch": 4.414671577655003, + "grad_norm": 0.2599259614944458, + "learning_rate": 6.180199849345627e-05, + "loss": 1.7028, + "step": 14383 + }, + { + "epoch": 4.4149785144260285, + "grad_norm": 0.3489506244659424, + "learning_rate": 6.17971683266723e-05, + "loss": 1.8252, + "step": 14384 + }, + { + "epoch": 4.415285451197054, + "grad_norm": 0.44563809037208557, + "learning_rate": 6.179233804330179e-05, + "loss": 1.6894, + "step": 14385 + }, + { + "epoch": 4.415592387968078, + "grad_norm": 0.2596888542175293, + "learning_rate": 6.17875076433925e-05, + "loss": 1.8141, + "step": 14386 + }, + { + "epoch": 4.415899324739104, + "grad_norm": 0.3560626804828644, + "learning_rate": 6.178267712699213e-05, + "loss": 1.7764, + "step": 14387 + }, + { + "epoch": 4.416206261510129, + "grad_norm": 0.3746717572212219, + "learning_rate": 6.177784649414843e-05, + "loss": 1.7528, + "step": 14388 + }, + { + "epoch": 4.416513198281154, + "grad_norm": 0.23248885571956635, + "learning_rate": 6.177301574490918e-05, + "loss": 1.7148, + 
"step": 14389 + }, + { + "epoch": 4.41682013505218, + "grad_norm": 0.26936978101730347, + "learning_rate": 6.176818487932208e-05, + "loss": 1.7199, + "step": 14390 + }, + { + "epoch": 4.417127071823204, + "grad_norm": 0.3102504014968872, + "learning_rate": 6.176335389743486e-05, + "loss": 1.6886, + "step": 14391 + }, + { + "epoch": 4.417434008594229, + "grad_norm": 0.24406832456588745, + "learning_rate": 6.175852279929531e-05, + "loss": 1.7766, + "step": 14392 + }, + { + "epoch": 4.417740945365255, + "grad_norm": 0.271158903837204, + "learning_rate": 6.175369158495112e-05, + "loss": 1.8099, + "step": 14393 + }, + { + "epoch": 4.41804788213628, + "grad_norm": 0.343667209148407, + "learning_rate": 6.174886025445008e-05, + "loss": 1.779, + "step": 14394 + }, + { + "epoch": 4.418354818907305, + "grad_norm": 0.37423139810562134, + "learning_rate": 6.17440288078399e-05, + "loss": 1.7796, + "step": 14395 + }, + { + "epoch": 4.41866175567833, + "grad_norm": 0.3152335286140442, + "learning_rate": 6.173919724516836e-05, + "loss": 1.7388, + "step": 14396 + }, + { + "epoch": 4.418968692449355, + "grad_norm": 0.21467824280261993, + "learning_rate": 6.173436556648319e-05, + "loss": 1.7689, + "step": 14397 + }, + { + "epoch": 4.4192756292203805, + "grad_norm": 0.2861369848251343, + "learning_rate": 6.172953377183213e-05, + "loss": 1.819, + "step": 14398 + }, + { + "epoch": 4.419582565991406, + "grad_norm": 0.34777504205703735, + "learning_rate": 6.172470186126295e-05, + "loss": 1.7444, + "step": 14399 + }, + { + "epoch": 4.419889502762431, + "grad_norm": 0.2728833854198456, + "learning_rate": 6.171986983482339e-05, + "loss": 1.7637, + "step": 14400 + }, + { + "epoch": 4.420196439533456, + "grad_norm": 0.2593914270401001, + "learning_rate": 6.17150376925612e-05, + "loss": 1.8196, + "step": 14401 + }, + { + "epoch": 4.420503376304481, + "grad_norm": 0.29425305128097534, + "learning_rate": 6.171020543452416e-05, + "loss": 1.7511, + "step": 14402 + }, + { + "epoch": 
4.420810313075506, + "grad_norm": 0.2587110102176666, + "learning_rate": 6.170537306076e-05, + "loss": 1.8085, + "step": 14403 + }, + { + "epoch": 4.421117249846532, + "grad_norm": 0.22442933917045593, + "learning_rate": 6.170054057131648e-05, + "loss": 1.8023, + "step": 14404 + }, + { + "epoch": 4.421424186617557, + "grad_norm": 0.23302629590034485, + "learning_rate": 6.169570796624136e-05, + "loss": 1.7995, + "step": 14405 + }, + { + "epoch": 4.421731123388582, + "grad_norm": 0.2295885682106018, + "learning_rate": 6.169087524558239e-05, + "loss": 1.7948, + "step": 14406 + }, + { + "epoch": 4.422038060159607, + "grad_norm": 0.2161262482404709, + "learning_rate": 6.168604240938735e-05, + "loss": 1.7159, + "step": 14407 + }, + { + "epoch": 4.422344996930632, + "grad_norm": 0.20746205747127533, + "learning_rate": 6.1681209457704e-05, + "loss": 1.7703, + "step": 14408 + }, + { + "epoch": 4.422651933701657, + "grad_norm": 0.25677376985549927, + "learning_rate": 6.167637639058006e-05, + "loss": 1.7819, + "step": 14409 + }, + { + "epoch": 4.422958870472683, + "grad_norm": 0.226568341255188, + "learning_rate": 6.167154320806336e-05, + "loss": 1.7661, + "step": 14410 + }, + { + "epoch": 4.423265807243708, + "grad_norm": 0.22997824847698212, + "learning_rate": 6.166670991020162e-05, + "loss": 1.7364, + "step": 14411 + }, + { + "epoch": 4.4235727440147325, + "grad_norm": 0.2528770864009857, + "learning_rate": 6.166187649704261e-05, + "loss": 1.8505, + "step": 14412 + }, + { + "epoch": 4.423879680785758, + "grad_norm": 0.27278614044189453, + "learning_rate": 6.165704296863409e-05, + "loss": 1.7855, + "step": 14413 + }, + { + "epoch": 4.424186617556783, + "grad_norm": 0.23086364567279816, + "learning_rate": 6.165220932502385e-05, + "loss": 1.7489, + "step": 14414 + }, + { + "epoch": 4.4244935543278086, + "grad_norm": 0.2570587396621704, + "learning_rate": 6.164737556625965e-05, + "loss": 1.8008, + "step": 14415 + }, + { + "epoch": 4.424800491098834, + "grad_norm": 
0.2637264132499695, + "learning_rate": 6.164254169238923e-05, + "loss": 1.7563, + "step": 14416 + }, + { + "epoch": 4.425107427869859, + "grad_norm": 0.23046623170375824, + "learning_rate": 6.163770770346043e-05, + "loss": 1.7433, + "step": 14417 + }, + { + "epoch": 4.425414364640884, + "grad_norm": 0.2531467080116272, + "learning_rate": 6.163287359952095e-05, + "loss": 1.8122, + "step": 14418 + }, + { + "epoch": 4.425721301411909, + "grad_norm": 0.26507216691970825, + "learning_rate": 6.162803938061861e-05, + "loss": 1.7019, + "step": 14419 + }, + { + "epoch": 4.426028238182934, + "grad_norm": 0.229641854763031, + "learning_rate": 6.162320504680117e-05, + "loss": 1.7518, + "step": 14420 + }, + { + "epoch": 4.42633517495396, + "grad_norm": 0.22777152061462402, + "learning_rate": 6.161837059811641e-05, + "loss": 1.8094, + "step": 14421 + }, + { + "epoch": 4.426642111724985, + "grad_norm": 0.22121338546276093, + "learning_rate": 6.161353603461209e-05, + "loss": 1.7204, + "step": 14422 + }, + { + "epoch": 4.4269490484960095, + "grad_norm": 0.21914128959178925, + "learning_rate": 6.1608701356336e-05, + "loss": 1.7554, + "step": 14423 + }, + { + "epoch": 4.427255985267035, + "grad_norm": 0.22649390995502472, + "learning_rate": 6.160386656333593e-05, + "loss": 1.8058, + "step": 14424 + }, + { + "epoch": 4.42756292203806, + "grad_norm": 0.24529023468494415, + "learning_rate": 6.159903165565964e-05, + "loss": 1.7302, + "step": 14425 + }, + { + "epoch": 4.4278698588090855, + "grad_norm": 0.2726481854915619, + "learning_rate": 6.159419663335492e-05, + "loss": 1.825, + "step": 14426 + }, + { + "epoch": 4.428176795580111, + "grad_norm": 0.2772440016269684, + "learning_rate": 6.158936149646957e-05, + "loss": 1.7322, + "step": 14427 + }, + { + "epoch": 4.428483732351136, + "grad_norm": 0.29778853058815, + "learning_rate": 6.158452624505135e-05, + "loss": 1.7421, + "step": 14428 + }, + { + "epoch": 4.428790669122161, + "grad_norm": 0.21327480673789978, + "learning_rate": 
6.157969087914804e-05, + "loss": 1.7269, + "step": 14429 + }, + { + "epoch": 4.429097605893186, + "grad_norm": 0.2718868851661682, + "learning_rate": 6.157485539880744e-05, + "loss": 1.7817, + "step": 14430 + }, + { + "epoch": 4.429404542664211, + "grad_norm": 0.32242509722709656, + "learning_rate": 6.157001980407735e-05, + "loss": 1.7115, + "step": 14431 + }, + { + "epoch": 4.429711479435237, + "grad_norm": 0.2931978106498718, + "learning_rate": 6.156518409500553e-05, + "loss": 1.7822, + "step": 14432 + }, + { + "epoch": 4.430018416206262, + "grad_norm": 0.229528546333313, + "learning_rate": 6.156034827163977e-05, + "loss": 1.7623, + "step": 14433 + }, + { + "epoch": 4.430325352977286, + "grad_norm": 0.28702354431152344, + "learning_rate": 6.15555123340279e-05, + "loss": 1.8101, + "step": 14434 + }, + { + "epoch": 4.430632289748312, + "grad_norm": 0.27162131667137146, + "learning_rate": 6.155067628221766e-05, + "loss": 1.7525, + "step": 14435 + }, + { + "epoch": 4.430939226519337, + "grad_norm": 0.24290388822555542, + "learning_rate": 6.154584011625688e-05, + "loss": 1.8701, + "step": 14436 + }, + { + "epoch": 4.431246163290362, + "grad_norm": 0.3055405020713806, + "learning_rate": 6.154100383619334e-05, + "loss": 1.8659, + "step": 14437 + }, + { + "epoch": 4.431553100061388, + "grad_norm": 0.24528950452804565, + "learning_rate": 6.153616744207483e-05, + "loss": 1.8493, + "step": 14438 + }, + { + "epoch": 4.431860036832412, + "grad_norm": 0.2611897587776184, + "learning_rate": 6.153133093394917e-05, + "loss": 1.7905, + "step": 14439 + }, + { + "epoch": 4.4321669736034375, + "grad_norm": 0.2172730267047882, + "learning_rate": 6.15264943118641e-05, + "loss": 1.7087, + "step": 14440 + }, + { + "epoch": 4.432473910374463, + "grad_norm": 0.2320949286222458, + "learning_rate": 6.152165757586749e-05, + "loss": 1.7473, + "step": 14441 + }, + { + "epoch": 4.432780847145488, + "grad_norm": 0.2602086365222931, + "learning_rate": 6.15168207260071e-05, + "loss": 1.7365, + 
"step": 14442 + }, + { + "epoch": 4.4330877839165135, + "grad_norm": 0.25193190574645996, + "learning_rate": 6.151198376233074e-05, + "loss": 1.8205, + "step": 14443 + }, + { + "epoch": 4.433394720687538, + "grad_norm": 0.2894204556941986, + "learning_rate": 6.150714668488621e-05, + "loss": 1.7759, + "step": 14444 + }, + { + "epoch": 4.433701657458563, + "grad_norm": 0.24150310456752777, + "learning_rate": 6.150230949372131e-05, + "loss": 1.8415, + "step": 14445 + }, + { + "epoch": 4.434008594229589, + "grad_norm": 0.23475918173789978, + "learning_rate": 6.149747218888384e-05, + "loss": 1.7487, + "step": 14446 + }, + { + "epoch": 4.434315531000614, + "grad_norm": 0.29425546526908875, + "learning_rate": 6.149263477042162e-05, + "loss": 1.7538, + "step": 14447 + }, + { + "epoch": 4.434622467771639, + "grad_norm": 0.26241615414619446, + "learning_rate": 6.148779723838244e-05, + "loss": 1.7564, + "step": 14448 + }, + { + "epoch": 4.434929404542665, + "grad_norm": 0.23195287585258484, + "learning_rate": 6.148295959281411e-05, + "loss": 1.837, + "step": 14449 + }, + { + "epoch": 4.435236341313689, + "grad_norm": 0.34972792863845825, + "learning_rate": 6.147812183376445e-05, + "loss": 1.7632, + "step": 14450 + }, + { + "epoch": 4.435543278084714, + "grad_norm": 0.3536125719547272, + "learning_rate": 6.147328396128126e-05, + "loss": 1.8372, + "step": 14451 + }, + { + "epoch": 4.43585021485574, + "grad_norm": 0.2086079865694046, + "learning_rate": 6.146844597541235e-05, + "loss": 1.7014, + "step": 14452 + }, + { + "epoch": 4.436157151626765, + "grad_norm": 0.25547802448272705, + "learning_rate": 6.146360787620554e-05, + "loss": 1.7544, + "step": 14453 + }, + { + "epoch": 4.43646408839779, + "grad_norm": 0.26176998019218445, + "learning_rate": 6.145876966370864e-05, + "loss": 1.7617, + "step": 14454 + }, + { + "epoch": 4.436771025168815, + "grad_norm": 0.2672959566116333, + "learning_rate": 6.145393133796946e-05, + "loss": 1.8178, + "step": 14455 + }, + { + "epoch": 
4.43707796193984, + "grad_norm": 0.23373909294605255, + "learning_rate": 6.144909289903582e-05, + "loss": 1.7295, + "step": 14456 + }, + { + "epoch": 4.4373848987108655, + "grad_norm": 0.2369835078716278, + "learning_rate": 6.144425434695551e-05, + "loss": 1.8097, + "step": 14457 + }, + { + "epoch": 4.437691835481891, + "grad_norm": 0.25528979301452637, + "learning_rate": 6.14394156817764e-05, + "loss": 1.7523, + "step": 14458 + }, + { + "epoch": 4.437998772252916, + "grad_norm": 0.2541787624359131, + "learning_rate": 6.143457690354626e-05, + "loss": 1.7606, + "step": 14459 + }, + { + "epoch": 4.4383057090239415, + "grad_norm": 0.2032637745141983, + "learning_rate": 6.142973801231295e-05, + "loss": 1.7967, + "step": 14460 + }, + { + "epoch": 4.438612645794966, + "grad_norm": 0.2413996160030365, + "learning_rate": 6.142489900812426e-05, + "loss": 1.7688, + "step": 14461 + }, + { + "epoch": 4.438919582565991, + "grad_norm": 0.43451038002967834, + "learning_rate": 6.142005989102803e-05, + "loss": 1.8269, + "step": 14462 + }, + { + "epoch": 4.439226519337017, + "grad_norm": 0.23981481790542603, + "learning_rate": 6.141522066107206e-05, + "loss": 1.7628, + "step": 14463 + }, + { + "epoch": 4.439533456108042, + "grad_norm": 0.25396493077278137, + "learning_rate": 6.14103813183042e-05, + "loss": 1.7913, + "step": 14464 + }, + { + "epoch": 4.439840392879067, + "grad_norm": 0.2567536532878876, + "learning_rate": 6.140554186277225e-05, + "loss": 1.7612, + "step": 14465 + }, + { + "epoch": 4.440147329650092, + "grad_norm": 0.2201337069272995, + "learning_rate": 6.140070229452406e-05, + "loss": 1.7541, + "step": 14466 + }, + { + "epoch": 4.440454266421117, + "grad_norm": 0.24202953279018402, + "learning_rate": 6.139586261360746e-05, + "loss": 1.777, + "step": 14467 + }, + { + "epoch": 4.440761203192142, + "grad_norm": 0.23891687393188477, + "learning_rate": 6.139102282007024e-05, + "loss": 1.7509, + "step": 14468 + }, + { + "epoch": 4.441068139963168, + "grad_norm": 
0.21132555603981018, + "learning_rate": 6.138618291396026e-05, + "loss": 1.7362, + "step": 14469 + }, + { + "epoch": 4.441375076734193, + "grad_norm": 0.2731861472129822, + "learning_rate": 6.138134289532536e-05, + "loss": 1.8063, + "step": 14470 + }, + { + "epoch": 4.4416820135052175, + "grad_norm": 0.29503315687179565, + "learning_rate": 6.137650276421336e-05, + "loss": 1.7193, + "step": 14471 + }, + { + "epoch": 4.441988950276243, + "grad_norm": 0.2778526544570923, + "learning_rate": 6.137166252067208e-05, + "loss": 1.7507, + "step": 14472 + }, + { + "epoch": 4.442295887047268, + "grad_norm": 0.2907710075378418, + "learning_rate": 6.136682216474938e-05, + "loss": 1.7939, + "step": 14473 + }, + { + "epoch": 4.4426028238182935, + "grad_norm": 0.4133768379688263, + "learning_rate": 6.136198169649306e-05, + "loss": 1.8012, + "step": 14474 + }, + { + "epoch": 4.442909760589319, + "grad_norm": 0.2505052983760834, + "learning_rate": 6.135714111595099e-05, + "loss": 1.8426, + "step": 14475 + }, + { + "epoch": 4.443216697360343, + "grad_norm": 0.3884379267692566, + "learning_rate": 6.135230042317099e-05, + "loss": 1.7383, + "step": 14476 + }, + { + "epoch": 4.443523634131369, + "grad_norm": 0.42902377247810364, + "learning_rate": 6.134745961820091e-05, + "loss": 1.732, + "step": 14477 + }, + { + "epoch": 4.443830570902394, + "grad_norm": 0.21782708168029785, + "learning_rate": 6.134261870108858e-05, + "loss": 1.7369, + "step": 14478 + }, + { + "epoch": 4.444137507673419, + "grad_norm": 0.4160648286342621, + "learning_rate": 6.133777767188186e-05, + "loss": 1.8083, + "step": 14479 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.5057216882705688, + "learning_rate": 6.133293653062856e-05, + "loss": 1.8971, + "step": 14480 + }, + { + "epoch": 4.44475138121547, + "grad_norm": 0.2189750075340271, + "learning_rate": 6.132809527737654e-05, + "loss": 1.7508, + "step": 14481 + }, + { + "epoch": 4.445058317986494, + "grad_norm": 0.4415782392024994, + "learning_rate": 
6.132325391217364e-05, + "loss": 1.8548, + "step": 14482 + }, + { + "epoch": 4.44536525475752, + "grad_norm": 0.3907296359539032, + "learning_rate": 6.13184124350677e-05, + "loss": 1.7879, + "step": 14483 + }, + { + "epoch": 4.445672191528545, + "grad_norm": 0.24117955565452576, + "learning_rate": 6.131357084610659e-05, + "loss": 1.7227, + "step": 14484 + }, + { + "epoch": 4.44597912829957, + "grad_norm": 0.3083679974079132, + "learning_rate": 6.130872914533815e-05, + "loss": 1.7505, + "step": 14485 + }, + { + "epoch": 4.446286065070596, + "grad_norm": 0.27730658650398254, + "learning_rate": 6.13038873328102e-05, + "loss": 1.7485, + "step": 14486 + }, + { + "epoch": 4.44659300184162, + "grad_norm": 0.28548410534858704, + "learning_rate": 6.12990454085706e-05, + "loss": 1.8145, + "step": 14487 + }, + { + "epoch": 4.4468999386126455, + "grad_norm": 0.24743106961250305, + "learning_rate": 6.129420337266724e-05, + "loss": 1.7131, + "step": 14488 + }, + { + "epoch": 4.447206875383671, + "grad_norm": 0.2899693250656128, + "learning_rate": 6.128936122514794e-05, + "loss": 1.8567, + "step": 14489 + }, + { + "epoch": 4.447513812154696, + "grad_norm": 0.259916752576828, + "learning_rate": 6.128451896606053e-05, + "loss": 1.7563, + "step": 14490 + }, + { + "epoch": 4.4478207489257215, + "grad_norm": 0.21112586557865143, + "learning_rate": 6.12796765954529e-05, + "loss": 1.6975, + "step": 14491 + }, + { + "epoch": 4.448127685696747, + "grad_norm": 0.2890239953994751, + "learning_rate": 6.12748341133729e-05, + "loss": 1.7904, + "step": 14492 + }, + { + "epoch": 4.448434622467771, + "grad_norm": 0.23394012451171875, + "learning_rate": 6.126999151986839e-05, + "loss": 1.7559, + "step": 14493 + }, + { + "epoch": 4.448741559238797, + "grad_norm": 0.3492949903011322, + "learning_rate": 6.12651488149872e-05, + "loss": 1.7734, + "step": 14494 + }, + { + "epoch": 4.449048496009822, + "grad_norm": 0.48309218883514404, + "learning_rate": 6.126030599877723e-05, + "loss": 1.7798, + "step": 
14495 + }, + { + "epoch": 4.449355432780847, + "grad_norm": 0.341146320104599, + "learning_rate": 6.12554630712863e-05, + "loss": 1.7921, + "step": 14496 + }, + { + "epoch": 4.449662369551873, + "grad_norm": 0.223160982131958, + "learning_rate": 6.125062003256229e-05, + "loss": 1.7784, + "step": 14497 + }, + { + "epoch": 4.449969306322897, + "grad_norm": 0.32664811611175537, + "learning_rate": 6.124577688265306e-05, + "loss": 1.7353, + "step": 14498 + }, + { + "epoch": 4.4502762430939224, + "grad_norm": 0.215936541557312, + "learning_rate": 6.124093362160646e-05, + "loss": 1.68, + "step": 14499 + }, + { + "epoch": 4.450583179864948, + "grad_norm": 0.26081225275993347, + "learning_rate": 6.123609024947038e-05, + "loss": 1.7107, + "step": 14500 + }, + { + "epoch": 4.450890116635973, + "grad_norm": 0.3124069571495056, + "learning_rate": 6.123124676629267e-05, + "loss": 1.7338, + "step": 14501 + }, + { + "epoch": 4.4511970534069984, + "grad_norm": 0.23125620186328888, + "learning_rate": 6.122640317212118e-05, + "loss": 1.7842, + "step": 14502 + }, + { + "epoch": 4.451503990178024, + "grad_norm": 0.27065595984458923, + "learning_rate": 6.122155946700381e-05, + "loss": 1.7284, + "step": 14503 + }, + { + "epoch": 4.451810926949048, + "grad_norm": 0.4677436053752899, + "learning_rate": 6.121671565098841e-05, + "loss": 1.8156, + "step": 14504 + }, + { + "epoch": 4.452117863720074, + "grad_norm": 0.36325082182884216, + "learning_rate": 6.121187172412285e-05, + "loss": 1.7875, + "step": 14505 + }, + { + "epoch": 4.452424800491099, + "grad_norm": 0.23409567773342133, + "learning_rate": 6.1207027686455e-05, + "loss": 1.7421, + "step": 14506 + }, + { + "epoch": 4.452731737262124, + "grad_norm": 0.36919257044792175, + "learning_rate": 6.120218353803273e-05, + "loss": 1.7545, + "step": 14507 + }, + { + "epoch": 4.45303867403315, + "grad_norm": 0.318452388048172, + "learning_rate": 6.119733927890393e-05, + "loss": 1.7179, + "step": 14508 + }, + { + "epoch": 4.453345610804174, + 
"grad_norm": 0.21279768645763397, + "learning_rate": 6.119249490911643e-05, + "loss": 1.7534, + "step": 14509 + }, + { + "epoch": 4.453652547575199, + "grad_norm": 0.30565473437309265, + "learning_rate": 6.118765042871816e-05, + "loss": 1.7962, + "step": 14510 + }, + { + "epoch": 4.453959484346225, + "grad_norm": 0.2608480453491211, + "learning_rate": 6.118280583775697e-05, + "loss": 1.7336, + "step": 14511 + }, + { + "epoch": 4.45426642111725, + "grad_norm": 0.22978845238685608, + "learning_rate": 6.117796113628075e-05, + "loss": 1.8244, + "step": 14512 + }, + { + "epoch": 4.454573357888275, + "grad_norm": 0.26357781887054443, + "learning_rate": 6.117311632433735e-05, + "loss": 1.7425, + "step": 14513 + }, + { + "epoch": 4.4548802946593, + "grad_norm": 0.22127102315425873, + "learning_rate": 6.116827140197467e-05, + "loss": 1.7679, + "step": 14514 + }, + { + "epoch": 4.455187231430325, + "grad_norm": 0.2876584231853485, + "learning_rate": 6.116342636924058e-05, + "loss": 1.8104, + "step": 14515 + }, + { + "epoch": 4.4554941682013505, + "grad_norm": 0.28290677070617676, + "learning_rate": 6.115858122618297e-05, + "loss": 1.7485, + "step": 14516 + }, + { + "epoch": 4.455801104972376, + "grad_norm": 0.21914640069007874, + "learning_rate": 6.115373597284974e-05, + "loss": 1.7736, + "step": 14517 + }, + { + "epoch": 4.456108041743401, + "grad_norm": 0.2603909969329834, + "learning_rate": 6.114889060928873e-05, + "loss": 1.7446, + "step": 14518 + }, + { + "epoch": 4.456414978514426, + "grad_norm": 0.2157236635684967, + "learning_rate": 6.114404513554784e-05, + "loss": 1.7594, + "step": 14519 + }, + { + "epoch": 4.456721915285451, + "grad_norm": 0.27622368931770325, + "learning_rate": 6.113919955167499e-05, + "loss": 1.8154, + "step": 14520 + }, + { + "epoch": 4.457028852056476, + "grad_norm": 0.27298516035079956, + "learning_rate": 6.113435385771803e-05, + "loss": 1.7458, + "step": 14521 + }, + { + "epoch": 4.457335788827502, + "grad_norm": 0.22220586240291595, + 
"learning_rate": 6.112950805372485e-05, + "loss": 1.7102, + "step": 14522 + }, + { + "epoch": 4.457642725598527, + "grad_norm": 0.19480876624584198, + "learning_rate": 6.112466213974336e-05, + "loss": 1.7696, + "step": 14523 + }, + { + "epoch": 4.457949662369552, + "grad_norm": 0.24261653423309326, + "learning_rate": 6.111981611582144e-05, + "loss": 1.8193, + "step": 14524 + }, + { + "epoch": 4.458256599140577, + "grad_norm": 0.2502967417240143, + "learning_rate": 6.111496998200697e-05, + "loss": 1.7701, + "step": 14525 + }, + { + "epoch": 4.458563535911602, + "grad_norm": 0.25764599442481995, + "learning_rate": 6.111012373834786e-05, + "loss": 1.8055, + "step": 14526 + }, + { + "epoch": 4.458870472682627, + "grad_norm": 0.24085427820682526, + "learning_rate": 6.110527738489198e-05, + "loss": 1.7592, + "step": 14527 + }, + { + "epoch": 4.459177409453653, + "grad_norm": 0.2469809502363205, + "learning_rate": 6.110043092168727e-05, + "loss": 1.6977, + "step": 14528 + }, + { + "epoch": 4.459484346224678, + "grad_norm": 0.21888838708400726, + "learning_rate": 6.109558434878159e-05, + "loss": 1.777, + "step": 14529 + }, + { + "epoch": 4.4597912829957025, + "grad_norm": 0.2094014585018158, + "learning_rate": 6.109073766622281e-05, + "loss": 1.7041, + "step": 14530 + }, + { + "epoch": 4.460098219766728, + "grad_norm": 0.23801055550575256, + "learning_rate": 6.108589087405888e-05, + "loss": 1.8392, + "step": 14531 + }, + { + "epoch": 4.460405156537753, + "grad_norm": 0.2164965718984604, + "learning_rate": 6.108104397233769e-05, + "loss": 1.7643, + "step": 14532 + }, + { + "epoch": 4.4607120933087785, + "grad_norm": 0.21322336792945862, + "learning_rate": 6.107619696110712e-05, + "loss": 1.7063, + "step": 14533 + }, + { + "epoch": 4.461019030079804, + "grad_norm": 0.29019200801849365, + "learning_rate": 6.107134984041507e-05, + "loss": 1.8254, + "step": 14534 + }, + { + "epoch": 4.461325966850829, + "grad_norm": 0.2765025496482849, + "learning_rate": 6.106650261030947e-05, 
+ "loss": 1.7609, + "step": 14535 + }, + { + "epoch": 4.461632903621854, + "grad_norm": 0.20879749953746796, + "learning_rate": 6.106165527083818e-05, + "loss": 1.7387, + "step": 14536 + }, + { + "epoch": 4.461939840392879, + "grad_norm": 0.22295843064785004, + "learning_rate": 6.105680782204913e-05, + "loss": 1.7691, + "step": 14537 + }, + { + "epoch": 4.462246777163904, + "grad_norm": 0.23502351343631744, + "learning_rate": 6.105196026399025e-05, + "loss": 1.7335, + "step": 14538 + }, + { + "epoch": 4.46255371393493, + "grad_norm": 0.22143007814884186, + "learning_rate": 6.104711259670941e-05, + "loss": 1.7338, + "step": 14539 + }, + { + "epoch": 4.462860650705955, + "grad_norm": 0.22361041605472565, + "learning_rate": 6.104226482025453e-05, + "loss": 1.7033, + "step": 14540 + }, + { + "epoch": 4.463167587476979, + "grad_norm": 0.27104905247688293, + "learning_rate": 6.10374169346735e-05, + "loss": 1.7926, + "step": 14541 + }, + { + "epoch": 4.463474524248005, + "grad_norm": 0.23564264178276062, + "learning_rate": 6.103256894001427e-05, + "loss": 1.7522, + "step": 14542 + }, + { + "epoch": 4.46378146101903, + "grad_norm": 0.2585970163345337, + "learning_rate": 6.102772083632471e-05, + "loss": 1.7755, + "step": 14543 + }, + { + "epoch": 4.464088397790055, + "grad_norm": 0.358634889125824, + "learning_rate": 6.102287262365276e-05, + "loss": 1.8092, + "step": 14544 + }, + { + "epoch": 4.464395334561081, + "grad_norm": 0.2862946689128876, + "learning_rate": 6.1018024302046314e-05, + "loss": 1.7051, + "step": 14545 + }, + { + "epoch": 4.464702271332105, + "grad_norm": 0.21907158195972443, + "learning_rate": 6.101317587155331e-05, + "loss": 1.7882, + "step": 14546 + }, + { + "epoch": 4.4650092081031305, + "grad_norm": 0.24268488585948944, + "learning_rate": 6.100832733222164e-05, + "loss": 1.7756, + "step": 14547 + }, + { + "epoch": 4.465316144874156, + "grad_norm": 0.2350744605064392, + "learning_rate": 6.1003478684099214e-05, + "loss": 1.7483, + "step": 14548 + }, + 
{ + "epoch": 4.465623081645181, + "grad_norm": 0.22902250289916992, + "learning_rate": 6.099862992723397e-05, + "loss": 1.7687, + "step": 14549 + }, + { + "epoch": 4.4659300184162065, + "grad_norm": 0.23590944707393646, + "learning_rate": 6.099378106167382e-05, + "loss": 1.8481, + "step": 14550 + }, + { + "epoch": 4.466236955187231, + "grad_norm": 0.23644296824932098, + "learning_rate": 6.098893208746668e-05, + "loss": 1.7422, + "step": 14551 + }, + { + "epoch": 4.466543891958256, + "grad_norm": 0.23782360553741455, + "learning_rate": 6.0984083004660475e-05, + "loss": 1.7852, + "step": 14552 + }, + { + "epoch": 4.466850828729282, + "grad_norm": 0.2546575665473938, + "learning_rate": 6.097923381330313e-05, + "loss": 1.8483, + "step": 14553 + }, + { + "epoch": 4.467157765500307, + "grad_norm": 0.2555409371852875, + "learning_rate": 6.097438451344254e-05, + "loss": 1.7887, + "step": 14554 + }, + { + "epoch": 4.467464702271332, + "grad_norm": 0.28074198961257935, + "learning_rate": 6.0969535105126664e-05, + "loss": 1.7521, + "step": 14555 + }, + { + "epoch": 4.467771639042358, + "grad_norm": 0.22622554004192352, + "learning_rate": 6.096468558840341e-05, + "loss": 1.8088, + "step": 14556 + }, + { + "epoch": 4.468078575813382, + "grad_norm": 0.302749902009964, + "learning_rate": 6.095983596332071e-05, + "loss": 1.8192, + "step": 14557 + }, + { + "epoch": 4.468385512584407, + "grad_norm": 0.27925750613212585, + "learning_rate": 6.0954986229926494e-05, + "loss": 1.8453, + "step": 14558 + }, + { + "epoch": 4.468692449355433, + "grad_norm": 0.2246330976486206, + "learning_rate": 6.095013638826868e-05, + "loss": 1.744, + "step": 14559 + }, + { + "epoch": 4.468999386126458, + "grad_norm": 0.26677101850509644, + "learning_rate": 6.094528643839518e-05, + "loss": 1.708, + "step": 14560 + }, + { + "epoch": 4.469306322897483, + "grad_norm": 0.23684042692184448, + "learning_rate": 6.094043638035396e-05, + "loss": 1.713, + "step": 14561 + }, + { + "epoch": 4.469613259668508, + 
"grad_norm": 0.2470075935125351, + "learning_rate": 6.093558621419294e-05, + "loss": 1.8096, + "step": 14562 + }, + { + "epoch": 4.469920196439533, + "grad_norm": 0.2775517702102661, + "learning_rate": 6.093073593996005e-05, + "loss": 1.697, + "step": 14563 + }, + { + "epoch": 4.4702271332105585, + "grad_norm": 0.21053175628185272, + "learning_rate": 6.092588555770322e-05, + "loss": 1.6894, + "step": 14564 + }, + { + "epoch": 4.470534069981584, + "grad_norm": 0.2555869221687317, + "learning_rate": 6.0921035067470366e-05, + "loss": 1.7051, + "step": 14565 + }, + { + "epoch": 4.470841006752609, + "grad_norm": 0.34468984603881836, + "learning_rate": 6.0916184469309454e-05, + "loss": 1.7317, + "step": 14566 + }, + { + "epoch": 4.4711479435236345, + "grad_norm": 0.2517752945423126, + "learning_rate": 6.0911333763268407e-05, + "loss": 1.7524, + "step": 14567 + }, + { + "epoch": 4.471454880294659, + "grad_norm": 0.2749727666378021, + "learning_rate": 6.090648294939517e-05, + "loss": 1.7045, + "step": 14568 + }, + { + "epoch": 4.471761817065684, + "grad_norm": 0.36250773072242737, + "learning_rate": 6.0901632027737673e-05, + "loss": 1.7196, + "step": 14569 + }, + { + "epoch": 4.47206875383671, + "grad_norm": 0.2317698448896408, + "learning_rate": 6.089678099834386e-05, + "loss": 1.7318, + "step": 14570 + }, + { + "epoch": 4.472375690607735, + "grad_norm": 0.2863345444202423, + "learning_rate": 6.089192986126166e-05, + "loss": 1.7798, + "step": 14571 + }, + { + "epoch": 4.47268262737876, + "grad_norm": 0.3493366241455078, + "learning_rate": 6.088707861653904e-05, + "loss": 1.7749, + "step": 14572 + }, + { + "epoch": 4.472989564149785, + "grad_norm": 0.25718605518341064, + "learning_rate": 6.0882227264223924e-05, + "loss": 1.7683, + "step": 14573 + }, + { + "epoch": 4.47329650092081, + "grad_norm": 0.2320062816143036, + "learning_rate": 6.087737580436426e-05, + "loss": 1.8296, + "step": 14574 + }, + { + "epoch": 4.473603437691835, + "grad_norm": 0.29071560502052307, + 
"learning_rate": 6.087252423700799e-05, + "loss": 1.7428, + "step": 14575 + }, + { + "epoch": 4.473910374462861, + "grad_norm": 0.24233707785606384, + "learning_rate": 6.086767256220306e-05, + "loss": 1.7332, + "step": 14576 + }, + { + "epoch": 4.474217311233886, + "grad_norm": 0.228043332695961, + "learning_rate": 6.086282077999742e-05, + "loss": 1.7697, + "step": 14577 + }, + { + "epoch": 4.474524248004911, + "grad_norm": 0.29154402017593384, + "learning_rate": 6.085796889043902e-05, + "loss": 1.8043, + "step": 14578 + }, + { + "epoch": 4.474831184775936, + "grad_norm": 0.30543211102485657, + "learning_rate": 6.0853116893575814e-05, + "loss": 1.7665, + "step": 14579 + }, + { + "epoch": 4.475138121546961, + "grad_norm": 0.22792959213256836, + "learning_rate": 6.0848264789455754e-05, + "loss": 1.729, + "step": 14580 + }, + { + "epoch": 4.475445058317987, + "grad_norm": 0.2615707218647003, + "learning_rate": 6.084341257812677e-05, + "loss": 1.7438, + "step": 14581 + }, + { + "epoch": 4.475751995089012, + "grad_norm": 0.23342981934547424, + "learning_rate": 6.083856025963681e-05, + "loss": 1.7158, + "step": 14582 + }, + { + "epoch": 4.476058931860037, + "grad_norm": 0.22279240190982819, + "learning_rate": 6.083370783403387e-05, + "loss": 1.7413, + "step": 14583 + }, + { + "epoch": 4.476365868631062, + "grad_norm": 0.28867462277412415, + "learning_rate": 6.082885530136587e-05, + "loss": 1.7932, + "step": 14584 + }, + { + "epoch": 4.476672805402087, + "grad_norm": 0.2947152256965637, + "learning_rate": 6.082400266168078e-05, + "loss": 1.8986, + "step": 14585 + }, + { + "epoch": 4.476979742173112, + "grad_norm": 0.2948935627937317, + "learning_rate": 6.0819149915026555e-05, + "loss": 1.9134, + "step": 14586 + }, + { + "epoch": 4.477286678944138, + "grad_norm": 0.4436163902282715, + "learning_rate": 6.081429706145114e-05, + "loss": 1.7616, + "step": 14587 + }, + { + "epoch": 4.477593615715163, + "grad_norm": 0.4879693388938904, + "learning_rate": 6.080944410100249e-05, + 
"loss": 1.8155, + "step": 14588 + }, + { + "epoch": 4.4779005524861875, + "grad_norm": 0.29742667078971863, + "learning_rate": 6.08045910337286e-05, + "loss": 1.7428, + "step": 14589 + }, + { + "epoch": 4.478207489257213, + "grad_norm": 0.2994751036167145, + "learning_rate": 6.0799737859677395e-05, + "loss": 1.7764, + "step": 14590 + }, + { + "epoch": 4.478514426028238, + "grad_norm": 0.46379905939102173, + "learning_rate": 6.079488457889686e-05, + "loss": 1.7289, + "step": 14591 + }, + { + "epoch": 4.4788213627992635, + "grad_norm": 0.3511717617511749, + "learning_rate": 6.0790031191434946e-05, + "loss": 1.7658, + "step": 14592 + }, + { + "epoch": 4.479128299570289, + "grad_norm": 0.22678083181381226, + "learning_rate": 6.0785177697339626e-05, + "loss": 1.7973, + "step": 14593 + }, + { + "epoch": 4.479435236341313, + "grad_norm": 0.31201767921447754, + "learning_rate": 6.0780324096658837e-05, + "loss": 1.7542, + "step": 14594 + }, + { + "epoch": 4.479742173112339, + "grad_norm": 0.23759113252162933, + "learning_rate": 6.077547038944058e-05, + "loss": 1.7191, + "step": 14595 + }, + { + "epoch": 4.480049109883364, + "grad_norm": 0.25801756978034973, + "learning_rate": 6.077061657573282e-05, + "loss": 1.8229, + "step": 14596 + }, + { + "epoch": 4.480356046654389, + "grad_norm": 0.3435722887516022, + "learning_rate": 6.0765762655583514e-05, + "loss": 1.7633, + "step": 14597 + }, + { + "epoch": 4.480662983425415, + "grad_norm": 0.2710443437099457, + "learning_rate": 6.076090862904063e-05, + "loss": 1.8126, + "step": 14598 + }, + { + "epoch": 4.48096992019644, + "grad_norm": 0.25750285387039185, + "learning_rate": 6.075605449615212e-05, + "loss": 1.7382, + "step": 14599 + }, + { + "epoch": 4.481276856967464, + "grad_norm": 0.3638051152229309, + "learning_rate": 6.075120025696598e-05, + "loss": 1.8191, + "step": 14600 + }, + { + "epoch": 4.48158379373849, + "grad_norm": 0.24185293912887573, + "learning_rate": 6.074634591153019e-05, + "loss": 1.7637, + "step": 14601 + }, 
+ { + "epoch": 4.481890730509515, + "grad_norm": 0.317283570766449, + "learning_rate": 6.0741491459892707e-05, + "loss": 1.7805, + "step": 14602 + }, + { + "epoch": 4.48219766728054, + "grad_norm": 0.33884385228157043, + "learning_rate": 6.073663690210151e-05, + "loss": 1.7719, + "step": 14603 + }, + { + "epoch": 4.482504604051566, + "grad_norm": 0.2554258704185486, + "learning_rate": 6.073178223820457e-05, + "loss": 1.836, + "step": 14604 + }, + { + "epoch": 4.48281154082259, + "grad_norm": 0.3363535702228546, + "learning_rate": 6.072692746824987e-05, + "loss": 1.8249, + "step": 14605 + }, + { + "epoch": 4.4831184775936155, + "grad_norm": 0.36090195178985596, + "learning_rate": 6.072207259228537e-05, + "loss": 1.733, + "step": 14606 + }, + { + "epoch": 4.483425414364641, + "grad_norm": 0.21928483247756958, + "learning_rate": 6.071721761035909e-05, + "loss": 1.7413, + "step": 14607 + }, + { + "epoch": 4.483732351135666, + "grad_norm": 0.4256608486175537, + "learning_rate": 6.071236252251897e-05, + "loss": 1.7585, + "step": 14608 + }, + { + "epoch": 4.4840392879066915, + "grad_norm": 0.41980308294296265, + "learning_rate": 6.0707507328813007e-05, + "loss": 1.7584, + "step": 14609 + }, + { + "epoch": 4.484346224677717, + "grad_norm": 0.200295090675354, + "learning_rate": 6.0702652029289186e-05, + "loss": 1.7492, + "step": 14610 + }, + { + "epoch": 4.484653161448741, + "grad_norm": 0.41847771406173706, + "learning_rate": 6.069779662399549e-05, + "loss": 1.8101, + "step": 14611 + }, + { + "epoch": 4.484960098219767, + "grad_norm": 0.4846353530883789, + "learning_rate": 6.069294111297987e-05, + "loss": 1.8227, + "step": 14612 + }, + { + "epoch": 4.485267034990792, + "grad_norm": 0.23216098546981812, + "learning_rate": 6.068808549629036e-05, + "loss": 1.6811, + "step": 14613 + }, + { + "epoch": 4.485573971761817, + "grad_norm": 0.34903186559677124, + "learning_rate": 6.0683229773974934e-05, + "loss": 1.6858, + "step": 14614 + }, + { + "epoch": 4.485880908532843, + 
"grad_norm": 0.4349122941493988, + "learning_rate": 6.0678373946081556e-05, + "loss": 1.7704, + "step": 14615 + }, + { + "epoch": 4.486187845303867, + "grad_norm": 0.25738775730133057, + "learning_rate": 6.067351801265824e-05, + "loss": 1.7487, + "step": 14616 + }, + { + "epoch": 4.486494782074892, + "grad_norm": 0.3052736818790436, + "learning_rate": 6.0668661973752936e-05, + "loss": 1.7528, + "step": 14617 + }, + { + "epoch": 4.486801718845918, + "grad_norm": 0.3400498628616333, + "learning_rate": 6.066380582941368e-05, + "loss": 1.7414, + "step": 14618 + }, + { + "epoch": 4.487108655616943, + "grad_norm": 0.28251948952674866, + "learning_rate": 6.065894957968845e-05, + "loss": 1.8078, + "step": 14619 + }, + { + "epoch": 4.487415592387968, + "grad_norm": 0.26907965540885925, + "learning_rate": 6.0654093224625216e-05, + "loss": 1.8143, + "step": 14620 + }, + { + "epoch": 4.487722529158993, + "grad_norm": 0.2821955978870392, + "learning_rate": 6.064923676427201e-05, + "loss": 1.7163, + "step": 14621 + }, + { + "epoch": 4.488029465930018, + "grad_norm": 0.2223028987646103, + "learning_rate": 6.0644380198676786e-05, + "loss": 1.704, + "step": 14622 + }, + { + "epoch": 4.4883364027010435, + "grad_norm": 0.25243067741394043, + "learning_rate": 6.063952352788755e-05, + "loss": 1.7236, + "step": 14623 + }, + { + "epoch": 4.488643339472069, + "grad_norm": 0.30026015639305115, + "learning_rate": 6.063466675195233e-05, + "loss": 1.7575, + "step": 14624 + }, + { + "epoch": 4.488950276243094, + "grad_norm": 0.2055491805076599, + "learning_rate": 6.0629809870919085e-05, + "loss": 1.7294, + "step": 14625 + }, + { + "epoch": 4.4892572130141195, + "grad_norm": 0.2507593035697937, + "learning_rate": 6.0624952884835836e-05, + "loss": 1.762, + "step": 14626 + }, + { + "epoch": 4.489564149785144, + "grad_norm": 0.21385909616947174, + "learning_rate": 6.0620095793750576e-05, + "loss": 1.7396, + "step": 14627 + }, + { + "epoch": 4.489871086556169, + "grad_norm": 0.21926651895046234, + 
"learning_rate": 6.06152385977113e-05, + "loss": 1.7863, + "step": 14628 + }, + { + "epoch": 4.490178023327195, + "grad_norm": 0.21950845420360565, + "learning_rate": 6.0610381296766016e-05, + "loss": 1.7576, + "step": 14629 + }, + { + "epoch": 4.49048496009822, + "grad_norm": 0.2030971795320511, + "learning_rate": 6.0605523890962736e-05, + "loss": 1.7069, + "step": 14630 + }, + { + "epoch": 4.490791896869245, + "grad_norm": 0.23991432785987854, + "learning_rate": 6.0600666380349436e-05, + "loss": 1.7598, + "step": 14631 + }, + { + "epoch": 4.49109883364027, + "grad_norm": 0.23766861855983734, + "learning_rate": 6.059580876497415e-05, + "loss": 1.7687, + "step": 14632 + }, + { + "epoch": 4.491405770411295, + "grad_norm": 0.2361454963684082, + "learning_rate": 6.059095104488487e-05, + "loss": 1.7883, + "step": 14633 + }, + { + "epoch": 4.49171270718232, + "grad_norm": 0.3128328323364258, + "learning_rate": 6.058609322012958e-05, + "loss": 1.8087, + "step": 14634 + }, + { + "epoch": 4.492019643953346, + "grad_norm": 0.2958957850933075, + "learning_rate": 6.0581235290756335e-05, + "loss": 1.782, + "step": 14635 + }, + { + "epoch": 4.492326580724371, + "grad_norm": 0.2197243571281433, + "learning_rate": 6.057637725681312e-05, + "loss": 1.7408, + "step": 14636 + }, + { + "epoch": 4.4926335174953955, + "grad_norm": 0.22227831184864044, + "learning_rate": 6.0571519118347944e-05, + "loss": 1.734, + "step": 14637 + }, + { + "epoch": 4.492940454266421, + "grad_norm": 0.2784527540206909, + "learning_rate": 6.056666087540882e-05, + "loss": 1.8017, + "step": 14638 + }, + { + "epoch": 4.493247391037446, + "grad_norm": 0.21929821372032166, + "learning_rate": 6.056180252804377e-05, + "loss": 1.7271, + "step": 14639 + }, + { + "epoch": 4.4935543278084715, + "grad_norm": 0.2156134843826294, + "learning_rate": 6.055694407630077e-05, + "loss": 1.8082, + "step": 14640 + }, + { + "epoch": 4.493861264579497, + "grad_norm": 0.22672387957572937, + "learning_rate": 6.0552085520227875e-05, + 
"loss": 1.7506, + "step": 14641 + }, + { + "epoch": 4.494168201350522, + "grad_norm": 0.228785440325737, + "learning_rate": 6.0547226859873086e-05, + "loss": 1.7023, + "step": 14642 + }, + { + "epoch": 4.494475138121547, + "grad_norm": 0.19483685493469238, + "learning_rate": 6.054236809528443e-05, + "loss": 1.6879, + "step": 14643 + }, + { + "epoch": 4.494782074892572, + "grad_norm": 0.24911309778690338, + "learning_rate": 6.0537509226509904e-05, + "loss": 1.7856, + "step": 14644 + }, + { + "epoch": 4.495089011663597, + "grad_norm": 0.24811938405036926, + "learning_rate": 6.053265025359753e-05, + "loss": 1.7581, + "step": 14645 + }, + { + "epoch": 4.495395948434623, + "grad_norm": 0.2487260401248932, + "learning_rate": 6.052779117659534e-05, + "loss": 1.7536, + "step": 14646 + }, + { + "epoch": 4.495702885205648, + "grad_norm": 0.2594854235649109, + "learning_rate": 6.052293199555136e-05, + "loss": 1.7822, + "step": 14647 + }, + { + "epoch": 4.496009821976672, + "grad_norm": 0.22837325930595398, + "learning_rate": 6.051807271051359e-05, + "loss": 1.7542, + "step": 14648 + }, + { + "epoch": 4.496316758747698, + "grad_norm": 0.23106649518013, + "learning_rate": 6.051321332153005e-05, + "loss": 1.7758, + "step": 14649 + }, + { + "epoch": 4.496623695518723, + "grad_norm": 0.29424673318862915, + "learning_rate": 6.050835382864878e-05, + "loss": 1.8335, + "step": 14650 + }, + { + "epoch": 4.496930632289748, + "grad_norm": 0.28297343850135803, + "learning_rate": 6.050349423191779e-05, + "loss": 1.7711, + "step": 14651 + }, + { + "epoch": 4.497237569060774, + "grad_norm": 0.2001795768737793, + "learning_rate": 6.049863453138511e-05, + "loss": 1.7008, + "step": 14652 + }, + { + "epoch": 4.497544505831799, + "grad_norm": 0.35177022218704224, + "learning_rate": 6.04937747270988e-05, + "loss": 1.7763, + "step": 14653 + }, + { + "epoch": 4.4978514426028235, + "grad_norm": 0.28870898485183716, + "learning_rate": 6.0488914819106835e-05, + "loss": 1.7373, + "step": 14654 + }, + { 
+ "epoch": 4.498158379373849, + "grad_norm": 0.23962664604187012, + "learning_rate": 6.048405480745727e-05, + "loss": 1.7278, + "step": 14655 + }, + { + "epoch": 4.498465316144874, + "grad_norm": 0.324505478143692, + "learning_rate": 6.047919469219813e-05, + "loss": 1.7674, + "step": 14656 + }, + { + "epoch": 4.4987722529158995, + "grad_norm": 0.38313817977905273, + "learning_rate": 6.047433447337744e-05, + "loss": 1.789, + "step": 14657 + }, + { + "epoch": 4.499079189686925, + "grad_norm": 0.2101358324289322, + "learning_rate": 6.046947415104324e-05, + "loss": 1.7331, + "step": 14658 + }, + { + "epoch": 4.499386126457949, + "grad_norm": 0.3388524353504181, + "learning_rate": 6.046461372524357e-05, + "loss": 1.8467, + "step": 14659 + }, + { + "epoch": 4.499693063228975, + "grad_norm": 0.3360123634338379, + "learning_rate": 6.045975319602645e-05, + "loss": 1.8427, + "step": 14660 + }, + { + "epoch": 4.5, + "grad_norm": 0.27596545219421387, + "learning_rate": 6.0454892563439914e-05, + "loss": 1.7768, + "step": 14661 + }, + { + "epoch": 4.500306936771025, + "grad_norm": 0.2580861747264862, + "learning_rate": 6.0450031827532e-05, + "loss": 1.763, + "step": 14662 + }, + { + "epoch": 4.500613873542051, + "grad_norm": 0.3521091938018799, + "learning_rate": 6.044517098835074e-05, + "loss": 1.7118, + "step": 14663 + }, + { + "epoch": 4.500920810313076, + "grad_norm": 0.29412439465522766, + "learning_rate": 6.0440310045944204e-05, + "loss": 1.7252, + "step": 14664 + }, + { + "epoch": 4.5012277470841005, + "grad_norm": 0.23845252394676208, + "learning_rate": 6.043544900036039e-05, + "loss": 1.7622, + "step": 14665 + }, + { + "epoch": 4.501534683855126, + "grad_norm": 0.22957031428813934, + "learning_rate": 6.043058785164736e-05, + "loss": 1.7527, + "step": 14666 + }, + { + "epoch": 4.501841620626151, + "grad_norm": 0.2564462721347809, + "learning_rate": 6.042572659985314e-05, + "loss": 1.801, + "step": 14667 + }, + { + "epoch": 4.5021485573971765, + "grad_norm": 
0.22588051855564117, + "learning_rate": 6.042086524502576e-05, + "loss": 1.7387, + "step": 14668 + }, + { + "epoch": 4.502455494168201, + "grad_norm": 0.2609740197658539, + "learning_rate": 6.0416003787213306e-05, + "loss": 1.7615, + "step": 14669 + }, + { + "epoch": 4.502762430939226, + "grad_norm": 0.2535521984100342, + "learning_rate": 6.041114222646379e-05, + "loss": 1.7398, + "step": 14670 + }, + { + "epoch": 4.503069367710252, + "grad_norm": 0.2512127757072449, + "learning_rate": 6.040628056282527e-05, + "loss": 1.7679, + "step": 14671 + }, + { + "epoch": 4.503376304481277, + "grad_norm": 0.2438639998435974, + "learning_rate": 6.0401418796345774e-05, + "loss": 1.7, + "step": 14672 + }, + { + "epoch": 4.503683241252302, + "grad_norm": 0.23428042232990265, + "learning_rate": 6.0396556927073376e-05, + "loss": 1.7748, + "step": 14673 + }, + { + "epoch": 4.503990178023328, + "grad_norm": 0.22894345223903656, + "learning_rate": 6.03916949550561e-05, + "loss": 1.7881, + "step": 14674 + }, + { + "epoch": 4.504297114794352, + "grad_norm": 0.24813716113567352, + "learning_rate": 6.0386832880342006e-05, + "loss": 1.7676, + "step": 14675 + }, + { + "epoch": 4.504604051565377, + "grad_norm": 0.23448842763900757, + "learning_rate": 6.038197070297914e-05, + "loss": 1.7828, + "step": 14676 + }, + { + "epoch": 4.504910988336403, + "grad_norm": 0.25302332639694214, + "learning_rate": 6.037710842301556e-05, + "loss": 1.8061, + "step": 14677 + }, + { + "epoch": 4.505217925107428, + "grad_norm": 0.2411813735961914, + "learning_rate": 6.0372246040499305e-05, + "loss": 1.6901, + "step": 14678 + }, + { + "epoch": 4.505524861878453, + "grad_norm": 0.3154819905757904, + "learning_rate": 6.036738355547844e-05, + "loss": 1.7472, + "step": 14679 + }, + { + "epoch": 4.505831798649478, + "grad_norm": 0.2935639023780823, + "learning_rate": 6.0362520968001014e-05, + "loss": 1.7508, + "step": 14680 + }, + { + "epoch": 4.506138735420503, + "grad_norm": 0.27064070105552673, + "learning_rate": 
6.035765827811508e-05, + "loss": 1.8133, + "step": 14681 + }, + { + "epoch": 4.5064456721915285, + "grad_norm": 0.23748525977134705, + "learning_rate": 6.03527954858687e-05, + "loss": 1.7742, + "step": 14682 + }, + { + "epoch": 4.506752608962554, + "grad_norm": 0.216410830616951, + "learning_rate": 6.034793259130992e-05, + "loss": 1.7448, + "step": 14683 + }, + { + "epoch": 4.507059545733579, + "grad_norm": 0.23339977860450745, + "learning_rate": 6.034306959448681e-05, + "loss": 1.7437, + "step": 14684 + }, + { + "epoch": 4.5073664825046045, + "grad_norm": 0.23951120674610138, + "learning_rate": 6.0338206495447414e-05, + "loss": 1.7535, + "step": 14685 + }, + { + "epoch": 4.507673419275629, + "grad_norm": 0.22137518227100372, + "learning_rate": 6.0333343294239816e-05, + "loss": 1.7537, + "step": 14686 + }, + { + "epoch": 4.507980356046654, + "grad_norm": 0.2550075054168701, + "learning_rate": 6.032847999091206e-05, + "loss": 1.8069, + "step": 14687 + }, + { + "epoch": 4.50828729281768, + "grad_norm": 0.2166420966386795, + "learning_rate": 6.032361658551221e-05, + "loss": 1.7746, + "step": 14688 + }, + { + "epoch": 4.508594229588705, + "grad_norm": 0.21926096081733704, + "learning_rate": 6.031875307808833e-05, + "loss": 1.7848, + "step": 14689 + }, + { + "epoch": 4.50890116635973, + "grad_norm": 0.27769652009010315, + "learning_rate": 6.031388946868848e-05, + "loss": 1.7563, + "step": 14690 + }, + { + "epoch": 4.509208103130755, + "grad_norm": 0.23417410254478455, + "learning_rate": 6.030902575736074e-05, + "loss": 1.7475, + "step": 14691 + }, + { + "epoch": 4.50951503990178, + "grad_norm": 0.25454118847846985, + "learning_rate": 6.030416194415314e-05, + "loss": 1.7416, + "step": 14692 + }, + { + "epoch": 4.509821976672805, + "grad_norm": 0.3118220567703247, + "learning_rate": 6.029929802911379e-05, + "loss": 1.8001, + "step": 14693 + }, + { + "epoch": 4.510128913443831, + "grad_norm": 0.2338017225265503, + "learning_rate": 6.029443401229075e-05, + "loss": 1.7243, + 
"step": 14694 + }, + { + "epoch": 4.510435850214856, + "grad_norm": 0.2490454763174057, + "learning_rate": 6.028956989373207e-05, + "loss": 1.7866, + "step": 14695 + }, + { + "epoch": 4.510742786985881, + "grad_norm": 0.2579275369644165, + "learning_rate": 6.028470567348582e-05, + "loss": 1.7594, + "step": 14696 + }, + { + "epoch": 4.511049723756906, + "grad_norm": 0.23982174694538116, + "learning_rate": 6.0279841351600094e-05, + "loss": 1.7444, + "step": 14697 + }, + { + "epoch": 4.511356660527931, + "grad_norm": 0.2160159945487976, + "learning_rate": 6.027497692812295e-05, + "loss": 1.7002, + "step": 14698 + }, + { + "epoch": 4.5116635972989565, + "grad_norm": 0.24604511260986328, + "learning_rate": 6.0270112403102455e-05, + "loss": 1.7654, + "step": 14699 + }, + { + "epoch": 4.511970534069982, + "grad_norm": 0.21978263556957245, + "learning_rate": 6.026524777658669e-05, + "loss": 1.7278, + "step": 14700 + }, + { + "epoch": 4.512277470841006, + "grad_norm": 0.2814212441444397, + "learning_rate": 6.026038304862373e-05, + "loss": 1.7743, + "step": 14701 + }, + { + "epoch": 4.512584407612032, + "grad_norm": 0.23798944056034088, + "learning_rate": 6.025551821926165e-05, + "loss": 1.7348, + "step": 14702 + }, + { + "epoch": 4.512891344383057, + "grad_norm": 0.22415988147258759, + "learning_rate": 6.025065328854853e-05, + "loss": 1.7973, + "step": 14703 + }, + { + "epoch": 4.513198281154082, + "grad_norm": 0.34614792466163635, + "learning_rate": 6.0245788256532445e-05, + "loss": 1.7263, + "step": 14704 + }, + { + "epoch": 4.513505217925108, + "grad_norm": 0.333918958902359, + "learning_rate": 6.0240923123261485e-05, + "loss": 1.7305, + "step": 14705 + }, + { + "epoch": 4.513812154696133, + "grad_norm": 0.22231793403625488, + "learning_rate": 6.02360578887837e-05, + "loss": 1.806, + "step": 14706 + }, + { + "epoch": 4.514119091467157, + "grad_norm": 0.23323194682598114, + "learning_rate": 6.023119255314721e-05, + "loss": 1.7076, + "step": 14707 + }, + { + "epoch": 
4.514426028238183, + "grad_norm": 0.26695477962493896, + "learning_rate": 6.022632711640007e-05, + "loss": 1.775, + "step": 14708 + }, + { + "epoch": 4.514732965009208, + "grad_norm": 0.21446476876735687, + "learning_rate": 6.0221461578590364e-05, + "loss": 1.7524, + "step": 14709 + }, + { + "epoch": 4.515039901780233, + "grad_norm": 0.2677358090877533, + "learning_rate": 6.0216595939766204e-05, + "loss": 1.7513, + "step": 14710 + }, + { + "epoch": 4.515346838551259, + "grad_norm": 0.28648239374160767, + "learning_rate": 6.021173019997565e-05, + "loss": 1.7249, + "step": 14711 + }, + { + "epoch": 4.515653775322283, + "grad_norm": 0.2178548276424408, + "learning_rate": 6.020686435926678e-05, + "loss": 1.7502, + "step": 14712 + }, + { + "epoch": 4.5159607120933085, + "grad_norm": 0.3391740024089813, + "learning_rate": 6.02019984176877e-05, + "loss": 1.6828, + "step": 14713 + }, + { + "epoch": 4.516267648864334, + "grad_norm": 0.25222229957580566, + "learning_rate": 6.01971323752865e-05, + "loss": 1.6982, + "step": 14714 + }, + { + "epoch": 4.516574585635359, + "grad_norm": 0.28776636719703674, + "learning_rate": 6.019226623211125e-05, + "loss": 1.8595, + "step": 14715 + }, + { + "epoch": 4.5168815224063845, + "grad_norm": 0.3240084648132324, + "learning_rate": 6.018739998821006e-05, + "loss": 1.7461, + "step": 14716 + }, + { + "epoch": 4.51718845917741, + "grad_norm": 0.26735052466392517, + "learning_rate": 6.0182533643631015e-05, + "loss": 1.7955, + "step": 14717 + }, + { + "epoch": 4.517495395948434, + "grad_norm": 0.24573692679405212, + "learning_rate": 6.017766719842219e-05, + "loss": 1.7441, + "step": 14718 + }, + { + "epoch": 4.51780233271946, + "grad_norm": 0.27401313185691833, + "learning_rate": 6.01728006526317e-05, + "loss": 1.7399, + "step": 14719 + }, + { + "epoch": 4.518109269490485, + "grad_norm": 0.23578806221485138, + "learning_rate": 6.016793400630763e-05, + "loss": 1.7936, + "step": 14720 + }, + { + "epoch": 4.51841620626151, + "grad_norm": 
0.27763426303863525, + "learning_rate": 6.0163067259498074e-05, + "loss": 1.7263, + "step": 14721 + }, + { + "epoch": 4.518723143032536, + "grad_norm": 0.27102044224739075, + "learning_rate": 6.015820041225113e-05, + "loss": 1.7085, + "step": 14722 + }, + { + "epoch": 4.51903007980356, + "grad_norm": 0.2046152651309967, + "learning_rate": 6.01533334646149e-05, + "loss": 1.7602, + "step": 14723 + }, + { + "epoch": 4.519337016574585, + "grad_norm": 0.2645253837108612, + "learning_rate": 6.0148466416637484e-05, + "loss": 1.7729, + "step": 14724 + }, + { + "epoch": 4.519643953345611, + "grad_norm": 0.27467650175094604, + "learning_rate": 6.014359926836697e-05, + "loss": 1.7834, + "step": 14725 + }, + { + "epoch": 4.519950890116636, + "grad_norm": 0.30357635021209717, + "learning_rate": 6.013873201985145e-05, + "loss": 1.8685, + "step": 14726 + }, + { + "epoch": 4.520257826887661, + "grad_norm": 0.22923336923122406, + "learning_rate": 6.013386467113905e-05, + "loss": 1.7531, + "step": 14727 + }, + { + "epoch": 4.520564763658687, + "grad_norm": 0.2792156934738159, + "learning_rate": 6.012899722227786e-05, + "loss": 1.7927, + "step": 14728 + }, + { + "epoch": 4.520871700429711, + "grad_norm": 0.286161869764328, + "learning_rate": 6.012412967331598e-05, + "loss": 1.77, + "step": 14729 + }, + { + "epoch": 4.5211786372007365, + "grad_norm": 0.23964659869670868, + "learning_rate": 6.011926202430151e-05, + "loss": 1.7873, + "step": 14730 + }, + { + "epoch": 4.521485573971762, + "grad_norm": 0.2250162959098816, + "learning_rate": 6.011439427528258e-05, + "loss": 1.741, + "step": 14731 + }, + { + "epoch": 4.521792510742787, + "grad_norm": 0.2797175347805023, + "learning_rate": 6.010952642630726e-05, + "loss": 1.7482, + "step": 14732 + }, + { + "epoch": 4.5220994475138125, + "grad_norm": 0.22159560024738312, + "learning_rate": 6.010465847742368e-05, + "loss": 1.7591, + "step": 14733 + }, + { + "epoch": 4.522406384284837, + "grad_norm": 0.26638463139533997, + "learning_rate": 
6.009979042867995e-05, + "loss": 1.8564, + "step": 14734 + }, + { + "epoch": 4.522713321055862, + "grad_norm": 0.2972821891307831, + "learning_rate": 6.009492228012416e-05, + "loss": 1.7569, + "step": 14735 + }, + { + "epoch": 4.523020257826888, + "grad_norm": 0.28108885884284973, + "learning_rate": 6.0090054031804444e-05, + "loss": 1.7256, + "step": 14736 + }, + { + "epoch": 4.523327194597913, + "grad_norm": 0.22359851002693176, + "learning_rate": 6.008518568376888e-05, + "loss": 1.7342, + "step": 14737 + }, + { + "epoch": 4.523634131368938, + "grad_norm": 0.2620728015899658, + "learning_rate": 6.008031723606562e-05, + "loss": 1.7703, + "step": 14738 + }, + { + "epoch": 4.523941068139964, + "grad_norm": 0.2641485333442688, + "learning_rate": 6.007544868874274e-05, + "loss": 1.6944, + "step": 14739 + }, + { + "epoch": 4.524248004910988, + "grad_norm": 0.24957752227783203, + "learning_rate": 6.007058004184839e-05, + "loss": 1.7746, + "step": 14740 + }, + { + "epoch": 4.524554941682013, + "grad_norm": 0.29830998182296753, + "learning_rate": 6.006571129543065e-05, + "loss": 1.7718, + "step": 14741 + }, + { + "epoch": 4.524861878453039, + "grad_norm": 0.32740798592567444, + "learning_rate": 6.006084244953766e-05, + "loss": 1.8194, + "step": 14742 + }, + { + "epoch": 4.525168815224064, + "grad_norm": 0.2614956796169281, + "learning_rate": 6.005597350421751e-05, + "loss": 1.7078, + "step": 14743 + }, + { + "epoch": 4.525475751995089, + "grad_norm": 0.23940515518188477, + "learning_rate": 6.005110445951836e-05, + "loss": 1.7488, + "step": 14744 + }, + { + "epoch": 4.525782688766114, + "grad_norm": 0.25485914945602417, + "learning_rate": 6.004623531548829e-05, + "loss": 1.7705, + "step": 14745 + }, + { + "epoch": 4.526089625537139, + "grad_norm": 0.213532954454422, + "learning_rate": 6.0041366072175445e-05, + "loss": 1.7501, + "step": 14746 + }, + { + "epoch": 4.526396562308165, + "grad_norm": 0.2420104295015335, + "learning_rate": 6.003649672962792e-05, + "loss": 1.717, + 
"step": 14747 + }, + { + "epoch": 4.52670349907919, + "grad_norm": 0.26179102063179016, + "learning_rate": 6.0031627287893865e-05, + "loss": 1.7665, + "step": 14748 + }, + { + "epoch": 4.527010435850215, + "grad_norm": 0.22032082080841064, + "learning_rate": 6.002675774702139e-05, + "loss": 1.7555, + "step": 14749 + }, + { + "epoch": 4.52731737262124, + "grad_norm": 0.23915240168571472, + "learning_rate": 6.002188810705861e-05, + "loss": 1.8219, + "step": 14750 + }, + { + "epoch": 4.527624309392265, + "grad_norm": 0.2275150567293167, + "learning_rate": 6.0017018368053665e-05, + "loss": 1.7418, + "step": 14751 + }, + { + "epoch": 4.52793124616329, + "grad_norm": 0.2349669486284256, + "learning_rate": 6.001214853005467e-05, + "loss": 1.7814, + "step": 14752 + }, + { + "epoch": 4.528238182934316, + "grad_norm": 0.29985731840133667, + "learning_rate": 6.000727859310975e-05, + "loss": 1.7109, + "step": 14753 + }, + { + "epoch": 4.528545119705341, + "grad_norm": 0.27282044291496277, + "learning_rate": 6.0002408557267044e-05, + "loss": 1.7806, + "step": 14754 + }, + { + "epoch": 4.5288520564763655, + "grad_norm": 0.20906320214271545, + "learning_rate": 5.9997538422574675e-05, + "loss": 1.7221, + "step": 14755 + }, + { + "epoch": 4.529158993247391, + "grad_norm": 0.24553455412387848, + "learning_rate": 5.999266818908076e-05, + "loss": 1.793, + "step": 14756 + }, + { + "epoch": 4.529465930018416, + "grad_norm": 0.29730647802352905, + "learning_rate": 5.998779785683345e-05, + "loss": 1.7597, + "step": 14757 + }, + { + "epoch": 4.5297728667894415, + "grad_norm": 0.28297582268714905, + "learning_rate": 5.998292742588087e-05, + "loss": 1.7459, + "step": 14758 + }, + { + "epoch": 4.530079803560467, + "grad_norm": 0.21853844821453094, + "learning_rate": 5.997805689627115e-05, + "loss": 1.7234, + "step": 14759 + }, + { + "epoch": 4.530386740331492, + "grad_norm": 0.2997361421585083, + "learning_rate": 5.997318626805242e-05, + "loss": 1.7294, + "step": 14760 + }, + { + "epoch": 
4.530693677102517, + "grad_norm": 0.3298671543598175, + "learning_rate": 5.9968315541272804e-05, + "loss": 1.7837, + "step": 14761 + }, + { + "epoch": 4.531000613873542, + "grad_norm": 0.22812490165233612, + "learning_rate": 5.996344471598047e-05, + "loss": 1.7509, + "step": 14762 + }, + { + "epoch": 4.531307550644567, + "grad_norm": 0.3179669678211212, + "learning_rate": 5.995857379222354e-05, + "loss": 1.8354, + "step": 14763 + }, + { + "epoch": 4.531614487415593, + "grad_norm": 0.3072827458381653, + "learning_rate": 5.9953702770050135e-05, + "loss": 1.8051, + "step": 14764 + }, + { + "epoch": 4.531921424186618, + "grad_norm": 0.19386722147464752, + "learning_rate": 5.994883164950841e-05, + "loss": 1.7093, + "step": 14765 + }, + { + "epoch": 4.532228360957642, + "grad_norm": 0.2380950152873993, + "learning_rate": 5.99439604306465e-05, + "loss": 1.7547, + "step": 14766 + }, + { + "epoch": 4.532535297728668, + "grad_norm": 0.32604947686195374, + "learning_rate": 5.993908911351254e-05, + "loss": 1.8708, + "step": 14767 + }, + { + "epoch": 4.532842234499693, + "grad_norm": 0.2436954528093338, + "learning_rate": 5.993421769815468e-05, + "loss": 1.7272, + "step": 14768 + }, + { + "epoch": 4.533149171270718, + "grad_norm": 0.2470337301492691, + "learning_rate": 5.992934618462105e-05, + "loss": 1.7242, + "step": 14769 + }, + { + "epoch": 4.533456108041744, + "grad_norm": 0.25720325112342834, + "learning_rate": 5.992447457295981e-05, + "loss": 1.7219, + "step": 14770 + }, + { + "epoch": 4.533763044812769, + "grad_norm": 0.2518918812274933, + "learning_rate": 5.991960286321909e-05, + "loss": 1.7916, + "step": 14771 + }, + { + "epoch": 4.5340699815837935, + "grad_norm": 0.2561487853527069, + "learning_rate": 5.9914731055447037e-05, + "loss": 1.7695, + "step": 14772 + }, + { + "epoch": 4.534376918354819, + "grad_norm": 0.25361356139183044, + "learning_rate": 5.9909859149691804e-05, + "loss": 1.7464, + "step": 14773 + }, + { + "epoch": 4.534683855125844, + "grad_norm": 
0.22827522456645966, + "learning_rate": 5.9904987146001545e-05, + "loss": 1.7288, + "step": 14774 + }, + { + "epoch": 4.5349907918968695, + "grad_norm": 0.2417261302471161, + "learning_rate": 5.9900115044424385e-05, + "loss": 1.7311, + "step": 14775 + }, + { + "epoch": 4.535297728667894, + "grad_norm": 0.20756755769252777, + "learning_rate": 5.9895242845008495e-05, + "loss": 1.7799, + "step": 14776 + }, + { + "epoch": 4.535604665438919, + "grad_norm": 0.21999207139015198, + "learning_rate": 5.989037054780201e-05, + "loss": 1.7782, + "step": 14777 + }, + { + "epoch": 4.535911602209945, + "grad_norm": 0.22863444685935974, + "learning_rate": 5.988549815285308e-05, + "loss": 1.7869, + "step": 14778 + }, + { + "epoch": 4.53621853898097, + "grad_norm": 0.23033374547958374, + "learning_rate": 5.988062566020987e-05, + "loss": 1.7328, + "step": 14779 + }, + { + "epoch": 4.536525475751995, + "grad_norm": 0.21903404593467712, + "learning_rate": 5.987575306992053e-05, + "loss": 1.7689, + "step": 14780 + }, + { + "epoch": 4.536832412523021, + "grad_norm": 0.2433948963880539, + "learning_rate": 5.98708803820332e-05, + "loss": 1.7647, + "step": 14781 + }, + { + "epoch": 4.537139349294045, + "grad_norm": 0.2564239799976349, + "learning_rate": 5.986600759659606e-05, + "loss": 1.7958, + "step": 14782 + }, + { + "epoch": 4.53744628606507, + "grad_norm": 0.24009190499782562, + "learning_rate": 5.9861134713657244e-05, + "loss": 1.7511, + "step": 14783 + }, + { + "epoch": 4.537753222836096, + "grad_norm": 0.2578975558280945, + "learning_rate": 5.985626173326491e-05, + "loss": 1.8285, + "step": 14784 + }, + { + "epoch": 4.538060159607121, + "grad_norm": 0.24334335327148438, + "learning_rate": 5.9851388655467225e-05, + "loss": 1.7391, + "step": 14785 + }, + { + "epoch": 4.538367096378146, + "grad_norm": 0.26446983218193054, + "learning_rate": 5.9846515480312335e-05, + "loss": 1.8232, + "step": 14786 + }, + { + "epoch": 4.538674033149171, + "grad_norm": 0.3125670850276947, + 
"learning_rate": 5.9841642207848415e-05, + "loss": 1.7202, + "step": 14787 + }, + { + "epoch": 4.538980969920196, + "grad_norm": 0.2524511218070984, + "learning_rate": 5.983676883812361e-05, + "loss": 1.7653, + "step": 14788 + }, + { + "epoch": 4.5392879066912215, + "grad_norm": 0.3693946897983551, + "learning_rate": 5.98318953711861e-05, + "loss": 1.7457, + "step": 14789 + }, + { + "epoch": 4.539594843462247, + "grad_norm": 0.32625386118888855, + "learning_rate": 5.9827021807084026e-05, + "loss": 1.784, + "step": 14790 + }, + { + "epoch": 4.539901780233272, + "grad_norm": 0.24243168532848358, + "learning_rate": 5.9822148145865574e-05, + "loss": 1.7651, + "step": 14791 + }, + { + "epoch": 4.5402087170042975, + "grad_norm": 0.2950129210948944, + "learning_rate": 5.9817274387578895e-05, + "loss": 1.7316, + "step": 14792 + }, + { + "epoch": 4.540515653775322, + "grad_norm": 0.29455235600471497, + "learning_rate": 5.981240053227216e-05, + "loss": 1.7504, + "step": 14793 + }, + { + "epoch": 4.540822590546347, + "grad_norm": 0.23161925375461578, + "learning_rate": 5.980752657999352e-05, + "loss": 1.7663, + "step": 14794 + }, + { + "epoch": 4.541129527317373, + "grad_norm": 0.2725144922733307, + "learning_rate": 5.980265253079116e-05, + "loss": 1.765, + "step": 14795 + }, + { + "epoch": 4.541436464088398, + "grad_norm": 0.30911222100257874, + "learning_rate": 5.979777838471324e-05, + "loss": 1.7888, + "step": 14796 + }, + { + "epoch": 4.541743400859423, + "grad_norm": 0.2818063497543335, + "learning_rate": 5.979290414180794e-05, + "loss": 1.8047, + "step": 14797 + }, + { + "epoch": 4.542050337630448, + "grad_norm": 0.23335030674934387, + "learning_rate": 5.978802980212341e-05, + "loss": 1.8205, + "step": 14798 + }, + { + "epoch": 4.542357274401473, + "grad_norm": 0.24228201806545258, + "learning_rate": 5.9783155365707855e-05, + "loss": 1.7774, + "step": 14799 + }, + { + "epoch": 4.542664211172498, + "grad_norm": 0.2410847544670105, + "learning_rate": 5.97782808326094e-05, 
+ "loss": 1.6959, + "step": 14800 + }, + { + "epoch": 4.542971147943524, + "grad_norm": 0.24812567234039307, + "learning_rate": 5.9773406202876245e-05, + "loss": 1.8158, + "step": 14801 + }, + { + "epoch": 4.543278084714549, + "grad_norm": 0.2606147229671478, + "learning_rate": 5.9768531476556566e-05, + "loss": 1.7478, + "step": 14802 + }, + { + "epoch": 4.543585021485574, + "grad_norm": 0.24853013455867767, + "learning_rate": 5.976365665369854e-05, + "loss": 1.8158, + "step": 14803 + }, + { + "epoch": 4.543891958256599, + "grad_norm": 0.2320917695760727, + "learning_rate": 5.9758781734350334e-05, + "loss": 1.7812, + "step": 14804 + }, + { + "epoch": 4.544198895027624, + "grad_norm": 0.3460223376750946, + "learning_rate": 5.9753906718560127e-05, + "loss": 1.7562, + "step": 14805 + }, + { + "epoch": 4.5445058317986495, + "grad_norm": 0.2941136658191681, + "learning_rate": 5.9749031606376086e-05, + "loss": 1.7562, + "step": 14806 + }, + { + "epoch": 4.544812768569675, + "grad_norm": 0.2371312975883484, + "learning_rate": 5.9744156397846404e-05, + "loss": 1.7793, + "step": 14807 + }, + { + "epoch": 4.5451197053407, + "grad_norm": 0.2885094881057739, + "learning_rate": 5.973928109301926e-05, + "loss": 1.7564, + "step": 14808 + }, + { + "epoch": 4.545426642111725, + "grad_norm": 0.2369023859500885, + "learning_rate": 5.973440569194284e-05, + "loss": 1.7862, + "step": 14809 + }, + { + "epoch": 4.54573357888275, + "grad_norm": 0.26628994941711426, + "learning_rate": 5.972953019466531e-05, + "loss": 1.7828, + "step": 14810 + }, + { + "epoch": 4.546040515653775, + "grad_norm": 0.3091031610965729, + "learning_rate": 5.9724654601234864e-05, + "loss": 1.7623, + "step": 14811 + }, + { + "epoch": 4.546347452424801, + "grad_norm": 0.24652205407619476, + "learning_rate": 5.971977891169966e-05, + "loss": 1.6982, + "step": 14812 + }, + { + "epoch": 4.546654389195826, + "grad_norm": 0.21779046952724457, + "learning_rate": 5.971490312610793e-05, + "loss": 1.7363, + "step": 14813 + }, 
+ { + "epoch": 4.546961325966851, + "grad_norm": 0.24130751192569733, + "learning_rate": 5.971002724450783e-05, + "loss": 1.7014, + "step": 14814 + }, + { + "epoch": 4.547268262737876, + "grad_norm": 0.21868734061717987, + "learning_rate": 5.9705151266947534e-05, + "loss": 1.7872, + "step": 14815 + }, + { + "epoch": 4.547575199508901, + "grad_norm": 0.257376492023468, + "learning_rate": 5.9700275193475275e-05, + "loss": 1.75, + "step": 14816 + }, + { + "epoch": 4.547882136279926, + "grad_norm": 0.3182791769504547, + "learning_rate": 5.9695399024139174e-05, + "loss": 1.7965, + "step": 14817 + }, + { + "epoch": 4.548189073050952, + "grad_norm": 0.25553280115127563, + "learning_rate": 5.969052275898748e-05, + "loss": 1.8394, + "step": 14818 + }, + { + "epoch": 4.548496009821976, + "grad_norm": 0.2810833752155304, + "learning_rate": 5.9685646398068354e-05, + "loss": 1.704, + "step": 14819 + }, + { + "epoch": 4.5488029465930016, + "grad_norm": 0.21320512890815735, + "learning_rate": 5.9680769941429993e-05, + "loss": 1.7248, + "step": 14820 + }, + { + "epoch": 4.549109883364027, + "grad_norm": 0.3159593939781189, + "learning_rate": 5.96758933891206e-05, + "loss": 1.7885, + "step": 14821 + }, + { + "epoch": 4.549416820135052, + "grad_norm": 0.21894599497318268, + "learning_rate": 5.967101674118834e-05, + "loss": 1.7388, + "step": 14822 + }, + { + "epoch": 4.5497237569060776, + "grad_norm": 0.24804852902889252, + "learning_rate": 5.9666139997681424e-05, + "loss": 1.7631, + "step": 14823 + }, + { + "epoch": 4.550030693677103, + "grad_norm": 0.2678423523902893, + "learning_rate": 5.966126315864806e-05, + "loss": 1.7631, + "step": 14824 + }, + { + "epoch": 4.550337630448127, + "grad_norm": 0.229649156332016, + "learning_rate": 5.9656386224136426e-05, + "loss": 1.7292, + "step": 14825 + }, + { + "epoch": 4.550644567219153, + "grad_norm": 0.25248458981513977, + "learning_rate": 5.965150919419473e-05, + "loss": 1.8, + "step": 14826 + }, + { + "epoch": 4.550951503990178, + 
"grad_norm": 0.2583169937133789, + "learning_rate": 5.964663206887116e-05, + "loss": 1.7641, + "step": 14827 + }, + { + "epoch": 4.551258440761203, + "grad_norm": 0.21465209126472473, + "learning_rate": 5.964175484821392e-05, + "loss": 1.7475, + "step": 14828 + }, + { + "epoch": 4.551565377532229, + "grad_norm": 0.28028783202171326, + "learning_rate": 5.963687753227118e-05, + "loss": 1.7649, + "step": 14829 + }, + { + "epoch": 4.551872314303253, + "grad_norm": 0.30248284339904785, + "learning_rate": 5.9632000121091194e-05, + "loss": 1.6969, + "step": 14830 + }, + { + "epoch": 4.5521792510742785, + "grad_norm": 0.24335962533950806, + "learning_rate": 5.962712261472213e-05, + "loss": 1.7295, + "step": 14831 + }, + { + "epoch": 4.552486187845304, + "grad_norm": 0.21014504134655, + "learning_rate": 5.9622245013212206e-05, + "loss": 1.7508, + "step": 14832 + }, + { + "epoch": 4.552793124616329, + "grad_norm": 0.24892041087150574, + "learning_rate": 5.961736731660963e-05, + "loss": 1.7317, + "step": 14833 + }, + { + "epoch": 4.5531000613873545, + "grad_norm": 0.2159881740808487, + "learning_rate": 5.9612489524962556e-05, + "loss": 1.7114, + "step": 14834 + }, + { + "epoch": 4.55340699815838, + "grad_norm": 0.2952292263507843, + "learning_rate": 5.960761163831925e-05, + "loss": 1.8226, + "step": 14835 + }, + { + "epoch": 4.553713934929404, + "grad_norm": 0.3019000291824341, + "learning_rate": 5.9602733656727895e-05, + "loss": 1.7391, + "step": 14836 + }, + { + "epoch": 4.55402087170043, + "grad_norm": 0.2273966521024704, + "learning_rate": 5.9597855580236696e-05, + "loss": 1.7718, + "step": 14837 + }, + { + "epoch": 4.554327808471455, + "grad_norm": 0.2462005764245987, + "learning_rate": 5.959297740889386e-05, + "loss": 1.8428, + "step": 14838 + }, + { + "epoch": 4.55463474524248, + "grad_norm": 0.2773323059082031, + "learning_rate": 5.95880991427476e-05, + "loss": 1.6878, + "step": 14839 + }, + { + "epoch": 4.554941682013506, + "grad_norm": 0.26519861817359924, + 
"learning_rate": 5.958322078184611e-05, + "loss": 1.737, + "step": 14840 + }, + { + "epoch": 4.55524861878453, + "grad_norm": 0.20157647132873535, + "learning_rate": 5.9578342326237626e-05, + "loss": 1.7164, + "step": 14841 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.21715669333934784, + "learning_rate": 5.957346377597035e-05, + "loss": 1.705, + "step": 14842 + }, + { + "epoch": 4.555862492326581, + "grad_norm": 0.3056442439556122, + "learning_rate": 5.95685851310925e-05, + "loss": 1.7672, + "step": 14843 + }, + { + "epoch": 4.556169429097606, + "grad_norm": 0.24832262098789215, + "learning_rate": 5.956370639165228e-05, + "loss": 1.7305, + "step": 14844 + }, + { + "epoch": 4.556476365868631, + "grad_norm": 0.25814661383628845, + "learning_rate": 5.955882755769791e-05, + "loss": 1.7562, + "step": 14845 + }, + { + "epoch": 4.556783302639657, + "grad_norm": 0.38242629170417786, + "learning_rate": 5.95539486292776e-05, + "loss": 1.7077, + "step": 14846 + }, + { + "epoch": 4.557090239410681, + "grad_norm": 0.2901807427406311, + "learning_rate": 5.954906960643956e-05, + "loss": 1.7233, + "step": 14847 + }, + { + "epoch": 4.5573971761817065, + "grad_norm": 0.22636106610298157, + "learning_rate": 5.954419048923202e-05, + "loss": 1.777, + "step": 14848 + }, + { + "epoch": 4.557704112952732, + "grad_norm": 0.32392850518226624, + "learning_rate": 5.953931127770321e-05, + "loss": 1.7477, + "step": 14849 + }, + { + "epoch": 4.558011049723757, + "grad_norm": 0.3403460681438446, + "learning_rate": 5.953443197190134e-05, + "loss": 1.7712, + "step": 14850 + }, + { + "epoch": 4.558317986494782, + "grad_norm": 0.22923234105110168, + "learning_rate": 5.95295525718746e-05, + "loss": 1.8154, + "step": 14851 + }, + { + "epoch": 4.558624923265807, + "grad_norm": 0.25152841210365295, + "learning_rate": 5.952467307767124e-05, + "loss": 1.7091, + "step": 14852 + }, + { + "epoch": 4.558931860036832, + "grad_norm": 0.27743563055992126, + "learning_rate": 5.951979348933949e-05, + 
"loss": 1.7621, + "step": 14853 + }, + { + "epoch": 4.559238796807858, + "grad_norm": 0.25809308886528015, + "learning_rate": 5.951491380692756e-05, + "loss": 1.7669, + "step": 14854 + }, + { + "epoch": 4.559545733578883, + "grad_norm": 0.24863946437835693, + "learning_rate": 5.9510034030483676e-05, + "loss": 1.7354, + "step": 14855 + }, + { + "epoch": 4.559852670349908, + "grad_norm": 0.2896040380001068, + "learning_rate": 5.9505154160056066e-05, + "loss": 1.7878, + "step": 14856 + }, + { + "epoch": 4.560159607120933, + "grad_norm": 0.23814482986927032, + "learning_rate": 5.950027419569294e-05, + "loss": 1.7781, + "step": 14857 + }, + { + "epoch": 4.560466543891958, + "grad_norm": 0.2531175911426544, + "learning_rate": 5.949539413744253e-05, + "loss": 1.762, + "step": 14858 + }, + { + "epoch": 4.560773480662983, + "grad_norm": 0.2541767656803131, + "learning_rate": 5.949051398535308e-05, + "loss": 1.7722, + "step": 14859 + }, + { + "epoch": 4.561080417434009, + "grad_norm": 0.25216221809387207, + "learning_rate": 5.948563373947281e-05, + "loss": 1.754, + "step": 14860 + }, + { + "epoch": 4.561387354205034, + "grad_norm": 0.24421775341033936, + "learning_rate": 5.948075339984994e-05, + "loss": 1.7976, + "step": 14861 + }, + { + "epoch": 4.5616942909760585, + "grad_norm": 0.24435418844223022, + "learning_rate": 5.947587296653272e-05, + "loss": 1.79, + "step": 14862 + }, + { + "epoch": 4.562001227747084, + "grad_norm": 0.24471627175807953, + "learning_rate": 5.947099243956936e-05, + "loss": 1.755, + "step": 14863 + }, + { + "epoch": 4.562308164518109, + "grad_norm": 0.2762158215045929, + "learning_rate": 5.9466111819008096e-05, + "loss": 1.7695, + "step": 14864 + }, + { + "epoch": 4.5626151012891345, + "grad_norm": 0.23841319978237152, + "learning_rate": 5.9461231104897174e-05, + "loss": 1.7302, + "step": 14865 + }, + { + "epoch": 4.56292203806016, + "grad_norm": 0.260231077671051, + "learning_rate": 5.9456350297284826e-05, + "loss": 1.7917, + "step": 14866 + }, + { 
+ "epoch": 4.563228974831185, + "grad_norm": 0.2752247452735901, + "learning_rate": 5.945146939621929e-05, + "loss": 1.7953, + "step": 14867 + }, + { + "epoch": 4.56353591160221, + "grad_norm": 0.28760650753974915, + "learning_rate": 5.944658840174878e-05, + "loss": 1.8582, + "step": 14868 + }, + { + "epoch": 4.563842848373235, + "grad_norm": 0.24311676621437073, + "learning_rate": 5.944170731392153e-05, + "loss": 1.8006, + "step": 14869 + }, + { + "epoch": 4.56414978514426, + "grad_norm": 0.2692974805831909, + "learning_rate": 5.943682613278583e-05, + "loss": 1.6984, + "step": 14870 + }, + { + "epoch": 4.564456721915286, + "grad_norm": 0.2784348726272583, + "learning_rate": 5.943194485838985e-05, + "loss": 1.8082, + "step": 14871 + }, + { + "epoch": 4.564763658686311, + "grad_norm": 0.2557264268398285, + "learning_rate": 5.9427063490781885e-05, + "loss": 1.7715, + "step": 14872 + }, + { + "epoch": 4.565070595457335, + "grad_norm": 0.3738742470741272, + "learning_rate": 5.942218203001015e-05, + "loss": 1.7549, + "step": 14873 + }, + { + "epoch": 4.565377532228361, + "grad_norm": 0.2424495816230774, + "learning_rate": 5.941730047612288e-05, + "loss": 1.7388, + "step": 14874 + }, + { + "epoch": 4.565684468999386, + "grad_norm": 0.27020737528800964, + "learning_rate": 5.941241882916833e-05, + "loss": 1.752, + "step": 14875 + }, + { + "epoch": 4.565991405770411, + "grad_norm": 0.3763764798641205, + "learning_rate": 5.940753708919474e-05, + "loss": 1.7918, + "step": 14876 + }, + { + "epoch": 4.566298342541437, + "grad_norm": 0.26782163977622986, + "learning_rate": 5.940265525625036e-05, + "loss": 1.7244, + "step": 14877 + }, + { + "epoch": 4.566605279312462, + "grad_norm": 0.24978911876678467, + "learning_rate": 5.9397773330383434e-05, + "loss": 1.7706, + "step": 14878 + }, + { + "epoch": 4.5669122160834865, + "grad_norm": 0.32905304431915283, + "learning_rate": 5.93928913116422e-05, + "loss": 1.7381, + "step": 14879 + }, + { + "epoch": 4.567219152854512, + "grad_norm": 
0.2196444720029831, + "learning_rate": 5.93880092000749e-05, + "loss": 1.7605, + "step": 14880 + }, + { + "epoch": 4.567526089625537, + "grad_norm": 0.3156622350215912, + "learning_rate": 5.9383126995729786e-05, + "loss": 1.9181, + "step": 14881 + }, + { + "epoch": 4.5678330263965625, + "grad_norm": 0.2895203232765198, + "learning_rate": 5.937824469865513e-05, + "loss": 1.7967, + "step": 14882 + }, + { + "epoch": 4.568139963167588, + "grad_norm": 0.24854810535907745, + "learning_rate": 5.937336230889916e-05, + "loss": 1.7332, + "step": 14883 + }, + { + "epoch": 4.568446899938612, + "grad_norm": 0.3417081832885742, + "learning_rate": 5.936847982651013e-05, + "loss": 1.7525, + "step": 14884 + }, + { + "epoch": 4.568753836709638, + "grad_norm": 0.2874949276447296, + "learning_rate": 5.936359725153629e-05, + "loss": 1.7659, + "step": 14885 + }, + { + "epoch": 4.569060773480663, + "grad_norm": 0.25031307339668274, + "learning_rate": 5.935871458402588e-05, + "loss": 1.8061, + "step": 14886 + }, + { + "epoch": 4.569367710251688, + "grad_norm": 0.27047309279441833, + "learning_rate": 5.935383182402717e-05, + "loss": 1.7318, + "step": 14887 + }, + { + "epoch": 4.569674647022714, + "grad_norm": 0.2642819881439209, + "learning_rate": 5.9348948971588425e-05, + "loss": 1.849, + "step": 14888 + }, + { + "epoch": 4.569981583793739, + "grad_norm": 0.2452307790517807, + "learning_rate": 5.9344066026757886e-05, + "loss": 1.7491, + "step": 14889 + }, + { + "epoch": 4.570288520564763, + "grad_norm": 0.24055036902427673, + "learning_rate": 5.9339182989583795e-05, + "loss": 1.7573, + "step": 14890 + }, + { + "epoch": 4.570595457335789, + "grad_norm": 0.23036183416843414, + "learning_rate": 5.933429986011444e-05, + "loss": 1.7841, + "step": 14891 + }, + { + "epoch": 4.570902394106814, + "grad_norm": 0.27987608313560486, + "learning_rate": 5.932941663839805e-05, + "loss": 1.7835, + "step": 14892 + }, + { + "epoch": 4.571209330877839, + "grad_norm": 0.31747013330459595, + "learning_rate": 
5.93245333244829e-05, + "loss": 1.7905, + "step": 14893 + }, + { + "epoch": 4.571516267648864, + "grad_norm": 0.24841344356536865, + "learning_rate": 5.931964991841725e-05, + "loss": 1.8003, + "step": 14894 + }, + { + "epoch": 4.571823204419889, + "grad_norm": 0.2416950911283493, + "learning_rate": 5.9314766420249356e-05, + "loss": 1.7787, + "step": 14895 + }, + { + "epoch": 4.5721301411909145, + "grad_norm": 0.2322494238615036, + "learning_rate": 5.930988283002748e-05, + "loss": 1.8153, + "step": 14896 + }, + { + "epoch": 4.57243707796194, + "grad_norm": 0.22629016637802124, + "learning_rate": 5.930499914779989e-05, + "loss": 1.6743, + "step": 14897 + }, + { + "epoch": 4.572744014732965, + "grad_norm": 0.21481508016586304, + "learning_rate": 5.930011537361483e-05, + "loss": 1.7301, + "step": 14898 + }, + { + "epoch": 4.5730509515039905, + "grad_norm": 0.1993340700864792, + "learning_rate": 5.9295231507520586e-05, + "loss": 1.6796, + "step": 14899 + }, + { + "epoch": 4.573357888275015, + "grad_norm": 0.21681822836399078, + "learning_rate": 5.929034754956543e-05, + "loss": 1.7333, + "step": 14900 + }, + { + "epoch": 4.57366482504604, + "grad_norm": 0.23105305433273315, + "learning_rate": 5.928546349979761e-05, + "loss": 1.8207, + "step": 14901 + }, + { + "epoch": 4.573971761817066, + "grad_norm": 0.24656468629837036, + "learning_rate": 5.9280579358265384e-05, + "loss": 1.7805, + "step": 14902 + }, + { + "epoch": 4.574278698588091, + "grad_norm": 0.28564780950546265, + "learning_rate": 5.927569512501704e-05, + "loss": 1.7224, + "step": 14903 + }, + { + "epoch": 4.574585635359116, + "grad_norm": 0.26030251383781433, + "learning_rate": 5.927081080010084e-05, + "loss": 1.7417, + "step": 14904 + }, + { + "epoch": 4.574892572130141, + "grad_norm": 0.21427087485790253, + "learning_rate": 5.926592638356505e-05, + "loss": 1.7239, + "step": 14905 + }, + { + "epoch": 4.575199508901166, + "grad_norm": 0.2351662665605545, + "learning_rate": 5.9261041875457956e-05, + "loss": 
1.7711, + "step": 14906 + }, + { + "epoch": 4.5755064456721914, + "grad_norm": 0.27335020899772644, + "learning_rate": 5.925615727582781e-05, + "loss": 1.7496, + "step": 14907 + }, + { + "epoch": 4.575813382443217, + "grad_norm": 0.27849945425987244, + "learning_rate": 5.925127258472289e-05, + "loss": 1.7576, + "step": 14908 + }, + { + "epoch": 4.576120319214242, + "grad_norm": 0.27859339118003845, + "learning_rate": 5.924638780219147e-05, + "loss": 1.8076, + "step": 14909 + }, + { + "epoch": 4.5764272559852675, + "grad_norm": 0.24664369225502014, + "learning_rate": 5.9241502928281836e-05, + "loss": 1.7657, + "step": 14910 + }, + { + "epoch": 4.576734192756292, + "grad_norm": 0.29881149530410767, + "learning_rate": 5.923661796304224e-05, + "loss": 1.7611, + "step": 14911 + }, + { + "epoch": 4.577041129527317, + "grad_norm": 0.2672356367111206, + "learning_rate": 5.9231732906520984e-05, + "loss": 1.7605, + "step": 14912 + }, + { + "epoch": 4.577348066298343, + "grad_norm": 0.24282832443714142, + "learning_rate": 5.9226847758766336e-05, + "loss": 1.7037, + "step": 14913 + }, + { + "epoch": 4.577655003069368, + "grad_norm": 0.3822915852069855, + "learning_rate": 5.922196251982656e-05, + "loss": 1.7609, + "step": 14914 + }, + { + "epoch": 4.577961939840393, + "grad_norm": 0.30721214413642883, + "learning_rate": 5.921707718974994e-05, + "loss": 1.7398, + "step": 14915 + }, + { + "epoch": 4.578268876611418, + "grad_norm": 0.235477477312088, + "learning_rate": 5.921219176858477e-05, + "loss": 1.6869, + "step": 14916 + }, + { + "epoch": 4.578575813382443, + "grad_norm": 0.3752216100692749, + "learning_rate": 5.920730625637934e-05, + "loss": 1.7296, + "step": 14917 + }, + { + "epoch": 4.578882750153468, + "grad_norm": 0.36901310086250305, + "learning_rate": 5.920242065318189e-05, + "loss": 1.7405, + "step": 14918 + }, + { + "epoch": 4.579189686924494, + "grad_norm": 0.2308608740568161, + "learning_rate": 5.9197534959040725e-05, + "loss": 1.7953, + "step": 14919 + }, + { + 
"epoch": 4.579496623695519, + "grad_norm": 0.3286738991737366, + "learning_rate": 5.919264917400412e-05, + "loss": 1.7669, + "step": 14920 + }, + { + "epoch": 4.579803560466544, + "grad_norm": 0.3944021165370941, + "learning_rate": 5.918776329812039e-05, + "loss": 1.7165, + "step": 14921 + }, + { + "epoch": 4.580110497237569, + "grad_norm": 0.22054845094680786, + "learning_rate": 5.9182877331437795e-05, + "loss": 1.7739, + "step": 14922 + }, + { + "epoch": 4.580417434008594, + "grad_norm": 0.3467540740966797, + "learning_rate": 5.9177991274004605e-05, + "loss": 1.7713, + "step": 14923 + }, + { + "epoch": 4.5807243707796195, + "grad_norm": 0.4313695728778839, + "learning_rate": 5.917310512586914e-05, + "loss": 1.7654, + "step": 14924 + }, + { + "epoch": 4.581031307550645, + "grad_norm": 0.2723502814769745, + "learning_rate": 5.9168218887079685e-05, + "loss": 1.7314, + "step": 14925 + }, + { + "epoch": 4.581338244321669, + "grad_norm": 0.2641250789165497, + "learning_rate": 5.9163332557684504e-05, + "loss": 1.7303, + "step": 14926 + }, + { + "epoch": 4.581645181092695, + "grad_norm": 0.3780760169029236, + "learning_rate": 5.915844613773189e-05, + "loss": 1.7748, + "step": 14927 + }, + { + "epoch": 4.58195211786372, + "grad_norm": 0.23379632830619812, + "learning_rate": 5.915355962727015e-05, + "loss": 1.7482, + "step": 14928 + }, + { + "epoch": 4.582259054634745, + "grad_norm": 0.35227084159851074, + "learning_rate": 5.914867302634758e-05, + "loss": 1.8198, + "step": 14929 + }, + { + "epoch": 4.582565991405771, + "grad_norm": 0.34348124265670776, + "learning_rate": 5.914378633501245e-05, + "loss": 1.8364, + "step": 14930 + }, + { + "epoch": 4.582872928176796, + "grad_norm": 0.2446804940700531, + "learning_rate": 5.9138899553313066e-05, + "loss": 1.7779, + "step": 14931 + }, + { + "epoch": 4.58317986494782, + "grad_norm": 0.23893557488918304, + "learning_rate": 5.913401268129772e-05, + "loss": 1.7582, + "step": 14932 + }, + { + "epoch": 4.583486801718846, + 
"grad_norm": 0.3046814203262329, + "learning_rate": 5.912912571901471e-05, + "loss": 1.6871, + "step": 14933 + }, + { + "epoch": 4.583793738489871, + "grad_norm": 0.2232733964920044, + "learning_rate": 5.912423866651233e-05, + "loss": 1.7269, + "step": 14934 + }, + { + "epoch": 4.584100675260896, + "grad_norm": 0.18664126098155975, + "learning_rate": 5.911935152383888e-05, + "loss": 1.7155, + "step": 14935 + }, + { + "epoch": 4.584407612031922, + "grad_norm": 0.2573263347148895, + "learning_rate": 5.911446429104265e-05, + "loss": 1.7901, + "step": 14936 + }, + { + "epoch": 4.584714548802946, + "grad_norm": 0.2382393181324005, + "learning_rate": 5.910957696817194e-05, + "loss": 1.7407, + "step": 14937 + }, + { + "epoch": 4.5850214855739715, + "grad_norm": 0.28363972902297974, + "learning_rate": 5.910468955527504e-05, + "loss": 1.7971, + "step": 14938 + }, + { + "epoch": 4.585328422344997, + "grad_norm": 0.3173120617866516, + "learning_rate": 5.909980205240027e-05, + "loss": 1.744, + "step": 14939 + }, + { + "epoch": 4.585635359116022, + "grad_norm": 0.2281302511692047, + "learning_rate": 5.909491445959592e-05, + "loss": 1.6976, + "step": 14940 + }, + { + "epoch": 4.5859422958870475, + "grad_norm": 0.24962912499904633, + "learning_rate": 5.9090026776910304e-05, + "loss": 1.7979, + "step": 14941 + }, + { + "epoch": 4.586249232658073, + "grad_norm": 0.22330854833126068, + "learning_rate": 5.908513900439171e-05, + "loss": 1.7854, + "step": 14942 + }, + { + "epoch": 4.586556169429097, + "grad_norm": 0.20861582458019257, + "learning_rate": 5.908025114208845e-05, + "loss": 1.7133, + "step": 14943 + }, + { + "epoch": 4.586863106200123, + "grad_norm": 0.21838510036468506, + "learning_rate": 5.90753631900488e-05, + "loss": 1.6919, + "step": 14944 + }, + { + "epoch": 4.587170042971148, + "grad_norm": 0.252798467874527, + "learning_rate": 5.907047514832112e-05, + "loss": 1.838, + "step": 14945 + }, + { + "epoch": 4.587476979742173, + "grad_norm": 0.326893150806427, + 
"learning_rate": 5.906558701695369e-05, + "loss": 1.7303, + "step": 14946 + }, + { + "epoch": 4.587783916513199, + "grad_norm": 0.36489585041999817, + "learning_rate": 5.9060698795994804e-05, + "loss": 1.7631, + "step": 14947 + }, + { + "epoch": 4.588090853284223, + "grad_norm": 0.27491649985313416, + "learning_rate": 5.905581048549279e-05, + "loss": 1.7773, + "step": 14948 + }, + { + "epoch": 4.588397790055248, + "grad_norm": 0.2334890067577362, + "learning_rate": 5.905092208549595e-05, + "loss": 1.7254, + "step": 14949 + }, + { + "epoch": 4.588704726826274, + "grad_norm": 0.24383895099163055, + "learning_rate": 5.904603359605257e-05, + "loss": 1.7496, + "step": 14950 + }, + { + "epoch": 4.589011663597299, + "grad_norm": 0.2144637256860733, + "learning_rate": 5.904114501721102e-05, + "loss": 1.7028, + "step": 14951 + }, + { + "epoch": 4.589318600368324, + "grad_norm": 0.19675977528095245, + "learning_rate": 5.9036256349019555e-05, + "loss": 1.7548, + "step": 14952 + }, + { + "epoch": 4.58962553713935, + "grad_norm": 0.23712843656539917, + "learning_rate": 5.903136759152652e-05, + "loss": 1.7722, + "step": 14953 + }, + { + "epoch": 4.589932473910374, + "grad_norm": 0.20307733118534088, + "learning_rate": 5.902647874478021e-05, + "loss": 1.7177, + "step": 14954 + }, + { + "epoch": 4.5902394106813995, + "grad_norm": 0.21767669916152954, + "learning_rate": 5.9021589808828936e-05, + "loss": 1.7963, + "step": 14955 + }, + { + "epoch": 4.590546347452425, + "grad_norm": 0.2056351602077484, + "learning_rate": 5.9016700783721036e-05, + "loss": 1.7439, + "step": 14956 + }, + { + "epoch": 4.59085328422345, + "grad_norm": 0.20480911433696747, + "learning_rate": 5.90118116695048e-05, + "loss": 1.7122, + "step": 14957 + }, + { + "epoch": 4.5911602209944755, + "grad_norm": 0.24091731011867523, + "learning_rate": 5.900692246622858e-05, + "loss": 1.7862, + "step": 14958 + }, + { + "epoch": 4.5914671577655, + "grad_norm": 0.20246434211730957, + "learning_rate": 
5.900203317394066e-05, + "loss": 1.6895, + "step": 14959 + }, + { + "epoch": 4.591774094536525, + "grad_norm": 0.23771630227565765, + "learning_rate": 5.899714379268938e-05, + "loss": 1.7794, + "step": 14960 + }, + { + "epoch": 4.592081031307551, + "grad_norm": 0.2638718783855438, + "learning_rate": 5.899225432252303e-05, + "loss": 1.8059, + "step": 14961 + }, + { + "epoch": 4.592387968078576, + "grad_norm": 0.24251408874988556, + "learning_rate": 5.898736476348997e-05, + "loss": 1.8063, + "step": 14962 + }, + { + "epoch": 4.592694904849601, + "grad_norm": 0.2487735152244568, + "learning_rate": 5.8982475115638515e-05, + "loss": 1.7615, + "step": 14963 + }, + { + "epoch": 4.593001841620627, + "grad_norm": 0.23507241904735565, + "learning_rate": 5.897758537901696e-05, + "loss": 1.7496, + "step": 14964 + }, + { + "epoch": 4.593308778391651, + "grad_norm": 0.22354768216609955, + "learning_rate": 5.897269555367365e-05, + "loss": 1.7293, + "step": 14965 + }, + { + "epoch": 4.593615715162676, + "grad_norm": 0.2711353003978729, + "learning_rate": 5.89678056396569e-05, + "loss": 1.8127, + "step": 14966 + }, + { + "epoch": 4.593922651933702, + "grad_norm": 0.30061110854148865, + "learning_rate": 5.8962915637015036e-05, + "loss": 1.7653, + "step": 14967 + }, + { + "epoch": 4.594229588704727, + "grad_norm": 0.24577318131923676, + "learning_rate": 5.895802554579639e-05, + "loss": 1.7888, + "step": 14968 + }, + { + "epoch": 4.5945365254757515, + "grad_norm": 0.25568944215774536, + "learning_rate": 5.895313536604929e-05, + "loss": 1.7912, + "step": 14969 + }, + { + "epoch": 4.594843462246777, + "grad_norm": 0.2710168957710266, + "learning_rate": 5.894824509782206e-05, + "loss": 1.7681, + "step": 14970 + }, + { + "epoch": 4.595150399017802, + "grad_norm": 0.24056777358055115, + "learning_rate": 5.894335474116303e-05, + "loss": 1.7729, + "step": 14971 + }, + { + "epoch": 4.5954573357888275, + "grad_norm": 0.21956710517406464, + "learning_rate": 5.89384642961205e-05, + "loss": 
1.7576, + "step": 14972 + }, + { + "epoch": 4.595764272559853, + "grad_norm": 0.27499106526374817, + "learning_rate": 5.893357376274284e-05, + "loss": 1.7909, + "step": 14973 + }, + { + "epoch": 4.596071209330878, + "grad_norm": 0.28581273555755615, + "learning_rate": 5.8928683141078376e-05, + "loss": 1.7592, + "step": 14974 + }, + { + "epoch": 4.596378146101903, + "grad_norm": 0.23218442499637604, + "learning_rate": 5.892379243117543e-05, + "loss": 1.7142, + "step": 14975 + }, + { + "epoch": 4.596685082872928, + "grad_norm": 0.34015771746635437, + "learning_rate": 5.891890163308234e-05, + "loss": 1.7457, + "step": 14976 + }, + { + "epoch": 4.596992019643953, + "grad_norm": 0.2630012333393097, + "learning_rate": 5.8914010746847435e-05, + "loss": 1.7612, + "step": 14977 + }, + { + "epoch": 4.597298956414979, + "grad_norm": 0.2265843003988266, + "learning_rate": 5.890911977251904e-05, + "loss": 1.7272, + "step": 14978 + }, + { + "epoch": 4.597605893186004, + "grad_norm": 0.22325244545936584, + "learning_rate": 5.8904228710145505e-05, + "loss": 1.7447, + "step": 14979 + }, + { + "epoch": 4.597912829957028, + "grad_norm": 0.23512716591358185, + "learning_rate": 5.889933755977517e-05, + "loss": 1.7123, + "step": 14980 + }, + { + "epoch": 4.598219766728054, + "grad_norm": 0.22534869611263275, + "learning_rate": 5.8894446321456365e-05, + "loss": 1.785, + "step": 14981 + }, + { + "epoch": 4.598526703499079, + "grad_norm": 0.2447836697101593, + "learning_rate": 5.888955499523743e-05, + "loss": 1.7154, + "step": 14982 + }, + { + "epoch": 4.598833640270104, + "grad_norm": 0.2451140582561493, + "learning_rate": 5.88846635811667e-05, + "loss": 1.7494, + "step": 14983 + }, + { + "epoch": 4.59914057704113, + "grad_norm": 0.2253585308790207, + "learning_rate": 5.8879772079292504e-05, + "loss": 1.7591, + "step": 14984 + }, + { + "epoch": 4.599447513812155, + "grad_norm": 0.21714572608470917, + "learning_rate": 5.887488048966322e-05, + "loss": 1.7314, + "step": 14985 + }, + { + 
"epoch": 4.5997544505831796, + "grad_norm": 0.24897411465644836, + "learning_rate": 5.8869988812327145e-05, + "loss": 1.776, + "step": 14986 + }, + { + "epoch": 4.600061387354205, + "grad_norm": 0.22575093805789948, + "learning_rate": 5.8865097047332653e-05, + "loss": 1.7168, + "step": 14987 + }, + { + "epoch": 4.60036832412523, + "grad_norm": 0.22857412695884705, + "learning_rate": 5.886020519472808e-05, + "loss": 1.8262, + "step": 14988 + }, + { + "epoch": 4.600675260896256, + "grad_norm": 0.22741298377513885, + "learning_rate": 5.885531325456174e-05, + "loss": 1.6732, + "step": 14989 + }, + { + "epoch": 4.600982197667281, + "grad_norm": 0.2229645550251007, + "learning_rate": 5.885042122688202e-05, + "loss": 1.7384, + "step": 14990 + }, + { + "epoch": 4.601289134438305, + "grad_norm": 0.22609494626522064, + "learning_rate": 5.884552911173726e-05, + "loss": 1.714, + "step": 14991 + }, + { + "epoch": 4.601596071209331, + "grad_norm": 0.2629149854183197, + "learning_rate": 5.884063690917578e-05, + "loss": 1.8133, + "step": 14992 + }, + { + "epoch": 4.601903007980356, + "grad_norm": 0.220725417137146, + "learning_rate": 5.883574461924597e-05, + "loss": 1.6898, + "step": 14993 + }, + { + "epoch": 4.602209944751381, + "grad_norm": 0.207612082362175, + "learning_rate": 5.8830852241996135e-05, + "loss": 1.7302, + "step": 14994 + }, + { + "epoch": 4.602516881522407, + "grad_norm": 0.22418084740638733, + "learning_rate": 5.8825959777474625e-05, + "loss": 1.763, + "step": 14995 + }, + { + "epoch": 4.602823818293432, + "grad_norm": 0.30606865882873535, + "learning_rate": 5.882106722572983e-05, + "loss": 1.7657, + "step": 14996 + }, + { + "epoch": 4.6031307550644565, + "grad_norm": 0.2947966456413269, + "learning_rate": 5.881617458681008e-05, + "loss": 1.7796, + "step": 14997 + }, + { + "epoch": 4.603437691835482, + "grad_norm": 0.23430216312408447, + "learning_rate": 5.881128186076372e-05, + "loss": 1.78, + "step": 14998 + }, + { + "epoch": 4.603744628606507, + "grad_norm": 
0.28081849217414856, + "learning_rate": 5.880638904763911e-05, + "loss": 1.6791, + "step": 14999 + }, + { + "epoch": 4.6040515653775325, + "grad_norm": 0.25459226965904236, + "learning_rate": 5.88014961474846e-05, + "loss": 1.8064, + "step": 15000 + }, + { + "epoch": 4.604358502148557, + "grad_norm": 0.2358713001012802, + "learning_rate": 5.879660316034854e-05, + "loss": 1.763, + "step": 15001 + }, + { + "epoch": 4.604665438919582, + "grad_norm": 0.32954758405685425, + "learning_rate": 5.879171008627931e-05, + "loss": 1.7462, + "step": 15002 + }, + { + "epoch": 4.604972375690608, + "grad_norm": 0.2588615417480469, + "learning_rate": 5.878681692532523e-05, + "loss": 1.7771, + "step": 15003 + }, + { + "epoch": 4.605279312461633, + "grad_norm": 0.21216195821762085, + "learning_rate": 5.878192367753468e-05, + "loss": 1.7128, + "step": 15004 + }, + { + "epoch": 4.605586249232658, + "grad_norm": 0.26849040389060974, + "learning_rate": 5.8777030342956016e-05, + "loss": 1.7048, + "step": 15005 + }, + { + "epoch": 4.605893186003684, + "grad_norm": 0.22343295812606812, + "learning_rate": 5.877213692163759e-05, + "loss": 1.7695, + "step": 15006 + }, + { + "epoch": 4.606200122774708, + "grad_norm": 0.2794288694858551, + "learning_rate": 5.876724341362776e-05, + "loss": 1.7856, + "step": 15007 + }, + { + "epoch": 4.606507059545733, + "grad_norm": 0.3525427579879761, + "learning_rate": 5.8762349818974905e-05, + "loss": 1.7807, + "step": 15008 + }, + { + "epoch": 4.606813996316759, + "grad_norm": 0.25886499881744385, + "learning_rate": 5.875745613772736e-05, + "loss": 1.7818, + "step": 15009 + }, + { + "epoch": 4.607120933087784, + "grad_norm": 0.24822987616062164, + "learning_rate": 5.8752562369933515e-05, + "loss": 1.7369, + "step": 15010 + }, + { + "epoch": 4.607427869858809, + "grad_norm": 0.26067355275154114, + "learning_rate": 5.874766851564171e-05, + "loss": 1.7056, + "step": 15011 + }, + { + "epoch": 4.607734806629834, + "grad_norm": 0.2869747579097748, + "learning_rate": 
5.874277457490033e-05, + "loss": 1.7284, + "step": 15012 + }, + { + "epoch": 4.608041743400859, + "grad_norm": 0.23153580725193024, + "learning_rate": 5.87378805477577e-05, + "loss": 1.7331, + "step": 15013 + }, + { + "epoch": 4.6083486801718845, + "grad_norm": 0.29307299852371216, + "learning_rate": 5.873298643426223e-05, + "loss": 1.7376, + "step": 15014 + }, + { + "epoch": 4.60865561694291, + "grad_norm": 0.25638771057128906, + "learning_rate": 5.872809223446227e-05, + "loss": 1.7585, + "step": 15015 + }, + { + "epoch": 4.608962553713935, + "grad_norm": 0.2272702306509018, + "learning_rate": 5.872319794840618e-05, + "loss": 1.7482, + "step": 15016 + }, + { + "epoch": 4.6092694904849605, + "grad_norm": 0.2579486072063446, + "learning_rate": 5.8718303576142356e-05, + "loss": 1.778, + "step": 15017 + }, + { + "epoch": 4.609576427255985, + "grad_norm": 0.2216452956199646, + "learning_rate": 5.871340911771912e-05, + "loss": 1.7517, + "step": 15018 + }, + { + "epoch": 4.60988336402701, + "grad_norm": 0.22628961503505707, + "learning_rate": 5.870851457318488e-05, + "loss": 1.7579, + "step": 15019 + }, + { + "epoch": 4.610190300798036, + "grad_norm": 0.31018149852752686, + "learning_rate": 5.8703619942588e-05, + "loss": 1.7911, + "step": 15020 + }, + { + "epoch": 4.610497237569061, + "grad_norm": 0.2618122100830078, + "learning_rate": 5.869872522597683e-05, + "loss": 1.8121, + "step": 15021 + }, + { + "epoch": 4.610804174340086, + "grad_norm": 0.26085740327835083, + "learning_rate": 5.869383042339978e-05, + "loss": 1.7952, + "step": 15022 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.25237780809402466, + "learning_rate": 5.86889355349052e-05, + "loss": 1.7575, + "step": 15023 + }, + { + "epoch": 4.611418047882136, + "grad_norm": 0.27550897002220154, + "learning_rate": 5.868404056054144e-05, + "loss": 1.7816, + "step": 15024 + }, + { + "epoch": 4.611724984653161, + "grad_norm": 0.2458692342042923, + "learning_rate": 5.8679145500356926e-05, + "loss": 1.7783, + 
"step": 15025 + }, + { + "epoch": 4.612031921424187, + "grad_norm": 0.25606176257133484, + "learning_rate": 5.867425035439999e-05, + "loss": 1.7863, + "step": 15026 + }, + { + "epoch": 4.612338858195212, + "grad_norm": 0.3206995725631714, + "learning_rate": 5.866935512271905e-05, + "loss": 1.7468, + "step": 15027 + }, + { + "epoch": 4.612645794966237, + "grad_norm": 0.2754824459552765, + "learning_rate": 5.866445980536245e-05, + "loss": 1.793, + "step": 15028 + }, + { + "epoch": 4.612952731737262, + "grad_norm": 0.25168612599372864, + "learning_rate": 5.865956440237859e-05, + "loss": 1.7252, + "step": 15029 + }, + { + "epoch": 4.613259668508287, + "grad_norm": 0.3226735293865204, + "learning_rate": 5.8654668913815815e-05, + "loss": 1.7291, + "step": 15030 + }, + { + "epoch": 4.6135666052793125, + "grad_norm": 0.2580295503139496, + "learning_rate": 5.864977333972255e-05, + "loss": 1.7622, + "step": 15031 + }, + { + "epoch": 4.613873542050338, + "grad_norm": 0.21486075222492218, + "learning_rate": 5.864487768014715e-05, + "loss": 1.7662, + "step": 15032 + }, + { + "epoch": 4.614180478821363, + "grad_norm": 0.2331690639257431, + "learning_rate": 5.8639981935137996e-05, + "loss": 1.7389, + "step": 15033 + }, + { + "epoch": 4.614487415592388, + "grad_norm": 0.2573511302471161, + "learning_rate": 5.863508610474348e-05, + "loss": 1.7699, + "step": 15034 + }, + { + "epoch": 4.614794352363413, + "grad_norm": 0.2260694056749344, + "learning_rate": 5.863019018901199e-05, + "loss": 1.7784, + "step": 15035 + }, + { + "epoch": 4.615101289134438, + "grad_norm": 0.2283065915107727, + "learning_rate": 5.8625294187991895e-05, + "loss": 1.7061, + "step": 15036 + }, + { + "epoch": 4.615408225905464, + "grad_norm": 0.24772310256958008, + "learning_rate": 5.862039810173159e-05, + "loss": 1.7568, + "step": 15037 + }, + { + "epoch": 4.615715162676489, + "grad_norm": 0.2515513002872467, + "learning_rate": 5.861550193027945e-05, + "loss": 1.7445, + "step": 15038 + }, + { + "epoch": 
4.616022099447514, + "grad_norm": 0.26472151279449463, + "learning_rate": 5.8610605673683885e-05, + "loss": 1.7735, + "step": 15039 + }, + { + "epoch": 4.616329036218539, + "grad_norm": 0.24053528904914856, + "learning_rate": 5.8605709331993254e-05, + "loss": 1.8009, + "step": 15040 + }, + { + "epoch": 4.616635972989564, + "grad_norm": 0.25125381350517273, + "learning_rate": 5.860081290525596e-05, + "loss": 1.7712, + "step": 15041 + }, + { + "epoch": 4.616942909760589, + "grad_norm": 0.23056018352508545, + "learning_rate": 5.85959163935204e-05, + "loss": 1.7684, + "step": 15042 + }, + { + "epoch": 4.617249846531615, + "grad_norm": 0.2533007562160492, + "learning_rate": 5.859101979683494e-05, + "loss": 1.7793, + "step": 15043 + }, + { + "epoch": 4.617556783302639, + "grad_norm": 0.21007375419139862, + "learning_rate": 5.8586123115248e-05, + "loss": 1.7484, + "step": 15044 + }, + { + "epoch": 4.6178637200736645, + "grad_norm": 0.21329566836357117, + "learning_rate": 5.858122634880797e-05, + "loss": 1.7763, + "step": 15045 + }, + { + "epoch": 4.61817065684469, + "grad_norm": 0.2362898588180542, + "learning_rate": 5.857632949756322e-05, + "loss": 1.7484, + "step": 15046 + }, + { + "epoch": 4.618477593615715, + "grad_norm": 0.2168794423341751, + "learning_rate": 5.857143256156214e-05, + "loss": 1.7752, + "step": 15047 + }, + { + "epoch": 4.6187845303867405, + "grad_norm": 0.24761471152305603, + "learning_rate": 5.856653554085316e-05, + "loss": 1.7793, + "step": 15048 + }, + { + "epoch": 4.619091467157766, + "grad_norm": 0.23202158510684967, + "learning_rate": 5.856163843548466e-05, + "loss": 1.6862, + "step": 15049 + }, + { + "epoch": 4.61939840392879, + "grad_norm": 0.23868000507354736, + "learning_rate": 5.855674124550501e-05, + "loss": 1.8075, + "step": 15050 + }, + { + "epoch": 4.619705340699816, + "grad_norm": 0.3063114583492279, + "learning_rate": 5.855184397096265e-05, + "loss": 1.8051, + "step": 15051 + }, + { + "epoch": 4.620012277470841, + "grad_norm": 
0.22672493755817413, + "learning_rate": 5.854694661190594e-05, + "loss": 1.7478, + "step": 15052 + }, + { + "epoch": 4.620319214241866, + "grad_norm": 0.3403559923171997, + "learning_rate": 5.8542049168383296e-05, + "loss": 1.765, + "step": 15053 + }, + { + "epoch": 4.620626151012892, + "grad_norm": 0.33852189779281616, + "learning_rate": 5.853715164044312e-05, + "loss": 1.7602, + "step": 15054 + }, + { + "epoch": 4.620933087783916, + "grad_norm": 0.25166940689086914, + "learning_rate": 5.85322540281338e-05, + "loss": 1.7584, + "step": 15055 + }, + { + "epoch": 4.621240024554941, + "grad_norm": 0.3417987823486328, + "learning_rate": 5.8527356331503757e-05, + "loss": 1.8491, + "step": 15056 + }, + { + "epoch": 4.621546961325967, + "grad_norm": 0.3286994397640228, + "learning_rate": 5.852245855060138e-05, + "loss": 1.7146, + "step": 15057 + }, + { + "epoch": 4.621853898096992, + "grad_norm": 0.24394257366657257, + "learning_rate": 5.851756068547505e-05, + "loss": 1.8762, + "step": 15058 + }, + { + "epoch": 4.622160834868017, + "grad_norm": 0.34945347905158997, + "learning_rate": 5.851266273617321e-05, + "loss": 1.8086, + "step": 15059 + }, + { + "epoch": 4.622467771639043, + "grad_norm": 0.30189210176467896, + "learning_rate": 5.850776470274425e-05, + "loss": 1.7366, + "step": 15060 + }, + { + "epoch": 4.622774708410067, + "grad_norm": 0.24050579965114594, + "learning_rate": 5.850286658523657e-05, + "loss": 1.7599, + "step": 15061 + }, + { + "epoch": 4.6230816451810925, + "grad_norm": 0.33650726079940796, + "learning_rate": 5.849796838369857e-05, + "loss": 1.7343, + "step": 15062 + }, + { + "epoch": 4.623388581952118, + "grad_norm": 0.2855902910232544, + "learning_rate": 5.849307009817868e-05, + "loss": 1.7325, + "step": 15063 + }, + { + "epoch": 4.623695518723143, + "grad_norm": 0.2562592923641205, + "learning_rate": 5.8488171728725275e-05, + "loss": 1.7772, + "step": 15064 + }, + { + "epoch": 4.6240024554941686, + "grad_norm": 0.23494984209537506, + 
"learning_rate": 5.84832732753868e-05, + "loss": 1.7263, + "step": 15065 + }, + { + "epoch": 4.624309392265193, + "grad_norm": 0.23248226940631866, + "learning_rate": 5.847837473821164e-05, + "loss": 1.7441, + "step": 15066 + }, + { + "epoch": 4.624616329036218, + "grad_norm": 0.2291254848241806, + "learning_rate": 5.847347611724821e-05, + "loss": 1.7742, + "step": 15067 + }, + { + "epoch": 4.624923265807244, + "grad_norm": 0.28305280208587646, + "learning_rate": 5.8468577412544925e-05, + "loss": 1.8224, + "step": 15068 + }, + { + "epoch": 4.625230202578269, + "grad_norm": 0.25531691312789917, + "learning_rate": 5.84636786241502e-05, + "loss": 1.7458, + "step": 15069 + }, + { + "epoch": 4.625537139349294, + "grad_norm": 0.2363462746143341, + "learning_rate": 5.845877975211242e-05, + "loss": 1.7977, + "step": 15070 + }, + { + "epoch": 4.62584407612032, + "grad_norm": 0.2707001864910126, + "learning_rate": 5.845388079648004e-05, + "loss": 1.774, + "step": 15071 + }, + { + "epoch": 4.626151012891344, + "grad_norm": 0.22281844913959503, + "learning_rate": 5.844898175730146e-05, + "loss": 1.7888, + "step": 15072 + }, + { + "epoch": 4.6264579496623695, + "grad_norm": 0.24809995293617249, + "learning_rate": 5.8444082634625086e-05, + "loss": 1.7895, + "step": 15073 + }, + { + "epoch": 4.626764886433395, + "grad_norm": 0.2842096984386444, + "learning_rate": 5.843918342849933e-05, + "loss": 1.7323, + "step": 15074 + }, + { + "epoch": 4.62707182320442, + "grad_norm": 0.21343614161014557, + "learning_rate": 5.843428413897261e-05, + "loss": 1.7298, + "step": 15075 + }, + { + "epoch": 4.627378759975445, + "grad_norm": 0.2420526146888733, + "learning_rate": 5.842938476609336e-05, + "loss": 1.778, + "step": 15076 + }, + { + "epoch": 4.62768569674647, + "grad_norm": 0.22202003002166748, + "learning_rate": 5.842448530990999e-05, + "loss": 1.779, + "step": 15077 + }, + { + "epoch": 4.627992633517495, + "grad_norm": 0.26784011721611023, + "learning_rate": 5.841958577047092e-05, + 
"loss": 1.799, + "step": 15078 + }, + { + "epoch": 4.628299570288521, + "grad_norm": 0.3230212926864624, + "learning_rate": 5.841468614782457e-05, + "loss": 1.7789, + "step": 15079 + }, + { + "epoch": 4.628606507059546, + "grad_norm": 0.24062715470790863, + "learning_rate": 5.840978644201935e-05, + "loss": 1.7697, + "step": 15080 + }, + { + "epoch": 4.628913443830571, + "grad_norm": 0.2882130444049835, + "learning_rate": 5.84048866531037e-05, + "loss": 1.7946, + "step": 15081 + }, + { + "epoch": 4.629220380601596, + "grad_norm": 0.3145603537559509, + "learning_rate": 5.839998678112602e-05, + "loss": 1.7116, + "step": 15082 + }, + { + "epoch": 4.629527317372621, + "grad_norm": 0.270997017621994, + "learning_rate": 5.839508682613477e-05, + "loss": 1.8281, + "step": 15083 + }, + { + "epoch": 4.629834254143646, + "grad_norm": 0.27299395203590393, + "learning_rate": 5.839018678817834e-05, + "loss": 1.8233, + "step": 15084 + }, + { + "epoch": 4.630141190914672, + "grad_norm": 0.2684478461742401, + "learning_rate": 5.838528666730517e-05, + "loss": 1.8111, + "step": 15085 + }, + { + "epoch": 4.630448127685697, + "grad_norm": 0.2365201860666275, + "learning_rate": 5.838038646356367e-05, + "loss": 1.7475, + "step": 15086 + }, + { + "epoch": 4.6307550644567215, + "grad_norm": 0.2661258280277252, + "learning_rate": 5.8375486177002305e-05, + "loss": 1.748, + "step": 15087 + }, + { + "epoch": 4.631062001227747, + "grad_norm": 0.2865012586116791, + "learning_rate": 5.8370585807669455e-05, + "loss": 1.7525, + "step": 15088 + }, + { + "epoch": 4.631368937998772, + "grad_norm": 0.2445172518491745, + "learning_rate": 5.836568535561358e-05, + "loss": 1.7278, + "step": 15089 + }, + { + "epoch": 4.6316758747697975, + "grad_norm": 0.28192558884620667, + "learning_rate": 5.8360784820883083e-05, + "loss": 1.7371, + "step": 15090 + }, + { + "epoch": 4.631982811540823, + "grad_norm": 0.38927358388900757, + "learning_rate": 5.835588420352642e-05, + "loss": 1.8088, + "step": 15091 + }, + { + 
"epoch": 4.632289748311848, + "grad_norm": 0.3409229516983032, + "learning_rate": 5.8350983503592025e-05, + "loss": 1.8011, + "step": 15092 + }, + { + "epoch": 4.632596685082873, + "grad_norm": 0.2464994341135025, + "learning_rate": 5.8346082721128294e-05, + "loss": 1.8354, + "step": 15093 + }, + { + "epoch": 4.632903621853898, + "grad_norm": 0.38765814900398254, + "learning_rate": 5.834118185618369e-05, + "loss": 1.7811, + "step": 15094 + }, + { + "epoch": 4.633210558624923, + "grad_norm": 0.42435070872306824, + "learning_rate": 5.833628090880664e-05, + "loss": 1.7855, + "step": 15095 + }, + { + "epoch": 4.633517495395949, + "grad_norm": 0.244876891374588, + "learning_rate": 5.833137987904558e-05, + "loss": 1.7494, + "step": 15096 + }, + { + "epoch": 4.633824432166974, + "grad_norm": 0.30353477597236633, + "learning_rate": 5.8326478766948934e-05, + "loss": 1.7772, + "step": 15097 + }, + { + "epoch": 4.634131368937998, + "grad_norm": 0.38839244842529297, + "learning_rate": 5.8321577572565146e-05, + "loss": 1.7689, + "step": 15098 + }, + { + "epoch": 4.634438305709024, + "grad_norm": 0.357129842042923, + "learning_rate": 5.8316676295942644e-05, + "loss": 1.7777, + "step": 15099 + }, + { + "epoch": 4.634745242480049, + "grad_norm": 0.23458799719810486, + "learning_rate": 5.831177493712988e-05, + "loss": 1.7544, + "step": 15100 + }, + { + "epoch": 4.635052179251074, + "grad_norm": 0.23751308023929596, + "learning_rate": 5.830687349617529e-05, + "loss": 1.7491, + "step": 15101 + }, + { + "epoch": 4.6353591160221, + "grad_norm": 0.31978943943977356, + "learning_rate": 5.83019719731273e-05, + "loss": 1.7439, + "step": 15102 + }, + { + "epoch": 4.635666052793125, + "grad_norm": 0.2751142084598541, + "learning_rate": 5.829707036803438e-05, + "loss": 1.8598, + "step": 15103 + }, + { + "epoch": 4.6359729895641495, + "grad_norm": 0.23670406639575958, + "learning_rate": 5.8292168680944914e-05, + "loss": 1.7629, + "step": 15104 + }, + { + "epoch": 4.636279926335175, + 
"grad_norm": 0.2447349727153778, + "learning_rate": 5.828726691190739e-05, + "loss": 1.7606, + "step": 15105 + }, + { + "epoch": 4.6365868631062, + "grad_norm": 0.2739902436733246, + "learning_rate": 5.828236506097023e-05, + "loss": 1.707, + "step": 15106 + }, + { + "epoch": 4.6368937998772255, + "grad_norm": 0.2050863653421402, + "learning_rate": 5.82774631281819e-05, + "loss": 1.7235, + "step": 15107 + }, + { + "epoch": 4.637200736648251, + "grad_norm": 0.3005560338497162, + "learning_rate": 5.827256111359082e-05, + "loss": 1.7785, + "step": 15108 + }, + { + "epoch": 4.637507673419275, + "grad_norm": 0.27168264985084534, + "learning_rate": 5.8267659017245434e-05, + "loss": 1.7844, + "step": 15109 + }, + { + "epoch": 4.637814610190301, + "grad_norm": 0.2965840995311737, + "learning_rate": 5.82627568391942e-05, + "loss": 1.7631, + "step": 15110 + }, + { + "epoch": 4.638121546961326, + "grad_norm": 0.3114408552646637, + "learning_rate": 5.825785457948556e-05, + "loss": 1.77, + "step": 15111 + }, + { + "epoch": 4.638428483732351, + "grad_norm": 0.2638910114765167, + "learning_rate": 5.825295223816796e-05, + "loss": 1.9183, + "step": 15112 + }, + { + "epoch": 4.638735420503377, + "grad_norm": 0.3293665051460266, + "learning_rate": 5.824804981528986e-05, + "loss": 1.6779, + "step": 15113 + }, + { + "epoch": 4.639042357274402, + "grad_norm": 0.28586456179618835, + "learning_rate": 5.824314731089968e-05, + "loss": 1.7905, + "step": 15114 + }, + { + "epoch": 4.639349294045426, + "grad_norm": 0.2254554182291031, + "learning_rate": 5.8238244725045906e-05, + "loss": 1.7602, + "step": 15115 + }, + { + "epoch": 4.639656230816452, + "grad_norm": 0.2770406901836395, + "learning_rate": 5.823334205777695e-05, + "loss": 1.7789, + "step": 15116 + }, + { + "epoch": 4.639963167587477, + "grad_norm": 0.2867025136947632, + "learning_rate": 5.822843930914129e-05, + "loss": 1.7408, + "step": 15117 + }, + { + "epoch": 4.640270104358502, + "grad_norm": 0.23486989736557007, + 
"learning_rate": 5.822353647918737e-05, + "loss": 1.7489, + "step": 15118 + }, + { + "epoch": 4.640577041129527, + "grad_norm": 0.2274324595928192, + "learning_rate": 5.821863356796367e-05, + "loss": 1.768, + "step": 15119 + }, + { + "epoch": 4.640883977900552, + "grad_norm": 0.25032591819763184, + "learning_rate": 5.821373057551858e-05, + "loss": 1.7602, + "step": 15120 + }, + { + "epoch": 4.6411909146715775, + "grad_norm": 0.22332963347434998, + "learning_rate": 5.820882750190059e-05, + "loss": 1.756, + "step": 15121 + }, + { + "epoch": 4.641497851442603, + "grad_norm": 0.24975591897964478, + "learning_rate": 5.820392434715817e-05, + "loss": 1.6963, + "step": 15122 + }, + { + "epoch": 4.641804788213628, + "grad_norm": 0.27892687916755676, + "learning_rate": 5.819902111133976e-05, + "loss": 1.8295, + "step": 15123 + }, + { + "epoch": 4.6421117249846535, + "grad_norm": 0.23914897441864014, + "learning_rate": 5.819411779449381e-05, + "loss": 1.7636, + "step": 15124 + }, + { + "epoch": 4.642418661755678, + "grad_norm": 0.2349565476179123, + "learning_rate": 5.818921439666879e-05, + "loss": 1.7823, + "step": 15125 + }, + { + "epoch": 4.642725598526703, + "grad_norm": 0.2075800597667694, + "learning_rate": 5.818431091791315e-05, + "loss": 1.7282, + "step": 15126 + }, + { + "epoch": 4.643032535297729, + "grad_norm": 0.19781073927879333, + "learning_rate": 5.817940735827535e-05, + "loss": 1.7598, + "step": 15127 + }, + { + "epoch": 4.643339472068754, + "grad_norm": 0.21997439861297607, + "learning_rate": 5.8174503717803866e-05, + "loss": 1.766, + "step": 15128 + }, + { + "epoch": 4.643646408839779, + "grad_norm": 0.23971444368362427, + "learning_rate": 5.816959999654713e-05, + "loss": 1.7824, + "step": 15129 + }, + { + "epoch": 4.643953345610804, + "grad_norm": 0.23357853293418884, + "learning_rate": 5.816469619455363e-05, + "loss": 1.7353, + "step": 15130 + }, + { + "epoch": 4.644260282381829, + "grad_norm": 0.22030897438526154, + "learning_rate": 5.815979231187181e-05, 
+ "loss": 1.7413, + "step": 15131 + }, + { + "epoch": 4.644567219152854, + "grad_norm": 0.2322571873664856, + "learning_rate": 5.815488834855014e-05, + "loss": 1.7305, + "step": 15132 + }, + { + "epoch": 4.64487415592388, + "grad_norm": 0.25256821513175964, + "learning_rate": 5.814998430463709e-05, + "loss": 1.7533, + "step": 15133 + }, + { + "epoch": 4.645181092694905, + "grad_norm": 0.248504638671875, + "learning_rate": 5.81450801801811e-05, + "loss": 1.7345, + "step": 15134 + }, + { + "epoch": 4.64548802946593, + "grad_norm": 0.22850964963436127, + "learning_rate": 5.8140175975230673e-05, + "loss": 1.8308, + "step": 15135 + }, + { + "epoch": 4.645794966236955, + "grad_norm": 0.3517951965332031, + "learning_rate": 5.813527168983426e-05, + "loss": 1.811, + "step": 15136 + }, + { + "epoch": 4.64610190300798, + "grad_norm": 0.32132068276405334, + "learning_rate": 5.813036732404031e-05, + "loss": 1.7584, + "step": 15137 + }, + { + "epoch": 4.6464088397790055, + "grad_norm": 0.2349396049976349, + "learning_rate": 5.812546287789731e-05, + "loss": 1.7762, + "step": 15138 + }, + { + "epoch": 4.646715776550031, + "grad_norm": 0.23519493639469147, + "learning_rate": 5.812055835145372e-05, + "loss": 1.7428, + "step": 15139 + }, + { + "epoch": 4.647022713321056, + "grad_norm": 0.29277852177619934, + "learning_rate": 5.8115653744758016e-05, + "loss": 1.7599, + "step": 15140 + }, + { + "epoch": 4.647329650092081, + "grad_norm": 0.2347593754529953, + "learning_rate": 5.811074905785867e-05, + "loss": 1.7401, + "step": 15141 + }, + { + "epoch": 4.647636586863106, + "grad_norm": 0.23080264031887054, + "learning_rate": 5.8105844290804147e-05, + "loss": 1.7705, + "step": 15142 + }, + { + "epoch": 4.647943523634131, + "grad_norm": 0.24686801433563232, + "learning_rate": 5.810093944364291e-05, + "loss": 1.7409, + "step": 15143 + }, + { + "epoch": 4.648250460405157, + "grad_norm": 0.24098120629787445, + "learning_rate": 5.809603451642344e-05, + "loss": 1.7893, + "step": 15144 + }, + { 
+ "epoch": 4.648557397176182, + "grad_norm": 0.23020638525485992, + "learning_rate": 5.809112950919422e-05, + "loss": 1.7589, + "step": 15145 + }, + { + "epoch": 4.648864333947207, + "grad_norm": 0.3036736249923706, + "learning_rate": 5.808622442200371e-05, + "loss": 1.7964, + "step": 15146 + }, + { + "epoch": 4.649171270718232, + "grad_norm": 0.2965635657310486, + "learning_rate": 5.808131925490039e-05, + "loss": 1.7986, + "step": 15147 + }, + { + "epoch": 4.649478207489257, + "grad_norm": 0.22241640090942383, + "learning_rate": 5.8076414007932745e-05, + "loss": 1.749, + "step": 15148 + }, + { + "epoch": 4.649785144260282, + "grad_norm": 0.20304246246814728, + "learning_rate": 5.8071508681149246e-05, + "loss": 1.7374, + "step": 15149 + }, + { + "epoch": 4.650092081031308, + "grad_norm": 0.19534410536289215, + "learning_rate": 5.806660327459834e-05, + "loss": 1.7087, + "step": 15150 + }, + { + "epoch": 4.650399017802332, + "grad_norm": 0.2151753008365631, + "learning_rate": 5.806169778832856e-05, + "loss": 1.7409, + "step": 15151 + }, + { + "epoch": 4.650705954573358, + "grad_norm": 0.2180301696062088, + "learning_rate": 5.805679222238836e-05, + "loss": 1.7522, + "step": 15152 + }, + { + "epoch": 4.651012891344383, + "grad_norm": 0.19917607307434082, + "learning_rate": 5.8051886576826205e-05, + "loss": 1.768, + "step": 15153 + }, + { + "epoch": 4.651319828115408, + "grad_norm": 0.2312052994966507, + "learning_rate": 5.804698085169059e-05, + "loss": 1.7799, + "step": 15154 + }, + { + "epoch": 4.651626764886434, + "grad_norm": 0.21541514992713928, + "learning_rate": 5.804207504702999e-05, + "loss": 1.7595, + "step": 15155 + }, + { + "epoch": 4.651933701657459, + "grad_norm": 0.2029450386762619, + "learning_rate": 5.803716916289289e-05, + "loss": 1.7727, + "step": 15156 + }, + { + "epoch": 4.652240638428484, + "grad_norm": 0.21796850860118866, + "learning_rate": 5.8032263199327787e-05, + "loss": 1.7445, + "step": 15157 + }, + { + "epoch": 4.652547575199509, + 
"grad_norm": 0.20309078693389893, + "learning_rate": 5.802735715638314e-05, + "loss": 1.6971, + "step": 15158 + }, + { + "epoch": 4.652854511970534, + "grad_norm": 0.21270112693309784, + "learning_rate": 5.802245103410745e-05, + "loss": 1.7162, + "step": 15159 + }, + { + "epoch": 4.653161448741559, + "grad_norm": 0.25357750058174133, + "learning_rate": 5.8017544832549184e-05, + "loss": 1.7534, + "step": 15160 + }, + { + "epoch": 4.653468385512585, + "grad_norm": 0.24015015363693237, + "learning_rate": 5.8012638551756847e-05, + "loss": 1.7639, + "step": 15161 + }, + { + "epoch": 4.653775322283609, + "grad_norm": 0.20507018268108368, + "learning_rate": 5.800773219177893e-05, + "loss": 1.7293, + "step": 15162 + }, + { + "epoch": 4.6540822590546345, + "grad_norm": 0.23399868607521057, + "learning_rate": 5.800282575266389e-05, + "loss": 1.8286, + "step": 15163 + }, + { + "epoch": 4.65438919582566, + "grad_norm": 0.27126726508140564, + "learning_rate": 5.799791923446025e-05, + "loss": 1.8028, + "step": 15164 + }, + { + "epoch": 4.654696132596685, + "grad_norm": 0.23644569516181946, + "learning_rate": 5.7993012637216494e-05, + "loss": 1.7138, + "step": 15165 + }, + { + "epoch": 4.6550030693677105, + "grad_norm": 0.21557916700839996, + "learning_rate": 5.7988105960981086e-05, + "loss": 1.7703, + "step": 15166 + }, + { + "epoch": 4.655310006138736, + "grad_norm": 0.22030150890350342, + "learning_rate": 5.798319920580254e-05, + "loss": 1.7282, + "step": 15167 + }, + { + "epoch": 4.65561694290976, + "grad_norm": 0.2092939168214798, + "learning_rate": 5.7978292371729325e-05, + "loss": 1.7853, + "step": 15168 + }, + { + "epoch": 4.655923879680786, + "grad_norm": 0.21643707156181335, + "learning_rate": 5.797338545880997e-05, + "loss": 1.7582, + "step": 15169 + }, + { + "epoch": 4.656230816451811, + "grad_norm": 0.3064669668674469, + "learning_rate": 5.796847846709294e-05, + "loss": 1.8139, + "step": 15170 + }, + { + "epoch": 4.656537753222836, + "grad_norm": 0.3060479760169983, 
+ "learning_rate": 5.796357139662674e-05, + "loss": 1.7356, + "step": 15171 + }, + { + "epoch": 4.656844689993862, + "grad_norm": 0.23546656966209412, + "learning_rate": 5.7958664247459835e-05, + "loss": 1.7937, + "step": 15172 + }, + { + "epoch": 4.657151626764886, + "grad_norm": 0.2890888750553131, + "learning_rate": 5.795375701964077e-05, + "loss": 1.7305, + "step": 15173 + }, + { + "epoch": 4.657458563535911, + "grad_norm": 0.27948084473609924, + "learning_rate": 5.794884971321801e-05, + "loss": 1.7428, + "step": 15174 + }, + { + "epoch": 4.657765500306937, + "grad_norm": 0.2354089468717575, + "learning_rate": 5.794394232824007e-05, + "loss": 1.7622, + "step": 15175 + }, + { + "epoch": 4.658072437077962, + "grad_norm": 0.3271159827709198, + "learning_rate": 5.793903486475541e-05, + "loss": 1.7826, + "step": 15176 + }, + { + "epoch": 4.658379373848987, + "grad_norm": 0.3561338782310486, + "learning_rate": 5.793412732281257e-05, + "loss": 1.7698, + "step": 15177 + }, + { + "epoch": 4.658686310620013, + "grad_norm": 0.2913050949573517, + "learning_rate": 5.7929219702460035e-05, + "loss": 1.8156, + "step": 15178 + }, + { + "epoch": 4.658993247391037, + "grad_norm": 0.2345089465379715, + "learning_rate": 5.7924312003746294e-05, + "loss": 1.7859, + "step": 15179 + }, + { + "epoch": 4.6593001841620625, + "grad_norm": 0.3018132150173187, + "learning_rate": 5.7919404226719865e-05, + "loss": 1.7622, + "step": 15180 + }, + { + "epoch": 4.659607120933088, + "grad_norm": 0.29134172201156616, + "learning_rate": 5.791449637142924e-05, + "loss": 1.7287, + "step": 15181 + }, + { + "epoch": 4.659914057704113, + "grad_norm": 0.24126321077346802, + "learning_rate": 5.7909588437922924e-05, + "loss": 1.7969, + "step": 15182 + }, + { + "epoch": 4.6602209944751385, + "grad_norm": 0.27053284645080566, + "learning_rate": 5.7904680426249415e-05, + "loss": 1.7399, + "step": 15183 + }, + { + "epoch": 4.660527931246163, + "grad_norm": 0.2636512219905853, + "learning_rate": 
5.789977233645722e-05, + "loss": 1.7615, + "step": 15184 + }, + { + "epoch": 4.660834868017188, + "grad_norm": 0.2263207584619522, + "learning_rate": 5.789486416859484e-05, + "loss": 1.7668, + "step": 15185 + }, + { + "epoch": 4.661141804788214, + "grad_norm": 0.25387826561927795, + "learning_rate": 5.78899559227108e-05, + "loss": 1.7594, + "step": 15186 + }, + { + "epoch": 4.661448741559239, + "grad_norm": 0.2268977165222168, + "learning_rate": 5.7885047598853596e-05, + "loss": 1.75, + "step": 15187 + }, + { + "epoch": 4.661755678330264, + "grad_norm": 0.29093095660209656, + "learning_rate": 5.788013919707172e-05, + "loss": 1.7291, + "step": 15188 + }, + { + "epoch": 4.66206261510129, + "grad_norm": 0.26578736305236816, + "learning_rate": 5.7875230717413684e-05, + "loss": 1.7276, + "step": 15189 + }, + { + "epoch": 4.662369551872314, + "grad_norm": 0.2548983097076416, + "learning_rate": 5.7870322159928e-05, + "loss": 1.755, + "step": 15190 + }, + { + "epoch": 4.662676488643339, + "grad_norm": 0.2246701419353485, + "learning_rate": 5.7865413524663184e-05, + "loss": 1.751, + "step": 15191 + }, + { + "epoch": 4.662983425414365, + "grad_norm": 0.3069002032279968, + "learning_rate": 5.7860504811667747e-05, + "loss": 1.7522, + "step": 15192 + }, + { + "epoch": 4.66329036218539, + "grad_norm": 0.3081241250038147, + "learning_rate": 5.7855596020990186e-05, + "loss": 1.7152, + "step": 15193 + }, + { + "epoch": 4.6635972989564145, + "grad_norm": 0.29006731510162354, + "learning_rate": 5.7850687152679026e-05, + "loss": 1.8471, + "step": 15194 + }, + { + "epoch": 4.66390423572744, + "grad_norm": 0.24131664633750916, + "learning_rate": 5.7845778206782786e-05, + "loss": 1.763, + "step": 15195 + }, + { + "epoch": 4.664211172498465, + "grad_norm": 0.21808001399040222, + "learning_rate": 5.784086918334994e-05, + "loss": 1.6989, + "step": 15196 + }, + { + "epoch": 4.6645181092694905, + "grad_norm": 0.2413240373134613, + "learning_rate": 5.783596008242904e-05, + "loss": 1.7869, + 
"step": 15197 + }, + { + "epoch": 4.664825046040516, + "grad_norm": 0.23310934007167816, + "learning_rate": 5.7831050904068594e-05, + "loss": 1.8017, + "step": 15198 + }, + { + "epoch": 4.665131982811541, + "grad_norm": 0.2577926814556122, + "learning_rate": 5.7826141648317125e-05, + "loss": 1.6938, + "step": 15199 + }, + { + "epoch": 4.665438919582566, + "grad_norm": 0.22523443400859833, + "learning_rate": 5.782123231522312e-05, + "loss": 1.8104, + "step": 15200 + }, + { + "epoch": 4.665745856353591, + "grad_norm": 0.23603026568889618, + "learning_rate": 5.781632290483512e-05, + "loss": 1.7484, + "step": 15201 + }, + { + "epoch": 4.666052793124616, + "grad_norm": 0.23195989429950714, + "learning_rate": 5.781141341720162e-05, + "loss": 1.7786, + "step": 15202 + }, + { + "epoch": 4.666359729895642, + "grad_norm": 0.21838274598121643, + "learning_rate": 5.780650385237118e-05, + "loss": 1.7509, + "step": 15203 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.26656514406204224, + "learning_rate": 5.780159421039229e-05, + "loss": 1.7875, + "step": 15204 + }, + { + "epoch": 4.666973603437691, + "grad_norm": 0.2293243706226349, + "learning_rate": 5.7796684491313456e-05, + "loss": 1.7518, + "step": 15205 + }, + { + "epoch": 4.667280540208717, + "grad_norm": 0.24190817773342133, + "learning_rate": 5.779177469518323e-05, + "loss": 1.7593, + "step": 15206 + }, + { + "epoch": 4.667587476979742, + "grad_norm": 0.31113871932029724, + "learning_rate": 5.77868648220501e-05, + "loss": 1.7911, + "step": 15207 + }, + { + "epoch": 4.667894413750767, + "grad_norm": 0.2875262498855591, + "learning_rate": 5.778195487196263e-05, + "loss": 1.7871, + "step": 15208 + }, + { + "epoch": 4.668201350521793, + "grad_norm": 0.2172149419784546, + "learning_rate": 5.777704484496931e-05, + "loss": 1.7592, + "step": 15209 + }, + { + "epoch": 4.668508287292818, + "grad_norm": 0.3282458186149597, + "learning_rate": 5.7772134741118675e-05, + "loss": 1.7687, + "step": 15210 + }, + { + "epoch": 
4.6688152240638425, + "grad_norm": 0.36963000893592834, + "learning_rate": 5.7767224560459255e-05, + "loss": 1.812, + "step": 15211 + }, + { + "epoch": 4.669122160834868, + "grad_norm": 0.22387740015983582, + "learning_rate": 5.776231430303957e-05, + "loss": 1.7449, + "step": 15212 + }, + { + "epoch": 4.669429097605893, + "grad_norm": 0.21468734741210938, + "learning_rate": 5.775740396890813e-05, + "loss": 1.716, + "step": 15213 + }, + { + "epoch": 4.6697360343769185, + "grad_norm": 0.2478475719690323, + "learning_rate": 5.7752493558113486e-05, + "loss": 1.7182, + "step": 15214 + }, + { + "epoch": 4.670042971147944, + "grad_norm": 0.20924845337867737, + "learning_rate": 5.774758307070416e-05, + "loss": 1.784, + "step": 15215 + }, + { + "epoch": 4.670349907918968, + "grad_norm": 0.2933209538459778, + "learning_rate": 5.774267250672868e-05, + "loss": 1.8375, + "step": 15216 + }, + { + "epoch": 4.670656844689994, + "grad_norm": 0.2744538486003876, + "learning_rate": 5.7737761866235565e-05, + "loss": 1.7019, + "step": 15217 + }, + { + "epoch": 4.670963781461019, + "grad_norm": 0.20991720259189606, + "learning_rate": 5.773285114927336e-05, + "loss": 1.7189, + "step": 15218 + }, + { + "epoch": 4.671270718232044, + "grad_norm": 0.2873254716396332, + "learning_rate": 5.772794035589057e-05, + "loss": 1.7492, + "step": 15219 + }, + { + "epoch": 4.67157765500307, + "grad_norm": 0.2781519591808319, + "learning_rate": 5.772302948613576e-05, + "loss": 1.7342, + "step": 15220 + }, + { + "epoch": 4.671884591774095, + "grad_norm": 0.23288768529891968, + "learning_rate": 5.7718118540057455e-05, + "loss": 1.7245, + "step": 15221 + }, + { + "epoch": 4.672191528545119, + "grad_norm": 0.40817564725875854, + "learning_rate": 5.771320751770417e-05, + "loss": 1.7659, + "step": 15222 + }, + { + "epoch": 4.672498465316145, + "grad_norm": 0.45521771907806396, + "learning_rate": 5.770829641912444e-05, + "loss": 1.7875, + "step": 15223 + }, + { + "epoch": 4.67280540208717, + "grad_norm": 
0.22353248298168182, + "learning_rate": 5.77033852443668e-05, + "loss": 1.7098, + "step": 15224 + }, + { + "epoch": 4.673112338858195, + "grad_norm": 0.4066791534423828, + "learning_rate": 5.769847399347981e-05, + "loss": 1.7277, + "step": 15225 + }, + { + "epoch": 4.67341927562922, + "grad_norm": 0.4299545884132385, + "learning_rate": 5.769356266651198e-05, + "loss": 1.7777, + "step": 15226 + }, + { + "epoch": 4.673726212400245, + "grad_norm": 0.21037638187408447, + "learning_rate": 5.768865126351186e-05, + "loss": 1.7263, + "step": 15227 + }, + { + "epoch": 4.6740331491712706, + "grad_norm": 0.3390437066555023, + "learning_rate": 5.768373978452798e-05, + "loss": 1.7457, + "step": 15228 + }, + { + "epoch": 4.674340085942296, + "grad_norm": 0.40003323554992676, + "learning_rate": 5.767882822960887e-05, + "loss": 1.8137, + "step": 15229 + }, + { + "epoch": 4.674647022713321, + "grad_norm": 0.2212848961353302, + "learning_rate": 5.767391659880308e-05, + "loss": 1.7131, + "step": 15230 + }, + { + "epoch": 4.6749539594843466, + "grad_norm": 0.30634984374046326, + "learning_rate": 5.766900489215915e-05, + "loss": 1.7775, + "step": 15231 + }, + { + "epoch": 4.675260896255372, + "grad_norm": 0.31412798166275024, + "learning_rate": 5.766409310972563e-05, + "loss": 1.7383, + "step": 15232 + }, + { + "epoch": 4.675567833026396, + "grad_norm": 0.21125225722789764, + "learning_rate": 5.7659181251551045e-05, + "loss": 1.8046, + "step": 15233 + }, + { + "epoch": 4.675874769797422, + "grad_norm": 0.3234494924545288, + "learning_rate": 5.765426931768394e-05, + "loss": 1.7838, + "step": 15234 + }, + { + "epoch": 4.676181706568447, + "grad_norm": 0.2668779194355011, + "learning_rate": 5.764935730817286e-05, + "loss": 1.7464, + "step": 15235 + }, + { + "epoch": 4.676488643339472, + "grad_norm": 0.22423583269119263, + "learning_rate": 5.764444522306633e-05, + "loss": 1.7165, + "step": 15236 + }, + { + "epoch": 4.676795580110497, + "grad_norm": 0.29066675901412964, + "learning_rate": 
5.7639533062412945e-05, + "loss": 1.75, + "step": 15237 + }, + { + "epoch": 4.677102516881522, + "grad_norm": 0.2963598370552063, + "learning_rate": 5.76346208262612e-05, + "loss": 1.8168, + "step": 15238 + }, + { + "epoch": 4.6774094536525475, + "grad_norm": 0.21484358608722687, + "learning_rate": 5.7629708514659655e-05, + "loss": 1.71, + "step": 15239 + }, + { + "epoch": 4.677716390423573, + "grad_norm": 0.20657925307750702, + "learning_rate": 5.762479612765686e-05, + "loss": 1.7239, + "step": 15240 + }, + { + "epoch": 4.678023327194598, + "grad_norm": 0.21336235105991364, + "learning_rate": 5.761988366530136e-05, + "loss": 1.7952, + "step": 15241 + }, + { + "epoch": 4.6783302639656235, + "grad_norm": 0.24156586825847626, + "learning_rate": 5.7614971127641696e-05, + "loss": 1.7709, + "step": 15242 + }, + { + "epoch": 4.678637200736648, + "grad_norm": 0.2633824944496155, + "learning_rate": 5.761005851472643e-05, + "loss": 1.7404, + "step": 15243 + }, + { + "epoch": 4.678944137507673, + "grad_norm": 0.23302829265594482, + "learning_rate": 5.760514582660411e-05, + "loss": 1.7006, + "step": 15244 + }, + { + "epoch": 4.679251074278699, + "grad_norm": 0.22404874861240387, + "learning_rate": 5.7600233063323283e-05, + "loss": 1.7731, + "step": 15245 + }, + { + "epoch": 4.679558011049724, + "grad_norm": 0.23217839002609253, + "learning_rate": 5.7595320224932495e-05, + "loss": 1.7452, + "step": 15246 + }, + { + "epoch": 4.679864947820749, + "grad_norm": 0.23131491243839264, + "learning_rate": 5.7590407311480296e-05, + "loss": 1.7547, + "step": 15247 + }, + { + "epoch": 4.680171884591774, + "grad_norm": 0.21907350420951843, + "learning_rate": 5.7585494323015245e-05, + "loss": 1.7556, + "step": 15248 + }, + { + "epoch": 4.680478821362799, + "grad_norm": 0.22416768968105316, + "learning_rate": 5.7580581259585895e-05, + "loss": 1.7783, + "step": 15249 + }, + { + "epoch": 4.680785758133824, + "grad_norm": 0.20203055441379547, + "learning_rate": 5.75756681212408e-05, + "loss": 
1.7285, + "step": 15250 + }, + { + "epoch": 4.68109269490485, + "grad_norm": 0.27838602662086487, + "learning_rate": 5.75707549080285e-05, + "loss": 1.7489, + "step": 15251 + }, + { + "epoch": 4.681399631675875, + "grad_norm": 0.2415023297071457, + "learning_rate": 5.7565841619997586e-05, + "loss": 1.7453, + "step": 15252 + }, + { + "epoch": 4.6817065684469, + "grad_norm": 0.22986920177936554, + "learning_rate": 5.756092825719658e-05, + "loss": 1.7315, + "step": 15253 + }, + { + "epoch": 4.682013505217925, + "grad_norm": 0.2427850216627121, + "learning_rate": 5.755601481967404e-05, + "loss": 1.772, + "step": 15254 + }, + { + "epoch": 4.68232044198895, + "grad_norm": 0.24556589126586914, + "learning_rate": 5.755110130747854e-05, + "loss": 1.7475, + "step": 15255 + }, + { + "epoch": 4.6826273787599755, + "grad_norm": 0.25252529978752136, + "learning_rate": 5.754618772065864e-05, + "loss": 1.7152, + "step": 15256 + }, + { + "epoch": 4.682934315531001, + "grad_norm": 0.24599005281925201, + "learning_rate": 5.754127405926287e-05, + "loss": 1.7911, + "step": 15257 + }, + { + "epoch": 4.683241252302026, + "grad_norm": 0.18961480259895325, + "learning_rate": 5.7536360323339836e-05, + "loss": 1.681, + "step": 15258 + }, + { + "epoch": 4.683548189073051, + "grad_norm": 0.24372327327728271, + "learning_rate": 5.7531446512938035e-05, + "loss": 1.7771, + "step": 15259 + }, + { + "epoch": 4.683855125844076, + "grad_norm": 0.23239269852638245, + "learning_rate": 5.752653262810609e-05, + "loss": 1.7502, + "step": 15260 + }, + { + "epoch": 4.684162062615101, + "grad_norm": 0.25076135993003845, + "learning_rate": 5.752161866889254e-05, + "loss": 1.7974, + "step": 15261 + }, + { + "epoch": 4.684468999386127, + "grad_norm": 0.2703748941421509, + "learning_rate": 5.7516704635345945e-05, + "loss": 1.7245, + "step": 15262 + }, + { + "epoch": 4.684775936157152, + "grad_norm": 0.19247616827487946, + "learning_rate": 5.751179052751487e-05, + "loss": 1.7105, + "step": 15263 + }, + { + 
"epoch": 4.685082872928177, + "grad_norm": 0.23166817426681519, + "learning_rate": 5.750687634544787e-05, + "loss": 1.8026, + "step": 15264 + }, + { + "epoch": 4.685389809699202, + "grad_norm": 0.22434166073799133, + "learning_rate": 5.7501962089193507e-05, + "loss": 1.7779, + "step": 15265 + }, + { + "epoch": 4.685696746470227, + "grad_norm": 0.190699502825737, + "learning_rate": 5.749704775880037e-05, + "loss": 1.726, + "step": 15266 + }, + { + "epoch": 4.686003683241252, + "grad_norm": 0.22995290160179138, + "learning_rate": 5.749213335431702e-05, + "loss": 1.7495, + "step": 15267 + }, + { + "epoch": 4.686310620012278, + "grad_norm": 0.2712057828903198, + "learning_rate": 5.7487218875792016e-05, + "loss": 1.7862, + "step": 15268 + }, + { + "epoch": 4.686617556783302, + "grad_norm": 0.2524562180042267, + "learning_rate": 5.7482304323273913e-05, + "loss": 1.7092, + "step": 15269 + }, + { + "epoch": 4.6869244935543275, + "grad_norm": 0.23810559511184692, + "learning_rate": 5.747738969681131e-05, + "loss": 1.8049, + "step": 15270 + }, + { + "epoch": 4.687231430325353, + "grad_norm": 0.25521910190582275, + "learning_rate": 5.747247499645275e-05, + "loss": 1.8124, + "step": 15271 + }, + { + "epoch": 4.687538367096378, + "grad_norm": 0.27797845005989075, + "learning_rate": 5.746756022224682e-05, + "loss": 1.7694, + "step": 15272 + }, + { + "epoch": 4.6878453038674035, + "grad_norm": 0.23849260807037354, + "learning_rate": 5.746264537424208e-05, + "loss": 1.7771, + "step": 15273 + }, + { + "epoch": 4.688152240638429, + "grad_norm": 0.24368882179260254, + "learning_rate": 5.74577304524871e-05, + "loss": 1.8143, + "step": 15274 + }, + { + "epoch": 4.688459177409453, + "grad_norm": 0.2712198793888092, + "learning_rate": 5.745281545703045e-05, + "loss": 1.7683, + "step": 15275 + }, + { + "epoch": 4.688766114180479, + "grad_norm": 0.30913081765174866, + "learning_rate": 5.7447900387920716e-05, + "loss": 1.7111, + "step": 15276 + }, + { + "epoch": 4.689073050951504, + 
"grad_norm": 0.22123363614082336, + "learning_rate": 5.744298524520646e-05, + "loss": 1.7466, + "step": 15277 + }, + { + "epoch": 4.689379987722529, + "grad_norm": 0.32836318016052246, + "learning_rate": 5.743807002893628e-05, + "loss": 1.8083, + "step": 15278 + }, + { + "epoch": 4.689686924493555, + "grad_norm": 0.33319979906082153, + "learning_rate": 5.743315473915871e-05, + "loss": 1.7122, + "step": 15279 + }, + { + "epoch": 4.689993861264579, + "grad_norm": 0.252163290977478, + "learning_rate": 5.742823937592236e-05, + "loss": 1.7599, + "step": 15280 + }, + { + "epoch": 4.690300798035604, + "grad_norm": 0.23248571157455444, + "learning_rate": 5.7423323939275797e-05, + "loss": 1.7791, + "step": 15281 + }, + { + "epoch": 4.69060773480663, + "grad_norm": 0.27024057507514954, + "learning_rate": 5.741840842926759e-05, + "loss": 1.7608, + "step": 15282 + }, + { + "epoch": 4.690914671577655, + "grad_norm": 0.21888256072998047, + "learning_rate": 5.7413492845946326e-05, + "loss": 1.7407, + "step": 15283 + }, + { + "epoch": 4.69122160834868, + "grad_norm": 0.2574782073497772, + "learning_rate": 5.740857718936058e-05, + "loss": 1.707, + "step": 15284 + }, + { + "epoch": 4.691528545119706, + "grad_norm": 0.2541569769382477, + "learning_rate": 5.740366145955893e-05, + "loss": 1.7301, + "step": 15285 + }, + { + "epoch": 4.69183548189073, + "grad_norm": 0.23484647274017334, + "learning_rate": 5.7398745656589955e-05, + "loss": 1.772, + "step": 15286 + }, + { + "epoch": 4.6921424186617555, + "grad_norm": 0.2827093005180359, + "learning_rate": 5.739382978050225e-05, + "loss": 1.7745, + "step": 15287 + }, + { + "epoch": 4.692449355432781, + "grad_norm": 0.300387978553772, + "learning_rate": 5.738891383134437e-05, + "loss": 1.7966, + "step": 15288 + }, + { + "epoch": 4.692756292203806, + "grad_norm": 0.2414523959159851, + "learning_rate": 5.7383997809164926e-05, + "loss": 1.7355, + "step": 15289 + }, + { + "epoch": 4.6930632289748315, + "grad_norm": 0.21221841871738434, + 
"learning_rate": 5.737908171401248e-05, + "loss": 1.7935, + "step": 15290 + }, + { + "epoch": 4.693370165745856, + "grad_norm": 0.23488084971904755, + "learning_rate": 5.737416554593563e-05, + "loss": 1.7447, + "step": 15291 + }, + { + "epoch": 4.693677102516881, + "grad_norm": 0.26176631450653076, + "learning_rate": 5.7369249304982954e-05, + "loss": 1.769, + "step": 15292 + }, + { + "epoch": 4.693984039287907, + "grad_norm": 0.23060615360736847, + "learning_rate": 5.736433299120303e-05, + "loss": 1.7344, + "step": 15293 + }, + { + "epoch": 4.694290976058932, + "grad_norm": 0.2536846399307251, + "learning_rate": 5.7359416604644456e-05, + "loss": 1.7862, + "step": 15294 + }, + { + "epoch": 4.694597912829957, + "grad_norm": 0.23221342265605927, + "learning_rate": 5.735450014535581e-05, + "loss": 1.743, + "step": 15295 + }, + { + "epoch": 4.694904849600983, + "grad_norm": 0.25320062041282654, + "learning_rate": 5.734958361338568e-05, + "loss": 1.8001, + "step": 15296 + }, + { + "epoch": 4.695211786372007, + "grad_norm": 0.23132461309432983, + "learning_rate": 5.734466700878267e-05, + "loss": 1.7676, + "step": 15297 + }, + { + "epoch": 4.695518723143032, + "grad_norm": 0.2222728580236435, + "learning_rate": 5.7339750331595346e-05, + "loss": 1.7267, + "step": 15298 + }, + { + "epoch": 4.695825659914058, + "grad_norm": 0.2505118250846863, + "learning_rate": 5.733483358187231e-05, + "loss": 1.7467, + "step": 15299 + }, + { + "epoch": 4.696132596685083, + "grad_norm": 0.23609887063503265, + "learning_rate": 5.732991675966214e-05, + "loss": 1.7319, + "step": 15300 + }, + { + "epoch": 4.696439533456108, + "grad_norm": 0.2939738631248474, + "learning_rate": 5.732499986501345e-05, + "loss": 1.8676, + "step": 15301 + }, + { + "epoch": 4.696746470227133, + "grad_norm": 0.29868564009666443, + "learning_rate": 5.7320082897974814e-05, + "loss": 1.7541, + "step": 15302 + }, + { + "epoch": 4.697053406998158, + "grad_norm": 0.2366383820772171, + "learning_rate": 5.731516585859482e-05, 
+ "loss": 1.7531, + "step": 15303 + }, + { + "epoch": 4.6973603437691835, + "grad_norm": 0.2721317410469055, + "learning_rate": 5.731024874692208e-05, + "loss": 1.7444, + "step": 15304 + }, + { + "epoch": 4.697667280540209, + "grad_norm": 0.24925900995731354, + "learning_rate": 5.730533156300517e-05, + "loss": 1.7716, + "step": 15305 + }, + { + "epoch": 4.697974217311234, + "grad_norm": 0.23012754321098328, + "learning_rate": 5.7300414306892704e-05, + "loss": 1.7211, + "step": 15306 + }, + { + "epoch": 4.6982811540822595, + "grad_norm": 0.21274085342884064, + "learning_rate": 5.7295496978633254e-05, + "loss": 1.7853, + "step": 15307 + }, + { + "epoch": 4.698588090853284, + "grad_norm": 0.21799001097679138, + "learning_rate": 5.729057957827544e-05, + "loss": 1.7505, + "step": 15308 + }, + { + "epoch": 4.698895027624309, + "grad_norm": 0.22365793585777283, + "learning_rate": 5.728566210586783e-05, + "loss": 1.7934, + "step": 15309 + }, + { + "epoch": 4.699201964395335, + "grad_norm": 0.23325085639953613, + "learning_rate": 5.728074456145903e-05, + "loss": 1.7354, + "step": 15310 + }, + { + "epoch": 4.69950890116636, + "grad_norm": 0.2175164669752121, + "learning_rate": 5.7275826945097654e-05, + "loss": 1.7541, + "step": 15311 + }, + { + "epoch": 4.699815837937384, + "grad_norm": 0.24657388031482697, + "learning_rate": 5.727090925683231e-05, + "loss": 1.814, + "step": 15312 + }, + { + "epoch": 4.70012277470841, + "grad_norm": 0.2437550574541092, + "learning_rate": 5.726599149671156e-05, + "loss": 1.7234, + "step": 15313 + }, + { + "epoch": 4.700429711479435, + "grad_norm": 0.21053487062454224, + "learning_rate": 5.726107366478402e-05, + "loss": 1.7788, + "step": 15314 + }, + { + "epoch": 4.7007366482504604, + "grad_norm": 0.2007097452878952, + "learning_rate": 5.725615576109831e-05, + "loss": 1.7453, + "step": 15315 + }, + { + "epoch": 4.701043585021486, + "grad_norm": 0.19331564009189606, + "learning_rate": 5.725123778570299e-05, + "loss": 1.7142, + "step": 15316 + 
}, + { + "epoch": 4.701350521792511, + "grad_norm": 0.24291567504405975, + "learning_rate": 5.7246319738646706e-05, + "loss": 1.8081, + "step": 15317 + }, + { + "epoch": 4.701657458563536, + "grad_norm": 0.21423695981502533, + "learning_rate": 5.724140161997804e-05, + "loss": 1.7021, + "step": 15318 + }, + { + "epoch": 4.701964395334561, + "grad_norm": 0.20857618749141693, + "learning_rate": 5.72364834297456e-05, + "loss": 1.7447, + "step": 15319 + }, + { + "epoch": 4.702271332105586, + "grad_norm": 0.2547401487827301, + "learning_rate": 5.7231565167998e-05, + "loss": 1.7505, + "step": 15320 + }, + { + "epoch": 4.702578268876612, + "grad_norm": 0.2729472219944, + "learning_rate": 5.7226646834783825e-05, + "loss": 1.7974, + "step": 15321 + }, + { + "epoch": 4.702885205647637, + "grad_norm": 0.23258371651172638, + "learning_rate": 5.722172843015169e-05, + "loss": 1.7562, + "step": 15322 + }, + { + "epoch": 4.703192142418661, + "grad_norm": 0.23399893939495087, + "learning_rate": 5.72168099541502e-05, + "loss": 1.7674, + "step": 15323 + }, + { + "epoch": 4.703499079189687, + "grad_norm": 0.2678206264972687, + "learning_rate": 5.721189140682797e-05, + "loss": 1.7331, + "step": 15324 + }, + { + "epoch": 4.703806015960712, + "grad_norm": 0.19472146034240723, + "learning_rate": 5.7206972788233593e-05, + "loss": 1.7003, + "step": 15325 + }, + { + "epoch": 4.704112952731737, + "grad_norm": 0.2199394404888153, + "learning_rate": 5.72020540984157e-05, + "loss": 1.7072, + "step": 15326 + }, + { + "epoch": 4.704419889502763, + "grad_norm": 0.219175323843956, + "learning_rate": 5.719713533742287e-05, + "loss": 1.7591, + "step": 15327 + }, + { + "epoch": 4.704726826273788, + "grad_norm": 0.21127547323703766, + "learning_rate": 5.719221650530374e-05, + "loss": 1.8059, + "step": 15328 + }, + { + "epoch": 4.7050337630448125, + "grad_norm": 0.22189834713935852, + "learning_rate": 5.7187297602106905e-05, + "loss": 1.7529, + "step": 15329 + }, + { + "epoch": 4.705340699815838, + 
"grad_norm": 0.19945195317268372, + "learning_rate": 5.7182378627881e-05, + "loss": 1.7133, + "step": 15330 + }, + { + "epoch": 4.705647636586863, + "grad_norm": 0.2177499681711197, + "learning_rate": 5.7177459582674595e-05, + "loss": 1.7451, + "step": 15331 + }, + { + "epoch": 4.7059545733578885, + "grad_norm": 0.19489440321922302, + "learning_rate": 5.717254046653635e-05, + "loss": 1.7499, + "step": 15332 + }, + { + "epoch": 4.706261510128914, + "grad_norm": 0.21366968750953674, + "learning_rate": 5.716762127951485e-05, + "loss": 1.7683, + "step": 15333 + }, + { + "epoch": 4.706568446899938, + "grad_norm": 0.2894177734851837, + "learning_rate": 5.71627020216587e-05, + "loss": 1.8235, + "step": 15334 + }, + { + "epoch": 4.706875383670964, + "grad_norm": 0.22175677120685577, + "learning_rate": 5.7157782693016534e-05, + "loss": 1.7421, + "step": 15335 + }, + { + "epoch": 4.707182320441989, + "grad_norm": 0.23653541505336761, + "learning_rate": 5.715286329363698e-05, + "loss": 1.6937, + "step": 15336 + }, + { + "epoch": 4.707489257213014, + "grad_norm": 0.3015746772289276, + "learning_rate": 5.714794382356863e-05, + "loss": 1.7159, + "step": 15337 + }, + { + "epoch": 4.70779619398404, + "grad_norm": 0.24045881628990173, + "learning_rate": 5.714302428286011e-05, + "loss": 1.7263, + "step": 15338 + }, + { + "epoch": 4.708103130755065, + "grad_norm": 0.19836920499801636, + "learning_rate": 5.7138104671560035e-05, + "loss": 1.7604, + "step": 15339 + }, + { + "epoch": 4.708410067526089, + "grad_norm": 0.2430238276720047, + "learning_rate": 5.7133184989717036e-05, + "loss": 1.7147, + "step": 15340 + }, + { + "epoch": 4.708717004297115, + "grad_norm": 0.19388417899608612, + "learning_rate": 5.712826523737971e-05, + "loss": 1.7153, + "step": 15341 + }, + { + "epoch": 4.70902394106814, + "grad_norm": 0.19648151099681854, + "learning_rate": 5.7123345414596694e-05, + "loss": 1.7373, + "step": 15342 + }, + { + "epoch": 4.709330877839165, + "grad_norm": 0.20326325297355652, + 
"learning_rate": 5.711842552141661e-05, + "loss": 1.7012, + "step": 15343 + }, + { + "epoch": 4.70963781461019, + "grad_norm": 0.20798304677009583, + "learning_rate": 5.711350555788806e-05, + "loss": 1.7134, + "step": 15344 + }, + { + "epoch": 4.709944751381215, + "grad_norm": 0.29318806529045105, + "learning_rate": 5.7108585524059674e-05, + "loss": 1.7661, + "step": 15345 + }, + { + "epoch": 4.7102516881522405, + "grad_norm": 0.273318350315094, + "learning_rate": 5.710366541998009e-05, + "loss": 1.7329, + "step": 15346 + }, + { + "epoch": 4.710558624923266, + "grad_norm": 0.2306031584739685, + "learning_rate": 5.7098745245697925e-05, + "loss": 1.8152, + "step": 15347 + }, + { + "epoch": 4.710865561694291, + "grad_norm": 0.27630630135536194, + "learning_rate": 5.709382500126179e-05, + "loss": 1.7955, + "step": 15348 + }, + { + "epoch": 4.7111724984653165, + "grad_norm": 0.2366025298833847, + "learning_rate": 5.7088904686720326e-05, + "loss": 1.7943, + "step": 15349 + }, + { + "epoch": 4.711479435236341, + "grad_norm": 0.24196656048297882, + "learning_rate": 5.708398430212215e-05, + "loss": 1.698, + "step": 15350 + }, + { + "epoch": 4.711786372007366, + "grad_norm": 0.2770058512687683, + "learning_rate": 5.707906384751588e-05, + "loss": 1.7618, + "step": 15351 + }, + { + "epoch": 4.712093308778392, + "grad_norm": 0.20432323217391968, + "learning_rate": 5.7074143322950157e-05, + "loss": 1.7422, + "step": 15352 + }, + { + "epoch": 4.712400245549417, + "grad_norm": 0.25543150305747986, + "learning_rate": 5.70692227284736e-05, + "loss": 1.7744, + "step": 15353 + }, + { + "epoch": 4.712707182320442, + "grad_norm": 0.24315913021564484, + "learning_rate": 5.7064302064134855e-05, + "loss": 1.7127, + "step": 15354 + }, + { + "epoch": 4.713014119091467, + "grad_norm": 0.23636099696159363, + "learning_rate": 5.705938132998252e-05, + "loss": 1.7725, + "step": 15355 + }, + { + "epoch": 4.713321055862492, + "grad_norm": 0.26809820532798767, + "learning_rate": 
5.705446052606526e-05, + "loss": 1.8338, + "step": 15356 + }, + { + "epoch": 4.713627992633517, + "grad_norm": 0.24969002604484558, + "learning_rate": 5.704953965243167e-05, + "loss": 1.8225, + "step": 15357 + }, + { + "epoch": 4.713934929404543, + "grad_norm": 0.23189692199230194, + "learning_rate": 5.70446187091304e-05, + "loss": 1.7901, + "step": 15358 + }, + { + "epoch": 4.714241866175568, + "grad_norm": 0.22373750805854797, + "learning_rate": 5.703969769621008e-05, + "loss": 1.6919, + "step": 15359 + }, + { + "epoch": 4.714548802946593, + "grad_norm": 0.23963531851768494, + "learning_rate": 5.703477661371934e-05, + "loss": 1.7806, + "step": 15360 + }, + { + "epoch": 4.714855739717618, + "grad_norm": 0.20365150272846222, + "learning_rate": 5.702985546170683e-05, + "loss": 1.7207, + "step": 15361 + }, + { + "epoch": 4.715162676488643, + "grad_norm": 0.245658278465271, + "learning_rate": 5.702493424022114e-05, + "loss": 1.7589, + "step": 15362 + }, + { + "epoch": 4.7154696132596685, + "grad_norm": 0.22633756697177887, + "learning_rate": 5.702001294931094e-05, + "loss": 1.7893, + "step": 15363 + }, + { + "epoch": 4.715776550030694, + "grad_norm": 0.21587726473808289, + "learning_rate": 5.701509158902487e-05, + "loss": 1.8095, + "step": 15364 + }, + { + "epoch": 4.716083486801719, + "grad_norm": 0.22553963959217072, + "learning_rate": 5.701017015941155e-05, + "loss": 1.7419, + "step": 15365 + }, + { + "epoch": 4.716390423572744, + "grad_norm": 0.2276087999343872, + "learning_rate": 5.700524866051962e-05, + "loss": 1.7052, + "step": 15366 + }, + { + "epoch": 4.716697360343769, + "grad_norm": 0.22236761450767517, + "learning_rate": 5.700032709239771e-05, + "loss": 1.8612, + "step": 15367 + }, + { + "epoch": 4.717004297114794, + "grad_norm": 0.22816185653209686, + "learning_rate": 5.6995405455094465e-05, + "loss": 1.78, + "step": 15368 + }, + { + "epoch": 4.71731123388582, + "grad_norm": 0.21597479283809662, + "learning_rate": 5.6990483748658516e-05, + "loss": 1.8276, 
+ "step": 15369 + }, + { + "epoch": 4.717618170656845, + "grad_norm": 0.22209586203098297, + "learning_rate": 5.6985561973138533e-05, + "loss": 1.74, + "step": 15370 + }, + { + "epoch": 4.71792510742787, + "grad_norm": 0.24249997735023499, + "learning_rate": 5.6980640128583116e-05, + "loss": 1.8035, + "step": 15371 + }, + { + "epoch": 4.718232044198895, + "grad_norm": 0.23326106369495392, + "learning_rate": 5.6975718215040943e-05, + "loss": 1.7969, + "step": 15372 + }, + { + "epoch": 4.71853898096992, + "grad_norm": 0.215044766664505, + "learning_rate": 5.6970796232560596e-05, + "loss": 1.7345, + "step": 15373 + }, + { + "epoch": 4.718845917740945, + "grad_norm": 0.20231883227825165, + "learning_rate": 5.696587418119078e-05, + "loss": 1.7231, + "step": 15374 + }, + { + "epoch": 4.719152854511971, + "grad_norm": 0.2136038839817047, + "learning_rate": 5.696095206098011e-05, + "loss": 1.7421, + "step": 15375 + }, + { + "epoch": 4.719459791282996, + "grad_norm": 0.2662335932254791, + "learning_rate": 5.6956029871977235e-05, + "loss": 1.7518, + "step": 15376 + }, + { + "epoch": 4.7197667280540205, + "grad_norm": 0.25649648904800415, + "learning_rate": 5.6951107614230783e-05, + "loss": 1.8314, + "step": 15377 + }, + { + "epoch": 4.720073664825046, + "grad_norm": 0.21995560824871063, + "learning_rate": 5.6946185287789425e-05, + "loss": 1.7511, + "step": 15378 + }, + { + "epoch": 4.720380601596071, + "grad_norm": 0.3388935923576355, + "learning_rate": 5.694126289270177e-05, + "loss": 1.7975, + "step": 15379 + }, + { + "epoch": 4.7206875383670965, + "grad_norm": 0.32886409759521484, + "learning_rate": 5.693634042901651e-05, + "loss": 1.7153, + "step": 15380 + }, + { + "epoch": 4.720994475138122, + "grad_norm": 0.21727977693080902, + "learning_rate": 5.693141789678226e-05, + "loss": 1.7095, + "step": 15381 + }, + { + "epoch": 4.721301411909147, + "grad_norm": 0.2680833041667938, + "learning_rate": 5.6926495296047675e-05, + "loss": 1.696, + "step": 15382 + }, + { + "epoch": 
4.721608348680172, + "grad_norm": 0.2645499110221863, + "learning_rate": 5.692157262686141e-05, + "loss": 1.6889, + "step": 15383 + }, + { + "epoch": 4.721915285451197, + "grad_norm": 0.20362348854541779, + "learning_rate": 5.69166498892721e-05, + "loss": 1.7303, + "step": 15384 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.24259062111377716, + "learning_rate": 5.691172708332839e-05, + "loss": 1.7684, + "step": 15385 + }, + { + "epoch": 4.722529158993248, + "grad_norm": 0.24204276502132416, + "learning_rate": 5.690680420907897e-05, + "loss": 1.7728, + "step": 15386 + }, + { + "epoch": 4.722836095764272, + "grad_norm": 0.3038320243358612, + "learning_rate": 5.690188126657244e-05, + "loss": 1.7573, + "step": 15387 + }, + { + "epoch": 4.723143032535297, + "grad_norm": 0.24619868397712708, + "learning_rate": 5.689695825585749e-05, + "loss": 1.754, + "step": 15388 + }, + { + "epoch": 4.723449969306323, + "grad_norm": 0.19441325962543488, + "learning_rate": 5.689203517698276e-05, + "loss": 1.726, + "step": 15389 + }, + { + "epoch": 4.723756906077348, + "grad_norm": 0.2874276340007782, + "learning_rate": 5.688711202999688e-05, + "loss": 1.7704, + "step": 15390 + }, + { + "epoch": 4.724063842848373, + "grad_norm": 0.24488390982151031, + "learning_rate": 5.6882188814948535e-05, + "loss": 1.7477, + "step": 15391 + }, + { + "epoch": 4.724370779619399, + "grad_norm": 0.22674018144607544, + "learning_rate": 5.687726553188636e-05, + "loss": 1.7287, + "step": 15392 + }, + { + "epoch": 4.724677716390423, + "grad_norm": 0.2653258442878723, + "learning_rate": 5.687234218085902e-05, + "loss": 1.7415, + "step": 15393 + }, + { + "epoch": 4.7249846531614486, + "grad_norm": 0.20345374941825867, + "learning_rate": 5.686741876191516e-05, + "loss": 1.764, + "step": 15394 + }, + { + "epoch": 4.725291589932474, + "grad_norm": 0.23193977773189545, + "learning_rate": 5.686249527510345e-05, + "loss": 1.7557, + "step": 15395 + }, + { + "epoch": 4.725598526703499, + "grad_norm": 
0.26426708698272705, + "learning_rate": 5.685757172047253e-05, + "loss": 1.7708, + "step": 15396 + }, + { + "epoch": 4.725905463474525, + "grad_norm": 0.21377156674861908, + "learning_rate": 5.685264809807107e-05, + "loss": 1.6921, + "step": 15397 + }, + { + "epoch": 4.726212400245549, + "grad_norm": 0.21628457307815552, + "learning_rate": 5.684772440794773e-05, + "loss": 1.72, + "step": 15398 + }, + { + "epoch": 4.726519337016574, + "grad_norm": 0.19200581312179565, + "learning_rate": 5.684280065015116e-05, + "loss": 1.7311, + "step": 15399 + }, + { + "epoch": 4.7268262737876, + "grad_norm": 0.22227540612220764, + "learning_rate": 5.683787682473003e-05, + "loss": 1.7451, + "step": 15400 + }, + { + "epoch": 4.727133210558625, + "grad_norm": 0.18053604662418365, + "learning_rate": 5.683295293173299e-05, + "loss": 1.6816, + "step": 15401 + }, + { + "epoch": 4.72744014732965, + "grad_norm": 0.19827169179916382, + "learning_rate": 5.682802897120869e-05, + "loss": 1.7315, + "step": 15402 + }, + { + "epoch": 4.727747084100676, + "grad_norm": 0.2768021821975708, + "learning_rate": 5.682310494320582e-05, + "loss": 1.7714, + "step": 15403 + }, + { + "epoch": 4.7280540208717, + "grad_norm": 0.2613474428653717, + "learning_rate": 5.6818180847773027e-05, + "loss": 1.7332, + "step": 15404 + }, + { + "epoch": 4.7283609576427255, + "grad_norm": 0.21546787023544312, + "learning_rate": 5.681325668495898e-05, + "loss": 1.771, + "step": 15405 + }, + { + "epoch": 4.728667894413751, + "grad_norm": 0.24442137777805328, + "learning_rate": 5.680833245481234e-05, + "loss": 1.7296, + "step": 15406 + }, + { + "epoch": 4.728974831184776, + "grad_norm": 0.2622109055519104, + "learning_rate": 5.680340815738175e-05, + "loss": 1.7778, + "step": 15407 + }, + { + "epoch": 4.7292817679558015, + "grad_norm": 0.22379513084888458, + "learning_rate": 5.6798483792715904e-05, + "loss": 1.7953, + "step": 15408 + }, + { + "epoch": 4.729588704726826, + "grad_norm": 0.21901065111160278, + "learning_rate": 
5.679355936086346e-05, + "loss": 1.7287, + "step": 15409 + }, + { + "epoch": 4.729895641497851, + "grad_norm": 0.3023792505264282, + "learning_rate": 5.6788634861873066e-05, + "loss": 1.7851, + "step": 15410 + }, + { + "epoch": 4.730202578268877, + "grad_norm": 0.23882482945919037, + "learning_rate": 5.678371029579342e-05, + "loss": 1.7621, + "step": 15411 + }, + { + "epoch": 4.730509515039902, + "grad_norm": 0.2661043703556061, + "learning_rate": 5.6778785662673175e-05, + "loss": 1.7453, + "step": 15412 + }, + { + "epoch": 4.730816451810927, + "grad_norm": 0.330208957195282, + "learning_rate": 5.677386096256099e-05, + "loss": 1.761, + "step": 15413 + }, + { + "epoch": 4.731123388581953, + "grad_norm": 0.2686570882797241, + "learning_rate": 5.676893619550552e-05, + "loss": 1.7539, + "step": 15414 + }, + { + "epoch": 4.731430325352977, + "grad_norm": 0.24308046698570251, + "learning_rate": 5.676401136155548e-05, + "loss": 1.7345, + "step": 15415 + }, + { + "epoch": 4.731737262124002, + "grad_norm": 0.4137137830257416, + "learning_rate": 5.67590864607595e-05, + "loss": 1.7688, + "step": 15416 + }, + { + "epoch": 4.732044198895028, + "grad_norm": 0.32161539793014526, + "learning_rate": 5.675416149316628e-05, + "loss": 1.7881, + "step": 15417 + }, + { + "epoch": 4.732351135666053, + "grad_norm": 0.2336999475955963, + "learning_rate": 5.674923645882447e-05, + "loss": 1.755, + "step": 15418 + }, + { + "epoch": 4.7326580724370775, + "grad_norm": 0.32781684398651123, + "learning_rate": 5.6744311357782754e-05, + "loss": 1.8062, + "step": 15419 + }, + { + "epoch": 4.732965009208103, + "grad_norm": 0.2475704401731491, + "learning_rate": 5.6739386190089795e-05, + "loss": 1.725, + "step": 15420 + }, + { + "epoch": 4.733271945979128, + "grad_norm": 0.26295650005340576, + "learning_rate": 5.673446095579427e-05, + "loss": 1.7673, + "step": 15421 + }, + { + "epoch": 4.7335788827501535, + "grad_norm": 0.3454873859882355, + "learning_rate": 5.6729535654944864e-05, + "loss": 1.7523, + 
"step": 15422 + }, + { + "epoch": 4.733885819521179, + "grad_norm": 0.2306666374206543, + "learning_rate": 5.672461028759024e-05, + "loss": 1.7085, + "step": 15423 + }, + { + "epoch": 4.734192756292204, + "grad_norm": 0.30825871229171753, + "learning_rate": 5.671968485377908e-05, + "loss": 1.7642, + "step": 15424 + }, + { + "epoch": 4.734499693063229, + "grad_norm": 0.42611342668533325, + "learning_rate": 5.6714759353560045e-05, + "loss": 1.7832, + "step": 15425 + }, + { + "epoch": 4.734806629834254, + "grad_norm": 0.29502514004707336, + "learning_rate": 5.670983378698182e-05, + "loss": 1.8153, + "step": 15426 + }, + { + "epoch": 4.735113566605279, + "grad_norm": 0.28416305780410767, + "learning_rate": 5.6704908154093096e-05, + "loss": 1.756, + "step": 15427 + }, + { + "epoch": 4.735420503376305, + "grad_norm": 0.43111103773117065, + "learning_rate": 5.6699982454942534e-05, + "loss": 1.7797, + "step": 15428 + }, + { + "epoch": 4.73572744014733, + "grad_norm": 0.27667397260665894, + "learning_rate": 5.669505668957882e-05, + "loss": 1.7316, + "step": 15429 + }, + { + "epoch": 4.736034376918354, + "grad_norm": 0.3045295774936676, + "learning_rate": 5.669013085805063e-05, + "loss": 1.7591, + "step": 15430 + }, + { + "epoch": 4.73634131368938, + "grad_norm": 0.4494635760784149, + "learning_rate": 5.6685204960406635e-05, + "loss": 1.8295, + "step": 15431 + }, + { + "epoch": 4.736648250460405, + "grad_norm": 0.2951449453830719, + "learning_rate": 5.6680278996695544e-05, + "loss": 1.7857, + "step": 15432 + }, + { + "epoch": 4.73695518723143, + "grad_norm": 0.2714167535305023, + "learning_rate": 5.6675352966966014e-05, + "loss": 1.816, + "step": 15433 + }, + { + "epoch": 4.737262124002456, + "grad_norm": 0.32701000571250916, + "learning_rate": 5.667042687126673e-05, + "loss": 1.7637, + "step": 15434 + }, + { + "epoch": 4.737569060773481, + "grad_norm": 0.2466556429862976, + "learning_rate": 5.666550070964638e-05, + "loss": 1.7805, + "step": 15435 + }, + { + "epoch": 
4.7378759975445055, + "grad_norm": 0.3283855617046356, + "learning_rate": 5.666057448215365e-05, + "loss": 1.786, + "step": 15436 + }, + { + "epoch": 4.738182934315531, + "grad_norm": 0.35860660672187805, + "learning_rate": 5.6655648188837205e-05, + "loss": 1.8309, + "step": 15437 + }, + { + "epoch": 4.738489871086556, + "grad_norm": 0.22293898463249207, + "learning_rate": 5.665072182974576e-05, + "loss": 1.7317, + "step": 15438 + }, + { + "epoch": 4.7387968078575815, + "grad_norm": 0.3155089020729065, + "learning_rate": 5.664579540492798e-05, + "loss": 1.7202, + "step": 15439 + }, + { + "epoch": 4.739103744628607, + "grad_norm": 0.28723904490470886, + "learning_rate": 5.6640868914432566e-05, + "loss": 1.7788, + "step": 15440 + }, + { + "epoch": 4.739410681399631, + "grad_norm": 0.2461984008550644, + "learning_rate": 5.6635942358308183e-05, + "loss": 1.8504, + "step": 15441 + }, + { + "epoch": 4.739717618170657, + "grad_norm": 0.2503122091293335, + "learning_rate": 5.663101573660351e-05, + "loss": 1.7375, + "step": 15442 + }, + { + "epoch": 4.740024554941682, + "grad_norm": 0.24925372004508972, + "learning_rate": 5.662608904936727e-05, + "loss": 1.7152, + "step": 15443 + }, + { + "epoch": 4.740331491712707, + "grad_norm": 0.2734573483467102, + "learning_rate": 5.662116229664813e-05, + "loss": 1.7476, + "step": 15444 + }, + { + "epoch": 4.740638428483733, + "grad_norm": 0.38122060894966125, + "learning_rate": 5.661623547849479e-05, + "loss": 1.7682, + "step": 15445 + }, + { + "epoch": 4.740945365254758, + "grad_norm": 0.3786417245864868, + "learning_rate": 5.661130859495593e-05, + "loss": 1.7446, + "step": 15446 + }, + { + "epoch": 4.741252302025782, + "grad_norm": 0.22618255019187927, + "learning_rate": 5.6606381646080244e-05, + "loss": 1.7427, + "step": 15447 + }, + { + "epoch": 4.741559238796808, + "grad_norm": 0.3000899851322174, + "learning_rate": 5.6601454631916405e-05, + "loss": 1.7087, + "step": 15448 + }, + { + "epoch": 4.741866175567833, + "grad_norm": 
0.36542513966560364, + "learning_rate": 5.659652755251315e-05, + "loss": 1.7985, + "step": 15449 + }, + { + "epoch": 4.742173112338858, + "grad_norm": 0.23550496995449066, + "learning_rate": 5.659160040791912e-05, + "loss": 1.8163, + "step": 15450 + }, + { + "epoch": 4.742480049109884, + "grad_norm": 0.25615251064300537, + "learning_rate": 5.658667319818305e-05, + "loss": 1.7372, + "step": 15451 + }, + { + "epoch": 4.742786985880908, + "grad_norm": 0.28744083642959595, + "learning_rate": 5.6581745923353615e-05, + "loss": 1.7193, + "step": 15452 + }, + { + "epoch": 4.7430939226519335, + "grad_norm": 0.2500229775905609, + "learning_rate": 5.65768185834795e-05, + "loss": 1.7263, + "step": 15453 + }, + { + "epoch": 4.743400859422959, + "grad_norm": 0.21520425379276276, + "learning_rate": 5.6571891178609394e-05, + "loss": 1.7337, + "step": 15454 + }, + { + "epoch": 4.743707796193984, + "grad_norm": 0.212506502866745, + "learning_rate": 5.656696370879202e-05, + "loss": 1.7672, + "step": 15455 + }, + { + "epoch": 4.7440147329650095, + "grad_norm": 0.21143417060375214, + "learning_rate": 5.656203617407607e-05, + "loss": 1.7189, + "step": 15456 + }, + { + "epoch": 4.744321669736035, + "grad_norm": 0.18320922553539276, + "learning_rate": 5.6557108574510243e-05, + "loss": 1.7521, + "step": 15457 + }, + { + "epoch": 4.744628606507059, + "grad_norm": 0.19202999770641327, + "learning_rate": 5.655218091014321e-05, + "loss": 1.6756, + "step": 15458 + }, + { + "epoch": 4.744935543278085, + "grad_norm": 0.2152331918478012, + "learning_rate": 5.654725318102367e-05, + "loss": 1.7653, + "step": 15459 + }, + { + "epoch": 4.74524248004911, + "grad_norm": 0.24565903842449188, + "learning_rate": 5.6542325387200354e-05, + "loss": 1.7654, + "step": 15460 + }, + { + "epoch": 4.745549416820135, + "grad_norm": 0.2504819333553314, + "learning_rate": 5.653739752872195e-05, + "loss": 1.7073, + "step": 15461 + }, + { + "epoch": 4.74585635359116, + "grad_norm": 0.19258706271648407, + 
"learning_rate": 5.653246960563714e-05, + "loss": 1.7106, + "step": 15462 + }, + { + "epoch": 4.746163290362185, + "grad_norm": 0.22961968183517456, + "learning_rate": 5.652754161799465e-05, + "loss": 1.7868, + "step": 15463 + }, + { + "epoch": 4.74647022713321, + "grad_norm": 0.2763231098651886, + "learning_rate": 5.652261356584315e-05, + "loss": 1.7714, + "step": 15464 + }, + { + "epoch": 4.746777163904236, + "grad_norm": 0.23866096138954163, + "learning_rate": 5.651768544923136e-05, + "loss": 1.7537, + "step": 15465 + }, + { + "epoch": 4.747084100675261, + "grad_norm": 0.21851976215839386, + "learning_rate": 5.6512757268207997e-05, + "loss": 1.8109, + "step": 15466 + }, + { + "epoch": 4.747391037446286, + "grad_norm": 0.22249393165111542, + "learning_rate": 5.6507829022821745e-05, + "loss": 1.7357, + "step": 15467 + }, + { + "epoch": 4.747697974217311, + "grad_norm": 0.20202289521694183, + "learning_rate": 5.650290071312131e-05, + "loss": 1.7867, + "step": 15468 + }, + { + "epoch": 4.748004910988336, + "grad_norm": 0.20618727803230286, + "learning_rate": 5.649797233915539e-05, + "loss": 1.6904, + "step": 15469 + }, + { + "epoch": 4.7483118477593615, + "grad_norm": 0.25609052181243896, + "learning_rate": 5.649304390097272e-05, + "loss": 1.7287, + "step": 15470 + }, + { + "epoch": 4.748618784530387, + "grad_norm": 0.22966544330120087, + "learning_rate": 5.648811539862195e-05, + "loss": 1.7384, + "step": 15471 + }, + { + "epoch": 4.748925721301412, + "grad_norm": 0.24070143699645996, + "learning_rate": 5.6483186832151856e-05, + "loss": 1.7625, + "step": 15472 + }, + { + "epoch": 4.749232658072437, + "grad_norm": 0.22642426192760468, + "learning_rate": 5.647825820161109e-05, + "loss": 1.7291, + "step": 15473 + }, + { + "epoch": 4.749539594843462, + "grad_norm": 0.23255646228790283, + "learning_rate": 5.64733295070484e-05, + "loss": 1.8076, + "step": 15474 + }, + { + "epoch": 4.749846531614487, + "grad_norm": 0.20902042090892792, + "learning_rate": 
5.646840074851246e-05, + "loss": 1.6627, + "step": 15475 + }, + { + "epoch": 4.750153468385513, + "grad_norm": 0.21608836948871613, + "learning_rate": 5.646347192605198e-05, + "loss": 1.7458, + "step": 15476 + }, + { + "epoch": 4.750460405156538, + "grad_norm": 0.22368495166301727, + "learning_rate": 5.6458543039715694e-05, + "loss": 1.7601, + "step": 15477 + }, + { + "epoch": 4.750767341927563, + "grad_norm": 0.30586308240890503, + "learning_rate": 5.645361408955231e-05, + "loss": 1.8389, + "step": 15478 + }, + { + "epoch": 4.751074278698588, + "grad_norm": 0.25122150778770447, + "learning_rate": 5.644868507561052e-05, + "loss": 1.7509, + "step": 15479 + }, + { + "epoch": 4.751381215469613, + "grad_norm": 0.28435763716697693, + "learning_rate": 5.644375599793904e-05, + "loss": 1.7723, + "step": 15480 + }, + { + "epoch": 4.7516881522406385, + "grad_norm": 0.3111409842967987, + "learning_rate": 5.643882685658659e-05, + "loss": 1.7973, + "step": 15481 + }, + { + "epoch": 4.751995089011664, + "grad_norm": 0.3108380138874054, + "learning_rate": 5.6433897651601874e-05, + "loss": 1.8126, + "step": 15482 + }, + { + "epoch": 4.752302025782689, + "grad_norm": 0.25894731283187866, + "learning_rate": 5.642896838303362e-05, + "loss": 1.7849, + "step": 15483 + }, + { + "epoch": 4.752608962553714, + "grad_norm": 0.39321839809417725, + "learning_rate": 5.642403905093052e-05, + "loss": 1.7583, + "step": 15484 + }, + { + "epoch": 4.752915899324739, + "grad_norm": 0.3206121027469635, + "learning_rate": 5.6419109655341315e-05, + "loss": 1.8061, + "step": 15485 + }, + { + "epoch": 4.753222836095764, + "grad_norm": 0.2817624807357788, + "learning_rate": 5.64141801963147e-05, + "loss": 1.8252, + "step": 15486 + }, + { + "epoch": 4.75352977286679, + "grad_norm": 0.3344736397266388, + "learning_rate": 5.6409250673899405e-05, + "loss": 1.6975, + "step": 15487 + }, + { + "epoch": 4.753836709637815, + "grad_norm": 0.21873882412910461, + "learning_rate": 5.640432108814413e-05, + "loss": 
1.7126, + "step": 15488 + }, + { + "epoch": 4.75414364640884, + "grad_norm": 0.3317199945449829, + "learning_rate": 5.639939143909758e-05, + "loss": 1.7826, + "step": 15489 + }, + { + "epoch": 4.754450583179865, + "grad_norm": 0.34901630878448486, + "learning_rate": 5.639446172680854e-05, + "loss": 1.7411, + "step": 15490 + }, + { + "epoch": 4.75475751995089, + "grad_norm": 0.24015867710113525, + "learning_rate": 5.6389531951325645e-05, + "loss": 1.7514, + "step": 15491 + }, + { + "epoch": 4.755064456721915, + "grad_norm": 0.28364554047584534, + "learning_rate": 5.6384602112697674e-05, + "loss": 1.7569, + "step": 15492 + }, + { + "epoch": 4.755371393492941, + "grad_norm": 0.3561246693134308, + "learning_rate": 5.637967221097329e-05, + "loss": 1.7212, + "step": 15493 + }, + { + "epoch": 4.755678330263965, + "grad_norm": 0.3383684456348419, + "learning_rate": 5.637474224620126e-05, + "loss": 1.6866, + "step": 15494 + }, + { + "epoch": 4.7559852670349905, + "grad_norm": 0.2399235963821411, + "learning_rate": 5.63698122184303e-05, + "loss": 1.7609, + "step": 15495 + }, + { + "epoch": 4.756292203806016, + "grad_norm": 0.38559645414352417, + "learning_rate": 5.636488212770912e-05, + "loss": 1.7509, + "step": 15496 + }, + { + "epoch": 4.756599140577041, + "grad_norm": 0.365005224943161, + "learning_rate": 5.635995197408645e-05, + "loss": 1.7894, + "step": 15497 + }, + { + "epoch": 4.7569060773480665, + "grad_norm": 0.21254757046699524, + "learning_rate": 5.635502175761099e-05, + "loss": 1.6969, + "step": 15498 + }, + { + "epoch": 4.757213014119092, + "grad_norm": 0.42865821719169617, + "learning_rate": 5.635009147833149e-05, + "loss": 1.7989, + "step": 15499 + }, + { + "epoch": 4.757519950890116, + "grad_norm": 0.35717228055000305, + "learning_rate": 5.634516113629665e-05, + "loss": 1.7338, + "step": 15500 + }, + { + "epoch": 4.757826887661142, + "grad_norm": 0.21582463383674622, + "learning_rate": 5.634023073155523e-05, + "loss": 1.7429, + "step": 15501 + }, + { + 
"epoch": 4.758133824432167, + "grad_norm": 0.3376842141151428, + "learning_rate": 5.633530026415592e-05, + "loss": 1.7703, + "step": 15502 + }, + { + "epoch": 4.758440761203192, + "grad_norm": 0.2760981023311615, + "learning_rate": 5.633036973414747e-05, + "loss": 1.7389, + "step": 15503 + }, + { + "epoch": 4.758747697974218, + "grad_norm": 0.3808997571468353, + "learning_rate": 5.63254391415786e-05, + "loss": 1.7513, + "step": 15504 + }, + { + "epoch": 4.759054634745242, + "grad_norm": 0.5152496695518494, + "learning_rate": 5.6320508486498014e-05, + "loss": 1.7376, + "step": 15505 + }, + { + "epoch": 4.759361571516267, + "grad_norm": 0.33983346819877625, + "learning_rate": 5.6315577768954464e-05, + "loss": 1.7209, + "step": 15506 + }, + { + "epoch": 4.759668508287293, + "grad_norm": 0.27064043283462524, + "learning_rate": 5.631064698899669e-05, + "loss": 1.7808, + "step": 15507 + }, + { + "epoch": 4.759975445058318, + "grad_norm": 0.3659237027168274, + "learning_rate": 5.630571614667339e-05, + "loss": 1.7706, + "step": 15508 + }, + { + "epoch": 4.760282381829343, + "grad_norm": 0.246379554271698, + "learning_rate": 5.63007852420333e-05, + "loss": 1.7425, + "step": 15509 + }, + { + "epoch": 4.760589318600369, + "grad_norm": 0.2683795392513275, + "learning_rate": 5.629585427512518e-05, + "loss": 1.7332, + "step": 15510 + }, + { + "epoch": 4.760896255371393, + "grad_norm": 0.32626205682754517, + "learning_rate": 5.6290923245997704e-05, + "loss": 1.786, + "step": 15511 + }, + { + "epoch": 4.7612031921424185, + "grad_norm": 0.23723098635673523, + "learning_rate": 5.6285992154699666e-05, + "loss": 1.7305, + "step": 15512 + }, + { + "epoch": 4.761510128913444, + "grad_norm": 0.26316091418266296, + "learning_rate": 5.628106100127976e-05, + "loss": 1.7804, + "step": 15513 + }, + { + "epoch": 4.761817065684469, + "grad_norm": 0.24376356601715088, + "learning_rate": 5.6276129785786726e-05, + "loss": 1.738, + "step": 15514 + }, + { + "epoch": 4.7621240024554945, + 
"grad_norm": 0.27778422832489014, + "learning_rate": 5.627119850826931e-05, + "loss": 1.7444, + "step": 15515 + }, + { + "epoch": 4.762430939226519, + "grad_norm": 0.3134306073188782, + "learning_rate": 5.6266267168776224e-05, + "loss": 1.7696, + "step": 15516 + }, + { + "epoch": 4.762737875997544, + "grad_norm": 0.2354283481836319, + "learning_rate": 5.6261335767356195e-05, + "loss": 1.799, + "step": 15517 + }, + { + "epoch": 4.76304481276857, + "grad_norm": 0.26902756094932556, + "learning_rate": 5.6256404304058e-05, + "loss": 1.7091, + "step": 15518 + }, + { + "epoch": 4.763351749539595, + "grad_norm": 0.2760716676712036, + "learning_rate": 5.6251472778930345e-05, + "loss": 1.742, + "step": 15519 + }, + { + "epoch": 4.76365868631062, + "grad_norm": 0.2138829231262207, + "learning_rate": 5.624654119202197e-05, + "loss": 1.7093, + "step": 15520 + }, + { + "epoch": 4.763965623081646, + "grad_norm": 0.31404614448547363, + "learning_rate": 5.624160954338162e-05, + "loss": 1.7467, + "step": 15521 + }, + { + "epoch": 4.76427255985267, + "grad_norm": 0.24810083210468292, + "learning_rate": 5.623667783305803e-05, + "loss": 1.745, + "step": 15522 + }, + { + "epoch": 4.764579496623695, + "grad_norm": 0.23674242198467255, + "learning_rate": 5.6231746061099913e-05, + "loss": 1.7662, + "step": 15523 + }, + { + "epoch": 4.764886433394721, + "grad_norm": 0.264230877161026, + "learning_rate": 5.622681422755606e-05, + "loss": 1.7627, + "step": 15524 + }, + { + "epoch": 4.765193370165746, + "grad_norm": 0.2982041537761688, + "learning_rate": 5.6221882332475165e-05, + "loss": 1.7558, + "step": 15525 + }, + { + "epoch": 4.765500306936771, + "grad_norm": 0.29215967655181885, + "learning_rate": 5.6216950375905975e-05, + "loss": 1.7981, + "step": 15526 + }, + { + "epoch": 4.765807243707796, + "grad_norm": 0.20014487206935883, + "learning_rate": 5.6212018357897244e-05, + "loss": 1.7113, + "step": 15527 + }, + { + "epoch": 4.766114180478821, + "grad_norm": 0.22359825670719147, + 
"learning_rate": 5.620708627849769e-05, + "loss": 1.7356, + "step": 15528 + }, + { + "epoch": 4.7664211172498465, + "grad_norm": 0.2254783809185028, + "learning_rate": 5.620215413775609e-05, + "loss": 1.7397, + "step": 15529 + }, + { + "epoch": 4.766728054020872, + "grad_norm": 0.2827560305595398, + "learning_rate": 5.619722193572117e-05, + "loss": 1.732, + "step": 15530 + }, + { + "epoch": 4.767034990791897, + "grad_norm": 0.22591307759284973, + "learning_rate": 5.619228967244165e-05, + "loss": 1.7713, + "step": 15531 + }, + { + "epoch": 4.7673419275629225, + "grad_norm": 0.25872737169265747, + "learning_rate": 5.618735734796632e-05, + "loss": 1.7291, + "step": 15532 + }, + { + "epoch": 4.767648864333947, + "grad_norm": 0.24515275657176971, + "learning_rate": 5.6182424962343884e-05, + "loss": 1.8079, + "step": 15533 + }, + { + "epoch": 4.767955801104972, + "grad_norm": 0.2456643134355545, + "learning_rate": 5.617749251562309e-05, + "loss": 1.7082, + "step": 15534 + }, + { + "epoch": 4.768262737875998, + "grad_norm": 0.21684220433235168, + "learning_rate": 5.6172560007852716e-05, + "loss": 1.7563, + "step": 15535 + }, + { + "epoch": 4.768569674647023, + "grad_norm": 0.2141445428133011, + "learning_rate": 5.616762743908147e-05, + "loss": 1.7115, + "step": 15536 + }, + { + "epoch": 4.768876611418047, + "grad_norm": 0.22502638399600983, + "learning_rate": 5.616269480935812e-05, + "loss": 1.723, + "step": 15537 + }, + { + "epoch": 4.769183548189073, + "grad_norm": 0.23387989401817322, + "learning_rate": 5.6157762118731416e-05, + "loss": 1.7775, + "step": 15538 + }, + { + "epoch": 4.769490484960098, + "grad_norm": 0.19615057110786438, + "learning_rate": 5.6152829367250096e-05, + "loss": 1.7696, + "step": 15539 + }, + { + "epoch": 4.769797421731123, + "grad_norm": 0.2408154010772705, + "learning_rate": 5.614789655496289e-05, + "loss": 1.7758, + "step": 15540 + }, + { + "epoch": 4.770104358502149, + "grad_norm": 0.20994634926319122, + "learning_rate": 
5.614296368191859e-05, + "loss": 1.6935, + "step": 15541 + }, + { + "epoch": 4.770411295273174, + "grad_norm": 0.24135129153728485, + "learning_rate": 5.613803074816591e-05, + "loss": 1.7644, + "step": 15542 + }, + { + "epoch": 4.7707182320441985, + "grad_norm": 0.2380143105983734, + "learning_rate": 5.6133097753753625e-05, + "loss": 1.741, + "step": 15543 + }, + { + "epoch": 4.771025168815224, + "grad_norm": 0.30300623178482056, + "learning_rate": 5.6128164698730465e-05, + "loss": 1.7935, + "step": 15544 + }, + { + "epoch": 4.771332105586249, + "grad_norm": 0.2620760500431061, + "learning_rate": 5.612323158314519e-05, + "loss": 1.7436, + "step": 15545 + }, + { + "epoch": 4.7716390423572745, + "grad_norm": 0.3791491389274597, + "learning_rate": 5.6118298407046544e-05, + "loss": 1.7503, + "step": 15546 + }, + { + "epoch": 4.7719459791283, + "grad_norm": 0.3830909729003906, + "learning_rate": 5.61133651704833e-05, + "loss": 1.7651, + "step": 15547 + }, + { + "epoch": 4.772252915899324, + "grad_norm": 0.26680612564086914, + "learning_rate": 5.610843187350419e-05, + "loss": 1.8075, + "step": 15548 + }, + { + "epoch": 4.77255985267035, + "grad_norm": 0.38018953800201416, + "learning_rate": 5.610349851615798e-05, + "loss": 1.8301, + "step": 15549 + }, + { + "epoch": 4.772866789441375, + "grad_norm": 0.4514484107494354, + "learning_rate": 5.6098565098493414e-05, + "loss": 1.7709, + "step": 15550 + }, + { + "epoch": 4.7731737262124, + "grad_norm": 0.28267863392829895, + "learning_rate": 5.6093631620559254e-05, + "loss": 1.8087, + "step": 15551 + }, + { + "epoch": 4.773480662983426, + "grad_norm": 0.22541162371635437, + "learning_rate": 5.6088698082404256e-05, + "loss": 1.7457, + "step": 15552 + }, + { + "epoch": 4.773787599754451, + "grad_norm": 0.3012544512748718, + "learning_rate": 5.608376448407718e-05, + "loss": 1.7454, + "step": 15553 + }, + { + "epoch": 4.774094536525475, + "grad_norm": 0.2460169941186905, + "learning_rate": 5.607883082562677e-05, + "loss": 1.8237, + 
"step": 15554 + }, + { + "epoch": 4.774401473296501, + "grad_norm": 0.2918507158756256, + "learning_rate": 5.6073897107101804e-05, + "loss": 1.7416, + "step": 15555 + }, + { + "epoch": 4.774708410067526, + "grad_norm": 0.3104710280895233, + "learning_rate": 5.6068963328551016e-05, + "loss": 1.8162, + "step": 15556 + }, + { + "epoch": 4.7750153468385514, + "grad_norm": 0.2576459050178528, + "learning_rate": 5.606402949002317e-05, + "loss": 1.7732, + "step": 15557 + }, + { + "epoch": 4.775322283609577, + "grad_norm": 0.2373739629983902, + "learning_rate": 5.605909559156706e-05, + "loss": 1.7812, + "step": 15558 + }, + { + "epoch": 4.775629220380601, + "grad_norm": 0.30436694622039795, + "learning_rate": 5.6054161633231385e-05, + "loss": 1.7606, + "step": 15559 + }, + { + "epoch": 4.775936157151627, + "grad_norm": 0.3058558702468872, + "learning_rate": 5.604922761506495e-05, + "loss": 1.8384, + "step": 15560 + }, + { + "epoch": 4.776243093922652, + "grad_norm": 0.26421624422073364, + "learning_rate": 5.6044293537116496e-05, + "loss": 1.8041, + "step": 15561 + }, + { + "epoch": 4.776550030693677, + "grad_norm": 0.4945085346698761, + "learning_rate": 5.603935939943479e-05, + "loss": 1.7522, + "step": 15562 + }, + { + "epoch": 4.776856967464703, + "grad_norm": 0.41049134731292725, + "learning_rate": 5.6034425202068595e-05, + "loss": 1.7471, + "step": 15563 + }, + { + "epoch": 4.777163904235728, + "grad_norm": 0.22972853481769562, + "learning_rate": 5.602949094506668e-05, + "loss": 1.7041, + "step": 15564 + }, + { + "epoch": 4.777470841006752, + "grad_norm": 0.37373700737953186, + "learning_rate": 5.6024556628477785e-05, + "loss": 1.7811, + "step": 15565 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.3603375554084778, + "learning_rate": 5.6019622252350714e-05, + "loss": 1.8396, + "step": 15566 + }, + { + "epoch": 4.778084714548803, + "grad_norm": 0.2085956335067749, + "learning_rate": 5.601468781673419e-05, + "loss": 1.7453, + "step": 15567 + }, + { + "epoch": 
4.778391651319828, + "grad_norm": 0.28871405124664307, + "learning_rate": 5.6009753321677e-05, + "loss": 1.7135, + "step": 15568 + }, + { + "epoch": 4.778698588090853, + "grad_norm": 0.2378411591053009, + "learning_rate": 5.600481876722791e-05, + "loss": 1.77, + "step": 15569 + }, + { + "epoch": 4.779005524861878, + "grad_norm": 0.2902696430683136, + "learning_rate": 5.599988415343567e-05, + "loss": 1.7416, + "step": 15570 + }, + { + "epoch": 4.7793124616329035, + "grad_norm": 0.36155447363853455, + "learning_rate": 5.5994949480349066e-05, + "loss": 1.7095, + "step": 15571 + }, + { + "epoch": 4.779619398403929, + "grad_norm": 0.24867403507232666, + "learning_rate": 5.599001474801686e-05, + "loss": 1.8063, + "step": 15572 + }, + { + "epoch": 4.779926335174954, + "grad_norm": 0.24853186309337616, + "learning_rate": 5.5985079956487815e-05, + "loss": 1.7537, + "step": 15573 + }, + { + "epoch": 4.7802332719459795, + "grad_norm": 0.31984636187553406, + "learning_rate": 5.598014510581071e-05, + "loss": 1.7888, + "step": 15574 + }, + { + "epoch": 4.780540208717004, + "grad_norm": 0.23907123506069183, + "learning_rate": 5.597521019603429e-05, + "loss": 1.7157, + "step": 15575 + }, + { + "epoch": 4.780847145488029, + "grad_norm": 0.25759413838386536, + "learning_rate": 5.597027522720736e-05, + "loss": 1.7579, + "step": 15576 + }, + { + "epoch": 4.781154082259055, + "grad_norm": 0.34123921394348145, + "learning_rate": 5.5965340199378654e-05, + "loss": 1.838, + "step": 15577 + }, + { + "epoch": 4.78146101903008, + "grad_norm": 0.2769980728626251, + "learning_rate": 5.596040511259697e-05, + "loss": 1.7889, + "step": 15578 + }, + { + "epoch": 4.781767955801105, + "grad_norm": 0.21936915814876556, + "learning_rate": 5.5955469966911066e-05, + "loss": 1.7434, + "step": 15579 + }, + { + "epoch": 4.78207489257213, + "grad_norm": 0.27583181858062744, + "learning_rate": 5.59505347623697e-05, + "loss": 1.7229, + "step": 15580 + }, + { + "epoch": 4.782381829343155, + "grad_norm": 
0.24246171116828918, + "learning_rate": 5.594559949902168e-05, + "loss": 1.7368, + "step": 15581 + }, + { + "epoch": 4.78268876611418, + "grad_norm": 0.22705630958080292, + "learning_rate": 5.594066417691576e-05, + "loss": 1.7261, + "step": 15582 + }, + { + "epoch": 4.782995702885206, + "grad_norm": 0.23308728635311127, + "learning_rate": 5.593572879610072e-05, + "loss": 1.7451, + "step": 15583 + }, + { + "epoch": 4.783302639656231, + "grad_norm": 0.21654267609119415, + "learning_rate": 5.5930793356625324e-05, + "loss": 1.7133, + "step": 15584 + }, + { + "epoch": 4.783609576427256, + "grad_norm": 0.22884133458137512, + "learning_rate": 5.5925857858538347e-05, + "loss": 1.6899, + "step": 15585 + }, + { + "epoch": 4.783916513198281, + "grad_norm": 0.2396838665008545, + "learning_rate": 5.5920922301888555e-05, + "loss": 1.7837, + "step": 15586 + }, + { + "epoch": 4.784223449969306, + "grad_norm": 0.22941450774669647, + "learning_rate": 5.5915986686724765e-05, + "loss": 1.7443, + "step": 15587 + }, + { + "epoch": 4.7845303867403315, + "grad_norm": 0.23992502689361572, + "learning_rate": 5.591105101309572e-05, + "loss": 1.8054, + "step": 15588 + }, + { + "epoch": 4.784837323511357, + "grad_norm": 0.2540588974952698, + "learning_rate": 5.59061152810502e-05, + "loss": 1.855, + "step": 15589 + }, + { + "epoch": 4.785144260282382, + "grad_norm": 0.22691720724105835, + "learning_rate": 5.590117949063699e-05, + "loss": 1.7441, + "step": 15590 + }, + { + "epoch": 4.785451197053407, + "grad_norm": 0.23691289126873016, + "learning_rate": 5.5896243641904864e-05, + "loss": 1.8156, + "step": 15591 + }, + { + "epoch": 4.785758133824432, + "grad_norm": 0.2749332785606384, + "learning_rate": 5.589130773490261e-05, + "loss": 1.8157, + "step": 15592 + }, + { + "epoch": 4.786065070595457, + "grad_norm": 0.2435624748468399, + "learning_rate": 5.588637176967899e-05, + "loss": 1.7473, + "step": 15593 + }, + { + "epoch": 4.786372007366483, + "grad_norm": 0.22931383550167084, + 
"learning_rate": 5.5881435746282795e-05, + "loss": 1.7652, + "step": 15594 + }, + { + "epoch": 4.786678944137508, + "grad_norm": 0.23916593194007874, + "learning_rate": 5.587649966476282e-05, + "loss": 1.7415, + "step": 15595 + }, + { + "epoch": 4.786985880908533, + "grad_norm": 0.23483172059059143, + "learning_rate": 5.5871563525167814e-05, + "loss": 1.7308, + "step": 15596 + }, + { + "epoch": 4.787292817679558, + "grad_norm": 0.24850021302700043, + "learning_rate": 5.586662732754656e-05, + "loss": 1.8294, + "step": 15597 + }, + { + "epoch": 4.787599754450583, + "grad_norm": 0.2439260333776474, + "learning_rate": 5.586169107194788e-05, + "loss": 1.7599, + "step": 15598 + }, + { + "epoch": 4.787906691221608, + "grad_norm": 0.22379007935523987, + "learning_rate": 5.585675475842054e-05, + "loss": 1.7278, + "step": 15599 + }, + { + "epoch": 4.788213627992634, + "grad_norm": 0.2633908689022064, + "learning_rate": 5.58518183870133e-05, + "loss": 1.7318, + "step": 15600 + }, + { + "epoch": 4.788520564763659, + "grad_norm": 0.20992474257946014, + "learning_rate": 5.584688195777497e-05, + "loss": 1.7003, + "step": 15601 + }, + { + "epoch": 4.7888275015346835, + "grad_norm": 0.2460084706544876, + "learning_rate": 5.584194547075432e-05, + "loss": 1.78, + "step": 15602 + }, + { + "epoch": 4.789134438305709, + "grad_norm": 0.23955418169498444, + "learning_rate": 5.583700892600013e-05, + "loss": 1.7953, + "step": 15603 + }, + { + "epoch": 4.789441375076734, + "grad_norm": 0.2495713233947754, + "learning_rate": 5.583207232356121e-05, + "loss": 1.7874, + "step": 15604 + }, + { + "epoch": 4.7897483118477595, + "grad_norm": 0.22878028452396393, + "learning_rate": 5.5827135663486344e-05, + "loss": 1.7961, + "step": 15605 + }, + { + "epoch": 4.790055248618785, + "grad_norm": 0.2299363762140274, + "learning_rate": 5.582219894582429e-05, + "loss": 1.7497, + "step": 15606 + }, + { + "epoch": 4.79036218538981, + "grad_norm": 0.22896108031272888, + "learning_rate": 5.5817262170623865e-05, 
+ "loss": 1.7543, + "step": 15607 + }, + { + "epoch": 4.790669122160835, + "grad_norm": 0.2150495946407318, + "learning_rate": 5.581232533793383e-05, + "loss": 1.8034, + "step": 15608 + }, + { + "epoch": 4.79097605893186, + "grad_norm": 0.21317999064922333, + "learning_rate": 5.580738844780301e-05, + "loss": 1.7482, + "step": 15609 + }, + { + "epoch": 4.791282995702885, + "grad_norm": 0.21904391050338745, + "learning_rate": 5.580245150028016e-05, + "loss": 1.7647, + "step": 15610 + }, + { + "epoch": 4.791589932473911, + "grad_norm": 0.2026481032371521, + "learning_rate": 5.5797514495414095e-05, + "loss": 1.6997, + "step": 15611 + }, + { + "epoch": 4.791896869244935, + "grad_norm": 0.22508487105369568, + "learning_rate": 5.579257743325359e-05, + "loss": 1.8258, + "step": 15612 + }, + { + "epoch": 4.79220380601596, + "grad_norm": 0.2801211178302765, + "learning_rate": 5.5787640313847435e-05, + "loss": 1.6991, + "step": 15613 + }, + { + "epoch": 4.792510742786986, + "grad_norm": 0.2696724236011505, + "learning_rate": 5.578270313724442e-05, + "loss": 1.7339, + "step": 15614 + }, + { + "epoch": 4.792817679558011, + "grad_norm": 0.2909143269062042, + "learning_rate": 5.577776590349334e-05, + "loss": 1.8481, + "step": 15615 + }, + { + "epoch": 4.793124616329036, + "grad_norm": 0.21682757139205933, + "learning_rate": 5.5772828612643005e-05, + "loss": 1.759, + "step": 15616 + }, + { + "epoch": 4.793431553100062, + "grad_norm": 0.23074059188365936, + "learning_rate": 5.576789126474219e-05, + "loss": 1.7652, + "step": 15617 + }, + { + "epoch": 4.793738489871086, + "grad_norm": 0.24018999934196472, + "learning_rate": 5.576295385983969e-05, + "loss": 1.7986, + "step": 15618 + }, + { + "epoch": 4.7940454266421115, + "grad_norm": 0.23987948894500732, + "learning_rate": 5.575801639798431e-05, + "loss": 1.779, + "step": 15619 + }, + { + "epoch": 4.794352363413137, + "grad_norm": 0.2138533890247345, + "learning_rate": 5.575307887922482e-05, + "loss": 1.7097, + "step": 15620 + }, + { 
+ "epoch": 4.794659300184162, + "grad_norm": 0.1995106190443039, + "learning_rate": 5.5748141303610044e-05, + "loss": 1.6924, + "step": 15621 + }, + { + "epoch": 4.7949662369551875, + "grad_norm": 0.23547641932964325, + "learning_rate": 5.574320367118877e-05, + "loss": 1.8492, + "step": 15622 + }, + { + "epoch": 4.795273173726212, + "grad_norm": 0.22931239008903503, + "learning_rate": 5.5738265982009794e-05, + "loss": 1.8054, + "step": 15623 + }, + { + "epoch": 4.795580110497237, + "grad_norm": 0.19957222044467926, + "learning_rate": 5.573332823612191e-05, + "loss": 1.7464, + "step": 15624 + }, + { + "epoch": 4.795887047268263, + "grad_norm": 0.1990327090024948, + "learning_rate": 5.5728390433573905e-05, + "loss": 1.7438, + "step": 15625 + }, + { + "epoch": 4.796193984039288, + "grad_norm": 0.22276802361011505, + "learning_rate": 5.572345257441459e-05, + "loss": 1.7674, + "step": 15626 + }, + { + "epoch": 4.796500920810313, + "grad_norm": 0.2109617441892624, + "learning_rate": 5.571851465869277e-05, + "loss": 1.7577, + "step": 15627 + }, + { + "epoch": 4.796807857581339, + "grad_norm": 0.22917217016220093, + "learning_rate": 5.5713576686457234e-05, + "loss": 1.7478, + "step": 15628 + }, + { + "epoch": 4.797114794352363, + "grad_norm": 0.21016938984394073, + "learning_rate": 5.570863865775678e-05, + "loss": 1.8078, + "step": 15629 + }, + { + "epoch": 4.797421731123388, + "grad_norm": 0.22478216886520386, + "learning_rate": 5.5703700572640215e-05, + "loss": 1.7621, + "step": 15630 + }, + { + "epoch": 4.797728667894414, + "grad_norm": 0.26899904012680054, + "learning_rate": 5.569876243115634e-05, + "loss": 1.8065, + "step": 15631 + }, + { + "epoch": 4.798035604665439, + "grad_norm": 0.23187808692455292, + "learning_rate": 5.569382423335394e-05, + "loss": 1.7337, + "step": 15632 + }, + { + "epoch": 4.798342541436464, + "grad_norm": 0.2264855057001114, + "learning_rate": 5.568888597928185e-05, + "loss": 1.7879, + "step": 15633 + }, + { + "epoch": 4.798649478207489, + 
"grad_norm": 0.244137242436409, + "learning_rate": 5.568394766898886e-05, + "loss": 1.8307, + "step": 15634 + }, + { + "epoch": 4.798956414978514, + "grad_norm": 0.2400583177804947, + "learning_rate": 5.5679009302523744e-05, + "loss": 1.76, + "step": 15635 + }, + { + "epoch": 4.7992633517495396, + "grad_norm": 0.2324059158563614, + "learning_rate": 5.5674070879935347e-05, + "loss": 1.7594, + "step": 15636 + }, + { + "epoch": 4.799570288520565, + "grad_norm": 0.21753786504268646, + "learning_rate": 5.566913240127244e-05, + "loss": 1.7568, + "step": 15637 + }, + { + "epoch": 4.79987722529159, + "grad_norm": 0.21557624638080597, + "learning_rate": 5.566419386658386e-05, + "loss": 1.7733, + "step": 15638 + }, + { + "epoch": 4.800184162062616, + "grad_norm": 0.22795113921165466, + "learning_rate": 5.565925527591839e-05, + "loss": 1.7624, + "step": 15639 + }, + { + "epoch": 4.80049109883364, + "grad_norm": 0.23035180568695068, + "learning_rate": 5.565431662932484e-05, + "loss": 1.7436, + "step": 15640 + }, + { + "epoch": 4.800798035604665, + "grad_norm": 0.2569425404071808, + "learning_rate": 5.564937792685203e-05, + "loss": 1.7027, + "step": 15641 + }, + { + "epoch": 4.801104972375691, + "grad_norm": 0.20544980466365814, + "learning_rate": 5.564443916854875e-05, + "loss": 1.7125, + "step": 15642 + }, + { + "epoch": 4.801411909146716, + "grad_norm": 0.25040850043296814, + "learning_rate": 5.5639500354463815e-05, + "loss": 1.7646, + "step": 15643 + }, + { + "epoch": 4.8017188459177405, + "grad_norm": 0.1991344839334488, + "learning_rate": 5.563456148464602e-05, + "loss": 1.7206, + "step": 15644 + }, + { + "epoch": 4.802025782688766, + "grad_norm": 0.236537903547287, + "learning_rate": 5.56296225591442e-05, + "loss": 1.7288, + "step": 15645 + }, + { + "epoch": 4.802332719459791, + "grad_norm": 0.253619521856308, + "learning_rate": 5.562468357800714e-05, + "loss": 1.7347, + "step": 15646 + }, + { + "epoch": 4.8026396562308165, + "grad_norm": 0.22038741409778595, + 
"learning_rate": 5.561974454128367e-05, + "loss": 1.7854, + "step": 15647 + }, + { + "epoch": 4.802946593001842, + "grad_norm": 0.24848157167434692, + "learning_rate": 5.5614805449022576e-05, + "loss": 1.6904, + "step": 15648 + }, + { + "epoch": 4.803253529772867, + "grad_norm": 0.28735271096229553, + "learning_rate": 5.56098663012727e-05, + "loss": 1.7476, + "step": 15649 + }, + { + "epoch": 4.803560466543892, + "grad_norm": 0.2658432722091675, + "learning_rate": 5.5604927098082825e-05, + "loss": 1.7314, + "step": 15650 + }, + { + "epoch": 4.803867403314917, + "grad_norm": 0.20409154891967773, + "learning_rate": 5.559998783950179e-05, + "loss": 1.7698, + "step": 15651 + }, + { + "epoch": 4.804174340085942, + "grad_norm": 0.21932728588581085, + "learning_rate": 5.5595048525578384e-05, + "loss": 1.7808, + "step": 15652 + }, + { + "epoch": 4.804481276856968, + "grad_norm": 0.2549879848957062, + "learning_rate": 5.559010915636143e-05, + "loss": 1.8294, + "step": 15653 + }, + { + "epoch": 4.804788213627993, + "grad_norm": 0.2002289742231369, + "learning_rate": 5.5585169731899736e-05, + "loss": 1.732, + "step": 15654 + }, + { + "epoch": 4.805095150399017, + "grad_norm": 0.19988931715488434, + "learning_rate": 5.558023025224212e-05, + "loss": 1.7482, + "step": 15655 + }, + { + "epoch": 4.805402087170043, + "grad_norm": 0.21265259385108948, + "learning_rate": 5.55752907174374e-05, + "loss": 1.8003, + "step": 15656 + }, + { + "epoch": 4.805709023941068, + "grad_norm": 0.22365640103816986, + "learning_rate": 5.5570351127534395e-05, + "loss": 1.7536, + "step": 15657 + }, + { + "epoch": 4.806015960712093, + "grad_norm": 0.25516408681869507, + "learning_rate": 5.556541148258192e-05, + "loss": 1.7648, + "step": 15658 + }, + { + "epoch": 4.806322897483119, + "grad_norm": 0.24870765209197998, + "learning_rate": 5.5560471782628775e-05, + "loss": 1.7793, + "step": 15659 + }, + { + "epoch": 4.806629834254144, + "grad_norm": 0.22119416296482086, + "learning_rate": 
5.555553202772379e-05, + "loss": 1.7464, + "step": 15660 + }, + { + "epoch": 4.8069367710251685, + "grad_norm": 0.2781904637813568, + "learning_rate": 5.555059221791579e-05, + "loss": 1.7537, + "step": 15661 + }, + { + "epoch": 4.807243707796194, + "grad_norm": 0.2433774471282959, + "learning_rate": 5.5545652353253574e-05, + "loss": 1.74, + "step": 15662 + }, + { + "epoch": 4.807550644567219, + "grad_norm": 0.19932180643081665, + "learning_rate": 5.554071243378598e-05, + "loss": 1.75, + "step": 15663 + }, + { + "epoch": 4.8078575813382445, + "grad_norm": 0.2428865283727646, + "learning_rate": 5.553577245956182e-05, + "loss": 1.7198, + "step": 15664 + }, + { + "epoch": 4.80816451810927, + "grad_norm": 0.2914198338985443, + "learning_rate": 5.553083243062991e-05, + "loss": 1.7544, + "step": 15665 + }, + { + "epoch": 4.808471454880294, + "grad_norm": 0.2274291068315506, + "learning_rate": 5.5525892347039056e-05, + "loss": 1.8213, + "step": 15666 + }, + { + "epoch": 4.80877839165132, + "grad_norm": 0.23662471771240234, + "learning_rate": 5.552095220883811e-05, + "loss": 1.8025, + "step": 15667 + }, + { + "epoch": 4.809085328422345, + "grad_norm": 0.23062555491924286, + "learning_rate": 5.551601201607587e-05, + "loss": 1.7109, + "step": 15668 + }, + { + "epoch": 4.80939226519337, + "grad_norm": 0.19986943900585175, + "learning_rate": 5.551107176880117e-05, + "loss": 1.7442, + "step": 15669 + }, + { + "epoch": 4.809699201964396, + "grad_norm": 0.2545560300350189, + "learning_rate": 5.5506131467062836e-05, + "loss": 1.7609, + "step": 15670 + }, + { + "epoch": 4.810006138735421, + "grad_norm": 0.253296434879303, + "learning_rate": 5.550119111090968e-05, + "loss": 1.7307, + "step": 15671 + }, + { + "epoch": 4.810313075506445, + "grad_norm": 0.19617940485477448, + "learning_rate": 5.549625070039052e-05, + "loss": 1.7507, + "step": 15672 + }, + { + "epoch": 4.810620012277471, + "grad_norm": 0.2525297999382019, + "learning_rate": 5.5491310235554193e-05, + "loss": 1.8021, + 
"step": 15673 + }, + { + "epoch": 4.810926949048496, + "grad_norm": 0.20537389814853668, + "learning_rate": 5.548636971644953e-05, + "loss": 1.7432, + "step": 15674 + }, + { + "epoch": 4.811233885819521, + "grad_norm": 0.19924211502075195, + "learning_rate": 5.548142914312533e-05, + "loss": 1.7741, + "step": 15675 + }, + { + "epoch": 4.811540822590547, + "grad_norm": 0.21121448278427124, + "learning_rate": 5.547648851563046e-05, + "loss": 1.7198, + "step": 15676 + }, + { + "epoch": 4.811847759361571, + "grad_norm": 0.23504914343357086, + "learning_rate": 5.547154783401369e-05, + "loss": 1.7173, + "step": 15677 + }, + { + "epoch": 4.8121546961325965, + "grad_norm": 0.2362392097711563, + "learning_rate": 5.54666070983239e-05, + "loss": 1.7752, + "step": 15678 + }, + { + "epoch": 4.812461632903622, + "grad_norm": 0.2524966895580292, + "learning_rate": 5.5461666308609886e-05, + "loss": 1.7943, + "step": 15679 + }, + { + "epoch": 4.812768569674647, + "grad_norm": 0.2250952422618866, + "learning_rate": 5.5456725464920476e-05, + "loss": 1.7606, + "step": 15680 + }, + { + "epoch": 4.8130755064456725, + "grad_norm": 0.21753156185150146, + "learning_rate": 5.5451784567304524e-05, + "loss": 1.7846, + "step": 15681 + }, + { + "epoch": 4.813382443216698, + "grad_norm": 0.220795676112175, + "learning_rate": 5.5446843615810825e-05, + "loss": 1.7422, + "step": 15682 + }, + { + "epoch": 4.813689379987722, + "grad_norm": 0.23597733676433563, + "learning_rate": 5.544190261048823e-05, + "loss": 1.7818, + "step": 15683 + }, + { + "epoch": 4.813996316758748, + "grad_norm": 0.2625976502895355, + "learning_rate": 5.543696155138557e-05, + "loss": 1.7796, + "step": 15684 + }, + { + "epoch": 4.814303253529773, + "grad_norm": 0.20515871047973633, + "learning_rate": 5.5432020438551656e-05, + "loss": 1.7096, + "step": 15685 + }, + { + "epoch": 4.814610190300798, + "grad_norm": 0.19353924691677094, + "learning_rate": 5.542707927203536e-05, + "loss": 1.7541, + "step": 15686 + }, + { + "epoch": 
4.814917127071823, + "grad_norm": 0.21998172998428345, + "learning_rate": 5.5422138051885454e-05, + "loss": 1.7696, + "step": 15687 + }, + { + "epoch": 4.815224063842848, + "grad_norm": 0.27576857805252075, + "learning_rate": 5.5417196778150816e-05, + "loss": 1.7491, + "step": 15688 + }, + { + "epoch": 4.815531000613873, + "grad_norm": 0.28202036023139954, + "learning_rate": 5.5412255450880254e-05, + "loss": 1.8615, + "step": 15689 + }, + { + "epoch": 4.815837937384899, + "grad_norm": 0.29632845520973206, + "learning_rate": 5.540731407012263e-05, + "loss": 1.7698, + "step": 15690 + }, + { + "epoch": 4.816144874155924, + "grad_norm": 0.35393890738487244, + "learning_rate": 5.540237263592675e-05, + "loss": 1.7924, + "step": 15691 + }, + { + "epoch": 4.816451810926949, + "grad_norm": 0.23756493628025055, + "learning_rate": 5.5397431148341447e-05, + "loss": 1.8301, + "step": 15692 + }, + { + "epoch": 4.816758747697974, + "grad_norm": 0.310153603553772, + "learning_rate": 5.53924896074156e-05, + "loss": 1.8162, + "step": 15693 + }, + { + "epoch": 4.817065684468999, + "grad_norm": 0.3355565369129181, + "learning_rate": 5.538754801319797e-05, + "loss": 1.7738, + "step": 15694 + }, + { + "epoch": 4.8173726212400245, + "grad_norm": 0.2360079288482666, + "learning_rate": 5.5382606365737446e-05, + "loss": 1.6883, + "step": 15695 + }, + { + "epoch": 4.81767955801105, + "grad_norm": 0.2932819724082947, + "learning_rate": 5.537766466508286e-05, + "loss": 1.8045, + "step": 15696 + }, + { + "epoch": 4.817986494782075, + "grad_norm": 0.31298181414604187, + "learning_rate": 5.537272291128304e-05, + "loss": 1.7516, + "step": 15697 + }, + { + "epoch": 4.8182934315531, + "grad_norm": 0.22871924936771393, + "learning_rate": 5.5367781104386806e-05, + "loss": 1.7386, + "step": 15698 + }, + { + "epoch": 4.818600368324125, + "grad_norm": 0.27097782492637634, + "learning_rate": 5.5362839244443034e-05, + "loss": 1.733, + "step": 15699 + }, + { + "epoch": 4.81890730509515, + "grad_norm": 
0.23296736180782318, + "learning_rate": 5.535789733150052e-05, + "loss": 1.7735, + "step": 15700 + }, + { + "epoch": 4.819214241866176, + "grad_norm": 0.22650237381458282, + "learning_rate": 5.5352955365608125e-05, + "loss": 1.7443, + "step": 15701 + }, + { + "epoch": 4.819521178637201, + "grad_norm": 0.25525161623954773, + "learning_rate": 5.534801334681471e-05, + "loss": 1.7379, + "step": 15702 + }, + { + "epoch": 4.819828115408226, + "grad_norm": 0.2249457836151123, + "learning_rate": 5.534307127516908e-05, + "loss": 1.7393, + "step": 15703 + }, + { + "epoch": 4.820135052179251, + "grad_norm": 0.1995566338300705, + "learning_rate": 5.5338129150720084e-05, + "loss": 1.7411, + "step": 15704 + }, + { + "epoch": 4.820441988950276, + "grad_norm": 0.250851035118103, + "learning_rate": 5.533318697351657e-05, + "loss": 1.7801, + "step": 15705 + }, + { + "epoch": 4.820748925721301, + "grad_norm": 0.3175830543041229, + "learning_rate": 5.532824474360737e-05, + "loss": 1.7553, + "step": 15706 + }, + { + "epoch": 4.821055862492327, + "grad_norm": 0.22842039167881012, + "learning_rate": 5.532330246104134e-05, + "loss": 1.7489, + "step": 15707 + }, + { + "epoch": 4.821362799263352, + "grad_norm": 0.21125485002994537, + "learning_rate": 5.531836012586732e-05, + "loss": 1.7543, + "step": 15708 + }, + { + "epoch": 4.8216697360343765, + "grad_norm": 0.33028700947761536, + "learning_rate": 5.531341773813414e-05, + "loss": 1.8237, + "step": 15709 + }, + { + "epoch": 4.821976672805402, + "grad_norm": 0.324564129114151, + "learning_rate": 5.530847529789067e-05, + "loss": 1.7288, + "step": 15710 + }, + { + "epoch": 4.822283609576427, + "grad_norm": 0.3299528956413269, + "learning_rate": 5.530353280518571e-05, + "loss": 1.7536, + "step": 15711 + }, + { + "epoch": 4.8225905463474525, + "grad_norm": 0.3535030782222748, + "learning_rate": 5.5298590260068136e-05, + "loss": 1.7941, + "step": 15712 + }, + { + "epoch": 4.822897483118478, + "grad_norm": 0.2627669870853424, + "learning_rate": 
5.5293647662586804e-05, + "loss": 1.7638, + "step": 15713 + }, + { + "epoch": 4.823204419889503, + "grad_norm": 0.25569450855255127, + "learning_rate": 5.5288705012790535e-05, + "loss": 1.7396, + "step": 15714 + }, + { + "epoch": 4.823511356660528, + "grad_norm": 0.26099520921707153, + "learning_rate": 5.528376231072817e-05, + "loss": 1.7415, + "step": 15715 + }, + { + "epoch": 4.823818293431553, + "grad_norm": 0.31833693385124207, + "learning_rate": 5.527881955644858e-05, + "loss": 1.7683, + "step": 15716 + }, + { + "epoch": 4.824125230202578, + "grad_norm": 0.2753448188304901, + "learning_rate": 5.5273876750000594e-05, + "loss": 1.6653, + "step": 15717 + }, + { + "epoch": 4.824432166973604, + "grad_norm": 0.23816895484924316, + "learning_rate": 5.526893389143307e-05, + "loss": 1.7575, + "step": 15718 + }, + { + "epoch": 4.824739103744628, + "grad_norm": 0.25376051664352417, + "learning_rate": 5.5263990980794856e-05, + "loss": 1.755, + "step": 15719 + }, + { + "epoch": 4.8250460405156534, + "grad_norm": 0.2483726590871811, + "learning_rate": 5.52590480181348e-05, + "loss": 1.7566, + "step": 15720 + }, + { + "epoch": 4.825352977286679, + "grad_norm": 0.2073517143726349, + "learning_rate": 5.5254105003501746e-05, + "loss": 1.7069, + "step": 15721 + }, + { + "epoch": 4.825659914057704, + "grad_norm": 0.3166659474372864, + "learning_rate": 5.524916193694455e-05, + "loss": 1.7012, + "step": 15722 + }, + { + "epoch": 4.8259668508287294, + "grad_norm": 0.24518641829490662, + "learning_rate": 5.524421881851205e-05, + "loss": 1.7027, + "step": 15723 + }, + { + "epoch": 4.826273787599755, + "grad_norm": 0.23137906193733215, + "learning_rate": 5.523927564825311e-05, + "loss": 1.746, + "step": 15724 + }, + { + "epoch": 4.82658072437078, + "grad_norm": 0.27937051653862, + "learning_rate": 5.5234332426216586e-05, + "loss": 1.7064, + "step": 15725 + }, + { + "epoch": 4.826887661141805, + "grad_norm": 0.26408496499061584, + "learning_rate": 5.522938915245131e-05, + "loss": 
1.6598, + "step": 15726 + }, + { + "epoch": 4.82719459791283, + "grad_norm": 0.22269997000694275, + "learning_rate": 5.5224445827006164e-05, + "loss": 1.7166, + "step": 15727 + }, + { + "epoch": 4.827501534683855, + "grad_norm": 0.22687453031539917, + "learning_rate": 5.5219502449929964e-05, + "loss": 1.7156, + "step": 15728 + }, + { + "epoch": 4.827808471454881, + "grad_norm": 0.26355600357055664, + "learning_rate": 5.5214559021271585e-05, + "loss": 1.8016, + "step": 15729 + }, + { + "epoch": 4.828115408225905, + "grad_norm": 0.30103012919425964, + "learning_rate": 5.520961554107987e-05, + "loss": 1.7856, + "step": 15730 + }, + { + "epoch": 4.82842234499693, + "grad_norm": 0.22604018449783325, + "learning_rate": 5.520467200940369e-05, + "loss": 1.813, + "step": 15731 + }, + { + "epoch": 4.828729281767956, + "grad_norm": 0.25435203313827515, + "learning_rate": 5.51997284262919e-05, + "loss": 1.7511, + "step": 15732 + }, + { + "epoch": 4.829036218538981, + "grad_norm": 0.2740691304206848, + "learning_rate": 5.519478479179333e-05, + "loss": 1.7326, + "step": 15733 + }, + { + "epoch": 4.829343155310006, + "grad_norm": 0.19710861146450043, + "learning_rate": 5.5189841105956866e-05, + "loss": 1.7581, + "step": 15734 + }, + { + "epoch": 4.829650092081032, + "grad_norm": 0.2315293401479721, + "learning_rate": 5.518489736883132e-05, + "loss": 1.6796, + "step": 15735 + }, + { + "epoch": 4.829957028852056, + "grad_norm": 0.2465476542711258, + "learning_rate": 5.51799535804656e-05, + "loss": 1.7276, + "step": 15736 + }, + { + "epoch": 4.8302639656230815, + "grad_norm": 0.20438486337661743, + "learning_rate": 5.5175009740908546e-05, + "loss": 1.7188, + "step": 15737 + }, + { + "epoch": 4.830570902394107, + "grad_norm": 0.24328351020812988, + "learning_rate": 5.5170065850209016e-05, + "loss": 1.7165, + "step": 15738 + }, + { + "epoch": 4.830877839165132, + "grad_norm": 0.22486837208271027, + "learning_rate": 5.516512190841586e-05, + "loss": 1.7369, + "step": 15739 + }, + { + 
"epoch": 4.8311847759361575, + "grad_norm": 0.2065822333097458, + "learning_rate": 5.5160177915577934e-05, + "loss": 1.7125, + "step": 15740 + }, + { + "epoch": 4.831491712707182, + "grad_norm": 0.21223095059394836, + "learning_rate": 5.5155233871744104e-05, + "loss": 1.7319, + "step": 15741 + }, + { + "epoch": 4.831798649478207, + "grad_norm": 0.25712934136390686, + "learning_rate": 5.515028977696325e-05, + "loss": 1.7847, + "step": 15742 + }, + { + "epoch": 4.832105586249233, + "grad_norm": 0.21289978921413422, + "learning_rate": 5.5145345631284215e-05, + "loss": 1.7629, + "step": 15743 + }, + { + "epoch": 4.832412523020258, + "grad_norm": 0.22347134351730347, + "learning_rate": 5.514040143475585e-05, + "loss": 1.7491, + "step": 15744 + }, + { + "epoch": 4.832719459791283, + "grad_norm": 0.20660510659217834, + "learning_rate": 5.513545718742702e-05, + "loss": 1.7377, + "step": 15745 + }, + { + "epoch": 4.833026396562309, + "grad_norm": 0.21612273156642914, + "learning_rate": 5.513051288934658e-05, + "loss": 1.7973, + "step": 15746 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.22515933215618134, + "learning_rate": 5.512556854056342e-05, + "loss": 1.7774, + "step": 15747 + }, + { + "epoch": 4.833640270104358, + "grad_norm": 0.21075554192066193, + "learning_rate": 5.512062414112639e-05, + "loss": 1.7741, + "step": 15748 + }, + { + "epoch": 4.833947206875384, + "grad_norm": 0.2203720659017563, + "learning_rate": 5.511567969108436e-05, + "loss": 1.7902, + "step": 15749 + }, + { + "epoch": 4.834254143646409, + "grad_norm": 0.20247167348861694, + "learning_rate": 5.511073519048616e-05, + "loss": 1.7084, + "step": 15750 + }, + { + "epoch": 4.834561080417434, + "grad_norm": 0.247711181640625, + "learning_rate": 5.5105790639380695e-05, + "loss": 1.8465, + "step": 15751 + }, + { + "epoch": 4.834868017188459, + "grad_norm": 0.22866854071617126, + "learning_rate": 5.51008460378168e-05, + "loss": 1.7252, + "step": 15752 + }, + { + "epoch": 4.835174953959484, + 
"grad_norm": 0.2335643470287323, + "learning_rate": 5.5095901385843374e-05, + "loss": 1.703, + "step": 15753 + }, + { + "epoch": 4.8354818907305095, + "grad_norm": 0.20874348282814026, + "learning_rate": 5.509095668350926e-05, + "loss": 1.7114, + "step": 15754 + }, + { + "epoch": 4.835788827501535, + "grad_norm": 0.19156917929649353, + "learning_rate": 5.5086011930863314e-05, + "loss": 1.6975, + "step": 15755 + }, + { + "epoch": 4.83609576427256, + "grad_norm": 0.23480524122714996, + "learning_rate": 5.508106712795443e-05, + "loss": 1.8291, + "step": 15756 + }, + { + "epoch": 4.8364027010435855, + "grad_norm": 0.20430417358875275, + "learning_rate": 5.5076122274831454e-05, + "loss": 1.7605, + "step": 15757 + }, + { + "epoch": 4.83670963781461, + "grad_norm": 0.26790598034858704, + "learning_rate": 5.5071177371543256e-05, + "loss": 1.7541, + "step": 15758 + }, + { + "epoch": 4.837016574585635, + "grad_norm": 0.3339289724826813, + "learning_rate": 5.506623241813873e-05, + "loss": 1.7566, + "step": 15759 + }, + { + "epoch": 4.837323511356661, + "grad_norm": 0.30528193712234497, + "learning_rate": 5.5061287414666726e-05, + "loss": 1.7371, + "step": 15760 + }, + { + "epoch": 4.837630448127686, + "grad_norm": 0.21059657633304596, + "learning_rate": 5.5056342361176114e-05, + "loss": 1.7599, + "step": 15761 + }, + { + "epoch": 4.83793738489871, + "grad_norm": 0.27918973565101624, + "learning_rate": 5.5051397257715756e-05, + "loss": 1.7485, + "step": 15762 + }, + { + "epoch": 4.838244321669736, + "grad_norm": 0.23147793114185333, + "learning_rate": 5.5046452104334514e-05, + "loss": 1.7121, + "step": 15763 + }, + { + "epoch": 4.838551258440761, + "grad_norm": 0.22028742730617523, + "learning_rate": 5.5041506901081294e-05, + "loss": 1.803, + "step": 15764 + }, + { + "epoch": 4.838858195211786, + "grad_norm": 0.22840891778469086, + "learning_rate": 5.5036561648004946e-05, + "loss": 1.7555, + "step": 15765 + }, + { + "epoch": 4.839165131982812, + "grad_norm": 
0.2610893249511719, + "learning_rate": 5.503161634515433e-05, + "loss": 1.7873, + "step": 15766 + }, + { + "epoch": 4.839472068753837, + "grad_norm": 0.2530003786087036, + "learning_rate": 5.502667099257836e-05, + "loss": 1.7604, + "step": 15767 + }, + { + "epoch": 4.8397790055248615, + "grad_norm": 0.20120400190353394, + "learning_rate": 5.5021725590325854e-05, + "loss": 1.7476, + "step": 15768 + }, + { + "epoch": 4.840085942295887, + "grad_norm": 0.2189723700284958, + "learning_rate": 5.501678013844571e-05, + "loss": 1.7174, + "step": 15769 + }, + { + "epoch": 4.840392879066912, + "grad_norm": 0.2511899173259735, + "learning_rate": 5.501183463698683e-05, + "loss": 1.7589, + "step": 15770 + }, + { + "epoch": 4.8406998158379375, + "grad_norm": 0.24899333715438843, + "learning_rate": 5.5006889085998035e-05, + "loss": 1.7253, + "step": 15771 + }, + { + "epoch": 4.841006752608963, + "grad_norm": 0.21223559975624084, + "learning_rate": 5.5001943485528254e-05, + "loss": 1.6949, + "step": 15772 + }, + { + "epoch": 4.841313689379987, + "grad_norm": 0.21394596993923187, + "learning_rate": 5.499699783562632e-05, + "loss": 1.7827, + "step": 15773 + }, + { + "epoch": 4.841620626151013, + "grad_norm": 0.2379613220691681, + "learning_rate": 5.4992052136341134e-05, + "loss": 1.7968, + "step": 15774 + }, + { + "epoch": 4.841927562922038, + "grad_norm": 0.23748385906219482, + "learning_rate": 5.498710638772154e-05, + "loss": 1.797, + "step": 15775 + }, + { + "epoch": 4.842234499693063, + "grad_norm": 0.2502206265926361, + "learning_rate": 5.498216058981646e-05, + "loss": 1.7292, + "step": 15776 + }, + { + "epoch": 4.842541436464089, + "grad_norm": 0.23613516986370087, + "learning_rate": 5.497721474267475e-05, + "loss": 1.7353, + "step": 15777 + }, + { + "epoch": 4.842848373235114, + "grad_norm": 0.25274696946144104, + "learning_rate": 5.497226884634527e-05, + "loss": 1.7782, + "step": 15778 + }, + { + "epoch": 4.843155310006138, + "grad_norm": 0.19574183225631714, + 
"learning_rate": 5.496732290087694e-05, + "loss": 1.6926, + "step": 15779 + }, + { + "epoch": 4.843462246777164, + "grad_norm": 0.21040405333042145, + "learning_rate": 5.496237690631858e-05, + "loss": 1.7235, + "step": 15780 + }, + { + "epoch": 4.843769183548189, + "grad_norm": 0.22499679028987885, + "learning_rate": 5.495743086271913e-05, + "loss": 1.7889, + "step": 15781 + }, + { + "epoch": 4.844076120319214, + "grad_norm": 0.24623246490955353, + "learning_rate": 5.4952484770127433e-05, + "loss": 1.7357, + "step": 15782 + }, + { + "epoch": 4.84438305709024, + "grad_norm": 0.21706275641918182, + "learning_rate": 5.494753862859238e-05, + "loss": 1.7349, + "step": 15783 + }, + { + "epoch": 4.844689993861264, + "grad_norm": 0.20705166459083557, + "learning_rate": 5.4942592438162855e-05, + "loss": 1.7047, + "step": 15784 + }, + { + "epoch": 4.8449969306322895, + "grad_norm": 0.21216751635074615, + "learning_rate": 5.493764619888773e-05, + "loss": 1.7335, + "step": 15785 + }, + { + "epoch": 4.845303867403315, + "grad_norm": 0.2945895195007324, + "learning_rate": 5.493269991081588e-05, + "loss": 1.838, + "step": 15786 + }, + { + "epoch": 4.84561080417434, + "grad_norm": 0.22013652324676514, + "learning_rate": 5.492775357399621e-05, + "loss": 1.7541, + "step": 15787 + }, + { + "epoch": 4.8459177409453655, + "grad_norm": 0.25428512692451477, + "learning_rate": 5.4922807188477585e-05, + "loss": 1.7405, + "step": 15788 + }, + { + "epoch": 4.846224677716391, + "grad_norm": 0.23189012706279755, + "learning_rate": 5.49178607543089e-05, + "loss": 1.8075, + "step": 15789 + }, + { + "epoch": 4.846531614487415, + "grad_norm": 0.21637389063835144, + "learning_rate": 5.491291427153904e-05, + "loss": 1.7229, + "step": 15790 + }, + { + "epoch": 4.846838551258441, + "grad_norm": 0.20628009736537933, + "learning_rate": 5.490796774021687e-05, + "loss": 1.7605, + "step": 15791 + }, + { + "epoch": 4.847145488029466, + "grad_norm": 0.20845308899879456, + "learning_rate": 
5.4903021160391276e-05, + "loss": 1.7864, + "step": 15792 + }, + { + "epoch": 4.847452424800491, + "grad_norm": 0.20367322862148285, + "learning_rate": 5.4898074532111164e-05, + "loss": 1.733, + "step": 15793 + }, + { + "epoch": 4.847759361571516, + "grad_norm": 0.2066505253314972, + "learning_rate": 5.489312785542543e-05, + "loss": 1.7113, + "step": 15794 + }, + { + "epoch": 4.848066298342541, + "grad_norm": 0.23874987661838531, + "learning_rate": 5.488818113038292e-05, + "loss": 1.7735, + "step": 15795 + }, + { + "epoch": 4.848373235113566, + "grad_norm": 0.26583850383758545, + "learning_rate": 5.488323435703254e-05, + "loss": 1.8019, + "step": 15796 + }, + { + "epoch": 4.848680171884592, + "grad_norm": 0.25207552313804626, + "learning_rate": 5.487828753542317e-05, + "loss": 1.7491, + "step": 15797 + }, + { + "epoch": 4.848987108655617, + "grad_norm": 0.23065905272960663, + "learning_rate": 5.48733406656037e-05, + "loss": 1.7451, + "step": 15798 + }, + { + "epoch": 4.849294045426642, + "grad_norm": 0.26914483308792114, + "learning_rate": 5.486839374762304e-05, + "loss": 1.7553, + "step": 15799 + }, + { + "epoch": 4.849600982197668, + "grad_norm": 0.2509605884552002, + "learning_rate": 5.4863446781530046e-05, + "loss": 1.7124, + "step": 15800 + }, + { + "epoch": 4.849907918968692, + "grad_norm": 0.2618432343006134, + "learning_rate": 5.485849976737362e-05, + "loss": 1.7368, + "step": 15801 + }, + { + "epoch": 4.850214855739718, + "grad_norm": 0.46875160932540894, + "learning_rate": 5.485355270520266e-05, + "loss": 1.7883, + "step": 15802 + }, + { + "epoch": 4.850521792510743, + "grad_norm": 0.37585484981536865, + "learning_rate": 5.4848605595066025e-05, + "loss": 1.7894, + "step": 15803 + }, + { + "epoch": 4.850828729281768, + "grad_norm": 0.2244408279657364, + "learning_rate": 5.4843658437012646e-05, + "loss": 1.7394, + "step": 15804 + }, + { + "epoch": 4.851135666052793, + "grad_norm": 0.4061773419380188, + "learning_rate": 5.48387112310914e-05, + "loss": 
1.7703, + "step": 15805 + }, + { + "epoch": 4.851442602823818, + "grad_norm": 0.35925009846687317, + "learning_rate": 5.483376397735117e-05, + "loss": 1.7798, + "step": 15806 + }, + { + "epoch": 4.851749539594843, + "grad_norm": 0.23050184547901154, + "learning_rate": 5.482881667584084e-05, + "loss": 1.7984, + "step": 15807 + }, + { + "epoch": 4.852056476365869, + "grad_norm": 0.37308645248413086, + "learning_rate": 5.4823869326609335e-05, + "loss": 1.6747, + "step": 15808 + }, + { + "epoch": 4.852363413136894, + "grad_norm": 0.29826754331588745, + "learning_rate": 5.481892192970551e-05, + "loss": 1.7432, + "step": 15809 + }, + { + "epoch": 4.852670349907919, + "grad_norm": 0.23652370274066925, + "learning_rate": 5.4813974485178266e-05, + "loss": 1.7557, + "step": 15810 + }, + { + "epoch": 4.852977286678944, + "grad_norm": 0.40549808740615845, + "learning_rate": 5.4809026993076526e-05, + "loss": 1.7317, + "step": 15811 + }, + { + "epoch": 4.853284223449969, + "grad_norm": 0.3367961347103119, + "learning_rate": 5.4804079453449156e-05, + "loss": 1.7648, + "step": 15812 + }, + { + "epoch": 4.8535911602209945, + "grad_norm": 0.21629661321640015, + "learning_rate": 5.4799131866345055e-05, + "loss": 1.7986, + "step": 15813 + }, + { + "epoch": 4.85389809699202, + "grad_norm": 0.26381492614746094, + "learning_rate": 5.4794184231813105e-05, + "loss": 1.7401, + "step": 15814 + }, + { + "epoch": 4.854205033763045, + "grad_norm": 0.22319363057613373, + "learning_rate": 5.478923654990223e-05, + "loss": 1.7773, + "step": 15815 + }, + { + "epoch": 4.85451197053407, + "grad_norm": 0.2547159492969513, + "learning_rate": 5.4784288820661326e-05, + "loss": 1.8194, + "step": 15816 + }, + { + "epoch": 4.854818907305095, + "grad_norm": 0.29574522376060486, + "learning_rate": 5.477934104413925e-05, + "loss": 1.7351, + "step": 15817 + }, + { + "epoch": 4.85512584407612, + "grad_norm": 0.17389361560344696, + "learning_rate": 5.4774393220384945e-05, + "loss": 1.6957, + "step": 15818 + }, + { 
+ "epoch": 4.855432780847146, + "grad_norm": 0.23746751248836517, + "learning_rate": 5.476944534944728e-05, + "loss": 1.7713, + "step": 15819 + }, + { + "epoch": 4.855739717618171, + "grad_norm": 0.182356595993042, + "learning_rate": 5.476449743137516e-05, + "loss": 1.7144, + "step": 15820 + }, + { + "epoch": 4.856046654389196, + "grad_norm": 0.23716382682323456, + "learning_rate": 5.4759549466217475e-05, + "loss": 1.7451, + "step": 15821 + }, + { + "epoch": 4.856353591160221, + "grad_norm": 0.316806823015213, + "learning_rate": 5.475460145402313e-05, + "loss": 1.7823, + "step": 15822 + }, + { + "epoch": 4.856660527931246, + "grad_norm": 0.2333129197359085, + "learning_rate": 5.474965339484105e-05, + "loss": 1.7788, + "step": 15823 + }, + { + "epoch": 4.856967464702271, + "grad_norm": 0.21180212497711182, + "learning_rate": 5.47447052887201e-05, + "loss": 1.7513, + "step": 15824 + }, + { + "epoch": 4.857274401473297, + "grad_norm": 0.22641299664974213, + "learning_rate": 5.473975713570919e-05, + "loss": 1.7514, + "step": 15825 + }, + { + "epoch": 4.857581338244322, + "grad_norm": 0.3179668188095093, + "learning_rate": 5.473480893585723e-05, + "loss": 1.7939, + "step": 15826 + }, + { + "epoch": 4.8578882750153465, + "grad_norm": 0.27463147044181824, + "learning_rate": 5.472986068921309e-05, + "loss": 1.7487, + "step": 15827 + }, + { + "epoch": 4.858195211786372, + "grad_norm": 0.18621626496315002, + "learning_rate": 5.472491239582572e-05, + "loss": 1.7155, + "step": 15828 + }, + { + "epoch": 4.858502148557397, + "grad_norm": 0.2437327802181244, + "learning_rate": 5.471996405574399e-05, + "loss": 1.7586, + "step": 15829 + }, + { + "epoch": 4.8588090853284225, + "grad_norm": 0.26658934354782104, + "learning_rate": 5.47150156690168e-05, + "loss": 1.7331, + "step": 15830 + }, + { + "epoch": 4.859116022099448, + "grad_norm": 0.2257174700498581, + "learning_rate": 5.471006723569308e-05, + "loss": 1.7556, + "step": 15831 + }, + { + "epoch": 4.859422958870473, + 
"grad_norm": 0.25434550642967224, + "learning_rate": 5.470511875582168e-05, + "loss": 1.7196, + "step": 15832 + }, + { + "epoch": 4.859729895641498, + "grad_norm": 0.2251453697681427, + "learning_rate": 5.470017022945156e-05, + "loss": 1.7174, + "step": 15833 + }, + { + "epoch": 4.860036832412523, + "grad_norm": 0.2757972180843353, + "learning_rate": 5.469522165663161e-05, + "loss": 1.7701, + "step": 15834 + }, + { + "epoch": 4.860343769183548, + "grad_norm": 0.2771994173526764, + "learning_rate": 5.469027303741072e-05, + "loss": 1.8085, + "step": 15835 + }, + { + "epoch": 4.860650705954574, + "grad_norm": 0.23825454711914062, + "learning_rate": 5.468532437183781e-05, + "loss": 1.733, + "step": 15836 + }, + { + "epoch": 4.860957642725598, + "grad_norm": 0.18100066483020782, + "learning_rate": 5.468037565996177e-05, + "loss": 1.7012, + "step": 15837 + }, + { + "epoch": 4.861264579496623, + "grad_norm": 0.22552812099456787, + "learning_rate": 5.4675426901831506e-05, + "loss": 1.728, + "step": 15838 + }, + { + "epoch": 4.861571516267649, + "grad_norm": 0.2505643665790558, + "learning_rate": 5.467047809749595e-05, + "loss": 1.7219, + "step": 15839 + }, + { + "epoch": 4.861878453038674, + "grad_norm": 0.25920796394348145, + "learning_rate": 5.4665529247003975e-05, + "loss": 1.7945, + "step": 15840 + }, + { + "epoch": 4.862185389809699, + "grad_norm": 0.23549394309520721, + "learning_rate": 5.466058035040452e-05, + "loss": 1.7904, + "step": 15841 + }, + { + "epoch": 4.862492326580725, + "grad_norm": 0.26510992646217346, + "learning_rate": 5.465563140774648e-05, + "loss": 1.8051, + "step": 15842 + }, + { + "epoch": 4.862799263351749, + "grad_norm": 0.19175390899181366, + "learning_rate": 5.465068241907876e-05, + "loss": 1.6799, + "step": 15843 + }, + { + "epoch": 4.8631062001227745, + "grad_norm": 0.2588976323604584, + "learning_rate": 5.464573338445025e-05, + "loss": 1.7394, + "step": 15844 + }, + { + "epoch": 4.8634131368938, + "grad_norm": 0.28729483485221863, + 
"learning_rate": 5.464078430390991e-05, + "loss": 1.797, + "step": 15845 + }, + { + "epoch": 4.863720073664825, + "grad_norm": 0.21302445232868195, + "learning_rate": 5.463583517750661e-05, + "loss": 1.7303, + "step": 15846 + }, + { + "epoch": 4.8640270104358505, + "grad_norm": 0.2407636195421219, + "learning_rate": 5.463088600528926e-05, + "loss": 1.7175, + "step": 15847 + }, + { + "epoch": 4.864333947206875, + "grad_norm": 0.25653502345085144, + "learning_rate": 5.4625936787306784e-05, + "loss": 1.6996, + "step": 15848 + }, + { + "epoch": 4.8646408839779, + "grad_norm": 0.2100832760334015, + "learning_rate": 5.462098752360809e-05, + "loss": 1.7416, + "step": 15849 + }, + { + "epoch": 4.864947820748926, + "grad_norm": 0.2785186469554901, + "learning_rate": 5.461603821424208e-05, + "loss": 1.74, + "step": 15850 + }, + { + "epoch": 4.865254757519951, + "grad_norm": 0.2896614968776703, + "learning_rate": 5.4611088859257696e-05, + "loss": 1.7436, + "step": 15851 + }, + { + "epoch": 4.865561694290976, + "grad_norm": 0.18890418112277985, + "learning_rate": 5.460613945870382e-05, + "loss": 1.7093, + "step": 15852 + }, + { + "epoch": 4.865868631062002, + "grad_norm": 0.27681079506874084, + "learning_rate": 5.4601190012629364e-05, + "loss": 1.8772, + "step": 15853 + }, + { + "epoch": 4.866175567833026, + "grad_norm": 0.24658115208148956, + "learning_rate": 5.4596240521083265e-05, + "loss": 1.776, + "step": 15854 + }, + { + "epoch": 4.866482504604051, + "grad_norm": 0.21958144009113312, + "learning_rate": 5.459129098411441e-05, + "loss": 1.7503, + "step": 15855 + }, + { + "epoch": 4.866789441375077, + "grad_norm": 0.2778300642967224, + "learning_rate": 5.458634140177174e-05, + "loss": 1.8194, + "step": 15856 + }, + { + "epoch": 4.867096378146102, + "grad_norm": 0.28673580288887024, + "learning_rate": 5.458139177410414e-05, + "loss": 1.8033, + "step": 15857 + }, + { + "epoch": 4.867403314917127, + "grad_norm": 0.24472850561141968, + "learning_rate": 5.457644210116055e-05, + 
"loss": 1.7304, + "step": 15858 + }, + { + "epoch": 4.867710251688152, + "grad_norm": 0.24581189453601837, + "learning_rate": 5.4571492382989886e-05, + "loss": 1.7443, + "step": 15859 + }, + { + "epoch": 4.868017188459177, + "grad_norm": 0.22296221554279327, + "learning_rate": 5.4566542619641045e-05, + "loss": 1.7201, + "step": 15860 + }, + { + "epoch": 4.8683241252302025, + "grad_norm": 0.2378673404455185, + "learning_rate": 5.456159281116295e-05, + "loss": 1.7893, + "step": 15861 + }, + { + "epoch": 4.868631062001228, + "grad_norm": 0.3320823907852173, + "learning_rate": 5.4556642957604534e-05, + "loss": 1.7944, + "step": 15862 + }, + { + "epoch": 4.868937998772253, + "grad_norm": 0.3303453326225281, + "learning_rate": 5.45516930590147e-05, + "loss": 1.7267, + "step": 15863 + }, + { + "epoch": 4.8692449355432785, + "grad_norm": 0.223227858543396, + "learning_rate": 5.454674311544235e-05, + "loss": 1.7477, + "step": 15864 + }, + { + "epoch": 4.869551872314303, + "grad_norm": 0.3012549579143524, + "learning_rate": 5.454179312693643e-05, + "loss": 1.731, + "step": 15865 + }, + { + "epoch": 4.869858809085328, + "grad_norm": 0.3780311942100525, + "learning_rate": 5.453684309354585e-05, + "loss": 1.7296, + "step": 15866 + }, + { + "epoch": 4.870165745856354, + "grad_norm": 0.2753889262676239, + "learning_rate": 5.4531893015319526e-05, + "loss": 1.8024, + "step": 15867 + }, + { + "epoch": 4.870472682627379, + "grad_norm": 0.2270934134721756, + "learning_rate": 5.452694289230639e-05, + "loss": 1.7095, + "step": 15868 + }, + { + "epoch": 4.870779619398404, + "grad_norm": 0.2621576488018036, + "learning_rate": 5.452199272455534e-05, + "loss": 1.75, + "step": 15869 + }, + { + "epoch": 4.871086556169429, + "grad_norm": 0.22175776958465576, + "learning_rate": 5.45170425121153e-05, + "loss": 1.7658, + "step": 15870 + }, + { + "epoch": 4.871393492940454, + "grad_norm": 0.2038736790418625, + "learning_rate": 5.451209225503521e-05, + "loss": 1.6916, + "step": 15871 + }, + { + 
"epoch": 4.871700429711479, + "grad_norm": 0.2493467777967453, + "learning_rate": 5.450714195336397e-05, + "loss": 1.7408, + "step": 15872 + }, + { + "epoch": 4.872007366482505, + "grad_norm": 0.1966754049062729, + "learning_rate": 5.450219160715052e-05, + "loss": 1.7379, + "step": 15873 + }, + { + "epoch": 4.87231430325353, + "grad_norm": 0.23193517327308655, + "learning_rate": 5.4497241216443775e-05, + "loss": 1.7736, + "step": 15874 + }, + { + "epoch": 4.872621240024555, + "grad_norm": 0.2164391279220581, + "learning_rate": 5.4492290781292646e-05, + "loss": 1.7618, + "step": 15875 + }, + { + "epoch": 4.87292817679558, + "grad_norm": 0.286460816860199, + "learning_rate": 5.448734030174607e-05, + "loss": 1.7745, + "step": 15876 + }, + { + "epoch": 4.873235113566605, + "grad_norm": 0.3454538881778717, + "learning_rate": 5.448238977785298e-05, + "loss": 1.7605, + "step": 15877 + }, + { + "epoch": 4.8735420503376305, + "grad_norm": 0.26775062084198, + "learning_rate": 5.447743920966227e-05, + "loss": 1.7263, + "step": 15878 + }, + { + "epoch": 4.873848987108656, + "grad_norm": 0.2644907832145691, + "learning_rate": 5.447248859722289e-05, + "loss": 1.8489, + "step": 15879 + }, + { + "epoch": 4.87415592387968, + "grad_norm": 0.21646654605865479, + "learning_rate": 5.446753794058376e-05, + "loss": 1.7605, + "step": 15880 + }, + { + "epoch": 4.874462860650706, + "grad_norm": 0.23431318998336792, + "learning_rate": 5.446258723979381e-05, + "loss": 1.7209, + "step": 15881 + }, + { + "epoch": 4.874769797421731, + "grad_norm": 0.24665607511997223, + "learning_rate": 5.4457636494901934e-05, + "loss": 1.813, + "step": 15882 + }, + { + "epoch": 4.875076734192756, + "grad_norm": 0.26269975304603577, + "learning_rate": 5.445268570595708e-05, + "loss": 1.8255, + "step": 15883 + }, + { + "epoch": 4.875383670963782, + "grad_norm": 0.2722402811050415, + "learning_rate": 5.444773487300819e-05, + "loss": 1.7795, + "step": 15884 + }, + { + "epoch": 4.875690607734807, + "grad_norm": 
0.3235624134540558, + "learning_rate": 5.444278399610417e-05, + "loss": 1.7804, + "step": 15885 + }, + { + "epoch": 4.8759975445058314, + "grad_norm": 0.2647583782672882, + "learning_rate": 5.4437833075293964e-05, + "loss": 1.7359, + "step": 15886 + }, + { + "epoch": 4.876304481276857, + "grad_norm": 0.272370845079422, + "learning_rate": 5.443288211062649e-05, + "loss": 1.7605, + "step": 15887 + }, + { + "epoch": 4.876611418047882, + "grad_norm": 0.3147594630718231, + "learning_rate": 5.4427931102150675e-05, + "loss": 1.7118, + "step": 15888 + }, + { + "epoch": 4.8769183548189075, + "grad_norm": 0.22751441597938538, + "learning_rate": 5.442298004991544e-05, + "loss": 1.723, + "step": 15889 + }, + { + "epoch": 4.877225291589933, + "grad_norm": 0.2121521681547165, + "learning_rate": 5.441802895396972e-05, + "loss": 1.7485, + "step": 15890 + }, + { + "epoch": 4.877532228360957, + "grad_norm": 0.25370222330093384, + "learning_rate": 5.4413077814362466e-05, + "loss": 1.8064, + "step": 15891 + }, + { + "epoch": 4.877839165131983, + "grad_norm": 0.19492633640766144, + "learning_rate": 5.440812663114259e-05, + "loss": 1.6773, + "step": 15892 + }, + { + "epoch": 4.878146101903008, + "grad_norm": 0.2101750522851944, + "learning_rate": 5.440317540435901e-05, + "loss": 1.7215, + "step": 15893 + }, + { + "epoch": 4.878453038674033, + "grad_norm": 0.21150651574134827, + "learning_rate": 5.439822413406068e-05, + "loss": 1.7875, + "step": 15894 + }, + { + "epoch": 4.878759975445059, + "grad_norm": 0.21008379757404327, + "learning_rate": 5.439327282029651e-05, + "loss": 1.7108, + "step": 15895 + }, + { + "epoch": 4.879066912216084, + "grad_norm": 0.22885502874851227, + "learning_rate": 5.4388321463115453e-05, + "loss": 1.7899, + "step": 15896 + }, + { + "epoch": 4.879373848987108, + "grad_norm": 0.24868059158325195, + "learning_rate": 5.4383370062566444e-05, + "loss": 1.7368, + "step": 15897 + }, + { + "epoch": 4.879680785758134, + "grad_norm": 0.27225378155708313, + 
"learning_rate": 5.437841861869838e-05, + "loss": 1.7623, + "step": 15898 + }, + { + "epoch": 4.879987722529159, + "grad_norm": 0.23353120684623718, + "learning_rate": 5.437346713156023e-05, + "loss": 1.7908, + "step": 15899 + }, + { + "epoch": 4.880294659300184, + "grad_norm": 0.19032470881938934, + "learning_rate": 5.436851560120091e-05, + "loss": 1.7511, + "step": 15900 + }, + { + "epoch": 4.88060159607121, + "grad_norm": 0.23714862763881683, + "learning_rate": 5.4363564027669345e-05, + "loss": 1.7197, + "step": 15901 + }, + { + "epoch": 4.880908532842234, + "grad_norm": 0.24897022545337677, + "learning_rate": 5.4358612411014495e-05, + "loss": 1.7822, + "step": 15902 + }, + { + "epoch": 4.8812154696132595, + "grad_norm": 0.21433588862419128, + "learning_rate": 5.435366075128528e-05, + "loss": 1.7928, + "step": 15903 + }, + { + "epoch": 4.881522406384285, + "grad_norm": 0.30019649863243103, + "learning_rate": 5.4348709048530646e-05, + "loss": 1.8067, + "step": 15904 + }, + { + "epoch": 4.88182934315531, + "grad_norm": 0.20227669179439545, + "learning_rate": 5.4343757302799515e-05, + "loss": 1.7254, + "step": 15905 + }, + { + "epoch": 4.8821362799263355, + "grad_norm": 0.23447728157043457, + "learning_rate": 5.4338805514140836e-05, + "loss": 1.7314, + "step": 15906 + }, + { + "epoch": 4.882443216697361, + "grad_norm": 0.29545050859451294, + "learning_rate": 5.4333853682603506e-05, + "loss": 1.7659, + "step": 15907 + }, + { + "epoch": 4.882750153468385, + "grad_norm": 0.245390385389328, + "learning_rate": 5.432890180823652e-05, + "loss": 1.7264, + "step": 15908 + }, + { + "epoch": 4.883057090239411, + "grad_norm": 0.209987074136734, + "learning_rate": 5.432394989108879e-05, + "loss": 1.7174, + "step": 15909 + }, + { + "epoch": 4.883364027010436, + "grad_norm": 0.2402341365814209, + "learning_rate": 5.431899793120925e-05, + "loss": 1.7512, + "step": 15910 + }, + { + "epoch": 4.883670963781461, + "grad_norm": 0.26227688789367676, + "learning_rate": 
5.431404592864684e-05, + "loss": 1.7697, + "step": 15911 + }, + { + "epoch": 4.883977900552486, + "grad_norm": 0.2556503117084503, + "learning_rate": 5.4309093883450504e-05, + "loss": 1.8191, + "step": 15912 + }, + { + "epoch": 4.884284837323511, + "grad_norm": 0.24766884744167328, + "learning_rate": 5.4304141795669174e-05, + "loss": 1.7574, + "step": 15913 + }, + { + "epoch": 4.884591774094536, + "grad_norm": 0.19925951957702637, + "learning_rate": 5.429918966535179e-05, + "loss": 1.7249, + "step": 15914 + }, + { + "epoch": 4.884898710865562, + "grad_norm": 0.1899442970752716, + "learning_rate": 5.4294237492547294e-05, + "loss": 1.7446, + "step": 15915 + }, + { + "epoch": 4.885205647636587, + "grad_norm": 0.25900956988334656, + "learning_rate": 5.4289285277304636e-05, + "loss": 1.725, + "step": 15916 + }, + { + "epoch": 4.885512584407612, + "grad_norm": 0.2537781000137329, + "learning_rate": 5.428433301967274e-05, + "loss": 1.7861, + "step": 15917 + }, + { + "epoch": 4.885819521178637, + "grad_norm": 0.26432034373283386, + "learning_rate": 5.427938071970054e-05, + "loss": 1.7538, + "step": 15918 + }, + { + "epoch": 4.886126457949662, + "grad_norm": 0.22722363471984863, + "learning_rate": 5.4274428377437e-05, + "loss": 1.7631, + "step": 15919 + }, + { + "epoch": 4.8864333947206875, + "grad_norm": 0.24846172332763672, + "learning_rate": 5.426947599293106e-05, + "loss": 1.7833, + "step": 15920 + }, + { + "epoch": 4.886740331491713, + "grad_norm": 0.24821995198726654, + "learning_rate": 5.426452356623165e-05, + "loss": 1.7638, + "step": 15921 + }, + { + "epoch": 4.887047268262738, + "grad_norm": 0.2796781063079834, + "learning_rate": 5.425957109738773e-05, + "loss": 1.6982, + "step": 15922 + }, + { + "epoch": 4.887354205033763, + "grad_norm": 0.2875385284423828, + "learning_rate": 5.425461858644821e-05, + "loss": 1.7172, + "step": 15923 + }, + { + "epoch": 4.887661141804788, + "grad_norm": 0.21614491939544678, + "learning_rate": 5.424966603346207e-05, + "loss": 
1.7521, + "step": 15924 + }, + { + "epoch": 4.887968078575813, + "grad_norm": 0.22944390773773193, + "learning_rate": 5.4244713438478235e-05, + "loss": 1.772, + "step": 15925 + }, + { + "epoch": 4.888275015346839, + "grad_norm": 0.21566039323806763, + "learning_rate": 5.423976080154566e-05, + "loss": 1.734, + "step": 15926 + }, + { + "epoch": 4.888581952117864, + "grad_norm": 0.4253925383090973, + "learning_rate": 5.4234808122713275e-05, + "loss": 1.8017, + "step": 15927 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.239146426320076, + "learning_rate": 5.422985540203004e-05, + "loss": 1.7229, + "step": 15928 + }, + { + "epoch": 4.889195825659914, + "grad_norm": 0.2344054877758026, + "learning_rate": 5.42249026395449e-05, + "loss": 1.7111, + "step": 15929 + }, + { + "epoch": 4.889502762430939, + "grad_norm": 0.21717922389507294, + "learning_rate": 5.421994983530679e-05, + "loss": 1.7427, + "step": 15930 + }, + { + "epoch": 4.889809699201964, + "grad_norm": 0.26895472407341003, + "learning_rate": 5.421499698936466e-05, + "loss": 1.8402, + "step": 15931 + }, + { + "epoch": 4.89011663597299, + "grad_norm": 0.25761866569519043, + "learning_rate": 5.421004410176746e-05, + "loss": 1.7822, + "step": 15932 + }, + { + "epoch": 4.890423572744015, + "grad_norm": 0.24465128779411316, + "learning_rate": 5.420509117256415e-05, + "loss": 1.8074, + "step": 15933 + }, + { + "epoch": 4.8907305095150395, + "grad_norm": 0.2527398467063904, + "learning_rate": 5.4200138201803655e-05, + "loss": 1.7522, + "step": 15934 + }, + { + "epoch": 4.891037446286065, + "grad_norm": 0.23118112981319427, + "learning_rate": 5.4195185189534916e-05, + "loss": 1.7394, + "step": 15935 + }, + { + "epoch": 4.89134438305709, + "grad_norm": 0.2054537534713745, + "learning_rate": 5.419023213580691e-05, + "loss": 1.7096, + "step": 15936 + }, + { + "epoch": 4.8916513198281155, + "grad_norm": 0.2929638922214508, + "learning_rate": 5.418527904066858e-05, + "loss": 1.8733, + "step": 15937 + }, + { + 
"epoch": 4.891958256599141, + "grad_norm": 0.2957170009613037, + "learning_rate": 5.418032590416886e-05, + "loss": 1.7201, + "step": 15938 + }, + { + "epoch": 4.892265193370166, + "grad_norm": 0.2520081698894501, + "learning_rate": 5.417537272635672e-05, + "loss": 1.7034, + "step": 15939 + }, + { + "epoch": 4.892572130141191, + "grad_norm": 0.25217053294181824, + "learning_rate": 5.41704195072811e-05, + "loss": 1.8538, + "step": 15940 + }, + { + "epoch": 4.892879066912216, + "grad_norm": 0.23605379462242126, + "learning_rate": 5.416546624699093e-05, + "loss": 1.724, + "step": 15941 + }, + { + "epoch": 4.893186003683241, + "grad_norm": 0.321750283241272, + "learning_rate": 5.416051294553519e-05, + "loss": 1.806, + "step": 15942 + }, + { + "epoch": 4.893492940454267, + "grad_norm": 0.23800241947174072, + "learning_rate": 5.415555960296284e-05, + "loss": 1.7578, + "step": 15943 + }, + { + "epoch": 4.893799877225292, + "grad_norm": 0.3423094153404236, + "learning_rate": 5.4150606219322796e-05, + "loss": 1.7324, + "step": 15944 + }, + { + "epoch": 4.894106813996316, + "grad_norm": 0.453074187040329, + "learning_rate": 5.414565279466404e-05, + "loss": 1.7268, + "step": 15945 + }, + { + "epoch": 4.894413750767342, + "grad_norm": 0.21972697973251343, + "learning_rate": 5.4140699329035504e-05, + "loss": 1.6547, + "step": 15946 + }, + { + "epoch": 4.894720687538367, + "grad_norm": 0.32876282930374146, + "learning_rate": 5.413574582248616e-05, + "loss": 1.7527, + "step": 15947 + }, + { + "epoch": 4.895027624309392, + "grad_norm": 0.34035229682922363, + "learning_rate": 5.413079227506494e-05, + "loss": 1.7636, + "step": 15948 + }, + { + "epoch": 4.895334561080418, + "grad_norm": 0.2410411536693573, + "learning_rate": 5.412583868682082e-05, + "loss": 1.8114, + "step": 15949 + }, + { + "epoch": 4.895641497851443, + "grad_norm": 0.2787366211414337, + "learning_rate": 5.412088505780274e-05, + "loss": 1.7393, + "step": 15950 + }, + { + "epoch": 4.8959484346224675, + "grad_norm": 
0.23288428783416748, + "learning_rate": 5.411593138805966e-05, + "loss": 1.7413, + "step": 15951 + }, + { + "epoch": 4.896255371393493, + "grad_norm": 0.26302778720855713, + "learning_rate": 5.411097767764053e-05, + "loss": 1.7372, + "step": 15952 + }, + { + "epoch": 4.896562308164518, + "grad_norm": 0.31638020277023315, + "learning_rate": 5.410602392659431e-05, + "loss": 1.8114, + "step": 15953 + }, + { + "epoch": 4.8968692449355435, + "grad_norm": 0.23361825942993164, + "learning_rate": 5.410107013496996e-05, + "loss": 1.7592, + "step": 15954 + }, + { + "epoch": 4.897176181706568, + "grad_norm": 0.19887785613536835, + "learning_rate": 5.409611630281642e-05, + "loss": 1.7509, + "step": 15955 + }, + { + "epoch": 4.897483118477593, + "grad_norm": 0.22396783530712128, + "learning_rate": 5.409116243018266e-05, + "loss": 1.6841, + "step": 15956 + }, + { + "epoch": 4.897790055248619, + "grad_norm": 0.20397686958312988, + "learning_rate": 5.4086208517117645e-05, + "loss": 1.7427, + "step": 15957 + }, + { + "epoch": 4.898096992019644, + "grad_norm": 0.20848311483860016, + "learning_rate": 5.4081254563670314e-05, + "loss": 1.713, + "step": 15958 + }, + { + "epoch": 4.898403928790669, + "grad_norm": 0.2739275395870209, + "learning_rate": 5.407630056988964e-05, + "loss": 1.7673, + "step": 15959 + }, + { + "epoch": 4.898710865561695, + "grad_norm": 0.21485929191112518, + "learning_rate": 5.407134653582456e-05, + "loss": 1.7347, + "step": 15960 + }, + { + "epoch": 4.899017802332719, + "grad_norm": 0.26980286836624146, + "learning_rate": 5.406639246152406e-05, + "loss": 1.7158, + "step": 15961 + }, + { + "epoch": 4.899324739103744, + "grad_norm": 0.22327515482902527, + "learning_rate": 5.4061438347037084e-05, + "loss": 1.7387, + "step": 15962 + }, + { + "epoch": 4.89963167587477, + "grad_norm": 0.2542823553085327, + "learning_rate": 5.4056484192412603e-05, + "loss": 1.7826, + "step": 15963 + }, + { + "epoch": 4.899938612645795, + "grad_norm": 0.3248840868473053, + 
"learning_rate": 5.405152999769956e-05, + "loss": 1.7878, + "step": 15964 + }, + { + "epoch": 4.9002455494168204, + "grad_norm": 0.21210803091526031, + "learning_rate": 5.404657576294691e-05, + "loss": 1.7378, + "step": 15965 + }, + { + "epoch": 4.900552486187845, + "grad_norm": 0.25679782032966614, + "learning_rate": 5.404162148820365e-05, + "loss": 1.7493, + "step": 15966 + }, + { + "epoch": 4.90085942295887, + "grad_norm": 0.36698678135871887, + "learning_rate": 5.4036667173518704e-05, + "loss": 1.7662, + "step": 15967 + }, + { + "epoch": 4.901166359729896, + "grad_norm": 0.3396874964237213, + "learning_rate": 5.403171281894105e-05, + "loss": 1.7618, + "step": 15968 + }, + { + "epoch": 4.901473296500921, + "grad_norm": 0.2792030870914459, + "learning_rate": 5.402675842451964e-05, + "loss": 1.7858, + "step": 15969 + }, + { + "epoch": 4.901780233271946, + "grad_norm": 0.24499626457691193, + "learning_rate": 5.4021803990303454e-05, + "loss": 1.7503, + "step": 15970 + }, + { + "epoch": 4.902087170042972, + "grad_norm": 0.29185110330581665, + "learning_rate": 5.401684951634144e-05, + "loss": 1.7536, + "step": 15971 + }, + { + "epoch": 4.902394106813996, + "grad_norm": 0.2480020374059677, + "learning_rate": 5.401189500268256e-05, + "loss": 1.7877, + "step": 15972 + }, + { + "epoch": 4.902701043585021, + "grad_norm": 0.3302663564682007, + "learning_rate": 5.400694044937579e-05, + "loss": 1.8693, + "step": 15973 + }, + { + "epoch": 4.903007980356047, + "grad_norm": 0.2500915825366974, + "learning_rate": 5.400198585647008e-05, + "loss": 1.7489, + "step": 15974 + }, + { + "epoch": 4.903314917127072, + "grad_norm": 0.25079864263534546, + "learning_rate": 5.399703122401441e-05, + "loss": 1.7965, + "step": 15975 + }, + { + "epoch": 4.903621853898097, + "grad_norm": 0.2643207907676697, + "learning_rate": 5.399207655205771e-05, + "loss": 1.7696, + "step": 15976 + }, + { + "epoch": 4.903928790669122, + "grad_norm": 0.23719522356987, + "learning_rate": 5.398712184064899e-05, + 
"loss": 1.7608, + "step": 15977 + }, + { + "epoch": 4.904235727440147, + "grad_norm": 0.25226888060569763, + "learning_rate": 5.3982167089837184e-05, + "loss": 1.8055, + "step": 15978 + }, + { + "epoch": 4.9045426642111725, + "grad_norm": 0.21601852774620056, + "learning_rate": 5.39772122996713e-05, + "loss": 1.7553, + "step": 15979 + }, + { + "epoch": 4.904849600982198, + "grad_norm": 0.20275430381298065, + "learning_rate": 5.397225747020023e-05, + "loss": 1.7221, + "step": 15980 + }, + { + "epoch": 4.905156537753223, + "grad_norm": 0.24815937876701355, + "learning_rate": 5.3967302601473e-05, + "loss": 1.8098, + "step": 15981 + }, + { + "epoch": 4.9054634745242485, + "grad_norm": 0.2193612903356552, + "learning_rate": 5.3962347693538575e-05, + "loss": 1.7116, + "step": 15982 + }, + { + "epoch": 4.905770411295273, + "grad_norm": 0.21409118175506592, + "learning_rate": 5.395739274644589e-05, + "loss": 1.7503, + "step": 15983 + }, + { + "epoch": 4.906077348066298, + "grad_norm": 0.20907564461231232, + "learning_rate": 5.3952437760243935e-05, + "loss": 1.7518, + "step": 15984 + }, + { + "epoch": 4.906384284837324, + "grad_norm": 0.21193571388721466, + "learning_rate": 5.394748273498168e-05, + "loss": 1.6905, + "step": 15985 + }, + { + "epoch": 4.906691221608349, + "grad_norm": 0.19729891419410706, + "learning_rate": 5.394252767070808e-05, + "loss": 1.7398, + "step": 15986 + }, + { + "epoch": 4.906998158379373, + "grad_norm": 0.2654789686203003, + "learning_rate": 5.393757256747211e-05, + "loss": 1.7931, + "step": 15987 + }, + { + "epoch": 4.907305095150399, + "grad_norm": 0.2627345025539398, + "learning_rate": 5.3932617425322726e-05, + "loss": 1.8174, + "step": 15988 + }, + { + "epoch": 4.907612031921424, + "grad_norm": 0.27162298560142517, + "learning_rate": 5.392766224430894e-05, + "loss": 1.8015, + "step": 15989 + }, + { + "epoch": 4.907918968692449, + "grad_norm": 0.24248667061328888, + "learning_rate": 5.3922707024479676e-05, + "loss": 1.7457, + "step": 15990 + 
}, + { + "epoch": 4.908225905463475, + "grad_norm": 0.24715331196784973, + "learning_rate": 5.391775176588393e-05, + "loss": 1.7724, + "step": 15991 + }, + { + "epoch": 4.9085328422345, + "grad_norm": 0.26335644721984863, + "learning_rate": 5.3912796468570656e-05, + "loss": 1.7183, + "step": 15992 + }, + { + "epoch": 4.9088397790055245, + "grad_norm": 0.23459944128990173, + "learning_rate": 5.3907841132588843e-05, + "loss": 1.7245, + "step": 15993 + }, + { + "epoch": 4.90914671577655, + "grad_norm": 0.21779637038707733, + "learning_rate": 5.3902885757987444e-05, + "loss": 1.7485, + "step": 15994 + }, + { + "epoch": 4.909453652547575, + "grad_norm": 0.227664977312088, + "learning_rate": 5.389793034481545e-05, + "loss": 1.7418, + "step": 15995 + }, + { + "epoch": 4.9097605893186005, + "grad_norm": 0.26230278611183167, + "learning_rate": 5.389297489312183e-05, + "loss": 1.7619, + "step": 15996 + }, + { + "epoch": 4.910067526089626, + "grad_norm": 0.22563579678535461, + "learning_rate": 5.388801940295555e-05, + "loss": 1.7168, + "step": 15997 + }, + { + "epoch": 4.91037446286065, + "grad_norm": 0.24829435348510742, + "learning_rate": 5.388306387436556e-05, + "loss": 1.7422, + "step": 15998 + }, + { + "epoch": 4.910681399631676, + "grad_norm": 0.24395976960659027, + "learning_rate": 5.387810830740088e-05, + "loss": 1.7783, + "step": 15999 + }, + { + "epoch": 4.910988336402701, + "grad_norm": 0.2189297378063202, + "learning_rate": 5.387315270211044e-05, + "loss": 1.7885, + "step": 16000 + }, + { + "epoch": 4.911295273173726, + "grad_norm": 0.21750971674919128, + "learning_rate": 5.386819705854324e-05, + "loss": 1.7659, + "step": 16001 + }, + { + "epoch": 4.911602209944752, + "grad_norm": 0.21907657384872437, + "learning_rate": 5.386324137674826e-05, + "loss": 1.789, + "step": 16002 + }, + { + "epoch": 4.911909146715777, + "grad_norm": 0.18778781592845917, + "learning_rate": 5.3858285656774465e-05, + "loss": 1.7151, + "step": 16003 + }, + { + "epoch": 4.912216083486801, + 
"grad_norm": 0.24217712879180908, + "learning_rate": 5.385332989867082e-05, + "loss": 1.8108, + "step": 16004 + }, + { + "epoch": 4.912523020257827, + "grad_norm": 0.27637016773223877, + "learning_rate": 5.384837410248632e-05, + "loss": 1.8368, + "step": 16005 + }, + { + "epoch": 4.912829957028852, + "grad_norm": 0.22366084158420563, + "learning_rate": 5.3843418268269926e-05, + "loss": 1.7351, + "step": 16006 + }, + { + "epoch": 4.913136893799877, + "grad_norm": 0.2742357552051544, + "learning_rate": 5.383846239607062e-05, + "loss": 1.7599, + "step": 16007 + }, + { + "epoch": 4.913443830570903, + "grad_norm": 0.2288598269224167, + "learning_rate": 5.383350648593738e-05, + "loss": 1.7056, + "step": 16008 + }, + { + "epoch": 4.913750767341927, + "grad_norm": 0.23319020867347717, + "learning_rate": 5.382855053791919e-05, + "loss": 1.7356, + "step": 16009 + }, + { + "epoch": 4.9140577041129525, + "grad_norm": 0.2232198268175125, + "learning_rate": 5.382359455206499e-05, + "loss": 1.7375, + "step": 16010 + }, + { + "epoch": 4.914364640883978, + "grad_norm": 0.24420048296451569, + "learning_rate": 5.381863852842381e-05, + "loss": 1.8287, + "step": 16011 + }, + { + "epoch": 4.914671577655003, + "grad_norm": 0.22653080523014069, + "learning_rate": 5.381368246704461e-05, + "loss": 1.7137, + "step": 16012 + }, + { + "epoch": 4.9149785144260285, + "grad_norm": 0.20439405739307404, + "learning_rate": 5.380872636797637e-05, + "loss": 1.7688, + "step": 16013 + }, + { + "epoch": 4.915285451197054, + "grad_norm": 0.2602155804634094, + "learning_rate": 5.380377023126806e-05, + "loss": 1.7875, + "step": 16014 + }, + { + "epoch": 4.915592387968078, + "grad_norm": 0.2757892608642578, + "learning_rate": 5.3798814056968647e-05, + "loss": 1.7446, + "step": 16015 + }, + { + "epoch": 4.915899324739104, + "grad_norm": 0.25938209891319275, + "learning_rate": 5.379385784512714e-05, + "loss": 1.6997, + "step": 16016 + }, + { + "epoch": 4.916206261510129, + "grad_norm": 0.2056962549686432, + 
"learning_rate": 5.37889015957925e-05, + "loss": 1.6961, + "step": 16017 + }, + { + "epoch": 4.916513198281154, + "grad_norm": 0.24388402700424194, + "learning_rate": 5.3783945309013714e-05, + "loss": 1.712, + "step": 16018 + }, + { + "epoch": 4.91682013505218, + "grad_norm": 0.2381993532180786, + "learning_rate": 5.3778988984839775e-05, + "loss": 1.7444, + "step": 16019 + }, + { + "epoch": 4.917127071823204, + "grad_norm": 0.20201562345027924, + "learning_rate": 5.377403262331964e-05, + "loss": 1.7254, + "step": 16020 + }, + { + "epoch": 4.917434008594229, + "grad_norm": 0.24019409716129303, + "learning_rate": 5.376907622450229e-05, + "loss": 1.684, + "step": 16021 + }, + { + "epoch": 4.917740945365255, + "grad_norm": 0.2441694289445877, + "learning_rate": 5.376411978843674e-05, + "loss": 1.7334, + "step": 16022 + }, + { + "epoch": 4.91804788213628, + "grad_norm": 0.23866300284862518, + "learning_rate": 5.3759163315171945e-05, + "loss": 1.7258, + "step": 16023 + }, + { + "epoch": 4.918354818907305, + "grad_norm": 0.28068670630455017, + "learning_rate": 5.375420680475689e-05, + "loss": 1.8049, + "step": 16024 + }, + { + "epoch": 4.918661755678331, + "grad_norm": 0.2956274151802063, + "learning_rate": 5.3749250257240566e-05, + "loss": 1.8544, + "step": 16025 + }, + { + "epoch": 4.918968692449355, + "grad_norm": 0.1971627175807953, + "learning_rate": 5.374429367267196e-05, + "loss": 1.7314, + "step": 16026 + }, + { + "epoch": 4.9192756292203805, + "grad_norm": 0.28565749526023865, + "learning_rate": 5.373933705110004e-05, + "loss": 1.7587, + "step": 16027 + }, + { + "epoch": 4.919582565991406, + "grad_norm": 0.3087369501590729, + "learning_rate": 5.37343803925738e-05, + "loss": 1.7708, + "step": 16028 + }, + { + "epoch": 4.919889502762431, + "grad_norm": 0.22460010647773743, + "learning_rate": 5.372942369714223e-05, + "loss": 1.7401, + "step": 16029 + }, + { + "epoch": 4.920196439533456, + "grad_norm": 0.29492735862731934, + "learning_rate": 5.3724466964854326e-05, + 
"loss": 1.7033, + "step": 16030 + }, + { + "epoch": 4.920503376304481, + "grad_norm": 0.24452674388885498, + "learning_rate": 5.371951019575904e-05, + "loss": 1.7688, + "step": 16031 + }, + { + "epoch": 4.920810313075506, + "grad_norm": 0.24686957895755768, + "learning_rate": 5.3714553389905366e-05, + "loss": 1.7463, + "step": 16032 + }, + { + "epoch": 4.921117249846532, + "grad_norm": 0.23661597073078156, + "learning_rate": 5.37095965473423e-05, + "loss": 1.7256, + "step": 16033 + }, + { + "epoch": 4.921424186617557, + "grad_norm": 0.22861288487911224, + "learning_rate": 5.370463966811884e-05, + "loss": 1.7722, + "step": 16034 + }, + { + "epoch": 4.921731123388582, + "grad_norm": 0.2453136146068573, + "learning_rate": 5.3699682752283944e-05, + "loss": 1.7343, + "step": 16035 + }, + { + "epoch": 4.922038060159607, + "grad_norm": 0.25267064571380615, + "learning_rate": 5.369472579988663e-05, + "loss": 1.7817, + "step": 16036 + }, + { + "epoch": 4.922344996930632, + "grad_norm": 0.25301575660705566, + "learning_rate": 5.368976881097586e-05, + "loss": 1.8146, + "step": 16037 + }, + { + "epoch": 4.922651933701657, + "grad_norm": 0.23579831421375275, + "learning_rate": 5.368481178560062e-05, + "loss": 1.8089, + "step": 16038 + }, + { + "epoch": 4.922958870472683, + "grad_norm": 0.2181949019432068, + "learning_rate": 5.367985472380993e-05, + "loss": 1.7689, + "step": 16039 + }, + { + "epoch": 4.923265807243708, + "grad_norm": 0.24622827768325806, + "learning_rate": 5.367489762565276e-05, + "loss": 1.791, + "step": 16040 + }, + { + "epoch": 4.9235727440147325, + "grad_norm": 0.2545134723186493, + "learning_rate": 5.3669940491178084e-05, + "loss": 1.738, + "step": 16041 + }, + { + "epoch": 4.923879680785758, + "grad_norm": 0.258139431476593, + "learning_rate": 5.366498332043491e-05, + "loss": 1.8303, + "step": 16042 + }, + { + "epoch": 4.924186617556783, + "grad_norm": 0.23804105818271637, + "learning_rate": 5.366002611347223e-05, + "loss": 1.751, + "step": 16043 + }, + { 
+ "epoch": 4.9244935543278086, + "grad_norm": 0.2354477345943451, + "learning_rate": 5.365506887033901e-05, + "loss": 1.7911, + "step": 16044 + }, + { + "epoch": 4.924800491098834, + "grad_norm": 0.22212550044059753, + "learning_rate": 5.3650111591084276e-05, + "loss": 1.7439, + "step": 16045 + }, + { + "epoch": 4.925107427869859, + "grad_norm": 0.23621168732643127, + "learning_rate": 5.3645154275756984e-05, + "loss": 1.7339, + "step": 16046 + }, + { + "epoch": 4.925414364640884, + "grad_norm": 0.2163209468126297, + "learning_rate": 5.364019692440616e-05, + "loss": 1.7247, + "step": 16047 + }, + { + "epoch": 4.925721301411909, + "grad_norm": 0.21352291107177734, + "learning_rate": 5.3635239537080774e-05, + "loss": 1.7431, + "step": 16048 + }, + { + "epoch": 4.926028238182934, + "grad_norm": 0.3170754909515381, + "learning_rate": 5.36302821138298e-05, + "loss": 1.8075, + "step": 16049 + }, + { + "epoch": 4.92633517495396, + "grad_norm": 0.27073633670806885, + "learning_rate": 5.362532465470226e-05, + "loss": 1.7209, + "step": 16050 + }, + { + "epoch": 4.926642111724985, + "grad_norm": 0.2677803039550781, + "learning_rate": 5.362036715974714e-05, + "loss": 1.7454, + "step": 16051 + }, + { + "epoch": 4.9269490484960095, + "grad_norm": 0.3555704355239868, + "learning_rate": 5.3615409629013436e-05, + "loss": 1.7737, + "step": 16052 + }, + { + "epoch": 4.927255985267035, + "grad_norm": 0.2819947302341461, + "learning_rate": 5.3610452062550124e-05, + "loss": 1.7588, + "step": 16053 + }, + { + "epoch": 4.92756292203806, + "grad_norm": 0.26638996601104736, + "learning_rate": 5.360549446040621e-05, + "loss": 1.8078, + "step": 16054 + }, + { + "epoch": 4.9278698588090855, + "grad_norm": 0.37828773260116577, + "learning_rate": 5.360053682263069e-05, + "loss": 1.7527, + "step": 16055 + }, + { + "epoch": 4.928176795580111, + "grad_norm": 0.35836395621299744, + "learning_rate": 5.359557914927254e-05, + "loss": 1.7199, + "step": 16056 + }, + { + "epoch": 4.928483732351136, + 
"grad_norm": 0.2720802128314972, + "learning_rate": 5.359062144038078e-05, + "loss": 1.7598, + "step": 16057 + }, + { + "epoch": 4.928790669122161, + "grad_norm": 0.36662939190864563, + "learning_rate": 5.358566369600441e-05, + "loss": 1.7199, + "step": 16058 + }, + { + "epoch": 4.929097605893186, + "grad_norm": 0.42243221402168274, + "learning_rate": 5.3580705916192395e-05, + "loss": 1.7584, + "step": 16059 + }, + { + "epoch": 4.929404542664211, + "grad_norm": 0.21667765080928802, + "learning_rate": 5.357574810099375e-05, + "loss": 1.7608, + "step": 16060 + }, + { + "epoch": 4.929711479435237, + "grad_norm": 0.48101645708084106, + "learning_rate": 5.3570790250457456e-05, + "loss": 1.8157, + "step": 16061 + }, + { + "epoch": 4.930018416206261, + "grad_norm": 0.5289245843887329, + "learning_rate": 5.356583236463253e-05, + "loss": 1.7173, + "step": 16062 + }, + { + "epoch": 4.930325352977286, + "grad_norm": 0.21454930305480957, + "learning_rate": 5.356087444356795e-05, + "loss": 1.7399, + "step": 16063 + }, + { + "epoch": 4.930632289748312, + "grad_norm": 0.5648324489593506, + "learning_rate": 5.355591648731274e-05, + "loss": 1.7814, + "step": 16064 + }, + { + "epoch": 4.930939226519337, + "grad_norm": 0.5669483542442322, + "learning_rate": 5.355095849591587e-05, + "loss": 1.7769, + "step": 16065 + }, + { + "epoch": 4.931246163290362, + "grad_norm": 0.33108505606651306, + "learning_rate": 5.354600046942635e-05, + "loss": 1.7704, + "step": 16066 + }, + { + "epoch": 4.931553100061388, + "grad_norm": 0.31149306893348694, + "learning_rate": 5.3541042407893164e-05, + "loss": 1.7631, + "step": 16067 + }, + { + "epoch": 4.931860036832412, + "grad_norm": 0.30377596616744995, + "learning_rate": 5.353608431136532e-05, + "loss": 1.7888, + "step": 16068 + }, + { + "epoch": 4.9321669736034375, + "grad_norm": 0.25041452050209045, + "learning_rate": 5.3531126179891825e-05, + "loss": 1.7507, + "step": 16069 + }, + { + "epoch": 4.932473910374463, + "grad_norm": 0.33900725841522217, + 
"learning_rate": 5.352616801352167e-05, + "loss": 1.7365, + "step": 16070 + }, + { + "epoch": 4.932780847145488, + "grad_norm": 0.23939846456050873, + "learning_rate": 5.352120981230386e-05, + "loss": 1.7934, + "step": 16071 + }, + { + "epoch": 4.9330877839165135, + "grad_norm": 0.2419881969690323, + "learning_rate": 5.351625157628739e-05, + "loss": 1.7555, + "step": 16072 + }, + { + "epoch": 4.933394720687538, + "grad_norm": 0.3517596423625946, + "learning_rate": 5.351129330552125e-05, + "loss": 1.7102, + "step": 16073 + }, + { + "epoch": 4.933701657458563, + "grad_norm": 0.2660250663757324, + "learning_rate": 5.350633500005446e-05, + "loss": 1.7692, + "step": 16074 + }, + { + "epoch": 4.934008594229589, + "grad_norm": 0.20726454257965088, + "learning_rate": 5.350137665993601e-05, + "loss": 1.718, + "step": 16075 + }, + { + "epoch": 4.934315531000614, + "grad_norm": 0.28218522667884827, + "learning_rate": 5.3496418285214914e-05, + "loss": 1.8402, + "step": 16076 + }, + { + "epoch": 4.934622467771639, + "grad_norm": 0.2142515480518341, + "learning_rate": 5.349145987594015e-05, + "loss": 1.7571, + "step": 16077 + }, + { + "epoch": 4.934929404542665, + "grad_norm": 0.2777026891708374, + "learning_rate": 5.348650143216074e-05, + "loss": 1.7617, + "step": 16078 + }, + { + "epoch": 4.935236341313689, + "grad_norm": 0.24057620763778687, + "learning_rate": 5.348154295392567e-05, + "loss": 1.7149, + "step": 16079 + }, + { + "epoch": 4.935543278084714, + "grad_norm": 0.22220350801944733, + "learning_rate": 5.3476584441283964e-05, + "loss": 1.7402, + "step": 16080 + }, + { + "epoch": 4.93585021485574, + "grad_norm": 0.2451290488243103, + "learning_rate": 5.347162589428462e-05, + "loss": 1.7004, + "step": 16081 + }, + { + "epoch": 4.936157151626765, + "grad_norm": 0.25621771812438965, + "learning_rate": 5.3466667312976625e-05, + "loss": 1.7765, + "step": 16082 + }, + { + "epoch": 4.93646408839779, + "grad_norm": 0.217393159866333, + "learning_rate": 5.346170869740899e-05, + 
"loss": 1.7695, + "step": 16083 + }, + { + "epoch": 4.936771025168815, + "grad_norm": 0.21248537302017212, + "learning_rate": 5.345675004763071e-05, + "loss": 1.7277, + "step": 16084 + }, + { + "epoch": 4.93707796193984, + "grad_norm": 0.19431474804878235, + "learning_rate": 5.3451791363690805e-05, + "loss": 1.7352, + "step": 16085 + }, + { + "epoch": 4.9373848987108655, + "grad_norm": 0.20233909785747528, + "learning_rate": 5.344683264563829e-05, + "loss": 1.71, + "step": 16086 + }, + { + "epoch": 4.937691835481891, + "grad_norm": 0.2199622094631195, + "learning_rate": 5.344187389352214e-05, + "loss": 1.7443, + "step": 16087 + }, + { + "epoch": 4.937998772252916, + "grad_norm": 0.23495158553123474, + "learning_rate": 5.343691510739138e-05, + "loss": 1.7758, + "step": 16088 + }, + { + "epoch": 4.9383057090239415, + "grad_norm": 0.228348970413208, + "learning_rate": 5.3431956287295015e-05, + "loss": 1.7645, + "step": 16089 + }, + { + "epoch": 4.938612645794966, + "grad_norm": 0.2337537258863449, + "learning_rate": 5.342699743328203e-05, + "loss": 1.7353, + "step": 16090 + }, + { + "epoch": 4.938919582565991, + "grad_norm": 0.1899309754371643, + "learning_rate": 5.3422038545401454e-05, + "loss": 1.6907, + "step": 16091 + }, + { + "epoch": 4.939226519337017, + "grad_norm": 0.2479192316532135, + "learning_rate": 5.341707962370229e-05, + "loss": 1.7961, + "step": 16092 + }, + { + "epoch": 4.939533456108042, + "grad_norm": 0.2444314956665039, + "learning_rate": 5.341212066823355e-05, + "loss": 1.7768, + "step": 16093 + }, + { + "epoch": 4.939840392879067, + "grad_norm": 0.2123393714427948, + "learning_rate": 5.340716167904423e-05, + "loss": 1.7617, + "step": 16094 + }, + { + "epoch": 4.940147329650092, + "grad_norm": 0.20779116451740265, + "learning_rate": 5.340220265618334e-05, + "loss": 1.6951, + "step": 16095 + }, + { + "epoch": 4.940454266421117, + "grad_norm": 0.22189265489578247, + "learning_rate": 5.3397243599699884e-05, + "loss": 1.8368, + "step": 16096 + }, + { 
+ "epoch": 4.940761203192142, + "grad_norm": 0.22316497564315796, + "learning_rate": 5.3392284509642875e-05, + "loss": 1.7096, + "step": 16097 + }, + { + "epoch": 4.941068139963168, + "grad_norm": 0.20406664907932281, + "learning_rate": 5.3387325386061346e-05, + "loss": 1.7269, + "step": 16098 + }, + { + "epoch": 4.941375076734193, + "grad_norm": 0.263007789850235, + "learning_rate": 5.338236622900427e-05, + "loss": 1.7663, + "step": 16099 + }, + { + "epoch": 4.941682013505218, + "grad_norm": 0.24388311803340912, + "learning_rate": 5.3377407038520654e-05, + "loss": 1.7113, + "step": 16100 + }, + { + "epoch": 4.941988950276243, + "grad_norm": 0.21918313205242157, + "learning_rate": 5.3372447814659524e-05, + "loss": 1.775, + "step": 16101 + }, + { + "epoch": 4.942295887047268, + "grad_norm": 0.30842962861061096, + "learning_rate": 5.336748855746989e-05, + "loss": 1.8229, + "step": 16102 + }, + { + "epoch": 4.9426028238182935, + "grad_norm": 0.2875657379627228, + "learning_rate": 5.336252926700077e-05, + "loss": 1.7377, + "step": 16103 + }, + { + "epoch": 4.942909760589319, + "grad_norm": 0.23411425948143005, + "learning_rate": 5.3357569943301156e-05, + "loss": 1.754, + "step": 16104 + }, + { + "epoch": 4.943216697360343, + "grad_norm": 0.29758864641189575, + "learning_rate": 5.335261058642007e-05, + "loss": 1.7471, + "step": 16105 + }, + { + "epoch": 4.943523634131369, + "grad_norm": 0.31761085987091064, + "learning_rate": 5.3347651196406534e-05, + "loss": 1.7658, + "step": 16106 + }, + { + "epoch": 4.943830570902394, + "grad_norm": 0.2487023025751114, + "learning_rate": 5.334269177330952e-05, + "loss": 1.786, + "step": 16107 + }, + { + "epoch": 4.944137507673419, + "grad_norm": 0.23954913020133972, + "learning_rate": 5.333773231717808e-05, + "loss": 1.8486, + "step": 16108 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.24893096089363098, + "learning_rate": 5.3332772828061214e-05, + "loss": 1.7927, + "step": 16109 + }, + { + "epoch": 4.94475138121547, + 
"grad_norm": 0.28653839230537415, + "learning_rate": 5.332781330600795e-05, + "loss": 1.8331, + "step": 16110 + }, + { + "epoch": 4.945058317986494, + "grad_norm": 0.2597404718399048, + "learning_rate": 5.332285375106726e-05, + "loss": 1.7128, + "step": 16111 + }, + { + "epoch": 4.94536525475752, + "grad_norm": 0.23813198506832123, + "learning_rate": 5.3317894163288196e-05, + "loss": 1.7483, + "step": 16112 + }, + { + "epoch": 4.945672191528545, + "grad_norm": 0.2545793652534485, + "learning_rate": 5.331293454271974e-05, + "loss": 1.7987, + "step": 16113 + }, + { + "epoch": 4.94597912829957, + "grad_norm": 0.2453712821006775, + "learning_rate": 5.330797488941095e-05, + "loss": 1.7376, + "step": 16114 + }, + { + "epoch": 4.946286065070596, + "grad_norm": 0.20583751797676086, + "learning_rate": 5.33030152034108e-05, + "loss": 1.7038, + "step": 16115 + }, + { + "epoch": 4.94659300184162, + "grad_norm": 0.22557811439037323, + "learning_rate": 5.3298055484768313e-05, + "loss": 1.6999, + "step": 16116 + }, + { + "epoch": 4.9468999386126455, + "grad_norm": 0.23163801431655884, + "learning_rate": 5.329309573353252e-05, + "loss": 1.7575, + "step": 16117 + }, + { + "epoch": 4.947206875383671, + "grad_norm": 0.3560176491737366, + "learning_rate": 5.3288135949752394e-05, + "loss": 1.8494, + "step": 16118 + }, + { + "epoch": 4.947513812154696, + "grad_norm": 0.306379109621048, + "learning_rate": 5.328317613347701e-05, + "loss": 1.7229, + "step": 16119 + }, + { + "epoch": 4.9478207489257215, + "grad_norm": 0.24428823590278625, + "learning_rate": 5.3278216284755344e-05, + "loss": 1.7939, + "step": 16120 + }, + { + "epoch": 4.948127685696747, + "grad_norm": 0.22251521050930023, + "learning_rate": 5.327325640363643e-05, + "loss": 1.7624, + "step": 16121 + }, + { + "epoch": 4.948434622467771, + "grad_norm": 0.23310889303684235, + "learning_rate": 5.326829649016928e-05, + "loss": 1.7727, + "step": 16122 + }, + { + "epoch": 4.948741559238797, + "grad_norm": 0.22457881271839142, + 
"learning_rate": 5.326333654440291e-05, + "loss": 1.7602, + "step": 16123 + }, + { + "epoch": 4.949048496009822, + "grad_norm": 0.24032343924045563, + "learning_rate": 5.325837656638631e-05, + "loss": 1.7591, + "step": 16124 + }, + { + "epoch": 4.949355432780847, + "grad_norm": 0.25082892179489136, + "learning_rate": 5.3253416556168546e-05, + "loss": 1.7745, + "step": 16125 + }, + { + "epoch": 4.949662369551873, + "grad_norm": 0.22859038412570953, + "learning_rate": 5.3248456513798615e-05, + "loss": 1.7475, + "step": 16126 + }, + { + "epoch": 4.949969306322897, + "grad_norm": 0.27282553911209106, + "learning_rate": 5.3243496439325525e-05, + "loss": 1.7438, + "step": 16127 + }, + { + "epoch": 4.9502762430939224, + "grad_norm": 0.23622353374958038, + "learning_rate": 5.3238536332798303e-05, + "loss": 1.7625, + "step": 16128 + }, + { + "epoch": 4.950583179864948, + "grad_norm": 0.28060024976730347, + "learning_rate": 5.3233576194265975e-05, + "loss": 1.8028, + "step": 16129 + }, + { + "epoch": 4.950890116635973, + "grad_norm": 0.33281829953193665, + "learning_rate": 5.322861602377755e-05, + "loss": 1.7163, + "step": 16130 + }, + { + "epoch": 4.9511970534069984, + "grad_norm": 0.26457497477531433, + "learning_rate": 5.322365582138203e-05, + "loss": 1.7347, + "step": 16131 + }, + { + "epoch": 4.951503990178024, + "grad_norm": 0.21651674807071686, + "learning_rate": 5.3218695587128476e-05, + "loss": 1.7123, + "step": 16132 + }, + { + "epoch": 4.951810926949048, + "grad_norm": 0.2299882024526596, + "learning_rate": 5.3213735321065885e-05, + "loss": 1.775, + "step": 16133 + }, + { + "epoch": 4.952117863720074, + "grad_norm": 0.2252396047115326, + "learning_rate": 5.3208775023243265e-05, + "loss": 1.7598, + "step": 16134 + }, + { + "epoch": 4.952424800491099, + "grad_norm": 0.2263660430908203, + "learning_rate": 5.3203814693709655e-05, + "loss": 1.7519, + "step": 16135 + }, + { + "epoch": 4.952731737262124, + "grad_norm": 0.2425432950258255, + "learning_rate": 
5.3198854332514056e-05, + "loss": 1.7769, + "step": 16136 + }, + { + "epoch": 4.953038674033149, + "grad_norm": 0.22624996304512024, + "learning_rate": 5.319389393970553e-05, + "loss": 1.7686, + "step": 16137 + }, + { + "epoch": 4.953345610804174, + "grad_norm": 0.2240568846464157, + "learning_rate": 5.318893351533306e-05, + "loss": 1.7795, + "step": 16138 + }, + { + "epoch": 4.953652547575199, + "grad_norm": 0.21708132326602936, + "learning_rate": 5.318397305944568e-05, + "loss": 1.7348, + "step": 16139 + }, + { + "epoch": 4.953959484346225, + "grad_norm": 0.2263328731060028, + "learning_rate": 5.3179012572092415e-05, + "loss": 1.7645, + "step": 16140 + }, + { + "epoch": 4.95426642111725, + "grad_norm": 0.2541986107826233, + "learning_rate": 5.3174052053322274e-05, + "loss": 1.723, + "step": 16141 + }, + { + "epoch": 4.954573357888275, + "grad_norm": 0.25829461216926575, + "learning_rate": 5.316909150318429e-05, + "loss": 1.7469, + "step": 16142 + }, + { + "epoch": 4.9548802946593, + "grad_norm": 0.21251125633716583, + "learning_rate": 5.3164130921727494e-05, + "loss": 1.7699, + "step": 16143 + }, + { + "epoch": 4.955187231430325, + "grad_norm": 0.29195618629455566, + "learning_rate": 5.315917030900091e-05, + "loss": 1.7373, + "step": 16144 + }, + { + "epoch": 4.9554941682013505, + "grad_norm": 0.29457888007164, + "learning_rate": 5.315420966505355e-05, + "loss": 1.7202, + "step": 16145 + }, + { + "epoch": 4.955801104972376, + "grad_norm": 0.19679461419582367, + "learning_rate": 5.314924898993443e-05, + "loss": 1.75, + "step": 16146 + }, + { + "epoch": 4.956108041743401, + "grad_norm": 0.287955105304718, + "learning_rate": 5.314428828369259e-05, + "loss": 1.7385, + "step": 16147 + }, + { + "epoch": 4.956414978514426, + "grad_norm": 0.3081825375556946, + "learning_rate": 5.313932754637706e-05, + "loss": 1.7558, + "step": 16148 + }, + { + "epoch": 4.956721915285451, + "grad_norm": 0.25226521492004395, + "learning_rate": 5.3134366778036846e-05, + "loss": 1.8407, + 
"step": 16149 + }, + { + "epoch": 4.957028852056476, + "grad_norm": 0.43601852655410767, + "learning_rate": 5.3129405978720984e-05, + "loss": 1.7762, + "step": 16150 + }, + { + "epoch": 4.957335788827502, + "grad_norm": 0.3630274832248688, + "learning_rate": 5.31244451484785e-05, + "loss": 1.7802, + "step": 16151 + }, + { + "epoch": 4.957642725598527, + "grad_norm": 0.21337948739528656, + "learning_rate": 5.311948428735841e-05, + "loss": 1.7107, + "step": 16152 + }, + { + "epoch": 4.957949662369552, + "grad_norm": 0.38581085205078125, + "learning_rate": 5.311452339540974e-05, + "loss": 1.7583, + "step": 16153 + }, + { + "epoch": 4.958256599140577, + "grad_norm": 0.28447309136390686, + "learning_rate": 5.310956247268154e-05, + "loss": 1.6992, + "step": 16154 + }, + { + "epoch": 4.958563535911602, + "grad_norm": 0.24510730803012848, + "learning_rate": 5.310460151922283e-05, + "loss": 1.7059, + "step": 16155 + }, + { + "epoch": 4.958870472682627, + "grad_norm": 0.41670146584510803, + "learning_rate": 5.309964053508262e-05, + "loss": 1.7191, + "step": 16156 + }, + { + "epoch": 4.959177409453653, + "grad_norm": 0.3123849034309387, + "learning_rate": 5.309467952030993e-05, + "loss": 1.7161, + "step": 16157 + }, + { + "epoch": 4.959484346224678, + "grad_norm": 0.2275281697511673, + "learning_rate": 5.308971847495382e-05, + "loss": 1.722, + "step": 16158 + }, + { + "epoch": 4.9597912829957025, + "grad_norm": 0.40216436982154846, + "learning_rate": 5.308475739906329e-05, + "loss": 1.7477, + "step": 16159 + }, + { + "epoch": 4.960098219766728, + "grad_norm": 0.259981244802475, + "learning_rate": 5.307979629268739e-05, + "loss": 1.7384, + "step": 16160 + }, + { + "epoch": 4.960405156537753, + "grad_norm": 0.22969573736190796, + "learning_rate": 5.3074835155875134e-05, + "loss": 1.7328, + "step": 16161 + }, + { + "epoch": 4.9607120933087785, + "grad_norm": 0.2773746848106384, + "learning_rate": 5.3069873988675556e-05, + "loss": 1.7333, + "step": 16162 + }, + { + "epoch": 
4.961019030079804, + "grad_norm": 0.2764189541339874, + "learning_rate": 5.306491279113768e-05, + "loss": 1.7956, + "step": 16163 + }, + { + "epoch": 4.961325966850829, + "grad_norm": 0.3640958070755005, + "learning_rate": 5.305995156331054e-05, + "loss": 1.7464, + "step": 16164 + }, + { + "epoch": 4.961632903621854, + "grad_norm": 0.3573450446128845, + "learning_rate": 5.305499030524317e-05, + "loss": 1.75, + "step": 16165 + }, + { + "epoch": 4.961939840392879, + "grad_norm": 0.24313980340957642, + "learning_rate": 5.305002901698459e-05, + "loss": 1.7505, + "step": 16166 + }, + { + "epoch": 4.962246777163904, + "grad_norm": 0.3417615592479706, + "learning_rate": 5.304506769858384e-05, + "loss": 1.7387, + "step": 16167 + }, + { + "epoch": 4.96255371393493, + "grad_norm": 0.23209623992443085, + "learning_rate": 5.304010635008995e-05, + "loss": 1.7111, + "step": 16168 + }, + { + "epoch": 4.962860650705955, + "grad_norm": 0.2994776666164398, + "learning_rate": 5.3035144971551944e-05, + "loss": 1.75, + "step": 16169 + }, + { + "epoch": 4.963167587476979, + "grad_norm": 0.3147084712982178, + "learning_rate": 5.303018356301884e-05, + "loss": 1.7598, + "step": 16170 + }, + { + "epoch": 4.963474524248005, + "grad_norm": 0.20136526226997375, + "learning_rate": 5.30252221245397e-05, + "loss": 1.7217, + "step": 16171 + }, + { + "epoch": 4.96378146101903, + "grad_norm": 0.3308684229850769, + "learning_rate": 5.302026065616355e-05, + "loss": 1.7554, + "step": 16172 + }, + { + "epoch": 4.964088397790055, + "grad_norm": 0.22890877723693848, + "learning_rate": 5.30152991579394e-05, + "loss": 1.7598, + "step": 16173 + }, + { + "epoch": 4.964395334561081, + "grad_norm": 0.3036035895347595, + "learning_rate": 5.301033762991631e-05, + "loss": 1.758, + "step": 16174 + }, + { + "epoch": 4.964702271332106, + "grad_norm": 0.2983579933643341, + "learning_rate": 5.300537607214329e-05, + "loss": 1.8132, + "step": 16175 + }, + { + "epoch": 4.9650092081031305, + "grad_norm": 
0.21401815116405487, + "learning_rate": 5.300041448466937e-05, + "loss": 1.7179, + "step": 16176 + }, + { + "epoch": 4.965316144874156, + "grad_norm": 0.2939651608467102, + "learning_rate": 5.2995452867543606e-05, + "loss": 1.7928, + "step": 16177 + }, + { + "epoch": 4.965623081645181, + "grad_norm": 0.24803484976291656, + "learning_rate": 5.2990491220815034e-05, + "loss": 1.7366, + "step": 16178 + }, + { + "epoch": 4.9659300184162065, + "grad_norm": 0.1999569535255432, + "learning_rate": 5.2985529544532656e-05, + "loss": 1.6691, + "step": 16179 + }, + { + "epoch": 4.966236955187231, + "grad_norm": 0.22315269708633423, + "learning_rate": 5.298056783874553e-05, + "loss": 1.7693, + "step": 16180 + }, + { + "epoch": 4.966543891958256, + "grad_norm": 0.22688794136047363, + "learning_rate": 5.2975606103502694e-05, + "loss": 1.8401, + "step": 16181 + }, + { + "epoch": 4.966850828729282, + "grad_norm": 0.2592024505138397, + "learning_rate": 5.297064433885317e-05, + "loss": 1.8054, + "step": 16182 + }, + { + "epoch": 4.967157765500307, + "grad_norm": 0.2508920133113861, + "learning_rate": 5.2965682544846e-05, + "loss": 1.766, + "step": 16183 + }, + { + "epoch": 4.967464702271332, + "grad_norm": 0.22318799793720245, + "learning_rate": 5.296072072153022e-05, + "loss": 1.751, + "step": 16184 + }, + { + "epoch": 4.967771639042358, + "grad_norm": 0.2348448485136032, + "learning_rate": 5.2955758868954855e-05, + "loss": 1.7844, + "step": 16185 + }, + { + "epoch": 4.968078575813382, + "grad_norm": 0.23294343054294586, + "learning_rate": 5.295079698716895e-05, + "loss": 1.7685, + "step": 16186 + }, + { + "epoch": 4.968385512584407, + "grad_norm": 0.20854508876800537, + "learning_rate": 5.2945835076221526e-05, + "loss": 1.6914, + "step": 16187 + }, + { + "epoch": 4.968692449355433, + "grad_norm": 0.21952031552791595, + "learning_rate": 5.294087313616165e-05, + "loss": 1.7121, + "step": 16188 + }, + { + "epoch": 4.968999386126458, + "grad_norm": 0.24097788333892822, + 
"learning_rate": 5.2935911167038346e-05, + "loss": 1.7712, + "step": 16189 + }, + { + "epoch": 4.969306322897483, + "grad_norm": 0.24433603882789612, + "learning_rate": 5.293094916890063e-05, + "loss": 1.7608, + "step": 16190 + }, + { + "epoch": 4.969613259668508, + "grad_norm": 0.22209061682224274, + "learning_rate": 5.292598714179757e-05, + "loss": 1.7563, + "step": 16191 + }, + { + "epoch": 4.969920196439533, + "grad_norm": 0.24291595816612244, + "learning_rate": 5.29210250857782e-05, + "loss": 1.7765, + "step": 16192 + }, + { + "epoch": 4.9702271332105585, + "grad_norm": 0.3143673837184906, + "learning_rate": 5.291606300089151e-05, + "loss": 1.7945, + "step": 16193 + }, + { + "epoch": 4.970534069981584, + "grad_norm": 0.22693613171577454, + "learning_rate": 5.291110088718661e-05, + "loss": 1.7411, + "step": 16194 + }, + { + "epoch": 4.970841006752609, + "grad_norm": 0.2271365374326706, + "learning_rate": 5.2906138744712494e-05, + "loss": 1.7754, + "step": 16195 + }, + { + "epoch": 4.9711479435236345, + "grad_norm": 0.2428499162197113, + "learning_rate": 5.290117657351822e-05, + "loss": 1.8007, + "step": 16196 + }, + { + "epoch": 4.971454880294659, + "grad_norm": 0.21862711012363434, + "learning_rate": 5.289621437365281e-05, + "loss": 1.7484, + "step": 16197 + }, + { + "epoch": 4.971761817065684, + "grad_norm": 0.26744964718818665, + "learning_rate": 5.2891252145165315e-05, + "loss": 1.7759, + "step": 16198 + }, + { + "epoch": 4.97206875383671, + "grad_norm": 0.2608526647090912, + "learning_rate": 5.288628988810477e-05, + "loss": 1.8527, + "step": 16199 + }, + { + "epoch": 4.972375690607735, + "grad_norm": 0.2245805710554123, + "learning_rate": 5.2881327602520216e-05, + "loss": 1.7773, + "step": 16200 + }, + { + "epoch": 4.97268262737876, + "grad_norm": 0.22023041546344757, + "learning_rate": 5.2876365288460694e-05, + "loss": 1.7101, + "step": 16201 + }, + { + "epoch": 4.972989564149785, + "grad_norm": 0.22034525871276855, + "learning_rate": 
5.287140294597525e-05, + "loss": 1.7672, + "step": 16202 + }, + { + "epoch": 4.97329650092081, + "grad_norm": 0.23101158440113068, + "learning_rate": 5.286644057511292e-05, + "loss": 1.741, + "step": 16203 + }, + { + "epoch": 4.973603437691835, + "grad_norm": 0.23050430417060852, + "learning_rate": 5.286147817592273e-05, + "loss": 1.7727, + "step": 16204 + }, + { + "epoch": 4.973910374462861, + "grad_norm": 0.21803520619869232, + "learning_rate": 5.285651574845374e-05, + "loss": 1.7353, + "step": 16205 + }, + { + "epoch": 4.974217311233886, + "grad_norm": 0.22252169251441956, + "learning_rate": 5.2851553292754995e-05, + "loss": 1.7658, + "step": 16206 + }, + { + "epoch": 4.974524248004911, + "grad_norm": 0.22458864748477936, + "learning_rate": 5.284659080887552e-05, + "loss": 1.7157, + "step": 16207 + }, + { + "epoch": 4.974831184775936, + "grad_norm": 0.20769210159778595, + "learning_rate": 5.2841628296864376e-05, + "loss": 1.7731, + "step": 16208 + }, + { + "epoch": 4.975138121546961, + "grad_norm": 0.1952340304851532, + "learning_rate": 5.283666575677059e-05, + "loss": 1.6907, + "step": 16209 + }, + { + "epoch": 4.975445058317987, + "grad_norm": 0.21943804621696472, + "learning_rate": 5.28317031886432e-05, + "loss": 1.8007, + "step": 16210 + }, + { + "epoch": 4.975751995089012, + "grad_norm": 0.21987493336200714, + "learning_rate": 5.2826740592531276e-05, + "loss": 1.7205, + "step": 16211 + }, + { + "epoch": 4.976058931860036, + "grad_norm": 0.2076522558927536, + "learning_rate": 5.2821777968483845e-05, + "loss": 1.7063, + "step": 16212 + }, + { + "epoch": 4.976365868631062, + "grad_norm": 0.19126583635807037, + "learning_rate": 5.281681531654994e-05, + "loss": 1.7118, + "step": 16213 + }, + { + "epoch": 4.976672805402087, + "grad_norm": 0.22308050096035004, + "learning_rate": 5.2811852636778625e-05, + "loss": 1.7565, + "step": 16214 + }, + { + "epoch": 4.976979742173112, + "grad_norm": 0.23187528550624847, + "learning_rate": 5.280688992921893e-05, + "loss": 
1.8261, + "step": 16215 + }, + { + "epoch": 4.977286678944138, + "grad_norm": 0.21373791992664337, + "learning_rate": 5.28019271939199e-05, + "loss": 1.6974, + "step": 16216 + }, + { + "epoch": 4.977593615715163, + "grad_norm": 0.21647346019744873, + "learning_rate": 5.2796964430930585e-05, + "loss": 1.7967, + "step": 16217 + }, + { + "epoch": 4.9779005524861875, + "grad_norm": 0.2231660932302475, + "learning_rate": 5.279200164030002e-05, + "loss": 1.7495, + "step": 16218 + }, + { + "epoch": 4.978207489257213, + "grad_norm": 0.2810545563697815, + "learning_rate": 5.278703882207728e-05, + "loss": 1.875, + "step": 16219 + }, + { + "epoch": 4.978514426028238, + "grad_norm": 0.298984557390213, + "learning_rate": 5.2782075976311374e-05, + "loss": 1.7494, + "step": 16220 + }, + { + "epoch": 4.9788213627992635, + "grad_norm": 0.2530893385410309, + "learning_rate": 5.2777113103051365e-05, + "loss": 1.7594, + "step": 16221 + }, + { + "epoch": 4.979128299570289, + "grad_norm": 0.26165664196014404, + "learning_rate": 5.277215020234629e-05, + "loss": 1.7543, + "step": 16222 + }, + { + "epoch": 4.979435236341313, + "grad_norm": 0.25115957856178284, + "learning_rate": 5.276718727424521e-05, + "loss": 1.7925, + "step": 16223 + }, + { + "epoch": 4.979742173112339, + "grad_norm": 0.22134126722812653, + "learning_rate": 5.276222431879716e-05, + "loss": 1.8359, + "step": 16224 + }, + { + "epoch": 4.980049109883364, + "grad_norm": 0.24447613954544067, + "learning_rate": 5.275726133605119e-05, + "loss": 1.7693, + "step": 16225 + }, + { + "epoch": 4.980356046654389, + "grad_norm": 0.23025095462799072, + "learning_rate": 5.275229832605635e-05, + "loss": 1.7911, + "step": 16226 + }, + { + "epoch": 4.980662983425415, + "grad_norm": 0.23424232006072998, + "learning_rate": 5.2747335288861686e-05, + "loss": 1.7628, + "step": 16227 + }, + { + "epoch": 4.98096992019644, + "grad_norm": 0.24598535895347595, + "learning_rate": 5.2742372224516235e-05, + "loss": 1.7651, + "step": 16228 + }, + { + 
"epoch": 4.981276856967464, + "grad_norm": 0.262893944978714, + "learning_rate": 5.273740913306906e-05, + "loss": 1.7282, + "step": 16229 + }, + { + "epoch": 4.98158379373849, + "grad_norm": 0.21981783211231232, + "learning_rate": 5.2732446014569207e-05, + "loss": 1.7448, + "step": 16230 + }, + { + "epoch": 4.981890730509515, + "grad_norm": 0.24244973063468933, + "learning_rate": 5.272748286906573e-05, + "loss": 1.7216, + "step": 16231 + }, + { + "epoch": 4.98219766728054, + "grad_norm": 0.2365221232175827, + "learning_rate": 5.272251969660766e-05, + "loss": 1.7227, + "step": 16232 + }, + { + "epoch": 4.982504604051566, + "grad_norm": 0.2081129401922226, + "learning_rate": 5.271755649724405e-05, + "loss": 1.7184, + "step": 16233 + }, + { + "epoch": 4.98281154082259, + "grad_norm": 0.2256374955177307, + "learning_rate": 5.271259327102395e-05, + "loss": 1.7412, + "step": 16234 + }, + { + "epoch": 4.9831184775936155, + "grad_norm": 0.23727381229400635, + "learning_rate": 5.270763001799643e-05, + "loss": 1.8095, + "step": 16235 + }, + { + "epoch": 4.983425414364641, + "grad_norm": 0.21498435735702515, + "learning_rate": 5.2702666738210504e-05, + "loss": 1.744, + "step": 16236 + }, + { + "epoch": 4.983732351135666, + "grad_norm": 0.24772173166275024, + "learning_rate": 5.269770343171525e-05, + "loss": 1.741, + "step": 16237 + }, + { + "epoch": 4.9840392879066915, + "grad_norm": 0.2835623621940613, + "learning_rate": 5.269274009855971e-05, + "loss": 1.7765, + "step": 16238 + }, + { + "epoch": 4.984346224677717, + "grad_norm": 0.2570044696331024, + "learning_rate": 5.2687776738792926e-05, + "loss": 1.8206, + "step": 16239 + }, + { + "epoch": 4.984653161448741, + "grad_norm": 0.21549640595912933, + "learning_rate": 5.268281335246397e-05, + "loss": 1.7022, + "step": 16240 + }, + { + "epoch": 4.984960098219767, + "grad_norm": 0.23158684372901917, + "learning_rate": 5.267784993962187e-05, + "loss": 1.7882, + "step": 16241 + }, + { + "epoch": 4.985267034990792, + "grad_norm": 
0.22778423130512238, + "learning_rate": 5.26728865003157e-05, + "loss": 1.7358, + "step": 16242 + }, + { + "epoch": 4.985573971761817, + "grad_norm": 0.23197145760059357, + "learning_rate": 5.266792303459449e-05, + "loss": 1.7687, + "step": 16243 + }, + { + "epoch": 4.985880908532843, + "grad_norm": 0.19270172715187073, + "learning_rate": 5.26629595425073e-05, + "loss": 1.6999, + "step": 16244 + }, + { + "epoch": 4.986187845303867, + "grad_norm": 0.25262632966041565, + "learning_rate": 5.2657996024103175e-05, + "loss": 1.7536, + "step": 16245 + }, + { + "epoch": 4.986494782074892, + "grad_norm": 0.18620926141738892, + "learning_rate": 5.2653032479431185e-05, + "loss": 1.7033, + "step": 16246 + }, + { + "epoch": 4.986801718845918, + "grad_norm": 0.19537273049354553, + "learning_rate": 5.2648068908540374e-05, + "loss": 1.7457, + "step": 16247 + }, + { + "epoch": 4.987108655616943, + "grad_norm": 0.19447599351406097, + "learning_rate": 5.26431053114798e-05, + "loss": 1.7053, + "step": 16248 + }, + { + "epoch": 4.987415592387968, + "grad_norm": 0.20431137084960938, + "learning_rate": 5.263814168829852e-05, + "loss": 1.7695, + "step": 16249 + }, + { + "epoch": 4.987722529158994, + "grad_norm": 0.21123024821281433, + "learning_rate": 5.263317803904554e-05, + "loss": 1.7666, + "step": 16250 + }, + { + "epoch": 4.988029465930018, + "grad_norm": 0.21279335021972656, + "learning_rate": 5.262821436376998e-05, + "loss": 1.7231, + "step": 16251 + }, + { + "epoch": 4.9883364027010435, + "grad_norm": 0.22504910826683044, + "learning_rate": 5.262325066252085e-05, + "loss": 1.7657, + "step": 16252 + }, + { + "epoch": 4.988643339472069, + "grad_norm": 0.23505981266498566, + "learning_rate": 5.261828693534723e-05, + "loss": 1.7576, + "step": 16253 + }, + { + "epoch": 4.988950276243094, + "grad_norm": 0.21553601324558258, + "learning_rate": 5.261332318229817e-05, + "loss": 1.7782, + "step": 16254 + }, + { + "epoch": 4.989257213014119, + "grad_norm": 0.29189521074295044, + 
"learning_rate": 5.26083594034227e-05, + "loss": 1.7664, + "step": 16255 + }, + { + "epoch": 4.989564149785144, + "grad_norm": 0.38108906149864197, + "learning_rate": 5.26033955987699e-05, + "loss": 1.8573, + "step": 16256 + }, + { + "epoch": 4.989871086556169, + "grad_norm": 0.30329224467277527, + "learning_rate": 5.2598431768388824e-05, + "loss": 1.7584, + "step": 16257 + }, + { + "epoch": 4.990178023327195, + "grad_norm": 0.2437417358160019, + "learning_rate": 5.259346791232852e-05, + "loss": 1.7352, + "step": 16258 + }, + { + "epoch": 4.99048496009822, + "grad_norm": 0.3601737320423126, + "learning_rate": 5.258850403063804e-05, + "loss": 1.7206, + "step": 16259 + }, + { + "epoch": 4.990791896869245, + "grad_norm": 0.20259195566177368, + "learning_rate": 5.258354012336646e-05, + "loss": 1.7403, + "step": 16260 + }, + { + "epoch": 4.99109883364027, + "grad_norm": 0.38022148609161377, + "learning_rate": 5.257857619056281e-05, + "loss": 1.7783, + "step": 16261 + }, + { + "epoch": 4.991405770411295, + "grad_norm": 0.30131712555885315, + "learning_rate": 5.257361223227615e-05, + "loss": 1.7826, + "step": 16262 + }, + { + "epoch": 4.99171270718232, + "grad_norm": 0.24159663915634155, + "learning_rate": 5.2568648248555565e-05, + "loss": 1.7792, + "step": 16263 + }, + { + "epoch": 4.992019643953346, + "grad_norm": 0.4641213119029999, + "learning_rate": 5.2563684239450084e-05, + "loss": 1.7432, + "step": 16264 + }, + { + "epoch": 4.992326580724371, + "grad_norm": 0.3526865541934967, + "learning_rate": 5.255872020500877e-05, + "loss": 1.7736, + "step": 16265 + }, + { + "epoch": 4.9926335174953955, + "grad_norm": 0.2396051585674286, + "learning_rate": 5.255375614528071e-05, + "loss": 1.7505, + "step": 16266 + }, + { + "epoch": 4.992940454266421, + "grad_norm": 0.320987343788147, + "learning_rate": 5.25487920603149e-05, + "loss": 1.8229, + "step": 16267 + }, + { + "epoch": 4.993247391037446, + "grad_norm": 0.24689678847789764, + "learning_rate": 5.254382795016044e-05, + 
"loss": 1.7011, + "step": 16268 + }, + { + "epoch": 4.9935543278084715, + "grad_norm": 0.2407137155532837, + "learning_rate": 5.253886381486639e-05, + "loss": 1.741, + "step": 16269 + }, + { + "epoch": 4.993861264579497, + "grad_norm": 0.3677252531051636, + "learning_rate": 5.25338996544818e-05, + "loss": 1.7792, + "step": 16270 + }, + { + "epoch": 4.994168201350522, + "grad_norm": 0.25096553564071655, + "learning_rate": 5.252893546905573e-05, + "loss": 1.7523, + "step": 16271 + }, + { + "epoch": 4.994475138121547, + "grad_norm": 0.2966327965259552, + "learning_rate": 5.252397125863723e-05, + "loss": 1.7114, + "step": 16272 + }, + { + "epoch": 4.994782074892572, + "grad_norm": 0.36577650904655457, + "learning_rate": 5.2519007023275356e-05, + "loss": 1.7609, + "step": 16273 + }, + { + "epoch": 4.995089011663597, + "grad_norm": 0.2450687140226364, + "learning_rate": 5.25140427630192e-05, + "loss": 1.7452, + "step": 16274 + }, + { + "epoch": 4.995395948434623, + "grad_norm": 0.20782120525836945, + "learning_rate": 5.250907847791778e-05, + "loss": 1.7109, + "step": 16275 + }, + { + "epoch": 4.995702885205648, + "grad_norm": 0.2423330545425415, + "learning_rate": 5.25041141680202e-05, + "loss": 1.7234, + "step": 16276 + }, + { + "epoch": 4.996009821976672, + "grad_norm": 0.20855975151062012, + "learning_rate": 5.2499149833375484e-05, + "loss": 1.7734, + "step": 16277 + }, + { + "epoch": 4.996316758747698, + "grad_norm": 0.24400894343852997, + "learning_rate": 5.24941854740327e-05, + "loss": 1.7566, + "step": 16278 + }, + { + "epoch": 4.996623695518723, + "grad_norm": 0.4378018379211426, + "learning_rate": 5.2489221090040906e-05, + "loss": 1.7536, + "step": 16279 + }, + { + "epoch": 4.996930632289748, + "grad_norm": 0.20726722478866577, + "learning_rate": 5.248425668144918e-05, + "loss": 1.8008, + "step": 16280 + }, + { + "epoch": 4.997237569060774, + "grad_norm": 0.2506333589553833, + "learning_rate": 5.247929224830658e-05, + "loss": 1.7404, + "step": 16281 + }, + { + 
"epoch": 4.997544505831799, + "grad_norm": 0.24178004264831543, + "learning_rate": 5.247432779066216e-05, + "loss": 1.7517, + "step": 16282 + }, + { + "epoch": 4.9978514426028235, + "grad_norm": 0.2500220835208893, + "learning_rate": 5.246936330856499e-05, + "loss": 1.7705, + "step": 16283 + }, + { + "epoch": 4.998158379373849, + "grad_norm": 0.30043718218803406, + "learning_rate": 5.24643988020641e-05, + "loss": 1.8118, + "step": 16284 + }, + { + "epoch": 4.998465316144874, + "grad_norm": 0.284805566072464, + "learning_rate": 5.245943427120859e-05, + "loss": 1.7968, + "step": 16285 + }, + { + "epoch": 4.9987722529158995, + "grad_norm": 0.3652406632900238, + "learning_rate": 5.245446971604751e-05, + "loss": 1.7785, + "step": 16286 + }, + { + "epoch": 4.999079189686924, + "grad_norm": 0.24879656732082367, + "learning_rate": 5.244950513662992e-05, + "loss": 1.734, + "step": 16287 + }, + { + "epoch": 4.999386126457949, + "grad_norm": 0.2374224215745926, + "learning_rate": 5.244454053300488e-05, + "loss": 1.7394, + "step": 16288 + }, + { + "epoch": 4.999693063228975, + "grad_norm": 0.27090463042259216, + "learning_rate": 5.243957590522147e-05, + "loss": 1.7529, + "step": 16289 + }, + { + "epoch": 5.0, + "grad_norm": 0.23060791194438934, + "learning_rate": 5.243461125332873e-05, + "loss": 1.7599, + "step": 16290 + }, + { + "epoch": 5.000306936771025, + "grad_norm": 0.21159487962722778, + "learning_rate": 5.242964657737572e-05, + "loss": 1.747, + "step": 16291 + }, + { + "epoch": 5.000613873542051, + "grad_norm": 0.21556304395198822, + "learning_rate": 5.242468187741154e-05, + "loss": 1.7653, + "step": 16292 + }, + { + "epoch": 5.000920810313075, + "grad_norm": 0.2569669783115387, + "learning_rate": 5.241971715348524e-05, + "loss": 1.7284, + "step": 16293 + }, + { + "epoch": 5.0012277470841005, + "grad_norm": 0.2827381491661072, + "learning_rate": 5.241475240564586e-05, + "loss": 1.7765, + "step": 16294 + }, + { + "epoch": 5.001534683855126, + "grad_norm": 
0.22498267889022827, + "learning_rate": 5.240978763394249e-05, + "loss": 1.729, + "step": 16295 + }, + { + "epoch": 5.001841620626151, + "grad_norm": 0.23975814878940582, + "learning_rate": 5.240482283842418e-05, + "loss": 1.7968, + "step": 16296 + }, + { + "epoch": 5.0021485573971765, + "grad_norm": 0.20811420679092407, + "learning_rate": 5.239985801914e-05, + "loss": 1.6931, + "step": 16297 + }, + { + "epoch": 5.002455494168202, + "grad_norm": 0.22985060513019562, + "learning_rate": 5.2394893176139014e-05, + "loss": 1.7724, + "step": 16298 + }, + { + "epoch": 5.002762430939226, + "grad_norm": 0.22867995500564575, + "learning_rate": 5.2389928309470305e-05, + "loss": 1.7179, + "step": 16299 + }, + { + "epoch": 5.003069367710252, + "grad_norm": 0.2543974220752716, + "learning_rate": 5.238496341918293e-05, + "loss": 1.7859, + "step": 16300 + }, + { + "epoch": 5.003376304481277, + "grad_norm": 0.226583793759346, + "learning_rate": 5.237999850532592e-05, + "loss": 1.7567, + "step": 16301 + }, + { + "epoch": 5.003683241252302, + "grad_norm": 0.21744728088378906, + "learning_rate": 5.237503356794838e-05, + "loss": 1.7345, + "step": 16302 + }, + { + "epoch": 5.003990178023328, + "grad_norm": 0.25915467739105225, + "learning_rate": 5.2370068607099373e-05, + "loss": 1.7179, + "step": 16303 + }, + { + "epoch": 5.004297114794352, + "grad_norm": 0.20572461187839508, + "learning_rate": 5.236510362282796e-05, + "loss": 1.7211, + "step": 16304 + }, + { + "epoch": 5.004604051565377, + "grad_norm": 0.2821461856365204, + "learning_rate": 5.236013861518321e-05, + "loss": 1.7894, + "step": 16305 + }, + { + "epoch": 5.004910988336403, + "grad_norm": 0.22273759543895721, + "learning_rate": 5.235517358421417e-05, + "loss": 1.7919, + "step": 16306 + }, + { + "epoch": 5.005217925107428, + "grad_norm": 0.23875468969345093, + "learning_rate": 5.2350208529969935e-05, + "loss": 1.7558, + "step": 16307 + }, + { + "epoch": 5.005524861878453, + "grad_norm": 0.24673783779144287, + "learning_rate": 
5.234524345249955e-05, + "loss": 1.7705, + "step": 16308 + }, + { + "epoch": 5.005831798649478, + "grad_norm": 0.21992872655391693, + "learning_rate": 5.234027835185211e-05, + "loss": 1.7059, + "step": 16309 + }, + { + "epoch": 5.006138735420503, + "grad_norm": 0.19214966893196106, + "learning_rate": 5.233531322807667e-05, + "loss": 1.6647, + "step": 16310 + }, + { + "epoch": 5.0064456721915285, + "grad_norm": 0.18525120615959167, + "learning_rate": 5.233034808122228e-05, + "loss": 1.719, + "step": 16311 + }, + { + "epoch": 5.006752608962554, + "grad_norm": 0.25996243953704834, + "learning_rate": 5.232538291133804e-05, + "loss": 1.7227, + "step": 16312 + }, + { + "epoch": 5.007059545733579, + "grad_norm": 0.2163757085800171, + "learning_rate": 5.232041771847299e-05, + "loss": 1.6962, + "step": 16313 + }, + { + "epoch": 5.0073664825046045, + "grad_norm": 0.23484158515930176, + "learning_rate": 5.231545250267621e-05, + "loss": 1.7816, + "step": 16314 + }, + { + "epoch": 5.007673419275629, + "grad_norm": 0.2188636213541031, + "learning_rate": 5.2310487263996776e-05, + "loss": 1.7477, + "step": 16315 + }, + { + "epoch": 5.007980356046654, + "grad_norm": 0.1950213611125946, + "learning_rate": 5.230552200248377e-05, + "loss": 1.7165, + "step": 16316 + }, + { + "epoch": 5.00828729281768, + "grad_norm": 0.25340089201927185, + "learning_rate": 5.230055671818623e-05, + "loss": 1.7764, + "step": 16317 + }, + { + "epoch": 5.008594229588705, + "grad_norm": 0.23749271035194397, + "learning_rate": 5.2295591411153245e-05, + "loss": 1.7193, + "step": 16318 + }, + { + "epoch": 5.00890116635973, + "grad_norm": 0.2317294180393219, + "learning_rate": 5.229062608143387e-05, + "loss": 1.7607, + "step": 16319 + }, + { + "epoch": 5.009208103130755, + "grad_norm": 0.2751505672931671, + "learning_rate": 5.228566072907719e-05, + "loss": 1.7562, + "step": 16320 + }, + { + "epoch": 5.00951503990178, + "grad_norm": 0.29476025700569153, + "learning_rate": 5.2280695354132267e-05, + "loss": 1.687, 
+ "step": 16321 + }, + { + "epoch": 5.009821976672805, + "grad_norm": 0.20734120905399323, + "learning_rate": 5.227572995664819e-05, + "loss": 1.7608, + "step": 16322 + }, + { + "epoch": 5.010128913443831, + "grad_norm": 0.2537878155708313, + "learning_rate": 5.227076453667401e-05, + "loss": 1.7947, + "step": 16323 + }, + { + "epoch": 5.010435850214856, + "grad_norm": 0.23516076803207397, + "learning_rate": 5.2265799094258796e-05, + "loss": 1.7545, + "step": 16324 + }, + { + "epoch": 5.0107427869858805, + "grad_norm": 0.2581529915332794, + "learning_rate": 5.226083362945162e-05, + "loss": 1.7529, + "step": 16325 + }, + { + "epoch": 5.011049723756906, + "grad_norm": 0.2982035279273987, + "learning_rate": 5.225586814230158e-05, + "loss": 1.74, + "step": 16326 + }, + { + "epoch": 5.011356660527931, + "grad_norm": 0.2773981988430023, + "learning_rate": 5.225090263285772e-05, + "loss": 1.7562, + "step": 16327 + }, + { + "epoch": 5.0116635972989565, + "grad_norm": 0.19992689788341522, + "learning_rate": 5.2245937101169116e-05, + "loss": 1.6896, + "step": 16328 + }, + { + "epoch": 5.011970534069982, + "grad_norm": 0.2913428246974945, + "learning_rate": 5.224097154728486e-05, + "loss": 1.7574, + "step": 16329 + }, + { + "epoch": 5.012277470841007, + "grad_norm": 0.23173104226589203, + "learning_rate": 5.2236005971254e-05, + "loss": 1.6954, + "step": 16330 + }, + { + "epoch": 5.012584407612032, + "grad_norm": 0.2019525170326233, + "learning_rate": 5.2231040373125614e-05, + "loss": 1.7711, + "step": 16331 + }, + { + "epoch": 5.012891344383057, + "grad_norm": 0.29070746898651123, + "learning_rate": 5.222607475294878e-05, + "loss": 1.8201, + "step": 16332 + }, + { + "epoch": 5.013198281154082, + "grad_norm": 0.22005079686641693, + "learning_rate": 5.222110911077258e-05, + "loss": 1.7421, + "step": 16333 + }, + { + "epoch": 5.013505217925108, + "grad_norm": 0.24422192573547363, + "learning_rate": 5.2216143446646085e-05, + "loss": 1.7074, + "step": 16334 + }, + { + "epoch": 
5.013812154696133, + "grad_norm": 0.2417927384376526, + "learning_rate": 5.221117776061836e-05, + "loss": 1.7726, + "step": 16335 + }, + { + "epoch": 5.014119091467157, + "grad_norm": 0.245828777551651, + "learning_rate": 5.2206212052738454e-05, + "loss": 1.7932, + "step": 16336 + }, + { + "epoch": 5.014426028238183, + "grad_norm": 0.24054239690303802, + "learning_rate": 5.220124632305548e-05, + "loss": 1.727, + "step": 16337 + }, + { + "epoch": 5.014732965009208, + "grad_norm": 0.2572494149208069, + "learning_rate": 5.21962805716185e-05, + "loss": 1.7234, + "step": 16338 + }, + { + "epoch": 5.015039901780233, + "grad_norm": 0.33624622225761414, + "learning_rate": 5.2191314798476595e-05, + "loss": 1.7499, + "step": 16339 + }, + { + "epoch": 5.015346838551259, + "grad_norm": 0.22321413457393646, + "learning_rate": 5.218634900367883e-05, + "loss": 1.7155, + "step": 16340 + }, + { + "epoch": 5.015653775322283, + "grad_norm": 0.26709917187690735, + "learning_rate": 5.218138318727429e-05, + "loss": 1.8346, + "step": 16341 + }, + { + "epoch": 5.0159607120933085, + "grad_norm": 0.27600952982902527, + "learning_rate": 5.217641734931202e-05, + "loss": 1.789, + "step": 16342 + }, + { + "epoch": 5.016267648864334, + "grad_norm": 0.21392405033111572, + "learning_rate": 5.217145148984114e-05, + "loss": 1.7266, + "step": 16343 + }, + { + "epoch": 5.016574585635359, + "grad_norm": 0.3215450942516327, + "learning_rate": 5.2166485608910696e-05, + "loss": 1.7453, + "step": 16344 + }, + { + "epoch": 5.0168815224063845, + "grad_norm": 0.22328032553195953, + "learning_rate": 5.2161519706569776e-05, + "loss": 1.7209, + "step": 16345 + }, + { + "epoch": 5.01718845917741, + "grad_norm": 0.2438887059688568, + "learning_rate": 5.215655378286744e-05, + "loss": 1.7289, + "step": 16346 + }, + { + "epoch": 5.017495395948434, + "grad_norm": 0.30078747868537903, + "learning_rate": 5.2151587837852786e-05, + "loss": 1.7483, + "step": 16347 + }, + { + "epoch": 5.01780233271946, + "grad_norm": 
0.21723167598247528, + "learning_rate": 5.214662187157488e-05, + "loss": 1.7654, + "step": 16348 + }, + { + "epoch": 5.018109269490485, + "grad_norm": 0.26358669996261597, + "learning_rate": 5.2141655884082784e-05, + "loss": 1.7563, + "step": 16349 + }, + { + "epoch": 5.01841620626151, + "grad_norm": 0.24285505712032318, + "learning_rate": 5.2136689875425615e-05, + "loss": 1.7377, + "step": 16350 + }, + { + "epoch": 5.018723143032536, + "grad_norm": 0.2401108294725418, + "learning_rate": 5.2131723845652416e-05, + "loss": 1.7445, + "step": 16351 + }, + { + "epoch": 5.01903007980356, + "grad_norm": 0.3347793519496918, + "learning_rate": 5.212675779481226e-05, + "loss": 1.7872, + "step": 16352 + }, + { + "epoch": 5.019337016574585, + "grad_norm": 0.306728720664978, + "learning_rate": 5.212179172295424e-05, + "loss": 1.8051, + "step": 16353 + }, + { + "epoch": 5.019643953345611, + "grad_norm": 0.22297725081443787, + "learning_rate": 5.211682563012743e-05, + "loss": 1.7082, + "step": 16354 + }, + { + "epoch": 5.019950890116636, + "grad_norm": 0.24047277867794037, + "learning_rate": 5.211185951638091e-05, + "loss": 1.7024, + "step": 16355 + }, + { + "epoch": 5.020257826887661, + "grad_norm": 0.19570080935955048, + "learning_rate": 5.210689338176377e-05, + "loss": 1.6947, + "step": 16356 + }, + { + "epoch": 5.020564763658686, + "grad_norm": 0.2024889886379242, + "learning_rate": 5.2101927226325066e-05, + "loss": 1.7168, + "step": 16357 + }, + { + "epoch": 5.020871700429711, + "grad_norm": 0.23546278476715088, + "learning_rate": 5.209696105011388e-05, + "loss": 1.7697, + "step": 16358 + }, + { + "epoch": 5.0211786372007365, + "grad_norm": 0.21003498136997223, + "learning_rate": 5.209199485317928e-05, + "loss": 1.7198, + "step": 16359 + }, + { + "epoch": 5.021485573971762, + "grad_norm": 0.21375493705272675, + "learning_rate": 5.208702863557039e-05, + "loss": 1.7689, + "step": 16360 + }, + { + "epoch": 5.021792510742787, + "grad_norm": 0.21549762785434723, + 
"learning_rate": 5.2082062397336254e-05, + "loss": 1.6936, + "step": 16361 + }, + { + "epoch": 5.0220994475138125, + "grad_norm": 0.22633691132068634, + "learning_rate": 5.207709613852595e-05, + "loss": 1.7512, + "step": 16362 + }, + { + "epoch": 5.022406384284837, + "grad_norm": 0.21888238191604614, + "learning_rate": 5.2072129859188566e-05, + "loss": 1.7082, + "step": 16363 + }, + { + "epoch": 5.022713321055862, + "grad_norm": 0.2416619062423706, + "learning_rate": 5.206716355937318e-05, + "loss": 1.7938, + "step": 16364 + }, + { + "epoch": 5.023020257826888, + "grad_norm": 0.22451527416706085, + "learning_rate": 5.206219723912886e-05, + "loss": 1.7372, + "step": 16365 + }, + { + "epoch": 5.023327194597913, + "grad_norm": 0.19698494672775269, + "learning_rate": 5.2057230898504716e-05, + "loss": 1.7205, + "step": 16366 + }, + { + "epoch": 5.023634131368938, + "grad_norm": 0.2441127747297287, + "learning_rate": 5.205226453754982e-05, + "loss": 1.7625, + "step": 16367 + }, + { + "epoch": 5.023941068139963, + "grad_norm": 0.21940121054649353, + "learning_rate": 5.204729815631323e-05, + "loss": 1.7985, + "step": 16368 + }, + { + "epoch": 5.024248004910988, + "grad_norm": 0.21751399338245392, + "learning_rate": 5.204233175484403e-05, + "loss": 1.7759, + "step": 16369 + }, + { + "epoch": 5.024554941682013, + "grad_norm": 0.20261377096176147, + "learning_rate": 5.2037365333191315e-05, + "loss": 1.746, + "step": 16370 + }, + { + "epoch": 5.024861878453039, + "grad_norm": 0.2628774046897888, + "learning_rate": 5.2032398891404166e-05, + "loss": 1.8178, + "step": 16371 + }, + { + "epoch": 5.025168815224064, + "grad_norm": 0.20626378059387207, + "learning_rate": 5.2027432429531665e-05, + "loss": 1.7456, + "step": 16372 + }, + { + "epoch": 5.0254757519950894, + "grad_norm": 0.25548869371414185, + "learning_rate": 5.2022465947622876e-05, + "loss": 1.8098, + "step": 16373 + }, + { + "epoch": 5.025782688766114, + "grad_norm": 0.1978374719619751, + "learning_rate": 
5.20174994457269e-05, + "loss": 1.685, + "step": 16374 + }, + { + "epoch": 5.026089625537139, + "grad_norm": 0.2708980143070221, + "learning_rate": 5.201253292389282e-05, + "loss": 1.7464, + "step": 16375 + }, + { + "epoch": 5.026396562308165, + "grad_norm": 0.2730494737625122, + "learning_rate": 5.2007566382169706e-05, + "loss": 1.7391, + "step": 16376 + }, + { + "epoch": 5.02670349907919, + "grad_norm": 0.243557408452034, + "learning_rate": 5.2002599820606624e-05, + "loss": 1.7439, + "step": 16377 + }, + { + "epoch": 5.027010435850215, + "grad_norm": 0.2208259105682373, + "learning_rate": 5.19976332392527e-05, + "loss": 1.7612, + "step": 16378 + }, + { + "epoch": 5.02731737262124, + "grad_norm": 0.21288715302944183, + "learning_rate": 5.199266663815698e-05, + "loss": 1.7546, + "step": 16379 + }, + { + "epoch": 5.027624309392265, + "grad_norm": 0.2106054425239563, + "learning_rate": 5.198770001736857e-05, + "loss": 1.7281, + "step": 16380 + }, + { + "epoch": 5.02793124616329, + "grad_norm": 0.2247164249420166, + "learning_rate": 5.198273337693654e-05, + "loss": 1.8405, + "step": 16381 + }, + { + "epoch": 5.028238182934316, + "grad_norm": 0.21713724732398987, + "learning_rate": 5.197776671690998e-05, + "loss": 1.7333, + "step": 16382 + }, + { + "epoch": 5.028545119705341, + "grad_norm": 0.24063727259635925, + "learning_rate": 5.1972800037337956e-05, + "loss": 1.7608, + "step": 16383 + }, + { + "epoch": 5.0288520564763655, + "grad_norm": 0.22022177278995514, + "learning_rate": 5.196783333826959e-05, + "loss": 1.7045, + "step": 16384 + }, + { + "epoch": 5.029158993247391, + "grad_norm": 0.21348948776721954, + "learning_rate": 5.1962866619753927e-05, + "loss": 1.7516, + "step": 16385 + }, + { + "epoch": 5.029465930018416, + "grad_norm": 0.289315789937973, + "learning_rate": 5.195789988184007e-05, + "loss": 1.8555, + "step": 16386 + }, + { + "epoch": 5.0297728667894415, + "grad_norm": 0.30966848134994507, + "learning_rate": 5.19529331245771e-05, + "loss": 1.7245, + 
"step": 16387 + }, + { + "epoch": 5.030079803560467, + "grad_norm": 0.24625633656978607, + "learning_rate": 5.194796634801409e-05, + "loss": 1.7788, + "step": 16388 + }, + { + "epoch": 5.030386740331492, + "grad_norm": 0.25937986373901367, + "learning_rate": 5.1942999552200136e-05, + "loss": 1.7655, + "step": 16389 + }, + { + "epoch": 5.030693677102517, + "grad_norm": 0.3056741952896118, + "learning_rate": 5.1938032737184325e-05, + "loss": 1.7167, + "step": 16390 + }, + { + "epoch": 5.031000613873542, + "grad_norm": 0.29773563146591187, + "learning_rate": 5.1933065903015743e-05, + "loss": 1.7247, + "step": 16391 + }, + { + "epoch": 5.031307550644567, + "grad_norm": 0.26433971524238586, + "learning_rate": 5.192809904974347e-05, + "loss": 1.7779, + "step": 16392 + }, + { + "epoch": 5.031614487415593, + "grad_norm": 0.3308073580265045, + "learning_rate": 5.192313217741659e-05, + "loss": 1.7782, + "step": 16393 + }, + { + "epoch": 5.031921424186618, + "grad_norm": 0.2584165632724762, + "learning_rate": 5.1918165286084176e-05, + "loss": 1.7812, + "step": 16394 + }, + { + "epoch": 5.032228360957642, + "grad_norm": 0.31678953766822815, + "learning_rate": 5.1913198375795346e-05, + "loss": 1.7341, + "step": 16395 + }, + { + "epoch": 5.032535297728668, + "grad_norm": 0.3527325391769409, + "learning_rate": 5.190823144659916e-05, + "loss": 1.7844, + "step": 16396 + }, + { + "epoch": 5.032842234499693, + "grad_norm": 0.29233935475349426, + "learning_rate": 5.1903264498544724e-05, + "loss": 1.7993, + "step": 16397 + }, + { + "epoch": 5.033149171270718, + "grad_norm": 0.24549467861652374, + "learning_rate": 5.1898297531681106e-05, + "loss": 1.7294, + "step": 16398 + }, + { + "epoch": 5.033456108041744, + "grad_norm": 0.3446930944919586, + "learning_rate": 5.18933305460574e-05, + "loss": 1.6818, + "step": 16399 + }, + { + "epoch": 5.033763044812768, + "grad_norm": 0.2628229856491089, + "learning_rate": 5.188836354172268e-05, + "loss": 1.7867, + "step": 16400 + }, + { + "epoch": 
5.0340699815837935, + "grad_norm": 0.26548629999160767, + "learning_rate": 5.188339651872607e-05, + "loss": 1.7448, + "step": 16401 + }, + { + "epoch": 5.034376918354819, + "grad_norm": 0.29242032766342163, + "learning_rate": 5.187842947711662e-05, + "loss": 1.7103, + "step": 16402 + }, + { + "epoch": 5.034683855125844, + "grad_norm": 0.2515408992767334, + "learning_rate": 5.187346241694343e-05, + "loss": 1.7865, + "step": 16403 + }, + { + "epoch": 5.0349907918968695, + "grad_norm": 0.2253103256225586, + "learning_rate": 5.186849533825559e-05, + "loss": 1.6993, + "step": 16404 + }, + { + "epoch": 5.035297728667895, + "grad_norm": 0.2743360102176666, + "learning_rate": 5.1863528241102154e-05, + "loss": 1.7532, + "step": 16405 + }, + { + "epoch": 5.035604665438919, + "grad_norm": 0.22807851433753967, + "learning_rate": 5.185856112553227e-05, + "loss": 1.7873, + "step": 16406 + }, + { + "epoch": 5.035911602209945, + "grad_norm": 0.23719090223312378, + "learning_rate": 5.1853593991594985e-05, + "loss": 1.7555, + "step": 16407 + }, + { + "epoch": 5.03621853898097, + "grad_norm": 0.2964477241039276, + "learning_rate": 5.184862683933941e-05, + "loss": 1.7204, + "step": 16408 + }, + { + "epoch": 5.036525475751995, + "grad_norm": 0.23717865347862244, + "learning_rate": 5.18436596688146e-05, + "loss": 1.7239, + "step": 16409 + }, + { + "epoch": 5.036832412523021, + "grad_norm": 0.22650085389614105, + "learning_rate": 5.1838692480069686e-05, + "loss": 1.7148, + "step": 16410 + }, + { + "epoch": 5.037139349294045, + "grad_norm": 0.25606781244277954, + "learning_rate": 5.183372527315371e-05, + "loss": 1.7916, + "step": 16411 + }, + { + "epoch": 5.03744628606507, + "grad_norm": 0.22266390919685364, + "learning_rate": 5.182875804811581e-05, + "loss": 1.7481, + "step": 16412 + }, + { + "epoch": 5.037753222836096, + "grad_norm": 0.23481780290603638, + "learning_rate": 5.1823790805005045e-05, + "loss": 1.8014, + "step": 16413 + }, + { + "epoch": 5.038060159607121, + "grad_norm": 
0.2629338800907135, + "learning_rate": 5.1818823543870506e-05, + "loss": 1.81, + "step": 16414 + }, + { + "epoch": 5.038367096378146, + "grad_norm": 0.22891482710838318, + "learning_rate": 5.18138562647613e-05, + "loss": 1.757, + "step": 16415 + }, + { + "epoch": 5.038674033149171, + "grad_norm": 0.2666641175746918, + "learning_rate": 5.180888896772649e-05, + "loss": 1.7457, + "step": 16416 + }, + { + "epoch": 5.038980969920196, + "grad_norm": 0.37610310316085815, + "learning_rate": 5.180392165281517e-05, + "loss": 1.8214, + "step": 16417 + }, + { + "epoch": 5.0392879066912215, + "grad_norm": 0.2521277964115143, + "learning_rate": 5.1798954320076455e-05, + "loss": 1.7731, + "step": 16418 + }, + { + "epoch": 5.039594843462247, + "grad_norm": 0.25097090005874634, + "learning_rate": 5.1793986969559415e-05, + "loss": 1.8029, + "step": 16419 + }, + { + "epoch": 5.039901780233272, + "grad_norm": 0.2946726381778717, + "learning_rate": 5.178901960131315e-05, + "loss": 1.7483, + "step": 16420 + }, + { + "epoch": 5.0402087170042975, + "grad_norm": 0.24240419268608093, + "learning_rate": 5.1784052215386736e-05, + "loss": 1.731, + "step": 16421 + }, + { + "epoch": 5.040515653775322, + "grad_norm": 0.2403198480606079, + "learning_rate": 5.177908481182926e-05, + "loss": 1.722, + "step": 16422 + }, + { + "epoch": 5.040822590546347, + "grad_norm": 0.3451874554157257, + "learning_rate": 5.177411739068985e-05, + "loss": 1.7562, + "step": 16423 + }, + { + "epoch": 5.041129527317373, + "grad_norm": 0.3244951069355011, + "learning_rate": 5.176914995201756e-05, + "loss": 1.7321, + "step": 16424 + }, + { + "epoch": 5.041436464088398, + "grad_norm": 0.2346230000257492, + "learning_rate": 5.176418249586149e-05, + "loss": 1.7839, + "step": 16425 + }, + { + "epoch": 5.041743400859423, + "grad_norm": 0.357022225856781, + "learning_rate": 5.1759215022270744e-05, + "loss": 1.7776, + "step": 16426 + }, + { + "epoch": 5.042050337630448, + "grad_norm": 0.259007066488266, + "learning_rate": 
5.17542475312944e-05, + "loss": 1.7544, + "step": 16427 + }, + { + "epoch": 5.042357274401473, + "grad_norm": 0.2516533136367798, + "learning_rate": 5.174928002298154e-05, + "loss": 1.7269, + "step": 16428 + }, + { + "epoch": 5.042664211172498, + "grad_norm": 0.3393619954586029, + "learning_rate": 5.174431249738129e-05, + "loss": 1.7487, + "step": 16429 + }, + { + "epoch": 5.042971147943524, + "grad_norm": 0.2730594873428345, + "learning_rate": 5.1739344954542714e-05, + "loss": 1.7468, + "step": 16430 + }, + { + "epoch": 5.043278084714549, + "grad_norm": 0.21233965456485748, + "learning_rate": 5.1734377394514914e-05, + "loss": 1.783, + "step": 16431 + }, + { + "epoch": 5.043585021485574, + "grad_norm": 0.3460896909236908, + "learning_rate": 5.1729409817346974e-05, + "loss": 1.7497, + "step": 16432 + }, + { + "epoch": 5.043891958256599, + "grad_norm": 0.31918221712112427, + "learning_rate": 5.1724442223088e-05, + "loss": 1.7834, + "step": 16433 + }, + { + "epoch": 5.044198895027624, + "grad_norm": 0.23016802966594696, + "learning_rate": 5.171947461178706e-05, + "loss": 1.7348, + "step": 16434 + }, + { + "epoch": 5.0445058317986495, + "grad_norm": 0.35758304595947266, + "learning_rate": 5.171450698349329e-05, + "loss": 1.7734, + "step": 16435 + }, + { + "epoch": 5.044812768569675, + "grad_norm": 0.279725581407547, + "learning_rate": 5.170953933825574e-05, + "loss": 1.7283, + "step": 16436 + }, + { + "epoch": 5.0451197053407, + "grad_norm": 0.23965120315551758, + "learning_rate": 5.170457167612354e-05, + "loss": 1.7606, + "step": 16437 + }, + { + "epoch": 5.045426642111725, + "grad_norm": 0.28026309609413147, + "learning_rate": 5.169960399714574e-05, + "loss": 1.7872, + "step": 16438 + }, + { + "epoch": 5.04573357888275, + "grad_norm": 0.3262448012828827, + "learning_rate": 5.169463630137146e-05, + "loss": 1.8654, + "step": 16439 + }, + { + "epoch": 5.046040515653775, + "grad_norm": 0.4249584674835205, + "learning_rate": 5.168966858884979e-05, + "loss": 1.7244, + 
"step": 16440 + }, + { + "epoch": 5.046347452424801, + "grad_norm": 0.3385370969772339, + "learning_rate": 5.168470085962984e-05, + "loss": 1.7745, + "step": 16441 + }, + { + "epoch": 5.046654389195826, + "grad_norm": 0.2321811318397522, + "learning_rate": 5.1679733113760675e-05, + "loss": 1.8093, + "step": 16442 + }, + { + "epoch": 5.04696132596685, + "grad_norm": 0.3426755368709564, + "learning_rate": 5.167476535129141e-05, + "loss": 1.7752, + "step": 16443 + }, + { + "epoch": 5.047268262737876, + "grad_norm": 0.27672505378723145, + "learning_rate": 5.166979757227114e-05, + "loss": 1.7619, + "step": 16444 + }, + { + "epoch": 5.047575199508901, + "grad_norm": 0.4111184775829315, + "learning_rate": 5.1664829776748925e-05, + "loss": 1.7672, + "step": 16445 + }, + { + "epoch": 5.047882136279926, + "grad_norm": 0.40139874815940857, + "learning_rate": 5.1659861964773905e-05, + "loss": 1.7753, + "step": 16446 + }, + { + "epoch": 5.048189073050952, + "grad_norm": 0.28931725025177, + "learning_rate": 5.165489413639516e-05, + "loss": 1.7607, + "step": 16447 + }, + { + "epoch": 5.048496009821977, + "grad_norm": 0.297538161277771, + "learning_rate": 5.1649926291661775e-05, + "loss": 1.7661, + "step": 16448 + }, + { + "epoch": 5.0488029465930016, + "grad_norm": 0.4299027621746063, + "learning_rate": 5.1644958430622846e-05, + "loss": 1.6998, + "step": 16449 + }, + { + "epoch": 5.049109883364027, + "grad_norm": 0.2554767429828644, + "learning_rate": 5.163999055332749e-05, + "loss": 1.7716, + "step": 16450 + }, + { + "epoch": 5.049416820135052, + "grad_norm": 0.3561006486415863, + "learning_rate": 5.163502265982477e-05, + "loss": 1.7493, + "step": 16451 + }, + { + "epoch": 5.0497237569060776, + "grad_norm": 0.3839687407016754, + "learning_rate": 5.1630054750163806e-05, + "loss": 1.7314, + "step": 16452 + }, + { + "epoch": 5.050030693677103, + "grad_norm": 0.20022284984588623, + "learning_rate": 5.1625086824393684e-05, + "loss": 1.6992, + "step": 16453 + }, + { + "epoch": 
5.050337630448127, + "grad_norm": 0.36830398440361023, + "learning_rate": 5.162011888256349e-05, + "loss": 1.7339, + "step": 16454 + }, + { + "epoch": 5.050644567219153, + "grad_norm": 0.31947389245033264, + "learning_rate": 5.161515092472236e-05, + "loss": 1.7254, + "step": 16455 + }, + { + "epoch": 5.050951503990178, + "grad_norm": 0.2779252827167511, + "learning_rate": 5.161018295091933e-05, + "loss": 1.7941, + "step": 16456 + }, + { + "epoch": 5.051258440761203, + "grad_norm": 0.3796578347682953, + "learning_rate": 5.160521496120354e-05, + "loss": 1.7389, + "step": 16457 + }, + { + "epoch": 5.051565377532229, + "grad_norm": 0.23569442331790924, + "learning_rate": 5.1600246955624076e-05, + "loss": 1.7149, + "step": 16458 + }, + { + "epoch": 5.051872314303253, + "grad_norm": 0.27342507243156433, + "learning_rate": 5.159527893423004e-05, + "loss": 1.699, + "step": 16459 + }, + { + "epoch": 5.0521792510742785, + "grad_norm": 0.2877296209335327, + "learning_rate": 5.159031089707052e-05, + "loss": 1.7668, + "step": 16460 + }, + { + "epoch": 5.052486187845304, + "grad_norm": 0.21482446789741516, + "learning_rate": 5.1585342844194605e-05, + "loss": 1.7132, + "step": 16461 + }, + { + "epoch": 5.052793124616329, + "grad_norm": 0.23588669300079346, + "learning_rate": 5.158037477565142e-05, + "loss": 1.7267, + "step": 16462 + }, + { + "epoch": 5.0531000613873545, + "grad_norm": 0.20188623666763306, + "learning_rate": 5.157540669149003e-05, + "loss": 1.7486, + "step": 16463 + }, + { + "epoch": 5.05340699815838, + "grad_norm": 0.2012643963098526, + "learning_rate": 5.157043859175955e-05, + "loss": 1.718, + "step": 16464 + }, + { + "epoch": 5.053713934929404, + "grad_norm": 0.23133818805217743, + "learning_rate": 5.156547047650908e-05, + "loss": 1.7892, + "step": 16465 + }, + { + "epoch": 5.05402087170043, + "grad_norm": 0.2524542510509491, + "learning_rate": 5.156050234578771e-05, + "loss": 1.8034, + "step": 16466 + }, + { + "epoch": 5.054327808471455, + "grad_norm": 
0.20992529392242432, + "learning_rate": 5.155553419964454e-05, + "loss": 1.7158, + "step": 16467 + }, + { + "epoch": 5.05463474524248, + "grad_norm": 0.23815447092056274, + "learning_rate": 5.155056603812868e-05, + "loss": 1.7632, + "step": 16468 + }, + { + "epoch": 5.054941682013506, + "grad_norm": 0.3306051790714264, + "learning_rate": 5.1545597861289205e-05, + "loss": 1.7719, + "step": 16469 + }, + { + "epoch": 5.05524861878453, + "grad_norm": 0.287541925907135, + "learning_rate": 5.154062966917523e-05, + "loss": 1.7092, + "step": 16470 + }, + { + "epoch": 5.055555555555555, + "grad_norm": 0.28186658024787903, + "learning_rate": 5.153566146183586e-05, + "loss": 1.8548, + "step": 16471 + }, + { + "epoch": 5.055862492326581, + "grad_norm": 0.3511136472225189, + "learning_rate": 5.153069323932017e-05, + "loss": 1.8029, + "step": 16472 + }, + { + "epoch": 5.056169429097606, + "grad_norm": 0.32083824276924133, + "learning_rate": 5.152572500167728e-05, + "loss": 1.7321, + "step": 16473 + }, + { + "epoch": 5.056476365868631, + "grad_norm": 0.22571051120758057, + "learning_rate": 5.1520756748956265e-05, + "loss": 1.7218, + "step": 16474 + }, + { + "epoch": 5.056783302639656, + "grad_norm": 0.2902646064758301, + "learning_rate": 5.151578848120626e-05, + "loss": 1.7231, + "step": 16475 + }, + { + "epoch": 5.057090239410681, + "grad_norm": 0.20447610318660736, + "learning_rate": 5.1510820198476336e-05, + "loss": 1.6998, + "step": 16476 + }, + { + "epoch": 5.0573971761817065, + "grad_norm": 0.29436638951301575, + "learning_rate": 5.1505851900815606e-05, + "loss": 1.6793, + "step": 16477 + }, + { + "epoch": 5.057704112952732, + "grad_norm": 0.29718565940856934, + "learning_rate": 5.1500883588273164e-05, + "loss": 1.8322, + "step": 16478 + }, + { + "epoch": 5.058011049723757, + "grad_norm": 0.23530519008636475, + "learning_rate": 5.149591526089811e-05, + "loss": 1.7408, + "step": 16479 + }, + { + "epoch": 5.0583179864947825, + "grad_norm": 0.30735042691230774, + 
"learning_rate": 5.1490946918739536e-05, + "loss": 1.7454, + "step": 16480 + }, + { + "epoch": 5.058624923265807, + "grad_norm": 0.26151445508003235, + "learning_rate": 5.148597856184656e-05, + "loss": 1.7728, + "step": 16481 + }, + { + "epoch": 5.058931860036832, + "grad_norm": 0.2657756209373474, + "learning_rate": 5.1481010190268263e-05, + "loss": 1.7905, + "step": 16482 + }, + { + "epoch": 5.059238796807858, + "grad_norm": 0.25418251752853394, + "learning_rate": 5.147604180405376e-05, + "loss": 1.7676, + "step": 16483 + }, + { + "epoch": 5.059545733578883, + "grad_norm": 0.25486254692077637, + "learning_rate": 5.1471073403252154e-05, + "loss": 1.8347, + "step": 16484 + }, + { + "epoch": 5.059852670349908, + "grad_norm": 0.22693100571632385, + "learning_rate": 5.146610498791255e-05, + "loss": 1.7308, + "step": 16485 + }, + { + "epoch": 5.060159607120933, + "grad_norm": 0.22056837379932404, + "learning_rate": 5.146113655808401e-05, + "loss": 1.7158, + "step": 16486 + }, + { + "epoch": 5.060466543891958, + "grad_norm": 0.221246138215065, + "learning_rate": 5.1456168113815685e-05, + "loss": 1.6985, + "step": 16487 + }, + { + "epoch": 5.060773480662983, + "grad_norm": 0.2149408906698227, + "learning_rate": 5.145119965515664e-05, + "loss": 1.716, + "step": 16488 + }, + { + "epoch": 5.061080417434009, + "grad_norm": 0.23958513140678406, + "learning_rate": 5.144623118215599e-05, + "loss": 1.8092, + "step": 16489 + }, + { + "epoch": 5.061387354205034, + "grad_norm": 0.2870621085166931, + "learning_rate": 5.1441262694862836e-05, + "loss": 1.75, + "step": 16490 + }, + { + "epoch": 5.0616942909760585, + "grad_norm": 0.26755061745643616, + "learning_rate": 5.1436294193326276e-05, + "loss": 1.7848, + "step": 16491 + }, + { + "epoch": 5.062001227747084, + "grad_norm": 0.2434249073266983, + "learning_rate": 5.143132567759542e-05, + "loss": 1.7487, + "step": 16492 + }, + { + "epoch": 5.062308164518109, + "grad_norm": 0.3044668138027191, + "learning_rate": 5.142635714771936e-05, 
+ "loss": 1.741, + "step": 16493 + }, + { + "epoch": 5.0626151012891345, + "grad_norm": 0.2166958749294281, + "learning_rate": 5.142138860374721e-05, + "loss": 1.7232, + "step": 16494 + }, + { + "epoch": 5.06292203806016, + "grad_norm": 0.34558552503585815, + "learning_rate": 5.141642004572806e-05, + "loss": 1.7663, + "step": 16495 + }, + { + "epoch": 5.063228974831185, + "grad_norm": 0.330751895904541, + "learning_rate": 5.141145147371102e-05, + "loss": 1.6818, + "step": 16496 + }, + { + "epoch": 5.06353591160221, + "grad_norm": 0.21613973379135132, + "learning_rate": 5.140648288774518e-05, + "loss": 1.7914, + "step": 16497 + }, + { + "epoch": 5.063842848373235, + "grad_norm": 0.32759732007980347, + "learning_rate": 5.140151428787966e-05, + "loss": 1.7543, + "step": 16498 + }, + { + "epoch": 5.06414978514426, + "grad_norm": 0.3180293142795563, + "learning_rate": 5.1396545674163556e-05, + "loss": 1.8163, + "step": 16499 + }, + { + "epoch": 5.064456721915286, + "grad_norm": 0.19757944345474243, + "learning_rate": 5.1391577046645964e-05, + "loss": 1.71, + "step": 16500 + }, + { + "epoch": 5.064763658686311, + "grad_norm": 0.253366619348526, + "learning_rate": 5.1386608405376005e-05, + "loss": 1.7266, + "step": 16501 + }, + { + "epoch": 5.065070595457335, + "grad_norm": 0.24577608704566956, + "learning_rate": 5.1381639750402754e-05, + "loss": 1.7218, + "step": 16502 + }, + { + "epoch": 5.065377532228361, + "grad_norm": 0.22847014665603638, + "learning_rate": 5.137667108177533e-05, + "loss": 1.8025, + "step": 16503 + }, + { + "epoch": 5.065684468999386, + "grad_norm": 0.2089833766222, + "learning_rate": 5.137170239954284e-05, + "loss": 1.8032, + "step": 16504 + }, + { + "epoch": 5.065991405770411, + "grad_norm": 0.21528512239456177, + "learning_rate": 5.136673370375439e-05, + "loss": 1.7227, + "step": 16505 + }, + { + "epoch": 5.066298342541437, + "grad_norm": 0.2099117785692215, + "learning_rate": 5.1361764994459074e-05, + "loss": 1.7176, + "step": 16506 + }, + { + 
"epoch": 5.066605279312462, + "grad_norm": 0.2140430212020874, + "learning_rate": 5.135679627170599e-05, + "loss": 1.8195, + "step": 16507 + }, + { + "epoch": 5.0669122160834865, + "grad_norm": 0.20253533124923706, + "learning_rate": 5.135182753554424e-05, + "loss": 1.7284, + "step": 16508 + }, + { + "epoch": 5.067219152854512, + "grad_norm": 0.19945639371871948, + "learning_rate": 5.134685878602295e-05, + "loss": 1.6915, + "step": 16509 + }, + { + "epoch": 5.067526089625537, + "grad_norm": 0.20138494670391083, + "learning_rate": 5.1341890023191216e-05, + "loss": 1.7856, + "step": 16510 + }, + { + "epoch": 5.0678330263965625, + "grad_norm": 0.22124232351779938, + "learning_rate": 5.1336921247098136e-05, + "loss": 1.7674, + "step": 16511 + }, + { + "epoch": 5.068139963167588, + "grad_norm": 0.21564216911792755, + "learning_rate": 5.133195245779282e-05, + "loss": 1.6998, + "step": 16512 + }, + { + "epoch": 5.068446899938612, + "grad_norm": 0.21836799383163452, + "learning_rate": 5.1326983655324365e-05, + "loss": 1.7468, + "step": 16513 + }, + { + "epoch": 5.068753836709638, + "grad_norm": 0.2412201464176178, + "learning_rate": 5.132201483974187e-05, + "loss": 1.7433, + "step": 16514 + }, + { + "epoch": 5.069060773480663, + "grad_norm": 0.262054979801178, + "learning_rate": 5.131704601109446e-05, + "loss": 1.8315, + "step": 16515 + }, + { + "epoch": 5.069367710251688, + "grad_norm": 0.21573080122470856, + "learning_rate": 5.1312077169431225e-05, + "loss": 1.7668, + "step": 16516 + }, + { + "epoch": 5.069674647022714, + "grad_norm": 0.21407057344913483, + "learning_rate": 5.130710831480129e-05, + "loss": 1.7486, + "step": 16517 + }, + { + "epoch": 5.069981583793738, + "grad_norm": 0.2128407508134842, + "learning_rate": 5.130213944725373e-05, + "loss": 1.7618, + "step": 16518 + }, + { + "epoch": 5.070288520564763, + "grad_norm": 0.2034141719341278, + "learning_rate": 5.129717056683767e-05, + "loss": 1.726, + "step": 16519 + }, + { + "epoch": 5.070595457335789, + 
"grad_norm": 0.21474458277225494, + "learning_rate": 5.1292201673602205e-05, + "loss": 1.7883, + "step": 16520 + }, + { + "epoch": 5.070902394106814, + "grad_norm": 0.2102673202753067, + "learning_rate": 5.128723276759645e-05, + "loss": 1.7826, + "step": 16521 + }, + { + "epoch": 5.071209330877839, + "grad_norm": 0.21342496573925018, + "learning_rate": 5.1282263848869505e-05, + "loss": 1.7561, + "step": 16522 + }, + { + "epoch": 5.071516267648865, + "grad_norm": 0.21749620139598846, + "learning_rate": 5.1277294917470474e-05, + "loss": 1.7814, + "step": 16523 + }, + { + "epoch": 5.071823204419889, + "grad_norm": 0.20006774365901947, + "learning_rate": 5.1272325973448476e-05, + "loss": 1.6965, + "step": 16524 + }, + { + "epoch": 5.0721301411909145, + "grad_norm": 0.20878590643405914, + "learning_rate": 5.1267357016852593e-05, + "loss": 1.7426, + "step": 16525 + }, + { + "epoch": 5.07243707796194, + "grad_norm": 0.21824820339679718, + "learning_rate": 5.1262388047731946e-05, + "loss": 1.7704, + "step": 16526 + }, + { + "epoch": 5.072744014732965, + "grad_norm": 0.1992526650428772, + "learning_rate": 5.125741906613565e-05, + "loss": 1.7874, + "step": 16527 + }, + { + "epoch": 5.0730509515039905, + "grad_norm": 0.21028028428554535, + "learning_rate": 5.12524500721128e-05, + "loss": 1.7483, + "step": 16528 + }, + { + "epoch": 5.073357888275015, + "grad_norm": 0.21840833127498627, + "learning_rate": 5.12474810657125e-05, + "loss": 1.7763, + "step": 16529 + }, + { + "epoch": 5.07366482504604, + "grad_norm": 0.249269038438797, + "learning_rate": 5.124251204698387e-05, + "loss": 1.7451, + "step": 16530 + }, + { + "epoch": 5.073971761817066, + "grad_norm": 0.2176963835954666, + "learning_rate": 5.1237543015975986e-05, + "loss": 1.7079, + "step": 16531 + }, + { + "epoch": 5.074278698588091, + "grad_norm": 0.20284616947174072, + "learning_rate": 5.1232573972738e-05, + "loss": 1.7235, + "step": 16532 + }, + { + "epoch": 5.074585635359116, + "grad_norm": 0.20140530169010162, + 
"learning_rate": 5.1227604917318984e-05, + "loss": 1.7014, + "step": 16533 + }, + { + "epoch": 5.074892572130141, + "grad_norm": 0.2407023161649704, + "learning_rate": 5.1222635849768066e-05, + "loss": 1.7493, + "step": 16534 + }, + { + "epoch": 5.075199508901166, + "grad_norm": 0.2013770490884781, + "learning_rate": 5.121766677013433e-05, + "loss": 1.7601, + "step": 16535 + }, + { + "epoch": 5.0755064456721914, + "grad_norm": 0.23889221251010895, + "learning_rate": 5.1212697678466916e-05, + "loss": 1.7282, + "step": 16536 + }, + { + "epoch": 5.075813382443217, + "grad_norm": 0.2411198765039444, + "learning_rate": 5.120772857481489e-05, + "loss": 1.8138, + "step": 16537 + }, + { + "epoch": 5.076120319214242, + "grad_norm": 0.24521365761756897, + "learning_rate": 5.12027594592274e-05, + "loss": 1.7659, + "step": 16538 + }, + { + "epoch": 5.0764272559852675, + "grad_norm": 0.2841372787952423, + "learning_rate": 5.119779033175354e-05, + "loss": 1.7973, + "step": 16539 + }, + { + "epoch": 5.076734192756292, + "grad_norm": 0.21796928346157074, + "learning_rate": 5.1192821192442395e-05, + "loss": 1.6985, + "step": 16540 + }, + { + "epoch": 5.077041129527317, + "grad_norm": 0.2244848757982254, + "learning_rate": 5.118785204134311e-05, + "loss": 1.7413, + "step": 16541 + }, + { + "epoch": 5.077348066298343, + "grad_norm": 0.22581063210964203, + "learning_rate": 5.1182882878504766e-05, + "loss": 1.7706, + "step": 16542 + }, + { + "epoch": 5.077655003069368, + "grad_norm": 0.24478016793727875, + "learning_rate": 5.117791370397647e-05, + "loss": 1.7628, + "step": 16543 + }, + { + "epoch": 5.077961939840393, + "grad_norm": 0.31270188093185425, + "learning_rate": 5.117294451780734e-05, + "loss": 1.8254, + "step": 16544 + }, + { + "epoch": 5.078268876611418, + "grad_norm": 0.3547368049621582, + "learning_rate": 5.11679753200465e-05, + "loss": 1.781, + "step": 16545 + }, + { + "epoch": 5.078575813382443, + "grad_norm": 0.24920180439949036, + "learning_rate": 
5.116300611074304e-05, + "loss": 1.7748, + "step": 16546 + }, + { + "epoch": 5.078882750153468, + "grad_norm": 0.2368776649236679, + "learning_rate": 5.115803688994607e-05, + "loss": 1.7459, + "step": 16547 + }, + { + "epoch": 5.079189686924494, + "grad_norm": 0.28341975808143616, + "learning_rate": 5.115306765770471e-05, + "loss": 1.6694, + "step": 16548 + }, + { + "epoch": 5.079496623695519, + "grad_norm": 0.2521432936191559, + "learning_rate": 5.114809841406804e-05, + "loss": 1.7544, + "step": 16549 + }, + { + "epoch": 5.0798035604665435, + "grad_norm": 0.21199844777584076, + "learning_rate": 5.11431291590852e-05, + "loss": 1.7215, + "step": 16550 + }, + { + "epoch": 5.080110497237569, + "grad_norm": 0.25157347321510315, + "learning_rate": 5.113815989280528e-05, + "loss": 1.8021, + "step": 16551 + }, + { + "epoch": 5.080417434008594, + "grad_norm": 0.2284129559993744, + "learning_rate": 5.1133190615277414e-05, + "loss": 1.7125, + "step": 16552 + }, + { + "epoch": 5.0807243707796195, + "grad_norm": 0.2297726720571518, + "learning_rate": 5.11282213265507e-05, + "loss": 1.7602, + "step": 16553 + }, + { + "epoch": 5.081031307550645, + "grad_norm": 0.22392617166042328, + "learning_rate": 5.112325202667421e-05, + "loss": 1.7251, + "step": 16554 + }, + { + "epoch": 5.08133824432167, + "grad_norm": 0.22406147420406342, + "learning_rate": 5.11182827156971e-05, + "loss": 1.7232, + "step": 16555 + }, + { + "epoch": 5.081645181092695, + "grad_norm": 0.2547284960746765, + "learning_rate": 5.111331339366846e-05, + "loss": 1.7335, + "step": 16556 + }, + { + "epoch": 5.08195211786372, + "grad_norm": 0.216146782040596, + "learning_rate": 5.1108344060637415e-05, + "loss": 1.7469, + "step": 16557 + }, + { + "epoch": 5.082259054634745, + "grad_norm": 0.1926967352628708, + "learning_rate": 5.110337471665306e-05, + "loss": 1.7492, + "step": 16558 + }, + { + "epoch": 5.082565991405771, + "grad_norm": 0.30311331152915955, + "learning_rate": 5.109840536176451e-05, + "loss": 1.8129, + 
"step": 16559 + }, + { + "epoch": 5.082872928176796, + "grad_norm": 0.24273787438869476, + "learning_rate": 5.109343599602087e-05, + "loss": 1.7206, + "step": 16560 + }, + { + "epoch": 5.08317986494782, + "grad_norm": 0.22736592590808868, + "learning_rate": 5.1088466619471255e-05, + "loss": 1.732, + "step": 16561 + }, + { + "epoch": 5.083486801718846, + "grad_norm": 0.21457640826702118, + "learning_rate": 5.1083497232164777e-05, + "loss": 1.726, + "step": 16562 + }, + { + "epoch": 5.083793738489871, + "grad_norm": 0.20968590676784515, + "learning_rate": 5.107852783415055e-05, + "loss": 1.8095, + "step": 16563 + }, + { + "epoch": 5.084100675260896, + "grad_norm": 0.2846728265285492, + "learning_rate": 5.107355842547768e-05, + "loss": 1.7524, + "step": 16564 + }, + { + "epoch": 5.084407612031922, + "grad_norm": 0.21162885427474976, + "learning_rate": 5.106858900619526e-05, + "loss": 1.753, + "step": 16565 + }, + { + "epoch": 5.084714548802946, + "grad_norm": 0.24349012970924377, + "learning_rate": 5.106361957635242e-05, + "loss": 1.7003, + "step": 16566 + }, + { + "epoch": 5.0850214855739715, + "grad_norm": 0.24532537162303925, + "learning_rate": 5.105865013599828e-05, + "loss": 1.7818, + "step": 16567 + }, + { + "epoch": 5.085328422344997, + "grad_norm": 0.22788558900356293, + "learning_rate": 5.1053680685181926e-05, + "loss": 1.7291, + "step": 16568 + }, + { + "epoch": 5.085635359116022, + "grad_norm": 0.22402508556842804, + "learning_rate": 5.10487112239525e-05, + "loss": 1.8292, + "step": 16569 + }, + { + "epoch": 5.0859422958870475, + "grad_norm": 0.2396162748336792, + "learning_rate": 5.1043741752359085e-05, + "loss": 1.7441, + "step": 16570 + }, + { + "epoch": 5.086249232658073, + "grad_norm": 0.22364887595176697, + "learning_rate": 5.1038772270450796e-05, + "loss": 1.7356, + "step": 16571 + }, + { + "epoch": 5.086556169429097, + "grad_norm": 0.20385414361953735, + "learning_rate": 5.103380277827676e-05, + "loss": 1.774, + "step": 16572 + }, + { + "epoch": 
5.086863106200123, + "grad_norm": 0.2050715535879135, + "learning_rate": 5.102883327588608e-05, + "loss": 1.7217, + "step": 16573 + }, + { + "epoch": 5.087170042971148, + "grad_norm": 0.23750410974025726, + "learning_rate": 5.102386376332786e-05, + "loss": 1.7605, + "step": 16574 + }, + { + "epoch": 5.087476979742173, + "grad_norm": 0.24313338100910187, + "learning_rate": 5.101889424065122e-05, + "loss": 1.7498, + "step": 16575 + }, + { + "epoch": 5.087783916513199, + "grad_norm": 0.22145850956439972, + "learning_rate": 5.101392470790527e-05, + "loss": 1.7827, + "step": 16576 + }, + { + "epoch": 5.088090853284223, + "grad_norm": 0.23073779046535492, + "learning_rate": 5.100895516513912e-05, + "loss": 1.7722, + "step": 16577 + }, + { + "epoch": 5.088397790055248, + "grad_norm": 0.2112295925617218, + "learning_rate": 5.100398561240188e-05, + "loss": 1.7755, + "step": 16578 + }, + { + "epoch": 5.088704726826274, + "grad_norm": 0.23263800144195557, + "learning_rate": 5.0999016049742675e-05, + "loss": 1.7593, + "step": 16579 + }, + { + "epoch": 5.089011663597299, + "grad_norm": 0.23011381924152374, + "learning_rate": 5.09940464772106e-05, + "loss": 1.704, + "step": 16580 + }, + { + "epoch": 5.089318600368324, + "grad_norm": 0.1930779367685318, + "learning_rate": 5.0989076894854785e-05, + "loss": 1.7038, + "step": 16581 + }, + { + "epoch": 5.08962553713935, + "grad_norm": 0.2100505381822586, + "learning_rate": 5.098410730272433e-05, + "loss": 1.7671, + "step": 16582 + }, + { + "epoch": 5.089932473910374, + "grad_norm": 0.1919277459383011, + "learning_rate": 5.097913770086833e-05, + "loss": 1.651, + "step": 16583 + }, + { + "epoch": 5.0902394106813995, + "grad_norm": 0.23310615122318268, + "learning_rate": 5.097416808933594e-05, + "loss": 1.8294, + "step": 16584 + }, + { + "epoch": 5.090546347452425, + "grad_norm": 0.26191771030426025, + "learning_rate": 5.096919846817624e-05, + "loss": 1.7522, + "step": 16585 + }, + { + "epoch": 5.09085328422345, + "grad_norm": 
0.2508419156074524, + "learning_rate": 5.096422883743835e-05, + "loss": 1.8025, + "step": 16586 + }, + { + "epoch": 5.0911602209944755, + "grad_norm": 0.23192499577999115, + "learning_rate": 5.0959259197171414e-05, + "loss": 1.7885, + "step": 16587 + }, + { + "epoch": 5.0914671577655, + "grad_norm": 0.2164602279663086, + "learning_rate": 5.095428954742448e-05, + "loss": 1.7299, + "step": 16588 + }, + { + "epoch": 5.091774094536525, + "grad_norm": 0.21431668102741241, + "learning_rate": 5.094931988824671e-05, + "loss": 1.7122, + "step": 16589 + }, + { + "epoch": 5.092081031307551, + "grad_norm": 0.20563583076000214, + "learning_rate": 5.094435021968722e-05, + "loss": 1.7118, + "step": 16590 + }, + { + "epoch": 5.092387968078576, + "grad_norm": 0.20916326344013214, + "learning_rate": 5.093938054179509e-05, + "loss": 1.7639, + "step": 16591 + }, + { + "epoch": 5.092694904849601, + "grad_norm": 0.21197481453418732, + "learning_rate": 5.0934410854619454e-05, + "loss": 1.7357, + "step": 16592 + }, + { + "epoch": 5.093001841620626, + "grad_norm": 0.21085995435714722, + "learning_rate": 5.092944115820942e-05, + "loss": 1.6921, + "step": 16593 + }, + { + "epoch": 5.093308778391651, + "grad_norm": 0.2608145773410797, + "learning_rate": 5.09244714526141e-05, + "loss": 1.7541, + "step": 16594 + }, + { + "epoch": 5.093615715162676, + "grad_norm": 0.2138587087392807, + "learning_rate": 5.0919501737882624e-05, + "loss": 1.727, + "step": 16595 + }, + { + "epoch": 5.093922651933702, + "grad_norm": 0.230251282453537, + "learning_rate": 5.0914532014064084e-05, + "loss": 1.7828, + "step": 16596 + }, + { + "epoch": 5.094229588704727, + "grad_norm": 0.2162851244211197, + "learning_rate": 5.0909562281207614e-05, + "loss": 1.6905, + "step": 16597 + }, + { + "epoch": 5.094536525475752, + "grad_norm": 0.20637664198875427, + "learning_rate": 5.090459253936231e-05, + "loss": 1.7484, + "step": 16598 + }, + { + "epoch": 5.094843462246777, + "grad_norm": 0.19427815079689026, + "learning_rate": 
5.089962278857728e-05, + "loss": 1.7379, + "step": 16599 + }, + { + "epoch": 5.095150399017802, + "grad_norm": 0.1877593845129013, + "learning_rate": 5.089465302890165e-05, + "loss": 1.7017, + "step": 16600 + }, + { + "epoch": 5.0954573357888275, + "grad_norm": 0.19219037890434265, + "learning_rate": 5.0889683260384543e-05, + "loss": 1.7379, + "step": 16601 + }, + { + "epoch": 5.095764272559853, + "grad_norm": 0.19855685532093048, + "learning_rate": 5.088471348307507e-05, + "loss": 1.7171, + "step": 16602 + }, + { + "epoch": 5.096071209330878, + "grad_norm": 0.19119660556316376, + "learning_rate": 5.087974369702235e-05, + "loss": 1.6912, + "step": 16603 + }, + { + "epoch": 5.096378146101903, + "grad_norm": 0.2102670818567276, + "learning_rate": 5.0874773902275476e-05, + "loss": 1.6825, + "step": 16604 + }, + { + "epoch": 5.096685082872928, + "grad_norm": 0.2120765596628189, + "learning_rate": 5.0869804098883564e-05, + "loss": 1.7055, + "step": 16605 + }, + { + "epoch": 5.096992019643953, + "grad_norm": 0.25874772667884827, + "learning_rate": 5.0864834286895745e-05, + "loss": 1.7193, + "step": 16606 + }, + { + "epoch": 5.097298956414979, + "grad_norm": 0.20822012424468994, + "learning_rate": 5.085986446636113e-05, + "loss": 1.6748, + "step": 16607 + }, + { + "epoch": 5.097605893186004, + "grad_norm": 0.21364718675613403, + "learning_rate": 5.085489463732883e-05, + "loss": 1.7762, + "step": 16608 + }, + { + "epoch": 5.097912829957028, + "grad_norm": 0.21961788833141327, + "learning_rate": 5.084992479984796e-05, + "loss": 1.7243, + "step": 16609 + }, + { + "epoch": 5.098219766728054, + "grad_norm": 0.22056026756763458, + "learning_rate": 5.0844954953967624e-05, + "loss": 1.6983, + "step": 16610 + }, + { + "epoch": 5.098526703499079, + "grad_norm": 0.21347738802433014, + "learning_rate": 5.083998509973695e-05, + "loss": 1.7319, + "step": 16611 + }, + { + "epoch": 5.098833640270104, + "grad_norm": 0.23593664169311523, + "learning_rate": 5.083501523720506e-05, + "loss": 
1.7121, + "step": 16612 + }, + { + "epoch": 5.09914057704113, + "grad_norm": 0.2088623344898224, + "learning_rate": 5.0830045366421055e-05, + "loss": 1.72, + "step": 16613 + }, + { + "epoch": 5.099447513812155, + "grad_norm": 0.2293832004070282, + "learning_rate": 5.082507548743406e-05, + "loss": 1.7548, + "step": 16614 + }, + { + "epoch": 5.0997544505831796, + "grad_norm": 0.2509057819843292, + "learning_rate": 5.082010560029319e-05, + "loss": 1.7729, + "step": 16615 + }, + { + "epoch": 5.100061387354205, + "grad_norm": 0.1925390362739563, + "learning_rate": 5.081513570504755e-05, + "loss": 1.7109, + "step": 16616 + }, + { + "epoch": 5.10036832412523, + "grad_norm": 0.20876559615135193, + "learning_rate": 5.081016580174626e-05, + "loss": 1.7031, + "step": 16617 + }, + { + "epoch": 5.100675260896256, + "grad_norm": 0.2038683146238327, + "learning_rate": 5.080519589043842e-05, + "loss": 1.7489, + "step": 16618 + }, + { + "epoch": 5.100982197667281, + "grad_norm": 0.25018224120140076, + "learning_rate": 5.080022597117318e-05, + "loss": 1.7884, + "step": 16619 + }, + { + "epoch": 5.101289134438305, + "grad_norm": 0.24430342018604279, + "learning_rate": 5.079525604399965e-05, + "loss": 1.7558, + "step": 16620 + }, + { + "epoch": 5.101596071209331, + "grad_norm": 0.22151432931423187, + "learning_rate": 5.079028610896692e-05, + "loss": 1.7543, + "step": 16621 + }, + { + "epoch": 5.101903007980356, + "grad_norm": 0.2313055694103241, + "learning_rate": 5.0785316166124107e-05, + "loss": 1.7755, + "step": 16622 + }, + { + "epoch": 5.102209944751381, + "grad_norm": 0.27405816316604614, + "learning_rate": 5.0780346215520355e-05, + "loss": 1.7006, + "step": 16623 + }, + { + "epoch": 5.102516881522407, + "grad_norm": 0.2209920734167099, + "learning_rate": 5.077537625720476e-05, + "loss": 1.6877, + "step": 16624 + }, + { + "epoch": 5.102823818293431, + "grad_norm": 0.20993784070014954, + "learning_rate": 5.077040629122645e-05, + "loss": 1.7558, + "step": 16625 + }, + { + "epoch": 
5.1031307550644565, + "grad_norm": 0.25554344058036804, + "learning_rate": 5.076543631763453e-05, + "loss": 1.7142, + "step": 16626 + }, + { + "epoch": 5.103437691835482, + "grad_norm": 0.28980588912963867, + "learning_rate": 5.0760466336478116e-05, + "loss": 1.7632, + "step": 16627 + }, + { + "epoch": 5.103744628606507, + "grad_norm": 0.20144744217395782, + "learning_rate": 5.075549634780633e-05, + "loss": 1.7472, + "step": 16628 + }, + { + "epoch": 5.1040515653775325, + "grad_norm": 0.30335596203804016, + "learning_rate": 5.075052635166827e-05, + "loss": 1.7283, + "step": 16629 + }, + { + "epoch": 5.104358502148558, + "grad_norm": 0.3014097213745117, + "learning_rate": 5.074555634811309e-05, + "loss": 1.7273, + "step": 16630 + }, + { + "epoch": 5.104665438919582, + "grad_norm": 0.20123563706874847, + "learning_rate": 5.074058633718988e-05, + "loss": 1.7119, + "step": 16631 + }, + { + "epoch": 5.104972375690608, + "grad_norm": 0.3375137746334076, + "learning_rate": 5.073561631894776e-05, + "loss": 1.7594, + "step": 16632 + }, + { + "epoch": 5.105279312461633, + "grad_norm": 0.3471776247024536, + "learning_rate": 5.0730646293435846e-05, + "loss": 1.729, + "step": 16633 + }, + { + "epoch": 5.105586249232658, + "grad_norm": 0.26405471563339233, + "learning_rate": 5.072567626070327e-05, + "loss": 1.7472, + "step": 16634 + }, + { + "epoch": 5.105893186003684, + "grad_norm": 0.2339334636926651, + "learning_rate": 5.072070622079911e-05, + "loss": 1.7285, + "step": 16635 + }, + { + "epoch": 5.106200122774708, + "grad_norm": 0.26267752051353455, + "learning_rate": 5.0715736173772534e-05, + "loss": 1.7171, + "step": 16636 + }, + { + "epoch": 5.106507059545733, + "grad_norm": 0.22254765033721924, + "learning_rate": 5.0710766119672626e-05, + "loss": 1.7702, + "step": 16637 + }, + { + "epoch": 5.106813996316759, + "grad_norm": 0.2457888424396515, + "learning_rate": 5.070579605854852e-05, + "loss": 1.7987, + "step": 16638 + }, + { + "epoch": 5.107120933087784, + "grad_norm": 
0.24500930309295654, + "learning_rate": 5.070082599044931e-05, + "loss": 1.8103, + "step": 16639 + }, + { + "epoch": 5.107427869858809, + "grad_norm": 0.24446405470371246, + "learning_rate": 5.0695855915424116e-05, + "loss": 1.7058, + "step": 16640 + }, + { + "epoch": 5.107734806629834, + "grad_norm": 0.22352534532546997, + "learning_rate": 5.0690885833522086e-05, + "loss": 1.7503, + "step": 16641 + }, + { + "epoch": 5.108041743400859, + "grad_norm": 0.2308795005083084, + "learning_rate": 5.068591574479231e-05, + "loss": 1.8064, + "step": 16642 + }, + { + "epoch": 5.1083486801718845, + "grad_norm": 0.23804180324077606, + "learning_rate": 5.068094564928392e-05, + "loss": 1.7603, + "step": 16643 + }, + { + "epoch": 5.10865561694291, + "grad_norm": 0.1956508308649063, + "learning_rate": 5.0675975547046016e-05, + "loss": 1.7448, + "step": 16644 + }, + { + "epoch": 5.108962553713935, + "grad_norm": 0.24438725411891937, + "learning_rate": 5.067100543812773e-05, + "loss": 1.7706, + "step": 16645 + }, + { + "epoch": 5.1092694904849605, + "grad_norm": 0.26129621267318726, + "learning_rate": 5.066603532257817e-05, + "loss": 1.7321, + "step": 16646 + }, + { + "epoch": 5.109576427255985, + "grad_norm": 0.2024240493774414, + "learning_rate": 5.066106520044646e-05, + "loss": 1.7033, + "step": 16647 + }, + { + "epoch": 5.10988336402701, + "grad_norm": 0.2096802294254303, + "learning_rate": 5.0656095071781716e-05, + "loss": 1.716, + "step": 16648 + }, + { + "epoch": 5.110190300798036, + "grad_norm": 0.20643317699432373, + "learning_rate": 5.0651124936633054e-05, + "loss": 1.7473, + "step": 16649 + }, + { + "epoch": 5.110497237569061, + "grad_norm": 0.2268853783607483, + "learning_rate": 5.0646154795049604e-05, + "loss": 1.7844, + "step": 16650 + }, + { + "epoch": 5.110804174340086, + "grad_norm": 0.20215095579624176, + "learning_rate": 5.064118464708046e-05, + "loss": 1.7138, + "step": 16651 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.19411569833755493, + 
"learning_rate": 5.063621449277476e-05, + "loss": 1.7526, + "step": 16652 + }, + { + "epoch": 5.111418047882136, + "grad_norm": 0.20199783146381378, + "learning_rate": 5.063124433218161e-05, + "loss": 1.806, + "step": 16653 + }, + { + "epoch": 5.111724984653161, + "grad_norm": 0.23351836204528809, + "learning_rate": 5.0626274165350165e-05, + "loss": 1.7529, + "step": 16654 + }, + { + "epoch": 5.112031921424187, + "grad_norm": 0.21098989248275757, + "learning_rate": 5.062130399232948e-05, + "loss": 1.7647, + "step": 16655 + }, + { + "epoch": 5.112338858195212, + "grad_norm": 0.21959169209003448, + "learning_rate": 5.0616333813168714e-05, + "loss": 1.7462, + "step": 16656 + }, + { + "epoch": 5.112645794966237, + "grad_norm": 0.21173696219921112, + "learning_rate": 5.061136362791696e-05, + "loss": 1.7413, + "step": 16657 + }, + { + "epoch": 5.112952731737262, + "grad_norm": 0.22357577085494995, + "learning_rate": 5.0606393436623365e-05, + "loss": 1.7163, + "step": 16658 + }, + { + "epoch": 5.113259668508287, + "grad_norm": 0.24364936351776123, + "learning_rate": 5.060142323933704e-05, + "loss": 1.8139, + "step": 16659 + }, + { + "epoch": 5.1135666052793125, + "grad_norm": 0.21646073460578918, + "learning_rate": 5.05964530361071e-05, + "loss": 1.741, + "step": 16660 + }, + { + "epoch": 5.113873542050338, + "grad_norm": 0.24261775612831116, + "learning_rate": 5.059148282698265e-05, + "loss": 1.7162, + "step": 16661 + }, + { + "epoch": 5.114180478821363, + "grad_norm": 0.22883281111717224, + "learning_rate": 5.058651261201283e-05, + "loss": 1.7342, + "step": 16662 + }, + { + "epoch": 5.114487415592388, + "grad_norm": 0.2616727352142334, + "learning_rate": 5.058154239124674e-05, + "loss": 1.8054, + "step": 16663 + }, + { + "epoch": 5.114794352363413, + "grad_norm": 0.21293358504772186, + "learning_rate": 5.0576572164733505e-05, + "loss": 1.742, + "step": 16664 + }, + { + "epoch": 5.115101289134438, + "grad_norm": 0.20037685334682465, + "learning_rate": 
5.057160193252225e-05, + "loss": 1.7518, + "step": 16665 + }, + { + "epoch": 5.115408225905464, + "grad_norm": 0.19102689623832703, + "learning_rate": 5.056663169466209e-05, + "loss": 1.6892, + "step": 16666 + }, + { + "epoch": 5.115715162676489, + "grad_norm": 0.22261591255664825, + "learning_rate": 5.056166145120216e-05, + "loss": 1.7744, + "step": 16667 + }, + { + "epoch": 5.116022099447513, + "grad_norm": 0.23966702818870544, + "learning_rate": 5.055669120219154e-05, + "loss": 1.7786, + "step": 16668 + }, + { + "epoch": 5.116329036218539, + "grad_norm": 0.22008271515369415, + "learning_rate": 5.055172094767937e-05, + "loss": 1.7501, + "step": 16669 + }, + { + "epoch": 5.116635972989564, + "grad_norm": 0.21643415093421936, + "learning_rate": 5.054675068771478e-05, + "loss": 1.7548, + "step": 16670 + }, + { + "epoch": 5.116942909760589, + "grad_norm": 0.24661116302013397, + "learning_rate": 5.0541780422346894e-05, + "loss": 1.8117, + "step": 16671 + }, + { + "epoch": 5.117249846531615, + "grad_norm": 0.21393093466758728, + "learning_rate": 5.05368101516248e-05, + "loss": 1.7341, + "step": 16672 + }, + { + "epoch": 5.11755678330264, + "grad_norm": 0.30949896574020386, + "learning_rate": 5.053183987559763e-05, + "loss": 1.7703, + "step": 16673 + }, + { + "epoch": 5.1178637200736645, + "grad_norm": 0.22236786782741547, + "learning_rate": 5.052686959431451e-05, + "loss": 1.719, + "step": 16674 + }, + { + "epoch": 5.11817065684469, + "grad_norm": 0.26826921105384827, + "learning_rate": 5.052189930782455e-05, + "loss": 1.741, + "step": 16675 + }, + { + "epoch": 5.118477593615715, + "grad_norm": 0.2608947455883026, + "learning_rate": 5.051692901617688e-05, + "loss": 1.7062, + "step": 16676 + }, + { + "epoch": 5.1187845303867405, + "grad_norm": 0.20709002017974854, + "learning_rate": 5.051195871942063e-05, + "loss": 1.703, + "step": 16677 + }, + { + "epoch": 5.119091467157766, + "grad_norm": 0.18957734107971191, + "learning_rate": 5.0506988417604885e-05, + "loss": 1.762, 
+ "step": 16678 + }, + { + "epoch": 5.11939840392879, + "grad_norm": 0.21578781306743622, + "learning_rate": 5.050201811077879e-05, + "loss": 1.7167, + "step": 16679 + }, + { + "epoch": 5.119705340699816, + "grad_norm": 0.2253631353378296, + "learning_rate": 5.049704779899145e-05, + "loss": 1.7374, + "step": 16680 + }, + { + "epoch": 5.120012277470841, + "grad_norm": 0.1977664828300476, + "learning_rate": 5.049207748229199e-05, + "loss": 1.7399, + "step": 16681 + }, + { + "epoch": 5.120319214241866, + "grad_norm": 0.2964428663253784, + "learning_rate": 5.048710716072954e-05, + "loss": 1.8359, + "step": 16682 + }, + { + "epoch": 5.120626151012892, + "grad_norm": 0.24788637459278107, + "learning_rate": 5.0482136834353224e-05, + "loss": 1.7593, + "step": 16683 + }, + { + "epoch": 5.120933087783916, + "grad_norm": 0.21537743508815765, + "learning_rate": 5.0477166503212135e-05, + "loss": 1.7472, + "step": 16684 + }, + { + "epoch": 5.121240024554941, + "grad_norm": 0.2055196613073349, + "learning_rate": 5.047219616735541e-05, + "loss": 1.7106, + "step": 16685 + }, + { + "epoch": 5.121546961325967, + "grad_norm": 0.19770687818527222, + "learning_rate": 5.046722582683215e-05, + "loss": 1.6887, + "step": 16686 + }, + { + "epoch": 5.121853898096992, + "grad_norm": 0.20407389104366302, + "learning_rate": 5.046225548169151e-05, + "loss": 1.7412, + "step": 16687 + }, + { + "epoch": 5.122160834868017, + "grad_norm": 0.20153474807739258, + "learning_rate": 5.045728513198259e-05, + "loss": 1.7643, + "step": 16688 + }, + { + "epoch": 5.122467771639043, + "grad_norm": 0.18737752735614777, + "learning_rate": 5.045231477775452e-05, + "loss": 1.763, + "step": 16689 + }, + { + "epoch": 5.122774708410067, + "grad_norm": 0.19790658354759216, + "learning_rate": 5.0447344419056385e-05, + "loss": 1.7446, + "step": 16690 + }, + { + "epoch": 5.1230816451810925, + "grad_norm": 0.21496973931789398, + "learning_rate": 5.0442374055937336e-05, + "loss": 1.7756, + "step": 16691 + }, + { + "epoch": 
5.123388581952118, + "grad_norm": 0.19318655133247375, + "learning_rate": 5.043740368844649e-05, + "loss": 1.7687, + "step": 16692 + }, + { + "epoch": 5.123695518723143, + "grad_norm": 0.2237338423728943, + "learning_rate": 5.0432433316632976e-05, + "loss": 1.7258, + "step": 16693 + }, + { + "epoch": 5.1240024554941686, + "grad_norm": 0.2257162630558014, + "learning_rate": 5.042746294054589e-05, + "loss": 1.7462, + "step": 16694 + }, + { + "epoch": 5.124309392265193, + "grad_norm": 0.25666359066963196, + "learning_rate": 5.0422492560234366e-05, + "loss": 1.7318, + "step": 16695 + }, + { + "epoch": 5.124616329036218, + "grad_norm": 0.2615324556827545, + "learning_rate": 5.0417522175747536e-05, + "loss": 1.7533, + "step": 16696 + }, + { + "epoch": 5.124923265807244, + "grad_norm": 0.2372874766588211, + "learning_rate": 5.0412551787134475e-05, + "loss": 1.7361, + "step": 16697 + }, + { + "epoch": 5.125230202578269, + "grad_norm": 0.25976815819740295, + "learning_rate": 5.040758139444436e-05, + "loss": 1.7542, + "step": 16698 + }, + { + "epoch": 5.125537139349294, + "grad_norm": 0.36173003911972046, + "learning_rate": 5.040261099772629e-05, + "loss": 1.7421, + "step": 16699 + }, + { + "epoch": 5.12584407612032, + "grad_norm": 0.2767728269100189, + "learning_rate": 5.039764059702937e-05, + "loss": 1.7341, + "step": 16700 + }, + { + "epoch": 5.126151012891344, + "grad_norm": 0.20185241103172302, + "learning_rate": 5.039267019240275e-05, + "loss": 1.7068, + "step": 16701 + }, + { + "epoch": 5.1264579496623695, + "grad_norm": 0.26872581243515015, + "learning_rate": 5.0387699783895514e-05, + "loss": 1.7404, + "step": 16702 + }, + { + "epoch": 5.126764886433395, + "grad_norm": 0.2867858111858368, + "learning_rate": 5.038272937155682e-05, + "loss": 1.7702, + "step": 16703 + }, + { + "epoch": 5.12707182320442, + "grad_norm": 0.20939521491527557, + "learning_rate": 5.037775895543574e-05, + "loss": 1.7653, + "step": 16704 + }, + { + "epoch": 5.1273787599754455, + "grad_norm": 
0.2674047648906708, + "learning_rate": 5.037278853558146e-05, + "loss": 1.701, + "step": 16705 + }, + { + "epoch": 5.12768569674647, + "grad_norm": 0.20776906609535217, + "learning_rate": 5.036781811204304e-05, + "loss": 1.7476, + "step": 16706 + }, + { + "epoch": 5.127992633517495, + "grad_norm": 0.2695952355861664, + "learning_rate": 5.036284768486964e-05, + "loss": 1.7206, + "step": 16707 + }, + { + "epoch": 5.128299570288521, + "grad_norm": 0.30661383271217346, + "learning_rate": 5.0357877254110363e-05, + "loss": 1.72, + "step": 16708 + }, + { + "epoch": 5.128606507059546, + "grad_norm": 0.2527785003185272, + "learning_rate": 5.0352906819814316e-05, + "loss": 1.6936, + "step": 16709 + }, + { + "epoch": 5.128913443830571, + "grad_norm": 0.23000696301460266, + "learning_rate": 5.034793638203066e-05, + "loss": 1.7634, + "step": 16710 + }, + { + "epoch": 5.129220380601596, + "grad_norm": 0.33594760298728943, + "learning_rate": 5.0342965940808486e-05, + "loss": 1.6952, + "step": 16711 + }, + { + "epoch": 5.129527317372621, + "grad_norm": 0.22834168374538422, + "learning_rate": 5.033799549619692e-05, + "loss": 1.7537, + "step": 16712 + }, + { + "epoch": 5.129834254143646, + "grad_norm": 0.26585114002227783, + "learning_rate": 5.033302504824509e-05, + "loss": 1.7554, + "step": 16713 + }, + { + "epoch": 5.130141190914672, + "grad_norm": 0.25632211565971375, + "learning_rate": 5.032805459700211e-05, + "loss": 1.8141, + "step": 16714 + }, + { + "epoch": 5.130448127685697, + "grad_norm": 0.256523996591568, + "learning_rate": 5.0323084142517084e-05, + "loss": 1.777, + "step": 16715 + }, + { + "epoch": 5.1307550644567215, + "grad_norm": 0.31409457325935364, + "learning_rate": 5.0318113684839166e-05, + "loss": 1.7414, + "step": 16716 + }, + { + "epoch": 5.131062001227747, + "grad_norm": 0.21156816184520721, + "learning_rate": 5.0313143224017455e-05, + "loss": 1.7397, + "step": 16717 + }, + { + "epoch": 5.131368937998772, + "grad_norm": 0.23596547544002533, + "learning_rate": 
5.030817276010109e-05, + "loss": 1.752, + "step": 16718 + }, + { + "epoch": 5.1316758747697975, + "grad_norm": 0.2587638199329376, + "learning_rate": 5.0303202293139186e-05, + "loss": 1.7645, + "step": 16719 + }, + { + "epoch": 5.131982811540823, + "grad_norm": 0.2006666213274002, + "learning_rate": 5.029823182318084e-05, + "loss": 1.7009, + "step": 16720 + }, + { + "epoch": 5.132289748311848, + "grad_norm": 0.3075694739818573, + "learning_rate": 5.029326135027521e-05, + "loss": 1.749, + "step": 16721 + }, + { + "epoch": 5.132596685082873, + "grad_norm": 0.3116205334663391, + "learning_rate": 5.028829087447139e-05, + "loss": 1.7458, + "step": 16722 + }, + { + "epoch": 5.132903621853898, + "grad_norm": 0.17925913631916046, + "learning_rate": 5.028332039581851e-05, + "loss": 1.6502, + "step": 16723 + }, + { + "epoch": 5.133210558624923, + "grad_norm": 0.21779952943325043, + "learning_rate": 5.0278349914365694e-05, + "loss": 1.7656, + "step": 16724 + }, + { + "epoch": 5.133517495395949, + "grad_norm": 0.20085318386554718, + "learning_rate": 5.027337943016207e-05, + "loss": 1.7662, + "step": 16725 + }, + { + "epoch": 5.133824432166974, + "grad_norm": 0.19975553452968597, + "learning_rate": 5.026840894325673e-05, + "loss": 1.7392, + "step": 16726 + }, + { + "epoch": 5.134131368937998, + "grad_norm": 0.20610745251178741, + "learning_rate": 5.026343845369883e-05, + "loss": 1.7221, + "step": 16727 + }, + { + "epoch": 5.134438305709024, + "grad_norm": 0.21451768279075623, + "learning_rate": 5.025846796153747e-05, + "loss": 1.8381, + "step": 16728 + }, + { + "epoch": 5.134745242480049, + "grad_norm": 0.19518613815307617, + "learning_rate": 5.0253497466821786e-05, + "loss": 1.7483, + "step": 16729 + }, + { + "epoch": 5.135052179251074, + "grad_norm": 0.24284996092319489, + "learning_rate": 5.024852696960088e-05, + "loss": 1.7895, + "step": 16730 + }, + { + "epoch": 5.1353591160221, + "grad_norm": 0.23962461948394775, + "learning_rate": 5.0243556469923905e-05, + "loss": 
1.8468, + "step": 16731 + }, + { + "epoch": 5.135666052793125, + "grad_norm": 0.20455054938793182, + "learning_rate": 5.023858596783993e-05, + "loss": 1.6973, + "step": 16732 + }, + { + "epoch": 5.1359729895641495, + "grad_norm": 0.20629842579364777, + "learning_rate": 5.023361546339813e-05, + "loss": 1.7608, + "step": 16733 + }, + { + "epoch": 5.136279926335175, + "grad_norm": 0.19375818967819214, + "learning_rate": 5.0228644956647606e-05, + "loss": 1.7327, + "step": 16734 + }, + { + "epoch": 5.1365868631062, + "grad_norm": 0.20960548520088196, + "learning_rate": 5.022367444763748e-05, + "loss": 1.7227, + "step": 16735 + }, + { + "epoch": 5.1368937998772255, + "grad_norm": 0.24732786417007446, + "learning_rate": 5.021870393641687e-05, + "loss": 1.8144, + "step": 16736 + }, + { + "epoch": 5.137200736648251, + "grad_norm": 0.22190099954605103, + "learning_rate": 5.021373342303489e-05, + "loss": 1.705, + "step": 16737 + }, + { + "epoch": 5.137507673419275, + "grad_norm": 0.2091664969921112, + "learning_rate": 5.020876290754069e-05, + "loss": 1.7926, + "step": 16738 + }, + { + "epoch": 5.137814610190301, + "grad_norm": 0.22298938035964966, + "learning_rate": 5.020379238998335e-05, + "loss": 1.7782, + "step": 16739 + }, + { + "epoch": 5.138121546961326, + "grad_norm": 0.20843006670475006, + "learning_rate": 5.019882187041203e-05, + "loss": 1.7245, + "step": 16740 + }, + { + "epoch": 5.138428483732351, + "grad_norm": 0.23383544385433197, + "learning_rate": 5.019385134887583e-05, + "loss": 1.6834, + "step": 16741 + }, + { + "epoch": 5.138735420503377, + "grad_norm": 0.3015683889389038, + "learning_rate": 5.018888082542388e-05, + "loss": 1.7636, + "step": 16742 + }, + { + "epoch": 5.139042357274401, + "grad_norm": 0.2253810614347458, + "learning_rate": 5.0183910300105284e-05, + "loss": 1.7375, + "step": 16743 + }, + { + "epoch": 5.139349294045426, + "grad_norm": 0.2064623087644577, + "learning_rate": 5.01789397729692e-05, + "loss": 1.7683, + "step": 16744 + }, + { + 
"epoch": 5.139656230816452, + "grad_norm": 0.2106693685054779, + "learning_rate": 5.0173969244064724e-05, + "loss": 1.7432, + "step": 16745 + }, + { + "epoch": 5.139963167587477, + "grad_norm": 0.19944638013839722, + "learning_rate": 5.016899871344097e-05, + "loss": 1.701, + "step": 16746 + }, + { + "epoch": 5.140270104358502, + "grad_norm": 0.23210744559764862, + "learning_rate": 5.016402818114708e-05, + "loss": 1.8008, + "step": 16747 + }, + { + "epoch": 5.140577041129528, + "grad_norm": 0.26014089584350586, + "learning_rate": 5.015905764723217e-05, + "loss": 1.7131, + "step": 16748 + }, + { + "epoch": 5.140883977900552, + "grad_norm": 0.25526607036590576, + "learning_rate": 5.015408711174535e-05, + "loss": 1.7525, + "step": 16749 + }, + { + "epoch": 5.1411909146715775, + "grad_norm": 0.2092386782169342, + "learning_rate": 5.0149116574735756e-05, + "loss": 1.7502, + "step": 16750 + }, + { + "epoch": 5.141497851442603, + "grad_norm": 0.21560105681419373, + "learning_rate": 5.01441460362525e-05, + "loss": 1.7903, + "step": 16751 + }, + { + "epoch": 5.141804788213628, + "grad_norm": 0.23538467288017273, + "learning_rate": 5.013917549634471e-05, + "loss": 1.6995, + "step": 16752 + }, + { + "epoch": 5.1421117249846535, + "grad_norm": 0.26545262336730957, + "learning_rate": 5.0134204955061526e-05, + "loss": 1.7511, + "step": 16753 + }, + { + "epoch": 5.142418661755678, + "grad_norm": 0.23030948638916016, + "learning_rate": 5.012923441245203e-05, + "loss": 1.7271, + "step": 16754 + }, + { + "epoch": 5.142725598526703, + "grad_norm": 0.22395408153533936, + "learning_rate": 5.012426386856537e-05, + "loss": 1.7273, + "step": 16755 + }, + { + "epoch": 5.143032535297729, + "grad_norm": 0.21355997025966644, + "learning_rate": 5.011929332345066e-05, + "loss": 1.7347, + "step": 16756 + }, + { + "epoch": 5.143339472068754, + "grad_norm": 0.2355809509754181, + "learning_rate": 5.011432277715702e-05, + "loss": 1.8289, + "step": 16757 + }, + { + "epoch": 5.143646408839779, + 
"grad_norm": 0.24319802224636078, + "learning_rate": 5.0109352229733584e-05, + "loss": 1.7621, + "step": 16758 + }, + { + "epoch": 5.143953345610804, + "grad_norm": 0.2591453492641449, + "learning_rate": 5.010438168122946e-05, + "loss": 1.8043, + "step": 16759 + }, + { + "epoch": 5.144260282381829, + "grad_norm": 0.22595751285552979, + "learning_rate": 5.009941113169376e-05, + "loss": 1.8137, + "step": 16760 + }, + { + "epoch": 5.144567219152854, + "grad_norm": 0.220921128988266, + "learning_rate": 5.009444058117564e-05, + "loss": 1.7105, + "step": 16761 + }, + { + "epoch": 5.14487415592388, + "grad_norm": 0.25713789463043213, + "learning_rate": 5.0089470029724195e-05, + "loss": 1.8184, + "step": 16762 + }, + { + "epoch": 5.145181092694905, + "grad_norm": 0.19849328696727753, + "learning_rate": 5.008449947738856e-05, + "loss": 1.7331, + "step": 16763 + }, + { + "epoch": 5.14548802946593, + "grad_norm": 0.2073405385017395, + "learning_rate": 5.007952892421785e-05, + "loss": 1.7053, + "step": 16764 + }, + { + "epoch": 5.145794966236955, + "grad_norm": 0.22307951748371124, + "learning_rate": 5.007455837026119e-05, + "loss": 1.7724, + "step": 16765 + }, + { + "epoch": 5.14610190300798, + "grad_norm": 0.22160649299621582, + "learning_rate": 5.006958781556769e-05, + "loss": 1.7191, + "step": 16766 + }, + { + "epoch": 5.1464088397790055, + "grad_norm": 0.2202252298593521, + "learning_rate": 5.0064617260186487e-05, + "loss": 1.7339, + "step": 16767 + }, + { + "epoch": 5.146715776550031, + "grad_norm": 0.23693829774856567, + "learning_rate": 5.005964670416671e-05, + "loss": 1.7143, + "step": 16768 + }, + { + "epoch": 5.147022713321056, + "grad_norm": 0.22675764560699463, + "learning_rate": 5.005467614755746e-05, + "loss": 1.7913, + "step": 16769 + }, + { + "epoch": 5.147329650092081, + "grad_norm": 0.21288467943668365, + "learning_rate": 5.0049705590407866e-05, + "loss": 1.7581, + "step": 16770 + }, + { + "epoch": 5.147636586863106, + "grad_norm": 0.216839998960495, + 
"learning_rate": 5.0044735032767064e-05, + "loss": 1.7305, + "step": 16771 + }, + { + "epoch": 5.147943523634131, + "grad_norm": 0.2111063450574875, + "learning_rate": 5.003976447468416e-05, + "loss": 1.7444, + "step": 16772 + }, + { + "epoch": 5.148250460405157, + "grad_norm": 0.2536773085594177, + "learning_rate": 5.003479391620827e-05, + "loss": 1.6952, + "step": 16773 + }, + { + "epoch": 5.148557397176182, + "grad_norm": 0.23585477471351624, + "learning_rate": 5.002982335738854e-05, + "loss": 1.6921, + "step": 16774 + }, + { + "epoch": 5.148864333947207, + "grad_norm": 0.1927027702331543, + "learning_rate": 5.002485279827407e-05, + "loss": 1.7781, + "step": 16775 + }, + { + "epoch": 5.149171270718232, + "grad_norm": 0.22545355558395386, + "learning_rate": 5.001988223891399e-05, + "loss": 1.7582, + "step": 16776 + }, + { + "epoch": 5.149478207489257, + "grad_norm": 0.20837660133838654, + "learning_rate": 5.001491167935741e-05, + "loss": 1.7379, + "step": 16777 + }, + { + "epoch": 5.149785144260282, + "grad_norm": 0.20510734617710114, + "learning_rate": 5.000994111965348e-05, + "loss": 1.7568, + "step": 16778 + }, + { + "epoch": 5.150092081031308, + "grad_norm": 0.2629711329936981, + "learning_rate": 5.00049705598513e-05, + "loss": 1.7613, + "step": 16779 + }, + { + "epoch": 5.150399017802333, + "grad_norm": 0.2390555888414383, + "learning_rate": 5e-05, + "loss": 1.7099, + "step": 16780 + }, + { + "epoch": 5.150705954573358, + "grad_norm": 0.19643893837928772, + "learning_rate": 4.9995029440148715e-05, + "loss": 1.7012, + "step": 16781 + }, + { + "epoch": 5.151012891344383, + "grad_norm": 0.1881607472896576, + "learning_rate": 4.999005888034653e-05, + "loss": 1.705, + "step": 16782 + }, + { + "epoch": 5.151319828115408, + "grad_norm": 0.3219485282897949, + "learning_rate": 4.99850883206426e-05, + "loss": 1.8089, + "step": 16783 + }, + { + "epoch": 5.151626764886434, + "grad_norm": 0.22285562753677368, + "learning_rate": 4.998011776108602e-05, + "loss": 1.7343, + 
"step": 16784 + }, + { + "epoch": 5.151933701657459, + "grad_norm": 0.1981910616159439, + "learning_rate": 4.9975147201725955e-05, + "loss": 1.6939, + "step": 16785 + }, + { + "epoch": 5.152240638428483, + "grad_norm": 0.2338661551475525, + "learning_rate": 4.997017664261148e-05, + "loss": 1.6833, + "step": 16786 + }, + { + "epoch": 5.152547575199509, + "grad_norm": 0.2613268792629242, + "learning_rate": 4.996520608379175e-05, + "loss": 1.7251, + "step": 16787 + }, + { + "epoch": 5.152854511970534, + "grad_norm": 0.26063668727874756, + "learning_rate": 4.996023552531586e-05, + "loss": 1.8444, + "step": 16788 + }, + { + "epoch": 5.153161448741559, + "grad_norm": 0.2711321711540222, + "learning_rate": 4.9955264967232954e-05, + "loss": 1.7257, + "step": 16789 + }, + { + "epoch": 5.153468385512585, + "grad_norm": 0.30134227871894836, + "learning_rate": 4.995029440959213e-05, + "loss": 1.7599, + "step": 16790 + }, + { + "epoch": 5.153775322283609, + "grad_norm": 0.22983741760253906, + "learning_rate": 4.994532385244255e-05, + "loss": 1.7944, + "step": 16791 + }, + { + "epoch": 5.1540822590546345, + "grad_norm": 0.2992973327636719, + "learning_rate": 4.994035329583329e-05, + "loss": 1.7507, + "step": 16792 + }, + { + "epoch": 5.15438919582566, + "grad_norm": 0.2659669518470764, + "learning_rate": 4.993538273981352e-05, + "loss": 1.7246, + "step": 16793 + }, + { + "epoch": 5.154696132596685, + "grad_norm": 0.24235470592975616, + "learning_rate": 4.9930412184432315e-05, + "loss": 1.8378, + "step": 16794 + }, + { + "epoch": 5.1550030693677105, + "grad_norm": 0.30005061626434326, + "learning_rate": 4.992544162973882e-05, + "loss": 1.7526, + "step": 16795 + }, + { + "epoch": 5.155310006138736, + "grad_norm": 0.2183740884065628, + "learning_rate": 4.992047107578215e-05, + "loss": 1.7197, + "step": 16796 + }, + { + "epoch": 5.15561694290976, + "grad_norm": 0.35874706506729126, + "learning_rate": 4.991550052261145e-05, + "loss": 1.8196, + "step": 16797 + }, + { + "epoch": 
5.155923879680786, + "grad_norm": 0.42146921157836914, + "learning_rate": 4.991052997027583e-05, + "loss": 1.7165, + "step": 16798 + }, + { + "epoch": 5.156230816451811, + "grad_norm": 0.2738321125507355, + "learning_rate": 4.990555941882437e-05, + "loss": 1.7042, + "step": 16799 + }, + { + "epoch": 5.156537753222836, + "grad_norm": 0.26304566860198975, + "learning_rate": 4.990058886830625e-05, + "loss": 1.7551, + "step": 16800 + }, + { + "epoch": 5.156844689993862, + "grad_norm": 0.4301520586013794, + "learning_rate": 4.9895618318770556e-05, + "loss": 1.7219, + "step": 16801 + }, + { + "epoch": 5.157151626764886, + "grad_norm": 0.3316499590873718, + "learning_rate": 4.989064777026644e-05, + "loss": 1.8034, + "step": 16802 + }, + { + "epoch": 5.157458563535911, + "grad_norm": 0.30105581879615784, + "learning_rate": 4.9885677222842984e-05, + "loss": 1.7022, + "step": 16803 + }, + { + "epoch": 5.157765500306937, + "grad_norm": 0.3830905854701996, + "learning_rate": 4.988070667654937e-05, + "loss": 1.7898, + "step": 16804 + }, + { + "epoch": 5.158072437077962, + "grad_norm": 0.2204640656709671, + "learning_rate": 4.9875736131434644e-05, + "loss": 1.7081, + "step": 16805 + }, + { + "epoch": 5.158379373848987, + "grad_norm": 0.3620772063732147, + "learning_rate": 4.9870765587547976e-05, + "loss": 1.7345, + "step": 16806 + }, + { + "epoch": 5.158686310620013, + "grad_norm": 0.3268207907676697, + "learning_rate": 4.986579504493848e-05, + "loss": 1.7364, + "step": 16807 + }, + { + "epoch": 5.158993247391037, + "grad_norm": 0.2499808967113495, + "learning_rate": 4.986082450365529e-05, + "loss": 1.7836, + "step": 16808 + }, + { + "epoch": 5.1593001841620625, + "grad_norm": 0.3696226477622986, + "learning_rate": 4.98558539637475e-05, + "loss": 1.8094, + "step": 16809 + }, + { + "epoch": 5.159607120933088, + "grad_norm": 0.3239068388938904, + "learning_rate": 4.9850883425264256e-05, + "loss": 1.7448, + "step": 16810 + }, + { + "epoch": 5.159914057704113, + "grad_norm": 
0.19875772297382355, + "learning_rate": 4.9845912888254655e-05, + "loss": 1.6945, + "step": 16811 + }, + { + "epoch": 5.1602209944751385, + "grad_norm": 0.3952203691005707, + "learning_rate": 4.984094235276784e-05, + "loss": 1.8457, + "step": 16812 + }, + { + "epoch": 5.160527931246163, + "grad_norm": 0.3052334785461426, + "learning_rate": 4.9835971818852916e-05, + "loss": 1.7371, + "step": 16813 + }, + { + "epoch": 5.160834868017188, + "grad_norm": 0.2874486446380615, + "learning_rate": 4.983100128655904e-05, + "loss": 1.7194, + "step": 16814 + }, + { + "epoch": 5.161141804788214, + "grad_norm": 0.39117491245269775, + "learning_rate": 4.98260307559353e-05, + "loss": 1.7919, + "step": 16815 + }, + { + "epoch": 5.161448741559239, + "grad_norm": 0.2532150149345398, + "learning_rate": 4.982106022703081e-05, + "loss": 1.8103, + "step": 16816 + }, + { + "epoch": 5.161755678330264, + "grad_norm": 0.3545167148113251, + "learning_rate": 4.981608969989473e-05, + "loss": 1.8093, + "step": 16817 + }, + { + "epoch": 5.162062615101289, + "grad_norm": 0.397806316614151, + "learning_rate": 4.981111917457613e-05, + "loss": 1.7885, + "step": 16818 + }, + { + "epoch": 5.162369551872314, + "grad_norm": 0.2523536682128906, + "learning_rate": 4.980614865112419e-05, + "loss": 1.797, + "step": 16819 + }, + { + "epoch": 5.162676488643339, + "grad_norm": 0.3666839301586151, + "learning_rate": 4.980117812958798e-05, + "loss": 1.7859, + "step": 16820 + }, + { + "epoch": 5.162983425414365, + "grad_norm": 0.3392138183116913, + "learning_rate": 4.9796207610016664e-05, + "loss": 1.7717, + "step": 16821 + }, + { + "epoch": 5.16329036218539, + "grad_norm": 0.21040666103363037, + "learning_rate": 4.9791237092459325e-05, + "loss": 1.7447, + "step": 16822 + }, + { + "epoch": 5.163597298956415, + "grad_norm": 0.3140225112438202, + "learning_rate": 4.978626657696512e-05, + "loss": 1.7405, + "step": 16823 + }, + { + "epoch": 5.16390423572744, + "grad_norm": 0.23963581025600433, + "learning_rate": 
4.978129606358313e-05, + "loss": 1.7041, + "step": 16824 + }, + { + "epoch": 5.164211172498465, + "grad_norm": 0.32476937770843506, + "learning_rate": 4.977632555236253e-05, + "loss": 1.736, + "step": 16825 + }, + { + "epoch": 5.1645181092694905, + "grad_norm": 0.4362463653087616, + "learning_rate": 4.977135504335239e-05, + "loss": 1.7657, + "step": 16826 + }, + { + "epoch": 5.164825046040516, + "grad_norm": 0.26118260622024536, + "learning_rate": 4.976638453660188e-05, + "loss": 1.7339, + "step": 16827 + }, + { + "epoch": 5.165131982811541, + "grad_norm": 0.27284330129623413, + "learning_rate": 4.9761414032160065e-05, + "loss": 1.8086, + "step": 16828 + }, + { + "epoch": 5.165438919582566, + "grad_norm": 0.2942579388618469, + "learning_rate": 4.975644353007611e-05, + "loss": 1.7869, + "step": 16829 + }, + { + "epoch": 5.165745856353591, + "grad_norm": 0.23257993161678314, + "learning_rate": 4.975147303039912e-05, + "loss": 1.8048, + "step": 16830 + }, + { + "epoch": 5.166052793124616, + "grad_norm": 0.28638842701911926, + "learning_rate": 4.9746502533178225e-05, + "loss": 1.7744, + "step": 16831 + }, + { + "epoch": 5.166359729895642, + "grad_norm": 0.21571335196495056, + "learning_rate": 4.974153203846255e-05, + "loss": 1.7842, + "step": 16832 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.268883615732193, + "learning_rate": 4.9736561546301185e-05, + "loss": 1.7194, + "step": 16833 + }, + { + "epoch": 5.166973603437691, + "grad_norm": 0.22934168577194214, + "learning_rate": 4.9731591056743285e-05, + "loss": 1.757, + "step": 16834 + }, + { + "epoch": 5.167280540208717, + "grad_norm": 0.26321718096733093, + "learning_rate": 4.9726620569837946e-05, + "loss": 1.7675, + "step": 16835 + }, + { + "epoch": 5.167587476979742, + "grad_norm": 0.2893882393836975, + "learning_rate": 4.9721650085634325e-05, + "loss": 1.7134, + "step": 16836 + }, + { + "epoch": 5.167894413750767, + "grad_norm": 0.24130617082118988, + "learning_rate": 4.97166796041815e-05, + "loss": 
1.7119, + "step": 16837 + }, + { + "epoch": 5.168201350521793, + "grad_norm": 0.23614190518856049, + "learning_rate": 4.9711709125528635e-05, + "loss": 1.7556, + "step": 16838 + }, + { + "epoch": 5.168508287292818, + "grad_norm": 0.2031065821647644, + "learning_rate": 4.97067386497248e-05, + "loss": 1.7678, + "step": 16839 + }, + { + "epoch": 5.1688152240638425, + "grad_norm": 0.30695948004722595, + "learning_rate": 4.970176817681917e-05, + "loss": 1.7907, + "step": 16840 + }, + { + "epoch": 5.169122160834868, + "grad_norm": 0.31256723403930664, + "learning_rate": 4.969679770686082e-05, + "loss": 1.7448, + "step": 16841 + }, + { + "epoch": 5.169429097605893, + "grad_norm": 0.24183644354343414, + "learning_rate": 4.969182723989892e-05, + "loss": 1.7259, + "step": 16842 + }, + { + "epoch": 5.1697360343769185, + "grad_norm": 0.22440548241138458, + "learning_rate": 4.9686856775982536e-05, + "loss": 1.7949, + "step": 16843 + }, + { + "epoch": 5.170042971147944, + "grad_norm": 0.29006195068359375, + "learning_rate": 4.9681886315160846e-05, + "loss": 1.7128, + "step": 16844 + }, + { + "epoch": 5.170349907918968, + "grad_norm": 0.2189658135175705, + "learning_rate": 4.967691585748292e-05, + "loss": 1.7375, + "step": 16845 + }, + { + "epoch": 5.170656844689994, + "grad_norm": 0.289909690618515, + "learning_rate": 4.967194540299791e-05, + "loss": 1.779, + "step": 16846 + }, + { + "epoch": 5.170963781461019, + "grad_norm": 0.28279590606689453, + "learning_rate": 4.966697495175492e-05, + "loss": 1.7368, + "step": 16847 + }, + { + "epoch": 5.171270718232044, + "grad_norm": 0.2056259959936142, + "learning_rate": 4.966200450380309e-05, + "loss": 1.7548, + "step": 16848 + }, + { + "epoch": 5.17157765500307, + "grad_norm": 0.2607482969760895, + "learning_rate": 4.965703405919154e-05, + "loss": 1.7178, + "step": 16849 + }, + { + "epoch": 5.171884591774095, + "grad_norm": 0.26085609197616577, + "learning_rate": 4.965206361796935e-05, + "loss": 1.751, + "step": 16850 + }, + { + 
"epoch": 5.172191528545119, + "grad_norm": 0.17960335314273834, + "learning_rate": 4.964709318018569e-05, + "loss": 1.6932, + "step": 16851 + }, + { + "epoch": 5.172498465316145, + "grad_norm": 0.2617340385913849, + "learning_rate": 4.964212274588965e-05, + "loss": 1.7753, + "step": 16852 + }, + { + "epoch": 5.17280540208717, + "grad_norm": 0.2454555630683899, + "learning_rate": 4.9637152315130383e-05, + "loss": 1.7587, + "step": 16853 + }, + { + "epoch": 5.173112338858195, + "grad_norm": 0.19221605360507965, + "learning_rate": 4.963218188795696e-05, + "loss": 1.7337, + "step": 16854 + }, + { + "epoch": 5.173419275629221, + "grad_norm": 0.24314738810062408, + "learning_rate": 4.9627211464418565e-05, + "loss": 1.725, + "step": 16855 + }, + { + "epoch": 5.173726212400245, + "grad_norm": 0.2533986568450928, + "learning_rate": 4.962224104456426e-05, + "loss": 1.7502, + "step": 16856 + }, + { + "epoch": 5.1740331491712706, + "grad_norm": 0.21800079941749573, + "learning_rate": 4.9617270628443195e-05, + "loss": 1.7622, + "step": 16857 + }, + { + "epoch": 5.174340085942296, + "grad_norm": 0.22742362320423126, + "learning_rate": 4.96123002161045e-05, + "loss": 1.7078, + "step": 16858 + }, + { + "epoch": 5.174647022713321, + "grad_norm": 0.22729982435703278, + "learning_rate": 4.960732980759727e-05, + "loss": 1.8349, + "step": 16859 + }, + { + "epoch": 5.1749539594843466, + "grad_norm": 0.28869518637657166, + "learning_rate": 4.9602359402970625e-05, + "loss": 1.8932, + "step": 16860 + }, + { + "epoch": 5.175260896255371, + "grad_norm": 0.21931354701519012, + "learning_rate": 4.9597389002273725e-05, + "loss": 1.6989, + "step": 16861 + }, + { + "epoch": 5.175567833026396, + "grad_norm": 0.2130192667245865, + "learning_rate": 4.959241860555564e-05, + "loss": 1.752, + "step": 16862 + }, + { + "epoch": 5.175874769797422, + "grad_norm": 0.21272781491279602, + "learning_rate": 4.958744821286553e-05, + "loss": 1.7402, + "step": 16863 + }, + { + "epoch": 5.176181706568447, + 
"grad_norm": 0.20279285311698914, + "learning_rate": 4.958247782425248e-05, + "loss": 1.7103, + "step": 16864 + }, + { + "epoch": 5.176488643339472, + "grad_norm": 0.23561790585517883, + "learning_rate": 4.957750743976564e-05, + "loss": 1.7742, + "step": 16865 + }, + { + "epoch": 5.176795580110497, + "grad_norm": 0.27608510851860046, + "learning_rate": 4.957253705945413e-05, + "loss": 1.7505, + "step": 16866 + }, + { + "epoch": 5.177102516881522, + "grad_norm": 0.20624001324176788, + "learning_rate": 4.956756668336704e-05, + "loss": 1.7032, + "step": 16867 + }, + { + "epoch": 5.1774094536525475, + "grad_norm": 0.23743939399719238, + "learning_rate": 4.956259631155352e-05, + "loss": 1.7469, + "step": 16868 + }, + { + "epoch": 5.177716390423573, + "grad_norm": 0.27421119809150696, + "learning_rate": 4.9557625944062675e-05, + "loss": 1.7028, + "step": 16869 + }, + { + "epoch": 5.178023327194598, + "grad_norm": 0.23788046836853027, + "learning_rate": 4.955265558094363e-05, + "loss": 1.7468, + "step": 16870 + }, + { + "epoch": 5.1783302639656235, + "grad_norm": 0.24712958931922913, + "learning_rate": 4.95476852222455e-05, + "loss": 1.7348, + "step": 16871 + }, + { + "epoch": 5.178637200736648, + "grad_norm": 0.21558570861816406, + "learning_rate": 4.9542714868017424e-05, + "loss": 1.7599, + "step": 16872 + }, + { + "epoch": 5.178944137507673, + "grad_norm": 0.2561664283275604, + "learning_rate": 4.953774451830849e-05, + "loss": 1.7673, + "step": 16873 + }, + { + "epoch": 5.179251074278699, + "grad_norm": 0.19761815667152405, + "learning_rate": 4.953277417316786e-05, + "loss": 1.743, + "step": 16874 + }, + { + "epoch": 5.179558011049724, + "grad_norm": 0.24140769243240356, + "learning_rate": 4.95278038326446e-05, + "loss": 1.8229, + "step": 16875 + }, + { + "epoch": 5.179864947820749, + "grad_norm": 0.21686211228370667, + "learning_rate": 4.9522833496787876e-05, + "loss": 1.7914, + "step": 16876 + }, + { + "epoch": 5.180171884591774, + "grad_norm": 0.2537819743156433, + 
"learning_rate": 4.951786316564678e-05, + "loss": 1.7532, + "step": 16877 + }, + { + "epoch": 5.180478821362799, + "grad_norm": 0.24567632377147675, + "learning_rate": 4.951289283927046e-05, + "loss": 1.7528, + "step": 16878 + }, + { + "epoch": 5.180785758133824, + "grad_norm": 0.1958467960357666, + "learning_rate": 4.9507922517708e-05, + "loss": 1.6922, + "step": 16879 + }, + { + "epoch": 5.18109269490485, + "grad_norm": 0.2012091726064682, + "learning_rate": 4.950295220100857e-05, + "loss": 1.7509, + "step": 16880 + }, + { + "epoch": 5.181399631675875, + "grad_norm": 0.2416311800479889, + "learning_rate": 4.9497981889221226e-05, + "loss": 1.7341, + "step": 16881 + }, + { + "epoch": 5.1817065684469, + "grad_norm": 0.21407842636108398, + "learning_rate": 4.949301158239513e-05, + "loss": 1.7493, + "step": 16882 + }, + { + "epoch": 5.182013505217925, + "grad_norm": 0.2354930192232132, + "learning_rate": 4.94880412805794e-05, + "loss": 1.7726, + "step": 16883 + }, + { + "epoch": 5.18232044198895, + "grad_norm": 0.2168428748846054, + "learning_rate": 4.948307098382313e-05, + "loss": 1.77, + "step": 16884 + }, + { + "epoch": 5.1826273787599755, + "grad_norm": 0.19605880975723267, + "learning_rate": 4.947810069217547e-05, + "loss": 1.7292, + "step": 16885 + }, + { + "epoch": 5.182934315531001, + "grad_norm": 0.23066702485084534, + "learning_rate": 4.947313040568551e-05, + "loss": 1.7265, + "step": 16886 + }, + { + "epoch": 5.183241252302026, + "grad_norm": 0.20139534771442413, + "learning_rate": 4.9468160124402386e-05, + "loss": 1.7443, + "step": 16887 + }, + { + "epoch": 5.183548189073051, + "grad_norm": 0.25097572803497314, + "learning_rate": 4.946318984837521e-05, + "loss": 1.7537, + "step": 16888 + }, + { + "epoch": 5.183855125844076, + "grad_norm": 0.26215067505836487, + "learning_rate": 4.945821957765313e-05, + "loss": 1.8397, + "step": 16889 + }, + { + "epoch": 5.184162062615101, + "grad_norm": 0.22072140872478485, + "learning_rate": 4.9453249312285215e-05, + 
"loss": 1.7052, + "step": 16890 + }, + { + "epoch": 5.184468999386127, + "grad_norm": 0.20372305810451508, + "learning_rate": 4.944827905232064e-05, + "loss": 1.7228, + "step": 16891 + }, + { + "epoch": 5.184775936157152, + "grad_norm": 0.20383495092391968, + "learning_rate": 4.944330879780847e-05, + "loss": 1.7063, + "step": 16892 + }, + { + "epoch": 5.185082872928176, + "grad_norm": 0.1903693675994873, + "learning_rate": 4.943833854879786e-05, + "loss": 1.6435, + "step": 16893 + }, + { + "epoch": 5.185389809699202, + "grad_norm": 0.20357775688171387, + "learning_rate": 4.94333683053379e-05, + "loss": 1.7485, + "step": 16894 + }, + { + "epoch": 5.185696746470227, + "grad_norm": 0.24776104092597961, + "learning_rate": 4.942839806747775e-05, + "loss": 1.718, + "step": 16895 + }, + { + "epoch": 5.186003683241252, + "grad_norm": 0.2455051839351654, + "learning_rate": 4.942342783526649e-05, + "loss": 1.7124, + "step": 16896 + }, + { + "epoch": 5.186310620012278, + "grad_norm": 0.2102014273405075, + "learning_rate": 4.941845760875328e-05, + "loss": 1.7584, + "step": 16897 + }, + { + "epoch": 5.186617556783303, + "grad_norm": 0.2177651822566986, + "learning_rate": 4.941348738798718e-05, + "loss": 1.7019, + "step": 16898 + }, + { + "epoch": 5.1869244935543275, + "grad_norm": 0.21296697854995728, + "learning_rate": 4.9408517173017355e-05, + "loss": 1.7299, + "step": 16899 + }, + { + "epoch": 5.187231430325353, + "grad_norm": 0.23485495150089264, + "learning_rate": 4.940354696389292e-05, + "loss": 1.7271, + "step": 16900 + }, + { + "epoch": 5.187538367096378, + "grad_norm": 0.27287766337394714, + "learning_rate": 4.939857676066297e-05, + "loss": 1.7601, + "step": 16901 + }, + { + "epoch": 5.1878453038674035, + "grad_norm": 0.2060246467590332, + "learning_rate": 4.939360656337665e-05, + "loss": 1.7064, + "step": 16902 + }, + { + "epoch": 5.188152240638429, + "grad_norm": 0.25422418117523193, + "learning_rate": 4.938863637208305e-05, + "loss": 1.7423, + "step": 16903 + }, + { 
+ "epoch": 5.188459177409453, + "grad_norm": 0.2798483669757843, + "learning_rate": 4.9383666186831304e-05, + "loss": 1.7132, + "step": 16904 + }, + { + "epoch": 5.188766114180479, + "grad_norm": 0.23505693674087524, + "learning_rate": 4.9378696007670525e-05, + "loss": 1.7759, + "step": 16905 + }, + { + "epoch": 5.189073050951504, + "grad_norm": 0.23761989176273346, + "learning_rate": 4.937372583464987e-05, + "loss": 1.7076, + "step": 16906 + }, + { + "epoch": 5.189379987722529, + "grad_norm": 0.3005945086479187, + "learning_rate": 4.9368755667818385e-05, + "loss": 1.6957, + "step": 16907 + }, + { + "epoch": 5.189686924493555, + "grad_norm": 0.2502881586551666, + "learning_rate": 4.936378550722525e-05, + "loss": 1.7352, + "step": 16908 + }, + { + "epoch": 5.189993861264579, + "grad_norm": 0.24194179475307465, + "learning_rate": 4.9358815352919544e-05, + "loss": 1.738, + "step": 16909 + }, + { + "epoch": 5.190300798035604, + "grad_norm": 0.27478742599487305, + "learning_rate": 4.935384520495041e-05, + "loss": 1.7118, + "step": 16910 + }, + { + "epoch": 5.19060773480663, + "grad_norm": 0.22327560186386108, + "learning_rate": 4.9348875063366944e-05, + "loss": 1.7697, + "step": 16911 + }, + { + "epoch": 5.190914671577655, + "grad_norm": 0.21844418346881866, + "learning_rate": 4.9343904928218295e-05, + "loss": 1.7733, + "step": 16912 + }, + { + "epoch": 5.19122160834868, + "grad_norm": 0.25267866253852844, + "learning_rate": 4.933893479955354e-05, + "loss": 1.7313, + "step": 16913 + }, + { + "epoch": 5.191528545119706, + "grad_norm": 0.22045068442821503, + "learning_rate": 4.933396467742185e-05, + "loss": 1.7856, + "step": 16914 + }, + { + "epoch": 5.19183548189073, + "grad_norm": 0.22642305493354797, + "learning_rate": 4.932899456187229e-05, + "loss": 1.7326, + "step": 16915 + }, + { + "epoch": 5.1921424186617555, + "grad_norm": 0.20601733028888702, + "learning_rate": 4.9324024452953995e-05, + "loss": 1.7743, + "step": 16916 + }, + { + "epoch": 5.192449355432781, + 
"grad_norm": 0.25580671429634094, + "learning_rate": 4.931905435071611e-05, + "loss": 1.7705, + "step": 16917 + }, + { + "epoch": 5.192756292203806, + "grad_norm": 0.38173142075538635, + "learning_rate": 4.9314084255207706e-05, + "loss": 1.7504, + "step": 16918 + }, + { + "epoch": 5.1930632289748315, + "grad_norm": 0.2254420667886734, + "learning_rate": 4.930911416647794e-05, + "loss": 1.7344, + "step": 16919 + }, + { + "epoch": 5.193370165745856, + "grad_norm": 0.2354312688112259, + "learning_rate": 4.9304144084575896e-05, + "loss": 1.7607, + "step": 16920 + }, + { + "epoch": 5.193677102516881, + "grad_norm": 0.23879510164260864, + "learning_rate": 4.9299174009550716e-05, + "loss": 1.683, + "step": 16921 + }, + { + "epoch": 5.193984039287907, + "grad_norm": 0.228669211268425, + "learning_rate": 4.9294203941451494e-05, + "loss": 1.7776, + "step": 16922 + }, + { + "epoch": 5.194290976058932, + "grad_norm": 0.2266843616962433, + "learning_rate": 4.928923388032739e-05, + "loss": 1.7563, + "step": 16923 + }, + { + "epoch": 5.194597912829957, + "grad_norm": 0.2581404745578766, + "learning_rate": 4.928426382622747e-05, + "loss": 1.8112, + "step": 16924 + }, + { + "epoch": 5.194904849600983, + "grad_norm": 0.25179803371429443, + "learning_rate": 4.92792937792009e-05, + "loss": 1.7661, + "step": 16925 + }, + { + "epoch": 5.195211786372007, + "grad_norm": 0.23408514261245728, + "learning_rate": 4.9274323739296746e-05, + "loss": 1.7618, + "step": 16926 + }, + { + "epoch": 5.195518723143032, + "grad_norm": 0.23110872507095337, + "learning_rate": 4.926935370656416e-05, + "loss": 1.6945, + "step": 16927 + }, + { + "epoch": 5.195825659914058, + "grad_norm": 0.2863025665283203, + "learning_rate": 4.926438368105224e-05, + "loss": 1.8659, + "step": 16928 + }, + { + "epoch": 5.196132596685083, + "grad_norm": 0.2156454175710678, + "learning_rate": 4.925941366281013e-05, + "loss": 1.7281, + "step": 16929 + }, + { + "epoch": 5.196439533456108, + "grad_norm": 0.2338300198316574, + 
"learning_rate": 4.925444365188691e-05, + "loss": 1.7271, + "step": 16930 + }, + { + "epoch": 5.196746470227133, + "grad_norm": 0.21434102952480316, + "learning_rate": 4.924947364833173e-05, + "loss": 1.7342, + "step": 16931 + }, + { + "epoch": 5.197053406998158, + "grad_norm": 0.21619778871536255, + "learning_rate": 4.924450365219369e-05, + "loss": 1.7493, + "step": 16932 + }, + { + "epoch": 5.1973603437691835, + "grad_norm": 0.24532032012939453, + "learning_rate": 4.9239533663521896e-05, + "loss": 1.7707, + "step": 16933 + }, + { + "epoch": 5.197667280540209, + "grad_norm": 0.21795547008514404, + "learning_rate": 4.923456368236549e-05, + "loss": 1.7642, + "step": 16934 + }, + { + "epoch": 5.197974217311234, + "grad_norm": 0.2070101797580719, + "learning_rate": 4.922959370877356e-05, + "loss": 1.7377, + "step": 16935 + }, + { + "epoch": 5.198281154082259, + "grad_norm": 0.22546489536762238, + "learning_rate": 4.9224623742795256e-05, + "loss": 1.7766, + "step": 16936 + }, + { + "epoch": 5.198588090853284, + "grad_norm": 0.20723624527454376, + "learning_rate": 4.921965378447965e-05, + "loss": 1.7316, + "step": 16937 + }, + { + "epoch": 5.198895027624309, + "grad_norm": 0.21870547533035278, + "learning_rate": 4.9214683833875905e-05, + "loss": 1.7653, + "step": 16938 + }, + { + "epoch": 5.199201964395335, + "grad_norm": 0.19606490433216095, + "learning_rate": 4.920971389103309e-05, + "loss": 1.7181, + "step": 16939 + }, + { + "epoch": 5.19950890116636, + "grad_norm": 0.18372730910778046, + "learning_rate": 4.920474395600037e-05, + "loss": 1.7041, + "step": 16940 + }, + { + "epoch": 5.199815837937384, + "grad_norm": 0.22051765024662018, + "learning_rate": 4.919977402882682e-05, + "loss": 1.7172, + "step": 16941 + }, + { + "epoch": 5.20012277470841, + "grad_norm": 0.2135835587978363, + "learning_rate": 4.919480410956159e-05, + "loss": 1.6918, + "step": 16942 + }, + { + "epoch": 5.200429711479435, + "grad_norm": 0.19619768857955933, + "learning_rate": 
4.918983419825376e-05, + "loss": 1.7005, + "step": 16943 + }, + { + "epoch": 5.2007366482504604, + "grad_norm": 0.22726574540138245, + "learning_rate": 4.918486429495246e-05, + "loss": 1.6775, + "step": 16944 + }, + { + "epoch": 5.201043585021486, + "grad_norm": 0.21471361815929413, + "learning_rate": 4.9179894399706815e-05, + "loss": 1.7102, + "step": 16945 + }, + { + "epoch": 5.201350521792511, + "grad_norm": 0.20113740861415863, + "learning_rate": 4.917492451256595e-05, + "loss": 1.7548, + "step": 16946 + }, + { + "epoch": 5.201657458563536, + "grad_norm": 0.2337827831506729, + "learning_rate": 4.916995463357894e-05, + "loss": 1.818, + "step": 16947 + }, + { + "epoch": 5.201964395334561, + "grad_norm": 0.2649554908275604, + "learning_rate": 4.9164984762794955e-05, + "loss": 1.7784, + "step": 16948 + }, + { + "epoch": 5.202271332105586, + "grad_norm": 0.2297617793083191, + "learning_rate": 4.916001490026306e-05, + "loss": 1.7484, + "step": 16949 + }, + { + "epoch": 5.202578268876612, + "grad_norm": 0.20791979134082794, + "learning_rate": 4.915504504603238e-05, + "loss": 1.7164, + "step": 16950 + }, + { + "epoch": 5.202885205647637, + "grad_norm": 0.21769596636295319, + "learning_rate": 4.915007520015207e-05, + "loss": 1.7783, + "step": 16951 + }, + { + "epoch": 5.203192142418661, + "grad_norm": 0.21038469672203064, + "learning_rate": 4.914510536267118e-05, + "loss": 1.6863, + "step": 16952 + }, + { + "epoch": 5.203499079189687, + "grad_norm": 0.20725449919700623, + "learning_rate": 4.914013553363889e-05, + "loss": 1.6855, + "step": 16953 + }, + { + "epoch": 5.203806015960712, + "grad_norm": 0.23879854381084442, + "learning_rate": 4.9135165713104266e-05, + "loss": 1.6986, + "step": 16954 + }, + { + "epoch": 5.204112952731737, + "grad_norm": 0.20515915751457214, + "learning_rate": 4.913019590111645e-05, + "loss": 1.6912, + "step": 16955 + }, + { + "epoch": 5.204419889502763, + "grad_norm": 0.2252528965473175, + "learning_rate": 4.912522609772453e-05, + "loss": 
1.6974, + "step": 16956 + }, + { + "epoch": 5.204726826273788, + "grad_norm": 0.1946130096912384, + "learning_rate": 4.9120256302977665e-05, + "loss": 1.7009, + "step": 16957 + }, + { + "epoch": 5.2050337630448125, + "grad_norm": 0.21323645114898682, + "learning_rate": 4.9115286516924925e-05, + "loss": 1.7746, + "step": 16958 + }, + { + "epoch": 5.205340699815838, + "grad_norm": 0.20721712708473206, + "learning_rate": 4.911031673961546e-05, + "loss": 1.7103, + "step": 16959 + }, + { + "epoch": 5.205647636586863, + "grad_norm": 0.19630689918994904, + "learning_rate": 4.910534697109834e-05, + "loss": 1.7042, + "step": 16960 + }, + { + "epoch": 5.2059545733578885, + "grad_norm": 0.2036786526441574, + "learning_rate": 4.910037721142273e-05, + "loss": 1.7713, + "step": 16961 + }, + { + "epoch": 5.206261510128914, + "grad_norm": 0.20518352091312408, + "learning_rate": 4.9095407460637696e-05, + "loss": 1.7456, + "step": 16962 + }, + { + "epoch": 5.206568446899938, + "grad_norm": 0.199858620762825, + "learning_rate": 4.9090437718792404e-05, + "loss": 1.7598, + "step": 16963 + }, + { + "epoch": 5.206875383670964, + "grad_norm": 0.22860252857208252, + "learning_rate": 4.9085467985935914e-05, + "loss": 1.7947, + "step": 16964 + }, + { + "epoch": 5.207182320441989, + "grad_norm": 0.22179929912090302, + "learning_rate": 4.9080498262117395e-05, + "loss": 1.7537, + "step": 16965 + }, + { + "epoch": 5.207489257213014, + "grad_norm": 0.24737581610679626, + "learning_rate": 4.9075528547385906e-05, + "loss": 1.7932, + "step": 16966 + }, + { + "epoch": 5.20779619398404, + "grad_norm": 0.2653762400150299, + "learning_rate": 4.907055884179059e-05, + "loss": 1.7683, + "step": 16967 + }, + { + "epoch": 5.208103130755064, + "grad_norm": 0.2891876697540283, + "learning_rate": 4.9065589145380564e-05, + "loss": 1.7867, + "step": 16968 + }, + { + "epoch": 5.208410067526089, + "grad_norm": 0.23162086308002472, + "learning_rate": 4.906061945820492e-05, + "loss": 1.7981, + "step": 16969 + }, + { 
+ "epoch": 5.208717004297115, + "grad_norm": 0.2746187150478363, + "learning_rate": 4.9055649780312805e-05, + "loss": 1.7215, + "step": 16970 + }, + { + "epoch": 5.20902394106814, + "grad_norm": 0.3217853605747223, + "learning_rate": 4.905068011175329e-05, + "loss": 1.8027, + "step": 16971 + }, + { + "epoch": 5.209330877839165, + "grad_norm": 0.21517686545848846, + "learning_rate": 4.904571045257553e-05, + "loss": 1.7055, + "step": 16972 + }, + { + "epoch": 5.209637814610191, + "grad_norm": 0.23613709211349487, + "learning_rate": 4.90407408028286e-05, + "loss": 1.751, + "step": 16973 + }, + { + "epoch": 5.209944751381215, + "grad_norm": 0.35093945264816284, + "learning_rate": 4.903577116256165e-05, + "loss": 1.7749, + "step": 16974 + }, + { + "epoch": 5.2102516881522405, + "grad_norm": 0.3289217948913574, + "learning_rate": 4.903080153182376e-05, + "loss": 1.7722, + "step": 16975 + }, + { + "epoch": 5.210558624923266, + "grad_norm": 0.29387256503105164, + "learning_rate": 4.9025831910664074e-05, + "loss": 1.8121, + "step": 16976 + }, + { + "epoch": 5.210865561694291, + "grad_norm": 0.44418805837631226, + "learning_rate": 4.9020862299131664e-05, + "loss": 1.7744, + "step": 16977 + }, + { + "epoch": 5.2111724984653165, + "grad_norm": 0.39242252707481384, + "learning_rate": 4.901589269727568e-05, + "loss": 1.7183, + "step": 16978 + }, + { + "epoch": 5.211479435236341, + "grad_norm": 0.2028690129518509, + "learning_rate": 4.901092310514522e-05, + "loss": 1.7101, + "step": 16979 + }, + { + "epoch": 5.211786372007366, + "grad_norm": 0.4025843143463135, + "learning_rate": 4.900595352278941e-05, + "loss": 1.7545, + "step": 16980 + }, + { + "epoch": 5.212093308778392, + "grad_norm": 0.284568727016449, + "learning_rate": 4.900098395025733e-05, + "loss": 1.7758, + "step": 16981 + }, + { + "epoch": 5.212400245549417, + "grad_norm": 0.2527516484260559, + "learning_rate": 4.899601438759813e-05, + "loss": 1.695, + "step": 16982 + }, + { + "epoch": 5.212707182320442, + 
"grad_norm": 0.3063630759716034, + "learning_rate": 4.89910448348609e-05, + "loss": 1.714, + "step": 16983 + }, + { + "epoch": 5.213014119091467, + "grad_norm": 0.22754468023777008, + "learning_rate": 4.898607529209474e-05, + "loss": 1.8315, + "step": 16984 + }, + { + "epoch": 5.213321055862492, + "grad_norm": 0.29594969749450684, + "learning_rate": 4.89811057593488e-05, + "loss": 1.6669, + "step": 16985 + }, + { + "epoch": 5.213627992633517, + "grad_norm": 0.21486569941043854, + "learning_rate": 4.897613623667215e-05, + "loss": 1.7425, + "step": 16986 + }, + { + "epoch": 5.213934929404543, + "grad_norm": 0.30908775329589844, + "learning_rate": 4.897116672411395e-05, + "loss": 1.7915, + "step": 16987 + }, + { + "epoch": 5.214241866175568, + "grad_norm": 0.23515601456165314, + "learning_rate": 4.896619722172325e-05, + "loss": 1.7226, + "step": 16988 + }, + { + "epoch": 5.214548802946593, + "grad_norm": 0.2847287952899933, + "learning_rate": 4.8961227729549215e-05, + "loss": 1.7641, + "step": 16989 + }, + { + "epoch": 5.214855739717618, + "grad_norm": 0.2986287772655487, + "learning_rate": 4.895625824764092e-05, + "loss": 1.8025, + "step": 16990 + }, + { + "epoch": 5.215162676488643, + "grad_norm": 0.23454971611499786, + "learning_rate": 4.8951288776047514e-05, + "loss": 1.7057, + "step": 16991 + }, + { + "epoch": 5.2154696132596685, + "grad_norm": 0.2578633725643158, + "learning_rate": 4.894631931481807e-05, + "loss": 1.7267, + "step": 16992 + }, + { + "epoch": 5.215776550030694, + "grad_norm": 0.29975566267967224, + "learning_rate": 4.894134986400174e-05, + "loss": 1.7452, + "step": 16993 + }, + { + "epoch": 5.216083486801719, + "grad_norm": 0.22313638031482697, + "learning_rate": 4.893638042364758e-05, + "loss": 1.6917, + "step": 16994 + }, + { + "epoch": 5.216390423572744, + "grad_norm": 0.258297860622406, + "learning_rate": 4.893141099380475e-05, + "loss": 1.7816, + "step": 16995 + }, + { + "epoch": 5.216697360343769, + "grad_norm": 0.2656872272491455, + 
"learning_rate": 4.892644157452233e-05, + "loss": 1.7248, + "step": 16996 + }, + { + "epoch": 5.217004297114794, + "grad_norm": 0.20239698886871338, + "learning_rate": 4.8921472165849464e-05, + "loss": 1.7629, + "step": 16997 + }, + { + "epoch": 5.21731123388582, + "grad_norm": 0.2575492262840271, + "learning_rate": 4.891650276783523e-05, + "loss": 1.719, + "step": 16998 + }, + { + "epoch": 5.217618170656845, + "grad_norm": 0.27563637495040894, + "learning_rate": 4.8911533380528756e-05, + "loss": 1.718, + "step": 16999 + }, + { + "epoch": 5.21792510742787, + "grad_norm": 0.1969723105430603, + "learning_rate": 4.890656400397915e-05, + "loss": 1.7557, + "step": 17000 + }, + { + "epoch": 5.218232044198895, + "grad_norm": 0.24336831271648407, + "learning_rate": 4.89015946382355e-05, + "loss": 1.6861, + "step": 17001 + }, + { + "epoch": 5.21853898096992, + "grad_norm": 0.2804388403892517, + "learning_rate": 4.889662528334696e-05, + "loss": 1.7411, + "step": 17002 + }, + { + "epoch": 5.218845917740945, + "grad_norm": 0.21116352081298828, + "learning_rate": 4.8891655939362596e-05, + "loss": 1.7135, + "step": 17003 + }, + { + "epoch": 5.219152854511971, + "grad_norm": 0.21042904257774353, + "learning_rate": 4.8886686606331556e-05, + "loss": 1.7224, + "step": 17004 + }, + { + "epoch": 5.219459791282996, + "grad_norm": 0.22463755309581757, + "learning_rate": 4.888171728430291e-05, + "loss": 1.8272, + "step": 17005 + }, + { + "epoch": 5.2197667280540205, + "grad_norm": 0.25604158639907837, + "learning_rate": 4.8876747973325805e-05, + "loss": 1.674, + "step": 17006 + }, + { + "epoch": 5.220073664825046, + "grad_norm": 0.3108421564102173, + "learning_rate": 4.887177867344932e-05, + "loss": 1.761, + "step": 17007 + }, + { + "epoch": 5.220380601596071, + "grad_norm": 0.25135359168052673, + "learning_rate": 4.88668093847226e-05, + "loss": 1.7455, + "step": 17008 + }, + { + "epoch": 5.2206875383670965, + "grad_norm": 0.24508307874202728, + "learning_rate": 4.886184010719471e-05, + 
"loss": 1.7632, + "step": 17009 + }, + { + "epoch": 5.220994475138122, + "grad_norm": 0.26777148246765137, + "learning_rate": 4.8856870840914816e-05, + "loss": 1.7814, + "step": 17010 + }, + { + "epoch": 5.221301411909146, + "grad_norm": 0.22404739260673523, + "learning_rate": 4.8851901585931967e-05, + "loss": 1.7441, + "step": 17011 + }, + { + "epoch": 5.221608348680172, + "grad_norm": 0.2406606674194336, + "learning_rate": 4.884693234229531e-05, + "loss": 1.7789, + "step": 17012 + }, + { + "epoch": 5.221915285451197, + "grad_norm": 0.27320384979248047, + "learning_rate": 4.884196311005394e-05, + "loss": 1.8046, + "step": 17013 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.3393586277961731, + "learning_rate": 4.8836993889256965e-05, + "loss": 1.7155, + "step": 17014 + }, + { + "epoch": 5.222529158993248, + "grad_norm": 0.3069504499435425, + "learning_rate": 4.88320246799535e-05, + "loss": 1.6985, + "step": 17015 + }, + { + "epoch": 5.222836095764273, + "grad_norm": 0.22184616327285767, + "learning_rate": 4.8827055482192664e-05, + "loss": 1.7996, + "step": 17016 + }, + { + "epoch": 5.223143032535297, + "grad_norm": 0.2791864573955536, + "learning_rate": 4.8822086296023544e-05, + "loss": 1.7223, + "step": 17017 + }, + { + "epoch": 5.223449969306323, + "grad_norm": 0.259726345539093, + "learning_rate": 4.8817117121495245e-05, + "loss": 1.7481, + "step": 17018 + }, + { + "epoch": 5.223756906077348, + "grad_norm": 0.19968681037425995, + "learning_rate": 4.8812147958656916e-05, + "loss": 1.702, + "step": 17019 + }, + { + "epoch": 5.224063842848373, + "grad_norm": 0.20161856710910797, + "learning_rate": 4.8807178807557616e-05, + "loss": 1.6689, + "step": 17020 + }, + { + "epoch": 5.224370779619399, + "grad_norm": 0.2365240454673767, + "learning_rate": 4.880220966824649e-05, + "loss": 1.7742, + "step": 17021 + }, + { + "epoch": 5.224677716390423, + "grad_norm": 0.20116381347179413, + "learning_rate": 4.879724054077261e-05, + "loss": 1.7584, + "step": 17022 + }, 
+ { + "epoch": 5.2249846531614486, + "grad_norm": 0.22845037281513214, + "learning_rate": 4.879227142518511e-05, + "loss": 1.7794, + "step": 17023 + }, + { + "epoch": 5.225291589932474, + "grad_norm": 0.251724511384964, + "learning_rate": 4.87873023215331e-05, + "loss": 1.7722, + "step": 17024 + }, + { + "epoch": 5.225598526703499, + "grad_norm": 0.206145241856575, + "learning_rate": 4.878233322986568e-05, + "loss": 1.7452, + "step": 17025 + }, + { + "epoch": 5.225905463474525, + "grad_norm": 0.24065247178077698, + "learning_rate": 4.877736415023194e-05, + "loss": 1.8144, + "step": 17026 + }, + { + "epoch": 5.226212400245549, + "grad_norm": 0.2255484163761139, + "learning_rate": 4.877239508268103e-05, + "loss": 1.706, + "step": 17027 + }, + { + "epoch": 5.226519337016574, + "grad_norm": 0.21035850048065186, + "learning_rate": 4.8767426027262e-05, + "loss": 1.7167, + "step": 17028 + }, + { + "epoch": 5.2268262737876, + "grad_norm": 0.19618964195251465, + "learning_rate": 4.8762456984024025e-05, + "loss": 1.7063, + "step": 17029 + }, + { + "epoch": 5.227133210558625, + "grad_norm": 0.19595398008823395, + "learning_rate": 4.875748795301614e-05, + "loss": 1.7452, + "step": 17030 + }, + { + "epoch": 5.22744014732965, + "grad_norm": 0.22870996594429016, + "learning_rate": 4.8752518934287506e-05, + "loss": 1.8169, + "step": 17031 + }, + { + "epoch": 5.227747084100676, + "grad_norm": 0.24048443138599396, + "learning_rate": 4.87475499278872e-05, + "loss": 1.6988, + "step": 17032 + }, + { + "epoch": 5.2280540208717, + "grad_norm": 0.24177183210849762, + "learning_rate": 4.8742580933864356e-05, + "loss": 1.77, + "step": 17033 + }, + { + "epoch": 5.2283609576427255, + "grad_norm": 0.2023085057735443, + "learning_rate": 4.873761195226806e-05, + "loss": 1.7, + "step": 17034 + }, + { + "epoch": 5.228667894413751, + "grad_norm": 0.2614101767539978, + "learning_rate": 4.873264298314742e-05, + "loss": 1.767, + "step": 17035 + }, + { + "epoch": 5.228974831184776, + "grad_norm": 
0.19607602059841156, + "learning_rate": 4.872767402655154e-05, + "loss": 1.7391, + "step": 17036 + }, + { + "epoch": 5.2292817679558015, + "grad_norm": 0.2053994983434677, + "learning_rate": 4.872270508252953e-05, + "loss": 1.7155, + "step": 17037 + }, + { + "epoch": 5.229588704726826, + "grad_norm": 0.18256273865699768, + "learning_rate": 4.871773615113051e-05, + "loss": 1.6999, + "step": 17038 + }, + { + "epoch": 5.229895641497851, + "grad_norm": 0.21956393122673035, + "learning_rate": 4.871276723240356e-05, + "loss": 1.7946, + "step": 17039 + }, + { + "epoch": 5.230202578268877, + "grad_norm": 0.23779109120368958, + "learning_rate": 4.870779832639781e-05, + "loss": 1.8063, + "step": 17040 + }, + { + "epoch": 5.230509515039902, + "grad_norm": 0.21662941575050354, + "learning_rate": 4.8702829433162346e-05, + "loss": 1.7276, + "step": 17041 + }, + { + "epoch": 5.230816451810927, + "grad_norm": 0.21578755974769592, + "learning_rate": 4.869786055274628e-05, + "loss": 1.7577, + "step": 17042 + }, + { + "epoch": 5.231123388581952, + "grad_norm": 0.23229347169399261, + "learning_rate": 4.8692891685198715e-05, + "loss": 1.7884, + "step": 17043 + }, + { + "epoch": 5.231430325352977, + "grad_norm": 0.2302366942167282, + "learning_rate": 4.868792283056878e-05, + "loss": 1.7823, + "step": 17044 + }, + { + "epoch": 5.231737262124002, + "grad_norm": 0.2181033343076706, + "learning_rate": 4.868295398890554e-05, + "loss": 1.7027, + "step": 17045 + }, + { + "epoch": 5.232044198895028, + "grad_norm": 0.20863409340381622, + "learning_rate": 4.8677985160258135e-05, + "loss": 1.7247, + "step": 17046 + }, + { + "epoch": 5.232351135666053, + "grad_norm": 0.2242976278066635, + "learning_rate": 4.867301634467564e-05, + "loss": 1.7799, + "step": 17047 + }, + { + "epoch": 5.232658072437078, + "grad_norm": 0.19934964179992676, + "learning_rate": 4.866804754220719e-05, + "loss": 1.6973, + "step": 17048 + }, + { + "epoch": 5.232965009208103, + "grad_norm": 0.22056198120117188, + 
"learning_rate": 4.8663078752901855e-05, + "loss": 1.7677, + "step": 17049 + }, + { + "epoch": 5.233271945979128, + "grad_norm": 0.2303200513124466, + "learning_rate": 4.865810997680879e-05, + "loss": 1.7517, + "step": 17050 + }, + { + "epoch": 5.2335788827501535, + "grad_norm": 0.21193410456180573, + "learning_rate": 4.8653141213977066e-05, + "loss": 1.7478, + "step": 17051 + }, + { + "epoch": 5.233885819521179, + "grad_norm": 0.18498395383358002, + "learning_rate": 4.864817246445577e-05, + "loss": 1.6891, + "step": 17052 + }, + { + "epoch": 5.234192756292204, + "grad_norm": 0.22879233956336975, + "learning_rate": 4.8643203728294036e-05, + "loss": 1.7166, + "step": 17053 + }, + { + "epoch": 5.234499693063229, + "grad_norm": 0.2128525823354721, + "learning_rate": 4.8638235005540944e-05, + "loss": 1.7993, + "step": 17054 + }, + { + "epoch": 5.234806629834254, + "grad_norm": 0.21245025098323822, + "learning_rate": 4.8633266296245634e-05, + "loss": 1.7436, + "step": 17055 + }, + { + "epoch": 5.235113566605279, + "grad_norm": 0.20301629602909088, + "learning_rate": 4.8628297600457165e-05, + "loss": 1.7774, + "step": 17056 + }, + { + "epoch": 5.235420503376305, + "grad_norm": 0.23251961171627045, + "learning_rate": 4.8623328918224687e-05, + "loss": 1.7897, + "step": 17057 + }, + { + "epoch": 5.23572744014733, + "grad_norm": 0.2272956669330597, + "learning_rate": 4.861836024959726e-05, + "loss": 1.7668, + "step": 17058 + }, + { + "epoch": 5.236034376918354, + "grad_norm": 0.20540569722652435, + "learning_rate": 4.8613391594624013e-05, + "loss": 1.7549, + "step": 17059 + }, + { + "epoch": 5.23634131368938, + "grad_norm": 0.20306967198848724, + "learning_rate": 4.8608422953354034e-05, + "loss": 1.6993, + "step": 17060 + }, + { + "epoch": 5.236648250460405, + "grad_norm": 0.19415293633937836, + "learning_rate": 4.8603454325836455e-05, + "loss": 1.7313, + "step": 17061 + }, + { + "epoch": 5.23695518723143, + "grad_norm": 0.2058337777853012, + "learning_rate": 
4.859848571212034e-05, + "loss": 1.7994, + "step": 17062 + }, + { + "epoch": 5.237262124002456, + "grad_norm": 0.24489709734916687, + "learning_rate": 4.859351711225483e-05, + "loss": 1.7555, + "step": 17063 + }, + { + "epoch": 5.237569060773481, + "grad_norm": 0.22589795291423798, + "learning_rate": 4.858854852628899e-05, + "loss": 1.7136, + "step": 17064 + }, + { + "epoch": 5.2378759975445055, + "grad_norm": 0.21404492855072021, + "learning_rate": 4.858357995427195e-05, + "loss": 1.7598, + "step": 17065 + }, + { + "epoch": 5.238182934315531, + "grad_norm": 0.24936965107917786, + "learning_rate": 4.8578611396252786e-05, + "loss": 1.8027, + "step": 17066 + }, + { + "epoch": 5.238489871086556, + "grad_norm": 0.23391515016555786, + "learning_rate": 4.857364285228065e-05, + "loss": 1.7704, + "step": 17067 + }, + { + "epoch": 5.2387968078575815, + "grad_norm": 0.22633357346057892, + "learning_rate": 4.85686743224046e-05, + "loss": 1.7075, + "step": 17068 + }, + { + "epoch": 5.239103744628607, + "grad_norm": 0.221492201089859, + "learning_rate": 4.8563705806673736e-05, + "loss": 1.7755, + "step": 17069 + }, + { + "epoch": 5.239410681399631, + "grad_norm": 0.2381046712398529, + "learning_rate": 4.855873730513719e-05, + "loss": 1.7971, + "step": 17070 + }, + { + "epoch": 5.239717618170657, + "grad_norm": 0.21930988132953644, + "learning_rate": 4.855376881784402e-05, + "loss": 1.7295, + "step": 17071 + }, + { + "epoch": 5.240024554941682, + "grad_norm": 0.20897921919822693, + "learning_rate": 4.854880034484339e-05, + "loss": 1.7796, + "step": 17072 + }, + { + "epoch": 5.240331491712707, + "grad_norm": 0.26616254448890686, + "learning_rate": 4.8543831886184334e-05, + "loss": 1.7095, + "step": 17073 + }, + { + "epoch": 5.240638428483733, + "grad_norm": 0.19513870775699615, + "learning_rate": 4.853886344191601e-05, + "loss": 1.7181, + "step": 17074 + }, + { + "epoch": 5.240945365254758, + "grad_norm": 0.23476530611515045, + "learning_rate": 4.853389501208747e-05, + "loss": 
1.7928, + "step": 17075 + }, + { + "epoch": 5.241252302025782, + "grad_norm": 0.18197014927864075, + "learning_rate": 4.852892659674785e-05, + "loss": 1.6888, + "step": 17076 + }, + { + "epoch": 5.241559238796808, + "grad_norm": 0.20317208766937256, + "learning_rate": 4.852395819594623e-05, + "loss": 1.7828, + "step": 17077 + }, + { + "epoch": 5.241866175567833, + "grad_norm": 0.1953772008419037, + "learning_rate": 4.851898980973175e-05, + "loss": 1.7394, + "step": 17078 + }, + { + "epoch": 5.242173112338858, + "grad_norm": 0.19714407622814178, + "learning_rate": 4.851402143815345e-05, + "loss": 1.7261, + "step": 17079 + }, + { + "epoch": 5.242480049109884, + "grad_norm": 0.2196008861064911, + "learning_rate": 4.850905308126048e-05, + "loss": 1.7387, + "step": 17080 + }, + { + "epoch": 5.242786985880908, + "grad_norm": 0.2337818443775177, + "learning_rate": 4.85040847391019e-05, + "loss": 1.7448, + "step": 17081 + }, + { + "epoch": 5.2430939226519335, + "grad_norm": 0.20940040051937103, + "learning_rate": 4.849911641172685e-05, + "loss": 1.7354, + "step": 17082 + }, + { + "epoch": 5.243400859422959, + "grad_norm": 0.2242170125246048, + "learning_rate": 4.849414809918439e-05, + "loss": 1.7325, + "step": 17083 + }, + { + "epoch": 5.243707796193984, + "grad_norm": 0.2322687953710556, + "learning_rate": 4.8489179801523675e-05, + "loss": 1.7557, + "step": 17084 + }, + { + "epoch": 5.2440147329650095, + "grad_norm": 0.20303767919540405, + "learning_rate": 4.8484211518793764e-05, + "loss": 1.7063, + "step": 17085 + }, + { + "epoch": 5.244321669736034, + "grad_norm": 0.2446853369474411, + "learning_rate": 4.8479243251043746e-05, + "loss": 1.7587, + "step": 17086 + }, + { + "epoch": 5.244628606507059, + "grad_norm": 0.22901636362075806, + "learning_rate": 4.8474274998322735e-05, + "loss": 1.7992, + "step": 17087 + }, + { + "epoch": 5.244935543278085, + "grad_norm": 0.29676303267478943, + "learning_rate": 4.846930676067984e-05, + "loss": 1.7688, + "step": 17088 + }, + { + 
"epoch": 5.24524248004911, + "grad_norm": 0.24160240590572357, + "learning_rate": 4.846433853816416e-05, + "loss": 1.7367, + "step": 17089 + }, + { + "epoch": 5.245549416820135, + "grad_norm": 0.2097402662038803, + "learning_rate": 4.8459370330824774e-05, + "loss": 1.721, + "step": 17090 + }, + { + "epoch": 5.245856353591161, + "grad_norm": 0.26451143622398376, + "learning_rate": 4.8454402138710814e-05, + "loss": 1.7707, + "step": 17091 + }, + { + "epoch": 5.246163290362185, + "grad_norm": 0.30428358912467957, + "learning_rate": 4.844943396187133e-05, + "loss": 1.7232, + "step": 17092 + }, + { + "epoch": 5.24647022713321, + "grad_norm": 0.24332918226718903, + "learning_rate": 4.8444465800355466e-05, + "loss": 1.8215, + "step": 17093 + }, + { + "epoch": 5.246777163904236, + "grad_norm": 0.292703777551651, + "learning_rate": 4.843949765421229e-05, + "loss": 1.7199, + "step": 17094 + }, + { + "epoch": 5.247084100675261, + "grad_norm": 0.2458789199590683, + "learning_rate": 4.843452952349094e-05, + "loss": 1.7615, + "step": 17095 + }, + { + "epoch": 5.247391037446286, + "grad_norm": 0.22538037598133087, + "learning_rate": 4.842956140824045e-05, + "loss": 1.7279, + "step": 17096 + }, + { + "epoch": 5.247697974217311, + "grad_norm": 0.2959176003932953, + "learning_rate": 4.842459330850999e-05, + "loss": 1.767, + "step": 17097 + }, + { + "epoch": 5.248004910988336, + "grad_norm": 0.26158571243286133, + "learning_rate": 4.84196252243486e-05, + "loss": 1.7387, + "step": 17098 + }, + { + "epoch": 5.2483118477593615, + "grad_norm": 0.22855687141418457, + "learning_rate": 4.84146571558054e-05, + "loss": 1.7497, + "step": 17099 + }, + { + "epoch": 5.248618784530387, + "grad_norm": 0.22470593452453613, + "learning_rate": 4.840968910292949e-05, + "loss": 1.7705, + "step": 17100 + }, + { + "epoch": 5.248925721301412, + "grad_norm": 0.24680538475513458, + "learning_rate": 4.840472106576998e-05, + "loss": 1.7426, + "step": 17101 + }, + { + "epoch": 5.249232658072437, + "grad_norm": 
0.23919185996055603, + "learning_rate": 4.839975304437594e-05, + "loss": 1.78, + "step": 17102 + }, + { + "epoch": 5.249539594843462, + "grad_norm": 0.24717366695404053, + "learning_rate": 4.839478503879647e-05, + "loss": 1.7373, + "step": 17103 + }, + { + "epoch": 5.249846531614487, + "grad_norm": 0.20463785529136658, + "learning_rate": 4.838981704908068e-05, + "loss": 1.702, + "step": 17104 + }, + { + "epoch": 5.250153468385513, + "grad_norm": 0.19791419804096222, + "learning_rate": 4.838484907527766e-05, + "loss": 1.746, + "step": 17105 + }, + { + "epoch": 5.250460405156538, + "grad_norm": 0.26169353723526, + "learning_rate": 4.837988111743652e-05, + "loss": 1.7227, + "step": 17106 + }, + { + "epoch": 5.250767341927563, + "grad_norm": 0.23545648157596588, + "learning_rate": 4.837491317560633e-05, + "loss": 1.7104, + "step": 17107 + }, + { + "epoch": 5.251074278698588, + "grad_norm": 0.21569804847240448, + "learning_rate": 4.836994524983622e-05, + "loss": 1.7883, + "step": 17108 + }, + { + "epoch": 5.251381215469613, + "grad_norm": 0.2730300724506378, + "learning_rate": 4.836497734017524e-05, + "loss": 1.7105, + "step": 17109 + }, + { + "epoch": 5.2516881522406385, + "grad_norm": 0.2834697663784027, + "learning_rate": 4.836000944667253e-05, + "loss": 1.8041, + "step": 17110 + }, + { + "epoch": 5.251995089011664, + "grad_norm": 0.31536951661109924, + "learning_rate": 4.835504156937715e-05, + "loss": 1.7708, + "step": 17111 + }, + { + "epoch": 5.252302025782689, + "grad_norm": 0.3830285668373108, + "learning_rate": 4.835007370833824e-05, + "loss": 1.7464, + "step": 17112 + }, + { + "epoch": 5.252608962553714, + "grad_norm": 0.23248349130153656, + "learning_rate": 4.834510586360485e-05, + "loss": 1.7274, + "step": 17113 + }, + { + "epoch": 5.252915899324739, + "grad_norm": 0.4755091071128845, + "learning_rate": 4.834013803522611e-05, + "loss": 1.7853, + "step": 17114 + }, + { + "epoch": 5.253222836095764, + "grad_norm": 0.4267823398113251, + "learning_rate": 
4.8335170223251073e-05, + "loss": 1.7424, + "step": 17115 + }, + { + "epoch": 5.25352977286679, + "grad_norm": 0.17621731758117676, + "learning_rate": 4.8330202427728876e-05, + "loss": 1.7415, + "step": 17116 + }, + { + "epoch": 5.253836709637815, + "grad_norm": 0.37484630942344666, + "learning_rate": 4.832523464870859e-05, + "loss": 1.7357, + "step": 17117 + }, + { + "epoch": 5.25414364640884, + "grad_norm": 0.27773791551589966, + "learning_rate": 4.832026688623933e-05, + "loss": 1.717, + "step": 17118 + }, + { + "epoch": 5.254450583179865, + "grad_norm": 0.31190845370292664, + "learning_rate": 4.8315299140370183e-05, + "loss": 1.7226, + "step": 17119 + }, + { + "epoch": 5.25475751995089, + "grad_norm": 0.4321303367614746, + "learning_rate": 4.8310331411150215e-05, + "loss": 1.8003, + "step": 17120 + }, + { + "epoch": 5.255064456721915, + "grad_norm": 0.31622835993766785, + "learning_rate": 4.830536369862855e-05, + "loss": 1.8462, + "step": 17121 + }, + { + "epoch": 5.255371393492941, + "grad_norm": 0.2144850194454193, + "learning_rate": 4.830039600285427e-05, + "loss": 1.8153, + "step": 17122 + }, + { + "epoch": 5.255678330263966, + "grad_norm": 0.3107511103153229, + "learning_rate": 4.829542832387649e-05, + "loss": 1.7271, + "step": 17123 + }, + { + "epoch": 5.2559852670349905, + "grad_norm": 0.24607159197330475, + "learning_rate": 4.8290460661744265e-05, + "loss": 1.7946, + "step": 17124 + }, + { + "epoch": 5.256292203806016, + "grad_norm": 0.226362943649292, + "learning_rate": 4.828549301650673e-05, + "loss": 1.7338, + "step": 17125 + }, + { + "epoch": 5.256599140577041, + "grad_norm": 0.29993724822998047, + "learning_rate": 4.828052538821294e-05, + "loss": 1.8, + "step": 17126 + }, + { + "epoch": 5.2569060773480665, + "grad_norm": 0.25639984011650085, + "learning_rate": 4.8275557776912014e-05, + "loss": 1.8009, + "step": 17127 + }, + { + "epoch": 5.257213014119092, + "grad_norm": 0.2308105081319809, + "learning_rate": 4.8270590182653024e-05, + "loss": 1.7468, 
+ "step": 17128 + }, + { + "epoch": 5.257519950890116, + "grad_norm": 0.27337542176246643, + "learning_rate": 4.82656226054851e-05, + "loss": 1.7725, + "step": 17129 + }, + { + "epoch": 5.257826887661142, + "grad_norm": 0.24848094582557678, + "learning_rate": 4.826065504545729e-05, + "loss": 1.8084, + "step": 17130 + }, + { + "epoch": 5.258133824432167, + "grad_norm": 0.35026392340660095, + "learning_rate": 4.825568750261872e-05, + "loss": 1.7705, + "step": 17131 + }, + { + "epoch": 5.258440761203192, + "grad_norm": 0.3207968473434448, + "learning_rate": 4.825071997701846e-05, + "loss": 1.7329, + "step": 17132 + }, + { + "epoch": 5.258747697974218, + "grad_norm": 0.20949263870716095, + "learning_rate": 4.8245752468705614e-05, + "loss": 1.7658, + "step": 17133 + }, + { + "epoch": 5.259054634745242, + "grad_norm": 0.3158881366252899, + "learning_rate": 4.824078497772926e-05, + "loss": 1.7249, + "step": 17134 + }, + { + "epoch": 5.259361571516267, + "grad_norm": 0.2283414602279663, + "learning_rate": 4.823581750413852e-05, + "loss": 1.7177, + "step": 17135 + }, + { + "epoch": 5.259668508287293, + "grad_norm": 0.24753578007221222, + "learning_rate": 4.823085004798247e-05, + "loss": 1.7232, + "step": 17136 + }, + { + "epoch": 5.259975445058318, + "grad_norm": 0.20381587743759155, + "learning_rate": 4.822588260931017e-05, + "loss": 1.7049, + "step": 17137 + }, + { + "epoch": 5.260282381829343, + "grad_norm": 0.21220643818378448, + "learning_rate": 4.8220915188170746e-05, + "loss": 1.7221, + "step": 17138 + }, + { + "epoch": 5.260589318600369, + "grad_norm": 0.19324758648872375, + "learning_rate": 4.8215947784613276e-05, + "loss": 1.7168, + "step": 17139 + }, + { + "epoch": 5.260896255371393, + "grad_norm": 0.26500338315963745, + "learning_rate": 4.821098039868688e-05, + "loss": 1.7627, + "step": 17140 + }, + { + "epoch": 5.2612031921424185, + "grad_norm": 0.19597655534744263, + "learning_rate": 4.82060130304406e-05, + "loss": 1.7214, + "step": 17141 + }, + { + "epoch": 
5.261510128913444, + "grad_norm": 0.2105483114719391, + "learning_rate": 4.820104567992357e-05, + "loss": 1.6742, + "step": 17142 + }, + { + "epoch": 5.261817065684469, + "grad_norm": 0.20020028948783875, + "learning_rate": 4.8196078347184837e-05, + "loss": 1.7721, + "step": 17143 + }, + { + "epoch": 5.2621240024554945, + "grad_norm": 0.2313549965620041, + "learning_rate": 4.819111103227353e-05, + "loss": 1.7644, + "step": 17144 + }, + { + "epoch": 5.262430939226519, + "grad_norm": 0.31893789768218994, + "learning_rate": 4.818614373523871e-05, + "loss": 1.747, + "step": 17145 + }, + { + "epoch": 5.262737875997544, + "grad_norm": 0.2531197667121887, + "learning_rate": 4.8181176456129505e-05, + "loss": 1.7713, + "step": 17146 + }, + { + "epoch": 5.26304481276857, + "grad_norm": 0.2063976377248764, + "learning_rate": 4.817620919499496e-05, + "loss": 1.7254, + "step": 17147 + }, + { + "epoch": 5.263351749539595, + "grad_norm": 0.22220590710639954, + "learning_rate": 4.8171241951884204e-05, + "loss": 1.7345, + "step": 17148 + }, + { + "epoch": 5.26365868631062, + "grad_norm": 0.24240384995937347, + "learning_rate": 4.8166274726846286e-05, + "loss": 1.7302, + "step": 17149 + }, + { + "epoch": 5.263965623081646, + "grad_norm": 0.215829998254776, + "learning_rate": 4.8161307519930326e-05, + "loss": 1.7725, + "step": 17150 + }, + { + "epoch": 5.26427255985267, + "grad_norm": 0.2697906494140625, + "learning_rate": 4.815634033118541e-05, + "loss": 1.7156, + "step": 17151 + }, + { + "epoch": 5.264579496623695, + "grad_norm": 0.21649456024169922, + "learning_rate": 4.815137316066061e-05, + "loss": 1.745, + "step": 17152 + }, + { + "epoch": 5.264886433394721, + "grad_norm": 0.22773787379264832, + "learning_rate": 4.8146406008405033e-05, + "loss": 1.7592, + "step": 17153 + }, + { + "epoch": 5.265193370165746, + "grad_norm": 0.2920280396938324, + "learning_rate": 4.8141438874467745e-05, + "loss": 1.8301, + "step": 17154 + }, + { + "epoch": 5.265500306936771, + "grad_norm": 
0.23919162154197693, + "learning_rate": 4.813647175889785e-05, + "loss": 1.7687, + "step": 17155 + }, + { + "epoch": 5.265807243707796, + "grad_norm": 0.24617896974086761, + "learning_rate": 4.8131504661744425e-05, + "loss": 1.8279, + "step": 17156 + }, + { + "epoch": 5.266114180478821, + "grad_norm": 0.22756172716617584, + "learning_rate": 4.812653758305659e-05, + "loss": 1.7595, + "step": 17157 + }, + { + "epoch": 5.2664211172498465, + "grad_norm": 0.22939376533031464, + "learning_rate": 4.812157052288339e-05, + "loss": 1.7445, + "step": 17158 + }, + { + "epoch": 5.266728054020872, + "grad_norm": 0.21021319925785065, + "learning_rate": 4.811660348127395e-05, + "loss": 1.7875, + "step": 17159 + }, + { + "epoch": 5.267034990791897, + "grad_norm": 0.2271810919046402, + "learning_rate": 4.811163645827732e-05, + "loss": 1.74, + "step": 17160 + }, + { + "epoch": 5.267341927562922, + "grad_norm": 0.238374263048172, + "learning_rate": 4.81066694539426e-05, + "loss": 1.7717, + "step": 17161 + }, + { + "epoch": 5.267648864333947, + "grad_norm": 0.20655091106891632, + "learning_rate": 4.8101702468318885e-05, + "loss": 1.7447, + "step": 17162 + }, + { + "epoch": 5.267955801104972, + "grad_norm": 0.24652259051799774, + "learning_rate": 4.809673550145528e-05, + "loss": 1.7755, + "step": 17163 + }, + { + "epoch": 5.268262737875998, + "grad_norm": 0.20256781578063965, + "learning_rate": 4.809176855340083e-05, + "loss": 1.7689, + "step": 17164 + }, + { + "epoch": 5.268569674647023, + "grad_norm": 0.27023112773895264, + "learning_rate": 4.8086801624204665e-05, + "loss": 1.8364, + "step": 17165 + }, + { + "epoch": 5.268876611418047, + "grad_norm": 0.251638799905777, + "learning_rate": 4.808183471391582e-05, + "loss": 1.7924, + "step": 17166 + }, + { + "epoch": 5.269183548189073, + "grad_norm": 0.22897782921791077, + "learning_rate": 4.807686782258342e-05, + "loss": 1.7378, + "step": 17167 + }, + { + "epoch": 5.269490484960098, + "grad_norm": 0.19141456484794617, + "learning_rate": 
4.807190095025655e-05, + "loss": 1.6911, + "step": 17168 + }, + { + "epoch": 5.269797421731123, + "grad_norm": 0.19960568845272064, + "learning_rate": 4.806693409698427e-05, + "loss": 1.71, + "step": 17169 + }, + { + "epoch": 5.270104358502149, + "grad_norm": 0.23332087695598602, + "learning_rate": 4.8061967262815694e-05, + "loss": 1.7993, + "step": 17170 + }, + { + "epoch": 5.270411295273174, + "grad_norm": 0.24831432104110718, + "learning_rate": 4.8057000447799876e-05, + "loss": 1.7459, + "step": 17171 + }, + { + "epoch": 5.2707182320441985, + "grad_norm": 0.24735838174819946, + "learning_rate": 4.805203365198593e-05, + "loss": 1.7751, + "step": 17172 + }, + { + "epoch": 5.271025168815224, + "grad_norm": 0.32630103826522827, + "learning_rate": 4.804706687542291e-05, + "loss": 1.7885, + "step": 17173 + }, + { + "epoch": 5.271332105586249, + "grad_norm": 0.29055842757225037, + "learning_rate": 4.804210011815995e-05, + "loss": 1.6819, + "step": 17174 + }, + { + "epoch": 5.2716390423572745, + "grad_norm": 0.22968806326389313, + "learning_rate": 4.803713338024608e-05, + "loss": 1.8146, + "step": 17175 + }, + { + "epoch": 5.2719459791283, + "grad_norm": 0.23430144786834717, + "learning_rate": 4.8032166661730434e-05, + "loss": 1.7401, + "step": 17176 + }, + { + "epoch": 5.272252915899324, + "grad_norm": 0.26312723755836487, + "learning_rate": 4.802719996266204e-05, + "loss": 1.8319, + "step": 17177 + }, + { + "epoch": 5.27255985267035, + "grad_norm": 0.23715369403362274, + "learning_rate": 4.802223328309003e-05, + "loss": 1.8014, + "step": 17178 + }, + { + "epoch": 5.272866789441375, + "grad_norm": 0.23943877220153809, + "learning_rate": 4.801726662306347e-05, + "loss": 1.7181, + "step": 17179 + }, + { + "epoch": 5.2731737262124, + "grad_norm": 0.2366543412208557, + "learning_rate": 4.8012299982631435e-05, + "loss": 1.6685, + "step": 17180 + }, + { + "epoch": 5.273480662983426, + "grad_norm": 0.20688587427139282, + "learning_rate": 4.8007333361843016e-05, + "loss": 
1.7089, + "step": 17181 + }, + { + "epoch": 5.273787599754451, + "grad_norm": 0.2069951444864273, + "learning_rate": 4.8002366760747314e-05, + "loss": 1.7447, + "step": 17182 + }, + { + "epoch": 5.274094536525475, + "grad_norm": 0.26072344183921814, + "learning_rate": 4.7997400179393374e-05, + "loss": 1.7346, + "step": 17183 + }, + { + "epoch": 5.274401473296501, + "grad_norm": 0.2397938072681427, + "learning_rate": 4.799243361783031e-05, + "loss": 1.7556, + "step": 17184 + }, + { + "epoch": 5.274708410067526, + "grad_norm": 0.23606348037719727, + "learning_rate": 4.798746707610721e-05, + "loss": 1.732, + "step": 17185 + }, + { + "epoch": 5.2750153468385514, + "grad_norm": 0.21078252792358398, + "learning_rate": 4.798250055427311e-05, + "loss": 1.7571, + "step": 17186 + }, + { + "epoch": 5.275322283609577, + "grad_norm": 0.21331414580345154, + "learning_rate": 4.797753405237714e-05, + "loss": 1.732, + "step": 17187 + }, + { + "epoch": 5.275629220380601, + "grad_norm": 0.23700307309627533, + "learning_rate": 4.7972567570468354e-05, + "loss": 1.7354, + "step": 17188 + }, + { + "epoch": 5.275936157151627, + "grad_norm": 0.20519722998142242, + "learning_rate": 4.7967601108595845e-05, + "loss": 1.7435, + "step": 17189 + }, + { + "epoch": 5.276243093922652, + "grad_norm": 0.22358302772045135, + "learning_rate": 4.79626346668087e-05, + "loss": 1.7891, + "step": 17190 + }, + { + "epoch": 5.276550030693677, + "grad_norm": 0.2434413880109787, + "learning_rate": 4.795766824515598e-05, + "loss": 1.814, + "step": 17191 + }, + { + "epoch": 5.276856967464703, + "grad_norm": 0.2198423594236374, + "learning_rate": 4.795270184368678e-05, + "loss": 1.7212, + "step": 17192 + }, + { + "epoch": 5.277163904235728, + "grad_norm": 0.23587806522846222, + "learning_rate": 4.7947735462450205e-05, + "loss": 1.8337, + "step": 17193 + }, + { + "epoch": 5.277470841006752, + "grad_norm": 0.234666645526886, + "learning_rate": 4.794276910149528e-05, + "loss": 1.7548, + "step": 17194 + }, + { + 
"epoch": 5.277777777777778, + "grad_norm": 0.23363247513771057, + "learning_rate": 4.793780276087115e-05, + "loss": 1.7587, + "step": 17195 + }, + { + "epoch": 5.278084714548803, + "grad_norm": 0.23191119730472565, + "learning_rate": 4.793283644062683e-05, + "loss": 1.7691, + "step": 17196 + }, + { + "epoch": 5.278391651319828, + "grad_norm": 0.2363097071647644, + "learning_rate": 4.7927870140811445e-05, + "loss": 1.8139, + "step": 17197 + }, + { + "epoch": 5.278698588090854, + "grad_norm": 0.2852413058280945, + "learning_rate": 4.7922903861474056e-05, + "loss": 1.7905, + "step": 17198 + }, + { + "epoch": 5.279005524861878, + "grad_norm": 0.23633842170238495, + "learning_rate": 4.7917937602663764e-05, + "loss": 1.8014, + "step": 17199 + }, + { + "epoch": 5.2793124616329035, + "grad_norm": 0.27007919549942017, + "learning_rate": 4.791297136442961e-05, + "loss": 1.7242, + "step": 17200 + }, + { + "epoch": 5.279619398403929, + "grad_norm": 0.29482147097587585, + "learning_rate": 4.790800514682072e-05, + "loss": 1.7154, + "step": 17201 + }, + { + "epoch": 5.279926335174954, + "grad_norm": 0.27772340178489685, + "learning_rate": 4.790303894988614e-05, + "loss": 1.7771, + "step": 17202 + }, + { + "epoch": 5.2802332719459795, + "grad_norm": 0.21761848032474518, + "learning_rate": 4.789807277367495e-05, + "loss": 1.6983, + "step": 17203 + }, + { + "epoch": 5.280540208717004, + "grad_norm": 0.22621290385723114, + "learning_rate": 4.789310661823626e-05, + "loss": 1.7667, + "step": 17204 + }, + { + "epoch": 5.280847145488029, + "grad_norm": 0.2284683883190155, + "learning_rate": 4.7888140483619095e-05, + "loss": 1.7419, + "step": 17205 + }, + { + "epoch": 5.281154082259055, + "grad_norm": 0.20145639777183533, + "learning_rate": 4.788317436987259e-05, + "loss": 1.7068, + "step": 17206 + }, + { + "epoch": 5.28146101903008, + "grad_norm": 0.23146072030067444, + "learning_rate": 4.7878208277045775e-05, + "loss": 1.7195, + "step": 17207 + }, + { + "epoch": 5.281767955801105, + 
"grad_norm": 0.24014149606227875, + "learning_rate": 4.787324220518776e-05, + "loss": 1.8148, + "step": 17208 + }, + { + "epoch": 5.28207489257213, + "grad_norm": 0.21067874133586884, + "learning_rate": 4.7868276154347595e-05, + "loss": 1.7754, + "step": 17209 + }, + { + "epoch": 5.282381829343155, + "grad_norm": 0.2313496321439743, + "learning_rate": 4.786331012457441e-05, + "loss": 1.7693, + "step": 17210 + }, + { + "epoch": 5.28268876611418, + "grad_norm": 0.24190983176231384, + "learning_rate": 4.7858344115917214e-05, + "loss": 1.7342, + "step": 17211 + }, + { + "epoch": 5.282995702885206, + "grad_norm": 0.24541905522346497, + "learning_rate": 4.785337812842514e-05, + "loss": 1.7721, + "step": 17212 + }, + { + "epoch": 5.283302639656231, + "grad_norm": 0.21989032626152039, + "learning_rate": 4.784841216214722e-05, + "loss": 1.7522, + "step": 17213 + }, + { + "epoch": 5.283609576427256, + "grad_norm": 0.20637241005897522, + "learning_rate": 4.784344621713256e-05, + "loss": 1.7418, + "step": 17214 + }, + { + "epoch": 5.283916513198281, + "grad_norm": 0.22538220882415771, + "learning_rate": 4.783848029343023e-05, + "loss": 1.8287, + "step": 17215 + }, + { + "epoch": 5.284223449969306, + "grad_norm": 0.24478071928024292, + "learning_rate": 4.7833514391089315e-05, + "loss": 1.7419, + "step": 17216 + }, + { + "epoch": 5.2845303867403315, + "grad_norm": 0.22707650065422058, + "learning_rate": 4.782854851015886e-05, + "loss": 1.7831, + "step": 17217 + }, + { + "epoch": 5.284837323511357, + "grad_norm": 0.2843529284000397, + "learning_rate": 4.7823582650687984e-05, + "loss": 1.7704, + "step": 17218 + }, + { + "epoch": 5.285144260282382, + "grad_norm": 0.21647678315639496, + "learning_rate": 4.781861681272573e-05, + "loss": 1.7514, + "step": 17219 + }, + { + "epoch": 5.285451197053407, + "grad_norm": 0.2279205620288849, + "learning_rate": 4.781365099632117e-05, + "loss": 1.6803, + "step": 17220 + }, + { + "epoch": 5.285758133824432, + "grad_norm": 0.2287401556968689, + 
"learning_rate": 4.7808685201523417e-05, + "loss": 1.7278, + "step": 17221 + }, + { + "epoch": 5.286065070595457, + "grad_norm": 0.2103174477815628, + "learning_rate": 4.78037194283815e-05, + "loss": 1.7667, + "step": 17222 + }, + { + "epoch": 5.286372007366483, + "grad_norm": 0.24339279532432556, + "learning_rate": 4.7798753676944536e-05, + "loss": 1.7828, + "step": 17223 + }, + { + "epoch": 5.286678944137508, + "grad_norm": 0.2343035340309143, + "learning_rate": 4.779378794726156e-05, + "loss": 1.7277, + "step": 17224 + }, + { + "epoch": 5.286985880908533, + "grad_norm": 0.22456331551074982, + "learning_rate": 4.778882223938167e-05, + "loss": 1.756, + "step": 17225 + }, + { + "epoch": 5.287292817679558, + "grad_norm": 0.2211158126592636, + "learning_rate": 4.778385655335392e-05, + "loss": 1.7733, + "step": 17226 + }, + { + "epoch": 5.287599754450583, + "grad_norm": 0.2731948792934418, + "learning_rate": 4.777889088922743e-05, + "loss": 1.787, + "step": 17227 + }, + { + "epoch": 5.287906691221608, + "grad_norm": 0.19578024744987488, + "learning_rate": 4.7773925247051215e-05, + "loss": 1.7474, + "step": 17228 + }, + { + "epoch": 5.288213627992634, + "grad_norm": 0.277332067489624, + "learning_rate": 4.77689596268744e-05, + "loss": 1.7432, + "step": 17229 + }, + { + "epoch": 5.288520564763659, + "grad_norm": 0.2979765832424164, + "learning_rate": 4.7763994028746003e-05, + "loss": 1.8198, + "step": 17230 + }, + { + "epoch": 5.2888275015346835, + "grad_norm": 0.23176288604736328, + "learning_rate": 4.775902845271515e-05, + "loss": 1.7317, + "step": 17231 + }, + { + "epoch": 5.289134438305709, + "grad_norm": 0.35821911692619324, + "learning_rate": 4.7754062898830876e-05, + "loss": 1.7287, + "step": 17232 + }, + { + "epoch": 5.289441375076734, + "grad_norm": 0.2881525158882141, + "learning_rate": 4.7749097367142296e-05, + "loss": 1.7391, + "step": 17233 + }, + { + "epoch": 5.2897483118477595, + "grad_norm": 0.22021767497062683, + "learning_rate": 4.774413185769842e-05, 
+ "loss": 1.7462, + "step": 17234 + }, + { + "epoch": 5.290055248618785, + "grad_norm": 0.3286842703819275, + "learning_rate": 4.7739166370548385e-05, + "loss": 1.7749, + "step": 17235 + }, + { + "epoch": 5.290362185389809, + "grad_norm": 0.3298519253730774, + "learning_rate": 4.773420090574122e-05, + "loss": 1.7548, + "step": 17236 + }, + { + "epoch": 5.290669122160835, + "grad_norm": 0.20910575985908508, + "learning_rate": 4.7729235463326005e-05, + "loss": 1.7308, + "step": 17237 + }, + { + "epoch": 5.29097605893186, + "grad_norm": 0.3324633240699768, + "learning_rate": 4.7724270043351835e-05, + "loss": 1.7328, + "step": 17238 + }, + { + "epoch": 5.291282995702885, + "grad_norm": 0.21235628426074982, + "learning_rate": 4.771930464586774e-05, + "loss": 1.7186, + "step": 17239 + }, + { + "epoch": 5.291589932473911, + "grad_norm": 0.2971087694168091, + "learning_rate": 4.771433927092283e-05, + "loss": 1.7947, + "step": 17240 + }, + { + "epoch": 5.291896869244935, + "grad_norm": 0.3637695908546448, + "learning_rate": 4.770937391856614e-05, + "loss": 1.7753, + "step": 17241 + }, + { + "epoch": 5.29220380601596, + "grad_norm": 0.2503713369369507, + "learning_rate": 4.770440858884678e-05, + "loss": 1.684, + "step": 17242 + }, + { + "epoch": 5.292510742786986, + "grad_norm": 0.25510790944099426, + "learning_rate": 4.7699443281813774e-05, + "loss": 1.7517, + "step": 17243 + }, + { + "epoch": 5.292817679558011, + "grad_norm": 0.3189590871334076, + "learning_rate": 4.7694477997516244e-05, + "loss": 1.7488, + "step": 17244 + }, + { + "epoch": 5.293124616329036, + "grad_norm": 0.2807229161262512, + "learning_rate": 4.7689512736003215e-05, + "loss": 1.7962, + "step": 17245 + }, + { + "epoch": 5.293431553100062, + "grad_norm": 0.2166406810283661, + "learning_rate": 4.76845474973238e-05, + "loss": 1.7423, + "step": 17246 + }, + { + "epoch": 5.293738489871086, + "grad_norm": 0.29000815749168396, + "learning_rate": 4.767958228152702e-05, + "loss": 1.7508, + "step": 17247 + }, + { 
+ "epoch": 5.2940454266421115, + "grad_norm": 0.19301612675189972, + "learning_rate": 4.767461708866198e-05, + "loss": 1.7223, + "step": 17248 + }, + { + "epoch": 5.294352363413137, + "grad_norm": 0.2828899323940277, + "learning_rate": 4.766965191877772e-05, + "loss": 1.8139, + "step": 17249 + }, + { + "epoch": 5.294659300184162, + "grad_norm": 0.32610374689102173, + "learning_rate": 4.766468677192335e-05, + "loss": 1.7744, + "step": 17250 + }, + { + "epoch": 5.2949662369551875, + "grad_norm": 0.2175719439983368, + "learning_rate": 4.7659721648147895e-05, + "loss": 1.7345, + "step": 17251 + }, + { + "epoch": 5.295273173726212, + "grad_norm": 0.24777816236019135, + "learning_rate": 4.7654756547500457e-05, + "loss": 1.7382, + "step": 17252 + }, + { + "epoch": 5.295580110497237, + "grad_norm": 0.25927749276161194, + "learning_rate": 4.764979147003008e-05, + "loss": 1.7625, + "step": 17253 + }, + { + "epoch": 5.295887047268263, + "grad_norm": 0.2271798849105835, + "learning_rate": 4.7644826415785834e-05, + "loss": 1.6928, + "step": 17254 + }, + { + "epoch": 5.296193984039288, + "grad_norm": 0.30804958939552307, + "learning_rate": 4.763986138481682e-05, + "loss": 1.743, + "step": 17255 + }, + { + "epoch": 5.296500920810313, + "grad_norm": 0.2247130572795868, + "learning_rate": 4.763489637717205e-05, + "loss": 1.7593, + "step": 17256 + }, + { + "epoch": 5.296807857581339, + "grad_norm": 0.22203052043914795, + "learning_rate": 4.7629931392900645e-05, + "loss": 1.6923, + "step": 17257 + }, + { + "epoch": 5.297114794352363, + "grad_norm": 0.23044714331626892, + "learning_rate": 4.7624966432051624e-05, + "loss": 1.7676, + "step": 17258 + }, + { + "epoch": 5.297421731123388, + "grad_norm": 0.2824070155620575, + "learning_rate": 4.7620001494674096e-05, + "loss": 1.8272, + "step": 17259 + }, + { + "epoch": 5.297728667894414, + "grad_norm": 0.27077800035476685, + "learning_rate": 4.761503658081709e-05, + "loss": 1.8106, + "step": 17260 + }, + { + "epoch": 5.298035604665439, + 
"grad_norm": 0.2333833873271942, + "learning_rate": 4.7610071690529706e-05, + "loss": 1.6841, + "step": 17261 + }, + { + "epoch": 5.298342541436464, + "grad_norm": 0.2542032301425934, + "learning_rate": 4.760510682386098e-05, + "loss": 1.7656, + "step": 17262 + }, + { + "epoch": 5.298649478207489, + "grad_norm": 0.30680081248283386, + "learning_rate": 4.760014198086002e-05, + "loss": 1.7443, + "step": 17263 + }, + { + "epoch": 5.298956414978514, + "grad_norm": 0.21580225229263306, + "learning_rate": 4.759517716157583e-05, + "loss": 1.7907, + "step": 17264 + }, + { + "epoch": 5.2992633517495396, + "grad_norm": 0.2644323408603668, + "learning_rate": 4.7590212366057516e-05, + "loss": 1.6835, + "step": 17265 + }, + { + "epoch": 5.299570288520565, + "grad_norm": 0.23600110411643982, + "learning_rate": 4.758524759435414e-05, + "loss": 1.7481, + "step": 17266 + }, + { + "epoch": 5.29987722529159, + "grad_norm": 0.23825959861278534, + "learning_rate": 4.758028284651477e-05, + "loss": 1.7267, + "step": 17267 + }, + { + "epoch": 5.300184162062616, + "grad_norm": 0.2659476101398468, + "learning_rate": 4.757531812258845e-05, + "loss": 1.7303, + "step": 17268 + }, + { + "epoch": 5.30049109883364, + "grad_norm": 0.30770114064216614, + "learning_rate": 4.757035342262428e-05, + "loss": 1.7636, + "step": 17269 + }, + { + "epoch": 5.300798035604665, + "grad_norm": 0.27921241521835327, + "learning_rate": 4.756538874667129e-05, + "loss": 1.7736, + "step": 17270 + }, + { + "epoch": 5.301104972375691, + "grad_norm": 0.2518016993999481, + "learning_rate": 4.756042409477855e-05, + "loss": 1.7942, + "step": 17271 + }, + { + "epoch": 5.301411909146716, + "grad_norm": 0.2678029537200928, + "learning_rate": 4.755545946699514e-05, + "loss": 1.7179, + "step": 17272 + }, + { + "epoch": 5.301718845917741, + "grad_norm": 0.3082284927368164, + "learning_rate": 4.7550494863370094e-05, + "loss": 1.7282, + "step": 17273 + }, + { + "epoch": 5.302025782688766, + "grad_norm": 0.23269952833652496, + 
"learning_rate": 4.754553028395251e-05, + "loss": 1.755, + "step": 17274 + }, + { + "epoch": 5.302332719459791, + "grad_norm": 0.2273751199245453, + "learning_rate": 4.754056572879142e-05, + "loss": 1.7661, + "step": 17275 + }, + { + "epoch": 5.3026396562308165, + "grad_norm": 0.2175082415342331, + "learning_rate": 4.7535601197935915e-05, + "loss": 1.7034, + "step": 17276 + }, + { + "epoch": 5.302946593001842, + "grad_norm": 0.20551301538944244, + "learning_rate": 4.753063669143503e-05, + "loss": 1.7329, + "step": 17277 + }, + { + "epoch": 5.303253529772867, + "grad_norm": 0.2350638061761856, + "learning_rate": 4.752567220933785e-05, + "loss": 1.8361, + "step": 17278 + }, + { + "epoch": 5.303560466543892, + "grad_norm": 0.20268140733242035, + "learning_rate": 4.752070775169342e-05, + "loss": 1.6736, + "step": 17279 + }, + { + "epoch": 5.303867403314917, + "grad_norm": 0.1891544908285141, + "learning_rate": 4.7515743318550823e-05, + "loss": 1.7241, + "step": 17280 + }, + { + "epoch": 5.304174340085942, + "grad_norm": 0.22900860011577606, + "learning_rate": 4.751077890995909e-05, + "loss": 1.7321, + "step": 17281 + }, + { + "epoch": 5.304481276856968, + "grad_norm": 0.25827866792678833, + "learning_rate": 4.7505814525967304e-05, + "loss": 1.8021, + "step": 17282 + }, + { + "epoch": 5.304788213627993, + "grad_norm": 0.22459273040294647, + "learning_rate": 4.7500850166624514e-05, + "loss": 1.7845, + "step": 17283 + }, + { + "epoch": 5.305095150399017, + "grad_norm": 0.23737964034080505, + "learning_rate": 4.7495885831979816e-05, + "loss": 1.7274, + "step": 17284 + }, + { + "epoch": 5.305402087170043, + "grad_norm": 0.2267502397298813, + "learning_rate": 4.749092152208221e-05, + "loss": 1.7747, + "step": 17285 + }, + { + "epoch": 5.305709023941068, + "grad_norm": 0.31811007857322693, + "learning_rate": 4.748595723698081e-05, + "loss": 1.7852, + "step": 17286 + }, + { + "epoch": 5.306015960712093, + "grad_norm": 0.42865583300590515, + "learning_rate": 
4.7480992976724655e-05, + "loss": 1.7711, + "step": 17287 + }, + { + "epoch": 5.306322897483119, + "grad_norm": 0.3211027979850769, + "learning_rate": 4.747602874136278e-05, + "loss": 1.7813, + "step": 17288 + }, + { + "epoch": 5.306629834254144, + "grad_norm": 0.22552837431430817, + "learning_rate": 4.7471064530944295e-05, + "loss": 1.7407, + "step": 17289 + }, + { + "epoch": 5.3069367710251685, + "grad_norm": 0.3119906485080719, + "learning_rate": 4.746610034551821e-05, + "loss": 1.7255, + "step": 17290 + }, + { + "epoch": 5.307243707796194, + "grad_norm": 0.26405754685401917, + "learning_rate": 4.7461136185133623e-05, + "loss": 1.6945, + "step": 17291 + }, + { + "epoch": 5.307550644567219, + "grad_norm": 0.21759621798992157, + "learning_rate": 4.7456172049839566e-05, + "loss": 1.7319, + "step": 17292 + }, + { + "epoch": 5.3078575813382445, + "grad_norm": 0.26193925738334656, + "learning_rate": 4.745120793968511e-05, + "loss": 1.7508, + "step": 17293 + }, + { + "epoch": 5.30816451810927, + "grad_norm": 0.2549780011177063, + "learning_rate": 4.74462438547193e-05, + "loss": 1.7153, + "step": 17294 + }, + { + "epoch": 5.308471454880294, + "grad_norm": 0.21164020895957947, + "learning_rate": 4.7441279794991235e-05, + "loss": 1.7315, + "step": 17295 + }, + { + "epoch": 5.30877839165132, + "grad_norm": 0.20548345148563385, + "learning_rate": 4.7436315760549914e-05, + "loss": 1.68, + "step": 17296 + }, + { + "epoch": 5.309085328422345, + "grad_norm": 0.23997166752815247, + "learning_rate": 4.7431351751444446e-05, + "loss": 1.8528, + "step": 17297 + }, + { + "epoch": 5.30939226519337, + "grad_norm": 0.2639109194278717, + "learning_rate": 4.7426387767723845e-05, + "loss": 1.8041, + "step": 17298 + }, + { + "epoch": 5.309699201964396, + "grad_norm": 0.2285986840724945, + "learning_rate": 4.7421423809437196e-05, + "loss": 1.8188, + "step": 17299 + }, + { + "epoch": 5.310006138735421, + "grad_norm": 0.22183369100093842, + "learning_rate": 4.741645987663355e-05, + "loss": 
1.7581, + "step": 17300 + }, + { + "epoch": 5.310313075506445, + "grad_norm": 0.22716040909290314, + "learning_rate": 4.741149596936197e-05, + "loss": 1.7438, + "step": 17301 + }, + { + "epoch": 5.310620012277471, + "grad_norm": 0.24641327559947968, + "learning_rate": 4.740653208767148e-05, + "loss": 1.761, + "step": 17302 + }, + { + "epoch": 5.310926949048496, + "grad_norm": 0.28470689058303833, + "learning_rate": 4.7401568231611194e-05, + "loss": 1.7512, + "step": 17303 + }, + { + "epoch": 5.311233885819521, + "grad_norm": 0.23279942572116852, + "learning_rate": 4.739660440123012e-05, + "loss": 1.7797, + "step": 17304 + }, + { + "epoch": 5.311540822590547, + "grad_norm": 0.26397696137428284, + "learning_rate": 4.739164059657731e-05, + "loss": 1.748, + "step": 17305 + }, + { + "epoch": 5.311847759361571, + "grad_norm": 0.25072020292282104, + "learning_rate": 4.7386676817701856e-05, + "loss": 1.7571, + "step": 17306 + }, + { + "epoch": 5.3121546961325965, + "grad_norm": 0.20815810561180115, + "learning_rate": 4.7381713064652774e-05, + "loss": 1.7566, + "step": 17307 + }, + { + "epoch": 5.312461632903622, + "grad_norm": 0.23104289174079895, + "learning_rate": 4.7376749337479174e-05, + "loss": 1.7308, + "step": 17308 + }, + { + "epoch": 5.312768569674647, + "grad_norm": 0.21978867053985596, + "learning_rate": 4.737178563623004e-05, + "loss": 1.7997, + "step": 17309 + }, + { + "epoch": 5.3130755064456725, + "grad_norm": 0.34588614106178284, + "learning_rate": 4.736682196095447e-05, + "loss": 1.8414, + "step": 17310 + }, + { + "epoch": 5.313382443216697, + "grad_norm": 0.3475342094898224, + "learning_rate": 4.73618583117015e-05, + "loss": 1.7823, + "step": 17311 + }, + { + "epoch": 5.313689379987722, + "grad_norm": 0.1965305358171463, + "learning_rate": 4.7356894688520215e-05, + "loss": 1.7597, + "step": 17312 + }, + { + "epoch": 5.313996316758748, + "grad_norm": 0.3035048246383667, + "learning_rate": 4.7351931091459624e-05, + "loss": 1.6803, + "step": 17313 + }, + { + 
"epoch": 5.314303253529773, + "grad_norm": 0.27722910046577454, + "learning_rate": 4.7346967520568827e-05, + "loss": 1.7472, + "step": 17314 + }, + { + "epoch": 5.314610190300798, + "grad_norm": 0.21481415629386902, + "learning_rate": 4.734200397589682e-05, + "loss": 1.7319, + "step": 17315 + }, + { + "epoch": 5.314917127071823, + "grad_norm": 0.2570357918739319, + "learning_rate": 4.733704045749271e-05, + "loss": 1.7392, + "step": 17316 + }, + { + "epoch": 5.315224063842848, + "grad_norm": 0.2404400259256363, + "learning_rate": 4.733207696540551e-05, + "loss": 1.7231, + "step": 17317 + }, + { + "epoch": 5.315531000613873, + "grad_norm": 0.222911074757576, + "learning_rate": 4.732711349968432e-05, + "loss": 1.7584, + "step": 17318 + }, + { + "epoch": 5.315837937384899, + "grad_norm": 0.22908064723014832, + "learning_rate": 4.732215006037813e-05, + "loss": 1.7242, + "step": 17319 + }, + { + "epoch": 5.316144874155924, + "grad_norm": 0.2432398796081543, + "learning_rate": 4.7317186647536044e-05, + "loss": 1.7056, + "step": 17320 + }, + { + "epoch": 5.316451810926949, + "grad_norm": 0.1994420737028122, + "learning_rate": 4.7312223261207086e-05, + "loss": 1.6667, + "step": 17321 + }, + { + "epoch": 5.316758747697974, + "grad_norm": 0.22314350306987762, + "learning_rate": 4.73072599014403e-05, + "loss": 1.7945, + "step": 17322 + }, + { + "epoch": 5.317065684468999, + "grad_norm": 0.2309068888425827, + "learning_rate": 4.730229656828477e-05, + "loss": 1.7099, + "step": 17323 + }, + { + "epoch": 5.3173726212400245, + "grad_norm": 0.22388015687465668, + "learning_rate": 4.729733326178951e-05, + "loss": 1.7053, + "step": 17324 + }, + { + "epoch": 5.31767955801105, + "grad_norm": 0.20203040540218353, + "learning_rate": 4.72923699820036e-05, + "loss": 1.6992, + "step": 17325 + }, + { + "epoch": 5.317986494782075, + "grad_norm": 0.24416297674179077, + "learning_rate": 4.728740672897606e-05, + "loss": 1.7455, + "step": 17326 + }, + { + "epoch": 5.3182934315531, + "grad_norm": 
0.2501862049102783, + "learning_rate": 4.728244350275597e-05, + "loss": 1.7609, + "step": 17327 + }, + { + "epoch": 5.318600368324125, + "grad_norm": 0.21482665836811066, + "learning_rate": 4.727748030339235e-05, + "loss": 1.7614, + "step": 17328 + }, + { + "epoch": 5.31890730509515, + "grad_norm": 0.2241419404745102, + "learning_rate": 4.727251713093429e-05, + "loss": 1.736, + "step": 17329 + }, + { + "epoch": 5.319214241866176, + "grad_norm": 0.1757260262966156, + "learning_rate": 4.726755398543079e-05, + "loss": 1.6646, + "step": 17330 + }, + { + "epoch": 5.319521178637201, + "grad_norm": 0.18697243928909302, + "learning_rate": 4.726259086693095e-05, + "loss": 1.7512, + "step": 17331 + }, + { + "epoch": 5.319828115408226, + "grad_norm": 0.22584228217601776, + "learning_rate": 4.725762777548376e-05, + "loss": 1.7439, + "step": 17332 + }, + { + "epoch": 5.320135052179251, + "grad_norm": 0.18673470616340637, + "learning_rate": 4.725266471113832e-05, + "loss": 1.7007, + "step": 17333 + }, + { + "epoch": 5.320441988950276, + "grad_norm": 0.23030288517475128, + "learning_rate": 4.7247701673943656e-05, + "loss": 1.8021, + "step": 17334 + }, + { + "epoch": 5.320748925721301, + "grad_norm": 0.19333480298519135, + "learning_rate": 4.7242738663948813e-05, + "loss": 1.6659, + "step": 17335 + }, + { + "epoch": 5.321055862492327, + "grad_norm": 0.278097003698349, + "learning_rate": 4.723777568120284e-05, + "loss": 1.7302, + "step": 17336 + }, + { + "epoch": 5.321362799263352, + "grad_norm": 0.2146742343902588, + "learning_rate": 4.72328127257548e-05, + "loss": 1.7644, + "step": 17337 + }, + { + "epoch": 5.3216697360343765, + "grad_norm": 0.25582969188690186, + "learning_rate": 4.722784979765372e-05, + "loss": 1.7872, + "step": 17338 + }, + { + "epoch": 5.321976672805402, + "grad_norm": 0.20411577820777893, + "learning_rate": 4.722288689694864e-05, + "loss": 1.7167, + "step": 17339 + }, + { + "epoch": 5.322283609576427, + "grad_norm": 0.20894703269004822, + "learning_rate": 
4.7217924023688645e-05, + "loss": 1.7526, + "step": 17340 + }, + { + "epoch": 5.3225905463474525, + "grad_norm": 0.20197831094264984, + "learning_rate": 4.721296117792273e-05, + "loss": 1.711, + "step": 17341 + }, + { + "epoch": 5.322897483118478, + "grad_norm": 0.20490549504756927, + "learning_rate": 4.720799835969999e-05, + "loss": 1.7303, + "step": 17342 + }, + { + "epoch": 5.323204419889503, + "grad_norm": 0.20666229724884033, + "learning_rate": 4.720303556906943e-05, + "loss": 1.6738, + "step": 17343 + }, + { + "epoch": 5.323511356660528, + "grad_norm": 0.21899856626987457, + "learning_rate": 4.719807280608011e-05, + "loss": 1.7632, + "step": 17344 + }, + { + "epoch": 5.323818293431553, + "grad_norm": 0.2310410887002945, + "learning_rate": 4.719311007078108e-05, + "loss": 1.7568, + "step": 17345 + }, + { + "epoch": 5.324125230202578, + "grad_norm": 0.20057427883148193, + "learning_rate": 4.7188147363221394e-05, + "loss": 1.6716, + "step": 17346 + }, + { + "epoch": 5.324432166973604, + "grad_norm": 0.21361050009727478, + "learning_rate": 4.718318468345006e-05, + "loss": 1.7224, + "step": 17347 + }, + { + "epoch": 5.324739103744629, + "grad_norm": 0.28389376401901245, + "learning_rate": 4.7178222031516173e-05, + "loss": 1.8519, + "step": 17348 + }, + { + "epoch": 5.3250460405156534, + "grad_norm": 0.2094416618347168, + "learning_rate": 4.717325940746872e-05, + "loss": 1.7763, + "step": 17349 + }, + { + "epoch": 5.325352977286679, + "grad_norm": 0.2263312190771103, + "learning_rate": 4.716829681135681e-05, + "loss": 1.7961, + "step": 17350 + }, + { + "epoch": 5.325659914057704, + "grad_norm": 0.2685631811618805, + "learning_rate": 4.7163334243229417e-05, + "loss": 1.7763, + "step": 17351 + }, + { + "epoch": 5.3259668508287294, + "grad_norm": 0.2029418647289276, + "learning_rate": 4.7158371703135636e-05, + "loss": 1.7662, + "step": 17352 + }, + { + "epoch": 5.326273787599755, + "grad_norm": 0.3109094798564911, + "learning_rate": 4.715340919112447e-05, + "loss": 
1.7064, + "step": 17353 + }, + { + "epoch": 5.326580724370779, + "grad_norm": 0.24679912626743317, + "learning_rate": 4.714844670724502e-05, + "loss": 1.6903, + "step": 17354 + }, + { + "epoch": 5.326887661141805, + "grad_norm": 0.2004890739917755, + "learning_rate": 4.714348425154627e-05, + "loss": 1.7242, + "step": 17355 + }, + { + "epoch": 5.32719459791283, + "grad_norm": 0.27442196011543274, + "learning_rate": 4.7138521824077284e-05, + "loss": 1.826, + "step": 17356 + }, + { + "epoch": 5.327501534683855, + "grad_norm": 0.19933666288852692, + "learning_rate": 4.713355942488711e-05, + "loss": 1.748, + "step": 17357 + }, + { + "epoch": 5.327808471454881, + "grad_norm": 0.2306378185749054, + "learning_rate": 4.712859705402476e-05, + "loss": 1.7426, + "step": 17358 + }, + { + "epoch": 5.328115408225905, + "grad_norm": 0.22484014928340912, + "learning_rate": 4.7123634711539324e-05, + "loss": 1.7355, + "step": 17359 + }, + { + "epoch": 5.32842234499693, + "grad_norm": 0.2501749098300934, + "learning_rate": 4.711867239747979e-05, + "loss": 1.7502, + "step": 17360 + }, + { + "epoch": 5.328729281767956, + "grad_norm": 0.1940663903951645, + "learning_rate": 4.711371011189525e-05, + "loss": 1.7423, + "step": 17361 + }, + { + "epoch": 5.329036218538981, + "grad_norm": 0.28115448355674744, + "learning_rate": 4.71087478548347e-05, + "loss": 1.7134, + "step": 17362 + }, + { + "epoch": 5.329343155310006, + "grad_norm": 0.29717928171157837, + "learning_rate": 4.71037856263472e-05, + "loss": 1.8145, + "step": 17363 + }, + { + "epoch": 5.329650092081032, + "grad_norm": 0.24278375506401062, + "learning_rate": 4.709882342648179e-05, + "loss": 1.689, + "step": 17364 + }, + { + "epoch": 5.329957028852056, + "grad_norm": 0.26382890343666077, + "learning_rate": 4.709386125528751e-05, + "loss": 1.801, + "step": 17365 + }, + { + "epoch": 5.3302639656230815, + "grad_norm": 0.237087219953537, + "learning_rate": 4.708889911281339e-05, + "loss": 1.7019, + "step": 17366 + }, + { + "epoch": 
5.330570902394107, + "grad_norm": 0.21994253993034363, + "learning_rate": 4.7083936999108494e-05, + "loss": 1.707, + "step": 17367 + }, + { + "epoch": 5.330877839165132, + "grad_norm": 0.3028903901576996, + "learning_rate": 4.707897491422182e-05, + "loss": 1.7992, + "step": 17368 + }, + { + "epoch": 5.3311847759361575, + "grad_norm": 0.24991434812545776, + "learning_rate": 4.7074012858202435e-05, + "loss": 1.7894, + "step": 17369 + }, + { + "epoch": 5.331491712707182, + "grad_norm": 0.20631250739097595, + "learning_rate": 4.706905083109936e-05, + "loss": 1.6816, + "step": 17370 + }, + { + "epoch": 5.331798649478207, + "grad_norm": 0.23300573229789734, + "learning_rate": 4.7064088832961666e-05, + "loss": 1.7101, + "step": 17371 + }, + { + "epoch": 5.332105586249233, + "grad_norm": 0.22331316769123077, + "learning_rate": 4.705912686383837e-05, + "loss": 1.861, + "step": 17372 + }, + { + "epoch": 5.332412523020258, + "grad_norm": 0.204593226313591, + "learning_rate": 4.7054164923778485e-05, + "loss": 1.7062, + "step": 17373 + }, + { + "epoch": 5.332719459791283, + "grad_norm": 0.22207681834697723, + "learning_rate": 4.704920301283107e-05, + "loss": 1.7546, + "step": 17374 + }, + { + "epoch": 5.333026396562309, + "grad_norm": 0.2508530020713806, + "learning_rate": 4.7044241131045157e-05, + "loss": 1.7881, + "step": 17375 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.26084616780281067, + "learning_rate": 4.7039279278469804e-05, + "loss": 1.7292, + "step": 17376 + }, + { + "epoch": 5.333640270104358, + "grad_norm": 0.2122940719127655, + "learning_rate": 4.7034317455154006e-05, + "loss": 1.7493, + "step": 17377 + }, + { + "epoch": 5.333947206875384, + "grad_norm": 0.2627449333667755, + "learning_rate": 4.702935566114685e-05, + "loss": 1.759, + "step": 17378 + }, + { + "epoch": 5.334254143646409, + "grad_norm": 0.20637977123260498, + "learning_rate": 4.702439389649732e-05, + "loss": 1.8043, + "step": 17379 + }, + { + "epoch": 5.334561080417434, + "grad_norm": 
0.28783395886421204, + "learning_rate": 4.701943216125447e-05, + "loss": 1.7256, + "step": 17380 + }, + { + "epoch": 5.334868017188459, + "grad_norm": 0.21130618453025818, + "learning_rate": 4.701447045546734e-05, + "loss": 1.7161, + "step": 17381 + }, + { + "epoch": 5.335174953959484, + "grad_norm": 0.2793416678905487, + "learning_rate": 4.7009508779184984e-05, + "loss": 1.7659, + "step": 17382 + }, + { + "epoch": 5.3354818907305095, + "grad_norm": 0.3088020384311676, + "learning_rate": 4.700454713245639e-05, + "loss": 1.6877, + "step": 17383 + }, + { + "epoch": 5.335788827501535, + "grad_norm": 0.19697681069374084, + "learning_rate": 4.6999585515330646e-05, + "loss": 1.7111, + "step": 17384 + }, + { + "epoch": 5.33609576427256, + "grad_norm": 0.29234182834625244, + "learning_rate": 4.699462392785673e-05, + "loss": 1.7136, + "step": 17385 + }, + { + "epoch": 5.336402701043585, + "grad_norm": 0.2593611776828766, + "learning_rate": 4.698966237008371e-05, + "loss": 1.7531, + "step": 17386 + }, + { + "epoch": 5.33670963781461, + "grad_norm": 0.20024444162845612, + "learning_rate": 4.6984700842060604e-05, + "loss": 1.7035, + "step": 17387 + }, + { + "epoch": 5.337016574585635, + "grad_norm": 0.2929787039756775, + "learning_rate": 4.697973934383647e-05, + "loss": 1.7212, + "step": 17388 + }, + { + "epoch": 5.337323511356661, + "grad_norm": 0.2425665408372879, + "learning_rate": 4.697477787546032e-05, + "loss": 1.7191, + "step": 17389 + }, + { + "epoch": 5.337630448127686, + "grad_norm": 0.19175556302070618, + "learning_rate": 4.6969816436981176e-05, + "loss": 1.7291, + "step": 17390 + }, + { + "epoch": 5.337937384898711, + "grad_norm": 0.2602384686470032, + "learning_rate": 4.696485502844809e-05, + "loss": 1.7035, + "step": 17391 + }, + { + "epoch": 5.338244321669736, + "grad_norm": 0.19117408990859985, + "learning_rate": 4.695989364991006e-05, + "loss": 1.707, + "step": 17392 + }, + { + "epoch": 5.338551258440761, + "grad_norm": 0.31086108088493347, + "learning_rate": 
4.6954932301416174e-05, + "loss": 1.7397, + "step": 17393 + }, + { + "epoch": 5.338858195211786, + "grad_norm": 0.27402472496032715, + "learning_rate": 4.694997098301542e-05, + "loss": 1.7144, + "step": 17394 + }, + { + "epoch": 5.339165131982812, + "grad_norm": 0.20345155894756317, + "learning_rate": 4.694500969475685e-05, + "loss": 1.7492, + "step": 17395 + }, + { + "epoch": 5.339472068753837, + "grad_norm": 0.23786045610904694, + "learning_rate": 4.694004843668947e-05, + "loss": 1.7781, + "step": 17396 + }, + { + "epoch": 5.3397790055248615, + "grad_norm": 0.19747424125671387, + "learning_rate": 4.6935087208862335e-05, + "loss": 1.7353, + "step": 17397 + }, + { + "epoch": 5.340085942295887, + "grad_norm": 0.224543035030365, + "learning_rate": 4.693012601132445e-05, + "loss": 1.7229, + "step": 17398 + }, + { + "epoch": 5.340392879066912, + "grad_norm": 0.20840135216712952, + "learning_rate": 4.692516484412488e-05, + "loss": 1.7557, + "step": 17399 + }, + { + "epoch": 5.3406998158379375, + "grad_norm": 0.21019098162651062, + "learning_rate": 4.692020370731261e-05, + "loss": 1.7793, + "step": 17400 + }, + { + "epoch": 5.341006752608963, + "grad_norm": 0.20540091395378113, + "learning_rate": 4.691524260093672e-05, + "loss": 1.6925, + "step": 17401 + }, + { + "epoch": 5.341313689379987, + "grad_norm": 0.2414131462574005, + "learning_rate": 4.691028152504619e-05, + "loss": 1.7706, + "step": 17402 + }, + { + "epoch": 5.341620626151013, + "grad_norm": 0.19627155363559723, + "learning_rate": 4.6905320479690073e-05, + "loss": 1.6356, + "step": 17403 + }, + { + "epoch": 5.341927562922038, + "grad_norm": 0.20978952944278717, + "learning_rate": 4.690035946491741e-05, + "loss": 1.7487, + "step": 17404 + }, + { + "epoch": 5.342234499693063, + "grad_norm": 0.2524566054344177, + "learning_rate": 4.689539848077719e-05, + "loss": 1.7713, + "step": 17405 + }, + { + "epoch": 5.342541436464089, + "grad_norm": 0.1967654973268509, + "learning_rate": 4.689043752731847e-05, + "loss": 
1.7358, + "step": 17406 + }, + { + "epoch": 5.342848373235114, + "grad_norm": 0.2085377424955368, + "learning_rate": 4.688547660459026e-05, + "loss": 1.7104, + "step": 17407 + }, + { + "epoch": 5.343155310006138, + "grad_norm": 0.21294310688972473, + "learning_rate": 4.688051571264161e-05, + "loss": 1.7349, + "step": 17408 + }, + { + "epoch": 5.343462246777164, + "grad_norm": 0.23702891170978546, + "learning_rate": 4.6875554851521514e-05, + "loss": 1.8048, + "step": 17409 + }, + { + "epoch": 5.343769183548189, + "grad_norm": 0.2513964772224426, + "learning_rate": 4.687059402127904e-05, + "loss": 1.6669, + "step": 17410 + }, + { + "epoch": 5.344076120319214, + "grad_norm": 0.259540855884552, + "learning_rate": 4.6865633221963165e-05, + "loss": 1.7763, + "step": 17411 + }, + { + "epoch": 5.34438305709024, + "grad_norm": 0.28354617953300476, + "learning_rate": 4.6860672453622966e-05, + "loss": 1.7912, + "step": 17412 + }, + { + "epoch": 5.344689993861264, + "grad_norm": 0.2503860592842102, + "learning_rate": 4.685571171630742e-05, + "loss": 1.6817, + "step": 17413 + }, + { + "epoch": 5.3449969306322895, + "grad_norm": 0.2317555695772171, + "learning_rate": 4.685075101006558e-05, + "loss": 1.7652, + "step": 17414 + }, + { + "epoch": 5.345303867403315, + "grad_norm": 0.23333363234996796, + "learning_rate": 4.684579033494646e-05, + "loss": 1.722, + "step": 17415 + }, + { + "epoch": 5.34561080417434, + "grad_norm": 0.22507359087467194, + "learning_rate": 4.6840829690999104e-05, + "loss": 1.7522, + "step": 17416 + }, + { + "epoch": 5.3459177409453655, + "grad_norm": 0.2298288643360138, + "learning_rate": 4.6835869078272504e-05, + "loss": 1.7425, + "step": 17417 + }, + { + "epoch": 5.346224677716391, + "grad_norm": 0.2829224765300751, + "learning_rate": 4.683090849681572e-05, + "loss": 1.7798, + "step": 17418 + }, + { + "epoch": 5.346531614487415, + "grad_norm": 0.18153807520866394, + "learning_rate": 4.682594794667773e-05, + "loss": 1.6846, + "step": 17419 + }, + { + 
"epoch": 5.346838551258441, + "grad_norm": 0.24153028428554535, + "learning_rate": 4.6820987427907596e-05, + "loss": 1.7474, + "step": 17420 + }, + { + "epoch": 5.347145488029466, + "grad_norm": 0.2529772222042084, + "learning_rate": 4.681602694055434e-05, + "loss": 1.7465, + "step": 17421 + }, + { + "epoch": 5.347452424800491, + "grad_norm": 0.20414131879806519, + "learning_rate": 4.681106648466696e-05, + "loss": 1.7704, + "step": 17422 + }, + { + "epoch": 5.347759361571517, + "grad_norm": 0.27280452847480774, + "learning_rate": 4.68061060602945e-05, + "loss": 1.791, + "step": 17423 + }, + { + "epoch": 5.348066298342541, + "grad_norm": 0.20767468214035034, + "learning_rate": 4.680114566748595e-05, + "loss": 1.7744, + "step": 17424 + }, + { + "epoch": 5.348373235113566, + "grad_norm": 0.2661697566509247, + "learning_rate": 4.679618530629036e-05, + "loss": 1.7999, + "step": 17425 + }, + { + "epoch": 5.348680171884592, + "grad_norm": 0.23666872084140778, + "learning_rate": 4.679122497675674e-05, + "loss": 1.7204, + "step": 17426 + }, + { + "epoch": 5.348987108655617, + "grad_norm": 0.2688015401363373, + "learning_rate": 4.678626467893414e-05, + "loss": 1.7619, + "step": 17427 + }, + { + "epoch": 5.349294045426642, + "grad_norm": 0.23924420773983002, + "learning_rate": 4.678130441287153e-05, + "loss": 1.7754, + "step": 17428 + }, + { + "epoch": 5.349600982197667, + "grad_norm": 0.25724148750305176, + "learning_rate": 4.677634417861798e-05, + "loss": 1.761, + "step": 17429 + }, + { + "epoch": 5.349907918968692, + "grad_norm": 0.2633780241012573, + "learning_rate": 4.6771383976222464e-05, + "loss": 1.8705, + "step": 17430 + }, + { + "epoch": 5.350214855739718, + "grad_norm": 0.24774575233459473, + "learning_rate": 4.6766423805734036e-05, + "loss": 1.7127, + "step": 17431 + }, + { + "epoch": 5.350521792510743, + "grad_norm": 0.29887545108795166, + "learning_rate": 4.6761463667201695e-05, + "loss": 1.7651, + "step": 17432 + }, + { + "epoch": 5.350828729281768, + 
"grad_norm": 0.2231605499982834, + "learning_rate": 4.6756503560674486e-05, + "loss": 1.7636, + "step": 17433 + }, + { + "epoch": 5.351135666052793, + "grad_norm": 0.27977073192596436, + "learning_rate": 4.675154348620139e-05, + "loss": 1.7108, + "step": 17434 + }, + { + "epoch": 5.351442602823818, + "grad_norm": 0.26866039633750916, + "learning_rate": 4.674658344383146e-05, + "loss": 1.7593, + "step": 17435 + }, + { + "epoch": 5.351749539594843, + "grad_norm": 0.2154620885848999, + "learning_rate": 4.6741623433613685e-05, + "loss": 1.7536, + "step": 17436 + }, + { + "epoch": 5.352056476365869, + "grad_norm": 0.276656836271286, + "learning_rate": 4.673666345559711e-05, + "loss": 1.803, + "step": 17437 + }, + { + "epoch": 5.352363413136894, + "grad_norm": 0.22247640788555145, + "learning_rate": 4.6731703509830744e-05, + "loss": 1.7273, + "step": 17438 + }, + { + "epoch": 5.352670349907919, + "grad_norm": 0.2399090677499771, + "learning_rate": 4.6726743596363574e-05, + "loss": 1.7708, + "step": 17439 + }, + { + "epoch": 5.352977286678944, + "grad_norm": 0.2550101578235626, + "learning_rate": 4.6721783715244674e-05, + "loss": 1.7016, + "step": 17440 + }, + { + "epoch": 5.353284223449969, + "grad_norm": 0.19929546117782593, + "learning_rate": 4.6716823866523e-05, + "loss": 1.7417, + "step": 17441 + }, + { + "epoch": 5.3535911602209945, + "grad_norm": 0.2496672421693802, + "learning_rate": 4.671186405024761e-05, + "loss": 1.72, + "step": 17442 + }, + { + "epoch": 5.35389809699202, + "grad_norm": 0.19827665388584137, + "learning_rate": 4.67069042664675e-05, + "loss": 1.7515, + "step": 17443 + }, + { + "epoch": 5.354205033763045, + "grad_norm": 0.2528775930404663, + "learning_rate": 4.670194451523171e-05, + "loss": 1.7429, + "step": 17444 + }, + { + "epoch": 5.35451197053407, + "grad_norm": 0.19569729268550873, + "learning_rate": 4.6696984796589215e-05, + "loss": 1.7314, + "step": 17445 + }, + { + "epoch": 5.354818907305095, + "grad_norm": 0.21892370283603668, + 
"learning_rate": 4.669202511058908e-05, + "loss": 1.7331, + "step": 17446 + }, + { + "epoch": 5.35512584407612, + "grad_norm": 0.21609409153461456, + "learning_rate": 4.668706545728026e-05, + "loss": 1.7267, + "step": 17447 + }, + { + "epoch": 5.355432780847146, + "grad_norm": 0.2631370425224304, + "learning_rate": 4.668210583671182e-05, + "loss": 1.7513, + "step": 17448 + }, + { + "epoch": 5.355739717618171, + "grad_norm": 0.31327441334724426, + "learning_rate": 4.667714624893274e-05, + "loss": 1.7936, + "step": 17449 + }, + { + "epoch": 5.356046654389196, + "grad_norm": 0.21602430939674377, + "learning_rate": 4.667218669399207e-05, + "loss": 1.7387, + "step": 17450 + }, + { + "epoch": 5.356353591160221, + "grad_norm": 0.2895040214061737, + "learning_rate": 4.6667227171938784e-05, + "loss": 1.7293, + "step": 17451 + }, + { + "epoch": 5.356660527931246, + "grad_norm": 0.35150307416915894, + "learning_rate": 4.666226768282193e-05, + "loss": 1.8215, + "step": 17452 + }, + { + "epoch": 5.356967464702271, + "grad_norm": 0.19034281373023987, + "learning_rate": 4.665730822669048e-05, + "loss": 1.702, + "step": 17453 + }, + { + "epoch": 5.357274401473297, + "grad_norm": 0.25586241483688354, + "learning_rate": 4.6652348803593484e-05, + "loss": 1.7809, + "step": 17454 + }, + { + "epoch": 5.357581338244322, + "grad_norm": 0.23919305205345154, + "learning_rate": 4.6647389413579944e-05, + "loss": 1.7555, + "step": 17455 + }, + { + "epoch": 5.3578882750153465, + "grad_norm": 0.22707165777683258, + "learning_rate": 4.664243005669885e-05, + "loss": 1.7633, + "step": 17456 + }, + { + "epoch": 5.358195211786372, + "grad_norm": 0.20666839182376862, + "learning_rate": 4.663747073299925e-05, + "loss": 1.6522, + "step": 17457 + }, + { + "epoch": 5.358502148557397, + "grad_norm": 0.20557542145252228, + "learning_rate": 4.663251144253012e-05, + "loss": 1.73, + "step": 17458 + }, + { + "epoch": 5.3588090853284225, + "grad_norm": 0.22375571727752686, + "learning_rate": 
4.662755218534049e-05, + "loss": 1.7189, + "step": 17459 + }, + { + "epoch": 5.359116022099448, + "grad_norm": 0.261393278837204, + "learning_rate": 4.662259296147936e-05, + "loss": 1.6863, + "step": 17460 + }, + { + "epoch": 5.359422958870473, + "grad_norm": 0.2279379516839981, + "learning_rate": 4.6617633770995764e-05, + "loss": 1.7332, + "step": 17461 + }, + { + "epoch": 5.359729895641498, + "grad_norm": 0.2194606065750122, + "learning_rate": 4.6612674613938666e-05, + "loss": 1.7324, + "step": 17462 + }, + { + "epoch": 5.360036832412523, + "grad_norm": 0.27714410424232483, + "learning_rate": 4.660771549035713e-05, + "loss": 1.7386, + "step": 17463 + }, + { + "epoch": 5.360343769183548, + "grad_norm": 0.2118787169456482, + "learning_rate": 4.660275640030012e-05, + "loss": 1.7587, + "step": 17464 + }, + { + "epoch": 5.360650705954574, + "grad_norm": 0.2546979784965515, + "learning_rate": 4.6597797343816665e-05, + "loss": 1.7756, + "step": 17465 + }, + { + "epoch": 5.360957642725599, + "grad_norm": 0.194237619638443, + "learning_rate": 4.659283832095577e-05, + "loss": 1.7351, + "step": 17466 + }, + { + "epoch": 5.361264579496623, + "grad_norm": 0.23448583483695984, + "learning_rate": 4.658787933176646e-05, + "loss": 1.7051, + "step": 17467 + }, + { + "epoch": 5.361571516267649, + "grad_norm": 0.22796298563480377, + "learning_rate": 4.65829203762977e-05, + "loss": 1.7395, + "step": 17468 + }, + { + "epoch": 5.361878453038674, + "grad_norm": 0.22674904763698578, + "learning_rate": 4.657796145459855e-05, + "loss": 1.714, + "step": 17469 + }, + { + "epoch": 5.362185389809699, + "grad_norm": 0.2697311341762543, + "learning_rate": 4.657300256671797e-05, + "loss": 1.8271, + "step": 17470 + }, + { + "epoch": 5.362492326580725, + "grad_norm": 0.28040480613708496, + "learning_rate": 4.6568043712705004e-05, + "loss": 1.8192, + "step": 17471 + }, + { + "epoch": 5.362799263351749, + "grad_norm": 0.21100232005119324, + "learning_rate": 4.6563084892608644e-05, + "loss": 1.7285, + 
"step": 17472 + }, + { + "epoch": 5.3631062001227745, + "grad_norm": 0.23545897006988525, + "learning_rate": 4.655812610647787e-05, + "loss": 1.7302, + "step": 17473 + }, + { + "epoch": 5.3634131368938, + "grad_norm": 0.23278315365314484, + "learning_rate": 4.655316735436174e-05, + "loss": 1.7749, + "step": 17474 + }, + { + "epoch": 5.363720073664825, + "grad_norm": 0.333763986825943, + "learning_rate": 4.65482086363092e-05, + "loss": 1.7393, + "step": 17475 + }, + { + "epoch": 5.3640270104358505, + "grad_norm": 0.2743878662586212, + "learning_rate": 4.6543249952369306e-05, + "loss": 1.7274, + "step": 17476 + }, + { + "epoch": 5.364333947206875, + "grad_norm": 0.234402596950531, + "learning_rate": 4.6538291302591024e-05, + "loss": 1.7848, + "step": 17477 + }, + { + "epoch": 5.3646408839779, + "grad_norm": 0.29100897908210754, + "learning_rate": 4.65333326870234e-05, + "loss": 1.7698, + "step": 17478 + }, + { + "epoch": 5.364947820748926, + "grad_norm": 0.24178378283977509, + "learning_rate": 4.652837410571539e-05, + "loss": 1.8142, + "step": 17479 + }, + { + "epoch": 5.365254757519951, + "grad_norm": 0.4189155101776123, + "learning_rate": 4.652341555871605e-05, + "loss": 1.7435, + "step": 17480 + }, + { + "epoch": 5.365561694290976, + "grad_norm": 0.40106773376464844, + "learning_rate": 4.651845704607433e-05, + "loss": 1.837, + "step": 17481 + }, + { + "epoch": 5.365868631062002, + "grad_norm": 0.24127443134784698, + "learning_rate": 4.651349856783927e-05, + "loss": 1.7257, + "step": 17482 + }, + { + "epoch": 5.366175567833026, + "grad_norm": 0.412812739610672, + "learning_rate": 4.650854012405985e-05, + "loss": 1.762, + "step": 17483 + }, + { + "epoch": 5.366482504604051, + "grad_norm": 0.2636469602584839, + "learning_rate": 4.65035817147851e-05, + "loss": 1.7995, + "step": 17484 + }, + { + "epoch": 5.366789441375077, + "grad_norm": 0.282186895608902, + "learning_rate": 4.649862334006399e-05, + "loss": 1.75, + "step": 17485 + }, + { + "epoch": 5.367096378146102, + 
"grad_norm": 0.3280154764652252, + "learning_rate": 4.649366499994555e-05, + "loss": 1.7668, + "step": 17486 + }, + { + "epoch": 5.367403314917127, + "grad_norm": 0.24608035385608673, + "learning_rate": 4.648870669447875e-05, + "loss": 1.8332, + "step": 17487 + }, + { + "epoch": 5.367710251688152, + "grad_norm": 0.21927174925804138, + "learning_rate": 4.648374842371262e-05, + "loss": 1.7365, + "step": 17488 + }, + { + "epoch": 5.368017188459177, + "grad_norm": 0.2658425569534302, + "learning_rate": 4.6478790187696164e-05, + "loss": 1.841, + "step": 17489 + }, + { + "epoch": 5.3683241252302025, + "grad_norm": 0.2302858531475067, + "learning_rate": 4.647383198647834e-05, + "loss": 1.7882, + "step": 17490 + }, + { + "epoch": 5.368631062001228, + "grad_norm": 0.2562740743160248, + "learning_rate": 4.64688738201082e-05, + "loss": 1.7188, + "step": 17491 + }, + { + "epoch": 5.368937998772253, + "grad_norm": 0.28140220046043396, + "learning_rate": 4.646391568863469e-05, + "loss": 1.7482, + "step": 17492 + }, + { + "epoch": 5.3692449355432785, + "grad_norm": 0.21040008962154388, + "learning_rate": 4.6458957592106855e-05, + "loss": 1.7695, + "step": 17493 + }, + { + "epoch": 5.369551872314303, + "grad_norm": 0.25322291254997253, + "learning_rate": 4.645399953057367e-05, + "loss": 1.7127, + "step": 17494 + }, + { + "epoch": 5.369858809085328, + "grad_norm": 0.2239738404750824, + "learning_rate": 4.644904150408415e-05, + "loss": 1.7376, + "step": 17495 + }, + { + "epoch": 5.370165745856354, + "grad_norm": 0.21432901918888092, + "learning_rate": 4.644408351268727e-05, + "loss": 1.7156, + "step": 17496 + }, + { + "epoch": 5.370472682627379, + "grad_norm": 0.3057272732257843, + "learning_rate": 4.643912555643205e-05, + "loss": 1.7706, + "step": 17497 + }, + { + "epoch": 5.370779619398404, + "grad_norm": 0.2826928496360779, + "learning_rate": 4.643416763536748e-05, + "loss": 1.8298, + "step": 17498 + }, + { + "epoch": 5.371086556169429, + "grad_norm": 0.2395278513431549, + 
"learning_rate": 4.642920974954255e-05, + "loss": 1.7357, + "step": 17499 + }, + { + "epoch": 5.371393492940454, + "grad_norm": 0.21004743874073029, + "learning_rate": 4.642425189900626e-05, + "loss": 1.7263, + "step": 17500 + }, + { + "epoch": 5.371700429711479, + "grad_norm": 0.23981697857379913, + "learning_rate": 4.641929408380761e-05, + "loss": 1.7341, + "step": 17501 + }, + { + "epoch": 5.372007366482505, + "grad_norm": 0.1984727531671524, + "learning_rate": 4.641433630399559e-05, + "loss": 1.7133, + "step": 17502 + }, + { + "epoch": 5.37231430325353, + "grad_norm": 0.22153446078300476, + "learning_rate": 4.640937855961922e-05, + "loss": 1.8028, + "step": 17503 + }, + { + "epoch": 5.3726212400245545, + "grad_norm": 0.24257974326610565, + "learning_rate": 4.6404420850727455e-05, + "loss": 1.7842, + "step": 17504 + }, + { + "epoch": 5.37292817679558, + "grad_norm": 0.19444705545902252, + "learning_rate": 4.6399463177369316e-05, + "loss": 1.7296, + "step": 17505 + }, + { + "epoch": 5.373235113566605, + "grad_norm": 0.2068849354982376, + "learning_rate": 4.6394505539593806e-05, + "loss": 1.6949, + "step": 17506 + }, + { + "epoch": 5.3735420503376305, + "grad_norm": 0.21762309968471527, + "learning_rate": 4.638954793744989e-05, + "loss": 1.7556, + "step": 17507 + }, + { + "epoch": 5.373848987108656, + "grad_norm": 0.20791584253311157, + "learning_rate": 4.638459037098659e-05, + "loss": 1.7442, + "step": 17508 + }, + { + "epoch": 5.37415592387968, + "grad_norm": 0.27774497866630554, + "learning_rate": 4.6379632840252875e-05, + "loss": 1.7834, + "step": 17509 + }, + { + "epoch": 5.374462860650706, + "grad_norm": 0.24211421608924866, + "learning_rate": 4.637467534529775e-05, + "loss": 1.819, + "step": 17510 + }, + { + "epoch": 5.374769797421731, + "grad_norm": 0.24857789278030396, + "learning_rate": 4.636971788617022e-05, + "loss": 1.7483, + "step": 17511 + }, + { + "epoch": 5.375076734192756, + "grad_norm": 0.25142937898635864, + "learning_rate": 
4.636476046291925e-05, + "loss": 1.7405, + "step": 17512 + }, + { + "epoch": 5.375383670963782, + "grad_norm": 0.25860801339149475, + "learning_rate": 4.6359803075593846e-05, + "loss": 1.7821, + "step": 17513 + }, + { + "epoch": 5.375690607734807, + "grad_norm": 0.25223109126091003, + "learning_rate": 4.635484572424302e-05, + "loss": 1.738, + "step": 17514 + }, + { + "epoch": 5.3759975445058314, + "grad_norm": 0.22931768000125885, + "learning_rate": 4.634988840891573e-05, + "loss": 1.7717, + "step": 17515 + }, + { + "epoch": 5.376304481276857, + "grad_norm": 0.21371231973171234, + "learning_rate": 4.6344931129661e-05, + "loss": 1.7741, + "step": 17516 + }, + { + "epoch": 5.376611418047882, + "grad_norm": 0.2653632164001465, + "learning_rate": 4.633997388652778e-05, + "loss": 1.7548, + "step": 17517 + }, + { + "epoch": 5.3769183548189075, + "grad_norm": 0.2559951841831207, + "learning_rate": 4.6335016679565094e-05, + "loss": 1.7833, + "step": 17518 + }, + { + "epoch": 5.377225291589933, + "grad_norm": 0.22560031712055206, + "learning_rate": 4.6330059508821914e-05, + "loss": 1.6929, + "step": 17519 + }, + { + "epoch": 5.377532228360957, + "grad_norm": 0.3084852695465088, + "learning_rate": 4.6325102374347255e-05, + "loss": 1.8107, + "step": 17520 + }, + { + "epoch": 5.377839165131983, + "grad_norm": 0.3329267203807831, + "learning_rate": 4.632014527619007e-05, + "loss": 1.6791, + "step": 17521 + }, + { + "epoch": 5.378146101903008, + "grad_norm": 0.26274019479751587, + "learning_rate": 4.631518821439939e-05, + "loss": 1.7187, + "step": 17522 + }, + { + "epoch": 5.378453038674033, + "grad_norm": 0.3769492208957672, + "learning_rate": 4.6310231189024165e-05, + "loss": 1.8366, + "step": 17523 + }, + { + "epoch": 5.378759975445059, + "grad_norm": 0.2503921687602997, + "learning_rate": 4.6305274200113385e-05, + "loss": 1.7281, + "step": 17524 + }, + { + "epoch": 5.379066912216084, + "grad_norm": 0.26305708289146423, + "learning_rate": 4.6300317247716074e-05, + "loss": 
1.7231, + "step": 17525 + }, + { + "epoch": 5.379373848987108, + "grad_norm": 0.31899142265319824, + "learning_rate": 4.629536033188118e-05, + "loss": 1.8025, + "step": 17526 + }, + { + "epoch": 5.379680785758134, + "grad_norm": 0.21400104463100433, + "learning_rate": 4.629040345265772e-05, + "loss": 1.7481, + "step": 17527 + }, + { + "epoch": 5.379987722529159, + "grad_norm": 0.23147371411323547, + "learning_rate": 4.628544661009465e-05, + "loss": 1.7049, + "step": 17528 + }, + { + "epoch": 5.380294659300184, + "grad_norm": 0.21156759560108185, + "learning_rate": 4.628048980424099e-05, + "loss": 1.806, + "step": 17529 + }, + { + "epoch": 5.38060159607121, + "grad_norm": 0.22061556577682495, + "learning_rate": 4.6275533035145685e-05, + "loss": 1.7606, + "step": 17530 + }, + { + "epoch": 5.380908532842234, + "grad_norm": 0.23379987478256226, + "learning_rate": 4.6270576302857774e-05, + "loss": 1.7874, + "step": 17531 + }, + { + "epoch": 5.3812154696132595, + "grad_norm": 0.24738669395446777, + "learning_rate": 4.62656196074262e-05, + "loss": 1.7611, + "step": 17532 + }, + { + "epoch": 5.381522406384285, + "grad_norm": 0.19738905131816864, + "learning_rate": 4.6260662948899974e-05, + "loss": 1.7375, + "step": 17533 + }, + { + "epoch": 5.38182934315531, + "grad_norm": 0.2327810823917389, + "learning_rate": 4.6255706327328044e-05, + "loss": 1.7188, + "step": 17534 + }, + { + "epoch": 5.3821362799263355, + "grad_norm": 0.18944145739078522, + "learning_rate": 4.625074974275944e-05, + "loss": 1.6672, + "step": 17535 + }, + { + "epoch": 5.382443216697361, + "grad_norm": 0.20943734049797058, + "learning_rate": 4.624579319524311e-05, + "loss": 1.7238, + "step": 17536 + }, + { + "epoch": 5.382750153468385, + "grad_norm": 0.2060960829257965, + "learning_rate": 4.6240836684828074e-05, + "loss": 1.744, + "step": 17537 + }, + { + "epoch": 5.383057090239411, + "grad_norm": 0.19089816510677338, + "learning_rate": 4.6235880211563264e-05, + "loss": 1.6884, + "step": 17538 + }, + { + 
"epoch": 5.383364027010436, + "grad_norm": 0.22362665832042694, + "learning_rate": 4.623092377549772e-05, + "loss": 1.7076, + "step": 17539 + }, + { + "epoch": 5.383670963781461, + "grad_norm": 0.19429968297481537, + "learning_rate": 4.622596737668039e-05, + "loss": 1.7315, + "step": 17540 + }, + { + "epoch": 5.383977900552487, + "grad_norm": 0.20481903851032257, + "learning_rate": 4.622101101516024e-05, + "loss": 1.711, + "step": 17541 + }, + { + "epoch": 5.384284837323511, + "grad_norm": 0.19181163609027863, + "learning_rate": 4.6216054690986304e-05, + "loss": 1.6879, + "step": 17542 + }, + { + "epoch": 5.384591774094536, + "grad_norm": 0.23105846345424652, + "learning_rate": 4.6211098404207514e-05, + "loss": 1.7797, + "step": 17543 + }, + { + "epoch": 5.384898710865562, + "grad_norm": 0.2742008864879608, + "learning_rate": 4.6206142154872886e-05, + "loss": 1.7404, + "step": 17544 + }, + { + "epoch": 5.385205647636587, + "grad_norm": 0.2256750613451004, + "learning_rate": 4.6201185943031365e-05, + "loss": 1.7616, + "step": 17545 + }, + { + "epoch": 5.385512584407612, + "grad_norm": 0.23230868577957153, + "learning_rate": 4.6196229768731964e-05, + "loss": 1.7457, + "step": 17546 + }, + { + "epoch": 5.385819521178637, + "grad_norm": 0.2200126200914383, + "learning_rate": 4.6191273632023634e-05, + "loss": 1.7835, + "step": 17547 + }, + { + "epoch": 5.386126457949662, + "grad_norm": 0.21903863549232483, + "learning_rate": 4.6186317532955395e-05, + "loss": 1.7315, + "step": 17548 + }, + { + "epoch": 5.3864333947206875, + "grad_norm": 0.1915556788444519, + "learning_rate": 4.6181361471576186e-05, + "loss": 1.6786, + "step": 17549 + }, + { + "epoch": 5.386740331491713, + "grad_norm": 0.20177799463272095, + "learning_rate": 4.617640544793501e-05, + "loss": 1.7453, + "step": 17550 + }, + { + "epoch": 5.387047268262738, + "grad_norm": 0.2598256766796112, + "learning_rate": 4.617144946208083e-05, + "loss": 1.7931, + "step": 17551 + }, + { + "epoch": 5.387354205033763, + 
"grad_norm": 0.2357153594493866, + "learning_rate": 4.616649351406263e-05, + "loss": 1.7932, + "step": 17552 + }, + { + "epoch": 5.387661141804788, + "grad_norm": 0.2228964865207672, + "learning_rate": 4.616153760392938e-05, + "loss": 1.7725, + "step": 17553 + }, + { + "epoch": 5.387968078575813, + "grad_norm": 0.20811811089515686, + "learning_rate": 4.6156581731730085e-05, + "loss": 1.744, + "step": 17554 + }, + { + "epoch": 5.388275015346839, + "grad_norm": 0.20008429884910583, + "learning_rate": 4.615162589751369e-05, + "loss": 1.6973, + "step": 17555 + }, + { + "epoch": 5.388581952117864, + "grad_norm": 0.20487523078918457, + "learning_rate": 4.614667010132919e-05, + "loss": 1.7712, + "step": 17556 + }, + { + "epoch": 5.388888888888889, + "grad_norm": 0.21279677748680115, + "learning_rate": 4.6141714343225554e-05, + "loss": 1.7783, + "step": 17557 + }, + { + "epoch": 5.389195825659914, + "grad_norm": 0.28035736083984375, + "learning_rate": 4.613675862325174e-05, + "loss": 1.767, + "step": 17558 + }, + { + "epoch": 5.389502762430939, + "grad_norm": 0.27426794171333313, + "learning_rate": 4.613180294145677e-05, + "loss": 1.7909, + "step": 17559 + }, + { + "epoch": 5.389809699201964, + "grad_norm": 0.22420327365398407, + "learning_rate": 4.612684729788957e-05, + "loss": 1.6902, + "step": 17560 + }, + { + "epoch": 5.39011663597299, + "grad_norm": 0.19799382984638214, + "learning_rate": 4.612189169259915e-05, + "loss": 1.7276, + "step": 17561 + }, + { + "epoch": 5.390423572744015, + "grad_norm": 0.2508823573589325, + "learning_rate": 4.611693612563445e-05, + "loss": 1.7445, + "step": 17562 + }, + { + "epoch": 5.3907305095150395, + "grad_norm": 0.20835694670677185, + "learning_rate": 4.611198059704448e-05, + "loss": 1.696, + "step": 17563 + }, + { + "epoch": 5.391037446286065, + "grad_norm": 0.22136010229587555, + "learning_rate": 4.6107025106878176e-05, + "loss": 1.7701, + "step": 17564 + }, + { + "epoch": 5.39134438305709, + "grad_norm": 0.23835612833499908, + 
"learning_rate": 4.610206965518456e-05, + "loss": 1.7494, + "step": 17565 + }, + { + "epoch": 5.3916513198281155, + "grad_norm": 0.26142916083335876, + "learning_rate": 4.6097114242012554e-05, + "loss": 1.7616, + "step": 17566 + }, + { + "epoch": 5.391958256599141, + "grad_norm": 0.3366851806640625, + "learning_rate": 4.6092158867411175e-05, + "loss": 1.7409, + "step": 17567 + }, + { + "epoch": 5.392265193370166, + "grad_norm": 0.2592991292476654, + "learning_rate": 4.608720353142935e-05, + "loss": 1.7469, + "step": 17568 + }, + { + "epoch": 5.392572130141191, + "grad_norm": 0.25810322165489197, + "learning_rate": 4.608224823411608e-05, + "loss": 1.7345, + "step": 17569 + }, + { + "epoch": 5.392879066912216, + "grad_norm": 0.26776888966560364, + "learning_rate": 4.607729297552032e-05, + "loss": 1.7698, + "step": 17570 + }, + { + "epoch": 5.393186003683241, + "grad_norm": 0.21023939549922943, + "learning_rate": 4.607233775569107e-05, + "loss": 1.7681, + "step": 17571 + }, + { + "epoch": 5.393492940454267, + "grad_norm": 0.24452096223831177, + "learning_rate": 4.6067382574677265e-05, + "loss": 1.8154, + "step": 17572 + }, + { + "epoch": 5.393799877225292, + "grad_norm": 0.27084338665008545, + "learning_rate": 4.606242743252791e-05, + "loss": 1.7106, + "step": 17573 + }, + { + "epoch": 5.394106813996316, + "grad_norm": 0.24783825874328613, + "learning_rate": 4.605747232929195e-05, + "loss": 1.713, + "step": 17574 + }, + { + "epoch": 5.394413750767342, + "grad_norm": 0.2528151869773865, + "learning_rate": 4.6052517265018333e-05, + "loss": 1.8475, + "step": 17575 + }, + { + "epoch": 5.394720687538367, + "grad_norm": 0.24361065030097961, + "learning_rate": 4.604756223975609e-05, + "loss": 1.7414, + "step": 17576 + }, + { + "epoch": 5.395027624309392, + "grad_norm": 0.2751234769821167, + "learning_rate": 4.604260725355412e-05, + "loss": 1.7603, + "step": 17577 + }, + { + "epoch": 5.395334561080418, + "grad_norm": 0.23183637857437134, + "learning_rate": 
4.603765230646146e-05, + "loss": 1.7053, + "step": 17578 + }, + { + "epoch": 5.395641497851442, + "grad_norm": 0.27462145686149597, + "learning_rate": 4.6032697398527005e-05, + "loss": 1.746, + "step": 17579 + }, + { + "epoch": 5.3959484346224675, + "grad_norm": 0.3665321171283722, + "learning_rate": 4.602774252979978e-05, + "loss": 1.6883, + "step": 17580 + }, + { + "epoch": 5.396255371393493, + "grad_norm": 0.22438424825668335, + "learning_rate": 4.602278770032872e-05, + "loss": 1.7473, + "step": 17581 + }, + { + "epoch": 5.396562308164518, + "grad_norm": 0.38713687658309937, + "learning_rate": 4.601783291016282e-05, + "loss": 1.7993, + "step": 17582 + }, + { + "epoch": 5.3968692449355435, + "grad_norm": 0.3399868905544281, + "learning_rate": 4.6012878159351015e-05, + "loss": 1.7709, + "step": 17583 + }, + { + "epoch": 5.397176181706568, + "grad_norm": 0.21916119754314423, + "learning_rate": 4.60079234479423e-05, + "loss": 1.7351, + "step": 17584 + }, + { + "epoch": 5.397483118477593, + "grad_norm": 0.3796394467353821, + "learning_rate": 4.600296877598561e-05, + "loss": 1.7534, + "step": 17585 + }, + { + "epoch": 5.397790055248619, + "grad_norm": 0.27824562788009644, + "learning_rate": 4.599801414352993e-05, + "loss": 1.6962, + "step": 17586 + }, + { + "epoch": 5.398096992019644, + "grad_norm": 0.21037112176418304, + "learning_rate": 4.599305955062421e-05, + "loss": 1.7062, + "step": 17587 + }, + { + "epoch": 5.398403928790669, + "grad_norm": 0.3373035192489624, + "learning_rate": 4.598810499731745e-05, + "loss": 1.8263, + "step": 17588 + }, + { + "epoch": 5.398710865561695, + "grad_norm": 0.2560507357120514, + "learning_rate": 4.5983150483658564e-05, + "loss": 1.7232, + "step": 17589 + }, + { + "epoch": 5.399017802332719, + "grad_norm": 0.23010993003845215, + "learning_rate": 4.5978196009696564e-05, + "loss": 1.805, + "step": 17590 + }, + { + "epoch": 5.399324739103744, + "grad_norm": 0.32955634593963623, + "learning_rate": 4.597324157548037e-05, + "loss": 
1.7018, + "step": 17591 + }, + { + "epoch": 5.39963167587477, + "grad_norm": 0.2534363865852356, + "learning_rate": 4.5968287181058953e-05, + "loss": 1.6919, + "step": 17592 + }, + { + "epoch": 5.399938612645795, + "grad_norm": 0.23179130256175995, + "learning_rate": 4.5963332826481314e-05, + "loss": 1.7237, + "step": 17593 + }, + { + "epoch": 5.4002455494168204, + "grad_norm": 0.37712663412094116, + "learning_rate": 4.5958378511796365e-05, + "loss": 1.7694, + "step": 17594 + }, + { + "epoch": 5.400552486187845, + "grad_norm": 0.21228717267513275, + "learning_rate": 4.59534242370531e-05, + "loss": 1.7528, + "step": 17595 + }, + { + "epoch": 5.40085942295887, + "grad_norm": 0.2818812429904938, + "learning_rate": 4.5948470002300454e-05, + "loss": 1.8214, + "step": 17596 + }, + { + "epoch": 5.401166359729896, + "grad_norm": 0.24916675686836243, + "learning_rate": 4.5943515807587415e-05, + "loss": 1.7792, + "step": 17597 + }, + { + "epoch": 5.401473296500921, + "grad_norm": 0.2096913456916809, + "learning_rate": 4.593856165296291e-05, + "loss": 1.6983, + "step": 17598 + }, + { + "epoch": 5.401780233271946, + "grad_norm": 0.271124005317688, + "learning_rate": 4.593360753847595e-05, + "loss": 1.7534, + "step": 17599 + }, + { + "epoch": 5.402087170042972, + "grad_norm": 0.24798092246055603, + "learning_rate": 4.5928653464175435e-05, + "loss": 1.7783, + "step": 17600 + }, + { + "epoch": 5.402394106813996, + "grad_norm": 0.3531748056411743, + "learning_rate": 4.592369943011038e-05, + "loss": 1.7834, + "step": 17601 + }, + { + "epoch": 5.402701043585021, + "grad_norm": 0.29650232195854187, + "learning_rate": 4.591874543632969e-05, + "loss": 1.7186, + "step": 17602 + }, + { + "epoch": 5.403007980356047, + "grad_norm": 0.25578248500823975, + "learning_rate": 4.591379148288236e-05, + "loss": 1.7849, + "step": 17603 + }, + { + "epoch": 5.403314917127072, + "grad_norm": 0.3790532946586609, + "learning_rate": 4.590883756981733e-05, + "loss": 1.7192, + "step": 17604 + }, + { + 
"epoch": 5.403621853898097, + "grad_norm": 0.23684249818325043, + "learning_rate": 4.590388369718359e-05, + "loss": 1.7171, + "step": 17605 + }, + { + "epoch": 5.403928790669122, + "grad_norm": 0.267702579498291, + "learning_rate": 4.589892986503005e-05, + "loss": 1.7181, + "step": 17606 + }, + { + "epoch": 5.404235727440147, + "grad_norm": 0.29105648398399353, + "learning_rate": 4.5893976073405704e-05, + "loss": 1.7395, + "step": 17607 + }, + { + "epoch": 5.4045426642111725, + "grad_norm": 0.2266589254140854, + "learning_rate": 4.588902232235949e-05, + "loss": 1.7244, + "step": 17608 + }, + { + "epoch": 5.404849600982198, + "grad_norm": 0.24065524339675903, + "learning_rate": 4.588406861194035e-05, + "loss": 1.7398, + "step": 17609 + }, + { + "epoch": 5.405156537753223, + "grad_norm": 0.23166650533676147, + "learning_rate": 4.587911494219728e-05, + "loss": 1.7592, + "step": 17610 + }, + { + "epoch": 5.4054634745242485, + "grad_norm": 0.19882038235664368, + "learning_rate": 4.5874161313179186e-05, + "loss": 1.7087, + "step": 17611 + }, + { + "epoch": 5.405770411295273, + "grad_norm": 0.2688273787498474, + "learning_rate": 4.5869207724935076e-05, + "loss": 1.7791, + "step": 17612 + }, + { + "epoch": 5.406077348066298, + "grad_norm": 0.1970982402563095, + "learning_rate": 4.5864254177513855e-05, + "loss": 1.7079, + "step": 17613 + }, + { + "epoch": 5.406384284837324, + "grad_norm": 0.2531265318393707, + "learning_rate": 4.585930067096451e-05, + "loss": 1.716, + "step": 17614 + }, + { + "epoch": 5.406691221608349, + "grad_norm": 0.2610352337360382, + "learning_rate": 4.585434720533596e-05, + "loss": 1.7133, + "step": 17615 + }, + { + "epoch": 5.406998158379374, + "grad_norm": 0.2420870065689087, + "learning_rate": 4.5849393780677216e-05, + "loss": 1.7044, + "step": 17616 + }, + { + "epoch": 5.407305095150399, + "grad_norm": 0.24078647792339325, + "learning_rate": 4.584444039703717e-05, + "loss": 1.7486, + "step": 17617 + }, + { + "epoch": 5.407612031921424, + 
"grad_norm": 0.19324539601802826, + "learning_rate": 4.583948705446481e-05, + "loss": 1.7439, + "step": 17618 + }, + { + "epoch": 5.407918968692449, + "grad_norm": 0.2311750054359436, + "learning_rate": 4.5834533753009065e-05, + "loss": 1.7794, + "step": 17619 + }, + { + "epoch": 5.408225905463475, + "grad_norm": 0.2554466128349304, + "learning_rate": 4.5829580492718914e-05, + "loss": 1.7146, + "step": 17620 + }, + { + "epoch": 5.4085328422345, + "grad_norm": 0.2679688334465027, + "learning_rate": 4.582462727364328e-05, + "loss": 1.7677, + "step": 17621 + }, + { + "epoch": 5.4088397790055245, + "grad_norm": 0.19292913377285004, + "learning_rate": 4.5819674095831146e-05, + "loss": 1.7544, + "step": 17622 + }, + { + "epoch": 5.40914671577655, + "grad_norm": 0.2146623730659485, + "learning_rate": 4.5814720959331425e-05, + "loss": 1.7182, + "step": 17623 + }, + { + "epoch": 5.409453652547575, + "grad_norm": 0.23098216950893402, + "learning_rate": 4.5809767864193096e-05, + "loss": 1.6844, + "step": 17624 + }, + { + "epoch": 5.4097605893186005, + "grad_norm": 0.22482910752296448, + "learning_rate": 4.5804814810465096e-05, + "loss": 1.7921, + "step": 17625 + }, + { + "epoch": 5.410067526089626, + "grad_norm": 0.22098569571971893, + "learning_rate": 4.579986179819636e-05, + "loss": 1.7419, + "step": 17626 + }, + { + "epoch": 5.41037446286065, + "grad_norm": 0.2131706178188324, + "learning_rate": 4.579490882743588e-05, + "loss": 1.7587, + "step": 17627 + }, + { + "epoch": 5.410681399631676, + "grad_norm": 0.22448734939098358, + "learning_rate": 4.578995589823254e-05, + "loss": 1.6959, + "step": 17628 + }, + { + "epoch": 5.410988336402701, + "grad_norm": 0.22372964024543762, + "learning_rate": 4.578500301063536e-05, + "loss": 1.7462, + "step": 17629 + }, + { + "epoch": 5.411295273173726, + "grad_norm": 0.22140730917453766, + "learning_rate": 4.578005016469322e-05, + "loss": 1.8348, + "step": 17630 + }, + { + "epoch": 5.411602209944752, + "grad_norm": 0.21697622537612915, + 
"learning_rate": 4.577509736045511e-05, + "loss": 1.7634, + "step": 17631 + }, + { + "epoch": 5.411909146715777, + "grad_norm": 0.2044363021850586, + "learning_rate": 4.5770144597969954e-05, + "loss": 1.7095, + "step": 17632 + }, + { + "epoch": 5.412216083486801, + "grad_norm": 0.1910451501607895, + "learning_rate": 4.576519187728674e-05, + "loss": 1.7022, + "step": 17633 + }, + { + "epoch": 5.412523020257827, + "grad_norm": 0.21787554025650024, + "learning_rate": 4.576023919845434e-05, + "loss": 1.7206, + "step": 17634 + }, + { + "epoch": 5.412829957028852, + "grad_norm": 0.2363428920507431, + "learning_rate": 4.575528656152178e-05, + "loss": 1.8052, + "step": 17635 + }, + { + "epoch": 5.413136893799877, + "grad_norm": 0.22830195724964142, + "learning_rate": 4.575033396653793e-05, + "loss": 1.7432, + "step": 17636 + }, + { + "epoch": 5.413443830570903, + "grad_norm": 0.24867239594459534, + "learning_rate": 4.5745381413551794e-05, + "loss": 1.7011, + "step": 17637 + }, + { + "epoch": 5.413750767341927, + "grad_norm": 0.19329775869846344, + "learning_rate": 4.574042890261228e-05, + "loss": 1.7749, + "step": 17638 + }, + { + "epoch": 5.4140577041129525, + "grad_norm": 0.22917115688323975, + "learning_rate": 4.573547643376836e-05, + "loss": 1.7478, + "step": 17639 + }, + { + "epoch": 5.414364640883978, + "grad_norm": 0.23882724344730377, + "learning_rate": 4.573052400706894e-05, + "loss": 1.7396, + "step": 17640 + }, + { + "epoch": 5.414671577655003, + "grad_norm": 0.19127070903778076, + "learning_rate": 4.572557162256301e-05, + "loss": 1.6791, + "step": 17641 + }, + { + "epoch": 5.4149785144260285, + "grad_norm": 0.18385560810565948, + "learning_rate": 4.5720619280299475e-05, + "loss": 1.7288, + "step": 17642 + }, + { + "epoch": 5.415285451197054, + "grad_norm": 0.19845189154148102, + "learning_rate": 4.571566698032728e-05, + "loss": 1.7525, + "step": 17643 + }, + { + "epoch": 5.415592387968078, + "grad_norm": 0.18987210094928741, + "learning_rate": 
4.571071472269539e-05, + "loss": 1.7253, + "step": 17644 + }, + { + "epoch": 5.415899324739104, + "grad_norm": 0.18257199227809906, + "learning_rate": 4.570576250745271e-05, + "loss": 1.7051, + "step": 17645 + }, + { + "epoch": 5.416206261510129, + "grad_norm": 0.22803467512130737, + "learning_rate": 4.570081033464823e-05, + "loss": 1.7478, + "step": 17646 + }, + { + "epoch": 5.416513198281154, + "grad_norm": 0.18763841688632965, + "learning_rate": 4.569585820433084e-05, + "loss": 1.7316, + "step": 17647 + }, + { + "epoch": 5.41682013505218, + "grad_norm": 0.23974654078483582, + "learning_rate": 4.56909061165495e-05, + "loss": 1.7566, + "step": 17648 + }, + { + "epoch": 5.417127071823204, + "grad_norm": 0.24336253106594086, + "learning_rate": 4.568595407135315e-05, + "loss": 1.7468, + "step": 17649 + }, + { + "epoch": 5.417434008594229, + "grad_norm": 0.23891226947307587, + "learning_rate": 4.5681002068790755e-05, + "loss": 1.7201, + "step": 17650 + }, + { + "epoch": 5.417740945365255, + "grad_norm": 0.19209685921669006, + "learning_rate": 4.56760501089112e-05, + "loss": 1.713, + "step": 17651 + }, + { + "epoch": 5.41804788213628, + "grad_norm": 0.2407880276441574, + "learning_rate": 4.567109819176349e-05, + "loss": 1.7073, + "step": 17652 + }, + { + "epoch": 5.418354818907305, + "grad_norm": 0.2385055273771286, + "learning_rate": 4.5666146317396485e-05, + "loss": 1.7387, + "step": 17653 + }, + { + "epoch": 5.41866175567833, + "grad_norm": 0.22068475186824799, + "learning_rate": 4.566119448585918e-05, + "loss": 1.7116, + "step": 17654 + }, + { + "epoch": 5.418968692449355, + "grad_norm": 0.318375825881958, + "learning_rate": 4.5656242697200496e-05, + "loss": 1.7659, + "step": 17655 + }, + { + "epoch": 5.4192756292203805, + "grad_norm": 0.25311973690986633, + "learning_rate": 4.5651290951469366e-05, + "loss": 1.7814, + "step": 17656 + }, + { + "epoch": 5.419582565991406, + "grad_norm": 0.18701443076133728, + "learning_rate": 4.5646339248714735e-05, + "loss": 1.6993, 
+ "step": 17657 + }, + { + "epoch": 5.419889502762431, + "grad_norm": 0.2964496314525604, + "learning_rate": 4.5641387588985516e-05, + "loss": 1.8254, + "step": 17658 + }, + { + "epoch": 5.420196439533456, + "grad_norm": 0.19447220861911774, + "learning_rate": 4.563643597233067e-05, + "loss": 1.7208, + "step": 17659 + }, + { + "epoch": 5.420503376304481, + "grad_norm": 0.21666039526462555, + "learning_rate": 4.5631484398799105e-05, + "loss": 1.6695, + "step": 17660 + }, + { + "epoch": 5.420810313075506, + "grad_norm": 0.23104412853717804, + "learning_rate": 4.5626532868439796e-05, + "loss": 1.7449, + "step": 17661 + }, + { + "epoch": 5.421117249846532, + "grad_norm": 0.20463459193706512, + "learning_rate": 4.562158138130163e-05, + "loss": 1.6714, + "step": 17662 + }, + { + "epoch": 5.421424186617557, + "grad_norm": 0.21948079764842987, + "learning_rate": 4.561662993743359e-05, + "loss": 1.6957, + "step": 17663 + }, + { + "epoch": 5.421731123388582, + "grad_norm": 0.2672746777534485, + "learning_rate": 4.561167853688455e-05, + "loss": 1.7137, + "step": 17664 + }, + { + "epoch": 5.422038060159607, + "grad_norm": 0.2652325928211212, + "learning_rate": 4.5606727179703493e-05, + "loss": 1.7943, + "step": 17665 + }, + { + "epoch": 5.422344996930632, + "grad_norm": 0.17761313915252686, + "learning_rate": 4.560177586593933e-05, + "loss": 1.7072, + "step": 17666 + }, + { + "epoch": 5.422651933701657, + "grad_norm": 0.24759770929813385, + "learning_rate": 4.5596824595641e-05, + "loss": 1.7807, + "step": 17667 + }, + { + "epoch": 5.422958870472683, + "grad_norm": 0.22191929817199707, + "learning_rate": 4.5591873368857416e-05, + "loss": 1.7668, + "step": 17668 + }, + { + "epoch": 5.423265807243708, + "grad_norm": 0.21293842792510986, + "learning_rate": 4.5586922185637546e-05, + "loss": 1.7304, + "step": 17669 + }, + { + "epoch": 5.4235727440147325, + "grad_norm": 0.2646051049232483, + "learning_rate": 4.5581971046030277e-05, + "loss": 1.7258, + "step": 17670 + }, + { + 
"epoch": 5.423879680785758, + "grad_norm": 0.1894550621509552, + "learning_rate": 4.5577019950084574e-05, + "loss": 1.7066, + "step": 17671 + }, + { + "epoch": 5.424186617556783, + "grad_norm": 0.2533467710018158, + "learning_rate": 4.557206889784934e-05, + "loss": 1.7668, + "step": 17672 + }, + { + "epoch": 5.4244935543278086, + "grad_norm": 0.1972150355577469, + "learning_rate": 4.556711788937352e-05, + "loss": 1.7306, + "step": 17673 + }, + { + "epoch": 5.424800491098834, + "grad_norm": 0.2726735472679138, + "learning_rate": 4.5562166924706054e-05, + "loss": 1.7281, + "step": 17674 + }, + { + "epoch": 5.425107427869859, + "grad_norm": 0.2244454175233841, + "learning_rate": 4.555721600389584e-05, + "loss": 1.7461, + "step": 17675 + }, + { + "epoch": 5.425414364640884, + "grad_norm": 0.19486510753631592, + "learning_rate": 4.555226512699182e-05, + "loss": 1.7361, + "step": 17676 + }, + { + "epoch": 5.425721301411909, + "grad_norm": 0.18128283321857452, + "learning_rate": 4.554731429404293e-05, + "loss": 1.7637, + "step": 17677 + }, + { + "epoch": 5.426028238182934, + "grad_norm": 0.24709749221801758, + "learning_rate": 4.5542363505098084e-05, + "loss": 1.7928, + "step": 17678 + }, + { + "epoch": 5.42633517495396, + "grad_norm": 0.2236633151769638, + "learning_rate": 4.553741276020621e-05, + "loss": 1.8262, + "step": 17679 + }, + { + "epoch": 5.426642111724985, + "grad_norm": 0.2592087984085083, + "learning_rate": 4.553246205941626e-05, + "loss": 1.675, + "step": 17680 + }, + { + "epoch": 5.4269490484960095, + "grad_norm": 0.27751871943473816, + "learning_rate": 4.552751140277712e-05, + "loss": 1.7344, + "step": 17681 + }, + { + "epoch": 5.427255985267035, + "grad_norm": 0.23752287030220032, + "learning_rate": 4.5522560790337746e-05, + "loss": 1.7748, + "step": 17682 + }, + { + "epoch": 5.42756292203806, + "grad_norm": 0.3259925842285156, + "learning_rate": 4.5517610222147035e-05, + "loss": 1.7855, + "step": 17683 + }, + { + "epoch": 5.4278698588090855, + 
"grad_norm": 0.2579646706581116, + "learning_rate": 4.551265969825394e-05, + "loss": 1.7978, + "step": 17684 + }, + { + "epoch": 5.428176795580111, + "grad_norm": 0.3217744827270508, + "learning_rate": 4.550770921870735e-05, + "loss": 1.7793, + "step": 17685 + }, + { + "epoch": 5.428483732351136, + "grad_norm": 0.2930903434753418, + "learning_rate": 4.550275878355624e-05, + "loss": 1.7226, + "step": 17686 + }, + { + "epoch": 5.428790669122161, + "grad_norm": 0.1982879489660263, + "learning_rate": 4.549780839284948e-05, + "loss": 1.6841, + "step": 17687 + }, + { + "epoch": 5.429097605893186, + "grad_norm": 0.20843900740146637, + "learning_rate": 4.5492858046636046e-05, + "loss": 1.7201, + "step": 17688 + }, + { + "epoch": 5.429404542664211, + "grad_norm": 0.23116534948349, + "learning_rate": 4.5487907744964794e-05, + "loss": 1.7565, + "step": 17689 + }, + { + "epoch": 5.429711479435237, + "grad_norm": 0.19177772104740143, + "learning_rate": 4.548295748788471e-05, + "loss": 1.7479, + "step": 17690 + }, + { + "epoch": 5.430018416206262, + "grad_norm": 0.22261449694633484, + "learning_rate": 4.547800727544469e-05, + "loss": 1.7785, + "step": 17691 + }, + { + "epoch": 5.430325352977286, + "grad_norm": 0.20073406398296356, + "learning_rate": 4.547305710769363e-05, + "loss": 1.741, + "step": 17692 + }, + { + "epoch": 5.430632289748312, + "grad_norm": 0.21662208437919617, + "learning_rate": 4.546810698468049e-05, + "loss": 1.7269, + "step": 17693 + }, + { + "epoch": 5.430939226519337, + "grad_norm": 0.19540879130363464, + "learning_rate": 4.546315690645416e-05, + "loss": 1.7141, + "step": 17694 + }, + { + "epoch": 5.431246163290362, + "grad_norm": 0.20063656568527222, + "learning_rate": 4.545820687306358e-05, + "loss": 1.7244, + "step": 17695 + }, + { + "epoch": 5.431553100061388, + "grad_norm": 0.2172660082578659, + "learning_rate": 4.545325688455765e-05, + "loss": 1.7172, + "step": 17696 + }, + { + "epoch": 5.431860036832412, + "grad_norm": 0.2480388581752777, + 
"learning_rate": 4.5448306940985326e-05, + "loss": 1.6994, + "step": 17697 + }, + { + "epoch": 5.4321669736034375, + "grad_norm": 0.22499477863311768, + "learning_rate": 4.544335704239547e-05, + "loss": 1.7405, + "step": 17698 + }, + { + "epoch": 5.432473910374463, + "grad_norm": 0.20655590295791626, + "learning_rate": 4.5438407188837065e-05, + "loss": 1.6867, + "step": 17699 + }, + { + "epoch": 5.432780847145488, + "grad_norm": 0.2045906037092209, + "learning_rate": 4.543345738035896e-05, + "loss": 1.7752, + "step": 17700 + }, + { + "epoch": 5.4330877839165135, + "grad_norm": 0.2092052847146988, + "learning_rate": 4.542850761701013e-05, + "loss": 1.7389, + "step": 17701 + }, + { + "epoch": 5.433394720687538, + "grad_norm": 0.1943730264902115, + "learning_rate": 4.5423557898839446e-05, + "loss": 1.7276, + "step": 17702 + }, + { + "epoch": 5.433701657458563, + "grad_norm": 0.23487289249897003, + "learning_rate": 4.541860822589587e-05, + "loss": 1.8119, + "step": 17703 + }, + { + "epoch": 5.434008594229589, + "grad_norm": 0.204689159989357, + "learning_rate": 4.541365859822827e-05, + "loss": 1.7865, + "step": 17704 + }, + { + "epoch": 5.434315531000614, + "grad_norm": 0.20850931107997894, + "learning_rate": 4.5408709015885604e-05, + "loss": 1.7733, + "step": 17705 + }, + { + "epoch": 5.434622467771639, + "grad_norm": 0.18685877323150635, + "learning_rate": 4.540375947891675e-05, + "loss": 1.7526, + "step": 17706 + }, + { + "epoch": 5.434929404542665, + "grad_norm": 0.2009890079498291, + "learning_rate": 4.539880998737064e-05, + "loss": 1.6904, + "step": 17707 + }, + { + "epoch": 5.435236341313689, + "grad_norm": 0.16602718830108643, + "learning_rate": 4.5393860541296205e-05, + "loss": 1.689, + "step": 17708 + }, + { + "epoch": 5.435543278084714, + "grad_norm": 0.24318818747997284, + "learning_rate": 4.5388911140742315e-05, + "loss": 1.7993, + "step": 17709 + }, + { + "epoch": 5.43585021485574, + "grad_norm": 0.24094417691230774, + "learning_rate": 
4.538396178575793e-05, + "loss": 1.7235, + "step": 17710 + }, + { + "epoch": 5.436157151626765, + "grad_norm": 0.20361751317977905, + "learning_rate": 4.537901247639192e-05, + "loss": 1.7198, + "step": 17711 + }, + { + "epoch": 5.43646408839779, + "grad_norm": 0.2563718259334564, + "learning_rate": 4.537406321269323e-05, + "loss": 1.795, + "step": 17712 + }, + { + "epoch": 5.436771025168815, + "grad_norm": 0.29895591735839844, + "learning_rate": 4.536911399471075e-05, + "loss": 1.7515, + "step": 17713 + }, + { + "epoch": 5.43707796193984, + "grad_norm": 0.22535841166973114, + "learning_rate": 4.536416482249342e-05, + "loss": 1.6998, + "step": 17714 + }, + { + "epoch": 5.4373848987108655, + "grad_norm": 0.26025068759918213, + "learning_rate": 4.53592156960901e-05, + "loss": 1.7821, + "step": 17715 + }, + { + "epoch": 5.437691835481891, + "grad_norm": 0.3473168611526489, + "learning_rate": 4.535426661554975e-05, + "loss": 1.7035, + "step": 17716 + }, + { + "epoch": 5.437998772252916, + "grad_norm": 0.22207199037075043, + "learning_rate": 4.534931758092126e-05, + "loss": 1.7485, + "step": 17717 + }, + { + "epoch": 5.4383057090239415, + "grad_norm": 0.26839709281921387, + "learning_rate": 4.534436859225353e-05, + "loss": 1.7272, + "step": 17718 + }, + { + "epoch": 5.438612645794966, + "grad_norm": 0.37715891003608704, + "learning_rate": 4.5339419649595476e-05, + "loss": 1.7254, + "step": 17719 + }, + { + "epoch": 5.438919582565991, + "grad_norm": 0.21485768258571625, + "learning_rate": 4.533447075299603e-05, + "loss": 1.7349, + "step": 17720 + }, + { + "epoch": 5.439226519337017, + "grad_norm": 0.29502415657043457, + "learning_rate": 4.5329521902504055e-05, + "loss": 1.7325, + "step": 17721 + }, + { + "epoch": 5.439533456108042, + "grad_norm": 0.29448410868644714, + "learning_rate": 4.5324573098168505e-05, + "loss": 1.768, + "step": 17722 + }, + { + "epoch": 5.439840392879067, + "grad_norm": 0.1892058402299881, + "learning_rate": 4.5319624340038244e-05, + "loss": 
1.6866, + "step": 17723 + }, + { + "epoch": 5.440147329650092, + "grad_norm": 0.3365040123462677, + "learning_rate": 4.531467562816221e-05, + "loss": 1.7662, + "step": 17724 + }, + { + "epoch": 5.440454266421117, + "grad_norm": 0.2960789203643799, + "learning_rate": 4.53097269625893e-05, + "loss": 1.746, + "step": 17725 + }, + { + "epoch": 5.440761203192142, + "grad_norm": 0.21623700857162476, + "learning_rate": 4.530477834336841e-05, + "loss": 1.7619, + "step": 17726 + }, + { + "epoch": 5.441068139963168, + "grad_norm": 0.29010120034217834, + "learning_rate": 4.5299829770548456e-05, + "loss": 1.717, + "step": 17727 + }, + { + "epoch": 5.441375076734193, + "grad_norm": 0.18467605113983154, + "learning_rate": 4.529488124417833e-05, + "loss": 1.6938, + "step": 17728 + }, + { + "epoch": 5.4416820135052175, + "grad_norm": 0.2875411808490753, + "learning_rate": 4.528993276430695e-05, + "loss": 1.7633, + "step": 17729 + }, + { + "epoch": 5.441988950276243, + "grad_norm": 0.24252675473690033, + "learning_rate": 4.528498433098321e-05, + "loss": 1.6477, + "step": 17730 + }, + { + "epoch": 5.442295887047268, + "grad_norm": 0.18885886669158936, + "learning_rate": 4.5280035944256035e-05, + "loss": 1.7241, + "step": 17731 + }, + { + "epoch": 5.4426028238182935, + "grad_norm": 0.2594204246997833, + "learning_rate": 4.527508760417429e-05, + "loss": 1.6697, + "step": 17732 + }, + { + "epoch": 5.442909760589319, + "grad_norm": 0.23796287178993225, + "learning_rate": 4.527013931078692e-05, + "loss": 1.7035, + "step": 17733 + }, + { + "epoch": 5.443216697360343, + "grad_norm": 0.2591552436351776, + "learning_rate": 4.5265191064142787e-05, + "loss": 1.8014, + "step": 17734 + }, + { + "epoch": 5.443523634131369, + "grad_norm": 0.3316073417663574, + "learning_rate": 4.526024286429082e-05, + "loss": 1.752, + "step": 17735 + }, + { + "epoch": 5.443830570902394, + "grad_norm": 0.2409597635269165, + "learning_rate": 4.52552947112799e-05, + "loss": 1.7662, + "step": 17736 + }, + { + "epoch": 
5.444137507673419, + "grad_norm": 0.2896713614463806, + "learning_rate": 4.5250346605158964e-05, + "loss": 1.7168, + "step": 17737 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.30870527029037476, + "learning_rate": 4.524539854597686e-05, + "loss": 1.704, + "step": 17738 + }, + { + "epoch": 5.44475138121547, + "grad_norm": 0.2476932406425476, + "learning_rate": 4.524045053378254e-05, + "loss": 1.7649, + "step": 17739 + }, + { + "epoch": 5.445058317986494, + "grad_norm": 0.2937077283859253, + "learning_rate": 4.5235502568624855e-05, + "loss": 1.7028, + "step": 17740 + }, + { + "epoch": 5.44536525475752, + "grad_norm": 0.22881117463111877, + "learning_rate": 4.523055465055273e-05, + "loss": 1.7539, + "step": 17741 + }, + { + "epoch": 5.445672191528545, + "grad_norm": 0.2551842927932739, + "learning_rate": 4.522560677961508e-05, + "loss": 1.7601, + "step": 17742 + }, + { + "epoch": 5.44597912829957, + "grad_norm": 0.27533504366874695, + "learning_rate": 4.5220658955860754e-05, + "loss": 1.7695, + "step": 17743 + }, + { + "epoch": 5.446286065070596, + "grad_norm": 0.23387418687343597, + "learning_rate": 4.5215711179338706e-05, + "loss": 1.7218, + "step": 17744 + }, + { + "epoch": 5.44659300184162, + "grad_norm": 0.37932485342025757, + "learning_rate": 4.521076345009777e-05, + "loss": 1.7685, + "step": 17745 + }, + { + "epoch": 5.4468999386126455, + "grad_norm": 0.2668898105621338, + "learning_rate": 4.520581576818691e-05, + "loss": 1.7217, + "step": 17746 + }, + { + "epoch": 5.447206875383671, + "grad_norm": 0.2417856752872467, + "learning_rate": 4.520086813365496e-05, + "loss": 1.692, + "step": 17747 + }, + { + "epoch": 5.447513812154696, + "grad_norm": 0.3170008063316345, + "learning_rate": 4.519592054655086e-05, + "loss": 1.7565, + "step": 17748 + }, + { + "epoch": 5.4478207489257215, + "grad_norm": 0.20711660385131836, + "learning_rate": 4.519097300692348e-05, + "loss": 1.6708, + "step": 17749 + }, + { + "epoch": 5.448127685696747, + "grad_norm": 
0.2196272760629654, + "learning_rate": 4.5186025514821746e-05, + "loss": 1.7335, + "step": 17750 + }, + { + "epoch": 5.448434622467771, + "grad_norm": 0.27563074231147766, + "learning_rate": 4.5181078070294505e-05, + "loss": 1.7383, + "step": 17751 + }, + { + "epoch": 5.448741559238797, + "grad_norm": 0.185418501496315, + "learning_rate": 4.517613067339068e-05, + "loss": 1.6841, + "step": 17752 + }, + { + "epoch": 5.449048496009822, + "grad_norm": 0.26787856221199036, + "learning_rate": 4.517118332415915e-05, + "loss": 1.7733, + "step": 17753 + }, + { + "epoch": 5.449355432780847, + "grad_norm": 0.22114823758602142, + "learning_rate": 4.516623602264885e-05, + "loss": 1.7153, + "step": 17754 + }, + { + "epoch": 5.449662369551873, + "grad_norm": 0.23090483248233795, + "learning_rate": 4.51612887689086e-05, + "loss": 1.7063, + "step": 17755 + }, + { + "epoch": 5.449969306322897, + "grad_norm": 0.3227362632751465, + "learning_rate": 4.515634156298736e-05, + "loss": 1.7528, + "step": 17756 + }, + { + "epoch": 5.4502762430939224, + "grad_norm": 0.24202494323253632, + "learning_rate": 4.515139440493397e-05, + "loss": 1.8119, + "step": 17757 + }, + { + "epoch": 5.450583179864948, + "grad_norm": 0.3778383731842041, + "learning_rate": 4.5146447294797356e-05, + "loss": 1.7589, + "step": 17758 + }, + { + "epoch": 5.450890116635973, + "grad_norm": 0.3726772964000702, + "learning_rate": 4.51415002326264e-05, + "loss": 1.7095, + "step": 17759 + }, + { + "epoch": 5.4511970534069984, + "grad_norm": 0.2424323409795761, + "learning_rate": 4.5136553218469966e-05, + "loss": 1.7374, + "step": 17760 + }, + { + "epoch": 5.451503990178024, + "grad_norm": 0.4347550570964813, + "learning_rate": 4.513160625237699e-05, + "loss": 1.8339, + "step": 17761 + }, + { + "epoch": 5.451810926949048, + "grad_norm": 0.2556018829345703, + "learning_rate": 4.512665933439631e-05, + "loss": 1.7024, + "step": 17762 + }, + { + "epoch": 5.452117863720074, + "grad_norm": 0.36380240321159363, + "learning_rate": 
4.512171246457685e-05, + "loss": 1.7706, + "step": 17763 + }, + { + "epoch": 5.452424800491099, + "grad_norm": 0.42120790481567383, + "learning_rate": 4.5116765642967476e-05, + "loss": 1.7609, + "step": 17764 + }, + { + "epoch": 5.452731737262124, + "grad_norm": 0.20573028922080994, + "learning_rate": 4.51118188696171e-05, + "loss": 1.7521, + "step": 17765 + }, + { + "epoch": 5.45303867403315, + "grad_norm": 0.39001402258872986, + "learning_rate": 4.510687214457458e-05, + "loss": 1.7097, + "step": 17766 + }, + { + "epoch": 5.453345610804174, + "grad_norm": 0.2778739333152771, + "learning_rate": 4.510192546788884e-05, + "loss": 1.7677, + "step": 17767 + }, + { + "epoch": 5.453652547575199, + "grad_norm": 0.2500934600830078, + "learning_rate": 4.509697883960872e-05, + "loss": 1.7322, + "step": 17768 + }, + { + "epoch": 5.453959484346225, + "grad_norm": 0.23733557760715485, + "learning_rate": 4.509203225978314e-05, + "loss": 1.7426, + "step": 17769 + }, + { + "epoch": 5.45426642111725, + "grad_norm": 0.20033739507198334, + "learning_rate": 4.508708572846096e-05, + "loss": 1.7093, + "step": 17770 + }, + { + "epoch": 5.454573357888275, + "grad_norm": 0.202667698264122, + "learning_rate": 4.508213924569111e-05, + "loss": 1.6807, + "step": 17771 + }, + { + "epoch": 5.4548802946593, + "grad_norm": 0.1980566531419754, + "learning_rate": 4.507719281152241e-05, + "loss": 1.7102, + "step": 17772 + }, + { + "epoch": 5.455187231430325, + "grad_norm": 0.20612162351608276, + "learning_rate": 4.507224642600381e-05, + "loss": 1.7692, + "step": 17773 + }, + { + "epoch": 5.4554941682013505, + "grad_norm": 0.22859175503253937, + "learning_rate": 4.506730008918412e-05, + "loss": 1.7887, + "step": 17774 + }, + { + "epoch": 5.455801104972376, + "grad_norm": 0.19720709323883057, + "learning_rate": 4.5062353801112285e-05, + "loss": 1.7557, + "step": 17775 + }, + { + "epoch": 5.456108041743401, + "grad_norm": 0.23289217054843903, + "learning_rate": 4.505740756183717e-05, + "loss": 1.7023, + 
"step": 17776 + }, + { + "epoch": 5.456414978514426, + "grad_norm": 0.2120361477136612, + "learning_rate": 4.505246137140763e-05, + "loss": 1.7249, + "step": 17777 + }, + { + "epoch": 5.456721915285451, + "grad_norm": 0.2094341218471527, + "learning_rate": 4.504751522987259e-05, + "loss": 1.7586, + "step": 17778 + }, + { + "epoch": 5.457028852056476, + "grad_norm": 0.22361092269420624, + "learning_rate": 4.504256913728088e-05, + "loss": 1.737, + "step": 17779 + }, + { + "epoch": 5.457335788827502, + "grad_norm": 0.2100353240966797, + "learning_rate": 4.5037623093681424e-05, + "loss": 1.704, + "step": 17780 + }, + { + "epoch": 5.457642725598527, + "grad_norm": 0.20550231635570526, + "learning_rate": 4.503267709912308e-05, + "loss": 1.7732, + "step": 17781 + }, + { + "epoch": 5.457949662369552, + "grad_norm": 0.22843749821186066, + "learning_rate": 4.502773115365474e-05, + "loss": 1.6916, + "step": 17782 + }, + { + "epoch": 5.458256599140577, + "grad_norm": 0.2351907640695572, + "learning_rate": 4.502278525732526e-05, + "loss": 1.8043, + "step": 17783 + }, + { + "epoch": 5.458563535911602, + "grad_norm": 0.271028071641922, + "learning_rate": 4.501783941018355e-05, + "loss": 1.7665, + "step": 17784 + }, + { + "epoch": 5.458870472682627, + "grad_norm": 0.1974802166223526, + "learning_rate": 4.501289361227846e-05, + "loss": 1.718, + "step": 17785 + }, + { + "epoch": 5.459177409453653, + "grad_norm": 0.23726068437099457, + "learning_rate": 4.5007947863658884e-05, + "loss": 1.7507, + "step": 17786 + }, + { + "epoch": 5.459484346224678, + "grad_norm": 0.2112259715795517, + "learning_rate": 4.5003002164373684e-05, + "loss": 1.8116, + "step": 17787 + }, + { + "epoch": 5.4597912829957025, + "grad_norm": 0.2676105201244354, + "learning_rate": 4.4998056514471764e-05, + "loss": 1.7013, + "step": 17788 + }, + { + "epoch": 5.460098219766728, + "grad_norm": 0.2735576033592224, + "learning_rate": 4.4993110914001956e-05, + "loss": 1.7516, + "step": 17789 + }, + { + "epoch": 
5.460405156537753, + "grad_norm": 0.1925152987241745, + "learning_rate": 4.498816536301319e-05, + "loss": 1.7018, + "step": 17790 + }, + { + "epoch": 5.4607120933087785, + "grad_norm": 0.25037717819213867, + "learning_rate": 4.498321986155429e-05, + "loss": 1.7207, + "step": 17791 + }, + { + "epoch": 5.461019030079804, + "grad_norm": 0.20481008291244507, + "learning_rate": 4.497827440967415e-05, + "loss": 1.6988, + "step": 17792 + }, + { + "epoch": 5.461325966850829, + "grad_norm": 0.19434049725532532, + "learning_rate": 4.4973329007421673e-05, + "loss": 1.7363, + "step": 17793 + }, + { + "epoch": 5.461632903621854, + "grad_norm": 0.21797434985637665, + "learning_rate": 4.496838365484567e-05, + "loss": 1.7218, + "step": 17794 + }, + { + "epoch": 5.461939840392879, + "grad_norm": 0.18477453291416168, + "learning_rate": 4.496343835199508e-05, + "loss": 1.7204, + "step": 17795 + }, + { + "epoch": 5.462246777163904, + "grad_norm": 0.21657803654670715, + "learning_rate": 4.495849309891872e-05, + "loss": 1.7671, + "step": 17796 + }, + { + "epoch": 5.46255371393493, + "grad_norm": 0.21027342975139618, + "learning_rate": 4.495354789566549e-05, + "loss": 1.7424, + "step": 17797 + }, + { + "epoch": 5.462860650705955, + "grad_norm": 0.2016189992427826, + "learning_rate": 4.4948602742284256e-05, + "loss": 1.7706, + "step": 17798 + }, + { + "epoch": 5.463167587476979, + "grad_norm": 0.2155935913324356, + "learning_rate": 4.494365763882391e-05, + "loss": 1.7314, + "step": 17799 + }, + { + "epoch": 5.463474524248005, + "grad_norm": 0.22079701721668243, + "learning_rate": 4.493871258533328e-05, + "loss": 1.7938, + "step": 17800 + }, + { + "epoch": 5.46378146101903, + "grad_norm": 0.1907699704170227, + "learning_rate": 4.4933767581861283e-05, + "loss": 1.6958, + "step": 17801 + }, + { + "epoch": 5.464088397790055, + "grad_norm": 0.2784879207611084, + "learning_rate": 4.4928822628456735e-05, + "loss": 1.7285, + "step": 17802 + }, + { + "epoch": 5.464395334561081, + "grad_norm": 
0.29470255970954895, + "learning_rate": 4.492387772516855e-05, + "loss": 1.7363, + "step": 17803 + }, + { + "epoch": 5.464702271332105, + "grad_norm": 0.21387436985969543, + "learning_rate": 4.4918932872045575e-05, + "loss": 1.7414, + "step": 17804 + }, + { + "epoch": 5.4650092081031305, + "grad_norm": 0.3102552890777588, + "learning_rate": 4.49139880691367e-05, + "loss": 1.7359, + "step": 17805 + }, + { + "epoch": 5.465316144874156, + "grad_norm": 0.2312939465045929, + "learning_rate": 4.490904331649075e-05, + "loss": 1.7609, + "step": 17806 + }, + { + "epoch": 5.465623081645181, + "grad_norm": 0.323913037776947, + "learning_rate": 4.4904098614156645e-05, + "loss": 1.7693, + "step": 17807 + }, + { + "epoch": 5.4659300184162065, + "grad_norm": 0.2975599467754364, + "learning_rate": 4.48991539621832e-05, + "loss": 1.7506, + "step": 17808 + }, + { + "epoch": 5.466236955187231, + "grad_norm": 0.24702571332454681, + "learning_rate": 4.4894209360619316e-05, + "loss": 1.8258, + "step": 17809 + }, + { + "epoch": 5.466543891958256, + "grad_norm": 0.29016581177711487, + "learning_rate": 4.488926480951386e-05, + "loss": 1.7096, + "step": 17810 + }, + { + "epoch": 5.466850828729282, + "grad_norm": 0.2194555252790451, + "learning_rate": 4.488432030891566e-05, + "loss": 1.788, + "step": 17811 + }, + { + "epoch": 5.467157765500307, + "grad_norm": 0.2504041790962219, + "learning_rate": 4.487937585887363e-05, + "loss": 1.7672, + "step": 17812 + }, + { + "epoch": 5.467464702271332, + "grad_norm": 0.2362445741891861, + "learning_rate": 4.487443145943659e-05, + "loss": 1.7426, + "step": 17813 + }, + { + "epoch": 5.467771639042358, + "grad_norm": 0.20075896382331848, + "learning_rate": 4.486948711065343e-05, + "loss": 1.7406, + "step": 17814 + }, + { + "epoch": 5.468078575813382, + "grad_norm": 0.2219153791666031, + "learning_rate": 4.486454281257299e-05, + "loss": 1.683, + "step": 17815 + }, + { + "epoch": 5.468385512584407, + "grad_norm": 0.22551953792572021, + "learning_rate": 
4.4859598565244176e-05, + "loss": 1.7896, + "step": 17816 + }, + { + "epoch": 5.468692449355433, + "grad_norm": 0.2385476976633072, + "learning_rate": 4.48546543687158e-05, + "loss": 1.7799, + "step": 17817 + }, + { + "epoch": 5.468999386126458, + "grad_norm": 0.24263370037078857, + "learning_rate": 4.4849710223036764e-05, + "loss": 1.682, + "step": 17818 + }, + { + "epoch": 5.469306322897483, + "grad_norm": 0.24301160871982574, + "learning_rate": 4.484476612825589e-05, + "loss": 1.8121, + "step": 17819 + }, + { + "epoch": 5.469613259668508, + "grad_norm": 0.2516932487487793, + "learning_rate": 4.483982208442207e-05, + "loss": 1.7344, + "step": 17820 + }, + { + "epoch": 5.469920196439533, + "grad_norm": 0.24309395253658295, + "learning_rate": 4.4834878091584156e-05, + "loss": 1.7746, + "step": 17821 + }, + { + "epoch": 5.4702271332105585, + "grad_norm": 0.24711866676807404, + "learning_rate": 4.4829934149790996e-05, + "loss": 1.7887, + "step": 17822 + }, + { + "epoch": 5.470534069981584, + "grad_norm": 0.2923797369003296, + "learning_rate": 4.4824990259091445e-05, + "loss": 1.7017, + "step": 17823 + }, + { + "epoch": 5.470841006752609, + "grad_norm": 0.21658629179000854, + "learning_rate": 4.482004641953441e-05, + "loss": 1.725, + "step": 17824 + }, + { + "epoch": 5.4711479435236345, + "grad_norm": 0.233424574136734, + "learning_rate": 4.481510263116868e-05, + "loss": 1.74, + "step": 17825 + }, + { + "epoch": 5.471454880294659, + "grad_norm": 0.28997600078582764, + "learning_rate": 4.481015889404315e-05, + "loss": 1.8418, + "step": 17826 + }, + { + "epoch": 5.471761817065684, + "grad_norm": 0.2245558649301529, + "learning_rate": 4.480521520820669e-05, + "loss": 1.7519, + "step": 17827 + }, + { + "epoch": 5.47206875383671, + "grad_norm": 0.21008887887001038, + "learning_rate": 4.480027157370812e-05, + "loss": 1.6977, + "step": 17828 + }, + { + "epoch": 5.472375690607735, + "grad_norm": 0.1990261971950531, + "learning_rate": 4.479532799059633e-05, + "loss": 1.7004, + 
"step": 17829 + }, + { + "epoch": 5.47268262737876, + "grad_norm": 0.2354540079832077, + "learning_rate": 4.479038445892014e-05, + "loss": 1.7755, + "step": 17830 + }, + { + "epoch": 5.472989564149785, + "grad_norm": 0.21904973685741425, + "learning_rate": 4.478544097872843e-05, + "loss": 1.8328, + "step": 17831 + }, + { + "epoch": 5.47329650092081, + "grad_norm": 0.21188503503799438, + "learning_rate": 4.4780497550070055e-05, + "loss": 1.7105, + "step": 17832 + }, + { + "epoch": 5.473603437691835, + "grad_norm": 0.2196870595216751, + "learning_rate": 4.477555417299386e-05, + "loss": 1.7261, + "step": 17833 + }, + { + "epoch": 5.473910374462861, + "grad_norm": 0.24522331357002258, + "learning_rate": 4.477061084754869e-05, + "loss": 1.8101, + "step": 17834 + }, + { + "epoch": 5.474217311233886, + "grad_norm": 0.24073927104473114, + "learning_rate": 4.476566757378343e-05, + "loss": 1.8295, + "step": 17835 + }, + { + "epoch": 5.474524248004911, + "grad_norm": 0.3724605143070221, + "learning_rate": 4.476072435174689e-05, + "loss": 1.7785, + "step": 17836 + }, + { + "epoch": 5.474831184775936, + "grad_norm": 0.25552257895469666, + "learning_rate": 4.475578118148797e-05, + "loss": 1.6978, + "step": 17837 + }, + { + "epoch": 5.475138121546961, + "grad_norm": 0.22402255237102509, + "learning_rate": 4.475083806305546e-05, + "loss": 1.697, + "step": 17838 + }, + { + "epoch": 5.475445058317987, + "grad_norm": 0.25869324803352356, + "learning_rate": 4.474589499649826e-05, + "loss": 1.7026, + "step": 17839 + }, + { + "epoch": 5.475751995089012, + "grad_norm": 0.249742329120636, + "learning_rate": 4.47409519818652e-05, + "loss": 1.7738, + "step": 17840 + }, + { + "epoch": 5.476058931860037, + "grad_norm": 0.28722140192985535, + "learning_rate": 4.473600901920515e-05, + "loss": 1.7555, + "step": 17841 + }, + { + "epoch": 5.476365868631062, + "grad_norm": 0.250964879989624, + "learning_rate": 4.4731066108566926e-05, + "loss": 1.6951, + "step": 17842 + }, + { + "epoch": 
5.476672805402087, + "grad_norm": 0.20562006533145905, + "learning_rate": 4.472612324999942e-05, + "loss": 1.7109, + "step": 17843 + }, + { + "epoch": 5.476979742173112, + "grad_norm": 0.26964858174324036, + "learning_rate": 4.472118044355144e-05, + "loss": 1.7468, + "step": 17844 + }, + { + "epoch": 5.477286678944138, + "grad_norm": 0.25700438022613525, + "learning_rate": 4.471623768927184e-05, + "loss": 1.7046, + "step": 17845 + }, + { + "epoch": 5.477593615715163, + "grad_norm": 0.2152809500694275, + "learning_rate": 4.47112949872095e-05, + "loss": 1.7464, + "step": 17846 + }, + { + "epoch": 5.4779005524861875, + "grad_norm": 0.26429688930511475, + "learning_rate": 4.470635233741321e-05, + "loss": 1.7629, + "step": 17847 + }, + { + "epoch": 5.478207489257213, + "grad_norm": 0.18546637892723083, + "learning_rate": 4.470140973993188e-05, + "loss": 1.7143, + "step": 17848 + }, + { + "epoch": 5.478514426028238, + "grad_norm": 0.1927761435508728, + "learning_rate": 4.46964671948143e-05, + "loss": 1.6919, + "step": 17849 + }, + { + "epoch": 5.4788213627992635, + "grad_norm": 0.21581199765205383, + "learning_rate": 4.469152470210935e-05, + "loss": 1.7596, + "step": 17850 + }, + { + "epoch": 5.479128299570289, + "grad_norm": 0.20244133472442627, + "learning_rate": 4.468658226186586e-05, + "loss": 1.7372, + "step": 17851 + }, + { + "epoch": 5.479435236341313, + "grad_norm": 0.2467198520898819, + "learning_rate": 4.468163987413269e-05, + "loss": 1.7361, + "step": 17852 + }, + { + "epoch": 5.479742173112339, + "grad_norm": 0.22134411334991455, + "learning_rate": 4.467669753895866e-05, + "loss": 1.7276, + "step": 17853 + }, + { + "epoch": 5.480049109883364, + "grad_norm": 0.1953750103712082, + "learning_rate": 4.4671755256392636e-05, + "loss": 1.6931, + "step": 17854 + }, + { + "epoch": 5.480356046654389, + "grad_norm": 0.21492068469524384, + "learning_rate": 4.466681302648343e-05, + "loss": 1.7437, + "step": 17855 + }, + { + "epoch": 5.480662983425415, + "grad_norm": 
0.24377848207950592, + "learning_rate": 4.466187084927993e-05, + "loss": 1.7869, + "step": 17856 + }, + { + "epoch": 5.48096992019644, + "grad_norm": 0.23674219846725464, + "learning_rate": 4.465692872483093e-05, + "loss": 1.8142, + "step": 17857 + }, + { + "epoch": 5.481276856967464, + "grad_norm": 0.25036486983299255, + "learning_rate": 4.4651986653185304e-05, + "loss": 1.8075, + "step": 17858 + }, + { + "epoch": 5.48158379373849, + "grad_norm": 0.32649150490760803, + "learning_rate": 4.4647044634391867e-05, + "loss": 1.7177, + "step": 17859 + }, + { + "epoch": 5.481890730509515, + "grad_norm": 0.20300604403018951, + "learning_rate": 4.46421026684995e-05, + "loss": 1.6912, + "step": 17860 + }, + { + "epoch": 5.48219766728054, + "grad_norm": 0.24630679190158844, + "learning_rate": 4.4637160755557e-05, + "loss": 1.8312, + "step": 17861 + }, + { + "epoch": 5.482504604051566, + "grad_norm": 0.2263093739748001, + "learning_rate": 4.46322188956132e-05, + "loss": 1.7214, + "step": 17862 + }, + { + "epoch": 5.48281154082259, + "grad_norm": 0.22949177026748657, + "learning_rate": 4.462727708871699e-05, + "loss": 1.6882, + "step": 17863 + }, + { + "epoch": 5.4831184775936155, + "grad_norm": 0.23389381170272827, + "learning_rate": 4.4622335334917156e-05, + "loss": 1.7613, + "step": 17864 + }, + { + "epoch": 5.483425414364641, + "grad_norm": 0.2259683907032013, + "learning_rate": 4.461739363426257e-05, + "loss": 1.7021, + "step": 17865 + }, + { + "epoch": 5.483732351135666, + "grad_norm": 0.3213486969470978, + "learning_rate": 4.4612451986802036e-05, + "loss": 1.7469, + "step": 17866 + }, + { + "epoch": 5.4840392879066915, + "grad_norm": 0.3415670096874237, + "learning_rate": 4.4607510392584426e-05, + "loss": 1.7605, + "step": 17867 + }, + { + "epoch": 5.484346224677717, + "grad_norm": 0.2079494297504425, + "learning_rate": 4.460256885165855e-05, + "loss": 1.7832, + "step": 17868 + }, + { + "epoch": 5.484653161448741, + "grad_norm": 0.30334988236427307, + "learning_rate": 
4.459762736407327e-05, + "loss": 1.6825, + "step": 17869 + }, + { + "epoch": 5.484960098219767, + "grad_norm": 0.22320730984210968, + "learning_rate": 4.4592685929877374e-05, + "loss": 1.7452, + "step": 17870 + }, + { + "epoch": 5.485267034990792, + "grad_norm": 0.25325682759284973, + "learning_rate": 4.458774454911975e-05, + "loss": 1.7359, + "step": 17871 + }, + { + "epoch": 5.485573971761817, + "grad_norm": 0.305501788854599, + "learning_rate": 4.458280322184919e-05, + "loss": 1.7161, + "step": 17872 + }, + { + "epoch": 5.485880908532843, + "grad_norm": 0.19486182928085327, + "learning_rate": 4.457786194811455e-05, + "loss": 1.7097, + "step": 17873 + }, + { + "epoch": 5.486187845303867, + "grad_norm": 0.3306363821029663, + "learning_rate": 4.457292072796465e-05, + "loss": 1.7653, + "step": 17874 + }, + { + "epoch": 5.486494782074892, + "grad_norm": 0.25172874331474304, + "learning_rate": 4.456797956144835e-05, + "loss": 1.7289, + "step": 17875 + }, + { + "epoch": 5.486801718845918, + "grad_norm": 0.24508661031723022, + "learning_rate": 4.456303844861444e-05, + "loss": 1.7255, + "step": 17876 + }, + { + "epoch": 5.487108655616943, + "grad_norm": 0.3043360114097595, + "learning_rate": 4.455809738951178e-05, + "loss": 1.7852, + "step": 17877 + }, + { + "epoch": 5.487415592387968, + "grad_norm": 0.22181758284568787, + "learning_rate": 4.4553156384189186e-05, + "loss": 1.7887, + "step": 17878 + }, + { + "epoch": 5.487722529158993, + "grad_norm": 0.2174321413040161, + "learning_rate": 4.454821543269549e-05, + "loss": 1.7024, + "step": 17879 + }, + { + "epoch": 5.488029465930018, + "grad_norm": 0.19634750485420227, + "learning_rate": 4.4543274535079535e-05, + "loss": 1.7451, + "step": 17880 + }, + { + "epoch": 5.4883364027010435, + "grad_norm": 0.20481908321380615, + "learning_rate": 4.4538333691390125e-05, + "loss": 1.7068, + "step": 17881 + }, + { + "epoch": 5.488643339472069, + "grad_norm": 0.2025458663702011, + "learning_rate": 4.453339290167612e-05, + "loss": 
1.72, + "step": 17882 + }, + { + "epoch": 5.488950276243094, + "grad_norm": 0.21013019979000092, + "learning_rate": 4.452845216598632e-05, + "loss": 1.7113, + "step": 17883 + }, + { + "epoch": 5.4892572130141195, + "grad_norm": 0.2057499885559082, + "learning_rate": 4.452351148436956e-05, + "loss": 1.7007, + "step": 17884 + }, + { + "epoch": 5.489564149785144, + "grad_norm": 0.19957664608955383, + "learning_rate": 4.4518570856874666e-05, + "loss": 1.6999, + "step": 17885 + }, + { + "epoch": 5.489871086556169, + "grad_norm": 0.22609412670135498, + "learning_rate": 4.451363028355048e-05, + "loss": 1.8124, + "step": 17886 + }, + { + "epoch": 5.490178023327195, + "grad_norm": 0.27350863814353943, + "learning_rate": 4.4508689764445805e-05, + "loss": 1.8042, + "step": 17887 + }, + { + "epoch": 5.49048496009822, + "grad_norm": 0.23416854441165924, + "learning_rate": 4.450374929960949e-05, + "loss": 1.7607, + "step": 17888 + }, + { + "epoch": 5.490791896869245, + "grad_norm": 0.2891421318054199, + "learning_rate": 4.449880888909033e-05, + "loss": 1.7419, + "step": 17889 + }, + { + "epoch": 5.49109883364027, + "grad_norm": 0.2458745837211609, + "learning_rate": 4.449386853293717e-05, + "loss": 1.7234, + "step": 17890 + }, + { + "epoch": 5.491405770411295, + "grad_norm": 0.23390449583530426, + "learning_rate": 4.4488928231198826e-05, + "loss": 1.7482, + "step": 17891 + }, + { + "epoch": 5.49171270718232, + "grad_norm": 0.3509657084941864, + "learning_rate": 4.448398798392414e-05, + "loss": 1.7639, + "step": 17892 + }, + { + "epoch": 5.492019643953346, + "grad_norm": 0.2487955242395401, + "learning_rate": 4.4479047791161916e-05, + "loss": 1.7163, + "step": 17893 + }, + { + "epoch": 5.492326580724371, + "grad_norm": 0.22630274295806885, + "learning_rate": 4.4474107652960956e-05, + "loss": 1.7449, + "step": 17894 + }, + { + "epoch": 5.4926335174953955, + "grad_norm": 0.25909537076950073, + "learning_rate": 4.446916756937012e-05, + "loss": 1.7396, + "step": 17895 + }, + { + 
"epoch": 5.492940454266421, + "grad_norm": 0.29732683300971985, + "learning_rate": 4.446422754043819e-05, + "loss": 1.8109, + "step": 17896 + }, + { + "epoch": 5.493247391037446, + "grad_norm": 0.22436772286891937, + "learning_rate": 4.4459287566214035e-05, + "loss": 1.7657, + "step": 17897 + }, + { + "epoch": 5.4935543278084715, + "grad_norm": 0.24584892392158508, + "learning_rate": 4.445434764674643e-05, + "loss": 1.73, + "step": 17898 + }, + { + "epoch": 5.493861264579497, + "grad_norm": 0.27446454763412476, + "learning_rate": 4.444940778208423e-05, + "loss": 1.7428, + "step": 17899 + }, + { + "epoch": 5.494168201350522, + "grad_norm": 0.20442110300064087, + "learning_rate": 4.4444467972276215e-05, + "loss": 1.6911, + "step": 17900 + }, + { + "epoch": 5.494475138121547, + "grad_norm": 0.23089268803596497, + "learning_rate": 4.4439528217371236e-05, + "loss": 1.7192, + "step": 17901 + }, + { + "epoch": 5.494782074892572, + "grad_norm": 0.19402450323104858, + "learning_rate": 4.443458851741808e-05, + "loss": 1.7304, + "step": 17902 + }, + { + "epoch": 5.495089011663597, + "grad_norm": 0.2310219705104828, + "learning_rate": 4.442964887246561e-05, + "loss": 1.6963, + "step": 17903 + }, + { + "epoch": 5.495395948434623, + "grad_norm": 0.25573140382766724, + "learning_rate": 4.44247092825626e-05, + "loss": 1.7781, + "step": 17904 + }, + { + "epoch": 5.495702885205648, + "grad_norm": 0.20298753678798676, + "learning_rate": 4.4419769747757894e-05, + "loss": 1.763, + "step": 17905 + }, + { + "epoch": 5.496009821976672, + "grad_norm": 0.22243307530879974, + "learning_rate": 4.441483026810027e-05, + "loss": 1.7345, + "step": 17906 + }, + { + "epoch": 5.496316758747698, + "grad_norm": 0.19801411032676697, + "learning_rate": 4.4409890843638584e-05, + "loss": 1.7504, + "step": 17907 + }, + { + "epoch": 5.496623695518723, + "grad_norm": 0.2804374396800995, + "learning_rate": 4.440495147442162e-05, + "loss": 1.7985, + "step": 17908 + }, + { + "epoch": 5.496930632289748, + 
"grad_norm": 0.21824021637439728, + "learning_rate": 4.440001216049822e-05, + "loss": 1.6703, + "step": 17909 + }, + { + "epoch": 5.497237569060774, + "grad_norm": 0.23335935175418854, + "learning_rate": 4.439507290191719e-05, + "loss": 1.7426, + "step": 17910 + }, + { + "epoch": 5.497544505831799, + "grad_norm": 0.2093769609928131, + "learning_rate": 4.4390133698727315e-05, + "loss": 1.7178, + "step": 17911 + }, + { + "epoch": 5.4978514426028235, + "grad_norm": 0.18354324996471405, + "learning_rate": 4.438519455097743e-05, + "loss": 1.6849, + "step": 17912 + }, + { + "epoch": 5.498158379373849, + "grad_norm": 0.26826491951942444, + "learning_rate": 4.438025545871633e-05, + "loss": 1.7804, + "step": 17913 + }, + { + "epoch": 5.498465316144874, + "grad_norm": 0.29171738028526306, + "learning_rate": 4.437531642199288e-05, + "loss": 1.764, + "step": 17914 + }, + { + "epoch": 5.4987722529158995, + "grad_norm": 0.17870590090751648, + "learning_rate": 4.437037744085581e-05, + "loss": 1.6789, + "step": 17915 + }, + { + "epoch": 5.499079189686925, + "grad_norm": 0.25412192940711975, + "learning_rate": 4.4365438515354e-05, + "loss": 1.7536, + "step": 17916 + }, + { + "epoch": 5.499386126457949, + "grad_norm": 0.24465163052082062, + "learning_rate": 4.4360499645536203e-05, + "loss": 1.7582, + "step": 17917 + }, + { + "epoch": 5.499693063228975, + "grad_norm": 0.21248452365398407, + "learning_rate": 4.4355560831451264e-05, + "loss": 1.7209, + "step": 17918 + }, + { + "epoch": 5.5, + "grad_norm": 0.21018685400485992, + "learning_rate": 4.435062207314797e-05, + "loss": 1.7461, + "step": 17919 + }, + { + "epoch": 5.500306936771025, + "grad_norm": 0.1880551278591156, + "learning_rate": 4.434568337067517e-05, + "loss": 1.6818, + "step": 17920 + }, + { + "epoch": 5.500613873542051, + "grad_norm": 0.2224894016981125, + "learning_rate": 4.434074472408161e-05, + "loss": 1.8211, + "step": 17921 + }, + { + "epoch": 5.500920810313076, + "grad_norm": 0.19419749081134796, + 
"learning_rate": 4.433580613341615e-05, + "loss": 1.7625, + "step": 17922 + }, + { + "epoch": 5.5012277470841005, + "grad_norm": 0.2167430967092514, + "learning_rate": 4.433086759872756e-05, + "loss": 1.745, + "step": 17923 + }, + { + "epoch": 5.501534683855126, + "grad_norm": 0.1926383525133133, + "learning_rate": 4.4325929120064665e-05, + "loss": 1.7353, + "step": 17924 + }, + { + "epoch": 5.501841620626151, + "grad_norm": 0.22943224012851715, + "learning_rate": 4.432099069747625e-05, + "loss": 1.6903, + "step": 17925 + }, + { + "epoch": 5.5021485573971765, + "grad_norm": 0.18218693137168884, + "learning_rate": 4.431605233101116e-05, + "loss": 1.742, + "step": 17926 + }, + { + "epoch": 5.502455494168201, + "grad_norm": 0.2660788893699646, + "learning_rate": 4.431111402071817e-05, + "loss": 1.7208, + "step": 17927 + }, + { + "epoch": 5.502762430939226, + "grad_norm": 0.20015788078308105, + "learning_rate": 4.430617576664606e-05, + "loss": 1.721, + "step": 17928 + }, + { + "epoch": 5.503069367710252, + "grad_norm": 0.20011179149150848, + "learning_rate": 4.430123756884368e-05, + "loss": 1.7488, + "step": 17929 + }, + { + "epoch": 5.503376304481277, + "grad_norm": 0.22541452944278717, + "learning_rate": 4.429629942735979e-05, + "loss": 1.7997, + "step": 17930 + }, + { + "epoch": 5.503683241252302, + "grad_norm": 0.21067193150520325, + "learning_rate": 4.4291361342243236e-05, + "loss": 1.6652, + "step": 17931 + }, + { + "epoch": 5.503990178023328, + "grad_norm": 0.38401395082473755, + "learning_rate": 4.428642331354278e-05, + "loss": 1.815, + "step": 17932 + }, + { + "epoch": 5.504297114794352, + "grad_norm": 0.22600100934505463, + "learning_rate": 4.428148534130725e-05, + "loss": 1.7593, + "step": 17933 + }, + { + "epoch": 5.504604051565377, + "grad_norm": 0.21340666711330414, + "learning_rate": 4.427654742558542e-05, + "loss": 1.7447, + "step": 17934 + }, + { + "epoch": 5.504910988336403, + "grad_norm": 0.20676501095294952, + "learning_rate": 4.427160956642611e-05, 
+ "loss": 1.7174, + "step": 17935 + }, + { + "epoch": 5.505217925107428, + "grad_norm": 0.2374252825975418, + "learning_rate": 4.42666717638781e-05, + "loss": 1.703, + "step": 17936 + }, + { + "epoch": 5.505524861878453, + "grad_norm": 0.20975756645202637, + "learning_rate": 4.426173401799022e-05, + "loss": 1.7076, + "step": 17937 + }, + { + "epoch": 5.505831798649478, + "grad_norm": 0.23778517544269562, + "learning_rate": 4.4256796328811226e-05, + "loss": 1.7647, + "step": 17938 + }, + { + "epoch": 5.506138735420503, + "grad_norm": 0.2088557481765747, + "learning_rate": 4.425185869638996e-05, + "loss": 1.764, + "step": 17939 + }, + { + "epoch": 5.5064456721915285, + "grad_norm": 0.26953455805778503, + "learning_rate": 4.424692112077518e-05, + "loss": 1.7351, + "step": 17940 + }, + { + "epoch": 5.506752608962554, + "grad_norm": 0.2762589454650879, + "learning_rate": 4.42419836020157e-05, + "loss": 1.7051, + "step": 17941 + }, + { + "epoch": 5.507059545733579, + "grad_norm": 0.19611702859401703, + "learning_rate": 4.4237046140160306e-05, + "loss": 1.7445, + "step": 17942 + }, + { + "epoch": 5.5073664825046045, + "grad_norm": 0.2708270251750946, + "learning_rate": 4.4232108735257824e-05, + "loss": 1.7284, + "step": 17943 + }, + { + "epoch": 5.507673419275629, + "grad_norm": 0.24194146692752838, + "learning_rate": 4.422717138735701e-05, + "loss": 1.7302, + "step": 17944 + }, + { + "epoch": 5.507980356046654, + "grad_norm": 0.21558286249637604, + "learning_rate": 4.422223409650666e-05, + "loss": 1.7435, + "step": 17945 + }, + { + "epoch": 5.50828729281768, + "grad_norm": 0.1842707246541977, + "learning_rate": 4.4217296862755597e-05, + "loss": 1.6579, + "step": 17946 + }, + { + "epoch": 5.508594229588705, + "grad_norm": 0.20211941003799438, + "learning_rate": 4.4212359686152576e-05, + "loss": 1.8017, + "step": 17947 + }, + { + "epoch": 5.50890116635973, + "grad_norm": 0.23749016225337982, + "learning_rate": 4.420742256674644e-05, + "loss": 1.6721, + "step": 17948 + }, + 
{ + "epoch": 5.509208103130755, + "grad_norm": 0.2076852172613144, + "learning_rate": 4.420248550458592e-05, + "loss": 1.7102, + "step": 17949 + }, + { + "epoch": 5.50951503990178, + "grad_norm": 0.2599447965621948, + "learning_rate": 4.419754849971986e-05, + "loss": 1.7819, + "step": 17950 + }, + { + "epoch": 5.509821976672805, + "grad_norm": 0.2017187476158142, + "learning_rate": 4.4192611552197e-05, + "loss": 1.6812, + "step": 17951 + }, + { + "epoch": 5.510128913443831, + "grad_norm": 0.21972116827964783, + "learning_rate": 4.418767466206617e-05, + "loss": 1.7122, + "step": 17952 + }, + { + "epoch": 5.510435850214856, + "grad_norm": 0.21750569343566895, + "learning_rate": 4.418273782937613e-05, + "loss": 1.7285, + "step": 17953 + }, + { + "epoch": 5.510742786985881, + "grad_norm": 0.19349125027656555, + "learning_rate": 4.417780105417572e-05, + "loss": 1.7383, + "step": 17954 + }, + { + "epoch": 5.511049723756906, + "grad_norm": 0.2094268798828125, + "learning_rate": 4.417286433651366e-05, + "loss": 1.7107, + "step": 17955 + }, + { + "epoch": 5.511356660527931, + "grad_norm": 0.2684331238269806, + "learning_rate": 4.41679276764388e-05, + "loss": 1.7336, + "step": 17956 + }, + { + "epoch": 5.5116635972989565, + "grad_norm": 0.27616915106773376, + "learning_rate": 4.416299107399987e-05, + "loss": 1.7439, + "step": 17957 + }, + { + "epoch": 5.511970534069982, + "grad_norm": 0.23874540627002716, + "learning_rate": 4.415805452924569e-05, + "loss": 1.7979, + "step": 17958 + }, + { + "epoch": 5.512277470841006, + "grad_norm": 0.21870921552181244, + "learning_rate": 4.415311804222503e-05, + "loss": 1.6674, + "step": 17959 + }, + { + "epoch": 5.512584407612032, + "grad_norm": 0.23042429983615875, + "learning_rate": 4.414818161298671e-05, + "loss": 1.7588, + "step": 17960 + }, + { + "epoch": 5.512891344383057, + "grad_norm": 0.2957153916358948, + "learning_rate": 4.4143245241579486e-05, + "loss": 1.8412, + "step": 17961 + }, + { + "epoch": 5.513198281154082, + 
"grad_norm": 0.28292644023895264, + "learning_rate": 4.413830892805213e-05, + "loss": 1.7915, + "step": 17962 + }, + { + "epoch": 5.513505217925108, + "grad_norm": 0.26526281237602234, + "learning_rate": 4.413337267245344e-05, + "loss": 1.7199, + "step": 17963 + }, + { + "epoch": 5.513812154696133, + "grad_norm": 0.41243693232536316, + "learning_rate": 4.4128436474832204e-05, + "loss": 1.7419, + "step": 17964 + }, + { + "epoch": 5.514119091467157, + "grad_norm": 0.2747771739959717, + "learning_rate": 4.4123500335237214e-05, + "loss": 1.7449, + "step": 17965 + }, + { + "epoch": 5.514426028238183, + "grad_norm": 0.25944122672080994, + "learning_rate": 4.4118564253717216e-05, + "loss": 1.7667, + "step": 17966 + }, + { + "epoch": 5.514732965009208, + "grad_norm": 0.32558533549308777, + "learning_rate": 4.411362823032103e-05, + "loss": 1.7292, + "step": 17967 + }, + { + "epoch": 5.515039901780233, + "grad_norm": 0.20190958678722382, + "learning_rate": 4.4108692265097404e-05, + "loss": 1.7529, + "step": 17968 + }, + { + "epoch": 5.515346838551259, + "grad_norm": 0.35485807061195374, + "learning_rate": 4.410375635809514e-05, + "loss": 1.7335, + "step": 17969 + }, + { + "epoch": 5.515653775322283, + "grad_norm": 0.2670159935951233, + "learning_rate": 4.409882050936301e-05, + "loss": 1.6789, + "step": 17970 + }, + { + "epoch": 5.5159607120933085, + "grad_norm": 0.19106578826904297, + "learning_rate": 4.409388471894981e-05, + "loss": 1.708, + "step": 17971 + }, + { + "epoch": 5.516267648864334, + "grad_norm": 0.2707268297672272, + "learning_rate": 4.4088948986904286e-05, + "loss": 1.7917, + "step": 17972 + }, + { + "epoch": 5.516574585635359, + "grad_norm": 0.2329230159521103, + "learning_rate": 4.408401331327525e-05, + "loss": 1.7378, + "step": 17973 + }, + { + "epoch": 5.5168815224063845, + "grad_norm": 0.22164998948574066, + "learning_rate": 4.4079077698111436e-05, + "loss": 1.7287, + "step": 17974 + }, + { + "epoch": 5.51718845917741, + "grad_norm": 0.25895699858665466, 
+ "learning_rate": 4.4074142141461665e-05, + "loss": 1.7158, + "step": 17975 + }, + { + "epoch": 5.517495395948434, + "grad_norm": 0.2617860436439514, + "learning_rate": 4.4069206643374695e-05, + "loss": 1.7767, + "step": 17976 + }, + { + "epoch": 5.51780233271946, + "grad_norm": 0.20443588495254517, + "learning_rate": 4.40642712038993e-05, + "loss": 1.7371, + "step": 17977 + }, + { + "epoch": 5.518109269490485, + "grad_norm": 0.26251545548439026, + "learning_rate": 4.4059335823084266e-05, + "loss": 1.8154, + "step": 17978 + }, + { + "epoch": 5.51841620626151, + "grad_norm": 0.2315993458032608, + "learning_rate": 4.405440050097833e-05, + "loss": 1.7426, + "step": 17979 + }, + { + "epoch": 5.518723143032536, + "grad_norm": 0.19467706978321075, + "learning_rate": 4.404946523763031e-05, + "loss": 1.7418, + "step": 17980 + }, + { + "epoch": 5.51903007980356, + "grad_norm": 0.2387837916612625, + "learning_rate": 4.4044530033088946e-05, + "loss": 1.7648, + "step": 17981 + }, + { + "epoch": 5.519337016574585, + "grad_norm": 0.21097531914710999, + "learning_rate": 4.403959488740306e-05, + "loss": 1.7198, + "step": 17982 + }, + { + "epoch": 5.519643953345611, + "grad_norm": 0.22303247451782227, + "learning_rate": 4.403465980062136e-05, + "loss": 1.7679, + "step": 17983 + }, + { + "epoch": 5.519950890116636, + "grad_norm": 0.19705620408058167, + "learning_rate": 4.4029724772792666e-05, + "loss": 1.7747, + "step": 17984 + }, + { + "epoch": 5.520257826887661, + "grad_norm": 0.20864570140838623, + "learning_rate": 4.4024789803965715e-05, + "loss": 1.6797, + "step": 17985 + }, + { + "epoch": 5.520564763658687, + "grad_norm": 0.1917724758386612, + "learning_rate": 4.401985489418931e-05, + "loss": 1.7246, + "step": 17986 + }, + { + "epoch": 5.520871700429711, + "grad_norm": 0.25668975710868835, + "learning_rate": 4.401492004351219e-05, + "loss": 1.7245, + "step": 17987 + }, + { + "epoch": 5.5211786372007365, + "grad_norm": 0.22576093673706055, + "learning_rate": 
4.4009985251983146e-05, + "loss": 1.6766, + "step": 17988 + }, + { + "epoch": 5.521485573971762, + "grad_norm": 0.18614664673805237, + "learning_rate": 4.400505051965093e-05, + "loss": 1.7379, + "step": 17989 + }, + { + "epoch": 5.521792510742787, + "grad_norm": 0.21472783386707306, + "learning_rate": 4.4000115846564335e-05, + "loss": 1.7203, + "step": 17990 + }, + { + "epoch": 5.5220994475138125, + "grad_norm": 0.201142817735672, + "learning_rate": 4.39951812327721e-05, + "loss": 1.7049, + "step": 17991 + }, + { + "epoch": 5.522406384284837, + "grad_norm": 0.193614661693573, + "learning_rate": 4.3990246678323e-05, + "loss": 1.6938, + "step": 17992 + }, + { + "epoch": 5.522713321055862, + "grad_norm": 0.23343239724636078, + "learning_rate": 4.398531218326582e-05, + "loss": 1.744, + "step": 17993 + }, + { + "epoch": 5.523020257826888, + "grad_norm": 0.26271605491638184, + "learning_rate": 4.3980377747649305e-05, + "loss": 1.7458, + "step": 17994 + }, + { + "epoch": 5.523327194597913, + "grad_norm": 0.2048577219247818, + "learning_rate": 4.397544337152223e-05, + "loss": 1.763, + "step": 17995 + }, + { + "epoch": 5.523634131368938, + "grad_norm": 0.27748194336891174, + "learning_rate": 4.397050905493334e-05, + "loss": 1.7346, + "step": 17996 + }, + { + "epoch": 5.523941068139964, + "grad_norm": 0.3040253520011902, + "learning_rate": 4.3965574797931417e-05, + "loss": 1.7396, + "step": 17997 + }, + { + "epoch": 5.524248004910988, + "grad_norm": 0.3310317397117615, + "learning_rate": 4.396064060056523e-05, + "loss": 1.8094, + "step": 17998 + }, + { + "epoch": 5.524554941682013, + "grad_norm": 0.21845392882823944, + "learning_rate": 4.395570646288352e-05, + "loss": 1.7013, + "step": 17999 + }, + { + "epoch": 5.524861878453039, + "grad_norm": 0.319876492023468, + "learning_rate": 4.395077238493506e-05, + "loss": 1.7985, + "step": 18000 + }, + { + "epoch": 5.525168815224064, + "grad_norm": 0.28261950612068176, + "learning_rate": 4.394583836676863e-05, + "loss": 1.7979, + 
"step": 18001 + }, + { + "epoch": 5.525475751995089, + "grad_norm": 0.20874030888080597, + "learning_rate": 4.394090440843296e-05, + "loss": 1.7363, + "step": 18002 + }, + { + "epoch": 5.525782688766114, + "grad_norm": 0.28587406873703003, + "learning_rate": 4.393597050997684e-05, + "loss": 1.6787, + "step": 18003 + }, + { + "epoch": 5.526089625537139, + "grad_norm": 0.2719021439552307, + "learning_rate": 4.393103667144899e-05, + "loss": 1.7625, + "step": 18004 + }, + { + "epoch": 5.526396562308165, + "grad_norm": 0.22485414147377014, + "learning_rate": 4.392610289289821e-05, + "loss": 1.6847, + "step": 18005 + }, + { + "epoch": 5.52670349907919, + "grad_norm": 0.3500347435474396, + "learning_rate": 4.392116917437322e-05, + "loss": 1.7244, + "step": 18006 + }, + { + "epoch": 5.527010435850215, + "grad_norm": 0.26308783888816833, + "learning_rate": 4.3916235515922836e-05, + "loss": 1.7738, + "step": 18007 + }, + { + "epoch": 5.52731737262124, + "grad_norm": 0.27030646800994873, + "learning_rate": 4.391130191759574e-05, + "loss": 1.7149, + "step": 18008 + }, + { + "epoch": 5.527624309392265, + "grad_norm": 0.4137318730354309, + "learning_rate": 4.390636837944076e-05, + "loss": 1.7581, + "step": 18009 + }, + { + "epoch": 5.52793124616329, + "grad_norm": 0.2462068647146225, + "learning_rate": 4.390143490150659e-05, + "loss": 1.7767, + "step": 18010 + }, + { + "epoch": 5.528238182934316, + "grad_norm": 0.27424392104148865, + "learning_rate": 4.3896501483842036e-05, + "loss": 1.7701, + "step": 18011 + }, + { + "epoch": 5.528545119705341, + "grad_norm": 0.31268683075904846, + "learning_rate": 4.389156812649583e-05, + "loss": 1.7342, + "step": 18012 + }, + { + "epoch": 5.5288520564763655, + "grad_norm": 0.20428471267223358, + "learning_rate": 4.388663482951671e-05, + "loss": 1.7083, + "step": 18013 + }, + { + "epoch": 5.529158993247391, + "grad_norm": 0.322344034910202, + "learning_rate": 4.3881701592953475e-05, + "loss": 1.7423, + "step": 18014 + }, + { + "epoch": 
5.529465930018416, + "grad_norm": 0.2267894744873047, + "learning_rate": 4.387676841685483e-05, + "loss": 1.7309, + "step": 18015 + }, + { + "epoch": 5.5297728667894415, + "grad_norm": 0.23041954636573792, + "learning_rate": 4.387183530126955e-05, + "loss": 1.7352, + "step": 18016 + }, + { + "epoch": 5.530079803560467, + "grad_norm": 0.31139662861824036, + "learning_rate": 4.386690224624638e-05, + "loss": 1.7223, + "step": 18017 + }, + { + "epoch": 5.530386740331492, + "grad_norm": 0.20144063234329224, + "learning_rate": 4.38619692518341e-05, + "loss": 1.7607, + "step": 18018 + }, + { + "epoch": 5.530693677102517, + "grad_norm": 0.23812296986579895, + "learning_rate": 4.385703631808142e-05, + "loss": 1.7599, + "step": 18019 + }, + { + "epoch": 5.531000613873542, + "grad_norm": 0.2442231923341751, + "learning_rate": 4.385210344503712e-05, + "loss": 1.7094, + "step": 18020 + }, + { + "epoch": 5.531307550644567, + "grad_norm": 0.19497406482696533, + "learning_rate": 4.384717063274992e-05, + "loss": 1.7686, + "step": 18021 + }, + { + "epoch": 5.531614487415593, + "grad_norm": 0.29085835814476013, + "learning_rate": 4.38422378812686e-05, + "loss": 1.7454, + "step": 18022 + }, + { + "epoch": 5.531921424186618, + "grad_norm": 0.2701610028743744, + "learning_rate": 4.3837305190641876e-05, + "loss": 1.7376, + "step": 18023 + }, + { + "epoch": 5.532228360957642, + "grad_norm": 0.21232132613658905, + "learning_rate": 4.383237256091854e-05, + "loss": 1.7773, + "step": 18024 + }, + { + "epoch": 5.532535297728668, + "grad_norm": 0.24131610989570618, + "learning_rate": 4.382743999214729e-05, + "loss": 1.7899, + "step": 18025 + }, + { + "epoch": 5.532842234499693, + "grad_norm": 0.2752540409564972, + "learning_rate": 4.382250748437692e-05, + "loss": 1.7603, + "step": 18026 + }, + { + "epoch": 5.533149171270718, + "grad_norm": 0.2007865607738495, + "learning_rate": 4.381757503765613e-05, + "loss": 1.7553, + "step": 18027 + }, + { + "epoch": 5.533456108041744, + "grad_norm": 
0.23768723011016846, + "learning_rate": 4.38126426520337e-05, + "loss": 1.757, + "step": 18028 + }, + { + "epoch": 5.533763044812769, + "grad_norm": 0.22198502719402313, + "learning_rate": 4.3807710327558366e-05, + "loss": 1.7578, + "step": 18029 + }, + { + "epoch": 5.5340699815837935, + "grad_norm": 0.22432352602481842, + "learning_rate": 4.380277806427885e-05, + "loss": 1.75, + "step": 18030 + }, + { + "epoch": 5.534376918354819, + "grad_norm": 0.23029591143131256, + "learning_rate": 4.379784586224394e-05, + "loss": 1.7829, + "step": 18031 + }, + { + "epoch": 5.534683855125844, + "grad_norm": 0.23901896178722382, + "learning_rate": 4.379291372150232e-05, + "loss": 1.7461, + "step": 18032 + }, + { + "epoch": 5.5349907918968695, + "grad_norm": 0.20958681404590607, + "learning_rate": 4.378798164210278e-05, + "loss": 1.7224, + "step": 18033 + }, + { + "epoch": 5.535297728667894, + "grad_norm": 0.21619680523872375, + "learning_rate": 4.3783049624094036e-05, + "loss": 1.7605, + "step": 18034 + }, + { + "epoch": 5.535604665438919, + "grad_norm": 0.22988620400428772, + "learning_rate": 4.3778117667524867e-05, + "loss": 1.7668, + "step": 18035 + }, + { + "epoch": 5.535911602209945, + "grad_norm": 0.20107243955135345, + "learning_rate": 4.377318577244395e-05, + "loss": 1.7932, + "step": 18036 + }, + { + "epoch": 5.53621853898097, + "grad_norm": 0.25803956389427185, + "learning_rate": 4.376825393890009e-05, + "loss": 1.7409, + "step": 18037 + }, + { + "epoch": 5.536525475751995, + "grad_norm": 0.34292399883270264, + "learning_rate": 4.376332216694198e-05, + "loss": 1.8554, + "step": 18038 + }, + { + "epoch": 5.536832412523021, + "grad_norm": 0.23147790133953094, + "learning_rate": 4.375839045661839e-05, + "loss": 1.7918, + "step": 18039 + }, + { + "epoch": 5.537139349294045, + "grad_norm": 0.2387644350528717, + "learning_rate": 4.375345880797802e-05, + "loss": 1.7391, + "step": 18040 + }, + { + "epoch": 5.53744628606507, + "grad_norm": 0.21463727951049805, + 
"learning_rate": 4.374852722106966e-05, + "loss": 1.6812, + "step": 18041 + }, + { + "epoch": 5.537753222836096, + "grad_norm": 0.21994563937187195, + "learning_rate": 4.3743595695941994e-05, + "loss": 1.7727, + "step": 18042 + }, + { + "epoch": 5.538060159607121, + "grad_norm": 0.21102699637413025, + "learning_rate": 4.373866423264381e-05, + "loss": 1.7854, + "step": 18043 + }, + { + "epoch": 5.538367096378146, + "grad_norm": 0.21742786467075348, + "learning_rate": 4.3733732831223794e-05, + "loss": 1.7352, + "step": 18044 + }, + { + "epoch": 5.538674033149171, + "grad_norm": 0.20080791413784027, + "learning_rate": 4.372880149173071e-05, + "loss": 1.7264, + "step": 18045 + }, + { + "epoch": 5.538980969920196, + "grad_norm": 0.21027569472789764, + "learning_rate": 4.372387021421329e-05, + "loss": 1.766, + "step": 18046 + }, + { + "epoch": 5.5392879066912215, + "grad_norm": 0.22870683670043945, + "learning_rate": 4.371893899872025e-05, + "loss": 1.7746, + "step": 18047 + }, + { + "epoch": 5.539594843462247, + "grad_norm": 0.21248690783977509, + "learning_rate": 4.371400784530036e-05, + "loss": 1.7447, + "step": 18048 + }, + { + "epoch": 5.539901780233272, + "grad_norm": 0.23059454560279846, + "learning_rate": 4.37090767540023e-05, + "loss": 1.7827, + "step": 18049 + }, + { + "epoch": 5.5402087170042975, + "grad_norm": 0.2519036531448364, + "learning_rate": 4.370414572487485e-05, + "loss": 1.7984, + "step": 18050 + }, + { + "epoch": 5.540515653775322, + "grad_norm": 0.23621398210525513, + "learning_rate": 4.36992147579667e-05, + "loss": 1.7517, + "step": 18051 + }, + { + "epoch": 5.540822590546347, + "grad_norm": 0.24267609417438507, + "learning_rate": 4.3694283853326625e-05, + "loss": 1.8285, + "step": 18052 + }, + { + "epoch": 5.541129527317373, + "grad_norm": 0.23209960758686066, + "learning_rate": 4.368935301100332e-05, + "loss": 1.7765, + "step": 18053 + }, + { + "epoch": 5.541436464088398, + "grad_norm": 0.21277187764644623, + "learning_rate": 
4.368442223104555e-05, + "loss": 1.7182, + "step": 18054 + }, + { + "epoch": 5.541743400859423, + "grad_norm": 0.20821616053581238, + "learning_rate": 4.367949151350199e-05, + "loss": 1.6766, + "step": 18055 + }, + { + "epoch": 5.542050337630448, + "grad_norm": 0.23019999265670776, + "learning_rate": 4.3674560858421414e-05, + "loss": 1.7438, + "step": 18056 + }, + { + "epoch": 5.542357274401473, + "grad_norm": 0.21547134220600128, + "learning_rate": 4.366963026585253e-05, + "loss": 1.7003, + "step": 18057 + }, + { + "epoch": 5.542664211172498, + "grad_norm": 0.22454513609409332, + "learning_rate": 4.3664699735844084e-05, + "loss": 1.7072, + "step": 18058 + }, + { + "epoch": 5.542971147943524, + "grad_norm": 0.22228482365608215, + "learning_rate": 4.365976926844477e-05, + "loss": 1.7557, + "step": 18059 + }, + { + "epoch": 5.543278084714549, + "grad_norm": 0.25762560963630676, + "learning_rate": 4.365483886370335e-05, + "loss": 1.7751, + "step": 18060 + }, + { + "epoch": 5.543585021485574, + "grad_norm": 0.2086205631494522, + "learning_rate": 4.3649908521668516e-05, + "loss": 1.7399, + "step": 18061 + }, + { + "epoch": 5.543891958256599, + "grad_norm": 0.2759089767932892, + "learning_rate": 4.3644978242389014e-05, + "loss": 1.7503, + "step": 18062 + }, + { + "epoch": 5.544198895027624, + "grad_norm": 0.2235182225704193, + "learning_rate": 4.364004802591358e-05, + "loss": 1.7313, + "step": 18063 + }, + { + "epoch": 5.5445058317986495, + "grad_norm": 0.23074570298194885, + "learning_rate": 4.3635117872290885e-05, + "loss": 1.7649, + "step": 18064 + }, + { + "epoch": 5.544812768569675, + "grad_norm": 0.24929538369178772, + "learning_rate": 4.363018778156972e-05, + "loss": 1.732, + "step": 18065 + }, + { + "epoch": 5.5451197053407, + "grad_norm": 0.26422035694122314, + "learning_rate": 4.362525775379874e-05, + "loss": 1.7276, + "step": 18066 + }, + { + "epoch": 5.545426642111725, + "grad_norm": 0.3160388767719269, + "learning_rate": 4.362032778902672e-05, + "loss": 
1.7777, + "step": 18067 + }, + { + "epoch": 5.54573357888275, + "grad_norm": 0.20791196823120117, + "learning_rate": 4.3615397887302345e-05, + "loss": 1.7058, + "step": 18068 + }, + { + "epoch": 5.546040515653775, + "grad_norm": 0.31438156962394714, + "learning_rate": 4.361046804867437e-05, + "loss": 1.8102, + "step": 18069 + }, + { + "epoch": 5.546347452424801, + "grad_norm": 0.3008113205432892, + "learning_rate": 4.3605538273191475e-05, + "loss": 1.7297, + "step": 18070 + }, + { + "epoch": 5.546654389195826, + "grad_norm": 0.21147282421588898, + "learning_rate": 4.3600608560902425e-05, + "loss": 1.776, + "step": 18071 + }, + { + "epoch": 5.546961325966851, + "grad_norm": 0.25202393531799316, + "learning_rate": 4.3595678911855884e-05, + "loss": 1.7273, + "step": 18072 + }, + { + "epoch": 5.547268262737876, + "grad_norm": 0.18881210684776306, + "learning_rate": 4.3590749326100614e-05, + "loss": 1.7026, + "step": 18073 + }, + { + "epoch": 5.547575199508901, + "grad_norm": 0.25075671076774597, + "learning_rate": 4.3585819803685295e-05, + "loss": 1.7694, + "step": 18074 + }, + { + "epoch": 5.547882136279926, + "grad_norm": 0.2625887989997864, + "learning_rate": 4.358089034465869e-05, + "loss": 1.7338, + "step": 18075 + }, + { + "epoch": 5.548189073050952, + "grad_norm": 0.27278679609298706, + "learning_rate": 4.357596094906947e-05, + "loss": 1.7684, + "step": 18076 + }, + { + "epoch": 5.548496009821976, + "grad_norm": 0.283964604139328, + "learning_rate": 4.3571031616966396e-05, + "loss": 1.7539, + "step": 18077 + }, + { + "epoch": 5.5488029465930016, + "grad_norm": 0.2702009975910187, + "learning_rate": 4.3566102348398124e-05, + "loss": 1.8064, + "step": 18078 + }, + { + "epoch": 5.549109883364027, + "grad_norm": 0.449733167886734, + "learning_rate": 4.356117314341342e-05, + "loss": 1.7258, + "step": 18079 + }, + { + "epoch": 5.549416820135052, + "grad_norm": 0.3199995160102844, + "learning_rate": 4.3556244002060975e-05, + "loss": 1.7526, + "step": 18080 + }, + { + 
"epoch": 5.5497237569060776, + "grad_norm": 0.2803747355937958, + "learning_rate": 4.3551314924389494e-05, + "loss": 1.764, + "step": 18081 + }, + { + "epoch": 5.550030693677103, + "grad_norm": 0.28995978832244873, + "learning_rate": 4.3546385910447715e-05, + "loss": 1.7617, + "step": 18082 + }, + { + "epoch": 5.550337630448127, + "grad_norm": 0.24313311278820038, + "learning_rate": 4.354145696028431e-05, + "loss": 1.7515, + "step": 18083 + }, + { + "epoch": 5.550644567219153, + "grad_norm": 0.2668032944202423, + "learning_rate": 4.3536528073948025e-05, + "loss": 1.743, + "step": 18084 + }, + { + "epoch": 5.550951503990178, + "grad_norm": 0.22831310331821442, + "learning_rate": 4.353159925148755e-05, + "loss": 1.7971, + "step": 18085 + }, + { + "epoch": 5.551258440761203, + "grad_norm": 0.22047942876815796, + "learning_rate": 4.352667049295162e-05, + "loss": 1.6983, + "step": 18086 + }, + { + "epoch": 5.551565377532229, + "grad_norm": 0.22895069420337677, + "learning_rate": 4.35217417983889e-05, + "loss": 1.7866, + "step": 18087 + }, + { + "epoch": 5.551872314303253, + "grad_norm": 0.19946368038654327, + "learning_rate": 4.3516813167848156e-05, + "loss": 1.7129, + "step": 18088 + }, + { + "epoch": 5.5521792510742785, + "grad_norm": 0.21508903801441193, + "learning_rate": 4.351188460137804e-05, + "loss": 1.7154, + "step": 18089 + }, + { + "epoch": 5.552486187845304, + "grad_norm": 0.24813953042030334, + "learning_rate": 4.3506956099027294e-05, + "loss": 1.8326, + "step": 18090 + }, + { + "epoch": 5.552793124616329, + "grad_norm": 0.21306444704532623, + "learning_rate": 4.35020276608446e-05, + "loss": 1.7651, + "step": 18091 + }, + { + "epoch": 5.5531000613873545, + "grad_norm": 0.22041217982769012, + "learning_rate": 4.34970992868787e-05, + "loss": 1.6852, + "step": 18092 + }, + { + "epoch": 5.55340699815838, + "grad_norm": 0.21699896454811096, + "learning_rate": 4.349217097717826e-05, + "loss": 1.7524, + "step": 18093 + }, + { + "epoch": 5.553713934929404, + 
"grad_norm": 0.23086662590503693, + "learning_rate": 4.3487242731792015e-05, + "loss": 1.7441, + "step": 18094 + }, + { + "epoch": 5.55402087170043, + "grad_norm": 0.21898184716701508, + "learning_rate": 4.348231455076864e-05, + "loss": 1.7131, + "step": 18095 + }, + { + "epoch": 5.554327808471455, + "grad_norm": 0.17392560839653015, + "learning_rate": 4.3477386434156854e-05, + "loss": 1.7049, + "step": 18096 + }, + { + "epoch": 5.55463474524248, + "grad_norm": 0.1984172910451889, + "learning_rate": 4.3472458382005374e-05, + "loss": 1.7136, + "step": 18097 + }, + { + "epoch": 5.554941682013506, + "grad_norm": 0.19227837026119232, + "learning_rate": 4.3467530394362866e-05, + "loss": 1.7468, + "step": 18098 + }, + { + "epoch": 5.55524861878453, + "grad_norm": 0.2307087779045105, + "learning_rate": 4.346260247127807e-05, + "loss": 1.7004, + "step": 18099 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.21496252715587616, + "learning_rate": 4.345767461279965e-05, + "loss": 1.7508, + "step": 18100 + }, + { + "epoch": 5.555862492326581, + "grad_norm": 0.21119998395442963, + "learning_rate": 4.3452746818976333e-05, + "loss": 1.7965, + "step": 18101 + }, + { + "epoch": 5.556169429097606, + "grad_norm": 0.2416355311870575, + "learning_rate": 4.34478190898568e-05, + "loss": 1.7006, + "step": 18102 + }, + { + "epoch": 5.556476365868631, + "grad_norm": 0.2009642869234085, + "learning_rate": 4.344289142548978e-05, + "loss": 1.7567, + "step": 18103 + }, + { + "epoch": 5.556783302639657, + "grad_norm": 0.2387058436870575, + "learning_rate": 4.343796382592393e-05, + "loss": 1.7898, + "step": 18104 + }, + { + "epoch": 5.557090239410681, + "grad_norm": 0.19835951924324036, + "learning_rate": 4.343303629120798e-05, + "loss": 1.7888, + "step": 18105 + }, + { + "epoch": 5.5573971761817065, + "grad_norm": 0.23324637115001678, + "learning_rate": 4.3428108821390604e-05, + "loss": 1.7923, + "step": 18106 + }, + { + "epoch": 5.557704112952732, + "grad_norm": 0.22334477305412292, + 
"learning_rate": 4.342318141652052e-05, + "loss": 1.7234, + "step": 18107 + }, + { + "epoch": 5.558011049723757, + "grad_norm": 0.20220427215099335, + "learning_rate": 4.341825407664639e-05, + "loss": 1.7639, + "step": 18108 + }, + { + "epoch": 5.558317986494782, + "grad_norm": 0.23658546805381775, + "learning_rate": 4.3413326801816964e-05, + "loss": 1.7505, + "step": 18109 + }, + { + "epoch": 5.558624923265807, + "grad_norm": 0.21157726645469666, + "learning_rate": 4.3408399592080875e-05, + "loss": 1.7655, + "step": 18110 + }, + { + "epoch": 5.558931860036832, + "grad_norm": 0.2139829397201538, + "learning_rate": 4.340347244748687e-05, + "loss": 1.767, + "step": 18111 + }, + { + "epoch": 5.559238796807858, + "grad_norm": 0.17811299860477448, + "learning_rate": 4.339854536808359e-05, + "loss": 1.6629, + "step": 18112 + }, + { + "epoch": 5.559545733578883, + "grad_norm": 0.2005898356437683, + "learning_rate": 4.339361835391977e-05, + "loss": 1.7269, + "step": 18113 + }, + { + "epoch": 5.559852670349908, + "grad_norm": 0.21514086425304413, + "learning_rate": 4.338869140504409e-05, + "loss": 1.7806, + "step": 18114 + }, + { + "epoch": 5.560159607120933, + "grad_norm": 0.23163840174674988, + "learning_rate": 4.338376452150522e-05, + "loss": 1.7259, + "step": 18115 + }, + { + "epoch": 5.560466543891958, + "grad_norm": 0.23657509684562683, + "learning_rate": 4.337883770335189e-05, + "loss": 1.7778, + "step": 18116 + }, + { + "epoch": 5.560773480662983, + "grad_norm": 0.20135201513767242, + "learning_rate": 4.337391095063274e-05, + "loss": 1.7359, + "step": 18117 + }, + { + "epoch": 5.561080417434009, + "grad_norm": 0.22871774435043335, + "learning_rate": 4.33689842633965e-05, + "loss": 1.7658, + "step": 18118 + }, + { + "epoch": 5.561387354205034, + "grad_norm": 0.21755221486091614, + "learning_rate": 4.3364057641691835e-05, + "loss": 1.7408, + "step": 18119 + }, + { + "epoch": 5.5616942909760585, + "grad_norm": 0.215267151594162, + "learning_rate": 
4.335913108556746e-05, + "loss": 1.7175, + "step": 18120 + }, + { + "epoch": 5.562001227747084, + "grad_norm": 0.25724974274635315, + "learning_rate": 4.335420459507202e-05, + "loss": 1.7197, + "step": 18121 + }, + { + "epoch": 5.562308164518109, + "grad_norm": 0.25375521183013916, + "learning_rate": 4.3349278170254254e-05, + "loss": 1.7251, + "step": 18122 + }, + { + "epoch": 5.5626151012891345, + "grad_norm": 0.24768905341625214, + "learning_rate": 4.334435181116279e-05, + "loss": 1.7405, + "step": 18123 + }, + { + "epoch": 5.56292203806016, + "grad_norm": 0.21281081438064575, + "learning_rate": 4.333942551784636e-05, + "loss": 1.7131, + "step": 18124 + }, + { + "epoch": 5.563228974831185, + "grad_norm": 0.2129398137331009, + "learning_rate": 4.333449929035361e-05, + "loss": 1.7049, + "step": 18125 + }, + { + "epoch": 5.56353591160221, + "grad_norm": 0.24582397937774658, + "learning_rate": 4.332957312873328e-05, + "loss": 1.7205, + "step": 18126 + }, + { + "epoch": 5.563842848373235, + "grad_norm": 0.21282973885536194, + "learning_rate": 4.332464703303399e-05, + "loss": 1.7655, + "step": 18127 + }, + { + "epoch": 5.56414978514426, + "grad_norm": 0.2302251160144806, + "learning_rate": 4.331972100330447e-05, + "loss": 1.7597, + "step": 18128 + }, + { + "epoch": 5.564456721915286, + "grad_norm": 0.23453226685523987, + "learning_rate": 4.331479503959336e-05, + "loss": 1.7028, + "step": 18129 + }, + { + "epoch": 5.564763658686311, + "grad_norm": 0.19723562896251678, + "learning_rate": 4.330986914194938e-05, + "loss": 1.7101, + "step": 18130 + }, + { + "epoch": 5.565070595457335, + "grad_norm": 0.22021643817424774, + "learning_rate": 4.33049433104212e-05, + "loss": 1.7123, + "step": 18131 + }, + { + "epoch": 5.565377532228361, + "grad_norm": 0.25540977716445923, + "learning_rate": 4.3300017545057484e-05, + "loss": 1.7392, + "step": 18132 + }, + { + "epoch": 5.565684468999386, + "grad_norm": 0.23482176661491394, + "learning_rate": 4.329509184590693e-05, + "loss": 
1.7175, + "step": 18133 + }, + { + "epoch": 5.565991405770411, + "grad_norm": 0.19537311792373657, + "learning_rate": 4.329016621301819e-05, + "loss": 1.7583, + "step": 18134 + }, + { + "epoch": 5.566298342541437, + "grad_norm": 0.21828842163085938, + "learning_rate": 4.328524064643997e-05, + "loss": 1.7411, + "step": 18135 + }, + { + "epoch": 5.566605279312462, + "grad_norm": 0.24589122831821442, + "learning_rate": 4.328031514622093e-05, + "loss": 1.7769, + "step": 18136 + }, + { + "epoch": 5.5669122160834865, + "grad_norm": 0.20964545011520386, + "learning_rate": 4.327538971240978e-05, + "loss": 1.7743, + "step": 18137 + }, + { + "epoch": 5.567219152854512, + "grad_norm": 0.2210713028907776, + "learning_rate": 4.327046434505514e-05, + "loss": 1.7671, + "step": 18138 + }, + { + "epoch": 5.567526089625537, + "grad_norm": 0.21382687985897064, + "learning_rate": 4.3265539044205736e-05, + "loss": 1.793, + "step": 18139 + }, + { + "epoch": 5.5678330263965625, + "grad_norm": 0.23289678990840912, + "learning_rate": 4.326061380991021e-05, + "loss": 1.738, + "step": 18140 + }, + { + "epoch": 5.568139963167588, + "grad_norm": 0.23789258301258087, + "learning_rate": 4.325568864221725e-05, + "loss": 1.8315, + "step": 18141 + }, + { + "epoch": 5.568446899938612, + "grad_norm": 0.1925022453069687, + "learning_rate": 4.325076354117554e-05, + "loss": 1.6956, + "step": 18142 + }, + { + "epoch": 5.568753836709638, + "grad_norm": 0.22522561252117157, + "learning_rate": 4.324583850683373e-05, + "loss": 1.7957, + "step": 18143 + }, + { + "epoch": 5.569060773480663, + "grad_norm": 0.2787671387195587, + "learning_rate": 4.324091353924049e-05, + "loss": 1.7325, + "step": 18144 + }, + { + "epoch": 5.569367710251688, + "grad_norm": 0.2723194658756256, + "learning_rate": 4.3235988638444536e-05, + "loss": 1.7668, + "step": 18145 + }, + { + "epoch": 5.569674647022714, + "grad_norm": 0.2241704910993576, + "learning_rate": 4.3231063804494484e-05, + "loss": 1.7977, + "step": 18146 + }, + { + 
"epoch": 5.569981583793739, + "grad_norm": 0.2627747356891632, + "learning_rate": 4.322613903743903e-05, + "loss": 1.6775, + "step": 18147 + }, + { + "epoch": 5.570288520564763, + "grad_norm": 0.2644255757331848, + "learning_rate": 4.322121433732686e-05, + "loss": 1.7404, + "step": 18148 + }, + { + "epoch": 5.570595457335789, + "grad_norm": 0.2386743575334549, + "learning_rate": 4.321628970420659e-05, + "loss": 1.7386, + "step": 18149 + }, + { + "epoch": 5.570902394106814, + "grad_norm": 0.22444583475589752, + "learning_rate": 4.3211365138126945e-05, + "loss": 1.7482, + "step": 18150 + }, + { + "epoch": 5.571209330877839, + "grad_norm": 0.21770013868808746, + "learning_rate": 4.3206440639136554e-05, + "loss": 1.7322, + "step": 18151 + }, + { + "epoch": 5.571516267648864, + "grad_norm": 0.22356587648391724, + "learning_rate": 4.320151620728411e-05, + "loss": 1.751, + "step": 18152 + }, + { + "epoch": 5.571823204419889, + "grad_norm": 0.2040669322013855, + "learning_rate": 4.319659184261826e-05, + "loss": 1.712, + "step": 18153 + }, + { + "epoch": 5.5721301411909145, + "grad_norm": 0.20951713621616364, + "learning_rate": 4.319166754518768e-05, + "loss": 1.7308, + "step": 18154 + }, + { + "epoch": 5.57243707796194, + "grad_norm": 0.186195969581604, + "learning_rate": 4.3186743315041025e-05, + "loss": 1.7133, + "step": 18155 + }, + { + "epoch": 5.572744014732965, + "grad_norm": 0.2098865509033203, + "learning_rate": 4.318181915222698e-05, + "loss": 1.7645, + "step": 18156 + }, + { + "epoch": 5.5730509515039905, + "grad_norm": 0.20552097260951996, + "learning_rate": 4.317689505679418e-05, + "loss": 1.7156, + "step": 18157 + }, + { + "epoch": 5.573357888275015, + "grad_norm": 0.22506964206695557, + "learning_rate": 4.3171971028791314e-05, + "loss": 1.7192, + "step": 18158 + }, + { + "epoch": 5.57366482504604, + "grad_norm": 0.2296760082244873, + "learning_rate": 4.316704706826702e-05, + "loss": 1.7534, + "step": 18159 + }, + { + "epoch": 5.573971761817066, + "grad_norm": 
0.20140253007411957, + "learning_rate": 4.316212317526998e-05, + "loss": 1.6906, + "step": 18160 + }, + { + "epoch": 5.574278698588091, + "grad_norm": 0.23313316702842712, + "learning_rate": 4.315719934984884e-05, + "loss": 1.6929, + "step": 18161 + }, + { + "epoch": 5.574585635359116, + "grad_norm": 0.23398169875144958, + "learning_rate": 4.315227559205228e-05, + "loss": 1.7254, + "step": 18162 + }, + { + "epoch": 5.574892572130141, + "grad_norm": 0.20836731791496277, + "learning_rate": 4.314735190192894e-05, + "loss": 1.7335, + "step": 18163 + }, + { + "epoch": 5.575199508901166, + "grad_norm": 0.19899079203605652, + "learning_rate": 4.3142428279527485e-05, + "loss": 1.69, + "step": 18164 + }, + { + "epoch": 5.5755064456721914, + "grad_norm": 0.24623680114746094, + "learning_rate": 4.313750472489657e-05, + "loss": 1.7413, + "step": 18165 + }, + { + "epoch": 5.575813382443217, + "grad_norm": 0.2432616949081421, + "learning_rate": 4.313258123808484e-05, + "loss": 1.7426, + "step": 18166 + }, + { + "epoch": 5.576120319214242, + "grad_norm": 0.22773970663547516, + "learning_rate": 4.3127657819141006e-05, + "loss": 1.7986, + "step": 18167 + }, + { + "epoch": 5.5764272559852675, + "grad_norm": 0.19891540706157684, + "learning_rate": 4.312273446811366e-05, + "loss": 1.7007, + "step": 18168 + }, + { + "epoch": 5.576734192756292, + "grad_norm": 0.23402714729309082, + "learning_rate": 4.311781118505149e-05, + "loss": 1.7774, + "step": 18169 + }, + { + "epoch": 5.577041129527317, + "grad_norm": 0.2248220294713974, + "learning_rate": 4.3112887970003134e-05, + "loss": 1.7079, + "step": 18170 + }, + { + "epoch": 5.577348066298343, + "grad_norm": 0.20901209115982056, + "learning_rate": 4.310796482301726e-05, + "loss": 1.7336, + "step": 18171 + }, + { + "epoch": 5.577655003069368, + "grad_norm": 0.21872754395008087, + "learning_rate": 4.3103041744142516e-05, + "loss": 1.7742, + "step": 18172 + }, + { + "epoch": 5.577961939840393, + "grad_norm": 0.2567403018474579, + 
"learning_rate": 4.309811873342757e-05, + "loss": 1.7894, + "step": 18173 + }, + { + "epoch": 5.578268876611418, + "grad_norm": 0.219998300075531, + "learning_rate": 4.3093195790921035e-05, + "loss": 1.7283, + "step": 18174 + }, + { + "epoch": 5.578575813382443, + "grad_norm": 0.1944747269153595, + "learning_rate": 4.3088272916671614e-05, + "loss": 1.7129, + "step": 18175 + }, + { + "epoch": 5.578882750153468, + "grad_norm": 0.19492141902446747, + "learning_rate": 4.308335011072791e-05, + "loss": 1.7286, + "step": 18176 + }, + { + "epoch": 5.579189686924494, + "grad_norm": 0.22383002936840057, + "learning_rate": 4.3078427373138604e-05, + "loss": 1.733, + "step": 18177 + }, + { + "epoch": 5.579496623695519, + "grad_norm": 0.20238643884658813, + "learning_rate": 4.307350470395232e-05, + "loss": 1.7522, + "step": 18178 + }, + { + "epoch": 5.579803560466544, + "grad_norm": 0.21456125378608704, + "learning_rate": 4.3068582103217755e-05, + "loss": 1.7298, + "step": 18179 + }, + { + "epoch": 5.580110497237569, + "grad_norm": 0.28084230422973633, + "learning_rate": 4.3063659570983514e-05, + "loss": 1.7805, + "step": 18180 + }, + { + "epoch": 5.580417434008594, + "grad_norm": 0.21319706737995148, + "learning_rate": 4.305873710729824e-05, + "loss": 1.6801, + "step": 18181 + }, + { + "epoch": 5.5807243707796195, + "grad_norm": 0.2279660850763321, + "learning_rate": 4.30538147122106e-05, + "loss": 1.752, + "step": 18182 + }, + { + "epoch": 5.581031307550645, + "grad_norm": 0.1958594173192978, + "learning_rate": 4.304889238576922e-05, + "loss": 1.7487, + "step": 18183 + }, + { + "epoch": 5.581338244321669, + "grad_norm": 0.19484321773052216, + "learning_rate": 4.304397012802279e-05, + "loss": 1.7222, + "step": 18184 + }, + { + "epoch": 5.581645181092695, + "grad_norm": 0.19863305985927582, + "learning_rate": 4.3039047939019906e-05, + "loss": 1.7296, + "step": 18185 + }, + { + "epoch": 5.58195211786372, + "grad_norm": 0.18674087524414062, + "learning_rate": 
4.303412581880924e-05, + "loss": 1.6753, + "step": 18186 + }, + { + "epoch": 5.582259054634745, + "grad_norm": 0.22263208031654358, + "learning_rate": 4.302920376743941e-05, + "loss": 1.7431, + "step": 18187 + }, + { + "epoch": 5.582565991405771, + "grad_norm": 0.1926872879266739, + "learning_rate": 4.302428178495909e-05, + "loss": 1.7662, + "step": 18188 + }, + { + "epoch": 5.582872928176796, + "grad_norm": 0.23190459609031677, + "learning_rate": 4.301935987141689e-05, + "loss": 1.7271, + "step": 18189 + }, + { + "epoch": 5.58317986494782, + "grad_norm": 0.30057230591773987, + "learning_rate": 4.301443802686148e-05, + "loss": 1.7957, + "step": 18190 + }, + { + "epoch": 5.583486801718846, + "grad_norm": 0.2520695626735687, + "learning_rate": 4.3009516251341475e-05, + "loss": 1.7501, + "step": 18191 + }, + { + "epoch": 5.583793738489871, + "grad_norm": 0.19143317639827728, + "learning_rate": 4.300459454490555e-05, + "loss": 1.7091, + "step": 18192 + }, + { + "epoch": 5.584100675260896, + "grad_norm": 0.2064475119113922, + "learning_rate": 4.299967290760229e-05, + "loss": 1.6849, + "step": 18193 + }, + { + "epoch": 5.584407612031922, + "grad_norm": 0.3093598484992981, + "learning_rate": 4.299475133948039e-05, + "loss": 1.8479, + "step": 18194 + }, + { + "epoch": 5.584714548802946, + "grad_norm": 0.2875300943851471, + "learning_rate": 4.298982984058845e-05, + "loss": 1.7296, + "step": 18195 + }, + { + "epoch": 5.5850214855739715, + "grad_norm": 0.33194443583488464, + "learning_rate": 4.298490841097514e-05, + "loss": 1.7668, + "step": 18196 + }, + { + "epoch": 5.585328422344997, + "grad_norm": 0.20940829813480377, + "learning_rate": 4.297998705068908e-05, + "loss": 1.7316, + "step": 18197 + }, + { + "epoch": 5.585635359116022, + "grad_norm": 0.32381999492645264, + "learning_rate": 4.297506575977887e-05, + "loss": 1.7212, + "step": 18198 + }, + { + "epoch": 5.5859422958870475, + "grad_norm": 0.31585511565208435, + "learning_rate": 4.29701445382932e-05, + "loss": 1.7695, 
+ "step": 18199 + }, + { + "epoch": 5.586249232658073, + "grad_norm": 0.2272588014602661, + "learning_rate": 4.2965223386280664e-05, + "loss": 1.7105, + "step": 18200 + }, + { + "epoch": 5.586556169429097, + "grad_norm": 0.2949761152267456, + "learning_rate": 4.296030230378993e-05, + "loss": 1.803, + "step": 18201 + }, + { + "epoch": 5.586863106200123, + "grad_norm": 0.20512579381465912, + "learning_rate": 4.29553812908696e-05, + "loss": 1.759, + "step": 18202 + }, + { + "epoch": 5.587170042971148, + "grad_norm": 0.21143598854541779, + "learning_rate": 4.295046034756835e-05, + "loss": 1.7286, + "step": 18203 + }, + { + "epoch": 5.587476979742173, + "grad_norm": 0.22148001194000244, + "learning_rate": 4.294553947393476e-05, + "loss": 1.7258, + "step": 18204 + }, + { + "epoch": 5.587783916513199, + "grad_norm": 0.17245957255363464, + "learning_rate": 4.2940618670017484e-05, + "loss": 1.6863, + "step": 18205 + }, + { + "epoch": 5.588090853284223, + "grad_norm": 0.20260390639305115, + "learning_rate": 4.293569793586515e-05, + "loss": 1.6866, + "step": 18206 + }, + { + "epoch": 5.588397790055248, + "grad_norm": 0.20671936869621277, + "learning_rate": 4.293077727152641e-05, + "loss": 1.7849, + "step": 18207 + }, + { + "epoch": 5.588704726826274, + "grad_norm": 0.21415838599205017, + "learning_rate": 4.292585667704984e-05, + "loss": 1.7279, + "step": 18208 + }, + { + "epoch": 5.589011663597299, + "grad_norm": 0.18668091297149658, + "learning_rate": 4.2920936152484134e-05, + "loss": 1.7087, + "step": 18209 + }, + { + "epoch": 5.589318600368324, + "grad_norm": 0.2253870815038681, + "learning_rate": 4.291601569787786e-05, + "loss": 1.769, + "step": 18210 + }, + { + "epoch": 5.58962553713935, + "grad_norm": 0.22426939010620117, + "learning_rate": 4.291109531327968e-05, + "loss": 1.7382, + "step": 18211 + }, + { + "epoch": 5.589932473910374, + "grad_norm": 0.21552452445030212, + "learning_rate": 4.29061749987382e-05, + "loss": 1.7316, + "step": 18212 + }, + { + "epoch": 
5.5902394106813995, + "grad_norm": 0.2337147295475006, + "learning_rate": 4.290125475430209e-05, + "loss": 1.7836, + "step": 18213 + }, + { + "epoch": 5.590546347452425, + "grad_norm": 0.21780124306678772, + "learning_rate": 4.289633458001992e-05, + "loss": 1.6923, + "step": 18214 + }, + { + "epoch": 5.59085328422345, + "grad_norm": 0.20009608566761017, + "learning_rate": 4.289141447594033e-05, + "loss": 1.719, + "step": 18215 + }, + { + "epoch": 5.5911602209944755, + "grad_norm": 0.18165744841098785, + "learning_rate": 4.288649444211196e-05, + "loss": 1.6825, + "step": 18216 + }, + { + "epoch": 5.5914671577655, + "grad_norm": 0.2244826704263687, + "learning_rate": 4.288157447858341e-05, + "loss": 1.7323, + "step": 18217 + }, + { + "epoch": 5.591774094536525, + "grad_norm": 0.16875946521759033, + "learning_rate": 4.2876654585403325e-05, + "loss": 1.6787, + "step": 18218 + }, + { + "epoch": 5.592081031307551, + "grad_norm": 0.19244243204593658, + "learning_rate": 4.28717347626203e-05, + "loss": 1.7225, + "step": 18219 + }, + { + "epoch": 5.592387968078576, + "grad_norm": 0.21081633865833282, + "learning_rate": 4.286681501028299e-05, + "loss": 1.7063, + "step": 18220 + }, + { + "epoch": 5.592694904849601, + "grad_norm": 0.20926406979560852, + "learning_rate": 4.286189532843997e-05, + "loss": 1.7307, + "step": 18221 + }, + { + "epoch": 5.593001841620627, + "grad_norm": 0.20258775353431702, + "learning_rate": 4.28569757171399e-05, + "loss": 1.6917, + "step": 18222 + }, + { + "epoch": 5.593308778391651, + "grad_norm": 0.21956230700016022, + "learning_rate": 4.285205617643137e-05, + "loss": 1.7127, + "step": 18223 + }, + { + "epoch": 5.593615715162676, + "grad_norm": 0.2071436047554016, + "learning_rate": 4.284713670636303e-05, + "loss": 1.7487, + "step": 18224 + }, + { + "epoch": 5.593922651933702, + "grad_norm": 0.2002478390932083, + "learning_rate": 4.2842217306983464e-05, + "loss": 1.6544, + "step": 18225 + }, + { + "epoch": 5.594229588704727, + "grad_norm": 
0.20691382884979248, + "learning_rate": 4.283729797834132e-05, + "loss": 1.768, + "step": 18226 + }, + { + "epoch": 5.5945365254757515, + "grad_norm": 0.18423563241958618, + "learning_rate": 4.283237872048517e-05, + "loss": 1.7563, + "step": 18227 + }, + { + "epoch": 5.594843462246777, + "grad_norm": 0.23055453598499298, + "learning_rate": 4.2827459533463665e-05, + "loss": 1.8083, + "step": 18228 + }, + { + "epoch": 5.595150399017802, + "grad_norm": 0.20735648274421692, + "learning_rate": 4.2822540417325396e-05, + "loss": 1.7761, + "step": 18229 + }, + { + "epoch": 5.5954573357888275, + "grad_norm": 0.2919909656047821, + "learning_rate": 4.281762137211902e-05, + "loss": 1.7836, + "step": 18230 + }, + { + "epoch": 5.595764272559853, + "grad_norm": 0.22636881470680237, + "learning_rate": 4.2812702397893113e-05, + "loss": 1.7389, + "step": 18231 + }, + { + "epoch": 5.596071209330878, + "grad_norm": 0.23788630962371826, + "learning_rate": 4.280778349469627e-05, + "loss": 1.7536, + "step": 18232 + }, + { + "epoch": 5.596378146101903, + "grad_norm": 0.22089426219463348, + "learning_rate": 4.280286466257715e-05, + "loss": 1.7584, + "step": 18233 + }, + { + "epoch": 5.596685082872928, + "grad_norm": 0.20486171543598175, + "learning_rate": 4.279794590158431e-05, + "loss": 1.7182, + "step": 18234 + }, + { + "epoch": 5.596992019643953, + "grad_norm": 0.2343701422214508, + "learning_rate": 4.2793027211766425e-05, + "loss": 1.751, + "step": 18235 + }, + { + "epoch": 5.597298956414979, + "grad_norm": 0.21734023094177246, + "learning_rate": 4.2788108593172036e-05, + "loss": 1.7084, + "step": 18236 + }, + { + "epoch": 5.597605893186004, + "grad_norm": 0.20593903958797455, + "learning_rate": 4.278319004584982e-05, + "loss": 1.6805, + "step": 18237 + }, + { + "epoch": 5.597912829957028, + "grad_norm": 0.20877878367900848, + "learning_rate": 4.2778271569848324e-05, + "loss": 1.7011, + "step": 18238 + }, + { + "epoch": 5.598219766728054, + "grad_norm": 0.23915995657444, + 
"learning_rate": 4.277335316521619e-05, + "loss": 1.732, + "step": 18239 + }, + { + "epoch": 5.598526703499079, + "grad_norm": 0.24310529232025146, + "learning_rate": 4.2768434832002004e-05, + "loss": 1.7859, + "step": 18240 + }, + { + "epoch": 5.598833640270104, + "grad_norm": 0.23189407587051392, + "learning_rate": 4.27635165702544e-05, + "loss": 1.7237, + "step": 18241 + }, + { + "epoch": 5.59914057704113, + "grad_norm": 0.2708875834941864, + "learning_rate": 4.275859838002195e-05, + "loss": 1.7046, + "step": 18242 + }, + { + "epoch": 5.599447513812155, + "grad_norm": 0.23692840337753296, + "learning_rate": 4.27536802613533e-05, + "loss": 1.8556, + "step": 18243 + }, + { + "epoch": 5.5997544505831796, + "grad_norm": 0.28285983204841614, + "learning_rate": 4.274876221429701e-05, + "loss": 1.6734, + "step": 18244 + }, + { + "epoch": 5.600061387354205, + "grad_norm": 0.20602203905582428, + "learning_rate": 4.27438442389017e-05, + "loss": 1.7113, + "step": 18245 + }, + { + "epoch": 5.60036832412523, + "grad_norm": 0.19719314575195312, + "learning_rate": 4.273892633521598e-05, + "loss": 1.7229, + "step": 18246 + }, + { + "epoch": 5.600675260896256, + "grad_norm": 0.2396705001592636, + "learning_rate": 4.273400850328846e-05, + "loss": 1.6986, + "step": 18247 + }, + { + "epoch": 5.600982197667281, + "grad_norm": 0.1974172443151474, + "learning_rate": 4.2729090743167724e-05, + "loss": 1.7445, + "step": 18248 + }, + { + "epoch": 5.601289134438305, + "grad_norm": 0.2193709760904312, + "learning_rate": 4.272417305490235e-05, + "loss": 1.7657, + "step": 18249 + }, + { + "epoch": 5.601596071209331, + "grad_norm": 0.24138681590557098, + "learning_rate": 4.271925543854098e-05, + "loss": 1.7388, + "step": 18250 + }, + { + "epoch": 5.601903007980356, + "grad_norm": 0.19056223332881927, + "learning_rate": 4.271433789413219e-05, + "loss": 1.6897, + "step": 18251 + }, + { + "epoch": 5.602209944751381, + "grad_norm": 0.20533505082130432, + "learning_rate": 4.270942042172459e-05, + 
"loss": 1.7222, + "step": 18252 + }, + { + "epoch": 5.602516881522407, + "grad_norm": 0.20570224523544312, + "learning_rate": 4.270450302136675e-05, + "loss": 1.8089, + "step": 18253 + }, + { + "epoch": 5.602823818293432, + "grad_norm": 0.2822209298610687, + "learning_rate": 4.269958569310732e-05, + "loss": 1.7523, + "step": 18254 + }, + { + "epoch": 5.6031307550644565, + "grad_norm": 0.2994859218597412, + "learning_rate": 4.269466843699484e-05, + "loss": 1.7538, + "step": 18255 + }, + { + "epoch": 5.603437691835482, + "grad_norm": 0.24851159751415253, + "learning_rate": 4.2689751253077925e-05, + "loss": 1.8162, + "step": 18256 + }, + { + "epoch": 5.603744628606507, + "grad_norm": 0.20387138426303864, + "learning_rate": 4.268483414140517e-05, + "loss": 1.6803, + "step": 18257 + }, + { + "epoch": 5.6040515653775325, + "grad_norm": 0.21620385348796844, + "learning_rate": 4.2679917102025204e-05, + "loss": 1.7236, + "step": 18258 + }, + { + "epoch": 5.604358502148557, + "grad_norm": 0.1925734579563141, + "learning_rate": 4.267500013498655e-05, + "loss": 1.7295, + "step": 18259 + }, + { + "epoch": 5.604665438919582, + "grad_norm": 0.22216086089611053, + "learning_rate": 4.267008324033787e-05, + "loss": 1.6844, + "step": 18260 + }, + { + "epoch": 5.604972375690608, + "grad_norm": 0.20293502509593964, + "learning_rate": 4.26651664181277e-05, + "loss": 1.7065, + "step": 18261 + }, + { + "epoch": 5.605279312461633, + "grad_norm": 0.21269507706165314, + "learning_rate": 4.266024966840466e-05, + "loss": 1.7573, + "step": 18262 + }, + { + "epoch": 5.605586249232658, + "grad_norm": 0.23574227094650269, + "learning_rate": 4.2655332991217334e-05, + "loss": 1.7625, + "step": 18263 + }, + { + "epoch": 5.605893186003684, + "grad_norm": 0.1875103861093521, + "learning_rate": 4.265041638661433e-05, + "loss": 1.7266, + "step": 18264 + }, + { + "epoch": 5.606200122774708, + "grad_norm": 0.20348483324050903, + "learning_rate": 4.264549985464421e-05, + "loss": 1.731, + "step": 18265 + }, 
+ { + "epoch": 5.606507059545733, + "grad_norm": 0.2345927655696869, + "learning_rate": 4.264058339535556e-05, + "loss": 1.7809, + "step": 18266 + }, + { + "epoch": 5.606813996316759, + "grad_norm": 0.21142496168613434, + "learning_rate": 4.2635667008796985e-05, + "loss": 1.7362, + "step": 18267 + }, + { + "epoch": 5.607120933087784, + "grad_norm": 0.19670210778713226, + "learning_rate": 4.263075069501705e-05, + "loss": 1.7029, + "step": 18268 + }, + { + "epoch": 5.607427869858809, + "grad_norm": 0.20985090732574463, + "learning_rate": 4.262583445406439e-05, + "loss": 1.7478, + "step": 18269 + }, + { + "epoch": 5.607734806629834, + "grad_norm": 0.20972272753715515, + "learning_rate": 4.262091828598752e-05, + "loss": 1.7561, + "step": 18270 + }, + { + "epoch": 5.608041743400859, + "grad_norm": 0.20006676018238068, + "learning_rate": 4.261600219083509e-05, + "loss": 1.7584, + "step": 18271 + }, + { + "epoch": 5.6083486801718845, + "grad_norm": 0.21590086817741394, + "learning_rate": 4.2611086168655635e-05, + "loss": 1.7405, + "step": 18272 + }, + { + "epoch": 5.60865561694291, + "grad_norm": 0.19330906867980957, + "learning_rate": 4.260617021949776e-05, + "loss": 1.6797, + "step": 18273 + }, + { + "epoch": 5.608962553713935, + "grad_norm": 0.1955050528049469, + "learning_rate": 4.260125434341004e-05, + "loss": 1.7174, + "step": 18274 + }, + { + "epoch": 5.6092694904849605, + "grad_norm": 0.2117784321308136, + "learning_rate": 4.2596338540441086e-05, + "loss": 1.743, + "step": 18275 + }, + { + "epoch": 5.609576427255985, + "grad_norm": 0.21788950264453888, + "learning_rate": 4.2591422810639425e-05, + "loss": 1.7603, + "step": 18276 + }, + { + "epoch": 5.60988336402701, + "grad_norm": 0.2092670351266861, + "learning_rate": 4.258650715405369e-05, + "loss": 1.7379, + "step": 18277 + }, + { + "epoch": 5.610190300798036, + "grad_norm": 0.1941552758216858, + "learning_rate": 4.2581591570732414e-05, + "loss": 1.7547, + "step": 18278 + }, + { + "epoch": 5.610497237569061, + 
"grad_norm": 0.21306751668453217, + "learning_rate": 4.2576676060724215e-05, + "loss": 1.7284, + "step": 18279 + }, + { + "epoch": 5.610804174340086, + "grad_norm": 0.18618693947792053, + "learning_rate": 4.2571760624077635e-05, + "loss": 1.7268, + "step": 18280 + }, + { + "epoch": 5.611111111111111, + "grad_norm": 0.21530354022979736, + "learning_rate": 4.256684526084129e-05, + "loss": 1.7036, + "step": 18281 + }, + { + "epoch": 5.611418047882136, + "grad_norm": 0.23363792896270752, + "learning_rate": 4.256192997106375e-05, + "loss": 1.7797, + "step": 18282 + }, + { + "epoch": 5.611724984653161, + "grad_norm": 0.1786416620016098, + "learning_rate": 4.2557014754793544e-05, + "loss": 1.7008, + "step": 18283 + }, + { + "epoch": 5.612031921424187, + "grad_norm": 0.2042730301618576, + "learning_rate": 4.25520996120793e-05, + "loss": 1.7667, + "step": 18284 + }, + { + "epoch": 5.612338858195212, + "grad_norm": 0.2275264412164688, + "learning_rate": 4.2547184542969554e-05, + "loss": 1.8277, + "step": 18285 + }, + { + "epoch": 5.612645794966237, + "grad_norm": 0.21252553164958954, + "learning_rate": 4.2542269547512925e-05, + "loss": 1.7272, + "step": 18286 + }, + { + "epoch": 5.612952731737262, + "grad_norm": 0.20384398102760315, + "learning_rate": 4.2537354625757934e-05, + "loss": 1.6707, + "step": 18287 + }, + { + "epoch": 5.613259668508287, + "grad_norm": 0.19805553555488586, + "learning_rate": 4.253243977775321e-05, + "loss": 1.7443, + "step": 18288 + }, + { + "epoch": 5.6135666052793125, + "grad_norm": 0.20447707176208496, + "learning_rate": 4.2527525003547256e-05, + "loss": 1.7392, + "step": 18289 + }, + { + "epoch": 5.613873542050338, + "grad_norm": 0.21025662124156952, + "learning_rate": 4.25226103031887e-05, + "loss": 1.7856, + "step": 18290 + }, + { + "epoch": 5.614180478821363, + "grad_norm": 0.2131013125181198, + "learning_rate": 4.2517695676726085e-05, + "loss": 1.7521, + "step": 18291 + }, + { + "epoch": 5.614487415592388, + "grad_norm": 0.2511558532714844, 
+ "learning_rate": 4.2512781124208e-05, + "loss": 1.6873, + "step": 18292 + }, + { + "epoch": 5.614794352363413, + "grad_norm": 0.19668610394001007, + "learning_rate": 4.2507866645682984e-05, + "loss": 1.6808, + "step": 18293 + }, + { + "epoch": 5.615101289134438, + "grad_norm": 0.22313621640205383, + "learning_rate": 4.2502952241199637e-05, + "loss": 1.7794, + "step": 18294 + }, + { + "epoch": 5.615408225905464, + "grad_norm": 0.2053089439868927, + "learning_rate": 4.249803791080649e-05, + "loss": 1.7405, + "step": 18295 + }, + { + "epoch": 5.615715162676489, + "grad_norm": 0.2052931934595108, + "learning_rate": 4.249312365455215e-05, + "loss": 1.6698, + "step": 18296 + }, + { + "epoch": 5.616022099447514, + "grad_norm": 0.223783478140831, + "learning_rate": 4.248820947248515e-05, + "loss": 1.7696, + "step": 18297 + }, + { + "epoch": 5.616329036218539, + "grad_norm": 0.3424001932144165, + "learning_rate": 4.248329536465407e-05, + "loss": 1.7724, + "step": 18298 + }, + { + "epoch": 5.616635972989564, + "grad_norm": 0.25015103816986084, + "learning_rate": 4.247838133110749e-05, + "loss": 1.7188, + "step": 18299 + }, + { + "epoch": 5.616942909760589, + "grad_norm": 0.239765465259552, + "learning_rate": 4.247346737189392e-05, + "loss": 1.695, + "step": 18300 + }, + { + "epoch": 5.617249846531615, + "grad_norm": 0.42259401082992554, + "learning_rate": 4.246855348706197e-05, + "loss": 1.6882, + "step": 18301 + }, + { + "epoch": 5.617556783302639, + "grad_norm": 0.2985959053039551, + "learning_rate": 4.246363967666018e-05, + "loss": 1.7236, + "step": 18302 + }, + { + "epoch": 5.6178637200736645, + "grad_norm": 0.22437956929206848, + "learning_rate": 4.245872594073714e-05, + "loss": 1.7158, + "step": 18303 + }, + { + "epoch": 5.61817065684469, + "grad_norm": 0.3165835440158844, + "learning_rate": 4.245381227934138e-05, + "loss": 1.7543, + "step": 18304 + }, + { + "epoch": 5.618477593615715, + "grad_norm": 0.2565564513206482, + "learning_rate": 4.244889869252148e-05, + 
"loss": 1.7863, + "step": 18305 + }, + { + "epoch": 5.6187845303867405, + "grad_norm": 0.25741446018218994, + "learning_rate": 4.244398518032597e-05, + "loss": 1.721, + "step": 18306 + }, + { + "epoch": 5.619091467157766, + "grad_norm": 0.26492297649383545, + "learning_rate": 4.2439071742803435e-05, + "loss": 1.7697, + "step": 18307 + }, + { + "epoch": 5.61939840392879, + "grad_norm": 0.2086823433637619, + "learning_rate": 4.243415838000243e-05, + "loss": 1.7072, + "step": 18308 + }, + { + "epoch": 5.619705340699816, + "grad_norm": 0.26784422993659973, + "learning_rate": 4.24292450919715e-05, + "loss": 1.7826, + "step": 18309 + }, + { + "epoch": 5.620012277470841, + "grad_norm": 0.21774251759052277, + "learning_rate": 4.242433187875921e-05, + "loss": 1.7204, + "step": 18310 + }, + { + "epoch": 5.620319214241866, + "grad_norm": 0.29547446966171265, + "learning_rate": 4.241941874041412e-05, + "loss": 1.7303, + "step": 18311 + }, + { + "epoch": 5.620626151012892, + "grad_norm": 0.20278988778591156, + "learning_rate": 4.241450567698476e-05, + "loss": 1.692, + "step": 18312 + }, + { + "epoch": 5.620933087783916, + "grad_norm": 0.2084289938211441, + "learning_rate": 4.240959268851971e-05, + "loss": 1.7069, + "step": 18313 + }, + { + "epoch": 5.621240024554941, + "grad_norm": 0.19901904463768005, + "learning_rate": 4.240467977506752e-05, + "loss": 1.6798, + "step": 18314 + }, + { + "epoch": 5.621546961325967, + "grad_norm": 0.24629411101341248, + "learning_rate": 4.2399766936676735e-05, + "loss": 1.775, + "step": 18315 + }, + { + "epoch": 5.621853898096992, + "grad_norm": 0.2532403767108917, + "learning_rate": 4.239485417339591e-05, + "loss": 1.7669, + "step": 18316 + }, + { + "epoch": 5.622160834868017, + "grad_norm": 0.22495722770690918, + "learning_rate": 4.2389941485273576e-05, + "loss": 1.7772, + "step": 18317 + }, + { + "epoch": 5.622467771639043, + "grad_norm": 0.2789733111858368, + "learning_rate": 4.2385028872358316e-05, + "loss": 1.751, + "step": 18318 + }, + { 
+ "epoch": 5.622774708410067, + "grad_norm": 0.2266954481601715, + "learning_rate": 4.238011633469866e-05, + "loss": 1.7213, + "step": 18319 + }, + { + "epoch": 5.6230816451810925, + "grad_norm": 0.2163502722978592, + "learning_rate": 4.237520387234316e-05, + "loss": 1.7781, + "step": 18320 + }, + { + "epoch": 5.623388581952118, + "grad_norm": 0.25249144434928894, + "learning_rate": 4.237029148534036e-05, + "loss": 1.7293, + "step": 18321 + }, + { + "epoch": 5.623695518723143, + "grad_norm": 0.2320011854171753, + "learning_rate": 4.2365379173738826e-05, + "loss": 1.7909, + "step": 18322 + }, + { + "epoch": 5.6240024554941686, + "grad_norm": 0.22074681520462036, + "learning_rate": 4.2360466937587074e-05, + "loss": 1.743, + "step": 18323 + }, + { + "epoch": 5.624309392265193, + "grad_norm": 0.20864775776863098, + "learning_rate": 4.235555477693368e-05, + "loss": 1.726, + "step": 18324 + }, + { + "epoch": 5.624616329036218, + "grad_norm": 0.24547792971134186, + "learning_rate": 4.235064269182716e-05, + "loss": 1.7646, + "step": 18325 + }, + { + "epoch": 5.624923265807244, + "grad_norm": 0.29965806007385254, + "learning_rate": 4.234573068231607e-05, + "loss": 1.7789, + "step": 18326 + }, + { + "epoch": 5.625230202578269, + "grad_norm": 0.20844583213329315, + "learning_rate": 4.234081874844896e-05, + "loss": 1.7007, + "step": 18327 + }, + { + "epoch": 5.625537139349294, + "grad_norm": 0.2455398142337799, + "learning_rate": 4.2335906890274385e-05, + "loss": 1.7094, + "step": 18328 + }, + { + "epoch": 5.62584407612032, + "grad_norm": 0.17839518189430237, + "learning_rate": 4.233099510784085e-05, + "loss": 1.6849, + "step": 18329 + }, + { + "epoch": 5.626151012891344, + "grad_norm": 0.20219004154205322, + "learning_rate": 4.232608340119693e-05, + "loss": 1.716, + "step": 18330 + }, + { + "epoch": 5.6264579496623695, + "grad_norm": 0.23570619523525238, + "learning_rate": 4.232117177039114e-05, + "loss": 1.7622, + "step": 18331 + }, + { + "epoch": 5.626764886433395, + 
"grad_norm": 0.23534397780895233, + "learning_rate": 4.231626021547204e-05, + "loss": 1.7758, + "step": 18332 + }, + { + "epoch": 5.62707182320442, + "grad_norm": 0.2177352011203766, + "learning_rate": 4.231134873648817e-05, + "loss": 1.7102, + "step": 18333 + }, + { + "epoch": 5.627378759975445, + "grad_norm": 0.22886058688163757, + "learning_rate": 4.230643733348803e-05, + "loss": 1.7766, + "step": 18334 + }, + { + "epoch": 5.62768569674647, + "grad_norm": 0.20723696053028107, + "learning_rate": 4.2301526006520215e-05, + "loss": 1.7287, + "step": 18335 + }, + { + "epoch": 5.627992633517495, + "grad_norm": 0.18612104654312134, + "learning_rate": 4.229661475563321e-05, + "loss": 1.7255, + "step": 18336 + }, + { + "epoch": 5.628299570288521, + "grad_norm": 0.26456236839294434, + "learning_rate": 4.229170358087558e-05, + "loss": 1.7388, + "step": 18337 + }, + { + "epoch": 5.628606507059546, + "grad_norm": 0.25253555178642273, + "learning_rate": 4.2286792482295845e-05, + "loss": 1.7031, + "step": 18338 + }, + { + "epoch": 5.628913443830571, + "grad_norm": 0.23093348741531372, + "learning_rate": 4.228188145994257e-05, + "loss": 1.8032, + "step": 18339 + }, + { + "epoch": 5.629220380601596, + "grad_norm": 0.24142487347126007, + "learning_rate": 4.227697051386424e-05, + "loss": 1.6621, + "step": 18340 + }, + { + "epoch": 5.629527317372621, + "grad_norm": 0.2883392572402954, + "learning_rate": 4.227205964410944e-05, + "loss": 1.7125, + "step": 18341 + }, + { + "epoch": 5.629834254143646, + "grad_norm": 0.22670713067054749, + "learning_rate": 4.226714885072665e-05, + "loss": 1.7659, + "step": 18342 + }, + { + "epoch": 5.630141190914672, + "grad_norm": 0.2795337438583374, + "learning_rate": 4.226223813376444e-05, + "loss": 1.7559, + "step": 18343 + }, + { + "epoch": 5.630448127685697, + "grad_norm": 0.2513083219528198, + "learning_rate": 4.225732749327132e-05, + "loss": 1.6969, + "step": 18344 + }, + { + "epoch": 5.6307550644567215, + "grad_norm": 0.24588467180728912, + 
"learning_rate": 4.225241692929585e-05, + "loss": 1.7724, + "step": 18345 + }, + { + "epoch": 5.631062001227747, + "grad_norm": 0.41726353764533997, + "learning_rate": 4.224750644188651e-05, + "loss": 1.7308, + "step": 18346 + }, + { + "epoch": 5.631368937998772, + "grad_norm": 0.2512385845184326, + "learning_rate": 4.2242596031091886e-05, + "loss": 1.7068, + "step": 18347 + }, + { + "epoch": 5.6316758747697975, + "grad_norm": 0.3077464997768402, + "learning_rate": 4.223768569696044e-05, + "loss": 1.7383, + "step": 18348 + }, + { + "epoch": 5.631982811540823, + "grad_norm": 0.3460720479488373, + "learning_rate": 4.2232775439540756e-05, + "loss": 1.7317, + "step": 18349 + }, + { + "epoch": 5.632289748311848, + "grad_norm": 0.24827539920806885, + "learning_rate": 4.222786525888134e-05, + "loss": 1.6871, + "step": 18350 + }, + { + "epoch": 5.632596685082873, + "grad_norm": 0.24851584434509277, + "learning_rate": 4.22229551550307e-05, + "loss": 1.7058, + "step": 18351 + }, + { + "epoch": 5.632903621853898, + "grad_norm": 0.31132519245147705, + "learning_rate": 4.2218045128037396e-05, + "loss": 1.7523, + "step": 18352 + }, + { + "epoch": 5.633210558624923, + "grad_norm": 0.3104027807712555, + "learning_rate": 4.2213135177949906e-05, + "loss": 1.7669, + "step": 18353 + }, + { + "epoch": 5.633517495395949, + "grad_norm": 0.31351104378700256, + "learning_rate": 4.2208225304816795e-05, + "loss": 1.7031, + "step": 18354 + }, + { + "epoch": 5.633824432166974, + "grad_norm": 0.3217851221561432, + "learning_rate": 4.2203315508686555e-05, + "loss": 1.7694, + "step": 18355 + }, + { + "epoch": 5.634131368937998, + "grad_norm": 0.22287796437740326, + "learning_rate": 4.2198405789607745e-05, + "loss": 1.7742, + "step": 18356 + }, + { + "epoch": 5.634438305709024, + "grad_norm": 0.20288340747356415, + "learning_rate": 4.219349614762883e-05, + "loss": 1.7113, + "step": 18357 + }, + { + "epoch": 5.634745242480049, + "grad_norm": 0.19823449850082397, + "learning_rate": 
4.218858658279839e-05, + "loss": 1.7433, + "step": 18358 + }, + { + "epoch": 5.635052179251074, + "grad_norm": 0.2756347358226776, + "learning_rate": 4.2183677095164895e-05, + "loss": 1.8278, + "step": 18359 + }, + { + "epoch": 5.6353591160221, + "grad_norm": 0.2303706556558609, + "learning_rate": 4.2178767684776895e-05, + "loss": 1.6943, + "step": 18360 + }, + { + "epoch": 5.635666052793125, + "grad_norm": 0.25089216232299805, + "learning_rate": 4.217385835168288e-05, + "loss": 1.6562, + "step": 18361 + }, + { + "epoch": 5.6359729895641495, + "grad_norm": 0.3013486862182617, + "learning_rate": 4.216894909593141e-05, + "loss": 1.7323, + "step": 18362 + }, + { + "epoch": 5.636279926335175, + "grad_norm": 0.19471928477287292, + "learning_rate": 4.2164039917570956e-05, + "loss": 1.7301, + "step": 18363 + }, + { + "epoch": 5.6365868631062, + "grad_norm": 0.3257733881473541, + "learning_rate": 4.2159130816650075e-05, + "loss": 1.7522, + "step": 18364 + }, + { + "epoch": 5.6368937998772255, + "grad_norm": 0.3065868020057678, + "learning_rate": 4.215422179321723e-05, + "loss": 1.7077, + "step": 18365 + }, + { + "epoch": 5.637200736648251, + "grad_norm": 0.20643819868564606, + "learning_rate": 4.214931284732098e-05, + "loss": 1.8033, + "step": 18366 + }, + { + "epoch": 5.637507673419275, + "grad_norm": 0.23551981151103973, + "learning_rate": 4.2144403979009826e-05, + "loss": 1.7391, + "step": 18367 + }, + { + "epoch": 5.637814610190301, + "grad_norm": 0.20602314174175262, + "learning_rate": 4.2139495188332265e-05, + "loss": 1.7593, + "step": 18368 + }, + { + "epoch": 5.638121546961326, + "grad_norm": 0.27911239862442017, + "learning_rate": 4.2134586475336834e-05, + "loss": 1.7212, + "step": 18369 + }, + { + "epoch": 5.638428483732351, + "grad_norm": 0.2700496017932892, + "learning_rate": 4.212967784007201e-05, + "loss": 1.7755, + "step": 18370 + }, + { + "epoch": 5.638735420503377, + "grad_norm": 0.24988985061645508, + "learning_rate": 4.2124769282586334e-05, + "loss": 
1.7364, + "step": 18371 + }, + { + "epoch": 5.639042357274402, + "grad_norm": 0.20491284132003784, + "learning_rate": 4.211986080292829e-05, + "loss": 1.7477, + "step": 18372 + }, + { + "epoch": 5.639349294045426, + "grad_norm": 0.24953459203243256, + "learning_rate": 4.211495240114643e-05, + "loss": 1.7712, + "step": 18373 + }, + { + "epoch": 5.639656230816452, + "grad_norm": 0.2028491199016571, + "learning_rate": 4.2110044077289204e-05, + "loss": 1.701, + "step": 18374 + }, + { + "epoch": 5.639963167587477, + "grad_norm": 0.22320568561553955, + "learning_rate": 4.210513583140517e-05, + "loss": 1.7818, + "step": 18375 + }, + { + "epoch": 5.640270104358502, + "grad_norm": 0.22680947184562683, + "learning_rate": 4.210022766354278e-05, + "loss": 1.7631, + "step": 18376 + }, + { + "epoch": 5.640577041129527, + "grad_norm": 0.20724014937877655, + "learning_rate": 4.2095319573750596e-05, + "loss": 1.7757, + "step": 18377 + }, + { + "epoch": 5.640883977900552, + "grad_norm": 0.21785953640937805, + "learning_rate": 4.209041156207708e-05, + "loss": 1.7161, + "step": 18378 + }, + { + "epoch": 5.6411909146715775, + "grad_norm": 0.21751803159713745, + "learning_rate": 4.208550362857078e-05, + "loss": 1.7449, + "step": 18379 + }, + { + "epoch": 5.641497851442603, + "grad_norm": 0.1765962839126587, + "learning_rate": 4.208059577328014e-05, + "loss": 1.7191, + "step": 18380 + }, + { + "epoch": 5.641804788213628, + "grad_norm": 0.22720913589000702, + "learning_rate": 4.2075687996253724e-05, + "loss": 1.7037, + "step": 18381 + }, + { + "epoch": 5.6421117249846535, + "grad_norm": 0.23589655756950378, + "learning_rate": 4.2070780297539976e-05, + "loss": 1.8147, + "step": 18382 + }, + { + "epoch": 5.642418661755678, + "grad_norm": 0.21187056601047516, + "learning_rate": 4.2065872677187435e-05, + "loss": 1.7655, + "step": 18383 + }, + { + "epoch": 5.642725598526703, + "grad_norm": 0.24153946340084076, + "learning_rate": 4.2060965135244606e-05, + "loss": 1.7841, + "step": 18384 + }, + 
{ + "epoch": 5.643032535297729, + "grad_norm": 0.2059229612350464, + "learning_rate": 4.205605767175995e-05, + "loss": 1.6718, + "step": 18385 + }, + { + "epoch": 5.643339472068754, + "grad_norm": 0.20235973596572876, + "learning_rate": 4.205115028678201e-05, + "loss": 1.6931, + "step": 18386 + }, + { + "epoch": 5.643646408839779, + "grad_norm": 0.25149911642074585, + "learning_rate": 4.204624298035924e-05, + "loss": 1.7465, + "step": 18387 + }, + { + "epoch": 5.643953345610804, + "grad_norm": 0.2050812691450119, + "learning_rate": 4.204133575254017e-05, + "loss": 1.7147, + "step": 18388 + }, + { + "epoch": 5.644260282381829, + "grad_norm": 0.20906420052051544, + "learning_rate": 4.2036428603373274e-05, + "loss": 1.6762, + "step": 18389 + }, + { + "epoch": 5.644567219152854, + "grad_norm": 0.20150595903396606, + "learning_rate": 4.2031521532907075e-05, + "loss": 1.678, + "step": 18390 + }, + { + "epoch": 5.64487415592388, + "grad_norm": 0.2141568511724472, + "learning_rate": 4.202661454119004e-05, + "loss": 1.7274, + "step": 18391 + }, + { + "epoch": 5.645181092694905, + "grad_norm": 0.2641741931438446, + "learning_rate": 4.202170762827069e-05, + "loss": 1.7975, + "step": 18392 + }, + { + "epoch": 5.64548802946593, + "grad_norm": 0.22928468883037567, + "learning_rate": 4.201680079419747e-05, + "loss": 1.7687, + "step": 18393 + }, + { + "epoch": 5.645794966236955, + "grad_norm": 0.22713731229305267, + "learning_rate": 4.2011894039018925e-05, + "loss": 1.7475, + "step": 18394 + }, + { + "epoch": 5.64610190300798, + "grad_norm": 0.25602981448173523, + "learning_rate": 4.200698736278351e-05, + "loss": 1.7356, + "step": 18395 + }, + { + "epoch": 5.6464088397790055, + "grad_norm": 0.2619759738445282, + "learning_rate": 4.200208076553975e-05, + "loss": 1.7334, + "step": 18396 + }, + { + "epoch": 5.646715776550031, + "grad_norm": 0.24756783246994019, + "learning_rate": 4.19971742473361e-05, + "loss": 1.7253, + "step": 18397 + }, + { + "epoch": 5.647022713321056, + 
"grad_norm": 0.2068249136209488, + "learning_rate": 4.199226780822109e-05, + "loss": 1.7246, + "step": 18398 + }, + { + "epoch": 5.647329650092081, + "grad_norm": 0.23219087719917297, + "learning_rate": 4.1987361448243165e-05, + "loss": 1.7388, + "step": 18399 + }, + { + "epoch": 5.647636586863106, + "grad_norm": 0.2051403522491455, + "learning_rate": 4.198245516745082e-05, + "loss": 1.7775, + "step": 18400 + }, + { + "epoch": 5.647943523634131, + "grad_norm": 0.26408639550209045, + "learning_rate": 4.1977548965892575e-05, + "loss": 1.8069, + "step": 18401 + }, + { + "epoch": 5.648250460405157, + "grad_norm": 0.2104891538619995, + "learning_rate": 4.197264284361687e-05, + "loss": 1.7335, + "step": 18402 + }, + { + "epoch": 5.648557397176182, + "grad_norm": 0.23963849246501923, + "learning_rate": 4.196773680067224e-05, + "loss": 1.7254, + "step": 18403 + }, + { + "epoch": 5.648864333947207, + "grad_norm": 0.2770128846168518, + "learning_rate": 4.1962830837107117e-05, + "loss": 1.7848, + "step": 18404 + }, + { + "epoch": 5.649171270718232, + "grad_norm": 0.23342710733413696, + "learning_rate": 4.195792495297002e-05, + "loss": 1.7818, + "step": 18405 + }, + { + "epoch": 5.649478207489257, + "grad_norm": 0.23835061490535736, + "learning_rate": 4.195301914830941e-05, + "loss": 1.7453, + "step": 18406 + }, + { + "epoch": 5.649785144260282, + "grad_norm": 0.21896767616271973, + "learning_rate": 4.194811342317381e-05, + "loss": 1.7205, + "step": 18407 + }, + { + "epoch": 5.650092081031308, + "grad_norm": 0.20222818851470947, + "learning_rate": 4.1943207777611646e-05, + "loss": 1.6833, + "step": 18408 + }, + { + "epoch": 5.650399017802332, + "grad_norm": 0.2182089239358902, + "learning_rate": 4.193830221167146e-05, + "loss": 1.7296, + "step": 18409 + }, + { + "epoch": 5.650705954573358, + "grad_norm": 0.19981688261032104, + "learning_rate": 4.1933396725401655e-05, + "loss": 1.7327, + "step": 18410 + }, + { + "epoch": 5.651012891344383, + "grad_norm": 0.23925067484378815, + 
"learning_rate": 4.192849131885077e-05, + "loss": 1.7545, + "step": 18411 + }, + { + "epoch": 5.651319828115408, + "grad_norm": 0.21967993676662445, + "learning_rate": 4.192358599206725e-05, + "loss": 1.6973, + "step": 18412 + }, + { + "epoch": 5.651626764886434, + "grad_norm": 0.2273840606212616, + "learning_rate": 4.1918680745099614e-05, + "loss": 1.8229, + "step": 18413 + }, + { + "epoch": 5.651933701657459, + "grad_norm": 0.26950231194496155, + "learning_rate": 4.1913775577996286e-05, + "loss": 1.7666, + "step": 18414 + }, + { + "epoch": 5.652240638428484, + "grad_norm": 0.26608848571777344, + "learning_rate": 4.190887049080579e-05, + "loss": 1.8279, + "step": 18415 + }, + { + "epoch": 5.652547575199509, + "grad_norm": 0.20856785774230957, + "learning_rate": 4.190396548357658e-05, + "loss": 1.7224, + "step": 18416 + }, + { + "epoch": 5.652854511970534, + "grad_norm": 0.2894255816936493, + "learning_rate": 4.18990605563571e-05, + "loss": 1.7308, + "step": 18417 + }, + { + "epoch": 5.653161448741559, + "grad_norm": 0.2047591209411621, + "learning_rate": 4.189415570919588e-05, + "loss": 1.758, + "step": 18418 + }, + { + "epoch": 5.653468385512585, + "grad_norm": 0.37161269783973694, + "learning_rate": 4.1889250942141346e-05, + "loss": 1.7926, + "step": 18419 + }, + { + "epoch": 5.653775322283609, + "grad_norm": 0.37338340282440186, + "learning_rate": 4.1884346255242e-05, + "loss": 1.7491, + "step": 18420 + }, + { + "epoch": 5.6540822590546345, + "grad_norm": 0.24279838800430298, + "learning_rate": 4.187944164854629e-05, + "loss": 1.7103, + "step": 18421 + }, + { + "epoch": 5.65438919582566, + "grad_norm": 0.219639852643013, + "learning_rate": 4.18745371221027e-05, + "loss": 1.7824, + "step": 18422 + }, + { + "epoch": 5.654696132596685, + "grad_norm": 0.22248409688472748, + "learning_rate": 4.186963267595969e-05, + "loss": 1.8098, + "step": 18423 + }, + { + "epoch": 5.6550030693677105, + "grad_norm": 0.2115657478570938, + "learning_rate": 4.1864728310165755e-05, + 
"loss": 1.72, + "step": 18424 + }, + { + "epoch": 5.655310006138736, + "grad_norm": 0.19723005592823029, + "learning_rate": 4.1859824024769325e-05, + "loss": 1.6818, + "step": 18425 + }, + { + "epoch": 5.65561694290976, + "grad_norm": 0.1828317642211914, + "learning_rate": 4.185491981981891e-05, + "loss": 1.7243, + "step": 18426 + }, + { + "epoch": 5.655923879680786, + "grad_norm": 0.271781861782074, + "learning_rate": 4.185001569536292e-05, + "loss": 1.7688, + "step": 18427 + }, + { + "epoch": 5.656230816451811, + "grad_norm": 0.3140811324119568, + "learning_rate": 4.184511165144986e-05, + "loss": 1.7319, + "step": 18428 + }, + { + "epoch": 5.656537753222836, + "grad_norm": 0.20013047754764557, + "learning_rate": 4.184020768812818e-05, + "loss": 1.7104, + "step": 18429 + }, + { + "epoch": 5.656844689993862, + "grad_norm": 0.2615044414997101, + "learning_rate": 4.183530380544638e-05, + "loss": 1.7314, + "step": 18430 + }, + { + "epoch": 5.657151626764886, + "grad_norm": 0.2645856440067291, + "learning_rate": 4.183040000345287e-05, + "loss": 1.7431, + "step": 18431 + }, + { + "epoch": 5.657458563535911, + "grad_norm": 0.1916145384311676, + "learning_rate": 4.182549628219615e-05, + "loss": 1.7013, + "step": 18432 + }, + { + "epoch": 5.657765500306937, + "grad_norm": 0.2647114396095276, + "learning_rate": 4.182059264172466e-05, + "loss": 1.7278, + "step": 18433 + }, + { + "epoch": 5.658072437077962, + "grad_norm": 0.20201756060123444, + "learning_rate": 4.1815689082086854e-05, + "loss": 1.7065, + "step": 18434 + }, + { + "epoch": 5.658379373848987, + "grad_norm": 0.23892022669315338, + "learning_rate": 4.181078560333123e-05, + "loss": 1.7365, + "step": 18435 + }, + { + "epoch": 5.658686310620013, + "grad_norm": 0.3125975728034973, + "learning_rate": 4.18058822055062e-05, + "loss": 1.7152, + "step": 18436 + }, + { + "epoch": 5.658993247391037, + "grad_norm": 0.18924804031848907, + "learning_rate": 4.180097888866027e-05, + "loss": 1.7763, + "step": 18437 + }, + { + 
"epoch": 5.6593001841620625, + "grad_norm": 0.28476929664611816, + "learning_rate": 4.1796075652841845e-05, + "loss": 1.7517, + "step": 18438 + }, + { + "epoch": 5.659607120933088, + "grad_norm": 0.30616337060928345, + "learning_rate": 4.1791172498099416e-05, + "loss": 1.7446, + "step": 18439 + }, + { + "epoch": 5.659914057704113, + "grad_norm": 0.3219330608844757, + "learning_rate": 4.1786269424481426e-05, + "loss": 1.8374, + "step": 18440 + }, + { + "epoch": 5.6602209944751385, + "grad_norm": 0.34074151515960693, + "learning_rate": 4.1781366432036364e-05, + "loss": 1.7915, + "step": 18441 + }, + { + "epoch": 5.660527931246163, + "grad_norm": 0.2321610003709793, + "learning_rate": 4.177646352081263e-05, + "loss": 1.7361, + "step": 18442 + }, + { + "epoch": 5.660834868017188, + "grad_norm": 0.34283575415611267, + "learning_rate": 4.1771560690858716e-05, + "loss": 1.6859, + "step": 18443 + }, + { + "epoch": 5.661141804788214, + "grad_norm": 0.32274290919303894, + "learning_rate": 4.1766657942223055e-05, + "loss": 1.7376, + "step": 18444 + }, + { + "epoch": 5.661448741559239, + "grad_norm": 0.23960906267166138, + "learning_rate": 4.1761755274954105e-05, + "loss": 1.7198, + "step": 18445 + }, + { + "epoch": 5.661755678330264, + "grad_norm": 0.2622305154800415, + "learning_rate": 4.175685268910031e-05, + "loss": 1.6997, + "step": 18446 + }, + { + "epoch": 5.66206261510129, + "grad_norm": 0.19836951792240143, + "learning_rate": 4.1751950184710157e-05, + "loss": 1.6612, + "step": 18447 + }, + { + "epoch": 5.662369551872314, + "grad_norm": 0.29541507363319397, + "learning_rate": 4.174704776183204e-05, + "loss": 1.7606, + "step": 18448 + }, + { + "epoch": 5.662676488643339, + "grad_norm": 0.21632203459739685, + "learning_rate": 4.174214542051445e-05, + "loss": 1.7108, + "step": 18449 + }, + { + "epoch": 5.662983425414365, + "grad_norm": 0.2851164638996124, + "learning_rate": 4.173724316080582e-05, + "loss": 1.747, + "step": 18450 + }, + { + "epoch": 5.66329036218539, + 
"grad_norm": 0.30293309688568115, + "learning_rate": 4.173234098275458e-05, + "loss": 1.7549, + "step": 18451 + }, + { + "epoch": 5.6635972989564145, + "grad_norm": 0.2131963074207306, + "learning_rate": 4.172743888640921e-05, + "loss": 1.7804, + "step": 18452 + }, + { + "epoch": 5.66390423572744, + "grad_norm": 0.234910249710083, + "learning_rate": 4.172253687181812e-05, + "loss": 1.7149, + "step": 18453 + }, + { + "epoch": 5.664211172498465, + "grad_norm": 0.21238654851913452, + "learning_rate": 4.171763493902979e-05, + "loss": 1.7272, + "step": 18454 + }, + { + "epoch": 5.6645181092694905, + "grad_norm": 0.20571236312389374, + "learning_rate": 4.171273308809263e-05, + "loss": 1.713, + "step": 18455 + }, + { + "epoch": 5.664825046040516, + "grad_norm": 0.24867361783981323, + "learning_rate": 4.1707831319055104e-05, + "loss": 1.682, + "step": 18456 + }, + { + "epoch": 5.665131982811541, + "grad_norm": 0.20556440949440002, + "learning_rate": 4.170292963196564e-05, + "loss": 1.7126, + "step": 18457 + }, + { + "epoch": 5.665438919582566, + "grad_norm": 0.26431065797805786, + "learning_rate": 4.169802802687271e-05, + "loss": 1.8142, + "step": 18458 + }, + { + "epoch": 5.665745856353591, + "grad_norm": 0.26041486859321594, + "learning_rate": 4.169312650382471e-05, + "loss": 1.7206, + "step": 18459 + }, + { + "epoch": 5.666052793124616, + "grad_norm": 0.2190525084733963, + "learning_rate": 4.1688225062870126e-05, + "loss": 1.787, + "step": 18460 + }, + { + "epoch": 5.666359729895642, + "grad_norm": 0.24726425111293793, + "learning_rate": 4.1683323704057354e-05, + "loss": 1.7677, + "step": 18461 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.22206442058086395, + "learning_rate": 4.167842242743486e-05, + "loss": 1.73, + "step": 18462 + }, + { + "epoch": 5.666973603437691, + "grad_norm": 0.22501195967197418, + "learning_rate": 4.167352123305108e-05, + "loss": 1.7213, + "step": 18463 + }, + { + "epoch": 5.667280540208717, + "grad_norm": 0.26164770126342773, + 
"learning_rate": 4.166862012095443e-05, + "loss": 1.7839, + "step": 18464 + }, + { + "epoch": 5.667587476979742, + "grad_norm": 0.19480809569358826, + "learning_rate": 4.166371909119336e-05, + "loss": 1.7562, + "step": 18465 + }, + { + "epoch": 5.667894413750767, + "grad_norm": 0.26677292585372925, + "learning_rate": 4.165881814381632e-05, + "loss": 1.776, + "step": 18466 + }, + { + "epoch": 5.668201350521793, + "grad_norm": 0.22019581496715546, + "learning_rate": 4.165391727887172e-05, + "loss": 1.7575, + "step": 18467 + }, + { + "epoch": 5.668508287292818, + "grad_norm": 0.23851899802684784, + "learning_rate": 4.1649016496407986e-05, + "loss": 1.7346, + "step": 18468 + }, + { + "epoch": 5.6688152240638425, + "grad_norm": 0.3118130564689636, + "learning_rate": 4.1644115796473596e-05, + "loss": 1.7808, + "step": 18469 + }, + { + "epoch": 5.669122160834868, + "grad_norm": 0.22783879935741425, + "learning_rate": 4.163921517911692e-05, + "loss": 1.831, + "step": 18470 + }, + { + "epoch": 5.669429097605893, + "grad_norm": 0.2203773707151413, + "learning_rate": 4.163431464438645e-05, + "loss": 1.7034, + "step": 18471 + }, + { + "epoch": 5.6697360343769185, + "grad_norm": 0.21838103234767914, + "learning_rate": 4.162941419233056e-05, + "loss": 1.7553, + "step": 18472 + }, + { + "epoch": 5.670042971147944, + "grad_norm": 0.18453563749790192, + "learning_rate": 4.162451382299771e-05, + "loss": 1.7139, + "step": 18473 + }, + { + "epoch": 5.670349907918968, + "grad_norm": 0.25308313965797424, + "learning_rate": 4.161961353643633e-05, + "loss": 1.7291, + "step": 18474 + }, + { + "epoch": 5.670656844689994, + "grad_norm": 0.2528827488422394, + "learning_rate": 4.1614713332694845e-05, + "loss": 1.781, + "step": 18475 + }, + { + "epoch": 5.670963781461019, + "grad_norm": 0.24774135649204254, + "learning_rate": 4.160981321182166e-05, + "loss": 1.7808, + "step": 18476 + }, + { + "epoch": 5.671270718232044, + "grad_norm": 0.25225830078125, + "learning_rate": 4.160491317386524e-05, 
+ "loss": 1.739, + "step": 18477 + }, + { + "epoch": 5.67157765500307, + "grad_norm": 0.2095808982849121, + "learning_rate": 4.160001321887397e-05, + "loss": 1.7242, + "step": 18478 + }, + { + "epoch": 5.671884591774095, + "grad_norm": 0.23906216025352478, + "learning_rate": 4.159511334689631e-05, + "loss": 1.7071, + "step": 18479 + }, + { + "epoch": 5.672191528545119, + "grad_norm": 0.21851155161857605, + "learning_rate": 4.159021355798065e-05, + "loss": 1.7171, + "step": 18480 + }, + { + "epoch": 5.672498465316145, + "grad_norm": 0.2005140632390976, + "learning_rate": 4.158531385217544e-05, + "loss": 1.7483, + "step": 18481 + }, + { + "epoch": 5.67280540208717, + "grad_norm": 0.2230832278728485, + "learning_rate": 4.1580414229529074e-05, + "loss": 1.7386, + "step": 18482 + }, + { + "epoch": 5.673112338858195, + "grad_norm": 0.22402967512607574, + "learning_rate": 4.1575514690090014e-05, + "loss": 1.7989, + "step": 18483 + }, + { + "epoch": 5.67341927562922, + "grad_norm": 0.20350080728530884, + "learning_rate": 4.157061523390665e-05, + "loss": 1.6856, + "step": 18484 + }, + { + "epoch": 5.673726212400245, + "grad_norm": 0.2039422243833542, + "learning_rate": 4.15657158610274e-05, + "loss": 1.7262, + "step": 18485 + }, + { + "epoch": 5.6740331491712706, + "grad_norm": 0.20411522686481476, + "learning_rate": 4.156081657150069e-05, + "loss": 1.738, + "step": 18486 + }, + { + "epoch": 5.674340085942296, + "grad_norm": 0.2693086862564087, + "learning_rate": 4.155591736537493e-05, + "loss": 1.731, + "step": 18487 + }, + { + "epoch": 5.674647022713321, + "grad_norm": 0.20745019614696503, + "learning_rate": 4.1551018242698567e-05, + "loss": 1.7138, + "step": 18488 + }, + { + "epoch": 5.6749539594843466, + "grad_norm": 0.22033964097499847, + "learning_rate": 4.1546119203519964e-05, + "loss": 1.8144, + "step": 18489 + }, + { + "epoch": 5.675260896255372, + "grad_norm": 0.22859029471874237, + "learning_rate": 4.154122024788759e-05, + "loss": 1.6724, + "step": 18490 + }, + { 
+ "epoch": 5.675567833026396, + "grad_norm": 0.2226465791463852, + "learning_rate": 4.153632137584982e-05, + "loss": 1.731, + "step": 18491 + }, + { + "epoch": 5.675874769797422, + "grad_norm": 0.19657716155052185, + "learning_rate": 4.1531422587455086e-05, + "loss": 1.6937, + "step": 18492 + }, + { + "epoch": 5.676181706568447, + "grad_norm": 0.23167578876018524, + "learning_rate": 4.152652388275179e-05, + "loss": 1.7444, + "step": 18493 + }, + { + "epoch": 5.676488643339472, + "grad_norm": 0.24468563497066498, + "learning_rate": 4.1521625261788374e-05, + "loss": 1.7173, + "step": 18494 + }, + { + "epoch": 5.676795580110497, + "grad_norm": 0.27125802636146545, + "learning_rate": 4.1516726724613206e-05, + "loss": 1.7424, + "step": 18495 + }, + { + "epoch": 5.677102516881522, + "grad_norm": 0.23816901445388794, + "learning_rate": 4.151182827127473e-05, + "loss": 1.6911, + "step": 18496 + }, + { + "epoch": 5.6774094536525475, + "grad_norm": 0.26058733463287354, + "learning_rate": 4.150692990182133e-05, + "loss": 1.7142, + "step": 18497 + }, + { + "epoch": 5.677716390423573, + "grad_norm": 0.20207929611206055, + "learning_rate": 4.150203161630143e-05, + "loss": 1.7506, + "step": 18498 + }, + { + "epoch": 5.678023327194598, + "grad_norm": 0.259857714176178, + "learning_rate": 4.1497133414763435e-05, + "loss": 1.7181, + "step": 18499 + }, + { + "epoch": 5.6783302639656235, + "grad_norm": 0.2607496380805969, + "learning_rate": 4.149223529725577e-05, + "loss": 1.7829, + "step": 18500 + }, + { + "epoch": 5.678637200736648, + "grad_norm": 0.23265719413757324, + "learning_rate": 4.148733726382681e-05, + "loss": 1.7028, + "step": 18501 + }, + { + "epoch": 5.678944137507673, + "grad_norm": 0.26610276103019714, + "learning_rate": 4.1482439314524964e-05, + "loss": 1.8604, + "step": 18502 + }, + { + "epoch": 5.679251074278699, + "grad_norm": 0.24022582173347473, + "learning_rate": 4.147754144939865e-05, + "loss": 1.7142, + "step": 18503 + }, + { + "epoch": 5.679558011049724, + 
"grad_norm": 0.2849755585193634, + "learning_rate": 4.1472643668496255e-05, + "loss": 1.6956, + "step": 18504 + }, + { + "epoch": 5.679864947820749, + "grad_norm": 0.24330341815948486, + "learning_rate": 4.1467745971866216e-05, + "loss": 1.7617, + "step": 18505 + }, + { + "epoch": 5.680171884591774, + "grad_norm": 0.21072770655155182, + "learning_rate": 4.146284835955689e-05, + "loss": 1.6999, + "step": 18506 + }, + { + "epoch": 5.680478821362799, + "grad_norm": 0.1971336454153061, + "learning_rate": 4.145795083161673e-05, + "loss": 1.6756, + "step": 18507 + }, + { + "epoch": 5.680785758133824, + "grad_norm": 0.18576614558696747, + "learning_rate": 4.1453053388094073e-05, + "loss": 1.6885, + "step": 18508 + }, + { + "epoch": 5.68109269490485, + "grad_norm": 0.21335965394973755, + "learning_rate": 4.144815602903737e-05, + "loss": 1.7278, + "step": 18509 + }, + { + "epoch": 5.681399631675875, + "grad_norm": 0.21756233274936676, + "learning_rate": 4.1443258754494986e-05, + "loss": 1.7549, + "step": 18510 + }, + { + "epoch": 5.6817065684469, + "grad_norm": 0.2214142084121704, + "learning_rate": 4.143836156451536e-05, + "loss": 1.6654, + "step": 18511 + }, + { + "epoch": 5.682013505217925, + "grad_norm": 0.2230863869190216, + "learning_rate": 4.143346445914684e-05, + "loss": 1.7286, + "step": 18512 + }, + { + "epoch": 5.68232044198895, + "grad_norm": 0.2283746749162674, + "learning_rate": 4.142856743843787e-05, + "loss": 1.7652, + "step": 18513 + }, + { + "epoch": 5.6826273787599755, + "grad_norm": 0.20059749484062195, + "learning_rate": 4.142367050243679e-05, + "loss": 1.6854, + "step": 18514 + }, + { + "epoch": 5.682934315531001, + "grad_norm": 0.17887794971466064, + "learning_rate": 4.141877365119204e-05, + "loss": 1.6975, + "step": 18515 + }, + { + "epoch": 5.683241252302026, + "grad_norm": 0.21266087889671326, + "learning_rate": 4.141387688475199e-05, + "loss": 1.7361, + "step": 18516 + }, + { + "epoch": 5.683548189073051, + "grad_norm": 0.20075422525405884, + 
"learning_rate": 4.140898020316506e-05, + "loss": 1.7496, + "step": 18517 + }, + { + "epoch": 5.683855125844076, + "grad_norm": 0.21430443227291107, + "learning_rate": 4.140408360647963e-05, + "loss": 1.7481, + "step": 18518 + }, + { + "epoch": 5.684162062615101, + "grad_norm": 0.1951984018087387, + "learning_rate": 4.139918709474405e-05, + "loss": 1.713, + "step": 18519 + }, + { + "epoch": 5.684468999386127, + "grad_norm": 0.21636274456977844, + "learning_rate": 4.1394290668006764e-05, + "loss": 1.8169, + "step": 18520 + }, + { + "epoch": 5.684775936157152, + "grad_norm": 0.21003715693950653, + "learning_rate": 4.138939432631613e-05, + "loss": 1.7453, + "step": 18521 + }, + { + "epoch": 5.685082872928177, + "grad_norm": 0.23559699952602386, + "learning_rate": 4.138449806972057e-05, + "loss": 1.7534, + "step": 18522 + }, + { + "epoch": 5.685389809699202, + "grad_norm": 0.23322029411792755, + "learning_rate": 4.137960189826843e-05, + "loss": 1.7535, + "step": 18523 + }, + { + "epoch": 5.685696746470227, + "grad_norm": 0.1998462826013565, + "learning_rate": 4.137470581200813e-05, + "loss": 1.7025, + "step": 18524 + }, + { + "epoch": 5.686003683241252, + "grad_norm": 0.22321350872516632, + "learning_rate": 4.1369809810988025e-05, + "loss": 1.7666, + "step": 18525 + }, + { + "epoch": 5.686310620012278, + "grad_norm": 0.20851604640483856, + "learning_rate": 4.136491389525653e-05, + "loss": 1.6958, + "step": 18526 + }, + { + "epoch": 5.686617556783302, + "grad_norm": 0.21494868397712708, + "learning_rate": 4.136001806486201e-05, + "loss": 1.7703, + "step": 18527 + }, + { + "epoch": 5.6869244935543275, + "grad_norm": 0.19872798025608063, + "learning_rate": 4.135512231985287e-05, + "loss": 1.7451, + "step": 18528 + }, + { + "epoch": 5.687231430325353, + "grad_norm": 0.2424371987581253, + "learning_rate": 4.1350226660277456e-05, + "loss": 1.8153, + "step": 18529 + }, + { + "epoch": 5.687538367096378, + "grad_norm": 0.20388297736644745, + "learning_rate": 
4.1345331086184196e-05, + "loss": 1.6882, + "step": 18530 + }, + { + "epoch": 5.6878453038674035, + "grad_norm": 0.22662605345249176, + "learning_rate": 4.134043559762143e-05, + "loss": 1.7532, + "step": 18531 + }, + { + "epoch": 5.688152240638429, + "grad_norm": 0.2281452864408493, + "learning_rate": 4.133554019463756e-05, + "loss": 1.769, + "step": 18532 + }, + { + "epoch": 5.688459177409453, + "grad_norm": 0.2303505390882492, + "learning_rate": 4.1330644877280955e-05, + "loss": 1.7176, + "step": 18533 + }, + { + "epoch": 5.688766114180479, + "grad_norm": 0.24411743879318237, + "learning_rate": 4.132574964560001e-05, + "loss": 1.7557, + "step": 18534 + }, + { + "epoch": 5.689073050951504, + "grad_norm": 0.2674088776111603, + "learning_rate": 4.13208544996431e-05, + "loss": 1.6997, + "step": 18535 + }, + { + "epoch": 5.689379987722529, + "grad_norm": 0.22232958674430847, + "learning_rate": 4.1315959439458565e-05, + "loss": 1.7731, + "step": 18536 + }, + { + "epoch": 5.689686924493555, + "grad_norm": 0.23894453048706055, + "learning_rate": 4.131106446509483e-05, + "loss": 1.7454, + "step": 18537 + }, + { + "epoch": 5.689993861264579, + "grad_norm": 0.19710026681423187, + "learning_rate": 4.1306169576600226e-05, + "loss": 1.6872, + "step": 18538 + }, + { + "epoch": 5.690300798035604, + "grad_norm": 0.1879546344280243, + "learning_rate": 4.130127477402318e-05, + "loss": 1.6929, + "step": 18539 + }, + { + "epoch": 5.69060773480663, + "grad_norm": 0.1964653730392456, + "learning_rate": 4.129638005741201e-05, + "loss": 1.7778, + "step": 18540 + }, + { + "epoch": 5.690914671577655, + "grad_norm": 0.20161493122577667, + "learning_rate": 4.129148542681513e-05, + "loss": 1.7388, + "step": 18541 + }, + { + "epoch": 5.69122160834868, + "grad_norm": 0.26742830872535706, + "learning_rate": 4.1286590882280886e-05, + "loss": 1.7472, + "step": 18542 + }, + { + "epoch": 5.691528545119706, + "grad_norm": 0.2613312900066376, + "learning_rate": 4.128169642385766e-05, + "loss": 1.7656, 
+ "step": 18543 + }, + { + "epoch": 5.69183548189073, + "grad_norm": 0.17979474365711212, + "learning_rate": 4.127680205159381e-05, + "loss": 1.6992, + "step": 18544 + }, + { + "epoch": 5.6921424186617555, + "grad_norm": 0.23575037717819214, + "learning_rate": 4.1271907765537745e-05, + "loss": 1.7399, + "step": 18545 + }, + { + "epoch": 5.692449355432781, + "grad_norm": 0.19461458921432495, + "learning_rate": 4.126701356573777e-05, + "loss": 1.709, + "step": 18546 + }, + { + "epoch": 5.692756292203806, + "grad_norm": 0.19715365767478943, + "learning_rate": 4.1262119452242306e-05, + "loss": 1.7634, + "step": 18547 + }, + { + "epoch": 5.6930632289748315, + "grad_norm": 0.21454904973506927, + "learning_rate": 4.125722542509969e-05, + "loss": 1.7663, + "step": 18548 + }, + { + "epoch": 5.693370165745856, + "grad_norm": 0.19884896278381348, + "learning_rate": 4.12523314843583e-05, + "loss": 1.7618, + "step": 18549 + }, + { + "epoch": 5.693677102516881, + "grad_norm": 0.2080020159482956, + "learning_rate": 4.124743763006648e-05, + "loss": 1.7379, + "step": 18550 + }, + { + "epoch": 5.693984039287907, + "grad_norm": 0.18780875205993652, + "learning_rate": 4.124254386227264e-05, + "loss": 1.7036, + "step": 18551 + }, + { + "epoch": 5.694290976058932, + "grad_norm": 0.2114439308643341, + "learning_rate": 4.123765018102512e-05, + "loss": 1.6873, + "step": 18552 + }, + { + "epoch": 5.694597912829957, + "grad_norm": 0.1712789535522461, + "learning_rate": 4.123275658637225e-05, + "loss": 1.6772, + "step": 18553 + }, + { + "epoch": 5.694904849600983, + "grad_norm": 0.2435859888792038, + "learning_rate": 4.122786307836243e-05, + "loss": 1.7946, + "step": 18554 + }, + { + "epoch": 5.695211786372007, + "grad_norm": 0.20587889850139618, + "learning_rate": 4.122296965704399e-05, + "loss": 1.7459, + "step": 18555 + }, + { + "epoch": 5.695518723143032, + "grad_norm": 0.2183443009853363, + "learning_rate": 4.121807632246534e-05, + "loss": 1.7036, + "step": 18556 + }, + { + "epoch": 
5.695825659914058, + "grad_norm": 0.19276869297027588, + "learning_rate": 4.121318307467478e-05, + "loss": 1.7371, + "step": 18557 + }, + { + "epoch": 5.696132596685083, + "grad_norm": 0.19815512001514435, + "learning_rate": 4.120828991372072e-05, + "loss": 1.7038, + "step": 18558 + }, + { + "epoch": 5.696439533456108, + "grad_norm": 0.18509675562381744, + "learning_rate": 4.120339683965146e-05, + "loss": 1.6936, + "step": 18559 + }, + { + "epoch": 5.696746470227133, + "grad_norm": 0.2296193689107895, + "learning_rate": 4.1198503852515416e-05, + "loss": 1.7626, + "step": 18560 + }, + { + "epoch": 5.697053406998158, + "grad_norm": 0.2064799964427948, + "learning_rate": 4.11936109523609e-05, + "loss": 1.7387, + "step": 18561 + }, + { + "epoch": 5.6973603437691835, + "grad_norm": 0.20171360671520233, + "learning_rate": 4.1188718139236296e-05, + "loss": 1.7372, + "step": 18562 + }, + { + "epoch": 5.697667280540209, + "grad_norm": 0.19421936571598053, + "learning_rate": 4.118382541318993e-05, + "loss": 1.7187, + "step": 18563 + }, + { + "epoch": 5.697974217311234, + "grad_norm": 0.22517532110214233, + "learning_rate": 4.117893277427018e-05, + "loss": 1.7503, + "step": 18564 + }, + { + "epoch": 5.6982811540822595, + "grad_norm": 0.2293393909931183, + "learning_rate": 4.1174040222525366e-05, + "loss": 1.7174, + "step": 18565 + }, + { + "epoch": 5.698588090853284, + "grad_norm": 0.24003073573112488, + "learning_rate": 4.1169147758003876e-05, + "loss": 1.7829, + "step": 18566 + }, + { + "epoch": 5.698895027624309, + "grad_norm": 0.21476133167743683, + "learning_rate": 4.1164255380754034e-05, + "loss": 1.7906, + "step": 18567 + }, + { + "epoch": 5.699201964395335, + "grad_norm": 0.21347576379776, + "learning_rate": 4.115936309082422e-05, + "loss": 1.6986, + "step": 18568 + }, + { + "epoch": 5.69950890116636, + "grad_norm": 0.22650402784347534, + "learning_rate": 4.115447088826276e-05, + "loss": 1.7949, + "step": 18569 + }, + { + "epoch": 5.699815837937384, + "grad_norm": 
0.25815197825431824, + "learning_rate": 4.114957877311799e-05, + "loss": 1.7499, + "step": 18570 + }, + { + "epoch": 5.70012277470841, + "grad_norm": 0.22644442319869995, + "learning_rate": 4.1144686745438265e-05, + "loss": 1.7689, + "step": 18571 + }, + { + "epoch": 5.700429711479435, + "grad_norm": 0.241188645362854, + "learning_rate": 4.113979480527194e-05, + "loss": 1.7341, + "step": 18572 + }, + { + "epoch": 5.7007366482504604, + "grad_norm": 0.20984862744808197, + "learning_rate": 4.1134902952667365e-05, + "loss": 1.7091, + "step": 18573 + }, + { + "epoch": 5.701043585021486, + "grad_norm": 0.25150877237319946, + "learning_rate": 4.113001118767286e-05, + "loss": 1.723, + "step": 18574 + }, + { + "epoch": 5.701350521792511, + "grad_norm": 0.21693028509616852, + "learning_rate": 4.1125119510336804e-05, + "loss": 1.7483, + "step": 18575 + }, + { + "epoch": 5.701657458563536, + "grad_norm": 0.2620212733745575, + "learning_rate": 4.11202279207075e-05, + "loss": 1.8159, + "step": 18576 + }, + { + "epoch": 5.701964395334561, + "grad_norm": 0.18722239136695862, + "learning_rate": 4.111533641883332e-05, + "loss": 1.7197, + "step": 18577 + }, + { + "epoch": 5.702271332105586, + "grad_norm": 0.21321091055870056, + "learning_rate": 4.111044500476258e-05, + "loss": 1.7408, + "step": 18578 + }, + { + "epoch": 5.702578268876612, + "grad_norm": 0.24459265172481537, + "learning_rate": 4.110555367854365e-05, + "loss": 1.8304, + "step": 18579 + }, + { + "epoch": 5.702885205647637, + "grad_norm": 0.24987100064754486, + "learning_rate": 4.110066244022483e-05, + "loss": 1.7051, + "step": 18580 + }, + { + "epoch": 5.703192142418661, + "grad_norm": 0.19059090316295624, + "learning_rate": 4.1095771289854506e-05, + "loss": 1.7489, + "step": 18581 + }, + { + "epoch": 5.703499079189687, + "grad_norm": 0.23020480573177338, + "learning_rate": 4.1090880227480966e-05, + "loss": 1.7101, + "step": 18582 + }, + { + "epoch": 5.703806015960712, + "grad_norm": 0.18733634054660797, + 
"learning_rate": 4.108598925315258e-05, + "loss": 1.7116, + "step": 18583 + }, + { + "epoch": 5.704112952731737, + "grad_norm": 0.1959095001220703, + "learning_rate": 4.108109836691766e-05, + "loss": 1.7283, + "step": 18584 + }, + { + "epoch": 5.704419889502763, + "grad_norm": 0.22685091197490692, + "learning_rate": 4.107620756882457e-05, + "loss": 1.7588, + "step": 18585 + }, + { + "epoch": 5.704726826273788, + "grad_norm": 0.1998603790998459, + "learning_rate": 4.107131685892164e-05, + "loss": 1.7071, + "step": 18586 + }, + { + "epoch": 5.7050337630448125, + "grad_norm": 0.2018733024597168, + "learning_rate": 4.106642623725717e-05, + "loss": 1.6782, + "step": 18587 + }, + { + "epoch": 5.705340699815838, + "grad_norm": 0.21826615929603577, + "learning_rate": 4.106153570387951e-05, + "loss": 1.736, + "step": 18588 + }, + { + "epoch": 5.705647636586863, + "grad_norm": 0.20197603106498718, + "learning_rate": 4.105664525883699e-05, + "loss": 1.6921, + "step": 18589 + }, + { + "epoch": 5.7059545733578885, + "grad_norm": 0.20943905413150787, + "learning_rate": 4.105175490217796e-05, + "loss": 1.665, + "step": 18590 + }, + { + "epoch": 5.706261510128914, + "grad_norm": 0.202060267329216, + "learning_rate": 4.104686463395071e-05, + "loss": 1.714, + "step": 18591 + }, + { + "epoch": 5.706568446899938, + "grad_norm": 0.220698744058609, + "learning_rate": 4.1041974454203623e-05, + "loss": 1.8076, + "step": 18592 + }, + { + "epoch": 5.706875383670964, + "grad_norm": 0.21536946296691895, + "learning_rate": 4.103708436298497e-05, + "loss": 1.6801, + "step": 18593 + }, + { + "epoch": 5.707182320441989, + "grad_norm": 0.21442468464374542, + "learning_rate": 4.103219436034311e-05, + "loss": 1.6921, + "step": 18594 + }, + { + "epoch": 5.707489257213014, + "grad_norm": 0.2047559767961502, + "learning_rate": 4.1027304446326356e-05, + "loss": 1.7861, + "step": 18595 + }, + { + "epoch": 5.70779619398404, + "grad_norm": 0.20304669439792633, + "learning_rate": 4.102241462098305e-05, + 
"loss": 1.7751, + "step": 18596 + }, + { + "epoch": 5.708103130755065, + "grad_norm": 0.18702620267868042, + "learning_rate": 4.101752488436149e-05, + "loss": 1.6951, + "step": 18597 + }, + { + "epoch": 5.708410067526089, + "grad_norm": 0.1821923404932022, + "learning_rate": 4.1012635236510034e-05, + "loss": 1.711, + "step": 18598 + }, + { + "epoch": 5.708717004297115, + "grad_norm": 0.19422096014022827, + "learning_rate": 4.100774567747696e-05, + "loss": 1.7202, + "step": 18599 + }, + { + "epoch": 5.70902394106814, + "grad_norm": 0.20800530910491943, + "learning_rate": 4.100285620731063e-05, + "loss": 1.7403, + "step": 18600 + }, + { + "epoch": 5.709330877839165, + "grad_norm": 0.221746027469635, + "learning_rate": 4.099796682605934e-05, + "loss": 1.7769, + "step": 18601 + }, + { + "epoch": 5.70963781461019, + "grad_norm": 0.19284313917160034, + "learning_rate": 4.099307753377143e-05, + "loss": 1.692, + "step": 18602 + }, + { + "epoch": 5.709944751381215, + "grad_norm": 0.17635129392147064, + "learning_rate": 4.0988188330495216e-05, + "loss": 1.7212, + "step": 18603 + }, + { + "epoch": 5.7102516881522405, + "grad_norm": 0.17728061974048615, + "learning_rate": 4.098329921627898e-05, + "loss": 1.7217, + "step": 18604 + }, + { + "epoch": 5.710558624923266, + "grad_norm": 0.19998152554035187, + "learning_rate": 4.097841019117108e-05, + "loss": 1.7583, + "step": 18605 + }, + { + "epoch": 5.710865561694291, + "grad_norm": 0.18840095400810242, + "learning_rate": 4.09735212552198e-05, + "loss": 1.7353, + "step": 18606 + }, + { + "epoch": 5.7111724984653165, + "grad_norm": 0.2528367042541504, + "learning_rate": 4.09686324084735e-05, + "loss": 1.7576, + "step": 18607 + }, + { + "epoch": 5.711479435236341, + "grad_norm": 0.27240338921546936, + "learning_rate": 4.096374365098045e-05, + "loss": 1.7303, + "step": 18608 + }, + { + "epoch": 5.711786372007366, + "grad_norm": 0.20187151432037354, + "learning_rate": 4.0958854982789e-05, + "loss": 1.7599, + "step": 18609 + }, + { + 
"epoch": 5.712093308778392, + "grad_norm": 0.24890528619289398, + "learning_rate": 4.095396640394742e-05, + "loss": 1.7737, + "step": 18610 + }, + { + "epoch": 5.712400245549417, + "grad_norm": 0.21524454653263092, + "learning_rate": 4.094907791450406e-05, + "loss": 1.7704, + "step": 18611 + }, + { + "epoch": 5.712707182320442, + "grad_norm": 0.20070379972457886, + "learning_rate": 4.094418951450721e-05, + "loss": 1.7358, + "step": 18612 + }, + { + "epoch": 5.713014119091467, + "grad_norm": 0.2252196967601776, + "learning_rate": 4.09393012040052e-05, + "loss": 1.7262, + "step": 18613 + }, + { + "epoch": 5.713321055862492, + "grad_norm": 0.19511987268924713, + "learning_rate": 4.093441298304631e-05, + "loss": 1.7146, + "step": 18614 + }, + { + "epoch": 5.713627992633517, + "grad_norm": 0.2047072798013687, + "learning_rate": 4.092952485167888e-05, + "loss": 1.7864, + "step": 18615 + }, + { + "epoch": 5.713934929404543, + "grad_norm": 0.21794871985912323, + "learning_rate": 4.092463680995119e-05, + "loss": 1.7759, + "step": 18616 + }, + { + "epoch": 5.714241866175568, + "grad_norm": 0.23863841593265533, + "learning_rate": 4.0919748857911566e-05, + "loss": 1.7207, + "step": 18617 + }, + { + "epoch": 5.714548802946593, + "grad_norm": 0.19706958532333374, + "learning_rate": 4.09148609956083e-05, + "loss": 1.7247, + "step": 18618 + }, + { + "epoch": 5.714855739717618, + "grad_norm": 0.23663771152496338, + "learning_rate": 4.090997322308971e-05, + "loss": 1.7929, + "step": 18619 + }, + { + "epoch": 5.715162676488643, + "grad_norm": 0.23079079389572144, + "learning_rate": 4.09050855404041e-05, + "loss": 1.763, + "step": 18620 + }, + { + "epoch": 5.7154696132596685, + "grad_norm": 0.23883379995822906, + "learning_rate": 4.0900197947599736e-05, + "loss": 1.7995, + "step": 18621 + }, + { + "epoch": 5.715776550030694, + "grad_norm": 0.2125123143196106, + "learning_rate": 4.0895310444724974e-05, + "loss": 1.8045, + "step": 18622 + }, + { + "epoch": 5.716083486801719, + 
"grad_norm": 0.21062424778938293, + "learning_rate": 4.0890423031828076e-05, + "loss": 1.7348, + "step": 18623 + }, + { + "epoch": 5.716390423572744, + "grad_norm": 0.24079614877700806, + "learning_rate": 4.088553570895737e-05, + "loss": 1.7462, + "step": 18624 + }, + { + "epoch": 5.716697360343769, + "grad_norm": 0.2120666354894638, + "learning_rate": 4.088064847616113e-05, + "loss": 1.7235, + "step": 18625 + }, + { + "epoch": 5.717004297114794, + "grad_norm": 0.19663050770759583, + "learning_rate": 4.0875761333487685e-05, + "loss": 1.6743, + "step": 18626 + }, + { + "epoch": 5.71731123388582, + "grad_norm": 0.24010685086250305, + "learning_rate": 4.0870874280985295e-05, + "loss": 1.6742, + "step": 18627 + }, + { + "epoch": 5.717618170656845, + "grad_norm": 0.22140294313430786, + "learning_rate": 4.086598731870228e-05, + "loss": 1.7601, + "step": 18628 + }, + { + "epoch": 5.71792510742787, + "grad_norm": 0.2876693308353424, + "learning_rate": 4.086110044668694e-05, + "loss": 1.7601, + "step": 18629 + }, + { + "epoch": 5.718232044198895, + "grad_norm": 0.3103853464126587, + "learning_rate": 4.085621366498756e-05, + "loss": 1.6824, + "step": 18630 + }, + { + "epoch": 5.71853898096992, + "grad_norm": 0.18194396793842316, + "learning_rate": 4.0851326973652424e-05, + "loss": 1.6976, + "step": 18631 + }, + { + "epoch": 5.718845917740945, + "grad_norm": 0.28400903940200806, + "learning_rate": 4.0846440372729854e-05, + "loss": 1.7352, + "step": 18632 + }, + { + "epoch": 5.719152854511971, + "grad_norm": 0.23753583431243896, + "learning_rate": 4.084155386226811e-05, + "loss": 1.7418, + "step": 18633 + }, + { + "epoch": 5.719459791282996, + "grad_norm": 0.215620756149292, + "learning_rate": 4.0836667442315514e-05, + "loss": 1.7602, + "step": 18634 + }, + { + "epoch": 5.7197667280540205, + "grad_norm": 0.21057941019535065, + "learning_rate": 4.083178111292034e-05, + "loss": 1.6818, + "step": 18635 + }, + { + "epoch": 5.720073664825046, + "grad_norm": 0.2169445902109146, + 
"learning_rate": 4.0826894874130863e-05, + "loss": 1.7942, + "step": 18636 + }, + { + "epoch": 5.720380601596071, + "grad_norm": 0.2779453992843628, + "learning_rate": 4.082200872599541e-05, + "loss": 1.7432, + "step": 18637 + }, + { + "epoch": 5.7206875383670965, + "grad_norm": 0.22556698322296143, + "learning_rate": 4.0817122668562224e-05, + "loss": 1.7748, + "step": 18638 + }, + { + "epoch": 5.720994475138122, + "grad_norm": 0.2570365071296692, + "learning_rate": 4.081223670187962e-05, + "loss": 1.7314, + "step": 18639 + }, + { + "epoch": 5.721301411909147, + "grad_norm": 0.266176700592041, + "learning_rate": 4.080735082599588e-05, + "loss": 1.689, + "step": 18640 + }, + { + "epoch": 5.721608348680172, + "grad_norm": 0.20190037786960602, + "learning_rate": 4.080246504095929e-05, + "loss": 1.7467, + "step": 18641 + }, + { + "epoch": 5.721915285451197, + "grad_norm": 0.2498215138912201, + "learning_rate": 4.079757934681813e-05, + "loss": 1.7063, + "step": 18642 + }, + { + "epoch": 5.722222222222222, + "grad_norm": 0.25594204664230347, + "learning_rate": 4.0792693743620695e-05, + "loss": 1.7096, + "step": 18643 + }, + { + "epoch": 5.722529158993248, + "grad_norm": 0.22674626111984253, + "learning_rate": 4.0787808231415233e-05, + "loss": 1.715, + "step": 18644 + }, + { + "epoch": 5.722836095764272, + "grad_norm": 0.267140656709671, + "learning_rate": 4.078292281025007e-05, + "loss": 1.7747, + "step": 18645 + }, + { + "epoch": 5.723143032535297, + "grad_norm": 0.21161147952079773, + "learning_rate": 4.077803748017345e-05, + "loss": 1.7312, + "step": 18646 + }, + { + "epoch": 5.723449969306323, + "grad_norm": 0.2580260634422302, + "learning_rate": 4.077315224123368e-05, + "loss": 1.7246, + "step": 18647 + }, + { + "epoch": 5.723756906077348, + "grad_norm": 0.23766927421092987, + "learning_rate": 4.076826709347902e-05, + "loss": 1.7147, + "step": 18648 + }, + { + "epoch": 5.724063842848373, + "grad_norm": 0.22764286398887634, + "learning_rate": 4.076338203695776e-05, + 
"loss": 1.7034, + "step": 18649 + }, + { + "epoch": 5.724370779619399, + "grad_norm": 0.28205159306526184, + "learning_rate": 4.075849707171817e-05, + "loss": 1.7472, + "step": 18650 + }, + { + "epoch": 5.724677716390423, + "grad_norm": 0.2091183066368103, + "learning_rate": 4.075361219780854e-05, + "loss": 1.7693, + "step": 18651 + }, + { + "epoch": 5.7249846531614486, + "grad_norm": 0.29513829946517944, + "learning_rate": 4.074872741527713e-05, + "loss": 1.7286, + "step": 18652 + }, + { + "epoch": 5.725291589932474, + "grad_norm": 0.226357102394104, + "learning_rate": 4.07438427241722e-05, + "loss": 1.7658, + "step": 18653 + }, + { + "epoch": 5.725598526703499, + "grad_norm": 0.23732580244541168, + "learning_rate": 4.073895812454207e-05, + "loss": 1.7591, + "step": 18654 + }, + { + "epoch": 5.725905463474525, + "grad_norm": 0.2835488021373749, + "learning_rate": 4.0734073616434956e-05, + "loss": 1.757, + "step": 18655 + }, + { + "epoch": 5.726212400245549, + "grad_norm": 0.1986306756734848, + "learning_rate": 4.0729189199899186e-05, + "loss": 1.714, + "step": 18656 + }, + { + "epoch": 5.726519337016574, + "grad_norm": 0.25071820616722107, + "learning_rate": 4.072430487498298e-05, + "loss": 1.7334, + "step": 18657 + }, + { + "epoch": 5.7268262737876, + "grad_norm": 0.19989889860153198, + "learning_rate": 4.0719420641734634e-05, + "loss": 1.7472, + "step": 18658 + }, + { + "epoch": 5.727133210558625, + "grad_norm": 0.30006101727485657, + "learning_rate": 4.071453650020241e-05, + "loss": 1.7846, + "step": 18659 + }, + { + "epoch": 5.72744014732965, + "grad_norm": 0.19856922328472137, + "learning_rate": 4.070965245043459e-05, + "loss": 1.6965, + "step": 18660 + }, + { + "epoch": 5.727747084100676, + "grad_norm": 0.20139823853969574, + "learning_rate": 4.070476849247941e-05, + "loss": 1.7265, + "step": 18661 + }, + { + "epoch": 5.7280540208717, + "grad_norm": 0.21507953107357025, + "learning_rate": 4.0699884626385184e-05, + "loss": 1.762, + "step": 18662 + }, + { + 
"epoch": 5.7283609576427255, + "grad_norm": 0.1885843127965927, + "learning_rate": 4.069500085220013e-05, + "loss": 1.6721, + "step": 18663 + }, + { + "epoch": 5.728667894413751, + "grad_norm": 0.2076897919178009, + "learning_rate": 4.069011716997253e-05, + "loss": 1.7399, + "step": 18664 + }, + { + "epoch": 5.728974831184776, + "grad_norm": 0.21482045948505402, + "learning_rate": 4.068523357975065e-05, + "loss": 1.7105, + "step": 18665 + }, + { + "epoch": 5.7292817679558015, + "grad_norm": 0.20438800752162933, + "learning_rate": 4.0680350081582765e-05, + "loss": 1.7408, + "step": 18666 + }, + { + "epoch": 5.729588704726826, + "grad_norm": 0.2137845903635025, + "learning_rate": 4.0675466675517104e-05, + "loss": 1.7814, + "step": 18667 + }, + { + "epoch": 5.729895641497851, + "grad_norm": 0.23009657859802246, + "learning_rate": 4.067058336160197e-05, + "loss": 1.7311, + "step": 18668 + }, + { + "epoch": 5.730202578268877, + "grad_norm": 0.20602397620677948, + "learning_rate": 4.066570013988558e-05, + "loss": 1.741, + "step": 18669 + }, + { + "epoch": 5.730509515039902, + "grad_norm": 0.24884814023971558, + "learning_rate": 4.066081701041621e-05, + "loss": 1.7222, + "step": 18670 + }, + { + "epoch": 5.730816451810927, + "grad_norm": 0.17906342446804047, + "learning_rate": 4.065593397324214e-05, + "loss": 1.6879, + "step": 18671 + }, + { + "epoch": 5.731123388581953, + "grad_norm": 0.20345427095890045, + "learning_rate": 4.0651051028411586e-05, + "loss": 1.7713, + "step": 18672 + }, + { + "epoch": 5.731430325352977, + "grad_norm": 0.21115002036094666, + "learning_rate": 4.0646168175972846e-05, + "loss": 1.7666, + "step": 18673 + }, + { + "epoch": 5.731737262124002, + "grad_norm": 0.22189734876155853, + "learning_rate": 4.064128541597413e-05, + "loss": 1.6989, + "step": 18674 + }, + { + "epoch": 5.732044198895028, + "grad_norm": 0.24036027491092682, + "learning_rate": 4.063640274846373e-05, + "loss": 1.707, + "step": 18675 + }, + { + "epoch": 5.732351135666053, + 
"grad_norm": 0.23091022670269012, + "learning_rate": 4.063152017348988e-05, + "loss": 1.7072, + "step": 18676 + }, + { + "epoch": 5.7326580724370775, + "grad_norm": 0.3142668306827545, + "learning_rate": 4.062663769110085e-05, + "loss": 1.7641, + "step": 18677 + }, + { + "epoch": 5.732965009208103, + "grad_norm": 0.2634848356246948, + "learning_rate": 4.0621755301344875e-05, + "loss": 1.7007, + "step": 18678 + }, + { + "epoch": 5.733271945979128, + "grad_norm": 0.21296904981136322, + "learning_rate": 4.061687300427022e-05, + "loss": 1.7201, + "step": 18679 + }, + { + "epoch": 5.7335788827501535, + "grad_norm": 0.24943144619464874, + "learning_rate": 4.0611990799925104e-05, + "loss": 1.7186, + "step": 18680 + }, + { + "epoch": 5.733885819521179, + "grad_norm": 0.2574152946472168, + "learning_rate": 4.060710868835781e-05, + "loss": 1.8671, + "step": 18681 + }, + { + "epoch": 5.734192756292204, + "grad_norm": 0.26023826003074646, + "learning_rate": 4.0602226669616564e-05, + "loss": 1.7618, + "step": 18682 + }, + { + "epoch": 5.734499693063229, + "grad_norm": 0.21078336238861084, + "learning_rate": 4.0597344743749645e-05, + "loss": 1.7548, + "step": 18683 + }, + { + "epoch": 5.734806629834254, + "grad_norm": 0.2195056676864624, + "learning_rate": 4.059246291080525e-05, + "loss": 1.6843, + "step": 18684 + }, + { + "epoch": 5.735113566605279, + "grad_norm": 0.20719893276691437, + "learning_rate": 4.058758117083168e-05, + "loss": 1.692, + "step": 18685 + }, + { + "epoch": 5.735420503376305, + "grad_norm": 0.23012077808380127, + "learning_rate": 4.058269952387713e-05, + "loss": 1.7072, + "step": 18686 + }, + { + "epoch": 5.73572744014733, + "grad_norm": 0.18598411977291107, + "learning_rate": 4.057781796998986e-05, + "loss": 1.6983, + "step": 18687 + }, + { + "epoch": 5.736034376918354, + "grad_norm": 0.20211926102638245, + "learning_rate": 4.057293650921813e-05, + "loss": 1.6818, + "step": 18688 + }, + { + "epoch": 5.73634131368938, + "grad_norm": 0.1957080215215683, + 
"learning_rate": 4.056805514161015e-05, + "loss": 1.7154, + "step": 18689 + }, + { + "epoch": 5.736648250460405, + "grad_norm": 0.23581798374652863, + "learning_rate": 4.0563173867214196e-05, + "loss": 1.7724, + "step": 18690 + }, + { + "epoch": 5.73695518723143, + "grad_norm": 0.22706671059131622, + "learning_rate": 4.055829268607847e-05, + "loss": 1.7387, + "step": 18691 + }, + { + "epoch": 5.737262124002456, + "grad_norm": 0.20050427317619324, + "learning_rate": 4.055341159825124e-05, + "loss": 1.7585, + "step": 18692 + }, + { + "epoch": 5.737569060773481, + "grad_norm": 0.18666231632232666, + "learning_rate": 4.054853060378072e-05, + "loss": 1.6996, + "step": 18693 + }, + { + "epoch": 5.7378759975445055, + "grad_norm": 0.23018911480903625, + "learning_rate": 4.0543649702715186e-05, + "loss": 1.7167, + "step": 18694 + }, + { + "epoch": 5.738182934315531, + "grad_norm": 0.21207039058208466, + "learning_rate": 4.053876889510282e-05, + "loss": 1.7539, + "step": 18695 + }, + { + "epoch": 5.738489871086556, + "grad_norm": 0.22042523324489594, + "learning_rate": 4.0533888180991915e-05, + "loss": 1.8145, + "step": 18696 + }, + { + "epoch": 5.7387968078575815, + "grad_norm": 0.20705139636993408, + "learning_rate": 4.0529007560430646e-05, + "loss": 1.7612, + "step": 18697 + }, + { + "epoch": 5.739103744628607, + "grad_norm": 0.20673857629299164, + "learning_rate": 4.052412703346729e-05, + "loss": 1.7338, + "step": 18698 + }, + { + "epoch": 5.739410681399631, + "grad_norm": 0.20742641389369965, + "learning_rate": 4.051924660015005e-05, + "loss": 1.7497, + "step": 18699 + }, + { + "epoch": 5.739717618170657, + "grad_norm": 0.22352617979049683, + "learning_rate": 4.05143662605272e-05, + "loss": 1.7568, + "step": 18700 + }, + { + "epoch": 5.740024554941682, + "grad_norm": 0.20306691527366638, + "learning_rate": 4.050948601464692e-05, + "loss": 1.7416, + "step": 18701 + }, + { + "epoch": 5.740331491712707, + "grad_norm": 0.22972522675991058, + "learning_rate": 
4.050460586255748e-05, + "loss": 1.7907, + "step": 18702 + }, + { + "epoch": 5.740638428483733, + "grad_norm": 0.2056068629026413, + "learning_rate": 4.0499725804307084e-05, + "loss": 1.7584, + "step": 18703 + }, + { + "epoch": 5.740945365254758, + "grad_norm": 0.2150508463382721, + "learning_rate": 4.049484583994395e-05, + "loss": 1.7695, + "step": 18704 + }, + { + "epoch": 5.741252302025782, + "grad_norm": 0.20274797081947327, + "learning_rate": 4.048996596951634e-05, + "loss": 1.7398, + "step": 18705 + }, + { + "epoch": 5.741559238796808, + "grad_norm": 0.20521290600299835, + "learning_rate": 4.0485086193072444e-05, + "loss": 1.7529, + "step": 18706 + }, + { + "epoch": 5.741866175567833, + "grad_norm": 0.22344307601451874, + "learning_rate": 4.0480206510660527e-05, + "loss": 1.6729, + "step": 18707 + }, + { + "epoch": 5.742173112338858, + "grad_norm": 0.20007841289043427, + "learning_rate": 4.047532692232876e-05, + "loss": 1.7004, + "step": 18708 + }, + { + "epoch": 5.742480049109884, + "grad_norm": 0.2455853819847107, + "learning_rate": 4.047044742812541e-05, + "loss": 1.7324, + "step": 18709 + }, + { + "epoch": 5.742786985880908, + "grad_norm": 0.29901546239852905, + "learning_rate": 4.046556802809867e-05, + "loss": 1.7138, + "step": 18710 + }, + { + "epoch": 5.7430939226519335, + "grad_norm": 0.19636842608451843, + "learning_rate": 4.04606887222968e-05, + "loss": 1.7098, + "step": 18711 + }, + { + "epoch": 5.743400859422959, + "grad_norm": 0.24916070699691772, + "learning_rate": 4.045580951076797e-05, + "loss": 1.7073, + "step": 18712 + }, + { + "epoch": 5.743707796193984, + "grad_norm": 0.2122841477394104, + "learning_rate": 4.0450930393560453e-05, + "loss": 1.7608, + "step": 18713 + }, + { + "epoch": 5.7440147329650095, + "grad_norm": 0.25119176506996155, + "learning_rate": 4.044605137072241e-05, + "loss": 1.7528, + "step": 18714 + }, + { + "epoch": 5.744321669736035, + "grad_norm": 0.2128097116947174, + "learning_rate": 4.0441172442302104e-05, + "loss": 
1.6834, + "step": 18715 + }, + { + "epoch": 5.744628606507059, + "grad_norm": 0.1771443784236908, + "learning_rate": 4.043629360834772e-05, + "loss": 1.6699, + "step": 18716 + }, + { + "epoch": 5.744935543278085, + "grad_norm": 0.2360549122095108, + "learning_rate": 4.043141486890751e-05, + "loss": 1.7704, + "step": 18717 + }, + { + "epoch": 5.74524248004911, + "grad_norm": 0.22453519701957703, + "learning_rate": 4.0426536224029645e-05, + "loss": 1.7305, + "step": 18718 + }, + { + "epoch": 5.745549416820135, + "grad_norm": 0.2170165628194809, + "learning_rate": 4.042165767376238e-05, + "loss": 1.7859, + "step": 18719 + }, + { + "epoch": 5.74585635359116, + "grad_norm": 0.233921617269516, + "learning_rate": 4.0416779218153896e-05, + "loss": 1.7622, + "step": 18720 + }, + { + "epoch": 5.746163290362185, + "grad_norm": 0.2698482871055603, + "learning_rate": 4.041190085725242e-05, + "loss": 1.7419, + "step": 18721 + }, + { + "epoch": 5.74647022713321, + "grad_norm": 0.28437280654907227, + "learning_rate": 4.0407022591106165e-05, + "loss": 1.7242, + "step": 18722 + }, + { + "epoch": 5.746777163904236, + "grad_norm": 0.2087356448173523, + "learning_rate": 4.040214441976332e-05, + "loss": 1.747, + "step": 18723 + }, + { + "epoch": 5.747084100675261, + "grad_norm": 0.2028181403875351, + "learning_rate": 4.039726634327213e-05, + "loss": 1.7843, + "step": 18724 + }, + { + "epoch": 5.747391037446286, + "grad_norm": 0.18513897061347961, + "learning_rate": 4.039238836168076e-05, + "loss": 1.692, + "step": 18725 + }, + { + "epoch": 5.747697974217311, + "grad_norm": 0.2308989316225052, + "learning_rate": 4.038751047503745e-05, + "loss": 1.6625, + "step": 18726 + }, + { + "epoch": 5.748004910988336, + "grad_norm": 0.23922030627727509, + "learning_rate": 4.0382632683390386e-05, + "loss": 1.7407, + "step": 18727 + }, + { + "epoch": 5.7483118477593615, + "grad_norm": 0.17225340008735657, + "learning_rate": 4.0377754986787806e-05, + "loss": 1.6888, + "step": 18728 + }, + { + "epoch": 
5.748618784530387, + "grad_norm": 0.1898551732301712, + "learning_rate": 4.037287738527786e-05, + "loss": 1.6931, + "step": 18729 + }, + { + "epoch": 5.748925721301412, + "grad_norm": 0.22900012135505676, + "learning_rate": 4.036799987890881e-05, + "loss": 1.751, + "step": 18730 + }, + { + "epoch": 5.749232658072437, + "grad_norm": 0.21106193959712982, + "learning_rate": 4.0363122467728815e-05, + "loss": 1.6919, + "step": 18731 + }, + { + "epoch": 5.749539594843462, + "grad_norm": 0.19944290816783905, + "learning_rate": 4.03582451517861e-05, + "loss": 1.7232, + "step": 18732 + }, + { + "epoch": 5.749846531614487, + "grad_norm": 0.1833256036043167, + "learning_rate": 4.035336793112885e-05, + "loss": 1.7199, + "step": 18733 + }, + { + "epoch": 5.750153468385513, + "grad_norm": 0.2596902847290039, + "learning_rate": 4.0348490805805287e-05, + "loss": 1.7386, + "step": 18734 + }, + { + "epoch": 5.750460405156538, + "grad_norm": 0.23708637058734894, + "learning_rate": 4.034361377586357e-05, + "loss": 1.7697, + "step": 18735 + }, + { + "epoch": 5.750767341927563, + "grad_norm": 0.20476554334163666, + "learning_rate": 4.033873684135195e-05, + "loss": 1.7804, + "step": 18736 + }, + { + "epoch": 5.751074278698588, + "grad_norm": 0.2625868320465088, + "learning_rate": 4.033386000231858e-05, + "loss": 1.7046, + "step": 18737 + }, + { + "epoch": 5.751381215469613, + "grad_norm": 0.23011820018291473, + "learning_rate": 4.032898325881166e-05, + "loss": 1.7758, + "step": 18738 + }, + { + "epoch": 5.7516881522406385, + "grad_norm": 0.23972748219966888, + "learning_rate": 4.032410661087943e-05, + "loss": 1.7165, + "step": 18739 + }, + { + "epoch": 5.751995089011664, + "grad_norm": 0.2241208404302597, + "learning_rate": 4.031923005857001e-05, + "loss": 1.713, + "step": 18740 + }, + { + "epoch": 5.752302025782689, + "grad_norm": 0.22316952049732208, + "learning_rate": 4.0314353601931665e-05, + "loss": 1.7655, + "step": 18741 + }, + { + "epoch": 5.752608962553714, + "grad_norm": 
0.2177707403898239, + "learning_rate": 4.030947724101253e-05, + "loss": 1.7517, + "step": 18742 + }, + { + "epoch": 5.752915899324739, + "grad_norm": 0.21731823682785034, + "learning_rate": 4.030460097586083e-05, + "loss": 1.718, + "step": 18743 + }, + { + "epoch": 5.753222836095764, + "grad_norm": 0.1700165718793869, + "learning_rate": 4.0299724806524744e-05, + "loss": 1.6536, + "step": 18744 + }, + { + "epoch": 5.75352977286679, + "grad_norm": 0.21920062601566315, + "learning_rate": 4.029484873305247e-05, + "loss": 1.7298, + "step": 18745 + }, + { + "epoch": 5.753836709637815, + "grad_norm": 0.22648905217647552, + "learning_rate": 4.028997275549218e-05, + "loss": 1.7878, + "step": 18746 + }, + { + "epoch": 5.75414364640884, + "grad_norm": 0.19443005323410034, + "learning_rate": 4.028509687389208e-05, + "loss": 1.7582, + "step": 18747 + }, + { + "epoch": 5.754450583179865, + "grad_norm": 0.21973860263824463, + "learning_rate": 4.028022108830034e-05, + "loss": 1.8215, + "step": 18748 + }, + { + "epoch": 5.75475751995089, + "grad_norm": 0.2215481847524643, + "learning_rate": 4.0275345398765155e-05, + "loss": 1.7092, + "step": 18749 + }, + { + "epoch": 5.755064456721915, + "grad_norm": 0.18789733946323395, + "learning_rate": 4.0270469805334696e-05, + "loss": 1.7089, + "step": 18750 + }, + { + "epoch": 5.755371393492941, + "grad_norm": 0.2423657774925232, + "learning_rate": 4.0265594308057175e-05, + "loss": 1.7412, + "step": 18751 + }, + { + "epoch": 5.755678330263965, + "grad_norm": 0.22020475566387177, + "learning_rate": 4.026071890698074e-05, + "loss": 1.7644, + "step": 18752 + }, + { + "epoch": 5.7559852670349905, + "grad_norm": 0.31772032380104065, + "learning_rate": 4.025584360215361e-05, + "loss": 1.7326, + "step": 18753 + }, + { + "epoch": 5.756292203806016, + "grad_norm": 0.23786257207393646, + "learning_rate": 4.025096839362393e-05, + "loss": 1.7652, + "step": 18754 + }, + { + "epoch": 5.756599140577041, + "grad_norm": 0.24288083612918854, + "learning_rate": 
4.024609328143989e-05, + "loss": 1.6797, + "step": 18755 + }, + { + "epoch": 5.7569060773480665, + "grad_norm": 0.30519670248031616, + "learning_rate": 4.024121826564969e-05, + "loss": 1.7442, + "step": 18756 + }, + { + "epoch": 5.757213014119092, + "grad_norm": 0.218281090259552, + "learning_rate": 4.023634334630147e-05, + "loss": 1.7498, + "step": 18757 + }, + { + "epoch": 5.757519950890116, + "grad_norm": 0.215846985578537, + "learning_rate": 4.023146852344345e-05, + "loss": 1.7728, + "step": 18758 + }, + { + "epoch": 5.757826887661142, + "grad_norm": 0.2883944511413574, + "learning_rate": 4.022659379712376e-05, + "loss": 1.8098, + "step": 18759 + }, + { + "epoch": 5.758133824432167, + "grad_norm": 0.25141629576683044, + "learning_rate": 4.022171916739062e-05, + "loss": 1.6574, + "step": 18760 + }, + { + "epoch": 5.758440761203192, + "grad_norm": 0.22118757665157318, + "learning_rate": 4.021684463429216e-05, + "loss": 1.7542, + "step": 18761 + }, + { + "epoch": 5.758747697974218, + "grad_norm": 0.2437646985054016, + "learning_rate": 4.02119701978766e-05, + "loss": 1.7182, + "step": 18762 + }, + { + "epoch": 5.759054634745242, + "grad_norm": 0.24247203767299652, + "learning_rate": 4.020709585819206e-05, + "loss": 1.7134, + "step": 18763 + }, + { + "epoch": 5.759361571516267, + "grad_norm": 0.208528533577919, + "learning_rate": 4.020222161528677e-05, + "loss": 1.6966, + "step": 18764 + }, + { + "epoch": 5.759668508287293, + "grad_norm": 0.19645826518535614, + "learning_rate": 4.0197347469208843e-05, + "loss": 1.7261, + "step": 18765 + }, + { + "epoch": 5.759975445058318, + "grad_norm": 0.20066291093826294, + "learning_rate": 4.019247342000648e-05, + "loss": 1.7197, + "step": 18766 + }, + { + "epoch": 5.760282381829343, + "grad_norm": 0.25344669818878174, + "learning_rate": 4.0187599467727845e-05, + "loss": 1.7957, + "step": 18767 + }, + { + "epoch": 5.760589318600369, + "grad_norm": 0.1917620301246643, + "learning_rate": 4.018272561242111e-05, + "loss": 1.6868, + 
"step": 18768 + }, + { + "epoch": 5.760896255371393, + "grad_norm": 0.21996566653251648, + "learning_rate": 4.0177851854134424e-05, + "loss": 1.7128, + "step": 18769 + }, + { + "epoch": 5.7612031921424185, + "grad_norm": 0.23226283490657806, + "learning_rate": 4.017297819291598e-05, + "loss": 1.7079, + "step": 18770 + }, + { + "epoch": 5.761510128913444, + "grad_norm": 0.30606213212013245, + "learning_rate": 4.016810462881391e-05, + "loss": 1.8087, + "step": 18771 + }, + { + "epoch": 5.761817065684469, + "grad_norm": 0.2171698361635208, + "learning_rate": 4.016323116187639e-05, + "loss": 1.7377, + "step": 18772 + }, + { + "epoch": 5.7621240024554945, + "grad_norm": 0.24234412610530853, + "learning_rate": 4.01583577921516e-05, + "loss": 1.734, + "step": 18773 + }, + { + "epoch": 5.762430939226519, + "grad_norm": 0.2648961544036865, + "learning_rate": 4.015348451968767e-05, + "loss": 1.7423, + "step": 18774 + }, + { + "epoch": 5.762737875997544, + "grad_norm": 0.18316571414470673, + "learning_rate": 4.01486113445328e-05, + "loss": 1.6708, + "step": 18775 + }, + { + "epoch": 5.76304481276857, + "grad_norm": 0.241583451628685, + "learning_rate": 4.0143738266735104e-05, + "loss": 1.708, + "step": 18776 + }, + { + "epoch": 5.763351749539595, + "grad_norm": 0.2268480360507965, + "learning_rate": 4.0138865286342775e-05, + "loss": 1.7106, + "step": 18777 + }, + { + "epoch": 5.76365868631062, + "grad_norm": 0.2038748860359192, + "learning_rate": 4.0133992403403944e-05, + "loss": 1.7349, + "step": 18778 + }, + { + "epoch": 5.763965623081646, + "grad_norm": 0.24422483146190643, + "learning_rate": 4.0129119617966805e-05, + "loss": 1.659, + "step": 18779 + }, + { + "epoch": 5.76427255985267, + "grad_norm": 0.19925715029239655, + "learning_rate": 4.0124246930079476e-05, + "loss": 1.6983, + "step": 18780 + }, + { + "epoch": 5.764579496623695, + "grad_norm": 0.29671359062194824, + "learning_rate": 4.0119374339790136e-05, + "loss": 1.7188, + "step": 18781 + }, + { + "epoch": 
5.764886433394721, + "grad_norm": 0.2752140760421753, + "learning_rate": 4.011450184714692e-05, + "loss": 1.738, + "step": 18782 + }, + { + "epoch": 5.765193370165746, + "grad_norm": 0.2112676352262497, + "learning_rate": 4.0109629452198e-05, + "loss": 1.7529, + "step": 18783 + }, + { + "epoch": 5.765500306936771, + "grad_norm": 0.2091330885887146, + "learning_rate": 4.010475715499151e-05, + "loss": 1.6771, + "step": 18784 + }, + { + "epoch": 5.765807243707796, + "grad_norm": 0.26556238532066345, + "learning_rate": 4.009988495557562e-05, + "loss": 1.7721, + "step": 18785 + }, + { + "epoch": 5.766114180478821, + "grad_norm": 0.20728638768196106, + "learning_rate": 4.009501285399846e-05, + "loss": 1.6893, + "step": 18786 + }, + { + "epoch": 5.7664211172498465, + "grad_norm": 0.213730126619339, + "learning_rate": 4.00901408503082e-05, + "loss": 1.704, + "step": 18787 + }, + { + "epoch": 5.766728054020872, + "grad_norm": 0.21422363817691803, + "learning_rate": 4.0085268944552975e-05, + "loss": 1.7571, + "step": 18788 + }, + { + "epoch": 5.767034990791897, + "grad_norm": 0.20936815440654755, + "learning_rate": 4.0080397136780915e-05, + "loss": 1.7423, + "step": 18789 + }, + { + "epoch": 5.7673419275629225, + "grad_norm": 0.26223674416542053, + "learning_rate": 4.007552542704021e-05, + "loss": 1.7687, + "step": 18790 + }, + { + "epoch": 5.767648864333947, + "grad_norm": 0.3524645268917084, + "learning_rate": 4.0070653815378954e-05, + "loss": 1.7754, + "step": 18791 + }, + { + "epoch": 5.767955801104972, + "grad_norm": 0.20238324999809265, + "learning_rate": 4.006578230184534e-05, + "loss": 1.7043, + "step": 18792 + }, + { + "epoch": 5.768262737875998, + "grad_norm": 0.2739984393119812, + "learning_rate": 4.006091088648747e-05, + "loss": 1.7596, + "step": 18793 + }, + { + "epoch": 5.768569674647023, + "grad_norm": 0.29209306836128235, + "learning_rate": 4.0056039569353515e-05, + "loss": 1.6857, + "step": 18794 + }, + { + "epoch": 5.768876611418047, + "grad_norm": 
0.21838447451591492, + "learning_rate": 4.005116835049161e-05, + "loss": 1.7531, + "step": 18795 + }, + { + "epoch": 5.769183548189073, + "grad_norm": 0.21940091252326965, + "learning_rate": 4.0046297229949884e-05, + "loss": 1.7363, + "step": 18796 + }, + { + "epoch": 5.769490484960098, + "grad_norm": 0.22679758071899414, + "learning_rate": 4.004142620777647e-05, + "loss": 1.7586, + "step": 18797 + }, + { + "epoch": 5.769797421731123, + "grad_norm": 0.23782022297382355, + "learning_rate": 4.003655528401954e-05, + "loss": 1.7154, + "step": 18798 + }, + { + "epoch": 5.770104358502149, + "grad_norm": 0.20452092587947845, + "learning_rate": 4.0031684458727194e-05, + "loss": 1.7078, + "step": 18799 + }, + { + "epoch": 5.770411295273174, + "grad_norm": 0.22733618319034576, + "learning_rate": 4.0026813731947594e-05, + "loss": 1.6989, + "step": 18800 + }, + { + "epoch": 5.7707182320441985, + "grad_norm": 0.2322154939174652, + "learning_rate": 4.002194310372886e-05, + "loss": 1.7508, + "step": 18801 + }, + { + "epoch": 5.771025168815224, + "grad_norm": 0.24573352932929993, + "learning_rate": 4.001707257411914e-05, + "loss": 1.7245, + "step": 18802 + }, + { + "epoch": 5.771332105586249, + "grad_norm": 0.19692079722881317, + "learning_rate": 4.001220214316655e-05, + "loss": 1.7116, + "step": 18803 + }, + { + "epoch": 5.7716390423572745, + "grad_norm": 0.20525199174880981, + "learning_rate": 4.000733181091925e-05, + "loss": 1.7503, + "step": 18804 + }, + { + "epoch": 5.7719459791283, + "grad_norm": 0.2097626030445099, + "learning_rate": 4.0002461577425344e-05, + "loss": 1.8204, + "step": 18805 + }, + { + "epoch": 5.772252915899324, + "grad_norm": 0.23059608042240143, + "learning_rate": 3.9997591442732975e-05, + "loss": 1.7747, + "step": 18806 + }, + { + "epoch": 5.77255985267035, + "grad_norm": 0.22085745632648468, + "learning_rate": 3.9992721406890265e-05, + "loss": 1.7579, + "step": 18807 + }, + { + "epoch": 5.772866789441375, + "grad_norm": 0.21529869735240936, + 
"learning_rate": 3.9987851469945334e-05, + "loss": 1.711, + "step": 18808 + }, + { + "epoch": 5.7731737262124, + "grad_norm": 0.20563572645187378, + "learning_rate": 3.998298163194636e-05, + "loss": 1.761, + "step": 18809 + }, + { + "epoch": 5.773480662983426, + "grad_norm": 0.2081122100353241, + "learning_rate": 3.9978111892941394e-05, + "loss": 1.7112, + "step": 18810 + }, + { + "epoch": 5.773787599754451, + "grad_norm": 0.2373751550912857, + "learning_rate": 3.9973242252978635e-05, + "loss": 1.7726, + "step": 18811 + }, + { + "epoch": 5.774094536525475, + "grad_norm": 0.2742944359779358, + "learning_rate": 3.996837271210615e-05, + "loss": 1.7743, + "step": 18812 + }, + { + "epoch": 5.774401473296501, + "grad_norm": 0.20724992454051971, + "learning_rate": 3.996350327037208e-05, + "loss": 1.7052, + "step": 18813 + }, + { + "epoch": 5.774708410067526, + "grad_norm": 0.22324968874454498, + "learning_rate": 3.995863392782456e-05, + "loss": 1.7865, + "step": 18814 + }, + { + "epoch": 5.7750153468385514, + "grad_norm": 0.22314245998859406, + "learning_rate": 3.995376468451172e-05, + "loss": 1.7705, + "step": 18815 + }, + { + "epoch": 5.775322283609577, + "grad_norm": 0.20793841779232025, + "learning_rate": 3.994889554048165e-05, + "loss": 1.739, + "step": 18816 + }, + { + "epoch": 5.775629220380601, + "grad_norm": 0.20117145776748657, + "learning_rate": 3.994402649578249e-05, + "loss": 1.7256, + "step": 18817 + }, + { + "epoch": 5.775936157151627, + "grad_norm": 0.24406170845031738, + "learning_rate": 3.993915755046235e-05, + "loss": 1.8015, + "step": 18818 + }, + { + "epoch": 5.776243093922652, + "grad_norm": 0.20912545919418335, + "learning_rate": 3.993428870456935e-05, + "loss": 1.7038, + "step": 18819 + }, + { + "epoch": 5.776550030693677, + "grad_norm": 0.2587272822856903, + "learning_rate": 3.992941995815162e-05, + "loss": 1.7918, + "step": 18820 + }, + { + "epoch": 5.776856967464703, + "grad_norm": 0.2996658980846405, + "learning_rate": 3.9924551311257266e-05, + 
"loss": 1.7513, + "step": 18821 + }, + { + "epoch": 5.777163904235728, + "grad_norm": 0.24603547155857086, + "learning_rate": 3.991968276393441e-05, + "loss": 1.7329, + "step": 18822 + }, + { + "epoch": 5.777470841006752, + "grad_norm": 0.2321038693189621, + "learning_rate": 3.991481431623113e-05, + "loss": 1.7406, + "step": 18823 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.3397100269794464, + "learning_rate": 3.990994596819558e-05, + "loss": 1.8129, + "step": 18824 + }, + { + "epoch": 5.778084714548803, + "grad_norm": 0.2807735800743103, + "learning_rate": 3.990507771987584e-05, + "loss": 1.7579, + "step": 18825 + }, + { + "epoch": 5.778391651319828, + "grad_norm": 0.1952899694442749, + "learning_rate": 3.990020957132007e-05, + "loss": 1.7153, + "step": 18826 + }, + { + "epoch": 5.778698588090853, + "grad_norm": 0.28998714685440063, + "learning_rate": 3.989534152257632e-05, + "loss": 1.7844, + "step": 18827 + }, + { + "epoch": 5.779005524861878, + "grad_norm": 0.20929136872291565, + "learning_rate": 3.989047357369275e-05, + "loss": 1.7499, + "step": 18828 + }, + { + "epoch": 5.7793124616329035, + "grad_norm": 0.31144043803215027, + "learning_rate": 3.9885605724717436e-05, + "loss": 1.7745, + "step": 18829 + }, + { + "epoch": 5.779619398403929, + "grad_norm": 0.22598792612552643, + "learning_rate": 3.988073797569849e-05, + "loss": 1.7226, + "step": 18830 + }, + { + "epoch": 5.779926335174954, + "grad_norm": 0.1971752643585205, + "learning_rate": 3.987587032668402e-05, + "loss": 1.7033, + "step": 18831 + }, + { + "epoch": 5.7802332719459795, + "grad_norm": 0.221087247133255, + "learning_rate": 3.9871002777722156e-05, + "loss": 1.7281, + "step": 18832 + }, + { + "epoch": 5.780540208717004, + "grad_norm": 0.21678583323955536, + "learning_rate": 3.986613532886095e-05, + "loss": 1.7207, + "step": 18833 + }, + { + "epoch": 5.780847145488029, + "grad_norm": 0.2511122226715088, + "learning_rate": 3.9861267980148566e-05, + "loss": 1.7091, + "step": 18834 + }, + 
{ + "epoch": 5.781154082259055, + "grad_norm": 0.2883855104446411, + "learning_rate": 3.985640073163304e-05, + "loss": 1.7963, + "step": 18835 + }, + { + "epoch": 5.78146101903008, + "grad_norm": 0.21786242723464966, + "learning_rate": 3.985153358336253e-05, + "loss": 1.6883, + "step": 18836 + }, + { + "epoch": 5.781767955801105, + "grad_norm": 0.18529155850410461, + "learning_rate": 3.98466665353851e-05, + "loss": 1.7194, + "step": 18837 + }, + { + "epoch": 5.78207489257213, + "grad_norm": 0.20535743236541748, + "learning_rate": 3.984179958774888e-05, + "loss": 1.6943, + "step": 18838 + }, + { + "epoch": 5.782381829343155, + "grad_norm": 0.19377392530441284, + "learning_rate": 3.983693274050195e-05, + "loss": 1.6732, + "step": 18839 + }, + { + "epoch": 5.78268876611418, + "grad_norm": 0.22373615205287933, + "learning_rate": 3.983206599369239e-05, + "loss": 1.7668, + "step": 18840 + }, + { + "epoch": 5.782995702885206, + "grad_norm": 0.2132388800382614, + "learning_rate": 3.982719934736832e-05, + "loss": 1.7155, + "step": 18841 + }, + { + "epoch": 5.783302639656231, + "grad_norm": 0.24871744215488434, + "learning_rate": 3.982233280157782e-05, + "loss": 1.7232, + "step": 18842 + }, + { + "epoch": 5.783609576427256, + "grad_norm": 0.1861848086118698, + "learning_rate": 3.981746635636902e-05, + "loss": 1.707, + "step": 18843 + }, + { + "epoch": 5.783916513198281, + "grad_norm": 0.21882779896259308, + "learning_rate": 3.981260001178995e-05, + "loss": 1.7165, + "step": 18844 + }, + { + "epoch": 5.784223449969306, + "grad_norm": 0.22144648432731628, + "learning_rate": 3.980773376788877e-05, + "loss": 1.7799, + "step": 18845 + }, + { + "epoch": 5.7845303867403315, + "grad_norm": 0.210894376039505, + "learning_rate": 3.980286762471351e-05, + "loss": 1.7539, + "step": 18846 + }, + { + "epoch": 5.784837323511357, + "grad_norm": 0.20435640215873718, + "learning_rate": 3.9798001582312305e-05, + "loss": 1.6736, + "step": 18847 + }, + { + "epoch": 5.785144260282382, + 
"grad_norm": 0.18998762965202332, + "learning_rate": 3.979313564073322e-05, + "loss": 1.7045, + "step": 18848 + }, + { + "epoch": 5.785451197053407, + "grad_norm": 0.19869361817836761, + "learning_rate": 3.978826980002437e-05, + "loss": 1.7444, + "step": 18849 + }, + { + "epoch": 5.785758133824432, + "grad_norm": 0.2175174504518509, + "learning_rate": 3.97834040602338e-05, + "loss": 1.7565, + "step": 18850 + }, + { + "epoch": 5.786065070595457, + "grad_norm": 0.22726793587207794, + "learning_rate": 3.977853842140964e-05, + "loss": 1.713, + "step": 18851 + }, + { + "epoch": 5.786372007366483, + "grad_norm": 0.26518720388412476, + "learning_rate": 3.9773672883599934e-05, + "loss": 1.6892, + "step": 18852 + }, + { + "epoch": 5.786678944137508, + "grad_norm": 0.20721858739852905, + "learning_rate": 3.97688074468528e-05, + "loss": 1.724, + "step": 18853 + }, + { + "epoch": 5.786985880908533, + "grad_norm": 0.22739483416080475, + "learning_rate": 3.976394211121629e-05, + "loss": 1.762, + "step": 18854 + }, + { + "epoch": 5.787292817679558, + "grad_norm": 0.21918894350528717, + "learning_rate": 3.975907687673853e-05, + "loss": 1.6812, + "step": 18855 + }, + { + "epoch": 5.787599754450583, + "grad_norm": 0.20931273698806763, + "learning_rate": 3.9754211743467574e-05, + "loss": 1.6874, + "step": 18856 + }, + { + "epoch": 5.787906691221608, + "grad_norm": 0.2015041708946228, + "learning_rate": 3.974934671145148e-05, + "loss": 1.7248, + "step": 18857 + }, + { + "epoch": 5.788213627992634, + "grad_norm": 0.21632663905620575, + "learning_rate": 3.974448178073836e-05, + "loss": 1.7313, + "step": 18858 + }, + { + "epoch": 5.788520564763659, + "grad_norm": 0.18995213508605957, + "learning_rate": 3.973961695137627e-05, + "loss": 1.6761, + "step": 18859 + }, + { + "epoch": 5.7888275015346835, + "grad_norm": 0.18678395450115204, + "learning_rate": 3.973475222341333e-05, + "loss": 1.7082, + "step": 18860 + }, + { + "epoch": 5.789134438305709, + "grad_norm": 0.1889343559741974, + 
"learning_rate": 3.972988759689756e-05, + "loss": 1.7296, + "step": 18861 + }, + { + "epoch": 5.789441375076734, + "grad_norm": 0.20196790993213654, + "learning_rate": 3.9725023071877074e-05, + "loss": 1.6876, + "step": 18862 + }, + { + "epoch": 5.7897483118477595, + "grad_norm": 0.198349729180336, + "learning_rate": 3.972015864839992e-05, + "loss": 1.6826, + "step": 18863 + }, + { + "epoch": 5.790055248618785, + "grad_norm": 0.21323837339878082, + "learning_rate": 3.9715294326514185e-05, + "loss": 1.7444, + "step": 18864 + }, + { + "epoch": 5.79036218538981, + "grad_norm": 0.18581731617450714, + "learning_rate": 3.9710430106267934e-05, + "loss": 1.7731, + "step": 18865 + }, + { + "epoch": 5.790669122160835, + "grad_norm": 0.21925146877765656, + "learning_rate": 3.970556598770927e-05, + "loss": 1.7505, + "step": 18866 + }, + { + "epoch": 5.79097605893186, + "grad_norm": 0.20773115754127502, + "learning_rate": 3.970070197088621e-05, + "loss": 1.7408, + "step": 18867 + }, + { + "epoch": 5.791282995702885, + "grad_norm": 0.1805189698934555, + "learning_rate": 3.9695838055846865e-05, + "loss": 1.6871, + "step": 18868 + }, + { + "epoch": 5.791589932473911, + "grad_norm": 0.24685314297676086, + "learning_rate": 3.969097424263928e-05, + "loss": 1.7186, + "step": 18869 + }, + { + "epoch": 5.791896869244935, + "grad_norm": 0.18801769614219666, + "learning_rate": 3.9686110531311526e-05, + "loss": 1.7196, + "step": 18870 + }, + { + "epoch": 5.79220380601596, + "grad_norm": 0.22717779874801636, + "learning_rate": 3.968124692191168e-05, + "loss": 1.7309, + "step": 18871 + }, + { + "epoch": 5.792510742786986, + "grad_norm": 0.23058642446994781, + "learning_rate": 3.9676383414487806e-05, + "loss": 1.6993, + "step": 18872 + }, + { + "epoch": 5.792817679558011, + "grad_norm": 0.24307532608509064, + "learning_rate": 3.967152000908796e-05, + "loss": 1.6986, + "step": 18873 + }, + { + "epoch": 5.793124616329036, + "grad_norm": 0.3032459318637848, + "learning_rate": 
3.9666656705760195e-05, + "loss": 1.677, + "step": 18874 + }, + { + "epoch": 5.793431553100062, + "grad_norm": 0.22669538855552673, + "learning_rate": 3.966179350455259e-05, + "loss": 1.7361, + "step": 18875 + }, + { + "epoch": 5.793738489871086, + "grad_norm": 0.27729150652885437, + "learning_rate": 3.96569304055132e-05, + "loss": 1.746, + "step": 18876 + }, + { + "epoch": 5.7940454266421115, + "grad_norm": 0.3422098755836487, + "learning_rate": 3.96520674086901e-05, + "loss": 1.783, + "step": 18877 + }, + { + "epoch": 5.794352363413137, + "grad_norm": 0.2114052176475525, + "learning_rate": 3.964720451413131e-05, + "loss": 1.7127, + "step": 18878 + }, + { + "epoch": 5.794659300184162, + "grad_norm": 0.22928549349308014, + "learning_rate": 3.964234172188494e-05, + "loss": 1.6579, + "step": 18879 + }, + { + "epoch": 5.7949662369551875, + "grad_norm": 0.24813635647296906, + "learning_rate": 3.9637479031999e-05, + "loss": 1.728, + "step": 18880 + }, + { + "epoch": 5.795273173726212, + "grad_norm": 0.19779744744300842, + "learning_rate": 3.963261644452158e-05, + "loss": 1.7338, + "step": 18881 + }, + { + "epoch": 5.795580110497237, + "grad_norm": 0.2424263060092926, + "learning_rate": 3.96277539595007e-05, + "loss": 1.7762, + "step": 18882 + }, + { + "epoch": 5.795887047268263, + "grad_norm": 0.24621224403381348, + "learning_rate": 3.9622891576984456e-05, + "loss": 1.7746, + "step": 18883 + }, + { + "epoch": 5.796193984039288, + "grad_norm": 0.1973372846841812, + "learning_rate": 3.961802929702086e-05, + "loss": 1.7243, + "step": 18884 + }, + { + "epoch": 5.796500920810313, + "grad_norm": 0.22170570492744446, + "learning_rate": 3.961316711965801e-05, + "loss": 1.764, + "step": 18885 + }, + { + "epoch": 5.796807857581339, + "grad_norm": 0.22319282591342926, + "learning_rate": 3.9608305044943906e-05, + "loss": 1.6795, + "step": 18886 + }, + { + "epoch": 5.797114794352363, + "grad_norm": 0.20000022649765015, + "learning_rate": 3.9603443072926635e-05, + "loss": 1.7587, + 
"step": 18887 + }, + { + "epoch": 5.797421731123388, + "grad_norm": 0.25041815638542175, + "learning_rate": 3.959858120365424e-05, + "loss": 1.7631, + "step": 18888 + }, + { + "epoch": 5.797728667894414, + "grad_norm": 0.23383729159832, + "learning_rate": 3.959371943717474e-05, + "loss": 1.741, + "step": 18889 + }, + { + "epoch": 5.798035604665439, + "grad_norm": 0.18609663844108582, + "learning_rate": 3.958885777353623e-05, + "loss": 1.6981, + "step": 18890 + }, + { + "epoch": 5.798342541436464, + "grad_norm": 0.29523593187332153, + "learning_rate": 3.9583996212786706e-05, + "loss": 1.8018, + "step": 18891 + }, + { + "epoch": 5.798649478207489, + "grad_norm": 0.20356589555740356, + "learning_rate": 3.9579134754974244e-05, + "loss": 1.7157, + "step": 18892 + }, + { + "epoch": 5.798956414978514, + "grad_norm": 0.2901862561702728, + "learning_rate": 3.957427340014688e-05, + "loss": 1.7249, + "step": 18893 + }, + { + "epoch": 5.7992633517495396, + "grad_norm": 0.24768278002738953, + "learning_rate": 3.956941214835267e-05, + "loss": 1.6894, + "step": 18894 + }, + { + "epoch": 5.799570288520565, + "grad_norm": 0.2417999804019928, + "learning_rate": 3.956455099963962e-05, + "loss": 1.7203, + "step": 18895 + }, + { + "epoch": 5.79987722529159, + "grad_norm": 0.2889639437198639, + "learning_rate": 3.9559689954055814e-05, + "loss": 1.7531, + "step": 18896 + }, + { + "epoch": 5.800184162062616, + "grad_norm": 0.21204611659049988, + "learning_rate": 3.955482901164926e-05, + "loss": 1.7521, + "step": 18897 + }, + { + "epoch": 5.80049109883364, + "grad_norm": 0.2961438298225403, + "learning_rate": 3.954996817246801e-05, + "loss": 1.8102, + "step": 18898 + }, + { + "epoch": 5.800798035604665, + "grad_norm": 0.36562761664390564, + "learning_rate": 3.9545107436560084e-05, + "loss": 1.6722, + "step": 18899 + }, + { + "epoch": 5.801104972375691, + "grad_norm": 0.22423696517944336, + "learning_rate": 3.954024680397357e-05, + "loss": 1.7101, + "step": 18900 + }, + { + "epoch": 
5.801411909146716, + "grad_norm": 0.3122335970401764, + "learning_rate": 3.953538627475644e-05, + "loss": 1.7314, + "step": 18901 + }, + { + "epoch": 5.8017188459177405, + "grad_norm": 0.39004257321357727, + "learning_rate": 3.953052584895677e-05, + "loss": 1.762, + "step": 18902 + }, + { + "epoch": 5.802025782688766, + "grad_norm": 0.1827487200498581, + "learning_rate": 3.952566552662256e-05, + "loss": 1.6935, + "step": 18903 + }, + { + "epoch": 5.802332719459791, + "grad_norm": 0.3025164306163788, + "learning_rate": 3.952080530780188e-05, + "loss": 1.7448, + "step": 18904 + }, + { + "epoch": 5.8026396562308165, + "grad_norm": 0.2313300520181656, + "learning_rate": 3.9515945192542754e-05, + "loss": 1.7686, + "step": 18905 + }, + { + "epoch": 5.802946593001842, + "grad_norm": 0.3501042425632477, + "learning_rate": 3.9511085180893184e-05, + "loss": 1.775, + "step": 18906 + }, + { + "epoch": 5.803253529772867, + "grad_norm": 0.4111124873161316, + "learning_rate": 3.950622527290123e-05, + "loss": 1.7561, + "step": 18907 + }, + { + "epoch": 5.803560466543892, + "grad_norm": 0.20877736806869507, + "learning_rate": 3.950136546861489e-05, + "loss": 1.7356, + "step": 18908 + }, + { + "epoch": 5.803867403314917, + "grad_norm": 0.33404025435447693, + "learning_rate": 3.949650576808222e-05, + "loss": 1.7289, + "step": 18909 + }, + { + "epoch": 5.804174340085942, + "grad_norm": 0.2183927446603775, + "learning_rate": 3.9491646171351234e-05, + "loss": 1.7136, + "step": 18910 + }, + { + "epoch": 5.804481276856968, + "grad_norm": 0.27149543166160583, + "learning_rate": 3.948678667846997e-05, + "loss": 1.7516, + "step": 18911 + }, + { + "epoch": 5.804788213627993, + "grad_norm": 0.2369886338710785, + "learning_rate": 3.948192728948643e-05, + "loss": 1.6767, + "step": 18912 + }, + { + "epoch": 5.805095150399017, + "grad_norm": 0.20671069622039795, + "learning_rate": 3.947706800444867e-05, + "loss": 1.7831, + "step": 18913 + }, + { + "epoch": 5.805402087170043, + "grad_norm": 
0.23622260987758636, + "learning_rate": 3.9472208823404665e-05, + "loss": 1.7121, + "step": 18914 + }, + { + "epoch": 5.805709023941068, + "grad_norm": 0.21099595725536346, + "learning_rate": 3.946734974640247e-05, + "loss": 1.7137, + "step": 18915 + }, + { + "epoch": 5.806015960712093, + "grad_norm": 0.2205580472946167, + "learning_rate": 3.9462490773490094e-05, + "loss": 1.713, + "step": 18916 + }, + { + "epoch": 5.806322897483119, + "grad_norm": 0.20183326303958893, + "learning_rate": 3.9457631904715584e-05, + "loss": 1.7316, + "step": 18917 + }, + { + "epoch": 5.806629834254144, + "grad_norm": 0.27381497621536255, + "learning_rate": 3.9452773140126906e-05, + "loss": 1.7577, + "step": 18918 + }, + { + "epoch": 5.8069367710251685, + "grad_norm": 0.29962384700775146, + "learning_rate": 3.944791447977214e-05, + "loss": 1.7579, + "step": 18919 + }, + { + "epoch": 5.807243707796194, + "grad_norm": 0.22385326027870178, + "learning_rate": 3.944305592369923e-05, + "loss": 1.7795, + "step": 18920 + }, + { + "epoch": 5.807550644567219, + "grad_norm": 0.2954902648925781, + "learning_rate": 3.943819747195625e-05, + "loss": 1.6655, + "step": 18921 + }, + { + "epoch": 5.8078575813382445, + "grad_norm": 0.18947024643421173, + "learning_rate": 3.94333391245912e-05, + "loss": 1.6803, + "step": 18922 + }, + { + "epoch": 5.80816451810927, + "grad_norm": 0.26797959208488464, + "learning_rate": 3.942848088165206e-05, + "loss": 1.7671, + "step": 18923 + }, + { + "epoch": 5.808471454880294, + "grad_norm": 0.23453201353549957, + "learning_rate": 3.94236227431869e-05, + "loss": 1.7472, + "step": 18924 + }, + { + "epoch": 5.80877839165132, + "grad_norm": 0.24471673369407654, + "learning_rate": 3.941876470924367e-05, + "loss": 1.7482, + "step": 18925 + }, + { + "epoch": 5.809085328422345, + "grad_norm": 0.22249098122119904, + "learning_rate": 3.9413906779870426e-05, + "loss": 1.6794, + "step": 18926 + }, + { + "epoch": 5.80939226519337, + "grad_norm": 0.1985001564025879, + 
"learning_rate": 3.9409048955115144e-05, + "loss": 1.7278, + "step": 18927 + }, + { + "epoch": 5.809699201964396, + "grad_norm": 0.22482000291347504, + "learning_rate": 3.940419123502587e-05, + "loss": 1.7658, + "step": 18928 + }, + { + "epoch": 5.810006138735421, + "grad_norm": 0.18513578176498413, + "learning_rate": 3.939933361965057e-05, + "loss": 1.7154, + "step": 18929 + }, + { + "epoch": 5.810313075506445, + "grad_norm": 0.1984710991382599, + "learning_rate": 3.939447610903729e-05, + "loss": 1.7324, + "step": 18930 + }, + { + "epoch": 5.810620012277471, + "grad_norm": 0.26089081168174744, + "learning_rate": 3.938961870323399e-05, + "loss": 1.774, + "step": 18931 + }, + { + "epoch": 5.810926949048496, + "grad_norm": 0.2059585452079773, + "learning_rate": 3.9384761402288706e-05, + "loss": 1.7059, + "step": 18932 + }, + { + "epoch": 5.811233885819521, + "grad_norm": 0.1887979656457901, + "learning_rate": 3.937990420624942e-05, + "loss": 1.6829, + "step": 18933 + }, + { + "epoch": 5.811540822590547, + "grad_norm": 0.2589145600795746, + "learning_rate": 3.937504711516417e-05, + "loss": 1.7301, + "step": 18934 + }, + { + "epoch": 5.811847759361571, + "grad_norm": 0.209516704082489, + "learning_rate": 3.9370190129080907e-05, + "loss": 1.7716, + "step": 18935 + }, + { + "epoch": 5.8121546961325965, + "grad_norm": 0.3321632146835327, + "learning_rate": 3.936533324804768e-05, + "loss": 1.7754, + "step": 18936 + }, + { + "epoch": 5.812461632903622, + "grad_norm": 0.236944317817688, + "learning_rate": 3.9360476472112446e-05, + "loss": 1.7546, + "step": 18937 + }, + { + "epoch": 5.812768569674647, + "grad_norm": 0.29667431116104126, + "learning_rate": 3.9355619801323226e-05, + "loss": 1.7712, + "step": 18938 + }, + { + "epoch": 5.8130755064456725, + "grad_norm": 0.3071129620075226, + "learning_rate": 3.935076323572802e-05, + "loss": 1.7351, + "step": 18939 + }, + { + "epoch": 5.813382443216698, + "grad_norm": 0.22747032344341278, + "learning_rate": 3.934590677537479e-05, 
+ "loss": 1.7788, + "step": 18940 + }, + { + "epoch": 5.813689379987722, + "grad_norm": 0.2575854957103729, + "learning_rate": 3.934105042031158e-05, + "loss": 1.705, + "step": 18941 + }, + { + "epoch": 5.813996316758748, + "grad_norm": 0.2561504542827606, + "learning_rate": 3.9336194170586325e-05, + "loss": 1.7309, + "step": 18942 + }, + { + "epoch": 5.814303253529773, + "grad_norm": 0.21570482850074768, + "learning_rate": 3.933133802624707e-05, + "loss": 1.7408, + "step": 18943 + }, + { + "epoch": 5.814610190300798, + "grad_norm": 0.29227179288864136, + "learning_rate": 3.932648198734177e-05, + "loss": 1.7415, + "step": 18944 + }, + { + "epoch": 5.814917127071823, + "grad_norm": 0.17847758531570435, + "learning_rate": 3.9321626053918456e-05, + "loss": 1.7926, + "step": 18945 + }, + { + "epoch": 5.815224063842848, + "grad_norm": 0.24604015052318573, + "learning_rate": 3.931677022602507e-05, + "loss": 1.7519, + "step": 18946 + }, + { + "epoch": 5.815531000613873, + "grad_norm": 0.23843185603618622, + "learning_rate": 3.931191450370965e-05, + "loss": 1.7206, + "step": 18947 + }, + { + "epoch": 5.815837937384899, + "grad_norm": 0.23431400954723358, + "learning_rate": 3.9307058887020126e-05, + "loss": 1.7743, + "step": 18948 + }, + { + "epoch": 5.816144874155924, + "grad_norm": 0.23685097694396973, + "learning_rate": 3.9302203376004525e-05, + "loss": 1.7485, + "step": 18949 + }, + { + "epoch": 5.816451810926949, + "grad_norm": 0.2129819542169571, + "learning_rate": 3.929734797071082e-05, + "loss": 1.6897, + "step": 18950 + }, + { + "epoch": 5.816758747697974, + "grad_norm": 0.24736030399799347, + "learning_rate": 3.9292492671187e-05, + "loss": 1.7292, + "step": 18951 + }, + { + "epoch": 5.817065684468999, + "grad_norm": 0.28659793734550476, + "learning_rate": 3.9287637477481025e-05, + "loss": 1.6772, + "step": 18952 + }, + { + "epoch": 5.8173726212400245, + "grad_norm": 0.22304075956344604, + "learning_rate": 3.928278238964092e-05, + "loss": 1.7991, + "step": 18953 + 
}, + { + "epoch": 5.81767955801105, + "grad_norm": 0.25354304909706116, + "learning_rate": 3.927792740771462e-05, + "loss": 1.7407, + "step": 18954 + }, + { + "epoch": 5.817986494782075, + "grad_norm": 0.3014552593231201, + "learning_rate": 3.927307253175014e-05, + "loss": 1.7714, + "step": 18955 + }, + { + "epoch": 5.8182934315531, + "grad_norm": 0.20537856221199036, + "learning_rate": 3.926821776179545e-05, + "loss": 1.6992, + "step": 18956 + }, + { + "epoch": 5.818600368324125, + "grad_norm": 0.29656440019607544, + "learning_rate": 3.92633630978985e-05, + "loss": 1.7476, + "step": 18957 + }, + { + "epoch": 5.81890730509515, + "grad_norm": 0.20956869423389435, + "learning_rate": 3.925850854010732e-05, + "loss": 1.808, + "step": 18958 + }, + { + "epoch": 5.819214241866176, + "grad_norm": 0.29395633935928345, + "learning_rate": 3.925365408846983e-05, + "loss": 1.7787, + "step": 18959 + }, + { + "epoch": 5.819521178637201, + "grad_norm": 0.31101030111312866, + "learning_rate": 3.9248799743034025e-05, + "loss": 1.7685, + "step": 18960 + }, + { + "epoch": 5.819828115408226, + "grad_norm": 0.2109794020652771, + "learning_rate": 3.9243945503847894e-05, + "loss": 1.7307, + "step": 18961 + }, + { + "epoch": 5.820135052179251, + "grad_norm": 0.2503393292427063, + "learning_rate": 3.9239091370959405e-05, + "loss": 1.763, + "step": 18962 + }, + { + "epoch": 5.820441988950276, + "grad_norm": 0.21757015585899353, + "learning_rate": 3.92342373444165e-05, + "loss": 1.7862, + "step": 18963 + }, + { + "epoch": 5.820748925721301, + "grad_norm": 0.22108088433742523, + "learning_rate": 3.9229383424267197e-05, + "loss": 1.6845, + "step": 18964 + }, + { + "epoch": 5.821055862492327, + "grad_norm": 0.20059655606746674, + "learning_rate": 3.922452961055941e-05, + "loss": 1.7523, + "step": 18965 + }, + { + "epoch": 5.821362799263352, + "grad_norm": 0.22009585797786713, + "learning_rate": 3.921967590334117e-05, + "loss": 1.7802, + "step": 18966 + }, + { + "epoch": 5.8216697360343765, + 
"grad_norm": 0.22554142773151398, + "learning_rate": 3.9214822302660386e-05, + "loss": 1.7911, + "step": 18967 + }, + { + "epoch": 5.821976672805402, + "grad_norm": 0.23434770107269287, + "learning_rate": 3.920996880856506e-05, + "loss": 1.6755, + "step": 18968 + }, + { + "epoch": 5.822283609576427, + "grad_norm": 0.2162926346063614, + "learning_rate": 3.920511542110314e-05, + "loss": 1.7145, + "step": 18969 + }, + { + "epoch": 5.8225905463474525, + "grad_norm": 0.18654806911945343, + "learning_rate": 3.9200262140322616e-05, + "loss": 1.7076, + "step": 18970 + }, + { + "epoch": 5.822897483118478, + "grad_norm": 0.22357499599456787, + "learning_rate": 3.9195408966271404e-05, + "loss": 1.791, + "step": 18971 + }, + { + "epoch": 5.823204419889503, + "grad_norm": 0.21073313057422638, + "learning_rate": 3.919055589899752e-05, + "loss": 1.7976, + "step": 18972 + }, + { + "epoch": 5.823511356660528, + "grad_norm": 0.21481956541538239, + "learning_rate": 3.9185702938548886e-05, + "loss": 1.7468, + "step": 18973 + }, + { + "epoch": 5.823818293431553, + "grad_norm": 0.22051872313022614, + "learning_rate": 3.9180850084973464e-05, + "loss": 1.7201, + "step": 18974 + }, + { + "epoch": 5.824125230202578, + "grad_norm": 0.24410493671894073, + "learning_rate": 3.917599733831924e-05, + "loss": 1.7774, + "step": 18975 + }, + { + "epoch": 5.824432166973604, + "grad_norm": 0.19711458683013916, + "learning_rate": 3.917114469863414e-05, + "loss": 1.7907, + "step": 18976 + }, + { + "epoch": 5.824739103744628, + "grad_norm": 0.2045203000307083, + "learning_rate": 3.9166292165966155e-05, + "loss": 1.7105, + "step": 18977 + }, + { + "epoch": 5.8250460405156534, + "grad_norm": 0.21570880711078644, + "learning_rate": 3.9161439740363196e-05, + "loss": 1.7312, + "step": 18978 + }, + { + "epoch": 5.825352977286679, + "grad_norm": 0.21203923225402832, + "learning_rate": 3.915658742187325e-05, + "loss": 1.7869, + "step": 18979 + }, + { + "epoch": 5.825659914057704, + "grad_norm": 
0.26233312487602234, + "learning_rate": 3.915173521054426e-05, + "loss": 1.7453, + "step": 18980 + }, + { + "epoch": 5.8259668508287294, + "grad_norm": 0.23792949318885803, + "learning_rate": 3.91468831064242e-05, + "loss": 1.6886, + "step": 18981 + }, + { + "epoch": 5.826273787599755, + "grad_norm": 0.20325250923633575, + "learning_rate": 3.914203110956098e-05, + "loss": 1.7538, + "step": 18982 + }, + { + "epoch": 5.82658072437078, + "grad_norm": 0.28146329522132874, + "learning_rate": 3.9137179220002596e-05, + "loss": 1.7674, + "step": 18983 + }, + { + "epoch": 5.826887661141805, + "grad_norm": 0.2319503277540207, + "learning_rate": 3.9132327437796946e-05, + "loss": 1.7864, + "step": 18984 + }, + { + "epoch": 5.82719459791283, + "grad_norm": 0.22653794288635254, + "learning_rate": 3.9127475762992025e-05, + "loss": 1.7424, + "step": 18985 + }, + { + "epoch": 5.827501534683855, + "grad_norm": 0.26855236291885376, + "learning_rate": 3.912262419563574e-05, + "loss": 1.762, + "step": 18986 + }, + { + "epoch": 5.827808471454881, + "grad_norm": 0.18356221914291382, + "learning_rate": 3.9117772735776095e-05, + "loss": 1.7199, + "step": 18987 + }, + { + "epoch": 5.828115408225905, + "grad_norm": 0.2802455425262451, + "learning_rate": 3.911292138346096e-05, + "loss": 1.7142, + "step": 18988 + }, + { + "epoch": 5.82842234499693, + "grad_norm": 0.2638777494430542, + "learning_rate": 3.910807013873835e-05, + "loss": 1.6759, + "step": 18989 + }, + { + "epoch": 5.828729281767956, + "grad_norm": 0.18397162854671478, + "learning_rate": 3.910321900165615e-05, + "loss": 1.693, + "step": 18990 + }, + { + "epoch": 5.829036218538981, + "grad_norm": 0.20967607200145721, + "learning_rate": 3.909836797226233e-05, + "loss": 1.6908, + "step": 18991 + }, + { + "epoch": 5.829343155310006, + "grad_norm": 0.21123014390468597, + "learning_rate": 3.909351705060485e-05, + "loss": 1.7875, + "step": 18992 + }, + { + "epoch": 5.829650092081032, + "grad_norm": 0.1988777220249176, + "learning_rate": 
3.90886662367316e-05, + "loss": 1.7254, + "step": 18993 + }, + { + "epoch": 5.829957028852056, + "grad_norm": 0.17793473601341248, + "learning_rate": 3.9083815530690564e-05, + "loss": 1.7233, + "step": 18994 + }, + { + "epoch": 5.8302639656230815, + "grad_norm": 0.2289644330739975, + "learning_rate": 3.9078964932529645e-05, + "loss": 1.7739, + "step": 18995 + }, + { + "epoch": 5.830570902394107, + "grad_norm": 0.18145552277565002, + "learning_rate": 3.9074114442296804e-05, + "loss": 1.6989, + "step": 18996 + }, + { + "epoch": 5.830877839165132, + "grad_norm": 0.1941588670015335, + "learning_rate": 3.9069264060039956e-05, + "loss": 1.6981, + "step": 18997 + }, + { + "epoch": 5.8311847759361575, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.9064413785807075e-05, + "loss": 1.7163, + "step": 18998 + }, + { + "epoch": 5.831491712707182, + "grad_norm": 0.19494447112083435, + "learning_rate": 3.905956361964604e-05, + "loss": 1.7481, + "step": 18999 + }, + { + "epoch": 5.831798649478207, + "grad_norm": 0.2127624899148941, + "learning_rate": 3.9054713561604826e-05, + "loss": 1.7494, + "step": 19000 + }, + { + "epoch": 5.832105586249233, + "grad_norm": 0.20107653737068176, + "learning_rate": 3.9049863611731334e-05, + "loss": 1.7483, + "step": 19001 + }, + { + "epoch": 5.832412523020258, + "grad_norm": 0.22574639320373535, + "learning_rate": 3.904501377007352e-05, + "loss": 1.8184, + "step": 19002 + }, + { + "epoch": 5.832719459791283, + "grad_norm": 0.20027579367160797, + "learning_rate": 3.9040164036679285e-05, + "loss": 1.6995, + "step": 19003 + }, + { + "epoch": 5.833026396562309, + "grad_norm": 0.21599887311458588, + "learning_rate": 3.90353144115966e-05, + "loss": 1.7487, + "step": 19004 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.21122781932353973, + "learning_rate": 3.9030464894873334e-05, + "loss": 1.7332, + "step": 19005 + }, + { + "epoch": 5.833640270104358, + "grad_norm": 0.19006453454494476, + "learning_rate": 3.902561548655747e-05, + 
"loss": 1.688, + "step": 19006 + }, + { + "epoch": 5.833947206875384, + "grad_norm": 0.22979344427585602, + "learning_rate": 3.9020766186696895e-05, + "loss": 1.7495, + "step": 19007 + }, + { + "epoch": 5.834254143646409, + "grad_norm": 0.18405365943908691, + "learning_rate": 3.901591699533953e-05, + "loss": 1.7395, + "step": 19008 + }, + { + "epoch": 5.834561080417434, + "grad_norm": 0.26198676228523254, + "learning_rate": 3.901106791253334e-05, + "loss": 1.8286, + "step": 19009 + }, + { + "epoch": 5.834868017188459, + "grad_norm": 0.2535797357559204, + "learning_rate": 3.900621893832619e-05, + "loss": 1.757, + "step": 19010 + }, + { + "epoch": 5.835174953959484, + "grad_norm": 0.24599581956863403, + "learning_rate": 3.900137007276605e-05, + "loss": 1.7266, + "step": 19011 + }, + { + "epoch": 5.8354818907305095, + "grad_norm": 0.25688427686691284, + "learning_rate": 3.8996521315900805e-05, + "loss": 1.7255, + "step": 19012 + }, + { + "epoch": 5.835788827501535, + "grad_norm": 0.24668128788471222, + "learning_rate": 3.8991672667778385e-05, + "loss": 1.737, + "step": 19013 + }, + { + "epoch": 5.83609576427256, + "grad_norm": 0.28365740180015564, + "learning_rate": 3.8986824128446695e-05, + "loss": 1.7129, + "step": 19014 + }, + { + "epoch": 5.8364027010435855, + "grad_norm": 0.2543952465057373, + "learning_rate": 3.89819756979537e-05, + "loss": 1.7249, + "step": 19015 + }, + { + "epoch": 5.83670963781461, + "grad_norm": 0.2868666350841522, + "learning_rate": 3.8977127376347245e-05, + "loss": 1.6985, + "step": 19016 + }, + { + "epoch": 5.837016574585635, + "grad_norm": 0.3818367123603821, + "learning_rate": 3.897227916367531e-05, + "loss": 1.6954, + "step": 19017 + }, + { + "epoch": 5.837323511356661, + "grad_norm": 0.20922113955020905, + "learning_rate": 3.896743105998574e-05, + "loss": 1.7571, + "step": 19018 + }, + { + "epoch": 5.837630448127686, + "grad_norm": 0.3669843375682831, + "learning_rate": 3.89625830653265e-05, + "loss": 1.8041, + "step": 19019 + }, + { 
+ "epoch": 5.83793738489871, + "grad_norm": 0.2889872193336487, + "learning_rate": 3.895773517974548e-05, + "loss": 1.7775, + "step": 19020 + }, + { + "epoch": 5.838244321669736, + "grad_norm": 0.22619491815567017, + "learning_rate": 3.89528874032906e-05, + "loss": 1.7019, + "step": 19021 + }, + { + "epoch": 5.838551258440761, + "grad_norm": 0.4169046878814697, + "learning_rate": 3.894803973600976e-05, + "loss": 1.8282, + "step": 19022 + }, + { + "epoch": 5.838858195211786, + "grad_norm": 0.2567043900489807, + "learning_rate": 3.894319217795087e-05, + "loss": 1.733, + "step": 19023 + }, + { + "epoch": 5.839165131982812, + "grad_norm": 0.2435060739517212, + "learning_rate": 3.8938344729161834e-05, + "loss": 1.7208, + "step": 19024 + }, + { + "epoch": 5.839472068753837, + "grad_norm": 0.2941838204860687, + "learning_rate": 3.893349738969055e-05, + "loss": 1.7202, + "step": 19025 + }, + { + "epoch": 5.8397790055248615, + "grad_norm": 0.23542317748069763, + "learning_rate": 3.892865015958495e-05, + "loss": 1.7571, + "step": 19026 + }, + { + "epoch": 5.840085942295887, + "grad_norm": 0.3248259723186493, + "learning_rate": 3.8923803038892897e-05, + "loss": 1.7118, + "step": 19027 + }, + { + "epoch": 5.840392879066912, + "grad_norm": 0.24359026551246643, + "learning_rate": 3.891895602766234e-05, + "loss": 1.8126, + "step": 19028 + }, + { + "epoch": 5.8406998158379375, + "grad_norm": 0.3053695559501648, + "learning_rate": 3.8914109125941126e-05, + "loss": 1.6632, + "step": 19029 + }, + { + "epoch": 5.841006752608963, + "grad_norm": 0.3194943368434906, + "learning_rate": 3.8909262333777195e-05, + "loss": 1.8432, + "step": 19030 + }, + { + "epoch": 5.841313689379987, + "grad_norm": 0.23532693088054657, + "learning_rate": 3.8904415651218426e-05, + "loss": 1.716, + "step": 19031 + }, + { + "epoch": 5.841620626151013, + "grad_norm": 0.2941347062587738, + "learning_rate": 3.889956907831275e-05, + "loss": 1.7737, + "step": 19032 + }, + { + "epoch": 5.841927562922038, + 
"grad_norm": 0.2265428602695465, + "learning_rate": 3.889472261510801e-05, + "loss": 1.7111, + "step": 19033 + }, + { + "epoch": 5.842234499693063, + "grad_norm": 0.3023710548877716, + "learning_rate": 3.888987626165216e-05, + "loss": 1.7845, + "step": 19034 + }, + { + "epoch": 5.842541436464089, + "grad_norm": 0.2855348289012909, + "learning_rate": 3.8885030017993026e-05, + "loss": 1.8009, + "step": 19035 + }, + { + "epoch": 5.842848373235114, + "grad_norm": 0.23046357929706573, + "learning_rate": 3.888018388417857e-05, + "loss": 1.8225, + "step": 19036 + }, + { + "epoch": 5.843155310006138, + "grad_norm": 0.23732341825962067, + "learning_rate": 3.8875337860256634e-05, + "loss": 1.7542, + "step": 19037 + }, + { + "epoch": 5.843462246777164, + "grad_norm": 0.18987004458904266, + "learning_rate": 3.887049194627516e-05, + "loss": 1.7327, + "step": 19038 + }, + { + "epoch": 5.843769183548189, + "grad_norm": 0.21539908647537231, + "learning_rate": 3.8865646142281974e-05, + "loss": 1.715, + "step": 19039 + }, + { + "epoch": 5.844076120319214, + "grad_norm": 0.2991954982280731, + "learning_rate": 3.8860800448325024e-05, + "loss": 1.7728, + "step": 19040 + }, + { + "epoch": 5.84438305709024, + "grad_norm": 0.19066409766674042, + "learning_rate": 3.885595486445216e-05, + "loss": 1.7128, + "step": 19041 + }, + { + "epoch": 5.844689993861264, + "grad_norm": 0.21643762290477753, + "learning_rate": 3.885110939071128e-05, + "loss": 1.7584, + "step": 19042 + }, + { + "epoch": 5.8449969306322895, + "grad_norm": 0.20227304100990295, + "learning_rate": 3.884626402715029e-05, + "loss": 1.7053, + "step": 19043 + }, + { + "epoch": 5.845303867403315, + "grad_norm": 0.20429107546806335, + "learning_rate": 3.884141877381703e-05, + "loss": 1.761, + "step": 19044 + }, + { + "epoch": 5.84561080417434, + "grad_norm": 0.1873873621225357, + "learning_rate": 3.8836573630759435e-05, + "loss": 1.7251, + "step": 19045 + }, + { + "epoch": 5.8459177409453655, + "grad_norm": 0.18025323748588562, + 
"learning_rate": 3.883172859802534e-05, + "loss": 1.6696, + "step": 19046 + }, + { + "epoch": 5.846224677716391, + "grad_norm": 0.22011777758598328, + "learning_rate": 3.8826883675662664e-05, + "loss": 1.7148, + "step": 19047 + }, + { + "epoch": 5.846531614487415, + "grad_norm": 0.17827673256397247, + "learning_rate": 3.882203886371925e-05, + "loss": 1.69, + "step": 19048 + }, + { + "epoch": 5.846838551258441, + "grad_norm": 0.200766459107399, + "learning_rate": 3.881719416224303e-05, + "loss": 1.7773, + "step": 19049 + }, + { + "epoch": 5.847145488029466, + "grad_norm": 0.22770950198173523, + "learning_rate": 3.8812349571281834e-05, + "loss": 1.7156, + "step": 19050 + }, + { + "epoch": 5.847452424800491, + "grad_norm": 0.19483895599842072, + "learning_rate": 3.880750509088357e-05, + "loss": 1.7304, + "step": 19051 + }, + { + "epoch": 5.847759361571516, + "grad_norm": 0.1988774836063385, + "learning_rate": 3.8802660721096086e-05, + "loss": 1.7428, + "step": 19052 + }, + { + "epoch": 5.848066298342541, + "grad_norm": 0.19881510734558105, + "learning_rate": 3.879781646196727e-05, + "loss": 1.7268, + "step": 19053 + }, + { + "epoch": 5.848373235113566, + "grad_norm": 0.21257543563842773, + "learning_rate": 3.8792972313545e-05, + "loss": 1.7532, + "step": 19054 + }, + { + "epoch": 5.848680171884592, + "grad_norm": 0.21000613272190094, + "learning_rate": 3.878812827587716e-05, + "loss": 1.7782, + "step": 19055 + }, + { + "epoch": 5.848987108655617, + "grad_norm": 0.2136746346950531, + "learning_rate": 3.878328434901159e-05, + "loss": 1.6875, + "step": 19056 + }, + { + "epoch": 5.849294045426642, + "grad_norm": 0.20291505753993988, + "learning_rate": 3.8778440532996204e-05, + "loss": 1.74, + "step": 19057 + }, + { + "epoch": 5.849600982197668, + "grad_norm": 0.22568103671073914, + "learning_rate": 3.877359682787883e-05, + "loss": 1.7074, + "step": 19058 + }, + { + "epoch": 5.849907918968692, + "grad_norm": 0.24398963153362274, + "learning_rate": 3.876875323370734e-05, + 
"loss": 1.6825, + "step": 19059 + }, + { + "epoch": 5.850214855739718, + "grad_norm": 0.19684453308582306, + "learning_rate": 3.876390975052964e-05, + "loss": 1.7143, + "step": 19060 + }, + { + "epoch": 5.850521792510743, + "grad_norm": 0.2786783277988434, + "learning_rate": 3.8759066378393544e-05, + "loss": 1.8339, + "step": 19061 + }, + { + "epoch": 5.850828729281768, + "grad_norm": 0.1977633833885193, + "learning_rate": 3.875422311734697e-05, + "loss": 1.742, + "step": 19062 + }, + { + "epoch": 5.851135666052793, + "grad_norm": 0.260643869638443, + "learning_rate": 3.874937996743772e-05, + "loss": 1.7728, + "step": 19063 + }, + { + "epoch": 5.851442602823818, + "grad_norm": 0.20998433232307434, + "learning_rate": 3.874453692871372e-05, + "loss": 1.768, + "step": 19064 + }, + { + "epoch": 5.851749539594843, + "grad_norm": 0.2603224217891693, + "learning_rate": 3.873969400122278e-05, + "loss": 1.8015, + "step": 19065 + }, + { + "epoch": 5.852056476365869, + "grad_norm": 0.24428118765354156, + "learning_rate": 3.87348511850128e-05, + "loss": 1.8133, + "step": 19066 + }, + { + "epoch": 5.852363413136894, + "grad_norm": 0.19380085170269012, + "learning_rate": 3.873000848013161e-05, + "loss": 1.7331, + "step": 19067 + }, + { + "epoch": 5.852670349907919, + "grad_norm": 0.20088011026382446, + "learning_rate": 3.87251658866271e-05, + "loss": 1.7501, + "step": 19068 + }, + { + "epoch": 5.852977286678944, + "grad_norm": 0.21920672059059143, + "learning_rate": 3.8720323404547095e-05, + "loss": 1.6848, + "step": 19069 + }, + { + "epoch": 5.853284223449969, + "grad_norm": 0.21692565083503723, + "learning_rate": 3.871548103393947e-05, + "loss": 1.7132, + "step": 19070 + }, + { + "epoch": 5.8535911602209945, + "grad_norm": 0.19463133811950684, + "learning_rate": 3.871063877485207e-05, + "loss": 1.7263, + "step": 19071 + }, + { + "epoch": 5.85389809699202, + "grad_norm": 0.21563300490379333, + "learning_rate": 3.870579662733277e-05, + "loss": 1.7271, + "step": 19072 + }, + { + 
"epoch": 5.854205033763045, + "grad_norm": 0.19901902973651886, + "learning_rate": 3.870095459142939e-05, + "loss": 1.7153, + "step": 19073 + }, + { + "epoch": 5.85451197053407, + "grad_norm": 0.2053879052400589, + "learning_rate": 3.869611266718982e-05, + "loss": 1.7769, + "step": 19074 + }, + { + "epoch": 5.854818907305095, + "grad_norm": 0.18877504765987396, + "learning_rate": 3.869127085466188e-05, + "loss": 1.7427, + "step": 19075 + }, + { + "epoch": 5.85512584407612, + "grad_norm": 0.2000892460346222, + "learning_rate": 3.8686429153893414e-05, + "loss": 1.7245, + "step": 19076 + }, + { + "epoch": 5.855432780847146, + "grad_norm": 0.23791030049324036, + "learning_rate": 3.868158756493231e-05, + "loss": 1.7128, + "step": 19077 + }, + { + "epoch": 5.855739717618171, + "grad_norm": 0.20807631313800812, + "learning_rate": 3.8676746087826374e-05, + "loss": 1.7235, + "step": 19078 + }, + { + "epoch": 5.856046654389196, + "grad_norm": 0.2603290379047394, + "learning_rate": 3.867190472262349e-05, + "loss": 1.7272, + "step": 19079 + }, + { + "epoch": 5.856353591160221, + "grad_norm": 0.25234153866767883, + "learning_rate": 3.8667063469371456e-05, + "loss": 1.7818, + "step": 19080 + }, + { + "epoch": 5.856660527931246, + "grad_norm": 0.20621159672737122, + "learning_rate": 3.866222232811816e-05, + "loss": 1.7318, + "step": 19081 + }, + { + "epoch": 5.856967464702271, + "grad_norm": 0.19565562903881073, + "learning_rate": 3.865738129891141e-05, + "loss": 1.6364, + "step": 19082 + }, + { + "epoch": 5.857274401473297, + "grad_norm": 0.2090953141450882, + "learning_rate": 3.86525403817991e-05, + "loss": 1.7763, + "step": 19083 + }, + { + "epoch": 5.857581338244322, + "grad_norm": 0.21286322176456451, + "learning_rate": 3.864769957682901e-05, + "loss": 1.7652, + "step": 19084 + }, + { + "epoch": 5.8578882750153465, + "grad_norm": 0.20606130361557007, + "learning_rate": 3.864285888404902e-05, + "loss": 1.7267, + "step": 19085 + }, + { + "epoch": 5.858195211786372, + 
"grad_norm": 0.18837152421474457, + "learning_rate": 3.863801830350694e-05, + "loss": 1.7013, + "step": 19086 + }, + { + "epoch": 5.858502148557397, + "grad_norm": 0.19374001026153564, + "learning_rate": 3.8633177835250636e-05, + "loss": 1.7462, + "step": 19087 + }, + { + "epoch": 5.8588090853284225, + "grad_norm": 0.19090552628040314, + "learning_rate": 3.8628337479327914e-05, + "loss": 1.7321, + "step": 19088 + }, + { + "epoch": 5.859116022099448, + "grad_norm": 0.19487829506397247, + "learning_rate": 3.8623497235786656e-05, + "loss": 1.7323, + "step": 19089 + }, + { + "epoch": 5.859422958870473, + "grad_norm": 0.23836077749729156, + "learning_rate": 3.861865710467464e-05, + "loss": 1.7277, + "step": 19090 + }, + { + "epoch": 5.859729895641498, + "grad_norm": 0.22283829748630524, + "learning_rate": 3.861381708603974e-05, + "loss": 1.7521, + "step": 19091 + }, + { + "epoch": 5.860036832412523, + "grad_norm": 0.2094828337430954, + "learning_rate": 3.8608977179929774e-05, + "loss": 1.763, + "step": 19092 + }, + { + "epoch": 5.860343769183548, + "grad_norm": 0.30857667326927185, + "learning_rate": 3.860413738639256e-05, + "loss": 1.7112, + "step": 19093 + }, + { + "epoch": 5.860650705954574, + "grad_norm": 0.22634989023208618, + "learning_rate": 3.8599297705475954e-05, + "loss": 1.7076, + "step": 19094 + }, + { + "epoch": 5.860957642725598, + "grad_norm": 0.20488132536411285, + "learning_rate": 3.8594458137227757e-05, + "loss": 1.6821, + "step": 19095 + }, + { + "epoch": 5.861264579496623, + "grad_norm": 0.22760719060897827, + "learning_rate": 3.8589618681695826e-05, + "loss": 1.6981, + "step": 19096 + }, + { + "epoch": 5.861571516267649, + "grad_norm": 0.21168997883796692, + "learning_rate": 3.858477933892795e-05, + "loss": 1.7396, + "step": 19097 + }, + { + "epoch": 5.861878453038674, + "grad_norm": 0.24725143611431122, + "learning_rate": 3.8579940108971984e-05, + "loss": 1.791, + "step": 19098 + }, + { + "epoch": 5.862185389809699, + "grad_norm": 
0.2245369702577591, + "learning_rate": 3.857510099187573e-05, + "loss": 1.7643, + "step": 19099 + }, + { + "epoch": 5.862492326580725, + "grad_norm": 0.20065639913082123, + "learning_rate": 3.8570261987687056e-05, + "loss": 1.715, + "step": 19100 + }, + { + "epoch": 5.862799263351749, + "grad_norm": 0.1857454925775528, + "learning_rate": 3.856542309645373e-05, + "loss": 1.6833, + "step": 19101 + }, + { + "epoch": 5.8631062001227745, + "grad_norm": 0.18816804885864258, + "learning_rate": 3.856058431822361e-05, + "loss": 1.7049, + "step": 19102 + }, + { + "epoch": 5.8634131368938, + "grad_norm": 0.2861626148223877, + "learning_rate": 3.855574565304448e-05, + "loss": 1.8275, + "step": 19103 + }, + { + "epoch": 5.863720073664825, + "grad_norm": 0.19937226176261902, + "learning_rate": 3.8550907100964196e-05, + "loss": 1.7137, + "step": 19104 + }, + { + "epoch": 5.8640270104358505, + "grad_norm": 0.2040586620569229, + "learning_rate": 3.854606866203055e-05, + "loss": 1.725, + "step": 19105 + }, + { + "epoch": 5.864333947206875, + "grad_norm": 0.21082650125026703, + "learning_rate": 3.854123033629137e-05, + "loss": 1.7143, + "step": 19106 + }, + { + "epoch": 5.8646408839779, + "grad_norm": 0.1977517306804657, + "learning_rate": 3.853639212379446e-05, + "loss": 1.7482, + "step": 19107 + }, + { + "epoch": 5.864947820748926, + "grad_norm": 0.2272191196680069, + "learning_rate": 3.8531554024587655e-05, + "loss": 1.7678, + "step": 19108 + }, + { + "epoch": 5.865254757519951, + "grad_norm": 0.22765736281871796, + "learning_rate": 3.852671603871876e-05, + "loss": 1.7721, + "step": 19109 + }, + { + "epoch": 5.865561694290976, + "grad_norm": 0.20707197487354279, + "learning_rate": 3.852187816623556e-05, + "loss": 1.7509, + "step": 19110 + }, + { + "epoch": 5.865868631062002, + "grad_norm": 0.2699931561946869, + "learning_rate": 3.851704040718591e-05, + "loss": 1.6845, + "step": 19111 + }, + { + "epoch": 5.866175567833026, + "grad_norm": 0.24394196271896362, + "learning_rate": 
3.8512202761617575e-05, + "loss": 1.6895, + "step": 19112 + }, + { + "epoch": 5.866482504604051, + "grad_norm": 0.21921835839748383, + "learning_rate": 3.850736522957841e-05, + "loss": 1.7739, + "step": 19113 + }, + { + "epoch": 5.866789441375077, + "grad_norm": 0.2268306314945221, + "learning_rate": 3.8502527811116175e-05, + "loss": 1.7773, + "step": 19114 + }, + { + "epoch": 5.867096378146102, + "grad_norm": 0.2165728509426117, + "learning_rate": 3.84976905062787e-05, + "loss": 1.7567, + "step": 19115 + }, + { + "epoch": 5.867403314917127, + "grad_norm": 0.188106968998909, + "learning_rate": 3.8492853315113804e-05, + "loss": 1.7209, + "step": 19116 + }, + { + "epoch": 5.867710251688152, + "grad_norm": 0.20750530064105988, + "learning_rate": 3.848801623766927e-05, + "loss": 1.6999, + "step": 19117 + }, + { + "epoch": 5.868017188459177, + "grad_norm": 0.2475438266992569, + "learning_rate": 3.84831792739929e-05, + "loss": 1.7535, + "step": 19118 + }, + { + "epoch": 5.8683241252302025, + "grad_norm": 0.23291872441768646, + "learning_rate": 3.847834242413252e-05, + "loss": 1.7137, + "step": 19119 + }, + { + "epoch": 5.868631062001228, + "grad_norm": 0.18381048738956451, + "learning_rate": 3.847350568813589e-05, + "loss": 1.7657, + "step": 19120 + }, + { + "epoch": 5.868937998772253, + "grad_norm": 0.19330385327339172, + "learning_rate": 3.8468669066050845e-05, + "loss": 1.7109, + "step": 19121 + }, + { + "epoch": 5.8692449355432785, + "grad_norm": 0.22503000497817993, + "learning_rate": 3.846383255792517e-05, + "loss": 1.7668, + "step": 19122 + }, + { + "epoch": 5.869551872314303, + "grad_norm": 0.2147306352853775, + "learning_rate": 3.845899616380667e-05, + "loss": 1.74, + "step": 19123 + }, + { + "epoch": 5.869858809085328, + "grad_norm": 0.18493011593818665, + "learning_rate": 3.845415988374312e-05, + "loss": 1.7066, + "step": 19124 + }, + { + "epoch": 5.870165745856354, + "grad_norm": 0.28276753425598145, + "learning_rate": 3.844932371778235e-05, + "loss": 1.7925, 
+ "step": 19125 + }, + { + "epoch": 5.870472682627379, + "grad_norm": 0.23486676812171936, + "learning_rate": 3.844448766597212e-05, + "loss": 1.8216, + "step": 19126 + }, + { + "epoch": 5.870779619398404, + "grad_norm": 0.24370723962783813, + "learning_rate": 3.843965172836024e-05, + "loss": 1.709, + "step": 19127 + }, + { + "epoch": 5.871086556169429, + "grad_norm": 0.22540852427482605, + "learning_rate": 3.843481590499449e-05, + "loss": 1.7608, + "step": 19128 + }, + { + "epoch": 5.871393492940454, + "grad_norm": 0.20578467845916748, + "learning_rate": 3.8429980195922666e-05, + "loss": 1.7288, + "step": 19129 + }, + { + "epoch": 5.871700429711479, + "grad_norm": 0.265325129032135, + "learning_rate": 3.842514460119258e-05, + "loss": 1.7711, + "step": 19130 + }, + { + "epoch": 5.872007366482505, + "grad_norm": 0.20076121389865875, + "learning_rate": 3.842030912085197e-05, + "loss": 1.6764, + "step": 19131 + }, + { + "epoch": 5.87231430325353, + "grad_norm": 0.23941899836063385, + "learning_rate": 3.841547375494868e-05, + "loss": 1.8157, + "step": 19132 + }, + { + "epoch": 5.872621240024555, + "grad_norm": 0.23184041678905487, + "learning_rate": 3.841063850353044e-05, + "loss": 1.6948, + "step": 19133 + }, + { + "epoch": 5.87292817679558, + "grad_norm": 0.20299546420574188, + "learning_rate": 3.840580336664508e-05, + "loss": 1.7812, + "step": 19134 + }, + { + "epoch": 5.873235113566605, + "grad_norm": 0.24654673039913177, + "learning_rate": 3.840096834434036e-05, + "loss": 1.7999, + "step": 19135 + }, + { + "epoch": 5.8735420503376305, + "grad_norm": 0.21144285798072815, + "learning_rate": 3.8396133436664085e-05, + "loss": 1.7033, + "step": 19136 + }, + { + "epoch": 5.873848987108656, + "grad_norm": 0.22186708450317383, + "learning_rate": 3.8391298643663997e-05, + "loss": 1.7292, + "step": 19137 + }, + { + "epoch": 5.87415592387968, + "grad_norm": 0.21017275750637054, + "learning_rate": 3.838646396538793e-05, + "loss": 1.6989, + "step": 19138 + }, + { + "epoch": 
5.874462860650706, + "grad_norm": 0.19430704414844513, + "learning_rate": 3.83816294018836e-05, + "loss": 1.7446, + "step": 19139 + }, + { + "epoch": 5.874769797421731, + "grad_norm": 0.25048547983169556, + "learning_rate": 3.8376794953198836e-05, + "loss": 1.7358, + "step": 19140 + }, + { + "epoch": 5.875076734192756, + "grad_norm": 0.21869583427906036, + "learning_rate": 3.8371960619381406e-05, + "loss": 1.7017, + "step": 19141 + }, + { + "epoch": 5.875383670963782, + "grad_norm": 0.2053002119064331, + "learning_rate": 3.836712640047905e-05, + "loss": 1.7077, + "step": 19142 + }, + { + "epoch": 5.875690607734807, + "grad_norm": 0.2222425490617752, + "learning_rate": 3.83622922965396e-05, + "loss": 1.7259, + "step": 19143 + }, + { + "epoch": 5.8759975445058314, + "grad_norm": 0.20682495832443237, + "learning_rate": 3.8357458307610774e-05, + "loss": 1.7597, + "step": 19144 + }, + { + "epoch": 5.876304481276857, + "grad_norm": 0.2001802772283554, + "learning_rate": 3.835262443374038e-05, + "loss": 1.7546, + "step": 19145 + }, + { + "epoch": 5.876611418047882, + "grad_norm": 0.20499882102012634, + "learning_rate": 3.8347790674976166e-05, + "loss": 1.6741, + "step": 19146 + }, + { + "epoch": 5.8769183548189075, + "grad_norm": 0.17830348014831543, + "learning_rate": 3.834295703136593e-05, + "loss": 1.7067, + "step": 19147 + }, + { + "epoch": 5.877225291589933, + "grad_norm": 0.25055429339408875, + "learning_rate": 3.833812350295741e-05, + "loss": 1.753, + "step": 19148 + }, + { + "epoch": 5.877532228360957, + "grad_norm": 0.19037213921546936, + "learning_rate": 3.8333290089798415e-05, + "loss": 1.7336, + "step": 19149 + }, + { + "epoch": 5.877839165131983, + "grad_norm": 0.18041233718395233, + "learning_rate": 3.8328456791936656e-05, + "loss": 1.7172, + "step": 19150 + }, + { + "epoch": 5.878146101903008, + "grad_norm": 0.21531802415847778, + "learning_rate": 3.832362360941994e-05, + "loss": 1.7328, + "step": 19151 + }, + { + "epoch": 5.878453038674033, + "grad_norm": 
0.23101283609867096, + "learning_rate": 3.831879054229601e-05, + "loss": 1.7548, + "step": 19152 + }, + { + "epoch": 5.878759975445059, + "grad_norm": 0.19029635190963745, + "learning_rate": 3.831395759061266e-05, + "loss": 1.6852, + "step": 19153 + }, + { + "epoch": 5.879066912216084, + "grad_norm": 0.20305602252483368, + "learning_rate": 3.830912475441761e-05, + "loss": 1.6982, + "step": 19154 + }, + { + "epoch": 5.879373848987108, + "grad_norm": 0.19752593338489532, + "learning_rate": 3.830429203375866e-05, + "loss": 1.7726, + "step": 19155 + }, + { + "epoch": 5.879680785758134, + "grad_norm": 0.2109406590461731, + "learning_rate": 3.8299459428683526e-05, + "loss": 1.7629, + "step": 19156 + }, + { + "epoch": 5.879987722529159, + "grad_norm": 0.19448740780353546, + "learning_rate": 3.829462693924001e-05, + "loss": 1.6981, + "step": 19157 + }, + { + "epoch": 5.880294659300184, + "grad_norm": 0.19344154000282288, + "learning_rate": 3.828979456547586e-05, + "loss": 1.6822, + "step": 19158 + }, + { + "epoch": 5.88060159607121, + "grad_norm": 0.24466145038604736, + "learning_rate": 3.82849623074388e-05, + "loss": 1.7575, + "step": 19159 + }, + { + "epoch": 5.880908532842234, + "grad_norm": 0.20174476504325867, + "learning_rate": 3.828013016517663e-05, + "loss": 1.7267, + "step": 19160 + }, + { + "epoch": 5.8812154696132595, + "grad_norm": 0.23560820519924164, + "learning_rate": 3.827529813873706e-05, + "loss": 1.7125, + "step": 19161 + }, + { + "epoch": 5.881522406384285, + "grad_norm": 0.18118280172348022, + "learning_rate": 3.827046622816789e-05, + "loss": 1.7436, + "step": 19162 + }, + { + "epoch": 5.88182934315531, + "grad_norm": 0.27250152826309204, + "learning_rate": 3.8265634433516824e-05, + "loss": 1.7249, + "step": 19163 + }, + { + "epoch": 5.8821362799263355, + "grad_norm": 0.23510734736919403, + "learning_rate": 3.826080275483166e-05, + "loss": 1.7502, + "step": 19164 + }, + { + "epoch": 5.882443216697361, + "grad_norm": 0.22708909213542938, + 
"learning_rate": 3.82559711921601e-05, + "loss": 1.7478, + "step": 19165 + }, + { + "epoch": 5.882750153468385, + "grad_norm": 0.292584627866745, + "learning_rate": 3.825113974554995e-05, + "loss": 1.6757, + "step": 19166 + }, + { + "epoch": 5.883057090239411, + "grad_norm": 0.22186334431171417, + "learning_rate": 3.8246308415048884e-05, + "loss": 1.7061, + "step": 19167 + }, + { + "epoch": 5.883364027010436, + "grad_norm": 0.23995520174503326, + "learning_rate": 3.8241477200704714e-05, + "loss": 1.6962, + "step": 19168 + }, + { + "epoch": 5.883670963781461, + "grad_norm": 0.25545260310173035, + "learning_rate": 3.823664610256513e-05, + "loss": 1.7582, + "step": 19169 + }, + { + "epoch": 5.883977900552486, + "grad_norm": 0.2209167629480362, + "learning_rate": 3.823181512067794e-05, + "loss": 1.7212, + "step": 19170 + }, + { + "epoch": 5.884284837323511, + "grad_norm": 0.24626508355140686, + "learning_rate": 3.8226984255090824e-05, + "loss": 1.7356, + "step": 19171 + }, + { + "epoch": 5.884591774094536, + "grad_norm": 0.22982320189476013, + "learning_rate": 3.822215350585157e-05, + "loss": 1.7516, + "step": 19172 + }, + { + "epoch": 5.884898710865562, + "grad_norm": 0.19458627700805664, + "learning_rate": 3.8217322873007874e-05, + "loss": 1.7097, + "step": 19173 + }, + { + "epoch": 5.885205647636587, + "grad_norm": 0.2030913233757019, + "learning_rate": 3.8212492356607524e-05, + "loss": 1.7273, + "step": 19174 + }, + { + "epoch": 5.885512584407612, + "grad_norm": 0.20174767076969147, + "learning_rate": 3.820766195669823e-05, + "loss": 1.7167, + "step": 19175 + }, + { + "epoch": 5.885819521178637, + "grad_norm": 0.22572553157806396, + "learning_rate": 3.820283167332772e-05, + "loss": 1.8034, + "step": 19176 + }, + { + "epoch": 5.886126457949662, + "grad_norm": 0.24423041939735413, + "learning_rate": 3.819800150654376e-05, + "loss": 1.7188, + "step": 19177 + }, + { + "epoch": 5.8864333947206875, + "grad_norm": 0.20805509388446808, + "learning_rate": 
3.819317145639404e-05, + "loss": 1.7252, + "step": 19178 + }, + { + "epoch": 5.886740331491713, + "grad_norm": 0.2731400728225708, + "learning_rate": 3.8188341522926334e-05, + "loss": 1.7778, + "step": 19179 + }, + { + "epoch": 5.887047268262738, + "grad_norm": 0.2604491412639618, + "learning_rate": 3.818351170618835e-05, + "loss": 1.7524, + "step": 19180 + }, + { + "epoch": 5.887354205033763, + "grad_norm": 0.20043112337589264, + "learning_rate": 3.817868200622785e-05, + "loss": 1.7176, + "step": 19181 + }, + { + "epoch": 5.887661141804788, + "grad_norm": 0.2224988341331482, + "learning_rate": 3.817385242309253e-05, + "loss": 1.7267, + "step": 19182 + }, + { + "epoch": 5.887968078575813, + "grad_norm": 0.24603894352912903, + "learning_rate": 3.8169022956830135e-05, + "loss": 1.716, + "step": 19183 + }, + { + "epoch": 5.888275015346839, + "grad_norm": 0.19959969818592072, + "learning_rate": 3.816419360748839e-05, + "loss": 1.7461, + "step": 19184 + }, + { + "epoch": 5.888581952117864, + "grad_norm": 0.21907947957515717, + "learning_rate": 3.815936437511501e-05, + "loss": 1.6982, + "step": 19185 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.1920289248228073, + "learning_rate": 3.8154535259757735e-05, + "loss": 1.7213, + "step": 19186 + }, + { + "epoch": 5.889195825659914, + "grad_norm": 0.21930737793445587, + "learning_rate": 3.81497062614643e-05, + "loss": 1.7389, + "step": 19187 + }, + { + "epoch": 5.889502762430939, + "grad_norm": 0.1972137838602066, + "learning_rate": 3.814487738028239e-05, + "loss": 1.7317, + "step": 19188 + }, + { + "epoch": 5.889809699201964, + "grad_norm": 0.20000529289245605, + "learning_rate": 3.8140048616259785e-05, + "loss": 1.7148, + "step": 19189 + }, + { + "epoch": 5.89011663597299, + "grad_norm": 0.18828663229942322, + "learning_rate": 3.8135219969444135e-05, + "loss": 1.725, + "step": 19190 + }, + { + "epoch": 5.890423572744015, + "grad_norm": 0.2237224131822586, + "learning_rate": 3.8130391439883216e-05, + "loss": 1.7252, 
+ "step": 19191 + }, + { + "epoch": 5.8907305095150395, + "grad_norm": 0.19954712688922882, + "learning_rate": 3.812556302762473e-05, + "loss": 1.7071, + "step": 19192 + }, + { + "epoch": 5.891037446286065, + "grad_norm": 0.23509685695171356, + "learning_rate": 3.812073473271637e-05, + "loss": 1.7603, + "step": 19193 + }, + { + "epoch": 5.89134438305709, + "grad_norm": 0.28477707505226135, + "learning_rate": 3.81159065552059e-05, + "loss": 1.8193, + "step": 19194 + }, + { + "epoch": 5.8916513198281155, + "grad_norm": 0.1936045140028, + "learning_rate": 3.811107849514098e-05, + "loss": 1.7438, + "step": 19195 + }, + { + "epoch": 5.891958256599141, + "grad_norm": 0.288253515958786, + "learning_rate": 3.810625055256936e-05, + "loss": 1.8042, + "step": 19196 + }, + { + "epoch": 5.892265193370166, + "grad_norm": 0.19256485998630524, + "learning_rate": 3.810142272753873e-05, + "loss": 1.6997, + "step": 19197 + }, + { + "epoch": 5.892572130141191, + "grad_norm": 0.2823546826839447, + "learning_rate": 3.809659502009684e-05, + "loss": 1.7133, + "step": 19198 + }, + { + "epoch": 5.892879066912216, + "grad_norm": 0.25116851925849915, + "learning_rate": 3.809176743029136e-05, + "loss": 1.7402, + "step": 19199 + }, + { + "epoch": 5.893186003683241, + "grad_norm": 0.19840675592422485, + "learning_rate": 3.808693995817003e-05, + "loss": 1.7009, + "step": 19200 + }, + { + "epoch": 5.893492940454267, + "grad_norm": 0.2703700363636017, + "learning_rate": 3.808211260378051e-05, + "loss": 1.741, + "step": 19201 + }, + { + "epoch": 5.893799877225292, + "grad_norm": 0.25683698058128357, + "learning_rate": 3.807728536717056e-05, + "loss": 1.7431, + "step": 19202 + }, + { + "epoch": 5.894106813996316, + "grad_norm": 0.19033822417259216, + "learning_rate": 3.8072458248387855e-05, + "loss": 1.7423, + "step": 19203 + }, + { + "epoch": 5.894413750767342, + "grad_norm": 0.2771024703979492, + "learning_rate": 3.806763124748012e-05, + "loss": 1.7376, + "step": 19204 + }, + { + "epoch": 
5.894720687538367, + "grad_norm": 0.30265524983406067, + "learning_rate": 3.806280436449504e-05, + "loss": 1.7124, + "step": 19205 + }, + { + "epoch": 5.895027624309392, + "grad_norm": 0.21838776767253876, + "learning_rate": 3.805797759948033e-05, + "loss": 1.7319, + "step": 19206 + }, + { + "epoch": 5.895334561080418, + "grad_norm": 0.22244395315647125, + "learning_rate": 3.805315095248368e-05, + "loss": 1.7034, + "step": 19207 + }, + { + "epoch": 5.895641497851443, + "grad_norm": 0.20621941983699799, + "learning_rate": 3.8048324423552786e-05, + "loss": 1.7231, + "step": 19208 + }, + { + "epoch": 5.8959484346224675, + "grad_norm": 0.23735111951828003, + "learning_rate": 3.804349801273538e-05, + "loss": 1.7484, + "step": 19209 + }, + { + "epoch": 5.896255371393493, + "grad_norm": 0.33221447467803955, + "learning_rate": 3.803867172007911e-05, + "loss": 1.7782, + "step": 19210 + }, + { + "epoch": 5.896562308164518, + "grad_norm": 0.20859810709953308, + "learning_rate": 3.803384554563172e-05, + "loss": 1.688, + "step": 19211 + }, + { + "epoch": 5.8968692449355435, + "grad_norm": 0.25731268525123596, + "learning_rate": 3.8029019489440855e-05, + "loss": 1.7463, + "step": 19212 + }, + { + "epoch": 5.897176181706568, + "grad_norm": 0.26556700468063354, + "learning_rate": 3.802419355155425e-05, + "loss": 1.7251, + "step": 19213 + }, + { + "epoch": 5.897483118477593, + "grad_norm": 0.20397205650806427, + "learning_rate": 3.801936773201957e-05, + "loss": 1.6785, + "step": 19214 + }, + { + "epoch": 5.897790055248619, + "grad_norm": 0.2198234349489212, + "learning_rate": 3.8014542030884544e-05, + "loss": 1.7608, + "step": 19215 + }, + { + "epoch": 5.898096992019644, + "grad_norm": 0.22619546949863434, + "learning_rate": 3.800971644819681e-05, + "loss": 1.8034, + "step": 19216 + }, + { + "epoch": 5.898403928790669, + "grad_norm": 0.22074444591999054, + "learning_rate": 3.800489098400412e-05, + "loss": 1.777, + "step": 19217 + }, + { + "epoch": 5.898710865561695, + "grad_norm": 
0.2555946707725525, + "learning_rate": 3.80000656383541e-05, + "loss": 1.7578, + "step": 19218 + }, + { + "epoch": 5.899017802332719, + "grad_norm": 0.2130863517522812, + "learning_rate": 3.7995240411294474e-05, + "loss": 1.7312, + "step": 19219 + }, + { + "epoch": 5.899324739103744, + "grad_norm": 0.2574099898338318, + "learning_rate": 3.799041530287291e-05, + "loss": 1.7509, + "step": 19220 + }, + { + "epoch": 5.89963167587477, + "grad_norm": 0.2556573152542114, + "learning_rate": 3.798559031313712e-05, + "loss": 1.7624, + "step": 19221 + }, + { + "epoch": 5.899938612645795, + "grad_norm": 0.19909335672855377, + "learning_rate": 3.798076544213475e-05, + "loss": 1.7466, + "step": 19222 + }, + { + "epoch": 5.9002455494168204, + "grad_norm": 0.19832594692707062, + "learning_rate": 3.7975940689913526e-05, + "loss": 1.6896, + "step": 19223 + }, + { + "epoch": 5.900552486187845, + "grad_norm": 0.18473665416240692, + "learning_rate": 3.7971116056521076e-05, + "loss": 1.7167, + "step": 19224 + }, + { + "epoch": 5.90085942295887, + "grad_norm": 0.21106892824172974, + "learning_rate": 3.796629154200512e-05, + "loss": 1.8071, + "step": 19225 + }, + { + "epoch": 5.901166359729896, + "grad_norm": 0.20903728902339935, + "learning_rate": 3.796146714641333e-05, + "loss": 1.6946, + "step": 19226 + }, + { + "epoch": 5.901473296500921, + "grad_norm": 0.21518728137016296, + "learning_rate": 3.795664286979336e-05, + "loss": 1.6899, + "step": 19227 + }, + { + "epoch": 5.901780233271946, + "grad_norm": 0.1948135644197464, + "learning_rate": 3.7951818712192926e-05, + "loss": 1.7568, + "step": 19228 + }, + { + "epoch": 5.902087170042972, + "grad_norm": 0.2222091257572174, + "learning_rate": 3.7946994673659667e-05, + "loss": 1.8118, + "step": 19229 + }, + { + "epoch": 5.902394106813996, + "grad_norm": 0.2173513025045395, + "learning_rate": 3.794217075424127e-05, + "loss": 1.7194, + "step": 19230 + }, + { + "epoch": 5.902701043585021, + "grad_norm": 0.2026323676109314, + "learning_rate": 
3.79373469539854e-05, + "loss": 1.6944, + "step": 19231 + }, + { + "epoch": 5.903007980356047, + "grad_norm": 0.22178098559379578, + "learning_rate": 3.7932523272939765e-05, + "loss": 1.7328, + "step": 19232 + }, + { + "epoch": 5.903314917127072, + "grad_norm": 0.22846719622612, + "learning_rate": 3.792769971115198e-05, + "loss": 1.8065, + "step": 19233 + }, + { + "epoch": 5.903621853898097, + "grad_norm": 0.2086053490638733, + "learning_rate": 3.792287626866977e-05, + "loss": 1.7511, + "step": 19234 + }, + { + "epoch": 5.903928790669122, + "grad_norm": 0.22444705665111542, + "learning_rate": 3.791805294554075e-05, + "loss": 1.742, + "step": 19235 + }, + { + "epoch": 5.904235727440147, + "grad_norm": 0.24630236625671387, + "learning_rate": 3.7913229741812625e-05, + "loss": 1.7531, + "step": 19236 + }, + { + "epoch": 5.9045426642111725, + "grad_norm": 0.2618274986743927, + "learning_rate": 3.7908406657533036e-05, + "loss": 1.7387, + "step": 19237 + }, + { + "epoch": 5.904849600982198, + "grad_norm": 0.25871509313583374, + "learning_rate": 3.790358369274968e-05, + "loss": 1.7822, + "step": 19238 + }, + { + "epoch": 5.905156537753223, + "grad_norm": 0.22675062716007233, + "learning_rate": 3.789876084751018e-05, + "loss": 1.7788, + "step": 19239 + }, + { + "epoch": 5.9054634745242485, + "grad_norm": 0.26623663306236267, + "learning_rate": 3.789393812186224e-05, + "loss": 1.7092, + "step": 19240 + }, + { + "epoch": 5.905770411295273, + "grad_norm": 0.19448868930339813, + "learning_rate": 3.788911551585348e-05, + "loss": 1.7164, + "step": 19241 + }, + { + "epoch": 5.906077348066298, + "grad_norm": 0.22451938688755035, + "learning_rate": 3.788429302953158e-05, + "loss": 1.667, + "step": 19242 + }, + { + "epoch": 5.906384284837324, + "grad_norm": 0.2323608547449112, + "learning_rate": 3.7879470662944214e-05, + "loss": 1.7992, + "step": 19243 + }, + { + "epoch": 5.906691221608349, + "grad_norm": 0.2508258819580078, + "learning_rate": 3.7874648416139e-05, + "loss": 1.7681, + 
"step": 19244 + }, + { + "epoch": 5.906998158379373, + "grad_norm": 0.22333547472953796, + "learning_rate": 3.786982628916364e-05, + "loss": 1.7006, + "step": 19245 + }, + { + "epoch": 5.907305095150399, + "grad_norm": 0.19816327095031738, + "learning_rate": 3.786500428206575e-05, + "loss": 1.7458, + "step": 19246 + }, + { + "epoch": 5.907612031921424, + "grad_norm": 0.2047683447599411, + "learning_rate": 3.7860182394893006e-05, + "loss": 1.7385, + "step": 19247 + }, + { + "epoch": 5.907918968692449, + "grad_norm": 0.2124621719121933, + "learning_rate": 3.785536062769304e-05, + "loss": 1.7373, + "step": 19248 + }, + { + "epoch": 5.908225905463475, + "grad_norm": 0.200453981757164, + "learning_rate": 3.785053898051355e-05, + "loss": 1.7754, + "step": 19249 + }, + { + "epoch": 5.9085328422345, + "grad_norm": 0.19543224573135376, + "learning_rate": 3.784571745340212e-05, + "loss": 1.724, + "step": 19250 + }, + { + "epoch": 5.9088397790055245, + "grad_norm": 0.17079658806324005, + "learning_rate": 3.784089604640647e-05, + "loss": 1.6843, + "step": 19251 + }, + { + "epoch": 5.90914671577655, + "grad_norm": 0.22792236506938934, + "learning_rate": 3.783607475957418e-05, + "loss": 1.7442, + "step": 19252 + }, + { + "epoch": 5.909453652547575, + "grad_norm": 0.20699752867221832, + "learning_rate": 3.783125359295294e-05, + "loss": 1.7868, + "step": 19253 + }, + { + "epoch": 5.9097605893186005, + "grad_norm": 0.2156144678592682, + "learning_rate": 3.782643254659038e-05, + "loss": 1.7443, + "step": 19254 + }, + { + "epoch": 5.910067526089626, + "grad_norm": 0.2021300345659256, + "learning_rate": 3.782161162053417e-05, + "loss": 1.7749, + "step": 19255 + }, + { + "epoch": 5.91037446286065, + "grad_norm": 0.17613129317760468, + "learning_rate": 3.7816790814831905e-05, + "loss": 1.7001, + "step": 19256 + }, + { + "epoch": 5.910681399631676, + "grad_norm": 0.18911564350128174, + "learning_rate": 3.781197012953128e-05, + "loss": 1.6817, + "step": 19257 + }, + { + "epoch": 
5.910988336402701, + "grad_norm": 0.18920689821243286, + "learning_rate": 3.780714956467989e-05, + "loss": 1.7554, + "step": 19258 + }, + { + "epoch": 5.911295273173726, + "grad_norm": 0.22030571103096008, + "learning_rate": 3.7802329120325396e-05, + "loss": 1.7554, + "step": 19259 + }, + { + "epoch": 5.911602209944752, + "grad_norm": 0.21164962649345398, + "learning_rate": 3.779750879651545e-05, + "loss": 1.74, + "step": 19260 + }, + { + "epoch": 5.911909146715777, + "grad_norm": 0.2205103188753128, + "learning_rate": 3.779268859329766e-05, + "loss": 1.7424, + "step": 19261 + }, + { + "epoch": 5.912216083486801, + "grad_norm": 0.19262658059597015, + "learning_rate": 3.7787868510719685e-05, + "loss": 1.7157, + "step": 19262 + }, + { + "epoch": 5.912523020257827, + "grad_norm": 0.19583287835121155, + "learning_rate": 3.778304854882914e-05, + "loss": 1.7343, + "step": 19263 + }, + { + "epoch": 5.912829957028852, + "grad_norm": 0.18275529146194458, + "learning_rate": 3.777822870767368e-05, + "loss": 1.6938, + "step": 19264 + }, + { + "epoch": 5.913136893799877, + "grad_norm": 0.21268916130065918, + "learning_rate": 3.7773408987300914e-05, + "loss": 1.7546, + "step": 19265 + }, + { + "epoch": 5.913443830570903, + "grad_norm": 0.20878887176513672, + "learning_rate": 3.77685893877585e-05, + "loss": 1.8109, + "step": 19266 + }, + { + "epoch": 5.913750767341927, + "grad_norm": 0.2326175421476364, + "learning_rate": 3.776376990909404e-05, + "loss": 1.7248, + "step": 19267 + }, + { + "epoch": 5.9140577041129525, + "grad_norm": 0.28189611434936523, + "learning_rate": 3.7758950551355204e-05, + "loss": 1.7796, + "step": 19268 + }, + { + "epoch": 5.914364640883978, + "grad_norm": 0.1922682821750641, + "learning_rate": 3.775413131458957e-05, + "loss": 1.7096, + "step": 19269 + }, + { + "epoch": 5.914671577655003, + "grad_norm": 0.2839193642139435, + "learning_rate": 3.774931219884479e-05, + "loss": 1.7341, + "step": 19270 + }, + { + "epoch": 5.9149785144260285, + "grad_norm": 
0.2075256109237671, + "learning_rate": 3.7744493204168495e-05, + "loss": 1.7565, + "step": 19271 + }, + { + "epoch": 5.915285451197054, + "grad_norm": 0.2780497372150421, + "learning_rate": 3.7739674330608306e-05, + "loss": 1.7186, + "step": 19272 + }, + { + "epoch": 5.915592387968078, + "grad_norm": 0.26129212975502014, + "learning_rate": 3.773485557821182e-05, + "loss": 1.8468, + "step": 19273 + }, + { + "epoch": 5.915899324739104, + "grad_norm": 0.3299194276332855, + "learning_rate": 3.773003694702671e-05, + "loss": 1.7705, + "step": 19274 + }, + { + "epoch": 5.916206261510129, + "grad_norm": 0.3011106848716736, + "learning_rate": 3.772521843710054e-05, + "loss": 1.748, + "step": 19275 + }, + { + "epoch": 5.916513198281154, + "grad_norm": 0.21370603144168854, + "learning_rate": 3.7720400048480966e-05, + "loss": 1.7709, + "step": 19276 + }, + { + "epoch": 5.91682013505218, + "grad_norm": 0.29374879598617554, + "learning_rate": 3.771558178121561e-05, + "loss": 1.6948, + "step": 19277 + }, + { + "epoch": 5.917127071823204, + "grad_norm": 0.2545807659626007, + "learning_rate": 3.771076363535205e-05, + "loss": 1.7974, + "step": 19278 + }, + { + "epoch": 5.917434008594229, + "grad_norm": 0.24210263788700104, + "learning_rate": 3.7705945610937954e-05, + "loss": 1.7438, + "step": 19279 + }, + { + "epoch": 5.917740945365255, + "grad_norm": 0.26224827766418457, + "learning_rate": 3.770112770802088e-05, + "loss": 1.7294, + "step": 19280 + }, + { + "epoch": 5.91804788213628, + "grad_norm": 0.23358991742134094, + "learning_rate": 3.7696309926648486e-05, + "loss": 1.7973, + "step": 19281 + }, + { + "epoch": 5.918354818907305, + "grad_norm": 0.3466563820838928, + "learning_rate": 3.769149226686837e-05, + "loss": 1.784, + "step": 19282 + }, + { + "epoch": 5.918661755678331, + "grad_norm": 0.2416994869709015, + "learning_rate": 3.768667472872814e-05, + "loss": 1.6957, + "step": 19283 + }, + { + "epoch": 5.918968692449355, + "grad_norm": 0.2285085767507553, + "learning_rate": 
3.768185731227539e-05, + "loss": 1.71, + "step": 19284 + }, + { + "epoch": 5.9192756292203805, + "grad_norm": 0.2566430866718292, + "learning_rate": 3.7677040017557775e-05, + "loss": 1.792, + "step": 19285 + }, + { + "epoch": 5.919582565991406, + "grad_norm": 0.21566689014434814, + "learning_rate": 3.767222284462285e-05, + "loss": 1.8085, + "step": 19286 + }, + { + "epoch": 5.919889502762431, + "grad_norm": 0.24078889191150665, + "learning_rate": 3.7667405793518264e-05, + "loss": 1.7221, + "step": 19287 + }, + { + "epoch": 5.920196439533456, + "grad_norm": 0.22127531468868256, + "learning_rate": 3.7662588864291584e-05, + "loss": 1.7173, + "step": 19288 + }, + { + "epoch": 5.920503376304481, + "grad_norm": 0.18165946006774902, + "learning_rate": 3.765777205699045e-05, + "loss": 1.7518, + "step": 19289 + }, + { + "epoch": 5.920810313075506, + "grad_norm": 0.2569290101528168, + "learning_rate": 3.765295537166242e-05, + "loss": 1.7716, + "step": 19290 + }, + { + "epoch": 5.921117249846532, + "grad_norm": 0.19010202586650848, + "learning_rate": 3.764813880835515e-05, + "loss": 1.7146, + "step": 19291 + }, + { + "epoch": 5.921424186617557, + "grad_norm": 0.2882116436958313, + "learning_rate": 3.7643322367116195e-05, + "loss": 1.7677, + "step": 19292 + }, + { + "epoch": 5.921731123388582, + "grad_norm": 0.30711185932159424, + "learning_rate": 3.763850604799319e-05, + "loss": 1.7506, + "step": 19293 + }, + { + "epoch": 5.922038060159607, + "grad_norm": 0.19295164942741394, + "learning_rate": 3.76336898510337e-05, + "loss": 1.715, + "step": 19294 + }, + { + "epoch": 5.922344996930632, + "grad_norm": 0.24849168956279755, + "learning_rate": 3.762887377628533e-05, + "loss": 1.6807, + "step": 19295 + }, + { + "epoch": 5.922651933701657, + "grad_norm": 0.23573634028434753, + "learning_rate": 3.7624057823795696e-05, + "loss": 1.7363, + "step": 19296 + }, + { + "epoch": 5.922958870472683, + "grad_norm": 0.24384267628192902, + "learning_rate": 3.761924199361235e-05, + "loss": 
1.726, + "step": 19297 + }, + { + "epoch": 5.923265807243708, + "grad_norm": 0.2589210271835327, + "learning_rate": 3.761442628578294e-05, + "loss": 1.7771, + "step": 19298 + }, + { + "epoch": 5.9235727440147325, + "grad_norm": 0.23527951538562775, + "learning_rate": 3.760961070035501e-05, + "loss": 1.6561, + "step": 19299 + }, + { + "epoch": 5.923879680785758, + "grad_norm": 0.20286870002746582, + "learning_rate": 3.7604795237376175e-05, + "loss": 1.7464, + "step": 19300 + }, + { + "epoch": 5.924186617556783, + "grad_norm": 0.22705033421516418, + "learning_rate": 3.759997989689401e-05, + "loss": 1.7814, + "step": 19301 + }, + { + "epoch": 5.9244935543278086, + "grad_norm": 0.21780981123447418, + "learning_rate": 3.7595164678956135e-05, + "loss": 1.7601, + "step": 19302 + }, + { + "epoch": 5.924800491098834, + "grad_norm": 0.2030021697282791, + "learning_rate": 3.759034958361009e-05, + "loss": 1.7222, + "step": 19303 + }, + { + "epoch": 5.925107427869859, + "grad_norm": 0.22956500947475433, + "learning_rate": 3.758553461090351e-05, + "loss": 1.674, + "step": 19304 + }, + { + "epoch": 5.925414364640884, + "grad_norm": 0.2368287444114685, + "learning_rate": 3.758071976088392e-05, + "loss": 1.7483, + "step": 19305 + }, + { + "epoch": 5.925721301411909, + "grad_norm": 0.22852632403373718, + "learning_rate": 3.757590503359896e-05, + "loss": 1.7561, + "step": 19306 + }, + { + "epoch": 5.926028238182934, + "grad_norm": 0.21657361090183258, + "learning_rate": 3.757109042909617e-05, + "loss": 1.7814, + "step": 19307 + }, + { + "epoch": 5.92633517495396, + "grad_norm": 0.21996551752090454, + "learning_rate": 3.756627594742317e-05, + "loss": 1.732, + "step": 19308 + }, + { + "epoch": 5.926642111724985, + "grad_norm": 0.23319712281227112, + "learning_rate": 3.75614615886275e-05, + "loss": 1.6807, + "step": 19309 + }, + { + "epoch": 5.9269490484960095, + "grad_norm": 0.17926698923110962, + "learning_rate": 3.755664735275677e-05, + "loss": 1.6925, + "step": 19310 + }, + { + 
"epoch": 5.927255985267035, + "grad_norm": 0.18986931443214417, + "learning_rate": 3.755183323985855e-05, + "loss": 1.7002, + "step": 19311 + }, + { + "epoch": 5.92756292203806, + "grad_norm": 0.18753086030483246, + "learning_rate": 3.7547019249980385e-05, + "loss": 1.695, + "step": 19312 + }, + { + "epoch": 5.9278698588090855, + "grad_norm": 0.21354973316192627, + "learning_rate": 3.7542205383169904e-05, + "loss": 1.6629, + "step": 19313 + }, + { + "epoch": 5.928176795580111, + "grad_norm": 0.19713245332241058, + "learning_rate": 3.753739163947463e-05, + "loss": 1.707, + "step": 19314 + }, + { + "epoch": 5.928483732351136, + "grad_norm": 0.2122458517551422, + "learning_rate": 3.753257801894217e-05, + "loss": 1.7309, + "step": 19315 + }, + { + "epoch": 5.928790669122161, + "grad_norm": 0.20360666513442993, + "learning_rate": 3.7527764521620065e-05, + "loss": 1.6861, + "step": 19316 + }, + { + "epoch": 5.929097605893186, + "grad_norm": 0.2652932405471802, + "learning_rate": 3.752295114755592e-05, + "loss": 1.7662, + "step": 19317 + }, + { + "epoch": 5.929404542664211, + "grad_norm": 0.18292152881622314, + "learning_rate": 3.751813789679726e-05, + "loss": 1.6691, + "step": 19318 + }, + { + "epoch": 5.929711479435237, + "grad_norm": 0.25630465149879456, + "learning_rate": 3.75133247693917e-05, + "loss": 1.7647, + "step": 19319 + }, + { + "epoch": 5.930018416206261, + "grad_norm": 0.2463291883468628, + "learning_rate": 3.750851176538677e-05, + "loss": 1.7252, + "step": 19320 + }, + { + "epoch": 5.930325352977286, + "grad_norm": 0.19977931678295135, + "learning_rate": 3.750369888483007e-05, + "loss": 1.7694, + "step": 19321 + }, + { + "epoch": 5.930632289748312, + "grad_norm": 0.19523118436336517, + "learning_rate": 3.7498886127769116e-05, + "loss": 1.7095, + "step": 19322 + }, + { + "epoch": 5.930939226519337, + "grad_norm": 0.19273912906646729, + "learning_rate": 3.749407349425151e-05, + "loss": 1.7009, + "step": 19323 + }, + { + "epoch": 5.931246163290362, + 
"grad_norm": 0.2419402152299881, + "learning_rate": 3.748926098432479e-05, + "loss": 1.7167, + "step": 19324 + }, + { + "epoch": 5.931553100061388, + "grad_norm": 0.22429771721363068, + "learning_rate": 3.7484448598036534e-05, + "loss": 1.6957, + "step": 19325 + }, + { + "epoch": 5.931860036832412, + "grad_norm": 0.23211807012557983, + "learning_rate": 3.747963633543429e-05, + "loss": 1.767, + "step": 19326 + }, + { + "epoch": 5.9321669736034375, + "grad_norm": 0.23204533755779266, + "learning_rate": 3.7474824196565625e-05, + "loss": 1.7405, + "step": 19327 + }, + { + "epoch": 5.932473910374463, + "grad_norm": 0.24068887531757355, + "learning_rate": 3.747001218147809e-05, + "loss": 1.7539, + "step": 19328 + }, + { + "epoch": 5.932780847145488, + "grad_norm": 0.18140049278736115, + "learning_rate": 3.746520029021922e-05, + "loss": 1.6956, + "step": 19329 + }, + { + "epoch": 5.9330877839165135, + "grad_norm": 0.28421929478645325, + "learning_rate": 3.746038852283661e-05, + "loss": 1.8539, + "step": 19330 + }, + { + "epoch": 5.933394720687538, + "grad_norm": 0.21984805166721344, + "learning_rate": 3.745557687937777e-05, + "loss": 1.7469, + "step": 19331 + }, + { + "epoch": 5.933701657458563, + "grad_norm": 0.2500358819961548, + "learning_rate": 3.7450765359890294e-05, + "loss": 1.7184, + "step": 19332 + }, + { + "epoch": 5.934008594229589, + "grad_norm": 0.2608816623687744, + "learning_rate": 3.744595396442169e-05, + "loss": 1.6825, + "step": 19333 + }, + { + "epoch": 5.934315531000614, + "grad_norm": 0.20359274744987488, + "learning_rate": 3.7441142693019526e-05, + "loss": 1.7535, + "step": 19334 + }, + { + "epoch": 5.934622467771639, + "grad_norm": 0.24795760214328766, + "learning_rate": 3.743633154573135e-05, + "loss": 1.7829, + "step": 19335 + }, + { + "epoch": 5.934929404542665, + "grad_norm": 0.20762503147125244, + "learning_rate": 3.7431520522604736e-05, + "loss": 1.7657, + "step": 19336 + }, + { + "epoch": 5.935236341313689, + "grad_norm": 0.24349527060985565, 
+ "learning_rate": 3.7426709623687174e-05, + "loss": 1.7037, + "step": 19337 + }, + { + "epoch": 5.935543278084714, + "grad_norm": 0.2138780951499939, + "learning_rate": 3.742189884902626e-05, + "loss": 1.7302, + "step": 19338 + }, + { + "epoch": 5.93585021485574, + "grad_norm": 0.24776574969291687, + "learning_rate": 3.741708819866949e-05, + "loss": 1.7293, + "step": 19339 + }, + { + "epoch": 5.936157151626765, + "grad_norm": 0.297888845205307, + "learning_rate": 3.7412277672664444e-05, + "loss": 1.8341, + "step": 19340 + }, + { + "epoch": 5.93646408839779, + "grad_norm": 0.2811104953289032, + "learning_rate": 3.740746727105864e-05, + "loss": 1.7188, + "step": 19341 + }, + { + "epoch": 5.936771025168815, + "grad_norm": 0.37908127903938293, + "learning_rate": 3.740265699389964e-05, + "loss": 1.765, + "step": 19342 + }, + { + "epoch": 5.93707796193984, + "grad_norm": 0.24403691291809082, + "learning_rate": 3.739784684123495e-05, + "loss": 1.6897, + "step": 19343 + }, + { + "epoch": 5.9373848987108655, + "grad_norm": 0.2393181174993515, + "learning_rate": 3.7393036813112135e-05, + "loss": 1.6843, + "step": 19344 + }, + { + "epoch": 5.937691835481891, + "grad_norm": 0.2927580177783966, + "learning_rate": 3.738822690957872e-05, + "loss": 1.6946, + "step": 19345 + }, + { + "epoch": 5.937998772252916, + "grad_norm": 0.23423373699188232, + "learning_rate": 3.738341713068223e-05, + "loss": 1.7409, + "step": 19346 + }, + { + "epoch": 5.9383057090239415, + "grad_norm": 0.2544272840023041, + "learning_rate": 3.7378607476470216e-05, + "loss": 1.698, + "step": 19347 + }, + { + "epoch": 5.938612645794966, + "grad_norm": 0.2120404839515686, + "learning_rate": 3.737379794699019e-05, + "loss": 1.7412, + "step": 19348 + }, + { + "epoch": 5.938919582565991, + "grad_norm": 0.2076033353805542, + "learning_rate": 3.736898854228971e-05, + "loss": 1.752, + "step": 19349 + }, + { + "epoch": 5.939226519337017, + "grad_norm": 0.20122376084327698, + "learning_rate": 3.736417926241627e-05, + 
"loss": 1.6741, + "step": 19350 + }, + { + "epoch": 5.939533456108042, + "grad_norm": 0.1856858730316162, + "learning_rate": 3.735937010741742e-05, + "loss": 1.6959, + "step": 19351 + }, + { + "epoch": 5.939840392879067, + "grad_norm": 0.22192558646202087, + "learning_rate": 3.7354561077340684e-05, + "loss": 1.7597, + "step": 19352 + }, + { + "epoch": 5.940147329650092, + "grad_norm": 0.2653545141220093, + "learning_rate": 3.73497521722336e-05, + "loss": 1.7324, + "step": 19353 + }, + { + "epoch": 5.940454266421117, + "grad_norm": 0.1975676715373993, + "learning_rate": 3.734494339214366e-05, + "loss": 1.6852, + "step": 19354 + }, + { + "epoch": 5.940761203192142, + "grad_norm": 0.26949796080589294, + "learning_rate": 3.734013473711843e-05, + "loss": 1.7695, + "step": 19355 + }, + { + "epoch": 5.941068139963168, + "grad_norm": 0.2272176742553711, + "learning_rate": 3.733532620720539e-05, + "loss": 1.745, + "step": 19356 + }, + { + "epoch": 5.941375076734193, + "grad_norm": 0.25740066170692444, + "learning_rate": 3.733051780245208e-05, + "loss": 1.7701, + "step": 19357 + }, + { + "epoch": 5.941682013505218, + "grad_norm": 0.1910635381937027, + "learning_rate": 3.732570952290602e-05, + "loss": 1.7276, + "step": 19358 + }, + { + "epoch": 5.941988950276243, + "grad_norm": 0.24896447360515594, + "learning_rate": 3.732090136861474e-05, + "loss": 1.7717, + "step": 19359 + }, + { + "epoch": 5.942295887047268, + "grad_norm": 0.20696721971035004, + "learning_rate": 3.731609333962572e-05, + "loss": 1.7053, + "step": 19360 + }, + { + "epoch": 5.9426028238182935, + "grad_norm": 0.18822510540485382, + "learning_rate": 3.731128543598653e-05, + "loss": 1.6869, + "step": 19361 + }, + { + "epoch": 5.942909760589319, + "grad_norm": 0.20757299661636353, + "learning_rate": 3.730647765774464e-05, + "loss": 1.7214, + "step": 19362 + }, + { + "epoch": 5.943216697360343, + "grad_norm": 0.21238471567630768, + "learning_rate": 3.7301670004947574e-05, + "loss": 1.6953, + "step": 19363 + }, + { 
+ "epoch": 5.943523634131369, + "grad_norm": 0.19326119124889374, + "learning_rate": 3.729686247764286e-05, + "loss": 1.7224, + "step": 19364 + }, + { + "epoch": 5.943830570902394, + "grad_norm": 0.17631326615810394, + "learning_rate": 3.729205507587798e-05, + "loss": 1.6471, + "step": 19365 + }, + { + "epoch": 5.944137507673419, + "grad_norm": 0.1741493195295334, + "learning_rate": 3.728724779970048e-05, + "loss": 1.7169, + "step": 19366 + }, + { + "epoch": 5.944444444444445, + "grad_norm": 0.18203428387641907, + "learning_rate": 3.728244064915782e-05, + "loss": 1.7301, + "step": 19367 + }, + { + "epoch": 5.94475138121547, + "grad_norm": 0.2063162475824356, + "learning_rate": 3.727763362429756e-05, + "loss": 1.7274, + "step": 19368 + }, + { + "epoch": 5.945058317986494, + "grad_norm": 0.17239537835121155, + "learning_rate": 3.7272826725167164e-05, + "loss": 1.7194, + "step": 19369 + }, + { + "epoch": 5.94536525475752, + "grad_norm": 0.1910972148180008, + "learning_rate": 3.726801995181418e-05, + "loss": 1.7017, + "step": 19370 + }, + { + "epoch": 5.945672191528545, + "grad_norm": 0.18822111189365387, + "learning_rate": 3.726321330428606e-05, + "loss": 1.723, + "step": 19371 + }, + { + "epoch": 5.94597912829957, + "grad_norm": 0.19680333137512207, + "learning_rate": 3.725840678263035e-05, + "loss": 1.685, + "step": 19372 + }, + { + "epoch": 5.946286065070596, + "grad_norm": 0.19016215205192566, + "learning_rate": 3.725360038689451e-05, + "loss": 1.7148, + "step": 19373 + }, + { + "epoch": 5.94659300184162, + "grad_norm": 0.1992037147283554, + "learning_rate": 3.7248794117126075e-05, + "loss": 1.7278, + "step": 19374 + }, + { + "epoch": 5.9468999386126455, + "grad_norm": 0.1892910748720169, + "learning_rate": 3.724398797337252e-05, + "loss": 1.7093, + "step": 19375 + }, + { + "epoch": 5.947206875383671, + "grad_norm": 0.23379561305046082, + "learning_rate": 3.723918195568137e-05, + "loss": 1.768, + "step": 19376 + }, + { + "epoch": 5.947513812154696, + "grad_norm": 
0.1986081600189209, + "learning_rate": 3.7234376064100104e-05, + "loss": 1.719, + "step": 19377 + }, + { + "epoch": 5.9478207489257215, + "grad_norm": 0.20901642739772797, + "learning_rate": 3.7229570298676195e-05, + "loss": 1.7066, + "step": 19378 + }, + { + "epoch": 5.948127685696747, + "grad_norm": 0.2102847546339035, + "learning_rate": 3.722476465945718e-05, + "loss": 1.7354, + "step": 19379 + }, + { + "epoch": 5.948434622467771, + "grad_norm": 0.1857316792011261, + "learning_rate": 3.72199591464905e-05, + "loss": 1.7159, + "step": 19380 + }, + { + "epoch": 5.948741559238797, + "grad_norm": 0.3045661151409149, + "learning_rate": 3.721515375982371e-05, + "loss": 1.8782, + "step": 19381 + }, + { + "epoch": 5.949048496009822, + "grad_norm": 0.24114711582660675, + "learning_rate": 3.7210348499504236e-05, + "loss": 1.6819, + "step": 19382 + }, + { + "epoch": 5.949355432780847, + "grad_norm": 0.20186996459960938, + "learning_rate": 3.720554336557961e-05, + "loss": 1.8028, + "step": 19383 + }, + { + "epoch": 5.949662369551873, + "grad_norm": 0.25385335087776184, + "learning_rate": 3.7200738358097295e-05, + "loss": 1.7278, + "step": 19384 + }, + { + "epoch": 5.949969306322897, + "grad_norm": 0.23390468955039978, + "learning_rate": 3.719593347710478e-05, + "loss": 1.7775, + "step": 19385 + }, + { + "epoch": 5.9502762430939224, + "grad_norm": 0.22577936947345734, + "learning_rate": 3.719112872264956e-05, + "loss": 1.7567, + "step": 19386 + }, + { + "epoch": 5.950583179864948, + "grad_norm": 0.2540932297706604, + "learning_rate": 3.718632409477912e-05, + "loss": 1.6749, + "step": 19387 + }, + { + "epoch": 5.950890116635973, + "grad_norm": 0.1994820535182953, + "learning_rate": 3.718151959354093e-05, + "loss": 1.6809, + "step": 19388 + }, + { + "epoch": 5.9511970534069984, + "grad_norm": 0.27669432759284973, + "learning_rate": 3.717671521898249e-05, + "loss": 1.7633, + "step": 19389 + }, + { + "epoch": 5.951503990178024, + "grad_norm": 0.2533062994480133, + 
"learning_rate": 3.717191097115125e-05, + "loss": 1.7536, + "step": 19390 + }, + { + "epoch": 5.951810926949048, + "grad_norm": 0.22249148786067963, + "learning_rate": 3.716710685009471e-05, + "loss": 1.7325, + "step": 19391 + }, + { + "epoch": 5.952117863720074, + "grad_norm": 0.3085922598838806, + "learning_rate": 3.716230285586033e-05, + "loss": 1.7046, + "step": 19392 + }, + { + "epoch": 5.952424800491099, + "grad_norm": 0.2591574192047119, + "learning_rate": 3.715749898849562e-05, + "loss": 1.7165, + "step": 19393 + }, + { + "epoch": 5.952731737262124, + "grad_norm": 0.24586348235607147, + "learning_rate": 3.715269524804803e-05, + "loss": 1.749, + "step": 19394 + }, + { + "epoch": 5.953038674033149, + "grad_norm": 0.3424640893936157, + "learning_rate": 3.714789163456502e-05, + "loss": 1.7143, + "step": 19395 + }, + { + "epoch": 5.953345610804174, + "grad_norm": 0.24856910109519958, + "learning_rate": 3.714308814809408e-05, + "loss": 1.868, + "step": 19396 + }, + { + "epoch": 5.953652547575199, + "grad_norm": 0.2758113145828247, + "learning_rate": 3.7138284788682676e-05, + "loss": 1.6722, + "step": 19397 + }, + { + "epoch": 5.953959484346225, + "grad_norm": 0.25981786847114563, + "learning_rate": 3.71334815563783e-05, + "loss": 1.764, + "step": 19398 + }, + { + "epoch": 5.95426642111725, + "grad_norm": 0.27885568141937256, + "learning_rate": 3.7128678451228385e-05, + "loss": 1.7422, + "step": 19399 + }, + { + "epoch": 5.954573357888275, + "grad_norm": 0.2909421920776367, + "learning_rate": 3.712387547328042e-05, + "loss": 1.7862, + "step": 19400 + }, + { + "epoch": 5.9548802946593, + "grad_norm": 0.2288074642419815, + "learning_rate": 3.711907262258185e-05, + "loss": 1.7054, + "step": 19401 + }, + { + "epoch": 5.955187231430325, + "grad_norm": 0.2986883819103241, + "learning_rate": 3.711426989918017e-05, + "loss": 1.7555, + "step": 19402 + }, + { + "epoch": 5.9554941682013505, + "grad_norm": 0.23201194405555725, + "learning_rate": 3.710946730312281e-05, + 
"loss": 1.8186, + "step": 19403 + }, + { + "epoch": 5.955801104972376, + "grad_norm": 0.2609403431415558, + "learning_rate": 3.710466483445728e-05, + "loss": 1.7743, + "step": 19404 + }, + { + "epoch": 5.956108041743401, + "grad_norm": 0.31131741404533386, + "learning_rate": 3.709986249323098e-05, + "loss": 1.7938, + "step": 19405 + }, + { + "epoch": 5.956414978514426, + "grad_norm": 0.20544753968715668, + "learning_rate": 3.7095060279491424e-05, + "loss": 1.7278, + "step": 19406 + }, + { + "epoch": 5.956721915285451, + "grad_norm": 0.3063479959964752, + "learning_rate": 3.709025819328602e-05, + "loss": 1.7544, + "step": 19407 + }, + { + "epoch": 5.957028852056476, + "grad_norm": 0.34868693351745605, + "learning_rate": 3.708545623466227e-05, + "loss": 1.7536, + "step": 19408 + }, + { + "epoch": 5.957335788827502, + "grad_norm": 0.20847822725772858, + "learning_rate": 3.70806544036676e-05, + "loss": 1.7003, + "step": 19409 + }, + { + "epoch": 5.957642725598527, + "grad_norm": 0.3250095844268799, + "learning_rate": 3.707585270034949e-05, + "loss": 1.6815, + "step": 19410 + }, + { + "epoch": 5.957949662369552, + "grad_norm": 0.24854284524917603, + "learning_rate": 3.707105112475539e-05, + "loss": 1.7665, + "step": 19411 + }, + { + "epoch": 5.958256599140577, + "grad_norm": 0.2921455502510071, + "learning_rate": 3.706624967693271e-05, + "loss": 1.7039, + "step": 19412 + }, + { + "epoch": 5.958563535911602, + "grad_norm": 0.2659071385860443, + "learning_rate": 3.706144835692894e-05, + "loss": 1.7641, + "step": 19413 + }, + { + "epoch": 5.958870472682627, + "grad_norm": 0.30329519510269165, + "learning_rate": 3.7056647164791516e-05, + "loss": 1.7962, + "step": 19414 + }, + { + "epoch": 5.959177409453653, + "grad_norm": 0.4023756682872772, + "learning_rate": 3.7051846100567906e-05, + "loss": 1.7624, + "step": 19415 + }, + { + "epoch": 5.959484346224678, + "grad_norm": 0.24528828263282776, + "learning_rate": 3.704704516430553e-05, + "loss": 1.8156, + "step": 19416 + }, + { 
+ "epoch": 5.9597912829957025, + "grad_norm": 0.46833130717277527, + "learning_rate": 3.704224435605186e-05, + "loss": 1.798, + "step": 19417 + }, + { + "epoch": 5.960098219766728, + "grad_norm": 0.26952674984931946, + "learning_rate": 3.70374436758543e-05, + "loss": 1.743, + "step": 19418 + }, + { + "epoch": 5.960405156537753, + "grad_norm": 0.3126155734062195, + "learning_rate": 3.703264312376034e-05, + "loss": 1.8003, + "step": 19419 + }, + { + "epoch": 5.9607120933087785, + "grad_norm": 0.2833348512649536, + "learning_rate": 3.702784269981738e-05, + "loss": 1.7524, + "step": 19420 + }, + { + "epoch": 5.961019030079804, + "grad_norm": 0.25425654649734497, + "learning_rate": 3.7023042404072916e-05, + "loss": 1.7241, + "step": 19421 + }, + { + "epoch": 5.961325966850829, + "grad_norm": 0.29460933804512024, + "learning_rate": 3.701824223657433e-05, + "loss": 1.676, + "step": 19422 + }, + { + "epoch": 5.961632903621854, + "grad_norm": 0.21040670573711395, + "learning_rate": 3.7013442197369094e-05, + "loss": 1.71, + "step": 19423 + }, + { + "epoch": 5.961939840392879, + "grad_norm": 0.3200007379055023, + "learning_rate": 3.7008642286504624e-05, + "loss": 1.7108, + "step": 19424 + }, + { + "epoch": 5.962246777163904, + "grad_norm": 0.20397430658340454, + "learning_rate": 3.7003842504028366e-05, + "loss": 1.7472, + "step": 19425 + }, + { + "epoch": 5.96255371393493, + "grad_norm": 0.24811354279518127, + "learning_rate": 3.699904284998776e-05, + "loss": 1.7116, + "step": 19426 + }, + { + "epoch": 5.962860650705955, + "grad_norm": 0.20980580151081085, + "learning_rate": 3.699424332443023e-05, + "loss": 1.786, + "step": 19427 + }, + { + "epoch": 5.963167587476979, + "grad_norm": 0.1967400163412094, + "learning_rate": 3.698944392740322e-05, + "loss": 1.7141, + "step": 19428 + }, + { + "epoch": 5.963474524248005, + "grad_norm": 0.21907822787761688, + "learning_rate": 3.698464465895414e-05, + "loss": 1.6983, + "step": 19429 + }, + { + "epoch": 5.96378146101903, + 
"grad_norm": 0.19938960671424866, + "learning_rate": 3.697984551913043e-05, + "loss": 1.6811, + "step": 19430 + }, + { + "epoch": 5.964088397790055, + "grad_norm": 0.22280220687389374, + "learning_rate": 3.6975046507979506e-05, + "loss": 1.6838, + "step": 19431 + }, + { + "epoch": 5.964395334561081, + "grad_norm": 0.2530672550201416, + "learning_rate": 3.697024762554883e-05, + "loss": 1.8116, + "step": 19432 + }, + { + "epoch": 5.964702271332106, + "grad_norm": 0.21853135526180267, + "learning_rate": 3.696544887188579e-05, + "loss": 1.692, + "step": 19433 + }, + { + "epoch": 5.9650092081031305, + "grad_norm": 0.18738535046577454, + "learning_rate": 3.696065024703783e-05, + "loss": 1.6971, + "step": 19434 + }, + { + "epoch": 5.965316144874156, + "grad_norm": 0.21199190616607666, + "learning_rate": 3.695585175105236e-05, + "loss": 1.7526, + "step": 19435 + }, + { + "epoch": 5.965623081645181, + "grad_norm": 0.22184251248836517, + "learning_rate": 3.695105338397681e-05, + "loss": 1.8075, + "step": 19436 + }, + { + "epoch": 5.9659300184162065, + "grad_norm": 0.20191644132137299, + "learning_rate": 3.6946255145858605e-05, + "loss": 1.7427, + "step": 19437 + }, + { + "epoch": 5.966236955187231, + "grad_norm": 0.2113640457391739, + "learning_rate": 3.694145703674515e-05, + "loss": 1.7556, + "step": 19438 + }, + { + "epoch": 5.966543891958256, + "grad_norm": 0.21834735572338104, + "learning_rate": 3.693665905668387e-05, + "loss": 1.7673, + "step": 19439 + }, + { + "epoch": 5.966850828729282, + "grad_norm": 0.2260274887084961, + "learning_rate": 3.6931861205722197e-05, + "loss": 1.8168, + "step": 19440 + }, + { + "epoch": 5.967157765500307, + "grad_norm": 0.24090524017810822, + "learning_rate": 3.692706348390751e-05, + "loss": 1.821, + "step": 19441 + }, + { + "epoch": 5.967464702271332, + "grad_norm": 0.27469882369041443, + "learning_rate": 3.6922265891287256e-05, + "loss": 1.7114, + "step": 19442 + }, + { + "epoch": 5.967771639042358, + "grad_norm": 0.23479801416397095, + 
"learning_rate": 3.6917468427908833e-05, + "loss": 1.7334, + "step": 19443 + }, + { + "epoch": 5.968078575813382, + "grad_norm": 0.21109704673290253, + "learning_rate": 3.6912671093819663e-05, + "loss": 1.7047, + "step": 19444 + }, + { + "epoch": 5.968385512584407, + "grad_norm": 0.21141986548900604, + "learning_rate": 3.690787388906715e-05, + "loss": 1.6868, + "step": 19445 + }, + { + "epoch": 5.968692449355433, + "grad_norm": 0.21836397051811218, + "learning_rate": 3.690307681369868e-05, + "loss": 1.6923, + "step": 19446 + }, + { + "epoch": 5.968999386126458, + "grad_norm": 0.21733662486076355, + "learning_rate": 3.6898279867761695e-05, + "loss": 1.7699, + "step": 19447 + }, + { + "epoch": 5.969306322897483, + "grad_norm": 0.19220437109470367, + "learning_rate": 3.689348305130359e-05, + "loss": 1.7002, + "step": 19448 + }, + { + "epoch": 5.969613259668508, + "grad_norm": 0.22644726932048798, + "learning_rate": 3.688868636437176e-05, + "loss": 1.7024, + "step": 19449 + }, + { + "epoch": 5.969920196439533, + "grad_norm": 0.1832779198884964, + "learning_rate": 3.688388980701361e-05, + "loss": 1.699, + "step": 19450 + }, + { + "epoch": 5.9702271332105585, + "grad_norm": 0.20793284475803375, + "learning_rate": 3.687909337927658e-05, + "loss": 1.7557, + "step": 19451 + }, + { + "epoch": 5.970534069981584, + "grad_norm": 0.19485175609588623, + "learning_rate": 3.6874297081207995e-05, + "loss": 1.7641, + "step": 19452 + }, + { + "epoch": 5.970841006752609, + "grad_norm": 0.20980949699878693, + "learning_rate": 3.686950091285534e-05, + "loss": 1.7542, + "step": 19453 + }, + { + "epoch": 5.9711479435236345, + "grad_norm": 0.24902600049972534, + "learning_rate": 3.686470487426594e-05, + "loss": 1.7342, + "step": 19454 + }, + { + "epoch": 5.971454880294659, + "grad_norm": 0.20191124081611633, + "learning_rate": 3.685990896548724e-05, + "loss": 1.6844, + "step": 19455 + }, + { + "epoch": 5.971761817065684, + "grad_norm": 0.23217806220054626, + "learning_rate": 
3.685511318656662e-05, + "loss": 1.7054, + "step": 19456 + }, + { + "epoch": 5.97206875383671, + "grad_norm": 0.23383383452892303, + "learning_rate": 3.6850317537551484e-05, + "loss": 1.6903, + "step": 19457 + }, + { + "epoch": 5.972375690607735, + "grad_norm": 0.2147756665945053, + "learning_rate": 3.6845522018489196e-05, + "loss": 1.736, + "step": 19458 + }, + { + "epoch": 5.97268262737876, + "grad_norm": 0.23864400386810303, + "learning_rate": 3.68407266294272e-05, + "loss": 1.7483, + "step": 19459 + }, + { + "epoch": 5.972989564149785, + "grad_norm": 0.18702742457389832, + "learning_rate": 3.6835931370412836e-05, + "loss": 1.6874, + "step": 19460 + }, + { + "epoch": 5.97329650092081, + "grad_norm": 0.2167401760816574, + "learning_rate": 3.683113624149351e-05, + "loss": 1.652, + "step": 19461 + }, + { + "epoch": 5.973603437691835, + "grad_norm": 0.17105139791965485, + "learning_rate": 3.6826341242716636e-05, + "loss": 1.7029, + "step": 19462 + }, + { + "epoch": 5.973910374462861, + "grad_norm": 0.2189798206090927, + "learning_rate": 3.682154637412956e-05, + "loss": 1.7203, + "step": 19463 + }, + { + "epoch": 5.974217311233886, + "grad_norm": 0.17864444851875305, + "learning_rate": 3.68167516357797e-05, + "loss": 1.7176, + "step": 19464 + }, + { + "epoch": 5.974524248004911, + "grad_norm": 0.22356030344963074, + "learning_rate": 3.681195702771442e-05, + "loss": 1.7492, + "step": 19465 + }, + { + "epoch": 5.974831184775936, + "grad_norm": 0.19020728766918182, + "learning_rate": 3.68071625499811e-05, + "loss": 1.6925, + "step": 19466 + }, + { + "epoch": 5.975138121546961, + "grad_norm": 0.19092151522636414, + "learning_rate": 3.680236820262714e-05, + "loss": 1.7253, + "step": 19467 + }, + { + "epoch": 5.975445058317987, + "grad_norm": 0.20842085778713226, + "learning_rate": 3.6797573985699926e-05, + "loss": 1.7251, + "step": 19468 + }, + { + "epoch": 5.975751995089012, + "grad_norm": 0.2245844155550003, + "learning_rate": 3.6792779899246796e-05, + "loss": 1.7351, + 
"step": 19469 + }, + { + "epoch": 5.976058931860036, + "grad_norm": 0.18867328763008118, + "learning_rate": 3.678798594331519e-05, + "loss": 1.6646, + "step": 19470 + }, + { + "epoch": 5.976365868631062, + "grad_norm": 0.2892500162124634, + "learning_rate": 3.678319211795242e-05, + "loss": 1.7146, + "step": 19471 + }, + { + "epoch": 5.976672805402087, + "grad_norm": 0.22490514814853668, + "learning_rate": 3.677839842320591e-05, + "loss": 1.7147, + "step": 19472 + }, + { + "epoch": 5.976979742173112, + "grad_norm": 0.296724796295166, + "learning_rate": 3.677360485912301e-05, + "loss": 1.7714, + "step": 19473 + }, + { + "epoch": 5.977286678944138, + "grad_norm": 0.2784444987773895, + "learning_rate": 3.676881142575111e-05, + "loss": 1.7198, + "step": 19474 + }, + { + "epoch": 5.977593615715163, + "grad_norm": 0.20270293951034546, + "learning_rate": 3.676401812313755e-05, + "loss": 1.7336, + "step": 19475 + }, + { + "epoch": 5.9779005524861875, + "grad_norm": 0.23352907598018646, + "learning_rate": 3.6759224951329745e-05, + "loss": 1.7428, + "step": 19476 + }, + { + "epoch": 5.978207489257213, + "grad_norm": 0.1892426460981369, + "learning_rate": 3.675443191037502e-05, + "loss": 1.6636, + "step": 19477 + }, + { + "epoch": 5.978514426028238, + "grad_norm": 0.22216783463954926, + "learning_rate": 3.6749639000320766e-05, + "loss": 1.7446, + "step": 19478 + }, + { + "epoch": 5.9788213627992635, + "grad_norm": 0.19465389847755432, + "learning_rate": 3.6744846221214364e-05, + "loss": 1.7403, + "step": 19479 + }, + { + "epoch": 5.979128299570289, + "grad_norm": 0.1918177455663681, + "learning_rate": 3.674005357310314e-05, + "loss": 1.6974, + "step": 19480 + }, + { + "epoch": 5.979435236341313, + "grad_norm": 0.19065791368484497, + "learning_rate": 3.673526105603449e-05, + "loss": 1.7299, + "step": 19481 + }, + { + "epoch": 5.979742173112339, + "grad_norm": 0.24036844074726105, + "learning_rate": 3.673046867005575e-05, + "loss": 1.7441, + "step": 19482 + }, + { + "epoch": 
5.980049109883364, + "grad_norm": 0.22352568805217743, + "learning_rate": 3.6725676415214305e-05, + "loss": 1.7556, + "step": 19483 + }, + { + "epoch": 5.980356046654389, + "grad_norm": 0.2492935210466385, + "learning_rate": 3.67208842915575e-05, + "loss": 1.6833, + "step": 19484 + }, + { + "epoch": 5.980662983425415, + "grad_norm": 0.2554415762424469, + "learning_rate": 3.671609229913272e-05, + "loss": 1.7426, + "step": 19485 + }, + { + "epoch": 5.98096992019644, + "grad_norm": 0.24076475203037262, + "learning_rate": 3.671130043798728e-05, + "loss": 1.7362, + "step": 19486 + }, + { + "epoch": 5.981276856967464, + "grad_norm": 0.24297118186950684, + "learning_rate": 3.670650870816858e-05, + "loss": 1.7493, + "step": 19487 + }, + { + "epoch": 5.98158379373849, + "grad_norm": 0.19533030688762665, + "learning_rate": 3.6701717109723924e-05, + "loss": 1.7397, + "step": 19488 + }, + { + "epoch": 5.981890730509515, + "grad_norm": 0.24731193482875824, + "learning_rate": 3.669692564270071e-05, + "loss": 1.7483, + "step": 19489 + }, + { + "epoch": 5.98219766728054, + "grad_norm": 0.23274390399456024, + "learning_rate": 3.669213430714626e-05, + "loss": 1.7677, + "step": 19490 + }, + { + "epoch": 5.982504604051566, + "grad_norm": 0.180234894156456, + "learning_rate": 3.668734310310796e-05, + "loss": 1.7065, + "step": 19491 + }, + { + "epoch": 5.98281154082259, + "grad_norm": 0.19045281410217285, + "learning_rate": 3.6682552030633125e-05, + "loss": 1.7089, + "step": 19492 + }, + { + "epoch": 5.9831184775936155, + "grad_norm": 0.17261318862438202, + "learning_rate": 3.667776108976914e-05, + "loss": 1.7227, + "step": 19493 + }, + { + "epoch": 5.983425414364641, + "grad_norm": 0.2156316339969635, + "learning_rate": 3.667297028056329e-05, + "loss": 1.7025, + "step": 19494 + }, + { + "epoch": 5.983732351135666, + "grad_norm": 0.22288112342357635, + "learning_rate": 3.666817960306298e-05, + "loss": 1.7123, + "step": 19495 + }, + { + "epoch": 5.9840392879066915, + "grad_norm": 
0.21983082592487335, + "learning_rate": 3.6663389057315543e-05, + "loss": 1.7688, + "step": 19496 + }, + { + "epoch": 5.984346224677717, + "grad_norm": 0.1804746687412262, + "learning_rate": 3.665859864336829e-05, + "loss": 1.759, + "step": 19497 + }, + { + "epoch": 5.984653161448741, + "grad_norm": 0.22762230038642883, + "learning_rate": 3.6653808361268605e-05, + "loss": 1.8128, + "step": 19498 + }, + { + "epoch": 5.984960098219767, + "grad_norm": 0.21779340505599976, + "learning_rate": 3.664901821106379e-05, + "loss": 1.7316, + "step": 19499 + }, + { + "epoch": 5.985267034990792, + "grad_norm": 0.18899449706077576, + "learning_rate": 3.664422819280121e-05, + "loss": 1.7535, + "step": 19500 + }, + { + "epoch": 5.985573971761817, + "grad_norm": 0.22799427807331085, + "learning_rate": 3.663943830652819e-05, + "loss": 1.7626, + "step": 19501 + }, + { + "epoch": 5.985880908532843, + "grad_norm": 0.19936929643154144, + "learning_rate": 3.6634648552292086e-05, + "loss": 1.6887, + "step": 19502 + }, + { + "epoch": 5.986187845303867, + "grad_norm": 0.22482532262802124, + "learning_rate": 3.6629858930140206e-05, + "loss": 1.6867, + "step": 19503 + }, + { + "epoch": 5.986494782074892, + "grad_norm": 0.23543842136859894, + "learning_rate": 3.662506944011991e-05, + "loss": 1.7715, + "step": 19504 + }, + { + "epoch": 5.986801718845918, + "grad_norm": 0.230603888630867, + "learning_rate": 3.6620280082278495e-05, + "loss": 1.7514, + "step": 19505 + }, + { + "epoch": 5.987108655616943, + "grad_norm": 0.26767033338546753, + "learning_rate": 3.6615490856663334e-05, + "loss": 1.6862, + "step": 19506 + }, + { + "epoch": 5.987415592387968, + "grad_norm": 0.18282492458820343, + "learning_rate": 3.661070176332172e-05, + "loss": 1.6569, + "step": 19507 + }, + { + "epoch": 5.987722529158994, + "grad_norm": 0.255426824092865, + "learning_rate": 3.6605912802301016e-05, + "loss": 1.7623, + "step": 19508 + }, + { + "epoch": 5.988029465930018, + "grad_norm": 0.25026118755340576, + 
"learning_rate": 3.6601123973648524e-05, + "loss": 1.6907, + "step": 19509 + }, + { + "epoch": 5.9883364027010435, + "grad_norm": 0.19193407893180847, + "learning_rate": 3.659633527741159e-05, + "loss": 1.7647, + "step": 19510 + }, + { + "epoch": 5.988643339472069, + "grad_norm": 0.25562727451324463, + "learning_rate": 3.6591546713637506e-05, + "loss": 1.6806, + "step": 19511 + }, + { + "epoch": 5.988950276243094, + "grad_norm": 0.2296016663312912, + "learning_rate": 3.6586758282373624e-05, + "loss": 1.7747, + "step": 19512 + }, + { + "epoch": 5.989257213014119, + "grad_norm": 0.22875753045082092, + "learning_rate": 3.6581969983667275e-05, + "loss": 1.7847, + "step": 19513 + }, + { + "epoch": 5.989564149785144, + "grad_norm": 0.24469317495822906, + "learning_rate": 3.6577181817565736e-05, + "loss": 1.6784, + "step": 19514 + }, + { + "epoch": 5.989871086556169, + "grad_norm": 0.22855928540229797, + "learning_rate": 3.657239378411638e-05, + "loss": 1.788, + "step": 19515 + }, + { + "epoch": 5.990178023327195, + "grad_norm": 0.28745612502098083, + "learning_rate": 3.656760588336647e-05, + "loss": 1.6836, + "step": 19516 + }, + { + "epoch": 5.99048496009822, + "grad_norm": 0.18221193552017212, + "learning_rate": 3.656281811536337e-05, + "loss": 1.6687, + "step": 19517 + }, + { + "epoch": 5.990791896869245, + "grad_norm": 0.2556660771369934, + "learning_rate": 3.655803048015437e-05, + "loss": 1.7351, + "step": 19518 + }, + { + "epoch": 5.99109883364027, + "grad_norm": 0.18791422247886658, + "learning_rate": 3.6553242977786803e-05, + "loss": 1.6749, + "step": 19519 + }, + { + "epoch": 5.991405770411295, + "grad_norm": 0.28149592876434326, + "learning_rate": 3.654845560830796e-05, + "loss": 1.7333, + "step": 19520 + }, + { + "epoch": 5.99171270718232, + "grad_norm": 0.24631322920322418, + "learning_rate": 3.654366837176517e-05, + "loss": 1.7672, + "step": 19521 + }, + { + "epoch": 5.992019643953346, + "grad_norm": 0.22054782509803772, + "learning_rate": 
3.653888126820573e-05, + "loss": 1.7499, + "step": 19522 + }, + { + "epoch": 5.992326580724371, + "grad_norm": 0.23334862291812897, + "learning_rate": 3.653409429767696e-05, + "loss": 1.7133, + "step": 19523 + }, + { + "epoch": 5.9926335174953955, + "grad_norm": 0.19809292256832123, + "learning_rate": 3.6529307460226145e-05, + "loss": 1.6965, + "step": 19524 + }, + { + "epoch": 5.992940454266421, + "grad_norm": 0.23769772052764893, + "learning_rate": 3.652452075590064e-05, + "loss": 1.699, + "step": 19525 + }, + { + "epoch": 5.993247391037446, + "grad_norm": 0.19045031070709229, + "learning_rate": 3.6519734184747686e-05, + "loss": 1.7043, + "step": 19526 + }, + { + "epoch": 5.9935543278084715, + "grad_norm": 0.20795129239559174, + "learning_rate": 3.651494774681465e-05, + "loss": 1.7159, + "step": 19527 + }, + { + "epoch": 5.993861264579497, + "grad_norm": 0.1933370679616928, + "learning_rate": 3.651016144214878e-05, + "loss": 1.6999, + "step": 19528 + }, + { + "epoch": 5.994168201350522, + "grad_norm": 0.18360544741153717, + "learning_rate": 3.650537527079742e-05, + "loss": 1.7525, + "step": 19529 + }, + { + "epoch": 5.994475138121547, + "grad_norm": 0.21080785989761353, + "learning_rate": 3.650058923280786e-05, + "loss": 1.6832, + "step": 19530 + }, + { + "epoch": 5.994782074892572, + "grad_norm": 0.19701606035232544, + "learning_rate": 3.649580332822736e-05, + "loss": 1.7104, + "step": 19531 + }, + { + "epoch": 5.995089011663597, + "grad_norm": 0.24208703637123108, + "learning_rate": 3.6491017557103266e-05, + "loss": 1.726, + "step": 19532 + }, + { + "epoch": 5.995395948434623, + "grad_norm": 0.25981345772743225, + "learning_rate": 3.648623191948284e-05, + "loss": 1.7644, + "step": 19533 + }, + { + "epoch": 5.995702885205648, + "grad_norm": 0.24137455224990845, + "learning_rate": 3.64814464154134e-05, + "loss": 1.7354, + "step": 19534 + }, + { + "epoch": 5.996009821976672, + "grad_norm": 0.2140759378671646, + "learning_rate": 3.647666104494222e-05, + "loss": 
1.7244, + "step": 19535 + }, + { + "epoch": 5.996316758747698, + "grad_norm": 0.2801622748374939, + "learning_rate": 3.647187580811663e-05, + "loss": 1.6996, + "step": 19536 + }, + { + "epoch": 5.996623695518723, + "grad_norm": 0.21048817038536072, + "learning_rate": 3.6467090704983856e-05, + "loss": 1.7378, + "step": 19537 + }, + { + "epoch": 5.996930632289748, + "grad_norm": 0.2935819625854492, + "learning_rate": 3.6462305735591254e-05, + "loss": 1.7066, + "step": 19538 + }, + { + "epoch": 5.997237569060774, + "grad_norm": 0.22473880648612976, + "learning_rate": 3.645752089998606e-05, + "loss": 1.7539, + "step": 19539 + }, + { + "epoch": 5.997544505831799, + "grad_norm": 0.20606113970279694, + "learning_rate": 3.6452736198215585e-05, + "loss": 1.7338, + "step": 19540 + }, + { + "epoch": 5.9978514426028235, + "grad_norm": 0.2702842950820923, + "learning_rate": 3.6447951630327116e-05, + "loss": 1.7171, + "step": 19541 + }, + { + "epoch": 5.998158379373849, + "grad_norm": 0.19971637427806854, + "learning_rate": 3.6443167196367946e-05, + "loss": 1.7132, + "step": 19542 + }, + { + "epoch": 5.998465316144874, + "grad_norm": 0.2352653592824936, + "learning_rate": 3.643838289638531e-05, + "loss": 1.787, + "step": 19543 + }, + { + "epoch": 5.9987722529158995, + "grad_norm": 0.2324669510126114, + "learning_rate": 3.643359873042656e-05, + "loss": 1.7039, + "step": 19544 + }, + { + "epoch": 5.999079189686924, + "grad_norm": 0.1935029774904251, + "learning_rate": 3.6428814698538914e-05, + "loss": 1.6846, + "step": 19545 + }, + { + "epoch": 5.999386126457949, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.642403080076968e-05, + "loss": 1.7018, + "step": 19546 + }, + { + "epoch": 5.999693063228975, + "grad_norm": 0.19364693760871887, + "learning_rate": 3.6419247037166146e-05, + "loss": 1.6901, + "step": 19547 + }, + { + "epoch": 6.0, + "grad_norm": 0.23718556761741638, + "learning_rate": 3.641446340777556e-05, + "loss": 1.7743, + "step": 19548 + }, + { + "epoch": 
6.000306936771025, + "grad_norm": 0.23907634615898132, + "learning_rate": 3.640967991264521e-05, + "loss": 1.8225, + "step": 19549 + }, + { + "epoch": 6.000613873542051, + "grad_norm": 0.18895737826824188, + "learning_rate": 3.6404896551822365e-05, + "loss": 1.7004, + "step": 19550 + }, + { + "epoch": 6.000920810313075, + "grad_norm": 0.20192188024520874, + "learning_rate": 3.64001133253543e-05, + "loss": 1.7304, + "step": 19551 + }, + { + "epoch": 6.0012277470841005, + "grad_norm": 0.1961488425731659, + "learning_rate": 3.6395330233288285e-05, + "loss": 1.6839, + "step": 19552 + }, + { + "epoch": 6.001534683855126, + "grad_norm": 0.271635502576828, + "learning_rate": 3.639054727567161e-05, + "loss": 1.8182, + "step": 19553 + }, + { + "epoch": 6.001841620626151, + "grad_norm": 0.20838679373264313, + "learning_rate": 3.63857644525515e-05, + "loss": 1.7688, + "step": 19554 + }, + { + "epoch": 6.0021485573971765, + "grad_norm": 0.23661796748638153, + "learning_rate": 3.6380981763975266e-05, + "loss": 1.6785, + "step": 19555 + }, + { + "epoch": 6.002455494168202, + "grad_norm": 0.1728433072566986, + "learning_rate": 3.637619920999013e-05, + "loss": 1.6648, + "step": 19556 + }, + { + "epoch": 6.002762430939226, + "grad_norm": 0.2845853269100189, + "learning_rate": 3.6371416790643395e-05, + "loss": 1.7592, + "step": 19557 + }, + { + "epoch": 6.003069367710252, + "grad_norm": 0.3246566951274872, + "learning_rate": 3.636663450598229e-05, + "loss": 1.7045, + "step": 19558 + }, + { + "epoch": 6.003376304481277, + "grad_norm": 0.21857120096683502, + "learning_rate": 3.636185235605412e-05, + "loss": 1.756, + "step": 19559 + }, + { + "epoch": 6.003683241252302, + "grad_norm": 0.3583754599094391, + "learning_rate": 3.63570703409061e-05, + "loss": 1.6828, + "step": 19560 + }, + { + "epoch": 6.003990178023328, + "grad_norm": 0.25527241826057434, + "learning_rate": 3.635228846058552e-05, + "loss": 1.7611, + "step": 19561 + }, + { + "epoch": 6.004297114794352, + "grad_norm": 
0.29662930965423584, + "learning_rate": 3.6347506715139604e-05, + "loss": 1.747, + "step": 19562 + }, + { + "epoch": 6.004604051565377, + "grad_norm": 0.2588978707790375, + "learning_rate": 3.634272510461564e-05, + "loss": 1.7153, + "step": 19563 + }, + { + "epoch": 6.004910988336403, + "grad_norm": 0.23874366283416748, + "learning_rate": 3.633794362906089e-05, + "loss": 1.7285, + "step": 19564 + }, + { + "epoch": 6.005217925107428, + "grad_norm": 0.2898634374141693, + "learning_rate": 3.633316228852256e-05, + "loss": 1.7539, + "step": 19565 + }, + { + "epoch": 6.005524861878453, + "grad_norm": 0.2578127682209015, + "learning_rate": 3.6328381083047946e-05, + "loss": 1.7504, + "step": 19566 + }, + { + "epoch": 6.005831798649478, + "grad_norm": 0.3094595968723297, + "learning_rate": 3.632360001268427e-05, + "loss": 1.7076, + "step": 19567 + }, + { + "epoch": 6.006138735420503, + "grad_norm": 0.27825623750686646, + "learning_rate": 3.63188190774788e-05, + "loss": 1.7651, + "step": 19568 + }, + { + "epoch": 6.0064456721915285, + "grad_norm": 0.27732032537460327, + "learning_rate": 3.631403827747878e-05, + "loss": 1.7209, + "step": 19569 + }, + { + "epoch": 6.006752608962554, + "grad_norm": 0.36446672677993774, + "learning_rate": 3.6309257612731475e-05, + "loss": 1.7191, + "step": 19570 + }, + { + "epoch": 6.007059545733579, + "grad_norm": 0.19071432948112488, + "learning_rate": 3.6304477083284076e-05, + "loss": 1.6981, + "step": 19571 + }, + { + "epoch": 6.0073664825046045, + "grad_norm": 0.40523234009742737, + "learning_rate": 3.6299696689183895e-05, + "loss": 1.7259, + "step": 19572 + }, + { + "epoch": 6.007673419275629, + "grad_norm": 0.30279576778411865, + "learning_rate": 3.6294916430478116e-05, + "loss": 1.8017, + "step": 19573 + }, + { + "epoch": 6.007980356046654, + "grad_norm": 0.2944689989089966, + "learning_rate": 3.629013630721402e-05, + "loss": 1.7347, + "step": 19574 + }, + { + "epoch": 6.00828729281768, + "grad_norm": 0.3557213246822357, + 
"learning_rate": 3.6285356319438814e-05, + "loss": 1.7308, + "step": 19575 + }, + { + "epoch": 6.008594229588705, + "grad_norm": 0.19888661801815033, + "learning_rate": 3.628057646719978e-05, + "loss": 1.7571, + "step": 19576 + }, + { + "epoch": 6.00890116635973, + "grad_norm": 0.34002986550331116, + "learning_rate": 3.627579675054411e-05, + "loss": 1.7417, + "step": 19577 + }, + { + "epoch": 6.009208103130755, + "grad_norm": 0.2756921350955963, + "learning_rate": 3.627101716951908e-05, + "loss": 1.7351, + "step": 19578 + }, + { + "epoch": 6.00951503990178, + "grad_norm": 0.3520946502685547, + "learning_rate": 3.6266237724171885e-05, + "loss": 1.7056, + "step": 19579 + }, + { + "epoch": 6.009821976672805, + "grad_norm": 0.3673728406429291, + "learning_rate": 3.6261458414549786e-05, + "loss": 1.6388, + "step": 19580 + }, + { + "epoch": 6.010128913443831, + "grad_norm": 0.2247757613658905, + "learning_rate": 3.625667924070003e-05, + "loss": 1.7772, + "step": 19581 + }, + { + "epoch": 6.010435850214856, + "grad_norm": 0.4387452006340027, + "learning_rate": 3.6251900202669795e-05, + "loss": 1.7629, + "step": 19582 + }, + { + "epoch": 6.0107427869858805, + "grad_norm": 0.23595796525478363, + "learning_rate": 3.624712130050636e-05, + "loss": 1.8044, + "step": 19583 + }, + { + "epoch": 6.011049723756906, + "grad_norm": 0.31198835372924805, + "learning_rate": 3.624234253425691e-05, + "loss": 1.7623, + "step": 19584 + }, + { + "epoch": 6.011356660527931, + "grad_norm": 0.25283896923065186, + "learning_rate": 3.6237563903968705e-05, + "loss": 1.7771, + "step": 19585 + }, + { + "epoch": 6.0116635972989565, + "grad_norm": 0.2595483064651489, + "learning_rate": 3.6232785409688954e-05, + "loss": 1.7405, + "step": 19586 + }, + { + "epoch": 6.011970534069982, + "grad_norm": 0.302273690700531, + "learning_rate": 3.622800705146491e-05, + "loss": 1.7236, + "step": 19587 + }, + { + "epoch": 6.012277470841007, + "grad_norm": 0.20444928109645844, + "learning_rate": 
3.622322882934375e-05, + "loss": 1.6863, + "step": 19588 + }, + { + "epoch": 6.012584407612032, + "grad_norm": 0.2682531774044037, + "learning_rate": 3.621845074337273e-05, + "loss": 1.752, + "step": 19589 + }, + { + "epoch": 6.012891344383057, + "grad_norm": 0.25617173314094543, + "learning_rate": 3.621367279359905e-05, + "loss": 1.7496, + "step": 19590 + }, + { + "epoch": 6.013198281154082, + "grad_norm": 0.24514207243919373, + "learning_rate": 3.620889498006994e-05, + "loss": 1.6568, + "step": 19591 + }, + { + "epoch": 6.013505217925108, + "grad_norm": 0.2799128293991089, + "learning_rate": 3.6204117302832616e-05, + "loss": 1.7284, + "step": 19592 + }, + { + "epoch": 6.013812154696133, + "grad_norm": 0.2025543451309204, + "learning_rate": 3.619933976193428e-05, + "loss": 1.7172, + "step": 19593 + }, + { + "epoch": 6.014119091467157, + "grad_norm": 0.24697700142860413, + "learning_rate": 3.619456235742216e-05, + "loss": 1.7316, + "step": 19594 + }, + { + "epoch": 6.014426028238183, + "grad_norm": 0.2518150210380554, + "learning_rate": 3.618978508934348e-05, + "loss": 1.8183, + "step": 19595 + }, + { + "epoch": 6.014732965009208, + "grad_norm": 0.165326327085495, + "learning_rate": 3.618500795774542e-05, + "loss": 1.665, + "step": 19596 + }, + { + "epoch": 6.015039901780233, + "grad_norm": 0.19158180058002472, + "learning_rate": 3.6180230962675216e-05, + "loss": 1.7232, + "step": 19597 + }, + { + "epoch": 6.015346838551259, + "grad_norm": 0.19456413388252258, + "learning_rate": 3.6175454104180086e-05, + "loss": 1.7153, + "step": 19598 + }, + { + "epoch": 6.015653775322283, + "grad_norm": 0.233373761177063, + "learning_rate": 3.6170677382307195e-05, + "loss": 1.7914, + "step": 19599 + }, + { + "epoch": 6.0159607120933085, + "grad_norm": 0.18567882478237152, + "learning_rate": 3.6165900797103796e-05, + "loss": 1.6793, + "step": 19600 + }, + { + "epoch": 6.016267648864334, + "grad_norm": 0.2119273990392685, + "learning_rate": 3.616112434861706e-05, + "loss": 1.689, + 
"step": 19601 + }, + { + "epoch": 6.016574585635359, + "grad_norm": 0.1915217787027359, + "learning_rate": 3.61563480368942e-05, + "loss": 1.6835, + "step": 19602 + }, + { + "epoch": 6.0168815224063845, + "grad_norm": 0.24824760854244232, + "learning_rate": 3.615157186198244e-05, + "loss": 1.8411, + "step": 19603 + }, + { + "epoch": 6.01718845917741, + "grad_norm": 0.2198900282382965, + "learning_rate": 3.6146795823928955e-05, + "loss": 1.7311, + "step": 19604 + }, + { + "epoch": 6.017495395948434, + "grad_norm": 0.22993668913841248, + "learning_rate": 3.614201992278095e-05, + "loss": 1.7249, + "step": 19605 + }, + { + "epoch": 6.01780233271946, + "grad_norm": 0.20677974820137024, + "learning_rate": 3.613724415858564e-05, + "loss": 1.7137, + "step": 19606 + }, + { + "epoch": 6.018109269490485, + "grad_norm": 0.1844938099384308, + "learning_rate": 3.6132468531390184e-05, + "loss": 1.6512, + "step": 19607 + }, + { + "epoch": 6.01841620626151, + "grad_norm": 0.224154993891716, + "learning_rate": 3.6127693041241815e-05, + "loss": 1.7116, + "step": 19608 + }, + { + "epoch": 6.018723143032536, + "grad_norm": 0.17322199046611786, + "learning_rate": 3.612291768818772e-05, + "loss": 1.6743, + "step": 19609 + }, + { + "epoch": 6.01903007980356, + "grad_norm": 0.24451903998851776, + "learning_rate": 3.611814247227508e-05, + "loss": 1.8332, + "step": 19610 + }, + { + "epoch": 6.019337016574585, + "grad_norm": 0.1911642849445343, + "learning_rate": 3.611336739355109e-05, + "loss": 1.707, + "step": 19611 + }, + { + "epoch": 6.019643953345611, + "grad_norm": 0.20917518436908722, + "learning_rate": 3.6108592452062954e-05, + "loss": 1.7328, + "step": 19612 + }, + { + "epoch": 6.019950890116636, + "grad_norm": 0.2314450889825821, + "learning_rate": 3.610381764785784e-05, + "loss": 1.7575, + "step": 19613 + }, + { + "epoch": 6.020257826887661, + "grad_norm": 0.20701734721660614, + "learning_rate": 3.609904298098296e-05, + "loss": 1.6958, + "step": 19614 + }, + { + "epoch": 
6.020564763658686, + "grad_norm": 0.2494465857744217, + "learning_rate": 3.609426845148547e-05, + "loss": 1.706, + "step": 19615 + }, + { + "epoch": 6.020871700429711, + "grad_norm": 0.25842729210853577, + "learning_rate": 3.608949405941256e-05, + "loss": 1.7667, + "step": 19616 + }, + { + "epoch": 6.0211786372007365, + "grad_norm": 0.19831863045692444, + "learning_rate": 3.608471980481145e-05, + "loss": 1.7135, + "step": 19617 + }, + { + "epoch": 6.021485573971762, + "grad_norm": 0.21611735224723816, + "learning_rate": 3.607994568772927e-05, + "loss": 1.7416, + "step": 19618 + }, + { + "epoch": 6.021792510742787, + "grad_norm": 0.2356715202331543, + "learning_rate": 3.607517170821324e-05, + "loss": 1.7696, + "step": 19619 + }, + { + "epoch": 6.0220994475138125, + "grad_norm": 0.24737675487995148, + "learning_rate": 3.6070397866310514e-05, + "loss": 1.7189, + "step": 19620 + }, + { + "epoch": 6.022406384284837, + "grad_norm": 0.19260701537132263, + "learning_rate": 3.6065624162068284e-05, + "loss": 1.7292, + "step": 19621 + }, + { + "epoch": 6.022713321055862, + "grad_norm": 0.29366952180862427, + "learning_rate": 3.6060850595533716e-05, + "loss": 1.7875, + "step": 19622 + }, + { + "epoch": 6.023020257826888, + "grad_norm": 0.2038174718618393, + "learning_rate": 3.605607716675401e-05, + "loss": 1.6777, + "step": 19623 + }, + { + "epoch": 6.023327194597913, + "grad_norm": 0.28923583030700684, + "learning_rate": 3.605130387577631e-05, + "loss": 1.7175, + "step": 19624 + }, + { + "epoch": 6.023634131368938, + "grad_norm": 0.3004317283630371, + "learning_rate": 3.6046530722647816e-05, + "loss": 1.8059, + "step": 19625 + }, + { + "epoch": 6.023941068139963, + "grad_norm": 0.19832390546798706, + "learning_rate": 3.6041757707415666e-05, + "loss": 1.7197, + "step": 19626 + }, + { + "epoch": 6.024248004910988, + "grad_norm": 0.2782927453517914, + "learning_rate": 3.6036984830127054e-05, + "loss": 1.6563, + "step": 19627 + }, + { + "epoch": 6.024554941682013, + "grad_norm": 
0.20395785570144653, + "learning_rate": 3.603221209082913e-05, + "loss": 1.6972, + "step": 19628 + }, + { + "epoch": 6.024861878453039, + "grad_norm": 0.26302096247673035, + "learning_rate": 3.60274394895691e-05, + "loss": 1.7348, + "step": 19629 + }, + { + "epoch": 6.025168815224064, + "grad_norm": 0.26376327872276306, + "learning_rate": 3.6022667026394095e-05, + "loss": 1.7183, + "step": 19630 + }, + { + "epoch": 6.0254757519950894, + "grad_norm": 0.20590877532958984, + "learning_rate": 3.601789470135127e-05, + "loss": 1.7114, + "step": 19631 + }, + { + "epoch": 6.025782688766114, + "grad_norm": 0.2873607277870178, + "learning_rate": 3.6013122514487815e-05, + "loss": 1.7598, + "step": 19632 + }, + { + "epoch": 6.026089625537139, + "grad_norm": 0.24324963986873627, + "learning_rate": 3.600835046585087e-05, + "loss": 1.8844, + "step": 19633 + }, + { + "epoch": 6.026396562308165, + "grad_norm": 0.27910730242729187, + "learning_rate": 3.6003578555487624e-05, + "loss": 1.8598, + "step": 19634 + }, + { + "epoch": 6.02670349907919, + "grad_norm": 0.22766844928264618, + "learning_rate": 3.59988067834452e-05, + "loss": 1.7281, + "step": 19635 + }, + { + "epoch": 6.027010435850215, + "grad_norm": 0.2390190064907074, + "learning_rate": 3.5994035149770804e-05, + "loss": 1.7355, + "step": 19636 + }, + { + "epoch": 6.02731737262124, + "grad_norm": 0.23422548174858093, + "learning_rate": 3.598926365451153e-05, + "loss": 1.7226, + "step": 19637 + }, + { + "epoch": 6.027624309392265, + "grad_norm": 0.20240288972854614, + "learning_rate": 3.598449229771458e-05, + "loss": 1.7523, + "step": 19638 + }, + { + "epoch": 6.02793124616329, + "grad_norm": 0.26388832926750183, + "learning_rate": 3.597972107942708e-05, + "loss": 1.7003, + "step": 19639 + }, + { + "epoch": 6.028238182934316, + "grad_norm": 0.19814053177833557, + "learning_rate": 3.597494999969622e-05, + "loss": 1.7087, + "step": 19640 + }, + { + "epoch": 6.028545119705341, + "grad_norm": 0.2779136896133423, + "learning_rate": 
3.5970179058569095e-05, + "loss": 1.7581, + "step": 19641 + }, + { + "epoch": 6.0288520564763655, + "grad_norm": 0.220394566655159, + "learning_rate": 3.5965408256092905e-05, + "loss": 1.7236, + "step": 19642 + }, + { + "epoch": 6.029158993247391, + "grad_norm": 0.28568828105926514, + "learning_rate": 3.596063759231476e-05, + "loss": 1.7933, + "step": 19643 + }, + { + "epoch": 6.029465930018416, + "grad_norm": 0.19509564340114594, + "learning_rate": 3.595586706728183e-05, + "loss": 1.6803, + "step": 19644 + }, + { + "epoch": 6.0297728667894415, + "grad_norm": 0.30855104327201843, + "learning_rate": 3.595109668104124e-05, + "loss": 1.7345, + "step": 19645 + }, + { + "epoch": 6.030079803560467, + "grad_norm": 0.24195496737957, + "learning_rate": 3.5946326433640174e-05, + "loss": 1.7493, + "step": 19646 + }, + { + "epoch": 6.030386740331492, + "grad_norm": 0.28324684500694275, + "learning_rate": 3.5941556325125744e-05, + "loss": 1.7959, + "step": 19647 + }, + { + "epoch": 6.030693677102517, + "grad_norm": 0.25351646542549133, + "learning_rate": 3.593678635554508e-05, + "loss": 1.7298, + "step": 19648 + }, + { + "epoch": 6.031000613873542, + "grad_norm": 0.2608177959918976, + "learning_rate": 3.593201652494534e-05, + "loss": 1.7072, + "step": 19649 + }, + { + "epoch": 6.031307550644567, + "grad_norm": 0.3182333707809448, + "learning_rate": 3.592724683337365e-05, + "loss": 1.6976, + "step": 19650 + }, + { + "epoch": 6.031614487415593, + "grad_norm": 0.19296859204769135, + "learning_rate": 3.592247728087717e-05, + "loss": 1.6879, + "step": 19651 + }, + { + "epoch": 6.031921424186618, + "grad_norm": 0.3927764594554901, + "learning_rate": 3.591770786750301e-05, + "loss": 1.6824, + "step": 19652 + }, + { + "epoch": 6.032228360957642, + "grad_norm": 0.23609496653079987, + "learning_rate": 3.591293859329833e-05, + "loss": 1.7224, + "step": 19653 + }, + { + "epoch": 6.032535297728668, + "grad_norm": 0.40787333250045776, + "learning_rate": 3.590816945831023e-05, + "loss": 
1.7206, + "step": 19654 + }, + { + "epoch": 6.032842234499693, + "grad_norm": 0.31101885437965393, + "learning_rate": 3.590340046258586e-05, + "loss": 1.7446, + "step": 19655 + }, + { + "epoch": 6.033149171270718, + "grad_norm": 0.19401656091213226, + "learning_rate": 3.589863160617235e-05, + "loss": 1.6778, + "step": 19656 + }, + { + "epoch": 6.033456108041744, + "grad_norm": 0.3309115469455719, + "learning_rate": 3.589386288911684e-05, + "loss": 1.7196, + "step": 19657 + }, + { + "epoch": 6.033763044812768, + "grad_norm": 0.22281408309936523, + "learning_rate": 3.588909431146643e-05, + "loss": 1.7122, + "step": 19658 + }, + { + "epoch": 6.0340699815837935, + "grad_norm": 0.2903781831264496, + "learning_rate": 3.5884325873268275e-05, + "loss": 1.7428, + "step": 19659 + }, + { + "epoch": 6.034376918354819, + "grad_norm": 0.2529856562614441, + "learning_rate": 3.587955757456947e-05, + "loss": 1.7075, + "step": 19660 + }, + { + "epoch": 6.034683855125844, + "grad_norm": 0.2445102334022522, + "learning_rate": 3.587478941541716e-05, + "loss": 1.6631, + "step": 19661 + }, + { + "epoch": 6.0349907918968695, + "grad_norm": 0.31834688782691956, + "learning_rate": 3.5870021395858454e-05, + "loss": 1.7009, + "step": 19662 + }, + { + "epoch": 6.035297728667895, + "grad_norm": 0.20666317641735077, + "learning_rate": 3.5865253515940496e-05, + "loss": 1.7252, + "step": 19663 + }, + { + "epoch": 6.035604665438919, + "grad_norm": 0.3070019483566284, + "learning_rate": 3.586048577571039e-05, + "loss": 1.7139, + "step": 19664 + }, + { + "epoch": 6.035911602209945, + "grad_norm": 0.22463096678256989, + "learning_rate": 3.585571817521522e-05, + "loss": 1.7574, + "step": 19665 + }, + { + "epoch": 6.03621853898097, + "grad_norm": 0.25405722856521606, + "learning_rate": 3.585095071450216e-05, + "loss": 1.7135, + "step": 19666 + }, + { + "epoch": 6.036525475751995, + "grad_norm": 0.24543432891368866, + "learning_rate": 3.584618339361828e-05, + "loss": 1.7312, + "step": 19667 + }, + { + 
"epoch": 6.036832412523021, + "grad_norm": 0.2454189658164978, + "learning_rate": 3.584141621261073e-05, + "loss": 1.7905, + "step": 19668 + }, + { + "epoch": 6.037139349294045, + "grad_norm": 0.2163272649049759, + "learning_rate": 3.583664917152658e-05, + "loss": 1.7042, + "step": 19669 + }, + { + "epoch": 6.03744628606507, + "grad_norm": 0.2088690549135208, + "learning_rate": 3.5831882270412994e-05, + "loss": 1.7905, + "step": 19670 + }, + { + "epoch": 6.037753222836096, + "grad_norm": 0.26145869493484497, + "learning_rate": 3.5827115509317024e-05, + "loss": 1.7487, + "step": 19671 + }, + { + "epoch": 6.038060159607121, + "grad_norm": 0.20306496322155, + "learning_rate": 3.582234888828582e-05, + "loss": 1.7103, + "step": 19672 + }, + { + "epoch": 6.038367096378146, + "grad_norm": 0.2504192292690277, + "learning_rate": 3.5817582407366454e-05, + "loss": 1.7397, + "step": 19673 + }, + { + "epoch": 6.038674033149171, + "grad_norm": 0.22803208231925964, + "learning_rate": 3.5812816066606084e-05, + "loss": 1.7105, + "step": 19674 + }, + { + "epoch": 6.038980969920196, + "grad_norm": 0.24963071942329407, + "learning_rate": 3.580804986605176e-05, + "loss": 1.734, + "step": 19675 + }, + { + "epoch": 6.0392879066912215, + "grad_norm": 0.2468494027853012, + "learning_rate": 3.580328380575062e-05, + "loss": 1.6866, + "step": 19676 + }, + { + "epoch": 6.039594843462247, + "grad_norm": 0.17628586292266846, + "learning_rate": 3.579851788574973e-05, + "loss": 1.7106, + "step": 19677 + }, + { + "epoch": 6.039901780233272, + "grad_norm": 0.23965299129486084, + "learning_rate": 3.579375210609622e-05, + "loss": 1.7675, + "step": 19678 + }, + { + "epoch": 6.0402087170042975, + "grad_norm": 0.19638453423976898, + "learning_rate": 3.5788986466837175e-05, + "loss": 1.7242, + "step": 19679 + }, + { + "epoch": 6.040515653775322, + "grad_norm": 0.2602851092815399, + "learning_rate": 3.578422096801971e-05, + "loss": 1.7287, + "step": 19680 + }, + { + "epoch": 6.040822590546347, + 
"grad_norm": 0.25868186354637146, + "learning_rate": 3.577945560969091e-05, + "loss": 1.7604, + "step": 19681 + }, + { + "epoch": 6.041129527317373, + "grad_norm": 0.1996527463197708, + "learning_rate": 3.577469039189784e-05, + "loss": 1.7469, + "step": 19682 + }, + { + "epoch": 6.041436464088398, + "grad_norm": 0.29909980297088623, + "learning_rate": 3.576992531468763e-05, + "loss": 1.682, + "step": 19683 + }, + { + "epoch": 6.041743400859423, + "grad_norm": 0.20064286887645721, + "learning_rate": 3.576516037810734e-05, + "loss": 1.7125, + "step": 19684 + }, + { + "epoch": 6.042050337630448, + "grad_norm": 0.2134515345096588, + "learning_rate": 3.576039558220411e-05, + "loss": 1.7371, + "step": 19685 + }, + { + "epoch": 6.042357274401473, + "grad_norm": 0.20365437865257263, + "learning_rate": 3.575563092702497e-05, + "loss": 1.7446, + "step": 19686 + }, + { + "epoch": 6.042664211172498, + "grad_norm": 0.24526065587997437, + "learning_rate": 3.5750866412617054e-05, + "loss": 1.759, + "step": 19687 + }, + { + "epoch": 6.042971147943524, + "grad_norm": 0.24521295726299286, + "learning_rate": 3.5746102039027414e-05, + "loss": 1.7589, + "step": 19688 + }, + { + "epoch": 6.043278084714549, + "grad_norm": 0.2151515632867813, + "learning_rate": 3.5741337806303155e-05, + "loss": 1.761, + "step": 19689 + }, + { + "epoch": 6.043585021485574, + "grad_norm": 0.25733521580696106, + "learning_rate": 3.573657371449134e-05, + "loss": 1.7171, + "step": 19690 + }, + { + "epoch": 6.043891958256599, + "grad_norm": 0.18520839512348175, + "learning_rate": 3.5731809763639084e-05, + "loss": 1.6691, + "step": 19691 + }, + { + "epoch": 6.044198895027624, + "grad_norm": 0.24617944657802582, + "learning_rate": 3.572704595379342e-05, + "loss": 1.7869, + "step": 19692 + }, + { + "epoch": 6.0445058317986495, + "grad_norm": 0.20246629416942596, + "learning_rate": 3.5722282285001493e-05, + "loss": 1.7667, + "step": 19693 + }, + { + "epoch": 6.044812768569675, + "grad_norm": 0.21190209686756134, + 
"learning_rate": 3.5717518757310305e-05, + "loss": 1.6839, + "step": 19694 + }, + { + "epoch": 6.0451197053407, + "grad_norm": 0.19021087884902954, + "learning_rate": 3.571275537076699e-05, + "loss": 1.7023, + "step": 19695 + }, + { + "epoch": 6.045426642111725, + "grad_norm": 0.1793040931224823, + "learning_rate": 3.570799212541858e-05, + "loss": 1.7022, + "step": 19696 + }, + { + "epoch": 6.04573357888275, + "grad_norm": 0.19105301797389984, + "learning_rate": 3.570322902131219e-05, + "loss": 1.7151, + "step": 19697 + }, + { + "epoch": 6.046040515653775, + "grad_norm": 0.22083842754364014, + "learning_rate": 3.569846605849487e-05, + "loss": 1.7097, + "step": 19698 + }, + { + "epoch": 6.046347452424801, + "grad_norm": 0.2607622444629669, + "learning_rate": 3.569370323701368e-05, + "loss": 1.7508, + "step": 19699 + }, + { + "epoch": 6.046654389195826, + "grad_norm": 0.22349929809570312, + "learning_rate": 3.56889405569157e-05, + "loss": 1.7131, + "step": 19700 + }, + { + "epoch": 6.04696132596685, + "grad_norm": 0.19442661106586456, + "learning_rate": 3.5684178018247996e-05, + "loss": 1.7476, + "step": 19701 + }, + { + "epoch": 6.047268262737876, + "grad_norm": 0.2002776861190796, + "learning_rate": 3.5679415621057646e-05, + "loss": 1.7982, + "step": 19702 + }, + { + "epoch": 6.047575199508901, + "grad_norm": 0.21558646857738495, + "learning_rate": 3.567465336539169e-05, + "loss": 1.7231, + "step": 19703 + }, + { + "epoch": 6.047882136279926, + "grad_norm": 0.20468449592590332, + "learning_rate": 3.5669891251297224e-05, + "loss": 1.6426, + "step": 19704 + }, + { + "epoch": 6.048189073050952, + "grad_norm": 0.23098553717136383, + "learning_rate": 3.566512927882127e-05, + "loss": 1.7763, + "step": 19705 + }, + { + "epoch": 6.048496009821977, + "grad_norm": 0.22959274053573608, + "learning_rate": 3.566036744801092e-05, + "loss": 1.7663, + "step": 19706 + }, + { + "epoch": 6.0488029465930016, + "grad_norm": 0.18519435822963715, + "learning_rate": 
3.5655605758913215e-05, + "loss": 1.6995, + "step": 19707 + }, + { + "epoch": 6.049109883364027, + "grad_norm": 0.2529381513595581, + "learning_rate": 3.565084421157524e-05, + "loss": 1.754, + "step": 19708 + }, + { + "epoch": 6.049416820135052, + "grad_norm": 0.2208617776632309, + "learning_rate": 3.5646082806044015e-05, + "loss": 1.6939, + "step": 19709 + }, + { + "epoch": 6.0497237569060776, + "grad_norm": 0.18433862924575806, + "learning_rate": 3.564132154236663e-05, + "loss": 1.7145, + "step": 19710 + }, + { + "epoch": 6.050030693677103, + "grad_norm": 0.1963127702474594, + "learning_rate": 3.563656042059011e-05, + "loss": 1.7101, + "step": 19711 + }, + { + "epoch": 6.050337630448127, + "grad_norm": 0.19860461354255676, + "learning_rate": 3.5631799440761526e-05, + "loss": 1.7218, + "step": 19712 + }, + { + "epoch": 6.050644567219153, + "grad_norm": 0.19304174184799194, + "learning_rate": 3.5627038602927905e-05, + "loss": 1.7575, + "step": 19713 + }, + { + "epoch": 6.050951503990178, + "grad_norm": 0.20402809977531433, + "learning_rate": 3.5622277907136335e-05, + "loss": 1.7438, + "step": 19714 + }, + { + "epoch": 6.051258440761203, + "grad_norm": 0.20821911096572876, + "learning_rate": 3.5617517353433844e-05, + "loss": 1.7381, + "step": 19715 + }, + { + "epoch": 6.051565377532229, + "grad_norm": 0.24375931918621063, + "learning_rate": 3.561275694186745e-05, + "loss": 1.8377, + "step": 19716 + }, + { + "epoch": 6.051872314303253, + "grad_norm": 0.19745339453220367, + "learning_rate": 3.560799667248424e-05, + "loss": 1.6839, + "step": 19717 + }, + { + "epoch": 6.0521792510742785, + "grad_norm": 0.2039431631565094, + "learning_rate": 3.560323654533124e-05, + "loss": 1.692, + "step": 19718 + }, + { + "epoch": 6.052486187845304, + "grad_norm": 0.23229047656059265, + "learning_rate": 3.559847656045551e-05, + "loss": 1.7408, + "step": 19719 + }, + { + "epoch": 6.052793124616329, + "grad_norm": 0.20387259125709534, + "learning_rate": 3.559371671790404e-05, + "loss": 
1.7215, + "step": 19720 + }, + { + "epoch": 6.0531000613873545, + "grad_norm": 0.23960062861442566, + "learning_rate": 3.5588957017723944e-05, + "loss": 1.8048, + "step": 19721 + }, + { + "epoch": 6.05340699815838, + "grad_norm": 0.1979944109916687, + "learning_rate": 3.5584197459962196e-05, + "loss": 1.7307, + "step": 19722 + }, + { + "epoch": 6.053713934929404, + "grad_norm": 0.21914203464984894, + "learning_rate": 3.557943804466586e-05, + "loss": 1.6999, + "step": 19723 + }, + { + "epoch": 6.05402087170043, + "grad_norm": 0.22338175773620605, + "learning_rate": 3.557467877188197e-05, + "loss": 1.6977, + "step": 19724 + }, + { + "epoch": 6.054327808471455, + "grad_norm": 0.2692863643169403, + "learning_rate": 3.5569919641657576e-05, + "loss": 1.7664, + "step": 19725 + }, + { + "epoch": 6.05463474524248, + "grad_norm": 0.2882823944091797, + "learning_rate": 3.5565160654039675e-05, + "loss": 1.6943, + "step": 19726 + }, + { + "epoch": 6.054941682013506, + "grad_norm": 0.2114996612071991, + "learning_rate": 3.5560401809075336e-05, + "loss": 1.7426, + "step": 19727 + }, + { + "epoch": 6.05524861878453, + "grad_norm": 0.19616106152534485, + "learning_rate": 3.5555643106811546e-05, + "loss": 1.6616, + "step": 19728 + }, + { + "epoch": 6.055555555555555, + "grad_norm": 0.241346076130867, + "learning_rate": 3.555088454729537e-05, + "loss": 1.7423, + "step": 19729 + }, + { + "epoch": 6.055862492326581, + "grad_norm": 0.24495846033096313, + "learning_rate": 3.554612613057381e-05, + "loss": 1.7699, + "step": 19730 + }, + { + "epoch": 6.056169429097606, + "grad_norm": 0.233306422829628, + "learning_rate": 3.554136785669393e-05, + "loss": 1.7201, + "step": 19731 + }, + { + "epoch": 6.056476365868631, + "grad_norm": 0.23820927739143372, + "learning_rate": 3.553660972570272e-05, + "loss": 1.7694, + "step": 19732 + }, + { + "epoch": 6.056783302639656, + "grad_norm": 0.20664167404174805, + "learning_rate": 3.553185173764719e-05, + "loss": 1.7151, + "step": 19733 + }, + { + 
"epoch": 6.057090239410681, + "grad_norm": 0.22572578489780426, + "learning_rate": 3.5527093892574394e-05, + "loss": 1.7715, + "step": 19734 + }, + { + "epoch": 6.0573971761817065, + "grad_norm": 0.18554186820983887, + "learning_rate": 3.552233619053133e-05, + "loss": 1.7481, + "step": 19735 + }, + { + "epoch": 6.057704112952732, + "grad_norm": 0.2434636950492859, + "learning_rate": 3.551757863156504e-05, + "loss": 1.7992, + "step": 19736 + }, + { + "epoch": 6.058011049723757, + "grad_norm": 0.1949392408132553, + "learning_rate": 3.5512821215722514e-05, + "loss": 1.7439, + "step": 19737 + }, + { + "epoch": 6.0583179864947825, + "grad_norm": 0.2696731686592102, + "learning_rate": 3.55080639430508e-05, + "loss": 1.7092, + "step": 19738 + }, + { + "epoch": 6.058624923265807, + "grad_norm": 0.1963263303041458, + "learning_rate": 3.550330681359686e-05, + "loss": 1.6726, + "step": 19739 + }, + { + "epoch": 6.058931860036832, + "grad_norm": 0.20115122199058533, + "learning_rate": 3.549854982740776e-05, + "loss": 1.7459, + "step": 19740 + }, + { + "epoch": 6.059238796807858, + "grad_norm": 0.21378284692764282, + "learning_rate": 3.549379298453048e-05, + "loss": 1.7028, + "step": 19741 + }, + { + "epoch": 6.059545733578883, + "grad_norm": 0.21954336762428284, + "learning_rate": 3.5489036285012055e-05, + "loss": 1.7209, + "step": 19742 + }, + { + "epoch": 6.059852670349908, + "grad_norm": 0.20117704570293427, + "learning_rate": 3.548427972889946e-05, + "loss": 1.7273, + "step": 19743 + }, + { + "epoch": 6.060159607120933, + "grad_norm": 0.23786263167858124, + "learning_rate": 3.5479523316239745e-05, + "loss": 1.7519, + "step": 19744 + }, + { + "epoch": 6.060466543891958, + "grad_norm": 0.17704391479492188, + "learning_rate": 3.5474767047079864e-05, + "loss": 1.6644, + "step": 19745 + }, + { + "epoch": 6.060773480662983, + "grad_norm": 0.1883699744939804, + "learning_rate": 3.547001092146687e-05, + "loss": 1.6586, + "step": 19746 + }, + { + "epoch": 6.061080417434009, + 
"grad_norm": 0.19101519882678986, + "learning_rate": 3.546525493944773e-05, + "loss": 1.7575, + "step": 19747 + }, + { + "epoch": 6.061387354205034, + "grad_norm": 0.1924263834953308, + "learning_rate": 3.546049910106947e-05, + "loss": 1.743, + "step": 19748 + }, + { + "epoch": 6.0616942909760585, + "grad_norm": 0.1853020042181015, + "learning_rate": 3.5455743406379084e-05, + "loss": 1.7466, + "step": 19749 + }, + { + "epoch": 6.062001227747084, + "grad_norm": 0.21322499215602875, + "learning_rate": 3.545098785542355e-05, + "loss": 1.7625, + "step": 19750 + }, + { + "epoch": 6.062308164518109, + "grad_norm": 0.1567271500825882, + "learning_rate": 3.544623244824989e-05, + "loss": 1.6531, + "step": 19751 + }, + { + "epoch": 6.0626151012891345, + "grad_norm": 0.2125476449728012, + "learning_rate": 3.544147718490508e-05, + "loss": 1.7547, + "step": 19752 + }, + { + "epoch": 6.06292203806016, + "grad_norm": 0.19470059871673584, + "learning_rate": 3.543672206543615e-05, + "loss": 1.7327, + "step": 19753 + }, + { + "epoch": 6.063228974831185, + "grad_norm": 0.1690339744091034, + "learning_rate": 3.543196708989004e-05, + "loss": 1.6621, + "step": 19754 + }, + { + "epoch": 6.06353591160221, + "grad_norm": 0.17322230339050293, + "learning_rate": 3.54272122583138e-05, + "loss": 1.7018, + "step": 19755 + }, + { + "epoch": 6.063842848373235, + "grad_norm": 0.22174575924873352, + "learning_rate": 3.5422457570754365e-05, + "loss": 1.724, + "step": 19756 + }, + { + "epoch": 6.06414978514426, + "grad_norm": 0.20233364403247833, + "learning_rate": 3.541770302725875e-05, + "loss": 1.6518, + "step": 19757 + }, + { + "epoch": 6.064456721915286, + "grad_norm": 0.1585279405117035, + "learning_rate": 3.541294862787395e-05, + "loss": 1.6985, + "step": 19758 + }, + { + "epoch": 6.064763658686311, + "grad_norm": 0.2180105745792389, + "learning_rate": 3.540819437264694e-05, + "loss": 1.6728, + "step": 19759 + }, + { + "epoch": 6.065070595457335, + "grad_norm": 0.2295975238084793, + 
"learning_rate": 3.5403440261624696e-05, + "loss": 1.7566, + "step": 19760 + }, + { + "epoch": 6.065377532228361, + "grad_norm": 0.17460396885871887, + "learning_rate": 3.5398686294854234e-05, + "loss": 1.6977, + "step": 19761 + }, + { + "epoch": 6.065684468999386, + "grad_norm": 0.20828662812709808, + "learning_rate": 3.539393247238249e-05, + "loss": 1.7789, + "step": 19762 + }, + { + "epoch": 6.065991405770411, + "grad_norm": 0.2273385375738144, + "learning_rate": 3.5389178794256476e-05, + "loss": 1.7316, + "step": 19763 + }, + { + "epoch": 6.066298342541437, + "grad_norm": 0.2332257330417633, + "learning_rate": 3.538442526052316e-05, + "loss": 1.7355, + "step": 19764 + }, + { + "epoch": 6.066605279312462, + "grad_norm": 0.17953866720199585, + "learning_rate": 3.537967187122952e-05, + "loss": 1.7107, + "step": 19765 + }, + { + "epoch": 6.0669122160834865, + "grad_norm": 0.2334052473306656, + "learning_rate": 3.537491862642254e-05, + "loss": 1.7572, + "step": 19766 + }, + { + "epoch": 6.067219152854512, + "grad_norm": 0.2427968829870224, + "learning_rate": 3.5370165526149165e-05, + "loss": 1.7254, + "step": 19767 + }, + { + "epoch": 6.067526089625537, + "grad_norm": 0.2701692283153534, + "learning_rate": 3.53654125704564e-05, + "loss": 1.7525, + "step": 19768 + }, + { + "epoch": 6.0678330263965625, + "grad_norm": 0.3775569796562195, + "learning_rate": 3.536065975939121e-05, + "loss": 1.7516, + "step": 19769 + }, + { + "epoch": 6.068139963167588, + "grad_norm": 0.18971984088420868, + "learning_rate": 3.535590709300056e-05, + "loss": 1.6777, + "step": 19770 + }, + { + "epoch": 6.068446899938612, + "grad_norm": 0.2710094749927521, + "learning_rate": 3.535115457133141e-05, + "loss": 1.7612, + "step": 19771 + }, + { + "epoch": 6.068753836709638, + "grad_norm": 0.19414621591567993, + "learning_rate": 3.534640219443075e-05, + "loss": 1.6795, + "step": 19772 + }, + { + "epoch": 6.069060773480663, + "grad_norm": 0.2384893298149109, + "learning_rate": 3.534164996234552e-05, 
+ "loss": 1.7869, + "step": 19773 + }, + { + "epoch": 6.069367710251688, + "grad_norm": 0.2206166833639145, + "learning_rate": 3.533689787512271e-05, + "loss": 1.7332, + "step": 19774 + }, + { + "epoch": 6.069674647022714, + "grad_norm": 0.19740800559520721, + "learning_rate": 3.533214593280926e-05, + "loss": 1.6744, + "step": 19775 + }, + { + "epoch": 6.069981583793738, + "grad_norm": 0.2098212093114853, + "learning_rate": 3.532739413545214e-05, + "loss": 1.731, + "step": 19776 + }, + { + "epoch": 6.070288520564763, + "grad_norm": 0.2508943974971771, + "learning_rate": 3.5322642483098304e-05, + "loss": 1.7682, + "step": 19777 + }, + { + "epoch": 6.070595457335789, + "grad_norm": 0.22202368080615997, + "learning_rate": 3.531789097579474e-05, + "loss": 1.6965, + "step": 19778 + }, + { + "epoch": 6.070902394106814, + "grad_norm": 0.19276803731918335, + "learning_rate": 3.5313139613588355e-05, + "loss": 1.6855, + "step": 19779 + }, + { + "epoch": 6.071209330877839, + "grad_norm": 0.23910140991210938, + "learning_rate": 3.530838839652616e-05, + "loss": 1.8099, + "step": 19780 + }, + { + "epoch": 6.071516267648865, + "grad_norm": 0.19440437853336334, + "learning_rate": 3.530363732465506e-05, + "loss": 1.67, + "step": 19781 + }, + { + "epoch": 6.071823204419889, + "grad_norm": 0.1954154074192047, + "learning_rate": 3.529888639802204e-05, + "loss": 1.7154, + "step": 19782 + }, + { + "epoch": 6.0721301411909145, + "grad_norm": 0.20836392045021057, + "learning_rate": 3.529413561667405e-05, + "loss": 1.7451, + "step": 19783 + }, + { + "epoch": 6.07243707796194, + "grad_norm": 0.20521731674671173, + "learning_rate": 3.5289384980658016e-05, + "loss": 1.7008, + "step": 19784 + }, + { + "epoch": 6.072744014732965, + "grad_norm": 0.22885540127754211, + "learning_rate": 3.528463449002092e-05, + "loss": 1.7605, + "step": 19785 + }, + { + "epoch": 6.0730509515039905, + "grad_norm": 0.27740219235420227, + "learning_rate": 3.5279884144809664e-05, + "loss": 1.7816, + "step": 19786 + }, 
+ { + "epoch": 6.073357888275015, + "grad_norm": 0.24747557938098907, + "learning_rate": 3.527513394507124e-05, + "loss": 1.7207, + "step": 19787 + }, + { + "epoch": 6.07366482504604, + "grad_norm": 0.20127782225608826, + "learning_rate": 3.527038389085256e-05, + "loss": 1.702, + "step": 19788 + }, + { + "epoch": 6.073971761817066, + "grad_norm": 0.20683316886425018, + "learning_rate": 3.5265633982200595e-05, + "loss": 1.7022, + "step": 19789 + }, + { + "epoch": 6.074278698588091, + "grad_norm": 0.17829765379428864, + "learning_rate": 3.5260884219162256e-05, + "loss": 1.7099, + "step": 19790 + }, + { + "epoch": 6.074585635359116, + "grad_norm": 0.256964772939682, + "learning_rate": 3.525613460178452e-05, + "loss": 1.7226, + "step": 19791 + }, + { + "epoch": 6.074892572130141, + "grad_norm": 0.22840122878551483, + "learning_rate": 3.525138513011428e-05, + "loss": 1.7738, + "step": 19792 + }, + { + "epoch": 6.075199508901166, + "grad_norm": 0.18988655507564545, + "learning_rate": 3.52466358041985e-05, + "loss": 1.6775, + "step": 19793 + }, + { + "epoch": 6.0755064456721914, + "grad_norm": 0.21857139468193054, + "learning_rate": 3.524188662408411e-05, + "loss": 1.7596, + "step": 19794 + }, + { + "epoch": 6.075813382443217, + "grad_norm": 0.22910535335540771, + "learning_rate": 3.523713758981807e-05, + "loss": 1.7969, + "step": 19795 + }, + { + "epoch": 6.076120319214242, + "grad_norm": 0.20885716378688812, + "learning_rate": 3.523238870144726e-05, + "loss": 1.7407, + "step": 19796 + }, + { + "epoch": 6.0764272559852675, + "grad_norm": 0.2056209295988083, + "learning_rate": 3.5227639959018666e-05, + "loss": 1.759, + "step": 19797 + }, + { + "epoch": 6.076734192756292, + "grad_norm": 0.17485356330871582, + "learning_rate": 3.522289136257917e-05, + "loss": 1.6988, + "step": 19798 + }, + { + "epoch": 6.077041129527317, + "grad_norm": 0.2103404402732849, + "learning_rate": 3.521814291217573e-05, + "loss": 1.766, + "step": 19799 + }, + { + "epoch": 6.077348066298343, + 
"grad_norm": 0.21852105855941772, + "learning_rate": 3.521339460785528e-05, + "loss": 1.7435, + "step": 19800 + }, + { + "epoch": 6.077655003069368, + "grad_norm": 0.21578362584114075, + "learning_rate": 3.520864644966471e-05, + "loss": 1.7281, + "step": 19801 + }, + { + "epoch": 6.077961939840393, + "grad_norm": 0.20405036211013794, + "learning_rate": 3.520389843765099e-05, + "loss": 1.7367, + "step": 19802 + }, + { + "epoch": 6.078268876611418, + "grad_norm": 0.2578286826610565, + "learning_rate": 3.5199150571860996e-05, + "loss": 1.7625, + "step": 19803 + }, + { + "epoch": 6.078575813382443, + "grad_norm": 0.240324467420578, + "learning_rate": 3.519440285234168e-05, + "loss": 1.6979, + "step": 19804 + }, + { + "epoch": 6.078882750153468, + "grad_norm": 0.220765620470047, + "learning_rate": 3.5189655279139935e-05, + "loss": 1.7679, + "step": 19805 + }, + { + "epoch": 6.079189686924494, + "grad_norm": 0.2731996774673462, + "learning_rate": 3.518490785230273e-05, + "loss": 1.6723, + "step": 19806 + }, + { + "epoch": 6.079496623695519, + "grad_norm": 0.2593478262424469, + "learning_rate": 3.518016057187692e-05, + "loss": 1.7232, + "step": 19807 + }, + { + "epoch": 6.0798035604665435, + "grad_norm": 0.34642404317855835, + "learning_rate": 3.517541343790947e-05, + "loss": 1.8265, + "step": 19808 + }, + { + "epoch": 6.080110497237569, + "grad_norm": 0.3187299370765686, + "learning_rate": 3.5170666450447255e-05, + "loss": 1.6847, + "step": 19809 + }, + { + "epoch": 6.080417434008594, + "grad_norm": 0.20413202047348022, + "learning_rate": 3.5165919609537215e-05, + "loss": 1.6533, + "step": 19810 + }, + { + "epoch": 6.0807243707796195, + "grad_norm": 0.2753545343875885, + "learning_rate": 3.516117291522625e-05, + "loss": 1.7491, + "step": 19811 + }, + { + "epoch": 6.081031307550645, + "grad_norm": 0.20174793899059296, + "learning_rate": 3.515642636756128e-05, + "loss": 1.6902, + "step": 19812 + }, + { + "epoch": 6.08133824432167, + "grad_norm": 0.22567492723464966, + 
"learning_rate": 3.515167996658919e-05, + "loss": 1.7165, + "step": 19813 + }, + { + "epoch": 6.081645181092695, + "grad_norm": 0.2115732729434967, + "learning_rate": 3.514693371235692e-05, + "loss": 1.6888, + "step": 19814 + }, + { + "epoch": 6.08195211786372, + "grad_norm": 0.2141808122396469, + "learning_rate": 3.514218760491134e-05, + "loss": 1.7152, + "step": 19815 + }, + { + "epoch": 6.082259054634745, + "grad_norm": 0.19767558574676514, + "learning_rate": 3.513744164429938e-05, + "loss": 1.6926, + "step": 19816 + }, + { + "epoch": 6.082565991405771, + "grad_norm": 0.20220023393630981, + "learning_rate": 3.5132695830567944e-05, + "loss": 1.6727, + "step": 19817 + }, + { + "epoch": 6.082872928176796, + "grad_norm": 0.19589759409427643, + "learning_rate": 3.5127950163763896e-05, + "loss": 1.7545, + "step": 19818 + }, + { + "epoch": 6.08317986494782, + "grad_norm": 0.21303611993789673, + "learning_rate": 3.512320464393418e-05, + "loss": 1.753, + "step": 19819 + }, + { + "epoch": 6.083486801718846, + "grad_norm": 0.19438377022743225, + "learning_rate": 3.511845927112566e-05, + "loss": 1.7022, + "step": 19820 + }, + { + "epoch": 6.083793738489871, + "grad_norm": 0.21282976865768433, + "learning_rate": 3.511371404538526e-05, + "loss": 1.7099, + "step": 19821 + }, + { + "epoch": 6.084100675260896, + "grad_norm": 0.1874496042728424, + "learning_rate": 3.5108968966759846e-05, + "loss": 1.7033, + "step": 19822 + }, + { + "epoch": 6.084407612031922, + "grad_norm": 0.21199075877666473, + "learning_rate": 3.510422403529636e-05, + "loss": 1.7088, + "step": 19823 + }, + { + "epoch": 6.084714548802946, + "grad_norm": 0.21847110986709595, + "learning_rate": 3.5099479251041634e-05, + "loss": 1.7395, + "step": 19824 + }, + { + "epoch": 6.0850214855739715, + "grad_norm": 0.201395645737648, + "learning_rate": 3.509473461404261e-05, + "loss": 1.7522, + "step": 19825 + }, + { + "epoch": 6.085328422344997, + "grad_norm": 0.19637656211853027, + "learning_rate": 
3.5089990124346135e-05, + "loss": 1.6774, + "step": 19826 + }, + { + "epoch": 6.085635359116022, + "grad_norm": 0.25918442010879517, + "learning_rate": 3.5085245781999124e-05, + "loss": 1.7704, + "step": 19827 + }, + { + "epoch": 6.0859422958870475, + "grad_norm": 0.21271947026252747, + "learning_rate": 3.508050158704844e-05, + "loss": 1.6902, + "step": 19828 + }, + { + "epoch": 6.086249232658073, + "grad_norm": 0.2065698802471161, + "learning_rate": 3.5075757539541024e-05, + "loss": 1.7945, + "step": 19829 + }, + { + "epoch": 6.086556169429097, + "grad_norm": 0.20247824490070343, + "learning_rate": 3.5071013639523684e-05, + "loss": 1.7532, + "step": 19830 + }, + { + "epoch": 6.086863106200123, + "grad_norm": 0.19705431163311005, + "learning_rate": 3.506626988704336e-05, + "loss": 1.6353, + "step": 19831 + }, + { + "epoch": 6.087170042971148, + "grad_norm": 0.20158523321151733, + "learning_rate": 3.5061526282146886e-05, + "loss": 1.6596, + "step": 19832 + }, + { + "epoch": 6.087476979742173, + "grad_norm": 0.19492848217487335, + "learning_rate": 3.505678282488118e-05, + "loss": 1.7107, + "step": 19833 + }, + { + "epoch": 6.087783916513199, + "grad_norm": 0.2403736114501953, + "learning_rate": 3.505203951529312e-05, + "loss": 1.7456, + "step": 19834 + }, + { + "epoch": 6.088090853284223, + "grad_norm": 0.25649771094322205, + "learning_rate": 3.504729635342954e-05, + "loss": 1.7513, + "step": 19835 + }, + { + "epoch": 6.088397790055248, + "grad_norm": 0.20172113180160522, + "learning_rate": 3.504255333933736e-05, + "loss": 1.7737, + "step": 19836 + }, + { + "epoch": 6.088704726826274, + "grad_norm": 0.2715936303138733, + "learning_rate": 3.5037810473063414e-05, + "loss": 1.759, + "step": 19837 + }, + { + "epoch": 6.089011663597299, + "grad_norm": 0.23145076632499695, + "learning_rate": 3.503306775465461e-05, + "loss": 1.7811, + "step": 19838 + }, + { + "epoch": 6.089318600368324, + "grad_norm": 0.1953691691160202, + "learning_rate": 3.502832518415778e-05, + "loss": 
1.752, + "step": 19839 + }, + { + "epoch": 6.08962553713935, + "grad_norm": 0.1927584707736969, + "learning_rate": 3.502358276161986e-05, + "loss": 1.6865, + "step": 19840 + }, + { + "epoch": 6.089932473910374, + "grad_norm": 0.19294732809066772, + "learning_rate": 3.501884048708763e-05, + "loss": 1.6838, + "step": 19841 + }, + { + "epoch": 6.0902394106813995, + "grad_norm": 0.23351021111011505, + "learning_rate": 3.501409836060803e-05, + "loss": 1.8029, + "step": 19842 + }, + { + "epoch": 6.090546347452425, + "grad_norm": 0.21615718305110931, + "learning_rate": 3.5009356382227877e-05, + "loss": 1.7441, + "step": 19843 + }, + { + "epoch": 6.09085328422345, + "grad_norm": 0.19091549515724182, + "learning_rate": 3.500461455199405e-05, + "loss": 1.7056, + "step": 19844 + }, + { + "epoch": 6.0911602209944755, + "grad_norm": 0.21189090609550476, + "learning_rate": 3.499987286995341e-05, + "loss": 1.6853, + "step": 19845 + }, + { + "epoch": 6.0914671577655, + "grad_norm": 0.22545887529850006, + "learning_rate": 3.499513133615283e-05, + "loss": 1.7854, + "step": 19846 + }, + { + "epoch": 6.091774094536525, + "grad_norm": 0.21960650384426117, + "learning_rate": 3.4990389950639144e-05, + "loss": 1.7558, + "step": 19847 + }, + { + "epoch": 6.092081031307551, + "grad_norm": 0.20825782418251038, + "learning_rate": 3.4985648713459244e-05, + "loss": 1.7103, + "step": 19848 + }, + { + "epoch": 6.092387968078576, + "grad_norm": 0.20886415243148804, + "learning_rate": 3.498090762465993e-05, + "loss": 1.6897, + "step": 19849 + }, + { + "epoch": 6.092694904849601, + "grad_norm": 0.19306892156600952, + "learning_rate": 3.4976166684288115e-05, + "loss": 1.7506, + "step": 19850 + }, + { + "epoch": 6.093001841620626, + "grad_norm": 0.2178204357624054, + "learning_rate": 3.497142589239063e-05, + "loss": 1.6774, + "step": 19851 + }, + { + "epoch": 6.093308778391651, + "grad_norm": 0.1914307177066803, + "learning_rate": 3.4966685249014294e-05, + "loss": 1.7182, + "step": 19852 + }, + { + 
"epoch": 6.093615715162676, + "grad_norm": 0.22006092965602875, + "learning_rate": 3.496194475420602e-05, + "loss": 1.7209, + "step": 19853 + }, + { + "epoch": 6.093922651933702, + "grad_norm": 0.20621439814567566, + "learning_rate": 3.49572044080126e-05, + "loss": 1.7403, + "step": 19854 + }, + { + "epoch": 6.094229588704727, + "grad_norm": 0.24079272150993347, + "learning_rate": 3.495246421048091e-05, + "loss": 1.7619, + "step": 19855 + }, + { + "epoch": 6.094536525475752, + "grad_norm": 0.19073884189128876, + "learning_rate": 3.494772416165777e-05, + "loss": 1.6677, + "step": 19856 + }, + { + "epoch": 6.094843462246777, + "grad_norm": 0.18217229843139648, + "learning_rate": 3.494298426159007e-05, + "loss": 1.7162, + "step": 19857 + }, + { + "epoch": 6.095150399017802, + "grad_norm": 0.21901506185531616, + "learning_rate": 3.493824451032461e-05, + "loss": 1.7173, + "step": 19858 + }, + { + "epoch": 6.0954573357888275, + "grad_norm": 0.22156217694282532, + "learning_rate": 3.493350490790826e-05, + "loss": 1.8029, + "step": 19859 + }, + { + "epoch": 6.095764272559853, + "grad_norm": 0.1663675606250763, + "learning_rate": 3.4928765454387824e-05, + "loss": 1.7306, + "step": 19860 + }, + { + "epoch": 6.096071209330878, + "grad_norm": 0.19684657454490662, + "learning_rate": 3.4924026149810175e-05, + "loss": 1.6944, + "step": 19861 + }, + { + "epoch": 6.096378146101903, + "grad_norm": 0.19163468480110168, + "learning_rate": 3.4919286994222125e-05, + "loss": 1.7331, + "step": 19862 + }, + { + "epoch": 6.096685082872928, + "grad_norm": 0.20134083926677704, + "learning_rate": 3.491454798767054e-05, + "loss": 1.7365, + "step": 19863 + }, + { + "epoch": 6.096992019643953, + "grad_norm": 0.23877696692943573, + "learning_rate": 3.490980913020221e-05, + "loss": 1.753, + "step": 19864 + }, + { + "epoch": 6.097298956414979, + "grad_norm": 0.207699254155159, + "learning_rate": 3.490507042186402e-05, + "loss": 1.6835, + "step": 19865 + }, + { + "epoch": 6.097605893186004, + 
"grad_norm": 0.20608612895011902, + "learning_rate": 3.490033186270274e-05, + "loss": 1.7379, + "step": 19866 + }, + { + "epoch": 6.097912829957028, + "grad_norm": 0.25086313486099243, + "learning_rate": 3.489559345276524e-05, + "loss": 1.7692, + "step": 19867 + }, + { + "epoch": 6.098219766728054, + "grad_norm": 0.22025549411773682, + "learning_rate": 3.489085519209836e-05, + "loss": 1.6579, + "step": 19868 + }, + { + "epoch": 6.098526703499079, + "grad_norm": 0.23805730044841766, + "learning_rate": 3.4886117080748875e-05, + "loss": 1.7695, + "step": 19869 + }, + { + "epoch": 6.098833640270104, + "grad_norm": 0.23271869122982025, + "learning_rate": 3.4881379118763666e-05, + "loss": 1.7268, + "step": 19870 + }, + { + "epoch": 6.09914057704113, + "grad_norm": 0.21795618534088135, + "learning_rate": 3.4876641306189505e-05, + "loss": 1.6996, + "step": 19871 + }, + { + "epoch": 6.099447513812155, + "grad_norm": 0.22064761817455292, + "learning_rate": 3.487190364307326e-05, + "loss": 1.7032, + "step": 19872 + }, + { + "epoch": 6.0997544505831796, + "grad_norm": 0.23834183812141418, + "learning_rate": 3.4867166129461706e-05, + "loss": 1.6942, + "step": 19873 + }, + { + "epoch": 6.100061387354205, + "grad_norm": 0.21143686771392822, + "learning_rate": 3.486242876540171e-05, + "loss": 1.6904, + "step": 19874 + }, + { + "epoch": 6.10036832412523, + "grad_norm": 0.18099969625473022, + "learning_rate": 3.485769155094004e-05, + "loss": 1.6669, + "step": 19875 + }, + { + "epoch": 6.100675260896256, + "grad_norm": 0.25324884057044983, + "learning_rate": 3.4852954486123566e-05, + "loss": 1.7878, + "step": 19876 + }, + { + "epoch": 6.100982197667281, + "grad_norm": 0.2252139449119568, + "learning_rate": 3.4848217570999055e-05, + "loss": 1.7674, + "step": 19877 + }, + { + "epoch": 6.101289134438305, + "grad_norm": 0.19629882276058197, + "learning_rate": 3.4843480805613346e-05, + "loss": 1.6898, + "step": 19878 + }, + { + "epoch": 6.101596071209331, + "grad_norm": 
0.1858786642551422, + "learning_rate": 3.483874419001323e-05, + "loss": 1.6856, + "step": 19879 + }, + { + "epoch": 6.101903007980356, + "grad_norm": 0.1842946857213974, + "learning_rate": 3.483400772424555e-05, + "loss": 1.7229, + "step": 19880 + }, + { + "epoch": 6.102209944751381, + "grad_norm": 0.18981511890888214, + "learning_rate": 3.482927140835708e-05, + "loss": 1.75, + "step": 19881 + }, + { + "epoch": 6.102516881522407, + "grad_norm": 0.19914525747299194, + "learning_rate": 3.482453524239466e-05, + "loss": 1.7702, + "step": 19882 + }, + { + "epoch": 6.102823818293431, + "grad_norm": 0.1960345208644867, + "learning_rate": 3.481979922640507e-05, + "loss": 1.7189, + "step": 19883 + }, + { + "epoch": 6.1031307550644565, + "grad_norm": 0.20309221744537354, + "learning_rate": 3.48150633604351e-05, + "loss": 1.7888, + "step": 19884 + }, + { + "epoch": 6.103437691835482, + "grad_norm": 0.20090891420841217, + "learning_rate": 3.48103276445316e-05, + "loss": 1.8017, + "step": 19885 + }, + { + "epoch": 6.103744628606507, + "grad_norm": 0.22500385344028473, + "learning_rate": 3.480559207874133e-05, + "loss": 1.7061, + "step": 19886 + }, + { + "epoch": 6.1040515653775325, + "grad_norm": 0.22594885528087616, + "learning_rate": 3.480085666311113e-05, + "loss": 1.7659, + "step": 19887 + }, + { + "epoch": 6.104358502148558, + "grad_norm": 0.2769651710987091, + "learning_rate": 3.479612139768774e-05, + "loss": 1.7668, + "step": 19888 + }, + { + "epoch": 6.104665438919582, + "grad_norm": 0.24251700937747955, + "learning_rate": 3.4791386282518e-05, + "loss": 1.8068, + "step": 19889 + }, + { + "epoch": 6.104972375690608, + "grad_norm": 0.23325790464878082, + "learning_rate": 3.478665131764869e-05, + "loss": 1.7116, + "step": 19890 + }, + { + "epoch": 6.105279312461633, + "grad_norm": 0.19998812675476074, + "learning_rate": 3.478191650312663e-05, + "loss": 1.7116, + "step": 19891 + }, + { + "epoch": 6.105586249232658, + "grad_norm": 0.20933640003204346, + "learning_rate": 
3.4777181838998566e-05, + "loss": 1.7138, + "step": 19892 + }, + { + "epoch": 6.105893186003684, + "grad_norm": 0.24344035983085632, + "learning_rate": 3.477244732531134e-05, + "loss": 1.784, + "step": 19893 + }, + { + "epoch": 6.106200122774708, + "grad_norm": 0.2220575362443924, + "learning_rate": 3.4767712962111686e-05, + "loss": 1.7479, + "step": 19894 + }, + { + "epoch": 6.106507059545733, + "grad_norm": 0.2222832590341568, + "learning_rate": 3.476297874944644e-05, + "loss": 1.7278, + "step": 19895 + }, + { + "epoch": 6.106813996316759, + "grad_norm": 0.222265362739563, + "learning_rate": 3.4758244687362353e-05, + "loss": 1.7321, + "step": 19896 + }, + { + "epoch": 6.107120933087784, + "grad_norm": 0.2921304702758789, + "learning_rate": 3.475351077590625e-05, + "loss": 1.7848, + "step": 19897 + }, + { + "epoch": 6.107427869858809, + "grad_norm": 0.21015208959579468, + "learning_rate": 3.4748777015124856e-05, + "loss": 1.7987, + "step": 19898 + }, + { + "epoch": 6.107734806629834, + "grad_norm": 0.19510969519615173, + "learning_rate": 3.474404340506502e-05, + "loss": 1.7317, + "step": 19899 + }, + { + "epoch": 6.108041743400859, + "grad_norm": 0.21978609263896942, + "learning_rate": 3.473930994577348e-05, + "loss": 1.6943, + "step": 19900 + }, + { + "epoch": 6.1083486801718845, + "grad_norm": 0.1793510913848877, + "learning_rate": 3.4734576637297004e-05, + "loss": 1.6659, + "step": 19901 + }, + { + "epoch": 6.10865561694291, + "grad_norm": 0.2029319554567337, + "learning_rate": 3.4729843479682414e-05, + "loss": 1.7127, + "step": 19902 + }, + { + "epoch": 6.108962553713935, + "grad_norm": 0.2001914530992508, + "learning_rate": 3.472511047297644e-05, + "loss": 1.691, + "step": 19903 + }, + { + "epoch": 6.1092694904849605, + "grad_norm": 0.2194693237543106, + "learning_rate": 3.47203776172259e-05, + "loss": 1.7181, + "step": 19904 + }, + { + "epoch": 6.109576427255985, + "grad_norm": 0.1865277737379074, + "learning_rate": 3.4715644912477515e-05, + "loss": 1.6786, 
+ "step": 19905 + }, + { + "epoch": 6.10988336402701, + "grad_norm": 0.20574906468391418, + "learning_rate": 3.471091235877811e-05, + "loss": 1.7681, + "step": 19906 + }, + { + "epoch": 6.110190300798036, + "grad_norm": 0.21072493493556976, + "learning_rate": 3.470617995617441e-05, + "loss": 1.7494, + "step": 19907 + }, + { + "epoch": 6.110497237569061, + "grad_norm": 0.2411658763885498, + "learning_rate": 3.470144770471323e-05, + "loss": 1.7183, + "step": 19908 + }, + { + "epoch": 6.110804174340086, + "grad_norm": 0.19782759249210358, + "learning_rate": 3.4696715604441285e-05, + "loss": 1.6823, + "step": 19909 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.315026193857193, + "learning_rate": 3.469198365540539e-05, + "loss": 1.691, + "step": 19910 + }, + { + "epoch": 6.111418047882136, + "grad_norm": 0.19840773940086365, + "learning_rate": 3.468725185765226e-05, + "loss": 1.7413, + "step": 19911 + }, + { + "epoch": 6.111724984653161, + "grad_norm": 0.1813160926103592, + "learning_rate": 3.46825202112287e-05, + "loss": 1.7095, + "step": 19912 + }, + { + "epoch": 6.112031921424187, + "grad_norm": 0.21025459468364716, + "learning_rate": 3.467778871618145e-05, + "loss": 1.7783, + "step": 19913 + }, + { + "epoch": 6.112338858195212, + "grad_norm": 0.20088298618793488, + "learning_rate": 3.4673057372557265e-05, + "loss": 1.7671, + "step": 19914 + }, + { + "epoch": 6.112645794966237, + "grad_norm": 0.21919472515583038, + "learning_rate": 3.466832618040291e-05, + "loss": 1.7052, + "step": 19915 + }, + { + "epoch": 6.112952731737262, + "grad_norm": 0.19135436415672302, + "learning_rate": 3.466359513976516e-05, + "loss": 1.7862, + "step": 19916 + }, + { + "epoch": 6.113259668508287, + "grad_norm": 0.19943594932556152, + "learning_rate": 3.465886425069074e-05, + "loss": 1.6926, + "step": 19917 + }, + { + "epoch": 6.1135666052793125, + "grad_norm": 0.19390980899333954, + "learning_rate": 3.46541335132264e-05, + "loss": 1.761, + "step": 19918 + }, + { + "epoch": 
6.113873542050338, + "grad_norm": 0.22745995223522186, + "learning_rate": 3.4649402927418935e-05, + "loss": 1.7147, + "step": 19919 + }, + { + "epoch": 6.114180478821363, + "grad_norm": 0.17792920768260956, + "learning_rate": 3.4644672493315045e-05, + "loss": 1.6946, + "step": 19920 + }, + { + "epoch": 6.114487415592388, + "grad_norm": 0.2009986788034439, + "learning_rate": 3.463994221096152e-05, + "loss": 1.6977, + "step": 19921 + }, + { + "epoch": 6.114794352363413, + "grad_norm": 0.2448386251926422, + "learning_rate": 3.4635212080405066e-05, + "loss": 1.7169, + "step": 19922 + }, + { + "epoch": 6.115101289134438, + "grad_norm": 0.21506112813949585, + "learning_rate": 3.463048210169247e-05, + "loss": 1.6632, + "step": 19923 + }, + { + "epoch": 6.115408225905464, + "grad_norm": 0.1805233359336853, + "learning_rate": 3.462575227487045e-05, + "loss": 1.6742, + "step": 19924 + }, + { + "epoch": 6.115715162676489, + "grad_norm": 0.20023848116397858, + "learning_rate": 3.4621022599985766e-05, + "loss": 1.7106, + "step": 19925 + }, + { + "epoch": 6.116022099447513, + "grad_norm": 0.20388077199459076, + "learning_rate": 3.461629307708513e-05, + "loss": 1.7065, + "step": 19926 + }, + { + "epoch": 6.116329036218539, + "grad_norm": 0.23886005580425262, + "learning_rate": 3.461156370621533e-05, + "loss": 1.7177, + "step": 19927 + }, + { + "epoch": 6.116635972989564, + "grad_norm": 0.2054048627614975, + "learning_rate": 3.460683448742306e-05, + "loss": 1.6773, + "step": 19928 + }, + { + "epoch": 6.116942909760589, + "grad_norm": 0.1909634917974472, + "learning_rate": 3.460210542075508e-05, + "loss": 1.7562, + "step": 19929 + }, + { + "epoch": 6.117249846531615, + "grad_norm": 0.20221595466136932, + "learning_rate": 3.459737650625812e-05, + "loss": 1.7948, + "step": 19930 + }, + { + "epoch": 6.11755678330264, + "grad_norm": 0.25445356965065, + "learning_rate": 3.459264774397891e-05, + "loss": 1.7964, + "step": 19931 + }, + { + "epoch": 6.1178637200736645, + "grad_norm": 
0.2227735072374344, + "learning_rate": 3.4587919133964176e-05, + "loss": 1.7833, + "step": 19932 + }, + { + "epoch": 6.11817065684469, + "grad_norm": 0.20591853559017181, + "learning_rate": 3.458319067626068e-05, + "loss": 1.7535, + "step": 19933 + }, + { + "epoch": 6.118477593615715, + "grad_norm": 0.22087402641773224, + "learning_rate": 3.4578462370915115e-05, + "loss": 1.7228, + "step": 19934 + }, + { + "epoch": 6.1187845303867405, + "grad_norm": 0.234156996011734, + "learning_rate": 3.457373421797423e-05, + "loss": 1.7167, + "step": 19935 + }, + { + "epoch": 6.119091467157766, + "grad_norm": 0.209685817360878, + "learning_rate": 3.4569006217484746e-05, + "loss": 1.6633, + "step": 19936 + }, + { + "epoch": 6.11939840392879, + "grad_norm": 0.18499237298965454, + "learning_rate": 3.4564278369493366e-05, + "loss": 1.6769, + "step": 19937 + }, + { + "epoch": 6.119705340699816, + "grad_norm": 0.2600767910480499, + "learning_rate": 3.455955067404686e-05, + "loss": 1.7788, + "step": 19938 + }, + { + "epoch": 6.120012277470841, + "grad_norm": 0.21499377489089966, + "learning_rate": 3.455482313119191e-05, + "loss": 1.789, + "step": 19939 + }, + { + "epoch": 6.120319214241866, + "grad_norm": 0.19618432223796844, + "learning_rate": 3.455009574097527e-05, + "loss": 1.7162, + "step": 19940 + }, + { + "epoch": 6.120626151012892, + "grad_norm": 0.23219916224479675, + "learning_rate": 3.4545368503443616e-05, + "loss": 1.7871, + "step": 19941 + }, + { + "epoch": 6.120933087783916, + "grad_norm": 0.22315794229507446, + "learning_rate": 3.45406414186437e-05, + "loss": 1.6944, + "step": 19942 + }, + { + "epoch": 6.121240024554941, + "grad_norm": 0.22536693513393402, + "learning_rate": 3.453591448662221e-05, + "loss": 1.7727, + "step": 19943 + }, + { + "epoch": 6.121546961325967, + "grad_norm": 0.21811100840568542, + "learning_rate": 3.45311877074259e-05, + "loss": 1.7037, + "step": 19944 + }, + { + "epoch": 6.121853898096992, + "grad_norm": 0.1957094967365265, + "learning_rate": 
3.452646108110145e-05, + "loss": 1.7734, + "step": 19945 + }, + { + "epoch": 6.122160834868017, + "grad_norm": 0.185706228017807, + "learning_rate": 3.452173460769559e-05, + "loss": 1.6715, + "step": 19946 + }, + { + "epoch": 6.122467771639043, + "grad_norm": 0.21081562340259552, + "learning_rate": 3.4517008287255005e-05, + "loss": 1.7798, + "step": 19947 + }, + { + "epoch": 6.122774708410067, + "grad_norm": 0.24175535142421722, + "learning_rate": 3.451228211982642e-05, + "loss": 1.7111, + "step": 19948 + }, + { + "epoch": 6.1230816451810925, + "grad_norm": 0.244124636054039, + "learning_rate": 3.450755610545654e-05, + "loss": 1.7263, + "step": 19949 + }, + { + "epoch": 6.123388581952118, + "grad_norm": 0.21109984815120697, + "learning_rate": 3.45028302441921e-05, + "loss": 1.7556, + "step": 19950 + }, + { + "epoch": 6.123695518723143, + "grad_norm": 0.21721722185611725, + "learning_rate": 3.449810453607976e-05, + "loss": 1.7416, + "step": 19951 + }, + { + "epoch": 6.1240024554941686, + "grad_norm": 0.18695317208766937, + "learning_rate": 3.4493378981166216e-05, + "loss": 1.7128, + "step": 19952 + }, + { + "epoch": 6.124309392265193, + "grad_norm": 0.19175554811954498, + "learning_rate": 3.4488653579498206e-05, + "loss": 1.7014, + "step": 19953 + }, + { + "epoch": 6.124616329036218, + "grad_norm": 0.22297006845474243, + "learning_rate": 3.4483928331122405e-05, + "loss": 1.7231, + "step": 19954 + }, + { + "epoch": 6.124923265807244, + "grad_norm": 0.2407974898815155, + "learning_rate": 3.447920323608553e-05, + "loss": 1.7354, + "step": 19955 + }, + { + "epoch": 6.125230202578269, + "grad_norm": 0.19767232239246368, + "learning_rate": 3.447447829443425e-05, + "loss": 1.7487, + "step": 19956 + }, + { + "epoch": 6.125537139349294, + "grad_norm": 0.20033477246761322, + "learning_rate": 3.446975350621529e-05, + "loss": 1.7232, + "step": 19957 + }, + { + "epoch": 6.12584407612032, + "grad_norm": 0.20310243964195251, + "learning_rate": 3.446502887147532e-05, + "loss": 
1.6946, + "step": 19958 + }, + { + "epoch": 6.126151012891344, + "grad_norm": 0.2322724461555481, + "learning_rate": 3.446030439026104e-05, + "loss": 1.7071, + "step": 19959 + }, + { + "epoch": 6.1264579496623695, + "grad_norm": 0.24134255945682526, + "learning_rate": 3.445558006261914e-05, + "loss": 1.7259, + "step": 19960 + }, + { + "epoch": 6.126764886433395, + "grad_norm": 0.22821731865406036, + "learning_rate": 3.445085588859632e-05, + "loss": 1.7488, + "step": 19961 + }, + { + "epoch": 6.12707182320442, + "grad_norm": 0.258241206407547, + "learning_rate": 3.444613186823924e-05, + "loss": 1.7403, + "step": 19962 + }, + { + "epoch": 6.1273787599754455, + "grad_norm": 0.18758481740951538, + "learning_rate": 3.4441408001594625e-05, + "loss": 1.7079, + "step": 19963 + }, + { + "epoch": 6.12768569674647, + "grad_norm": 0.24032682180404663, + "learning_rate": 3.443668428870911e-05, + "loss": 1.7377, + "step": 19964 + }, + { + "epoch": 6.127992633517495, + "grad_norm": 0.24468545615673065, + "learning_rate": 3.4431960729629406e-05, + "loss": 1.7724, + "step": 19965 + }, + { + "epoch": 6.128299570288521, + "grad_norm": 0.23840154707431793, + "learning_rate": 3.4427237324402197e-05, + "loss": 1.7813, + "step": 19966 + }, + { + "epoch": 6.128606507059546, + "grad_norm": 0.2476109117269516, + "learning_rate": 3.4422514073074165e-05, + "loss": 1.7578, + "step": 19967 + }, + { + "epoch": 6.128913443830571, + "grad_norm": 0.2109041064977646, + "learning_rate": 3.4417790975691974e-05, + "loss": 1.6917, + "step": 19968 + }, + { + "epoch": 6.129220380601596, + "grad_norm": 0.21841584146022797, + "learning_rate": 3.4413068032302296e-05, + "loss": 1.7511, + "step": 19969 + }, + { + "epoch": 6.129527317372621, + "grad_norm": 0.2111930102109909, + "learning_rate": 3.440834524295182e-05, + "loss": 1.7194, + "step": 19970 + }, + { + "epoch": 6.129834254143646, + "grad_norm": 0.21868006885051727, + "learning_rate": 3.440362260768721e-05, + "loss": 1.7933, + "step": 19971 + }, + { + 
"epoch": 6.130141190914672, + "grad_norm": 0.19846780598163605, + "learning_rate": 3.439890012655516e-05, + "loss": 1.6985, + "step": 19972 + }, + { + "epoch": 6.130448127685697, + "grad_norm": 0.218460813164711, + "learning_rate": 3.439417779960231e-05, + "loss": 1.7205, + "step": 19973 + }, + { + "epoch": 6.1307550644567215, + "grad_norm": 0.22504402697086334, + "learning_rate": 3.438945562687535e-05, + "loss": 1.7437, + "step": 19974 + }, + { + "epoch": 6.131062001227747, + "grad_norm": 0.35414671897888184, + "learning_rate": 3.438473360842093e-05, + "loss": 1.7641, + "step": 19975 + }, + { + "epoch": 6.131368937998772, + "grad_norm": 0.21090710163116455, + "learning_rate": 3.4380011744285726e-05, + "loss": 1.6817, + "step": 19976 + }, + { + "epoch": 6.1316758747697975, + "grad_norm": 0.19118748605251312, + "learning_rate": 3.437529003451639e-05, + "loss": 1.694, + "step": 19977 + }, + { + "epoch": 6.131982811540823, + "grad_norm": 0.2341139018535614, + "learning_rate": 3.437056847915962e-05, + "loss": 1.781, + "step": 19978 + }, + { + "epoch": 6.132289748311848, + "grad_norm": 0.19120962917804718, + "learning_rate": 3.4365847078262033e-05, + "loss": 1.6974, + "step": 19979 + }, + { + "epoch": 6.132596685082873, + "grad_norm": 0.1998066008090973, + "learning_rate": 3.436112583187033e-05, + "loss": 1.6933, + "step": 19980 + }, + { + "epoch": 6.132903621853898, + "grad_norm": 0.19839663803577423, + "learning_rate": 3.4356404740031123e-05, + "loss": 1.6867, + "step": 19981 + }, + { + "epoch": 6.133210558624923, + "grad_norm": 0.19892877340316772, + "learning_rate": 3.4351683802791114e-05, + "loss": 1.7349, + "step": 19982 + }, + { + "epoch": 6.133517495395949, + "grad_norm": 0.23215502500534058, + "learning_rate": 3.434696302019692e-05, + "loss": 1.7411, + "step": 19983 + }, + { + "epoch": 6.133824432166974, + "grad_norm": 0.21246971189975739, + "learning_rate": 3.4342242392295225e-05, + "loss": 1.6918, + "step": 19984 + }, + { + "epoch": 6.134131368937998, + 
"grad_norm": 0.18585935235023499, + "learning_rate": 3.4337521919132675e-05, + "loss": 1.71, + "step": 19985 + }, + { + "epoch": 6.134438305709024, + "grad_norm": 0.24194715917110443, + "learning_rate": 3.4332801600755896e-05, + "loss": 1.7314, + "step": 19986 + }, + { + "epoch": 6.134745242480049, + "grad_norm": 0.19925665855407715, + "learning_rate": 3.432808143721156e-05, + "loss": 1.7425, + "step": 19987 + }, + { + "epoch": 6.135052179251074, + "grad_norm": 0.22253449261188507, + "learning_rate": 3.43233614285463e-05, + "loss": 1.702, + "step": 19988 + }, + { + "epoch": 6.1353591160221, + "grad_norm": 0.22180478274822235, + "learning_rate": 3.4318641574806796e-05, + "loss": 1.6659, + "step": 19989 + }, + { + "epoch": 6.135666052793125, + "grad_norm": 0.19818264245986938, + "learning_rate": 3.431392187603964e-05, + "loss": 1.8057, + "step": 19990 + }, + { + "epoch": 6.1359729895641495, + "grad_norm": 0.34630170464515686, + "learning_rate": 3.4309202332291526e-05, + "loss": 1.7233, + "step": 19991 + }, + { + "epoch": 6.136279926335175, + "grad_norm": 0.2633006274700165, + "learning_rate": 3.430448294360905e-05, + "loss": 1.7421, + "step": 19992 + }, + { + "epoch": 6.1365868631062, + "grad_norm": 0.1976388394832611, + "learning_rate": 3.429976371003888e-05, + "loss": 1.7474, + "step": 19993 + }, + { + "epoch": 6.1368937998772255, + "grad_norm": 0.2386583834886551, + "learning_rate": 3.429504463162764e-05, + "loss": 1.7026, + "step": 19994 + }, + { + "epoch": 6.137200736648251, + "grad_norm": 0.20853812992572784, + "learning_rate": 3.4290325708422e-05, + "loss": 1.7846, + "step": 19995 + }, + { + "epoch": 6.137507673419275, + "grad_norm": 0.24667194485664368, + "learning_rate": 3.428560694046854e-05, + "loss": 1.6446, + "step": 19996 + }, + { + "epoch": 6.137814610190301, + "grad_norm": 0.24396342039108276, + "learning_rate": 3.428088832781394e-05, + "loss": 1.7368, + "step": 19997 + }, + { + "epoch": 6.138121546961326, + "grad_norm": 0.1958172619342804, + 
"learning_rate": 3.4276169870504804e-05, + "loss": 1.7197, + "step": 19998 + }, + { + "epoch": 6.138428483732351, + "grad_norm": 0.21487464010715485, + "learning_rate": 3.427145156858778e-05, + "loss": 1.7318, + "step": 19999 + }, + { + "epoch": 6.138735420503377, + "grad_norm": 0.2152775675058365, + "learning_rate": 3.4266733422109476e-05, + "loss": 1.7924, + "step": 20000 + }, + { + "epoch": 6.139042357274401, + "grad_norm": 0.17151346802711487, + "learning_rate": 3.426201543111656e-05, + "loss": 1.6915, + "step": 20001 + }, + { + "epoch": 6.139349294045426, + "grad_norm": 0.22197338938713074, + "learning_rate": 3.425729759565563e-05, + "loss": 1.8028, + "step": 20002 + }, + { + "epoch": 6.139656230816452, + "grad_norm": 0.23111973702907562, + "learning_rate": 3.42525799157733e-05, + "loss": 1.7515, + "step": 20003 + }, + { + "epoch": 6.139963167587477, + "grad_norm": 0.2829805314540863, + "learning_rate": 3.42478623915162e-05, + "loss": 1.8379, + "step": 20004 + }, + { + "epoch": 6.140270104358502, + "grad_norm": 0.23467600345611572, + "learning_rate": 3.424314502293096e-05, + "loss": 1.7755, + "step": 20005 + }, + { + "epoch": 6.140577041129528, + "grad_norm": 0.2047930657863617, + "learning_rate": 3.42384278100642e-05, + "loss": 1.7198, + "step": 20006 + }, + { + "epoch": 6.140883977900552, + "grad_norm": 0.1893673986196518, + "learning_rate": 3.423371075296253e-05, + "loss": 1.7318, + "step": 20007 + }, + { + "epoch": 6.1411909146715775, + "grad_norm": 0.21514710783958435, + "learning_rate": 3.422899385167259e-05, + "loss": 1.7499, + "step": 20008 + }, + { + "epoch": 6.141497851442603, + "grad_norm": 0.20030297338962555, + "learning_rate": 3.422427710624095e-05, + "loss": 1.7109, + "step": 20009 + }, + { + "epoch": 6.141804788213628, + "grad_norm": 0.23581266403198242, + "learning_rate": 3.421956051671426e-05, + "loss": 1.7834, + "step": 20010 + }, + { + "epoch": 6.1421117249846535, + "grad_norm": 0.22492484748363495, + "learning_rate": 3.421484408313911e-05, 
+ "loss": 1.785, + "step": 20011 + }, + { + "epoch": 6.142418661755678, + "grad_norm": 0.34137019515037537, + "learning_rate": 3.421012780556215e-05, + "loss": 1.8101, + "step": 20012 + }, + { + "epoch": 6.142725598526703, + "grad_norm": 0.28489169478416443, + "learning_rate": 3.420541168402994e-05, + "loss": 1.7945, + "step": 20013 + }, + { + "epoch": 6.143032535297729, + "grad_norm": 0.259362131357193, + "learning_rate": 3.420069571858913e-05, + "loss": 1.7011, + "step": 20014 + }, + { + "epoch": 6.143339472068754, + "grad_norm": 0.3628309667110443, + "learning_rate": 3.419597990928628e-05, + "loss": 1.8273, + "step": 20015 + }, + { + "epoch": 6.143646408839779, + "grad_norm": 0.22306841611862183, + "learning_rate": 3.419126425616803e-05, + "loss": 1.7447, + "step": 20016 + }, + { + "epoch": 6.143953345610804, + "grad_norm": 0.36336812376976013, + "learning_rate": 3.4186548759280964e-05, + "loss": 1.7076, + "step": 20017 + }, + { + "epoch": 6.144260282381829, + "grad_norm": 0.23167413473129272, + "learning_rate": 3.418183341867172e-05, + "loss": 1.6924, + "step": 20018 + }, + { + "epoch": 6.144567219152854, + "grad_norm": 0.2541113495826721, + "learning_rate": 3.417711823438686e-05, + "loss": 1.755, + "step": 20019 + }, + { + "epoch": 6.14487415592388, + "grad_norm": 0.3733784854412079, + "learning_rate": 3.4172403206472975e-05, + "loss": 1.7087, + "step": 20020 + }, + { + "epoch": 6.145181092694905, + "grad_norm": 0.1940508335828781, + "learning_rate": 3.416768833497669e-05, + "loss": 1.717, + "step": 20021 + }, + { + "epoch": 6.14548802946593, + "grad_norm": 0.2707524001598358, + "learning_rate": 3.416297361994457e-05, + "loss": 1.7422, + "step": 20022 + }, + { + "epoch": 6.145794966236955, + "grad_norm": 0.25535452365875244, + "learning_rate": 3.415825906142326e-05, + "loss": 1.6915, + "step": 20023 + }, + { + "epoch": 6.14610190300798, + "grad_norm": 0.24094220995903015, + "learning_rate": 3.415354465945929e-05, + "loss": 1.7192, + "step": 20024 + }, + { + 
"epoch": 6.1464088397790055, + "grad_norm": 0.28329676389694214, + "learning_rate": 3.4148830414099306e-05, + "loss": 1.7272, + "step": 20025 + }, + { + "epoch": 6.146715776550031, + "grad_norm": 0.217180535197258, + "learning_rate": 3.414411632538984e-05, + "loss": 1.7195, + "step": 20026 + }, + { + "epoch": 6.147022713321056, + "grad_norm": 0.22693867981433868, + "learning_rate": 3.413940239337753e-05, + "loss": 1.6889, + "step": 20027 + }, + { + "epoch": 6.147329650092081, + "grad_norm": 0.30376315116882324, + "learning_rate": 3.413468861810892e-05, + "loss": 1.7741, + "step": 20028 + }, + { + "epoch": 6.147636586863106, + "grad_norm": 0.1928185671567917, + "learning_rate": 3.412997499963065e-05, + "loss": 1.6986, + "step": 20029 + }, + { + "epoch": 6.147943523634131, + "grad_norm": 0.260929137468338, + "learning_rate": 3.412526153798924e-05, + "loss": 1.7044, + "step": 20030 + }, + { + "epoch": 6.148250460405157, + "grad_norm": 0.23274847865104675, + "learning_rate": 3.4120548233231326e-05, + "loss": 1.7626, + "step": 20031 + }, + { + "epoch": 6.148557397176182, + "grad_norm": 0.2389308512210846, + "learning_rate": 3.411583508540344e-05, + "loss": 1.71, + "step": 20032 + }, + { + "epoch": 6.148864333947207, + "grad_norm": 0.2745562195777893, + "learning_rate": 3.411112209455219e-05, + "loss": 1.7144, + "step": 20033 + }, + { + "epoch": 6.149171270718232, + "grad_norm": 0.2369096428155899, + "learning_rate": 3.4106409260724135e-05, + "loss": 1.7879, + "step": 20034 + }, + { + "epoch": 6.149478207489257, + "grad_norm": 0.3103141486644745, + "learning_rate": 3.4101696583965874e-05, + "loss": 1.7862, + "step": 20035 + }, + { + "epoch": 6.149785144260282, + "grad_norm": 0.18625277280807495, + "learning_rate": 3.409698406432397e-05, + "loss": 1.7717, + "step": 20036 + }, + { + "epoch": 6.150092081031308, + "grad_norm": 0.2539508640766144, + "learning_rate": 3.409227170184497e-05, + "loss": 1.7023, + "step": 20037 + }, + { + "epoch": 6.150399017802333, + "grad_norm": 
0.2185351699590683, + "learning_rate": 3.4087559496575474e-05, + "loss": 1.7283, + "step": 20038 + }, + { + "epoch": 6.150705954573358, + "grad_norm": 0.21225227415561676, + "learning_rate": 3.408284744856204e-05, + "loss": 1.7055, + "step": 20039 + }, + { + "epoch": 6.151012891344383, + "grad_norm": 0.23623189330101013, + "learning_rate": 3.407813555785125e-05, + "loss": 1.6862, + "step": 20040 + }, + { + "epoch": 6.151319828115408, + "grad_norm": 0.19061312079429626, + "learning_rate": 3.4073423824489634e-05, + "loss": 1.7501, + "step": 20041 + }, + { + "epoch": 6.151626764886434, + "grad_norm": 0.22176402807235718, + "learning_rate": 3.4068712248523804e-05, + "loss": 1.7417, + "step": 20042 + }, + { + "epoch": 6.151933701657459, + "grad_norm": 0.20093770325183868, + "learning_rate": 3.406400083000028e-05, + "loss": 1.7283, + "step": 20043 + }, + { + "epoch": 6.152240638428483, + "grad_norm": 0.21968910098075867, + "learning_rate": 3.4059289568965635e-05, + "loss": 1.7187, + "step": 20044 + }, + { + "epoch": 6.152547575199509, + "grad_norm": 0.19038841128349304, + "learning_rate": 3.4054578465466435e-05, + "loss": 1.7131, + "step": 20045 + }, + { + "epoch": 6.152854511970534, + "grad_norm": 0.2239457368850708, + "learning_rate": 3.404986751954925e-05, + "loss": 1.7643, + "step": 20046 + }, + { + "epoch": 6.153161448741559, + "grad_norm": 0.2357017546892166, + "learning_rate": 3.404515673126061e-05, + "loss": 1.7196, + "step": 20047 + }, + { + "epoch": 6.153468385512585, + "grad_norm": 0.2633310556411743, + "learning_rate": 3.4040446100647104e-05, + "loss": 1.7613, + "step": 20048 + }, + { + "epoch": 6.153775322283609, + "grad_norm": 0.28470975160598755, + "learning_rate": 3.403573562775524e-05, + "loss": 1.7564, + "step": 20049 + }, + { + "epoch": 6.1540822590546345, + "grad_norm": 0.37435805797576904, + "learning_rate": 3.40310253126316e-05, + "loss": 1.8365, + "step": 20050 + }, + { + "epoch": 6.15438919582566, + "grad_norm": 0.1706259697675705, + 
"learning_rate": 3.402631515532272e-05, + "loss": 1.7373, + "step": 20051 + }, + { + "epoch": 6.154696132596685, + "grad_norm": 0.30885928869247437, + "learning_rate": 3.402160515587518e-05, + "loss": 1.7152, + "step": 20052 + }, + { + "epoch": 6.1550030693677105, + "grad_norm": 0.21448500454425812, + "learning_rate": 3.40168953143355e-05, + "loss": 1.7463, + "step": 20053 + }, + { + "epoch": 6.155310006138736, + "grad_norm": 0.23774586617946625, + "learning_rate": 3.4012185630750204e-05, + "loss": 1.7268, + "step": 20054 + }, + { + "epoch": 6.15561694290976, + "grad_norm": 0.1943385899066925, + "learning_rate": 3.400747610516588e-05, + "loss": 1.6578, + "step": 20055 + }, + { + "epoch": 6.155923879680786, + "grad_norm": 0.27488210797309875, + "learning_rate": 3.400276673762903e-05, + "loss": 1.8204, + "step": 20056 + }, + { + "epoch": 6.156230816451811, + "grad_norm": 0.1871461570262909, + "learning_rate": 3.3998057528186244e-05, + "loss": 1.6775, + "step": 20057 + }, + { + "epoch": 6.156537753222836, + "grad_norm": 0.23566775023937225, + "learning_rate": 3.399334847688401e-05, + "loss": 1.7089, + "step": 20058 + }, + { + "epoch": 6.156844689993862, + "grad_norm": 0.26842471957206726, + "learning_rate": 3.398863958376891e-05, + "loss": 1.7554, + "step": 20059 + }, + { + "epoch": 6.157151626764886, + "grad_norm": 0.19267809391021729, + "learning_rate": 3.3983930848887435e-05, + "loss": 1.6709, + "step": 20060 + }, + { + "epoch": 6.157458563535911, + "grad_norm": 0.21130084991455078, + "learning_rate": 3.3979222272286156e-05, + "loss": 1.7312, + "step": 20061 + }, + { + "epoch": 6.157765500306937, + "grad_norm": 0.2322172224521637, + "learning_rate": 3.397451385401158e-05, + "loss": 1.8069, + "step": 20062 + }, + { + "epoch": 6.158072437077962, + "grad_norm": 0.21852418780326843, + "learning_rate": 3.396980559411027e-05, + "loss": 1.715, + "step": 20063 + }, + { + "epoch": 6.158379373848987, + "grad_norm": 0.21385829150676727, + "learning_rate": 
3.3965097492628714e-05, + "loss": 1.6804, + "step": 20064 + }, + { + "epoch": 6.158686310620013, + "grad_norm": 0.21639080345630646, + "learning_rate": 3.3960389549613494e-05, + "loss": 1.655, + "step": 20065 + }, + { + "epoch": 6.158993247391037, + "grad_norm": 0.19219942390918732, + "learning_rate": 3.395568176511107e-05, + "loss": 1.7325, + "step": 20066 + }, + { + "epoch": 6.1593001841620625, + "grad_norm": 0.21853557229042053, + "learning_rate": 3.3950974139168024e-05, + "loss": 1.7204, + "step": 20067 + }, + { + "epoch": 6.159607120933088, + "grad_norm": 0.24144381284713745, + "learning_rate": 3.3946266671830854e-05, + "loss": 1.754, + "step": 20068 + }, + { + "epoch": 6.159914057704113, + "grad_norm": 0.2014230340719223, + "learning_rate": 3.394155936314609e-05, + "loss": 1.6905, + "step": 20069 + }, + { + "epoch": 6.1602209944751385, + "grad_norm": 0.26940762996673584, + "learning_rate": 3.393685221316025e-05, + "loss": 1.729, + "step": 20070 + }, + { + "epoch": 6.160527931246163, + "grad_norm": 0.1937808394432068, + "learning_rate": 3.3932145221919843e-05, + "loss": 1.7492, + "step": 20071 + }, + { + "epoch": 6.160834868017188, + "grad_norm": 0.2586243450641632, + "learning_rate": 3.39274383894714e-05, + "loss": 1.7706, + "step": 20072 + }, + { + "epoch": 6.161141804788214, + "grad_norm": 0.21995361149311066, + "learning_rate": 3.3922731715861416e-05, + "loss": 1.7716, + "step": 20073 + }, + { + "epoch": 6.161448741559239, + "grad_norm": 0.22915497422218323, + "learning_rate": 3.391802520113645e-05, + "loss": 1.716, + "step": 20074 + }, + { + "epoch": 6.161755678330264, + "grad_norm": 0.24317315220832825, + "learning_rate": 3.3913318845342956e-05, + "loss": 1.7392, + "step": 20075 + }, + { + "epoch": 6.162062615101289, + "grad_norm": 0.20439307391643524, + "learning_rate": 3.390861264852749e-05, + "loss": 1.7076, + "step": 20076 + }, + { + "epoch": 6.162369551872314, + "grad_norm": 0.2197176069021225, + "learning_rate": 3.3903906610736534e-05, + "loss": 
1.7334, + "step": 20077 + }, + { + "epoch": 6.162676488643339, + "grad_norm": 0.21651993691921234, + "learning_rate": 3.389920073201662e-05, + "loss": 1.7651, + "step": 20078 + }, + { + "epoch": 6.162983425414365, + "grad_norm": 0.1999540627002716, + "learning_rate": 3.389449501241424e-05, + "loss": 1.7031, + "step": 20079 + }, + { + "epoch": 6.16329036218539, + "grad_norm": 0.21965044736862183, + "learning_rate": 3.38897894519759e-05, + "loss": 1.7243, + "step": 20080 + }, + { + "epoch": 6.163597298956415, + "grad_norm": 0.20127563178539276, + "learning_rate": 3.388508405074808e-05, + "loss": 1.693, + "step": 20081 + }, + { + "epoch": 6.16390423572744, + "grad_norm": 0.2143397182226181, + "learning_rate": 3.3880378808777336e-05, + "loss": 1.7304, + "step": 20082 + }, + { + "epoch": 6.164211172498465, + "grad_norm": 0.23116083443164825, + "learning_rate": 3.387567372611012e-05, + "loss": 1.7558, + "step": 20083 + }, + { + "epoch": 6.1645181092694905, + "grad_norm": 0.25513985753059387, + "learning_rate": 3.3870968802792946e-05, + "loss": 1.7169, + "step": 20084 + }, + { + "epoch": 6.164825046040516, + "grad_norm": 0.20549121499061584, + "learning_rate": 3.386626403887232e-05, + "loss": 1.7147, + "step": 20085 + }, + { + "epoch": 6.165131982811541, + "grad_norm": 0.2850625514984131, + "learning_rate": 3.386155943439473e-05, + "loss": 1.7865, + "step": 20086 + }, + { + "epoch": 6.165438919582566, + "grad_norm": 0.2689895033836365, + "learning_rate": 3.3856854989406675e-05, + "loss": 1.7576, + "step": 20087 + }, + { + "epoch": 6.165745856353591, + "grad_norm": 0.21677634119987488, + "learning_rate": 3.385215070395462e-05, + "loss": 1.7186, + "step": 20088 + }, + { + "epoch": 6.166052793124616, + "grad_norm": 0.19525155425071716, + "learning_rate": 3.384744657808509e-05, + "loss": 1.6713, + "step": 20089 + }, + { + "epoch": 6.166359729895642, + "grad_norm": 0.23097296059131622, + "learning_rate": 3.3842742611844555e-05, + "loss": 1.6975, + "step": 20090 + }, + { + 
"epoch": 6.166666666666667, + "grad_norm": 0.22210827469825745, + "learning_rate": 3.3838038805279516e-05, + "loss": 1.733, + "step": 20091 + }, + { + "epoch": 6.166973603437691, + "grad_norm": 0.3336607813835144, + "learning_rate": 3.383333515843643e-05, + "loss": 1.7441, + "step": 20092 + }, + { + "epoch": 6.167280540208717, + "grad_norm": 0.25274014472961426, + "learning_rate": 3.382863167136183e-05, + "loss": 1.7235, + "step": 20093 + }, + { + "epoch": 6.167587476979742, + "grad_norm": 0.3228790760040283, + "learning_rate": 3.3823928344102144e-05, + "loss": 1.8096, + "step": 20094 + }, + { + "epoch": 6.167894413750767, + "grad_norm": 0.34542208909988403, + "learning_rate": 3.381922517670389e-05, + "loss": 1.7431, + "step": 20095 + }, + { + "epoch": 6.168201350521793, + "grad_norm": 0.1921117901802063, + "learning_rate": 3.381452216921355e-05, + "loss": 1.787, + "step": 20096 + }, + { + "epoch": 6.168508287292818, + "grad_norm": 0.29019802808761597, + "learning_rate": 3.380981932167757e-05, + "loss": 1.7122, + "step": 20097 + }, + { + "epoch": 6.1688152240638425, + "grad_norm": 0.17999929189682007, + "learning_rate": 3.380511663414244e-05, + "loss": 1.7153, + "step": 20098 + }, + { + "epoch": 6.169122160834868, + "grad_norm": 0.2641841471195221, + "learning_rate": 3.380041410665466e-05, + "loss": 1.7317, + "step": 20099 + }, + { + "epoch": 6.169429097605893, + "grad_norm": 0.25492918491363525, + "learning_rate": 3.379571173926067e-05, + "loss": 1.6975, + "step": 20100 + }, + { + "epoch": 6.1697360343769185, + "grad_norm": 0.2554764151573181, + "learning_rate": 3.379100953200697e-05, + "loss": 1.7539, + "step": 20101 + }, + { + "epoch": 6.170042971147944, + "grad_norm": 0.2339072823524475, + "learning_rate": 3.378630748493999e-05, + "loss": 1.6871, + "step": 20102 + }, + { + "epoch": 6.170349907918968, + "grad_norm": 0.19663162529468536, + "learning_rate": 3.3781605598106236e-05, + "loss": 1.7419, + "step": 20103 + }, + { + "epoch": 6.170656844689994, + 
"grad_norm": 0.2479846328496933, + "learning_rate": 3.3776903871552166e-05, + "loss": 1.7849, + "step": 20104 + }, + { + "epoch": 6.170963781461019, + "grad_norm": 0.18630735576152802, + "learning_rate": 3.377220230532423e-05, + "loss": 1.7412, + "step": 20105 + }, + { + "epoch": 6.171270718232044, + "grad_norm": 0.2211095094680786, + "learning_rate": 3.376750089946892e-05, + "loss": 1.7445, + "step": 20106 + }, + { + "epoch": 6.17157765500307, + "grad_norm": 0.20783299207687378, + "learning_rate": 3.3762799654032653e-05, + "loss": 1.7346, + "step": 20107 + }, + { + "epoch": 6.171884591774095, + "grad_norm": 0.18022862076759338, + "learning_rate": 3.3758098569061934e-05, + "loss": 1.7083, + "step": 20108 + }, + { + "epoch": 6.172191528545119, + "grad_norm": 0.23707088828086853, + "learning_rate": 3.375339764460319e-05, + "loss": 1.8542, + "step": 20109 + }, + { + "epoch": 6.172498465316145, + "grad_norm": 0.2289234846830368, + "learning_rate": 3.3748696880702913e-05, + "loss": 1.7564, + "step": 20110 + }, + { + "epoch": 6.17280540208717, + "grad_norm": 0.28396767377853394, + "learning_rate": 3.374399627740752e-05, + "loss": 1.7349, + "step": 20111 + }, + { + "epoch": 6.173112338858195, + "grad_norm": 0.20154817402362823, + "learning_rate": 3.373929583476351e-05, + "loss": 1.7356, + "step": 20112 + }, + { + "epoch": 6.173419275629221, + "grad_norm": 0.22590605914592743, + "learning_rate": 3.373459555281728e-05, + "loss": 1.7291, + "step": 20113 + }, + { + "epoch": 6.173726212400245, + "grad_norm": 0.2145034223794937, + "learning_rate": 3.372989543161532e-05, + "loss": 1.7544, + "step": 20114 + }, + { + "epoch": 6.1740331491712706, + "grad_norm": 0.26797109842300415, + "learning_rate": 3.372519547120407e-05, + "loss": 1.743, + "step": 20115 + }, + { + "epoch": 6.174340085942296, + "grad_norm": 0.2795363664627075, + "learning_rate": 3.372049567162999e-05, + "loss": 1.7278, + "step": 20116 + }, + { + "epoch": 6.174647022713321, + "grad_norm": 0.21436716616153717, + 
"learning_rate": 3.3715796032939494e-05, + "loss": 1.7306, + "step": 20117 + }, + { + "epoch": 6.1749539594843466, + "grad_norm": 0.2593919336795807, + "learning_rate": 3.3711096555179064e-05, + "loss": 1.7323, + "step": 20118 + }, + { + "epoch": 6.175260896255371, + "grad_norm": 0.19639115035533905, + "learning_rate": 3.3706397238395124e-05, + "loss": 1.7444, + "step": 20119 + }, + { + "epoch": 6.175567833026396, + "grad_norm": 0.23408278822898865, + "learning_rate": 3.370169808263409e-05, + "loss": 1.7461, + "step": 20120 + }, + { + "epoch": 6.175874769797422, + "grad_norm": 0.21200022101402283, + "learning_rate": 3.369699908794246e-05, + "loss": 1.7588, + "step": 20121 + }, + { + "epoch": 6.176181706568447, + "grad_norm": 0.17609953880310059, + "learning_rate": 3.369230025436662e-05, + "loss": 1.6608, + "step": 20122 + }, + { + "epoch": 6.176488643339472, + "grad_norm": 0.19895964860916138, + "learning_rate": 3.3687601581953046e-05, + "loss": 1.729, + "step": 20123 + }, + { + "epoch": 6.176795580110497, + "grad_norm": 0.22833310067653656, + "learning_rate": 3.368290307074814e-05, + "loss": 1.7148, + "step": 20124 + }, + { + "epoch": 6.177102516881522, + "grad_norm": 0.1847219169139862, + "learning_rate": 3.367820472079835e-05, + "loss": 1.6894, + "step": 20125 + }, + { + "epoch": 6.1774094536525475, + "grad_norm": 0.20269884169101715, + "learning_rate": 3.36735065321501e-05, + "loss": 1.794, + "step": 20126 + }, + { + "epoch": 6.177716390423573, + "grad_norm": 0.19277122616767883, + "learning_rate": 3.3668808504849845e-05, + "loss": 1.6936, + "step": 20127 + }, + { + "epoch": 6.178023327194598, + "grad_norm": 0.23804394900798798, + "learning_rate": 3.3664110638943985e-05, + "loss": 1.746, + "step": 20128 + }, + { + "epoch": 6.1783302639656235, + "grad_norm": 0.20946018397808075, + "learning_rate": 3.365941293447897e-05, + "loss": 1.6952, + "step": 20129 + }, + { + "epoch": 6.178637200736648, + "grad_norm": 0.21680596470832825, + "learning_rate": 
3.36547153915012e-05, + "loss": 1.7709, + "step": 20130 + }, + { + "epoch": 6.178944137507673, + "grad_norm": 0.22549709677696228, + "learning_rate": 3.365001801005712e-05, + "loss": 1.6814, + "step": 20131 + }, + { + "epoch": 6.179251074278699, + "grad_norm": 0.20660072565078735, + "learning_rate": 3.3645320790193136e-05, + "loss": 1.6992, + "step": 20132 + }, + { + "epoch": 6.179558011049724, + "grad_norm": 0.23697195947170258, + "learning_rate": 3.36406237319557e-05, + "loss": 1.7325, + "step": 20133 + }, + { + "epoch": 6.179864947820749, + "grad_norm": 0.20847748219966888, + "learning_rate": 3.363592683539118e-05, + "loss": 1.7066, + "step": 20134 + }, + { + "epoch": 6.180171884591774, + "grad_norm": 0.24317312240600586, + "learning_rate": 3.363123010054605e-05, + "loss": 1.7259, + "step": 20135 + }, + { + "epoch": 6.180478821362799, + "grad_norm": 0.22137925028800964, + "learning_rate": 3.3626533527466686e-05, + "loss": 1.7492, + "step": 20136 + }, + { + "epoch": 6.180785758133824, + "grad_norm": 0.23857460916042328, + "learning_rate": 3.362183711619951e-05, + "loss": 1.6671, + "step": 20137 + }, + { + "epoch": 6.18109269490485, + "grad_norm": 0.20017468929290771, + "learning_rate": 3.361714086679095e-05, + "loss": 1.7151, + "step": 20138 + }, + { + "epoch": 6.181399631675875, + "grad_norm": 0.21566617488861084, + "learning_rate": 3.361244477928739e-05, + "loss": 1.7659, + "step": 20139 + }, + { + "epoch": 6.1817065684469, + "grad_norm": 0.21695555746555328, + "learning_rate": 3.360774885373528e-05, + "loss": 1.7463, + "step": 20140 + }, + { + "epoch": 6.182013505217925, + "grad_norm": 0.19326116144657135, + "learning_rate": 3.360305309018098e-05, + "loss": 1.7182, + "step": 20141 + }, + { + "epoch": 6.18232044198895, + "grad_norm": 0.2135429084300995, + "learning_rate": 3.359835748867093e-05, + "loss": 1.8001, + "step": 20142 + }, + { + "epoch": 6.1826273787599755, + "grad_norm": 0.20097343623638153, + "learning_rate": 3.359366204925151e-05, + "loss": 1.7442, 
+ "step": 20143 + }, + { + "epoch": 6.182934315531001, + "grad_norm": 0.212847501039505, + "learning_rate": 3.358896677196916e-05, + "loss": 1.7418, + "step": 20144 + }, + { + "epoch": 6.183241252302026, + "grad_norm": 0.18414677679538727, + "learning_rate": 3.358427165687024e-05, + "loss": 1.6813, + "step": 20145 + }, + { + "epoch": 6.183548189073051, + "grad_norm": 0.23170427978038788, + "learning_rate": 3.357957670400119e-05, + "loss": 1.7722, + "step": 20146 + }, + { + "epoch": 6.183855125844076, + "grad_norm": 0.28952550888061523, + "learning_rate": 3.357488191340837e-05, + "loss": 1.7785, + "step": 20147 + }, + { + "epoch": 6.184162062615101, + "grad_norm": 0.2126605361700058, + "learning_rate": 3.35701872851382e-05, + "loss": 1.7064, + "step": 20148 + }, + { + "epoch": 6.184468999386127, + "grad_norm": 0.2376919537782669, + "learning_rate": 3.356549281923706e-05, + "loss": 1.7322, + "step": 20149 + }, + { + "epoch": 6.184775936157152, + "grad_norm": 0.24168729782104492, + "learning_rate": 3.3560798515751375e-05, + "loss": 1.7296, + "step": 20150 + }, + { + "epoch": 6.185082872928176, + "grad_norm": 0.19746467471122742, + "learning_rate": 3.355610437472749e-05, + "loss": 1.7816, + "step": 20151 + }, + { + "epoch": 6.185389809699202, + "grad_norm": 0.2399774193763733, + "learning_rate": 3.3551410396211844e-05, + "loss": 1.7309, + "step": 20152 + }, + { + "epoch": 6.185696746470227, + "grad_norm": 0.20560777187347412, + "learning_rate": 3.3546716580250785e-05, + "loss": 1.7134, + "step": 20153 + }, + { + "epoch": 6.186003683241252, + "grad_norm": 0.22640523314476013, + "learning_rate": 3.354202292689072e-05, + "loss": 1.7572, + "step": 20154 + }, + { + "epoch": 6.186310620012278, + "grad_norm": 0.20796974003314972, + "learning_rate": 3.353732943617803e-05, + "loss": 1.6897, + "step": 20155 + }, + { + "epoch": 6.186617556783303, + "grad_norm": 0.19902797043323517, + "learning_rate": 3.35326361081591e-05, + "loss": 1.6836, + "step": 20156 + }, + { + "epoch": 
6.1869244935543275, + "grad_norm": 0.30999818444252014, + "learning_rate": 3.352794294288032e-05, + "loss": 1.7704, + "step": 20157 + }, + { + "epoch": 6.187231430325353, + "grad_norm": 0.20634675025939941, + "learning_rate": 3.3523249940388045e-05, + "loss": 1.7599, + "step": 20158 + }, + { + "epoch": 6.187538367096378, + "grad_norm": 0.25650453567504883, + "learning_rate": 3.3518557100728674e-05, + "loss": 1.7441, + "step": 20159 + }, + { + "epoch": 6.1878453038674035, + "grad_norm": 0.2400079369544983, + "learning_rate": 3.351386442394858e-05, + "loss": 1.6836, + "step": 20160 + }, + { + "epoch": 6.188152240638429, + "grad_norm": 0.23734217882156372, + "learning_rate": 3.350917191009416e-05, + "loss": 1.7, + "step": 20161 + }, + { + "epoch": 6.188459177409453, + "grad_norm": 0.29579323530197144, + "learning_rate": 3.3504479559211755e-05, + "loss": 1.71, + "step": 20162 + }, + { + "epoch": 6.188766114180479, + "grad_norm": 0.18999184668064117, + "learning_rate": 3.349978737134776e-05, + "loss": 1.7396, + "step": 20163 + }, + { + "epoch": 6.189073050951504, + "grad_norm": 0.26760223507881165, + "learning_rate": 3.3495095346548525e-05, + "loss": 1.7846, + "step": 20164 + }, + { + "epoch": 6.189379987722529, + "grad_norm": 0.18416397273540497, + "learning_rate": 3.349040348486044e-05, + "loss": 1.6911, + "step": 20165 + }, + { + "epoch": 6.189686924493555, + "grad_norm": 0.23761679232120514, + "learning_rate": 3.348571178632986e-05, + "loss": 1.6776, + "step": 20166 + }, + { + "epoch": 6.189993861264579, + "grad_norm": 0.2056473195552826, + "learning_rate": 3.348102025100316e-05, + "loss": 1.697, + "step": 20167 + }, + { + "epoch": 6.190300798035604, + "grad_norm": 0.23916250467300415, + "learning_rate": 3.3476328878926685e-05, + "loss": 1.7943, + "step": 20168 + }, + { + "epoch": 6.19060773480663, + "grad_norm": 0.2205415964126587, + "learning_rate": 3.347163767014684e-05, + "loss": 1.8037, + "step": 20169 + }, + { + "epoch": 6.190914671577655, + "grad_norm": 
0.28907346725463867, + "learning_rate": 3.346694662470995e-05, + "loss": 1.6875, + "step": 20170 + }, + { + "epoch": 6.19122160834868, + "grad_norm": 0.2382480502128601, + "learning_rate": 3.3462255742662364e-05, + "loss": 1.7116, + "step": 20171 + }, + { + "epoch": 6.191528545119706, + "grad_norm": 0.25309205055236816, + "learning_rate": 3.3457565024050485e-05, + "loss": 1.7584, + "step": 20172 + }, + { + "epoch": 6.19183548189073, + "grad_norm": 0.3959091901779175, + "learning_rate": 3.3452874468920626e-05, + "loss": 1.7054, + "step": 20173 + }, + { + "epoch": 6.1921424186617555, + "grad_norm": 0.22697016596794128, + "learning_rate": 3.344818407731918e-05, + "loss": 1.7373, + "step": 20174 + }, + { + "epoch": 6.192449355432781, + "grad_norm": 0.298178493976593, + "learning_rate": 3.3443493849292465e-05, + "loss": 1.7192, + "step": 20175 + }, + { + "epoch": 6.192756292203806, + "grad_norm": 0.2742854058742523, + "learning_rate": 3.343880378488685e-05, + "loss": 1.7538, + "step": 20176 + }, + { + "epoch": 6.1930632289748315, + "grad_norm": 0.23367546498775482, + "learning_rate": 3.343411388414867e-05, + "loss": 1.694, + "step": 20177 + }, + { + "epoch": 6.193370165745856, + "grad_norm": 0.2932305932044983, + "learning_rate": 3.342942414712431e-05, + "loss": 1.7291, + "step": 20178 + }, + { + "epoch": 6.193677102516881, + "grad_norm": 0.24306413531303406, + "learning_rate": 3.342473457386007e-05, + "loss": 1.6959, + "step": 20179 + }, + { + "epoch": 6.193984039287907, + "grad_norm": 0.30828577280044556, + "learning_rate": 3.3420045164402344e-05, + "loss": 1.6848, + "step": 20180 + }, + { + "epoch": 6.194290976058932, + "grad_norm": 0.18766994774341583, + "learning_rate": 3.341535591879743e-05, + "loss": 1.7261, + "step": 20181 + }, + { + "epoch": 6.194597912829957, + "grad_norm": 0.300778329372406, + "learning_rate": 3.3410666837091696e-05, + "loss": 1.7539, + "step": 20182 + }, + { + "epoch": 6.194904849600983, + "grad_norm": 0.20148977637290955, + "learning_rate": 
3.340597791933147e-05, + "loss": 1.7496, + "step": 20183 + }, + { + "epoch": 6.195211786372007, + "grad_norm": 0.2746329605579376, + "learning_rate": 3.340128916556311e-05, + "loss": 1.6458, + "step": 20184 + }, + { + "epoch": 6.195518723143032, + "grad_norm": 0.2715265452861786, + "learning_rate": 3.339660057583292e-05, + "loss": 1.7799, + "step": 20185 + }, + { + "epoch": 6.195825659914058, + "grad_norm": 0.2145555317401886, + "learning_rate": 3.339191215018728e-05, + "loss": 1.6854, + "step": 20186 + }, + { + "epoch": 6.196132596685083, + "grad_norm": 0.3018960654735565, + "learning_rate": 3.338722388867248e-05, + "loss": 1.7569, + "step": 20187 + }, + { + "epoch": 6.196439533456108, + "grad_norm": 0.24876931309700012, + "learning_rate": 3.338253579133487e-05, + "loss": 1.7434, + "step": 20188 + }, + { + "epoch": 6.196746470227133, + "grad_norm": 0.3609273433685303, + "learning_rate": 3.337784785822079e-05, + "loss": 1.737, + "step": 20189 + }, + { + "epoch": 6.197053406998158, + "grad_norm": 0.21586830914020538, + "learning_rate": 3.337316008937655e-05, + "loss": 1.7553, + "step": 20190 + }, + { + "epoch": 6.1973603437691835, + "grad_norm": 0.23542988300323486, + "learning_rate": 3.3368472484848504e-05, + "loss": 1.7174, + "step": 20191 + }, + { + "epoch": 6.197667280540209, + "grad_norm": 0.19861294329166412, + "learning_rate": 3.336378504468294e-05, + "loss": 1.7268, + "step": 20192 + }, + { + "epoch": 6.197974217311234, + "grad_norm": 0.26865682005882263, + "learning_rate": 3.335909776892622e-05, + "loss": 1.7656, + "step": 20193 + }, + { + "epoch": 6.198281154082259, + "grad_norm": 0.343078076839447, + "learning_rate": 3.3354410657624624e-05, + "loss": 1.734, + "step": 20194 + }, + { + "epoch": 6.198588090853284, + "grad_norm": 0.21613667905330658, + "learning_rate": 3.334972371082453e-05, + "loss": 1.7777, + "step": 20195 + }, + { + "epoch": 6.198895027624309, + "grad_norm": 0.22268854081630707, + "learning_rate": 3.3345036928572207e-05, + "loss": 1.667, + 
"step": 20196 + }, + { + "epoch": 6.199201964395335, + "grad_norm": 0.22870087623596191, + "learning_rate": 3.3340350310914e-05, + "loss": 1.7532, + "step": 20197 + }, + { + "epoch": 6.19950890116636, + "grad_norm": 0.1969831883907318, + "learning_rate": 3.3335663857896205e-05, + "loss": 1.7821, + "step": 20198 + }, + { + "epoch": 6.199815837937384, + "grad_norm": 0.20414133369922638, + "learning_rate": 3.3330977569565154e-05, + "loss": 1.7449, + "step": 20199 + }, + { + "epoch": 6.20012277470841, + "grad_norm": 0.21947748959064484, + "learning_rate": 3.332629144596714e-05, + "loss": 1.6888, + "step": 20200 + }, + { + "epoch": 6.200429711479435, + "grad_norm": 0.20943035185337067, + "learning_rate": 3.332160548714851e-05, + "loss": 1.7278, + "step": 20201 + }, + { + "epoch": 6.2007366482504604, + "grad_norm": 0.22410117089748383, + "learning_rate": 3.331691969315553e-05, + "loss": 1.721, + "step": 20202 + }, + { + "epoch": 6.201043585021486, + "grad_norm": 0.21422281861305237, + "learning_rate": 3.3312234064034555e-05, + "loss": 1.7199, + "step": 20203 + }, + { + "epoch": 6.201350521792511, + "grad_norm": 0.21021418273448944, + "learning_rate": 3.330754859983184e-05, + "loss": 1.7972, + "step": 20204 + }, + { + "epoch": 6.201657458563536, + "grad_norm": 0.21155185997486115, + "learning_rate": 3.330286330059371e-05, + "loss": 1.7463, + "step": 20205 + }, + { + "epoch": 6.201964395334561, + "grad_norm": 0.20241162180900574, + "learning_rate": 3.329817816636649e-05, + "loss": 1.7804, + "step": 20206 + }, + { + "epoch": 6.202271332105586, + "grad_norm": 0.19882376492023468, + "learning_rate": 3.329349319719644e-05, + "loss": 1.7564, + "step": 20207 + }, + { + "epoch": 6.202578268876612, + "grad_norm": 0.20528686046600342, + "learning_rate": 3.328880839312991e-05, + "loss": 1.751, + "step": 20208 + }, + { + "epoch": 6.202885205647637, + "grad_norm": 0.2708488404750824, + "learning_rate": 3.328412375421315e-05, + "loss": 1.8008, + "step": 20209 + }, + { + "epoch": 
6.203192142418661, + "grad_norm": 0.1986229121685028, + "learning_rate": 3.3279439280492486e-05, + "loss": 1.6833, + "step": 20210 + }, + { + "epoch": 6.203499079189687, + "grad_norm": 0.2700355350971222, + "learning_rate": 3.3274754972014186e-05, + "loss": 1.8071, + "step": 20211 + }, + { + "epoch": 6.203806015960712, + "grad_norm": 0.23060421645641327, + "learning_rate": 3.327007082882458e-05, + "loss": 1.6856, + "step": 20212 + }, + { + "epoch": 6.204112952731737, + "grad_norm": 0.20798510313034058, + "learning_rate": 3.3265386850969926e-05, + "loss": 1.7421, + "step": 20213 + }, + { + "epoch": 6.204419889502763, + "grad_norm": 0.21828265488147736, + "learning_rate": 3.3260703038496556e-05, + "loss": 1.7212, + "step": 20214 + }, + { + "epoch": 6.204726826273788, + "grad_norm": 0.1965378224849701, + "learning_rate": 3.325601939145069e-05, + "loss": 1.6987, + "step": 20215 + }, + { + "epoch": 6.2050337630448125, + "grad_norm": 0.23897121846675873, + "learning_rate": 3.325133590987868e-05, + "loss": 1.7501, + "step": 20216 + }, + { + "epoch": 6.205340699815838, + "grad_norm": 0.18647781014442444, + "learning_rate": 3.324665259382676e-05, + "loss": 1.688, + "step": 20217 + }, + { + "epoch": 6.205647636586863, + "grad_norm": 0.19906121492385864, + "learning_rate": 3.324196944334127e-05, + "loss": 1.749, + "step": 20218 + }, + { + "epoch": 6.2059545733578885, + "grad_norm": 0.2061154991388321, + "learning_rate": 3.3237286458468444e-05, + "loss": 1.757, + "step": 20219 + }, + { + "epoch": 6.206261510128914, + "grad_norm": 0.19410182535648346, + "learning_rate": 3.323260363925459e-05, + "loss": 1.6826, + "step": 20220 + }, + { + "epoch": 6.206568446899938, + "grad_norm": 0.2017979919910431, + "learning_rate": 3.322792098574597e-05, + "loss": 1.7568, + "step": 20221 + }, + { + "epoch": 6.206875383670964, + "grad_norm": 0.19491736590862274, + "learning_rate": 3.322323849798885e-05, + "loss": 1.7082, + "step": 20222 + }, + { + "epoch": 6.207182320441989, + "grad_norm": 
0.19826333224773407, + "learning_rate": 3.321855617602954e-05, + "loss": 1.7654, + "step": 20223 + }, + { + "epoch": 6.207489257213014, + "grad_norm": 0.18185383081436157, + "learning_rate": 3.321387401991428e-05, + "loss": 1.6826, + "step": 20224 + }, + { + "epoch": 6.20779619398404, + "grad_norm": 0.22402678430080414, + "learning_rate": 3.320919202968937e-05, + "loss": 1.795, + "step": 20225 + }, + { + "epoch": 6.208103130755064, + "grad_norm": 0.201541468501091, + "learning_rate": 3.320451020540105e-05, + "loss": 1.6838, + "step": 20226 + }, + { + "epoch": 6.208410067526089, + "grad_norm": 0.25479504466056824, + "learning_rate": 3.3199828547095616e-05, + "loss": 1.7881, + "step": 20227 + }, + { + "epoch": 6.208717004297115, + "grad_norm": 0.2057993859052658, + "learning_rate": 3.31951470548193e-05, + "loss": 1.737, + "step": 20228 + }, + { + "epoch": 6.20902394106814, + "grad_norm": 0.183469757437706, + "learning_rate": 3.319046572861842e-05, + "loss": 1.6989, + "step": 20229 + }, + { + "epoch": 6.209330877839165, + "grad_norm": 0.21723738312721252, + "learning_rate": 3.318578456853919e-05, + "loss": 1.7537, + "step": 20230 + }, + { + "epoch": 6.209637814610191, + "grad_norm": 0.21919457614421844, + "learning_rate": 3.318110357462791e-05, + "loss": 1.7444, + "step": 20231 + }, + { + "epoch": 6.209944751381215, + "grad_norm": 0.17009909451007843, + "learning_rate": 3.317642274693081e-05, + "loss": 1.6885, + "step": 20232 + }, + { + "epoch": 6.2102516881522405, + "grad_norm": 0.19625195860862732, + "learning_rate": 3.317174208549416e-05, + "loss": 1.7255, + "step": 20233 + }, + { + "epoch": 6.210558624923266, + "grad_norm": 0.2131364941596985, + "learning_rate": 3.316706159036422e-05, + "loss": 1.7047, + "step": 20234 + }, + { + "epoch": 6.210865561694291, + "grad_norm": 0.18454425036907196, + "learning_rate": 3.316238126158725e-05, + "loss": 1.7536, + "step": 20235 + }, + { + "epoch": 6.2111724984653165, + "grad_norm": 0.2124820202589035, + "learning_rate": 
3.3157701099209485e-05, + "loss": 1.7456, + "step": 20236 + }, + { + "epoch": 6.211479435236341, + "grad_norm": 0.1929594725370407, + "learning_rate": 3.3153021103277206e-05, + "loss": 1.7118, + "step": 20237 + }, + { + "epoch": 6.211786372007366, + "grad_norm": 0.19876480102539062, + "learning_rate": 3.314834127383664e-05, + "loss": 1.6855, + "step": 20238 + }, + { + "epoch": 6.212093308778392, + "grad_norm": 0.18902665376663208, + "learning_rate": 3.314366161093403e-05, + "loss": 1.7052, + "step": 20239 + }, + { + "epoch": 6.212400245549417, + "grad_norm": 0.1859758198261261, + "learning_rate": 3.313898211461566e-05, + "loss": 1.7277, + "step": 20240 + }, + { + "epoch": 6.212707182320442, + "grad_norm": 0.2160472422838211, + "learning_rate": 3.313430278492773e-05, + "loss": 1.6787, + "step": 20241 + }, + { + "epoch": 6.213014119091467, + "grad_norm": 0.24482262134552002, + "learning_rate": 3.312962362191652e-05, + "loss": 1.7439, + "step": 20242 + }, + { + "epoch": 6.213321055862492, + "grad_norm": 0.2343531847000122, + "learning_rate": 3.312494462562824e-05, + "loss": 1.7981, + "step": 20243 + }, + { + "epoch": 6.213627992633517, + "grad_norm": 0.2385960817337036, + "learning_rate": 3.3120265796109163e-05, + "loss": 1.7144, + "step": 20244 + }, + { + "epoch": 6.213934929404543, + "grad_norm": 0.21878042817115784, + "learning_rate": 3.3115587133405503e-05, + "loss": 1.7057, + "step": 20245 + }, + { + "epoch": 6.214241866175568, + "grad_norm": 0.23426075279712677, + "learning_rate": 3.311090863756351e-05, + "loss": 1.7372, + "step": 20246 + }, + { + "epoch": 6.214548802946593, + "grad_norm": 0.2369524985551834, + "learning_rate": 3.310623030862942e-05, + "loss": 1.7502, + "step": 20247 + }, + { + "epoch": 6.214855739717618, + "grad_norm": 0.31635788083076477, + "learning_rate": 3.3101552146649474e-05, + "loss": 1.7616, + "step": 20248 + }, + { + "epoch": 6.215162676488643, + "grad_norm": 0.2312999814748764, + "learning_rate": 3.309687415166986e-05, + "loss": 
1.6991, + "step": 20249 + }, + { + "epoch": 6.2154696132596685, + "grad_norm": 0.23423358798027039, + "learning_rate": 3.309219632373688e-05, + "loss": 1.7737, + "step": 20250 + }, + { + "epoch": 6.215776550030694, + "grad_norm": 0.28763437271118164, + "learning_rate": 3.308751866289671e-05, + "loss": 1.7822, + "step": 20251 + }, + { + "epoch": 6.216083486801719, + "grad_norm": 0.20754525065422058, + "learning_rate": 3.30828411691956e-05, + "loss": 1.7427, + "step": 20252 + }, + { + "epoch": 6.216390423572744, + "grad_norm": 0.31858858466148376, + "learning_rate": 3.307816384267975e-05, + "loss": 1.7384, + "step": 20253 + }, + { + "epoch": 6.216697360343769, + "grad_norm": 0.21968062222003937, + "learning_rate": 3.307348668339543e-05, + "loss": 1.6896, + "step": 20254 + }, + { + "epoch": 6.217004297114794, + "grad_norm": 0.21643556654453278, + "learning_rate": 3.306880969138882e-05, + "loss": 1.7353, + "step": 20255 + }, + { + "epoch": 6.21731123388582, + "grad_norm": 0.22141097486019135, + "learning_rate": 3.306413286670616e-05, + "loss": 1.7254, + "step": 20256 + }, + { + "epoch": 6.217618170656845, + "grad_norm": 0.17666983604431152, + "learning_rate": 3.305945620939367e-05, + "loss": 1.7198, + "step": 20257 + }, + { + "epoch": 6.21792510742787, + "grad_norm": 0.25182467699050903, + "learning_rate": 3.3054779719497544e-05, + "loss": 1.7562, + "step": 20258 + }, + { + "epoch": 6.218232044198895, + "grad_norm": 0.23481281101703644, + "learning_rate": 3.305010339706404e-05, + "loss": 1.8293, + "step": 20259 + }, + { + "epoch": 6.21853898096992, + "grad_norm": 0.23981143534183502, + "learning_rate": 3.304542724213933e-05, + "loss": 1.7619, + "step": 20260 + }, + { + "epoch": 6.218845917740945, + "grad_norm": 0.2388351708650589, + "learning_rate": 3.3040751254769665e-05, + "loss": 1.7471, + "step": 20261 + }, + { + "epoch": 6.219152854511971, + "grad_norm": 0.2039698362350464, + "learning_rate": 3.3036075435001216e-05, + "loss": 1.6893, + "step": 20262 + }, + { + 
"epoch": 6.219459791282996, + "grad_norm": 0.218357652425766, + "learning_rate": 3.3031399782880224e-05, + "loss": 1.753, + "step": 20263 + }, + { + "epoch": 6.2197667280540205, + "grad_norm": 0.25466734170913696, + "learning_rate": 3.302672429845288e-05, + "loss": 1.7496, + "step": 20264 + }, + { + "epoch": 6.220073664825046, + "grad_norm": 0.1853330284357071, + "learning_rate": 3.302204898176541e-05, + "loss": 1.7779, + "step": 20265 + }, + { + "epoch": 6.220380601596071, + "grad_norm": 0.24044091999530792, + "learning_rate": 3.3017373832863976e-05, + "loss": 1.8226, + "step": 20266 + }, + { + "epoch": 6.2206875383670965, + "grad_norm": 0.2209070324897766, + "learning_rate": 3.3012698851794835e-05, + "loss": 1.7069, + "step": 20267 + }, + { + "epoch": 6.220994475138122, + "grad_norm": 0.2775282561779022, + "learning_rate": 3.3008024038604135e-05, + "loss": 1.7048, + "step": 20268 + }, + { + "epoch": 6.221301411909146, + "grad_norm": 0.22873717546463013, + "learning_rate": 3.3003349393338116e-05, + "loss": 1.7956, + "step": 20269 + }, + { + "epoch": 6.221608348680172, + "grad_norm": 0.27883464097976685, + "learning_rate": 3.2998674916042946e-05, + "loss": 1.6955, + "step": 20270 + }, + { + "epoch": 6.221915285451197, + "grad_norm": 0.2383071482181549, + "learning_rate": 3.2994000606764865e-05, + "loss": 1.7645, + "step": 20271 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.26280200481414795, + "learning_rate": 3.298932646555003e-05, + "loss": 1.7854, + "step": 20272 + }, + { + "epoch": 6.222529158993248, + "grad_norm": 0.2387673407793045, + "learning_rate": 3.2984652492444625e-05, + "loss": 1.679, + "step": 20273 + }, + { + "epoch": 6.222836095764273, + "grad_norm": 0.2136983871459961, + "learning_rate": 3.297997868749486e-05, + "loss": 1.7313, + "step": 20274 + }, + { + "epoch": 6.223143032535297, + "grad_norm": 0.2629627585411072, + "learning_rate": 3.297530505074692e-05, + "loss": 1.7452, + "step": 20275 + }, + { + "epoch": 6.223449969306323, + 
"grad_norm": 0.22018705308437347, + "learning_rate": 3.2970631582247e-05, + "loss": 1.7368, + "step": 20276 + }, + { + "epoch": 6.223756906077348, + "grad_norm": 0.19277356564998627, + "learning_rate": 3.296595828204128e-05, + "loss": 1.7084, + "step": 20277 + }, + { + "epoch": 6.224063842848373, + "grad_norm": 0.18806682527065277, + "learning_rate": 3.2961285150175944e-05, + "loss": 1.6576, + "step": 20278 + }, + { + "epoch": 6.224370779619399, + "grad_norm": 0.2019709348678589, + "learning_rate": 3.295661218669717e-05, + "loss": 1.7594, + "step": 20279 + }, + { + "epoch": 6.224677716390423, + "grad_norm": 0.19662119448184967, + "learning_rate": 3.295193939165114e-05, + "loss": 1.6946, + "step": 20280 + }, + { + "epoch": 6.2249846531614486, + "grad_norm": 0.1880662590265274, + "learning_rate": 3.294726676508404e-05, + "loss": 1.7232, + "step": 20281 + }, + { + "epoch": 6.225291589932474, + "grad_norm": 0.23242273926734924, + "learning_rate": 3.294259430704206e-05, + "loss": 1.7331, + "step": 20282 + }, + { + "epoch": 6.225598526703499, + "grad_norm": 0.19915202260017395, + "learning_rate": 3.293792201757134e-05, + "loss": 1.7844, + "step": 20283 + }, + { + "epoch": 6.225905463474525, + "grad_norm": 0.1845373958349228, + "learning_rate": 3.2933249896718097e-05, + "loss": 1.6803, + "step": 20284 + }, + { + "epoch": 6.226212400245549, + "grad_norm": 0.19340910017490387, + "learning_rate": 3.292857794452846e-05, + "loss": 1.6929, + "step": 20285 + }, + { + "epoch": 6.226519337016574, + "grad_norm": 0.21429216861724854, + "learning_rate": 3.292390616104863e-05, + "loss": 1.6833, + "step": 20286 + }, + { + "epoch": 6.2268262737876, + "grad_norm": 0.2267037034034729, + "learning_rate": 3.291923454632476e-05, + "loss": 1.7271, + "step": 20287 + }, + { + "epoch": 6.227133210558625, + "grad_norm": 0.23121988773345947, + "learning_rate": 3.2914563100403054e-05, + "loss": 1.8443, + "step": 20288 + }, + { + "epoch": 6.22744014732965, + "grad_norm": 0.20980899035930634, + 
"learning_rate": 3.290989182332964e-05, + "loss": 1.6907, + "step": 20289 + }, + { + "epoch": 6.227747084100676, + "grad_norm": 0.28162500262260437, + "learning_rate": 3.290522071515067e-05, + "loss": 1.7497, + "step": 20290 + }, + { + "epoch": 6.2280540208717, + "grad_norm": 0.2163640707731247, + "learning_rate": 3.290054977591234e-05, + "loss": 1.736, + "step": 20291 + }, + { + "epoch": 6.2283609576427255, + "grad_norm": 0.19144479930400848, + "learning_rate": 3.289587900566079e-05, + "loss": 1.7222, + "step": 20292 + }, + { + "epoch": 6.228667894413751, + "grad_norm": 0.24952897429466248, + "learning_rate": 3.2891208404442216e-05, + "loss": 1.7095, + "step": 20293 + }, + { + "epoch": 6.228974831184776, + "grad_norm": 0.19421981275081635, + "learning_rate": 3.288653797230272e-05, + "loss": 1.7231, + "step": 20294 + }, + { + "epoch": 6.2292817679558015, + "grad_norm": 0.22837944328784943, + "learning_rate": 3.288186770928851e-05, + "loss": 1.7404, + "step": 20295 + }, + { + "epoch": 6.229588704726826, + "grad_norm": 0.2292151004076004, + "learning_rate": 3.2877197615445685e-05, + "loss": 1.6999, + "step": 20296 + }, + { + "epoch": 6.229895641497851, + "grad_norm": 0.18376365303993225, + "learning_rate": 3.2872527690820456e-05, + "loss": 1.681, + "step": 20297 + }, + { + "epoch": 6.230202578268877, + "grad_norm": 0.21331918239593506, + "learning_rate": 3.286785793545893e-05, + "loss": 1.7362, + "step": 20298 + }, + { + "epoch": 6.230509515039902, + "grad_norm": 0.21247150003910065, + "learning_rate": 3.286318834940729e-05, + "loss": 1.7816, + "step": 20299 + }, + { + "epoch": 6.230816451810927, + "grad_norm": 0.19166043400764465, + "learning_rate": 3.285851893271165e-05, + "loss": 1.7209, + "step": 20300 + }, + { + "epoch": 6.231123388581952, + "grad_norm": 0.2139919251203537, + "learning_rate": 3.2853849685418195e-05, + "loss": 1.6946, + "step": 20301 + }, + { + "epoch": 6.231430325352977, + "grad_norm": 0.20296575129032135, + "learning_rate": 
3.284918060757303e-05, + "loss": 1.6829, + "step": 20302 + }, + { + "epoch": 6.231737262124002, + "grad_norm": 0.2465996891260147, + "learning_rate": 3.2844511699222314e-05, + "loss": 1.751, + "step": 20303 + }, + { + "epoch": 6.232044198895028, + "grad_norm": 0.23327109217643738, + "learning_rate": 3.283984296041219e-05, + "loss": 1.736, + "step": 20304 + }, + { + "epoch": 6.232351135666053, + "grad_norm": 0.24316997826099396, + "learning_rate": 3.2835174391188806e-05, + "loss": 1.7187, + "step": 20305 + }, + { + "epoch": 6.232658072437078, + "grad_norm": 0.25280308723449707, + "learning_rate": 3.2830505991598294e-05, + "loss": 1.7087, + "step": 20306 + }, + { + "epoch": 6.232965009208103, + "grad_norm": 0.19143202900886536, + "learning_rate": 3.282583776168676e-05, + "loss": 1.674, + "step": 20307 + }, + { + "epoch": 6.233271945979128, + "grad_norm": 0.2667979598045349, + "learning_rate": 3.282116970150038e-05, + "loss": 1.7978, + "step": 20308 + }, + { + "epoch": 6.2335788827501535, + "grad_norm": 0.18397411704063416, + "learning_rate": 3.281650181108526e-05, + "loss": 1.7669, + "step": 20309 + }, + { + "epoch": 6.233885819521179, + "grad_norm": 0.2842588722705841, + "learning_rate": 3.281183409048756e-05, + "loss": 1.8238, + "step": 20310 + }, + { + "epoch": 6.234192756292204, + "grad_norm": 0.20290467143058777, + "learning_rate": 3.280716653975336e-05, + "loss": 1.7317, + "step": 20311 + }, + { + "epoch": 6.234499693063229, + "grad_norm": 0.224524587392807, + "learning_rate": 3.280249915892885e-05, + "loss": 1.8166, + "step": 20312 + }, + { + "epoch": 6.234806629834254, + "grad_norm": 0.28204405307769775, + "learning_rate": 3.2797831948060096e-05, + "loss": 1.7435, + "step": 20313 + }, + { + "epoch": 6.235113566605279, + "grad_norm": 0.2101798951625824, + "learning_rate": 3.2793164907193264e-05, + "loss": 1.6747, + "step": 20314 + }, + { + "epoch": 6.235420503376305, + "grad_norm": 0.1961289346218109, + "learning_rate": 3.278849803637445e-05, + "loss": 1.7131, 
+ "step": 20315 + }, + { + "epoch": 6.23572744014733, + "grad_norm": 0.30541354417800903, + "learning_rate": 3.27838313356498e-05, + "loss": 1.8036, + "step": 20316 + }, + { + "epoch": 6.236034376918354, + "grad_norm": 0.21517200767993927, + "learning_rate": 3.277916480506541e-05, + "loss": 1.7684, + "step": 20317 + }, + { + "epoch": 6.23634131368938, + "grad_norm": 0.22871750593185425, + "learning_rate": 3.2774498444667426e-05, + "loss": 1.7545, + "step": 20318 + }, + { + "epoch": 6.236648250460405, + "grad_norm": 0.24596424400806427, + "learning_rate": 3.276983225450192e-05, + "loss": 1.6705, + "step": 20319 + }, + { + "epoch": 6.23695518723143, + "grad_norm": 0.19123119115829468, + "learning_rate": 3.2765166234615044e-05, + "loss": 1.7402, + "step": 20320 + }, + { + "epoch": 6.237262124002456, + "grad_norm": 0.25287121534347534, + "learning_rate": 3.276050038505288e-05, + "loss": 1.741, + "step": 20321 + }, + { + "epoch": 6.237569060773481, + "grad_norm": 0.19741536676883698, + "learning_rate": 3.275583470586158e-05, + "loss": 1.736, + "step": 20322 + }, + { + "epoch": 6.2378759975445055, + "grad_norm": 0.24529922008514404, + "learning_rate": 3.275116919708723e-05, + "loss": 1.6696, + "step": 20323 + }, + { + "epoch": 6.238182934315531, + "grad_norm": 0.25428420305252075, + "learning_rate": 3.274650385877591e-05, + "loss": 1.696, + "step": 20324 + }, + { + "epoch": 6.238489871086556, + "grad_norm": 0.19502994418144226, + "learning_rate": 3.274183869097377e-05, + "loss": 1.6976, + "step": 20325 + }, + { + "epoch": 6.2387968078575815, + "grad_norm": 0.23710335791110992, + "learning_rate": 3.273717369372688e-05, + "loss": 1.7395, + "step": 20326 + }, + { + "epoch": 6.239103744628607, + "grad_norm": 0.20904341340065002, + "learning_rate": 3.273250886708138e-05, + "loss": 1.7455, + "step": 20327 + }, + { + "epoch": 6.239410681399631, + "grad_norm": 0.2112383097410202, + "learning_rate": 3.272784421108332e-05, + "loss": 1.7401, + "step": 20328 + }, + { + "epoch": 
6.239717618170657, + "grad_norm": 0.2310914695262909, + "learning_rate": 3.272317972577886e-05, + "loss": 1.8049, + "step": 20329 + }, + { + "epoch": 6.240024554941682, + "grad_norm": 0.18222108483314514, + "learning_rate": 3.271851541121404e-05, + "loss": 1.7119, + "step": 20330 + }, + { + "epoch": 6.240331491712707, + "grad_norm": 0.18739092350006104, + "learning_rate": 3.2713851267434984e-05, + "loss": 1.744, + "step": 20331 + }, + { + "epoch": 6.240638428483733, + "grad_norm": 0.17722012102603912, + "learning_rate": 3.2709187294487775e-05, + "loss": 1.7054, + "step": 20332 + }, + { + "epoch": 6.240945365254758, + "grad_norm": 0.18650192022323608, + "learning_rate": 3.270452349241854e-05, + "loss": 1.7272, + "step": 20333 + }, + { + "epoch": 6.241252302025782, + "grad_norm": 0.2004886120557785, + "learning_rate": 3.269985986127331e-05, + "loss": 1.6777, + "step": 20334 + }, + { + "epoch": 6.241559238796808, + "grad_norm": 0.1855446845293045, + "learning_rate": 3.269519640109823e-05, + "loss": 1.6823, + "step": 20335 + }, + { + "epoch": 6.241866175567833, + "grad_norm": 0.1950632780790329, + "learning_rate": 3.269053311193934e-05, + "loss": 1.7052, + "step": 20336 + }, + { + "epoch": 6.242173112338858, + "grad_norm": 0.19386698305606842, + "learning_rate": 3.268586999384276e-05, + "loss": 1.7431, + "step": 20337 + }, + { + "epoch": 6.242480049109884, + "grad_norm": 0.2266446053981781, + "learning_rate": 3.268120704685454e-05, + "loss": 1.735, + "step": 20338 + }, + { + "epoch": 6.242786985880908, + "grad_norm": 0.24133828282356262, + "learning_rate": 3.2676544271020814e-05, + "loss": 1.7707, + "step": 20339 + }, + { + "epoch": 6.2430939226519335, + "grad_norm": 0.22397162020206451, + "learning_rate": 3.267188166638763e-05, + "loss": 1.6943, + "step": 20340 + }, + { + "epoch": 6.243400859422959, + "grad_norm": 0.1614205688238144, + "learning_rate": 3.266721923300104e-05, + "loss": 1.6801, + "step": 20341 + }, + { + "epoch": 6.243707796193984, + "grad_norm": 
0.22376522421836853, + "learning_rate": 3.2662556970907166e-05, + "loss": 1.6933, + "step": 20342 + }, + { + "epoch": 6.2440147329650095, + "grad_norm": 0.18614265322685242, + "learning_rate": 3.265789488015205e-05, + "loss": 1.7396, + "step": 20343 + }, + { + "epoch": 6.244321669736034, + "grad_norm": 0.2385358214378357, + "learning_rate": 3.265323296078181e-05, + "loss": 1.7782, + "step": 20344 + }, + { + "epoch": 6.244628606507059, + "grad_norm": 0.24316444993019104, + "learning_rate": 3.264857121284246e-05, + "loss": 1.7443, + "step": 20345 + }, + { + "epoch": 6.244935543278085, + "grad_norm": 0.184532031416893, + "learning_rate": 3.264390963638012e-05, + "loss": 1.7603, + "step": 20346 + }, + { + "epoch": 6.24524248004911, + "grad_norm": 0.2018461376428604, + "learning_rate": 3.2639248231440825e-05, + "loss": 1.7289, + "step": 20347 + }, + { + "epoch": 6.245549416820135, + "grad_norm": 0.23732338845729828, + "learning_rate": 3.263458699807066e-05, + "loss": 1.7924, + "step": 20348 + }, + { + "epoch": 6.245856353591161, + "grad_norm": 0.19645710289478302, + "learning_rate": 3.2629925936315674e-05, + "loss": 1.6855, + "step": 20349 + }, + { + "epoch": 6.246163290362185, + "grad_norm": 0.20730608701705933, + "learning_rate": 3.262526504622196e-05, + "loss": 1.7238, + "step": 20350 + }, + { + "epoch": 6.24647022713321, + "grad_norm": 0.21139587461948395, + "learning_rate": 3.2620604327835545e-05, + "loss": 1.7173, + "step": 20351 + }, + { + "epoch": 6.246777163904236, + "grad_norm": 0.22644877433776855, + "learning_rate": 3.261594378120252e-05, + "loss": 1.7976, + "step": 20352 + }, + { + "epoch": 6.247084100675261, + "grad_norm": 0.23719535768032074, + "learning_rate": 3.2611283406368906e-05, + "loss": 1.7549, + "step": 20353 + }, + { + "epoch": 6.247391037446286, + "grad_norm": 0.2046387791633606, + "learning_rate": 3.2606623203380807e-05, + "loss": 1.7343, + "step": 20354 + }, + { + "epoch": 6.247697974217311, + "grad_norm": 0.19325366616249084, + 
"learning_rate": 3.260196317228422e-05, + "loss": 1.7352, + "step": 20355 + }, + { + "epoch": 6.248004910988336, + "grad_norm": 0.2315458059310913, + "learning_rate": 3.259730331312526e-05, + "loss": 1.7838, + "step": 20356 + }, + { + "epoch": 6.2483118477593615, + "grad_norm": 0.24549536406993866, + "learning_rate": 3.2592643625949956e-05, + "loss": 1.7418, + "step": 20357 + }, + { + "epoch": 6.248618784530387, + "grad_norm": 0.2702246606349945, + "learning_rate": 3.258798411080432e-05, + "loss": 1.7651, + "step": 20358 + }, + { + "epoch": 6.248925721301412, + "grad_norm": 0.20515258610248566, + "learning_rate": 3.2583324767734444e-05, + "loss": 1.6866, + "step": 20359 + }, + { + "epoch": 6.249232658072437, + "grad_norm": 0.2696690261363983, + "learning_rate": 3.257866559678635e-05, + "loss": 1.7446, + "step": 20360 + }, + { + "epoch": 6.249539594843462, + "grad_norm": 0.19707174599170685, + "learning_rate": 3.2574006598006114e-05, + "loss": 1.6835, + "step": 20361 + }, + { + "epoch": 6.249846531614487, + "grad_norm": 0.23478952050209045, + "learning_rate": 3.256934777143974e-05, + "loss": 1.7344, + "step": 20362 + }, + { + "epoch": 6.250153468385513, + "grad_norm": 0.24214082956314087, + "learning_rate": 3.2564689117133306e-05, + "loss": 1.722, + "step": 20363 + }, + { + "epoch": 6.250460405156538, + "grad_norm": 0.18361221253871918, + "learning_rate": 3.256003063513281e-05, + "loss": 1.7336, + "step": 20364 + }, + { + "epoch": 6.250767341927563, + "grad_norm": 0.18548928201198578, + "learning_rate": 3.255537232548433e-05, + "loss": 1.6586, + "step": 20365 + }, + { + "epoch": 6.251074278698588, + "grad_norm": 0.2121812105178833, + "learning_rate": 3.2550714188233874e-05, + "loss": 1.7273, + "step": 20366 + }, + { + "epoch": 6.251381215469613, + "grad_norm": 0.2351878583431244, + "learning_rate": 3.25460562234275e-05, + "loss": 1.7101, + "step": 20367 + }, + { + "epoch": 6.2516881522406385, + "grad_norm": 0.20723144710063934, + "learning_rate": 
3.2541398431111216e-05, + "loss": 1.7042, + "step": 20368 + }, + { + "epoch": 6.251995089011664, + "grad_norm": 0.19093643128871918, + "learning_rate": 3.2536740811331084e-05, + "loss": 1.7585, + "step": 20369 + }, + { + "epoch": 6.252302025782689, + "grad_norm": 0.27191361784935, + "learning_rate": 3.2532083364133094e-05, + "loss": 1.7734, + "step": 20370 + }, + { + "epoch": 6.252608962553714, + "grad_norm": 0.21019349992275238, + "learning_rate": 3.2527426089563306e-05, + "loss": 1.7015, + "step": 20371 + }, + { + "epoch": 6.252915899324739, + "grad_norm": 0.2300454080104828, + "learning_rate": 3.2522768987667744e-05, + "loss": 1.7311, + "step": 20372 + }, + { + "epoch": 6.253222836095764, + "grad_norm": 0.24723999202251434, + "learning_rate": 3.25181120584924e-05, + "loss": 1.674, + "step": 20373 + }, + { + "epoch": 6.25352977286679, + "grad_norm": 0.20302192866802216, + "learning_rate": 3.251345530208335e-05, + "loss": 1.6999, + "step": 20374 + }, + { + "epoch": 6.253836709637815, + "grad_norm": 0.25393861532211304, + "learning_rate": 3.250879871848655e-05, + "loss": 1.6761, + "step": 20375 + }, + { + "epoch": 6.25414364640884, + "grad_norm": 0.1879536211490631, + "learning_rate": 3.2504142307748064e-05, + "loss": 1.7233, + "step": 20376 + }, + { + "epoch": 6.254450583179865, + "grad_norm": 0.22197771072387695, + "learning_rate": 3.24994860699139e-05, + "loss": 1.6994, + "step": 20377 + }, + { + "epoch": 6.25475751995089, + "grad_norm": 0.24946242570877075, + "learning_rate": 3.249483000503008e-05, + "loss": 1.8488, + "step": 20378 + }, + { + "epoch": 6.255064456721915, + "grad_norm": 0.25218987464904785, + "learning_rate": 3.2490174113142594e-05, + "loss": 1.7947, + "step": 20379 + }, + { + "epoch": 6.255371393492941, + "grad_norm": 0.23970970511436462, + "learning_rate": 3.248551839429749e-05, + "loss": 1.785, + "step": 20380 + }, + { + "epoch": 6.255678330263966, + "grad_norm": 0.243649423122406, + "learning_rate": 3.248086284854074e-05, + "loss": 1.8089, + 
"step": 20381 + }, + { + "epoch": 6.2559852670349905, + "grad_norm": 0.18813125789165497, + "learning_rate": 3.247620747591838e-05, + "loss": 1.6892, + "step": 20382 + }, + { + "epoch": 6.256292203806016, + "grad_norm": 0.2495514154434204, + "learning_rate": 3.2471552276476404e-05, + "loss": 1.7573, + "step": 20383 + }, + { + "epoch": 6.256599140577041, + "grad_norm": 0.200107604265213, + "learning_rate": 3.2466897250260835e-05, + "loss": 1.7292, + "step": 20384 + }, + { + "epoch": 6.2569060773480665, + "grad_norm": 0.25782206654548645, + "learning_rate": 3.246224239731765e-05, + "loss": 1.8533, + "step": 20385 + }, + { + "epoch": 6.257213014119092, + "grad_norm": 0.1966158151626587, + "learning_rate": 3.245758771769288e-05, + "loss": 1.648, + "step": 20386 + }, + { + "epoch": 6.257519950890116, + "grad_norm": 0.23248116672039032, + "learning_rate": 3.245293321143249e-05, + "loss": 1.7277, + "step": 20387 + }, + { + "epoch": 6.257826887661142, + "grad_norm": 0.26347780227661133, + "learning_rate": 3.244827887858251e-05, + "loss": 1.7429, + "step": 20388 + }, + { + "epoch": 6.258133824432167, + "grad_norm": 0.20794285833835602, + "learning_rate": 3.244362471918894e-05, + "loss": 1.7358, + "step": 20389 + }, + { + "epoch": 6.258440761203192, + "grad_norm": 0.200898677110672, + "learning_rate": 3.243897073329774e-05, + "loss": 1.6661, + "step": 20390 + }, + { + "epoch": 6.258747697974218, + "grad_norm": 0.20945283770561218, + "learning_rate": 3.2434316920954935e-05, + "loss": 1.7036, + "step": 20391 + }, + { + "epoch": 6.259054634745242, + "grad_norm": 0.3154161274433136, + "learning_rate": 3.242966328220649e-05, + "loss": 1.8174, + "step": 20392 + }, + { + "epoch": 6.259361571516267, + "grad_norm": 0.19321799278259277, + "learning_rate": 3.242500981709843e-05, + "loss": 1.6823, + "step": 20393 + }, + { + "epoch": 6.259668508287293, + "grad_norm": 0.22610130906105042, + "learning_rate": 3.2420356525676696e-05, + "loss": 1.6865, + "step": 20394 + }, + { + "epoch": 
6.259975445058318, + "grad_norm": 0.19190505146980286, + "learning_rate": 3.241570340798734e-05, + "loss": 1.6663, + "step": 20395 + }, + { + "epoch": 6.260282381829343, + "grad_norm": 0.21956418454647064, + "learning_rate": 3.2411050464076276e-05, + "loss": 1.7279, + "step": 20396 + }, + { + "epoch": 6.260589318600369, + "grad_norm": 0.2448553591966629, + "learning_rate": 3.240639769398956e-05, + "loss": 1.7438, + "step": 20397 + }, + { + "epoch": 6.260896255371393, + "grad_norm": 0.19194214046001434, + "learning_rate": 3.2401745097773096e-05, + "loss": 1.7429, + "step": 20398 + }, + { + "epoch": 6.2612031921424185, + "grad_norm": 0.2567521333694458, + "learning_rate": 3.239709267547291e-05, + "loss": 1.7051, + "step": 20399 + }, + { + "epoch": 6.261510128913444, + "grad_norm": 0.18335886299610138, + "learning_rate": 3.239244042713498e-05, + "loss": 1.6828, + "step": 20400 + }, + { + "epoch": 6.261817065684469, + "grad_norm": 0.20112362504005432, + "learning_rate": 3.238778835280527e-05, + "loss": 1.6887, + "step": 20401 + }, + { + "epoch": 6.2621240024554945, + "grad_norm": 0.17095179855823517, + "learning_rate": 3.238313645252975e-05, + "loss": 1.7202, + "step": 20402 + }, + { + "epoch": 6.262430939226519, + "grad_norm": 0.24681979417800903, + "learning_rate": 3.237848472635442e-05, + "loss": 1.7196, + "step": 20403 + }, + { + "epoch": 6.262737875997544, + "grad_norm": 0.2022300660610199, + "learning_rate": 3.237383317432522e-05, + "loss": 1.7265, + "step": 20404 + }, + { + "epoch": 6.26304481276857, + "grad_norm": 0.2900621294975281, + "learning_rate": 3.236918179648813e-05, + "loss": 1.7051, + "step": 20405 + }, + { + "epoch": 6.263351749539595, + "grad_norm": 0.37675586342811584, + "learning_rate": 3.2364530592889135e-05, + "loss": 1.7747, + "step": 20406 + }, + { + "epoch": 6.26365868631062, + "grad_norm": 0.19033703207969666, + "learning_rate": 3.235987956357416e-05, + "loss": 1.7529, + "step": 20407 + }, + { + "epoch": 6.263965623081646, + "grad_norm": 
0.2877013385295868, + "learning_rate": 3.235522870858922e-05, + "loss": 1.6942, + "step": 20408 + }, + { + "epoch": 6.26427255985267, + "grad_norm": 0.22717125713825226, + "learning_rate": 3.235057802798023e-05, + "loss": 1.7302, + "step": 20409 + }, + { + "epoch": 6.264579496623695, + "grad_norm": 0.2571920156478882, + "learning_rate": 3.2345927521793185e-05, + "loss": 1.6782, + "step": 20410 + }, + { + "epoch": 6.264886433394721, + "grad_norm": 0.43085625767707825, + "learning_rate": 3.234127719007403e-05, + "loss": 1.7946, + "step": 20411 + }, + { + "epoch": 6.265193370165746, + "grad_norm": 0.19355928897857666, + "learning_rate": 3.2336627032868726e-05, + "loss": 1.7288, + "step": 20412 + }, + { + "epoch": 6.265500306936771, + "grad_norm": 0.24871474504470825, + "learning_rate": 3.233197705022322e-05, + "loss": 1.6862, + "step": 20413 + }, + { + "epoch": 6.265807243707796, + "grad_norm": 0.26919320225715637, + "learning_rate": 3.232732724218348e-05, + "loss": 1.8061, + "step": 20414 + }, + { + "epoch": 6.266114180478821, + "grad_norm": 0.21714363992214203, + "learning_rate": 3.2322677608795436e-05, + "loss": 1.7036, + "step": 20415 + }, + { + "epoch": 6.2664211172498465, + "grad_norm": 0.24496719241142273, + "learning_rate": 3.231802815010506e-05, + "loss": 1.7334, + "step": 20416 + }, + { + "epoch": 6.266728054020872, + "grad_norm": 0.22501519322395325, + "learning_rate": 3.231337886615831e-05, + "loss": 1.7545, + "step": 20417 + }, + { + "epoch": 6.267034990791897, + "grad_norm": 0.2683655917644501, + "learning_rate": 3.23087297570011e-05, + "loss": 1.7235, + "step": 20418 + }, + { + "epoch": 6.267341927562922, + "grad_norm": 0.23341359198093414, + "learning_rate": 3.230408082267938e-05, + "loss": 1.7389, + "step": 20419 + }, + { + "epoch": 6.267648864333947, + "grad_norm": 0.2914128601551056, + "learning_rate": 3.229943206323913e-05, + "loss": 1.7223, + "step": 20420 + }, + { + "epoch": 6.267955801104972, + "grad_norm": 0.2072528451681137, + "learning_rate": 
3.229478347872625e-05, + "loss": 1.7422, + "step": 20421 + }, + { + "epoch": 6.268262737875998, + "grad_norm": 0.22678662836551666, + "learning_rate": 3.229013506918671e-05, + "loss": 1.6973, + "step": 20422 + }, + { + "epoch": 6.268569674647023, + "grad_norm": 0.1928883194923401, + "learning_rate": 3.228548683466643e-05, + "loss": 1.7235, + "step": 20423 + }, + { + "epoch": 6.268876611418047, + "grad_norm": 0.2402963638305664, + "learning_rate": 3.2280838775211345e-05, + "loss": 1.7587, + "step": 20424 + }, + { + "epoch": 6.269183548189073, + "grad_norm": 0.20416294038295746, + "learning_rate": 3.227619089086742e-05, + "loss": 1.7591, + "step": 20425 + }, + { + "epoch": 6.269490484960098, + "grad_norm": 0.20308947563171387, + "learning_rate": 3.227154318168053e-05, + "loss": 1.7264, + "step": 20426 + }, + { + "epoch": 6.269797421731123, + "grad_norm": 0.18733863532543182, + "learning_rate": 3.226689564769667e-05, + "loss": 1.6943, + "step": 20427 + }, + { + "epoch": 6.270104358502149, + "grad_norm": 0.183793842792511, + "learning_rate": 3.226224828896173e-05, + "loss": 1.7082, + "step": 20428 + }, + { + "epoch": 6.270411295273174, + "grad_norm": 0.20471547544002533, + "learning_rate": 3.225760110552165e-05, + "loss": 1.7352, + "step": 20429 + }, + { + "epoch": 6.2707182320441985, + "grad_norm": 0.23386713862419128, + "learning_rate": 3.225295409742234e-05, + "loss": 1.7666, + "step": 20430 + }, + { + "epoch": 6.271025168815224, + "grad_norm": 0.2024994194507599, + "learning_rate": 3.224830726470976e-05, + "loss": 1.6573, + "step": 20431 + }, + { + "epoch": 6.271332105586249, + "grad_norm": 0.2352776825428009, + "learning_rate": 3.2243660607429805e-05, + "loss": 1.7884, + "step": 20432 + }, + { + "epoch": 6.2716390423572745, + "grad_norm": 0.19755585491657257, + "learning_rate": 3.223901412562841e-05, + "loss": 1.6964, + "step": 20433 + }, + { + "epoch": 6.2719459791283, + "grad_norm": 0.25833839178085327, + "learning_rate": 3.223436781935148e-05, + "loss": 1.715, 
+ "step": 20434 + }, + { + "epoch": 6.272252915899324, + "grad_norm": 0.2110220193862915, + "learning_rate": 3.222972168864493e-05, + "loss": 1.7617, + "step": 20435 + }, + { + "epoch": 6.27255985267035, + "grad_norm": 0.23262515664100647, + "learning_rate": 3.2225075733554685e-05, + "loss": 1.7616, + "step": 20436 + }, + { + "epoch": 6.272866789441375, + "grad_norm": 0.1926576942205429, + "learning_rate": 3.222042995412669e-05, + "loss": 1.6956, + "step": 20437 + }, + { + "epoch": 6.2731737262124, + "grad_norm": 0.20662757754325867, + "learning_rate": 3.22157843504068e-05, + "loss": 1.703, + "step": 20438 + }, + { + "epoch": 6.273480662983426, + "grad_norm": 0.22137406468391418, + "learning_rate": 3.2211138922440975e-05, + "loss": 1.6961, + "step": 20439 + }, + { + "epoch": 6.273787599754451, + "grad_norm": 0.25777003169059753, + "learning_rate": 3.2206493670275086e-05, + "loss": 1.704, + "step": 20440 + }, + { + "epoch": 6.274094536525475, + "grad_norm": 0.20540094375610352, + "learning_rate": 3.2201848593955046e-05, + "loss": 1.6759, + "step": 20441 + }, + { + "epoch": 6.274401473296501, + "grad_norm": 0.2447255402803421, + "learning_rate": 3.21972036935268e-05, + "loss": 1.7379, + "step": 20442 + }, + { + "epoch": 6.274708410067526, + "grad_norm": 0.2017194777727127, + "learning_rate": 3.219255896903619e-05, + "loss": 1.6518, + "step": 20443 + }, + { + "epoch": 6.2750153468385514, + "grad_norm": 0.22742003202438354, + "learning_rate": 3.2187914420529174e-05, + "loss": 1.7568, + "step": 20444 + }, + { + "epoch": 6.275322283609577, + "grad_norm": 0.2065356969833374, + "learning_rate": 3.218327004805161e-05, + "loss": 1.643, + "step": 20445 + }, + { + "epoch": 6.275629220380601, + "grad_norm": 0.18083053827285767, + "learning_rate": 3.217862585164942e-05, + "loss": 1.77, + "step": 20446 + }, + { + "epoch": 6.275936157151627, + "grad_norm": 0.2175968736410141, + "learning_rate": 3.2173981831368484e-05, + "loss": 1.738, + "step": 20447 + }, + { + "epoch": 
6.276243093922652, + "grad_norm": 0.17635080218315125, + "learning_rate": 3.216933798725473e-05, + "loss": 1.7109, + "step": 20448 + }, + { + "epoch": 6.276550030693677, + "grad_norm": 0.22289423644542694, + "learning_rate": 3.216469431935401e-05, + "loss": 1.7853, + "step": 20449 + }, + { + "epoch": 6.276856967464703, + "grad_norm": 0.21214549243450165, + "learning_rate": 3.216005082771225e-05, + "loss": 1.8196, + "step": 20450 + }, + { + "epoch": 6.277163904235728, + "grad_norm": 0.21992212533950806, + "learning_rate": 3.215540751237531e-05, + "loss": 1.7445, + "step": 20451 + }, + { + "epoch": 6.277470841006752, + "grad_norm": 0.16256563365459442, + "learning_rate": 3.2150764373389096e-05, + "loss": 1.6582, + "step": 20452 + }, + { + "epoch": 6.277777777777778, + "grad_norm": 0.1885976791381836, + "learning_rate": 3.214612141079949e-05, + "loss": 1.7491, + "step": 20453 + }, + { + "epoch": 6.278084714548803, + "grad_norm": 0.24101774394512177, + "learning_rate": 3.2141478624652386e-05, + "loss": 1.7476, + "step": 20454 + }, + { + "epoch": 6.278391651319828, + "grad_norm": 0.23378998041152954, + "learning_rate": 3.213683601499364e-05, + "loss": 1.7575, + "step": 20455 + }, + { + "epoch": 6.278698588090854, + "grad_norm": 0.2032867670059204, + "learning_rate": 3.213219358186917e-05, + "loss": 1.6999, + "step": 20456 + }, + { + "epoch": 6.279005524861878, + "grad_norm": 0.21332181990146637, + "learning_rate": 3.2127551325324836e-05, + "loss": 1.6634, + "step": 20457 + }, + { + "epoch": 6.2793124616329035, + "grad_norm": 0.23767098784446716, + "learning_rate": 3.2122909245406494e-05, + "loss": 1.8023, + "step": 20458 + }, + { + "epoch": 6.279619398403929, + "grad_norm": 0.19987638294696808, + "learning_rate": 3.211826734216007e-05, + "loss": 1.6848, + "step": 20459 + }, + { + "epoch": 6.279926335174954, + "grad_norm": 0.22169579565525055, + "learning_rate": 3.2113625615631385e-05, + "loss": 1.7599, + "step": 20460 + }, + { + "epoch": 6.2802332719459795, + 
"grad_norm": 0.1768191009759903, + "learning_rate": 3.210898406586634e-05, + "loss": 1.6894, + "step": 20461 + }, + { + "epoch": 6.280540208717004, + "grad_norm": 0.1923041045665741, + "learning_rate": 3.21043426929108e-05, + "loss": 1.7379, + "step": 20462 + }, + { + "epoch": 6.280847145488029, + "grad_norm": 0.1836252212524414, + "learning_rate": 3.2099701496810644e-05, + "loss": 1.6748, + "step": 20463 + }, + { + "epoch": 6.281154082259055, + "grad_norm": 0.2203192561864853, + "learning_rate": 3.2095060477611705e-05, + "loss": 1.6969, + "step": 20464 + }, + { + "epoch": 6.28146101903008, + "grad_norm": 0.25511759519577026, + "learning_rate": 3.20904196353599e-05, + "loss": 1.7806, + "step": 20465 + }, + { + "epoch": 6.281767955801105, + "grad_norm": 0.19464822113513947, + "learning_rate": 3.208577897010106e-05, + "loss": 1.6784, + "step": 20466 + }, + { + "epoch": 6.28207489257213, + "grad_norm": 0.1949714869260788, + "learning_rate": 3.208113848188105e-05, + "loss": 1.713, + "step": 20467 + }, + { + "epoch": 6.282381829343155, + "grad_norm": 0.22094127535820007, + "learning_rate": 3.207649817074572e-05, + "loss": 1.7397, + "step": 20468 + }, + { + "epoch": 6.28268876611418, + "grad_norm": 0.22343899309635162, + "learning_rate": 3.2071858036740954e-05, + "loss": 1.717, + "step": 20469 + }, + { + "epoch": 6.282995702885206, + "grad_norm": 0.20854893326759338, + "learning_rate": 3.2067218079912584e-05, + "loss": 1.7255, + "step": 20470 + }, + { + "epoch": 6.283302639656231, + "grad_norm": 0.21306286752223969, + "learning_rate": 3.206257830030649e-05, + "loss": 1.7251, + "step": 20471 + }, + { + "epoch": 6.283609576427256, + "grad_norm": 0.24995777010917664, + "learning_rate": 3.20579386979685e-05, + "loss": 1.7892, + "step": 20472 + }, + { + "epoch": 6.283916513198281, + "grad_norm": 0.23720023036003113, + "learning_rate": 3.2053299272944486e-05, + "loss": 1.7843, + "step": 20473 + }, + { + "epoch": 6.284223449969306, + "grad_norm": 0.2042113095521927, + 
"learning_rate": 3.204866002528029e-05, + "loss": 1.7318, + "step": 20474 + }, + { + "epoch": 6.2845303867403315, + "grad_norm": 0.22996367514133453, + "learning_rate": 3.2044020955021735e-05, + "loss": 1.6875, + "step": 20475 + }, + { + "epoch": 6.284837323511357, + "grad_norm": 0.187749981880188, + "learning_rate": 3.203938206221471e-05, + "loss": 1.7297, + "step": 20476 + }, + { + "epoch": 6.285144260282382, + "grad_norm": 0.18279509246349335, + "learning_rate": 3.2034743346905025e-05, + "loss": 1.6858, + "step": 20477 + }, + { + "epoch": 6.285451197053407, + "grad_norm": 0.1871512532234192, + "learning_rate": 3.203010480913855e-05, + "loss": 1.7224, + "step": 20478 + }, + { + "epoch": 6.285758133824432, + "grad_norm": 0.17732922732830048, + "learning_rate": 3.202546644896109e-05, + "loss": 1.6872, + "step": 20479 + }, + { + "epoch": 6.286065070595457, + "grad_norm": 0.21146097779273987, + "learning_rate": 3.2020828266418527e-05, + "loss": 1.797, + "step": 20480 + }, + { + "epoch": 6.286372007366483, + "grad_norm": 0.18914340436458588, + "learning_rate": 3.201619026155666e-05, + "loss": 1.7149, + "step": 20481 + }, + { + "epoch": 6.286678944137508, + "grad_norm": 0.20919133722782135, + "learning_rate": 3.2011552434421364e-05, + "loss": 1.7803, + "step": 20482 + }, + { + "epoch": 6.286985880908533, + "grad_norm": 0.17882505059242249, + "learning_rate": 3.200691478505843e-05, + "loss": 1.757, + "step": 20483 + }, + { + "epoch": 6.287292817679558, + "grad_norm": 0.1850014477968216, + "learning_rate": 3.200227731351373e-05, + "loss": 1.7006, + "step": 20484 + }, + { + "epoch": 6.287599754450583, + "grad_norm": 0.19999323785305023, + "learning_rate": 3.1997640019833056e-05, + "loss": 1.702, + "step": 20485 + }, + { + "epoch": 6.287906691221608, + "grad_norm": 0.20464713871479034, + "learning_rate": 3.1993002904062255e-05, + "loss": 1.7272, + "step": 20486 + }, + { + "epoch": 6.288213627992634, + "grad_norm": 0.2105564922094345, + "learning_rate": 
3.1988365966247154e-05, + "loss": 1.7062, + "step": 20487 + }, + { + "epoch": 6.288520564763659, + "grad_norm": 0.26322871446609497, + "learning_rate": 3.198372920643359e-05, + "loss": 1.7309, + "step": 20488 + }, + { + "epoch": 6.2888275015346835, + "grad_norm": 0.22787201404571533, + "learning_rate": 3.197909262466736e-05, + "loss": 1.7797, + "step": 20489 + }, + { + "epoch": 6.289134438305709, + "grad_norm": 0.21409621834754944, + "learning_rate": 3.1974456220994314e-05, + "loss": 1.8211, + "step": 20490 + }, + { + "epoch": 6.289441375076734, + "grad_norm": 0.2241450846195221, + "learning_rate": 3.196981999546025e-05, + "loss": 1.7255, + "step": 20491 + }, + { + "epoch": 6.2897483118477595, + "grad_norm": 0.23141883313655853, + "learning_rate": 3.1965183948110985e-05, + "loss": 1.7695, + "step": 20492 + }, + { + "epoch": 6.290055248618785, + "grad_norm": 0.209358349442482, + "learning_rate": 3.196054807899236e-05, + "loss": 1.6808, + "step": 20493 + }, + { + "epoch": 6.290362185389809, + "grad_norm": 0.20730538666248322, + "learning_rate": 3.195591238815015e-05, + "loss": 1.6847, + "step": 20494 + }, + { + "epoch": 6.290669122160835, + "grad_norm": 0.2568998634815216, + "learning_rate": 3.195127687563021e-05, + "loss": 1.664, + "step": 20495 + }, + { + "epoch": 6.29097605893186, + "grad_norm": 0.238932803273201, + "learning_rate": 3.1946641541478316e-05, + "loss": 1.7166, + "step": 20496 + }, + { + "epoch": 6.291282995702885, + "grad_norm": 0.235393688082695, + "learning_rate": 3.19420063857403e-05, + "loss": 1.6572, + "step": 20497 + }, + { + "epoch": 6.291589932473911, + "grad_norm": 0.2888807952404022, + "learning_rate": 3.1937371408461944e-05, + "loss": 1.7484, + "step": 20498 + }, + { + "epoch": 6.291896869244935, + "grad_norm": 0.18588709831237793, + "learning_rate": 3.1932736609689096e-05, + "loss": 1.7027, + "step": 20499 + }, + { + "epoch": 6.29220380601596, + "grad_norm": 0.3065604865550995, + "learning_rate": 3.1928101989467514e-05, + "loss": 1.8051, 
+ "step": 20500 + }, + { + "epoch": 6.292510742786986, + "grad_norm": 0.2480497658252716, + "learning_rate": 3.192346754784304e-05, + "loss": 1.7749, + "step": 20501 + }, + { + "epoch": 6.292817679558011, + "grad_norm": 0.268686443567276, + "learning_rate": 3.1918833284861436e-05, + "loss": 1.7062, + "step": 20502 + }, + { + "epoch": 6.293124616329036, + "grad_norm": 0.337510883808136, + "learning_rate": 3.191419920056853e-05, + "loss": 1.745, + "step": 20503 + }, + { + "epoch": 6.293431553100062, + "grad_norm": 0.18532821536064148, + "learning_rate": 3.190956529501009e-05, + "loss": 1.7098, + "step": 20504 + }, + { + "epoch": 6.293738489871086, + "grad_norm": 0.27805468440055847, + "learning_rate": 3.1904931568231956e-05, + "loss": 1.7252, + "step": 20505 + }, + { + "epoch": 6.2940454266421115, + "grad_norm": 0.22137443721294403, + "learning_rate": 3.190029802027987e-05, + "loss": 1.7595, + "step": 20506 + }, + { + "epoch": 6.294352363413137, + "grad_norm": 0.23159445822238922, + "learning_rate": 3.189566465119968e-05, + "loss": 1.7503, + "step": 20507 + }, + { + "epoch": 6.294659300184162, + "grad_norm": 0.2089100182056427, + "learning_rate": 3.189103146103712e-05, + "loss": 1.7021, + "step": 20508 + }, + { + "epoch": 6.2949662369551875, + "grad_norm": 0.1985119879245758, + "learning_rate": 3.1886398449838e-05, + "loss": 1.7468, + "step": 20509 + }, + { + "epoch": 6.295273173726212, + "grad_norm": 0.18612028658390045, + "learning_rate": 3.188176561764812e-05, + "loss": 1.6657, + "step": 20510 + }, + { + "epoch": 6.295580110497237, + "grad_norm": 0.22453728318214417, + "learning_rate": 3.1877132964513226e-05, + "loss": 1.7223, + "step": 20511 + }, + { + "epoch": 6.295887047268263, + "grad_norm": 0.270304799079895, + "learning_rate": 3.187250049047916e-05, + "loss": 1.7548, + "step": 20512 + }, + { + "epoch": 6.296193984039288, + "grad_norm": 0.19762152433395386, + "learning_rate": 3.1867868195591643e-05, + "loss": 1.6945, + "step": 20513 + }, + { + "epoch": 
6.296500920810313, + "grad_norm": 0.25173795223236084, + "learning_rate": 3.1863236079896486e-05, + "loss": 1.7303, + "step": 20514 + }, + { + "epoch": 6.296807857581339, + "grad_norm": 0.2073308676481247, + "learning_rate": 3.185860414343945e-05, + "loss": 1.7327, + "step": 20515 + }, + { + "epoch": 6.297114794352363, + "grad_norm": 0.24174070358276367, + "learning_rate": 3.185397238626635e-05, + "loss": 1.7577, + "step": 20516 + }, + { + "epoch": 6.297421731123388, + "grad_norm": 0.1950366348028183, + "learning_rate": 3.1849340808422905e-05, + "loss": 1.7137, + "step": 20517 + }, + { + "epoch": 6.297728667894414, + "grad_norm": 0.23416653275489807, + "learning_rate": 3.1844709409954936e-05, + "loss": 1.7547, + "step": 20518 + }, + { + "epoch": 6.298035604665439, + "grad_norm": 0.1939592808485031, + "learning_rate": 3.184007819090817e-05, + "loss": 1.7215, + "step": 20519 + }, + { + "epoch": 6.298342541436464, + "grad_norm": 0.21807245910167694, + "learning_rate": 3.1835447151328405e-05, + "loss": 1.7021, + "step": 20520 + }, + { + "epoch": 6.298649478207489, + "grad_norm": 0.21653762459754944, + "learning_rate": 3.183081629126138e-05, + "loss": 1.7426, + "step": 20521 + }, + { + "epoch": 6.298956414978514, + "grad_norm": 0.20749153196811676, + "learning_rate": 3.18261856107529e-05, + "loss": 1.7302, + "step": 20522 + }, + { + "epoch": 6.2992633517495396, + "grad_norm": 0.23450545966625214, + "learning_rate": 3.182155510984869e-05, + "loss": 1.7414, + "step": 20523 + }, + { + "epoch": 6.299570288520565, + "grad_norm": 0.17081578075885773, + "learning_rate": 3.181692478859455e-05, + "loss": 1.7017, + "step": 20524 + }, + { + "epoch": 6.29987722529159, + "grad_norm": 0.20244698226451874, + "learning_rate": 3.18122946470362e-05, + "loss": 1.6765, + "step": 20525 + }, + { + "epoch": 6.300184162062616, + "grad_norm": 0.20153406262397766, + "learning_rate": 3.180766468521941e-05, + "loss": 1.7437, + "step": 20526 + }, + { + "epoch": 6.30049109883364, + "grad_norm": 
0.21135647594928741, + "learning_rate": 3.180303490318996e-05, + "loss": 1.7202, + "step": 20527 + }, + { + "epoch": 6.300798035604665, + "grad_norm": 0.20342735946178436, + "learning_rate": 3.1798405300993555e-05, + "loss": 1.7268, + "step": 20528 + }, + { + "epoch": 6.301104972375691, + "grad_norm": 0.21153734624385834, + "learning_rate": 3.1793775878676e-05, + "loss": 1.7455, + "step": 20529 + }, + { + "epoch": 6.301411909146716, + "grad_norm": 0.2197744995355606, + "learning_rate": 3.1789146636283015e-05, + "loss": 1.7876, + "step": 20530 + }, + { + "epoch": 6.301718845917741, + "grad_norm": 0.2236124575138092, + "learning_rate": 3.1784517573860356e-05, + "loss": 1.7454, + "step": 20531 + }, + { + "epoch": 6.302025782688766, + "grad_norm": 0.22071333229541779, + "learning_rate": 3.177988869145376e-05, + "loss": 1.7197, + "step": 20532 + }, + { + "epoch": 6.302332719459791, + "grad_norm": 0.20137591660022736, + "learning_rate": 3.177525998910901e-05, + "loss": 1.7153, + "step": 20533 + }, + { + "epoch": 6.3026396562308165, + "grad_norm": 0.18981720507144928, + "learning_rate": 3.17706314668718e-05, + "loss": 1.6948, + "step": 20534 + }, + { + "epoch": 6.302946593001842, + "grad_norm": 0.20803335309028625, + "learning_rate": 3.176600312478791e-05, + "loss": 1.7454, + "step": 20535 + }, + { + "epoch": 6.303253529772867, + "grad_norm": 0.2224191278219223, + "learning_rate": 3.176137496290305e-05, + "loss": 1.708, + "step": 20536 + }, + { + "epoch": 6.303560466543892, + "grad_norm": 0.21110501885414124, + "learning_rate": 3.175674698126298e-05, + "loss": 1.6976, + "step": 20537 + }, + { + "epoch": 6.303867403314917, + "grad_norm": 0.19902437925338745, + "learning_rate": 3.175211917991342e-05, + "loss": 1.7246, + "step": 20538 + }, + { + "epoch": 6.304174340085942, + "grad_norm": 0.1930927336215973, + "learning_rate": 3.174749155890013e-05, + "loss": 1.7849, + "step": 20539 + }, + { + "epoch": 6.304481276856968, + "grad_norm": 0.19350691139698029, + "learning_rate": 
3.174286411826881e-05, + "loss": 1.7441, + "step": 20540 + }, + { + "epoch": 6.304788213627993, + "grad_norm": 0.18532924354076385, + "learning_rate": 3.173823685806523e-05, + "loss": 1.6675, + "step": 20541 + }, + { + "epoch": 6.305095150399017, + "grad_norm": 0.18890263140201569, + "learning_rate": 3.173360977833508e-05, + "loss": 1.7889, + "step": 20542 + }, + { + "epoch": 6.305402087170043, + "grad_norm": 0.20418904721736908, + "learning_rate": 3.17289828791241e-05, + "loss": 1.8298, + "step": 20543 + }, + { + "epoch": 6.305709023941068, + "grad_norm": 0.2298857718706131, + "learning_rate": 3.172435616047804e-05, + "loss": 1.7889, + "step": 20544 + }, + { + "epoch": 6.306015960712093, + "grad_norm": 0.20661889016628265, + "learning_rate": 3.171972962244258e-05, + "loss": 1.74, + "step": 20545 + }, + { + "epoch": 6.306322897483119, + "grad_norm": 0.17712774872779846, + "learning_rate": 3.1715103265063496e-05, + "loss": 1.72, + "step": 20546 + }, + { + "epoch": 6.306629834254144, + "grad_norm": 0.16776354610919952, + "learning_rate": 3.1710477088386456e-05, + "loss": 1.6715, + "step": 20547 + }, + { + "epoch": 6.3069367710251685, + "grad_norm": 0.21919682621955872, + "learning_rate": 3.170585109245721e-05, + "loss": 1.7232, + "step": 20548 + }, + { + "epoch": 6.307243707796194, + "grad_norm": 0.2026829719543457, + "learning_rate": 3.170122527732144e-05, + "loss": 1.7551, + "step": 20549 + }, + { + "epoch": 6.307550644567219, + "grad_norm": 0.18783780932426453, + "learning_rate": 3.169659964302493e-05, + "loss": 1.7024, + "step": 20550 + }, + { + "epoch": 6.3078575813382445, + "grad_norm": 0.2058420479297638, + "learning_rate": 3.1691974189613316e-05, + "loss": 1.7006, + "step": 20551 + }, + { + "epoch": 6.30816451810927, + "grad_norm": 0.21351832151412964, + "learning_rate": 3.168734891713237e-05, + "loss": 1.7586, + "step": 20552 + }, + { + "epoch": 6.308471454880294, + "grad_norm": 0.19816654920578003, + "learning_rate": 3.168272382562776e-05, + "loss": 1.7532, 
+ "step": 20553 + }, + { + "epoch": 6.30877839165132, + "grad_norm": 0.18253186345100403, + "learning_rate": 3.16780989151452e-05, + "loss": 1.7413, + "step": 20554 + }, + { + "epoch": 6.309085328422345, + "grad_norm": 0.23097483813762665, + "learning_rate": 3.167347418573042e-05, + "loss": 1.7355, + "step": 20555 + }, + { + "epoch": 6.30939226519337, + "grad_norm": 0.1984725296497345, + "learning_rate": 3.166884963742911e-05, + "loss": 1.6754, + "step": 20556 + }, + { + "epoch": 6.309699201964396, + "grad_norm": 0.2385166734457016, + "learning_rate": 3.166422527028696e-05, + "loss": 1.7322, + "step": 20557 + }, + { + "epoch": 6.310006138735421, + "grad_norm": 0.23216524720191956, + "learning_rate": 3.165960108434971e-05, + "loss": 1.7426, + "step": 20558 + }, + { + "epoch": 6.310313075506445, + "grad_norm": 0.22017790377140045, + "learning_rate": 3.165497707966301e-05, + "loss": 1.6977, + "step": 20559 + }, + { + "epoch": 6.310620012277471, + "grad_norm": 0.2934584617614746, + "learning_rate": 3.165035325627257e-05, + "loss": 1.7252, + "step": 20560 + }, + { + "epoch": 6.310926949048496, + "grad_norm": 0.21830198168754578, + "learning_rate": 3.1645729614224126e-05, + "loss": 1.781, + "step": 20561 + }, + { + "epoch": 6.311233885819521, + "grad_norm": 0.3082836866378784, + "learning_rate": 3.1641106153563306e-05, + "loss": 1.8015, + "step": 20562 + }, + { + "epoch": 6.311540822590547, + "grad_norm": 0.22441358864307404, + "learning_rate": 3.163648287433586e-05, + "loss": 1.8058, + "step": 20563 + }, + { + "epoch": 6.311847759361571, + "grad_norm": 0.36623889207839966, + "learning_rate": 3.163185977658744e-05, + "loss": 1.7092, + "step": 20564 + }, + { + "epoch": 6.3121546961325965, + "grad_norm": 0.22231145203113556, + "learning_rate": 3.1627236860363755e-05, + "loss": 1.6432, + "step": 20565 + }, + { + "epoch": 6.312461632903622, + "grad_norm": 0.25871971249580383, + "learning_rate": 3.162261412571047e-05, + "loss": 1.7156, + "step": 20566 + }, + { + "epoch": 
6.312768569674647, + "grad_norm": 0.24574241042137146, + "learning_rate": 3.16179915726733e-05, + "loss": 1.7977, + "step": 20567 + }, + { + "epoch": 6.3130755064456725, + "grad_norm": 0.197379007935524, + "learning_rate": 3.1613369201297895e-05, + "loss": 1.6966, + "step": 20568 + }, + { + "epoch": 6.313382443216697, + "grad_norm": 0.2149469256401062, + "learning_rate": 3.1608747011629975e-05, + "loss": 1.7385, + "step": 20569 + }, + { + "epoch": 6.313689379987722, + "grad_norm": 0.21942345798015594, + "learning_rate": 3.1604125003715174e-05, + "loss": 1.7369, + "step": 20570 + }, + { + "epoch": 6.313996316758748, + "grad_norm": 0.20977036654949188, + "learning_rate": 3.1599503177599197e-05, + "loss": 1.7429, + "step": 20571 + }, + { + "epoch": 6.314303253529773, + "grad_norm": 0.20113405585289001, + "learning_rate": 3.159488153332772e-05, + "loss": 1.7163, + "step": 20572 + }, + { + "epoch": 6.314610190300798, + "grad_norm": 0.22031868994235992, + "learning_rate": 3.1590260070946414e-05, + "loss": 1.7085, + "step": 20573 + }, + { + "epoch": 6.314917127071823, + "grad_norm": 0.24137777090072632, + "learning_rate": 3.158563879050094e-05, + "loss": 1.7169, + "step": 20574 + }, + { + "epoch": 6.315224063842848, + "grad_norm": 0.20265905559062958, + "learning_rate": 3.1581017692036985e-05, + "loss": 1.7466, + "step": 20575 + }, + { + "epoch": 6.315531000613873, + "grad_norm": 0.2997782528400421, + "learning_rate": 3.1576396775600206e-05, + "loss": 1.7287, + "step": 20576 + }, + { + "epoch": 6.315837937384899, + "grad_norm": 0.19672340154647827, + "learning_rate": 3.157177604123628e-05, + "loss": 1.7121, + "step": 20577 + }, + { + "epoch": 6.316144874155924, + "grad_norm": 0.26618507504463196, + "learning_rate": 3.156715548899085e-05, + "loss": 1.6958, + "step": 20578 + }, + { + "epoch": 6.316451810926949, + "grad_norm": 0.18854503333568573, + "learning_rate": 3.156253511890959e-05, + "loss": 1.7751, + "step": 20579 + }, + { + "epoch": 6.316758747697974, + "grad_norm": 
0.2306061089038849, + "learning_rate": 3.155791493103819e-05, + "loss": 1.6853, + "step": 20580 + }, + { + "epoch": 6.317065684468999, + "grad_norm": 0.20650778710842133, + "learning_rate": 3.1553294925422254e-05, + "loss": 1.7021, + "step": 20581 + }, + { + "epoch": 6.3173726212400245, + "grad_norm": 0.19474658370018005, + "learning_rate": 3.1548675102107494e-05, + "loss": 1.7146, + "step": 20582 + }, + { + "epoch": 6.31767955801105, + "grad_norm": 0.2150747925043106, + "learning_rate": 3.154405546113952e-05, + "loss": 1.7473, + "step": 20583 + }, + { + "epoch": 6.317986494782075, + "grad_norm": 0.19304975867271423, + "learning_rate": 3.153943600256402e-05, + "loss": 1.7209, + "step": 20584 + }, + { + "epoch": 6.3182934315531, + "grad_norm": 0.22610948979854584, + "learning_rate": 3.153481672642662e-05, + "loss": 1.717, + "step": 20585 + }, + { + "epoch": 6.318600368324125, + "grad_norm": 0.18705105781555176, + "learning_rate": 3.1530197632773006e-05, + "loss": 1.7326, + "step": 20586 + }, + { + "epoch": 6.31890730509515, + "grad_norm": 0.25632867217063904, + "learning_rate": 3.152557872164878e-05, + "loss": 1.7391, + "step": 20587 + }, + { + "epoch": 6.319214241866176, + "grad_norm": 0.18723119795322418, + "learning_rate": 3.152095999309964e-05, + "loss": 1.7193, + "step": 20588 + }, + { + "epoch": 6.319521178637201, + "grad_norm": 0.1759091317653656, + "learning_rate": 3.1516341447171184e-05, + "loss": 1.7024, + "step": 20589 + }, + { + "epoch": 6.319828115408226, + "grad_norm": 0.1838626265525818, + "learning_rate": 3.1511723083909084e-05, + "loss": 1.7027, + "step": 20590 + }, + { + "epoch": 6.320135052179251, + "grad_norm": 0.2615656554698944, + "learning_rate": 3.1507104903358964e-05, + "loss": 1.7798, + "step": 20591 + }, + { + "epoch": 6.320441988950276, + "grad_norm": 0.18816477060317993, + "learning_rate": 3.150248690556649e-05, + "loss": 1.6778, + "step": 20592 + }, + { + "epoch": 6.320748925721301, + "grad_norm": 0.20011866092681885, + "learning_rate": 
3.149786909057728e-05, + "loss": 1.6653, + "step": 20593 + }, + { + "epoch": 6.321055862492327, + "grad_norm": 0.26681140065193176, + "learning_rate": 3.149325145843696e-05, + "loss": 1.7523, + "step": 20594 + }, + { + "epoch": 6.321362799263352, + "grad_norm": 0.2062411904335022, + "learning_rate": 3.1488634009191177e-05, + "loss": 1.7584, + "step": 20595 + }, + { + "epoch": 6.3216697360343765, + "grad_norm": 0.22355243563652039, + "learning_rate": 3.148401674288556e-05, + "loss": 1.7106, + "step": 20596 + }, + { + "epoch": 6.321976672805402, + "grad_norm": 0.20189255475997925, + "learning_rate": 3.147939965956576e-05, + "loss": 1.6775, + "step": 20597 + }, + { + "epoch": 6.322283609576427, + "grad_norm": 0.23753875494003296, + "learning_rate": 3.147478275927736e-05, + "loss": 1.7661, + "step": 20598 + }, + { + "epoch": 6.3225905463474525, + "grad_norm": 0.18658648431301117, + "learning_rate": 3.147016604206604e-05, + "loss": 1.7562, + "step": 20599 + }, + { + "epoch": 6.322897483118478, + "grad_norm": 0.2610020637512207, + "learning_rate": 3.146554950797738e-05, + "loss": 1.7217, + "step": 20600 + }, + { + "epoch": 6.323204419889503, + "grad_norm": 0.18329289555549622, + "learning_rate": 3.146093315705704e-05, + "loss": 1.7206, + "step": 20601 + }, + { + "epoch": 6.323511356660528, + "grad_norm": 0.2393725961446762, + "learning_rate": 3.1456316989350606e-05, + "loss": 1.7646, + "step": 20602 + }, + { + "epoch": 6.323818293431553, + "grad_norm": 0.23535947501659393, + "learning_rate": 3.1451701004903736e-05, + "loss": 1.7718, + "step": 20603 + }, + { + "epoch": 6.324125230202578, + "grad_norm": 0.23179253935813904, + "learning_rate": 3.1447085203762014e-05, + "loss": 1.7311, + "step": 20604 + }, + { + "epoch": 6.324432166973604, + "grad_norm": 0.24929681420326233, + "learning_rate": 3.144246958597109e-05, + "loss": 1.7728, + "step": 20605 + }, + { + "epoch": 6.324739103744629, + "grad_norm": 0.22520960867404938, + "learning_rate": 3.1437854151576526e-05, + "loss": 
1.749, + "step": 20606 + }, + { + "epoch": 6.3250460405156534, + "grad_norm": 0.3005391061306, + "learning_rate": 3.1433238900623997e-05, + "loss": 1.7725, + "step": 20607 + }, + { + "epoch": 6.325352977286679, + "grad_norm": 0.22625432908535004, + "learning_rate": 3.142862383315908e-05, + "loss": 1.7083, + "step": 20608 + }, + { + "epoch": 6.325659914057704, + "grad_norm": 0.28015029430389404, + "learning_rate": 3.142400894922737e-05, + "loss": 1.6862, + "step": 20609 + }, + { + "epoch": 6.3259668508287294, + "grad_norm": 0.2520587146282196, + "learning_rate": 3.141939424887451e-05, + "loss": 1.7059, + "step": 20610 + }, + { + "epoch": 6.326273787599755, + "grad_norm": 0.24668551981449127, + "learning_rate": 3.141477973214607e-05, + "loss": 1.6858, + "step": 20611 + }, + { + "epoch": 6.326580724370779, + "grad_norm": 0.2524704337120056, + "learning_rate": 3.1410165399087675e-05, + "loss": 1.6884, + "step": 20612 + }, + { + "epoch": 6.326887661141805, + "grad_norm": 0.18849264085292816, + "learning_rate": 3.1405551249744916e-05, + "loss": 1.6984, + "step": 20613 + }, + { + "epoch": 6.32719459791283, + "grad_norm": 0.2411552518606186, + "learning_rate": 3.140093728416342e-05, + "loss": 1.7455, + "step": 20614 + }, + { + "epoch": 6.327501534683855, + "grad_norm": 0.2268913835287094, + "learning_rate": 3.139632350238874e-05, + "loss": 1.7124, + "step": 20615 + }, + { + "epoch": 6.327808471454881, + "grad_norm": 0.3118770718574524, + "learning_rate": 3.1391709904466515e-05, + "loss": 1.7322, + "step": 20616 + }, + { + "epoch": 6.328115408225905, + "grad_norm": 0.25166428089141846, + "learning_rate": 3.1387096490442294e-05, + "loss": 1.7136, + "step": 20617 + }, + { + "epoch": 6.32842234499693, + "grad_norm": 0.2733297049999237, + "learning_rate": 3.138248326036172e-05, + "loss": 1.7939, + "step": 20618 + }, + { + "epoch": 6.328729281767956, + "grad_norm": 0.24583236873149872, + "learning_rate": 3.1377870214270334e-05, + "loss": 1.7105, + "step": 20619 + }, + { + 
"epoch": 6.329036218538981, + "grad_norm": 0.2533528506755829, + "learning_rate": 3.137325735221377e-05, + "loss": 1.7828, + "step": 20620 + }, + { + "epoch": 6.329343155310006, + "grad_norm": 0.27662715315818787, + "learning_rate": 3.136864467423758e-05, + "loss": 1.6969, + "step": 20621 + }, + { + "epoch": 6.329650092081032, + "grad_norm": 0.20107655227184296, + "learning_rate": 3.136403218038738e-05, + "loss": 1.6659, + "step": 20622 + }, + { + "epoch": 6.329957028852056, + "grad_norm": 0.21126115322113037, + "learning_rate": 3.135941987070872e-05, + "loss": 1.7372, + "step": 20623 + }, + { + "epoch": 6.3302639656230815, + "grad_norm": 0.1840609908103943, + "learning_rate": 3.1354807745247206e-05, + "loss": 1.7219, + "step": 20624 + }, + { + "epoch": 6.330570902394107, + "grad_norm": 0.23623648285865784, + "learning_rate": 3.135019580404842e-05, + "loss": 1.8059, + "step": 20625 + }, + { + "epoch": 6.330877839165132, + "grad_norm": 0.19853124022483826, + "learning_rate": 3.134558404715792e-05, + "loss": 1.7336, + "step": 20626 + }, + { + "epoch": 6.3311847759361575, + "grad_norm": 0.2261304259300232, + "learning_rate": 3.13409724746213e-05, + "loss": 1.7508, + "step": 20627 + }, + { + "epoch": 6.331491712707182, + "grad_norm": 0.1797952800989151, + "learning_rate": 3.1336361086484104e-05, + "loss": 1.6569, + "step": 20628 + }, + { + "epoch": 6.331798649478207, + "grad_norm": 0.21610359847545624, + "learning_rate": 3.133174988279195e-05, + "loss": 1.7093, + "step": 20629 + }, + { + "epoch": 6.332105586249233, + "grad_norm": 0.1818271279335022, + "learning_rate": 3.1327138863590365e-05, + "loss": 1.6951, + "step": 20630 + }, + { + "epoch": 6.332412523020258, + "grad_norm": 0.20425963401794434, + "learning_rate": 3.1322528028924956e-05, + "loss": 1.7399, + "step": 20631 + }, + { + "epoch": 6.332719459791283, + "grad_norm": 0.20357854664325714, + "learning_rate": 3.131791737884126e-05, + "loss": 1.693, + "step": 20632 + }, + { + "epoch": 6.333026396562309, + 
"grad_norm": 0.25307130813598633, + "learning_rate": 3.1313306913384874e-05, + "loss": 1.674, + "step": 20633 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.21596084535121918, + "learning_rate": 3.130869663260132e-05, + "loss": 1.7521, + "step": 20634 + }, + { + "epoch": 6.333640270104358, + "grad_norm": 0.24110902845859528, + "learning_rate": 3.1304086536536194e-05, + "loss": 1.6723, + "step": 20635 + }, + { + "epoch": 6.333947206875384, + "grad_norm": 0.21365956962108612, + "learning_rate": 3.129947662523503e-05, + "loss": 1.7702, + "step": 20636 + }, + { + "epoch": 6.334254143646409, + "grad_norm": 0.21873877942562103, + "learning_rate": 3.129486689874341e-05, + "loss": 1.7176, + "step": 20637 + }, + { + "epoch": 6.334561080417434, + "grad_norm": 0.2543679475784302, + "learning_rate": 3.129025735710687e-05, + "loss": 1.7733, + "step": 20638 + }, + { + "epoch": 6.334868017188459, + "grad_norm": 0.24591630697250366, + "learning_rate": 3.1285648000370996e-05, + "loss": 1.7212, + "step": 20639 + }, + { + "epoch": 6.335174953959484, + "grad_norm": 0.2453039139509201, + "learning_rate": 3.128103882858129e-05, + "loss": 1.7316, + "step": 20640 + }, + { + "epoch": 6.3354818907305095, + "grad_norm": 0.239897683262825, + "learning_rate": 3.127642984178334e-05, + "loss": 1.7495, + "step": 20641 + }, + { + "epoch": 6.335788827501535, + "grad_norm": 0.20719192922115326, + "learning_rate": 3.12718210400227e-05, + "loss": 1.7242, + "step": 20642 + }, + { + "epoch": 6.33609576427256, + "grad_norm": 0.1813955008983612, + "learning_rate": 3.126721242334487e-05, + "loss": 1.672, + "step": 20643 + }, + { + "epoch": 6.336402701043585, + "grad_norm": 0.20045650005340576, + "learning_rate": 3.126260399179546e-05, + "loss": 1.7854, + "step": 20644 + }, + { + "epoch": 6.33670963781461, + "grad_norm": 0.23010976612567902, + "learning_rate": 3.125799574541995e-05, + "loss": 1.7508, + "step": 20645 + }, + { + "epoch": 6.337016574585635, + "grad_norm": 0.1854519248008728, + 
"learning_rate": 3.1253387684263924e-05, + "loss": 1.7049, + "step": 20646 + }, + { + "epoch": 6.337323511356661, + "grad_norm": 0.2062511295080185, + "learning_rate": 3.1248779808372894e-05, + "loss": 1.6894, + "step": 20647 + }, + { + "epoch": 6.337630448127686, + "grad_norm": 0.19851341843605042, + "learning_rate": 3.124417211779244e-05, + "loss": 1.7332, + "step": 20648 + }, + { + "epoch": 6.337937384898711, + "grad_norm": 0.2099175751209259, + "learning_rate": 3.1239564612568054e-05, + "loss": 1.7577, + "step": 20649 + }, + { + "epoch": 6.338244321669736, + "grad_norm": 0.2152891904115677, + "learning_rate": 3.123495729274529e-05, + "loss": 1.7691, + "step": 20650 + }, + { + "epoch": 6.338551258440761, + "grad_norm": 0.19431835412979126, + "learning_rate": 3.123035015836967e-05, + "loss": 1.7035, + "step": 20651 + }, + { + "epoch": 6.338858195211786, + "grad_norm": 0.20863930881023407, + "learning_rate": 3.122574320948674e-05, + "loss": 1.7166, + "step": 20652 + }, + { + "epoch": 6.339165131982812, + "grad_norm": 0.17948369681835175, + "learning_rate": 3.122113644614201e-05, + "loss": 1.732, + "step": 20653 + }, + { + "epoch": 6.339472068753837, + "grad_norm": 0.2329161912202835, + "learning_rate": 3.121652986838103e-05, + "loss": 1.6934, + "step": 20654 + }, + { + "epoch": 6.3397790055248615, + "grad_norm": 0.23563681542873383, + "learning_rate": 3.12119234762493e-05, + "loss": 1.7329, + "step": 20655 + }, + { + "epoch": 6.340085942295887, + "grad_norm": 0.22654885053634644, + "learning_rate": 3.120731726979236e-05, + "loss": 1.767, + "step": 20656 + }, + { + "epoch": 6.340392879066912, + "grad_norm": 0.2507181465625763, + "learning_rate": 3.1202711249055715e-05, + "loss": 1.7071, + "step": 20657 + }, + { + "epoch": 6.3406998158379375, + "grad_norm": 0.20573864877223969, + "learning_rate": 3.1198105414084906e-05, + "loss": 1.7566, + "step": 20658 + }, + { + "epoch": 6.341006752608963, + "grad_norm": 0.23311644792556763, + "learning_rate": 
3.119349976492545e-05, + "loss": 1.6778, + "step": 20659 + }, + { + "epoch": 6.341313689379987, + "grad_norm": 0.18166053295135498, + "learning_rate": 3.118889430162283e-05, + "loss": 1.7109, + "step": 20660 + }, + { + "epoch": 6.341620626151013, + "grad_norm": 0.21054090559482574, + "learning_rate": 3.11842890242226e-05, + "loss": 1.7255, + "step": 20661 + }, + { + "epoch": 6.341927562922038, + "grad_norm": 0.19898973405361176, + "learning_rate": 3.1179683932770235e-05, + "loss": 1.7017, + "step": 20662 + }, + { + "epoch": 6.342234499693063, + "grad_norm": 0.17782434821128845, + "learning_rate": 3.117507902731127e-05, + "loss": 1.6858, + "step": 20663 + }, + { + "epoch": 6.342541436464089, + "grad_norm": 0.19286927580833435, + "learning_rate": 3.117047430789121e-05, + "loss": 1.707, + "step": 20664 + }, + { + "epoch": 6.342848373235114, + "grad_norm": 0.18578651547431946, + "learning_rate": 3.1165869774555565e-05, + "loss": 1.7331, + "step": 20665 + }, + { + "epoch": 6.343155310006138, + "grad_norm": 0.19728249311447144, + "learning_rate": 3.1161265427349826e-05, + "loss": 1.7165, + "step": 20666 + }, + { + "epoch": 6.343462246777164, + "grad_norm": 0.18240176141262054, + "learning_rate": 3.115666126631952e-05, + "loss": 1.7167, + "step": 20667 + }, + { + "epoch": 6.343769183548189, + "grad_norm": 0.1928495317697525, + "learning_rate": 3.115205729151011e-05, + "loss": 1.7431, + "step": 20668 + }, + { + "epoch": 6.344076120319214, + "grad_norm": 0.19459952414035797, + "learning_rate": 3.1147453502967125e-05, + "loss": 1.7294, + "step": 20669 + }, + { + "epoch": 6.34438305709024, + "grad_norm": 0.18829894065856934, + "learning_rate": 3.1142849900736046e-05, + "loss": 1.7512, + "step": 20670 + }, + { + "epoch": 6.344689993861264, + "grad_norm": 0.19678451120853424, + "learning_rate": 3.11382464848624e-05, + "loss": 1.673, + "step": 20671 + }, + { + "epoch": 6.3449969306322895, + "grad_norm": 0.22256550192832947, + "learning_rate": 3.1133643255391635e-05, + "loss": 
1.7044, + "step": 20672 + }, + { + "epoch": 6.345303867403315, + "grad_norm": 0.24741628766059875, + "learning_rate": 3.112904021236929e-05, + "loss": 1.7904, + "step": 20673 + }, + { + "epoch": 6.34561080417434, + "grad_norm": 0.20286159217357635, + "learning_rate": 3.11244373558408e-05, + "loss": 1.6976, + "step": 20674 + }, + { + "epoch": 6.3459177409453655, + "grad_norm": 0.2005387842655182, + "learning_rate": 3.11198346858517e-05, + "loss": 1.7083, + "step": 20675 + }, + { + "epoch": 6.346224677716391, + "grad_norm": 0.22312256693840027, + "learning_rate": 3.111523220244747e-05, + "loss": 1.7575, + "step": 20676 + }, + { + "epoch": 6.346531614487415, + "grad_norm": 0.2968841791152954, + "learning_rate": 3.111062990567356e-05, + "loss": 1.7813, + "step": 20677 + }, + { + "epoch": 6.346838551258441, + "grad_norm": 0.22900697588920593, + "learning_rate": 3.1106027795575496e-05, + "loss": 1.6818, + "step": 20678 + }, + { + "epoch": 6.347145488029466, + "grad_norm": 0.1912240833044052, + "learning_rate": 3.110142587219873e-05, + "loss": 1.7174, + "step": 20679 + }, + { + "epoch": 6.347452424800491, + "grad_norm": 0.20461280643939972, + "learning_rate": 3.1096824135588754e-05, + "loss": 1.6945, + "step": 20680 + }, + { + "epoch": 6.347759361571517, + "grad_norm": 0.19344913959503174, + "learning_rate": 3.109222258579103e-05, + "loss": 1.7064, + "step": 20681 + }, + { + "epoch": 6.348066298342541, + "grad_norm": 0.1833983063697815, + "learning_rate": 3.108762122285106e-05, + "loss": 1.702, + "step": 20682 + }, + { + "epoch": 6.348373235113566, + "grad_norm": 0.20344893634319305, + "learning_rate": 3.108302004681429e-05, + "loss": 1.7323, + "step": 20683 + }, + { + "epoch": 6.348680171884592, + "grad_norm": 0.18629617989063263, + "learning_rate": 3.107841905772622e-05, + "loss": 1.6841, + "step": 20684 + }, + { + "epoch": 6.348987108655617, + "grad_norm": 0.19279471039772034, + "learning_rate": 3.107381825563228e-05, + "loss": 1.7581, + "step": 20685 + }, + { + 
"epoch": 6.349294045426642, + "grad_norm": 0.21727058291435242, + "learning_rate": 3.106921764057798e-05, + "loss": 1.7231, + "step": 20686 + }, + { + "epoch": 6.349600982197667, + "grad_norm": 0.20952723920345306, + "learning_rate": 3.1064617212608747e-05, + "loss": 1.713, + "step": 20687 + }, + { + "epoch": 6.349907918968692, + "grad_norm": 0.2358582466840744, + "learning_rate": 3.10600169717701e-05, + "loss": 1.7291, + "step": 20688 + }, + { + "epoch": 6.350214855739718, + "grad_norm": 0.21846619248390198, + "learning_rate": 3.105541691810743e-05, + "loss": 1.7365, + "step": 20689 + }, + { + "epoch": 6.350521792510743, + "grad_norm": 0.22137843072414398, + "learning_rate": 3.1050817051666256e-05, + "loss": 1.7404, + "step": 20690 + }, + { + "epoch": 6.350828729281768, + "grad_norm": 0.2301674485206604, + "learning_rate": 3.1046217372492e-05, + "loss": 1.7422, + "step": 20691 + }, + { + "epoch": 6.351135666052793, + "grad_norm": 0.18955166637897491, + "learning_rate": 3.104161788063015e-05, + "loss": 1.7063, + "step": 20692 + }, + { + "epoch": 6.351442602823818, + "grad_norm": 0.21172095835208893, + "learning_rate": 3.103701857612614e-05, + "loss": 1.6856, + "step": 20693 + }, + { + "epoch": 6.351749539594843, + "grad_norm": 0.20921260118484497, + "learning_rate": 3.103241945902541e-05, + "loss": 1.7384, + "step": 20694 + }, + { + "epoch": 6.352056476365869, + "grad_norm": 0.21005603671073914, + "learning_rate": 3.102782052937345e-05, + "loss": 1.7118, + "step": 20695 + }, + { + "epoch": 6.352363413136894, + "grad_norm": 0.20888659358024597, + "learning_rate": 3.102322178721567e-05, + "loss": 1.7172, + "step": 20696 + }, + { + "epoch": 6.352670349907919, + "grad_norm": 0.194463849067688, + "learning_rate": 3.101862323259754e-05, + "loss": 1.6909, + "step": 20697 + }, + { + "epoch": 6.352977286678944, + "grad_norm": 0.20848685503005981, + "learning_rate": 3.1014024865564494e-05, + "loss": 1.7846, + "step": 20698 + }, + { + "epoch": 6.353284223449969, + 
"grad_norm": 0.18669761717319489, + "learning_rate": 3.100942668616201e-05, + "loss": 1.7542, + "step": 20699 + }, + { + "epoch": 6.3535911602209945, + "grad_norm": 0.23618464171886444, + "learning_rate": 3.100482869443547e-05, + "loss": 1.7292, + "step": 20700 + }, + { + "epoch": 6.35389809699202, + "grad_norm": 0.19389905035495758, + "learning_rate": 3.100023089043037e-05, + "loss": 1.6847, + "step": 20701 + }, + { + "epoch": 6.354205033763045, + "grad_norm": 0.20346343517303467, + "learning_rate": 3.09956332741921e-05, + "loss": 1.7096, + "step": 20702 + }, + { + "epoch": 6.35451197053407, + "grad_norm": 0.20825842022895813, + "learning_rate": 3.099103584576614e-05, + "loss": 1.6974, + "step": 20703 + }, + { + "epoch": 6.354818907305095, + "grad_norm": 0.2093508094549179, + "learning_rate": 3.0986438605197895e-05, + "loss": 1.6849, + "step": 20704 + }, + { + "epoch": 6.35512584407612, + "grad_norm": 0.2576633393764496, + "learning_rate": 3.098184155253282e-05, + "loss": 1.7974, + "step": 20705 + }, + { + "epoch": 6.355432780847146, + "grad_norm": 0.18197253346443176, + "learning_rate": 3.097724468781632e-05, + "loss": 1.6723, + "step": 20706 + }, + { + "epoch": 6.355739717618171, + "grad_norm": 0.24809512495994568, + "learning_rate": 3.0972648011093855e-05, + "loss": 1.7378, + "step": 20707 + }, + { + "epoch": 6.356046654389196, + "grad_norm": 0.2046923190355301, + "learning_rate": 3.0968051522410814e-05, + "loss": 1.7502, + "step": 20708 + }, + { + "epoch": 6.356353591160221, + "grad_norm": 0.20443019270896912, + "learning_rate": 3.096345522181265e-05, + "loss": 1.7179, + "step": 20709 + }, + { + "epoch": 6.356660527931246, + "grad_norm": 0.1906277984380722, + "learning_rate": 3.09588591093448e-05, + "loss": 1.7167, + "step": 20710 + }, + { + "epoch": 6.356967464702271, + "grad_norm": 0.20729197561740875, + "learning_rate": 3.095426318505263e-05, + "loss": 1.7193, + "step": 20711 + }, + { + "epoch": 6.357274401473297, + "grad_norm": 0.23446644842624664, + 
"learning_rate": 3.094966744898162e-05, + "loss": 1.7341, + "step": 20712 + }, + { + "epoch": 6.357581338244322, + "grad_norm": 0.18882590532302856, + "learning_rate": 3.094507190117715e-05, + "loss": 1.7001, + "step": 20713 + }, + { + "epoch": 6.3578882750153465, + "grad_norm": 0.27240705490112305, + "learning_rate": 3.094047654168465e-05, + "loss": 1.7641, + "step": 20714 + }, + { + "epoch": 6.358195211786372, + "grad_norm": 0.19616954028606415, + "learning_rate": 3.093588137054952e-05, + "loss": 1.751, + "step": 20715 + }, + { + "epoch": 6.358502148557397, + "grad_norm": 0.23402562737464905, + "learning_rate": 3.093128638781721e-05, + "loss": 1.7274, + "step": 20716 + }, + { + "epoch": 6.3588090853284225, + "grad_norm": 0.18189528584480286, + "learning_rate": 3.092669159353309e-05, + "loss": 1.7079, + "step": 20717 + }, + { + "epoch": 6.359116022099448, + "grad_norm": 0.21583771705627441, + "learning_rate": 3.092209698774259e-05, + "loss": 1.6811, + "step": 20718 + }, + { + "epoch": 6.359422958870473, + "grad_norm": 0.2477681040763855, + "learning_rate": 3.091750257049109e-05, + "loss": 1.6963, + "step": 20719 + }, + { + "epoch": 6.359729895641498, + "grad_norm": 0.2883109152317047, + "learning_rate": 3.091290834182403e-05, + "loss": 1.8349, + "step": 20720 + }, + { + "epoch": 6.360036832412523, + "grad_norm": 0.23407170176506042, + "learning_rate": 3.09083143017868e-05, + "loss": 1.7271, + "step": 20721 + }, + { + "epoch": 6.360343769183548, + "grad_norm": 0.2818833589553833, + "learning_rate": 3.090372045042479e-05, + "loss": 1.7852, + "step": 20722 + }, + { + "epoch": 6.360650705954574, + "grad_norm": 0.24415317177772522, + "learning_rate": 3.089912678778341e-05, + "loss": 1.6826, + "step": 20723 + }, + { + "epoch": 6.360957642725599, + "grad_norm": 0.26786303520202637, + "learning_rate": 3.0894533313908056e-05, + "loss": 1.7616, + "step": 20724 + }, + { + "epoch": 6.361264579496623, + "grad_norm": 0.3235633969306946, + "learning_rate": 3.088994002884411e-05, 
+ "loss": 1.7637, + "step": 20725 + }, + { + "epoch": 6.361571516267649, + "grad_norm": 0.18675416707992554, + "learning_rate": 3.0885346932637e-05, + "loss": 1.7037, + "step": 20726 + }, + { + "epoch": 6.361878453038674, + "grad_norm": 0.295802503824234, + "learning_rate": 3.0880754025332084e-05, + "loss": 1.7435, + "step": 20727 + }, + { + "epoch": 6.362185389809699, + "grad_norm": 0.18665561079978943, + "learning_rate": 3.0876161306974756e-05, + "loss": 1.684, + "step": 20728 + }, + { + "epoch": 6.362492326580725, + "grad_norm": 0.2530463635921478, + "learning_rate": 3.087156877761043e-05, + "loss": 1.7934, + "step": 20729 + }, + { + "epoch": 6.362799263351749, + "grad_norm": 0.17860126495361328, + "learning_rate": 3.086697643728445e-05, + "loss": 1.6977, + "step": 20730 + }, + { + "epoch": 6.3631062001227745, + "grad_norm": 0.20118845999240875, + "learning_rate": 3.086238428604223e-05, + "loss": 1.7241, + "step": 20731 + }, + { + "epoch": 6.3634131368938, + "grad_norm": 0.18811924755573273, + "learning_rate": 3.085779232392915e-05, + "loss": 1.6918, + "step": 20732 + }, + { + "epoch": 6.363720073664825, + "grad_norm": 0.1841908097267151, + "learning_rate": 3.085320055099058e-05, + "loss": 1.735, + "step": 20733 + }, + { + "epoch": 6.3640270104358505, + "grad_norm": 0.1956033855676651, + "learning_rate": 3.08486089672719e-05, + "loss": 1.7203, + "step": 20734 + }, + { + "epoch": 6.364333947206875, + "grad_norm": 0.19844500720500946, + "learning_rate": 3.084401757281851e-05, + "loss": 1.6767, + "step": 20735 + }, + { + "epoch": 6.3646408839779, + "grad_norm": 0.2018919438123703, + "learning_rate": 3.083942636767575e-05, + "loss": 1.6912, + "step": 20736 + }, + { + "epoch": 6.364947820748926, + "grad_norm": 0.18929271399974823, + "learning_rate": 3.083483535188901e-05, + "loss": 1.6838, + "step": 20737 + }, + { + "epoch": 6.365254757519951, + "grad_norm": 0.19833499193191528, + "learning_rate": 3.0830244525503674e-05, + "loss": 1.7139, + "step": 20738 + }, + { + 
"epoch": 6.365561694290976, + "grad_norm": 0.17029902338981628, + "learning_rate": 3.082565388856509e-05, + "loss": 1.6665, + "step": 20739 + }, + { + "epoch": 6.365868631062002, + "grad_norm": 0.19526802003383636, + "learning_rate": 3.082106344111861e-05, + "loss": 1.7021, + "step": 20740 + }, + { + "epoch": 6.366175567833026, + "grad_norm": 0.19061279296875, + "learning_rate": 3.081647318320966e-05, + "loss": 1.7134, + "step": 20741 + }, + { + "epoch": 6.366482504604051, + "grad_norm": 0.17782293260097504, + "learning_rate": 3.081188311488354e-05, + "loss": 1.741, + "step": 20742 + }, + { + "epoch": 6.366789441375077, + "grad_norm": 0.20002372562885284, + "learning_rate": 3.080729323618565e-05, + "loss": 1.6943, + "step": 20743 + }, + { + "epoch": 6.367096378146102, + "grad_norm": 0.22873486578464508, + "learning_rate": 3.080270354716134e-05, + "loss": 1.7223, + "step": 20744 + }, + { + "epoch": 6.367403314917127, + "grad_norm": 0.191136434674263, + "learning_rate": 3.079811404785595e-05, + "loss": 1.6774, + "step": 20745 + }, + { + "epoch": 6.367710251688152, + "grad_norm": 0.20446795225143433, + "learning_rate": 3.0793524738314874e-05, + "loss": 1.7443, + "step": 20746 + }, + { + "epoch": 6.368017188459177, + "grad_norm": 0.20668596029281616, + "learning_rate": 3.078893561858341e-05, + "loss": 1.7553, + "step": 20747 + }, + { + "epoch": 6.3683241252302025, + "grad_norm": 0.18445394933223724, + "learning_rate": 3.078434668870698e-05, + "loss": 1.7365, + "step": 20748 + }, + { + "epoch": 6.368631062001228, + "grad_norm": 0.1824318915605545, + "learning_rate": 3.077975794873088e-05, + "loss": 1.7248, + "step": 20749 + }, + { + "epoch": 6.368937998772253, + "grad_norm": 0.18452249467372894, + "learning_rate": 3.077516939870047e-05, + "loss": 1.7095, + "step": 20750 + }, + { + "epoch": 6.3692449355432785, + "grad_norm": 0.17254458367824554, + "learning_rate": 3.077058103866112e-05, + "loss": 1.6937, + "step": 20751 + }, + { + "epoch": 6.369551872314303, + 
"grad_norm": 0.2022976130247116, + "learning_rate": 3.0765992868658154e-05, + "loss": 1.7593, + "step": 20752 + }, + { + "epoch": 6.369858809085328, + "grad_norm": 0.19274397194385529, + "learning_rate": 3.076140488873691e-05, + "loss": 1.7288, + "step": 20753 + }, + { + "epoch": 6.370165745856354, + "grad_norm": 0.18847523629665375, + "learning_rate": 3.075681709894276e-05, + "loss": 1.7293, + "step": 20754 + }, + { + "epoch": 6.370472682627379, + "grad_norm": 0.21054589748382568, + "learning_rate": 3.075222949932101e-05, + "loss": 1.7688, + "step": 20755 + }, + { + "epoch": 6.370779619398404, + "grad_norm": 0.16934558749198914, + "learning_rate": 3.0747642089917005e-05, + "loss": 1.7092, + "step": 20756 + }, + { + "epoch": 6.371086556169429, + "grad_norm": 0.19154684245586395, + "learning_rate": 3.0743054870776075e-05, + "loss": 1.6827, + "step": 20757 + }, + { + "epoch": 6.371393492940454, + "grad_norm": 0.2622900605201721, + "learning_rate": 3.0738467841943594e-05, + "loss": 1.748, + "step": 20758 + }, + { + "epoch": 6.371700429711479, + "grad_norm": 0.1767888218164444, + "learning_rate": 3.073388100346484e-05, + "loss": 1.717, + "step": 20759 + }, + { + "epoch": 6.372007366482505, + "grad_norm": 0.21692602336406708, + "learning_rate": 3.072929435538518e-05, + "loss": 1.7543, + "step": 20760 + }, + { + "epoch": 6.37231430325353, + "grad_norm": 0.19853977859020233, + "learning_rate": 3.0724707897749926e-05, + "loss": 1.7599, + "step": 20761 + }, + { + "epoch": 6.3726212400245545, + "grad_norm": 0.1904703676700592, + "learning_rate": 3.0720121630604396e-05, + "loss": 1.7094, + "step": 20762 + }, + { + "epoch": 6.37292817679558, + "grad_norm": 0.1961483359336853, + "learning_rate": 3.071553555399395e-05, + "loss": 1.7363, + "step": 20763 + }, + { + "epoch": 6.373235113566605, + "grad_norm": 0.16419392824172974, + "learning_rate": 3.071094966796385e-05, + "loss": 1.7073, + "step": 20764 + }, + { + "epoch": 6.3735420503376305, + "grad_norm": 0.1784946471452713, + 
"learning_rate": 3.0706363972559476e-05, + "loss": 1.699, + "step": 20765 + }, + { + "epoch": 6.373848987108656, + "grad_norm": 0.19472888112068176, + "learning_rate": 3.070177846782611e-05, + "loss": 1.7541, + "step": 20766 + }, + { + "epoch": 6.37415592387968, + "grad_norm": 0.2355004847049713, + "learning_rate": 3.0697193153809076e-05, + "loss": 1.7389, + "step": 20767 + }, + { + "epoch": 6.374462860650706, + "grad_norm": 0.1956906020641327, + "learning_rate": 3.069260803055369e-05, + "loss": 1.7197, + "step": 20768 + }, + { + "epoch": 6.374769797421731, + "grad_norm": 0.21212655305862427, + "learning_rate": 3.068802309810529e-05, + "loss": 1.7291, + "step": 20769 + }, + { + "epoch": 6.375076734192756, + "grad_norm": 0.22920182347297668, + "learning_rate": 3.068343835650914e-05, + "loss": 1.7397, + "step": 20770 + }, + { + "epoch": 6.375383670963782, + "grad_norm": 0.2143404483795166, + "learning_rate": 3.0678853805810605e-05, + "loss": 1.76, + "step": 20771 + }, + { + "epoch": 6.375690607734807, + "grad_norm": 0.1848321557044983, + "learning_rate": 3.067426944605492e-05, + "loss": 1.7127, + "step": 20772 + }, + { + "epoch": 6.3759975445058314, + "grad_norm": 0.23339331150054932, + "learning_rate": 3.0669685277287465e-05, + "loss": 1.7828, + "step": 20773 + }, + { + "epoch": 6.376304481276857, + "grad_norm": 0.19590741395950317, + "learning_rate": 3.066510129955349e-05, + "loss": 1.7224, + "step": 20774 + }, + { + "epoch": 6.376611418047882, + "grad_norm": 0.19986604154109955, + "learning_rate": 3.066051751289833e-05, + "loss": 1.7412, + "step": 20775 + }, + { + "epoch": 6.3769183548189075, + "grad_norm": 0.18629087507724762, + "learning_rate": 3.0655933917367266e-05, + "loss": 1.695, + "step": 20776 + }, + { + "epoch": 6.377225291589933, + "grad_norm": 0.2248111218214035, + "learning_rate": 3.0651350513005605e-05, + "loss": 1.7685, + "step": 20777 + }, + { + "epoch": 6.377532228360957, + "grad_norm": 0.1803683638572693, + "learning_rate": 3.064676729985864e-05, 
+ "loss": 1.7206, + "step": 20778 + }, + { + "epoch": 6.377839165131983, + "grad_norm": 0.23836754262447357, + "learning_rate": 3.064218427797165e-05, + "loss": 1.7428, + "step": 20779 + }, + { + "epoch": 6.378146101903008, + "grad_norm": 0.22549279034137726, + "learning_rate": 3.063760144738996e-05, + "loss": 1.7314, + "step": 20780 + }, + { + "epoch": 6.378453038674033, + "grad_norm": 0.20714345574378967, + "learning_rate": 3.063301880815882e-05, + "loss": 1.7179, + "step": 20781 + }, + { + "epoch": 6.378759975445059, + "grad_norm": 0.17024052143096924, + "learning_rate": 3.0628436360323565e-05, + "loss": 1.6602, + "step": 20782 + }, + { + "epoch": 6.379066912216084, + "grad_norm": 0.20378601551055908, + "learning_rate": 3.062385410392943e-05, + "loss": 1.7708, + "step": 20783 + }, + { + "epoch": 6.379373848987108, + "grad_norm": 0.1885673850774765, + "learning_rate": 3.0619272039021734e-05, + "loss": 1.7034, + "step": 20784 + }, + { + "epoch": 6.379680785758134, + "grad_norm": 0.18746556341648102, + "learning_rate": 3.0614690165645746e-05, + "loss": 1.6946, + "step": 20785 + }, + { + "epoch": 6.379987722529159, + "grad_norm": 0.19569392502307892, + "learning_rate": 3.061010848384677e-05, + "loss": 1.7298, + "step": 20786 + }, + { + "epoch": 6.380294659300184, + "grad_norm": 0.21114139258861542, + "learning_rate": 3.0605526993670046e-05, + "loss": 1.795, + "step": 20787 + }, + { + "epoch": 6.38060159607121, + "grad_norm": 0.20940302312374115, + "learning_rate": 3.06009456951609e-05, + "loss": 1.6747, + "step": 20788 + }, + { + "epoch": 6.380908532842234, + "grad_norm": 0.21008993685245514, + "learning_rate": 3.059636458836455e-05, + "loss": 1.7219, + "step": 20789 + }, + { + "epoch": 6.3812154696132595, + "grad_norm": 0.17642457783222198, + "learning_rate": 3.0591783673326304e-05, + "loss": 1.6555, + "step": 20790 + }, + { + "epoch": 6.381522406384285, + "grad_norm": 0.2786177396774292, + "learning_rate": 3.058720295009143e-05, + "loss": 1.8463, + "step": 20791 + 
}, + { + "epoch": 6.38182934315531, + "grad_norm": 0.21209503710269928, + "learning_rate": 3.058262241870521e-05, + "loss": 1.6848, + "step": 20792 + }, + { + "epoch": 6.3821362799263355, + "grad_norm": 0.1880561262369156, + "learning_rate": 3.057804207921287e-05, + "loss": 1.7401, + "step": 20793 + }, + { + "epoch": 6.382443216697361, + "grad_norm": 0.22108516097068787, + "learning_rate": 3.0573461931659726e-05, + "loss": 1.7482, + "step": 20794 + }, + { + "epoch": 6.382750153468385, + "grad_norm": 0.2161533385515213, + "learning_rate": 3.0568881976091006e-05, + "loss": 1.7425, + "step": 20795 + }, + { + "epoch": 6.383057090239411, + "grad_norm": 0.22933612763881683, + "learning_rate": 3.0564302212551975e-05, + "loss": 1.7424, + "step": 20796 + }, + { + "epoch": 6.383364027010436, + "grad_norm": 0.19572989642620087, + "learning_rate": 3.0559722641087916e-05, + "loss": 1.6763, + "step": 20797 + }, + { + "epoch": 6.383670963781461, + "grad_norm": 0.2181084007024765, + "learning_rate": 3.0555143261744056e-05, + "loss": 1.7164, + "step": 20798 + }, + { + "epoch": 6.383977900552487, + "grad_norm": 0.1927991509437561, + "learning_rate": 3.055056407456569e-05, + "loss": 1.6833, + "step": 20799 + }, + { + "epoch": 6.384284837323511, + "grad_norm": 0.20569704473018646, + "learning_rate": 3.0545985079598025e-05, + "loss": 1.7716, + "step": 20800 + }, + { + "epoch": 6.384591774094536, + "grad_norm": 0.1856541931629181, + "learning_rate": 3.054140627688635e-05, + "loss": 1.6939, + "step": 20801 + }, + { + "epoch": 6.384898710865562, + "grad_norm": 0.2450970858335495, + "learning_rate": 3.05368276664759e-05, + "loss": 1.8197, + "step": 20802 + }, + { + "epoch": 6.385205647636587, + "grad_norm": 0.23325784504413605, + "learning_rate": 3.053224924841194e-05, + "loss": 1.7195, + "step": 20803 + }, + { + "epoch": 6.385512584407612, + "grad_norm": 0.19614358246326447, + "learning_rate": 3.052767102273968e-05, + "loss": 1.6966, + "step": 20804 + }, + { + "epoch": 6.385819521178637, 
+ "grad_norm": 0.20615628361701965, + "learning_rate": 3.0523092989504415e-05, + "loss": 1.7429, + "step": 20805 + }, + { + "epoch": 6.386126457949662, + "grad_norm": 0.18418943881988525, + "learning_rate": 3.0518515148751336e-05, + "loss": 1.7612, + "step": 20806 + }, + { + "epoch": 6.3864333947206875, + "grad_norm": 0.17176245152950287, + "learning_rate": 3.0513937500525725e-05, + "loss": 1.6918, + "step": 20807 + }, + { + "epoch": 6.386740331491713, + "grad_norm": 0.22239255905151367, + "learning_rate": 3.0509360044872787e-05, + "loss": 1.8072, + "step": 20808 + }, + { + "epoch": 6.387047268262738, + "grad_norm": 0.20312704145908356, + "learning_rate": 3.0504782781837798e-05, + "loss": 1.7348, + "step": 20809 + }, + { + "epoch": 6.387354205033763, + "grad_norm": 0.23198208212852478, + "learning_rate": 3.0500205711465958e-05, + "loss": 1.7516, + "step": 20810 + }, + { + "epoch": 6.387661141804788, + "grad_norm": 0.2244081050157547, + "learning_rate": 3.0495628833802526e-05, + "loss": 1.731, + "step": 20811 + }, + { + "epoch": 6.387968078575813, + "grad_norm": 0.18282169103622437, + "learning_rate": 3.0491052148892717e-05, + "loss": 1.6743, + "step": 20812 + }, + { + "epoch": 6.388275015346839, + "grad_norm": 0.19108405709266663, + "learning_rate": 3.0486475656781753e-05, + "loss": 1.7485, + "step": 20813 + }, + { + "epoch": 6.388581952117864, + "grad_norm": 0.20574834942817688, + "learning_rate": 3.0481899357514898e-05, + "loss": 1.6979, + "step": 20814 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 0.21263298392295837, + "learning_rate": 3.047732325113733e-05, + "loss": 1.687, + "step": 20815 + }, + { + "epoch": 6.389195825659914, + "grad_norm": 0.22646664083003998, + "learning_rate": 3.047274733769432e-05, + "loss": 1.7593, + "step": 20816 + }, + { + "epoch": 6.389502762430939, + "grad_norm": 0.1846906542778015, + "learning_rate": 3.046817161723104e-05, + "loss": 1.7271, + "step": 20817 + }, + { + "epoch": 6.389809699201964, + "grad_norm": 
0.1965247541666031, + "learning_rate": 3.0463596089792746e-05, + "loss": 1.7121, + "step": 20818 + }, + { + "epoch": 6.39011663597299, + "grad_norm": 0.255577951669693, + "learning_rate": 3.045902075542464e-05, + "loss": 1.7311, + "step": 20819 + }, + { + "epoch": 6.390423572744015, + "grad_norm": 0.1837676465511322, + "learning_rate": 3.0454445614171966e-05, + "loss": 1.7177, + "step": 20820 + }, + { + "epoch": 6.3907305095150395, + "grad_norm": 0.24845893681049347, + "learning_rate": 3.0449870666079895e-05, + "loss": 1.6902, + "step": 20821 + }, + { + "epoch": 6.391037446286065, + "grad_norm": 0.28572577238082886, + "learning_rate": 3.0445295911193678e-05, + "loss": 1.7942, + "step": 20822 + }, + { + "epoch": 6.39134438305709, + "grad_norm": 0.20460839569568634, + "learning_rate": 3.044072134955849e-05, + "loss": 1.6747, + "step": 20823 + }, + { + "epoch": 6.3916513198281155, + "grad_norm": 0.3547010123729706, + "learning_rate": 3.0436146981219565e-05, + "loss": 1.7359, + "step": 20824 + }, + { + "epoch": 6.391958256599141, + "grad_norm": 0.20490451157093048, + "learning_rate": 3.04315728062221e-05, + "loss": 1.6863, + "step": 20825 + }, + { + "epoch": 6.392265193370166, + "grad_norm": 0.25874415040016174, + "learning_rate": 3.0426998824611307e-05, + "loss": 1.6798, + "step": 20826 + }, + { + "epoch": 6.392572130141191, + "grad_norm": 0.27858632802963257, + "learning_rate": 3.0422425036432378e-05, + "loss": 1.6943, + "step": 20827 + }, + { + "epoch": 6.392879066912216, + "grad_norm": 0.20951922237873077, + "learning_rate": 3.041785144173054e-05, + "loss": 1.7025, + "step": 20828 + }, + { + "epoch": 6.393186003683241, + "grad_norm": 0.3158397674560547, + "learning_rate": 3.0413278040550952e-05, + "loss": 1.7193, + "step": 20829 + }, + { + "epoch": 6.393492940454267, + "grad_norm": 0.18556484580039978, + "learning_rate": 3.0408704832938824e-05, + "loss": 1.7017, + "step": 20830 + }, + { + "epoch": 6.393799877225292, + "grad_norm": 0.31651169061660767, + 
"learning_rate": 3.0404131818939376e-05, + "loss": 1.7716, + "step": 20831 + }, + { + "epoch": 6.394106813996316, + "grad_norm": 0.2850388288497925, + "learning_rate": 3.0399558998597765e-05, + "loss": 1.7144, + "step": 20832 + }, + { + "epoch": 6.394413750767342, + "grad_norm": 0.19256308674812317, + "learning_rate": 3.0394986371959223e-05, + "loss": 1.6603, + "step": 20833 + }, + { + "epoch": 6.394720687538367, + "grad_norm": 0.2654922604560852, + "learning_rate": 3.0390413939068896e-05, + "loss": 1.6825, + "step": 20834 + }, + { + "epoch": 6.395027624309392, + "grad_norm": 0.19514231383800507, + "learning_rate": 3.0385841699971997e-05, + "loss": 1.7226, + "step": 20835 + }, + { + "epoch": 6.395334561080418, + "grad_norm": 0.27765151858329773, + "learning_rate": 3.0381269654713702e-05, + "loss": 1.7599, + "step": 20836 + }, + { + "epoch": 6.395641497851442, + "grad_norm": 0.2056504338979721, + "learning_rate": 3.0376697803339215e-05, + "loss": 1.7237, + "step": 20837 + }, + { + "epoch": 6.3959484346224675, + "grad_norm": 0.22516649961471558, + "learning_rate": 3.0372126145893688e-05, + "loss": 1.7566, + "step": 20838 + }, + { + "epoch": 6.396255371393493, + "grad_norm": 0.17632099986076355, + "learning_rate": 3.0367554682422327e-05, + "loss": 1.7014, + "step": 20839 + }, + { + "epoch": 6.396562308164518, + "grad_norm": 0.21872831881046295, + "learning_rate": 3.036298341297028e-05, + "loss": 1.6935, + "step": 20840 + }, + { + "epoch": 6.3968692449355435, + "grad_norm": 0.22132672369480133, + "learning_rate": 3.0358412337582752e-05, + "loss": 1.6735, + "step": 20841 + }, + { + "epoch": 6.397176181706568, + "grad_norm": 0.17865684628486633, + "learning_rate": 3.0353841456304895e-05, + "loss": 1.7097, + "step": 20842 + }, + { + "epoch": 6.397483118477593, + "grad_norm": 0.2069701999425888, + "learning_rate": 3.0349270769181914e-05, + "loss": 1.7592, + "step": 20843 + }, + { + "epoch": 6.397790055248619, + "grad_norm": 0.19800925254821777, + "learning_rate": 
3.034470027625893e-05, + "loss": 1.6943, + "step": 20844 + }, + { + "epoch": 6.398096992019644, + "grad_norm": 0.24116787314414978, + "learning_rate": 3.0340129977581165e-05, + "loss": 1.7126, + "step": 20845 + }, + { + "epoch": 6.398403928790669, + "grad_norm": 0.1995212435722351, + "learning_rate": 3.033555987319375e-05, + "loss": 1.75, + "step": 20846 + }, + { + "epoch": 6.398710865561695, + "grad_norm": 0.23717111349105835, + "learning_rate": 3.0330989963141843e-05, + "loss": 1.7338, + "step": 20847 + }, + { + "epoch": 6.399017802332719, + "grad_norm": 0.18372474610805511, + "learning_rate": 3.0326420247470643e-05, + "loss": 1.7034, + "step": 20848 + }, + { + "epoch": 6.399324739103744, + "grad_norm": 0.25953924655914307, + "learning_rate": 3.0321850726225265e-05, + "loss": 1.731, + "step": 20849 + }, + { + "epoch": 6.39963167587477, + "grad_norm": 0.24846702814102173, + "learning_rate": 3.031728139945092e-05, + "loss": 1.7559, + "step": 20850 + }, + { + "epoch": 6.399938612645795, + "grad_norm": 0.20783887803554535, + "learning_rate": 3.0312712267192713e-05, + "loss": 1.7229, + "step": 20851 + }, + { + "epoch": 6.4002455494168204, + "grad_norm": 0.1904737949371338, + "learning_rate": 3.030814332949583e-05, + "loss": 1.6986, + "step": 20852 + }, + { + "epoch": 6.400552486187845, + "grad_norm": 0.2275397777557373, + "learning_rate": 3.030357458640541e-05, + "loss": 1.708, + "step": 20853 + }, + { + "epoch": 6.40085942295887, + "grad_norm": 0.20119737088680267, + "learning_rate": 3.0299006037966628e-05, + "loss": 1.7727, + "step": 20854 + }, + { + "epoch": 6.401166359729896, + "grad_norm": 0.17214249074459076, + "learning_rate": 3.0294437684224596e-05, + "loss": 1.6674, + "step": 20855 + }, + { + "epoch": 6.401473296500921, + "grad_norm": 0.21268978714942932, + "learning_rate": 3.02898695252245e-05, + "loss": 1.7182, + "step": 20856 + }, + { + "epoch": 6.401780233271946, + "grad_norm": 0.19911682605743408, + "learning_rate": 3.0285301561011448e-05, + "loss": 
1.6861, + "step": 20857 + }, + { + "epoch": 6.402087170042972, + "grad_norm": 0.194064199924469, + "learning_rate": 3.0280733791630613e-05, + "loss": 1.6768, + "step": 20858 + }, + { + "epoch": 6.402394106813996, + "grad_norm": 0.17554323375225067, + "learning_rate": 3.027616621712711e-05, + "loss": 1.6987, + "step": 20859 + }, + { + "epoch": 6.402701043585021, + "grad_norm": 0.205257385969162, + "learning_rate": 3.027159883754611e-05, + "loss": 1.7951, + "step": 20860 + }, + { + "epoch": 6.403007980356047, + "grad_norm": 0.1766849011182785, + "learning_rate": 3.0267031652932743e-05, + "loss": 1.7157, + "step": 20861 + }, + { + "epoch": 6.403314917127072, + "grad_norm": 0.17106789350509644, + "learning_rate": 3.0262464663332106e-05, + "loss": 1.685, + "step": 20862 + }, + { + "epoch": 6.403621853898097, + "grad_norm": 0.17380768060684204, + "learning_rate": 3.0257897868789377e-05, + "loss": 1.708, + "step": 20863 + }, + { + "epoch": 6.403928790669122, + "grad_norm": 0.15817396342754364, + "learning_rate": 3.0253331269349662e-05, + "loss": 1.6629, + "step": 20864 + }, + { + "epoch": 6.404235727440147, + "grad_norm": 0.18253934383392334, + "learning_rate": 3.0248764865058122e-05, + "loss": 1.6877, + "step": 20865 + }, + { + "epoch": 6.4045426642111725, + "grad_norm": 0.20645618438720703, + "learning_rate": 3.0244198655959843e-05, + "loss": 1.7238, + "step": 20866 + }, + { + "epoch": 6.404849600982198, + "grad_norm": 0.2216680645942688, + "learning_rate": 3.0239632642099992e-05, + "loss": 1.7721, + "step": 20867 + }, + { + "epoch": 6.405156537753223, + "grad_norm": 0.21479755640029907, + "learning_rate": 3.023506682352365e-05, + "loss": 1.6686, + "step": 20868 + }, + { + "epoch": 6.4054634745242485, + "grad_norm": 0.21274925768375397, + "learning_rate": 3.0230501200275974e-05, + "loss": 1.7245, + "step": 20869 + }, + { + "epoch": 6.405770411295273, + "grad_norm": 0.19894039630889893, + "learning_rate": 3.0225935772402064e-05, + "loss": 1.6734, + "step": 20870 + }, + { 
+ "epoch": 6.406077348066298, + "grad_norm": 0.24450170993804932, + "learning_rate": 3.022137053994707e-05, + "loss": 1.7103, + "step": 20871 + }, + { + "epoch": 6.406384284837324, + "grad_norm": 0.18289846181869507, + "learning_rate": 3.0216805502956057e-05, + "loss": 1.7866, + "step": 20872 + }, + { + "epoch": 6.406691221608349, + "grad_norm": 0.2884466350078583, + "learning_rate": 3.021224066147419e-05, + "loss": 1.7817, + "step": 20873 + }, + { + "epoch": 6.406998158379374, + "grad_norm": 0.21871373057365417, + "learning_rate": 3.0207676015546537e-05, + "loss": 1.6871, + "step": 20874 + }, + { + "epoch": 6.407305095150399, + "grad_norm": 0.239889994263649, + "learning_rate": 3.0203111565218244e-05, + "loss": 1.6412, + "step": 20875 + }, + { + "epoch": 6.407612031921424, + "grad_norm": 0.26960206031799316, + "learning_rate": 3.019854731053441e-05, + "loss": 1.7537, + "step": 20876 + }, + { + "epoch": 6.407918968692449, + "grad_norm": 0.32872483134269714, + "learning_rate": 3.019398325154013e-05, + "loss": 1.7718, + "step": 20877 + }, + { + "epoch": 6.408225905463475, + "grad_norm": 0.27766308188438416, + "learning_rate": 3.018941938828053e-05, + "loss": 1.7537, + "step": 20878 + }, + { + "epoch": 6.4085328422345, + "grad_norm": 0.1989286094903946, + "learning_rate": 3.0184855720800674e-05, + "loss": 1.7373, + "step": 20879 + }, + { + "epoch": 6.4088397790055245, + "grad_norm": 0.19748768210411072, + "learning_rate": 3.0180292249145703e-05, + "loss": 1.6821, + "step": 20880 + }, + { + "epoch": 6.40914671577655, + "grad_norm": 0.20632879436016083, + "learning_rate": 3.0175728973360694e-05, + "loss": 1.7641, + "step": 20881 + }, + { + "epoch": 6.409453652547575, + "grad_norm": 0.23808124661445618, + "learning_rate": 3.017116589349076e-05, + "loss": 1.7434, + "step": 20882 + }, + { + "epoch": 6.4097605893186005, + "grad_norm": 0.265514612197876, + "learning_rate": 3.0166603009580974e-05, + "loss": 1.7877, + "step": 20883 + }, + { + "epoch": 6.410067526089626, + 
"grad_norm": 0.21031250059604645, + "learning_rate": 3.0162040321676465e-05, + "loss": 1.738, + "step": 20884 + }, + { + "epoch": 6.41037446286065, + "grad_norm": 0.3011578619480133, + "learning_rate": 3.015747782982228e-05, + "loss": 1.7063, + "step": 20885 + }, + { + "epoch": 6.410681399631676, + "grad_norm": 0.28601503372192383, + "learning_rate": 3.015291553406353e-05, + "loss": 1.7021, + "step": 20886 + }, + { + "epoch": 6.410988336402701, + "grad_norm": 0.2433992624282837, + "learning_rate": 3.014835343444531e-05, + "loss": 1.6887, + "step": 20887 + }, + { + "epoch": 6.411295273173726, + "grad_norm": 0.3342660963535309, + "learning_rate": 3.014379153101269e-05, + "loss": 1.7798, + "step": 20888 + }, + { + "epoch": 6.411602209944752, + "grad_norm": 0.2390800267457962, + "learning_rate": 3.0139229823810757e-05, + "loss": 1.774, + "step": 20889 + }, + { + "epoch": 6.411909146715777, + "grad_norm": 0.2659217417240143, + "learning_rate": 3.0134668312884613e-05, + "loss": 1.7396, + "step": 20890 + }, + { + "epoch": 6.412216083486801, + "grad_norm": 0.22885620594024658, + "learning_rate": 3.0130106998279294e-05, + "loss": 1.7303, + "step": 20891 + }, + { + "epoch": 6.412523020257827, + "grad_norm": 0.20651856064796448, + "learning_rate": 3.0125545880039925e-05, + "loss": 1.7796, + "step": 20892 + }, + { + "epoch": 6.412829957028852, + "grad_norm": 0.26611828804016113, + "learning_rate": 3.0120984958211552e-05, + "loss": 1.7019, + "step": 20893 + }, + { + "epoch": 6.413136893799877, + "grad_norm": 0.2526776194572449, + "learning_rate": 3.0116424232839258e-05, + "loss": 1.7062, + "step": 20894 + }, + { + "epoch": 6.413443830570903, + "grad_norm": 0.2087634801864624, + "learning_rate": 3.0111863703968128e-05, + "loss": 1.7011, + "step": 20895 + }, + { + "epoch": 6.413750767341927, + "grad_norm": 0.20656780898571014, + "learning_rate": 3.0107303371643197e-05, + "loss": 1.7637, + "step": 20896 + }, + { + "epoch": 6.4140577041129525, + "grad_norm": 0.2083009034395218, + 
"learning_rate": 3.010274323590956e-05, + "loss": 1.7213, + "step": 20897 + }, + { + "epoch": 6.414364640883978, + "grad_norm": 0.22496090829372406, + "learning_rate": 3.0098183296812277e-05, + "loss": 1.7793, + "step": 20898 + }, + { + "epoch": 6.414671577655003, + "grad_norm": 0.2601132392883301, + "learning_rate": 3.0093623554396416e-05, + "loss": 1.8358, + "step": 20899 + }, + { + "epoch": 6.4149785144260285, + "grad_norm": 0.2364497184753418, + "learning_rate": 3.0089064008707026e-05, + "loss": 1.7299, + "step": 20900 + }, + { + "epoch": 6.415285451197054, + "grad_norm": 0.2011861503124237, + "learning_rate": 3.0084504659789186e-05, + "loss": 1.7521, + "step": 20901 + }, + { + "epoch": 6.415592387968078, + "grad_norm": 0.20605513453483582, + "learning_rate": 3.007994550768793e-05, + "loss": 1.7099, + "step": 20902 + }, + { + "epoch": 6.415899324739104, + "grad_norm": 0.20890796184539795, + "learning_rate": 3.0075386552448337e-05, + "loss": 1.7383, + "step": 20903 + }, + { + "epoch": 6.416206261510129, + "grad_norm": 0.20005083084106445, + "learning_rate": 3.0070827794115452e-05, + "loss": 1.6999, + "step": 20904 + }, + { + "epoch": 6.416513198281154, + "grad_norm": 0.20547670125961304, + "learning_rate": 3.006626923273433e-05, + "loss": 1.7424, + "step": 20905 + }, + { + "epoch": 6.41682013505218, + "grad_norm": 0.20799006521701813, + "learning_rate": 3.0061710868350003e-05, + "loss": 1.7266, + "step": 20906 + }, + { + "epoch": 6.417127071823204, + "grad_norm": 0.22234687209129333, + "learning_rate": 3.0057152701007563e-05, + "loss": 1.7755, + "step": 20907 + }, + { + "epoch": 6.417434008594229, + "grad_norm": 0.21947267651557922, + "learning_rate": 3.0052594730752005e-05, + "loss": 1.826, + "step": 20908 + }, + { + "epoch": 6.417740945365255, + "grad_norm": 0.2183268964290619, + "learning_rate": 3.0048036957628416e-05, + "loss": 1.7772, + "step": 20909 + }, + { + "epoch": 6.41804788213628, + "grad_norm": 0.1967134177684784, + "learning_rate": 
3.0043479381681805e-05, + "loss": 1.6833, + "step": 20910 + }, + { + "epoch": 6.418354818907305, + "grad_norm": 0.2016787827014923, + "learning_rate": 3.003892200295723e-05, + "loss": 1.773, + "step": 20911 + }, + { + "epoch": 6.41866175567833, + "grad_norm": 0.2192344218492508, + "learning_rate": 3.0034364821499745e-05, + "loss": 1.7124, + "step": 20912 + }, + { + "epoch": 6.418968692449355, + "grad_norm": 0.24924327433109283, + "learning_rate": 3.002980783735434e-05, + "loss": 1.6882, + "step": 20913 + }, + { + "epoch": 6.4192756292203805, + "grad_norm": 0.2221844494342804, + "learning_rate": 3.0025251050566106e-05, + "loss": 1.8028, + "step": 20914 + }, + { + "epoch": 6.419582565991406, + "grad_norm": 0.27141162753105164, + "learning_rate": 3.0020694461180033e-05, + "loss": 1.698, + "step": 20915 + }, + { + "epoch": 6.419889502762431, + "grad_norm": 0.18856655061244965, + "learning_rate": 3.001613806924117e-05, + "loss": 1.7112, + "step": 20916 + }, + { + "epoch": 6.420196439533456, + "grad_norm": 0.2226688265800476, + "learning_rate": 3.0011581874794537e-05, + "loss": 1.6967, + "step": 20917 + }, + { + "epoch": 6.420503376304481, + "grad_norm": 0.2070344239473343, + "learning_rate": 3.000702587788518e-05, + "loss": 1.742, + "step": 20918 + }, + { + "epoch": 6.420810313075506, + "grad_norm": 0.22616387903690338, + "learning_rate": 3.00024700785581e-05, + "loss": 1.6865, + "step": 20919 + }, + { + "epoch": 6.421117249846532, + "grad_norm": 0.19745604693889618, + "learning_rate": 2.9997914476858348e-05, + "loss": 1.7328, + "step": 20920 + }, + { + "epoch": 6.421424186617557, + "grad_norm": 0.20654593408107758, + "learning_rate": 2.9993359072830906e-05, + "loss": 1.7811, + "step": 20921 + }, + { + "epoch": 6.421731123388582, + "grad_norm": 0.19188611209392548, + "learning_rate": 2.9988803866520832e-05, + "loss": 1.6808, + "step": 20922 + }, + { + "epoch": 6.422038060159607, + "grad_norm": 0.19907493889331818, + "learning_rate": 2.9984248857973118e-05, + "loss": 
1.7326, + "step": 20923 + }, + { + "epoch": 6.422344996930632, + "grad_norm": 0.17484794557094574, + "learning_rate": 2.9979694047232804e-05, + "loss": 1.7166, + "step": 20924 + }, + { + "epoch": 6.422651933701657, + "grad_norm": 0.21412795782089233, + "learning_rate": 2.997513943434487e-05, + "loss": 1.7926, + "step": 20925 + }, + { + "epoch": 6.422958870472683, + "grad_norm": 0.17554008960723877, + "learning_rate": 2.9970585019354357e-05, + "loss": 1.6931, + "step": 20926 + }, + { + "epoch": 6.423265807243708, + "grad_norm": 0.16687868535518646, + "learning_rate": 2.9966030802306256e-05, + "loss": 1.6911, + "step": 20927 + }, + { + "epoch": 6.4235727440147325, + "grad_norm": 0.1802106350660324, + "learning_rate": 2.9961476783245578e-05, + "loss": 1.6921, + "step": 20928 + }, + { + "epoch": 6.423879680785758, + "grad_norm": 0.1968134343624115, + "learning_rate": 2.9956922962217347e-05, + "loss": 1.7035, + "step": 20929 + }, + { + "epoch": 6.424186617556783, + "grad_norm": 0.17703908681869507, + "learning_rate": 2.9952369339266538e-05, + "loss": 1.7122, + "step": 20930 + }, + { + "epoch": 6.4244935543278086, + "grad_norm": 0.22176744043827057, + "learning_rate": 2.9947815914438175e-05, + "loss": 1.7189, + "step": 20931 + }, + { + "epoch": 6.424800491098834, + "grad_norm": 0.19128306210041046, + "learning_rate": 2.9943262687777236e-05, + "loss": 1.7208, + "step": 20932 + }, + { + "epoch": 6.425107427869859, + "grad_norm": 0.2285725623369217, + "learning_rate": 2.9938709659328735e-05, + "loss": 1.7859, + "step": 20933 + }, + { + "epoch": 6.425414364640884, + "grad_norm": 0.1998651921749115, + "learning_rate": 2.9934156829137653e-05, + "loss": 1.6912, + "step": 20934 + }, + { + "epoch": 6.425721301411909, + "grad_norm": 0.1879023313522339, + "learning_rate": 2.9929604197249016e-05, + "loss": 1.7164, + "step": 20935 + }, + { + "epoch": 6.426028238182934, + "grad_norm": 0.2675700783729553, + "learning_rate": 2.992505176370778e-05, + "loss": 1.7475, + "step": 20936 + }, 
+ { + "epoch": 6.42633517495396, + "grad_norm": 0.22345949709415436, + "learning_rate": 2.992049952855896e-05, + "loss": 1.6867, + "step": 20937 + }, + { + "epoch": 6.426642111724985, + "grad_norm": 0.17801997065544128, + "learning_rate": 2.9915947491847517e-05, + "loss": 1.736, + "step": 20938 + }, + { + "epoch": 6.4269490484960095, + "grad_norm": 0.22132502496242523, + "learning_rate": 2.991139565361846e-05, + "loss": 1.7244, + "step": 20939 + }, + { + "epoch": 6.427255985267035, + "grad_norm": 0.1899508535861969, + "learning_rate": 2.9906844013916758e-05, + "loss": 1.6781, + "step": 20940 + }, + { + "epoch": 6.42756292203806, + "grad_norm": 0.21948131918907166, + "learning_rate": 2.9902292572787414e-05, + "loss": 1.6911, + "step": 20941 + }, + { + "epoch": 6.4278698588090855, + "grad_norm": 0.16277503967285156, + "learning_rate": 2.9897741330275387e-05, + "loss": 1.702, + "step": 20942 + }, + { + "epoch": 6.428176795580111, + "grad_norm": 0.22303056716918945, + "learning_rate": 2.989319028642567e-05, + "loss": 1.7573, + "step": 20943 + }, + { + "epoch": 6.428483732351136, + "grad_norm": 0.21077899634838104, + "learning_rate": 2.9888639441283217e-05, + "loss": 1.7903, + "step": 20944 + }, + { + "epoch": 6.428790669122161, + "grad_norm": 0.23918256163597107, + "learning_rate": 2.988408879489303e-05, + "loss": 1.7112, + "step": 20945 + }, + { + "epoch": 6.429097605893186, + "grad_norm": 0.22226610779762268, + "learning_rate": 2.9879538347300074e-05, + "loss": 1.7039, + "step": 20946 + }, + { + "epoch": 6.429404542664211, + "grad_norm": 0.18605270981788635, + "learning_rate": 2.987498809854929e-05, + "loss": 1.7102, + "step": 20947 + }, + { + "epoch": 6.429711479435237, + "grad_norm": 0.24812746047973633, + "learning_rate": 2.987043804868569e-05, + "loss": 1.7112, + "step": 20948 + }, + { + "epoch": 6.430018416206262, + "grad_norm": 0.1869048923254013, + "learning_rate": 2.9865888197754206e-05, + "loss": 1.6946, + "step": 20949 + }, + { + "epoch": 6.430325352977286, 
+ "grad_norm": 0.30707576870918274, + "learning_rate": 2.986133854579982e-05, + "loss": 1.7596, + "step": 20950 + }, + { + "epoch": 6.430632289748312, + "grad_norm": 0.20475640892982483, + "learning_rate": 2.985678909286748e-05, + "loss": 1.7162, + "step": 20951 + }, + { + "epoch": 6.430939226519337, + "grad_norm": 0.24273128807544708, + "learning_rate": 2.9852239839002182e-05, + "loss": 1.6803, + "step": 20952 + }, + { + "epoch": 6.431246163290362, + "grad_norm": 0.27484890818595886, + "learning_rate": 2.9847690784248834e-05, + "loss": 1.7948, + "step": 20953 + }, + { + "epoch": 6.431553100061388, + "grad_norm": 0.2204331010580063, + "learning_rate": 2.984314192865244e-05, + "loss": 1.769, + "step": 20954 + }, + { + "epoch": 6.431860036832412, + "grad_norm": 0.262463241815567, + "learning_rate": 2.9838593272257907e-05, + "loss": 1.7483, + "step": 20955 + }, + { + "epoch": 6.4321669736034375, + "grad_norm": 0.225942924618721, + "learning_rate": 2.983404481511023e-05, + "loss": 1.7228, + "step": 20956 + }, + { + "epoch": 6.432473910374463, + "grad_norm": 0.22381044924259186, + "learning_rate": 2.982949655725432e-05, + "loss": 1.7579, + "step": 20957 + }, + { + "epoch": 6.432780847145488, + "grad_norm": 0.1937711238861084, + "learning_rate": 2.982494849873518e-05, + "loss": 1.6833, + "step": 20958 + }, + { + "epoch": 6.4330877839165135, + "grad_norm": 0.2609664499759674, + "learning_rate": 2.9820400639597702e-05, + "loss": 1.7524, + "step": 20959 + }, + { + "epoch": 6.433394720687538, + "grad_norm": 0.2891463041305542, + "learning_rate": 2.981585297988686e-05, + "loss": 1.7672, + "step": 20960 + }, + { + "epoch": 6.433701657458563, + "grad_norm": 0.19604064524173737, + "learning_rate": 2.9811305519647582e-05, + "loss": 1.6684, + "step": 20961 + }, + { + "epoch": 6.434008594229589, + "grad_norm": 0.23522239923477173, + "learning_rate": 2.9806758258924822e-05, + "loss": 1.7461, + "step": 20962 + }, + { + "epoch": 6.434315531000614, + "grad_norm": 0.24907514452934265, + 
"learning_rate": 2.9802211197763525e-05, + "loss": 1.7702, + "step": 20963 + }, + { + "epoch": 6.434622467771639, + "grad_norm": 0.21963126957416534, + "learning_rate": 2.9797664336208592e-05, + "loss": 1.7263, + "step": 20964 + }, + { + "epoch": 6.434929404542665, + "grad_norm": 0.23124000430107117, + "learning_rate": 2.9793117674305004e-05, + "loss": 1.7362, + "step": 20965 + }, + { + "epoch": 6.435236341313689, + "grad_norm": 0.1917882263660431, + "learning_rate": 2.978857121209765e-05, + "loss": 1.7505, + "step": 20966 + }, + { + "epoch": 6.435543278084714, + "grad_norm": 0.24407804012298584, + "learning_rate": 2.9784024949631484e-05, + "loss": 1.7898, + "step": 20967 + }, + { + "epoch": 6.43585021485574, + "grad_norm": 0.210384339094162, + "learning_rate": 2.977947888695143e-05, + "loss": 1.7515, + "step": 20968 + }, + { + "epoch": 6.436157151626765, + "grad_norm": 0.20764803886413574, + "learning_rate": 2.9774933024102436e-05, + "loss": 1.7628, + "step": 20969 + }, + { + "epoch": 6.43646408839779, + "grad_norm": 0.21542097628116608, + "learning_rate": 2.9770387361129387e-05, + "loss": 1.7882, + "step": 20970 + }, + { + "epoch": 6.436771025168815, + "grad_norm": 0.1768570989370346, + "learning_rate": 2.976584189807725e-05, + "loss": 1.7471, + "step": 20971 + }, + { + "epoch": 6.43707796193984, + "grad_norm": 0.2398732751607895, + "learning_rate": 2.97612966349909e-05, + "loss": 1.6676, + "step": 20972 + }, + { + "epoch": 6.4373848987108655, + "grad_norm": 0.18291664123535156, + "learning_rate": 2.9756751571915286e-05, + "loss": 1.6791, + "step": 20973 + }, + { + "epoch": 6.437691835481891, + "grad_norm": 0.2769327759742737, + "learning_rate": 2.9752206708895314e-05, + "loss": 1.7675, + "step": 20974 + }, + { + "epoch": 6.437998772252916, + "grad_norm": 0.24859526753425598, + "learning_rate": 2.974766204597592e-05, + "loss": 1.7661, + "step": 20975 + }, + { + "epoch": 6.4383057090239415, + "grad_norm": 0.20495273172855377, + "learning_rate": 
2.9743117583201984e-05, + "loss": 1.6774, + "step": 20976 + }, + { + "epoch": 6.438612645794966, + "grad_norm": 0.24650859832763672, + "learning_rate": 2.9738573320618447e-05, + "loss": 1.759, + "step": 20977 + }, + { + "epoch": 6.438919582565991, + "grad_norm": 0.21430176496505737, + "learning_rate": 2.973402925827019e-05, + "loss": 1.7273, + "step": 20978 + }, + { + "epoch": 6.439226519337017, + "grad_norm": 0.22392596304416656, + "learning_rate": 2.972948539620214e-05, + "loss": 1.7506, + "step": 20979 + }, + { + "epoch": 6.439533456108042, + "grad_norm": 0.24393923580646515, + "learning_rate": 2.9724941734459205e-05, + "loss": 1.7815, + "step": 20980 + }, + { + "epoch": 6.439840392879067, + "grad_norm": 0.2873772084712982, + "learning_rate": 2.9720398273086264e-05, + "loss": 1.7863, + "step": 20981 + }, + { + "epoch": 6.440147329650092, + "grad_norm": 0.218470498919487, + "learning_rate": 2.9715855012128246e-05, + "loss": 1.7347, + "step": 20982 + }, + { + "epoch": 6.440454266421117, + "grad_norm": 0.24520666897296906, + "learning_rate": 2.971131195163003e-05, + "loss": 1.6892, + "step": 20983 + }, + { + "epoch": 6.440761203192142, + "grad_norm": 0.2255270928144455, + "learning_rate": 2.970676909163652e-05, + "loss": 1.7179, + "step": 20984 + }, + { + "epoch": 6.441068139963168, + "grad_norm": 0.25171026587486267, + "learning_rate": 2.9702226432192604e-05, + "loss": 1.7087, + "step": 20985 + }, + { + "epoch": 6.441375076734193, + "grad_norm": 0.27045872807502747, + "learning_rate": 2.9697683973343204e-05, + "loss": 1.732, + "step": 20986 + }, + { + "epoch": 6.4416820135052175, + "grad_norm": 0.25374144315719604, + "learning_rate": 2.9693141715133177e-05, + "loss": 1.7688, + "step": 20987 + }, + { + "epoch": 6.441988950276243, + "grad_norm": 0.22694779932498932, + "learning_rate": 2.9688599657607442e-05, + "loss": 1.7105, + "step": 20988 + }, + { + "epoch": 6.442295887047268, + "grad_norm": 0.23455791175365448, + "learning_rate": 2.9684057800810845e-05, + 
"loss": 1.8007, + "step": 20989 + }, + { + "epoch": 6.4426028238182935, + "grad_norm": 0.23054158687591553, + "learning_rate": 2.9679516144788312e-05, + "loss": 1.6787, + "step": 20990 + }, + { + "epoch": 6.442909760589319, + "grad_norm": 0.22110030055046082, + "learning_rate": 2.9674974689584696e-05, + "loss": 1.8048, + "step": 20991 + }, + { + "epoch": 6.443216697360343, + "grad_norm": 0.22141657769680023, + "learning_rate": 2.9670433435244915e-05, + "loss": 1.7691, + "step": 20992 + }, + { + "epoch": 6.443523634131369, + "grad_norm": 0.18511974811553955, + "learning_rate": 2.9665892381813807e-05, + "loss": 1.6825, + "step": 20993 + }, + { + "epoch": 6.443830570902394, + "grad_norm": 0.21904997527599335, + "learning_rate": 2.966135152933629e-05, + "loss": 1.7711, + "step": 20994 + }, + { + "epoch": 6.444137507673419, + "grad_norm": 0.19334301352500916, + "learning_rate": 2.9656810877857196e-05, + "loss": 1.687, + "step": 20995 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.1766969859600067, + "learning_rate": 2.9652270427421426e-05, + "loss": 1.7211, + "step": 20996 + }, + { + "epoch": 6.44475138121547, + "grad_norm": 0.1821468323469162, + "learning_rate": 2.9647730178073864e-05, + "loss": 1.7086, + "step": 20997 + }, + { + "epoch": 6.445058317986494, + "grad_norm": 0.20812760293483734, + "learning_rate": 2.9643190129859333e-05, + "loss": 1.6844, + "step": 20998 + }, + { + "epoch": 6.44536525475752, + "grad_norm": 0.259042352437973, + "learning_rate": 2.9638650282822754e-05, + "loss": 1.7971, + "step": 20999 + }, + { + "epoch": 6.445672191528545, + "grad_norm": 0.2134076952934265, + "learning_rate": 2.9634110637008948e-05, + "loss": 1.7061, + "step": 21000 + }, + { + "epoch": 6.44597912829957, + "grad_norm": 0.21120613813400269, + "learning_rate": 2.962957119246281e-05, + "loss": 1.6708, + "step": 21001 + }, + { + "epoch": 6.446286065070596, + "grad_norm": 0.18577797710895538, + "learning_rate": 2.9625031949229176e-05, + "loss": 1.719, + "step": 21002 + 
}, + { + "epoch": 6.44659300184162, + "grad_norm": 0.21755708754062653, + "learning_rate": 2.962049290735294e-05, + "loss": 1.7203, + "step": 21003 + }, + { + "epoch": 6.4468999386126455, + "grad_norm": 0.2161538451910019, + "learning_rate": 2.961595406687891e-05, + "loss": 1.7254, + "step": 21004 + }, + { + "epoch": 6.447206875383671, + "grad_norm": 0.19979329407215118, + "learning_rate": 2.9611415427851995e-05, + "loss": 1.7203, + "step": 21005 + }, + { + "epoch": 6.447513812154696, + "grad_norm": 0.2103399932384491, + "learning_rate": 2.9606876990317e-05, + "loss": 1.7291, + "step": 21006 + }, + { + "epoch": 6.4478207489257215, + "grad_norm": 0.19513745605945587, + "learning_rate": 2.9602338754318815e-05, + "loss": 1.7574, + "step": 21007 + }, + { + "epoch": 6.448127685696747, + "grad_norm": 0.19819851219654083, + "learning_rate": 2.9597800719902256e-05, + "loss": 1.6913, + "step": 21008 + }, + { + "epoch": 6.448434622467771, + "grad_norm": 0.1847768872976303, + "learning_rate": 2.9593262887112215e-05, + "loss": 1.6987, + "step": 21009 + }, + { + "epoch": 6.448741559238797, + "grad_norm": 0.22399301826953888, + "learning_rate": 2.9588725255993487e-05, + "loss": 1.8328, + "step": 21010 + }, + { + "epoch": 6.449048496009822, + "grad_norm": 0.20540264248847961, + "learning_rate": 2.958418782659097e-05, + "loss": 1.765, + "step": 21011 + }, + { + "epoch": 6.449355432780847, + "grad_norm": 0.183661550283432, + "learning_rate": 2.9579650598949442e-05, + "loss": 1.7128, + "step": 21012 + }, + { + "epoch": 6.449662369551873, + "grad_norm": 0.1972927302122116, + "learning_rate": 2.9575113573113788e-05, + "loss": 1.717, + "step": 21013 + }, + { + "epoch": 6.449969306322897, + "grad_norm": 0.20188379287719727, + "learning_rate": 2.9570576749128846e-05, + "loss": 1.7603, + "step": 21014 + }, + { + "epoch": 6.4502762430939224, + "grad_norm": 0.20789781212806702, + "learning_rate": 2.9566040127039418e-05, + "loss": 1.7142, + "step": 21015 + }, + { + "epoch": 
6.450583179864948, + "grad_norm": 0.19319608807563782, + "learning_rate": 2.956150370689038e-05, + "loss": 1.7524, + "step": 21016 + }, + { + "epoch": 6.450890116635973, + "grad_norm": 0.2153816968202591, + "learning_rate": 2.9556967488726516e-05, + "loss": 1.7325, + "step": 21017 + }, + { + "epoch": 6.4511970534069984, + "grad_norm": 0.19134823977947235, + "learning_rate": 2.9552431472592702e-05, + "loss": 1.7547, + "step": 21018 + }, + { + "epoch": 6.451503990178024, + "grad_norm": 0.21069955825805664, + "learning_rate": 2.9547895658533725e-05, + "loss": 1.7038, + "step": 21019 + }, + { + "epoch": 6.451810926949048, + "grad_norm": 0.20742546021938324, + "learning_rate": 2.9543360046594455e-05, + "loss": 1.7151, + "step": 21020 + }, + { + "epoch": 6.452117863720074, + "grad_norm": 0.16917672753334045, + "learning_rate": 2.9538824636819666e-05, + "loss": 1.6957, + "step": 21021 + }, + { + "epoch": 6.452424800491099, + "grad_norm": 0.21134577691555023, + "learning_rate": 2.953428942925423e-05, + "loss": 1.711, + "step": 21022 + }, + { + "epoch": 6.452731737262124, + "grad_norm": 0.19403810799121857, + "learning_rate": 2.9529754423942918e-05, + "loss": 1.734, + "step": 21023 + }, + { + "epoch": 6.45303867403315, + "grad_norm": 0.18534770607948303, + "learning_rate": 2.9525219620930582e-05, + "loss": 1.6857, + "step": 21024 + }, + { + "epoch": 6.453345610804174, + "grad_norm": 0.24268858134746552, + "learning_rate": 2.9520685020262016e-05, + "loss": 1.7316, + "step": 21025 + }, + { + "epoch": 6.453652547575199, + "grad_norm": 0.17590615153312683, + "learning_rate": 2.9516150621982063e-05, + "loss": 1.6608, + "step": 21026 + }, + { + "epoch": 6.453959484346225, + "grad_norm": 0.1949763298034668, + "learning_rate": 2.9511616426135504e-05, + "loss": 1.7955, + "step": 21027 + }, + { + "epoch": 6.45426642111725, + "grad_norm": 0.2424435019493103, + "learning_rate": 2.950708243276717e-05, + "loss": 1.7334, + "step": 21028 + }, + { + "epoch": 6.454573357888275, + 
"grad_norm": 0.22753369808197021, + "learning_rate": 2.950254864192184e-05, + "loss": 1.733, + "step": 21029 + }, + { + "epoch": 6.4548802946593, + "grad_norm": 0.1706271469593048, + "learning_rate": 2.949801505364435e-05, + "loss": 1.7424, + "step": 21030 + }, + { + "epoch": 6.455187231430325, + "grad_norm": 0.21614442765712738, + "learning_rate": 2.9493481667979506e-05, + "loss": 1.7813, + "step": 21031 + }, + { + "epoch": 6.4554941682013505, + "grad_norm": 0.1793162226676941, + "learning_rate": 2.9488948484972068e-05, + "loss": 1.7076, + "step": 21032 + }, + { + "epoch": 6.455801104972376, + "grad_norm": 0.19251759350299835, + "learning_rate": 2.9484415504666885e-05, + "loss": 1.7487, + "step": 21033 + }, + { + "epoch": 6.456108041743401, + "grad_norm": 0.1817556619644165, + "learning_rate": 2.947988272710871e-05, + "loss": 1.6958, + "step": 21034 + }, + { + "epoch": 6.456414978514426, + "grad_norm": 0.24368418753147125, + "learning_rate": 2.9475350152342378e-05, + "loss": 1.7867, + "step": 21035 + }, + { + "epoch": 6.456721915285451, + "grad_norm": 0.2362157702445984, + "learning_rate": 2.9470817780412653e-05, + "loss": 1.7241, + "step": 21036 + }, + { + "epoch": 6.457028852056476, + "grad_norm": 0.21049003303050995, + "learning_rate": 2.9466285611364358e-05, + "loss": 1.7146, + "step": 21037 + }, + { + "epoch": 6.457335788827502, + "grad_norm": 0.2516530454158783, + "learning_rate": 2.9461753645242246e-05, + "loss": 1.7349, + "step": 21038 + }, + { + "epoch": 6.457642725598527, + "grad_norm": 0.23165179789066315, + "learning_rate": 2.945722188209114e-05, + "loss": 1.7285, + "step": 21039 + }, + { + "epoch": 6.457949662369552, + "grad_norm": 0.27345010638237, + "learning_rate": 2.945269032195579e-05, + "loss": 1.7266, + "step": 21040 + }, + { + "epoch": 6.458256599140577, + "grad_norm": 0.16312900185585022, + "learning_rate": 2.9448158964881e-05, + "loss": 1.6781, + "step": 21041 + }, + { + "epoch": 6.458563535911602, + "grad_norm": 0.238658607006073, + 
"learning_rate": 2.9443627810911557e-05, + "loss": 1.6819, + "step": 21042 + }, + { + "epoch": 6.458870472682627, + "grad_norm": 0.19861388206481934, + "learning_rate": 2.943909686009223e-05, + "loss": 1.7397, + "step": 21043 + }, + { + "epoch": 6.459177409453653, + "grad_norm": 0.22675637900829315, + "learning_rate": 2.9434566112467793e-05, + "loss": 1.7231, + "step": 21044 + }, + { + "epoch": 6.459484346224678, + "grad_norm": 0.22638066112995148, + "learning_rate": 2.9430035568083043e-05, + "loss": 1.7466, + "step": 21045 + }, + { + "epoch": 6.4597912829957025, + "grad_norm": 0.2237064391374588, + "learning_rate": 2.942550522698272e-05, + "loss": 1.7373, + "step": 21046 + }, + { + "epoch": 6.460098219766728, + "grad_norm": 0.2613731324672699, + "learning_rate": 2.942097508921162e-05, + "loss": 1.7567, + "step": 21047 + }, + { + "epoch": 6.460405156537753, + "grad_norm": 0.21602070331573486, + "learning_rate": 2.941644515481452e-05, + "loss": 1.7512, + "step": 21048 + }, + { + "epoch": 6.4607120933087785, + "grad_norm": 0.30129116773605347, + "learning_rate": 2.941191542383615e-05, + "loss": 1.761, + "step": 21049 + }, + { + "epoch": 6.461019030079804, + "grad_norm": 0.2303919792175293, + "learning_rate": 2.940738589632132e-05, + "loss": 1.742, + "step": 21050 + }, + { + "epoch": 6.461325966850829, + "grad_norm": 0.2195158153772354, + "learning_rate": 2.940285657231475e-05, + "loss": 1.7169, + "step": 21051 + }, + { + "epoch": 6.461632903621854, + "grad_norm": 0.19029918313026428, + "learning_rate": 2.9398327451861242e-05, + "loss": 1.6721, + "step": 21052 + }, + { + "epoch": 6.461939840392879, + "grad_norm": 0.2006317377090454, + "learning_rate": 2.939379853500553e-05, + "loss": 1.7393, + "step": 21053 + }, + { + "epoch": 6.462246777163904, + "grad_norm": 0.222677081823349, + "learning_rate": 2.9389269821792377e-05, + "loss": 1.7858, + "step": 21054 + }, + { + "epoch": 6.46255371393493, + "grad_norm": 0.20772451162338257, + "learning_rate": 2.938474131226654e-05, 
+ "loss": 1.735, + "step": 21055 + }, + { + "epoch": 6.462860650705955, + "grad_norm": 0.21006503701210022, + "learning_rate": 2.9380213006472778e-05, + "loss": 1.7197, + "step": 21056 + }, + { + "epoch": 6.463167587476979, + "grad_norm": 0.23545250296592712, + "learning_rate": 2.9375684904455825e-05, + "loss": 1.8278, + "step": 21057 + }, + { + "epoch": 6.463474524248005, + "grad_norm": 0.24590329825878143, + "learning_rate": 2.937115700626045e-05, + "loss": 1.6411, + "step": 21058 + }, + { + "epoch": 6.46378146101903, + "grad_norm": 0.22359445691108704, + "learning_rate": 2.9366629311931393e-05, + "loss": 1.7901, + "step": 21059 + }, + { + "epoch": 6.464088397790055, + "grad_norm": 0.22807523608207703, + "learning_rate": 2.93621018215134e-05, + "loss": 1.7472, + "step": 21060 + }, + { + "epoch": 6.464395334561081, + "grad_norm": 0.24183115363121033, + "learning_rate": 2.93575745350512e-05, + "loss": 1.7553, + "step": 21061 + }, + { + "epoch": 6.464702271332105, + "grad_norm": 0.23809055984020233, + "learning_rate": 2.935304745258958e-05, + "loss": 1.7451, + "step": 21062 + }, + { + "epoch": 6.4650092081031305, + "grad_norm": 0.28455644845962524, + "learning_rate": 2.934852057417321e-05, + "loss": 1.8112, + "step": 21063 + }, + { + "epoch": 6.465316144874156, + "grad_norm": 0.22193321585655212, + "learning_rate": 2.9343993899846888e-05, + "loss": 1.747, + "step": 21064 + }, + { + "epoch": 6.465623081645181, + "grad_norm": 0.30524322390556335, + "learning_rate": 2.933946742965532e-05, + "loss": 1.7117, + "step": 21065 + }, + { + "epoch": 6.4659300184162065, + "grad_norm": 0.19748717546463013, + "learning_rate": 2.9334941163643233e-05, + "loss": 1.6899, + "step": 21066 + }, + { + "epoch": 6.466236955187231, + "grad_norm": 0.25551193952560425, + "learning_rate": 2.933041510185539e-05, + "loss": 1.7264, + "step": 21067 + }, + { + "epoch": 6.466543891958256, + "grad_norm": 0.20016206800937653, + "learning_rate": 2.932588924433648e-05, + "loss": 1.6613, + "step": 21068 
+ }, + { + "epoch": 6.466850828729282, + "grad_norm": 0.31049394607543945, + "learning_rate": 2.932136359113127e-05, + "loss": 1.6575, + "step": 21069 + }, + { + "epoch": 6.467157765500307, + "grad_norm": 0.29408347606658936, + "learning_rate": 2.9316838142284436e-05, + "loss": 1.72, + "step": 21070 + }, + { + "epoch": 6.467464702271332, + "grad_norm": 0.18981193006038666, + "learning_rate": 2.9312312897840748e-05, + "loss": 1.6799, + "step": 21071 + }, + { + "epoch": 6.467771639042358, + "grad_norm": 0.26828575134277344, + "learning_rate": 2.9307787857844905e-05, + "loss": 1.6983, + "step": 21072 + }, + { + "epoch": 6.468078575813382, + "grad_norm": 0.2605530321598053, + "learning_rate": 2.9303263022341642e-05, + "loss": 1.7973, + "step": 21073 + }, + { + "epoch": 6.468385512584407, + "grad_norm": 0.389957070350647, + "learning_rate": 2.9298738391375648e-05, + "loss": 1.7288, + "step": 21074 + }, + { + "epoch": 6.468692449355433, + "grad_norm": 0.20525416731834412, + "learning_rate": 2.9294213964991667e-05, + "loss": 1.7526, + "step": 21075 + }, + { + "epoch": 6.468999386126458, + "grad_norm": 0.3628186285495758, + "learning_rate": 2.9289689743234387e-05, + "loss": 1.7055, + "step": 21076 + }, + { + "epoch": 6.469306322897483, + "grad_norm": 0.21661829948425293, + "learning_rate": 2.9285165726148545e-05, + "loss": 1.7806, + "step": 21077 + }, + { + "epoch": 6.469613259668508, + "grad_norm": 0.3815501034259796, + "learning_rate": 2.9280641913778816e-05, + "loss": 1.7257, + "step": 21078 + }, + { + "epoch": 6.469920196439533, + "grad_norm": 0.19470983743667603, + "learning_rate": 2.9276118306169957e-05, + "loss": 1.7055, + "step": 21079 + }, + { + "epoch": 6.4702271332105585, + "grad_norm": 0.36236056685447693, + "learning_rate": 2.927159490336662e-05, + "loss": 1.6748, + "step": 21080 + }, + { + "epoch": 6.470534069981584, + "grad_norm": 0.201282799243927, + "learning_rate": 2.9267071705413552e-05, + "loss": 1.6987, + "step": 21081 + }, + { + "epoch": 
6.470841006752609, + "grad_norm": 0.3806697130203247, + "learning_rate": 2.9262548712355425e-05, + "loss": 1.7386, + "step": 21082 + }, + { + "epoch": 6.4711479435236345, + "grad_norm": 0.3023025691509247, + "learning_rate": 2.9258025924236933e-05, + "loss": 1.7183, + "step": 21083 + }, + { + "epoch": 6.471454880294659, + "grad_norm": 0.2648932635784149, + "learning_rate": 2.9253503341102806e-05, + "loss": 1.6755, + "step": 21084 + }, + { + "epoch": 6.471761817065684, + "grad_norm": 0.2647169828414917, + "learning_rate": 2.9248980962997707e-05, + "loss": 1.7326, + "step": 21085 + }, + { + "epoch": 6.47206875383671, + "grad_norm": 0.23535950481891632, + "learning_rate": 2.9244458789966355e-05, + "loss": 1.7541, + "step": 21086 + }, + { + "epoch": 6.472375690607735, + "grad_norm": 0.2551584541797638, + "learning_rate": 2.9239936822053403e-05, + "loss": 1.6907, + "step": 21087 + }, + { + "epoch": 6.47268262737876, + "grad_norm": 0.23313823342323303, + "learning_rate": 2.923541505930357e-05, + "loss": 1.705, + "step": 21088 + }, + { + "epoch": 6.472989564149785, + "grad_norm": 0.2368597686290741, + "learning_rate": 2.9230893501761534e-05, + "loss": 1.6666, + "step": 21089 + }, + { + "epoch": 6.47329650092081, + "grad_norm": 0.17861969769001007, + "learning_rate": 2.9226372149472003e-05, + "loss": 1.6927, + "step": 21090 + }, + { + "epoch": 6.473603437691835, + "grad_norm": 0.2212727665901184, + "learning_rate": 2.9221851002479616e-05, + "loss": 1.6972, + "step": 21091 + }, + { + "epoch": 6.473910374462861, + "grad_norm": 0.19382402300834656, + "learning_rate": 2.9217330060829096e-05, + "loss": 1.7602, + "step": 21092 + }, + { + "epoch": 6.474217311233886, + "grad_norm": 0.2762092053890228, + "learning_rate": 2.9212809324565076e-05, + "loss": 1.7642, + "step": 21093 + }, + { + "epoch": 6.474524248004911, + "grad_norm": 0.22068747878074646, + "learning_rate": 2.9208288793732274e-05, + "loss": 1.7477, + "step": 21094 + }, + { + "epoch": 6.474831184775936, + "grad_norm": 
0.19979839026927948, + "learning_rate": 2.9203768468375337e-05, + "loss": 1.7266, + "step": 21095 + }, + { + "epoch": 6.475138121546961, + "grad_norm": 0.23038682341575623, + "learning_rate": 2.9199248348538965e-05, + "loss": 1.7428, + "step": 21096 + }, + { + "epoch": 6.475445058317987, + "grad_norm": 0.16841283440589905, + "learning_rate": 2.91947284342678e-05, + "loss": 1.6788, + "step": 21097 + }, + { + "epoch": 6.475751995089012, + "grad_norm": 0.22812627255916595, + "learning_rate": 2.9190208725606528e-05, + "loss": 1.7513, + "step": 21098 + }, + { + "epoch": 6.476058931860037, + "grad_norm": 0.18409393727779388, + "learning_rate": 2.9185689222599832e-05, + "loss": 1.6834, + "step": 21099 + }, + { + "epoch": 6.476365868631062, + "grad_norm": 0.26226910948753357, + "learning_rate": 2.9181169925292313e-05, + "loss": 1.7375, + "step": 21100 + }, + { + "epoch": 6.476672805402087, + "grad_norm": 0.1915685385465622, + "learning_rate": 2.9176650833728697e-05, + "loss": 1.7521, + "step": 21101 + }, + { + "epoch": 6.476979742173112, + "grad_norm": 0.22342176735401154, + "learning_rate": 2.917213194795362e-05, + "loss": 1.8018, + "step": 21102 + }, + { + "epoch": 6.477286678944138, + "grad_norm": 0.18338742852210999, + "learning_rate": 2.9167613268011745e-05, + "loss": 1.6817, + "step": 21103 + }, + { + "epoch": 6.477593615715163, + "grad_norm": 0.23008635640144348, + "learning_rate": 2.9163094793947728e-05, + "loss": 1.7037, + "step": 21104 + }, + { + "epoch": 6.4779005524861875, + "grad_norm": 0.20954197645187378, + "learning_rate": 2.9158576525806215e-05, + "loss": 1.7565, + "step": 21105 + }, + { + "epoch": 6.478207489257213, + "grad_norm": 0.21065562963485718, + "learning_rate": 2.9154058463631874e-05, + "loss": 1.6899, + "step": 21106 + }, + { + "epoch": 6.478514426028238, + "grad_norm": 0.20217828452587128, + "learning_rate": 2.9149540607469335e-05, + "loss": 1.7055, + "step": 21107 + }, + { + "epoch": 6.4788213627992635, + "grad_norm": 0.19058823585510254, + 
"learning_rate": 2.9145022957363244e-05, + "loss": 1.6794, + "step": 21108 + }, + { + "epoch": 6.479128299570289, + "grad_norm": 0.2308664619922638, + "learning_rate": 2.9140505513358297e-05, + "loss": 1.7322, + "step": 21109 + }, + { + "epoch": 6.479435236341313, + "grad_norm": 0.18911845982074738, + "learning_rate": 2.9135988275499056e-05, + "loss": 1.7255, + "step": 21110 + }, + { + "epoch": 6.479742173112339, + "grad_norm": 0.21459296345710754, + "learning_rate": 2.9131471243830256e-05, + "loss": 1.6599, + "step": 21111 + }, + { + "epoch": 6.480049109883364, + "grad_norm": 0.20521530508995056, + "learning_rate": 2.912695441839644e-05, + "loss": 1.7564, + "step": 21112 + }, + { + "epoch": 6.480356046654389, + "grad_norm": 0.21924994885921478, + "learning_rate": 2.912243779924232e-05, + "loss": 1.6922, + "step": 21113 + }, + { + "epoch": 6.480662983425415, + "grad_norm": 0.18219491839408875, + "learning_rate": 2.911792138641253e-05, + "loss": 1.6907, + "step": 21114 + }, + { + "epoch": 6.48096992019644, + "grad_norm": 0.23122453689575195, + "learning_rate": 2.9113405179951626e-05, + "loss": 1.7665, + "step": 21115 + }, + { + "epoch": 6.481276856967464, + "grad_norm": 0.18411210179328918, + "learning_rate": 2.9108889179904348e-05, + "loss": 1.7216, + "step": 21116 + }, + { + "epoch": 6.48158379373849, + "grad_norm": 0.2251562923192978, + "learning_rate": 2.9104373386315225e-05, + "loss": 1.7605, + "step": 21117 + }, + { + "epoch": 6.481890730509515, + "grad_norm": 0.2252185344696045, + "learning_rate": 2.9099857799228957e-05, + "loss": 1.7345, + "step": 21118 + }, + { + "epoch": 6.48219766728054, + "grad_norm": 0.20799386501312256, + "learning_rate": 2.909534241869014e-05, + "loss": 1.7497, + "step": 21119 + }, + { + "epoch": 6.482504604051566, + "grad_norm": 0.2059052586555481, + "learning_rate": 2.90908272447434e-05, + "loss": 1.7444, + "step": 21120 + }, + { + "epoch": 6.48281154082259, + "grad_norm": 0.17851221561431885, + "learning_rate": 
2.9086312277433362e-05, + "loss": 1.7208, + "step": 21121 + }, + { + "epoch": 6.4831184775936155, + "grad_norm": 0.20561498403549194, + "learning_rate": 2.908179751680465e-05, + "loss": 1.731, + "step": 21122 + }, + { + "epoch": 6.483425414364641, + "grad_norm": 0.2386128008365631, + "learning_rate": 2.9077282962901868e-05, + "loss": 1.7493, + "step": 21123 + }, + { + "epoch": 6.483732351135666, + "grad_norm": 0.21024827659130096, + "learning_rate": 2.9072768615769642e-05, + "loss": 1.7353, + "step": 21124 + }, + { + "epoch": 6.4840392879066915, + "grad_norm": 0.23443256318569183, + "learning_rate": 2.9068254475452582e-05, + "loss": 1.7419, + "step": 21125 + }, + { + "epoch": 6.484346224677717, + "grad_norm": 0.1849295198917389, + "learning_rate": 2.90637405419953e-05, + "loss": 1.7239, + "step": 21126 + }, + { + "epoch": 6.484653161448741, + "grad_norm": 0.1967659890651703, + "learning_rate": 2.9059226815442385e-05, + "loss": 1.7163, + "step": 21127 + }, + { + "epoch": 6.484960098219767, + "grad_norm": 0.20395416021347046, + "learning_rate": 2.9054713295838505e-05, + "loss": 1.7108, + "step": 21128 + }, + { + "epoch": 6.485267034990792, + "grad_norm": 0.24162746965885162, + "learning_rate": 2.9050199983228184e-05, + "loss": 1.7666, + "step": 21129 + }, + { + "epoch": 6.485573971761817, + "grad_norm": 0.18104900419712067, + "learning_rate": 2.9045686877656086e-05, + "loss": 1.6863, + "step": 21130 + }, + { + "epoch": 6.485880908532843, + "grad_norm": 0.18469318747520447, + "learning_rate": 2.9041173979166813e-05, + "loss": 1.7344, + "step": 21131 + }, + { + "epoch": 6.486187845303867, + "grad_norm": 0.18488821387290955, + "learning_rate": 2.90366612878049e-05, + "loss": 1.694, + "step": 21132 + }, + { + "epoch": 6.486494782074892, + "grad_norm": 0.2030600905418396, + "learning_rate": 2.903214880361503e-05, + "loss": 1.7079, + "step": 21133 + }, + { + "epoch": 6.486801718845918, + "grad_norm": 0.2222873419523239, + "learning_rate": 2.902763652664171e-05, + "loss": 
1.7193, + "step": 21134 + }, + { + "epoch": 6.487108655616943, + "grad_norm": 0.1936846524477005, + "learning_rate": 2.9023124456929608e-05, + "loss": 1.7152, + "step": 21135 + }, + { + "epoch": 6.487415592387968, + "grad_norm": 0.25259360671043396, + "learning_rate": 2.9018612594523274e-05, + "loss": 1.776, + "step": 21136 + }, + { + "epoch": 6.487722529158993, + "grad_norm": 0.22994543612003326, + "learning_rate": 2.9014100939467316e-05, + "loss": 1.7437, + "step": 21137 + }, + { + "epoch": 6.488029465930018, + "grad_norm": 0.2646990716457367, + "learning_rate": 2.900958949180631e-05, + "loss": 1.7535, + "step": 21138 + }, + { + "epoch": 6.4883364027010435, + "grad_norm": 0.22973869740962982, + "learning_rate": 2.9005078251584843e-05, + "loss": 1.6772, + "step": 21139 + }, + { + "epoch": 6.488643339472069, + "grad_norm": 0.21261750161647797, + "learning_rate": 2.9000567218847497e-05, + "loss": 1.6899, + "step": 21140 + }, + { + "epoch": 6.488950276243094, + "grad_norm": 0.24828271567821503, + "learning_rate": 2.8996056393638858e-05, + "loss": 1.7994, + "step": 21141 + }, + { + "epoch": 6.4892572130141195, + "grad_norm": 0.18308857083320618, + "learning_rate": 2.8991545776003497e-05, + "loss": 1.7847, + "step": 21142 + }, + { + "epoch": 6.489564149785144, + "grad_norm": 0.22744092345237732, + "learning_rate": 2.8987035365985994e-05, + "loss": 1.7789, + "step": 21143 + }, + { + "epoch": 6.489871086556169, + "grad_norm": 0.18573936820030212, + "learning_rate": 2.8982525163630903e-05, + "loss": 1.6649, + "step": 21144 + }, + { + "epoch": 6.490178023327195, + "grad_norm": 0.26056674122810364, + "learning_rate": 2.8978015168982863e-05, + "loss": 1.68, + "step": 21145 + }, + { + "epoch": 6.49048496009822, + "grad_norm": 0.1912553906440735, + "learning_rate": 2.897350538208635e-05, + "loss": 1.7011, + "step": 21146 + }, + { + "epoch": 6.490791896869245, + "grad_norm": 0.25937187671661377, + "learning_rate": 2.896899580298603e-05, + "loss": 1.7409, + "step": 21147 + }, + 
{ + "epoch": 6.49109883364027, + "grad_norm": 0.22148750722408295, + "learning_rate": 2.8964486431726397e-05, + "loss": 1.6921, + "step": 21148 + }, + { + "epoch": 6.491405770411295, + "grad_norm": 0.23678559064865112, + "learning_rate": 2.8959977268352012e-05, + "loss": 1.6833, + "step": 21149 + }, + { + "epoch": 6.49171270718232, + "grad_norm": 0.2942093312740326, + "learning_rate": 2.8955468312907506e-05, + "loss": 1.7119, + "step": 21150 + }, + { + "epoch": 6.492019643953346, + "grad_norm": 0.18726128339767456, + "learning_rate": 2.8950959565437365e-05, + "loss": 1.7067, + "step": 21151 + }, + { + "epoch": 6.492326580724371, + "grad_norm": 0.23851951956748962, + "learning_rate": 2.894645102598621e-05, + "loss": 1.73, + "step": 21152 + }, + { + "epoch": 6.4926335174953955, + "grad_norm": 0.18054445087909698, + "learning_rate": 2.8941942694598533e-05, + "loss": 1.7243, + "step": 21153 + }, + { + "epoch": 6.492940454266421, + "grad_norm": 0.21889349818229675, + "learning_rate": 2.8937434571318934e-05, + "loss": 1.7789, + "step": 21154 + }, + { + "epoch": 6.493247391037446, + "grad_norm": 0.18788981437683105, + "learning_rate": 2.893292665619195e-05, + "loss": 1.7496, + "step": 21155 + }, + { + "epoch": 6.4935543278084715, + "grad_norm": 0.1964103877544403, + "learning_rate": 2.8928418949262138e-05, + "loss": 1.6732, + "step": 21156 + }, + { + "epoch": 6.493861264579497, + "grad_norm": 0.21939502656459808, + "learning_rate": 2.8923911450574043e-05, + "loss": 1.7149, + "step": 21157 + }, + { + "epoch": 6.494168201350522, + "grad_norm": 0.16927817463874817, + "learning_rate": 2.8919404160172203e-05, + "loss": 1.7093, + "step": 21158 + }, + { + "epoch": 6.494475138121547, + "grad_norm": 0.19907668232917786, + "learning_rate": 2.8914897078101166e-05, + "loss": 1.718, + "step": 21159 + }, + { + "epoch": 6.494782074892572, + "grad_norm": 0.18071576952934265, + "learning_rate": 2.891039020440548e-05, + "loss": 1.7241, + "step": 21160 + }, + { + "epoch": 6.495089011663597, 
+ "grad_norm": 0.17780692875385284, + "learning_rate": 2.890588353912965e-05, + "loss": 1.7013, + "step": 21161 + }, + { + "epoch": 6.495395948434623, + "grad_norm": 0.20762500166893005, + "learning_rate": 2.8901377082318292e-05, + "loss": 1.8149, + "step": 21162 + }, + { + "epoch": 6.495702885205648, + "grad_norm": 0.21616768836975098, + "learning_rate": 2.889687083401585e-05, + "loss": 1.7467, + "step": 21163 + }, + { + "epoch": 6.496009821976672, + "grad_norm": 0.20075570046901703, + "learning_rate": 2.8892364794266935e-05, + "loss": 1.6643, + "step": 21164 + }, + { + "epoch": 6.496316758747698, + "grad_norm": 0.18893925845623016, + "learning_rate": 2.8887858963116028e-05, + "loss": 1.7362, + "step": 21165 + }, + { + "epoch": 6.496623695518723, + "grad_norm": 0.20031611621379852, + "learning_rate": 2.888335334060765e-05, + "loss": 1.6902, + "step": 21166 + }, + { + "epoch": 6.496930632289748, + "grad_norm": 0.2959407866001129, + "learning_rate": 2.887884792678639e-05, + "loss": 1.7874, + "step": 21167 + }, + { + "epoch": 6.497237569060774, + "grad_norm": 0.17434875667095184, + "learning_rate": 2.8874342721696697e-05, + "loss": 1.7353, + "step": 21168 + }, + { + "epoch": 6.497544505831799, + "grad_norm": 0.19451481103897095, + "learning_rate": 2.8869837725383163e-05, + "loss": 1.6942, + "step": 21169 + }, + { + "epoch": 6.4978514426028235, + "grad_norm": 0.17984920740127563, + "learning_rate": 2.886533293789025e-05, + "loss": 1.7461, + "step": 21170 + }, + { + "epoch": 6.498158379373849, + "grad_norm": 0.18166208267211914, + "learning_rate": 2.8860828359262516e-05, + "loss": 1.7202, + "step": 21171 + }, + { + "epoch": 6.498465316144874, + "grad_norm": 0.1849331557750702, + "learning_rate": 2.8856323989544472e-05, + "loss": 1.6862, + "step": 21172 + }, + { + "epoch": 6.4987722529158995, + "grad_norm": 0.17846204340457916, + "learning_rate": 2.8851819828780623e-05, + "loss": 1.7446, + "step": 21173 + }, + { + "epoch": 6.499079189686925, + "grad_norm": 
0.1963818222284317, + "learning_rate": 2.8847315877015486e-05, + "loss": 1.7366, + "step": 21174 + }, + { + "epoch": 6.499386126457949, + "grad_norm": 0.1917402446269989, + "learning_rate": 2.8842812134293574e-05, + "loss": 1.7362, + "step": 21175 + }, + { + "epoch": 6.499693063228975, + "grad_norm": 0.16559138894081116, + "learning_rate": 2.883830860065939e-05, + "loss": 1.6735, + "step": 21176 + }, + { + "epoch": 6.5, + "grad_norm": 0.1820032149553299, + "learning_rate": 2.8833805276157442e-05, + "loss": 1.7107, + "step": 21177 + }, + { + "epoch": 6.500306936771025, + "grad_norm": 0.23760980367660522, + "learning_rate": 2.882930216083222e-05, + "loss": 1.7024, + "step": 21178 + }, + { + "epoch": 6.500613873542051, + "grad_norm": 0.22314296662807465, + "learning_rate": 2.8824799254728285e-05, + "loss": 1.714, + "step": 21179 + }, + { + "epoch": 6.500920810313076, + "grad_norm": 0.21919335424900055, + "learning_rate": 2.8820296557890046e-05, + "loss": 1.7625, + "step": 21180 + }, + { + "epoch": 6.5012277470841005, + "grad_norm": 0.21632128953933716, + "learning_rate": 2.88157940703621e-05, + "loss": 1.6589, + "step": 21181 + }, + { + "epoch": 6.501534683855126, + "grad_norm": 0.17998506128787994, + "learning_rate": 2.8811291792188867e-05, + "loss": 1.7528, + "step": 21182 + }, + { + "epoch": 6.501841620626151, + "grad_norm": 0.19783075153827667, + "learning_rate": 2.880678972341485e-05, + "loss": 1.6908, + "step": 21183 + }, + { + "epoch": 6.5021485573971765, + "grad_norm": 0.20510388910770416, + "learning_rate": 2.88022878640846e-05, + "loss": 1.7342, + "step": 21184 + }, + { + "epoch": 6.502455494168201, + "grad_norm": 0.24218666553497314, + "learning_rate": 2.879778621424253e-05, + "loss": 1.8, + "step": 21185 + }, + { + "epoch": 6.502762430939226, + "grad_norm": 0.1901179403066635, + "learning_rate": 2.8793284773933195e-05, + "loss": 1.699, + "step": 21186 + }, + { + "epoch": 6.503069367710252, + "grad_norm": 0.2652232348918915, + "learning_rate": 
2.8788783543201007e-05, + "loss": 1.8394, + "step": 21187 + }, + { + "epoch": 6.503376304481277, + "grad_norm": 0.17701558768749237, + "learning_rate": 2.878428252209052e-05, + "loss": 1.6674, + "step": 21188 + }, + { + "epoch": 6.503683241252302, + "grad_norm": 0.17464707791805267, + "learning_rate": 2.8779781710646185e-05, + "loss": 1.6894, + "step": 21189 + }, + { + "epoch": 6.503990178023328, + "grad_norm": 0.19469478726387024, + "learning_rate": 2.877528110891249e-05, + "loss": 1.7487, + "step": 21190 + }, + { + "epoch": 6.504297114794352, + "grad_norm": 0.21656417846679688, + "learning_rate": 2.87707807169339e-05, + "loss": 1.641, + "step": 21191 + }, + { + "epoch": 6.504604051565377, + "grad_norm": 0.20374895632266998, + "learning_rate": 2.8766280534754896e-05, + "loss": 1.6692, + "step": 21192 + }, + { + "epoch": 6.504910988336403, + "grad_norm": 0.26638445258140564, + "learning_rate": 2.876178056241996e-05, + "loss": 1.7415, + "step": 21193 + }, + { + "epoch": 6.505217925107428, + "grad_norm": 0.1852893978357315, + "learning_rate": 2.8757280799973557e-05, + "loss": 1.6981, + "step": 21194 + }, + { + "epoch": 6.505524861878453, + "grad_norm": 0.20518383383750916, + "learning_rate": 2.875278124746013e-05, + "loss": 1.781, + "step": 21195 + }, + { + "epoch": 6.505831798649478, + "grad_norm": 0.19968904554843903, + "learning_rate": 2.874828190492422e-05, + "loss": 1.6813, + "step": 21196 + }, + { + "epoch": 6.506138735420503, + "grad_norm": 0.19164247810840607, + "learning_rate": 2.87437827724102e-05, + "loss": 1.6833, + "step": 21197 + }, + { + "epoch": 6.5064456721915285, + "grad_norm": 0.19305361807346344, + "learning_rate": 2.873928384996262e-05, + "loss": 1.7164, + "step": 21198 + }, + { + "epoch": 6.506752608962554, + "grad_norm": 0.1853758841753006, + "learning_rate": 2.873478513762587e-05, + "loss": 1.7481, + "step": 21199 + }, + { + "epoch": 6.507059545733579, + "grad_norm": 0.20187529921531677, + "learning_rate": 2.8730286635444425e-05, + "loss": 
1.7666, + "step": 21200 + }, + { + "epoch": 6.5073664825046045, + "grad_norm": 0.19769401848316193, + "learning_rate": 2.872578834346279e-05, + "loss": 1.798, + "step": 21201 + }, + { + "epoch": 6.507673419275629, + "grad_norm": 0.1936112940311432, + "learning_rate": 2.8721290261725342e-05, + "loss": 1.6992, + "step": 21202 + }, + { + "epoch": 6.507980356046654, + "grad_norm": 0.17090481519699097, + "learning_rate": 2.871679239027662e-05, + "loss": 1.6802, + "step": 21203 + }, + { + "epoch": 6.50828729281768, + "grad_norm": 0.19443605840206146, + "learning_rate": 2.8712294729160987e-05, + "loss": 1.736, + "step": 21204 + }, + { + "epoch": 6.508594229588705, + "grad_norm": 0.19216817617416382, + "learning_rate": 2.8707797278422954e-05, + "loss": 1.7109, + "step": 21205 + }, + { + "epoch": 6.50890116635973, + "grad_norm": 0.19900040328502655, + "learning_rate": 2.8703300038106952e-05, + "loss": 1.7158, + "step": 21206 + }, + { + "epoch": 6.509208103130755, + "grad_norm": 0.17810803651809692, + "learning_rate": 2.8698803008257425e-05, + "loss": 1.6886, + "step": 21207 + }, + { + "epoch": 6.50951503990178, + "grad_norm": 0.1890508532524109, + "learning_rate": 2.8694306188918807e-05, + "loss": 1.7447, + "step": 21208 + }, + { + "epoch": 6.509821976672805, + "grad_norm": 0.17456012964248657, + "learning_rate": 2.868980958013554e-05, + "loss": 1.7094, + "step": 21209 + }, + { + "epoch": 6.510128913443831, + "grad_norm": 0.17089629173278809, + "learning_rate": 2.8685313181952066e-05, + "loss": 1.6827, + "step": 21210 + }, + { + "epoch": 6.510435850214856, + "grad_norm": 0.22681273519992828, + "learning_rate": 2.8680816994412823e-05, + "loss": 1.7374, + "step": 21211 + }, + { + "epoch": 6.510742786985881, + "grad_norm": 0.20642207562923431, + "learning_rate": 2.8676321017562225e-05, + "loss": 1.7609, + "step": 21212 + }, + { + "epoch": 6.511049723756906, + "grad_norm": 0.2360219657421112, + "learning_rate": 2.867182525144475e-05, + "loss": 1.7577, + "step": 21213 + }, + { + 
"epoch": 6.511356660527931, + "grad_norm": 0.19686923921108246, + "learning_rate": 2.8667329696104766e-05, + "loss": 1.7459, + "step": 21214 + }, + { + "epoch": 6.5116635972989565, + "grad_norm": 0.21280834078788757, + "learning_rate": 2.8662834351586777e-05, + "loss": 1.7837, + "step": 21215 + }, + { + "epoch": 6.511970534069982, + "grad_norm": 0.19297273457050323, + "learning_rate": 2.8658339217935136e-05, + "loss": 1.734, + "step": 21216 + }, + { + "epoch": 6.512277470841006, + "grad_norm": 0.1937931329011917, + "learning_rate": 2.8653844295194283e-05, + "loss": 1.6631, + "step": 21217 + }, + { + "epoch": 6.512584407612032, + "grad_norm": 0.2061077207326889, + "learning_rate": 2.8649349583408692e-05, + "loss": 1.7324, + "step": 21218 + }, + { + "epoch": 6.512891344383057, + "grad_norm": 0.19711358845233917, + "learning_rate": 2.8644855082622695e-05, + "loss": 1.7024, + "step": 21219 + }, + { + "epoch": 6.513198281154082, + "grad_norm": 0.17352496087551117, + "learning_rate": 2.8640360792880804e-05, + "loss": 1.7261, + "step": 21220 + }, + { + "epoch": 6.513505217925108, + "grad_norm": 0.181448295712471, + "learning_rate": 2.8635866714227344e-05, + "loss": 1.7147, + "step": 21221 + }, + { + "epoch": 6.513812154696133, + "grad_norm": 0.1827932894229889, + "learning_rate": 2.8631372846706787e-05, + "loss": 1.7338, + "step": 21222 + }, + { + "epoch": 6.514119091467157, + "grad_norm": 0.20659075677394867, + "learning_rate": 2.862687919036353e-05, + "loss": 1.6611, + "step": 21223 + }, + { + "epoch": 6.514426028238183, + "grad_norm": 0.19185996055603027, + "learning_rate": 2.8622385745241987e-05, + "loss": 1.7834, + "step": 21224 + }, + { + "epoch": 6.514732965009208, + "grad_norm": 0.19825506210327148, + "learning_rate": 2.8617892511386558e-05, + "loss": 1.7608, + "step": 21225 + }, + { + "epoch": 6.515039901780233, + "grad_norm": 0.16927020251750946, + "learning_rate": 2.861339948884164e-05, + "loss": 1.6651, + "step": 21226 + }, + { + "epoch": 6.515346838551259, + 
"grad_norm": 0.19211016595363617, + "learning_rate": 2.8608906677651646e-05, + "loss": 1.6673, + "step": 21227 + }, + { + "epoch": 6.515653775322283, + "grad_norm": 0.20192545652389526, + "learning_rate": 2.8604414077860974e-05, + "loss": 1.7301, + "step": 21228 + }, + { + "epoch": 6.5159607120933085, + "grad_norm": 0.2075425237417221, + "learning_rate": 2.8599921689514002e-05, + "loss": 1.783, + "step": 21229 + }, + { + "epoch": 6.516267648864334, + "grad_norm": 0.21261392533779144, + "learning_rate": 2.8595429512655192e-05, + "loss": 1.7277, + "step": 21230 + }, + { + "epoch": 6.516574585635359, + "grad_norm": 0.21201452612876892, + "learning_rate": 2.8590937547328844e-05, + "loss": 1.6582, + "step": 21231 + }, + { + "epoch": 6.5168815224063845, + "grad_norm": 0.2071799635887146, + "learning_rate": 2.858644579357944e-05, + "loss": 1.7559, + "step": 21232 + }, + { + "epoch": 6.51718845917741, + "grad_norm": 0.20225903391838074, + "learning_rate": 2.858195425145132e-05, + "loss": 1.7507, + "step": 21233 + }, + { + "epoch": 6.517495395948434, + "grad_norm": 0.2738147974014282, + "learning_rate": 2.8577462920988852e-05, + "loss": 1.7073, + "step": 21234 + }, + { + "epoch": 6.51780233271946, + "grad_norm": 0.17878220975399017, + "learning_rate": 2.8572971802236498e-05, + "loss": 1.6598, + "step": 21235 + }, + { + "epoch": 6.518109269490485, + "grad_norm": 0.21365594863891602, + "learning_rate": 2.8568480895238552e-05, + "loss": 1.7404, + "step": 21236 + }, + { + "epoch": 6.51841620626151, + "grad_norm": 0.18392804265022278, + "learning_rate": 2.856399020003948e-05, + "loss": 1.706, + "step": 21237 + }, + { + "epoch": 6.518723143032536, + "grad_norm": 0.16268405318260193, + "learning_rate": 2.855949971668358e-05, + "loss": 1.6725, + "step": 21238 + }, + { + "epoch": 6.51903007980356, + "grad_norm": 0.19590096175670624, + "learning_rate": 2.855500944521529e-05, + "loss": 1.7269, + "step": 21239 + }, + { + "epoch": 6.519337016574585, + "grad_norm": 0.19443263113498688, + 
"learning_rate": 2.8550519385678965e-05, + "loss": 1.686, + "step": 21240 + }, + { + "epoch": 6.519643953345611, + "grad_norm": 0.2112705111503601, + "learning_rate": 2.8546029538118985e-05, + "loss": 1.6904, + "step": 21241 + }, + { + "epoch": 6.519950890116636, + "grad_norm": 0.21015888452529907, + "learning_rate": 2.8541539902579712e-05, + "loss": 1.6972, + "step": 21242 + }, + { + "epoch": 6.520257826887661, + "grad_norm": 0.2853320837020874, + "learning_rate": 2.853705047910552e-05, + "loss": 1.7415, + "step": 21243 + }, + { + "epoch": 6.520564763658687, + "grad_norm": 0.20927128195762634, + "learning_rate": 2.853256126774077e-05, + "loss": 1.6955, + "step": 21244 + }, + { + "epoch": 6.520871700429711, + "grad_norm": 0.27824920415878296, + "learning_rate": 2.8528072268529836e-05, + "loss": 1.7666, + "step": 21245 + }, + { + "epoch": 6.5211786372007365, + "grad_norm": 0.21164646744728088, + "learning_rate": 2.8523583481517057e-05, + "loss": 1.75, + "step": 21246 + }, + { + "epoch": 6.521485573971762, + "grad_norm": 0.249397411942482, + "learning_rate": 2.851909490674686e-05, + "loss": 1.6767, + "step": 21247 + }, + { + "epoch": 6.521792510742787, + "grad_norm": 0.2311551868915558, + "learning_rate": 2.8514606544263507e-05, + "loss": 1.8071, + "step": 21248 + }, + { + "epoch": 6.5220994475138125, + "grad_norm": 0.21878042817115784, + "learning_rate": 2.8510118394111453e-05, + "loss": 1.6881, + "step": 21249 + }, + { + "epoch": 6.522406384284837, + "grad_norm": 0.2095690816640854, + "learning_rate": 2.8505630456334974e-05, + "loss": 1.6526, + "step": 21250 + }, + { + "epoch": 6.522713321055862, + "grad_norm": 0.2303982526063919, + "learning_rate": 2.850114273097844e-05, + "loss": 1.7256, + "step": 21251 + }, + { + "epoch": 6.523020257826888, + "grad_norm": 0.22640225291252136, + "learning_rate": 2.8496655218086255e-05, + "loss": 1.7797, + "step": 21252 + }, + { + "epoch": 6.523327194597913, + "grad_norm": 0.24268805980682373, + "learning_rate": 
2.8492167917702683e-05, + "loss": 1.7673, + "step": 21253 + }, + { + "epoch": 6.523634131368938, + "grad_norm": 0.1988469958305359, + "learning_rate": 2.8487680829872158e-05, + "loss": 1.7126, + "step": 21254 + }, + { + "epoch": 6.523941068139964, + "grad_norm": 0.18385496735572815, + "learning_rate": 2.8483193954638942e-05, + "loss": 1.7113, + "step": 21255 + }, + { + "epoch": 6.524248004910988, + "grad_norm": 0.21865327656269073, + "learning_rate": 2.847870729204743e-05, + "loss": 1.6686, + "step": 21256 + }, + { + "epoch": 6.524554941682013, + "grad_norm": 0.16982951760292053, + "learning_rate": 2.8474220842141946e-05, + "loss": 1.6865, + "step": 21257 + }, + { + "epoch": 6.524861878453039, + "grad_norm": 0.23028478026390076, + "learning_rate": 2.8469734604966834e-05, + "loss": 1.7647, + "step": 21258 + }, + { + "epoch": 6.525168815224064, + "grad_norm": 0.1805485039949417, + "learning_rate": 2.8465248580566415e-05, + "loss": 1.7524, + "step": 21259 + }, + { + "epoch": 6.525475751995089, + "grad_norm": 0.18652063608169556, + "learning_rate": 2.8460762768985037e-05, + "loss": 1.7028, + "step": 21260 + }, + { + "epoch": 6.525782688766114, + "grad_norm": 0.22772997617721558, + "learning_rate": 2.845627717026703e-05, + "loss": 1.7866, + "step": 21261 + }, + { + "epoch": 6.526089625537139, + "grad_norm": 0.19889821112155914, + "learning_rate": 2.8451791784456718e-05, + "loss": 1.7076, + "step": 21262 + }, + { + "epoch": 6.526396562308165, + "grad_norm": 0.24747174978256226, + "learning_rate": 2.8447306611598402e-05, + "loss": 1.7615, + "step": 21263 + }, + { + "epoch": 6.52670349907919, + "grad_norm": 0.1988009363412857, + "learning_rate": 2.8442821651736473e-05, + "loss": 1.7853, + "step": 21264 + }, + { + "epoch": 6.527010435850215, + "grad_norm": 0.250032901763916, + "learning_rate": 2.8438336904915185e-05, + "loss": 1.6906, + "step": 21265 + }, + { + "epoch": 6.52731737262124, + "grad_norm": 0.15398284792900085, + "learning_rate": 2.8433852371178925e-05, + 
"loss": 1.6437, + "step": 21266 + }, + { + "epoch": 6.527624309392265, + "grad_norm": 0.33137503266334534, + "learning_rate": 2.8429368050571958e-05, + "loss": 1.8213, + "step": 21267 + }, + { + "epoch": 6.52793124616329, + "grad_norm": 0.23827852308750153, + "learning_rate": 2.8424883943138593e-05, + "loss": 1.7148, + "step": 21268 + }, + { + "epoch": 6.528238182934316, + "grad_norm": 0.21171489357948303, + "learning_rate": 2.8420400048923217e-05, + "loss": 1.7729, + "step": 21269 + }, + { + "epoch": 6.528545119705341, + "grad_norm": 0.21698513627052307, + "learning_rate": 2.8415916367970053e-05, + "loss": 1.7267, + "step": 21270 + }, + { + "epoch": 6.5288520564763655, + "grad_norm": 0.2217913120985031, + "learning_rate": 2.8411432900323498e-05, + "loss": 1.7259, + "step": 21271 + }, + { + "epoch": 6.529158993247391, + "grad_norm": 0.25518202781677246, + "learning_rate": 2.8406949646027768e-05, + "loss": 1.7754, + "step": 21272 + }, + { + "epoch": 6.529465930018416, + "grad_norm": 0.22206325829029083, + "learning_rate": 2.8402466605127247e-05, + "loss": 1.755, + "step": 21273 + }, + { + "epoch": 6.5297728667894415, + "grad_norm": 0.26918017864227295, + "learning_rate": 2.8397983777666206e-05, + "loss": 1.783, + "step": 21274 + }, + { + "epoch": 6.530079803560467, + "grad_norm": 0.19280646741390228, + "learning_rate": 2.8393501163688952e-05, + "loss": 1.6942, + "step": 21275 + }, + { + "epoch": 6.530386740331492, + "grad_norm": 0.24567140638828278, + "learning_rate": 2.8389018763239784e-05, + "loss": 1.7316, + "step": 21276 + }, + { + "epoch": 6.530693677102517, + "grad_norm": 0.21791695058345795, + "learning_rate": 2.8384536576362997e-05, + "loss": 1.7627, + "step": 21277 + }, + { + "epoch": 6.531000613873542, + "grad_norm": 0.2441660761833191, + "learning_rate": 2.8380054603102885e-05, + "loss": 1.7112, + "step": 21278 + }, + { + "epoch": 6.531307550644567, + "grad_norm": 0.1768653243780136, + "learning_rate": 2.837557284350375e-05, + "loss": 1.6906, + "step": 
21279 + }, + { + "epoch": 6.531614487415593, + "grad_norm": 0.21037769317626953, + "learning_rate": 2.8371091297609877e-05, + "loss": 1.7197, + "step": 21280 + }, + { + "epoch": 6.531921424186618, + "grad_norm": 0.23989829421043396, + "learning_rate": 2.8366609965465563e-05, + "loss": 1.7693, + "step": 21281 + }, + { + "epoch": 6.532228360957642, + "grad_norm": 0.18302181363105774, + "learning_rate": 2.836212884711506e-05, + "loss": 1.6643, + "step": 21282 + }, + { + "epoch": 6.532535297728668, + "grad_norm": 0.2068471908569336, + "learning_rate": 2.835764794260273e-05, + "loss": 1.7431, + "step": 21283 + }, + { + "epoch": 6.532842234499693, + "grad_norm": 0.18803778290748596, + "learning_rate": 2.8353167251972777e-05, + "loss": 1.7506, + "step": 21284 + }, + { + "epoch": 6.533149171270718, + "grad_norm": 0.20789632201194763, + "learning_rate": 2.8348686775269507e-05, + "loss": 1.7174, + "step": 21285 + }, + { + "epoch": 6.533456108041744, + "grad_norm": 0.18927012383937836, + "learning_rate": 2.834420651253723e-05, + "loss": 1.6723, + "step": 21286 + }, + { + "epoch": 6.533763044812769, + "grad_norm": 0.22616887092590332, + "learning_rate": 2.8339726463820172e-05, + "loss": 1.7045, + "step": 21287 + }, + { + "epoch": 6.5340699815837935, + "grad_norm": 0.23880253732204437, + "learning_rate": 2.8335246629162658e-05, + "loss": 1.7255, + "step": 21288 + }, + { + "epoch": 6.534376918354819, + "grad_norm": 0.24279431998729706, + "learning_rate": 2.8330767008608904e-05, + "loss": 1.7548, + "step": 21289 + }, + { + "epoch": 6.534683855125844, + "grad_norm": 0.20542044937610626, + "learning_rate": 2.832628760220323e-05, + "loss": 1.6851, + "step": 21290 + }, + { + "epoch": 6.5349907918968695, + "grad_norm": 0.19426794350147247, + "learning_rate": 2.832180840998988e-05, + "loss": 1.7528, + "step": 21291 + }, + { + "epoch": 6.535297728667894, + "grad_norm": 0.2744491398334503, + "learning_rate": 2.8317329432013136e-05, + "loss": 1.7821, + "step": 21292 + }, + { + "epoch": 
6.535604665438919, + "grad_norm": 0.2692170739173889, + "learning_rate": 2.8312850668317243e-05, + "loss": 1.6626, + "step": 21293 + }, + { + "epoch": 6.535911602209945, + "grad_norm": 0.24998809397220612, + "learning_rate": 2.830837211894647e-05, + "loss": 1.7031, + "step": 21294 + }, + { + "epoch": 6.53621853898097, + "grad_norm": 0.22888946533203125, + "learning_rate": 2.830389378394508e-05, + "loss": 1.7706, + "step": 21295 + }, + { + "epoch": 6.536525475751995, + "grad_norm": 0.21685005724430084, + "learning_rate": 2.8299415663357332e-05, + "loss": 1.681, + "step": 21296 + }, + { + "epoch": 6.536832412523021, + "grad_norm": 0.23309725522994995, + "learning_rate": 2.8294937757227475e-05, + "loss": 1.7781, + "step": 21297 + }, + { + "epoch": 6.537139349294045, + "grad_norm": 0.26712173223495483, + "learning_rate": 2.829046006559976e-05, + "loss": 1.6966, + "step": 21298 + }, + { + "epoch": 6.53744628606507, + "grad_norm": 0.1836499124765396, + "learning_rate": 2.8285982588518428e-05, + "loss": 1.7192, + "step": 21299 + }, + { + "epoch": 6.537753222836096, + "grad_norm": 0.24073021113872528, + "learning_rate": 2.828150532602778e-05, + "loss": 1.6997, + "step": 21300 + }, + { + "epoch": 6.538060159607121, + "grad_norm": 0.16308051347732544, + "learning_rate": 2.8277028278172014e-05, + "loss": 1.6901, + "step": 21301 + }, + { + "epoch": 6.538367096378146, + "grad_norm": 0.2330634444952011, + "learning_rate": 2.8272551444995376e-05, + "loss": 1.7426, + "step": 21302 + }, + { + "epoch": 6.538674033149171, + "grad_norm": 0.18600425124168396, + "learning_rate": 2.8268074826542123e-05, + "loss": 1.6906, + "step": 21303 + }, + { + "epoch": 6.538980969920196, + "grad_norm": 0.24717238545417786, + "learning_rate": 2.8263598422856475e-05, + "loss": 1.6962, + "step": 21304 + }, + { + "epoch": 6.5392879066912215, + "grad_norm": 0.1907368302345276, + "learning_rate": 2.8259122233982727e-05, + "loss": 1.7083, + "step": 21305 + }, + { + "epoch": 6.539594843462247, + "grad_norm": 
0.22698798775672913, + "learning_rate": 2.8254646259965035e-05, + "loss": 1.7377, + "step": 21306 + }, + { + "epoch": 6.539901780233272, + "grad_norm": 0.19169457256793976, + "learning_rate": 2.8250170500847696e-05, + "loss": 1.7416, + "step": 21307 + }, + { + "epoch": 6.5402087170042975, + "grad_norm": 0.18730394542217255, + "learning_rate": 2.8245694956674918e-05, + "loss": 1.7273, + "step": 21308 + }, + { + "epoch": 6.540515653775322, + "grad_norm": 0.19813422858715057, + "learning_rate": 2.8241219627490927e-05, + "loss": 1.7638, + "step": 21309 + }, + { + "epoch": 6.540822590546347, + "grad_norm": 0.20460368692874908, + "learning_rate": 2.8236744513339965e-05, + "loss": 1.7266, + "step": 21310 + }, + { + "epoch": 6.541129527317373, + "grad_norm": 0.20448380708694458, + "learning_rate": 2.823226961426625e-05, + "loss": 1.7335, + "step": 21311 + }, + { + "epoch": 6.541436464088398, + "grad_norm": 0.21458712220191956, + "learning_rate": 2.8227794930314e-05, + "loss": 1.7274, + "step": 21312 + }, + { + "epoch": 6.541743400859423, + "grad_norm": 0.1964675635099411, + "learning_rate": 2.8223320461527442e-05, + "loss": 1.7514, + "step": 21313 + }, + { + "epoch": 6.542050337630448, + "grad_norm": 0.18982458114624023, + "learning_rate": 2.82188462079508e-05, + "loss": 1.6858, + "step": 21314 + }, + { + "epoch": 6.542357274401473, + "grad_norm": 0.21377761662006378, + "learning_rate": 2.8214372169628277e-05, + "loss": 1.727, + "step": 21315 + }, + { + "epoch": 6.542664211172498, + "grad_norm": 0.19484922289848328, + "learning_rate": 2.8209898346604087e-05, + "loss": 1.7646, + "step": 21316 + }, + { + "epoch": 6.542971147943524, + "grad_norm": 0.20614980161190033, + "learning_rate": 2.8205424738922488e-05, + "loss": 1.6705, + "step": 21317 + }, + { + "epoch": 6.543278084714549, + "grad_norm": 0.1888885796070099, + "learning_rate": 2.8200951346627636e-05, + "loss": 1.7854, + "step": 21318 + }, + { + "epoch": 6.543585021485574, + "grad_norm": 0.20957863330841064, + 
"learning_rate": 2.8196478169763763e-05, + "loss": 1.6971, + "step": 21319 + }, + { + "epoch": 6.543891958256599, + "grad_norm": 0.20744509994983673, + "learning_rate": 2.8192005208375073e-05, + "loss": 1.7408, + "step": 21320 + }, + { + "epoch": 6.544198895027624, + "grad_norm": 0.20038767158985138, + "learning_rate": 2.818753246250574e-05, + "loss": 1.7355, + "step": 21321 + }, + { + "epoch": 6.5445058317986495, + "grad_norm": 0.18535862863063812, + "learning_rate": 2.818305993220004e-05, + "loss": 1.7229, + "step": 21322 + }, + { + "epoch": 6.544812768569675, + "grad_norm": 0.2191225290298462, + "learning_rate": 2.8178587617502095e-05, + "loss": 1.7364, + "step": 21323 + }, + { + "epoch": 6.5451197053407, + "grad_norm": 0.2055424451828003, + "learning_rate": 2.8174115518456175e-05, + "loss": 1.7488, + "step": 21324 + }, + { + "epoch": 6.545426642111725, + "grad_norm": 0.22267968952655792, + "learning_rate": 2.8169643635106398e-05, + "loss": 1.6936, + "step": 21325 + }, + { + "epoch": 6.54573357888275, + "grad_norm": 0.20295512676239014, + "learning_rate": 2.8165171967497018e-05, + "loss": 1.7651, + "step": 21326 + }, + { + "epoch": 6.546040515653775, + "grad_norm": 0.25859618186950684, + "learning_rate": 2.81607005156722e-05, + "loss": 1.7264, + "step": 21327 + }, + { + "epoch": 6.546347452424801, + "grad_norm": 0.22232379019260406, + "learning_rate": 2.8156229279676143e-05, + "loss": 1.7282, + "step": 21328 + }, + { + "epoch": 6.546654389195826, + "grad_norm": 0.2548457682132721, + "learning_rate": 2.8151758259553035e-05, + "loss": 1.7137, + "step": 21329 + }, + { + "epoch": 6.546961325966851, + "grad_norm": 0.22040672600269318, + "learning_rate": 2.8147287455347055e-05, + "loss": 1.7553, + "step": 21330 + }, + { + "epoch": 6.547268262737876, + "grad_norm": 0.19622360169887543, + "learning_rate": 2.8142816867102388e-05, + "loss": 1.6502, + "step": 21331 + }, + { + "epoch": 6.547575199508901, + "grad_norm": 0.20849336683750153, + "learning_rate": 
2.813834649486322e-05, + "loss": 1.6824, + "step": 21332 + }, + { + "epoch": 6.547882136279926, + "grad_norm": 0.18474788963794708, + "learning_rate": 2.8133876338673703e-05, + "loss": 1.7136, + "step": 21333 + }, + { + "epoch": 6.548189073050952, + "grad_norm": 0.2421834021806717, + "learning_rate": 2.8129406398578074e-05, + "loss": 1.7841, + "step": 21334 + }, + { + "epoch": 6.548496009821976, + "grad_norm": 0.18089748919010162, + "learning_rate": 2.812493667462045e-05, + "loss": 1.6918, + "step": 21335 + }, + { + "epoch": 6.5488029465930016, + "grad_norm": 0.18575069308280945, + "learning_rate": 2.8120467166845022e-05, + "loss": 1.7098, + "step": 21336 + }, + { + "epoch": 6.549109883364027, + "grad_norm": 0.20840388536453247, + "learning_rate": 2.811599787529596e-05, + "loss": 1.7405, + "step": 21337 + }, + { + "epoch": 6.549416820135052, + "grad_norm": 0.19018858671188354, + "learning_rate": 2.811152880001742e-05, + "loss": 1.7098, + "step": 21338 + }, + { + "epoch": 6.5497237569060776, + "grad_norm": 0.22326117753982544, + "learning_rate": 2.8107059941053627e-05, + "loss": 1.7452, + "step": 21339 + }, + { + "epoch": 6.550030693677103, + "grad_norm": 0.26071304082870483, + "learning_rate": 2.8102591298448643e-05, + "loss": 1.7685, + "step": 21340 + }, + { + "epoch": 6.550337630448127, + "grad_norm": 0.2253575623035431, + "learning_rate": 2.8098122872246734e-05, + "loss": 1.8025, + "step": 21341 + }, + { + "epoch": 6.550644567219153, + "grad_norm": 0.2503850758075714, + "learning_rate": 2.8093654662491975e-05, + "loss": 1.7453, + "step": 21342 + }, + { + "epoch": 6.550951503990178, + "grad_norm": 0.18953700363636017, + "learning_rate": 2.808918666922858e-05, + "loss": 1.7549, + "step": 21343 + }, + { + "epoch": 6.551258440761203, + "grad_norm": 0.21360619366168976, + "learning_rate": 2.8084718892500685e-05, + "loss": 1.7363, + "step": 21344 + }, + { + "epoch": 6.551565377532229, + "grad_norm": 0.24622702598571777, + "learning_rate": 2.8080251332352437e-05, + 
"loss": 1.7325, + "step": 21345 + }, + { + "epoch": 6.551872314303253, + "grad_norm": 0.20079167187213898, + "learning_rate": 2.8075783988827997e-05, + "loss": 1.7478, + "step": 21346 + }, + { + "epoch": 6.5521792510742785, + "grad_norm": 0.2337643951177597, + "learning_rate": 2.807131686197151e-05, + "loss": 1.6683, + "step": 21347 + }, + { + "epoch": 6.552486187845304, + "grad_norm": 0.20815308392047882, + "learning_rate": 2.8066849951827123e-05, + "loss": 1.7436, + "step": 21348 + }, + { + "epoch": 6.552793124616329, + "grad_norm": 0.2450367957353592, + "learning_rate": 2.8062383258438972e-05, + "loss": 1.7464, + "step": 21349 + }, + { + "epoch": 6.5531000613873545, + "grad_norm": 0.232087641954422, + "learning_rate": 2.8057916781851222e-05, + "loss": 1.7378, + "step": 21350 + }, + { + "epoch": 6.55340699815838, + "grad_norm": 0.2254600077867508, + "learning_rate": 2.8053450522107993e-05, + "loss": 1.7299, + "step": 21351 + }, + { + "epoch": 6.553713934929404, + "grad_norm": 0.23282572627067566, + "learning_rate": 2.8048984479253425e-05, + "loss": 1.7512, + "step": 21352 + }, + { + "epoch": 6.55402087170043, + "grad_norm": 0.21826763451099396, + "learning_rate": 2.8044518653331665e-05, + "loss": 1.706, + "step": 21353 + }, + { + "epoch": 6.554327808471455, + "grad_norm": 0.20807425677776337, + "learning_rate": 2.804005304438683e-05, + "loss": 1.7013, + "step": 21354 + }, + { + "epoch": 6.55463474524248, + "grad_norm": 0.21791879832744598, + "learning_rate": 2.8035587652463046e-05, + "loss": 1.7312, + "step": 21355 + }, + { + "epoch": 6.554941682013506, + "grad_norm": 0.23205329477787018, + "learning_rate": 2.8031122477604505e-05, + "loss": 1.7166, + "step": 21356 + }, + { + "epoch": 6.55524861878453, + "grad_norm": 0.1910320371389389, + "learning_rate": 2.802665751985525e-05, + "loss": 1.694, + "step": 21357 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.24150735139846802, + "learning_rate": 2.8022192779259472e-05, + "loss": 1.7934, + "step": 21358 + }, 
+ { + "epoch": 6.555862492326581, + "grad_norm": 0.18308573961257935, + "learning_rate": 2.801772825586123e-05, + "loss": 1.6851, + "step": 21359 + }, + { + "epoch": 6.556169429097606, + "grad_norm": 0.28410083055496216, + "learning_rate": 2.8013263949704705e-05, + "loss": 1.7687, + "step": 21360 + }, + { + "epoch": 6.556476365868631, + "grad_norm": 0.21073146164417267, + "learning_rate": 2.8008799860833996e-05, + "loss": 1.711, + "step": 21361 + }, + { + "epoch": 6.556783302639657, + "grad_norm": 0.22758159041404724, + "learning_rate": 2.8004335989293213e-05, + "loss": 1.7495, + "step": 21362 + }, + { + "epoch": 6.557090239410681, + "grad_norm": 0.2112412452697754, + "learning_rate": 2.799987233512647e-05, + "loss": 1.7125, + "step": 21363 + }, + { + "epoch": 6.5573971761817065, + "grad_norm": 0.1804153323173523, + "learning_rate": 2.7995408898377884e-05, + "loss": 1.689, + "step": 21364 + }, + { + "epoch": 6.557704112952732, + "grad_norm": 0.17632657289505005, + "learning_rate": 2.7990945679091572e-05, + "loss": 1.6868, + "step": 21365 + }, + { + "epoch": 6.558011049723757, + "grad_norm": 0.17942996323108673, + "learning_rate": 2.7986482677311632e-05, + "loss": 1.7082, + "step": 21366 + }, + { + "epoch": 6.558317986494782, + "grad_norm": 0.278486967086792, + "learning_rate": 2.7982019893082167e-05, + "loss": 1.7909, + "step": 21367 + }, + { + "epoch": 6.558624923265807, + "grad_norm": 0.208990678191185, + "learning_rate": 2.797755732644729e-05, + "loss": 1.7643, + "step": 21368 + }, + { + "epoch": 6.558931860036832, + "grad_norm": 0.20375309884548187, + "learning_rate": 2.7973094977451096e-05, + "loss": 1.6957, + "step": 21369 + }, + { + "epoch": 6.559238796807858, + "grad_norm": 0.24685338139533997, + "learning_rate": 2.7968632846137694e-05, + "loss": 1.7574, + "step": 21370 + }, + { + "epoch": 6.559545733578883, + "grad_norm": 0.2237502634525299, + "learning_rate": 2.796417093255117e-05, + "loss": 1.7422, + "step": 21371 + }, + { + "epoch": 6.559852670349908, + 
"grad_norm": 0.22731846570968628, + "learning_rate": 2.795970923673561e-05, + "loss": 1.7594, + "step": 21372 + }, + { + "epoch": 6.560159607120933, + "grad_norm": 0.2518742084503174, + "learning_rate": 2.7955247758735158e-05, + "loss": 1.6817, + "step": 21373 + }, + { + "epoch": 6.560466543891958, + "grad_norm": 0.21982096135616302, + "learning_rate": 2.7950786498593827e-05, + "loss": 1.7289, + "step": 21374 + }, + { + "epoch": 6.560773480662983, + "grad_norm": 0.19061018526554108, + "learning_rate": 2.7946325456355787e-05, + "loss": 1.6809, + "step": 21375 + }, + { + "epoch": 6.561080417434009, + "grad_norm": 0.2023245394229889, + "learning_rate": 2.794186463206505e-05, + "loss": 1.7053, + "step": 21376 + }, + { + "epoch": 6.561387354205034, + "grad_norm": 0.18003186583518982, + "learning_rate": 2.7937404025765752e-05, + "loss": 1.6447, + "step": 21377 + }, + { + "epoch": 6.5616942909760585, + "grad_norm": 0.19133709371089935, + "learning_rate": 2.7932943637501956e-05, + "loss": 1.7677, + "step": 21378 + }, + { + "epoch": 6.562001227747084, + "grad_norm": 0.18476714193820953, + "learning_rate": 2.7928483467317746e-05, + "loss": 1.685, + "step": 21379 + }, + { + "epoch": 6.562308164518109, + "grad_norm": 0.2065780758857727, + "learning_rate": 2.79240235152572e-05, + "loss": 1.6827, + "step": 21380 + }, + { + "epoch": 6.5626151012891345, + "grad_norm": 0.1885409951210022, + "learning_rate": 2.79195637813644e-05, + "loss": 1.6819, + "step": 21381 + }, + { + "epoch": 6.56292203806016, + "grad_norm": 0.18055391311645508, + "learning_rate": 2.79151042656834e-05, + "loss": 1.7007, + "step": 21382 + }, + { + "epoch": 6.563228974831185, + "grad_norm": 0.25148439407348633, + "learning_rate": 2.7910644968258294e-05, + "loss": 1.7723, + "step": 21383 + }, + { + "epoch": 6.56353591160221, + "grad_norm": 0.2308066487312317, + "learning_rate": 2.7906185889133134e-05, + "loss": 1.7525, + "step": 21384 + }, + { + "epoch": 6.563842848373235, + "grad_norm": 0.19580784440040588, + 
"learning_rate": 2.7901727028351997e-05, + "loss": 1.7197, + "step": 21385 + }, + { + "epoch": 6.56414978514426, + "grad_norm": 0.19686979055404663, + "learning_rate": 2.7897268385958952e-05, + "loss": 1.6873, + "step": 21386 + }, + { + "epoch": 6.564456721915286, + "grad_norm": 0.2657351493835449, + "learning_rate": 2.7892809961998045e-05, + "loss": 1.7005, + "step": 21387 + }, + { + "epoch": 6.564763658686311, + "grad_norm": 0.20131130516529083, + "learning_rate": 2.7888351756513353e-05, + "loss": 1.7211, + "step": 21388 + }, + { + "epoch": 6.565070595457335, + "grad_norm": 0.2524282932281494, + "learning_rate": 2.7883893769548908e-05, + "loss": 1.7038, + "step": 21389 + }, + { + "epoch": 6.565377532228361, + "grad_norm": 0.1601654291152954, + "learning_rate": 2.787943600114883e-05, + "loss": 1.691, + "step": 21390 + }, + { + "epoch": 6.565684468999386, + "grad_norm": 0.25074124336242676, + "learning_rate": 2.787497845135709e-05, + "loss": 1.688, + "step": 21391 + }, + { + "epoch": 6.565991405770411, + "grad_norm": 0.19491349160671234, + "learning_rate": 2.787052112021782e-05, + "loss": 1.7108, + "step": 21392 + }, + { + "epoch": 6.566298342541437, + "grad_norm": 0.23931637406349182, + "learning_rate": 2.786606400777499e-05, + "loss": 1.7315, + "step": 21393 + }, + { + "epoch": 6.566605279312462, + "grad_norm": 0.1643616110086441, + "learning_rate": 2.786160711407271e-05, + "loss": 1.6745, + "step": 21394 + }, + { + "epoch": 6.5669122160834865, + "grad_norm": 0.17805394530296326, + "learning_rate": 2.7857150439155e-05, + "loss": 1.6817, + "step": 21395 + }, + { + "epoch": 6.567219152854512, + "grad_norm": 0.20370139181613922, + "learning_rate": 2.7852693983065913e-05, + "loss": 1.7173, + "step": 21396 + }, + { + "epoch": 6.567526089625537, + "grad_norm": 0.1620296984910965, + "learning_rate": 2.784823774584948e-05, + "loss": 1.7135, + "step": 21397 + }, + { + "epoch": 6.5678330263965625, + "grad_norm": 0.19116036593914032, + "learning_rate": 
2.7843781727549752e-05, + "loss": 1.6815, + "step": 21398 + }, + { + "epoch": 6.568139963167588, + "grad_norm": 0.20118895173072815, + "learning_rate": 2.7839325928210757e-05, + "loss": 1.7336, + "step": 21399 + }, + { + "epoch": 6.568446899938612, + "grad_norm": 0.198282390832901, + "learning_rate": 2.7834870347876528e-05, + "loss": 1.7379, + "step": 21400 + }, + { + "epoch": 6.568753836709638, + "grad_norm": 0.19203920662403107, + "learning_rate": 2.7830414986591104e-05, + "loss": 1.6913, + "step": 21401 + }, + { + "epoch": 6.569060773480663, + "grad_norm": 0.24601610004901886, + "learning_rate": 2.7825959844398507e-05, + "loss": 1.7842, + "step": 21402 + }, + { + "epoch": 6.569367710251688, + "grad_norm": 0.19069935381412506, + "learning_rate": 2.7821504921342777e-05, + "loss": 1.706, + "step": 21403 + }, + { + "epoch": 6.569674647022714, + "grad_norm": 0.20221085846424103, + "learning_rate": 2.7817050217467945e-05, + "loss": 1.7223, + "step": 21404 + }, + { + "epoch": 6.569981583793739, + "grad_norm": 0.2129664123058319, + "learning_rate": 2.781259573281801e-05, + "loss": 1.7429, + "step": 21405 + }, + { + "epoch": 6.570288520564763, + "grad_norm": 0.20684000849723816, + "learning_rate": 2.7808141467436993e-05, + "loss": 1.7349, + "step": 21406 + }, + { + "epoch": 6.570595457335789, + "grad_norm": 0.2153804898262024, + "learning_rate": 2.7803687421368968e-05, + "loss": 1.7245, + "step": 21407 + }, + { + "epoch": 6.570902394106814, + "grad_norm": 0.245448499917984, + "learning_rate": 2.7799233594657875e-05, + "loss": 1.7102, + "step": 21408 + }, + { + "epoch": 6.571209330877839, + "grad_norm": 0.18146783113479614, + "learning_rate": 2.7794779987347807e-05, + "loss": 1.6777, + "step": 21409 + }, + { + "epoch": 6.571516267648864, + "grad_norm": 0.21388854086399078, + "learning_rate": 2.7790326599482698e-05, + "loss": 1.7263, + "step": 21410 + }, + { + "epoch": 6.571823204419889, + "grad_norm": 0.2242165058851242, + "learning_rate": 2.7785873431106625e-05, + 
"loss": 1.7624, + "step": 21411 + }, + { + "epoch": 6.5721301411909145, + "grad_norm": 0.23132537305355072, + "learning_rate": 2.7781420482263565e-05, + "loss": 1.7013, + "step": 21412 + }, + { + "epoch": 6.57243707796194, + "grad_norm": 0.21074987947940826, + "learning_rate": 2.777696775299753e-05, + "loss": 1.7111, + "step": 21413 + }, + { + "epoch": 6.572744014732965, + "grad_norm": 0.2933674156665802, + "learning_rate": 2.7772515243352525e-05, + "loss": 1.7515, + "step": 21414 + }, + { + "epoch": 6.5730509515039905, + "grad_norm": 0.2100256085395813, + "learning_rate": 2.7768062953372552e-05, + "loss": 1.7425, + "step": 21415 + }, + { + "epoch": 6.573357888275015, + "grad_norm": 0.21765680611133575, + "learning_rate": 2.776361088310161e-05, + "loss": 1.7064, + "step": 21416 + }, + { + "epoch": 6.57366482504604, + "grad_norm": 0.205422043800354, + "learning_rate": 2.7759159032583702e-05, + "loss": 1.7458, + "step": 21417 + }, + { + "epoch": 6.573971761817066, + "grad_norm": 0.2009960114955902, + "learning_rate": 2.775470740186282e-05, + "loss": 1.7111, + "step": 21418 + }, + { + "epoch": 6.574278698588091, + "grad_norm": 0.18974804878234863, + "learning_rate": 2.7750255990982955e-05, + "loss": 1.7385, + "step": 21419 + }, + { + "epoch": 6.574585635359116, + "grad_norm": 0.1784054934978485, + "learning_rate": 2.7745804799988106e-05, + "loss": 1.7129, + "step": 21420 + }, + { + "epoch": 6.574892572130141, + "grad_norm": 0.2047782689332962, + "learning_rate": 2.7741353828922258e-05, + "loss": 1.6972, + "step": 21421 + }, + { + "epoch": 6.575199508901166, + "grad_norm": 0.18886682391166687, + "learning_rate": 2.773690307782939e-05, + "loss": 1.6564, + "step": 21422 + }, + { + "epoch": 6.5755064456721914, + "grad_norm": 0.2088952213525772, + "learning_rate": 2.7732452546753484e-05, + "loss": 1.7309, + "step": 21423 + }, + { + "epoch": 6.575813382443217, + "grad_norm": 0.20526883006095886, + "learning_rate": 2.7728002235738565e-05, + "loss": 1.6811, + "step": 21424 + 
}, + { + "epoch": 6.576120319214242, + "grad_norm": 0.19648446142673492, + "learning_rate": 2.7723552144828545e-05, + "loss": 1.7237, + "step": 21425 + }, + { + "epoch": 6.5764272559852675, + "grad_norm": 0.22405673563480377, + "learning_rate": 2.7719102274067484e-05, + "loss": 1.7454, + "step": 21426 + }, + { + "epoch": 6.576734192756292, + "grad_norm": 0.24119171500205994, + "learning_rate": 2.7714652623499265e-05, + "loss": 1.7106, + "step": 21427 + }, + { + "epoch": 6.577041129527317, + "grad_norm": 0.2127196192741394, + "learning_rate": 2.771020319316794e-05, + "loss": 1.7895, + "step": 21428 + }, + { + "epoch": 6.577348066298343, + "grad_norm": 0.23805706202983856, + "learning_rate": 2.7705753983117443e-05, + "loss": 1.739, + "step": 21429 + }, + { + "epoch": 6.577655003069368, + "grad_norm": 0.24212954938411713, + "learning_rate": 2.7701304993391753e-05, + "loss": 1.683, + "step": 21430 + }, + { + "epoch": 6.577961939840393, + "grad_norm": 0.1946132481098175, + "learning_rate": 2.769685622403484e-05, + "loss": 1.6953, + "step": 21431 + }, + { + "epoch": 6.578268876611418, + "grad_norm": 0.2465951144695282, + "learning_rate": 2.769240767509067e-05, + "loss": 1.6594, + "step": 21432 + }, + { + "epoch": 6.578575813382443, + "grad_norm": 0.17029622197151184, + "learning_rate": 2.76879593466032e-05, + "loss": 1.6977, + "step": 21433 + }, + { + "epoch": 6.578882750153468, + "grad_norm": 0.23793117702007294, + "learning_rate": 2.7683511238616388e-05, + "loss": 1.6709, + "step": 21434 + }, + { + "epoch": 6.579189686924494, + "grad_norm": 0.20149341225624084, + "learning_rate": 2.76790633511742e-05, + "loss": 1.8074, + "step": 21435 + }, + { + "epoch": 6.579496623695519, + "grad_norm": 0.25029948353767395, + "learning_rate": 2.7674615684320593e-05, + "loss": 1.6649, + "step": 21436 + }, + { + "epoch": 6.579803560466544, + "grad_norm": 0.22212490439414978, + "learning_rate": 2.7670168238099515e-05, + "loss": 1.7322, + "step": 21437 + }, + { + "epoch": 
6.580110497237569, + "grad_norm": 0.26087918877601624, + "learning_rate": 2.7665721012554925e-05, + "loss": 1.7285, + "step": 21438 + }, + { + "epoch": 6.580417434008594, + "grad_norm": 0.19286726415157318, + "learning_rate": 2.7661274007730776e-05, + "loss": 1.6912, + "step": 21439 + }, + { + "epoch": 6.5807243707796195, + "grad_norm": 0.23935118317604065, + "learning_rate": 2.7656827223670982e-05, + "loss": 1.6929, + "step": 21440 + }, + { + "epoch": 6.581031307550645, + "grad_norm": 0.2263423204421997, + "learning_rate": 2.7652380660419563e-05, + "loss": 1.6786, + "step": 21441 + }, + { + "epoch": 6.581338244321669, + "grad_norm": 0.19788038730621338, + "learning_rate": 2.7647934318020373e-05, + "loss": 1.7906, + "step": 21442 + }, + { + "epoch": 6.581645181092695, + "grad_norm": 0.25891759991645813, + "learning_rate": 2.7643488196517435e-05, + "loss": 1.7691, + "step": 21443 + }, + { + "epoch": 6.58195211786372, + "grad_norm": 0.25175485014915466, + "learning_rate": 2.7639042295954615e-05, + "loss": 1.7329, + "step": 21444 + }, + { + "epoch": 6.582259054634745, + "grad_norm": 0.1860336810350418, + "learning_rate": 2.7634596616375908e-05, + "loss": 1.7348, + "step": 21445 + }, + { + "epoch": 6.582565991405771, + "grad_norm": 0.2704271972179413, + "learning_rate": 2.7630151157825218e-05, + "loss": 1.7199, + "step": 21446 + }, + { + "epoch": 6.582872928176796, + "grad_norm": 0.16306720674037933, + "learning_rate": 2.762570592034649e-05, + "loss": 1.7174, + "step": 21447 + }, + { + "epoch": 6.58317986494782, + "grad_norm": 0.2585636079311371, + "learning_rate": 2.7621260903983648e-05, + "loss": 1.7392, + "step": 21448 + }, + { + "epoch": 6.583486801718846, + "grad_norm": 0.2086072564125061, + "learning_rate": 2.7616816108780623e-05, + "loss": 1.7417, + "step": 21449 + }, + { + "epoch": 6.583793738489871, + "grad_norm": 0.1747613251209259, + "learning_rate": 2.7612371534781343e-05, + "loss": 1.6607, + "step": 21450 + }, + { + "epoch": 6.584100675260896, + 
"grad_norm": 0.21026404201984406, + "learning_rate": 2.7607927182029726e-05, + "loss": 1.7725, + "step": 21451 + }, + { + "epoch": 6.584407612031922, + "grad_norm": 0.17881789803504944, + "learning_rate": 2.76034830505697e-05, + "loss": 1.7502, + "step": 21452 + }, + { + "epoch": 6.584714548802946, + "grad_norm": 0.2503713369369507, + "learning_rate": 2.7599039140445182e-05, + "loss": 1.798, + "step": 21453 + }, + { + "epoch": 6.5850214855739715, + "grad_norm": 0.22163939476013184, + "learning_rate": 2.7594595451700083e-05, + "loss": 1.725, + "step": 21454 + }, + { + "epoch": 6.585328422344997, + "grad_norm": 0.2154664546251297, + "learning_rate": 2.759015198437833e-05, + "loss": 1.7917, + "step": 21455 + }, + { + "epoch": 6.585635359116022, + "grad_norm": 0.1814090609550476, + "learning_rate": 2.7585708738523823e-05, + "loss": 1.6562, + "step": 21456 + }, + { + "epoch": 6.5859422958870475, + "grad_norm": 0.18815121054649353, + "learning_rate": 2.758126571418049e-05, + "loss": 1.6833, + "step": 21457 + }, + { + "epoch": 6.586249232658073, + "grad_norm": 0.19383473694324493, + "learning_rate": 2.757682291139222e-05, + "loss": 1.6987, + "step": 21458 + }, + { + "epoch": 6.586556169429097, + "grad_norm": 0.19574831426143646, + "learning_rate": 2.7572380330202912e-05, + "loss": 1.7231, + "step": 21459 + }, + { + "epoch": 6.586863106200123, + "grad_norm": 0.17509032785892487, + "learning_rate": 2.7567937970656527e-05, + "loss": 1.6452, + "step": 21460 + }, + { + "epoch": 6.587170042971148, + "grad_norm": 0.19439785182476044, + "learning_rate": 2.7563495832796886e-05, + "loss": 1.7168, + "step": 21461 + }, + { + "epoch": 6.587476979742173, + "grad_norm": 0.17384520173072815, + "learning_rate": 2.7559053916667953e-05, + "loss": 1.7128, + "step": 21462 + }, + { + "epoch": 6.587783916513199, + "grad_norm": 0.18308506906032562, + "learning_rate": 2.7554612222313597e-05, + "loss": 1.7184, + "step": 21463 + }, + { + "epoch": 6.588090853284223, + "grad_norm": 
0.20052805542945862, + "learning_rate": 2.7550170749777726e-05, + "loss": 1.7239, + "step": 21464 + }, + { + "epoch": 6.588397790055248, + "grad_norm": 0.21892015635967255, + "learning_rate": 2.7545729499104215e-05, + "loss": 1.7297, + "step": 21465 + }, + { + "epoch": 6.588704726826274, + "grad_norm": 0.19819483160972595, + "learning_rate": 2.7541288470336973e-05, + "loss": 1.7303, + "step": 21466 + }, + { + "epoch": 6.589011663597299, + "grad_norm": 0.24296818673610687, + "learning_rate": 2.7536847663519884e-05, + "loss": 1.8525, + "step": 21467 + }, + { + "epoch": 6.589318600368324, + "grad_norm": 0.1971593201160431, + "learning_rate": 2.753240707869683e-05, + "loss": 1.7396, + "step": 21468 + }, + { + "epoch": 6.58962553713935, + "grad_norm": 0.24418935179710388, + "learning_rate": 2.7527966715911696e-05, + "loss": 1.7414, + "step": 21469 + }, + { + "epoch": 6.589932473910374, + "grad_norm": 0.2193990796804428, + "learning_rate": 2.7523526575208368e-05, + "loss": 1.7243, + "step": 21470 + }, + { + "epoch": 6.5902394106813995, + "grad_norm": 0.23612114787101746, + "learning_rate": 2.7519086656630722e-05, + "loss": 1.7072, + "step": 21471 + }, + { + "epoch": 6.590546347452425, + "grad_norm": 0.22282655537128448, + "learning_rate": 2.751464696022264e-05, + "loss": 1.7423, + "step": 21472 + }, + { + "epoch": 6.59085328422345, + "grad_norm": 0.21411976218223572, + "learning_rate": 2.7510207486027995e-05, + "loss": 1.7397, + "step": 21473 + }, + { + "epoch": 6.5911602209944755, + "grad_norm": 0.2244768589735031, + "learning_rate": 2.7505768234090663e-05, + "loss": 1.6964, + "step": 21474 + }, + { + "epoch": 6.5914671577655, + "grad_norm": 0.2250032275915146, + "learning_rate": 2.7501329204454512e-05, + "loss": 1.7307, + "step": 21475 + }, + { + "epoch": 6.591774094536525, + "grad_norm": 0.2643435299396515, + "learning_rate": 2.7496890397163395e-05, + "loss": 1.7298, + "step": 21476 + }, + { + "epoch": 6.592081031307551, + "grad_norm": 0.2204463928937912, + 
"learning_rate": 2.7492451812261232e-05, + "loss": 1.723, + "step": 21477 + }, + { + "epoch": 6.592387968078576, + "grad_norm": 0.2278377115726471, + "learning_rate": 2.7488013449791816e-05, + "loss": 1.7597, + "step": 21478 + }, + { + "epoch": 6.592694904849601, + "grad_norm": 0.18430690467357635, + "learning_rate": 2.7483575309799086e-05, + "loss": 1.6314, + "step": 21479 + }, + { + "epoch": 6.593001841620627, + "grad_norm": 0.26019781827926636, + "learning_rate": 2.7479137392326827e-05, + "loss": 1.7362, + "step": 21480 + }, + { + "epoch": 6.593308778391651, + "grad_norm": 0.2103995382785797, + "learning_rate": 2.7474699697418936e-05, + "loss": 1.7137, + "step": 21481 + }, + { + "epoch": 6.593615715162676, + "grad_norm": 0.220427006483078, + "learning_rate": 2.747026222511928e-05, + "loss": 1.7323, + "step": 21482 + }, + { + "epoch": 6.593922651933702, + "grad_norm": 0.21523109078407288, + "learning_rate": 2.7465824975471693e-05, + "loss": 1.7572, + "step": 21483 + }, + { + "epoch": 6.594229588704727, + "grad_norm": 0.21639512479305267, + "learning_rate": 2.7461387948520033e-05, + "loss": 1.7275, + "step": 21484 + }, + { + "epoch": 6.5945365254757515, + "grad_norm": 0.2043544203042984, + "learning_rate": 2.7456951144308147e-05, + "loss": 1.7454, + "step": 21485 + }, + { + "epoch": 6.594843462246777, + "grad_norm": 0.17847217619419098, + "learning_rate": 2.7452514562879882e-05, + "loss": 1.7356, + "step": 21486 + }, + { + "epoch": 6.595150399017802, + "grad_norm": 0.20756758749485016, + "learning_rate": 2.744807820427908e-05, + "loss": 1.7557, + "step": 21487 + }, + { + "epoch": 6.5954573357888275, + "grad_norm": 0.23579071462154388, + "learning_rate": 2.744364206854959e-05, + "loss": 1.7855, + "step": 21488 + }, + { + "epoch": 6.595764272559853, + "grad_norm": 0.1947307586669922, + "learning_rate": 2.7439206155735254e-05, + "loss": 1.7105, + "step": 21489 + }, + { + "epoch": 6.596071209330878, + "grad_norm": 0.1900642365217209, + "learning_rate": 
2.74347704658799e-05, + "loss": 1.6692, + "step": 21490 + }, + { + "epoch": 6.596378146101903, + "grad_norm": 0.16756244003772736, + "learning_rate": 2.7430334999027375e-05, + "loss": 1.7175, + "step": 21491 + }, + { + "epoch": 6.596685082872928, + "grad_norm": 0.18581146001815796, + "learning_rate": 2.7425899755221506e-05, + "loss": 1.72, + "step": 21492 + }, + { + "epoch": 6.596992019643953, + "grad_norm": 0.2384853959083557, + "learning_rate": 2.7421464734506107e-05, + "loss": 1.718, + "step": 21493 + }, + { + "epoch": 6.597298956414979, + "grad_norm": 0.16853606700897217, + "learning_rate": 2.7417029936925065e-05, + "loss": 1.6819, + "step": 21494 + }, + { + "epoch": 6.597605893186004, + "grad_norm": 0.2273230254650116, + "learning_rate": 2.741259536252213e-05, + "loss": 1.7158, + "step": 21495 + }, + { + "epoch": 6.597912829957028, + "grad_norm": 0.2291530966758728, + "learning_rate": 2.7408161011341205e-05, + "loss": 1.7804, + "step": 21496 + }, + { + "epoch": 6.598219766728054, + "grad_norm": 0.17676831781864166, + "learning_rate": 2.740372688342604e-05, + "loss": 1.6693, + "step": 21497 + }, + { + "epoch": 6.598526703499079, + "grad_norm": 0.2386767417192459, + "learning_rate": 2.7399292978820508e-05, + "loss": 1.6932, + "step": 21498 + }, + { + "epoch": 6.598833640270104, + "grad_norm": 0.21329782903194427, + "learning_rate": 2.739485929756841e-05, + "loss": 1.7811, + "step": 21499 + }, + { + "epoch": 6.59914057704113, + "grad_norm": 0.19382116198539734, + "learning_rate": 2.7390425839713556e-05, + "loss": 1.7152, + "step": 21500 + }, + { + "epoch": 6.599447513812155, + "grad_norm": 0.1819920688867569, + "learning_rate": 2.738599260529977e-05, + "loss": 1.6571, + "step": 21501 + }, + { + "epoch": 6.5997544505831796, + "grad_norm": 0.19947806000709534, + "learning_rate": 2.738155959437086e-05, + "loss": 1.7138, + "step": 21502 + }, + { + "epoch": 6.600061387354205, + "grad_norm": 0.1851014792919159, + "learning_rate": 2.7377126806970634e-05, + "loss": 
1.7109, + "step": 21503 + }, + { + "epoch": 6.60036832412523, + "grad_norm": 0.20365974307060242, + "learning_rate": 2.7372694243142905e-05, + "loss": 1.7145, + "step": 21504 + }, + { + "epoch": 6.600675260896256, + "grad_norm": 0.2070893943309784, + "learning_rate": 2.736826190293147e-05, + "loss": 1.7172, + "step": 21505 + }, + { + "epoch": 6.600982197667281, + "grad_norm": 0.19077777862548828, + "learning_rate": 2.7363829786380136e-05, + "loss": 1.7059, + "step": 21506 + }, + { + "epoch": 6.601289134438305, + "grad_norm": 0.21168744564056396, + "learning_rate": 2.73593978935327e-05, + "loss": 1.7483, + "step": 21507 + }, + { + "epoch": 6.601596071209331, + "grad_norm": 0.20746631920337677, + "learning_rate": 2.7354966224432965e-05, + "loss": 1.7165, + "step": 21508 + }, + { + "epoch": 6.601903007980356, + "grad_norm": 0.19440631568431854, + "learning_rate": 2.7350534779124732e-05, + "loss": 1.694, + "step": 21509 + }, + { + "epoch": 6.602209944751381, + "grad_norm": 0.20699405670166016, + "learning_rate": 2.7346103557651765e-05, + "loss": 1.7077, + "step": 21510 + }, + { + "epoch": 6.602516881522407, + "grad_norm": 0.19856512546539307, + "learning_rate": 2.7341672560057917e-05, + "loss": 1.77, + "step": 21511 + }, + { + "epoch": 6.602823818293432, + "grad_norm": 0.23978421092033386, + "learning_rate": 2.7337241786386915e-05, + "loss": 1.7531, + "step": 21512 + }, + { + "epoch": 6.6031307550644565, + "grad_norm": 0.1834867000579834, + "learning_rate": 2.73328112366826e-05, + "loss": 1.751, + "step": 21513 + }, + { + "epoch": 6.603437691835482, + "grad_norm": 0.2154606282711029, + "learning_rate": 2.7328380910988694e-05, + "loss": 1.737, + "step": 21514 + }, + { + "epoch": 6.603744628606507, + "grad_norm": 0.20554645359516144, + "learning_rate": 2.7323950809349035e-05, + "loss": 1.7629, + "step": 21515 + }, + { + "epoch": 6.6040515653775325, + "grad_norm": 0.20497548580169678, + "learning_rate": 2.7319520931807386e-05, + "loss": 1.7001, + "step": 21516 + }, + { + 
"epoch": 6.604358502148557, + "grad_norm": 0.18628253042697906, + "learning_rate": 2.7315091278407523e-05, + "loss": 1.7477, + "step": 21517 + }, + { + "epoch": 6.604665438919582, + "grad_norm": 0.20788705348968506, + "learning_rate": 2.731066184919323e-05, + "loss": 1.7185, + "step": 21518 + }, + { + "epoch": 6.604972375690608, + "grad_norm": 0.17834967374801636, + "learning_rate": 2.730623264420827e-05, + "loss": 1.67, + "step": 21519 + }, + { + "epoch": 6.605279312461633, + "grad_norm": 0.2183784693479538, + "learning_rate": 2.7301803663496417e-05, + "loss": 1.6983, + "step": 21520 + }, + { + "epoch": 6.605586249232658, + "grad_norm": 0.1735544204711914, + "learning_rate": 2.7297374907101447e-05, + "loss": 1.7352, + "step": 21521 + }, + { + "epoch": 6.605893186003684, + "grad_norm": 0.2504538893699646, + "learning_rate": 2.729294637506713e-05, + "loss": 1.7332, + "step": 21522 + }, + { + "epoch": 6.606200122774708, + "grad_norm": 0.1801074892282486, + "learning_rate": 2.728851806743722e-05, + "loss": 1.7251, + "step": 21523 + }, + { + "epoch": 6.606507059545733, + "grad_norm": 0.25701379776000977, + "learning_rate": 2.728408998425549e-05, + "loss": 1.732, + "step": 21524 + }, + { + "epoch": 6.606813996316759, + "grad_norm": 0.1801779717206955, + "learning_rate": 2.7279662125565697e-05, + "loss": 1.6793, + "step": 21525 + }, + { + "epoch": 6.607120933087784, + "grad_norm": 0.21244947612285614, + "learning_rate": 2.7275234491411595e-05, + "loss": 1.7493, + "step": 21526 + }, + { + "epoch": 6.607427869858809, + "grad_norm": 0.20944559574127197, + "learning_rate": 2.7270807081836924e-05, + "loss": 1.722, + "step": 21527 + }, + { + "epoch": 6.607734806629834, + "grad_norm": 0.2526783049106598, + "learning_rate": 2.7266379896885508e-05, + "loss": 1.7628, + "step": 21528 + }, + { + "epoch": 6.608041743400859, + "grad_norm": 0.19788937270641327, + "learning_rate": 2.7261952936601002e-05, + "loss": 1.6538, + "step": 21529 + }, + { + "epoch": 6.6083486801718845, + 
"grad_norm": 0.2623229920864105, + "learning_rate": 2.725752620102725e-05, + "loss": 1.7694, + "step": 21530 + }, + { + "epoch": 6.60865561694291, + "grad_norm": 0.21503256261348724, + "learning_rate": 2.7253099690207913e-05, + "loss": 1.7553, + "step": 21531 + }, + { + "epoch": 6.608962553713935, + "grad_norm": 0.2114928811788559, + "learning_rate": 2.724867340418679e-05, + "loss": 1.7067, + "step": 21532 + }, + { + "epoch": 6.6092694904849605, + "grad_norm": 0.17945198714733124, + "learning_rate": 2.7244247343007623e-05, + "loss": 1.7419, + "step": 21533 + }, + { + "epoch": 6.609576427255985, + "grad_norm": 0.19239214062690735, + "learning_rate": 2.7239821506714137e-05, + "loss": 1.7644, + "step": 21534 + }, + { + "epoch": 6.60988336402701, + "grad_norm": 0.22906997799873352, + "learning_rate": 2.7235395895350068e-05, + "loss": 1.8063, + "step": 21535 + }, + { + "epoch": 6.610190300798036, + "grad_norm": 0.1965717375278473, + "learning_rate": 2.7230970508959162e-05, + "loss": 1.7841, + "step": 21536 + }, + { + "epoch": 6.610497237569061, + "grad_norm": 0.19944418966770172, + "learning_rate": 2.7226545347585158e-05, + "loss": 1.7382, + "step": 21537 + }, + { + "epoch": 6.610804174340086, + "grad_norm": 0.17155805230140686, + "learning_rate": 2.722212041127178e-05, + "loss": 1.6621, + "step": 21538 + }, + { + "epoch": 6.611111111111111, + "grad_norm": 0.20459938049316406, + "learning_rate": 2.721769570006275e-05, + "loss": 1.7481, + "step": 21539 + }, + { + "epoch": 6.611418047882136, + "grad_norm": 0.1991354376077652, + "learning_rate": 2.7213271214001813e-05, + "loss": 1.7874, + "step": 21540 + }, + { + "epoch": 6.611724984653161, + "grad_norm": 0.25073128938674927, + "learning_rate": 2.7208846953132682e-05, + "loss": 1.7921, + "step": 21541 + }, + { + "epoch": 6.612031921424187, + "grad_norm": 0.24456258118152618, + "learning_rate": 2.7204422917499085e-05, + "loss": 1.7564, + "step": 21542 + }, + { + "epoch": 6.612338858195212, + "grad_norm": 
0.18416531383991241, + "learning_rate": 2.7199999107144736e-05, + "loss": 1.7247, + "step": 21543 + }, + { + "epoch": 6.612645794966237, + "grad_norm": 0.18439221382141113, + "learning_rate": 2.7195575522113347e-05, + "loss": 1.6607, + "step": 21544 + }, + { + "epoch": 6.612952731737262, + "grad_norm": 0.20334671437740326, + "learning_rate": 2.7191152162448685e-05, + "loss": 1.7487, + "step": 21545 + }, + { + "epoch": 6.613259668508287, + "grad_norm": 0.17871633172035217, + "learning_rate": 2.718672902819438e-05, + "loss": 1.7355, + "step": 21546 + }, + { + "epoch": 6.6135666052793125, + "grad_norm": 0.23006688058376312, + "learning_rate": 2.718230611939424e-05, + "loss": 1.6489, + "step": 21547 + }, + { + "epoch": 6.613873542050338, + "grad_norm": 0.19141538441181183, + "learning_rate": 2.7177883436091877e-05, + "loss": 1.6793, + "step": 21548 + }, + { + "epoch": 6.614180478821363, + "grad_norm": 0.20549756288528442, + "learning_rate": 2.7173460978331068e-05, + "loss": 1.8331, + "step": 21549 + }, + { + "epoch": 6.614487415592388, + "grad_norm": 0.19106455147266388, + "learning_rate": 2.7169038746155495e-05, + "loss": 1.7295, + "step": 21550 + }, + { + "epoch": 6.614794352363413, + "grad_norm": 0.20190143585205078, + "learning_rate": 2.7164616739608866e-05, + "loss": 1.7032, + "step": 21551 + }, + { + "epoch": 6.615101289134438, + "grad_norm": 0.1969708949327469, + "learning_rate": 2.716019495873488e-05, + "loss": 1.6935, + "step": 21552 + }, + { + "epoch": 6.615408225905464, + "grad_norm": 0.23748311400413513, + "learning_rate": 2.7155773403577235e-05, + "loss": 1.7942, + "step": 21553 + }, + { + "epoch": 6.615715162676489, + "grad_norm": 0.29168081283569336, + "learning_rate": 2.715135207417962e-05, + "loss": 1.7121, + "step": 21554 + }, + { + "epoch": 6.616022099447514, + "grad_norm": 0.2428344041109085, + "learning_rate": 2.7146930970585738e-05, + "loss": 1.7287, + "step": 21555 + }, + { + "epoch": 6.616329036218539, + "grad_norm": 0.2520657479763031, + 
"learning_rate": 2.714251009283928e-05, + "loss": 1.8462, + "step": 21556 + }, + { + "epoch": 6.616635972989564, + "grad_norm": 0.2426053285598755, + "learning_rate": 2.713808944098394e-05, + "loss": 1.7094, + "step": 21557 + }, + { + "epoch": 6.616942909760589, + "grad_norm": 0.17593255639076233, + "learning_rate": 2.713366901506339e-05, + "loss": 1.6891, + "step": 21558 + }, + { + "epoch": 6.617249846531615, + "grad_norm": 0.20620940625667572, + "learning_rate": 2.7129248815121332e-05, + "loss": 1.7277, + "step": 21559 + }, + { + "epoch": 6.617556783302639, + "grad_norm": 0.21467719972133636, + "learning_rate": 2.7124828841201445e-05, + "loss": 1.7543, + "step": 21560 + }, + { + "epoch": 6.6178637200736645, + "grad_norm": 0.21372607350349426, + "learning_rate": 2.7120409093347378e-05, + "loss": 1.7207, + "step": 21561 + }, + { + "epoch": 6.61817065684469, + "grad_norm": 0.2123684585094452, + "learning_rate": 2.7115989571602884e-05, + "loss": 1.71, + "step": 21562 + }, + { + "epoch": 6.618477593615715, + "grad_norm": 0.19155478477478027, + "learning_rate": 2.711157027601155e-05, + "loss": 1.7182, + "step": 21563 + }, + { + "epoch": 6.6187845303867405, + "grad_norm": 0.23053184151649475, + "learning_rate": 2.7107151206617136e-05, + "loss": 1.7147, + "step": 21564 + }, + { + "epoch": 6.619091467157766, + "grad_norm": 0.1635691374540329, + "learning_rate": 2.7102732363463235e-05, + "loss": 1.6913, + "step": 21565 + }, + { + "epoch": 6.61939840392879, + "grad_norm": 0.19415298104286194, + "learning_rate": 2.709831374659357e-05, + "loss": 1.6813, + "step": 21566 + }, + { + "epoch": 6.619705340699816, + "grad_norm": 0.19547943770885468, + "learning_rate": 2.709389535605179e-05, + "loss": 1.6988, + "step": 21567 + }, + { + "epoch": 6.620012277470841, + "grad_norm": 0.1921805888414383, + "learning_rate": 2.7089477191881564e-05, + "loss": 1.6931, + "step": 21568 + }, + { + "epoch": 6.620319214241866, + "grad_norm": 0.18463274836540222, + "learning_rate": 
2.7085059254126554e-05, + "loss": 1.7168, + "step": 21569 + }, + { + "epoch": 6.620626151012892, + "grad_norm": 0.2078532725572586, + "learning_rate": 2.7080641542830414e-05, + "loss": 1.7248, + "step": 21570 + }, + { + "epoch": 6.620933087783916, + "grad_norm": 0.18778283894062042, + "learning_rate": 2.7076224058036813e-05, + "loss": 1.6745, + "step": 21571 + }, + { + "epoch": 6.621240024554941, + "grad_norm": 0.26190707087516785, + "learning_rate": 2.70718067997894e-05, + "loss": 1.7317, + "step": 21572 + }, + { + "epoch": 6.621546961325967, + "grad_norm": 0.20449557900428772, + "learning_rate": 2.7067389768131836e-05, + "loss": 1.7167, + "step": 21573 + }, + { + "epoch": 6.621853898096992, + "grad_norm": 0.22722119092941284, + "learning_rate": 2.706297296310776e-05, + "loss": 1.7262, + "step": 21574 + }, + { + "epoch": 6.622160834868017, + "grad_norm": 0.24897173047065735, + "learning_rate": 2.7058556384760825e-05, + "loss": 1.7273, + "step": 21575 + }, + { + "epoch": 6.622467771639043, + "grad_norm": 0.19774340093135834, + "learning_rate": 2.705414003313469e-05, + "loss": 1.6765, + "step": 21576 + }, + { + "epoch": 6.622774708410067, + "grad_norm": 0.2661767303943634, + "learning_rate": 2.7049723908272995e-05, + "loss": 1.7046, + "step": 21577 + }, + { + "epoch": 6.6230816451810925, + "grad_norm": 0.2013266384601593, + "learning_rate": 2.7045308010219356e-05, + "loss": 1.7156, + "step": 21578 + }, + { + "epoch": 6.623388581952118, + "grad_norm": 0.22952915728092194, + "learning_rate": 2.7040892339017475e-05, + "loss": 1.7601, + "step": 21579 + }, + { + "epoch": 6.623695518723143, + "grad_norm": 0.18262411653995514, + "learning_rate": 2.7036476894710916e-05, + "loss": 1.7334, + "step": 21580 + }, + { + "epoch": 6.6240024554941686, + "grad_norm": 0.18907666206359863, + "learning_rate": 2.703206167734339e-05, + "loss": 1.7196, + "step": 21581 + }, + { + "epoch": 6.624309392265193, + "grad_norm": 0.2192571759223938, + "learning_rate": 2.7027646686958453e-05, + 
"loss": 1.7046, + "step": 21582 + }, + { + "epoch": 6.624616329036218, + "grad_norm": 0.165769562125206, + "learning_rate": 2.70232319235998e-05, + "loss": 1.7028, + "step": 21583 + }, + { + "epoch": 6.624923265807244, + "grad_norm": 0.19245828688144684, + "learning_rate": 2.701881738731103e-05, + "loss": 1.7153, + "step": 21584 + }, + { + "epoch": 6.625230202578269, + "grad_norm": 0.17638756334781647, + "learning_rate": 2.7014403078135776e-05, + "loss": 1.7071, + "step": 21585 + }, + { + "epoch": 6.625537139349294, + "grad_norm": 0.17205210030078888, + "learning_rate": 2.700998899611767e-05, + "loss": 1.6706, + "step": 21586 + }, + { + "epoch": 6.62584407612032, + "grad_norm": 0.24107681214809418, + "learning_rate": 2.700557514130032e-05, + "loss": 1.8013, + "step": 21587 + }, + { + "epoch": 6.626151012891344, + "grad_norm": 0.1839917004108429, + "learning_rate": 2.7001161513727358e-05, + "loss": 1.7381, + "step": 21588 + }, + { + "epoch": 6.6264579496623695, + "grad_norm": 0.24043352901935577, + "learning_rate": 2.6996748113442394e-05, + "loss": 1.7523, + "step": 21589 + }, + { + "epoch": 6.626764886433395, + "grad_norm": 0.23488068580627441, + "learning_rate": 2.6992334940489056e-05, + "loss": 1.7587, + "step": 21590 + }, + { + "epoch": 6.62707182320442, + "grad_norm": 0.18784530460834503, + "learning_rate": 2.698792199491094e-05, + "loss": 1.7053, + "step": 21591 + }, + { + "epoch": 6.627378759975445, + "grad_norm": 0.2758429944515228, + "learning_rate": 2.6983509276751673e-05, + "loss": 1.6927, + "step": 21592 + }, + { + "epoch": 6.62768569674647, + "grad_norm": 0.2731272280216217, + "learning_rate": 2.697909678605486e-05, + "loss": 1.7351, + "step": 21593 + }, + { + "epoch": 6.627992633517495, + "grad_norm": 0.24450576305389404, + "learning_rate": 2.6974684522864098e-05, + "loss": 1.7126, + "step": 21594 + }, + { + "epoch": 6.628299570288521, + "grad_norm": 0.21820391714572906, + "learning_rate": 2.6970272487222982e-05, + "loss": 1.7075, + "step": 21595 + }, 
+ { + "epoch": 6.628606507059546, + "grad_norm": 0.23647959530353546, + "learning_rate": 2.696586067917517e-05, + "loss": 1.7369, + "step": 21596 + }, + { + "epoch": 6.628913443830571, + "grad_norm": 0.2665121555328369, + "learning_rate": 2.696144909876419e-05, + "loss": 1.7575, + "step": 21597 + }, + { + "epoch": 6.629220380601596, + "grad_norm": 0.19871680438518524, + "learning_rate": 2.695703774603371e-05, + "loss": 1.7334, + "step": 21598 + }, + { + "epoch": 6.629527317372621, + "grad_norm": 0.2363109588623047, + "learning_rate": 2.6952626621027245e-05, + "loss": 1.6878, + "step": 21599 + }, + { + "epoch": 6.629834254143646, + "grad_norm": 0.21958591043949127, + "learning_rate": 2.694821572378845e-05, + "loss": 1.6828, + "step": 21600 + }, + { + "epoch": 6.630141190914672, + "grad_norm": 0.20437858998775482, + "learning_rate": 2.6943805054360906e-05, + "loss": 1.7138, + "step": 21601 + }, + { + "epoch": 6.630448127685697, + "grad_norm": 0.27741923928260803, + "learning_rate": 2.6939394612788193e-05, + "loss": 1.7506, + "step": 21602 + }, + { + "epoch": 6.6307550644567215, + "grad_norm": 0.1885133981704712, + "learning_rate": 2.6934984399113917e-05, + "loss": 1.7669, + "step": 21603 + }, + { + "epoch": 6.631062001227747, + "grad_norm": 0.19453810155391693, + "learning_rate": 2.6930574413381604e-05, + "loss": 1.6837, + "step": 21604 + }, + { + "epoch": 6.631368937998772, + "grad_norm": 0.1685735285282135, + "learning_rate": 2.6926164655634894e-05, + "loss": 1.7045, + "step": 21605 + }, + { + "epoch": 6.6316758747697975, + "grad_norm": 0.2507462203502655, + "learning_rate": 2.6921755125917347e-05, + "loss": 1.7754, + "step": 21606 + }, + { + "epoch": 6.631982811540823, + "grad_norm": 0.1725471317768097, + "learning_rate": 2.691734582427255e-05, + "loss": 1.7219, + "step": 21607 + }, + { + "epoch": 6.632289748311848, + "grad_norm": 0.2633528709411621, + "learning_rate": 2.6912936750744068e-05, + "loss": 1.7362, + "step": 21608 + }, + { + "epoch": 6.632596685082873, 
+ "grad_norm": 0.1808360069990158, + "learning_rate": 2.6908527905375474e-05, + "loss": 1.7338, + "step": 21609 + }, + { + "epoch": 6.632903621853898, + "grad_norm": 0.16186563670635223, + "learning_rate": 2.6904119288210344e-05, + "loss": 1.6752, + "step": 21610 + }, + { + "epoch": 6.633210558624923, + "grad_norm": 0.1954091340303421, + "learning_rate": 2.689971089929224e-05, + "loss": 1.714, + "step": 21611 + }, + { + "epoch": 6.633517495395949, + "grad_norm": 0.18954069912433624, + "learning_rate": 2.689530273866474e-05, + "loss": 1.7869, + "step": 21612 + }, + { + "epoch": 6.633824432166974, + "grad_norm": 0.182058185338974, + "learning_rate": 2.6890894806371392e-05, + "loss": 1.7708, + "step": 21613 + }, + { + "epoch": 6.634131368937998, + "grad_norm": 0.17313501238822937, + "learning_rate": 2.6886487102455755e-05, + "loss": 1.7064, + "step": 21614 + }, + { + "epoch": 6.634438305709024, + "grad_norm": 0.1732148379087448, + "learning_rate": 2.688207962696143e-05, + "loss": 1.7378, + "step": 21615 + }, + { + "epoch": 6.634745242480049, + "grad_norm": 0.17057274281978607, + "learning_rate": 2.687767237993191e-05, + "loss": 1.671, + "step": 21616 + }, + { + "epoch": 6.635052179251074, + "grad_norm": 0.17723220586776733, + "learning_rate": 2.6873265361410805e-05, + "loss": 1.7179, + "step": 21617 + }, + { + "epoch": 6.6353591160221, + "grad_norm": 0.18634437024593353, + "learning_rate": 2.6868858571441645e-05, + "loss": 1.7355, + "step": 21618 + }, + { + "epoch": 6.635666052793125, + "grad_norm": 0.205010786652565, + "learning_rate": 2.6864452010067985e-05, + "loss": 1.7399, + "step": 21619 + }, + { + "epoch": 6.6359729895641495, + "grad_norm": 0.2071879357099533, + "learning_rate": 2.6860045677333383e-05, + "loss": 1.7199, + "step": 21620 + }, + { + "epoch": 6.636279926335175, + "grad_norm": 0.17309685051441193, + "learning_rate": 2.685563957328134e-05, + "loss": 1.6595, + "step": 21621 + }, + { + "epoch": 6.6365868631062, + "grad_norm": 0.3505750000476837, + 
"learning_rate": 2.685123369795545e-05, + "loss": 1.7601, + "step": 21622 + }, + { + "epoch": 6.6368937998772255, + "grad_norm": 0.19184419512748718, + "learning_rate": 2.684682805139923e-05, + "loss": 1.7225, + "step": 21623 + }, + { + "epoch": 6.637200736648251, + "grad_norm": 0.20142409205436707, + "learning_rate": 2.6842422633656233e-05, + "loss": 1.7201, + "step": 21624 + }, + { + "epoch": 6.637507673419275, + "grad_norm": 0.18348537385463715, + "learning_rate": 2.6838017444769993e-05, + "loss": 1.6983, + "step": 21625 + }, + { + "epoch": 6.637814610190301, + "grad_norm": 0.19275228679180145, + "learning_rate": 2.6833612484784033e-05, + "loss": 1.7028, + "step": 21626 + }, + { + "epoch": 6.638121546961326, + "grad_norm": 0.21269574761390686, + "learning_rate": 2.682920775374189e-05, + "loss": 1.7888, + "step": 21627 + }, + { + "epoch": 6.638428483732351, + "grad_norm": 0.17470422387123108, + "learning_rate": 2.68248032516871e-05, + "loss": 1.7147, + "step": 21628 + }, + { + "epoch": 6.638735420503377, + "grad_norm": 0.15697288513183594, + "learning_rate": 2.6820398978663185e-05, + "loss": 1.6544, + "step": 21629 + }, + { + "epoch": 6.639042357274402, + "grad_norm": 0.18636487424373627, + "learning_rate": 2.6815994934713677e-05, + "loss": 1.721, + "step": 21630 + }, + { + "epoch": 6.639349294045426, + "grad_norm": 0.18091215193271637, + "learning_rate": 2.681159111988208e-05, + "loss": 1.6973, + "step": 21631 + }, + { + "epoch": 6.639656230816452, + "grad_norm": 0.21360217034816742, + "learning_rate": 2.6807187534211965e-05, + "loss": 1.7379, + "step": 21632 + }, + { + "epoch": 6.639963167587477, + "grad_norm": 0.20027592778205872, + "learning_rate": 2.6802784177746777e-05, + "loss": 1.7207, + "step": 21633 + }, + { + "epoch": 6.640270104358502, + "grad_norm": 0.21839644014835358, + "learning_rate": 2.679838105053011e-05, + "loss": 1.715, + "step": 21634 + }, + { + "epoch": 6.640577041129527, + "grad_norm": 0.19237302243709564, + "learning_rate": 
2.6793978152605404e-05, + "loss": 1.7415, + "step": 21635 + }, + { + "epoch": 6.640883977900552, + "grad_norm": 0.1979883313179016, + "learning_rate": 2.678957548401623e-05, + "loss": 1.7005, + "step": 21636 + }, + { + "epoch": 6.6411909146715775, + "grad_norm": 0.21867144107818604, + "learning_rate": 2.678517304480609e-05, + "loss": 1.8008, + "step": 21637 + }, + { + "epoch": 6.641497851442603, + "grad_norm": 0.17232954502105713, + "learning_rate": 2.6780770835018433e-05, + "loss": 1.6867, + "step": 21638 + }, + { + "epoch": 6.641804788213628, + "grad_norm": 0.21535196900367737, + "learning_rate": 2.6776368854696853e-05, + "loss": 1.7545, + "step": 21639 + }, + { + "epoch": 6.6421117249846535, + "grad_norm": 0.18891240656375885, + "learning_rate": 2.6771967103884766e-05, + "loss": 1.7164, + "step": 21640 + }, + { + "epoch": 6.642418661755678, + "grad_norm": 0.2558320462703705, + "learning_rate": 2.6767565582625743e-05, + "loss": 1.8125, + "step": 21641 + }, + { + "epoch": 6.642725598526703, + "grad_norm": 0.20400027930736542, + "learning_rate": 2.6763164290963244e-05, + "loss": 1.7335, + "step": 21642 + }, + { + "epoch": 6.643032535297729, + "grad_norm": 0.21388766169548035, + "learning_rate": 2.6758763228940775e-05, + "loss": 1.7788, + "step": 21643 + }, + { + "epoch": 6.643339472068754, + "grad_norm": 0.20607435703277588, + "learning_rate": 2.6754362396601834e-05, + "loss": 1.7481, + "step": 21644 + }, + { + "epoch": 6.643646408839779, + "grad_norm": 0.1608831286430359, + "learning_rate": 2.6749961793989907e-05, + "loss": 1.6577, + "step": 21645 + }, + { + "epoch": 6.643953345610804, + "grad_norm": 0.19074808061122894, + "learning_rate": 2.6745561421148485e-05, + "loss": 1.7335, + "step": 21646 + }, + { + "epoch": 6.644260282381829, + "grad_norm": 0.16517756879329681, + "learning_rate": 2.6741161278121053e-05, + "loss": 1.6663, + "step": 21647 + }, + { + "epoch": 6.644567219152854, + "grad_norm": 0.18976998329162598, + "learning_rate": 2.673676136495108e-05, + 
"loss": 1.7231, + "step": 21648 + }, + { + "epoch": 6.64487415592388, + "grad_norm": 0.20694875717163086, + "learning_rate": 2.6732361681682106e-05, + "loss": 1.7469, + "step": 21649 + }, + { + "epoch": 6.645181092694905, + "grad_norm": 0.1994311809539795, + "learning_rate": 2.6727962228357533e-05, + "loss": 1.6864, + "step": 21650 + }, + { + "epoch": 6.64548802946593, + "grad_norm": 0.18886511027812958, + "learning_rate": 2.672356300502091e-05, + "loss": 1.6874, + "step": 21651 + }, + { + "epoch": 6.645794966236955, + "grad_norm": 0.2152819186449051, + "learning_rate": 2.6719164011715653e-05, + "loss": 1.7327, + "step": 21652 + }, + { + "epoch": 6.64610190300798, + "grad_norm": 0.20525617897510529, + "learning_rate": 2.6714765248485275e-05, + "loss": 1.7409, + "step": 21653 + }, + { + "epoch": 6.6464088397790055, + "grad_norm": 0.21892790496349335, + "learning_rate": 2.6710366715373254e-05, + "loss": 1.7281, + "step": 21654 + }, + { + "epoch": 6.646715776550031, + "grad_norm": 0.20156462490558624, + "learning_rate": 2.6705968412423e-05, + "loss": 1.7211, + "step": 21655 + }, + { + "epoch": 6.647022713321056, + "grad_norm": 0.19993625581264496, + "learning_rate": 2.670157033967806e-05, + "loss": 1.8058, + "step": 21656 + }, + { + "epoch": 6.647329650092081, + "grad_norm": 0.1970909684896469, + "learning_rate": 2.669717249718182e-05, + "loss": 1.7707, + "step": 21657 + }, + { + "epoch": 6.647636586863106, + "grad_norm": 0.19287796318531036, + "learning_rate": 2.6692774884977796e-05, + "loss": 1.688, + "step": 21658 + }, + { + "epoch": 6.647943523634131, + "grad_norm": 0.17658226191997528, + "learning_rate": 2.668837750310943e-05, + "loss": 1.6936, + "step": 21659 + }, + { + "epoch": 6.648250460405157, + "grad_norm": 0.20234479010105133, + "learning_rate": 2.6683980351620184e-05, + "loss": 1.7069, + "step": 21660 + }, + { + "epoch": 6.648557397176182, + "grad_norm": 0.1957871913909912, + "learning_rate": 2.6679583430553513e-05, + "loss": 1.736, + "step": 21661 + }, + 
{ + "epoch": 6.648864333947207, + "grad_norm": 0.20084553956985474, + "learning_rate": 2.667518673995286e-05, + "loss": 1.7262, + "step": 21662 + }, + { + "epoch": 6.649171270718232, + "grad_norm": 0.18749211728572845, + "learning_rate": 2.667079027986169e-05, + "loss": 1.7127, + "step": 21663 + }, + { + "epoch": 6.649478207489257, + "grad_norm": 0.1747027188539505, + "learning_rate": 2.666639405032344e-05, + "loss": 1.6922, + "step": 21664 + }, + { + "epoch": 6.649785144260282, + "grad_norm": 0.3119397759437561, + "learning_rate": 2.666199805138154e-05, + "loss": 1.7373, + "step": 21665 + }, + { + "epoch": 6.650092081031308, + "grad_norm": 0.25986436009407043, + "learning_rate": 2.6657602283079498e-05, + "loss": 1.7521, + "step": 21666 + }, + { + "epoch": 6.650399017802332, + "grad_norm": 0.20535705983638763, + "learning_rate": 2.6653206745460663e-05, + "loss": 1.7144, + "step": 21667 + }, + { + "epoch": 6.650705954573358, + "grad_norm": 0.20804347097873688, + "learning_rate": 2.6648811438568566e-05, + "loss": 1.7186, + "step": 21668 + }, + { + "epoch": 6.651012891344383, + "grad_norm": 0.20753289759159088, + "learning_rate": 2.6644416362446566e-05, + "loss": 1.7098, + "step": 21669 + }, + { + "epoch": 6.651319828115408, + "grad_norm": 0.18725311756134033, + "learning_rate": 2.6640021517138148e-05, + "loss": 1.7331, + "step": 21670 + }, + { + "epoch": 6.651626764886434, + "grad_norm": 0.1907210648059845, + "learning_rate": 2.663562690268675e-05, + "loss": 1.6677, + "step": 21671 + }, + { + "epoch": 6.651933701657459, + "grad_norm": 0.19124922156333923, + "learning_rate": 2.6631232519135747e-05, + "loss": 1.7337, + "step": 21672 + }, + { + "epoch": 6.652240638428484, + "grad_norm": 0.21045447885990143, + "learning_rate": 2.6626838366528633e-05, + "loss": 1.7028, + "step": 21673 + }, + { + "epoch": 6.652547575199509, + "grad_norm": 0.1891855001449585, + "learning_rate": 2.6622444444908767e-05, + "loss": 1.7247, + "step": 21674 + }, + { + "epoch": 6.652854511970534, 
+ "grad_norm": 0.2236541211605072, + "learning_rate": 2.6618050754319623e-05, + "loss": 1.6986, + "step": 21675 + }, + { + "epoch": 6.653161448741559, + "grad_norm": 0.19088539481163025, + "learning_rate": 2.6613657294804604e-05, + "loss": 1.7118, + "step": 21676 + }, + { + "epoch": 6.653468385512585, + "grad_norm": 0.26210764050483704, + "learning_rate": 2.660926406640714e-05, + "loss": 1.7542, + "step": 21677 + }, + { + "epoch": 6.653775322283609, + "grad_norm": 0.2564029097557068, + "learning_rate": 2.6604871069170632e-05, + "loss": 1.7395, + "step": 21678 + }, + { + "epoch": 6.6540822590546345, + "grad_norm": 0.22974301874637604, + "learning_rate": 2.6600478303138503e-05, + "loss": 1.6905, + "step": 21679 + }, + { + "epoch": 6.65438919582566, + "grad_norm": 0.299772173166275, + "learning_rate": 2.659608576835416e-05, + "loss": 1.7875, + "step": 21680 + }, + { + "epoch": 6.654696132596685, + "grad_norm": 0.26459556818008423, + "learning_rate": 2.6591693464861018e-05, + "loss": 1.7185, + "step": 21681 + }, + { + "epoch": 6.6550030693677105, + "grad_norm": 0.24505311250686646, + "learning_rate": 2.6587301392702457e-05, + "loss": 1.7105, + "step": 21682 + }, + { + "epoch": 6.655310006138736, + "grad_norm": 0.1626308262348175, + "learning_rate": 2.6582909551921953e-05, + "loss": 1.6668, + "step": 21683 + }, + { + "epoch": 6.65561694290976, + "grad_norm": 0.20354291796684265, + "learning_rate": 2.6578517942562813e-05, + "loss": 1.7437, + "step": 21684 + }, + { + "epoch": 6.655923879680786, + "grad_norm": 0.18618443608283997, + "learning_rate": 2.6574126564668532e-05, + "loss": 1.6757, + "step": 21685 + }, + { + "epoch": 6.656230816451811, + "grad_norm": 0.1863735467195511, + "learning_rate": 2.656973541828242e-05, + "loss": 1.6549, + "step": 21686 + }, + { + "epoch": 6.656537753222836, + "grad_norm": 0.2118620127439499, + "learning_rate": 2.6565344503447935e-05, + "loss": 1.6927, + "step": 21687 + }, + { + "epoch": 6.656844689993862, + "grad_norm": 
0.24023136496543884, + "learning_rate": 2.6560953820208478e-05, + "loss": 1.6969, + "step": 21688 + }, + { + "epoch": 6.657151626764886, + "grad_norm": 0.21124204993247986, + "learning_rate": 2.6556563368607368e-05, + "loss": 1.6662, + "step": 21689 + }, + { + "epoch": 6.657458563535911, + "grad_norm": 0.16295355558395386, + "learning_rate": 2.6552173148688075e-05, + "loss": 1.7203, + "step": 21690 + }, + { + "epoch": 6.657765500306937, + "grad_norm": 0.18650858104228973, + "learning_rate": 2.6547783160493916e-05, + "loss": 1.7177, + "step": 21691 + }, + { + "epoch": 6.658072437077962, + "grad_norm": 0.20509213209152222, + "learning_rate": 2.6543393404068328e-05, + "loss": 1.723, + "step": 21692 + }, + { + "epoch": 6.658379373848987, + "grad_norm": 0.20985513925552368, + "learning_rate": 2.6539003879454678e-05, + "loss": 1.6679, + "step": 21693 + }, + { + "epoch": 6.658686310620013, + "grad_norm": 0.19907233119010925, + "learning_rate": 2.6534614586696338e-05, + "loss": 1.7028, + "step": 21694 + }, + { + "epoch": 6.658993247391037, + "grad_norm": 0.21793772280216217, + "learning_rate": 2.6530225525836692e-05, + "loss": 1.7706, + "step": 21695 + }, + { + "epoch": 6.6593001841620625, + "grad_norm": 0.24162191152572632, + "learning_rate": 2.6525836696919117e-05, + "loss": 1.806, + "step": 21696 + }, + { + "epoch": 6.659607120933088, + "grad_norm": 0.1735360324382782, + "learning_rate": 2.652144809998698e-05, + "loss": 1.7047, + "step": 21697 + }, + { + "epoch": 6.659914057704113, + "grad_norm": 0.18471799790859222, + "learning_rate": 2.651705973508365e-05, + "loss": 1.7306, + "step": 21698 + }, + { + "epoch": 6.6602209944751385, + "grad_norm": 0.17422814667224884, + "learning_rate": 2.6512671602252482e-05, + "loss": 1.6666, + "step": 21699 + }, + { + "epoch": 6.660527931246163, + "grad_norm": 0.19209833443164825, + "learning_rate": 2.6508283701536897e-05, + "loss": 1.6966, + "step": 21700 + }, + { + "epoch": 6.660834868017188, + "grad_norm": 0.1902640461921692, + 
"learning_rate": 2.650389603298019e-05, + "loss": 1.7887, + "step": 21701 + }, + { + "epoch": 6.661141804788214, + "grad_norm": 0.18551218509674072, + "learning_rate": 2.6499508596625787e-05, + "loss": 1.6851, + "step": 21702 + }, + { + "epoch": 6.661448741559239, + "grad_norm": 0.2165011614561081, + "learning_rate": 2.6495121392516976e-05, + "loss": 1.7465, + "step": 21703 + }, + { + "epoch": 6.661755678330264, + "grad_norm": 0.22871245443820953, + "learning_rate": 2.6490734420697172e-05, + "loss": 1.7487, + "step": 21704 + }, + { + "epoch": 6.66206261510129, + "grad_norm": 0.21275551617145538, + "learning_rate": 2.6486347681209723e-05, + "loss": 1.7782, + "step": 21705 + }, + { + "epoch": 6.662369551872314, + "grad_norm": 0.2926945984363556, + "learning_rate": 2.6481961174097937e-05, + "loss": 1.7413, + "step": 21706 + }, + { + "epoch": 6.662676488643339, + "grad_norm": 0.17143094539642334, + "learning_rate": 2.6477574899405233e-05, + "loss": 1.6639, + "step": 21707 + }, + { + "epoch": 6.662983425414365, + "grad_norm": 0.22194001078605652, + "learning_rate": 2.647318885717488e-05, + "loss": 1.7035, + "step": 21708 + }, + { + "epoch": 6.66329036218539, + "grad_norm": 0.18232671916484833, + "learning_rate": 2.6468803047450286e-05, + "loss": 1.6977, + "step": 21709 + }, + { + "epoch": 6.6635972989564145, + "grad_norm": 0.2626599371433258, + "learning_rate": 2.6464417470274773e-05, + "loss": 1.7422, + "step": 21710 + }, + { + "epoch": 6.66390423572744, + "grad_norm": 0.2034282237291336, + "learning_rate": 2.6460032125691668e-05, + "loss": 1.7531, + "step": 21711 + }, + { + "epoch": 6.664211172498465, + "grad_norm": 0.2308860868215561, + "learning_rate": 2.645564701374434e-05, + "loss": 1.7271, + "step": 21712 + }, + { + "epoch": 6.6645181092694905, + "grad_norm": 0.2163545936346054, + "learning_rate": 2.64512621344761e-05, + "loss": 1.7632, + "step": 21713 + }, + { + "epoch": 6.664825046040516, + "grad_norm": 0.2566233277320862, + "learning_rate": 
2.644687748793029e-05, + "loss": 1.7573, + "step": 21714 + }, + { + "epoch": 6.665131982811541, + "grad_norm": 0.21093623340129852, + "learning_rate": 2.6442493074150244e-05, + "loss": 1.6703, + "step": 21715 + }, + { + "epoch": 6.665438919582566, + "grad_norm": 0.2083086222410202, + "learning_rate": 2.643810889317927e-05, + "loss": 1.6672, + "step": 21716 + }, + { + "epoch": 6.665745856353591, + "grad_norm": 0.20711155235767365, + "learning_rate": 2.643372494506075e-05, + "loss": 1.7276, + "step": 21717 + }, + { + "epoch": 6.666052793124616, + "grad_norm": 0.18977457284927368, + "learning_rate": 2.6429341229837935e-05, + "loss": 1.7207, + "step": 21718 + }, + { + "epoch": 6.666359729895642, + "grad_norm": 0.28336507081985474, + "learning_rate": 2.6424957747554224e-05, + "loss": 1.7473, + "step": 21719 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.1761232167482376, + "learning_rate": 2.642057449825286e-05, + "loss": 1.7172, + "step": 21720 + }, + { + "epoch": 6.666973603437691, + "grad_norm": 0.21672405302524567, + "learning_rate": 2.6416191481977215e-05, + "loss": 1.6561, + "step": 21721 + }, + { + "epoch": 6.667280540208717, + "grad_norm": 0.226834237575531, + "learning_rate": 2.6411808698770613e-05, + "loss": 1.7315, + "step": 21722 + }, + { + "epoch": 6.667587476979742, + "grad_norm": 0.22553586959838867, + "learning_rate": 2.6407426148676307e-05, + "loss": 1.7301, + "step": 21723 + }, + { + "epoch": 6.667894413750767, + "grad_norm": 0.1913517564535141, + "learning_rate": 2.6403043831737672e-05, + "loss": 1.6739, + "step": 21724 + }, + { + "epoch": 6.668201350521793, + "grad_norm": 0.24560052156448364, + "learning_rate": 2.6398661747997955e-05, + "loss": 1.7347, + "step": 21725 + }, + { + "epoch": 6.668508287292818, + "grad_norm": 0.27361172437667847, + "learning_rate": 2.6394279897500517e-05, + "loss": 1.7713, + "step": 21726 + }, + { + "epoch": 6.6688152240638425, + "grad_norm": 0.21486583352088928, + "learning_rate": 2.6389898280288638e-05, + 
"loss": 1.7504, + "step": 21727 + }, + { + "epoch": 6.669122160834868, + "grad_norm": 0.19056405127048492, + "learning_rate": 2.6385516896405627e-05, + "loss": 1.7457, + "step": 21728 + }, + { + "epoch": 6.669429097605893, + "grad_norm": 0.19316376745700836, + "learning_rate": 2.638113574589478e-05, + "loss": 1.6969, + "step": 21729 + }, + { + "epoch": 6.6697360343769185, + "grad_norm": 0.21700869500637054, + "learning_rate": 2.637675482879939e-05, + "loss": 1.7055, + "step": 21730 + }, + { + "epoch": 6.670042971147944, + "grad_norm": 0.19720883667469025, + "learning_rate": 2.637237414516275e-05, + "loss": 1.7029, + "step": 21731 + }, + { + "epoch": 6.670349907918968, + "grad_norm": 0.16528408229351044, + "learning_rate": 2.6367993695028158e-05, + "loss": 1.6915, + "step": 21732 + }, + { + "epoch": 6.670656844689994, + "grad_norm": 0.19576294720172882, + "learning_rate": 2.636361347843889e-05, + "loss": 1.7034, + "step": 21733 + }, + { + "epoch": 6.670963781461019, + "grad_norm": 0.16859273612499237, + "learning_rate": 2.6359233495438285e-05, + "loss": 1.7114, + "step": 21734 + }, + { + "epoch": 6.671270718232044, + "grad_norm": 0.20480163395404816, + "learning_rate": 2.6354853746069553e-05, + "loss": 1.7304, + "step": 21735 + }, + { + "epoch": 6.67157765500307, + "grad_norm": 0.19104263186454773, + "learning_rate": 2.6350474230376048e-05, + "loss": 1.7026, + "step": 21736 + }, + { + "epoch": 6.671884591774095, + "grad_norm": 0.18243174254894257, + "learning_rate": 2.634609494840098e-05, + "loss": 1.6769, + "step": 21737 + }, + { + "epoch": 6.672191528545119, + "grad_norm": 0.20766063034534454, + "learning_rate": 2.634171590018769e-05, + "loss": 1.7436, + "step": 21738 + }, + { + "epoch": 6.672498465316145, + "grad_norm": 0.22035297751426697, + "learning_rate": 2.6337337085779444e-05, + "loss": 1.8211, + "step": 21739 + }, + { + "epoch": 6.67280540208717, + "grad_norm": 0.18965984880924225, + "learning_rate": 2.6332958505219475e-05, + "loss": 1.7067, + "step": 
21740 + }, + { + "epoch": 6.673112338858195, + "grad_norm": 0.21209993958473206, + "learning_rate": 2.632858015855111e-05, + "loss": 1.7743, + "step": 21741 + }, + { + "epoch": 6.67341927562922, + "grad_norm": 0.18409015238285065, + "learning_rate": 2.6324202045817547e-05, + "loss": 1.7494, + "step": 21742 + }, + { + "epoch": 6.673726212400245, + "grad_norm": 0.23252969980239868, + "learning_rate": 2.6319824167062125e-05, + "loss": 1.7459, + "step": 21743 + }, + { + "epoch": 6.6740331491712706, + "grad_norm": 0.16296416521072388, + "learning_rate": 2.631544652232808e-05, + "loss": 1.648, + "step": 21744 + }, + { + "epoch": 6.674340085942296, + "grad_norm": 0.2458602488040924, + "learning_rate": 2.631106911165867e-05, + "loss": 1.6847, + "step": 21745 + }, + { + "epoch": 6.674647022713321, + "grad_norm": 0.21203550696372986, + "learning_rate": 2.6306691935097162e-05, + "loss": 1.713, + "step": 21746 + }, + { + "epoch": 6.6749539594843466, + "grad_norm": 0.19969885051250458, + "learning_rate": 2.6302314992686804e-05, + "loss": 1.7445, + "step": 21747 + }, + { + "epoch": 6.675260896255372, + "grad_norm": 0.21001017093658447, + "learning_rate": 2.629793828447087e-05, + "loss": 1.703, + "step": 21748 + }, + { + "epoch": 6.675567833026396, + "grad_norm": 0.18607214093208313, + "learning_rate": 2.6293561810492595e-05, + "loss": 1.6765, + "step": 21749 + }, + { + "epoch": 6.675874769797422, + "grad_norm": 0.21806176006793976, + "learning_rate": 2.6289185570795223e-05, + "loss": 1.7099, + "step": 21750 + }, + { + "epoch": 6.676181706568447, + "grad_norm": 0.1861930787563324, + "learning_rate": 2.6284809565422052e-05, + "loss": 1.6978, + "step": 21751 + }, + { + "epoch": 6.676488643339472, + "grad_norm": 0.18779867887496948, + "learning_rate": 2.6280433794416254e-05, + "loss": 1.7132, + "step": 21752 + }, + { + "epoch": 6.676795580110497, + "grad_norm": 0.18255293369293213, + "learning_rate": 2.627605825782115e-05, + "loss": 1.7045, + "step": 21753 + }, + { + "epoch": 
6.677102516881522, + "grad_norm": 0.22258871793746948, + "learning_rate": 2.6271682955679904e-05, + "loss": 1.7159, + "step": 21754 + }, + { + "epoch": 6.6774094536525475, + "grad_norm": 0.17425768077373505, + "learning_rate": 2.626730788803582e-05, + "loss": 1.6571, + "step": 21755 + }, + { + "epoch": 6.677716390423573, + "grad_norm": 0.1921091377735138, + "learning_rate": 2.6262933054932122e-05, + "loss": 1.8178, + "step": 21756 + }, + { + "epoch": 6.678023327194598, + "grad_norm": 0.16262951493263245, + "learning_rate": 2.6258558456411996e-05, + "loss": 1.6586, + "step": 21757 + }, + { + "epoch": 6.6783302639656235, + "grad_norm": 0.1853780597448349, + "learning_rate": 2.6254184092518752e-05, + "loss": 1.7116, + "step": 21758 + }, + { + "epoch": 6.678637200736648, + "grad_norm": 0.17973974347114563, + "learning_rate": 2.6249809963295536e-05, + "loss": 1.7317, + "step": 21759 + }, + { + "epoch": 6.678944137507673, + "grad_norm": 0.21258050203323364, + "learning_rate": 2.6245436068785634e-05, + "loss": 1.7852, + "step": 21760 + }, + { + "epoch": 6.679251074278699, + "grad_norm": 0.18741287291049957, + "learning_rate": 2.6241062409032262e-05, + "loss": 1.7071, + "step": 21761 + }, + { + "epoch": 6.679558011049724, + "grad_norm": 0.20436155796051025, + "learning_rate": 2.623668898407864e-05, + "loss": 1.7683, + "step": 21762 + }, + { + "epoch": 6.679864947820749, + "grad_norm": 0.18840116262435913, + "learning_rate": 2.6232315793967977e-05, + "loss": 1.7335, + "step": 21763 + }, + { + "epoch": 6.680171884591774, + "grad_norm": 0.1968357264995575, + "learning_rate": 2.62279428387435e-05, + "loss": 1.6848, + "step": 21764 + }, + { + "epoch": 6.680478821362799, + "grad_norm": 0.1774388998746872, + "learning_rate": 2.622357011844844e-05, + "loss": 1.6943, + "step": 21765 + }, + { + "epoch": 6.680785758133824, + "grad_norm": 0.2424328327178955, + "learning_rate": 2.621919763312598e-05, + "loss": 1.7479, + "step": 21766 + }, + { + "epoch": 6.68109269490485, + "grad_norm": 
0.21220771968364716, + "learning_rate": 2.6214825382819353e-05, + "loss": 1.7384, + "step": 21767 + }, + { + "epoch": 6.681399631675875, + "grad_norm": 0.23322279751300812, + "learning_rate": 2.6210453367571764e-05, + "loss": 1.6625, + "step": 21768 + }, + { + "epoch": 6.6817065684469, + "grad_norm": 0.1726260483264923, + "learning_rate": 2.620608158742639e-05, + "loss": 1.7055, + "step": 21769 + }, + { + "epoch": 6.682013505217925, + "grad_norm": 0.25436410307884216, + "learning_rate": 2.6201710042426512e-05, + "loss": 1.7449, + "step": 21770 + }, + { + "epoch": 6.68232044198895, + "grad_norm": 0.20275171101093292, + "learning_rate": 2.619733873261524e-05, + "loss": 1.7575, + "step": 21771 + }, + { + "epoch": 6.6826273787599755, + "grad_norm": 0.24221903085708618, + "learning_rate": 2.6192967658035846e-05, + "loss": 1.7312, + "step": 21772 + }, + { + "epoch": 6.682934315531001, + "grad_norm": 0.30804362893104553, + "learning_rate": 2.6188596818731507e-05, + "loss": 1.7669, + "step": 21773 + }, + { + "epoch": 6.683241252302026, + "grad_norm": 0.1818273365497589, + "learning_rate": 2.6184226214745377e-05, + "loss": 1.7102, + "step": 21774 + }, + { + "epoch": 6.683548189073051, + "grad_norm": 0.28026455640792847, + "learning_rate": 2.6179855846120727e-05, + "loss": 1.7313, + "step": 21775 + }, + { + "epoch": 6.683855125844076, + "grad_norm": 0.26503586769104004, + "learning_rate": 2.6175485712900655e-05, + "loss": 1.7622, + "step": 21776 + }, + { + "epoch": 6.684162062615101, + "grad_norm": 0.19122248888015747, + "learning_rate": 2.6171115815128423e-05, + "loss": 1.7347, + "step": 21777 + }, + { + "epoch": 6.684468999386127, + "grad_norm": 0.18789063394069672, + "learning_rate": 2.6166746152847187e-05, + "loss": 1.7158, + "step": 21778 + }, + { + "epoch": 6.684775936157152, + "grad_norm": 0.17315362393856049, + "learning_rate": 2.6162376726100135e-05, + "loss": 1.6561, + "step": 21779 + }, + { + "epoch": 6.685082872928177, + "grad_norm": 0.20659680664539337, + 
"learning_rate": 2.615800753493045e-05, + "loss": 1.7063, + "step": 21780 + }, + { + "epoch": 6.685389809699202, + "grad_norm": 0.2051183432340622, + "learning_rate": 2.6153638579381307e-05, + "loss": 1.7213, + "step": 21781 + }, + { + "epoch": 6.685696746470227, + "grad_norm": 0.23349207639694214, + "learning_rate": 2.6149269859495884e-05, + "loss": 1.7453, + "step": 21782 + }, + { + "epoch": 6.686003683241252, + "grad_norm": 0.1979275941848755, + "learning_rate": 2.6144901375317355e-05, + "loss": 1.7482, + "step": 21783 + }, + { + "epoch": 6.686310620012278, + "grad_norm": 0.2742067873477936, + "learning_rate": 2.61405331268889e-05, + "loss": 1.7114, + "step": 21784 + }, + { + "epoch": 6.686617556783302, + "grad_norm": 0.18656300008296967, + "learning_rate": 2.6136165114253675e-05, + "loss": 1.7114, + "step": 21785 + }, + { + "epoch": 6.6869244935543275, + "grad_norm": 0.19345268607139587, + "learning_rate": 2.6131797337454834e-05, + "loss": 1.6818, + "step": 21786 + }, + { + "epoch": 6.687231430325353, + "grad_norm": 0.2194962054491043, + "learning_rate": 2.6127429796535597e-05, + "loss": 1.7519, + "step": 21787 + }, + { + "epoch": 6.687538367096378, + "grad_norm": 0.21714645624160767, + "learning_rate": 2.6123062491539054e-05, + "loss": 1.7334, + "step": 21788 + }, + { + "epoch": 6.6878453038674035, + "grad_norm": 0.1684521585702896, + "learning_rate": 2.6118695422508444e-05, + "loss": 1.6843, + "step": 21789 + }, + { + "epoch": 6.688152240638429, + "grad_norm": 0.16155442595481873, + "learning_rate": 2.6114328589486865e-05, + "loss": 1.6541, + "step": 21790 + }, + { + "epoch": 6.688459177409453, + "grad_norm": 0.18483634293079376, + "learning_rate": 2.6109961992517462e-05, + "loss": 1.688, + "step": 21791 + }, + { + "epoch": 6.688766114180479, + "grad_norm": 0.23146624863147736, + "learning_rate": 2.6105595631643466e-05, + "loss": 1.8006, + "step": 21792 + }, + { + "epoch": 6.689073050951504, + "grad_norm": 0.1852748543024063, + "learning_rate": 
2.6101229506907937e-05, + "loss": 1.6624, + "step": 21793 + }, + { + "epoch": 6.689379987722529, + "grad_norm": 0.23809482157230377, + "learning_rate": 2.6096863618354105e-05, + "loss": 1.7313, + "step": 21794 + }, + { + "epoch": 6.689686924493555, + "grad_norm": 0.17145361006259918, + "learning_rate": 2.609249796602503e-05, + "loss": 1.6966, + "step": 21795 + }, + { + "epoch": 6.689993861264579, + "grad_norm": 0.1842796355485916, + "learning_rate": 2.6088132549963933e-05, + "loss": 1.6871, + "step": 21796 + }, + { + "epoch": 6.690300798035604, + "grad_norm": 0.1810201108455658, + "learning_rate": 2.608376737021392e-05, + "loss": 1.7509, + "step": 21797 + }, + { + "epoch": 6.69060773480663, + "grad_norm": 0.20428195595741272, + "learning_rate": 2.607940242681814e-05, + "loss": 1.7102, + "step": 21798 + }, + { + "epoch": 6.690914671577655, + "grad_norm": 0.1659073680639267, + "learning_rate": 2.6075037719819716e-05, + "loss": 1.7053, + "step": 21799 + }, + { + "epoch": 6.69122160834868, + "grad_norm": 0.19351087510585785, + "learning_rate": 2.60706732492618e-05, + "loss": 1.6847, + "step": 21800 + }, + { + "epoch": 6.691528545119706, + "grad_norm": 0.1734616905450821, + "learning_rate": 2.6066309015187517e-05, + "loss": 1.6989, + "step": 21801 + }, + { + "epoch": 6.69183548189073, + "grad_norm": 0.1863887459039688, + "learning_rate": 2.6061945017639995e-05, + "loss": 1.665, + "step": 21802 + }, + { + "epoch": 6.6921424186617555, + "grad_norm": 0.20225204527378082, + "learning_rate": 2.6057581256662344e-05, + "loss": 1.718, + "step": 21803 + }, + { + "epoch": 6.692449355432781, + "grad_norm": 0.22148309648036957, + "learning_rate": 2.605321773229774e-05, + "loss": 1.7801, + "step": 21804 + }, + { + "epoch": 6.692756292203806, + "grad_norm": 0.1870507448911667, + "learning_rate": 2.6048854444589242e-05, + "loss": 1.6613, + "step": 21805 + }, + { + "epoch": 6.6930632289748315, + "grad_norm": 0.18597224354743958, + "learning_rate": 2.604449139358004e-05, + "loss": 
1.7284, + "step": 21806 + }, + { + "epoch": 6.693370165745856, + "grad_norm": 0.2082163542509079, + "learning_rate": 2.6040128579313193e-05, + "loss": 1.7456, + "step": 21807 + }, + { + "epoch": 6.693677102516881, + "grad_norm": 0.22506757080554962, + "learning_rate": 2.603576600183183e-05, + "loss": 1.7369, + "step": 21808 + }, + { + "epoch": 6.693984039287907, + "grad_norm": 0.20707464218139648, + "learning_rate": 2.60314036611791e-05, + "loss": 1.7176, + "step": 21809 + }, + { + "epoch": 6.694290976058932, + "grad_norm": 0.2306852787733078, + "learning_rate": 2.6027041557398053e-05, + "loss": 1.7582, + "step": 21810 + }, + { + "epoch": 6.694597912829957, + "grad_norm": 0.23120234906673431, + "learning_rate": 2.602267969053187e-05, + "loss": 1.7169, + "step": 21811 + }, + { + "epoch": 6.694904849600983, + "grad_norm": 0.24841509759426117, + "learning_rate": 2.6018318060623582e-05, + "loss": 1.7636, + "step": 21812 + }, + { + "epoch": 6.695211786372007, + "grad_norm": 0.22443681955337524, + "learning_rate": 2.601395666771635e-05, + "loss": 1.7465, + "step": 21813 + }, + { + "epoch": 6.695518723143032, + "grad_norm": 0.2905699908733368, + "learning_rate": 2.6009595511853257e-05, + "loss": 1.779, + "step": 21814 + }, + { + "epoch": 6.695825659914058, + "grad_norm": 0.18677717447280884, + "learning_rate": 2.60052345930774e-05, + "loss": 1.711, + "step": 21815 + }, + { + "epoch": 6.696132596685083, + "grad_norm": 0.2150946855545044, + "learning_rate": 2.6000873911431883e-05, + "loss": 1.7254, + "step": 21816 + }, + { + "epoch": 6.696439533456108, + "grad_norm": 0.20066408812999725, + "learning_rate": 2.5996513466959794e-05, + "loss": 1.7198, + "step": 21817 + }, + { + "epoch": 6.696746470227133, + "grad_norm": 0.23815886676311493, + "learning_rate": 2.5992153259704228e-05, + "loss": 1.749, + "step": 21818 + }, + { + "epoch": 6.697053406998158, + "grad_norm": 0.2067428082227707, + "learning_rate": 2.5987793289708273e-05, + "loss": 1.736, + "step": 21819 + }, + { + 
"epoch": 6.6973603437691835, + "grad_norm": 0.2126816362142563, + "learning_rate": 2.5983433557015e-05, + "loss": 1.6804, + "step": 21820 + }, + { + "epoch": 6.697667280540209, + "grad_norm": 0.2003033310174942, + "learning_rate": 2.597907406166756e-05, + "loss": 1.7303, + "step": 21821 + }, + { + "epoch": 6.697974217311234, + "grad_norm": 0.238821879029274, + "learning_rate": 2.5974714803708946e-05, + "loss": 1.7399, + "step": 21822 + }, + { + "epoch": 6.6982811540822595, + "grad_norm": 0.21327996253967285, + "learning_rate": 2.597035578318231e-05, + "loss": 1.766, + "step": 21823 + }, + { + "epoch": 6.698588090853284, + "grad_norm": 0.19689476490020752, + "learning_rate": 2.5965997000130694e-05, + "loss": 1.7621, + "step": 21824 + }, + { + "epoch": 6.698895027624309, + "grad_norm": 0.18349261581897736, + "learning_rate": 2.5961638454597158e-05, + "loss": 1.6339, + "step": 21825 + }, + { + "epoch": 6.699201964395335, + "grad_norm": 0.21475930511951447, + "learning_rate": 2.595728014662484e-05, + "loss": 1.6973, + "step": 21826 + }, + { + "epoch": 6.69950890116636, + "grad_norm": 0.2711705267429352, + "learning_rate": 2.5952922076256737e-05, + "loss": 1.7801, + "step": 21827 + }, + { + "epoch": 6.699815837937384, + "grad_norm": 0.2601792514324188, + "learning_rate": 2.5948564243535988e-05, + "loss": 1.7508, + "step": 21828 + }, + { + "epoch": 6.70012277470841, + "grad_norm": 0.206949844956398, + "learning_rate": 2.5944206648505586e-05, + "loss": 1.7853, + "step": 21829 + }, + { + "epoch": 6.700429711479435, + "grad_norm": 0.25003641843795776, + "learning_rate": 2.5939849291208653e-05, + "loss": 1.766, + "step": 21830 + }, + { + "epoch": 6.7007366482504604, + "grad_norm": 0.25864318013191223, + "learning_rate": 2.593549217168823e-05, + "loss": 1.7778, + "step": 21831 + }, + { + "epoch": 6.701043585021486, + "grad_norm": 0.20212729275226593, + "learning_rate": 2.593113528998738e-05, + "loss": 1.7249, + "step": 21832 + }, + { + "epoch": 6.701350521792511, + 
"grad_norm": 0.2518431842327118, + "learning_rate": 2.5926778646149154e-05, + "loss": 1.7466, + "step": 21833 + }, + { + "epoch": 6.701657458563536, + "grad_norm": 0.24284590780735016, + "learning_rate": 2.5922422240216614e-05, + "loss": 1.8309, + "step": 21834 + }, + { + "epoch": 6.701964395334561, + "grad_norm": 0.21829955279827118, + "learning_rate": 2.5918066072232817e-05, + "loss": 1.7458, + "step": 21835 + }, + { + "epoch": 6.702271332105586, + "grad_norm": 0.2842165231704712, + "learning_rate": 2.5913710142240792e-05, + "loss": 1.7379, + "step": 21836 + }, + { + "epoch": 6.702578268876612, + "grad_norm": 0.19648514688014984, + "learning_rate": 2.590935445028359e-05, + "loss": 1.7141, + "step": 21837 + }, + { + "epoch": 6.702885205647637, + "grad_norm": 0.24336646497249603, + "learning_rate": 2.5904998996404305e-05, + "loss": 1.6719, + "step": 21838 + }, + { + "epoch": 6.703192142418661, + "grad_norm": 0.17288628220558167, + "learning_rate": 2.5900643780645905e-05, + "loss": 1.6982, + "step": 21839 + }, + { + "epoch": 6.703499079189687, + "grad_norm": 0.24906334280967712, + "learning_rate": 2.5896288803051505e-05, + "loss": 1.6873, + "step": 21840 + }, + { + "epoch": 6.703806015960712, + "grad_norm": 0.2177029550075531, + "learning_rate": 2.5891934063664085e-05, + "loss": 1.6884, + "step": 21841 + }, + { + "epoch": 6.704112952731737, + "grad_norm": 0.20478956401348114, + "learning_rate": 2.5887579562526688e-05, + "loss": 1.7342, + "step": 21842 + }, + { + "epoch": 6.704419889502763, + "grad_norm": 0.26212164759635925, + "learning_rate": 2.58832252996824e-05, + "loss": 1.7304, + "step": 21843 + }, + { + "epoch": 6.704726826273788, + "grad_norm": 0.2049340009689331, + "learning_rate": 2.587887127517418e-05, + "loss": 1.7472, + "step": 21844 + }, + { + "epoch": 6.7050337630448125, + "grad_norm": 0.2453075796365738, + "learning_rate": 2.587451748904512e-05, + "loss": 1.7443, + "step": 21845 + }, + { + "epoch": 6.705340699815838, + "grad_norm": 
0.19545187056064606, + "learning_rate": 2.5870163941338188e-05, + "loss": 1.7328, + "step": 21846 + }, + { + "epoch": 6.705647636586863, + "grad_norm": 0.24424482882022858, + "learning_rate": 2.5865810632096456e-05, + "loss": 1.6876, + "step": 21847 + }, + { + "epoch": 6.7059545733578885, + "grad_norm": 0.2150830626487732, + "learning_rate": 2.5861457561362922e-05, + "loss": 1.7272, + "step": 21848 + }, + { + "epoch": 6.706261510128914, + "grad_norm": 0.2632520794868469, + "learning_rate": 2.5857104729180626e-05, + "loss": 1.7542, + "step": 21849 + }, + { + "epoch": 6.706568446899938, + "grad_norm": 0.21789421141147614, + "learning_rate": 2.5852752135592563e-05, + "loss": 1.6856, + "step": 21850 + }, + { + "epoch": 6.706875383670964, + "grad_norm": 0.2227005511522293, + "learning_rate": 2.5848399780641758e-05, + "loss": 1.7473, + "step": 21851 + }, + { + "epoch": 6.707182320441989, + "grad_norm": 0.23424866795539856, + "learning_rate": 2.5844047664371218e-05, + "loss": 1.7016, + "step": 21852 + }, + { + "epoch": 6.707489257213014, + "grad_norm": 0.2125028669834137, + "learning_rate": 2.5839695786823964e-05, + "loss": 1.8296, + "step": 21853 + }, + { + "epoch": 6.70779619398404, + "grad_norm": 0.2533423900604248, + "learning_rate": 2.5835344148042972e-05, + "loss": 1.7237, + "step": 21854 + }, + { + "epoch": 6.708103130755065, + "grad_norm": 0.1951744705438614, + "learning_rate": 2.583099274807132e-05, + "loss": 1.6685, + "step": 21855 + }, + { + "epoch": 6.708410067526089, + "grad_norm": 0.2564519941806793, + "learning_rate": 2.5826641586951938e-05, + "loss": 1.7542, + "step": 21856 + }, + { + "epoch": 6.708717004297115, + "grad_norm": 0.2586502134799957, + "learning_rate": 2.5822290664727856e-05, + "loss": 1.7477, + "step": 21857 + }, + { + "epoch": 6.70902394106814, + "grad_norm": 0.30357107520103455, + "learning_rate": 2.5817939981442062e-05, + "loss": 1.7454, + "step": 21858 + }, + { + "epoch": 6.709330877839165, + "grad_norm": 0.20547500252723694, + 
"learning_rate": 2.5813589537137544e-05, + "loss": 1.7517, + "step": 21859 + }, + { + "epoch": 6.70963781461019, + "grad_norm": 0.2961783707141876, + "learning_rate": 2.5809239331857348e-05, + "loss": 1.698, + "step": 21860 + }, + { + "epoch": 6.709944751381215, + "grad_norm": 0.2062019556760788, + "learning_rate": 2.580488936564439e-05, + "loss": 1.7358, + "step": 21861 + }, + { + "epoch": 6.7102516881522405, + "grad_norm": 0.22287480533123016, + "learning_rate": 2.580053963854173e-05, + "loss": 1.7099, + "step": 21862 + }, + { + "epoch": 6.710558624923266, + "grad_norm": 0.1853112131357193, + "learning_rate": 2.579619015059229e-05, + "loss": 1.7493, + "step": 21863 + }, + { + "epoch": 6.710865561694291, + "grad_norm": 0.24855247139930725, + "learning_rate": 2.5791840901839105e-05, + "loss": 1.7248, + "step": 21864 + }, + { + "epoch": 6.7111724984653165, + "grad_norm": 0.18156948685646057, + "learning_rate": 2.5787491892325126e-05, + "loss": 1.6744, + "step": 21865 + }, + { + "epoch": 6.711479435236341, + "grad_norm": 0.3272082209587097, + "learning_rate": 2.5783143122093357e-05, + "loss": 1.7546, + "step": 21866 + }, + { + "epoch": 6.711786372007366, + "grad_norm": 0.2875421643257141, + "learning_rate": 2.577879459118675e-05, + "loss": 1.6477, + "step": 21867 + }, + { + "epoch": 6.712093308778392, + "grad_norm": 0.19682031869888306, + "learning_rate": 2.5774446299648297e-05, + "loss": 1.7455, + "step": 21868 + }, + { + "epoch": 6.712400245549417, + "grad_norm": 0.32829195261001587, + "learning_rate": 2.5770098247520968e-05, + "loss": 1.7817, + "step": 21869 + }, + { + "epoch": 6.712707182320442, + "grad_norm": 0.26227760314941406, + "learning_rate": 2.5765750434847724e-05, + "loss": 1.763, + "step": 21870 + }, + { + "epoch": 6.713014119091467, + "grad_norm": 0.2902637720108032, + "learning_rate": 2.576140286167152e-05, + "loss": 1.7432, + "step": 21871 + }, + { + "epoch": 6.713321055862492, + "grad_norm": 0.2290763407945633, + "learning_rate": 
2.5757055528035377e-05, + "loss": 1.7149, + "step": 21872 + }, + { + "epoch": 6.713627992633517, + "grad_norm": 0.3445907533168793, + "learning_rate": 2.575270843398221e-05, + "loss": 1.7874, + "step": 21873 + }, + { + "epoch": 6.713934929404543, + "grad_norm": 0.1841191053390503, + "learning_rate": 2.574836157955498e-05, + "loss": 1.6954, + "step": 21874 + }, + { + "epoch": 6.714241866175568, + "grad_norm": 0.24168385565280914, + "learning_rate": 2.5744014964796657e-05, + "loss": 1.7153, + "step": 21875 + }, + { + "epoch": 6.714548802946593, + "grad_norm": 0.17855188250541687, + "learning_rate": 2.5739668589750175e-05, + "loss": 1.7329, + "step": 21876 + }, + { + "epoch": 6.714855739717618, + "grad_norm": 0.189789280295372, + "learning_rate": 2.5735322454458554e-05, + "loss": 1.6854, + "step": 21877 + }, + { + "epoch": 6.715162676488643, + "grad_norm": 0.1792519986629486, + "learning_rate": 2.5730976558964647e-05, + "loss": 1.7483, + "step": 21878 + }, + { + "epoch": 6.7154696132596685, + "grad_norm": 0.24460360407829285, + "learning_rate": 2.5726630903311504e-05, + "loss": 1.8337, + "step": 21879 + }, + { + "epoch": 6.715776550030694, + "grad_norm": 0.21612058579921722, + "learning_rate": 2.572228548754198e-05, + "loss": 1.7293, + "step": 21880 + }, + { + "epoch": 6.716083486801719, + "grad_norm": 0.22057892382144928, + "learning_rate": 2.5717940311699078e-05, + "loss": 1.7269, + "step": 21881 + }, + { + "epoch": 6.716390423572744, + "grad_norm": 0.19635777175426483, + "learning_rate": 2.571359537582572e-05, + "loss": 1.6744, + "step": 21882 + }, + { + "epoch": 6.716697360343769, + "grad_norm": 0.20406895875930786, + "learning_rate": 2.570925067996485e-05, + "loss": 1.6866, + "step": 21883 + }, + { + "epoch": 6.717004297114794, + "grad_norm": 0.1942419856786728, + "learning_rate": 2.5704906224159407e-05, + "loss": 1.724, + "step": 21884 + }, + { + "epoch": 6.71731123388582, + "grad_norm": 0.20423445105552673, + "learning_rate": 2.570056200845231e-05, + "loss": 
1.6709, + "step": 21885 + }, + { + "epoch": 6.717618170656845, + "grad_norm": 0.27171632647514343, + "learning_rate": 2.569621803288651e-05, + "loss": 1.7532, + "step": 21886 + }, + { + "epoch": 6.71792510742787, + "grad_norm": 0.22753871977329254, + "learning_rate": 2.5691874297504926e-05, + "loss": 1.7534, + "step": 21887 + }, + { + "epoch": 6.718232044198895, + "grad_norm": 0.1907290369272232, + "learning_rate": 2.5687530802350468e-05, + "loss": 1.6696, + "step": 21888 + }, + { + "epoch": 6.71853898096992, + "grad_norm": 0.2226637750864029, + "learning_rate": 2.568318754746612e-05, + "loss": 1.7194, + "step": 21889 + }, + { + "epoch": 6.718845917740945, + "grad_norm": 0.20878726243972778, + "learning_rate": 2.5678844532894742e-05, + "loss": 1.6878, + "step": 21890 + }, + { + "epoch": 6.719152854511971, + "grad_norm": 0.18087267875671387, + "learning_rate": 2.567450175867928e-05, + "loss": 1.7432, + "step": 21891 + }, + { + "epoch": 6.719459791282996, + "grad_norm": 0.19818328320980072, + "learning_rate": 2.567015922486265e-05, + "loss": 1.6959, + "step": 21892 + }, + { + "epoch": 6.7197667280540205, + "grad_norm": 0.19593466818332672, + "learning_rate": 2.566581693148775e-05, + "loss": 1.7357, + "step": 21893 + }, + { + "epoch": 6.720073664825046, + "grad_norm": 0.24518795311450958, + "learning_rate": 2.5661474878597546e-05, + "loss": 1.7948, + "step": 21894 + }, + { + "epoch": 6.720380601596071, + "grad_norm": 0.18471074104309082, + "learning_rate": 2.5657133066234872e-05, + "loss": 1.6983, + "step": 21895 + }, + { + "epoch": 6.7206875383670965, + "grad_norm": 0.20073382556438446, + "learning_rate": 2.5652791494442718e-05, + "loss": 1.7241, + "step": 21896 + }, + { + "epoch": 6.720994475138122, + "grad_norm": 0.21688152849674225, + "learning_rate": 2.5648450163263903e-05, + "loss": 1.7073, + "step": 21897 + }, + { + "epoch": 6.721301411909147, + "grad_norm": 0.17722688615322113, + "learning_rate": 2.5644109072741406e-05, + "loss": 1.7047, + "step": 21898 + }, + 
{ + "epoch": 6.721608348680172, + "grad_norm": 0.2060708999633789, + "learning_rate": 2.5639768222918093e-05, + "loss": 1.7246, + "step": 21899 + }, + { + "epoch": 6.721915285451197, + "grad_norm": 0.26590242981910706, + "learning_rate": 2.563542761383687e-05, + "loss": 1.8141, + "step": 21900 + }, + { + "epoch": 6.722222222222222, + "grad_norm": 0.22498780488967896, + "learning_rate": 2.5631087245540632e-05, + "loss": 1.7211, + "step": 21901 + }, + { + "epoch": 6.722529158993248, + "grad_norm": 0.20546968281269073, + "learning_rate": 2.562674711807227e-05, + "loss": 1.8001, + "step": 21902 + }, + { + "epoch": 6.722836095764272, + "grad_norm": 0.19668535888195038, + "learning_rate": 2.5622407231474683e-05, + "loss": 1.7443, + "step": 21903 + }, + { + "epoch": 6.723143032535297, + "grad_norm": 0.18932129442691803, + "learning_rate": 2.5618067585790752e-05, + "loss": 1.7307, + "step": 21904 + }, + { + "epoch": 6.723449969306323, + "grad_norm": 0.19501622021198273, + "learning_rate": 2.561372818106335e-05, + "loss": 1.7016, + "step": 21905 + }, + { + "epoch": 6.723756906077348, + "grad_norm": 0.21313562989234924, + "learning_rate": 2.5609389017335416e-05, + "loss": 1.8012, + "step": 21906 + }, + { + "epoch": 6.724063842848373, + "grad_norm": 0.174738347530365, + "learning_rate": 2.560505009464978e-05, + "loss": 1.6824, + "step": 21907 + }, + { + "epoch": 6.724370779619399, + "grad_norm": 0.20349650084972382, + "learning_rate": 2.560071141304934e-05, + "loss": 1.7813, + "step": 21908 + }, + { + "epoch": 6.724677716390423, + "grad_norm": 0.21878227591514587, + "learning_rate": 2.5596372972576967e-05, + "loss": 1.8166, + "step": 21909 + }, + { + "epoch": 6.7249846531614486, + "grad_norm": 0.2082633078098297, + "learning_rate": 2.559203477327552e-05, + "loss": 1.7197, + "step": 21910 + }, + { + "epoch": 6.725291589932474, + "grad_norm": 0.17738287150859833, + "learning_rate": 2.558769681518792e-05, + "loss": 1.7093, + "step": 21911 + }, + { + "epoch": 6.725598526703499, + 
"grad_norm": 0.1930074542760849, + "learning_rate": 2.5583359098356986e-05, + "loss": 1.7702, + "step": 21912 + }, + { + "epoch": 6.725905463474525, + "grad_norm": 0.17668531835079193, + "learning_rate": 2.5579021622825638e-05, + "loss": 1.7466, + "step": 21913 + }, + { + "epoch": 6.726212400245549, + "grad_norm": 0.1737186163663864, + "learning_rate": 2.5574684388636677e-05, + "loss": 1.6876, + "step": 21914 + }, + { + "epoch": 6.726519337016574, + "grad_norm": 0.18352502584457397, + "learning_rate": 2.5570347395833018e-05, + "loss": 1.6745, + "step": 21915 + }, + { + "epoch": 6.7268262737876, + "grad_norm": 0.19047673046588898, + "learning_rate": 2.5566010644457506e-05, + "loss": 1.7465, + "step": 21916 + }, + { + "epoch": 6.727133210558625, + "grad_norm": 0.1762397438287735, + "learning_rate": 2.5561674134553005e-05, + "loss": 1.6767, + "step": 21917 + }, + { + "epoch": 6.72744014732965, + "grad_norm": 0.22884784638881683, + "learning_rate": 2.5557337866162358e-05, + "loss": 1.7054, + "step": 21918 + }, + { + "epoch": 6.727747084100676, + "grad_norm": 0.17476098239421844, + "learning_rate": 2.5553001839328417e-05, + "loss": 1.721, + "step": 21919 + }, + { + "epoch": 6.7280540208717, + "grad_norm": 0.1827213317155838, + "learning_rate": 2.554866605409405e-05, + "loss": 1.78, + "step": 21920 + }, + { + "epoch": 6.7283609576427255, + "grad_norm": 0.21709343791007996, + "learning_rate": 2.554433051050209e-05, + "loss": 1.8064, + "step": 21921 + }, + { + "epoch": 6.728667894413751, + "grad_norm": 0.1972692310810089, + "learning_rate": 2.5539995208595398e-05, + "loss": 1.7231, + "step": 21922 + }, + { + "epoch": 6.728974831184776, + "grad_norm": 0.19464808702468872, + "learning_rate": 2.5535660148416802e-05, + "loss": 1.7931, + "step": 21923 + }, + { + "epoch": 6.7292817679558015, + "grad_norm": 0.19610099494457245, + "learning_rate": 2.5531325330009158e-05, + "loss": 1.7467, + "step": 21924 + }, + { + "epoch": 6.729588704726826, + "grad_norm": 0.21104763448238373, + 
"learning_rate": 2.5526990753415292e-05, + "loss": 1.7543, + "step": 21925 + }, + { + "epoch": 6.729895641497851, + "grad_norm": 0.1881588101387024, + "learning_rate": 2.5522656418678047e-05, + "loss": 1.7666, + "step": 21926 + }, + { + "epoch": 6.730202578268877, + "grad_norm": 0.2163291722536087, + "learning_rate": 2.551832232584025e-05, + "loss": 1.7321, + "step": 21927 + }, + { + "epoch": 6.730509515039902, + "grad_norm": 0.19252021610736847, + "learning_rate": 2.551398847494477e-05, + "loss": 1.7287, + "step": 21928 + }, + { + "epoch": 6.730816451810927, + "grad_norm": 0.22602233290672302, + "learning_rate": 2.550965486603437e-05, + "loss": 1.767, + "step": 21929 + }, + { + "epoch": 6.731123388581953, + "grad_norm": 0.21509617567062378, + "learning_rate": 2.5505321499151957e-05, + "loss": 1.7637, + "step": 21930 + }, + { + "epoch": 6.731430325352977, + "grad_norm": 0.24291658401489258, + "learning_rate": 2.5500988374340274e-05, + "loss": 1.7312, + "step": 21931 + }, + { + "epoch": 6.731737262124002, + "grad_norm": 0.26562216877937317, + "learning_rate": 2.5496655491642195e-05, + "loss": 1.7763, + "step": 21932 + }, + { + "epoch": 6.732044198895028, + "grad_norm": 0.19785790145397186, + "learning_rate": 2.5492322851100535e-05, + "loss": 1.6979, + "step": 21933 + }, + { + "epoch": 6.732351135666053, + "grad_norm": 0.20044486224651337, + "learning_rate": 2.5487990452758104e-05, + "loss": 1.7359, + "step": 21934 + }, + { + "epoch": 6.7326580724370775, + "grad_norm": 0.20468659698963165, + "learning_rate": 2.548365829665772e-05, + "loss": 1.6996, + "step": 21935 + }, + { + "epoch": 6.732965009208103, + "grad_norm": 0.16516120731830597, + "learning_rate": 2.5479326382842195e-05, + "loss": 1.717, + "step": 21936 + }, + { + "epoch": 6.733271945979128, + "grad_norm": 0.22404411435127258, + "learning_rate": 2.547499471135433e-05, + "loss": 1.7261, + "step": 21937 + }, + { + "epoch": 6.7335788827501535, + "grad_norm": 0.21485663950443268, + "learning_rate": 
2.547066328223695e-05, + "loss": 1.7463, + "step": 21938 + }, + { + "epoch": 6.733885819521179, + "grad_norm": 0.330018550157547, + "learning_rate": 2.5466332095532853e-05, + "loss": 1.854, + "step": 21939 + }, + { + "epoch": 6.734192756292204, + "grad_norm": 0.25225213170051575, + "learning_rate": 2.5462001151284842e-05, + "loss": 1.722, + "step": 21940 + }, + { + "epoch": 6.734499693063229, + "grad_norm": 0.2422008365392685, + "learning_rate": 2.5457670449535713e-05, + "loss": 1.6996, + "step": 21941 + }, + { + "epoch": 6.734806629834254, + "grad_norm": 0.2421465814113617, + "learning_rate": 2.5453339990328275e-05, + "loss": 1.7014, + "step": 21942 + }, + { + "epoch": 6.735113566605279, + "grad_norm": 0.2520611882209778, + "learning_rate": 2.5449009773705313e-05, + "loss": 1.7149, + "step": 21943 + }, + { + "epoch": 6.735420503376305, + "grad_norm": 0.24940338730812073, + "learning_rate": 2.5444679799709626e-05, + "loss": 1.7423, + "step": 21944 + }, + { + "epoch": 6.73572744014733, + "grad_norm": 0.2328663021326065, + "learning_rate": 2.544035006838401e-05, + "loss": 1.6893, + "step": 21945 + }, + { + "epoch": 6.736034376918354, + "grad_norm": 0.2190757393836975, + "learning_rate": 2.5436020579771226e-05, + "loss": 1.7375, + "step": 21946 + }, + { + "epoch": 6.73634131368938, + "grad_norm": 0.2204900085926056, + "learning_rate": 2.543169133391413e-05, + "loss": 1.6971, + "step": 21947 + }, + { + "epoch": 6.736648250460405, + "grad_norm": 0.29192328453063965, + "learning_rate": 2.5427362330855415e-05, + "loss": 1.7633, + "step": 21948 + }, + { + "epoch": 6.73695518723143, + "grad_norm": 0.19859355688095093, + "learning_rate": 2.542303357063793e-05, + "loss": 1.7515, + "step": 21949 + }, + { + "epoch": 6.737262124002456, + "grad_norm": 0.23010417819023132, + "learning_rate": 2.5418705053304425e-05, + "loss": 1.7282, + "step": 21950 + }, + { + "epoch": 6.737569060773481, + "grad_norm": 0.2168324589729309, + "learning_rate": 2.5414376778897698e-05, + "loss": 1.7347, 
+ "step": 21951 + }, + { + "epoch": 6.7378759975445055, + "grad_norm": 0.2190646231174469, + "learning_rate": 2.54100487474605e-05, + "loss": 1.7893, + "step": 21952 + }, + { + "epoch": 6.738182934315531, + "grad_norm": 0.23925794661045074, + "learning_rate": 2.5405720959035617e-05, + "loss": 1.7825, + "step": 21953 + }, + { + "epoch": 6.738489871086556, + "grad_norm": 0.17987917363643646, + "learning_rate": 2.5401393413665807e-05, + "loss": 1.724, + "step": 21954 + }, + { + "epoch": 6.7387968078575815, + "grad_norm": 0.2300983965396881, + "learning_rate": 2.5397066111393853e-05, + "loss": 1.7023, + "step": 21955 + }, + { + "epoch": 6.739103744628607, + "grad_norm": 0.2128167450428009, + "learning_rate": 2.539273905226251e-05, + "loss": 1.7218, + "step": 21956 + }, + { + "epoch": 6.739410681399631, + "grad_norm": 0.19105537235736847, + "learning_rate": 2.538841223631454e-05, + "loss": 1.7781, + "step": 21957 + }, + { + "epoch": 6.739717618170657, + "grad_norm": 0.22985289990901947, + "learning_rate": 2.5384085663592704e-05, + "loss": 1.7362, + "step": 21958 + }, + { + "epoch": 6.740024554941682, + "grad_norm": 0.18608705699443817, + "learning_rate": 2.5379759334139768e-05, + "loss": 1.7174, + "step": 21959 + }, + { + "epoch": 6.740331491712707, + "grad_norm": 0.2659450173377991, + "learning_rate": 2.5375433247998482e-05, + "loss": 1.8118, + "step": 21960 + }, + { + "epoch": 6.740638428483733, + "grad_norm": 0.1904401034116745, + "learning_rate": 2.537110740521159e-05, + "loss": 1.6789, + "step": 21961 + }, + { + "epoch": 6.740945365254758, + "grad_norm": 0.1826045662164688, + "learning_rate": 2.5366781805821847e-05, + "loss": 1.6906, + "step": 21962 + }, + { + "epoch": 6.741252302025782, + "grad_norm": 0.1919000893831253, + "learning_rate": 2.5362456449871995e-05, + "loss": 1.7412, + "step": 21963 + }, + { + "epoch": 6.741559238796808, + "grad_norm": 0.1921864151954651, + "learning_rate": 2.5358131337404822e-05, + "loss": 1.7023, + "step": 21964 + }, + { + "epoch": 
6.741866175567833, + "grad_norm": 0.1628783494234085, + "learning_rate": 2.5353806468463004e-05, + "loss": 1.6842, + "step": 21965 + }, + { + "epoch": 6.742173112338858, + "grad_norm": 0.19764694571495056, + "learning_rate": 2.534948184308935e-05, + "loss": 1.7238, + "step": 21966 + }, + { + "epoch": 6.742480049109884, + "grad_norm": 0.1845860630273819, + "learning_rate": 2.534515746132653e-05, + "loss": 1.728, + "step": 21967 + }, + { + "epoch": 6.742786985880908, + "grad_norm": 0.20269328355789185, + "learning_rate": 2.5340833323217327e-05, + "loss": 1.7541, + "step": 21968 + }, + { + "epoch": 6.7430939226519335, + "grad_norm": 0.16586242616176605, + "learning_rate": 2.5336509428804468e-05, + "loss": 1.7025, + "step": 21969 + }, + { + "epoch": 6.743400859422959, + "grad_norm": 0.1693086177110672, + "learning_rate": 2.533218577813068e-05, + "loss": 1.6975, + "step": 21970 + }, + { + "epoch": 6.743707796193984, + "grad_norm": 0.2206759750843048, + "learning_rate": 2.5327862371238686e-05, + "loss": 1.764, + "step": 21971 + }, + { + "epoch": 6.7440147329650095, + "grad_norm": 0.1915574073791504, + "learning_rate": 2.532353920817122e-05, + "loss": 1.7576, + "step": 21972 + }, + { + "epoch": 6.744321669736035, + "grad_norm": 0.1741783618927002, + "learning_rate": 2.5319216288971003e-05, + "loss": 1.7394, + "step": 21973 + }, + { + "epoch": 6.744628606507059, + "grad_norm": 0.21624934673309326, + "learning_rate": 2.5314893613680755e-05, + "loss": 1.7358, + "step": 21974 + }, + { + "epoch": 6.744935543278085, + "grad_norm": 0.2350481003522873, + "learning_rate": 2.5310571182343197e-05, + "loss": 1.7801, + "step": 21975 + }, + { + "epoch": 6.74524248004911, + "grad_norm": 0.18618559837341309, + "learning_rate": 2.5306248995001048e-05, + "loss": 1.7012, + "step": 21976 + }, + { + "epoch": 6.745549416820135, + "grad_norm": 0.18479639291763306, + "learning_rate": 2.5301927051697016e-05, + "loss": 1.7238, + "step": 21977 + }, + { + "epoch": 6.74585635359116, + "grad_norm": 
0.19978758692741394, + "learning_rate": 2.5297605352473818e-05, + "loss": 1.6636, + "step": 21978 + }, + { + "epoch": 6.746163290362185, + "grad_norm": 0.23122164607048035, + "learning_rate": 2.529328389737416e-05, + "loss": 1.7455, + "step": 21979 + }, + { + "epoch": 6.74647022713321, + "grad_norm": 0.20423240959644318, + "learning_rate": 2.5288962686440732e-05, + "loss": 1.7516, + "step": 21980 + }, + { + "epoch": 6.746777163904236, + "grad_norm": 0.18271920084953308, + "learning_rate": 2.52846417197163e-05, + "loss": 1.762, + "step": 21981 + }, + { + "epoch": 6.747084100675261, + "grad_norm": 0.19280247390270233, + "learning_rate": 2.528032099724349e-05, + "loss": 1.7298, + "step": 21982 + }, + { + "epoch": 6.747391037446286, + "grad_norm": 0.20908337831497192, + "learning_rate": 2.527600051906507e-05, + "loss": 1.7323, + "step": 21983 + }, + { + "epoch": 6.747697974217311, + "grad_norm": 0.18399856984615326, + "learning_rate": 2.5271680285223663e-05, + "loss": 1.6795, + "step": 21984 + }, + { + "epoch": 6.748004910988336, + "grad_norm": 0.2273191213607788, + "learning_rate": 2.5267360295762033e-05, + "loss": 1.6811, + "step": 21985 + }, + { + "epoch": 6.7483118477593615, + "grad_norm": 0.1844841092824936, + "learning_rate": 2.526304055072284e-05, + "loss": 1.7404, + "step": 21986 + }, + { + "epoch": 6.748618784530387, + "grad_norm": 0.25975871086120605, + "learning_rate": 2.5258721050148775e-05, + "loss": 1.6994, + "step": 21987 + }, + { + "epoch": 6.748925721301412, + "grad_norm": 0.1664818376302719, + "learning_rate": 2.5254401794082532e-05, + "loss": 1.6722, + "step": 21988 + }, + { + "epoch": 6.749232658072437, + "grad_norm": 0.2597639560699463, + "learning_rate": 2.5250082782566796e-05, + "loss": 1.7654, + "step": 21989 + }, + { + "epoch": 6.749539594843462, + "grad_norm": 0.19326356053352356, + "learning_rate": 2.5245764015644248e-05, + "loss": 1.668, + "step": 21990 + }, + { + "epoch": 6.749846531614487, + "grad_norm": 0.22924599051475525, + 
"learning_rate": 2.5241445493357574e-05, + "loss": 1.7522, + "step": 21991 + }, + { + "epoch": 6.750153468385513, + "grad_norm": 0.24588358402252197, + "learning_rate": 2.523712721574944e-05, + "loss": 1.7396, + "step": 21992 + }, + { + "epoch": 6.750460405156538, + "grad_norm": 0.1988971084356308, + "learning_rate": 2.5232809182862526e-05, + "loss": 1.7338, + "step": 21993 + }, + { + "epoch": 6.750767341927563, + "grad_norm": 0.18566425144672394, + "learning_rate": 2.5228491394739518e-05, + "loss": 1.7135, + "step": 21994 + }, + { + "epoch": 6.751074278698588, + "grad_norm": 0.22216622531414032, + "learning_rate": 2.5224173851423073e-05, + "loss": 1.744, + "step": 21995 + }, + { + "epoch": 6.751381215469613, + "grad_norm": 0.18695887923240662, + "learning_rate": 2.5219856552955863e-05, + "loss": 1.7324, + "step": 21996 + }, + { + "epoch": 6.7516881522406385, + "grad_norm": 0.1866987645626068, + "learning_rate": 2.5215539499380535e-05, + "loss": 1.6855, + "step": 21997 + }, + { + "epoch": 6.751995089011664, + "grad_norm": 0.1743573248386383, + "learning_rate": 2.521122269073981e-05, + "loss": 1.6833, + "step": 21998 + }, + { + "epoch": 6.752302025782689, + "grad_norm": 0.2173541784286499, + "learning_rate": 2.5206906127076274e-05, + "loss": 1.7434, + "step": 21999 + }, + { + "epoch": 6.752608962553714, + "grad_norm": 0.17558147013187408, + "learning_rate": 2.5202589808432665e-05, + "loss": 1.6884, + "step": 22000 + }, + { + "epoch": 6.752915899324739, + "grad_norm": 0.16630353033542633, + "learning_rate": 2.5198273734851553e-05, + "loss": 1.7005, + "step": 22001 + }, + { + "epoch": 6.753222836095764, + "grad_norm": 0.1834949105978012, + "learning_rate": 2.519395790637566e-05, + "loss": 1.7123, + "step": 22002 + }, + { + "epoch": 6.75352977286679, + "grad_norm": 0.1806751936674118, + "learning_rate": 2.5189642323047614e-05, + "loss": 1.7305, + "step": 22003 + }, + { + "epoch": 6.753836709637815, + "grad_norm": 0.2350265085697174, + "learning_rate": 
2.5185326984910062e-05, + "loss": 1.772, + "step": 22004 + }, + { + "epoch": 6.75414364640884, + "grad_norm": 0.18105818331241608, + "learning_rate": 2.518101189200566e-05, + "loss": 1.7487, + "step": 22005 + }, + { + "epoch": 6.754450583179865, + "grad_norm": 0.17640845477581024, + "learning_rate": 2.517669704437704e-05, + "loss": 1.7178, + "step": 22006 + }, + { + "epoch": 6.75475751995089, + "grad_norm": 0.21648885309696198, + "learning_rate": 2.5172382442066845e-05, + "loss": 1.7144, + "step": 22007 + }, + { + "epoch": 6.755064456721915, + "grad_norm": 0.2042703926563263, + "learning_rate": 2.5168068085117724e-05, + "loss": 1.7476, + "step": 22008 + }, + { + "epoch": 6.755371393492941, + "grad_norm": 0.24397306144237518, + "learning_rate": 2.5163753973572306e-05, + "loss": 1.7033, + "step": 22009 + }, + { + "epoch": 6.755678330263965, + "grad_norm": 0.2030377835035324, + "learning_rate": 2.5159440107473232e-05, + "loss": 1.7353, + "step": 22010 + }, + { + "epoch": 6.7559852670349905, + "grad_norm": 0.2493598908185959, + "learning_rate": 2.5155126486863127e-05, + "loss": 1.7346, + "step": 22011 + }, + { + "epoch": 6.756292203806016, + "grad_norm": 0.17272062599658966, + "learning_rate": 2.5150813111784627e-05, + "loss": 1.7095, + "step": 22012 + }, + { + "epoch": 6.756599140577041, + "grad_norm": 0.2417706698179245, + "learning_rate": 2.514649998228036e-05, + "loss": 1.7631, + "step": 22013 + }, + { + "epoch": 6.7569060773480665, + "grad_norm": 0.17753612995147705, + "learning_rate": 2.5142187098392915e-05, + "loss": 1.697, + "step": 22014 + }, + { + "epoch": 6.757213014119092, + "grad_norm": 0.2246367186307907, + "learning_rate": 2.5137874460164995e-05, + "loss": 1.7216, + "step": 22015 + }, + { + "epoch": 6.757519950890116, + "grad_norm": 0.24141135811805725, + "learning_rate": 2.5133562067639134e-05, + "loss": 1.7368, + "step": 22016 + }, + { + "epoch": 6.757826887661142, + "grad_norm": 0.21253570914268494, + "learning_rate": 2.5129249920858022e-05, + "loss": 
1.7029, + "step": 22017 + }, + { + "epoch": 6.758133824432167, + "grad_norm": 0.21176676452159882, + "learning_rate": 2.5124938019864198e-05, + "loss": 1.7472, + "step": 22018 + }, + { + "epoch": 6.758440761203192, + "grad_norm": 0.1990927904844284, + "learning_rate": 2.5120626364700338e-05, + "loss": 1.6686, + "step": 22019 + }, + { + "epoch": 6.758747697974218, + "grad_norm": 0.1736145317554474, + "learning_rate": 2.5116314955409038e-05, + "loss": 1.6984, + "step": 22020 + }, + { + "epoch": 6.759054634745242, + "grad_norm": 0.2618037462234497, + "learning_rate": 2.511200379203289e-05, + "loss": 1.7374, + "step": 22021 + }, + { + "epoch": 6.759361571516267, + "grad_norm": 0.25363266468048096, + "learning_rate": 2.5107692874614507e-05, + "loss": 1.7001, + "step": 22022 + }, + { + "epoch": 6.759668508287293, + "grad_norm": 0.20287153124809265, + "learning_rate": 2.51033822031965e-05, + "loss": 1.7704, + "step": 22023 + }, + { + "epoch": 6.759975445058318, + "grad_norm": 0.2401949167251587, + "learning_rate": 2.509907177782146e-05, + "loss": 1.7157, + "step": 22024 + }, + { + "epoch": 6.760282381829343, + "grad_norm": 0.177081897854805, + "learning_rate": 2.5094761598531985e-05, + "loss": 1.7572, + "step": 22025 + }, + { + "epoch": 6.760589318600369, + "grad_norm": 0.2641974687576294, + "learning_rate": 2.5090451665370674e-05, + "loss": 1.725, + "step": 22026 + }, + { + "epoch": 6.760896255371393, + "grad_norm": 0.20262297987937927, + "learning_rate": 2.5086141978380116e-05, + "loss": 1.6591, + "step": 22027 + }, + { + "epoch": 6.7612031921424185, + "grad_norm": 0.19107301533222198, + "learning_rate": 2.5081832537602913e-05, + "loss": 1.6914, + "step": 22028 + }, + { + "epoch": 6.761510128913444, + "grad_norm": 0.28122687339782715, + "learning_rate": 2.5077523343081643e-05, + "loss": 1.7759, + "step": 22029 + }, + { + "epoch": 6.761817065684469, + "grad_norm": 0.16575101017951965, + "learning_rate": 2.5073214394858897e-05, + "loss": 1.6994, + "step": 22030 + }, + { + 
"epoch": 6.7621240024554945, + "grad_norm": 0.26933449506759644, + "learning_rate": 2.506890569297723e-05, + "loss": 1.7565, + "step": 22031 + }, + { + "epoch": 6.762430939226519, + "grad_norm": 0.2452966868877411, + "learning_rate": 2.5064597237479292e-05, + "loss": 1.7442, + "step": 22032 + }, + { + "epoch": 6.762737875997544, + "grad_norm": 0.20781855285167694, + "learning_rate": 2.5060289028407585e-05, + "loss": 1.714, + "step": 22033 + }, + { + "epoch": 6.76304481276857, + "grad_norm": 0.1997823268175125, + "learning_rate": 2.5055981065804756e-05, + "loss": 1.7318, + "step": 22034 + }, + { + "epoch": 6.763351749539595, + "grad_norm": 0.2080194652080536, + "learning_rate": 2.50516733497133e-05, + "loss": 1.7466, + "step": 22035 + }, + { + "epoch": 6.76365868631062, + "grad_norm": 0.17558889091014862, + "learning_rate": 2.504736588017585e-05, + "loss": 1.7049, + "step": 22036 + }, + { + "epoch": 6.763965623081646, + "grad_norm": 0.1999572217464447, + "learning_rate": 2.5043058657234957e-05, + "loss": 1.7121, + "step": 22037 + }, + { + "epoch": 6.76427255985267, + "grad_norm": 0.16219176352024078, + "learning_rate": 2.5038751680933185e-05, + "loss": 1.698, + "step": 22038 + }, + { + "epoch": 6.764579496623695, + "grad_norm": 0.17965151369571686, + "learning_rate": 2.50344449513131e-05, + "loss": 1.7021, + "step": 22039 + }, + { + "epoch": 6.764886433394721, + "grad_norm": 0.18831093609333038, + "learning_rate": 2.5030138468417263e-05, + "loss": 1.7049, + "step": 22040 + }, + { + "epoch": 6.765193370165746, + "grad_norm": 0.20622828602790833, + "learning_rate": 2.5025832232288236e-05, + "loss": 1.7834, + "step": 22041 + }, + { + "epoch": 6.765500306936771, + "grad_norm": 0.22746746242046356, + "learning_rate": 2.5021526242968574e-05, + "loss": 1.7426, + "step": 22042 + }, + { + "epoch": 6.765807243707796, + "grad_norm": 0.2048977166414261, + "learning_rate": 2.5017220500500828e-05, + "loss": 1.7192, + "step": 22043 + }, + { + "epoch": 6.766114180478821, + 
"grad_norm": 0.19647538661956787, + "learning_rate": 2.5012915004927546e-05, + "loss": 1.6738, + "step": 22044 + }, + { + "epoch": 6.7664211172498465, + "grad_norm": 0.2133142054080963, + "learning_rate": 2.5008609756291284e-05, + "loss": 1.7482, + "step": 22045 + }, + { + "epoch": 6.766728054020872, + "grad_norm": 0.23578259348869324, + "learning_rate": 2.500430475463459e-05, + "loss": 1.696, + "step": 22046 + }, + { + "epoch": 6.767034990791897, + "grad_norm": 0.24862529337406158, + "learning_rate": 2.500000000000001e-05, + "loss": 1.7508, + "step": 22047 + }, + { + "epoch": 6.7673419275629225, + "grad_norm": 0.22704963386058807, + "learning_rate": 2.4995695492430066e-05, + "loss": 1.7739, + "step": 22048 + }, + { + "epoch": 6.767648864333947, + "grad_norm": 0.20216481387615204, + "learning_rate": 2.4991391231967347e-05, + "loss": 1.7406, + "step": 22049 + }, + { + "epoch": 6.767955801104972, + "grad_norm": 0.18778519332408905, + "learning_rate": 2.498708721865432e-05, + "loss": 1.683, + "step": 22050 + }, + { + "epoch": 6.768262737875998, + "grad_norm": 0.21680599451065063, + "learning_rate": 2.4982783452533597e-05, + "loss": 1.7652, + "step": 22051 + }, + { + "epoch": 6.768569674647023, + "grad_norm": 0.16952121257781982, + "learning_rate": 2.4978479933647637e-05, + "loss": 1.6551, + "step": 22052 + }, + { + "epoch": 6.768876611418047, + "grad_norm": 0.1979489028453827, + "learning_rate": 2.4974176662039017e-05, + "loss": 1.7399, + "step": 22053 + }, + { + "epoch": 6.769183548189073, + "grad_norm": 0.18934862315654755, + "learning_rate": 2.496987363775025e-05, + "loss": 1.7228, + "step": 22054 + }, + { + "epoch": 6.769490484960098, + "grad_norm": 0.17551462352275848, + "learning_rate": 2.496557086082387e-05, + "loss": 1.6725, + "step": 22055 + }, + { + "epoch": 6.769797421731123, + "grad_norm": 0.23561003804206848, + "learning_rate": 2.496126833130239e-05, + "loss": 1.7606, + "step": 22056 + }, + { + "epoch": 6.770104358502149, + "grad_norm": 
0.19105803966522217, + "learning_rate": 2.4956966049228324e-05, + "loss": 1.6975, + "step": 22057 + }, + { + "epoch": 6.770411295273174, + "grad_norm": 0.28581124544143677, + "learning_rate": 2.4952664014644204e-05, + "loss": 1.7408, + "step": 22058 + }, + { + "epoch": 6.7707182320441985, + "grad_norm": 0.20723536610603333, + "learning_rate": 2.494836222759254e-05, + "loss": 1.752, + "step": 22059 + }, + { + "epoch": 6.771025168815224, + "grad_norm": 0.2089354693889618, + "learning_rate": 2.4944060688115846e-05, + "loss": 1.6662, + "step": 22060 + }, + { + "epoch": 6.771332105586249, + "grad_norm": 0.2299557626247406, + "learning_rate": 2.4939759396256625e-05, + "loss": 1.7978, + "step": 22061 + }, + { + "epoch": 6.7716390423572745, + "grad_norm": 0.17900820076465607, + "learning_rate": 2.493545835205739e-05, + "loss": 1.6876, + "step": 22062 + }, + { + "epoch": 6.7719459791283, + "grad_norm": 0.21412713825702667, + "learning_rate": 2.4931157555560648e-05, + "loss": 1.7347, + "step": 22063 + }, + { + "epoch": 6.772252915899324, + "grad_norm": 0.24448172748088837, + "learning_rate": 2.49268570068089e-05, + "loss": 1.7611, + "step": 22064 + }, + { + "epoch": 6.77255985267035, + "grad_norm": 0.20153972506523132, + "learning_rate": 2.4922556705844624e-05, + "loss": 1.7347, + "step": 22065 + }, + { + "epoch": 6.772866789441375, + "grad_norm": 0.2142268568277359, + "learning_rate": 2.4918256652710387e-05, + "loss": 1.7548, + "step": 22066 + }, + { + "epoch": 6.7731737262124, + "grad_norm": 0.19735601544380188, + "learning_rate": 2.4913956847448595e-05, + "loss": 1.7138, + "step": 22067 + }, + { + "epoch": 6.773480662983426, + "grad_norm": 0.1847008913755417, + "learning_rate": 2.4909657290101824e-05, + "loss": 1.6812, + "step": 22068 + }, + { + "epoch": 6.773787599754451, + "grad_norm": 0.18406464159488678, + "learning_rate": 2.4905357980712486e-05, + "loss": 1.6992, + "step": 22069 + }, + { + "epoch": 6.774094536525475, + "grad_norm": 0.19595865905284882, + 
"learning_rate": 2.490105891932313e-05, + "loss": 1.7118, + "step": 22070 + }, + { + "epoch": 6.774401473296501, + "grad_norm": 0.1929878294467926, + "learning_rate": 2.4896760105976218e-05, + "loss": 1.7187, + "step": 22071 + }, + { + "epoch": 6.774708410067526, + "grad_norm": 0.23972687125205994, + "learning_rate": 2.4892461540714242e-05, + "loss": 1.7293, + "step": 22072 + }, + { + "epoch": 6.7750153468385514, + "grad_norm": 0.18744204938411713, + "learning_rate": 2.4888163223579675e-05, + "loss": 1.7102, + "step": 22073 + }, + { + "epoch": 6.775322283609577, + "grad_norm": 0.20168112218379974, + "learning_rate": 2.4883865154614994e-05, + "loss": 1.7655, + "step": 22074 + }, + { + "epoch": 6.775629220380601, + "grad_norm": 0.22825658321380615, + "learning_rate": 2.487956733386268e-05, + "loss": 1.7251, + "step": 22075 + }, + { + "epoch": 6.775936157151627, + "grad_norm": 0.19441691040992737, + "learning_rate": 2.4875269761365205e-05, + "loss": 1.7657, + "step": 22076 + }, + { + "epoch": 6.776243093922652, + "grad_norm": 0.22861605882644653, + "learning_rate": 2.487097243716504e-05, + "loss": 1.7132, + "step": 22077 + }, + { + "epoch": 6.776550030693677, + "grad_norm": 0.19157674908638, + "learning_rate": 2.486667536130466e-05, + "loss": 1.7448, + "step": 22078 + }, + { + "epoch": 6.776856967464703, + "grad_norm": 0.2203369438648224, + "learning_rate": 2.486237853382652e-05, + "loss": 1.7535, + "step": 22079 + }, + { + "epoch": 6.777163904235728, + "grad_norm": 0.16477027535438538, + "learning_rate": 2.4858081954773088e-05, + "loss": 1.706, + "step": 22080 + }, + { + "epoch": 6.777470841006752, + "grad_norm": 0.16536933183670044, + "learning_rate": 2.4853785624186827e-05, + "loss": 1.6725, + "step": 22081 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.18266050517559052, + "learning_rate": 2.4849489542110176e-05, + "loss": 1.6799, + "step": 22082 + }, + { + "epoch": 6.778084714548803, + "grad_norm": 0.21422190964221954, + "learning_rate": 
2.4845193708585647e-05, + "loss": 1.7275, + "step": 22083 + }, + { + "epoch": 6.778391651319828, + "grad_norm": 0.19356754422187805, + "learning_rate": 2.4840898123655622e-05, + "loss": 1.7172, + "step": 22084 + }, + { + "epoch": 6.778698588090853, + "grad_norm": 0.21090209484100342, + "learning_rate": 2.4836602787362628e-05, + "loss": 1.6581, + "step": 22085 + }, + { + "epoch": 6.779005524861878, + "grad_norm": 0.20072491466999054, + "learning_rate": 2.483230769974903e-05, + "loss": 1.7398, + "step": 22086 + }, + { + "epoch": 6.7793124616329035, + "grad_norm": 0.20642702281475067, + "learning_rate": 2.482801286085734e-05, + "loss": 1.7505, + "step": 22087 + }, + { + "epoch": 6.779619398403929, + "grad_norm": 0.20322991907596588, + "learning_rate": 2.4823718270729985e-05, + "loss": 1.6693, + "step": 22088 + }, + { + "epoch": 6.779926335174954, + "grad_norm": 0.17060843110084534, + "learning_rate": 2.4819423929409396e-05, + "loss": 1.6746, + "step": 22089 + }, + { + "epoch": 6.7802332719459795, + "grad_norm": 0.20697785913944244, + "learning_rate": 2.4815129836938024e-05, + "loss": 1.7413, + "step": 22090 + }, + { + "epoch": 6.780540208717004, + "grad_norm": 0.19845673441886902, + "learning_rate": 2.48108359933583e-05, + "loss": 1.694, + "step": 22091 + }, + { + "epoch": 6.780847145488029, + "grad_norm": 0.24547794461250305, + "learning_rate": 2.4806542398712657e-05, + "loss": 1.7316, + "step": 22092 + }, + { + "epoch": 6.781154082259055, + "grad_norm": 0.15587118268013, + "learning_rate": 2.4802249053043526e-05, + "loss": 1.667, + "step": 22093 + }, + { + "epoch": 6.78146101903008, + "grad_norm": 0.22754593193531036, + "learning_rate": 2.4797955956393336e-05, + "loss": 1.7504, + "step": 22094 + }, + { + "epoch": 6.781767955801105, + "grad_norm": 0.201420396566391, + "learning_rate": 2.4793663108804528e-05, + "loss": 1.749, + "step": 22095 + }, + { + "epoch": 6.78207489257213, + "grad_norm": 0.1952153891324997, + "learning_rate": 2.4789370510319504e-05, + "loss": 
1.7306, + "step": 22096 + }, + { + "epoch": 6.782381829343155, + "grad_norm": 0.16750730574131012, + "learning_rate": 2.4785078160980703e-05, + "loss": 1.6775, + "step": 22097 + }, + { + "epoch": 6.78268876611418, + "grad_norm": 0.19943620264530182, + "learning_rate": 2.4780786060830535e-05, + "loss": 1.7233, + "step": 22098 + }, + { + "epoch": 6.782995702885206, + "grad_norm": 0.21302999556064606, + "learning_rate": 2.4776494209911423e-05, + "loss": 1.798, + "step": 22099 + }, + { + "epoch": 6.783302639656231, + "grad_norm": 0.22949734330177307, + "learning_rate": 2.4772202608265776e-05, + "loss": 1.7678, + "step": 22100 + }, + { + "epoch": 6.783609576427256, + "grad_norm": 0.20945954322814941, + "learning_rate": 2.4767911255935993e-05, + "loss": 1.701, + "step": 22101 + }, + { + "epoch": 6.783916513198281, + "grad_norm": 0.189425989985466, + "learning_rate": 2.476362015296454e-05, + "loss": 1.7152, + "step": 22102 + }, + { + "epoch": 6.784223449969306, + "grad_norm": 0.18826924264431, + "learning_rate": 2.4759329299393747e-05, + "loss": 1.7004, + "step": 22103 + }, + { + "epoch": 6.7845303867403315, + "grad_norm": 0.20359934866428375, + "learning_rate": 2.475503869526607e-05, + "loss": 1.705, + "step": 22104 + }, + { + "epoch": 6.784837323511357, + "grad_norm": 0.22381560504436493, + "learning_rate": 2.4750748340623896e-05, + "loss": 1.7345, + "step": 22105 + }, + { + "epoch": 6.785144260282382, + "grad_norm": 0.1750476062297821, + "learning_rate": 2.474645823550963e-05, + "loss": 1.7084, + "step": 22106 + }, + { + "epoch": 6.785451197053407, + "grad_norm": 0.17943856120109558, + "learning_rate": 2.4742168379965662e-05, + "loss": 1.7417, + "step": 22107 + }, + { + "epoch": 6.785758133824432, + "grad_norm": 0.21809861063957214, + "learning_rate": 2.4737878774034397e-05, + "loss": 1.7197, + "step": 22108 + }, + { + "epoch": 6.786065070595457, + "grad_norm": 0.19761307537555695, + "learning_rate": 2.473358941775821e-05, + "loss": 1.6763, + "step": 22109 + }, + { + 
"epoch": 6.786372007366483, + "grad_norm": 0.19513878226280212, + "learning_rate": 2.472930031117951e-05, + "loss": 1.6859, + "step": 22110 + }, + { + "epoch": 6.786678944137508, + "grad_norm": 0.21796870231628418, + "learning_rate": 2.4725011454340675e-05, + "loss": 1.6957, + "step": 22111 + }, + { + "epoch": 6.786985880908533, + "grad_norm": 0.1885530948638916, + "learning_rate": 2.4720722847284088e-05, + "loss": 1.731, + "step": 22112 + }, + { + "epoch": 6.787292817679558, + "grad_norm": 0.2108110785484314, + "learning_rate": 2.4716434490052137e-05, + "loss": 1.7985, + "step": 22113 + }, + { + "epoch": 6.787599754450583, + "grad_norm": 0.23425176739692688, + "learning_rate": 2.4712146382687194e-05, + "loss": 1.7177, + "step": 22114 + }, + { + "epoch": 6.787906691221608, + "grad_norm": 0.17368707060813904, + "learning_rate": 2.4707858525231652e-05, + "loss": 1.7158, + "step": 22115 + }, + { + "epoch": 6.788213627992634, + "grad_norm": 0.22731448709964752, + "learning_rate": 2.470357091772787e-05, + "loss": 1.7037, + "step": 22116 + }, + { + "epoch": 6.788520564763659, + "grad_norm": 0.19142407178878784, + "learning_rate": 2.469928356021823e-05, + "loss": 1.7283, + "step": 22117 + }, + { + "epoch": 6.7888275015346835, + "grad_norm": 0.17515631020069122, + "learning_rate": 2.4694996452745072e-05, + "loss": 1.6812, + "step": 22118 + }, + { + "epoch": 6.789134438305709, + "grad_norm": 0.17932391166687012, + "learning_rate": 2.4690709595350838e-05, + "loss": 1.6832, + "step": 22119 + }, + { + "epoch": 6.789441375076734, + "grad_norm": 0.21177144348621368, + "learning_rate": 2.4686422988077802e-05, + "loss": 1.7443, + "step": 22120 + }, + { + "epoch": 6.7897483118477595, + "grad_norm": 0.17952793836593628, + "learning_rate": 2.4682136630968412e-05, + "loss": 1.6794, + "step": 22121 + }, + { + "epoch": 6.790055248618785, + "grad_norm": 0.18464395403862, + "learning_rate": 2.467785052406495e-05, + "loss": 1.6316, + "step": 22122 + }, + { + "epoch": 6.79036218538981, + 
"grad_norm": 0.1936565786600113, + "learning_rate": 2.4673564667409828e-05, + "loss": 1.6935, + "step": 22123 + }, + { + "epoch": 6.790669122160835, + "grad_norm": 0.21169735491275787, + "learning_rate": 2.4669279061045387e-05, + "loss": 1.7232, + "step": 22124 + }, + { + "epoch": 6.79097605893186, + "grad_norm": 0.199925035238266, + "learning_rate": 2.466499370501397e-05, + "loss": 1.8242, + "step": 22125 + }, + { + "epoch": 6.791282995702885, + "grad_norm": 0.19049705564975739, + "learning_rate": 2.4660708599357963e-05, + "loss": 1.7342, + "step": 22126 + }, + { + "epoch": 6.791589932473911, + "grad_norm": 0.16483616828918457, + "learning_rate": 2.465642374411964e-05, + "loss": 1.7144, + "step": 22127 + }, + { + "epoch": 6.791896869244935, + "grad_norm": 0.17355477809906006, + "learning_rate": 2.4652139139341413e-05, + "loss": 1.6715, + "step": 22128 + }, + { + "epoch": 6.79220380601596, + "grad_norm": 0.17448700964450836, + "learning_rate": 2.4647854785065605e-05, + "loss": 1.6669, + "step": 22129 + }, + { + "epoch": 6.792510742786986, + "grad_norm": 0.19858810305595398, + "learning_rate": 2.4643570681334553e-05, + "loss": 1.6781, + "step": 22130 + }, + { + "epoch": 6.792817679558011, + "grad_norm": 0.17350561916828156, + "learning_rate": 2.46392868281906e-05, + "loss": 1.7005, + "step": 22131 + }, + { + "epoch": 6.793124616329036, + "grad_norm": 0.17494787275791168, + "learning_rate": 2.4635003225676078e-05, + "loss": 1.7204, + "step": 22132 + }, + { + "epoch": 6.793431553100062, + "grad_norm": 0.1988590806722641, + "learning_rate": 2.463071987383332e-05, + "loss": 1.7314, + "step": 22133 + }, + { + "epoch": 6.793738489871086, + "grad_norm": 0.18046239018440247, + "learning_rate": 2.4626436772704658e-05, + "loss": 1.706, + "step": 22134 + }, + { + "epoch": 6.7940454266421115, + "grad_norm": 0.21060462296009064, + "learning_rate": 2.4622153922332402e-05, + "loss": 1.6967, + "step": 22135 + }, + { + "epoch": 6.794352363413137, + "grad_norm": 0.22328679263591766, 
+ "learning_rate": 2.4617871322758934e-05, + "loss": 1.7502, + "step": 22136 + }, + { + "epoch": 6.794659300184162, + "grad_norm": 0.18324224650859833, + "learning_rate": 2.46135889740265e-05, + "loss": 1.7183, + "step": 22137 + }, + { + "epoch": 6.7949662369551875, + "grad_norm": 0.2381133884191513, + "learning_rate": 2.4609306876177496e-05, + "loss": 1.739, + "step": 22138 + }, + { + "epoch": 6.795273173726212, + "grad_norm": 0.21471738815307617, + "learning_rate": 2.4605025029254164e-05, + "loss": 1.7466, + "step": 22139 + }, + { + "epoch": 6.795580110497237, + "grad_norm": 0.209581658244133, + "learning_rate": 2.4600743433298885e-05, + "loss": 1.7495, + "step": 22140 + }, + { + "epoch": 6.795887047268263, + "grad_norm": 0.1806897670030594, + "learning_rate": 2.459646208835394e-05, + "loss": 1.7137, + "step": 22141 + }, + { + "epoch": 6.796193984039288, + "grad_norm": 0.19036264717578888, + "learning_rate": 2.4592180994461644e-05, + "loss": 1.6993, + "step": 22142 + }, + { + "epoch": 6.796500920810313, + "grad_norm": 0.17937630414962769, + "learning_rate": 2.4587900151664335e-05, + "loss": 1.7102, + "step": 22143 + }, + { + "epoch": 6.796807857581339, + "grad_norm": 0.19278483092784882, + "learning_rate": 2.4583619560004244e-05, + "loss": 1.7058, + "step": 22144 + }, + { + "epoch": 6.797114794352363, + "grad_norm": 0.19507993757724762, + "learning_rate": 2.4579339219523744e-05, + "loss": 1.7137, + "step": 22145 + }, + { + "epoch": 6.797421731123388, + "grad_norm": 0.20417597889900208, + "learning_rate": 2.4575059130265115e-05, + "loss": 1.7156, + "step": 22146 + }, + { + "epoch": 6.797728667894414, + "grad_norm": 0.1898338943719864, + "learning_rate": 2.4570779292270658e-05, + "loss": 1.7501, + "step": 22147 + }, + { + "epoch": 6.798035604665439, + "grad_norm": 0.18777382373809814, + "learning_rate": 2.4566499705582656e-05, + "loss": 1.7192, + "step": 22148 + }, + { + "epoch": 6.798342541436464, + "grad_norm": 0.19526423513889313, + "learning_rate": 
2.4562220370243415e-05, + "loss": 1.6637, + "step": 22149 + }, + { + "epoch": 6.798649478207489, + "grad_norm": 0.23661594092845917, + "learning_rate": 2.455794128629522e-05, + "loss": 1.7557, + "step": 22150 + }, + { + "epoch": 6.798956414978514, + "grad_norm": 0.27043846249580383, + "learning_rate": 2.4553662453780362e-05, + "loss": 1.7712, + "step": 22151 + }, + { + "epoch": 6.7992633517495396, + "grad_norm": 0.17968088388442993, + "learning_rate": 2.454938387274111e-05, + "loss": 1.6721, + "step": 22152 + }, + { + "epoch": 6.799570288520565, + "grad_norm": 0.21456219255924225, + "learning_rate": 2.45451055432198e-05, + "loss": 1.7249, + "step": 22153 + }, + { + "epoch": 6.79987722529159, + "grad_norm": 0.22433941066265106, + "learning_rate": 2.4540827465258638e-05, + "loss": 1.7319, + "step": 22154 + }, + { + "epoch": 6.800184162062616, + "grad_norm": 0.2808871567249298, + "learning_rate": 2.4536549638899976e-05, + "loss": 1.7802, + "step": 22155 + }, + { + "epoch": 6.80049109883364, + "grad_norm": 0.28654494881629944, + "learning_rate": 2.4532272064186018e-05, + "loss": 1.7431, + "step": 22156 + }, + { + "epoch": 6.800798035604665, + "grad_norm": 0.19476976990699768, + "learning_rate": 2.45279947411591e-05, + "loss": 1.6792, + "step": 22157 + }, + { + "epoch": 6.801104972375691, + "grad_norm": 0.25114744901657104, + "learning_rate": 2.452371766986146e-05, + "loss": 1.7458, + "step": 22158 + }, + { + "epoch": 6.801411909146716, + "grad_norm": 0.18099439144134521, + "learning_rate": 2.451944085033538e-05, + "loss": 1.6952, + "step": 22159 + }, + { + "epoch": 6.8017188459177405, + "grad_norm": 0.21425777673721313, + "learning_rate": 2.4515164282623138e-05, + "loss": 1.7593, + "step": 22160 + }, + { + "epoch": 6.802025782688766, + "grad_norm": 0.19833709299564362, + "learning_rate": 2.4510887966766937e-05, + "loss": 1.6643, + "step": 22161 + }, + { + "epoch": 6.802332719459791, + "grad_norm": 0.20073090493679047, + "learning_rate": 2.45066119028091e-05, + "loss": 
1.7112, + "step": 22162 + }, + { + "epoch": 6.8026396562308165, + "grad_norm": 0.18599852919578552, + "learning_rate": 2.4502336090791872e-05, + "loss": 1.7121, + "step": 22163 + }, + { + "epoch": 6.802946593001842, + "grad_norm": 0.22036875784397125, + "learning_rate": 2.4498060530757498e-05, + "loss": 1.7944, + "step": 22164 + }, + { + "epoch": 6.803253529772867, + "grad_norm": 0.19521577656269073, + "learning_rate": 2.4493785222748243e-05, + "loss": 1.7463, + "step": 22165 + }, + { + "epoch": 6.803560466543892, + "grad_norm": 0.22010843455791473, + "learning_rate": 2.448951016680635e-05, + "loss": 1.6951, + "step": 22166 + }, + { + "epoch": 6.803867403314917, + "grad_norm": 0.20490090548992157, + "learning_rate": 2.448523536297407e-05, + "loss": 1.7723, + "step": 22167 + }, + { + "epoch": 6.804174340085942, + "grad_norm": 0.2298613339662552, + "learning_rate": 2.4480960811293648e-05, + "loss": 1.7644, + "step": 22168 + }, + { + "epoch": 6.804481276856968, + "grad_norm": 0.18560375273227692, + "learning_rate": 2.4476686511807306e-05, + "loss": 1.686, + "step": 22169 + }, + { + "epoch": 6.804788213627993, + "grad_norm": 0.24295780062675476, + "learning_rate": 2.4472412464557347e-05, + "loss": 1.7561, + "step": 22170 + }, + { + "epoch": 6.805095150399017, + "grad_norm": 0.1962144672870636, + "learning_rate": 2.4468138669585932e-05, + "loss": 1.7438, + "step": 22171 + }, + { + "epoch": 6.805402087170043, + "grad_norm": 0.21924439072608948, + "learning_rate": 2.4463865126935377e-05, + "loss": 1.7488, + "step": 22172 + }, + { + "epoch": 6.805709023941068, + "grad_norm": 0.1777856945991516, + "learning_rate": 2.4459591836647833e-05, + "loss": 1.6664, + "step": 22173 + }, + { + "epoch": 6.806015960712093, + "grad_norm": 0.24367454648017883, + "learning_rate": 2.4455318798765593e-05, + "loss": 1.7441, + "step": 22174 + }, + { + "epoch": 6.806322897483119, + "grad_norm": 0.2269427478313446, + "learning_rate": 2.4451046013330865e-05, + "loss": 1.7809, + "step": 22175 + }, 
+ { + "epoch": 6.806629834254144, + "grad_norm": 0.21986174583435059, + "learning_rate": 2.444677348038587e-05, + "loss": 1.7453, + "step": 22176 + }, + { + "epoch": 6.8069367710251685, + "grad_norm": 0.1773367077112198, + "learning_rate": 2.4442501199972862e-05, + "loss": 1.6927, + "step": 22177 + }, + { + "epoch": 6.807243707796194, + "grad_norm": 0.20545031130313873, + "learning_rate": 2.4438229172133997e-05, + "loss": 1.7782, + "step": 22178 + }, + { + "epoch": 6.807550644567219, + "grad_norm": 0.1997014880180359, + "learning_rate": 2.443395739691155e-05, + "loss": 1.7295, + "step": 22179 + }, + { + "epoch": 6.8078575813382445, + "grad_norm": 0.19634006917476654, + "learning_rate": 2.4429685874347723e-05, + "loss": 1.7017, + "step": 22180 + }, + { + "epoch": 6.80816451810927, + "grad_norm": 0.2007836550474167, + "learning_rate": 2.442541460448473e-05, + "loss": 1.7252, + "step": 22181 + }, + { + "epoch": 6.808471454880294, + "grad_norm": 0.22204343974590302, + "learning_rate": 2.4421143587364775e-05, + "loss": 1.7526, + "step": 22182 + }, + { + "epoch": 6.80877839165132, + "grad_norm": 0.1906677633523941, + "learning_rate": 2.4416872823030073e-05, + "loss": 1.7121, + "step": 22183 + }, + { + "epoch": 6.809085328422345, + "grad_norm": 0.17165397107601166, + "learning_rate": 2.441260231152283e-05, + "loss": 1.6942, + "step": 22184 + }, + { + "epoch": 6.80939226519337, + "grad_norm": 0.17022575438022614, + "learning_rate": 2.4408332052885246e-05, + "loss": 1.6973, + "step": 22185 + }, + { + "epoch": 6.809699201964396, + "grad_norm": 0.16693587601184845, + "learning_rate": 2.4404062047159503e-05, + "loss": 1.6996, + "step": 22186 + }, + { + "epoch": 6.810006138735421, + "grad_norm": 0.2251187264919281, + "learning_rate": 2.4399792294387864e-05, + "loss": 1.778, + "step": 22187 + }, + { + "epoch": 6.810313075506445, + "grad_norm": 0.20622244477272034, + "learning_rate": 2.439552279461244e-05, + "loss": 1.7273, + "step": 22188 + }, + { + "epoch": 6.810620012277471, + 
"grad_norm": 0.19736994802951813, + "learning_rate": 2.439125354787551e-05, + "loss": 1.7096, + "step": 22189 + }, + { + "epoch": 6.810926949048496, + "grad_norm": 0.22955237329006195, + "learning_rate": 2.4386984554219182e-05, + "loss": 1.7859, + "step": 22190 + }, + { + "epoch": 6.811233885819521, + "grad_norm": 0.2283364087343216, + "learning_rate": 2.43827158136857e-05, + "loss": 1.6999, + "step": 22191 + }, + { + "epoch": 6.811540822590547, + "grad_norm": 0.18393704295158386, + "learning_rate": 2.4378447326317243e-05, + "loss": 1.654, + "step": 22192 + }, + { + "epoch": 6.811847759361571, + "grad_norm": 0.2031537890434265, + "learning_rate": 2.4374179092155986e-05, + "loss": 1.7353, + "step": 22193 + }, + { + "epoch": 6.8121546961325965, + "grad_norm": 0.1849071979522705, + "learning_rate": 2.4369911111244125e-05, + "loss": 1.7157, + "step": 22194 + }, + { + "epoch": 6.812461632903622, + "grad_norm": 0.20584192872047424, + "learning_rate": 2.4365643383623787e-05, + "loss": 1.7529, + "step": 22195 + }, + { + "epoch": 6.812768569674647, + "grad_norm": 0.24152903258800507, + "learning_rate": 2.436137590933721e-05, + "loss": 1.7662, + "step": 22196 + }, + { + "epoch": 6.8130755064456725, + "grad_norm": 0.26625362038612366, + "learning_rate": 2.4357108688426532e-05, + "loss": 1.7624, + "step": 22197 + }, + { + "epoch": 6.813382443216698, + "grad_norm": 0.27122190594673157, + "learning_rate": 2.435284172093395e-05, + "loss": 1.747, + "step": 22198 + }, + { + "epoch": 6.813689379987722, + "grad_norm": 0.18996810913085938, + "learning_rate": 2.434857500690161e-05, + "loss": 1.7377, + "step": 22199 + }, + { + "epoch": 6.813996316758748, + "grad_norm": 0.22355122864246368, + "learning_rate": 2.4344308546371686e-05, + "loss": 1.6865, + "step": 22200 + }, + { + "epoch": 6.814303253529773, + "grad_norm": 0.18468965590000153, + "learning_rate": 2.4340042339386348e-05, + "loss": 1.7091, + "step": 22201 + }, + { + "epoch": 6.814610190300798, + "grad_norm": 
0.25356602668762207, + "learning_rate": 2.4335776385987747e-05, + "loss": 1.7482, + "step": 22202 + }, + { + "epoch": 6.814917127071823, + "grad_norm": 0.22462932765483856, + "learning_rate": 2.433151068621803e-05, + "loss": 1.6985, + "step": 22203 + }, + { + "epoch": 6.815224063842848, + "grad_norm": 0.2540687024593353, + "learning_rate": 2.43272452401194e-05, + "loss": 1.7878, + "step": 22204 + }, + { + "epoch": 6.815531000613873, + "grad_norm": 0.267811119556427, + "learning_rate": 2.432298004773395e-05, + "loss": 1.7862, + "step": 22205 + }, + { + "epoch": 6.815837937384899, + "grad_norm": 0.23089277744293213, + "learning_rate": 2.4318715109103894e-05, + "loss": 1.6892, + "step": 22206 + }, + { + "epoch": 6.816144874155924, + "grad_norm": 0.22740885615348816, + "learning_rate": 2.431445042427131e-05, + "loss": 1.6934, + "step": 22207 + }, + { + "epoch": 6.816451810926949, + "grad_norm": 0.18555034697055817, + "learning_rate": 2.4310185993278405e-05, + "loss": 1.6747, + "step": 22208 + }, + { + "epoch": 6.816758747697974, + "grad_norm": 0.23693101108074188, + "learning_rate": 2.430592181616729e-05, + "loss": 1.7212, + "step": 22209 + }, + { + "epoch": 6.817065684468999, + "grad_norm": 0.20551325380802155, + "learning_rate": 2.4301657892980128e-05, + "loss": 1.711, + "step": 22210 + }, + { + "epoch": 6.8173726212400245, + "grad_norm": 0.20047837495803833, + "learning_rate": 2.4297394223759056e-05, + "loss": 1.729, + "step": 22211 + }, + { + "epoch": 6.81767955801105, + "grad_norm": 0.22111602127552032, + "learning_rate": 2.4293130808546167e-05, + "loss": 1.706, + "step": 22212 + }, + { + "epoch": 6.817986494782075, + "grad_norm": 0.18199655413627625, + "learning_rate": 2.428886764738364e-05, + "loss": 1.7082, + "step": 22213 + }, + { + "epoch": 6.8182934315531, + "grad_norm": 0.18591821193695068, + "learning_rate": 2.4284604740313595e-05, + "loss": 1.6957, + "step": 22214 + }, + { + "epoch": 6.818600368324125, + "grad_norm": 0.19427789747714996, + 
"learning_rate": 2.4280342087378154e-05, + "loss": 1.7396, + "step": 22215 + }, + { + "epoch": 6.81890730509515, + "grad_norm": 0.233908548951149, + "learning_rate": 2.427607968861945e-05, + "loss": 1.741, + "step": 22216 + }, + { + "epoch": 6.819214241866176, + "grad_norm": 0.168926402926445, + "learning_rate": 2.4271817544079606e-05, + "loss": 1.7023, + "step": 22217 + }, + { + "epoch": 6.819521178637201, + "grad_norm": 0.34345322847366333, + "learning_rate": 2.426755565380074e-05, + "loss": 1.7201, + "step": 22218 + }, + { + "epoch": 6.819828115408226, + "grad_norm": 0.21531274914741516, + "learning_rate": 2.4263294017824974e-05, + "loss": 1.725, + "step": 22219 + }, + { + "epoch": 6.820135052179251, + "grad_norm": 0.25251755118370056, + "learning_rate": 2.4259032636194395e-05, + "loss": 1.6764, + "step": 22220 + }, + { + "epoch": 6.820441988950276, + "grad_norm": 0.246616929769516, + "learning_rate": 2.4254771508951186e-05, + "loss": 1.7971, + "step": 22221 + }, + { + "epoch": 6.820748925721301, + "grad_norm": 0.20998120307922363, + "learning_rate": 2.4250510636137375e-05, + "loss": 1.723, + "step": 22222 + }, + { + "epoch": 6.821055862492327, + "grad_norm": 0.28388240933418274, + "learning_rate": 2.4246250017795148e-05, + "loss": 1.7508, + "step": 22223 + }, + { + "epoch": 6.821362799263352, + "grad_norm": 0.18146218359470367, + "learning_rate": 2.4241989653966535e-05, + "loss": 1.7254, + "step": 22224 + }, + { + "epoch": 6.8216697360343765, + "grad_norm": 0.2384043037891388, + "learning_rate": 2.4237729544693694e-05, + "loss": 1.7624, + "step": 22225 + }, + { + "epoch": 6.821976672805402, + "grad_norm": 0.21908332407474518, + "learning_rate": 2.4233469690018714e-05, + "loss": 1.7595, + "step": 22226 + }, + { + "epoch": 6.822283609576427, + "grad_norm": 0.20963989198207855, + "learning_rate": 2.422921008998369e-05, + "loss": 1.6679, + "step": 22227 + }, + { + "epoch": 6.8225905463474525, + "grad_norm": 0.21045777201652527, + "learning_rate": 
2.4224950744630732e-05, + "loss": 1.657, + "step": 22228 + }, + { + "epoch": 6.822897483118478, + "grad_norm": 0.21567417681217194, + "learning_rate": 2.4220691654001883e-05, + "loss": 1.7788, + "step": 22229 + }, + { + "epoch": 6.823204419889503, + "grad_norm": 0.2908889055252075, + "learning_rate": 2.4216432818139283e-05, + "loss": 1.7633, + "step": 22230 + }, + { + "epoch": 6.823511356660528, + "grad_norm": 0.22683843970298767, + "learning_rate": 2.4212174237085007e-05, + "loss": 1.7974, + "step": 22231 + }, + { + "epoch": 6.823818293431553, + "grad_norm": 0.25254085659980774, + "learning_rate": 2.420791591088114e-05, + "loss": 1.6871, + "step": 22232 + }, + { + "epoch": 6.824125230202578, + "grad_norm": 0.1804734766483307, + "learning_rate": 2.420365783956977e-05, + "loss": 1.7331, + "step": 22233 + }, + { + "epoch": 6.824432166973604, + "grad_norm": 0.21634186804294586, + "learning_rate": 2.419940002319297e-05, + "loss": 1.6641, + "step": 22234 + }, + { + "epoch": 6.824739103744628, + "grad_norm": 0.1941644847393036, + "learning_rate": 2.4195142461792818e-05, + "loss": 1.7198, + "step": 22235 + }, + { + "epoch": 6.8250460405156534, + "grad_norm": 0.20209947228431702, + "learning_rate": 2.4190885155411398e-05, + "loss": 1.7137, + "step": 22236 + }, + { + "epoch": 6.825352977286679, + "grad_norm": 0.17161925137043, + "learning_rate": 2.4186628104090757e-05, + "loss": 1.7059, + "step": 22237 + }, + { + "epoch": 6.825659914057704, + "grad_norm": 0.19352135062217712, + "learning_rate": 2.4182371307873025e-05, + "loss": 1.6699, + "step": 22238 + }, + { + "epoch": 6.8259668508287294, + "grad_norm": 0.20384716987609863, + "learning_rate": 2.417811476680019e-05, + "loss": 1.7167, + "step": 22239 + }, + { + "epoch": 6.826273787599755, + "grad_norm": 0.22764970362186432, + "learning_rate": 2.4173858480914402e-05, + "loss": 1.7085, + "step": 22240 + }, + { + "epoch": 6.82658072437078, + "grad_norm": 0.1988842487335205, + "learning_rate": 2.4169602450257645e-05, + "loss": 
1.7458, + "step": 22241 + }, + { + "epoch": 6.826887661141805, + "grad_norm": 0.20511481165885925, + "learning_rate": 2.416534667487203e-05, + "loss": 1.7597, + "step": 22242 + }, + { + "epoch": 6.82719459791283, + "grad_norm": 0.20906902849674225, + "learning_rate": 2.4161091154799608e-05, + "loss": 1.7418, + "step": 22243 + }, + { + "epoch": 6.827501534683855, + "grad_norm": 0.22555884718894958, + "learning_rate": 2.4156835890082426e-05, + "loss": 1.8198, + "step": 22244 + }, + { + "epoch": 6.827808471454881, + "grad_norm": 0.25855058431625366, + "learning_rate": 2.4152580880762553e-05, + "loss": 1.7588, + "step": 22245 + }, + { + "epoch": 6.828115408225905, + "grad_norm": 0.16975226998329163, + "learning_rate": 2.4148326126881993e-05, + "loss": 1.6897, + "step": 22246 + }, + { + "epoch": 6.82842234499693, + "grad_norm": 0.2336781919002533, + "learning_rate": 2.414407162848284e-05, + "loss": 1.7412, + "step": 22247 + }, + { + "epoch": 6.828729281767956, + "grad_norm": 0.1660032868385315, + "learning_rate": 2.4139817385607126e-05, + "loss": 1.6221, + "step": 22248 + }, + { + "epoch": 6.829036218538981, + "grad_norm": 0.22926606237888336, + "learning_rate": 2.41355633982969e-05, + "loss": 1.7201, + "step": 22249 + }, + { + "epoch": 6.829343155310006, + "grad_norm": 0.1759374737739563, + "learning_rate": 2.4131309666594193e-05, + "loss": 1.6842, + "step": 22250 + }, + { + "epoch": 6.829650092081032, + "grad_norm": 0.23005764186382294, + "learning_rate": 2.4127056190541042e-05, + "loss": 1.7327, + "step": 22251 + }, + { + "epoch": 6.829957028852056, + "grad_norm": 0.2216579169034958, + "learning_rate": 2.412280297017949e-05, + "loss": 1.7856, + "step": 22252 + }, + { + "epoch": 6.8302639656230815, + "grad_norm": 0.22133000195026398, + "learning_rate": 2.4118550005551565e-05, + "loss": 1.7711, + "step": 22253 + }, + { + "epoch": 6.830570902394107, + "grad_norm": 0.21860742568969727, + "learning_rate": 2.41142972966993e-05, + "loss": 1.7276, + "step": 22254 + }, + { + 
"epoch": 6.830877839165132, + "grad_norm": 0.2484082579612732, + "learning_rate": 2.4110044843664726e-05, + "loss": 1.7038, + "step": 22255 + }, + { + "epoch": 6.8311847759361575, + "grad_norm": 0.22288921475410461, + "learning_rate": 2.410579264648984e-05, + "loss": 1.7149, + "step": 22256 + }, + { + "epoch": 6.831491712707182, + "grad_norm": 0.23635484278202057, + "learning_rate": 2.4101540705216724e-05, + "loss": 1.7296, + "step": 22257 + }, + { + "epoch": 6.831798649478207, + "grad_norm": 0.24334096908569336, + "learning_rate": 2.4097289019887324e-05, + "loss": 1.7458, + "step": 22258 + }, + { + "epoch": 6.832105586249233, + "grad_norm": 0.23019789159297943, + "learning_rate": 2.4093037590543716e-05, + "loss": 1.7296, + "step": 22259 + }, + { + "epoch": 6.832412523020258, + "grad_norm": 0.23739024996757507, + "learning_rate": 2.4088786417227895e-05, + "loss": 1.7844, + "step": 22260 + }, + { + "epoch": 6.832719459791283, + "grad_norm": 0.1969252973794937, + "learning_rate": 2.4084535499981873e-05, + "loss": 1.6692, + "step": 22261 + }, + { + "epoch": 6.833026396562309, + "grad_norm": 0.20111167430877686, + "learning_rate": 2.4080284838847682e-05, + "loss": 1.7813, + "step": 22262 + }, + { + "epoch": 6.833333333333333, + "grad_norm": 0.26112934947013855, + "learning_rate": 2.4076034433867268e-05, + "loss": 1.6852, + "step": 22263 + }, + { + "epoch": 6.833640270104358, + "grad_norm": 0.24244411289691925, + "learning_rate": 2.40717842850827e-05, + "loss": 1.7054, + "step": 22264 + }, + { + "epoch": 6.833947206875384, + "grad_norm": 0.22703053057193756, + "learning_rate": 2.406753439253595e-05, + "loss": 1.7655, + "step": 22265 + }, + { + "epoch": 6.834254143646409, + "grad_norm": 0.23935651779174805, + "learning_rate": 2.4063284756269027e-05, + "loss": 1.7462, + "step": 22266 + }, + { + "epoch": 6.834561080417434, + "grad_norm": 0.2169155478477478, + "learning_rate": 2.4059035376323928e-05, + "loss": 1.7059, + "step": 22267 + }, + { + "epoch": 6.834868017188459, + 
"grad_norm": 0.2045663446187973, + "learning_rate": 2.4054786252742645e-05, + "loss": 1.7166, + "step": 22268 + }, + { + "epoch": 6.835174953959484, + "grad_norm": 0.22796253859996796, + "learning_rate": 2.4050537385567172e-05, + "loss": 1.7361, + "step": 22269 + }, + { + "epoch": 6.8354818907305095, + "grad_norm": 0.20807915925979614, + "learning_rate": 2.4046288774839497e-05, + "loss": 1.7007, + "step": 22270 + }, + { + "epoch": 6.835788827501535, + "grad_norm": 0.22157903015613556, + "learning_rate": 2.4042040420601607e-05, + "loss": 1.7409, + "step": 22271 + }, + { + "epoch": 6.83609576427256, + "grad_norm": 0.21494148671627045, + "learning_rate": 2.4037792322895492e-05, + "loss": 1.7975, + "step": 22272 + }, + { + "epoch": 6.8364027010435855, + "grad_norm": 0.2275875061750412, + "learning_rate": 2.403354448176311e-05, + "loss": 1.6759, + "step": 22273 + }, + { + "epoch": 6.83670963781461, + "grad_norm": 0.21105073392391205, + "learning_rate": 2.4029296897246496e-05, + "loss": 1.7229, + "step": 22274 + }, + { + "epoch": 6.837016574585635, + "grad_norm": 0.21957579255104065, + "learning_rate": 2.4025049569387553e-05, + "loss": 1.737, + "step": 22275 + }, + { + "epoch": 6.837323511356661, + "grad_norm": 0.2291470617055893, + "learning_rate": 2.4020802498228335e-05, + "loss": 1.6731, + "step": 22276 + }, + { + "epoch": 6.837630448127686, + "grad_norm": 0.18196065723896027, + "learning_rate": 2.401655568381074e-05, + "loss": 1.6823, + "step": 22277 + }, + { + "epoch": 6.83793738489871, + "grad_norm": 0.20915214717388153, + "learning_rate": 2.401230912617678e-05, + "loss": 1.7038, + "step": 22278 + }, + { + "epoch": 6.838244321669736, + "grad_norm": 0.2060854732990265, + "learning_rate": 2.4008062825368437e-05, + "loss": 1.7514, + "step": 22279 + }, + { + "epoch": 6.838551258440761, + "grad_norm": 0.20858527719974518, + "learning_rate": 2.400381678142762e-05, + "loss": 1.7494, + "step": 22280 + }, + { + "epoch": 6.838858195211786, + "grad_norm": 0.19124718010425568, 
+ "learning_rate": 2.3999570994396352e-05, + "loss": 1.7641, + "step": 22281 + }, + { + "epoch": 6.839165131982812, + "grad_norm": 0.28222304582595825, + "learning_rate": 2.3995325464316525e-05, + "loss": 1.7204, + "step": 22282 + }, + { + "epoch": 6.839472068753837, + "grad_norm": 0.20047026872634888, + "learning_rate": 2.399108019123016e-05, + "loss": 1.7261, + "step": 22283 + }, + { + "epoch": 6.8397790055248615, + "grad_norm": 0.2758225202560425, + "learning_rate": 2.3986835175179178e-05, + "loss": 1.6903, + "step": 22284 + }, + { + "epoch": 6.840085942295887, + "grad_norm": 0.2719727158546448, + "learning_rate": 2.3982590416205535e-05, + "loss": 1.8716, + "step": 22285 + }, + { + "epoch": 6.840392879066912, + "grad_norm": 0.3524060845375061, + "learning_rate": 2.3978345914351193e-05, + "loss": 1.7778, + "step": 22286 + }, + { + "epoch": 6.8406998158379375, + "grad_norm": 0.2711596190929413, + "learning_rate": 2.397410166965808e-05, + "loss": 1.7111, + "step": 22287 + }, + { + "epoch": 6.841006752608963, + "grad_norm": 0.2818336486816406, + "learning_rate": 2.396985768216815e-05, + "loss": 1.7292, + "step": 22288 + }, + { + "epoch": 6.841313689379987, + "grad_norm": 0.19677700102329254, + "learning_rate": 2.3965613951923343e-05, + "loss": 1.6975, + "step": 22289 + }, + { + "epoch": 6.841620626151013, + "grad_norm": 0.300997257232666, + "learning_rate": 2.3961370478965583e-05, + "loss": 1.7014, + "step": 22290 + }, + { + "epoch": 6.841927562922038, + "grad_norm": 0.23549453914165497, + "learning_rate": 2.395712726333686e-05, + "loss": 1.7052, + "step": 22291 + }, + { + "epoch": 6.842234499693063, + "grad_norm": 0.29898303747177124, + "learning_rate": 2.3952884305079026e-05, + "loss": 1.7828, + "step": 22292 + }, + { + "epoch": 6.842541436464089, + "grad_norm": 0.26108843088150024, + "learning_rate": 2.3948641604234096e-05, + "loss": 1.7023, + "step": 22293 + }, + { + "epoch": 6.842848373235114, + "grad_norm": 0.18781059980392456, + "learning_rate": 
2.394439916084392e-05, + "loss": 1.6808, + "step": 22294 + }, + { + "epoch": 6.843155310006138, + "grad_norm": 0.22659730911254883, + "learning_rate": 2.3940156974950485e-05, + "loss": 1.7224, + "step": 22295 + }, + { + "epoch": 6.843462246777164, + "grad_norm": 0.17422057688236237, + "learning_rate": 2.3935915046595713e-05, + "loss": 1.668, + "step": 22296 + }, + { + "epoch": 6.843769183548189, + "grad_norm": 0.2008846402168274, + "learning_rate": 2.393167337582146e-05, + "loss": 1.7283, + "step": 22297 + }, + { + "epoch": 6.844076120319214, + "grad_norm": 0.20376072824001312, + "learning_rate": 2.392743196266973e-05, + "loss": 1.74, + "step": 22298 + }, + { + "epoch": 6.84438305709024, + "grad_norm": 0.16353756189346313, + "learning_rate": 2.3923190807182372e-05, + "loss": 1.717, + "step": 22299 + }, + { + "epoch": 6.844689993861264, + "grad_norm": 0.18436652421951294, + "learning_rate": 2.3918949909401335e-05, + "loss": 1.7257, + "step": 22300 + }, + { + "epoch": 6.8449969306322895, + "grad_norm": 0.2038460522890091, + "learning_rate": 2.3914709269368523e-05, + "loss": 1.7254, + "step": 22301 + }, + { + "epoch": 6.845303867403315, + "grad_norm": 0.17111587524414062, + "learning_rate": 2.3910468887125842e-05, + "loss": 1.6993, + "step": 22302 + }, + { + "epoch": 6.84561080417434, + "grad_norm": 0.20049406588077545, + "learning_rate": 2.3906228762715207e-05, + "loss": 1.7099, + "step": 22303 + }, + { + "epoch": 6.8459177409453655, + "grad_norm": 0.2168554663658142, + "learning_rate": 2.39019888961785e-05, + "loss": 1.725, + "step": 22304 + }, + { + "epoch": 6.846224677716391, + "grad_norm": 0.2228514850139618, + "learning_rate": 2.3897749287557647e-05, + "loss": 1.7348, + "step": 22305 + }, + { + "epoch": 6.846531614487415, + "grad_norm": 0.17166151106357574, + "learning_rate": 2.3893509936894532e-05, + "loss": 1.7451, + "step": 22306 + }, + { + "epoch": 6.846838551258441, + "grad_norm": 0.24896936118602753, + "learning_rate": 2.3889270844231026e-05, + "loss": 
1.7397, + "step": 22307 + }, + { + "epoch": 6.847145488029466, + "grad_norm": 0.1984332948923111, + "learning_rate": 2.3885032009609098e-05, + "loss": 1.7167, + "step": 22308 + }, + { + "epoch": 6.847452424800491, + "grad_norm": 0.20763449370861053, + "learning_rate": 2.388079343307055e-05, + "loss": 1.7154, + "step": 22309 + }, + { + "epoch": 6.847759361571516, + "grad_norm": 0.21818630397319794, + "learning_rate": 2.3876555114657346e-05, + "loss": 1.7364, + "step": 22310 + }, + { + "epoch": 6.848066298342541, + "grad_norm": 0.21220166981220245, + "learning_rate": 2.3872317054411298e-05, + "loss": 1.74, + "step": 22311 + }, + { + "epoch": 6.848373235113566, + "grad_norm": 0.17486892640590668, + "learning_rate": 2.3868079252374343e-05, + "loss": 1.68, + "step": 22312 + }, + { + "epoch": 6.848680171884592, + "grad_norm": 0.20809298753738403, + "learning_rate": 2.386384170858837e-05, + "loss": 1.8102, + "step": 22313 + }, + { + "epoch": 6.848987108655617, + "grad_norm": 0.19927671551704407, + "learning_rate": 2.385960442309519e-05, + "loss": 1.7742, + "step": 22314 + }, + { + "epoch": 6.849294045426642, + "grad_norm": 0.18705040216445923, + "learning_rate": 2.3855367395936757e-05, + "loss": 1.689, + "step": 22315 + }, + { + "epoch": 6.849600982197668, + "grad_norm": 0.22023466229438782, + "learning_rate": 2.385113062715487e-05, + "loss": 1.7819, + "step": 22316 + }, + { + "epoch": 6.849907918968692, + "grad_norm": 0.24443435668945312, + "learning_rate": 2.384689411679146e-05, + "loss": 1.6533, + "step": 22317 + }, + { + "epoch": 6.850214855739718, + "grad_norm": 0.20103834569454193, + "learning_rate": 2.3842657864888368e-05, + "loss": 1.7274, + "step": 22318 + }, + { + "epoch": 6.850521792510743, + "grad_norm": 0.2265254408121109, + "learning_rate": 2.3838421871487465e-05, + "loss": 1.7874, + "step": 22319 + }, + { + "epoch": 6.850828729281768, + "grad_norm": 0.2775460183620453, + "learning_rate": 2.383418613663061e-05, + "loss": 1.8038, + "step": 22320 + }, + { + 
"epoch": 6.851135666052793, + "grad_norm": 0.2001011073589325, + "learning_rate": 2.3829950660359663e-05, + "loss": 1.7135, + "step": 22321 + }, + { + "epoch": 6.851442602823818, + "grad_norm": 0.21427330374717712, + "learning_rate": 2.382571544271648e-05, + "loss": 1.7155, + "step": 22322 + }, + { + "epoch": 6.851749539594843, + "grad_norm": 0.18420884013175964, + "learning_rate": 2.382148048374292e-05, + "loss": 1.7178, + "step": 22323 + }, + { + "epoch": 6.852056476365869, + "grad_norm": 0.19436471164226532, + "learning_rate": 2.3817245783480813e-05, + "loss": 1.7396, + "step": 22324 + }, + { + "epoch": 6.852363413136894, + "grad_norm": 0.23191674053668976, + "learning_rate": 2.381301134197207e-05, + "loss": 1.7102, + "step": 22325 + }, + { + "epoch": 6.852670349907919, + "grad_norm": 0.20381706953048706, + "learning_rate": 2.3808777159258462e-05, + "loss": 1.7671, + "step": 22326 + }, + { + "epoch": 6.852977286678944, + "grad_norm": 0.20202197134494781, + "learning_rate": 2.3804543235381897e-05, + "loss": 1.6774, + "step": 22327 + }, + { + "epoch": 6.853284223449969, + "grad_norm": 0.23496322333812714, + "learning_rate": 2.380030957038416e-05, + "loss": 1.7745, + "step": 22328 + }, + { + "epoch": 6.8535911602209945, + "grad_norm": 0.22473813593387604, + "learning_rate": 2.379607616430714e-05, + "loss": 1.7319, + "step": 22329 + }, + { + "epoch": 6.85389809699202, + "grad_norm": 0.2149224430322647, + "learning_rate": 2.3791843017192667e-05, + "loss": 1.77, + "step": 22330 + }, + { + "epoch": 6.854205033763045, + "grad_norm": 0.21146108210086823, + "learning_rate": 2.378761012908253e-05, + "loss": 1.762, + "step": 22331 + }, + { + "epoch": 6.85451197053407, + "grad_norm": 0.2031458169221878, + "learning_rate": 2.3783377500018626e-05, + "loss": 1.7007, + "step": 22332 + }, + { + "epoch": 6.854818907305095, + "grad_norm": 0.19763319194316864, + "learning_rate": 2.377914513004272e-05, + "loss": 1.6899, + "step": 22333 + }, + { + "epoch": 6.85512584407612, + 
"grad_norm": 0.17337046563625336, + "learning_rate": 2.3774913019196688e-05, + "loss": 1.683, + "step": 22334 + }, + { + "epoch": 6.855432780847146, + "grad_norm": 0.1850815862417221, + "learning_rate": 2.3770681167522328e-05, + "loss": 1.7284, + "step": 22335 + }, + { + "epoch": 6.855739717618171, + "grad_norm": 0.19693362712860107, + "learning_rate": 2.3766449575061477e-05, + "loss": 1.7694, + "step": 22336 + }, + { + "epoch": 6.856046654389196, + "grad_norm": 0.1981547325849533, + "learning_rate": 2.376221824185595e-05, + "loss": 1.736, + "step": 22337 + }, + { + "epoch": 6.856353591160221, + "grad_norm": 0.17638558149337769, + "learning_rate": 2.375798716794756e-05, + "loss": 1.6979, + "step": 22338 + }, + { + "epoch": 6.856660527931246, + "grad_norm": 0.20189990103244781, + "learning_rate": 2.3753756353378116e-05, + "loss": 1.7876, + "step": 22339 + }, + { + "epoch": 6.856967464702271, + "grad_norm": 0.1880224347114563, + "learning_rate": 2.3749525798189438e-05, + "loss": 1.7134, + "step": 22340 + }, + { + "epoch": 6.857274401473297, + "grad_norm": 0.2464265078306198, + "learning_rate": 2.3745295502423316e-05, + "loss": 1.7782, + "step": 22341 + }, + { + "epoch": 6.857581338244322, + "grad_norm": 0.19218963384628296, + "learning_rate": 2.3741065466121604e-05, + "loss": 1.7027, + "step": 22342 + }, + { + "epoch": 6.8578882750153465, + "grad_norm": 0.27446448802948, + "learning_rate": 2.3736835689326043e-05, + "loss": 1.772, + "step": 22343 + }, + { + "epoch": 6.858195211786372, + "grad_norm": 0.19315828382968903, + "learning_rate": 2.3732606172078497e-05, + "loss": 1.6855, + "step": 22344 + }, + { + "epoch": 6.858502148557397, + "grad_norm": 0.2668892741203308, + "learning_rate": 2.372837691442072e-05, + "loss": 1.7703, + "step": 22345 + }, + { + "epoch": 6.8588090853284225, + "grad_norm": 0.23552054166793823, + "learning_rate": 2.3724147916394497e-05, + "loss": 1.7184, + "step": 22346 + }, + { + "epoch": 6.859116022099448, + "grad_norm": 0.3194984793663025, + 
"learning_rate": 2.3719919178041682e-05, + "loss": 1.7531, + "step": 22347 + }, + { + "epoch": 6.859422958870473, + "grad_norm": 0.19298717379570007, + "learning_rate": 2.371569069940399e-05, + "loss": 1.7064, + "step": 22348 + }, + { + "epoch": 6.859729895641498, + "grad_norm": 0.2990693151950836, + "learning_rate": 2.3711462480523293e-05, + "loss": 1.7434, + "step": 22349 + }, + { + "epoch": 6.860036832412523, + "grad_norm": 0.1976640820503235, + "learning_rate": 2.370723452144129e-05, + "loss": 1.6881, + "step": 22350 + }, + { + "epoch": 6.860343769183548, + "grad_norm": 0.24306917190551758, + "learning_rate": 2.3703006822199825e-05, + "loss": 1.7791, + "step": 22351 + }, + { + "epoch": 6.860650705954574, + "grad_norm": 0.20065687596797943, + "learning_rate": 2.3698779382840657e-05, + "loss": 1.7162, + "step": 22352 + }, + { + "epoch": 6.860957642725598, + "grad_norm": 0.21599936485290527, + "learning_rate": 2.3694552203405574e-05, + "loss": 1.7702, + "step": 22353 + }, + { + "epoch": 6.861264579496623, + "grad_norm": 0.16836890578269958, + "learning_rate": 2.3690325283936338e-05, + "loss": 1.6676, + "step": 22354 + }, + { + "epoch": 6.861571516267649, + "grad_norm": 0.1756831407546997, + "learning_rate": 2.368609862447473e-05, + "loss": 1.6934, + "step": 22355 + }, + { + "epoch": 6.861878453038674, + "grad_norm": 0.18676789104938507, + "learning_rate": 2.3681872225062517e-05, + "loss": 1.6879, + "step": 22356 + }, + { + "epoch": 6.862185389809699, + "grad_norm": 0.18018634617328644, + "learning_rate": 2.3677646085741473e-05, + "loss": 1.7143, + "step": 22357 + }, + { + "epoch": 6.862492326580725, + "grad_norm": 0.1789008378982544, + "learning_rate": 2.3673420206553332e-05, + "loss": 1.6914, + "step": 22358 + }, + { + "epoch": 6.862799263351749, + "grad_norm": 0.1869693398475647, + "learning_rate": 2.366919458753993e-05, + "loss": 1.7431, + "step": 22359 + }, + { + "epoch": 6.8631062001227745, + "grad_norm": 0.1958019733428955, + "learning_rate": 
2.3664969228742934e-05, + "loss": 1.7132, + "step": 22360 + }, + { + "epoch": 6.8634131368938, + "grad_norm": 0.199384868144989, + "learning_rate": 2.366074413020419e-05, + "loss": 1.7095, + "step": 22361 + }, + { + "epoch": 6.863720073664825, + "grad_norm": 0.2125246673822403, + "learning_rate": 2.365651929196539e-05, + "loss": 1.7125, + "step": 22362 + }, + { + "epoch": 6.8640270104358505, + "grad_norm": 0.1574707180261612, + "learning_rate": 2.3652294714068284e-05, + "loss": 1.6386, + "step": 22363 + }, + { + "epoch": 6.864333947206875, + "grad_norm": 0.30648529529571533, + "learning_rate": 2.364807039655469e-05, + "loss": 1.7665, + "step": 22364 + }, + { + "epoch": 6.8646408839779, + "grad_norm": 0.19746489822864532, + "learning_rate": 2.364384633946627e-05, + "loss": 1.6736, + "step": 22365 + }, + { + "epoch": 6.864947820748926, + "grad_norm": 0.25084391236305237, + "learning_rate": 2.3639622542844842e-05, + "loss": 1.7346, + "step": 22366 + }, + { + "epoch": 6.865254757519951, + "grad_norm": 0.1884133219718933, + "learning_rate": 2.3635399006732077e-05, + "loss": 1.6868, + "step": 22367 + }, + { + "epoch": 6.865561694290976, + "grad_norm": 0.21225856244564056, + "learning_rate": 2.3631175731169774e-05, + "loss": 1.7438, + "step": 22368 + }, + { + "epoch": 6.865868631062002, + "grad_norm": 0.1863771378993988, + "learning_rate": 2.3626952716199647e-05, + "loss": 1.7677, + "step": 22369 + }, + { + "epoch": 6.866175567833026, + "grad_norm": 0.1839088648557663, + "learning_rate": 2.362272996186343e-05, + "loss": 1.6902, + "step": 22370 + }, + { + "epoch": 6.866482504604051, + "grad_norm": 0.18304915726184845, + "learning_rate": 2.3618507468202856e-05, + "loss": 1.7142, + "step": 22371 + }, + { + "epoch": 6.866789441375077, + "grad_norm": 0.21228280663490295, + "learning_rate": 2.3614285235259655e-05, + "loss": 1.8277, + "step": 22372 + }, + { + "epoch": 6.867096378146102, + "grad_norm": 0.19515320658683777, + "learning_rate": 2.361006326307555e-05, + "loss": 
1.7029, + "step": 22373 + }, + { + "epoch": 6.867403314917127, + "grad_norm": 0.16277433931827545, + "learning_rate": 2.360584155169227e-05, + "loss": 1.672, + "step": 22374 + }, + { + "epoch": 6.867710251688152, + "grad_norm": 0.2180202454328537, + "learning_rate": 2.360162010115151e-05, + "loss": 1.7516, + "step": 22375 + }, + { + "epoch": 6.868017188459177, + "grad_norm": 0.17940378189086914, + "learning_rate": 2.3597398911495055e-05, + "loss": 1.6782, + "step": 22376 + }, + { + "epoch": 6.8683241252302025, + "grad_norm": 0.20751933753490448, + "learning_rate": 2.3593177982764543e-05, + "loss": 1.7954, + "step": 22377 + }, + { + "epoch": 6.868631062001228, + "grad_norm": 0.23098444938659668, + "learning_rate": 2.3588957315001758e-05, + "loss": 1.7472, + "step": 22378 + }, + { + "epoch": 6.868937998772253, + "grad_norm": 0.2351236343383789, + "learning_rate": 2.358473690824836e-05, + "loss": 1.7959, + "step": 22379 + }, + { + "epoch": 6.8692449355432785, + "grad_norm": 0.1890626847743988, + "learning_rate": 2.3580516762546055e-05, + "loss": 1.7015, + "step": 22380 + }, + { + "epoch": 6.869551872314303, + "grad_norm": 0.21120475232601166, + "learning_rate": 2.3576296877936604e-05, + "loss": 1.7998, + "step": 22381 + }, + { + "epoch": 6.869858809085328, + "grad_norm": 0.18141280114650726, + "learning_rate": 2.3572077254461638e-05, + "loss": 1.6973, + "step": 22382 + }, + { + "epoch": 6.870165745856354, + "grad_norm": 0.19084444642066956, + "learning_rate": 2.356785789216293e-05, + "loss": 1.6853, + "step": 22383 + }, + { + "epoch": 6.870472682627379, + "grad_norm": 0.18046700954437256, + "learning_rate": 2.356363879108211e-05, + "loss": 1.7476, + "step": 22384 + }, + { + "epoch": 6.870779619398404, + "grad_norm": 0.19875061511993408, + "learning_rate": 2.3559419951260926e-05, + "loss": 1.7223, + "step": 22385 + }, + { + "epoch": 6.871086556169429, + "grad_norm": 0.2377827763557434, + "learning_rate": 2.3555201372741047e-05, + "loss": 1.7976, + "step": 22386 + }, + 
{ + "epoch": 6.871393492940454, + "grad_norm": 0.17645993828773499, + "learning_rate": 2.3550983055564168e-05, + "loss": 1.6726, + "step": 22387 + }, + { + "epoch": 6.871700429711479, + "grad_norm": 0.19499735534191132, + "learning_rate": 2.3546764999771976e-05, + "loss": 1.67, + "step": 22388 + }, + { + "epoch": 6.872007366482505, + "grad_norm": 0.22010546922683716, + "learning_rate": 2.3542547205406163e-05, + "loss": 1.8461, + "step": 22389 + }, + { + "epoch": 6.87231430325353, + "grad_norm": 0.2101692259311676, + "learning_rate": 2.3538329672508396e-05, + "loss": 1.6922, + "step": 22390 + }, + { + "epoch": 6.872621240024555, + "grad_norm": 0.1926269382238388, + "learning_rate": 2.3534112401120372e-05, + "loss": 1.6934, + "step": 22391 + }, + { + "epoch": 6.87292817679558, + "grad_norm": 0.20662687718868256, + "learning_rate": 2.3529895391283742e-05, + "loss": 1.7284, + "step": 22392 + }, + { + "epoch": 6.873235113566605, + "grad_norm": 0.2392960786819458, + "learning_rate": 2.3525678643040235e-05, + "loss": 1.7207, + "step": 22393 + }, + { + "epoch": 6.8735420503376305, + "grad_norm": 0.2067870795726776, + "learning_rate": 2.3521462156431452e-05, + "loss": 1.7269, + "step": 22394 + }, + { + "epoch": 6.873848987108656, + "grad_norm": 0.2544265687465668, + "learning_rate": 2.351724593149914e-05, + "loss": 1.7358, + "step": 22395 + }, + { + "epoch": 6.87415592387968, + "grad_norm": 0.2243366837501526, + "learning_rate": 2.3513029968284907e-05, + "loss": 1.7625, + "step": 22396 + }, + { + "epoch": 6.874462860650706, + "grad_norm": 0.23003467917442322, + "learning_rate": 2.3508814266830414e-05, + "loss": 1.6943, + "step": 22397 + }, + { + "epoch": 6.874769797421731, + "grad_norm": 0.19257886707782745, + "learning_rate": 2.3504598827177383e-05, + "loss": 1.7393, + "step": 22398 + }, + { + "epoch": 6.875076734192756, + "grad_norm": 0.23782171308994293, + "learning_rate": 2.3500383649367404e-05, + "loss": 1.7758, + "step": 22399 + }, + { + "epoch": 6.875383670963782, + 
"grad_norm": 0.18137066066265106, + "learning_rate": 2.3496168733442197e-05, + "loss": 1.7083, + "step": 22400 + }, + { + "epoch": 6.875690607734807, + "grad_norm": 0.21970662474632263, + "learning_rate": 2.3491954079443344e-05, + "loss": 1.7552, + "step": 22401 + }, + { + "epoch": 6.8759975445058314, + "grad_norm": 0.2032134085893631, + "learning_rate": 2.3487739687412562e-05, + "loss": 1.7653, + "step": 22402 + }, + { + "epoch": 6.876304481276857, + "grad_norm": 0.22016118466854095, + "learning_rate": 2.348352555739148e-05, + "loss": 1.7277, + "step": 22403 + }, + { + "epoch": 6.876611418047882, + "grad_norm": 0.2250203788280487, + "learning_rate": 2.3479311689421736e-05, + "loss": 1.7451, + "step": 22404 + }, + { + "epoch": 6.8769183548189075, + "grad_norm": 0.19726359844207764, + "learning_rate": 2.3475098083544977e-05, + "loss": 1.728, + "step": 22405 + }, + { + "epoch": 6.877225291589933, + "grad_norm": 0.21295994520187378, + "learning_rate": 2.3470884739802844e-05, + "loss": 1.7438, + "step": 22406 + }, + { + "epoch": 6.877532228360957, + "grad_norm": 0.19653508067131042, + "learning_rate": 2.346667165823698e-05, + "loss": 1.7189, + "step": 22407 + }, + { + "epoch": 6.877839165131983, + "grad_norm": 0.21406517922878265, + "learning_rate": 2.3462458838889016e-05, + "loss": 1.7475, + "step": 22408 + }, + { + "epoch": 6.878146101903008, + "grad_norm": 0.20569753646850586, + "learning_rate": 2.3458246281800595e-05, + "loss": 1.7262, + "step": 22409 + }, + { + "epoch": 6.878453038674033, + "grad_norm": 0.19365517795085907, + "learning_rate": 2.3454033987013334e-05, + "loss": 1.6938, + "step": 22410 + }, + { + "epoch": 6.878759975445059, + "grad_norm": 0.20935405790805817, + "learning_rate": 2.344982195456885e-05, + "loss": 1.724, + "step": 22411 + }, + { + "epoch": 6.879066912216084, + "grad_norm": 0.2104228436946869, + "learning_rate": 2.3445610184508826e-05, + "loss": 1.7474, + "step": 22412 + }, + { + "epoch": 6.879373848987108, + "grad_norm": 
0.19795742630958557, + "learning_rate": 2.3441398676874826e-05, + "loss": 1.7572, + "step": 22413 + }, + { + "epoch": 6.879680785758134, + "grad_norm": 0.20640577375888824, + "learning_rate": 2.3437187431708472e-05, + "loss": 1.7258, + "step": 22414 + }, + { + "epoch": 6.879987722529159, + "grad_norm": 0.2092565894126892, + "learning_rate": 2.3432976449051442e-05, + "loss": 1.7437, + "step": 22415 + }, + { + "epoch": 6.880294659300184, + "grad_norm": 0.2083825170993805, + "learning_rate": 2.3428765728945275e-05, + "loss": 1.7127, + "step": 22416 + }, + { + "epoch": 6.88060159607121, + "grad_norm": 0.20619866251945496, + "learning_rate": 2.3424555271431647e-05, + "loss": 1.7729, + "step": 22417 + }, + { + "epoch": 6.880908532842234, + "grad_norm": 0.22689959406852722, + "learning_rate": 2.3420345076552107e-05, + "loss": 1.7142, + "step": 22418 + }, + { + "epoch": 6.8812154696132595, + "grad_norm": 0.16664449870586395, + "learning_rate": 2.3416135144348316e-05, + "loss": 1.6857, + "step": 22419 + }, + { + "epoch": 6.881522406384285, + "grad_norm": 0.1895827353000641, + "learning_rate": 2.3411925474861856e-05, + "loss": 1.7075, + "step": 22420 + }, + { + "epoch": 6.88182934315531, + "grad_norm": 0.2058400958776474, + "learning_rate": 2.3407716068134334e-05, + "loss": 1.7623, + "step": 22421 + }, + { + "epoch": 6.8821362799263355, + "grad_norm": 0.18390826880931854, + "learning_rate": 2.3403506924207346e-05, + "loss": 1.6686, + "step": 22422 + }, + { + "epoch": 6.882443216697361, + "grad_norm": 0.1742098331451416, + "learning_rate": 2.3399298043122497e-05, + "loss": 1.6846, + "step": 22423 + }, + { + "epoch": 6.882750153468385, + "grad_norm": 0.18958622217178345, + "learning_rate": 2.3395089424921368e-05, + "loss": 1.7603, + "step": 22424 + }, + { + "epoch": 6.883057090239411, + "grad_norm": 0.21827174723148346, + "learning_rate": 2.3390881069645564e-05, + "loss": 1.6706, + "step": 22425 + }, + { + "epoch": 6.883364027010436, + "grad_norm": 0.17859303951263428, + 
"learning_rate": 2.338667297733667e-05, + "loss": 1.7612, + "step": 22426 + }, + { + "epoch": 6.883670963781461, + "grad_norm": 0.22383756935596466, + "learning_rate": 2.338246514803627e-05, + "loss": 1.7507, + "step": 22427 + }, + { + "epoch": 6.883977900552486, + "grad_norm": 0.20317313075065613, + "learning_rate": 2.3378257581785934e-05, + "loss": 1.6912, + "step": 22428 + }, + { + "epoch": 6.884284837323511, + "grad_norm": 0.20238614082336426, + "learning_rate": 2.3374050278627297e-05, + "loss": 1.7336, + "step": 22429 + }, + { + "epoch": 6.884591774094536, + "grad_norm": 0.2134159654378891, + "learning_rate": 2.336984323860188e-05, + "loss": 1.7252, + "step": 22430 + }, + { + "epoch": 6.884898710865562, + "grad_norm": 0.17153076827526093, + "learning_rate": 2.3365636461751277e-05, + "loss": 1.6769, + "step": 22431 + }, + { + "epoch": 6.885205647636587, + "grad_norm": 0.19001254439353943, + "learning_rate": 2.3361429948117075e-05, + "loss": 1.7812, + "step": 22432 + }, + { + "epoch": 6.885512584407612, + "grad_norm": 0.2074522078037262, + "learning_rate": 2.335722369774081e-05, + "loss": 1.7433, + "step": 22433 + }, + { + "epoch": 6.885819521178637, + "grad_norm": 0.22863705456256866, + "learning_rate": 2.3353017710664117e-05, + "loss": 1.7476, + "step": 22434 + }, + { + "epoch": 6.886126457949662, + "grad_norm": 0.19350804388523102, + "learning_rate": 2.334881198692848e-05, + "loss": 1.7071, + "step": 22435 + }, + { + "epoch": 6.8864333947206875, + "grad_norm": 0.22915633022785187, + "learning_rate": 2.3344606526575524e-05, + "loss": 1.7283, + "step": 22436 + }, + { + "epoch": 6.886740331491713, + "grad_norm": 0.21576058864593506, + "learning_rate": 2.3340401329646795e-05, + "loss": 1.7062, + "step": 22437 + }, + { + "epoch": 6.887047268262738, + "grad_norm": 0.17844067513942719, + "learning_rate": 2.333619639618384e-05, + "loss": 1.6994, + "step": 22438 + }, + { + "epoch": 6.887354205033763, + "grad_norm": 0.21019738912582397, + "learning_rate": 
2.333199172622822e-05, + "loss": 1.6654, + "step": 22439 + }, + { + "epoch": 6.887661141804788, + "grad_norm": 0.1901654452085495, + "learning_rate": 2.3327787319821486e-05, + "loss": 1.7847, + "step": 22440 + }, + { + "epoch": 6.887968078575813, + "grad_norm": 0.21838930249214172, + "learning_rate": 2.3323583177005198e-05, + "loss": 1.6517, + "step": 22441 + }, + { + "epoch": 6.888275015346839, + "grad_norm": 0.16078172624111176, + "learning_rate": 2.3319379297820892e-05, + "loss": 1.7052, + "step": 22442 + }, + { + "epoch": 6.888581952117864, + "grad_norm": 0.19161897897720337, + "learning_rate": 2.331517568231012e-05, + "loss": 1.675, + "step": 22443 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 0.1874416172504425, + "learning_rate": 2.331097233051442e-05, + "loss": 1.7025, + "step": 22444 + }, + { + "epoch": 6.889195825659914, + "grad_norm": 0.1817546933889389, + "learning_rate": 2.3306769242475318e-05, + "loss": 1.7103, + "step": 22445 + }, + { + "epoch": 6.889502762430939, + "grad_norm": 0.18423372507095337, + "learning_rate": 2.3302566418234406e-05, + "loss": 1.6883, + "step": 22446 + }, + { + "epoch": 6.889809699201964, + "grad_norm": 0.1712140440940857, + "learning_rate": 2.3298363857833162e-05, + "loss": 1.7076, + "step": 22447 + }, + { + "epoch": 6.89011663597299, + "grad_norm": 0.15992864966392517, + "learning_rate": 2.3294161561313133e-05, + "loss": 1.6514, + "step": 22448 + }, + { + "epoch": 6.890423572744015, + "grad_norm": 0.24126072227954865, + "learning_rate": 2.3289959528715855e-05, + "loss": 1.7385, + "step": 22449 + }, + { + "epoch": 6.8907305095150395, + "grad_norm": 0.18130798637866974, + "learning_rate": 2.3285757760082832e-05, + "loss": 1.691, + "step": 22450 + }, + { + "epoch": 6.891037446286065, + "grad_norm": 0.20070049166679382, + "learning_rate": 2.3281556255455644e-05, + "loss": 1.7166, + "step": 22451 + }, + { + "epoch": 6.89134438305709, + "grad_norm": 0.20706996321678162, + "learning_rate": 2.327735501487574e-05, + "loss": 
1.6763, + "step": 22452 + }, + { + "epoch": 6.8916513198281155, + "grad_norm": 0.22404810786247253, + "learning_rate": 2.327315403838472e-05, + "loss": 1.761, + "step": 22453 + }, + { + "epoch": 6.891958256599141, + "grad_norm": 0.21240194141864777, + "learning_rate": 2.3268953326024013e-05, + "loss": 1.7038, + "step": 22454 + }, + { + "epoch": 6.892265193370166, + "grad_norm": 0.24251966178417206, + "learning_rate": 2.32647528778352e-05, + "loss": 1.7829, + "step": 22455 + }, + { + "epoch": 6.892572130141191, + "grad_norm": 0.21213467419147491, + "learning_rate": 2.3260552693859765e-05, + "loss": 1.7433, + "step": 22456 + }, + { + "epoch": 6.892879066912216, + "grad_norm": 0.18008530139923096, + "learning_rate": 2.325635277413922e-05, + "loss": 1.7238, + "step": 22457 + }, + { + "epoch": 6.893186003683241, + "grad_norm": 0.18252789974212646, + "learning_rate": 2.325215311871508e-05, + "loss": 1.7143, + "step": 22458 + }, + { + "epoch": 6.893492940454267, + "grad_norm": 0.17830567061901093, + "learning_rate": 2.3247953727628833e-05, + "loss": 1.687, + "step": 22459 + }, + { + "epoch": 6.893799877225292, + "grad_norm": 0.19980686902999878, + "learning_rate": 2.3243754600921992e-05, + "loss": 1.7096, + "step": 22460 + }, + { + "epoch": 6.894106813996316, + "grad_norm": 0.1713438183069229, + "learning_rate": 2.3239555738636044e-05, + "loss": 1.6791, + "step": 22461 + }, + { + "epoch": 6.894413750767342, + "grad_norm": 0.17678281664848328, + "learning_rate": 2.3235357140812475e-05, + "loss": 1.6689, + "step": 22462 + }, + { + "epoch": 6.894720687538367, + "grad_norm": 0.20409992337226868, + "learning_rate": 2.3231158807492837e-05, + "loss": 1.7746, + "step": 22463 + }, + { + "epoch": 6.895027624309392, + "grad_norm": 0.19227825105190277, + "learning_rate": 2.3226960738718552e-05, + "loss": 1.7101, + "step": 22464 + }, + { + "epoch": 6.895334561080418, + "grad_norm": 0.24029433727264404, + "learning_rate": 2.3222762934531132e-05, + "loss": 1.7842, + "step": 22465 + }, + 
{ + "epoch": 6.895641497851443, + "grad_norm": 0.21887856721878052, + "learning_rate": 2.321856539497207e-05, + "loss": 1.7032, + "step": 22466 + }, + { + "epoch": 6.8959484346224675, + "grad_norm": 0.17346082627773285, + "learning_rate": 2.321436812008282e-05, + "loss": 1.683, + "step": 22467 + }, + { + "epoch": 6.896255371393493, + "grad_norm": 0.18920177221298218, + "learning_rate": 2.3210171109904914e-05, + "loss": 1.7057, + "step": 22468 + }, + { + "epoch": 6.896562308164518, + "grad_norm": 0.21199388802051544, + "learning_rate": 2.320597436447977e-05, + "loss": 1.7534, + "step": 22469 + }, + { + "epoch": 6.8968692449355435, + "grad_norm": 0.1867530792951584, + "learning_rate": 2.320177788384893e-05, + "loss": 1.7185, + "step": 22470 + }, + { + "epoch": 6.897176181706568, + "grad_norm": 0.21009495854377747, + "learning_rate": 2.3197581668053785e-05, + "loss": 1.7379, + "step": 22471 + }, + { + "epoch": 6.897483118477593, + "grad_norm": 0.20078743994235992, + "learning_rate": 2.3193385717135874e-05, + "loss": 1.7226, + "step": 22472 + }, + { + "epoch": 6.897790055248619, + "grad_norm": 0.2135045975446701, + "learning_rate": 2.318919003113663e-05, + "loss": 1.7531, + "step": 22473 + }, + { + "epoch": 6.898096992019644, + "grad_norm": 0.18811136484146118, + "learning_rate": 2.3184994610097526e-05, + "loss": 1.6542, + "step": 22474 + }, + { + "epoch": 6.898403928790669, + "grad_norm": 0.2323937565088272, + "learning_rate": 2.3180799454060025e-05, + "loss": 1.7369, + "step": 22475 + }, + { + "epoch": 6.898710865561695, + "grad_norm": 0.19270992279052734, + "learning_rate": 2.317660456306558e-05, + "loss": 1.6818, + "step": 22476 + }, + { + "epoch": 6.899017802332719, + "grad_norm": 0.18951043486595154, + "learning_rate": 2.3172409937155654e-05, + "loss": 1.7183, + "step": 22477 + }, + { + "epoch": 6.899324739103744, + "grad_norm": 0.1758934110403061, + "learning_rate": 2.3168215576371694e-05, + "loss": 1.6826, + "step": 22478 + }, + { + "epoch": 6.89963167587477, + 
"grad_norm": 0.2048143893480301, + "learning_rate": 2.3164021480755133e-05, + "loss": 1.7769, + "step": 22479 + }, + { + "epoch": 6.899938612645795, + "grad_norm": 0.20538486540317535, + "learning_rate": 2.315982765034748e-05, + "loss": 1.7035, + "step": 22480 + }, + { + "epoch": 6.9002455494168204, + "grad_norm": 0.18417708575725555, + "learning_rate": 2.3155634085190124e-05, + "loss": 1.7533, + "step": 22481 + }, + { + "epoch": 6.900552486187845, + "grad_norm": 0.1978628784418106, + "learning_rate": 2.315144078532453e-05, + "loss": 1.691, + "step": 22482 + }, + { + "epoch": 6.90085942295887, + "grad_norm": 0.17665794491767883, + "learning_rate": 2.3147247750792128e-05, + "loss": 1.7018, + "step": 22483 + }, + { + "epoch": 6.901166359729896, + "grad_norm": 0.20218273997306824, + "learning_rate": 2.314305498163435e-05, + "loss": 1.7277, + "step": 22484 + }, + { + "epoch": 6.901473296500921, + "grad_norm": 0.18791642785072327, + "learning_rate": 2.3138862477892674e-05, + "loss": 1.7247, + "step": 22485 + }, + { + "epoch": 6.901780233271946, + "grad_norm": 0.1945842206478119, + "learning_rate": 2.313467023960847e-05, + "loss": 1.6648, + "step": 22486 + }, + { + "epoch": 6.902087170042972, + "grad_norm": 0.1871321201324463, + "learning_rate": 2.3130478266823237e-05, + "loss": 1.6978, + "step": 22487 + }, + { + "epoch": 6.902394106813996, + "grad_norm": 0.20094287395477295, + "learning_rate": 2.312628655957833e-05, + "loss": 1.7763, + "step": 22488 + }, + { + "epoch": 6.902701043585021, + "grad_norm": 0.1804366111755371, + "learning_rate": 2.3122095117915226e-05, + "loss": 1.689, + "step": 22489 + }, + { + "epoch": 6.903007980356047, + "grad_norm": 0.1846652776002884, + "learning_rate": 2.311790394187534e-05, + "loss": 1.7088, + "step": 22490 + }, + { + "epoch": 6.903314917127072, + "grad_norm": 0.18339675664901733, + "learning_rate": 2.311371303150008e-05, + "loss": 1.6974, + "step": 22491 + }, + { + "epoch": 6.903621853898097, + "grad_norm": 0.21333162486553192, + 
"learning_rate": 2.3109522386830863e-05, + "loss": 1.7614, + "step": 22492 + }, + { + "epoch": 6.903928790669122, + "grad_norm": 0.19845318794250488, + "learning_rate": 2.3105332007909104e-05, + "loss": 1.6895, + "step": 22493 + }, + { + "epoch": 6.904235727440147, + "grad_norm": 0.21082347631454468, + "learning_rate": 2.3101141894776224e-05, + "loss": 1.7397, + "step": 22494 + }, + { + "epoch": 6.9045426642111725, + "grad_norm": 0.16360893845558167, + "learning_rate": 2.3096952047473623e-05, + "loss": 1.6716, + "step": 22495 + }, + { + "epoch": 6.904849600982198, + "grad_norm": 0.2287478744983673, + "learning_rate": 2.3092762466042687e-05, + "loss": 1.7673, + "step": 22496 + }, + { + "epoch": 6.905156537753223, + "grad_norm": 0.17231078445911407, + "learning_rate": 2.308857315052489e-05, + "loss": 1.6744, + "step": 22497 + }, + { + "epoch": 6.9054634745242485, + "grad_norm": 0.2887173295021057, + "learning_rate": 2.3084384100961565e-05, + "loss": 1.7358, + "step": 22498 + }, + { + "epoch": 6.905770411295273, + "grad_norm": 0.1977192759513855, + "learning_rate": 2.3080195317394127e-05, + "loss": 1.7514, + "step": 22499 + }, + { + "epoch": 6.906077348066298, + "grad_norm": 0.24933035671710968, + "learning_rate": 2.307600679986398e-05, + "loss": 1.6845, + "step": 22500 + }, + { + "epoch": 6.906384284837324, + "grad_norm": 0.17288708686828613, + "learning_rate": 2.30718185484125e-05, + "loss": 1.7211, + "step": 22501 + }, + { + "epoch": 6.906691221608349, + "grad_norm": 0.22192007303237915, + "learning_rate": 2.306763056308112e-05, + "loss": 1.6924, + "step": 22502 + }, + { + "epoch": 6.906998158379373, + "grad_norm": 0.20500123500823975, + "learning_rate": 2.3063442843911172e-05, + "loss": 1.7412, + "step": 22503 + }, + { + "epoch": 6.907305095150399, + "grad_norm": 0.30658698081970215, + "learning_rate": 2.30592553909441e-05, + "loss": 1.7965, + "step": 22504 + }, + { + "epoch": 6.907612031921424, + "grad_norm": 0.177829772233963, + "learning_rate": 
2.3055068204221224e-05, + "loss": 1.6914, + "step": 22505 + }, + { + "epoch": 6.907918968692449, + "grad_norm": 0.20281876623630524, + "learning_rate": 2.3050881283783977e-05, + "loss": 1.6946, + "step": 22506 + }, + { + "epoch": 6.908225905463475, + "grad_norm": 0.16111700236797333, + "learning_rate": 2.3046694629673716e-05, + "loss": 1.7004, + "step": 22507 + }, + { + "epoch": 6.9085328422345, + "grad_norm": 0.1911575049161911, + "learning_rate": 2.3042508241931814e-05, + "loss": 1.7013, + "step": 22508 + }, + { + "epoch": 6.9088397790055245, + "grad_norm": 0.17862342298030853, + "learning_rate": 2.303832212059965e-05, + "loss": 1.7053, + "step": 22509 + }, + { + "epoch": 6.90914671577655, + "grad_norm": 0.2268948256969452, + "learning_rate": 2.303413626571858e-05, + "loss": 1.7241, + "step": 22510 + }, + { + "epoch": 6.909453652547575, + "grad_norm": 0.1997457593679428, + "learning_rate": 2.3029950677329992e-05, + "loss": 1.6927, + "step": 22511 + }, + { + "epoch": 6.9097605893186005, + "grad_norm": 0.22120819985866547, + "learning_rate": 2.3025765355475232e-05, + "loss": 1.7447, + "step": 22512 + }, + { + "epoch": 6.910067526089626, + "grad_norm": 0.22097964584827423, + "learning_rate": 2.302158030019565e-05, + "loss": 1.7399, + "step": 22513 + }, + { + "epoch": 6.91037446286065, + "grad_norm": 0.2171044498682022, + "learning_rate": 2.3017395511532664e-05, + "loss": 1.7252, + "step": 22514 + }, + { + "epoch": 6.910681399631676, + "grad_norm": 0.1987348347902298, + "learning_rate": 2.301321098952757e-05, + "loss": 1.7071, + "step": 22515 + }, + { + "epoch": 6.910988336402701, + "grad_norm": 0.2131081372499466, + "learning_rate": 2.3009026734221746e-05, + "loss": 1.7314, + "step": 22516 + }, + { + "epoch": 6.911295273173726, + "grad_norm": 0.18867900967597961, + "learning_rate": 2.3004842745656536e-05, + "loss": 1.7431, + "step": 22517 + }, + { + "epoch": 6.911602209944752, + "grad_norm": 0.22853058576583862, + "learning_rate": 2.3000659023873277e-05, + "loss": 
1.7234, + "step": 22518 + }, + { + "epoch": 6.911909146715777, + "grad_norm": 0.23441165685653687, + "learning_rate": 2.2996475568913366e-05, + "loss": 1.7535, + "step": 22519 + }, + { + "epoch": 6.912216083486801, + "grad_norm": 0.2376382052898407, + "learning_rate": 2.299229238081807e-05, + "loss": 1.7582, + "step": 22520 + }, + { + "epoch": 6.912523020257827, + "grad_norm": 0.2571510076522827, + "learning_rate": 2.2988109459628814e-05, + "loss": 1.722, + "step": 22521 + }, + { + "epoch": 6.912829957028852, + "grad_norm": 0.19782103598117828, + "learning_rate": 2.298392680538685e-05, + "loss": 1.7052, + "step": 22522 + }, + { + "epoch": 6.913136893799877, + "grad_norm": 0.24070625007152557, + "learning_rate": 2.297974441813358e-05, + "loss": 1.7306, + "step": 22523 + }, + { + "epoch": 6.913443830570903, + "grad_norm": 0.1783500611782074, + "learning_rate": 2.2975562297910307e-05, + "loss": 1.7077, + "step": 22524 + }, + { + "epoch": 6.913750767341927, + "grad_norm": 0.19469089806079865, + "learning_rate": 2.2971380444758373e-05, + "loss": 1.7275, + "step": 22525 + }, + { + "epoch": 6.9140577041129525, + "grad_norm": 0.21449480950832367, + "learning_rate": 2.2967198858719092e-05, + "loss": 1.7682, + "step": 22526 + }, + { + "epoch": 6.914364640883978, + "grad_norm": 0.21686261892318726, + "learning_rate": 2.2963017539833803e-05, + "loss": 1.6794, + "step": 22527 + }, + { + "epoch": 6.914671577655003, + "grad_norm": 0.2061273604631424, + "learning_rate": 2.2958836488143813e-05, + "loss": 1.7612, + "step": 22528 + }, + { + "epoch": 6.9149785144260285, + "grad_norm": 0.2708517611026764, + "learning_rate": 2.295465570369046e-05, + "loss": 1.7291, + "step": 22529 + }, + { + "epoch": 6.915285451197054, + "grad_norm": 0.17011860013008118, + "learning_rate": 2.295047518651503e-05, + "loss": 1.6541, + "step": 22530 + }, + { + "epoch": 6.915592387968078, + "grad_norm": 0.255305677652359, + "learning_rate": 2.294629493665889e-05, + "loss": 1.7063, + "step": 22531 + }, + { + 
"epoch": 6.915899324739104, + "grad_norm": 0.20172207057476044, + "learning_rate": 2.2942114954163306e-05, + "loss": 1.6678, + "step": 22532 + }, + { + "epoch": 6.916206261510129, + "grad_norm": 0.23726679384708405, + "learning_rate": 2.2937935239069603e-05, + "loss": 1.6762, + "step": 22533 + }, + { + "epoch": 6.916513198281154, + "grad_norm": 0.17716684937477112, + "learning_rate": 2.2933755791419082e-05, + "loss": 1.7302, + "step": 22534 + }, + { + "epoch": 6.91682013505218, + "grad_norm": 0.2513270974159241, + "learning_rate": 2.2929576611253035e-05, + "loss": 1.7371, + "step": 22535 + }, + { + "epoch": 6.917127071823204, + "grad_norm": 0.21994394063949585, + "learning_rate": 2.292539769861281e-05, + "loss": 1.7007, + "step": 22536 + }, + { + "epoch": 6.917434008594229, + "grad_norm": 0.2095540314912796, + "learning_rate": 2.292121905353964e-05, + "loss": 1.71, + "step": 22537 + }, + { + "epoch": 6.917740945365255, + "grad_norm": 0.24400855600833893, + "learning_rate": 2.2917040676074892e-05, + "loss": 1.7859, + "step": 22538 + }, + { + "epoch": 6.91804788213628, + "grad_norm": 0.23217935860157013, + "learning_rate": 2.2912862566259785e-05, + "loss": 1.8218, + "step": 22539 + }, + { + "epoch": 6.918354818907305, + "grad_norm": 0.23555497825145721, + "learning_rate": 2.2908684724135666e-05, + "loss": 1.7145, + "step": 22540 + }, + { + "epoch": 6.918661755678331, + "grad_norm": 0.17844347655773163, + "learning_rate": 2.2904507149743804e-05, + "loss": 1.6767, + "step": 22541 + }, + { + "epoch": 6.918968692449355, + "grad_norm": 0.20810428261756897, + "learning_rate": 2.290032984312548e-05, + "loss": 1.7359, + "step": 22542 + }, + { + "epoch": 6.9192756292203805, + "grad_norm": 0.20082542300224304, + "learning_rate": 2.289615280432198e-05, + "loss": 1.7623, + "step": 22543 + }, + { + "epoch": 6.919582565991406, + "grad_norm": 0.2005007117986679, + "learning_rate": 2.2891976033374584e-05, + "loss": 1.745, + "step": 22544 + }, + { + "epoch": 6.919889502762431, + 
"grad_norm": 0.18054969608783722, + "learning_rate": 2.2887799530324572e-05, + "loss": 1.6959, + "step": 22545 + }, + { + "epoch": 6.920196439533456, + "grad_norm": 0.18410442769527435, + "learning_rate": 2.2883623295213214e-05, + "loss": 1.7052, + "step": 22546 + }, + { + "epoch": 6.920503376304481, + "grad_norm": 0.17380426824092865, + "learning_rate": 2.2879447328081765e-05, + "loss": 1.6735, + "step": 22547 + }, + { + "epoch": 6.920810313075506, + "grad_norm": 0.19082246720790863, + "learning_rate": 2.2875271628971557e-05, + "loss": 1.7192, + "step": 22548 + }, + { + "epoch": 6.921117249846532, + "grad_norm": 0.17682792246341705, + "learning_rate": 2.2871096197923784e-05, + "loss": 1.649, + "step": 22549 + }, + { + "epoch": 6.921424186617557, + "grad_norm": 0.19127340614795685, + "learning_rate": 2.286692103497975e-05, + "loss": 1.7366, + "step": 22550 + }, + { + "epoch": 6.921731123388582, + "grad_norm": 0.1636040210723877, + "learning_rate": 2.2862746140180696e-05, + "loss": 1.6749, + "step": 22551 + }, + { + "epoch": 6.922038060159607, + "grad_norm": 0.2121013104915619, + "learning_rate": 2.285857151356788e-05, + "loss": 1.7342, + "step": 22552 + }, + { + "epoch": 6.922344996930632, + "grad_norm": 0.19183295965194702, + "learning_rate": 2.28543971551826e-05, + "loss": 1.7506, + "step": 22553 + }, + { + "epoch": 6.922651933701657, + "grad_norm": 0.23838891088962555, + "learning_rate": 2.285022306506604e-05, + "loss": 1.6875, + "step": 22554 + }, + { + "epoch": 6.922958870472683, + "grad_norm": 0.17147624492645264, + "learning_rate": 2.2846049243259526e-05, + "loss": 1.7074, + "step": 22555 + }, + { + "epoch": 6.923265807243708, + "grad_norm": 0.2254270762205124, + "learning_rate": 2.2841875689804236e-05, + "loss": 1.7589, + "step": 22556 + }, + { + "epoch": 6.9235727440147325, + "grad_norm": 0.249015673995018, + "learning_rate": 2.2837702404741462e-05, + "loss": 1.7708, + "step": 22557 + }, + { + "epoch": 6.923879680785758, + "grad_norm": 0.19401927292346954, 
+ "learning_rate": 2.283352938811244e-05, + "loss": 1.696, + "step": 22558 + }, + { + "epoch": 6.924186617556783, + "grad_norm": 0.21134993433952332, + "learning_rate": 2.2829356639958398e-05, + "loss": 1.7136, + "step": 22559 + }, + { + "epoch": 6.9244935543278086, + "grad_norm": 0.17600105702877045, + "learning_rate": 2.2825184160320578e-05, + "loss": 1.679, + "step": 22560 + }, + { + "epoch": 6.924800491098834, + "grad_norm": 0.2426912486553192, + "learning_rate": 2.282101194924022e-05, + "loss": 1.7011, + "step": 22561 + }, + { + "epoch": 6.925107427869859, + "grad_norm": 0.20040342211723328, + "learning_rate": 2.281684000675855e-05, + "loss": 1.6844, + "step": 22562 + }, + { + "epoch": 6.925414364640884, + "grad_norm": 0.23790770769119263, + "learning_rate": 2.2812668332916798e-05, + "loss": 1.7318, + "step": 22563 + }, + { + "epoch": 6.925721301411909, + "grad_norm": 0.21387948095798492, + "learning_rate": 2.2808496927756196e-05, + "loss": 1.6903, + "step": 22564 + }, + { + "epoch": 6.926028238182934, + "grad_norm": 0.20471405982971191, + "learning_rate": 2.280432579131796e-05, + "loss": 1.7231, + "step": 22565 + }, + { + "epoch": 6.92633517495396, + "grad_norm": 0.1953156590461731, + "learning_rate": 2.280015492364332e-05, + "loss": 1.7322, + "step": 22566 + }, + { + "epoch": 6.926642111724985, + "grad_norm": 0.3107415437698364, + "learning_rate": 2.279598432477349e-05, + "loss": 1.7833, + "step": 22567 + }, + { + "epoch": 6.9269490484960095, + "grad_norm": 0.2114095836877823, + "learning_rate": 2.279181399474969e-05, + "loss": 1.6923, + "step": 22568 + }, + { + "epoch": 6.927255985267035, + "grad_norm": 0.21373972296714783, + "learning_rate": 2.2787643933613107e-05, + "loss": 1.6897, + "step": 22569 + }, + { + "epoch": 6.92756292203806, + "grad_norm": 0.17955096065998077, + "learning_rate": 2.278347414140502e-05, + "loss": 1.7443, + "step": 22570 + }, + { + "epoch": 6.9278698588090855, + "grad_norm": 0.19275230169296265, + "learning_rate": 
2.2779304618166554e-05, + "loss": 1.7109, + "step": 22571 + }, + { + "epoch": 6.928176795580111, + "grad_norm": 0.16774436831474304, + "learning_rate": 2.277513536393899e-05, + "loss": 1.7059, + "step": 22572 + }, + { + "epoch": 6.928483732351136, + "grad_norm": 0.25093573331832886, + "learning_rate": 2.2770966378763457e-05, + "loss": 1.7501, + "step": 22573 + }, + { + "epoch": 6.928790669122161, + "grad_norm": 0.24859540164470673, + "learning_rate": 2.2766797662681216e-05, + "loss": 1.7315, + "step": 22574 + }, + { + "epoch": 6.929097605893186, + "grad_norm": 0.1736115962266922, + "learning_rate": 2.2762629215733438e-05, + "loss": 1.7422, + "step": 22575 + }, + { + "epoch": 6.929404542664211, + "grad_norm": 0.23705001175403595, + "learning_rate": 2.2758461037961326e-05, + "loss": 1.7818, + "step": 22576 + }, + { + "epoch": 6.929711479435237, + "grad_norm": 0.21123656630516052, + "learning_rate": 2.2754293129406073e-05, + "loss": 1.7652, + "step": 22577 + }, + { + "epoch": 6.930018416206261, + "grad_norm": 0.2195751667022705, + "learning_rate": 2.2750125490108858e-05, + "loss": 1.7103, + "step": 22578 + }, + { + "epoch": 6.930325352977286, + "grad_norm": 0.17324887216091156, + "learning_rate": 2.274595812011088e-05, + "loss": 1.7386, + "step": 22579 + }, + { + "epoch": 6.930632289748312, + "grad_norm": 0.3175726532936096, + "learning_rate": 2.2741791019453313e-05, + "loss": 1.7608, + "step": 22580 + }, + { + "epoch": 6.930939226519337, + "grad_norm": 0.26266980171203613, + "learning_rate": 2.273762418817734e-05, + "loss": 1.691, + "step": 22581 + }, + { + "epoch": 6.931246163290362, + "grad_norm": 0.21905983984470367, + "learning_rate": 2.273345762632415e-05, + "loss": 1.6886, + "step": 22582 + }, + { + "epoch": 6.931553100061388, + "grad_norm": 0.2201247364282608, + "learning_rate": 2.2729291333934914e-05, + "loss": 1.7313, + "step": 22583 + }, + { + "epoch": 6.931860036832412, + "grad_norm": 0.2844204306602478, + "learning_rate": 2.2725125311050805e-05, + "loss": 
1.6918, + "step": 22584 + }, + { + "epoch": 6.9321669736034375, + "grad_norm": 0.22451715171337128, + "learning_rate": 2.272095955771299e-05, + "loss": 1.699, + "step": 22585 + }, + { + "epoch": 6.932473910374463, + "grad_norm": 0.27357545495033264, + "learning_rate": 2.2716794073962645e-05, + "loss": 1.7709, + "step": 22586 + }, + { + "epoch": 6.932780847145488, + "grad_norm": 0.2605188190937042, + "learning_rate": 2.271262885984093e-05, + "loss": 1.7812, + "step": 22587 + }, + { + "epoch": 6.9330877839165135, + "grad_norm": 0.1866278201341629, + "learning_rate": 2.270846391538899e-05, + "loss": 1.7204, + "step": 22588 + }, + { + "epoch": 6.933394720687538, + "grad_norm": 0.24624690413475037, + "learning_rate": 2.2704299240648043e-05, + "loss": 1.7345, + "step": 22589 + }, + { + "epoch": 6.933701657458563, + "grad_norm": 0.18003861606121063, + "learning_rate": 2.2700134835659175e-05, + "loss": 1.73, + "step": 22590 + }, + { + "epoch": 6.934008594229589, + "grad_norm": 0.2330949604511261, + "learning_rate": 2.269597070046359e-05, + "loss": 1.7614, + "step": 22591 + }, + { + "epoch": 6.934315531000614, + "grad_norm": 0.18806515634059906, + "learning_rate": 2.269180683510243e-05, + "loss": 1.7364, + "step": 22592 + }, + { + "epoch": 6.934622467771639, + "grad_norm": 0.23998546600341797, + "learning_rate": 2.268764323961684e-05, + "loss": 1.6858, + "step": 22593 + }, + { + "epoch": 6.934929404542665, + "grad_norm": 0.1707296371459961, + "learning_rate": 2.268347991404797e-05, + "loss": 1.6703, + "step": 22594 + }, + { + "epoch": 6.935236341313689, + "grad_norm": 0.19724871218204498, + "learning_rate": 2.267931685843696e-05, + "loss": 1.7338, + "step": 22595 + }, + { + "epoch": 6.935543278084714, + "grad_norm": 0.20384611189365387, + "learning_rate": 2.2675154072824955e-05, + "loss": 1.7224, + "step": 22596 + }, + { + "epoch": 6.93585021485574, + "grad_norm": 0.18632391095161438, + "learning_rate": 2.2670991557253092e-05, + "loss": 1.7006, + "step": 22597 + }, + { + 
"epoch": 6.936157151626765, + "grad_norm": 0.22928105294704437, + "learning_rate": 2.2666829311762505e-05, + "loss": 1.7462, + "step": 22598 + }, + { + "epoch": 6.93646408839779, + "grad_norm": 0.1905689388513565, + "learning_rate": 2.266266733639434e-05, + "loss": 1.7071, + "step": 22599 + }, + { + "epoch": 6.936771025168815, + "grad_norm": 0.2051437795162201, + "learning_rate": 2.2658505631189708e-05, + "loss": 1.6872, + "step": 22600 + }, + { + "epoch": 6.93707796193984, + "grad_norm": 0.178196981549263, + "learning_rate": 2.265434419618976e-05, + "loss": 1.7044, + "step": 22601 + }, + { + "epoch": 6.9373848987108655, + "grad_norm": 0.21399027109146118, + "learning_rate": 2.26501830314356e-05, + "loss": 1.7529, + "step": 22602 + }, + { + "epoch": 6.937691835481891, + "grad_norm": 0.21747443079948425, + "learning_rate": 2.264602213696837e-05, + "loss": 1.7662, + "step": 22603 + }, + { + "epoch": 6.937998772252916, + "grad_norm": 0.1939898282289505, + "learning_rate": 2.2641861512829177e-05, + "loss": 1.7194, + "step": 22604 + }, + { + "epoch": 6.9383057090239415, + "grad_norm": 0.2183499038219452, + "learning_rate": 2.2637701159059128e-05, + "loss": 1.6659, + "step": 22605 + }, + { + "epoch": 6.938612645794966, + "grad_norm": 0.21971984207630157, + "learning_rate": 2.2633541075699387e-05, + "loss": 1.7729, + "step": 22606 + }, + { + "epoch": 6.938919582565991, + "grad_norm": 0.2611743211746216, + "learning_rate": 2.2629381262790998e-05, + "loss": 1.8, + "step": 22607 + }, + { + "epoch": 6.939226519337017, + "grad_norm": 0.22962158918380737, + "learning_rate": 2.2625221720375144e-05, + "loss": 1.7244, + "step": 22608 + }, + { + "epoch": 6.939533456108042, + "grad_norm": 0.20961032807826996, + "learning_rate": 2.2621062448492858e-05, + "loss": 1.7107, + "step": 22609 + }, + { + "epoch": 6.939840392879067, + "grad_norm": 0.2370155155658722, + "learning_rate": 2.2616903447185293e-05, + "loss": 1.7185, + "step": 22610 + }, + { + "epoch": 6.940147329650092, + 
"grad_norm": 0.19033893942832947, + "learning_rate": 2.2612744716493544e-05, + "loss": 1.7034, + "step": 22611 + }, + { + "epoch": 6.940454266421117, + "grad_norm": 0.22657649219036102, + "learning_rate": 2.2608586256458704e-05, + "loss": 1.6987, + "step": 22612 + }, + { + "epoch": 6.940761203192142, + "grad_norm": 0.17767953872680664, + "learning_rate": 2.2604428067121862e-05, + "loss": 1.6934, + "step": 22613 + }, + { + "epoch": 6.941068139963168, + "grad_norm": 0.209768146276474, + "learning_rate": 2.2600270148524123e-05, + "loss": 1.7148, + "step": 22614 + }, + { + "epoch": 6.941375076734193, + "grad_norm": 0.21234147250652313, + "learning_rate": 2.2596112500706574e-05, + "loss": 1.7147, + "step": 22615 + }, + { + "epoch": 6.941682013505218, + "grad_norm": 0.17608872056007385, + "learning_rate": 2.2591955123710307e-05, + "loss": 1.6873, + "step": 22616 + }, + { + "epoch": 6.941988950276243, + "grad_norm": 0.1743561178445816, + "learning_rate": 2.25877980175764e-05, + "loss": 1.7273, + "step": 22617 + }, + { + "epoch": 6.942295887047268, + "grad_norm": 0.22064091265201569, + "learning_rate": 2.258364118234594e-05, + "loss": 1.7785, + "step": 22618 + }, + { + "epoch": 6.9426028238182935, + "grad_norm": 0.20353585481643677, + "learning_rate": 2.2579484618060005e-05, + "loss": 1.7518, + "step": 22619 + }, + { + "epoch": 6.942909760589319, + "grad_norm": 0.23978710174560547, + "learning_rate": 2.2575328324759676e-05, + "loss": 1.7576, + "step": 22620 + }, + { + "epoch": 6.943216697360343, + "grad_norm": 0.24991966784000397, + "learning_rate": 2.257117230248603e-05, + "loss": 1.7383, + "step": 22621 + }, + { + "epoch": 6.943523634131369, + "grad_norm": 0.20734381675720215, + "learning_rate": 2.256701655128011e-05, + "loss": 1.7063, + "step": 22622 + }, + { + "epoch": 6.943830570902394, + "grad_norm": 0.20097215473651886, + "learning_rate": 2.2562861071183057e-05, + "loss": 1.7647, + "step": 22623 + }, + { + "epoch": 6.944137507673419, + "grad_norm": 
0.20144836604595184, + "learning_rate": 2.2558705862235852e-05, + "loss": 1.7165, + "step": 22624 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.20394138991832733, + "learning_rate": 2.255455092447964e-05, + "loss": 1.7048, + "step": 22625 + }, + { + "epoch": 6.94475138121547, + "grad_norm": 0.21430160105228424, + "learning_rate": 2.2550396257955396e-05, + "loss": 1.7233, + "step": 22626 + }, + { + "epoch": 6.945058317986494, + "grad_norm": 0.19071494042873383, + "learning_rate": 2.254624186270425e-05, + "loss": 1.7407, + "step": 22627 + }, + { + "epoch": 6.94536525475752, + "grad_norm": 0.19658641517162323, + "learning_rate": 2.2542087738767232e-05, + "loss": 1.6371, + "step": 22628 + }, + { + "epoch": 6.945672191528545, + "grad_norm": 0.19009098410606384, + "learning_rate": 2.25379338861854e-05, + "loss": 1.7515, + "step": 22629 + }, + { + "epoch": 6.94597912829957, + "grad_norm": 0.21250933408737183, + "learning_rate": 2.2533780304999796e-05, + "loss": 1.7308, + "step": 22630 + }, + { + "epoch": 6.946286065070596, + "grad_norm": 0.22148491442203522, + "learning_rate": 2.2529626995251475e-05, + "loss": 1.705, + "step": 22631 + }, + { + "epoch": 6.94659300184162, + "grad_norm": 0.190248504281044, + "learning_rate": 2.252547395698148e-05, + "loss": 1.7507, + "step": 22632 + }, + { + "epoch": 6.9468999386126455, + "grad_norm": 0.20005743205547333, + "learning_rate": 2.2521321190230855e-05, + "loss": 1.7622, + "step": 22633 + }, + { + "epoch": 6.947206875383671, + "grad_norm": 0.24233438074588776, + "learning_rate": 2.251716869504064e-05, + "loss": 1.7119, + "step": 22634 + }, + { + "epoch": 6.947513812154696, + "grad_norm": 0.20823299884796143, + "learning_rate": 2.2513016471451874e-05, + "loss": 1.69, + "step": 22635 + }, + { + "epoch": 6.9478207489257215, + "grad_norm": 0.21486341953277588, + "learning_rate": 2.250886451950559e-05, + "loss": 1.6528, + "step": 22636 + }, + { + "epoch": 6.948127685696747, + "grad_norm": 0.22201848030090332, + 
"learning_rate": 2.2504712839242813e-05, + "loss": 1.7454, + "step": 22637 + }, + { + "epoch": 6.948434622467771, + "grad_norm": 0.25179341435432434, + "learning_rate": 2.2500561430704588e-05, + "loss": 1.7226, + "step": 22638 + }, + { + "epoch": 6.948741559238797, + "grad_norm": 0.2510581910610199, + "learning_rate": 2.2496410293931913e-05, + "loss": 1.7048, + "step": 22639 + }, + { + "epoch": 6.949048496009822, + "grad_norm": 0.2406487911939621, + "learning_rate": 2.2492259428965866e-05, + "loss": 1.6751, + "step": 22640 + }, + { + "epoch": 6.949355432780847, + "grad_norm": 0.2555276155471802, + "learning_rate": 2.24881088358474e-05, + "loss": 1.7369, + "step": 22641 + }, + { + "epoch": 6.949662369551873, + "grad_norm": 0.19703364372253418, + "learning_rate": 2.2483958514617597e-05, + "loss": 1.7196, + "step": 22642 + }, + { + "epoch": 6.949969306322897, + "grad_norm": 0.18491938710212708, + "learning_rate": 2.2479808465317414e-05, + "loss": 1.6923, + "step": 22643 + }, + { + "epoch": 6.9502762430939224, + "grad_norm": 0.21588458120822906, + "learning_rate": 2.247565868798791e-05, + "loss": 1.6797, + "step": 22644 + }, + { + "epoch": 6.950583179864948, + "grad_norm": 0.18480601906776428, + "learning_rate": 2.247150918267008e-05, + "loss": 1.6672, + "step": 22645 + }, + { + "epoch": 6.950890116635973, + "grad_norm": 0.261846125125885, + "learning_rate": 2.246735994940493e-05, + "loss": 1.7594, + "step": 22646 + }, + { + "epoch": 6.9511970534069984, + "grad_norm": 0.24510261416435242, + "learning_rate": 2.2463210988233468e-05, + "loss": 1.7712, + "step": 22647 + }, + { + "epoch": 6.951503990178024, + "grad_norm": 0.25896379351615906, + "learning_rate": 2.24590622991967e-05, + "loss": 1.6811, + "step": 22648 + }, + { + "epoch": 6.951810926949048, + "grad_norm": 0.26284709572792053, + "learning_rate": 2.245491388233561e-05, + "loss": 1.7269, + "step": 22649 + }, + { + "epoch": 6.952117863720074, + "grad_norm": 0.1613062471151352, + "learning_rate": 
2.245076573769121e-05, + "loss": 1.6162, + "step": 22650 + }, + { + "epoch": 6.952424800491099, + "grad_norm": 0.203482523560524, + "learning_rate": 2.244661786530449e-05, + "loss": 1.7124, + "step": 22651 + }, + { + "epoch": 6.952731737262124, + "grad_norm": 0.18294258415699005, + "learning_rate": 2.2442470265216446e-05, + "loss": 1.7101, + "step": 22652 + }, + { + "epoch": 6.953038674033149, + "grad_norm": 0.1841319352388382, + "learning_rate": 2.2438322937468058e-05, + "loss": 1.723, + "step": 22653 + }, + { + "epoch": 6.953345610804174, + "grad_norm": 0.1600010097026825, + "learning_rate": 2.2434175882100322e-05, + "loss": 1.6867, + "step": 22654 + }, + { + "epoch": 6.953652547575199, + "grad_norm": 0.16904005408287048, + "learning_rate": 2.243002909915421e-05, + "loss": 1.6993, + "step": 22655 + }, + { + "epoch": 6.953959484346225, + "grad_norm": 0.20069406926631927, + "learning_rate": 2.2425882588670692e-05, + "loss": 1.6995, + "step": 22656 + }, + { + "epoch": 6.95426642111725, + "grad_norm": 0.170061394572258, + "learning_rate": 2.2421736350690808e-05, + "loss": 1.7217, + "step": 22657 + }, + { + "epoch": 6.954573357888275, + "grad_norm": 0.20549608767032623, + "learning_rate": 2.241759038525545e-05, + "loss": 1.7229, + "step": 22658 + }, + { + "epoch": 6.9548802946593, + "grad_norm": 0.20916205644607544, + "learning_rate": 2.241344469240566e-05, + "loss": 1.7499, + "step": 22659 + }, + { + "epoch": 6.955187231430325, + "grad_norm": 0.156641885638237, + "learning_rate": 2.2409299272182348e-05, + "loss": 1.6827, + "step": 22660 + }, + { + "epoch": 6.9554941682013505, + "grad_norm": 0.17876049876213074, + "learning_rate": 2.240515412462653e-05, + "loss": 1.6745, + "step": 22661 + }, + { + "epoch": 6.955801104972376, + "grad_norm": 0.17265759408473969, + "learning_rate": 2.2401009249779153e-05, + "loss": 1.7687, + "step": 22662 + }, + { + "epoch": 6.956108041743401, + "grad_norm": 0.18822525441646576, + "learning_rate": 2.2396864647681175e-05, + "loss": 
1.6974, + "step": 22663 + }, + { + "epoch": 6.956414978514426, + "grad_norm": 0.18686626851558685, + "learning_rate": 2.2392720318373567e-05, + "loss": 1.7522, + "step": 22664 + }, + { + "epoch": 6.956721915285451, + "grad_norm": 0.1668211668729782, + "learning_rate": 2.238857626189727e-05, + "loss": 1.7198, + "step": 22665 + }, + { + "epoch": 6.957028852056476, + "grad_norm": 0.23307017982006073, + "learning_rate": 2.238443247829325e-05, + "loss": 1.7377, + "step": 22666 + }, + { + "epoch": 6.957335788827502, + "grad_norm": 0.1771896481513977, + "learning_rate": 2.2380288967602453e-05, + "loss": 1.7626, + "step": 22667 + }, + { + "epoch": 6.957642725598527, + "grad_norm": 0.185984805226326, + "learning_rate": 2.237614572986583e-05, + "loss": 1.7328, + "step": 22668 + }, + { + "epoch": 6.957949662369552, + "grad_norm": 0.3076271414756775, + "learning_rate": 2.2372002765124327e-05, + "loss": 1.7081, + "step": 22669 + }, + { + "epoch": 6.958256599140577, + "grad_norm": 0.17874667048454285, + "learning_rate": 2.2367860073418885e-05, + "loss": 1.6752, + "step": 22670 + }, + { + "epoch": 6.958563535911602, + "grad_norm": 0.2044304609298706, + "learning_rate": 2.2363717654790445e-05, + "loss": 1.7325, + "step": 22671 + }, + { + "epoch": 6.958870472682627, + "grad_norm": 0.19335824251174927, + "learning_rate": 2.2359575509279945e-05, + "loss": 1.7192, + "step": 22672 + }, + { + "epoch": 6.959177409453653, + "grad_norm": 0.19514116644859314, + "learning_rate": 2.23554336369283e-05, + "loss": 1.7186, + "step": 22673 + }, + { + "epoch": 6.959484346224678, + "grad_norm": 0.2779110372066498, + "learning_rate": 2.23512920377765e-05, + "loss": 1.7391, + "step": 22674 + }, + { + "epoch": 6.9597912829957025, + "grad_norm": 0.17390480637550354, + "learning_rate": 2.2347150711865406e-05, + "loss": 1.6538, + "step": 22675 + }, + { + "epoch": 6.960098219766728, + "grad_norm": 0.1640262007713318, + "learning_rate": 2.234300965923601e-05, + "loss": 1.6534, + "step": 22676 + }, + { + 
"epoch": 6.960405156537753, + "grad_norm": 0.17519034445285797, + "learning_rate": 2.2338868879929165e-05, + "loss": 1.6931, + "step": 22677 + }, + { + "epoch": 6.9607120933087785, + "grad_norm": 0.16885873675346375, + "learning_rate": 2.2334728373985847e-05, + "loss": 1.7204, + "step": 22678 + }, + { + "epoch": 6.961019030079804, + "grad_norm": 0.16997110843658447, + "learning_rate": 2.2330588141446963e-05, + "loss": 1.7063, + "step": 22679 + }, + { + "epoch": 6.961325966850829, + "grad_norm": 0.17793773114681244, + "learning_rate": 2.2326448182353422e-05, + "loss": 1.7382, + "step": 22680 + }, + { + "epoch": 6.961632903621854, + "grad_norm": 0.1809101551771164, + "learning_rate": 2.2322308496746134e-05, + "loss": 1.6874, + "step": 22681 + }, + { + "epoch": 6.961939840392879, + "grad_norm": 0.19095295667648315, + "learning_rate": 2.2318169084666023e-05, + "loss": 1.7122, + "step": 22682 + }, + { + "epoch": 6.962246777163904, + "grad_norm": 0.19206218421459198, + "learning_rate": 2.2314029946153992e-05, + "loss": 1.6733, + "step": 22683 + }, + { + "epoch": 6.96255371393493, + "grad_norm": 0.21243152022361755, + "learning_rate": 2.2309891081250938e-05, + "loss": 1.7026, + "step": 22684 + }, + { + "epoch": 6.962860650705955, + "grad_norm": 0.17602933943271637, + "learning_rate": 2.2305752489997777e-05, + "loss": 1.7073, + "step": 22685 + }, + { + "epoch": 6.963167587476979, + "grad_norm": 0.21810807287693024, + "learning_rate": 2.2301614172435398e-05, + "loss": 1.7323, + "step": 22686 + }, + { + "epoch": 6.963474524248005, + "grad_norm": 0.20711791515350342, + "learning_rate": 2.2297476128604706e-05, + "loss": 1.7228, + "step": 22687 + }, + { + "epoch": 6.96378146101903, + "grad_norm": 0.20376695692539215, + "learning_rate": 2.2293338358546583e-05, + "loss": 1.715, + "step": 22688 + }, + { + "epoch": 6.964088397790055, + "grad_norm": 0.20096196234226227, + "learning_rate": 2.228920086230194e-05, + "loss": 1.7239, + "step": 22689 + }, + { + "epoch": 6.964395334561081, 
+ "grad_norm": 0.24215486645698547, + "learning_rate": 2.228506363991163e-05, + "loss": 1.7879, + "step": 22690 + }, + { + "epoch": 6.964702271332106, + "grad_norm": 0.1917567104101181, + "learning_rate": 2.2280926691416603e-05, + "loss": 1.6903, + "step": 22691 + }, + { + "epoch": 6.9650092081031305, + "grad_norm": 0.19827421009540558, + "learning_rate": 2.2276790016857673e-05, + "loss": 1.7654, + "step": 22692 + }, + { + "epoch": 6.965316144874156, + "grad_norm": 0.20852476358413696, + "learning_rate": 2.2272653616275784e-05, + "loss": 1.7452, + "step": 22693 + }, + { + "epoch": 6.965623081645181, + "grad_norm": 0.21223776042461395, + "learning_rate": 2.2268517489711755e-05, + "loss": 1.6973, + "step": 22694 + }, + { + "epoch": 6.9659300184162065, + "grad_norm": 0.1903543621301651, + "learning_rate": 2.22643816372065e-05, + "loss": 1.7398, + "step": 22695 + }, + { + "epoch": 6.966236955187231, + "grad_norm": 0.21726597845554352, + "learning_rate": 2.2260246058800888e-05, + "loss": 1.7813, + "step": 22696 + }, + { + "epoch": 6.966543891958256, + "grad_norm": 0.1710241734981537, + "learning_rate": 2.225611075453578e-05, + "loss": 1.6647, + "step": 22697 + }, + { + "epoch": 6.966850828729282, + "grad_norm": 0.199532151222229, + "learning_rate": 2.2251975724452045e-05, + "loss": 1.7503, + "step": 22698 + }, + { + "epoch": 6.967157765500307, + "grad_norm": 0.18966728448867798, + "learning_rate": 2.224784096859055e-05, + "loss": 1.8113, + "step": 22699 + }, + { + "epoch": 6.967464702271332, + "grad_norm": 0.1977413445711136, + "learning_rate": 2.2243706486992162e-05, + "loss": 1.7036, + "step": 22700 + }, + { + "epoch": 6.967771639042358, + "grad_norm": 0.1794840395450592, + "learning_rate": 2.223957227969773e-05, + "loss": 1.714, + "step": 22701 + }, + { + "epoch": 6.968078575813382, + "grad_norm": 0.1811632663011551, + "learning_rate": 2.2235438346748117e-05, + "loss": 1.6845, + "step": 22702 + }, + { + "epoch": 6.968385512584407, + "grad_norm": 0.17478540539741516, 
+ "learning_rate": 2.2231304688184172e-05, + "loss": 1.7078, + "step": 22703 + }, + { + "epoch": 6.968692449355433, + "grad_norm": 0.22631226480007172, + "learning_rate": 2.2227171304046756e-05, + "loss": 1.7576, + "step": 22704 + }, + { + "epoch": 6.968999386126458, + "grad_norm": 0.20498304069042206, + "learning_rate": 2.2223038194376712e-05, + "loss": 1.7342, + "step": 22705 + }, + { + "epoch": 6.969306322897483, + "grad_norm": 0.18556833267211914, + "learning_rate": 2.221890535921488e-05, + "loss": 1.6583, + "step": 22706 + }, + { + "epoch": 6.969613259668508, + "grad_norm": 0.19878216087818146, + "learning_rate": 2.221477279860209e-05, + "loss": 1.7536, + "step": 22707 + }, + { + "epoch": 6.969920196439533, + "grad_norm": 0.20304621756076813, + "learning_rate": 2.221064051257924e-05, + "loss": 1.7263, + "step": 22708 + }, + { + "epoch": 6.9702271332105585, + "grad_norm": 0.18725872039794922, + "learning_rate": 2.220650850118709e-05, + "loss": 1.7174, + "step": 22709 + }, + { + "epoch": 6.970534069981584, + "grad_norm": 0.28994759917259216, + "learning_rate": 2.2202376764466554e-05, + "loss": 1.7401, + "step": 22710 + }, + { + "epoch": 6.970841006752609, + "grad_norm": 0.19320951402187347, + "learning_rate": 2.2198245302458383e-05, + "loss": 1.7204, + "step": 22711 + }, + { + "epoch": 6.9711479435236345, + "grad_norm": 0.24737104773521423, + "learning_rate": 2.2194114115203464e-05, + "loss": 1.7418, + "step": 22712 + }, + { + "epoch": 6.971454880294659, + "grad_norm": 0.18811406195163727, + "learning_rate": 2.218998320274261e-05, + "loss": 1.6999, + "step": 22713 + }, + { + "epoch": 6.971761817065684, + "grad_norm": 0.20729362964630127, + "learning_rate": 2.2185852565116638e-05, + "loss": 1.6833, + "step": 22714 + }, + { + "epoch": 6.97206875383671, + "grad_norm": 0.1862284392118454, + "learning_rate": 2.2181722202366378e-05, + "loss": 1.7232, + "step": 22715 + }, + { + "epoch": 6.972375690607735, + "grad_norm": 0.24128347635269165, + "learning_rate": 
2.217759211453264e-05, + "loss": 1.7081, + "step": 22716 + }, + { + "epoch": 6.97268262737876, + "grad_norm": 0.2007059007883072, + "learning_rate": 2.217346230165625e-05, + "loss": 1.7383, + "step": 22717 + }, + { + "epoch": 6.972989564149785, + "grad_norm": 0.2177598625421524, + "learning_rate": 2.216933276377801e-05, + "loss": 1.7494, + "step": 22718 + }, + { + "epoch": 6.97329650092081, + "grad_norm": 0.20965704321861267, + "learning_rate": 2.2165203500938735e-05, + "loss": 1.7326, + "step": 22719 + }, + { + "epoch": 6.973603437691835, + "grad_norm": 0.17255879938602448, + "learning_rate": 2.2161074513179237e-05, + "loss": 1.6713, + "step": 22720 + }, + { + "epoch": 6.973910374462861, + "grad_norm": 0.21480637788772583, + "learning_rate": 2.215694580054032e-05, + "loss": 1.7248, + "step": 22721 + }, + { + "epoch": 6.974217311233886, + "grad_norm": 0.15835267305374146, + "learning_rate": 2.215281736306278e-05, + "loss": 1.7086, + "step": 22722 + }, + { + "epoch": 6.974524248004911, + "grad_norm": 0.20524290204048157, + "learning_rate": 2.2148689200787415e-05, + "loss": 1.7472, + "step": 22723 + }, + { + "epoch": 6.974831184775936, + "grad_norm": 0.16152524948120117, + "learning_rate": 2.214456131375502e-05, + "loss": 1.6373, + "step": 22724 + }, + { + "epoch": 6.975138121546961, + "grad_norm": 0.1995699107646942, + "learning_rate": 2.2140433702006425e-05, + "loss": 1.6949, + "step": 22725 + }, + { + "epoch": 6.975445058317987, + "grad_norm": 0.19927829504013062, + "learning_rate": 2.213630636558236e-05, + "loss": 1.7875, + "step": 22726 + }, + { + "epoch": 6.975751995089012, + "grad_norm": 0.19159351289272308, + "learning_rate": 2.213217930452368e-05, + "loss": 1.7067, + "step": 22727 + }, + { + "epoch": 6.976058931860036, + "grad_norm": 0.21832366287708282, + "learning_rate": 2.2128052518871107e-05, + "loss": 1.6952, + "step": 22728 + }, + { + "epoch": 6.976365868631062, + "grad_norm": 0.2433125376701355, + "learning_rate": 2.212392600866547e-05, + "loss": 
1.7503, + "step": 22729 + }, + { + "epoch": 6.976672805402087, + "grad_norm": 0.25504401326179504, + "learning_rate": 2.2119799773947535e-05, + "loss": 1.7289, + "step": 22730 + }, + { + "epoch": 6.976979742173112, + "grad_norm": 0.20463863015174866, + "learning_rate": 2.211567381475808e-05, + "loss": 1.7442, + "step": 22731 + }, + { + "epoch": 6.977286678944138, + "grad_norm": 0.21862375736236572, + "learning_rate": 2.2111548131137883e-05, + "loss": 1.7266, + "step": 22732 + }, + { + "epoch": 6.977593615715163, + "grad_norm": 0.2124018520116806, + "learning_rate": 2.210742272312771e-05, + "loss": 1.7555, + "step": 22733 + }, + { + "epoch": 6.9779005524861875, + "grad_norm": 0.2911135256290436, + "learning_rate": 2.2103297590768334e-05, + "loss": 1.711, + "step": 22734 + }, + { + "epoch": 6.978207489257213, + "grad_norm": 0.2172393649816513, + "learning_rate": 2.2099172734100525e-05, + "loss": 1.7054, + "step": 22735 + }, + { + "epoch": 6.978514426028238, + "grad_norm": 0.28964513540267944, + "learning_rate": 2.2095048153165043e-05, + "loss": 1.7231, + "step": 22736 + }, + { + "epoch": 6.9788213627992635, + "grad_norm": 0.2557905316352844, + "learning_rate": 2.209092384800265e-05, + "loss": 1.7219, + "step": 22737 + }, + { + "epoch": 6.979128299570289, + "grad_norm": 0.23358628153800964, + "learning_rate": 2.2086799818654102e-05, + "loss": 1.7627, + "step": 22738 + }, + { + "epoch": 6.979435236341313, + "grad_norm": 0.18856312334537506, + "learning_rate": 2.2082676065160163e-05, + "loss": 1.6577, + "step": 22739 + }, + { + "epoch": 6.979742173112339, + "grad_norm": 0.18412479758262634, + "learning_rate": 2.207855258756158e-05, + "loss": 1.6661, + "step": 22740 + }, + { + "epoch": 6.980049109883364, + "grad_norm": 0.20592401921749115, + "learning_rate": 2.207442938589911e-05, + "loss": 1.6737, + "step": 22741 + }, + { + "epoch": 6.980356046654389, + "grad_norm": 0.2015630006790161, + "learning_rate": 2.2070306460213493e-05, + "loss": 1.73, + "step": 22742 + }, + { + 
"epoch": 6.980662983425415, + "grad_norm": 0.23446126282215118, + "learning_rate": 2.2066183810545454e-05, + "loss": 1.7391, + "step": 22743 + }, + { + "epoch": 6.98096992019644, + "grad_norm": 0.1810954511165619, + "learning_rate": 2.2062061436935803e-05, + "loss": 1.689, + "step": 22744 + }, + { + "epoch": 6.981276856967464, + "grad_norm": 0.25031471252441406, + "learning_rate": 2.20579393394252e-05, + "loss": 1.8161, + "step": 22745 + }, + { + "epoch": 6.98158379373849, + "grad_norm": 0.183212012052536, + "learning_rate": 2.2053817518054433e-05, + "loss": 1.6494, + "step": 22746 + }, + { + "epoch": 6.981890730509515, + "grad_norm": 0.2115766555070877, + "learning_rate": 2.204969597286422e-05, + "loss": 1.6912, + "step": 22747 + }, + { + "epoch": 6.98219766728054, + "grad_norm": 0.19966226816177368, + "learning_rate": 2.2045574703895296e-05, + "loss": 1.7002, + "step": 22748 + }, + { + "epoch": 6.982504604051566, + "grad_norm": 0.20601172745227814, + "learning_rate": 2.2041453711188385e-05, + "loss": 1.7839, + "step": 22749 + }, + { + "epoch": 6.98281154082259, + "grad_norm": 0.2174808531999588, + "learning_rate": 2.2037332994784222e-05, + "loss": 1.7169, + "step": 22750 + }, + { + "epoch": 6.9831184775936155, + "grad_norm": 0.1921808421611786, + "learning_rate": 2.2033212554723514e-05, + "loss": 1.6754, + "step": 22751 + }, + { + "epoch": 6.983425414364641, + "grad_norm": 0.1977350264787674, + "learning_rate": 2.2029092391046997e-05, + "loss": 1.7408, + "step": 22752 + }, + { + "epoch": 6.983732351135666, + "grad_norm": 0.18366695940494537, + "learning_rate": 2.2024972503795383e-05, + "loss": 1.6818, + "step": 22753 + }, + { + "epoch": 6.9840392879066915, + "grad_norm": 0.18127809464931488, + "learning_rate": 2.2020852893009387e-05, + "loss": 1.7392, + "step": 22754 + }, + { + "epoch": 6.984346224677717, + "grad_norm": 0.1973503679037094, + "learning_rate": 2.2016733558729718e-05, + "loss": 1.7416, + "step": 22755 + }, + { + "epoch": 6.984653161448741, + 
"grad_norm": 0.1971634328365326, + "learning_rate": 2.2012614500997096e-05, + "loss": 1.7545, + "step": 22756 + }, + { + "epoch": 6.984960098219767, + "grad_norm": 0.17244087159633636, + "learning_rate": 2.2008495719852218e-05, + "loss": 1.7348, + "step": 22757 + }, + { + "epoch": 6.985267034990792, + "grad_norm": 0.19024424254894257, + "learning_rate": 2.200437721533579e-05, + "loss": 1.6647, + "step": 22758 + }, + { + "epoch": 6.985573971761817, + "grad_norm": 0.18455122411251068, + "learning_rate": 2.200025898748852e-05, + "loss": 1.7528, + "step": 22759 + }, + { + "epoch": 6.985880908532843, + "grad_norm": 0.24437187612056732, + "learning_rate": 2.199614103635108e-05, + "loss": 1.7101, + "step": 22760 + }, + { + "epoch": 6.986187845303867, + "grad_norm": 0.18844331800937653, + "learning_rate": 2.1992023361964224e-05, + "loss": 1.6864, + "step": 22761 + }, + { + "epoch": 6.986494782074892, + "grad_norm": 0.18768003582954407, + "learning_rate": 2.1987905964368576e-05, + "loss": 1.6482, + "step": 22762 + }, + { + "epoch": 6.986801718845918, + "grad_norm": 0.19491778314113617, + "learning_rate": 2.1983788843604898e-05, + "loss": 1.7106, + "step": 22763 + }, + { + "epoch": 6.987108655616943, + "grad_norm": 0.23565757274627686, + "learning_rate": 2.1979671999713797e-05, + "loss": 1.7362, + "step": 22764 + }, + { + "epoch": 6.987415592387968, + "grad_norm": 0.2097240835428238, + "learning_rate": 2.1975555432736018e-05, + "loss": 1.7305, + "step": 22765 + }, + { + "epoch": 6.987722529158994, + "grad_norm": 0.2171555608510971, + "learning_rate": 2.197143914271223e-05, + "loss": 1.7213, + "step": 22766 + }, + { + "epoch": 6.988029465930018, + "grad_norm": 0.1993926763534546, + "learning_rate": 2.196732312968311e-05, + "loss": 1.6901, + "step": 22767 + }, + { + "epoch": 6.9883364027010435, + "grad_norm": 0.2345978319644928, + "learning_rate": 2.1963207393689346e-05, + "loss": 1.7456, + "step": 22768 + }, + { + "epoch": 6.988643339472069, + "grad_norm": 
0.20831161737442017, + "learning_rate": 2.1959091934771564e-05, + "loss": 1.764, + "step": 22769 + }, + { + "epoch": 6.988950276243094, + "grad_norm": 0.24944809079170227, + "learning_rate": 2.195497675297049e-05, + "loss": 1.7398, + "step": 22770 + }, + { + "epoch": 6.989257213014119, + "grad_norm": 0.25463199615478516, + "learning_rate": 2.1950861848326777e-05, + "loss": 1.7002, + "step": 22771 + }, + { + "epoch": 6.989564149785144, + "grad_norm": 0.2298898696899414, + "learning_rate": 2.194674722088108e-05, + "loss": 1.755, + "step": 22772 + }, + { + "epoch": 6.989871086556169, + "grad_norm": 0.21839721500873566, + "learning_rate": 2.194263287067408e-05, + "loss": 1.6667, + "step": 22773 + }, + { + "epoch": 6.990178023327195, + "grad_norm": 0.2197437435388565, + "learning_rate": 2.1938518797746417e-05, + "loss": 1.6774, + "step": 22774 + }, + { + "epoch": 6.99048496009822, + "grad_norm": 0.23588024079799652, + "learning_rate": 2.1934405002138763e-05, + "loss": 1.6916, + "step": 22775 + }, + { + "epoch": 6.990791896869245, + "grad_norm": 0.20632316172122955, + "learning_rate": 2.1930291483891767e-05, + "loss": 1.6682, + "step": 22776 + }, + { + "epoch": 6.99109883364027, + "grad_norm": 0.22786293923854828, + "learning_rate": 2.192617824304607e-05, + "loss": 1.7138, + "step": 22777 + }, + { + "epoch": 6.991405770411295, + "grad_norm": 0.3235599994659424, + "learning_rate": 2.1922065279642363e-05, + "loss": 1.7545, + "step": 22778 + }, + { + "epoch": 6.99171270718232, + "grad_norm": 0.1919393390417099, + "learning_rate": 2.191795259372123e-05, + "loss": 1.7422, + "step": 22779 + }, + { + "epoch": 6.992019643953346, + "grad_norm": 0.16472585499286652, + "learning_rate": 2.1913840185323385e-05, + "loss": 1.6824, + "step": 22780 + }, + { + "epoch": 6.992326580724371, + "grad_norm": 0.21422579884529114, + "learning_rate": 2.1909728054489397e-05, + "loss": 1.696, + "step": 22781 + }, + { + "epoch": 6.9926335174953955, + "grad_norm": 0.18965782225131989, + 
"learning_rate": 2.190561620125996e-05, + "loss": 1.7026, + "step": 22782 + }, + { + "epoch": 6.992940454266421, + "grad_norm": 0.184856116771698, + "learning_rate": 2.190150462567569e-05, + "loss": 1.7202, + "step": 22783 + }, + { + "epoch": 6.993247391037446, + "grad_norm": 0.18382076919078827, + "learning_rate": 2.1897393327777223e-05, + "loss": 1.7525, + "step": 22784 + }, + { + "epoch": 6.9935543278084715, + "grad_norm": 0.17239750921726227, + "learning_rate": 2.1893282307605202e-05, + "loss": 1.7297, + "step": 22785 + }, + { + "epoch": 6.993861264579497, + "grad_norm": 0.18522322177886963, + "learning_rate": 2.18891715652002e-05, + "loss": 1.6952, + "step": 22786 + }, + { + "epoch": 6.994168201350522, + "grad_norm": 0.1946135014295578, + "learning_rate": 2.18850611006029e-05, + "loss": 1.6879, + "step": 22787 + }, + { + "epoch": 6.994475138121547, + "grad_norm": 0.2028069645166397, + "learning_rate": 2.188095091385391e-05, + "loss": 1.7412, + "step": 22788 + }, + { + "epoch": 6.994782074892572, + "grad_norm": 0.18794523179531097, + "learning_rate": 2.1876841004993838e-05, + "loss": 1.6936, + "step": 22789 + }, + { + "epoch": 6.995089011663597, + "grad_norm": 0.1912194788455963, + "learning_rate": 2.187273137406331e-05, + "loss": 1.7051, + "step": 22790 + }, + { + "epoch": 6.995395948434623, + "grad_norm": 0.1528688222169876, + "learning_rate": 2.1868622021102934e-05, + "loss": 1.6816, + "step": 22791 + }, + { + "epoch": 6.995702885205648, + "grad_norm": 0.2108357548713684, + "learning_rate": 2.1864512946153325e-05, + "loss": 1.7018, + "step": 22792 + }, + { + "epoch": 6.996009821976672, + "grad_norm": 0.16667310893535614, + "learning_rate": 2.1860404149255092e-05, + "loss": 1.7235, + "step": 22793 + }, + { + "epoch": 6.996316758747698, + "grad_norm": 0.16995872557163239, + "learning_rate": 2.185629563044882e-05, + "loss": 1.7086, + "step": 22794 + }, + { + "epoch": 6.996623695518723, + "grad_norm": 0.1962304711341858, + "learning_rate": 
2.1852187389775165e-05, + "loss": 1.7523, + "step": 22795 + }, + { + "epoch": 6.996930632289748, + "grad_norm": 0.17774102091789246, + "learning_rate": 2.1848079427274655e-05, + "loss": 1.6649, + "step": 22796 + }, + { + "epoch": 6.997237569060774, + "grad_norm": 0.18844567239284515, + "learning_rate": 2.184397174298796e-05, + "loss": 1.7281, + "step": 22797 + }, + { + "epoch": 6.997544505831799, + "grad_norm": 0.15324150025844574, + "learning_rate": 2.1839864336955607e-05, + "loss": 1.6496, + "step": 22798 + }, + { + "epoch": 6.9978514426028235, + "grad_norm": 0.25148099660873413, + "learning_rate": 2.1835757209218233e-05, + "loss": 1.7889, + "step": 22799 + }, + { + "epoch": 6.998158379373849, + "grad_norm": 0.22258763015270233, + "learning_rate": 2.1831650359816414e-05, + "loss": 1.7303, + "step": 22800 + }, + { + "epoch": 6.998465316144874, + "grad_norm": 0.21465472877025604, + "learning_rate": 2.182754378879074e-05, + "loss": 1.733, + "step": 22801 + }, + { + "epoch": 6.9987722529158995, + "grad_norm": 0.1894017904996872, + "learning_rate": 2.182343749618181e-05, + "loss": 1.7104, + "step": 22802 + }, + { + "epoch": 6.999079189686924, + "grad_norm": 0.19616369903087616, + "learning_rate": 2.181933148203014e-05, + "loss": 1.7015, + "step": 22803 + }, + { + "epoch": 6.999386126457949, + "grad_norm": 0.1720295250415802, + "learning_rate": 2.181522574637638e-05, + "loss": 1.6609, + "step": 22804 + }, + { + "epoch": 6.999693063228975, + "grad_norm": 0.2508579194545746, + "learning_rate": 2.1811120289261077e-05, + "loss": 1.7485, + "step": 22805 + }, + { + "epoch": 7.0, + "grad_norm": 0.1701229363679886, + "learning_rate": 2.1807015110724805e-05, + "loss": 1.6822, + "step": 22806 + }, + { + "epoch": 7.000306936771025, + "grad_norm": 0.17413921654224396, + "learning_rate": 2.1802910210808135e-05, + "loss": 1.6944, + "step": 22807 + }, + { + "epoch": 7.000613873542051, + "grad_norm": 0.22573722898960114, + "learning_rate": 2.179880558955163e-05, + "loss": 1.7499, + 
"step": 22808 + }, + { + "epoch": 7.000920810313075, + "grad_norm": 0.2477746456861496, + "learning_rate": 2.1794701246995857e-05, + "loss": 1.7663, + "step": 22809 + }, + { + "epoch": 7.0012277470841005, + "grad_norm": 0.15338411927223206, + "learning_rate": 2.1790597183181384e-05, + "loss": 1.6425, + "step": 22810 + }, + { + "epoch": 7.001534683855126, + "grad_norm": 0.2119540572166443, + "learning_rate": 2.1786493398148738e-05, + "loss": 1.6695, + "step": 22811 + }, + { + "epoch": 7.001841620626151, + "grad_norm": 0.283037930727005, + "learning_rate": 2.178238989193854e-05, + "loss": 1.7479, + "step": 22812 + }, + { + "epoch": 7.0021485573971765, + "grad_norm": 0.2939838767051697, + "learning_rate": 2.1778286664591276e-05, + "loss": 1.733, + "step": 22813 + }, + { + "epoch": 7.002455494168202, + "grad_norm": 0.21681749820709229, + "learning_rate": 2.1774183716147552e-05, + "loss": 1.6804, + "step": 22814 + }, + { + "epoch": 7.002762430939226, + "grad_norm": 0.29066696763038635, + "learning_rate": 2.177008104664785e-05, + "loss": 1.7435, + "step": 22815 + }, + { + "epoch": 7.003069367710252, + "grad_norm": 0.17104873061180115, + "learning_rate": 2.1765978656132773e-05, + "loss": 1.6637, + "step": 22816 + }, + { + "epoch": 7.003376304481277, + "grad_norm": 0.29808685183525085, + "learning_rate": 2.1761876544642846e-05, + "loss": 1.7342, + "step": 22817 + }, + { + "epoch": 7.003683241252302, + "grad_norm": 0.20467214286327362, + "learning_rate": 2.1757774712218603e-05, + "loss": 1.7638, + "step": 22818 + }, + { + "epoch": 7.003990178023328, + "grad_norm": 0.23166583478450775, + "learning_rate": 2.1753673158900607e-05, + "loss": 1.6972, + "step": 22819 + }, + { + "epoch": 7.004297114794352, + "grad_norm": 0.20098255574703217, + "learning_rate": 2.1749571884729332e-05, + "loss": 1.6973, + "step": 22820 + }, + { + "epoch": 7.004604051565377, + "grad_norm": 0.212421715259552, + "learning_rate": 2.1745470889745358e-05, + "loss": 1.7183, + "step": 22821 + }, + { + 
"epoch": 7.004910988336403, + "grad_norm": 0.2496720403432846, + "learning_rate": 2.17413701739892e-05, + "loss": 1.7928, + "step": 22822 + }, + { + "epoch": 7.005217925107428, + "grad_norm": 0.21050602197647095, + "learning_rate": 2.1737269737501394e-05, + "loss": 1.7379, + "step": 22823 + }, + { + "epoch": 7.005524861878453, + "grad_norm": 0.18321558833122253, + "learning_rate": 2.1733169580322448e-05, + "loss": 1.733, + "step": 22824 + }, + { + "epoch": 7.005831798649478, + "grad_norm": 0.19890302419662476, + "learning_rate": 2.1729069702492887e-05, + "loss": 1.6799, + "step": 22825 + }, + { + "epoch": 7.006138735420503, + "grad_norm": 0.19961030781269073, + "learning_rate": 2.172497010405323e-05, + "loss": 1.6754, + "step": 22826 + }, + { + "epoch": 7.0064456721915285, + "grad_norm": 0.19672131538391113, + "learning_rate": 2.1720870785043988e-05, + "loss": 1.7099, + "step": 22827 + }, + { + "epoch": 7.006752608962554, + "grad_norm": 0.16798892617225647, + "learning_rate": 2.1716771745505666e-05, + "loss": 1.7096, + "step": 22828 + }, + { + "epoch": 7.007059545733579, + "grad_norm": 0.2276654690504074, + "learning_rate": 2.1712672985478815e-05, + "loss": 1.7627, + "step": 22829 + }, + { + "epoch": 7.0073664825046045, + "grad_norm": 0.17108316719532013, + "learning_rate": 2.1708574505003872e-05, + "loss": 1.6941, + "step": 22830 + }, + { + "epoch": 7.007673419275629, + "grad_norm": 0.2094760239124298, + "learning_rate": 2.1704476304121413e-05, + "loss": 1.7152, + "step": 22831 + }, + { + "epoch": 7.007980356046654, + "grad_norm": 0.17183393239974976, + "learning_rate": 2.1700378382871872e-05, + "loss": 1.6668, + "step": 22832 + }, + { + "epoch": 7.00828729281768, + "grad_norm": 0.2075900435447693, + "learning_rate": 2.1696280741295795e-05, + "loss": 1.7732, + "step": 22833 + }, + { + "epoch": 7.008594229588705, + "grad_norm": 0.20075511932373047, + "learning_rate": 2.169218337943368e-05, + "loss": 1.7228, + "step": 22834 + }, + { + "epoch": 7.00890116635973, + 
"grad_norm": 0.19461359083652496, + "learning_rate": 2.168808629732596e-05, + "loss": 1.6942, + "step": 22835 + }, + { + "epoch": 7.009208103130755, + "grad_norm": 0.18972480297088623, + "learning_rate": 2.16839894950132e-05, + "loss": 1.7087, + "step": 22836 + }, + { + "epoch": 7.00951503990178, + "grad_norm": 0.19522632658481598, + "learning_rate": 2.167989297253582e-05, + "loss": 1.7427, + "step": 22837 + }, + { + "epoch": 7.009821976672805, + "grad_norm": 0.2088990956544876, + "learning_rate": 2.1675796729934355e-05, + "loss": 1.786, + "step": 22838 + }, + { + "epoch": 7.010128913443831, + "grad_norm": 0.2052021473646164, + "learning_rate": 2.167170076724927e-05, + "loss": 1.765, + "step": 22839 + }, + { + "epoch": 7.010435850214856, + "grad_norm": 0.19566771388053894, + "learning_rate": 2.1667605084521043e-05, + "loss": 1.703, + "step": 22840 + }, + { + "epoch": 7.0107427869858805, + "grad_norm": 0.24589677155017853, + "learning_rate": 2.166350968179014e-05, + "loss": 1.7544, + "step": 22841 + }, + { + "epoch": 7.011049723756906, + "grad_norm": 0.28059569001197815, + "learning_rate": 2.1659414559097053e-05, + "loss": 1.7081, + "step": 22842 + }, + { + "epoch": 7.011356660527931, + "grad_norm": 0.20781446993350983, + "learning_rate": 2.1655319716482237e-05, + "loss": 1.6968, + "step": 22843 + }, + { + "epoch": 7.0116635972989565, + "grad_norm": 0.31703317165374756, + "learning_rate": 2.1651225153986167e-05, + "loss": 1.704, + "step": 22844 + }, + { + "epoch": 7.011970534069982, + "grad_norm": 0.19668029248714447, + "learning_rate": 2.1647130871649283e-05, + "loss": 1.738, + "step": 22845 + }, + { + "epoch": 7.012277470841007, + "grad_norm": 0.3768141567707062, + "learning_rate": 2.1643036869512105e-05, + "loss": 1.7407, + "step": 22846 + }, + { + "epoch": 7.012584407612032, + "grad_norm": 0.22228674590587616, + "learning_rate": 2.1638943147615032e-05, + "loss": 1.7162, + "step": 22847 + }, + { + "epoch": 7.012891344383057, + "grad_norm": 0.26087433099746704, + 
"learning_rate": 2.1634849705998572e-05, + "loss": 1.6916, + "step": 22848 + }, + { + "epoch": 7.013198281154082, + "grad_norm": 0.19660449028015137, + "learning_rate": 2.1630756544703117e-05, + "loss": 1.7024, + "step": 22849 + }, + { + "epoch": 7.013505217925108, + "grad_norm": 0.2287406474351883, + "learning_rate": 2.1626663663769176e-05, + "loss": 1.6761, + "step": 22850 + }, + { + "epoch": 7.013812154696133, + "grad_norm": 0.18974192440509796, + "learning_rate": 2.162257106323719e-05, + "loss": 1.6721, + "step": 22851 + }, + { + "epoch": 7.014119091467157, + "grad_norm": 0.25081944465637207, + "learning_rate": 2.1618478743147558e-05, + "loss": 1.7042, + "step": 22852 + }, + { + "epoch": 7.014426028238183, + "grad_norm": 0.187479630112648, + "learning_rate": 2.1614386703540785e-05, + "loss": 1.7057, + "step": 22853 + }, + { + "epoch": 7.014732965009208, + "grad_norm": 0.24785932898521423, + "learning_rate": 2.1610294944457243e-05, + "loss": 1.8033, + "step": 22854 + }, + { + "epoch": 7.015039901780233, + "grad_norm": 0.21570228040218353, + "learning_rate": 2.160620346593743e-05, + "loss": 1.7129, + "step": 22855 + }, + { + "epoch": 7.015346838551259, + "grad_norm": 0.19304436445236206, + "learning_rate": 2.160211226802175e-05, + "loss": 1.7384, + "step": 22856 + }, + { + "epoch": 7.015653775322283, + "grad_norm": 0.18901783227920532, + "learning_rate": 2.1598021350750648e-05, + "loss": 1.6851, + "step": 22857 + }, + { + "epoch": 7.0159607120933085, + "grad_norm": 0.21754276752471924, + "learning_rate": 2.159393071416454e-05, + "loss": 1.7242, + "step": 22858 + }, + { + "epoch": 7.016267648864334, + "grad_norm": 0.18334844708442688, + "learning_rate": 2.1589840358303858e-05, + "loss": 1.66, + "step": 22859 + }, + { + "epoch": 7.016574585635359, + "grad_norm": 0.17688371241092682, + "learning_rate": 2.1585750283209026e-05, + "loss": 1.6693, + "step": 22860 + }, + { + "epoch": 7.0168815224063845, + "grad_norm": 0.17173215746879578, + "learning_rate": 
2.158166048892047e-05, + "loss": 1.675, + "step": 22861 + }, + { + "epoch": 7.01718845917741, + "grad_norm": 0.2144075632095337, + "learning_rate": 2.157757097547857e-05, + "loss": 1.7843, + "step": 22862 + }, + { + "epoch": 7.017495395948434, + "grad_norm": 0.18811818957328796, + "learning_rate": 2.1573481742923824e-05, + "loss": 1.6932, + "step": 22863 + }, + { + "epoch": 7.01780233271946, + "grad_norm": 0.19978533685207367, + "learning_rate": 2.1569392791296548e-05, + "loss": 1.7426, + "step": 22864 + }, + { + "epoch": 7.018109269490485, + "grad_norm": 0.19639068841934204, + "learning_rate": 2.1565304120637237e-05, + "loss": 1.7479, + "step": 22865 + }, + { + "epoch": 7.01841620626151, + "grad_norm": 0.2269967794418335, + "learning_rate": 2.1561215730986212e-05, + "loss": 1.7507, + "step": 22866 + }, + { + "epoch": 7.018723143032536, + "grad_norm": 0.19511014223098755, + "learning_rate": 2.1557127622383948e-05, + "loss": 1.7317, + "step": 22867 + }, + { + "epoch": 7.01903007980356, + "grad_norm": 0.23975026607513428, + "learning_rate": 2.1553039794870834e-05, + "loss": 1.7901, + "step": 22868 + }, + { + "epoch": 7.019337016574585, + "grad_norm": 0.20757955312728882, + "learning_rate": 2.154895224848722e-05, + "loss": 1.7823, + "step": 22869 + }, + { + "epoch": 7.019643953345611, + "grad_norm": 0.1893112063407898, + "learning_rate": 2.154486498327357e-05, + "loss": 1.6939, + "step": 22870 + }, + { + "epoch": 7.019950890116636, + "grad_norm": 0.23006685078144073, + "learning_rate": 2.1540777999270205e-05, + "loss": 1.8061, + "step": 22871 + }, + { + "epoch": 7.020257826887661, + "grad_norm": 0.25516194105148315, + "learning_rate": 2.1536691296517573e-05, + "loss": 1.6801, + "step": 22872 + }, + { + "epoch": 7.020564763658686, + "grad_norm": 0.2138557732105255, + "learning_rate": 2.153260487505604e-05, + "loss": 1.7689, + "step": 22873 + }, + { + "epoch": 7.020871700429711, + "grad_norm": 0.2618521749973297, + "learning_rate": 2.152851873492599e-05, + "loss": 
1.712, + "step": 22874 + }, + { + "epoch": 7.0211786372007365, + "grad_norm": 0.19639171659946442, + "learning_rate": 2.1524432876167812e-05, + "loss": 1.6883, + "step": 22875 + }, + { + "epoch": 7.021485573971762, + "grad_norm": 0.20283572375774384, + "learning_rate": 2.152034729882187e-05, + "loss": 1.7259, + "step": 22876 + }, + { + "epoch": 7.021792510742787, + "grad_norm": 0.247970849275589, + "learning_rate": 2.151626200292855e-05, + "loss": 1.6714, + "step": 22877 + }, + { + "epoch": 7.0220994475138125, + "grad_norm": 0.20877771079540253, + "learning_rate": 2.1512176988528227e-05, + "loss": 1.7378, + "step": 22878 + }, + { + "epoch": 7.022406384284837, + "grad_norm": 0.2515791356563568, + "learning_rate": 2.1508092255661245e-05, + "loss": 1.743, + "step": 22879 + }, + { + "epoch": 7.022713321055862, + "grad_norm": 0.21451319754123688, + "learning_rate": 2.150400780436804e-05, + "loss": 1.7102, + "step": 22880 + }, + { + "epoch": 7.023020257826888, + "grad_norm": 0.23944756388664246, + "learning_rate": 2.1499923634688886e-05, + "loss": 1.7739, + "step": 22881 + }, + { + "epoch": 7.023327194597913, + "grad_norm": 0.22423309087753296, + "learning_rate": 2.149583974666423e-05, + "loss": 1.7598, + "step": 22882 + }, + { + "epoch": 7.023634131368938, + "grad_norm": 0.31337371468544006, + "learning_rate": 2.1491756140334358e-05, + "loss": 1.7417, + "step": 22883 + }, + { + "epoch": 7.023941068139963, + "grad_norm": 0.22430868446826935, + "learning_rate": 2.148767281573968e-05, + "loss": 1.712, + "step": 22884 + }, + { + "epoch": 7.024248004910988, + "grad_norm": 0.26083487272262573, + "learning_rate": 2.148358977292054e-05, + "loss": 1.6816, + "step": 22885 + }, + { + "epoch": 7.024554941682013, + "grad_norm": 0.2283557504415512, + "learning_rate": 2.1479507011917255e-05, + "loss": 1.7539, + "step": 22886 + }, + { + "epoch": 7.024861878453039, + "grad_norm": 0.22732317447662354, + "learning_rate": 2.1475424532770232e-05, + "loss": 1.697, + "step": 22887 + }, + { + 
"epoch": 7.025168815224064, + "grad_norm": 0.19614318013191223, + "learning_rate": 2.1471342335519746e-05, + "loss": 1.7267, + "step": 22888 + }, + { + "epoch": 7.0254757519950894, + "grad_norm": 0.23076513409614563, + "learning_rate": 2.1467260420206192e-05, + "loss": 1.7749, + "step": 22889 + }, + { + "epoch": 7.025782688766114, + "grad_norm": 0.1969364732503891, + "learning_rate": 2.1463178786869892e-05, + "loss": 1.6975, + "step": 22890 + }, + { + "epoch": 7.026089625537139, + "grad_norm": 0.2126578837633133, + "learning_rate": 2.145909743555119e-05, + "loss": 1.6815, + "step": 22891 + }, + { + "epoch": 7.026396562308165, + "grad_norm": 0.20841559767723083, + "learning_rate": 2.1455016366290414e-05, + "loss": 1.727, + "step": 22892 + }, + { + "epoch": 7.02670349907919, + "grad_norm": 0.2523893713951111, + "learning_rate": 2.1450935579127896e-05, + "loss": 1.7213, + "step": 22893 + }, + { + "epoch": 7.027010435850215, + "grad_norm": 0.16219666600227356, + "learning_rate": 2.1446855074103968e-05, + "loss": 1.6406, + "step": 22894 + }, + { + "epoch": 7.02731737262124, + "grad_norm": 0.28709226846694946, + "learning_rate": 2.144277485125895e-05, + "loss": 1.7021, + "step": 22895 + }, + { + "epoch": 7.027624309392265, + "grad_norm": 0.23238243162631989, + "learning_rate": 2.1438694910633174e-05, + "loss": 1.7347, + "step": 22896 + }, + { + "epoch": 7.02793124616329, + "grad_norm": 0.2692428231239319, + "learning_rate": 2.1434615252266948e-05, + "loss": 1.7192, + "step": 22897 + }, + { + "epoch": 7.028238182934316, + "grad_norm": 0.21163232624530792, + "learning_rate": 2.1430535876200584e-05, + "loss": 1.7437, + "step": 22898 + }, + { + "epoch": 7.028545119705341, + "grad_norm": 0.23896420001983643, + "learning_rate": 2.1426456782474446e-05, + "loss": 1.6773, + "step": 22899 + }, + { + "epoch": 7.0288520564763655, + "grad_norm": 0.19021281599998474, + "learning_rate": 2.142237797112877e-05, + "loss": 1.7084, + "step": 22900 + }, + { + "epoch": 7.029158993247391, + 
"grad_norm": 0.23483091592788696, + "learning_rate": 2.1418299442203926e-05, + "loss": 1.7678, + "step": 22901 + }, + { + "epoch": 7.029465930018416, + "grad_norm": 0.20831161737442017, + "learning_rate": 2.1414221195740213e-05, + "loss": 1.7454, + "step": 22902 + }, + { + "epoch": 7.0297728667894415, + "grad_norm": 0.1961016207933426, + "learning_rate": 2.141014323177789e-05, + "loss": 1.7231, + "step": 22903 + }, + { + "epoch": 7.030079803560467, + "grad_norm": 0.1877545267343521, + "learning_rate": 2.1406065550357322e-05, + "loss": 1.6925, + "step": 22904 + }, + { + "epoch": 7.030386740331492, + "grad_norm": 0.20815789699554443, + "learning_rate": 2.1401988151518738e-05, + "loss": 1.7762, + "step": 22905 + }, + { + "epoch": 7.030693677102517, + "grad_norm": 0.1902543157339096, + "learning_rate": 2.1397911035302487e-05, + "loss": 1.7663, + "step": 22906 + }, + { + "epoch": 7.031000613873542, + "grad_norm": 0.20552431046962738, + "learning_rate": 2.1393834201748846e-05, + "loss": 1.7048, + "step": 22907 + }, + { + "epoch": 7.031307550644567, + "grad_norm": 0.2380477488040924, + "learning_rate": 2.13897576508981e-05, + "loss": 1.7685, + "step": 22908 + }, + { + "epoch": 7.031614487415593, + "grad_norm": 0.18351083993911743, + "learning_rate": 2.1385681382790536e-05, + "loss": 1.7058, + "step": 22909 + }, + { + "epoch": 7.031921424186618, + "grad_norm": 0.21992792189121246, + "learning_rate": 2.1381605397466442e-05, + "loss": 1.7608, + "step": 22910 + }, + { + "epoch": 7.032228360957642, + "grad_norm": 0.24412932991981506, + "learning_rate": 2.1377529694966097e-05, + "loss": 1.7205, + "step": 22911 + }, + { + "epoch": 7.032535297728668, + "grad_norm": 0.20398534834384918, + "learning_rate": 2.137345427532978e-05, + "loss": 1.7318, + "step": 22912 + }, + { + "epoch": 7.032842234499693, + "grad_norm": 0.2346884161233902, + "learning_rate": 2.136937913859776e-05, + "loss": 1.7159, + "step": 22913 + }, + { + "epoch": 7.033149171270718, + "grad_norm": 
0.19422392547130585, + "learning_rate": 2.1365304284810327e-05, + "loss": 1.7229, + "step": 22914 + }, + { + "epoch": 7.033456108041744, + "grad_norm": 0.24088126420974731, + "learning_rate": 2.1361229714007714e-05, + "loss": 1.77, + "step": 22915 + }, + { + "epoch": 7.033763044812768, + "grad_norm": 0.18886598944664001, + "learning_rate": 2.135715542623026e-05, + "loss": 1.7724, + "step": 22916 + }, + { + "epoch": 7.0340699815837935, + "grad_norm": 0.18816733360290527, + "learning_rate": 2.135308142151814e-05, + "loss": 1.7174, + "step": 22917 + }, + { + "epoch": 7.034376918354819, + "grad_norm": 0.184849813580513, + "learning_rate": 2.1349007699911694e-05, + "loss": 1.7026, + "step": 22918 + }, + { + "epoch": 7.034683855125844, + "grad_norm": 0.1638055443763733, + "learning_rate": 2.134493426145113e-05, + "loss": 1.683, + "step": 22919 + }, + { + "epoch": 7.0349907918968695, + "grad_norm": 0.18030275404453278, + "learning_rate": 2.1340861106176713e-05, + "loss": 1.6963, + "step": 22920 + }, + { + "epoch": 7.035297728667895, + "grad_norm": 0.221226304769516, + "learning_rate": 2.133678823412873e-05, + "loss": 1.7851, + "step": 22921 + }, + { + "epoch": 7.035604665438919, + "grad_norm": 0.18877451121807098, + "learning_rate": 2.1332715645347373e-05, + "loss": 1.7111, + "step": 22922 + }, + { + "epoch": 7.035911602209945, + "grad_norm": 0.17179232835769653, + "learning_rate": 2.1328643339872938e-05, + "loss": 1.6737, + "step": 22923 + }, + { + "epoch": 7.03621853898097, + "grad_norm": 0.17912441492080688, + "learning_rate": 2.1324571317745657e-05, + "loss": 1.7798, + "step": 22924 + }, + { + "epoch": 7.036525475751995, + "grad_norm": 0.2120780050754547, + "learning_rate": 2.132049957900577e-05, + "loss": 1.7353, + "step": 22925 + }, + { + "epoch": 7.036832412523021, + "grad_norm": 0.17286419868469238, + "learning_rate": 2.1316428123693517e-05, + "loss": 1.667, + "step": 22926 + }, + { + "epoch": 7.037139349294045, + "grad_norm": 0.1824301779270172, + 
"learning_rate": 2.1312356951849126e-05, + "loss": 1.6925, + "step": 22927 + }, + { + "epoch": 7.03744628606507, + "grad_norm": 0.16392327845096588, + "learning_rate": 2.1308286063512843e-05, + "loss": 1.7145, + "step": 22928 + }, + { + "epoch": 7.037753222836096, + "grad_norm": 0.18268297612667084, + "learning_rate": 2.1304215458724895e-05, + "loss": 1.7251, + "step": 22929 + }, + { + "epoch": 7.038060159607121, + "grad_norm": 0.19878868758678436, + "learning_rate": 2.1300145137525505e-05, + "loss": 1.7192, + "step": 22930 + }, + { + "epoch": 7.038367096378146, + "grad_norm": 0.18570293486118317, + "learning_rate": 2.1296075099954908e-05, + "loss": 1.718, + "step": 22931 + }, + { + "epoch": 7.038674033149171, + "grad_norm": 0.16497015953063965, + "learning_rate": 2.12920053460533e-05, + "loss": 1.6914, + "step": 22932 + }, + { + "epoch": 7.038980969920196, + "grad_norm": 0.20224586129188538, + "learning_rate": 2.128793587586096e-05, + "loss": 1.6941, + "step": 22933 + }, + { + "epoch": 7.0392879066912215, + "grad_norm": 0.22124920785427094, + "learning_rate": 2.1283866689418024e-05, + "loss": 1.7921, + "step": 22934 + }, + { + "epoch": 7.039594843462247, + "grad_norm": 0.20548123121261597, + "learning_rate": 2.127979778676479e-05, + "loss": 1.7488, + "step": 22935 + }, + { + "epoch": 7.039901780233272, + "grad_norm": 0.17604656517505646, + "learning_rate": 2.1275729167941405e-05, + "loss": 1.7145, + "step": 22936 + }, + { + "epoch": 7.0402087170042975, + "grad_norm": 0.17899781465530396, + "learning_rate": 2.127166083298809e-05, + "loss": 1.6703, + "step": 22937 + }, + { + "epoch": 7.040515653775322, + "grad_norm": 0.16101998090744019, + "learning_rate": 2.126759278194509e-05, + "loss": 1.715, + "step": 22938 + }, + { + "epoch": 7.040822590546347, + "grad_norm": 0.22807051241397858, + "learning_rate": 2.1263525014852542e-05, + "loss": 1.7409, + "step": 22939 + }, + { + "epoch": 7.041129527317373, + "grad_norm": 0.19442932307720184, + "learning_rate": 
2.125945753175072e-05, + "loss": 1.6953, + "step": 22940 + }, + { + "epoch": 7.041436464088398, + "grad_norm": 0.24816946685314178, + "learning_rate": 2.1255390332679755e-05, + "loss": 1.7527, + "step": 22941 + }, + { + "epoch": 7.041743400859423, + "grad_norm": 0.26748740673065186, + "learning_rate": 2.1251323417679882e-05, + "loss": 1.7703, + "step": 22942 + }, + { + "epoch": 7.042050337630448, + "grad_norm": 0.19965825974941254, + "learning_rate": 2.124725678679128e-05, + "loss": 1.7303, + "step": 22943 + }, + { + "epoch": 7.042357274401473, + "grad_norm": 0.2442217618227005, + "learning_rate": 2.124319044005414e-05, + "loss": 1.7183, + "step": 22944 + }, + { + "epoch": 7.042664211172498, + "grad_norm": 0.21421664953231812, + "learning_rate": 2.1239124377508646e-05, + "loss": 1.7348, + "step": 22945 + }, + { + "epoch": 7.042971147943524, + "grad_norm": 0.26072144508361816, + "learning_rate": 2.1235058599194984e-05, + "loss": 1.7396, + "step": 22946 + }, + { + "epoch": 7.043278084714549, + "grad_norm": 0.20694412291049957, + "learning_rate": 2.1230993105153335e-05, + "loss": 1.7871, + "step": 22947 + }, + { + "epoch": 7.043585021485574, + "grad_norm": 0.298551082611084, + "learning_rate": 2.122692789542387e-05, + "loss": 1.7051, + "step": 22948 + }, + { + "epoch": 7.043891958256599, + "grad_norm": 0.22547855973243713, + "learning_rate": 2.1222862970046752e-05, + "loss": 1.7392, + "step": 22949 + }, + { + "epoch": 7.044198895027624, + "grad_norm": 0.3150571882724762, + "learning_rate": 2.1218798329062205e-05, + "loss": 1.6705, + "step": 22950 + }, + { + "epoch": 7.0445058317986495, + "grad_norm": 0.2025378942489624, + "learning_rate": 2.1214733972510327e-05, + "loss": 1.7114, + "step": 22951 + }, + { + "epoch": 7.044812768569675, + "grad_norm": 0.29046711325645447, + "learning_rate": 2.1210669900431353e-05, + "loss": 1.7745, + "step": 22952 + }, + { + "epoch": 7.0451197053407, + "grad_norm": 0.23395368456840515, + "learning_rate": 2.1206606112865396e-05, + "loss": 
1.7829, + "step": 22953 + }, + { + "epoch": 7.045426642111725, + "grad_norm": 0.21395133435726166, + "learning_rate": 2.1202542609852616e-05, + "loss": 1.7211, + "step": 22954 + }, + { + "epoch": 7.04573357888275, + "grad_norm": 0.18077452480793, + "learning_rate": 2.1198479391433223e-05, + "loss": 1.7584, + "step": 22955 + }, + { + "epoch": 7.046040515653775, + "grad_norm": 0.17318682372570038, + "learning_rate": 2.1194416457647302e-05, + "loss": 1.7525, + "step": 22956 + }, + { + "epoch": 7.046347452424801, + "grad_norm": 0.18798092007637024, + "learning_rate": 2.119035380853508e-05, + "loss": 1.7525, + "step": 22957 + }, + { + "epoch": 7.046654389195826, + "grad_norm": 0.18679840862751007, + "learning_rate": 2.118629144413663e-05, + "loss": 1.7729, + "step": 22958 + }, + { + "epoch": 7.04696132596685, + "grad_norm": 0.17846907675266266, + "learning_rate": 2.1182229364492156e-05, + "loss": 1.7354, + "step": 22959 + }, + { + "epoch": 7.047268262737876, + "grad_norm": 0.22771520912647247, + "learning_rate": 2.1178167569641783e-05, + "loss": 1.7086, + "step": 22960 + }, + { + "epoch": 7.047575199508901, + "grad_norm": 0.1541738212108612, + "learning_rate": 2.1174106059625642e-05, + "loss": 1.67, + "step": 22961 + }, + { + "epoch": 7.047882136279926, + "grad_norm": 0.17698390781879425, + "learning_rate": 2.117004483448389e-05, + "loss": 1.68, + "step": 22962 + }, + { + "epoch": 7.048189073050952, + "grad_norm": 0.2220597118139267, + "learning_rate": 2.1165983894256647e-05, + "loss": 1.7783, + "step": 22963 + }, + { + "epoch": 7.048496009821977, + "grad_norm": 0.20971544086933136, + "learning_rate": 2.1161923238984055e-05, + "loss": 1.7318, + "step": 22964 + }, + { + "epoch": 7.0488029465930016, + "grad_norm": 0.2032100409269333, + "learning_rate": 2.1157862868706242e-05, + "loss": 1.6736, + "step": 22965 + }, + { + "epoch": 7.049109883364027, + "grad_norm": 0.19177256524562836, + "learning_rate": 2.115380278346331e-05, + "loss": 1.74, + "step": 22966 + }, + { + 
"epoch": 7.049416820135052, + "grad_norm": 0.1956746131181717, + "learning_rate": 2.1149742983295446e-05, + "loss": 1.7251, + "step": 22967 + }, + { + "epoch": 7.0497237569060776, + "grad_norm": 0.16200929880142212, + "learning_rate": 2.114568346824269e-05, + "loss": 1.6735, + "step": 22968 + }, + { + "epoch": 7.050030693677103, + "grad_norm": 0.19551095366477966, + "learning_rate": 2.1141624238345242e-05, + "loss": 1.7185, + "step": 22969 + }, + { + "epoch": 7.050337630448127, + "grad_norm": 0.17967839539051056, + "learning_rate": 2.1137565293643158e-05, + "loss": 1.7262, + "step": 22970 + }, + { + "epoch": 7.050644567219153, + "grad_norm": 0.15093082189559937, + "learning_rate": 2.1133506634176552e-05, + "loss": 1.6695, + "step": 22971 + }, + { + "epoch": 7.050951503990178, + "grad_norm": 0.20207351446151733, + "learning_rate": 2.1129448259985595e-05, + "loss": 1.7448, + "step": 22972 + }, + { + "epoch": 7.051258440761203, + "grad_norm": 0.20243801176548004, + "learning_rate": 2.112539017111031e-05, + "loss": 1.7496, + "step": 22973 + }, + { + "epoch": 7.051565377532229, + "grad_norm": 0.1967451572418213, + "learning_rate": 2.112133236759088e-05, + "loss": 1.718, + "step": 22974 + }, + { + "epoch": 7.051872314303253, + "grad_norm": 0.17668583989143372, + "learning_rate": 2.1117274849467334e-05, + "loss": 1.7295, + "step": 22975 + }, + { + "epoch": 7.0521792510742785, + "grad_norm": 0.17461778223514557, + "learning_rate": 2.1113217616779824e-05, + "loss": 1.7166, + "step": 22976 + }, + { + "epoch": 7.052486187845304, + "grad_norm": 0.18184112012386322, + "learning_rate": 2.110916066956843e-05, + "loss": 1.7092, + "step": 22977 + }, + { + "epoch": 7.052793124616329, + "grad_norm": 0.18001540005207062, + "learning_rate": 2.1105104007873246e-05, + "loss": 1.7129, + "step": 22978 + }, + { + "epoch": 7.0531000613873545, + "grad_norm": 0.15966519713401794, + "learning_rate": 2.1101047631734355e-05, + "loss": 1.7121, + "step": 22979 + }, + { + "epoch": 7.05340699815838, 
+ "grad_norm": 0.20201170444488525, + "learning_rate": 2.109699154119185e-05, + "loss": 1.7266, + "step": 22980 + }, + { + "epoch": 7.053713934929404, + "grad_norm": 0.19559438526630402, + "learning_rate": 2.1092935736285817e-05, + "loss": 1.7492, + "step": 22981 + }, + { + "epoch": 7.05402087170043, + "grad_norm": 0.17783302068710327, + "learning_rate": 2.108888021705634e-05, + "loss": 1.6901, + "step": 22982 + }, + { + "epoch": 7.054327808471455, + "grad_norm": 0.22052957117557526, + "learning_rate": 2.108482498354347e-05, + "loss": 1.6771, + "step": 22983 + }, + { + "epoch": 7.05463474524248, + "grad_norm": 0.1899181455373764, + "learning_rate": 2.1080770035787346e-05, + "loss": 1.7011, + "step": 22984 + }, + { + "epoch": 7.054941682013506, + "grad_norm": 0.19773316383361816, + "learning_rate": 2.1076715373827964e-05, + "loss": 1.7535, + "step": 22985 + }, + { + "epoch": 7.05524861878453, + "grad_norm": 0.2244229018688202, + "learning_rate": 2.1072660997705475e-05, + "loss": 1.7938, + "step": 22986 + }, + { + "epoch": 7.055555555555555, + "grad_norm": 0.18881015479564667, + "learning_rate": 2.106860690745988e-05, + "loss": 1.6753, + "step": 22987 + }, + { + "epoch": 7.055862492326581, + "grad_norm": 0.19642052054405212, + "learning_rate": 2.106455310313126e-05, + "loss": 1.735, + "step": 22988 + }, + { + "epoch": 7.056169429097606, + "grad_norm": 0.23549412190914154, + "learning_rate": 2.106049958475971e-05, + "loss": 1.7705, + "step": 22989 + }, + { + "epoch": 7.056476365868631, + "grad_norm": 0.21001911163330078, + "learning_rate": 2.1056446352385235e-05, + "loss": 1.6802, + "step": 22990 + }, + { + "epoch": 7.056783302639656, + "grad_norm": 0.1821003556251526, + "learning_rate": 2.1052393406047953e-05, + "loss": 1.7144, + "step": 22991 + }, + { + "epoch": 7.057090239410681, + "grad_norm": 0.1979309767484665, + "learning_rate": 2.104834074578786e-05, + "loss": 1.6983, + "step": 22992 + }, + { + "epoch": 7.0573971761817065, + "grad_norm": 0.18264134228229523, + 
"learning_rate": 2.1044288371645045e-05, + "loss": 1.7001, + "step": 22993 + }, + { + "epoch": 7.057704112952732, + "grad_norm": 0.17276059091091156, + "learning_rate": 2.104023628365954e-05, + "loss": 1.6976, + "step": 22994 + }, + { + "epoch": 7.058011049723757, + "grad_norm": 0.18879400193691254, + "learning_rate": 2.1036184481871402e-05, + "loss": 1.6954, + "step": 22995 + }, + { + "epoch": 7.0583179864947825, + "grad_norm": 0.1956210434436798, + "learning_rate": 2.103213296632066e-05, + "loss": 1.7329, + "step": 22996 + }, + { + "epoch": 7.058624923265807, + "grad_norm": 0.21108154952526093, + "learning_rate": 2.1028081737047356e-05, + "loss": 1.7299, + "step": 22997 + }, + { + "epoch": 7.058931860036832, + "grad_norm": 0.17981186509132385, + "learning_rate": 2.1024030794091537e-05, + "loss": 1.7162, + "step": 22998 + }, + { + "epoch": 7.059238796807858, + "grad_norm": 0.1699269711971283, + "learning_rate": 2.101998013749322e-05, + "loss": 1.6842, + "step": 22999 + }, + { + "epoch": 7.059545733578883, + "grad_norm": 0.17033198475837708, + "learning_rate": 2.1015929767292435e-05, + "loss": 1.6735, + "step": 23000 + }, + { + "epoch": 7.059852670349908, + "grad_norm": 0.18620076775550842, + "learning_rate": 2.101187968352925e-05, + "loss": 1.7328, + "step": 23001 + }, + { + "epoch": 7.060159607120933, + "grad_norm": 0.17528964579105377, + "learning_rate": 2.100782988624363e-05, + "loss": 1.6567, + "step": 23002 + }, + { + "epoch": 7.060466543891958, + "grad_norm": 0.1946999728679657, + "learning_rate": 2.100378037547566e-05, + "loss": 1.7349, + "step": 23003 + }, + { + "epoch": 7.060773480662983, + "grad_norm": 0.23345647752285004, + "learning_rate": 2.0999731151265312e-05, + "loss": 1.7185, + "step": 23004 + }, + { + "epoch": 7.061080417434009, + "grad_norm": 0.20169813930988312, + "learning_rate": 2.0995682213652603e-05, + "loss": 1.7223, + "step": 23005 + }, + { + "epoch": 7.061387354205034, + "grad_norm": 0.2397730052471161, + "learning_rate": 
2.0991633562677594e-05, + "loss": 1.7542, + "step": 23006 + }, + { + "epoch": 7.0616942909760585, + "grad_norm": 0.20421954989433289, + "learning_rate": 2.0987585198380227e-05, + "loss": 1.6888, + "step": 23007 + }, + { + "epoch": 7.062001227747084, + "grad_norm": 0.21555101871490479, + "learning_rate": 2.0983537120800584e-05, + "loss": 1.6796, + "step": 23008 + }, + { + "epoch": 7.062308164518109, + "grad_norm": 0.17311134934425354, + "learning_rate": 2.0979489329978603e-05, + "loss": 1.7199, + "step": 23009 + }, + { + "epoch": 7.0626151012891345, + "grad_norm": 0.25064393877983093, + "learning_rate": 2.0975441825954334e-05, + "loss": 1.6947, + "step": 23010 + }, + { + "epoch": 7.06292203806016, + "grad_norm": 0.19135847687721252, + "learning_rate": 2.0971394608767757e-05, + "loss": 1.702, + "step": 23011 + }, + { + "epoch": 7.063228974831185, + "grad_norm": 0.22994364798069, + "learning_rate": 2.0967347678458876e-05, + "loss": 1.6814, + "step": 23012 + }, + { + "epoch": 7.06353591160221, + "grad_norm": 0.21897611021995544, + "learning_rate": 2.0963301035067685e-05, + "loss": 1.7063, + "step": 23013 + }, + { + "epoch": 7.063842848373235, + "grad_norm": 0.23615150153636932, + "learning_rate": 2.0959254678634166e-05, + "loss": 1.7299, + "step": 23014 + }, + { + "epoch": 7.06414978514426, + "grad_norm": 0.1837770640850067, + "learning_rate": 2.0955208609198314e-05, + "loss": 1.7236, + "step": 23015 + }, + { + "epoch": 7.064456721915286, + "grad_norm": 0.16823385655879974, + "learning_rate": 2.0951162826800118e-05, + "loss": 1.6687, + "step": 23016 + }, + { + "epoch": 7.064763658686311, + "grad_norm": 0.17042338848114014, + "learning_rate": 2.094711733147954e-05, + "loss": 1.6907, + "step": 23017 + }, + { + "epoch": 7.065070595457335, + "grad_norm": 0.1753006875514984, + "learning_rate": 2.094307212327661e-05, + "loss": 1.7313, + "step": 23018 + }, + { + "epoch": 7.065377532228361, + "grad_norm": 0.19618375599384308, + "learning_rate": 2.093902720223123e-05, + "loss": 
1.7147, + "step": 23019 + }, + { + "epoch": 7.065684468999386, + "grad_norm": 0.20214296877384186, + "learning_rate": 2.093498256838346e-05, + "loss": 1.7056, + "step": 23020 + }, + { + "epoch": 7.065991405770411, + "grad_norm": 0.20230883359909058, + "learning_rate": 2.093093822177321e-05, + "loss": 1.6628, + "step": 23021 + }, + { + "epoch": 7.066298342541437, + "grad_norm": 0.19913128018379211, + "learning_rate": 2.0926894162440446e-05, + "loss": 1.7286, + "step": 23022 + }, + { + "epoch": 7.066605279312462, + "grad_norm": 0.19535091519355774, + "learning_rate": 2.0922850390425193e-05, + "loss": 1.745, + "step": 23023 + }, + { + "epoch": 7.0669122160834865, + "grad_norm": 0.19679825007915497, + "learning_rate": 2.0918806905767337e-05, + "loss": 1.694, + "step": 23024 + }, + { + "epoch": 7.067219152854512, + "grad_norm": 0.1821403056383133, + "learning_rate": 2.0914763708506913e-05, + "loss": 1.7163, + "step": 23025 + }, + { + "epoch": 7.067526089625537, + "grad_norm": 0.17138415575027466, + "learning_rate": 2.0910720798683803e-05, + "loss": 1.6946, + "step": 23026 + }, + { + "epoch": 7.0678330263965625, + "grad_norm": 0.20219111442565918, + "learning_rate": 2.0906678176338017e-05, + "loss": 1.7437, + "step": 23027 + }, + { + "epoch": 7.068139963167588, + "grad_norm": 0.1985882669687271, + "learning_rate": 2.0902635841509494e-05, + "loss": 1.6762, + "step": 23028 + }, + { + "epoch": 7.068446899938612, + "grad_norm": 0.18586322665214539, + "learning_rate": 2.0898593794238174e-05, + "loss": 1.7296, + "step": 23029 + }, + { + "epoch": 7.068753836709638, + "grad_norm": 0.19222751259803772, + "learning_rate": 2.0894552034564013e-05, + "loss": 1.7186, + "step": 23030 + }, + { + "epoch": 7.069060773480663, + "grad_norm": 0.16107569634914398, + "learning_rate": 2.0890510562526944e-05, + "loss": 1.6898, + "step": 23031 + }, + { + "epoch": 7.069367710251688, + "grad_norm": 0.23859064280986786, + "learning_rate": 2.088646937816691e-05, + "loss": 1.7992, + "step": 23032 + }, 
+ { + "epoch": 7.069674647022714, + "grad_norm": 0.22927051782608032, + "learning_rate": 2.0882428481523853e-05, + "loss": 1.7162, + "step": 23033 + }, + { + "epoch": 7.069981583793738, + "grad_norm": 0.18094350397586823, + "learning_rate": 2.0878387872637684e-05, + "loss": 1.7297, + "step": 23034 + }, + { + "epoch": 7.070288520564763, + "grad_norm": 0.20562811195850372, + "learning_rate": 2.087434755154839e-05, + "loss": 1.7475, + "step": 23035 + }, + { + "epoch": 7.070595457335789, + "grad_norm": 0.18405984342098236, + "learning_rate": 2.087030751829583e-05, + "loss": 1.6954, + "step": 23036 + }, + { + "epoch": 7.070902394106814, + "grad_norm": 0.26286160945892334, + "learning_rate": 2.0866267772919994e-05, + "loss": 1.7406, + "step": 23037 + }, + { + "epoch": 7.071209330877839, + "grad_norm": 0.1688467413187027, + "learning_rate": 2.086222831546077e-05, + "loss": 1.7375, + "step": 23038 + }, + { + "epoch": 7.071516267648865, + "grad_norm": 0.25445011258125305, + "learning_rate": 2.0858189145958057e-05, + "loss": 1.7479, + "step": 23039 + }, + { + "epoch": 7.071823204419889, + "grad_norm": 0.20637978613376617, + "learning_rate": 2.085415026445184e-05, + "loss": 1.7653, + "step": 23040 + }, + { + "epoch": 7.0721301411909145, + "grad_norm": 0.21693937480449677, + "learning_rate": 2.0850111670981952e-05, + "loss": 1.7392, + "step": 23041 + }, + { + "epoch": 7.07243707796194, + "grad_norm": 0.1999017745256424, + "learning_rate": 2.0846073365588388e-05, + "loss": 1.753, + "step": 23042 + }, + { + "epoch": 7.072744014732965, + "grad_norm": 0.2271260917186737, + "learning_rate": 2.0842035348310973e-05, + "loss": 1.7136, + "step": 23043 + }, + { + "epoch": 7.0730509515039905, + "grad_norm": 0.1915169358253479, + "learning_rate": 2.0837997619189675e-05, + "loss": 1.7142, + "step": 23044 + }, + { + "epoch": 7.073357888275015, + "grad_norm": 0.2250204086303711, + "learning_rate": 2.0833960178264377e-05, + "loss": 1.8039, + "step": 23045 + }, + { + "epoch": 7.07366482504604, 
+ "grad_norm": 0.20920081436634064, + "learning_rate": 2.0829923025574976e-05, + "loss": 1.767, + "step": 23046 + }, + { + "epoch": 7.073971761817066, + "grad_norm": 0.16039173305034637, + "learning_rate": 2.082588616116138e-05, + "loss": 1.6895, + "step": 23047 + }, + { + "epoch": 7.074278698588091, + "grad_norm": 0.1849806159734726, + "learning_rate": 2.082184958506347e-05, + "loss": 1.7323, + "step": 23048 + }, + { + "epoch": 7.074585635359116, + "grad_norm": 0.22370420396327972, + "learning_rate": 2.081781329732115e-05, + "loss": 1.7478, + "step": 23049 + }, + { + "epoch": 7.074892572130141, + "grad_norm": 0.1600474864244461, + "learning_rate": 2.0813777297974296e-05, + "loss": 1.6754, + "step": 23050 + }, + { + "epoch": 7.075199508901166, + "grad_norm": 0.18357187509536743, + "learning_rate": 2.080974158706281e-05, + "loss": 1.694, + "step": 23051 + }, + { + "epoch": 7.0755064456721914, + "grad_norm": 0.17667005956172943, + "learning_rate": 2.080570616462656e-05, + "loss": 1.7053, + "step": 23052 + }, + { + "epoch": 7.075813382443217, + "grad_norm": 0.19393591582775116, + "learning_rate": 2.0801671030705417e-05, + "loss": 1.7917, + "step": 23053 + }, + { + "epoch": 7.076120319214242, + "grad_norm": 0.19432564079761505, + "learning_rate": 2.0797636185339307e-05, + "loss": 1.7276, + "step": 23054 + }, + { + "epoch": 7.0764272559852675, + "grad_norm": 0.17960594594478607, + "learning_rate": 2.079360162856806e-05, + "loss": 1.6988, + "step": 23055 + }, + { + "epoch": 7.076734192756292, + "grad_norm": 0.183505579829216, + "learning_rate": 2.0789567360431538e-05, + "loss": 1.7106, + "step": 23056 + }, + { + "epoch": 7.077041129527317, + "grad_norm": 0.27859750390052795, + "learning_rate": 2.0785533380969673e-05, + "loss": 1.779, + "step": 23057 + }, + { + "epoch": 7.077348066298343, + "grad_norm": 0.1903255134820938, + "learning_rate": 2.078149969022225e-05, + "loss": 1.7334, + "step": 23058 + }, + { + "epoch": 7.077655003069368, + "grad_norm": 0.2221076786518097, + 
"learning_rate": 2.0777466288229207e-05, + "loss": 1.6863, + "step": 23059 + }, + { + "epoch": 7.077961939840393, + "grad_norm": 0.15516065061092377, + "learning_rate": 2.0773433175030336e-05, + "loss": 1.6633, + "step": 23060 + }, + { + "epoch": 7.078268876611418, + "grad_norm": 0.20073910057544708, + "learning_rate": 2.0769400350665553e-05, + "loss": 1.7057, + "step": 23061 + }, + { + "epoch": 7.078575813382443, + "grad_norm": 0.1680205762386322, + "learning_rate": 2.076536781517468e-05, + "loss": 1.6659, + "step": 23062 + }, + { + "epoch": 7.078882750153468, + "grad_norm": 0.20825456082820892, + "learning_rate": 2.0761335568597584e-05, + "loss": 1.751, + "step": 23063 + }, + { + "epoch": 7.079189686924494, + "grad_norm": 0.17365674674510956, + "learning_rate": 2.0757303610974098e-05, + "loss": 1.6591, + "step": 23064 + }, + { + "epoch": 7.079496623695519, + "grad_norm": 0.21712929010391235, + "learning_rate": 2.0753271942344087e-05, + "loss": 1.7357, + "step": 23065 + }, + { + "epoch": 7.0798035604665435, + "grad_norm": 0.1841089278459549, + "learning_rate": 2.074924056274738e-05, + "loss": 1.6818, + "step": 23066 + }, + { + "epoch": 7.080110497237569, + "grad_norm": 0.20433486998081207, + "learning_rate": 2.074520947222382e-05, + "loss": 1.76, + "step": 23067 + }, + { + "epoch": 7.080417434008594, + "grad_norm": 0.1712963879108429, + "learning_rate": 2.074117867081325e-05, + "loss": 1.6426, + "step": 23068 + }, + { + "epoch": 7.0807243707796195, + "grad_norm": 0.19894109666347504, + "learning_rate": 2.0737148158555504e-05, + "loss": 1.7529, + "step": 23069 + }, + { + "epoch": 7.081031307550645, + "grad_norm": 0.19338269531726837, + "learning_rate": 2.0733117935490386e-05, + "loss": 1.8274, + "step": 23070 + }, + { + "epoch": 7.08133824432167, + "grad_norm": 0.20883139967918396, + "learning_rate": 2.0729088001657794e-05, + "loss": 1.7275, + "step": 23071 + }, + { + "epoch": 7.081645181092695, + "grad_norm": 0.18498694896697998, + "learning_rate": 
2.0725058357097487e-05, + "loss": 1.6648, + "step": 23072 + }, + { + "epoch": 7.08195211786372, + "grad_norm": 0.1727421134710312, + "learning_rate": 2.0721029001849313e-05, + "loss": 1.7709, + "step": 23073 + }, + { + "epoch": 7.082259054634745, + "grad_norm": 0.16965949535369873, + "learning_rate": 2.0716999935953096e-05, + "loss": 1.6876, + "step": 23074 + }, + { + "epoch": 7.082565991405771, + "grad_norm": 0.16905519366264343, + "learning_rate": 2.0712971159448623e-05, + "loss": 1.6576, + "step": 23075 + }, + { + "epoch": 7.082872928176796, + "grad_norm": 0.2863580882549286, + "learning_rate": 2.0708942672375776e-05, + "loss": 1.7631, + "step": 23076 + }, + { + "epoch": 7.08317986494782, + "grad_norm": 0.26248931884765625, + "learning_rate": 2.070491447477429e-05, + "loss": 1.7692, + "step": 23077 + }, + { + "epoch": 7.083486801718846, + "grad_norm": 0.17670878767967224, + "learning_rate": 2.0700886566684024e-05, + "loss": 1.6725, + "step": 23078 + }, + { + "epoch": 7.083793738489871, + "grad_norm": 0.19245800375938416, + "learning_rate": 2.0696858948144775e-05, + "loss": 1.7249, + "step": 23079 + }, + { + "epoch": 7.084100675260896, + "grad_norm": 0.18651939928531647, + "learning_rate": 2.0692831619196335e-05, + "loss": 1.7616, + "step": 23080 + }, + { + "epoch": 7.084407612031922, + "grad_norm": 0.21432510018348694, + "learning_rate": 2.0688804579878514e-05, + "loss": 1.743, + "step": 23081 + }, + { + "epoch": 7.084714548802946, + "grad_norm": 0.18530069291591644, + "learning_rate": 2.0684777830231106e-05, + "loss": 1.7257, + "step": 23082 + }, + { + "epoch": 7.0850214855739715, + "grad_norm": 0.1974172443151474, + "learning_rate": 2.0680751370293903e-05, + "loss": 1.6918, + "step": 23083 + }, + { + "epoch": 7.085328422344997, + "grad_norm": 0.19517268240451813, + "learning_rate": 2.0676725200106706e-05, + "loss": 1.7421, + "step": 23084 + }, + { + "epoch": 7.085635359116022, + "grad_norm": 0.28572699427604675, + "learning_rate": 2.067269931970929e-05, + 
"loss": 1.7575, + "step": 23085 + }, + { + "epoch": 7.0859422958870475, + "grad_norm": 0.2062397003173828, + "learning_rate": 2.0668673729141452e-05, + "loss": 1.7085, + "step": 23086 + }, + { + "epoch": 7.086249232658073, + "grad_norm": 0.21619725227355957, + "learning_rate": 2.0664648428442973e-05, + "loss": 1.7783, + "step": 23087 + }, + { + "epoch": 7.086556169429097, + "grad_norm": 0.2732481360435486, + "learning_rate": 2.066062341765363e-05, + "loss": 1.7089, + "step": 23088 + }, + { + "epoch": 7.086863106200123, + "grad_norm": 0.19897356629371643, + "learning_rate": 2.06565986968132e-05, + "loss": 1.6487, + "step": 23089 + }, + { + "epoch": 7.087170042971148, + "grad_norm": 0.2578796148300171, + "learning_rate": 2.0652574265961466e-05, + "loss": 1.7385, + "step": 23090 + }, + { + "epoch": 7.087476979742173, + "grad_norm": 0.18980316817760468, + "learning_rate": 2.0648550125138195e-05, + "loss": 1.6651, + "step": 23091 + }, + { + "epoch": 7.087783916513199, + "grad_norm": 0.279580682516098, + "learning_rate": 2.064452627438313e-05, + "loss": 1.7189, + "step": 23092 + }, + { + "epoch": 7.088090853284223, + "grad_norm": 0.18652775883674622, + "learning_rate": 2.0640502713736103e-05, + "loss": 1.7085, + "step": 23093 + }, + { + "epoch": 7.088397790055248, + "grad_norm": 0.2729358673095703, + "learning_rate": 2.06364794432368e-05, + "loss": 1.6812, + "step": 23094 + }, + { + "epoch": 7.088704726826274, + "grad_norm": 0.1756472885608673, + "learning_rate": 2.0632456462925053e-05, + "loss": 1.6835, + "step": 23095 + }, + { + "epoch": 7.089011663597299, + "grad_norm": 0.2352994978427887, + "learning_rate": 2.062843377284055e-05, + "loss": 1.6898, + "step": 23096 + }, + { + "epoch": 7.089318600368324, + "grad_norm": 0.20231495797634125, + "learning_rate": 2.0624411373023093e-05, + "loss": 1.7294, + "step": 23097 + }, + { + "epoch": 7.08962553713935, + "grad_norm": 0.276114821434021, + "learning_rate": 2.0620389263512424e-05, + "loss": 1.6864, + "step": 23098 + }, + { 
+ "epoch": 7.089932473910374, + "grad_norm": 0.2178632766008377, + "learning_rate": 2.0616367444348288e-05, + "loss": 1.7353, + "step": 23099 + }, + { + "epoch": 7.0902394106813995, + "grad_norm": 0.20966552197933197, + "learning_rate": 2.061234591557043e-05, + "loss": 1.6579, + "step": 23100 + }, + { + "epoch": 7.090546347452425, + "grad_norm": 0.16496559977531433, + "learning_rate": 2.0608324677218592e-05, + "loss": 1.7137, + "step": 23101 + }, + { + "epoch": 7.09085328422345, + "grad_norm": 0.19176827371120453, + "learning_rate": 2.0604303729332525e-05, + "loss": 1.6996, + "step": 23102 + }, + { + "epoch": 7.0911602209944755, + "grad_norm": 0.20933480560779572, + "learning_rate": 2.060028307195195e-05, + "loss": 1.7887, + "step": 23103 + }, + { + "epoch": 7.0914671577655, + "grad_norm": 0.1925809681415558, + "learning_rate": 2.0596262705116613e-05, + "loss": 1.6974, + "step": 23104 + }, + { + "epoch": 7.091774094536525, + "grad_norm": 0.1582585573196411, + "learning_rate": 2.0592242628866236e-05, + "loss": 1.6731, + "step": 23105 + }, + { + "epoch": 7.092081031307551, + "grad_norm": 0.20380592346191406, + "learning_rate": 2.058822284324056e-05, + "loss": 1.6911, + "step": 23106 + }, + { + "epoch": 7.092387968078576, + "grad_norm": 0.17984862625598907, + "learning_rate": 2.0584203348279307e-05, + "loss": 1.7218, + "step": 23107 + }, + { + "epoch": 7.092694904849601, + "grad_norm": 0.22097790241241455, + "learning_rate": 2.058018414402219e-05, + "loss": 1.7223, + "step": 23108 + }, + { + "epoch": 7.093001841620626, + "grad_norm": 0.20519912242889404, + "learning_rate": 2.0576165230508926e-05, + "loss": 1.7197, + "step": 23109 + }, + { + "epoch": 7.093308778391651, + "grad_norm": 0.2156807780265808, + "learning_rate": 2.0572146607779274e-05, + "loss": 1.7079, + "step": 23110 + }, + { + "epoch": 7.093615715162676, + "grad_norm": 0.21810726821422577, + "learning_rate": 2.056812827587288e-05, + "loss": 1.7456, + "step": 23111 + }, + { + "epoch": 7.093922651933702, + 
"grad_norm": 0.2288726568222046, + "learning_rate": 2.0564110234829536e-05, + "loss": 1.8113, + "step": 23112 + }, + { + "epoch": 7.094229588704727, + "grad_norm": 0.21279199421405792, + "learning_rate": 2.056009248468887e-05, + "loss": 1.7554, + "step": 23113 + }, + { + "epoch": 7.094536525475752, + "grad_norm": 0.18577606976032257, + "learning_rate": 2.055607502549064e-05, + "loss": 1.661, + "step": 23114 + }, + { + "epoch": 7.094843462246777, + "grad_norm": 0.17938728630542755, + "learning_rate": 2.0552057857274536e-05, + "loss": 1.6998, + "step": 23115 + }, + { + "epoch": 7.095150399017802, + "grad_norm": 0.1946432888507843, + "learning_rate": 2.0548040980080258e-05, + "loss": 1.7146, + "step": 23116 + }, + { + "epoch": 7.0954573357888275, + "grad_norm": 0.21220463514328003, + "learning_rate": 2.0544024393947496e-05, + "loss": 1.7345, + "step": 23117 + }, + { + "epoch": 7.095764272559853, + "grad_norm": 0.2006370723247528, + "learning_rate": 2.0540008098915954e-05, + "loss": 1.7636, + "step": 23118 + }, + { + "epoch": 7.096071209330878, + "grad_norm": 0.17251192033290863, + "learning_rate": 2.0535992095025312e-05, + "loss": 1.7103, + "step": 23119 + }, + { + "epoch": 7.096378146101903, + "grad_norm": 0.2393570840358734, + "learning_rate": 2.0531976382315277e-05, + "loss": 1.7636, + "step": 23120 + }, + { + "epoch": 7.096685082872928, + "grad_norm": 0.16999265551567078, + "learning_rate": 2.0527960960825516e-05, + "loss": 1.6571, + "step": 23121 + }, + { + "epoch": 7.096992019643953, + "grad_norm": 0.17626826465129852, + "learning_rate": 2.052394583059572e-05, + "loss": 1.713, + "step": 23122 + }, + { + "epoch": 7.097298956414979, + "grad_norm": 0.18373346328735352, + "learning_rate": 2.051993099166557e-05, + "loss": 1.7102, + "step": 23123 + }, + { + "epoch": 7.097605893186004, + "grad_norm": 0.1913219541311264, + "learning_rate": 2.0515916444074734e-05, + "loss": 1.7441, + "step": 23124 + }, + { + "epoch": 7.097912829957028, + "grad_norm": 0.19664399325847626, 
+ "learning_rate": 2.0511902187862903e-05, + "loss": 1.6866, + "step": 23125 + }, + { + "epoch": 7.098219766728054, + "grad_norm": 0.16524936258792877, + "learning_rate": 2.050788822306971e-05, + "loss": 1.6709, + "step": 23126 + }, + { + "epoch": 7.098526703499079, + "grad_norm": 0.19291190803050995, + "learning_rate": 2.050387454973489e-05, + "loss": 1.7033, + "step": 23127 + }, + { + "epoch": 7.098833640270104, + "grad_norm": 0.19915525615215302, + "learning_rate": 2.0499861167898037e-05, + "loss": 1.7425, + "step": 23128 + }, + { + "epoch": 7.09914057704113, + "grad_norm": 0.21295227110385895, + "learning_rate": 2.0495848077598883e-05, + "loss": 1.7516, + "step": 23129 + }, + { + "epoch": 7.099447513812155, + "grad_norm": 0.21469831466674805, + "learning_rate": 2.0491835278877014e-05, + "loss": 1.7129, + "step": 23130 + }, + { + "epoch": 7.0997544505831796, + "grad_norm": 0.16860374808311462, + "learning_rate": 2.0487822771772143e-05, + "loss": 1.7172, + "step": 23131 + }, + { + "epoch": 7.100061387354205, + "grad_norm": 0.22386015951633453, + "learning_rate": 2.04838105563239e-05, + "loss": 1.7829, + "step": 23132 + }, + { + "epoch": 7.10036832412523, + "grad_norm": 0.22635474801063538, + "learning_rate": 2.047979863257195e-05, + "loss": 1.6956, + "step": 23133 + }, + { + "epoch": 7.100675260896256, + "grad_norm": 0.20508790016174316, + "learning_rate": 2.0475787000555924e-05, + "loss": 1.7404, + "step": 23134 + }, + { + "epoch": 7.100982197667281, + "grad_norm": 0.2055993378162384, + "learning_rate": 2.047177566031548e-05, + "loss": 1.7064, + "step": 23135 + }, + { + "epoch": 7.101289134438305, + "grad_norm": 0.19258326292037964, + "learning_rate": 2.0467764611890254e-05, + "loss": 1.7078, + "step": 23136 + }, + { + "epoch": 7.101596071209331, + "grad_norm": 0.20766718685626984, + "learning_rate": 2.046375385531989e-05, + "loss": 1.6854, + "step": 23137 + }, + { + "epoch": 7.101903007980356, + "grad_norm": 0.17945602536201477, + "learning_rate": 
2.045974339064402e-05, + "loss": 1.6986, + "step": 23138 + }, + { + "epoch": 7.102209944751381, + "grad_norm": 0.17283397912979126, + "learning_rate": 2.045573321790228e-05, + "loss": 1.7296, + "step": 23139 + }, + { + "epoch": 7.102516881522407, + "grad_norm": 0.19000805914402008, + "learning_rate": 2.0451723337134298e-05, + "loss": 1.7005, + "step": 23140 + }, + { + "epoch": 7.102823818293431, + "grad_norm": 0.1966131180524826, + "learning_rate": 2.044771374837971e-05, + "loss": 1.7574, + "step": 23141 + }, + { + "epoch": 7.1031307550644565, + "grad_norm": 0.2411719709634781, + "learning_rate": 2.0443704451678137e-05, + "loss": 1.7599, + "step": 23142 + }, + { + "epoch": 7.103437691835482, + "grad_norm": 0.23902751505374908, + "learning_rate": 2.0439695447069173e-05, + "loss": 1.6805, + "step": 23143 + }, + { + "epoch": 7.103744628606507, + "grad_norm": 0.19117529690265656, + "learning_rate": 2.0435686734592508e-05, + "loss": 1.7482, + "step": 23144 + }, + { + "epoch": 7.1040515653775325, + "grad_norm": 0.18491674959659576, + "learning_rate": 2.0431678314287678e-05, + "loss": 1.6764, + "step": 23145 + }, + { + "epoch": 7.104358502148558, + "grad_norm": 0.21000699698925018, + "learning_rate": 2.042767018619437e-05, + "loss": 1.7185, + "step": 23146 + }, + { + "epoch": 7.104665438919582, + "grad_norm": 0.17373491823673248, + "learning_rate": 2.0423662350352117e-05, + "loss": 1.6945, + "step": 23147 + }, + { + "epoch": 7.104972375690608, + "grad_norm": 0.18387937545776367, + "learning_rate": 2.041965480680059e-05, + "loss": 1.766, + "step": 23148 + }, + { + "epoch": 7.105279312461633, + "grad_norm": 0.15976013243198395, + "learning_rate": 2.0415647555579376e-05, + "loss": 1.6446, + "step": 23149 + }, + { + "epoch": 7.105586249232658, + "grad_norm": 0.19251346588134766, + "learning_rate": 2.0411640596728066e-05, + "loss": 1.7122, + "step": 23150 + }, + { + "epoch": 7.105893186003684, + "grad_norm": 0.1640147864818573, + "learning_rate": 2.040763393028627e-05, + 
"loss": 1.7057, + "step": 23151 + }, + { + "epoch": 7.106200122774708, + "grad_norm": 0.20366166532039642, + "learning_rate": 2.0403627556293577e-05, + "loss": 1.7173, + "step": 23152 + }, + { + "epoch": 7.106507059545733, + "grad_norm": 0.18549348413944244, + "learning_rate": 2.039962147478958e-05, + "loss": 1.7215, + "step": 23153 + }, + { + "epoch": 7.106813996316759, + "grad_norm": 0.16964925825595856, + "learning_rate": 2.039561568581388e-05, + "loss": 1.6931, + "step": 23154 + }, + { + "epoch": 7.107120933087784, + "grad_norm": 0.16923274099826813, + "learning_rate": 2.0391610189406058e-05, + "loss": 1.6976, + "step": 23155 + }, + { + "epoch": 7.107427869858809, + "grad_norm": 0.17707234621047974, + "learning_rate": 2.038760498560569e-05, + "loss": 1.7102, + "step": 23156 + }, + { + "epoch": 7.107734806629834, + "grad_norm": 0.2048260122537613, + "learning_rate": 2.0383600074452376e-05, + "loss": 1.7116, + "step": 23157 + }, + { + "epoch": 7.108041743400859, + "grad_norm": 0.17328095436096191, + "learning_rate": 2.037959545598568e-05, + "loss": 1.6683, + "step": 23158 + }, + { + "epoch": 7.1083486801718845, + "grad_norm": 0.15829013288021088, + "learning_rate": 2.037559113024518e-05, + "loss": 1.6617, + "step": 23159 + }, + { + "epoch": 7.10865561694291, + "grad_norm": 0.21150968968868256, + "learning_rate": 2.037158709727044e-05, + "loss": 1.7057, + "step": 23160 + }, + { + "epoch": 7.108962553713935, + "grad_norm": 0.20321892201900482, + "learning_rate": 2.0367583357101072e-05, + "loss": 1.6811, + "step": 23161 + }, + { + "epoch": 7.1092694904849605, + "grad_norm": 0.19491781294345856, + "learning_rate": 2.0363579909776583e-05, + "loss": 1.6794, + "step": 23162 + }, + { + "epoch": 7.109576427255985, + "grad_norm": 0.155877947807312, + "learning_rate": 2.0359576755336594e-05, + "loss": 1.7434, + "step": 23163 + }, + { + "epoch": 7.10988336402701, + "grad_norm": 0.17822639644145966, + "learning_rate": 2.0355573893820613e-05, + "loss": 1.7029, + "step": 23164 
+ }, + { + "epoch": 7.110190300798036, + "grad_norm": 0.18152910470962524, + "learning_rate": 2.0351571325268242e-05, + "loss": 1.7277, + "step": 23165 + }, + { + "epoch": 7.110497237569061, + "grad_norm": 0.19928498566150665, + "learning_rate": 2.034756904971902e-05, + "loss": 1.7852, + "step": 23166 + }, + { + "epoch": 7.110804174340086, + "grad_norm": 0.19099318981170654, + "learning_rate": 2.0343567067212504e-05, + "loss": 1.7258, + "step": 23167 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.19800841808319092, + "learning_rate": 2.033956537778824e-05, + "loss": 1.7647, + "step": 23168 + }, + { + "epoch": 7.111418047882136, + "grad_norm": 0.20110327005386353, + "learning_rate": 2.0335563981485768e-05, + "loss": 1.7111, + "step": 23169 + }, + { + "epoch": 7.111724984653161, + "grad_norm": 0.1875200718641281, + "learning_rate": 2.0331562878344645e-05, + "loss": 1.7145, + "step": 23170 + }, + { + "epoch": 7.112031921424187, + "grad_norm": 0.17586658895015717, + "learning_rate": 2.032756206840441e-05, + "loss": 1.663, + "step": 23171 + }, + { + "epoch": 7.112338858195212, + "grad_norm": 0.1783432811498642, + "learning_rate": 2.032356155170459e-05, + "loss": 1.7146, + "step": 23172 + }, + { + "epoch": 7.112645794966237, + "grad_norm": 0.16075368225574493, + "learning_rate": 2.0319561328284737e-05, + "loss": 1.6414, + "step": 23173 + }, + { + "epoch": 7.112952731737262, + "grad_norm": 0.22822627425193787, + "learning_rate": 2.0315561398184367e-05, + "loss": 1.7363, + "step": 23174 + }, + { + "epoch": 7.113259668508287, + "grad_norm": 0.1882331818342209, + "learning_rate": 2.0311561761443026e-05, + "loss": 1.7384, + "step": 23175 + }, + { + "epoch": 7.1135666052793125, + "grad_norm": 0.21478623151779175, + "learning_rate": 2.0307562418100228e-05, + "loss": 1.7314, + "step": 23176 + }, + { + "epoch": 7.113873542050338, + "grad_norm": 0.18545235693454742, + "learning_rate": 2.0303563368195483e-05, + "loss": 1.7046, + "step": 23177 + }, + { + "epoch": 
7.114180478821363, + "grad_norm": 0.1965286284685135, + "learning_rate": 2.0299564611768367e-05, + "loss": 1.7423, + "step": 23178 + }, + { + "epoch": 7.114487415592388, + "grad_norm": 0.1679733693599701, + "learning_rate": 2.0295566148858332e-05, + "loss": 1.6861, + "step": 23179 + }, + { + "epoch": 7.114794352363413, + "grad_norm": 0.18930186331272125, + "learning_rate": 2.029156797950495e-05, + "loss": 1.6609, + "step": 23180 + }, + { + "epoch": 7.115101289134438, + "grad_norm": 0.20774266123771667, + "learning_rate": 2.0287570103747672e-05, + "loss": 1.6919, + "step": 23181 + }, + { + "epoch": 7.115408225905464, + "grad_norm": 0.1866706907749176, + "learning_rate": 2.028357252162606e-05, + "loss": 1.7385, + "step": 23182 + }, + { + "epoch": 7.115715162676489, + "grad_norm": 0.21728016436100006, + "learning_rate": 2.0279575233179605e-05, + "loss": 1.7574, + "step": 23183 + }, + { + "epoch": 7.116022099447513, + "grad_norm": 0.16665934026241302, + "learning_rate": 2.02755782384478e-05, + "loss": 1.7046, + "step": 23184 + }, + { + "epoch": 7.116329036218539, + "grad_norm": 0.17275744676589966, + "learning_rate": 2.027158153747016e-05, + "loss": 1.6914, + "step": 23185 + }, + { + "epoch": 7.116635972989564, + "grad_norm": 0.15803802013397217, + "learning_rate": 2.026758513028617e-05, + "loss": 1.6932, + "step": 23186 + }, + { + "epoch": 7.116942909760589, + "grad_norm": 0.17434535920619965, + "learning_rate": 2.0263589016935336e-05, + "loss": 1.6714, + "step": 23187 + }, + { + "epoch": 7.117249846531615, + "grad_norm": 0.18005578219890594, + "learning_rate": 2.025959319745714e-05, + "loss": 1.6728, + "step": 23188 + }, + { + "epoch": 7.11755678330264, + "grad_norm": 0.19545695185661316, + "learning_rate": 2.025559767189108e-05, + "loss": 1.7475, + "step": 23189 + }, + { + "epoch": 7.1178637200736645, + "grad_norm": 0.19226810336112976, + "learning_rate": 2.025160244027663e-05, + "loss": 1.7447, + "step": 23190 + }, + { + "epoch": 7.11817065684469, + "grad_norm": 
0.1682211458683014, + "learning_rate": 2.0247607502653286e-05, + "loss": 1.687, + "step": 23191 + }, + { + "epoch": 7.118477593615715, + "grad_norm": 0.1883849948644638, + "learning_rate": 2.0243612859060524e-05, + "loss": 1.7556, + "step": 23192 + }, + { + "epoch": 7.1187845303867405, + "grad_norm": 0.16668641567230225, + "learning_rate": 2.0239618509537817e-05, + "loss": 1.6683, + "step": 23193 + }, + { + "epoch": 7.119091467157766, + "grad_norm": 0.21448664367198944, + "learning_rate": 2.023562445412463e-05, + "loss": 1.709, + "step": 23194 + }, + { + "epoch": 7.11939840392879, + "grad_norm": 0.24347564578056335, + "learning_rate": 2.0231630692860476e-05, + "loss": 1.7775, + "step": 23195 + }, + { + "epoch": 7.119705340699816, + "grad_norm": 0.20289309322834015, + "learning_rate": 2.0227637225784767e-05, + "loss": 1.8258, + "step": 23196 + }, + { + "epoch": 7.120012277470841, + "grad_norm": 0.20075447857379913, + "learning_rate": 2.022364405293703e-05, + "loss": 1.686, + "step": 23197 + }, + { + "epoch": 7.120319214241866, + "grad_norm": 0.17129302024841309, + "learning_rate": 2.021965117435666e-05, + "loss": 1.6937, + "step": 23198 + }, + { + "epoch": 7.120626151012892, + "grad_norm": 0.222218856215477, + "learning_rate": 2.0215658590083164e-05, + "loss": 1.6812, + "step": 23199 + }, + { + "epoch": 7.120933087783916, + "grad_norm": 0.1955309957265854, + "learning_rate": 2.0211666300155996e-05, + "loss": 1.7652, + "step": 23200 + }, + { + "epoch": 7.121240024554941, + "grad_norm": 0.20479047298431396, + "learning_rate": 2.0207674304614595e-05, + "loss": 1.7393, + "step": 23201 + }, + { + "epoch": 7.121546961325967, + "grad_norm": 0.14726878702640533, + "learning_rate": 2.020368260349842e-05, + "loss": 1.6766, + "step": 23202 + }, + { + "epoch": 7.121853898096992, + "grad_norm": 0.19149260222911835, + "learning_rate": 2.0199691196846914e-05, + "loss": 1.7176, + "step": 23203 + }, + { + "epoch": 7.122160834868017, + "grad_norm": 0.17182055115699768, + 
"learning_rate": 2.019570008469953e-05, + "loss": 1.6828, + "step": 23204 + }, + { + "epoch": 7.122467771639043, + "grad_norm": 0.16044408082962036, + "learning_rate": 2.019170926709571e-05, + "loss": 1.6595, + "step": 23205 + }, + { + "epoch": 7.122774708410067, + "grad_norm": 0.21787980198860168, + "learning_rate": 2.0187718744074885e-05, + "loss": 1.7114, + "step": 23206 + }, + { + "epoch": 7.1230816451810925, + "grad_norm": 0.16959737241268158, + "learning_rate": 2.01837285156765e-05, + "loss": 1.7128, + "step": 23207 + }, + { + "epoch": 7.123388581952118, + "grad_norm": 0.28120318055152893, + "learning_rate": 2.0179738581939983e-05, + "loss": 1.8386, + "step": 23208 + }, + { + "epoch": 7.123695518723143, + "grad_norm": 0.19752691686153412, + "learning_rate": 2.017574894290477e-05, + "loss": 1.7123, + "step": 23209 + }, + { + "epoch": 7.1240024554941686, + "grad_norm": 0.19860398769378662, + "learning_rate": 2.0171759598610286e-05, + "loss": 1.7041, + "step": 23210 + }, + { + "epoch": 7.124309392265193, + "grad_norm": 0.17429523169994354, + "learning_rate": 2.0167770549095937e-05, + "loss": 1.6963, + "step": 23211 + }, + { + "epoch": 7.124616329036218, + "grad_norm": 0.27635815739631653, + "learning_rate": 2.01637817944012e-05, + "loss": 1.8261, + "step": 23212 + }, + { + "epoch": 7.124923265807244, + "grad_norm": 0.17512556910514832, + "learning_rate": 2.0159793334565424e-05, + "loss": 1.7311, + "step": 23213 + }, + { + "epoch": 7.125230202578269, + "grad_norm": 0.1964988112449646, + "learning_rate": 2.01558051696281e-05, + "loss": 1.6829, + "step": 23214 + }, + { + "epoch": 7.125537139349294, + "grad_norm": 0.20796819031238556, + "learning_rate": 2.0151817299628563e-05, + "loss": 1.7084, + "step": 23215 + }, + { + "epoch": 7.12584407612032, + "grad_norm": 0.19875051081180573, + "learning_rate": 2.0147829724606278e-05, + "loss": 1.7197, + "step": 23216 + }, + { + "epoch": 7.126151012891344, + "grad_norm": 0.22590650618076324, + "learning_rate": 
2.0143842444600635e-05, + "loss": 1.7923, + "step": 23217 + }, + { + "epoch": 7.1264579496623695, + "grad_norm": 0.19106422364711761, + "learning_rate": 2.0139855459651042e-05, + "loss": 1.7096, + "step": 23218 + }, + { + "epoch": 7.126764886433395, + "grad_norm": 0.2105991542339325, + "learning_rate": 2.01358687697969e-05, + "loss": 1.6836, + "step": 23219 + }, + { + "epoch": 7.12707182320442, + "grad_norm": 0.18826960027217865, + "learning_rate": 2.013188237507761e-05, + "loss": 1.7347, + "step": 23220 + }, + { + "epoch": 7.1273787599754455, + "grad_norm": 0.1865578591823578, + "learning_rate": 2.012789627553256e-05, + "loss": 1.7115, + "step": 23221 + }, + { + "epoch": 7.12768569674647, + "grad_norm": 0.18389549851417542, + "learning_rate": 2.0123910471201145e-05, + "loss": 1.6817, + "step": 23222 + }, + { + "epoch": 7.127992633517495, + "grad_norm": 0.18351595103740692, + "learning_rate": 2.0119924962122766e-05, + "loss": 1.6898, + "step": 23223 + }, + { + "epoch": 7.128299570288521, + "grad_norm": 0.1913219839334488, + "learning_rate": 2.01159397483368e-05, + "loss": 1.7536, + "step": 23224 + }, + { + "epoch": 7.128606507059546, + "grad_norm": 0.17707225680351257, + "learning_rate": 2.0111954829882628e-05, + "loss": 1.6894, + "step": 23225 + }, + { + "epoch": 7.128913443830571, + "grad_norm": 0.17774651944637299, + "learning_rate": 2.0107970206799637e-05, + "loss": 1.6599, + "step": 23226 + }, + { + "epoch": 7.129220380601596, + "grad_norm": 0.14530350267887115, + "learning_rate": 2.0103985879127207e-05, + "loss": 1.6264, + "step": 23227 + }, + { + "epoch": 7.129527317372621, + "grad_norm": 0.15673531591892242, + "learning_rate": 2.010000184690471e-05, + "loss": 1.6577, + "step": 23228 + }, + { + "epoch": 7.129834254143646, + "grad_norm": 0.20691752433776855, + "learning_rate": 2.009601811017152e-05, + "loss": 1.7129, + "step": 23229 + }, + { + "epoch": 7.130141190914672, + "grad_norm": 0.16686022281646729, + "learning_rate": 2.0092034668966987e-05, + "loss": 
1.6738, + "step": 23230 + }, + { + "epoch": 7.130448127685697, + "grad_norm": 0.17799030244350433, + "learning_rate": 2.0088051523330536e-05, + "loss": 1.7312, + "step": 23231 + }, + { + "epoch": 7.1307550644567215, + "grad_norm": 0.16749511659145355, + "learning_rate": 2.0084068673301454e-05, + "loss": 1.6616, + "step": 23232 + }, + { + "epoch": 7.131062001227747, + "grad_norm": 0.18347670137882233, + "learning_rate": 2.0080086118919156e-05, + "loss": 1.6622, + "step": 23233 + }, + { + "epoch": 7.131368937998772, + "grad_norm": 0.19747060537338257, + "learning_rate": 2.007610386022299e-05, + "loss": 1.7341, + "step": 23234 + }, + { + "epoch": 7.1316758747697975, + "grad_norm": 0.21067634224891663, + "learning_rate": 2.0072121897252295e-05, + "loss": 1.7252, + "step": 23235 + }, + { + "epoch": 7.131982811540823, + "grad_norm": 0.2095600962638855, + "learning_rate": 2.006814023004644e-05, + "loss": 1.7769, + "step": 23236 + }, + { + "epoch": 7.132289748311848, + "grad_norm": 0.23090791702270508, + "learning_rate": 2.0064158858644765e-05, + "loss": 1.7734, + "step": 23237 + }, + { + "epoch": 7.132596685082873, + "grad_norm": 0.19060610234737396, + "learning_rate": 2.0060177783086614e-05, + "loss": 1.7209, + "step": 23238 + }, + { + "epoch": 7.132903621853898, + "grad_norm": 0.18050087988376617, + "learning_rate": 2.0056197003411342e-05, + "loss": 1.6882, + "step": 23239 + }, + { + "epoch": 7.133210558624923, + "grad_norm": 0.1504158228635788, + "learning_rate": 2.005221651965828e-05, + "loss": 1.687, + "step": 23240 + }, + { + "epoch": 7.133517495395949, + "grad_norm": 0.22980810701847076, + "learning_rate": 2.004823633186676e-05, + "loss": 1.7254, + "step": 23241 + }, + { + "epoch": 7.133824432166974, + "grad_norm": 0.20092199742794037, + "learning_rate": 2.004425644007613e-05, + "loss": 1.7234, + "step": 23242 + }, + { + "epoch": 7.134131368937998, + "grad_norm": 0.21002927422523499, + "learning_rate": 2.0040276844325718e-05, + "loss": 1.7272, + "step": 23243 + }, 
+ { + "epoch": 7.134438305709024, + "grad_norm": 0.18524625897407532, + "learning_rate": 2.003629754465484e-05, + "loss": 1.7189, + "step": 23244 + }, + { + "epoch": 7.134745242480049, + "grad_norm": 0.21095192432403564, + "learning_rate": 2.0032318541102845e-05, + "loss": 1.7177, + "step": 23245 + }, + { + "epoch": 7.135052179251074, + "grad_norm": 0.1700662076473236, + "learning_rate": 2.0028339833709037e-05, + "loss": 1.6925, + "step": 23246 + }, + { + "epoch": 7.1353591160221, + "grad_norm": 0.2123938947916031, + "learning_rate": 2.002436142251272e-05, + "loss": 1.7623, + "step": 23247 + }, + { + "epoch": 7.135666052793125, + "grad_norm": 0.194299578666687, + "learning_rate": 2.0020383307553275e-05, + "loss": 1.6898, + "step": 23248 + }, + { + "epoch": 7.1359729895641495, + "grad_norm": 0.18740688264369965, + "learning_rate": 2.001640548886993e-05, + "loss": 1.6519, + "step": 23249 + }, + { + "epoch": 7.136279926335175, + "grad_norm": 0.18891027569770813, + "learning_rate": 2.0012427966502085e-05, + "loss": 1.6895, + "step": 23250 + }, + { + "epoch": 7.1365868631062, + "grad_norm": 0.21313735842704773, + "learning_rate": 2.000845074048896e-05, + "loss": 1.6829, + "step": 23251 + }, + { + "epoch": 7.1368937998772255, + "grad_norm": 0.2438332885503769, + "learning_rate": 2.0004473810869923e-05, + "loss": 1.7723, + "step": 23252 + }, + { + "epoch": 7.137200736648251, + "grad_norm": 0.24475115537643433, + "learning_rate": 2.0000497177684257e-05, + "loss": 1.7192, + "step": 23253 + }, + { + "epoch": 7.137507673419275, + "grad_norm": 0.1936563402414322, + "learning_rate": 1.9996520840971267e-05, + "loss": 1.7462, + "step": 23254 + }, + { + "epoch": 7.137814610190301, + "grad_norm": 0.22365616261959076, + "learning_rate": 1.9992544800770236e-05, + "loss": 1.7405, + "step": 23255 + }, + { + "epoch": 7.138121546961326, + "grad_norm": 0.191316619515419, + "learning_rate": 1.9988569057120472e-05, + "loss": 1.6466, + "step": 23256 + }, + { + "epoch": 7.138428483732351, + 
"grad_norm": 0.24758055806159973, + "learning_rate": 1.9984593610061253e-05, + "loss": 1.7689, + "step": 23257 + }, + { + "epoch": 7.138735420503377, + "grad_norm": 0.2144414782524109, + "learning_rate": 1.9980618459631874e-05, + "loss": 1.7158, + "step": 23258 + }, + { + "epoch": 7.139042357274401, + "grad_norm": 0.24254034459590912, + "learning_rate": 1.9976643605871614e-05, + "loss": 1.7998, + "step": 23259 + }, + { + "epoch": 7.139349294045426, + "grad_norm": 0.21013480424880981, + "learning_rate": 1.9972669048819765e-05, + "loss": 1.7231, + "step": 23260 + }, + { + "epoch": 7.139656230816452, + "grad_norm": 0.2169421911239624, + "learning_rate": 1.9968694788515603e-05, + "loss": 1.7182, + "step": 23261 + }, + { + "epoch": 7.139963167587477, + "grad_norm": 0.19591476023197174, + "learning_rate": 1.9964720824998395e-05, + "loss": 1.7114, + "step": 23262 + }, + { + "epoch": 7.140270104358502, + "grad_norm": 0.1775221824645996, + "learning_rate": 1.9960747158307417e-05, + "loss": 1.6754, + "step": 23263 + }, + { + "epoch": 7.140577041129528, + "grad_norm": 0.19318300485610962, + "learning_rate": 1.995677378848193e-05, + "loss": 1.6794, + "step": 23264 + }, + { + "epoch": 7.140883977900552, + "grad_norm": 0.19659662246704102, + "learning_rate": 1.995280071556125e-05, + "loss": 1.703, + "step": 23265 + }, + { + "epoch": 7.1411909146715775, + "grad_norm": 0.22100697457790375, + "learning_rate": 1.994882793958457e-05, + "loss": 1.6821, + "step": 23266 + }, + { + "epoch": 7.141497851442603, + "grad_norm": 0.20475365221500397, + "learning_rate": 1.9944855460591217e-05, + "loss": 1.727, + "step": 23267 + }, + { + "epoch": 7.141804788213628, + "grad_norm": 0.2202025055885315, + "learning_rate": 1.9940883278620383e-05, + "loss": 1.7248, + "step": 23268 + }, + { + "epoch": 7.1421117249846535, + "grad_norm": 0.1800462007522583, + "learning_rate": 1.993691139371138e-05, + "loss": 1.7276, + "step": 23269 + }, + { + "epoch": 7.142418661755678, + "grad_norm": 0.2896895110607147, 
+ "learning_rate": 1.9932939805903433e-05, + "loss": 1.7275, + "step": 23270 + }, + { + "epoch": 7.142725598526703, + "grad_norm": 0.21308782696723938, + "learning_rate": 1.99289685152358e-05, + "loss": 1.6645, + "step": 23271 + }, + { + "epoch": 7.143032535297729, + "grad_norm": 0.20210005342960358, + "learning_rate": 1.992499752174773e-05, + "loss": 1.6899, + "step": 23272 + }, + { + "epoch": 7.143339472068754, + "grad_norm": 0.18419797718524933, + "learning_rate": 1.9921026825478455e-05, + "loss": 1.7088, + "step": 23273 + }, + { + "epoch": 7.143646408839779, + "grad_norm": 0.19155149161815643, + "learning_rate": 1.9917056426467227e-05, + "loss": 1.719, + "step": 23274 + }, + { + "epoch": 7.143953345610804, + "grad_norm": 0.17220313847064972, + "learning_rate": 1.9913086324753278e-05, + "loss": 1.7408, + "step": 23275 + }, + { + "epoch": 7.144260282381829, + "grad_norm": 0.18474969267845154, + "learning_rate": 1.990911652037585e-05, + "loss": 1.7189, + "step": 23276 + }, + { + "epoch": 7.144567219152854, + "grad_norm": 0.18529154360294342, + "learning_rate": 1.9905147013374165e-05, + "loss": 1.7075, + "step": 23277 + }, + { + "epoch": 7.14487415592388, + "grad_norm": 0.18569569289684296, + "learning_rate": 1.9901177803787452e-05, + "loss": 1.7116, + "step": 23278 + }, + { + "epoch": 7.145181092694905, + "grad_norm": 0.17149175703525543, + "learning_rate": 1.9897208891654946e-05, + "loss": 1.6873, + "step": 23279 + }, + { + "epoch": 7.14548802946593, + "grad_norm": 0.18012240529060364, + "learning_rate": 1.9893240277015868e-05, + "loss": 1.709, + "step": 23280 + }, + { + "epoch": 7.145794966236955, + "grad_norm": 0.18372172117233276, + "learning_rate": 1.9889271959909412e-05, + "loss": 1.7134, + "step": 23281 + }, + { + "epoch": 7.14610190300798, + "grad_norm": 0.20667128264904022, + "learning_rate": 1.9885303940374856e-05, + "loss": 1.7452, + "step": 23282 + }, + { + "epoch": 7.1464088397790055, + "grad_norm": 0.18145184218883514, + "learning_rate": 
1.9881336218451346e-05, + "loss": 1.7358, + "step": 23283 + }, + { + "epoch": 7.146715776550031, + "grad_norm": 0.179911807179451, + "learning_rate": 1.987736879417816e-05, + "loss": 1.6698, + "step": 23284 + }, + { + "epoch": 7.147022713321056, + "grad_norm": 0.18944865465164185, + "learning_rate": 1.9873401667594426e-05, + "loss": 1.7725, + "step": 23285 + }, + { + "epoch": 7.147329650092081, + "grad_norm": 0.1926117241382599, + "learning_rate": 1.986943483873942e-05, + "loss": 1.7829, + "step": 23286 + }, + { + "epoch": 7.147636586863106, + "grad_norm": 0.330503910779953, + "learning_rate": 1.9865468307652318e-05, + "loss": 1.7408, + "step": 23287 + }, + { + "epoch": 7.147943523634131, + "grad_norm": 0.22677597403526306, + "learning_rate": 1.9861502074372324e-05, + "loss": 1.7013, + "step": 23288 + }, + { + "epoch": 7.148250460405157, + "grad_norm": 0.1859201192855835, + "learning_rate": 1.9857536138938627e-05, + "loss": 1.7215, + "step": 23289 + }, + { + "epoch": 7.148557397176182, + "grad_norm": 0.22151269018650055, + "learning_rate": 1.9853570501390427e-05, + "loss": 1.6781, + "step": 23290 + }, + { + "epoch": 7.148864333947207, + "grad_norm": 0.16455405950546265, + "learning_rate": 1.984960516176691e-05, + "loss": 1.6518, + "step": 23291 + }, + { + "epoch": 7.149171270718232, + "grad_norm": 0.19687162339687347, + "learning_rate": 1.9845640120107267e-05, + "loss": 1.7375, + "step": 23292 + }, + { + "epoch": 7.149478207489257, + "grad_norm": 0.19174890220165253, + "learning_rate": 1.9841675376450686e-05, + "loss": 1.7017, + "step": 23293 + }, + { + "epoch": 7.149785144260282, + "grad_norm": 0.18458877503871918, + "learning_rate": 1.983771093083634e-05, + "loss": 1.7256, + "step": 23294 + }, + { + "epoch": 7.150092081031308, + "grad_norm": 0.212035670876503, + "learning_rate": 1.983374678330342e-05, + "loss": 1.698, + "step": 23295 + }, + { + "epoch": 7.150399017802333, + "grad_norm": 0.1793123185634613, + "learning_rate": 1.982978293389109e-05, + "loss": 
1.7012, + "step": 23296 + }, + { + "epoch": 7.150705954573358, + "grad_norm": 0.2359405905008316, + "learning_rate": 1.9825819382638526e-05, + "loss": 1.7423, + "step": 23297 + }, + { + "epoch": 7.151012891344383, + "grad_norm": 0.17125526070594788, + "learning_rate": 1.9821856129584888e-05, + "loss": 1.6825, + "step": 23298 + }, + { + "epoch": 7.151319828115408, + "grad_norm": 0.2084828019142151, + "learning_rate": 1.9817893174769392e-05, + "loss": 1.6991, + "step": 23299 + }, + { + "epoch": 7.151626764886434, + "grad_norm": 0.27647483348846436, + "learning_rate": 1.9813930518231127e-05, + "loss": 1.7425, + "step": 23300 + }, + { + "epoch": 7.151933701657459, + "grad_norm": 0.23517926037311554, + "learning_rate": 1.980996816000933e-05, + "loss": 1.8411, + "step": 23301 + }, + { + "epoch": 7.152240638428483, + "grad_norm": 0.19960010051727295, + "learning_rate": 1.980600610014309e-05, + "loss": 1.7302, + "step": 23302 + }, + { + "epoch": 7.152547575199509, + "grad_norm": 0.18953165411949158, + "learning_rate": 1.9802044338671604e-05, + "loss": 1.7252, + "step": 23303 + }, + { + "epoch": 7.152854511970534, + "grad_norm": 0.1718905121088028, + "learning_rate": 1.979808287563402e-05, + "loss": 1.656, + "step": 23304 + }, + { + "epoch": 7.153161448741559, + "grad_norm": 0.17233465611934662, + "learning_rate": 1.9794121711069487e-05, + "loss": 1.6732, + "step": 23305 + }, + { + "epoch": 7.153468385512585, + "grad_norm": 0.17677003145217896, + "learning_rate": 1.979016084501714e-05, + "loss": 1.7266, + "step": 23306 + }, + { + "epoch": 7.153775322283609, + "grad_norm": 0.1815326064825058, + "learning_rate": 1.9786200277516136e-05, + "loss": 1.7029, + "step": 23307 + }, + { + "epoch": 7.1540822590546345, + "grad_norm": 0.20937341451644897, + "learning_rate": 1.978224000860561e-05, + "loss": 1.711, + "step": 23308 + }, + { + "epoch": 7.15438919582566, + "grad_norm": 0.2045155018568039, + "learning_rate": 1.97782800383247e-05, + "loss": 1.7557, + "step": 23309 + }, + { + 
"epoch": 7.154696132596685, + "grad_norm": 0.16426041722297668, + "learning_rate": 1.9774320366712533e-05, + "loss": 1.7373, + "step": 23310 + }, + { + "epoch": 7.1550030693677105, + "grad_norm": 0.18058224022388458, + "learning_rate": 1.977036099380825e-05, + "loss": 1.6957, + "step": 23311 + }, + { + "epoch": 7.155310006138736, + "grad_norm": 0.23552078008651733, + "learning_rate": 1.9766401919650983e-05, + "loss": 1.8032, + "step": 23312 + }, + { + "epoch": 7.15561694290976, + "grad_norm": 0.19097596406936646, + "learning_rate": 1.9762443144279852e-05, + "loss": 1.7447, + "step": 23313 + }, + { + "epoch": 7.155923879680786, + "grad_norm": 0.17892403900623322, + "learning_rate": 1.975848466773398e-05, + "loss": 1.7117, + "step": 23314 + }, + { + "epoch": 7.156230816451811, + "grad_norm": 0.18331217765808105, + "learning_rate": 1.9754526490052467e-05, + "loss": 1.6669, + "step": 23315 + }, + { + "epoch": 7.156537753222836, + "grad_norm": 0.19914311170578003, + "learning_rate": 1.975056861127449e-05, + "loss": 1.6731, + "step": 23316 + }, + { + "epoch": 7.156844689993862, + "grad_norm": 0.21710485219955444, + "learning_rate": 1.9746611031439083e-05, + "loss": 1.7214, + "step": 23317 + }, + { + "epoch": 7.157151626764886, + "grad_norm": 0.19703111052513123, + "learning_rate": 1.9742653750585437e-05, + "loss": 1.7185, + "step": 23318 + }, + { + "epoch": 7.157458563535911, + "grad_norm": 0.18581365048885345, + "learning_rate": 1.9738696768752585e-05, + "loss": 1.7113, + "step": 23319 + }, + { + "epoch": 7.157765500306937, + "grad_norm": 0.1703677624464035, + "learning_rate": 1.9734740085979687e-05, + "loss": 1.6755, + "step": 23320 + }, + { + "epoch": 7.158072437077962, + "grad_norm": 0.16760937869548798, + "learning_rate": 1.9730783702305826e-05, + "loss": 1.7082, + "step": 23321 + }, + { + "epoch": 7.158379373848987, + "grad_norm": 0.20183983445167542, + "learning_rate": 1.97268276177701e-05, + "loss": 1.7503, + "step": 23322 + }, + { + "epoch": 7.158686310620013, + 
"grad_norm": 0.18407952785491943, + "learning_rate": 1.972287183241163e-05, + "loss": 1.6807, + "step": 23323 + }, + { + "epoch": 7.158993247391037, + "grad_norm": 0.20135276019573212, + "learning_rate": 1.9718916346269446e-05, + "loss": 1.8001, + "step": 23324 + }, + { + "epoch": 7.1593001841620625, + "grad_norm": 0.1781267672777176, + "learning_rate": 1.9714961159382693e-05, + "loss": 1.683, + "step": 23325 + }, + { + "epoch": 7.159607120933088, + "grad_norm": 0.24990373849868774, + "learning_rate": 1.971100627179045e-05, + "loss": 1.7235, + "step": 23326 + }, + { + "epoch": 7.159914057704113, + "grad_norm": 0.19463174045085907, + "learning_rate": 1.9707051683531796e-05, + "loss": 1.735, + "step": 23327 + }, + { + "epoch": 7.1602209944751385, + "grad_norm": 0.1988895982503891, + "learning_rate": 1.9703097394645813e-05, + "loss": 1.7495, + "step": 23328 + }, + { + "epoch": 7.160527931246163, + "grad_norm": 0.1760931760072708, + "learning_rate": 1.9699143405171576e-05, + "loss": 1.6914, + "step": 23329 + }, + { + "epoch": 7.160834868017188, + "grad_norm": 0.18537557125091553, + "learning_rate": 1.9695189715148166e-05, + "loss": 1.7601, + "step": 23330 + }, + { + "epoch": 7.161141804788214, + "grad_norm": 0.2476375252008438, + "learning_rate": 1.9691236324614654e-05, + "loss": 1.8218, + "step": 23331 + }, + { + "epoch": 7.161448741559239, + "grad_norm": 0.17736093699932098, + "learning_rate": 1.968728323361009e-05, + "loss": 1.6872, + "step": 23332 + }, + { + "epoch": 7.161755678330264, + "grad_norm": 0.1851162612438202, + "learning_rate": 1.9683330442173598e-05, + "loss": 1.712, + "step": 23333 + }, + { + "epoch": 7.162062615101289, + "grad_norm": 0.20326650142669678, + "learning_rate": 1.967937795034417e-05, + "loss": 1.7668, + "step": 23334 + }, + { + "epoch": 7.162369551872314, + "grad_norm": 0.21020451188087463, + "learning_rate": 1.9675425758160925e-05, + "loss": 1.7135, + "step": 23335 + }, + { + "epoch": 7.162676488643339, + "grad_norm": 0.21629111468791962, 
+ "learning_rate": 1.967147386566287e-05, + "loss": 1.7181, + "step": 23336 + }, + { + "epoch": 7.162983425414365, + "grad_norm": 0.18086732923984528, + "learning_rate": 1.9667522272889104e-05, + "loss": 1.7107, + "step": 23337 + }, + { + "epoch": 7.16329036218539, + "grad_norm": 0.16542381048202515, + "learning_rate": 1.9663570979878658e-05, + "loss": 1.7156, + "step": 23338 + }, + { + "epoch": 7.163597298956415, + "grad_norm": 0.18775032460689545, + "learning_rate": 1.9659619986670587e-05, + "loss": 1.6955, + "step": 23339 + }, + { + "epoch": 7.16390423572744, + "grad_norm": 0.19227592647075653, + "learning_rate": 1.9655669293303953e-05, + "loss": 1.7545, + "step": 23340 + }, + { + "epoch": 7.164211172498465, + "grad_norm": 0.1935085654258728, + "learning_rate": 1.9651718899817746e-05, + "loss": 1.7183, + "step": 23341 + }, + { + "epoch": 7.1645181092694905, + "grad_norm": 0.17873792350292206, + "learning_rate": 1.9647768806251056e-05, + "loss": 1.6644, + "step": 23342 + }, + { + "epoch": 7.164825046040516, + "grad_norm": 0.25024256110191345, + "learning_rate": 1.96438190126429e-05, + "loss": 1.7621, + "step": 23343 + }, + { + "epoch": 7.165131982811541, + "grad_norm": 0.15957331657409668, + "learning_rate": 1.9639869519032323e-05, + "loss": 1.6525, + "step": 23344 + }, + { + "epoch": 7.165438919582566, + "grad_norm": 0.19967027008533478, + "learning_rate": 1.9635920325458347e-05, + "loss": 1.7533, + "step": 23345 + }, + { + "epoch": 7.165745856353591, + "grad_norm": 0.17413713037967682, + "learning_rate": 1.9631971431960005e-05, + "loss": 1.6962, + "step": 23346 + }, + { + "epoch": 7.166052793124616, + "grad_norm": 0.19787384569644928, + "learning_rate": 1.9628022838576315e-05, + "loss": 1.7369, + "step": 23347 + }, + { + "epoch": 7.166359729895642, + "grad_norm": 0.1726577877998352, + "learning_rate": 1.962407454534631e-05, + "loss": 1.7004, + "step": 23348 + }, + { + "epoch": 7.166666666666667, + "grad_norm": 0.2136315256357193, + "learning_rate": 
1.962012655230899e-05, + "loss": 1.7411, + "step": 23349 + }, + { + "epoch": 7.166973603437691, + "grad_norm": 0.18257126212120056, + "learning_rate": 1.9616178859503414e-05, + "loss": 1.7155, + "step": 23350 + }, + { + "epoch": 7.167280540208717, + "grad_norm": 0.18696577847003937, + "learning_rate": 1.961223146696854e-05, + "loss": 1.7272, + "step": 23351 + }, + { + "epoch": 7.167587476979742, + "grad_norm": 0.16375793516635895, + "learning_rate": 1.9608284374743435e-05, + "loss": 1.6706, + "step": 23352 + }, + { + "epoch": 7.167894413750767, + "grad_norm": 0.19589200615882874, + "learning_rate": 1.960433758286704e-05, + "loss": 1.7018, + "step": 23353 + }, + { + "epoch": 7.168201350521793, + "grad_norm": 0.18434208631515503, + "learning_rate": 1.9600391091378417e-05, + "loss": 1.6776, + "step": 23354 + }, + { + "epoch": 7.168508287292818, + "grad_norm": 0.23839476704597473, + "learning_rate": 1.9596444900316545e-05, + "loss": 1.7501, + "step": 23355 + }, + { + "epoch": 7.1688152240638425, + "grad_norm": 0.20229686796665192, + "learning_rate": 1.9592499009720428e-05, + "loss": 1.7249, + "step": 23356 + }, + { + "epoch": 7.169122160834868, + "grad_norm": 0.2422642856836319, + "learning_rate": 1.9588553419629076e-05, + "loss": 1.7621, + "step": 23357 + }, + { + "epoch": 7.169429097605893, + "grad_norm": 0.21856555342674255, + "learning_rate": 1.9584608130081422e-05, + "loss": 1.7362, + "step": 23358 + }, + { + "epoch": 7.1697360343769185, + "grad_norm": 0.19434040784835815, + "learning_rate": 1.958066314111652e-05, + "loss": 1.6888, + "step": 23359 + }, + { + "epoch": 7.170042971147944, + "grad_norm": 0.19806630909442902, + "learning_rate": 1.9576718452773335e-05, + "loss": 1.7461, + "step": 23360 + }, + { + "epoch": 7.170349907918968, + "grad_norm": 0.19190531969070435, + "learning_rate": 1.957277406509085e-05, + "loss": 1.6992, + "step": 23361 + }, + { + "epoch": 7.170656844689994, + "grad_norm": 0.20990152657032013, + "learning_rate": 1.9568829978108044e-05, + 
"loss": 1.7095, + "step": 23362 + }, + { + "epoch": 7.170963781461019, + "grad_norm": 0.18638263642787933, + "learning_rate": 1.9564886191863897e-05, + "loss": 1.7024, + "step": 23363 + }, + { + "epoch": 7.171270718232044, + "grad_norm": 0.1974666863679886, + "learning_rate": 1.9560942706397383e-05, + "loss": 1.6901, + "step": 23364 + }, + { + "epoch": 7.17157765500307, + "grad_norm": 0.171469047665596, + "learning_rate": 1.955699952174747e-05, + "loss": 1.717, + "step": 23365 + }, + { + "epoch": 7.171884591774095, + "grad_norm": 0.17386725544929504, + "learning_rate": 1.955305663795312e-05, + "loss": 1.7069, + "step": 23366 + }, + { + "epoch": 7.172191528545119, + "grad_norm": 0.1869814246892929, + "learning_rate": 1.954911405505334e-05, + "loss": 1.7478, + "step": 23367 + }, + { + "epoch": 7.172498465316145, + "grad_norm": 0.19253556430339813, + "learning_rate": 1.9545171773087033e-05, + "loss": 1.7129, + "step": 23368 + }, + { + "epoch": 7.17280540208717, + "grad_norm": 0.1625998616218567, + "learning_rate": 1.954122979209322e-05, + "loss": 1.7055, + "step": 23369 + }, + { + "epoch": 7.173112338858195, + "grad_norm": 0.172325998544693, + "learning_rate": 1.953728811211079e-05, + "loss": 1.71, + "step": 23370 + }, + { + "epoch": 7.173419275629221, + "grad_norm": 0.22542965412139893, + "learning_rate": 1.9533346733178753e-05, + "loss": 1.7548, + "step": 23371 + }, + { + "epoch": 7.173726212400245, + "grad_norm": 0.1547299474477768, + "learning_rate": 1.9529405655336042e-05, + "loss": 1.6509, + "step": 23372 + }, + { + "epoch": 7.1740331491712706, + "grad_norm": 0.21720515191555023, + "learning_rate": 1.95254648786216e-05, + "loss": 1.7427, + "step": 23373 + }, + { + "epoch": 7.174340085942296, + "grad_norm": 0.18855944275856018, + "learning_rate": 1.95215244030744e-05, + "loss": 1.7471, + "step": 23374 + }, + { + "epoch": 7.174647022713321, + "grad_norm": 0.21088628470897675, + "learning_rate": 1.951758422873332e-05, + "loss": 1.7457, + "step": 23375 + }, + { + 
"epoch": 7.1749539594843466, + "grad_norm": 0.20596840977668762, + "learning_rate": 1.951364435563736e-05, + "loss": 1.7098, + "step": 23376 + }, + { + "epoch": 7.175260896255371, + "grad_norm": 0.20098064839839935, + "learning_rate": 1.9509704783825433e-05, + "loss": 1.7225, + "step": 23377 + }, + { + "epoch": 7.175567833026396, + "grad_norm": 0.20860125124454498, + "learning_rate": 1.950576551333647e-05, + "loss": 1.7071, + "step": 23378 + }, + { + "epoch": 7.175874769797422, + "grad_norm": 0.1914912760257721, + "learning_rate": 1.950182654420941e-05, + "loss": 1.7262, + "step": 23379 + }, + { + "epoch": 7.176181706568447, + "grad_norm": 0.21109424531459808, + "learning_rate": 1.9497887876483178e-05, + "loss": 1.6601, + "step": 23380 + }, + { + "epoch": 7.176488643339472, + "grad_norm": 0.20514877140522003, + "learning_rate": 1.949394951019669e-05, + "loss": 1.7612, + "step": 23381 + }, + { + "epoch": 7.176795580110497, + "grad_norm": 0.20280246436595917, + "learning_rate": 1.949001144538888e-05, + "loss": 1.6754, + "step": 23382 + }, + { + "epoch": 7.177102516881522, + "grad_norm": 0.1724841594696045, + "learning_rate": 1.9486073682098654e-05, + "loss": 1.7252, + "step": 23383 + }, + { + "epoch": 7.1774094536525475, + "grad_norm": 0.16961625218391418, + "learning_rate": 1.948213622036493e-05, + "loss": 1.6835, + "step": 23384 + }, + { + "epoch": 7.177716390423573, + "grad_norm": 0.17938925325870514, + "learning_rate": 1.947819906022661e-05, + "loss": 1.6909, + "step": 23385 + }, + { + "epoch": 7.178023327194598, + "grad_norm": 0.19711901247501373, + "learning_rate": 1.9474262201722655e-05, + "loss": 1.7275, + "step": 23386 + }, + { + "epoch": 7.1783302639656235, + "grad_norm": 0.19549165666103363, + "learning_rate": 1.947032564489189e-05, + "loss": 1.7609, + "step": 23387 + }, + { + "epoch": 7.178637200736648, + "grad_norm": 0.20358525216579437, + "learning_rate": 1.9466389389773284e-05, + "loss": 1.7127, + "step": 23388 + }, + { + "epoch": 7.178944137507673, + 
"grad_norm": 0.18345355987548828, + "learning_rate": 1.946245343640571e-05, + "loss": 1.6807, + "step": 23389 + }, + { + "epoch": 7.179251074278699, + "grad_norm": 0.20261847972869873, + "learning_rate": 1.9458517784828074e-05, + "loss": 1.717, + "step": 23390 + }, + { + "epoch": 7.179558011049724, + "grad_norm": 0.18042106926441193, + "learning_rate": 1.9454582435079275e-05, + "loss": 1.7415, + "step": 23391 + }, + { + "epoch": 7.179864947820749, + "grad_norm": 0.1731836199760437, + "learning_rate": 1.945064738719817e-05, + "loss": 1.6661, + "step": 23392 + }, + { + "epoch": 7.180171884591774, + "grad_norm": 0.1971052885055542, + "learning_rate": 1.9446712641223685e-05, + "loss": 1.753, + "step": 23393 + }, + { + "epoch": 7.180478821362799, + "grad_norm": 0.22370313107967377, + "learning_rate": 1.94427781971947e-05, + "loss": 1.7118, + "step": 23394 + }, + { + "epoch": 7.180785758133824, + "grad_norm": 0.23129026591777802, + "learning_rate": 1.9438844055150086e-05, + "loss": 1.8087, + "step": 23395 + }, + { + "epoch": 7.18109269490485, + "grad_norm": 0.26353758573532104, + "learning_rate": 1.9434910215128727e-05, + "loss": 1.7147, + "step": 23396 + }, + { + "epoch": 7.181399631675875, + "grad_norm": 0.22333624958992004, + "learning_rate": 1.9430976677169504e-05, + "loss": 1.7403, + "step": 23397 + }, + { + "epoch": 7.1817065684469, + "grad_norm": 0.22191296517848969, + "learning_rate": 1.9427043441311284e-05, + "loss": 1.7125, + "step": 23398 + }, + { + "epoch": 7.182013505217925, + "grad_norm": 0.19174177944660187, + "learning_rate": 1.942311050759294e-05, + "loss": 1.7026, + "step": 23399 + }, + { + "epoch": 7.18232044198895, + "grad_norm": 0.2175525426864624, + "learning_rate": 1.9419177876053342e-05, + "loss": 1.6947, + "step": 23400 + }, + { + "epoch": 7.1826273787599755, + "grad_norm": 0.19419047236442566, + "learning_rate": 1.9415245546731348e-05, + "loss": 1.7309, + "step": 23401 + }, + { + "epoch": 7.182934315531001, + "grad_norm": 0.22568467259407043, + 
"learning_rate": 1.9411313519665806e-05, + "loss": 1.7177, + "step": 23402 + }, + { + "epoch": 7.183241252302026, + "grad_norm": 0.26983609795570374, + "learning_rate": 1.9407381794895635e-05, + "loss": 1.6779, + "step": 23403 + }, + { + "epoch": 7.183548189073051, + "grad_norm": 0.1651962548494339, + "learning_rate": 1.9403450372459602e-05, + "loss": 1.6718, + "step": 23404 + }, + { + "epoch": 7.183855125844076, + "grad_norm": 0.2337920367717743, + "learning_rate": 1.9399519252396653e-05, + "loss": 1.7271, + "step": 23405 + }, + { + "epoch": 7.184162062615101, + "grad_norm": 0.20093166828155518, + "learning_rate": 1.9395588434745547e-05, + "loss": 1.7274, + "step": 23406 + }, + { + "epoch": 7.184468999386127, + "grad_norm": 0.22497716546058655, + "learning_rate": 1.9391657919545193e-05, + "loss": 1.7419, + "step": 23407 + }, + { + "epoch": 7.184775936157152, + "grad_norm": 0.22474822402000427, + "learning_rate": 1.938772770683443e-05, + "loss": 1.8317, + "step": 23408 + }, + { + "epoch": 7.185082872928176, + "grad_norm": 0.18015392124652863, + "learning_rate": 1.9383797796652052e-05, + "loss": 1.6568, + "step": 23409 + }, + { + "epoch": 7.185389809699202, + "grad_norm": 0.18696026504039764, + "learning_rate": 1.9379868189036947e-05, + "loss": 1.6722, + "step": 23410 + }, + { + "epoch": 7.185696746470227, + "grad_norm": 0.1828698217868805, + "learning_rate": 1.9375938884027934e-05, + "loss": 1.7477, + "step": 23411 + }, + { + "epoch": 7.186003683241252, + "grad_norm": 0.20442047715187073, + "learning_rate": 1.937200988166384e-05, + "loss": 1.7269, + "step": 23412 + }, + { + "epoch": 7.186310620012278, + "grad_norm": 0.17201031744480133, + "learning_rate": 1.9368081181983494e-05, + "loss": 1.6893, + "step": 23413 + }, + { + "epoch": 7.186617556783303, + "grad_norm": 0.21501687169075012, + "learning_rate": 1.9364152785025723e-05, + "loss": 1.771, + "step": 23414 + }, + { + "epoch": 7.1869244935543275, + "grad_norm": 0.18059030175209045, + "learning_rate": 
1.936022469082936e-05, + "loss": 1.7088, + "step": 23415 + }, + { + "epoch": 7.187231430325353, + "grad_norm": 0.18079128861427307, + "learning_rate": 1.9356296899433206e-05, + "loss": 1.764, + "step": 23416 + }, + { + "epoch": 7.187538367096378, + "grad_norm": 0.1960453987121582, + "learning_rate": 1.9352369410876086e-05, + "loss": 1.7302, + "step": 23417 + }, + { + "epoch": 7.1878453038674035, + "grad_norm": 0.19896337389945984, + "learning_rate": 1.9348442225196815e-05, + "loss": 1.7228, + "step": 23418 + }, + { + "epoch": 7.188152240638429, + "grad_norm": 0.19272227585315704, + "learning_rate": 1.9344515342434192e-05, + "loss": 1.7164, + "step": 23419 + }, + { + "epoch": 7.188459177409453, + "grad_norm": 0.16746973991394043, + "learning_rate": 1.9340588762627066e-05, + "loss": 1.696, + "step": 23420 + }, + { + "epoch": 7.188766114180479, + "grad_norm": 0.2421095222234726, + "learning_rate": 1.9336662485814178e-05, + "loss": 1.766, + "step": 23421 + }, + { + "epoch": 7.189073050951504, + "grad_norm": 0.17857256531715393, + "learning_rate": 1.93327365120344e-05, + "loss": 1.7216, + "step": 23422 + }, + { + "epoch": 7.189379987722529, + "grad_norm": 0.19336672127246857, + "learning_rate": 1.932881084132646e-05, + "loss": 1.7124, + "step": 23423 + }, + { + "epoch": 7.189686924493555, + "grad_norm": 0.1555519700050354, + "learning_rate": 1.9324885473729204e-05, + "loss": 1.6491, + "step": 23424 + }, + { + "epoch": 7.189993861264579, + "grad_norm": 0.17879530787467957, + "learning_rate": 1.9320960409281425e-05, + "loss": 1.697, + "step": 23425 + }, + { + "epoch": 7.190300798035604, + "grad_norm": 0.17966939508914948, + "learning_rate": 1.9317035648021862e-05, + "loss": 1.6786, + "step": 23426 + }, + { + "epoch": 7.19060773480663, + "grad_norm": 0.21742603182792664, + "learning_rate": 1.9313111189989375e-05, + "loss": 1.734, + "step": 23427 + }, + { + "epoch": 7.190914671577655, + "grad_norm": 0.22135521471500397, + "learning_rate": 1.9309187035222675e-05, + "loss": 
1.7154, + "step": 23428 + }, + { + "epoch": 7.19122160834868, + "grad_norm": 0.17866137623786926, + "learning_rate": 1.930526318376059e-05, + "loss": 1.6723, + "step": 23429 + }, + { + "epoch": 7.191528545119706, + "grad_norm": 0.26034823060035706, + "learning_rate": 1.9301339635641887e-05, + "loss": 1.6975, + "step": 23430 + }, + { + "epoch": 7.19183548189073, + "grad_norm": 0.21550825238227844, + "learning_rate": 1.929741639090534e-05, + "loss": 1.7401, + "step": 23431 + }, + { + "epoch": 7.1921424186617555, + "grad_norm": 0.19205132126808167, + "learning_rate": 1.9293493449589718e-05, + "loss": 1.6543, + "step": 23432 + }, + { + "epoch": 7.192449355432781, + "grad_norm": 0.18724635243415833, + "learning_rate": 1.928957081173379e-05, + "loss": 1.7752, + "step": 23433 + }, + { + "epoch": 7.192756292203806, + "grad_norm": 0.2392650544643402, + "learning_rate": 1.928564847737633e-05, + "loss": 1.7008, + "step": 23434 + }, + { + "epoch": 7.1930632289748315, + "grad_norm": 0.18950903415679932, + "learning_rate": 1.9281726446556088e-05, + "loss": 1.7193, + "step": 23435 + }, + { + "epoch": 7.193370165745856, + "grad_norm": 0.2542276978492737, + "learning_rate": 1.9277804719311808e-05, + "loss": 1.7192, + "step": 23436 + }, + { + "epoch": 7.193677102516881, + "grad_norm": 0.1987142711877823, + "learning_rate": 1.927388329568231e-05, + "loss": 1.6943, + "step": 23437 + }, + { + "epoch": 7.193984039287907, + "grad_norm": 0.18837273120880127, + "learning_rate": 1.9269962175706275e-05, + "loss": 1.7443, + "step": 23438 + }, + { + "epoch": 7.194290976058932, + "grad_norm": 0.20432044565677643, + "learning_rate": 1.9266041359422514e-05, + "loss": 1.741, + "step": 23439 + }, + { + "epoch": 7.194597912829957, + "grad_norm": 0.17763052880764008, + "learning_rate": 1.9262120846869715e-05, + "loss": 1.6696, + "step": 23440 + }, + { + "epoch": 7.194904849600983, + "grad_norm": 0.1747766137123108, + "learning_rate": 1.9258200638086665e-05, + "loss": 1.6727, + "step": 23441 + }, + { 
+ "epoch": 7.195211786372007, + "grad_norm": 0.22058527171611786, + "learning_rate": 1.9254280733112117e-05, + "loss": 1.7387, + "step": 23442 + }, + { + "epoch": 7.195518723143032, + "grad_norm": 0.2247757911682129, + "learning_rate": 1.925036113198475e-05, + "loss": 1.7828, + "step": 23443 + }, + { + "epoch": 7.195825659914058, + "grad_norm": 0.16923101246356964, + "learning_rate": 1.924644183474337e-05, + "loss": 1.6655, + "step": 23444 + }, + { + "epoch": 7.196132596685083, + "grad_norm": 0.1599757820367813, + "learning_rate": 1.924252284142665e-05, + "loss": 1.7002, + "step": 23445 + }, + { + "epoch": 7.196439533456108, + "grad_norm": 0.1916438341140747, + "learning_rate": 1.9238604152073358e-05, + "loss": 1.71, + "step": 23446 + }, + { + "epoch": 7.196746470227133, + "grad_norm": 0.18037991225719452, + "learning_rate": 1.9234685766722216e-05, + "loss": 1.6786, + "step": 23447 + }, + { + "epoch": 7.197053406998158, + "grad_norm": 0.20671263337135315, + "learning_rate": 1.9230767685411938e-05, + "loss": 1.7228, + "step": 23448 + }, + { + "epoch": 7.1973603437691835, + "grad_norm": 0.18949514627456665, + "learning_rate": 1.9226849908181243e-05, + "loss": 1.7794, + "step": 23449 + }, + { + "epoch": 7.197667280540209, + "grad_norm": 0.19457660615444183, + "learning_rate": 1.9222932435068857e-05, + "loss": 1.7153, + "step": 23450 + }, + { + "epoch": 7.197974217311234, + "grad_norm": 0.16834792494773865, + "learning_rate": 1.9219015266113494e-05, + "loss": 1.646, + "step": 23451 + }, + { + "epoch": 7.198281154082259, + "grad_norm": 0.21668508648872375, + "learning_rate": 1.9215098401353866e-05, + "loss": 1.7232, + "step": 23452 + }, + { + "epoch": 7.198588090853284, + "grad_norm": 0.1675579994916916, + "learning_rate": 1.9211181840828656e-05, + "loss": 1.6963, + "step": 23453 + }, + { + "epoch": 7.198895027624309, + "grad_norm": 0.19915352761745453, + "learning_rate": 1.9207265584576627e-05, + "loss": 1.7043, + "step": 23454 + }, + { + "epoch": 7.199201964395335, + 
"grad_norm": 0.23872216045856476, + "learning_rate": 1.920334963263642e-05, + "loss": 1.7784, + "step": 23455 + }, + { + "epoch": 7.19950890116636, + "grad_norm": 0.261321485042572, + "learning_rate": 1.919943398504679e-05, + "loss": 1.8024, + "step": 23456 + }, + { + "epoch": 7.199815837937384, + "grad_norm": 0.17026741802692413, + "learning_rate": 1.9195518641846377e-05, + "loss": 1.7451, + "step": 23457 + }, + { + "epoch": 7.20012277470841, + "grad_norm": 0.20935678482055664, + "learning_rate": 1.9191603603073915e-05, + "loss": 1.752, + "step": 23458 + }, + { + "epoch": 7.200429711479435, + "grad_norm": 0.1756788194179535, + "learning_rate": 1.9187688868768107e-05, + "loss": 1.7008, + "step": 23459 + }, + { + "epoch": 7.2007366482504604, + "grad_norm": 0.23286345601081848, + "learning_rate": 1.9183774438967577e-05, + "loss": 1.7603, + "step": 23460 + }, + { + "epoch": 7.201043585021486, + "grad_norm": 0.17519986629486084, + "learning_rate": 1.917986031371109e-05, + "loss": 1.7127, + "step": 23461 + }, + { + "epoch": 7.201350521792511, + "grad_norm": 0.2603212893009186, + "learning_rate": 1.917594649303725e-05, + "loss": 1.7169, + "step": 23462 + }, + { + "epoch": 7.201657458563536, + "grad_norm": 0.2664981484413147, + "learning_rate": 1.9172032976984792e-05, + "loss": 1.7349, + "step": 23463 + }, + { + "epoch": 7.201964395334561, + "grad_norm": 0.15484265983104706, + "learning_rate": 1.9168119765592375e-05, + "loss": 1.6753, + "step": 23464 + }, + { + "epoch": 7.202271332105586, + "grad_norm": 0.22310250997543335, + "learning_rate": 1.9164206858898664e-05, + "loss": 1.6994, + "step": 23465 + }, + { + "epoch": 7.202578268876612, + "grad_norm": 0.1998710036277771, + "learning_rate": 1.9160294256942336e-05, + "loss": 1.7556, + "step": 23466 + }, + { + "epoch": 7.202885205647637, + "grad_norm": 0.2092670500278473, + "learning_rate": 1.9156381959762058e-05, + "loss": 1.6883, + "step": 23467 + }, + { + "epoch": 7.203192142418661, + "grad_norm": 0.20657336711883545, + 
"learning_rate": 1.915246996739649e-05, + "loss": 1.8035, + "step": 23468 + }, + { + "epoch": 7.203499079189687, + "grad_norm": 0.2175077497959137, + "learning_rate": 1.9148558279884294e-05, + "loss": 1.7173, + "step": 23469 + }, + { + "epoch": 7.203806015960712, + "grad_norm": 0.16851630806922913, + "learning_rate": 1.9144646897264114e-05, + "loss": 1.6874, + "step": 23470 + }, + { + "epoch": 7.204112952731737, + "grad_norm": 0.23194117844104767, + "learning_rate": 1.9140735819574647e-05, + "loss": 1.7156, + "step": 23471 + }, + { + "epoch": 7.204419889502763, + "grad_norm": 0.17139053344726562, + "learning_rate": 1.9136825046854483e-05, + "loss": 1.6997, + "step": 23472 + }, + { + "epoch": 7.204726826273788, + "grad_norm": 0.18561725318431854, + "learning_rate": 1.913291457914234e-05, + "loss": 1.6575, + "step": 23473 + }, + { + "epoch": 7.2050337630448125, + "grad_norm": 0.2333156019449234, + "learning_rate": 1.9129004416476793e-05, + "loss": 1.7453, + "step": 23474 + }, + { + "epoch": 7.205340699815838, + "grad_norm": 0.2594338655471802, + "learning_rate": 1.9125094558896534e-05, + "loss": 1.7087, + "step": 23475 + }, + { + "epoch": 7.205647636586863, + "grad_norm": 0.16303664445877075, + "learning_rate": 1.91211850064402e-05, + "loss": 1.6985, + "step": 23476 + }, + { + "epoch": 7.2059545733578885, + "grad_norm": 0.2592144012451172, + "learning_rate": 1.9117275759146387e-05, + "loss": 1.7196, + "step": 23477 + }, + { + "epoch": 7.206261510128914, + "grad_norm": 0.1643611341714859, + "learning_rate": 1.9113366817053784e-05, + "loss": 1.686, + "step": 23478 + }, + { + "epoch": 7.206568446899938, + "grad_norm": 0.19730710983276367, + "learning_rate": 1.9109458180200966e-05, + "loss": 1.6883, + "step": 23479 + }, + { + "epoch": 7.206875383670964, + "grad_norm": 0.16942749917507172, + "learning_rate": 1.9105549848626602e-05, + "loss": 1.7272, + "step": 23480 + }, + { + "epoch": 7.207182320441989, + "grad_norm": 0.21967467665672302, + "learning_rate": 
1.91016418223693e-05, + "loss": 1.7501, + "step": 23481 + }, + { + "epoch": 7.207489257213014, + "grad_norm": 0.17037035524845123, + "learning_rate": 1.9097734101467684e-05, + "loss": 1.72, + "step": 23482 + }, + { + "epoch": 7.20779619398404, + "grad_norm": 0.21497979760169983, + "learning_rate": 1.9093826685960374e-05, + "loss": 1.6993, + "step": 23483 + }, + { + "epoch": 7.208103130755064, + "grad_norm": 0.1462371051311493, + "learning_rate": 1.9089919575885985e-05, + "loss": 1.6249, + "step": 23484 + }, + { + "epoch": 7.208410067526089, + "grad_norm": 0.1863165646791458, + "learning_rate": 1.9086012771283122e-05, + "loss": 1.6343, + "step": 23485 + }, + { + "epoch": 7.208717004297115, + "grad_norm": 0.1705196648836136, + "learning_rate": 1.9082106272190403e-05, + "loss": 1.7115, + "step": 23486 + }, + { + "epoch": 7.20902394106814, + "grad_norm": 0.20928895473480225, + "learning_rate": 1.9078200078646413e-05, + "loss": 1.6953, + "step": 23487 + }, + { + "epoch": 7.209330877839165, + "grad_norm": 0.2172931581735611, + "learning_rate": 1.9074294190689812e-05, + "loss": 1.7436, + "step": 23488 + }, + { + "epoch": 7.209637814610191, + "grad_norm": 0.1760822981595993, + "learning_rate": 1.9070388608359124e-05, + "loss": 1.6898, + "step": 23489 + }, + { + "epoch": 7.209944751381215, + "grad_norm": 0.28154727816581726, + "learning_rate": 1.9066483331693018e-05, + "loss": 1.7583, + "step": 23490 + }, + { + "epoch": 7.2102516881522405, + "grad_norm": 0.28375890851020813, + "learning_rate": 1.9062578360730027e-05, + "loss": 1.7428, + "step": 23491 + }, + { + "epoch": 7.210558624923266, + "grad_norm": 0.2173614352941513, + "learning_rate": 1.905867369550878e-05, + "loss": 1.6902, + "step": 23492 + }, + { + "epoch": 7.210865561694291, + "grad_norm": 0.2525392174720764, + "learning_rate": 1.9054769336067875e-05, + "loss": 1.7205, + "step": 23493 + }, + { + "epoch": 7.2111724984653165, + "grad_norm": 0.22913219034671783, + "learning_rate": 1.905086528244584e-05, + "loss": 
1.7269, + "step": 23494 + }, + { + "epoch": 7.211479435236341, + "grad_norm": 0.2174263298511505, + "learning_rate": 1.9046961534681327e-05, + "loss": 1.7058, + "step": 23495 + }, + { + "epoch": 7.211786372007366, + "grad_norm": 0.2277042120695114, + "learning_rate": 1.9043058092812848e-05, + "loss": 1.7048, + "step": 23496 + }, + { + "epoch": 7.212093308778392, + "grad_norm": 0.17835062742233276, + "learning_rate": 1.9039154956879036e-05, + "loss": 1.7258, + "step": 23497 + }, + { + "epoch": 7.212400245549417, + "grad_norm": 0.22751156985759735, + "learning_rate": 1.903525212691844e-05, + "loss": 1.708, + "step": 23498 + }, + { + "epoch": 7.212707182320442, + "grad_norm": 0.21247950196266174, + "learning_rate": 1.903134960296963e-05, + "loss": 1.7142, + "step": 23499 + }, + { + "epoch": 7.213014119091467, + "grad_norm": 0.2256091684103012, + "learning_rate": 1.9027447385071175e-05, + "loss": 1.6826, + "step": 23500 + }, + { + "epoch": 7.213321055862492, + "grad_norm": 0.16704921424388885, + "learning_rate": 1.902354547326164e-05, + "loss": 1.6639, + "step": 23501 + }, + { + "epoch": 7.213627992633517, + "grad_norm": 0.20211774110794067, + "learning_rate": 1.901964386757958e-05, + "loss": 1.7448, + "step": 23502 + }, + { + "epoch": 7.213934929404543, + "grad_norm": 0.2090187519788742, + "learning_rate": 1.901574256806356e-05, + "loss": 1.7425, + "step": 23503 + }, + { + "epoch": 7.214241866175568, + "grad_norm": 0.1942494809627533, + "learning_rate": 1.9011841574752114e-05, + "loss": 1.721, + "step": 23504 + }, + { + "epoch": 7.214548802946593, + "grad_norm": 0.1842714548110962, + "learning_rate": 1.900794088768385e-05, + "loss": 1.7092, + "step": 23505 + }, + { + "epoch": 7.214855739717618, + "grad_norm": 0.16807401180267334, + "learning_rate": 1.900404050689724e-05, + "loss": 1.6788, + "step": 23506 + }, + { + "epoch": 7.215162676488643, + "grad_norm": 0.16467349231243134, + "learning_rate": 1.9000140432430907e-05, + "loss": 1.6544, + "step": 23507 + }, + { + 
"epoch": 7.2154696132596685, + "grad_norm": 0.1806645542383194, + "learning_rate": 1.899624066432332e-05, + "loss": 1.6871, + "step": 23508 + }, + { + "epoch": 7.215776550030694, + "grad_norm": 0.16891708970069885, + "learning_rate": 1.8992341202613073e-05, + "loss": 1.6912, + "step": 23509 + }, + { + "epoch": 7.216083486801719, + "grad_norm": 0.21191391348838806, + "learning_rate": 1.89884420473387e-05, + "loss": 1.7843, + "step": 23510 + }, + { + "epoch": 7.216390423572744, + "grad_norm": 0.18484020233154297, + "learning_rate": 1.8984543198538684e-05, + "loss": 1.699, + "step": 23511 + }, + { + "epoch": 7.216697360343769, + "grad_norm": 0.2106105536222458, + "learning_rate": 1.8980644656251627e-05, + "loss": 1.7239, + "step": 23512 + }, + { + "epoch": 7.217004297114794, + "grad_norm": 0.19923320412635803, + "learning_rate": 1.8976746420515988e-05, + "loss": 1.7989, + "step": 23513 + }, + { + "epoch": 7.21731123388582, + "grad_norm": 0.21371988952159882, + "learning_rate": 1.897284849137034e-05, + "loss": 1.7071, + "step": 23514 + }, + { + "epoch": 7.217618170656845, + "grad_norm": 0.20450851321220398, + "learning_rate": 1.8968950868853184e-05, + "loss": 1.7051, + "step": 23515 + }, + { + "epoch": 7.21792510742787, + "grad_norm": 0.22700995206832886, + "learning_rate": 1.8965053553003055e-05, + "loss": 1.7556, + "step": 23516 + }, + { + "epoch": 7.218232044198895, + "grad_norm": 0.26295945048332214, + "learning_rate": 1.896115654385845e-05, + "loss": 1.7893, + "step": 23517 + }, + { + "epoch": 7.21853898096992, + "grad_norm": 0.17091867327690125, + "learning_rate": 1.8957259841457885e-05, + "loss": 1.7289, + "step": 23518 + }, + { + "epoch": 7.218845917740945, + "grad_norm": 0.24840304255485535, + "learning_rate": 1.8953363445839877e-05, + "loss": 1.6958, + "step": 23519 + }, + { + "epoch": 7.219152854511971, + "grad_norm": 0.20042046904563904, + "learning_rate": 1.8949467357042926e-05, + "loss": 1.743, + "step": 23520 + }, + { + "epoch": 7.219459791282996, + 
"grad_norm": 0.18286047875881195, + "learning_rate": 1.894557157510552e-05, + "loss": 1.7065, + "step": 23521 + }, + { + "epoch": 7.2197667280540205, + "grad_norm": 0.18324656784534454, + "learning_rate": 1.894167610006622e-05, + "loss": 1.7083, + "step": 23522 + }, + { + "epoch": 7.220073664825046, + "grad_norm": 0.17110426723957062, + "learning_rate": 1.8937780931963432e-05, + "loss": 1.7016, + "step": 23523 + }, + { + "epoch": 7.220380601596071, + "grad_norm": 0.19164881110191345, + "learning_rate": 1.8933886070835743e-05, + "loss": 1.7011, + "step": 23524 + }, + { + "epoch": 7.2206875383670965, + "grad_norm": 0.16899923980236053, + "learning_rate": 1.892999151672157e-05, + "loss": 1.7227, + "step": 23525 + }, + { + "epoch": 7.220994475138122, + "grad_norm": 0.18763495981693268, + "learning_rate": 1.8926097269659437e-05, + "loss": 1.6956, + "step": 23526 + }, + { + "epoch": 7.221301411909146, + "grad_norm": 0.1665162295103073, + "learning_rate": 1.8922203329687847e-05, + "loss": 1.7039, + "step": 23527 + }, + { + "epoch": 7.221608348680172, + "grad_norm": 0.20766250789165497, + "learning_rate": 1.8918309696845226e-05, + "loss": 1.7703, + "step": 23528 + }, + { + "epoch": 7.221915285451197, + "grad_norm": 0.1813010275363922, + "learning_rate": 1.891441637117012e-05, + "loss": 1.6709, + "step": 23529 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 0.15327073633670807, + "learning_rate": 1.891052335270094e-05, + "loss": 1.6518, + "step": 23530 + }, + { + "epoch": 7.222529158993248, + "grad_norm": 0.17191094160079956, + "learning_rate": 1.8906630641476203e-05, + "loss": 1.7193, + "step": 23531 + }, + { + "epoch": 7.222836095764273, + "grad_norm": 0.17976176738739014, + "learning_rate": 1.8902738237534363e-05, + "loss": 1.7162, + "step": 23532 + }, + { + "epoch": 7.223143032535297, + "grad_norm": 0.1828993558883667, + "learning_rate": 1.8898846140913894e-05, + "loss": 1.7163, + "step": 23533 + }, + { + "epoch": 7.223449969306323, + "grad_norm": 
0.15828034281730652, + "learning_rate": 1.889495435165326e-05, + "loss": 1.6734, + "step": 23534 + }, + { + "epoch": 7.223756906077348, + "grad_norm": 0.2171369194984436, + "learning_rate": 1.8891062869790915e-05, + "loss": 1.7508, + "step": 23535 + }, + { + "epoch": 7.224063842848373, + "grad_norm": 0.18747110664844513, + "learning_rate": 1.888717169536532e-05, + "loss": 1.7162, + "step": 23536 + }, + { + "epoch": 7.224370779619399, + "grad_norm": 0.19177328050136566, + "learning_rate": 1.8883280828414927e-05, + "loss": 1.7044, + "step": 23537 + }, + { + "epoch": 7.224677716390423, + "grad_norm": 0.175906702876091, + "learning_rate": 1.88793902689782e-05, + "loss": 1.7126, + "step": 23538 + }, + { + "epoch": 7.2249846531614486, + "grad_norm": 0.17842896282672882, + "learning_rate": 1.887550001709357e-05, + "loss": 1.7469, + "step": 23539 + }, + { + "epoch": 7.225291589932474, + "grad_norm": 0.23797607421875, + "learning_rate": 1.8871610072799478e-05, + "loss": 1.7343, + "step": 23540 + }, + { + "epoch": 7.225598526703499, + "grad_norm": 0.2297922819852829, + "learning_rate": 1.8867720436134412e-05, + "loss": 1.7453, + "step": 23541 + }, + { + "epoch": 7.225905463474525, + "grad_norm": 0.19950568675994873, + "learning_rate": 1.8863831107136748e-05, + "loss": 1.6984, + "step": 23542 + }, + { + "epoch": 7.226212400245549, + "grad_norm": 0.2809087038040161, + "learning_rate": 1.8859942085844974e-05, + "loss": 1.7815, + "step": 23543 + }, + { + "epoch": 7.226519337016574, + "grad_norm": 0.20534642040729523, + "learning_rate": 1.8856053372297515e-05, + "loss": 1.7455, + "step": 23544 + }, + { + "epoch": 7.2268262737876, + "grad_norm": 0.20052307844161987, + "learning_rate": 1.885216496653276e-05, + "loss": 1.6655, + "step": 23545 + }, + { + "epoch": 7.227133210558625, + "grad_norm": 0.1948573738336563, + "learning_rate": 1.8848276868589205e-05, + "loss": 1.7036, + "step": 23546 + }, + { + "epoch": 7.22744014732965, + "grad_norm": 0.16764269769191742, + "learning_rate": 
1.8844389078505197e-05, + "loss": 1.6605, + "step": 23547 + }, + { + "epoch": 7.227747084100676, + "grad_norm": 0.17951633036136627, + "learning_rate": 1.8840501596319214e-05, + "loss": 1.6948, + "step": 23548 + }, + { + "epoch": 7.2280540208717, + "grad_norm": 0.1906418353319168, + "learning_rate": 1.883661442206966e-05, + "loss": 1.7122, + "step": 23549 + }, + { + "epoch": 7.2283609576427255, + "grad_norm": 0.19535204768180847, + "learning_rate": 1.8832727555794943e-05, + "loss": 1.7089, + "step": 23550 + }, + { + "epoch": 7.228667894413751, + "grad_norm": 0.20654071867465973, + "learning_rate": 1.8828840997533488e-05, + "loss": 1.7113, + "step": 23551 + }, + { + "epoch": 7.228974831184776, + "grad_norm": 0.18860456347465515, + "learning_rate": 1.8824954747323692e-05, + "loss": 1.7475, + "step": 23552 + }, + { + "epoch": 7.2292817679558015, + "grad_norm": 0.21949729323387146, + "learning_rate": 1.882106880520396e-05, + "loss": 1.7819, + "step": 23553 + }, + { + "epoch": 7.229588704726826, + "grad_norm": 0.2177286595106125, + "learning_rate": 1.881718317121271e-05, + "loss": 1.7554, + "step": 23554 + }, + { + "epoch": 7.229895641497851, + "grad_norm": 0.21143296360969543, + "learning_rate": 1.8813297845388328e-05, + "loss": 1.7811, + "step": 23555 + }, + { + "epoch": 7.230202578268877, + "grad_norm": 0.24787208437919617, + "learning_rate": 1.880941282776922e-05, + "loss": 1.707, + "step": 23556 + }, + { + "epoch": 7.230509515039902, + "grad_norm": 0.18048164248466492, + "learning_rate": 1.880552811839375e-05, + "loss": 1.6841, + "step": 23557 + }, + { + "epoch": 7.230816451810927, + "grad_norm": 0.24056772887706757, + "learning_rate": 1.8801643717300375e-05, + "loss": 1.7868, + "step": 23558 + }, + { + "epoch": 7.231123388581952, + "grad_norm": 0.18564146757125854, + "learning_rate": 1.879775962452741e-05, + "loss": 1.7506, + "step": 23559 + }, + { + "epoch": 7.231430325352977, + "grad_norm": 0.25965458154678345, + "learning_rate": 1.87938758401133e-05, + "loss": 
1.7307, + "step": 23560 + }, + { + "epoch": 7.231737262124002, + "grad_norm": 0.17774315178394318, + "learning_rate": 1.8789992364096394e-05, + "loss": 1.7089, + "step": 23561 + }, + { + "epoch": 7.232044198895028, + "grad_norm": 0.2488560527563095, + "learning_rate": 1.878610919651505e-05, + "loss": 1.6811, + "step": 23562 + }, + { + "epoch": 7.232351135666053, + "grad_norm": 0.1963108628988266, + "learning_rate": 1.8782226337407703e-05, + "loss": 1.6512, + "step": 23563 + }, + { + "epoch": 7.232658072437078, + "grad_norm": 0.25702449679374695, + "learning_rate": 1.8778343786812663e-05, + "loss": 1.7697, + "step": 23564 + }, + { + "epoch": 7.232965009208103, + "grad_norm": 0.18145591020584106, + "learning_rate": 1.8774461544768347e-05, + "loss": 1.6842, + "step": 23565 + }, + { + "epoch": 7.233271945979128, + "grad_norm": 0.2482728213071823, + "learning_rate": 1.87705796113131e-05, + "loss": 1.7028, + "step": 23566 + }, + { + "epoch": 7.2335788827501535, + "grad_norm": 0.16365976631641388, + "learning_rate": 1.8766697986485293e-05, + "loss": 1.7266, + "step": 23567 + }, + { + "epoch": 7.233885819521179, + "grad_norm": 0.1877463459968567, + "learning_rate": 1.876281667032328e-05, + "loss": 1.6909, + "step": 23568 + }, + { + "epoch": 7.234192756292204, + "grad_norm": 0.19121702015399933, + "learning_rate": 1.8758935662865423e-05, + "loss": 1.7303, + "step": 23569 + }, + { + "epoch": 7.234499693063229, + "grad_norm": 0.1783505082130432, + "learning_rate": 1.8755054964150072e-05, + "loss": 1.7209, + "step": 23570 + }, + { + "epoch": 7.234806629834254, + "grad_norm": 0.172771617770195, + "learning_rate": 1.8751174574215585e-05, + "loss": 1.6824, + "step": 23571 + }, + { + "epoch": 7.235113566605279, + "grad_norm": 0.1675102859735489, + "learning_rate": 1.8747294493100304e-05, + "loss": 1.6664, + "step": 23572 + }, + { + "epoch": 7.235420503376305, + "grad_norm": 0.18213391304016113, + "learning_rate": 1.8743414720842578e-05, + "loss": 1.6725, + "step": 23573 + }, + { + 
"epoch": 7.23572744014733, + "grad_norm": 0.2204304337501526, + "learning_rate": 1.8739535257480728e-05, + "loss": 1.7662, + "step": 23574 + }, + { + "epoch": 7.236034376918354, + "grad_norm": 0.22732098400592804, + "learning_rate": 1.873565610305315e-05, + "loss": 1.7808, + "step": 23575 + }, + { + "epoch": 7.23634131368938, + "grad_norm": 0.17859263718128204, + "learning_rate": 1.8731777257598128e-05, + "loss": 1.6767, + "step": 23576 + }, + { + "epoch": 7.236648250460405, + "grad_norm": 0.16690675914287567, + "learning_rate": 1.8727898721154007e-05, + "loss": 1.6523, + "step": 23577 + }, + { + "epoch": 7.23695518723143, + "grad_norm": 0.17576774954795837, + "learning_rate": 1.872402049375912e-05, + "loss": 1.6951, + "step": 23578 + }, + { + "epoch": 7.237262124002456, + "grad_norm": 0.20455172657966614, + "learning_rate": 1.8720142575451777e-05, + "loss": 1.7402, + "step": 23579 + }, + { + "epoch": 7.237569060773481, + "grad_norm": 0.2122879922389984, + "learning_rate": 1.8716264966270352e-05, + "loss": 1.7571, + "step": 23580 + }, + { + "epoch": 7.2378759975445055, + "grad_norm": 0.17752611637115479, + "learning_rate": 1.87123876662531e-05, + "loss": 1.7185, + "step": 23581 + }, + { + "epoch": 7.238182934315531, + "grad_norm": 0.21253602206707, + "learning_rate": 1.87085106754384e-05, + "loss": 1.7281, + "step": 23582 + }, + { + "epoch": 7.238489871086556, + "grad_norm": 0.19470329582691193, + "learning_rate": 1.8704633993864514e-05, + "loss": 1.6772, + "step": 23583 + }, + { + "epoch": 7.2387968078575815, + "grad_norm": 0.19556869566440582, + "learning_rate": 1.8700757621569786e-05, + "loss": 1.6888, + "step": 23584 + }, + { + "epoch": 7.239103744628607, + "grad_norm": 0.20525780320167542, + "learning_rate": 1.869688155859252e-05, + "loss": 1.7517, + "step": 23585 + }, + { + "epoch": 7.239410681399631, + "grad_norm": 0.23367032408714294, + "learning_rate": 1.869300580497102e-05, + "loss": 1.781, + "step": 23586 + }, + { + "epoch": 7.239717618170657, + 
"grad_norm": 0.1893240362405777, + "learning_rate": 1.8689130360743583e-05, + "loss": 1.7265, + "step": 23587 + }, + { + "epoch": 7.240024554941682, + "grad_norm": 0.17136700451374054, + "learning_rate": 1.868525522594851e-05, + "loss": 1.6631, + "step": 23588 + }, + { + "epoch": 7.240331491712707, + "grad_norm": 0.1984632909297943, + "learning_rate": 1.8681380400624103e-05, + "loss": 1.7337, + "step": 23589 + }, + { + "epoch": 7.240638428483733, + "grad_norm": 0.19046886265277863, + "learning_rate": 1.867750588480865e-05, + "loss": 1.7094, + "step": 23590 + }, + { + "epoch": 7.240945365254758, + "grad_norm": 0.18242189288139343, + "learning_rate": 1.8673631678540427e-05, + "loss": 1.692, + "step": 23591 + }, + { + "epoch": 7.241252302025782, + "grad_norm": 0.1741522252559662, + "learning_rate": 1.8669757781857768e-05, + "loss": 1.6975, + "step": 23592 + }, + { + "epoch": 7.241559238796808, + "grad_norm": 0.1778191328048706, + "learning_rate": 1.866588419479891e-05, + "loss": 1.7092, + "step": 23593 + }, + { + "epoch": 7.241866175567833, + "grad_norm": 0.17402158677577972, + "learning_rate": 1.866201091740215e-05, + "loss": 1.7072, + "step": 23594 + }, + { + "epoch": 7.242173112338858, + "grad_norm": 0.22215119004249573, + "learning_rate": 1.8658137949705763e-05, + "loss": 1.7205, + "step": 23595 + }, + { + "epoch": 7.242480049109884, + "grad_norm": 0.15291182696819305, + "learning_rate": 1.8654265291748013e-05, + "loss": 1.7341, + "step": 23596 + }, + { + "epoch": 7.242786985880908, + "grad_norm": 0.18226875364780426, + "learning_rate": 1.8650392943567217e-05, + "loss": 1.6731, + "step": 23597 + }, + { + "epoch": 7.2430939226519335, + "grad_norm": 0.19169047474861145, + "learning_rate": 1.864652090520158e-05, + "loss": 1.777, + "step": 23598 + }, + { + "epoch": 7.243400859422959, + "grad_norm": 0.2063349187374115, + "learning_rate": 1.8642649176689437e-05, + "loss": 1.7258, + "step": 23599 + }, + { + "epoch": 7.243707796193984, + "grad_norm": 0.18550212681293488, 
+ "learning_rate": 1.863877775806898e-05, + "loss": 1.7041, + "step": 23600 + }, + { + "epoch": 7.2440147329650095, + "grad_norm": 0.21196649968624115, + "learning_rate": 1.8634906649378514e-05, + "loss": 1.6672, + "step": 23601 + }, + { + "epoch": 7.244321669736034, + "grad_norm": 0.26801541447639465, + "learning_rate": 1.863103585065629e-05, + "loss": 1.6981, + "step": 23602 + }, + { + "epoch": 7.244628606507059, + "grad_norm": 0.1854090690612793, + "learning_rate": 1.862716536194055e-05, + "loss": 1.7406, + "step": 23603 + }, + { + "epoch": 7.244935543278085, + "grad_norm": 0.15906888246536255, + "learning_rate": 1.8623295183269556e-05, + "loss": 1.6721, + "step": 23604 + }, + { + "epoch": 7.24524248004911, + "grad_norm": 0.2210245132446289, + "learning_rate": 1.8619425314681547e-05, + "loss": 1.7717, + "step": 23605 + }, + { + "epoch": 7.245549416820135, + "grad_norm": 0.17654140293598175, + "learning_rate": 1.861555575621477e-05, + "loss": 1.7428, + "step": 23606 + }, + { + "epoch": 7.245856353591161, + "grad_norm": 0.1582319736480713, + "learning_rate": 1.8611686507907466e-05, + "loss": 1.6814, + "step": 23607 + }, + { + "epoch": 7.246163290362185, + "grad_norm": 0.18817248940467834, + "learning_rate": 1.8607817569797852e-05, + "loss": 1.74, + "step": 23608 + }, + { + "epoch": 7.24647022713321, + "grad_norm": 0.26141074299812317, + "learning_rate": 1.8603948941924227e-05, + "loss": 1.6966, + "step": 23609 + }, + { + "epoch": 7.246777163904236, + "grad_norm": 0.16877111792564392, + "learning_rate": 1.8600080624324757e-05, + "loss": 1.6849, + "step": 23610 + }, + { + "epoch": 7.247084100675261, + "grad_norm": 0.16188141703605652, + "learning_rate": 1.8596212617037694e-05, + "loss": 1.6342, + "step": 23611 + }, + { + "epoch": 7.247391037446286, + "grad_norm": 0.19506491720676422, + "learning_rate": 1.8592344920101267e-05, + "loss": 1.6874, + "step": 23612 + }, + { + "epoch": 7.247697974217311, + "grad_norm": 0.1865006536245346, + "learning_rate": 
1.8588477533553677e-05, + "loss": 1.7365, + "step": 23613 + }, + { + "epoch": 7.248004910988336, + "grad_norm": 0.16737428307533264, + "learning_rate": 1.85846104574332e-05, + "loss": 1.6971, + "step": 23614 + }, + { + "epoch": 7.2483118477593615, + "grad_norm": 0.1754695028066635, + "learning_rate": 1.858074369177798e-05, + "loss": 1.7133, + "step": 23615 + }, + { + "epoch": 7.248618784530387, + "grad_norm": 0.21066173911094666, + "learning_rate": 1.85768772366263e-05, + "loss": 1.7737, + "step": 23616 + }, + { + "epoch": 7.248925721301412, + "grad_norm": 0.2530418932437897, + "learning_rate": 1.8573011092016303e-05, + "loss": 1.7962, + "step": 23617 + }, + { + "epoch": 7.249232658072437, + "grad_norm": 0.17780029773712158, + "learning_rate": 1.8569145257986247e-05, + "loss": 1.6691, + "step": 23618 + }, + { + "epoch": 7.249539594843462, + "grad_norm": 0.2105826437473297, + "learning_rate": 1.856527973457432e-05, + "loss": 1.6943, + "step": 23619 + }, + { + "epoch": 7.249846531614487, + "grad_norm": 0.20929837226867676, + "learning_rate": 1.856141452181872e-05, + "loss": 1.7223, + "step": 23620 + }, + { + "epoch": 7.250153468385513, + "grad_norm": 0.17105531692504883, + "learning_rate": 1.8557549619757653e-05, + "loss": 1.6956, + "step": 23621 + }, + { + "epoch": 7.250460405156538, + "grad_norm": 0.21282736957073212, + "learning_rate": 1.8553685028429306e-05, + "loss": 1.7299, + "step": 23622 + }, + { + "epoch": 7.250767341927563, + "grad_norm": 0.1673511266708374, + "learning_rate": 1.8549820747871882e-05, + "loss": 1.7184, + "step": 23623 + }, + { + "epoch": 7.251074278698588, + "grad_norm": 0.1877487152814865, + "learning_rate": 1.854595677812356e-05, + "loss": 1.6989, + "step": 23624 + }, + { + "epoch": 7.251381215469613, + "grad_norm": 0.1709173619747162, + "learning_rate": 1.8542093119222504e-05, + "loss": 1.6994, + "step": 23625 + }, + { + "epoch": 7.2516881522406385, + "grad_norm": 0.18894633650779724, + "learning_rate": 1.8538229771206962e-05, + "loss": 
1.665, + "step": 23626 + }, + { + "epoch": 7.251995089011664, + "grad_norm": 0.17623448371887207, + "learning_rate": 1.8534366734115056e-05, + "loss": 1.6999, + "step": 23627 + }, + { + "epoch": 7.252302025782689, + "grad_norm": 0.20008981227874756, + "learning_rate": 1.8530504007984982e-05, + "loss": 1.7147, + "step": 23628 + }, + { + "epoch": 7.252608962553714, + "grad_norm": 0.2506260573863983, + "learning_rate": 1.852664159285491e-05, + "loss": 1.7485, + "step": 23629 + }, + { + "epoch": 7.252915899324739, + "grad_norm": 0.17746438086032867, + "learning_rate": 1.8522779488763e-05, + "loss": 1.7534, + "step": 23630 + }, + { + "epoch": 7.253222836095764, + "grad_norm": 0.1910836547613144, + "learning_rate": 1.8518917695747462e-05, + "loss": 1.7167, + "step": 23631 + }, + { + "epoch": 7.25352977286679, + "grad_norm": 0.18009543418884277, + "learning_rate": 1.8515056213846398e-05, + "loss": 1.6849, + "step": 23632 + }, + { + "epoch": 7.253836709637815, + "grad_norm": 0.18150615692138672, + "learning_rate": 1.851119504309804e-05, + "loss": 1.7077, + "step": 23633 + }, + { + "epoch": 7.25414364640884, + "grad_norm": 0.1874052882194519, + "learning_rate": 1.850733418354047e-05, + "loss": 1.7398, + "step": 23634 + }, + { + "epoch": 7.254450583179865, + "grad_norm": 0.18285217881202698, + "learning_rate": 1.8503473635211897e-05, + "loss": 1.7433, + "step": 23635 + }, + { + "epoch": 7.25475751995089, + "grad_norm": 0.19326861202716827, + "learning_rate": 1.8499613398150463e-05, + "loss": 1.7095, + "step": 23636 + }, + { + "epoch": 7.255064456721915, + "grad_norm": 0.21128259599208832, + "learning_rate": 1.849575347239431e-05, + "loss": 1.7352, + "step": 23637 + }, + { + "epoch": 7.255371393492941, + "grad_norm": 0.19309113919734955, + "learning_rate": 1.849189385798159e-05, + "loss": 1.7098, + "step": 23638 + }, + { + "epoch": 7.255678330263966, + "grad_norm": 0.1877751648426056, + "learning_rate": 1.848803455495044e-05, + "loss": 1.7279, + "step": 23639 + }, + { + 
"epoch": 7.2559852670349905, + "grad_norm": 0.18840502202510834, + "learning_rate": 1.8484175563339e-05, + "loss": 1.7174, + "step": 23640 + }, + { + "epoch": 7.256292203806016, + "grad_norm": 0.1912582963705063, + "learning_rate": 1.848031688318541e-05, + "loss": 1.6964, + "step": 23641 + }, + { + "epoch": 7.256599140577041, + "grad_norm": 0.188243106007576, + "learning_rate": 1.847645851452779e-05, + "loss": 1.7296, + "step": 23642 + }, + { + "epoch": 7.2569060773480665, + "grad_norm": 0.15838554501533508, + "learning_rate": 1.8472600457404317e-05, + "loss": 1.6276, + "step": 23643 + }, + { + "epoch": 7.257213014119092, + "grad_norm": 0.1605941653251648, + "learning_rate": 1.8468742711853065e-05, + "loss": 1.7015, + "step": 23644 + }, + { + "epoch": 7.257519950890116, + "grad_norm": 0.23647825419902802, + "learning_rate": 1.846488527791218e-05, + "loss": 1.775, + "step": 23645 + }, + { + "epoch": 7.257826887661142, + "grad_norm": 0.2414257973432541, + "learning_rate": 1.846102815561978e-05, + "loss": 1.7456, + "step": 23646 + }, + { + "epoch": 7.258133824432167, + "grad_norm": 0.221851646900177, + "learning_rate": 1.845717134501397e-05, + "loss": 1.6875, + "step": 23647 + }, + { + "epoch": 7.258440761203192, + "grad_norm": 0.20732705295085907, + "learning_rate": 1.8453314846132914e-05, + "loss": 1.6619, + "step": 23648 + }, + { + "epoch": 7.258747697974218, + "grad_norm": 0.18818728625774384, + "learning_rate": 1.8449458659014657e-05, + "loss": 1.6961, + "step": 23649 + }, + { + "epoch": 7.259054634745242, + "grad_norm": 0.19335074722766876, + "learning_rate": 1.8445602783697374e-05, + "loss": 1.6816, + "step": 23650 + }, + { + "epoch": 7.259361571516267, + "grad_norm": 0.27334100008010864, + "learning_rate": 1.844174722021911e-05, + "loss": 1.7435, + "step": 23651 + }, + { + "epoch": 7.259668508287293, + "grad_norm": 0.18763858079910278, + "learning_rate": 1.843789196861801e-05, + "loss": 1.713, + "step": 23652 + }, + { + "epoch": 7.259975445058318, + 
"grad_norm": 0.2585131525993347, + "learning_rate": 1.843403702893216e-05, + "loss": 1.7151, + "step": 23653 + }, + { + "epoch": 7.260282381829343, + "grad_norm": 0.182148277759552, + "learning_rate": 1.843018240119966e-05, + "loss": 1.7018, + "step": 23654 + }, + { + "epoch": 7.260589318600369, + "grad_norm": 0.31881436705589294, + "learning_rate": 1.84263280854586e-05, + "loss": 1.7428, + "step": 23655 + }, + { + "epoch": 7.260896255371393, + "grad_norm": 0.20997895300388336, + "learning_rate": 1.8422474081747073e-05, + "loss": 1.724, + "step": 23656 + }, + { + "epoch": 7.2612031921424185, + "grad_norm": 0.25038522481918335, + "learning_rate": 1.8418620390103163e-05, + "loss": 1.739, + "step": 23657 + }, + { + "epoch": 7.261510128913444, + "grad_norm": 0.22313323616981506, + "learning_rate": 1.841476701056496e-05, + "loss": 1.7493, + "step": 23658 + }, + { + "epoch": 7.261817065684469, + "grad_norm": 0.22516389191150665, + "learning_rate": 1.8410913943170522e-05, + "loss": 1.79, + "step": 23659 + }, + { + "epoch": 7.2621240024554945, + "grad_norm": 0.1966279298067093, + "learning_rate": 1.8407061187957982e-05, + "loss": 1.7418, + "step": 23660 + }, + { + "epoch": 7.262430939226519, + "grad_norm": 0.18697889149188995, + "learning_rate": 1.840320874496536e-05, + "loss": 1.7347, + "step": 23661 + }, + { + "epoch": 7.262737875997544, + "grad_norm": 0.18226566910743713, + "learning_rate": 1.8399356614230755e-05, + "loss": 1.6979, + "step": 23662 + }, + { + "epoch": 7.26304481276857, + "grad_norm": 0.18880577385425568, + "learning_rate": 1.839550479579223e-05, + "loss": 1.6612, + "step": 23663 + }, + { + "epoch": 7.263351749539595, + "grad_norm": 0.2048085480928421, + "learning_rate": 1.8391653289687826e-05, + "loss": 1.7313, + "step": 23664 + }, + { + "epoch": 7.26365868631062, + "grad_norm": 0.238912895321846, + "learning_rate": 1.838780209595567e-05, + "loss": 1.7522, + "step": 23665 + }, + { + "epoch": 7.263965623081646, + "grad_norm": 0.1656452864408493, + 
"learning_rate": 1.838395121463375e-05, + "loss": 1.6742, + "step": 23666 + }, + { + "epoch": 7.26427255985267, + "grad_norm": 0.2209266573190689, + "learning_rate": 1.8380100645760186e-05, + "loss": 1.6592, + "step": 23667 + }, + { + "epoch": 7.264579496623695, + "grad_norm": 0.19701217114925385, + "learning_rate": 1.8376250389372967e-05, + "loss": 1.7211, + "step": 23668 + }, + { + "epoch": 7.264886433394721, + "grad_norm": 0.229326069355011, + "learning_rate": 1.837240044551019e-05, + "loss": 1.7044, + "step": 23669 + }, + { + "epoch": 7.265193370165746, + "grad_norm": 0.18499960005283356, + "learning_rate": 1.8368550814209894e-05, + "loss": 1.705, + "step": 23670 + }, + { + "epoch": 7.265500306936771, + "grad_norm": 0.25504955649375916, + "learning_rate": 1.8364701495510117e-05, + "loss": 1.7246, + "step": 23671 + }, + { + "epoch": 7.265807243707796, + "grad_norm": 0.25998997688293457, + "learning_rate": 1.8360852489448903e-05, + "loss": 1.8311, + "step": 23672 + }, + { + "epoch": 7.266114180478821, + "grad_norm": 0.2437162697315216, + "learning_rate": 1.8357003796064294e-05, + "loss": 1.6467, + "step": 23673 + }, + { + "epoch": 7.2664211172498465, + "grad_norm": 0.20784614980220795, + "learning_rate": 1.8353155415394315e-05, + "loss": 1.7361, + "step": 23674 + }, + { + "epoch": 7.266728054020872, + "grad_norm": 0.22633932530879974, + "learning_rate": 1.8349307347476998e-05, + "loss": 1.6518, + "step": 23675 + }, + { + "epoch": 7.267034990791897, + "grad_norm": 0.19307547807693481, + "learning_rate": 1.8345459592350367e-05, + "loss": 1.7469, + "step": 23676 + }, + { + "epoch": 7.267341927562922, + "grad_norm": 0.20418168604373932, + "learning_rate": 1.8341612150052483e-05, + "loss": 1.6892, + "step": 23677 + }, + { + "epoch": 7.267648864333947, + "grad_norm": 0.1574825942516327, + "learning_rate": 1.8337765020621332e-05, + "loss": 1.6682, + "step": 23678 + }, + { + "epoch": 7.267955801104972, + "grad_norm": 0.31023111939430237, + "learning_rate": 
1.8333918204094947e-05, + "loss": 1.7382, + "step": 23679 + }, + { + "epoch": 7.268262737875998, + "grad_norm": 0.18148623406887054, + "learning_rate": 1.833007170051134e-05, + "loss": 1.726, + "step": 23680 + }, + { + "epoch": 7.268569674647023, + "grad_norm": 0.19278696179389954, + "learning_rate": 1.832622550990851e-05, + "loss": 1.7176, + "step": 23681 + }, + { + "epoch": 7.268876611418047, + "grad_norm": 0.18298377096652985, + "learning_rate": 1.832237963232452e-05, + "loss": 1.6703, + "step": 23682 + }, + { + "epoch": 7.269183548189073, + "grad_norm": 0.2019357681274414, + "learning_rate": 1.8318534067797304e-05, + "loss": 1.7771, + "step": 23683 + }, + { + "epoch": 7.269490484960098, + "grad_norm": 0.21978864073753357, + "learning_rate": 1.8314688816364944e-05, + "loss": 1.7938, + "step": 23684 + }, + { + "epoch": 7.269797421731123, + "grad_norm": 0.20009377598762512, + "learning_rate": 1.831084387806536e-05, + "loss": 1.7312, + "step": 23685 + }, + { + "epoch": 7.270104358502149, + "grad_norm": 0.16587263345718384, + "learning_rate": 1.8306999252936608e-05, + "loss": 1.7098, + "step": 23686 + }, + { + "epoch": 7.270411295273174, + "grad_norm": 0.20567362010478973, + "learning_rate": 1.8303154941016666e-05, + "loss": 1.6893, + "step": 23687 + }, + { + "epoch": 7.2707182320441985, + "grad_norm": 0.1916830986738205, + "learning_rate": 1.8299310942343527e-05, + "loss": 1.7995, + "step": 23688 + }, + { + "epoch": 7.271025168815224, + "grad_norm": 0.18361486494541168, + "learning_rate": 1.8295467256955174e-05, + "loss": 1.6708, + "step": 23689 + }, + { + "epoch": 7.271332105586249, + "grad_norm": 0.20620734989643097, + "learning_rate": 1.8291623884889597e-05, + "loss": 1.7314, + "step": 23690 + }, + { + "epoch": 7.2716390423572745, + "grad_norm": 0.22560660541057587, + "learning_rate": 1.828778082618478e-05, + "loss": 1.7418, + "step": 23691 + }, + { + "epoch": 7.2719459791283, + "grad_norm": 0.2113492786884308, + "learning_rate": 1.8283938080878697e-05, + 
"loss": 1.724, + "step": 23692 + }, + { + "epoch": 7.272252915899324, + "grad_norm": 0.26234012842178345, + "learning_rate": 1.8280095649009334e-05, + "loss": 1.7723, + "step": 23693 + }, + { + "epoch": 7.27255985267035, + "grad_norm": 0.1675095111131668, + "learning_rate": 1.827625353061465e-05, + "loss": 1.7473, + "step": 23694 + }, + { + "epoch": 7.272866789441375, + "grad_norm": 0.17751236259937286, + "learning_rate": 1.8272411725732623e-05, + "loss": 1.7374, + "step": 23695 + }, + { + "epoch": 7.2731737262124, + "grad_norm": 0.23158904910087585, + "learning_rate": 1.826857023440122e-05, + "loss": 1.8111, + "step": 23696 + }, + { + "epoch": 7.273480662983426, + "grad_norm": 0.17262183129787445, + "learning_rate": 1.8264729056658407e-05, + "loss": 1.7546, + "step": 23697 + }, + { + "epoch": 7.273787599754451, + "grad_norm": 0.20811094343662262, + "learning_rate": 1.8260888192542126e-05, + "loss": 1.8059, + "step": 23698 + }, + { + "epoch": 7.274094536525475, + "grad_norm": 0.17156411707401276, + "learning_rate": 1.825704764209038e-05, + "loss": 1.7261, + "step": 23699 + }, + { + "epoch": 7.274401473296501, + "grad_norm": 0.18523572385311127, + "learning_rate": 1.8253207405341067e-05, + "loss": 1.7139, + "step": 23700 + }, + { + "epoch": 7.274708410067526, + "grad_norm": 0.20626066625118256, + "learning_rate": 1.824936748233219e-05, + "loss": 1.7269, + "step": 23701 + }, + { + "epoch": 7.2750153468385514, + "grad_norm": 0.1717548966407776, + "learning_rate": 1.8245527873101647e-05, + "loss": 1.7168, + "step": 23702 + }, + { + "epoch": 7.275322283609577, + "grad_norm": 0.16322405636310577, + "learning_rate": 1.8241688577687426e-05, + "loss": 1.7392, + "step": 23703 + }, + { + "epoch": 7.275629220380601, + "grad_norm": 0.19775766134262085, + "learning_rate": 1.8237849596127447e-05, + "loss": 1.7055, + "step": 23704 + }, + { + "epoch": 7.275936157151627, + "grad_norm": 0.1969427913427353, + "learning_rate": 1.823401092845966e-05, + "loss": 1.7418, + "step": 23705 + 
}, + { + "epoch": 7.276243093922652, + "grad_norm": 0.1791812628507614, + "learning_rate": 1.8230172574721992e-05, + "loss": 1.6512, + "step": 23706 + }, + { + "epoch": 7.276550030693677, + "grad_norm": 0.18583156168460846, + "learning_rate": 1.8226334534952384e-05, + "loss": 1.7357, + "step": 23707 + }, + { + "epoch": 7.276856967464703, + "grad_norm": 0.20729652047157288, + "learning_rate": 1.822249680918876e-05, + "loss": 1.7323, + "step": 23708 + }, + { + "epoch": 7.277163904235728, + "grad_norm": 0.20089028775691986, + "learning_rate": 1.8218659397469045e-05, + "loss": 1.6835, + "step": 23709 + }, + { + "epoch": 7.277470841006752, + "grad_norm": 0.16569854319095612, + "learning_rate": 1.8214822299831168e-05, + "loss": 1.7486, + "step": 23710 + }, + { + "epoch": 7.277777777777778, + "grad_norm": 0.19979944825172424, + "learning_rate": 1.8210985516313044e-05, + "loss": 1.7338, + "step": 23711 + }, + { + "epoch": 7.278084714548803, + "grad_norm": 0.23528912663459778, + "learning_rate": 1.82071490469526e-05, + "loss": 1.8086, + "step": 23712 + }, + { + "epoch": 7.278391651319828, + "grad_norm": 0.18231599032878876, + "learning_rate": 1.8203312891787737e-05, + "loss": 1.744, + "step": 23713 + }, + { + "epoch": 7.278698588090854, + "grad_norm": 0.2208651602268219, + "learning_rate": 1.8199477050856374e-05, + "loss": 1.7592, + "step": 23714 + }, + { + "epoch": 7.279005524861878, + "grad_norm": 0.22329792380332947, + "learning_rate": 1.8195641524196417e-05, + "loss": 1.7242, + "step": 23715 + }, + { + "epoch": 7.2793124616329035, + "grad_norm": 0.17745757102966309, + "learning_rate": 1.8191806311845778e-05, + "loss": 1.7162, + "step": 23716 + }, + { + "epoch": 7.279619398403929, + "grad_norm": 0.19536735117435455, + "learning_rate": 1.8187971413842324e-05, + "loss": 1.6814, + "step": 23717 + }, + { + "epoch": 7.279926335174954, + "grad_norm": 0.21853455901145935, + "learning_rate": 1.8184136830224025e-05, + "loss": 1.7049, + "step": 23718 + }, + { + "epoch": 
7.2802332719459795, + "grad_norm": 0.1701575070619583, + "learning_rate": 1.8180302561028696e-05, + "loss": 1.6879, + "step": 23719 + }, + { + "epoch": 7.280540208717004, + "grad_norm": 0.18729525804519653, + "learning_rate": 1.8176468606294288e-05, + "loss": 1.6944, + "step": 23720 + }, + { + "epoch": 7.280847145488029, + "grad_norm": 0.20020832121372223, + "learning_rate": 1.8172634966058667e-05, + "loss": 1.7415, + "step": 23721 + }, + { + "epoch": 7.281154082259055, + "grad_norm": 0.1983461081981659, + "learning_rate": 1.8168801640359724e-05, + "loss": 1.7198, + "step": 23722 + }, + { + "epoch": 7.28146101903008, + "grad_norm": 0.17578791081905365, + "learning_rate": 1.8164968629235334e-05, + "loss": 1.7155, + "step": 23723 + }, + { + "epoch": 7.281767955801105, + "grad_norm": 0.1944401115179062, + "learning_rate": 1.8161135932723388e-05, + "loss": 1.7579, + "step": 23724 + }, + { + "epoch": 7.28207489257213, + "grad_norm": 0.20413067936897278, + "learning_rate": 1.8157303550861753e-05, + "loss": 1.7105, + "step": 23725 + }, + { + "epoch": 7.282381829343155, + "grad_norm": 0.17515964806079865, + "learning_rate": 1.8153471483688318e-05, + "loss": 1.7448, + "step": 23726 + }, + { + "epoch": 7.28268876611418, + "grad_norm": 0.2039034515619278, + "learning_rate": 1.8149639731240938e-05, + "loss": 1.691, + "step": 23727 + }, + { + "epoch": 7.282995702885206, + "grad_norm": 0.2136354148387909, + "learning_rate": 1.8145808293557483e-05, + "loss": 1.656, + "step": 23728 + }, + { + "epoch": 7.283302639656231, + "grad_norm": 0.23029537498950958, + "learning_rate": 1.814197717067582e-05, + "loss": 1.7588, + "step": 23729 + }, + { + "epoch": 7.283609576427256, + "grad_norm": 0.371910035610199, + "learning_rate": 1.8138146362633816e-05, + "loss": 1.8138, + "step": 23730 + }, + { + "epoch": 7.283916513198281, + "grad_norm": 0.2273472249507904, + "learning_rate": 1.8134315869469327e-05, + "loss": 1.6985, + "step": 23731 + }, + { + "epoch": 7.284223449969306, + "grad_norm": 
0.33206698298454285, + "learning_rate": 1.81304856912202e-05, + "loss": 1.7015, + "step": 23732 + }, + { + "epoch": 7.2845303867403315, + "grad_norm": 0.20799405872821808, + "learning_rate": 1.8126655827924295e-05, + "loss": 1.6932, + "step": 23733 + }, + { + "epoch": 7.284837323511357, + "grad_norm": 0.28721246123313904, + "learning_rate": 1.8122826279619437e-05, + "loss": 1.7726, + "step": 23734 + }, + { + "epoch": 7.285144260282382, + "grad_norm": 0.2365201711654663, + "learning_rate": 1.8118997046343533e-05, + "loss": 1.7609, + "step": 23735 + }, + { + "epoch": 7.285451197053407, + "grad_norm": 0.24772630631923676, + "learning_rate": 1.811516812813435e-05, + "loss": 1.7057, + "step": 23736 + }, + { + "epoch": 7.285758133824432, + "grad_norm": 0.19344007968902588, + "learning_rate": 1.8111339525029802e-05, + "loss": 1.7526, + "step": 23737 + }, + { + "epoch": 7.286065070595457, + "grad_norm": 0.2454877346754074, + "learning_rate": 1.8107511237067648e-05, + "loss": 1.6474, + "step": 23738 + }, + { + "epoch": 7.286372007366483, + "grad_norm": 0.18084865808486938, + "learning_rate": 1.810368326428578e-05, + "loss": 1.7381, + "step": 23739 + }, + { + "epoch": 7.286678944137508, + "grad_norm": 0.26264744997024536, + "learning_rate": 1.8099855606722012e-05, + "loss": 1.6585, + "step": 23740 + }, + { + "epoch": 7.286985880908533, + "grad_norm": 0.20219333469867706, + "learning_rate": 1.809602826441416e-05, + "loss": 1.7552, + "step": 23741 + }, + { + "epoch": 7.287292817679558, + "grad_norm": 0.23982326686382294, + "learning_rate": 1.8092201237400064e-05, + "loss": 1.6784, + "step": 23742 + }, + { + "epoch": 7.287599754450583, + "grad_norm": 0.22838538885116577, + "learning_rate": 1.8088374525717534e-05, + "loss": 1.6976, + "step": 23743 + }, + { + "epoch": 7.287906691221608, + "grad_norm": 0.22077307105064392, + "learning_rate": 1.8084548129404395e-05, + "loss": 1.721, + "step": 23744 + }, + { + "epoch": 7.288213627992634, + "grad_norm": 0.19811047613620758, + 
"learning_rate": 1.8080722048498448e-05, + "loss": 1.7317, + "step": 23745 + }, + { + "epoch": 7.288520564763659, + "grad_norm": 0.25160667300224304, + "learning_rate": 1.8076896283037525e-05, + "loss": 1.7725, + "step": 23746 + }, + { + "epoch": 7.2888275015346835, + "grad_norm": 0.19819392263889313, + "learning_rate": 1.807307083305942e-05, + "loss": 1.7243, + "step": 23747 + }, + { + "epoch": 7.289134438305709, + "grad_norm": 0.21769097447395325, + "learning_rate": 1.806924569860194e-05, + "loss": 1.74, + "step": 23748 + }, + { + "epoch": 7.289441375076734, + "grad_norm": 0.23126530647277832, + "learning_rate": 1.806542087970289e-05, + "loss": 1.7479, + "step": 23749 + }, + { + "epoch": 7.2897483118477595, + "grad_norm": 0.21002748608589172, + "learning_rate": 1.8061596376400065e-05, + "loss": 1.6547, + "step": 23750 + }, + { + "epoch": 7.290055248618785, + "grad_norm": 0.242569699883461, + "learning_rate": 1.8057772188731255e-05, + "loss": 1.7587, + "step": 23751 + }, + { + "epoch": 7.290362185389809, + "grad_norm": 0.19619157910346985, + "learning_rate": 1.8053948316734287e-05, + "loss": 1.6619, + "step": 23752 + }, + { + "epoch": 7.290669122160835, + "grad_norm": 0.2086232304573059, + "learning_rate": 1.8050124760446896e-05, + "loss": 1.6535, + "step": 23753 + }, + { + "epoch": 7.29097605893186, + "grad_norm": 0.1955464631319046, + "learning_rate": 1.8046301519906932e-05, + "loss": 1.6814, + "step": 23754 + }, + { + "epoch": 7.291282995702885, + "grad_norm": 0.20373155176639557, + "learning_rate": 1.8042478595152117e-05, + "loss": 1.7006, + "step": 23755 + }, + { + "epoch": 7.291589932473911, + "grad_norm": 0.20233015716075897, + "learning_rate": 1.8038655986220272e-05, + "loss": 1.7478, + "step": 23756 + }, + { + "epoch": 7.291896869244935, + "grad_norm": 0.18800894916057587, + "learning_rate": 1.803483369314916e-05, + "loss": 1.747, + "step": 23757 + }, + { + "epoch": 7.29220380601596, + "grad_norm": 0.1838926076889038, + "learning_rate": 
1.8031011715976558e-05, + "loss": 1.7086, + "step": 23758 + }, + { + "epoch": 7.292510742786986, + "grad_norm": 0.1806635707616806, + "learning_rate": 1.8027190054740234e-05, + "loss": 1.6682, + "step": 23759 + }, + { + "epoch": 7.292817679558011, + "grad_norm": 0.19762687385082245, + "learning_rate": 1.802336870947796e-05, + "loss": 1.7514, + "step": 23760 + }, + { + "epoch": 7.293124616329036, + "grad_norm": 0.1739082932472229, + "learning_rate": 1.80195476802275e-05, + "loss": 1.7031, + "step": 23761 + }, + { + "epoch": 7.293431553100062, + "grad_norm": 0.18887469172477722, + "learning_rate": 1.8015726967026615e-05, + "loss": 1.7199, + "step": 23762 + }, + { + "epoch": 7.293738489871086, + "grad_norm": 0.17344269156455994, + "learning_rate": 1.8011906569913056e-05, + "loss": 1.693, + "step": 23763 + }, + { + "epoch": 7.2940454266421115, + "grad_norm": 0.16480129957199097, + "learning_rate": 1.800808648892459e-05, + "loss": 1.722, + "step": 23764 + }, + { + "epoch": 7.294352363413137, + "grad_norm": 0.17336638271808624, + "learning_rate": 1.8004266724098963e-05, + "loss": 1.6635, + "step": 23765 + }, + { + "epoch": 7.294659300184162, + "grad_norm": 0.16539151966571808, + "learning_rate": 1.8000447275473925e-05, + "loss": 1.7709, + "step": 23766 + }, + { + "epoch": 7.2949662369551875, + "grad_norm": 0.20660065114498138, + "learning_rate": 1.7996628143087226e-05, + "loss": 1.7262, + "step": 23767 + }, + { + "epoch": 7.295273173726212, + "grad_norm": 0.2292039543390274, + "learning_rate": 1.7992809326976584e-05, + "loss": 1.7444, + "step": 23768 + }, + { + "epoch": 7.295580110497237, + "grad_norm": 0.20323103666305542, + "learning_rate": 1.7988990827179795e-05, + "loss": 1.7456, + "step": 23769 + }, + { + "epoch": 7.295887047268263, + "grad_norm": 0.16919885575771332, + "learning_rate": 1.7985172643734532e-05, + "loss": 1.7304, + "step": 23770 + }, + { + "epoch": 7.296193984039288, + "grad_norm": 0.19135236740112305, + "learning_rate": 1.798135477667859e-05, + 
"loss": 1.7067, + "step": 23771 + }, + { + "epoch": 7.296500920810313, + "grad_norm": 0.19812993705272675, + "learning_rate": 1.7977537226049627e-05, + "loss": 1.7701, + "step": 23772 + }, + { + "epoch": 7.296807857581339, + "grad_norm": 0.22823916375637054, + "learning_rate": 1.797371999188543e-05, + "loss": 1.737, + "step": 23773 + }, + { + "epoch": 7.297114794352363, + "grad_norm": 0.1862197369337082, + "learning_rate": 1.7969903074223705e-05, + "loss": 1.675, + "step": 23774 + }, + { + "epoch": 7.297421731123388, + "grad_norm": 0.18780425190925598, + "learning_rate": 1.7966086473102168e-05, + "loss": 1.7237, + "step": 23775 + }, + { + "epoch": 7.297728667894414, + "grad_norm": 0.174093559384346, + "learning_rate": 1.7962270188558543e-05, + "loss": 1.7129, + "step": 23776 + }, + { + "epoch": 7.298035604665439, + "grad_norm": 0.22659943997859955, + "learning_rate": 1.7958454220630543e-05, + "loss": 1.7257, + "step": 23777 + }, + { + "epoch": 7.298342541436464, + "grad_norm": 0.18077917397022247, + "learning_rate": 1.7954638569355875e-05, + "loss": 1.6972, + "step": 23778 + }, + { + "epoch": 7.298649478207489, + "grad_norm": 0.18380658328533173, + "learning_rate": 1.795082323477225e-05, + "loss": 1.6577, + "step": 23779 + }, + { + "epoch": 7.298956414978514, + "grad_norm": 0.17016704380512238, + "learning_rate": 1.7947008216917384e-05, + "loss": 1.7222, + "step": 23780 + }, + { + "epoch": 7.2992633517495396, + "grad_norm": 0.2016153484582901, + "learning_rate": 1.794319351582896e-05, + "loss": 1.6833, + "step": 23781 + }, + { + "epoch": 7.299570288520565, + "grad_norm": 0.26723918318748474, + "learning_rate": 1.7939379131544687e-05, + "loss": 1.7417, + "step": 23782 + }, + { + "epoch": 7.29987722529159, + "grad_norm": 0.2555576264858246, + "learning_rate": 1.7935565064102267e-05, + "loss": 1.7373, + "step": 23783 + }, + { + "epoch": 7.300184162062616, + "grad_norm": 0.2036418914794922, + "learning_rate": 1.793175131353938e-05, + "loss": 1.7052, + "step": 23784 + 
}, + { + "epoch": 7.30049109883364, + "grad_norm": 0.1789570152759552, + "learning_rate": 1.792793787989371e-05, + "loss": 1.6327, + "step": 23785 + }, + { + "epoch": 7.300798035604665, + "grad_norm": 0.2353249490261078, + "learning_rate": 1.7924124763202987e-05, + "loss": 1.7771, + "step": 23786 + }, + { + "epoch": 7.301104972375691, + "grad_norm": 0.19072949886322021, + "learning_rate": 1.792031196350483e-05, + "loss": 1.7095, + "step": 23787 + }, + { + "epoch": 7.301411909146716, + "grad_norm": 0.24063248932361603, + "learning_rate": 1.791649948083699e-05, + "loss": 1.7247, + "step": 23788 + }, + { + "epoch": 7.301718845917741, + "grad_norm": 0.1916036456823349, + "learning_rate": 1.791268731523707e-05, + "loss": 1.6844, + "step": 23789 + }, + { + "epoch": 7.302025782688766, + "grad_norm": 0.2606290876865387, + "learning_rate": 1.7908875466742797e-05, + "loss": 1.771, + "step": 23790 + }, + { + "epoch": 7.302332719459791, + "grad_norm": 0.23444804549217224, + "learning_rate": 1.7905063935391824e-05, + "loss": 1.747, + "step": 23791 + }, + { + "epoch": 7.3026396562308165, + "grad_norm": 0.28058725595474243, + "learning_rate": 1.7901252721221822e-05, + "loss": 1.7284, + "step": 23792 + }, + { + "epoch": 7.302946593001842, + "grad_norm": 0.23268578946590424, + "learning_rate": 1.7897441824270456e-05, + "loss": 1.7222, + "step": 23793 + }, + { + "epoch": 7.303253529772867, + "grad_norm": 0.275336354970932, + "learning_rate": 1.789363124457539e-05, + "loss": 1.7495, + "step": 23794 + }, + { + "epoch": 7.303560466543892, + "grad_norm": 0.21838977932929993, + "learning_rate": 1.788982098217427e-05, + "loss": 1.725, + "step": 23795 + }, + { + "epoch": 7.303867403314917, + "grad_norm": 0.24108058214187622, + "learning_rate": 1.7886011037104767e-05, + "loss": 1.7804, + "step": 23796 + }, + { + "epoch": 7.304174340085942, + "grad_norm": 0.23003144562244415, + "learning_rate": 1.788220140940452e-05, + "loss": 1.8189, + "step": 23797 + }, + { + "epoch": 7.304481276856968, + 
"grad_norm": 0.20129653811454773, + "learning_rate": 1.7878392099111186e-05, + "loss": 1.6603, + "step": 23798 + }, + { + "epoch": 7.304788213627993, + "grad_norm": 0.26172930002212524, + "learning_rate": 1.7874583106262404e-05, + "loss": 1.7095, + "step": 23799 + }, + { + "epoch": 7.305095150399017, + "grad_norm": 0.212156742811203, + "learning_rate": 1.7870774430895825e-05, + "loss": 1.7272, + "step": 23800 + }, + { + "epoch": 7.305402087170043, + "grad_norm": 0.2775247097015381, + "learning_rate": 1.7866966073049084e-05, + "loss": 1.773, + "step": 23801 + }, + { + "epoch": 7.305709023941068, + "grad_norm": 0.23456308245658875, + "learning_rate": 1.7863158032759803e-05, + "loss": 1.7173, + "step": 23802 + }, + { + "epoch": 7.306015960712093, + "grad_norm": 0.23986588418483734, + "learning_rate": 1.785935031006566e-05, + "loss": 1.6924, + "step": 23803 + }, + { + "epoch": 7.306322897483119, + "grad_norm": 0.1909915804862976, + "learning_rate": 1.7855542905004225e-05, + "loss": 1.7047, + "step": 23804 + }, + { + "epoch": 7.306629834254144, + "grad_norm": 0.20676325261592865, + "learning_rate": 1.7851735817613192e-05, + "loss": 1.6606, + "step": 23805 + }, + { + "epoch": 7.3069367710251685, + "grad_norm": 0.1910121887922287, + "learning_rate": 1.7847929047930106e-05, + "loss": 1.7555, + "step": 23806 + }, + { + "epoch": 7.307243707796194, + "grad_norm": 0.22737936675548553, + "learning_rate": 1.784412259599265e-05, + "loss": 1.7346, + "step": 23807 + }, + { + "epoch": 7.307550644567219, + "grad_norm": 0.1553424894809723, + "learning_rate": 1.7840316461838426e-05, + "loss": 1.6755, + "step": 23808 + }, + { + "epoch": 7.3078575813382445, + "grad_norm": 0.17937089502811432, + "learning_rate": 1.7836510645505044e-05, + "loss": 1.684, + "step": 23809 + }, + { + "epoch": 7.30816451810927, + "grad_norm": 0.20183639228343964, + "learning_rate": 1.783270514703011e-05, + "loss": 1.7617, + "step": 23810 + }, + { + "epoch": 7.308471454880294, + "grad_norm": 0.21359068155288696, 
+ "learning_rate": 1.782889996645124e-05, + "loss": 1.6897, + "step": 23811 + }, + { + "epoch": 7.30877839165132, + "grad_norm": 0.19640007615089417, + "learning_rate": 1.782509510380604e-05, + "loss": 1.7029, + "step": 23812 + }, + { + "epoch": 7.309085328422345, + "grad_norm": 0.22678261995315552, + "learning_rate": 1.7821290559132104e-05, + "loss": 1.7241, + "step": 23813 + }, + { + "epoch": 7.30939226519337, + "grad_norm": 0.1797642707824707, + "learning_rate": 1.7817486332467037e-05, + "loss": 1.7127, + "step": 23814 + }, + { + "epoch": 7.309699201964396, + "grad_norm": 0.18758134543895721, + "learning_rate": 1.7813682423848432e-05, + "loss": 1.7394, + "step": 23815 + }, + { + "epoch": 7.310006138735421, + "grad_norm": 0.2064354121685028, + "learning_rate": 1.7809878833313887e-05, + "loss": 1.7477, + "step": 23816 + }, + { + "epoch": 7.310313075506445, + "grad_norm": 0.30564701557159424, + "learning_rate": 1.780607556090098e-05, + "loss": 1.7006, + "step": 23817 + }, + { + "epoch": 7.310620012277471, + "grad_norm": 0.23694200813770294, + "learning_rate": 1.7802272606647308e-05, + "loss": 1.7821, + "step": 23818 + }, + { + "epoch": 7.310926949048496, + "grad_norm": 0.20436422526836395, + "learning_rate": 1.779846997059043e-05, + "loss": 1.6681, + "step": 23819 + }, + { + "epoch": 7.311233885819521, + "grad_norm": 0.21899428963661194, + "learning_rate": 1.779466765276798e-05, + "loss": 1.7416, + "step": 23820 + }, + { + "epoch": 7.311540822590547, + "grad_norm": 0.24186378717422485, + "learning_rate": 1.779086565321747e-05, + "loss": 1.7258, + "step": 23821 + }, + { + "epoch": 7.311847759361571, + "grad_norm": 0.22940407693386078, + "learning_rate": 1.778706397197653e-05, + "loss": 1.7211, + "step": 23822 + }, + { + "epoch": 7.3121546961325965, + "grad_norm": 0.18643233180046082, + "learning_rate": 1.778326260908268e-05, + "loss": 1.6778, + "step": 23823 + }, + { + "epoch": 7.312461632903622, + "grad_norm": 0.25372037291526794, + "learning_rate": 
1.7779461564573526e-05, + "loss": 1.7252, + "step": 23824 + }, + { + "epoch": 7.312768569674647, + "grad_norm": 0.21126380562782288, + "learning_rate": 1.7775660838486612e-05, + "loss": 1.6655, + "step": 23825 + }, + { + "epoch": 7.3130755064456725, + "grad_norm": 0.19614748656749725, + "learning_rate": 1.777186043085951e-05, + "loss": 1.7223, + "step": 23826 + }, + { + "epoch": 7.313382443216697, + "grad_norm": 0.2111951857805252, + "learning_rate": 1.7768060341729768e-05, + "loss": 1.708, + "step": 23827 + }, + { + "epoch": 7.313689379987722, + "grad_norm": 0.2675856053829193, + "learning_rate": 1.7764260571134956e-05, + "loss": 1.7387, + "step": 23828 + }, + { + "epoch": 7.313996316758748, + "grad_norm": 0.19827900826931, + "learning_rate": 1.7760461119112603e-05, + "loss": 1.6809, + "step": 23829 + }, + { + "epoch": 7.314303253529773, + "grad_norm": 0.24213160574436188, + "learning_rate": 1.775666198570028e-05, + "loss": 1.7064, + "step": 23830 + }, + { + "epoch": 7.314610190300798, + "grad_norm": 0.20035916566848755, + "learning_rate": 1.7752863170935514e-05, + "loss": 1.6874, + "step": 23831 + }, + { + "epoch": 7.314917127071823, + "grad_norm": 0.23662878572940826, + "learning_rate": 1.774906467485586e-05, + "loss": 1.7651, + "step": 23832 + }, + { + "epoch": 7.315224063842848, + "grad_norm": 0.18523871898651123, + "learning_rate": 1.7745266497498847e-05, + "loss": 1.7003, + "step": 23833 + }, + { + "epoch": 7.315531000613873, + "grad_norm": 0.21452756226062775, + "learning_rate": 1.7741468638902016e-05, + "loss": 1.7012, + "step": 23834 + }, + { + "epoch": 7.315837937384899, + "grad_norm": 0.17513468861579895, + "learning_rate": 1.7737671099102904e-05, + "loss": 1.6965, + "step": 23835 + }, + { + "epoch": 7.316144874155924, + "grad_norm": 0.29025998711586, + "learning_rate": 1.7733873878139012e-05, + "loss": 1.7347, + "step": 23836 + }, + { + "epoch": 7.316451810926949, + "grad_norm": 0.14812500774860382, + "learning_rate": 1.7730076976047926e-05, + "loss": 
1.6469, + "step": 23837 + }, + { + "epoch": 7.316758747697974, + "grad_norm": 0.23575027287006378, + "learning_rate": 1.77262803928671e-05, + "loss": 1.7267, + "step": 23838 + }, + { + "epoch": 7.317065684468999, + "grad_norm": 0.17986448109149933, + "learning_rate": 1.7722484128634125e-05, + "loss": 1.7206, + "step": 23839 + }, + { + "epoch": 7.3173726212400245, + "grad_norm": 0.22515927255153656, + "learning_rate": 1.7718688183386446e-05, + "loss": 1.7216, + "step": 23840 + }, + { + "epoch": 7.31767955801105, + "grad_norm": 0.1903398036956787, + "learning_rate": 1.7714892557161624e-05, + "loss": 1.7108, + "step": 23841 + }, + { + "epoch": 7.317986494782075, + "grad_norm": 0.23623183369636536, + "learning_rate": 1.7711097249997162e-05, + "loss": 1.6866, + "step": 23842 + }, + { + "epoch": 7.3182934315531, + "grad_norm": 0.18501855432987213, + "learning_rate": 1.7707302261930554e-05, + "loss": 1.6643, + "step": 23843 + }, + { + "epoch": 7.318600368324125, + "grad_norm": 0.21865275502204895, + "learning_rate": 1.770350759299932e-05, + "loss": 1.6932, + "step": 23844 + }, + { + "epoch": 7.31890730509515, + "grad_norm": 0.22363261878490448, + "learning_rate": 1.7699713243240945e-05, + "loss": 1.721, + "step": 23845 + }, + { + "epoch": 7.319214241866176, + "grad_norm": 0.25587835907936096, + "learning_rate": 1.769591921269294e-05, + "loss": 1.7375, + "step": 23846 + }, + { + "epoch": 7.319521178637201, + "grad_norm": 0.22086483240127563, + "learning_rate": 1.76921255013928e-05, + "loss": 1.6957, + "step": 23847 + }, + { + "epoch": 7.319828115408226, + "grad_norm": 0.21197499334812164, + "learning_rate": 1.7688332109378007e-05, + "loss": 1.6993, + "step": 23848 + }, + { + "epoch": 7.320135052179251, + "grad_norm": 0.21211451292037964, + "learning_rate": 1.7684539036686054e-05, + "loss": 1.7329, + "step": 23849 + }, + { + "epoch": 7.320441988950276, + "grad_norm": 0.16938872635364532, + "learning_rate": 1.7680746283354433e-05, + "loss": 1.6895, + "step": 23850 + }, + { + 
"epoch": 7.320748925721301, + "grad_norm": 0.21465681493282318, + "learning_rate": 1.7676953849420613e-05, + "loss": 1.7156, + "step": 23851 + }, + { + "epoch": 7.321055862492327, + "grad_norm": 0.16188180446624756, + "learning_rate": 1.7673161734922084e-05, + "loss": 1.6307, + "step": 23852 + }, + { + "epoch": 7.321362799263352, + "grad_norm": 0.2152155190706253, + "learning_rate": 1.7669369939896302e-05, + "loss": 1.7135, + "step": 23853 + }, + { + "epoch": 7.3216697360343765, + "grad_norm": 0.15789814293384552, + "learning_rate": 1.7665578464380788e-05, + "loss": 1.7269, + "step": 23854 + }, + { + "epoch": 7.321976672805402, + "grad_norm": 0.17263127863407135, + "learning_rate": 1.7661787308412948e-05, + "loss": 1.6624, + "step": 23855 + }, + { + "epoch": 7.322283609576427, + "grad_norm": 0.19711650907993317, + "learning_rate": 1.7657996472030308e-05, + "loss": 1.7837, + "step": 23856 + }, + { + "epoch": 7.3225905463474525, + "grad_norm": 0.1847725212574005, + "learning_rate": 1.765420595527027e-05, + "loss": 1.707, + "step": 23857 + }, + { + "epoch": 7.322897483118478, + "grad_norm": 0.21316368877887726, + "learning_rate": 1.7650415758170345e-05, + "loss": 1.715, + "step": 23858 + }, + { + "epoch": 7.323204419889503, + "grad_norm": 0.1912030428647995, + "learning_rate": 1.7646625880767976e-05, + "loss": 1.7465, + "step": 23859 + }, + { + "epoch": 7.323511356660528, + "grad_norm": 0.16245616972446442, + "learning_rate": 1.7642836323100614e-05, + "loss": 1.7365, + "step": 23860 + }, + { + "epoch": 7.323818293431553, + "grad_norm": 0.20665429532527924, + "learning_rate": 1.76390470852057e-05, + "loss": 1.7435, + "step": 23861 + }, + { + "epoch": 7.324125230202578, + "grad_norm": 0.17079970240592957, + "learning_rate": 1.76352581671207e-05, + "loss": 1.7094, + "step": 23862 + }, + { + "epoch": 7.324432166973604, + "grad_norm": 0.17388395965099335, + "learning_rate": 1.7631469568883042e-05, + "loss": 1.7275, + "step": 23863 + }, + { + "epoch": 7.324739103744629, + 
"grad_norm": 0.20209765434265137, + "learning_rate": 1.7627681290530175e-05, + "loss": 1.7755, + "step": 23864 + }, + { + "epoch": 7.3250460405156534, + "grad_norm": 0.16459977626800537, + "learning_rate": 1.7623893332099538e-05, + "loss": 1.6765, + "step": 23865 + }, + { + "epoch": 7.325352977286679, + "grad_norm": 0.18313255906105042, + "learning_rate": 1.7620105693628556e-05, + "loss": 1.6792, + "step": 23866 + }, + { + "epoch": 7.325659914057704, + "grad_norm": 0.1651672124862671, + "learning_rate": 1.761631837515468e-05, + "loss": 1.6999, + "step": 23867 + }, + { + "epoch": 7.3259668508287294, + "grad_norm": 0.17414255440235138, + "learning_rate": 1.7612531376715317e-05, + "loss": 1.69, + "step": 23868 + }, + { + "epoch": 7.326273787599755, + "grad_norm": 0.1824718415737152, + "learning_rate": 1.7608744698347908e-05, + "loss": 1.6822, + "step": 23869 + }, + { + "epoch": 7.326580724370779, + "grad_norm": 0.19557121396064758, + "learning_rate": 1.760495834008986e-05, + "loss": 1.6852, + "step": 23870 + }, + { + "epoch": 7.326887661141805, + "grad_norm": 0.17803436517715454, + "learning_rate": 1.7601172301978606e-05, + "loss": 1.7523, + "step": 23871 + }, + { + "epoch": 7.32719459791283, + "grad_norm": 0.24077050387859344, + "learning_rate": 1.7597386584051545e-05, + "loss": 1.8044, + "step": 23872 + }, + { + "epoch": 7.327501534683855, + "grad_norm": 0.20061948895454407, + "learning_rate": 1.7593601186346127e-05, + "loss": 1.7298, + "step": 23873 + }, + { + "epoch": 7.327808471454881, + "grad_norm": 0.17362944781780243, + "learning_rate": 1.758981610889971e-05, + "loss": 1.7116, + "step": 23874 + }, + { + "epoch": 7.328115408225905, + "grad_norm": 0.20858663320541382, + "learning_rate": 1.758603135174974e-05, + "loss": 1.6765, + "step": 23875 + }, + { + "epoch": 7.32842234499693, + "grad_norm": 0.1805036962032318, + "learning_rate": 1.7582246914933604e-05, + "loss": 1.694, + "step": 23876 + }, + { + "epoch": 7.328729281767956, + "grad_norm": 0.26010429859161377, 
+ "learning_rate": 1.7578462798488704e-05, + "loss": 1.7373, + "step": 23877 + }, + { + "epoch": 7.329036218538981, + "grad_norm": 0.19902443885803223, + "learning_rate": 1.7574679002452444e-05, + "loss": 1.72, + "step": 23878 + }, + { + "epoch": 7.329343155310006, + "grad_norm": 0.21231114864349365, + "learning_rate": 1.7570895526862202e-05, + "loss": 1.7526, + "step": 23879 + }, + { + "epoch": 7.329650092081032, + "grad_norm": 0.2075740098953247, + "learning_rate": 1.7567112371755384e-05, + "loss": 1.773, + "step": 23880 + }, + { + "epoch": 7.329957028852056, + "grad_norm": 0.21381771564483643, + "learning_rate": 1.756332953716937e-05, + "loss": 1.733, + "step": 23881 + }, + { + "epoch": 7.3302639656230815, + "grad_norm": 0.21689461171627045, + "learning_rate": 1.755954702314155e-05, + "loss": 1.7234, + "step": 23882 + }, + { + "epoch": 7.330570902394107, + "grad_norm": 0.21094383299350739, + "learning_rate": 1.755576482970929e-05, + "loss": 1.7074, + "step": 23883 + }, + { + "epoch": 7.330877839165132, + "grad_norm": 0.18460774421691895, + "learning_rate": 1.7551982956909985e-05, + "loss": 1.6706, + "step": 23884 + }, + { + "epoch": 7.3311847759361575, + "grad_norm": 0.18868015706539154, + "learning_rate": 1.7548201404781e-05, + "loss": 1.6371, + "step": 23885 + }, + { + "epoch": 7.331491712707182, + "grad_norm": 0.18036094307899475, + "learning_rate": 1.7544420173359715e-05, + "loss": 1.7115, + "step": 23886 + }, + { + "epoch": 7.331798649478207, + "grad_norm": 0.17143553495407104, + "learning_rate": 1.754063926268349e-05, + "loss": 1.668, + "step": 23887 + }, + { + "epoch": 7.332105586249233, + "grad_norm": 0.1700706034898758, + "learning_rate": 1.7536858672789684e-05, + "loss": 1.7244, + "step": 23888 + }, + { + "epoch": 7.332412523020258, + "grad_norm": 0.1740385890007019, + "learning_rate": 1.7533078403715665e-05, + "loss": 1.7163, + "step": 23889 + }, + { + "epoch": 7.332719459791283, + "grad_norm": 0.206922248005867, + "learning_rate": 
1.752929845549882e-05, + "loss": 1.7572, + "step": 23890 + }, + { + "epoch": 7.333026396562309, + "grad_norm": 0.22770223021507263, + "learning_rate": 1.7525518828176445e-05, + "loss": 1.7391, + "step": 23891 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.203486829996109, + "learning_rate": 1.7521739521785962e-05, + "loss": 1.7664, + "step": 23892 + }, + { + "epoch": 7.333640270104358, + "grad_norm": 0.15539827942848206, + "learning_rate": 1.7517960536364652e-05, + "loss": 1.675, + "step": 23893 + }, + { + "epoch": 7.333947206875384, + "grad_norm": 0.18226636946201324, + "learning_rate": 1.7514181871949913e-05, + "loss": 1.7097, + "step": 23894 + }, + { + "epoch": 7.334254143646409, + "grad_norm": 0.1522573083639145, + "learning_rate": 1.751040352857907e-05, + "loss": 1.6783, + "step": 23895 + }, + { + "epoch": 7.334561080417434, + "grad_norm": 0.18082024157047272, + "learning_rate": 1.750662550628946e-05, + "loss": 1.752, + "step": 23896 + }, + { + "epoch": 7.334868017188459, + "grad_norm": 0.1968161165714264, + "learning_rate": 1.750284780511844e-05, + "loss": 1.7773, + "step": 23897 + }, + { + "epoch": 7.335174953959484, + "grad_norm": 0.17520470917224884, + "learning_rate": 1.7499070425103286e-05, + "loss": 1.7244, + "step": 23898 + }, + { + "epoch": 7.3354818907305095, + "grad_norm": 0.32224342226982117, + "learning_rate": 1.749529336628139e-05, + "loss": 1.8087, + "step": 23899 + }, + { + "epoch": 7.335788827501535, + "grad_norm": 0.25473707914352417, + "learning_rate": 1.7491516628690053e-05, + "loss": 1.7677, + "step": 23900 + }, + { + "epoch": 7.33609576427256, + "grad_norm": 0.20730654895305634, + "learning_rate": 1.7487740212366604e-05, + "loss": 1.7261, + "step": 23901 + }, + { + "epoch": 7.336402701043585, + "grad_norm": 0.22070205211639404, + "learning_rate": 1.748396411734836e-05, + "loss": 1.8024, + "step": 23902 + }, + { + "epoch": 7.33670963781461, + "grad_norm": 0.16921460628509521, + "learning_rate": 1.7480188343672647e-05, + "loss": 
1.6823, + "step": 23903 + }, + { + "epoch": 7.337016574585635, + "grad_norm": 0.16576658189296722, + "learning_rate": 1.747641289137677e-05, + "loss": 1.6563, + "step": 23904 + }, + { + "epoch": 7.337323511356661, + "grad_norm": 0.19541388750076294, + "learning_rate": 1.7472637760498046e-05, + "loss": 1.8023, + "step": 23905 + }, + { + "epoch": 7.337630448127686, + "grad_norm": 0.19848179817199707, + "learning_rate": 1.7468862951073754e-05, + "loss": 1.7395, + "step": 23906 + }, + { + "epoch": 7.337937384898711, + "grad_norm": 0.1627921313047409, + "learning_rate": 1.746508846314127e-05, + "loss": 1.6569, + "step": 23907 + }, + { + "epoch": 7.338244321669736, + "grad_norm": 0.1798046976327896, + "learning_rate": 1.7461314296737813e-05, + "loss": 1.6927, + "step": 23908 + }, + { + "epoch": 7.338551258440761, + "grad_norm": 0.17935742437839508, + "learning_rate": 1.7457540451900757e-05, + "loss": 1.701, + "step": 23909 + }, + { + "epoch": 7.338858195211786, + "grad_norm": 0.16761814057826996, + "learning_rate": 1.745376692866732e-05, + "loss": 1.6701, + "step": 23910 + }, + { + "epoch": 7.339165131982812, + "grad_norm": 0.1733570694923401, + "learning_rate": 1.7449993727074855e-05, + "loss": 1.705, + "step": 23911 + }, + { + "epoch": 7.339472068753837, + "grad_norm": 0.21162372827529907, + "learning_rate": 1.7446220847160626e-05, + "loss": 1.7703, + "step": 23912 + }, + { + "epoch": 7.3397790055248615, + "grad_norm": 0.18743988871574402, + "learning_rate": 1.7442448288961928e-05, + "loss": 1.6899, + "step": 23913 + }, + { + "epoch": 7.340085942295887, + "grad_norm": 0.19185546040534973, + "learning_rate": 1.743867605251605e-05, + "loss": 1.7483, + "step": 23914 + }, + { + "epoch": 7.340392879066912, + "grad_norm": 0.23066233098506927, + "learning_rate": 1.7434904137860232e-05, + "loss": 1.7564, + "step": 23915 + }, + { + "epoch": 7.3406998158379375, + "grad_norm": 0.18159757554531097, + "learning_rate": 1.743113254503179e-05, + "loss": 1.7136, + "step": 23916 + }, + 
{ + "epoch": 7.341006752608963, + "grad_norm": 0.22666020691394806, + "learning_rate": 1.7427361274067995e-05, + "loss": 1.7589, + "step": 23917 + }, + { + "epoch": 7.341313689379987, + "grad_norm": 0.18986108899116516, + "learning_rate": 1.74235903250061e-05, + "loss": 1.7429, + "step": 23918 + }, + { + "epoch": 7.341620626151013, + "grad_norm": 0.17987726628780365, + "learning_rate": 1.741981969788338e-05, + "loss": 1.7457, + "step": 23919 + }, + { + "epoch": 7.341927562922038, + "grad_norm": 0.2370992749929428, + "learning_rate": 1.7416049392737093e-05, + "loss": 1.7594, + "step": 23920 + }, + { + "epoch": 7.342234499693063, + "grad_norm": 0.18698690831661224, + "learning_rate": 1.7412279409604508e-05, + "loss": 1.7555, + "step": 23921 + }, + { + "epoch": 7.342541436464089, + "grad_norm": 0.18401117622852325, + "learning_rate": 1.7408509748522882e-05, + "loss": 1.7355, + "step": 23922 + }, + { + "epoch": 7.342848373235114, + "grad_norm": 0.22045543789863586, + "learning_rate": 1.7404740409529448e-05, + "loss": 1.7227, + "step": 23923 + }, + { + "epoch": 7.343155310006138, + "grad_norm": 0.24414709210395813, + "learning_rate": 1.7400971392661502e-05, + "loss": 1.7551, + "step": 23924 + }, + { + "epoch": 7.343462246777164, + "grad_norm": 0.1906892955303192, + "learning_rate": 1.739720269795623e-05, + "loss": 1.7204, + "step": 23925 + }, + { + "epoch": 7.343769183548189, + "grad_norm": 0.1840149164199829, + "learning_rate": 1.7393434325450948e-05, + "loss": 1.74, + "step": 23926 + }, + { + "epoch": 7.344076120319214, + "grad_norm": 0.21434549987316132, + "learning_rate": 1.7389666275182825e-05, + "loss": 1.6961, + "step": 23927 + }, + { + "epoch": 7.34438305709024, + "grad_norm": 0.19110503792762756, + "learning_rate": 1.7385898547189146e-05, + "loss": 1.7731, + "step": 23928 + }, + { + "epoch": 7.344689993861264, + "grad_norm": 0.18905460834503174, + "learning_rate": 1.7382131141507136e-05, + "loss": 1.6925, + "step": 23929 + }, + { + "epoch": 7.3449969306322895, 
+ "grad_norm": 0.16336308419704437, + "learning_rate": 1.7378364058174024e-05, + "loss": 1.7073, + "step": 23930 + }, + { + "epoch": 7.345303867403315, + "grad_norm": 0.16707782447338104, + "learning_rate": 1.7374597297227056e-05, + "loss": 1.7036, + "step": 23931 + }, + { + "epoch": 7.34561080417434, + "grad_norm": 0.19958938658237457, + "learning_rate": 1.7370830858703406e-05, + "loss": 1.7035, + "step": 23932 + }, + { + "epoch": 7.3459177409453655, + "grad_norm": 0.18446899950504303, + "learning_rate": 1.7367064742640348e-05, + "loss": 1.754, + "step": 23933 + }, + { + "epoch": 7.346224677716391, + "grad_norm": 0.19238999485969543, + "learning_rate": 1.736329894907508e-05, + "loss": 1.6903, + "step": 23934 + }, + { + "epoch": 7.346531614487415, + "grad_norm": 0.1985396146774292, + "learning_rate": 1.7359533478044825e-05, + "loss": 1.7342, + "step": 23935 + }, + { + "epoch": 7.346838551258441, + "grad_norm": 0.19200150668621063, + "learning_rate": 1.7355768329586784e-05, + "loss": 1.6915, + "step": 23936 + }, + { + "epoch": 7.347145488029466, + "grad_norm": 0.19772231578826904, + "learning_rate": 1.7352003503738186e-05, + "loss": 1.7341, + "step": 23937 + }, + { + "epoch": 7.347452424800491, + "grad_norm": 0.1961035579442978, + "learning_rate": 1.7348239000536214e-05, + "loss": 1.7395, + "step": 23938 + }, + { + "epoch": 7.347759361571517, + "grad_norm": 0.15188434720039368, + "learning_rate": 1.7344474820018087e-05, + "loss": 1.635, + "step": 23939 + }, + { + "epoch": 7.348066298342541, + "grad_norm": 0.18748410046100616, + "learning_rate": 1.734071096222098e-05, + "loss": 1.6878, + "step": 23940 + }, + { + "epoch": 7.348373235113566, + "grad_norm": 0.19337952136993408, + "learning_rate": 1.7336947427182143e-05, + "loss": 1.7532, + "step": 23941 + }, + { + "epoch": 7.348680171884592, + "grad_norm": 0.14804427325725555, + "learning_rate": 1.73331842149387e-05, + "loss": 1.683, + "step": 23942 + }, + { + "epoch": 7.348987108655617, + "grad_norm": 
0.18310968577861786, + "learning_rate": 1.7329421325527916e-05, + "loss": 1.718, + "step": 23943 + }, + { + "epoch": 7.349294045426642, + "grad_norm": 0.18589583039283752, + "learning_rate": 1.7325658758986906e-05, + "loss": 1.7115, + "step": 23944 + }, + { + "epoch": 7.349600982197667, + "grad_norm": 0.1618955284357071, + "learning_rate": 1.7321896515352904e-05, + "loss": 1.6757, + "step": 23945 + }, + { + "epoch": 7.349907918968692, + "grad_norm": 0.20092655718326569, + "learning_rate": 1.731813459466307e-05, + "loss": 1.7537, + "step": 23946 + }, + { + "epoch": 7.350214855739718, + "grad_norm": 0.17287038266658783, + "learning_rate": 1.7314372996954592e-05, + "loss": 1.6744, + "step": 23947 + }, + { + "epoch": 7.350521792510743, + "grad_norm": 0.19176220893859863, + "learning_rate": 1.731061172226465e-05, + "loss": 1.7279, + "step": 23948 + }, + { + "epoch": 7.350828729281768, + "grad_norm": 0.2060871571302414, + "learning_rate": 1.7306850770630367e-05, + "loss": 1.7802, + "step": 23949 + }, + { + "epoch": 7.351135666052793, + "grad_norm": 0.27185341715812683, + "learning_rate": 1.7303090142088967e-05, + "loss": 1.7234, + "step": 23950 + }, + { + "epoch": 7.351442602823818, + "grad_norm": 0.19845733046531677, + "learning_rate": 1.729932983667759e-05, + "loss": 1.7503, + "step": 23951 + }, + { + "epoch": 7.351749539594843, + "grad_norm": 0.19455648958683014, + "learning_rate": 1.729556985443341e-05, + "loss": 1.8096, + "step": 23952 + }, + { + "epoch": 7.352056476365869, + "grad_norm": 0.19090545177459717, + "learning_rate": 1.729181019539357e-05, + "loss": 1.6776, + "step": 23953 + }, + { + "epoch": 7.352363413136894, + "grad_norm": 0.16086700558662415, + "learning_rate": 1.728805085959524e-05, + "loss": 1.6829, + "step": 23954 + }, + { + "epoch": 7.352670349907919, + "grad_norm": 0.2156524360179901, + "learning_rate": 1.7284291847075555e-05, + "loss": 1.7147, + "step": 23955 + }, + { + "epoch": 7.352977286678944, + "grad_norm": 0.20258861780166626, + 
"learning_rate": 1.728053315787168e-05, + "loss": 1.7085, + "step": 23956 + }, + { + "epoch": 7.353284223449969, + "grad_norm": 0.1877330094575882, + "learning_rate": 1.7276774792020735e-05, + "loss": 1.7311, + "step": 23957 + }, + { + "epoch": 7.3535911602209945, + "grad_norm": 0.22096484899520874, + "learning_rate": 1.727301674955992e-05, + "loss": 1.6712, + "step": 23958 + }, + { + "epoch": 7.35389809699202, + "grad_norm": 0.21456706523895264, + "learning_rate": 1.726925903052629e-05, + "loss": 1.7773, + "step": 23959 + }, + { + "epoch": 7.354205033763045, + "grad_norm": 0.2114667296409607, + "learning_rate": 1.7265501634957072e-05, + "loss": 1.669, + "step": 23960 + }, + { + "epoch": 7.35451197053407, + "grad_norm": 0.1676410287618637, + "learning_rate": 1.726174456288931e-05, + "loss": 1.6673, + "step": 23961 + }, + { + "epoch": 7.354818907305095, + "grad_norm": 0.19883838295936584, + "learning_rate": 1.72579878143602e-05, + "loss": 1.6821, + "step": 23962 + }, + { + "epoch": 7.35512584407612, + "grad_norm": 0.19240599870681763, + "learning_rate": 1.725423138940684e-05, + "loss": 1.741, + "step": 23963 + }, + { + "epoch": 7.355432780847146, + "grad_norm": 0.230613574385643, + "learning_rate": 1.7250475288066363e-05, + "loss": 1.6937, + "step": 23964 + }, + { + "epoch": 7.355739717618171, + "grad_norm": 0.17126981914043427, + "learning_rate": 1.7246719510375898e-05, + "loss": 1.6791, + "step": 23965 + }, + { + "epoch": 7.356046654389196, + "grad_norm": 0.1852734386920929, + "learning_rate": 1.7242964056372518e-05, + "loss": 1.7196, + "step": 23966 + }, + { + "epoch": 7.356353591160221, + "grad_norm": 0.1922985464334488, + "learning_rate": 1.723920892609338e-05, + "loss": 1.794, + "step": 23967 + }, + { + "epoch": 7.356660527931246, + "grad_norm": 0.1918993592262268, + "learning_rate": 1.7235454119575582e-05, + "loss": 1.7725, + "step": 23968 + }, + { + "epoch": 7.356967464702271, + "grad_norm": 0.21787014603614807, + "learning_rate": 1.723169963685623e-05, + 
"loss": 1.7382, + "step": 23969 + }, + { + "epoch": 7.357274401473297, + "grad_norm": 0.23753544688224792, + "learning_rate": 1.722794547797243e-05, + "loss": 1.7924, + "step": 23970 + }, + { + "epoch": 7.357581338244322, + "grad_norm": 0.2251000851392746, + "learning_rate": 1.722419164296128e-05, + "loss": 1.6794, + "step": 23971 + }, + { + "epoch": 7.3578882750153465, + "grad_norm": 0.21573983132839203, + "learning_rate": 1.7220438131859878e-05, + "loss": 1.796, + "step": 23972 + }, + { + "epoch": 7.358195211786372, + "grad_norm": 0.217384472489357, + "learning_rate": 1.721668494470532e-05, + "loss": 1.7305, + "step": 23973 + }, + { + "epoch": 7.358502148557397, + "grad_norm": 0.21815331280231476, + "learning_rate": 1.7212932081534677e-05, + "loss": 1.7348, + "step": 23974 + }, + { + "epoch": 7.3588090853284225, + "grad_norm": 0.19974499940872192, + "learning_rate": 1.7209179542385097e-05, + "loss": 1.7383, + "step": 23975 + }, + { + "epoch": 7.359116022099448, + "grad_norm": 0.20518191158771515, + "learning_rate": 1.7205427327293582e-05, + "loss": 1.7087, + "step": 23976 + }, + { + "epoch": 7.359422958870473, + "grad_norm": 0.17104744911193848, + "learning_rate": 1.7201675436297293e-05, + "loss": 1.718, + "step": 23977 + }, + { + "epoch": 7.359729895641498, + "grad_norm": 0.2165975421667099, + "learning_rate": 1.7197923869433235e-05, + "loss": 1.7907, + "step": 23978 + }, + { + "epoch": 7.360036832412523, + "grad_norm": 0.1784742921590805, + "learning_rate": 1.719417262673854e-05, + "loss": 1.6354, + "step": 23979 + }, + { + "epoch": 7.360343769183548, + "grad_norm": 0.1867162138223648, + "learning_rate": 1.719042170825026e-05, + "loss": 1.7264, + "step": 23980 + }, + { + "epoch": 7.360650705954574, + "grad_norm": 0.19704937934875488, + "learning_rate": 1.7186671114005458e-05, + "loss": 1.72, + "step": 23981 + }, + { + "epoch": 7.360957642725599, + "grad_norm": 0.20316866040229797, + "learning_rate": 1.718292084404123e-05, + "loss": 1.759, + "step": 23982 + }, + 
{ + "epoch": 7.361264579496623, + "grad_norm": 0.20339833199977875, + "learning_rate": 1.717917089839457e-05, + "loss": 1.7537, + "step": 23983 + }, + { + "epoch": 7.361571516267649, + "grad_norm": 0.18114012479782104, + "learning_rate": 1.71754212771026e-05, + "loss": 1.7207, + "step": 23984 + }, + { + "epoch": 7.361878453038674, + "grad_norm": 0.16071686148643494, + "learning_rate": 1.7171671980202353e-05, + "loss": 1.6534, + "step": 23985 + }, + { + "epoch": 7.362185389809699, + "grad_norm": 0.15212370455265045, + "learning_rate": 1.7167923007730892e-05, + "loss": 1.6638, + "step": 23986 + }, + { + "epoch": 7.362492326580725, + "grad_norm": 0.16284595429897308, + "learning_rate": 1.7164174359725253e-05, + "loss": 1.7442, + "step": 23987 + }, + { + "epoch": 7.362799263351749, + "grad_norm": 0.18302884697914124, + "learning_rate": 1.7160426036222494e-05, + "loss": 1.7087, + "step": 23988 + }, + { + "epoch": 7.3631062001227745, + "grad_norm": 0.18764640390872955, + "learning_rate": 1.715667803725965e-05, + "loss": 1.702, + "step": 23989 + }, + { + "epoch": 7.3634131368938, + "grad_norm": 0.16912522912025452, + "learning_rate": 1.7152930362873758e-05, + "loss": 1.742, + "step": 23990 + }, + { + "epoch": 7.363720073664825, + "grad_norm": 0.21137015521526337, + "learning_rate": 1.714918301310185e-05, + "loss": 1.7074, + "step": 23991 + }, + { + "epoch": 7.3640270104358505, + "grad_norm": 0.17562401294708252, + "learning_rate": 1.7145435987981008e-05, + "loss": 1.69, + "step": 23992 + }, + { + "epoch": 7.364333947206875, + "grad_norm": 0.15575642883777618, + "learning_rate": 1.714168928754818e-05, + "loss": 1.6986, + "step": 23993 + }, + { + "epoch": 7.3646408839779, + "grad_norm": 0.18057680130004883, + "learning_rate": 1.7137942911840477e-05, + "loss": 1.7661, + "step": 23994 + }, + { + "epoch": 7.364947820748926, + "grad_norm": 0.18899883329868317, + "learning_rate": 1.7134196860894853e-05, + "loss": 1.6841, + "step": 23995 + }, + { + "epoch": 7.365254757519951, + 
"grad_norm": 0.15350781381130219, + "learning_rate": 1.7130451134748367e-05, + "loss": 1.7005, + "step": 23996 + }, + { + "epoch": 7.365561694290976, + "grad_norm": 0.20394811034202576, + "learning_rate": 1.7126705733438037e-05, + "loss": 1.7342, + "step": 23997 + }, + { + "epoch": 7.365868631062002, + "grad_norm": 0.1881636083126068, + "learning_rate": 1.7122960657000864e-05, + "loss": 1.6985, + "step": 23998 + }, + { + "epoch": 7.366175567833026, + "grad_norm": 0.1619534194469452, + "learning_rate": 1.711921590547388e-05, + "loss": 1.6579, + "step": 23999 + }, + { + "epoch": 7.366482504604051, + "grad_norm": 0.16795861721038818, + "learning_rate": 1.711547147889404e-05, + "loss": 1.717, + "step": 24000 + }, + { + "epoch": 7.366789441375077, + "grad_norm": 0.1452684998512268, + "learning_rate": 1.711172737729841e-05, + "loss": 1.6792, + "step": 24001 + }, + { + "epoch": 7.367096378146102, + "grad_norm": 0.14940062165260315, + "learning_rate": 1.710798360072396e-05, + "loss": 1.6731, + "step": 24002 + }, + { + "epoch": 7.367403314917127, + "grad_norm": 0.21277321875095367, + "learning_rate": 1.7104240149207694e-05, + "loss": 1.7145, + "step": 24003 + }, + { + "epoch": 7.367710251688152, + "grad_norm": 0.17097726464271545, + "learning_rate": 1.710049702278661e-05, + "loss": 1.7052, + "step": 24004 + }, + { + "epoch": 7.368017188459177, + "grad_norm": 0.15970511734485626, + "learning_rate": 1.7096754221497702e-05, + "loss": 1.6586, + "step": 24005 + }, + { + "epoch": 7.3683241252302025, + "grad_norm": 0.198451429605484, + "learning_rate": 1.7093011745377945e-05, + "loss": 1.7449, + "step": 24006 + }, + { + "epoch": 7.368631062001228, + "grad_norm": 0.19554266333580017, + "learning_rate": 1.7089269594464342e-05, + "loss": 1.7455, + "step": 24007 + }, + { + "epoch": 7.368937998772253, + "grad_norm": 0.1854190230369568, + "learning_rate": 1.7085527768793847e-05, + "loss": 1.7355, + "step": 24008 + }, + { + "epoch": 7.3692449355432785, + "grad_norm": 0.17093004286289215, 
+ "learning_rate": 1.708178626840349e-05, + "loss": 1.6813, + "step": 24009 + }, + { + "epoch": 7.369551872314303, + "grad_norm": 0.15385115146636963, + "learning_rate": 1.707804509333018e-05, + "loss": 1.664, + "step": 24010 + }, + { + "epoch": 7.369858809085328, + "grad_norm": 0.18747489154338837, + "learning_rate": 1.7074304243610963e-05, + "loss": 1.787, + "step": 24011 + }, + { + "epoch": 7.370165745856354, + "grad_norm": 0.21749509871006012, + "learning_rate": 1.7070563719282734e-05, + "loss": 1.723, + "step": 24012 + }, + { + "epoch": 7.370472682627379, + "grad_norm": 0.18973985314369202, + "learning_rate": 1.7066823520382508e-05, + "loss": 1.7415, + "step": 24013 + }, + { + "epoch": 7.370779619398404, + "grad_norm": 0.24844922125339508, + "learning_rate": 1.706308364694724e-05, + "loss": 1.7617, + "step": 24014 + }, + { + "epoch": 7.371086556169429, + "grad_norm": 0.16565518081188202, + "learning_rate": 1.705934409901388e-05, + "loss": 1.6781, + "step": 24015 + }, + { + "epoch": 7.371393492940454, + "grad_norm": 0.22595234215259552, + "learning_rate": 1.705560487661941e-05, + "loss": 1.7706, + "step": 24016 + }, + { + "epoch": 7.371700429711479, + "grad_norm": 0.2452661544084549, + "learning_rate": 1.7051865979800723e-05, + "loss": 1.8227, + "step": 24017 + }, + { + "epoch": 7.372007366482505, + "grad_norm": 0.2285550981760025, + "learning_rate": 1.7048127408594834e-05, + "loss": 1.7554, + "step": 24018 + }, + { + "epoch": 7.37231430325353, + "grad_norm": 0.22723950445652008, + "learning_rate": 1.7044389163038656e-05, + "loss": 1.7152, + "step": 24019 + }, + { + "epoch": 7.3726212400245545, + "grad_norm": 0.20335997641086578, + "learning_rate": 1.7040651243169143e-05, + "loss": 1.6661, + "step": 24020 + }, + { + "epoch": 7.37292817679558, + "grad_norm": 0.27618682384490967, + "learning_rate": 1.703691364902323e-05, + "loss": 1.8375, + "step": 24021 + }, + { + "epoch": 7.373235113566605, + "grad_norm": 0.24076996743679047, + "learning_rate": 
1.7033176380637856e-05, + "loss": 1.7581, + "step": 24022 + }, + { + "epoch": 7.3735420503376305, + "grad_norm": 0.21615716814994812, + "learning_rate": 1.702943943804996e-05, + "loss": 1.7047, + "step": 24023 + }, + { + "epoch": 7.373848987108656, + "grad_norm": 0.23503927886486053, + "learning_rate": 1.7025702821296462e-05, + "loss": 1.7926, + "step": 24024 + }, + { + "epoch": 7.37415592387968, + "grad_norm": 0.2344675064086914, + "learning_rate": 1.7021966530414303e-05, + "loss": 1.747, + "step": 24025 + }, + { + "epoch": 7.374462860650706, + "grad_norm": 0.20946700870990753, + "learning_rate": 1.701823056544039e-05, + "loss": 1.746, + "step": 24026 + }, + { + "epoch": 7.374769797421731, + "grad_norm": 0.26749730110168457, + "learning_rate": 1.7014494926411645e-05, + "loss": 1.7375, + "step": 24027 + }, + { + "epoch": 7.375076734192756, + "grad_norm": 0.19716335833072662, + "learning_rate": 1.701075961336503e-05, + "loss": 1.6677, + "step": 24028 + }, + { + "epoch": 7.375383670963782, + "grad_norm": 0.1999496966600418, + "learning_rate": 1.7007024626337382e-05, + "loss": 1.6665, + "step": 24029 + }, + { + "epoch": 7.375690607734807, + "grad_norm": 0.188812255859375, + "learning_rate": 1.7003289965365676e-05, + "loss": 1.7344, + "step": 24030 + }, + { + "epoch": 7.3759975445058314, + "grad_norm": 0.20171904563903809, + "learning_rate": 1.6999555630486795e-05, + "loss": 1.7452, + "step": 24031 + }, + { + "epoch": 7.376304481276857, + "grad_norm": 0.21260966360569, + "learning_rate": 1.6995821621737655e-05, + "loss": 1.7759, + "step": 24032 + }, + { + "epoch": 7.376611418047882, + "grad_norm": 0.1913561075925827, + "learning_rate": 1.699208793915516e-05, + "loss": 1.7342, + "step": 24033 + }, + { + "epoch": 7.3769183548189075, + "grad_norm": 0.1907757967710495, + "learning_rate": 1.6988354582776166e-05, + "loss": 1.6511, + "step": 24034 + }, + { + "epoch": 7.377225291589933, + "grad_norm": 0.15012076497077942, + "learning_rate": 1.6984621552637625e-05, + "loss": 
1.6638, + "step": 24035 + }, + { + "epoch": 7.377532228360957, + "grad_norm": 0.17761732637882233, + "learning_rate": 1.6980888848776394e-05, + "loss": 1.7035, + "step": 24036 + }, + { + "epoch": 7.377839165131983, + "grad_norm": 0.15940140187740326, + "learning_rate": 1.6977156471229376e-05, + "loss": 1.6532, + "step": 24037 + }, + { + "epoch": 7.378146101903008, + "grad_norm": 0.19022013247013092, + "learning_rate": 1.6973424420033455e-05, + "loss": 1.7545, + "step": 24038 + }, + { + "epoch": 7.378453038674033, + "grad_norm": 0.1900233030319214, + "learning_rate": 1.6969692695225513e-05, + "loss": 1.7051, + "step": 24039 + }, + { + "epoch": 7.378759975445059, + "grad_norm": 0.17687582969665527, + "learning_rate": 1.6965961296842425e-05, + "loss": 1.6819, + "step": 24040 + }, + { + "epoch": 7.379066912216084, + "grad_norm": 0.16323260962963104, + "learning_rate": 1.696223022492107e-05, + "loss": 1.6642, + "step": 24041 + }, + { + "epoch": 7.379373848987108, + "grad_norm": 0.21163886785507202, + "learning_rate": 1.695849947949832e-05, + "loss": 1.6973, + "step": 24042 + }, + { + "epoch": 7.379680785758134, + "grad_norm": 0.1713307648897171, + "learning_rate": 1.6954769060611043e-05, + "loss": 1.677, + "step": 24043 + }, + { + "epoch": 7.379987722529159, + "grad_norm": 0.19575951993465424, + "learning_rate": 1.695103896829609e-05, + "loss": 1.7305, + "step": 24044 + }, + { + "epoch": 7.380294659300184, + "grad_norm": 0.16087177395820618, + "learning_rate": 1.6947309202590377e-05, + "loss": 1.6435, + "step": 24045 + }, + { + "epoch": 7.38060159607121, + "grad_norm": 0.2088652402162552, + "learning_rate": 1.6943579763530692e-05, + "loss": 1.7136, + "step": 24046 + }, + { + "epoch": 7.380908532842234, + "grad_norm": 0.18253973126411438, + "learning_rate": 1.693985065115396e-05, + "loss": 1.7461, + "step": 24047 + }, + { + "epoch": 7.3812154696132595, + "grad_norm": 0.272062212228775, + "learning_rate": 1.6936121865496967e-05, + "loss": 1.7455, + "step": 24048 + }, + { 
+ "epoch": 7.381522406384285, + "grad_norm": 0.1884320080280304, + "learning_rate": 1.6932393406596613e-05, + "loss": 1.7242, + "step": 24049 + }, + { + "epoch": 7.38182934315531, + "grad_norm": 0.22986121475696564, + "learning_rate": 1.6928665274489748e-05, + "loss": 1.7461, + "step": 24050 + }, + { + "epoch": 7.3821362799263355, + "grad_norm": 0.19400665163993835, + "learning_rate": 1.6924937469213158e-05, + "loss": 1.7468, + "step": 24051 + }, + { + "epoch": 7.382443216697361, + "grad_norm": 0.1990167796611786, + "learning_rate": 1.6921209990803744e-05, + "loss": 1.7253, + "step": 24052 + }, + { + "epoch": 7.382750153468385, + "grad_norm": 0.16667480766773224, + "learning_rate": 1.691748283929832e-05, + "loss": 1.6763, + "step": 24053 + }, + { + "epoch": 7.383057090239411, + "grad_norm": 0.20539991557598114, + "learning_rate": 1.691375601473372e-05, + "loss": 1.7408, + "step": 24054 + }, + { + "epoch": 7.383364027010436, + "grad_norm": 0.18021859228610992, + "learning_rate": 1.6910029517146776e-05, + "loss": 1.7075, + "step": 24055 + }, + { + "epoch": 7.383670963781461, + "grad_norm": 0.17450939118862152, + "learning_rate": 1.6906303346574314e-05, + "loss": 1.7074, + "step": 24056 + }, + { + "epoch": 7.383977900552487, + "grad_norm": 0.1690986454486847, + "learning_rate": 1.690257750305316e-05, + "loss": 1.6911, + "step": 24057 + }, + { + "epoch": 7.384284837323511, + "grad_norm": 0.19716380536556244, + "learning_rate": 1.6898851986620136e-05, + "loss": 1.7075, + "step": 24058 + }, + { + "epoch": 7.384591774094536, + "grad_norm": 0.20165397226810455, + "learning_rate": 1.6895126797312054e-05, + "loss": 1.7201, + "step": 24059 + }, + { + "epoch": 7.384898710865562, + "grad_norm": 0.22149543464183807, + "learning_rate": 1.6891401935165734e-05, + "loss": 1.7407, + "step": 24060 + }, + { + "epoch": 7.385205647636587, + "grad_norm": 0.1575438529253006, + "learning_rate": 1.6887677400217966e-05, + "loss": 1.6451, + "step": 24061 + }, + { + "epoch": 7.385512584407612, 
+ "grad_norm": 0.18075503408908844, + "learning_rate": 1.688395319250562e-05, + "loss": 1.7084, + "step": 24062 + }, + { + "epoch": 7.385819521178637, + "grad_norm": 0.16428421437740326, + "learning_rate": 1.6880229312065414e-05, + "loss": 1.7047, + "step": 24063 + }, + { + "epoch": 7.386126457949662, + "grad_norm": 0.18372805416584015, + "learning_rate": 1.6876505758934237e-05, + "loss": 1.6726, + "step": 24064 + }, + { + "epoch": 7.3864333947206875, + "grad_norm": 0.199292853474617, + "learning_rate": 1.687278253314882e-05, + "loss": 1.7472, + "step": 24065 + }, + { + "epoch": 7.386740331491713, + "grad_norm": 0.20381483435630798, + "learning_rate": 1.686905963474597e-05, + "loss": 1.7128, + "step": 24066 + }, + { + "epoch": 7.387047268262738, + "grad_norm": 0.18497546017169952, + "learning_rate": 1.6865337063762527e-05, + "loss": 1.736, + "step": 24067 + }, + { + "epoch": 7.387354205033763, + "grad_norm": 0.21320439875125885, + "learning_rate": 1.6861614820235206e-05, + "loss": 1.7391, + "step": 24068 + }, + { + "epoch": 7.387661141804788, + "grad_norm": 0.22324618697166443, + "learning_rate": 1.6857892904200863e-05, + "loss": 1.7384, + "step": 24069 + }, + { + "epoch": 7.387968078575813, + "grad_norm": 0.18035978078842163, + "learning_rate": 1.6854171315696216e-05, + "loss": 1.7029, + "step": 24070 + }, + { + "epoch": 7.388275015346839, + "grad_norm": 0.1727912276983261, + "learning_rate": 1.6850450054758092e-05, + "loss": 1.6649, + "step": 24071 + }, + { + "epoch": 7.388581952117864, + "grad_norm": 0.19713124632835388, + "learning_rate": 1.6846729121423256e-05, + "loss": 1.7508, + "step": 24072 + }, + { + "epoch": 7.388888888888889, + "grad_norm": 0.19403581321239471, + "learning_rate": 1.6843008515728464e-05, + "loss": 1.7807, + "step": 24073 + }, + { + "epoch": 7.389195825659914, + "grad_norm": 0.20204444229602814, + "learning_rate": 1.6839288237710503e-05, + "loss": 1.778, + "step": 24074 + }, + { + "epoch": 7.389502762430939, + "grad_norm": 
0.20021478831768036, + "learning_rate": 1.6835568287406127e-05, + "loss": 1.7544, + "step": 24075 + }, + { + "epoch": 7.389809699201964, + "grad_norm": 0.2247730791568756, + "learning_rate": 1.6831848664852107e-05, + "loss": 1.7422, + "step": 24076 + }, + { + "epoch": 7.39011663597299, + "grad_norm": 0.21600402891635895, + "learning_rate": 1.68281293700852e-05, + "loss": 1.7491, + "step": 24077 + }, + { + "epoch": 7.390423572744015, + "grad_norm": 0.1854497194290161, + "learning_rate": 1.6824410403142145e-05, + "loss": 1.7292, + "step": 24078 + }, + { + "epoch": 7.3907305095150395, + "grad_norm": 0.21738949418067932, + "learning_rate": 1.6820691764059736e-05, + "loss": 1.6996, + "step": 24079 + }, + { + "epoch": 7.391037446286065, + "grad_norm": 0.20114775002002716, + "learning_rate": 1.6816973452874674e-05, + "loss": 1.7299, + "step": 24080 + }, + { + "epoch": 7.39134438305709, + "grad_norm": 0.17267082631587982, + "learning_rate": 1.681325546962376e-05, + "loss": 1.7181, + "step": 24081 + }, + { + "epoch": 7.3916513198281155, + "grad_norm": 0.1681009829044342, + "learning_rate": 1.680953781434369e-05, + "loss": 1.6826, + "step": 24082 + }, + { + "epoch": 7.391958256599141, + "grad_norm": 0.18807077407836914, + "learning_rate": 1.6805820487071205e-05, + "loss": 1.6934, + "step": 24083 + }, + { + "epoch": 7.392265193370166, + "grad_norm": 0.1859835982322693, + "learning_rate": 1.680210348784309e-05, + "loss": 1.7065, + "step": 24084 + }, + { + "epoch": 7.392572130141191, + "grad_norm": 0.20433956384658813, + "learning_rate": 1.679838681669601e-05, + "loss": 1.7934, + "step": 24085 + }, + { + "epoch": 7.392879066912216, + "grad_norm": 0.2428809553384781, + "learning_rate": 1.679467047366677e-05, + "loss": 1.7619, + "step": 24086 + }, + { + "epoch": 7.393186003683241, + "grad_norm": 0.25117191672325134, + "learning_rate": 1.6790954458792025e-05, + "loss": 1.7254, + "step": 24087 + }, + { + "epoch": 7.393492940454267, + "grad_norm": 0.19429172575473785, + 
"learning_rate": 1.6787238772108544e-05, + "loss": 1.6946, + "step": 24088 + }, + { + "epoch": 7.393799877225292, + "grad_norm": 0.18574993312358856, + "learning_rate": 1.678352341365304e-05, + "loss": 1.6953, + "step": 24089 + }, + { + "epoch": 7.394106813996316, + "grad_norm": 0.21022208034992218, + "learning_rate": 1.6779808383462227e-05, + "loss": 1.7866, + "step": 24090 + }, + { + "epoch": 7.394413750767342, + "grad_norm": 0.16711890697479248, + "learning_rate": 1.6776093681572818e-05, + "loss": 1.6988, + "step": 24091 + }, + { + "epoch": 7.394720687538367, + "grad_norm": 0.23661695420742035, + "learning_rate": 1.6772379308021524e-05, + "loss": 1.7152, + "step": 24092 + }, + { + "epoch": 7.395027624309392, + "grad_norm": 0.18410098552703857, + "learning_rate": 1.6768665262845052e-05, + "loss": 1.6643, + "step": 24093 + }, + { + "epoch": 7.395334561080418, + "grad_norm": 0.19566760957241058, + "learning_rate": 1.676495154608011e-05, + "loss": 1.7371, + "step": 24094 + }, + { + "epoch": 7.395641497851442, + "grad_norm": 0.18130381405353546, + "learning_rate": 1.6761238157763375e-05, + "loss": 1.6934, + "step": 24095 + }, + { + "epoch": 7.3959484346224675, + "grad_norm": 0.16141927242279053, + "learning_rate": 1.6757525097931603e-05, + "loss": 1.6629, + "step": 24096 + }, + { + "epoch": 7.396255371393493, + "grad_norm": 0.18370656669139862, + "learning_rate": 1.6753812366621418e-05, + "loss": 1.6931, + "step": 24097 + }, + { + "epoch": 7.396562308164518, + "grad_norm": 0.17368416488170624, + "learning_rate": 1.675009996386958e-05, + "loss": 1.7028, + "step": 24098 + }, + { + "epoch": 7.3968692449355435, + "grad_norm": 0.1704222410917282, + "learning_rate": 1.6746387889712722e-05, + "loss": 1.7241, + "step": 24099 + }, + { + "epoch": 7.397176181706568, + "grad_norm": 0.19127961993217468, + "learning_rate": 1.674267614418754e-05, + "loss": 1.6606, + "step": 24100 + }, + { + "epoch": 7.397483118477593, + "grad_norm": 0.20173178613185883, + "learning_rate": 
1.673896472733075e-05, + "loss": 1.7293, + "step": 24101 + }, + { + "epoch": 7.397790055248619, + "grad_norm": 0.194651797413826, + "learning_rate": 1.6735253639178977e-05, + "loss": 1.6889, + "step": 24102 + }, + { + "epoch": 7.398096992019644, + "grad_norm": 0.16184480488300323, + "learning_rate": 1.6731542879768957e-05, + "loss": 1.6929, + "step": 24103 + }, + { + "epoch": 7.398403928790669, + "grad_norm": 0.21806742250919342, + "learning_rate": 1.67278324491373e-05, + "loss": 1.6944, + "step": 24104 + }, + { + "epoch": 7.398710865561695, + "grad_norm": 0.1599469929933548, + "learning_rate": 1.6724122347320715e-05, + "loss": 1.7107, + "step": 24105 + }, + { + "epoch": 7.399017802332719, + "grad_norm": 0.18621234595775604, + "learning_rate": 1.672041257435586e-05, + "loss": 1.6856, + "step": 24106 + }, + { + "epoch": 7.399324739103744, + "grad_norm": 0.20682603120803833, + "learning_rate": 1.6716703130279393e-05, + "loss": 1.7699, + "step": 24107 + }, + { + "epoch": 7.39963167587477, + "grad_norm": 0.19649554789066315, + "learning_rate": 1.6712994015127976e-05, + "loss": 1.7049, + "step": 24108 + }, + { + "epoch": 7.399938612645795, + "grad_norm": 0.15894706547260284, + "learning_rate": 1.6709285228938255e-05, + "loss": 1.7352, + "step": 24109 + }, + { + "epoch": 7.4002455494168204, + "grad_norm": 0.22186337411403656, + "learning_rate": 1.6705576771746896e-05, + "loss": 1.7353, + "step": 24110 + }, + { + "epoch": 7.400552486187845, + "grad_norm": 0.14689651131629944, + "learning_rate": 1.670186864359054e-05, + "loss": 1.7155, + "step": 24111 + }, + { + "epoch": 7.40085942295887, + "grad_norm": 0.2055603563785553, + "learning_rate": 1.6698160844505817e-05, + "loss": 1.6897, + "step": 24112 + }, + { + "epoch": 7.401166359729896, + "grad_norm": 0.1641531139612198, + "learning_rate": 1.6694453374529423e-05, + "loss": 1.67, + "step": 24113 + }, + { + "epoch": 7.401473296500921, + "grad_norm": 0.21150687336921692, + "learning_rate": 1.6690746233697923e-05, + "loss": 
1.7507, + "step": 24114 + }, + { + "epoch": 7.401780233271946, + "grad_norm": 0.1844765543937683, + "learning_rate": 1.6687039422048035e-05, + "loss": 1.702, + "step": 24115 + }, + { + "epoch": 7.402087170042972, + "grad_norm": 0.1695966124534607, + "learning_rate": 1.6683332939616326e-05, + "loss": 1.6683, + "step": 24116 + }, + { + "epoch": 7.402394106813996, + "grad_norm": 0.17938567698001862, + "learning_rate": 1.667962678643943e-05, + "loss": 1.6947, + "step": 24117 + }, + { + "epoch": 7.402701043585021, + "grad_norm": 0.16420964896678925, + "learning_rate": 1.6675920962554027e-05, + "loss": 1.755, + "step": 24118 + }, + { + "epoch": 7.403007980356047, + "grad_norm": 0.16095438599586487, + "learning_rate": 1.667221546799667e-05, + "loss": 1.6855, + "step": 24119 + }, + { + "epoch": 7.403314917127072, + "grad_norm": 0.2089291363954544, + "learning_rate": 1.6668510302804052e-05, + "loss": 1.7213, + "step": 24120 + }, + { + "epoch": 7.403621853898097, + "grad_norm": 0.18369436264038086, + "learning_rate": 1.6664805467012717e-05, + "loss": 1.6913, + "step": 24121 + }, + { + "epoch": 7.403928790669122, + "grad_norm": 0.16405323147773743, + "learning_rate": 1.6661100960659326e-05, + "loss": 1.6529, + "step": 24122 + }, + { + "epoch": 7.404235727440147, + "grad_norm": 0.20792648196220398, + "learning_rate": 1.6657396783780477e-05, + "loss": 1.6855, + "step": 24123 + }, + { + "epoch": 7.4045426642111725, + "grad_norm": 0.17733097076416016, + "learning_rate": 1.6653692936412773e-05, + "loss": 1.727, + "step": 24124 + }, + { + "epoch": 7.404849600982198, + "grad_norm": 0.16196851432323456, + "learning_rate": 1.6649989418592825e-05, + "loss": 1.7376, + "step": 24125 + }, + { + "epoch": 7.405156537753223, + "grad_norm": 0.17193716764450073, + "learning_rate": 1.664628623035723e-05, + "loss": 1.6802, + "step": 24126 + }, + { + "epoch": 7.4054634745242485, + "grad_norm": 0.22076182067394257, + "learning_rate": 1.6642583371742576e-05, + "loss": 1.7512, + "step": 24127 + }, + 
{ + "epoch": 7.405770411295273, + "grad_norm": 0.20766951143741608, + "learning_rate": 1.663888084278547e-05, + "loss": 1.7457, + "step": 24128 + }, + { + "epoch": 7.406077348066298, + "grad_norm": 0.16815492510795593, + "learning_rate": 1.663517864352248e-05, + "loss": 1.6867, + "step": 24129 + }, + { + "epoch": 7.406384284837324, + "grad_norm": 0.19644804298877716, + "learning_rate": 1.6631476773990246e-05, + "loss": 1.6996, + "step": 24130 + }, + { + "epoch": 7.406691221608349, + "grad_norm": 0.18717117607593536, + "learning_rate": 1.662777523422528e-05, + "loss": 1.7745, + "step": 24131 + }, + { + "epoch": 7.406998158379374, + "grad_norm": 0.1679331511259079, + "learning_rate": 1.662407402426423e-05, + "loss": 1.7213, + "step": 24132 + }, + { + "epoch": 7.407305095150399, + "grad_norm": 0.1721929907798767, + "learning_rate": 1.662037314414363e-05, + "loss": 1.6759, + "step": 24133 + }, + { + "epoch": 7.407612031921424, + "grad_norm": 0.15507890284061432, + "learning_rate": 1.661667259390005e-05, + "loss": 1.6658, + "step": 24134 + }, + { + "epoch": 7.407918968692449, + "grad_norm": 0.20528049767017365, + "learning_rate": 1.6612972373570114e-05, + "loss": 1.7508, + "step": 24135 + }, + { + "epoch": 7.408225905463475, + "grad_norm": 0.20593658089637756, + "learning_rate": 1.6609272483190315e-05, + "loss": 1.8078, + "step": 24136 + }, + { + "epoch": 7.4085328422345, + "grad_norm": 0.19905441999435425, + "learning_rate": 1.6605572922797292e-05, + "loss": 1.7933, + "step": 24137 + }, + { + "epoch": 7.4088397790055245, + "grad_norm": 0.17571881413459778, + "learning_rate": 1.6601873692427537e-05, + "loss": 1.6908, + "step": 24138 + }, + { + "epoch": 7.40914671577655, + "grad_norm": 0.2244982272386551, + "learning_rate": 1.6598174792117655e-05, + "loss": 1.6998, + "step": 24139 + }, + { + "epoch": 7.409453652547575, + "grad_norm": 0.15267951786518097, + "learning_rate": 1.6594476221904193e-05, + "loss": 1.6399, + "step": 24140 + }, + { + "epoch": 7.4097605893186005, + 
"grad_norm": 0.24161390960216522, + "learning_rate": 1.659077798182369e-05, + "loss": 1.6776, + "step": 24141 + }, + { + "epoch": 7.410067526089626, + "grad_norm": 0.17184343934059143, + "learning_rate": 1.658708007191271e-05, + "loss": 1.7169, + "step": 24142 + }, + { + "epoch": 7.41037446286065, + "grad_norm": 0.1589801162481308, + "learning_rate": 1.6583382492207778e-05, + "loss": 1.6727, + "step": 24143 + }, + { + "epoch": 7.410681399631676, + "grad_norm": 0.18666890263557434, + "learning_rate": 1.6579685242745452e-05, + "loss": 1.7429, + "step": 24144 + }, + { + "epoch": 7.410988336402701, + "grad_norm": 0.22418901324272156, + "learning_rate": 1.6575988323562265e-05, + "loss": 1.7834, + "step": 24145 + }, + { + "epoch": 7.411295273173726, + "grad_norm": 0.1897875964641571, + "learning_rate": 1.6572291734694734e-05, + "loss": 1.7271, + "step": 24146 + }, + { + "epoch": 7.411602209944752, + "grad_norm": 0.18204644322395325, + "learning_rate": 1.6568595476179445e-05, + "loss": 1.7003, + "step": 24147 + }, + { + "epoch": 7.411909146715777, + "grad_norm": 0.19130240380764008, + "learning_rate": 1.6564899548052853e-05, + "loss": 1.6803, + "step": 24148 + }, + { + "epoch": 7.412216083486801, + "grad_norm": 0.19467706978321075, + "learning_rate": 1.6561203950351554e-05, + "loss": 1.7529, + "step": 24149 + }, + { + "epoch": 7.412523020257827, + "grad_norm": 0.20290352404117584, + "learning_rate": 1.655750868311202e-05, + "loss": 1.7742, + "step": 24150 + }, + { + "epoch": 7.412829957028852, + "grad_norm": 0.18538729846477509, + "learning_rate": 1.6553813746370772e-05, + "loss": 1.68, + "step": 24151 + }, + { + "epoch": 7.413136893799877, + "grad_norm": 0.23339742422103882, + "learning_rate": 1.655011914016437e-05, + "loss": 1.7499, + "step": 24152 + }, + { + "epoch": 7.413443830570903, + "grad_norm": 0.21964092552661896, + "learning_rate": 1.654642486452927e-05, + "loss": 1.7394, + "step": 24153 + }, + { + "epoch": 7.413750767341927, + "grad_norm": 0.2131531536579132, 
+ "learning_rate": 1.6542730919502032e-05, + "loss": 1.6928, + "step": 24154 + }, + { + "epoch": 7.4140577041129525, + "grad_norm": 0.20840130746364594, + "learning_rate": 1.653903730511911e-05, + "loss": 1.6785, + "step": 24155 + }, + { + "epoch": 7.414364640883978, + "grad_norm": 0.1519836038351059, + "learning_rate": 1.653534402141705e-05, + "loss": 1.6882, + "step": 24156 + }, + { + "epoch": 7.414671577655003, + "grad_norm": 0.21539351344108582, + "learning_rate": 1.653165106843233e-05, + "loss": 1.7041, + "step": 24157 + }, + { + "epoch": 7.4149785144260285, + "grad_norm": 0.2050703912973404, + "learning_rate": 1.6527958446201453e-05, + "loss": 1.7854, + "step": 24158 + }, + { + "epoch": 7.415285451197054, + "grad_norm": 0.21595771610736847, + "learning_rate": 1.652426615476091e-05, + "loss": 1.7305, + "step": 24159 + }, + { + "epoch": 7.415592387968078, + "grad_norm": 0.19248713552951813, + "learning_rate": 1.6520574194147186e-05, + "loss": 1.6834, + "step": 24160 + }, + { + "epoch": 7.415899324739104, + "grad_norm": 0.178158700466156, + "learning_rate": 1.6516882564396774e-05, + "loss": 1.7312, + "step": 24161 + }, + { + "epoch": 7.416206261510129, + "grad_norm": 0.18686197698116302, + "learning_rate": 1.6513191265546152e-05, + "loss": 1.7025, + "step": 24162 + }, + { + "epoch": 7.416513198281154, + "grad_norm": 0.1544325053691864, + "learning_rate": 1.6509500297631787e-05, + "loss": 1.6773, + "step": 24163 + }, + { + "epoch": 7.41682013505218, + "grad_norm": 0.1787567138671875, + "learning_rate": 1.6505809660690197e-05, + "loss": 1.6941, + "step": 24164 + }, + { + "epoch": 7.417127071823204, + "grad_norm": 0.16545183956623077, + "learning_rate": 1.65021193547578e-05, + "loss": 1.6618, + "step": 24165 + }, + { + "epoch": 7.417434008594229, + "grad_norm": 0.23889821767807007, + "learning_rate": 1.6498429379871126e-05, + "loss": 1.7651, + "step": 24166 + }, + { + "epoch": 7.417740945365255, + "grad_norm": 0.2012832909822464, + "learning_rate": 
1.649473973606659e-05, + "loss": 1.7477, + "step": 24167 + }, + { + "epoch": 7.41804788213628, + "grad_norm": 0.18035975098609924, + "learning_rate": 1.6491050423380662e-05, + "loss": 1.6747, + "step": 24168 + }, + { + "epoch": 7.418354818907305, + "grad_norm": 0.14925292134284973, + "learning_rate": 1.6487361441849842e-05, + "loss": 1.6817, + "step": 24169 + }, + { + "epoch": 7.41866175567833, + "grad_norm": 0.19253355264663696, + "learning_rate": 1.6483672791510523e-05, + "loss": 1.6943, + "step": 24170 + }, + { + "epoch": 7.418968692449355, + "grad_norm": 0.17203082144260406, + "learning_rate": 1.6479984472399234e-05, + "loss": 1.692, + "step": 24171 + }, + { + "epoch": 7.4192756292203805, + "grad_norm": 0.19132022559642792, + "learning_rate": 1.647629648455235e-05, + "loss": 1.7029, + "step": 24172 + }, + { + "epoch": 7.419582565991406, + "grad_norm": 0.17949101328849792, + "learning_rate": 1.647260882800637e-05, + "loss": 1.6944, + "step": 24173 + }, + { + "epoch": 7.419889502762431, + "grad_norm": 0.17752930521965027, + "learning_rate": 1.646892150279772e-05, + "loss": 1.6875, + "step": 24174 + }, + { + "epoch": 7.420196439533456, + "grad_norm": 0.19464492797851562, + "learning_rate": 1.6465234508962836e-05, + "loss": 1.6988, + "step": 24175 + }, + { + "epoch": 7.420503376304481, + "grad_norm": 0.20154574513435364, + "learning_rate": 1.6461547846538168e-05, + "loss": 1.7305, + "step": 24176 + }, + { + "epoch": 7.420810313075506, + "grad_norm": 0.20944970846176147, + "learning_rate": 1.6457861515560136e-05, + "loss": 1.7699, + "step": 24177 + }, + { + "epoch": 7.421117249846532, + "grad_norm": 0.22422203421592712, + "learning_rate": 1.6454175516065175e-05, + "loss": 1.6607, + "step": 24178 + }, + { + "epoch": 7.421424186617557, + "grad_norm": 0.16106431186199188, + "learning_rate": 1.6450489848089717e-05, + "loss": 1.7204, + "step": 24179 + }, + { + "epoch": 7.421731123388582, + "grad_norm": 0.24394269287586212, + "learning_rate": 1.644680451167018e-05, + 
"loss": 1.7161, + "step": 24180 + }, + { + "epoch": 7.422038060159607, + "grad_norm": 0.1999186873435974, + "learning_rate": 1.644311950684299e-05, + "loss": 1.7486, + "step": 24181 + }, + { + "epoch": 7.422344996930632, + "grad_norm": 0.1865876019001007, + "learning_rate": 1.6439434833644545e-05, + "loss": 1.737, + "step": 24182 + }, + { + "epoch": 7.422651933701657, + "grad_norm": 0.18088236451148987, + "learning_rate": 1.643575049211131e-05, + "loss": 1.6821, + "step": 24183 + }, + { + "epoch": 7.422958870472683, + "grad_norm": 0.17456914484500885, + "learning_rate": 1.643206648227964e-05, + "loss": 1.7379, + "step": 24184 + }, + { + "epoch": 7.423265807243708, + "grad_norm": 0.18160004913806915, + "learning_rate": 1.642838280418595e-05, + "loss": 1.7364, + "step": 24185 + }, + { + "epoch": 7.4235727440147325, + "grad_norm": 0.18081973493099213, + "learning_rate": 1.6424699457866688e-05, + "loss": 1.7591, + "step": 24186 + }, + { + "epoch": 7.423879680785758, + "grad_norm": 0.20753513276576996, + "learning_rate": 1.6421016443358195e-05, + "loss": 1.7299, + "step": 24187 + }, + { + "epoch": 7.424186617556783, + "grad_norm": 0.2102874517440796, + "learning_rate": 1.641733376069693e-05, + "loss": 1.7876, + "step": 24188 + }, + { + "epoch": 7.4244935543278086, + "grad_norm": 0.19360920786857605, + "learning_rate": 1.6413651409919224e-05, + "loss": 1.7578, + "step": 24189 + }, + { + "epoch": 7.424800491098834, + "grad_norm": 0.1954938918352127, + "learning_rate": 1.6409969391061514e-05, + "loss": 1.7074, + "step": 24190 + }, + { + "epoch": 7.425107427869859, + "grad_norm": 0.2228705734014511, + "learning_rate": 1.6406287704160177e-05, + "loss": 1.7261, + "step": 24191 + }, + { + "epoch": 7.425414364640884, + "grad_norm": 0.18695802986621857, + "learning_rate": 1.6402606349251597e-05, + "loss": 1.7074, + "step": 24192 + }, + { + "epoch": 7.425721301411909, + "grad_norm": 0.19026046991348267, + "learning_rate": 1.639892532637215e-05, + "loss": 1.7546, + "step": 24193 + 
}, + { + "epoch": 7.426028238182934, + "grad_norm": 0.2086167335510254, + "learning_rate": 1.639524463555822e-05, + "loss": 1.7551, + "step": 24194 + }, + { + "epoch": 7.42633517495396, + "grad_norm": 0.201420396566391, + "learning_rate": 1.639156427684618e-05, + "loss": 1.6961, + "step": 24195 + }, + { + "epoch": 7.426642111724985, + "grad_norm": 0.1735599786043167, + "learning_rate": 1.6387884250272394e-05, + "loss": 1.7461, + "step": 24196 + }, + { + "epoch": 7.4269490484960095, + "grad_norm": 0.23944853246212006, + "learning_rate": 1.6384204555873238e-05, + "loss": 1.7001, + "step": 24197 + }, + { + "epoch": 7.427255985267035, + "grad_norm": 0.15605413913726807, + "learning_rate": 1.638052519368508e-05, + "loss": 1.7105, + "step": 24198 + }, + { + "epoch": 7.42756292203806, + "grad_norm": 0.21450987458229065, + "learning_rate": 1.6376846163744257e-05, + "loss": 1.7309, + "step": 24199 + }, + { + "epoch": 7.4278698588090855, + "grad_norm": 0.20542307198047638, + "learning_rate": 1.637316746608718e-05, + "loss": 1.72, + "step": 24200 + }, + { + "epoch": 7.428176795580111, + "grad_norm": 0.18612053990364075, + "learning_rate": 1.6369489100750157e-05, + "loss": 1.6714, + "step": 24201 + }, + { + "epoch": 7.428483732351136, + "grad_norm": 0.16587957739830017, + "learning_rate": 1.6365811067769553e-05, + "loss": 1.7494, + "step": 24202 + }, + { + "epoch": 7.428790669122161, + "grad_norm": 0.247777059674263, + "learning_rate": 1.636213336718172e-05, + "loss": 1.7048, + "step": 24203 + }, + { + "epoch": 7.429097605893186, + "grad_norm": 0.2000289410352707, + "learning_rate": 1.635845599902298e-05, + "loss": 1.7568, + "step": 24204 + }, + { + "epoch": 7.429404542664211, + "grad_norm": 0.21887128055095673, + "learning_rate": 1.6354778963329732e-05, + "loss": 1.6708, + "step": 24205 + }, + { + "epoch": 7.429711479435237, + "grad_norm": 0.18932145833969116, + "learning_rate": 1.6351102260138247e-05, + "loss": 1.7184, + "step": 24206 + }, + { + "epoch": 7.430018416206262, + 
"grad_norm": 0.20103856921195984, + "learning_rate": 1.63474258894849e-05, + "loss": 1.7031, + "step": 24207 + }, + { + "epoch": 7.430325352977286, + "grad_norm": 0.22598737478256226, + "learning_rate": 1.634374985140602e-05, + "loss": 1.7803, + "step": 24208 + }, + { + "epoch": 7.430632289748312, + "grad_norm": 0.22468316555023193, + "learning_rate": 1.6340074145937934e-05, + "loss": 1.7635, + "step": 24209 + }, + { + "epoch": 7.430939226519337, + "grad_norm": 0.16173744201660156, + "learning_rate": 1.6336398773116962e-05, + "loss": 1.6877, + "step": 24210 + }, + { + "epoch": 7.431246163290362, + "grad_norm": 0.17869406938552856, + "learning_rate": 1.6332723732979426e-05, + "loss": 1.6436, + "step": 24211 + }, + { + "epoch": 7.431553100061388, + "grad_norm": 0.1828129142522812, + "learning_rate": 1.6329049025561648e-05, + "loss": 1.7191, + "step": 24212 + }, + { + "epoch": 7.431860036832412, + "grad_norm": 0.19169248640537262, + "learning_rate": 1.6325374650899944e-05, + "loss": 1.7607, + "step": 24213 + }, + { + "epoch": 7.4321669736034375, + "grad_norm": 0.1680343598127365, + "learning_rate": 1.632170060903062e-05, + "loss": 1.6736, + "step": 24214 + }, + { + "epoch": 7.432473910374463, + "grad_norm": 0.20647180080413818, + "learning_rate": 1.6318026899989996e-05, + "loss": 1.7875, + "step": 24215 + }, + { + "epoch": 7.432780847145488, + "grad_norm": 0.29225587844848633, + "learning_rate": 1.6314353523814352e-05, + "loss": 1.8164, + "step": 24216 + }, + { + "epoch": 7.4330877839165135, + "grad_norm": 0.1633446216583252, + "learning_rate": 1.6310680480540048e-05, + "loss": 1.6529, + "step": 24217 + }, + { + "epoch": 7.433394720687538, + "grad_norm": 0.21215081214904785, + "learning_rate": 1.6307007770203326e-05, + "loss": 1.6323, + "step": 24218 + }, + { + "epoch": 7.433701657458563, + "grad_norm": 0.1934979110956192, + "learning_rate": 1.63033353928405e-05, + "loss": 1.7299, + "step": 24219 + }, + { + "epoch": 7.434008594229589, + "grad_norm": 
0.2581390142440796, + "learning_rate": 1.6299663348487865e-05, + "loss": 1.7308, + "step": 24220 + }, + { + "epoch": 7.434315531000614, + "grad_norm": 0.2711075246334076, + "learning_rate": 1.629599163718169e-05, + "loss": 1.8736, + "step": 24221 + }, + { + "epoch": 7.434622467771639, + "grad_norm": 0.2620790898799896, + "learning_rate": 1.6292320258958316e-05, + "loss": 1.7326, + "step": 24222 + }, + { + "epoch": 7.434929404542665, + "grad_norm": 0.16254334151744843, + "learning_rate": 1.6288649213853958e-05, + "loss": 1.6996, + "step": 24223 + }, + { + "epoch": 7.435236341313689, + "grad_norm": 0.22968515753746033, + "learning_rate": 1.628497850190496e-05, + "loss": 1.694, + "step": 24224 + }, + { + "epoch": 7.435543278084714, + "grad_norm": 0.20458953082561493, + "learning_rate": 1.6281308123147533e-05, + "loss": 1.7558, + "step": 24225 + }, + { + "epoch": 7.43585021485574, + "grad_norm": 0.2327413409948349, + "learning_rate": 1.6277638077617995e-05, + "loss": 1.7581, + "step": 24226 + }, + { + "epoch": 7.436157151626765, + "grad_norm": 0.18312111496925354, + "learning_rate": 1.6273968365352604e-05, + "loss": 1.6713, + "step": 24227 + }, + { + "epoch": 7.43646408839779, + "grad_norm": 0.15935418009757996, + "learning_rate": 1.6270298986387628e-05, + "loss": 1.6996, + "step": 24228 + }, + { + "epoch": 7.436771025168815, + "grad_norm": 0.17424416542053223, + "learning_rate": 1.6266629940759322e-05, + "loss": 1.6826, + "step": 24229 + }, + { + "epoch": 7.43707796193984, + "grad_norm": 0.18982923030853271, + "learning_rate": 1.6262961228503953e-05, + "loss": 1.741, + "step": 24230 + }, + { + "epoch": 7.4373848987108655, + "grad_norm": 0.16608789563179016, + "learning_rate": 1.6259292849657777e-05, + "loss": 1.7205, + "step": 24231 + }, + { + "epoch": 7.437691835481891, + "grad_norm": 0.19830825924873352, + "learning_rate": 1.625562480425704e-05, + "loss": 1.7159, + "step": 24232 + }, + { + "epoch": 7.437998772252916, + "grad_norm": 0.1889072209596634, + 
"learning_rate": 1.6251957092337988e-05, + "loss": 1.7427, + "step": 24233 + }, + { + "epoch": 7.4383057090239415, + "grad_norm": 0.18454046547412872, + "learning_rate": 1.6248289713936903e-05, + "loss": 1.6962, + "step": 24234 + }, + { + "epoch": 7.438612645794966, + "grad_norm": 0.20041033625602722, + "learning_rate": 1.6244622669089987e-05, + "loss": 1.7763, + "step": 24235 + }, + { + "epoch": 7.438919582565991, + "grad_norm": 0.17226676642894745, + "learning_rate": 1.62409559578335e-05, + "loss": 1.6783, + "step": 24236 + }, + { + "epoch": 7.439226519337017, + "grad_norm": 0.1761687994003296, + "learning_rate": 1.6237289580203662e-05, + "loss": 1.6761, + "step": 24237 + }, + { + "epoch": 7.439533456108042, + "grad_norm": 0.24213027954101562, + "learning_rate": 1.6233623536236707e-05, + "loss": 1.724, + "step": 24238 + }, + { + "epoch": 7.439840392879067, + "grad_norm": 0.15541739761829376, + "learning_rate": 1.6229957825968913e-05, + "loss": 1.6594, + "step": 24239 + }, + { + "epoch": 7.440147329650092, + "grad_norm": 0.20755749940872192, + "learning_rate": 1.622629244943643e-05, + "loss": 1.7229, + "step": 24240 + }, + { + "epoch": 7.440454266421117, + "grad_norm": 0.20716612040996552, + "learning_rate": 1.6222627406675555e-05, + "loss": 1.699, + "step": 24241 + }, + { + "epoch": 7.440761203192142, + "grad_norm": 0.17423541843891144, + "learning_rate": 1.621896269772244e-05, + "loss": 1.7175, + "step": 24242 + }, + { + "epoch": 7.441068139963168, + "grad_norm": 0.17913730442523956, + "learning_rate": 1.6215298322613347e-05, + "loss": 1.7287, + "step": 24243 + }, + { + "epoch": 7.441375076734193, + "grad_norm": 0.21801607310771942, + "learning_rate": 1.6211634281384486e-05, + "loss": 1.8157, + "step": 24244 + }, + { + "epoch": 7.4416820135052175, + "grad_norm": 0.23132582008838654, + "learning_rate": 1.6207970574072056e-05, + "loss": 1.7921, + "step": 24245 + }, + { + "epoch": 7.441988950276243, + "grad_norm": 0.18289685249328613, + "learning_rate": 
1.6204307200712266e-05, + "loss": 1.7222, + "step": 24246 + }, + { + "epoch": 7.442295887047268, + "grad_norm": 0.15289388597011566, + "learning_rate": 1.620064416134132e-05, + "loss": 1.6409, + "step": 24247 + }, + { + "epoch": 7.4426028238182935, + "grad_norm": 0.1684839129447937, + "learning_rate": 1.619698145599542e-05, + "loss": 1.7362, + "step": 24248 + }, + { + "epoch": 7.442909760589319, + "grad_norm": 0.16812102496623993, + "learning_rate": 1.619331908471076e-05, + "loss": 1.6849, + "step": 24249 + }, + { + "epoch": 7.443216697360343, + "grad_norm": 0.16095775365829468, + "learning_rate": 1.6189657047523526e-05, + "loss": 1.7032, + "step": 24250 + }, + { + "epoch": 7.443523634131369, + "grad_norm": 0.167144313454628, + "learning_rate": 1.6185995344469946e-05, + "loss": 1.6539, + "step": 24251 + }, + { + "epoch": 7.443830570902394, + "grad_norm": 0.18129989504814148, + "learning_rate": 1.618233397558616e-05, + "loss": 1.7057, + "step": 24252 + }, + { + "epoch": 7.444137507673419, + "grad_norm": 0.17299556732177734, + "learning_rate": 1.6178672940908374e-05, + "loss": 1.6965, + "step": 24253 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 0.14944438636302948, + "learning_rate": 1.6175012240472765e-05, + "loss": 1.6666, + "step": 24254 + }, + { + "epoch": 7.44475138121547, + "grad_norm": 0.20333626866340637, + "learning_rate": 1.6171351874315494e-05, + "loss": 1.748, + "step": 24255 + }, + { + "epoch": 7.445058317986494, + "grad_norm": 0.2233068197965622, + "learning_rate": 1.6167691842472783e-05, + "loss": 1.7662, + "step": 24256 + }, + { + "epoch": 7.44536525475752, + "grad_norm": 0.22628507018089294, + "learning_rate": 1.6164032144980738e-05, + "loss": 1.747, + "step": 24257 + }, + { + "epoch": 7.445672191528545, + "grad_norm": 0.18167820572853088, + "learning_rate": 1.6160372781875594e-05, + "loss": 1.7311, + "step": 24258 + }, + { + "epoch": 7.44597912829957, + "grad_norm": 0.1975218504667282, + "learning_rate": 1.6156713753193446e-05, + "loss": 
1.7334, + "step": 24259 + }, + { + "epoch": 7.446286065070596, + "grad_norm": 0.18606813251972198, + "learning_rate": 1.6153055058970508e-05, + "loss": 1.7118, + "step": 24260 + }, + { + "epoch": 7.44659300184162, + "grad_norm": 0.14817847311496735, + "learning_rate": 1.6149396699242914e-05, + "loss": 1.6385, + "step": 24261 + }, + { + "epoch": 7.4468999386126455, + "grad_norm": 0.19018684327602386, + "learning_rate": 1.6145738674046825e-05, + "loss": 1.7511, + "step": 24262 + }, + { + "epoch": 7.447206875383671, + "grad_norm": 0.17089374363422394, + "learning_rate": 1.6142080983418385e-05, + "loss": 1.7523, + "step": 24263 + }, + { + "epoch": 7.447513812154696, + "grad_norm": 0.16370832920074463, + "learning_rate": 1.613842362739375e-05, + "loss": 1.6636, + "step": 24264 + }, + { + "epoch": 7.4478207489257215, + "grad_norm": 0.16432829201221466, + "learning_rate": 1.6134766606009055e-05, + "loss": 1.7355, + "step": 24265 + }, + { + "epoch": 7.448127685696747, + "grad_norm": 0.15270906686782837, + "learning_rate": 1.6131109919300453e-05, + "loss": 1.7169, + "step": 24266 + }, + { + "epoch": 7.448434622467771, + "grad_norm": 0.14986950159072876, + "learning_rate": 1.6127453567304053e-05, + "loss": 1.7021, + "step": 24267 + }, + { + "epoch": 7.448741559238797, + "grad_norm": 0.17727383971214294, + "learning_rate": 1.6123797550056042e-05, + "loss": 1.7144, + "step": 24268 + }, + { + "epoch": 7.449048496009822, + "grad_norm": 0.1471523940563202, + "learning_rate": 1.6120141867592504e-05, + "loss": 1.694, + "step": 24269 + }, + { + "epoch": 7.449355432780847, + "grad_norm": 0.15561319887638092, + "learning_rate": 1.611648651994958e-05, + "loss": 1.6672, + "step": 24270 + }, + { + "epoch": 7.449662369551873, + "grad_norm": 0.19121745228767395, + "learning_rate": 1.61128315071634e-05, + "loss": 1.7317, + "step": 24271 + }, + { + "epoch": 7.449969306322897, + "grad_norm": 0.27333202958106995, + "learning_rate": 1.6109176829270062e-05, + "loss": 1.7943, + "step": 24272 + }, 
+ { + "epoch": 7.4502762430939224, + "grad_norm": 0.16996058821678162, + "learning_rate": 1.6105522486305736e-05, + "loss": 1.6883, + "step": 24273 + }, + { + "epoch": 7.450583179864948, + "grad_norm": 0.17687207460403442, + "learning_rate": 1.610186847830647e-05, + "loss": 1.6967, + "step": 24274 + }, + { + "epoch": 7.450890116635973, + "grad_norm": 0.2191249281167984, + "learning_rate": 1.6098214805308436e-05, + "loss": 1.7644, + "step": 24275 + }, + { + "epoch": 7.4511970534069984, + "grad_norm": 0.17267808318138123, + "learning_rate": 1.6094561467347684e-05, + "loss": 1.6963, + "step": 24276 + }, + { + "epoch": 7.451503990178024, + "grad_norm": 0.16276031732559204, + "learning_rate": 1.609090846446037e-05, + "loss": 1.6795, + "step": 24277 + }, + { + "epoch": 7.451810926949048, + "grad_norm": 0.16677677631378174, + "learning_rate": 1.6087255796682572e-05, + "loss": 1.699, + "step": 24278 + }, + { + "epoch": 7.452117863720074, + "grad_norm": 0.17163679003715515, + "learning_rate": 1.6083603464050383e-05, + "loss": 1.6906, + "step": 24279 + }, + { + "epoch": 7.452424800491099, + "grad_norm": 0.16087757050991058, + "learning_rate": 1.6079951466599908e-05, + "loss": 1.7173, + "step": 24280 + }, + { + "epoch": 7.452731737262124, + "grad_norm": 0.19389556348323822, + "learning_rate": 1.6076299804367228e-05, + "loss": 1.6985, + "step": 24281 + }, + { + "epoch": 7.45303867403315, + "grad_norm": 0.20400559902191162, + "learning_rate": 1.6072648477388447e-05, + "loss": 1.7336, + "step": 24282 + }, + { + "epoch": 7.453345610804174, + "grad_norm": 0.16443994641304016, + "learning_rate": 1.6068997485699632e-05, + "loss": 1.6909, + "step": 24283 + }, + { + "epoch": 7.453652547575199, + "grad_norm": 0.18333028256893158, + "learning_rate": 1.606534682933686e-05, + "loss": 1.6749, + "step": 24284 + }, + { + "epoch": 7.453959484346225, + "grad_norm": 0.21596840023994446, + "learning_rate": 1.6061696508336244e-05, + "loss": 1.7856, + "step": 24285 + }, + { + "epoch": 
7.45426642111725, + "grad_norm": 0.18656609952449799, + "learning_rate": 1.6058046522733827e-05, + "loss": 1.6892, + "step": 24286 + }, + { + "epoch": 7.454573357888275, + "grad_norm": 0.18110665678977966, + "learning_rate": 1.6054396872565687e-05, + "loss": 1.7063, + "step": 24287 + }, + { + "epoch": 7.4548802946593, + "grad_norm": 0.19452248513698578, + "learning_rate": 1.605074755786789e-05, + "loss": 1.7637, + "step": 24288 + }, + { + "epoch": 7.455187231430325, + "grad_norm": 0.18945640325546265, + "learning_rate": 1.604709857867649e-05, + "loss": 1.7498, + "step": 24289 + }, + { + "epoch": 7.4554941682013505, + "grad_norm": 0.1847696155309677, + "learning_rate": 1.6043449935027592e-05, + "loss": 1.702, + "step": 24290 + }, + { + "epoch": 7.455801104972376, + "grad_norm": 0.18882444500923157, + "learning_rate": 1.6039801626957197e-05, + "loss": 1.728, + "step": 24291 + }, + { + "epoch": 7.456108041743401, + "grad_norm": 0.1981150358915329, + "learning_rate": 1.603615365450142e-05, + "loss": 1.7114, + "step": 24292 + }, + { + "epoch": 7.456414978514426, + "grad_norm": 0.2305375188589096, + "learning_rate": 1.6032506017696242e-05, + "loss": 1.7234, + "step": 24293 + }, + { + "epoch": 7.456721915285451, + "grad_norm": 0.17539730668067932, + "learning_rate": 1.6028858716577764e-05, + "loss": 1.6305, + "step": 24294 + }, + { + "epoch": 7.457028852056476, + "grad_norm": 0.19684432446956635, + "learning_rate": 1.602521175118202e-05, + "loss": 1.6958, + "step": 24295 + }, + { + "epoch": 7.457335788827502, + "grad_norm": 0.20957234501838684, + "learning_rate": 1.602156512154504e-05, + "loss": 1.6984, + "step": 24296 + }, + { + "epoch": 7.457642725598527, + "grad_norm": 0.18523702025413513, + "learning_rate": 1.6017918827702877e-05, + "loss": 1.7817, + "step": 24297 + }, + { + "epoch": 7.457949662369552, + "grad_norm": 0.1964758187532425, + "learning_rate": 1.601427286969155e-05, + "loss": 1.7597, + "step": 24298 + }, + { + "epoch": 7.458256599140577, + "grad_norm": 
0.199961856007576, + "learning_rate": 1.6010627247547106e-05, + "loss": 1.6988, + "step": 24299 + }, + { + "epoch": 7.458563535911602, + "grad_norm": 0.16149461269378662, + "learning_rate": 1.6006981961305555e-05, + "loss": 1.6673, + "step": 24300 + }, + { + "epoch": 7.458870472682627, + "grad_norm": 0.2198258489370346, + "learning_rate": 1.600333701100293e-05, + "loss": 1.7159, + "step": 24301 + }, + { + "epoch": 7.459177409453653, + "grad_norm": 0.157994344830513, + "learning_rate": 1.5999692396675277e-05, + "loss": 1.7118, + "step": 24302 + }, + { + "epoch": 7.459484346224678, + "grad_norm": 0.21911758184432983, + "learning_rate": 1.5996048118358575e-05, + "loss": 1.7209, + "step": 24303 + }, + { + "epoch": 7.4597912829957025, + "grad_norm": 0.20648738741874695, + "learning_rate": 1.599240417608886e-05, + "loss": 1.7844, + "step": 24304 + }, + { + "epoch": 7.460098219766728, + "grad_norm": 0.18746837973594666, + "learning_rate": 1.598876056990214e-05, + "loss": 1.7079, + "step": 24305 + }, + { + "epoch": 7.460405156537753, + "grad_norm": 0.17767341434955597, + "learning_rate": 1.5985117299834407e-05, + "loss": 1.7579, + "step": 24306 + }, + { + "epoch": 7.4607120933087785, + "grad_norm": 0.18997585773468018, + "learning_rate": 1.598147436592171e-05, + "loss": 1.7556, + "step": 24307 + }, + { + "epoch": 7.461019030079804, + "grad_norm": 0.19356711208820343, + "learning_rate": 1.597783176819999e-05, + "loss": 1.7315, + "step": 24308 + }, + { + "epoch": 7.461325966850829, + "grad_norm": 0.23354102671146393, + "learning_rate": 1.597418950670531e-05, + "loss": 1.7622, + "step": 24309 + }, + { + "epoch": 7.461632903621854, + "grad_norm": 0.18773409724235535, + "learning_rate": 1.5970547581473604e-05, + "loss": 1.6582, + "step": 24310 + }, + { + "epoch": 7.461939840392879, + "grad_norm": 0.23704196512699127, + "learning_rate": 1.596690599254091e-05, + "loss": 1.7207, + "step": 24311 + }, + { + "epoch": 7.462246777163904, + "grad_norm": 0.1943788379430771, + 
"learning_rate": 1.596326473994319e-05, + "loss": 1.696, + "step": 24312 + }, + { + "epoch": 7.46255371393493, + "grad_norm": 0.22303985059261322, + "learning_rate": 1.595962382371644e-05, + "loss": 1.6963, + "step": 24313 + }, + { + "epoch": 7.462860650705955, + "grad_norm": 0.20158524811267853, + "learning_rate": 1.5955983243896643e-05, + "loss": 1.7017, + "step": 24314 + }, + { + "epoch": 7.463167587476979, + "grad_norm": 0.18768194317817688, + "learning_rate": 1.595234300051977e-05, + "loss": 1.6743, + "step": 24315 + }, + { + "epoch": 7.463474524248005, + "grad_norm": 0.27407020330429077, + "learning_rate": 1.5948703093621803e-05, + "loss": 1.7522, + "step": 24316 + }, + { + "epoch": 7.46378146101903, + "grad_norm": 0.2027997523546219, + "learning_rate": 1.5945063523238706e-05, + "loss": 1.7515, + "step": 24317 + }, + { + "epoch": 7.464088397790055, + "grad_norm": 0.2728271782398224, + "learning_rate": 1.5941424289406454e-05, + "loss": 1.7611, + "step": 24318 + }, + { + "epoch": 7.464395334561081, + "grad_norm": 0.1704578548669815, + "learning_rate": 1.593778539216101e-05, + "loss": 1.6602, + "step": 24319 + }, + { + "epoch": 7.464702271332105, + "grad_norm": 0.19684311747550964, + "learning_rate": 1.5934146831538332e-05, + "loss": 1.6824, + "step": 24320 + }, + { + "epoch": 7.4650092081031305, + "grad_norm": 0.196905255317688, + "learning_rate": 1.5930508607574386e-05, + "loss": 1.691, + "step": 24321 + }, + { + "epoch": 7.465316144874156, + "grad_norm": 0.18543855845928192, + "learning_rate": 1.5926870720305122e-05, + "loss": 1.6936, + "step": 24322 + }, + { + "epoch": 7.465623081645181, + "grad_norm": 0.24634000658988953, + "learning_rate": 1.592323316976647e-05, + "loss": 1.6857, + "step": 24323 + }, + { + "epoch": 7.4659300184162065, + "grad_norm": 0.1976090669631958, + "learning_rate": 1.5919595955994444e-05, + "loss": 1.7248, + "step": 24324 + }, + { + "epoch": 7.466236955187231, + "grad_norm": 0.21902409195899963, + "learning_rate": 
1.5915959079024907e-05, + "loss": 1.7184, + "step": 24325 + }, + { + "epoch": 7.466543891958256, + "grad_norm": 0.14501455426216125, + "learning_rate": 1.591232253889387e-05, + "loss": 1.6351, + "step": 24326 + }, + { + "epoch": 7.466850828729282, + "grad_norm": 0.20591090619564056, + "learning_rate": 1.5908686335637213e-05, + "loss": 1.7188, + "step": 24327 + }, + { + "epoch": 7.467157765500307, + "grad_norm": 0.17669445276260376, + "learning_rate": 1.590505046929091e-05, + "loss": 1.6735, + "step": 24328 + }, + { + "epoch": 7.467464702271332, + "grad_norm": 0.19642697274684906, + "learning_rate": 1.590141493989089e-05, + "loss": 1.6599, + "step": 24329 + }, + { + "epoch": 7.467771639042358, + "grad_norm": 0.2049490511417389, + "learning_rate": 1.589777974747307e-05, + "loss": 1.77, + "step": 24330 + }, + { + "epoch": 7.468078575813382, + "grad_norm": 0.1877276450395584, + "learning_rate": 1.5894144892073377e-05, + "loss": 1.6774, + "step": 24331 + }, + { + "epoch": 7.468385512584407, + "grad_norm": 0.18437768518924713, + "learning_rate": 1.5890510373727735e-05, + "loss": 1.7054, + "step": 24332 + }, + { + "epoch": 7.468692449355433, + "grad_norm": 0.1850978136062622, + "learning_rate": 1.5886876192472062e-05, + "loss": 1.6664, + "step": 24333 + }, + { + "epoch": 7.468999386126458, + "grad_norm": 0.16257111728191376, + "learning_rate": 1.588324234834227e-05, + "loss": 1.7438, + "step": 24334 + }, + { + "epoch": 7.469306322897483, + "grad_norm": 0.1776656061410904, + "learning_rate": 1.5879608841374277e-05, + "loss": 1.6913, + "step": 24335 + }, + { + "epoch": 7.469613259668508, + "grad_norm": 0.183144673705101, + "learning_rate": 1.587597567160398e-05, + "loss": 1.6737, + "step": 24336 + }, + { + "epoch": 7.469920196439533, + "grad_norm": 0.15030701458454132, + "learning_rate": 1.5872342839067306e-05, + "loss": 1.6776, + "step": 24337 + }, + { + "epoch": 7.4702271332105585, + "grad_norm": 0.1987701952457428, + "learning_rate": 1.586871034380013e-05, + "loss": 
1.7119, + "step": 24338 + }, + { + "epoch": 7.470534069981584, + "grad_norm": 0.20000997185707092, + "learning_rate": 1.5865078185838373e-05, + "loss": 1.6794, + "step": 24339 + }, + { + "epoch": 7.470841006752609, + "grad_norm": 0.1674201786518097, + "learning_rate": 1.5861446365217902e-05, + "loss": 1.6826, + "step": 24340 + }, + { + "epoch": 7.4711479435236345, + "grad_norm": 0.22385969758033752, + "learning_rate": 1.585781488197466e-05, + "loss": 1.7012, + "step": 24341 + }, + { + "epoch": 7.471454880294659, + "grad_norm": 0.18635201454162598, + "learning_rate": 1.585418373614446e-05, + "loss": 1.7086, + "step": 24342 + }, + { + "epoch": 7.471761817065684, + "grad_norm": 0.17345300316810608, + "learning_rate": 1.5850552927763274e-05, + "loss": 1.7068, + "step": 24343 + }, + { + "epoch": 7.47206875383671, + "grad_norm": 0.1777433305978775, + "learning_rate": 1.5846922456866904e-05, + "loss": 1.6618, + "step": 24344 + }, + { + "epoch": 7.472375690607735, + "grad_norm": 0.1821276843547821, + "learning_rate": 1.584329232349128e-05, + "loss": 1.7451, + "step": 24345 + }, + { + "epoch": 7.47268262737876, + "grad_norm": 0.1714404970407486, + "learning_rate": 1.5839662527672262e-05, + "loss": 1.7289, + "step": 24346 + }, + { + "epoch": 7.472989564149785, + "grad_norm": 0.159423828125, + "learning_rate": 1.583603306944572e-05, + "loss": 1.667, + "step": 24347 + }, + { + "epoch": 7.47329650092081, + "grad_norm": 0.22563552856445312, + "learning_rate": 1.5832403948847523e-05, + "loss": 1.7755, + "step": 24348 + }, + { + "epoch": 7.473603437691835, + "grad_norm": 0.17239433526992798, + "learning_rate": 1.582877516591354e-05, + "loss": 1.6577, + "step": 24349 + }, + { + "epoch": 7.473910374462861, + "grad_norm": 0.1671951860189438, + "learning_rate": 1.5825146720679624e-05, + "loss": 1.7438, + "step": 24350 + }, + { + "epoch": 7.474217311233886, + "grad_norm": 0.1802397519350052, + "learning_rate": 1.582151861318164e-05, + "loss": 1.686, + "step": 24351 + }, + { + "epoch": 
7.474524248004911, + "grad_norm": 0.21424922347068787, + "learning_rate": 1.5817890843455442e-05, + "loss": 1.7871, + "step": 24352 + }, + { + "epoch": 7.474831184775936, + "grad_norm": 0.2275305986404419, + "learning_rate": 1.5814263411536884e-05, + "loss": 1.7461, + "step": 24353 + }, + { + "epoch": 7.475138121546961, + "grad_norm": 0.1682458072900772, + "learning_rate": 1.581063631746181e-05, + "loss": 1.6362, + "step": 24354 + }, + { + "epoch": 7.475445058317987, + "grad_norm": 0.165358304977417, + "learning_rate": 1.5807009561266068e-05, + "loss": 1.7057, + "step": 24355 + }, + { + "epoch": 7.475751995089012, + "grad_norm": 0.18032164871692657, + "learning_rate": 1.5803383142985496e-05, + "loss": 1.7645, + "step": 24356 + }, + { + "epoch": 7.476058931860037, + "grad_norm": 0.1694670170545578, + "learning_rate": 1.5799757062655935e-05, + "loss": 1.6848, + "step": 24357 + }, + { + "epoch": 7.476365868631062, + "grad_norm": 0.17879679799079895, + "learning_rate": 1.5796131320313225e-05, + "loss": 1.7425, + "step": 24358 + }, + { + "epoch": 7.476672805402087, + "grad_norm": 0.16042493283748627, + "learning_rate": 1.579250591599317e-05, + "loss": 1.6389, + "step": 24359 + }, + { + "epoch": 7.476979742173112, + "grad_norm": 0.19134685397148132, + "learning_rate": 1.5788880849731658e-05, + "loss": 1.7504, + "step": 24360 + }, + { + "epoch": 7.477286678944138, + "grad_norm": 0.16545429825782776, + "learning_rate": 1.578525612156444e-05, + "loss": 1.7184, + "step": 24361 + }, + { + "epoch": 7.477593615715163, + "grad_norm": 0.18139231204986572, + "learning_rate": 1.5781631731527397e-05, + "loss": 1.6794, + "step": 24362 + }, + { + "epoch": 7.4779005524861875, + "grad_norm": 0.19043901562690735, + "learning_rate": 1.5778007679656326e-05, + "loss": 1.7184, + "step": 24363 + }, + { + "epoch": 7.478207489257213, + "grad_norm": 0.19410157203674316, + "learning_rate": 1.577438396598703e-05, + "loss": 1.7599, + "step": 24364 + }, + { + "epoch": 7.478514426028238, + 
"grad_norm": 0.18464741110801697, + "learning_rate": 1.5770760590555344e-05, + "loss": 1.652, + "step": 24365 + }, + { + "epoch": 7.4788213627992635, + "grad_norm": 0.19959059357643127, + "learning_rate": 1.576713755339706e-05, + "loss": 1.7509, + "step": 24366 + }, + { + "epoch": 7.479128299570289, + "grad_norm": 0.20312312245368958, + "learning_rate": 1.576351485454799e-05, + "loss": 1.758, + "step": 24367 + }, + { + "epoch": 7.479435236341313, + "grad_norm": 0.23994365334510803, + "learning_rate": 1.5759892494043933e-05, + "loss": 1.7124, + "step": 24368 + }, + { + "epoch": 7.479742173112339, + "grad_norm": 0.22661323845386505, + "learning_rate": 1.575627047192068e-05, + "loss": 1.7251, + "step": 24369 + }, + { + "epoch": 7.480049109883364, + "grad_norm": 0.2599529027938843, + "learning_rate": 1.5752648788214038e-05, + "loss": 1.7351, + "step": 24370 + }, + { + "epoch": 7.480356046654389, + "grad_norm": 0.17298145592212677, + "learning_rate": 1.5749027442959795e-05, + "loss": 1.681, + "step": 24371 + }, + { + "epoch": 7.480662983425415, + "grad_norm": 0.18189257383346558, + "learning_rate": 1.574540643619373e-05, + "loss": 1.6938, + "step": 24372 + }, + { + "epoch": 7.48096992019644, + "grad_norm": 0.2658606767654419, + "learning_rate": 1.5741785767951645e-05, + "loss": 1.7043, + "step": 24373 + }, + { + "epoch": 7.481276856967464, + "grad_norm": 0.17898595333099365, + "learning_rate": 1.573816543826931e-05, + "loss": 1.7299, + "step": 24374 + }, + { + "epoch": 7.48158379373849, + "grad_norm": 0.2529693841934204, + "learning_rate": 1.573454544718251e-05, + "loss": 1.6378, + "step": 24375 + }, + { + "epoch": 7.481890730509515, + "grad_norm": 0.1542833298444748, + "learning_rate": 1.5730925794726993e-05, + "loss": 1.6847, + "step": 24376 + }, + { + "epoch": 7.48219766728054, + "grad_norm": 0.24731594324111938, + "learning_rate": 1.5727306480938586e-05, + "loss": 1.7028, + "step": 24377 + }, + { + "epoch": 7.482504604051566, + "grad_norm": 0.21095556020736694, + 
"learning_rate": 1.572368750585299e-05, + "loss": 1.7371, + "step": 24378 + }, + { + "epoch": 7.48281154082259, + "grad_norm": 0.24208855628967285, + "learning_rate": 1.5720068869506037e-05, + "loss": 1.7982, + "step": 24379 + }, + { + "epoch": 7.4831184775936155, + "grad_norm": 0.23290614783763885, + "learning_rate": 1.571645057193343e-05, + "loss": 1.7443, + "step": 24380 + }, + { + "epoch": 7.483425414364641, + "grad_norm": 0.2146376222372055, + "learning_rate": 1.5712832613170963e-05, + "loss": 1.7258, + "step": 24381 + }, + { + "epoch": 7.483732351135666, + "grad_norm": 0.20540264248847961, + "learning_rate": 1.5709214993254385e-05, + "loss": 1.6495, + "step": 24382 + }, + { + "epoch": 7.4840392879066915, + "grad_norm": 0.16472755372524261, + "learning_rate": 1.570559771221944e-05, + "loss": 1.7118, + "step": 24383 + }, + { + "epoch": 7.484346224677717, + "grad_norm": 0.194668248295784, + "learning_rate": 1.5701980770101876e-05, + "loss": 1.6948, + "step": 24384 + }, + { + "epoch": 7.484653161448741, + "grad_norm": 0.19188909232616425, + "learning_rate": 1.569836416693744e-05, + "loss": 1.7376, + "step": 24385 + }, + { + "epoch": 7.484960098219767, + "grad_norm": 0.1935901939868927, + "learning_rate": 1.569474790276188e-05, + "loss": 1.7009, + "step": 24386 + }, + { + "epoch": 7.485267034990792, + "grad_norm": 0.18449221551418304, + "learning_rate": 1.5691131977610924e-05, + "loss": 1.7542, + "step": 24387 + }, + { + "epoch": 7.485573971761817, + "grad_norm": 0.18543820083141327, + "learning_rate": 1.568751639152031e-05, + "loss": 1.7125, + "step": 24388 + }, + { + "epoch": 7.485880908532843, + "grad_norm": 0.17343461513519287, + "learning_rate": 1.5683901144525776e-05, + "loss": 1.7189, + "step": 24389 + }, + { + "epoch": 7.486187845303867, + "grad_norm": 0.16813276708126068, + "learning_rate": 1.568028623666304e-05, + "loss": 1.6416, + "step": 24390 + }, + { + "epoch": 7.486494782074892, + "grad_norm": 0.16296882927417755, + "learning_rate": 
1.567667166796783e-05, + "loss": 1.6971, + "step": 24391 + }, + { + "epoch": 7.486801718845918, + "grad_norm": 0.206793412566185, + "learning_rate": 1.5673057438475875e-05, + "loss": 1.8139, + "step": 24392 + }, + { + "epoch": 7.487108655616943, + "grad_norm": 0.1937340795993805, + "learning_rate": 1.566944354822286e-05, + "loss": 1.7606, + "step": 24393 + }, + { + "epoch": 7.487415592387968, + "grad_norm": 0.19251857697963715, + "learning_rate": 1.566582999724456e-05, + "loss": 1.7225, + "step": 24394 + }, + { + "epoch": 7.487722529158993, + "grad_norm": 0.1551857739686966, + "learning_rate": 1.566221678557663e-05, + "loss": 1.6546, + "step": 24395 + }, + { + "epoch": 7.488029465930018, + "grad_norm": 0.19435563683509827, + "learning_rate": 1.565860391325482e-05, + "loss": 1.7444, + "step": 24396 + }, + { + "epoch": 7.4883364027010435, + "grad_norm": 0.21196971833705902, + "learning_rate": 1.565499138031479e-05, + "loss": 1.7124, + "step": 24397 + }, + { + "epoch": 7.488643339472069, + "grad_norm": 0.2145242542028427, + "learning_rate": 1.5651379186792276e-05, + "loss": 1.7571, + "step": 24398 + }, + { + "epoch": 7.488950276243094, + "grad_norm": 0.17056338489055634, + "learning_rate": 1.5647767332722964e-05, + "loss": 1.6514, + "step": 24399 + }, + { + "epoch": 7.4892572130141195, + "grad_norm": 0.17161786556243896, + "learning_rate": 1.5644155818142553e-05, + "loss": 1.675, + "step": 24400 + }, + { + "epoch": 7.489564149785144, + "grad_norm": 0.18978877365589142, + "learning_rate": 1.564054464308673e-05, + "loss": 1.7123, + "step": 24401 + }, + { + "epoch": 7.489871086556169, + "grad_norm": 0.16004881262779236, + "learning_rate": 1.5636933807591186e-05, + "loss": 1.6555, + "step": 24402 + }, + { + "epoch": 7.490178023327195, + "grad_norm": 0.19739225506782532, + "learning_rate": 1.56333233116916e-05, + "loss": 1.7441, + "step": 24403 + }, + { + "epoch": 7.49048496009822, + "grad_norm": 0.20770032703876495, + "learning_rate": 1.5629713155423657e-05, + "loss": 
1.6704, + "step": 24404 + }, + { + "epoch": 7.490791896869245, + "grad_norm": 0.17897675931453705, + "learning_rate": 1.5626103338823033e-05, + "loss": 1.7281, + "step": 24405 + }, + { + "epoch": 7.49109883364027, + "grad_norm": 0.20801669359207153, + "learning_rate": 1.5622493861925402e-05, + "loss": 1.7008, + "step": 24406 + }, + { + "epoch": 7.491405770411295, + "grad_norm": 0.2027266025543213, + "learning_rate": 1.5618884724766442e-05, + "loss": 1.7619, + "step": 24407 + }, + { + "epoch": 7.49171270718232, + "grad_norm": 0.19207318127155304, + "learning_rate": 1.5615275927381806e-05, + "loss": 1.6985, + "step": 24408 + }, + { + "epoch": 7.492019643953346, + "grad_norm": 0.19694732129573822, + "learning_rate": 1.5611667469807175e-05, + "loss": 1.7455, + "step": 24409 + }, + { + "epoch": 7.492326580724371, + "grad_norm": 0.170238196849823, + "learning_rate": 1.560805935207818e-05, + "loss": 1.7179, + "step": 24410 + }, + { + "epoch": 7.4926335174953955, + "grad_norm": 0.16890759766101837, + "learning_rate": 1.5604451574230532e-05, + "loss": 1.7323, + "step": 24411 + }, + { + "epoch": 7.492940454266421, + "grad_norm": 0.18043142557144165, + "learning_rate": 1.5600844136299824e-05, + "loss": 1.6958, + "step": 24412 + }, + { + "epoch": 7.493247391037446, + "grad_norm": 0.23966364562511444, + "learning_rate": 1.5597237038321764e-05, + "loss": 1.754, + "step": 24413 + }, + { + "epoch": 7.4935543278084715, + "grad_norm": 0.23342584073543549, + "learning_rate": 1.5593630280331945e-05, + "loss": 1.8008, + "step": 24414 + }, + { + "epoch": 7.493861264579497, + "grad_norm": 0.17365418374538422, + "learning_rate": 1.5590023862366054e-05, + "loss": 1.7166, + "step": 24415 + }, + { + "epoch": 7.494168201350522, + "grad_norm": 0.1934911608695984, + "learning_rate": 1.558641778445971e-05, + "loss": 1.7113, + "step": 24416 + }, + { + "epoch": 7.494475138121547, + "grad_norm": 0.1935805231332779, + "learning_rate": 1.558281204664856e-05, + "loss": 1.7549, + "step": 24417 + }, + { 
+ "epoch": 7.494782074892572, + "grad_norm": 0.18467992544174194, + "learning_rate": 1.5579206648968236e-05, + "loss": 1.6889, + "step": 24418 + }, + { + "epoch": 7.495089011663597, + "grad_norm": 0.17173317074775696, + "learning_rate": 1.5575601591454365e-05, + "loss": 1.686, + "step": 24419 + }, + { + "epoch": 7.495395948434623, + "grad_norm": 0.1706855744123459, + "learning_rate": 1.5571996874142574e-05, + "loss": 1.6747, + "step": 24420 + }, + { + "epoch": 7.495702885205648, + "grad_norm": 0.2233184576034546, + "learning_rate": 1.556839249706849e-05, + "loss": 1.7855, + "step": 24421 + }, + { + "epoch": 7.496009821976672, + "grad_norm": 0.22118456661701202, + "learning_rate": 1.5564788460267733e-05, + "loss": 1.7487, + "step": 24422 + }, + { + "epoch": 7.496316758747698, + "grad_norm": 0.21284142136573792, + "learning_rate": 1.5561184763775916e-05, + "loss": 1.7367, + "step": 24423 + }, + { + "epoch": 7.496623695518723, + "grad_norm": 0.17366403341293335, + "learning_rate": 1.5557581407628656e-05, + "loss": 1.655, + "step": 24424 + }, + { + "epoch": 7.496930632289748, + "grad_norm": 0.19864381849765778, + "learning_rate": 1.555397839186157e-05, + "loss": 1.6691, + "step": 24425 + }, + { + "epoch": 7.497237569060774, + "grad_norm": 0.1787605881690979, + "learning_rate": 1.555037571651025e-05, + "loss": 1.7063, + "step": 24426 + }, + { + "epoch": 7.497544505831799, + "grad_norm": 0.19520068168640137, + "learning_rate": 1.5546773381610302e-05, + "loss": 1.7044, + "step": 24427 + }, + { + "epoch": 7.4978514426028235, + "grad_norm": 0.18771123886108398, + "learning_rate": 1.5543171387197362e-05, + "loss": 1.6959, + "step": 24428 + }, + { + "epoch": 7.498158379373849, + "grad_norm": 0.21876849234104156, + "learning_rate": 1.5539569733306964e-05, + "loss": 1.7486, + "step": 24429 + }, + { + "epoch": 7.498465316144874, + "grad_norm": 0.21685563027858734, + "learning_rate": 1.5535968419974772e-05, + "loss": 1.7541, + "step": 24430 + }, + { + "epoch": 7.4987722529158995, 
+ "grad_norm": 0.19595225155353546, + "learning_rate": 1.5532367447236307e-05, + "loss": 1.6882, + "step": 24431 + }, + { + "epoch": 7.499079189686925, + "grad_norm": 0.18359199166297913, + "learning_rate": 1.5528766815127198e-05, + "loss": 1.687, + "step": 24432 + }, + { + "epoch": 7.499386126457949, + "grad_norm": 0.17955231666564941, + "learning_rate": 1.5525166523683028e-05, + "loss": 1.6759, + "step": 24433 + }, + { + "epoch": 7.499693063228975, + "grad_norm": 0.18786758184432983, + "learning_rate": 1.5521566572939368e-05, + "loss": 1.7118, + "step": 24434 + }, + { + "epoch": 7.5, + "grad_norm": 0.16672605276107788, + "learning_rate": 1.551796696293179e-05, + "loss": 1.6618, + "step": 24435 + }, + { + "epoch": 7.500306936771025, + "grad_norm": 0.17066839337348938, + "learning_rate": 1.5514367693695875e-05, + "loss": 1.6974, + "step": 24436 + }, + { + "epoch": 7.500613873542051, + "grad_norm": 0.17299650609493256, + "learning_rate": 1.5510768765267193e-05, + "loss": 1.7074, + "step": 24437 + }, + { + "epoch": 7.500920810313076, + "grad_norm": 0.17507639527320862, + "learning_rate": 1.5507170177681306e-05, + "loss": 1.7295, + "step": 24438 + }, + { + "epoch": 7.5012277470841005, + "grad_norm": 0.1909082531929016, + "learning_rate": 1.5503571930973786e-05, + "loss": 1.7153, + "step": 24439 + }, + { + "epoch": 7.501534683855126, + "grad_norm": 0.2334289401769638, + "learning_rate": 1.5499974025180185e-05, + "loss": 1.713, + "step": 24440 + }, + { + "epoch": 7.501841620626151, + "grad_norm": 0.18382340669631958, + "learning_rate": 1.5496376460336058e-05, + "loss": 1.6706, + "step": 24441 + }, + { + "epoch": 7.5021485573971765, + "grad_norm": 0.1901310533285141, + "learning_rate": 1.5492779236476967e-05, + "loss": 1.7106, + "step": 24442 + }, + { + "epoch": 7.502455494168201, + "grad_norm": 0.17336180806159973, + "learning_rate": 1.5489182353638452e-05, + "loss": 1.7467, + "step": 24443 + }, + { + "epoch": 7.502762430939226, + "grad_norm": 0.18670998513698578, + 
"learning_rate": 1.548558581185605e-05, + "loss": 1.7101, + "step": 24444 + }, + { + "epoch": 7.503069367710252, + "grad_norm": 0.18341238796710968, + "learning_rate": 1.5481989611165353e-05, + "loss": 1.719, + "step": 24445 + }, + { + "epoch": 7.503376304481277, + "grad_norm": 0.21832694113254547, + "learning_rate": 1.5478393751601833e-05, + "loss": 1.7143, + "step": 24446 + }, + { + "epoch": 7.503683241252302, + "grad_norm": 0.1715303659439087, + "learning_rate": 1.5474798233201094e-05, + "loss": 1.6962, + "step": 24447 + }, + { + "epoch": 7.503990178023328, + "grad_norm": 0.26411953568458557, + "learning_rate": 1.5471203055998595e-05, + "loss": 1.7182, + "step": 24448 + }, + { + "epoch": 7.504297114794352, + "grad_norm": 0.1646965742111206, + "learning_rate": 1.5467608220029926e-05, + "loss": 1.6979, + "step": 24449 + }, + { + "epoch": 7.504604051565377, + "grad_norm": 0.1664915233850479, + "learning_rate": 1.5464013725330595e-05, + "loss": 1.6809, + "step": 24450 + }, + { + "epoch": 7.504910988336403, + "grad_norm": 0.1711970716714859, + "learning_rate": 1.5460419571936125e-05, + "loss": 1.6975, + "step": 24451 + }, + { + "epoch": 7.505217925107428, + "grad_norm": 0.19235998392105103, + "learning_rate": 1.5456825759882028e-05, + "loss": 1.7515, + "step": 24452 + }, + { + "epoch": 7.505524861878453, + "grad_norm": 0.2137441486120224, + "learning_rate": 1.5453232289203822e-05, + "loss": 1.7575, + "step": 24453 + }, + { + "epoch": 7.505831798649478, + "grad_norm": 0.19337041676044464, + "learning_rate": 1.544963915993703e-05, + "loss": 1.776, + "step": 24454 + }, + { + "epoch": 7.506138735420503, + "grad_norm": 0.227366104722023, + "learning_rate": 1.5446046372117152e-05, + "loss": 1.7736, + "step": 24455 + }, + { + "epoch": 7.5064456721915285, + "grad_norm": 0.1712712198495865, + "learning_rate": 1.5442453925779694e-05, + "loss": 1.6663, + "step": 24456 + }, + { + "epoch": 7.506752608962554, + "grad_norm": 0.19359993934631348, + "learning_rate": 
1.5438861820960164e-05, + "loss": 1.6826, + "step": 24457 + }, + { + "epoch": 7.507059545733579, + "grad_norm": 0.22883851826190948, + "learning_rate": 1.5435270057694056e-05, + "loss": 1.7782, + "step": 24458 + }, + { + "epoch": 7.5073664825046045, + "grad_norm": 0.17109328508377075, + "learning_rate": 1.543167863601687e-05, + "loss": 1.7435, + "step": 24459 + }, + { + "epoch": 7.507673419275629, + "grad_norm": 0.21545098721981049, + "learning_rate": 1.54280875559641e-05, + "loss": 1.7277, + "step": 24460 + }, + { + "epoch": 7.507980356046654, + "grad_norm": 0.18345774710178375, + "learning_rate": 1.542449681757121e-05, + "loss": 1.7255, + "step": 24461 + }, + { + "epoch": 7.50828729281768, + "grad_norm": 0.15472757816314697, + "learning_rate": 1.5420906420873744e-05, + "loss": 1.6615, + "step": 24462 + }, + { + "epoch": 7.508594229588705, + "grad_norm": 0.2084251195192337, + "learning_rate": 1.5417316365907113e-05, + "loss": 1.6747, + "step": 24463 + }, + { + "epoch": 7.50890116635973, + "grad_norm": 0.19010984897613525, + "learning_rate": 1.5413726652706868e-05, + "loss": 1.7188, + "step": 24464 + }, + { + "epoch": 7.509208103130755, + "grad_norm": 0.22481444478034973, + "learning_rate": 1.5410137281308408e-05, + "loss": 1.8028, + "step": 24465 + }, + { + "epoch": 7.50951503990178, + "grad_norm": 0.22309516370296478, + "learning_rate": 1.5406548251747266e-05, + "loss": 1.7806, + "step": 24466 + }, + { + "epoch": 7.509821976672805, + "grad_norm": 0.19050204753875732, + "learning_rate": 1.540295956405889e-05, + "loss": 1.7188, + "step": 24467 + }, + { + "epoch": 7.510128913443831, + "grad_norm": 0.1956445276737213, + "learning_rate": 1.5399371218278745e-05, + "loss": 1.7468, + "step": 24468 + }, + { + "epoch": 7.510435850214856, + "grad_norm": 0.3492142856121063, + "learning_rate": 1.5395783214442294e-05, + "loss": 1.7502, + "step": 24469 + }, + { + "epoch": 7.510742786985881, + "grad_norm": 0.15318654477596283, + "learning_rate": 1.5392195552584997e-05, + "loss": 
1.6782, + "step": 24470 + }, + { + "epoch": 7.511049723756906, + "grad_norm": 0.18576723337173462, + "learning_rate": 1.5388608232742308e-05, + "loss": 1.7455, + "step": 24471 + }, + { + "epoch": 7.511356660527931, + "grad_norm": 0.14923253655433655, + "learning_rate": 1.5385021254949677e-05, + "loss": 1.687, + "step": 24472 + }, + { + "epoch": 7.5116635972989565, + "grad_norm": 0.17453742027282715, + "learning_rate": 1.5381434619242553e-05, + "loss": 1.7072, + "step": 24473 + }, + { + "epoch": 7.511970534069982, + "grad_norm": 0.18869875371456146, + "learning_rate": 1.5377848325656384e-05, + "loss": 1.7681, + "step": 24474 + }, + { + "epoch": 7.512277470841006, + "grad_norm": 0.22205953299999237, + "learning_rate": 1.5374262374226612e-05, + "loss": 1.7526, + "step": 24475 + }, + { + "epoch": 7.512584407612032, + "grad_norm": 0.1634155809879303, + "learning_rate": 1.537067676498867e-05, + "loss": 1.704, + "step": 24476 + }, + { + "epoch": 7.512891344383057, + "grad_norm": 0.19530873000621796, + "learning_rate": 1.5367091497978004e-05, + "loss": 1.7469, + "step": 24477 + }, + { + "epoch": 7.513198281154082, + "grad_norm": 0.17038139700889587, + "learning_rate": 1.5363506573230017e-05, + "loss": 1.6363, + "step": 24478 + }, + { + "epoch": 7.513505217925108, + "grad_norm": 0.17695361375808716, + "learning_rate": 1.535992199078019e-05, + "loss": 1.7191, + "step": 24479 + }, + { + "epoch": 7.513812154696133, + "grad_norm": 0.2216692715883255, + "learning_rate": 1.535633775066389e-05, + "loss": 1.8042, + "step": 24480 + }, + { + "epoch": 7.514119091467157, + "grad_norm": 0.16862058639526367, + "learning_rate": 1.5352753852916595e-05, + "loss": 1.697, + "step": 24481 + }, + { + "epoch": 7.514426028238183, + "grad_norm": 0.20376496016979218, + "learning_rate": 1.5349170297573662e-05, + "loss": 1.7274, + "step": 24482 + }, + { + "epoch": 7.514732965009208, + "grad_norm": 0.16290763020515442, + "learning_rate": 1.5345587084670554e-05, + "loss": 1.6929, + "step": 24483 + }, + 
{ + "epoch": 7.515039901780233, + "grad_norm": 0.21416328847408295, + "learning_rate": 1.5342004214242667e-05, + "loss": 1.756, + "step": 24484 + }, + { + "epoch": 7.515346838551259, + "grad_norm": 0.14708222448825836, + "learning_rate": 1.533842168632541e-05, + "loss": 1.6816, + "step": 24485 + }, + { + "epoch": 7.515653775322283, + "grad_norm": 0.1860494166612625, + "learning_rate": 1.5334839500954178e-05, + "loss": 1.7114, + "step": 24486 + }, + { + "epoch": 7.5159607120933085, + "grad_norm": 0.16551998257637024, + "learning_rate": 1.533125765816439e-05, + "loss": 1.6564, + "step": 24487 + }, + { + "epoch": 7.516267648864334, + "grad_norm": 0.16971731185913086, + "learning_rate": 1.5327676157991428e-05, + "loss": 1.6722, + "step": 24488 + }, + { + "epoch": 7.516574585635359, + "grad_norm": 0.17433905601501465, + "learning_rate": 1.532409500047069e-05, + "loss": 1.6944, + "step": 24489 + }, + { + "epoch": 7.5168815224063845, + "grad_norm": 0.15625490248203278, + "learning_rate": 1.5320514185637575e-05, + "loss": 1.6997, + "step": 24490 + }, + { + "epoch": 7.51718845917741, + "grad_norm": 0.19038623571395874, + "learning_rate": 1.531693371352746e-05, + "loss": 1.6999, + "step": 24491 + }, + { + "epoch": 7.517495395948434, + "grad_norm": 0.16037517786026, + "learning_rate": 1.5313353584175736e-05, + "loss": 1.6568, + "step": 24492 + }, + { + "epoch": 7.51780233271946, + "grad_norm": 0.1515430361032486, + "learning_rate": 1.5309773797617787e-05, + "loss": 1.693, + "step": 24493 + }, + { + "epoch": 7.518109269490485, + "grad_norm": 0.1792028695344925, + "learning_rate": 1.530619435388898e-05, + "loss": 1.7034, + "step": 24494 + }, + { + "epoch": 7.51841620626151, + "grad_norm": 0.18456964194774628, + "learning_rate": 1.530261525302468e-05, + "loss": 1.7565, + "step": 24495 + }, + { + "epoch": 7.518723143032536, + "grad_norm": 0.17504090070724487, + "learning_rate": 1.529903649506031e-05, + "loss": 1.7121, + "step": 24496 + }, + { + "epoch": 7.51903007980356, + 
"grad_norm": 0.19688715040683746, + "learning_rate": 1.529545808003116e-05, + "loss": 1.7507, + "step": 24497 + }, + { + "epoch": 7.519337016574585, + "grad_norm": 0.21039338409900665, + "learning_rate": 1.529188000797267e-05, + "loss": 1.709, + "step": 24498 + }, + { + "epoch": 7.519643953345611, + "grad_norm": 0.18255522847175598, + "learning_rate": 1.5288302278920136e-05, + "loss": 1.7497, + "step": 24499 + }, + { + "epoch": 7.519950890116636, + "grad_norm": 0.19913412630558014, + "learning_rate": 1.5284724892908958e-05, + "loss": 1.7244, + "step": 24500 + }, + { + "epoch": 7.520257826887661, + "grad_norm": 0.15792223811149597, + "learning_rate": 1.5281147849974476e-05, + "loss": 1.6916, + "step": 24501 + }, + { + "epoch": 7.520564763658687, + "grad_norm": 0.2078406661748886, + "learning_rate": 1.5277571150152038e-05, + "loss": 1.6959, + "step": 24502 + }, + { + "epoch": 7.520871700429711, + "grad_norm": 0.15596020221710205, + "learning_rate": 1.5273994793477e-05, + "loss": 1.7217, + "step": 24503 + }, + { + "epoch": 7.5211786372007365, + "grad_norm": 0.18951189517974854, + "learning_rate": 1.527041877998469e-05, + "loss": 1.7322, + "step": 24504 + }, + { + "epoch": 7.521485573971762, + "grad_norm": 0.16445964574813843, + "learning_rate": 1.526684310971046e-05, + "loss": 1.6668, + "step": 24505 + }, + { + "epoch": 7.521792510742787, + "grad_norm": 0.19513604044914246, + "learning_rate": 1.5263267782689644e-05, + "loss": 1.7464, + "step": 24506 + }, + { + "epoch": 7.5220994475138125, + "grad_norm": 0.20289716124534607, + "learning_rate": 1.525969279895758e-05, + "loss": 1.7472, + "step": 24507 + }, + { + "epoch": 7.522406384284837, + "grad_norm": 0.1716226041316986, + "learning_rate": 1.5256118158549588e-05, + "loss": 1.6872, + "step": 24508 + }, + { + "epoch": 7.522713321055862, + "grad_norm": 0.18939872086048126, + "learning_rate": 1.5252543861501006e-05, + "loss": 1.7365, + "step": 24509 + }, + { + "epoch": 7.523020257826888, + "grad_norm": 
0.21382616460323334, + "learning_rate": 1.524896990784715e-05, + "loss": 1.7129, + "step": 24510 + }, + { + "epoch": 7.523327194597913, + "grad_norm": 0.18226614594459534, + "learning_rate": 1.5245396297623338e-05, + "loss": 1.7426, + "step": 24511 + }, + { + "epoch": 7.523634131368938, + "grad_norm": 0.15880146622657776, + "learning_rate": 1.5241823030864893e-05, + "loss": 1.6848, + "step": 24512 + }, + { + "epoch": 7.523941068139964, + "grad_norm": 0.1782255917787552, + "learning_rate": 1.5238250107607121e-05, + "loss": 1.7263, + "step": 24513 + }, + { + "epoch": 7.524248004910988, + "grad_norm": 0.20365844666957855, + "learning_rate": 1.5234677527885328e-05, + "loss": 1.7035, + "step": 24514 + }, + { + "epoch": 7.524554941682013, + "grad_norm": 0.1776183694601059, + "learning_rate": 1.5231105291734855e-05, + "loss": 1.6837, + "step": 24515 + }, + { + "epoch": 7.524861878453039, + "grad_norm": 0.14594987034797668, + "learning_rate": 1.5227533399190946e-05, + "loss": 1.6428, + "step": 24516 + }, + { + "epoch": 7.525168815224064, + "grad_norm": 0.19371397793293, + "learning_rate": 1.5223961850288947e-05, + "loss": 1.7108, + "step": 24517 + }, + { + "epoch": 7.525475751995089, + "grad_norm": 0.1695355474948883, + "learning_rate": 1.5220390645064148e-05, + "loss": 1.6777, + "step": 24518 + }, + { + "epoch": 7.525782688766114, + "grad_norm": 0.14815635979175568, + "learning_rate": 1.5216819783551828e-05, + "loss": 1.6967, + "step": 24519 + }, + { + "epoch": 7.526089625537139, + "grad_norm": 0.19655495882034302, + "learning_rate": 1.5213249265787283e-05, + "loss": 1.7358, + "step": 24520 + }, + { + "epoch": 7.526396562308165, + "grad_norm": 0.1817864030599594, + "learning_rate": 1.5209679091805795e-05, + "loss": 1.7132, + "step": 24521 + }, + { + "epoch": 7.52670349907919, + "grad_norm": 0.209315687417984, + "learning_rate": 1.5206109261642654e-05, + "loss": 1.7161, + "step": 24522 + }, + { + "epoch": 7.527010435850215, + "grad_norm": 0.18493252992630005, + 
"learning_rate": 1.520253977533313e-05, + "loss": 1.7136, + "step": 24523 + }, + { + "epoch": 7.52731737262124, + "grad_norm": 0.21916678547859192, + "learning_rate": 1.5198970632912508e-05, + "loss": 1.7464, + "step": 24524 + }, + { + "epoch": 7.527624309392265, + "grad_norm": 0.14470849931240082, + "learning_rate": 1.519540183441605e-05, + "loss": 1.6676, + "step": 24525 + }, + { + "epoch": 7.52793124616329, + "grad_norm": 0.20077016949653625, + "learning_rate": 1.5191833379879033e-05, + "loss": 1.7052, + "step": 24526 + }, + { + "epoch": 7.528238182934316, + "grad_norm": 0.17593151330947876, + "learning_rate": 1.5188265269336722e-05, + "loss": 1.7309, + "step": 24527 + }, + { + "epoch": 7.528545119705341, + "grad_norm": 0.20170791447162628, + "learning_rate": 1.518469750282438e-05, + "loss": 1.7335, + "step": 24528 + }, + { + "epoch": 7.5288520564763655, + "grad_norm": 0.1703701615333557, + "learning_rate": 1.518113008037726e-05, + "loss": 1.7141, + "step": 24529 + }, + { + "epoch": 7.529158993247391, + "grad_norm": 0.1897478848695755, + "learning_rate": 1.517756300203062e-05, + "loss": 1.7059, + "step": 24530 + }, + { + "epoch": 7.529465930018416, + "grad_norm": 0.17487141489982605, + "learning_rate": 1.5173996267819695e-05, + "loss": 1.7559, + "step": 24531 + }, + { + "epoch": 7.5297728667894415, + "grad_norm": 0.19167299568653107, + "learning_rate": 1.5170429877779785e-05, + "loss": 1.7287, + "step": 24532 + }, + { + "epoch": 7.530079803560467, + "grad_norm": 0.19433172047138214, + "learning_rate": 1.5166863831946072e-05, + "loss": 1.7182, + "step": 24533 + }, + { + "epoch": 7.530386740331492, + "grad_norm": 0.293734073638916, + "learning_rate": 1.5163298130353853e-05, + "loss": 1.7362, + "step": 24534 + }, + { + "epoch": 7.530693677102517, + "grad_norm": 0.18647685647010803, + "learning_rate": 1.515973277303831e-05, + "loss": 1.7271, + "step": 24535 + }, + { + "epoch": 7.531000613873542, + "grad_norm": 0.20918485522270203, + "learning_rate": 
1.5156167760034729e-05, + "loss": 1.7225, + "step": 24536 + }, + { + "epoch": 7.531307550644567, + "grad_norm": 0.22056303918361664, + "learning_rate": 1.5152603091378315e-05, + "loss": 1.6524, + "step": 24537 + }, + { + "epoch": 7.531614487415593, + "grad_norm": 0.13695760071277618, + "learning_rate": 1.5149038767104307e-05, + "loss": 1.6639, + "step": 24538 + }, + { + "epoch": 7.531921424186618, + "grad_norm": 0.25396111607551575, + "learning_rate": 1.514547478724792e-05, + "loss": 1.7025, + "step": 24539 + }, + { + "epoch": 7.532228360957642, + "grad_norm": 0.18192961812019348, + "learning_rate": 1.5141911151844384e-05, + "loss": 1.7288, + "step": 24540 + }, + { + "epoch": 7.532535297728668, + "grad_norm": 0.24748951196670532, + "learning_rate": 1.5138347860928908e-05, + "loss": 1.7379, + "step": 24541 + }, + { + "epoch": 7.532842234499693, + "grad_norm": 0.1841045767068863, + "learning_rate": 1.5134784914536715e-05, + "loss": 1.7876, + "step": 24542 + }, + { + "epoch": 7.533149171270718, + "grad_norm": 0.21867021918296814, + "learning_rate": 1.5131222312703014e-05, + "loss": 1.7608, + "step": 24543 + }, + { + "epoch": 7.533456108041744, + "grad_norm": 0.1972149908542633, + "learning_rate": 1.512766005546301e-05, + "loss": 1.6927, + "step": 24544 + }, + { + "epoch": 7.533763044812769, + "grad_norm": 0.1728486567735672, + "learning_rate": 1.5124098142851906e-05, + "loss": 1.7656, + "step": 24545 + }, + { + "epoch": 7.5340699815837935, + "grad_norm": 0.2591659724712372, + "learning_rate": 1.512053657490491e-05, + "loss": 1.6844, + "step": 24546 + }, + { + "epoch": 7.534376918354819, + "grad_norm": 0.17187906801700592, + "learning_rate": 1.5116975351657215e-05, + "loss": 1.707, + "step": 24547 + }, + { + "epoch": 7.534683855125844, + "grad_norm": 0.26111504435539246, + "learning_rate": 1.5113414473143993e-05, + "loss": 1.7273, + "step": 24548 + }, + { + "epoch": 7.5349907918968695, + "grad_norm": 0.2153446227312088, + "learning_rate": 1.5109853939400498e-05, + 
"loss": 1.7458, + "step": 24549 + }, + { + "epoch": 7.535297728667894, + "grad_norm": 0.20768530666828156, + "learning_rate": 1.5106293750461835e-05, + "loss": 1.749, + "step": 24550 + }, + { + "epoch": 7.535604665438919, + "grad_norm": 0.2211574763059616, + "learning_rate": 1.5102733906363264e-05, + "loss": 1.7236, + "step": 24551 + }, + { + "epoch": 7.535911602209945, + "grad_norm": 0.15983305871486664, + "learning_rate": 1.5099174407139905e-05, + "loss": 1.6682, + "step": 24552 + }, + { + "epoch": 7.53621853898097, + "grad_norm": 0.23821383714675903, + "learning_rate": 1.5095615252826967e-05, + "loss": 1.7173, + "step": 24553 + }, + { + "epoch": 7.536525475751995, + "grad_norm": 0.1726350039243698, + "learning_rate": 1.5092056443459624e-05, + "loss": 1.7566, + "step": 24554 + }, + { + "epoch": 7.536832412523021, + "grad_norm": 0.19859814643859863, + "learning_rate": 1.5088497979073035e-05, + "loss": 1.7005, + "step": 24555 + }, + { + "epoch": 7.537139349294045, + "grad_norm": 0.14776331186294556, + "learning_rate": 1.508493985970239e-05, + "loss": 1.68, + "step": 24556 + }, + { + "epoch": 7.53744628606507, + "grad_norm": 0.20928993821144104, + "learning_rate": 1.50813820853828e-05, + "loss": 1.7536, + "step": 24557 + }, + { + "epoch": 7.537753222836096, + "grad_norm": 0.18914662301540375, + "learning_rate": 1.5077824656149475e-05, + "loss": 1.7476, + "step": 24558 + }, + { + "epoch": 7.538060159607121, + "grad_norm": 0.24415937066078186, + "learning_rate": 1.5074267572037554e-05, + "loss": 1.7225, + "step": 24559 + }, + { + "epoch": 7.538367096378146, + "grad_norm": 0.18504458665847778, + "learning_rate": 1.5070710833082196e-05, + "loss": 1.7028, + "step": 24560 + }, + { + "epoch": 7.538674033149171, + "grad_norm": 0.1846696138381958, + "learning_rate": 1.5067154439318542e-05, + "loss": 1.7204, + "step": 24561 + }, + { + "epoch": 7.538980969920196, + "grad_norm": 0.20846717059612274, + "learning_rate": 1.5063598390781747e-05, + "loss": 1.73, + "step": 24562 + }, 
+ { + "epoch": 7.5392879066912215, + "grad_norm": 0.1950647234916687, + "learning_rate": 1.5060042687506943e-05, + "loss": 1.7008, + "step": 24563 + }, + { + "epoch": 7.539594843462247, + "grad_norm": 0.1880638748407364, + "learning_rate": 1.5056487329529278e-05, + "loss": 1.6965, + "step": 24564 + }, + { + "epoch": 7.539901780233272, + "grad_norm": 0.24405652284622192, + "learning_rate": 1.5052932316883872e-05, + "loss": 1.7407, + "step": 24565 + }, + { + "epoch": 7.5402087170042975, + "grad_norm": 0.15719062089920044, + "learning_rate": 1.5049377649605906e-05, + "loss": 1.6613, + "step": 24566 + }, + { + "epoch": 7.540515653775322, + "grad_norm": 0.20888090133666992, + "learning_rate": 1.5045823327730441e-05, + "loss": 1.7805, + "step": 24567 + }, + { + "epoch": 7.540822590546347, + "grad_norm": 0.1656443029642105, + "learning_rate": 1.504226935129267e-05, + "loss": 1.7047, + "step": 24568 + }, + { + "epoch": 7.541129527317373, + "grad_norm": 0.28847959637641907, + "learning_rate": 1.503871572032765e-05, + "loss": 1.8711, + "step": 24569 + }, + { + "epoch": 7.541436464088398, + "grad_norm": 0.1724858433008194, + "learning_rate": 1.5035162434870548e-05, + "loss": 1.6734, + "step": 24570 + }, + { + "epoch": 7.541743400859423, + "grad_norm": 0.2064351737499237, + "learning_rate": 1.5031609494956484e-05, + "loss": 1.7032, + "step": 24571 + }, + { + "epoch": 7.542050337630448, + "grad_norm": 0.175388365983963, + "learning_rate": 1.5028056900620513e-05, + "loss": 1.6606, + "step": 24572 + }, + { + "epoch": 7.542357274401473, + "grad_norm": 0.20802471041679382, + "learning_rate": 1.5024504651897814e-05, + "loss": 1.7324, + "step": 24573 + }, + { + "epoch": 7.542664211172498, + "grad_norm": 0.187152698636055, + "learning_rate": 1.502095274882343e-05, + "loss": 1.7222, + "step": 24574 + }, + { + "epoch": 7.542971147943524, + "grad_norm": 0.20112092792987823, + "learning_rate": 1.5017401191432511e-05, + "loss": 1.6959, + "step": 24575 + }, + { + "epoch": 7.543278084714549, 
+ "grad_norm": 0.17968857288360596, + "learning_rate": 1.5013849979760136e-05, + "loss": 1.6957, + "step": 24576 + }, + { + "epoch": 7.543585021485574, + "grad_norm": 0.20532584190368652, + "learning_rate": 1.5010299113841397e-05, + "loss": 1.7471, + "step": 24577 + }, + { + "epoch": 7.543891958256599, + "grad_norm": 0.16475969552993774, + "learning_rate": 1.5006748593711394e-05, + "loss": 1.7665, + "step": 24578 + }, + { + "epoch": 7.544198895027624, + "grad_norm": 0.17632076144218445, + "learning_rate": 1.5003198419405213e-05, + "loss": 1.7317, + "step": 24579 + }, + { + "epoch": 7.5445058317986495, + "grad_norm": 0.18197286128997803, + "learning_rate": 1.4999648590957937e-05, + "loss": 1.7278, + "step": 24580 + }, + { + "epoch": 7.544812768569675, + "grad_norm": 0.18043744564056396, + "learning_rate": 1.4996099108404648e-05, + "loss": 1.7335, + "step": 24581 + }, + { + "epoch": 7.5451197053407, + "grad_norm": 0.17072297632694244, + "learning_rate": 1.4992549971780407e-05, + "loss": 1.7236, + "step": 24582 + }, + { + "epoch": 7.545426642111725, + "grad_norm": 0.17413046956062317, + "learning_rate": 1.4989001181120338e-05, + "loss": 1.6794, + "step": 24583 + }, + { + "epoch": 7.54573357888275, + "grad_norm": 0.1684887856245041, + "learning_rate": 1.4985452736459443e-05, + "loss": 1.718, + "step": 24584 + }, + { + "epoch": 7.546040515653775, + "grad_norm": 0.19497069716453552, + "learning_rate": 1.4981904637832866e-05, + "loss": 1.7323, + "step": 24585 + }, + { + "epoch": 7.546347452424801, + "grad_norm": 0.24838820099830627, + "learning_rate": 1.4978356885275596e-05, + "loss": 1.7584, + "step": 24586 + }, + { + "epoch": 7.546654389195826, + "grad_norm": 0.20870071649551392, + "learning_rate": 1.4974809478822749e-05, + "loss": 1.738, + "step": 24587 + }, + { + "epoch": 7.546961325966851, + "grad_norm": 0.21980242431163788, + "learning_rate": 1.497126241850938e-05, + "loss": 1.763, + "step": 24588 + }, + { + "epoch": 7.547268262737876, + "grad_norm": 
0.2156188189983368, + "learning_rate": 1.4967715704370488e-05, + "loss": 1.7357, + "step": 24589 + }, + { + "epoch": 7.547575199508901, + "grad_norm": 0.1864207684993744, + "learning_rate": 1.4964169336441202e-05, + "loss": 1.676, + "step": 24590 + }, + { + "epoch": 7.547882136279926, + "grad_norm": 0.18940003216266632, + "learning_rate": 1.4960623314756494e-05, + "loss": 1.7614, + "step": 24591 + }, + { + "epoch": 7.548189073050952, + "grad_norm": 0.19220350682735443, + "learning_rate": 1.4957077639351463e-05, + "loss": 1.7266, + "step": 24592 + }, + { + "epoch": 7.548496009821976, + "grad_norm": 0.15492811799049377, + "learning_rate": 1.4953532310261126e-05, + "loss": 1.7359, + "step": 24593 + }, + { + "epoch": 7.5488029465930016, + "grad_norm": 0.25591567158699036, + "learning_rate": 1.4949987327520526e-05, + "loss": 1.7, + "step": 24594 + }, + { + "epoch": 7.549109883364027, + "grad_norm": 0.18157868087291718, + "learning_rate": 1.4946442691164697e-05, + "loss": 1.7204, + "step": 24595 + }, + { + "epoch": 7.549416820135052, + "grad_norm": 0.17679910361766815, + "learning_rate": 1.4942898401228662e-05, + "loss": 1.6871, + "step": 24596 + }, + { + "epoch": 7.5497237569060776, + "grad_norm": 0.2000853717327118, + "learning_rate": 1.4939354457747456e-05, + "loss": 1.7186, + "step": 24597 + }, + { + "epoch": 7.550030693677103, + "grad_norm": 0.19947710633277893, + "learning_rate": 1.49358108607561e-05, + "loss": 1.6853, + "step": 24598 + }, + { + "epoch": 7.550337630448127, + "grad_norm": 0.16325148940086365, + "learning_rate": 1.4932267610289596e-05, + "loss": 1.7027, + "step": 24599 + }, + { + "epoch": 7.550644567219153, + "grad_norm": 0.22839638590812683, + "learning_rate": 1.4928724706383007e-05, + "loss": 1.7887, + "step": 24600 + }, + { + "epoch": 7.550951503990178, + "grad_norm": 0.16242358088493347, + "learning_rate": 1.4925182149071286e-05, + "loss": 1.6617, + "step": 24601 + }, + { + "epoch": 7.551258440761203, + "grad_norm": 0.1674090027809143, + 
"learning_rate": 1.4921639938389504e-05, + "loss": 1.656, + "step": 24602 + }, + { + "epoch": 7.551565377532229, + "grad_norm": 0.1628156453371048, + "learning_rate": 1.4918098074372605e-05, + "loss": 1.683, + "step": 24603 + }, + { + "epoch": 7.551872314303253, + "grad_norm": 0.19156567752361298, + "learning_rate": 1.4914556557055637e-05, + "loss": 1.7174, + "step": 24604 + }, + { + "epoch": 7.5521792510742785, + "grad_norm": 0.19634003937244415, + "learning_rate": 1.4911015386473603e-05, + "loss": 1.6605, + "step": 24605 + }, + { + "epoch": 7.552486187845304, + "grad_norm": 0.19273599982261658, + "learning_rate": 1.490747456266145e-05, + "loss": 1.7092, + "step": 24606 + }, + { + "epoch": 7.552793124616329, + "grad_norm": 0.23641756176948547, + "learning_rate": 1.4903934085654231e-05, + "loss": 1.7524, + "step": 24607 + }, + { + "epoch": 7.5531000613873545, + "grad_norm": 0.19623206555843353, + "learning_rate": 1.490039395548688e-05, + "loss": 1.7281, + "step": 24608 + }, + { + "epoch": 7.55340699815838, + "grad_norm": 0.1978278011083603, + "learning_rate": 1.489685417219442e-05, + "loss": 1.7099, + "step": 24609 + }, + { + "epoch": 7.553713934929404, + "grad_norm": 0.19635866582393646, + "learning_rate": 1.489331473581182e-05, + "loss": 1.7146, + "step": 24610 + }, + { + "epoch": 7.55402087170043, + "grad_norm": 0.2121066302061081, + "learning_rate": 1.4889775646374065e-05, + "loss": 1.7598, + "step": 24611 + }, + { + "epoch": 7.554327808471455, + "grad_norm": 0.17944596707820892, + "learning_rate": 1.4886236903916122e-05, + "loss": 1.6778, + "step": 24612 + }, + { + "epoch": 7.55463474524248, + "grad_norm": 0.15834666788578033, + "learning_rate": 1.488269850847297e-05, + "loss": 1.6498, + "step": 24613 + }, + { + "epoch": 7.554941682013506, + "grad_norm": 0.18597754836082458, + "learning_rate": 1.4879160460079573e-05, + "loss": 1.7145, + "step": 24614 + }, + { + "epoch": 7.55524861878453, + "grad_norm": 0.18300876021385193, + "learning_rate": 
1.4875622758770897e-05, + "loss": 1.7253, + "step": 24615 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 0.17805244028568268, + "learning_rate": 1.4872085404581887e-05, + "loss": 1.7152, + "step": 24616 + }, + { + "epoch": 7.555862492326581, + "grad_norm": 0.1987949162721634, + "learning_rate": 1.486854839754755e-05, + "loss": 1.7501, + "step": 24617 + }, + { + "epoch": 7.556169429097606, + "grad_norm": 0.17301858961582184, + "learning_rate": 1.4865011737702777e-05, + "loss": 1.7122, + "step": 24618 + }, + { + "epoch": 7.556476365868631, + "grad_norm": 0.180507093667984, + "learning_rate": 1.4861475425082583e-05, + "loss": 1.7192, + "step": 24619 + }, + { + "epoch": 7.556783302639657, + "grad_norm": 0.16658489406108856, + "learning_rate": 1.4857939459721854e-05, + "loss": 1.6879, + "step": 24620 + }, + { + "epoch": 7.557090239410681, + "grad_norm": 0.19498902559280396, + "learning_rate": 1.4854403841655578e-05, + "loss": 1.7395, + "step": 24621 + }, + { + "epoch": 7.5573971761817065, + "grad_norm": 0.1737620085477829, + "learning_rate": 1.4850868570918702e-05, + "loss": 1.7029, + "step": 24622 + }, + { + "epoch": 7.557704112952732, + "grad_norm": 0.1600165218114853, + "learning_rate": 1.4847333647546113e-05, + "loss": 1.7194, + "step": 24623 + }, + { + "epoch": 7.558011049723757, + "grad_norm": 0.18392407894134521, + "learning_rate": 1.4843799071572806e-05, + "loss": 1.6838, + "step": 24624 + }, + { + "epoch": 7.558317986494782, + "grad_norm": 0.19074605405330658, + "learning_rate": 1.4840264843033651e-05, + "loss": 1.7069, + "step": 24625 + }, + { + "epoch": 7.558624923265807, + "grad_norm": 0.18156903982162476, + "learning_rate": 1.4836730961963619e-05, + "loss": 1.6494, + "step": 24626 + }, + { + "epoch": 7.558931860036832, + "grad_norm": 0.16716471314430237, + "learning_rate": 1.4833197428397627e-05, + "loss": 1.7516, + "step": 24627 + }, + { + "epoch": 7.559238796807858, + "grad_norm": 0.18882833421230316, + "learning_rate": 1.4829664242370588e-05, + 
"loss": 1.7117, + "step": 24628 + }, + { + "epoch": 7.559545733578883, + "grad_norm": 0.19933676719665527, + "learning_rate": 1.482613140391742e-05, + "loss": 1.6928, + "step": 24629 + }, + { + "epoch": 7.559852670349908, + "grad_norm": 0.15574946999549866, + "learning_rate": 1.4822598913073039e-05, + "loss": 1.702, + "step": 24630 + }, + { + "epoch": 7.560159607120933, + "grad_norm": 0.1953001618385315, + "learning_rate": 1.4819066769872353e-05, + "loss": 1.75, + "step": 24631 + }, + { + "epoch": 7.560466543891958, + "grad_norm": 0.18364208936691284, + "learning_rate": 1.481553497435027e-05, + "loss": 1.6697, + "step": 24632 + }, + { + "epoch": 7.560773480662983, + "grad_norm": 0.16670002043247223, + "learning_rate": 1.4812003526541673e-05, + "loss": 1.6919, + "step": 24633 + }, + { + "epoch": 7.561080417434009, + "grad_norm": 0.19388388097286224, + "learning_rate": 1.4808472426481518e-05, + "loss": 1.7412, + "step": 24634 + }, + { + "epoch": 7.561387354205034, + "grad_norm": 0.19203592836856842, + "learning_rate": 1.4804941674204631e-05, + "loss": 1.7128, + "step": 24635 + }, + { + "epoch": 7.5616942909760585, + "grad_norm": 0.18893340229988098, + "learning_rate": 1.4801411269745974e-05, + "loss": 1.7018, + "step": 24636 + }, + { + "epoch": 7.562001227747084, + "grad_norm": 0.1825447529554367, + "learning_rate": 1.4797881213140363e-05, + "loss": 1.7216, + "step": 24637 + }, + { + "epoch": 7.562308164518109, + "grad_norm": 0.19031697511672974, + "learning_rate": 1.4794351504422743e-05, + "loss": 1.7479, + "step": 24638 + }, + { + "epoch": 7.5626151012891345, + "grad_norm": 0.18328487873077393, + "learning_rate": 1.4790822143627991e-05, + "loss": 1.7222, + "step": 24639 + }, + { + "epoch": 7.56292203806016, + "grad_norm": 0.17531271278858185, + "learning_rate": 1.4787293130790941e-05, + "loss": 1.7197, + "step": 24640 + }, + { + "epoch": 7.563228974831185, + "grad_norm": 0.17078469693660736, + "learning_rate": 1.4783764465946526e-05, + "loss": 1.7715, + "step": 
24641 + }, + { + "epoch": 7.56353591160221, + "grad_norm": 0.1859765648841858, + "learning_rate": 1.4780236149129567e-05, + "loss": 1.698, + "step": 24642 + }, + { + "epoch": 7.563842848373235, + "grad_norm": 0.18488194048404694, + "learning_rate": 1.4776708180374965e-05, + "loss": 1.6943, + "step": 24643 + }, + { + "epoch": 7.56414978514426, + "grad_norm": 0.1741705685853958, + "learning_rate": 1.4773180559717586e-05, + "loss": 1.6966, + "step": 24644 + }, + { + "epoch": 7.564456721915286, + "grad_norm": 0.20310313999652863, + "learning_rate": 1.476965328719228e-05, + "loss": 1.7572, + "step": 24645 + }, + { + "epoch": 7.564763658686311, + "grad_norm": 0.20557743310928345, + "learning_rate": 1.476612636283391e-05, + "loss": 1.7419, + "step": 24646 + }, + { + "epoch": 7.565070595457335, + "grad_norm": 0.20597940683364868, + "learning_rate": 1.4762599786677329e-05, + "loss": 1.7147, + "step": 24647 + }, + { + "epoch": 7.565377532228361, + "grad_norm": 0.21609526872634888, + "learning_rate": 1.4759073558757391e-05, + "loss": 1.7678, + "step": 24648 + }, + { + "epoch": 7.565684468999386, + "grad_norm": 0.2233472615480423, + "learning_rate": 1.4755547679108945e-05, + "loss": 1.7381, + "step": 24649 + }, + { + "epoch": 7.565991405770411, + "grad_norm": 0.19561493396759033, + "learning_rate": 1.4752022147766814e-05, + "loss": 1.7254, + "step": 24650 + }, + { + "epoch": 7.566298342541437, + "grad_norm": 0.16491469740867615, + "learning_rate": 1.4748496964765896e-05, + "loss": 1.6834, + "step": 24651 + }, + { + "epoch": 7.566605279312462, + "grad_norm": 0.16946618258953094, + "learning_rate": 1.4744972130140955e-05, + "loss": 1.7154, + "step": 24652 + }, + { + "epoch": 7.5669122160834865, + "grad_norm": 0.1625654697418213, + "learning_rate": 1.4741447643926904e-05, + "loss": 1.6941, + "step": 24653 + }, + { + "epoch": 7.567219152854512, + "grad_norm": 0.16875535249710083, + "learning_rate": 1.4737923506158491e-05, + "loss": 1.6875, + "step": 24654 + }, + { + "epoch": 
7.567526089625537, + "grad_norm": 0.1625872105360031, + "learning_rate": 1.4734399716870607e-05, + "loss": 1.6558, + "step": 24655 + }, + { + "epoch": 7.5678330263965625, + "grad_norm": 0.17323140799999237, + "learning_rate": 1.4730876276098071e-05, + "loss": 1.7468, + "step": 24656 + }, + { + "epoch": 7.568139963167588, + "grad_norm": 0.18788693845272064, + "learning_rate": 1.472735318387566e-05, + "loss": 1.7345, + "step": 24657 + }, + { + "epoch": 7.568446899938612, + "grad_norm": 0.18096889555454254, + "learning_rate": 1.472383044023824e-05, + "loss": 1.725, + "step": 24658 + }, + { + "epoch": 7.568753836709638, + "grad_norm": 0.2327791154384613, + "learning_rate": 1.4720308045220577e-05, + "loss": 1.7367, + "step": 24659 + }, + { + "epoch": 7.569060773480663, + "grad_norm": 0.187728151679039, + "learning_rate": 1.4716785998857525e-05, + "loss": 1.6967, + "step": 24660 + }, + { + "epoch": 7.569367710251688, + "grad_norm": 0.18520617485046387, + "learning_rate": 1.4713264301183876e-05, + "loss": 1.6576, + "step": 24661 + }, + { + "epoch": 7.569674647022714, + "grad_norm": 0.20537808537483215, + "learning_rate": 1.4709742952234428e-05, + "loss": 1.6911, + "step": 24662 + }, + { + "epoch": 7.569981583793739, + "grad_norm": 0.18872039020061493, + "learning_rate": 1.4706221952043986e-05, + "loss": 1.745, + "step": 24663 + }, + { + "epoch": 7.570288520564763, + "grad_norm": 0.16083933413028717, + "learning_rate": 1.4702701300647343e-05, + "loss": 1.6875, + "step": 24664 + }, + { + "epoch": 7.570595457335789, + "grad_norm": 0.19390366971492767, + "learning_rate": 1.4699180998079293e-05, + "loss": 1.6996, + "step": 24665 + }, + { + "epoch": 7.570902394106814, + "grad_norm": 0.20478816330432892, + "learning_rate": 1.4695661044374632e-05, + "loss": 1.7359, + "step": 24666 + }, + { + "epoch": 7.571209330877839, + "grad_norm": 0.17485570907592773, + "learning_rate": 1.4692141439568136e-05, + "loss": 1.696, + "step": 24667 + }, + { + "epoch": 7.571516267648864, + 
"grad_norm": 0.18266968429088593, + "learning_rate": 1.4688622183694594e-05, + "loss": 1.713, + "step": 24668 + }, + { + "epoch": 7.571823204419889, + "grad_norm": 0.14412200450897217, + "learning_rate": 1.468510327678877e-05, + "loss": 1.6938, + "step": 24669 + }, + { + "epoch": 7.5721301411909145, + "grad_norm": 0.18144819140434265, + "learning_rate": 1.4681584718885488e-05, + "loss": 1.7523, + "step": 24670 + }, + { + "epoch": 7.57243707796194, + "grad_norm": 0.32198768854141235, + "learning_rate": 1.467806651001945e-05, + "loss": 1.71, + "step": 24671 + }, + { + "epoch": 7.572744014732965, + "grad_norm": 0.1535005122423172, + "learning_rate": 1.4674548650225483e-05, + "loss": 1.6912, + "step": 24672 + }, + { + "epoch": 7.5730509515039905, + "grad_norm": 0.17982423305511475, + "learning_rate": 1.4671031139538343e-05, + "loss": 1.6928, + "step": 24673 + }, + { + "epoch": 7.573357888275015, + "grad_norm": 0.16811783611774445, + "learning_rate": 1.4667513977992747e-05, + "loss": 1.6954, + "step": 24674 + }, + { + "epoch": 7.57366482504604, + "grad_norm": 0.18918997049331665, + "learning_rate": 1.4663997165623522e-05, + "loss": 1.6967, + "step": 24675 + }, + { + "epoch": 7.573971761817066, + "grad_norm": 0.16559816896915436, + "learning_rate": 1.4660480702465357e-05, + "loss": 1.7097, + "step": 24676 + }, + { + "epoch": 7.574278698588091, + "grad_norm": 0.20471042394638062, + "learning_rate": 1.4656964588553046e-05, + "loss": 1.7032, + "step": 24677 + }, + { + "epoch": 7.574585635359116, + "grad_norm": 0.16387851536273956, + "learning_rate": 1.4653448823921329e-05, + "loss": 1.7066, + "step": 24678 + }, + { + "epoch": 7.574892572130141, + "grad_norm": 0.19144418835639954, + "learning_rate": 1.4649933408604949e-05, + "loss": 1.7272, + "step": 24679 + }, + { + "epoch": 7.575199508901166, + "grad_norm": 0.17270216345787048, + "learning_rate": 1.4646418342638646e-05, + "loss": 1.7456, + "step": 24680 + }, + { + "epoch": 7.5755064456721914, + "grad_norm": 
0.1937440037727356, + "learning_rate": 1.4642903626057159e-05, + "loss": 1.6973, + "step": 24681 + }, + { + "epoch": 7.575813382443217, + "grad_norm": 0.18958482146263123, + "learning_rate": 1.463938925889522e-05, + "loss": 1.7549, + "step": 24682 + }, + { + "epoch": 7.576120319214242, + "grad_norm": 0.20584101974964142, + "learning_rate": 1.4635875241187558e-05, + "loss": 1.7013, + "step": 24683 + }, + { + "epoch": 7.5764272559852675, + "grad_norm": 0.22839057445526123, + "learning_rate": 1.463236157296891e-05, + "loss": 1.7282, + "step": 24684 + }, + { + "epoch": 7.576734192756292, + "grad_norm": 0.19894570112228394, + "learning_rate": 1.4628848254273996e-05, + "loss": 1.7115, + "step": 24685 + }, + { + "epoch": 7.577041129527317, + "grad_norm": 0.1880837082862854, + "learning_rate": 1.4625335285137515e-05, + "loss": 1.6526, + "step": 24686 + }, + { + "epoch": 7.577348066298343, + "grad_norm": 0.21545001864433289, + "learning_rate": 1.4621822665594238e-05, + "loss": 1.6709, + "step": 24687 + }, + { + "epoch": 7.577655003069368, + "grad_norm": 0.2091502994298935, + "learning_rate": 1.4618310395678813e-05, + "loss": 1.6792, + "step": 24688 + }, + { + "epoch": 7.577961939840393, + "grad_norm": 0.2100556343793869, + "learning_rate": 1.4614798475426018e-05, + "loss": 1.7112, + "step": 24689 + }, + { + "epoch": 7.578268876611418, + "grad_norm": 0.17702727019786835, + "learning_rate": 1.4611286904870502e-05, + "loss": 1.6353, + "step": 24690 + }, + { + "epoch": 7.578575813382443, + "grad_norm": 0.1935967355966568, + "learning_rate": 1.4607775684046975e-05, + "loss": 1.6638, + "step": 24691 + }, + { + "epoch": 7.578882750153468, + "grad_norm": 0.13495506346225739, + "learning_rate": 1.4604264812990193e-05, + "loss": 1.6526, + "step": 24692 + }, + { + "epoch": 7.579189686924494, + "grad_norm": 0.20418134331703186, + "learning_rate": 1.4600754291734774e-05, + "loss": 1.731, + "step": 24693 + }, + { + "epoch": 7.579496623695519, + "grad_norm": 0.1541702151298523, + 
"learning_rate": 1.4597244120315467e-05, + "loss": 1.7047, + "step": 24694 + }, + { + "epoch": 7.579803560466544, + "grad_norm": 0.2106262892484665, + "learning_rate": 1.4593734298766942e-05, + "loss": 1.696, + "step": 24695 + }, + { + "epoch": 7.580110497237569, + "grad_norm": 0.15727077424526215, + "learning_rate": 1.4590224827123889e-05, + "loss": 1.6782, + "step": 24696 + }, + { + "epoch": 7.580417434008594, + "grad_norm": 0.19231721758842468, + "learning_rate": 1.4586715705420983e-05, + "loss": 1.7832, + "step": 24697 + }, + { + "epoch": 7.5807243707796195, + "grad_norm": 0.18290117383003235, + "learning_rate": 1.4583206933692916e-05, + "loss": 1.6715, + "step": 24698 + }, + { + "epoch": 7.581031307550645, + "grad_norm": 0.21551427245140076, + "learning_rate": 1.4579698511974355e-05, + "loss": 1.7326, + "step": 24699 + }, + { + "epoch": 7.581338244321669, + "grad_norm": 0.21561767160892487, + "learning_rate": 1.457619044029997e-05, + "loss": 1.6682, + "step": 24700 + }, + { + "epoch": 7.581645181092695, + "grad_norm": 0.15537963807582855, + "learning_rate": 1.457268271870444e-05, + "loss": 1.719, + "step": 24701 + }, + { + "epoch": 7.58195211786372, + "grad_norm": 0.18738612532615662, + "learning_rate": 1.456917534722242e-05, + "loss": 1.7415, + "step": 24702 + }, + { + "epoch": 7.582259054634745, + "grad_norm": 0.15522584319114685, + "learning_rate": 1.456566832588856e-05, + "loss": 1.6931, + "step": 24703 + }, + { + "epoch": 7.582565991405771, + "grad_norm": 0.192890003323555, + "learning_rate": 1.4562161654737567e-05, + "loss": 1.7726, + "step": 24704 + }, + { + "epoch": 7.582872928176796, + "grad_norm": 0.2163987159729004, + "learning_rate": 1.4558655333804028e-05, + "loss": 1.7459, + "step": 24705 + }, + { + "epoch": 7.58317986494782, + "grad_norm": 0.1635672152042389, + "learning_rate": 1.4555149363122667e-05, + "loss": 1.7407, + "step": 24706 + }, + { + "epoch": 7.583486801718846, + "grad_norm": 0.1858159899711609, + "learning_rate": 
1.4551643742728072e-05, + "loss": 1.7175, + "step": 24707 + }, + { + "epoch": 7.583793738489871, + "grad_norm": 0.23077011108398438, + "learning_rate": 1.4548138472654904e-05, + "loss": 1.7739, + "step": 24708 + }, + { + "epoch": 7.584100675260896, + "grad_norm": 0.22413180768489838, + "learning_rate": 1.4544633552937836e-05, + "loss": 1.7208, + "step": 24709 + }, + { + "epoch": 7.584407612031922, + "grad_norm": 0.16147246956825256, + "learning_rate": 1.4541128983611445e-05, + "loss": 1.7021, + "step": 24710 + }, + { + "epoch": 7.584714548802946, + "grad_norm": 0.17363815009593964, + "learning_rate": 1.4537624764710439e-05, + "loss": 1.6863, + "step": 24711 + }, + { + "epoch": 7.5850214855739715, + "grad_norm": 0.14971798658370972, + "learning_rate": 1.4534120896269377e-05, + "loss": 1.655, + "step": 24712 + }, + { + "epoch": 7.585328422344997, + "grad_norm": 0.15934213995933533, + "learning_rate": 1.4530617378322937e-05, + "loss": 1.6771, + "step": 24713 + }, + { + "epoch": 7.585635359116022, + "grad_norm": 0.17807291448116302, + "learning_rate": 1.4527114210905724e-05, + "loss": 1.7419, + "step": 24714 + }, + { + "epoch": 7.5859422958870475, + "grad_norm": 0.1727002114057541, + "learning_rate": 1.4523611394052356e-05, + "loss": 1.7232, + "step": 24715 + }, + { + "epoch": 7.586249232658073, + "grad_norm": 0.1625738888978958, + "learning_rate": 1.452010892779746e-05, + "loss": 1.6967, + "step": 24716 + }, + { + "epoch": 7.586556169429097, + "grad_norm": 0.2153816670179367, + "learning_rate": 1.4516606812175636e-05, + "loss": 1.7339, + "step": 24717 + }, + { + "epoch": 7.586863106200123, + "grad_norm": 0.19343912601470947, + "learning_rate": 1.451310504722151e-05, + "loss": 1.7059, + "step": 24718 + }, + { + "epoch": 7.587170042971148, + "grad_norm": 0.16220279037952423, + "learning_rate": 1.450960363296967e-05, + "loss": 1.6825, + "step": 24719 + }, + { + "epoch": 7.587476979742173, + "grad_norm": 0.1678459346294403, + "learning_rate": 1.4506102569454716e-05, + 
"loss": 1.728, + "step": 24720 + }, + { + "epoch": 7.587783916513199, + "grad_norm": 0.19833502173423767, + "learning_rate": 1.4502601856711295e-05, + "loss": 1.7733, + "step": 24721 + }, + { + "epoch": 7.588090853284223, + "grad_norm": 0.1593111902475357, + "learning_rate": 1.4499101494773931e-05, + "loss": 1.7017, + "step": 24722 + }, + { + "epoch": 7.588397790055248, + "grad_norm": 0.2083328664302826, + "learning_rate": 1.449560148367729e-05, + "loss": 1.7661, + "step": 24723 + }, + { + "epoch": 7.588704726826274, + "grad_norm": 0.19797182083129883, + "learning_rate": 1.4492101823455906e-05, + "loss": 1.788, + "step": 24724 + }, + { + "epoch": 7.589011663597299, + "grad_norm": 0.15613096952438354, + "learning_rate": 1.4488602514144373e-05, + "loss": 1.7295, + "step": 24725 + }, + { + "epoch": 7.589318600368324, + "grad_norm": 0.18078529834747314, + "learning_rate": 1.4485103555777307e-05, + "loss": 1.7165, + "step": 24726 + }, + { + "epoch": 7.58962553713935, + "grad_norm": 0.14951148629188538, + "learning_rate": 1.4481604948389238e-05, + "loss": 1.6431, + "step": 24727 + }, + { + "epoch": 7.589932473910374, + "grad_norm": 0.19518490135669708, + "learning_rate": 1.4478106692014797e-05, + "loss": 1.7332, + "step": 24728 + }, + { + "epoch": 7.5902394106813995, + "grad_norm": 0.17438004910945892, + "learning_rate": 1.4474608786688493e-05, + "loss": 1.6677, + "step": 24729 + }, + { + "epoch": 7.590546347452425, + "grad_norm": 0.2767544090747833, + "learning_rate": 1.4471111232444944e-05, + "loss": 1.7649, + "step": 24730 + }, + { + "epoch": 7.59085328422345, + "grad_norm": 0.21649987995624542, + "learning_rate": 1.4467614029318699e-05, + "loss": 1.7349, + "step": 24731 + }, + { + "epoch": 7.5911602209944755, + "grad_norm": 0.26566463708877563, + "learning_rate": 1.4464117177344316e-05, + "loss": 1.7474, + "step": 24732 + }, + { + "epoch": 7.5914671577655, + "grad_norm": 0.19050925970077515, + "learning_rate": 1.4460620676556358e-05, + "loss": 1.7066, + "step": 24733 
+ }, + { + "epoch": 7.591774094536525, + "grad_norm": 0.20030665397644043, + "learning_rate": 1.4457124526989375e-05, + "loss": 1.6589, + "step": 24734 + }, + { + "epoch": 7.592081031307551, + "grad_norm": 0.18715742230415344, + "learning_rate": 1.4453628728677921e-05, + "loss": 1.7186, + "step": 24735 + }, + { + "epoch": 7.592387968078576, + "grad_norm": 0.241498664021492, + "learning_rate": 1.4450133281656542e-05, + "loss": 1.6686, + "step": 24736 + }, + { + "epoch": 7.592694904849601, + "grad_norm": 0.20305299758911133, + "learning_rate": 1.4446638185959765e-05, + "loss": 1.7351, + "step": 24737 + }, + { + "epoch": 7.593001841620627, + "grad_norm": 0.177521750330925, + "learning_rate": 1.444314344162218e-05, + "loss": 1.6383, + "step": 24738 + }, + { + "epoch": 7.593308778391651, + "grad_norm": 0.19877439737319946, + "learning_rate": 1.443964904867826e-05, + "loss": 1.7335, + "step": 24739 + }, + { + "epoch": 7.593615715162676, + "grad_norm": 0.16544201970100403, + "learning_rate": 1.4436155007162605e-05, + "loss": 1.6952, + "step": 24740 + }, + { + "epoch": 7.593922651933702, + "grad_norm": 0.20925499498844147, + "learning_rate": 1.443266131710969e-05, + "loss": 1.7042, + "step": 24741 + }, + { + "epoch": 7.594229588704727, + "grad_norm": 0.16688574850559235, + "learning_rate": 1.4429167978554054e-05, + "loss": 1.6797, + "step": 24742 + }, + { + "epoch": 7.5945365254757515, + "grad_norm": 0.2231293022632599, + "learning_rate": 1.4425674991530258e-05, + "loss": 1.8697, + "step": 24743 + }, + { + "epoch": 7.594843462246777, + "grad_norm": 0.2114260196685791, + "learning_rate": 1.442218235607276e-05, + "loss": 1.7404, + "step": 24744 + }, + { + "epoch": 7.595150399017802, + "grad_norm": 0.1842830628156662, + "learning_rate": 1.441869007221614e-05, + "loss": 1.7687, + "step": 24745 + }, + { + "epoch": 7.5954573357888275, + "grad_norm": 0.17780441045761108, + "learning_rate": 1.4415198139994846e-05, + "loss": 1.7492, + "step": 24746 + }, + { + "epoch": 
7.595764272559853, + "grad_norm": 0.18805068731307983, + "learning_rate": 1.4411706559443438e-05, + "loss": 1.757, + "step": 24747 + }, + { + "epoch": 7.596071209330878, + "grad_norm": 0.18918974697589874, + "learning_rate": 1.4408215330596403e-05, + "loss": 1.7006, + "step": 24748 + }, + { + "epoch": 7.596378146101903, + "grad_norm": 0.17850689589977264, + "learning_rate": 1.440472445348825e-05, + "loss": 1.6565, + "step": 24749 + }, + { + "epoch": 7.596685082872928, + "grad_norm": 0.20043544471263885, + "learning_rate": 1.4401233928153468e-05, + "loss": 1.7314, + "step": 24750 + }, + { + "epoch": 7.596992019643953, + "grad_norm": 0.1963229477405548, + "learning_rate": 1.4397743754626564e-05, + "loss": 1.6946, + "step": 24751 + }, + { + "epoch": 7.597298956414979, + "grad_norm": 0.2203695923089981, + "learning_rate": 1.4394253932942014e-05, + "loss": 1.7128, + "step": 24752 + }, + { + "epoch": 7.597605893186004, + "grad_norm": 0.19254128634929657, + "learning_rate": 1.4390764463134322e-05, + "loss": 1.6748, + "step": 24753 + }, + { + "epoch": 7.597912829957028, + "grad_norm": 0.19880495965480804, + "learning_rate": 1.438727534523795e-05, + "loss": 1.7155, + "step": 24754 + }, + { + "epoch": 7.598219766728054, + "grad_norm": 0.17486177384853363, + "learning_rate": 1.4383786579287428e-05, + "loss": 1.7484, + "step": 24755 + }, + { + "epoch": 7.598526703499079, + "grad_norm": 0.17247791588306427, + "learning_rate": 1.4380298165317168e-05, + "loss": 1.7225, + "step": 24756 + }, + { + "epoch": 7.598833640270104, + "grad_norm": 0.1802847534418106, + "learning_rate": 1.4376810103361714e-05, + "loss": 1.7009, + "step": 24757 + }, + { + "epoch": 7.59914057704113, + "grad_norm": 0.1934153437614441, + "learning_rate": 1.4373322393455485e-05, + "loss": 1.6957, + "step": 24758 + }, + { + "epoch": 7.599447513812155, + "grad_norm": 0.1508229374885559, + "learning_rate": 1.436983503563295e-05, + "loss": 1.6677, + "step": 24759 + }, + { + "epoch": 7.5997544505831796, + 
"grad_norm": 0.16684283316135406, + "learning_rate": 1.4366348029928623e-05, + "loss": 1.7394, + "step": 24760 + }, + { + "epoch": 7.600061387354205, + "grad_norm": 0.22492031753063202, + "learning_rate": 1.4362861376376896e-05, + "loss": 1.7302, + "step": 24761 + }, + { + "epoch": 7.60036832412523, + "grad_norm": 0.1654716283082962, + "learning_rate": 1.4359375075012294e-05, + "loss": 1.6487, + "step": 24762 + }, + { + "epoch": 7.600675260896256, + "grad_norm": 0.17514392733573914, + "learning_rate": 1.4355889125869198e-05, + "loss": 1.6952, + "step": 24763 + }, + { + "epoch": 7.600982197667281, + "grad_norm": 0.21000738441944122, + "learning_rate": 1.4352403528982123e-05, + "loss": 1.714, + "step": 24764 + }, + { + "epoch": 7.601289134438305, + "grad_norm": 0.18791960179805756, + "learning_rate": 1.4348918284385481e-05, + "loss": 1.7334, + "step": 24765 + }, + { + "epoch": 7.601596071209331, + "grad_norm": 0.267089307308197, + "learning_rate": 1.4345433392113734e-05, + "loss": 1.7567, + "step": 24766 + }, + { + "epoch": 7.601903007980356, + "grad_norm": 0.1814621239900589, + "learning_rate": 1.4341948852201304e-05, + "loss": 1.7031, + "step": 24767 + }, + { + "epoch": 7.602209944751381, + "grad_norm": 0.16144737601280212, + "learning_rate": 1.4338464664682639e-05, + "loss": 1.6844, + "step": 24768 + }, + { + "epoch": 7.602516881522407, + "grad_norm": 0.14824162423610687, + "learning_rate": 1.433498082959217e-05, + "loss": 1.6854, + "step": 24769 + }, + { + "epoch": 7.602823818293432, + "grad_norm": 0.1837405115365982, + "learning_rate": 1.4331497346964318e-05, + "loss": 1.7087, + "step": 24770 + }, + { + "epoch": 7.6031307550644565, + "grad_norm": 0.20706148445606232, + "learning_rate": 1.4328014216833508e-05, + "loss": 1.7816, + "step": 24771 + }, + { + "epoch": 7.603437691835482, + "grad_norm": 0.16134382784366608, + "learning_rate": 1.4324531439234196e-05, + "loss": 1.7095, + "step": 24772 + }, + { + "epoch": 7.603744628606507, + "grad_norm": 
0.15924426913261414, + "learning_rate": 1.4321049014200737e-05, + "loss": 1.7115, + "step": 24773 + }, + { + "epoch": 7.6040515653775325, + "grad_norm": 0.14942041039466858, + "learning_rate": 1.4317566941767625e-05, + "loss": 1.6872, + "step": 24774 + }, + { + "epoch": 7.604358502148557, + "grad_norm": 0.1646505445241928, + "learning_rate": 1.4314085221969209e-05, + "loss": 1.663, + "step": 24775 + }, + { + "epoch": 7.604665438919582, + "grad_norm": 0.17342600226402283, + "learning_rate": 1.4310603854839904e-05, + "loss": 1.7702, + "step": 24776 + }, + { + "epoch": 7.604972375690608, + "grad_norm": 0.17148490250110626, + "learning_rate": 1.4307122840414167e-05, + "loss": 1.7392, + "step": 24777 + }, + { + "epoch": 7.605279312461633, + "grad_norm": 0.22112305462360382, + "learning_rate": 1.4303642178726328e-05, + "loss": 1.6784, + "step": 24778 + }, + { + "epoch": 7.605586249232658, + "grad_norm": 0.22548529505729675, + "learning_rate": 1.4300161869810846e-05, + "loss": 1.7405, + "step": 24779 + }, + { + "epoch": 7.605893186003684, + "grad_norm": 0.179958313703537, + "learning_rate": 1.4296681913702065e-05, + "loss": 1.6848, + "step": 24780 + }, + { + "epoch": 7.606200122774708, + "grad_norm": 0.16872282326221466, + "learning_rate": 1.4293202310434407e-05, + "loss": 1.6973, + "step": 24781 + }, + { + "epoch": 7.606507059545733, + "grad_norm": 0.20554648339748383, + "learning_rate": 1.428972306004226e-05, + "loss": 1.7111, + "step": 24782 + }, + { + "epoch": 7.606813996316759, + "grad_norm": 0.1803034543991089, + "learning_rate": 1.4286244162559993e-05, + "loss": 1.6895, + "step": 24783 + }, + { + "epoch": 7.607120933087784, + "grad_norm": 0.18902915716171265, + "learning_rate": 1.4282765618021999e-05, + "loss": 1.766, + "step": 24784 + }, + { + "epoch": 7.607427869858809, + "grad_norm": 0.16692081093788147, + "learning_rate": 1.4279287426462646e-05, + "loss": 1.688, + "step": 24785 + }, + { + "epoch": 7.607734806629834, + "grad_norm": 0.1538083851337433, + 
"learning_rate": 1.4275809587916317e-05, + "loss": 1.6611, + "step": 24786 + }, + { + "epoch": 7.608041743400859, + "grad_norm": 0.1921710968017578, + "learning_rate": 1.4272332102417369e-05, + "loss": 1.7338, + "step": 24787 + }, + { + "epoch": 7.6083486801718845, + "grad_norm": 0.1812380999326706, + "learning_rate": 1.4268854970000167e-05, + "loss": 1.7613, + "step": 24788 + }, + { + "epoch": 7.60865561694291, + "grad_norm": 0.1762949675321579, + "learning_rate": 1.4265378190699108e-05, + "loss": 1.6796, + "step": 24789 + }, + { + "epoch": 7.608962553713935, + "grad_norm": 0.17698180675506592, + "learning_rate": 1.4261901764548497e-05, + "loss": 1.7065, + "step": 24790 + }, + { + "epoch": 7.6092694904849605, + "grad_norm": 0.18398644030094147, + "learning_rate": 1.4258425691582756e-05, + "loss": 1.7322, + "step": 24791 + }, + { + "epoch": 7.609576427255985, + "grad_norm": 0.18370044231414795, + "learning_rate": 1.425494997183618e-05, + "loss": 1.7565, + "step": 24792 + }, + { + "epoch": 7.60988336402701, + "grad_norm": 0.19615988433361053, + "learning_rate": 1.4251474605343124e-05, + "loss": 1.7507, + "step": 24793 + }, + { + "epoch": 7.610190300798036, + "grad_norm": 0.17218533158302307, + "learning_rate": 1.4247999592137979e-05, + "loss": 1.6692, + "step": 24794 + }, + { + "epoch": 7.610497237569061, + "grad_norm": 0.19105172157287598, + "learning_rate": 1.4244524932255027e-05, + "loss": 1.7421, + "step": 24795 + }, + { + "epoch": 7.610804174340086, + "grad_norm": 0.21565218269824982, + "learning_rate": 1.424105062572867e-05, + "loss": 1.7143, + "step": 24796 + }, + { + "epoch": 7.611111111111111, + "grad_norm": 0.17394152283668518, + "learning_rate": 1.4237576672593178e-05, + "loss": 1.7202, + "step": 24797 + }, + { + "epoch": 7.611418047882136, + "grad_norm": 0.18680404126644135, + "learning_rate": 1.4234103072882926e-05, + "loss": 1.7155, + "step": 24798 + }, + { + "epoch": 7.611724984653161, + "grad_norm": 0.16173312067985535, + "learning_rate": 
1.4230629826632237e-05, + "loss": 1.6549, + "step": 24799 + }, + { + "epoch": 7.612031921424187, + "grad_norm": 0.2055300772190094, + "learning_rate": 1.4227156933875423e-05, + "loss": 1.7382, + "step": 24800 + }, + { + "epoch": 7.612338858195212, + "grad_norm": 0.17331050336360931, + "learning_rate": 1.4223684394646813e-05, + "loss": 1.719, + "step": 24801 + }, + { + "epoch": 7.612645794966237, + "grad_norm": 0.23106786608695984, + "learning_rate": 1.4220212208980727e-05, + "loss": 1.7083, + "step": 24802 + }, + { + "epoch": 7.612952731737262, + "grad_norm": 0.21011751890182495, + "learning_rate": 1.4216740376911469e-05, + "loss": 1.7629, + "step": 24803 + }, + { + "epoch": 7.613259668508287, + "grad_norm": 0.15120279788970947, + "learning_rate": 1.4213268898473359e-05, + "loss": 1.673, + "step": 24804 + }, + { + "epoch": 7.6135666052793125, + "grad_norm": 0.17431862652301788, + "learning_rate": 1.4209797773700684e-05, + "loss": 1.672, + "step": 24805 + }, + { + "epoch": 7.613873542050338, + "grad_norm": 0.1592133790254593, + "learning_rate": 1.42063270026278e-05, + "loss": 1.7102, + "step": 24806 + }, + { + "epoch": 7.614180478821363, + "grad_norm": 0.22535641491413116, + "learning_rate": 1.4202856585288954e-05, + "loss": 1.7177, + "step": 24807 + }, + { + "epoch": 7.614487415592388, + "grad_norm": 0.2111314982175827, + "learning_rate": 1.4199386521718455e-05, + "loss": 1.7399, + "step": 24808 + }, + { + "epoch": 7.614794352363413, + "grad_norm": 0.18377532064914703, + "learning_rate": 1.419591681195061e-05, + "loss": 1.6713, + "step": 24809 + }, + { + "epoch": 7.615101289134438, + "grad_norm": 0.19743949174880981, + "learning_rate": 1.4192447456019681e-05, + "loss": 1.7761, + "step": 24810 + }, + { + "epoch": 7.615408225905464, + "grad_norm": 0.17827409505844116, + "learning_rate": 1.4188978453960006e-05, + "loss": 1.7091, + "step": 24811 + }, + { + "epoch": 7.615715162676489, + "grad_norm": 0.18304505944252014, + "learning_rate": 1.4185509805805802e-05, + 
"loss": 1.7496, + "step": 24812 + }, + { + "epoch": 7.616022099447514, + "grad_norm": 0.19510503113269806, + "learning_rate": 1.4182041511591415e-05, + "loss": 1.7436, + "step": 24813 + }, + { + "epoch": 7.616329036218539, + "grad_norm": 0.17127136886119843, + "learning_rate": 1.4178573571351056e-05, + "loss": 1.6598, + "step": 24814 + }, + { + "epoch": 7.616635972989564, + "grad_norm": 0.20133370161056519, + "learning_rate": 1.4175105985119041e-05, + "loss": 1.7802, + "step": 24815 + }, + { + "epoch": 7.616942909760589, + "grad_norm": 0.17706145346164703, + "learning_rate": 1.4171638752929634e-05, + "loss": 1.7105, + "step": 24816 + }, + { + "epoch": 7.617249846531615, + "grad_norm": 0.179647758603096, + "learning_rate": 1.4168171874817088e-05, + "loss": 1.732, + "step": 24817 + }, + { + "epoch": 7.617556783302639, + "grad_norm": 0.16380085051059723, + "learning_rate": 1.4164705350815665e-05, + "loss": 1.6671, + "step": 24818 + }, + { + "epoch": 7.6178637200736645, + "grad_norm": 0.19407404959201813, + "learning_rate": 1.4161239180959635e-05, + "loss": 1.7261, + "step": 24819 + }, + { + "epoch": 7.61817065684469, + "grad_norm": 0.1647375524044037, + "learning_rate": 1.415777336528324e-05, + "loss": 1.7438, + "step": 24820 + }, + { + "epoch": 7.618477593615715, + "grad_norm": 0.21532754600048065, + "learning_rate": 1.4154307903820735e-05, + "loss": 1.7674, + "step": 24821 + }, + { + "epoch": 7.6187845303867405, + "grad_norm": 0.1834939867258072, + "learning_rate": 1.4150842796606372e-05, + "loss": 1.7027, + "step": 24822 + }, + { + "epoch": 7.619091467157766, + "grad_norm": 0.15102218091487885, + "learning_rate": 1.4147378043674397e-05, + "loss": 1.6858, + "step": 24823 + }, + { + "epoch": 7.61939840392879, + "grad_norm": 0.161713644862175, + "learning_rate": 1.4143913645059038e-05, + "loss": 1.7149, + "step": 24824 + }, + { + "epoch": 7.619705340699816, + "grad_norm": 0.15568867325782776, + "learning_rate": 1.4140449600794547e-05, + "loss": 1.6642, + "step": 24825 
+ }, + { + "epoch": 7.620012277470841, + "grad_norm": 0.15993504226207733, + "learning_rate": 1.4136985910915147e-05, + "loss": 1.6497, + "step": 24826 + }, + { + "epoch": 7.620319214241866, + "grad_norm": 0.16981028020381927, + "learning_rate": 1.4133522575455055e-05, + "loss": 1.7347, + "step": 24827 + }, + { + "epoch": 7.620626151012892, + "grad_norm": 0.16143053770065308, + "learning_rate": 1.4130059594448547e-05, + "loss": 1.7166, + "step": 24828 + }, + { + "epoch": 7.620933087783916, + "grad_norm": 0.16914571821689606, + "learning_rate": 1.4126596967929789e-05, + "loss": 1.7008, + "step": 24829 + }, + { + "epoch": 7.621240024554941, + "grad_norm": 0.20040032267570496, + "learning_rate": 1.4123134695933049e-05, + "loss": 1.7099, + "step": 24830 + }, + { + "epoch": 7.621546961325967, + "grad_norm": 0.17086143791675568, + "learning_rate": 1.4119672778492493e-05, + "loss": 1.6913, + "step": 24831 + }, + { + "epoch": 7.621853898096992, + "grad_norm": 0.16268399357795715, + "learning_rate": 1.4116211215642378e-05, + "loss": 1.6919, + "step": 24832 + }, + { + "epoch": 7.622160834868017, + "grad_norm": 0.21211197972297668, + "learning_rate": 1.4112750007416891e-05, + "loss": 1.7493, + "step": 24833 + }, + { + "epoch": 7.622467771639043, + "grad_norm": 0.16767694056034088, + "learning_rate": 1.4109289153850247e-05, + "loss": 1.6863, + "step": 24834 + }, + { + "epoch": 7.622774708410067, + "grad_norm": 0.1769869178533554, + "learning_rate": 1.4105828654976639e-05, + "loss": 1.7303, + "step": 24835 + }, + { + "epoch": 7.6230816451810925, + "grad_norm": 0.2202748954296112, + "learning_rate": 1.4102368510830278e-05, + "loss": 1.7648, + "step": 24836 + }, + { + "epoch": 7.623388581952118, + "grad_norm": 0.18347454071044922, + "learning_rate": 1.4098908721445342e-05, + "loss": 1.7615, + "step": 24837 + }, + { + "epoch": 7.623695518723143, + "grad_norm": 0.17966698110103607, + "learning_rate": 1.4095449286856039e-05, + "loss": 1.7031, + "step": 24838 + }, + { + "epoch": 
7.6240024554941686, + "grad_norm": 0.1794397532939911, + "learning_rate": 1.409199020709655e-05, + "loss": 1.7129, + "step": 24839 + }, + { + "epoch": 7.624309392265193, + "grad_norm": 0.1838780641555786, + "learning_rate": 1.4088531482201056e-05, + "loss": 1.6936, + "step": 24840 + }, + { + "epoch": 7.624616329036218, + "grad_norm": 0.1940378099679947, + "learning_rate": 1.4085073112203745e-05, + "loss": 1.71, + "step": 24841 + }, + { + "epoch": 7.624923265807244, + "grad_norm": 0.17340345680713654, + "learning_rate": 1.4081615097138796e-05, + "loss": 1.711, + "step": 24842 + }, + { + "epoch": 7.625230202578269, + "grad_norm": 0.23193266987800598, + "learning_rate": 1.4078157437040374e-05, + "loss": 1.7366, + "step": 24843 + }, + { + "epoch": 7.625537139349294, + "grad_norm": 0.1742531955242157, + "learning_rate": 1.4074700131942653e-05, + "loss": 1.7179, + "step": 24844 + }, + { + "epoch": 7.62584407612032, + "grad_norm": 0.22453147172927856, + "learning_rate": 1.4071243181879806e-05, + "loss": 1.708, + "step": 24845 + }, + { + "epoch": 7.626151012891344, + "grad_norm": 0.16176854074001312, + "learning_rate": 1.4067786586885977e-05, + "loss": 1.7012, + "step": 24846 + }, + { + "epoch": 7.6264579496623695, + "grad_norm": 0.16796015202999115, + "learning_rate": 1.4064330346995369e-05, + "loss": 1.6918, + "step": 24847 + }, + { + "epoch": 7.626764886433395, + "grad_norm": 0.1737142950296402, + "learning_rate": 1.4060874462242085e-05, + "loss": 1.6908, + "step": 24848 + }, + { + "epoch": 7.62707182320442, + "grad_norm": 0.1697089523077011, + "learning_rate": 1.4057418932660315e-05, + "loss": 1.6811, + "step": 24849 + }, + { + "epoch": 7.627378759975445, + "grad_norm": 0.19860011339187622, + "learning_rate": 1.40539637582842e-05, + "loss": 1.7803, + "step": 24850 + }, + { + "epoch": 7.62768569674647, + "grad_norm": 0.16383512318134308, + "learning_rate": 1.4050508939147883e-05, + "loss": 1.7004, + "step": 24851 + }, + { + "epoch": 7.627992633517495, + "grad_norm": 
0.18878768384456635, + "learning_rate": 1.404705447528551e-05, + "loss": 1.6916, + "step": 24852 + }, + { + "epoch": 7.628299570288521, + "grad_norm": 0.1417449563741684, + "learning_rate": 1.4043600366731213e-05, + "loss": 1.6908, + "step": 24853 + }, + { + "epoch": 7.628606507059546, + "grad_norm": 0.19786077737808228, + "learning_rate": 1.4040146613519134e-05, + "loss": 1.7307, + "step": 24854 + }, + { + "epoch": 7.628913443830571, + "grad_norm": 0.17295710742473602, + "learning_rate": 1.40366932156834e-05, + "loss": 1.7111, + "step": 24855 + }, + { + "epoch": 7.629220380601596, + "grad_norm": 0.2160167098045349, + "learning_rate": 1.4033240173258144e-05, + "loss": 1.71, + "step": 24856 + }, + { + "epoch": 7.629527317372621, + "grad_norm": 0.1741226315498352, + "learning_rate": 1.402978748627749e-05, + "loss": 1.7024, + "step": 24857 + }, + { + "epoch": 7.629834254143646, + "grad_norm": 0.18043182790279388, + "learning_rate": 1.4026335154775561e-05, + "loss": 1.7046, + "step": 24858 + }, + { + "epoch": 7.630141190914672, + "grad_norm": 0.1592903584241867, + "learning_rate": 1.4022883178786472e-05, + "loss": 1.6913, + "step": 24859 + }, + { + "epoch": 7.630448127685697, + "grad_norm": 0.25504007935523987, + "learning_rate": 1.4019431558344337e-05, + "loss": 1.7221, + "step": 24860 + }, + { + "epoch": 7.6307550644567215, + "grad_norm": 0.15307627618312836, + "learning_rate": 1.4015980293483272e-05, + "loss": 1.6725, + "step": 24861 + }, + { + "epoch": 7.631062001227747, + "grad_norm": 0.2595232129096985, + "learning_rate": 1.4012529384237372e-05, + "loss": 1.7309, + "step": 24862 + }, + { + "epoch": 7.631368937998772, + "grad_norm": 0.19494156539440155, + "learning_rate": 1.4009078830640743e-05, + "loss": 1.737, + "step": 24863 + }, + { + "epoch": 7.6316758747697975, + "grad_norm": 0.19264118373394012, + "learning_rate": 1.4005628632727518e-05, + "loss": 1.7337, + "step": 24864 + }, + { + "epoch": 7.631982811540823, + "grad_norm": 0.18758688867092133, + 
"learning_rate": 1.400217879053174e-05, + "loss": 1.684, + "step": 24865 + }, + { + "epoch": 7.632289748311848, + "grad_norm": 0.17094476521015167, + "learning_rate": 1.399872930408756e-05, + "loss": 1.6724, + "step": 24866 + }, + { + "epoch": 7.632596685082873, + "grad_norm": 0.18967430293560028, + "learning_rate": 1.3995280173429003e-05, + "loss": 1.6852, + "step": 24867 + }, + { + "epoch": 7.632903621853898, + "grad_norm": 0.1686837375164032, + "learning_rate": 1.399183139859021e-05, + "loss": 1.6673, + "step": 24868 + }, + { + "epoch": 7.633210558624923, + "grad_norm": 0.19091126322746277, + "learning_rate": 1.398838297960524e-05, + "loss": 1.7423, + "step": 24869 + }, + { + "epoch": 7.633517495395949, + "grad_norm": 0.20197629928588867, + "learning_rate": 1.3984934916508186e-05, + "loss": 1.7217, + "step": 24870 + }, + { + "epoch": 7.633824432166974, + "grad_norm": 0.1490679830312729, + "learning_rate": 1.3981487209333105e-05, + "loss": 1.6367, + "step": 24871 + }, + { + "epoch": 7.634131368937998, + "grad_norm": 0.14664824306964874, + "learning_rate": 1.3978039858114084e-05, + "loss": 1.68, + "step": 24872 + }, + { + "epoch": 7.634438305709024, + "grad_norm": 0.19181138277053833, + "learning_rate": 1.3974592862885182e-05, + "loss": 1.766, + "step": 24873 + }, + { + "epoch": 7.634745242480049, + "grad_norm": 0.17716391384601593, + "learning_rate": 1.397114622368047e-05, + "loss": 1.7479, + "step": 24874 + }, + { + "epoch": 7.635052179251074, + "grad_norm": 0.16603589057922363, + "learning_rate": 1.3967699940534006e-05, + "loss": 1.6455, + "step": 24875 + }, + { + "epoch": 7.6353591160221, + "grad_norm": 0.19060885906219482, + "learning_rate": 1.3964254013479855e-05, + "loss": 1.7367, + "step": 24876 + }, + { + "epoch": 7.635666052793125, + "grad_norm": 0.18182092905044556, + "learning_rate": 1.3960808442552064e-05, + "loss": 1.7235, + "step": 24877 + }, + { + "epoch": 7.6359729895641495, + "grad_norm": 0.22578656673431396, + "learning_rate": 
1.3957363227784691e-05, + "loss": 1.7229, + "step": 24878 + }, + { + "epoch": 7.636279926335175, + "grad_norm": 0.25397053360939026, + "learning_rate": 1.3953918369211776e-05, + "loss": 1.7094, + "step": 24879 + }, + { + "epoch": 7.6365868631062, + "grad_norm": 0.164917454123497, + "learning_rate": 1.3950473866867353e-05, + "loss": 1.695, + "step": 24880 + }, + { + "epoch": 7.6368937998772255, + "grad_norm": 0.18737520277500153, + "learning_rate": 1.3947029720785503e-05, + "loss": 1.6719, + "step": 24881 + }, + { + "epoch": 7.637200736648251, + "grad_norm": 0.1839492917060852, + "learning_rate": 1.3943585931000213e-05, + "loss": 1.7136, + "step": 24882 + }, + { + "epoch": 7.637507673419275, + "grad_norm": 0.17182856798171997, + "learning_rate": 1.3940142497545566e-05, + "loss": 1.678, + "step": 24883 + }, + { + "epoch": 7.637814610190301, + "grad_norm": 0.20733827352523804, + "learning_rate": 1.393669942045554e-05, + "loss": 1.6398, + "step": 24884 + }, + { + "epoch": 7.638121546961326, + "grad_norm": 0.19326196610927582, + "learning_rate": 1.3933256699764196e-05, + "loss": 1.7351, + "step": 24885 + }, + { + "epoch": 7.638428483732351, + "grad_norm": 0.2368818074464798, + "learning_rate": 1.3929814335505552e-05, + "loss": 1.7567, + "step": 24886 + }, + { + "epoch": 7.638735420503377, + "grad_norm": 0.16702532768249512, + "learning_rate": 1.3926372327713626e-05, + "loss": 1.6791, + "step": 24887 + }, + { + "epoch": 7.639042357274402, + "grad_norm": 0.18634511530399323, + "learning_rate": 1.3922930676422435e-05, + "loss": 1.691, + "step": 24888 + }, + { + "epoch": 7.639349294045426, + "grad_norm": 0.19349521398544312, + "learning_rate": 1.3919489381665985e-05, + "loss": 1.7037, + "step": 24889 + }, + { + "epoch": 7.639656230816452, + "grad_norm": 0.16760465502738953, + "learning_rate": 1.3916048443478286e-05, + "loss": 1.6871, + "step": 24890 + }, + { + "epoch": 7.639963167587477, + "grad_norm": 0.25489017367362976, + "learning_rate": 1.3912607861893351e-05, + 
"loss": 1.6914, + "step": 24891 + }, + { + "epoch": 7.640270104358502, + "grad_norm": 0.17488406598567963, + "learning_rate": 1.390916763694517e-05, + "loss": 1.6826, + "step": 24892 + }, + { + "epoch": 7.640577041129527, + "grad_norm": 0.2128411829471588, + "learning_rate": 1.3905727768667753e-05, + "loss": 1.711, + "step": 24893 + }, + { + "epoch": 7.640883977900552, + "grad_norm": 0.17478415369987488, + "learning_rate": 1.3902288257095087e-05, + "loss": 1.7174, + "step": 24894 + }, + { + "epoch": 7.6411909146715775, + "grad_norm": 0.20493042469024658, + "learning_rate": 1.3898849102261168e-05, + "loss": 1.7649, + "step": 24895 + }, + { + "epoch": 7.641497851442603, + "grad_norm": 0.16712170839309692, + "learning_rate": 1.3895410304199979e-05, + "loss": 1.6785, + "step": 24896 + }, + { + "epoch": 7.641804788213628, + "grad_norm": 0.18580594658851624, + "learning_rate": 1.3891971862945497e-05, + "loss": 1.7001, + "step": 24897 + }, + { + "epoch": 7.6421117249846535, + "grad_norm": 0.19040817022323608, + "learning_rate": 1.3888533778531737e-05, + "loss": 1.709, + "step": 24898 + }, + { + "epoch": 7.642418661755678, + "grad_norm": 0.17573465406894684, + "learning_rate": 1.3885096050992624e-05, + "loss": 1.7205, + "step": 24899 + }, + { + "epoch": 7.642725598526703, + "grad_norm": 0.19123490154743195, + "learning_rate": 1.3881658680362186e-05, + "loss": 1.6882, + "step": 24900 + }, + { + "epoch": 7.643032535297729, + "grad_norm": 0.18465565145015717, + "learning_rate": 1.387822166667434e-05, + "loss": 1.7294, + "step": 24901 + }, + { + "epoch": 7.643339472068754, + "grad_norm": 0.17927341163158417, + "learning_rate": 1.3874785009963098e-05, + "loss": 1.7625, + "step": 24902 + }, + { + "epoch": 7.643646408839779, + "grad_norm": 0.15983298420906067, + "learning_rate": 1.38713487102624e-05, + "loss": 1.6939, + "step": 24903 + }, + { + "epoch": 7.643953345610804, + "grad_norm": 0.20288127660751343, + "learning_rate": 1.3867912767606211e-05, + "loss": 1.7461, + "step": 
24904 + }, + { + "epoch": 7.644260282381829, + "grad_norm": 0.18587160110473633, + "learning_rate": 1.3864477182028484e-05, + "loss": 1.7389, + "step": 24905 + }, + { + "epoch": 7.644567219152854, + "grad_norm": 0.17089903354644775, + "learning_rate": 1.3861041953563175e-05, + "loss": 1.6697, + "step": 24906 + }, + { + "epoch": 7.64487415592388, + "grad_norm": 0.20302993059158325, + "learning_rate": 1.3857607082244228e-05, + "loss": 1.7199, + "step": 24907 + }, + { + "epoch": 7.645181092694905, + "grad_norm": 0.14781002700328827, + "learning_rate": 1.3854172568105594e-05, + "loss": 1.687, + "step": 24908 + }, + { + "epoch": 7.64548802946593, + "grad_norm": 0.17847368121147156, + "learning_rate": 1.3850738411181214e-05, + "loss": 1.6511, + "step": 24909 + }, + { + "epoch": 7.645794966236955, + "grad_norm": 0.1448936015367508, + "learning_rate": 1.3847304611505019e-05, + "loss": 1.6601, + "step": 24910 + }, + { + "epoch": 7.64610190300798, + "grad_norm": 0.19413447380065918, + "learning_rate": 1.3843871169110955e-05, + "loss": 1.6901, + "step": 24911 + }, + { + "epoch": 7.6464088397790055, + "grad_norm": 0.18118292093276978, + "learning_rate": 1.3840438084032947e-05, + "loss": 1.7574, + "step": 24912 + }, + { + "epoch": 7.646715776550031, + "grad_norm": 0.16136041283607483, + "learning_rate": 1.3837005356304921e-05, + "loss": 1.6826, + "step": 24913 + }, + { + "epoch": 7.647022713321056, + "grad_norm": 0.1773926019668579, + "learning_rate": 1.3833572985960792e-05, + "loss": 1.7136, + "step": 24914 + }, + { + "epoch": 7.647329650092081, + "grad_norm": 0.15100078284740448, + "learning_rate": 1.3830140973034522e-05, + "loss": 1.7331, + "step": 24915 + }, + { + "epoch": 7.647636586863106, + "grad_norm": 0.16588352620601654, + "learning_rate": 1.3826709317559966e-05, + "loss": 1.6883, + "step": 24916 + }, + { + "epoch": 7.647943523634131, + "grad_norm": 0.14271478354930878, + "learning_rate": 1.3823278019571106e-05, + "loss": 1.6566, + "step": 24917 + }, + { + "epoch": 
7.648250460405157, + "grad_norm": 0.18383146822452545, + "learning_rate": 1.3819847079101782e-05, + "loss": 1.7006, + "step": 24918 + }, + { + "epoch": 7.648557397176182, + "grad_norm": 0.20069970190525055, + "learning_rate": 1.3816416496185952e-05, + "loss": 1.696, + "step": 24919 + }, + { + "epoch": 7.648864333947207, + "grad_norm": 0.15686273574829102, + "learning_rate": 1.3812986270857497e-05, + "loss": 1.6998, + "step": 24920 + }, + { + "epoch": 7.649171270718232, + "grad_norm": 0.14733602106571198, + "learning_rate": 1.3809556403150326e-05, + "loss": 1.6692, + "step": 24921 + }, + { + "epoch": 7.649478207489257, + "grad_norm": 0.16720153391361237, + "learning_rate": 1.3806126893098332e-05, + "loss": 1.6841, + "step": 24922 + }, + { + "epoch": 7.649785144260282, + "grad_norm": 0.1548861712217331, + "learning_rate": 1.3802697740735404e-05, + "loss": 1.6914, + "step": 24923 + }, + { + "epoch": 7.650092081031308, + "grad_norm": 0.1591617912054062, + "learning_rate": 1.3799268946095433e-05, + "loss": 1.7121, + "step": 24924 + }, + { + "epoch": 7.650399017802332, + "grad_norm": 0.19735665619373322, + "learning_rate": 1.3795840509212305e-05, + "loss": 1.741, + "step": 24925 + }, + { + "epoch": 7.650705954573358, + "grad_norm": 0.16886921226978302, + "learning_rate": 1.37924124301199e-05, + "loss": 1.7166, + "step": 24926 + }, + { + "epoch": 7.651012891344383, + "grad_norm": 0.2084806114435196, + "learning_rate": 1.3788984708852098e-05, + "loss": 1.7525, + "step": 24927 + }, + { + "epoch": 7.651319828115408, + "grad_norm": 0.15286533534526825, + "learning_rate": 1.3785557345442773e-05, + "loss": 1.6754, + "step": 24928 + }, + { + "epoch": 7.651626764886434, + "grad_norm": 0.19647163152694702, + "learning_rate": 1.3782130339925792e-05, + "loss": 1.7114, + "step": 24929 + }, + { + "epoch": 7.651933701657459, + "grad_norm": 0.18526645004749298, + "learning_rate": 1.3778703692335031e-05, + "loss": 1.7258, + "step": 24930 + }, + { + "epoch": 7.652240638428484, + 
"grad_norm": 0.19880451261997223, + "learning_rate": 1.3775277402704334e-05, + "loss": 1.7065, + "step": 24931 + }, + { + "epoch": 7.652547575199509, + "grad_norm": 0.18702107667922974, + "learning_rate": 1.377185147106761e-05, + "loss": 1.7171, + "step": 24932 + }, + { + "epoch": 7.652854511970534, + "grad_norm": 0.1455291509628296, + "learning_rate": 1.3768425897458654e-05, + "loss": 1.6824, + "step": 24933 + }, + { + "epoch": 7.653161448741559, + "grad_norm": 0.16770213842391968, + "learning_rate": 1.3765000681911377e-05, + "loss": 1.6544, + "step": 24934 + }, + { + "epoch": 7.653468385512585, + "grad_norm": 0.18496285378932953, + "learning_rate": 1.3761575824459572e-05, + "loss": 1.7206, + "step": 24935 + }, + { + "epoch": 7.653775322283609, + "grad_norm": 0.1832813024520874, + "learning_rate": 1.3758151325137131e-05, + "loss": 1.7673, + "step": 24936 + }, + { + "epoch": 7.6540822590546345, + "grad_norm": 0.20916350185871124, + "learning_rate": 1.3754727183977878e-05, + "loss": 1.7224, + "step": 24937 + }, + { + "epoch": 7.65438919582566, + "grad_norm": 0.1878765970468521, + "learning_rate": 1.3751303401015653e-05, + "loss": 1.6966, + "step": 24938 + }, + { + "epoch": 7.654696132596685, + "grad_norm": 0.17944355309009552, + "learning_rate": 1.37478799762843e-05, + "loss": 1.6752, + "step": 24939 + }, + { + "epoch": 7.6550030693677105, + "grad_norm": 0.20930083096027374, + "learning_rate": 1.3744456909817638e-05, + "loss": 1.7632, + "step": 24940 + }, + { + "epoch": 7.655310006138736, + "grad_norm": 0.19838237762451172, + "learning_rate": 1.3741034201649511e-05, + "loss": 1.7039, + "step": 24941 + }, + { + "epoch": 7.65561694290976, + "grad_norm": 0.233023539185524, + "learning_rate": 1.373761185181373e-05, + "loss": 1.7117, + "step": 24942 + }, + { + "epoch": 7.655923879680786, + "grad_norm": 0.16270874440670013, + "learning_rate": 1.3734189860344127e-05, + "loss": 1.6603, + "step": 24943 + }, + { + "epoch": 7.656230816451811, + "grad_norm": 
0.18456563353538513, + "learning_rate": 1.373076822727451e-05, + "loss": 1.6891, + "step": 24944 + }, + { + "epoch": 7.656537753222836, + "grad_norm": 0.17064985632896423, + "learning_rate": 1.3727346952638703e-05, + "loss": 1.6788, + "step": 24945 + }, + { + "epoch": 7.656844689993862, + "grad_norm": 0.17548689246177673, + "learning_rate": 1.3723926036470513e-05, + "loss": 1.6699, + "step": 24946 + }, + { + "epoch": 7.657151626764886, + "grad_norm": 0.1660275012254715, + "learning_rate": 1.3720505478803753e-05, + "loss": 1.6706, + "step": 24947 + }, + { + "epoch": 7.657458563535911, + "grad_norm": 0.2977990508079529, + "learning_rate": 1.3717085279672199e-05, + "loss": 1.7463, + "step": 24948 + }, + { + "epoch": 7.657765500306937, + "grad_norm": 0.24440810084342957, + "learning_rate": 1.3713665439109708e-05, + "loss": 1.7528, + "step": 24949 + }, + { + "epoch": 7.658072437077962, + "grad_norm": 0.1579941064119339, + "learning_rate": 1.3710245957150015e-05, + "loss": 1.6902, + "step": 24950 + }, + { + "epoch": 7.658379373848987, + "grad_norm": 0.197731152176857, + "learning_rate": 1.3706826833826968e-05, + "loss": 1.7377, + "step": 24951 + }, + { + "epoch": 7.658686310620013, + "grad_norm": 0.16704770922660828, + "learning_rate": 1.3703408069174301e-05, + "loss": 1.7057, + "step": 24952 + }, + { + "epoch": 7.658993247391037, + "grad_norm": 0.2167888730764389, + "learning_rate": 1.3699989663225848e-05, + "loss": 1.7668, + "step": 24953 + }, + { + "epoch": 7.6593001841620625, + "grad_norm": 0.16870343685150146, + "learning_rate": 1.369657161601537e-05, + "loss": 1.6781, + "step": 24954 + }, + { + "epoch": 7.659607120933088, + "grad_norm": 0.22422032058238983, + "learning_rate": 1.3693153927576646e-05, + "loss": 1.7034, + "step": 24955 + }, + { + "epoch": 7.659914057704113, + "grad_norm": 0.20777738094329834, + "learning_rate": 1.3689736597943465e-05, + "loss": 1.7401, + "step": 24956 + }, + { + "epoch": 7.6602209944751385, + "grad_norm": 0.17802980542182922, + 
"learning_rate": 1.3686319627149579e-05, + "loss": 1.7067, + "step": 24957 + }, + { + "epoch": 7.660527931246163, + "grad_norm": 0.21444065868854523, + "learning_rate": 1.368290301522877e-05, + "loss": 1.6731, + "step": 24958 + }, + { + "epoch": 7.660834868017188, + "grad_norm": 0.17638131976127625, + "learning_rate": 1.3679486762214805e-05, + "loss": 1.738, + "step": 24959 + }, + { + "epoch": 7.661141804788214, + "grad_norm": 0.1900044083595276, + "learning_rate": 1.3676070868141432e-05, + "loss": 1.7673, + "step": 24960 + }, + { + "epoch": 7.661448741559239, + "grad_norm": 0.20749469101428986, + "learning_rate": 1.3672655333042422e-05, + "loss": 1.7341, + "step": 24961 + }, + { + "epoch": 7.661755678330264, + "grad_norm": 0.21292604506015778, + "learning_rate": 1.3669240156951518e-05, + "loss": 1.7114, + "step": 24962 + }, + { + "epoch": 7.66206261510129, + "grad_norm": 0.21506401896476746, + "learning_rate": 1.3665825339902482e-05, + "loss": 1.7412, + "step": 24963 + }, + { + "epoch": 7.662369551872314, + "grad_norm": 0.21838976442813873, + "learning_rate": 1.3662410881929055e-05, + "loss": 1.7178, + "step": 24964 + }, + { + "epoch": 7.662676488643339, + "grad_norm": 0.18973253667354584, + "learning_rate": 1.365899678306497e-05, + "loss": 1.7161, + "step": 24965 + }, + { + "epoch": 7.662983425414365, + "grad_norm": 0.19278603792190552, + "learning_rate": 1.3655583043344006e-05, + "loss": 1.6952, + "step": 24966 + }, + { + "epoch": 7.66329036218539, + "grad_norm": 0.2025471180677414, + "learning_rate": 1.365216966279984e-05, + "loss": 1.6893, + "step": 24967 + }, + { + "epoch": 7.6635972989564145, + "grad_norm": 0.14461325109004974, + "learning_rate": 1.364875664146627e-05, + "loss": 1.6762, + "step": 24968 + }, + { + "epoch": 7.66390423572744, + "grad_norm": 0.22851425409317017, + "learning_rate": 1.3645343979376962e-05, + "loss": 1.7743, + "step": 24969 + }, + { + "epoch": 7.664211172498465, + "grad_norm": 0.16862350702285767, + "learning_rate": 
1.3641931676565688e-05, + "loss": 1.6385, + "step": 24970 + }, + { + "epoch": 7.6645181092694905, + "grad_norm": 0.20482461154460907, + "learning_rate": 1.3638519733066157e-05, + "loss": 1.7824, + "step": 24971 + }, + { + "epoch": 7.664825046040516, + "grad_norm": 0.18505734205245972, + "learning_rate": 1.3635108148912085e-05, + "loss": 1.6845, + "step": 24972 + }, + { + "epoch": 7.665131982811541, + "grad_norm": 0.18774990737438202, + "learning_rate": 1.3631696924137189e-05, + "loss": 1.7091, + "step": 24973 + }, + { + "epoch": 7.665438919582566, + "grad_norm": 0.1967296153306961, + "learning_rate": 1.362828605877518e-05, + "loss": 1.6953, + "step": 24974 + }, + { + "epoch": 7.665745856353591, + "grad_norm": 0.16951262950897217, + "learning_rate": 1.3624875552859767e-05, + "loss": 1.7302, + "step": 24975 + }, + { + "epoch": 7.666052793124616, + "grad_norm": 0.21003109216690063, + "learning_rate": 1.3621465406424656e-05, + "loss": 1.7567, + "step": 24976 + }, + { + "epoch": 7.666359729895642, + "grad_norm": 0.19087877869606018, + "learning_rate": 1.361805561950354e-05, + "loss": 1.7373, + "step": 24977 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 0.17799946665763855, + "learning_rate": 1.3614646192130126e-05, + "loss": 1.7121, + "step": 24978 + }, + { + "epoch": 7.666973603437691, + "grad_norm": 0.15956062078475952, + "learning_rate": 1.3611237124338105e-05, + "loss": 1.6654, + "step": 24979 + }, + { + "epoch": 7.667280540208717, + "grad_norm": 0.1963697075843811, + "learning_rate": 1.3607828416161167e-05, + "loss": 1.7902, + "step": 24980 + }, + { + "epoch": 7.667587476979742, + "grad_norm": 0.22204460203647614, + "learning_rate": 1.3604420067632995e-05, + "loss": 1.8199, + "step": 24981 + }, + { + "epoch": 7.667894413750767, + "grad_norm": 0.20523740351200104, + "learning_rate": 1.3601012078787268e-05, + "loss": 1.7253, + "step": 24982 + }, + { + "epoch": 7.668201350521793, + "grad_norm": 0.18693773448467255, + "learning_rate": 1.3597604449657697e-05, + 
"loss": 1.7032, + "step": 24983 + }, + { + "epoch": 7.668508287292818, + "grad_norm": 0.17661312222480774, + "learning_rate": 1.3594197180277906e-05, + "loss": 1.6648, + "step": 24984 + }, + { + "epoch": 7.6688152240638425, + "grad_norm": 0.19099490344524384, + "learning_rate": 1.3590790270681631e-05, + "loss": 1.7107, + "step": 24985 + }, + { + "epoch": 7.669122160834868, + "grad_norm": 0.1854488104581833, + "learning_rate": 1.3587383720902469e-05, + "loss": 1.7241, + "step": 24986 + }, + { + "epoch": 7.669429097605893, + "grad_norm": 0.18763068318367004, + "learning_rate": 1.3583977530974146e-05, + "loss": 1.7207, + "step": 24987 + }, + { + "epoch": 7.6697360343769185, + "grad_norm": 0.15608854591846466, + "learning_rate": 1.3580571700930295e-05, + "loss": 1.6835, + "step": 24988 + }, + { + "epoch": 7.670042971147944, + "grad_norm": 0.1587948501110077, + "learning_rate": 1.3577166230804584e-05, + "loss": 1.6801, + "step": 24989 + }, + { + "epoch": 7.670349907918968, + "grad_norm": 0.21106089651584625, + "learning_rate": 1.3573761120630668e-05, + "loss": 1.7411, + "step": 24990 + }, + { + "epoch": 7.670656844689994, + "grad_norm": 0.17361705005168915, + "learning_rate": 1.3570356370442188e-05, + "loss": 1.7123, + "step": 24991 + }, + { + "epoch": 7.670963781461019, + "grad_norm": 0.16272610425949097, + "learning_rate": 1.3566951980272802e-05, + "loss": 1.7002, + "step": 24992 + }, + { + "epoch": 7.671270718232044, + "grad_norm": 0.18787643313407898, + "learning_rate": 1.3563547950156147e-05, + "loss": 1.7364, + "step": 24993 + }, + { + "epoch": 7.67157765500307, + "grad_norm": 0.18257403373718262, + "learning_rate": 1.3560144280125869e-05, + "loss": 1.6783, + "step": 24994 + }, + { + "epoch": 7.671884591774095, + "grad_norm": 0.21298269927501678, + "learning_rate": 1.3556740970215608e-05, + "loss": 1.815, + "step": 24995 + }, + { + "epoch": 7.672191528545119, + "grad_norm": 0.1805877983570099, + "learning_rate": 1.3553338020458988e-05, + "loss": 1.719, + "step": 
24996 + }, + { + "epoch": 7.672498465316145, + "grad_norm": 0.210116446018219, + "learning_rate": 1.3549935430889643e-05, + "loss": 1.7603, + "step": 24997 + }, + { + "epoch": 7.67280540208717, + "grad_norm": 0.18893682956695557, + "learning_rate": 1.35465332015412e-05, + "loss": 1.6681, + "step": 24998 + }, + { + "epoch": 7.673112338858195, + "grad_norm": 0.17718489468097687, + "learning_rate": 1.354313133244729e-05, + "loss": 1.6799, + "step": 24999 + }, + { + "epoch": 7.67341927562922, + "grad_norm": 0.20092631876468658, + "learning_rate": 1.3539729823641517e-05, + "loss": 1.7273, + "step": 25000 + }, + { + "epoch": 7.673726212400245, + "grad_norm": 0.20800542831420898, + "learning_rate": 1.353632867515749e-05, + "loss": 1.7214, + "step": 25001 + }, + { + "epoch": 7.6740331491712706, + "grad_norm": 0.2119656354188919, + "learning_rate": 1.3532927887028861e-05, + "loss": 1.6701, + "step": 25002 + }, + { + "epoch": 7.674340085942296, + "grad_norm": 0.1645115315914154, + "learning_rate": 1.3529527459289188e-05, + "loss": 1.7199, + "step": 25003 + }, + { + "epoch": 7.674647022713321, + "grad_norm": 0.24434153735637665, + "learning_rate": 1.3526127391972116e-05, + "loss": 1.7295, + "step": 25004 + }, + { + "epoch": 7.6749539594843466, + "grad_norm": 0.20978261530399323, + "learning_rate": 1.3522727685111231e-05, + "loss": 1.8069, + "step": 25005 + }, + { + "epoch": 7.675260896255372, + "grad_norm": 0.19354932010173798, + "learning_rate": 1.3519328338740128e-05, + "loss": 1.7601, + "step": 25006 + }, + { + "epoch": 7.675567833026396, + "grad_norm": 0.19636447727680206, + "learning_rate": 1.3515929352892403e-05, + "loss": 1.7871, + "step": 25007 + }, + { + "epoch": 7.675874769797422, + "grad_norm": 0.18915504217147827, + "learning_rate": 1.3512530727601653e-05, + "loss": 1.6926, + "step": 25008 + }, + { + "epoch": 7.676181706568447, + "grad_norm": 0.18168985843658447, + "learning_rate": 1.3509132462901458e-05, + "loss": 1.7272, + "step": 25009 + }, + { + "epoch": 
7.676488643339472, + "grad_norm": 0.17246222496032715, + "learning_rate": 1.3505734558825406e-05, + "loss": 1.7186, + "step": 25010 + }, + { + "epoch": 7.676795580110497, + "grad_norm": 0.2694617211818695, + "learning_rate": 1.3502337015407074e-05, + "loss": 1.8334, + "step": 25011 + }, + { + "epoch": 7.677102516881522, + "grad_norm": 0.1549377590417862, + "learning_rate": 1.3498939832680035e-05, + "loss": 1.7003, + "step": 25012 + }, + { + "epoch": 7.6774094536525475, + "grad_norm": 0.1559179425239563, + "learning_rate": 1.349554301067787e-05, + "loss": 1.7028, + "step": 25013 + }, + { + "epoch": 7.677716390423573, + "grad_norm": 0.17349909245967865, + "learning_rate": 1.3492146549434149e-05, + "loss": 1.6749, + "step": 25014 + }, + { + "epoch": 7.678023327194598, + "grad_norm": 0.19697749614715576, + "learning_rate": 1.348875044898243e-05, + "loss": 1.8291, + "step": 25015 + }, + { + "epoch": 7.6783302639656235, + "grad_norm": 0.17260968685150146, + "learning_rate": 1.3485354709356279e-05, + "loss": 1.6686, + "step": 25016 + }, + { + "epoch": 7.678637200736648, + "grad_norm": 0.16892582178115845, + "learning_rate": 1.3481959330589255e-05, + "loss": 1.755, + "step": 25017 + }, + { + "epoch": 7.678944137507673, + "grad_norm": 0.17961645126342773, + "learning_rate": 1.3478564312714898e-05, + "loss": 1.6937, + "step": 25018 + }, + { + "epoch": 7.679251074278699, + "grad_norm": 0.20795513689517975, + "learning_rate": 1.34751696557668e-05, + "loss": 1.799, + "step": 25019 + }, + { + "epoch": 7.679558011049724, + "grad_norm": 0.16439545154571533, + "learning_rate": 1.3471775359778461e-05, + "loss": 1.6942, + "step": 25020 + }, + { + "epoch": 7.679864947820749, + "grad_norm": 0.19526144862174988, + "learning_rate": 1.3468381424783472e-05, + "loss": 1.7255, + "step": 25021 + }, + { + "epoch": 7.680171884591774, + "grad_norm": 0.18183457851409912, + "learning_rate": 1.3464987850815319e-05, + "loss": 1.7027, + "step": 25022 + }, + { + "epoch": 7.680478821362799, + 
"grad_norm": 0.18443404138088226, + "learning_rate": 1.3461594637907587e-05, + "loss": 1.6973, + "step": 25023 + }, + { + "epoch": 7.680785758133824, + "grad_norm": 0.18545331060886383, + "learning_rate": 1.3458201786093794e-05, + "loss": 1.7479, + "step": 25024 + }, + { + "epoch": 7.68109269490485, + "grad_norm": 0.18329958617687225, + "learning_rate": 1.3454809295407467e-05, + "loss": 1.7301, + "step": 25025 + }, + { + "epoch": 7.681399631675875, + "grad_norm": 0.19131959974765778, + "learning_rate": 1.3451417165882136e-05, + "loss": 1.7402, + "step": 25026 + }, + { + "epoch": 7.6817065684469, + "grad_norm": 0.1782912164926529, + "learning_rate": 1.3448025397551323e-05, + "loss": 1.6771, + "step": 25027 + }, + { + "epoch": 7.682013505217925, + "grad_norm": 0.1757265031337738, + "learning_rate": 1.3444633990448546e-05, + "loss": 1.7336, + "step": 25028 + }, + { + "epoch": 7.68232044198895, + "grad_norm": 0.16550128161907196, + "learning_rate": 1.3441242944607318e-05, + "loss": 1.6335, + "step": 25029 + }, + { + "epoch": 7.6826273787599755, + "grad_norm": 0.18069832026958466, + "learning_rate": 1.3437852260061162e-05, + "loss": 1.7172, + "step": 25030 + }, + { + "epoch": 7.682934315531001, + "grad_norm": 0.21195535361766815, + "learning_rate": 1.3434461936843573e-05, + "loss": 1.7248, + "step": 25031 + }, + { + "epoch": 7.683241252302026, + "grad_norm": 0.17209839820861816, + "learning_rate": 1.3431071974988068e-05, + "loss": 1.666, + "step": 25032 + }, + { + "epoch": 7.683548189073051, + "grad_norm": 0.20565249025821686, + "learning_rate": 1.342768237452814e-05, + "loss": 1.7839, + "step": 25033 + }, + { + "epoch": 7.683855125844076, + "grad_norm": 0.2549617290496826, + "learning_rate": 1.342429313549729e-05, + "loss": 1.714, + "step": 25034 + }, + { + "epoch": 7.684162062615101, + "grad_norm": 0.1980191171169281, + "learning_rate": 1.3420904257929001e-05, + "loss": 1.7267, + "step": 25035 + }, + { + "epoch": 7.684468999386127, + "grad_norm": 0.1763298362493515, + 
"learning_rate": 1.3417515741856806e-05, + "loss": 1.6754, + "step": 25036 + }, + { + "epoch": 7.684775936157152, + "grad_norm": 0.15831413865089417, + "learning_rate": 1.341412758731413e-05, + "loss": 1.6885, + "step": 25037 + }, + { + "epoch": 7.685082872928177, + "grad_norm": 0.15696564316749573, + "learning_rate": 1.341073979433452e-05, + "loss": 1.7032, + "step": 25038 + }, + { + "epoch": 7.685389809699202, + "grad_norm": 0.19193214178085327, + "learning_rate": 1.3407352362951392e-05, + "loss": 1.7708, + "step": 25039 + }, + { + "epoch": 7.685696746470227, + "grad_norm": 0.1886630803346634, + "learning_rate": 1.3403965293198273e-05, + "loss": 1.7323, + "step": 25040 + }, + { + "epoch": 7.686003683241252, + "grad_norm": 0.16137991845607758, + "learning_rate": 1.340057858510862e-05, + "loss": 1.703, + "step": 25041 + }, + { + "epoch": 7.686310620012278, + "grad_norm": 0.21111373603343964, + "learning_rate": 1.33971922387159e-05, + "loss": 1.7428, + "step": 25042 + }, + { + "epoch": 7.686617556783302, + "grad_norm": 0.20256482064723969, + "learning_rate": 1.3393806254053582e-05, + "loss": 1.7651, + "step": 25043 + }, + { + "epoch": 7.6869244935543275, + "grad_norm": 0.19125118851661682, + "learning_rate": 1.3390420631155121e-05, + "loss": 1.7253, + "step": 25044 + }, + { + "epoch": 7.687231430325353, + "grad_norm": 0.22446562349796295, + "learning_rate": 1.3387035370053985e-05, + "loss": 1.7363, + "step": 25045 + }, + { + "epoch": 7.687538367096378, + "grad_norm": 0.17356424033641815, + "learning_rate": 1.3383650470783621e-05, + "loss": 1.7384, + "step": 25046 + }, + { + "epoch": 7.6878453038674035, + "grad_norm": 0.27287909388542175, + "learning_rate": 1.3380265933377489e-05, + "loss": 1.6754, + "step": 25047 + }, + { + "epoch": 7.688152240638429, + "grad_norm": 0.14978452026844025, + "learning_rate": 1.3376881757869032e-05, + "loss": 1.6693, + "step": 25048 + }, + { + "epoch": 7.688459177409453, + "grad_norm": 0.1746874898672104, + "learning_rate": 
1.3373497944291691e-05, + "loss": 1.6878, + "step": 25049 + }, + { + "epoch": 7.688766114180479, + "grad_norm": 0.18032371997833252, + "learning_rate": 1.3370114492678915e-05, + "loss": 1.7153, + "step": 25050 + }, + { + "epoch": 7.689073050951504, + "grad_norm": 0.23111680150032043, + "learning_rate": 1.3366731403064131e-05, + "loss": 1.7132, + "step": 25051 + }, + { + "epoch": 7.689379987722529, + "grad_norm": 0.1587868630886078, + "learning_rate": 1.3363348675480768e-05, + "loss": 1.6692, + "step": 25052 + }, + { + "epoch": 7.689686924493555, + "grad_norm": 0.14336444437503815, + "learning_rate": 1.3359966309962301e-05, + "loss": 1.6648, + "step": 25053 + }, + { + "epoch": 7.689993861264579, + "grad_norm": 0.3048984408378601, + "learning_rate": 1.3356584306542086e-05, + "loss": 1.8109, + "step": 25054 + }, + { + "epoch": 7.690300798035604, + "grad_norm": 0.19389018416404724, + "learning_rate": 1.3353202665253617e-05, + "loss": 1.6725, + "step": 25055 + }, + { + "epoch": 7.69060773480663, + "grad_norm": 0.19246982038021088, + "learning_rate": 1.3349821386130246e-05, + "loss": 1.726, + "step": 25056 + }, + { + "epoch": 7.690914671577655, + "grad_norm": 0.19062727689743042, + "learning_rate": 1.3346440469205435e-05, + "loss": 1.7685, + "step": 25057 + }, + { + "epoch": 7.69122160834868, + "grad_norm": 0.16987577080726624, + "learning_rate": 1.3343059914512585e-05, + "loss": 1.7032, + "step": 25058 + }, + { + "epoch": 7.691528545119706, + "grad_norm": 0.17328599095344543, + "learning_rate": 1.3339679722085103e-05, + "loss": 1.7271, + "step": 25059 + }, + { + "epoch": 7.69183548189073, + "grad_norm": 0.2677443325519562, + "learning_rate": 1.3336299891956405e-05, + "loss": 1.8, + "step": 25060 + }, + { + "epoch": 7.6921424186617555, + "grad_norm": 0.18369975686073303, + "learning_rate": 1.333292042415985e-05, + "loss": 1.7483, + "step": 25061 + }, + { + "epoch": 7.692449355432781, + "grad_norm": 0.17269635200500488, + "learning_rate": 1.3329541318728883e-05, + "loss": 
1.7016, + "step": 25062 + }, + { + "epoch": 7.692756292203806, + "grad_norm": 0.17280563712120056, + "learning_rate": 1.3326162575696889e-05, + "loss": 1.742, + "step": 25063 + }, + { + "epoch": 7.6930632289748315, + "grad_norm": 0.2000025361776352, + "learning_rate": 1.3322784195097243e-05, + "loss": 1.6947, + "step": 25064 + }, + { + "epoch": 7.693370165745856, + "grad_norm": 0.17853626608848572, + "learning_rate": 1.3319406176963344e-05, + "loss": 1.7075, + "step": 25065 + }, + { + "epoch": 7.693677102516881, + "grad_norm": 0.18445543944835663, + "learning_rate": 1.3316028521328571e-05, + "loss": 1.7138, + "step": 25066 + }, + { + "epoch": 7.693984039287907, + "grad_norm": 0.1965894103050232, + "learning_rate": 1.3312651228226302e-05, + "loss": 1.6904, + "step": 25067 + }, + { + "epoch": 7.694290976058932, + "grad_norm": 0.1890837699174881, + "learning_rate": 1.3309274297689923e-05, + "loss": 1.7307, + "step": 25068 + }, + { + "epoch": 7.694597912829957, + "grad_norm": 0.2157326638698578, + "learning_rate": 1.3305897729752787e-05, + "loss": 1.7466, + "step": 25069 + }, + { + "epoch": 7.694904849600983, + "grad_norm": 0.19773493707180023, + "learning_rate": 1.3302521524448302e-05, + "loss": 1.7265, + "step": 25070 + }, + { + "epoch": 7.695211786372007, + "grad_norm": 0.16688357293605804, + "learning_rate": 1.3299145681809776e-05, + "loss": 1.7049, + "step": 25071 + }, + { + "epoch": 7.695518723143032, + "grad_norm": 0.24347764253616333, + "learning_rate": 1.3295770201870639e-05, + "loss": 1.7706, + "step": 25072 + }, + { + "epoch": 7.695825659914058, + "grad_norm": 0.16198144853115082, + "learning_rate": 1.3292395084664183e-05, + "loss": 1.6873, + "step": 25073 + }, + { + "epoch": 7.696132596685083, + "grad_norm": 0.17321841418743134, + "learning_rate": 1.3289020330223806e-05, + "loss": 1.7463, + "step": 25074 + }, + { + "epoch": 7.696439533456108, + "grad_norm": 0.2611647844314575, + "learning_rate": 1.3285645938582847e-05, + "loss": 1.811, + "step": 25075 + }, 
+ { + "epoch": 7.696746470227133, + "grad_norm": 0.18129383027553558, + "learning_rate": 1.3282271909774657e-05, + "loss": 1.7257, + "step": 25076 + }, + { + "epoch": 7.697053406998158, + "grad_norm": 0.19985437393188477, + "learning_rate": 1.3278898243832588e-05, + "loss": 1.7311, + "step": 25077 + }, + { + "epoch": 7.6973603437691835, + "grad_norm": 0.21517722308635712, + "learning_rate": 1.3275524940789941e-05, + "loss": 1.7582, + "step": 25078 + }, + { + "epoch": 7.697667280540209, + "grad_norm": 0.2302769422531128, + "learning_rate": 1.32721520006801e-05, + "loss": 1.7192, + "step": 25079 + }, + { + "epoch": 7.697974217311234, + "grad_norm": 0.18356913328170776, + "learning_rate": 1.3268779423536375e-05, + "loss": 1.6916, + "step": 25080 + }, + { + "epoch": 7.6982811540822595, + "grad_norm": 0.19134142994880676, + "learning_rate": 1.3265407209392105e-05, + "loss": 1.7309, + "step": 25081 + }, + { + "epoch": 7.698588090853284, + "grad_norm": 0.17634150385856628, + "learning_rate": 1.3262035358280605e-05, + "loss": 1.7537, + "step": 25082 + }, + { + "epoch": 7.698895027624309, + "grad_norm": 0.1921558827161789, + "learning_rate": 1.325866387023521e-05, + "loss": 1.7102, + "step": 25083 + }, + { + "epoch": 7.699201964395335, + "grad_norm": 0.15972480177879333, + "learning_rate": 1.3255292745289233e-05, + "loss": 1.6759, + "step": 25084 + }, + { + "epoch": 7.69950890116636, + "grad_norm": 0.15172120928764343, + "learning_rate": 1.325192198347599e-05, + "loss": 1.6766, + "step": 25085 + }, + { + "epoch": 7.699815837937384, + "grad_norm": 0.17827558517456055, + "learning_rate": 1.3248551584828777e-05, + "loss": 1.7421, + "step": 25086 + }, + { + "epoch": 7.70012277470841, + "grad_norm": 0.1675274819135666, + "learning_rate": 1.3245181549380948e-05, + "loss": 1.701, + "step": 25087 + }, + { + "epoch": 7.700429711479435, + "grad_norm": 0.17937950789928436, + "learning_rate": 1.3241811877165744e-05, + "loss": 1.7284, + "step": 25088 + }, + { + "epoch": 
7.7007366482504604, + "grad_norm": 0.16373637318611145, + "learning_rate": 1.3238442568216535e-05, + "loss": 1.6834, + "step": 25089 + }, + { + "epoch": 7.701043585021486, + "grad_norm": 0.16055652499198914, + "learning_rate": 1.3235073622566552e-05, + "loss": 1.7087, + "step": 25090 + }, + { + "epoch": 7.701350521792511, + "grad_norm": 0.15083225071430206, + "learning_rate": 1.3231705040249131e-05, + "loss": 1.7313, + "step": 25091 + }, + { + "epoch": 7.701657458563536, + "grad_norm": 0.21110820770263672, + "learning_rate": 1.322833682129756e-05, + "loss": 1.6758, + "step": 25092 + }, + { + "epoch": 7.701964395334561, + "grad_norm": 0.18439972400665283, + "learning_rate": 1.322496896574511e-05, + "loss": 1.737, + "step": 25093 + }, + { + "epoch": 7.702271332105586, + "grad_norm": 0.18655124306678772, + "learning_rate": 1.322160147362509e-05, + "loss": 1.7268, + "step": 25094 + }, + { + "epoch": 7.702578268876612, + "grad_norm": 0.17620640993118286, + "learning_rate": 1.3218234344970725e-05, + "loss": 1.6829, + "step": 25095 + }, + { + "epoch": 7.702885205647637, + "grad_norm": 0.19085893034934998, + "learning_rate": 1.3214867579815343e-05, + "loss": 1.7382, + "step": 25096 + }, + { + "epoch": 7.703192142418661, + "grad_norm": 0.2206689864397049, + "learning_rate": 1.3211501178192203e-05, + "loss": 1.7666, + "step": 25097 + }, + { + "epoch": 7.703499079189687, + "grad_norm": 0.2047509402036667, + "learning_rate": 1.320813514013457e-05, + "loss": 1.7209, + "step": 25098 + }, + { + "epoch": 7.703806015960712, + "grad_norm": 0.22249147295951843, + "learning_rate": 1.3204769465675709e-05, + "loss": 1.8067, + "step": 25099 + }, + { + "epoch": 7.704112952731737, + "grad_norm": 0.16225707530975342, + "learning_rate": 1.3201404154848885e-05, + "loss": 1.6715, + "step": 25100 + }, + { + "epoch": 7.704419889502763, + "grad_norm": 0.19165070354938507, + "learning_rate": 1.3198039207687352e-05, + "loss": 1.7233, + "step": 25101 + }, + { + "epoch": 7.704726826273788, + 
"grad_norm": 0.18720564246177673, + "learning_rate": 1.3194674624224368e-05, + "loss": 1.7129, + "step": 25102 + }, + { + "epoch": 7.7050337630448125, + "grad_norm": 0.16703814268112183, + "learning_rate": 1.3191310404493163e-05, + "loss": 1.7314, + "step": 25103 + }, + { + "epoch": 7.705340699815838, + "grad_norm": 0.20206168293952942, + "learning_rate": 1.3187946548527036e-05, + "loss": 1.7278, + "step": 25104 + }, + { + "epoch": 7.705647636586863, + "grad_norm": 0.1774030476808548, + "learning_rate": 1.3184583056359163e-05, + "loss": 1.6986, + "step": 25105 + }, + { + "epoch": 7.7059545733578885, + "grad_norm": 0.1729336827993393, + "learning_rate": 1.3181219928022853e-05, + "loss": 1.7251, + "step": 25106 + }, + { + "epoch": 7.706261510128914, + "grad_norm": 0.23351258039474487, + "learning_rate": 1.3177857163551276e-05, + "loss": 1.7311, + "step": 25107 + }, + { + "epoch": 7.706568446899938, + "grad_norm": 0.2041054517030716, + "learning_rate": 1.3174494762977713e-05, + "loss": 1.7122, + "step": 25108 + }, + { + "epoch": 7.706875383670964, + "grad_norm": 0.178013876080513, + "learning_rate": 1.3171132726335373e-05, + "loss": 1.7255, + "step": 25109 + }, + { + "epoch": 7.707182320441989, + "grad_norm": 0.19265221059322357, + "learning_rate": 1.3167771053657491e-05, + "loss": 1.6747, + "step": 25110 + }, + { + "epoch": 7.707489257213014, + "grad_norm": 0.18968601524829865, + "learning_rate": 1.3164409744977297e-05, + "loss": 1.71, + "step": 25111 + }, + { + "epoch": 7.70779619398404, + "grad_norm": 0.17041562497615814, + "learning_rate": 1.3161048800327963e-05, + "loss": 1.7202, + "step": 25112 + }, + { + "epoch": 7.708103130755065, + "grad_norm": 0.20094618201255798, + "learning_rate": 1.3157688219742754e-05, + "loss": 1.7375, + "step": 25113 + }, + { + "epoch": 7.708410067526089, + "grad_norm": 0.14012686908245087, + "learning_rate": 1.3154328003254862e-05, + "loss": 1.6426, + "step": 25114 + }, + { + "epoch": 7.708717004297115, + "grad_norm": 
0.18826791644096375, + "learning_rate": 1.3150968150897497e-05, + "loss": 1.7114, + "step": 25115 + }, + { + "epoch": 7.70902394106814, + "grad_norm": 0.15521864593029022, + "learning_rate": 1.3147608662703864e-05, + "loss": 1.7031, + "step": 25116 + }, + { + "epoch": 7.709330877839165, + "grad_norm": 0.19424815475940704, + "learning_rate": 1.314424953870716e-05, + "loss": 1.6815, + "step": 25117 + }, + { + "epoch": 7.70963781461019, + "grad_norm": 0.30089494585990906, + "learning_rate": 1.3140890778940584e-05, + "loss": 1.7444, + "step": 25118 + }, + { + "epoch": 7.709944751381215, + "grad_norm": 0.1784239560365677, + "learning_rate": 1.3137532383437334e-05, + "loss": 1.6659, + "step": 25119 + }, + { + "epoch": 7.7102516881522405, + "grad_norm": 0.18670935928821564, + "learning_rate": 1.3134174352230571e-05, + "loss": 1.7007, + "step": 25120 + }, + { + "epoch": 7.710558624923266, + "grad_norm": 0.21140475571155548, + "learning_rate": 1.3130816685353541e-05, + "loss": 1.7716, + "step": 25121 + }, + { + "epoch": 7.710865561694291, + "grad_norm": 0.20546187460422516, + "learning_rate": 1.3127459382839363e-05, + "loss": 1.6434, + "step": 25122 + }, + { + "epoch": 7.7111724984653165, + "grad_norm": 0.15188902616500854, + "learning_rate": 1.312410244472127e-05, + "loss": 1.6843, + "step": 25123 + }, + { + "epoch": 7.711479435236341, + "grad_norm": 0.2020019143819809, + "learning_rate": 1.3120745871032375e-05, + "loss": 1.6846, + "step": 25124 + }, + { + "epoch": 7.711786372007366, + "grad_norm": 0.19839881360530853, + "learning_rate": 1.3117389661805907e-05, + "loss": 1.7026, + "step": 25125 + }, + { + "epoch": 7.712093308778392, + "grad_norm": 0.19400818645954132, + "learning_rate": 1.311403381707501e-05, + "loss": 1.705, + "step": 25126 + }, + { + "epoch": 7.712400245549417, + "grad_norm": 0.21366959810256958, + "learning_rate": 1.311067833687285e-05, + "loss": 1.7184, + "step": 25127 + }, + { + "epoch": 7.712707182320442, + "grad_norm": 0.17402227222919464, + 
"learning_rate": 1.3107323221232604e-05, + "loss": 1.6613, + "step": 25128 + }, + { + "epoch": 7.713014119091467, + "grad_norm": 0.24356254935264587, + "learning_rate": 1.3103968470187384e-05, + "loss": 1.7343, + "step": 25129 + }, + { + "epoch": 7.713321055862492, + "grad_norm": 0.18612951040267944, + "learning_rate": 1.3100614083770386e-05, + "loss": 1.7298, + "step": 25130 + }, + { + "epoch": 7.713627992633517, + "grad_norm": 0.27073535323143005, + "learning_rate": 1.3097260062014743e-05, + "loss": 1.7554, + "step": 25131 + }, + { + "epoch": 7.713934929404543, + "grad_norm": 0.1498921662569046, + "learning_rate": 1.309390640495361e-05, + "loss": 1.6506, + "step": 25132 + }, + { + "epoch": 7.714241866175568, + "grad_norm": 0.2159748524427414, + "learning_rate": 1.309055311262013e-05, + "loss": 1.6549, + "step": 25133 + }, + { + "epoch": 7.714548802946593, + "grad_norm": 0.2060365229845047, + "learning_rate": 1.3087200185047433e-05, + "loss": 1.7224, + "step": 25134 + }, + { + "epoch": 7.714855739717618, + "grad_norm": 0.22525639832019806, + "learning_rate": 1.3083847622268659e-05, + "loss": 1.7508, + "step": 25135 + }, + { + "epoch": 7.715162676488643, + "grad_norm": 0.20023567974567413, + "learning_rate": 1.3080495424316936e-05, + "loss": 1.7277, + "step": 25136 + }, + { + "epoch": 7.7154696132596685, + "grad_norm": 0.19702760875225067, + "learning_rate": 1.3077143591225389e-05, + "loss": 1.7291, + "step": 25137 + }, + { + "epoch": 7.715776550030694, + "grad_norm": 0.1713123917579651, + "learning_rate": 1.3073792123027173e-05, + "loss": 1.689, + "step": 25138 + }, + { + "epoch": 7.716083486801719, + "grad_norm": 0.17696695029735565, + "learning_rate": 1.3070441019755358e-05, + "loss": 1.6816, + "step": 25139 + }, + { + "epoch": 7.716390423572744, + "grad_norm": 0.1802004724740982, + "learning_rate": 1.3067090281443122e-05, + "loss": 1.754, + "step": 25140 + }, + { + "epoch": 7.716697360343769, + "grad_norm": 0.1829070895910263, + "learning_rate": 
1.3063739908123518e-05, + "loss": 1.7389, + "step": 25141 + }, + { + "epoch": 7.717004297114794, + "grad_norm": 0.16842049360275269, + "learning_rate": 1.30603898998297e-05, + "loss": 1.7257, + "step": 25142 + }, + { + "epoch": 7.71731123388582, + "grad_norm": 0.18215791881084442, + "learning_rate": 1.305704025659476e-05, + "loss": 1.6765, + "step": 25143 + }, + { + "epoch": 7.717618170656845, + "grad_norm": 0.16992273926734924, + "learning_rate": 1.3053690978451799e-05, + "loss": 1.6729, + "step": 25144 + }, + { + "epoch": 7.71792510742787, + "grad_norm": 0.1847899854183197, + "learning_rate": 1.3050342065433935e-05, + "loss": 1.6972, + "step": 25145 + }, + { + "epoch": 7.718232044198895, + "grad_norm": 0.18730273842811584, + "learning_rate": 1.3046993517574219e-05, + "loss": 1.6996, + "step": 25146 + }, + { + "epoch": 7.71853898096992, + "grad_norm": 0.1695355772972107, + "learning_rate": 1.304364533490578e-05, + "loss": 1.7581, + "step": 25147 + }, + { + "epoch": 7.718845917740945, + "grad_norm": 0.17106328904628754, + "learning_rate": 1.3040297517461709e-05, + "loss": 1.6479, + "step": 25148 + }, + { + "epoch": 7.719152854511971, + "grad_norm": 0.1726374626159668, + "learning_rate": 1.3036950065275072e-05, + "loss": 1.7078, + "step": 25149 + }, + { + "epoch": 7.719459791282996, + "grad_norm": 0.21725010871887207, + "learning_rate": 1.3033602978378962e-05, + "loss": 1.8195, + "step": 25150 + }, + { + "epoch": 7.7197667280540205, + "grad_norm": 0.24786241352558136, + "learning_rate": 1.3030256256806455e-05, + "loss": 1.7439, + "step": 25151 + }, + { + "epoch": 7.720073664825046, + "grad_norm": 0.16550323367118835, + "learning_rate": 1.3026909900590622e-05, + "loss": 1.7267, + "step": 25152 + }, + { + "epoch": 7.720380601596071, + "grad_norm": 0.1833605021238327, + "learning_rate": 1.3023563909764542e-05, + "loss": 1.6675, + "step": 25153 + }, + { + "epoch": 7.7206875383670965, + "grad_norm": 0.16360491514205933, + "learning_rate": 1.3020218284361268e-05, + 
"loss": 1.684, + "step": 25154 + }, + { + "epoch": 7.720994475138122, + "grad_norm": 0.20423299074172974, + "learning_rate": 1.3016873024413878e-05, + "loss": 1.708, + "step": 25155 + }, + { + "epoch": 7.721301411909147, + "grad_norm": 0.1743123084306717, + "learning_rate": 1.301352812995541e-05, + "loss": 1.7497, + "step": 25156 + }, + { + "epoch": 7.721608348680172, + "grad_norm": 0.237883523106575, + "learning_rate": 1.301018360101896e-05, + "loss": 1.6859, + "step": 25157 + }, + { + "epoch": 7.721915285451197, + "grad_norm": 0.17953886091709137, + "learning_rate": 1.300683943763753e-05, + "loss": 1.6948, + "step": 25158 + }, + { + "epoch": 7.722222222222222, + "grad_norm": 0.19036953151226044, + "learning_rate": 1.3003495639844209e-05, + "loss": 1.7207, + "step": 25159 + }, + { + "epoch": 7.722529158993248, + "grad_norm": 0.17385275661945343, + "learning_rate": 1.3000152207672028e-05, + "loss": 1.7088, + "step": 25160 + }, + { + "epoch": 7.722836095764272, + "grad_norm": 0.1848379373550415, + "learning_rate": 1.2996809141154031e-05, + "loss": 1.7351, + "step": 25161 + }, + { + "epoch": 7.723143032535297, + "grad_norm": 0.1964390128850937, + "learning_rate": 1.2993466440323271e-05, + "loss": 1.7243, + "step": 25162 + }, + { + "epoch": 7.723449969306323, + "grad_norm": 0.23729266226291656, + "learning_rate": 1.299012410521273e-05, + "loss": 1.7588, + "step": 25163 + }, + { + "epoch": 7.723756906077348, + "grad_norm": 0.16980098187923431, + "learning_rate": 1.2986782135855496e-05, + "loss": 1.7092, + "step": 25164 + }, + { + "epoch": 7.724063842848373, + "grad_norm": 0.1993054747581482, + "learning_rate": 1.2983440532284568e-05, + "loss": 1.7245, + "step": 25165 + }, + { + "epoch": 7.724370779619399, + "grad_norm": 0.18817138671875, + "learning_rate": 1.2980099294532982e-05, + "loss": 1.7019, + "step": 25166 + }, + { + "epoch": 7.724677716390423, + "grad_norm": 0.20675966143608093, + "learning_rate": 1.297675842263375e-05, + "loss": 1.6949, + "step": 25167 + }, + 
{ + "epoch": 7.7249846531614486, + "grad_norm": 0.21214626729488373, + "learning_rate": 1.2973417916619895e-05, + "loss": 1.7056, + "step": 25168 + }, + { + "epoch": 7.725291589932474, + "grad_norm": 0.1676976978778839, + "learning_rate": 1.2970077776524426e-05, + "loss": 1.7183, + "step": 25169 + }, + { + "epoch": 7.725598526703499, + "grad_norm": 0.2368413507938385, + "learning_rate": 1.2966738002380347e-05, + "loss": 1.7868, + "step": 25170 + }, + { + "epoch": 7.725905463474525, + "grad_norm": 0.22054153680801392, + "learning_rate": 1.2963398594220672e-05, + "loss": 1.7214, + "step": 25171 + }, + { + "epoch": 7.726212400245549, + "grad_norm": 0.20026426017284393, + "learning_rate": 1.2960059552078402e-05, + "loss": 1.7703, + "step": 25172 + }, + { + "epoch": 7.726519337016574, + "grad_norm": 0.1900193840265274, + "learning_rate": 1.2956720875986516e-05, + "loss": 1.7513, + "step": 25173 + }, + { + "epoch": 7.7268262737876, + "grad_norm": 0.17151880264282227, + "learning_rate": 1.2953382565978057e-05, + "loss": 1.7382, + "step": 25174 + }, + { + "epoch": 7.727133210558625, + "grad_norm": 0.2654723525047302, + "learning_rate": 1.2950044622085955e-05, + "loss": 1.7526, + "step": 25175 + }, + { + "epoch": 7.72744014732965, + "grad_norm": 0.19927532970905304, + "learning_rate": 1.2946707044343259e-05, + "loss": 1.7208, + "step": 25176 + }, + { + "epoch": 7.727747084100676, + "grad_norm": 0.3037160038948059, + "learning_rate": 1.2943369832782887e-05, + "loss": 1.8081, + "step": 25177 + }, + { + "epoch": 7.7280540208717, + "grad_norm": 0.20067723095417023, + "learning_rate": 1.2940032987437873e-05, + "loss": 1.685, + "step": 25178 + }, + { + "epoch": 7.7283609576427255, + "grad_norm": 0.16820429265499115, + "learning_rate": 1.2936696508341189e-05, + "loss": 1.7328, + "step": 25179 + }, + { + "epoch": 7.728667894413751, + "grad_norm": 0.15474672615528107, + "learning_rate": 1.2933360395525763e-05, + "loss": 1.708, + "step": 25180 + }, + { + "epoch": 7.728974831184776, + 
"grad_norm": 0.17825615406036377, + "learning_rate": 1.2930024649024609e-05, + "loss": 1.7416, + "step": 25181 + }, + { + "epoch": 7.7292817679558015, + "grad_norm": 0.20498061180114746, + "learning_rate": 1.292668926887068e-05, + "loss": 1.736, + "step": 25182 + }, + { + "epoch": 7.729588704726826, + "grad_norm": 0.22965869307518005, + "learning_rate": 1.2923354255096937e-05, + "loss": 1.7167, + "step": 25183 + }, + { + "epoch": 7.729895641497851, + "grad_norm": 0.1687164008617401, + "learning_rate": 1.2920019607736338e-05, + "loss": 1.6988, + "step": 25184 + }, + { + "epoch": 7.730202578268877, + "grad_norm": 0.18255390226840973, + "learning_rate": 1.2916685326821842e-05, + "loss": 1.6891, + "step": 25185 + }, + { + "epoch": 7.730509515039902, + "grad_norm": 0.1519697606563568, + "learning_rate": 1.2913351412386393e-05, + "loss": 1.6553, + "step": 25186 + }, + { + "epoch": 7.730816451810927, + "grad_norm": 0.19137845933437347, + "learning_rate": 1.2910017864462942e-05, + "loss": 1.7246, + "step": 25187 + }, + { + "epoch": 7.731123388581953, + "grad_norm": 0.19998718798160553, + "learning_rate": 1.2906684683084436e-05, + "loss": 1.7324, + "step": 25188 + }, + { + "epoch": 7.731430325352977, + "grad_norm": 0.18066956102848053, + "learning_rate": 1.2903351868283808e-05, + "loss": 1.7299, + "step": 25189 + }, + { + "epoch": 7.731737262124002, + "grad_norm": 0.18489640951156616, + "learning_rate": 1.290001942009399e-05, + "loss": 1.7249, + "step": 25190 + }, + { + "epoch": 7.732044198895028, + "grad_norm": 0.14994095265865326, + "learning_rate": 1.2896687338547958e-05, + "loss": 1.6466, + "step": 25191 + }, + { + "epoch": 7.732351135666053, + "grad_norm": 0.19937917590141296, + "learning_rate": 1.2893355623678571e-05, + "loss": 1.7298, + "step": 25192 + }, + { + "epoch": 7.7326580724370775, + "grad_norm": 0.1435725837945938, + "learning_rate": 1.2890024275518826e-05, + "loss": 1.7384, + "step": 25193 + }, + { + "epoch": 7.732965009208103, + "grad_norm": 
0.23283594846725464, + "learning_rate": 1.2886693294101582e-05, + "loss": 1.7765, + "step": 25194 + }, + { + "epoch": 7.733271945979128, + "grad_norm": 0.15489891171455383, + "learning_rate": 1.2883362679459803e-05, + "loss": 1.6911, + "step": 25195 + }, + { + "epoch": 7.7335788827501535, + "grad_norm": 0.17880970239639282, + "learning_rate": 1.2880032431626404e-05, + "loss": 1.6557, + "step": 25196 + }, + { + "epoch": 7.733885819521179, + "grad_norm": 0.1717783808708191, + "learning_rate": 1.287670255063425e-05, + "loss": 1.7112, + "step": 25197 + }, + { + "epoch": 7.734192756292204, + "grad_norm": 0.17371709644794464, + "learning_rate": 1.2873373036516313e-05, + "loss": 1.7591, + "step": 25198 + }, + { + "epoch": 7.734499693063229, + "grad_norm": 0.15894445776939392, + "learning_rate": 1.2870043889305432e-05, + "loss": 1.6615, + "step": 25199 + }, + { + "epoch": 7.734806629834254, + "grad_norm": 0.17047199606895447, + "learning_rate": 1.2866715109034554e-05, + "loss": 1.7376, + "step": 25200 + }, + { + "epoch": 7.735113566605279, + "grad_norm": 0.17434459924697876, + "learning_rate": 1.2863386695736562e-05, + "loss": 1.6871, + "step": 25201 + }, + { + "epoch": 7.735420503376305, + "grad_norm": 0.18515460193157196, + "learning_rate": 1.2860058649444351e-05, + "loss": 1.7475, + "step": 25202 + }, + { + "epoch": 7.73572744014733, + "grad_norm": 0.1510036140680313, + "learning_rate": 1.2856730970190806e-05, + "loss": 1.7101, + "step": 25203 + }, + { + "epoch": 7.736034376918354, + "grad_norm": 0.1886061728000641, + "learning_rate": 1.2853403658008817e-05, + "loss": 1.7253, + "step": 25204 + }, + { + "epoch": 7.73634131368938, + "grad_norm": 0.15830372273921967, + "learning_rate": 1.2850076712931269e-05, + "loss": 1.7024, + "step": 25205 + }, + { + "epoch": 7.736648250460405, + "grad_norm": 0.3030432462692261, + "learning_rate": 1.2846750134991031e-05, + "loss": 1.7702, + "step": 25206 + }, + { + "epoch": 7.73695518723143, + "grad_norm": 0.1946970373392105, + 
"learning_rate": 1.2843423924220977e-05, + "loss": 1.7199, + "step": 25207 + }, + { + "epoch": 7.737262124002456, + "grad_norm": 0.19842801988124847, + "learning_rate": 1.2840098080654012e-05, + "loss": 1.7435, + "step": 25208 + }, + { + "epoch": 7.737569060773481, + "grad_norm": 0.17269715666770935, + "learning_rate": 1.2836772604322945e-05, + "loss": 1.6837, + "step": 25209 + }, + { + "epoch": 7.7378759975445055, + "grad_norm": 0.14366893470287323, + "learning_rate": 1.2833447495260703e-05, + "loss": 1.6453, + "step": 25210 + }, + { + "epoch": 7.738182934315531, + "grad_norm": 0.2189856618642807, + "learning_rate": 1.283012275350009e-05, + "loss": 1.7341, + "step": 25211 + }, + { + "epoch": 7.738489871086556, + "grad_norm": 0.14334678649902344, + "learning_rate": 1.2826798379074007e-05, + "loss": 1.6505, + "step": 25212 + }, + { + "epoch": 7.7387968078575815, + "grad_norm": 0.2020469605922699, + "learning_rate": 1.2823474372015304e-05, + "loss": 1.7915, + "step": 25213 + }, + { + "epoch": 7.739103744628607, + "grad_norm": 0.14702250063419342, + "learning_rate": 1.2820150732356783e-05, + "loss": 1.6682, + "step": 25214 + }, + { + "epoch": 7.739410681399631, + "grad_norm": 0.2310563623905182, + "learning_rate": 1.281682746013136e-05, + "loss": 1.7447, + "step": 25215 + }, + { + "epoch": 7.739717618170657, + "grad_norm": 0.16534216701984406, + "learning_rate": 1.2813504555371808e-05, + "loss": 1.6641, + "step": 25216 + }, + { + "epoch": 7.740024554941682, + "grad_norm": 0.1390565037727356, + "learning_rate": 1.2810182018111012e-05, + "loss": 1.6912, + "step": 25217 + }, + { + "epoch": 7.740331491712707, + "grad_norm": 0.16568928956985474, + "learning_rate": 1.2806859848381797e-05, + "loss": 1.7375, + "step": 25218 + }, + { + "epoch": 7.740638428483733, + "grad_norm": 0.18870174884796143, + "learning_rate": 1.2803538046216995e-05, + "loss": 1.7158, + "step": 25219 + }, + { + "epoch": 7.740945365254758, + "grad_norm": 0.18347607553005219, + "learning_rate": 
1.2800216611649429e-05, + "loss": 1.7766, + "step": 25220 + }, + { + "epoch": 7.741252302025782, + "grad_norm": 0.21285377442836761, + "learning_rate": 1.2796895544711929e-05, + "loss": 1.6876, + "step": 25221 + }, + { + "epoch": 7.741559238796808, + "grad_norm": 0.26524603366851807, + "learning_rate": 1.2793574845437311e-05, + "loss": 1.6679, + "step": 25222 + }, + { + "epoch": 7.741866175567833, + "grad_norm": 0.1671147346496582, + "learning_rate": 1.2790254513858397e-05, + "loss": 1.6853, + "step": 25223 + }, + { + "epoch": 7.742173112338858, + "grad_norm": 0.21713866293430328, + "learning_rate": 1.2786934550007979e-05, + "loss": 1.8124, + "step": 25224 + }, + { + "epoch": 7.742480049109884, + "grad_norm": 0.17161360383033752, + "learning_rate": 1.2783614953918916e-05, + "loss": 1.6862, + "step": 25225 + }, + { + "epoch": 7.742786985880908, + "grad_norm": 0.1513087898492813, + "learning_rate": 1.2780295725623947e-05, + "loss": 1.6644, + "step": 25226 + }, + { + "epoch": 7.7430939226519335, + "grad_norm": 0.13013005256652832, + "learning_rate": 1.2776976865155948e-05, + "loss": 1.6612, + "step": 25227 + }, + { + "epoch": 7.743400859422959, + "grad_norm": 0.15204063057899475, + "learning_rate": 1.2773658372547648e-05, + "loss": 1.6391, + "step": 25228 + }, + { + "epoch": 7.743707796193984, + "grad_norm": 0.15421196818351746, + "learning_rate": 1.2770340247831891e-05, + "loss": 1.7005, + "step": 25229 + }, + { + "epoch": 7.7440147329650095, + "grad_norm": 0.14045587182044983, + "learning_rate": 1.276702249104147e-05, + "loss": 1.6448, + "step": 25230 + }, + { + "epoch": 7.744321669736035, + "grad_norm": 0.17244049906730652, + "learning_rate": 1.2763705102209123e-05, + "loss": 1.6737, + "step": 25231 + }, + { + "epoch": 7.744628606507059, + "grad_norm": 0.16891124844551086, + "learning_rate": 1.2760388081367697e-05, + "loss": 1.6625, + "step": 25232 + }, + { + "epoch": 7.744935543278085, + "grad_norm": 0.18271134793758392, + "learning_rate": 1.275707142854991e-05, + 
"loss": 1.6963, + "step": 25233 + }, + { + "epoch": 7.74524248004911, + "grad_norm": 0.18582625687122345, + "learning_rate": 1.2753755143788593e-05, + "loss": 1.6731, + "step": 25234 + }, + { + "epoch": 7.745549416820135, + "grad_norm": 0.17610707879066467, + "learning_rate": 1.2750439227116495e-05, + "loss": 1.6976, + "step": 25235 + }, + { + "epoch": 7.74585635359116, + "grad_norm": 0.20406337082386017, + "learning_rate": 1.2747123678566391e-05, + "loss": 1.7287, + "step": 25236 + }, + { + "epoch": 7.746163290362185, + "grad_norm": 0.16879913210868835, + "learning_rate": 1.2743808498171046e-05, + "loss": 1.6594, + "step": 25237 + }, + { + "epoch": 7.74647022713321, + "grad_norm": 0.1405191272497177, + "learning_rate": 1.2740493685963217e-05, + "loss": 1.6565, + "step": 25238 + }, + { + "epoch": 7.746777163904236, + "grad_norm": 0.1460784375667572, + "learning_rate": 1.2737179241975671e-05, + "loss": 1.6336, + "step": 25239 + }, + { + "epoch": 7.747084100675261, + "grad_norm": 0.16206084191799164, + "learning_rate": 1.273386516624116e-05, + "loss": 1.7501, + "step": 25240 + }, + { + "epoch": 7.747391037446286, + "grad_norm": 0.17040394246578217, + "learning_rate": 1.2730551458792422e-05, + "loss": 1.7532, + "step": 25241 + }, + { + "epoch": 7.747697974217311, + "grad_norm": 0.15487439930438995, + "learning_rate": 1.2727238119662243e-05, + "loss": 1.6757, + "step": 25242 + }, + { + "epoch": 7.748004910988336, + "grad_norm": 0.139495387673378, + "learning_rate": 1.272392514888332e-05, + "loss": 1.6431, + "step": 25243 + }, + { + "epoch": 7.7483118477593615, + "grad_norm": 0.16329489648342133, + "learning_rate": 1.2720612546488447e-05, + "loss": 1.7353, + "step": 25244 + }, + { + "epoch": 7.748618784530387, + "grad_norm": 0.14997398853302002, + "learning_rate": 1.27173003125103e-05, + "loss": 1.6977, + "step": 25245 + }, + { + "epoch": 7.748925721301412, + "grad_norm": 0.2005717009305954, + "learning_rate": 1.2713988446981656e-05, + "loss": 1.757, + "step": 25246 + 
}, + { + "epoch": 7.749232658072437, + "grad_norm": 0.2027040272951126, + "learning_rate": 1.2710676949935246e-05, + "loss": 1.7506, + "step": 25247 + }, + { + "epoch": 7.749539594843462, + "grad_norm": 0.18176981806755066, + "learning_rate": 1.2707365821403755e-05, + "loss": 1.7132, + "step": 25248 + }, + { + "epoch": 7.749846531614487, + "grad_norm": 0.18690772354602814, + "learning_rate": 1.2704055061419961e-05, + "loss": 1.7725, + "step": 25249 + }, + { + "epoch": 7.750153468385513, + "grad_norm": 0.18360945582389832, + "learning_rate": 1.270074467001653e-05, + "loss": 1.6779, + "step": 25250 + }, + { + "epoch": 7.750460405156538, + "grad_norm": 0.18498149514198303, + "learning_rate": 1.269743464722621e-05, + "loss": 1.7105, + "step": 25251 + }, + { + "epoch": 7.750767341927563, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.2694124993081707e-05, + "loss": 1.7273, + "step": 25252 + }, + { + "epoch": 7.751074278698588, + "grad_norm": 0.17312094569206238, + "learning_rate": 1.2690815707615727e-05, + "loss": 1.7532, + "step": 25253 + }, + { + "epoch": 7.751381215469613, + "grad_norm": 0.18758632242679596, + "learning_rate": 1.2687506790860976e-05, + "loss": 1.7394, + "step": 25254 + }, + { + "epoch": 7.7516881522406385, + "grad_norm": 0.1642044633626938, + "learning_rate": 1.2684198242850149e-05, + "loss": 1.6699, + "step": 25255 + }, + { + "epoch": 7.751995089011664, + "grad_norm": 0.34566664695739746, + "learning_rate": 1.2680890063615947e-05, + "loss": 1.7048, + "step": 25256 + }, + { + "epoch": 7.752302025782689, + "grad_norm": 0.15046556293964386, + "learning_rate": 1.2677582253191066e-05, + "loss": 1.659, + "step": 25257 + }, + { + "epoch": 7.752608962553714, + "grad_norm": 0.1504966914653778, + "learning_rate": 1.2674274811608171e-05, + "loss": 1.6841, + "step": 25258 + }, + { + "epoch": 7.752915899324739, + "grad_norm": 0.2226656973361969, + "learning_rate": 1.2670967738900009e-05, + "loss": 1.7139, + "step": 25259 + }, + { + "epoch": 
7.753222836095764, + "grad_norm": 0.18797673285007477, + "learning_rate": 1.2667661035099188e-05, + "loss": 1.7726, + "step": 25260 + }, + { + "epoch": 7.75352977286679, + "grad_norm": 0.15428531169891357, + "learning_rate": 1.266435470023845e-05, + "loss": 1.6831, + "step": 25261 + }, + { + "epoch": 7.753836709637815, + "grad_norm": 0.20027057826519012, + "learning_rate": 1.2661048734350412e-05, + "loss": 1.741, + "step": 25262 + }, + { + "epoch": 7.75414364640884, + "grad_norm": 0.14779487252235413, + "learning_rate": 1.2657743137467793e-05, + "loss": 1.6974, + "step": 25263 + }, + { + "epoch": 7.754450583179865, + "grad_norm": 0.17618241906166077, + "learning_rate": 1.2654437909623258e-05, + "loss": 1.7374, + "step": 25264 + }, + { + "epoch": 7.75475751995089, + "grad_norm": 0.18769881129264832, + "learning_rate": 1.2651133050849423e-05, + "loss": 1.7241, + "step": 25265 + }, + { + "epoch": 7.755064456721915, + "grad_norm": 0.18645870685577393, + "learning_rate": 1.2647828561179015e-05, + "loss": 1.7176, + "step": 25266 + }, + { + "epoch": 7.755371393492941, + "grad_norm": 0.17507290840148926, + "learning_rate": 1.2644524440644628e-05, + "loss": 1.6994, + "step": 25267 + }, + { + "epoch": 7.755678330263965, + "grad_norm": 0.15264524519443512, + "learning_rate": 1.264122068927896e-05, + "loss": 1.6993, + "step": 25268 + }, + { + "epoch": 7.7559852670349905, + "grad_norm": 0.1749732941389084, + "learning_rate": 1.263791730711465e-05, + "loss": 1.7265, + "step": 25269 + }, + { + "epoch": 7.756292203806016, + "grad_norm": 0.15777049958705902, + "learning_rate": 1.2634614294184332e-05, + "loss": 1.6219, + "step": 25270 + }, + { + "epoch": 7.756599140577041, + "grad_norm": 0.17740310728549957, + "learning_rate": 1.263131165052066e-05, + "loss": 1.7373, + "step": 25271 + }, + { + "epoch": 7.7569060773480665, + "grad_norm": 0.22577044367790222, + "learning_rate": 1.262800937615627e-05, + "loss": 1.7492, + "step": 25272 + }, + { + "epoch": 7.757213014119092, + 
"grad_norm": 0.155413419008255, + "learning_rate": 1.2624707471123791e-05, + "loss": 1.7037, + "step": 25273 + }, + { + "epoch": 7.757519950890116, + "grad_norm": 0.1755802482366562, + "learning_rate": 1.2621405935455866e-05, + "loss": 1.7057, + "step": 25274 + }, + { + "epoch": 7.757826887661142, + "grad_norm": 0.15870101749897003, + "learning_rate": 1.2618104769185096e-05, + "loss": 1.6951, + "step": 25275 + }, + { + "epoch": 7.758133824432167, + "grad_norm": 0.18285419046878815, + "learning_rate": 1.2614803972344158e-05, + "loss": 1.7443, + "step": 25276 + }, + { + "epoch": 7.758440761203192, + "grad_norm": 0.1669059544801712, + "learning_rate": 1.2611503544965609e-05, + "loss": 1.6442, + "step": 25277 + }, + { + "epoch": 7.758747697974218, + "grad_norm": 0.17830590903759003, + "learning_rate": 1.2608203487082121e-05, + "loss": 1.7432, + "step": 25278 + }, + { + "epoch": 7.759054634745242, + "grad_norm": 0.18318989872932434, + "learning_rate": 1.2604903798726259e-05, + "loss": 1.7128, + "step": 25279 + }, + { + "epoch": 7.759361571516267, + "grad_norm": 0.17735294997692108, + "learning_rate": 1.2601604479930663e-05, + "loss": 1.6719, + "step": 25280 + }, + { + "epoch": 7.759668508287293, + "grad_norm": 0.14324752986431122, + "learning_rate": 1.2598305530727949e-05, + "loss": 1.688, + "step": 25281 + }, + { + "epoch": 7.759975445058318, + "grad_norm": 0.17677859961986542, + "learning_rate": 1.2595006951150678e-05, + "loss": 1.7016, + "step": 25282 + }, + { + "epoch": 7.760282381829343, + "grad_norm": 0.16832831501960754, + "learning_rate": 1.2591708741231495e-05, + "loss": 1.6669, + "step": 25283 + }, + { + "epoch": 7.760589318600369, + "grad_norm": 0.20717547833919525, + "learning_rate": 1.2588410901002944e-05, + "loss": 1.7275, + "step": 25284 + }, + { + "epoch": 7.760896255371393, + "grad_norm": 0.2471853792667389, + "learning_rate": 1.2585113430497658e-05, + "loss": 1.779, + "step": 25285 + }, + { + "epoch": 7.7612031921424185, + "grad_norm": 
0.2646878957748413, + "learning_rate": 1.2581816329748214e-05, + "loss": 1.8003, + "step": 25286 + }, + { + "epoch": 7.761510128913444, + "grad_norm": 0.2102949321269989, + "learning_rate": 1.2578519598787191e-05, + "loss": 1.764, + "step": 25287 + }, + { + "epoch": 7.761817065684469, + "grad_norm": 0.16151423752307892, + "learning_rate": 1.2575223237647171e-05, + "loss": 1.7233, + "step": 25288 + }, + { + "epoch": 7.7621240024554945, + "grad_norm": 0.22221817076206207, + "learning_rate": 1.2571927246360727e-05, + "loss": 1.7485, + "step": 25289 + }, + { + "epoch": 7.762430939226519, + "grad_norm": 0.16470851004123688, + "learning_rate": 1.2568631624960441e-05, + "loss": 1.6844, + "step": 25290 + }, + { + "epoch": 7.762737875997544, + "grad_norm": 0.17529261112213135, + "learning_rate": 1.256533637347887e-05, + "loss": 1.7409, + "step": 25291 + }, + { + "epoch": 7.76304481276857, + "grad_norm": 0.19055718183517456, + "learning_rate": 1.2562041491948579e-05, + "loss": 1.6861, + "step": 25292 + }, + { + "epoch": 7.763351749539595, + "grad_norm": 0.19183041155338287, + "learning_rate": 1.2558746980402159e-05, + "loss": 1.7493, + "step": 25293 + }, + { + "epoch": 7.76365868631062, + "grad_norm": 0.20031596720218658, + "learning_rate": 1.2555452838872123e-05, + "loss": 1.705, + "step": 25294 + }, + { + "epoch": 7.763965623081646, + "grad_norm": 0.16234149038791656, + "learning_rate": 1.2552159067391072e-05, + "loss": 1.7407, + "step": 25295 + }, + { + "epoch": 7.76427255985267, + "grad_norm": 0.15412569046020508, + "learning_rate": 1.254886566599151e-05, + "loss": 1.6599, + "step": 25296 + }, + { + "epoch": 7.764579496623695, + "grad_norm": 0.17393885552883148, + "learning_rate": 1.2545572634706022e-05, + "loss": 1.7372, + "step": 25297 + }, + { + "epoch": 7.764886433394721, + "grad_norm": 0.18662036955356598, + "learning_rate": 1.254227997356715e-05, + "loss": 1.7681, + "step": 25298 + }, + { + "epoch": 7.765193370165746, + "grad_norm": 0.16661690175533295, + 
"learning_rate": 1.2538987682607395e-05, + "loss": 1.754, + "step": 25299 + }, + { + "epoch": 7.765500306936771, + "grad_norm": 0.21453191339969635, + "learning_rate": 1.253569576185935e-05, + "loss": 1.7802, + "step": 25300 + }, + { + "epoch": 7.765807243707796, + "grad_norm": 0.14639903604984283, + "learning_rate": 1.2532404211355486e-05, + "loss": 1.6478, + "step": 25301 + }, + { + "epoch": 7.766114180478821, + "grad_norm": 0.17430682480335236, + "learning_rate": 1.2529113031128382e-05, + "loss": 1.687, + "step": 25302 + }, + { + "epoch": 7.7664211172498465, + "grad_norm": 0.21582552790641785, + "learning_rate": 1.2525822221210543e-05, + "loss": 1.7723, + "step": 25303 + }, + { + "epoch": 7.766728054020872, + "grad_norm": 0.21142803132534027, + "learning_rate": 1.2522531781634495e-05, + "loss": 1.7986, + "step": 25304 + }, + { + "epoch": 7.767034990791897, + "grad_norm": 0.1637791097164154, + "learning_rate": 1.251924171243275e-05, + "loss": 1.6884, + "step": 25305 + }, + { + "epoch": 7.7673419275629225, + "grad_norm": 0.19218359887599945, + "learning_rate": 1.2515952013637832e-05, + "loss": 1.7972, + "step": 25306 + }, + { + "epoch": 7.767648864333947, + "grad_norm": 0.14534975588321686, + "learning_rate": 1.2512662685282245e-05, + "loss": 1.6602, + "step": 25307 + }, + { + "epoch": 7.767955801104972, + "grad_norm": 0.2955080568790436, + "learning_rate": 1.2509373727398494e-05, + "loss": 1.763, + "step": 25308 + }, + { + "epoch": 7.768262737875998, + "grad_norm": 0.17220059037208557, + "learning_rate": 1.2506085140019086e-05, + "loss": 1.672, + "step": 25309 + }, + { + "epoch": 7.768569674647023, + "grad_norm": 0.17092043161392212, + "learning_rate": 1.2502796923176524e-05, + "loss": 1.7014, + "step": 25310 + }, + { + "epoch": 7.768876611418047, + "grad_norm": 0.2363509237766266, + "learning_rate": 1.2499509076903288e-05, + "loss": 1.7489, + "step": 25311 + }, + { + "epoch": 7.769183548189073, + "grad_norm": 0.19223156571388245, + "learning_rate": 
1.2496221601231906e-05, + "loss": 1.7194, + "step": 25312 + }, + { + "epoch": 7.769490484960098, + "grad_norm": 0.18292652070522308, + "learning_rate": 1.249293449619483e-05, + "loss": 1.7422, + "step": 25313 + }, + { + "epoch": 7.769797421731123, + "grad_norm": 0.17120866477489471, + "learning_rate": 1.2489647761824547e-05, + "loss": 1.7367, + "step": 25314 + }, + { + "epoch": 7.770104358502149, + "grad_norm": 0.22178049385547638, + "learning_rate": 1.248636139815358e-05, + "loss": 1.7451, + "step": 25315 + }, + { + "epoch": 7.770411295273174, + "grad_norm": 0.15707750618457794, + "learning_rate": 1.2483075405214346e-05, + "loss": 1.6748, + "step": 25316 + }, + { + "epoch": 7.7707182320441985, + "grad_norm": 0.1570693850517273, + "learning_rate": 1.2479789783039381e-05, + "loss": 1.6895, + "step": 25317 + }, + { + "epoch": 7.771025168815224, + "grad_norm": 0.1687897890806198, + "learning_rate": 1.2476504531661093e-05, + "loss": 1.7145, + "step": 25318 + }, + { + "epoch": 7.771332105586249, + "grad_norm": 0.16047275066375732, + "learning_rate": 1.2473219651112e-05, + "loss": 1.6675, + "step": 25319 + }, + { + "epoch": 7.7716390423572745, + "grad_norm": 0.16817785799503326, + "learning_rate": 1.2469935141424544e-05, + "loss": 1.6678, + "step": 25320 + }, + { + "epoch": 7.7719459791283, + "grad_norm": 0.1511528342962265, + "learning_rate": 1.246665100263118e-05, + "loss": 1.7054, + "step": 25321 + }, + { + "epoch": 7.772252915899324, + "grad_norm": 0.145367830991745, + "learning_rate": 1.2463367234764373e-05, + "loss": 1.7037, + "step": 25322 + }, + { + "epoch": 7.77255985267035, + "grad_norm": 0.1794048696756363, + "learning_rate": 1.2460083837856573e-05, + "loss": 1.7372, + "step": 25323 + }, + { + "epoch": 7.772866789441375, + "grad_norm": 0.21238376200199127, + "learning_rate": 1.2456800811940227e-05, + "loss": 1.7796, + "step": 25324 + }, + { + "epoch": 7.7731737262124, + "grad_norm": 0.23305723071098328, + "learning_rate": 1.2453518157047784e-05, + "loss": 
1.7124, + "step": 25325 + }, + { + "epoch": 7.773480662983426, + "grad_norm": 0.18229269981384277, + "learning_rate": 1.2450235873211673e-05, + "loss": 1.7202, + "step": 25326 + }, + { + "epoch": 7.773787599754451, + "grad_norm": 0.19145874679088593, + "learning_rate": 1.2446953960464346e-05, + "loss": 1.6701, + "step": 25327 + }, + { + "epoch": 7.774094536525475, + "grad_norm": 0.26310765743255615, + "learning_rate": 1.2443672418838215e-05, + "loss": 1.7674, + "step": 25328 + }, + { + "epoch": 7.774401473296501, + "grad_norm": 0.18370535969734192, + "learning_rate": 1.2440391248365756e-05, + "loss": 1.7027, + "step": 25329 + }, + { + "epoch": 7.774708410067526, + "grad_norm": 0.24704128503799438, + "learning_rate": 1.2437110449079348e-05, + "loss": 1.7238, + "step": 25330 + }, + { + "epoch": 7.7750153468385514, + "grad_norm": 0.194215789437294, + "learning_rate": 1.2433830021011433e-05, + "loss": 1.735, + "step": 25331 + }, + { + "epoch": 7.775322283609577, + "grad_norm": 0.24099037051200867, + "learning_rate": 1.2430549964194427e-05, + "loss": 1.7335, + "step": 25332 + }, + { + "epoch": 7.775629220380601, + "grad_norm": 0.1665026843547821, + "learning_rate": 1.242727027866073e-05, + "loss": 1.7245, + "step": 25333 + }, + { + "epoch": 7.775936157151627, + "grad_norm": 0.18005968630313873, + "learning_rate": 1.24239909644428e-05, + "loss": 1.6227, + "step": 25334 + }, + { + "epoch": 7.776243093922652, + "grad_norm": 0.2306728959083557, + "learning_rate": 1.2420712021572983e-05, + "loss": 1.7136, + "step": 25335 + }, + { + "epoch": 7.776550030693677, + "grad_norm": 0.1916062831878662, + "learning_rate": 1.2417433450083738e-05, + "loss": 1.7912, + "step": 25336 + }, + { + "epoch": 7.776856967464703, + "grad_norm": 0.1999555081129074, + "learning_rate": 1.2414155250007437e-05, + "loss": 1.7685, + "step": 25337 + }, + { + "epoch": 7.777163904235728, + "grad_norm": 0.18222710490226746, + "learning_rate": 1.2410877421376488e-05, + "loss": 1.7024, + "step": 25338 + }, + { 
+ "epoch": 7.777470841006752, + "grad_norm": 0.22534650564193726, + "learning_rate": 1.2407599964223276e-05, + "loss": 1.7263, + "step": 25339 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.3313053250312805, + "learning_rate": 1.2404322878580199e-05, + "loss": 1.6988, + "step": 25340 + }, + { + "epoch": 7.778084714548803, + "grad_norm": 0.23691575229167938, + "learning_rate": 1.2401046164479635e-05, + "loss": 1.7771, + "step": 25341 + }, + { + "epoch": 7.778391651319828, + "grad_norm": 0.2119995355606079, + "learning_rate": 1.2397769821953976e-05, + "loss": 1.709, + "step": 25342 + }, + { + "epoch": 7.778698588090853, + "grad_norm": 0.20468266308307648, + "learning_rate": 1.2394493851035588e-05, + "loss": 1.7914, + "step": 25343 + }, + { + "epoch": 7.779005524861878, + "grad_norm": 0.19825033843517303, + "learning_rate": 1.2391218251756854e-05, + "loss": 1.727, + "step": 25344 + }, + { + "epoch": 7.7793124616329035, + "grad_norm": 0.19072072207927704, + "learning_rate": 1.2387943024150134e-05, + "loss": 1.7498, + "step": 25345 + }, + { + "epoch": 7.779619398403929, + "grad_norm": 0.15986371040344238, + "learning_rate": 1.2384668168247832e-05, + "loss": 1.6807, + "step": 25346 + }, + { + "epoch": 7.779926335174954, + "grad_norm": 0.1731162816286087, + "learning_rate": 1.238139368408227e-05, + "loss": 1.7, + "step": 25347 + }, + { + "epoch": 7.7802332719459795, + "grad_norm": 0.1496593952178955, + "learning_rate": 1.237811957168583e-05, + "loss": 1.6558, + "step": 25348 + }, + { + "epoch": 7.780540208717004, + "grad_norm": 0.1982542872428894, + "learning_rate": 1.2374845831090859e-05, + "loss": 1.7888, + "step": 25349 + }, + { + "epoch": 7.780847145488029, + "grad_norm": 0.1517801433801651, + "learning_rate": 1.2371572462329706e-05, + "loss": 1.6743, + "step": 25350 + }, + { + "epoch": 7.781154082259055, + "grad_norm": 0.23794496059417725, + "learning_rate": 1.2368299465434752e-05, + "loss": 1.7332, + "step": 25351 + }, + { + "epoch": 7.78146101903008, + 
"grad_norm": 0.20220822095870972, + "learning_rate": 1.2365026840438288e-05, + "loss": 1.7444, + "step": 25352 + }, + { + "epoch": 7.781767955801105, + "grad_norm": 0.18997377157211304, + "learning_rate": 1.236175458737272e-05, + "loss": 1.771, + "step": 25353 + }, + { + "epoch": 7.78207489257213, + "grad_norm": 0.15465202927589417, + "learning_rate": 1.2358482706270325e-05, + "loss": 1.7072, + "step": 25354 + }, + { + "epoch": 7.782381829343155, + "grad_norm": 0.1759808510541916, + "learning_rate": 1.235521119716348e-05, + "loss": 1.6761, + "step": 25355 + }, + { + "epoch": 7.78268876611418, + "grad_norm": 0.17520606517791748, + "learning_rate": 1.2351940060084505e-05, + "loss": 1.6702, + "step": 25356 + }, + { + "epoch": 7.782995702885206, + "grad_norm": 0.20305509865283966, + "learning_rate": 1.2348669295065717e-05, + "loss": 1.746, + "step": 25357 + }, + { + "epoch": 7.783302639656231, + "grad_norm": 0.14459536969661713, + "learning_rate": 1.2345398902139454e-05, + "loss": 1.6907, + "step": 25358 + }, + { + "epoch": 7.783609576427256, + "grad_norm": 0.18058347702026367, + "learning_rate": 1.2342128881338027e-05, + "loss": 1.796, + "step": 25359 + }, + { + "epoch": 7.783916513198281, + "grad_norm": 0.1778976023197174, + "learning_rate": 1.2338859232693756e-05, + "loss": 1.715, + "step": 25360 + }, + { + "epoch": 7.784223449969306, + "grad_norm": 0.1644120067358017, + "learning_rate": 1.2335589956238953e-05, + "loss": 1.6786, + "step": 25361 + }, + { + "epoch": 7.7845303867403315, + "grad_norm": 0.15315432846546173, + "learning_rate": 1.2332321052005907e-05, + "loss": 1.6503, + "step": 25362 + }, + { + "epoch": 7.784837323511357, + "grad_norm": 0.19160087406635284, + "learning_rate": 1.2329052520026973e-05, + "loss": 1.7131, + "step": 25363 + }, + { + "epoch": 7.785144260282382, + "grad_norm": 0.1778041124343872, + "learning_rate": 1.2325784360334408e-05, + "loss": 1.754, + "step": 25364 + }, + { + "epoch": 7.785451197053407, + "grad_norm": 0.17478828132152557, + 
"learning_rate": 1.2322516572960519e-05, + "loss": 1.7122, + "step": 25365 + }, + { + "epoch": 7.785758133824432, + "grad_norm": 0.2239549458026886, + "learning_rate": 1.2319249157937612e-05, + "loss": 1.7589, + "step": 25366 + }, + { + "epoch": 7.786065070595457, + "grad_norm": 0.21565821766853333, + "learning_rate": 1.2315982115297953e-05, + "loss": 1.7468, + "step": 25367 + }, + { + "epoch": 7.786372007366483, + "grad_norm": 0.1859208643436432, + "learning_rate": 1.231271544507387e-05, + "loss": 1.7289, + "step": 25368 + }, + { + "epoch": 7.786678944137508, + "grad_norm": 0.14813102781772614, + "learning_rate": 1.2309449147297596e-05, + "loss": 1.6543, + "step": 25369 + }, + { + "epoch": 7.786985880908533, + "grad_norm": 0.14101989567279816, + "learning_rate": 1.2306183222001472e-05, + "loss": 1.6775, + "step": 25370 + }, + { + "epoch": 7.787292817679558, + "grad_norm": 0.2041245847940445, + "learning_rate": 1.2302917669217701e-05, + "loss": 1.6874, + "step": 25371 + }, + { + "epoch": 7.787599754450583, + "grad_norm": 0.17343124747276306, + "learning_rate": 1.2299652488978614e-05, + "loss": 1.7005, + "step": 25372 + }, + { + "epoch": 7.787906691221608, + "grad_norm": 0.20174655318260193, + "learning_rate": 1.2296387681316451e-05, + "loss": 1.8073, + "step": 25373 + }, + { + "epoch": 7.788213627992634, + "grad_norm": 0.21615192294120789, + "learning_rate": 1.2293123246263488e-05, + "loss": 1.7045, + "step": 25374 + }, + { + "epoch": 7.788520564763659, + "grad_norm": 0.18587705492973328, + "learning_rate": 1.2289859183851981e-05, + "loss": 1.7497, + "step": 25375 + }, + { + "epoch": 7.7888275015346835, + "grad_norm": 0.16649113595485687, + "learning_rate": 1.228659549411419e-05, + "loss": 1.6695, + "step": 25376 + }, + { + "epoch": 7.789134438305709, + "grad_norm": 0.16547587513923645, + "learning_rate": 1.2283332177082362e-05, + "loss": 1.7119, + "step": 25377 + }, + { + "epoch": 7.789441375076734, + "grad_norm": 0.17672663927078247, + "learning_rate": 
1.2280069232788755e-05, + "loss": 1.7458, + "step": 25378 + }, + { + "epoch": 7.7897483118477595, + "grad_norm": 0.15436655282974243, + "learning_rate": 1.22768066612656e-05, + "loss": 1.723, + "step": 25379 + }, + { + "epoch": 7.790055248618785, + "grad_norm": 0.1699141561985016, + "learning_rate": 1.2273544462545178e-05, + "loss": 1.7083, + "step": 25380 + }, + { + "epoch": 7.79036218538981, + "grad_norm": 0.18014399707317352, + "learning_rate": 1.2270282636659686e-05, + "loss": 1.7512, + "step": 25381 + }, + { + "epoch": 7.790669122160835, + "grad_norm": 0.1807268261909485, + "learning_rate": 1.2267021183641375e-05, + "loss": 1.7404, + "step": 25382 + }, + { + "epoch": 7.79097605893186, + "grad_norm": 0.16704204678535461, + "learning_rate": 1.2263760103522481e-05, + "loss": 1.6723, + "step": 25383 + }, + { + "epoch": 7.791282995702885, + "grad_norm": 0.1551518738269806, + "learning_rate": 1.2260499396335206e-05, + "loss": 1.7, + "step": 25384 + }, + { + "epoch": 7.791589932473911, + "grad_norm": 0.16270415484905243, + "learning_rate": 1.225723906211183e-05, + "loss": 1.7238, + "step": 25385 + }, + { + "epoch": 7.791896869244935, + "grad_norm": 0.19548700749874115, + "learning_rate": 1.225397910088451e-05, + "loss": 1.7192, + "step": 25386 + }, + { + "epoch": 7.79220380601596, + "grad_norm": 0.19115851819515228, + "learning_rate": 1.225071951268552e-05, + "loss": 1.753, + "step": 25387 + }, + { + "epoch": 7.792510742786986, + "grad_norm": 0.1557070016860962, + "learning_rate": 1.224746029754702e-05, + "loss": 1.6791, + "step": 25388 + }, + { + "epoch": 7.792817679558011, + "grad_norm": 0.16580358147621155, + "learning_rate": 1.2244201455501252e-05, + "loss": 1.6799, + "step": 25389 + }, + { + "epoch": 7.793124616329036, + "grad_norm": 0.18099573254585266, + "learning_rate": 1.2240942986580422e-05, + "loss": 1.7546, + "step": 25390 + }, + { + "epoch": 7.793431553100062, + "grad_norm": 0.2411479502916336, + "learning_rate": 1.223768489081672e-05, + "loss": 1.7315, 
+ "step": 25391 + }, + { + "epoch": 7.793738489871086, + "grad_norm": 0.14678087830543518, + "learning_rate": 1.2234427168242351e-05, + "loss": 1.6733, + "step": 25392 + }, + { + "epoch": 7.7940454266421115, + "grad_norm": 0.17501497268676758, + "learning_rate": 1.223116981888951e-05, + "loss": 1.7416, + "step": 25393 + }, + { + "epoch": 7.794352363413137, + "grad_norm": 0.25460878014564514, + "learning_rate": 1.2227912842790384e-05, + "loss": 1.7873, + "step": 25394 + }, + { + "epoch": 7.794659300184162, + "grad_norm": 0.1701650321483612, + "learning_rate": 1.2224656239977161e-05, + "loss": 1.686, + "step": 25395 + }, + { + "epoch": 7.7949662369551875, + "grad_norm": 0.15684448182582855, + "learning_rate": 1.2221400010482009e-05, + "loss": 1.6768, + "step": 25396 + }, + { + "epoch": 7.795273173726212, + "grad_norm": 0.19048964977264404, + "learning_rate": 1.2218144154337158e-05, + "loss": 1.744, + "step": 25397 + }, + { + "epoch": 7.795580110497237, + "grad_norm": 0.20939184725284576, + "learning_rate": 1.2214888671574737e-05, + "loss": 1.818, + "step": 25398 + }, + { + "epoch": 7.795887047268263, + "grad_norm": 0.18450765311717987, + "learning_rate": 1.2211633562226932e-05, + "loss": 1.6972, + "step": 25399 + }, + { + "epoch": 7.796193984039288, + "grad_norm": 0.20349545776844025, + "learning_rate": 1.2208378826325912e-05, + "loss": 1.7784, + "step": 25400 + }, + { + "epoch": 7.796500920810313, + "grad_norm": 0.17835615575313568, + "learning_rate": 1.2205124463903828e-05, + "loss": 1.7203, + "step": 25401 + }, + { + "epoch": 7.796807857581339, + "grad_norm": 0.1525154411792755, + "learning_rate": 1.2201870474992882e-05, + "loss": 1.7194, + "step": 25402 + }, + { + "epoch": 7.797114794352363, + "grad_norm": 0.15197598934173584, + "learning_rate": 1.2198616859625184e-05, + "loss": 1.6787, + "step": 25403 + }, + { + "epoch": 7.797421731123388, + "grad_norm": 0.1602524071931839, + "learning_rate": 1.2195363617832934e-05, + "loss": 1.6919, + "step": 25404 + }, + { + 
"epoch": 7.797728667894414, + "grad_norm": 0.15638625621795654, + "learning_rate": 1.2192110749648233e-05, + "loss": 1.6945, + "step": 25405 + }, + { + "epoch": 7.798035604665439, + "grad_norm": 0.15247012674808502, + "learning_rate": 1.2188858255103264e-05, + "loss": 1.673, + "step": 25406 + }, + { + "epoch": 7.798342541436464, + "grad_norm": 0.16753807663917542, + "learning_rate": 1.218560613423016e-05, + "loss": 1.7088, + "step": 25407 + }, + { + "epoch": 7.798649478207489, + "grad_norm": 0.17434635758399963, + "learning_rate": 1.2182354387061063e-05, + "loss": 1.7279, + "step": 25408 + }, + { + "epoch": 7.798956414978514, + "grad_norm": 0.21984371542930603, + "learning_rate": 1.2179103013628108e-05, + "loss": 1.7203, + "step": 25409 + }, + { + "epoch": 7.7992633517495396, + "grad_norm": 0.18304525315761566, + "learning_rate": 1.2175852013963418e-05, + "loss": 1.6937, + "step": 25410 + }, + { + "epoch": 7.799570288520565, + "grad_norm": 0.20372866094112396, + "learning_rate": 1.2172601388099131e-05, + "loss": 1.6911, + "step": 25411 + }, + { + "epoch": 7.79987722529159, + "grad_norm": 0.2012174129486084, + "learning_rate": 1.216935113606737e-05, + "loss": 1.7365, + "step": 25412 + }, + { + "epoch": 7.800184162062616, + "grad_norm": 0.2146923542022705, + "learning_rate": 1.2166101257900236e-05, + "loss": 1.711, + "step": 25413 + }, + { + "epoch": 7.80049109883364, + "grad_norm": 0.202762633562088, + "learning_rate": 1.2162851753629895e-05, + "loss": 1.7459, + "step": 25414 + }, + { + "epoch": 7.800798035604665, + "grad_norm": 0.19161204993724823, + "learning_rate": 1.2159602623288418e-05, + "loss": 1.687, + "step": 25415 + }, + { + "epoch": 7.801104972375691, + "grad_norm": 0.2027188539505005, + "learning_rate": 1.2156353866907927e-05, + "loss": 1.7482, + "step": 25416 + }, + { + "epoch": 7.801411909146716, + "grad_norm": 0.17790403962135315, + "learning_rate": 1.2153105484520521e-05, + "loss": 1.7047, + "step": 25417 + }, + { + "epoch": 7.8017188459177405, + 
"grad_norm": 0.18325060606002808, + "learning_rate": 1.21498574761583e-05, + "loss": 1.693, + "step": 25418 + }, + { + "epoch": 7.802025782688766, + "grad_norm": 0.14223991334438324, + "learning_rate": 1.2146609841853401e-05, + "loss": 1.7168, + "step": 25419 + }, + { + "epoch": 7.802332719459791, + "grad_norm": 0.18397340178489685, + "learning_rate": 1.2143362581637863e-05, + "loss": 1.7234, + "step": 25420 + }, + { + "epoch": 7.8026396562308165, + "grad_norm": 0.16903668642044067, + "learning_rate": 1.214011569554383e-05, + "loss": 1.6884, + "step": 25421 + }, + { + "epoch": 7.802946593001842, + "grad_norm": 0.15086103975772858, + "learning_rate": 1.2136869183603339e-05, + "loss": 1.6712, + "step": 25422 + }, + { + "epoch": 7.803253529772867, + "grad_norm": 0.1743185818195343, + "learning_rate": 1.2133623045848507e-05, + "loss": 1.7167, + "step": 25423 + }, + { + "epoch": 7.803560466543892, + "grad_norm": 0.160976842045784, + "learning_rate": 1.2130377282311411e-05, + "loss": 1.7749, + "step": 25424 + }, + { + "epoch": 7.803867403314917, + "grad_norm": 0.2554323971271515, + "learning_rate": 1.2127131893024123e-05, + "loss": 1.7156, + "step": 25425 + }, + { + "epoch": 7.804174340085942, + "grad_norm": 0.1582731157541275, + "learning_rate": 1.2123886878018714e-05, + "loss": 1.7088, + "step": 25426 + }, + { + "epoch": 7.804481276856968, + "grad_norm": 0.18008622527122498, + "learning_rate": 1.2120642237327257e-05, + "loss": 1.6928, + "step": 25427 + }, + { + "epoch": 7.804788213627993, + "grad_norm": 0.29349491000175476, + "learning_rate": 1.2117397970981815e-05, + "loss": 1.7596, + "step": 25428 + }, + { + "epoch": 7.805095150399017, + "grad_norm": 0.20927627384662628, + "learning_rate": 1.211415407901445e-05, + "loss": 1.7113, + "step": 25429 + }, + { + "epoch": 7.805402087170043, + "grad_norm": 0.2126142680644989, + "learning_rate": 1.21109105614572e-05, + "loss": 1.7125, + "step": 25430 + }, + { + "epoch": 7.805709023941068, + "grad_norm": 0.20456665754318237, + 
"learning_rate": 1.2107667418342172e-05, + "loss": 1.7619, + "step": 25431 + }, + { + "epoch": 7.806015960712093, + "grad_norm": 0.17268066108226776, + "learning_rate": 1.2104424649701373e-05, + "loss": 1.6462, + "step": 25432 + }, + { + "epoch": 7.806322897483119, + "grad_norm": 0.16213946044445038, + "learning_rate": 1.2101182255566856e-05, + "loss": 1.6787, + "step": 25433 + }, + { + "epoch": 7.806629834254144, + "grad_norm": 0.17202046513557434, + "learning_rate": 1.2097940235970673e-05, + "loss": 1.7081, + "step": 25434 + }, + { + "epoch": 7.8069367710251685, + "grad_norm": 0.2076229751110077, + "learning_rate": 1.2094698590944842e-05, + "loss": 1.6832, + "step": 25435 + }, + { + "epoch": 7.807243707796194, + "grad_norm": 0.17209482192993164, + "learning_rate": 1.2091457320521448e-05, + "loss": 1.7722, + "step": 25436 + }, + { + "epoch": 7.807550644567219, + "grad_norm": 0.2185208946466446, + "learning_rate": 1.2088216424732463e-05, + "loss": 1.7536, + "step": 25437 + }, + { + "epoch": 7.8078575813382445, + "grad_norm": 0.1812329739332199, + "learning_rate": 1.2084975903609968e-05, + "loss": 1.7275, + "step": 25438 + }, + { + "epoch": 7.80816451810927, + "grad_norm": 0.20143690705299377, + "learning_rate": 1.208173575718594e-05, + "loss": 1.7533, + "step": 25439 + }, + { + "epoch": 7.808471454880294, + "grad_norm": 0.18351776897907257, + "learning_rate": 1.2078495985492433e-05, + "loss": 1.6831, + "step": 25440 + }, + { + "epoch": 7.80877839165132, + "grad_norm": 0.15470999479293823, + "learning_rate": 1.2075256588561462e-05, + "loss": 1.6862, + "step": 25441 + }, + { + "epoch": 7.809085328422345, + "grad_norm": 0.1751607209444046, + "learning_rate": 1.2072017566425032e-05, + "loss": 1.7182, + "step": 25442 + }, + { + "epoch": 7.80939226519337, + "grad_norm": 0.16465237736701965, + "learning_rate": 1.2068778919115153e-05, + "loss": 1.7055, + "step": 25443 + }, + { + "epoch": 7.809699201964396, + "grad_norm": 0.13899528980255127, + "learning_rate": 
1.2065540646663832e-05, + "loss": 1.634, + "step": 25444 + }, + { + "epoch": 7.810006138735421, + "grad_norm": 0.21526047587394714, + "learning_rate": 1.2062302749103072e-05, + "loss": 1.759, + "step": 25445 + }, + { + "epoch": 7.810313075506445, + "grad_norm": 0.1628599315881729, + "learning_rate": 1.2059065226464872e-05, + "loss": 1.6782, + "step": 25446 + }, + { + "epoch": 7.810620012277471, + "grad_norm": 0.16853751242160797, + "learning_rate": 1.2055828078781217e-05, + "loss": 1.7123, + "step": 25447 + }, + { + "epoch": 7.810926949048496, + "grad_norm": 0.17399325966835022, + "learning_rate": 1.2052591306084138e-05, + "loss": 1.7394, + "step": 25448 + }, + { + "epoch": 7.811233885819521, + "grad_norm": 0.16147997975349426, + "learning_rate": 1.2049354908405574e-05, + "loss": 1.66, + "step": 25449 + }, + { + "epoch": 7.811540822590547, + "grad_norm": 0.1806066632270813, + "learning_rate": 1.204611888577753e-05, + "loss": 1.7193, + "step": 25450 + }, + { + "epoch": 7.811847759361571, + "grad_norm": 0.14491340517997742, + "learning_rate": 1.2042883238231984e-05, + "loss": 1.6996, + "step": 25451 + }, + { + "epoch": 7.8121546961325965, + "grad_norm": 0.24257591366767883, + "learning_rate": 1.2039647965800905e-05, + "loss": 1.734, + "step": 25452 + }, + { + "epoch": 7.812461632903622, + "grad_norm": 0.17281031608581543, + "learning_rate": 1.2036413068516295e-05, + "loss": 1.7469, + "step": 25453 + }, + { + "epoch": 7.812768569674647, + "grad_norm": 0.16350387036800385, + "learning_rate": 1.2033178546410073e-05, + "loss": 1.6755, + "step": 25454 + }, + { + "epoch": 7.8130755064456725, + "grad_norm": 0.21092571318149567, + "learning_rate": 1.202994439951427e-05, + "loss": 1.7538, + "step": 25455 + }, + { + "epoch": 7.813382443216698, + "grad_norm": 0.13705989718437195, + "learning_rate": 1.2026710627860777e-05, + "loss": 1.6563, + "step": 25456 + }, + { + "epoch": 7.813689379987722, + "grad_norm": 0.2368711531162262, + "learning_rate": 1.20234772314816e-05, + "loss": 
1.7685, + "step": 25457 + }, + { + "epoch": 7.813996316758748, + "grad_norm": 0.19303718209266663, + "learning_rate": 1.2020244210408682e-05, + "loss": 1.7286, + "step": 25458 + }, + { + "epoch": 7.814303253529773, + "grad_norm": 0.17113862931728363, + "learning_rate": 1.2017011564673974e-05, + "loss": 1.6336, + "step": 25459 + }, + { + "epoch": 7.814610190300798, + "grad_norm": 0.2151467204093933, + "learning_rate": 1.2013779294309418e-05, + "loss": 1.7585, + "step": 25460 + }, + { + "epoch": 7.814917127071823, + "grad_norm": 0.21620413661003113, + "learning_rate": 1.2010547399346961e-05, + "loss": 1.7058, + "step": 25461 + }, + { + "epoch": 7.815224063842848, + "grad_norm": 0.20134735107421875, + "learning_rate": 1.2007315879818537e-05, + "loss": 1.7833, + "step": 25462 + }, + { + "epoch": 7.815531000613873, + "grad_norm": 0.16653650999069214, + "learning_rate": 1.2004084735756088e-05, + "loss": 1.7022, + "step": 25463 + }, + { + "epoch": 7.815837937384899, + "grad_norm": 0.2135760486125946, + "learning_rate": 1.2000853967191527e-05, + "loss": 1.7502, + "step": 25464 + }, + { + "epoch": 7.816144874155924, + "grad_norm": 0.19773945212364197, + "learning_rate": 1.199762357415683e-05, + "loss": 1.7369, + "step": 25465 + }, + { + "epoch": 7.816451810926949, + "grad_norm": 0.1873825341463089, + "learning_rate": 1.1994393556683876e-05, + "loss": 1.6921, + "step": 25466 + }, + { + "epoch": 7.816758747697974, + "grad_norm": 0.19304445385932922, + "learning_rate": 1.1991163914804604e-05, + "loss": 1.6934, + "step": 25467 + }, + { + "epoch": 7.817065684468999, + "grad_norm": 0.16338905692100525, + "learning_rate": 1.1987934648550924e-05, + "loss": 1.6523, + "step": 25468 + }, + { + "epoch": 7.8173726212400245, + "grad_norm": 0.16972069442272186, + "learning_rate": 1.198470575795474e-05, + "loss": 1.6907, + "step": 25469 + }, + { + "epoch": 7.81767955801105, + "grad_norm": 0.17251834273338318, + "learning_rate": 1.1981477243048e-05, + "loss": 1.7336, + "step": 25470 + }, + 
{ + "epoch": 7.817986494782075, + "grad_norm": 0.17767611145973206, + "learning_rate": 1.197824910386256e-05, + "loss": 1.6809, + "step": 25471 + }, + { + "epoch": 7.8182934315531, + "grad_norm": 0.1854296773672104, + "learning_rate": 1.197502134043038e-05, + "loss": 1.6938, + "step": 25472 + }, + { + "epoch": 7.818600368324125, + "grad_norm": 0.15811395645141602, + "learning_rate": 1.1971793952783295e-05, + "loss": 1.6346, + "step": 25473 + }, + { + "epoch": 7.81890730509515, + "grad_norm": 0.1668241322040558, + "learning_rate": 1.196856694095324e-05, + "loss": 1.7014, + "step": 25474 + }, + { + "epoch": 7.819214241866176, + "grad_norm": 0.16705112159252167, + "learning_rate": 1.1965340304972105e-05, + "loss": 1.7509, + "step": 25475 + }, + { + "epoch": 7.819521178637201, + "grad_norm": 0.1737189143896103, + "learning_rate": 1.1962114044871764e-05, + "loss": 1.6934, + "step": 25476 + }, + { + "epoch": 7.819828115408226, + "grad_norm": 0.21887148916721344, + "learning_rate": 1.1958888160684112e-05, + "loss": 1.7163, + "step": 25477 + }, + { + "epoch": 7.820135052179251, + "grad_norm": 0.19267810881137848, + "learning_rate": 1.1955662652441018e-05, + "loss": 1.6941, + "step": 25478 + }, + { + "epoch": 7.820441988950276, + "grad_norm": 0.19797572493553162, + "learning_rate": 1.195243752017437e-05, + "loss": 1.7067, + "step": 25479 + }, + { + "epoch": 7.820748925721301, + "grad_norm": 0.20177066326141357, + "learning_rate": 1.1949212763916035e-05, + "loss": 1.7186, + "step": 25480 + }, + { + "epoch": 7.821055862492327, + "grad_norm": 0.1789240539073944, + "learning_rate": 1.1945988383697876e-05, + "loss": 1.7533, + "step": 25481 + }, + { + "epoch": 7.821362799263352, + "grad_norm": 0.2210909128189087, + "learning_rate": 1.1942764379551769e-05, + "loss": 1.7255, + "step": 25482 + }, + { + "epoch": 7.8216697360343765, + "grad_norm": 0.17705149948596954, + "learning_rate": 1.193954075150957e-05, + "loss": 1.6797, + "step": 25483 + }, + { + "epoch": 7.821976672805402, + 
"grad_norm": 0.17962488532066345, + "learning_rate": 1.1936317499603134e-05, + "loss": 1.7134, + "step": 25484 + }, + { + "epoch": 7.822283609576427, + "grad_norm": 0.2144375741481781, + "learning_rate": 1.193309462386432e-05, + "loss": 1.6837, + "step": 25485 + }, + { + "epoch": 7.8225905463474525, + "grad_norm": 0.19018805027008057, + "learning_rate": 1.1929872124324976e-05, + "loss": 1.7377, + "step": 25486 + }, + { + "epoch": 7.822897483118478, + "grad_norm": 0.2281246781349182, + "learning_rate": 1.1926650001016953e-05, + "loss": 1.755, + "step": 25487 + }, + { + "epoch": 7.823204419889503, + "grad_norm": 0.17724375426769257, + "learning_rate": 1.1923428253972069e-05, + "loss": 1.7018, + "step": 25488 + }, + { + "epoch": 7.823511356660528, + "grad_norm": 0.19313837587833405, + "learning_rate": 1.1920206883222218e-05, + "loss": 1.705, + "step": 25489 + }, + { + "epoch": 7.823818293431553, + "grad_norm": 0.1883455514907837, + "learning_rate": 1.191698588879917e-05, + "loss": 1.66, + "step": 25490 + }, + { + "epoch": 7.824125230202578, + "grad_norm": 0.20110155642032623, + "learning_rate": 1.1913765270734805e-05, + "loss": 1.7456, + "step": 25491 + }, + { + "epoch": 7.824432166973604, + "grad_norm": 0.23234841227531433, + "learning_rate": 1.1910545029060938e-05, + "loss": 1.6987, + "step": 25492 + }, + { + "epoch": 7.824739103744628, + "grad_norm": 0.208989679813385, + "learning_rate": 1.1907325163809386e-05, + "loss": 1.7753, + "step": 25493 + }, + { + "epoch": 7.8250460405156534, + "grad_norm": 0.19063059985637665, + "learning_rate": 1.1904105675011972e-05, + "loss": 1.6664, + "step": 25494 + }, + { + "epoch": 7.825352977286679, + "grad_norm": 0.16878041625022888, + "learning_rate": 1.1900886562700519e-05, + "loss": 1.6886, + "step": 25495 + }, + { + "epoch": 7.825659914057704, + "grad_norm": 0.19139298796653748, + "learning_rate": 1.1897667826906834e-05, + "loss": 1.7195, + "step": 25496 + }, + { + "epoch": 7.8259668508287294, + "grad_norm": 0.255795419216156, 
+ "learning_rate": 1.1894449467662728e-05, + "loss": 1.7835, + "step": 25497 + }, + { + "epoch": 7.826273787599755, + "grad_norm": 0.17967084050178528, + "learning_rate": 1.1891231485000004e-05, + "loss": 1.6959, + "step": 25498 + }, + { + "epoch": 7.82658072437078, + "grad_norm": 0.23582984507083893, + "learning_rate": 1.1888013878950471e-05, + "loss": 1.7252, + "step": 25499 + }, + { + "epoch": 7.826887661141805, + "grad_norm": 0.189914271235466, + "learning_rate": 1.188479664954592e-05, + "loss": 1.7216, + "step": 25500 + }, + { + "epoch": 7.82719459791283, + "grad_norm": 0.19840605556964874, + "learning_rate": 1.1881579796818148e-05, + "loss": 1.714, + "step": 25501 + }, + { + "epoch": 7.827501534683855, + "grad_norm": 0.25255537033081055, + "learning_rate": 1.1878363320798946e-05, + "loss": 1.7008, + "step": 25502 + }, + { + "epoch": 7.827808471454881, + "grad_norm": 0.1863456666469574, + "learning_rate": 1.1875147221520105e-05, + "loss": 1.7804, + "step": 25503 + }, + { + "epoch": 7.828115408225905, + "grad_norm": 0.2700684368610382, + "learning_rate": 1.1871931499013405e-05, + "loss": 1.6756, + "step": 25504 + }, + { + "epoch": 7.82842234499693, + "grad_norm": 0.19838537275791168, + "learning_rate": 1.1868716153310604e-05, + "loss": 1.6828, + "step": 25505 + }, + { + "epoch": 7.828729281767956, + "grad_norm": 0.1896767020225525, + "learning_rate": 1.1865501184443533e-05, + "loss": 1.7014, + "step": 25506 + }, + { + "epoch": 7.829036218538981, + "grad_norm": 0.2330249398946762, + "learning_rate": 1.1862286592443905e-05, + "loss": 1.7509, + "step": 25507 + }, + { + "epoch": 7.829343155310006, + "grad_norm": 0.17078560590744019, + "learning_rate": 1.1859072377343539e-05, + "loss": 1.6742, + "step": 25508 + }, + { + "epoch": 7.829650092081032, + "grad_norm": 0.2834900915622711, + "learning_rate": 1.1855858539174146e-05, + "loss": 1.7676, + "step": 25509 + }, + { + "epoch": 7.829957028852056, + "grad_norm": 0.18936461210250854, + "learning_rate": 
1.1852645077967533e-05, + "loss": 1.7374, + "step": 25510 + }, + { + "epoch": 7.8302639656230815, + "grad_norm": 0.2720448970794678, + "learning_rate": 1.1849431993755439e-05, + "loss": 1.7001, + "step": 25511 + }, + { + "epoch": 7.830570902394107, + "grad_norm": 0.18198262155056, + "learning_rate": 1.184621928656962e-05, + "loss": 1.6679, + "step": 25512 + }, + { + "epoch": 7.830877839165132, + "grad_norm": 0.16957701742649078, + "learning_rate": 1.1843006956441821e-05, + "loss": 1.7064, + "step": 25513 + }, + { + "epoch": 7.8311847759361575, + "grad_norm": 0.18632464110851288, + "learning_rate": 1.1839795003403798e-05, + "loss": 1.6857, + "step": 25514 + }, + { + "epoch": 7.831491712707182, + "grad_norm": 0.15639352798461914, + "learning_rate": 1.183658342748728e-05, + "loss": 1.695, + "step": 25515 + }, + { + "epoch": 7.831798649478207, + "grad_norm": 0.17000986635684967, + "learning_rate": 1.1833372228724016e-05, + "loss": 1.696, + "step": 25516 + }, + { + "epoch": 7.832105586249233, + "grad_norm": 0.23334810137748718, + "learning_rate": 1.1830161407145735e-05, + "loss": 1.7574, + "step": 25517 + }, + { + "epoch": 7.832412523020258, + "grad_norm": 0.16260294616222382, + "learning_rate": 1.1826950962784177e-05, + "loss": 1.667, + "step": 25518 + }, + { + "epoch": 7.832719459791283, + "grad_norm": 0.18244150280952454, + "learning_rate": 1.1823740895671059e-05, + "loss": 1.6836, + "step": 25519 + }, + { + "epoch": 7.833026396562309, + "grad_norm": 0.18404243886470795, + "learning_rate": 1.182053120583811e-05, + "loss": 1.6922, + "step": 25520 + }, + { + "epoch": 7.833333333333333, + "grad_norm": 0.22713635861873627, + "learning_rate": 1.1817321893317052e-05, + "loss": 1.8055, + "step": 25521 + }, + { + "epoch": 7.833640270104358, + "grad_norm": 0.14314736425876617, + "learning_rate": 1.1814112958139577e-05, + "loss": 1.6624, + "step": 25522 + }, + { + "epoch": 7.833947206875384, + "grad_norm": 0.1947709321975708, + "learning_rate": 1.1810904400337458e-05, + 
"loss": 1.8108, + "step": 25523 + }, + { + "epoch": 7.834254143646409, + "grad_norm": 0.1811491698026657, + "learning_rate": 1.1807696219942326e-05, + "loss": 1.7258, + "step": 25524 + }, + { + "epoch": 7.834561080417434, + "grad_norm": 0.16776522994041443, + "learning_rate": 1.1804488416985966e-05, + "loss": 1.6834, + "step": 25525 + }, + { + "epoch": 7.834868017188459, + "grad_norm": 0.1590484231710434, + "learning_rate": 1.1801280991500002e-05, + "loss": 1.6797, + "step": 25526 + }, + { + "epoch": 7.835174953959484, + "grad_norm": 0.1564435064792633, + "learning_rate": 1.179807394351618e-05, + "loss": 1.7035, + "step": 25527 + }, + { + "epoch": 7.8354818907305095, + "grad_norm": 0.17740637063980103, + "learning_rate": 1.1794867273066184e-05, + "loss": 1.6844, + "step": 25528 + }, + { + "epoch": 7.835788827501535, + "grad_norm": 0.17152990400791168, + "learning_rate": 1.1791660980181707e-05, + "loss": 1.6745, + "step": 25529 + }, + { + "epoch": 7.83609576427256, + "grad_norm": 0.17763324081897736, + "learning_rate": 1.1788455064894427e-05, + "loss": 1.6941, + "step": 25530 + }, + { + "epoch": 7.8364027010435855, + "grad_norm": 0.16168560087680817, + "learning_rate": 1.178524952723603e-05, + "loss": 1.6955, + "step": 25531 + }, + { + "epoch": 7.83670963781461, + "grad_norm": 0.1819266527891159, + "learning_rate": 1.1782044367238199e-05, + "loss": 1.6838, + "step": 25532 + }, + { + "epoch": 7.837016574585635, + "grad_norm": 0.16239593923091888, + "learning_rate": 1.1778839584932605e-05, + "loss": 1.7045, + "step": 25533 + }, + { + "epoch": 7.837323511356661, + "grad_norm": 0.18346372246742249, + "learning_rate": 1.177563518035092e-05, + "loss": 1.7418, + "step": 25534 + }, + { + "epoch": 7.837630448127686, + "grad_norm": 0.18437781929969788, + "learning_rate": 1.177243115352481e-05, + "loss": 1.7138, + "step": 25535 + }, + { + "epoch": 7.83793738489871, + "grad_norm": 0.16199420392513275, + "learning_rate": 1.1769227504485942e-05, + "loss": 1.7115, + "step": 25536 
+ }, + { + "epoch": 7.838244321669736, + "grad_norm": 0.174173504114151, + "learning_rate": 1.1766024233265977e-05, + "loss": 1.7115, + "step": 25537 + }, + { + "epoch": 7.838551258440761, + "grad_norm": 0.1924828737974167, + "learning_rate": 1.1762821339896567e-05, + "loss": 1.7343, + "step": 25538 + }, + { + "epoch": 7.838858195211786, + "grad_norm": 0.20509763062000275, + "learning_rate": 1.1759618824409357e-05, + "loss": 1.7296, + "step": 25539 + }, + { + "epoch": 7.839165131982812, + "grad_norm": 0.1762499213218689, + "learning_rate": 1.1756416686836035e-05, + "loss": 1.6721, + "step": 25540 + }, + { + "epoch": 7.839472068753837, + "grad_norm": 0.17260326445102692, + "learning_rate": 1.175321492720819e-05, + "loss": 1.7238, + "step": 25541 + }, + { + "epoch": 7.8397790055248615, + "grad_norm": 0.21378587186336517, + "learning_rate": 1.175001354555752e-05, + "loss": 1.7442, + "step": 25542 + }, + { + "epoch": 7.840085942295887, + "grad_norm": 0.20900048315525055, + "learning_rate": 1.1746812541915608e-05, + "loss": 1.7426, + "step": 25543 + }, + { + "epoch": 7.840392879066912, + "grad_norm": 0.2082734853029251, + "learning_rate": 1.1743611916314129e-05, + "loss": 1.7209, + "step": 25544 + }, + { + "epoch": 7.8406998158379375, + "grad_norm": 0.1696191281080246, + "learning_rate": 1.1740411668784701e-05, + "loss": 1.7039, + "step": 25545 + }, + { + "epoch": 7.841006752608963, + "grad_norm": 0.18812915682792664, + "learning_rate": 1.173721179935895e-05, + "loss": 1.6873, + "step": 25546 + }, + { + "epoch": 7.841313689379987, + "grad_norm": 0.19983457028865814, + "learning_rate": 1.1734012308068493e-05, + "loss": 1.701, + "step": 25547 + }, + { + "epoch": 7.841620626151013, + "grad_norm": 0.18811485171318054, + "learning_rate": 1.1730813194944962e-05, + "loss": 1.7466, + "step": 25548 + }, + { + "epoch": 7.841927562922038, + "grad_norm": 0.16648226976394653, + "learning_rate": 1.172761446001996e-05, + "loss": 1.7449, + "step": 25549 + }, + { + "epoch": 
7.842234499693063, + "grad_norm": 0.17902494966983795, + "learning_rate": 1.1724416103325104e-05, + "loss": 1.7395, + "step": 25550 + }, + { + "epoch": 7.842541436464089, + "grad_norm": 0.2420952469110489, + "learning_rate": 1.1721218124892003e-05, + "loss": 1.728, + "step": 25551 + }, + { + "epoch": 7.842848373235114, + "grad_norm": 0.16240666806697845, + "learning_rate": 1.1718020524752266e-05, + "loss": 1.6368, + "step": 25552 + }, + { + "epoch": 7.843155310006138, + "grad_norm": 0.17968396842479706, + "learning_rate": 1.1714823302937483e-05, + "loss": 1.729, + "step": 25553 + }, + { + "epoch": 7.843462246777164, + "grad_norm": 0.17617417871952057, + "learning_rate": 1.1711626459479252e-05, + "loss": 1.6975, + "step": 25554 + }, + { + "epoch": 7.843769183548189, + "grad_norm": 0.1679859161376953, + "learning_rate": 1.1708429994409176e-05, + "loss": 1.6955, + "step": 25555 + }, + { + "epoch": 7.844076120319214, + "grad_norm": 0.1653962880373001, + "learning_rate": 1.1705233907758823e-05, + "loss": 1.7107, + "step": 25556 + }, + { + "epoch": 7.84438305709024, + "grad_norm": 0.190699502825737, + "learning_rate": 1.1702038199559817e-05, + "loss": 1.75, + "step": 25557 + }, + { + "epoch": 7.844689993861264, + "grad_norm": 0.17185768485069275, + "learning_rate": 1.1698842869843696e-05, + "loss": 1.7087, + "step": 25558 + }, + { + "epoch": 7.8449969306322895, + "grad_norm": 0.17880931496620178, + "learning_rate": 1.1695647918642084e-05, + "loss": 1.7082, + "step": 25559 + }, + { + "epoch": 7.845303867403315, + "grad_norm": 0.15360671281814575, + "learning_rate": 1.1692453345986498e-05, + "loss": 1.7028, + "step": 25560 + }, + { + "epoch": 7.84561080417434, + "grad_norm": 0.16576705873012543, + "learning_rate": 1.168925915190856e-05, + "loss": 1.7147, + "step": 25561 + }, + { + "epoch": 7.8459177409453655, + "grad_norm": 0.14623773097991943, + "learning_rate": 1.1686065336439817e-05, + "loss": 1.682, + "step": 25562 + }, + { + "epoch": 7.846224677716391, + "grad_norm": 
0.16677425801753998, + "learning_rate": 1.168287189961183e-05, + "loss": 1.7089, + "step": 25563 + }, + { + "epoch": 7.846531614487415, + "grad_norm": 0.160381019115448, + "learning_rate": 1.1679678841456164e-05, + "loss": 1.6929, + "step": 25564 + }, + { + "epoch": 7.846838551258441, + "grad_norm": 0.1775302290916443, + "learning_rate": 1.1676486162004374e-05, + "loss": 1.6947, + "step": 25565 + }, + { + "epoch": 7.847145488029466, + "grad_norm": 0.1681419014930725, + "learning_rate": 1.1673293861288003e-05, + "loss": 1.7173, + "step": 25566 + }, + { + "epoch": 7.847452424800491, + "grad_norm": 0.18374401330947876, + "learning_rate": 1.1670101939338613e-05, + "loss": 1.7175, + "step": 25567 + }, + { + "epoch": 7.847759361571516, + "grad_norm": 0.19383086264133453, + "learning_rate": 1.1666910396187736e-05, + "loss": 1.6962, + "step": 25568 + }, + { + "epoch": 7.848066298342541, + "grad_norm": 0.16849574446678162, + "learning_rate": 1.1663719231866921e-05, + "loss": 1.6717, + "step": 25569 + }, + { + "epoch": 7.848373235113566, + "grad_norm": 0.2510664165019989, + "learning_rate": 1.1660528446407703e-05, + "loss": 1.7983, + "step": 25570 + }, + { + "epoch": 7.848680171884592, + "grad_norm": 0.21037714183330536, + "learning_rate": 1.1657338039841614e-05, + "loss": 1.7287, + "step": 25571 + }, + { + "epoch": 7.848987108655617, + "grad_norm": 0.15170596539974213, + "learning_rate": 1.1654148012200184e-05, + "loss": 1.7076, + "step": 25572 + }, + { + "epoch": 7.849294045426642, + "grad_norm": 0.2093864530324936, + "learning_rate": 1.1650958363514919e-05, + "loss": 1.7469, + "step": 25573 + }, + { + "epoch": 7.849600982197668, + "grad_norm": 0.15684813261032104, + "learning_rate": 1.1647769093817395e-05, + "loss": 1.6731, + "step": 25574 + }, + { + "epoch": 7.849907918968692, + "grad_norm": 0.1600468009710312, + "learning_rate": 1.1644580203139066e-05, + "loss": 1.6394, + "step": 25575 + }, + { + "epoch": 7.850214855739718, + "grad_norm": 0.1863955557346344, + 
"learning_rate": 1.1641391691511505e-05, + "loss": 1.7025, + "step": 25576 + }, + { + "epoch": 7.850521792510743, + "grad_norm": 0.189132422208786, + "learning_rate": 1.1638203558966166e-05, + "loss": 1.7095, + "step": 25577 + }, + { + "epoch": 7.850828729281768, + "grad_norm": 0.166460782289505, + "learning_rate": 1.1635015805534593e-05, + "loss": 1.6756, + "step": 25578 + }, + { + "epoch": 7.851135666052793, + "grad_norm": 0.15910424292087555, + "learning_rate": 1.1631828431248288e-05, + "loss": 1.6664, + "step": 25579 + }, + { + "epoch": 7.851442602823818, + "grad_norm": 0.14848501980304718, + "learning_rate": 1.1628641436138738e-05, + "loss": 1.6434, + "step": 25580 + }, + { + "epoch": 7.851749539594843, + "grad_norm": 0.1700928956270218, + "learning_rate": 1.1625454820237446e-05, + "loss": 1.7039, + "step": 25581 + }, + { + "epoch": 7.852056476365869, + "grad_norm": 0.17468976974487305, + "learning_rate": 1.1622268583575902e-05, + "loss": 1.7073, + "step": 25582 + }, + { + "epoch": 7.852363413136894, + "grad_norm": 0.18980912864208221, + "learning_rate": 1.1619082726185587e-05, + "loss": 1.6939, + "step": 25583 + }, + { + "epoch": 7.852670349907919, + "grad_norm": 0.1658385694026947, + "learning_rate": 1.1615897248098e-05, + "loss": 1.6892, + "step": 25584 + }, + { + "epoch": 7.852977286678944, + "grad_norm": 0.18137763440608978, + "learning_rate": 1.1612712149344612e-05, + "loss": 1.6608, + "step": 25585 + }, + { + "epoch": 7.853284223449969, + "grad_norm": 0.1642989218235016, + "learning_rate": 1.16095274299569e-05, + "loss": 1.6527, + "step": 25586 + }, + { + "epoch": 7.8535911602209945, + "grad_norm": 0.17476631700992584, + "learning_rate": 1.1606343089966343e-05, + "loss": 1.6622, + "step": 25587 + }, + { + "epoch": 7.85389809699202, + "grad_norm": 0.14995649456977844, + "learning_rate": 1.16031591294044e-05, + "loss": 1.6382, + "step": 25588 + }, + { + "epoch": 7.854205033763045, + "grad_norm": 0.16073103249073029, + "learning_rate": 
1.1599975548302549e-05, + "loss": 1.6888, + "step": 25589 + }, + { + "epoch": 7.85451197053407, + "grad_norm": 0.1630357801914215, + "learning_rate": 1.159679234669223e-05, + "loss": 1.6717, + "step": 25590 + }, + { + "epoch": 7.854818907305095, + "grad_norm": 0.1537420153617859, + "learning_rate": 1.1593609524604948e-05, + "loss": 1.6836, + "step": 25591 + }, + { + "epoch": 7.85512584407612, + "grad_norm": 0.16389401257038116, + "learning_rate": 1.1590427082072103e-05, + "loss": 1.6941, + "step": 25592 + }, + { + "epoch": 7.855432780847146, + "grad_norm": 0.24554979801177979, + "learning_rate": 1.1587245019125192e-05, + "loss": 1.8018, + "step": 25593 + }, + { + "epoch": 7.855739717618171, + "grad_norm": 0.15020978450775146, + "learning_rate": 1.1584063335795614e-05, + "loss": 1.6815, + "step": 25594 + }, + { + "epoch": 7.856046654389196, + "grad_norm": 0.1830887496471405, + "learning_rate": 1.1580882032114853e-05, + "loss": 1.7134, + "step": 25595 + }, + { + "epoch": 7.856353591160221, + "grad_norm": 0.2381841540336609, + "learning_rate": 1.157770110811433e-05, + "loss": 1.7505, + "step": 25596 + }, + { + "epoch": 7.856660527931246, + "grad_norm": 0.210253044962883, + "learning_rate": 1.1574520563825491e-05, + "loss": 1.8048, + "step": 25597 + }, + { + "epoch": 7.856967464702271, + "grad_norm": 0.15428896248340607, + "learning_rate": 1.1571340399279756e-05, + "loss": 1.6624, + "step": 25598 + }, + { + "epoch": 7.857274401473297, + "grad_norm": 0.2932582199573517, + "learning_rate": 1.1568160614508567e-05, + "loss": 1.7192, + "step": 25599 + }, + { + "epoch": 7.857581338244322, + "grad_norm": 0.19450223445892334, + "learning_rate": 1.156498120954333e-05, + "loss": 1.753, + "step": 25600 + }, + { + "epoch": 7.8578882750153465, + "grad_norm": 0.16950540244579315, + "learning_rate": 1.1561802184415482e-05, + "loss": 1.7107, + "step": 25601 + }, + { + "epoch": 7.858195211786372, + "grad_norm": 0.18616287410259247, + "learning_rate": 1.1558623539156433e-05, + "loss": 
1.6747, + "step": 25602 + }, + { + "epoch": 7.858502148557397, + "grad_norm": 0.20991890132427216, + "learning_rate": 1.1555445273797599e-05, + "loss": 1.6635, + "step": 25603 + }, + { + "epoch": 7.8588090853284225, + "grad_norm": 0.18592311441898346, + "learning_rate": 1.1552267388370386e-05, + "loss": 1.7327, + "step": 25604 + }, + { + "epoch": 7.859116022099448, + "grad_norm": 0.16478584706783295, + "learning_rate": 1.1549089882906206e-05, + "loss": 1.6523, + "step": 25605 + }, + { + "epoch": 7.859422958870473, + "grad_norm": 0.17281852662563324, + "learning_rate": 1.154591275743645e-05, + "loss": 1.7282, + "step": 25606 + }, + { + "epoch": 7.859729895641498, + "grad_norm": 0.17098689079284668, + "learning_rate": 1.1542736011992512e-05, + "loss": 1.7533, + "step": 25607 + }, + { + "epoch": 7.860036832412523, + "grad_norm": 0.1766287386417389, + "learning_rate": 1.1539559646605824e-05, + "loss": 1.6338, + "step": 25608 + }, + { + "epoch": 7.860343769183548, + "grad_norm": 0.15519756078720093, + "learning_rate": 1.1536383661307726e-05, + "loss": 1.6908, + "step": 25609 + }, + { + "epoch": 7.860650705954574, + "grad_norm": 0.18422503769397736, + "learning_rate": 1.1533208056129651e-05, + "loss": 1.6983, + "step": 25610 + }, + { + "epoch": 7.860957642725598, + "grad_norm": 0.1900123953819275, + "learning_rate": 1.1530032831102933e-05, + "loss": 1.7082, + "step": 25611 + }, + { + "epoch": 7.861264579496623, + "grad_norm": 0.15542784333229065, + "learning_rate": 1.1526857986259e-05, + "loss": 1.6979, + "step": 25612 + }, + { + "epoch": 7.861571516267649, + "grad_norm": 0.17173884809017181, + "learning_rate": 1.1523683521629197e-05, + "loss": 1.7329, + "step": 25613 + }, + { + "epoch": 7.861878453038674, + "grad_norm": 0.2399773746728897, + "learning_rate": 1.1520509437244908e-05, + "loss": 1.7224, + "step": 25614 + }, + { + "epoch": 7.862185389809699, + "grad_norm": 0.14101925492286682, + "learning_rate": 1.1517335733137502e-05, + "loss": 1.6676, + "step": 25615 + }, 
+ { + "epoch": 7.862492326580725, + "grad_norm": 0.18625333905220032, + "learning_rate": 1.1514162409338336e-05, + "loss": 1.7269, + "step": 25616 + }, + { + "epoch": 7.862799263351749, + "grad_norm": 0.18385125696659088, + "learning_rate": 1.1510989465878774e-05, + "loss": 1.7197, + "step": 25617 + }, + { + "epoch": 7.8631062001227745, + "grad_norm": 0.16189569234848022, + "learning_rate": 1.1507816902790176e-05, + "loss": 1.662, + "step": 25618 + }, + { + "epoch": 7.8634131368938, + "grad_norm": 0.18526791036128998, + "learning_rate": 1.1504644720103885e-05, + "loss": 1.7521, + "step": 25619 + }, + { + "epoch": 7.863720073664825, + "grad_norm": 0.16588367521762848, + "learning_rate": 1.1501472917851263e-05, + "loss": 1.7238, + "step": 25620 + }, + { + "epoch": 7.8640270104358505, + "grad_norm": 0.15427199006080627, + "learning_rate": 1.1498301496063652e-05, + "loss": 1.6566, + "step": 25621 + }, + { + "epoch": 7.864333947206875, + "grad_norm": 0.1694655865430832, + "learning_rate": 1.149513045477239e-05, + "loss": 1.7446, + "step": 25622 + }, + { + "epoch": 7.8646408839779, + "grad_norm": 0.18305882811546326, + "learning_rate": 1.1491959794008823e-05, + "loss": 1.7093, + "step": 25623 + }, + { + "epoch": 7.864947820748926, + "grad_norm": 0.15975148975849152, + "learning_rate": 1.148878951380426e-05, + "loss": 1.6911, + "step": 25624 + }, + { + "epoch": 7.865254757519951, + "grad_norm": 0.18298782408237457, + "learning_rate": 1.148561961419008e-05, + "loss": 1.7188, + "step": 25625 + }, + { + "epoch": 7.865561694290976, + "grad_norm": 0.16258102655410767, + "learning_rate": 1.148245009519755e-05, + "loss": 1.6901, + "step": 25626 + }, + { + "epoch": 7.865868631062002, + "grad_norm": 0.19591568410396576, + "learning_rate": 1.1479280956858057e-05, + "loss": 1.7521, + "step": 25627 + }, + { + "epoch": 7.866175567833026, + "grad_norm": 0.15821373462677002, + "learning_rate": 1.1476112199202853e-05, + "loss": 1.6503, + "step": 25628 + }, + { + "epoch": 
7.866482504604051, + "grad_norm": 0.1531122773885727, + "learning_rate": 1.147294382226331e-05, + "loss": 1.6802, + "step": 25629 + }, + { + "epoch": 7.866789441375077, + "grad_norm": 0.2105177342891693, + "learning_rate": 1.1469775826070711e-05, + "loss": 1.7705, + "step": 25630 + }, + { + "epoch": 7.867096378146102, + "grad_norm": 0.22782234847545624, + "learning_rate": 1.1466608210656377e-05, + "loss": 1.6813, + "step": 25631 + }, + { + "epoch": 7.867403314917127, + "grad_norm": 0.1824047863483429, + "learning_rate": 1.1463440976051598e-05, + "loss": 1.7149, + "step": 25632 + }, + { + "epoch": 7.867710251688152, + "grad_norm": 0.19195812940597534, + "learning_rate": 1.1460274122287685e-05, + "loss": 1.6912, + "step": 25633 + }, + { + "epoch": 7.868017188459177, + "grad_norm": 0.22274719178676605, + "learning_rate": 1.1457107649395937e-05, + "loss": 1.8499, + "step": 25634 + }, + { + "epoch": 7.8683241252302025, + "grad_norm": 0.21217535436153412, + "learning_rate": 1.1453941557407638e-05, + "loss": 1.7345, + "step": 25635 + }, + { + "epoch": 7.868631062001228, + "grad_norm": 0.20042434334754944, + "learning_rate": 1.1450775846354078e-05, + "loss": 1.6902, + "step": 25636 + }, + { + "epoch": 7.868937998772253, + "grad_norm": 0.17045147716999054, + "learning_rate": 1.1447610516266548e-05, + "loss": 1.6641, + "step": 25637 + }, + { + "epoch": 7.8692449355432785, + "grad_norm": 0.18817269802093506, + "learning_rate": 1.1444445567176326e-05, + "loss": 1.7063, + "step": 25638 + }, + { + "epoch": 7.869551872314303, + "grad_norm": 0.1746743619441986, + "learning_rate": 1.1441280999114694e-05, + "loss": 1.6838, + "step": 25639 + }, + { + "epoch": 7.869858809085328, + "grad_norm": 0.1734321415424347, + "learning_rate": 1.1438116812112925e-05, + "loss": 1.6939, + "step": 25640 + }, + { + "epoch": 7.870165745856354, + "grad_norm": 0.1745334416627884, + "learning_rate": 1.1434953006202281e-05, + "loss": 1.71, + "step": 25641 + }, + { + "epoch": 7.870472682627379, + 
"grad_norm": 0.20883594453334808, + "learning_rate": 1.1431789581414043e-05, + "loss": 1.6941, + "step": 25642 + }, + { + "epoch": 7.870779619398404, + "grad_norm": 0.1664251685142517, + "learning_rate": 1.1428626537779447e-05, + "loss": 1.6995, + "step": 25643 + }, + { + "epoch": 7.871086556169429, + "grad_norm": 0.16561046242713928, + "learning_rate": 1.1425463875329795e-05, + "loss": 1.7093, + "step": 25644 + }, + { + "epoch": 7.871393492940454, + "grad_norm": 0.21409009397029877, + "learning_rate": 1.1422301594096297e-05, + "loss": 1.6919, + "step": 25645 + }, + { + "epoch": 7.871700429711479, + "grad_norm": 0.19574479758739471, + "learning_rate": 1.1419139694110236e-05, + "loss": 1.777, + "step": 25646 + }, + { + "epoch": 7.872007366482505, + "grad_norm": 0.15032227337360382, + "learning_rate": 1.1415978175402853e-05, + "loss": 1.6759, + "step": 25647 + }, + { + "epoch": 7.87231430325353, + "grad_norm": 0.18372420966625214, + "learning_rate": 1.1412817038005386e-05, + "loss": 1.7304, + "step": 25648 + }, + { + "epoch": 7.872621240024555, + "grad_norm": 0.16073383390903473, + "learning_rate": 1.1409656281949077e-05, + "loss": 1.6784, + "step": 25649 + }, + { + "epoch": 7.87292817679558, + "grad_norm": 0.15698374807834625, + "learning_rate": 1.1406495907265163e-05, + "loss": 1.6877, + "step": 25650 + }, + { + "epoch": 7.873235113566605, + "grad_norm": 0.18749327957630157, + "learning_rate": 1.140333591398488e-05, + "loss": 1.708, + "step": 25651 + }, + { + "epoch": 7.8735420503376305, + "grad_norm": 0.15412451326847076, + "learning_rate": 1.1400176302139448e-05, + "loss": 1.6661, + "step": 25652 + }, + { + "epoch": 7.873848987108656, + "grad_norm": 0.22467148303985596, + "learning_rate": 1.1397017071760102e-05, + "loss": 1.8204, + "step": 25653 + }, + { + "epoch": 7.87415592387968, + "grad_norm": 0.14625288546085358, + "learning_rate": 1.1393858222878063e-05, + "loss": 1.7008, + "step": 25654 + }, + { + "epoch": 7.874462860650706, + "grad_norm": 
0.14440159499645233, + "learning_rate": 1.1390699755524537e-05, + "loss": 1.652, + "step": 25655 + }, + { + "epoch": 7.874769797421731, + "grad_norm": 0.14738808572292328, + "learning_rate": 1.138754166973075e-05, + "loss": 1.6305, + "step": 25656 + }, + { + "epoch": 7.875076734192756, + "grad_norm": 0.17714212834835052, + "learning_rate": 1.1384383965527906e-05, + "loss": 1.7011, + "step": 25657 + }, + { + "epoch": 7.875383670963782, + "grad_norm": 0.17601121962070465, + "learning_rate": 1.1381226642947213e-05, + "loss": 1.7425, + "step": 25658 + }, + { + "epoch": 7.875690607734807, + "grad_norm": 0.1893182396888733, + "learning_rate": 1.1378069702019877e-05, + "loss": 1.7215, + "step": 25659 + }, + { + "epoch": 7.8759975445058314, + "grad_norm": 0.20073552429676056, + "learning_rate": 1.1374913142777077e-05, + "loss": 1.7025, + "step": 25660 + }, + { + "epoch": 7.876304481276857, + "grad_norm": 0.17025165259838104, + "learning_rate": 1.1371756965250052e-05, + "loss": 1.7046, + "step": 25661 + }, + { + "epoch": 7.876611418047882, + "grad_norm": 0.17612501978874207, + "learning_rate": 1.1368601169469933e-05, + "loss": 1.7452, + "step": 25662 + }, + { + "epoch": 7.8769183548189075, + "grad_norm": 0.2542072534561157, + "learning_rate": 1.1365445755467974e-05, + "loss": 1.765, + "step": 25663 + }, + { + "epoch": 7.877225291589933, + "grad_norm": 0.25291866064071655, + "learning_rate": 1.1362290723275293e-05, + "loss": 1.7477, + "step": 25664 + }, + { + "epoch": 7.877532228360957, + "grad_norm": 0.1848495602607727, + "learning_rate": 1.1359136072923121e-05, + "loss": 1.7278, + "step": 25665 + }, + { + "epoch": 7.877839165131983, + "grad_norm": 0.18354780972003937, + "learning_rate": 1.1355981804442605e-05, + "loss": 1.7469, + "step": 25666 + }, + { + "epoch": 7.878146101903008, + "grad_norm": 0.1843772530555725, + "learning_rate": 1.1352827917864934e-05, + "loss": 1.7654, + "step": 25667 + }, + { + "epoch": 7.878453038674033, + "grad_norm": 0.144758403301239, + 
"learning_rate": 1.1349674413221267e-05, + "loss": 1.6649, + "step": 25668 + }, + { + "epoch": 7.878759975445059, + "grad_norm": 0.15747511386871338, + "learning_rate": 1.1346521290542772e-05, + "loss": 1.6386, + "step": 25669 + }, + { + "epoch": 7.879066912216084, + "grad_norm": 0.17898736894130707, + "learning_rate": 1.134336854986061e-05, + "loss": 1.7, + "step": 25670 + }, + { + "epoch": 7.879373848987108, + "grad_norm": 0.19453589618206024, + "learning_rate": 1.1340216191205939e-05, + "loss": 1.7108, + "step": 25671 + }, + { + "epoch": 7.879680785758134, + "grad_norm": 0.17470498383045197, + "learning_rate": 1.1337064214609905e-05, + "loss": 1.7705, + "step": 25672 + }, + { + "epoch": 7.879987722529159, + "grad_norm": 0.1897793561220169, + "learning_rate": 1.1333912620103665e-05, + "loss": 1.7358, + "step": 25673 + }, + { + "epoch": 7.880294659300184, + "grad_norm": 0.1659744381904602, + "learning_rate": 1.1330761407718366e-05, + "loss": 1.724, + "step": 25674 + }, + { + "epoch": 7.88060159607121, + "grad_norm": 0.15303891897201538, + "learning_rate": 1.1327610577485148e-05, + "loss": 1.6878, + "step": 25675 + }, + { + "epoch": 7.880908532842234, + "grad_norm": 0.16346490383148193, + "learning_rate": 1.1324460129435144e-05, + "loss": 1.6544, + "step": 25676 + }, + { + "epoch": 7.8812154696132595, + "grad_norm": 0.19887791574001312, + "learning_rate": 1.1321310063599483e-05, + "loss": 1.7169, + "step": 25677 + }, + { + "epoch": 7.881522406384285, + "grad_norm": 0.1658533811569214, + "learning_rate": 1.1318160380009334e-05, + "loss": 1.6902, + "step": 25678 + }, + { + "epoch": 7.88182934315531, + "grad_norm": 0.16859948635101318, + "learning_rate": 1.131501107869577e-05, + "loss": 1.7015, + "step": 25679 + }, + { + "epoch": 7.8821362799263355, + "grad_norm": 0.20775821805000305, + "learning_rate": 1.1311862159689968e-05, + "loss": 1.7519, + "step": 25680 + }, + { + "epoch": 7.882443216697361, + "grad_norm": 0.18174295127391815, + "learning_rate": 
1.1308713623022987e-05, + "loss": 1.7161, + "step": 25681 + }, + { + "epoch": 7.882750153468385, + "grad_norm": 0.1843954473733902, + "learning_rate": 1.1305565468725993e-05, + "loss": 1.6753, + "step": 25682 + }, + { + "epoch": 7.883057090239411, + "grad_norm": 0.1856461614370346, + "learning_rate": 1.130241769683008e-05, + "loss": 1.7139, + "step": 25683 + }, + { + "epoch": 7.883364027010436, + "grad_norm": 0.15803632140159607, + "learning_rate": 1.129927030736636e-05, + "loss": 1.6705, + "step": 25684 + }, + { + "epoch": 7.883670963781461, + "grad_norm": 0.1680101901292801, + "learning_rate": 1.1296123300365947e-05, + "loss": 1.6757, + "step": 25685 + }, + { + "epoch": 7.883977900552486, + "grad_norm": 0.157195046544075, + "learning_rate": 1.1292976675859895e-05, + "loss": 1.6922, + "step": 25686 + }, + { + "epoch": 7.884284837323511, + "grad_norm": 0.17270046472549438, + "learning_rate": 1.1289830433879356e-05, + "loss": 1.6909, + "step": 25687 + }, + { + "epoch": 7.884591774094536, + "grad_norm": 0.1880030781030655, + "learning_rate": 1.1286684574455398e-05, + "loss": 1.7139, + "step": 25688 + }, + { + "epoch": 7.884898710865562, + "grad_norm": 0.1882653832435608, + "learning_rate": 1.1283539097619112e-05, + "loss": 1.7464, + "step": 25689 + }, + { + "epoch": 7.885205647636587, + "grad_norm": 0.2060890644788742, + "learning_rate": 1.128039400340159e-05, + "loss": 1.6749, + "step": 25690 + }, + { + "epoch": 7.885512584407612, + "grad_norm": 0.20780493319034576, + "learning_rate": 1.1277249291833903e-05, + "loss": 1.7581, + "step": 25691 + }, + { + "epoch": 7.885819521178637, + "grad_norm": 0.1929686814546585, + "learning_rate": 1.1274104962947135e-05, + "loss": 1.6962, + "step": 25692 + }, + { + "epoch": 7.886126457949662, + "grad_norm": 0.21474432945251465, + "learning_rate": 1.1270961016772363e-05, + "loss": 1.6984, + "step": 25693 + }, + { + "epoch": 7.8864333947206875, + "grad_norm": 0.17453257739543915, + "learning_rate": 1.126781745334064e-05, + "loss": 
1.679, + "step": 25694 + }, + { + "epoch": 7.886740331491713, + "grad_norm": 0.21506772935390472, + "learning_rate": 1.1264674272683073e-05, + "loss": 1.7209, + "step": 25695 + }, + { + "epoch": 7.887047268262738, + "grad_norm": 0.2470129430294037, + "learning_rate": 1.1261531474830672e-05, + "loss": 1.7183, + "step": 25696 + }, + { + "epoch": 7.887354205033763, + "grad_norm": 0.2026570737361908, + "learning_rate": 1.1258389059814545e-05, + "loss": 1.6579, + "step": 25697 + }, + { + "epoch": 7.887661141804788, + "grad_norm": 0.18859948217868805, + "learning_rate": 1.1255247027665699e-05, + "loss": 1.6831, + "step": 25698 + }, + { + "epoch": 7.887968078575813, + "grad_norm": 0.2106257677078247, + "learning_rate": 1.1252105378415229e-05, + "loss": 1.724, + "step": 25699 + }, + { + "epoch": 7.888275015346839, + "grad_norm": 0.17260697484016418, + "learning_rate": 1.1248964112094162e-05, + "loss": 1.6875, + "step": 25700 + }, + { + "epoch": 7.888581952117864, + "grad_norm": 0.20596550405025482, + "learning_rate": 1.1245823228733542e-05, + "loss": 1.7569, + "step": 25701 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 0.1724967509508133, + "learning_rate": 1.1242682728364428e-05, + "loss": 1.7063, + "step": 25702 + }, + { + "epoch": 7.889195825659914, + "grad_norm": 0.2189379185438156, + "learning_rate": 1.123954261101781e-05, + "loss": 1.789, + "step": 25703 + }, + { + "epoch": 7.889502762430939, + "grad_norm": 0.1539442539215088, + "learning_rate": 1.1236402876724766e-05, + "loss": 1.6573, + "step": 25704 + }, + { + "epoch": 7.889809699201964, + "grad_norm": 0.2854970693588257, + "learning_rate": 1.1233263525516313e-05, + "loss": 1.7683, + "step": 25705 + }, + { + "epoch": 7.89011663597299, + "grad_norm": 0.18263237178325653, + "learning_rate": 1.1230124557423465e-05, + "loss": 1.6911, + "step": 25706 + }, + { + "epoch": 7.890423572744015, + "grad_norm": 0.2098342627286911, + "learning_rate": 1.122698597247725e-05, + "loss": 1.7306, + "step": 25707 + }, + { + 
"epoch": 7.8907305095150395, + "grad_norm": 0.20822781324386597, + "learning_rate": 1.122384777070869e-05, + "loss": 1.7777, + "step": 25708 + }, + { + "epoch": 7.891037446286065, + "grad_norm": 0.24466483294963837, + "learning_rate": 1.122070995214879e-05, + "loss": 1.6966, + "step": 25709 + }, + { + "epoch": 7.89134438305709, + "grad_norm": 0.1500372439622879, + "learning_rate": 1.1217572516828561e-05, + "loss": 1.6787, + "step": 25710 + }, + { + "epoch": 7.8916513198281155, + "grad_norm": 0.2238166481256485, + "learning_rate": 1.1214435464779006e-05, + "loss": 1.7957, + "step": 25711 + }, + { + "epoch": 7.891958256599141, + "grad_norm": 0.22993433475494385, + "learning_rate": 1.1211298796031156e-05, + "loss": 1.7142, + "step": 25712 + }, + { + "epoch": 7.892265193370166, + "grad_norm": 0.15912945568561554, + "learning_rate": 1.1208162510615955e-05, + "loss": 1.7188, + "step": 25713 + }, + { + "epoch": 7.892572130141191, + "grad_norm": 0.2096986174583435, + "learning_rate": 1.1205026608564461e-05, + "loss": 1.7409, + "step": 25714 + }, + { + "epoch": 7.892879066912216, + "grad_norm": 0.18928684294223785, + "learning_rate": 1.1201891089907601e-05, + "loss": 1.6703, + "step": 25715 + }, + { + "epoch": 7.893186003683241, + "grad_norm": 0.19096077978610992, + "learning_rate": 1.119875595467641e-05, + "loss": 1.7393, + "step": 25716 + }, + { + "epoch": 7.893492940454267, + "grad_norm": 0.2286420315504074, + "learning_rate": 1.1195621202901851e-05, + "loss": 1.6995, + "step": 25717 + }, + { + "epoch": 7.893799877225292, + "grad_norm": 0.16288414597511292, + "learning_rate": 1.1192486834614912e-05, + "loss": 1.7334, + "step": 25718 + }, + { + "epoch": 7.894106813996316, + "grad_norm": 0.17358547449111938, + "learning_rate": 1.118935284984658e-05, + "loss": 1.7114, + "step": 25719 + }, + { + "epoch": 7.894413750767342, + "grad_norm": 0.16833151876926422, + "learning_rate": 1.1186219248627777e-05, + "loss": 1.6998, + "step": 25720 + }, + { + "epoch": 7.894720687538367, + 
"grad_norm": 0.14409767091274261, + "learning_rate": 1.118308603098952e-05, + "loss": 1.713, + "step": 25721 + }, + { + "epoch": 7.895027624309392, + "grad_norm": 0.18832024931907654, + "learning_rate": 1.1179953196962761e-05, + "loss": 1.6862, + "step": 25722 + }, + { + "epoch": 7.895334561080418, + "grad_norm": 0.1837761402130127, + "learning_rate": 1.1176820746578454e-05, + "loss": 1.6674, + "step": 25723 + }, + { + "epoch": 7.895641497851443, + "grad_norm": 0.14717474579811096, + "learning_rate": 1.1173688679867561e-05, + "loss": 1.6619, + "step": 25724 + }, + { + "epoch": 7.8959484346224675, + "grad_norm": 0.13512545824050903, + "learning_rate": 1.1170556996861032e-05, + "loss": 1.664, + "step": 25725 + }, + { + "epoch": 7.896255371393493, + "grad_norm": 0.21533837914466858, + "learning_rate": 1.1167425697589817e-05, + "loss": 1.7205, + "step": 25726 + }, + { + "epoch": 7.896562308164518, + "grad_norm": 0.15241803228855133, + "learning_rate": 1.1164294782084866e-05, + "loss": 1.6838, + "step": 25727 + }, + { + "epoch": 7.8968692449355435, + "grad_norm": 0.14889933168888092, + "learning_rate": 1.1161164250377099e-05, + "loss": 1.7197, + "step": 25728 + }, + { + "epoch": 7.897176181706568, + "grad_norm": 0.15948614478111267, + "learning_rate": 1.11580341024975e-05, + "loss": 1.6948, + "step": 25729 + }, + { + "epoch": 7.897483118477593, + "grad_norm": 0.17862235009670258, + "learning_rate": 1.1154904338476946e-05, + "loss": 1.743, + "step": 25730 + }, + { + "epoch": 7.897790055248619, + "grad_norm": 0.18168844282627106, + "learning_rate": 1.1151774958346422e-05, + "loss": 1.7291, + "step": 25731 + }, + { + "epoch": 7.898096992019644, + "grad_norm": 0.17636772990226746, + "learning_rate": 1.11486459621368e-05, + "loss": 1.7428, + "step": 25732 + }, + { + "epoch": 7.898403928790669, + "grad_norm": 0.1677904576063156, + "learning_rate": 1.1145517349879048e-05, + "loss": 1.7026, + "step": 25733 + }, + { + "epoch": 7.898710865561695, + "grad_norm": 
0.1851150244474411, + "learning_rate": 1.1142389121604063e-05, + "loss": 1.7743, + "step": 25734 + }, + { + "epoch": 7.899017802332719, + "grad_norm": 0.19713786244392395, + "learning_rate": 1.1139261277342767e-05, + "loss": 1.7287, + "step": 25735 + }, + { + "epoch": 7.899324739103744, + "grad_norm": 0.2060006707906723, + "learning_rate": 1.1136133817126076e-05, + "loss": 1.7377, + "step": 25736 + }, + { + "epoch": 7.89963167587477, + "grad_norm": 0.18026013672351837, + "learning_rate": 1.1133006740984864e-05, + "loss": 1.7322, + "step": 25737 + }, + { + "epoch": 7.899938612645795, + "grad_norm": 0.1787644922733307, + "learning_rate": 1.1129880048950075e-05, + "loss": 1.7457, + "step": 25738 + }, + { + "epoch": 7.9002455494168204, + "grad_norm": 0.16092467308044434, + "learning_rate": 1.1126753741052593e-05, + "loss": 1.7451, + "step": 25739 + }, + { + "epoch": 7.900552486187845, + "grad_norm": 0.15322941541671753, + "learning_rate": 1.1123627817323318e-05, + "loss": 1.667, + "step": 25740 + }, + { + "epoch": 7.90085942295887, + "grad_norm": 0.1488087922334671, + "learning_rate": 1.1120502277793137e-05, + "loss": 1.684, + "step": 25741 + }, + { + "epoch": 7.901166359729896, + "grad_norm": 0.15332907438278198, + "learning_rate": 1.111737712249294e-05, + "loss": 1.6646, + "step": 25742 + }, + { + "epoch": 7.901473296500921, + "grad_norm": 0.19801980257034302, + "learning_rate": 1.1114252351453614e-05, + "loss": 1.7469, + "step": 25743 + }, + { + "epoch": 7.901780233271946, + "grad_norm": 0.17123407125473022, + "learning_rate": 1.1111127964706035e-05, + "loss": 1.7319, + "step": 25744 + }, + { + "epoch": 7.902087170042972, + "grad_norm": 0.1753319650888443, + "learning_rate": 1.1108003962281066e-05, + "loss": 1.7212, + "step": 25745 + }, + { + "epoch": 7.902394106813996, + "grad_norm": 0.1598043441772461, + "learning_rate": 1.1104880344209634e-05, + "loss": 1.6823, + "step": 25746 + }, + { + "epoch": 7.902701043585021, + "grad_norm": 0.14227038621902466, + 
"learning_rate": 1.1101757110522538e-05, + "loss": 1.6665, + "step": 25747 + }, + { + "epoch": 7.903007980356047, + "grad_norm": 0.1531791388988495, + "learning_rate": 1.1098634261250706e-05, + "loss": 1.717, + "step": 25748 + }, + { + "epoch": 7.903314917127072, + "grad_norm": 0.18077540397644043, + "learning_rate": 1.109551179642494e-05, + "loss": 1.7237, + "step": 25749 + }, + { + "epoch": 7.903621853898097, + "grad_norm": 0.22373250126838684, + "learning_rate": 1.1092389716076145e-05, + "loss": 1.7678, + "step": 25750 + }, + { + "epoch": 7.903928790669122, + "grad_norm": 0.16022193431854248, + "learning_rate": 1.1089268020235166e-05, + "loss": 1.6985, + "step": 25751 + }, + { + "epoch": 7.904235727440147, + "grad_norm": 0.17306078970432281, + "learning_rate": 1.1086146708932837e-05, + "loss": 1.6653, + "step": 25752 + }, + { + "epoch": 7.9045426642111725, + "grad_norm": 0.16284874081611633, + "learning_rate": 1.1083025782200035e-05, + "loss": 1.6762, + "step": 25753 + }, + { + "epoch": 7.904849600982198, + "grad_norm": 0.17309556901454926, + "learning_rate": 1.107990524006755e-05, + "loss": 1.7103, + "step": 25754 + }, + { + "epoch": 7.905156537753223, + "grad_norm": 0.1508374810218811, + "learning_rate": 1.107678508256627e-05, + "loss": 1.6932, + "step": 25755 + }, + { + "epoch": 7.9054634745242485, + "grad_norm": 0.1941400021314621, + "learning_rate": 1.1073665309727016e-05, + "loss": 1.7922, + "step": 25756 + }, + { + "epoch": 7.905770411295273, + "grad_norm": 0.1890190988779068, + "learning_rate": 1.107054592158061e-05, + "loss": 1.6765, + "step": 25757 + }, + { + "epoch": 7.906077348066298, + "grad_norm": 0.19425363838672638, + "learning_rate": 1.1067426918157892e-05, + "loss": 1.7284, + "step": 25758 + }, + { + "epoch": 7.906384284837324, + "grad_norm": 0.18147888779640198, + "learning_rate": 1.1064308299489678e-05, + "loss": 1.7099, + "step": 25759 + }, + { + "epoch": 7.906691221608349, + "grad_norm": 0.19644278287887573, + "learning_rate": 
1.106119006560679e-05, + "loss": 1.7691, + "step": 25760 + }, + { + "epoch": 7.906998158379373, + "grad_norm": 0.14809735119342804, + "learning_rate": 1.1058072216540045e-05, + "loss": 1.6735, + "step": 25761 + }, + { + "epoch": 7.907305095150399, + "grad_norm": 0.17835088074207306, + "learning_rate": 1.105495475232024e-05, + "loss": 1.6928, + "step": 25762 + }, + { + "epoch": 7.907612031921424, + "grad_norm": 0.18341144919395447, + "learning_rate": 1.1051837672978227e-05, + "loss": 1.7393, + "step": 25763 + }, + { + "epoch": 7.907918968692449, + "grad_norm": 0.2026391327381134, + "learning_rate": 1.1048720978544753e-05, + "loss": 1.7037, + "step": 25764 + }, + { + "epoch": 7.908225905463475, + "grad_norm": 0.19855152070522308, + "learning_rate": 1.104560466905068e-05, + "loss": 1.7341, + "step": 25765 + }, + { + "epoch": 7.9085328422345, + "grad_norm": 0.18974080681800842, + "learning_rate": 1.1042488744526741e-05, + "loss": 1.6717, + "step": 25766 + }, + { + "epoch": 7.9088397790055245, + "grad_norm": 0.1727920025587082, + "learning_rate": 1.1039373205003784e-05, + "loss": 1.6994, + "step": 25767 + }, + { + "epoch": 7.90914671577655, + "grad_norm": 0.20549818873405457, + "learning_rate": 1.1036258050512566e-05, + "loss": 1.7055, + "step": 25768 + }, + { + "epoch": 7.909453652547575, + "grad_norm": 0.15696507692337036, + "learning_rate": 1.1033143281083891e-05, + "loss": 1.678, + "step": 25769 + }, + { + "epoch": 7.9097605893186005, + "grad_norm": 0.1568988859653473, + "learning_rate": 1.1030028896748546e-05, + "loss": 1.6855, + "step": 25770 + }, + { + "epoch": 7.910067526089626, + "grad_norm": 0.17795592546463013, + "learning_rate": 1.1026914897537266e-05, + "loss": 1.7306, + "step": 25771 + }, + { + "epoch": 7.91037446286065, + "grad_norm": 0.19906511902809143, + "learning_rate": 1.1023801283480872e-05, + "loss": 1.7125, + "step": 25772 + }, + { + "epoch": 7.910681399631676, + "grad_norm": 0.16972185671329498, + "learning_rate": 1.1020688054610118e-05, + 
"loss": 1.714, + "step": 25773 + }, + { + "epoch": 7.910988336402701, + "grad_norm": 0.20585502684116364, + "learning_rate": 1.1017575210955772e-05, + "loss": 1.7342, + "step": 25774 + }, + { + "epoch": 7.911295273173726, + "grad_norm": 0.1772177368402481, + "learning_rate": 1.1014462752548592e-05, + "loss": 1.7091, + "step": 25775 + }, + { + "epoch": 7.911602209944752, + "grad_norm": 0.1818380057811737, + "learning_rate": 1.1011350679419341e-05, + "loss": 1.7131, + "step": 25776 + }, + { + "epoch": 7.911909146715777, + "grad_norm": 0.17451459169387817, + "learning_rate": 1.1008238991598779e-05, + "loss": 1.6633, + "step": 25777 + }, + { + "epoch": 7.912216083486801, + "grad_norm": 0.18837687373161316, + "learning_rate": 1.100512768911765e-05, + "loss": 1.7132, + "step": 25778 + }, + { + "epoch": 7.912523020257827, + "grad_norm": 0.15283817052841187, + "learning_rate": 1.1002016772006695e-05, + "loss": 1.6833, + "step": 25779 + }, + { + "epoch": 7.912829957028852, + "grad_norm": 0.15264299511909485, + "learning_rate": 1.0998906240296692e-05, + "loss": 1.7098, + "step": 25780 + }, + { + "epoch": 7.913136893799877, + "grad_norm": 0.18866822123527527, + "learning_rate": 1.099579609401833e-05, + "loss": 1.7173, + "step": 25781 + }, + { + "epoch": 7.913443830570903, + "grad_norm": 0.19261083006858826, + "learning_rate": 1.0992686333202401e-05, + "loss": 1.7269, + "step": 25782 + }, + { + "epoch": 7.913750767341927, + "grad_norm": 0.19681799411773682, + "learning_rate": 1.0989576957879577e-05, + "loss": 1.6594, + "step": 25783 + }, + { + "epoch": 7.9140577041129525, + "grad_norm": 0.21298938989639282, + "learning_rate": 1.0986467968080639e-05, + "loss": 1.8509, + "step": 25784 + }, + { + "epoch": 7.914364640883978, + "grad_norm": 0.17769277095794678, + "learning_rate": 1.0983359363836287e-05, + "loss": 1.7177, + "step": 25785 + }, + { + "epoch": 7.914671577655003, + "grad_norm": 0.19831274449825287, + "learning_rate": 1.0980251145177246e-05, + "loss": 1.7107, + "step": 
25786 + }, + { + "epoch": 7.9149785144260285, + "grad_norm": 0.16204139590263367, + "learning_rate": 1.0977143312134248e-05, + "loss": 1.7052, + "step": 25787 + }, + { + "epoch": 7.915285451197054, + "grad_norm": 0.1709459275007248, + "learning_rate": 1.0974035864737958e-05, + "loss": 1.6944, + "step": 25788 + }, + { + "epoch": 7.915592387968078, + "grad_norm": 0.17710284888744354, + "learning_rate": 1.0970928803019142e-05, + "loss": 1.7253, + "step": 25789 + }, + { + "epoch": 7.915899324739104, + "grad_norm": 0.17316623032093048, + "learning_rate": 1.0967822127008481e-05, + "loss": 1.6458, + "step": 25790 + }, + { + "epoch": 7.916206261510129, + "grad_norm": 0.15644441545009613, + "learning_rate": 1.0964715836736677e-05, + "loss": 1.6749, + "step": 25791 + }, + { + "epoch": 7.916513198281154, + "grad_norm": 0.1425870954990387, + "learning_rate": 1.096160993223443e-05, + "loss": 1.7283, + "step": 25792 + }, + { + "epoch": 7.91682013505218, + "grad_norm": 0.1724596619606018, + "learning_rate": 1.0958504413532438e-05, + "loss": 1.7152, + "step": 25793 + }, + { + "epoch": 7.917127071823204, + "grad_norm": 0.20472319424152374, + "learning_rate": 1.0955399280661383e-05, + "loss": 1.7818, + "step": 25794 + }, + { + "epoch": 7.917434008594229, + "grad_norm": 0.18012158572673798, + "learning_rate": 1.0952294533651963e-05, + "loss": 1.6995, + "step": 25795 + }, + { + "epoch": 7.917740945365255, + "grad_norm": 0.1460564136505127, + "learning_rate": 1.0949190172534851e-05, + "loss": 1.6752, + "step": 25796 + }, + { + "epoch": 7.91804788213628, + "grad_norm": 0.16467545926570892, + "learning_rate": 1.0946086197340733e-05, + "loss": 1.7, + "step": 25797 + }, + { + "epoch": 7.918354818907305, + "grad_norm": 0.20123273134231567, + "learning_rate": 1.0942982608100266e-05, + "loss": 1.7423, + "step": 25798 + }, + { + "epoch": 7.918661755678331, + "grad_norm": 0.160671204328537, + "learning_rate": 1.0939879404844167e-05, + "loss": 1.6992, + "step": 25799 + }, + { + "epoch": 
7.918968692449355, + "grad_norm": 0.18679293990135193, + "learning_rate": 1.0936776587603043e-05, + "loss": 1.7789, + "step": 25800 + }, + { + "epoch": 7.9192756292203805, + "grad_norm": 0.1598452925682068, + "learning_rate": 1.0933674156407602e-05, + "loss": 1.6961, + "step": 25801 + }, + { + "epoch": 7.919582565991406, + "grad_norm": 0.13918142020702362, + "learning_rate": 1.0930572111288506e-05, + "loss": 1.6727, + "step": 25802 + }, + { + "epoch": 7.919889502762431, + "grad_norm": 0.16652320325374603, + "learning_rate": 1.0927470452276367e-05, + "loss": 1.7135, + "step": 25803 + }, + { + "epoch": 7.920196439533456, + "grad_norm": 0.1637706309556961, + "learning_rate": 1.0924369179401893e-05, + "loss": 1.7078, + "step": 25804 + }, + { + "epoch": 7.920503376304481, + "grad_norm": 0.19709086418151855, + "learning_rate": 1.092126829269568e-05, + "loss": 1.7425, + "step": 25805 + }, + { + "epoch": 7.920810313075506, + "grad_norm": 0.13402192294597626, + "learning_rate": 1.091816779218841e-05, + "loss": 1.663, + "step": 25806 + }, + { + "epoch": 7.921117249846532, + "grad_norm": 0.18932323157787323, + "learning_rate": 1.0915067677910718e-05, + "loss": 1.7651, + "step": 25807 + }, + { + "epoch": 7.921424186617557, + "grad_norm": 0.1586374193429947, + "learning_rate": 1.0911967949893231e-05, + "loss": 1.6709, + "step": 25808 + }, + { + "epoch": 7.921731123388582, + "grad_norm": 0.1570933312177658, + "learning_rate": 1.0908868608166589e-05, + "loss": 1.7166, + "step": 25809 + }, + { + "epoch": 7.922038060159607, + "grad_norm": 0.19786952435970306, + "learning_rate": 1.0905769652761416e-05, + "loss": 1.7347, + "step": 25810 + }, + { + "epoch": 7.922344996930632, + "grad_norm": 0.14969857037067413, + "learning_rate": 1.0902671083708343e-05, + "loss": 1.6471, + "step": 25811 + }, + { + "epoch": 7.922651933701657, + "grad_norm": 0.17460933327674866, + "learning_rate": 1.089957290103799e-05, + "loss": 1.7594, + "step": 25812 + }, + { + "epoch": 7.922958870472683, + 
"grad_norm": 0.17380566895008087, + "learning_rate": 1.0896475104780974e-05, + "loss": 1.6721, + "step": 25813 + }, + { + "epoch": 7.923265807243708, + "grad_norm": 0.1599249392747879, + "learning_rate": 1.0893377694967916e-05, + "loss": 1.6842, + "step": 25814 + }, + { + "epoch": 7.9235727440147325, + "grad_norm": 0.15319927036762238, + "learning_rate": 1.0890280671629398e-05, + "loss": 1.6529, + "step": 25815 + }, + { + "epoch": 7.923879680785758, + "grad_norm": 0.20122043788433075, + "learning_rate": 1.0887184034796082e-05, + "loss": 1.8009, + "step": 25816 + }, + { + "epoch": 7.924186617556783, + "grad_norm": 0.1726430058479309, + "learning_rate": 1.0884087784498515e-05, + "loss": 1.7595, + "step": 25817 + }, + { + "epoch": 7.9244935543278086, + "grad_norm": 0.1657346487045288, + "learning_rate": 1.0880991920767336e-05, + "loss": 1.7051, + "step": 25818 + }, + { + "epoch": 7.924800491098834, + "grad_norm": 0.19500960409641266, + "learning_rate": 1.0877896443633117e-05, + "loss": 1.6809, + "step": 25819 + }, + { + "epoch": 7.925107427869859, + "grad_norm": 0.18751180171966553, + "learning_rate": 1.087480135312644e-05, + "loss": 1.7613, + "step": 25820 + }, + { + "epoch": 7.925414364640884, + "grad_norm": 0.20735877752304077, + "learning_rate": 1.0871706649277935e-05, + "loss": 1.7515, + "step": 25821 + }, + { + "epoch": 7.925721301411909, + "grad_norm": 0.19349408149719238, + "learning_rate": 1.0868612332118133e-05, + "loss": 1.7053, + "step": 25822 + }, + { + "epoch": 7.926028238182934, + "grad_norm": 0.15639854967594147, + "learning_rate": 1.0865518401677649e-05, + "loss": 1.6907, + "step": 25823 + }, + { + "epoch": 7.92633517495396, + "grad_norm": 0.18366692960262299, + "learning_rate": 1.0862424857987059e-05, + "loss": 1.6791, + "step": 25824 + }, + { + "epoch": 7.926642111724985, + "grad_norm": 0.1648077666759491, + "learning_rate": 1.0859331701076913e-05, + "loss": 1.6671, + "step": 25825 + }, + { + "epoch": 7.9269490484960095, + "grad_norm": 
0.17894984781742096, + "learning_rate": 1.0856238930977802e-05, + "loss": 1.736, + "step": 25826 + }, + { + "epoch": 7.927255985267035, + "grad_norm": 0.13542817533016205, + "learning_rate": 1.0853146547720278e-05, + "loss": 1.6613, + "step": 25827 + }, + { + "epoch": 7.92756292203806, + "grad_norm": 0.1598762571811676, + "learning_rate": 1.0850054551334905e-05, + "loss": 1.6828, + "step": 25828 + }, + { + "epoch": 7.9278698588090855, + "grad_norm": 0.19212616980075836, + "learning_rate": 1.0846962941852235e-05, + "loss": 1.8198, + "step": 25829 + }, + { + "epoch": 7.928176795580111, + "grad_norm": 0.19344113767147064, + "learning_rate": 1.0843871719302829e-05, + "loss": 1.7804, + "step": 25830 + }, + { + "epoch": 7.928483732351136, + "grad_norm": 0.15460920333862305, + "learning_rate": 1.0840780883717233e-05, + "loss": 1.7372, + "step": 25831 + }, + { + "epoch": 7.928790669122161, + "grad_norm": 0.19987867772579193, + "learning_rate": 1.083769043512598e-05, + "loss": 1.6923, + "step": 25832 + }, + { + "epoch": 7.929097605893186, + "grad_norm": 0.15390315651893616, + "learning_rate": 1.083460037355965e-05, + "loss": 1.6864, + "step": 25833 + }, + { + "epoch": 7.929404542664211, + "grad_norm": 0.18596698343753815, + "learning_rate": 1.0831510699048724e-05, + "loss": 1.7135, + "step": 25834 + }, + { + "epoch": 7.929711479435237, + "grad_norm": 0.172935351729393, + "learning_rate": 1.0828421411623796e-05, + "loss": 1.7426, + "step": 25835 + }, + { + "epoch": 7.930018416206261, + "grad_norm": 0.2046828418970108, + "learning_rate": 1.0825332511315356e-05, + "loss": 1.7178, + "step": 25836 + }, + { + "epoch": 7.930325352977286, + "grad_norm": 0.1382901519536972, + "learning_rate": 1.0822243998153925e-05, + "loss": 1.6811, + "step": 25837 + }, + { + "epoch": 7.930632289748312, + "grad_norm": 0.1675405353307724, + "learning_rate": 1.0819155872170068e-05, + "loss": 1.7278, + "step": 25838 + }, + { + "epoch": 7.930939226519337, + "grad_norm": 0.16732639074325562, + 
"learning_rate": 1.0816068133394252e-05, + "loss": 1.6847, + "step": 25839 + }, + { + "epoch": 7.931246163290362, + "grad_norm": 0.17154982686042786, + "learning_rate": 1.0812980781857047e-05, + "loss": 1.7411, + "step": 25840 + }, + { + "epoch": 7.931553100061388, + "grad_norm": 0.16475310921669006, + "learning_rate": 1.08098938175889e-05, + "loss": 1.7222, + "step": 25841 + }, + { + "epoch": 7.931860036832412, + "grad_norm": 0.1613023579120636, + "learning_rate": 1.080680724062037e-05, + "loss": 1.718, + "step": 25842 + }, + { + "epoch": 7.9321669736034375, + "grad_norm": 0.16330939531326294, + "learning_rate": 1.0803721050981941e-05, + "loss": 1.7087, + "step": 25843 + }, + { + "epoch": 7.932473910374463, + "grad_norm": 0.15881259739398956, + "learning_rate": 1.0800635248704117e-05, + "loss": 1.7309, + "step": 25844 + }, + { + "epoch": 7.932780847145488, + "grad_norm": 0.19191724061965942, + "learning_rate": 1.0797549833817389e-05, + "loss": 1.7131, + "step": 25845 + }, + { + "epoch": 7.9330877839165135, + "grad_norm": 0.17083698511123657, + "learning_rate": 1.079446480635225e-05, + "loss": 1.7117, + "step": 25846 + }, + { + "epoch": 7.933394720687538, + "grad_norm": 0.18097929656505585, + "learning_rate": 1.0791380166339193e-05, + "loss": 1.7017, + "step": 25847 + }, + { + "epoch": 7.933701657458563, + "grad_norm": 0.1556827276945114, + "learning_rate": 1.0788295913808694e-05, + "loss": 1.7589, + "step": 25848 + }, + { + "epoch": 7.934008594229589, + "grad_norm": 0.1667819619178772, + "learning_rate": 1.0785212048791226e-05, + "loss": 1.6735, + "step": 25849 + }, + { + "epoch": 7.934315531000614, + "grad_norm": 0.18772241473197937, + "learning_rate": 1.0782128571317302e-05, + "loss": 1.6984, + "step": 25850 + }, + { + "epoch": 7.934622467771639, + "grad_norm": 0.1752445250749588, + "learning_rate": 1.0779045481417343e-05, + "loss": 1.6662, + "step": 25851 + }, + { + "epoch": 7.934929404542665, + "grad_norm": 0.16619165241718292, + "learning_rate": 
1.0775962779121873e-05, + "loss": 1.765, + "step": 25852 + }, + { + "epoch": 7.935236341313689, + "grad_norm": 0.1685585081577301, + "learning_rate": 1.0772880464461316e-05, + "loss": 1.6692, + "step": 25853 + }, + { + "epoch": 7.935543278084714, + "grad_norm": 0.16806848347187042, + "learning_rate": 1.076979853746613e-05, + "loss": 1.7081, + "step": 25854 + }, + { + "epoch": 7.93585021485574, + "grad_norm": 0.14273032546043396, + "learning_rate": 1.076671699816682e-05, + "loss": 1.6668, + "step": 25855 + }, + { + "epoch": 7.936157151626765, + "grad_norm": 0.24727863073349, + "learning_rate": 1.0763635846593778e-05, + "loss": 1.7624, + "step": 25856 + }, + { + "epoch": 7.93646408839779, + "grad_norm": 0.15679748356342316, + "learning_rate": 1.0760555082777506e-05, + "loss": 1.6851, + "step": 25857 + }, + { + "epoch": 7.936771025168815, + "grad_norm": 0.23388828337192535, + "learning_rate": 1.075747470674841e-05, + "loss": 1.7557, + "step": 25858 + }, + { + "epoch": 7.93707796193984, + "grad_norm": 0.15266747772693634, + "learning_rate": 1.0754394718536958e-05, + "loss": 1.6559, + "step": 25859 + }, + { + "epoch": 7.9373848987108655, + "grad_norm": 0.1945476084947586, + "learning_rate": 1.0751315118173577e-05, + "loss": 1.745, + "step": 25860 + }, + { + "epoch": 7.937691835481891, + "grad_norm": 0.18018878996372223, + "learning_rate": 1.0748235905688709e-05, + "loss": 1.7016, + "step": 25861 + }, + { + "epoch": 7.937998772252916, + "grad_norm": 0.1748870611190796, + "learning_rate": 1.0745157081112777e-05, + "loss": 1.6989, + "step": 25862 + }, + { + "epoch": 7.9383057090239415, + "grad_norm": 0.18253664672374725, + "learning_rate": 1.0742078644476217e-05, + "loss": 1.7554, + "step": 25863 + }, + { + "epoch": 7.938612645794966, + "grad_norm": 0.17009632289409637, + "learning_rate": 1.073900059580944e-05, + "loss": 1.7244, + "step": 25864 + }, + { + "epoch": 7.938919582565991, + "grad_norm": 0.17612707614898682, + "learning_rate": 1.0735922935142873e-05, + "loss": 
1.6939, + "step": 25865 + }, + { + "epoch": 7.939226519337017, + "grad_norm": 0.21207575500011444, + "learning_rate": 1.0732845662506913e-05, + "loss": 1.7097, + "step": 25866 + }, + { + "epoch": 7.939533456108042, + "grad_norm": 0.2073012739419937, + "learning_rate": 1.0729768777932014e-05, + "loss": 1.7658, + "step": 25867 + }, + { + "epoch": 7.939840392879067, + "grad_norm": 0.18888477981090546, + "learning_rate": 1.072669228144853e-05, + "loss": 1.7496, + "step": 25868 + }, + { + "epoch": 7.940147329650092, + "grad_norm": 0.1822361946105957, + "learning_rate": 1.0723616173086926e-05, + "loss": 1.7344, + "step": 25869 + }, + { + "epoch": 7.940454266421117, + "grad_norm": 0.18642890453338623, + "learning_rate": 1.0720540452877547e-05, + "loss": 1.7135, + "step": 25870 + }, + { + "epoch": 7.940761203192142, + "grad_norm": 0.19198815524578094, + "learning_rate": 1.0717465120850795e-05, + "loss": 1.7128, + "step": 25871 + }, + { + "epoch": 7.941068139963168, + "grad_norm": 0.1886969953775406, + "learning_rate": 1.0714390177037109e-05, + "loss": 1.7161, + "step": 25872 + }, + { + "epoch": 7.941375076734193, + "grad_norm": 0.19693820178508759, + "learning_rate": 1.0711315621466816e-05, + "loss": 1.7086, + "step": 25873 + }, + { + "epoch": 7.941682013505218, + "grad_norm": 0.19052870571613312, + "learning_rate": 1.0708241454170353e-05, + "loss": 1.7274, + "step": 25874 + }, + { + "epoch": 7.941988950276243, + "grad_norm": 0.23586300015449524, + "learning_rate": 1.0705167675178057e-05, + "loss": 1.7169, + "step": 25875 + }, + { + "epoch": 7.942295887047268, + "grad_norm": 0.2077670842409134, + "learning_rate": 1.0702094284520336e-05, + "loss": 1.7573, + "step": 25876 + }, + { + "epoch": 7.9426028238182935, + "grad_norm": 0.20345431566238403, + "learning_rate": 1.069902128222755e-05, + "loss": 1.6821, + "step": 25877 + }, + { + "epoch": 7.942909760589319, + "grad_norm": 0.1869240552186966, + "learning_rate": 1.0695948668330075e-05, + "loss": 1.6978, + "step": 25878 + }, 
+ { + "epoch": 7.943216697360343, + "grad_norm": 0.17814506590366364, + "learning_rate": 1.0692876442858274e-05, + "loss": 1.7027, + "step": 25879 + }, + { + "epoch": 7.943523634131369, + "grad_norm": 0.19093535840511322, + "learning_rate": 1.0689804605842502e-05, + "loss": 1.7863, + "step": 25880 + }, + { + "epoch": 7.943830570902394, + "grad_norm": 0.17859873175621033, + "learning_rate": 1.0686733157313123e-05, + "loss": 1.7431, + "step": 25881 + }, + { + "epoch": 7.944137507673419, + "grad_norm": 0.16613568365573883, + "learning_rate": 1.0683662097300484e-05, + "loss": 1.7517, + "step": 25882 + }, + { + "epoch": 7.944444444444445, + "grad_norm": 0.1588357836008072, + "learning_rate": 1.0680591425834934e-05, + "loss": 1.7017, + "step": 25883 + }, + { + "epoch": 7.94475138121547, + "grad_norm": 0.1667826622724533, + "learning_rate": 1.067752114294685e-05, + "loss": 1.6965, + "step": 25884 + }, + { + "epoch": 7.945058317986494, + "grad_norm": 0.2015296071767807, + "learning_rate": 1.0674451248666522e-05, + "loss": 1.7625, + "step": 25885 + }, + { + "epoch": 7.94536525475752, + "grad_norm": 0.17073483765125275, + "learning_rate": 1.0671381743024344e-05, + "loss": 1.7194, + "step": 25886 + }, + { + "epoch": 7.945672191528545, + "grad_norm": 0.16649815440177917, + "learning_rate": 1.0668312626050608e-05, + "loss": 1.7233, + "step": 25887 + }, + { + "epoch": 7.94597912829957, + "grad_norm": 0.14395855367183685, + "learning_rate": 1.0665243897775645e-05, + "loss": 1.6859, + "step": 25888 + }, + { + "epoch": 7.946286065070596, + "grad_norm": 0.18934515118598938, + "learning_rate": 1.0662175558229826e-05, + "loss": 1.6832, + "step": 25889 + }, + { + "epoch": 7.94659300184162, + "grad_norm": 0.16819562017917633, + "learning_rate": 1.0659107607443419e-05, + "loss": 1.7592, + "step": 25890 + }, + { + "epoch": 7.9468999386126455, + "grad_norm": 0.1701207458972931, + "learning_rate": 1.0656040045446798e-05, + "loss": 1.6909, + "step": 25891 + }, + { + "epoch": 
7.947206875383671, + "grad_norm": 0.18011561036109924, + "learning_rate": 1.0652972872270217e-05, + "loss": 1.7687, + "step": 25892 + }, + { + "epoch": 7.947513812154696, + "grad_norm": 0.15422853827476501, + "learning_rate": 1.0649906087944034e-05, + "loss": 1.6957, + "step": 25893 + }, + { + "epoch": 7.9478207489257215, + "grad_norm": 0.17223568260669708, + "learning_rate": 1.0646839692498545e-05, + "loss": 1.7368, + "step": 25894 + }, + { + "epoch": 7.948127685696747, + "grad_norm": 0.16706988215446472, + "learning_rate": 1.0643773685964053e-05, + "loss": 1.6981, + "step": 25895 + }, + { + "epoch": 7.948434622467771, + "grad_norm": 0.15490150451660156, + "learning_rate": 1.0640708068370853e-05, + "loss": 1.705, + "step": 25896 + }, + { + "epoch": 7.948741559238797, + "grad_norm": 0.16119123995304108, + "learning_rate": 1.0637642839749246e-05, + "loss": 1.7519, + "step": 25897 + }, + { + "epoch": 7.949048496009822, + "grad_norm": 0.1669061779975891, + "learning_rate": 1.0634578000129524e-05, + "loss": 1.7228, + "step": 25898 + }, + { + "epoch": 7.949355432780847, + "grad_norm": 0.1974606215953827, + "learning_rate": 1.0631513549541976e-05, + "loss": 1.7188, + "step": 25899 + }, + { + "epoch": 7.949662369551873, + "grad_norm": 0.204077810049057, + "learning_rate": 1.0628449488016873e-05, + "loss": 1.7397, + "step": 25900 + }, + { + "epoch": 7.949969306322897, + "grad_norm": 0.13561539351940155, + "learning_rate": 1.0625385815584537e-05, + "loss": 1.6457, + "step": 25901 + }, + { + "epoch": 7.9502762430939224, + "grad_norm": 0.1736447811126709, + "learning_rate": 1.0622322532275186e-05, + "loss": 1.7278, + "step": 25902 + }, + { + "epoch": 7.950583179864948, + "grad_norm": 0.1712762862443924, + "learning_rate": 1.061925963811915e-05, + "loss": 1.7208, + "step": 25903 + }, + { + "epoch": 7.950890116635973, + "grad_norm": 0.15313011407852173, + "learning_rate": 1.0616197133146661e-05, + "loss": 1.671, + "step": 25904 + }, + { + "epoch": 7.9511970534069984, + 
"grad_norm": 0.15110735595226288, + "learning_rate": 1.0613135017387981e-05, + "loss": 1.6568, + "step": 25905 + }, + { + "epoch": 7.951503990178024, + "grad_norm": 0.22678901255130768, + "learning_rate": 1.0610073290873413e-05, + "loss": 1.7415, + "step": 25906 + }, + { + "epoch": 7.951810926949048, + "grad_norm": 0.16936101019382477, + "learning_rate": 1.0607011953633162e-05, + "loss": 1.6983, + "step": 25907 + }, + { + "epoch": 7.952117863720074, + "grad_norm": 0.18443427979946136, + "learning_rate": 1.0603951005697533e-05, + "loss": 1.7334, + "step": 25908 + }, + { + "epoch": 7.952424800491099, + "grad_norm": 0.2290949672460556, + "learning_rate": 1.0600890447096729e-05, + "loss": 1.7219, + "step": 25909 + }, + { + "epoch": 7.952731737262124, + "grad_norm": 0.19244399666786194, + "learning_rate": 1.0597830277861026e-05, + "loss": 1.7047, + "step": 25910 + }, + { + "epoch": 7.953038674033149, + "grad_norm": 0.15806549787521362, + "learning_rate": 1.0594770498020657e-05, + "loss": 1.667, + "step": 25911 + }, + { + "epoch": 7.953345610804174, + "grad_norm": 0.23782655596733093, + "learning_rate": 1.0591711107605867e-05, + "loss": 1.7271, + "step": 25912 + }, + { + "epoch": 7.953652547575199, + "grad_norm": 0.18427079916000366, + "learning_rate": 1.0588652106646885e-05, + "loss": 1.7644, + "step": 25913 + }, + { + "epoch": 7.953959484346225, + "grad_norm": 0.18687991797924042, + "learning_rate": 1.058559349517394e-05, + "loss": 1.7045, + "step": 25914 + }, + { + "epoch": 7.95426642111725, + "grad_norm": 0.17435906827449799, + "learning_rate": 1.0582535273217265e-05, + "loss": 1.6681, + "step": 25915 + }, + { + "epoch": 7.954573357888275, + "grad_norm": 0.17601260542869568, + "learning_rate": 1.0579477440807079e-05, + "loss": 1.7141, + "step": 25916 + }, + { + "epoch": 7.9548802946593, + "grad_norm": 0.19225506484508514, + "learning_rate": 1.0576419997973586e-05, + "loss": 1.7224, + "step": 25917 + }, + { + "epoch": 7.955187231430325, + "grad_norm": 
0.18801991641521454, + "learning_rate": 1.0573362944747045e-05, + "loss": 1.715, + "step": 25918 + }, + { + "epoch": 7.9554941682013505, + "grad_norm": 0.21490465104579926, + "learning_rate": 1.0570306281157616e-05, + "loss": 1.7931, + "step": 25919 + }, + { + "epoch": 7.955801104972376, + "grad_norm": 0.1877163052558899, + "learning_rate": 1.0567250007235557e-05, + "loss": 1.7365, + "step": 25920 + }, + { + "epoch": 7.956108041743401, + "grad_norm": 0.18460121750831604, + "learning_rate": 1.0564194123011029e-05, + "loss": 1.7092, + "step": 25921 + }, + { + "epoch": 7.956414978514426, + "grad_norm": 0.1663859337568283, + "learning_rate": 1.0561138628514239e-05, + "loss": 1.6847, + "step": 25922 + }, + { + "epoch": 7.956721915285451, + "grad_norm": 0.1676093488931656, + "learning_rate": 1.0558083523775413e-05, + "loss": 1.6788, + "step": 25923 + }, + { + "epoch": 7.957028852056476, + "grad_norm": 0.17470842599868774, + "learning_rate": 1.0555028808824702e-05, + "loss": 1.7658, + "step": 25924 + }, + { + "epoch": 7.957335788827502, + "grad_norm": 0.17770788073539734, + "learning_rate": 1.0551974483692346e-05, + "loss": 1.6875, + "step": 25925 + }, + { + "epoch": 7.957642725598527, + "grad_norm": 0.17924711108207703, + "learning_rate": 1.054892054840847e-05, + "loss": 1.7024, + "step": 25926 + }, + { + "epoch": 7.957949662369552, + "grad_norm": 0.19387175142765045, + "learning_rate": 1.0545867003003296e-05, + "loss": 1.7806, + "step": 25927 + }, + { + "epoch": 7.958256599140577, + "grad_norm": 0.176667258143425, + "learning_rate": 1.0542813847506988e-05, + "loss": 1.7187, + "step": 25928 + }, + { + "epoch": 7.958563535911602, + "grad_norm": 0.1730370670557022, + "learning_rate": 1.0539761081949723e-05, + "loss": 1.6912, + "step": 25929 + }, + { + "epoch": 7.958870472682627, + "grad_norm": 0.1836516112089157, + "learning_rate": 1.0536708706361665e-05, + "loss": 1.684, + "step": 25930 + }, + { + "epoch": 7.959177409453653, + "grad_norm": 0.17236517369747162, + 
"learning_rate": 1.0533656720772983e-05, + "loss": 1.6799, + "step": 25931 + }, + { + "epoch": 7.959484346224678, + "grad_norm": 0.1655581295490265, + "learning_rate": 1.0530605125213832e-05, + "loss": 1.755, + "step": 25932 + }, + { + "epoch": 7.9597912829957025, + "grad_norm": 0.1801871806383133, + "learning_rate": 1.0527553919714383e-05, + "loss": 1.6998, + "step": 25933 + }, + { + "epoch": 7.960098219766728, + "grad_norm": 0.20504651963710785, + "learning_rate": 1.052450310430476e-05, + "loss": 1.7793, + "step": 25934 + }, + { + "epoch": 7.960405156537753, + "grad_norm": 0.2522159516811371, + "learning_rate": 1.052145267901517e-05, + "loss": 1.754, + "step": 25935 + }, + { + "epoch": 7.9607120933087785, + "grad_norm": 0.18074269592761993, + "learning_rate": 1.0518402643875691e-05, + "loss": 1.717, + "step": 25936 + }, + { + "epoch": 7.961019030079804, + "grad_norm": 0.16463595628738403, + "learning_rate": 1.0515352998916527e-05, + "loss": 1.6994, + "step": 25937 + }, + { + "epoch": 7.961325966850829, + "grad_norm": 0.17102178931236267, + "learning_rate": 1.0512303744167778e-05, + "loss": 1.6571, + "step": 25938 + }, + { + "epoch": 7.961632903621854, + "grad_norm": 0.14453014731407166, + "learning_rate": 1.0509254879659569e-05, + "loss": 1.6725, + "step": 25939 + }, + { + "epoch": 7.961939840392879, + "grad_norm": 0.1980808526277542, + "learning_rate": 1.050620640542208e-05, + "loss": 1.6847, + "step": 25940 + }, + { + "epoch": 7.962246777163904, + "grad_norm": 0.15021857619285583, + "learning_rate": 1.0503158321485378e-05, + "loss": 1.6896, + "step": 25941 + }, + { + "epoch": 7.96255371393493, + "grad_norm": 0.2223394513130188, + "learning_rate": 1.0500110627879639e-05, + "loss": 1.7167, + "step": 25942 + }, + { + "epoch": 7.962860650705955, + "grad_norm": 0.17636358737945557, + "learning_rate": 1.0497063324634937e-05, + "loss": 1.6625, + "step": 25943 + }, + { + "epoch": 7.963167587476979, + "grad_norm": 0.1823662370443344, + "learning_rate": 
1.049401641178142e-05, + "loss": 1.7139, + "step": 25944 + }, + { + "epoch": 7.963474524248005, + "grad_norm": 0.1740594059228897, + "learning_rate": 1.0490969889349189e-05, + "loss": 1.7447, + "step": 25945 + }, + { + "epoch": 7.96378146101903, + "grad_norm": 0.15838129818439484, + "learning_rate": 1.0487923757368351e-05, + "loss": 1.7051, + "step": 25946 + }, + { + "epoch": 7.964088397790055, + "grad_norm": 0.4309011399745941, + "learning_rate": 1.0484878015869005e-05, + "loss": 1.7442, + "step": 25947 + }, + { + "epoch": 7.964395334561081, + "grad_norm": 0.17090202867984772, + "learning_rate": 1.0481832664881257e-05, + "loss": 1.652, + "step": 25948 + }, + { + "epoch": 7.964702271332106, + "grad_norm": 0.16977159678936005, + "learning_rate": 1.0478787704435206e-05, + "loss": 1.6894, + "step": 25949 + }, + { + "epoch": 7.9650092081031305, + "grad_norm": 0.20473513007164001, + "learning_rate": 1.0475743134560934e-05, + "loss": 1.8141, + "step": 25950 + }, + { + "epoch": 7.965316144874156, + "grad_norm": 0.1775660663843155, + "learning_rate": 1.0472698955288535e-05, + "loss": 1.7204, + "step": 25951 + }, + { + "epoch": 7.965623081645181, + "grad_norm": 0.21351923048496246, + "learning_rate": 1.046965516664809e-05, + "loss": 1.7364, + "step": 25952 + }, + { + "epoch": 7.9659300184162065, + "grad_norm": 0.2034255862236023, + "learning_rate": 1.0466611768669671e-05, + "loss": 1.7096, + "step": 25953 + }, + { + "epoch": 7.966236955187231, + "grad_norm": 0.17075900733470917, + "learning_rate": 1.0463568761383396e-05, + "loss": 1.6928, + "step": 25954 + }, + { + "epoch": 7.966543891958256, + "grad_norm": 0.18142712116241455, + "learning_rate": 1.0460526144819288e-05, + "loss": 1.7146, + "step": 25955 + }, + { + "epoch": 7.966850828729282, + "grad_norm": 0.14901846647262573, + "learning_rate": 1.0457483919007427e-05, + "loss": 1.6841, + "step": 25956 + }, + { + "epoch": 7.967157765500307, + "grad_norm": 0.17380031943321228, + "learning_rate": 1.0454442083977912e-05, + 
"loss": 1.6911, + "step": 25957 + }, + { + "epoch": 7.967464702271332, + "grad_norm": 0.15983760356903076, + "learning_rate": 1.045140063976075e-05, + "loss": 1.6866, + "step": 25958 + }, + { + "epoch": 7.967771639042358, + "grad_norm": 0.1559101641178131, + "learning_rate": 1.0448359586386058e-05, + "loss": 1.6793, + "step": 25959 + }, + { + "epoch": 7.968078575813382, + "grad_norm": 0.14843949675559998, + "learning_rate": 1.0445318923883829e-05, + "loss": 1.6835, + "step": 25960 + }, + { + "epoch": 7.968385512584407, + "grad_norm": 0.16452330350875854, + "learning_rate": 1.0442278652284155e-05, + "loss": 1.7304, + "step": 25961 + }, + { + "epoch": 7.968692449355433, + "grad_norm": 0.18997763097286224, + "learning_rate": 1.0439238771617066e-05, + "loss": 1.7425, + "step": 25962 + }, + { + "epoch": 7.968999386126458, + "grad_norm": 0.1654025912284851, + "learning_rate": 1.0436199281912611e-05, + "loss": 1.6909, + "step": 25963 + }, + { + "epoch": 7.969306322897483, + "grad_norm": 0.1313011646270752, + "learning_rate": 1.0433160183200823e-05, + "loss": 1.6572, + "step": 25964 + }, + { + "epoch": 7.969613259668508, + "grad_norm": 0.1584165096282959, + "learning_rate": 1.043012147551174e-05, + "loss": 1.7257, + "step": 25965 + }, + { + "epoch": 7.969920196439533, + "grad_norm": 0.17830775678157806, + "learning_rate": 1.0427083158875384e-05, + "loss": 1.7382, + "step": 25966 + }, + { + "epoch": 7.9702271332105585, + "grad_norm": 0.19006042182445526, + "learning_rate": 1.0424045233321788e-05, + "loss": 1.7366, + "step": 25967 + }, + { + "epoch": 7.970534069981584, + "grad_norm": 0.15366297960281372, + "learning_rate": 1.0421007698880974e-05, + "loss": 1.7235, + "step": 25968 + }, + { + "epoch": 7.970841006752609, + "grad_norm": 0.14415831863880157, + "learning_rate": 1.0417970555582963e-05, + "loss": 1.6945, + "step": 25969 + }, + { + "epoch": 7.9711479435236345, + "grad_norm": 0.16916446387767792, + "learning_rate": 1.041493380345775e-05, + "loss": 1.7099, + "step": 
25970 + }, + { + "epoch": 7.971454880294659, + "grad_norm": 0.1456119269132614, + "learning_rate": 1.041189744253539e-05, + "loss": 1.6544, + "step": 25971 + }, + { + "epoch": 7.971761817065684, + "grad_norm": 0.20085962116718292, + "learning_rate": 1.040886147284585e-05, + "loss": 1.699, + "step": 25972 + }, + { + "epoch": 7.97206875383671, + "grad_norm": 0.1815454363822937, + "learning_rate": 1.0405825894419141e-05, + "loss": 1.7503, + "step": 25973 + }, + { + "epoch": 7.972375690607735, + "grad_norm": 0.2010805308818817, + "learning_rate": 1.040279070728527e-05, + "loss": 1.7061, + "step": 25974 + }, + { + "epoch": 7.97268262737876, + "grad_norm": 0.22105813026428223, + "learning_rate": 1.0399755911474218e-05, + "loss": 1.7262, + "step": 25975 + }, + { + "epoch": 7.972989564149785, + "grad_norm": 0.16186046600341797, + "learning_rate": 1.0396721507016017e-05, + "loss": 1.7229, + "step": 25976 + }, + { + "epoch": 7.97329650092081, + "grad_norm": 0.19990484416484833, + "learning_rate": 1.0393687493940597e-05, + "loss": 1.7006, + "step": 25977 + }, + { + "epoch": 7.973603437691835, + "grad_norm": 0.2377716600894928, + "learning_rate": 1.0390653872277983e-05, + "loss": 1.7302, + "step": 25978 + }, + { + "epoch": 7.973910374462861, + "grad_norm": 0.14087189733982086, + "learning_rate": 1.0387620642058148e-05, + "loss": 1.6563, + "step": 25979 + }, + { + "epoch": 7.974217311233886, + "grad_norm": 0.246252179145813, + "learning_rate": 1.0384587803311063e-05, + "loss": 1.6661, + "step": 25980 + }, + { + "epoch": 7.974524248004911, + "grad_norm": 0.18734396994113922, + "learning_rate": 1.0381555356066697e-05, + "loss": 1.7566, + "step": 25981 + }, + { + "epoch": 7.974831184775936, + "grad_norm": 0.1621570736169815, + "learning_rate": 1.0378523300355025e-05, + "loss": 1.6863, + "step": 25982 + }, + { + "epoch": 7.975138121546961, + "grad_norm": 0.2571845054626465, + "learning_rate": 1.0375491636206002e-05, + "loss": 1.7589, + "step": 25983 + }, + { + "epoch": 
7.975445058317987, + "grad_norm": 0.1880367249250412, + "learning_rate": 1.0372460363649606e-05, + "loss": 1.6999, + "step": 25984 + }, + { + "epoch": 7.975751995089012, + "grad_norm": 0.20473778247833252, + "learning_rate": 1.0369429482715776e-05, + "loss": 1.749, + "step": 25985 + }, + { + "epoch": 7.976058931860036, + "grad_norm": 0.19917427003383636, + "learning_rate": 1.0366398993434473e-05, + "loss": 1.701, + "step": 25986 + }, + { + "epoch": 7.976365868631062, + "grad_norm": 0.1758740097284317, + "learning_rate": 1.0363368895835635e-05, + "loss": 1.6774, + "step": 25987 + }, + { + "epoch": 7.976672805402087, + "grad_norm": 0.26412737369537354, + "learning_rate": 1.0360339189949242e-05, + "loss": 1.6778, + "step": 25988 + }, + { + "epoch": 7.976979742173112, + "grad_norm": 0.19599425792694092, + "learning_rate": 1.0357309875805194e-05, + "loss": 1.777, + "step": 25989 + }, + { + "epoch": 7.977286678944138, + "grad_norm": 0.2095821648836136, + "learning_rate": 1.0354280953433449e-05, + "loss": 1.7106, + "step": 25990 + }, + { + "epoch": 7.977593615715163, + "grad_norm": 0.1743748039007187, + "learning_rate": 1.0351252422863934e-05, + "loss": 1.6891, + "step": 25991 + }, + { + "epoch": 7.9779005524861875, + "grad_norm": 0.17273737490177155, + "learning_rate": 1.0348224284126573e-05, + "loss": 1.7254, + "step": 25992 + }, + { + "epoch": 7.978207489257213, + "grad_norm": 0.2032385915517807, + "learning_rate": 1.0345196537251322e-05, + "loss": 1.707, + "step": 25993 + }, + { + "epoch": 7.978514426028238, + "grad_norm": 0.17978399991989136, + "learning_rate": 1.0342169182268057e-05, + "loss": 1.695, + "step": 25994 + }, + { + "epoch": 7.9788213627992635, + "grad_norm": 0.20567134022712708, + "learning_rate": 1.0339142219206744e-05, + "loss": 1.6726, + "step": 25995 + }, + { + "epoch": 7.979128299570289, + "grad_norm": 0.19649706780910492, + "learning_rate": 1.033611564809725e-05, + "loss": 1.737, + "step": 25996 + }, + { + "epoch": 7.979435236341313, + "grad_norm": 
0.1640859991312027, + "learning_rate": 1.033308946896952e-05, + "loss": 1.6993, + "step": 25997 + }, + { + "epoch": 7.979742173112339, + "grad_norm": 0.21497343480587006, + "learning_rate": 1.0330063681853452e-05, + "loss": 1.7387, + "step": 25998 + }, + { + "epoch": 7.980049109883364, + "grad_norm": 0.14995479583740234, + "learning_rate": 1.0327038286778946e-05, + "loss": 1.6671, + "step": 25999 + }, + { + "epoch": 7.980356046654389, + "grad_norm": 0.1836833655834198, + "learning_rate": 1.0324013283775895e-05, + "loss": 1.7279, + "step": 26000 + }, + { + "epoch": 7.980662983425415, + "grad_norm": 0.14769285917282104, + "learning_rate": 1.032098867287421e-05, + "loss": 1.707, + "step": 26001 + }, + { + "epoch": 7.98096992019644, + "grad_norm": 0.24206426739692688, + "learning_rate": 1.0317964454103762e-05, + "loss": 1.8122, + "step": 26002 + }, + { + "epoch": 7.981276856967464, + "grad_norm": 0.16573204100131989, + "learning_rate": 1.0314940627494451e-05, + "loss": 1.7079, + "step": 26003 + }, + { + "epoch": 7.98158379373849, + "grad_norm": 0.1825968325138092, + "learning_rate": 1.0311917193076143e-05, + "loss": 1.6795, + "step": 26004 + }, + { + "epoch": 7.981890730509515, + "grad_norm": 0.14462140202522278, + "learning_rate": 1.0308894150878761e-05, + "loss": 1.7152, + "step": 26005 + }, + { + "epoch": 7.98219766728054, + "grad_norm": 0.15220513939857483, + "learning_rate": 1.0305871500932135e-05, + "loss": 1.6657, + "step": 26006 + }, + { + "epoch": 7.982504604051566, + "grad_norm": 0.17780731618404388, + "learning_rate": 1.030284924326615e-05, + "loss": 1.6852, + "step": 26007 + }, + { + "epoch": 7.98281154082259, + "grad_norm": 0.13492488861083984, + "learning_rate": 1.0299827377910681e-05, + "loss": 1.6331, + "step": 26008 + }, + { + "epoch": 7.9831184775936155, + "grad_norm": 0.1566525399684906, + "learning_rate": 1.0296805904895568e-05, + "loss": 1.6918, + "step": 26009 + }, + { + "epoch": 7.983425414364641, + "grad_norm": 0.17075398564338684, + 
"learning_rate": 1.0293784824250725e-05, + "loss": 1.7107, + "step": 26010 + }, + { + "epoch": 7.983732351135666, + "grad_norm": 0.16693715751171112, + "learning_rate": 1.0290764136005937e-05, + "loss": 1.6773, + "step": 26011 + }, + { + "epoch": 7.9840392879066915, + "grad_norm": 0.23020583391189575, + "learning_rate": 1.0287743840191122e-05, + "loss": 1.7389, + "step": 26012 + }, + { + "epoch": 7.984346224677717, + "grad_norm": 0.2185986489057541, + "learning_rate": 1.0284723936836071e-05, + "loss": 1.7039, + "step": 26013 + }, + { + "epoch": 7.984653161448741, + "grad_norm": 0.1527925282716751, + "learning_rate": 1.0281704425970673e-05, + "loss": 1.6981, + "step": 26014 + }, + { + "epoch": 7.984960098219767, + "grad_norm": 0.23389141261577606, + "learning_rate": 1.0278685307624747e-05, + "loss": 1.7511, + "step": 26015 + }, + { + "epoch": 7.985267034990792, + "grad_norm": 0.1481025218963623, + "learning_rate": 1.0275666581828137e-05, + "loss": 1.6551, + "step": 26016 + }, + { + "epoch": 7.985573971761817, + "grad_norm": 0.18131811916828156, + "learning_rate": 1.0272648248610672e-05, + "loss": 1.7024, + "step": 26017 + }, + { + "epoch": 7.985880908532843, + "grad_norm": 0.15969321131706238, + "learning_rate": 1.0269630308002182e-05, + "loss": 1.7269, + "step": 26018 + }, + { + "epoch": 7.986187845303867, + "grad_norm": 0.16655376553535461, + "learning_rate": 1.026661276003249e-05, + "loss": 1.6649, + "step": 26019 + }, + { + "epoch": 7.986494782074892, + "grad_norm": 0.16438528895378113, + "learning_rate": 1.0263595604731425e-05, + "loss": 1.6901, + "step": 26020 + }, + { + "epoch": 7.986801718845918, + "grad_norm": 0.23586809635162354, + "learning_rate": 1.0260578842128782e-05, + "loss": 1.7983, + "step": 26021 + }, + { + "epoch": 7.987108655616943, + "grad_norm": 0.15142324566841125, + "learning_rate": 1.0257562472254417e-05, + "loss": 1.6327, + "step": 26022 + }, + { + "epoch": 7.987415592387968, + "grad_norm": 0.17198510468006134, + "learning_rate": 
1.0254546495138096e-05, + "loss": 1.7119, + "step": 26023 + }, + { + "epoch": 7.987722529158994, + "grad_norm": 0.1675531417131424, + "learning_rate": 1.0251530910809648e-05, + "loss": 1.695, + "step": 26024 + }, + { + "epoch": 7.988029465930018, + "grad_norm": 0.17403315007686615, + "learning_rate": 1.0248515719298867e-05, + "loss": 1.7216, + "step": 26025 + }, + { + "epoch": 7.9883364027010435, + "grad_norm": 0.16039720177650452, + "learning_rate": 1.0245500920635537e-05, + "loss": 1.7315, + "step": 26026 + }, + { + "epoch": 7.988643339472069, + "grad_norm": 0.19715416431427002, + "learning_rate": 1.0242486514849498e-05, + "loss": 1.7308, + "step": 26027 + }, + { + "epoch": 7.988950276243094, + "grad_norm": 0.14576783776283264, + "learning_rate": 1.0239472501970482e-05, + "loss": 1.6589, + "step": 26028 + }, + { + "epoch": 7.989257213014119, + "grad_norm": 0.1631615310907364, + "learning_rate": 1.0236458882028333e-05, + "loss": 1.7494, + "step": 26029 + }, + { + "epoch": 7.989564149785144, + "grad_norm": 0.19368192553520203, + "learning_rate": 1.023344565505277e-05, + "loss": 1.735, + "step": 26030 + }, + { + "epoch": 7.989871086556169, + "grad_norm": 0.1902317851781845, + "learning_rate": 1.023043282107362e-05, + "loss": 1.7573, + "step": 26031 + }, + { + "epoch": 7.990178023327195, + "grad_norm": 0.18496233224868774, + "learning_rate": 1.0227420380120651e-05, + "loss": 1.7368, + "step": 26032 + }, + { + "epoch": 7.99048496009822, + "grad_norm": 0.172613263130188, + "learning_rate": 1.0224408332223617e-05, + "loss": 1.6943, + "step": 26033 + }, + { + "epoch": 7.990791896869245, + "grad_norm": 0.19840112328529358, + "learning_rate": 1.0221396677412293e-05, + "loss": 1.7562, + "step": 26034 + }, + { + "epoch": 7.99109883364027, + "grad_norm": 0.18129339814186096, + "learning_rate": 1.0218385415716441e-05, + "loss": 1.6746, + "step": 26035 + }, + { + "epoch": 7.991405770411295, + "grad_norm": 0.17933470010757446, + "learning_rate": 1.021537454716583e-05, + "loss": 
1.7324, + "step": 26036 + }, + { + "epoch": 7.99171270718232, + "grad_norm": 0.14947326481342316, + "learning_rate": 1.0212364071790198e-05, + "loss": 1.632, + "step": 26037 + }, + { + "epoch": 7.992019643953346, + "grad_norm": 0.18452878296375275, + "learning_rate": 1.0209353989619291e-05, + "loss": 1.6737, + "step": 26038 + }, + { + "epoch": 7.992326580724371, + "grad_norm": 0.18882198631763458, + "learning_rate": 1.0206344300682901e-05, + "loss": 1.7529, + "step": 26039 + }, + { + "epoch": 7.9926335174953955, + "grad_norm": 0.1855655312538147, + "learning_rate": 1.0203335005010722e-05, + "loss": 1.7347, + "step": 26040 + }, + { + "epoch": 7.992940454266421, + "grad_norm": 0.16447728872299194, + "learning_rate": 1.0200326102632518e-05, + "loss": 1.6659, + "step": 26041 + }, + { + "epoch": 7.993247391037446, + "grad_norm": 0.17379891872406006, + "learning_rate": 1.0197317593578016e-05, + "loss": 1.6962, + "step": 26042 + }, + { + "epoch": 7.9935543278084715, + "grad_norm": 0.16298875212669373, + "learning_rate": 1.0194309477876934e-05, + "loss": 1.6815, + "step": 26043 + }, + { + "epoch": 7.993861264579497, + "grad_norm": 0.1883227378129959, + "learning_rate": 1.0191301755559047e-05, + "loss": 1.7053, + "step": 26044 + }, + { + "epoch": 7.994168201350522, + "grad_norm": 0.20746919512748718, + "learning_rate": 1.0188294426654021e-05, + "loss": 1.7476, + "step": 26045 + }, + { + "epoch": 7.994475138121547, + "grad_norm": 0.1882137805223465, + "learning_rate": 1.0185287491191631e-05, + "loss": 1.7078, + "step": 26046 + }, + { + "epoch": 7.994782074892572, + "grad_norm": 0.21140792965888977, + "learning_rate": 1.0182280949201539e-05, + "loss": 1.7729, + "step": 26047 + }, + { + "epoch": 7.995089011663597, + "grad_norm": 0.18779736757278442, + "learning_rate": 1.0179274800713501e-05, + "loss": 1.7413, + "step": 26048 + }, + { + "epoch": 7.995395948434623, + "grad_norm": 0.1841782033443451, + "learning_rate": 1.0176269045757202e-05, + "loss": 1.7058, + "step": 26049 + 
}, + { + "epoch": 7.995702885205648, + "grad_norm": 0.19872064888477325, + "learning_rate": 1.017326368436236e-05, + "loss": 1.7522, + "step": 26050 + }, + { + "epoch": 7.996009821976672, + "grad_norm": 0.1763429492712021, + "learning_rate": 1.0170258716558667e-05, + "loss": 1.7178, + "step": 26051 + }, + { + "epoch": 7.996316758747698, + "grad_norm": 0.20209169387817383, + "learning_rate": 1.0167254142375826e-05, + "loss": 1.723, + "step": 26052 + }, + { + "epoch": 7.996623695518723, + "grad_norm": 0.15985172986984253, + "learning_rate": 1.0164249961843519e-05, + "loss": 1.6985, + "step": 26053 + }, + { + "epoch": 7.996930632289748, + "grad_norm": 0.1985132247209549, + "learning_rate": 1.0161246174991451e-05, + "loss": 1.7982, + "step": 26054 + }, + { + "epoch": 7.997237569060774, + "grad_norm": 0.17600803077220917, + "learning_rate": 1.0158242781849292e-05, + "loss": 1.7009, + "step": 26055 + }, + { + "epoch": 7.997544505831799, + "grad_norm": 0.15485480427742004, + "learning_rate": 1.015523978244673e-05, + "loss": 1.675, + "step": 26056 + }, + { + "epoch": 7.9978514426028235, + "grad_norm": 0.18465322256088257, + "learning_rate": 1.0152237176813446e-05, + "loss": 1.7156, + "step": 26057 + }, + { + "epoch": 7.998158379373849, + "grad_norm": 0.2183876633644104, + "learning_rate": 1.014923496497911e-05, + "loss": 1.7805, + "step": 26058 + }, + { + "epoch": 7.998465316144874, + "grad_norm": 0.18724960088729858, + "learning_rate": 1.014623314697339e-05, + "loss": 1.7047, + "step": 26059 + }, + { + "epoch": 7.9987722529158995, + "grad_norm": 0.15459159016609192, + "learning_rate": 1.0143231722825936e-05, + "loss": 1.6595, + "step": 26060 + }, + { + "epoch": 7.999079189686924, + "grad_norm": 0.16338171064853668, + "learning_rate": 1.0140230692566454e-05, + "loss": 1.6907, + "step": 26061 + }, + { + "epoch": 7.999386126457949, + "grad_norm": 0.16223935782909393, + "learning_rate": 1.013723005622455e-05, + "loss": 1.6866, + "step": 26062 + }, + { + "epoch": 
7.999693063228975, + "grad_norm": 0.18934771418571472, + "learning_rate": 1.0134229813829931e-05, + "loss": 1.706, + "step": 26063 + }, + { + "epoch": 8.0, + "grad_norm": 0.19117574393749237, + "learning_rate": 1.0131229965412191e-05, + "loss": 1.7392, + "step": 26064 + }, + { + "epoch": 8.000306936771025, + "grad_norm": 0.20491363108158112, + "learning_rate": 1.0128230511001019e-05, + "loss": 1.7488, + "step": 26065 + }, + { + "epoch": 8.00061387354205, + "grad_norm": 0.16383573412895203, + "learning_rate": 1.0125231450626043e-05, + "loss": 1.6958, + "step": 26066 + }, + { + "epoch": 8.000920810313076, + "grad_norm": 0.17405575513839722, + "learning_rate": 1.0122232784316898e-05, + "loss": 1.701, + "step": 26067 + }, + { + "epoch": 8.001227747084101, + "grad_norm": 0.1504749059677124, + "learning_rate": 1.0119234512103226e-05, + "loss": 1.6588, + "step": 26068 + }, + { + "epoch": 8.001534683855127, + "grad_norm": 0.15705156326293945, + "learning_rate": 1.0116236634014647e-05, + "loss": 1.6746, + "step": 26069 + }, + { + "epoch": 8.00184162062615, + "grad_norm": 0.18729639053344727, + "learning_rate": 1.01132391500808e-05, + "loss": 1.7634, + "step": 26070 + }, + { + "epoch": 8.002148557397176, + "grad_norm": 0.1855447143316269, + "learning_rate": 1.0110242060331304e-05, + "loss": 1.7588, + "step": 26071 + }, + { + "epoch": 8.002455494168201, + "grad_norm": 0.16488726437091827, + "learning_rate": 1.010724536479577e-05, + "loss": 1.7406, + "step": 26072 + }, + { + "epoch": 8.002762430939226, + "grad_norm": 0.17228275537490845, + "learning_rate": 1.0104249063503823e-05, + "loss": 1.7323, + "step": 26073 + }, + { + "epoch": 8.003069367710252, + "grad_norm": 0.1483743041753769, + "learning_rate": 1.0101253156485069e-05, + "loss": 1.7033, + "step": 26074 + }, + { + "epoch": 8.003376304481277, + "grad_norm": 0.2499883621931076, + "learning_rate": 1.0098257643769116e-05, + "loss": 1.7127, + "step": 26075 + }, + { + "epoch": 8.003683241252302, + "grad_norm": 
0.22971376776695251, + "learning_rate": 1.0095262525385568e-05, + "loss": 1.7582, + "step": 26076 + }, + { + "epoch": 8.003990178023328, + "grad_norm": 0.18424302339553833, + "learning_rate": 1.0092267801364014e-05, + "loss": 1.6948, + "step": 26077 + }, + { + "epoch": 8.004297114794353, + "grad_norm": 0.20067891478538513, + "learning_rate": 1.0089273471734085e-05, + "loss": 1.7259, + "step": 26078 + }, + { + "epoch": 8.004604051565378, + "grad_norm": 0.2022552639245987, + "learning_rate": 1.0086279536525322e-05, + "loss": 1.7332, + "step": 26079 + }, + { + "epoch": 8.004910988336404, + "grad_norm": 0.1658320426940918, + "learning_rate": 1.0083285995767362e-05, + "loss": 1.7424, + "step": 26080 + }, + { + "epoch": 8.005217925107427, + "grad_norm": 0.16180957853794098, + "learning_rate": 1.0080292849489741e-05, + "loss": 1.6797, + "step": 26081 + }, + { + "epoch": 8.005524861878452, + "grad_norm": 0.18383777141571045, + "learning_rate": 1.007730009772208e-05, + "loss": 1.7597, + "step": 26082 + }, + { + "epoch": 8.005831798649478, + "grad_norm": 0.17468489706516266, + "learning_rate": 1.0074307740493938e-05, + "loss": 1.7266, + "step": 26083 + }, + { + "epoch": 8.006138735420503, + "grad_norm": 0.1647786945104599, + "learning_rate": 1.0071315777834883e-05, + "loss": 1.6742, + "step": 26084 + }, + { + "epoch": 8.006445672191528, + "grad_norm": 0.23006537556648254, + "learning_rate": 1.0068324209774493e-05, + "loss": 1.6649, + "step": 26085 + }, + { + "epoch": 8.006752608962554, + "grad_norm": 0.19266989827156067, + "learning_rate": 1.0065333036342328e-05, + "loss": 1.7484, + "step": 26086 + }, + { + "epoch": 8.00705954573358, + "grad_norm": 0.1709250807762146, + "learning_rate": 1.0062342257567947e-05, + "loss": 1.6569, + "step": 26087 + }, + { + "epoch": 8.007366482504604, + "grad_norm": 0.15847361087799072, + "learning_rate": 1.005935187348091e-05, + "loss": 1.6907, + "step": 26088 + }, + { + "epoch": 8.00767341927563, + "grad_norm": 0.14707811176776886, + 
"learning_rate": 1.0056361884110765e-05, + "loss": 1.7121, + "step": 26089 + }, + { + "epoch": 8.007980356046655, + "grad_norm": 0.1740313321352005, + "learning_rate": 1.0053372289487067e-05, + "loss": 1.6978, + "step": 26090 + }, + { + "epoch": 8.008287292817679, + "grad_norm": 0.17271417379379272, + "learning_rate": 1.0050383089639354e-05, + "loss": 1.7673, + "step": 26091 + }, + { + "epoch": 8.008594229588704, + "grad_norm": 0.179611936211586, + "learning_rate": 1.0047394284597173e-05, + "loss": 1.7291, + "step": 26092 + }, + { + "epoch": 8.00890116635973, + "grad_norm": 0.1823183298110962, + "learning_rate": 1.0044405874390057e-05, + "loss": 1.7215, + "step": 26093 + }, + { + "epoch": 8.009208103130755, + "grad_norm": 0.2914387881755829, + "learning_rate": 1.004141785904753e-05, + "loss": 1.8169, + "step": 26094 + }, + { + "epoch": 8.00951503990178, + "grad_norm": 0.21860483288764954, + "learning_rate": 1.0038430238599156e-05, + "loss": 1.8372, + "step": 26095 + }, + { + "epoch": 8.009821976672805, + "grad_norm": 0.2060404270887375, + "learning_rate": 1.0035443013074407e-05, + "loss": 1.7224, + "step": 26096 + }, + { + "epoch": 8.01012891344383, + "grad_norm": 0.21953152120113373, + "learning_rate": 1.003245618250287e-05, + "loss": 1.7571, + "step": 26097 + }, + { + "epoch": 8.010435850214856, + "grad_norm": 0.16731835901737213, + "learning_rate": 1.0029469746913995e-05, + "loss": 1.7222, + "step": 26098 + }, + { + "epoch": 8.010742786985881, + "grad_norm": 0.19284974038600922, + "learning_rate": 1.0026483706337336e-05, + "loss": 1.6582, + "step": 26099 + }, + { + "epoch": 8.011049723756907, + "grad_norm": 0.14466765522956848, + "learning_rate": 1.00234980608024e-05, + "loss": 1.6772, + "step": 26100 + }, + { + "epoch": 8.011356660527932, + "grad_norm": 0.19553600251674652, + "learning_rate": 1.0020512810338688e-05, + "loss": 1.6841, + "step": 26101 + }, + { + "epoch": 8.011663597298956, + "grad_norm": 0.19986452162265778, + "learning_rate": 
1.0017527954975698e-05, + "loss": 1.7025, + "step": 26102 + }, + { + "epoch": 8.011970534069981, + "grad_norm": 0.17204077541828156, + "learning_rate": 1.0014543494742933e-05, + "loss": 1.7508, + "step": 26103 + }, + { + "epoch": 8.012277470841006, + "grad_norm": 0.19889704883098602, + "learning_rate": 1.0011559429669887e-05, + "loss": 1.6973, + "step": 26104 + }, + { + "epoch": 8.012584407612032, + "grad_norm": 0.16140232980251312, + "learning_rate": 1.0008575759786042e-05, + "loss": 1.7932, + "step": 26105 + }, + { + "epoch": 8.012891344383057, + "grad_norm": 0.21359173953533173, + "learning_rate": 1.0005592485120896e-05, + "loss": 1.6986, + "step": 26106 + }, + { + "epoch": 8.013198281154082, + "grad_norm": 0.1766652911901474, + "learning_rate": 1.0002609605703927e-05, + "loss": 1.7275, + "step": 26107 + }, + { + "epoch": 8.013505217925108, + "grad_norm": 0.176233172416687, + "learning_rate": 9.999627121564614e-06, + "loss": 1.6787, + "step": 26108 + }, + { + "epoch": 8.013812154696133, + "grad_norm": 0.15688678622245789, + "learning_rate": 9.996645032732426e-06, + "loss": 1.6917, + "step": 26109 + }, + { + "epoch": 8.014119091467158, + "grad_norm": 0.1363043189048767, + "learning_rate": 9.993663339236842e-06, + "loss": 1.6621, + "step": 26110 + }, + { + "epoch": 8.014426028238184, + "grad_norm": 0.1586332768201828, + "learning_rate": 9.990682041107313e-06, + "loss": 1.7161, + "step": 26111 + }, + { + "epoch": 8.014732965009209, + "grad_norm": 0.19763816893100739, + "learning_rate": 9.987701138373334e-06, + "loss": 1.736, + "step": 26112 + }, + { + "epoch": 8.015039901780233, + "grad_norm": 0.15302304923534393, + "learning_rate": 9.984720631064326e-06, + "loss": 1.6814, + "step": 26113 + }, + { + "epoch": 8.015346838551258, + "grad_norm": 0.1768827736377716, + "learning_rate": 9.981740519209786e-06, + "loss": 1.7006, + "step": 26114 + }, + { + "epoch": 8.015653775322283, + "grad_norm": 0.14857567846775055, + "learning_rate": 9.978760802839116e-06, + "loss": 
1.6891, + "step": 26115 + }, + { + "epoch": 8.015960712093309, + "grad_norm": 0.20578980445861816, + "learning_rate": 9.9757814819818e-06, + "loss": 1.7798, + "step": 26116 + }, + { + "epoch": 8.016267648864334, + "grad_norm": 0.16164197027683258, + "learning_rate": 9.97280255666727e-06, + "loss": 1.6855, + "step": 26117 + }, + { + "epoch": 8.01657458563536, + "grad_norm": 0.2176574170589447, + "learning_rate": 9.969824026924968e-06, + "loss": 1.8144, + "step": 26118 + }, + { + "epoch": 8.016881522406385, + "grad_norm": 0.16946040093898773, + "learning_rate": 9.966845892784326e-06, + "loss": 1.7029, + "step": 26119 + }, + { + "epoch": 8.01718845917741, + "grad_norm": 0.17593413591384888, + "learning_rate": 9.96386815427478e-06, + "loss": 1.6993, + "step": 26120 + }, + { + "epoch": 8.017495395948435, + "grad_norm": 0.16679200530052185, + "learning_rate": 9.96089081142575e-06, + "loss": 1.6993, + "step": 26121 + }, + { + "epoch": 8.01780233271946, + "grad_norm": 0.19294987618923187, + "learning_rate": 9.957913864266667e-06, + "loss": 1.7417, + "step": 26122 + }, + { + "epoch": 8.018109269490484, + "grad_norm": 0.17427025735378265, + "learning_rate": 9.954937312826951e-06, + "loss": 1.6957, + "step": 26123 + }, + { + "epoch": 8.01841620626151, + "grad_norm": 0.1996718794107437, + "learning_rate": 9.951961157136013e-06, + "loss": 1.7348, + "step": 26124 + }, + { + "epoch": 8.018723143032535, + "grad_norm": 0.19701123237609863, + "learning_rate": 9.948985397223271e-06, + "loss": 1.7336, + "step": 26125 + }, + { + "epoch": 8.01903007980356, + "grad_norm": 0.15205782651901245, + "learning_rate": 9.946010033118124e-06, + "loss": 1.6971, + "step": 26126 + }, + { + "epoch": 8.019337016574585, + "grad_norm": 0.16516798734664917, + "learning_rate": 9.943035064849986e-06, + "loss": 1.7176, + "step": 26127 + }, + { + "epoch": 8.01964395334561, + "grad_norm": 0.18073998391628265, + "learning_rate": 9.94006049244825e-06, + "loss": 1.7344, + "step": 26128 + }, + { + "epoch": 
8.019950890116636, + "grad_norm": 0.15453651547431946, + "learning_rate": 9.937086315942324e-06, + "loss": 1.7268, + "step": 26129 + }, + { + "epoch": 8.020257826887661, + "grad_norm": 0.17114359140396118, + "learning_rate": 9.934112535361574e-06, + "loss": 1.6708, + "step": 26130 + }, + { + "epoch": 8.020564763658687, + "grad_norm": 0.15452778339385986, + "learning_rate": 9.931139150735431e-06, + "loss": 1.697, + "step": 26131 + }, + { + "epoch": 8.020871700429712, + "grad_norm": 0.18605299293994904, + "learning_rate": 9.928166162093234e-06, + "loss": 1.7463, + "step": 26132 + }, + { + "epoch": 8.021178637200737, + "grad_norm": 0.14081695675849915, + "learning_rate": 9.925193569464398e-06, + "loss": 1.678, + "step": 26133 + }, + { + "epoch": 8.021485573971761, + "grad_norm": 0.15573516488075256, + "learning_rate": 9.922221372878288e-06, + "loss": 1.7125, + "step": 26134 + }, + { + "epoch": 8.021792510742786, + "grad_norm": 0.1690043956041336, + "learning_rate": 9.919249572364275e-06, + "loss": 1.7067, + "step": 26135 + }, + { + "epoch": 8.022099447513812, + "grad_norm": 0.1895153820514679, + "learning_rate": 9.91627816795173e-06, + "loss": 1.7098, + "step": 26136 + }, + { + "epoch": 8.022406384284837, + "grad_norm": 0.1467704176902771, + "learning_rate": 9.913307159670022e-06, + "loss": 1.666, + "step": 26137 + }, + { + "epoch": 8.022713321055862, + "grad_norm": 0.17272399365901947, + "learning_rate": 9.910336547548505e-06, + "loss": 1.7017, + "step": 26138 + }, + { + "epoch": 8.023020257826888, + "grad_norm": 0.16714219748973846, + "learning_rate": 9.907366331616541e-06, + "loss": 1.7096, + "step": 26139 + }, + { + "epoch": 8.023327194597913, + "grad_norm": 0.1545754224061966, + "learning_rate": 9.90439651190348e-06, + "loss": 1.6768, + "step": 26140 + }, + { + "epoch": 8.023634131368938, + "grad_norm": 0.17502975463867188, + "learning_rate": 9.901427088438675e-06, + "loss": 1.6879, + "step": 26141 + }, + { + "epoch": 8.023941068139964, + "grad_norm": 
0.15835684537887573, + "learning_rate": 9.898458061251465e-06, + "loss": 1.6908, + "step": 26142 + }, + { + "epoch": 8.024248004910989, + "grad_norm": 0.19534549117088318, + "learning_rate": 9.895489430371202e-06, + "loss": 1.7235, + "step": 26143 + }, + { + "epoch": 8.024554941682014, + "grad_norm": 0.18291355669498444, + "learning_rate": 9.89252119582722e-06, + "loss": 1.7618, + "step": 26144 + }, + { + "epoch": 8.024861878453038, + "grad_norm": 0.1474599689245224, + "learning_rate": 9.889553357648844e-06, + "loss": 1.7011, + "step": 26145 + }, + { + "epoch": 8.025168815224063, + "grad_norm": 0.1801324188709259, + "learning_rate": 9.886585915865421e-06, + "loss": 1.7386, + "step": 26146 + }, + { + "epoch": 8.025475751995089, + "grad_norm": 0.16178105771541595, + "learning_rate": 9.883618870506245e-06, + "loss": 1.6903, + "step": 26147 + }, + { + "epoch": 8.025782688766114, + "grad_norm": 0.15138550102710724, + "learning_rate": 9.880652221600694e-06, + "loss": 1.7064, + "step": 26148 + }, + { + "epoch": 8.02608962553714, + "grad_norm": 0.22056828439235687, + "learning_rate": 9.877685969178018e-06, + "loss": 1.7879, + "step": 26149 + }, + { + "epoch": 8.026396562308165, + "grad_norm": 0.15810613334178925, + "learning_rate": 9.874720113267599e-06, + "loss": 1.6895, + "step": 26150 + }, + { + "epoch": 8.02670349907919, + "grad_norm": 0.15241321921348572, + "learning_rate": 9.871754653898685e-06, + "loss": 1.7103, + "step": 26151 + }, + { + "epoch": 8.027010435850215, + "grad_norm": 0.1609175056219101, + "learning_rate": 9.868789591100625e-06, + "loss": 1.6845, + "step": 26152 + }, + { + "epoch": 8.02731737262124, + "grad_norm": 0.16068117320537567, + "learning_rate": 9.865824924902706e-06, + "loss": 1.6688, + "step": 26153 + }, + { + "epoch": 8.027624309392266, + "grad_norm": 0.14036257565021515, + "learning_rate": 9.862860655334233e-06, + "loss": 1.6881, + "step": 26154 + }, + { + "epoch": 8.027931246163291, + "grad_norm": 0.16418461501598358, + "learning_rate": 
9.859896782424494e-06, + "loss": 1.7265, + "step": 26155 + }, + { + "epoch": 8.028238182934315, + "grad_norm": 0.19456401467323303, + "learning_rate": 9.856933306202782e-06, + "loss": 1.7152, + "step": 26156 + }, + { + "epoch": 8.02854511970534, + "grad_norm": 0.14537569880485535, + "learning_rate": 9.853970226698384e-06, + "loss": 1.6918, + "step": 26157 + }, + { + "epoch": 8.028852056476365, + "grad_norm": 0.18725928664207458, + "learning_rate": 9.851007543940578e-06, + "loss": 1.6815, + "step": 26158 + }, + { + "epoch": 8.02915899324739, + "grad_norm": 0.17676733434200287, + "learning_rate": 9.848045257958649e-06, + "loss": 1.7741, + "step": 26159 + }, + { + "epoch": 8.029465930018416, + "grad_norm": 0.1890053004026413, + "learning_rate": 9.845083368781877e-06, + "loss": 1.7433, + "step": 26160 + }, + { + "epoch": 8.029772866789441, + "grad_norm": 0.16931703686714172, + "learning_rate": 9.84212187643952e-06, + "loss": 1.7474, + "step": 26161 + }, + { + "epoch": 8.030079803560467, + "grad_norm": 0.17416565120220184, + "learning_rate": 9.839160780960855e-06, + "loss": 1.7259, + "step": 26162 + }, + { + "epoch": 8.030386740331492, + "grad_norm": 0.17702054977416992, + "learning_rate": 9.83620008237514e-06, + "loss": 1.7166, + "step": 26163 + }, + { + "epoch": 8.030693677102517, + "grad_norm": 0.1579936146736145, + "learning_rate": 9.833239780711622e-06, + "loss": 1.6593, + "step": 26164 + }, + { + "epoch": 8.031000613873543, + "grad_norm": 0.2263452112674713, + "learning_rate": 9.830279875999604e-06, + "loss": 1.7735, + "step": 26165 + }, + { + "epoch": 8.031307550644566, + "grad_norm": 0.160926952958107, + "learning_rate": 9.827320368268273e-06, + "loss": 1.7, + "step": 26166 + }, + { + "epoch": 8.031614487415592, + "grad_norm": 0.21756359934806824, + "learning_rate": 9.824361257546938e-06, + "loss": 1.736, + "step": 26167 + }, + { + "epoch": 8.031921424186617, + "grad_norm": 0.20553551614284515, + "learning_rate": 9.821402543864783e-06, + "loss": 1.7254, + 
"step": 26168 + }, + { + "epoch": 8.032228360957642, + "grad_norm": 0.14283208549022675, + "learning_rate": 9.818444227251089e-06, + "loss": 1.6532, + "step": 26169 + }, + { + "epoch": 8.032535297728668, + "grad_norm": 0.22624479234218597, + "learning_rate": 9.815486307735084e-06, + "loss": 1.7933, + "step": 26170 + }, + { + "epoch": 8.032842234499693, + "grad_norm": 0.15582896769046783, + "learning_rate": 9.812528785345999e-06, + "loss": 1.6959, + "step": 26171 + }, + { + "epoch": 8.033149171270718, + "grad_norm": 0.19829398393630981, + "learning_rate": 9.809571660113055e-06, + "loss": 1.7431, + "step": 26172 + }, + { + "epoch": 8.033456108041744, + "grad_norm": 0.1469334214925766, + "learning_rate": 9.806614932065477e-06, + "loss": 1.7441, + "step": 26173 + }, + { + "epoch": 8.033763044812769, + "grad_norm": 0.17737391591072083, + "learning_rate": 9.803658601232491e-06, + "loss": 1.719, + "step": 26174 + }, + { + "epoch": 8.034069981583794, + "grad_norm": 0.16895830631256104, + "learning_rate": 9.800702667643314e-06, + "loss": 1.7169, + "step": 26175 + }, + { + "epoch": 8.03437691835482, + "grad_norm": 0.17256470024585724, + "learning_rate": 9.79774713132715e-06, + "loss": 1.712, + "step": 26176 + }, + { + "epoch": 8.034683855125843, + "grad_norm": 0.1516820341348648, + "learning_rate": 9.794791992313213e-06, + "loss": 1.6345, + "step": 26177 + }, + { + "epoch": 8.034990791896869, + "grad_norm": 0.20021840929985046, + "learning_rate": 9.79183725063071e-06, + "loss": 1.6962, + "step": 26178 + }, + { + "epoch": 8.035297728667894, + "grad_norm": 0.19088859856128693, + "learning_rate": 9.788882906308832e-06, + "loss": 1.7719, + "step": 26179 + }, + { + "epoch": 8.03560466543892, + "grad_norm": 0.16831208765506744, + "learning_rate": 9.78592895937679e-06, + "loss": 1.7101, + "step": 26180 + }, + { + "epoch": 8.035911602209945, + "grad_norm": 0.15665093064308167, + "learning_rate": 9.782975409863749e-06, + "loss": 1.7328, + "step": 26181 + }, + { + "epoch": 
8.03621853898097, + "grad_norm": 0.20523908734321594, + "learning_rate": 9.780022257798943e-06, + "loss": 1.7338, + "step": 26182 + }, + { + "epoch": 8.036525475751995, + "grad_norm": 0.15819329023361206, + "learning_rate": 9.777069503211505e-06, + "loss": 1.7116, + "step": 26183 + }, + { + "epoch": 8.03683241252302, + "grad_norm": 0.14828373491764069, + "learning_rate": 9.774117146130673e-06, + "loss": 1.6671, + "step": 26184 + }, + { + "epoch": 8.037139349294046, + "grad_norm": 0.17743347585201263, + "learning_rate": 9.771165186585563e-06, + "loss": 1.7474, + "step": 26185 + }, + { + "epoch": 8.037446286065071, + "grad_norm": 0.14112113416194916, + "learning_rate": 9.768213624605388e-06, + "loss": 1.6324, + "step": 26186 + }, + { + "epoch": 8.037753222836097, + "grad_norm": 0.14532047510147095, + "learning_rate": 9.76526246021931e-06, + "loss": 1.6814, + "step": 26187 + }, + { + "epoch": 8.03806015960712, + "grad_norm": 0.16272012889385223, + "learning_rate": 9.762311693456489e-06, + "loss": 1.6556, + "step": 26188 + }, + { + "epoch": 8.038367096378146, + "grad_norm": 0.17599201202392578, + "learning_rate": 9.759361324346088e-06, + "loss": 1.7186, + "step": 26189 + }, + { + "epoch": 8.03867403314917, + "grad_norm": 0.20449498295783997, + "learning_rate": 9.75641135291726e-06, + "loss": 1.7324, + "step": 26190 + }, + { + "epoch": 8.038980969920196, + "grad_norm": 0.1787404716014862, + "learning_rate": 9.753461779199168e-06, + "loss": 1.7038, + "step": 26191 + }, + { + "epoch": 8.039287906691222, + "grad_norm": 0.15954211354255676, + "learning_rate": 9.750512603220956e-06, + "loss": 1.6926, + "step": 26192 + }, + { + "epoch": 8.039594843462247, + "grad_norm": 0.21806633472442627, + "learning_rate": 9.747563825011768e-06, + "loss": 1.7317, + "step": 26193 + }, + { + "epoch": 8.039901780233272, + "grad_norm": 0.14846986532211304, + "learning_rate": 9.744615444600746e-06, + "loss": 1.655, + "step": 26194 + }, + { + "epoch": 8.040208717004298, + "grad_norm": 
0.17799098789691925, + "learning_rate": 9.74166746201703e-06, + "loss": 1.6899, + "step": 26195 + }, + { + "epoch": 8.040515653775323, + "grad_norm": 0.1648644655942917, + "learning_rate": 9.738719877289754e-06, + "loss": 1.7181, + "step": 26196 + }, + { + "epoch": 8.040822590546348, + "grad_norm": 0.17811881005764008, + "learning_rate": 9.735772690448042e-06, + "loss": 1.7257, + "step": 26197 + }, + { + "epoch": 8.041129527317372, + "grad_norm": 0.19059741497039795, + "learning_rate": 9.732825901521014e-06, + "loss": 1.7306, + "step": 26198 + }, + { + "epoch": 8.041436464088397, + "grad_norm": 0.17326456308364868, + "learning_rate": 9.729879510537825e-06, + "loss": 1.6922, + "step": 26199 + }, + { + "epoch": 8.041743400859422, + "grad_norm": 0.1428811252117157, + "learning_rate": 9.726933517527548e-06, + "loss": 1.6495, + "step": 26200 + }, + { + "epoch": 8.042050337630448, + "grad_norm": 0.1494823843240738, + "learning_rate": 9.72398792251934e-06, + "loss": 1.6779, + "step": 26201 + }, + { + "epoch": 8.042357274401473, + "grad_norm": 0.19112205505371094, + "learning_rate": 9.721042725542267e-06, + "loss": 1.7794, + "step": 26202 + }, + { + "epoch": 8.042664211172498, + "grad_norm": 0.15820644795894623, + "learning_rate": 9.718097926625468e-06, + "loss": 1.6834, + "step": 26203 + }, + { + "epoch": 8.042971147943524, + "grad_norm": 0.17020943760871887, + "learning_rate": 9.715153525798043e-06, + "loss": 1.6852, + "step": 26204 + }, + { + "epoch": 8.043278084714549, + "grad_norm": 0.18933680653572083, + "learning_rate": 9.712209523089072e-06, + "loss": 1.7412, + "step": 26205 + }, + { + "epoch": 8.043585021485574, + "grad_norm": 0.16407641768455505, + "learning_rate": 9.709265918527666e-06, + "loss": 1.7209, + "step": 26206 + }, + { + "epoch": 8.0438919582566, + "grad_norm": 0.19043506681919098, + "learning_rate": 9.706322712142912e-06, + "loss": 1.7351, + "step": 26207 + }, + { + "epoch": 8.044198895027625, + "grad_norm": 0.14904475212097168, + "learning_rate": 
9.703379903963889e-06, + "loss": 1.7484, + "step": 26208 + }, + { + "epoch": 8.044505831798649, + "grad_norm": 0.14778849482536316, + "learning_rate": 9.700437494019682e-06, + "loss": 1.7231, + "step": 26209 + }, + { + "epoch": 8.044812768569674, + "grad_norm": 0.186212420463562, + "learning_rate": 9.697495482339374e-06, + "loss": 1.7153, + "step": 26210 + }, + { + "epoch": 8.0451197053407, + "grad_norm": 0.13795694708824158, + "learning_rate": 9.694553868952044e-06, + "loss": 1.693, + "step": 26211 + }, + { + "epoch": 8.045426642111725, + "grad_norm": 0.16083405911922455, + "learning_rate": 9.69161265388675e-06, + "loss": 1.669, + "step": 26212 + }, + { + "epoch": 8.04573357888275, + "grad_norm": 0.15548262000083923, + "learning_rate": 9.688671837172569e-06, + "loss": 1.7265, + "step": 26213 + }, + { + "epoch": 8.046040515653775, + "grad_norm": 0.14771351218223572, + "learning_rate": 9.685731418838556e-06, + "loss": 1.6978, + "step": 26214 + }, + { + "epoch": 8.0463474524248, + "grad_norm": 0.1525130569934845, + "learning_rate": 9.682791398913765e-06, + "loss": 1.731, + "step": 26215 + }, + { + "epoch": 8.046654389195826, + "grad_norm": 0.16103293001651764, + "learning_rate": 9.679851777427284e-06, + "loss": 1.7015, + "step": 26216 + }, + { + "epoch": 8.046961325966851, + "grad_norm": 0.16990229487419128, + "learning_rate": 9.676912554408112e-06, + "loss": 1.6995, + "step": 26217 + }, + { + "epoch": 8.047268262737877, + "grad_norm": 0.14605717360973358, + "learning_rate": 9.673973729885355e-06, + "loss": 1.7085, + "step": 26218 + }, + { + "epoch": 8.047575199508902, + "grad_norm": 0.19646432995796204, + "learning_rate": 9.671035303887993e-06, + "loss": 1.8441, + "step": 26219 + }, + { + "epoch": 8.047882136279926, + "grad_norm": 0.2000361531972885, + "learning_rate": 9.668097276445115e-06, + "loss": 1.7126, + "step": 26220 + }, + { + "epoch": 8.04818907305095, + "grad_norm": 0.2262575775384903, + "learning_rate": 9.665159647585736e-06, + "loss": 1.7721, + "step": 
26221 + }, + { + "epoch": 8.048496009821976, + "grad_norm": 0.1880655288696289, + "learning_rate": 9.662222417338895e-06, + "loss": 1.7151, + "step": 26222 + }, + { + "epoch": 8.048802946593002, + "grad_norm": 0.1746743619441986, + "learning_rate": 9.659285585733613e-06, + "loss": 1.6745, + "step": 26223 + }, + { + "epoch": 8.049109883364027, + "grad_norm": 0.14917364716529846, + "learning_rate": 9.656349152798916e-06, + "loss": 1.6541, + "step": 26224 + }, + { + "epoch": 8.049416820135052, + "grad_norm": 0.18189994990825653, + "learning_rate": 9.65341311856382e-06, + "loss": 1.7361, + "step": 26225 + }, + { + "epoch": 8.049723756906078, + "grad_norm": 0.16237786412239075, + "learning_rate": 9.650477483057346e-06, + "loss": 1.7446, + "step": 26226 + }, + { + "epoch": 8.050030693677103, + "grad_norm": 0.1651264876127243, + "learning_rate": 9.647542246308506e-06, + "loss": 1.7604, + "step": 26227 + }, + { + "epoch": 8.050337630448128, + "grad_norm": 0.1673632264137268, + "learning_rate": 9.644607408346296e-06, + "loss": 1.678, + "step": 26228 + }, + { + "epoch": 8.050644567219154, + "grad_norm": 0.20457343757152557, + "learning_rate": 9.641672969199738e-06, + "loss": 1.6963, + "step": 26229 + }, + { + "epoch": 8.050951503990179, + "grad_norm": 0.15247805416584015, + "learning_rate": 9.638738928897816e-06, + "loss": 1.7036, + "step": 26230 + }, + { + "epoch": 8.051258440761202, + "grad_norm": 0.21655996143817902, + "learning_rate": 9.635805287469535e-06, + "loss": 1.7422, + "step": 26231 + }, + { + "epoch": 8.051565377532228, + "grad_norm": 0.1631101369857788, + "learning_rate": 9.632872044943869e-06, + "loss": 1.6681, + "step": 26232 + }, + { + "epoch": 8.051872314303253, + "grad_norm": 0.18587349355220795, + "learning_rate": 9.629939201349853e-06, + "loss": 1.7036, + "step": 26233 + }, + { + "epoch": 8.052179251074278, + "grad_norm": 0.272533655166626, + "learning_rate": 9.627006756716405e-06, + "loss": 1.818, + "step": 26234 + }, + { + "epoch": 8.052486187845304, + 
"grad_norm": 0.1740235984325409, + "learning_rate": 9.624074711072572e-06, + "loss": 1.7074, + "step": 26235 + }, + { + "epoch": 8.05279312461633, + "grad_norm": 0.21405693888664246, + "learning_rate": 9.621143064447274e-06, + "loss": 1.7473, + "step": 26236 + }, + { + "epoch": 8.053100061387354, + "grad_norm": 0.172579824924469, + "learning_rate": 9.618211816869515e-06, + "loss": 1.7154, + "step": 26237 + }, + { + "epoch": 8.05340699815838, + "grad_norm": 0.19767756760120392, + "learning_rate": 9.615280968368257e-06, + "loss": 1.7011, + "step": 26238 + }, + { + "epoch": 8.053713934929405, + "grad_norm": 0.18467654287815094, + "learning_rate": 9.612350518972463e-06, + "loss": 1.6922, + "step": 26239 + }, + { + "epoch": 8.05402087170043, + "grad_norm": 0.1530679613351822, + "learning_rate": 9.609420468711088e-06, + "loss": 1.6633, + "step": 26240 + }, + { + "epoch": 8.054327808471454, + "grad_norm": 0.3850557804107666, + "learning_rate": 9.6064908176131e-06, + "loss": 1.7637, + "step": 26241 + }, + { + "epoch": 8.05463474524248, + "grad_norm": 0.1556573212146759, + "learning_rate": 9.603561565707441e-06, + "loss": 1.6853, + "step": 26242 + }, + { + "epoch": 8.054941682013505, + "grad_norm": 0.2009180188179016, + "learning_rate": 9.600632713023067e-06, + "loss": 1.7172, + "step": 26243 + }, + { + "epoch": 8.05524861878453, + "grad_norm": 0.18538115918636322, + "learning_rate": 9.597704259588919e-06, + "loss": 1.7517, + "step": 26244 + }, + { + "epoch": 8.055555555555555, + "grad_norm": 0.1626463681459427, + "learning_rate": 9.594776205433936e-06, + "loss": 1.697, + "step": 26245 + }, + { + "epoch": 8.05586249232658, + "grad_norm": 0.15908029675483704, + "learning_rate": 9.591848550587062e-06, + "loss": 1.7355, + "step": 26246 + }, + { + "epoch": 8.056169429097606, + "grad_norm": 0.1679108589887619, + "learning_rate": 9.588921295077219e-06, + "loss": 1.6732, + "step": 26247 + }, + { + "epoch": 8.056476365868631, + "grad_norm": 0.17123237252235413, + "learning_rate": 
9.585994438933344e-06, + "loss": 1.7627, + "step": 26248 + }, + { + "epoch": 8.056783302639657, + "grad_norm": 0.2438436597585678, + "learning_rate": 9.583067982184346e-06, + "loss": 1.7475, + "step": 26249 + }, + { + "epoch": 8.057090239410682, + "grad_norm": 0.18769577145576477, + "learning_rate": 9.580141924859182e-06, + "loss": 1.7165, + "step": 26250 + }, + { + "epoch": 8.057397176181707, + "grad_norm": 0.18146662414073944, + "learning_rate": 9.577216266986727e-06, + "loss": 1.7601, + "step": 26251 + }, + { + "epoch": 8.057704112952731, + "grad_norm": 0.20209676027297974, + "learning_rate": 9.574291008595932e-06, + "loss": 1.7635, + "step": 26252 + }, + { + "epoch": 8.058011049723756, + "grad_norm": 0.16949260234832764, + "learning_rate": 9.571366149715665e-06, + "loss": 1.7437, + "step": 26253 + }, + { + "epoch": 8.058317986494782, + "grad_norm": 0.14449356496334076, + "learning_rate": 9.568441690374868e-06, + "loss": 1.6906, + "step": 26254 + }, + { + "epoch": 8.058624923265807, + "grad_norm": 0.21796976029872894, + "learning_rate": 9.565517630602428e-06, + "loss": 1.7986, + "step": 26255 + }, + { + "epoch": 8.058931860036832, + "grad_norm": 0.15194009244441986, + "learning_rate": 9.562593970427241e-06, + "loss": 1.6838, + "step": 26256 + }, + { + "epoch": 8.059238796807858, + "grad_norm": 0.19820080697536469, + "learning_rate": 9.559670709878198e-06, + "loss": 1.7327, + "step": 26257 + }, + { + "epoch": 8.059545733578883, + "grad_norm": 0.1478637307882309, + "learning_rate": 9.5567478489842e-06, + "loss": 1.6814, + "step": 26258 + }, + { + "epoch": 8.059852670349908, + "grad_norm": 0.147980734705925, + "learning_rate": 9.553825387774118e-06, + "loss": 1.693, + "step": 26259 + }, + { + "epoch": 8.060159607120934, + "grad_norm": 0.16274768114089966, + "learning_rate": 9.550903326276839e-06, + "loss": 1.7275, + "step": 26260 + }, + { + "epoch": 8.060466543891959, + "grad_norm": 0.16221144795417786, + "learning_rate": 9.547981664521244e-06, + "loss": 1.7071, + 
"step": 26261 + }, + { + "epoch": 8.060773480662984, + "grad_norm": 0.18921487033367157, + "learning_rate": 9.545060402536204e-06, + "loss": 1.6771, + "step": 26262 + }, + { + "epoch": 8.061080417434008, + "grad_norm": 0.19136327505111694, + "learning_rate": 9.542139540350586e-06, + "loss": 1.7235, + "step": 26263 + }, + { + "epoch": 8.061387354205033, + "grad_norm": 0.18764656782150269, + "learning_rate": 9.539219077993261e-06, + "loss": 1.7374, + "step": 26264 + }, + { + "epoch": 8.061694290976058, + "grad_norm": 0.16516967117786407, + "learning_rate": 9.53629901549309e-06, + "loss": 1.7124, + "step": 26265 + }, + { + "epoch": 8.062001227747084, + "grad_norm": 0.1457880437374115, + "learning_rate": 9.533379352878907e-06, + "loss": 1.6471, + "step": 26266 + }, + { + "epoch": 8.06230816451811, + "grad_norm": 0.1898411363363266, + "learning_rate": 9.530460090179622e-06, + "loss": 1.7745, + "step": 26267 + }, + { + "epoch": 8.062615101289135, + "grad_norm": 0.18252579867839813, + "learning_rate": 9.52754122742402e-06, + "loss": 1.7165, + "step": 26268 + }, + { + "epoch": 8.06292203806016, + "grad_norm": 0.1838676929473877, + "learning_rate": 9.524622764641006e-06, + "loss": 1.7169, + "step": 26269 + }, + { + "epoch": 8.063228974831185, + "grad_norm": 0.1684531718492508, + "learning_rate": 9.521704701859362e-06, + "loss": 1.6831, + "step": 26270 + }, + { + "epoch": 8.06353591160221, + "grad_norm": 0.18296435475349426, + "learning_rate": 9.51878703910798e-06, + "loss": 1.6952, + "step": 26271 + }, + { + "epoch": 8.063842848373236, + "grad_norm": 0.20634715259075165, + "learning_rate": 9.515869776415665e-06, + "loss": 1.6899, + "step": 26272 + }, + { + "epoch": 8.06414978514426, + "grad_norm": 0.18681001663208008, + "learning_rate": 9.512952913811252e-06, + "loss": 1.6648, + "step": 26273 + }, + { + "epoch": 8.064456721915285, + "grad_norm": 0.19397646188735962, + "learning_rate": 9.510036451323568e-06, + "loss": 1.7309, + "step": 26274 + }, + { + "epoch": 
8.06476365868631, + "grad_norm": 0.17254865169525146, + "learning_rate": 9.507120388981438e-06, + "loss": 1.6671, + "step": 26275 + }, + { + "epoch": 8.065070595457335, + "grad_norm": 0.16224531829357147, + "learning_rate": 9.504204726813682e-06, + "loss": 1.6881, + "step": 26276 + }, + { + "epoch": 8.06537753222836, + "grad_norm": 0.16534289717674255, + "learning_rate": 9.501289464849106e-06, + "loss": 1.7372, + "step": 26277 + }, + { + "epoch": 8.065684468999386, + "grad_norm": 0.20247776806354523, + "learning_rate": 9.498374603116523e-06, + "loss": 1.7108, + "step": 26278 + }, + { + "epoch": 8.065991405770411, + "grad_norm": 0.1420232504606247, + "learning_rate": 9.49546014164474e-06, + "loss": 1.6403, + "step": 26279 + }, + { + "epoch": 8.066298342541437, + "grad_norm": 0.139396533370018, + "learning_rate": 9.492546080462567e-06, + "loss": 1.6578, + "step": 26280 + }, + { + "epoch": 8.066605279312462, + "grad_norm": 0.17437872290611267, + "learning_rate": 9.489632419598788e-06, + "loss": 1.7094, + "step": 26281 + }, + { + "epoch": 8.066912216083487, + "grad_norm": 0.29614368081092834, + "learning_rate": 9.486719159082209e-06, + "loss": 1.773, + "step": 26282 + }, + { + "epoch": 8.067219152854513, + "grad_norm": 0.20771834254264832, + "learning_rate": 9.483806298941617e-06, + "loss": 1.7421, + "step": 26283 + }, + { + "epoch": 8.067526089625536, + "grad_norm": 0.20772570371627808, + "learning_rate": 9.4808938392058e-06, + "loss": 1.7437, + "step": 26284 + }, + { + "epoch": 8.067833026396562, + "grad_norm": 0.1837359070777893, + "learning_rate": 9.477981779903522e-06, + "loss": 1.7142, + "step": 26285 + }, + { + "epoch": 8.068139963167587, + "grad_norm": 0.18425285816192627, + "learning_rate": 9.475070121063607e-06, + "loss": 1.6804, + "step": 26286 + }, + { + "epoch": 8.068446899938612, + "grad_norm": 0.16501453518867493, + "learning_rate": 9.472158862714775e-06, + "loss": 1.7466, + "step": 26287 + }, + { + "epoch": 8.068753836709638, + "grad_norm": 
0.17685455083847046, + "learning_rate": 9.469248004885839e-06, + "loss": 1.6839, + "step": 26288 + }, + { + "epoch": 8.069060773480663, + "grad_norm": 0.18923965096473694, + "learning_rate": 9.466337547605547e-06, + "loss": 1.6774, + "step": 26289 + }, + { + "epoch": 8.069367710251688, + "grad_norm": 0.17584268748760223, + "learning_rate": 9.463427490902665e-06, + "loss": 1.6904, + "step": 26290 + }, + { + "epoch": 8.069674647022714, + "grad_norm": 0.25477278232574463, + "learning_rate": 9.460517834805966e-06, + "loss": 1.7898, + "step": 26291 + }, + { + "epoch": 8.069981583793739, + "grad_norm": 0.23453976213932037, + "learning_rate": 9.457608579344169e-06, + "loss": 1.7456, + "step": 26292 + }, + { + "epoch": 8.070288520564764, + "grad_norm": 0.20332537591457367, + "learning_rate": 9.45469972454605e-06, + "loss": 1.76, + "step": 26293 + }, + { + "epoch": 8.07059545733579, + "grad_norm": 0.1937316656112671, + "learning_rate": 9.451791270440358e-06, + "loss": 1.698, + "step": 26294 + }, + { + "epoch": 8.070902394106813, + "grad_norm": 0.19909465312957764, + "learning_rate": 9.448883217055832e-06, + "loss": 1.7373, + "step": 26295 + }, + { + "epoch": 8.071209330877839, + "grad_norm": 0.16824916005134583, + "learning_rate": 9.445975564421206e-06, + "loss": 1.6619, + "step": 26296 + }, + { + "epoch": 8.071516267648864, + "grad_norm": 0.17873473465442657, + "learning_rate": 9.443068312565222e-06, + "loss": 1.7438, + "step": 26297 + }, + { + "epoch": 8.07182320441989, + "grad_norm": 0.152094304561615, + "learning_rate": 9.440161461516606e-06, + "loss": 1.6513, + "step": 26298 + }, + { + "epoch": 8.072130141190915, + "grad_norm": 0.14592084288597107, + "learning_rate": 9.43725501130409e-06, + "loss": 1.6503, + "step": 26299 + }, + { + "epoch": 8.07243707796194, + "grad_norm": 0.16904598474502563, + "learning_rate": 9.434348961956396e-06, + "loss": 1.6929, + "step": 26300 + }, + { + "epoch": 8.072744014732965, + "grad_norm": 0.15297052264213562, + "learning_rate": 
9.431443313502235e-06, + "loss": 1.6871, + "step": 26301 + }, + { + "epoch": 8.07305095150399, + "grad_norm": 0.20306609570980072, + "learning_rate": 9.428538065970321e-06, + "loss": 1.7779, + "step": 26302 + }, + { + "epoch": 8.073357888275016, + "grad_norm": 0.177826926112175, + "learning_rate": 9.425633219389401e-06, + "loss": 1.7021, + "step": 26303 + }, + { + "epoch": 8.073664825046041, + "grad_norm": 0.22192324697971344, + "learning_rate": 9.422728773788125e-06, + "loss": 1.7713, + "step": 26304 + }, + { + "epoch": 8.073971761817067, + "grad_norm": 0.16998204588890076, + "learning_rate": 9.419824729195253e-06, + "loss": 1.6994, + "step": 26305 + }, + { + "epoch": 8.07427869858809, + "grad_norm": 0.1606592983007431, + "learning_rate": 9.416921085639436e-06, + "loss": 1.7274, + "step": 26306 + }, + { + "epoch": 8.074585635359115, + "grad_norm": 0.17434780299663544, + "learning_rate": 9.414017843149398e-06, + "loss": 1.714, + "step": 26307 + }, + { + "epoch": 8.07489257213014, + "grad_norm": 0.16548825800418854, + "learning_rate": 9.411115001753839e-06, + "loss": 1.7361, + "step": 26308 + }, + { + "epoch": 8.075199508901166, + "grad_norm": 0.23958922922611237, + "learning_rate": 9.408212561481405e-06, + "loss": 1.7286, + "step": 26309 + }, + { + "epoch": 8.075506445672191, + "grad_norm": 0.1900513619184494, + "learning_rate": 9.405310522360821e-06, + "loss": 1.7309, + "step": 26310 + }, + { + "epoch": 8.075813382443217, + "grad_norm": 0.1576761156320572, + "learning_rate": 9.402408884420755e-06, + "loss": 1.7039, + "step": 26311 + }, + { + "epoch": 8.076120319214242, + "grad_norm": 0.17078427970409393, + "learning_rate": 9.399507647689875e-06, + "loss": 1.737, + "step": 26312 + }, + { + "epoch": 8.076427255985267, + "grad_norm": 0.138477623462677, + "learning_rate": 9.396606812196856e-06, + "loss": 1.6673, + "step": 26313 + }, + { + "epoch": 8.076734192756293, + "grad_norm": 0.1546505093574524, + "learning_rate": 9.393706377970368e-06, + "loss": 1.7146, + 
"step": 26314 + }, + { + "epoch": 8.077041129527318, + "grad_norm": 0.14440344274044037, + "learning_rate": 9.390806345039077e-06, + "loss": 1.7044, + "step": 26315 + }, + { + "epoch": 8.077348066298342, + "grad_norm": 0.1944594532251358, + "learning_rate": 9.387906713431632e-06, + "loss": 1.7685, + "step": 26316 + }, + { + "epoch": 8.077655003069367, + "grad_norm": 0.17758207023143768, + "learning_rate": 9.385007483176706e-06, + "loss": 1.7068, + "step": 26317 + }, + { + "epoch": 8.077961939840392, + "grad_norm": 0.20713698863983154, + "learning_rate": 9.382108654302934e-06, + "loss": 1.6488, + "step": 26318 + }, + { + "epoch": 8.078268876611418, + "grad_norm": 0.14699894189834595, + "learning_rate": 9.379210226838958e-06, + "loss": 1.6746, + "step": 26319 + }, + { + "epoch": 8.078575813382443, + "grad_norm": 0.15119978785514832, + "learning_rate": 9.376312200813465e-06, + "loss": 1.6919, + "step": 26320 + }, + { + "epoch": 8.078882750153468, + "grad_norm": 0.14071249961853027, + "learning_rate": 9.373414576255041e-06, + "loss": 1.6755, + "step": 26321 + }, + { + "epoch": 8.079189686924494, + "grad_norm": 0.22004422545433044, + "learning_rate": 9.370517353192365e-06, + "loss": 1.7808, + "step": 26322 + }, + { + "epoch": 8.079496623695519, + "grad_norm": 0.15764497220516205, + "learning_rate": 9.36762053165403e-06, + "loss": 1.7108, + "step": 26323 + }, + { + "epoch": 8.079803560466544, + "grad_norm": 0.17802847921848297, + "learning_rate": 9.364724111668693e-06, + "loss": 1.7274, + "step": 26324 + }, + { + "epoch": 8.08011049723757, + "grad_norm": 0.16950444877147675, + "learning_rate": 9.361828093264984e-06, + "loss": 1.7196, + "step": 26325 + }, + { + "epoch": 8.080417434008595, + "grad_norm": 0.16647809743881226, + "learning_rate": 9.358932476471488e-06, + "loss": 1.7027, + "step": 26326 + }, + { + "epoch": 8.080724370779619, + "grad_norm": 0.20012708008289337, + "learning_rate": 9.356037261316863e-06, + "loss": 1.7101, + "step": 26327 + }, + { + "epoch": 
8.081031307550644, + "grad_norm": 0.19795066118240356, + "learning_rate": 9.353142447829672e-06, + "loss": 1.7142, + "step": 26328 + }, + { + "epoch": 8.08133824432167, + "grad_norm": 0.1786295473575592, + "learning_rate": 9.350248036038567e-06, + "loss": 1.6646, + "step": 26329 + }, + { + "epoch": 8.081645181092695, + "grad_norm": 0.17646436393260956, + "learning_rate": 9.347354025972138e-06, + "loss": 1.7044, + "step": 26330 + }, + { + "epoch": 8.08195211786372, + "grad_norm": 0.24095231294631958, + "learning_rate": 9.344460417658979e-06, + "loss": 1.823, + "step": 26331 + }, + { + "epoch": 8.082259054634745, + "grad_norm": 0.16094247996807098, + "learning_rate": 9.341567211127694e-06, + "loss": 1.6933, + "step": 26332 + }, + { + "epoch": 8.08256599140577, + "grad_norm": 0.22386589646339417, + "learning_rate": 9.338674406406872e-06, + "loss": 1.7219, + "step": 26333 + }, + { + "epoch": 8.082872928176796, + "grad_norm": 0.2110683023929596, + "learning_rate": 9.3357820035251e-06, + "loss": 1.6951, + "step": 26334 + }, + { + "epoch": 8.083179864947821, + "grad_norm": 0.2240242063999176, + "learning_rate": 9.33289000251097e-06, + "loss": 1.756, + "step": 26335 + }, + { + "epoch": 8.083486801718847, + "grad_norm": 0.19035838544368744, + "learning_rate": 9.329998403393036e-06, + "loss": 1.7657, + "step": 26336 + }, + { + "epoch": 8.083793738489872, + "grad_norm": 0.20213502645492554, + "learning_rate": 9.327107206199925e-06, + "loss": 1.6938, + "step": 26337 + }, + { + "epoch": 8.084100675260895, + "grad_norm": 0.20297139883041382, + "learning_rate": 9.324216410960157e-06, + "loss": 1.7476, + "step": 26338 + }, + { + "epoch": 8.08440761203192, + "grad_norm": 0.23968154191970825, + "learning_rate": 9.321326017702348e-06, + "loss": 1.7418, + "step": 26339 + }, + { + "epoch": 8.084714548802946, + "grad_norm": 0.19853347539901733, + "learning_rate": 9.318436026455008e-06, + "loss": 1.6943, + "step": 26340 + }, + { + "epoch": 8.085021485573971, + "grad_norm": 
0.1835598647594452, + "learning_rate": 9.315546437246742e-06, + "loss": 1.7071, + "step": 26341 + }, + { + "epoch": 8.085328422344997, + "grad_norm": 0.22876964509487152, + "learning_rate": 9.312657250106106e-06, + "loss": 1.7717, + "step": 26342 + }, + { + "epoch": 8.085635359116022, + "grad_norm": 0.1632407158613205, + "learning_rate": 9.309768465061613e-06, + "loss": 1.6506, + "step": 26343 + }, + { + "epoch": 8.085942295887047, + "grad_norm": 0.1812858134508133, + "learning_rate": 9.306880082141861e-06, + "loss": 1.6826, + "step": 26344 + }, + { + "epoch": 8.086249232658073, + "grad_norm": 0.24607063829898834, + "learning_rate": 9.303992101375347e-06, + "loss": 1.7109, + "step": 26345 + }, + { + "epoch": 8.086556169429098, + "grad_norm": 0.1401972472667694, + "learning_rate": 9.301104522790648e-06, + "loss": 1.6612, + "step": 26346 + }, + { + "epoch": 8.086863106200123, + "grad_norm": 0.22876517474651337, + "learning_rate": 9.298217346416287e-06, + "loss": 1.6857, + "step": 26347 + }, + { + "epoch": 8.087170042971149, + "grad_norm": 0.22353915870189667, + "learning_rate": 9.295330572280803e-06, + "loss": 1.7071, + "step": 26348 + }, + { + "epoch": 8.087476979742172, + "grad_norm": 0.22349561750888824, + "learning_rate": 9.292444200412715e-06, + "loss": 1.7098, + "step": 26349 + }, + { + "epoch": 8.087783916513198, + "grad_norm": 0.17078392207622528, + "learning_rate": 9.289558230840556e-06, + "loss": 1.6732, + "step": 26350 + }, + { + "epoch": 8.088090853284223, + "grad_norm": 0.19569413363933563, + "learning_rate": 9.286672663592843e-06, + "loss": 1.7489, + "step": 26351 + }, + { + "epoch": 8.088397790055248, + "grad_norm": 0.1565880924463272, + "learning_rate": 9.283787498698093e-06, + "loss": 1.6984, + "step": 26352 + }, + { + "epoch": 8.088704726826274, + "grad_norm": 0.21362969279289246, + "learning_rate": 9.28090273618481e-06, + "loss": 1.7157, + "step": 26353 + }, + { + "epoch": 8.089011663597299, + "grad_norm": 0.15077799558639526, + "learning_rate": 
9.278018376081532e-06, + "loss": 1.707, + "step": 26354 + }, + { + "epoch": 8.089318600368324, + "grad_norm": 0.19006888568401337, + "learning_rate": 9.27513441841672e-06, + "loss": 1.7379, + "step": 26355 + }, + { + "epoch": 8.08962553713935, + "grad_norm": 0.17935799062252045, + "learning_rate": 9.272250863218928e-06, + "loss": 1.7529, + "step": 26356 + }, + { + "epoch": 8.089932473910375, + "grad_norm": 0.1539749801158905, + "learning_rate": 9.269367710516596e-06, + "loss": 1.6717, + "step": 26357 + }, + { + "epoch": 8.0902394106814, + "grad_norm": 0.20954270660877228, + "learning_rate": 9.266484960338262e-06, + "loss": 1.7511, + "step": 26358 + }, + { + "epoch": 8.090546347452424, + "grad_norm": 0.1744573712348938, + "learning_rate": 9.263602612712408e-06, + "loss": 1.747, + "step": 26359 + }, + { + "epoch": 8.09085328422345, + "grad_norm": 0.198909193277359, + "learning_rate": 9.260720667667482e-06, + "loss": 1.6854, + "step": 26360 + }, + { + "epoch": 8.091160220994475, + "grad_norm": 0.16504423320293427, + "learning_rate": 9.25783912523202e-06, + "loss": 1.7346, + "step": 26361 + }, + { + "epoch": 8.0914671577655, + "grad_norm": 0.16309323906898499, + "learning_rate": 9.254957985434449e-06, + "loss": 1.695, + "step": 26362 + }, + { + "epoch": 8.091774094536525, + "grad_norm": 0.178558811545372, + "learning_rate": 9.25207724830327e-06, + "loss": 1.7091, + "step": 26363 + }, + { + "epoch": 8.09208103130755, + "grad_norm": 0.1758749783039093, + "learning_rate": 9.249196913866954e-06, + "loss": 1.732, + "step": 26364 + }, + { + "epoch": 8.092387968078576, + "grad_norm": 0.16251471638679504, + "learning_rate": 9.246316982153957e-06, + "loss": 1.6783, + "step": 26365 + }, + { + "epoch": 8.092694904849601, + "grad_norm": 0.1818319857120514, + "learning_rate": 9.243437453192739e-06, + "loss": 1.7208, + "step": 26366 + }, + { + "epoch": 8.093001841620627, + "grad_norm": 0.2009693682193756, + "learning_rate": 9.240558327011761e-06, + "loss": 1.7345, + "step": 26367 + 
}, + { + "epoch": 8.093308778391652, + "grad_norm": 0.19003108143806458, + "learning_rate": 9.237679603639477e-06, + "loss": 1.7141, + "step": 26368 + }, + { + "epoch": 8.093615715162677, + "grad_norm": 0.19530169665813446, + "learning_rate": 9.234801283104338e-06, + "loss": 1.6945, + "step": 26369 + }, + { + "epoch": 8.0939226519337, + "grad_norm": 0.14184506237506866, + "learning_rate": 9.231923365434769e-06, + "loss": 1.6484, + "step": 26370 + }, + { + "epoch": 8.094229588704726, + "grad_norm": 0.14682452380657196, + "learning_rate": 9.229045850659252e-06, + "loss": 1.6534, + "step": 26371 + }, + { + "epoch": 8.094536525475752, + "grad_norm": 0.21143727004528046, + "learning_rate": 9.22616873880618e-06, + "loss": 1.7439, + "step": 26372 + }, + { + "epoch": 8.094843462246777, + "grad_norm": 0.1664114147424698, + "learning_rate": 9.223292029904029e-06, + "loss": 1.7568, + "step": 26373 + }, + { + "epoch": 8.095150399017802, + "grad_norm": 0.17671625316143036, + "learning_rate": 9.22041572398118e-06, + "loss": 1.6594, + "step": 26374 + }, + { + "epoch": 8.095457335788828, + "grad_norm": 0.1968437135219574, + "learning_rate": 9.217539821066101e-06, + "loss": 1.734, + "step": 26375 + }, + { + "epoch": 8.095764272559853, + "grad_norm": 0.18740740418434143, + "learning_rate": 9.214664321187206e-06, + "loss": 1.7223, + "step": 26376 + }, + { + "epoch": 8.096071209330878, + "grad_norm": 0.16954728960990906, + "learning_rate": 9.21178922437288e-06, + "loss": 1.7282, + "step": 26377 + }, + { + "epoch": 8.096378146101904, + "grad_norm": 0.1979333609342575, + "learning_rate": 9.20891453065158e-06, + "loss": 1.7254, + "step": 26378 + }, + { + "epoch": 8.096685082872929, + "grad_norm": 0.1495361626148224, + "learning_rate": 9.206040240051677e-06, + "loss": 1.6936, + "step": 26379 + }, + { + "epoch": 8.096992019643954, + "grad_norm": 0.159287691116333, + "learning_rate": 9.203166352601605e-06, + "loss": 1.6658, + "step": 26380 + }, + { + "epoch": 8.097298956414978, + 
"grad_norm": 0.175196573138237, + "learning_rate": 9.200292868329751e-06, + "loss": 1.7779, + "step": 26381 + }, + { + "epoch": 8.097605893186003, + "grad_norm": 0.17131435871124268, + "learning_rate": 9.197419787264522e-06, + "loss": 1.7435, + "step": 26382 + }, + { + "epoch": 8.097912829957028, + "grad_norm": 0.14529173076152802, + "learning_rate": 9.194547109434299e-06, + "loss": 1.7083, + "step": 26383 + }, + { + "epoch": 8.098219766728054, + "grad_norm": 0.1824452430009842, + "learning_rate": 9.191674834867482e-06, + "loss": 1.7134, + "step": 26384 + }, + { + "epoch": 8.098526703499079, + "grad_norm": 0.18507611751556396, + "learning_rate": 9.188802963592453e-06, + "loss": 1.673, + "step": 26385 + }, + { + "epoch": 8.098833640270104, + "grad_norm": 0.19102542102336884, + "learning_rate": 9.185931495637595e-06, + "loss": 1.7058, + "step": 26386 + }, + { + "epoch": 8.09914057704113, + "grad_norm": 0.17001433670520782, + "learning_rate": 9.183060431031271e-06, + "loss": 1.6827, + "step": 26387 + }, + { + "epoch": 8.099447513812155, + "grad_norm": 0.1718425452709198, + "learning_rate": 9.18018976980189e-06, + "loss": 1.7375, + "step": 26388 + }, + { + "epoch": 8.09975445058318, + "grad_norm": 0.15681782364845276, + "learning_rate": 9.177319511977772e-06, + "loss": 1.6989, + "step": 26389 + }, + { + "epoch": 8.100061387354206, + "grad_norm": 0.156332865357399, + "learning_rate": 9.174449657587341e-06, + "loss": 1.7229, + "step": 26390 + }, + { + "epoch": 8.10036832412523, + "grad_norm": 0.2014407366514206, + "learning_rate": 9.171580206658898e-06, + "loss": 1.7589, + "step": 26391 + }, + { + "epoch": 8.100675260896255, + "grad_norm": 0.16946980357170105, + "learning_rate": 9.168711159220845e-06, + "loss": 1.7053, + "step": 26392 + }, + { + "epoch": 8.10098219766728, + "grad_norm": 0.1604216992855072, + "learning_rate": 9.165842515301526e-06, + "loss": 1.7338, + "step": 26393 + }, + { + "epoch": 8.101289134438305, + "grad_norm": 0.19191038608551025, + 
"learning_rate": 9.162974274929265e-06, + "loss": 1.721, + "step": 26394 + }, + { + "epoch": 8.10159607120933, + "grad_norm": 0.17082683742046356, + "learning_rate": 9.160106438132454e-06, + "loss": 1.707, + "step": 26395 + }, + { + "epoch": 8.101903007980356, + "grad_norm": 0.15988127887248993, + "learning_rate": 9.157239004939377e-06, + "loss": 1.6787, + "step": 26396 + }, + { + "epoch": 8.102209944751381, + "grad_norm": 0.21586796641349792, + "learning_rate": 9.154371975378423e-06, + "loss": 1.7105, + "step": 26397 + }, + { + "epoch": 8.102516881522407, + "grad_norm": 0.17289277911186218, + "learning_rate": 9.151505349477902e-06, + "loss": 1.7165, + "step": 26398 + }, + { + "epoch": 8.102823818293432, + "grad_norm": 0.16819556057453156, + "learning_rate": 9.148639127266145e-06, + "loss": 1.6965, + "step": 26399 + }, + { + "epoch": 8.103130755064457, + "grad_norm": 0.2234455943107605, + "learning_rate": 9.145773308771483e-06, + "loss": 1.8059, + "step": 26400 + }, + { + "epoch": 8.103437691835483, + "grad_norm": 0.15835164487361908, + "learning_rate": 9.142907894022235e-06, + "loss": 1.6851, + "step": 26401 + }, + { + "epoch": 8.103744628606506, + "grad_norm": 0.18604053556919098, + "learning_rate": 9.140042883046718e-06, + "loss": 1.7105, + "step": 26402 + }, + { + "epoch": 8.104051565377532, + "grad_norm": 0.1927308589220047, + "learning_rate": 9.137178275873243e-06, + "loss": 1.7236, + "step": 26403 + }, + { + "epoch": 8.104358502148557, + "grad_norm": 0.16214077174663544, + "learning_rate": 9.134314072530115e-06, + "loss": 1.7394, + "step": 26404 + }, + { + "epoch": 8.104665438919582, + "grad_norm": 0.2051863819360733, + "learning_rate": 9.131450273045667e-06, + "loss": 1.701, + "step": 26405 + }, + { + "epoch": 8.104972375690608, + "grad_norm": 0.1917528212070465, + "learning_rate": 9.128586877448158e-06, + "loss": 1.6984, + "step": 26406 + }, + { + "epoch": 8.105279312461633, + "grad_norm": 0.19591490924358368, + "learning_rate": 9.125723885765935e-06, + 
"loss": 1.7678, + "step": 26407 + }, + { + "epoch": 8.105586249232658, + "grad_norm": 0.22388321161270142, + "learning_rate": 9.122861298027242e-06, + "loss": 1.7398, + "step": 26408 + }, + { + "epoch": 8.105893186003684, + "grad_norm": 0.13983963429927826, + "learning_rate": 9.119999114260402e-06, + "loss": 1.6868, + "step": 26409 + }, + { + "epoch": 8.106200122774709, + "grad_norm": 0.16611455380916595, + "learning_rate": 9.117137334493708e-06, + "loss": 1.7029, + "step": 26410 + }, + { + "epoch": 8.106507059545734, + "grad_norm": 0.22045908868312836, + "learning_rate": 9.114275958755397e-06, + "loss": 1.7598, + "step": 26411 + }, + { + "epoch": 8.10681399631676, + "grad_norm": 0.1717766672372818, + "learning_rate": 9.111414987073801e-06, + "loss": 1.7197, + "step": 26412 + }, + { + "epoch": 8.107120933087783, + "grad_norm": 0.1627349704504013, + "learning_rate": 9.108554419477138e-06, + "loss": 1.6514, + "step": 26413 + }, + { + "epoch": 8.107427869858808, + "grad_norm": 0.16213741898536682, + "learning_rate": 9.105694255993725e-06, + "loss": 1.6873, + "step": 26414 + }, + { + "epoch": 8.107734806629834, + "grad_norm": 0.15004312992095947, + "learning_rate": 9.102834496651812e-06, + "loss": 1.7057, + "step": 26415 + }, + { + "epoch": 8.10804174340086, + "grad_norm": 0.16030706465244293, + "learning_rate": 9.099975141479655e-06, + "loss": 1.7006, + "step": 26416 + }, + { + "epoch": 8.108348680171884, + "grad_norm": 0.18823765218257904, + "learning_rate": 9.097116190505516e-06, + "loss": 1.6734, + "step": 26417 + }, + { + "epoch": 8.10865561694291, + "grad_norm": 0.19617006182670593, + "learning_rate": 9.094257643757653e-06, + "loss": 1.7135, + "step": 26418 + }, + { + "epoch": 8.108962553713935, + "grad_norm": 0.2009502351284027, + "learning_rate": 9.091399501264308e-06, + "loss": 1.7573, + "step": 26419 + }, + { + "epoch": 8.10926949048496, + "grad_norm": 0.1545785665512085, + "learning_rate": 9.088541763053732e-06, + "loss": 1.7154, + "step": 26420 + }, + { + 
"epoch": 8.109576427255986, + "grad_norm": 0.19506138563156128, + "learning_rate": 9.085684429154152e-06, + "loss": 1.7116, + "step": 26421 + }, + { + "epoch": 8.109883364027011, + "grad_norm": 0.15998101234436035, + "learning_rate": 9.082827499593843e-06, + "loss": 1.7107, + "step": 26422 + }, + { + "epoch": 8.110190300798035, + "grad_norm": 0.16210505366325378, + "learning_rate": 9.079970974400992e-06, + "loss": 1.6625, + "step": 26423 + }, + { + "epoch": 8.11049723756906, + "grad_norm": 0.14739912748336792, + "learning_rate": 9.077114853603875e-06, + "loss": 1.6993, + "step": 26424 + }, + { + "epoch": 8.110804174340085, + "grad_norm": 0.16882890462875366, + "learning_rate": 9.074259137230667e-06, + "loss": 1.7666, + "step": 26425 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 0.1667594611644745, + "learning_rate": 9.071403825309633e-06, + "loss": 1.6876, + "step": 26426 + }, + { + "epoch": 8.111418047882136, + "grad_norm": 0.14678725600242615, + "learning_rate": 9.06854891786899e-06, + "loss": 1.6458, + "step": 26427 + }, + { + "epoch": 8.111724984653161, + "grad_norm": 0.15207096934318542, + "learning_rate": 9.06569441493691e-06, + "loss": 1.6551, + "step": 26428 + }, + { + "epoch": 8.112031921424187, + "grad_norm": 0.2019769251346588, + "learning_rate": 9.062840316541654e-06, + "loss": 1.7812, + "step": 26429 + }, + { + "epoch": 8.112338858195212, + "grad_norm": 0.12371024489402771, + "learning_rate": 9.05998662271138e-06, + "loss": 1.6389, + "step": 26430 + }, + { + "epoch": 8.112645794966237, + "grad_norm": 0.21813201904296875, + "learning_rate": 9.057133333474332e-06, + "loss": 1.6922, + "step": 26431 + }, + { + "epoch": 8.112952731737263, + "grad_norm": 0.15330322086811066, + "learning_rate": 9.054280448858682e-06, + "loss": 1.6975, + "step": 26432 + }, + { + "epoch": 8.113259668508288, + "grad_norm": 0.17849069833755493, + "learning_rate": 9.051427968892635e-06, + "loss": 1.7239, + "step": 26433 + }, + { + "epoch": 8.113566605279312, + "grad_norm": 
0.13501322269439697, + "learning_rate": 9.048575893604377e-06, + "loss": 1.66, + "step": 26434 + }, + { + "epoch": 8.113873542050337, + "grad_norm": 0.1584496796131134, + "learning_rate": 9.045724223022096e-06, + "loss": 1.6864, + "step": 26435 + }, + { + "epoch": 8.114180478821362, + "grad_norm": 0.1788417398929596, + "learning_rate": 9.04287295717397e-06, + "loss": 1.7785, + "step": 26436 + }, + { + "epoch": 8.114487415592388, + "grad_norm": 0.16028213500976562, + "learning_rate": 9.04002209608818e-06, + "loss": 1.6908, + "step": 26437 + }, + { + "epoch": 8.114794352363413, + "grad_norm": 0.19472184777259827, + "learning_rate": 9.037171639792895e-06, + "loss": 1.7963, + "step": 26438 + }, + { + "epoch": 8.115101289134438, + "grad_norm": 0.155779629945755, + "learning_rate": 9.034321588316297e-06, + "loss": 1.6975, + "step": 26439 + }, + { + "epoch": 8.115408225905464, + "grad_norm": 0.191580668091774, + "learning_rate": 9.031471941686525e-06, + "loss": 1.6926, + "step": 26440 + }, + { + "epoch": 8.115715162676489, + "grad_norm": 0.13917100429534912, + "learning_rate": 9.028622699931788e-06, + "loss": 1.6735, + "step": 26441 + }, + { + "epoch": 8.116022099447514, + "grad_norm": 0.13983212411403656, + "learning_rate": 9.025773863080188e-06, + "loss": 1.6995, + "step": 26442 + }, + { + "epoch": 8.11632903621854, + "grad_norm": 0.1471131443977356, + "learning_rate": 9.022925431159922e-06, + "loss": 1.7002, + "step": 26443 + }, + { + "epoch": 8.116635972989565, + "grad_norm": 0.16679814457893372, + "learning_rate": 9.020077404199134e-06, + "loss": 1.7124, + "step": 26444 + }, + { + "epoch": 8.116942909760589, + "grad_norm": 0.1366356909275055, + "learning_rate": 9.017229782225938e-06, + "loss": 1.663, + "step": 26445 + }, + { + "epoch": 8.117249846531614, + "grad_norm": 0.1389543116092682, + "learning_rate": 9.01438256526852e-06, + "loss": 1.6991, + "step": 26446 + }, + { + "epoch": 8.11755678330264, + "grad_norm": 0.1784060299396515, + "learning_rate": 
9.011535753354972e-06, + "loss": 1.769, + "step": 26447 + }, + { + "epoch": 8.117863720073665, + "grad_norm": 0.17633236944675446, + "learning_rate": 9.008689346513466e-06, + "loss": 1.7466, + "step": 26448 + }, + { + "epoch": 8.11817065684469, + "grad_norm": 0.15887171030044556, + "learning_rate": 9.005843344772119e-06, + "loss": 1.7395, + "step": 26449 + }, + { + "epoch": 8.118477593615715, + "grad_norm": 0.20275244116783142, + "learning_rate": 9.002997748159054e-06, + "loss": 1.6971, + "step": 26450 + }, + { + "epoch": 8.11878453038674, + "grad_norm": 0.18063177168369293, + "learning_rate": 9.00015255670239e-06, + "loss": 1.7438, + "step": 26451 + }, + { + "epoch": 8.119091467157766, + "grad_norm": 0.14861668646335602, + "learning_rate": 8.997307770430252e-06, + "loss": 1.645, + "step": 26452 + }, + { + "epoch": 8.119398403928791, + "grad_norm": 0.20455077290534973, + "learning_rate": 8.99446338937075e-06, + "loss": 1.6791, + "step": 26453 + }, + { + "epoch": 8.119705340699817, + "grad_norm": 0.15492217242717743, + "learning_rate": 8.991619413551999e-06, + "loss": 1.6897, + "step": 26454 + }, + { + "epoch": 8.120012277470842, + "grad_norm": 0.1854604184627533, + "learning_rate": 8.988775843002095e-06, + "loss": 1.7379, + "step": 26455 + }, + { + "epoch": 8.120319214241865, + "grad_norm": 0.16705256700515747, + "learning_rate": 8.985932677749155e-06, + "loss": 1.7181, + "step": 26456 + }, + { + "epoch": 8.12062615101289, + "grad_norm": 0.1571042388677597, + "learning_rate": 8.983089917821246e-06, + "loss": 1.6962, + "step": 26457 + }, + { + "epoch": 8.120933087783916, + "grad_norm": 0.1818968802690506, + "learning_rate": 8.980247563246508e-06, + "loss": 1.6954, + "step": 26458 + }, + { + "epoch": 8.121240024554941, + "grad_norm": 0.1823234111070633, + "learning_rate": 8.977405614052986e-06, + "loss": 1.6936, + "step": 26459 + }, + { + "epoch": 8.121546961325967, + "grad_norm": 0.1767190843820572, + "learning_rate": 8.97456407026881e-06, + "loss": 1.7147, + 
"step": 26460 + }, + { + "epoch": 8.121853898096992, + "grad_norm": 0.17461732029914856, + "learning_rate": 8.971722931922023e-06, + "loss": 1.7039, + "step": 26461 + }, + { + "epoch": 8.122160834868017, + "grad_norm": 0.13968271017074585, + "learning_rate": 8.968882199040702e-06, + "loss": 1.655, + "step": 26462 + }, + { + "epoch": 8.122467771639043, + "grad_norm": 0.16950756311416626, + "learning_rate": 8.966041871652969e-06, + "loss": 1.689, + "step": 26463 + }, + { + "epoch": 8.122774708410068, + "grad_norm": 0.148970365524292, + "learning_rate": 8.963201949786831e-06, + "loss": 1.6998, + "step": 26464 + }, + { + "epoch": 8.123081645181093, + "grad_norm": 0.2081855684518814, + "learning_rate": 8.960362433470392e-06, + "loss": 1.7287, + "step": 26465 + }, + { + "epoch": 8.123388581952117, + "grad_norm": 0.14865393936634064, + "learning_rate": 8.957523322731714e-06, + "loss": 1.6789, + "step": 26466 + }, + { + "epoch": 8.123695518723142, + "grad_norm": 0.19252106547355652, + "learning_rate": 8.954684617598841e-06, + "loss": 1.7475, + "step": 26467 + }, + { + "epoch": 8.124002455494168, + "grad_norm": 0.1915684938430786, + "learning_rate": 8.951846318099837e-06, + "loss": 1.6937, + "step": 26468 + }, + { + "epoch": 8.124309392265193, + "grad_norm": 0.15057072043418884, + "learning_rate": 8.949008424262744e-06, + "loss": 1.6748, + "step": 26469 + }, + { + "epoch": 8.124616329036218, + "grad_norm": 0.1801072657108307, + "learning_rate": 8.946170936115611e-06, + "loss": 1.7411, + "step": 26470 + }, + { + "epoch": 8.124923265807244, + "grad_norm": 0.1449461281299591, + "learning_rate": 8.943333853686476e-06, + "loss": 1.6751, + "step": 26471 + }, + { + "epoch": 8.125230202578269, + "grad_norm": 0.19249948859214783, + "learning_rate": 8.940497177003383e-06, + "loss": 1.6876, + "step": 26472 + }, + { + "epoch": 8.125537139349294, + "grad_norm": 0.19512195885181427, + "learning_rate": 8.937660906094359e-06, + "loss": 1.7275, + "step": 26473 + }, + { + "epoch": 
8.12584407612032, + "grad_norm": 0.15998144447803497, + "learning_rate": 8.934825040987433e-06, + "loss": 1.7151, + "step": 26474 + }, + { + "epoch": 8.126151012891345, + "grad_norm": 0.17573381960391998, + "learning_rate": 8.931989581710654e-06, + "loss": 1.713, + "step": 26475 + }, + { + "epoch": 8.12645794966237, + "grad_norm": 0.16745707392692566, + "learning_rate": 8.929154528292e-06, + "loss": 1.7758, + "step": 26476 + }, + { + "epoch": 8.126764886433394, + "grad_norm": 0.14445005357265472, + "learning_rate": 8.926319880759538e-06, + "loss": 1.6821, + "step": 26477 + }, + { + "epoch": 8.12707182320442, + "grad_norm": 0.20462681353092194, + "learning_rate": 8.923485639141244e-06, + "loss": 1.7083, + "step": 26478 + }, + { + "epoch": 8.127378759975445, + "grad_norm": 0.16262570023536682, + "learning_rate": 8.92065180346513e-06, + "loss": 1.7031, + "step": 26479 + }, + { + "epoch": 8.12768569674647, + "grad_norm": 0.14214366674423218, + "learning_rate": 8.917818373759235e-06, + "loss": 1.6752, + "step": 26480 + }, + { + "epoch": 8.127992633517495, + "grad_norm": 0.18373169004917145, + "learning_rate": 8.914985350051513e-06, + "loss": 1.7211, + "step": 26481 + }, + { + "epoch": 8.12829957028852, + "grad_norm": 0.1702071875333786, + "learning_rate": 8.912152732370015e-06, + "loss": 1.7513, + "step": 26482 + }, + { + "epoch": 8.128606507059546, + "grad_norm": 0.16515198349952698, + "learning_rate": 8.90932052074268e-06, + "loss": 1.7379, + "step": 26483 + }, + { + "epoch": 8.128913443830571, + "grad_norm": 0.17008109390735626, + "learning_rate": 8.906488715197537e-06, + "loss": 1.7243, + "step": 26484 + }, + { + "epoch": 8.129220380601597, + "grad_norm": 0.15695080161094666, + "learning_rate": 8.903657315762554e-06, + "loss": 1.6951, + "step": 26485 + }, + { + "epoch": 8.129527317372622, + "grad_norm": 0.16403819620609283, + "learning_rate": 8.900826322465716e-06, + "loss": 1.7755, + "step": 26486 + }, + { + "epoch": 8.129834254143647, + "grad_norm": 
0.21355034410953522, + "learning_rate": 8.897995735335007e-06, + "loss": 1.7505, + "step": 26487 + }, + { + "epoch": 8.13014119091467, + "grad_norm": 0.15604349970817566, + "learning_rate": 8.895165554398394e-06, + "loss": 1.7452, + "step": 26488 + }, + { + "epoch": 8.130448127685696, + "grad_norm": 0.18299458920955658, + "learning_rate": 8.892335779683842e-06, + "loss": 1.6737, + "step": 26489 + }, + { + "epoch": 8.130755064456721, + "grad_norm": 0.1939994990825653, + "learning_rate": 8.889506411219329e-06, + "loss": 1.7219, + "step": 26490 + }, + { + "epoch": 8.131062001227747, + "grad_norm": 0.17785221338272095, + "learning_rate": 8.886677449032794e-06, + "loss": 1.7007, + "step": 26491 + }, + { + "epoch": 8.131368937998772, + "grad_norm": 0.2067573517560959, + "learning_rate": 8.88384889315223e-06, + "loss": 1.7918, + "step": 26492 + }, + { + "epoch": 8.131675874769797, + "grad_norm": 0.18033906817436218, + "learning_rate": 8.88102074360555e-06, + "loss": 1.7, + "step": 26493 + }, + { + "epoch": 8.131982811540823, + "grad_norm": 0.17076243460178375, + "learning_rate": 8.878193000420748e-06, + "loss": 1.6883, + "step": 26494 + }, + { + "epoch": 8.132289748311848, + "grad_norm": 0.19102394580841064, + "learning_rate": 8.875365663625729e-06, + "loss": 1.7387, + "step": 26495 + }, + { + "epoch": 8.132596685082873, + "grad_norm": 0.22587478160858154, + "learning_rate": 8.872538733248442e-06, + "loss": 1.7852, + "step": 26496 + }, + { + "epoch": 8.132903621853899, + "grad_norm": 0.17067384719848633, + "learning_rate": 8.869712209316861e-06, + "loss": 1.6813, + "step": 26497 + }, + { + "epoch": 8.133210558624924, + "grad_norm": 0.19232873618602753, + "learning_rate": 8.866886091858856e-06, + "loss": 1.6644, + "step": 26498 + }, + { + "epoch": 8.133517495395948, + "grad_norm": 0.18685118854045868, + "learning_rate": 8.864060380902423e-06, + "loss": 1.6766, + "step": 26499 + }, + { + "epoch": 8.133824432166973, + "grad_norm": 0.18342606723308563, + "learning_rate": 
8.861235076475433e-06, + "loss": 1.6694, + "step": 26500 + }, + { + "epoch": 8.134131368937998, + "grad_norm": 0.15469637513160706, + "learning_rate": 8.858410178605842e-06, + "loss": 1.6882, + "step": 26501 + }, + { + "epoch": 8.134438305709024, + "grad_norm": 0.19094935059547424, + "learning_rate": 8.855585687321549e-06, + "loss": 1.6662, + "step": 26502 + }, + { + "epoch": 8.134745242480049, + "grad_norm": 0.19613660871982574, + "learning_rate": 8.852761602650479e-06, + "loss": 1.6518, + "step": 26503 + }, + { + "epoch": 8.135052179251074, + "grad_norm": 0.1342541128396988, + "learning_rate": 8.849937924620538e-06, + "loss": 1.6728, + "step": 26504 + }, + { + "epoch": 8.1353591160221, + "grad_norm": 0.19099827110767365, + "learning_rate": 8.847114653259624e-06, + "loss": 1.714, + "step": 26505 + }, + { + "epoch": 8.135666052793125, + "grad_norm": 0.18886728584766388, + "learning_rate": 8.84429178859565e-06, + "loss": 1.7222, + "step": 26506 + }, + { + "epoch": 8.13597298956415, + "grad_norm": 0.16177545487880707, + "learning_rate": 8.841469330656499e-06, + "loss": 1.754, + "step": 26507 + }, + { + "epoch": 8.136279926335176, + "grad_norm": 0.1589137762784958, + "learning_rate": 8.838647279470063e-06, + "loss": 1.6889, + "step": 26508 + }, + { + "epoch": 8.1365868631062, + "grad_norm": 0.16074521839618683, + "learning_rate": 8.835825635064266e-06, + "loss": 1.6882, + "step": 26509 + }, + { + "epoch": 8.136893799877225, + "grad_norm": 0.15532740950584412, + "learning_rate": 8.833004397466937e-06, + "loss": 1.6786, + "step": 26510 + }, + { + "epoch": 8.13720073664825, + "grad_norm": 0.18151862919330597, + "learning_rate": 8.830183566706019e-06, + "loss": 1.7075, + "step": 26511 + }, + { + "epoch": 8.137507673419275, + "grad_norm": 0.15345066785812378, + "learning_rate": 8.827363142809342e-06, + "loss": 1.6895, + "step": 26512 + }, + { + "epoch": 8.1378146101903, + "grad_norm": 0.16954976320266724, + "learning_rate": 8.824543125804785e-06, + "loss": 1.727, + "step": 
26513 + }, + { + "epoch": 8.138121546961326, + "grad_norm": 0.1679479032754898, + "learning_rate": 8.821723515720249e-06, + "loss": 1.7391, + "step": 26514 + }, + { + "epoch": 8.138428483732351, + "grad_norm": 0.15377631783485413, + "learning_rate": 8.818904312583547e-06, + "loss": 1.6954, + "step": 26515 + }, + { + "epoch": 8.138735420503377, + "grad_norm": 0.20345479249954224, + "learning_rate": 8.8160855164226e-06, + "loss": 1.7424, + "step": 26516 + }, + { + "epoch": 8.139042357274402, + "grad_norm": 0.18770255148410797, + "learning_rate": 8.813267127265207e-06, + "loss": 1.67, + "step": 26517 + }, + { + "epoch": 8.139349294045427, + "grad_norm": 0.16253206133842468, + "learning_rate": 8.810449145139265e-06, + "loss": 1.7004, + "step": 26518 + }, + { + "epoch": 8.139656230816453, + "grad_norm": 0.18429701030254364, + "learning_rate": 8.807631570072606e-06, + "loss": 1.7289, + "step": 26519 + }, + { + "epoch": 8.139963167587476, + "grad_norm": 0.18926598131656647, + "learning_rate": 8.80481440209307e-06, + "loss": 1.7907, + "step": 26520 + }, + { + "epoch": 8.140270104358502, + "grad_norm": 0.17855983972549438, + "learning_rate": 8.80199764122851e-06, + "loss": 1.7008, + "step": 26521 + }, + { + "epoch": 8.140577041129527, + "grad_norm": 0.20559640228748322, + "learning_rate": 8.799181287506752e-06, + "loss": 1.724, + "step": 26522 + }, + { + "epoch": 8.140883977900552, + "grad_norm": 0.1707194298505783, + "learning_rate": 8.79636534095563e-06, + "loss": 1.7274, + "step": 26523 + }, + { + "epoch": 8.141190914671578, + "grad_norm": 0.1882070004940033, + "learning_rate": 8.793549801602984e-06, + "loss": 1.7503, + "step": 26524 + }, + { + "epoch": 8.141497851442603, + "grad_norm": 0.24269217252731323, + "learning_rate": 8.790734669476613e-06, + "loss": 1.7459, + "step": 26525 + }, + { + "epoch": 8.141804788213628, + "grad_norm": 0.20310194790363312, + "learning_rate": 8.787919944604383e-06, + "loss": 1.7158, + "step": 26526 + }, + { + "epoch": 8.142111724984654, + 
"grad_norm": 0.18653319776058197, + "learning_rate": 8.785105627014056e-06, + "loss": 1.7135, + "step": 26527 + }, + { + "epoch": 8.142418661755679, + "grad_norm": 0.1896388828754425, + "learning_rate": 8.782291716733499e-06, + "loss": 1.7407, + "step": 26528 + }, + { + "epoch": 8.142725598526704, + "grad_norm": 0.17392487823963165, + "learning_rate": 8.779478213790482e-06, + "loss": 1.6863, + "step": 26529 + }, + { + "epoch": 8.14303253529773, + "grad_norm": 0.2389729917049408, + "learning_rate": 8.776665118212807e-06, + "loss": 1.7565, + "step": 26530 + }, + { + "epoch": 8.143339472068753, + "grad_norm": 0.1907578408718109, + "learning_rate": 8.773852430028312e-06, + "loss": 1.7135, + "step": 26531 + }, + { + "epoch": 8.143646408839778, + "grad_norm": 0.1867230087518692, + "learning_rate": 8.771040149264748e-06, + "loss": 1.657, + "step": 26532 + }, + { + "epoch": 8.143953345610804, + "grad_norm": 0.16111065447330475, + "learning_rate": 8.768228275949953e-06, + "loss": 1.6849, + "step": 26533 + }, + { + "epoch": 8.144260282381829, + "grad_norm": 0.24071912467479706, + "learning_rate": 8.76541681011167e-06, + "loss": 1.7563, + "step": 26534 + }, + { + "epoch": 8.144567219152854, + "grad_norm": 0.18996769189834595, + "learning_rate": 8.76260575177772e-06, + "loss": 1.7099, + "step": 26535 + }, + { + "epoch": 8.14487415592388, + "grad_norm": 0.17230607569217682, + "learning_rate": 8.75979510097587e-06, + "loss": 1.6848, + "step": 26536 + }, + { + "epoch": 8.145181092694905, + "grad_norm": 0.19319802522659302, + "learning_rate": 8.756984857733896e-06, + "loss": 1.7806, + "step": 26537 + }, + { + "epoch": 8.14548802946593, + "grad_norm": 0.16848497092723846, + "learning_rate": 8.754175022079569e-06, + "loss": 1.7099, + "step": 26538 + }, + { + "epoch": 8.145794966236956, + "grad_norm": 0.16230639815330505, + "learning_rate": 8.751365594040662e-06, + "loss": 1.6618, + "step": 26539 + }, + { + "epoch": 8.146101903007981, + "grad_norm": 0.15458232164382935, + 
"learning_rate": 8.748556573644935e-06, + "loss": 1.6975, + "step": 26540 + }, + { + "epoch": 8.146408839779005, + "grad_norm": 0.15948891639709473, + "learning_rate": 8.745747960920153e-06, + "loss": 1.6977, + "step": 26541 + }, + { + "epoch": 8.14671577655003, + "grad_norm": 0.17533692717552185, + "learning_rate": 8.742939755894053e-06, + "loss": 1.7314, + "step": 26542 + }, + { + "epoch": 8.147022713321055, + "grad_norm": 0.13606345653533936, + "learning_rate": 8.740131958594433e-06, + "loss": 1.6245, + "step": 26543 + }, + { + "epoch": 8.14732965009208, + "grad_norm": 0.1749604493379593, + "learning_rate": 8.737324569048993e-06, + "loss": 1.6881, + "step": 26544 + }, + { + "epoch": 8.147636586863106, + "grad_norm": 0.15416191518306732, + "learning_rate": 8.7345175872855e-06, + "loss": 1.6755, + "step": 26545 + }, + { + "epoch": 8.147943523634131, + "grad_norm": 0.19732356071472168, + "learning_rate": 8.731711013331695e-06, + "loss": 1.7068, + "step": 26546 + }, + { + "epoch": 8.148250460405157, + "grad_norm": 0.19295896589756012, + "learning_rate": 8.728904847215291e-06, + "loss": 1.7282, + "step": 26547 + }, + { + "epoch": 8.148557397176182, + "grad_norm": 0.18414302170276642, + "learning_rate": 8.726099088964069e-06, + "loss": 1.7059, + "step": 26548 + }, + { + "epoch": 8.148864333947207, + "grad_norm": 0.17527544498443604, + "learning_rate": 8.723293738605697e-06, + "loss": 1.6947, + "step": 26549 + }, + { + "epoch": 8.149171270718233, + "grad_norm": 0.1913319230079651, + "learning_rate": 8.720488796167958e-06, + "loss": 1.6988, + "step": 26550 + }, + { + "epoch": 8.149478207489258, + "grad_norm": 0.1604306846857071, + "learning_rate": 8.71768426167852e-06, + "loss": 1.6937, + "step": 26551 + }, + { + "epoch": 8.149785144260282, + "grad_norm": 0.1562403291463852, + "learning_rate": 8.714880135165132e-06, + "loss": 1.6633, + "step": 26552 + }, + { + "epoch": 8.150092081031307, + "grad_norm": 0.16940948367118835, + "learning_rate": 8.712076416655495e-06, + 
"loss": 1.6774, + "step": 26553 + }, + { + "epoch": 8.150399017802332, + "grad_norm": 0.14607203006744385, + "learning_rate": 8.709273106177324e-06, + "loss": 1.6912, + "step": 26554 + }, + { + "epoch": 8.150705954573358, + "grad_norm": 0.1811707615852356, + "learning_rate": 8.706470203758316e-06, + "loss": 1.7291, + "step": 26555 + }, + { + "epoch": 8.151012891344383, + "grad_norm": 0.18188659846782684, + "learning_rate": 8.703667709426166e-06, + "loss": 1.6994, + "step": 26556 + }, + { + "epoch": 8.151319828115408, + "grad_norm": 0.16499698162078857, + "learning_rate": 8.700865623208581e-06, + "loss": 1.7065, + "step": 26557 + }, + { + "epoch": 8.151626764886434, + "grad_norm": 0.17506305873394012, + "learning_rate": 8.69806394513325e-06, + "loss": 1.75, + "step": 26558 + }, + { + "epoch": 8.151933701657459, + "grad_norm": 0.14843741059303284, + "learning_rate": 8.695262675227844e-06, + "loss": 1.6645, + "step": 26559 + }, + { + "epoch": 8.152240638428484, + "grad_norm": 0.15281017124652863, + "learning_rate": 8.692461813520087e-06, + "loss": 1.7166, + "step": 26560 + }, + { + "epoch": 8.15254757519951, + "grad_norm": 0.17245371639728546, + "learning_rate": 8.689661360037621e-06, + "loss": 1.7418, + "step": 26561 + }, + { + "epoch": 8.152854511970535, + "grad_norm": 0.17387856543064117, + "learning_rate": 8.686861314808131e-06, + "loss": 1.6865, + "step": 26562 + }, + { + "epoch": 8.153161448741558, + "grad_norm": 0.1463180035352707, + "learning_rate": 8.684061677859296e-06, + "loss": 1.6867, + "step": 26563 + }, + { + "epoch": 8.153468385512584, + "grad_norm": 0.16704687476158142, + "learning_rate": 8.681262449218769e-06, + "loss": 1.6985, + "step": 26564 + }, + { + "epoch": 8.15377532228361, + "grad_norm": 0.17754648625850677, + "learning_rate": 8.678463628914246e-06, + "loss": 1.7067, + "step": 26565 + }, + { + "epoch": 8.154082259054634, + "grad_norm": 0.12470053881406784, + "learning_rate": 8.675665216973339e-06, + "loss": 1.6468, + "step": 26566 + }, + { + 
"epoch": 8.15438919582566, + "grad_norm": 0.17551906406879425, + "learning_rate": 8.672867213423757e-06, + "loss": 1.76, + "step": 26567 + }, + { + "epoch": 8.154696132596685, + "grad_norm": 0.13165321946144104, + "learning_rate": 8.670069618293098e-06, + "loss": 1.6672, + "step": 26568 + }, + { + "epoch": 8.15500306936771, + "grad_norm": 0.1410796046257019, + "learning_rate": 8.667272431609041e-06, + "loss": 1.649, + "step": 26569 + }, + { + "epoch": 8.155310006138736, + "grad_norm": 0.17227822542190552, + "learning_rate": 8.664475653399235e-06, + "loss": 1.7028, + "step": 26570 + }, + { + "epoch": 8.155616942909761, + "grad_norm": 0.15770387649536133, + "learning_rate": 8.661679283691298e-06, + "loss": 1.7608, + "step": 26571 + }, + { + "epoch": 8.155923879680786, + "grad_norm": 0.1425134390592575, + "learning_rate": 8.658883322512885e-06, + "loss": 1.6821, + "step": 26572 + }, + { + "epoch": 8.15623081645181, + "grad_norm": 0.19647212326526642, + "learning_rate": 8.656087769891608e-06, + "loss": 1.7787, + "step": 26573 + }, + { + "epoch": 8.156537753222835, + "grad_norm": 0.15315282344818115, + "learning_rate": 8.653292625855108e-06, + "loss": 1.6464, + "step": 26574 + }, + { + "epoch": 8.15684468999386, + "grad_norm": 0.1664622575044632, + "learning_rate": 8.650497890431009e-06, + "loss": 1.7189, + "step": 26575 + }, + { + "epoch": 8.157151626764886, + "grad_norm": 0.19525103271007538, + "learning_rate": 8.647703563646908e-06, + "loss": 1.71, + "step": 26576 + }, + { + "epoch": 8.157458563535911, + "grad_norm": 0.2435453087091446, + "learning_rate": 8.644909645530464e-06, + "loss": 1.7312, + "step": 26577 + }, + { + "epoch": 8.157765500306937, + "grad_norm": 0.20554441213607788, + "learning_rate": 8.642116136109252e-06, + "loss": 1.7102, + "step": 26578 + }, + { + "epoch": 8.158072437077962, + "grad_norm": 0.21100008487701416, + "learning_rate": 8.639323035410885e-06, + "loss": 1.6513, + "step": 26579 + }, + { + "epoch": 8.158379373848987, + "grad_norm": 
0.20069560408592224, + "learning_rate": 8.636530343462973e-06, + "loss": 1.7457, + "step": 26580 + }, + { + "epoch": 8.158686310620013, + "grad_norm": 0.19240780174732208, + "learning_rate": 8.633738060293095e-06, + "loss": 1.6761, + "step": 26581 + }, + { + "epoch": 8.158993247391038, + "grad_norm": 0.17970497906208038, + "learning_rate": 8.63094618592889e-06, + "loss": 1.7571, + "step": 26582 + }, + { + "epoch": 8.159300184162063, + "grad_norm": 0.19709791243076324, + "learning_rate": 8.628154720397902e-06, + "loss": 1.7826, + "step": 26583 + }, + { + "epoch": 8.159607120933087, + "grad_norm": 0.2084866315126419, + "learning_rate": 8.62536366372776e-06, + "loss": 1.7113, + "step": 26584 + }, + { + "epoch": 8.159914057704112, + "grad_norm": 0.18584266304969788, + "learning_rate": 8.622573015945995e-06, + "loss": 1.675, + "step": 26585 + }, + { + "epoch": 8.160220994475138, + "grad_norm": 0.21233049035072327, + "learning_rate": 8.619782777080232e-06, + "loss": 1.7438, + "step": 26586 + }, + { + "epoch": 8.160527931246163, + "grad_norm": 0.180323526263237, + "learning_rate": 8.61699294715803e-06, + "loss": 1.6923, + "step": 26587 + }, + { + "epoch": 8.160834868017188, + "grad_norm": 0.182667076587677, + "learning_rate": 8.614203526206955e-06, + "loss": 1.7302, + "step": 26588 + }, + { + "epoch": 8.161141804788214, + "grad_norm": 0.19673213362693787, + "learning_rate": 8.611414514254584e-06, + "loss": 1.7282, + "step": 26589 + }, + { + "epoch": 8.161448741559239, + "grad_norm": 0.14357072114944458, + "learning_rate": 8.608625911328466e-06, + "loss": 1.6964, + "step": 26590 + }, + { + "epoch": 8.161755678330264, + "grad_norm": 0.25598716735839844, + "learning_rate": 8.605837717456172e-06, + "loss": 1.788, + "step": 26591 + }, + { + "epoch": 8.16206261510129, + "grad_norm": 0.16914238035678864, + "learning_rate": 8.603049932665252e-06, + "loss": 1.6069, + "step": 26592 + }, + { + "epoch": 8.162369551872315, + "grad_norm": 0.1468336582183838, + "learning_rate": 
8.60026255698324e-06, + "loss": 1.7009, + "step": 26593 + }, + { + "epoch": 8.16267648864334, + "grad_norm": 0.20125585794448853, + "learning_rate": 8.597475590437726e-06, + "loss": 1.7166, + "step": 26594 + }, + { + "epoch": 8.162983425414364, + "grad_norm": 0.12715741991996765, + "learning_rate": 8.594689033056214e-06, + "loss": 1.6488, + "step": 26595 + }, + { + "epoch": 8.16329036218539, + "grad_norm": 0.2659800350666046, + "learning_rate": 8.591902884866254e-06, + "loss": 1.7325, + "step": 26596 + }, + { + "epoch": 8.163597298956415, + "grad_norm": 0.1939239799976349, + "learning_rate": 8.589117145895376e-06, + "loss": 1.6882, + "step": 26597 + }, + { + "epoch": 8.16390423572744, + "grad_norm": 0.18982990086078644, + "learning_rate": 8.586331816171101e-06, + "loss": 1.7222, + "step": 26598 + }, + { + "epoch": 8.164211172498465, + "grad_norm": 0.16025054454803467, + "learning_rate": 8.583546895720995e-06, + "loss": 1.6672, + "step": 26599 + }, + { + "epoch": 8.16451810926949, + "grad_norm": 0.1923390030860901, + "learning_rate": 8.580762384572533e-06, + "loss": 1.7261, + "step": 26600 + }, + { + "epoch": 8.164825046040516, + "grad_norm": 0.1467374712228775, + "learning_rate": 8.577978282753274e-06, + "loss": 1.6969, + "step": 26601 + }, + { + "epoch": 8.165131982811541, + "grad_norm": 0.2210266888141632, + "learning_rate": 8.575194590290685e-06, + "loss": 1.74, + "step": 26602 + }, + { + "epoch": 8.165438919582567, + "grad_norm": 0.1852598935365677, + "learning_rate": 8.572411307212319e-06, + "loss": 1.7522, + "step": 26603 + }, + { + "epoch": 8.165745856353592, + "grad_norm": 0.19316701591014862, + "learning_rate": 8.569628433545662e-06, + "loss": 1.7389, + "step": 26604 + }, + { + "epoch": 8.166052793124617, + "grad_norm": 0.2102174311876297, + "learning_rate": 8.566845969318227e-06, + "loss": 1.7134, + "step": 26605 + }, + { + "epoch": 8.16635972989564, + "grad_norm": 0.1948329359292984, + "learning_rate": 8.564063914557496e-06, + "loss": 1.7368, + "step": 
26606 + }, + { + "epoch": 8.166666666666666, + "grad_norm": 0.14721956849098206, + "learning_rate": 8.561282269290977e-06, + "loss": 1.6526, + "step": 26607 + }, + { + "epoch": 8.166973603437691, + "grad_norm": 0.17424573004245758, + "learning_rate": 8.558501033546158e-06, + "loss": 1.6954, + "step": 26608 + }, + { + "epoch": 8.167280540208717, + "grad_norm": 0.14784085750579834, + "learning_rate": 8.555720207350514e-06, + "loss": 1.7166, + "step": 26609 + }, + { + "epoch": 8.167587476979742, + "grad_norm": 0.1619582176208496, + "learning_rate": 8.55293979073154e-06, + "loss": 1.716, + "step": 26610 + }, + { + "epoch": 8.167894413750767, + "grad_norm": 0.2342625856399536, + "learning_rate": 8.550159783716705e-06, + "loss": 1.7399, + "step": 26611 + }, + { + "epoch": 8.168201350521793, + "grad_norm": 0.16116589307785034, + "learning_rate": 8.547380186333482e-06, + "loss": 1.6727, + "step": 26612 + }, + { + "epoch": 8.168508287292818, + "grad_norm": 0.20995540916919708, + "learning_rate": 8.544600998609349e-06, + "loss": 1.703, + "step": 26613 + }, + { + "epoch": 8.168815224063843, + "grad_norm": 0.18031500279903412, + "learning_rate": 8.541822220571766e-06, + "loss": 1.6953, + "step": 26614 + }, + { + "epoch": 8.169122160834869, + "grad_norm": 0.1851302981376648, + "learning_rate": 8.539043852248197e-06, + "loss": 1.6931, + "step": 26615 + }, + { + "epoch": 8.169429097605892, + "grad_norm": 0.2262948453426361, + "learning_rate": 8.536265893666096e-06, + "loss": 1.7167, + "step": 26616 + }, + { + "epoch": 8.169736034376918, + "grad_norm": 0.1456020325422287, + "learning_rate": 8.533488344852903e-06, + "loss": 1.6686, + "step": 26617 + }, + { + "epoch": 8.170042971147943, + "grad_norm": 0.17165613174438477, + "learning_rate": 8.530711205836112e-06, + "loss": 1.6641, + "step": 26618 + }, + { + "epoch": 8.170349907918968, + "grad_norm": 0.18926110863685608, + "learning_rate": 8.527934476643112e-06, + "loss": 1.7155, + "step": 26619 + }, + { + "epoch": 8.170656844689994, 
+ "grad_norm": 0.1722220927476883, + "learning_rate": 8.525158157301383e-06, + "loss": 1.7188, + "step": 26620 + }, + { + "epoch": 8.170963781461019, + "grad_norm": 0.1791582554578781, + "learning_rate": 8.522382247838351e-06, + "loss": 1.7195, + "step": 26621 + }, + { + "epoch": 8.171270718232044, + "grad_norm": 0.18020455539226532, + "learning_rate": 8.519606748281445e-06, + "loss": 1.7068, + "step": 26622 + }, + { + "epoch": 8.17157765500307, + "grad_norm": 0.17394676804542542, + "learning_rate": 8.516831658658098e-06, + "loss": 1.6977, + "step": 26623 + }, + { + "epoch": 8.171884591774095, + "grad_norm": 0.24079330265522003, + "learning_rate": 8.514056978995739e-06, + "loss": 1.7152, + "step": 26624 + }, + { + "epoch": 8.17219152854512, + "grad_norm": 0.16567498445510864, + "learning_rate": 8.511282709321784e-06, + "loss": 1.7048, + "step": 26625 + }, + { + "epoch": 8.172498465316146, + "grad_norm": 0.21935853362083435, + "learning_rate": 8.508508849663649e-06, + "loss": 1.7445, + "step": 26626 + }, + { + "epoch": 8.17280540208717, + "grad_norm": 0.18325531482696533, + "learning_rate": 8.505735400048748e-06, + "loss": 1.7343, + "step": 26627 + }, + { + "epoch": 8.173112338858195, + "grad_norm": 0.16334550082683563, + "learning_rate": 8.50296236050449e-06, + "loss": 1.727, + "step": 26628 + }, + { + "epoch": 8.17341927562922, + "grad_norm": 0.23685503005981445, + "learning_rate": 8.500189731058284e-06, + "loss": 1.6718, + "step": 26629 + }, + { + "epoch": 8.173726212400245, + "grad_norm": 0.17057496309280396, + "learning_rate": 8.49741751173752e-06, + "loss": 1.7083, + "step": 26630 + }, + { + "epoch": 8.17403314917127, + "grad_norm": 0.19941039383411407, + "learning_rate": 8.49464570256961e-06, + "loss": 1.6496, + "step": 26631 + }, + { + "epoch": 8.174340085942296, + "grad_norm": 0.1887839138507843, + "learning_rate": 8.49187430358193e-06, + "loss": 1.7896, + "step": 26632 + }, + { + "epoch": 8.174647022713321, + "grad_norm": 0.16285917162895203, + 
"learning_rate": 8.489103314801883e-06, + "loss": 1.6923, + "step": 26633 + }, + { + "epoch": 8.174953959484347, + "grad_norm": 0.1405196487903595, + "learning_rate": 8.48633273625683e-06, + "loss": 1.6907, + "step": 26634 + }, + { + "epoch": 8.175260896255372, + "grad_norm": 0.17885157465934753, + "learning_rate": 8.483562567974196e-06, + "loss": 1.7036, + "step": 26635 + }, + { + "epoch": 8.175567833026397, + "grad_norm": 0.1427285224199295, + "learning_rate": 8.480792809981309e-06, + "loss": 1.6997, + "step": 26636 + }, + { + "epoch": 8.175874769797423, + "grad_norm": 0.15711882710456848, + "learning_rate": 8.478023462305579e-06, + "loss": 1.6874, + "step": 26637 + }, + { + "epoch": 8.176181706568446, + "grad_norm": 0.19080850481987, + "learning_rate": 8.47525452497434e-06, + "loss": 1.7078, + "step": 26638 + }, + { + "epoch": 8.176488643339471, + "grad_norm": 0.17063139379024506, + "learning_rate": 8.472485998014984e-06, + "loss": 1.7147, + "step": 26639 + }, + { + "epoch": 8.176795580110497, + "grad_norm": 0.151056706905365, + "learning_rate": 8.469717881454865e-06, + "loss": 1.685, + "step": 26640 + }, + { + "epoch": 8.177102516881522, + "grad_norm": 0.16712957620620728, + "learning_rate": 8.466950175321331e-06, + "loss": 1.7142, + "step": 26641 + }, + { + "epoch": 8.177409453652547, + "grad_norm": 0.13982228934764862, + "learning_rate": 8.46418287964174e-06, + "loss": 1.6707, + "step": 26642 + }, + { + "epoch": 8.177716390423573, + "grad_norm": 0.14738497138023376, + "learning_rate": 8.461415994443439e-06, + "loss": 1.7381, + "step": 26643 + }, + { + "epoch": 8.178023327194598, + "grad_norm": 0.1691005975008011, + "learning_rate": 8.45864951975377e-06, + "loss": 1.6956, + "step": 26644 + }, + { + "epoch": 8.178330263965623, + "grad_norm": 0.1477413773536682, + "learning_rate": 8.455883455600078e-06, + "loss": 1.6646, + "step": 26645 + }, + { + "epoch": 8.178637200736649, + "grad_norm": 0.15620499849319458, + "learning_rate": 8.453117802009697e-06, + "loss": 
1.7031, + "step": 26646 + }, + { + "epoch": 8.178944137507674, + "grad_norm": 0.1572941690683365, + "learning_rate": 8.45035255900995e-06, + "loss": 1.6509, + "step": 26647 + }, + { + "epoch": 8.1792510742787, + "grad_norm": 0.20386455953121185, + "learning_rate": 8.447587726628176e-06, + "loss": 1.7166, + "step": 26648 + }, + { + "epoch": 8.179558011049723, + "grad_norm": 0.2131095975637436, + "learning_rate": 8.444823304891697e-06, + "loss": 1.6934, + "step": 26649 + }, + { + "epoch": 8.179864947820748, + "grad_norm": 0.15402472019195557, + "learning_rate": 8.442059293827826e-06, + "loss": 1.7538, + "step": 26650 + }, + { + "epoch": 8.180171884591774, + "grad_norm": 0.17687393724918365, + "learning_rate": 8.439295693463872e-06, + "loss": 1.7374, + "step": 26651 + }, + { + "epoch": 8.180478821362799, + "grad_norm": 0.16971834003925323, + "learning_rate": 8.436532503827188e-06, + "loss": 1.7142, + "step": 26652 + }, + { + "epoch": 8.180785758133824, + "grad_norm": 0.17651747167110443, + "learning_rate": 8.433769724945017e-06, + "loss": 1.7109, + "step": 26653 + }, + { + "epoch": 8.18109269490485, + "grad_norm": 0.18742668628692627, + "learning_rate": 8.431007356844728e-06, + "loss": 1.7024, + "step": 26654 + }, + { + "epoch": 8.181399631675875, + "grad_norm": 0.1686297208070755, + "learning_rate": 8.428245399553559e-06, + "loss": 1.7669, + "step": 26655 + }, + { + "epoch": 8.1817065684469, + "grad_norm": 0.1667923480272293, + "learning_rate": 8.425483853098848e-06, + "loss": 1.6928, + "step": 26656 + }, + { + "epoch": 8.182013505217926, + "grad_norm": 0.16002421081066132, + "learning_rate": 8.422722717507874e-06, + "loss": 1.7058, + "step": 26657 + }, + { + "epoch": 8.182320441988951, + "grad_norm": 0.1531311571598053, + "learning_rate": 8.419961992807928e-06, + "loss": 1.7096, + "step": 26658 + }, + { + "epoch": 8.182627378759975, + "grad_norm": 0.16212326288223267, + "learning_rate": 8.417201679026282e-06, + "loss": 1.6849, + "step": 26659 + }, + { + "epoch": 
8.182934315531, + "grad_norm": 0.17276698350906372, + "learning_rate": 8.414441776190224e-06, + "loss": 1.6697, + "step": 26660 + }, + { + "epoch": 8.183241252302025, + "grad_norm": 0.15050961077213287, + "learning_rate": 8.411682284327028e-06, + "loss": 1.6972, + "step": 26661 + }, + { + "epoch": 8.18354818907305, + "grad_norm": 0.14593006670475006, + "learning_rate": 8.40892320346396e-06, + "loss": 1.7005, + "step": 26662 + }, + { + "epoch": 8.183855125844076, + "grad_norm": 0.18584349751472473, + "learning_rate": 8.406164533628291e-06, + "loss": 1.7366, + "step": 26663 + }, + { + "epoch": 8.184162062615101, + "grad_norm": 0.18662385642528534, + "learning_rate": 8.403406274847287e-06, + "loss": 1.77, + "step": 26664 + }, + { + "epoch": 8.184468999386127, + "grad_norm": 0.1735418438911438, + "learning_rate": 8.4006484271482e-06, + "loss": 1.692, + "step": 26665 + }, + { + "epoch": 8.184775936157152, + "grad_norm": 0.22115837037563324, + "learning_rate": 8.397890990558283e-06, + "loss": 1.7321, + "step": 26666 + }, + { + "epoch": 8.185082872928177, + "grad_norm": 0.1662493795156479, + "learning_rate": 8.395133965104796e-06, + "loss": 1.7016, + "step": 26667 + }, + { + "epoch": 8.185389809699203, + "grad_norm": 0.20966672897338867, + "learning_rate": 8.392377350814967e-06, + "loss": 1.6703, + "step": 26668 + }, + { + "epoch": 8.185696746470228, + "grad_norm": 0.16722753643989563, + "learning_rate": 8.389621147716076e-06, + "loss": 1.7429, + "step": 26669 + }, + { + "epoch": 8.186003683241251, + "grad_norm": 0.20280788838863373, + "learning_rate": 8.386865355835316e-06, + "loss": 1.7155, + "step": 26670 + }, + { + "epoch": 8.186310620012277, + "grad_norm": 0.20596744120121002, + "learning_rate": 8.384109975199967e-06, + "loss": 1.7266, + "step": 26671 + }, + { + "epoch": 8.186617556783302, + "grad_norm": 0.1525292545557022, + "learning_rate": 8.381355005837205e-06, + "loss": 1.6692, + "step": 26672 + }, + { + "epoch": 8.186924493554327, + "grad_norm": 
0.21745061874389648, + "learning_rate": 8.378600447774304e-06, + "loss": 1.7048, + "step": 26673 + }, + { + "epoch": 8.187231430325353, + "grad_norm": 0.2355356216430664, + "learning_rate": 8.375846301038465e-06, + "loss": 1.7842, + "step": 26674 + }, + { + "epoch": 8.187538367096378, + "grad_norm": 0.18660607933998108, + "learning_rate": 8.37309256565691e-06, + "loss": 1.698, + "step": 26675 + }, + { + "epoch": 8.187845303867404, + "grad_norm": 0.1690683364868164, + "learning_rate": 8.370339241656855e-06, + "loss": 1.6967, + "step": 26676 + }, + { + "epoch": 8.188152240638429, + "grad_norm": 0.16226762533187866, + "learning_rate": 8.367586329065508e-06, + "loss": 1.6849, + "step": 26677 + }, + { + "epoch": 8.188459177409454, + "grad_norm": 0.192795068025589, + "learning_rate": 8.364833827910074e-06, + "loss": 1.7037, + "step": 26678 + }, + { + "epoch": 8.18876611418048, + "grad_norm": 0.13591274619102478, + "learning_rate": 8.362081738217752e-06, + "loss": 1.6517, + "step": 26679 + }, + { + "epoch": 8.189073050951505, + "grad_norm": 0.16879263520240784, + "learning_rate": 8.359330060015747e-06, + "loss": 1.6751, + "step": 26680 + }, + { + "epoch": 8.189379987722528, + "grad_norm": 0.16385328769683838, + "learning_rate": 8.356578793331243e-06, + "loss": 1.7151, + "step": 26681 + }, + { + "epoch": 8.189686924493554, + "grad_norm": 0.14804807305335999, + "learning_rate": 8.353827938191438e-06, + "loss": 1.6601, + "step": 26682 + }, + { + "epoch": 8.189993861264579, + "grad_norm": 0.1534065157175064, + "learning_rate": 8.351077494623516e-06, + "loss": 1.7664, + "step": 26683 + }, + { + "epoch": 8.190300798035604, + "grad_norm": 0.16167859733104706, + "learning_rate": 8.348327462654659e-06, + "loss": 1.6573, + "step": 26684 + }, + { + "epoch": 8.19060773480663, + "grad_norm": 0.1433487832546234, + "learning_rate": 8.34557784231203e-06, + "loss": 1.6768, + "step": 26685 + }, + { + "epoch": 8.190914671577655, + "grad_norm": 0.1636372059583664, + "learning_rate": 
8.342828633622834e-06, + "loss": 1.6648, + "step": 26686 + }, + { + "epoch": 8.19122160834868, + "grad_norm": 0.13938350975513458, + "learning_rate": 8.340079836614206e-06, + "loss": 1.6511, + "step": 26687 + }, + { + "epoch": 8.191528545119706, + "grad_norm": 0.19098511338233948, + "learning_rate": 8.337331451313346e-06, + "loss": 1.7305, + "step": 26688 + }, + { + "epoch": 8.191835481890731, + "grad_norm": 0.15734615921974182, + "learning_rate": 8.33458347774737e-06, + "loss": 1.6777, + "step": 26689 + }, + { + "epoch": 8.192142418661756, + "grad_norm": 0.1523539125919342, + "learning_rate": 8.331835915943475e-06, + "loss": 1.7173, + "step": 26690 + }, + { + "epoch": 8.192449355432782, + "grad_norm": 0.17726896703243256, + "learning_rate": 8.329088765928799e-06, + "loss": 1.6904, + "step": 26691 + }, + { + "epoch": 8.192756292203805, + "grad_norm": 0.18954375386238098, + "learning_rate": 8.326342027730493e-06, + "loss": 1.7062, + "step": 26692 + }, + { + "epoch": 8.19306322897483, + "grad_norm": 0.21199224889278412, + "learning_rate": 8.323595701375702e-06, + "loss": 1.7747, + "step": 26693 + }, + { + "epoch": 8.193370165745856, + "grad_norm": 0.15305975079536438, + "learning_rate": 8.320849786891566e-06, + "loss": 1.6829, + "step": 26694 + }, + { + "epoch": 8.193677102516881, + "grad_norm": 0.1407271921634674, + "learning_rate": 8.318104284305216e-06, + "loss": 1.6774, + "step": 26695 + }, + { + "epoch": 8.193984039287907, + "grad_norm": 0.15379782021045685, + "learning_rate": 8.315359193643796e-06, + "loss": 1.7037, + "step": 26696 + }, + { + "epoch": 8.194290976058932, + "grad_norm": 0.21377405524253845, + "learning_rate": 8.31261451493443e-06, + "loss": 1.7258, + "step": 26697 + }, + { + "epoch": 8.194597912829957, + "grad_norm": 0.1975884586572647, + "learning_rate": 8.309870248204238e-06, + "loss": 1.718, + "step": 26698 + }, + { + "epoch": 8.194904849600983, + "grad_norm": 0.1985187530517578, + "learning_rate": 8.307126393480341e-06, + "loss": 1.7199, + 
"step": 26699 + }, + { + "epoch": 8.195211786372008, + "grad_norm": 0.17664451897144318, + "learning_rate": 8.304382950789857e-06, + "loss": 1.744, + "step": 26700 + }, + { + "epoch": 8.195518723143033, + "grad_norm": 0.16517753899097443, + "learning_rate": 8.301639920159904e-06, + "loss": 1.7289, + "step": 26701 + }, + { + "epoch": 8.195825659914057, + "grad_norm": 0.15431733429431915, + "learning_rate": 8.29889730161757e-06, + "loss": 1.6854, + "step": 26702 + }, + { + "epoch": 8.196132596685082, + "grad_norm": 0.14390075206756592, + "learning_rate": 8.296155095190005e-06, + "loss": 1.6806, + "step": 26703 + }, + { + "epoch": 8.196439533456108, + "grad_norm": 0.1450011432170868, + "learning_rate": 8.293413300904246e-06, + "loss": 1.6579, + "step": 26704 + }, + { + "epoch": 8.196746470227133, + "grad_norm": 0.20312175154685974, + "learning_rate": 8.290671918787452e-06, + "loss": 1.7053, + "step": 26705 + }, + { + "epoch": 8.197053406998158, + "grad_norm": 0.13979235291481018, + "learning_rate": 8.287930948866656e-06, + "loss": 1.6751, + "step": 26706 + }, + { + "epoch": 8.197360343769184, + "grad_norm": 0.1665562391281128, + "learning_rate": 8.28519039116899e-06, + "loss": 1.7523, + "step": 26707 + }, + { + "epoch": 8.197667280540209, + "grad_norm": 0.15326659381389618, + "learning_rate": 8.282450245721524e-06, + "loss": 1.6788, + "step": 26708 + }, + { + "epoch": 8.197974217311234, + "grad_norm": 0.14121493697166443, + "learning_rate": 8.279710512551331e-06, + "loss": 1.6351, + "step": 26709 + }, + { + "epoch": 8.19828115408226, + "grad_norm": 0.16965799033641815, + "learning_rate": 8.276971191685495e-06, + "loss": 1.7694, + "step": 26710 + }, + { + "epoch": 8.198588090853285, + "grad_norm": 0.21316587924957275, + "learning_rate": 8.274232283151085e-06, + "loss": 1.6922, + "step": 26711 + }, + { + "epoch": 8.19889502762431, + "grad_norm": 0.1613110601902008, + "learning_rate": 8.271493786975165e-06, + "loss": 1.7221, + "step": 26712 + }, + { + "epoch": 
8.199201964395334, + "grad_norm": 0.19140063226222992, + "learning_rate": 8.268755703184804e-06, + "loss": 1.7457, + "step": 26713 + }, + { + "epoch": 8.199508901166359, + "grad_norm": 0.1680840253829956, + "learning_rate": 8.26601803180706e-06, + "loss": 1.6948, + "step": 26714 + }, + { + "epoch": 8.199815837937384, + "grad_norm": 0.17642726004123688, + "learning_rate": 8.263280772868982e-06, + "loss": 1.6996, + "step": 26715 + }, + { + "epoch": 8.20012277470841, + "grad_norm": 0.21370023488998413, + "learning_rate": 8.26054392639763e-06, + "loss": 1.7585, + "step": 26716 + }, + { + "epoch": 8.200429711479435, + "grad_norm": 0.20721369981765747, + "learning_rate": 8.257807492420044e-06, + "loss": 1.7127, + "step": 26717 + }, + { + "epoch": 8.20073664825046, + "grad_norm": 0.14441120624542236, + "learning_rate": 8.255071470963272e-06, + "loss": 1.6627, + "step": 26718 + }, + { + "epoch": 8.201043585021486, + "grad_norm": 0.17547503113746643, + "learning_rate": 8.25233586205434e-06, + "loss": 1.7764, + "step": 26719 + }, + { + "epoch": 8.201350521792511, + "grad_norm": 0.1724909394979477, + "learning_rate": 8.24960066572032e-06, + "loss": 1.6978, + "step": 26720 + }, + { + "epoch": 8.201657458563536, + "grad_norm": 0.16465766727924347, + "learning_rate": 8.246865881988186e-06, + "loss": 1.7302, + "step": 26721 + }, + { + "epoch": 8.201964395334562, + "grad_norm": 0.18594282865524292, + "learning_rate": 8.244131510885023e-06, + "loss": 1.7354, + "step": 26722 + }, + { + "epoch": 8.202271332105587, + "grad_norm": 0.163459911942482, + "learning_rate": 8.241397552437803e-06, + "loss": 1.7069, + "step": 26723 + }, + { + "epoch": 8.20257826887661, + "grad_norm": 0.1712186485528946, + "learning_rate": 8.23866400667358e-06, + "loss": 1.7029, + "step": 26724 + }, + { + "epoch": 8.202885205647636, + "grad_norm": 0.155457004904747, + "learning_rate": 8.235930873619357e-06, + "loss": 1.6806, + "step": 26725 + }, + { + "epoch": 8.203192142418661, + "grad_norm": 
0.19597770273685455, + "learning_rate": 8.233198153302146e-06, + "loss": 1.7271, + "step": 26726 + }, + { + "epoch": 8.203499079189687, + "grad_norm": 0.17909370362758636, + "learning_rate": 8.230465845748946e-06, + "loss": 1.7334, + "step": 26727 + }, + { + "epoch": 8.203806015960712, + "grad_norm": 0.1566748470067978, + "learning_rate": 8.227733950986766e-06, + "loss": 1.7965, + "step": 26728 + }, + { + "epoch": 8.204112952731737, + "grad_norm": 0.23624123632907867, + "learning_rate": 8.225002469042603e-06, + "loss": 1.7154, + "step": 26729 + }, + { + "epoch": 8.204419889502763, + "grad_norm": 0.17100931704044342, + "learning_rate": 8.222271399943448e-06, + "loss": 1.6745, + "step": 26730 + }, + { + "epoch": 8.204726826273788, + "grad_norm": 0.1762385219335556, + "learning_rate": 8.219540743716298e-06, + "loss": 1.7199, + "step": 26731 + }, + { + "epoch": 8.205033763044813, + "grad_norm": 0.19741147756576538, + "learning_rate": 8.216810500388134e-06, + "loss": 1.7582, + "step": 26732 + }, + { + "epoch": 8.205340699815839, + "grad_norm": 0.14669859409332275, + "learning_rate": 8.214080669985941e-06, + "loss": 1.6859, + "step": 26733 + }, + { + "epoch": 8.205647636586862, + "grad_norm": 0.16434574127197266, + "learning_rate": 8.211351252536692e-06, + "loss": 1.7129, + "step": 26734 + }, + { + "epoch": 8.205954573357888, + "grad_norm": 0.17041419446468353, + "learning_rate": 8.208622248067361e-06, + "loss": 1.7145, + "step": 26735 + }, + { + "epoch": 8.206261510128913, + "grad_norm": 0.16507895290851593, + "learning_rate": 8.205893656604907e-06, + "loss": 1.7486, + "step": 26736 + }, + { + "epoch": 8.206568446899938, + "grad_norm": 0.19548171758651733, + "learning_rate": 8.203165478176334e-06, + "loss": 1.7135, + "step": 26737 + }, + { + "epoch": 8.206875383670964, + "grad_norm": 0.16964592039585114, + "learning_rate": 8.200437712808556e-06, + "loss": 1.703, + "step": 26738 + }, + { + "epoch": 8.207182320441989, + "grad_norm": 0.1599748432636261, + "learning_rate": 
8.197710360528571e-06, + "loss": 1.7065, + "step": 26739 + }, + { + "epoch": 8.207489257213014, + "grad_norm": 0.1665380746126175, + "learning_rate": 8.194983421363294e-06, + "loss": 1.6927, + "step": 26740 + }, + { + "epoch": 8.20779619398404, + "grad_norm": 0.13410761952400208, + "learning_rate": 8.192256895339701e-06, + "loss": 1.6373, + "step": 26741 + }, + { + "epoch": 8.208103130755065, + "grad_norm": 0.17461349070072174, + "learning_rate": 8.189530782484733e-06, + "loss": 1.7058, + "step": 26742 + }, + { + "epoch": 8.20841006752609, + "grad_norm": 0.15213793516159058, + "learning_rate": 8.186805082825327e-06, + "loss": 1.6664, + "step": 26743 + }, + { + "epoch": 8.208717004297116, + "grad_norm": 0.17611466348171234, + "learning_rate": 8.184079796388421e-06, + "loss": 1.7029, + "step": 26744 + }, + { + "epoch": 8.20902394106814, + "grad_norm": 0.16301874816417694, + "learning_rate": 8.181354923200945e-06, + "loss": 1.7024, + "step": 26745 + }, + { + "epoch": 8.209330877839164, + "grad_norm": 0.12992535531520844, + "learning_rate": 8.178630463289833e-06, + "loss": 1.6471, + "step": 26746 + }, + { + "epoch": 8.20963781461019, + "grad_norm": 0.1948312669992447, + "learning_rate": 8.175906416682006e-06, + "loss": 1.7359, + "step": 26747 + }, + { + "epoch": 8.209944751381215, + "grad_norm": 0.16086861491203308, + "learning_rate": 8.173182783404387e-06, + "loss": 1.7312, + "step": 26748 + }, + { + "epoch": 8.21025168815224, + "grad_norm": 0.20091786980628967, + "learning_rate": 8.17045956348389e-06, + "loss": 1.7038, + "step": 26749 + }, + { + "epoch": 8.210558624923266, + "grad_norm": 0.18929384648799896, + "learning_rate": 8.16773675694743e-06, + "loss": 1.7129, + "step": 26750 + }, + { + "epoch": 8.210865561694291, + "grad_norm": 0.1536511927843094, + "learning_rate": 8.16501436382191e-06, + "loss": 1.7031, + "step": 26751 + }, + { + "epoch": 8.211172498465316, + "grad_norm": 0.15490883588790894, + "learning_rate": 8.162292384134245e-06, + "loss": 1.6625, + 
"step": 26752 + }, + { + "epoch": 8.211479435236342, + "grad_norm": 0.18852801620960236, + "learning_rate": 8.159570817911311e-06, + "loss": 1.7691, + "step": 26753 + }, + { + "epoch": 8.211786372007367, + "grad_norm": 0.21555860340595245, + "learning_rate": 8.15684966518005e-06, + "loss": 1.7919, + "step": 26754 + }, + { + "epoch": 8.212093308778392, + "grad_norm": 0.19634628295898438, + "learning_rate": 8.154128925967297e-06, + "loss": 1.7174, + "step": 26755 + }, + { + "epoch": 8.212400245549416, + "grad_norm": 0.15788821876049042, + "learning_rate": 8.151408600299998e-06, + "loss": 1.6956, + "step": 26756 + }, + { + "epoch": 8.212707182320441, + "grad_norm": 0.17314517498016357, + "learning_rate": 8.148688688204975e-06, + "loss": 1.75, + "step": 26757 + }, + { + "epoch": 8.213014119091467, + "grad_norm": 0.15606027841567993, + "learning_rate": 8.145969189709158e-06, + "loss": 1.6696, + "step": 26758 + }, + { + "epoch": 8.213321055862492, + "grad_norm": 0.17407195270061493, + "learning_rate": 8.143250104839406e-06, + "loss": 1.7279, + "step": 26759 + }, + { + "epoch": 8.213627992633517, + "grad_norm": 0.1557784378528595, + "learning_rate": 8.140531433622589e-06, + "loss": 1.7221, + "step": 26760 + }, + { + "epoch": 8.213934929404543, + "grad_norm": 0.1544533222913742, + "learning_rate": 8.137813176085574e-06, + "loss": 1.6805, + "step": 26761 + }, + { + "epoch": 8.214241866175568, + "grad_norm": 0.1605178564786911, + "learning_rate": 8.135095332255222e-06, + "loss": 1.7783, + "step": 26762 + }, + { + "epoch": 8.214548802946593, + "grad_norm": 0.14513778686523438, + "learning_rate": 8.1323779021584e-06, + "loss": 1.6933, + "step": 26763 + }, + { + "epoch": 8.214855739717619, + "grad_norm": 0.1282239407300949, + "learning_rate": 8.12966088582196e-06, + "loss": 1.6598, + "step": 26764 + }, + { + "epoch": 8.215162676488644, + "grad_norm": 0.1373436003923416, + "learning_rate": 8.126944283272748e-06, + "loss": 1.6227, + "step": 26765 + }, + { + "epoch": 
8.215469613259668, + "grad_norm": 0.1634049266576767, + "learning_rate": 8.124228094537617e-06, + "loss": 1.7346, + "step": 26766 + }, + { + "epoch": 8.215776550030693, + "grad_norm": 0.16928012669086456, + "learning_rate": 8.12151231964341e-06, + "loss": 1.6958, + "step": 26767 + }, + { + "epoch": 8.216083486801718, + "grad_norm": 0.15764811635017395, + "learning_rate": 8.11879695861696e-06, + "loss": 1.6965, + "step": 26768 + }, + { + "epoch": 8.216390423572744, + "grad_norm": 0.1514546275138855, + "learning_rate": 8.11608201148511e-06, + "loss": 1.6804, + "step": 26769 + }, + { + "epoch": 8.216697360343769, + "grad_norm": 0.17304199934005737, + "learning_rate": 8.113367478274686e-06, + "loss": 1.7869, + "step": 26770 + }, + { + "epoch": 8.217004297114794, + "grad_norm": 0.19664239883422852, + "learning_rate": 8.11065335901251e-06, + "loss": 1.7082, + "step": 26771 + }, + { + "epoch": 8.21731123388582, + "grad_norm": 0.13926036655902863, + "learning_rate": 8.107939653725405e-06, + "loss": 1.6758, + "step": 26772 + }, + { + "epoch": 8.217618170656845, + "grad_norm": 0.14624418318271637, + "learning_rate": 8.10522636244021e-06, + "loss": 1.6716, + "step": 26773 + }, + { + "epoch": 8.21792510742787, + "grad_norm": 0.15462076663970947, + "learning_rate": 8.102513485183704e-06, + "loss": 1.6953, + "step": 26774 + }, + { + "epoch": 8.218232044198896, + "grad_norm": 0.21293844282627106, + "learning_rate": 8.099801021982729e-06, + "loss": 1.69, + "step": 26775 + }, + { + "epoch": 8.218538980969921, + "grad_norm": 0.16696035861968994, + "learning_rate": 8.09708897286408e-06, + "loss": 1.721, + "step": 26776 + }, + { + "epoch": 8.218845917740945, + "grad_norm": 0.1741570085287094, + "learning_rate": 8.094377337854553e-06, + "loss": 1.69, + "step": 26777 + }, + { + "epoch": 8.21915285451197, + "grad_norm": 0.17061090469360352, + "learning_rate": 8.091666116980957e-06, + "loss": 1.6886, + "step": 26778 + }, + { + "epoch": 8.219459791282995, + "grad_norm": 
0.16761218011379242, + "learning_rate": 8.088955310270075e-06, + "loss": 1.6951, + "step": 26779 + }, + { + "epoch": 8.21976672805402, + "grad_norm": 0.21173669397830963, + "learning_rate": 8.086244917748703e-06, + "loss": 1.7714, + "step": 26780 + }, + { + "epoch": 8.220073664825046, + "grad_norm": 0.1629040539264679, + "learning_rate": 8.083534939443626e-06, + "loss": 1.6712, + "step": 26781 + }, + { + "epoch": 8.220380601596071, + "grad_norm": 0.14620709419250488, + "learning_rate": 8.080825375381623e-06, + "loss": 1.6638, + "step": 26782 + }, + { + "epoch": 8.220687538367097, + "grad_norm": 0.16511180996894836, + "learning_rate": 8.078116225589477e-06, + "loss": 1.6739, + "step": 26783 + }, + { + "epoch": 8.220994475138122, + "grad_norm": 0.155776247382164, + "learning_rate": 8.075407490093951e-06, + "loss": 1.7098, + "step": 26784 + }, + { + "epoch": 8.221301411909147, + "grad_norm": 0.18273292481899261, + "learning_rate": 8.072699168921826e-06, + "loss": 1.7595, + "step": 26785 + }, + { + "epoch": 8.221608348680173, + "grad_norm": 0.20691648125648499, + "learning_rate": 8.069991262099862e-06, + "loss": 1.7044, + "step": 26786 + }, + { + "epoch": 8.221915285451198, + "grad_norm": 0.13940884172916412, + "learning_rate": 8.06728376965482e-06, + "loss": 1.6651, + "step": 26787 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.1676037758588791, + "learning_rate": 8.064576691613457e-06, + "loss": 1.7215, + "step": 26788 + }, + { + "epoch": 8.222529158993247, + "grad_norm": 0.18815284967422485, + "learning_rate": 8.06187002800251e-06, + "loss": 1.771, + "step": 26789 + }, + { + "epoch": 8.222836095764272, + "grad_norm": 0.16505572199821472, + "learning_rate": 8.059163778848771e-06, + "loss": 1.7072, + "step": 26790 + }, + { + "epoch": 8.223143032535297, + "grad_norm": 0.15086548030376434, + "learning_rate": 8.056457944178936e-06, + "loss": 1.6874, + "step": 26791 + }, + { + "epoch": 8.223449969306323, + "grad_norm": 0.13147135078907013, + "learning_rate": 
8.053752524019792e-06, + "loss": 1.6604, + "step": 26792 + }, + { + "epoch": 8.223756906077348, + "grad_norm": 0.13695500791072845, + "learning_rate": 8.051047518398024e-06, + "loss": 1.6498, + "step": 26793 + }, + { + "epoch": 8.224063842848373, + "grad_norm": 0.16654162108898163, + "learning_rate": 8.048342927340407e-06, + "loss": 1.6993, + "step": 26794 + }, + { + "epoch": 8.224370779619399, + "grad_norm": 0.15318933129310608, + "learning_rate": 8.045638750873652e-06, + "loss": 1.716, + "step": 26795 + }, + { + "epoch": 8.224677716390424, + "grad_norm": 0.17502783238887787, + "learning_rate": 8.04293498902448e-06, + "loss": 1.6953, + "step": 26796 + }, + { + "epoch": 8.22498465316145, + "grad_norm": 0.17295950651168823, + "learning_rate": 8.040231641819623e-06, + "loss": 1.6794, + "step": 26797 + }, + { + "epoch": 8.225291589932475, + "grad_norm": 0.14702807366847992, + "learning_rate": 8.03752870928579e-06, + "loss": 1.6389, + "step": 26798 + }, + { + "epoch": 8.225598526703498, + "grad_norm": 0.21157263219356537, + "learning_rate": 8.034826191449691e-06, + "loss": 1.6817, + "step": 26799 + }, + { + "epoch": 8.225905463474524, + "grad_norm": 0.1675570011138916, + "learning_rate": 8.03212408833804e-06, + "loss": 1.7636, + "step": 26800 + }, + { + "epoch": 8.226212400245549, + "grad_norm": 0.24485285580158234, + "learning_rate": 8.029422399977531e-06, + "loss": 1.7017, + "step": 26801 + }, + { + "epoch": 8.226519337016574, + "grad_norm": 0.15588007867336273, + "learning_rate": 8.026721126394871e-06, + "loss": 1.6781, + "step": 26802 + }, + { + "epoch": 8.2268262737876, + "grad_norm": 0.16810667514801025, + "learning_rate": 8.024020267616756e-06, + "loss": 1.7046, + "step": 26803 + }, + { + "epoch": 8.227133210558625, + "grad_norm": 0.2029539942741394, + "learning_rate": 8.021319823669875e-06, + "loss": 1.6735, + "step": 26804 + }, + { + "epoch": 8.22744014732965, + "grad_norm": 0.18706166744232178, + "learning_rate": 8.018619794580917e-06, + "loss": 1.6818, + 
"step": 26805 + }, + { + "epoch": 8.227747084100676, + "grad_norm": 0.18221300840377808, + "learning_rate": 8.01592018037655e-06, + "loss": 1.7349, + "step": 26806 + }, + { + "epoch": 8.228054020871701, + "grad_norm": 0.20281676948070526, + "learning_rate": 8.013220981083492e-06, + "loss": 1.6942, + "step": 26807 + }, + { + "epoch": 8.228360957642726, + "grad_norm": 0.16217820346355438, + "learning_rate": 8.01052219672837e-06, + "loss": 1.6693, + "step": 26808 + }, + { + "epoch": 8.22866789441375, + "grad_norm": 0.19438619911670685, + "learning_rate": 8.007823827337901e-06, + "loss": 1.7195, + "step": 26809 + }, + { + "epoch": 8.228974831184775, + "grad_norm": 0.229817733168602, + "learning_rate": 8.005125872938707e-06, + "loss": 1.7621, + "step": 26810 + }, + { + "epoch": 8.2292817679558, + "grad_norm": 0.20305906236171722, + "learning_rate": 8.002428333557488e-06, + "loss": 1.7132, + "step": 26811 + }, + { + "epoch": 8.229588704726826, + "grad_norm": 0.16244050860404968, + "learning_rate": 7.999731209220884e-06, + "loss": 1.729, + "step": 26812 + }, + { + "epoch": 8.229895641497851, + "grad_norm": 0.18119513988494873, + "learning_rate": 7.997034499955552e-06, + "loss": 1.7431, + "step": 26813 + }, + { + "epoch": 8.230202578268877, + "grad_norm": 0.1475009173154831, + "learning_rate": 7.99433820578816e-06, + "loss": 1.7229, + "step": 26814 + }, + { + "epoch": 8.230509515039902, + "grad_norm": 0.16200442612171173, + "learning_rate": 7.991642326745314e-06, + "loss": 1.7491, + "step": 26815 + }, + { + "epoch": 8.230816451810927, + "grad_norm": 0.17432551085948944, + "learning_rate": 7.988946862853686e-06, + "loss": 1.6997, + "step": 26816 + }, + { + "epoch": 8.231123388581953, + "grad_norm": 0.2010595202445984, + "learning_rate": 7.986251814139916e-06, + "loss": 1.795, + "step": 26817 + }, + { + "epoch": 8.231430325352978, + "grad_norm": 0.15220746397972107, + "learning_rate": 7.983557180630625e-06, + "loss": 1.6912, + "step": 26818 + }, + { + "epoch": 
8.231737262124003, + "grad_norm": 0.1524961143732071, + "learning_rate": 7.980862962352454e-06, + "loss": 1.6924, + "step": 26819 + }, + { + "epoch": 8.232044198895027, + "grad_norm": 0.16850624978542328, + "learning_rate": 7.978169159332016e-06, + "loss": 1.7111, + "step": 26820 + }, + { + "epoch": 8.232351135666052, + "grad_norm": 0.19621838629245758, + "learning_rate": 7.975475771595947e-06, + "loss": 1.7237, + "step": 26821 + }, + { + "epoch": 8.232658072437077, + "grad_norm": 0.23287613689899445, + "learning_rate": 7.972782799170858e-06, + "loss": 1.7222, + "step": 26822 + }, + { + "epoch": 8.232965009208103, + "grad_norm": 0.15631796419620514, + "learning_rate": 7.970090242083344e-06, + "loss": 1.7252, + "step": 26823 + }, + { + "epoch": 8.233271945979128, + "grad_norm": 0.17921209335327148, + "learning_rate": 7.967398100360062e-06, + "loss": 1.7018, + "step": 26824 + }, + { + "epoch": 8.233578882750153, + "grad_norm": 0.16767734289169312, + "learning_rate": 7.964706374027564e-06, + "loss": 1.7457, + "step": 26825 + }, + { + "epoch": 8.233885819521179, + "grad_norm": 0.15360240638256073, + "learning_rate": 7.9620150631125e-06, + "loss": 1.6886, + "step": 26826 + }, + { + "epoch": 8.234192756292204, + "grad_norm": 0.17534345388412476, + "learning_rate": 7.959324167641413e-06, + "loss": 1.7167, + "step": 26827 + }, + { + "epoch": 8.23449969306323, + "grad_norm": 0.17453409731388092, + "learning_rate": 7.956633687640941e-06, + "loss": 1.7468, + "step": 26828 + }, + { + "epoch": 8.234806629834255, + "grad_norm": 0.1416994333267212, + "learning_rate": 7.953943623137654e-06, + "loss": 1.6991, + "step": 26829 + }, + { + "epoch": 8.23511356660528, + "grad_norm": 0.14629559218883514, + "learning_rate": 7.951253974158147e-06, + "loss": 1.6891, + "step": 26830 + }, + { + "epoch": 8.235420503376304, + "grad_norm": 0.15972918272018433, + "learning_rate": 7.948564740728998e-06, + "loss": 1.711, + "step": 26831 + }, + { + "epoch": 8.235727440147329, + "grad_norm": 
0.184038445353508, + "learning_rate": 7.945875922876761e-06, + "loss": 1.7481, + "step": 26832 + }, + { + "epoch": 8.236034376918354, + "grad_norm": 0.1788245588541031, + "learning_rate": 7.943187520628037e-06, + "loss": 1.7744, + "step": 26833 + }, + { + "epoch": 8.23634131368938, + "grad_norm": 0.18042324483394623, + "learning_rate": 7.940499534009382e-06, + "loss": 1.6905, + "step": 26834 + }, + { + "epoch": 8.236648250460405, + "grad_norm": 0.16115914285182953, + "learning_rate": 7.937811963047364e-06, + "loss": 1.6923, + "step": 26835 + }, + { + "epoch": 8.23695518723143, + "grad_norm": 0.18805812299251556, + "learning_rate": 7.935124807768546e-06, + "loss": 1.7636, + "step": 26836 + }, + { + "epoch": 8.237262124002456, + "grad_norm": 0.14013023674488068, + "learning_rate": 7.932438068199477e-06, + "loss": 1.657, + "step": 26837 + }, + { + "epoch": 8.237569060773481, + "grad_norm": 0.17245794832706451, + "learning_rate": 7.929751744366709e-06, + "loss": 1.7162, + "step": 26838 + }, + { + "epoch": 8.237875997544506, + "grad_norm": 0.20234355330467224, + "learning_rate": 7.927065836296793e-06, + "loss": 1.741, + "step": 26839 + }, + { + "epoch": 8.238182934315532, + "grad_norm": 0.1728539764881134, + "learning_rate": 7.924380344016264e-06, + "loss": 1.7037, + "step": 26840 + }, + { + "epoch": 8.238489871086557, + "grad_norm": 0.20881959795951843, + "learning_rate": 7.921695267551688e-06, + "loss": 1.7446, + "step": 26841 + }, + { + "epoch": 8.23879680785758, + "grad_norm": 0.15921615064144135, + "learning_rate": 7.919010606929562e-06, + "loss": 1.6777, + "step": 26842 + }, + { + "epoch": 8.239103744628606, + "grad_norm": 0.15142741799354553, + "learning_rate": 7.916326362176462e-06, + "loss": 1.6647, + "step": 26843 + }, + { + "epoch": 8.239410681399631, + "grad_norm": 0.14777293801307678, + "learning_rate": 7.913642533318865e-06, + "loss": 1.7008, + "step": 26844 + }, + { + "epoch": 8.239717618170657, + "grad_norm": 0.14506451785564423, + "learning_rate": 
7.910959120383332e-06, + "loss": 1.7156, + "step": 26845 + }, + { + "epoch": 8.240024554941682, + "grad_norm": 0.17617642879486084, + "learning_rate": 7.908276123396369e-06, + "loss": 1.707, + "step": 26846 + }, + { + "epoch": 8.240331491712707, + "grad_norm": 0.1640050709247589, + "learning_rate": 7.905593542384493e-06, + "loss": 1.6965, + "step": 26847 + }, + { + "epoch": 8.240638428483733, + "grad_norm": 0.2035178244113922, + "learning_rate": 7.902911377374229e-06, + "loss": 1.7679, + "step": 26848 + }, + { + "epoch": 8.240945365254758, + "grad_norm": 0.16591937839984894, + "learning_rate": 7.900229628392041e-06, + "loss": 1.705, + "step": 26849 + }, + { + "epoch": 8.241252302025783, + "grad_norm": 0.1770060807466507, + "learning_rate": 7.897548295464474e-06, + "loss": 1.6812, + "step": 26850 + }, + { + "epoch": 8.241559238796809, + "grad_norm": 0.1637604683637619, + "learning_rate": 7.89486737861801e-06, + "loss": 1.718, + "step": 26851 + }, + { + "epoch": 8.241866175567832, + "grad_norm": 0.1458534151315689, + "learning_rate": 7.892186877879148e-06, + "loss": 1.6834, + "step": 26852 + }, + { + "epoch": 8.242173112338858, + "grad_norm": 0.14899462461471558, + "learning_rate": 7.889506793274371e-06, + "loss": 1.6815, + "step": 26853 + }, + { + "epoch": 8.242480049109883, + "grad_norm": 0.16069386899471283, + "learning_rate": 7.88682712483017e-06, + "loss": 1.7522, + "step": 26854 + }, + { + "epoch": 8.242786985880908, + "grad_norm": 0.17499712109565735, + "learning_rate": 7.884147872573034e-06, + "loss": 1.6805, + "step": 26855 + }, + { + "epoch": 8.243093922651934, + "grad_norm": 0.1455364227294922, + "learning_rate": 7.881469036529427e-06, + "loss": 1.6797, + "step": 26856 + }, + { + "epoch": 8.243400859422959, + "grad_norm": 0.2292124629020691, + "learning_rate": 7.878790616725818e-06, + "loss": 1.6923, + "step": 26857 + }, + { + "epoch": 8.243707796193984, + "grad_norm": 0.17365983128547668, + "learning_rate": 7.876112613188713e-06, + "loss": 1.713, + 
"step": 26858 + }, + { + "epoch": 8.24401473296501, + "grad_norm": 0.17498542368412018, + "learning_rate": 7.873435025944525e-06, + "loss": 1.6834, + "step": 26859 + }, + { + "epoch": 8.244321669736035, + "grad_norm": 0.19340896606445312, + "learning_rate": 7.870757855019772e-06, + "loss": 1.7246, + "step": 26860 + }, + { + "epoch": 8.24462860650706, + "grad_norm": 0.16443613171577454, + "learning_rate": 7.868081100440855e-06, + "loss": 1.7217, + "step": 26861 + }, + { + "epoch": 8.244935543278086, + "grad_norm": 0.1470339596271515, + "learning_rate": 7.865404762234268e-06, + "loss": 1.6504, + "step": 26862 + }, + { + "epoch": 8.245242480049109, + "grad_norm": 0.14689552783966064, + "learning_rate": 7.862728840426453e-06, + "loss": 1.7231, + "step": 26863 + }, + { + "epoch": 8.245549416820134, + "grad_norm": 0.25354984402656555, + "learning_rate": 7.860053335043843e-06, + "loss": 1.7951, + "step": 26864 + }, + { + "epoch": 8.24585635359116, + "grad_norm": 0.1774766445159912, + "learning_rate": 7.857378246112896e-06, + "loss": 1.6702, + "step": 26865 + }, + { + "epoch": 8.246163290362185, + "grad_norm": 0.16365554928779602, + "learning_rate": 7.854703573660015e-06, + "loss": 1.6945, + "step": 26866 + }, + { + "epoch": 8.24647022713321, + "grad_norm": 0.15043000876903534, + "learning_rate": 7.852029317711669e-06, + "loss": 1.6341, + "step": 26867 + }, + { + "epoch": 8.246777163904236, + "grad_norm": 0.18268270790576935, + "learning_rate": 7.849355478294274e-06, + "loss": 1.7246, + "step": 26868 + }, + { + "epoch": 8.247084100675261, + "grad_norm": 0.2022860199213028, + "learning_rate": 7.84668205543425e-06, + "loss": 1.7527, + "step": 26869 + }, + { + "epoch": 8.247391037446286, + "grad_norm": 0.15406467020511627, + "learning_rate": 7.844009049158024e-06, + "loss": 1.6678, + "step": 26870 + }, + { + "epoch": 8.247697974217312, + "grad_norm": 0.168084055185318, + "learning_rate": 7.841336459492005e-06, + "loss": 1.7018, + "step": 26871 + }, + { + "epoch": 
8.248004910988337, + "grad_norm": 0.15184715390205383, + "learning_rate": 7.83866428646261e-06, + "loss": 1.6636, + "step": 26872 + }, + { + "epoch": 8.248311847759362, + "grad_norm": 0.18516378104686737, + "learning_rate": 7.835992530096248e-06, + "loss": 1.7746, + "step": 26873 + }, + { + "epoch": 8.248618784530386, + "grad_norm": 0.22552374005317688, + "learning_rate": 7.833321190419313e-06, + "loss": 1.7307, + "step": 26874 + }, + { + "epoch": 8.248925721301411, + "grad_norm": 0.14845159649848938, + "learning_rate": 7.830650267458228e-06, + "loss": 1.6831, + "step": 26875 + }, + { + "epoch": 8.249232658072437, + "grad_norm": 0.17764155566692352, + "learning_rate": 7.827979761239356e-06, + "loss": 1.7569, + "step": 26876 + }, + { + "epoch": 8.249539594843462, + "grad_norm": 0.13525958359241486, + "learning_rate": 7.825309671789128e-06, + "loss": 1.6447, + "step": 26877 + }, + { + "epoch": 8.249846531614487, + "grad_norm": 0.1541098952293396, + "learning_rate": 7.822639999133885e-06, + "loss": 1.7054, + "step": 26878 + }, + { + "epoch": 8.250153468385513, + "grad_norm": 0.1462734043598175, + "learning_rate": 7.819970743300042e-06, + "loss": 1.6801, + "step": 26879 + }, + { + "epoch": 8.250460405156538, + "grad_norm": 0.16271938383579254, + "learning_rate": 7.817301904313979e-06, + "loss": 1.7342, + "step": 26880 + }, + { + "epoch": 8.250767341927563, + "grad_norm": 0.18730363249778748, + "learning_rate": 7.814633482202055e-06, + "loss": 1.7656, + "step": 26881 + }, + { + "epoch": 8.251074278698589, + "grad_norm": 0.1343161165714264, + "learning_rate": 7.811965476990663e-06, + "loss": 1.6738, + "step": 26882 + }, + { + "epoch": 8.251381215469614, + "grad_norm": 0.18782657384872437, + "learning_rate": 7.809297888706135e-06, + "loss": 1.6946, + "step": 26883 + }, + { + "epoch": 8.25168815224064, + "grad_norm": 0.16619306802749634, + "learning_rate": 7.806630717374862e-06, + "loss": 1.7024, + "step": 26884 + }, + { + "epoch": 8.251995089011663, + "grad_norm": 
0.18570290505886078, + "learning_rate": 7.803963963023192e-06, + "loss": 1.7602, + "step": 26885 + }, + { + "epoch": 8.252302025782688, + "grad_norm": 0.19790740311145782, + "learning_rate": 7.80129762567749e-06, + "loss": 1.6965, + "step": 26886 + }, + { + "epoch": 8.252608962553714, + "grad_norm": 0.17269279062747955, + "learning_rate": 7.79863170536409e-06, + "loss": 1.7585, + "step": 26887 + }, + { + "epoch": 8.252915899324739, + "grad_norm": 0.17961835861206055, + "learning_rate": 7.79596620210935e-06, + "loss": 1.6992, + "step": 26888 + }, + { + "epoch": 8.253222836095764, + "grad_norm": 0.15848924219608307, + "learning_rate": 7.793301115939611e-06, + "loss": 1.6849, + "step": 26889 + }, + { + "epoch": 8.25352977286679, + "grad_norm": 0.16328901052474976, + "learning_rate": 7.790636446881205e-06, + "loss": 1.7049, + "step": 26890 + }, + { + "epoch": 8.253836709637815, + "grad_norm": 0.15410196781158447, + "learning_rate": 7.787972194960463e-06, + "loss": 1.6764, + "step": 26891 + }, + { + "epoch": 8.25414364640884, + "grad_norm": 0.15541456639766693, + "learning_rate": 7.78530836020374e-06, + "loss": 1.6692, + "step": 26892 + }, + { + "epoch": 8.254450583179866, + "grad_norm": 0.1663745492696762, + "learning_rate": 7.782644942637318e-06, + "loss": 1.708, + "step": 26893 + }, + { + "epoch": 8.254757519950891, + "grad_norm": 0.2212733030319214, + "learning_rate": 7.779981942287567e-06, + "loss": 1.7978, + "step": 26894 + }, + { + "epoch": 8.255064456721914, + "grad_norm": 0.15269914269447327, + "learning_rate": 7.777319359180756e-06, + "loss": 1.6688, + "step": 26895 + }, + { + "epoch": 8.25537139349294, + "grad_norm": 0.18167565762996674, + "learning_rate": 7.774657193343238e-06, + "loss": 1.7394, + "step": 26896 + }, + { + "epoch": 8.255678330263965, + "grad_norm": 0.18649235367774963, + "learning_rate": 7.771995444801306e-06, + "loss": 1.7438, + "step": 26897 + }, + { + "epoch": 8.25598526703499, + "grad_norm": 0.14753280580043793, + "learning_rate": 
7.769334113581267e-06, + "loss": 1.6624, + "step": 26898 + }, + { + "epoch": 8.256292203806016, + "grad_norm": 0.1815260797739029, + "learning_rate": 7.76667319970943e-06, + "loss": 1.7091, + "step": 26899 + }, + { + "epoch": 8.256599140577041, + "grad_norm": 0.18099220097064972, + "learning_rate": 7.764012703212059e-06, + "loss": 1.7285, + "step": 26900 + }, + { + "epoch": 8.256906077348066, + "grad_norm": 0.15976406633853912, + "learning_rate": 7.76135262411548e-06, + "loss": 1.7038, + "step": 26901 + }, + { + "epoch": 8.257213014119092, + "grad_norm": 0.20424988865852356, + "learning_rate": 7.758692962445974e-06, + "loss": 1.7398, + "step": 26902 + }, + { + "epoch": 8.257519950890117, + "grad_norm": 0.17021317780017853, + "learning_rate": 7.756033718229816e-06, + "loss": 1.7422, + "step": 26903 + }, + { + "epoch": 8.257826887661142, + "grad_norm": 0.2599583566188812, + "learning_rate": 7.753374891493298e-06, + "loss": 1.6943, + "step": 26904 + }, + { + "epoch": 8.258133824432168, + "grad_norm": 0.16305646300315857, + "learning_rate": 7.750716482262693e-06, + "loss": 1.7129, + "step": 26905 + }, + { + "epoch": 8.258440761203191, + "grad_norm": 0.136509507894516, + "learning_rate": 7.74805849056427e-06, + "loss": 1.666, + "step": 26906 + }, + { + "epoch": 8.258747697974217, + "grad_norm": 0.14928071200847626, + "learning_rate": 7.745400916424294e-06, + "loss": 1.6842, + "step": 26907 + }, + { + "epoch": 8.259054634745242, + "grad_norm": 0.20410865545272827, + "learning_rate": 7.74274375986902e-06, + "loss": 1.7376, + "step": 26908 + }, + { + "epoch": 8.259361571516267, + "grad_norm": 0.16844697296619415, + "learning_rate": 7.740087020924746e-06, + "loss": 1.7125, + "step": 26909 + }, + { + "epoch": 8.259668508287293, + "grad_norm": 0.1874905675649643, + "learning_rate": 7.737430699617681e-06, + "loss": 1.7534, + "step": 26910 + }, + { + "epoch": 8.259975445058318, + "grad_norm": 0.15867100656032562, + "learning_rate": 7.734774795974114e-06, + "loss": 1.7329, + 
"step": 26911 + }, + { + "epoch": 8.260282381829343, + "grad_norm": 0.14987660944461823, + "learning_rate": 7.732119310020258e-06, + "loss": 1.7038, + "step": 26912 + }, + { + "epoch": 8.260589318600369, + "grad_norm": 0.259883314371109, + "learning_rate": 7.729464241782381e-06, + "loss": 1.7677, + "step": 26913 + }, + { + "epoch": 8.260896255371394, + "grad_norm": 0.2080366462469101, + "learning_rate": 7.726809591286716e-06, + "loss": 1.7662, + "step": 26914 + }, + { + "epoch": 8.26120319214242, + "grad_norm": 0.1707276701927185, + "learning_rate": 7.724155358559492e-06, + "loss": 1.671, + "step": 26915 + }, + { + "epoch": 8.261510128913443, + "grad_norm": 0.17241668701171875, + "learning_rate": 7.721501543626958e-06, + "loss": 1.7227, + "step": 26916 + }, + { + "epoch": 8.261817065684468, + "grad_norm": 0.18578803539276123, + "learning_rate": 7.718848146515301e-06, + "loss": 1.6962, + "step": 26917 + }, + { + "epoch": 8.262124002455494, + "grad_norm": 0.16692428290843964, + "learning_rate": 7.716195167250778e-06, + "loss": 1.6918, + "step": 26918 + }, + { + "epoch": 8.262430939226519, + "grad_norm": 0.18908677995204926, + "learning_rate": 7.713542605859602e-06, + "loss": 1.7271, + "step": 26919 + }, + { + "epoch": 8.262737875997544, + "grad_norm": 0.2003175914287567, + "learning_rate": 7.710890462367981e-06, + "loss": 1.729, + "step": 26920 + }, + { + "epoch": 8.26304481276857, + "grad_norm": 0.16058455407619476, + "learning_rate": 7.708238736802125e-06, + "loss": 1.671, + "step": 26921 + }, + { + "epoch": 8.263351749539595, + "grad_norm": 0.1803000271320343, + "learning_rate": 7.705587429188244e-06, + "loss": 1.7582, + "step": 26922 + }, + { + "epoch": 8.26365868631062, + "grad_norm": 0.218659445643425, + "learning_rate": 7.70293653955254e-06, + "loss": 1.7431, + "step": 26923 + }, + { + "epoch": 8.263965623081646, + "grad_norm": 0.13701553642749786, + "learning_rate": 7.700286067921204e-06, + "loss": 1.6806, + "step": 26924 + }, + { + "epoch": 
8.264272559852671, + "grad_norm": 0.15342164039611816, + "learning_rate": 7.697636014320436e-06, + "loss": 1.6501, + "step": 26925 + }, + { + "epoch": 8.264579496623696, + "grad_norm": 0.18738442659378052, + "learning_rate": 7.69498637877642e-06, + "loss": 1.7032, + "step": 26926 + }, + { + "epoch": 8.26488643339472, + "grad_norm": 0.14805950224399567, + "learning_rate": 7.692337161315338e-06, + "loss": 1.6641, + "step": 26927 + }, + { + "epoch": 8.265193370165745, + "grad_norm": 0.18155299127101898, + "learning_rate": 7.689688361963398e-06, + "loss": 1.6967, + "step": 26928 + }, + { + "epoch": 8.26550030693677, + "grad_norm": 0.13954955339431763, + "learning_rate": 7.68703998074673e-06, + "loss": 1.6865, + "step": 26929 + }, + { + "epoch": 8.265807243707796, + "grad_norm": 0.1464248150587082, + "learning_rate": 7.684392017691549e-06, + "loss": 1.6702, + "step": 26930 + }, + { + "epoch": 8.266114180478821, + "grad_norm": 0.16407039761543274, + "learning_rate": 7.68174447282401e-06, + "loss": 1.7265, + "step": 26931 + }, + { + "epoch": 8.266421117249847, + "grad_norm": 0.13243085145950317, + "learning_rate": 7.679097346170272e-06, + "loss": 1.67, + "step": 26932 + }, + { + "epoch": 8.266728054020872, + "grad_norm": 0.18284925818443298, + "learning_rate": 7.67645063775651e-06, + "loss": 1.7524, + "step": 26933 + }, + { + "epoch": 8.267034990791897, + "grad_norm": 0.16042175889015198, + "learning_rate": 7.673804347608849e-06, + "loss": 1.7244, + "step": 26934 + }, + { + "epoch": 8.267341927562923, + "grad_norm": 0.18213023245334625, + "learning_rate": 7.67115847575347e-06, + "loss": 1.7241, + "step": 26935 + }, + { + "epoch": 8.267648864333948, + "grad_norm": 0.1590288132429123, + "learning_rate": 7.668513022216517e-06, + "loss": 1.7056, + "step": 26936 + }, + { + "epoch": 8.267955801104973, + "grad_norm": 0.17236095666885376, + "learning_rate": 7.665867987024122e-06, + "loss": 1.7251, + "step": 26937 + }, + { + "epoch": 8.268262737875997, + "grad_norm": 
0.14264018833637238, + "learning_rate": 7.663223370202439e-06, + "loss": 1.6672, + "step": 26938 + }, + { + "epoch": 8.268569674647022, + "grad_norm": 0.15768232941627502, + "learning_rate": 7.660579171777599e-06, + "loss": 1.6846, + "step": 26939 + }, + { + "epoch": 8.268876611418047, + "grad_norm": 0.12978656589984894, + "learning_rate": 7.657935391775727e-06, + "loss": 1.6615, + "step": 26940 + }, + { + "epoch": 8.269183548189073, + "grad_norm": 0.18869580328464508, + "learning_rate": 7.655292030222955e-06, + "loss": 1.7056, + "step": 26941 + }, + { + "epoch": 8.269490484960098, + "grad_norm": 0.16662544012069702, + "learning_rate": 7.652649087145409e-06, + "loss": 1.7559, + "step": 26942 + }, + { + "epoch": 8.269797421731123, + "grad_norm": 0.20138496160507202, + "learning_rate": 7.650006562569201e-06, + "loss": 1.7428, + "step": 26943 + }, + { + "epoch": 8.270104358502149, + "grad_norm": 0.16201090812683105, + "learning_rate": 7.647364456520439e-06, + "loss": 1.7456, + "step": 26944 + }, + { + "epoch": 8.270411295273174, + "grad_norm": 0.16562269628047943, + "learning_rate": 7.644722769025275e-06, + "loss": 1.7282, + "step": 26945 + }, + { + "epoch": 8.2707182320442, + "grad_norm": 0.1434047371149063, + "learning_rate": 7.642081500109754e-06, + "loss": 1.6959, + "step": 26946 + }, + { + "epoch": 8.271025168815225, + "grad_norm": 0.1424918919801712, + "learning_rate": 7.63944064980004e-06, + "loss": 1.7133, + "step": 26947 + }, + { + "epoch": 8.27133210558625, + "grad_norm": 0.23540155589580536, + "learning_rate": 7.636800218122176e-06, + "loss": 1.7156, + "step": 26948 + }, + { + "epoch": 8.271639042357274, + "grad_norm": 0.1890154927968979, + "learning_rate": 7.634160205102292e-06, + "loss": 1.7452, + "step": 26949 + }, + { + "epoch": 8.271945979128299, + "grad_norm": 0.1555023491382599, + "learning_rate": 7.631520610766486e-06, + "loss": 1.7096, + "step": 26950 + }, + { + "epoch": 8.272252915899324, + "grad_norm": 0.16713875532150269, + "learning_rate": 
7.628881435140794e-06, + "loss": 1.6832, + "step": 26951 + }, + { + "epoch": 8.27255985267035, + "grad_norm": 0.18925394117832184, + "learning_rate": 7.626242678251349e-06, + "loss": 1.7755, + "step": 26952 + }, + { + "epoch": 8.272866789441375, + "grad_norm": 0.19905491173267365, + "learning_rate": 7.6236043401242074e-06, + "loss": 1.6915, + "step": 26953 + }, + { + "epoch": 8.2731737262124, + "grad_norm": 0.13694030046463013, + "learning_rate": 7.620966420785447e-06, + "loss": 1.6935, + "step": 26954 + }, + { + "epoch": 8.273480662983426, + "grad_norm": 0.1292782723903656, + "learning_rate": 7.61832892026113e-06, + "loss": 1.6823, + "step": 26955 + }, + { + "epoch": 8.273787599754451, + "grad_norm": 0.15123988687992096, + "learning_rate": 7.615691838577333e-06, + "loss": 1.6807, + "step": 26956 + }, + { + "epoch": 8.274094536525476, + "grad_norm": 0.14225423336029053, + "learning_rate": 7.6130551757601084e-06, + "loss": 1.6616, + "step": 26957 + }, + { + "epoch": 8.274401473296502, + "grad_norm": 0.15328221023082733, + "learning_rate": 7.610418931835517e-06, + "loss": 1.7211, + "step": 26958 + }, + { + "epoch": 8.274708410067525, + "grad_norm": 0.168446883559227, + "learning_rate": 7.6077831068296134e-06, + "loss": 1.7211, + "step": 26959 + }, + { + "epoch": 8.27501534683855, + "grad_norm": 0.1877220869064331, + "learning_rate": 7.6051477007684444e-06, + "loss": 1.7139, + "step": 26960 + }, + { + "epoch": 8.275322283609576, + "grad_norm": 0.14273744821548462, + "learning_rate": 7.602512713678039e-06, + "loss": 1.6996, + "step": 26961 + }, + { + "epoch": 8.275629220380601, + "grad_norm": 0.1611991822719574, + "learning_rate": 7.599878145584477e-06, + "loss": 1.6837, + "step": 26962 + }, + { + "epoch": 8.275936157151627, + "grad_norm": 0.13847516477108002, + "learning_rate": 7.597243996513747e-06, + "loss": 1.6449, + "step": 26963 + }, + { + "epoch": 8.276243093922652, + "grad_norm": 0.16816900670528412, + "learning_rate": 7.59461026649193e-06, + "loss": 1.747, + 
"step": 26964 + }, + { + "epoch": 8.276550030693677, + "grad_norm": 0.15942460298538208, + "learning_rate": 7.5919769555450046e-06, + "loss": 1.7461, + "step": 26965 + }, + { + "epoch": 8.276856967464703, + "grad_norm": 0.16706149280071259, + "learning_rate": 7.589344063699033e-06, + "loss": 1.7136, + "step": 26966 + }, + { + "epoch": 8.277163904235728, + "grad_norm": 0.16727334260940552, + "learning_rate": 7.586711590980028e-06, + "loss": 1.7186, + "step": 26967 + }, + { + "epoch": 8.277470841006753, + "grad_norm": 0.1510261744260788, + "learning_rate": 7.5840795374139795e-06, + "loss": 1.6795, + "step": 26968 + }, + { + "epoch": 8.277777777777779, + "grad_norm": 0.1705521196126938, + "learning_rate": 7.581447903026939e-06, + "loss": 1.6903, + "step": 26969 + }, + { + "epoch": 8.278084714548802, + "grad_norm": 0.15767472982406616, + "learning_rate": 7.57881668784487e-06, + "loss": 1.7264, + "step": 26970 + }, + { + "epoch": 8.278391651319827, + "grad_norm": 0.15771441161632538, + "learning_rate": 7.576185891893805e-06, + "loss": 1.7091, + "step": 26971 + }, + { + "epoch": 8.278698588090853, + "grad_norm": 0.22973434627056122, + "learning_rate": 7.5735555151997425e-06, + "loss": 1.7357, + "step": 26972 + }, + { + "epoch": 8.279005524861878, + "grad_norm": 0.15931910276412964, + "learning_rate": 7.570925557788672e-06, + "loss": 1.7026, + "step": 26973 + }, + { + "epoch": 8.279312461632903, + "grad_norm": 0.1451634019613266, + "learning_rate": 7.568296019686583e-06, + "loss": 1.6824, + "step": 26974 + }, + { + "epoch": 8.279619398403929, + "grad_norm": 0.14617015421390533, + "learning_rate": 7.56566690091946e-06, + "loss": 1.677, + "step": 26975 + }, + { + "epoch": 8.279926335174954, + "grad_norm": 0.14465895295143127, + "learning_rate": 7.5630382015132895e-06, + "loss": 1.7193, + "step": 26976 + }, + { + "epoch": 8.28023327194598, + "grad_norm": 0.1751926839351654, + "learning_rate": 7.560409921494044e-06, + "loss": 1.7366, + "step": 26977 + }, + { + "epoch": 
8.280540208717005, + "grad_norm": 0.1478777974843979, + "learning_rate": 7.557782060887697e-06, + "loss": 1.6948, + "step": 26978 + }, + { + "epoch": 8.28084714548803, + "grad_norm": 0.25690537691116333, + "learning_rate": 7.555154619720245e-06, + "loss": 1.7284, + "step": 26979 + }, + { + "epoch": 8.281154082259055, + "grad_norm": 0.1380864977836609, + "learning_rate": 7.552527598017611e-06, + "loss": 1.6753, + "step": 26980 + }, + { + "epoch": 8.281461019030079, + "grad_norm": 0.21658651530742645, + "learning_rate": 7.5499009958057975e-06, + "loss": 1.8076, + "step": 26981 + }, + { + "epoch": 8.281767955801104, + "grad_norm": 0.16225802898406982, + "learning_rate": 7.547274813110727e-06, + "loss": 1.6716, + "step": 26982 + }, + { + "epoch": 8.28207489257213, + "grad_norm": 0.18264736235141754, + "learning_rate": 7.544649049958375e-06, + "loss": 1.7241, + "step": 26983 + }, + { + "epoch": 8.282381829343155, + "grad_norm": 0.17512252926826477, + "learning_rate": 7.542023706374695e-06, + "loss": 1.6709, + "step": 26984 + }, + { + "epoch": 8.28268876611418, + "grad_norm": 0.16799452900886536, + "learning_rate": 7.5393987823856035e-06, + "loss": 1.7333, + "step": 26985 + }, + { + "epoch": 8.282995702885206, + "grad_norm": 0.1569952517747879, + "learning_rate": 7.5367742780170835e-06, + "loss": 1.6701, + "step": 26986 + }, + { + "epoch": 8.283302639656231, + "grad_norm": 0.17452387511730194, + "learning_rate": 7.534150193295026e-06, + "loss": 1.6843, + "step": 26987 + }, + { + "epoch": 8.283609576427256, + "grad_norm": 0.1564214676618576, + "learning_rate": 7.531526528245392e-06, + "loss": 1.7154, + "step": 26988 + }, + { + "epoch": 8.283916513198282, + "grad_norm": 0.14093104004859924, + "learning_rate": 7.528903282894107e-06, + "loss": 1.6448, + "step": 26989 + }, + { + "epoch": 8.284223449969307, + "grad_norm": 0.2950015664100647, + "learning_rate": 7.526280457267093e-06, + "loss": 1.7657, + "step": 26990 + }, + { + "epoch": 8.284530386740332, + "grad_norm": 
0.1342417150735855, + "learning_rate": 7.5236580513902756e-06, + "loss": 1.6761, + "step": 26991 + }, + { + "epoch": 8.284837323511356, + "grad_norm": 0.16559085249900818, + "learning_rate": 7.52103606528956e-06, + "loss": 1.7029, + "step": 26992 + }, + { + "epoch": 8.285144260282381, + "grad_norm": 0.14937730133533478, + "learning_rate": 7.5184144989908665e-06, + "loss": 1.6848, + "step": 26993 + }, + { + "epoch": 8.285451197053407, + "grad_norm": 0.14847339689731598, + "learning_rate": 7.515793352520095e-06, + "loss": 1.6735, + "step": 26994 + }, + { + "epoch": 8.285758133824432, + "grad_norm": 0.1866399198770523, + "learning_rate": 7.513172625903148e-06, + "loss": 1.6553, + "step": 26995 + }, + { + "epoch": 8.286065070595457, + "grad_norm": 0.15781863033771515, + "learning_rate": 7.510552319165953e-06, + "loss": 1.699, + "step": 26996 + }, + { + "epoch": 8.286372007366483, + "grad_norm": 0.1402381956577301, + "learning_rate": 7.507932432334358e-06, + "loss": 1.6778, + "step": 26997 + }, + { + "epoch": 8.286678944137508, + "grad_norm": 0.16515657305717468, + "learning_rate": 7.505312965434308e-06, + "loss": 1.6834, + "step": 26998 + }, + { + "epoch": 8.286985880908533, + "grad_norm": 0.16752316057682037, + "learning_rate": 7.502693918491638e-06, + "loss": 1.7714, + "step": 26999 + }, + { + "epoch": 8.287292817679559, + "grad_norm": 0.17935164272785187, + "learning_rate": 7.500075291532266e-06, + "loss": 1.6858, + "step": 27000 + }, + { + "epoch": 8.287599754450584, + "grad_norm": 0.1805913746356964, + "learning_rate": 7.497457084582065e-06, + "loss": 1.7451, + "step": 27001 + }, + { + "epoch": 8.287906691221608, + "grad_norm": 0.15834343433380127, + "learning_rate": 7.494839297666889e-06, + "loss": 1.6675, + "step": 27002 + }, + { + "epoch": 8.288213627992633, + "grad_norm": 0.18627049028873444, + "learning_rate": 7.492221930812648e-06, + "loss": 1.7207, + "step": 27003 + }, + { + "epoch": 8.288520564763658, + "grad_norm": 0.15027324855327606, + "learning_rate": 
7.489604984045157e-06, + "loss": 1.686, + "step": 27004 + }, + { + "epoch": 8.288827501534684, + "grad_norm": 0.14771342277526855, + "learning_rate": 7.48698845739032e-06, + "loss": 1.6647, + "step": 27005 + }, + { + "epoch": 8.289134438305709, + "grad_norm": 0.14141151309013367, + "learning_rate": 7.48437235087398e-06, + "loss": 1.7005, + "step": 27006 + }, + { + "epoch": 8.289441375076734, + "grad_norm": 0.14843317866325378, + "learning_rate": 7.481756664521994e-06, + "loss": 1.6768, + "step": 27007 + }, + { + "epoch": 8.28974831184776, + "grad_norm": 0.21505968272686005, + "learning_rate": 7.479141398360206e-06, + "loss": 1.764, + "step": 27008 + }, + { + "epoch": 8.290055248618785, + "grad_norm": 0.1906919926404953, + "learning_rate": 7.476526552414464e-06, + "loss": 1.7079, + "step": 27009 + }, + { + "epoch": 8.29036218538981, + "grad_norm": 0.15975503623485565, + "learning_rate": 7.473912126710614e-06, + "loss": 1.7035, + "step": 27010 + }, + { + "epoch": 8.290669122160836, + "grad_norm": 0.16221746802330017, + "learning_rate": 7.471298121274489e-06, + "loss": 1.6707, + "step": 27011 + }, + { + "epoch": 8.29097605893186, + "grad_norm": 0.17168673872947693, + "learning_rate": 7.468684536131909e-06, + "loss": 1.7119, + "step": 27012 + }, + { + "epoch": 8.291282995702884, + "grad_norm": 0.15114913880825043, + "learning_rate": 7.466071371308742e-06, + "loss": 1.6867, + "step": 27013 + }, + { + "epoch": 8.29158993247391, + "grad_norm": 0.20300740003585815, + "learning_rate": 7.463458626830766e-06, + "loss": 1.7578, + "step": 27014 + }, + { + "epoch": 8.291896869244935, + "grad_norm": 0.1570715457201004, + "learning_rate": 7.460846302723845e-06, + "loss": 1.6588, + "step": 27015 + }, + { + "epoch": 8.29220380601596, + "grad_norm": 0.21273213624954224, + "learning_rate": 7.458234399013747e-06, + "loss": 1.7467, + "step": 27016 + }, + { + "epoch": 8.292510742786986, + "grad_norm": 0.16550743579864502, + "learning_rate": 7.455622915726324e-06, + "loss": 1.699, + 
"step": 27017 + }, + { + "epoch": 8.292817679558011, + "grad_norm": 0.20360049605369568, + "learning_rate": 7.453011852887387e-06, + "loss": 1.7572, + "step": 27018 + }, + { + "epoch": 8.293124616329036, + "grad_norm": 0.2043008953332901, + "learning_rate": 7.4504012105227004e-06, + "loss": 1.7181, + "step": 27019 + }, + { + "epoch": 8.293431553100062, + "grad_norm": 0.18581026792526245, + "learning_rate": 7.44779098865811e-06, + "loss": 1.742, + "step": 27020 + }, + { + "epoch": 8.293738489871087, + "grad_norm": 0.18011118471622467, + "learning_rate": 7.445181187319367e-06, + "loss": 1.7329, + "step": 27021 + }, + { + "epoch": 8.294045426642112, + "grad_norm": 0.18868795037269592, + "learning_rate": 7.442571806532295e-06, + "loss": 1.7289, + "step": 27022 + }, + { + "epoch": 8.294352363413138, + "grad_norm": 0.15835118293762207, + "learning_rate": 7.439962846322673e-06, + "loss": 1.6878, + "step": 27023 + }, + { + "epoch": 8.294659300184161, + "grad_norm": 0.23331916332244873, + "learning_rate": 7.437354306716282e-06, + "loss": 1.7144, + "step": 27024 + }, + { + "epoch": 8.294966236955187, + "grad_norm": 0.18101559579372406, + "learning_rate": 7.434746187738906e-06, + "loss": 1.7452, + "step": 27025 + }, + { + "epoch": 8.295273173726212, + "grad_norm": 0.16906292736530304, + "learning_rate": 7.432138489416318e-06, + "loss": 1.6772, + "step": 27026 + }, + { + "epoch": 8.295580110497237, + "grad_norm": 0.20603033900260925, + "learning_rate": 7.429531211774282e-06, + "loss": 1.7622, + "step": 27027 + }, + { + "epoch": 8.295887047268263, + "grad_norm": 0.19412389397621155, + "learning_rate": 7.426924354838571e-06, + "loss": 1.6973, + "step": 27028 + }, + { + "epoch": 8.296193984039288, + "grad_norm": 0.1702510118484497, + "learning_rate": 7.424317918634938e-06, + "loss": 1.7119, + "step": 27029 + }, + { + "epoch": 8.296500920810313, + "grad_norm": 0.1476033478975296, + "learning_rate": 7.421711903189171e-06, + "loss": 1.6961, + "step": 27030 + }, + { + "epoch": 
8.296807857581339, + "grad_norm": 0.16404536366462708, + "learning_rate": 7.419106308526979e-06, + "loss": 1.6928, + "step": 27031 + }, + { + "epoch": 8.297114794352364, + "grad_norm": 0.15021127462387085, + "learning_rate": 7.416501134674159e-06, + "loss": 1.642, + "step": 27032 + }, + { + "epoch": 8.29742173112339, + "grad_norm": 0.20728830993175507, + "learning_rate": 7.4138963816564266e-06, + "loss": 1.7142, + "step": 27033 + }, + { + "epoch": 8.297728667894415, + "grad_norm": 0.16802074015140533, + "learning_rate": 7.411292049499513e-06, + "loss": 1.6983, + "step": 27034 + }, + { + "epoch": 8.298035604665438, + "grad_norm": 0.15957842767238617, + "learning_rate": 7.408688138229198e-06, + "loss": 1.6535, + "step": 27035 + }, + { + "epoch": 8.298342541436464, + "grad_norm": 0.17618007957935333, + "learning_rate": 7.40608464787117e-06, + "loss": 1.7024, + "step": 27036 + }, + { + "epoch": 8.298649478207489, + "grad_norm": 0.14615842700004578, + "learning_rate": 7.4034815784511994e-06, + "loss": 1.7188, + "step": 27037 + }, + { + "epoch": 8.298956414978514, + "grad_norm": 0.16748850047588348, + "learning_rate": 7.40087892999497e-06, + "loss": 1.6763, + "step": 27038 + }, + { + "epoch": 8.29926335174954, + "grad_norm": 0.15271888673305511, + "learning_rate": 7.398276702528229e-06, + "loss": 1.6766, + "step": 27039 + }, + { + "epoch": 8.299570288520565, + "grad_norm": 0.21336700022220612, + "learning_rate": 7.395674896076693e-06, + "loss": 1.7113, + "step": 27040 + }, + { + "epoch": 8.29987722529159, + "grad_norm": 0.15377891063690186, + "learning_rate": 7.3930735106660655e-06, + "loss": 1.7083, + "step": 27041 + }, + { + "epoch": 8.300184162062616, + "grad_norm": 0.1341678500175476, + "learning_rate": 7.390472546322058e-06, + "loss": 1.6411, + "step": 27042 + }, + { + "epoch": 8.300491098833641, + "grad_norm": 0.1506323516368866, + "learning_rate": 7.3878720030703785e-06, + "loss": 1.6784, + "step": 27043 + }, + { + "epoch": 8.300798035604666, + "grad_norm": 
0.20630323886871338, + "learning_rate": 7.385271880936723e-06, + "loss": 1.7296, + "step": 27044 + }, + { + "epoch": 8.30110497237569, + "grad_norm": 0.1514928787946701, + "learning_rate": 7.382672179946787e-06, + "loss": 1.631, + "step": 27045 + }, + { + "epoch": 8.301411909146715, + "grad_norm": 0.21939171850681305, + "learning_rate": 7.3800729001262505e-06, + "loss": 1.7484, + "step": 27046 + }, + { + "epoch": 8.30171884591774, + "grad_norm": 0.13756778836250305, + "learning_rate": 7.377474041500837e-06, + "loss": 1.71, + "step": 27047 + }, + { + "epoch": 8.302025782688766, + "grad_norm": 0.23617541790008545, + "learning_rate": 7.374875604096188e-06, + "loss": 1.7366, + "step": 27048 + }, + { + "epoch": 8.302332719459791, + "grad_norm": 0.236005499958992, + "learning_rate": 7.37227758793802e-06, + "loss": 1.7263, + "step": 27049 + }, + { + "epoch": 8.302639656230816, + "grad_norm": 0.28162217140197754, + "learning_rate": 7.369679993051981e-06, + "loss": 1.7159, + "step": 27050 + }, + { + "epoch": 8.302946593001842, + "grad_norm": 0.18274159729480743, + "learning_rate": 7.3670828194637385e-06, + "loss": 1.695, + "step": 27051 + }, + { + "epoch": 8.303253529772867, + "grad_norm": 0.14628291130065918, + "learning_rate": 7.364486067198994e-06, + "loss": 1.712, + "step": 27052 + }, + { + "epoch": 8.303560466543892, + "grad_norm": 0.16443926095962524, + "learning_rate": 7.361889736283362e-06, + "loss": 1.7003, + "step": 27053 + }, + { + "epoch": 8.303867403314918, + "grad_norm": 0.24396912753582, + "learning_rate": 7.3592938267425525e-06, + "loss": 1.7882, + "step": 27054 + }, + { + "epoch": 8.304174340085943, + "grad_norm": 0.16564849019050598, + "learning_rate": 7.356698338602169e-06, + "loss": 1.7095, + "step": 27055 + }, + { + "epoch": 8.304481276856967, + "grad_norm": 0.17034487426280975, + "learning_rate": 7.3541032718879024e-06, + "loss": 1.7198, + "step": 27056 + }, + { + "epoch": 8.304788213627992, + "grad_norm": 0.15630117058753967, + "learning_rate": 
7.351508626625381e-06, + "loss": 1.6642, + "step": 27057 + }, + { + "epoch": 8.305095150399017, + "grad_norm": 0.17507393658161163, + "learning_rate": 7.348914402840246e-06, + "loss": 1.7295, + "step": 27058 + }, + { + "epoch": 8.305402087170043, + "grad_norm": 0.13145345449447632, + "learning_rate": 7.346320600558138e-06, + "loss": 1.6654, + "step": 27059 + }, + { + "epoch": 8.305709023941068, + "grad_norm": 0.17676126956939697, + "learning_rate": 7.343727219804692e-06, + "loss": 1.7347, + "step": 27060 + }, + { + "epoch": 8.306015960712093, + "grad_norm": 0.16341568529605865, + "learning_rate": 7.341134260605536e-06, + "loss": 1.6905, + "step": 27061 + }, + { + "epoch": 8.306322897483119, + "grad_norm": 0.18549038469791412, + "learning_rate": 7.338541722986292e-06, + "loss": 1.7508, + "step": 27062 + }, + { + "epoch": 8.306629834254144, + "grad_norm": 0.15528292953968048, + "learning_rate": 7.335949606972575e-06, + "loss": 1.7261, + "step": 27063 + }, + { + "epoch": 8.30693677102517, + "grad_norm": 0.14363928139209747, + "learning_rate": 7.333357912590028e-06, + "loss": 1.6494, + "step": 27064 + }, + { + "epoch": 8.307243707796195, + "grad_norm": 0.33007505536079407, + "learning_rate": 7.3307666398642285e-06, + "loss": 1.7844, + "step": 27065 + }, + { + "epoch": 8.307550644567218, + "grad_norm": 0.18550951778888702, + "learning_rate": 7.328175788820818e-06, + "loss": 1.7699, + "step": 27066 + }, + { + "epoch": 8.307857581338244, + "grad_norm": 0.1789010763168335, + "learning_rate": 7.325585359485382e-06, + "loss": 1.6903, + "step": 27067 + }, + { + "epoch": 8.308164518109269, + "grad_norm": 0.17079691588878632, + "learning_rate": 7.322995351883505e-06, + "loss": 1.6704, + "step": 27068 + }, + { + "epoch": 8.308471454880294, + "grad_norm": 0.17510086297988892, + "learning_rate": 7.320405766040828e-06, + "loss": 1.7222, + "step": 27069 + }, + { + "epoch": 8.30877839165132, + "grad_norm": 0.1619461178779602, + "learning_rate": 7.317816601982896e-06, + "loss": 
1.6573, + "step": 27070 + }, + { + "epoch": 8.309085328422345, + "grad_norm": 0.15886032581329346, + "learning_rate": 7.315227859735335e-06, + "loss": 1.7281, + "step": 27071 + }, + { + "epoch": 8.30939226519337, + "grad_norm": 0.1636921614408493, + "learning_rate": 7.31263953932369e-06, + "loss": 1.7061, + "step": 27072 + }, + { + "epoch": 8.309699201964396, + "grad_norm": 0.16119423508644104, + "learning_rate": 7.3100516407735745e-06, + "loss": 1.7102, + "step": 27073 + }, + { + "epoch": 8.310006138735421, + "grad_norm": 0.2373964637517929, + "learning_rate": 7.3074641641105445e-06, + "loss": 1.7585, + "step": 27074 + }, + { + "epoch": 8.310313075506446, + "grad_norm": 0.17123030126094818, + "learning_rate": 7.304877109360181e-06, + "loss": 1.737, + "step": 27075 + }, + { + "epoch": 8.310620012277472, + "grad_norm": 0.14955085515975952, + "learning_rate": 7.302290476548046e-06, + "loss": 1.6676, + "step": 27076 + }, + { + "epoch": 8.310926949048495, + "grad_norm": 0.19933636486530304, + "learning_rate": 7.299704265699703e-06, + "loss": 1.6926, + "step": 27077 + }, + { + "epoch": 8.31123388581952, + "grad_norm": 0.15449854731559753, + "learning_rate": 7.297118476840709e-06, + "loss": 1.6826, + "step": 27078 + }, + { + "epoch": 8.311540822590546, + "grad_norm": 0.16641317307949066, + "learning_rate": 7.294533109996621e-06, + "loss": 1.7117, + "step": 27079 + }, + { + "epoch": 8.311847759361571, + "grad_norm": 0.18311664462089539, + "learning_rate": 7.291948165192974e-06, + "loss": 1.7376, + "step": 27080 + }, + { + "epoch": 8.312154696132596, + "grad_norm": 0.17437715828418732, + "learning_rate": 7.289363642455349e-06, + "loss": 1.7373, + "step": 27081 + }, + { + "epoch": 8.312461632903622, + "grad_norm": 0.16356121003627777, + "learning_rate": 7.286779541809241e-06, + "loss": 1.6847, + "step": 27082 + }, + { + "epoch": 8.312768569674647, + "grad_norm": 0.182320237159729, + "learning_rate": 7.284195863280241e-06, + "loss": 1.6853, + "step": 27083 + }, + { + 
"epoch": 8.313075506445673, + "grad_norm": 0.1541421264410019, + "learning_rate": 7.281612606893839e-06, + "loss": 1.7121, + "step": 27084 + }, + { + "epoch": 8.313382443216698, + "grad_norm": 0.16640879213809967, + "learning_rate": 7.2790297726755716e-06, + "loss": 1.6914, + "step": 27085 + }, + { + "epoch": 8.313689379987723, + "grad_norm": 0.18245746195316315, + "learning_rate": 7.27644736065099e-06, + "loss": 1.7544, + "step": 27086 + }, + { + "epoch": 8.313996316758749, + "grad_norm": 0.13833735883235931, + "learning_rate": 7.273865370845573e-06, + "loss": 1.6519, + "step": 27087 + }, + { + "epoch": 8.314303253529772, + "grad_norm": 0.19455993175506592, + "learning_rate": 7.271283803284889e-06, + "loss": 1.7017, + "step": 27088 + }, + { + "epoch": 8.314610190300797, + "grad_norm": 0.16859467327594757, + "learning_rate": 7.268702657994397e-06, + "loss": 1.7173, + "step": 27089 + }, + { + "epoch": 8.314917127071823, + "grad_norm": 0.1667163074016571, + "learning_rate": 7.266121934999642e-06, + "loss": 1.731, + "step": 27090 + }, + { + "epoch": 8.315224063842848, + "grad_norm": 0.161153182387352, + "learning_rate": 7.263541634326115e-06, + "loss": 1.7223, + "step": 27091 + }, + { + "epoch": 8.315531000613873, + "grad_norm": 0.17027638852596283, + "learning_rate": 7.2609617559993234e-06, + "loss": 1.6741, + "step": 27092 + }, + { + "epoch": 8.315837937384899, + "grad_norm": 0.1516280472278595, + "learning_rate": 7.2583823000447526e-06, + "loss": 1.6974, + "step": 27093 + }, + { + "epoch": 8.316144874155924, + "grad_norm": 0.18429140746593475, + "learning_rate": 7.2558032664879035e-06, + "loss": 1.7003, + "step": 27094 + }, + { + "epoch": 8.31645181092695, + "grad_norm": 0.13946834206581116, + "learning_rate": 7.253224655354257e-06, + "loss": 1.7349, + "step": 27095 + }, + { + "epoch": 8.316758747697975, + "grad_norm": 0.17642852663993835, + "learning_rate": 7.250646466669303e-06, + "loss": 1.7131, + "step": 27096 + }, + { + "epoch": 8.317065684469, + "grad_norm": 
0.1700926125049591, + "learning_rate": 7.2480687004585155e-06, + "loss": 1.7496, + "step": 27097 + }, + { + "epoch": 8.317372621240025, + "grad_norm": 0.19472727179527283, + "learning_rate": 7.245491356747369e-06, + "loss": 1.73, + "step": 27098 + }, + { + "epoch": 8.317679558011049, + "grad_norm": 0.16857488453388214, + "learning_rate": 7.242914435561327e-06, + "loss": 1.7275, + "step": 27099 + }, + { + "epoch": 8.317986494782074, + "grad_norm": 0.18735560774803162, + "learning_rate": 7.240337936925884e-06, + "loss": 1.7236, + "step": 27100 + }, + { + "epoch": 8.3182934315531, + "grad_norm": 0.2252741903066635, + "learning_rate": 7.237761860866476e-06, + "loss": 1.7347, + "step": 27101 + }, + { + "epoch": 8.318600368324125, + "grad_norm": 0.16848546266555786, + "learning_rate": 7.2351862074085674e-06, + "loss": 1.6956, + "step": 27102 + }, + { + "epoch": 8.31890730509515, + "grad_norm": 0.13781076669692993, + "learning_rate": 7.232610976577614e-06, + "loss": 1.7018, + "step": 27103 + }, + { + "epoch": 8.319214241866176, + "grad_norm": 0.13122199475765228, + "learning_rate": 7.230036168399052e-06, + "loss": 1.652, + "step": 27104 + }, + { + "epoch": 8.319521178637201, + "grad_norm": 0.16110749542713165, + "learning_rate": 7.22746178289837e-06, + "loss": 1.6778, + "step": 27105 + }, + { + "epoch": 8.319828115408226, + "grad_norm": 0.19378480315208435, + "learning_rate": 7.224887820100951e-06, + "loss": 1.7753, + "step": 27106 + }, + { + "epoch": 8.320135052179252, + "grad_norm": 0.18464957177639008, + "learning_rate": 7.2223142800322775e-06, + "loss": 1.7455, + "step": 27107 + }, + { + "epoch": 8.320441988950277, + "grad_norm": 0.16992080211639404, + "learning_rate": 7.2197411627177636e-06, + "loss": 1.731, + "step": 27108 + }, + { + "epoch": 8.3207489257213, + "grad_norm": 0.16602276265621185, + "learning_rate": 7.2171684681828444e-06, + "loss": 1.7236, + "step": 27109 + }, + { + "epoch": 8.321055862492326, + "grad_norm": 0.16713769733905792, + "learning_rate": 
7.214596196452944e-06, + "loss": 1.6636, + "step": 27110 + }, + { + "epoch": 8.321362799263351, + "grad_norm": 0.14015473425388336, + "learning_rate": 7.212024347553475e-06, + "loss": 1.6785, + "step": 27111 + }, + { + "epoch": 8.321669736034377, + "grad_norm": 0.25452539324760437, + "learning_rate": 7.209452921509868e-06, + "loss": 1.7434, + "step": 27112 + }, + { + "epoch": 8.321976672805402, + "grad_norm": 0.14998821914196014, + "learning_rate": 7.206881918347524e-06, + "loss": 1.6973, + "step": 27113 + }, + { + "epoch": 8.322283609576427, + "grad_norm": 0.16751673817634583, + "learning_rate": 7.2043113380918515e-06, + "loss": 1.7364, + "step": 27114 + }, + { + "epoch": 8.322590546347453, + "grad_norm": 0.14287763833999634, + "learning_rate": 7.201741180768262e-06, + "loss": 1.6576, + "step": 27115 + }, + { + "epoch": 8.322897483118478, + "grad_norm": 0.14396314322948456, + "learning_rate": 7.199171446402136e-06, + "loss": 1.6541, + "step": 27116 + }, + { + "epoch": 8.323204419889503, + "grad_norm": 0.1835038661956787, + "learning_rate": 7.196602135018915e-06, + "loss": 1.6925, + "step": 27117 + }, + { + "epoch": 8.323511356660529, + "grad_norm": 0.15047648549079895, + "learning_rate": 7.194033246643939e-06, + "loss": 1.7234, + "step": 27118 + }, + { + "epoch": 8.323818293431554, + "grad_norm": 0.1479605883359909, + "learning_rate": 7.19146478130262e-06, + "loss": 1.6702, + "step": 27119 + }, + { + "epoch": 8.324125230202577, + "grad_norm": 0.15971851348876953, + "learning_rate": 7.188896739020335e-06, + "loss": 1.7189, + "step": 27120 + }, + { + "epoch": 8.324432166973603, + "grad_norm": 0.1598353087902069, + "learning_rate": 7.186329119822455e-06, + "loss": 1.7015, + "step": 27121 + }, + { + "epoch": 8.324739103744628, + "grad_norm": 0.18845009803771973, + "learning_rate": 7.183761923734389e-06, + "loss": 1.6771, + "step": 27122 + }, + { + "epoch": 8.325046040515653, + "grad_norm": 0.15288181602954865, + "learning_rate": 7.181195150781456e-06, + "loss": 1.69, 
+ "step": 27123 + }, + { + "epoch": 8.325352977286679, + "grad_norm": 0.16455978155136108, + "learning_rate": 7.178628800989073e-06, + "loss": 1.74, + "step": 27124 + }, + { + "epoch": 8.325659914057704, + "grad_norm": 0.23335149884223938, + "learning_rate": 7.176062874382561e-06, + "loss": 1.7591, + "step": 27125 + }, + { + "epoch": 8.32596685082873, + "grad_norm": 0.16988953948020935, + "learning_rate": 7.173497370987303e-06, + "loss": 1.744, + "step": 27126 + }, + { + "epoch": 8.326273787599755, + "grad_norm": 0.16113093495368958, + "learning_rate": 7.170932290828647e-06, + "loss": 1.6717, + "step": 27127 + }, + { + "epoch": 8.32658072437078, + "grad_norm": 0.16654139757156372, + "learning_rate": 7.168367633931938e-06, + "loss": 1.6797, + "step": 27128 + }, + { + "epoch": 8.326887661141805, + "grad_norm": 0.16671477258205414, + "learning_rate": 7.165803400322524e-06, + "loss": 1.7299, + "step": 27129 + }, + { + "epoch": 8.32719459791283, + "grad_norm": 0.18269041180610657, + "learning_rate": 7.16323959002575e-06, + "loss": 1.7371, + "step": 27130 + }, + { + "epoch": 8.327501534683854, + "grad_norm": 0.17919829487800598, + "learning_rate": 7.160676203066946e-06, + "loss": 1.7158, + "step": 27131 + }, + { + "epoch": 8.32780847145488, + "grad_norm": 0.17928342521190643, + "learning_rate": 7.158113239471453e-06, + "loss": 1.6964, + "step": 27132 + }, + { + "epoch": 8.328115408225905, + "grad_norm": 0.19797661900520325, + "learning_rate": 7.155550699264585e-06, + "loss": 1.7244, + "step": 27133 + }, + { + "epoch": 8.32842234499693, + "grad_norm": 0.15853050351142883, + "learning_rate": 7.1529885824716926e-06, + "loss": 1.6674, + "step": 27134 + }, + { + "epoch": 8.328729281767956, + "grad_norm": 0.20006918907165527, + "learning_rate": 7.150426889118078e-06, + "loss": 1.7601, + "step": 27135 + }, + { + "epoch": 8.329036218538981, + "grad_norm": 0.18851491808891296, + "learning_rate": 7.147865619229055e-06, + "loss": 1.7139, + "step": 27136 + }, + { + "epoch": 
8.329343155310006, + "grad_norm": 0.2384614497423172, + "learning_rate": 7.145304772829936e-06, + "loss": 1.7343, + "step": 27137 + }, + { + "epoch": 8.329650092081032, + "grad_norm": 0.15243887901306152, + "learning_rate": 7.142744349946029e-06, + "loss": 1.7071, + "step": 27138 + }, + { + "epoch": 8.329957028852057, + "grad_norm": 0.20257025957107544, + "learning_rate": 7.140184350602663e-06, + "loss": 1.7255, + "step": 27139 + }, + { + "epoch": 8.330263965623082, + "grad_norm": 0.18863585591316223, + "learning_rate": 7.137624774825091e-06, + "loss": 1.6798, + "step": 27140 + }, + { + "epoch": 8.330570902394108, + "grad_norm": 0.19403952360153198, + "learning_rate": 7.135065622638659e-06, + "loss": 1.7354, + "step": 27141 + }, + { + "epoch": 8.330877839165131, + "grad_norm": 0.17294439673423767, + "learning_rate": 7.132506894068608e-06, + "loss": 1.6935, + "step": 27142 + }, + { + "epoch": 8.331184775936157, + "grad_norm": 0.20410899817943573, + "learning_rate": 7.129948589140262e-06, + "loss": 1.7625, + "step": 27143 + }, + { + "epoch": 8.331491712707182, + "grad_norm": 0.1795405000448227, + "learning_rate": 7.127390707878889e-06, + "loss": 1.6756, + "step": 27144 + }, + { + "epoch": 8.331798649478207, + "grad_norm": 0.1823110431432724, + "learning_rate": 7.12483325030977e-06, + "loss": 1.6844, + "step": 27145 + }, + { + "epoch": 8.332105586249233, + "grad_norm": 0.18655838072299957, + "learning_rate": 7.122276216458179e-06, + "loss": 1.7289, + "step": 27146 + }, + { + "epoch": 8.332412523020258, + "grad_norm": 0.16892722249031067, + "learning_rate": 7.119719606349384e-06, + "loss": 1.7003, + "step": 27147 + }, + { + "epoch": 8.332719459791283, + "grad_norm": 0.17768113315105438, + "learning_rate": 7.117163420008654e-06, + "loss": 1.6859, + "step": 27148 + }, + { + "epoch": 8.333026396562309, + "grad_norm": 0.14221824705600739, + "learning_rate": 7.114607657461253e-06, + "loss": 1.6752, + "step": 27149 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 
0.17095401883125305, + "learning_rate": 7.112052318732421e-06, + "loss": 1.7354, + "step": 27150 + }, + { + "epoch": 8.33364027010436, + "grad_norm": 0.1910656839609146, + "learning_rate": 7.109497403847448e-06, + "loss": 1.7124, + "step": 27151 + }, + { + "epoch": 8.333947206875383, + "grad_norm": 0.1857171505689621, + "learning_rate": 7.106942912831549e-06, + "loss": 1.7716, + "step": 27152 + }, + { + "epoch": 8.334254143646408, + "grad_norm": 0.16951163113117218, + "learning_rate": 7.104388845709981e-06, + "loss": 1.7508, + "step": 27153 + }, + { + "epoch": 8.334561080417433, + "grad_norm": 0.18096883594989777, + "learning_rate": 7.101835202507983e-06, + "loss": 1.7064, + "step": 27154 + }, + { + "epoch": 8.334868017188459, + "grad_norm": 0.19499589502811432, + "learning_rate": 7.099281983250783e-06, + "loss": 1.712, + "step": 27155 + }, + { + "epoch": 8.335174953959484, + "grad_norm": 0.23200182616710663, + "learning_rate": 7.096729187963647e-06, + "loss": 1.8253, + "step": 27156 + }, + { + "epoch": 8.33548189073051, + "grad_norm": 0.3447387218475342, + "learning_rate": 7.094176816671755e-06, + "loss": 1.7531, + "step": 27157 + }, + { + "epoch": 8.335788827501535, + "grad_norm": 0.14633947610855103, + "learning_rate": 7.091624869400376e-06, + "loss": 1.6866, + "step": 27158 + }, + { + "epoch": 8.33609576427256, + "grad_norm": 0.19512905180454254, + "learning_rate": 7.0890733461746905e-06, + "loss": 1.6853, + "step": 27159 + }, + { + "epoch": 8.336402701043585, + "grad_norm": 0.20525458455085754, + "learning_rate": 7.086522247019944e-06, + "loss": 1.69, + "step": 27160 + }, + { + "epoch": 8.33670963781461, + "grad_norm": 0.15972889959812164, + "learning_rate": 7.08397157196134e-06, + "loss": 1.6949, + "step": 27161 + }, + { + "epoch": 8.337016574585636, + "grad_norm": 0.18894724547863007, + "learning_rate": 7.081421321024079e-06, + "loss": 1.7254, + "step": 27162 + }, + { + "epoch": 8.33732351135666, + "grad_norm": 0.17392434179782867, + "learning_rate": 
7.078871494233364e-06, + "loss": 1.7449, + "step": 27163 + }, + { + "epoch": 8.337630448127685, + "grad_norm": 0.16262824833393097, + "learning_rate": 7.076322091614401e-06, + "loss": 1.734, + "step": 27164 + }, + { + "epoch": 8.33793738489871, + "grad_norm": 0.1960107982158661, + "learning_rate": 7.073773113192383e-06, + "loss": 1.6464, + "step": 27165 + }, + { + "epoch": 8.338244321669736, + "grad_norm": 0.1750497817993164, + "learning_rate": 7.071224558992501e-06, + "loss": 1.7187, + "step": 27166 + }, + { + "epoch": 8.338551258440761, + "grad_norm": 0.2179764360189438, + "learning_rate": 7.068676429039928e-06, + "loss": 1.7207, + "step": 27167 + }, + { + "epoch": 8.338858195211786, + "grad_norm": 0.17758040130138397, + "learning_rate": 7.066128723359877e-06, + "loss": 1.7248, + "step": 27168 + }, + { + "epoch": 8.339165131982812, + "grad_norm": 0.16506128013134003, + "learning_rate": 7.063581441977496e-06, + "loss": 1.7788, + "step": 27169 + }, + { + "epoch": 8.339472068753837, + "grad_norm": 0.18444709479808807, + "learning_rate": 7.061034584917963e-06, + "loss": 1.6958, + "step": 27170 + }, + { + "epoch": 8.339779005524862, + "grad_norm": 0.19419504702091217, + "learning_rate": 7.0584881522064605e-06, + "loss": 1.7459, + "step": 27171 + }, + { + "epoch": 8.340085942295888, + "grad_norm": 0.19482584297657013, + "learning_rate": 7.055942143868133e-06, + "loss": 1.7043, + "step": 27172 + }, + { + "epoch": 8.340392879066913, + "grad_norm": 0.20925387740135193, + "learning_rate": 7.053396559928183e-06, + "loss": 1.7817, + "step": 27173 + }, + { + "epoch": 8.340699815837937, + "grad_norm": 0.2067698836326599, + "learning_rate": 7.050851400411712e-06, + "loss": 1.729, + "step": 27174 + }, + { + "epoch": 8.341006752608962, + "grad_norm": 0.1617327481508255, + "learning_rate": 7.048306665343923e-06, + "loss": 1.6888, + "step": 27175 + }, + { + "epoch": 8.341313689379987, + "grad_norm": 0.16514994204044342, + "learning_rate": 7.045762354749924e-06, + "loss": 1.7152, + 
"step": 27176 + }, + { + "epoch": 8.341620626151013, + "grad_norm": 0.17930150032043457, + "learning_rate": 7.043218468654889e-06, + "loss": 1.8112, + "step": 27177 + }, + { + "epoch": 8.341927562922038, + "grad_norm": 0.17400570213794708, + "learning_rate": 7.040675007083941e-06, + "loss": 1.7071, + "step": 27178 + }, + { + "epoch": 8.342234499693063, + "grad_norm": 0.18226927518844604, + "learning_rate": 7.038131970062228e-06, + "loss": 1.7786, + "step": 27179 + }, + { + "epoch": 8.342541436464089, + "grad_norm": 0.15586300194263458, + "learning_rate": 7.035589357614869e-06, + "loss": 1.7414, + "step": 27180 + }, + { + "epoch": 8.342848373235114, + "grad_norm": 0.18447721004486084, + "learning_rate": 7.033047169767004e-06, + "loss": 1.7123, + "step": 27181 + }, + { + "epoch": 8.34315531000614, + "grad_norm": 0.16714699566364288, + "learning_rate": 7.030505406543747e-06, + "loss": 1.728, + "step": 27182 + }, + { + "epoch": 8.343462246777165, + "grad_norm": 0.15295952558517456, + "learning_rate": 7.027964067970228e-06, + "loss": 1.6926, + "step": 27183 + }, + { + "epoch": 8.34376918354819, + "grad_norm": 0.14499974250793457, + "learning_rate": 7.025423154071537e-06, + "loss": 1.6841, + "step": 27184 + }, + { + "epoch": 8.344076120319214, + "grad_norm": 0.15066829323768616, + "learning_rate": 7.022882664872827e-06, + "loss": 1.6593, + "step": 27185 + }, + { + "epoch": 8.344383057090239, + "grad_norm": 0.17318779230117798, + "learning_rate": 7.020342600399166e-06, + "loss": 1.698, + "step": 27186 + }, + { + "epoch": 8.344689993861264, + "grad_norm": 0.19946762919425964, + "learning_rate": 7.017802960675674e-06, + "loss": 1.7257, + "step": 27187 + }, + { + "epoch": 8.34499693063229, + "grad_norm": 0.17052631080150604, + "learning_rate": 7.015263745727441e-06, + "loss": 1.7299, + "step": 27188 + }, + { + "epoch": 8.345303867403315, + "grad_norm": 0.16269686818122864, + "learning_rate": 7.012724955579558e-06, + "loss": 1.7385, + "step": 27189 + }, + { + "epoch": 
8.34561080417434, + "grad_norm": 0.19195757806301117, + "learning_rate": 7.010186590257145e-06, + "loss": 1.7264, + "step": 27190 + }, + { + "epoch": 8.345917740945366, + "grad_norm": 0.14985592663288116, + "learning_rate": 7.007648649785248e-06, + "loss": 1.7135, + "step": 27191 + }, + { + "epoch": 8.34622467771639, + "grad_norm": 0.16438701748847961, + "learning_rate": 7.00511113418898e-06, + "loss": 1.6876, + "step": 27192 + }, + { + "epoch": 8.346531614487416, + "grad_norm": 0.241184800863266, + "learning_rate": 7.002574043493387e-06, + "loss": 1.8587, + "step": 27193 + }, + { + "epoch": 8.346838551258442, + "grad_norm": 0.17353931069374084, + "learning_rate": 7.000037377723567e-06, + "loss": 1.7465, + "step": 27194 + }, + { + "epoch": 8.347145488029465, + "grad_norm": 0.1923576444387436, + "learning_rate": 6.997501136904583e-06, + "loss": 1.7859, + "step": 27195 + }, + { + "epoch": 8.34745242480049, + "grad_norm": 0.1997295618057251, + "learning_rate": 6.994965321061492e-06, + "loss": 1.7612, + "step": 27196 + }, + { + "epoch": 8.347759361571516, + "grad_norm": 0.184821218252182, + "learning_rate": 6.992429930219363e-06, + "loss": 1.6761, + "step": 27197 + }, + { + "epoch": 8.348066298342541, + "grad_norm": 0.14091727137565613, + "learning_rate": 6.989894964403248e-06, + "loss": 1.6541, + "step": 27198 + }, + { + "epoch": 8.348373235113566, + "grad_norm": 0.13829854130744934, + "learning_rate": 6.987360423638206e-06, + "loss": 1.6814, + "step": 27199 + }, + { + "epoch": 8.348680171884592, + "grad_norm": 0.12685348093509674, + "learning_rate": 6.984826307949272e-06, + "loss": 1.6498, + "step": 27200 + }, + { + "epoch": 8.348987108655617, + "grad_norm": 0.17062726616859436, + "learning_rate": 6.9822926173614856e-06, + "loss": 1.7138, + "step": 27201 + }, + { + "epoch": 8.349294045426642, + "grad_norm": 0.15178726613521576, + "learning_rate": 6.979759351899923e-06, + "loss": 1.756, + "step": 27202 + }, + { + "epoch": 8.349600982197668, + "grad_norm": 
0.1897916942834854, + "learning_rate": 6.97722651158958e-06, + "loss": 1.7317, + "step": 27203 + }, + { + "epoch": 8.349907918968693, + "grad_norm": 0.13750115036964417, + "learning_rate": 6.974694096455503e-06, + "loss": 1.6853, + "step": 27204 + }, + { + "epoch": 8.350214855739718, + "grad_norm": 0.17380347847938538, + "learning_rate": 6.972162106522717e-06, + "loss": 1.728, + "step": 27205 + }, + { + "epoch": 8.350521792510742, + "grad_norm": 0.1593543291091919, + "learning_rate": 6.96963054181623e-06, + "loss": 1.6904, + "step": 27206 + }, + { + "epoch": 8.350828729281767, + "grad_norm": 0.1569581925868988, + "learning_rate": 6.967099402361099e-06, + "loss": 1.6995, + "step": 27207 + }, + { + "epoch": 8.351135666052793, + "grad_norm": 0.180283784866333, + "learning_rate": 6.9645686881822935e-06, + "loss": 1.6755, + "step": 27208 + }, + { + "epoch": 8.351442602823818, + "grad_norm": 0.2145276516675949, + "learning_rate": 6.9620383993048654e-06, + "loss": 1.7705, + "step": 27209 + }, + { + "epoch": 8.351749539594843, + "grad_norm": 0.15903061628341675, + "learning_rate": 6.959508535753772e-06, + "loss": 1.702, + "step": 27210 + }, + { + "epoch": 8.352056476365869, + "grad_norm": 0.16429775953292847, + "learning_rate": 6.9569790975540565e-06, + "loss": 1.6656, + "step": 27211 + }, + { + "epoch": 8.352363413136894, + "grad_norm": 0.1546638011932373, + "learning_rate": 6.954450084730707e-06, + "loss": 1.681, + "step": 27212 + }, + { + "epoch": 8.35267034990792, + "grad_norm": 0.17022907733917236, + "learning_rate": 6.951921497308705e-06, + "loss": 1.7094, + "step": 27213 + }, + { + "epoch": 8.352977286678945, + "grad_norm": 0.18317057192325592, + "learning_rate": 6.949393335313048e-06, + "loss": 1.7395, + "step": 27214 + }, + { + "epoch": 8.35328422344997, + "grad_norm": 0.1707061231136322, + "learning_rate": 6.94686559876872e-06, + "loss": 1.6918, + "step": 27215 + }, + { + "epoch": 8.353591160220994, + "grad_norm": 0.171799436211586, + "learning_rate": 
6.944338287700697e-06, + "loss": 1.7173, + "step": 27216 + }, + { + "epoch": 8.353898096992019, + "grad_norm": 0.14982536435127258, + "learning_rate": 6.941811402133963e-06, + "loss": 1.7244, + "step": 27217 + }, + { + "epoch": 8.354205033763044, + "grad_norm": 0.1584668904542923, + "learning_rate": 6.939284942093471e-06, + "loss": 1.7023, + "step": 27218 + }, + { + "epoch": 8.35451197053407, + "grad_norm": 0.18367518484592438, + "learning_rate": 6.93675890760423e-06, + "loss": 1.6977, + "step": 27219 + }, + { + "epoch": 8.354818907305095, + "grad_norm": 0.2665458619594574, + "learning_rate": 6.934233298691167e-06, + "loss": 1.7711, + "step": 27220 + }, + { + "epoch": 8.35512584407612, + "grad_norm": 0.1657658815383911, + "learning_rate": 6.931708115379249e-06, + "loss": 1.6957, + "step": 27221 + }, + { + "epoch": 8.355432780847146, + "grad_norm": 0.17687681317329407, + "learning_rate": 6.929183357693436e-06, + "loss": 1.7163, + "step": 27222 + }, + { + "epoch": 8.355739717618171, + "grad_norm": 0.1775265783071518, + "learning_rate": 6.926659025658666e-06, + "loss": 1.7595, + "step": 27223 + }, + { + "epoch": 8.356046654389196, + "grad_norm": 0.1962285041809082, + "learning_rate": 6.924135119299919e-06, + "loss": 1.7852, + "step": 27224 + }, + { + "epoch": 8.356353591160222, + "grad_norm": 0.17352642118930817, + "learning_rate": 6.921611638642095e-06, + "loss": 1.748, + "step": 27225 + }, + { + "epoch": 8.356660527931247, + "grad_norm": 0.19602125883102417, + "learning_rate": 6.919088583710176e-06, + "loss": 1.685, + "step": 27226 + }, + { + "epoch": 8.35696746470227, + "grad_norm": 0.15199948847293854, + "learning_rate": 6.9165659545290525e-06, + "loss": 1.6641, + "step": 27227 + }, + { + "epoch": 8.357274401473296, + "grad_norm": 0.15671736001968384, + "learning_rate": 6.914043751123683e-06, + "loss": 1.6915, + "step": 27228 + }, + { + "epoch": 8.357581338244321, + "grad_norm": 0.19513672590255737, + "learning_rate": 6.911521973518992e-06, + "loss": 1.7526, + 
"step": 27229 + }, + { + "epoch": 8.357888275015346, + "grad_norm": 0.15108506381511688, + "learning_rate": 6.9090006217398975e-06, + "loss": 1.7167, + "step": 27230 + }, + { + "epoch": 8.358195211786372, + "grad_norm": 0.19638952612876892, + "learning_rate": 6.906479695811307e-06, + "loss": 1.6937, + "step": 27231 + }, + { + "epoch": 8.358502148557397, + "grad_norm": 0.14345301687717438, + "learning_rate": 6.903959195758148e-06, + "loss": 1.7295, + "step": 27232 + }, + { + "epoch": 8.358809085328422, + "grad_norm": 0.1557627171278, + "learning_rate": 6.901439121605324e-06, + "loss": 1.7146, + "step": 27233 + }, + { + "epoch": 8.359116022099448, + "grad_norm": 0.15030202269554138, + "learning_rate": 6.898919473377741e-06, + "loss": 1.6974, + "step": 27234 + }, + { + "epoch": 8.359422958870473, + "grad_norm": 0.24213968217372894, + "learning_rate": 6.896400251100283e-06, + "loss": 1.8179, + "step": 27235 + }, + { + "epoch": 8.359729895641498, + "grad_norm": 0.1646348387002945, + "learning_rate": 6.893881454797885e-06, + "loss": 1.7001, + "step": 27236 + }, + { + "epoch": 8.360036832412524, + "grad_norm": 0.18399927020072937, + "learning_rate": 6.891363084495406e-06, + "loss": 1.746, + "step": 27237 + }, + { + "epoch": 8.360343769183547, + "grad_norm": 0.19470340013504028, + "learning_rate": 6.8888451402177365e-06, + "loss": 1.7442, + "step": 27238 + }, + { + "epoch": 8.360650705954573, + "grad_norm": 0.1420234590768814, + "learning_rate": 6.886327621989775e-06, + "loss": 1.6481, + "step": 27239 + }, + { + "epoch": 8.360957642725598, + "grad_norm": 0.1827881634235382, + "learning_rate": 6.883810529836382e-06, + "loss": 1.6842, + "step": 27240 + }, + { + "epoch": 8.361264579496623, + "grad_norm": 0.19096913933753967, + "learning_rate": 6.881293863782468e-06, + "loss": 1.7061, + "step": 27241 + }, + { + "epoch": 8.361571516267649, + "grad_norm": 0.1871458888053894, + "learning_rate": 6.878777623852855e-06, + "loss": 1.7607, + "step": 27242 + }, + { + "epoch": 
8.361878453038674, + "grad_norm": 0.13643455505371094, + "learning_rate": 6.876261810072459e-06, + "loss": 1.6747, + "step": 27243 + }, + { + "epoch": 8.3621853898097, + "grad_norm": 0.16990543901920319, + "learning_rate": 6.8737464224660985e-06, + "loss": 1.7318, + "step": 27244 + }, + { + "epoch": 8.362492326580725, + "grad_norm": 0.16357167065143585, + "learning_rate": 6.871231461058658e-06, + "loss": 1.6609, + "step": 27245 + }, + { + "epoch": 8.36279926335175, + "grad_norm": 0.20114652812480927, + "learning_rate": 6.868716925874996e-06, + "loss": 1.7647, + "step": 27246 + }, + { + "epoch": 8.363106200122775, + "grad_norm": 0.18387655913829803, + "learning_rate": 6.866202816939949e-06, + "loss": 1.7213, + "step": 27247 + }, + { + "epoch": 8.3634131368938, + "grad_norm": 0.18712659180164337, + "learning_rate": 6.863689134278367e-06, + "loss": 1.7144, + "step": 27248 + }, + { + "epoch": 8.363720073664824, + "grad_norm": 0.19831795990467072, + "learning_rate": 6.861175877915088e-06, + "loss": 1.7396, + "step": 27249 + }, + { + "epoch": 8.36402701043585, + "grad_norm": 0.2181798815727234, + "learning_rate": 6.858663047874958e-06, + "loss": 1.7523, + "step": 27250 + }, + { + "epoch": 8.364333947206875, + "grad_norm": 0.17912371456623077, + "learning_rate": 6.856150644182807e-06, + "loss": 1.7617, + "step": 27251 + }, + { + "epoch": 8.3646408839779, + "grad_norm": 0.16200366616249084, + "learning_rate": 6.85363866686346e-06, + "loss": 1.6886, + "step": 27252 + }, + { + "epoch": 8.364947820748926, + "grad_norm": 0.18456755578517914, + "learning_rate": 6.851127115941747e-06, + "loss": 1.6873, + "step": 27253 + }, + { + "epoch": 8.365254757519951, + "grad_norm": 0.1649440973997116, + "learning_rate": 6.848615991442487e-06, + "loss": 1.7024, + "step": 27254 + }, + { + "epoch": 8.365561694290976, + "grad_norm": 0.17722025513648987, + "learning_rate": 6.846105293390492e-06, + "loss": 1.7401, + "step": 27255 + }, + { + "epoch": 8.365868631062002, + "grad_norm": 
0.18342679738998413, + "learning_rate": 6.843595021810578e-06, + "loss": 1.7285, + "step": 27256 + }, + { + "epoch": 8.366175567833027, + "grad_norm": 0.13590754568576813, + "learning_rate": 6.841085176727557e-06, + "loss": 1.6704, + "step": 27257 + }, + { + "epoch": 8.366482504604052, + "grad_norm": 0.16721662878990173, + "learning_rate": 6.838575758166221e-06, + "loss": 1.7371, + "step": 27258 + }, + { + "epoch": 8.366789441375076, + "grad_norm": 0.15011465549468994, + "learning_rate": 6.836066766151372e-06, + "loss": 1.668, + "step": 27259 + }, + { + "epoch": 8.367096378146101, + "grad_norm": 0.15394380688667297, + "learning_rate": 6.833558200707835e-06, + "loss": 1.7402, + "step": 27260 + }, + { + "epoch": 8.367403314917127, + "grad_norm": 0.2134244441986084, + "learning_rate": 6.83105006186035e-06, + "loss": 1.6979, + "step": 27261 + }, + { + "epoch": 8.367710251688152, + "grad_norm": 0.2169496864080429, + "learning_rate": 6.8285423496337375e-06, + "loss": 1.7821, + "step": 27262 + }, + { + "epoch": 8.368017188459177, + "grad_norm": 0.16033586859703064, + "learning_rate": 6.8260350640527774e-06, + "loss": 1.6976, + "step": 27263 + }, + { + "epoch": 8.368324125230203, + "grad_norm": 0.2089877724647522, + "learning_rate": 6.823528205142244e-06, + "loss": 1.7532, + "step": 27264 + }, + { + "epoch": 8.368631062001228, + "grad_norm": 0.12897463142871857, + "learning_rate": 6.821021772926911e-06, + "loss": 1.6445, + "step": 27265 + }, + { + "epoch": 8.368937998772253, + "grad_norm": 0.18726956844329834, + "learning_rate": 6.818515767431549e-06, + "loss": 1.7296, + "step": 27266 + }, + { + "epoch": 8.369244935543279, + "grad_norm": 0.1857292354106903, + "learning_rate": 6.816010188680927e-06, + "loss": 1.7747, + "step": 27267 + }, + { + "epoch": 8.369551872314304, + "grad_norm": 0.24680334329605103, + "learning_rate": 6.813505036699802e-06, + "loss": 1.7877, + "step": 27268 + }, + { + "epoch": 8.36985880908533, + "grad_norm": 0.1404808908700943, + "learning_rate": 
6.811000311512927e-06, + "loss": 1.6769, + "step": 27269 + }, + { + "epoch": 8.370165745856353, + "grad_norm": 0.18543009459972382, + "learning_rate": 6.808496013145066e-06, + "loss": 1.7325, + "step": 27270 + }, + { + "epoch": 8.370472682627378, + "grad_norm": 0.13881617784500122, + "learning_rate": 6.805992141620959e-06, + "loss": 1.7022, + "step": 27271 + }, + { + "epoch": 8.370779619398403, + "grad_norm": 0.18534715473651886, + "learning_rate": 6.80348869696536e-06, + "loss": 1.7609, + "step": 27272 + }, + { + "epoch": 8.371086556169429, + "grad_norm": 0.20225360989570618, + "learning_rate": 6.800985679202998e-06, + "loss": 1.7159, + "step": 27273 + }, + { + "epoch": 8.371393492940454, + "grad_norm": 0.1462840884923935, + "learning_rate": 6.79848308835862e-06, + "loss": 1.6607, + "step": 27274 + }, + { + "epoch": 8.37170042971148, + "grad_norm": 0.17453989386558533, + "learning_rate": 6.795980924456952e-06, + "loss": 1.7705, + "step": 27275 + }, + { + "epoch": 8.372007366482505, + "grad_norm": 0.15709565579891205, + "learning_rate": 6.793479187522711e-06, + "loss": 1.6961, + "step": 27276 + }, + { + "epoch": 8.37231430325353, + "grad_norm": 0.14979243278503418, + "learning_rate": 6.790977877580656e-06, + "loss": 1.6817, + "step": 27277 + }, + { + "epoch": 8.372621240024555, + "grad_norm": 0.16452275216579437, + "learning_rate": 6.7884769946554575e-06, + "loss": 1.693, + "step": 27278 + }, + { + "epoch": 8.37292817679558, + "grad_norm": 0.18353265523910522, + "learning_rate": 6.785976538771882e-06, + "loss": 1.7003, + "step": 27279 + }, + { + "epoch": 8.373235113566606, + "grad_norm": 0.15123683214187622, + "learning_rate": 6.783476509954595e-06, + "loss": 1.6611, + "step": 27280 + }, + { + "epoch": 8.37354205033763, + "grad_norm": 0.19939517974853516, + "learning_rate": 6.780976908228332e-06, + "loss": 1.7969, + "step": 27281 + }, + { + "epoch": 8.373848987108655, + "grad_norm": 0.2997080981731415, + "learning_rate": 6.778477733617783e-06, + "loss": 1.7822, + 
"step": 27282 + }, + { + "epoch": 8.37415592387968, + "grad_norm": 0.13474299013614655, + "learning_rate": 6.775978986147657e-06, + "loss": 1.7155, + "step": 27283 + }, + { + "epoch": 8.374462860650706, + "grad_norm": 0.15992368757724762, + "learning_rate": 6.773480665842635e-06, + "loss": 1.6985, + "step": 27284 + }, + { + "epoch": 8.374769797421731, + "grad_norm": 0.15250587463378906, + "learning_rate": 6.770982772727413e-06, + "loss": 1.7007, + "step": 27285 + }, + { + "epoch": 8.375076734192756, + "grad_norm": 0.1373993456363678, + "learning_rate": 6.768485306826683e-06, + "loss": 1.6852, + "step": 27286 + }, + { + "epoch": 8.375383670963782, + "grad_norm": 0.15772612392902374, + "learning_rate": 6.765988268165113e-06, + "loss": 1.6881, + "step": 27287 + }, + { + "epoch": 8.375690607734807, + "grad_norm": 0.13689690828323364, + "learning_rate": 6.76349165676739e-06, + "loss": 1.6747, + "step": 27288 + }, + { + "epoch": 8.375997544505832, + "grad_norm": 0.18657375872135162, + "learning_rate": 6.7609954726581825e-06, + "loss": 1.7324, + "step": 27289 + }, + { + "epoch": 8.376304481276858, + "grad_norm": 0.16617898643016815, + "learning_rate": 6.758499715862166e-06, + "loss": 1.6832, + "step": 27290 + }, + { + "epoch": 8.376611418047883, + "grad_norm": 0.16960306465625763, + "learning_rate": 6.756004386403996e-06, + "loss": 1.7353, + "step": 27291 + }, + { + "epoch": 8.376918354818907, + "grad_norm": 0.17030803859233856, + "learning_rate": 6.753509484308334e-06, + "loss": 1.7079, + "step": 27292 + }, + { + "epoch": 8.377225291589932, + "grad_norm": 0.16151085495948792, + "learning_rate": 6.751015009599831e-06, + "loss": 1.6706, + "step": 27293 + }, + { + "epoch": 8.377532228360957, + "grad_norm": 0.1715710461139679, + "learning_rate": 6.748520962303173e-06, + "loss": 1.7116, + "step": 27294 + }, + { + "epoch": 8.377839165131983, + "grad_norm": 0.20747625827789307, + "learning_rate": 6.746027342442951e-06, + "loss": 1.731, + "step": 27295 + }, + { + "epoch": 
8.378146101903008, + "grad_norm": 0.1645912081003189, + "learning_rate": 6.743534150043867e-06, + "loss": 1.7076, + "step": 27296 + }, + { + "epoch": 8.378453038674033, + "grad_norm": 0.16044393181800842, + "learning_rate": 6.741041385130509e-06, + "loss": 1.7105, + "step": 27297 + }, + { + "epoch": 8.378759975445059, + "grad_norm": 0.18224483728408813, + "learning_rate": 6.738549047727543e-06, + "loss": 1.7258, + "step": 27298 + }, + { + "epoch": 8.379066912216084, + "grad_norm": 0.17351657152175903, + "learning_rate": 6.7360571378595915e-06, + "loss": 1.7369, + "step": 27299 + }, + { + "epoch": 8.37937384898711, + "grad_norm": 0.18293599784374237, + "learning_rate": 6.733565655551283e-06, + "loss": 1.7151, + "step": 27300 + }, + { + "epoch": 8.379680785758135, + "grad_norm": 0.1593983918428421, + "learning_rate": 6.731074600827242e-06, + "loss": 1.6544, + "step": 27301 + }, + { + "epoch": 8.379987722529158, + "grad_norm": 0.16315947473049164, + "learning_rate": 6.728583973712077e-06, + "loss": 1.7442, + "step": 27302 + }, + { + "epoch": 8.380294659300183, + "grad_norm": 0.13841219246387482, + "learning_rate": 6.726093774230408e-06, + "loss": 1.6639, + "step": 27303 + }, + { + "epoch": 8.380601596071209, + "grad_norm": 0.14162768423557281, + "learning_rate": 6.723604002406847e-06, + "loss": 1.6713, + "step": 27304 + }, + { + "epoch": 8.380908532842234, + "grad_norm": 0.1737380474805832, + "learning_rate": 6.721114658265992e-06, + "loss": 1.7197, + "step": 27305 + }, + { + "epoch": 8.38121546961326, + "grad_norm": 0.15531061589717865, + "learning_rate": 6.718625741832452e-06, + "loss": 1.7337, + "step": 27306 + }, + { + "epoch": 8.381522406384285, + "grad_norm": 0.1833781898021698, + "learning_rate": 6.716137253130816e-06, + "loss": 1.7838, + "step": 27307 + }, + { + "epoch": 8.38182934315531, + "grad_norm": 0.23010820150375366, + "learning_rate": 6.713649192185683e-06, + "loss": 1.7023, + "step": 27308 + }, + { + "epoch": 8.382136279926335, + "grad_norm": 
0.14409376680850983, + "learning_rate": 6.7111615590216445e-06, + "loss": 1.6968, + "step": 27309 + }, + { + "epoch": 8.38244321669736, + "grad_norm": 0.19448643922805786, + "learning_rate": 6.7086743536632635e-06, + "loss": 1.7117, + "step": 27310 + }, + { + "epoch": 8.382750153468386, + "grad_norm": 0.18580564856529236, + "learning_rate": 6.706187576135159e-06, + "loss": 1.8183, + "step": 27311 + }, + { + "epoch": 8.383057090239411, + "grad_norm": 0.20270103216171265, + "learning_rate": 6.7037012264618675e-06, + "loss": 1.7666, + "step": 27312 + }, + { + "epoch": 8.383364027010435, + "grad_norm": 0.16575069725513458, + "learning_rate": 6.7012153046679904e-06, + "loss": 1.7542, + "step": 27313 + }, + { + "epoch": 8.38367096378146, + "grad_norm": 0.16375242173671722, + "learning_rate": 6.698729810778065e-06, + "loss": 1.7117, + "step": 27314 + }, + { + "epoch": 8.383977900552486, + "grad_norm": 0.2082248479127884, + "learning_rate": 6.696244744816682e-06, + "loss": 1.7687, + "step": 27315 + }, + { + "epoch": 8.384284837323511, + "grad_norm": 0.1562620848417282, + "learning_rate": 6.693760106808389e-06, + "loss": 1.6782, + "step": 27316 + }, + { + "epoch": 8.384591774094536, + "grad_norm": 0.1883714199066162, + "learning_rate": 6.6912758967777435e-06, + "loss": 1.7023, + "step": 27317 + }, + { + "epoch": 8.384898710865562, + "grad_norm": 0.17445886135101318, + "learning_rate": 6.688792114749292e-06, + "loss": 1.7019, + "step": 27318 + }, + { + "epoch": 8.385205647636587, + "grad_norm": 0.20479950308799744, + "learning_rate": 6.686308760747584e-06, + "loss": 1.7514, + "step": 27319 + }, + { + "epoch": 8.385512584407612, + "grad_norm": 0.21790143847465515, + "learning_rate": 6.683825834797153e-06, + "loss": 1.7243, + "step": 27320 + }, + { + "epoch": 8.385819521178638, + "grad_norm": 0.1784016340970993, + "learning_rate": 6.681343336922552e-06, + "loss": 1.7301, + "step": 27321 + }, + { + "epoch": 8.386126457949663, + "grad_norm": 0.22286179661750793, + 
"learning_rate": 6.678861267148301e-06, + "loss": 1.7231, + "step": 27322 + }, + { + "epoch": 8.386433394720688, + "grad_norm": 0.17854957282543182, + "learning_rate": 6.676379625498935e-06, + "loss": 1.7216, + "step": 27323 + }, + { + "epoch": 8.386740331491712, + "grad_norm": 0.1750447154045105, + "learning_rate": 6.67389841199898e-06, + "loss": 1.7603, + "step": 27324 + }, + { + "epoch": 8.387047268262737, + "grad_norm": 0.17893844842910767, + "learning_rate": 6.6714176266729545e-06, + "loss": 1.7229, + "step": 27325 + }, + { + "epoch": 8.387354205033763, + "grad_norm": 0.18705782294273376, + "learning_rate": 6.6689372695453725e-06, + "loss": 1.7021, + "step": 27326 + }, + { + "epoch": 8.387661141804788, + "grad_norm": 0.18719066679477692, + "learning_rate": 6.666457340640742e-06, + "loss": 1.7216, + "step": 27327 + }, + { + "epoch": 8.387968078575813, + "grad_norm": 0.16408847272396088, + "learning_rate": 6.663977839983604e-06, + "loss": 1.6937, + "step": 27328 + }, + { + "epoch": 8.388275015346839, + "grad_norm": 0.1739223599433899, + "learning_rate": 6.661498767598407e-06, + "loss": 1.6533, + "step": 27329 + }, + { + "epoch": 8.388581952117864, + "grad_norm": 0.19943352043628693, + "learning_rate": 6.6590201235097075e-06, + "loss": 1.753, + "step": 27330 + }, + { + "epoch": 8.38888888888889, + "grad_norm": 0.1412268429994583, + "learning_rate": 6.656541907741954e-06, + "loss": 1.6669, + "step": 27331 + }, + { + "epoch": 8.389195825659915, + "grad_norm": 0.17952445149421692, + "learning_rate": 6.654064120319664e-06, + "loss": 1.6921, + "step": 27332 + }, + { + "epoch": 8.38950276243094, + "grad_norm": 0.22117477655410767, + "learning_rate": 6.65158676126732e-06, + "loss": 1.7677, + "step": 27333 + }, + { + "epoch": 8.389809699201965, + "grad_norm": 0.1926339566707611, + "learning_rate": 6.649109830609401e-06, + "loss": 1.7237, + "step": 27334 + }, + { + "epoch": 8.390116635972989, + "grad_norm": 0.3306657671928406, + "learning_rate": 6.646633328370394e-06, + 
"loss": 1.7735, + "step": 27335 + }, + { + "epoch": 8.390423572744014, + "grad_norm": 0.14908578991889954, + "learning_rate": 6.644157254574762e-06, + "loss": 1.7109, + "step": 27336 + }, + { + "epoch": 8.39073050951504, + "grad_norm": 0.20824603736400604, + "learning_rate": 6.64168160924698e-06, + "loss": 1.7177, + "step": 27337 + }, + { + "epoch": 8.391037446286065, + "grad_norm": 0.22669748961925507, + "learning_rate": 6.6392063924115125e-06, + "loss": 1.7842, + "step": 27338 + }, + { + "epoch": 8.39134438305709, + "grad_norm": 0.16690780222415924, + "learning_rate": 6.6367316040928215e-06, + "loss": 1.739, + "step": 27339 + }, + { + "epoch": 8.391651319828116, + "grad_norm": 0.17900501191616058, + "learning_rate": 6.634257244315367e-06, + "loss": 1.705, + "step": 27340 + }, + { + "epoch": 8.39195825659914, + "grad_norm": 0.18606948852539062, + "learning_rate": 6.631783313103595e-06, + "loss": 1.7324, + "step": 27341 + }, + { + "epoch": 8.392265193370166, + "grad_norm": 0.15370480716228485, + "learning_rate": 6.629309810481965e-06, + "loss": 1.6834, + "step": 27342 + }, + { + "epoch": 8.392572130141192, + "grad_norm": 0.13654825091362, + "learning_rate": 6.626836736474917e-06, + "loss": 1.6729, + "step": 27343 + }, + { + "epoch": 8.392879066912217, + "grad_norm": 0.21128645539283752, + "learning_rate": 6.624364091106877e-06, + "loss": 1.7494, + "step": 27344 + }, + { + "epoch": 8.39318600368324, + "grad_norm": 0.1608622819185257, + "learning_rate": 6.621891874402314e-06, + "loss": 1.6951, + "step": 27345 + }, + { + "epoch": 8.393492940454266, + "grad_norm": 0.20148086547851562, + "learning_rate": 6.619420086385619e-06, + "loss": 1.7616, + "step": 27346 + }, + { + "epoch": 8.393799877225291, + "grad_norm": 0.1927247792482376, + "learning_rate": 6.616948727081262e-06, + "loss": 1.7088, + "step": 27347 + }, + { + "epoch": 8.394106813996316, + "grad_norm": 0.18318399786949158, + "learning_rate": 6.614477796513629e-06, + "loss": 1.7176, + "step": 27348 + }, + { + 
"epoch": 8.394413750767342, + "grad_norm": 0.20923443138599396, + "learning_rate": 6.612007294707162e-06, + "loss": 1.758, + "step": 27349 + }, + { + "epoch": 8.394720687538367, + "grad_norm": 0.20041905343532562, + "learning_rate": 6.609537221686268e-06, + "loss": 1.6843, + "step": 27350 + }, + { + "epoch": 8.395027624309392, + "grad_norm": 0.13480354845523834, + "learning_rate": 6.607067577475362e-06, + "loss": 1.6766, + "step": 27351 + }, + { + "epoch": 8.395334561080418, + "grad_norm": 0.2022085338830948, + "learning_rate": 6.604598362098846e-06, + "loss": 1.7448, + "step": 27352 + }, + { + "epoch": 8.395641497851443, + "grad_norm": 0.21842770278453827, + "learning_rate": 6.602129575581123e-06, + "loss": 1.7202, + "step": 27353 + }, + { + "epoch": 8.395948434622468, + "grad_norm": 0.16519947350025177, + "learning_rate": 6.599661217946596e-06, + "loss": 1.7036, + "step": 27354 + }, + { + "epoch": 8.396255371393494, + "grad_norm": 0.14931483566761017, + "learning_rate": 6.59719328921965e-06, + "loss": 1.7244, + "step": 27355 + }, + { + "epoch": 8.396562308164517, + "grad_norm": 0.22807423770427704, + "learning_rate": 6.594725789424683e-06, + "loss": 1.7758, + "step": 27356 + }, + { + "epoch": 8.396869244935543, + "grad_norm": 0.15723249316215515, + "learning_rate": 6.592258718586075e-06, + "loss": 1.7033, + "step": 27357 + }, + { + "epoch": 8.397176181706568, + "grad_norm": 0.1934487521648407, + "learning_rate": 6.589792076728207e-06, + "loss": 1.7767, + "step": 27358 + }, + { + "epoch": 8.397483118477593, + "grad_norm": 0.16923396289348602, + "learning_rate": 6.587325863875454e-06, + "loss": 1.7125, + "step": 27359 + }, + { + "epoch": 8.397790055248619, + "grad_norm": 0.1533476561307907, + "learning_rate": 6.584860080052196e-06, + "loss": 1.7245, + "step": 27360 + }, + { + "epoch": 8.398096992019644, + "grad_norm": 0.1610613465309143, + "learning_rate": 6.582394725282786e-06, + "loss": 1.6974, + "step": 27361 + }, + { + "epoch": 8.39840392879067, + "grad_norm": 
0.19170965254306793, + "learning_rate": 6.579929799591622e-06, + "loss": 1.6956, + "step": 27362 + }, + { + "epoch": 8.398710865561695, + "grad_norm": 0.17479272186756134, + "learning_rate": 6.5774653030030164e-06, + "loss": 1.699, + "step": 27363 + }, + { + "epoch": 8.39901780233272, + "grad_norm": 0.15651267766952515, + "learning_rate": 6.575001235541378e-06, + "loss": 1.655, + "step": 27364 + }, + { + "epoch": 8.399324739103745, + "grad_norm": 0.13939335942268372, + "learning_rate": 6.572537597230999e-06, + "loss": 1.6963, + "step": 27365 + }, + { + "epoch": 8.399631675874769, + "grad_norm": 0.16157624125480652, + "learning_rate": 6.570074388096275e-06, + "loss": 1.6811, + "step": 27366 + }, + { + "epoch": 8.399938612645794, + "grad_norm": 0.16065873205661774, + "learning_rate": 6.567611608161528e-06, + "loss": 1.7104, + "step": 27367 + }, + { + "epoch": 8.40024554941682, + "grad_norm": 0.1657525599002838, + "learning_rate": 6.565149257451098e-06, + "loss": 1.6884, + "step": 27368 + }, + { + "epoch": 8.400552486187845, + "grad_norm": 0.1757468432188034, + "learning_rate": 6.56268733598932e-06, + "loss": 1.7112, + "step": 27369 + }, + { + "epoch": 8.40085942295887, + "grad_norm": 0.16591452062129974, + "learning_rate": 6.560225843800527e-06, + "loss": 1.7227, + "step": 27370 + }, + { + "epoch": 8.401166359729896, + "grad_norm": 0.12153175473213196, + "learning_rate": 6.557764780909048e-06, + "loss": 1.6843, + "step": 27371 + }, + { + "epoch": 8.401473296500921, + "grad_norm": 0.13953842222690582, + "learning_rate": 6.5553041473391914e-06, + "loss": 1.6518, + "step": 27372 + }, + { + "epoch": 8.401780233271946, + "grad_norm": 0.22707831859588623, + "learning_rate": 6.552843943115289e-06, + "loss": 1.7594, + "step": 27373 + }, + { + "epoch": 8.402087170042972, + "grad_norm": 0.18743011355400085, + "learning_rate": 6.550384168261647e-06, + "loss": 1.705, + "step": 27374 + }, + { + "epoch": 8.402394106813997, + "grad_norm": 0.1784582883119583, + "learning_rate": 
6.547924822802576e-06, + "loss": 1.7861, + "step": 27375 + }, + { + "epoch": 8.402701043585022, + "grad_norm": 0.18942677974700928, + "learning_rate": 6.545465906762377e-06, + "loss": 1.7489, + "step": 27376 + }, + { + "epoch": 8.403007980356048, + "grad_norm": 0.1783999502658844, + "learning_rate": 6.543007420165354e-06, + "loss": 1.7533, + "step": 27377 + }, + { + "epoch": 8.403314917127071, + "grad_norm": 0.1497674137353897, + "learning_rate": 6.540549363035791e-06, + "loss": 1.6768, + "step": 27378 + }, + { + "epoch": 8.403621853898096, + "grad_norm": 0.15912608802318573, + "learning_rate": 6.538091735398016e-06, + "loss": 1.7656, + "step": 27379 + }, + { + "epoch": 8.403928790669122, + "grad_norm": 0.1886531114578247, + "learning_rate": 6.535634537276269e-06, + "loss": 1.7368, + "step": 27380 + }, + { + "epoch": 8.404235727440147, + "grad_norm": 0.1976786106824875, + "learning_rate": 6.5331777686948756e-06, + "loss": 1.7627, + "step": 27381 + }, + { + "epoch": 8.404542664211172, + "grad_norm": 0.1442447006702423, + "learning_rate": 6.5307214296780775e-06, + "loss": 1.6787, + "step": 27382 + }, + { + "epoch": 8.404849600982198, + "grad_norm": 0.21066388487815857, + "learning_rate": 6.528265520250182e-06, + "loss": 1.741, + "step": 27383 + }, + { + "epoch": 8.405156537753223, + "grad_norm": 0.19657589495182037, + "learning_rate": 6.525810040435443e-06, + "loss": 1.74, + "step": 27384 + }, + { + "epoch": 8.405463474524248, + "grad_norm": 0.20377841591835022, + "learning_rate": 6.5233549902581296e-06, + "loss": 1.7086, + "step": 27385 + }, + { + "epoch": 8.405770411295274, + "grad_norm": 0.16641706228256226, + "learning_rate": 6.520900369742505e-06, + "loss": 1.6897, + "step": 27386 + }, + { + "epoch": 8.4060773480663, + "grad_norm": 0.177897647023201, + "learning_rate": 6.518446178912829e-06, + "loss": 1.7781, + "step": 27387 + }, + { + "epoch": 8.406384284837323, + "grad_norm": 0.2529480755329132, + "learning_rate": 6.515992417793354e-06, + "loss": 1.7227, + 
"step": 27388 + }, + { + "epoch": 8.406691221608348, + "grad_norm": 0.17020392417907715, + "learning_rate": 6.513539086408327e-06, + "loss": 1.6836, + "step": 27389 + }, + { + "epoch": 8.406998158379373, + "grad_norm": 0.1621706336736679, + "learning_rate": 6.5110861847819944e-06, + "loss": 1.7263, + "step": 27390 + }, + { + "epoch": 8.407305095150399, + "grad_norm": 0.15788327157497406, + "learning_rate": 6.508633712938594e-06, + "loss": 1.7155, + "step": 27391 + }, + { + "epoch": 8.407612031921424, + "grad_norm": 0.1595151722431183, + "learning_rate": 6.5061816709023724e-06, + "loss": 1.7051, + "step": 27392 + }, + { + "epoch": 8.40791896869245, + "grad_norm": 0.2065821886062622, + "learning_rate": 6.503730058697555e-06, + "loss": 1.7435, + "step": 27393 + }, + { + "epoch": 8.408225905463475, + "grad_norm": 0.18513742089271545, + "learning_rate": 6.501278876348371e-06, + "loss": 1.7976, + "step": 27394 + }, + { + "epoch": 8.4085328422345, + "grad_norm": 0.1819298416376114, + "learning_rate": 6.4988281238790305e-06, + "loss": 1.7656, + "step": 27395 + }, + { + "epoch": 8.408839779005525, + "grad_norm": 0.17593856155872345, + "learning_rate": 6.496377801313791e-06, + "loss": 1.7436, + "step": 27396 + }, + { + "epoch": 8.40914671577655, + "grad_norm": 0.1425786167383194, + "learning_rate": 6.493927908676822e-06, + "loss": 1.7365, + "step": 27397 + }, + { + "epoch": 8.409453652547576, + "grad_norm": 0.1689717322587967, + "learning_rate": 6.491478445992383e-06, + "loss": 1.7116, + "step": 27398 + }, + { + "epoch": 8.4097605893186, + "grad_norm": 0.1530478596687317, + "learning_rate": 6.489029413284631e-06, + "loss": 1.7232, + "step": 27399 + }, + { + "epoch": 8.410067526089625, + "grad_norm": 0.16928789019584656, + "learning_rate": 6.486580810577802e-06, + "loss": 1.713, + "step": 27400 + }, + { + "epoch": 8.41037446286065, + "grad_norm": 0.19086188077926636, + "learning_rate": 6.484132637896085e-06, + "loss": 1.7495, + "step": 27401 + }, + { + "epoch": 
8.410681399631676, + "grad_norm": 0.18510590493679047, + "learning_rate": 6.481684895263679e-06, + "loss": 1.7445, + "step": 27402 + }, + { + "epoch": 8.410988336402701, + "grad_norm": 0.144667387008667, + "learning_rate": 6.479237582704767e-06, + "loss": 1.6994, + "step": 27403 + }, + { + "epoch": 8.411295273173726, + "grad_norm": 0.15467962622642517, + "learning_rate": 6.476790700243535e-06, + "loss": 1.6807, + "step": 27404 + }, + { + "epoch": 8.411602209944752, + "grad_norm": 0.13533028960227966, + "learning_rate": 6.474344247904168e-06, + "loss": 1.6746, + "step": 27405 + }, + { + "epoch": 8.411909146715777, + "grad_norm": 0.13948698341846466, + "learning_rate": 6.471898225710843e-06, + "loss": 1.7072, + "step": 27406 + }, + { + "epoch": 8.412216083486802, + "grad_norm": 0.1758929044008255, + "learning_rate": 6.469452633687734e-06, + "loss": 1.6993, + "step": 27407 + }, + { + "epoch": 8.412523020257828, + "grad_norm": 0.20594100654125214, + "learning_rate": 6.46700747185901e-06, + "loss": 1.7468, + "step": 27408 + }, + { + "epoch": 8.412829957028851, + "grad_norm": 0.18665185570716858, + "learning_rate": 6.464562740248831e-06, + "loss": 1.6829, + "step": 27409 + }, + { + "epoch": 8.413136893799877, + "grad_norm": 0.1637166142463684, + "learning_rate": 6.4621184388813595e-06, + "loss": 1.7118, + "step": 27410 + }, + { + "epoch": 8.413443830570902, + "grad_norm": 0.1653725504875183, + "learning_rate": 6.459674567780749e-06, + "loss": 1.6986, + "step": 27411 + }, + { + "epoch": 8.413750767341927, + "grad_norm": 0.16381777822971344, + "learning_rate": 6.457231126971158e-06, + "loss": 1.7389, + "step": 27412 + }, + { + "epoch": 8.414057704112953, + "grad_norm": 0.14706309139728546, + "learning_rate": 6.454788116476734e-06, + "loss": 1.6629, + "step": 27413 + }, + { + "epoch": 8.414364640883978, + "grad_norm": 0.17818714678287506, + "learning_rate": 6.4523455363215964e-06, + "loss": 1.761, + "step": 27414 + }, + { + "epoch": 8.414671577655003, + "grad_norm": 
0.18425707519054413, + "learning_rate": 6.449903386529932e-06, + "loss": 1.7169, + "step": 27415 + }, + { + "epoch": 8.414978514426029, + "grad_norm": 0.182805597782135, + "learning_rate": 6.4474616671258255e-06, + "loss": 1.6916, + "step": 27416 + }, + { + "epoch": 8.415285451197054, + "grad_norm": 0.1802895963191986, + "learning_rate": 6.4450203781334426e-06, + "loss": 1.7786, + "step": 27417 + }, + { + "epoch": 8.41559238796808, + "grad_norm": 0.18067243695259094, + "learning_rate": 6.442579519576891e-06, + "loss": 1.7489, + "step": 27418 + }, + { + "epoch": 8.415899324739105, + "grad_norm": 0.20373223721981049, + "learning_rate": 6.4401390914803075e-06, + "loss": 1.7519, + "step": 27419 + }, + { + "epoch": 8.416206261510128, + "grad_norm": 0.1414610594511032, + "learning_rate": 6.437699093867794e-06, + "loss": 1.6656, + "step": 27420 + }, + { + "epoch": 8.416513198281153, + "grad_norm": 0.14516517519950867, + "learning_rate": 6.4352595267634706e-06, + "loss": 1.6599, + "step": 27421 + }, + { + "epoch": 8.416820135052179, + "grad_norm": 0.16276796162128448, + "learning_rate": 6.4328203901914465e-06, + "loss": 1.7026, + "step": 27422 + }, + { + "epoch": 8.417127071823204, + "grad_norm": 0.15957671403884888, + "learning_rate": 6.430381684175829e-06, + "loss": 1.7185, + "step": 27423 + }, + { + "epoch": 8.41743400859423, + "grad_norm": 0.1594170182943344, + "learning_rate": 6.4279434087407166e-06, + "loss": 1.7144, + "step": 27424 + }, + { + "epoch": 8.417740945365255, + "grad_norm": 0.14235691726207733, + "learning_rate": 6.425505563910206e-06, + "loss": 1.6487, + "step": 27425 + }, + { + "epoch": 8.41804788213628, + "grad_norm": 0.17203880846500397, + "learning_rate": 6.423068149708389e-06, + "loss": 1.7252, + "step": 27426 + }, + { + "epoch": 8.418354818907305, + "grad_norm": 0.15193019807338715, + "learning_rate": 6.420631166159352e-06, + "loss": 1.7346, + "step": 27427 + }, + { + "epoch": 8.41866175567833, + "grad_norm": 0.17005006968975067, + "learning_rate": 
6.418194613287182e-06, + "loss": 1.7679, + "step": 27428 + }, + { + "epoch": 8.418968692449356, + "grad_norm": 0.15492422878742218, + "learning_rate": 6.415758491115953e-06, + "loss": 1.6962, + "step": 27429 + }, + { + "epoch": 8.419275629220381, + "grad_norm": 0.13465845584869385, + "learning_rate": 6.413322799669752e-06, + "loss": 1.676, + "step": 27430 + }, + { + "epoch": 8.419582565991405, + "grad_norm": 0.20086030662059784, + "learning_rate": 6.410887538972626e-06, + "loss": 1.7341, + "step": 27431 + }, + { + "epoch": 8.41988950276243, + "grad_norm": 0.12862804532051086, + "learning_rate": 6.408452709048679e-06, + "loss": 1.6456, + "step": 27432 + }, + { + "epoch": 8.420196439533456, + "grad_norm": 0.1520070731639862, + "learning_rate": 6.40601830992193e-06, + "loss": 1.7169, + "step": 27433 + }, + { + "epoch": 8.420503376304481, + "grad_norm": 0.15394441783428192, + "learning_rate": 6.4035843416164865e-06, + "loss": 1.6876, + "step": 27434 + }, + { + "epoch": 8.420810313075506, + "grad_norm": 0.15149196982383728, + "learning_rate": 6.4011508041563475e-06, + "loss": 1.7126, + "step": 27435 + }, + { + "epoch": 8.421117249846532, + "grad_norm": 0.14014703035354614, + "learning_rate": 6.398717697565604e-06, + "loss": 1.6554, + "step": 27436 + }, + { + "epoch": 8.421424186617557, + "grad_norm": 0.1493537575006485, + "learning_rate": 6.3962850218682865e-06, + "loss": 1.6915, + "step": 27437 + }, + { + "epoch": 8.421731123388582, + "grad_norm": 0.16197362542152405, + "learning_rate": 6.393852777088438e-06, + "loss": 1.7108, + "step": 27438 + }, + { + "epoch": 8.422038060159608, + "grad_norm": 0.2058446705341339, + "learning_rate": 6.391420963250094e-06, + "loss": 1.806, + "step": 27439 + }, + { + "epoch": 8.422344996930633, + "grad_norm": 0.16983431577682495, + "learning_rate": 6.388989580377291e-06, + "loss": 1.7265, + "step": 27440 + }, + { + "epoch": 8.422651933701658, + "grad_norm": 0.15896758437156677, + "learning_rate": 6.386558628494049e-06, + "loss": 1.7081, 
+ "step": 27441 + }, + { + "epoch": 8.422958870472682, + "grad_norm": 0.15534810721874237, + "learning_rate": 6.384128107624399e-06, + "loss": 1.7218, + "step": 27442 + }, + { + "epoch": 8.423265807243707, + "grad_norm": 0.20577791333198547, + "learning_rate": 6.381698017792365e-06, + "loss": 1.7799, + "step": 27443 + }, + { + "epoch": 8.423572744014733, + "grad_norm": 0.183476984500885, + "learning_rate": 6.37926835902195e-06, + "loss": 1.7432, + "step": 27444 + }, + { + "epoch": 8.423879680785758, + "grad_norm": 0.1834617555141449, + "learning_rate": 6.376839131337175e-06, + "loss": 1.7333, + "step": 27445 + }, + { + "epoch": 8.424186617556783, + "grad_norm": 0.15556102991104126, + "learning_rate": 6.374410334762043e-06, + "loss": 1.7119, + "step": 27446 + }, + { + "epoch": 8.424493554327809, + "grad_norm": 0.14469701051712036, + "learning_rate": 6.3719819693205565e-06, + "loss": 1.6883, + "step": 27447 + }, + { + "epoch": 8.424800491098834, + "grad_norm": 0.1339770257472992, + "learning_rate": 6.369554035036706e-06, + "loss": 1.692, + "step": 27448 + }, + { + "epoch": 8.42510742786986, + "grad_norm": 0.18144701421260834, + "learning_rate": 6.367126531934514e-06, + "loss": 1.7192, + "step": 27449 + }, + { + "epoch": 8.425414364640885, + "grad_norm": 0.20075814425945282, + "learning_rate": 6.364699460037931e-06, + "loss": 1.6681, + "step": 27450 + }, + { + "epoch": 8.42572130141191, + "grad_norm": 0.14828181266784668, + "learning_rate": 6.36227281937099e-06, + "loss": 1.6955, + "step": 27451 + }, + { + "epoch": 8.426028238182933, + "grad_norm": 0.1502649486064911, + "learning_rate": 6.35984660995762e-06, + "loss": 1.6695, + "step": 27452 + }, + { + "epoch": 8.426335174953959, + "grad_norm": 0.16594241559505463, + "learning_rate": 6.3574208318218364e-06, + "loss": 1.7092, + "step": 27453 + }, + { + "epoch": 8.426642111724984, + "grad_norm": 0.2585645020008087, + "learning_rate": 6.354995484987597e-06, + "loss": 1.7358, + "step": 27454 + }, + { + "epoch": 
8.42694904849601, + "grad_norm": 0.1694081574678421, + "learning_rate": 6.352570569478877e-06, + "loss": 1.7421, + "step": 27455 + }, + { + "epoch": 8.427255985267035, + "grad_norm": 0.178135946393013, + "learning_rate": 6.350146085319647e-06, + "loss": 1.7157, + "step": 27456 + }, + { + "epoch": 8.42756292203806, + "grad_norm": 0.19647614657878876, + "learning_rate": 6.347722032533837e-06, + "loss": 1.7843, + "step": 27457 + }, + { + "epoch": 8.427869858809085, + "grad_norm": 0.1510474979877472, + "learning_rate": 6.345298411145434e-06, + "loss": 1.688, + "step": 27458 + }, + { + "epoch": 8.42817679558011, + "grad_norm": 0.2130916565656662, + "learning_rate": 6.342875221178374e-06, + "loss": 1.7817, + "step": 27459 + }, + { + "epoch": 8.428483732351136, + "grad_norm": 0.1456206738948822, + "learning_rate": 6.340452462656615e-06, + "loss": 1.6839, + "step": 27460 + }, + { + "epoch": 8.428790669122161, + "grad_norm": 0.16592659056186676, + "learning_rate": 6.338030135604089e-06, + "loss": 1.7395, + "step": 27461 + }, + { + "epoch": 8.429097605893187, + "grad_norm": 0.15017202496528625, + "learning_rate": 6.335608240044744e-06, + "loss": 1.6815, + "step": 27462 + }, + { + "epoch": 8.42940454266421, + "grad_norm": 0.14279332756996155, + "learning_rate": 6.333186776002514e-06, + "loss": 1.6845, + "step": 27463 + }, + { + "epoch": 8.429711479435236, + "grad_norm": 0.15117228031158447, + "learning_rate": 6.330765743501321e-06, + "loss": 1.7421, + "step": 27464 + }, + { + "epoch": 8.430018416206261, + "grad_norm": 0.19822575151920319, + "learning_rate": 6.328345142565084e-06, + "loss": 1.7297, + "step": 27465 + }, + { + "epoch": 8.430325352977286, + "grad_norm": 0.1589222550392151, + "learning_rate": 6.325924973217762e-06, + "loss": 1.7151, + "step": 27466 + }, + { + "epoch": 8.430632289748312, + "grad_norm": 0.19120970368385315, + "learning_rate": 6.323505235483229e-06, + "loss": 1.7373, + "step": 27467 + }, + { + "epoch": 8.430939226519337, + "grad_norm": 
0.1859981119632721, + "learning_rate": 6.321085929385434e-06, + "loss": 1.6912, + "step": 27468 + }, + { + "epoch": 8.431246163290362, + "grad_norm": 0.1745872050523758, + "learning_rate": 6.318667054948246e-06, + "loss": 1.6773, + "step": 27469 + }, + { + "epoch": 8.431553100061388, + "grad_norm": 0.13402412831783295, + "learning_rate": 6.316248612195607e-06, + "loss": 1.6905, + "step": 27470 + }, + { + "epoch": 8.431860036832413, + "grad_norm": 0.22629496455192566, + "learning_rate": 6.3138306011514045e-06, + "loss": 1.7012, + "step": 27471 + }, + { + "epoch": 8.432166973603438, + "grad_norm": 0.18746718764305115, + "learning_rate": 6.31141302183953e-06, + "loss": 1.7573, + "step": 27472 + }, + { + "epoch": 8.432473910374464, + "grad_norm": 0.18313723802566528, + "learning_rate": 6.308995874283891e-06, + "loss": 1.7358, + "step": 27473 + }, + { + "epoch": 8.432780847145487, + "grad_norm": 0.19075456261634827, + "learning_rate": 6.306579158508341e-06, + "loss": 1.7091, + "step": 27474 + }, + { + "epoch": 8.433087783916513, + "grad_norm": 0.18092980980873108, + "learning_rate": 6.304162874536796e-06, + "loss": 1.6739, + "step": 27475 + }, + { + "epoch": 8.433394720687538, + "grad_norm": 0.15624219179153442, + "learning_rate": 6.301747022393123e-06, + "loss": 1.6637, + "step": 27476 + }, + { + "epoch": 8.433701657458563, + "grad_norm": 0.14825348556041718, + "learning_rate": 6.299331602101199e-06, + "loss": 1.6865, + "step": 27477 + }, + { + "epoch": 8.434008594229589, + "grad_norm": 0.2204820215702057, + "learning_rate": 6.2969166136848946e-06, + "loss": 1.7842, + "step": 27478 + }, + { + "epoch": 8.434315531000614, + "grad_norm": 0.15570053458213806, + "learning_rate": 6.294502057168072e-06, + "loss": 1.69, + "step": 27479 + }, + { + "epoch": 8.43462246777164, + "grad_norm": 0.1686720848083496, + "learning_rate": 6.292087932574603e-06, + "loss": 1.6787, + "step": 27480 + }, + { + "epoch": 8.434929404542665, + "grad_norm": 0.2100359946489334, + "learning_rate": 
6.289674239928334e-06, + "loss": 1.7374, + "step": 27481 + }, + { + "epoch": 8.43523634131369, + "grad_norm": 0.1607038378715515, + "learning_rate": 6.287260979253112e-06, + "loss": 1.7067, + "step": 27482 + }, + { + "epoch": 8.435543278084715, + "grad_norm": 0.153702512383461, + "learning_rate": 6.2848481505728254e-06, + "loss": 1.6762, + "step": 27483 + }, + { + "epoch": 8.43585021485574, + "grad_norm": 0.15967734158039093, + "learning_rate": 6.282435753911264e-06, + "loss": 1.6543, + "step": 27484 + }, + { + "epoch": 8.436157151626764, + "grad_norm": 0.18866287171840668, + "learning_rate": 6.280023789292322e-06, + "loss": 1.7481, + "step": 27485 + }, + { + "epoch": 8.43646408839779, + "grad_norm": 0.13347187638282776, + "learning_rate": 6.277612256739784e-06, + "loss": 1.6398, + "step": 27486 + }, + { + "epoch": 8.436771025168815, + "grad_norm": 0.1626890003681183, + "learning_rate": 6.275201156277521e-06, + "loss": 1.7258, + "step": 27487 + }, + { + "epoch": 8.43707796193984, + "grad_norm": 0.21519014239311218, + "learning_rate": 6.272790487929353e-06, + "loss": 1.7762, + "step": 27488 + }, + { + "epoch": 8.437384898710865, + "grad_norm": 0.1610138863325119, + "learning_rate": 6.2703802517190935e-06, + "loss": 1.6999, + "step": 27489 + }, + { + "epoch": 8.43769183548189, + "grad_norm": 0.20251847803592682, + "learning_rate": 6.267970447670579e-06, + "loss": 1.6953, + "step": 27490 + }, + { + "epoch": 8.437998772252916, + "grad_norm": 0.15717832744121552, + "learning_rate": 6.265561075807591e-06, + "loss": 1.623, + "step": 27491 + }, + { + "epoch": 8.438305709023942, + "grad_norm": 0.1399519294500351, + "learning_rate": 6.2631521361539716e-06, + "loss": 1.693, + "step": 27492 + }, + { + "epoch": 8.438612645794967, + "grad_norm": 0.17747904360294342, + "learning_rate": 6.260743628733517e-06, + "loss": 1.7019, + "step": 27493 + }, + { + "epoch": 8.438919582565992, + "grad_norm": 0.1724942922592163, + "learning_rate": 6.258335553570032e-06, + "loss": 1.6647, + 
"step": 27494 + }, + { + "epoch": 8.439226519337016, + "grad_norm": 0.15294337272644043, + "learning_rate": 6.255927910687315e-06, + "loss": 1.7492, + "step": 27495 + }, + { + "epoch": 8.439533456108041, + "grad_norm": 0.16880661249160767, + "learning_rate": 6.253520700109156e-06, + "loss": 1.731, + "step": 27496 + }, + { + "epoch": 8.439840392879066, + "grad_norm": 0.16098125278949738, + "learning_rate": 6.251113921859347e-06, + "loss": 1.6668, + "step": 27497 + }, + { + "epoch": 8.440147329650092, + "grad_norm": 0.17218537628650665, + "learning_rate": 6.248707575961671e-06, + "loss": 1.6943, + "step": 27498 + }, + { + "epoch": 8.440454266421117, + "grad_norm": 0.19593006372451782, + "learning_rate": 6.2463016624398965e-06, + "loss": 1.7213, + "step": 27499 + }, + { + "epoch": 8.440761203192142, + "grad_norm": 0.15833450853824615, + "learning_rate": 6.243896181317837e-06, + "loss": 1.6787, + "step": 27500 + }, + { + "epoch": 8.441068139963168, + "grad_norm": 0.1378611922264099, + "learning_rate": 6.241491132619226e-06, + "loss": 1.6777, + "step": 27501 + }, + { + "epoch": 8.441375076734193, + "grad_norm": 0.25010615587234497, + "learning_rate": 6.239086516367865e-06, + "loss": 1.7474, + "step": 27502 + }, + { + "epoch": 8.441682013505218, + "grad_norm": 0.1281466782093048, + "learning_rate": 6.236682332587474e-06, + "loss": 1.6946, + "step": 27503 + }, + { + "epoch": 8.441988950276244, + "grad_norm": 0.19045543670654297, + "learning_rate": 6.234278581301855e-06, + "loss": 1.7198, + "step": 27504 + }, + { + "epoch": 8.442295887047269, + "grad_norm": 0.17753495275974274, + "learning_rate": 6.231875262534748e-06, + "loss": 1.7324, + "step": 27505 + }, + { + "epoch": 8.442602823818293, + "grad_norm": 0.14088352024555206, + "learning_rate": 6.229472376309897e-06, + "loss": 1.6683, + "step": 27506 + }, + { + "epoch": 8.442909760589318, + "grad_norm": 0.16781100630760193, + "learning_rate": 6.2270699226510685e-06, + "loss": 1.7271, + "step": 27507 + }, + { + "epoch": 
8.443216697360343, + "grad_norm": 0.1857508271932602, + "learning_rate": 6.224667901581971e-06, + "loss": 1.7596, + "step": 27508 + }, + { + "epoch": 8.443523634131369, + "grad_norm": 0.18411888182163239, + "learning_rate": 6.222266313126374e-06, + "loss": 1.8193, + "step": 27509 + }, + { + "epoch": 8.443830570902394, + "grad_norm": 0.1530957967042923, + "learning_rate": 6.2198651573079965e-06, + "loss": 1.6958, + "step": 27510 + }, + { + "epoch": 8.44413750767342, + "grad_norm": 0.19102713465690613, + "learning_rate": 6.217464434150572e-06, + "loss": 1.7172, + "step": 27511 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 0.16886062920093536, + "learning_rate": 6.215064143677829e-06, + "loss": 1.6811, + "step": 27512 + }, + { + "epoch": 8.44475138121547, + "grad_norm": 0.15974819660186768, + "learning_rate": 6.212664285913483e-06, + "loss": 1.694, + "step": 27513 + }, + { + "epoch": 8.445058317986495, + "grad_norm": 0.19709718227386475, + "learning_rate": 6.2102648608812544e-06, + "loss": 1.7647, + "step": 27514 + }, + { + "epoch": 8.44536525475752, + "grad_norm": 0.15339697897434235, + "learning_rate": 6.207865868604857e-06, + "loss": 1.7169, + "step": 27515 + }, + { + "epoch": 8.445672191528546, + "grad_norm": 0.14088544249534607, + "learning_rate": 6.2054673091079815e-06, + "loss": 1.6902, + "step": 27516 + }, + { + "epoch": 8.44597912829957, + "grad_norm": 0.17412640154361725, + "learning_rate": 6.203069182414367e-06, + "loss": 1.7205, + "step": 27517 + }, + { + "epoch": 8.446286065070595, + "grad_norm": 0.18837641179561615, + "learning_rate": 6.200671488547677e-06, + "loss": 1.7756, + "step": 27518 + }, + { + "epoch": 8.44659300184162, + "grad_norm": 0.18904593586921692, + "learning_rate": 6.198274227531642e-06, + "loss": 1.732, + "step": 27519 + }, + { + "epoch": 8.446899938612646, + "grad_norm": 0.13136132061481476, + "learning_rate": 6.19587739938991e-06, + "loss": 1.6844, + "step": 27520 + }, + { + "epoch": 8.44720687538367, + "grad_norm": 
0.15678717195987701, + "learning_rate": 6.1934810041462066e-06, + "loss": 1.7029, + "step": 27521 + }, + { + "epoch": 8.447513812154696, + "grad_norm": 0.1661362200975418, + "learning_rate": 6.191085041824207e-06, + "loss": 1.6656, + "step": 27522 + }, + { + "epoch": 8.447820748925722, + "grad_norm": 0.1749318689107895, + "learning_rate": 6.188689512447565e-06, + "loss": 1.7412, + "step": 27523 + }, + { + "epoch": 8.448127685696747, + "grad_norm": 0.17242331802845, + "learning_rate": 6.18629441603999e-06, + "loss": 1.7037, + "step": 27524 + }, + { + "epoch": 8.448434622467772, + "grad_norm": 0.16092433035373688, + "learning_rate": 6.183899752625116e-06, + "loss": 1.6817, + "step": 27525 + }, + { + "epoch": 8.448741559238798, + "grad_norm": 0.16177381575107574, + "learning_rate": 6.1815055222266325e-06, + "loss": 1.6678, + "step": 27526 + }, + { + "epoch": 8.449048496009823, + "grad_norm": 0.1489405483007431, + "learning_rate": 6.179111724868197e-06, + "loss": 1.6839, + "step": 27527 + }, + { + "epoch": 8.449355432780846, + "grad_norm": 0.15873265266418457, + "learning_rate": 6.176718360573458e-06, + "loss": 1.6749, + "step": 27528 + }, + { + "epoch": 8.449662369551872, + "grad_norm": 0.17511235177516937, + "learning_rate": 6.174325429366079e-06, + "loss": 1.6962, + "step": 27529 + }, + { + "epoch": 8.449969306322897, + "grad_norm": 0.1452886015176773, + "learning_rate": 6.171932931269702e-06, + "loss": 1.7141, + "step": 27530 + }, + { + "epoch": 8.450276243093922, + "grad_norm": 0.20559509098529816, + "learning_rate": 6.169540866307977e-06, + "loss": 1.7116, + "step": 27531 + }, + { + "epoch": 8.450583179864948, + "grad_norm": 0.17642420530319214, + "learning_rate": 6.167149234504532e-06, + "loss": 1.7209, + "step": 27532 + }, + { + "epoch": 8.450890116635973, + "grad_norm": 0.13833492994308472, + "learning_rate": 6.164758035883001e-06, + "loss": 1.6522, + "step": 27533 + }, + { + "epoch": 8.451197053406998, + "grad_norm": 0.18079428374767303, + "learning_rate": 
6.162367270467045e-06, + "loss": 1.7348, + "step": 27534 + }, + { + "epoch": 8.451503990178024, + "grad_norm": 0.19325628876686096, + "learning_rate": 6.159976938280249e-06, + "loss": 1.6947, + "step": 27535 + }, + { + "epoch": 8.45181092694905, + "grad_norm": 0.17844507098197937, + "learning_rate": 6.15758703934628e-06, + "loss": 1.7206, + "step": 27536 + }, + { + "epoch": 8.452117863720074, + "grad_norm": 0.186324343085289, + "learning_rate": 6.155197573688703e-06, + "loss": 1.743, + "step": 27537 + }, + { + "epoch": 8.452424800491098, + "grad_norm": 0.15700562298297882, + "learning_rate": 6.152808541331184e-06, + "loss": 1.7109, + "step": 27538 + }, + { + "epoch": 8.452731737262123, + "grad_norm": 0.13879023492336273, + "learning_rate": 6.150419942297314e-06, + "loss": 1.6737, + "step": 27539 + }, + { + "epoch": 8.453038674033149, + "grad_norm": 0.14589501917362213, + "learning_rate": 6.148031776610675e-06, + "loss": 1.6884, + "step": 27540 + }, + { + "epoch": 8.453345610804174, + "grad_norm": 0.14402590692043304, + "learning_rate": 6.1456440442949125e-06, + "loss": 1.6949, + "step": 27541 + }, + { + "epoch": 8.4536525475752, + "grad_norm": 0.16506166756153107, + "learning_rate": 6.143256745373571e-06, + "loss": 1.725, + "step": 27542 + }, + { + "epoch": 8.453959484346225, + "grad_norm": 0.15663643181324005, + "learning_rate": 6.140869879870287e-06, + "loss": 1.7069, + "step": 27543 + }, + { + "epoch": 8.45426642111725, + "grad_norm": 0.16058720648288727, + "learning_rate": 6.138483447808635e-06, + "loss": 1.7264, + "step": 27544 + }, + { + "epoch": 8.454573357888275, + "grad_norm": 0.23160551488399506, + "learning_rate": 6.136097449212197e-06, + "loss": 1.7573, + "step": 27545 + }, + { + "epoch": 8.4548802946593, + "grad_norm": 0.15130533277988434, + "learning_rate": 6.133711884104554e-06, + "loss": 1.705, + "step": 27546 + }, + { + "epoch": 8.455187231430326, + "grad_norm": 0.16825515031814575, + "learning_rate": 6.131326752509281e-06, + "loss": 1.7405, + 
"step": 27547 + }, + { + "epoch": 8.455494168201351, + "grad_norm": 0.19265486299991608, + "learning_rate": 6.128942054449943e-06, + "loss": 1.7026, + "step": 27548 + }, + { + "epoch": 8.455801104972375, + "grad_norm": 0.18873640894889832, + "learning_rate": 6.126557789950121e-06, + "loss": 1.6825, + "step": 27549 + }, + { + "epoch": 8.4561080417434, + "grad_norm": 0.13833044469356537, + "learning_rate": 6.124173959033358e-06, + "loss": 1.6589, + "step": 27550 + }, + { + "epoch": 8.456414978514426, + "grad_norm": 0.16894219815731049, + "learning_rate": 6.1217905617232394e-06, + "loss": 1.7781, + "step": 27551 + }, + { + "epoch": 8.456721915285451, + "grad_norm": 0.18338344991207123, + "learning_rate": 6.119407598043292e-06, + "loss": 1.7348, + "step": 27552 + }, + { + "epoch": 8.457028852056476, + "grad_norm": 0.17766039073467255, + "learning_rate": 6.117025068017096e-06, + "loss": 1.7126, + "step": 27553 + }, + { + "epoch": 8.457335788827502, + "grad_norm": 0.18717309832572937, + "learning_rate": 6.114642971668155e-06, + "loss": 1.7193, + "step": 27554 + }, + { + "epoch": 8.457642725598527, + "grad_norm": 0.15229196846485138, + "learning_rate": 6.112261309020045e-06, + "loss": 1.665, + "step": 27555 + }, + { + "epoch": 8.457949662369552, + "grad_norm": 0.15391093492507935, + "learning_rate": 6.109880080096303e-06, + "loss": 1.6813, + "step": 27556 + }, + { + "epoch": 8.458256599140578, + "grad_norm": 0.1363036334514618, + "learning_rate": 6.107499284920432e-06, + "loss": 1.6912, + "step": 27557 + }, + { + "epoch": 8.458563535911603, + "grad_norm": 0.15193909406661987, + "learning_rate": 6.105118923516001e-06, + "loss": 1.7219, + "step": 27558 + }, + { + "epoch": 8.458870472682626, + "grad_norm": 0.1312003880739212, + "learning_rate": 6.102738995906487e-06, + "loss": 1.7317, + "step": 27559 + }, + { + "epoch": 8.459177409453652, + "grad_norm": 0.12835659086704254, + "learning_rate": 6.100359502115449e-06, + "loss": 1.6556, + "step": 27560 + }, + { + "epoch": 
8.459484346224677, + "grad_norm": 0.17296236753463745, + "learning_rate": 6.09798044216639e-06, + "loss": 1.7331, + "step": 27561 + }, + { + "epoch": 8.459791282995702, + "grad_norm": 0.1607210338115692, + "learning_rate": 6.095601816082819e-06, + "loss": 1.7297, + "step": 27562 + }, + { + "epoch": 8.460098219766728, + "grad_norm": 0.1841181367635727, + "learning_rate": 6.093223623888245e-06, + "loss": 1.7382, + "step": 27563 + }, + { + "epoch": 8.460405156537753, + "grad_norm": 0.15751226246356964, + "learning_rate": 6.090845865606165e-06, + "loss": 1.6952, + "step": 27564 + }, + { + "epoch": 8.460712093308778, + "grad_norm": 0.15703023970127106, + "learning_rate": 6.0884685412600835e-06, + "loss": 1.7476, + "step": 27565 + }, + { + "epoch": 8.461019030079804, + "grad_norm": 0.17819096148014069, + "learning_rate": 6.0860916508734985e-06, + "loss": 1.7761, + "step": 27566 + }, + { + "epoch": 8.46132596685083, + "grad_norm": 0.168768510222435, + "learning_rate": 6.08371519446988e-06, + "loss": 1.7534, + "step": 27567 + }, + { + "epoch": 8.461632903621854, + "grad_norm": 0.1577196717262268, + "learning_rate": 6.081339172072747e-06, + "loss": 1.6533, + "step": 27568 + }, + { + "epoch": 8.46193984039288, + "grad_norm": 0.19285355508327484, + "learning_rate": 6.078963583705544e-06, + "loss": 1.7127, + "step": 27569 + }, + { + "epoch": 8.462246777163903, + "grad_norm": 0.15905390679836273, + "learning_rate": 6.076588429391788e-06, + "loss": 1.6851, + "step": 27570 + }, + { + "epoch": 8.462553713934929, + "grad_norm": 0.14860354363918304, + "learning_rate": 6.074213709154908e-06, + "loss": 1.7016, + "step": 27571 + }, + { + "epoch": 8.462860650705954, + "grad_norm": 0.2003553956747055, + "learning_rate": 6.0718394230184e-06, + "loss": 1.819, + "step": 27572 + }, + { + "epoch": 8.46316758747698, + "grad_norm": 0.1739475131034851, + "learning_rate": 6.069465571005733e-06, + "loss": 1.7539, + "step": 27573 + }, + { + "epoch": 8.463474524248005, + "grad_norm": 
0.20145776867866516, + "learning_rate": 6.067092153140341e-06, + "loss": 1.7472, + "step": 27574 + }, + { + "epoch": 8.46378146101903, + "grad_norm": 0.2065812349319458, + "learning_rate": 6.06471916944571e-06, + "loss": 1.7871, + "step": 27575 + }, + { + "epoch": 8.464088397790055, + "grad_norm": 0.16987882554531097, + "learning_rate": 6.0623466199452585e-06, + "loss": 1.7299, + "step": 27576 + }, + { + "epoch": 8.46439533456108, + "grad_norm": 0.1477213054895401, + "learning_rate": 6.059974504662458e-06, + "loss": 1.6829, + "step": 27577 + }, + { + "epoch": 8.464702271332106, + "grad_norm": 0.16443482041358948, + "learning_rate": 6.05760282362074e-06, + "loss": 1.7352, + "step": 27578 + }, + { + "epoch": 8.465009208103131, + "grad_norm": 0.15927115082740784, + "learning_rate": 6.055231576843551e-06, + "loss": 1.7175, + "step": 27579 + }, + { + "epoch": 8.465316144874157, + "grad_norm": 0.17477387189865112, + "learning_rate": 6.052860764354318e-06, + "loss": 1.6609, + "step": 27580 + }, + { + "epoch": 8.46562308164518, + "grad_norm": 0.22039631009101868, + "learning_rate": 6.050490386176477e-06, + "loss": 1.7664, + "step": 27581 + }, + { + "epoch": 8.465930018416206, + "grad_norm": 0.1699618101119995, + "learning_rate": 6.048120442333449e-06, + "loss": 1.7231, + "step": 27582 + }, + { + "epoch": 8.466236955187231, + "grad_norm": 0.1548585742712021, + "learning_rate": 6.045750932848654e-06, + "loss": 1.7503, + "step": 27583 + }, + { + "epoch": 8.466543891958256, + "grad_norm": 0.17046836018562317, + "learning_rate": 6.043381857745506e-06, + "loss": 1.6993, + "step": 27584 + }, + { + "epoch": 8.466850828729282, + "grad_norm": 0.1857844740152359, + "learning_rate": 6.041013217047431e-06, + "loss": 1.7132, + "step": 27585 + }, + { + "epoch": 8.467157765500307, + "grad_norm": 0.15656128525733948, + "learning_rate": 6.0386450107778105e-06, + "loss": 1.6713, + "step": 27586 + }, + { + "epoch": 8.467464702271332, + "grad_norm": 0.20369650423526764, + "learning_rate": 
6.036277238960092e-06, + "loss": 1.7296, + "step": 27587 + }, + { + "epoch": 8.467771639042358, + "grad_norm": 0.15926989912986755, + "learning_rate": 6.0339099016176295e-06, + "loss": 1.6766, + "step": 27588 + }, + { + "epoch": 8.468078575813383, + "grad_norm": 0.16353332996368408, + "learning_rate": 6.0315429987738596e-06, + "loss": 1.7084, + "step": 27589 + }, + { + "epoch": 8.468385512584408, + "grad_norm": 0.16328907012939453, + "learning_rate": 6.029176530452141e-06, + "loss": 1.715, + "step": 27590 + }, + { + "epoch": 8.468692449355434, + "grad_norm": 0.20153367519378662, + "learning_rate": 6.026810496675861e-06, + "loss": 1.7363, + "step": 27591 + }, + { + "epoch": 8.468999386126457, + "grad_norm": 0.1374381184577942, + "learning_rate": 6.024444897468435e-06, + "loss": 1.6633, + "step": 27592 + }, + { + "epoch": 8.469306322897483, + "grad_norm": 0.20331406593322754, + "learning_rate": 6.022079732853198e-06, + "loss": 1.7544, + "step": 27593 + }, + { + "epoch": 8.469613259668508, + "grad_norm": 0.18052712082862854, + "learning_rate": 6.019715002853554e-06, + "loss": 1.7032, + "step": 27594 + }, + { + "epoch": 8.469920196439533, + "grad_norm": 0.18305034935474396, + "learning_rate": 6.017350707492863e-06, + "loss": 1.7249, + "step": 27595 + }, + { + "epoch": 8.470227133210559, + "grad_norm": 0.1608239710330963, + "learning_rate": 6.014986846794496e-06, + "loss": 1.7049, + "step": 27596 + }, + { + "epoch": 8.470534069981584, + "grad_norm": 0.16582928597927094, + "learning_rate": 6.012623420781804e-06, + "loss": 1.6777, + "step": 27597 + }, + { + "epoch": 8.47084100675261, + "grad_norm": 0.18023556470870972, + "learning_rate": 6.010260429478154e-06, + "loss": 1.6996, + "step": 27598 + }, + { + "epoch": 8.471147943523635, + "grad_norm": 0.1994815319776535, + "learning_rate": 6.007897872906892e-06, + "loss": 1.7455, + "step": 27599 + }, + { + "epoch": 8.47145488029466, + "grad_norm": 0.17772625386714935, + "learning_rate": 6.005535751091368e-06, + "loss": 1.7431, 
+ "step": 27600 + }, + { + "epoch": 8.471761817065685, + "grad_norm": 0.17297807335853577, + "learning_rate": 6.003174064054929e-06, + "loss": 1.7087, + "step": 27601 + }, + { + "epoch": 8.472068753836709, + "grad_norm": 0.14986321330070496, + "learning_rate": 6.000812811820905e-06, + "loss": 1.681, + "step": 27602 + }, + { + "epoch": 8.472375690607734, + "grad_norm": 0.17512932419776917, + "learning_rate": 5.998451994412629e-06, + "loss": 1.7669, + "step": 27603 + }, + { + "epoch": 8.47268262737876, + "grad_norm": 0.18424493074417114, + "learning_rate": 5.996091611853466e-06, + "loss": 1.7296, + "step": 27604 + }, + { + "epoch": 8.472989564149785, + "grad_norm": 0.1246834322810173, + "learning_rate": 5.9937316641666906e-06, + "loss": 1.6747, + "step": 27605 + }, + { + "epoch": 8.47329650092081, + "grad_norm": 0.14435335993766785, + "learning_rate": 5.991372151375674e-06, + "loss": 1.6225, + "step": 27606 + }, + { + "epoch": 8.473603437691835, + "grad_norm": 0.16726957261562347, + "learning_rate": 5.989013073503702e-06, + "loss": 1.7052, + "step": 27607 + }, + { + "epoch": 8.47391037446286, + "grad_norm": 0.15307356417179108, + "learning_rate": 5.98665443057409e-06, + "loss": 1.7199, + "step": 27608 + }, + { + "epoch": 8.474217311233886, + "grad_norm": 0.14373189210891724, + "learning_rate": 5.984296222610175e-06, + "loss": 1.6808, + "step": 27609 + }, + { + "epoch": 8.474524248004911, + "grad_norm": 0.13142740726470947, + "learning_rate": 5.981938449635222e-06, + "loss": 1.6868, + "step": 27610 + }, + { + "epoch": 8.474831184775937, + "grad_norm": 0.13838545978069305, + "learning_rate": 5.979581111672572e-06, + "loss": 1.6723, + "step": 27611 + }, + { + "epoch": 8.475138121546962, + "grad_norm": 0.15346096456050873, + "learning_rate": 5.977224208745485e-06, + "loss": 1.7066, + "step": 27612 + }, + { + "epoch": 8.475445058317986, + "grad_norm": 0.127261221408844, + "learning_rate": 5.974867740877283e-06, + "loss": 1.6285, + "step": 27613 + }, + { + "epoch": 
8.475751995089011, + "grad_norm": 0.12636838853359222, + "learning_rate": 5.972511708091239e-06, + "loss": 1.6707, + "step": 27614 + }, + { + "epoch": 8.476058931860036, + "grad_norm": 0.22297553718090057, + "learning_rate": 5.970156110410641e-06, + "loss": 1.693, + "step": 27615 + }, + { + "epoch": 8.476365868631062, + "grad_norm": 0.21933813393115997, + "learning_rate": 5.967800947858765e-06, + "loss": 1.7622, + "step": 27616 + }, + { + "epoch": 8.476672805402087, + "grad_norm": 0.19202767312526703, + "learning_rate": 5.965446220458887e-06, + "loss": 1.723, + "step": 27617 + }, + { + "epoch": 8.476979742173112, + "grad_norm": 0.13845433294773102, + "learning_rate": 5.963091928234283e-06, + "loss": 1.6824, + "step": 27618 + }, + { + "epoch": 8.477286678944138, + "grad_norm": 0.1829427033662796, + "learning_rate": 5.960738071208211e-06, + "loss": 1.7441, + "step": 27619 + }, + { + "epoch": 8.477593615715163, + "grad_norm": 0.17720428109169006, + "learning_rate": 5.958384649403931e-06, + "loss": 1.7108, + "step": 27620 + }, + { + "epoch": 8.477900552486188, + "grad_norm": 0.12632785737514496, + "learning_rate": 5.95603166284473e-06, + "loss": 1.6762, + "step": 27621 + }, + { + "epoch": 8.478207489257214, + "grad_norm": 0.15774594247341156, + "learning_rate": 5.953679111553812e-06, + "loss": 1.7076, + "step": 27622 + }, + { + "epoch": 8.478514426028239, + "grad_norm": 0.16115643084049225, + "learning_rate": 5.9513269955544795e-06, + "loss": 1.757, + "step": 27623 + }, + { + "epoch": 8.478821362799263, + "grad_norm": 0.13887029886245728, + "learning_rate": 5.948975314869937e-06, + "loss": 1.7462, + "step": 27624 + }, + { + "epoch": 8.479128299570288, + "grad_norm": 0.1517426073551178, + "learning_rate": 5.946624069523432e-06, + "loss": 1.6912, + "step": 27625 + }, + { + "epoch": 8.479435236341313, + "grad_norm": 0.15509237349033356, + "learning_rate": 5.94427325953823e-06, + "loss": 1.7022, + "step": 27626 + }, + { + "epoch": 8.479742173112339, + "grad_norm": 
0.1656811237335205, + "learning_rate": 5.9419228849375175e-06, + "loss": 1.713, + "step": 27627 + }, + { + "epoch": 8.480049109883364, + "grad_norm": 0.2257215678691864, + "learning_rate": 5.93957294574457e-06, + "loss": 1.7452, + "step": 27628 + }, + { + "epoch": 8.48035604665439, + "grad_norm": 0.15382499992847443, + "learning_rate": 5.9372234419825645e-06, + "loss": 1.7056, + "step": 27629 + }, + { + "epoch": 8.480662983425415, + "grad_norm": 0.1773097813129425, + "learning_rate": 5.934874373674754e-06, + "loss": 1.7161, + "step": 27630 + }, + { + "epoch": 8.48096992019644, + "grad_norm": 0.16455380618572235, + "learning_rate": 5.932525740844341e-06, + "loss": 1.7454, + "step": 27631 + }, + { + "epoch": 8.481276856967465, + "grad_norm": 0.15213815867900848, + "learning_rate": 5.930177543514542e-06, + "loss": 1.7049, + "step": 27632 + }, + { + "epoch": 8.48158379373849, + "grad_norm": 0.17395392060279846, + "learning_rate": 5.927829781708555e-06, + "loss": 1.7026, + "step": 27633 + }, + { + "epoch": 8.481890730509516, + "grad_norm": 0.18553678691387177, + "learning_rate": 5.925482455449588e-06, + "loss": 1.7437, + "step": 27634 + }, + { + "epoch": 8.48219766728054, + "grad_norm": 0.15735404193401337, + "learning_rate": 5.9231355647608346e-06, + "loss": 1.7171, + "step": 27635 + }, + { + "epoch": 8.482504604051565, + "grad_norm": 0.14466318488121033, + "learning_rate": 5.920789109665487e-06, + "loss": 1.6698, + "step": 27636 + }, + { + "epoch": 8.48281154082259, + "grad_norm": 0.159750834107399, + "learning_rate": 5.918443090186732e-06, + "loss": 1.7045, + "step": 27637 + }, + { + "epoch": 8.483118477593615, + "grad_norm": 0.14026959240436554, + "learning_rate": 5.916097506347773e-06, + "loss": 1.6751, + "step": 27638 + }, + { + "epoch": 8.48342541436464, + "grad_norm": 0.18119752407073975, + "learning_rate": 5.913752358171765e-06, + "loss": 1.7768, + "step": 27639 + }, + { + "epoch": 8.483732351135666, + "grad_norm": 0.20957626402378082, + "learning_rate": 
5.91140764568191e-06, + "loss": 1.72, + "step": 27640 + }, + { + "epoch": 8.484039287906691, + "grad_norm": 0.1649177372455597, + "learning_rate": 5.909063368901357e-06, + "loss": 1.6938, + "step": 27641 + }, + { + "epoch": 8.484346224677717, + "grad_norm": 0.17464084923267365, + "learning_rate": 5.906719527853271e-06, + "loss": 1.7369, + "step": 27642 + }, + { + "epoch": 8.484653161448742, + "grad_norm": 0.14213840663433075, + "learning_rate": 5.90437612256085e-06, + "loss": 1.6985, + "step": 27643 + }, + { + "epoch": 8.484960098219767, + "grad_norm": 0.2008642852306366, + "learning_rate": 5.902033153047209e-06, + "loss": 1.7394, + "step": 27644 + }, + { + "epoch": 8.485267034990791, + "grad_norm": 0.15051651000976562, + "learning_rate": 5.899690619335541e-06, + "loss": 1.6729, + "step": 27645 + }, + { + "epoch": 8.485573971761816, + "grad_norm": 0.17977653443813324, + "learning_rate": 5.897348521448958e-06, + "loss": 1.7501, + "step": 27646 + }, + { + "epoch": 8.485880908532842, + "grad_norm": 0.2593468427658081, + "learning_rate": 5.89500685941064e-06, + "loss": 1.7174, + "step": 27647 + }, + { + "epoch": 8.486187845303867, + "grad_norm": 0.23924550414085388, + "learning_rate": 5.8926656332437105e-06, + "loss": 1.7383, + "step": 27648 + }, + { + "epoch": 8.486494782074892, + "grad_norm": 0.1751977503299713, + "learning_rate": 5.8903248429713124e-06, + "loss": 1.7024, + "step": 27649 + }, + { + "epoch": 8.486801718845918, + "grad_norm": 0.21737132966518402, + "learning_rate": 5.887984488616582e-06, + "loss": 1.7214, + "step": 27650 + }, + { + "epoch": 8.487108655616943, + "grad_norm": 0.2042747437953949, + "learning_rate": 5.885644570202636e-06, + "loss": 1.7126, + "step": 27651 + }, + { + "epoch": 8.487415592387968, + "grad_norm": 0.14556188881397247, + "learning_rate": 5.883305087752611e-06, + "loss": 1.6919, + "step": 27652 + }, + { + "epoch": 8.487722529158994, + "grad_norm": 0.210098534822464, + "learning_rate": 5.880966041289626e-06, + "loss": 1.6728, + 
"step": 27653 + }, + { + "epoch": 8.488029465930019, + "grad_norm": 0.26891016960144043, + "learning_rate": 5.878627430836781e-06, + "loss": 1.7356, + "step": 27654 + }, + { + "epoch": 8.488336402701044, + "grad_norm": 0.13008984923362732, + "learning_rate": 5.876289256417217e-06, + "loss": 1.6685, + "step": 27655 + }, + { + "epoch": 8.488643339472068, + "grad_norm": 0.2077993005514145, + "learning_rate": 5.873951518054005e-06, + "loss": 1.6983, + "step": 27656 + }, + { + "epoch": 8.488950276243093, + "grad_norm": 0.19198927283287048, + "learning_rate": 5.871614215770294e-06, + "loss": 1.6703, + "step": 27657 + }, + { + "epoch": 8.489257213014119, + "grad_norm": 0.18122628331184387, + "learning_rate": 5.869277349589137e-06, + "loss": 1.8012, + "step": 27658 + }, + { + "epoch": 8.489564149785144, + "grad_norm": 0.2359529435634613, + "learning_rate": 5.866940919533642e-06, + "loss": 1.7194, + "step": 27659 + }, + { + "epoch": 8.48987108655617, + "grad_norm": 0.15916365385055542, + "learning_rate": 5.864604925626921e-06, + "loss": 1.6929, + "step": 27660 + }, + { + "epoch": 8.490178023327195, + "grad_norm": 0.16607709228992462, + "learning_rate": 5.862269367892026e-06, + "loss": 1.7001, + "step": 27661 + }, + { + "epoch": 8.49048496009822, + "grad_norm": 0.17609505355358124, + "learning_rate": 5.859934246352072e-06, + "loss": 1.736, + "step": 27662 + }, + { + "epoch": 8.490791896869245, + "grad_norm": 0.17898498475551605, + "learning_rate": 5.857599561030103e-06, + "loss": 1.7397, + "step": 27663 + }, + { + "epoch": 8.49109883364027, + "grad_norm": 0.17502975463867188, + "learning_rate": 5.855265311949215e-06, + "loss": 1.6874, + "step": 27664 + }, + { + "epoch": 8.491405770411296, + "grad_norm": 0.16041016578674316, + "learning_rate": 5.852931499132469e-06, + "loss": 1.7494, + "step": 27665 + }, + { + "epoch": 8.491712707182321, + "grad_norm": 0.12939618527889252, + "learning_rate": 5.850598122602929e-06, + "loss": 1.6397, + "step": 27666 + }, + { + "epoch": 
8.492019643953345, + "grad_norm": 0.1685323715209961, + "learning_rate": 5.848265182383656e-06, + "loss": 1.7465, + "step": 27667 + }, + { + "epoch": 8.49232658072437, + "grad_norm": 0.14007940888404846, + "learning_rate": 5.845932678497707e-06, + "loss": 1.6718, + "step": 27668 + }, + { + "epoch": 8.492633517495396, + "grad_norm": 0.14807704091072083, + "learning_rate": 5.843600610968125e-06, + "loss": 1.6858, + "step": 27669 + }, + { + "epoch": 8.49294045426642, + "grad_norm": 0.14770758152008057, + "learning_rate": 5.841268979817965e-06, + "loss": 1.6655, + "step": 27670 + }, + { + "epoch": 8.493247391037446, + "grad_norm": 0.13218273222446442, + "learning_rate": 5.838937785070258e-06, + "loss": 1.7132, + "step": 27671 + }, + { + "epoch": 8.493554327808472, + "grad_norm": 0.1349583864212036, + "learning_rate": 5.836607026748076e-06, + "loss": 1.6704, + "step": 27672 + }, + { + "epoch": 8.493861264579497, + "grad_norm": 0.22880202531814575, + "learning_rate": 5.834276704874403e-06, + "loss": 1.7297, + "step": 27673 + }, + { + "epoch": 8.494168201350522, + "grad_norm": 0.17375829815864563, + "learning_rate": 5.831946819472317e-06, + "loss": 1.6857, + "step": 27674 + }, + { + "epoch": 8.494475138121548, + "grad_norm": 0.15201902389526367, + "learning_rate": 5.829617370564805e-06, + "loss": 1.7148, + "step": 27675 + }, + { + "epoch": 8.494782074892573, + "grad_norm": 0.1489444226026535, + "learning_rate": 5.827288358174898e-06, + "loss": 1.7477, + "step": 27676 + }, + { + "epoch": 8.495089011663598, + "grad_norm": 0.1331137716770172, + "learning_rate": 5.824959782325634e-06, + "loss": 1.7282, + "step": 27677 + }, + { + "epoch": 8.495395948434622, + "grad_norm": 0.1779918074607849, + "learning_rate": 5.822631643039994e-06, + "loss": 1.6677, + "step": 27678 + }, + { + "epoch": 8.495702885205647, + "grad_norm": 0.17707432806491852, + "learning_rate": 5.820303940341021e-06, + "loss": 1.7627, + "step": 27679 + }, + { + "epoch": 8.496009821976672, + "grad_norm": 
0.19686660170555115, + "learning_rate": 5.817976674251674e-06, + "loss": 1.8057, + "step": 27680 + }, + { + "epoch": 8.496316758747698, + "grad_norm": 0.17378473281860352, + "learning_rate": 5.81564984479499e-06, + "loss": 1.763, + "step": 27681 + }, + { + "epoch": 8.496623695518723, + "grad_norm": 0.13753214478492737, + "learning_rate": 5.813323451993952e-06, + "loss": 1.6567, + "step": 27682 + }, + { + "epoch": 8.496930632289748, + "grad_norm": 0.19319739937782288, + "learning_rate": 5.810997495871551e-06, + "loss": 1.7447, + "step": 27683 + }, + { + "epoch": 8.497237569060774, + "grad_norm": 0.1459372490644455, + "learning_rate": 5.808671976450775e-06, + "loss": 1.6978, + "step": 27684 + }, + { + "epoch": 8.497544505831799, + "grad_norm": 0.1829099804162979, + "learning_rate": 5.806346893754599e-06, + "loss": 1.7399, + "step": 27685 + }, + { + "epoch": 8.497851442602824, + "grad_norm": 0.14952246844768524, + "learning_rate": 5.804022247806007e-06, + "loss": 1.683, + "step": 27686 + }, + { + "epoch": 8.49815837937385, + "grad_norm": 0.14325882494449615, + "learning_rate": 5.801698038627973e-06, + "loss": 1.689, + "step": 27687 + }, + { + "epoch": 8.498465316144873, + "grad_norm": 0.17999286949634552, + "learning_rate": 5.799374266243451e-06, + "loss": 1.7358, + "step": 27688 + }, + { + "epoch": 8.498772252915899, + "grad_norm": 0.17262579500675201, + "learning_rate": 5.797050930675441e-06, + "loss": 1.7249, + "step": 27689 + }, + { + "epoch": 8.499079189686924, + "grad_norm": 0.17032817006111145, + "learning_rate": 5.794728031946861e-06, + "loss": 1.7124, + "step": 27690 + }, + { + "epoch": 8.49938612645795, + "grad_norm": 0.16629208624362946, + "learning_rate": 5.7924055700807115e-06, + "loss": 1.6981, + "step": 27691 + }, + { + "epoch": 8.499693063228975, + "grad_norm": 0.19601507484912872, + "learning_rate": 5.7900835450999115e-06, + "loss": 1.6582, + "step": 27692 + }, + { + "epoch": 8.5, + "grad_norm": 0.2122369408607483, + "learning_rate": 
5.787761957027405e-06, + "loss": 1.7509, + "step": 27693 + }, + { + "epoch": 8.500306936771025, + "grad_norm": 0.16086016595363617, + "learning_rate": 5.785440805886166e-06, + "loss": 1.7011, + "step": 27694 + }, + { + "epoch": 8.50061387354205, + "grad_norm": 0.15793873369693756, + "learning_rate": 5.783120091699101e-06, + "loss": 1.6879, + "step": 27695 + }, + { + "epoch": 8.500920810313076, + "grad_norm": 0.15392783284187317, + "learning_rate": 5.7807998144891735e-06, + "loss": 1.6973, + "step": 27696 + }, + { + "epoch": 8.501227747084101, + "grad_norm": 0.17782802879810333, + "learning_rate": 5.778479974279288e-06, + "loss": 1.7319, + "step": 27697 + }, + { + "epoch": 8.501534683855127, + "grad_norm": 0.139020636677742, + "learning_rate": 5.776160571092387e-06, + "loss": 1.6655, + "step": 27698 + }, + { + "epoch": 8.50184162062615, + "grad_norm": 0.1582586020231247, + "learning_rate": 5.773841604951391e-06, + "loss": 1.7134, + "step": 27699 + }, + { + "epoch": 8.502148557397176, + "grad_norm": 0.1685703545808792, + "learning_rate": 5.77152307587921e-06, + "loss": 1.7504, + "step": 27700 + }, + { + "epoch": 8.502455494168201, + "grad_norm": 0.15043340623378754, + "learning_rate": 5.769204983898763e-06, + "loss": 1.6837, + "step": 27701 + }, + { + "epoch": 8.502762430939226, + "grad_norm": 0.18134978413581848, + "learning_rate": 5.7668873290329605e-06, + "loss": 1.7698, + "step": 27702 + }, + { + "epoch": 8.503069367710252, + "grad_norm": 0.18589314818382263, + "learning_rate": 5.764570111304696e-06, + "loss": 1.7565, + "step": 27703 + }, + { + "epoch": 8.503376304481277, + "grad_norm": 0.17075087130069733, + "learning_rate": 5.762253330736883e-06, + "loss": 1.6888, + "step": 27704 + }, + { + "epoch": 8.503683241252302, + "grad_norm": 0.13238663971424103, + "learning_rate": 5.759936987352399e-06, + "loss": 1.6708, + "step": 27705 + }, + { + "epoch": 8.503990178023328, + "grad_norm": 0.1714777648448944, + "learning_rate": 5.75762108117417e-06, + "loss": 1.6934, + 
"step": 27706 + }, + { + "epoch": 8.504297114794353, + "grad_norm": 0.13476133346557617, + "learning_rate": 5.755305612225037e-06, + "loss": 1.707, + "step": 27707 + }, + { + "epoch": 8.504604051565378, + "grad_norm": 0.1355150043964386, + "learning_rate": 5.7529905805279285e-06, + "loss": 1.695, + "step": 27708 + }, + { + "epoch": 8.504910988336402, + "grad_norm": 0.15239351987838745, + "learning_rate": 5.750675986105686e-06, + "loss": 1.7146, + "step": 27709 + }, + { + "epoch": 8.505217925107427, + "grad_norm": 0.1348891258239746, + "learning_rate": 5.748361828981197e-06, + "loss": 1.7087, + "step": 27710 + }, + { + "epoch": 8.505524861878452, + "grad_norm": 0.1657278686761856, + "learning_rate": 5.746048109177349e-06, + "loss": 1.7222, + "step": 27711 + }, + { + "epoch": 8.505831798649478, + "grad_norm": 0.17044055461883545, + "learning_rate": 5.743734826716967e-06, + "loss": 1.7917, + "step": 27712 + }, + { + "epoch": 8.506138735420503, + "grad_norm": 0.13258327543735504, + "learning_rate": 5.741421981622963e-06, + "loss": 1.6859, + "step": 27713 + }, + { + "epoch": 8.506445672191528, + "grad_norm": 0.13243085145950317, + "learning_rate": 5.7391095739181495e-06, + "loss": 1.6832, + "step": 27714 + }, + { + "epoch": 8.506752608962554, + "grad_norm": 0.14863869547843933, + "learning_rate": 5.736797603625405e-06, + "loss": 1.6961, + "step": 27715 + }, + { + "epoch": 8.50705954573358, + "grad_norm": 0.13942895829677582, + "learning_rate": 5.73448607076757e-06, + "loss": 1.6847, + "step": 27716 + }, + { + "epoch": 8.507366482504604, + "grad_norm": 0.13684460520744324, + "learning_rate": 5.732174975367482e-06, + "loss": 1.6888, + "step": 27717 + }, + { + "epoch": 8.50767341927563, + "grad_norm": 0.1887209117412567, + "learning_rate": 5.7298643174479974e-06, + "loss": 1.7091, + "step": 27718 + }, + { + "epoch": 8.507980356046655, + "grad_norm": 0.17502547800540924, + "learning_rate": 5.727554097031934e-06, + "loss": 1.7103, + "step": 27719 + }, + { + "epoch": 
8.50828729281768, + "grad_norm": 0.17275308072566986, + "learning_rate": 5.725244314142137e-06, + "loss": 1.7392, + "step": 27720 + }, + { + "epoch": 8.508594229588704, + "grad_norm": 0.13890086114406586, + "learning_rate": 5.722934968801419e-06, + "loss": 1.6711, + "step": 27721 + }, + { + "epoch": 8.50890116635973, + "grad_norm": 0.16987508535385132, + "learning_rate": 5.720626061032603e-06, + "loss": 1.6784, + "step": 27722 + }, + { + "epoch": 8.509208103130755, + "grad_norm": 0.12734577059745789, + "learning_rate": 5.718317590858529e-06, + "loss": 1.668, + "step": 27723 + }, + { + "epoch": 8.50951503990178, + "grad_norm": 0.17097610235214233, + "learning_rate": 5.716009558301977e-06, + "loss": 1.7419, + "step": 27724 + }, + { + "epoch": 8.509821976672805, + "grad_norm": 0.15415556728839874, + "learning_rate": 5.713701963385798e-06, + "loss": 1.6794, + "step": 27725 + }, + { + "epoch": 8.51012891344383, + "grad_norm": 0.115156389772892, + "learning_rate": 5.711394806132758e-06, + "loss": 1.6364, + "step": 27726 + }, + { + "epoch": 8.510435850214856, + "grad_norm": 0.1583303064107895, + "learning_rate": 5.709088086565667e-06, + "loss": 1.7185, + "step": 27727 + }, + { + "epoch": 8.510742786985881, + "grad_norm": 0.17150144279003143, + "learning_rate": 5.706781804707345e-06, + "loss": 1.7122, + "step": 27728 + }, + { + "epoch": 8.511049723756907, + "grad_norm": 0.14469772577285767, + "learning_rate": 5.7044759605805464e-06, + "loss": 1.6806, + "step": 27729 + }, + { + "epoch": 8.511356660527932, + "grad_norm": 0.1671745926141739, + "learning_rate": 5.702170554208102e-06, + "loss": 1.7051, + "step": 27730 + }, + { + "epoch": 8.511663597298956, + "grad_norm": 0.14769956469535828, + "learning_rate": 5.699865585612746e-06, + "loss": 1.7052, + "step": 27731 + }, + { + "epoch": 8.511970534069981, + "grad_norm": 0.17527055740356445, + "learning_rate": 5.697561054817296e-06, + "loss": 1.7397, + "step": 27732 + }, + { + "epoch": 8.512277470841006, + "grad_norm": 
0.16712914407253265, + "learning_rate": 5.695256961844519e-06, + "loss": 1.7025, + "step": 27733 + }, + { + "epoch": 8.512584407612032, + "grad_norm": 0.14546720683574677, + "learning_rate": 5.6929533067171745e-06, + "loss": 1.667, + "step": 27734 + }, + { + "epoch": 8.512891344383057, + "grad_norm": 0.1326368749141693, + "learning_rate": 5.690650089458038e-06, + "loss": 1.7109, + "step": 27735 + }, + { + "epoch": 8.513198281154082, + "grad_norm": 0.14168506860733032, + "learning_rate": 5.688347310089864e-06, + "loss": 1.6497, + "step": 27736 + }, + { + "epoch": 8.513505217925108, + "grad_norm": 0.18198592960834503, + "learning_rate": 5.686044968635418e-06, + "loss": 1.7167, + "step": 27737 + }, + { + "epoch": 8.513812154696133, + "grad_norm": 0.14291147887706757, + "learning_rate": 5.683743065117447e-06, + "loss": 1.6855, + "step": 27738 + }, + { + "epoch": 8.514119091467158, + "grad_norm": 0.17336830496788025, + "learning_rate": 5.681441599558701e-06, + "loss": 1.738, + "step": 27739 + }, + { + "epoch": 8.514426028238184, + "grad_norm": 0.1447203904390335, + "learning_rate": 5.679140571981922e-06, + "loss": 1.7217, + "step": 27740 + }, + { + "epoch": 8.514732965009209, + "grad_norm": 0.19665221869945526, + "learning_rate": 5.676839982409849e-06, + "loss": 1.7395, + "step": 27741 + }, + { + "epoch": 8.515039901780233, + "grad_norm": 0.1405279040336609, + "learning_rate": 5.6745398308652386e-06, + "loss": 1.6559, + "step": 27742 + }, + { + "epoch": 8.515346838551258, + "grad_norm": 0.15195727348327637, + "learning_rate": 5.672240117370797e-06, + "loss": 1.6977, + "step": 27743 + }, + { + "epoch": 8.515653775322283, + "grad_norm": 0.11381472647190094, + "learning_rate": 5.669940841949261e-06, + "loss": 1.6594, + "step": 27744 + }, + { + "epoch": 8.515960712093309, + "grad_norm": 0.17271532118320465, + "learning_rate": 5.667642004623347e-06, + "loss": 1.7323, + "step": 27745 + }, + { + "epoch": 8.516267648864334, + "grad_norm": 0.15365839004516602, + "learning_rate": 
5.665343605415774e-06, + "loss": 1.7257, + "step": 27746 + }, + { + "epoch": 8.51657458563536, + "grad_norm": 0.22701260447502136, + "learning_rate": 5.66304564434928e-06, + "loss": 1.6939, + "step": 27747 + }, + { + "epoch": 8.516881522406385, + "grad_norm": 0.14642612636089325, + "learning_rate": 5.660748121446535e-06, + "loss": 1.6985, + "step": 27748 + }, + { + "epoch": 8.51718845917741, + "grad_norm": 0.1659226268529892, + "learning_rate": 5.658451036730272e-06, + "loss": 1.7439, + "step": 27749 + }, + { + "epoch": 8.517495395948435, + "grad_norm": 0.14763525128364563, + "learning_rate": 5.65615439022319e-06, + "loss": 1.6714, + "step": 27750 + }, + { + "epoch": 8.51780233271946, + "grad_norm": 0.17457270622253418, + "learning_rate": 5.65385818194798e-06, + "loss": 1.7214, + "step": 27751 + }, + { + "epoch": 8.518109269490484, + "grad_norm": 0.15170279145240784, + "learning_rate": 5.651562411927335e-06, + "loss": 1.7121, + "step": 27752 + }, + { + "epoch": 8.51841620626151, + "grad_norm": 0.16129034757614136, + "learning_rate": 5.649267080183945e-06, + "loss": 1.6916, + "step": 27753 + }, + { + "epoch": 8.518723143032535, + "grad_norm": 0.20800361037254333, + "learning_rate": 5.64697218674049e-06, + "loss": 1.7482, + "step": 27754 + }, + { + "epoch": 8.51903007980356, + "grad_norm": 0.16350114345550537, + "learning_rate": 5.644677731619652e-06, + "loss": 1.6705, + "step": 27755 + }, + { + "epoch": 8.519337016574585, + "grad_norm": 0.15720658004283905, + "learning_rate": 5.642383714844107e-06, + "loss": 1.6871, + "step": 27756 + }, + { + "epoch": 8.51964395334561, + "grad_norm": 0.21885983645915985, + "learning_rate": 5.640090136436526e-06, + "loss": 1.7057, + "step": 27757 + }, + { + "epoch": 8.519950890116636, + "grad_norm": 0.1411464810371399, + "learning_rate": 5.637796996419564e-06, + "loss": 1.7103, + "step": 27758 + }, + { + "epoch": 8.520257826887661, + "grad_norm": 0.14518170058727264, + "learning_rate": 5.635504294815913e-06, + "loss": 1.7184, + 
"step": 27759 + }, + { + "epoch": 8.520564763658687, + "grad_norm": 0.17998449504375458, + "learning_rate": 5.633212031648199e-06, + "loss": 1.6822, + "step": 27760 + }, + { + "epoch": 8.520871700429712, + "grad_norm": 0.1301501840353012, + "learning_rate": 5.630920206939094e-06, + "loss": 1.6878, + "step": 27761 + }, + { + "epoch": 8.521178637200737, + "grad_norm": 0.16201011836528778, + "learning_rate": 5.628628820711235e-06, + "loss": 1.7581, + "step": 27762 + }, + { + "epoch": 8.521485573971761, + "grad_norm": 0.20399747788906097, + "learning_rate": 5.626337872987269e-06, + "loss": 1.7281, + "step": 27763 + }, + { + "epoch": 8.521792510742786, + "grad_norm": 0.18675439059734344, + "learning_rate": 5.624047363789858e-06, + "loss": 1.7445, + "step": 27764 + }, + { + "epoch": 8.522099447513812, + "grad_norm": 0.1858585625886917, + "learning_rate": 5.621757293141594e-06, + "loss": 1.729, + "step": 27765 + }, + { + "epoch": 8.522406384284837, + "grad_norm": 0.1731054186820984, + "learning_rate": 5.619467661065164e-06, + "loss": 1.6709, + "step": 27766 + }, + { + "epoch": 8.522713321055862, + "grad_norm": 0.2048177868127823, + "learning_rate": 5.617178467583145e-06, + "loss": 1.8187, + "step": 27767 + }, + { + "epoch": 8.523020257826888, + "grad_norm": 0.1944245547056198, + "learning_rate": 5.614889712718191e-06, + "loss": 1.7238, + "step": 27768 + }, + { + "epoch": 8.523327194597913, + "grad_norm": 0.16106872260570526, + "learning_rate": 5.612601396492906e-06, + "loss": 1.7089, + "step": 27769 + }, + { + "epoch": 8.523634131368938, + "grad_norm": 0.1933506578207016, + "learning_rate": 5.610313518929916e-06, + "loss": 1.6702, + "step": 27770 + }, + { + "epoch": 8.523941068139964, + "grad_norm": 0.14211905002593994, + "learning_rate": 5.608026080051826e-06, + "loss": 1.686, + "step": 27771 + }, + { + "epoch": 8.524248004910989, + "grad_norm": 0.1588355004787445, + "learning_rate": 5.605739079881239e-06, + "loss": 1.691, + "step": 27772 + }, + { + "epoch": 
8.524554941682014, + "grad_norm": 0.2026119977235794, + "learning_rate": 5.60345251844076e-06, + "loss": 1.7024, + "step": 27773 + }, + { + "epoch": 8.524861878453038, + "grad_norm": 0.19816550612449646, + "learning_rate": 5.601166395752988e-06, + "loss": 1.7793, + "step": 27774 + }, + { + "epoch": 8.525168815224063, + "grad_norm": 0.1687595695257187, + "learning_rate": 5.59888071184051e-06, + "loss": 1.7066, + "step": 27775 + }, + { + "epoch": 8.525475751995089, + "grad_norm": 0.1844881922006607, + "learning_rate": 5.5965954667259125e-06, + "loss": 1.7091, + "step": 27776 + }, + { + "epoch": 8.525782688766114, + "grad_norm": 0.13911494612693787, + "learning_rate": 5.5943106604317895e-06, + "loss": 1.6611, + "step": 27777 + }, + { + "epoch": 8.52608962553714, + "grad_norm": 0.215097114443779, + "learning_rate": 5.592026292980718e-06, + "loss": 1.7436, + "step": 27778 + }, + { + "epoch": 8.526396562308165, + "grad_norm": 0.19177651405334473, + "learning_rate": 5.589742364395267e-06, + "loss": 1.7198, + "step": 27779 + }, + { + "epoch": 8.52670349907919, + "grad_norm": 0.16470259428024292, + "learning_rate": 5.587458874697998e-06, + "loss": 1.7405, + "step": 27780 + }, + { + "epoch": 8.527010435850215, + "grad_norm": 0.13213464617729187, + "learning_rate": 5.585175823911515e-06, + "loss": 1.6651, + "step": 27781 + }, + { + "epoch": 8.52731737262124, + "grad_norm": 0.18105588853359222, + "learning_rate": 5.582893212058338e-06, + "loss": 1.7169, + "step": 27782 + }, + { + "epoch": 8.527624309392266, + "grad_norm": 0.19358783960342407, + "learning_rate": 5.580611039161065e-06, + "loss": 1.7165, + "step": 27783 + }, + { + "epoch": 8.527931246163291, + "grad_norm": 0.13674969971179962, + "learning_rate": 5.578329305242208e-06, + "loss": 1.7086, + "step": 27784 + }, + { + "epoch": 8.528238182934315, + "grad_norm": 0.1365654170513153, + "learning_rate": 5.5760480103243475e-06, + "loss": 1.7031, + "step": 27785 + }, + { + "epoch": 8.52854511970534, + "grad_norm": 
0.17749033868312836, + "learning_rate": 5.573767154430015e-06, + "loss": 1.7717, + "step": 27786 + }, + { + "epoch": 8.528852056476365, + "grad_norm": 0.16521626710891724, + "learning_rate": 5.5714867375817545e-06, + "loss": 1.6859, + "step": 27787 + }, + { + "epoch": 8.52915899324739, + "grad_norm": 0.14327271282672882, + "learning_rate": 5.569206759802103e-06, + "loss": 1.6996, + "step": 27788 + }, + { + "epoch": 8.529465930018416, + "grad_norm": 0.1895138919353485, + "learning_rate": 5.5669272211135934e-06, + "loss": 1.7127, + "step": 27789 + }, + { + "epoch": 8.529772866789441, + "grad_norm": 0.16256090998649597, + "learning_rate": 5.564648121538757e-06, + "loss": 1.7083, + "step": 27790 + }, + { + "epoch": 8.530079803560467, + "grad_norm": 0.18591371178627014, + "learning_rate": 5.562369461100103e-06, + "loss": 1.7852, + "step": 27791 + }, + { + "epoch": 8.530386740331492, + "grad_norm": 0.15933659672737122, + "learning_rate": 5.560091239820165e-06, + "loss": 1.69, + "step": 27792 + }, + { + "epoch": 8.530693677102517, + "grad_norm": 0.15374226868152618, + "learning_rate": 5.5578134577214505e-06, + "loss": 1.7397, + "step": 27793 + }, + { + "epoch": 8.531000613873543, + "grad_norm": 0.1786707490682602, + "learning_rate": 5.555536114826476e-06, + "loss": 1.7456, + "step": 27794 + }, + { + "epoch": 8.531307550644566, + "grad_norm": 0.16859668493270874, + "learning_rate": 5.553259211157741e-06, + "loss": 1.724, + "step": 27795 + }, + { + "epoch": 8.531614487415592, + "grad_norm": 0.21200759708881378, + "learning_rate": 5.5509827467377485e-06, + "loss": 1.7326, + "step": 27796 + }, + { + "epoch": 8.531921424186617, + "grad_norm": 0.16948217153549194, + "learning_rate": 5.548706721588986e-06, + "loss": 1.7082, + "step": 27797 + }, + { + "epoch": 8.532228360957642, + "grad_norm": 0.17014150321483612, + "learning_rate": 5.546431135733976e-06, + "loss": 1.7344, + "step": 27798 + }, + { + "epoch": 8.532535297728668, + "grad_norm": 0.20479294657707214, + 
"learning_rate": 5.544155989195171e-06, + "loss": 1.8121, + "step": 27799 + }, + { + "epoch": 8.532842234499693, + "grad_norm": 0.16958604753017426, + "learning_rate": 5.541881281995093e-06, + "loss": 1.773, + "step": 27800 + }, + { + "epoch": 8.533149171270718, + "grad_norm": 0.17606206238269806, + "learning_rate": 5.539607014156184e-06, + "loss": 1.6937, + "step": 27801 + }, + { + "epoch": 8.533456108041744, + "grad_norm": 0.1357482373714447, + "learning_rate": 5.537333185700943e-06, + "loss": 1.7234, + "step": 27802 + }, + { + "epoch": 8.533763044812769, + "grad_norm": 0.17217469215393066, + "learning_rate": 5.535059796651837e-06, + "loss": 1.722, + "step": 27803 + }, + { + "epoch": 8.534069981583794, + "grad_norm": 0.14100955426692963, + "learning_rate": 5.532786847031335e-06, + "loss": 1.6574, + "step": 27804 + }, + { + "epoch": 8.53437691835482, + "grad_norm": 0.1515544354915619, + "learning_rate": 5.530514336861897e-06, + "loss": 1.7489, + "step": 27805 + }, + { + "epoch": 8.534683855125843, + "grad_norm": 0.15518932044506073, + "learning_rate": 5.528242266165978e-06, + "loss": 1.7338, + "step": 27806 + }, + { + "epoch": 8.534990791896869, + "grad_norm": 0.15764978528022766, + "learning_rate": 5.525970634966033e-06, + "loss": 1.6971, + "step": 27807 + }, + { + "epoch": 8.535297728667894, + "grad_norm": 0.13838590681552887, + "learning_rate": 5.523699443284513e-06, + "loss": 1.723, + "step": 27808 + }, + { + "epoch": 8.53560466543892, + "grad_norm": 0.17713284492492676, + "learning_rate": 5.521428691143865e-06, + "loss": 1.7227, + "step": 27809 + }, + { + "epoch": 8.535911602209945, + "grad_norm": 0.19389420747756958, + "learning_rate": 5.51915837856653e-06, + "loss": 1.703, + "step": 27810 + }, + { + "epoch": 8.53621853898097, + "grad_norm": 0.13955099880695343, + "learning_rate": 5.516888505574941e-06, + "loss": 1.7093, + "step": 27811 + }, + { + "epoch": 8.536525475751995, + "grad_norm": 0.1319018006324768, + "learning_rate": 5.514619072191535e-06, + 
"loss": 1.7093, + "step": 27812 + }, + { + "epoch": 8.53683241252302, + "grad_norm": 0.14604489505290985, + "learning_rate": 5.512350078438733e-06, + "loss": 1.7113, + "step": 27813 + }, + { + "epoch": 8.537139349294046, + "grad_norm": 0.14439311623573303, + "learning_rate": 5.510081524338956e-06, + "loss": 1.7164, + "step": 27814 + }, + { + "epoch": 8.537446286065071, + "grad_norm": 0.17546533048152924, + "learning_rate": 5.507813409914647e-06, + "loss": 1.7432, + "step": 27815 + }, + { + "epoch": 8.537753222836095, + "grad_norm": 0.15710201859474182, + "learning_rate": 5.505545735188189e-06, + "loss": 1.7353, + "step": 27816 + }, + { + "epoch": 8.53806015960712, + "grad_norm": 0.19635994732379913, + "learning_rate": 5.503278500182019e-06, + "loss": 1.7042, + "step": 27817 + }, + { + "epoch": 8.538367096378146, + "grad_norm": 0.17653462290763855, + "learning_rate": 5.501011704918519e-06, + "loss": 1.7007, + "step": 27818 + }, + { + "epoch": 8.53867403314917, + "grad_norm": 0.1532578021287918, + "learning_rate": 5.498745349420109e-06, + "loss": 1.7111, + "step": 27819 + }, + { + "epoch": 8.538980969920196, + "grad_norm": 0.15368299186229706, + "learning_rate": 5.496479433709178e-06, + "loss": 1.7073, + "step": 27820 + }, + { + "epoch": 8.539287906691222, + "grad_norm": 0.19518911838531494, + "learning_rate": 5.494213957808126e-06, + "loss": 1.756, + "step": 27821 + }, + { + "epoch": 8.539594843462247, + "grad_norm": 0.13748668134212494, + "learning_rate": 5.4919489217393376e-06, + "loss": 1.6636, + "step": 27822 + }, + { + "epoch": 8.539901780233272, + "grad_norm": 0.2104724794626236, + "learning_rate": 5.489684325525191e-06, + "loss": 1.7734, + "step": 27823 + }, + { + "epoch": 8.540208717004298, + "grad_norm": 0.15495489537715912, + "learning_rate": 5.4874201691880786e-06, + "loss": 1.6858, + "step": 27824 + }, + { + "epoch": 8.540515653775323, + "grad_norm": 0.16447420418262482, + "learning_rate": 5.4851564527503674e-06, + "loss": 1.7053, + "step": 27825 + }, + 
{ + "epoch": 8.540822590546348, + "grad_norm": 0.1427844911813736, + "learning_rate": 5.482893176234433e-06, + "loss": 1.6885, + "step": 27826 + }, + { + "epoch": 8.541129527317374, + "grad_norm": 0.14386583864688873, + "learning_rate": 5.4806303396626344e-06, + "loss": 1.6762, + "step": 27827 + }, + { + "epoch": 8.541436464088397, + "grad_norm": 0.15933938324451447, + "learning_rate": 5.478367943057344e-06, + "loss": 1.6945, + "step": 27828 + }, + { + "epoch": 8.541743400859422, + "grad_norm": 0.3127610385417938, + "learning_rate": 5.476105986440922e-06, + "loss": 1.772, + "step": 27829 + }, + { + "epoch": 8.542050337630448, + "grad_norm": 0.168161079287529, + "learning_rate": 5.473844469835709e-06, + "loss": 1.7398, + "step": 27830 + }, + { + "epoch": 8.542357274401473, + "grad_norm": 0.17208287119865417, + "learning_rate": 5.471583393264057e-06, + "loss": 1.7345, + "step": 27831 + }, + { + "epoch": 8.542664211172498, + "grad_norm": 0.18009017407894135, + "learning_rate": 5.469322756748335e-06, + "loss": 1.7785, + "step": 27832 + }, + { + "epoch": 8.542971147943524, + "grad_norm": 0.17091695964336395, + "learning_rate": 5.467062560310843e-06, + "loss": 1.689, + "step": 27833 + }, + { + "epoch": 8.543278084714549, + "grad_norm": 0.1495637446641922, + "learning_rate": 5.4648028039739675e-06, + "loss": 1.7409, + "step": 27834 + }, + { + "epoch": 8.543585021485574, + "grad_norm": 0.19924791157245636, + "learning_rate": 5.462543487759986e-06, + "loss": 1.7136, + "step": 27835 + }, + { + "epoch": 8.5438919582566, + "grad_norm": 0.19490383565425873, + "learning_rate": 5.460284611691269e-06, + "loss": 1.7371, + "step": 27836 + }, + { + "epoch": 8.544198895027625, + "grad_norm": 0.20383320748806, + "learning_rate": 5.458026175790127e-06, + "loss": 1.7268, + "step": 27837 + }, + { + "epoch": 8.544505831798649, + "grad_norm": 0.20110821723937988, + "learning_rate": 5.455768180078869e-06, + "loss": 1.7069, + "step": 27838 + }, + { + "epoch": 8.544812768569674, + "grad_norm": 
0.16181184351444244, + "learning_rate": 5.453510624579827e-06, + "loss": 1.7158, + "step": 27839 + }, + { + "epoch": 8.5451197053407, + "grad_norm": 0.17110773921012878, + "learning_rate": 5.451253509315296e-06, + "loss": 1.6925, + "step": 27840 + }, + { + "epoch": 8.545426642111725, + "grad_norm": 0.16039033234119415, + "learning_rate": 5.448996834307591e-06, + "loss": 1.7281, + "step": 27841 + }, + { + "epoch": 8.54573357888275, + "grad_norm": 0.12631241977214813, + "learning_rate": 5.446740599579014e-06, + "loss": 1.6816, + "step": 27842 + }, + { + "epoch": 8.546040515653775, + "grad_norm": 0.20419110357761383, + "learning_rate": 5.444484805151856e-06, + "loss": 1.7594, + "step": 27843 + }, + { + "epoch": 8.5463474524248, + "grad_norm": 0.25453490018844604, + "learning_rate": 5.442229451048414e-06, + "loss": 1.7423, + "step": 27844 + }, + { + "epoch": 8.546654389195826, + "grad_norm": 0.15445558726787567, + "learning_rate": 5.439974537290982e-06, + "loss": 1.729, + "step": 27845 + }, + { + "epoch": 8.546961325966851, + "grad_norm": 0.16175805032253265, + "learning_rate": 5.43772006390183e-06, + "loss": 1.7515, + "step": 27846 + }, + { + "epoch": 8.547268262737877, + "grad_norm": 0.1958928406238556, + "learning_rate": 5.435466030903253e-06, + "loss": 1.7203, + "step": 27847 + }, + { + "epoch": 8.547575199508902, + "grad_norm": 0.17533376812934875, + "learning_rate": 5.433212438317514e-06, + "loss": 1.7393, + "step": 27848 + }, + { + "epoch": 8.547882136279926, + "grad_norm": 0.16437608003616333, + "learning_rate": 5.430959286166904e-06, + "loss": 1.7284, + "step": 27849 + }, + { + "epoch": 8.54818907305095, + "grad_norm": 0.16348768770694733, + "learning_rate": 5.428706574473663e-06, + "loss": 1.7284, + "step": 27850 + }, + { + "epoch": 8.548496009821976, + "grad_norm": 0.136602982878685, + "learning_rate": 5.426454303260081e-06, + "loss": 1.6606, + "step": 27851 + }, + { + "epoch": 8.548802946593002, + "grad_norm": 0.1359151154756546, + "learning_rate": 
5.42420247254839e-06, + "loss": 1.6989, + "step": 27852 + }, + { + "epoch": 8.549109883364027, + "grad_norm": 0.17593000829219818, + "learning_rate": 5.421951082360866e-06, + "loss": 1.7483, + "step": 27853 + }, + { + "epoch": 8.549416820135052, + "grad_norm": 0.1791890412569046, + "learning_rate": 5.419700132719746e-06, + "loss": 1.7032, + "step": 27854 + }, + { + "epoch": 8.549723756906078, + "grad_norm": 0.15925002098083496, + "learning_rate": 5.417449623647281e-06, + "loss": 1.7055, + "step": 27855 + }, + { + "epoch": 8.550030693677103, + "grad_norm": 0.16391295194625854, + "learning_rate": 5.415199555165706e-06, + "loss": 1.6555, + "step": 27856 + }, + { + "epoch": 8.550337630448128, + "grad_norm": 0.18588928878307343, + "learning_rate": 5.412949927297262e-06, + "loss": 1.6723, + "step": 27857 + }, + { + "epoch": 8.550644567219154, + "grad_norm": 0.15956605970859528, + "learning_rate": 5.410700740064184e-06, + "loss": 1.7148, + "step": 27858 + }, + { + "epoch": 8.550951503990177, + "grad_norm": 0.14419449865818024, + "learning_rate": 5.408451993488689e-06, + "loss": 1.6997, + "step": 27859 + }, + { + "epoch": 8.551258440761202, + "grad_norm": 0.18104690313339233, + "learning_rate": 5.406203687593014e-06, + "loss": 1.7121, + "step": 27860 + }, + { + "epoch": 8.551565377532228, + "grad_norm": 0.15283553302288055, + "learning_rate": 5.40395582239937e-06, + "loss": 1.6536, + "step": 27861 + }, + { + "epoch": 8.551872314303253, + "grad_norm": 0.14498579502105713, + "learning_rate": 5.401708397929972e-06, + "loss": 1.6649, + "step": 27862 + }, + { + "epoch": 8.552179251074278, + "grad_norm": 0.1828843504190445, + "learning_rate": 5.39946141420703e-06, + "loss": 1.718, + "step": 27863 + }, + { + "epoch": 8.552486187845304, + "grad_norm": 0.20626986026763916, + "learning_rate": 5.397214871252754e-06, + "loss": 1.7561, + "step": 27864 + }, + { + "epoch": 8.55279312461633, + "grad_norm": 0.16986799240112305, + "learning_rate": 5.394968769089331e-06, + "loss": 1.7386, + 
"step": 27865 + }, + { + "epoch": 8.553100061387354, + "grad_norm": 0.16921544075012207, + "learning_rate": 5.392723107738995e-06, + "loss": 1.6939, + "step": 27866 + }, + { + "epoch": 8.55340699815838, + "grad_norm": 0.19882866740226746, + "learning_rate": 5.390477887223888e-06, + "loss": 1.7376, + "step": 27867 + }, + { + "epoch": 8.553713934929405, + "grad_norm": 0.17440463602542877, + "learning_rate": 5.3882331075662486e-06, + "loss": 1.7142, + "step": 27868 + }, + { + "epoch": 8.55402087170043, + "grad_norm": 0.1494864523410797, + "learning_rate": 5.38598876878822e-06, + "loss": 1.6953, + "step": 27869 + }, + { + "epoch": 8.554327808471456, + "grad_norm": 0.18791508674621582, + "learning_rate": 5.383744870912006e-06, + "loss": 1.7863, + "step": 27870 + }, + { + "epoch": 8.55463474524248, + "grad_norm": 0.19124576449394226, + "learning_rate": 5.381501413959777e-06, + "loss": 1.6668, + "step": 27871 + }, + { + "epoch": 8.554941682013505, + "grad_norm": 0.17011114954948425, + "learning_rate": 5.3792583979537016e-06, + "loss": 1.7356, + "step": 27872 + }, + { + "epoch": 8.55524861878453, + "grad_norm": 0.1780267208814621, + "learning_rate": 5.377015822915949e-06, + "loss": 1.7428, + "step": 27873 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 0.18539096415042877, + "learning_rate": 5.374773688868678e-06, + "loss": 1.7534, + "step": 27874 + }, + { + "epoch": 8.55586249232658, + "grad_norm": 0.1668393909931183, + "learning_rate": 5.372531995834051e-06, + "loss": 1.6884, + "step": 27875 + }, + { + "epoch": 8.556169429097606, + "grad_norm": 0.15957699716091156, + "learning_rate": 5.3702907438342165e-06, + "loss": 1.6739, + "step": 27876 + }, + { + "epoch": 8.556476365868631, + "grad_norm": 0.17210347950458527, + "learning_rate": 5.368049932891334e-06, + "loss": 1.7062, + "step": 27877 + }, + { + "epoch": 8.556783302639657, + "grad_norm": 0.1614166796207428, + "learning_rate": 5.365809563027535e-06, + "loss": 1.675, + "step": 27878 + }, + { + "epoch": 
8.557090239410682, + "grad_norm": 0.17495310306549072, + "learning_rate": 5.36356963426497e-06, + "loss": 1.7694, + "step": 27879 + }, + { + "epoch": 8.557397176181707, + "grad_norm": 0.1660371571779251, + "learning_rate": 5.361330146625771e-06, + "loss": 1.6573, + "step": 27880 + }, + { + "epoch": 8.557704112952731, + "grad_norm": 0.1997743546962738, + "learning_rate": 5.359091100132074e-06, + "loss": 1.7006, + "step": 27881 + }, + { + "epoch": 8.558011049723756, + "grad_norm": 0.21383358538150787, + "learning_rate": 5.356852494805992e-06, + "loss": 1.7677, + "step": 27882 + }, + { + "epoch": 8.558317986494782, + "grad_norm": 0.15339766442775726, + "learning_rate": 5.354614330669677e-06, + "loss": 1.6852, + "step": 27883 + }, + { + "epoch": 8.558624923265807, + "grad_norm": 0.16808396577835083, + "learning_rate": 5.352376607745213e-06, + "loss": 1.7046, + "step": 27884 + }, + { + "epoch": 8.558931860036832, + "grad_norm": 0.19627085328102112, + "learning_rate": 5.350139326054748e-06, + "loss": 1.7255, + "step": 27885 + }, + { + "epoch": 8.559238796807858, + "grad_norm": 0.16882671415805817, + "learning_rate": 5.347902485620365e-06, + "loss": 1.6823, + "step": 27886 + }, + { + "epoch": 8.559545733578883, + "grad_norm": 0.19045037031173706, + "learning_rate": 5.3456660864641846e-06, + "loss": 1.7901, + "step": 27887 + }, + { + "epoch": 8.559852670349908, + "grad_norm": 0.16998142004013062, + "learning_rate": 5.3434301286083064e-06, + "loss": 1.7226, + "step": 27888 + }, + { + "epoch": 8.560159607120934, + "grad_norm": 0.16370677947998047, + "learning_rate": 5.341194612074824e-06, + "loss": 1.7151, + "step": 27889 + }, + { + "epoch": 8.560466543891959, + "grad_norm": 0.16379667818546295, + "learning_rate": 5.3389595368858345e-06, + "loss": 1.6742, + "step": 27890 + }, + { + "epoch": 8.560773480662984, + "grad_norm": 0.1741562932729721, + "learning_rate": 5.336724903063423e-06, + "loss": 1.7162, + "step": 27891 + }, + { + "epoch": 8.561080417434008, + "grad_norm": 
0.17712807655334473, + "learning_rate": 5.334490710629675e-06, + "loss": 1.71, + "step": 27892 + }, + { + "epoch": 8.561387354205033, + "grad_norm": 0.16719931364059448, + "learning_rate": 5.332256959606669e-06, + "loss": 1.7299, + "step": 27893 + }, + { + "epoch": 8.561694290976058, + "grad_norm": 0.3024488389492035, + "learning_rate": 5.330023650016475e-06, + "loss": 1.7435, + "step": 27894 + }, + { + "epoch": 8.562001227747084, + "grad_norm": 0.13923676311969757, + "learning_rate": 5.3277907818811755e-06, + "loss": 1.6856, + "step": 27895 + }, + { + "epoch": 8.56230816451811, + "grad_norm": 0.1582731008529663, + "learning_rate": 5.325558355222826e-06, + "loss": 1.7057, + "step": 27896 + }, + { + "epoch": 8.562615101289135, + "grad_norm": 0.17576326429843903, + "learning_rate": 5.323326370063497e-06, + "loss": 1.7439, + "step": 27897 + }, + { + "epoch": 8.56292203806016, + "grad_norm": 0.16990134119987488, + "learning_rate": 5.321094826425238e-06, + "loss": 1.7366, + "step": 27898 + }, + { + "epoch": 8.563228974831185, + "grad_norm": 0.14154621958732605, + "learning_rate": 5.318863724330114e-06, + "loss": 1.6824, + "step": 27899 + }, + { + "epoch": 8.56353591160221, + "grad_norm": 0.1460665911436081, + "learning_rate": 5.3166330638001635e-06, + "loss": 1.729, + "step": 27900 + }, + { + "epoch": 8.563842848373236, + "grad_norm": 0.14366431534290314, + "learning_rate": 5.314402844857424e-06, + "loss": 1.704, + "step": 27901 + }, + { + "epoch": 8.56414978514426, + "grad_norm": 0.15405386686325073, + "learning_rate": 5.312173067523968e-06, + "loss": 1.7357, + "step": 27902 + }, + { + "epoch": 8.564456721915285, + "grad_norm": 0.12789638340473175, + "learning_rate": 5.309943731821787e-06, + "loss": 1.634, + "step": 27903 + }, + { + "epoch": 8.56476365868631, + "grad_norm": 0.17007184028625488, + "learning_rate": 5.307714837772948e-06, + "loss": 1.7065, + "step": 27904 + }, + { + "epoch": 8.565070595457335, + "grad_norm": 0.1982787400484085, + "learning_rate": 
5.305486385399466e-06, + "loss": 1.7459, + "step": 27905 + }, + { + "epoch": 8.56537753222836, + "grad_norm": 0.18433566391468048, + "learning_rate": 5.303258374723363e-06, + "loss": 1.7414, + "step": 27906 + }, + { + "epoch": 8.565684468999386, + "grad_norm": 0.13842104375362396, + "learning_rate": 5.30103080576666e-06, + "loss": 1.6988, + "step": 27907 + }, + { + "epoch": 8.565991405770411, + "grad_norm": 0.14736461639404297, + "learning_rate": 5.298803678551373e-06, + "loss": 1.6828, + "step": 27908 + }, + { + "epoch": 8.566298342541437, + "grad_norm": 0.14953723549842834, + "learning_rate": 5.2965769930995e-06, + "loss": 1.6896, + "step": 27909 + }, + { + "epoch": 8.566605279312462, + "grad_norm": 0.15445443987846375, + "learning_rate": 5.294350749433058e-06, + "loss": 1.7096, + "step": 27910 + }, + { + "epoch": 8.566912216083487, + "grad_norm": 0.180703803896904, + "learning_rate": 5.292124947574045e-06, + "loss": 1.7191, + "step": 27911 + }, + { + "epoch": 8.567219152854513, + "grad_norm": 0.13825593888759613, + "learning_rate": 5.289899587544461e-06, + "loss": 1.6928, + "step": 27912 + }, + { + "epoch": 8.567526089625538, + "grad_norm": 0.15663209557533264, + "learning_rate": 5.287674669366294e-06, + "loss": 1.7004, + "step": 27913 + }, + { + "epoch": 8.567833026396562, + "grad_norm": 0.14148147404193878, + "learning_rate": 5.285450193061526e-06, + "loss": 1.6961, + "step": 27914 + }, + { + "epoch": 8.568139963167587, + "grad_norm": 0.12393147498369217, + "learning_rate": 5.283226158652155e-06, + "loss": 1.6515, + "step": 27915 + }, + { + "epoch": 8.568446899938612, + "grad_norm": 0.1855689138174057, + "learning_rate": 5.281002566160148e-06, + "loss": 1.8017, + "step": 27916 + }, + { + "epoch": 8.568753836709638, + "grad_norm": 0.1665579080581665, + "learning_rate": 5.2787794156074824e-06, + "loss": 1.6935, + "step": 27917 + }, + { + "epoch": 8.569060773480663, + "grad_norm": 0.1853685826063156, + "learning_rate": 5.276556707016123e-06, + "loss": 1.7504, + 
"step": 27918 + }, + { + "epoch": 8.569367710251688, + "grad_norm": 0.16065651178359985, + "learning_rate": 5.274334440408063e-06, + "loss": 1.7549, + "step": 27919 + }, + { + "epoch": 8.569674647022714, + "grad_norm": 0.1630239635705948, + "learning_rate": 5.272112615805225e-06, + "loss": 1.7404, + "step": 27920 + }, + { + "epoch": 8.569981583793739, + "grad_norm": 0.1681451052427292, + "learning_rate": 5.269891233229607e-06, + "loss": 1.704, + "step": 27921 + }, + { + "epoch": 8.570288520564764, + "grad_norm": 0.14546994864940643, + "learning_rate": 5.267670292703119e-06, + "loss": 1.6656, + "step": 27922 + }, + { + "epoch": 8.57059545733579, + "grad_norm": 0.1499837189912796, + "learning_rate": 5.265449794247746e-06, + "loss": 1.6908, + "step": 27923 + }, + { + "epoch": 8.570902394106813, + "grad_norm": 0.14691168069839478, + "learning_rate": 5.263229737885417e-06, + "loss": 1.6887, + "step": 27924 + }, + { + "epoch": 8.571209330877839, + "grad_norm": 0.16261856257915497, + "learning_rate": 5.261010123638066e-06, + "loss": 1.6981, + "step": 27925 + }, + { + "epoch": 8.571516267648864, + "grad_norm": 0.1549815535545349, + "learning_rate": 5.2587909515276425e-06, + "loss": 1.6971, + "step": 27926 + }, + { + "epoch": 8.57182320441989, + "grad_norm": 0.15067234635353088, + "learning_rate": 5.256572221576067e-06, + "loss": 1.7101, + "step": 27927 + }, + { + "epoch": 8.572130141190915, + "grad_norm": 0.13761483132839203, + "learning_rate": 5.254353933805273e-06, + "loss": 1.6657, + "step": 27928 + }, + { + "epoch": 8.57243707796194, + "grad_norm": 0.1590275913476944, + "learning_rate": 5.252136088237175e-06, + "loss": 1.6776, + "step": 27929 + }, + { + "epoch": 8.572744014732965, + "grad_norm": 0.1633618026971817, + "learning_rate": 5.249918684893695e-06, + "loss": 1.724, + "step": 27930 + }, + { + "epoch": 8.57305095150399, + "grad_norm": 0.2603756785392761, + "learning_rate": 5.247701723796755e-06, + "loss": 1.7071, + "step": 27931 + }, + { + "epoch": 
8.573357888275016, + "grad_norm": 0.21079567074775696, + "learning_rate": 5.245485204968248e-06, + "loss": 1.7983, + "step": 27932 + }, + { + "epoch": 8.573664825046041, + "grad_norm": 0.15369223058223724, + "learning_rate": 5.243269128430095e-06, + "loss": 1.7566, + "step": 27933 + }, + { + "epoch": 8.573971761817067, + "grad_norm": 0.19392070174217224, + "learning_rate": 5.241053494204185e-06, + "loss": 1.7287, + "step": 27934 + }, + { + "epoch": 8.57427869858809, + "grad_norm": 0.16017836332321167, + "learning_rate": 5.23883830231241e-06, + "loss": 1.6909, + "step": 27935 + }, + { + "epoch": 8.574585635359115, + "grad_norm": 0.1943294107913971, + "learning_rate": 5.2366235527766876e-06, + "loss": 1.7844, + "step": 27936 + }, + { + "epoch": 8.57489257213014, + "grad_norm": 0.17875424027442932, + "learning_rate": 5.234409245618871e-06, + "loss": 1.7385, + "step": 27937 + }, + { + "epoch": 8.575199508901166, + "grad_norm": 0.1900254637002945, + "learning_rate": 5.232195380860877e-06, + "loss": 1.7303, + "step": 27938 + }, + { + "epoch": 8.575506445672191, + "grad_norm": 0.13633303344249725, + "learning_rate": 5.229981958524549e-06, + "loss": 1.6949, + "step": 27939 + }, + { + "epoch": 8.575813382443217, + "grad_norm": 0.18683885037899017, + "learning_rate": 5.227768978631792e-06, + "loss": 1.7366, + "step": 27940 + }, + { + "epoch": 8.576120319214242, + "grad_norm": 0.15012286603450775, + "learning_rate": 5.2255564412044656e-06, + "loss": 1.71, + "step": 27941 + }, + { + "epoch": 8.576427255985267, + "grad_norm": 0.14521601796150208, + "learning_rate": 5.22334434626443e-06, + "loss": 1.724, + "step": 27942 + }, + { + "epoch": 8.576734192756293, + "grad_norm": 0.1809433549642563, + "learning_rate": 5.221132693833547e-06, + "loss": 1.7851, + "step": 27943 + }, + { + "epoch": 8.577041129527318, + "grad_norm": 0.1676371693611145, + "learning_rate": 5.218921483933681e-06, + "loss": 1.7542, + "step": 27944 + }, + { + "epoch": 8.577348066298342, + "grad_norm": 
0.16963952779769897, + "learning_rate": 5.216710716586676e-06, + "loss": 1.767, + "step": 27945 + }, + { + "epoch": 8.577655003069367, + "grad_norm": 0.18276773393154144, + "learning_rate": 5.214500391814387e-06, + "loss": 1.662, + "step": 27946 + }, + { + "epoch": 8.577961939840392, + "grad_norm": 0.16285058856010437, + "learning_rate": 5.212290509638656e-06, + "loss": 1.6853, + "step": 27947 + }, + { + "epoch": 8.578268876611418, + "grad_norm": 0.18186792731285095, + "learning_rate": 5.210081070081318e-06, + "loss": 1.7408, + "step": 27948 + }, + { + "epoch": 8.578575813382443, + "grad_norm": 0.15637101233005524, + "learning_rate": 5.207872073164216e-06, + "loss": 1.7026, + "step": 27949 + }, + { + "epoch": 8.578882750153468, + "grad_norm": 0.16442300379276276, + "learning_rate": 5.2056635189091704e-06, + "loss": 1.7136, + "step": 27950 + }, + { + "epoch": 8.579189686924494, + "grad_norm": 0.18907669186592102, + "learning_rate": 5.203455407338015e-06, + "loss": 1.7706, + "step": 27951 + }, + { + "epoch": 8.579496623695519, + "grad_norm": 0.17700283229351044, + "learning_rate": 5.201247738472559e-06, + "loss": 1.7104, + "step": 27952 + }, + { + "epoch": 8.579803560466544, + "grad_norm": 0.19882333278656006, + "learning_rate": 5.199040512334647e-06, + "loss": 1.7692, + "step": 27953 + }, + { + "epoch": 8.58011049723757, + "grad_norm": 0.14343376457691193, + "learning_rate": 5.19683372894606e-06, + "loss": 1.6775, + "step": 27954 + }, + { + "epoch": 8.580417434008595, + "grad_norm": 0.13688595592975616, + "learning_rate": 5.194627388328638e-06, + "loss": 1.6787, + "step": 27955 + }, + { + "epoch": 8.580724370779619, + "grad_norm": 0.15786845982074738, + "learning_rate": 5.192421490504157e-06, + "loss": 1.7218, + "step": 27956 + }, + { + "epoch": 8.581031307550644, + "grad_norm": 0.3297908902168274, + "learning_rate": 5.190216035494433e-06, + "loss": 1.7533, + "step": 27957 + }, + { + "epoch": 8.58133824432167, + "grad_norm": 0.16763067245483398, + "learning_rate": 
5.18801102332126e-06, + "loss": 1.7278, + "step": 27958 + }, + { + "epoch": 8.581645181092695, + "grad_norm": 0.18505536019802094, + "learning_rate": 5.185806454006426e-06, + "loss": 1.7291, + "step": 27959 + }, + { + "epoch": 8.58195211786372, + "grad_norm": 0.1536751091480255, + "learning_rate": 5.183602327571718e-06, + "loss": 1.7014, + "step": 27960 + }, + { + "epoch": 8.582259054634745, + "grad_norm": 0.2561737596988678, + "learning_rate": 5.181398644038921e-06, + "loss": 1.8127, + "step": 27961 + }, + { + "epoch": 8.58256599140577, + "grad_norm": 0.15304888784885406, + "learning_rate": 5.17919540342981e-06, + "loss": 1.7001, + "step": 27962 + }, + { + "epoch": 8.582872928176796, + "grad_norm": 0.16688644886016846, + "learning_rate": 5.176992605766162e-06, + "loss": 1.7398, + "step": 27963 + }, + { + "epoch": 8.583179864947821, + "grad_norm": 0.1351930946111679, + "learning_rate": 5.174790251069744e-06, + "loss": 1.6947, + "step": 27964 + }, + { + "epoch": 8.583486801718847, + "grad_norm": 0.23985813558101654, + "learning_rate": 5.172588339362322e-06, + "loss": 1.7495, + "step": 27965 + }, + { + "epoch": 8.58379373848987, + "grad_norm": 0.17094407975673676, + "learning_rate": 5.170386870665656e-06, + "loss": 1.74, + "step": 27966 + }, + { + "epoch": 8.584100675260895, + "grad_norm": 0.17786560952663422, + "learning_rate": 5.168185845001505e-06, + "loss": 1.7438, + "step": 27967 + }, + { + "epoch": 8.58440761203192, + "grad_norm": 0.16682226955890656, + "learning_rate": 5.165985262391615e-06, + "loss": 1.7193, + "step": 27968 + }, + { + "epoch": 8.584714548802946, + "grad_norm": 0.17371125519275665, + "learning_rate": 5.163785122857728e-06, + "loss": 1.677, + "step": 27969 + }, + { + "epoch": 8.585021485573971, + "grad_norm": 0.16753411293029785, + "learning_rate": 5.161585426421617e-06, + "loss": 1.6558, + "step": 27970 + }, + { + "epoch": 8.585328422344997, + "grad_norm": 0.14469672739505768, + "learning_rate": 5.159386173104979e-06, + "loss": 1.7, + "step": 
27971 + }, + { + "epoch": 8.585635359116022, + "grad_norm": 0.14450986683368683, + "learning_rate": 5.157187362929583e-06, + "loss": 1.6843, + "step": 27972 + }, + { + "epoch": 8.585942295887047, + "grad_norm": 0.15462568402290344, + "learning_rate": 5.1549889959171315e-06, + "loss": 1.7028, + "step": 27973 + }, + { + "epoch": 8.586249232658073, + "grad_norm": 0.19757840037345886, + "learning_rate": 5.1527910720893694e-06, + "loss": 1.7578, + "step": 27974 + }, + { + "epoch": 8.586556169429098, + "grad_norm": 0.16309098899364471, + "learning_rate": 5.150593591468017e-06, + "loss": 1.6736, + "step": 27975 + }, + { + "epoch": 8.586863106200123, + "grad_norm": 0.20989231765270233, + "learning_rate": 5.14839655407478e-06, + "loss": 1.7418, + "step": 27976 + }, + { + "epoch": 8.587170042971149, + "grad_norm": 0.14988306164741516, + "learning_rate": 5.14619995993138e-06, + "loss": 1.6834, + "step": 27977 + }, + { + "epoch": 8.587476979742172, + "grad_norm": 0.1826607882976532, + "learning_rate": 5.144003809059522e-06, + "loss": 1.7598, + "step": 27978 + }, + { + "epoch": 8.587783916513198, + "grad_norm": 0.16675019264221191, + "learning_rate": 5.141808101480905e-06, + "loss": 1.7388, + "step": 27979 + }, + { + "epoch": 8.588090853284223, + "grad_norm": 0.17474086582660675, + "learning_rate": 5.139612837217233e-06, + "loss": 1.6897, + "step": 27980 + }, + { + "epoch": 8.588397790055248, + "grad_norm": 0.15096940100193024, + "learning_rate": 5.137418016290207e-06, + "loss": 1.6959, + "step": 27981 + }, + { + "epoch": 8.588704726826274, + "grad_norm": 0.13225309550762177, + "learning_rate": 5.1352236387215035e-06, + "loss": 1.6946, + "step": 27982 + }, + { + "epoch": 8.589011663597299, + "grad_norm": 0.13731913268566132, + "learning_rate": 5.133029704532821e-06, + "loss": 1.7076, + "step": 27983 + }, + { + "epoch": 8.589318600368324, + "grad_norm": 0.1227266862988472, + "learning_rate": 5.130836213745832e-06, + "loss": 1.6966, + "step": 27984 + }, + { + "epoch": 
8.58962553713935, + "grad_norm": 0.16979724168777466, + "learning_rate": 5.128643166382224e-06, + "loss": 1.7365, + "step": 27985 + }, + { + "epoch": 8.589932473910375, + "grad_norm": 0.13253070414066315, + "learning_rate": 5.126450562463653e-06, + "loss": 1.6748, + "step": 27986 + }, + { + "epoch": 8.5902394106814, + "grad_norm": 0.13287228345870972, + "learning_rate": 5.124258402011817e-06, + "loss": 1.666, + "step": 27987 + }, + { + "epoch": 8.590546347452424, + "grad_norm": 0.1884436458349228, + "learning_rate": 5.122066685048338e-06, + "loss": 1.6974, + "step": 27988 + }, + { + "epoch": 8.59085328422345, + "grad_norm": 0.17336542904376984, + "learning_rate": 5.119875411594927e-06, + "loss": 1.6884, + "step": 27989 + }, + { + "epoch": 8.591160220994475, + "grad_norm": 0.19136151671409607, + "learning_rate": 5.117684581673188e-06, + "loss": 1.6976, + "step": 27990 + }, + { + "epoch": 8.5914671577655, + "grad_norm": 0.18627271056175232, + "learning_rate": 5.115494195304804e-06, + "loss": 1.7255, + "step": 27991 + }, + { + "epoch": 8.591774094536525, + "grad_norm": 0.1341535747051239, + "learning_rate": 5.1133042525114194e-06, + "loss": 1.661, + "step": 27992 + }, + { + "epoch": 8.59208103130755, + "grad_norm": 0.172500878572464, + "learning_rate": 5.1111147533146665e-06, + "loss": 1.7408, + "step": 27993 + }, + { + "epoch": 8.592387968078576, + "grad_norm": 0.14429397881031036, + "learning_rate": 5.108925697736188e-06, + "loss": 1.7025, + "step": 27994 + }, + { + "epoch": 8.592694904849601, + "grad_norm": 0.16930191218852997, + "learning_rate": 5.106737085797625e-06, + "loss": 1.7451, + "step": 27995 + }, + { + "epoch": 8.593001841620627, + "grad_norm": 0.17311960458755493, + "learning_rate": 5.104548917520591e-06, + "loss": 1.7077, + "step": 27996 + }, + { + "epoch": 8.593308778391652, + "grad_norm": 0.17147377133369446, + "learning_rate": 5.102361192926719e-06, + "loss": 1.701, + "step": 27997 + }, + { + "epoch": 8.593615715162677, + "grad_norm": 
0.16215240955352783, + "learning_rate": 5.100173912037631e-06, + "loss": 1.6896, + "step": 27998 + }, + { + "epoch": 8.5939226519337, + "grad_norm": 0.1764577031135559, + "learning_rate": 5.097987074874944e-06, + "loss": 1.6895, + "step": 27999 + }, + { + "epoch": 8.594229588704726, + "grad_norm": 0.1574433147907257, + "learning_rate": 5.095800681460261e-06, + "loss": 1.7219, + "step": 28000 + }, + { + "epoch": 8.594536525475752, + "grad_norm": 0.1465912163257599, + "learning_rate": 5.0936147318152e-06, + "loss": 1.7077, + "step": 28001 + }, + { + "epoch": 8.594843462246777, + "grad_norm": 0.2024395614862442, + "learning_rate": 5.0914292259613524e-06, + "loss": 1.7956, + "step": 28002 + }, + { + "epoch": 8.595150399017802, + "grad_norm": 0.16168762743473053, + "learning_rate": 5.0892441639203205e-06, + "loss": 1.7311, + "step": 28003 + }, + { + "epoch": 8.595457335788828, + "grad_norm": 0.1713251769542694, + "learning_rate": 5.0870595457137185e-06, + "loss": 1.7123, + "step": 28004 + }, + { + "epoch": 8.595764272559853, + "grad_norm": 0.22206412255764008, + "learning_rate": 5.084875371363096e-06, + "loss": 1.7057, + "step": 28005 + }, + { + "epoch": 8.596071209330878, + "grad_norm": 0.14937512576580048, + "learning_rate": 5.082691640890081e-06, + "loss": 1.7231, + "step": 28006 + }, + { + "epoch": 8.596378146101904, + "grad_norm": 0.22501800954341888, + "learning_rate": 5.0805083543162155e-06, + "loss": 1.7729, + "step": 28007 + }, + { + "epoch": 8.596685082872929, + "grad_norm": 0.150779128074646, + "learning_rate": 5.0783255116631015e-06, + "loss": 1.6887, + "step": 28008 + }, + { + "epoch": 8.596992019643952, + "grad_norm": 0.1489362120628357, + "learning_rate": 5.076143112952308e-06, + "loss": 1.6774, + "step": 28009 + }, + { + "epoch": 8.597298956414978, + "grad_norm": 0.17022615671157837, + "learning_rate": 5.073961158205398e-06, + "loss": 1.6974, + "step": 28010 + }, + { + "epoch": 8.597605893186003, + "grad_norm": 0.16300532221794128, + "learning_rate": 
5.071779647443931e-06, + "loss": 1.7194, + "step": 28011 + }, + { + "epoch": 8.597912829957028, + "grad_norm": 0.14973211288452148, + "learning_rate": 5.069598580689477e-06, + "loss": 1.7238, + "step": 28012 + }, + { + "epoch": 8.598219766728054, + "grad_norm": 0.1345965713262558, + "learning_rate": 5.067417957963583e-06, + "loss": 1.6372, + "step": 28013 + }, + { + "epoch": 8.598526703499079, + "grad_norm": 0.18125082552433014, + "learning_rate": 5.065237779287802e-06, + "loss": 1.7174, + "step": 28014 + }, + { + "epoch": 8.598833640270104, + "grad_norm": 0.1619734913110733, + "learning_rate": 5.063058044683671e-06, + "loss": 1.6951, + "step": 28015 + }, + { + "epoch": 8.59914057704113, + "grad_norm": 0.14732249081134796, + "learning_rate": 5.060878754172749e-06, + "loss": 1.7291, + "step": 28016 + }, + { + "epoch": 8.599447513812155, + "grad_norm": 0.14982318878173828, + "learning_rate": 5.058699907776554e-06, + "loss": 1.6962, + "step": 28017 + }, + { + "epoch": 8.59975445058318, + "grad_norm": 0.15376806259155273, + "learning_rate": 5.056521505516632e-06, + "loss": 1.6867, + "step": 28018 + }, + { + "epoch": 8.600061387354206, + "grad_norm": 0.1546332985162735, + "learning_rate": 5.054343547414509e-06, + "loss": 1.7219, + "step": 28019 + }, + { + "epoch": 8.600368324125231, + "grad_norm": 0.17485050857067108, + "learning_rate": 5.0521660334916895e-06, + "loss": 1.7266, + "step": 28020 + }, + { + "epoch": 8.600675260896255, + "grad_norm": 0.15625739097595215, + "learning_rate": 5.049988963769736e-06, + "loss": 1.7328, + "step": 28021 + }, + { + "epoch": 8.60098219766728, + "grad_norm": 0.26432421803474426, + "learning_rate": 5.0478123382701136e-06, + "loss": 1.7452, + "step": 28022 + }, + { + "epoch": 8.601289134438305, + "grad_norm": 0.16437242925167084, + "learning_rate": 5.045636157014377e-06, + "loss": 1.6945, + "step": 28023 + }, + { + "epoch": 8.60159607120933, + "grad_norm": 0.17274139821529388, + "learning_rate": 5.043460420023999e-06, + "loss": 1.6952, 
+ "step": 28024 + }, + { + "epoch": 8.601903007980356, + "grad_norm": 0.2380651980638504, + "learning_rate": 5.0412851273205e-06, + "loss": 1.7412, + "step": 28025 + }, + { + "epoch": 8.602209944751381, + "grad_norm": 0.1543026566505432, + "learning_rate": 5.039110278925374e-06, + "loss": 1.7063, + "step": 28026 + }, + { + "epoch": 8.602516881522407, + "grad_norm": 0.15819939970970154, + "learning_rate": 5.036935874860111e-06, + "loss": 1.703, + "step": 28027 + }, + { + "epoch": 8.602823818293432, + "grad_norm": 0.20054341852664948, + "learning_rate": 5.034761915146208e-06, + "loss": 1.741, + "step": 28028 + }, + { + "epoch": 8.603130755064457, + "grad_norm": 0.1404278427362442, + "learning_rate": 5.032588399805127e-06, + "loss": 1.6822, + "step": 28029 + }, + { + "epoch": 8.603437691835483, + "grad_norm": 0.1339765340089798, + "learning_rate": 5.030415328858374e-06, + "loss": 1.6741, + "step": 28030 + }, + { + "epoch": 8.603744628606506, + "grad_norm": 0.17520250380039215, + "learning_rate": 5.028242702327413e-06, + "loss": 1.7655, + "step": 28031 + }, + { + "epoch": 8.604051565377532, + "grad_norm": 0.1701551079750061, + "learning_rate": 5.0260705202337165e-06, + "loss": 1.7219, + "step": 28032 + }, + { + "epoch": 8.604358502148557, + "grad_norm": 0.1882735937833786, + "learning_rate": 5.023898782598752e-06, + "loss": 1.7482, + "step": 28033 + }, + { + "epoch": 8.604665438919582, + "grad_norm": 0.1356845200061798, + "learning_rate": 5.021727489443984e-06, + "loss": 1.6647, + "step": 28034 + }, + { + "epoch": 8.604972375690608, + "grad_norm": 0.1686328649520874, + "learning_rate": 5.019556640790862e-06, + "loss": 1.7454, + "step": 28035 + }, + { + "epoch": 8.605279312461633, + "grad_norm": 0.16747170686721802, + "learning_rate": 5.017386236660848e-06, + "loss": 1.6747, + "step": 28036 + }, + { + "epoch": 8.605586249232658, + "grad_norm": 0.18954692780971527, + "learning_rate": 5.0152162770753795e-06, + "loss": 1.7351, + "step": 28037 + }, + { + "epoch": 
8.605893186003684, + "grad_norm": 0.19075840711593628, + "learning_rate": 5.013046762055929e-06, + "loss": 1.8257, + "step": 28038 + }, + { + "epoch": 8.606200122774709, + "grad_norm": 0.22513258457183838, + "learning_rate": 5.010877691623894e-06, + "loss": 1.7548, + "step": 28039 + }, + { + "epoch": 8.606507059545734, + "grad_norm": 0.15815886855125427, + "learning_rate": 5.00870906580076e-06, + "loss": 1.6793, + "step": 28040 + }, + { + "epoch": 8.60681399631676, + "grad_norm": 0.15267199277877808, + "learning_rate": 5.006540884607913e-06, + "loss": 1.6703, + "step": 28041 + }, + { + "epoch": 8.607120933087783, + "grad_norm": 0.14877180755138397, + "learning_rate": 5.00437314806681e-06, + "loss": 1.6859, + "step": 28042 + }, + { + "epoch": 8.607427869858808, + "grad_norm": 0.18780232965946198, + "learning_rate": 5.002205856198861e-06, + "loss": 1.7205, + "step": 28043 + }, + { + "epoch": 8.607734806629834, + "grad_norm": 0.1645117998123169, + "learning_rate": 5.000039009025492e-06, + "loss": 1.7726, + "step": 28044 + }, + { + "epoch": 8.60804174340086, + "grad_norm": 0.1449744552373886, + "learning_rate": 4.997872606568116e-06, + "loss": 1.6704, + "step": 28045 + }, + { + "epoch": 8.608348680171884, + "grad_norm": 0.15839919447898865, + "learning_rate": 4.9957066488481255e-06, + "loss": 1.6844, + "step": 28046 + }, + { + "epoch": 8.60865561694291, + "grad_norm": 0.16456182301044464, + "learning_rate": 4.993541135886948e-06, + "loss": 1.7141, + "step": 28047 + }, + { + "epoch": 8.608962553713935, + "grad_norm": 0.154433935880661, + "learning_rate": 4.991376067705977e-06, + "loss": 1.7077, + "step": 28048 + }, + { + "epoch": 8.60926949048496, + "grad_norm": 0.13631665706634521, + "learning_rate": 4.989211444326608e-06, + "loss": 1.6819, + "step": 28049 + }, + { + "epoch": 8.609576427255986, + "grad_norm": 0.13026617467403412, + "learning_rate": 4.987047265770234e-06, + "loss": 1.6929, + "step": 28050 + }, + { + "epoch": 8.609883364027011, + "grad_norm": 
0.1359538435935974, + "learning_rate": 4.984883532058243e-06, + "loss": 1.6534, + "step": 28051 + }, + { + "epoch": 8.610190300798035, + "grad_norm": 0.13192327320575714, + "learning_rate": 4.982720243212014e-06, + "loss": 1.694, + "step": 28052 + }, + { + "epoch": 8.61049723756906, + "grad_norm": 0.17191945016384125, + "learning_rate": 4.980557399252928e-06, + "loss": 1.7402, + "step": 28053 + }, + { + "epoch": 8.610804174340085, + "grad_norm": 0.12728241086006165, + "learning_rate": 4.978395000202363e-06, + "loss": 1.7231, + "step": 28054 + }, + { + "epoch": 8.61111111111111, + "grad_norm": 0.15232713520526886, + "learning_rate": 4.976233046081685e-06, + "loss": 1.6805, + "step": 28055 + }, + { + "epoch": 8.611418047882136, + "grad_norm": 0.13869190216064453, + "learning_rate": 4.974071536912256e-06, + "loss": 1.6771, + "step": 28056 + }, + { + "epoch": 8.611724984653161, + "grad_norm": 0.16099198162555695, + "learning_rate": 4.971910472715458e-06, + "loss": 1.6853, + "step": 28057 + }, + { + "epoch": 8.612031921424187, + "grad_norm": 0.147923544049263, + "learning_rate": 4.969749853512612e-06, + "loss": 1.7173, + "step": 28058 + }, + { + "epoch": 8.612338858195212, + "grad_norm": 0.16606341302394867, + "learning_rate": 4.967589679325102e-06, + "loss": 1.7262, + "step": 28059 + }, + { + "epoch": 8.612645794966237, + "grad_norm": 0.12743404507637024, + "learning_rate": 4.965429950174266e-06, + "loss": 1.6612, + "step": 28060 + }, + { + "epoch": 8.612952731737263, + "grad_norm": 0.12468522787094116, + "learning_rate": 4.9632706660814436e-06, + "loss": 1.6835, + "step": 28061 + }, + { + "epoch": 8.613259668508288, + "grad_norm": 0.16881446540355682, + "learning_rate": 4.9611118270679935e-06, + "loss": 1.7433, + "step": 28062 + }, + { + "epoch": 8.613566605279313, + "grad_norm": 0.2030627429485321, + "learning_rate": 4.958953433155211e-06, + "loss": 1.7739, + "step": 28063 + }, + { + "epoch": 8.613873542050337, + "grad_norm": 0.18076404929161072, + "learning_rate": 
4.956795484364457e-06, + "loss": 1.7316, + "step": 28064 + }, + { + "epoch": 8.614180478821362, + "grad_norm": 0.12519899010658264, + "learning_rate": 4.954637980717058e-06, + "loss": 1.6686, + "step": 28065 + }, + { + "epoch": 8.614487415592388, + "grad_norm": 0.16320455074310303, + "learning_rate": 4.95248092223432e-06, + "loss": 1.744, + "step": 28066 + }, + { + "epoch": 8.614794352363413, + "grad_norm": 0.18789352476596832, + "learning_rate": 4.950324308937576e-06, + "loss": 1.7619, + "step": 28067 + }, + { + "epoch": 8.615101289134438, + "grad_norm": 0.13703711330890656, + "learning_rate": 4.948168140848125e-06, + "loss": 1.6652, + "step": 28068 + }, + { + "epoch": 8.615408225905464, + "grad_norm": 0.16874989867210388, + "learning_rate": 4.946012417987289e-06, + "loss": 1.6783, + "step": 28069 + }, + { + "epoch": 8.615715162676489, + "grad_norm": 0.1780901849269867, + "learning_rate": 4.943857140376362e-06, + "loss": 1.7224, + "step": 28070 + }, + { + "epoch": 8.616022099447514, + "grad_norm": 0.19460240006446838, + "learning_rate": 4.941702308036644e-06, + "loss": 1.7314, + "step": 28071 + }, + { + "epoch": 8.61632903621854, + "grad_norm": 0.14954718947410583, + "learning_rate": 4.9395479209894404e-06, + "loss": 1.708, + "step": 28072 + }, + { + "epoch": 8.616635972989565, + "grad_norm": 0.17461352050304413, + "learning_rate": 4.937393979256016e-06, + "loss": 1.7458, + "step": 28073 + }, + { + "epoch": 8.616942909760589, + "grad_norm": 0.17088642716407776, + "learning_rate": 4.935240482857706e-06, + "loss": 1.7315, + "step": 28074 + }, + { + "epoch": 8.617249846531614, + "grad_norm": 0.1478833556175232, + "learning_rate": 4.933087431815736e-06, + "loss": 1.6646, + "step": 28075 + }, + { + "epoch": 8.61755678330264, + "grad_norm": 0.1860690414905548, + "learning_rate": 4.930934826151435e-06, + "loss": 1.6472, + "step": 28076 + }, + { + "epoch": 8.617863720073665, + "grad_norm": 0.23674537241458893, + "learning_rate": 4.928782665886028e-06, + "loss": 1.7677, + 
"step": 28077 + }, + { + "epoch": 8.61817065684469, + "grad_norm": 0.1638643592596054, + "learning_rate": 4.926630951040817e-06, + "loss": 1.7438, + "step": 28078 + }, + { + "epoch": 8.618477593615715, + "grad_norm": 0.1631689965724945, + "learning_rate": 4.924479681637067e-06, + "loss": 1.7167, + "step": 28079 + }, + { + "epoch": 8.61878453038674, + "grad_norm": 0.1493348926305771, + "learning_rate": 4.922328857696012e-06, + "loss": 1.6929, + "step": 28080 + }, + { + "epoch": 8.619091467157766, + "grad_norm": 0.1545657068490982, + "learning_rate": 4.920178479238935e-06, + "loss": 1.7048, + "step": 28081 + }, + { + "epoch": 8.619398403928791, + "grad_norm": 0.20011793076992035, + "learning_rate": 4.918028546287073e-06, + "loss": 1.726, + "step": 28082 + }, + { + "epoch": 8.619705340699817, + "grad_norm": 0.1705177128314972, + "learning_rate": 4.915879058861678e-06, + "loss": 1.7774, + "step": 28083 + }, + { + "epoch": 8.620012277470842, + "grad_norm": 0.15467505156993866, + "learning_rate": 4.913730016983992e-06, + "loss": 1.6933, + "step": 28084 + }, + { + "epoch": 8.620319214241865, + "grad_norm": 0.1319204419851303, + "learning_rate": 4.911581420675248e-06, + "loss": 1.7309, + "step": 28085 + }, + { + "epoch": 8.62062615101289, + "grad_norm": 0.163784459233284, + "learning_rate": 4.909433269956687e-06, + "loss": 1.7221, + "step": 28086 + }, + { + "epoch": 8.620933087783916, + "grad_norm": 0.15852972865104675, + "learning_rate": 4.907285564849534e-06, + "loss": 1.7018, + "step": 28087 + }, + { + "epoch": 8.621240024554941, + "grad_norm": 0.14603203535079956, + "learning_rate": 4.905138305375018e-06, + "loss": 1.6786, + "step": 28088 + }, + { + "epoch": 8.621546961325967, + "grad_norm": 0.14899590611457825, + "learning_rate": 4.902991491554348e-06, + "loss": 1.7039, + "step": 28089 + }, + { + "epoch": 8.621853898096992, + "grad_norm": 0.13559244573116302, + "learning_rate": 4.9008451234087426e-06, + "loss": 1.6831, + "step": 28090 + }, + { + "epoch": 
8.622160834868017, + "grad_norm": 0.1433703601360321, + "learning_rate": 4.898699200959439e-06, + "loss": 1.6567, + "step": 28091 + }, + { + "epoch": 8.622467771639043, + "grad_norm": 0.12275373190641403, + "learning_rate": 4.89655372422761e-06, + "loss": 1.6897, + "step": 28092 + }, + { + "epoch": 8.622774708410068, + "grad_norm": 0.12706153094768524, + "learning_rate": 4.894408693234487e-06, + "loss": 1.6287, + "step": 28093 + }, + { + "epoch": 8.623081645181093, + "grad_norm": 0.18988971412181854, + "learning_rate": 4.892264108001232e-06, + "loss": 1.7021, + "step": 28094 + }, + { + "epoch": 8.623388581952117, + "grad_norm": 0.17477858066558838, + "learning_rate": 4.8901199685490785e-06, + "loss": 1.7289, + "step": 28095 + }, + { + "epoch": 8.623695518723142, + "grad_norm": 0.16172516345977783, + "learning_rate": 4.887976274899203e-06, + "loss": 1.7265, + "step": 28096 + }, + { + "epoch": 8.624002455494168, + "grad_norm": 0.14414304494857788, + "learning_rate": 4.885833027072772e-06, + "loss": 1.6795, + "step": 28097 + }, + { + "epoch": 8.624309392265193, + "grad_norm": 0.17894591391086578, + "learning_rate": 4.8836902250909975e-06, + "loss": 1.7564, + "step": 28098 + }, + { + "epoch": 8.624616329036218, + "grad_norm": 0.141717329621315, + "learning_rate": 4.881547868975022e-06, + "loss": 1.7047, + "step": 28099 + }, + { + "epoch": 8.624923265807244, + "grad_norm": 0.2184356302022934, + "learning_rate": 4.879405958746047e-06, + "loss": 1.7447, + "step": 28100 + }, + { + "epoch": 8.625230202578269, + "grad_norm": 0.1739104986190796, + "learning_rate": 4.877264494425227e-06, + "loss": 1.7003, + "step": 28101 + }, + { + "epoch": 8.625537139349294, + "grad_norm": 0.17033645510673523, + "learning_rate": 4.875123476033721e-06, + "loss": 1.7019, + "step": 28102 + }, + { + "epoch": 8.62584407612032, + "grad_norm": 0.1620563268661499, + "learning_rate": 4.872982903592699e-06, + "loss": 1.6955, + "step": 28103 + }, + { + "epoch": 8.626151012891345, + "grad_norm": 
0.16582414507865906, + "learning_rate": 4.870842777123308e-06, + "loss": 1.6687, + "step": 28104 + }, + { + "epoch": 8.62645794966237, + "grad_norm": 0.1620030403137207, + "learning_rate": 4.8687030966466985e-06, + "loss": 1.6762, + "step": 28105 + }, + { + "epoch": 8.626764886433394, + "grad_norm": 0.16777098178863525, + "learning_rate": 4.86656386218402e-06, + "loss": 1.7117, + "step": 28106 + }, + { + "epoch": 8.62707182320442, + "grad_norm": 0.16074253618717194, + "learning_rate": 4.8644250737564014e-06, + "loss": 1.7205, + "step": 28107 + }, + { + "epoch": 8.627378759975445, + "grad_norm": 0.1414494514465332, + "learning_rate": 4.862286731385007e-06, + "loss": 1.6936, + "step": 28108 + }, + { + "epoch": 8.62768569674647, + "grad_norm": 0.206336110830307, + "learning_rate": 4.860148835090933e-06, + "loss": 1.7443, + "step": 28109 + }, + { + "epoch": 8.627992633517495, + "grad_norm": 0.16304929554462433, + "learning_rate": 4.858011384895345e-06, + "loss": 1.7525, + "step": 28110 + }, + { + "epoch": 8.62829957028852, + "grad_norm": 0.16839462518692017, + "learning_rate": 4.855874380819325e-06, + "loss": 1.7462, + "step": 28111 + }, + { + "epoch": 8.628606507059546, + "grad_norm": 0.16088010370731354, + "learning_rate": 4.8537378228840246e-06, + "loss": 1.7662, + "step": 28112 + }, + { + "epoch": 8.628913443830571, + "grad_norm": 0.1818089783191681, + "learning_rate": 4.851601711110559e-06, + "loss": 1.752, + "step": 28113 + }, + { + "epoch": 8.629220380601597, + "grad_norm": 0.19034543633460999, + "learning_rate": 4.8494660455200065e-06, + "loss": 1.8474, + "step": 28114 + }, + { + "epoch": 8.629527317372622, + "grad_norm": 0.15762893855571747, + "learning_rate": 4.847330826133517e-06, + "loss": 1.7615, + "step": 28115 + }, + { + "epoch": 8.629834254143645, + "grad_norm": 0.14152835309505463, + "learning_rate": 4.845196052972145e-06, + "loss": 1.702, + "step": 28116 + }, + { + "epoch": 8.63014119091467, + "grad_norm": 0.14755114912986755, + "learning_rate": 
4.8430617260570245e-06, + "loss": 1.7044, + "step": 28117 + }, + { + "epoch": 8.630448127685696, + "grad_norm": 0.1483534872531891, + "learning_rate": 4.840927845409238e-06, + "loss": 1.6798, + "step": 28118 + }, + { + "epoch": 8.630755064456721, + "grad_norm": 0.15526263415813446, + "learning_rate": 4.8387944110498685e-06, + "loss": 1.7316, + "step": 28119 + }, + { + "epoch": 8.631062001227747, + "grad_norm": 0.21519999206066132, + "learning_rate": 4.836661422999999e-06, + "loss": 1.763, + "step": 28120 + }, + { + "epoch": 8.631368937998772, + "grad_norm": 0.14445212483406067, + "learning_rate": 4.8345288812807144e-06, + "loss": 1.6894, + "step": 28121 + }, + { + "epoch": 8.631675874769797, + "grad_norm": 0.1482388973236084, + "learning_rate": 4.832396785913091e-06, + "loss": 1.6629, + "step": 28122 + }, + { + "epoch": 8.631982811540823, + "grad_norm": 0.17132261395454407, + "learning_rate": 4.830265136918194e-06, + "loss": 1.7254, + "step": 28123 + }, + { + "epoch": 8.632289748311848, + "grad_norm": 0.1567879170179367, + "learning_rate": 4.828133934317081e-06, + "loss": 1.711, + "step": 28124 + }, + { + "epoch": 8.632596685082873, + "grad_norm": 0.18352550268173218, + "learning_rate": 4.826003178130845e-06, + "loss": 1.6853, + "step": 28125 + }, + { + "epoch": 8.632903621853899, + "grad_norm": 0.17370788753032684, + "learning_rate": 4.823872868380502e-06, + "loss": 1.7716, + "step": 28126 + }, + { + "epoch": 8.633210558624924, + "grad_norm": 0.14186492562294006, + "learning_rate": 4.821743005087148e-06, + "loss": 1.7003, + "step": 28127 + }, + { + "epoch": 8.633517495395948, + "grad_norm": 0.1501329094171524, + "learning_rate": 4.819613588271788e-06, + "loss": 1.7249, + "step": 28128 + }, + { + "epoch": 8.633824432166973, + "grad_norm": 0.13921687006950378, + "learning_rate": 4.817484617955498e-06, + "loss": 1.6646, + "step": 28129 + }, + { + "epoch": 8.634131368937998, + "grad_norm": 0.14346352219581604, + "learning_rate": 4.815356094159318e-06, + "loss": 
1.6784, + "step": 28130 + }, + { + "epoch": 8.634438305709024, + "grad_norm": 0.1550782024860382, + "learning_rate": 4.813228016904247e-06, + "loss": 1.7052, + "step": 28131 + }, + { + "epoch": 8.634745242480049, + "grad_norm": 0.13514211773872375, + "learning_rate": 4.81110038621137e-06, + "loss": 1.7095, + "step": 28132 + }, + { + "epoch": 8.635052179251074, + "grad_norm": 0.14162956178188324, + "learning_rate": 4.8089732021016575e-06, + "loss": 1.7001, + "step": 28133 + }, + { + "epoch": 8.6353591160221, + "grad_norm": 0.14066293835639954, + "learning_rate": 4.806846464596177e-06, + "loss": 1.7037, + "step": 28134 + }, + { + "epoch": 8.635666052793125, + "grad_norm": 0.1918545961380005, + "learning_rate": 4.804720173715921e-06, + "loss": 1.7334, + "step": 28135 + }, + { + "epoch": 8.63597298956415, + "grad_norm": 0.13358080387115479, + "learning_rate": 4.802594329481913e-06, + "loss": 1.7063, + "step": 28136 + }, + { + "epoch": 8.636279926335176, + "grad_norm": 0.14988988637924194, + "learning_rate": 4.800468931915158e-06, + "loss": 1.6871, + "step": 28137 + }, + { + "epoch": 8.6365868631062, + "grad_norm": 0.1423332244157791, + "learning_rate": 4.798343981036663e-06, + "loss": 1.7133, + "step": 28138 + }, + { + "epoch": 8.636893799877225, + "grad_norm": 0.1372760534286499, + "learning_rate": 4.796219476867425e-06, + "loss": 1.6522, + "step": 28139 + }, + { + "epoch": 8.63720073664825, + "grad_norm": 0.14779186248779297, + "learning_rate": 4.794095419428446e-06, + "loss": 1.669, + "step": 28140 + }, + { + "epoch": 8.637507673419275, + "grad_norm": 0.1412673145532608, + "learning_rate": 4.7919718087406975e-06, + "loss": 1.6767, + "step": 28141 + }, + { + "epoch": 8.6378146101903, + "grad_norm": 0.13006745278835297, + "learning_rate": 4.789848644825201e-06, + "loss": 1.6804, + "step": 28142 + }, + { + "epoch": 8.638121546961326, + "grad_norm": 0.15673677623271942, + "learning_rate": 4.787725927702896e-06, + "loss": 1.7053, + "step": 28143 + }, + { + "epoch": 
8.638428483732351, + "grad_norm": 0.17693878710269928, + "learning_rate": 4.785603657394805e-06, + "loss": 1.7207, + "step": 28144 + }, + { + "epoch": 8.638735420503377, + "grad_norm": 0.15449829399585724, + "learning_rate": 4.7834818339218654e-06, + "loss": 1.7433, + "step": 28145 + }, + { + "epoch": 8.639042357274402, + "grad_norm": 0.14260755479335785, + "learning_rate": 4.781360457305062e-06, + "loss": 1.6707, + "step": 28146 + }, + { + "epoch": 8.639349294045427, + "grad_norm": 0.13936764001846313, + "learning_rate": 4.7792395275653715e-06, + "loss": 1.6749, + "step": 28147 + }, + { + "epoch": 8.639656230816453, + "grad_norm": 0.14369705319404602, + "learning_rate": 4.7771190447237215e-06, + "loss": 1.6943, + "step": 28148 + }, + { + "epoch": 8.639963167587476, + "grad_norm": 0.18439368903636932, + "learning_rate": 4.774999008801107e-06, + "loss": 1.7714, + "step": 28149 + }, + { + "epoch": 8.640270104358502, + "grad_norm": 0.15348297357559204, + "learning_rate": 4.772879419818438e-06, + "loss": 1.7315, + "step": 28150 + }, + { + "epoch": 8.640577041129527, + "grad_norm": 0.16643862426280975, + "learning_rate": 4.770760277796693e-06, + "loss": 1.7196, + "step": 28151 + }, + { + "epoch": 8.640883977900552, + "grad_norm": 0.16105540096759796, + "learning_rate": 4.768641582756811e-06, + "loss": 1.7504, + "step": 28152 + }, + { + "epoch": 8.641190914671578, + "grad_norm": 0.135291188955307, + "learning_rate": 4.766523334719714e-06, + "loss": 1.663, + "step": 28153 + }, + { + "epoch": 8.641497851442603, + "grad_norm": 0.15021322667598724, + "learning_rate": 4.764405533706351e-06, + "loss": 1.7318, + "step": 28154 + }, + { + "epoch": 8.641804788213628, + "grad_norm": 0.13949114084243774, + "learning_rate": 4.762288179737645e-06, + "loss": 1.6909, + "step": 28155 + }, + { + "epoch": 8.642111724984654, + "grad_norm": 0.17211735248565674, + "learning_rate": 4.760171272834524e-06, + "loss": 1.7539, + "step": 28156 + }, + { + "epoch": 8.642418661755679, + "grad_norm": 
0.12576675415039062, + "learning_rate": 4.7580548130179034e-06, + "loss": 1.6816, + "step": 28157 + }, + { + "epoch": 8.642725598526704, + "grad_norm": 0.18624669313430786, + "learning_rate": 4.755938800308696e-06, + "loss": 1.7976, + "step": 28158 + }, + { + "epoch": 8.643032535297728, + "grad_norm": 0.20610935986042023, + "learning_rate": 4.753823234727834e-06, + "loss": 1.7192, + "step": 28159 + }, + { + "epoch": 8.643339472068753, + "grad_norm": 0.15127690136432648, + "learning_rate": 4.751708116296194e-06, + "loss": 1.6918, + "step": 28160 + }, + { + "epoch": 8.643646408839778, + "grad_norm": 0.14993508160114288, + "learning_rate": 4.7495934450347115e-06, + "loss": 1.7075, + "step": 28161 + }, + { + "epoch": 8.643953345610804, + "grad_norm": 0.16896332800388336, + "learning_rate": 4.747479220964252e-06, + "loss": 1.6971, + "step": 28162 + }, + { + "epoch": 8.644260282381829, + "grad_norm": 0.20022685825824738, + "learning_rate": 4.745365444105737e-06, + "loss": 1.7479, + "step": 28163 + }, + { + "epoch": 8.644567219152854, + "grad_norm": 0.1731337308883667, + "learning_rate": 4.7432521144800565e-06, + "loss": 1.7384, + "step": 28164 + }, + { + "epoch": 8.64487415592388, + "grad_norm": 0.13517920672893524, + "learning_rate": 4.7411392321080605e-06, + "loss": 1.6611, + "step": 28165 + }, + { + "epoch": 8.645181092694905, + "grad_norm": 0.177021324634552, + "learning_rate": 4.739026797010676e-06, + "loss": 1.7779, + "step": 28166 + }, + { + "epoch": 8.64548802946593, + "grad_norm": 0.14956676959991455, + "learning_rate": 4.736914809208737e-06, + "loss": 1.6933, + "step": 28167 + }, + { + "epoch": 8.645794966236956, + "grad_norm": 0.15683145821094513, + "learning_rate": 4.734803268723143e-06, + "loss": 1.7067, + "step": 28168 + }, + { + "epoch": 8.646101903007981, + "grad_norm": 0.198720321059227, + "learning_rate": 4.732692175574755e-06, + "loss": 1.6567, + "step": 28169 + }, + { + "epoch": 8.646408839779006, + "grad_norm": 0.18899580836296082, + "learning_rate": 
4.730581529784439e-06, + "loss": 1.7069, + "step": 28170 + }, + { + "epoch": 8.64671577655003, + "grad_norm": 0.17795316874980927, + "learning_rate": 4.728471331373041e-06, + "loss": 1.6803, + "step": 28171 + }, + { + "epoch": 8.647022713321055, + "grad_norm": 0.18296107649803162, + "learning_rate": 4.7263615803614325e-06, + "loss": 1.7774, + "step": 28172 + }, + { + "epoch": 8.64732965009208, + "grad_norm": 0.13994812965393066, + "learning_rate": 4.724252276770453e-06, + "loss": 1.6826, + "step": 28173 + }, + { + "epoch": 8.647636586863106, + "grad_norm": 0.14969824254512787, + "learning_rate": 4.722143420620945e-06, + "loss": 1.6529, + "step": 28174 + }, + { + "epoch": 8.647943523634131, + "grad_norm": 0.14949028193950653, + "learning_rate": 4.7200350119337485e-06, + "loss": 1.7007, + "step": 28175 + }, + { + "epoch": 8.648250460405157, + "grad_norm": 0.14786000549793243, + "learning_rate": 4.71792705072972e-06, + "loss": 1.6999, + "step": 28176 + }, + { + "epoch": 8.648557397176182, + "grad_norm": 0.12665456533432007, + "learning_rate": 4.715819537029659e-06, + "loss": 1.6414, + "step": 28177 + }, + { + "epoch": 8.648864333947207, + "grad_norm": 0.19015786051750183, + "learning_rate": 4.713712470854437e-06, + "loss": 1.7328, + "step": 28178 + }, + { + "epoch": 8.649171270718233, + "grad_norm": 0.20775510370731354, + "learning_rate": 4.711605852224827e-06, + "loss": 1.7735, + "step": 28179 + }, + { + "epoch": 8.649478207489258, + "grad_norm": 0.13774684071540833, + "learning_rate": 4.709499681161678e-06, + "loss": 1.7139, + "step": 28180 + }, + { + "epoch": 8.649785144260282, + "grad_norm": 0.17355668544769287, + "learning_rate": 4.707393957685813e-06, + "loss": 1.7046, + "step": 28181 + }, + { + "epoch": 8.650092081031307, + "grad_norm": 0.21687985956668854, + "learning_rate": 4.70528868181801e-06, + "loss": 1.6736, + "step": 28182 + }, + { + "epoch": 8.650399017802332, + "grad_norm": 0.13978178799152374, + "learning_rate": 4.703183853579107e-06, + "loss": 
1.6841, + "step": 28183 + }, + { + "epoch": 8.650705954573358, + "grad_norm": 0.1476740539073944, + "learning_rate": 4.701079472989878e-06, + "loss": 1.6633, + "step": 28184 + }, + { + "epoch": 8.651012891344383, + "grad_norm": 0.17175909876823425, + "learning_rate": 4.698975540071138e-06, + "loss": 1.7059, + "step": 28185 + }, + { + "epoch": 8.651319828115408, + "grad_norm": 0.16164059937000275, + "learning_rate": 4.696872054843671e-06, + "loss": 1.7038, + "step": 28186 + }, + { + "epoch": 8.651626764886434, + "grad_norm": 0.1541287899017334, + "learning_rate": 4.694769017328271e-06, + "loss": 1.6583, + "step": 28187 + }, + { + "epoch": 8.651933701657459, + "grad_norm": 0.19379135966300964, + "learning_rate": 4.6926664275457165e-06, + "loss": 1.7375, + "step": 28188 + }, + { + "epoch": 8.652240638428484, + "grad_norm": 0.12427667528390884, + "learning_rate": 4.690564285516785e-06, + "loss": 1.6434, + "step": 28189 + }, + { + "epoch": 8.65254757519951, + "grad_norm": 0.15416522324085236, + "learning_rate": 4.6884625912622605e-06, + "loss": 1.7551, + "step": 28190 + }, + { + "epoch": 8.652854511970535, + "grad_norm": 0.1467018723487854, + "learning_rate": 4.6863613448029035e-06, + "loss": 1.704, + "step": 28191 + }, + { + "epoch": 8.653161448741558, + "grad_norm": 0.15078933537006378, + "learning_rate": 4.684260546159469e-06, + "loss": 1.7382, + "step": 28192 + }, + { + "epoch": 8.653468385512584, + "grad_norm": 0.13681283593177795, + "learning_rate": 4.682160195352758e-06, + "loss": 1.6732, + "step": 28193 + }, + { + "epoch": 8.65377532228361, + "grad_norm": 0.16412119567394257, + "learning_rate": 4.680060292403476e-06, + "loss": 1.7394, + "step": 28194 + }, + { + "epoch": 8.654082259054634, + "grad_norm": 0.14504186809062958, + "learning_rate": 4.677960837332423e-06, + "loss": 1.6602, + "step": 28195 + }, + { + "epoch": 8.65438919582566, + "grad_norm": 0.15267091989517212, + "learning_rate": 4.6758618301603105e-06, + "loss": 1.7041, + "step": 28196 + }, + { + 
"epoch": 8.654696132596685, + "grad_norm": 0.1807365119457245, + "learning_rate": 4.673763270907899e-06, + "loss": 1.7556, + "step": 28197 + }, + { + "epoch": 8.65500306936771, + "grad_norm": 0.16227813065052032, + "learning_rate": 4.671665159595939e-06, + "loss": 1.6976, + "step": 28198 + }, + { + "epoch": 8.655310006138736, + "grad_norm": 0.16095015406608582, + "learning_rate": 4.6695674962451305e-06, + "loss": 1.7078, + "step": 28199 + }, + { + "epoch": 8.655616942909761, + "grad_norm": 0.1518808901309967, + "learning_rate": 4.667470280876246e-06, + "loss": 1.6999, + "step": 28200 + }, + { + "epoch": 8.655923879680786, + "grad_norm": 0.13343939185142517, + "learning_rate": 4.665373513509974e-06, + "loss": 1.7186, + "step": 28201 + }, + { + "epoch": 8.65623081645181, + "grad_norm": 0.1545572429895401, + "learning_rate": 4.6632771941670535e-06, + "loss": 1.7281, + "step": 28202 + }, + { + "epoch": 8.656537753222835, + "grad_norm": 0.13296550512313843, + "learning_rate": 4.661181322868208e-06, + "loss": 1.6632, + "step": 28203 + }, + { + "epoch": 8.65684468999386, + "grad_norm": 0.15362371504306793, + "learning_rate": 4.659085899634141e-06, + "loss": 1.7415, + "step": 28204 + }, + { + "epoch": 8.657151626764886, + "grad_norm": 0.14498870074748993, + "learning_rate": 4.65699092448556e-06, + "loss": 1.7342, + "step": 28205 + }, + { + "epoch": 8.657458563535911, + "grad_norm": 0.19409331679344177, + "learning_rate": 4.654896397443176e-06, + "loss": 1.7562, + "step": 28206 + }, + { + "epoch": 8.657765500306937, + "grad_norm": 0.15481562912464142, + "learning_rate": 4.652802318527677e-06, + "loss": 1.6905, + "step": 28207 + }, + { + "epoch": 8.658072437077962, + "grad_norm": 0.17566657066345215, + "learning_rate": 4.650708687759769e-06, + "loss": 1.6902, + "step": 28208 + }, + { + "epoch": 8.658379373848987, + "grad_norm": 0.13994581997394562, + "learning_rate": 4.648615505160125e-06, + "loss": 1.672, + "step": 28209 + }, + { + "epoch": 8.658686310620013, + "grad_norm": 
0.34969639778137207, + "learning_rate": 4.646522770749467e-06, + "loss": 1.6959, + "step": 28210 + }, + { + "epoch": 8.658993247391038, + "grad_norm": 0.16637352108955383, + "learning_rate": 4.644430484548428e-06, + "loss": 1.7119, + "step": 28211 + }, + { + "epoch": 8.659300184162063, + "grad_norm": 0.16540484130382538, + "learning_rate": 4.642338646577738e-06, + "loss": 1.7541, + "step": 28212 + }, + { + "epoch": 8.659607120933089, + "grad_norm": 0.13890287280082703, + "learning_rate": 4.640247256858016e-06, + "loss": 1.7117, + "step": 28213 + }, + { + "epoch": 8.659914057704112, + "grad_norm": 0.1403251439332962, + "learning_rate": 4.63815631540997e-06, + "loss": 1.697, + "step": 28214 + }, + { + "epoch": 8.660220994475138, + "grad_norm": 0.13313040137290955, + "learning_rate": 4.63606582225426e-06, + "loss": 1.6587, + "step": 28215 + }, + { + "epoch": 8.660527931246163, + "grad_norm": 0.12887243926525116, + "learning_rate": 4.63397577741152e-06, + "loss": 1.6441, + "step": 28216 + }, + { + "epoch": 8.660834868017188, + "grad_norm": 0.15074272453784943, + "learning_rate": 4.631886180902434e-06, + "loss": 1.7176, + "step": 28217 + }, + { + "epoch": 8.661141804788214, + "grad_norm": 0.12572859227657318, + "learning_rate": 4.629797032747624e-06, + "loss": 1.6779, + "step": 28218 + }, + { + "epoch": 8.661448741559239, + "grad_norm": 0.1607646495103836, + "learning_rate": 4.627708332967762e-06, + "loss": 1.747, + "step": 28219 + }, + { + "epoch": 8.661755678330264, + "grad_norm": 0.14080339670181274, + "learning_rate": 4.625620081583482e-06, + "loss": 1.7063, + "step": 28220 + }, + { + "epoch": 8.66206261510129, + "grad_norm": 0.17140309512615204, + "learning_rate": 4.623532278615411e-06, + "loss": 1.7265, + "step": 28221 + }, + { + "epoch": 8.662369551872315, + "grad_norm": 0.1564357578754425, + "learning_rate": 4.621444924084195e-06, + "loss": 1.7265, + "step": 28222 + }, + { + "epoch": 8.66267648864334, + "grad_norm": 0.20058012008666992, + "learning_rate": 
4.619358018010461e-06, + "loss": 1.7824, + "step": 28223 + }, + { + "epoch": 8.662983425414364, + "grad_norm": 0.16060246527194977, + "learning_rate": 4.617271560414827e-06, + "loss": 1.7329, + "step": 28224 + }, + { + "epoch": 8.66329036218539, + "grad_norm": 0.1967579573392868, + "learning_rate": 4.6151855513179136e-06, + "loss": 1.7386, + "step": 28225 + }, + { + "epoch": 8.663597298956415, + "grad_norm": 0.14853200316429138, + "learning_rate": 4.613099990740338e-06, + "loss": 1.6727, + "step": 28226 + }, + { + "epoch": 8.66390423572744, + "grad_norm": 0.1625850945711136, + "learning_rate": 4.611014878702713e-06, + "loss": 1.7074, + "step": 28227 + }, + { + "epoch": 8.664211172498465, + "grad_norm": 0.15605251491069794, + "learning_rate": 4.608930215225627e-06, + "loss": 1.7092, + "step": 28228 + }, + { + "epoch": 8.66451810926949, + "grad_norm": 0.14355498552322388, + "learning_rate": 4.606846000329723e-06, + "loss": 1.6819, + "step": 28229 + }, + { + "epoch": 8.664825046040516, + "grad_norm": 0.16151221096515656, + "learning_rate": 4.604762234035548e-06, + "loss": 1.7251, + "step": 28230 + }, + { + "epoch": 8.665131982811541, + "grad_norm": 0.1165589988231659, + "learning_rate": 4.60267891636374e-06, + "loss": 1.644, + "step": 28231 + }, + { + "epoch": 8.665438919582567, + "grad_norm": 0.13766367733478546, + "learning_rate": 4.6005960473348594e-06, + "loss": 1.6526, + "step": 28232 + }, + { + "epoch": 8.665745856353592, + "grad_norm": 0.15400783717632294, + "learning_rate": 4.598513626969486e-06, + "loss": 1.7356, + "step": 28233 + }, + { + "epoch": 8.666052793124617, + "grad_norm": 0.1635274887084961, + "learning_rate": 4.596431655288236e-06, + "loss": 1.6846, + "step": 28234 + }, + { + "epoch": 8.66635972989564, + "grad_norm": 0.17310741543769836, + "learning_rate": 4.5943501323116365e-06, + "loss": 1.7321, + "step": 28235 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.14390932023525238, + "learning_rate": 4.592269058060295e-06, + "loss": 1.6606, + 
"step": 28236 + }, + { + "epoch": 8.666973603437691, + "grad_norm": 0.15254996716976166, + "learning_rate": 4.590188432554759e-06, + "loss": 1.6796, + "step": 28237 + }, + { + "epoch": 8.667280540208717, + "grad_norm": 0.16224564611911774, + "learning_rate": 4.588108255815599e-06, + "loss": 1.7139, + "step": 28238 + }, + { + "epoch": 8.667587476979742, + "grad_norm": 0.14472807943820953, + "learning_rate": 4.586028527863373e-06, + "loss": 1.681, + "step": 28239 + }, + { + "epoch": 8.667894413750767, + "grad_norm": 0.17748364806175232, + "learning_rate": 4.583949248718627e-06, + "loss": 1.7205, + "step": 28240 + }, + { + "epoch": 8.668201350521793, + "grad_norm": 0.16917170584201813, + "learning_rate": 4.581870418401918e-06, + "loss": 1.7475, + "step": 28241 + }, + { + "epoch": 8.668508287292818, + "grad_norm": 0.15715333819389343, + "learning_rate": 4.579792036933784e-06, + "loss": 1.6988, + "step": 28242 + }, + { + "epoch": 8.668815224063843, + "grad_norm": 0.18384969234466553, + "learning_rate": 4.577714104334768e-06, + "loss": 1.715, + "step": 28243 + }, + { + "epoch": 8.669122160834869, + "grad_norm": 0.20845188200473785, + "learning_rate": 4.575636620625401e-06, + "loss": 1.784, + "step": 28244 + }, + { + "epoch": 8.669429097605892, + "grad_norm": 0.16388222575187683, + "learning_rate": 4.5735595858262095e-06, + "loss": 1.7091, + "step": 28245 + }, + { + "epoch": 8.669736034376918, + "grad_norm": 0.27372440695762634, + "learning_rate": 4.571482999957744e-06, + "loss": 1.6903, + "step": 28246 + }, + { + "epoch": 8.670042971147943, + "grad_norm": 0.14129513502120972, + "learning_rate": 4.569406863040493e-06, + "loss": 1.692, + "step": 28247 + }, + { + "epoch": 8.670349907918968, + "grad_norm": 0.1707242876291275, + "learning_rate": 4.567331175095013e-06, + "loss": 1.7542, + "step": 28248 + }, + { + "epoch": 8.670656844689994, + "grad_norm": 0.16061219573020935, + "learning_rate": 4.565255936141783e-06, + "loss": 1.7086, + "step": 28249 + }, + { + "epoch": 
8.670963781461019, + "grad_norm": 0.186256542801857, + "learning_rate": 4.5631811462013116e-06, + "loss": 1.7298, + "step": 28250 + }, + { + "epoch": 8.671270718232044, + "grad_norm": 0.19365312159061432, + "learning_rate": 4.561106805294141e-06, + "loss": 1.7714, + "step": 28251 + }, + { + "epoch": 8.67157765500307, + "grad_norm": 0.12306032329797745, + "learning_rate": 4.55903291344072e-06, + "loss": 1.7148, + "step": 28252 + }, + { + "epoch": 8.671884591774095, + "grad_norm": 0.14681962132453918, + "learning_rate": 4.556959470661592e-06, + "loss": 1.6909, + "step": 28253 + }, + { + "epoch": 8.67219152854512, + "grad_norm": 0.22181211411952972, + "learning_rate": 4.554886476977205e-06, + "loss": 1.7747, + "step": 28254 + }, + { + "epoch": 8.672498465316146, + "grad_norm": 0.15168124437332153, + "learning_rate": 4.5528139324080784e-06, + "loss": 1.7206, + "step": 28255 + }, + { + "epoch": 8.67280540208717, + "grad_norm": 0.15613441169261932, + "learning_rate": 4.550741836974676e-06, + "loss": 1.7062, + "step": 28256 + }, + { + "epoch": 8.673112338858195, + "grad_norm": 0.1939506232738495, + "learning_rate": 4.548670190697485e-06, + "loss": 1.747, + "step": 28257 + }, + { + "epoch": 8.67341927562922, + "grad_norm": 0.15883082151412964, + "learning_rate": 4.5465989935969785e-06, + "loss": 1.7169, + "step": 28258 + }, + { + "epoch": 8.673726212400245, + "grad_norm": 0.14583253860473633, + "learning_rate": 4.5445282456936185e-06, + "loss": 1.6918, + "step": 28259 + }, + { + "epoch": 8.67403314917127, + "grad_norm": 0.12797339260578156, + "learning_rate": 4.5424579470078725e-06, + "loss": 1.6791, + "step": 28260 + }, + { + "epoch": 8.674340085942296, + "grad_norm": 0.18248072266578674, + "learning_rate": 4.5403880975602e-06, + "loss": 1.7338, + "step": 28261 + }, + { + "epoch": 8.674647022713321, + "grad_norm": 0.1547573208808899, + "learning_rate": 4.538318697371047e-06, + "loss": 1.6259, + "step": 28262 + }, + { + "epoch": 8.674953959484347, + "grad_norm": 
0.18609635531902313, + "learning_rate": 4.536249746460897e-06, + "loss": 1.6943, + "step": 28263 + }, + { + "epoch": 8.675260896255372, + "grad_norm": 0.15615214407444, + "learning_rate": 4.534181244850161e-06, + "loss": 1.6851, + "step": 28264 + }, + { + "epoch": 8.675567833026397, + "grad_norm": 0.17061203718185425, + "learning_rate": 4.532113192559296e-06, + "loss": 1.7612, + "step": 28265 + }, + { + "epoch": 8.675874769797423, + "grad_norm": 0.17611360549926758, + "learning_rate": 4.530045589608739e-06, + "loss": 1.7109, + "step": 28266 + }, + { + "epoch": 8.676181706568446, + "grad_norm": 0.14381951093673706, + "learning_rate": 4.527978436018915e-06, + "loss": 1.6914, + "step": 28267 + }, + { + "epoch": 8.676488643339471, + "grad_norm": 0.18309952318668365, + "learning_rate": 4.525911731810273e-06, + "loss": 1.8044, + "step": 28268 + }, + { + "epoch": 8.676795580110497, + "grad_norm": 0.16398122906684875, + "learning_rate": 4.523845477003208e-06, + "loss": 1.7002, + "step": 28269 + }, + { + "epoch": 8.677102516881522, + "grad_norm": 0.12263865768909454, + "learning_rate": 4.521779671618176e-06, + "loss": 1.6777, + "step": 28270 + }, + { + "epoch": 8.677409453652547, + "grad_norm": 0.17702268064022064, + "learning_rate": 4.519714315675555e-06, + "loss": 1.697, + "step": 28271 + }, + { + "epoch": 8.677716390423573, + "grad_norm": 0.1558506339788437, + "learning_rate": 4.517649409195779e-06, + "loss": 1.7151, + "step": 28272 + }, + { + "epoch": 8.678023327194598, + "grad_norm": 0.19969215989112854, + "learning_rate": 4.5155849521992536e-06, + "loss": 1.7952, + "step": 28273 + }, + { + "epoch": 8.678330263965623, + "grad_norm": 0.14770828187465668, + "learning_rate": 4.513520944706379e-06, + "loss": 1.6846, + "step": 28274 + }, + { + "epoch": 8.678637200736649, + "grad_norm": 0.22692953050136566, + "learning_rate": 4.511457386737544e-06, + "loss": 1.7599, + "step": 28275 + }, + { + "epoch": 8.678944137507674, + "grad_norm": 0.1689091920852661, + "learning_rate": 
4.509394278313156e-06, + "loss": 1.67, + "step": 28276 + }, + { + "epoch": 8.6792510742787, + "grad_norm": 0.12909743189811707, + "learning_rate": 4.507331619453592e-06, + "loss": 1.7062, + "step": 28277 + }, + { + "epoch": 8.679558011049723, + "grad_norm": 0.15877538919448853, + "learning_rate": 4.505269410179241e-06, + "loss": 1.688, + "step": 28278 + }, + { + "epoch": 8.679864947820748, + "grad_norm": 0.13565565645694733, + "learning_rate": 4.503207650510477e-06, + "loss": 1.6742, + "step": 28279 + }, + { + "epoch": 8.680171884591774, + "grad_norm": 0.1718231737613678, + "learning_rate": 4.501146340467699e-06, + "loss": 1.71, + "step": 28280 + }, + { + "epoch": 8.680478821362799, + "grad_norm": 0.14713016152381897, + "learning_rate": 4.499085480071252e-06, + "loss": 1.698, + "step": 28281 + }, + { + "epoch": 8.680785758133824, + "grad_norm": 0.15546689927577972, + "learning_rate": 4.49702506934151e-06, + "loss": 1.6863, + "step": 28282 + }, + { + "epoch": 8.68109269490485, + "grad_norm": 0.1528242826461792, + "learning_rate": 4.494965108298832e-06, + "loss": 1.7236, + "step": 28283 + }, + { + "epoch": 8.681399631675875, + "grad_norm": 0.14601372182369232, + "learning_rate": 4.4929055969635755e-06, + "loss": 1.7008, + "step": 28284 + }, + { + "epoch": 8.6817065684469, + "grad_norm": 0.18398553133010864, + "learning_rate": 4.490846535356119e-06, + "loss": 1.7117, + "step": 28285 + }, + { + "epoch": 8.682013505217926, + "grad_norm": 0.16242702305316925, + "learning_rate": 4.4887879234967675e-06, + "loss": 1.7204, + "step": 28286 + }, + { + "epoch": 8.682320441988951, + "grad_norm": 0.11883296817541122, + "learning_rate": 4.486729761405911e-06, + "loss": 1.665, + "step": 28287 + }, + { + "epoch": 8.682627378759975, + "grad_norm": 0.157135009765625, + "learning_rate": 4.484672049103844e-06, + "loss": 1.7438, + "step": 28288 + }, + { + "epoch": 8.682934315531, + "grad_norm": 0.17938226461410522, + "learning_rate": 4.482614786610939e-06, + "loss": 1.7022, + "step": 
28289 + }, + { + "epoch": 8.683241252302025, + "grad_norm": 0.20547567307949066, + "learning_rate": 4.480557973947514e-06, + "loss": 1.7818, + "step": 28290 + }, + { + "epoch": 8.68354818907305, + "grad_norm": 0.2329530566930771, + "learning_rate": 4.478501611133889e-06, + "loss": 1.7702, + "step": 28291 + }, + { + "epoch": 8.683855125844076, + "grad_norm": 0.1893717646598816, + "learning_rate": 4.476445698190396e-06, + "loss": 1.7614, + "step": 28292 + }, + { + "epoch": 8.684162062615101, + "grad_norm": 0.17520616948604584, + "learning_rate": 4.474390235137349e-06, + "loss": 1.7585, + "step": 28293 + }, + { + "epoch": 8.684468999386127, + "grad_norm": 0.14743252098560333, + "learning_rate": 4.4723352219950605e-06, + "loss": 1.7008, + "step": 28294 + }, + { + "epoch": 8.684775936157152, + "grad_norm": 0.1734410971403122, + "learning_rate": 4.470280658783843e-06, + "loss": 1.6979, + "step": 28295 + }, + { + "epoch": 8.685082872928177, + "grad_norm": 0.1811109185218811, + "learning_rate": 4.468226545523985e-06, + "loss": 1.7124, + "step": 28296 + }, + { + "epoch": 8.685389809699203, + "grad_norm": 0.12056677043437958, + "learning_rate": 4.466172882235819e-06, + "loss": 1.6642, + "step": 28297 + }, + { + "epoch": 8.685696746470228, + "grad_norm": 0.159573495388031, + "learning_rate": 4.464119668939609e-06, + "loss": 1.7055, + "step": 28298 + }, + { + "epoch": 8.686003683241251, + "grad_norm": 0.17341920733451843, + "learning_rate": 4.46206690565566e-06, + "loss": 1.7036, + "step": 28299 + }, + { + "epoch": 8.686310620012277, + "grad_norm": 0.1660631000995636, + "learning_rate": 4.46001459240426e-06, + "loss": 1.7303, + "step": 28300 + }, + { + "epoch": 8.686617556783302, + "grad_norm": 0.18377192318439484, + "learning_rate": 4.4579627292056724e-06, + "loss": 1.7301, + "step": 28301 + }, + { + "epoch": 8.686924493554327, + "grad_norm": 0.13730384409427643, + "learning_rate": 4.455911316080213e-06, + "loss": 1.6399, + "step": 28302 + }, + { + "epoch": 8.687231430325353, 
+ "grad_norm": 0.25353705883026123, + "learning_rate": 4.453860353048112e-06, + "loss": 1.7682, + "step": 28303 + }, + { + "epoch": 8.687538367096378, + "grad_norm": 0.15051604807376862, + "learning_rate": 4.451809840129673e-06, + "loss": 1.7268, + "step": 28304 + }, + { + "epoch": 8.687845303867404, + "grad_norm": 0.2090475857257843, + "learning_rate": 4.449759777345131e-06, + "loss": 1.7697, + "step": 28305 + }, + { + "epoch": 8.688152240638429, + "grad_norm": 0.13042283058166504, + "learning_rate": 4.4477101647147745e-06, + "loss": 1.667, + "step": 28306 + }, + { + "epoch": 8.688459177409454, + "grad_norm": 0.1518186628818512, + "learning_rate": 4.445661002258838e-06, + "loss": 1.7095, + "step": 28307 + }, + { + "epoch": 8.68876611418048, + "grad_norm": 0.13992765545845032, + "learning_rate": 4.443612289997584e-06, + "loss": 1.6761, + "step": 28308 + }, + { + "epoch": 8.689073050951503, + "grad_norm": 0.17726075649261475, + "learning_rate": 4.44156402795125e-06, + "loss": 1.7444, + "step": 28309 + }, + { + "epoch": 8.689379987722528, + "grad_norm": 0.15143834054470062, + "learning_rate": 4.439516216140088e-06, + "loss": 1.7078, + "step": 28310 + }, + { + "epoch": 8.689686924493554, + "grad_norm": 0.17791767418384552, + "learning_rate": 4.437468854584326e-06, + "loss": 1.7402, + "step": 28311 + }, + { + "epoch": 8.689993861264579, + "grad_norm": 0.19582994282245636, + "learning_rate": 4.435421943304208e-06, + "loss": 1.757, + "step": 28312 + }, + { + "epoch": 8.690300798035604, + "grad_norm": 0.19730351865291595, + "learning_rate": 4.43337548231994e-06, + "loss": 1.6982, + "step": 28313 + }, + { + "epoch": 8.69060773480663, + "grad_norm": 0.16093717515468597, + "learning_rate": 4.43132947165179e-06, + "loss": 1.7116, + "step": 28314 + }, + { + "epoch": 8.690914671577655, + "grad_norm": 0.16639035940170288, + "learning_rate": 4.429283911319937e-06, + "loss": 1.7166, + "step": 28315 + }, + { + "epoch": 8.69122160834868, + "grad_norm": 0.13834281265735626, + 
"learning_rate": 4.427238801344608e-06, + "loss": 1.7058, + "step": 28316 + }, + { + "epoch": 8.691528545119706, + "grad_norm": 0.1761016994714737, + "learning_rate": 4.4251941417460194e-06, + "loss": 1.7155, + "step": 28317 + }, + { + "epoch": 8.691835481890731, + "grad_norm": 0.17754366993904114, + "learning_rate": 4.423149932544363e-06, + "loss": 1.768, + "step": 28318 + }, + { + "epoch": 8.692142418661756, + "grad_norm": 0.1563618779182434, + "learning_rate": 4.42110617375987e-06, + "loss": 1.706, + "step": 28319 + }, + { + "epoch": 8.692449355432782, + "grad_norm": 0.16851158440113068, + "learning_rate": 4.419062865412704e-06, + "loss": 1.7084, + "step": 28320 + }, + { + "epoch": 8.692756292203805, + "grad_norm": 0.16056731343269348, + "learning_rate": 4.4170200075230925e-06, + "loss": 1.6771, + "step": 28321 + }, + { + "epoch": 8.69306322897483, + "grad_norm": 0.17098097503185272, + "learning_rate": 4.414977600111192e-06, + "loss": 1.712, + "step": 28322 + }, + { + "epoch": 8.693370165745856, + "grad_norm": 0.17442475259304047, + "learning_rate": 4.412935643197208e-06, + "loss": 1.7725, + "step": 28323 + }, + { + "epoch": 8.693677102516881, + "grad_norm": 0.16090531647205353, + "learning_rate": 4.410894136801308e-06, + "loss": 1.6996, + "step": 28324 + }, + { + "epoch": 8.693984039287907, + "grad_norm": 0.17448033392429352, + "learning_rate": 4.408853080943681e-06, + "loss": 1.6934, + "step": 28325 + }, + { + "epoch": 8.694290976058932, + "grad_norm": 0.15201367437839508, + "learning_rate": 4.406812475644484e-06, + "loss": 1.6671, + "step": 28326 + }, + { + "epoch": 8.694597912829957, + "grad_norm": 0.15211759507656097, + "learning_rate": 4.404772320923889e-06, + "loss": 1.7281, + "step": 28327 + }, + { + "epoch": 8.694904849600983, + "grad_norm": 0.1757364720106125, + "learning_rate": 4.402732616802063e-06, + "loss": 1.7085, + "step": 28328 + }, + { + "epoch": 8.695211786372008, + "grad_norm": 0.17995139956474304, + "learning_rate": 4.400693363299152e-06, + 
"loss": 1.7335, + "step": 28329 + }, + { + "epoch": 8.695518723143033, + "grad_norm": 0.1404990553855896, + "learning_rate": 4.398654560435312e-06, + "loss": 1.7102, + "step": 28330 + }, + { + "epoch": 8.695825659914057, + "grad_norm": 0.17141692340373993, + "learning_rate": 4.396616208230708e-06, + "loss": 1.7195, + "step": 28331 + }, + { + "epoch": 8.696132596685082, + "grad_norm": 0.17162097990512848, + "learning_rate": 4.394578306705471e-06, + "loss": 1.7075, + "step": 28332 + }, + { + "epoch": 8.696439533456108, + "grad_norm": 0.18884550034999847, + "learning_rate": 4.392540855879734e-06, + "loss": 1.72, + "step": 28333 + }, + { + "epoch": 8.696746470227133, + "grad_norm": 0.21365602314472198, + "learning_rate": 4.3905038557736425e-06, + "loss": 1.8024, + "step": 28334 + }, + { + "epoch": 8.697053406998158, + "grad_norm": 0.1939813494682312, + "learning_rate": 4.388467306407318e-06, + "loss": 1.6694, + "step": 28335 + }, + { + "epoch": 8.697360343769184, + "grad_norm": 0.20518864691257477, + "learning_rate": 4.386431207800906e-06, + "loss": 1.7708, + "step": 28336 + }, + { + "epoch": 8.697667280540209, + "grad_norm": 0.16070924699306488, + "learning_rate": 4.3843955599745025e-06, + "loss": 1.7496, + "step": 28337 + }, + { + "epoch": 8.697974217311234, + "grad_norm": 0.17010091245174408, + "learning_rate": 4.3823603629482514e-06, + "loss": 1.6996, + "step": 28338 + }, + { + "epoch": 8.69828115408226, + "grad_norm": 0.14453141391277313, + "learning_rate": 4.380325616742237e-06, + "loss": 1.7032, + "step": 28339 + }, + { + "epoch": 8.698588090853285, + "grad_norm": 0.1959836632013321, + "learning_rate": 4.378291321376593e-06, + "loss": 1.7861, + "step": 28340 + }, + { + "epoch": 8.69889502762431, + "grad_norm": 0.12473960220813751, + "learning_rate": 4.376257476871415e-06, + "loss": 1.6465, + "step": 28341 + }, + { + "epoch": 8.699201964395334, + "grad_norm": 0.17088855803012848, + "learning_rate": 4.374224083246797e-06, + "loss": 1.7701, + "step": 28342 + }, + { 
+ "epoch": 8.699508901166359, + "grad_norm": 0.17513783276081085, + "learning_rate": 4.372191140522846e-06, + "loss": 1.7107, + "step": 28343 + }, + { + "epoch": 8.699815837937384, + "grad_norm": 0.15522748231887817, + "learning_rate": 4.370158648719641e-06, + "loss": 1.6961, + "step": 28344 + }, + { + "epoch": 8.70012277470841, + "grad_norm": 0.1434583216905594, + "learning_rate": 4.36812660785727e-06, + "loss": 1.6927, + "step": 28345 + }, + { + "epoch": 8.700429711479435, + "grad_norm": 0.1571590155363083, + "learning_rate": 4.366095017955824e-06, + "loss": 1.6747, + "step": 28346 + }, + { + "epoch": 8.70073664825046, + "grad_norm": 0.15448859333992004, + "learning_rate": 4.364063879035357e-06, + "loss": 1.7052, + "step": 28347 + }, + { + "epoch": 8.701043585021486, + "grad_norm": 0.18512596189975739, + "learning_rate": 4.362033191115983e-06, + "loss": 1.7516, + "step": 28348 + }, + { + "epoch": 8.701350521792511, + "grad_norm": 0.14646342396736145, + "learning_rate": 4.360002954217734e-06, + "loss": 1.7152, + "step": 28349 + }, + { + "epoch": 8.701657458563536, + "grad_norm": 0.15107101202011108, + "learning_rate": 4.357973168360691e-06, + "loss": 1.6659, + "step": 28350 + }, + { + "epoch": 8.701964395334562, + "grad_norm": 0.1887415051460266, + "learning_rate": 4.355943833564908e-06, + "loss": 1.7506, + "step": 28351 + }, + { + "epoch": 8.702271332105585, + "grad_norm": 0.17195916175842285, + "learning_rate": 4.353914949850424e-06, + "loss": 1.7571, + "step": 28352 + }, + { + "epoch": 8.70257826887661, + "grad_norm": 0.1679403930902481, + "learning_rate": 4.35188651723733e-06, + "loss": 1.7321, + "step": 28353 + }, + { + "epoch": 8.702885205647636, + "grad_norm": 0.1917678713798523, + "learning_rate": 4.349858535745633e-06, + "loss": 1.7387, + "step": 28354 + }, + { + "epoch": 8.703192142418661, + "grad_norm": 0.1321115791797638, + "learning_rate": 4.347831005395408e-06, + "loss": 1.7221, + "step": 28355 + }, + { + "epoch": 8.703499079189687, + "grad_norm": 
0.14510731399059296, + "learning_rate": 4.345803926206654e-06, + "loss": 1.6905, + "step": 28356 + }, + { + "epoch": 8.703806015960712, + "grad_norm": 0.158061221241951, + "learning_rate": 4.343777298199431e-06, + "loss": 1.6605, + "step": 28357 + }, + { + "epoch": 8.704112952731737, + "grad_norm": 0.15366631746292114, + "learning_rate": 4.341751121393767e-06, + "loss": 1.7069, + "step": 28358 + }, + { + "epoch": 8.704419889502763, + "grad_norm": 0.20126941800117493, + "learning_rate": 4.339725395809674e-06, + "loss": 1.7704, + "step": 28359 + }, + { + "epoch": 8.704726826273788, + "grad_norm": 0.14276063442230225, + "learning_rate": 4.337700121467181e-06, + "loss": 1.6704, + "step": 28360 + }, + { + "epoch": 8.705033763044813, + "grad_norm": 0.15362146496772766, + "learning_rate": 4.335675298386293e-06, + "loss": 1.6486, + "step": 28361 + }, + { + "epoch": 8.705340699815839, + "grad_norm": 0.16178739070892334, + "learning_rate": 4.333650926587035e-06, + "loss": 1.703, + "step": 28362 + }, + { + "epoch": 8.705647636586864, + "grad_norm": 0.16188332438468933, + "learning_rate": 4.331627006089395e-06, + "loss": 1.6912, + "step": 28363 + }, + { + "epoch": 8.705954573357888, + "grad_norm": 0.1567341834306717, + "learning_rate": 4.3296035369133846e-06, + "loss": 1.6767, + "step": 28364 + }, + { + "epoch": 8.706261510128913, + "grad_norm": 0.16202545166015625, + "learning_rate": 4.327580519079011e-06, + "loss": 1.6836, + "step": 28365 + }, + { + "epoch": 8.706568446899938, + "grad_norm": 0.17161825299263, + "learning_rate": 4.325557952606252e-06, + "loss": 1.7271, + "step": 28366 + }, + { + "epoch": 8.706875383670964, + "grad_norm": 0.14774417877197266, + "learning_rate": 4.323535837515097e-06, + "loss": 1.6815, + "step": 28367 + }, + { + "epoch": 8.707182320441989, + "grad_norm": 0.19654276967048645, + "learning_rate": 4.321514173825531e-06, + "loss": 1.6633, + "step": 28368 + }, + { + "epoch": 8.707489257213014, + "grad_norm": 0.18064813315868378, + "learning_rate": 
4.319492961557531e-06, + "loss": 1.7222, + "step": 28369 + }, + { + "epoch": 8.70779619398404, + "grad_norm": 0.14830774068832397, + "learning_rate": 4.317472200731087e-06, + "loss": 1.6921, + "step": 28370 + }, + { + "epoch": 8.708103130755065, + "grad_norm": 0.17077864706516266, + "learning_rate": 4.315451891366146e-06, + "loss": 1.6785, + "step": 28371 + }, + { + "epoch": 8.70841006752609, + "grad_norm": 0.1815696656703949, + "learning_rate": 4.313432033482701e-06, + "loss": 1.6865, + "step": 28372 + }, + { + "epoch": 8.708717004297116, + "grad_norm": 0.17936676740646362, + "learning_rate": 4.311412627100686e-06, + "loss": 1.7477, + "step": 28373 + }, + { + "epoch": 8.70902394106814, + "grad_norm": 0.16955824196338654, + "learning_rate": 4.30939367224007e-06, + "loss": 1.6906, + "step": 28374 + }, + { + "epoch": 8.709330877839164, + "grad_norm": 0.14489254355430603, + "learning_rate": 4.307375168920813e-06, + "loss": 1.6777, + "step": 28375 + }, + { + "epoch": 8.70963781461019, + "grad_norm": 0.18070191144943237, + "learning_rate": 4.305357117162856e-06, + "loss": 1.6955, + "step": 28376 + }, + { + "epoch": 8.709944751381215, + "grad_norm": 0.18469898402690887, + "learning_rate": 4.3033395169861375e-06, + "loss": 1.7364, + "step": 28377 + }, + { + "epoch": 8.71025168815224, + "grad_norm": 0.13740944862365723, + "learning_rate": 4.301322368410604e-06, + "loss": 1.6781, + "step": 28378 + }, + { + "epoch": 8.710558624923266, + "grad_norm": 0.16305440664291382, + "learning_rate": 4.299305671456189e-06, + "loss": 1.7277, + "step": 28379 + }, + { + "epoch": 8.710865561694291, + "grad_norm": 0.15460261702537537, + "learning_rate": 4.29728942614282e-06, + "loss": 1.7536, + "step": 28380 + }, + { + "epoch": 8.711172498465316, + "grad_norm": 0.13714177906513214, + "learning_rate": 4.2952736324904205e-06, + "loss": 1.7417, + "step": 28381 + }, + { + "epoch": 8.711479435236342, + "grad_norm": 0.22590506076812744, + "learning_rate": 4.29325829051892e-06, + "loss": 1.6888, + 
"step": 28382 + }, + { + "epoch": 8.711786372007367, + "grad_norm": 0.17581406235694885, + "learning_rate": 4.291243400248229e-06, + "loss": 1.7781, + "step": 28383 + }, + { + "epoch": 8.712093308778392, + "grad_norm": 0.15321393311023712, + "learning_rate": 4.289228961698266e-06, + "loss": 1.6613, + "step": 28384 + }, + { + "epoch": 8.712400245549416, + "grad_norm": 0.1657101809978485, + "learning_rate": 4.287214974888931e-06, + "loss": 1.7152, + "step": 28385 + }, + { + "epoch": 8.712707182320441, + "grad_norm": 0.18134190142154694, + "learning_rate": 4.28520143984013e-06, + "loss": 1.7265, + "step": 28386 + }, + { + "epoch": 8.713014119091467, + "grad_norm": 0.1232382282614708, + "learning_rate": 4.28318835657176e-06, + "loss": 1.6457, + "step": 28387 + }, + { + "epoch": 8.713321055862492, + "grad_norm": 0.1339728981256485, + "learning_rate": 4.281175725103715e-06, + "loss": 1.6516, + "step": 28388 + }, + { + "epoch": 8.713627992633517, + "grad_norm": 0.15603719651699066, + "learning_rate": 4.2791635454559e-06, + "loss": 1.717, + "step": 28389 + }, + { + "epoch": 8.713934929404543, + "grad_norm": 0.17226538062095642, + "learning_rate": 4.277151817648179e-06, + "loss": 1.7088, + "step": 28390 + }, + { + "epoch": 8.714241866175568, + "grad_norm": 0.17237617075443268, + "learning_rate": 4.275140541700445e-06, + "loss": 1.7467, + "step": 28391 + }, + { + "epoch": 8.714548802946593, + "grad_norm": 0.1798042505979538, + "learning_rate": 4.2731297176325734e-06, + "loss": 1.7157, + "step": 28392 + }, + { + "epoch": 8.714855739717619, + "grad_norm": 0.1701999455690384, + "learning_rate": 4.271119345464436e-06, + "loss": 1.7575, + "step": 28393 + }, + { + "epoch": 8.715162676488644, + "grad_norm": 0.13981005549430847, + "learning_rate": 4.2691094252159e-06, + "loss": 1.7315, + "step": 28394 + }, + { + "epoch": 8.715469613259668, + "grad_norm": 0.19189679622650146, + "learning_rate": 4.267099956906828e-06, + "loss": 1.7338, + "step": 28395 + }, + { + "epoch": 
8.715776550030693, + "grad_norm": 0.14194947481155396, + "learning_rate": 4.265090940557076e-06, + "loss": 1.6999, + "step": 28396 + }, + { + "epoch": 8.716083486801718, + "grad_norm": 0.15809695422649384, + "learning_rate": 4.263082376186506e-06, + "loss": 1.6643, + "step": 28397 + }, + { + "epoch": 8.716390423572744, + "grad_norm": 0.12897074222564697, + "learning_rate": 4.261074263814963e-06, + "loss": 1.7096, + "step": 28398 + }, + { + "epoch": 8.716697360343769, + "grad_norm": 0.1517125964164734, + "learning_rate": 4.259066603462292e-06, + "loss": 1.7101, + "step": 28399 + }, + { + "epoch": 8.717004297114794, + "grad_norm": 0.1489602029323578, + "learning_rate": 4.257059395148333e-06, + "loss": 1.7097, + "step": 28400 + }, + { + "epoch": 8.71731123388582, + "grad_norm": 0.15182913839817047, + "learning_rate": 4.255052638892926e-06, + "loss": 1.7161, + "step": 28401 + }, + { + "epoch": 8.717618170656845, + "grad_norm": 0.1973588615655899, + "learning_rate": 4.253046334715899e-06, + "loss": 1.7452, + "step": 28402 + }, + { + "epoch": 8.71792510742787, + "grad_norm": 0.17291557788848877, + "learning_rate": 4.251040482637081e-06, + "loss": 1.7671, + "step": 28403 + }, + { + "epoch": 8.718232044198896, + "grad_norm": 0.1525208055973053, + "learning_rate": 4.249035082676295e-06, + "loss": 1.6891, + "step": 28404 + }, + { + "epoch": 8.718538980969921, + "grad_norm": 0.1681409627199173, + "learning_rate": 4.247030134853352e-06, + "loss": 1.728, + "step": 28405 + }, + { + "epoch": 8.718845917740946, + "grad_norm": 0.18142938613891602, + "learning_rate": 4.245025639188094e-06, + "loss": 1.6952, + "step": 28406 + }, + { + "epoch": 8.71915285451197, + "grad_norm": 0.17891576886177063, + "learning_rate": 4.243021595700286e-06, + "loss": 1.7304, + "step": 28407 + }, + { + "epoch": 8.719459791282995, + "grad_norm": 0.1676199585199356, + "learning_rate": 4.24101800440978e-06, + "loss": 1.6756, + "step": 28408 + }, + { + "epoch": 8.71976672805402, + "grad_norm": 
0.16762350499629974, + "learning_rate": 4.239014865336339e-06, + "loss": 1.6899, + "step": 28409 + }, + { + "epoch": 8.720073664825046, + "grad_norm": 0.14751142263412476, + "learning_rate": 4.2370121784997776e-06, + "loss": 1.677, + "step": 28410 + }, + { + "epoch": 8.720380601596071, + "grad_norm": 0.16818544268608093, + "learning_rate": 4.235009943919887e-06, + "loss": 1.7132, + "step": 28411 + }, + { + "epoch": 8.720687538367097, + "grad_norm": 0.14754259586334229, + "learning_rate": 4.233008161616453e-06, + "loss": 1.6744, + "step": 28412 + }, + { + "epoch": 8.720994475138122, + "grad_norm": 0.1303185522556305, + "learning_rate": 4.231006831609258e-06, + "loss": 1.6783, + "step": 28413 + }, + { + "epoch": 8.721301411909147, + "grad_norm": 0.14147131145000458, + "learning_rate": 4.229005953918075e-06, + "loss": 1.6911, + "step": 28414 + }, + { + "epoch": 8.721608348680173, + "grad_norm": 0.19011028110980988, + "learning_rate": 4.227005528562688e-06, + "loss": 1.7245, + "step": 28415 + }, + { + "epoch": 8.721915285451198, + "grad_norm": 0.1327231526374817, + "learning_rate": 4.225005555562855e-06, + "loss": 1.6676, + "step": 28416 + }, + { + "epoch": 8.722222222222221, + "grad_norm": 0.13436436653137207, + "learning_rate": 4.223006034938354e-06, + "loss": 1.6926, + "step": 28417 + }, + { + "epoch": 8.722529158993247, + "grad_norm": 0.18722930550575256, + "learning_rate": 4.221006966708929e-06, + "loss": 1.7759, + "step": 28418 + }, + { + "epoch": 8.722836095764272, + "grad_norm": 0.18999920785427094, + "learning_rate": 4.219008350894355e-06, + "loss": 1.7385, + "step": 28419 + }, + { + "epoch": 8.723143032535297, + "grad_norm": 0.14250624179840088, + "learning_rate": 4.217010187514364e-06, + "loss": 1.7263, + "step": 28420 + }, + { + "epoch": 8.723449969306323, + "grad_norm": 0.1577407717704773, + "learning_rate": 4.21501247658872e-06, + "loss": 1.8055, + "step": 28421 + }, + { + "epoch": 8.723756906077348, + "grad_norm": 0.120110422372818, + "learning_rate": 
4.213015218137145e-06, + "loss": 1.6519, + "step": 28422 + }, + { + "epoch": 8.724063842848373, + "grad_norm": 0.17998605966567993, + "learning_rate": 4.211018412179407e-06, + "loss": 1.6827, + "step": 28423 + }, + { + "epoch": 8.724370779619399, + "grad_norm": 0.14941653609275818, + "learning_rate": 4.209022058735213e-06, + "loss": 1.7089, + "step": 28424 + }, + { + "epoch": 8.724677716390424, + "grad_norm": 0.13641475141048431, + "learning_rate": 4.207026157824312e-06, + "loss": 1.6825, + "step": 28425 + }, + { + "epoch": 8.72498465316145, + "grad_norm": 0.1666809320449829, + "learning_rate": 4.205030709466401e-06, + "loss": 1.6958, + "step": 28426 + }, + { + "epoch": 8.725291589932475, + "grad_norm": 0.1236952468752861, + "learning_rate": 4.20303571368123e-06, + "loss": 1.6417, + "step": 28427 + }, + { + "epoch": 8.725598526703498, + "grad_norm": 0.1483321338891983, + "learning_rate": 4.201041170488501e-06, + "loss": 1.7082, + "step": 28428 + }, + { + "epoch": 8.725905463474524, + "grad_norm": 0.17827022075653076, + "learning_rate": 4.1990470799079255e-06, + "loss": 1.6506, + "step": 28429 + }, + { + "epoch": 8.726212400245549, + "grad_norm": 0.17171478271484375, + "learning_rate": 4.197053441959215e-06, + "loss": 1.7403, + "step": 28430 + }, + { + "epoch": 8.726519337016574, + "grad_norm": 0.18554572761058807, + "learning_rate": 4.195060256662064e-06, + "loss": 1.6899, + "step": 28431 + }, + { + "epoch": 8.7268262737876, + "grad_norm": 0.30604809522628784, + "learning_rate": 4.193067524036176e-06, + "loss": 1.7656, + "step": 28432 + }, + { + "epoch": 8.727133210558625, + "grad_norm": 0.1759488433599472, + "learning_rate": 4.191075244101245e-06, + "loss": 1.7167, + "step": 28433 + }, + { + "epoch": 8.72744014732965, + "grad_norm": 0.15285685658454895, + "learning_rate": 4.18908341687696e-06, + "loss": 1.6576, + "step": 28434 + }, + { + "epoch": 8.727747084100676, + "grad_norm": 0.17283809185028076, + "learning_rate": 4.187092042382995e-06, + "loss": 1.719, + 
"step": 28435 + }, + { + "epoch": 8.728054020871701, + "grad_norm": 0.1511228382587433, + "learning_rate": 4.1851011206390455e-06, + "loss": 1.6499, + "step": 28436 + }, + { + "epoch": 8.728360957642726, + "grad_norm": 0.13646523654460907, + "learning_rate": 4.183110651664779e-06, + "loss": 1.703, + "step": 28437 + }, + { + "epoch": 8.72866789441375, + "grad_norm": 0.16112352907657623, + "learning_rate": 4.181120635479863e-06, + "loss": 1.6963, + "step": 28438 + }, + { + "epoch": 8.728974831184775, + "grad_norm": 0.23064331710338593, + "learning_rate": 4.179131072103964e-06, + "loss": 1.7347, + "step": 28439 + }, + { + "epoch": 8.7292817679558, + "grad_norm": 0.17859068512916565, + "learning_rate": 4.177141961556763e-06, + "loss": 1.7963, + "step": 28440 + }, + { + "epoch": 8.729588704726826, + "grad_norm": 0.16455049812793732, + "learning_rate": 4.175153303857887e-06, + "loss": 1.6893, + "step": 28441 + }, + { + "epoch": 8.729895641497851, + "grad_norm": 0.1353607475757599, + "learning_rate": 4.173165099027021e-06, + "loss": 1.7165, + "step": 28442 + }, + { + "epoch": 8.730202578268877, + "grad_norm": 0.20421212911605835, + "learning_rate": 4.171177347083783e-06, + "loss": 1.7256, + "step": 28443 + }, + { + "epoch": 8.730509515039902, + "grad_norm": 0.17925186455249786, + "learning_rate": 4.169190048047833e-06, + "loss": 1.6819, + "step": 28444 + }, + { + "epoch": 8.730816451810927, + "grad_norm": 0.17959848046302795, + "learning_rate": 4.167203201938819e-06, + "loss": 1.7275, + "step": 28445 + }, + { + "epoch": 8.731123388581953, + "grad_norm": 0.13794639706611633, + "learning_rate": 4.165216808776357e-06, + "loss": 1.6694, + "step": 28446 + }, + { + "epoch": 8.731430325352978, + "grad_norm": 0.15895675122737885, + "learning_rate": 4.163230868580092e-06, + "loss": 1.7159, + "step": 28447 + }, + { + "epoch": 8.731737262124003, + "grad_norm": 0.16645625233650208, + "learning_rate": 4.161245381369644e-06, + "loss": 1.7068, + "step": 28448 + }, + { + "epoch": 
8.732044198895027, + "grad_norm": 0.17593564093112946, + "learning_rate": 4.15926034716464e-06, + "loss": 1.7013, + "step": 28449 + }, + { + "epoch": 8.732351135666052, + "grad_norm": 0.1613699495792389, + "learning_rate": 4.157275765984692e-06, + "loss": 1.6925, + "step": 28450 + }, + { + "epoch": 8.732658072437077, + "grad_norm": 0.21205542981624603, + "learning_rate": 4.155291637849412e-06, + "loss": 1.8401, + "step": 28451 + }, + { + "epoch": 8.732965009208103, + "grad_norm": 0.16209860146045685, + "learning_rate": 4.153307962778408e-06, + "loss": 1.7068, + "step": 28452 + }, + { + "epoch": 8.733271945979128, + "grad_norm": 0.17571625113487244, + "learning_rate": 4.1513247407912905e-06, + "loss": 1.7245, + "step": 28453 + }, + { + "epoch": 8.733578882750153, + "grad_norm": 0.12565423548221588, + "learning_rate": 4.149341971907655e-06, + "loss": 1.6714, + "step": 28454 + }, + { + "epoch": 8.733885819521179, + "grad_norm": 0.14843232929706573, + "learning_rate": 4.147359656147093e-06, + "loss": 1.6685, + "step": 28455 + }, + { + "epoch": 8.734192756292204, + "grad_norm": 0.1699068695306778, + "learning_rate": 4.145377793529193e-06, + "loss": 1.6808, + "step": 28456 + }, + { + "epoch": 8.73449969306323, + "grad_norm": 0.18543531000614166, + "learning_rate": 4.143396384073556e-06, + "loss": 1.7721, + "step": 28457 + }, + { + "epoch": 8.734806629834255, + "grad_norm": 0.15792638063430786, + "learning_rate": 4.141415427799744e-06, + "loss": 1.6804, + "step": 28458 + }, + { + "epoch": 8.735113566605278, + "grad_norm": 0.19353818893432617, + "learning_rate": 4.139434924727359e-06, + "loss": 1.7062, + "step": 28459 + }, + { + "epoch": 8.735420503376304, + "grad_norm": 0.14087705314159393, + "learning_rate": 4.137454874875935e-06, + "loss": 1.6287, + "step": 28460 + }, + { + "epoch": 8.735727440147329, + "grad_norm": 0.14594002068042755, + "learning_rate": 4.135475278265077e-06, + "loss": 1.6741, + "step": 28461 + }, + { + "epoch": 8.736034376918354, + "grad_norm": 
0.13943135738372803, + "learning_rate": 4.133496134914333e-06, + "loss": 1.7261, + "step": 28462 + }, + { + "epoch": 8.73634131368938, + "grad_norm": 0.20119191706180573, + "learning_rate": 4.131517444843264e-06, + "loss": 1.7719, + "step": 28463 + }, + { + "epoch": 8.736648250460405, + "grad_norm": 0.15612776577472687, + "learning_rate": 4.12953920807142e-06, + "loss": 1.6694, + "step": 28464 + }, + { + "epoch": 8.73695518723143, + "grad_norm": 0.15517298877239227, + "learning_rate": 4.127561424618359e-06, + "loss": 1.7225, + "step": 28465 + }, + { + "epoch": 8.737262124002456, + "grad_norm": 0.18650169670581818, + "learning_rate": 4.125584094503626e-06, + "loss": 1.7589, + "step": 28466 + }, + { + "epoch": 8.737569060773481, + "grad_norm": 0.19337934255599976, + "learning_rate": 4.123607217746755e-06, + "loss": 1.6754, + "step": 28467 + }, + { + "epoch": 8.737875997544506, + "grad_norm": 0.15818046033382416, + "learning_rate": 4.121630794367287e-06, + "loss": 1.7176, + "step": 28468 + }, + { + "epoch": 8.738182934315532, + "grad_norm": 0.14257800579071045, + "learning_rate": 4.11965482438475e-06, + "loss": 1.6961, + "step": 28469 + }, + { + "epoch": 8.738489871086557, + "grad_norm": 0.15100477635860443, + "learning_rate": 4.1176793078186785e-06, + "loss": 1.7161, + "step": 28470 + }, + { + "epoch": 8.73879680785758, + "grad_norm": 0.14171260595321655, + "learning_rate": 4.115704244688595e-06, + "loss": 1.6812, + "step": 28471 + }, + { + "epoch": 8.739103744628606, + "grad_norm": 0.13742563128471375, + "learning_rate": 4.1137296350140134e-06, + "loss": 1.6968, + "step": 28472 + }, + { + "epoch": 8.739410681399631, + "grad_norm": 0.131202831864357, + "learning_rate": 4.111755478814439e-06, + "loss": 1.6859, + "step": 28473 + }, + { + "epoch": 8.739717618170657, + "grad_norm": 0.14671406149864197, + "learning_rate": 4.109781776109411e-06, + "loss": 1.7227, + "step": 28474 + }, + { + "epoch": 8.740024554941682, + "grad_norm": 0.17391672730445862, + "learning_rate": 
4.107808526918405e-06, + "loss": 1.6926, + "step": 28475 + }, + { + "epoch": 8.740331491712707, + "grad_norm": 0.16088297963142395, + "learning_rate": 4.105835731260943e-06, + "loss": 1.7296, + "step": 28476 + }, + { + "epoch": 8.740638428483733, + "grad_norm": 0.15273302793502808, + "learning_rate": 4.1038633891564985e-06, + "loss": 1.6888, + "step": 28477 + }, + { + "epoch": 8.740945365254758, + "grad_norm": 0.16602970659732819, + "learning_rate": 4.101891500624588e-06, + "loss": 1.6924, + "step": 28478 + }, + { + "epoch": 8.741252302025783, + "grad_norm": 0.13952100276947021, + "learning_rate": 4.099920065684681e-06, + "loss": 1.6972, + "step": 28479 + }, + { + "epoch": 8.741559238796809, + "grad_norm": 0.18140468001365662, + "learning_rate": 4.097949084356273e-06, + "loss": 1.7417, + "step": 28480 + }, + { + "epoch": 8.741866175567832, + "grad_norm": 0.19571609795093536, + "learning_rate": 4.095978556658831e-06, + "loss": 1.7261, + "step": 28481 + }, + { + "epoch": 8.742173112338858, + "grad_norm": 0.1748526245355606, + "learning_rate": 4.094008482611838e-06, + "loss": 1.7975, + "step": 28482 + }, + { + "epoch": 8.742480049109883, + "grad_norm": 0.1984734982252121, + "learning_rate": 4.092038862234759e-06, + "loss": 1.7941, + "step": 28483 + }, + { + "epoch": 8.742786985880908, + "grad_norm": 0.1336900144815445, + "learning_rate": 4.090069695547055e-06, + "loss": 1.6612, + "step": 28484 + }, + { + "epoch": 8.743093922651934, + "grad_norm": 0.1755249798297882, + "learning_rate": 4.088100982568193e-06, + "loss": 1.679, + "step": 28485 + }, + { + "epoch": 8.743400859422959, + "grad_norm": 0.17111645638942719, + "learning_rate": 4.086132723317631e-06, + "loss": 1.739, + "step": 28486 + }, + { + "epoch": 8.743707796193984, + "grad_norm": 0.18933364748954773, + "learning_rate": 4.084164917814815e-06, + "loss": 1.7469, + "step": 28487 + }, + { + "epoch": 8.74401473296501, + "grad_norm": 0.15212221443653107, + "learning_rate": 4.082197566079188e-06, + "loss": 1.7137, + 
"step": 28488 + }, + { + "epoch": 8.744321669736035, + "grad_norm": 0.1428573727607727, + "learning_rate": 4.080230668130203e-06, + "loss": 1.67, + "step": 28489 + }, + { + "epoch": 8.74462860650706, + "grad_norm": 0.1688205450773239, + "learning_rate": 4.078264223987283e-06, + "loss": 1.7149, + "step": 28490 + }, + { + "epoch": 8.744935543278086, + "grad_norm": 0.23390214145183563, + "learning_rate": 4.07629823366989e-06, + "loss": 1.7647, + "step": 28491 + }, + { + "epoch": 8.745242480049109, + "grad_norm": 0.163333460688591, + "learning_rate": 4.074332697197419e-06, + "loss": 1.7047, + "step": 28492 + }, + { + "epoch": 8.745549416820134, + "grad_norm": 0.14970998466014862, + "learning_rate": 4.072367614589323e-06, + "loss": 1.6921, + "step": 28493 + }, + { + "epoch": 8.74585635359116, + "grad_norm": 0.18369705975055695, + "learning_rate": 4.070402985864996e-06, + "loss": 1.7266, + "step": 28494 + }, + { + "epoch": 8.746163290362185, + "grad_norm": 0.17579036951065063, + "learning_rate": 4.068438811043873e-06, + "loss": 1.742, + "step": 28495 + }, + { + "epoch": 8.74647022713321, + "grad_norm": 0.1286322921514511, + "learning_rate": 4.066475090145355e-06, + "loss": 1.6656, + "step": 28496 + }, + { + "epoch": 8.746777163904236, + "grad_norm": 0.1595929116010666, + "learning_rate": 4.06451182318886e-06, + "loss": 1.7079, + "step": 28497 + }, + { + "epoch": 8.747084100675261, + "grad_norm": 0.14556388556957245, + "learning_rate": 4.062549010193778e-06, + "loss": 1.6948, + "step": 28498 + }, + { + "epoch": 8.747391037446286, + "grad_norm": 0.19447384774684906, + "learning_rate": 4.060586651179516e-06, + "loss": 1.7648, + "step": 28499 + }, + { + "epoch": 8.747697974217312, + "grad_norm": 0.147284135222435, + "learning_rate": 4.058624746165457e-06, + "loss": 1.713, + "step": 28500 + }, + { + "epoch": 8.748004910988337, + "grad_norm": 0.17068512737751007, + "learning_rate": 4.056663295170998e-06, + "loss": 1.708, + "step": 28501 + }, + { + "epoch": 8.74831184775936, + 
"grad_norm": 0.15625207126140594, + "learning_rate": 4.054702298215523e-06, + "loss": 1.7152, + "step": 28502 + }, + { + "epoch": 8.748618784530386, + "grad_norm": 0.14633874595165253, + "learning_rate": 4.052741755318407e-06, + "loss": 1.7221, + "step": 28503 + }, + { + "epoch": 8.748925721301411, + "grad_norm": 0.15166686475276947, + "learning_rate": 4.0507816664990265e-06, + "loss": 1.7179, + "step": 28504 + }, + { + "epoch": 8.749232658072437, + "grad_norm": 0.12509481608867645, + "learning_rate": 4.0488220317767555e-06, + "loss": 1.6743, + "step": 28505 + }, + { + "epoch": 8.749539594843462, + "grad_norm": 0.20686158537864685, + "learning_rate": 4.046862851170957e-06, + "loss": 1.6925, + "step": 28506 + }, + { + "epoch": 8.749846531614487, + "grad_norm": 0.12619495391845703, + "learning_rate": 4.044904124700983e-06, + "loss": 1.6932, + "step": 28507 + }, + { + "epoch": 8.750153468385513, + "grad_norm": 0.1770995706319809, + "learning_rate": 4.0429458523862205e-06, + "loss": 1.7948, + "step": 28508 + }, + { + "epoch": 8.750460405156538, + "grad_norm": 0.22418050467967987, + "learning_rate": 4.040988034245991e-06, + "loss": 1.7008, + "step": 28509 + }, + { + "epoch": 8.750767341927563, + "grad_norm": 0.14798377454280853, + "learning_rate": 4.039030670299665e-06, + "loss": 1.6673, + "step": 28510 + }, + { + "epoch": 8.751074278698589, + "grad_norm": 0.182883620262146, + "learning_rate": 4.037073760566562e-06, + "loss": 1.7223, + "step": 28511 + }, + { + "epoch": 8.751381215469614, + "grad_norm": 0.14968620240688324, + "learning_rate": 4.035117305066044e-06, + "loss": 1.6656, + "step": 28512 + }, + { + "epoch": 8.75168815224064, + "grad_norm": 0.19700272381305695, + "learning_rate": 4.03316130381744e-06, + "loss": 1.7207, + "step": 28513 + }, + { + "epoch": 8.751995089011663, + "grad_norm": 0.17926210165023804, + "learning_rate": 4.031205756840073e-06, + "loss": 1.7131, + "step": 28514 + }, + { + "epoch": 8.752302025782688, + "grad_norm": 0.1471911519765854, + 
"learning_rate": 4.029250664153278e-06, + "loss": 1.6731, + "step": 28515 + }, + { + "epoch": 8.752608962553714, + "grad_norm": 0.18923047184944153, + "learning_rate": 4.0272960257763725e-06, + "loss": 1.7795, + "step": 28516 + }, + { + "epoch": 8.752915899324739, + "grad_norm": 0.14930424094200134, + "learning_rate": 4.025341841728675e-06, + "loss": 1.7201, + "step": 28517 + }, + { + "epoch": 8.753222836095764, + "grad_norm": 0.17335213720798492, + "learning_rate": 4.0233881120294915e-06, + "loss": 1.7297, + "step": 28518 + }, + { + "epoch": 8.75352977286679, + "grad_norm": 0.14489638805389404, + "learning_rate": 4.021434836698135e-06, + "loss": 1.7314, + "step": 28519 + }, + { + "epoch": 8.753836709637815, + "grad_norm": 0.16861389577388763, + "learning_rate": 4.019482015753912e-06, + "loss": 1.7362, + "step": 28520 + }, + { + "epoch": 8.75414364640884, + "grad_norm": 0.1467277705669403, + "learning_rate": 4.0175296492161115e-06, + "loss": 1.6607, + "step": 28521 + }, + { + "epoch": 8.754450583179866, + "grad_norm": 0.1556902825832367, + "learning_rate": 4.015577737104037e-06, + "loss": 1.747, + "step": 28522 + }, + { + "epoch": 8.754757519950891, + "grad_norm": 0.13337039947509766, + "learning_rate": 4.013626279436977e-06, + "loss": 1.7271, + "step": 28523 + }, + { + "epoch": 8.755064456721914, + "grad_norm": 0.1599043607711792, + "learning_rate": 4.011675276234206e-06, + "loss": 1.6859, + "step": 28524 + }, + { + "epoch": 8.75537139349294, + "grad_norm": 0.11567290872335434, + "learning_rate": 4.009724727515035e-06, + "loss": 1.6577, + "step": 28525 + }, + { + "epoch": 8.755678330263965, + "grad_norm": 0.16317762434482574, + "learning_rate": 4.0077746332987e-06, + "loss": 1.7041, + "step": 28526 + }, + { + "epoch": 8.75598526703499, + "grad_norm": 0.13116325438022614, + "learning_rate": 4.005824993604506e-06, + "loss": 1.6847, + "step": 28527 + }, + { + "epoch": 8.756292203806016, + "grad_norm": 0.14927831292152405, + "learning_rate": 4.003875808451696e-06, + 
"loss": 1.6312, + "step": 28528 + }, + { + "epoch": 8.756599140577041, + "grad_norm": 0.15273495018482208, + "learning_rate": 4.001927077859552e-06, + "loss": 1.7027, + "step": 28529 + }, + { + "epoch": 8.756906077348066, + "grad_norm": 0.17557594180107117, + "learning_rate": 3.999978801847326e-06, + "loss": 1.7294, + "step": 28530 + }, + { + "epoch": 8.757213014119092, + "grad_norm": 0.16061940789222717, + "learning_rate": 3.998030980434269e-06, + "loss": 1.7179, + "step": 28531 + }, + { + "epoch": 8.757519950890117, + "grad_norm": 0.1431310772895813, + "learning_rate": 3.996083613639634e-06, + "loss": 1.6811, + "step": 28532 + }, + { + "epoch": 8.757826887661142, + "grad_norm": 0.16931994259357452, + "learning_rate": 3.994136701482659e-06, + "loss": 1.7246, + "step": 28533 + }, + { + "epoch": 8.758133824432168, + "grad_norm": 0.13671527802944183, + "learning_rate": 3.992190243982596e-06, + "loss": 1.6877, + "step": 28534 + }, + { + "epoch": 8.758440761203191, + "grad_norm": 0.11943815648555756, + "learning_rate": 3.990244241158675e-06, + "loss": 1.6476, + "step": 28535 + }, + { + "epoch": 8.758747697974217, + "grad_norm": 0.17011673748493195, + "learning_rate": 3.988298693030124e-06, + "loss": 1.7105, + "step": 28536 + }, + { + "epoch": 8.759054634745242, + "grad_norm": 0.1379362791776657, + "learning_rate": 3.986353599616177e-06, + "loss": 1.6691, + "step": 28537 + }, + { + "epoch": 8.759361571516267, + "grad_norm": 0.13264621794223785, + "learning_rate": 3.984408960936048e-06, + "loss": 1.6766, + "step": 28538 + }, + { + "epoch": 8.759668508287293, + "grad_norm": 0.16023825109004974, + "learning_rate": 3.982464777008965e-06, + "loss": 1.6906, + "step": 28539 + }, + { + "epoch": 8.759975445058318, + "grad_norm": 0.1602984219789505, + "learning_rate": 3.980521047854135e-06, + "loss": 1.7094, + "step": 28540 + }, + { + "epoch": 8.760282381829343, + "grad_norm": 0.15421636402606964, + "learning_rate": 3.978577773490772e-06, + "loss": 1.7467, + "step": 28541 + }, + 
{ + "epoch": 8.760589318600369, + "grad_norm": 0.1427018642425537, + "learning_rate": 3.976634953938074e-06, + "loss": 1.7093, + "step": 28542 + }, + { + "epoch": 8.760896255371394, + "grad_norm": 0.143124058842659, + "learning_rate": 3.97469258921524e-06, + "loss": 1.6795, + "step": 28543 + }, + { + "epoch": 8.76120319214242, + "grad_norm": 0.14654754102230072, + "learning_rate": 3.97275067934148e-06, + "loss": 1.7246, + "step": 28544 + }, + { + "epoch": 8.761510128913443, + "grad_norm": 0.17374441027641296, + "learning_rate": 3.970809224335964e-06, + "loss": 1.6828, + "step": 28545 + }, + { + "epoch": 8.761817065684468, + "grad_norm": 0.1596260517835617, + "learning_rate": 3.968868224217898e-06, + "loss": 1.7816, + "step": 28546 + }, + { + "epoch": 8.762124002455494, + "grad_norm": 0.1467326581478119, + "learning_rate": 3.966927679006455e-06, + "loss": 1.6933, + "step": 28547 + }, + { + "epoch": 8.762430939226519, + "grad_norm": 0.12959735095500946, + "learning_rate": 3.9649875887208085e-06, + "loss": 1.6839, + "step": 28548 + }, + { + "epoch": 8.762737875997544, + "grad_norm": 0.13395267724990845, + "learning_rate": 3.963047953380145e-06, + "loss": 1.6968, + "step": 28549 + }, + { + "epoch": 8.76304481276857, + "grad_norm": 0.1369883418083191, + "learning_rate": 3.961108773003619e-06, + "loss": 1.6849, + "step": 28550 + }, + { + "epoch": 8.763351749539595, + "grad_norm": 0.19795149564743042, + "learning_rate": 3.959170047610405e-06, + "loss": 1.7593, + "step": 28551 + }, + { + "epoch": 8.76365868631062, + "grad_norm": 0.14946505427360535, + "learning_rate": 3.9572317772196555e-06, + "loss": 1.7309, + "step": 28552 + }, + { + "epoch": 8.763965623081646, + "grad_norm": 0.14034941792488098, + "learning_rate": 3.955293961850526e-06, + "loss": 1.6906, + "step": 28553 + }, + { + "epoch": 8.764272559852671, + "grad_norm": 0.1528625339269638, + "learning_rate": 3.9533566015221735e-06, + "loss": 1.7318, + "step": 28554 + }, + { + "epoch": 8.764579496623696, + 
"grad_norm": 0.15130504965782166, + "learning_rate": 3.951419696253733e-06, + "loss": 1.7147, + "step": 28555 + }, + { + "epoch": 8.764886433394722, + "grad_norm": 0.12917234003543854, + "learning_rate": 3.949483246064361e-06, + "loss": 1.687, + "step": 28556 + }, + { + "epoch": 8.765193370165745, + "grad_norm": 0.1918531060218811, + "learning_rate": 3.947547250973182e-06, + "loss": 1.7411, + "step": 28557 + }, + { + "epoch": 8.76550030693677, + "grad_norm": 0.16794945299625397, + "learning_rate": 3.9456117109993366e-06, + "loss": 1.762, + "step": 28558 + }, + { + "epoch": 8.765807243707796, + "grad_norm": 0.18833400309085846, + "learning_rate": 3.9436766261619465e-06, + "loss": 1.7641, + "step": 28559 + }, + { + "epoch": 8.766114180478821, + "grad_norm": 0.1939263939857483, + "learning_rate": 3.941741996480131e-06, + "loss": 1.7633, + "step": 28560 + }, + { + "epoch": 8.766421117249847, + "grad_norm": 0.15766844153404236, + "learning_rate": 3.939807821973029e-06, + "loss": 1.6989, + "step": 28561 + }, + { + "epoch": 8.766728054020872, + "grad_norm": 0.14704185724258423, + "learning_rate": 3.937874102659733e-06, + "loss": 1.7006, + "step": 28562 + }, + { + "epoch": 8.767034990791897, + "grad_norm": 0.1752765029668808, + "learning_rate": 3.935940838559376e-06, + "loss": 1.6738, + "step": 28563 + }, + { + "epoch": 8.767341927562923, + "grad_norm": 0.1801508069038391, + "learning_rate": 3.934008029691033e-06, + "loss": 1.7578, + "step": 28564 + }, + { + "epoch": 8.767648864333948, + "grad_norm": 0.17966793477535248, + "learning_rate": 3.932075676073838e-06, + "loss": 1.7347, + "step": 28565 + }, + { + "epoch": 8.767955801104973, + "grad_norm": 0.1435980200767517, + "learning_rate": 3.930143777726863e-06, + "loss": 1.6907, + "step": 28566 + }, + { + "epoch": 8.768262737875997, + "grad_norm": 0.1439833641052246, + "learning_rate": 3.928212334669218e-06, + "loss": 1.6804, + "step": 28567 + }, + { + "epoch": 8.768569674647022, + "grad_norm": 0.18037080764770508, + 
"learning_rate": 3.92628134691998e-06, + "loss": 1.7287, + "step": 28568 + }, + { + "epoch": 8.768876611418047, + "grad_norm": 0.1484454721212387, + "learning_rate": 3.924350814498229e-06, + "loss": 1.7128, + "step": 28569 + }, + { + "epoch": 8.769183548189073, + "grad_norm": 0.1302090734243393, + "learning_rate": 3.922420737423055e-06, + "loss": 1.647, + "step": 28570 + }, + { + "epoch": 8.769490484960098, + "grad_norm": 0.16756890714168549, + "learning_rate": 3.920491115713526e-06, + "loss": 1.7613, + "step": 28571 + }, + { + "epoch": 8.769797421731123, + "grad_norm": 0.17668041586875916, + "learning_rate": 3.918561949388705e-06, + "loss": 1.6957, + "step": 28572 + }, + { + "epoch": 8.770104358502149, + "grad_norm": 0.14288358390331268, + "learning_rate": 3.916633238467671e-06, + "loss": 1.6879, + "step": 28573 + }, + { + "epoch": 8.770411295273174, + "grad_norm": 0.16978147625923157, + "learning_rate": 3.9147049829694746e-06, + "loss": 1.7456, + "step": 28574 + }, + { + "epoch": 8.7707182320442, + "grad_norm": 0.13802385330200195, + "learning_rate": 3.91277718291318e-06, + "loss": 1.6799, + "step": 28575 + }, + { + "epoch": 8.771025168815225, + "grad_norm": 0.16819354891777039, + "learning_rate": 3.910849838317826e-06, + "loss": 1.7277, + "step": 28576 + }, + { + "epoch": 8.77133210558625, + "grad_norm": 0.16395528614521027, + "learning_rate": 3.908922949202465e-06, + "loss": 1.6976, + "step": 28577 + }, + { + "epoch": 8.771639042357274, + "grad_norm": 0.14518797397613525, + "learning_rate": 3.906996515586159e-06, + "loss": 1.6962, + "step": 28578 + }, + { + "epoch": 8.771945979128299, + "grad_norm": 0.17786560952663422, + "learning_rate": 3.905070537487909e-06, + "loss": 1.6593, + "step": 28579 + }, + { + "epoch": 8.772252915899324, + "grad_norm": 0.1793101727962494, + "learning_rate": 3.9031450149267845e-06, + "loss": 1.7699, + "step": 28580 + }, + { + "epoch": 8.77255985267035, + "grad_norm": 0.2498319298028946, + "learning_rate": 3.901219947921786e-06, + 
"loss": 1.745, + "step": 28581 + }, + { + "epoch": 8.772866789441375, + "grad_norm": 0.14886927604675293, + "learning_rate": 3.899295336491959e-06, + "loss": 1.6886, + "step": 28582 + }, + { + "epoch": 8.7731737262124, + "grad_norm": 0.1918812394142151, + "learning_rate": 3.897371180656317e-06, + "loss": 1.7717, + "step": 28583 + }, + { + "epoch": 8.773480662983426, + "grad_norm": 0.15470977127552032, + "learning_rate": 3.895447480433873e-06, + "loss": 1.6747, + "step": 28584 + }, + { + "epoch": 8.773787599754451, + "grad_norm": 0.15075071156024933, + "learning_rate": 3.893524235843648e-06, + "loss": 1.6753, + "step": 28585 + }, + { + "epoch": 8.774094536525476, + "grad_norm": 0.14186562597751617, + "learning_rate": 3.891601446904625e-06, + "loss": 1.6535, + "step": 28586 + }, + { + "epoch": 8.774401473296502, + "grad_norm": 0.16147254407405853, + "learning_rate": 3.8896791136358305e-06, + "loss": 1.6939, + "step": 28587 + }, + { + "epoch": 8.774708410067525, + "grad_norm": 0.1621028035879135, + "learning_rate": 3.8877572360562554e-06, + "loss": 1.7311, + "step": 28588 + }, + { + "epoch": 8.77501534683855, + "grad_norm": 0.1451268047094345, + "learning_rate": 3.885835814184885e-06, + "loss": 1.7029, + "step": 28589 + }, + { + "epoch": 8.775322283609576, + "grad_norm": 0.1404246985912323, + "learning_rate": 3.883914848040715e-06, + "loss": 1.7338, + "step": 28590 + }, + { + "epoch": 8.775629220380601, + "grad_norm": 0.15817701816558838, + "learning_rate": 3.881994337642731e-06, + "loss": 1.6944, + "step": 28591 + }, + { + "epoch": 8.775936157151627, + "grad_norm": 0.15462549030780792, + "learning_rate": 3.880074283009905e-06, + "loss": 1.7406, + "step": 28592 + }, + { + "epoch": 8.776243093922652, + "grad_norm": 0.1545121818780899, + "learning_rate": 3.878154684161217e-06, + "loss": 1.7009, + "step": 28593 + }, + { + "epoch": 8.776550030693677, + "grad_norm": 0.13072805106639862, + "learning_rate": 3.8762355411156305e-06, + "loss": 1.6798, + "step": 28594 + }, + { + 
"epoch": 8.776856967464703, + "grad_norm": 0.16369932889938354, + "learning_rate": 3.8743168538921344e-06, + "loss": 1.7046, + "step": 28595 + }, + { + "epoch": 8.777163904235728, + "grad_norm": 0.151187926530838, + "learning_rate": 3.8723986225096596e-06, + "loss": 1.7383, + "step": 28596 + }, + { + "epoch": 8.777470841006753, + "grad_norm": 0.16651193797588348, + "learning_rate": 3.8704808469871955e-06, + "loss": 1.7178, + "step": 28597 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 0.1387864351272583, + "learning_rate": 3.868563527343655e-06, + "loss": 1.6644, + "step": 28598 + }, + { + "epoch": 8.778084714548802, + "grad_norm": 0.14454610645771027, + "learning_rate": 3.866646663598022e-06, + "loss": 1.6699, + "step": 28599 + }, + { + "epoch": 8.778391651319827, + "grad_norm": 0.1706279069185257, + "learning_rate": 3.864730255769223e-06, + "loss": 1.7251, + "step": 28600 + }, + { + "epoch": 8.778698588090853, + "grad_norm": 0.14636628329753876, + "learning_rate": 3.8628143038762e-06, + "loss": 1.6774, + "step": 28601 + }, + { + "epoch": 8.779005524861878, + "grad_norm": 0.17533506453037262, + "learning_rate": 3.860898807937902e-06, + "loss": 1.7587, + "step": 28602 + }, + { + "epoch": 8.779312461632903, + "grad_norm": 0.2628023326396942, + "learning_rate": 3.858983767973223e-06, + "loss": 1.7571, + "step": 28603 + }, + { + "epoch": 8.779619398403929, + "grad_norm": 0.1412924826145172, + "learning_rate": 3.857069184001116e-06, + "loss": 1.699, + "step": 28604 + }, + { + "epoch": 8.779926335174954, + "grad_norm": 0.16076254844665527, + "learning_rate": 3.855155056040505e-06, + "loss": 1.7327, + "step": 28605 + }, + { + "epoch": 8.78023327194598, + "grad_norm": 0.1440654993057251, + "learning_rate": 3.85324138411029e-06, + "loss": 1.6941, + "step": 28606 + }, + { + "epoch": 8.780540208717005, + "grad_norm": 0.1956651359796524, + "learning_rate": 3.8513281682293956e-06, + "loss": 1.728, + "step": 28607 + }, + { + "epoch": 8.78084714548803, + "grad_norm": 
0.14176496863365173, + "learning_rate": 3.849415408416723e-06, + "loss": 1.7139, + "step": 28608 + }, + { + "epoch": 8.781154082259054, + "grad_norm": 0.18848197162151337, + "learning_rate": 3.84750310469118e-06, + "loss": 1.7092, + "step": 28609 + }, + { + "epoch": 8.781461019030079, + "grad_norm": 0.1622554361820221, + "learning_rate": 3.8455912570716565e-06, + "loss": 1.7137, + "step": 28610 + }, + { + "epoch": 8.781767955801104, + "grad_norm": 0.14255301654338837, + "learning_rate": 3.843679865577049e-06, + "loss": 1.6759, + "step": 28611 + }, + { + "epoch": 8.78207489257213, + "grad_norm": 0.15052112936973572, + "learning_rate": 3.841768930226264e-06, + "loss": 1.6749, + "step": 28612 + }, + { + "epoch": 8.782381829343155, + "grad_norm": 0.19591687619686127, + "learning_rate": 3.8398584510381584e-06, + "loss": 1.7263, + "step": 28613 + }, + { + "epoch": 8.78268876611418, + "grad_norm": 0.1651594340801239, + "learning_rate": 3.83794842803164e-06, + "loss": 1.763, + "step": 28614 + }, + { + "epoch": 8.782995702885206, + "grad_norm": 0.15854987502098083, + "learning_rate": 3.83603886122556e-06, + "loss": 1.7128, + "step": 28615 + }, + { + "epoch": 8.783302639656231, + "grad_norm": 0.14012815058231354, + "learning_rate": 3.834129750638804e-06, + "loss": 1.6711, + "step": 28616 + }, + { + "epoch": 8.783609576427256, + "grad_norm": 0.19335302710533142, + "learning_rate": 3.832221096290245e-06, + "loss": 1.7082, + "step": 28617 + }, + { + "epoch": 8.783916513198282, + "grad_norm": 0.13030263781547546, + "learning_rate": 3.830312898198729e-06, + "loss": 1.6831, + "step": 28618 + }, + { + "epoch": 8.784223449969307, + "grad_norm": 0.14048850536346436, + "learning_rate": 3.82840515638313e-06, + "loss": 1.7419, + "step": 28619 + }, + { + "epoch": 8.784530386740332, + "grad_norm": 0.1761157363653183, + "learning_rate": 3.826497870862284e-06, + "loss": 1.7285, + "step": 28620 + }, + { + "epoch": 8.784837323511356, + "grad_norm": 0.16928929090499878, + "learning_rate": 
3.824591041655051e-06, + "loss": 1.7597, + "step": 28621 + }, + { + "epoch": 8.785144260282381, + "grad_norm": 0.12604424357414246, + "learning_rate": 3.822684668780275e-06, + "loss": 1.6895, + "step": 28622 + }, + { + "epoch": 8.785451197053407, + "grad_norm": 0.1835777759552002, + "learning_rate": 3.820778752256793e-06, + "loss": 1.7131, + "step": 28623 + }, + { + "epoch": 8.785758133824432, + "grad_norm": 0.1577402502298355, + "learning_rate": 3.818873292103447e-06, + "loss": 1.7159, + "step": 28624 + }, + { + "epoch": 8.786065070595457, + "grad_norm": 0.14781227707862854, + "learning_rate": 3.8169682883390565e-06, + "loss": 1.7179, + "step": 28625 + }, + { + "epoch": 8.786372007366483, + "grad_norm": 0.19881610572338104, + "learning_rate": 3.815063740982461e-06, + "loss": 1.7586, + "step": 28626 + }, + { + "epoch": 8.786678944137508, + "grad_norm": 0.16822806000709534, + "learning_rate": 3.813159650052467e-06, + "loss": 1.7628, + "step": 28627 + }, + { + "epoch": 8.786985880908533, + "grad_norm": 0.14510734379291534, + "learning_rate": 3.811256015567899e-06, + "loss": 1.654, + "step": 28628 + }, + { + "epoch": 8.787292817679559, + "grad_norm": 0.1547134667634964, + "learning_rate": 3.8093528375475863e-06, + "loss": 1.7204, + "step": 28629 + }, + { + "epoch": 8.787599754450584, + "grad_norm": 0.19592107832431793, + "learning_rate": 3.8074501160103027e-06, + "loss": 1.7084, + "step": 28630 + }, + { + "epoch": 8.787906691221608, + "grad_norm": 0.1543792486190796, + "learning_rate": 3.8055478509748887e-06, + "loss": 1.7322, + "step": 28631 + }, + { + "epoch": 8.788213627992633, + "grad_norm": 0.17076534032821655, + "learning_rate": 3.8036460424601128e-06, + "loss": 1.7004, + "step": 28632 + }, + { + "epoch": 8.788520564763658, + "grad_norm": 0.13622300326824188, + "learning_rate": 3.8017446904847875e-06, + "loss": 1.6867, + "step": 28633 + }, + { + "epoch": 8.788827501534684, + "grad_norm": 0.3221909999847412, + "learning_rate": 3.7998437950677035e-06, + "loss": 
1.7559, + "step": 28634 + }, + { + "epoch": 8.789134438305709, + "grad_norm": 0.1811852902173996, + "learning_rate": 3.79794335622764e-06, + "loss": 1.7439, + "step": 28635 + }, + { + "epoch": 8.789441375076734, + "grad_norm": 0.1573752760887146, + "learning_rate": 3.7960433739833877e-06, + "loss": 1.7129, + "step": 28636 + }, + { + "epoch": 8.78974831184776, + "grad_norm": 0.13165032863616943, + "learning_rate": 3.7941438483536986e-06, + "loss": 1.6926, + "step": 28637 + }, + { + "epoch": 8.790055248618785, + "grad_norm": 0.14245405793190002, + "learning_rate": 3.792244779357368e-06, + "loss": 1.7072, + "step": 28638 + }, + { + "epoch": 8.79036218538981, + "grad_norm": 0.16790303587913513, + "learning_rate": 3.790346167013159e-06, + "loss": 1.6979, + "step": 28639 + }, + { + "epoch": 8.790669122160836, + "grad_norm": 0.15134595334529877, + "learning_rate": 3.7884480113398345e-06, + "loss": 1.7035, + "step": 28640 + }, + { + "epoch": 8.79097605893186, + "grad_norm": 0.1418851763010025, + "learning_rate": 3.7865503123561575e-06, + "loss": 1.6462, + "step": 28641 + }, + { + "epoch": 8.791282995702884, + "grad_norm": 0.13052044808864594, + "learning_rate": 3.784653070080868e-06, + "loss": 1.6559, + "step": 28642 + }, + { + "epoch": 8.79158993247391, + "grad_norm": 0.14758886396884918, + "learning_rate": 3.782756284532729e-06, + "loss": 1.6948, + "step": 28643 + }, + { + "epoch": 8.791896869244935, + "grad_norm": 0.1561112254858017, + "learning_rate": 3.7808599557304814e-06, + "loss": 1.6465, + "step": 28644 + }, + { + "epoch": 8.79220380601596, + "grad_norm": 0.17403864860534668, + "learning_rate": 3.77896408369286e-06, + "loss": 1.7397, + "step": 28645 + }, + { + "epoch": 8.792510742786986, + "grad_norm": 0.147226944565773, + "learning_rate": 3.7770686684386158e-06, + "loss": 1.6707, + "step": 28646 + }, + { + "epoch": 8.792817679558011, + "grad_norm": 0.1681959182024002, + "learning_rate": 3.7751737099864627e-06, + "loss": 1.6786, + "step": 28647 + }, + { + "epoch": 
8.793124616329036, + "grad_norm": 0.15970535576343536, + "learning_rate": 3.773279208355146e-06, + "loss": 1.6652, + "step": 28648 + }, + { + "epoch": 8.793431553100062, + "grad_norm": 0.18252034485340118, + "learning_rate": 3.771385163563368e-06, + "loss": 1.7478, + "step": 28649 + }, + { + "epoch": 8.793738489871087, + "grad_norm": 0.22270283102989197, + "learning_rate": 3.7694915756298576e-06, + "loss": 1.7683, + "step": 28650 + }, + { + "epoch": 8.794045426642112, + "grad_norm": 0.13913489878177643, + "learning_rate": 3.7675984445733337e-06, + "loss": 1.7275, + "step": 28651 + }, + { + "epoch": 8.794352363413136, + "grad_norm": 0.16266898810863495, + "learning_rate": 3.7657057704124976e-06, + "loss": 1.7145, + "step": 28652 + }, + { + "epoch": 8.794659300184161, + "grad_norm": 0.18106494843959808, + "learning_rate": 3.763813553166068e-06, + "loss": 1.6936, + "step": 28653 + }, + { + "epoch": 8.794966236955187, + "grad_norm": 0.17213653028011322, + "learning_rate": 3.761921792852713e-06, + "loss": 1.7223, + "step": 28654 + }, + { + "epoch": 8.795273173726212, + "grad_norm": 0.14013275504112244, + "learning_rate": 3.7600304894911562e-06, + "loss": 1.7082, + "step": 28655 + }, + { + "epoch": 8.795580110497237, + "grad_norm": 0.1625421643257141, + "learning_rate": 3.758139643100078e-06, + "loss": 1.719, + "step": 28656 + }, + { + "epoch": 8.795887047268263, + "grad_norm": 0.15947094559669495, + "learning_rate": 3.756249253698174e-06, + "loss": 1.7448, + "step": 28657 + }, + { + "epoch": 8.796193984039288, + "grad_norm": 0.16739755868911743, + "learning_rate": 3.754359321304113e-06, + "loss": 1.7048, + "step": 28658 + }, + { + "epoch": 8.796500920810313, + "grad_norm": 0.17619092762470245, + "learning_rate": 3.7524698459365794e-06, + "loss": 1.7247, + "step": 28659 + }, + { + "epoch": 8.796807857581339, + "grad_norm": 0.19410766661167145, + "learning_rate": 3.7505808276142473e-06, + "loss": 1.6918, + "step": 28660 + }, + { + "epoch": 8.797114794352364, + 
"grad_norm": 0.13881324231624603, + "learning_rate": 3.74869226635578e-06, + "loss": 1.6997, + "step": 28661 + }, + { + "epoch": 8.79742173112339, + "grad_norm": 0.16185659170150757, + "learning_rate": 3.74680416217984e-06, + "loss": 1.6951, + "step": 28662 + }, + { + "epoch": 8.797728667894415, + "grad_norm": 0.4652320444583893, + "learning_rate": 3.744916515105107e-06, + "loss": 1.7521, + "step": 28663 + }, + { + "epoch": 8.798035604665438, + "grad_norm": 0.1286199539899826, + "learning_rate": 3.7430293251501992e-06, + "loss": 1.7106, + "step": 28664 + }, + { + "epoch": 8.798342541436464, + "grad_norm": 0.18184927105903625, + "learning_rate": 3.741142592333807e-06, + "loss": 1.7297, + "step": 28665 + }, + { + "epoch": 8.798649478207489, + "grad_norm": 0.1292438805103302, + "learning_rate": 3.7392563166745443e-06, + "loss": 1.6701, + "step": 28666 + }, + { + "epoch": 8.798956414978514, + "grad_norm": 0.16631865501403809, + "learning_rate": 3.7373704981910673e-06, + "loss": 1.7572, + "step": 28667 + }, + { + "epoch": 8.79926335174954, + "grad_norm": 0.13093185424804688, + "learning_rate": 3.7354851369020117e-06, + "loss": 1.6912, + "step": 28668 + }, + { + "epoch": 8.799570288520565, + "grad_norm": 0.16165922582149506, + "learning_rate": 3.7336002328260123e-06, + "loss": 1.668, + "step": 28669 + }, + { + "epoch": 8.79987722529159, + "grad_norm": 0.1431419402360916, + "learning_rate": 3.7317157859816987e-06, + "loss": 1.6499, + "step": 28670 + }, + { + "epoch": 8.800184162062616, + "grad_norm": 0.16933713853359222, + "learning_rate": 3.729831796387667e-06, + "loss": 1.7081, + "step": 28671 + }, + { + "epoch": 8.800491098833641, + "grad_norm": 0.15956951677799225, + "learning_rate": 3.727948264062575e-06, + "loss": 1.6981, + "step": 28672 + }, + { + "epoch": 8.800798035604666, + "grad_norm": 0.17684711515903473, + "learning_rate": 3.726065189025013e-06, + "loss": 1.7254, + "step": 28673 + }, + { + "epoch": 8.80110497237569, + "grad_norm": 0.20180673897266388, + 
"learning_rate": 3.7241825712935997e-06, + "loss": 1.764, + "step": 28674 + }, + { + "epoch": 8.801411909146715, + "grad_norm": 0.165853351354599, + "learning_rate": 3.7223004108869307e-06, + "loss": 1.7275, + "step": 28675 + }, + { + "epoch": 8.80171884591774, + "grad_norm": 0.25295981764793396, + "learning_rate": 3.72041870782362e-06, + "loss": 1.8427, + "step": 28676 + }, + { + "epoch": 8.802025782688766, + "grad_norm": 0.14879196882247925, + "learning_rate": 3.7185374621222567e-06, + "loss": 1.6921, + "step": 28677 + }, + { + "epoch": 8.802332719459791, + "grad_norm": 0.159479022026062, + "learning_rate": 3.716656673801433e-06, + "loss": 1.699, + "step": 28678 + }, + { + "epoch": 8.802639656230816, + "grad_norm": 0.1288701742887497, + "learning_rate": 3.714776342879722e-06, + "loss": 1.6872, + "step": 28679 + }, + { + "epoch": 8.802946593001842, + "grad_norm": 0.15079650282859802, + "learning_rate": 3.712896469375743e-06, + "loss": 1.6873, + "step": 28680 + }, + { + "epoch": 8.803253529772867, + "grad_norm": 0.1662154346704483, + "learning_rate": 3.7110170533080304e-06, + "loss": 1.7451, + "step": 28681 + }, + { + "epoch": 8.803560466543892, + "grad_norm": 0.1374291628599167, + "learning_rate": 3.709138094695197e-06, + "loss": 1.6698, + "step": 28682 + }, + { + "epoch": 8.803867403314918, + "grad_norm": 0.13723774254322052, + "learning_rate": 3.707259593555773e-06, + "loss": 1.734, + "step": 28683 + }, + { + "epoch": 8.804174340085943, + "grad_norm": 0.15156403183937073, + "learning_rate": 3.7053815499083543e-06, + "loss": 1.7228, + "step": 28684 + }, + { + "epoch": 8.804481276856967, + "grad_norm": 0.15390744805335999, + "learning_rate": 3.7035039637714876e-06, + "loss": 1.7659, + "step": 28685 + }, + { + "epoch": 8.804788213627992, + "grad_norm": 0.13234136998653412, + "learning_rate": 3.7016268351637297e-06, + "loss": 1.684, + "step": 28686 + }, + { + "epoch": 8.805095150399017, + "grad_norm": 0.20412379503250122, + "learning_rate": 3.699750164103638e-06, + 
"loss": 1.7228, + "step": 28687 + }, + { + "epoch": 8.805402087170043, + "grad_norm": 0.15076974034309387, + "learning_rate": 3.697873950609737e-06, + "loss": 1.7029, + "step": 28688 + }, + { + "epoch": 8.805709023941068, + "grad_norm": 0.13920028507709503, + "learning_rate": 3.6959981947005952e-06, + "loss": 1.6905, + "step": 28689 + }, + { + "epoch": 8.806015960712093, + "grad_norm": 0.13444112241268158, + "learning_rate": 3.694122896394736e-06, + "loss": 1.6483, + "step": 28690 + }, + { + "epoch": 8.806322897483119, + "grad_norm": 0.18719401955604553, + "learning_rate": 3.692248055710701e-06, + "loss": 1.7326, + "step": 28691 + }, + { + "epoch": 8.806629834254144, + "grad_norm": 0.2103775292634964, + "learning_rate": 3.690373672667008e-06, + "loss": 1.8134, + "step": 28692 + }, + { + "epoch": 8.80693677102517, + "grad_norm": 0.14053337275981903, + "learning_rate": 3.6884997472821814e-06, + "loss": 1.713, + "step": 28693 + }, + { + "epoch": 8.807243707796195, + "grad_norm": 0.21146062016487122, + "learning_rate": 3.686626279574751e-06, + "loss": 1.767, + "step": 28694 + }, + { + "epoch": 8.807550644567218, + "grad_norm": 0.1462959349155426, + "learning_rate": 3.6847532695632236e-06, + "loss": 1.7002, + "step": 28695 + }, + { + "epoch": 8.807857581338244, + "grad_norm": 0.13064992427825928, + "learning_rate": 3.682880717266102e-06, + "loss": 1.6927, + "step": 28696 + }, + { + "epoch": 8.808164518109269, + "grad_norm": 0.11652515083551407, + "learning_rate": 3.6810086227019147e-06, + "loss": 1.6717, + "step": 28697 + }, + { + "epoch": 8.808471454880294, + "grad_norm": 0.14266341924667358, + "learning_rate": 3.679136985889131e-06, + "loss": 1.6843, + "step": 28698 + }, + { + "epoch": 8.80877839165132, + "grad_norm": 0.15322953462600708, + "learning_rate": 3.677265806846286e-06, + "loss": 1.6947, + "step": 28699 + }, + { + "epoch": 8.809085328422345, + "grad_norm": 0.1330055147409439, + "learning_rate": 3.675395085591832e-06, + "loss": 1.7386, + "step": 28700 + }, + 
{ + "epoch": 8.80939226519337, + "grad_norm": 0.14793124794960022, + "learning_rate": 3.6735248221442807e-06, + "loss": 1.6841, + "step": 28701 + }, + { + "epoch": 8.809699201964396, + "grad_norm": 0.13912439346313477, + "learning_rate": 3.6716550165221185e-06, + "loss": 1.697, + "step": 28702 + }, + { + "epoch": 8.810006138735421, + "grad_norm": 0.17170770466327667, + "learning_rate": 3.669785668743808e-06, + "loss": 1.7158, + "step": 28703 + }, + { + "epoch": 8.810313075506446, + "grad_norm": 0.14432193338871002, + "learning_rate": 3.66791677882784e-06, + "loss": 1.6617, + "step": 28704 + }, + { + "epoch": 8.810620012277472, + "grad_norm": 0.14610548317432404, + "learning_rate": 3.666048346792661e-06, + "loss": 1.6677, + "step": 28705 + }, + { + "epoch": 8.810926949048497, + "grad_norm": 0.15598154067993164, + "learning_rate": 3.664180372656756e-06, + "loss": 1.6847, + "step": 28706 + }, + { + "epoch": 8.81123388581952, + "grad_norm": 0.11805412918329239, + "learning_rate": 3.662312856438577e-06, + "loss": 1.668, + "step": 28707 + }, + { + "epoch": 8.811540822590546, + "grad_norm": 0.16846078634262085, + "learning_rate": 3.660445798156581e-06, + "loss": 1.7295, + "step": 28708 + }, + { + "epoch": 8.811847759361571, + "grad_norm": 0.11984262615442276, + "learning_rate": 3.658579197829226e-06, + "loss": 1.6711, + "step": 28709 + }, + { + "epoch": 8.812154696132596, + "grad_norm": 0.13624878227710724, + "learning_rate": 3.6567130554749476e-06, + "loss": 1.665, + "step": 28710 + }, + { + "epoch": 8.812461632903622, + "grad_norm": 0.19053621590137482, + "learning_rate": 3.654847371112197e-06, + "loss": 1.7301, + "step": 28711 + }, + { + "epoch": 8.812768569674647, + "grad_norm": 0.12689290940761566, + "learning_rate": 3.6529821447594036e-06, + "loss": 1.6683, + "step": 28712 + }, + { + "epoch": 8.813075506445673, + "grad_norm": 0.20414969325065613, + "learning_rate": 3.6511173764350094e-06, + "loss": 1.7787, + "step": 28713 + }, + { + "epoch": 8.813382443216698, + 
"grad_norm": 0.1935388743877411, + "learning_rate": 3.6492530661574377e-06, + "loss": 1.7021, + "step": 28714 + }, + { + "epoch": 8.813689379987723, + "grad_norm": 0.15490898489952087, + "learning_rate": 3.6473892139451072e-06, + "loss": 1.7155, + "step": 28715 + }, + { + "epoch": 8.813996316758749, + "grad_norm": 0.2282942682504654, + "learning_rate": 3.6455258198164587e-06, + "loss": 1.6895, + "step": 28716 + }, + { + "epoch": 8.814303253529772, + "grad_norm": 0.12892891466617584, + "learning_rate": 3.643662883789878e-06, + "loss": 1.6478, + "step": 28717 + }, + { + "epoch": 8.814610190300797, + "grad_norm": 0.12005404382944107, + "learning_rate": 3.641800405883811e-06, + "loss": 1.6955, + "step": 28718 + }, + { + "epoch": 8.814917127071823, + "grad_norm": 0.15036113560199738, + "learning_rate": 3.639938386116626e-06, + "loss": 1.7104, + "step": 28719 + }, + { + "epoch": 8.815224063842848, + "grad_norm": 0.13082142174243927, + "learning_rate": 3.6380768245067478e-06, + "loss": 1.6797, + "step": 28720 + }, + { + "epoch": 8.815531000613873, + "grad_norm": 0.12086073309183121, + "learning_rate": 3.6362157210725778e-06, + "loss": 1.6478, + "step": 28721 + }, + { + "epoch": 8.815837937384899, + "grad_norm": 0.15807145833969116, + "learning_rate": 3.6343550758324797e-06, + "loss": 1.6987, + "step": 28722 + }, + { + "epoch": 8.816144874155924, + "grad_norm": 0.1517954170703888, + "learning_rate": 3.6324948888048715e-06, + "loss": 1.7048, + "step": 28723 + }, + { + "epoch": 8.81645181092695, + "grad_norm": 0.12381365150213242, + "learning_rate": 3.6306351600081223e-06, + "loss": 1.6788, + "step": 28724 + }, + { + "epoch": 8.816758747697975, + "grad_norm": 0.14769119024276733, + "learning_rate": 3.6287758894606173e-06, + "loss": 1.6961, + "step": 28725 + }, + { + "epoch": 8.817065684469, + "grad_norm": 0.13606438040733337, + "learning_rate": 3.6269170771807305e-06, + "loss": 1.6603, + "step": 28726 + }, + { + "epoch": 8.817372621240025, + "grad_norm": 0.1724759191274643, 
+ "learning_rate": 3.625058723186825e-06, + "loss": 1.7054, + "step": 28727 + }, + { + "epoch": 8.817679558011049, + "grad_norm": 0.1703757792711258, + "learning_rate": 3.6232008274972753e-06, + "loss": 1.7539, + "step": 28728 + }, + { + "epoch": 8.817986494782074, + "grad_norm": 0.17725473642349243, + "learning_rate": 3.621343390130433e-06, + "loss": 1.7774, + "step": 28729 + }, + { + "epoch": 8.8182934315531, + "grad_norm": 0.12104978412389755, + "learning_rate": 3.6194864111046558e-06, + "loss": 1.6966, + "step": 28730 + }, + { + "epoch": 8.818600368324125, + "grad_norm": 0.15737809240818024, + "learning_rate": 3.6176298904383066e-06, + "loss": 1.7527, + "step": 28731 + }, + { + "epoch": 8.81890730509515, + "grad_norm": 0.2053712159395218, + "learning_rate": 3.61577382814971e-06, + "loss": 1.695, + "step": 28732 + }, + { + "epoch": 8.819214241866176, + "grad_norm": 0.17244333028793335, + "learning_rate": 3.61391822425724e-06, + "loss": 1.7748, + "step": 28733 + }, + { + "epoch": 8.819521178637201, + "grad_norm": 0.10550814867019653, + "learning_rate": 3.612063078779204e-06, + "loss": 1.6216, + "step": 28734 + }, + { + "epoch": 8.819828115408226, + "grad_norm": 0.12428541481494904, + "learning_rate": 3.6102083917339657e-06, + "loss": 1.6863, + "step": 28735 + }, + { + "epoch": 8.820135052179252, + "grad_norm": 0.1403985470533371, + "learning_rate": 3.608354163139821e-06, + "loss": 1.7582, + "step": 28736 + }, + { + "epoch": 8.820441988950277, + "grad_norm": 0.14146897196769714, + "learning_rate": 3.6065003930151163e-06, + "loss": 1.6711, + "step": 28737 + }, + { + "epoch": 8.8207489257213, + "grad_norm": 0.1309487670660019, + "learning_rate": 3.6046470813781763e-06, + "loss": 1.6553, + "step": 28738 + }, + { + "epoch": 8.821055862492326, + "grad_norm": 0.16398943960666656, + "learning_rate": 3.602794228247297e-06, + "loss": 1.7097, + "step": 28739 + }, + { + "epoch": 8.821362799263351, + "grad_norm": 0.13138768076896667, + "learning_rate": 3.6009418336408085e-06, 
+ "loss": 1.6641, + "step": 28740 + }, + { + "epoch": 8.821669736034377, + "grad_norm": 0.14470353722572327, + "learning_rate": 3.599089897576996e-06, + "loss": 1.6626, + "step": 28741 + }, + { + "epoch": 8.821976672805402, + "grad_norm": 0.17124676704406738, + "learning_rate": 3.597238420074178e-06, + "loss": 1.7347, + "step": 28742 + }, + { + "epoch": 8.822283609576427, + "grad_norm": 0.19663479924201965, + "learning_rate": 3.595387401150652e-06, + "loss": 1.7267, + "step": 28743 + }, + { + "epoch": 8.822590546347453, + "grad_norm": 0.14935022592544556, + "learning_rate": 3.5935368408247016e-06, + "loss": 1.7001, + "step": 28744 + }, + { + "epoch": 8.822897483118478, + "grad_norm": 0.13796019554138184, + "learning_rate": 3.591686739114625e-06, + "loss": 1.6774, + "step": 28745 + }, + { + "epoch": 8.823204419889503, + "grad_norm": 0.19741731882095337, + "learning_rate": 3.5898370960386952e-06, + "loss": 1.6887, + "step": 28746 + }, + { + "epoch": 8.823511356660529, + "grad_norm": 0.17089900374412537, + "learning_rate": 3.5879879116151984e-06, + "loss": 1.6869, + "step": 28747 + }, + { + "epoch": 8.823818293431554, + "grad_norm": 0.13532526791095734, + "learning_rate": 3.5861391858624083e-06, + "loss": 1.6525, + "step": 28748 + }, + { + "epoch": 8.824125230202577, + "grad_norm": 0.15727277100086212, + "learning_rate": 3.5842909187985886e-06, + "loss": 1.725, + "step": 28749 + }, + { + "epoch": 8.824432166973603, + "grad_norm": 0.14250576496124268, + "learning_rate": 3.5824431104420298e-06, + "loss": 1.6728, + "step": 28750 + }, + { + "epoch": 8.824739103744628, + "grad_norm": 0.1596658080816269, + "learning_rate": 3.580595760810951e-06, + "loss": 1.6933, + "step": 28751 + }, + { + "epoch": 8.825046040515653, + "grad_norm": 0.2319880872964859, + "learning_rate": 3.5787488699236537e-06, + "loss": 1.744, + "step": 28752 + }, + { + "epoch": 8.825352977286679, + "grad_norm": 0.12813101708889008, + "learning_rate": 3.5769024377983517e-06, + "loss": 1.7022, + "step": 
28753 + }, + { + "epoch": 8.825659914057704, + "grad_norm": 0.1346128284931183, + "learning_rate": 3.5750564644533137e-06, + "loss": 1.6755, + "step": 28754 + }, + { + "epoch": 8.82596685082873, + "grad_norm": 0.1405024230480194, + "learning_rate": 3.5732109499067913e-06, + "loss": 1.6662, + "step": 28755 + }, + { + "epoch": 8.826273787599755, + "grad_norm": 0.16663044691085815, + "learning_rate": 3.571365894176992e-06, + "loss": 1.7237, + "step": 28756 + }, + { + "epoch": 8.82658072437078, + "grad_norm": 0.19339314103126526, + "learning_rate": 3.56952129728218e-06, + "loss": 1.729, + "step": 28757 + }, + { + "epoch": 8.826887661141805, + "grad_norm": 0.18851202726364136, + "learning_rate": 3.5676771592405624e-06, + "loss": 1.6923, + "step": 28758 + }, + { + "epoch": 8.82719459791283, + "grad_norm": 0.15386530756950378, + "learning_rate": 3.5658334800703797e-06, + "loss": 1.695, + "step": 28759 + }, + { + "epoch": 8.827501534683854, + "grad_norm": 0.17883063852787018, + "learning_rate": 3.5639902597898455e-06, + "loss": 1.746, + "step": 28760 + }, + { + "epoch": 8.82780847145488, + "grad_norm": 0.15690109133720398, + "learning_rate": 3.5621474984171733e-06, + "loss": 1.6937, + "step": 28761 + }, + { + "epoch": 8.828115408225905, + "grad_norm": 0.19555453956127167, + "learning_rate": 3.5603051959705815e-06, + "loss": 1.7524, + "step": 28762 + }, + { + "epoch": 8.82842234499693, + "grad_norm": 0.13835586607456207, + "learning_rate": 3.558463352468272e-06, + "loss": 1.6975, + "step": 28763 + }, + { + "epoch": 8.828729281767956, + "grad_norm": 0.13608703017234802, + "learning_rate": 3.556621967928453e-06, + "loss": 1.6588, + "step": 28764 + }, + { + "epoch": 8.829036218538981, + "grad_norm": 0.1849900633096695, + "learning_rate": 3.5547810423693096e-06, + "loss": 1.7236, + "step": 28765 + }, + { + "epoch": 8.829343155310006, + "grad_norm": 0.13603585958480835, + "learning_rate": 3.5529405758090382e-06, + "loss": 1.69, + "step": 28766 + }, + { + "epoch": 
8.829650092081032, + "grad_norm": 0.12596213817596436, + "learning_rate": 3.5511005682658473e-06, + "loss": 1.7069, + "step": 28767 + }, + { + "epoch": 8.829957028852057, + "grad_norm": 0.17949149012565613, + "learning_rate": 3.549261019757888e-06, + "loss": 1.7836, + "step": 28768 + }, + { + "epoch": 8.830263965623082, + "grad_norm": 0.17237712442874908, + "learning_rate": 3.547421930303374e-06, + "loss": 1.6978, + "step": 28769 + }, + { + "epoch": 8.830570902394108, + "grad_norm": 0.16467876732349396, + "learning_rate": 3.5455832999204517e-06, + "loss": 1.7526, + "step": 28770 + }, + { + "epoch": 8.830877839165131, + "grad_norm": 0.1549120396375656, + "learning_rate": 3.5437451286273014e-06, + "loss": 1.6955, + "step": 28771 + }, + { + "epoch": 8.831184775936157, + "grad_norm": 0.24028703570365906, + "learning_rate": 3.541907416442103e-06, + "loss": 1.7547, + "step": 28772 + }, + { + "epoch": 8.831491712707182, + "grad_norm": 0.17325441539287567, + "learning_rate": 3.5400701633829856e-06, + "loss": 1.7041, + "step": 28773 + }, + { + "epoch": 8.831798649478207, + "grad_norm": 0.15597397089004517, + "learning_rate": 3.5382333694681467e-06, + "loss": 1.6997, + "step": 28774 + }, + { + "epoch": 8.832105586249233, + "grad_norm": 0.14938347041606903, + "learning_rate": 3.5363970347156994e-06, + "loss": 1.7271, + "step": 28775 + }, + { + "epoch": 8.832412523020258, + "grad_norm": 0.17745234072208405, + "learning_rate": 3.534561159143823e-06, + "loss": 1.714, + "step": 28776 + }, + { + "epoch": 8.832719459791283, + "grad_norm": 0.15323567390441895, + "learning_rate": 3.532725742770643e-06, + "loss": 1.7079, + "step": 28777 + }, + { + "epoch": 8.833026396562309, + "grad_norm": 0.15351314842700958, + "learning_rate": 3.5308907856143046e-06, + "loss": 1.733, + "step": 28778 + }, + { + "epoch": 8.833333333333334, + "grad_norm": 0.19209100306034088, + "learning_rate": 3.5290562876929388e-06, + "loss": 1.7362, + "step": 28779 + }, + { + "epoch": 8.83364027010436, + 
"grad_norm": 0.2092818021774292, + "learning_rate": 3.5272222490246753e-06, + "loss": 1.7682, + "step": 28780 + }, + { + "epoch": 8.833947206875383, + "grad_norm": 0.21600767970085144, + "learning_rate": 3.5253886696276383e-06, + "loss": 1.8015, + "step": 28781 + }, + { + "epoch": 8.834254143646408, + "grad_norm": 0.11457479000091553, + "learning_rate": 3.5235555495199525e-06, + "loss": 1.6582, + "step": 28782 + }, + { + "epoch": 8.834561080417433, + "grad_norm": 0.1698341816663742, + "learning_rate": 3.5217228887197253e-06, + "loss": 1.7348, + "step": 28783 + }, + { + "epoch": 8.834868017188459, + "grad_norm": 0.1234394982457161, + "learning_rate": 3.5198906872450866e-06, + "loss": 1.6819, + "step": 28784 + }, + { + "epoch": 8.835174953959484, + "grad_norm": 0.15412946045398712, + "learning_rate": 3.518058945114117e-06, + "loss": 1.6972, + "step": 28785 + }, + { + "epoch": 8.83548189073051, + "grad_norm": 0.16202808916568756, + "learning_rate": 3.516227662344951e-06, + "loss": 1.7439, + "step": 28786 + }, + { + "epoch": 8.835788827501535, + "grad_norm": 0.1599927842617035, + "learning_rate": 3.514396838955658e-06, + "loss": 1.7012, + "step": 28787 + }, + { + "epoch": 8.83609576427256, + "grad_norm": 0.1487586498260498, + "learning_rate": 3.512566474964335e-06, + "loss": 1.6844, + "step": 28788 + }, + { + "epoch": 8.836402701043585, + "grad_norm": 0.18033012747764587, + "learning_rate": 3.5107365703890892e-06, + "loss": 1.7855, + "step": 28789 + }, + { + "epoch": 8.83670963781461, + "grad_norm": 0.18171031773090363, + "learning_rate": 3.508907125247979e-06, + "loss": 1.703, + "step": 28790 + }, + { + "epoch": 8.837016574585636, + "grad_norm": 0.14102062582969666, + "learning_rate": 3.507078139559117e-06, + "loss": 1.6627, + "step": 28791 + }, + { + "epoch": 8.83732351135666, + "grad_norm": 0.16365323960781097, + "learning_rate": 3.505249613340539e-06, + "loss": 1.7317, + "step": 28792 + }, + { + "epoch": 8.837630448127685, + "grad_norm": 0.1492282748222351, + 
"learning_rate": 3.5034215466103417e-06, + "loss": 1.6633, + "step": 28793 + }, + { + "epoch": 8.83793738489871, + "grad_norm": 0.18670693039894104, + "learning_rate": 3.5015939393865937e-06, + "loss": 1.7233, + "step": 28794 + }, + { + "epoch": 8.838244321669736, + "grad_norm": 0.16062071919441223, + "learning_rate": 3.499766791687342e-06, + "loss": 1.7238, + "step": 28795 + }, + { + "epoch": 8.838551258440761, + "grad_norm": 0.158021941781044, + "learning_rate": 3.4979401035306504e-06, + "loss": 1.705, + "step": 28796 + }, + { + "epoch": 8.838858195211786, + "grad_norm": 0.14865651726722717, + "learning_rate": 3.49611387493457e-06, + "loss": 1.6777, + "step": 28797 + }, + { + "epoch": 8.839165131982812, + "grad_norm": 0.12111876904964447, + "learning_rate": 3.4942881059171483e-06, + "loss": 1.6273, + "step": 28798 + }, + { + "epoch": 8.839472068753837, + "grad_norm": 0.12468799948692322, + "learning_rate": 3.4924627964964318e-06, + "loss": 1.6626, + "step": 28799 + }, + { + "epoch": 8.839779005524862, + "grad_norm": 0.12292506545782089, + "learning_rate": 3.490637946690445e-06, + "loss": 1.6448, + "step": 28800 + }, + { + "epoch": 8.840085942295888, + "grad_norm": 0.16731779277324677, + "learning_rate": 3.4888135565172563e-06, + "loss": 1.7541, + "step": 28801 + }, + { + "epoch": 8.840392879066911, + "grad_norm": 0.16351507604122162, + "learning_rate": 3.486989625994852e-06, + "loss": 1.699, + "step": 28802 + }, + { + "epoch": 8.840699815837937, + "grad_norm": 0.12385114282369614, + "learning_rate": 3.485166155141295e-06, + "loss": 1.6852, + "step": 28803 + }, + { + "epoch": 8.841006752608962, + "grad_norm": 0.20780152082443237, + "learning_rate": 3.4833431439745822e-06, + "loss": 1.7179, + "step": 28804 + }, + { + "epoch": 8.841313689379987, + "grad_norm": 0.16182561218738556, + "learning_rate": 3.481520592512727e-06, + "loss": 1.7457, + "step": 28805 + }, + { + "epoch": 8.841620626151013, + "grad_norm": 0.1332414746284485, + "learning_rate": 
3.4796985007737705e-06, + "loss": 1.7272, + "step": 28806 + }, + { + "epoch": 8.841927562922038, + "grad_norm": 0.14266319572925568, + "learning_rate": 3.477876868775681e-06, + "loss": 1.7207, + "step": 28807 + }, + { + "epoch": 8.842234499693063, + "grad_norm": 0.162164106965065, + "learning_rate": 3.4760556965364953e-06, + "loss": 1.6948, + "step": 28808 + }, + { + "epoch": 8.842541436464089, + "grad_norm": 0.14134974777698517, + "learning_rate": 3.474234984074182e-06, + "loss": 1.676, + "step": 28809 + }, + { + "epoch": 8.842848373235114, + "grad_norm": 0.16302376985549927, + "learning_rate": 3.4724147314067534e-06, + "loss": 1.7279, + "step": 28810 + }, + { + "epoch": 8.84315531000614, + "grad_norm": 0.1352432370185852, + "learning_rate": 3.4705949385521964e-06, + "loss": 1.7065, + "step": 28811 + }, + { + "epoch": 8.843462246777165, + "grad_norm": 0.13483819365501404, + "learning_rate": 3.46877560552849e-06, + "loss": 1.7275, + "step": 28812 + }, + { + "epoch": 8.84376918354819, + "grad_norm": 0.12226319313049316, + "learning_rate": 3.4669567323536157e-06, + "loss": 1.6965, + "step": 28813 + }, + { + "epoch": 8.844076120319214, + "grad_norm": 0.1687331646680832, + "learning_rate": 3.465138319045552e-06, + "loss": 1.6949, + "step": 28814 + }, + { + "epoch": 8.844383057090239, + "grad_norm": 0.17721997201442719, + "learning_rate": 3.4633203656222635e-06, + "loss": 1.6981, + "step": 28815 + }, + { + "epoch": 8.844689993861264, + "grad_norm": 0.14818120002746582, + "learning_rate": 3.4615028721017186e-06, + "loss": 1.687, + "step": 28816 + }, + { + "epoch": 8.84499693063229, + "grad_norm": 0.15871183574199677, + "learning_rate": 3.459685838501875e-06, + "loss": 1.7403, + "step": 28817 + }, + { + "epoch": 8.845303867403315, + "grad_norm": 0.16533036530017853, + "learning_rate": 3.4578692648407076e-06, + "loss": 1.7879, + "step": 28818 + }, + { + "epoch": 8.84561080417434, + "grad_norm": 0.18678778409957886, + "learning_rate": 3.456053151136135e-06, + "loss": 
1.7474, + "step": 28819 + }, + { + "epoch": 8.845917740945366, + "grad_norm": 0.12712402641773224, + "learning_rate": 3.4542374974061488e-06, + "loss": 1.6635, + "step": 28820 + }, + { + "epoch": 8.84622467771639, + "grad_norm": 0.15502063930034637, + "learning_rate": 3.4524223036686566e-06, + "loss": 1.7133, + "step": 28821 + }, + { + "epoch": 8.846531614487416, + "grad_norm": 0.17015717923641205, + "learning_rate": 3.4506075699416e-06, + "loss": 1.7514, + "step": 28822 + }, + { + "epoch": 8.846838551258442, + "grad_norm": 0.15805409848690033, + "learning_rate": 3.4487932962429415e-06, + "loss": 1.7253, + "step": 28823 + }, + { + "epoch": 8.847145488029465, + "grad_norm": 0.14090047776699066, + "learning_rate": 3.446979482590579e-06, + "loss": 1.6763, + "step": 28824 + }, + { + "epoch": 8.84745242480049, + "grad_norm": 0.18115323781967163, + "learning_rate": 3.445166129002464e-06, + "loss": 1.7575, + "step": 28825 + }, + { + "epoch": 8.847759361571516, + "grad_norm": 0.18050703406333923, + "learning_rate": 3.443353235496488e-06, + "loss": 1.7688, + "step": 28826 + }, + { + "epoch": 8.848066298342541, + "grad_norm": 0.13750851154327393, + "learning_rate": 3.441540802090587e-06, + "loss": 1.7416, + "step": 28827 + }, + { + "epoch": 8.848373235113566, + "grad_norm": 0.14183515310287476, + "learning_rate": 3.439728828802674e-06, + "loss": 1.6924, + "step": 28828 + }, + { + "epoch": 8.848680171884592, + "grad_norm": 0.16401416063308716, + "learning_rate": 3.4379173156506517e-06, + "loss": 1.7041, + "step": 28829 + }, + { + "epoch": 8.848987108655617, + "grad_norm": 0.1347450613975525, + "learning_rate": 3.4361062626524166e-06, + "loss": 1.7331, + "step": 28830 + }, + { + "epoch": 8.849294045426642, + "grad_norm": 0.16579827666282654, + "learning_rate": 3.4342956698258768e-06, + "loss": 1.7628, + "step": 28831 + }, + { + "epoch": 8.849600982197668, + "grad_norm": 0.18201382458209991, + "learning_rate": 3.4324855371889177e-06, + "loss": 1.7054, + "step": 28832 + }, + { + 
"epoch": 8.849907918968693, + "grad_norm": 0.1637437641620636, + "learning_rate": 3.430675864759425e-06, + "loss": 1.7393, + "step": 28833 + }, + { + "epoch": 8.850214855739718, + "grad_norm": 0.1596134454011917, + "learning_rate": 3.4288666525552848e-06, + "loss": 1.7102, + "step": 28834 + }, + { + "epoch": 8.850521792510742, + "grad_norm": 0.1999501883983612, + "learning_rate": 3.4270579005943994e-06, + "loss": 1.7547, + "step": 28835 + }, + { + "epoch": 8.850828729281767, + "grad_norm": 0.15011270344257355, + "learning_rate": 3.4252496088946097e-06, + "loss": 1.6387, + "step": 28836 + }, + { + "epoch": 8.851135666052793, + "grad_norm": 0.12606796622276306, + "learning_rate": 3.4234417774738124e-06, + "loss": 1.6633, + "step": 28837 + }, + { + "epoch": 8.851442602823818, + "grad_norm": 0.19459915161132812, + "learning_rate": 3.421634406349855e-06, + "loss": 1.7424, + "step": 28838 + }, + { + "epoch": 8.851749539594843, + "grad_norm": 0.1512998342514038, + "learning_rate": 3.4198274955406062e-06, + "loss": 1.7007, + "step": 28839 + }, + { + "epoch": 8.852056476365869, + "grad_norm": 0.19419771432876587, + "learning_rate": 3.4180210450639295e-06, + "loss": 1.7223, + "step": 28840 + }, + { + "epoch": 8.852363413136894, + "grad_norm": 0.17737379670143127, + "learning_rate": 3.41621505493766e-06, + "loss": 1.7309, + "step": 28841 + }, + { + "epoch": 8.85267034990792, + "grad_norm": 0.14393949508666992, + "learning_rate": 3.414409525179674e-06, + "loss": 1.7213, + "step": 28842 + }, + { + "epoch": 8.852977286678945, + "grad_norm": 0.11586382240056992, + "learning_rate": 3.412604455807783e-06, + "loss": 1.6675, + "step": 28843 + }, + { + "epoch": 8.85328422344997, + "grad_norm": 0.18049278855323792, + "learning_rate": 3.410799846839846e-06, + "loss": 1.7558, + "step": 28844 + }, + { + "epoch": 8.853591160220994, + "grad_norm": 0.20962421596050262, + "learning_rate": 3.408995698293693e-06, + "loss": 1.7222, + "step": 28845 + }, + { + "epoch": 8.853898096992019, + 
"grad_norm": 0.12382032722234726, + "learning_rate": 3.4071920101871547e-06, + "loss": 1.7149, + "step": 28846 + }, + { + "epoch": 8.854205033763044, + "grad_norm": 0.15395772457122803, + "learning_rate": 3.405388782538049e-06, + "loss": 1.6986, + "step": 28847 + }, + { + "epoch": 8.85451197053407, + "grad_norm": 0.1579637974500656, + "learning_rate": 3.403586015364202e-06, + "loss": 1.7208, + "step": 28848 + }, + { + "epoch": 8.854818907305095, + "grad_norm": 0.18486931920051575, + "learning_rate": 3.4017837086834315e-06, + "loss": 1.7554, + "step": 28849 + }, + { + "epoch": 8.85512584407612, + "grad_norm": 0.1619080752134323, + "learning_rate": 3.399981862513546e-06, + "loss": 1.7581, + "step": 28850 + }, + { + "epoch": 8.855432780847146, + "grad_norm": 0.14540675282478333, + "learning_rate": 3.3981804768723425e-06, + "loss": 1.7391, + "step": 28851 + }, + { + "epoch": 8.855739717618171, + "grad_norm": 0.17640653252601624, + "learning_rate": 3.396379551777651e-06, + "loss": 1.807, + "step": 28852 + }, + { + "epoch": 8.856046654389196, + "grad_norm": 0.18279080092906952, + "learning_rate": 3.394579087247235e-06, + "loss": 1.7195, + "step": 28853 + }, + { + "epoch": 8.856353591160222, + "grad_norm": 0.17531390488147736, + "learning_rate": 3.3927790832989247e-06, + "loss": 1.7253, + "step": 28854 + }, + { + "epoch": 8.856660527931247, + "grad_norm": 0.14441180229187012, + "learning_rate": 3.3909795399504783e-06, + "loss": 1.7078, + "step": 28855 + }, + { + "epoch": 8.856967464702272, + "grad_norm": 0.16991926729679108, + "learning_rate": 3.3891804572196816e-06, + "loss": 1.6953, + "step": 28856 + }, + { + "epoch": 8.857274401473296, + "grad_norm": 0.17067831754684448, + "learning_rate": 3.3873818351243426e-06, + "loss": 1.7294, + "step": 28857 + }, + { + "epoch": 8.857581338244321, + "grad_norm": 0.14316415786743164, + "learning_rate": 3.3855836736821967e-06, + "loss": 1.7152, + "step": 28858 + }, + { + "epoch": 8.857888275015346, + "grad_norm": 0.13260309398174286, 
+ "learning_rate": 3.383785972911052e-06, + "loss": 1.6761, + "step": 28859 + }, + { + "epoch": 8.858195211786372, + "grad_norm": 0.12228702753782272, + "learning_rate": 3.3819887328286394e-06, + "loss": 1.6802, + "step": 28860 + }, + { + "epoch": 8.858502148557397, + "grad_norm": 0.18033485114574432, + "learning_rate": 3.3801919534527495e-06, + "loss": 1.7828, + "step": 28861 + }, + { + "epoch": 8.858809085328422, + "grad_norm": 0.1613384336233139, + "learning_rate": 3.3783956348011235e-06, + "loss": 1.7068, + "step": 28862 + }, + { + "epoch": 8.859116022099448, + "grad_norm": 0.19849342107772827, + "learning_rate": 3.3765997768915204e-06, + "loss": 1.7139, + "step": 28863 + }, + { + "epoch": 8.859422958870473, + "grad_norm": 0.1470731794834137, + "learning_rate": 3.3748043797416804e-06, + "loss": 1.7104, + "step": 28864 + }, + { + "epoch": 8.859729895641498, + "grad_norm": 0.15868861973285675, + "learning_rate": 3.373009443369357e-06, + "loss": 1.7662, + "step": 28865 + }, + { + "epoch": 8.860036832412524, + "grad_norm": 0.17230434715747833, + "learning_rate": 3.37121496779228e-06, + "loss": 1.6877, + "step": 28866 + }, + { + "epoch": 8.860343769183547, + "grad_norm": 0.1297665536403656, + "learning_rate": 3.3694209530281905e-06, + "loss": 1.6687, + "step": 28867 + }, + { + "epoch": 8.860650705954573, + "grad_norm": 0.13699746131896973, + "learning_rate": 3.3676273990948136e-06, + "loss": 1.6773, + "step": 28868 + }, + { + "epoch": 8.860957642725598, + "grad_norm": 0.12981395423412323, + "learning_rate": 3.3658343060098685e-06, + "loss": 1.6752, + "step": 28869 + }, + { + "epoch": 8.861264579496623, + "grad_norm": 0.15934717655181885, + "learning_rate": 3.3640416737910794e-06, + "loss": 1.7449, + "step": 28870 + }, + { + "epoch": 8.861571516267649, + "grad_norm": 0.13023978471755981, + "learning_rate": 3.3622495024561827e-06, + "loss": 1.698, + "step": 28871 + }, + { + "epoch": 8.861878453038674, + "grad_norm": 0.14700792729854584, + "learning_rate": 
3.3604577920228585e-06, + "loss": 1.732, + "step": 28872 + }, + { + "epoch": 8.8621853898097, + "grad_norm": 0.1421707421541214, + "learning_rate": 3.3586665425088314e-06, + "loss": 1.7032, + "step": 28873 + }, + { + "epoch": 8.862492326580725, + "grad_norm": 0.1941523402929306, + "learning_rate": 3.356875753931793e-06, + "loss": 1.7407, + "step": 28874 + }, + { + "epoch": 8.86279926335175, + "grad_norm": 0.15837855637073517, + "learning_rate": 3.3550854263094454e-06, + "loss": 1.755, + "step": 28875 + }, + { + "epoch": 8.863106200122775, + "grad_norm": 0.1624121218919754, + "learning_rate": 3.3532955596594916e-06, + "loss": 1.738, + "step": 28876 + }, + { + "epoch": 8.8634131368938, + "grad_norm": 0.15944771468639374, + "learning_rate": 3.3515061539996007e-06, + "loss": 1.6955, + "step": 28877 + }, + { + "epoch": 8.863720073664824, + "grad_norm": 0.17303216457366943, + "learning_rate": 3.349717209347475e-06, + "loss": 1.7012, + "step": 28878 + }, + { + "epoch": 8.86402701043585, + "grad_norm": 0.14601273834705353, + "learning_rate": 3.347928725720789e-06, + "loss": 1.696, + "step": 28879 + }, + { + "epoch": 8.864333947206875, + "grad_norm": 0.1746055781841278, + "learning_rate": 3.3461407031372125e-06, + "loss": 1.6991, + "step": 28880 + }, + { + "epoch": 8.8646408839779, + "grad_norm": 0.12818776071071625, + "learning_rate": 3.3443531416144147e-06, + "loss": 1.6828, + "step": 28881 + }, + { + "epoch": 8.864947820748926, + "grad_norm": 0.12297061085700989, + "learning_rate": 3.3425660411700697e-06, + "loss": 1.6483, + "step": 28882 + }, + { + "epoch": 8.865254757519951, + "grad_norm": 0.1359318494796753, + "learning_rate": 3.3407794018218307e-06, + "loss": 1.7182, + "step": 28883 + }, + { + "epoch": 8.865561694290976, + "grad_norm": 0.11981796473264694, + "learning_rate": 3.3389932235873612e-06, + "loss": 1.6935, + "step": 28884 + }, + { + "epoch": 8.865868631062002, + "grad_norm": 0.1271422654390335, + "learning_rate": 3.337207506484308e-06, + "loss": 1.6776, + 
"step": 28885 + }, + { + "epoch": 8.866175567833027, + "grad_norm": 0.1494673788547516, + "learning_rate": 3.335422250530318e-06, + "loss": 1.7041, + "step": 28886 + }, + { + "epoch": 8.866482504604052, + "grad_norm": 0.15046460926532745, + "learning_rate": 3.3336374557430272e-06, + "loss": 1.6714, + "step": 28887 + }, + { + "epoch": 8.866789441375076, + "grad_norm": 0.17862144112586975, + "learning_rate": 3.331853122140105e-06, + "loss": 1.7805, + "step": 28888 + }, + { + "epoch": 8.867096378146101, + "grad_norm": 0.13172993063926697, + "learning_rate": 3.3300692497391483e-06, + "loss": 1.6841, + "step": 28889 + }, + { + "epoch": 8.867403314917127, + "grad_norm": 0.20627157390117645, + "learning_rate": 3.3282858385578098e-06, + "loss": 1.8127, + "step": 28890 + }, + { + "epoch": 8.867710251688152, + "grad_norm": 0.22035779058933258, + "learning_rate": 3.326502888613697e-06, + "loss": 1.7813, + "step": 28891 + }, + { + "epoch": 8.868017188459177, + "grad_norm": 0.15250372886657715, + "learning_rate": 3.3247203999244358e-06, + "loss": 1.7192, + "step": 28892 + }, + { + "epoch": 8.868324125230203, + "grad_norm": 0.1745261251926422, + "learning_rate": 3.3229383725076614e-06, + "loss": 1.72, + "step": 28893 + }, + { + "epoch": 8.868631062001228, + "grad_norm": 0.1768372803926468, + "learning_rate": 3.3211568063809483e-06, + "loss": 1.7582, + "step": 28894 + }, + { + "epoch": 8.868937998772253, + "grad_norm": 0.14829827845096588, + "learning_rate": 3.3193757015619443e-06, + "loss": 1.6749, + "step": 28895 + }, + { + "epoch": 8.869244935543279, + "grad_norm": 0.13321566581726074, + "learning_rate": 3.3175950580682123e-06, + "loss": 1.6854, + "step": 28896 + }, + { + "epoch": 8.869551872314304, + "grad_norm": 0.12003330886363983, + "learning_rate": 3.315814875917372e-06, + "loss": 1.6611, + "step": 28897 + }, + { + "epoch": 8.86985880908533, + "grad_norm": 0.1468251645565033, + "learning_rate": 3.3140351551270157e-06, + "loss": 1.6674, + "step": 28898 + }, + { + "epoch": 
8.870165745856355, + "grad_norm": 0.2222270667552948, + "learning_rate": 3.312255895714722e-06, + "loss": 1.6472, + "step": 28899 + }, + { + "epoch": 8.870472682627378, + "grad_norm": 0.14377200603485107, + "learning_rate": 3.3104770976980836e-06, + "loss": 1.6835, + "step": 28900 + }, + { + "epoch": 8.870779619398403, + "grad_norm": 0.19064709544181824, + "learning_rate": 3.3086987610946807e-06, + "loss": 1.7172, + "step": 28901 + }, + { + "epoch": 8.871086556169429, + "grad_norm": 0.21035094559192657, + "learning_rate": 3.306920885922077e-06, + "loss": 1.7199, + "step": 28902 + }, + { + "epoch": 8.871393492940454, + "grad_norm": 0.1529282182455063, + "learning_rate": 3.3051434721978526e-06, + "loss": 1.672, + "step": 28903 + }, + { + "epoch": 8.87170042971148, + "grad_norm": 0.13990004360675812, + "learning_rate": 3.3033665199395546e-06, + "loss": 1.7204, + "step": 28904 + }, + { + "epoch": 8.872007366482505, + "grad_norm": 0.20450010895729065, + "learning_rate": 3.3015900291647805e-06, + "loss": 1.7619, + "step": 28905 + }, + { + "epoch": 8.87231430325353, + "grad_norm": 0.13215813040733337, + "learning_rate": 3.2998139998910547e-06, + "loss": 1.6999, + "step": 28906 + }, + { + "epoch": 8.872621240024555, + "grad_norm": 0.12693628668785095, + "learning_rate": 3.2980384321359413e-06, + "loss": 1.7075, + "step": 28907 + }, + { + "epoch": 8.87292817679558, + "grad_norm": 0.1447865515947342, + "learning_rate": 3.2962633259169817e-06, + "loss": 1.697, + "step": 28908 + }, + { + "epoch": 8.873235113566606, + "grad_norm": 0.16820397973060608, + "learning_rate": 3.2944886812517173e-06, + "loss": 1.7087, + "step": 28909 + }, + { + "epoch": 8.87354205033763, + "grad_norm": 0.12102416902780533, + "learning_rate": 3.2927144981577007e-06, + "loss": 1.6655, + "step": 28910 + }, + { + "epoch": 8.873848987108655, + "grad_norm": 0.17087550461292267, + "learning_rate": 3.290940776652446e-06, + "loss": 1.7518, + "step": 28911 + }, + { + "epoch": 8.87415592387968, + "grad_norm": 
0.15695004165172577, + "learning_rate": 3.2891675167535054e-06, + "loss": 1.6848, + "step": 28912 + }, + { + "epoch": 8.874462860650706, + "grad_norm": 0.16303250193595886, + "learning_rate": 3.2873947184783705e-06, + "loss": 1.7705, + "step": 28913 + }, + { + "epoch": 8.874769797421731, + "grad_norm": 0.1679360568523407, + "learning_rate": 3.2856223818445885e-06, + "loss": 1.6923, + "step": 28914 + }, + { + "epoch": 8.875076734192756, + "grad_norm": 0.1721598356962204, + "learning_rate": 3.283850506869668e-06, + "loss": 1.7164, + "step": 28915 + }, + { + "epoch": 8.875383670963782, + "grad_norm": 0.14126230776309967, + "learning_rate": 3.2820790935711223e-06, + "loss": 1.6794, + "step": 28916 + }, + { + "epoch": 8.875690607734807, + "grad_norm": 0.14232057332992554, + "learning_rate": 3.2803081419664484e-06, + "loss": 1.6844, + "step": 28917 + }, + { + "epoch": 8.875997544505832, + "grad_norm": 0.15812624990940094, + "learning_rate": 3.278537652073149e-06, + "loss": 1.6951, + "step": 28918 + }, + { + "epoch": 8.876304481276858, + "grad_norm": 0.15904119610786438, + "learning_rate": 3.276767623908733e-06, + "loss": 1.6761, + "step": 28919 + }, + { + "epoch": 8.876611418047883, + "grad_norm": 0.18227824568748474, + "learning_rate": 3.2749980574906803e-06, + "loss": 1.7714, + "step": 28920 + }, + { + "epoch": 8.876918354818907, + "grad_norm": 0.1715840995311737, + "learning_rate": 3.2732289528364766e-06, + "loss": 1.7491, + "step": 28921 + }, + { + "epoch": 8.877225291589932, + "grad_norm": 0.15899239480495453, + "learning_rate": 3.2714603099636256e-06, + "loss": 1.7188, + "step": 28922 + }, + { + "epoch": 8.877532228360957, + "grad_norm": 0.14183032512664795, + "learning_rate": 3.269692128889584e-06, + "loss": 1.71, + "step": 28923 + }, + { + "epoch": 8.877839165131983, + "grad_norm": 0.145817831158638, + "learning_rate": 3.2679244096318396e-06, + "loss": 1.7475, + "step": 28924 + }, + { + "epoch": 8.878146101903008, + "grad_norm": 0.20818611979484558, + 
"learning_rate": 3.2661571522078493e-06, + "loss": 1.7292, + "step": 28925 + }, + { + "epoch": 8.878453038674033, + "grad_norm": 0.18658684194087982, + "learning_rate": 3.264390356635083e-06, + "loss": 1.7588, + "step": 28926 + }, + { + "epoch": 8.878759975445059, + "grad_norm": 0.14851678907871246, + "learning_rate": 3.2626240229310214e-06, + "loss": 1.7177, + "step": 28927 + }, + { + "epoch": 8.879066912216084, + "grad_norm": 0.14433394372463226, + "learning_rate": 3.260858151113083e-06, + "loss": 1.7033, + "step": 28928 + }, + { + "epoch": 8.87937384898711, + "grad_norm": 0.18791940808296204, + "learning_rate": 3.2590927411987547e-06, + "loss": 1.7142, + "step": 28929 + }, + { + "epoch": 8.879680785758135, + "grad_norm": 0.15765266120433807, + "learning_rate": 3.2573277932054504e-06, + "loss": 1.7294, + "step": 28930 + }, + { + "epoch": 8.879987722529158, + "grad_norm": 0.17016790807247162, + "learning_rate": 3.255563307150644e-06, + "loss": 1.7263, + "step": 28931 + }, + { + "epoch": 8.880294659300183, + "grad_norm": 0.18677684664726257, + "learning_rate": 3.2537992830517505e-06, + "loss": 1.708, + "step": 28932 + }, + { + "epoch": 8.880601596071209, + "grad_norm": 0.13736851513385773, + "learning_rate": 3.2520357209262165e-06, + "loss": 1.6971, + "step": 28933 + }, + { + "epoch": 8.880908532842234, + "grad_norm": 0.15366335213184357, + "learning_rate": 3.250272620791467e-06, + "loss": 1.7093, + "step": 28934 + }, + { + "epoch": 8.88121546961326, + "grad_norm": 0.15538384020328522, + "learning_rate": 3.248509982664921e-06, + "loss": 1.7036, + "step": 28935 + }, + { + "epoch": 8.881522406384285, + "grad_norm": 0.137898787856102, + "learning_rate": 3.2467478065639988e-06, + "loss": 1.6654, + "step": 28936 + }, + { + "epoch": 8.88182934315531, + "grad_norm": 0.15095695853233337, + "learning_rate": 3.244986092506125e-06, + "loss": 1.736, + "step": 28937 + }, + { + "epoch": 8.882136279926335, + "grad_norm": 0.15554696321487427, + "learning_rate": 
3.2432248405086908e-06, + "loss": 1.7172, + "step": 28938 + }, + { + "epoch": 8.88244321669736, + "grad_norm": 0.18302778899669647, + "learning_rate": 3.241464050589127e-06, + "loss": 1.7441, + "step": 28939 + }, + { + "epoch": 8.882750153468386, + "grad_norm": 0.18259480595588684, + "learning_rate": 3.2397037227648142e-06, + "loss": 1.6983, + "step": 28940 + }, + { + "epoch": 8.883057090239411, + "grad_norm": 0.14723163843154907, + "learning_rate": 3.2379438570531608e-06, + "loss": 1.7007, + "step": 28941 + }, + { + "epoch": 8.883364027010435, + "grad_norm": 0.1403069794178009, + "learning_rate": 3.2361844534715524e-06, + "loss": 1.6545, + "step": 28942 + }, + { + "epoch": 8.88367096378146, + "grad_norm": 0.1433728039264679, + "learning_rate": 3.2344255120373644e-06, + "loss": 1.6977, + "step": 28943 + }, + { + "epoch": 8.883977900552486, + "grad_norm": 0.18680740892887115, + "learning_rate": 3.2326670327680165e-06, + "loss": 1.756, + "step": 28944 + }, + { + "epoch": 8.884284837323511, + "grad_norm": 0.13080160319805145, + "learning_rate": 3.2309090156808498e-06, + "loss": 1.703, + "step": 28945 + }, + { + "epoch": 8.884591774094536, + "grad_norm": 0.126779243350029, + "learning_rate": 3.2291514607932616e-06, + "loss": 1.6717, + "step": 28946 + }, + { + "epoch": 8.884898710865562, + "grad_norm": 0.15787595510482788, + "learning_rate": 3.2273943681225992e-06, + "loss": 1.7005, + "step": 28947 + }, + { + "epoch": 8.885205647636587, + "grad_norm": 0.13189679384231567, + "learning_rate": 3.225637737686249e-06, + "loss": 1.6599, + "step": 28948 + }, + { + "epoch": 8.885512584407612, + "grad_norm": 0.13954944908618927, + "learning_rate": 3.2238815695015635e-06, + "loss": 1.7261, + "step": 28949 + }, + { + "epoch": 8.885819521178638, + "grad_norm": 0.2115267813205719, + "learning_rate": 3.2221258635858897e-06, + "loss": 1.7459, + "step": 28950 + }, + { + "epoch": 8.886126457949663, + "grad_norm": 0.15017318725585938, + "learning_rate": 3.220370619956592e-06, + "loss": 
1.6929, + "step": 28951 + }, + { + "epoch": 8.886433394720687, + "grad_norm": 0.16980741918087006, + "learning_rate": 3.218615838631006e-06, + "loss": 1.802, + "step": 28952 + }, + { + "epoch": 8.886740331491712, + "grad_norm": 0.1366024613380432, + "learning_rate": 3.216861519626485e-06, + "loss": 1.6886, + "step": 28953 + }, + { + "epoch": 8.887047268262737, + "grad_norm": 0.16248583793640137, + "learning_rate": 3.2151076629603537e-06, + "loss": 1.6992, + "step": 28954 + }, + { + "epoch": 8.887354205033763, + "grad_norm": 0.1727447360754013, + "learning_rate": 3.213354268649943e-06, + "loss": 1.7412, + "step": 28955 + }, + { + "epoch": 8.887661141804788, + "grad_norm": 0.12872622907161713, + "learning_rate": 3.2116013367125996e-06, + "loss": 1.641, + "step": 28956 + }, + { + "epoch": 8.887968078575813, + "grad_norm": 0.12361441552639008, + "learning_rate": 3.2098488671656323e-06, + "loss": 1.6764, + "step": 28957 + }, + { + "epoch": 8.888275015346839, + "grad_norm": 0.1612539142370224, + "learning_rate": 3.2080968600263604e-06, + "loss": 1.6646, + "step": 28958 + }, + { + "epoch": 8.888581952117864, + "grad_norm": 0.15859587490558624, + "learning_rate": 3.2063453153121035e-06, + "loss": 1.6981, + "step": 28959 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.12860243022441864, + "learning_rate": 3.204594233040159e-06, + "loss": 1.6645, + "step": 28960 + }, + { + "epoch": 8.889195825659915, + "grad_norm": 0.232563316822052, + "learning_rate": 3.202843613227857e-06, + "loss": 1.6965, + "step": 28961 + }, + { + "epoch": 8.88950276243094, + "grad_norm": 0.15783043205738068, + "learning_rate": 3.2010934558924676e-06, + "loss": 1.7294, + "step": 28962 + }, + { + "epoch": 8.889809699201965, + "grad_norm": 0.13369722664356232, + "learning_rate": 3.199343761051321e-06, + "loss": 1.6778, + "step": 28963 + }, + { + "epoch": 8.890116635972989, + "grad_norm": 0.14463269710540771, + "learning_rate": 3.1975945287216756e-06, + "loss": 1.7211, + "step": 28964 + }, + { + 
"epoch": 8.890423572744014, + "grad_norm": 0.22744107246398926, + "learning_rate": 3.1958457589208346e-06, + "loss": 1.7234, + "step": 28965 + }, + { + "epoch": 8.89073050951504, + "grad_norm": 0.17402450740337372, + "learning_rate": 3.1940974516660836e-06, + "loss": 1.7355, + "step": 28966 + }, + { + "epoch": 8.891037446286065, + "grad_norm": 0.14022772014141083, + "learning_rate": 3.1923496069746927e-06, + "loss": 1.7029, + "step": 28967 + }, + { + "epoch": 8.89134438305709, + "grad_norm": 0.18977795541286469, + "learning_rate": 3.1906022248639368e-06, + "loss": 1.7213, + "step": 28968 + }, + { + "epoch": 8.891651319828116, + "grad_norm": 0.11371618509292603, + "learning_rate": 3.1888553053510905e-06, + "loss": 1.6521, + "step": 28969 + }, + { + "epoch": 8.89195825659914, + "grad_norm": 0.16720212996006012, + "learning_rate": 3.1871088484534073e-06, + "loss": 1.7186, + "step": 28970 + }, + { + "epoch": 8.892265193370166, + "grad_norm": 0.1317000538110733, + "learning_rate": 3.1853628541881563e-06, + "loss": 1.6905, + "step": 28971 + }, + { + "epoch": 8.892572130141192, + "grad_norm": 0.15759915113449097, + "learning_rate": 3.1836173225725797e-06, + "loss": 1.7293, + "step": 28972 + }, + { + "epoch": 8.892879066912217, + "grad_norm": 0.1597949117422104, + "learning_rate": 3.181872253623952e-06, + "loss": 1.6696, + "step": 28973 + }, + { + "epoch": 8.89318600368324, + "grad_norm": 0.12234945595264435, + "learning_rate": 3.1801276473594934e-06, + "loss": 1.7154, + "step": 28974 + }, + { + "epoch": 8.893492940454266, + "grad_norm": 0.12929682433605194, + "learning_rate": 3.1783835037964616e-06, + "loss": 1.7071, + "step": 28975 + }, + { + "epoch": 8.893799877225291, + "grad_norm": 0.1875714361667633, + "learning_rate": 3.176639822952082e-06, + "loss": 1.7708, + "step": 28976 + }, + { + "epoch": 8.894106813996316, + "grad_norm": 0.13817653059959412, + "learning_rate": 3.1748966048435858e-06, + "loss": 1.6894, + "step": 28977 + }, + { + "epoch": 8.894413750767342, + 
"grad_norm": 0.16731882095336914, + "learning_rate": 3.1731538494882198e-06, + "loss": 1.7706, + "step": 28978 + }, + { + "epoch": 8.894720687538367, + "grad_norm": 0.16811375319957733, + "learning_rate": 3.171411556903181e-06, + "loss": 1.7372, + "step": 28979 + }, + { + "epoch": 8.895027624309392, + "grad_norm": 0.11702638864517212, + "learning_rate": 3.1696697271057117e-06, + "loss": 1.6523, + "step": 28980 + }, + { + "epoch": 8.895334561080418, + "grad_norm": 0.12287343293428421, + "learning_rate": 3.1679283601130037e-06, + "loss": 1.6938, + "step": 28981 + }, + { + "epoch": 8.895641497851443, + "grad_norm": 0.10473133623600006, + "learning_rate": 3.166187455942282e-06, + "loss": 1.6731, + "step": 28982 + }, + { + "epoch": 8.895948434622468, + "grad_norm": 0.13022342324256897, + "learning_rate": 3.164447014610744e-06, + "loss": 1.679, + "step": 28983 + }, + { + "epoch": 8.896255371393494, + "grad_norm": 0.16077135503292084, + "learning_rate": 3.1627070361355925e-06, + "loss": 1.7466, + "step": 28984 + }, + { + "epoch": 8.896562308164517, + "grad_norm": 0.14103242754936218, + "learning_rate": 3.160967520534025e-06, + "loss": 1.6936, + "step": 28985 + }, + { + "epoch": 8.896869244935543, + "grad_norm": 0.12953349947929382, + "learning_rate": 3.1592284678232277e-06, + "loss": 1.7125, + "step": 28986 + }, + { + "epoch": 8.897176181706568, + "grad_norm": 0.11083797365427017, + "learning_rate": 3.157489878020392e-06, + "loss": 1.6455, + "step": 28987 + }, + { + "epoch": 8.897483118477593, + "grad_norm": 0.12037435173988342, + "learning_rate": 3.1557517511426936e-06, + "loss": 1.6569, + "step": 28988 + }, + { + "epoch": 8.897790055248619, + "grad_norm": 0.17309941351413727, + "learning_rate": 3.154014087207302e-06, + "loss": 1.7142, + "step": 28989 + }, + { + "epoch": 8.898096992019644, + "grad_norm": 0.15349642932415009, + "learning_rate": 3.15227688623142e-06, + "loss": 1.7375, + "step": 28990 + }, + { + "epoch": 8.89840392879067, + "grad_norm": 0.175978422164917, + 
"learning_rate": 3.1505401482321896e-06, + "loss": 1.7023, + "step": 28991 + }, + { + "epoch": 8.898710865561695, + "grad_norm": 0.13710327446460724, + "learning_rate": 3.14880387322678e-06, + "loss": 1.6462, + "step": 28992 + }, + { + "epoch": 8.89901780233272, + "grad_norm": 0.11777636408805847, + "learning_rate": 3.14706806123235e-06, + "loss": 1.6187, + "step": 28993 + }, + { + "epoch": 8.899324739103745, + "grad_norm": 0.1707836240530014, + "learning_rate": 3.145332712266047e-06, + "loss": 1.7314, + "step": 28994 + }, + { + "epoch": 8.899631675874769, + "grad_norm": 0.15286721289157867, + "learning_rate": 3.143597826345046e-06, + "loss": 1.6874, + "step": 28995 + }, + { + "epoch": 8.899938612645794, + "grad_norm": 0.1401689052581787, + "learning_rate": 3.141863403486456e-06, + "loss": 1.6795, + "step": 28996 + }, + { + "epoch": 8.90024554941682, + "grad_norm": 0.13194917142391205, + "learning_rate": 3.1401294437074512e-06, + "loss": 1.6967, + "step": 28997 + }, + { + "epoch": 8.900552486187845, + "grad_norm": 0.1518833339214325, + "learning_rate": 3.1383959470251413e-06, + "loss": 1.6914, + "step": 28998 + }, + { + "epoch": 8.90085942295887, + "grad_norm": 0.12354082614183426, + "learning_rate": 3.1366629134566727e-06, + "loss": 1.6809, + "step": 28999 + }, + { + "epoch": 8.901166359729896, + "grad_norm": 0.2156827449798584, + "learning_rate": 3.1349303430191712e-06, + "loss": 1.7617, + "step": 29000 + }, + { + "epoch": 8.901473296500921, + "grad_norm": 0.15934047102928162, + "learning_rate": 3.133198235729756e-06, + "loss": 1.7443, + "step": 29001 + }, + { + "epoch": 8.901780233271946, + "grad_norm": 0.13422276079654694, + "learning_rate": 3.1314665916055473e-06, + "loss": 1.7238, + "step": 29002 + }, + { + "epoch": 8.902087170042972, + "grad_norm": 0.1727958619594574, + "learning_rate": 3.1297354106636535e-06, + "loss": 1.7208, + "step": 29003 + }, + { + "epoch": 8.902394106813997, + "grad_norm": 0.14110971987247467, + "learning_rate": 
3.1280046929211827e-06, + "loss": 1.6586, + "step": 29004 + }, + { + "epoch": 8.902701043585022, + "grad_norm": 0.1527067869901657, + "learning_rate": 3.126274438395249e-06, + "loss": 1.6908, + "step": 29005 + }, + { + "epoch": 8.903007980356048, + "grad_norm": 0.1663844734430313, + "learning_rate": 3.1245446471029392e-06, + "loss": 1.7263, + "step": 29006 + }, + { + "epoch": 8.903314917127071, + "grad_norm": 0.23200902342796326, + "learning_rate": 3.1228153190613563e-06, + "loss": 1.7564, + "step": 29007 + }, + { + "epoch": 8.903621853898096, + "grad_norm": 0.1557004153728485, + "learning_rate": 3.1210864542875917e-06, + "loss": 1.721, + "step": 29008 + }, + { + "epoch": 8.903928790669122, + "grad_norm": 0.1682535856962204, + "learning_rate": 3.1193580527987208e-06, + "loss": 1.7244, + "step": 29009 + }, + { + "epoch": 8.904235727440147, + "grad_norm": 0.17813025414943695, + "learning_rate": 3.117630114611836e-06, + "loss": 1.6873, + "step": 29010 + }, + { + "epoch": 8.904542664211172, + "grad_norm": 0.16720467805862427, + "learning_rate": 3.1159026397440007e-06, + "loss": 1.7588, + "step": 29011 + }, + { + "epoch": 8.904849600982198, + "grad_norm": 0.12350224703550339, + "learning_rate": 3.114175628212307e-06, + "loss": 1.6641, + "step": 29012 + }, + { + "epoch": 8.905156537753223, + "grad_norm": 0.16594655811786652, + "learning_rate": 3.112449080033797e-06, + "loss": 1.6896, + "step": 29013 + }, + { + "epoch": 8.905463474524248, + "grad_norm": 0.11925587058067322, + "learning_rate": 3.110722995225562e-06, + "loss": 1.6751, + "step": 29014 + }, + { + "epoch": 8.905770411295274, + "grad_norm": 0.15165284276008606, + "learning_rate": 3.108997373804634e-06, + "loss": 1.6983, + "step": 29015 + }, + { + "epoch": 8.9060773480663, + "grad_norm": 0.1934432089328766, + "learning_rate": 3.107272215788082e-06, + "loss": 1.6972, + "step": 29016 + }, + { + "epoch": 8.906384284837323, + "grad_norm": 0.1574355512857437, + "learning_rate": 3.1055475211929474e-06, + "loss": 
1.751, + "step": 29017 + }, + { + "epoch": 8.906691221608348, + "grad_norm": 0.17686793208122253, + "learning_rate": 3.1038232900362787e-06, + "loss": 1.7705, + "step": 29018 + }, + { + "epoch": 8.906998158379373, + "grad_norm": 0.20089837908744812, + "learning_rate": 3.102099522335117e-06, + "loss": 1.8083, + "step": 29019 + }, + { + "epoch": 8.907305095150399, + "grad_norm": 0.1398555189371109, + "learning_rate": 3.1003762181064986e-06, + "loss": 1.7181, + "step": 29020 + }, + { + "epoch": 8.907612031921424, + "grad_norm": 0.14177222549915314, + "learning_rate": 3.09865337736745e-06, + "loss": 1.671, + "step": 29021 + }, + { + "epoch": 8.90791896869245, + "grad_norm": 0.17582249641418457, + "learning_rate": 3.0969310001349948e-06, + "loss": 1.7112, + "step": 29022 + }, + { + "epoch": 8.908225905463475, + "grad_norm": 0.16887766122817993, + "learning_rate": 3.0952090864261594e-06, + "loss": 1.7281, + "step": 29023 + }, + { + "epoch": 8.9085328422345, + "grad_norm": 0.1768682301044464, + "learning_rate": 3.093487636257958e-06, + "loss": 1.6584, + "step": 29024 + }, + { + "epoch": 8.908839779005525, + "grad_norm": 0.15997330844402313, + "learning_rate": 3.0917666496474095e-06, + "loss": 1.7051, + "step": 29025 + }, + { + "epoch": 8.90914671577655, + "grad_norm": 0.16596661508083344, + "learning_rate": 3.0900461266115124e-06, + "loss": 1.6899, + "step": 29026 + }, + { + "epoch": 8.909453652547576, + "grad_norm": 0.1477203071117401, + "learning_rate": 3.088326067167274e-06, + "loss": 1.6982, + "step": 29027 + }, + { + "epoch": 8.9097605893186, + "grad_norm": 0.170956552028656, + "learning_rate": 3.086606471331699e-06, + "loss": 1.6561, + "step": 29028 + }, + { + "epoch": 8.910067526089625, + "grad_norm": 0.1777859330177307, + "learning_rate": 3.0848873391217727e-06, + "loss": 1.7638, + "step": 29029 + }, + { + "epoch": 8.91037446286065, + "grad_norm": 0.20077209174633026, + "learning_rate": 3.083168670554476e-06, + "loss": 1.7588, + "step": 29030 + }, + { + "epoch": 
8.910681399631676, + "grad_norm": 0.15471714735031128, + "learning_rate": 3.0814504656468234e-06, + "loss": 1.682, + "step": 29031 + }, + { + "epoch": 8.910988336402701, + "grad_norm": 0.1711329072713852, + "learning_rate": 3.0797327244157624e-06, + "loss": 1.6883, + "step": 29032 + }, + { + "epoch": 8.911295273173726, + "grad_norm": 0.11440590023994446, + "learning_rate": 3.0780154468782905e-06, + "loss": 1.6861, + "step": 29033 + }, + { + "epoch": 8.911602209944752, + "grad_norm": 0.15305832028388977, + "learning_rate": 3.0762986330513722e-06, + "loss": 1.7208, + "step": 29034 + }, + { + "epoch": 8.911909146715777, + "grad_norm": 0.13767275214195251, + "learning_rate": 3.0745822829519766e-06, + "loss": 1.7319, + "step": 29035 + }, + { + "epoch": 8.912216083486802, + "grad_norm": 0.15172621607780457, + "learning_rate": 3.0728663965970573e-06, + "loss": 1.7003, + "step": 29036 + }, + { + "epoch": 8.912523020257828, + "grad_norm": 0.16932672262191772, + "learning_rate": 3.071150974003578e-06, + "loss": 1.709, + "step": 29037 + }, + { + "epoch": 8.912829957028851, + "grad_norm": 0.13176152110099792, + "learning_rate": 3.069436015188493e-06, + "loss": 1.6714, + "step": 29038 + }, + { + "epoch": 8.913136893799877, + "grad_norm": 0.17337891459465027, + "learning_rate": 3.067721520168748e-06, + "loss": 1.7786, + "step": 29039 + }, + { + "epoch": 8.913443830570902, + "grad_norm": 0.12546442449092865, + "learning_rate": 3.0660074889612867e-06, + "loss": 1.7219, + "step": 29040 + }, + { + "epoch": 8.913750767341927, + "grad_norm": 0.21087953448295593, + "learning_rate": 3.0642939215830444e-06, + "loss": 1.7541, + "step": 29041 + }, + { + "epoch": 8.914057704112953, + "grad_norm": 0.16880549490451813, + "learning_rate": 3.062580818050964e-06, + "loss": 1.7299, + "step": 29042 + }, + { + "epoch": 8.914364640883978, + "grad_norm": 0.15600517392158508, + "learning_rate": 3.0608681783819705e-06, + "loss": 1.6801, + "step": 29043 + }, + { + "epoch": 8.914671577655003, + 
"grad_norm": 0.11458457261323929, + "learning_rate": 3.059156002592989e-06, + "loss": 1.6393, + "step": 29044 + }, + { + "epoch": 8.914978514426029, + "grad_norm": 0.15529881417751312, + "learning_rate": 3.0574442907009393e-06, + "loss": 1.7288, + "step": 29045 + }, + { + "epoch": 8.915285451197054, + "grad_norm": 0.15211673080921173, + "learning_rate": 3.0557330427227415e-06, + "loss": 1.6784, + "step": 29046 + }, + { + "epoch": 8.91559238796808, + "grad_norm": 0.13714905083179474, + "learning_rate": 3.054022258675293e-06, + "loss": 1.7047, + "step": 29047 + }, + { + "epoch": 8.915899324739105, + "grad_norm": 0.1595524698495865, + "learning_rate": 3.0523119385755304e-06, + "loss": 1.722, + "step": 29048 + }, + { + "epoch": 8.91620626151013, + "grad_norm": 0.16744185984134674, + "learning_rate": 3.0506020824403235e-06, + "loss": 1.6754, + "step": 29049 + }, + { + "epoch": 8.916513198281153, + "grad_norm": 0.13333237171173096, + "learning_rate": 3.048892690286598e-06, + "loss": 1.7332, + "step": 29050 + }, + { + "epoch": 8.916820135052179, + "grad_norm": 0.19067470729351044, + "learning_rate": 3.0471837621312228e-06, + "loss": 1.7034, + "step": 29051 + }, + { + "epoch": 8.917127071823204, + "grad_norm": 0.1292569637298584, + "learning_rate": 3.0454752979911018e-06, + "loss": 1.652, + "step": 29052 + }, + { + "epoch": 8.91743400859423, + "grad_norm": 0.15452222526073456, + "learning_rate": 3.0437672978831155e-06, + "loss": 1.7183, + "step": 29053 + }, + { + "epoch": 8.917740945365255, + "grad_norm": 0.16528162360191345, + "learning_rate": 3.04205976182414e-06, + "loss": 1.7099, + "step": 29054 + }, + { + "epoch": 8.91804788213628, + "grad_norm": 0.22729776799678802, + "learning_rate": 3.0403526898310553e-06, + "loss": 1.7353, + "step": 29055 + }, + { + "epoch": 8.918354818907305, + "grad_norm": 0.134805828332901, + "learning_rate": 3.038646081920732e-06, + "loss": 1.6975, + "step": 29056 + }, + { + "epoch": 8.91866175567833, + "grad_norm": 0.15781652927398682, + 
"learning_rate": 3.0369399381100282e-06, + "loss": 1.7197, + "step": 29057 + }, + { + "epoch": 8.918968692449356, + "grad_norm": 0.19794493913650513, + "learning_rate": 3.0352342584158146e-06, + "loss": 1.6894, + "step": 29058 + }, + { + "epoch": 8.919275629220381, + "grad_norm": 0.14306722581386566, + "learning_rate": 3.033529042854938e-06, + "loss": 1.6885, + "step": 29059 + }, + { + "epoch": 8.919582565991405, + "grad_norm": 0.1341150999069214, + "learning_rate": 3.0318242914442574e-06, + "loss": 1.7154, + "step": 29060 + }, + { + "epoch": 8.91988950276243, + "grad_norm": 0.2001344859600067, + "learning_rate": 3.0301200042006208e-06, + "loss": 1.7537, + "step": 29061 + }, + { + "epoch": 8.920196439533456, + "grad_norm": 0.22544899582862854, + "learning_rate": 3.028416181140864e-06, + "loss": 1.7656, + "step": 29062 + }, + { + "epoch": 8.920503376304481, + "grad_norm": 0.13061828911304474, + "learning_rate": 3.0267128222818298e-06, + "loss": 1.6929, + "step": 29063 + }, + { + "epoch": 8.920810313075506, + "grad_norm": 0.19021448493003845, + "learning_rate": 3.025009927640349e-06, + "loss": 1.7858, + "step": 29064 + }, + { + "epoch": 8.921117249846532, + "grad_norm": 0.15748682618141174, + "learning_rate": 3.023307497233263e-06, + "loss": 1.6983, + "step": 29065 + }, + { + "epoch": 8.921424186617557, + "grad_norm": 0.20138932764530182, + "learning_rate": 3.0216055310773704e-06, + "loss": 1.7891, + "step": 29066 + }, + { + "epoch": 8.921731123388582, + "grad_norm": 0.11930065602064133, + "learning_rate": 3.0199040291895242e-06, + "loss": 1.6733, + "step": 29067 + }, + { + "epoch": 8.922038060159608, + "grad_norm": 0.17451462149620056, + "learning_rate": 3.0182029915865107e-06, + "loss": 1.717, + "step": 29068 + }, + { + "epoch": 8.922344996930633, + "grad_norm": 0.13890404999256134, + "learning_rate": 3.0165024182851553e-06, + "loss": 1.6821, + "step": 29069 + }, + { + "epoch": 8.922651933701658, + "grad_norm": 0.15502439439296722, + "learning_rate": 
3.0148023093022613e-06, + "loss": 1.6746, + "step": 29070 + }, + { + "epoch": 8.922958870472682, + "grad_norm": 0.14066965878009796, + "learning_rate": 3.013102664654627e-06, + "loss": 1.6979, + "step": 29071 + }, + { + "epoch": 8.923265807243707, + "grad_norm": 0.15466643869876862, + "learning_rate": 3.01140348435906e-06, + "loss": 1.7306, + "step": 29072 + }, + { + "epoch": 8.923572744014733, + "grad_norm": 0.15576320886611938, + "learning_rate": 3.0097047684323363e-06, + "loss": 1.7241, + "step": 29073 + }, + { + "epoch": 8.923879680785758, + "grad_norm": 0.15748077630996704, + "learning_rate": 3.008006516891254e-06, + "loss": 1.7053, + "step": 29074 + }, + { + "epoch": 8.924186617556783, + "grad_norm": 0.19139769673347473, + "learning_rate": 3.0063087297525995e-06, + "loss": 1.7361, + "step": 29075 + }, + { + "epoch": 8.924493554327809, + "grad_norm": 0.12561291456222534, + "learning_rate": 3.0046114070331423e-06, + "loss": 1.6982, + "step": 29076 + }, + { + "epoch": 8.924800491098834, + "grad_norm": 0.140936940908432, + "learning_rate": 3.002914548749658e-06, + "loss": 1.66, + "step": 29077 + }, + { + "epoch": 8.92510742786986, + "grad_norm": 0.19634532928466797, + "learning_rate": 3.001218154918922e-06, + "loss": 1.6947, + "step": 29078 + }, + { + "epoch": 8.925414364640885, + "grad_norm": 0.1971811205148697, + "learning_rate": 2.999522225557694e-06, + "loss": 1.7133, + "step": 29079 + }, + { + "epoch": 8.92572130141191, + "grad_norm": 0.15782490372657776, + "learning_rate": 2.9978267606827314e-06, + "loss": 1.6724, + "step": 29080 + }, + { + "epoch": 8.926028238182933, + "grad_norm": 0.1563064008951187, + "learning_rate": 2.9961317603107887e-06, + "loss": 1.7942, + "step": 29081 + }, + { + "epoch": 8.926335174953959, + "grad_norm": 0.1192200556397438, + "learning_rate": 2.994437224458635e-06, + "loss": 1.6736, + "step": 29082 + }, + { + "epoch": 8.926642111724984, + "grad_norm": 0.14355097711086273, + "learning_rate": 2.9927431531429905e-06, + "loss": 
1.6968, + "step": 29083 + }, + { + "epoch": 8.92694904849601, + "grad_norm": 0.17257769405841827, + "learning_rate": 2.9910495463806255e-06, + "loss": 1.7353, + "step": 29084 + }, + { + "epoch": 8.927255985267035, + "grad_norm": 0.16805051267147064, + "learning_rate": 2.9893564041882484e-06, + "loss": 1.7711, + "step": 29085 + }, + { + "epoch": 8.92756292203806, + "grad_norm": 0.123812235891819, + "learning_rate": 2.9876637265826123e-06, + "loss": 1.6197, + "step": 29086 + }, + { + "epoch": 8.927869858809085, + "grad_norm": 0.38423335552215576, + "learning_rate": 2.985971513580432e-06, + "loss": 1.726, + "step": 29087 + }, + { + "epoch": 8.92817679558011, + "grad_norm": 0.14887484908103943, + "learning_rate": 2.9842797651984443e-06, + "loss": 1.7067, + "step": 29088 + }, + { + "epoch": 8.928483732351136, + "grad_norm": 0.17092695832252502, + "learning_rate": 2.982588481453358e-06, + "loss": 1.6883, + "step": 29089 + }, + { + "epoch": 8.928790669122161, + "grad_norm": 0.1591298133134842, + "learning_rate": 2.9808976623618867e-06, + "loss": 1.7219, + "step": 29090 + }, + { + "epoch": 8.929097605893187, + "grad_norm": 0.17864398658275604, + "learning_rate": 2.979207307940746e-06, + "loss": 1.7378, + "step": 29091 + }, + { + "epoch": 8.92940454266421, + "grad_norm": 0.15053904056549072, + "learning_rate": 2.977517418206638e-06, + "loss": 1.679, + "step": 29092 + }, + { + "epoch": 8.929711479435236, + "grad_norm": 0.15586422383785248, + "learning_rate": 2.975827993176267e-06, + "loss": 1.7276, + "step": 29093 + }, + { + "epoch": 8.930018416206261, + "grad_norm": 0.13955895602703094, + "learning_rate": 2.9741390328663243e-06, + "loss": 1.6727, + "step": 29094 + }, + { + "epoch": 8.930325352977286, + "grad_norm": 0.15469470620155334, + "learning_rate": 2.9724505372934973e-06, + "loss": 1.6993, + "step": 29095 + }, + { + "epoch": 8.930632289748312, + "grad_norm": 0.13510502874851227, + "learning_rate": 2.970762506474484e-06, + "loss": 1.6991, + "step": 29096 + }, + { + 
"epoch": 8.930939226519337, + "grad_norm": 0.13071557879447937, + "learning_rate": 2.9690749404259587e-06, + "loss": 1.6787, + "step": 29097 + }, + { + "epoch": 8.931246163290362, + "grad_norm": 0.13370119035243988, + "learning_rate": 2.9673878391645927e-06, + "loss": 1.6966, + "step": 29098 + }, + { + "epoch": 8.931553100061388, + "grad_norm": 0.21600082516670227, + "learning_rate": 2.9657012027070774e-06, + "loss": 1.7137, + "step": 29099 + }, + { + "epoch": 8.931860036832413, + "grad_norm": 0.17746025323867798, + "learning_rate": 2.964015031070061e-06, + "loss": 1.7406, + "step": 29100 + }, + { + "epoch": 8.932166973603438, + "grad_norm": 0.1861608922481537, + "learning_rate": 2.96232932427023e-06, + "loss": 1.7615, + "step": 29101 + }, + { + "epoch": 8.932473910374462, + "grad_norm": 0.128297820687294, + "learning_rate": 2.9606440823242155e-06, + "loss": 1.6525, + "step": 29102 + }, + { + "epoch": 8.932780847145487, + "grad_norm": 0.1617307960987091, + "learning_rate": 2.958959305248693e-06, + "loss": 1.6735, + "step": 29103 + }, + { + "epoch": 8.933087783916513, + "grad_norm": 0.1898767054080963, + "learning_rate": 2.9572749930603107e-06, + "loss": 1.7426, + "step": 29104 + }, + { + "epoch": 8.933394720687538, + "grad_norm": 0.14279016852378845, + "learning_rate": 2.955591145775705e-06, + "loss": 1.6855, + "step": 29105 + }, + { + "epoch": 8.933701657458563, + "grad_norm": 0.15879136323928833, + "learning_rate": 2.953907763411523e-06, + "loss": 1.6833, + "step": 29106 + }, + { + "epoch": 8.934008594229589, + "grad_norm": 0.14285622537136078, + "learning_rate": 2.9522248459843972e-06, + "loss": 1.6821, + "step": 29107 + }, + { + "epoch": 8.934315531000614, + "grad_norm": 0.1237918958067894, + "learning_rate": 2.950542393510963e-06, + "loss": 1.6676, + "step": 29108 + }, + { + "epoch": 8.93462246777164, + "grad_norm": 0.16011624038219452, + "learning_rate": 2.9488604060078473e-06, + "loss": 1.6881, + "step": 29109 + }, + { + "epoch": 8.934929404542665, + 
"grad_norm": 0.19365482032299042, + "learning_rate": 2.9471788834916692e-06, + "loss": 1.6895, + "step": 29110 + }, + { + "epoch": 8.93523634131369, + "grad_norm": 0.1855025440454483, + "learning_rate": 2.9454978259790435e-06, + "loss": 1.7745, + "step": 29111 + }, + { + "epoch": 8.935543278084715, + "grad_norm": 0.1319892704486847, + "learning_rate": 2.9438172334865898e-06, + "loss": 1.6836, + "step": 29112 + }, + { + "epoch": 8.93585021485574, + "grad_norm": 0.19831378757953644, + "learning_rate": 2.942137106030918e-06, + "loss": 1.7398, + "step": 29113 + }, + { + "epoch": 8.936157151626764, + "grad_norm": 0.16073055565357208, + "learning_rate": 2.9404574436286246e-06, + "loss": 1.6617, + "step": 29114 + }, + { + "epoch": 8.93646408839779, + "grad_norm": 0.19067524373531342, + "learning_rate": 2.938778246296309e-06, + "loss": 1.7244, + "step": 29115 + }, + { + "epoch": 8.936771025168815, + "grad_norm": 0.13316050171852112, + "learning_rate": 2.9370995140505843e-06, + "loss": 1.6371, + "step": 29116 + }, + { + "epoch": 8.93707796193984, + "grad_norm": 0.19948840141296387, + "learning_rate": 2.9354212469080156e-06, + "loss": 1.7279, + "step": 29117 + }, + { + "epoch": 8.937384898710865, + "grad_norm": 0.15221990644931793, + "learning_rate": 2.933743444885206e-06, + "loss": 1.7516, + "step": 29118 + }, + { + "epoch": 8.93769183548189, + "grad_norm": 0.15257437527179718, + "learning_rate": 2.932066107998721e-06, + "loss": 1.7471, + "step": 29119 + }, + { + "epoch": 8.937998772252916, + "grad_norm": 0.1491934210062027, + "learning_rate": 2.930389236265152e-06, + "loss": 1.6896, + "step": 29120 + }, + { + "epoch": 8.938305709023942, + "grad_norm": 0.12303795665502548, + "learning_rate": 2.928712829701069e-06, + "loss": 1.6793, + "step": 29121 + }, + { + "epoch": 8.938612645794967, + "grad_norm": 0.09865713864564896, + "learning_rate": 2.9270368883230313e-06, + "loss": 1.6063, + "step": 29122 + }, + { + "epoch": 8.938919582565992, + "grad_norm": 0.1656254678964615, + 
"learning_rate": 2.9253614121476037e-06, + "loss": 1.7507, + "step": 29123 + }, + { + "epoch": 8.939226519337016, + "grad_norm": 0.11997068673372269, + "learning_rate": 2.9236864011913445e-06, + "loss": 1.6393, + "step": 29124 + }, + { + "epoch": 8.939533456108041, + "grad_norm": 0.16391901671886444, + "learning_rate": 2.922011855470813e-06, + "loss": 1.6926, + "step": 29125 + }, + { + "epoch": 8.939840392879066, + "grad_norm": 0.1461794674396515, + "learning_rate": 2.920337775002552e-06, + "loss": 1.7243, + "step": 29126 + }, + { + "epoch": 8.940147329650092, + "grad_norm": 0.12928323447704315, + "learning_rate": 2.918664159803108e-06, + "loss": 1.6457, + "step": 29127 + }, + { + "epoch": 8.940454266421117, + "grad_norm": 0.16596664488315582, + "learning_rate": 2.9169910098890196e-06, + "loss": 1.6878, + "step": 29128 + }, + { + "epoch": 8.940761203192142, + "grad_norm": 0.1567634493112564, + "learning_rate": 2.9153183252768224e-06, + "loss": 1.6947, + "step": 29129 + }, + { + "epoch": 8.941068139963168, + "grad_norm": 0.1472834199666977, + "learning_rate": 2.9136461059830476e-06, + "loss": 1.6707, + "step": 29130 + }, + { + "epoch": 8.941375076734193, + "grad_norm": 0.1658584028482437, + "learning_rate": 2.9119743520242217e-06, + "loss": 1.7321, + "step": 29131 + }, + { + "epoch": 8.941682013505218, + "grad_norm": 0.20524124801158905, + "learning_rate": 2.9103030634168525e-06, + "loss": 1.7065, + "step": 29132 + }, + { + "epoch": 8.941988950276244, + "grad_norm": 0.16881074011325836, + "learning_rate": 2.908632240177489e-06, + "loss": 1.7052, + "step": 29133 + }, + { + "epoch": 8.942295887047269, + "grad_norm": 0.15819382667541504, + "learning_rate": 2.906961882322601e-06, + "loss": 1.7388, + "step": 29134 + }, + { + "epoch": 8.942602823818293, + "grad_norm": 0.13994456827640533, + "learning_rate": 2.905291989868736e-06, + "loss": 1.6932, + "step": 29135 + }, + { + "epoch": 8.942909760589318, + "grad_norm": 0.18177597224712372, + "learning_rate": 
2.9036225628323644e-06, + "loss": 1.707, + "step": 29136 + }, + { + "epoch": 8.943216697360343, + "grad_norm": 0.14273816347122192, + "learning_rate": 2.9019536012300063e-06, + "loss": 1.6902, + "step": 29137 + }, + { + "epoch": 8.943523634131369, + "grad_norm": 0.2221340835094452, + "learning_rate": 2.9002851050781486e-06, + "loss": 1.7369, + "step": 29138 + }, + { + "epoch": 8.943830570902394, + "grad_norm": 0.14513340592384338, + "learning_rate": 2.8986170743932782e-06, + "loss": 1.7307, + "step": 29139 + }, + { + "epoch": 8.94413750767342, + "grad_norm": 0.16813357174396515, + "learning_rate": 2.8969495091918763e-06, + "loss": 1.769, + "step": 29140 + }, + { + "epoch": 8.944444444444445, + "grad_norm": 0.15906141698360443, + "learning_rate": 2.895282409490435e-06, + "loss": 1.6929, + "step": 29141 + }, + { + "epoch": 8.94475138121547, + "grad_norm": 0.16236159205436707, + "learning_rate": 2.893615775305419e-06, + "loss": 1.7309, + "step": 29142 + }, + { + "epoch": 8.945058317986495, + "grad_norm": 0.12328501045703888, + "learning_rate": 2.891949606653299e-06, + "loss": 1.7063, + "step": 29143 + }, + { + "epoch": 8.94536525475752, + "grad_norm": 0.15831345319747925, + "learning_rate": 2.89028390355055e-06, + "loss": 1.6602, + "step": 29144 + }, + { + "epoch": 8.945672191528544, + "grad_norm": 0.12445748597383499, + "learning_rate": 2.8886186660136206e-06, + "loss": 1.6565, + "step": 29145 + }, + { + "epoch": 8.94597912829957, + "grad_norm": 0.12890103459358215, + "learning_rate": 2.88695389405898e-06, + "loss": 1.7209, + "step": 29146 + }, + { + "epoch": 8.946286065070595, + "grad_norm": 0.14477044343948364, + "learning_rate": 2.885289587703072e-06, + "loss": 1.6782, + "step": 29147 + }, + { + "epoch": 8.94659300184162, + "grad_norm": 0.12625789642333984, + "learning_rate": 2.8836257469623482e-06, + "loss": 1.6538, + "step": 29148 + }, + { + "epoch": 8.946899938612646, + "grad_norm": 0.16041505336761475, + "learning_rate": 2.8819623718532418e-06, + "loss": 
1.7327, + "step": 29149 + }, + { + "epoch": 8.94720687538367, + "grad_norm": 0.16730013489723206, + "learning_rate": 2.880299462392216e-06, + "loss": 1.7036, + "step": 29150 + }, + { + "epoch": 8.947513812154696, + "grad_norm": 0.1525142341852188, + "learning_rate": 2.87863701859567e-06, + "loss": 1.7013, + "step": 29151 + }, + { + "epoch": 8.947820748925722, + "grad_norm": 0.10877451300621033, + "learning_rate": 2.876975040480073e-06, + "loss": 1.6294, + "step": 29152 + }, + { + "epoch": 8.948127685696747, + "grad_norm": 0.11804116517305374, + "learning_rate": 2.875313528061807e-06, + "loss": 1.6885, + "step": 29153 + }, + { + "epoch": 8.948434622467772, + "grad_norm": 0.1718084067106247, + "learning_rate": 2.873652481357325e-06, + "loss": 1.682, + "step": 29154 + }, + { + "epoch": 8.948741559238798, + "grad_norm": 0.1881963163614273, + "learning_rate": 2.871991900383031e-06, + "loss": 1.7851, + "step": 29155 + }, + { + "epoch": 8.949048496009823, + "grad_norm": 0.14475038647651672, + "learning_rate": 2.8703317851553334e-06, + "loss": 1.6933, + "step": 29156 + }, + { + "epoch": 8.949355432780846, + "grad_norm": 0.15759755671024323, + "learning_rate": 2.8686721356906423e-06, + "loss": 1.7322, + "step": 29157 + }, + { + "epoch": 8.949662369551872, + "grad_norm": 0.13722626864910126, + "learning_rate": 2.8670129520053547e-06, + "loss": 1.7027, + "step": 29158 + }, + { + "epoch": 8.949969306322897, + "grad_norm": 0.14574597775936127, + "learning_rate": 2.8653542341158744e-06, + "loss": 1.6934, + "step": 29159 + }, + { + "epoch": 8.950276243093922, + "grad_norm": 0.1554742455482483, + "learning_rate": 2.863695982038589e-06, + "loss": 1.7272, + "step": 29160 + }, + { + "epoch": 8.950583179864948, + "grad_norm": 0.17200839519500732, + "learning_rate": 2.8620381957898845e-06, + "loss": 1.7501, + "step": 29161 + }, + { + "epoch": 8.950890116635973, + "grad_norm": 0.18733108043670654, + "learning_rate": 2.860380875386154e-06, + "loss": 1.8017, + "step": 29162 + }, + { + 
"epoch": 8.951197053406998, + "grad_norm": 0.13730700314044952, + "learning_rate": 2.8587240208437614e-06, + "loss": 1.6831, + "step": 29163 + }, + { + "epoch": 8.951503990178024, + "grad_norm": 0.1442563533782959, + "learning_rate": 2.8570676321790946e-06, + "loss": 1.7231, + "step": 29164 + }, + { + "epoch": 8.95181092694905, + "grad_norm": 0.14817926287651062, + "learning_rate": 2.855411709408512e-06, + "loss": 1.7043, + "step": 29165 + }, + { + "epoch": 8.952117863720074, + "grad_norm": 0.14757658541202545, + "learning_rate": 2.8537562525483787e-06, + "loss": 1.6519, + "step": 29166 + }, + { + "epoch": 8.952424800491098, + "grad_norm": 0.17929381132125854, + "learning_rate": 2.85210126161507e-06, + "loss": 1.7523, + "step": 29167 + }, + { + "epoch": 8.952731737262123, + "grad_norm": 0.13454876840114594, + "learning_rate": 2.850446736624923e-06, + "loss": 1.6921, + "step": 29168 + }, + { + "epoch": 8.953038674033149, + "grad_norm": 0.17734326422214508, + "learning_rate": 2.8487926775943085e-06, + "loss": 1.7082, + "step": 29169 + }, + { + "epoch": 8.953345610804174, + "grad_norm": 0.15544986724853516, + "learning_rate": 2.8471390845395406e-06, + "loss": 1.7067, + "step": 29170 + }, + { + "epoch": 8.9536525475752, + "grad_norm": 0.1256217509508133, + "learning_rate": 2.8454859574769955e-06, + "loss": 1.6546, + "step": 29171 + }, + { + "epoch": 8.953959484346225, + "grad_norm": 0.17201638221740723, + "learning_rate": 2.843833296422993e-06, + "loss": 1.7554, + "step": 29172 + }, + { + "epoch": 8.95426642111725, + "grad_norm": 0.1437663435935974, + "learning_rate": 2.8421811013938703e-06, + "loss": 1.6985, + "step": 29173 + }, + { + "epoch": 8.954573357888275, + "grad_norm": 0.11889111250638962, + "learning_rate": 2.8405293724059532e-06, + "loss": 1.7046, + "step": 29174 + }, + { + "epoch": 8.9548802946593, + "grad_norm": 0.21805889904499054, + "learning_rate": 2.838878109475568e-06, + "loss": 1.7835, + "step": 29175 + }, + { + "epoch": 8.955187231430326, + 
"grad_norm": 0.17459547519683838, + "learning_rate": 2.8372273126190342e-06, + "loss": 1.6986, + "step": 29176 + }, + { + "epoch": 8.955494168201351, + "grad_norm": 0.16686071455478668, + "learning_rate": 2.835576981852656e-06, + "loss": 1.6858, + "step": 29177 + }, + { + "epoch": 8.955801104972375, + "grad_norm": 0.19014745950698853, + "learning_rate": 2.833927117192753e-06, + "loss": 1.742, + "step": 29178 + }, + { + "epoch": 8.9561080417434, + "grad_norm": 0.10640473663806915, + "learning_rate": 2.832277718655629e-06, + "loss": 1.6363, + "step": 29179 + }, + { + "epoch": 8.956414978514426, + "grad_norm": 0.12378805875778198, + "learning_rate": 2.8306287862575777e-06, + "loss": 1.6359, + "step": 29180 + }, + { + "epoch": 8.956721915285451, + "grad_norm": 0.1519845575094223, + "learning_rate": 2.828980320014901e-06, + "loss": 1.7112, + "step": 29181 + }, + { + "epoch": 8.957028852056476, + "grad_norm": 0.1550975888967514, + "learning_rate": 2.827332319943893e-06, + "loss": 1.7417, + "step": 29182 + }, + { + "epoch": 8.957335788827502, + "grad_norm": 0.1387033611536026, + "learning_rate": 2.8256847860608224e-06, + "loss": 1.6567, + "step": 29183 + }, + { + "epoch": 8.957642725598527, + "grad_norm": 0.14006295800209045, + "learning_rate": 2.8240377183820053e-06, + "loss": 1.7156, + "step": 29184 + }, + { + "epoch": 8.957949662369552, + "grad_norm": 0.13202004134655, + "learning_rate": 2.8223911169236782e-06, + "loss": 1.6567, + "step": 29185 + }, + { + "epoch": 8.958256599140578, + "grad_norm": 0.12789477407932281, + "learning_rate": 2.8207449817021505e-06, + "loss": 1.7102, + "step": 29186 + }, + { + "epoch": 8.958563535911603, + "grad_norm": 0.1773017793893814, + "learning_rate": 2.8190993127336583e-06, + "loss": 1.7004, + "step": 29187 + }, + { + "epoch": 8.958870472682626, + "grad_norm": 0.17584890127182007, + "learning_rate": 2.81745411003449e-06, + "loss": 1.7513, + "step": 29188 + }, + { + "epoch": 8.959177409453652, + "grad_norm": 0.1679183840751648, + 
"learning_rate": 2.8158093736208923e-06, + "loss": 1.7319, + "step": 29189 + }, + { + "epoch": 8.959484346224677, + "grad_norm": 0.14683100581169128, + "learning_rate": 2.8141651035091255e-06, + "loss": 1.6594, + "step": 29190 + }, + { + "epoch": 8.959791282995702, + "grad_norm": 0.17727963626384735, + "learning_rate": 2.8125212997154316e-06, + "loss": 1.7577, + "step": 29191 + }, + { + "epoch": 8.960098219766728, + "grad_norm": 0.12865738570690155, + "learning_rate": 2.810877962256059e-06, + "loss": 1.656, + "step": 29192 + }, + { + "epoch": 8.960405156537753, + "grad_norm": 0.15322017669677734, + "learning_rate": 2.80923509114725e-06, + "loss": 1.6994, + "step": 29193 + }, + { + "epoch": 8.960712093308778, + "grad_norm": 0.11874222010374069, + "learning_rate": 2.8075926864052417e-06, + "loss": 1.6514, + "step": 29194 + }, + { + "epoch": 8.961019030079804, + "grad_norm": 0.13674114644527435, + "learning_rate": 2.80595074804626e-06, + "loss": 1.6781, + "step": 29195 + }, + { + "epoch": 8.96132596685083, + "grad_norm": 0.13738766312599182, + "learning_rate": 2.8043092760865364e-06, + "loss": 1.7214, + "step": 29196 + }, + { + "epoch": 8.961632903621854, + "grad_norm": 0.15917620062828064, + "learning_rate": 2.8026682705422914e-06, + "loss": 1.7561, + "step": 29197 + }, + { + "epoch": 8.96193984039288, + "grad_norm": 0.18082000315189362, + "learning_rate": 2.8010277314297395e-06, + "loss": 1.7021, + "step": 29198 + }, + { + "epoch": 8.962246777163905, + "grad_norm": 0.1440226435661316, + "learning_rate": 2.799387658765096e-06, + "loss": 1.6829, + "step": 29199 + }, + { + "epoch": 8.962553713934929, + "grad_norm": 0.18358100950717926, + "learning_rate": 2.7977480525645692e-06, + "loss": 1.7207, + "step": 29200 + }, + { + "epoch": 8.962860650705954, + "grad_norm": 0.12614849209785461, + "learning_rate": 2.796108912844364e-06, + "loss": 1.705, + "step": 29201 + }, + { + "epoch": 8.96316758747698, + "grad_norm": 0.11331766098737717, + "learning_rate": 
2.7944702396206666e-06, + "loss": 1.6343, + "step": 29202 + }, + { + "epoch": 8.963474524248005, + "grad_norm": 0.17110171914100647, + "learning_rate": 2.792832032909698e-06, + "loss": 1.8129, + "step": 29203 + }, + { + "epoch": 8.96378146101903, + "grad_norm": 0.19446058571338654, + "learning_rate": 2.791194292727617e-06, + "loss": 1.7015, + "step": 29204 + }, + { + "epoch": 8.964088397790055, + "grad_norm": 0.17975226044654846, + "learning_rate": 2.789557019090644e-06, + "loss": 1.7408, + "step": 29205 + }, + { + "epoch": 8.96439533456108, + "grad_norm": 0.15492287278175354, + "learning_rate": 2.787920212014922e-06, + "loss": 1.7307, + "step": 29206 + }, + { + "epoch": 8.964702271332106, + "grad_norm": 0.14430275559425354, + "learning_rate": 2.7862838715166485e-06, + "loss": 1.7112, + "step": 29207 + }, + { + "epoch": 8.965009208103131, + "grad_norm": 0.13850049674510956, + "learning_rate": 2.7846479976119944e-06, + "loss": 1.7177, + "step": 29208 + }, + { + "epoch": 8.965316144874157, + "grad_norm": 0.17376014590263367, + "learning_rate": 2.783012590317119e-06, + "loss": 1.7612, + "step": 29209 + }, + { + "epoch": 8.96562308164518, + "grad_norm": 0.13757693767547607, + "learning_rate": 2.7813776496481868e-06, + "loss": 1.7246, + "step": 29210 + }, + { + "epoch": 8.965930018416206, + "grad_norm": 0.17782050371170044, + "learning_rate": 2.7797431756213633e-06, + "loss": 1.7196, + "step": 29211 + }, + { + "epoch": 8.966236955187231, + "grad_norm": 0.14082394540309906, + "learning_rate": 2.7781091682527906e-06, + "loss": 1.7074, + "step": 29212 + }, + { + "epoch": 8.966543891958256, + "grad_norm": 0.2748696506023407, + "learning_rate": 2.7764756275586168e-06, + "loss": 1.819, + "step": 29213 + }, + { + "epoch": 8.966850828729282, + "grad_norm": 0.134973406791687, + "learning_rate": 2.774842553554996e-06, + "loss": 1.6725, + "step": 29214 + }, + { + "epoch": 8.967157765500307, + "grad_norm": 0.15217997133731842, + "learning_rate": 2.7732099462580594e-06, + "loss": 
1.6953, + "step": 29215 + }, + { + "epoch": 8.967464702271332, + "grad_norm": 0.15674369037151337, + "learning_rate": 2.771577805683939e-06, + "loss": 1.7108, + "step": 29216 + }, + { + "epoch": 8.967771639042358, + "grad_norm": 0.13885504007339478, + "learning_rate": 2.769946131848772e-06, + "loss": 1.7106, + "step": 29217 + }, + { + "epoch": 8.968078575813383, + "grad_norm": 0.13795867562294006, + "learning_rate": 2.768314924768678e-06, + "loss": 1.6831, + "step": 29218 + }, + { + "epoch": 8.968385512584408, + "grad_norm": 0.15533487498760223, + "learning_rate": 2.7666841844597724e-06, + "loss": 1.7278, + "step": 29219 + }, + { + "epoch": 8.968692449355434, + "grad_norm": 0.13686540722846985, + "learning_rate": 2.7650539109381867e-06, + "loss": 1.6854, + "step": 29220 + }, + { + "epoch": 8.968999386126457, + "grad_norm": 0.1479746252298355, + "learning_rate": 2.763424104220019e-06, + "loss": 1.7119, + "step": 29221 + }, + { + "epoch": 8.969306322897483, + "grad_norm": 0.12035561352968216, + "learning_rate": 2.7617947643213906e-06, + "loss": 1.6295, + "step": 29222 + }, + { + "epoch": 8.969613259668508, + "grad_norm": 0.12784910202026367, + "learning_rate": 2.7601658912583763e-06, + "loss": 1.6952, + "step": 29223 + }, + { + "epoch": 8.969920196439533, + "grad_norm": 0.14596527814865112, + "learning_rate": 2.7585374850471025e-06, + "loss": 1.7003, + "step": 29224 + }, + { + "epoch": 8.970227133210559, + "grad_norm": 0.17561540007591248, + "learning_rate": 2.7569095457036455e-06, + "loss": 1.7687, + "step": 29225 + }, + { + "epoch": 8.970534069981584, + "grad_norm": 0.17456963658332825, + "learning_rate": 2.7552820732441032e-06, + "loss": 1.6927, + "step": 29226 + }, + { + "epoch": 8.97084100675261, + "grad_norm": 0.15346206724643707, + "learning_rate": 2.7536550676845574e-06, + "loss": 1.7057, + "step": 29227 + }, + { + "epoch": 8.971147943523635, + "grad_norm": 0.113531194627285, + "learning_rate": 2.752028529041073e-06, + "loss": 1.6844, + "step": 29228 + }, + { 
+ "epoch": 8.97145488029466, + "grad_norm": 0.18523596227169037, + "learning_rate": 2.7504024573297426e-06, + "loss": 1.7468, + "step": 29229 + }, + { + "epoch": 8.971761817065685, + "grad_norm": 0.14123110473155975, + "learning_rate": 2.7487768525666313e-06, + "loss": 1.699, + "step": 29230 + }, + { + "epoch": 8.972068753836709, + "grad_norm": 0.17675861716270447, + "learning_rate": 2.747151714767798e-06, + "loss": 1.745, + "step": 29231 + }, + { + "epoch": 8.972375690607734, + "grad_norm": 0.1529264897108078, + "learning_rate": 2.7455270439493085e-06, + "loss": 1.686, + "step": 29232 + }, + { + "epoch": 8.97268262737876, + "grad_norm": 0.14173699915409088, + "learning_rate": 2.743902840127216e-06, + "loss": 1.6717, + "step": 29233 + }, + { + "epoch": 8.972989564149785, + "grad_norm": 0.15535210072994232, + "learning_rate": 2.7422791033175743e-06, + "loss": 1.7433, + "step": 29234 + }, + { + "epoch": 8.97329650092081, + "grad_norm": 0.12831814587116241, + "learning_rate": 2.740655833536432e-06, + "loss": 1.7548, + "step": 29235 + }, + { + "epoch": 8.973603437691835, + "grad_norm": 0.19681085646152496, + "learning_rate": 2.739033030799815e-06, + "loss": 1.7841, + "step": 29236 + }, + { + "epoch": 8.97391037446286, + "grad_norm": 0.1496504247188568, + "learning_rate": 2.737410695123793e-06, + "loss": 1.6646, + "step": 29237 + }, + { + "epoch": 8.974217311233886, + "grad_norm": 0.15000486373901367, + "learning_rate": 2.735788826524366e-06, + "loss": 1.6938, + "step": 29238 + }, + { + "epoch": 8.974524248004911, + "grad_norm": 0.11816641688346863, + "learning_rate": 2.734167425017592e-06, + "loss": 1.6738, + "step": 29239 + }, + { + "epoch": 8.974831184775937, + "grad_norm": 0.12041781097650528, + "learning_rate": 2.7325464906194585e-06, + "loss": 1.6798, + "step": 29240 + }, + { + "epoch": 8.975138121546962, + "grad_norm": 0.1780797690153122, + "learning_rate": 2.7309260233460143e-06, + "loss": 1.7608, + "step": 29241 + }, + { + "epoch": 8.975445058317986, + 
"grad_norm": 0.19122804701328278, + "learning_rate": 2.7293060232132683e-06, + "loss": 1.7706, + "step": 29242 + }, + { + "epoch": 8.975751995089011, + "grad_norm": 0.16770713031291962, + "learning_rate": 2.7276864902372244e-06, + "loss": 1.736, + "step": 29243 + }, + { + "epoch": 8.976058931860036, + "grad_norm": 0.17613980174064636, + "learning_rate": 2.7260674244338922e-06, + "loss": 1.7674, + "step": 29244 + }, + { + "epoch": 8.976365868631062, + "grad_norm": 0.17744678258895874, + "learning_rate": 2.7244488258192648e-06, + "loss": 1.7564, + "step": 29245 + }, + { + "epoch": 8.976672805402087, + "grad_norm": 0.15087327361106873, + "learning_rate": 2.7228306944093394e-06, + "loss": 1.7245, + "step": 29246 + }, + { + "epoch": 8.976979742173112, + "grad_norm": 0.16417519748210907, + "learning_rate": 2.721213030220121e-06, + "loss": 1.7329, + "step": 29247 + }, + { + "epoch": 8.977286678944138, + "grad_norm": 0.15511249005794525, + "learning_rate": 2.7195958332675796e-06, + "loss": 1.6803, + "step": 29248 + }, + { + "epoch": 8.977593615715163, + "grad_norm": 0.18222862482070923, + "learning_rate": 2.7179791035677083e-06, + "loss": 1.7186, + "step": 29249 + }, + { + "epoch": 8.977900552486188, + "grad_norm": 0.16677385568618774, + "learning_rate": 2.716362841136477e-06, + "loss": 1.688, + "step": 29250 + }, + { + "epoch": 8.978207489257214, + "grad_norm": 0.1820213794708252, + "learning_rate": 2.714747045989863e-06, + "loss": 1.7801, + "step": 29251 + }, + { + "epoch": 8.978514426028239, + "grad_norm": 0.1464485377073288, + "learning_rate": 2.7131317181438355e-06, + "loss": 1.6667, + "step": 29252 + }, + { + "epoch": 8.978821362799263, + "grad_norm": 0.13353987038135529, + "learning_rate": 2.711516857614349e-06, + "loss": 1.6492, + "step": 29253 + }, + { + "epoch": 8.979128299570288, + "grad_norm": 0.14857034385204315, + "learning_rate": 2.70990246441738e-06, + "loss": 1.6902, + "step": 29254 + }, + { + "epoch": 8.979435236341313, + "grad_norm": 0.1581316888332367, 
+ "learning_rate": 2.708288538568865e-06, + "loss": 1.7188, + "step": 29255 + }, + { + "epoch": 8.979742173112339, + "grad_norm": 0.1437988132238388, + "learning_rate": 2.7066750800847695e-06, + "loss": 1.6982, + "step": 29256 + }, + { + "epoch": 8.980049109883364, + "grad_norm": 0.15172283351421356, + "learning_rate": 2.705062088981014e-06, + "loss": 1.6898, + "step": 29257 + }, + { + "epoch": 8.98035604665439, + "grad_norm": 0.2507859170436859, + "learning_rate": 2.703449565273569e-06, + "loss": 1.7433, + "step": 29258 + }, + { + "epoch": 8.980662983425415, + "grad_norm": 0.19917117059230804, + "learning_rate": 2.701837508978361e-06, + "loss": 1.7411, + "step": 29259 + }, + { + "epoch": 8.98096992019644, + "grad_norm": 0.17466393113136292, + "learning_rate": 2.7002259201113044e-06, + "loss": 1.712, + "step": 29260 + }, + { + "epoch": 8.981276856967465, + "grad_norm": 0.1595284342765808, + "learning_rate": 2.698614798688348e-06, + "loss": 1.768, + "step": 29261 + }, + { + "epoch": 8.98158379373849, + "grad_norm": 0.1435062289237976, + "learning_rate": 2.6970041447253956e-06, + "loss": 1.6715, + "step": 29262 + }, + { + "epoch": 8.981890730509516, + "grad_norm": 0.16341650485992432, + "learning_rate": 2.695393958238379e-06, + "loss": 1.7563, + "step": 29263 + }, + { + "epoch": 8.98219766728054, + "grad_norm": 0.1981598138809204, + "learning_rate": 2.6937842392432023e-06, + "loss": 1.744, + "step": 29264 + }, + { + "epoch": 8.982504604051565, + "grad_norm": 0.1611155867576599, + "learning_rate": 2.6921749877557802e-06, + "loss": 1.6874, + "step": 29265 + }, + { + "epoch": 8.98281154082259, + "grad_norm": 0.17430151998996735, + "learning_rate": 2.690566203792011e-06, + "loss": 1.7338, + "step": 29266 + }, + { + "epoch": 8.983118477593615, + "grad_norm": 0.13210003077983856, + "learning_rate": 2.688957887367799e-06, + "loss": 1.7221, + "step": 29267 + }, + { + "epoch": 8.98342541436464, + "grad_norm": 0.167892724275589, + "learning_rate": 2.6873500384990313e-06, + 
"loss": 1.6985, + "step": 29268 + }, + { + "epoch": 8.983732351135666, + "grad_norm": 0.1600649207830429, + "learning_rate": 2.685742657201601e-06, + "loss": 1.7309, + "step": 29269 + }, + { + "epoch": 8.984039287906691, + "grad_norm": 0.1755276322364807, + "learning_rate": 2.6841357434913892e-06, + "loss": 1.7173, + "step": 29270 + }, + { + "epoch": 8.984346224677717, + "grad_norm": 0.14754937589168549, + "learning_rate": 2.682529297384295e-06, + "loss": 1.6948, + "step": 29271 + }, + { + "epoch": 8.984653161448742, + "grad_norm": 0.1670856773853302, + "learning_rate": 2.6809233188961614e-06, + "loss": 1.7302, + "step": 29272 + }, + { + "epoch": 8.984960098219767, + "grad_norm": 0.18906234204769135, + "learning_rate": 2.6793178080428973e-06, + "loss": 1.7336, + "step": 29273 + }, + { + "epoch": 8.985267034990791, + "grad_norm": 0.17759168148040771, + "learning_rate": 2.6777127648403345e-06, + "loss": 1.762, + "step": 29274 + }, + { + "epoch": 8.985573971761816, + "grad_norm": 0.12218867987394333, + "learning_rate": 2.676108189304355e-06, + "loss": 1.6987, + "step": 29275 + }, + { + "epoch": 8.985880908532842, + "grad_norm": 0.1504579335451126, + "learning_rate": 2.674504081450824e-06, + "loss": 1.6683, + "step": 29276 + }, + { + "epoch": 8.986187845303867, + "grad_norm": 0.15826797485351562, + "learning_rate": 2.6729004412955616e-06, + "loss": 1.7131, + "step": 29277 + }, + { + "epoch": 8.986494782074892, + "grad_norm": 0.12599892914295197, + "learning_rate": 2.671297268854456e-06, + "loss": 1.6603, + "step": 29278 + }, + { + "epoch": 8.986801718845918, + "grad_norm": 0.17663413286209106, + "learning_rate": 2.6696945641433157e-06, + "loss": 1.7231, + "step": 29279 + }, + { + "epoch": 8.987108655616943, + "grad_norm": 0.16194280982017517, + "learning_rate": 2.668092327178001e-06, + "loss": 1.695, + "step": 29280 + }, + { + "epoch": 8.987415592387968, + "grad_norm": 0.1310044527053833, + "learning_rate": 2.6664905579743384e-06, + "loss": 1.6997, + "step": 29281 + }, 
+ { + "epoch": 8.987722529158994, + "grad_norm": 0.18553194403648376, + "learning_rate": 2.6648892565481587e-06, + "loss": 1.7594, + "step": 29282 + }, + { + "epoch": 8.988029465930019, + "grad_norm": 0.17653048038482666, + "learning_rate": 2.6632884229152887e-06, + "loss": 1.7687, + "step": 29283 + }, + { + "epoch": 8.988336402701044, + "grad_norm": 0.14085285365581512, + "learning_rate": 2.661688057091549e-06, + "loss": 1.6875, + "step": 29284 + }, + { + "epoch": 8.988643339472068, + "grad_norm": 0.14821402728557587, + "learning_rate": 2.6600881590927553e-06, + "loss": 1.7579, + "step": 29285 + }, + { + "epoch": 8.988950276243093, + "grad_norm": 0.16718199849128723, + "learning_rate": 2.658488728934716e-06, + "loss": 1.7093, + "step": 29286 + }, + { + "epoch": 8.989257213014119, + "grad_norm": 0.16012485325336456, + "learning_rate": 2.6568897666332303e-06, + "loss": 1.6937, + "step": 29287 + }, + { + "epoch": 8.989564149785144, + "grad_norm": 0.186227485537529, + "learning_rate": 2.655291272204119e-06, + "loss": 1.6682, + "step": 29288 + }, + { + "epoch": 8.98987108655617, + "grad_norm": 0.15328755974769592, + "learning_rate": 2.653693245663158e-06, + "loss": 1.7221, + "step": 29289 + }, + { + "epoch": 8.990178023327195, + "grad_norm": 0.11358486860990524, + "learning_rate": 2.6520956870261684e-06, + "loss": 1.6721, + "step": 29290 + }, + { + "epoch": 8.99048496009822, + "grad_norm": 0.16672687232494354, + "learning_rate": 2.6504985963089035e-06, + "loss": 1.7192, + "step": 29291 + }, + { + "epoch": 8.990791896869245, + "grad_norm": 0.13929708302021027, + "learning_rate": 2.6489019735271734e-06, + "loss": 1.69, + "step": 29292 + }, + { + "epoch": 8.99109883364027, + "grad_norm": 0.1592891961336136, + "learning_rate": 2.647305818696749e-06, + "loss": 1.6943, + "step": 29293 + }, + { + "epoch": 8.991405770411296, + "grad_norm": 0.1534394770860672, + "learning_rate": 2.6457101318333957e-06, + "loss": 1.6993, + "step": 29294 + }, + { + "epoch": 8.99171270718232, + 
"grad_norm": 0.17096973955631256, + "learning_rate": 2.6441149129529e-06, + "loss": 1.7627, + "step": 29295 + }, + { + "epoch": 8.992019643953345, + "grad_norm": 0.13695703446865082, + "learning_rate": 2.642520162071005e-06, + "loss": 1.7047, + "step": 29296 + }, + { + "epoch": 8.99232658072437, + "grad_norm": 0.13649116456508636, + "learning_rate": 2.6409258792034873e-06, + "loss": 1.6666, + "step": 29297 + }, + { + "epoch": 8.992633517495396, + "grad_norm": 0.13003148138523102, + "learning_rate": 2.639332064366096e-06, + "loss": 1.6862, + "step": 29298 + }, + { + "epoch": 8.99294045426642, + "grad_norm": 0.1290612667798996, + "learning_rate": 2.6377387175745894e-06, + "loss": 1.703, + "step": 29299 + }, + { + "epoch": 8.993247391037446, + "grad_norm": 0.14106552302837372, + "learning_rate": 2.636145838844706e-06, + "loss": 1.6771, + "step": 29300 + }, + { + "epoch": 8.993554327808472, + "grad_norm": 0.13510754704475403, + "learning_rate": 2.6345534281921937e-06, + "loss": 1.6569, + "step": 29301 + }, + { + "epoch": 8.993861264579497, + "grad_norm": 0.11940879374742508, + "learning_rate": 2.632961485632779e-06, + "loss": 1.6719, + "step": 29302 + }, + { + "epoch": 8.994168201350522, + "grad_norm": 0.22212430834770203, + "learning_rate": 2.6313700111822104e-06, + "loss": 1.7285, + "step": 29303 + }, + { + "epoch": 8.994475138121548, + "grad_norm": 0.144329234957695, + "learning_rate": 2.629779004856192e-06, + "loss": 1.6928, + "step": 29304 + }, + { + "epoch": 8.994782074892573, + "grad_norm": 0.14428433775901794, + "learning_rate": 2.6281884666704837e-06, + "loss": 1.7371, + "step": 29305 + }, + { + "epoch": 8.995089011663598, + "grad_norm": 0.12600816786289215, + "learning_rate": 2.6265983966407615e-06, + "loss": 1.6803, + "step": 29306 + }, + { + "epoch": 8.995395948434622, + "grad_norm": 0.14739328622817993, + "learning_rate": 2.6250087947827793e-06, + "loss": 1.7135, + "step": 29307 + }, + { + "epoch": 8.995702885205647, + "grad_norm": 0.14694075286388397, + 
"learning_rate": 2.623419661112209e-06, + "loss": 1.7161, + "step": 29308 + }, + { + "epoch": 8.996009821976672, + "grad_norm": 0.1703605204820633, + "learning_rate": 2.6218309956447864e-06, + "loss": 1.7415, + "step": 29309 + }, + { + "epoch": 8.996316758747698, + "grad_norm": 0.1334623247385025, + "learning_rate": 2.6202427983961996e-06, + "loss": 1.7227, + "step": 29310 + }, + { + "epoch": 8.996623695518723, + "grad_norm": 0.16613437235355377, + "learning_rate": 2.6186550693821364e-06, + "loss": 1.6925, + "step": 29311 + }, + { + "epoch": 8.996930632289748, + "grad_norm": 0.12817926704883575, + "learning_rate": 2.617067808618301e-06, + "loss": 1.6296, + "step": 29312 + }, + { + "epoch": 8.997237569060774, + "grad_norm": 0.13783088326454163, + "learning_rate": 2.6154810161203693e-06, + "loss": 1.6801, + "step": 29313 + }, + { + "epoch": 8.997544505831799, + "grad_norm": 0.19866502285003662, + "learning_rate": 2.6138946919040285e-06, + "loss": 1.7817, + "step": 29314 + }, + { + "epoch": 8.997851442602824, + "grad_norm": 0.12466265261173248, + "learning_rate": 2.61230883598495e-06, + "loss": 1.7001, + "step": 29315 + }, + { + "epoch": 8.99815837937385, + "grad_norm": 0.13250842690467834, + "learning_rate": 2.6107234483788158e-06, + "loss": 1.6932, + "step": 29316 + }, + { + "epoch": 8.998465316144873, + "grad_norm": 0.13475441932678223, + "learning_rate": 2.6091385291012904e-06, + "loss": 1.6906, + "step": 29317 + }, + { + "epoch": 8.998772252915899, + "grad_norm": 0.14250501990318298, + "learning_rate": 2.6075540781680284e-06, + "loss": 1.7032, + "step": 29318 + }, + { + "epoch": 8.999079189686924, + "grad_norm": 0.11724159866571426, + "learning_rate": 2.6059700955947007e-06, + "loss": 1.6319, + "step": 29319 + }, + { + "epoch": 8.99938612645795, + "grad_norm": 0.15192265808582306, + "learning_rate": 2.6043865813969505e-06, + "loss": 1.699, + "step": 29320 + }, + { + "epoch": 8.999693063228975, + "grad_norm": 0.14814937114715576, + "learning_rate": 
2.6028035355904257e-06, + "loss": 1.7313, + "step": 29321 + }, + { + "epoch": 9.0, + "grad_norm": 0.20881028473377228, + "learning_rate": 2.6012209581907922e-06, + "loss": 1.8009, + "step": 29322 + }, + { + "epoch": 9.000306936771025, + "grad_norm": 0.15227021276950836, + "learning_rate": 2.5996388492136593e-06, + "loss": 1.7501, + "step": 29323 + }, + { + "epoch": 9.00061387354205, + "grad_norm": 0.1541164219379425, + "learning_rate": 2.598057208674692e-06, + "loss": 1.727, + "step": 29324 + }, + { + "epoch": 9.000920810313076, + "grad_norm": 0.15358538925647736, + "learning_rate": 2.596476036589496e-06, + "loss": 1.7363, + "step": 29325 + }, + { + "epoch": 9.001227747084101, + "grad_norm": 0.13264121115207672, + "learning_rate": 2.5948953329737126e-06, + "loss": 1.6988, + "step": 29326 + }, + { + "epoch": 9.001534683855127, + "grad_norm": 0.13748973608016968, + "learning_rate": 2.593315097842963e-06, + "loss": 1.7003, + "step": 29327 + }, + { + "epoch": 9.00184162062615, + "grad_norm": 0.1346716433763504, + "learning_rate": 2.5917353312128467e-06, + "loss": 1.6819, + "step": 29328 + }, + { + "epoch": 9.002148557397176, + "grad_norm": 0.13923269510269165, + "learning_rate": 2.5901560330990006e-06, + "loss": 1.7161, + "step": 29329 + }, + { + "epoch": 9.002455494168201, + "grad_norm": 0.17402863502502441, + "learning_rate": 2.588577203517012e-06, + "loss": 1.7039, + "step": 29330 + }, + { + "epoch": 9.002762430939226, + "grad_norm": 0.14584888517856598, + "learning_rate": 2.5869988424824964e-06, + "loss": 1.7306, + "step": 29331 + }, + { + "epoch": 9.003069367710252, + "grad_norm": 0.12232481688261032, + "learning_rate": 2.5854209500110472e-06, + "loss": 1.6689, + "step": 29332 + }, + { + "epoch": 9.003376304481277, + "grad_norm": 0.15231020748615265, + "learning_rate": 2.583843526118257e-06, + "loss": 1.7308, + "step": 29333 + }, + { + "epoch": 9.003683241252302, + "grad_norm": 0.1362350732088089, + "learning_rate": 2.582266570819719e-06, + "loss": 1.7089, + 
"step": 29334 + }, + { + "epoch": 9.003990178023328, + "grad_norm": 0.16162967681884766, + "learning_rate": 2.5806900841310154e-06, + "loss": 1.7254, + "step": 29335 + }, + { + "epoch": 9.004297114794353, + "grad_norm": 0.19027012586593628, + "learning_rate": 2.579114066067723e-06, + "loss": 1.7523, + "step": 29336 + }, + { + "epoch": 9.004604051565378, + "grad_norm": 0.15073107182979584, + "learning_rate": 2.5775385166454224e-06, + "loss": 1.7219, + "step": 29337 + }, + { + "epoch": 9.004910988336404, + "grad_norm": 0.18943648040294647, + "learning_rate": 2.5759634358796746e-06, + "loss": 1.7052, + "step": 29338 + }, + { + "epoch": 9.005217925107427, + "grad_norm": 0.17359869182109833, + "learning_rate": 2.5743888237860615e-06, + "loss": 1.7475, + "step": 29339 + }, + { + "epoch": 9.005524861878452, + "grad_norm": 0.1170465275645256, + "learning_rate": 2.5728146803801256e-06, + "loss": 1.6514, + "step": 29340 + }, + { + "epoch": 9.005831798649478, + "grad_norm": 0.19763801991939545, + "learning_rate": 2.5712410056774494e-06, + "loss": 1.7476, + "step": 29341 + }, + { + "epoch": 9.006138735420503, + "grad_norm": 0.11056608706712723, + "learning_rate": 2.569667799693548e-06, + "loss": 1.6644, + "step": 29342 + }, + { + "epoch": 9.006445672191528, + "grad_norm": 0.11823355406522751, + "learning_rate": 2.5680950624440038e-06, + "loss": 1.6476, + "step": 29343 + }, + { + "epoch": 9.006752608962554, + "grad_norm": 0.12750595808029175, + "learning_rate": 2.5665227939443425e-06, + "loss": 1.6715, + "step": 29344 + }, + { + "epoch": 9.00705954573358, + "grad_norm": 0.14100933074951172, + "learning_rate": 2.5649509942100967e-06, + "loss": 1.6917, + "step": 29345 + }, + { + "epoch": 9.007366482504604, + "grad_norm": 0.15324008464813232, + "learning_rate": 2.5633796632568207e-06, + "loss": 1.7051, + "step": 29346 + }, + { + "epoch": 9.00767341927563, + "grad_norm": 0.112611785531044, + "learning_rate": 2.5618088011000183e-06, + "loss": 1.6693, + "step": 29347 + }, + { + 
"epoch": 9.007980356046655, + "grad_norm": 0.1416759490966797, + "learning_rate": 2.560238407755228e-06, + "loss": 1.725, + "step": 29348 + }, + { + "epoch": 9.008287292817679, + "grad_norm": 0.10026350617408752, + "learning_rate": 2.558668483237969e-06, + "loss": 1.6199, + "step": 29349 + }, + { + "epoch": 9.008594229588704, + "grad_norm": 0.17179331183433533, + "learning_rate": 2.5570990275637585e-06, + "loss": 1.708, + "step": 29350 + }, + { + "epoch": 9.00890116635973, + "grad_norm": 0.17252036929130554, + "learning_rate": 2.5555300407480996e-06, + "loss": 1.6981, + "step": 29351 + }, + { + "epoch": 9.009208103130755, + "grad_norm": 0.1174364760518074, + "learning_rate": 2.5539615228064973e-06, + "loss": 1.6498, + "step": 29352 + }, + { + "epoch": 9.00951503990178, + "grad_norm": 0.16481025516986847, + "learning_rate": 2.552393473754461e-06, + "loss": 1.6855, + "step": 29353 + }, + { + "epoch": 9.009821976672805, + "grad_norm": 0.15297551453113556, + "learning_rate": 2.5508258936074836e-06, + "loss": 1.7085, + "step": 29354 + }, + { + "epoch": 9.01012891344383, + "grad_norm": 0.182330921292305, + "learning_rate": 2.5492587823810476e-06, + "loss": 1.6955, + "step": 29355 + }, + { + "epoch": 9.010435850214856, + "grad_norm": 0.13587582111358643, + "learning_rate": 2.547692140090657e-06, + "loss": 1.6933, + "step": 29356 + }, + { + "epoch": 9.010742786985881, + "grad_norm": 0.16220442950725555, + "learning_rate": 2.5461259667517723e-06, + "loss": 1.7386, + "step": 29357 + }, + { + "epoch": 9.011049723756907, + "grad_norm": 0.14470438659191132, + "learning_rate": 2.5445602623799025e-06, + "loss": 1.6613, + "step": 29358 + }, + { + "epoch": 9.011356660527932, + "grad_norm": 0.17066054046154022, + "learning_rate": 2.5429950269904856e-06, + "loss": 1.6925, + "step": 29359 + }, + { + "epoch": 9.011663597298956, + "grad_norm": 0.178157240152359, + "learning_rate": 2.541430260599026e-06, + "loss": 1.7966, + "step": 29360 + }, + { + "epoch": 9.011970534069981, + 
"grad_norm": 0.19744886457920074, + "learning_rate": 2.5398659632209552e-06, + "loss": 1.7737, + "step": 29361 + }, + { + "epoch": 9.012277470841006, + "grad_norm": 0.1326957792043686, + "learning_rate": 2.538302134871745e-06, + "loss": 1.6749, + "step": 29362 + }, + { + "epoch": 9.012584407612032, + "grad_norm": 0.1415095329284668, + "learning_rate": 2.5367387755668602e-06, + "loss": 1.68, + "step": 29363 + }, + { + "epoch": 9.012891344383057, + "grad_norm": 0.13428185880184174, + "learning_rate": 2.535175885321733e-06, + "loss": 1.6674, + "step": 29364 + }, + { + "epoch": 9.013198281154082, + "grad_norm": 0.1266496479511261, + "learning_rate": 2.5336134641518183e-06, + "loss": 1.6538, + "step": 29365 + }, + { + "epoch": 9.013505217925108, + "grad_norm": 0.1252683401107788, + "learning_rate": 2.532051512072564e-06, + "loss": 1.6552, + "step": 29366 + }, + { + "epoch": 9.013812154696133, + "grad_norm": 0.13982512056827545, + "learning_rate": 2.5304900290993916e-06, + "loss": 1.6614, + "step": 29367 + }, + { + "epoch": 9.014119091467158, + "grad_norm": 0.15743471682071686, + "learning_rate": 2.528929015247744e-06, + "loss": 1.759, + "step": 29368 + }, + { + "epoch": 9.014426028238184, + "grad_norm": 0.12230109423398972, + "learning_rate": 2.5273684705330424e-06, + "loss": 1.6955, + "step": 29369 + }, + { + "epoch": 9.014732965009209, + "grad_norm": 0.13204556703567505, + "learning_rate": 2.525808394970708e-06, + "loss": 1.7259, + "step": 29370 + }, + { + "epoch": 9.015039901780233, + "grad_norm": 0.15656854212284088, + "learning_rate": 2.5242487885761614e-06, + "loss": 1.7027, + "step": 29371 + }, + { + "epoch": 9.015346838551258, + "grad_norm": 0.1528550535440445, + "learning_rate": 2.5226896513648178e-06, + "loss": 1.6665, + "step": 29372 + }, + { + "epoch": 9.015653775322283, + "grad_norm": 0.26738816499710083, + "learning_rate": 2.5211309833520825e-06, + "loss": 1.7587, + "step": 29373 + }, + { + "epoch": 9.015960712093309, + "grad_norm": 0.19041690230369568, + 
"learning_rate": 2.519572784553348e-06, + "loss": 1.7452, + "step": 29374 + }, + { + "epoch": 9.016267648864334, + "grad_norm": 0.1666717827320099, + "learning_rate": 2.518015054984041e-06, + "loss": 1.7117, + "step": 29375 + }, + { + "epoch": 9.01657458563536, + "grad_norm": 0.18895253539085388, + "learning_rate": 2.5164577946595214e-06, + "loss": 1.7671, + "step": 29376 + }, + { + "epoch": 9.016881522406385, + "grad_norm": 0.1346922218799591, + "learning_rate": 2.5149010035952158e-06, + "loss": 1.6986, + "step": 29377 + }, + { + "epoch": 9.01718845917741, + "grad_norm": 0.15223844349384308, + "learning_rate": 2.5133446818064786e-06, + "loss": 1.725, + "step": 29378 + }, + { + "epoch": 9.017495395948435, + "grad_norm": 0.19043175876140594, + "learning_rate": 2.511788829308703e-06, + "loss": 1.733, + "step": 29379 + }, + { + "epoch": 9.01780233271946, + "grad_norm": 0.17035910487174988, + "learning_rate": 2.510233446117272e-06, + "loss": 1.6885, + "step": 29380 + }, + { + "epoch": 9.018109269490484, + "grad_norm": 0.18320874869823456, + "learning_rate": 2.5086785322475325e-06, + "loss": 1.7783, + "step": 29381 + }, + { + "epoch": 9.01841620626151, + "grad_norm": 0.13961733877658844, + "learning_rate": 2.507124087714885e-06, + "loss": 1.6768, + "step": 29382 + }, + { + "epoch": 9.018723143032535, + "grad_norm": 0.12573479115962982, + "learning_rate": 2.505570112534661e-06, + "loss": 1.6902, + "step": 29383 + }, + { + "epoch": 9.01903007980356, + "grad_norm": 0.15192000567913055, + "learning_rate": 2.504016606722237e-06, + "loss": 1.7115, + "step": 29384 + }, + { + "epoch": 9.019337016574585, + "grad_norm": 0.16358907520771027, + "learning_rate": 2.5024635702929565e-06, + "loss": 1.7075, + "step": 29385 + }, + { + "epoch": 9.01964395334561, + "grad_norm": 0.13516998291015625, + "learning_rate": 2.500911003262174e-06, + "loss": 1.6584, + "step": 29386 + }, + { + "epoch": 9.019950890116636, + "grad_norm": 0.13729408383369446, + "learning_rate": 2.4993589056452215e-06, 
+ "loss": 1.7039, + "step": 29387 + }, + { + "epoch": 9.020257826887661, + "grad_norm": 0.1284191608428955, + "learning_rate": 2.4978072774574533e-06, + "loss": 1.6652, + "step": 29388 + }, + { + "epoch": 9.020564763658687, + "grad_norm": 0.1911778301000595, + "learning_rate": 2.4962561187141906e-06, + "loss": 1.747, + "step": 29389 + }, + { + "epoch": 9.020871700429712, + "grad_norm": 0.1233893632888794, + "learning_rate": 2.4947054294307714e-06, + "loss": 1.6631, + "step": 29390 + }, + { + "epoch": 9.021178637200737, + "grad_norm": 0.1692901849746704, + "learning_rate": 2.493155209622511e-06, + "loss": 1.7598, + "step": 29391 + }, + { + "epoch": 9.021485573971761, + "grad_norm": 0.1659780591726303, + "learning_rate": 2.4916054593047468e-06, + "loss": 1.7915, + "step": 29392 + }, + { + "epoch": 9.021792510742786, + "grad_norm": 0.1684655100107193, + "learning_rate": 2.4900561784927667e-06, + "loss": 1.7324, + "step": 29393 + }, + { + "epoch": 9.022099447513812, + "grad_norm": 0.21327704191207886, + "learning_rate": 2.488507367201914e-06, + "loss": 1.729, + "step": 29394 + }, + { + "epoch": 9.022406384284837, + "grad_norm": 0.16245315968990326, + "learning_rate": 2.486959025447472e-06, + "loss": 1.7569, + "step": 29395 + }, + { + "epoch": 9.022713321055862, + "grad_norm": 0.15231920778751373, + "learning_rate": 2.4854111532447435e-06, + "loss": 1.7024, + "step": 29396 + }, + { + "epoch": 9.023020257826888, + "grad_norm": 0.20816101133823395, + "learning_rate": 2.4838637506090447e-06, + "loss": 1.7198, + "step": 29397 + }, + { + "epoch": 9.023327194597913, + "grad_norm": 0.1711280196905136, + "learning_rate": 2.4823168175556357e-06, + "loss": 1.7073, + "step": 29398 + }, + { + "epoch": 9.023634131368938, + "grad_norm": 0.14723099768161774, + "learning_rate": 2.480770354099843e-06, + "loss": 1.7472, + "step": 29399 + }, + { + "epoch": 9.023941068139964, + "grad_norm": 0.23221471905708313, + "learning_rate": 2.47922436025691e-06, + "loss": 1.7639, + "step": 29400 + }, 
+ { + "epoch": 9.024248004910989, + "grad_norm": 0.13510727882385254, + "learning_rate": 2.4776788360421466e-06, + "loss": 1.7258, + "step": 29401 + }, + { + "epoch": 9.024554941682014, + "grad_norm": 0.2099999636411667, + "learning_rate": 2.476133781470813e-06, + "loss": 1.8019, + "step": 29402 + }, + { + "epoch": 9.024861878453038, + "grad_norm": 0.13297688961029053, + "learning_rate": 2.47458919655818e-06, + "loss": 1.7031, + "step": 29403 + }, + { + "epoch": 9.025168815224063, + "grad_norm": 0.14716757833957672, + "learning_rate": 2.4730450813195138e-06, + "loss": 1.677, + "step": 29404 + }, + { + "epoch": 9.025475751995089, + "grad_norm": 0.14763082563877106, + "learning_rate": 2.4715014357700683e-06, + "loss": 1.6863, + "step": 29405 + }, + { + "epoch": 9.025782688766114, + "grad_norm": 0.15744271874427795, + "learning_rate": 2.469958259925109e-06, + "loss": 1.7236, + "step": 29406 + }, + { + "epoch": 9.02608962553714, + "grad_norm": 0.19316953420639038, + "learning_rate": 2.4684155537998743e-06, + "loss": 1.7547, + "step": 29407 + }, + { + "epoch": 9.026396562308165, + "grad_norm": 0.14727036654949188, + "learning_rate": 2.4668733174096126e-06, + "loss": 1.7346, + "step": 29408 + }, + { + "epoch": 9.02670349907919, + "grad_norm": 0.14740467071533203, + "learning_rate": 2.465331550769584e-06, + "loss": 1.7109, + "step": 29409 + }, + { + "epoch": 9.027010435850215, + "grad_norm": 0.1295071691274643, + "learning_rate": 2.463790253894993e-06, + "loss": 1.711, + "step": 29410 + }, + { + "epoch": 9.02731737262124, + "grad_norm": 0.20718778669834137, + "learning_rate": 2.4622494268011054e-06, + "loss": 1.7146, + "step": 29411 + }, + { + "epoch": 9.027624309392266, + "grad_norm": 0.13038063049316406, + "learning_rate": 2.46070906950312e-06, + "loss": 1.6533, + "step": 29412 + }, + { + "epoch": 9.027931246163291, + "grad_norm": 0.18726535141468048, + "learning_rate": 2.459169182016269e-06, + "loss": 1.7144, + "step": 29413 + }, + { + "epoch": 9.028238182934315, + 
"grad_norm": 0.1343640834093094, + "learning_rate": 2.4576297643557843e-06, + "loss": 1.7014, + "step": 29414 + }, + { + "epoch": 9.02854511970534, + "grad_norm": 0.1509372591972351, + "learning_rate": 2.4560908165368544e-06, + "loss": 1.6999, + "step": 29415 + }, + { + "epoch": 9.028852056476365, + "grad_norm": 0.1541101038455963, + "learning_rate": 2.4545523385747172e-06, + "loss": 1.6962, + "step": 29416 + }, + { + "epoch": 9.02915899324739, + "grad_norm": 0.16334660351276398, + "learning_rate": 2.4530143304845432e-06, + "loss": 1.7293, + "step": 29417 + }, + { + "epoch": 9.029465930018416, + "grad_norm": 0.14802905917167664, + "learning_rate": 2.4514767922815595e-06, + "loss": 1.6747, + "step": 29418 + }, + { + "epoch": 9.029772866789441, + "grad_norm": 0.19622576236724854, + "learning_rate": 2.4499397239809487e-06, + "loss": 1.764, + "step": 29419 + }, + { + "epoch": 9.030079803560467, + "grad_norm": 0.14734432101249695, + "learning_rate": 2.4484031255979036e-06, + "loss": 1.7371, + "step": 29420 + }, + { + "epoch": 9.030386740331492, + "grad_norm": 0.16914428770542145, + "learning_rate": 2.4468669971476123e-06, + "loss": 1.7148, + "step": 29421 + }, + { + "epoch": 9.030693677102517, + "grad_norm": 0.13942086696624756, + "learning_rate": 2.4453313386452516e-06, + "loss": 1.704, + "step": 29422 + }, + { + "epoch": 9.031000613873543, + "grad_norm": 0.12403316050767899, + "learning_rate": 2.4437961501060036e-06, + "loss": 1.6567, + "step": 29423 + }, + { + "epoch": 9.031307550644566, + "grad_norm": 0.14684323966503143, + "learning_rate": 2.4422614315450287e-06, + "loss": 1.7452, + "step": 29424 + }, + { + "epoch": 9.031614487415592, + "grad_norm": 0.1687471866607666, + "learning_rate": 2.440727182977498e-06, + "loss": 1.7265, + "step": 29425 + }, + { + "epoch": 9.031921424186617, + "grad_norm": 0.14509400725364685, + "learning_rate": 2.439193404418588e-06, + "loss": 1.6855, + "step": 29426 + }, + { + "epoch": 9.032228360957642, + "grad_norm": 0.13958261907100677, 
+ "learning_rate": 2.4376600958834373e-06, + "loss": 1.7458, + "step": 29427 + }, + { + "epoch": 9.032535297728668, + "grad_norm": 0.18749283254146576, + "learning_rate": 2.436127257387211e-06, + "loss": 1.725, + "step": 29428 + }, + { + "epoch": 9.032842234499693, + "grad_norm": 0.1423102170228958, + "learning_rate": 2.434594888945052e-06, + "loss": 1.6655, + "step": 29429 + }, + { + "epoch": 9.033149171270718, + "grad_norm": 0.17062890529632568, + "learning_rate": 2.433062990572099e-06, + "loss": 1.7059, + "step": 29430 + }, + { + "epoch": 9.033456108041744, + "grad_norm": 0.15203866362571716, + "learning_rate": 2.4315315622835124e-06, + "loss": 1.709, + "step": 29431 + }, + { + "epoch": 9.033763044812769, + "grad_norm": 0.21039290726184845, + "learning_rate": 2.4300006040943956e-06, + "loss": 1.7815, + "step": 29432 + }, + { + "epoch": 9.034069981583794, + "grad_norm": 0.17041321098804474, + "learning_rate": 2.428470116019904e-06, + "loss": 1.7417, + "step": 29433 + }, + { + "epoch": 9.03437691835482, + "grad_norm": 0.19286702573299408, + "learning_rate": 2.426940098075148e-06, + "loss": 1.7186, + "step": 29434 + }, + { + "epoch": 9.034683855125843, + "grad_norm": 0.20875763893127441, + "learning_rate": 2.425410550275253e-06, + "loss": 1.7379, + "step": 29435 + }, + { + "epoch": 9.034990791896869, + "grad_norm": 0.16214729845523834, + "learning_rate": 2.4238814726353365e-06, + "loss": 1.7419, + "step": 29436 + }, + { + "epoch": 9.035297728667894, + "grad_norm": 0.16366153955459595, + "learning_rate": 2.422352865170513e-06, + "loss": 1.7399, + "step": 29437 + }, + { + "epoch": 9.03560466543892, + "grad_norm": 0.15280435979366302, + "learning_rate": 2.420824727895882e-06, + "loss": 1.6898, + "step": 29438 + }, + { + "epoch": 9.035911602209945, + "grad_norm": 0.1929275393486023, + "learning_rate": 2.4192970608265477e-06, + "loss": 1.7452, + "step": 29439 + }, + { + "epoch": 9.03621853898097, + "grad_norm": 0.15144196152687073, + "learning_rate": 
2.417769863977609e-06, + "loss": 1.6924, + "step": 29440 + }, + { + "epoch": 9.036525475751995, + "grad_norm": 0.11187378317117691, + "learning_rate": 2.4162431373641546e-06, + "loss": 1.6537, + "step": 29441 + }, + { + "epoch": 9.03683241252302, + "grad_norm": 0.14815855026245117, + "learning_rate": 2.4147168810012664e-06, + "loss": 1.7089, + "step": 29442 + }, + { + "epoch": 9.037139349294046, + "grad_norm": 0.18288609385490417, + "learning_rate": 2.413191094904055e-06, + "loss": 1.685, + "step": 29443 + }, + { + "epoch": 9.037446286065071, + "grad_norm": 0.13843944668769836, + "learning_rate": 2.4116657790875686e-06, + "loss": 1.6736, + "step": 29444 + }, + { + "epoch": 9.037753222836097, + "grad_norm": 0.11480217427015305, + "learning_rate": 2.410140933566901e-06, + "loss": 1.6416, + "step": 29445 + }, + { + "epoch": 9.03806015960712, + "grad_norm": 0.16542355716228485, + "learning_rate": 2.408616558357113e-06, + "loss": 1.7019, + "step": 29446 + }, + { + "epoch": 9.038367096378146, + "grad_norm": 0.1372150480747223, + "learning_rate": 2.4070926534732586e-06, + "loss": 1.6731, + "step": 29447 + }, + { + "epoch": 9.03867403314917, + "grad_norm": 0.16052548587322235, + "learning_rate": 2.4055692189304257e-06, + "loss": 1.7016, + "step": 29448 + }, + { + "epoch": 9.038980969920196, + "grad_norm": 0.14994394779205322, + "learning_rate": 2.4040462547436416e-06, + "loss": 1.7227, + "step": 29449 + }, + { + "epoch": 9.039287906691222, + "grad_norm": 0.1549554169178009, + "learning_rate": 2.4025237609279827e-06, + "loss": 1.7085, + "step": 29450 + }, + { + "epoch": 9.039594843462247, + "grad_norm": 0.13443107903003693, + "learning_rate": 2.401001737498465e-06, + "loss": 1.6809, + "step": 29451 + }, + { + "epoch": 9.039901780233272, + "grad_norm": 0.18695014715194702, + "learning_rate": 2.39948018447016e-06, + "loss": 1.7208, + "step": 29452 + }, + { + "epoch": 9.040208717004298, + "grad_norm": 0.1901451200246811, + "learning_rate": 2.397959101858083e-06, + "loss": 
1.7576, + "step": 29453 + }, + { + "epoch": 9.040515653775323, + "grad_norm": 0.13147258758544922, + "learning_rate": 2.396438489677283e-06, + "loss": 1.6713, + "step": 29454 + }, + { + "epoch": 9.040822590546348, + "grad_norm": 0.1695723831653595, + "learning_rate": 2.3949183479427704e-06, + "loss": 1.7205, + "step": 29455 + }, + { + "epoch": 9.041129527317372, + "grad_norm": 0.1526571363210678, + "learning_rate": 2.393398676669584e-06, + "loss": 1.6849, + "step": 29456 + }, + { + "epoch": 9.041436464088397, + "grad_norm": 0.13576491177082062, + "learning_rate": 2.3918794758727325e-06, + "loss": 1.6911, + "step": 29457 + }, + { + "epoch": 9.041743400859422, + "grad_norm": 0.15050055086612701, + "learning_rate": 2.390360745567233e-06, + "loss": 1.7058, + "step": 29458 + }, + { + "epoch": 9.042050337630448, + "grad_norm": 0.16959871351718903, + "learning_rate": 2.3888424857680837e-06, + "loss": 1.741, + "step": 29459 + }, + { + "epoch": 9.042357274401473, + "grad_norm": 0.1468123197555542, + "learning_rate": 2.3873246964903116e-06, + "loss": 1.6996, + "step": 29460 + }, + { + "epoch": 9.042664211172498, + "grad_norm": 0.14826932549476624, + "learning_rate": 2.385807377748894e-06, + "loss": 1.7072, + "step": 29461 + }, + { + "epoch": 9.042971147943524, + "grad_norm": 0.13068771362304688, + "learning_rate": 2.384290529558847e-06, + "loss": 1.6627, + "step": 29462 + }, + { + "epoch": 9.043278084714549, + "grad_norm": 0.18755924701690674, + "learning_rate": 2.382774151935141e-06, + "loss": 1.7313, + "step": 29463 + }, + { + "epoch": 9.043585021485574, + "grad_norm": 0.1287360042333603, + "learning_rate": 2.38125824489277e-06, + "loss": 1.6894, + "step": 29464 + }, + { + "epoch": 9.0438919582566, + "grad_norm": 0.1582459807395935, + "learning_rate": 2.3797428084467223e-06, + "loss": 1.7206, + "step": 29465 + }, + { + "epoch": 9.044198895027625, + "grad_norm": 0.20703738927841187, + "learning_rate": 2.3782278426119575e-06, + "loss": 1.7241, + "step": 29466 + }, + { + 
"epoch": 9.044505831798649, + "grad_norm": 0.14492042362689972, + "learning_rate": 2.3767133474034696e-06, + "loss": 1.7058, + "step": 29467 + }, + { + "epoch": 9.044812768569674, + "grad_norm": 0.16977067291736603, + "learning_rate": 2.375199322836197e-06, + "loss": 1.7059, + "step": 29468 + }, + { + "epoch": 9.0451197053407, + "grad_norm": 0.1448739618062973, + "learning_rate": 2.3736857689251267e-06, + "loss": 1.7367, + "step": 29469 + }, + { + "epoch": 9.045426642111725, + "grad_norm": 0.13738159835338593, + "learning_rate": 2.372172685685209e-06, + "loss": 1.7058, + "step": 29470 + }, + { + "epoch": 9.04573357888275, + "grad_norm": 0.1473991870880127, + "learning_rate": 2.3706600731313976e-06, + "loss": 1.6706, + "step": 29471 + }, + { + "epoch": 9.046040515653775, + "grad_norm": 0.18705418705940247, + "learning_rate": 2.369147931278637e-06, + "loss": 1.7088, + "step": 29472 + }, + { + "epoch": 9.0463474524248, + "grad_norm": 0.14573143422603607, + "learning_rate": 2.3676362601418757e-06, + "loss": 1.6886, + "step": 29473 + }, + { + "epoch": 9.046654389195826, + "grad_norm": 0.1586790531873703, + "learning_rate": 2.3661250597360518e-06, + "loss": 1.6573, + "step": 29474 + }, + { + "epoch": 9.046961325966851, + "grad_norm": 0.14579340815544128, + "learning_rate": 2.364614330076098e-06, + "loss": 1.7235, + "step": 29475 + }, + { + "epoch": 9.047268262737877, + "grad_norm": 0.11558994650840759, + "learning_rate": 2.3631040711769358e-06, + "loss": 1.6597, + "step": 29476 + }, + { + "epoch": 9.047575199508902, + "grad_norm": 0.1311790943145752, + "learning_rate": 2.36159428305352e-06, + "loss": 1.6846, + "step": 29477 + }, + { + "epoch": 9.047882136279926, + "grad_norm": 0.17676955461502075, + "learning_rate": 2.3600849657207323e-06, + "loss": 1.7511, + "step": 29478 + }, + { + "epoch": 9.04818907305095, + "grad_norm": 0.1472693681716919, + "learning_rate": 2.358576119193523e-06, + "loss": 1.6836, + "step": 29479 + }, + { + "epoch": 9.048496009821976, + 
"grad_norm": 0.15737339854240417, + "learning_rate": 2.3570677434867795e-06, + "loss": 1.7285, + "step": 29480 + }, + { + "epoch": 9.048802946593002, + "grad_norm": 0.17748746275901794, + "learning_rate": 2.355559838615412e-06, + "loss": 1.7267, + "step": 29481 + }, + { + "epoch": 9.049109883364027, + "grad_norm": 0.12016935646533966, + "learning_rate": 2.3540524045943425e-06, + "loss": 1.6677, + "step": 29482 + }, + { + "epoch": 9.049416820135052, + "grad_norm": 0.1696930080652237, + "learning_rate": 2.352545441438442e-06, + "loss": 1.6765, + "step": 29483 + }, + { + "epoch": 9.049723756906078, + "grad_norm": 0.17330607771873474, + "learning_rate": 2.3510389491626208e-06, + "loss": 1.727, + "step": 29484 + }, + { + "epoch": 9.050030693677103, + "grad_norm": 0.14688768982887268, + "learning_rate": 2.3495329277817502e-06, + "loss": 1.7149, + "step": 29485 + }, + { + "epoch": 9.050337630448128, + "grad_norm": 0.14381086826324463, + "learning_rate": 2.3480273773107297e-06, + "loss": 1.6448, + "step": 29486 + }, + { + "epoch": 9.050644567219154, + "grad_norm": 0.14638835191726685, + "learning_rate": 2.3465222977644364e-06, + "loss": 1.6979, + "step": 29487 + }, + { + "epoch": 9.050951503990179, + "grad_norm": 0.13770419359207153, + "learning_rate": 2.345017689157736e-06, + "loss": 1.6991, + "step": 29488 + }, + { + "epoch": 9.051258440761202, + "grad_norm": 0.16549327969551086, + "learning_rate": 2.3435135515055053e-06, + "loss": 1.681, + "step": 29489 + }, + { + "epoch": 9.051565377532228, + "grad_norm": 0.19915145635604858, + "learning_rate": 2.3420098848226046e-06, + "loss": 1.7726, + "step": 29490 + }, + { + "epoch": 9.051872314303253, + "grad_norm": 0.15350405871868134, + "learning_rate": 2.3405066891238945e-06, + "loss": 1.7159, + "step": 29491 + }, + { + "epoch": 9.052179251074278, + "grad_norm": 0.1314122974872589, + "learning_rate": 2.3390039644242356e-06, + "loss": 1.7224, + "step": 29492 + }, + { + "epoch": 9.052486187845304, + "grad_norm": 
0.18343986570835114, + "learning_rate": 2.3375017107384655e-06, + "loss": 1.7572, + "step": 29493 + }, + { + "epoch": 9.05279312461633, + "grad_norm": 0.1556810587644577, + "learning_rate": 2.3359999280814506e-06, + "loss": 1.6476, + "step": 29494 + }, + { + "epoch": 9.053100061387354, + "grad_norm": 0.11017484217882156, + "learning_rate": 2.334498616468017e-06, + "loss": 1.6544, + "step": 29495 + }, + { + "epoch": 9.05340699815838, + "grad_norm": 0.1391851007938385, + "learning_rate": 2.332997775913004e-06, + "loss": 1.6594, + "step": 29496 + }, + { + "epoch": 9.053713934929405, + "grad_norm": 0.1584119200706482, + "learning_rate": 2.3314974064312433e-06, + "loss": 1.7588, + "step": 29497 + }, + { + "epoch": 9.05402087170043, + "grad_norm": 0.10139171034097672, + "learning_rate": 2.3299975080375625e-06, + "loss": 1.6621, + "step": 29498 + }, + { + "epoch": 9.054327808471454, + "grad_norm": 0.14895425736904144, + "learning_rate": 2.3284980807467994e-06, + "loss": 1.7145, + "step": 29499 + }, + { + "epoch": 9.05463474524248, + "grad_norm": 0.11982736736536026, + "learning_rate": 2.326999124573742e-06, + "loss": 1.6654, + "step": 29500 + }, + { + "epoch": 9.054941682013505, + "grad_norm": 0.15541890263557434, + "learning_rate": 2.32550063953324e-06, + "loss": 1.6889, + "step": 29501 + }, + { + "epoch": 9.05524861878453, + "grad_norm": 0.13237549364566803, + "learning_rate": 2.324002625640065e-06, + "loss": 1.7191, + "step": 29502 + }, + { + "epoch": 9.055555555555555, + "grad_norm": 0.16847456991672516, + "learning_rate": 2.3225050829090546e-06, + "loss": 1.7064, + "step": 29503 + }, + { + "epoch": 9.05586249232658, + "grad_norm": 0.16782483458518982, + "learning_rate": 2.321008011354986e-06, + "loss": 1.7303, + "step": 29504 + }, + { + "epoch": 9.056169429097606, + "grad_norm": 0.1684166043996811, + "learning_rate": 2.3195114109926643e-06, + "loss": 1.7071, + "step": 29505 + }, + { + "epoch": 9.056476365868631, + "grad_norm": 0.11413996666669846, + "learning_rate": 
2.3180152818368774e-06, + "loss": 1.6664, + "step": 29506 + }, + { + "epoch": 9.056783302639657, + "grad_norm": 0.14353851974010468, + "learning_rate": 2.316519623902408e-06, + "loss": 1.7375, + "step": 29507 + }, + { + "epoch": 9.057090239410682, + "grad_norm": 0.20431090891361237, + "learning_rate": 2.315024437204044e-06, + "loss": 1.7307, + "step": 29508 + }, + { + "epoch": 9.057397176181707, + "grad_norm": 0.1507789045572281, + "learning_rate": 2.3135297217565576e-06, + "loss": 1.6582, + "step": 29509 + }, + { + "epoch": 9.057704112952731, + "grad_norm": 0.1449059545993805, + "learning_rate": 2.3120354775747143e-06, + "loss": 1.6808, + "step": 29510 + }, + { + "epoch": 9.058011049723756, + "grad_norm": 0.11667517572641373, + "learning_rate": 2.3105417046732915e-06, + "loss": 1.7207, + "step": 29511 + }, + { + "epoch": 9.058317986494782, + "grad_norm": 0.13248896598815918, + "learning_rate": 2.3090484030670488e-06, + "loss": 1.6908, + "step": 29512 + }, + { + "epoch": 9.058624923265807, + "grad_norm": 0.1595017910003662, + "learning_rate": 2.307555572770742e-06, + "loss": 1.7026, + "step": 29513 + }, + { + "epoch": 9.058931860036832, + "grad_norm": 0.22244125604629517, + "learning_rate": 2.3060632137991257e-06, + "loss": 1.7571, + "step": 29514 + }, + { + "epoch": 9.059238796807858, + "grad_norm": 0.1424504965543747, + "learning_rate": 2.3045713261669433e-06, + "loss": 1.701, + "step": 29515 + }, + { + "epoch": 9.059545733578883, + "grad_norm": 0.12159547954797745, + "learning_rate": 2.3030799098889444e-06, + "loss": 1.7167, + "step": 29516 + }, + { + "epoch": 9.059852670349908, + "grad_norm": 0.1438741683959961, + "learning_rate": 2.301588964979856e-06, + "loss": 1.6736, + "step": 29517 + }, + { + "epoch": 9.060159607120934, + "grad_norm": 0.19870363175868988, + "learning_rate": 2.3000984914544386e-06, + "loss": 1.7801, + "step": 29518 + }, + { + "epoch": 9.060466543891959, + "grad_norm": 0.14005307853221893, + "learning_rate": 2.298608489327392e-06, + "loss": 
1.6933, + "step": 29519 + }, + { + "epoch": 9.060773480662984, + "grad_norm": 0.15449295938014984, + "learning_rate": 2.297118958613459e-06, + "loss": 1.6894, + "step": 29520 + }, + { + "epoch": 9.061080417434008, + "grad_norm": 0.15363426506519318, + "learning_rate": 2.2956298993273615e-06, + "loss": 1.6945, + "step": 29521 + }, + { + "epoch": 9.061387354205033, + "grad_norm": 0.20762746036052704, + "learning_rate": 2.294141311483805e-06, + "loss": 1.6774, + "step": 29522 + }, + { + "epoch": 9.061694290976058, + "grad_norm": 0.1773165762424469, + "learning_rate": 2.2926531950975107e-06, + "loss": 1.7868, + "step": 29523 + }, + { + "epoch": 9.062001227747084, + "grad_norm": 0.13610224425792694, + "learning_rate": 2.291165550183172e-06, + "loss": 1.6945, + "step": 29524 + }, + { + "epoch": 9.06230816451811, + "grad_norm": 0.13063403964042664, + "learning_rate": 2.2896783767555053e-06, + "loss": 1.684, + "step": 29525 + }, + { + "epoch": 9.062615101289135, + "grad_norm": 0.1523241400718689, + "learning_rate": 2.2881916748291987e-06, + "loss": 1.7392, + "step": 29526 + }, + { + "epoch": 9.06292203806016, + "grad_norm": 0.17883025109767914, + "learning_rate": 2.286705444418946e-06, + "loss": 1.7474, + "step": 29527 + }, + { + "epoch": 9.063228974831185, + "grad_norm": 0.14900827407836914, + "learning_rate": 2.2852196855394358e-06, + "loss": 1.7096, + "step": 29528 + }, + { + "epoch": 9.06353591160221, + "grad_norm": 0.1691586673259735, + "learning_rate": 2.2837343982053503e-06, + "loss": 1.7373, + "step": 29529 + }, + { + "epoch": 9.063842848373236, + "grad_norm": 0.1183643490076065, + "learning_rate": 2.282249582431367e-06, + "loss": 1.6689, + "step": 29530 + }, + { + "epoch": 9.06414978514426, + "grad_norm": 0.16844353079795837, + "learning_rate": 2.280765238232163e-06, + "loss": 1.7345, + "step": 29531 + }, + { + "epoch": 9.064456721915285, + "grad_norm": 0.13235628604888916, + "learning_rate": 2.27928136562241e-06, + "loss": 1.69, + "step": 29532 + }, + { + 
"epoch": 9.06476365868631, + "grad_norm": 0.13285794854164124, + "learning_rate": 2.277797964616768e-06, + "loss": 1.6842, + "step": 29533 + }, + { + "epoch": 9.065070595457335, + "grad_norm": 0.13197976350784302, + "learning_rate": 2.2763150352298866e-06, + "loss": 1.7036, + "step": 29534 + }, + { + "epoch": 9.06537753222836, + "grad_norm": 0.13822008669376373, + "learning_rate": 2.274832577476449e-06, + "loss": 1.7079, + "step": 29535 + }, + { + "epoch": 9.065684468999386, + "grad_norm": 0.14020980894565582, + "learning_rate": 2.2733505913710705e-06, + "loss": 1.716, + "step": 29536 + }, + { + "epoch": 9.065991405770411, + "grad_norm": 0.13733944296836853, + "learning_rate": 2.271869076928429e-06, + "loss": 1.7167, + "step": 29537 + }, + { + "epoch": 9.066298342541437, + "grad_norm": 0.13786739110946655, + "learning_rate": 2.27038803416314e-06, + "loss": 1.7111, + "step": 29538 + }, + { + "epoch": 9.066605279312462, + "grad_norm": 0.17205199599266052, + "learning_rate": 2.268907463089859e-06, + "loss": 1.7095, + "step": 29539 + }, + { + "epoch": 9.066912216083487, + "grad_norm": 0.16810791194438934, + "learning_rate": 2.2674273637232123e-06, + "loss": 1.7002, + "step": 29540 + }, + { + "epoch": 9.067219152854513, + "grad_norm": 0.15370075404644012, + "learning_rate": 2.2659477360778226e-06, + "loss": 1.7309, + "step": 29541 + }, + { + "epoch": 9.067526089625536, + "grad_norm": 0.18854720890522003, + "learning_rate": 2.2644685801683165e-06, + "loss": 1.7731, + "step": 29542 + }, + { + "epoch": 9.067833026396562, + "grad_norm": 0.14275872707366943, + "learning_rate": 2.2629898960093097e-06, + "loss": 1.7042, + "step": 29543 + }, + { + "epoch": 9.068139963167587, + "grad_norm": 0.13044105470180511, + "learning_rate": 2.261511683615414e-06, + "loss": 1.6886, + "step": 29544 + }, + { + "epoch": 9.068446899938612, + "grad_norm": 0.1964588612318039, + "learning_rate": 2.2600339430012442e-06, + "loss": 1.756, + "step": 29545 + }, + { + "epoch": 9.068753836709638, + 
"grad_norm": 0.15589101612567902, + "learning_rate": 2.2585566741814e-06, + "loss": 1.6922, + "step": 29546 + }, + { + "epoch": 9.069060773480663, + "grad_norm": 0.1840185523033142, + "learning_rate": 2.257079877170476e-06, + "loss": 1.7668, + "step": 29547 + }, + { + "epoch": 9.069367710251688, + "grad_norm": 0.11688835173845291, + "learning_rate": 2.2556035519830765e-06, + "loss": 1.6866, + "step": 29548 + }, + { + "epoch": 9.069674647022714, + "grad_norm": 0.16568957269191742, + "learning_rate": 2.2541276986337844e-06, + "loss": 1.7353, + "step": 29549 + }, + { + "epoch": 9.069981583793739, + "grad_norm": 0.1312425136566162, + "learning_rate": 2.252652317137188e-06, + "loss": 1.7476, + "step": 29550 + }, + { + "epoch": 9.070288520564764, + "grad_norm": 0.12554149329662323, + "learning_rate": 2.251177407507865e-06, + "loss": 1.6452, + "step": 29551 + }, + { + "epoch": 9.07059545733579, + "grad_norm": 0.14966057240962982, + "learning_rate": 2.249702969760398e-06, + "loss": 1.7362, + "step": 29552 + }, + { + "epoch": 9.070902394106813, + "grad_norm": 0.1935591846704483, + "learning_rate": 2.248229003909347e-06, + "loss": 1.7284, + "step": 29553 + }, + { + "epoch": 9.071209330877839, + "grad_norm": 0.1565311849117279, + "learning_rate": 2.246755509969295e-06, + "loss": 1.7068, + "step": 29554 + }, + { + "epoch": 9.071516267648864, + "grad_norm": 0.14980174601078033, + "learning_rate": 2.2452824879547806e-06, + "loss": 1.7455, + "step": 29555 + }, + { + "epoch": 9.07182320441989, + "grad_norm": 0.16639313101768494, + "learning_rate": 2.243809937880381e-06, + "loss": 1.7515, + "step": 29556 + }, + { + "epoch": 9.072130141190915, + "grad_norm": 0.12895835936069489, + "learning_rate": 2.242337859760646e-06, + "loss": 1.6401, + "step": 29557 + }, + { + "epoch": 9.07243707796194, + "grad_norm": 0.175545796751976, + "learning_rate": 2.240866253610119e-06, + "loss": 1.6815, + "step": 29558 + }, + { + "epoch": 9.072744014732965, + "grad_norm": 0.16137781739234924, + 
"learning_rate": 2.2393951194433437e-06, + "loss": 1.7161, + "step": 29559 + }, + { + "epoch": 9.07305095150399, + "grad_norm": 0.15323428809642792, + "learning_rate": 2.2379244572748536e-06, + "loss": 1.6917, + "step": 29560 + }, + { + "epoch": 9.073357888275016, + "grad_norm": 0.13572439551353455, + "learning_rate": 2.2364542671191978e-06, + "loss": 1.732, + "step": 29561 + }, + { + "epoch": 9.073664825046041, + "grad_norm": 0.1529226154088974, + "learning_rate": 2.234984548990887e-06, + "loss": 1.7082, + "step": 29562 + }, + { + "epoch": 9.073971761817067, + "grad_norm": 0.16901282966136932, + "learning_rate": 2.2335153029044598e-06, + "loss": 1.6796, + "step": 29563 + }, + { + "epoch": 9.07427869858809, + "grad_norm": 0.120974101126194, + "learning_rate": 2.2320465288744317e-06, + "loss": 1.6871, + "step": 29564 + }, + { + "epoch": 9.074585635359115, + "grad_norm": 0.1303488165140152, + "learning_rate": 2.2305782269153143e-06, + "loss": 1.7193, + "step": 29565 + }, + { + "epoch": 9.07489257213014, + "grad_norm": 0.13454987108707428, + "learning_rate": 2.2291103970416227e-06, + "loss": 1.6841, + "step": 29566 + }, + { + "epoch": 9.075199508901166, + "grad_norm": 0.14908376336097717, + "learning_rate": 2.2276430392678628e-06, + "loss": 1.7352, + "step": 29567 + }, + { + "epoch": 9.075506445672191, + "grad_norm": 0.16618986427783966, + "learning_rate": 2.226176153608528e-06, + "loss": 1.711, + "step": 29568 + }, + { + "epoch": 9.075813382443217, + "grad_norm": 0.1883801370859146, + "learning_rate": 2.224709740078135e-06, + "loss": 1.7297, + "step": 29569 + }, + { + "epoch": 9.076120319214242, + "grad_norm": 0.16342709958553314, + "learning_rate": 2.2232437986911492e-06, + "loss": 1.7207, + "step": 29570 + }, + { + "epoch": 9.076427255985267, + "grad_norm": 0.16771680116653442, + "learning_rate": 2.221778329462082e-06, + "loss": 1.6875, + "step": 29571 + }, + { + "epoch": 9.076734192756293, + "grad_norm": 0.1790522187948227, + "learning_rate": 
2.2203133324053936e-06, + "loss": 1.675, + "step": 29572 + }, + { + "epoch": 9.077041129527318, + "grad_norm": 0.1973496973514557, + "learning_rate": 2.2188488075355785e-06, + "loss": 1.7385, + "step": 29573 + }, + { + "epoch": 9.077348066298342, + "grad_norm": 0.1493360847234726, + "learning_rate": 2.2173847548671077e-06, + "loss": 1.6645, + "step": 29574 + }, + { + "epoch": 9.077655003069367, + "grad_norm": 0.18652872741222382, + "learning_rate": 2.2159211744144424e-06, + "loss": 1.739, + "step": 29575 + }, + { + "epoch": 9.077961939840392, + "grad_norm": 0.1569397747516632, + "learning_rate": 2.2144580661920544e-06, + "loss": 1.6857, + "step": 29576 + }, + { + "epoch": 9.078268876611418, + "grad_norm": 0.14565426111221313, + "learning_rate": 2.212995430214404e-06, + "loss": 1.6808, + "step": 29577 + }, + { + "epoch": 9.078575813382443, + "grad_norm": 0.186843141913414, + "learning_rate": 2.2115332664959353e-06, + "loss": 1.6877, + "step": 29578 + }, + { + "epoch": 9.078882750153468, + "grad_norm": 0.11076909303665161, + "learning_rate": 2.2100715750511038e-06, + "loss": 1.6692, + "step": 29579 + }, + { + "epoch": 9.079189686924494, + "grad_norm": 0.15020497143268585, + "learning_rate": 2.2086103558943583e-06, + "loss": 1.71, + "step": 29580 + }, + { + "epoch": 9.079496623695519, + "grad_norm": 0.17952266335487366, + "learning_rate": 2.207149609040138e-06, + "loss": 1.6728, + "step": 29581 + }, + { + "epoch": 9.079803560466544, + "grad_norm": 0.14447824656963348, + "learning_rate": 2.205689334502875e-06, + "loss": 1.679, + "step": 29582 + }, + { + "epoch": 9.08011049723757, + "grad_norm": 0.12692491710186005, + "learning_rate": 2.204229532297003e-06, + "loss": 1.6881, + "step": 29583 + }, + { + "epoch": 9.080417434008595, + "grad_norm": 0.15565918385982513, + "learning_rate": 2.2027702024369547e-06, + "loss": 1.6702, + "step": 29584 + }, + { + "epoch": 9.080724370779619, + "grad_norm": 0.14430411159992218, + "learning_rate": 2.201311344937135e-06, + "loss": 
1.6992, + "step": 29585 + }, + { + "epoch": 9.081031307550644, + "grad_norm": 0.2374502569437027, + "learning_rate": 2.1998529598119823e-06, + "loss": 1.834, + "step": 29586 + }, + { + "epoch": 9.08133824432167, + "grad_norm": 0.13957563042640686, + "learning_rate": 2.1983950470758907e-06, + "loss": 1.6617, + "step": 29587 + }, + { + "epoch": 9.081645181092695, + "grad_norm": 0.16792097687721252, + "learning_rate": 2.196937606743288e-06, + "loss": 1.7357, + "step": 29588 + }, + { + "epoch": 9.08195211786372, + "grad_norm": 0.18628741800785065, + "learning_rate": 2.195480638828551e-06, + "loss": 1.728, + "step": 29589 + }, + { + "epoch": 9.082259054634745, + "grad_norm": 0.1528443992137909, + "learning_rate": 2.1940241433461016e-06, + "loss": 1.7374, + "step": 29590 + }, + { + "epoch": 9.08256599140577, + "grad_norm": 0.1556825041770935, + "learning_rate": 2.1925681203103287e-06, + "loss": 1.744, + "step": 29591 + }, + { + "epoch": 9.082872928176796, + "grad_norm": 0.14697785675525665, + "learning_rate": 2.191112569735615e-06, + "loss": 1.6721, + "step": 29592 + }, + { + "epoch": 9.083179864947821, + "grad_norm": 0.18705244362354279, + "learning_rate": 2.1896574916363488e-06, + "loss": 1.7284, + "step": 29593 + }, + { + "epoch": 9.083486801718847, + "grad_norm": 0.2209276407957077, + "learning_rate": 2.188202886026908e-06, + "loss": 1.7261, + "step": 29594 + }, + { + "epoch": 9.083793738489872, + "grad_norm": 0.13894188404083252, + "learning_rate": 2.1867487529216748e-06, + "loss": 1.7112, + "step": 29595 + }, + { + "epoch": 9.084100675260895, + "grad_norm": 0.13467033207416534, + "learning_rate": 2.18529509233501e-06, + "loss": 1.6566, + "step": 29596 + }, + { + "epoch": 9.08440761203192, + "grad_norm": 0.11996985971927643, + "learning_rate": 2.1838419042812862e-06, + "loss": 1.652, + "step": 29597 + }, + { + "epoch": 9.084714548802946, + "grad_norm": 0.12615782022476196, + "learning_rate": 2.182389188774864e-06, + "loss": 1.7007, + "step": 29598 + }, + { + 
"epoch": 9.085021485573971, + "grad_norm": 0.15084239840507507, + "learning_rate": 2.1809369458300976e-06, + "loss": 1.6605, + "step": 29599 + }, + { + "epoch": 9.085328422344997, + "grad_norm": 0.15964055061340332, + "learning_rate": 2.1794851754613375e-06, + "loss": 1.7452, + "step": 29600 + }, + { + "epoch": 9.085635359116022, + "grad_norm": 0.15842875838279724, + "learning_rate": 2.178033877682939e-06, + "loss": 1.7223, + "step": 29601 + }, + { + "epoch": 9.085942295887047, + "grad_norm": 0.1889149248600006, + "learning_rate": 2.176583052509229e-06, + "loss": 1.7568, + "step": 29602 + }, + { + "epoch": 9.086249232658073, + "grad_norm": 0.14189517498016357, + "learning_rate": 2.1751326999545683e-06, + "loss": 1.7275, + "step": 29603 + }, + { + "epoch": 9.086556169429098, + "grad_norm": 0.13787707686424255, + "learning_rate": 2.1736828200332625e-06, + "loss": 1.7101, + "step": 29604 + }, + { + "epoch": 9.086863106200123, + "grad_norm": 0.1241447925567627, + "learning_rate": 2.1722334127596723e-06, + "loss": 1.6529, + "step": 29605 + }, + { + "epoch": 9.087170042971149, + "grad_norm": 0.14460496604442596, + "learning_rate": 2.1707844781480858e-06, + "loss": 1.6891, + "step": 29606 + }, + { + "epoch": 9.087476979742172, + "grad_norm": 0.1723712533712387, + "learning_rate": 2.169336016212853e-06, + "loss": 1.6892, + "step": 29607 + }, + { + "epoch": 9.087783916513198, + "grad_norm": 0.20372143387794495, + "learning_rate": 2.1678880269682734e-06, + "loss": 1.7786, + "step": 29608 + }, + { + "epoch": 9.088090853284223, + "grad_norm": 0.1281127631664276, + "learning_rate": 2.166440510428658e-06, + "loss": 1.6971, + "step": 29609 + }, + { + "epoch": 9.088397790055248, + "grad_norm": 0.17418532073497772, + "learning_rate": 2.1649934666083118e-06, + "loss": 1.6813, + "step": 29610 + }, + { + "epoch": 9.088704726826274, + "grad_norm": 0.13917995989322662, + "learning_rate": 2.16354689552154e-06, + "loss": 1.6787, + "step": 29611 + }, + { + "epoch": 9.089011663597299, + 
"grad_norm": 0.12206067144870758, + "learning_rate": 2.1621007971826367e-06, + "loss": 1.6792, + "step": 29612 + }, + { + "epoch": 9.089318600368324, + "grad_norm": 0.14317838847637177, + "learning_rate": 2.1606551716058907e-06, + "loss": 1.732, + "step": 29613 + }, + { + "epoch": 9.08962553713935, + "grad_norm": 0.1607116013765335, + "learning_rate": 2.1592100188055907e-06, + "loss": 1.6927, + "step": 29614 + }, + { + "epoch": 9.089932473910375, + "grad_norm": 0.14611779153347015, + "learning_rate": 2.1577653387960197e-06, + "loss": 1.7165, + "step": 29615 + }, + { + "epoch": 9.0902394106814, + "grad_norm": 0.15042389929294586, + "learning_rate": 2.156321131591449e-06, + "loss": 1.7221, + "step": 29616 + }, + { + "epoch": 9.090546347452424, + "grad_norm": 0.1669636368751526, + "learning_rate": 2.1548773972061563e-06, + "loss": 1.714, + "step": 29617 + }, + { + "epoch": 9.09085328422345, + "grad_norm": 0.22214718163013458, + "learning_rate": 2.1534341356544086e-06, + "loss": 1.6956, + "step": 29618 + }, + { + "epoch": 9.091160220994475, + "grad_norm": 0.16929394006729126, + "learning_rate": 2.151991346950466e-06, + "loss": 1.7095, + "step": 29619 + }, + { + "epoch": 9.0914671577655, + "grad_norm": 0.15387636423110962, + "learning_rate": 2.150549031108595e-06, + "loss": 1.7566, + "step": 29620 + }, + { + "epoch": 9.091774094536525, + "grad_norm": 0.19231966137886047, + "learning_rate": 2.1491071881430348e-06, + "loss": 1.7671, + "step": 29621 + }, + { + "epoch": 9.09208103130755, + "grad_norm": 0.15853071212768555, + "learning_rate": 2.1476658180680566e-06, + "loss": 1.7098, + "step": 29622 + }, + { + "epoch": 9.092387968078576, + "grad_norm": 0.11180046200752258, + "learning_rate": 2.146224920897877e-06, + "loss": 1.6501, + "step": 29623 + }, + { + "epoch": 9.092694904849601, + "grad_norm": 0.13134215772151947, + "learning_rate": 2.1447844966467625e-06, + "loss": 1.7099, + "step": 29624 + }, + { + "epoch": 9.093001841620627, + "grad_norm": 0.1667555719614029, + 
"learning_rate": 2.143344545328929e-06, + "loss": 1.7205, + "step": 29625 + }, + { + "epoch": 9.093308778391652, + "grad_norm": 0.18456818163394928, + "learning_rate": 2.1419050669586216e-06, + "loss": 1.737, + "step": 29626 + }, + { + "epoch": 9.093615715162677, + "grad_norm": 0.1580527424812317, + "learning_rate": 2.1404660615500506e-06, + "loss": 1.7092, + "step": 29627 + }, + { + "epoch": 9.0939226519337, + "grad_norm": 0.1242590993642807, + "learning_rate": 2.1390275291174542e-06, + "loss": 1.6641, + "step": 29628 + }, + { + "epoch": 9.094229588704726, + "grad_norm": 0.10987458378076553, + "learning_rate": 2.137589469675033e-06, + "loss": 1.6275, + "step": 29629 + }, + { + "epoch": 9.094536525475752, + "grad_norm": 0.1660260111093521, + "learning_rate": 2.1361518832370087e-06, + "loss": 1.7781, + "step": 29630 + }, + { + "epoch": 9.094843462246777, + "grad_norm": 0.11643832921981812, + "learning_rate": 2.134714769817586e-06, + "loss": 1.6596, + "step": 29631 + }, + { + "epoch": 9.095150399017802, + "grad_norm": 0.13046015799045563, + "learning_rate": 2.1332781294309654e-06, + "loss": 1.7046, + "step": 29632 + }, + { + "epoch": 9.095457335788828, + "grad_norm": 0.12697182595729828, + "learning_rate": 2.1318419620913466e-06, + "loss": 1.6422, + "step": 29633 + }, + { + "epoch": 9.095764272559853, + "grad_norm": 0.15039731562137604, + "learning_rate": 2.1304062678129233e-06, + "loss": 1.7088, + "step": 29634 + }, + { + "epoch": 9.096071209330878, + "grad_norm": 0.12595078349113464, + "learning_rate": 2.128971046609879e-06, + "loss": 1.7156, + "step": 29635 + }, + { + "epoch": 9.096378146101904, + "grad_norm": 0.13836300373077393, + "learning_rate": 2.1275362984963966e-06, + "loss": 1.7024, + "step": 29636 + }, + { + "epoch": 9.096685082872929, + "grad_norm": 0.15840092301368713, + "learning_rate": 2.12610202348667e-06, + "loss": 1.7581, + "step": 29637 + }, + { + "epoch": 9.096992019643954, + "grad_norm": 0.11084351688623428, + "learning_rate": 
2.1246682215948556e-06, + "loss": 1.6444, + "step": 29638 + }, + { + "epoch": 9.097298956414978, + "grad_norm": 0.148295059800148, + "learning_rate": 2.1232348928351353e-06, + "loss": 1.6947, + "step": 29639 + }, + { + "epoch": 9.097605893186003, + "grad_norm": 0.1266920119524002, + "learning_rate": 2.121802037221665e-06, + "loss": 1.6838, + "step": 29640 + }, + { + "epoch": 9.097912829957028, + "grad_norm": 0.15683111548423767, + "learning_rate": 2.1203696547686116e-06, + "loss": 1.7522, + "step": 29641 + }, + { + "epoch": 9.098219766728054, + "grad_norm": 0.15225628018379211, + "learning_rate": 2.118937745490124e-06, + "loss": 1.7464, + "step": 29642 + }, + { + "epoch": 9.098526703499079, + "grad_norm": 0.12527376413345337, + "learning_rate": 2.1175063094003632e-06, + "loss": 1.6595, + "step": 29643 + }, + { + "epoch": 9.098833640270104, + "grad_norm": 0.13361993432044983, + "learning_rate": 2.1160753465134685e-06, + "loss": 1.6694, + "step": 29644 + }, + { + "epoch": 9.09914057704113, + "grad_norm": 0.22824819386005402, + "learning_rate": 2.114644856843584e-06, + "loss": 1.7497, + "step": 29645 + }, + { + "epoch": 9.099447513812155, + "grad_norm": 0.14481870830059052, + "learning_rate": 2.1132148404048424e-06, + "loss": 1.7107, + "step": 29646 + }, + { + "epoch": 9.09975445058318, + "grad_norm": 0.12271635234355927, + "learning_rate": 2.1117852972113828e-06, + "loss": 1.7115, + "step": 29647 + }, + { + "epoch": 9.100061387354206, + "grad_norm": 0.15120261907577515, + "learning_rate": 2.110356227277327e-06, + "loss": 1.7039, + "step": 29648 + }, + { + "epoch": 9.10036832412523, + "grad_norm": 0.15357789397239685, + "learning_rate": 2.1089276306168025e-06, + "loss": 1.7183, + "step": 29649 + }, + { + "epoch": 9.100675260896255, + "grad_norm": 0.1110130101442337, + "learning_rate": 2.1074995072439207e-06, + "loss": 1.6682, + "step": 29650 + }, + { + "epoch": 9.10098219766728, + "grad_norm": 0.14395900070667267, + "learning_rate": 2.106071857172803e-06, + "loss": 
1.6806, + "step": 29651 + }, + { + "epoch": 9.101289134438305, + "grad_norm": 0.14335933327674866, + "learning_rate": 2.1046446804175555e-06, + "loss": 1.714, + "step": 29652 + }, + { + "epoch": 9.10159607120933, + "grad_norm": 0.12558147311210632, + "learning_rate": 2.103217976992272e-06, + "loss": 1.6841, + "step": 29653 + }, + { + "epoch": 9.101903007980356, + "grad_norm": 0.122133269906044, + "learning_rate": 2.101791746911075e-06, + "loss": 1.6745, + "step": 29654 + }, + { + "epoch": 9.102209944751381, + "grad_norm": 0.13860413432121277, + "learning_rate": 2.1003659901880357e-06, + "loss": 1.724, + "step": 29655 + }, + { + "epoch": 9.102516881522407, + "grad_norm": 0.16283336281776428, + "learning_rate": 2.098940706837266e-06, + "loss": 1.7328, + "step": 29656 + }, + { + "epoch": 9.102823818293432, + "grad_norm": 0.16371138393878937, + "learning_rate": 2.097515896872826e-06, + "loss": 1.7541, + "step": 29657 + }, + { + "epoch": 9.103130755064457, + "grad_norm": 0.1482359617948532, + "learning_rate": 2.096091560308816e-06, + "loss": 1.721, + "step": 29658 + }, + { + "epoch": 9.103437691835483, + "grad_norm": 0.15985986590385437, + "learning_rate": 2.0946676971593083e-06, + "loss": 1.7878, + "step": 29659 + }, + { + "epoch": 9.103744628606506, + "grad_norm": 0.1820739209651947, + "learning_rate": 2.0932443074383747e-06, + "loss": 1.717, + "step": 29660 + }, + { + "epoch": 9.104051565377532, + "grad_norm": 0.14114773273468018, + "learning_rate": 2.091821391160076e-06, + "loss": 1.6881, + "step": 29661 + }, + { + "epoch": 9.104358502148557, + "grad_norm": 0.14509153366088867, + "learning_rate": 2.090398948338479e-06, + "loss": 1.7082, + "step": 29662 + }, + { + "epoch": 9.104665438919582, + "grad_norm": 0.1653892993927002, + "learning_rate": 2.088976978987639e-06, + "loss": 1.747, + "step": 29663 + }, + { + "epoch": 9.104972375690608, + "grad_norm": 0.1548600047826767, + "learning_rate": 2.0875554831216116e-06, + "loss": 1.6506, + "step": 29664 + }, + { + "epoch": 
9.105279312461633, + "grad_norm": 0.15069860219955444, + "learning_rate": 2.086134460754446e-06, + "loss": 1.681, + "step": 29665 + }, + { + "epoch": 9.105586249232658, + "grad_norm": 0.17018845677375793, + "learning_rate": 2.0847139119001824e-06, + "loss": 1.7066, + "step": 29666 + }, + { + "epoch": 9.105893186003684, + "grad_norm": 0.137167289853096, + "learning_rate": 2.0832938365728582e-06, + "loss": 1.6858, + "step": 29667 + }, + { + "epoch": 9.106200122774709, + "grad_norm": 0.13983163237571716, + "learning_rate": 2.081874234786507e-06, + "loss": 1.7306, + "step": 29668 + }, + { + "epoch": 9.106507059545734, + "grad_norm": 0.20317591726779938, + "learning_rate": 2.0804551065551626e-06, + "loss": 1.7506, + "step": 29669 + }, + { + "epoch": 9.10681399631676, + "grad_norm": 0.16218522191047668, + "learning_rate": 2.0790364518928406e-06, + "loss": 1.7475, + "step": 29670 + }, + { + "epoch": 9.107120933087783, + "grad_norm": 0.11892718076705933, + "learning_rate": 2.0776182708135805e-06, + "loss": 1.6876, + "step": 29671 + }, + { + "epoch": 9.107427869858808, + "grad_norm": 0.13815937936306, + "learning_rate": 2.076200563331371e-06, + "loss": 1.7187, + "step": 29672 + }, + { + "epoch": 9.107734806629834, + "grad_norm": 0.12870736420154572, + "learning_rate": 2.07478332946025e-06, + "loss": 1.6836, + "step": 29673 + }, + { + "epoch": 9.10804174340086, + "grad_norm": 0.13736192882061005, + "learning_rate": 2.0733665692142024e-06, + "loss": 1.6865, + "step": 29674 + }, + { + "epoch": 9.108348680171884, + "grad_norm": 0.12006348371505737, + "learning_rate": 2.071950282607238e-06, + "loss": 1.6911, + "step": 29675 + }, + { + "epoch": 9.10865561694291, + "grad_norm": 0.16973024606704712, + "learning_rate": 2.070534469653351e-06, + "loss": 1.7203, + "step": 29676 + }, + { + "epoch": 9.108962553713935, + "grad_norm": 0.12767069041728973, + "learning_rate": 2.069119130366537e-06, + "loss": 1.6747, + "step": 29677 + }, + { + "epoch": 9.10926949048496, + "grad_norm": 
0.14068815112113953, + "learning_rate": 2.0677042647607837e-06, + "loss": 1.6851, + "step": 29678 + }, + { + "epoch": 9.109576427255986, + "grad_norm": 0.13680805265903473, + "learning_rate": 2.066289872850069e-06, + "loss": 1.6913, + "step": 29679 + }, + { + "epoch": 9.109883364027011, + "grad_norm": 0.126765176653862, + "learning_rate": 2.064875954648371e-06, + "loss": 1.6624, + "step": 29680 + }, + { + "epoch": 9.110190300798035, + "grad_norm": 0.15233641862869263, + "learning_rate": 2.0634625101696615e-06, + "loss": 1.673, + "step": 29681 + }, + { + "epoch": 9.11049723756906, + "grad_norm": 0.165065735578537, + "learning_rate": 2.0620495394279182e-06, + "loss": 1.7752, + "step": 29682 + }, + { + "epoch": 9.110804174340085, + "grad_norm": 0.16008982062339783, + "learning_rate": 2.060637042437097e-06, + "loss": 1.7509, + "step": 29683 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 0.14805258810520172, + "learning_rate": 2.0592250192111585e-06, + "loss": 1.7481, + "step": 29684 + }, + { + "epoch": 9.111418047882136, + "grad_norm": 0.15095308423042297, + "learning_rate": 2.0578134697640585e-06, + "loss": 1.7003, + "step": 29685 + }, + { + "epoch": 9.111724984653161, + "grad_norm": 0.15223002433776855, + "learning_rate": 2.056402394109741e-06, + "loss": 1.6619, + "step": 29686 + }, + { + "epoch": 9.112031921424187, + "grad_norm": 0.21639372408390045, + "learning_rate": 2.0549917922621564e-06, + "loss": 1.6929, + "step": 29687 + }, + { + "epoch": 9.112338858195212, + "grad_norm": 0.17735840380191803, + "learning_rate": 2.053581664235249e-06, + "loss": 1.732, + "step": 29688 + }, + { + "epoch": 9.112645794966237, + "grad_norm": 0.14142274856567383, + "learning_rate": 2.052172010042941e-06, + "loss": 1.6888, + "step": 29689 + }, + { + "epoch": 9.112952731737263, + "grad_norm": 0.15996430814266205, + "learning_rate": 2.050762829699182e-06, + "loss": 1.724, + "step": 29690 + }, + { + "epoch": 9.113259668508288, + "grad_norm": 0.15198329091072083, + "learning_rate": 
2.0493541232178835e-06, + "loss": 1.7437, + "step": 29691 + }, + { + "epoch": 9.113566605279312, + "grad_norm": 0.15780989825725555, + "learning_rate": 2.047945890612979e-06, + "loss": 1.7422, + "step": 29692 + }, + { + "epoch": 9.113873542050337, + "grad_norm": 0.11914275586605072, + "learning_rate": 2.046538131898368e-06, + "loss": 1.6689, + "step": 29693 + }, + { + "epoch": 9.114180478821362, + "grad_norm": 0.1314009428024292, + "learning_rate": 2.0451308470879782e-06, + "loss": 1.6496, + "step": 29694 + }, + { + "epoch": 9.114487415592388, + "grad_norm": 0.12608365714550018, + "learning_rate": 2.0437240361957154e-06, + "loss": 1.6798, + "step": 29695 + }, + { + "epoch": 9.114794352363413, + "grad_norm": 0.1259870082139969, + "learning_rate": 2.0423176992354797e-06, + "loss": 1.6673, + "step": 29696 + }, + { + "epoch": 9.115101289134438, + "grad_norm": 0.13520261645317078, + "learning_rate": 2.0409118362211654e-06, + "loss": 1.682, + "step": 29697 + }, + { + "epoch": 9.115408225905464, + "grad_norm": 0.12047206610441208, + "learning_rate": 2.0395064471666727e-06, + "loss": 1.6831, + "step": 29698 + }, + { + "epoch": 9.115715162676489, + "grad_norm": 0.17553697526454926, + "learning_rate": 2.0381015320858896e-06, + "loss": 1.7069, + "step": 29699 + }, + { + "epoch": 9.116022099447514, + "grad_norm": 0.15869703888893127, + "learning_rate": 2.0366970909926952e-06, + "loss": 1.7271, + "step": 29700 + }, + { + "epoch": 9.11632903621854, + "grad_norm": 0.13005055487155914, + "learning_rate": 2.035293123900972e-06, + "loss": 1.7204, + "step": 29701 + }, + { + "epoch": 9.116635972989565, + "grad_norm": 0.16763219237327576, + "learning_rate": 2.033889630824598e-06, + "loss": 1.7708, + "step": 29702 + }, + { + "epoch": 9.116942909760589, + "grad_norm": 0.1338036209344864, + "learning_rate": 2.03248661177744e-06, + "loss": 1.6735, + "step": 29703 + }, + { + "epoch": 9.117249846531614, + "grad_norm": 0.1424967646598816, + "learning_rate": 2.0310840667733643e-06, + "loss": 
1.7344, + "step": 29704 + }, + { + "epoch": 9.11755678330264, + "grad_norm": 0.20512424409389496, + "learning_rate": 2.029681995826227e-06, + "loss": 1.7609, + "step": 29705 + }, + { + "epoch": 9.117863720073665, + "grad_norm": 0.12276902049779892, + "learning_rate": 2.028280398949889e-06, + "loss": 1.6959, + "step": 29706 + }, + { + "epoch": 9.11817065684469, + "grad_norm": 0.17198625206947327, + "learning_rate": 2.026879276158211e-06, + "loss": 1.7131, + "step": 29707 + }, + { + "epoch": 9.118477593615715, + "grad_norm": 0.1957482397556305, + "learning_rate": 2.025478627465016e-06, + "loss": 1.7182, + "step": 29708 + }, + { + "epoch": 9.11878453038674, + "grad_norm": 0.1316002756357193, + "learning_rate": 2.0240784528841707e-06, + "loss": 1.6694, + "step": 29709 + }, + { + "epoch": 9.119091467157766, + "grad_norm": 0.1468227654695511, + "learning_rate": 2.022678752429491e-06, + "loss": 1.7023, + "step": 29710 + }, + { + "epoch": 9.119398403928791, + "grad_norm": 0.14300107955932617, + "learning_rate": 2.0212795261148277e-06, + "loss": 1.693, + "step": 29711 + }, + { + "epoch": 9.119705340699817, + "grad_norm": 0.13721270859241486, + "learning_rate": 2.0198807739540026e-06, + "loss": 1.7132, + "step": 29712 + }, + { + "epoch": 9.120012277470842, + "grad_norm": 0.16772721707820892, + "learning_rate": 2.0184824959608386e-06, + "loss": 1.7564, + "step": 29713 + }, + { + "epoch": 9.120319214241865, + "grad_norm": 0.20897400379180908, + "learning_rate": 2.0170846921491516e-06, + "loss": 1.75, + "step": 29714 + }, + { + "epoch": 9.12062615101289, + "grad_norm": 0.11614206433296204, + "learning_rate": 2.0156873625327534e-06, + "loss": 1.7058, + "step": 29715 + }, + { + "epoch": 9.120933087783916, + "grad_norm": 0.11942286789417267, + "learning_rate": 2.0142905071254603e-06, + "loss": 1.6813, + "step": 29716 + }, + { + "epoch": 9.121240024554941, + "grad_norm": 0.13220490515232086, + "learning_rate": 2.0128941259410727e-06, + "loss": 1.6625, + "step": 29717 + }, + { + 
"epoch": 9.121546961325967, + "grad_norm": 0.11992443352937698, + "learning_rate": 2.0114982189933962e-06, + "loss": 1.7028, + "step": 29718 + }, + { + "epoch": 9.121853898096992, + "grad_norm": 0.1317398101091385, + "learning_rate": 2.010102786296214e-06, + "loss": 1.6832, + "step": 29719 + }, + { + "epoch": 9.122160834868017, + "grad_norm": 0.1504088193178177, + "learning_rate": 2.008707827863332e-06, + "loss": 1.6874, + "step": 29720 + }, + { + "epoch": 9.122467771639043, + "grad_norm": 0.138368159532547, + "learning_rate": 2.0073133437085224e-06, + "loss": 1.7256, + "step": 29721 + }, + { + "epoch": 9.122774708410068, + "grad_norm": 0.11481283605098724, + "learning_rate": 2.0059193338455683e-06, + "loss": 1.6599, + "step": 29722 + }, + { + "epoch": 9.123081645181093, + "grad_norm": 0.179039865732193, + "learning_rate": 2.004525798288248e-06, + "loss": 1.7319, + "step": 29723 + }, + { + "epoch": 9.123388581952117, + "grad_norm": 0.14884190261363983, + "learning_rate": 2.0031327370503506e-06, + "loss": 1.7074, + "step": 29724 + }, + { + "epoch": 9.123695518723142, + "grad_norm": 0.14200903475284576, + "learning_rate": 2.001740150145609e-06, + "loss": 1.6827, + "step": 29725 + }, + { + "epoch": 9.124002455494168, + "grad_norm": 0.12509983777999878, + "learning_rate": 2.0003480375878182e-06, + "loss": 1.6687, + "step": 29726 + }, + { + "epoch": 9.124309392265193, + "grad_norm": 0.1458035707473755, + "learning_rate": 1.998956399390711e-06, + "loss": 1.7075, + "step": 29727 + }, + { + "epoch": 9.124616329036218, + "grad_norm": 0.15756329894065857, + "learning_rate": 1.9975652355680554e-06, + "loss": 1.7024, + "step": 29728 + }, + { + "epoch": 9.124923265807244, + "grad_norm": 0.15760551393032074, + "learning_rate": 1.9961745461335947e-06, + "loss": 1.7479, + "step": 29729 + }, + { + "epoch": 9.125230202578269, + "grad_norm": 0.1852855086326599, + "learning_rate": 1.994784331101074e-06, + "loss": 1.7166, + "step": 29730 + }, + { + "epoch": 9.125537139349294, + 
"grad_norm": 0.14625653624534607, + "learning_rate": 1.993394590484232e-06, + "loss": 1.6654, + "step": 29731 + }, + { + "epoch": 9.12584407612032, + "grad_norm": 0.15028734505176544, + "learning_rate": 1.992005324296803e-06, + "loss": 1.6855, + "step": 29732 + }, + { + "epoch": 9.126151012891345, + "grad_norm": 0.15989474952220917, + "learning_rate": 1.990616532552514e-06, + "loss": 1.7315, + "step": 29733 + }, + { + "epoch": 9.12645794966237, + "grad_norm": 0.168121799826622, + "learning_rate": 1.9892282152650933e-06, + "loss": 1.7168, + "step": 29734 + }, + { + "epoch": 9.126764886433394, + "grad_norm": 0.15154367685317993, + "learning_rate": 1.9878403724482576e-06, + "loss": 1.7084, + "step": 29735 + }, + { + "epoch": 9.12707182320442, + "grad_norm": 0.18086697161197662, + "learning_rate": 1.9864530041157235e-06, + "loss": 1.78, + "step": 29736 + }, + { + "epoch": 9.127378759975445, + "grad_norm": 0.10857624560594559, + "learning_rate": 1.985066110281203e-06, + "loss": 1.635, + "step": 29737 + }, + { + "epoch": 9.12768569674647, + "grad_norm": 0.14876055717468262, + "learning_rate": 1.983679690958401e-06, + "loss": 1.6972, + "step": 29738 + }, + { + "epoch": 9.127992633517495, + "grad_norm": 0.144441157579422, + "learning_rate": 1.9822937461610235e-06, + "loss": 1.6871, + "step": 29739 + }, + { + "epoch": 9.12829957028852, + "grad_norm": 0.12115978449583054, + "learning_rate": 1.980908275902754e-06, + "loss": 1.6496, + "step": 29740 + }, + { + "epoch": 9.128606507059546, + "grad_norm": 0.11610052734613419, + "learning_rate": 1.979523280197304e-06, + "loss": 1.6429, + "step": 29741 + }, + { + "epoch": 9.128913443830571, + "grad_norm": 0.10996486991643906, + "learning_rate": 1.97813875905834e-06, + "loss": 1.6551, + "step": 29742 + }, + { + "epoch": 9.129220380601597, + "grad_norm": 0.1537560224533081, + "learning_rate": 1.9767547124995677e-06, + "loss": 1.6836, + "step": 29743 + }, + { + "epoch": 9.129527317372622, + "grad_norm": 0.11715234071016312, + 
"learning_rate": 1.975371140534643e-06, + "loss": 1.6968, + "step": 29744 + }, + { + "epoch": 9.129834254143647, + "grad_norm": 0.11332523077726364, + "learning_rate": 1.973988043177255e-06, + "loss": 1.6753, + "step": 29745 + }, + { + "epoch": 9.13014119091467, + "grad_norm": 0.1348869651556015, + "learning_rate": 1.9726054204410595e-06, + "loss": 1.7083, + "step": 29746 + }, + { + "epoch": 9.130448127685696, + "grad_norm": 0.15482230484485626, + "learning_rate": 1.971223272339734e-06, + "loss": 1.6969, + "step": 29747 + }, + { + "epoch": 9.130755064456721, + "grad_norm": 0.1418905109167099, + "learning_rate": 1.9698415988869346e-06, + "loss": 1.7373, + "step": 29748 + }, + { + "epoch": 9.131062001227747, + "grad_norm": 0.13672807812690735, + "learning_rate": 1.9684604000963002e-06, + "loss": 1.7197, + "step": 29749 + }, + { + "epoch": 9.131368937998772, + "grad_norm": 0.1771068125963211, + "learning_rate": 1.967079675981498e-06, + "loss": 1.7223, + "step": 29750 + }, + { + "epoch": 9.131675874769797, + "grad_norm": 0.1820068508386612, + "learning_rate": 1.965699426556167e-06, + "loss": 1.7198, + "step": 29751 + }, + { + "epoch": 9.131982811540823, + "grad_norm": 0.1300581693649292, + "learning_rate": 1.9643196518339457e-06, + "loss": 1.6856, + "step": 29752 + }, + { + "epoch": 9.132289748311848, + "grad_norm": 0.13473594188690186, + "learning_rate": 1.962940351828474e-06, + "loss": 1.6889, + "step": 29753 + }, + { + "epoch": 9.132596685082873, + "grad_norm": 0.170193150639534, + "learning_rate": 1.961561526553385e-06, + "loss": 1.7336, + "step": 29754 + }, + { + "epoch": 9.132903621853899, + "grad_norm": 0.14752201735973358, + "learning_rate": 1.9601831760222954e-06, + "loss": 1.7528, + "step": 29755 + }, + { + "epoch": 9.133210558624924, + "grad_norm": 0.18119682371616364, + "learning_rate": 1.9588053002488337e-06, + "loss": 1.7296, + "step": 29756 + }, + { + "epoch": 9.133517495395948, + "grad_norm": 0.1837453842163086, + "learning_rate": 1.957427899246611e-06, 
+ "loss": 1.6643, + "step": 29757 + }, + { + "epoch": 9.133824432166973, + "grad_norm": 0.18625833094120026, + "learning_rate": 1.956050973029261e-06, + "loss": 1.78, + "step": 29758 + }, + { + "epoch": 9.134131368937998, + "grad_norm": 0.15884144604206085, + "learning_rate": 1.9546745216103558e-06, + "loss": 1.7168, + "step": 29759 + }, + { + "epoch": 9.134438305709024, + "grad_norm": 0.13788993656635284, + "learning_rate": 1.953298545003535e-06, + "loss": 1.7016, + "step": 29760 + }, + { + "epoch": 9.134745242480049, + "grad_norm": 0.14895956218242645, + "learning_rate": 1.951923043222359e-06, + "loss": 1.6961, + "step": 29761 + }, + { + "epoch": 9.135052179251074, + "grad_norm": 0.1548876017332077, + "learning_rate": 1.9505480162804567e-06, + "loss": 1.7352, + "step": 29762 + }, + { + "epoch": 9.1353591160221, + "grad_norm": 0.14169646799564362, + "learning_rate": 1.949173464191395e-06, + "loss": 1.7123, + "step": 29763 + }, + { + "epoch": 9.135666052793125, + "grad_norm": 0.14068526029586792, + "learning_rate": 1.9477993869687684e-06, + "loss": 1.7013, + "step": 29764 + }, + { + "epoch": 9.13597298956415, + "grad_norm": 0.15116369724273682, + "learning_rate": 1.9464257846261548e-06, + "loss": 1.6831, + "step": 29765 + }, + { + "epoch": 9.136279926335176, + "grad_norm": 0.17049194872379303, + "learning_rate": 1.9450526571771154e-06, + "loss": 1.72, + "step": 29766 + }, + { + "epoch": 9.1365868631062, + "grad_norm": 0.1429831087589264, + "learning_rate": 1.94368000463524e-06, + "loss": 1.6903, + "step": 29767 + }, + { + "epoch": 9.136893799877225, + "grad_norm": 0.2263873964548111, + "learning_rate": 1.9423078270140838e-06, + "loss": 1.7919, + "step": 29768 + }, + { + "epoch": 9.13720073664825, + "grad_norm": 0.14157186448574066, + "learning_rate": 1.940936124327214e-06, + "loss": 1.7151, + "step": 29769 + }, + { + "epoch": 9.137507673419275, + "grad_norm": 0.19576019048690796, + "learning_rate": 1.939564896588175e-06, + "loss": 1.7046, + "step": 29770 + }, + { + 
"epoch": 9.1378146101903, + "grad_norm": 0.15183357894420624, + "learning_rate": 1.9381941438105288e-06, + "loss": 1.6889, + "step": 29771 + }, + { + "epoch": 9.138121546961326, + "grad_norm": 0.11827339977025986, + "learning_rate": 1.936823866007814e-06, + "loss": 1.6737, + "step": 29772 + }, + { + "epoch": 9.138428483732351, + "grad_norm": 0.14976255595684052, + "learning_rate": 1.935454063193581e-06, + "loss": 1.725, + "step": 29773 + }, + { + "epoch": 9.138735420503377, + "grad_norm": 0.18152090907096863, + "learning_rate": 1.934084735381353e-06, + "loss": 1.7727, + "step": 29774 + }, + { + "epoch": 9.139042357274402, + "grad_norm": 0.19068580865859985, + "learning_rate": 1.9327158825846848e-06, + "loss": 1.7291, + "step": 29775 + }, + { + "epoch": 9.139349294045427, + "grad_norm": 0.1304289698600769, + "learning_rate": 1.9313475048170827e-06, + "loss": 1.6755, + "step": 29776 + }, + { + "epoch": 9.139656230816453, + "grad_norm": 0.14543502032756805, + "learning_rate": 1.9299796020920857e-06, + "loss": 1.7131, + "step": 29777 + }, + { + "epoch": 9.139963167587476, + "grad_norm": 0.16456526517868042, + "learning_rate": 1.9286121744231946e-06, + "loss": 1.734, + "step": 29778 + }, + { + "epoch": 9.140270104358502, + "grad_norm": 0.20563676953315735, + "learning_rate": 1.9272452218239424e-06, + "loss": 1.711, + "step": 29779 + }, + { + "epoch": 9.140577041129527, + "grad_norm": 0.12036823481321335, + "learning_rate": 1.925878744307824e-06, + "loss": 1.648, + "step": 29780 + }, + { + "epoch": 9.140883977900552, + "grad_norm": 0.13387446105480194, + "learning_rate": 1.924512741888351e-06, + "loss": 1.6611, + "step": 29781 + }, + { + "epoch": 9.141190914671578, + "grad_norm": 0.14060257375240326, + "learning_rate": 1.92314721457903e-06, + "loss": 1.699, + "step": 29782 + }, + { + "epoch": 9.141497851442603, + "grad_norm": 0.1897846907377243, + "learning_rate": 1.921782162393332e-06, + "loss": 1.6847, + "step": 29783 + }, + { + "epoch": 9.141804788213628, + 
"grad_norm": 0.1610451191663742, + "learning_rate": 1.920417585344769e-06, + "loss": 1.7417, + "step": 29784 + }, + { + "epoch": 9.142111724984654, + "grad_norm": 0.16606128215789795, + "learning_rate": 1.91905348344682e-06, + "loss": 1.7026, + "step": 29785 + }, + { + "epoch": 9.142418661755679, + "grad_norm": 0.13305748999118805, + "learning_rate": 1.9176898567129675e-06, + "loss": 1.6664, + "step": 29786 + }, + { + "epoch": 9.142725598526704, + "grad_norm": 0.1632613241672516, + "learning_rate": 1.9163267051566845e-06, + "loss": 1.7245, + "step": 29787 + }, + { + "epoch": 9.14303253529773, + "grad_norm": 0.14564020931720734, + "learning_rate": 1.9149640287914437e-06, + "loss": 1.6775, + "step": 29788 + }, + { + "epoch": 9.143339472068753, + "grad_norm": 0.14515992999076843, + "learning_rate": 1.9136018276307123e-06, + "loss": 1.6254, + "step": 29789 + }, + { + "epoch": 9.143646408839778, + "grad_norm": 0.14360931515693665, + "learning_rate": 1.9122401016879455e-06, + "loss": 1.7002, + "step": 29790 + }, + { + "epoch": 9.143953345610804, + "grad_norm": 0.14381720125675201, + "learning_rate": 1.9108788509766107e-06, + "loss": 1.7165, + "step": 29791 + }, + { + "epoch": 9.144260282381829, + "grad_norm": 0.14533990621566772, + "learning_rate": 1.909518075510164e-06, + "loss": 1.7003, + "step": 29792 + }, + { + "epoch": 9.144567219152854, + "grad_norm": 0.17832323908805847, + "learning_rate": 1.908157775302033e-06, + "loss": 1.7542, + "step": 29793 + }, + { + "epoch": 9.14487415592388, + "grad_norm": 0.15718503296375275, + "learning_rate": 1.9067979503656907e-06, + "loss": 1.681, + "step": 29794 + }, + { + "epoch": 9.145181092694905, + "grad_norm": 0.14168475568294525, + "learning_rate": 1.905438600714543e-06, + "loss": 1.694, + "step": 29795 + }, + { + "epoch": 9.14548802946593, + "grad_norm": 0.16925476491451263, + "learning_rate": 1.9040797263620514e-06, + "loss": 1.7163, + "step": 29796 + }, + { + "epoch": 9.145794966236956, + "grad_norm": 0.14622162282466888, + 
"learning_rate": 1.902721327321627e-06, + "loss": 1.7218, + "step": 29797 + }, + { + "epoch": 9.146101903007981, + "grad_norm": 0.14731308817863464, + "learning_rate": 1.9013634036067096e-06, + "loss": 1.6846, + "step": 29798 + }, + { + "epoch": 9.146408839779005, + "grad_norm": 0.16700461506843567, + "learning_rate": 1.9000059552307103e-06, + "loss": 1.7606, + "step": 29799 + }, + { + "epoch": 9.14671577655003, + "grad_norm": 0.12352433800697327, + "learning_rate": 1.8986489822070353e-06, + "loss": 1.6729, + "step": 29800 + }, + { + "epoch": 9.147022713321055, + "grad_norm": 0.1929595023393631, + "learning_rate": 1.897292484549107e-06, + "loss": 1.7356, + "step": 29801 + }, + { + "epoch": 9.14732965009208, + "grad_norm": 0.14892888069152832, + "learning_rate": 1.8959364622703313e-06, + "loss": 1.7074, + "step": 29802 + }, + { + "epoch": 9.147636586863106, + "grad_norm": 0.13839972019195557, + "learning_rate": 1.8945809153841031e-06, + "loss": 1.7107, + "step": 29803 + }, + { + "epoch": 9.147943523634131, + "grad_norm": 0.14454330503940582, + "learning_rate": 1.893225843903823e-06, + "loss": 1.7082, + "step": 29804 + }, + { + "epoch": 9.148250460405157, + "grad_norm": 0.16385792195796967, + "learning_rate": 1.8918712478428857e-06, + "loss": 1.7244, + "step": 29805 + }, + { + "epoch": 9.148557397176182, + "grad_norm": 0.17219752073287964, + "learning_rate": 1.8905171272146694e-06, + "loss": 1.7198, + "step": 29806 + }, + { + "epoch": 9.148864333947207, + "grad_norm": 0.1170208603143692, + "learning_rate": 1.8891634820325633e-06, + "loss": 1.6718, + "step": 29807 + }, + { + "epoch": 9.149171270718233, + "grad_norm": 0.13478094339370728, + "learning_rate": 1.8878103123099345e-06, + "loss": 1.6929, + "step": 29808 + }, + { + "epoch": 9.149478207489258, + "grad_norm": 0.15352758765220642, + "learning_rate": 1.8864576180601722e-06, + "loss": 1.6868, + "step": 29809 + }, + { + "epoch": 9.149785144260282, + "grad_norm": 0.253282368183136, + "learning_rate": 
1.8851053992966273e-06, + "loss": 1.7221, + "step": 29810 + }, + { + "epoch": 9.150092081031307, + "grad_norm": 0.22674274444580078, + "learning_rate": 1.8837536560326829e-06, + "loss": 1.7743, + "step": 29811 + }, + { + "epoch": 9.150399017802332, + "grad_norm": 0.15920962393283844, + "learning_rate": 1.8824023882816788e-06, + "loss": 1.6974, + "step": 29812 + }, + { + "epoch": 9.150705954573358, + "grad_norm": 0.19905294477939606, + "learning_rate": 1.8810515960569819e-06, + "loss": 1.7343, + "step": 29813 + }, + { + "epoch": 9.151012891344383, + "grad_norm": 0.16976027190685272, + "learning_rate": 1.8797012793719316e-06, + "loss": 1.7161, + "step": 29814 + }, + { + "epoch": 9.151319828115408, + "grad_norm": 0.1899489164352417, + "learning_rate": 1.8783514382398837e-06, + "loss": 1.7435, + "step": 29815 + }, + { + "epoch": 9.151626764886434, + "grad_norm": 0.1622486710548401, + "learning_rate": 1.8770020726741722e-06, + "loss": 1.7077, + "step": 29816 + }, + { + "epoch": 9.151933701657459, + "grad_norm": 0.16020219027996063, + "learning_rate": 1.8756531826881197e-06, + "loss": 1.7145, + "step": 29817 + }, + { + "epoch": 9.152240638428484, + "grad_norm": 0.14600935578346252, + "learning_rate": 1.874304768295082e-06, + "loss": 1.6632, + "step": 29818 + }, + { + "epoch": 9.15254757519951, + "grad_norm": 0.15835213661193848, + "learning_rate": 1.8729568295083656e-06, + "loss": 1.7302, + "step": 29819 + }, + { + "epoch": 9.152854511970535, + "grad_norm": 0.1220908835530281, + "learning_rate": 1.871609366341298e-06, + "loss": 1.6872, + "step": 29820 + }, + { + "epoch": 9.153161448741558, + "grad_norm": 0.1540200263261795, + "learning_rate": 1.8702623788072027e-06, + "loss": 1.7784, + "step": 29821 + }, + { + "epoch": 9.153468385512584, + "grad_norm": 0.1889277845621109, + "learning_rate": 1.8689158669193795e-06, + "loss": 1.7541, + "step": 29822 + }, + { + "epoch": 9.15377532228361, + "grad_norm": 0.14910344779491425, + "learning_rate": 1.8675698306911348e-06, + 
"loss": 1.6919, + "step": 29823 + }, + { + "epoch": 9.154082259054634, + "grad_norm": 0.15983067452907562, + "learning_rate": 1.8662242701357857e-06, + "loss": 1.6909, + "step": 29824 + }, + { + "epoch": 9.15438919582566, + "grad_norm": 0.13017424941062927, + "learning_rate": 1.8648791852666103e-06, + "loss": 1.7112, + "step": 29825 + }, + { + "epoch": 9.154696132596685, + "grad_norm": 0.15777593851089478, + "learning_rate": 1.8635345760969204e-06, + "loss": 1.7162, + "step": 29826 + }, + { + "epoch": 9.15500306936771, + "grad_norm": 0.18125833570957184, + "learning_rate": 1.8621904426399882e-06, + "loss": 1.7825, + "step": 29827 + }, + { + "epoch": 9.155310006138736, + "grad_norm": 0.182005375623703, + "learning_rate": 1.8608467849091149e-06, + "loss": 1.7526, + "step": 29828 + }, + { + "epoch": 9.155616942909761, + "grad_norm": 0.11626452207565308, + "learning_rate": 1.8595036029175562e-06, + "loss": 1.6564, + "step": 29829 + }, + { + "epoch": 9.155923879680786, + "grad_norm": 0.15099090337753296, + "learning_rate": 1.8581608966786069e-06, + "loss": 1.6712, + "step": 29830 + }, + { + "epoch": 9.15623081645181, + "grad_norm": 0.16302652657032013, + "learning_rate": 1.8568186662055286e-06, + "loss": 1.699, + "step": 29831 + }, + { + "epoch": 9.156537753222835, + "grad_norm": 0.14438454806804657, + "learning_rate": 1.8554769115115834e-06, + "loss": 1.6718, + "step": 29832 + }, + { + "epoch": 9.15684468999386, + "grad_norm": 0.13968545198440552, + "learning_rate": 1.8541356326100433e-06, + "loss": 1.6846, + "step": 29833 + }, + { + "epoch": 9.157151626764886, + "grad_norm": 0.13068513572216034, + "learning_rate": 1.8527948295141372e-06, + "loss": 1.6761, + "step": 29834 + }, + { + "epoch": 9.157458563535911, + "grad_norm": 0.14249193668365479, + "learning_rate": 1.8514545022371433e-06, + "loss": 1.6798, + "step": 29835 + }, + { + "epoch": 9.157765500306937, + "grad_norm": 0.1251843273639679, + "learning_rate": 1.850114650792295e-06, + "loss": 1.6956, + "step": 29836 
+ }, + { + "epoch": 9.158072437077962, + "grad_norm": 0.1275821328163147, + "learning_rate": 1.8487752751928323e-06, + "loss": 1.6795, + "step": 29837 + }, + { + "epoch": 9.158379373848987, + "grad_norm": 0.21461836993694305, + "learning_rate": 1.8474363754519997e-06, + "loss": 1.7701, + "step": 29838 + }, + { + "epoch": 9.158686310620013, + "grad_norm": 0.11155051738023758, + "learning_rate": 1.84609795158302e-06, + "loss": 1.6207, + "step": 29839 + }, + { + "epoch": 9.158993247391038, + "grad_norm": 0.10695862770080566, + "learning_rate": 1.8447600035991275e-06, + "loss": 1.6756, + "step": 29840 + }, + { + "epoch": 9.159300184162063, + "grad_norm": 0.13841044902801514, + "learning_rate": 1.843422531513539e-06, + "loss": 1.7111, + "step": 29841 + }, + { + "epoch": 9.159607120933087, + "grad_norm": 0.14116619527339935, + "learning_rate": 1.8420855353394718e-06, + "loss": 1.653, + "step": 29842 + }, + { + "epoch": 9.159914057704112, + "grad_norm": 0.16781140863895416, + "learning_rate": 1.8407490150901485e-06, + "loss": 1.7357, + "step": 29843 + }, + { + "epoch": 9.160220994475138, + "grad_norm": 0.21595926582813263, + "learning_rate": 1.8394129707787589e-06, + "loss": 1.7183, + "step": 29844 + }, + { + "epoch": 9.160527931246163, + "grad_norm": 0.14073456823825836, + "learning_rate": 1.838077402418531e-06, + "loss": 1.6756, + "step": 29845 + }, + { + "epoch": 9.160834868017188, + "grad_norm": 0.15962691605091095, + "learning_rate": 1.8367423100226377e-06, + "loss": 1.7247, + "step": 29846 + }, + { + "epoch": 9.161141804788214, + "grad_norm": 0.17450691759586334, + "learning_rate": 1.8354076936043018e-06, + "loss": 1.7286, + "step": 29847 + }, + { + "epoch": 9.161448741559239, + "grad_norm": 0.13126425445079803, + "learning_rate": 1.834073553176685e-06, + "loss": 1.6559, + "step": 29848 + }, + { + "epoch": 9.161755678330264, + "grad_norm": 0.14806927740573883, + "learning_rate": 1.8327398887529878e-06, + "loss": 1.7202, + "step": 29849 + }, + { + "epoch": 
9.16206261510129, + "grad_norm": 0.17844204604625702, + "learning_rate": 1.8314067003463942e-06, + "loss": 1.6669, + "step": 29850 + }, + { + "epoch": 9.162369551872315, + "grad_norm": 0.14012929797172546, + "learning_rate": 1.83007398797006e-06, + "loss": 1.6912, + "step": 29851 + }, + { + "epoch": 9.16267648864334, + "grad_norm": 0.1496121734380722, + "learning_rate": 1.8287417516371751e-06, + "loss": 1.7157, + "step": 29852 + }, + { + "epoch": 9.162983425414364, + "grad_norm": 0.1662236452102661, + "learning_rate": 1.8274099913608955e-06, + "loss": 1.6586, + "step": 29853 + }, + { + "epoch": 9.16329036218539, + "grad_norm": 0.14133767783641815, + "learning_rate": 1.8260787071543884e-06, + "loss": 1.7107, + "step": 29854 + }, + { + "epoch": 9.163597298956415, + "grad_norm": 0.2213003784418106, + "learning_rate": 1.8247478990308041e-06, + "loss": 1.6686, + "step": 29855 + }, + { + "epoch": 9.16390423572744, + "grad_norm": 0.14967088401317596, + "learning_rate": 1.8234175670032993e-06, + "loss": 1.7537, + "step": 29856 + }, + { + "epoch": 9.164211172498465, + "grad_norm": 0.1617511510848999, + "learning_rate": 1.8220877110850187e-06, + "loss": 1.7581, + "step": 29857 + }, + { + "epoch": 9.16451810926949, + "grad_norm": 0.15769065916538239, + "learning_rate": 1.8207583312891075e-06, + "loss": 1.7908, + "step": 29858 + }, + { + "epoch": 9.164825046040516, + "grad_norm": 0.14482183754444122, + "learning_rate": 1.8194294276286994e-06, + "loss": 1.6987, + "step": 29859 + }, + { + "epoch": 9.165131982811541, + "grad_norm": 0.19343525171279907, + "learning_rate": 1.8181010001169285e-06, + "loss": 1.7083, + "step": 29860 + }, + { + "epoch": 9.165438919582567, + "grad_norm": 0.16402462124824524, + "learning_rate": 1.8167730487669176e-06, + "loss": 1.7532, + "step": 29861 + }, + { + "epoch": 9.165745856353592, + "grad_norm": 0.13917924463748932, + "learning_rate": 1.8154455735918118e-06, + "loss": 1.7184, + "step": 29862 + }, + { + "epoch": 9.166052793124617, + "grad_norm": 
0.12260928750038147, + "learning_rate": 1.8141185746047006e-06, + "loss": 1.6783, + "step": 29863 + }, + { + "epoch": 9.16635972989564, + "grad_norm": 0.1644967645406723, + "learning_rate": 1.8127920518187235e-06, + "loss": 1.7101, + "step": 29864 + }, + { + "epoch": 9.166666666666666, + "grad_norm": 0.14527414739131927, + "learning_rate": 1.8114660052469645e-06, + "loss": 1.7104, + "step": 29865 + }, + { + "epoch": 9.166973603437691, + "grad_norm": 0.10901240259408951, + "learning_rate": 1.810140434902552e-06, + "loss": 1.6169, + "step": 29866 + }, + { + "epoch": 9.167280540208717, + "grad_norm": 0.135842964053154, + "learning_rate": 1.8088153407985809e-06, + "loss": 1.7481, + "step": 29867 + }, + { + "epoch": 9.167587476979742, + "grad_norm": 0.15822531282901764, + "learning_rate": 1.8074907229481298e-06, + "loss": 1.744, + "step": 29868 + }, + { + "epoch": 9.167894413750767, + "grad_norm": 0.1288236677646637, + "learning_rate": 1.8061665813643158e-06, + "loss": 1.7064, + "step": 29869 + }, + { + "epoch": 9.168201350521793, + "grad_norm": 0.15224573016166687, + "learning_rate": 1.8048429160602009e-06, + "loss": 1.7302, + "step": 29870 + }, + { + "epoch": 9.168508287292818, + "grad_norm": 0.16641436517238617, + "learning_rate": 1.8035197270488802e-06, + "loss": 1.7236, + "step": 29871 + }, + { + "epoch": 9.168815224063843, + "grad_norm": 0.15526805818080902, + "learning_rate": 1.8021970143434264e-06, + "loss": 1.7224, + "step": 29872 + }, + { + "epoch": 9.169122160834869, + "grad_norm": 0.15454156696796417, + "learning_rate": 1.8008747779569125e-06, + "loss": 1.721, + "step": 29873 + }, + { + "epoch": 9.169429097605892, + "grad_norm": 0.20796442031860352, + "learning_rate": 1.7995530179024001e-06, + "loss": 1.775, + "step": 29874 + }, + { + "epoch": 9.169736034376918, + "grad_norm": 0.186804860830307, + "learning_rate": 1.7982317341929623e-06, + "loss": 1.7164, + "step": 29875 + }, + { + "epoch": 9.170042971147943, + "grad_norm": 0.16180850565433502, + 
"learning_rate": 1.796910926841644e-06, + "loss": 1.7413, + "step": 29876 + }, + { + "epoch": 9.170349907918968, + "grad_norm": 0.15675058960914612, + "learning_rate": 1.7955905958615071e-06, + "loss": 1.7242, + "step": 29877 + }, + { + "epoch": 9.170656844689994, + "grad_norm": 0.13031265139579773, + "learning_rate": 1.794270741265597e-06, + "loss": 1.7362, + "step": 29878 + }, + { + "epoch": 9.170963781461019, + "grad_norm": 0.16068242490291595, + "learning_rate": 1.7929513630669636e-06, + "loss": 1.7262, + "step": 29879 + }, + { + "epoch": 9.171270718232044, + "grad_norm": 0.11941052973270416, + "learning_rate": 1.791632461278625e-06, + "loss": 1.6661, + "step": 29880 + }, + { + "epoch": 9.17157765500307, + "grad_norm": 0.1486428678035736, + "learning_rate": 1.7903140359136483e-06, + "loss": 1.7032, + "step": 29881 + }, + { + "epoch": 9.171884591774095, + "grad_norm": 0.1541515737771988, + "learning_rate": 1.7889960869850342e-06, + "loss": 1.7176, + "step": 29882 + }, + { + "epoch": 9.17219152854512, + "grad_norm": 0.17397575080394745, + "learning_rate": 1.7876786145058167e-06, + "loss": 1.7475, + "step": 29883 + }, + { + "epoch": 9.172498465316146, + "grad_norm": 0.15537402033805847, + "learning_rate": 1.7863616184890297e-06, + "loss": 1.6786, + "step": 29884 + }, + { + "epoch": 9.17280540208717, + "grad_norm": 0.20951804518699646, + "learning_rate": 1.785045098947663e-06, + "loss": 1.8126, + "step": 29885 + }, + { + "epoch": 9.173112338858195, + "grad_norm": 0.1401960551738739, + "learning_rate": 1.7837290558947506e-06, + "loss": 1.7059, + "step": 29886 + }, + { + "epoch": 9.17341927562922, + "grad_norm": 0.13450580835342407, + "learning_rate": 1.7824134893432764e-06, + "loss": 1.6921, + "step": 29887 + }, + { + "epoch": 9.173726212400245, + "grad_norm": 0.12671135365962982, + "learning_rate": 1.7810983993062579e-06, + "loss": 1.7248, + "step": 29888 + }, + { + "epoch": 9.17403314917127, + "grad_norm": 0.13940559327602386, + "learning_rate": 
1.7797837857966904e-06, + "loss": 1.6749, + "step": 29889 + }, + { + "epoch": 9.174340085942296, + "grad_norm": 0.13822492957115173, + "learning_rate": 1.7784696488275576e-06, + "loss": 1.6912, + "step": 29890 + }, + { + "epoch": 9.174647022713321, + "grad_norm": 0.1438322365283966, + "learning_rate": 1.7771559884118549e-06, + "loss": 1.6909, + "step": 29891 + }, + { + "epoch": 9.174953959484347, + "grad_norm": 0.13645079731941223, + "learning_rate": 1.7758428045625608e-06, + "loss": 1.6932, + "step": 29892 + }, + { + "epoch": 9.175260896255372, + "grad_norm": 0.16978910565376282, + "learning_rate": 1.7745300972926538e-06, + "loss": 1.7474, + "step": 29893 + }, + { + "epoch": 9.175567833026397, + "grad_norm": 0.1612422913312912, + "learning_rate": 1.7732178666151067e-06, + "loss": 1.7152, + "step": 29894 + }, + { + "epoch": 9.175874769797423, + "grad_norm": 0.20364105701446533, + "learning_rate": 1.7719061125428815e-06, + "loss": 1.8032, + "step": 29895 + }, + { + "epoch": 9.176181706568446, + "grad_norm": 0.1400647908449173, + "learning_rate": 1.7705948350889567e-06, + "loss": 1.6878, + "step": 29896 + }, + { + "epoch": 9.176488643339471, + "grad_norm": 0.17033728957176208, + "learning_rate": 1.769284034266272e-06, + "loss": 1.7088, + "step": 29897 + }, + { + "epoch": 9.176795580110497, + "grad_norm": 0.1421220600605011, + "learning_rate": 1.7679737100878002e-06, + "loss": 1.6616, + "step": 29898 + }, + { + "epoch": 9.177102516881522, + "grad_norm": 0.16700543463230133, + "learning_rate": 1.76666386256647e-06, + "loss": 1.6915, + "step": 29899 + }, + { + "epoch": 9.177409453652547, + "grad_norm": 0.11176354438066483, + "learning_rate": 1.7653544917152487e-06, + "loss": 1.6733, + "step": 29900 + }, + { + "epoch": 9.177716390423573, + "grad_norm": 0.1324780434370041, + "learning_rate": 1.7640455975470648e-06, + "loss": 1.7397, + "step": 29901 + }, + { + "epoch": 9.178023327194598, + "grad_norm": 0.19537372887134552, + "learning_rate": 1.762737180074847e-06, + 
"loss": 1.7479, + "step": 29902 + }, + { + "epoch": 9.178330263965623, + "grad_norm": 0.1455310732126236, + "learning_rate": 1.7614292393115462e-06, + "loss": 1.6603, + "step": 29903 + }, + { + "epoch": 9.178637200736649, + "grad_norm": 0.15979693830013275, + "learning_rate": 1.7601217752700627e-06, + "loss": 1.7247, + "step": 29904 + }, + { + "epoch": 9.178944137507674, + "grad_norm": 0.1877484917640686, + "learning_rate": 1.7588147879633365e-06, + "loss": 1.6787, + "step": 29905 + }, + { + "epoch": 9.1792510742787, + "grad_norm": 0.1619114726781845, + "learning_rate": 1.757508277404274e-06, + "loss": 1.7523, + "step": 29906 + }, + { + "epoch": 9.179558011049723, + "grad_norm": 0.19995933771133423, + "learning_rate": 1.7562022436057922e-06, + "loss": 1.7835, + "step": 29907 + }, + { + "epoch": 9.179864947820748, + "grad_norm": 0.17540034651756287, + "learning_rate": 1.7548966865807982e-06, + "loss": 1.7116, + "step": 29908 + }, + { + "epoch": 9.180171884591774, + "grad_norm": 0.1773085743188858, + "learning_rate": 1.753591606342192e-06, + "loss": 1.7527, + "step": 29909 + }, + { + "epoch": 9.180478821362799, + "grad_norm": 0.18704703450202942, + "learning_rate": 1.7522870029028694e-06, + "loss": 1.7245, + "step": 29910 + }, + { + "epoch": 9.180785758133824, + "grad_norm": 0.12332191318273544, + "learning_rate": 1.7509828762757253e-06, + "loss": 1.6869, + "step": 29911 + }, + { + "epoch": 9.18109269490485, + "grad_norm": 0.16095921397209167, + "learning_rate": 1.7496792264736439e-06, + "loss": 1.7862, + "step": 29912 + }, + { + "epoch": 9.181399631675875, + "grad_norm": 0.1321704238653183, + "learning_rate": 1.7483760535095262e-06, + "loss": 1.6952, + "step": 29913 + }, + { + "epoch": 9.1817065684469, + "grad_norm": 0.14660334587097168, + "learning_rate": 1.7470733573962227e-06, + "loss": 1.7295, + "step": 29914 + }, + { + "epoch": 9.182013505217926, + "grad_norm": 0.18334107100963593, + "learning_rate": 1.7457711381466345e-06, + "loss": 1.7875, + "step": 29915 + 
}, + { + "epoch": 9.182320441988951, + "grad_norm": 0.13693606853485107, + "learning_rate": 1.7444693957736069e-06, + "loss": 1.6882, + "step": 29916 + }, + { + "epoch": 9.182627378759975, + "grad_norm": 0.1939692199230194, + "learning_rate": 1.7431681302900238e-06, + "loss": 1.7296, + "step": 29917 + }, + { + "epoch": 9.182934315531, + "grad_norm": 0.219837948679924, + "learning_rate": 1.7418673417087417e-06, + "loss": 1.7595, + "step": 29918 + }, + { + "epoch": 9.183241252302025, + "grad_norm": 0.1344659924507141, + "learning_rate": 1.7405670300426002e-06, + "loss": 1.707, + "step": 29919 + }, + { + "epoch": 9.18354818907305, + "grad_norm": 0.1565396636724472, + "learning_rate": 1.7392671953044725e-06, + "loss": 1.7312, + "step": 29920 + }, + { + "epoch": 9.183855125844076, + "grad_norm": 0.1617916077375412, + "learning_rate": 1.7379678375071818e-06, + "loss": 1.6859, + "step": 29921 + }, + { + "epoch": 9.184162062615101, + "grad_norm": 0.26025474071502686, + "learning_rate": 1.7366689566635841e-06, + "loss": 1.6916, + "step": 29922 + }, + { + "epoch": 9.184468999386127, + "grad_norm": 0.10923932492733002, + "learning_rate": 1.7353705527865138e-06, + "loss": 1.6574, + "step": 29923 + }, + { + "epoch": 9.184775936157152, + "grad_norm": 0.13846524059772491, + "learning_rate": 1.7340726258887997e-06, + "loss": 1.7057, + "step": 29924 + }, + { + "epoch": 9.185082872928177, + "grad_norm": 0.16603818535804749, + "learning_rate": 1.73277517598327e-06, + "loss": 1.6955, + "step": 29925 + }, + { + "epoch": 9.185389809699203, + "grad_norm": 0.14902694523334503, + "learning_rate": 1.731478203082748e-06, + "loss": 1.6999, + "step": 29926 + }, + { + "epoch": 9.185696746470228, + "grad_norm": 0.12260756641626358, + "learning_rate": 1.7301817072000459e-06, + "loss": 1.7097, + "step": 29927 + }, + { + "epoch": 9.186003683241251, + "grad_norm": 0.1545649915933609, + "learning_rate": 1.7288856883479809e-06, + "loss": 1.6913, + "step": 29928 + }, + { + "epoch": 9.186310620012277, + 
"grad_norm": 0.1564372181892395, + "learning_rate": 1.7275901465393595e-06, + "loss": 1.7428, + "step": 29929 + }, + { + "epoch": 9.186617556783302, + "grad_norm": 0.14948883652687073, + "learning_rate": 1.726295081786994e-06, + "loss": 1.6928, + "step": 29930 + }, + { + "epoch": 9.186924493554327, + "grad_norm": 0.19552940130233765, + "learning_rate": 1.7250004941036568e-06, + "loss": 1.7277, + "step": 29931 + }, + { + "epoch": 9.187231430325353, + "grad_norm": 0.13902166485786438, + "learning_rate": 1.7237063835021771e-06, + "loss": 1.7208, + "step": 29932 + }, + { + "epoch": 9.187538367096378, + "grad_norm": 0.13597513735294342, + "learning_rate": 1.7224127499953169e-06, + "loss": 1.712, + "step": 29933 + }, + { + "epoch": 9.187845303867404, + "grad_norm": 0.14096584916114807, + "learning_rate": 1.7211195935958713e-06, + "loss": 1.6927, + "step": 29934 + }, + { + "epoch": 9.188152240638429, + "grad_norm": 0.1446818709373474, + "learning_rate": 1.71982691431663e-06, + "loss": 1.6898, + "step": 29935 + }, + { + "epoch": 9.188459177409454, + "grad_norm": 0.12654201686382294, + "learning_rate": 1.7185347121703388e-06, + "loss": 1.675, + "step": 29936 + }, + { + "epoch": 9.18876611418048, + "grad_norm": 0.18681016564369202, + "learning_rate": 1.7172429871698037e-06, + "loss": 1.7206, + "step": 29937 + }, + { + "epoch": 9.189073050951505, + "grad_norm": 0.10353434830904007, + "learning_rate": 1.715951739327759e-06, + "loss": 1.6492, + "step": 29938 + }, + { + "epoch": 9.189379987722528, + "grad_norm": 0.16447822749614716, + "learning_rate": 1.7146609686569837e-06, + "loss": 1.7189, + "step": 29939 + }, + { + "epoch": 9.189686924493554, + "grad_norm": 0.159690260887146, + "learning_rate": 1.713370675170234e-06, + "loss": 1.7335, + "step": 29940 + }, + { + "epoch": 9.189993861264579, + "grad_norm": 0.17329075932502747, + "learning_rate": 1.7120808588802495e-06, + "loss": 1.7113, + "step": 29941 + }, + { + "epoch": 9.190300798035604, + "grad_norm": 0.12317316979169846, + 
"learning_rate": 1.7107915197997925e-06, + "loss": 1.7149, + "step": 29942 + }, + { + "epoch": 9.19060773480663, + "grad_norm": 0.2204972505569458, + "learning_rate": 1.7095026579415918e-06, + "loss": 1.7845, + "step": 29943 + }, + { + "epoch": 9.190914671577655, + "grad_norm": 0.13796095550060272, + "learning_rate": 1.7082142733183925e-06, + "loss": 1.7121, + "step": 29944 + }, + { + "epoch": 9.19122160834868, + "grad_norm": 0.14287333190441132, + "learning_rate": 1.7069263659429236e-06, + "loss": 1.7026, + "step": 29945 + }, + { + "epoch": 9.191528545119706, + "grad_norm": 0.19072957336902618, + "learning_rate": 1.705638935827908e-06, + "loss": 1.7604, + "step": 29946 + }, + { + "epoch": 9.191835481890731, + "grad_norm": 0.19318242371082306, + "learning_rate": 1.7043519829860855e-06, + "loss": 1.7107, + "step": 29947 + }, + { + "epoch": 9.192142418661756, + "grad_norm": 0.1858752965927124, + "learning_rate": 1.7030655074301517e-06, + "loss": 1.7408, + "step": 29948 + }, + { + "epoch": 9.192449355432782, + "grad_norm": 0.17308852076530457, + "learning_rate": 1.701779509172846e-06, + "loss": 1.6848, + "step": 29949 + }, + { + "epoch": 9.192756292203805, + "grad_norm": 0.12158332020044327, + "learning_rate": 1.7004939882268478e-06, + "loss": 1.6964, + "step": 29950 + }, + { + "epoch": 9.19306322897483, + "grad_norm": 0.12801475822925568, + "learning_rate": 1.6992089446048908e-06, + "loss": 1.6643, + "step": 29951 + }, + { + "epoch": 9.193370165745856, + "grad_norm": 0.13018257915973663, + "learning_rate": 1.6979243783196596e-06, + "loss": 1.6741, + "step": 29952 + }, + { + "epoch": 9.193677102516881, + "grad_norm": 0.1402437686920166, + "learning_rate": 1.696640289383844e-06, + "loss": 1.737, + "step": 29953 + }, + { + "epoch": 9.193984039287907, + "grad_norm": 0.15448710322380066, + "learning_rate": 1.6953566778101448e-06, + "loss": 1.7147, + "step": 29954 + }, + { + "epoch": 9.194290976058932, + "grad_norm": 0.19089701771736145, + "learning_rate": 
1.6940735436112409e-06, + "loss": 1.7047, + "step": 29955 + }, + { + "epoch": 9.194597912829957, + "grad_norm": 0.13311919569969177, + "learning_rate": 1.692790886799811e-06, + "loss": 1.6698, + "step": 29956 + }, + { + "epoch": 9.194904849600983, + "grad_norm": 0.14337676763534546, + "learning_rate": 1.691508707388545e-06, + "loss": 1.7124, + "step": 29957 + }, + { + "epoch": 9.195211786372008, + "grad_norm": 0.15666979551315308, + "learning_rate": 1.6902270053900993e-06, + "loss": 1.6884, + "step": 29958 + }, + { + "epoch": 9.195518723143033, + "grad_norm": 0.15445134043693542, + "learning_rate": 1.6889457808171472e-06, + "loss": 1.7395, + "step": 29959 + }, + { + "epoch": 9.195825659914057, + "grad_norm": 0.1683775633573532, + "learning_rate": 1.6876650336823452e-06, + "loss": 1.7808, + "step": 29960 + }, + { + "epoch": 9.196132596685082, + "grad_norm": 0.2521384060382843, + "learning_rate": 1.686384763998361e-06, + "loss": 1.7684, + "step": 29961 + }, + { + "epoch": 9.196439533456108, + "grad_norm": 0.15807218849658966, + "learning_rate": 1.6851049717778345e-06, + "loss": 1.7253, + "step": 29962 + }, + { + "epoch": 9.196746470227133, + "grad_norm": 0.18106147646903992, + "learning_rate": 1.683825657033411e-06, + "loss": 1.773, + "step": 29963 + }, + { + "epoch": 9.197053406998158, + "grad_norm": 0.14914186298847198, + "learning_rate": 1.6825468197777582e-06, + "loss": 1.6628, + "step": 29964 + }, + { + "epoch": 9.197360343769184, + "grad_norm": 0.12124781310558319, + "learning_rate": 1.681268460023483e-06, + "loss": 1.6634, + "step": 29965 + }, + { + "epoch": 9.197667280540209, + "grad_norm": 0.15450555086135864, + "learning_rate": 1.679990577783247e-06, + "loss": 1.741, + "step": 29966 + }, + { + "epoch": 9.197974217311234, + "grad_norm": 0.21389459073543549, + "learning_rate": 1.678713173069657e-06, + "loss": 1.7145, + "step": 29967 + }, + { + "epoch": 9.19828115408226, + "grad_norm": 0.1850728541612625, + "learning_rate": 1.6774362458953474e-06, + "loss": 
1.7674, + "step": 29968 + }, + { + "epoch": 9.198588090853285, + "grad_norm": 0.160726860165596, + "learning_rate": 1.6761597962729413e-06, + "loss": 1.7598, + "step": 29969 + }, + { + "epoch": 9.19889502762431, + "grad_norm": 0.15501825511455536, + "learning_rate": 1.6748838242150344e-06, + "loss": 1.7443, + "step": 29970 + }, + { + "epoch": 9.199201964395334, + "grad_norm": 0.17127695679664612, + "learning_rate": 1.6736083297342609e-06, + "loss": 1.7289, + "step": 29971 + }, + { + "epoch": 9.199508901166359, + "grad_norm": 0.13027416169643402, + "learning_rate": 1.672333312843205e-06, + "loss": 1.6673, + "step": 29972 + }, + { + "epoch": 9.199815837937384, + "grad_norm": 0.16939190030097961, + "learning_rate": 1.6710587735544847e-06, + "loss": 1.7582, + "step": 29973 + }, + { + "epoch": 9.20012277470841, + "grad_norm": 0.19931311905384064, + "learning_rate": 1.6697847118806898e-06, + "loss": 1.7894, + "step": 29974 + }, + { + "epoch": 9.200429711479435, + "grad_norm": 0.16785076260566711, + "learning_rate": 1.6685111278344045e-06, + "loss": 1.7051, + "step": 29975 + }, + { + "epoch": 9.20073664825046, + "grad_norm": 0.18373487889766693, + "learning_rate": 1.667238021428219e-06, + "loss": 1.7078, + "step": 29976 + }, + { + "epoch": 9.201043585021486, + "grad_norm": 0.1502874493598938, + "learning_rate": 1.6659653926747232e-06, + "loss": 1.702, + "step": 29977 + }, + { + "epoch": 9.201350521792511, + "grad_norm": 0.17113728821277618, + "learning_rate": 1.6646932415864791e-06, + "loss": 1.6796, + "step": 29978 + }, + { + "epoch": 9.201657458563536, + "grad_norm": 0.14872509241104126, + "learning_rate": 1.6634215681760712e-06, + "loss": 1.6883, + "step": 29979 + }, + { + "epoch": 9.201964395334562, + "grad_norm": 0.14375372231006622, + "learning_rate": 1.662150372456056e-06, + "loss": 1.6833, + "step": 29980 + }, + { + "epoch": 9.202271332105587, + "grad_norm": 0.20072759687900543, + "learning_rate": 1.6608796544390127e-06, + "loss": 1.7408, + "step": 29981 + }, + { 
+ "epoch": 9.20257826887661, + "grad_norm": 0.14475533366203308, + "learning_rate": 1.6596094141374807e-06, + "loss": 1.7138, + "step": 29982 + }, + { + "epoch": 9.202885205647636, + "grad_norm": 0.16516630351543427, + "learning_rate": 1.6583396515640338e-06, + "loss": 1.7765, + "step": 29983 + }, + { + "epoch": 9.203192142418661, + "grad_norm": 0.1530120074748993, + "learning_rate": 1.6570703667311894e-06, + "loss": 1.7047, + "step": 29984 + }, + { + "epoch": 9.203499079189687, + "grad_norm": 0.14001020789146423, + "learning_rate": 1.655801559651521e-06, + "loss": 1.691, + "step": 29985 + }, + { + "epoch": 9.203806015960712, + "grad_norm": 0.15876981616020203, + "learning_rate": 1.6545332303375626e-06, + "loss": 1.7238, + "step": 29986 + }, + { + "epoch": 9.204112952731737, + "grad_norm": 0.1669185608625412, + "learning_rate": 1.6532653788018326e-06, + "loss": 1.7345, + "step": 29987 + }, + { + "epoch": 9.204419889502763, + "grad_norm": 0.12812626361846924, + "learning_rate": 1.6519980050568817e-06, + "loss": 1.6792, + "step": 29988 + }, + { + "epoch": 9.204726826273788, + "grad_norm": 0.1336258500814438, + "learning_rate": 1.6507311091152166e-06, + "loss": 1.688, + "step": 29989 + }, + { + "epoch": 9.205033763044813, + "grad_norm": 0.18334448337554932, + "learning_rate": 1.6494646909893663e-06, + "loss": 1.745, + "step": 29990 + }, + { + "epoch": 9.205340699815839, + "grad_norm": 0.1458664983510971, + "learning_rate": 1.6481987506918428e-06, + "loss": 1.6967, + "step": 29991 + }, + { + "epoch": 9.205647636586862, + "grad_norm": 0.13565613329410553, + "learning_rate": 1.646933288235164e-06, + "loss": 1.6649, + "step": 29992 + }, + { + "epoch": 9.205954573357888, + "grad_norm": 0.1161680594086647, + "learning_rate": 1.6456683036318255e-06, + "loss": 1.6838, + "step": 29993 + }, + { + "epoch": 9.206261510128913, + "grad_norm": 0.1749819964170456, + "learning_rate": 1.6444037968943394e-06, + "loss": 1.7567, + "step": 29994 + }, + { + "epoch": 9.206568446899938, + 
"grad_norm": 0.1397893726825714, + "learning_rate": 1.6431397680351957e-06, + "loss": 1.7191, + "step": 29995 + }, + { + "epoch": 9.206875383670964, + "grad_norm": 0.13551786541938782, + "learning_rate": 1.64187621706689e-06, + "loss": 1.6938, + "step": 29996 + }, + { + "epoch": 9.207182320441989, + "grad_norm": 0.13458238542079926, + "learning_rate": 1.6406131440019012e-06, + "loss": 1.6701, + "step": 29997 + }, + { + "epoch": 9.207489257213014, + "grad_norm": 0.14004193246364594, + "learning_rate": 1.6393505488527194e-06, + "loss": 1.6758, + "step": 29998 + }, + { + "epoch": 9.20779619398404, + "grad_norm": 0.1691395789384842, + "learning_rate": 1.6380884316318179e-06, + "loss": 1.704, + "step": 29999 + }, + { + "epoch": 9.208103130755065, + "grad_norm": 0.13417977094650269, + "learning_rate": 1.636826792351681e-06, + "loss": 1.7207, + "step": 30000 + }, + { + "epoch": 9.20841006752609, + "grad_norm": 0.12645697593688965, + "learning_rate": 1.6355656310247658e-06, + "loss": 1.6594, + "step": 30001 + }, + { + "epoch": 9.208717004297116, + "grad_norm": 0.17769555747509003, + "learning_rate": 1.634304947663534e-06, + "loss": 1.7316, + "step": 30002 + }, + { + "epoch": 9.20902394106814, + "grad_norm": 0.12273482233285904, + "learning_rate": 1.633044742280454e-06, + "loss": 1.6724, + "step": 30003 + }, + { + "epoch": 9.209330877839164, + "grad_norm": 0.15213249623775482, + "learning_rate": 1.6317850148879654e-06, + "loss": 1.7555, + "step": 30004 + }, + { + "epoch": 9.20963781461019, + "grad_norm": 0.22034598886966705, + "learning_rate": 1.6305257654985361e-06, + "loss": 1.7395, + "step": 30005 + }, + { + "epoch": 9.209944751381215, + "grad_norm": 0.1581713706254959, + "learning_rate": 1.6292669941245953e-06, + "loss": 1.7504, + "step": 30006 + }, + { + "epoch": 9.21025168815224, + "grad_norm": 0.1384512335062027, + "learning_rate": 1.6280087007785939e-06, + "loss": 1.6991, + "step": 30007 + }, + { + "epoch": 9.210558624923266, + "grad_norm": 0.15608127415180206, + 
"learning_rate": 1.6267508854729608e-06, + "loss": 1.7229, + "step": 30008 + }, + { + "epoch": 9.210865561694291, + "grad_norm": 0.22049592435359955, + "learning_rate": 1.625493548220125e-06, + "loss": 1.7395, + "step": 30009 + }, + { + "epoch": 9.211172498465316, + "grad_norm": 0.13226120173931122, + "learning_rate": 1.6242366890325155e-06, + "loss": 1.6797, + "step": 30010 + }, + { + "epoch": 9.211479435236342, + "grad_norm": 0.17857056856155396, + "learning_rate": 1.6229803079225559e-06, + "loss": 1.7725, + "step": 30011 + }, + { + "epoch": 9.211786372007367, + "grad_norm": 0.14409810304641724, + "learning_rate": 1.6217244049026581e-06, + "loss": 1.6777, + "step": 30012 + }, + { + "epoch": 9.212093308778392, + "grad_norm": 0.15496647357940674, + "learning_rate": 1.6204689799852401e-06, + "loss": 1.7171, + "step": 30013 + }, + { + "epoch": 9.212400245549416, + "grad_norm": 0.1262955516576767, + "learning_rate": 1.6192140331826977e-06, + "loss": 1.7066, + "step": 30014 + }, + { + "epoch": 9.212707182320441, + "grad_norm": 0.14165538549423218, + "learning_rate": 1.6179595645074431e-06, + "loss": 1.7425, + "step": 30015 + }, + { + "epoch": 9.213014119091467, + "grad_norm": 0.1557457596063614, + "learning_rate": 1.6167055739718605e-06, + "loss": 1.7181, + "step": 30016 + }, + { + "epoch": 9.213321055862492, + "grad_norm": 0.13509629666805267, + "learning_rate": 1.6154520615883627e-06, + "loss": 1.6942, + "step": 30017 + }, + { + "epoch": 9.213627992633517, + "grad_norm": 0.14409126341342926, + "learning_rate": 1.614199027369323e-06, + "loss": 1.6949, + "step": 30018 + }, + { + "epoch": 9.213934929404543, + "grad_norm": 0.14323770999908447, + "learning_rate": 1.6129464713271315e-06, + "loss": 1.7007, + "step": 30019 + }, + { + "epoch": 9.214241866175568, + "grad_norm": 0.12424668669700623, + "learning_rate": 1.6116943934741558e-06, + "loss": 1.7118, + "step": 30020 + }, + { + "epoch": 9.214548802946593, + "grad_norm": 0.16182856261730194, + "learning_rate": 
1.6104427938227807e-06, + "loss": 1.7683, + "step": 30021 + }, + { + "epoch": 9.214855739717619, + "grad_norm": 0.136052668094635, + "learning_rate": 1.609191672385374e-06, + "loss": 1.6438, + "step": 30022 + }, + { + "epoch": 9.215162676488644, + "grad_norm": 0.14279018342494965, + "learning_rate": 1.6079410291742924e-06, + "loss": 1.7062, + "step": 30023 + }, + { + "epoch": 9.215469613259668, + "grad_norm": 0.11300359666347504, + "learning_rate": 1.6066908642019097e-06, + "loss": 1.6509, + "step": 30024 + }, + { + "epoch": 9.215776550030693, + "grad_norm": 0.14017970860004425, + "learning_rate": 1.6054411774805655e-06, + "loss": 1.68, + "step": 30025 + }, + { + "epoch": 9.216083486801718, + "grad_norm": 0.12801769375801086, + "learning_rate": 1.604191969022617e-06, + "loss": 1.7377, + "step": 30026 + }, + { + "epoch": 9.216390423572744, + "grad_norm": 0.16302450001239777, + "learning_rate": 1.6029432388404097e-06, + "loss": 1.6966, + "step": 30027 + }, + { + "epoch": 9.216697360343769, + "grad_norm": 0.12138327211141586, + "learning_rate": 1.6016949869462894e-06, + "loss": 1.6836, + "step": 30028 + }, + { + "epoch": 9.217004297114794, + "grad_norm": 0.14843621850013733, + "learning_rate": 1.6004472133525794e-06, + "loss": 1.6891, + "step": 30029 + }, + { + "epoch": 9.21731123388582, + "grad_norm": 0.1426590085029602, + "learning_rate": 1.59919991807162e-06, + "loss": 1.6759, + "step": 30030 + }, + { + "epoch": 9.217618170656845, + "grad_norm": 0.1690209060907364, + "learning_rate": 1.59795310111574e-06, + "loss": 1.7315, + "step": 30031 + }, + { + "epoch": 9.21792510742787, + "grad_norm": 0.1929413378238678, + "learning_rate": 1.596706762497252e-06, + "loss": 1.7137, + "step": 30032 + }, + { + "epoch": 9.218232044198896, + "grad_norm": 0.16534923017024994, + "learning_rate": 1.5954609022284739e-06, + "loss": 1.7599, + "step": 30033 + }, + { + "epoch": 9.218538980969921, + "grad_norm": 0.16535919904708862, + "learning_rate": 1.594215520321729e-06, + "loss": 
1.7358, + "step": 30034 + }, + { + "epoch": 9.218845917740945, + "grad_norm": 0.1476306915283203, + "learning_rate": 1.5929706167893188e-06, + "loss": 1.6952, + "step": 30035 + }, + { + "epoch": 9.21915285451197, + "grad_norm": 0.12421105802059174, + "learning_rate": 1.5917261916435388e-06, + "loss": 1.6731, + "step": 30036 + }, + { + "epoch": 9.219459791282995, + "grad_norm": 0.18759414553642273, + "learning_rate": 1.5904822448967017e-06, + "loss": 1.6516, + "step": 30037 + }, + { + "epoch": 9.21976672805402, + "grad_norm": 0.16421522200107574, + "learning_rate": 1.5892387765610806e-06, + "loss": 1.6702, + "step": 30038 + }, + { + "epoch": 9.220073664825046, + "grad_norm": 0.15226107835769653, + "learning_rate": 1.587995786648988e-06, + "loss": 1.6868, + "step": 30039 + }, + { + "epoch": 9.220380601596071, + "grad_norm": 0.18976561725139618, + "learning_rate": 1.5867532751726865e-06, + "loss": 1.7359, + "step": 30040 + }, + { + "epoch": 9.220687538367097, + "grad_norm": 0.1367981731891632, + "learning_rate": 1.5855112421444774e-06, + "loss": 1.6977, + "step": 30041 + }, + { + "epoch": 9.220994475138122, + "grad_norm": 0.13698583841323853, + "learning_rate": 1.5842696875766116e-06, + "loss": 1.7305, + "step": 30042 + }, + { + "epoch": 9.221301411909147, + "grad_norm": 0.14987944066524506, + "learning_rate": 1.5830286114813742e-06, + "loss": 1.706, + "step": 30043 + }, + { + "epoch": 9.221608348680173, + "grad_norm": 0.1334082931280136, + "learning_rate": 1.5817880138710273e-06, + "loss": 1.6489, + "step": 30044 + }, + { + "epoch": 9.221915285451198, + "grad_norm": 0.27590668201446533, + "learning_rate": 1.580547894757828e-06, + "loss": 1.8041, + "step": 30045 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 0.13377591967582703, + "learning_rate": 1.5793082541540327e-06, + "loss": 1.7251, + "step": 30046 + }, + { + "epoch": 9.222529158993247, + "grad_norm": 0.15182198584079742, + "learning_rate": 1.5780690920718988e-06, + "loss": 1.6932, + "step": 30047 + }, + 
{ + "epoch": 9.222836095764272, + "grad_norm": 0.12374742329120636, + "learning_rate": 1.5768304085236663e-06, + "loss": 1.6808, + "step": 30048 + }, + { + "epoch": 9.223143032535297, + "grad_norm": 0.14800786972045898, + "learning_rate": 1.5755922035215753e-06, + "loss": 1.7341, + "step": 30049 + }, + { + "epoch": 9.223449969306323, + "grad_norm": 0.18947643041610718, + "learning_rate": 1.574354477077855e-06, + "loss": 1.7121, + "step": 30050 + }, + { + "epoch": 9.223756906077348, + "grad_norm": 0.13209564983844757, + "learning_rate": 1.5731172292047625e-06, + "loss": 1.6491, + "step": 30051 + }, + { + "epoch": 9.224063842848373, + "grad_norm": 0.1743779480457306, + "learning_rate": 1.5718804599145043e-06, + "loss": 1.7364, + "step": 30052 + }, + { + "epoch": 9.224370779619399, + "grad_norm": 0.1696232557296753, + "learning_rate": 1.5706441692193096e-06, + "loss": 1.7148, + "step": 30053 + }, + { + "epoch": 9.224677716390424, + "grad_norm": 0.38987866044044495, + "learning_rate": 1.5694083571313912e-06, + "loss": 1.7351, + "step": 30054 + }, + { + "epoch": 9.22498465316145, + "grad_norm": 0.18110236525535583, + "learning_rate": 1.568173023662961e-06, + "loss": 1.7497, + "step": 30055 + }, + { + "epoch": 9.225291589932475, + "grad_norm": 0.11834049224853516, + "learning_rate": 1.566938168826243e-06, + "loss": 1.6824, + "step": 30056 + }, + { + "epoch": 9.225598526703498, + "grad_norm": 0.1685422658920288, + "learning_rate": 1.5657037926334162e-06, + "loss": 1.6938, + "step": 30057 + }, + { + "epoch": 9.225905463474524, + "grad_norm": 0.17743349075317383, + "learning_rate": 1.5644698950967095e-06, + "loss": 1.7747, + "step": 30058 + }, + { + "epoch": 9.226212400245549, + "grad_norm": 0.13532224297523499, + "learning_rate": 1.5632364762282859e-06, + "loss": 1.6867, + "step": 30059 + }, + { + "epoch": 9.226519337016574, + "grad_norm": 0.1925237476825714, + "learning_rate": 1.5620035360403517e-06, + "loss": 1.7638, + "step": 30060 + }, + { + "epoch": 9.2268262737876, + 
"grad_norm": 0.15505637228488922, + "learning_rate": 1.560771074545092e-06, + "loss": 1.6801, + "step": 30061 + }, + { + "epoch": 9.227133210558625, + "grad_norm": 0.15233661234378815, + "learning_rate": 1.559539091754686e-06, + "loss": 1.6854, + "step": 30062 + }, + { + "epoch": 9.22744014732965, + "grad_norm": 0.1538659930229187, + "learning_rate": 1.5583075876813013e-06, + "loss": 1.7107, + "step": 30063 + }, + { + "epoch": 9.227747084100676, + "grad_norm": 0.1162392795085907, + "learning_rate": 1.5570765623371176e-06, + "loss": 1.6593, + "step": 30064 + }, + { + "epoch": 9.228054020871701, + "grad_norm": 0.1888103187084198, + "learning_rate": 1.5558460157342913e-06, + "loss": 1.7406, + "step": 30065 + }, + { + "epoch": 9.228360957642726, + "grad_norm": 0.13712546229362488, + "learning_rate": 1.5546159478849964e-06, + "loss": 1.6892, + "step": 30066 + }, + { + "epoch": 9.22866789441375, + "grad_norm": 0.20172229409217834, + "learning_rate": 1.553386358801373e-06, + "loss": 1.7729, + "step": 30067 + }, + { + "epoch": 9.228974831184775, + "grad_norm": 0.16218116879463196, + "learning_rate": 1.5521572484955893e-06, + "loss": 1.7331, + "step": 30068 + }, + { + "epoch": 9.2292817679558, + "grad_norm": 0.15987847745418549, + "learning_rate": 1.5509286169797798e-06, + "loss": 1.7177, + "step": 30069 + }, + { + "epoch": 9.229588704726826, + "grad_norm": 0.14362195134162903, + "learning_rate": 1.5497004642660907e-06, + "loss": 1.6616, + "step": 30070 + }, + { + "epoch": 9.229895641497851, + "grad_norm": 0.15351802110671997, + "learning_rate": 1.5484727903666618e-06, + "loss": 1.7356, + "step": 30071 + }, + { + "epoch": 9.230202578268877, + "grad_norm": 0.1514216959476471, + "learning_rate": 1.5472455952936116e-06, + "loss": 1.7386, + "step": 30072 + }, + { + "epoch": 9.230509515039902, + "grad_norm": 0.1280907839536667, + "learning_rate": 1.5460188790590967e-06, + "loss": 1.6783, + "step": 30073 + }, + { + "epoch": 9.230816451810927, + "grad_norm": 0.20153765380382538, + 
"learning_rate": 1.544792641675208e-06, + "loss": 1.6954, + "step": 30074 + }, + { + "epoch": 9.231123388581953, + "grad_norm": 0.1277652084827423, + "learning_rate": 1.5435668831540905e-06, + "loss": 1.692, + "step": 30075 + }, + { + "epoch": 9.231430325352978, + "grad_norm": 0.1274770349264145, + "learning_rate": 1.5423416035078408e-06, + "loss": 1.7265, + "step": 30076 + }, + { + "epoch": 9.231737262124003, + "grad_norm": 0.11994244903326035, + "learning_rate": 1.5411168027485712e-06, + "loss": 1.6476, + "step": 30077 + }, + { + "epoch": 9.232044198895027, + "grad_norm": 0.1459321826696396, + "learning_rate": 1.539892480888394e-06, + "loss": 1.6627, + "step": 30078 + }, + { + "epoch": 9.232351135666052, + "grad_norm": 0.15515929460525513, + "learning_rate": 1.5386686379394e-06, + "loss": 1.6974, + "step": 30079 + }, + { + "epoch": 9.232658072437077, + "grad_norm": 0.1805061250925064, + "learning_rate": 1.5374452739136846e-06, + "loss": 1.7336, + "step": 30080 + }, + { + "epoch": 9.232965009208103, + "grad_norm": 0.10603496432304382, + "learning_rate": 1.5362223888233384e-06, + "loss": 1.6526, + "step": 30081 + }, + { + "epoch": 9.233271945979128, + "grad_norm": 0.15579989552497864, + "learning_rate": 1.5349999826804517e-06, + "loss": 1.7093, + "step": 30082 + }, + { + "epoch": 9.233578882750153, + "grad_norm": 0.15068648755550385, + "learning_rate": 1.5337780554971037e-06, + "loss": 1.6486, + "step": 30083 + }, + { + "epoch": 9.233885819521179, + "grad_norm": 0.13521051406860352, + "learning_rate": 1.532556607285357e-06, + "loss": 1.6972, + "step": 30084 + }, + { + "epoch": 9.234192756292204, + "grad_norm": 0.15651237964630127, + "learning_rate": 1.5313356380573074e-06, + "loss": 1.7232, + "step": 30085 + }, + { + "epoch": 9.23449969306323, + "grad_norm": 0.18412761390209198, + "learning_rate": 1.530115147825001e-06, + "loss": 1.7127, + "step": 30086 + }, + { + "epoch": 9.234806629834255, + "grad_norm": 0.13278020918369293, + "learning_rate": 
1.528895136600511e-06, + "loss": 1.6963, + "step": 30087 + }, + { + "epoch": 9.23511356660528, + "grad_norm": 0.126597598195076, + "learning_rate": 1.527675604395884e-06, + "loss": 1.6899, + "step": 30088 + }, + { + "epoch": 9.235420503376304, + "grad_norm": 0.1658754050731659, + "learning_rate": 1.526456551223171e-06, + "loss": 1.6655, + "step": 30089 + }, + { + "epoch": 9.235727440147329, + "grad_norm": 0.2280663400888443, + "learning_rate": 1.5252379770944402e-06, + "loss": 1.7804, + "step": 30090 + }, + { + "epoch": 9.236034376918354, + "grad_norm": 0.15943841636180878, + "learning_rate": 1.5240198820217044e-06, + "loss": 1.7023, + "step": 30091 + }, + { + "epoch": 9.23634131368938, + "grad_norm": 0.12864334881305695, + "learning_rate": 1.5228022660170315e-06, + "loss": 1.6951, + "step": 30092 + }, + { + "epoch": 9.236648250460405, + "grad_norm": 0.11842049658298492, + "learning_rate": 1.5215851290924233e-06, + "loss": 1.6864, + "step": 30093 + }, + { + "epoch": 9.23695518723143, + "grad_norm": 0.11744343489408493, + "learning_rate": 1.5203684712599364e-06, + "loss": 1.6594, + "step": 30094 + }, + { + "epoch": 9.237262124002456, + "grad_norm": 0.15188898146152496, + "learning_rate": 1.5191522925315838e-06, + "loss": 1.7024, + "step": 30095 + }, + { + "epoch": 9.237569060773481, + "grad_norm": 0.16257372498512268, + "learning_rate": 1.517936592919378e-06, + "loss": 1.7568, + "step": 30096 + }, + { + "epoch": 9.237875997544506, + "grad_norm": 0.2373557835817337, + "learning_rate": 1.5167213724353424e-06, + "loss": 1.7712, + "step": 30097 + }, + { + "epoch": 9.238182934315532, + "grad_norm": 0.13525256514549255, + "learning_rate": 1.5155066310914846e-06, + "loss": 1.7128, + "step": 30098 + }, + { + "epoch": 9.238489871086557, + "grad_norm": 0.1386425495147705, + "learning_rate": 1.5142923688998055e-06, + "loss": 1.6655, + "step": 30099 + }, + { + "epoch": 9.23879680785758, + "grad_norm": 0.16497959196567535, + "learning_rate": 1.5130785858723072e-06, + "loss": 
1.7426, + "step": 30100 + }, + { + "epoch": 9.239103744628606, + "grad_norm": 0.13364866375923157, + "learning_rate": 1.51186528202098e-06, + "loss": 1.6917, + "step": 30101 + }, + { + "epoch": 9.239410681399631, + "grad_norm": 0.15585513412952423, + "learning_rate": 1.5106524573578308e-06, + "loss": 1.7263, + "step": 30102 + }, + { + "epoch": 9.239717618170657, + "grad_norm": 0.17002388834953308, + "learning_rate": 1.5094401118948332e-06, + "loss": 1.7525, + "step": 30103 + }, + { + "epoch": 9.240024554941682, + "grad_norm": 0.147446408867836, + "learning_rate": 1.5082282456439666e-06, + "loss": 1.6941, + "step": 30104 + }, + { + "epoch": 9.240331491712707, + "grad_norm": 0.2109186351299286, + "learning_rate": 1.5070168586172106e-06, + "loss": 1.7316, + "step": 30105 + }, + { + "epoch": 9.240638428483733, + "grad_norm": 0.16860739886760712, + "learning_rate": 1.5058059508265276e-06, + "loss": 1.673, + "step": 30106 + }, + { + "epoch": 9.240945365254758, + "grad_norm": 0.16476429998874664, + "learning_rate": 1.504595522283908e-06, + "loss": 1.7225, + "step": 30107 + }, + { + "epoch": 9.241252302025783, + "grad_norm": 0.1818271279335022, + "learning_rate": 1.5033855730012925e-06, + "loss": 1.7372, + "step": 30108 + }, + { + "epoch": 9.241559238796809, + "grad_norm": 0.15022529661655426, + "learning_rate": 1.502176102990649e-06, + "loss": 1.7114, + "step": 30109 + }, + { + "epoch": 9.241866175567832, + "grad_norm": 0.11522844433784485, + "learning_rate": 1.500967112263918e-06, + "loss": 1.6815, + "step": 30110 + }, + { + "epoch": 9.242173112338858, + "grad_norm": 0.16297772526741028, + "learning_rate": 1.4997586008330622e-06, + "loss": 1.6865, + "step": 30111 + }, + { + "epoch": 9.242480049109883, + "grad_norm": 0.14999376237392426, + "learning_rate": 1.4985505687100222e-06, + "loss": 1.741, + "step": 30112 + }, + { + "epoch": 9.242786985880908, + "grad_norm": 0.1419779509305954, + "learning_rate": 1.4973430159067326e-06, + "loss": 1.6859, + "step": 30113 + }, + { + 
"epoch": 9.243093922651934, + "grad_norm": 0.10719183832406998, + "learning_rate": 1.4961359424351228e-06, + "loss": 1.6533, + "step": 30114 + }, + { + "epoch": 9.243400859422959, + "grad_norm": 0.16076189279556274, + "learning_rate": 1.494929348307128e-06, + "loss": 1.6874, + "step": 30115 + }, + { + "epoch": 9.243707796193984, + "grad_norm": 0.18850088119506836, + "learning_rate": 1.4937232335346719e-06, + "loss": 1.7701, + "step": 30116 + }, + { + "epoch": 9.24401473296501, + "grad_norm": 0.12545527517795563, + "learning_rate": 1.4925175981296725e-06, + "loss": 1.6876, + "step": 30117 + }, + { + "epoch": 9.244321669736035, + "grad_norm": 0.13523197174072266, + "learning_rate": 1.4913124421040426e-06, + "loss": 1.6807, + "step": 30118 + }, + { + "epoch": 9.24462860650706, + "grad_norm": 0.1360730528831482, + "learning_rate": 1.490107765469706e-06, + "loss": 1.6776, + "step": 30119 + }, + { + "epoch": 9.244935543278086, + "grad_norm": 0.11223732680082321, + "learning_rate": 1.4889035682385476e-06, + "loss": 1.644, + "step": 30120 + }, + { + "epoch": 9.245242480049109, + "grad_norm": 0.13906998932361603, + "learning_rate": 1.4876998504224804e-06, + "loss": 1.6728, + "step": 30121 + }, + { + "epoch": 9.245549416820134, + "grad_norm": 0.1429383009672165, + "learning_rate": 1.4864966120333946e-06, + "loss": 1.7227, + "step": 30122 + }, + { + "epoch": 9.24585635359116, + "grad_norm": 0.1267431229352951, + "learning_rate": 1.4852938530831806e-06, + "loss": 1.6871, + "step": 30123 + }, + { + "epoch": 9.246163290362185, + "grad_norm": 0.20933954417705536, + "learning_rate": 1.48409157358374e-06, + "loss": 1.7602, + "step": 30124 + }, + { + "epoch": 9.24647022713321, + "grad_norm": 0.15941432118415833, + "learning_rate": 1.4828897735469305e-06, + "loss": 1.7505, + "step": 30125 + }, + { + "epoch": 9.246777163904236, + "grad_norm": 0.16897402703762054, + "learning_rate": 1.4816884529846531e-06, + "loss": 1.831, + "step": 30126 + }, + { + "epoch": 9.247084100675261, + 
"grad_norm": 0.16803954541683197, + "learning_rate": 1.480487611908754e-06, + "loss": 1.7239, + "step": 30127 + }, + { + "epoch": 9.247391037446286, + "grad_norm": 0.11253303289413452, + "learning_rate": 1.479287250331124e-06, + "loss": 1.6937, + "step": 30128 + }, + { + "epoch": 9.247697974217312, + "grad_norm": 0.1583312302827835, + "learning_rate": 1.4780873682636142e-06, + "loss": 1.7121, + "step": 30129 + }, + { + "epoch": 9.248004910988337, + "grad_norm": 0.16783545911312103, + "learning_rate": 1.4768879657180822e-06, + "loss": 1.7107, + "step": 30130 + }, + { + "epoch": 9.248311847759362, + "grad_norm": 0.1669779270887375, + "learning_rate": 1.4756890427063852e-06, + "loss": 1.7255, + "step": 30131 + }, + { + "epoch": 9.248618784530386, + "grad_norm": 0.13612589240074158, + "learning_rate": 1.474490599240369e-06, + "loss": 1.7388, + "step": 30132 + }, + { + "epoch": 9.248925721301411, + "grad_norm": 0.13164813816547394, + "learning_rate": 1.4732926353318798e-06, + "loss": 1.6854, + "step": 30133 + }, + { + "epoch": 9.249232658072437, + "grad_norm": 0.1553371399641037, + "learning_rate": 1.4720951509927582e-06, + "loss": 1.6898, + "step": 30134 + }, + { + "epoch": 9.249539594843462, + "grad_norm": 0.1533356010913849, + "learning_rate": 1.470898146234828e-06, + "loss": 1.6911, + "step": 30135 + }, + { + "epoch": 9.249846531614487, + "grad_norm": 0.14966778457164764, + "learning_rate": 1.4697016210699354e-06, + "loss": 1.7064, + "step": 30136 + }, + { + "epoch": 9.250153468385513, + "grad_norm": 0.10848648101091385, + "learning_rate": 1.4685055755098876e-06, + "loss": 1.646, + "step": 30137 + }, + { + "epoch": 9.250460405156538, + "grad_norm": 0.13840216398239136, + "learning_rate": 1.4673100095665193e-06, + "loss": 1.7318, + "step": 30138 + }, + { + "epoch": 9.250767341927563, + "grad_norm": 0.12550130486488342, + "learning_rate": 1.466114923251638e-06, + "loss": 1.6526, + "step": 30139 + }, + { + "epoch": 9.251074278698589, + "grad_norm": 0.1806049644947052, 
+ "learning_rate": 1.4649203165770454e-06, + "loss": 1.7221, + "step": 30140 + }, + { + "epoch": 9.251381215469614, + "grad_norm": 0.14293114840984344, + "learning_rate": 1.4637261895545763e-06, + "loss": 1.7138, + "step": 30141 + }, + { + "epoch": 9.25168815224064, + "grad_norm": 0.1573718935251236, + "learning_rate": 1.4625325421959935e-06, + "loss": 1.7337, + "step": 30142 + }, + { + "epoch": 9.251995089011663, + "grad_norm": 0.1105809286236763, + "learning_rate": 1.4613393745131321e-06, + "loss": 1.6805, + "step": 30143 + }, + { + "epoch": 9.252302025782688, + "grad_norm": 0.16229867935180664, + "learning_rate": 1.4601466865177493e-06, + "loss": 1.7644, + "step": 30144 + }, + { + "epoch": 9.252608962553714, + "grad_norm": 0.17748580873012543, + "learning_rate": 1.4589544782216524e-06, + "loss": 1.7499, + "step": 30145 + }, + { + "epoch": 9.252915899324739, + "grad_norm": 0.1022428423166275, + "learning_rate": 1.4577627496366153e-06, + "loss": 1.6395, + "step": 30146 + }, + { + "epoch": 9.253222836095764, + "grad_norm": 0.11667326092720032, + "learning_rate": 1.4565715007744229e-06, + "loss": 1.6806, + "step": 30147 + }, + { + "epoch": 9.25352977286679, + "grad_norm": 0.1549718827009201, + "learning_rate": 1.4553807316468381e-06, + "loss": 1.7174, + "step": 30148 + }, + { + "epoch": 9.253836709637815, + "grad_norm": 0.13834457099437714, + "learning_rate": 1.4541904422656406e-06, + "loss": 1.6922, + "step": 30149 + }, + { + "epoch": 9.25414364640884, + "grad_norm": 0.15639884769916534, + "learning_rate": 1.4530006326425815e-06, + "loss": 1.7715, + "step": 30150 + }, + { + "epoch": 9.254450583179866, + "grad_norm": 0.15321171283721924, + "learning_rate": 1.4518113027894243e-06, + "loss": 1.7332, + "step": 30151 + }, + { + "epoch": 9.254757519950891, + "grad_norm": 0.12793070077896118, + "learning_rate": 1.4506224527179257e-06, + "loss": 1.6968, + "step": 30152 + }, + { + "epoch": 9.255064456721914, + "grad_norm": 0.11214233934879303, + "learning_rate": 
1.4494340824398322e-06, + "loss": 1.6751, + "step": 30153 + }, + { + "epoch": 9.25537139349294, + "grad_norm": 0.16913802921772003, + "learning_rate": 1.4482461919668844e-06, + "loss": 1.7314, + "step": 30154 + }, + { + "epoch": 9.255678330263965, + "grad_norm": 0.14455991983413696, + "learning_rate": 1.4470587813108282e-06, + "loss": 1.6877, + "step": 30155 + }, + { + "epoch": 9.25598526703499, + "grad_norm": 0.15350060164928436, + "learning_rate": 1.4458718504833934e-06, + "loss": 1.7414, + "step": 30156 + }, + { + "epoch": 9.256292203806016, + "grad_norm": 0.1487266719341278, + "learning_rate": 1.4446853994963094e-06, + "loss": 1.7093, + "step": 30157 + }, + { + "epoch": 9.256599140577041, + "grad_norm": 0.13964293897151947, + "learning_rate": 1.4434994283613058e-06, + "loss": 1.6863, + "step": 30158 + }, + { + "epoch": 9.256906077348066, + "grad_norm": 0.13903170824050903, + "learning_rate": 1.442313937090095e-06, + "loss": 1.7268, + "step": 30159 + }, + { + "epoch": 9.257213014119092, + "grad_norm": 0.12514057755470276, + "learning_rate": 1.4411289256944072e-06, + "loss": 1.6836, + "step": 30160 + }, + { + "epoch": 9.257519950890117, + "grad_norm": 0.16892002522945404, + "learning_rate": 1.439944394185938e-06, + "loss": 1.6942, + "step": 30161 + }, + { + "epoch": 9.257826887661142, + "grad_norm": 0.22416932880878448, + "learning_rate": 1.4387603425764007e-06, + "loss": 1.8449, + "step": 30162 + }, + { + "epoch": 9.258133824432168, + "grad_norm": 0.13895165920257568, + "learning_rate": 1.4375767708775022e-06, + "loss": 1.7309, + "step": 30163 + }, + { + "epoch": 9.258440761203191, + "grad_norm": 0.13725127279758453, + "learning_rate": 1.436393679100928e-06, + "loss": 1.7508, + "step": 30164 + }, + { + "epoch": 9.258747697974217, + "grad_norm": 0.1684611737728119, + "learning_rate": 1.4352110672583796e-06, + "loss": 1.7656, + "step": 30165 + }, + { + "epoch": 9.259054634745242, + "grad_norm": 0.166968435049057, + "learning_rate": 1.4340289353615365e-06, + 
"loss": 1.7277, + "step": 30166 + }, + { + "epoch": 9.259361571516267, + "grad_norm": 0.2129509150981903, + "learning_rate": 1.4328472834220896e-06, + "loss": 1.8107, + "step": 30167 + }, + { + "epoch": 9.259668508287293, + "grad_norm": 0.15415063500404358, + "learning_rate": 1.4316661114517072e-06, + "loss": 1.7248, + "step": 30168 + }, + { + "epoch": 9.259975445058318, + "grad_norm": 0.10856158286333084, + "learning_rate": 1.4304854194620688e-06, + "loss": 1.6306, + "step": 30169 + }, + { + "epoch": 9.260282381829343, + "grad_norm": 0.16899555921554565, + "learning_rate": 1.4293052074648427e-06, + "loss": 1.7068, + "step": 30170 + }, + { + "epoch": 9.260589318600369, + "grad_norm": 0.1331903636455536, + "learning_rate": 1.4281254754716867e-06, + "loss": 1.682, + "step": 30171 + }, + { + "epoch": 9.260896255371394, + "grad_norm": 0.10237281024456024, + "learning_rate": 1.4269462234942631e-06, + "loss": 1.6859, + "step": 30172 + }, + { + "epoch": 9.26120319214242, + "grad_norm": 0.13941270112991333, + "learning_rate": 1.4257674515442298e-06, + "loss": 1.6922, + "step": 30173 + }, + { + "epoch": 9.261510128913443, + "grad_norm": 0.16863791644573212, + "learning_rate": 1.4245891596332328e-06, + "loss": 1.7276, + "step": 30174 + }, + { + "epoch": 9.261817065684468, + "grad_norm": 0.1314782202243805, + "learning_rate": 1.4234113477729184e-06, + "loss": 1.6829, + "step": 30175 + }, + { + "epoch": 9.262124002455494, + "grad_norm": 0.19281591475009918, + "learning_rate": 1.4222340159749158e-06, + "loss": 1.7281, + "step": 30176 + }, + { + "epoch": 9.262430939226519, + "grad_norm": 0.14531417191028595, + "learning_rate": 1.421057164250883e-06, + "loss": 1.7226, + "step": 30177 + }, + { + "epoch": 9.262737875997544, + "grad_norm": 0.15508733689785004, + "learning_rate": 1.4198807926124213e-06, + "loss": 1.7588, + "step": 30178 + }, + { + "epoch": 9.26304481276857, + "grad_norm": 0.09654982388019562, + "learning_rate": 1.418704901071183e-06, + "loss": 1.6742, + "step": 30179 
+ }, + { + "epoch": 9.263351749539595, + "grad_norm": 0.18973948061466217, + "learning_rate": 1.4175294896387693e-06, + "loss": 1.71, + "step": 30180 + }, + { + "epoch": 9.26365868631062, + "grad_norm": 0.15489214658737183, + "learning_rate": 1.41635455832681e-06, + "loss": 1.7074, + "step": 30181 + }, + { + "epoch": 9.263965623081646, + "grad_norm": 0.15990005433559418, + "learning_rate": 1.4151801071469072e-06, + "loss": 1.6822, + "step": 30182 + }, + { + "epoch": 9.264272559852671, + "grad_norm": 0.17423443496227264, + "learning_rate": 1.4140061361106737e-06, + "loss": 1.7677, + "step": 30183 + }, + { + "epoch": 9.264579496623696, + "grad_norm": 0.15427646040916443, + "learning_rate": 1.4128326452297058e-06, + "loss": 1.7021, + "step": 30184 + }, + { + "epoch": 9.26488643339472, + "grad_norm": 0.13731053471565247, + "learning_rate": 1.4116596345156053e-06, + "loss": 1.7235, + "step": 30185 + }, + { + "epoch": 9.265193370165745, + "grad_norm": 0.13132283091545105, + "learning_rate": 1.4104871039799627e-06, + "loss": 1.7159, + "step": 30186 + }, + { + "epoch": 9.26550030693677, + "grad_norm": 0.12384344637393951, + "learning_rate": 1.409315053634369e-06, + "loss": 1.6785, + "step": 30187 + }, + { + "epoch": 9.265807243707796, + "grad_norm": 0.16857418417930603, + "learning_rate": 1.4081434834903984e-06, + "loss": 1.7453, + "step": 30188 + }, + { + "epoch": 9.266114180478821, + "grad_norm": 0.13803976774215698, + "learning_rate": 1.4069723935596412e-06, + "loss": 1.6826, + "step": 30189 + }, + { + "epoch": 9.266421117249847, + "grad_norm": 0.16141049563884735, + "learning_rate": 1.4058017838536552e-06, + "loss": 1.7113, + "step": 30190 + }, + { + "epoch": 9.266728054020872, + "grad_norm": 0.13290546834468842, + "learning_rate": 1.4046316543840254e-06, + "loss": 1.7262, + "step": 30191 + }, + { + "epoch": 9.267034990791897, + "grad_norm": 0.163112610578537, + "learning_rate": 1.4034620051623037e-06, + "loss": 1.7002, + "step": 30192 + }, + { + "epoch": 
9.267341927562923, + "grad_norm": 0.11482264846563339, + "learning_rate": 1.402292836200053e-06, + "loss": 1.6714, + "step": 30193 + }, + { + "epoch": 9.267648864333948, + "grad_norm": 0.15263767540454865, + "learning_rate": 1.4011241475088367e-06, + "loss": 1.7323, + "step": 30194 + }, + { + "epoch": 9.267955801104973, + "grad_norm": 0.16607387363910675, + "learning_rate": 1.3999559391001838e-06, + "loss": 1.7116, + "step": 30195 + }, + { + "epoch": 9.268262737875997, + "grad_norm": 0.15621553361415863, + "learning_rate": 1.398788210985663e-06, + "loss": 1.7069, + "step": 30196 + }, + { + "epoch": 9.268569674647022, + "grad_norm": 0.13450275361537933, + "learning_rate": 1.3976209631767934e-06, + "loss": 1.6924, + "step": 30197 + }, + { + "epoch": 9.268876611418047, + "grad_norm": 0.18217138946056366, + "learning_rate": 1.3964541956851263e-06, + "loss": 1.7349, + "step": 30198 + }, + { + "epoch": 9.269183548189073, + "grad_norm": 0.18020178377628326, + "learning_rate": 1.3952879085221858e-06, + "loss": 1.7358, + "step": 30199 + }, + { + "epoch": 9.269490484960098, + "grad_norm": 0.1362251341342926, + "learning_rate": 1.3941221016994965e-06, + "loss": 1.724, + "step": 30200 + }, + { + "epoch": 9.269797421731123, + "grad_norm": 0.15907861292362213, + "learning_rate": 1.392956775228582e-06, + "loss": 1.712, + "step": 30201 + }, + { + "epoch": 9.270104358502149, + "grad_norm": 0.12772800028324127, + "learning_rate": 1.3917919291209614e-06, + "loss": 1.6744, + "step": 30202 + }, + { + "epoch": 9.270411295273174, + "grad_norm": 0.12429596483707428, + "learning_rate": 1.3906275633881416e-06, + "loss": 1.721, + "step": 30203 + }, + { + "epoch": 9.2707182320442, + "grad_norm": 0.20072144269943237, + "learning_rate": 1.3894636780416303e-06, + "loss": 1.7024, + "step": 30204 + }, + { + "epoch": 9.271025168815225, + "grad_norm": 0.13898633420467377, + "learning_rate": 1.3883002730929296e-06, + "loss": 1.6943, + "step": 30205 + }, + { + "epoch": 9.27133210558625, + "grad_norm": 
0.11137440800666809, + "learning_rate": 1.387137348553541e-06, + "loss": 1.6666, + "step": 30206 + }, + { + "epoch": 9.271639042357274, + "grad_norm": 0.13952526450157166, + "learning_rate": 1.3859749044349501e-06, + "loss": 1.6988, + "step": 30207 + }, + { + "epoch": 9.271945979128299, + "grad_norm": 0.1566372960805893, + "learning_rate": 1.3848129407486477e-06, + "loss": 1.6942, + "step": 30208 + }, + { + "epoch": 9.272252915899324, + "grad_norm": 0.1273697465658188, + "learning_rate": 1.3836514575061244e-06, + "loss": 1.6926, + "step": 30209 + }, + { + "epoch": 9.27255985267035, + "grad_norm": 0.15591974556446075, + "learning_rate": 1.3824904547188434e-06, + "loss": 1.734, + "step": 30210 + }, + { + "epoch": 9.272866789441375, + "grad_norm": 0.14875155687332153, + "learning_rate": 1.3813299323982954e-06, + "loss": 1.7229, + "step": 30211 + }, + { + "epoch": 9.2731737262124, + "grad_norm": 0.15695714950561523, + "learning_rate": 1.3801698905559325e-06, + "loss": 1.727, + "step": 30212 + }, + { + "epoch": 9.273480662983426, + "grad_norm": 0.16134092211723328, + "learning_rate": 1.3790103292032398e-06, + "loss": 1.7321, + "step": 30213 + }, + { + "epoch": 9.273787599754451, + "grad_norm": 0.16619402170181274, + "learning_rate": 1.3778512483516527e-06, + "loss": 1.6804, + "step": 30214 + }, + { + "epoch": 9.274094536525476, + "grad_norm": 0.12403136491775513, + "learning_rate": 1.3766926480126452e-06, + "loss": 1.7067, + "step": 30215 + }, + { + "epoch": 9.274401473296502, + "grad_norm": 0.13903765380382538, + "learning_rate": 1.3755345281976584e-06, + "loss": 1.7138, + "step": 30216 + }, + { + "epoch": 9.274708410067525, + "grad_norm": 0.10627007484436035, + "learning_rate": 1.3743768889181385e-06, + "loss": 1.6693, + "step": 30217 + }, + { + "epoch": 9.27501534683855, + "grad_norm": 0.12304051220417023, + "learning_rate": 1.3732197301855265e-06, + "loss": 1.6838, + "step": 30218 + }, + { + "epoch": 9.275322283609576, + "grad_norm": 0.12596885859966278, + 
"learning_rate": 1.3720630520112632e-06, + "loss": 1.6924, + "step": 30219 + }, + { + "epoch": 9.275629220380601, + "grad_norm": 0.16624486446380615, + "learning_rate": 1.3709068544067672e-06, + "loss": 1.7316, + "step": 30220 + }, + { + "epoch": 9.275936157151627, + "grad_norm": 0.11655814945697784, + "learning_rate": 1.3697511373834737e-06, + "loss": 1.6877, + "step": 30221 + }, + { + "epoch": 9.276243093922652, + "grad_norm": 0.1264163851737976, + "learning_rate": 1.368595900952807e-06, + "loss": 1.6995, + "step": 30222 + }, + { + "epoch": 9.276550030693677, + "grad_norm": 0.10144982486963272, + "learning_rate": 1.3674411451261748e-06, + "loss": 1.6426, + "step": 30223 + }, + { + "epoch": 9.276856967464703, + "grad_norm": 0.13389989733695984, + "learning_rate": 1.3662868699149955e-06, + "loss": 1.7072, + "step": 30224 + }, + { + "epoch": 9.277163904235728, + "grad_norm": 0.18326976895332336, + "learning_rate": 1.3651330753306769e-06, + "loss": 1.7426, + "step": 30225 + }, + { + "epoch": 9.277470841006753, + "grad_norm": 0.1679212898015976, + "learning_rate": 1.363979761384615e-06, + "loss": 1.7158, + "step": 30226 + }, + { + "epoch": 9.277777777777779, + "grad_norm": 0.26792997121810913, + "learning_rate": 1.3628269280882066e-06, + "loss": 1.7862, + "step": 30227 + }, + { + "epoch": 9.278084714548802, + "grad_norm": 0.1797039955854416, + "learning_rate": 1.361674575452865e-06, + "loss": 1.8311, + "step": 30228 + }, + { + "epoch": 9.278391651319827, + "grad_norm": 0.14270684123039246, + "learning_rate": 1.360522703489947e-06, + "loss": 1.6823, + "step": 30229 + }, + { + "epoch": 9.278698588090853, + "grad_norm": 0.12262453138828278, + "learning_rate": 1.3593713122108665e-06, + "loss": 1.6576, + "step": 30230 + }, + { + "epoch": 9.279005524861878, + "grad_norm": 0.20434293150901794, + "learning_rate": 1.358220401626975e-06, + "loss": 1.7508, + "step": 30231 + }, + { + "epoch": 9.279312461632903, + "grad_norm": 0.12360373884439468, + "learning_rate": 
1.3570699717496637e-06, + "loss": 1.6636, + "step": 30232 + }, + { + "epoch": 9.279619398403929, + "grad_norm": 0.1771468222141266, + "learning_rate": 1.3559200225903013e-06, + "loss": 1.6926, + "step": 30233 + }, + { + "epoch": 9.279926335174954, + "grad_norm": 0.13039356470108032, + "learning_rate": 1.3547705541602451e-06, + "loss": 1.6671, + "step": 30234 + }, + { + "epoch": 9.28023327194598, + "grad_norm": 0.12824147939682007, + "learning_rate": 1.3536215664708586e-06, + "loss": 1.6835, + "step": 30235 + }, + { + "epoch": 9.280540208717005, + "grad_norm": 0.15304934978485107, + "learning_rate": 1.3524730595334933e-06, + "loss": 1.7216, + "step": 30236 + }, + { + "epoch": 9.28084714548803, + "grad_norm": 0.13606427609920502, + "learning_rate": 1.3513250333595074e-06, + "loss": 1.7062, + "step": 30237 + }, + { + "epoch": 9.281154082259055, + "grad_norm": 0.1449199616909027, + "learning_rate": 1.3501774879602414e-06, + "loss": 1.6988, + "step": 30238 + }, + { + "epoch": 9.281461019030079, + "grad_norm": 0.11309704929590225, + "learning_rate": 1.3490304233470307e-06, + "loss": 1.6721, + "step": 30239 + }, + { + "epoch": 9.281767955801104, + "grad_norm": 0.17013555765151978, + "learning_rate": 1.3478838395312222e-06, + "loss": 1.7045, + "step": 30240 + }, + { + "epoch": 9.28207489257213, + "grad_norm": 0.11972448974847794, + "learning_rate": 1.3467377365241396e-06, + "loss": 1.7015, + "step": 30241 + }, + { + "epoch": 9.282381829343155, + "grad_norm": 0.17848798632621765, + "learning_rate": 1.345592114337113e-06, + "loss": 1.7063, + "step": 30242 + }, + { + "epoch": 9.28268876611418, + "grad_norm": 0.1346857249736786, + "learning_rate": 1.3444469729814612e-06, + "loss": 1.7126, + "step": 30243 + }, + { + "epoch": 9.282995702885206, + "grad_norm": 0.17026859521865845, + "learning_rate": 1.3433023124684974e-06, + "loss": 1.7094, + "step": 30244 + }, + { + "epoch": 9.283302639656231, + "grad_norm": 0.12969297170639038, + "learning_rate": 1.3421581328095456e-06, + 
"loss": 1.717, + "step": 30245 + }, + { + "epoch": 9.283609576427256, + "grad_norm": 0.19405554234981537, + "learning_rate": 1.3410144340159026e-06, + "loss": 1.7221, + "step": 30246 + }, + { + "epoch": 9.283916513198282, + "grad_norm": 0.16258898377418518, + "learning_rate": 1.3398712160988814e-06, + "loss": 1.7174, + "step": 30247 + }, + { + "epoch": 9.284223449969307, + "grad_norm": 0.18568632006645203, + "learning_rate": 1.338728479069762e-06, + "loss": 1.7093, + "step": 30248 + }, + { + "epoch": 9.284530386740332, + "grad_norm": 0.11301061511039734, + "learning_rate": 1.3375862229398518e-06, + "loss": 1.7053, + "step": 30249 + }, + { + "epoch": 9.284837323511356, + "grad_norm": 0.15475797653198242, + "learning_rate": 1.3364444477204418e-06, + "loss": 1.7773, + "step": 30250 + }, + { + "epoch": 9.285144260282381, + "grad_norm": 0.153490349650383, + "learning_rate": 1.3353031534228067e-06, + "loss": 1.69, + "step": 30251 + }, + { + "epoch": 9.285451197053407, + "grad_norm": 0.14238356053829193, + "learning_rate": 1.3341623400582314e-06, + "loss": 1.6917, + "step": 30252 + }, + { + "epoch": 9.285758133824432, + "grad_norm": 0.24802085757255554, + "learning_rate": 1.3330220076379906e-06, + "loss": 1.7581, + "step": 30253 + }, + { + "epoch": 9.286065070595457, + "grad_norm": 0.1755116581916809, + "learning_rate": 1.3318821561733474e-06, + "loss": 1.7433, + "step": 30254 + }, + { + "epoch": 9.286372007366483, + "grad_norm": 0.142706498503685, + "learning_rate": 1.3307427856755705e-06, + "loss": 1.8094, + "step": 30255 + }, + { + "epoch": 9.286678944137508, + "grad_norm": 0.10654154419898987, + "learning_rate": 1.3296038961559177e-06, + "loss": 1.6768, + "step": 30256 + }, + { + "epoch": 9.286985880908533, + "grad_norm": 0.1446719765663147, + "learning_rate": 1.3284654876256464e-06, + "loss": 1.763, + "step": 30257 + }, + { + "epoch": 9.287292817679559, + "grad_norm": 0.128647580742836, + "learning_rate": 1.3273275600960089e-06, + "loss": 1.7217, + "step": 30258 + }, 
+ { + "epoch": 9.287599754450584, + "grad_norm": 0.16537147760391235, + "learning_rate": 1.3261901135782462e-06, + "loss": 1.7158, + "step": 30259 + }, + { + "epoch": 9.287906691221608, + "grad_norm": 0.12634962797164917, + "learning_rate": 1.3250531480836048e-06, + "loss": 1.7062, + "step": 30260 + }, + { + "epoch": 9.288213627992633, + "grad_norm": 0.14017465710639954, + "learning_rate": 1.323916663623309e-06, + "loss": 1.7008, + "step": 30261 + }, + { + "epoch": 9.288520564763658, + "grad_norm": 0.14252761006355286, + "learning_rate": 1.3227806602086113e-06, + "loss": 1.7241, + "step": 30262 + }, + { + "epoch": 9.288827501534684, + "grad_norm": 0.16626526415348053, + "learning_rate": 1.3216451378507132e-06, + "loss": 1.7422, + "step": 30263 + }, + { + "epoch": 9.289134438305709, + "grad_norm": 0.17778219282627106, + "learning_rate": 1.3205100965608564e-06, + "loss": 1.7595, + "step": 30264 + }, + { + "epoch": 9.289441375076734, + "grad_norm": 0.1335630863904953, + "learning_rate": 1.319375536350248e-06, + "loss": 1.7238, + "step": 30265 + }, + { + "epoch": 9.28974831184776, + "grad_norm": 0.18150761723518372, + "learning_rate": 1.3182414572301017e-06, + "loss": 1.7575, + "step": 30266 + }, + { + "epoch": 9.290055248618785, + "grad_norm": 0.10502864420413971, + "learning_rate": 1.3171078592116304e-06, + "loss": 1.6641, + "step": 30267 + }, + { + "epoch": 9.29036218538981, + "grad_norm": 0.18388547003269196, + "learning_rate": 1.315974742306031e-06, + "loss": 1.7128, + "step": 30268 + }, + { + "epoch": 9.290669122160836, + "grad_norm": 0.16178761422634125, + "learning_rate": 1.3148421065245054e-06, + "loss": 1.8073, + "step": 30269 + }, + { + "epoch": 9.29097605893186, + "grad_norm": 0.28871726989746094, + "learning_rate": 1.3137099518782449e-06, + "loss": 1.7344, + "step": 30270 + }, + { + "epoch": 9.291282995702884, + "grad_norm": 0.12639513611793518, + "learning_rate": 1.3125782783784403e-06, + "loss": 1.7105, + "step": 30271 + }, + { + "epoch": 
9.29158993247391, + "grad_norm": 0.12210296839475632, + "learning_rate": 1.3114470860362716e-06, + "loss": 1.6964, + "step": 30272 + }, + { + "epoch": 9.291896869244935, + "grad_norm": 0.1808413416147232, + "learning_rate": 1.3103163748629187e-06, + "loss": 1.6897, + "step": 30273 + }, + { + "epoch": 9.29220380601596, + "grad_norm": 0.12490539252758026, + "learning_rate": 1.309186144869562e-06, + "loss": 1.6775, + "step": 30274 + }, + { + "epoch": 9.292510742786986, + "grad_norm": 0.14661727845668793, + "learning_rate": 1.3080563960673641e-06, + "loss": 1.705, + "step": 30275 + }, + { + "epoch": 9.292817679558011, + "grad_norm": 0.14526040852069855, + "learning_rate": 1.3069271284674888e-06, + "loss": 1.7507, + "step": 30276 + }, + { + "epoch": 9.293124616329036, + "grad_norm": 0.1486021727323532, + "learning_rate": 1.3057983420811049e-06, + "loss": 1.7162, + "step": 30277 + }, + { + "epoch": 9.293431553100062, + "grad_norm": 0.11850638687610626, + "learning_rate": 1.3046700369193532e-06, + "loss": 1.6996, + "step": 30278 + }, + { + "epoch": 9.293738489871087, + "grad_norm": 0.12612518668174744, + "learning_rate": 1.3035422129934027e-06, + "loss": 1.6846, + "step": 30279 + }, + { + "epoch": 9.294045426642112, + "grad_norm": 0.2112930864095688, + "learning_rate": 1.3024148703143834e-06, + "loss": 1.7302, + "step": 30280 + }, + { + "epoch": 9.294352363413138, + "grad_norm": 0.142434224486351, + "learning_rate": 1.3012880088934532e-06, + "loss": 1.7411, + "step": 30281 + }, + { + "epoch": 9.294659300184161, + "grad_norm": 0.20386098325252533, + "learning_rate": 1.3001616287417251e-06, + "loss": 1.748, + "step": 30282 + }, + { + "epoch": 9.294966236955187, + "grad_norm": 0.22800381481647491, + "learning_rate": 1.2990357298703514e-06, + "loss": 1.7431, + "step": 30283 + }, + { + "epoch": 9.295273173726212, + "grad_norm": 0.1692253053188324, + "learning_rate": 1.2979103122904512e-06, + "loss": 1.6908, + "step": 30284 + }, + { + "epoch": 9.295580110497237, + "grad_norm": 
0.17138120532035828, + "learning_rate": 1.2967853760131431e-06, + "loss": 1.7099, + "step": 30285 + }, + { + "epoch": 9.295887047268263, + "grad_norm": 0.16712112724781036, + "learning_rate": 1.2956609210495518e-06, + "loss": 1.7331, + "step": 30286 + }, + { + "epoch": 9.296193984039288, + "grad_norm": 0.14170047640800476, + "learning_rate": 1.2945369474107849e-06, + "loss": 1.7089, + "step": 30287 + }, + { + "epoch": 9.296500920810313, + "grad_norm": 0.1860484778881073, + "learning_rate": 1.2934134551079503e-06, + "loss": 1.7737, + "step": 30288 + }, + { + "epoch": 9.296807857581339, + "grad_norm": 0.16710804402828217, + "learning_rate": 1.29229044415215e-06, + "loss": 1.7499, + "step": 30289 + }, + { + "epoch": 9.297114794352364, + "grad_norm": 0.11533838510513306, + "learning_rate": 1.2911679145544863e-06, + "loss": 1.6506, + "step": 30290 + }, + { + "epoch": 9.29742173112339, + "grad_norm": 0.1814284324645996, + "learning_rate": 1.2900458663260506e-06, + "loss": 1.7121, + "step": 30291 + }, + { + "epoch": 9.297728667894415, + "grad_norm": 0.11727334558963776, + "learning_rate": 1.2889242994779282e-06, + "loss": 1.6581, + "step": 30292 + }, + { + "epoch": 9.298035604665438, + "grad_norm": 0.2274969071149826, + "learning_rate": 1.2878032140212103e-06, + "loss": 1.7406, + "step": 30293 + }, + { + "epoch": 9.298342541436464, + "grad_norm": 0.12290076911449432, + "learning_rate": 1.2866826099669716e-06, + "loss": 1.6568, + "step": 30294 + }, + { + "epoch": 9.298649478207489, + "grad_norm": 0.2026246190071106, + "learning_rate": 1.2855624873262807e-06, + "loss": 1.7296, + "step": 30295 + }, + { + "epoch": 9.298956414978514, + "grad_norm": 0.13751426339149475, + "learning_rate": 1.284442846110223e-06, + "loss": 1.6897, + "step": 30296 + }, + { + "epoch": 9.29926335174954, + "grad_norm": 0.13357232511043549, + "learning_rate": 1.2833236863298459e-06, + "loss": 1.6609, + "step": 30297 + }, + { + "epoch": 9.299570288520565, + "grad_norm": 0.1956695318222046, + 
"learning_rate": 1.282205007996229e-06, + "loss": 1.7066, + "step": 30298 + }, + { + "epoch": 9.29987722529159, + "grad_norm": 0.11530495434999466, + "learning_rate": 1.2810868111204022e-06, + "loss": 1.6769, + "step": 30299 + }, + { + "epoch": 9.300184162062616, + "grad_norm": 0.1230783686041832, + "learning_rate": 1.2799690957134402e-06, + "loss": 1.665, + "step": 30300 + }, + { + "epoch": 9.300491098833641, + "grad_norm": 0.14144892990589142, + "learning_rate": 1.2788518617863787e-06, + "loss": 1.7247, + "step": 30301 + }, + { + "epoch": 9.300798035604666, + "grad_norm": 0.13692058622837067, + "learning_rate": 1.2777351093502588e-06, + "loss": 1.7165, + "step": 30302 + }, + { + "epoch": 9.30110497237569, + "grad_norm": 0.14115191996097565, + "learning_rate": 1.2766188384161159e-06, + "loss": 1.6991, + "step": 30303 + }, + { + "epoch": 9.301411909146715, + "grad_norm": 0.1208532303571701, + "learning_rate": 1.2755030489949803e-06, + "loss": 1.6621, + "step": 30304 + }, + { + "epoch": 9.30171884591774, + "grad_norm": 0.15770223736763, + "learning_rate": 1.2743877410978877e-06, + "loss": 1.7525, + "step": 30305 + }, + { + "epoch": 9.302025782688766, + "grad_norm": 0.1563788652420044, + "learning_rate": 1.2732729147358514e-06, + "loss": 1.7488, + "step": 30306 + }, + { + "epoch": 9.302332719459791, + "grad_norm": 0.13665367662906647, + "learning_rate": 1.2721585699198956e-06, + "loss": 1.7311, + "step": 30307 + }, + { + "epoch": 9.302639656230816, + "grad_norm": 0.23698623478412628, + "learning_rate": 1.2710447066610287e-06, + "loss": 1.7541, + "step": 30308 + }, + { + "epoch": 9.302946593001842, + "grad_norm": 0.17781539261341095, + "learning_rate": 1.2699313249702528e-06, + "loss": 1.7821, + "step": 30309 + }, + { + "epoch": 9.303253529772867, + "grad_norm": 0.14912116527557373, + "learning_rate": 1.2688184248585811e-06, + "loss": 1.7339, + "step": 30310 + }, + { + "epoch": 9.303560466543892, + "grad_norm": 0.18003590404987335, + "learning_rate": 
1.2677060063370106e-06, + "loss": 1.7297, + "step": 30311 + }, + { + "epoch": 9.303867403314918, + "grad_norm": 0.11753804981708527, + "learning_rate": 1.2665940694165268e-06, + "loss": 1.6855, + "step": 30312 + }, + { + "epoch": 9.304174340085943, + "grad_norm": 0.1824817657470703, + "learning_rate": 1.2654826141081323e-06, + "loss": 1.7109, + "step": 30313 + }, + { + "epoch": 9.304481276856967, + "grad_norm": 0.13189560174942017, + "learning_rate": 1.26437164042279e-06, + "loss": 1.6894, + "step": 30314 + }, + { + "epoch": 9.304788213627992, + "grad_norm": 0.13488316535949707, + "learning_rate": 1.2632611483715029e-06, + "loss": 1.7074, + "step": 30315 + }, + { + "epoch": 9.305095150399017, + "grad_norm": 0.1344909518957138, + "learning_rate": 1.2621511379652284e-06, + "loss": 1.7152, + "step": 30316 + }, + { + "epoch": 9.305402087170043, + "grad_norm": 0.1880505383014679, + "learning_rate": 1.2610416092149468e-06, + "loss": 1.7673, + "step": 30317 + }, + { + "epoch": 9.305709023941068, + "grad_norm": 0.14804401993751526, + "learning_rate": 1.259932562131616e-06, + "loss": 1.686, + "step": 30318 + }, + { + "epoch": 9.306015960712093, + "grad_norm": 0.17230413854122162, + "learning_rate": 1.2588239967261994e-06, + "loss": 1.7075, + "step": 30319 + }, + { + "epoch": 9.306322897483119, + "grad_norm": 0.14153720438480377, + "learning_rate": 1.257715913009655e-06, + "loss": 1.6613, + "step": 30320 + }, + { + "epoch": 9.306629834254144, + "grad_norm": 0.20363643765449524, + "learning_rate": 1.2566083109929293e-06, + "loss": 1.7175, + "step": 30321 + }, + { + "epoch": 9.30693677102517, + "grad_norm": 0.1647050678730011, + "learning_rate": 1.2555011906869695e-06, + "loss": 1.719, + "step": 30322 + }, + { + "epoch": 9.307243707796195, + "grad_norm": 0.12517094612121582, + "learning_rate": 1.2543945521027167e-06, + "loss": 1.6589, + "step": 30323 + }, + { + "epoch": 9.307550644567218, + "grad_norm": 0.12023728340864182, + "learning_rate": 1.2532883952511066e-06, + "loss": 
1.6757, + "step": 30324 + }, + { + "epoch": 9.307857581338244, + "grad_norm": 0.1275765597820282, + "learning_rate": 1.2521827201430692e-06, + "loss": 1.6908, + "step": 30325 + }, + { + "epoch": 9.308164518109269, + "grad_norm": 0.11408694088459015, + "learning_rate": 1.2510775267895403e-06, + "loss": 1.6488, + "step": 30326 + }, + { + "epoch": 9.308471454880294, + "grad_norm": 0.13963791728019714, + "learning_rate": 1.2499728152014334e-06, + "loss": 1.7251, + "step": 30327 + }, + { + "epoch": 9.30877839165132, + "grad_norm": 0.1533326357603073, + "learning_rate": 1.2488685853896676e-06, + "loss": 1.7334, + "step": 30328 + }, + { + "epoch": 9.309085328422345, + "grad_norm": 0.1821897327899933, + "learning_rate": 1.2477648373651563e-06, + "loss": 1.7077, + "step": 30329 + }, + { + "epoch": 9.30939226519337, + "grad_norm": 0.1111680120229721, + "learning_rate": 1.246661571138813e-06, + "loss": 1.6781, + "step": 30330 + }, + { + "epoch": 9.309699201964396, + "grad_norm": 0.13651998341083527, + "learning_rate": 1.2455587867215234e-06, + "loss": 1.718, + "step": 30331 + }, + { + "epoch": 9.310006138735421, + "grad_norm": 0.14687657356262207, + "learning_rate": 1.2444564841242123e-06, + "loss": 1.7526, + "step": 30332 + }, + { + "epoch": 9.310313075506446, + "grad_norm": 0.09129049628973007, + "learning_rate": 1.243354663357743e-06, + "loss": 1.6513, + "step": 30333 + }, + { + "epoch": 9.310620012277472, + "grad_norm": 0.11914718151092529, + "learning_rate": 1.2422533244330348e-06, + "loss": 1.6698, + "step": 30334 + }, + { + "epoch": 9.310926949048495, + "grad_norm": 0.1276206523180008, + "learning_rate": 1.2411524673609454e-06, + "loss": 1.6659, + "step": 30335 + }, + { + "epoch": 9.31123388581952, + "grad_norm": 0.12232425808906555, + "learning_rate": 1.2400520921523718e-06, + "loss": 1.6637, + "step": 30336 + }, + { + "epoch": 9.311540822590546, + "grad_norm": 0.1205383911728859, + "learning_rate": 1.238952198818183e-06, + "loss": 1.6681, + "step": 30337 + }, + { + 
"epoch": 9.311847759361571, + "grad_norm": 0.15745756030082703, + "learning_rate": 1.2378527873692481e-06, + "loss": 1.6975, + "step": 30338 + }, + { + "epoch": 9.312154696132596, + "grad_norm": 0.11485351622104645, + "learning_rate": 1.2367538578164307e-06, + "loss": 1.6621, + "step": 30339 + }, + { + "epoch": 9.312461632903622, + "grad_norm": 0.1697990894317627, + "learning_rate": 1.2356554101705942e-06, + "loss": 1.7309, + "step": 30340 + }, + { + "epoch": 9.312768569674647, + "grad_norm": 0.1388407200574875, + "learning_rate": 1.2345574444425912e-06, + "loss": 1.7437, + "step": 30341 + }, + { + "epoch": 9.313075506445673, + "grad_norm": 0.16522379219532013, + "learning_rate": 1.233459960643274e-06, + "loss": 1.7683, + "step": 30342 + }, + { + "epoch": 9.313382443216698, + "grad_norm": 0.13259927928447723, + "learning_rate": 1.2323629587834895e-06, + "loss": 1.7042, + "step": 30343 + }, + { + "epoch": 9.313689379987723, + "grad_norm": 0.1397528201341629, + "learning_rate": 1.2312664388740791e-06, + "loss": 1.7123, + "step": 30344 + }, + { + "epoch": 9.313996316758749, + "grad_norm": 0.1758471429347992, + "learning_rate": 1.2301704009258785e-06, + "loss": 1.6722, + "step": 30345 + }, + { + "epoch": 9.314303253529772, + "grad_norm": 0.18485552072525024, + "learning_rate": 1.229074844949718e-06, + "loss": 1.6827, + "step": 30346 + }, + { + "epoch": 9.314610190300797, + "grad_norm": 0.14430436491966248, + "learning_rate": 1.2279797709564222e-06, + "loss": 1.767, + "step": 30347 + }, + { + "epoch": 9.314917127071823, + "grad_norm": 0.16392521560192108, + "learning_rate": 1.226885178956816e-06, + "loss": 1.7177, + "step": 30348 + }, + { + "epoch": 9.315224063842848, + "grad_norm": 0.16354848444461823, + "learning_rate": 1.2257910689617235e-06, + "loss": 1.7661, + "step": 30349 + }, + { + "epoch": 9.315531000613873, + "grad_norm": 0.1507464200258255, + "learning_rate": 1.2246974409819424e-06, + "loss": 1.7143, + "step": 30350 + }, + { + "epoch": 9.315837937384899, + 
"grad_norm": 0.136259064078331, + "learning_rate": 1.2236042950282967e-06, + "loss": 1.6617, + "step": 30351 + }, + { + "epoch": 9.316144874155924, + "grad_norm": 0.1246718019247055, + "learning_rate": 1.222511631111578e-06, + "loss": 1.6955, + "step": 30352 + }, + { + "epoch": 9.31645181092695, + "grad_norm": 0.14265364408493042, + "learning_rate": 1.221419449242589e-06, + "loss": 1.6978, + "step": 30353 + }, + { + "epoch": 9.316758747697975, + "grad_norm": 0.1196669489145279, + "learning_rate": 1.2203277494321263e-06, + "loss": 1.6989, + "step": 30354 + }, + { + "epoch": 9.317065684469, + "grad_norm": 0.11023372411727905, + "learning_rate": 1.2192365316909705e-06, + "loss": 1.6807, + "step": 30355 + }, + { + "epoch": 9.317372621240025, + "grad_norm": 0.12376198917627335, + "learning_rate": 1.2181457960299237e-06, + "loss": 1.679, + "step": 30356 + }, + { + "epoch": 9.317679558011049, + "grad_norm": 0.1426834762096405, + "learning_rate": 1.217055542459733e-06, + "loss": 1.7351, + "step": 30357 + }, + { + "epoch": 9.317986494782074, + "grad_norm": 0.14629580080509186, + "learning_rate": 1.215965770991201e-06, + "loss": 1.7454, + "step": 30358 + }, + { + "epoch": 9.3182934315531, + "grad_norm": 0.13081271946430206, + "learning_rate": 1.214876481635091e-06, + "loss": 1.713, + "step": 30359 + }, + { + "epoch": 9.318600368324125, + "grad_norm": 0.2170068770647049, + "learning_rate": 1.2137876744021614e-06, + "loss": 1.7831, + "step": 30360 + }, + { + "epoch": 9.31890730509515, + "grad_norm": 0.13917239010334015, + "learning_rate": 1.2126993493031814e-06, + "loss": 1.702, + "step": 30361 + }, + { + "epoch": 9.319214241866176, + "grad_norm": 0.14852571487426758, + "learning_rate": 1.2116115063488975e-06, + "loss": 1.699, + "step": 30362 + }, + { + "epoch": 9.319521178637201, + "grad_norm": 0.1458083689212799, + "learning_rate": 1.2105241455500682e-06, + "loss": 1.6817, + "step": 30363 + }, + { + "epoch": 9.319828115408226, + "grad_norm": 0.1341833621263504, + 
"learning_rate": 1.209437266917429e-06, + "loss": 1.7099, + "step": 30364 + }, + { + "epoch": 9.320135052179252, + "grad_norm": 0.1942918747663498, + "learning_rate": 1.2083508704617274e-06, + "loss": 1.7301, + "step": 30365 + }, + { + "epoch": 9.320441988950277, + "grad_norm": 0.11925941705703735, + "learning_rate": 1.2072649561937099e-06, + "loss": 1.6722, + "step": 30366 + }, + { + "epoch": 9.3207489257213, + "grad_norm": 0.11189054697751999, + "learning_rate": 1.2061795241240904e-06, + "loss": 1.6457, + "step": 30367 + }, + { + "epoch": 9.321055862492326, + "grad_norm": 0.1742805689573288, + "learning_rate": 1.20509457426361e-06, + "loss": 1.7477, + "step": 30368 + }, + { + "epoch": 9.321362799263351, + "grad_norm": 0.2269359976053238, + "learning_rate": 1.204010106622977e-06, + "loss": 1.6954, + "step": 30369 + }, + { + "epoch": 9.321669736034377, + "grad_norm": 0.1383572667837143, + "learning_rate": 1.2029261212129218e-06, + "loss": 1.6834, + "step": 30370 + }, + { + "epoch": 9.321976672805402, + "grad_norm": 0.17733120918273926, + "learning_rate": 1.2018426180441466e-06, + "loss": 1.7426, + "step": 30371 + }, + { + "epoch": 9.322283609576427, + "grad_norm": 0.1365019828081131, + "learning_rate": 1.200759597127371e-06, + "loss": 1.7037, + "step": 30372 + }, + { + "epoch": 9.322590546347453, + "grad_norm": 0.1320653259754181, + "learning_rate": 1.1996770584732919e-06, + "loss": 1.7051, + "step": 30373 + }, + { + "epoch": 9.322897483118478, + "grad_norm": 0.16690899431705475, + "learning_rate": 1.1985950020926007e-06, + "loss": 1.7237, + "step": 30374 + }, + { + "epoch": 9.323204419889503, + "grad_norm": 0.10169432312250137, + "learning_rate": 1.1975134279959944e-06, + "loss": 1.6557, + "step": 30375 + }, + { + "epoch": 9.323511356660529, + "grad_norm": 0.10515127331018448, + "learning_rate": 1.1964323361941699e-06, + "loss": 1.6733, + "step": 30376 + }, + { + "epoch": 9.323818293431554, + "grad_norm": 0.13177691400051117, + "learning_rate": 
1.1953517266978076e-06, + "loss": 1.7029, + "step": 30377 + }, + { + "epoch": 9.324125230202577, + "grad_norm": 0.12130782753229141, + "learning_rate": 1.1942715995175824e-06, + "loss": 1.6479, + "step": 30378 + }, + { + "epoch": 9.324432166973603, + "grad_norm": 0.1792365312576294, + "learning_rate": 1.193191954664169e-06, + "loss": 1.7026, + "step": 30379 + }, + { + "epoch": 9.324739103744628, + "grad_norm": 0.1391845941543579, + "learning_rate": 1.1921127921482422e-06, + "loss": 1.7122, + "step": 30380 + }, + { + "epoch": 9.325046040515653, + "grad_norm": 0.1593550443649292, + "learning_rate": 1.1910341119804657e-06, + "loss": 1.7014, + "step": 30381 + }, + { + "epoch": 9.325352977286679, + "grad_norm": 0.12819503247737885, + "learning_rate": 1.1899559141714922e-06, + "loss": 1.6717, + "step": 30382 + }, + { + "epoch": 9.325659914057704, + "grad_norm": 0.1585071086883545, + "learning_rate": 1.1888781987319907e-06, + "loss": 1.7021, + "step": 30383 + }, + { + "epoch": 9.32596685082873, + "grad_norm": 0.11215679347515106, + "learning_rate": 1.187800965672592e-06, + "loss": 1.6531, + "step": 30384 + }, + { + "epoch": 9.326273787599755, + "grad_norm": 0.10981804877519608, + "learning_rate": 1.1867242150039648e-06, + "loss": 1.6521, + "step": 30385 + }, + { + "epoch": 9.32658072437078, + "grad_norm": 0.1629389524459839, + "learning_rate": 1.1856479467367342e-06, + "loss": 1.7423, + "step": 30386 + }, + { + "epoch": 9.326887661141805, + "grad_norm": 0.1501983404159546, + "learning_rate": 1.1845721608815418e-06, + "loss": 1.7384, + "step": 30387 + }, + { + "epoch": 9.32719459791283, + "grad_norm": 0.13212816417217255, + "learning_rate": 1.1834968574490235e-06, + "loss": 1.6723, + "step": 30388 + }, + { + "epoch": 9.327501534683854, + "grad_norm": 0.140591561794281, + "learning_rate": 1.1824220364497984e-06, + "loss": 1.6677, + "step": 30389 + }, + { + "epoch": 9.32780847145488, + "grad_norm": 0.1365015208721161, + "learning_rate": 1.181347697894497e-06, + "loss": 
1.6791, + "step": 30390 + }, + { + "epoch": 9.328115408225905, + "grad_norm": 0.16453112661838531, + "learning_rate": 1.1802738417937165e-06, + "loss": 1.7321, + "step": 30391 + }, + { + "epoch": 9.32842234499693, + "grad_norm": 0.18619593977928162, + "learning_rate": 1.1792004681580981e-06, + "loss": 1.7275, + "step": 30392 + }, + { + "epoch": 9.328729281767956, + "grad_norm": 0.2532525956630707, + "learning_rate": 1.178127576998228e-06, + "loss": 1.7376, + "step": 30393 + }, + { + "epoch": 9.329036218538981, + "grad_norm": 0.17427068948745728, + "learning_rate": 1.17705516832472e-06, + "loss": 1.7054, + "step": 30394 + }, + { + "epoch": 9.329343155310006, + "grad_norm": 0.13894926011562347, + "learning_rate": 1.1759832421481654e-06, + "loss": 1.6931, + "step": 30395 + }, + { + "epoch": 9.329650092081032, + "grad_norm": 0.12709759175777435, + "learning_rate": 1.174911798479167e-06, + "loss": 1.6846, + "step": 30396 + }, + { + "epoch": 9.329957028852057, + "grad_norm": 0.10510111600160599, + "learning_rate": 1.173840837328305e-06, + "loss": 1.666, + "step": 30397 + }, + { + "epoch": 9.330263965623082, + "grad_norm": 0.15923313796520233, + "learning_rate": 1.1727703587061655e-06, + "loss": 1.7103, + "step": 30398 + }, + { + "epoch": 9.330570902394108, + "grad_norm": 0.16868524253368378, + "learning_rate": 1.171700362623318e-06, + "loss": 1.7608, + "step": 30399 + }, + { + "epoch": 9.330877839165131, + "grad_norm": 0.2206472009420395, + "learning_rate": 1.170630849090365e-06, + "loss": 1.7612, + "step": 30400 + }, + { + "epoch": 9.331184775936157, + "grad_norm": 0.1557077318429947, + "learning_rate": 1.1695618181178426e-06, + "loss": 1.7387, + "step": 30401 + }, + { + "epoch": 9.331491712707182, + "grad_norm": 0.1106661707162857, + "learning_rate": 1.168493269716342e-06, + "loss": 1.7141, + "step": 30402 + }, + { + "epoch": 9.331798649478207, + "grad_norm": 0.13843196630477905, + "learning_rate": 1.1674252038963996e-06, + "loss": 1.7252, + "step": 30403 + }, + { + 
"epoch": 9.332105586249233, + "grad_norm": 0.1141132041811943, + "learning_rate": 1.1663576206685955e-06, + "loss": 1.6685, + "step": 30404 + }, + { + "epoch": 9.332412523020258, + "grad_norm": 0.15236155688762665, + "learning_rate": 1.1652905200434604e-06, + "loss": 1.7137, + "step": 30405 + }, + { + "epoch": 9.332719459791283, + "grad_norm": 0.15942497551441193, + "learning_rate": 1.164223902031547e-06, + "loss": 1.7117, + "step": 30406 + }, + { + "epoch": 9.333026396562309, + "grad_norm": 0.11390705406665802, + "learning_rate": 1.163157766643408e-06, + "loss": 1.6491, + "step": 30407 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 0.21758639812469482, + "learning_rate": 1.1620921138895514e-06, + "loss": 1.7287, + "step": 30408 + }, + { + "epoch": 9.33364027010436, + "grad_norm": 0.13287439942359924, + "learning_rate": 1.1610269437805353e-06, + "loss": 1.6963, + "step": 30409 + }, + { + "epoch": 9.333947206875383, + "grad_norm": 0.15917138755321503, + "learning_rate": 1.1599622563268742e-06, + "loss": 1.7565, + "step": 30410 + }, + { + "epoch": 9.334254143646408, + "grad_norm": 0.13716933131217957, + "learning_rate": 1.1588980515390923e-06, + "loss": 1.6761, + "step": 30411 + }, + { + "epoch": 9.334561080417433, + "grad_norm": 0.19529521465301514, + "learning_rate": 1.1578343294277039e-06, + "loss": 1.7228, + "step": 30412 + }, + { + "epoch": 9.334868017188459, + "grad_norm": 0.2123236358165741, + "learning_rate": 1.156771090003228e-06, + "loss": 1.6959, + "step": 30413 + }, + { + "epoch": 9.335174953959484, + "grad_norm": 0.11489806324243546, + "learning_rate": 1.1557083332761675e-06, + "loss": 1.7124, + "step": 30414 + }, + { + "epoch": 9.33548189073051, + "grad_norm": 0.15767377614974976, + "learning_rate": 1.1546460592570252e-06, + "loss": 1.7181, + "step": 30415 + }, + { + "epoch": 9.335788827501535, + "grad_norm": 0.12808682024478912, + "learning_rate": 1.1535842679562924e-06, + "loss": 1.6773, + "step": 30416 + }, + { + "epoch": 9.33609576427256, + 
"grad_norm": 0.13981541991233826, + "learning_rate": 1.1525229593844832e-06, + "loss": 1.7389, + "step": 30417 + }, + { + "epoch": 9.336402701043585, + "grad_norm": 0.17036983370780945, + "learning_rate": 1.1514621335520614e-06, + "loss": 1.7441, + "step": 30418 + }, + { + "epoch": 9.33670963781461, + "grad_norm": 0.16650976240634918, + "learning_rate": 1.1504017904695296e-06, + "loss": 1.7574, + "step": 30419 + }, + { + "epoch": 9.337016574585636, + "grad_norm": 0.12821979820728302, + "learning_rate": 1.1493419301473518e-06, + "loss": 1.6709, + "step": 30420 + }, + { + "epoch": 9.33732351135666, + "grad_norm": 0.15850982069969177, + "learning_rate": 1.148282552596014e-06, + "loss": 1.6789, + "step": 30421 + }, + { + "epoch": 9.337630448127685, + "grad_norm": 0.17631210386753082, + "learning_rate": 1.1472236578259799e-06, + "loss": 1.692, + "step": 30422 + }, + { + "epoch": 9.33793738489871, + "grad_norm": 0.11740653961896896, + "learning_rate": 1.1461652458477135e-06, + "loss": 1.6718, + "step": 30423 + }, + { + "epoch": 9.338244321669736, + "grad_norm": 0.1481964886188507, + "learning_rate": 1.1451073166716841e-06, + "loss": 1.6934, + "step": 30424 + }, + { + "epoch": 9.338551258440761, + "grad_norm": 0.12850868701934814, + "learning_rate": 1.144049870308328e-06, + "loss": 1.7084, + "step": 30425 + }, + { + "epoch": 9.338858195211786, + "grad_norm": 0.12728431820869446, + "learning_rate": 1.142992906768109e-06, + "loss": 1.6646, + "step": 30426 + }, + { + "epoch": 9.339165131982812, + "grad_norm": 0.1583695262670517, + "learning_rate": 1.141936426061474e-06, + "loss": 1.6967, + "step": 30427 + }, + { + "epoch": 9.339472068753837, + "grad_norm": 0.1379023641347885, + "learning_rate": 1.1408804281988595e-06, + "loss": 1.7037, + "step": 30428 + }, + { + "epoch": 9.339779005524862, + "grad_norm": 0.12713885307312012, + "learning_rate": 1.1398249131907013e-06, + "loss": 1.675, + "step": 30429 + }, + { + "epoch": 9.340085942295888, + "grad_norm": 0.14857660233974457, + 
"learning_rate": 1.1387698810474302e-06, + "loss": 1.7329, + "step": 30430 + }, + { + "epoch": 9.340392879066913, + "grad_norm": 0.15104521811008453, + "learning_rate": 1.1377153317794765e-06, + "loss": 1.7059, + "step": 30431 + }, + { + "epoch": 9.340699815837937, + "grad_norm": 0.12101757526397705, + "learning_rate": 1.136661265397254e-06, + "loss": 1.6618, + "step": 30432 + }, + { + "epoch": 9.341006752608962, + "grad_norm": 0.13291479647159576, + "learning_rate": 1.1356076819111828e-06, + "loss": 1.6648, + "step": 30433 + }, + { + "epoch": 9.341313689379987, + "grad_norm": 0.13364644348621368, + "learning_rate": 1.134554581331687e-06, + "loss": 1.6737, + "step": 30434 + }, + { + "epoch": 9.341620626151013, + "grad_norm": 0.1292208731174469, + "learning_rate": 1.1335019636691535e-06, + "loss": 1.684, + "step": 30435 + }, + { + "epoch": 9.341927562922038, + "grad_norm": 0.11852065473794937, + "learning_rate": 1.1324498289340013e-06, + "loss": 1.7319, + "step": 30436 + }, + { + "epoch": 9.342234499693063, + "grad_norm": 0.1357669234275818, + "learning_rate": 1.1313981771366166e-06, + "loss": 1.6737, + "step": 30437 + }, + { + "epoch": 9.342541436464089, + "grad_norm": 0.10864339023828506, + "learning_rate": 1.1303470082874024e-06, + "loss": 1.6515, + "step": 30438 + }, + { + "epoch": 9.342848373235114, + "grad_norm": 0.1678614318370819, + "learning_rate": 1.129296322396739e-06, + "loss": 1.6871, + "step": 30439 + }, + { + "epoch": 9.34315531000614, + "grad_norm": 0.13384899497032166, + "learning_rate": 1.1282461194750182e-06, + "loss": 1.6888, + "step": 30440 + }, + { + "epoch": 9.343462246777165, + "grad_norm": 0.12848152220249176, + "learning_rate": 1.1271963995326151e-06, + "loss": 1.6952, + "step": 30441 + }, + { + "epoch": 9.34376918354819, + "grad_norm": 0.12591496109962463, + "learning_rate": 1.1261471625798937e-06, + "loss": 1.7404, + "step": 30442 + }, + { + "epoch": 9.344076120319214, + "grad_norm": 0.12495042383670807, + "learning_rate": 
1.1250984086272397e-06, + "loss": 1.7052, + "step": 30443 + }, + { + "epoch": 9.344383057090239, + "grad_norm": 0.1944572478532791, + "learning_rate": 1.1240501376850066e-06, + "loss": 1.7768, + "step": 30444 + }, + { + "epoch": 9.344689993861264, + "grad_norm": 0.15033382177352905, + "learning_rate": 1.1230023497635579e-06, + "loss": 1.7285, + "step": 30445 + }, + { + "epoch": 9.34499693063229, + "grad_norm": 0.15685971081256866, + "learning_rate": 1.1219550448732463e-06, + "loss": 1.7317, + "step": 30446 + }, + { + "epoch": 9.345303867403315, + "grad_norm": 0.13611333072185516, + "learning_rate": 1.120908223024425e-06, + "loss": 1.6924, + "step": 30447 + }, + { + "epoch": 9.34561080417434, + "grad_norm": 0.16727523505687714, + "learning_rate": 1.1198618842274411e-06, + "loss": 1.7314, + "step": 30448 + }, + { + "epoch": 9.345917740945366, + "grad_norm": 0.11468715965747833, + "learning_rate": 1.1188160284926252e-06, + "loss": 1.6648, + "step": 30449 + }, + { + "epoch": 9.34622467771639, + "grad_norm": 0.1359895020723343, + "learning_rate": 1.1177706558303192e-06, + "loss": 1.7235, + "step": 30450 + }, + { + "epoch": 9.346531614487416, + "grad_norm": 0.12796089053153992, + "learning_rate": 1.116725766250859e-06, + "loss": 1.7034, + "step": 30451 + }, + { + "epoch": 9.346838551258442, + "grad_norm": 0.14425326883792877, + "learning_rate": 1.1156813597645588e-06, + "loss": 1.6618, + "step": 30452 + }, + { + "epoch": 9.347145488029465, + "grad_norm": 0.12873579561710358, + "learning_rate": 1.1146374363817602e-06, + "loss": 1.7002, + "step": 30453 + }, + { + "epoch": 9.34745242480049, + "grad_norm": 0.1240401417016983, + "learning_rate": 1.113593996112755e-06, + "loss": 1.7085, + "step": 30454 + }, + { + "epoch": 9.347759361571516, + "grad_norm": 0.16717098653316498, + "learning_rate": 1.1125510389678738e-06, + "loss": 1.7644, + "step": 30455 + }, + { + "epoch": 9.348066298342541, + "grad_norm": 0.14225825667381287, + "learning_rate": 1.1115085649574143e-06, + "loss": 
1.754, + "step": 30456 + }, + { + "epoch": 9.348373235113566, + "grad_norm": 0.16719453036785126, + "learning_rate": 1.1104665740916787e-06, + "loss": 1.7757, + "step": 30457 + }, + { + "epoch": 9.348680171884592, + "grad_norm": 0.14928758144378662, + "learning_rate": 1.1094250663809812e-06, + "loss": 1.6919, + "step": 30458 + }, + { + "epoch": 9.348987108655617, + "grad_norm": 0.16433440148830414, + "learning_rate": 1.1083840418355862e-06, + "loss": 1.7464, + "step": 30459 + }, + { + "epoch": 9.349294045426642, + "grad_norm": 0.16641557216644287, + "learning_rate": 1.1073435004657961e-06, + "loss": 1.6872, + "step": 30460 + }, + { + "epoch": 9.349600982197668, + "grad_norm": 0.1351664960384369, + "learning_rate": 1.106303442281903e-06, + "loss": 1.6792, + "step": 30461 + }, + { + "epoch": 9.349907918968693, + "grad_norm": 0.13160523772239685, + "learning_rate": 1.1052638672941707e-06, + "loss": 1.7087, + "step": 30462 + }, + { + "epoch": 9.350214855739718, + "grad_norm": 0.13107560575008392, + "learning_rate": 1.1042247755128854e-06, + "loss": 1.655, + "step": 30463 + }, + { + "epoch": 9.350521792510742, + "grad_norm": 0.1115984246134758, + "learning_rate": 1.1031861669483058e-06, + "loss": 1.6939, + "step": 30464 + }, + { + "epoch": 9.350828729281767, + "grad_norm": 0.2041286677122116, + "learning_rate": 1.1021480416106956e-06, + "loss": 1.7502, + "step": 30465 + }, + { + "epoch": 9.351135666052793, + "grad_norm": 0.1607433408498764, + "learning_rate": 1.1011103995103245e-06, + "loss": 1.7618, + "step": 30466 + }, + { + "epoch": 9.351442602823818, + "grad_norm": 0.15420445799827576, + "learning_rate": 1.1000732406574343e-06, + "loss": 1.7348, + "step": 30467 + }, + { + "epoch": 9.351749539594843, + "grad_norm": 0.1475592702627182, + "learning_rate": 1.099036565062289e-06, + "loss": 1.6618, + "step": 30468 + }, + { + "epoch": 9.352056476365869, + "grad_norm": 0.12382391095161438, + "learning_rate": 1.0980003727351196e-06, + "loss": 1.668, + "step": 30469 + }, + { 
+ "epoch": 9.352363413136894, + "grad_norm": 0.14605712890625, + "learning_rate": 1.096964663686184e-06, + "loss": 1.7274, + "step": 30470 + }, + { + "epoch": 9.35267034990792, + "grad_norm": 0.1413935273885727, + "learning_rate": 1.0959294379256913e-06, + "loss": 1.7173, + "step": 30471 + }, + { + "epoch": 9.352977286678945, + "grad_norm": 0.1893736571073532, + "learning_rate": 1.0948946954638994e-06, + "loss": 1.7243, + "step": 30472 + }, + { + "epoch": 9.35328422344997, + "grad_norm": 0.13228827714920044, + "learning_rate": 1.0938604363110172e-06, + "loss": 1.6907, + "step": 30473 + }, + { + "epoch": 9.353591160220994, + "grad_norm": 0.13724558055400848, + "learning_rate": 1.0928266604772697e-06, + "loss": 1.6925, + "step": 30474 + }, + { + "epoch": 9.353898096992019, + "grad_norm": 0.1286490261554718, + "learning_rate": 1.091793367972882e-06, + "loss": 1.6977, + "step": 30475 + }, + { + "epoch": 9.354205033763044, + "grad_norm": 0.17098230123519897, + "learning_rate": 1.0907605588080517e-06, + "loss": 1.7392, + "step": 30476 + }, + { + "epoch": 9.35451197053407, + "grad_norm": 0.14103081822395325, + "learning_rate": 1.0897282329929924e-06, + "loss": 1.6872, + "step": 30477 + }, + { + "epoch": 9.354818907305095, + "grad_norm": 0.14384165406227112, + "learning_rate": 1.0886963905379077e-06, + "loss": 1.6996, + "step": 30478 + }, + { + "epoch": 9.35512584407612, + "grad_norm": 0.12110382318496704, + "learning_rate": 1.087665031452989e-06, + "loss": 1.6688, + "step": 30479 + }, + { + "epoch": 9.355432780847146, + "grad_norm": 0.1337585598230362, + "learning_rate": 1.0866341557484394e-06, + "loss": 1.6703, + "step": 30480 + }, + { + "epoch": 9.355739717618171, + "grad_norm": 0.16640827059745789, + "learning_rate": 1.0856037634344341e-06, + "loss": 1.7675, + "step": 30481 + }, + { + "epoch": 9.356046654389196, + "grad_norm": 0.1333245038986206, + "learning_rate": 1.0845738545211702e-06, + "loss": 1.7147, + "step": 30482 + }, + { + "epoch": 9.356353591160222, + 
"grad_norm": 0.13712866604328156, + "learning_rate": 1.0835444290188124e-06, + "loss": 1.7219, + "step": 30483 + }, + { + "epoch": 9.356660527931247, + "grad_norm": 0.14520063996315002, + "learning_rate": 1.0825154869375353e-06, + "loss": 1.6548, + "step": 30484 + }, + { + "epoch": 9.35696746470227, + "grad_norm": 0.10503572225570679, + "learning_rate": 1.08148702828752e-06, + "loss": 1.6853, + "step": 30485 + }, + { + "epoch": 9.357274401473296, + "grad_norm": 0.12749113142490387, + "learning_rate": 1.080459053078914e-06, + "loss": 1.6837, + "step": 30486 + }, + { + "epoch": 9.357581338244321, + "grad_norm": 0.13570766150951385, + "learning_rate": 1.079431561321892e-06, + "loss": 1.7209, + "step": 30487 + }, + { + "epoch": 9.357888275015346, + "grad_norm": 0.10935094952583313, + "learning_rate": 1.0784045530265907e-06, + "loss": 1.6559, + "step": 30488 + }, + { + "epoch": 9.358195211786372, + "grad_norm": 0.2123469114303589, + "learning_rate": 1.0773780282031799e-06, + "loss": 1.7223, + "step": 30489 + }, + { + "epoch": 9.358502148557397, + "grad_norm": 0.12153031677007675, + "learning_rate": 1.07635198686179e-06, + "loss": 1.6842, + "step": 30490 + }, + { + "epoch": 9.358809085328422, + "grad_norm": 0.1416035294532776, + "learning_rate": 1.0753264290125576e-06, + "loss": 1.7064, + "step": 30491 + }, + { + "epoch": 9.359116022099448, + "grad_norm": 0.12089719623327255, + "learning_rate": 1.0743013546656356e-06, + "loss": 1.6848, + "step": 30492 + }, + { + "epoch": 9.359422958870473, + "grad_norm": 0.13979336619377136, + "learning_rate": 1.073276763831138e-06, + "loss": 1.6967, + "step": 30493 + }, + { + "epoch": 9.359729895641498, + "grad_norm": 0.14014959335327148, + "learning_rate": 1.072252656519196e-06, + "loss": 1.686, + "step": 30494 + }, + { + "epoch": 9.360036832412524, + "grad_norm": 0.17366288602352142, + "learning_rate": 1.0712290327399344e-06, + "loss": 1.6709, + "step": 30495 + }, + { + "epoch": 9.360343769183547, + "grad_norm": 0.1098582074046135, + 
"learning_rate": 1.070205892503462e-06, + "loss": 1.6552, + "step": 30496 + }, + { + "epoch": 9.360650705954573, + "grad_norm": 0.17184807360172272, + "learning_rate": 1.0691832358198984e-06, + "loss": 1.7052, + "step": 30497 + }, + { + "epoch": 9.360957642725598, + "grad_norm": 0.1819550096988678, + "learning_rate": 1.068161062699341e-06, + "loss": 1.718, + "step": 30498 + }, + { + "epoch": 9.361264579496623, + "grad_norm": 0.11239949613809586, + "learning_rate": 1.0671393731518985e-06, + "loss": 1.6928, + "step": 30499 + }, + { + "epoch": 9.361571516267649, + "grad_norm": 0.13595740497112274, + "learning_rate": 1.066118167187663e-06, + "loss": 1.6987, + "step": 30500 + }, + { + "epoch": 9.361878453038674, + "grad_norm": 0.1424037665128708, + "learning_rate": 1.0650974448167316e-06, + "loss": 1.7194, + "step": 30501 + }, + { + "epoch": 9.3621853898097, + "grad_norm": 0.17475293576717377, + "learning_rate": 1.0640772060491855e-06, + "loss": 1.7191, + "step": 30502 + }, + { + "epoch": 9.362492326580725, + "grad_norm": 0.22121796011924744, + "learning_rate": 1.0630574508951108e-06, + "loss": 1.7752, + "step": 30503 + }, + { + "epoch": 9.36279926335175, + "grad_norm": 0.19642120599746704, + "learning_rate": 1.0620381793645885e-06, + "loss": 1.6985, + "step": 30504 + }, + { + "epoch": 9.363106200122775, + "grad_norm": 0.16090667247772217, + "learning_rate": 1.0610193914676825e-06, + "loss": 1.7547, + "step": 30505 + }, + { + "epoch": 9.3634131368938, + "grad_norm": 0.15036262571811676, + "learning_rate": 1.0600010872144794e-06, + "loss": 1.7545, + "step": 30506 + }, + { + "epoch": 9.363720073664824, + "grad_norm": 0.13965867459774017, + "learning_rate": 1.0589832666150213e-06, + "loss": 1.7093, + "step": 30507 + }, + { + "epoch": 9.36402701043585, + "grad_norm": 0.14103607833385468, + "learning_rate": 1.057965929679372e-06, + "loss": 1.7519, + "step": 30508 + }, + { + "epoch": 9.364333947206875, + "grad_norm": 0.11406313627958298, + "learning_rate": 
1.056949076417596e-06, + "loss": 1.6431, + "step": 30509 + }, + { + "epoch": 9.3646408839779, + "grad_norm": 0.14929352700710297, + "learning_rate": 1.0559327068397296e-06, + "loss": 1.6863, + "step": 30510 + }, + { + "epoch": 9.364947820748926, + "grad_norm": 0.12195751070976257, + "learning_rate": 1.0549168209558312e-06, + "loss": 1.6888, + "step": 30511 + }, + { + "epoch": 9.365254757519951, + "grad_norm": 0.14742396771907806, + "learning_rate": 1.0539014187759267e-06, + "loss": 1.6808, + "step": 30512 + }, + { + "epoch": 9.365561694290976, + "grad_norm": 0.20945298671722412, + "learning_rate": 1.0528865003100573e-06, + "loss": 1.7754, + "step": 30513 + }, + { + "epoch": 9.365868631062002, + "grad_norm": 0.13752134144306183, + "learning_rate": 1.0518720655682545e-06, + "loss": 1.723, + "step": 30514 + }, + { + "epoch": 9.366175567833027, + "grad_norm": 0.20715954899787903, + "learning_rate": 1.0508581145605379e-06, + "loss": 1.7787, + "step": 30515 + }, + { + "epoch": 9.366482504604052, + "grad_norm": 0.11915310472249985, + "learning_rate": 1.0498446472969326e-06, + "loss": 1.668, + "step": 30516 + }, + { + "epoch": 9.366789441375076, + "grad_norm": 0.15565282106399536, + "learning_rate": 1.0488316637874529e-06, + "loss": 1.7415, + "step": 30517 + }, + { + "epoch": 9.367096378146101, + "grad_norm": 0.17260490357875824, + "learning_rate": 1.0478191640421132e-06, + "loss": 1.7511, + "step": 30518 + }, + { + "epoch": 9.367403314917127, + "grad_norm": 0.15730834007263184, + "learning_rate": 1.0468071480709163e-06, + "loss": 1.7225, + "step": 30519 + }, + { + "epoch": 9.367710251688152, + "grad_norm": 0.11092279106378555, + "learning_rate": 1.0457956158838544e-06, + "loss": 1.6866, + "step": 30520 + }, + { + "epoch": 9.368017188459177, + "grad_norm": 0.1350366175174713, + "learning_rate": 1.0447845674909417e-06, + "loss": 1.7203, + "step": 30521 + }, + { + "epoch": 9.368324125230203, + "grad_norm": 0.13730715215206146, + "learning_rate": 1.0437740029021591e-06, + 
"loss": 1.7076, + "step": 30522 + }, + { + "epoch": 9.368631062001228, + "grad_norm": 0.13333722949028015, + "learning_rate": 1.0427639221274988e-06, + "loss": 1.7061, + "step": 30523 + }, + { + "epoch": 9.368937998772253, + "grad_norm": 0.18173889815807343, + "learning_rate": 1.0417543251769413e-06, + "loss": 1.7102, + "step": 30524 + }, + { + "epoch": 9.369244935543279, + "grad_norm": 0.09129618853330612, + "learning_rate": 1.040745212060451e-06, + "loss": 1.62, + "step": 30525 + }, + { + "epoch": 9.369551872314304, + "grad_norm": 0.1274579018354416, + "learning_rate": 1.0397365827880256e-06, + "loss": 1.7183, + "step": 30526 + }, + { + "epoch": 9.36985880908533, + "grad_norm": 0.14064618945121765, + "learning_rate": 1.0387284373696126e-06, + "loss": 1.679, + "step": 30527 + }, + { + "epoch": 9.370165745856353, + "grad_norm": 0.1963305026292801, + "learning_rate": 1.037720775815193e-06, + "loss": 1.6764, + "step": 30528 + }, + { + "epoch": 9.370472682627378, + "grad_norm": 0.14961928129196167, + "learning_rate": 1.0367135981346977e-06, + "loss": 1.7171, + "step": 30529 + }, + { + "epoch": 9.370779619398403, + "grad_norm": 0.16405031085014343, + "learning_rate": 1.0357069043381073e-06, + "loss": 1.7747, + "step": 30530 + }, + { + "epoch": 9.371086556169429, + "grad_norm": 0.1538914144039154, + "learning_rate": 1.0347006944353588e-06, + "loss": 1.6935, + "step": 30531 + }, + { + "epoch": 9.371393492940454, + "grad_norm": 0.13590097427368164, + "learning_rate": 1.0336949684363995e-06, + "loss": 1.734, + "step": 30532 + }, + { + "epoch": 9.37170042971148, + "grad_norm": 0.09966246783733368, + "learning_rate": 1.0326897263511602e-06, + "loss": 1.6684, + "step": 30533 + }, + { + "epoch": 9.372007366482505, + "grad_norm": 0.15132254362106323, + "learning_rate": 1.031684968189589e-06, + "loss": 1.6975, + "step": 30534 + }, + { + "epoch": 9.37231430325353, + "grad_norm": 0.18380047380924225, + "learning_rate": 1.0306806939616055e-06, + "loss": 1.7704, + "step": 30535 + }, 
+ { + "epoch": 9.372621240024555, + "grad_norm": 0.2081698179244995, + "learning_rate": 1.0296769036771347e-06, + "loss": 1.7853, + "step": 30536 + }, + { + "epoch": 9.37292817679558, + "grad_norm": 0.1174221932888031, + "learning_rate": 1.028673597346097e-06, + "loss": 1.6579, + "step": 30537 + }, + { + "epoch": 9.373235113566606, + "grad_norm": 0.17011094093322754, + "learning_rate": 1.0276707749784175e-06, + "loss": 1.7402, + "step": 30538 + }, + { + "epoch": 9.37354205033763, + "grad_norm": 0.13609996438026428, + "learning_rate": 1.026668436583994e-06, + "loss": 1.7082, + "step": 30539 + }, + { + "epoch": 9.373848987108655, + "grad_norm": 0.1455853134393692, + "learning_rate": 1.0256665821727406e-06, + "loss": 1.751, + "step": 30540 + }, + { + "epoch": 9.37415592387968, + "grad_norm": 0.1282152682542801, + "learning_rate": 1.0246652117545552e-06, + "loss": 1.6659, + "step": 30541 + }, + { + "epoch": 9.374462860650706, + "grad_norm": 0.17218823730945587, + "learning_rate": 1.023664325339324e-06, + "loss": 1.6571, + "step": 30542 + }, + { + "epoch": 9.374769797421731, + "grad_norm": 0.1530035436153412, + "learning_rate": 1.0226639229369618e-06, + "loss": 1.6975, + "step": 30543 + }, + { + "epoch": 9.375076734192756, + "grad_norm": 0.15473347902297974, + "learning_rate": 1.0216640045573267e-06, + "loss": 1.7283, + "step": 30544 + }, + { + "epoch": 9.375383670963782, + "grad_norm": 0.17946626245975494, + "learning_rate": 1.0206645702103279e-06, + "loss": 1.7391, + "step": 30545 + }, + { + "epoch": 9.375690607734807, + "grad_norm": 0.12358242273330688, + "learning_rate": 1.0196656199058186e-06, + "loss": 1.6633, + "step": 30546 + }, + { + "epoch": 9.375997544505832, + "grad_norm": 0.1423409879207611, + "learning_rate": 1.0186671536536907e-06, + "loss": 1.6969, + "step": 30547 + }, + { + "epoch": 9.376304481276858, + "grad_norm": 0.15845637023448944, + "learning_rate": 1.0176691714637976e-06, + "loss": 1.743, + "step": 30548 + }, + { + "epoch": 9.376611418047883, + 
"grad_norm": 0.13585655391216278, + "learning_rate": 1.0166716733460091e-06, + "loss": 1.6668, + "step": 30549 + }, + { + "epoch": 9.376918354818907, + "grad_norm": 0.13910886645317078, + "learning_rate": 1.015674659310184e-06, + "loss": 1.7184, + "step": 30550 + }, + { + "epoch": 9.377225291589932, + "grad_norm": 0.15852247178554535, + "learning_rate": 1.01467812936617e-06, + "loss": 1.7708, + "step": 30551 + }, + { + "epoch": 9.377532228360957, + "grad_norm": 0.15506471693515778, + "learning_rate": 1.0136820835238148e-06, + "loss": 1.7377, + "step": 30552 + }, + { + "epoch": 9.377839165131983, + "grad_norm": 0.13877533376216888, + "learning_rate": 1.0126865217929715e-06, + "loss": 1.7335, + "step": 30553 + }, + { + "epoch": 9.378146101903008, + "grad_norm": 0.12510280311107635, + "learning_rate": 1.0116914441834657e-06, + "loss": 1.6979, + "step": 30554 + }, + { + "epoch": 9.378453038674033, + "grad_norm": 0.17626170814037323, + "learning_rate": 1.0106968507051451e-06, + "loss": 1.7316, + "step": 30555 + }, + { + "epoch": 9.378759975445059, + "grad_norm": 0.17140509188175201, + "learning_rate": 1.009702741367824e-06, + "loss": 1.7576, + "step": 30556 + }, + { + "epoch": 9.379066912216084, + "grad_norm": 0.17579251527786255, + "learning_rate": 1.008709116181339e-06, + "loss": 1.6942, + "step": 30557 + }, + { + "epoch": 9.37937384898711, + "grad_norm": 0.1375150978565216, + "learning_rate": 1.0077159751555099e-06, + "loss": 1.7043, + "step": 30558 + }, + { + "epoch": 9.379680785758135, + "grad_norm": 0.11679084599018097, + "learning_rate": 1.0067233183001346e-06, + "loss": 1.6868, + "step": 30559 + }, + { + "epoch": 9.379987722529158, + "grad_norm": 0.15186625719070435, + "learning_rate": 1.0057311456250495e-06, + "loss": 1.702, + "step": 30560 + }, + { + "epoch": 9.380294659300183, + "grad_norm": 0.18598486483097076, + "learning_rate": 1.0047394571400304e-06, + "loss": 1.8104, + "step": 30561 + }, + { + "epoch": 9.380601596071209, + "grad_norm": 
0.12907341122627258, + "learning_rate": 1.003748252854908e-06, + "loss": 1.6878, + "step": 30562 + }, + { + "epoch": 9.380908532842234, + "grad_norm": 0.15694235265254974, + "learning_rate": 1.0027575327794525e-06, + "loss": 1.7079, + "step": 30563 + }, + { + "epoch": 9.38121546961326, + "grad_norm": 0.12046566605567932, + "learning_rate": 1.0017672969234671e-06, + "loss": 1.6602, + "step": 30564 + }, + { + "epoch": 9.381522406384285, + "grad_norm": 0.12011182308197021, + "learning_rate": 1.0007775452967383e-06, + "loss": 1.6756, + "step": 30565 + }, + { + "epoch": 9.38182934315531, + "grad_norm": 0.13124582171440125, + "learning_rate": 9.997882779090473e-07, + "loss": 1.6726, + "step": 30566 + }, + { + "epoch": 9.382136279926335, + "grad_norm": 0.1443175971508026, + "learning_rate": 9.98799494770164e-07, + "loss": 1.7491, + "step": 30567 + }, + { + "epoch": 9.38244321669736, + "grad_norm": 0.09302258491516113, + "learning_rate": 9.978111958898639e-07, + "loss": 1.6237, + "step": 30568 + }, + { + "epoch": 9.382750153468386, + "grad_norm": 0.13836117088794708, + "learning_rate": 9.968233812779172e-07, + "loss": 1.735, + "step": 30569 + }, + { + "epoch": 9.383057090239411, + "grad_norm": 0.1278647780418396, + "learning_rate": 9.958360509440879e-07, + "loss": 1.6629, + "step": 30570 + }, + { + "epoch": 9.383364027010435, + "grad_norm": 0.1527305543422699, + "learning_rate": 9.948492048981183e-07, + "loss": 1.7194, + "step": 30571 + }, + { + "epoch": 9.38367096378146, + "grad_norm": 0.1453726887702942, + "learning_rate": 9.938628431497844e-07, + "loss": 1.723, + "step": 30572 + }, + { + "epoch": 9.383977900552486, + "grad_norm": 0.1687985509634018, + "learning_rate": 9.92876965708811e-07, + "loss": 1.7092, + "step": 30573 + }, + { + "epoch": 9.384284837323511, + "grad_norm": 0.1347656548023224, + "learning_rate": 9.918915725849577e-07, + "loss": 1.6471, + "step": 30574 + }, + { + "epoch": 9.384591774094536, + "grad_norm": 0.15019412338733673, + "learning_rate": 
9.90906663787955e-07, + "loss": 1.6936, + "step": 30575 + }, + { + "epoch": 9.384898710865562, + "grad_norm": 0.10301146656274796, + "learning_rate": 9.899222393275342e-07, + "loss": 1.6845, + "step": 30576 + }, + { + "epoch": 9.385205647636587, + "grad_norm": 0.15683500468730927, + "learning_rate": 9.889382992134323e-07, + "loss": 1.6929, + "step": 30577 + }, + { + "epoch": 9.385512584407612, + "grad_norm": 0.13546130061149597, + "learning_rate": 9.879548434553631e-07, + "loss": 1.6783, + "step": 30578 + }, + { + "epoch": 9.385819521178638, + "grad_norm": 0.1424054205417633, + "learning_rate": 9.869718720630583e-07, + "loss": 1.7276, + "step": 30579 + }, + { + "epoch": 9.386126457949663, + "grad_norm": 0.19902123510837555, + "learning_rate": 9.859893850462155e-07, + "loss": 1.7244, + "step": 30580 + }, + { + "epoch": 9.386433394720688, + "grad_norm": 0.13931868970394135, + "learning_rate": 9.8500738241456e-07, + "loss": 1.7272, + "step": 30581 + }, + { + "epoch": 9.386740331491712, + "grad_norm": 0.12193772196769714, + "learning_rate": 9.8402586417779e-07, + "loss": 1.6856, + "step": 30582 + }, + { + "epoch": 9.387047268262737, + "grad_norm": 0.13566039502620697, + "learning_rate": 9.83044830345603e-07, + "loss": 1.7083, + "step": 30583 + }, + { + "epoch": 9.387354205033763, + "grad_norm": 0.15738597512245178, + "learning_rate": 9.82064280927697e-07, + "loss": 1.6692, + "step": 30584 + }, + { + "epoch": 9.387661141804788, + "grad_norm": 0.13515286147594452, + "learning_rate": 9.81084215933764e-07, + "loss": 1.6891, + "step": 30585 + }, + { + "epoch": 9.387968078575813, + "grad_norm": 0.15392665565013885, + "learning_rate": 9.80104635373491e-07, + "loss": 1.7277, + "step": 30586 + }, + { + "epoch": 9.388275015346839, + "grad_norm": 0.11712920665740967, + "learning_rate": 9.79125539256548e-07, + "loss": 1.6878, + "step": 30587 + }, + { + "epoch": 9.388581952117864, + "grad_norm": 0.17001082003116608, + "learning_rate": 9.781469275926214e-07, + "loss": 1.7397, + 
"step": 30588 + }, + { + "epoch": 9.38888888888889, + "grad_norm": 0.154278963804245, + "learning_rate": 9.771688003913816e-07, + "loss": 1.71, + "step": 30589 + }, + { + "epoch": 9.389195825659915, + "grad_norm": 0.12404046952724457, + "learning_rate": 9.761911576624872e-07, + "loss": 1.6946, + "step": 30590 + }, + { + "epoch": 9.38950276243094, + "grad_norm": 0.152077317237854, + "learning_rate": 9.75213999415614e-07, + "loss": 1.6986, + "step": 30591 + }, + { + "epoch": 9.389809699201965, + "grad_norm": 0.11967775225639343, + "learning_rate": 9.742373256604099e-07, + "loss": 1.6376, + "step": 30592 + }, + { + "epoch": 9.390116635972989, + "grad_norm": 0.12324173748493195, + "learning_rate": 9.732611364065169e-07, + "loss": 1.704, + "step": 30593 + }, + { + "epoch": 9.390423572744014, + "grad_norm": 0.19685424864292145, + "learning_rate": 9.722854316636054e-07, + "loss": 1.7799, + "step": 30594 + }, + { + "epoch": 9.39073050951504, + "grad_norm": 0.18277420103549957, + "learning_rate": 9.713102114412953e-07, + "loss": 1.8062, + "step": 30595 + }, + { + "epoch": 9.391037446286065, + "grad_norm": 0.12882667779922485, + "learning_rate": 9.7033547574924e-07, + "loss": 1.7086, + "step": 30596 + }, + { + "epoch": 9.39134438305709, + "grad_norm": 0.11336109042167664, + "learning_rate": 9.693612245970652e-07, + "loss": 1.691, + "step": 30597 + }, + { + "epoch": 9.391651319828116, + "grad_norm": 0.1724751889705658, + "learning_rate": 9.683874579943964e-07, + "loss": 1.7329, + "step": 30598 + }, + { + "epoch": 9.39195825659914, + "grad_norm": 0.12712900340557098, + "learning_rate": 9.674141759508704e-07, + "loss": 1.6889, + "step": 30599 + }, + { + "epoch": 9.392265193370166, + "grad_norm": 0.18404419720172882, + "learning_rate": 9.664413784760907e-07, + "loss": 1.6947, + "step": 30600 + }, + { + "epoch": 9.392572130141192, + "grad_norm": 0.12651921808719635, + "learning_rate": 9.654690655796772e-07, + "loss": 1.7164, + "step": 30601 + }, + { + "epoch": 9.392879066912217, 
+ "grad_norm": 0.1299905627965927, + "learning_rate": 9.64497237271239e-07, + "loss": 1.7192, + "step": 30602 + }, + { + "epoch": 9.39318600368324, + "grad_norm": 0.14098776876926422, + "learning_rate": 9.635258935603796e-07, + "loss": 1.6535, + "step": 30603 + }, + { + "epoch": 9.393492940454266, + "grad_norm": 0.13803884387016296, + "learning_rate": 9.62555034456697e-07, + "loss": 1.6703, + "step": 30604 + }, + { + "epoch": 9.393799877225291, + "grad_norm": 0.1579771488904953, + "learning_rate": 9.61584659969783e-07, + "loss": 1.7882, + "step": 30605 + }, + { + "epoch": 9.394106813996316, + "grad_norm": 0.11700218915939331, + "learning_rate": 9.606147701092416e-07, + "loss": 1.6802, + "step": 30606 + }, + { + "epoch": 9.394413750767342, + "grad_norm": 0.16874761879444122, + "learning_rate": 9.596453648846426e-07, + "loss": 1.6999, + "step": 30607 + }, + { + "epoch": 9.394720687538367, + "grad_norm": 0.14294692873954773, + "learning_rate": 9.586764443055785e-07, + "loss": 1.6757, + "step": 30608 + }, + { + "epoch": 9.395027624309392, + "grad_norm": 0.13398779928684235, + "learning_rate": 9.57708008381608e-07, + "loss": 1.7234, + "step": 30609 + }, + { + "epoch": 9.395334561080418, + "grad_norm": 0.15532025694847107, + "learning_rate": 9.567400571223129e-07, + "loss": 1.7504, + "step": 30610 + }, + { + "epoch": 9.395641497851443, + "grad_norm": 0.12451089173555374, + "learning_rate": 9.557725905372627e-07, + "loss": 1.6537, + "step": 30611 + }, + { + "epoch": 9.395948434622468, + "grad_norm": 0.18524393439292908, + "learning_rate": 9.548056086360114e-07, + "loss": 1.6706, + "step": 30612 + }, + { + "epoch": 9.396255371393494, + "grad_norm": 0.12702727317810059, + "learning_rate": 9.538391114281175e-07, + "loss": 1.6772, + "step": 30613 + }, + { + "epoch": 9.396562308164517, + "grad_norm": 0.1752685308456421, + "learning_rate": 9.528730989231294e-07, + "loss": 1.755, + "step": 30614 + }, + { + "epoch": 9.396869244935543, + "grad_norm": 0.13985255360603333, + 
"learning_rate": 9.519075711306003e-07, + "loss": 1.672, + "step": 30615 + }, + { + "epoch": 9.397176181706568, + "grad_norm": 0.14705638587474823, + "learning_rate": 9.50942528060067e-07, + "loss": 1.6884, + "step": 30616 + }, + { + "epoch": 9.397483118477593, + "grad_norm": 0.14204713702201843, + "learning_rate": 9.499779697210665e-07, + "loss": 1.6903, + "step": 30617 + }, + { + "epoch": 9.397790055248619, + "grad_norm": 0.16127781569957733, + "learning_rate": 9.490138961231355e-07, + "loss": 1.6854, + "step": 30618 + }, + { + "epoch": 9.398096992019644, + "grad_norm": 0.11951326578855515, + "learning_rate": 9.480503072757996e-07, + "loss": 1.6556, + "step": 30619 + }, + { + "epoch": 9.39840392879067, + "grad_norm": 0.11818456649780273, + "learning_rate": 9.470872031885791e-07, + "loss": 1.6873, + "step": 30620 + }, + { + "epoch": 9.398710865561695, + "grad_norm": 0.14344888925552368, + "learning_rate": 9.461245838709942e-07, + "loss": 1.6942, + "step": 30621 + }, + { + "epoch": 9.39901780233272, + "grad_norm": 0.141475647687912, + "learning_rate": 9.451624493325539e-07, + "loss": 1.7613, + "step": 30622 + }, + { + "epoch": 9.399324739103745, + "grad_norm": 0.13234710693359375, + "learning_rate": 9.442007995827784e-07, + "loss": 1.6922, + "step": 30623 + }, + { + "epoch": 9.399631675874769, + "grad_norm": 0.12975256145000458, + "learning_rate": 9.432396346311545e-07, + "loss": 1.7071, + "step": 30624 + }, + { + "epoch": 9.399938612645794, + "grad_norm": 0.12574951350688934, + "learning_rate": 9.422789544872024e-07, + "loss": 1.6663, + "step": 30625 + }, + { + "epoch": 9.40024554941682, + "grad_norm": 0.13539808988571167, + "learning_rate": 9.413187591603922e-07, + "loss": 1.7087, + "step": 30626 + }, + { + "epoch": 9.400552486187845, + "grad_norm": 0.18200458586215973, + "learning_rate": 9.403590486602221e-07, + "loss": 1.7497, + "step": 30627 + }, + { + "epoch": 9.40085942295887, + "grad_norm": 0.1656341254711151, + "learning_rate": 9.393998229961898e-07, + 
"loss": 1.7424, + "step": 30628 + }, + { + "epoch": 9.401166359729896, + "grad_norm": 0.13709864020347595, + "learning_rate": 9.384410821777545e-07, + "loss": 1.7228, + "step": 30629 + }, + { + "epoch": 9.401473296500921, + "grad_norm": 0.1603628247976303, + "learning_rate": 9.374828262144031e-07, + "loss": 1.7145, + "step": 30630 + }, + { + "epoch": 9.401780233271946, + "grad_norm": 0.14841997623443604, + "learning_rate": 9.365250551156002e-07, + "loss": 1.7022, + "step": 30631 + }, + { + "epoch": 9.402087170042972, + "grad_norm": 0.12113026529550552, + "learning_rate": 9.35567768890816e-07, + "loss": 1.6537, + "step": 30632 + }, + { + "epoch": 9.402394106813997, + "grad_norm": 0.1314094364643097, + "learning_rate": 9.346109675495096e-07, + "loss": 1.69, + "step": 30633 + }, + { + "epoch": 9.402701043585022, + "grad_norm": 0.1479753851890564, + "learning_rate": 9.336546511011346e-07, + "loss": 1.7403, + "step": 30634 + }, + { + "epoch": 9.403007980356048, + "grad_norm": 0.15644671022891998, + "learning_rate": 9.326988195551445e-07, + "loss": 1.7251, + "step": 30635 + }, + { + "epoch": 9.403314917127071, + "grad_norm": 0.14952129125595093, + "learning_rate": 9.317434729209817e-07, + "loss": 1.6915, + "step": 30636 + }, + { + "epoch": 9.403621853898096, + "grad_norm": 0.11758700013160706, + "learning_rate": 9.307886112080943e-07, + "loss": 1.6961, + "step": 30637 + }, + { + "epoch": 9.403928790669122, + "grad_norm": 0.10613285005092621, + "learning_rate": 9.298342344259081e-07, + "loss": 1.6668, + "step": 30638 + }, + { + "epoch": 9.404235727440147, + "grad_norm": 0.11807837337255478, + "learning_rate": 9.288803425838655e-07, + "loss": 1.6642, + "step": 30639 + }, + { + "epoch": 9.404542664211172, + "grad_norm": 0.17462679743766785, + "learning_rate": 9.279269356913866e-07, + "loss": 1.6935, + "step": 30640 + }, + { + "epoch": 9.404849600982198, + "grad_norm": 0.12297552078962326, + "learning_rate": 9.26974013757892e-07, + "loss": 1.6782, + "step": 30641 + }, + { + 
"epoch": 9.405156537753223, + "grad_norm": 0.11738404631614685, + "learning_rate": 9.260215767928127e-07, + "loss": 1.6913, + "step": 30642 + }, + { + "epoch": 9.405463474524248, + "grad_norm": 0.20638801157474518, + "learning_rate": 9.250696248055468e-07, + "loss": 1.7434, + "step": 30643 + }, + { + "epoch": 9.405770411295274, + "grad_norm": 0.23646225035190582, + "learning_rate": 9.241181578055036e-07, + "loss": 1.754, + "step": 30644 + }, + { + "epoch": 9.4060773480663, + "grad_norm": 0.1305943727493286, + "learning_rate": 9.231671758020921e-07, + "loss": 1.7006, + "step": 30645 + }, + { + "epoch": 9.406384284837323, + "grad_norm": 0.1624198704957962, + "learning_rate": 9.222166788047049e-07, + "loss": 1.7205, + "step": 30646 + }, + { + "epoch": 9.406691221608348, + "grad_norm": 0.17408986389636993, + "learning_rate": 9.212666668227399e-07, + "loss": 1.7302, + "step": 30647 + }, + { + "epoch": 9.406998158379373, + "grad_norm": 0.19994081556797028, + "learning_rate": 9.203171398655785e-07, + "loss": 1.7616, + "step": 30648 + }, + { + "epoch": 9.407305095150399, + "grad_norm": 0.12456551194190979, + "learning_rate": 9.19368097942619e-07, + "loss": 1.6915, + "step": 30649 + }, + { + "epoch": 9.407612031921424, + "grad_norm": 0.11373740434646606, + "learning_rate": 9.184195410632257e-07, + "loss": 1.6679, + "step": 30650 + }, + { + "epoch": 9.40791896869245, + "grad_norm": 0.1356983780860901, + "learning_rate": 9.174714692367748e-07, + "loss": 1.7142, + "step": 30651 + }, + { + "epoch": 9.408225905463475, + "grad_norm": 0.17130546271800995, + "learning_rate": 9.165238824726474e-07, + "loss": 1.7554, + "step": 30652 + }, + { + "epoch": 9.4085328422345, + "grad_norm": 0.12105514854192734, + "learning_rate": 9.155767807801918e-07, + "loss": 1.6938, + "step": 30653 + }, + { + "epoch": 9.408839779005525, + "grad_norm": 0.1510905921459198, + "learning_rate": 9.146301641687837e-07, + "loss": 1.6835, + "step": 30654 + }, + { + "epoch": 9.40914671577655, + "grad_norm": 
0.1589810699224472, + "learning_rate": 9.136840326477658e-07, + "loss": 1.7135, + "step": 30655 + }, + { + "epoch": 9.409453652547576, + "grad_norm": 0.14998911321163177, + "learning_rate": 9.127383862264915e-07, + "loss": 1.7078, + "step": 30656 + }, + { + "epoch": 9.4097605893186, + "grad_norm": 0.1262497901916504, + "learning_rate": 9.11793224914309e-07, + "loss": 1.7015, + "step": 30657 + }, + { + "epoch": 9.410067526089625, + "grad_norm": 0.17526039481163025, + "learning_rate": 9.108485487205498e-07, + "loss": 1.725, + "step": 30658 + }, + { + "epoch": 9.41037446286065, + "grad_norm": 0.18700073659420013, + "learning_rate": 9.099043576545674e-07, + "loss": 1.711, + "step": 30659 + }, + { + "epoch": 9.410681399631676, + "grad_norm": 0.12407290935516357, + "learning_rate": 9.089606517256821e-07, + "loss": 1.6912, + "step": 30660 + }, + { + "epoch": 9.410988336402701, + "grad_norm": 0.14186540246009827, + "learning_rate": 9.080174309432199e-07, + "loss": 1.7082, + "step": 30661 + }, + { + "epoch": 9.411295273173726, + "grad_norm": 0.16852159798145294, + "learning_rate": 9.07074695316501e-07, + "loss": 1.7107, + "step": 30662 + }, + { + "epoch": 9.411602209944752, + "grad_norm": 0.18337292969226837, + "learning_rate": 9.061324448548403e-07, + "loss": 1.7359, + "step": 30663 + }, + { + "epoch": 9.411909146715777, + "grad_norm": 0.1463366150856018, + "learning_rate": 9.051906795675635e-07, + "loss": 1.6903, + "step": 30664 + }, + { + "epoch": 9.412216083486802, + "grad_norm": 0.1920327991247177, + "learning_rate": 9.042493994639579e-07, + "loss": 1.7113, + "step": 30665 + }, + { + "epoch": 9.412523020257828, + "grad_norm": 0.2031734585762024, + "learning_rate": 9.033086045533434e-07, + "loss": 1.7663, + "step": 30666 + }, + { + "epoch": 9.412829957028851, + "grad_norm": 0.15997421741485596, + "learning_rate": 9.023682948450019e-07, + "loss": 1.7607, + "step": 30667 + }, + { + "epoch": 9.413136893799877, + "grad_norm": 0.1264960914850235, + "learning_rate": 
9.014284703482422e-07, + "loss": 1.6638, + "step": 30668 + }, + { + "epoch": 9.413443830570902, + "grad_norm": 0.13021783530712128, + "learning_rate": 9.004891310723407e-07, + "loss": 1.6783, + "step": 30669 + }, + { + "epoch": 9.413750767341927, + "grad_norm": 0.13910266757011414, + "learning_rate": 8.995502770265785e-07, + "loss": 1.7262, + "step": 30670 + }, + { + "epoch": 9.414057704112953, + "grad_norm": 0.1369626671075821, + "learning_rate": 8.986119082202482e-07, + "loss": 1.6998, + "step": 30671 + }, + { + "epoch": 9.414364640883978, + "grad_norm": 0.1432434767484665, + "learning_rate": 8.976740246626092e-07, + "loss": 1.7156, + "step": 30672 + }, + { + "epoch": 9.414671577655003, + "grad_norm": 0.2088400423526764, + "learning_rate": 8.967366263629373e-07, + "loss": 1.7551, + "step": 30673 + }, + { + "epoch": 9.414978514426029, + "grad_norm": 0.1348891705274582, + "learning_rate": 8.957997133304918e-07, + "loss": 1.6757, + "step": 30674 + }, + { + "epoch": 9.415285451197054, + "grad_norm": 0.15271534025669098, + "learning_rate": 8.94863285574532e-07, + "loss": 1.7382, + "step": 30675 + }, + { + "epoch": 9.41559238796808, + "grad_norm": 0.14035186171531677, + "learning_rate": 8.939273431043227e-07, + "loss": 1.7186, + "step": 30676 + }, + { + "epoch": 9.415899324739105, + "grad_norm": 0.11167564988136292, + "learning_rate": 8.929918859291009e-07, + "loss": 1.6706, + "step": 30677 + }, + { + "epoch": 9.416206261510128, + "grad_norm": 0.12790827453136444, + "learning_rate": 8.920569140581148e-07, + "loss": 1.6824, + "step": 30678 + }, + { + "epoch": 9.416513198281153, + "grad_norm": 0.11640806496143341, + "learning_rate": 8.911224275006069e-07, + "loss": 1.7236, + "step": 30679 + }, + { + "epoch": 9.416820135052179, + "grad_norm": 0.19866923987865448, + "learning_rate": 8.901884262658089e-07, + "loss": 1.7744, + "step": 30680 + }, + { + "epoch": 9.417127071823204, + "grad_norm": 0.12702670693397522, + "learning_rate": 8.892549103629577e-07, + "loss": 1.6858, + 
"step": 30681 + }, + { + "epoch": 9.41743400859423, + "grad_norm": 0.10487339645624161, + "learning_rate": 8.883218798012683e-07, + "loss": 1.6862, + "step": 30682 + }, + { + "epoch": 9.417740945365255, + "grad_norm": 0.1706196665763855, + "learning_rate": 8.87389334589972e-07, + "loss": 1.7688, + "step": 30683 + }, + { + "epoch": 9.41804788213628, + "grad_norm": 0.18874917924404144, + "learning_rate": 8.864572747382782e-07, + "loss": 1.7335, + "step": 30684 + }, + { + "epoch": 9.418354818907305, + "grad_norm": 0.12817202508449554, + "learning_rate": 8.855257002553963e-07, + "loss": 1.7276, + "step": 30685 + }, + { + "epoch": 9.41866175567833, + "grad_norm": 0.16661255061626434, + "learning_rate": 8.84594611150541e-07, + "loss": 1.7635, + "step": 30686 + }, + { + "epoch": 9.418968692449356, + "grad_norm": 0.1407301425933838, + "learning_rate": 8.836640074329106e-07, + "loss": 1.6675, + "step": 30687 + }, + { + "epoch": 9.419275629220381, + "grad_norm": 0.1266261488199234, + "learning_rate": 8.827338891116976e-07, + "loss": 1.685, + "step": 30688 + }, + { + "epoch": 9.419582565991405, + "grad_norm": 0.1475544422864914, + "learning_rate": 8.818042561961004e-07, + "loss": 1.7009, + "step": 30689 + }, + { + "epoch": 9.41988950276243, + "grad_norm": 0.15595827996730804, + "learning_rate": 8.808751086953005e-07, + "loss": 1.7571, + "step": 30690 + }, + { + "epoch": 9.420196439533456, + "grad_norm": 0.1931566298007965, + "learning_rate": 8.799464466184793e-07, + "loss": 1.7627, + "step": 30691 + }, + { + "epoch": 9.420503376304481, + "grad_norm": 0.12590163946151733, + "learning_rate": 8.790182699748128e-07, + "loss": 1.6673, + "step": 30692 + }, + { + "epoch": 9.420810313075506, + "grad_norm": 0.135042205452919, + "learning_rate": 8.780905787734939e-07, + "loss": 1.7102, + "step": 30693 + }, + { + "epoch": 9.421117249846532, + "grad_norm": 0.15336740016937256, + "learning_rate": 8.77163373023665e-07, + "loss": 1.7079, + "step": 30694 + }, + { + "epoch": 
9.421424186617557, + "grad_norm": 0.1408243626356125, + "learning_rate": 8.762366527345022e-07, + "loss": 1.7039, + "step": 30695 + }, + { + "epoch": 9.421731123388582, + "grad_norm": 0.16094304621219635, + "learning_rate": 8.753104179151595e-07, + "loss": 1.7415, + "step": 30696 + }, + { + "epoch": 9.422038060159608, + "grad_norm": 0.1549450308084488, + "learning_rate": 8.743846685747903e-07, + "loss": 1.709, + "step": 30697 + }, + { + "epoch": 9.422344996930633, + "grad_norm": 0.1558622568845749, + "learning_rate": 8.734594047225486e-07, + "loss": 1.7329, + "step": 30698 + }, + { + "epoch": 9.422651933701658, + "grad_norm": 0.10400709509849548, + "learning_rate": 8.725346263675716e-07, + "loss": 1.6449, + "step": 30699 + }, + { + "epoch": 9.422958870472682, + "grad_norm": 0.12266384065151215, + "learning_rate": 8.716103335190073e-07, + "loss": 1.6983, + "step": 30700 + }, + { + "epoch": 9.423265807243707, + "grad_norm": 0.14934386312961578, + "learning_rate": 8.70686526185982e-07, + "loss": 1.7348, + "step": 30701 + }, + { + "epoch": 9.423572744014733, + "grad_norm": 0.18102359771728516, + "learning_rate": 8.697632043776271e-07, + "loss": 1.7326, + "step": 30702 + }, + { + "epoch": 9.423879680785758, + "grad_norm": 0.16218020021915436, + "learning_rate": 8.688403681030688e-07, + "loss": 1.7721, + "step": 30703 + }, + { + "epoch": 9.424186617556783, + "grad_norm": 0.15908999741077423, + "learning_rate": 8.679180173714275e-07, + "loss": 1.7115, + "step": 30704 + }, + { + "epoch": 9.424493554327809, + "grad_norm": 0.13521069288253784, + "learning_rate": 8.66996152191818e-07, + "loss": 1.683, + "step": 30705 + }, + { + "epoch": 9.424800491098834, + "grad_norm": 0.1464395523071289, + "learning_rate": 8.660747725733497e-07, + "loss": 1.7209, + "step": 30706 + }, + { + "epoch": 9.42510742786986, + "grad_norm": 0.10634544491767883, + "learning_rate": 8.651538785251267e-07, + "loss": 1.6366, + "step": 30707 + }, + { + "epoch": 9.425414364640885, + "grad_norm": 
0.15586215257644653, + "learning_rate": 8.642334700562526e-07, + "loss": 1.7363, + "step": 30708 + }, + { + "epoch": 9.42572130141191, + "grad_norm": 0.14794576168060303, + "learning_rate": 8.633135471758203e-07, + "loss": 1.7143, + "step": 30709 + }, + { + "epoch": 9.426028238182933, + "grad_norm": 0.14911554753780365, + "learning_rate": 8.623941098929334e-07, + "loss": 1.6894, + "step": 30710 + }, + { + "epoch": 9.426335174953959, + "grad_norm": 0.16456535458564758, + "learning_rate": 8.614751582166625e-07, + "loss": 1.7059, + "step": 30711 + }, + { + "epoch": 9.426642111724984, + "grad_norm": 0.2710132300853729, + "learning_rate": 8.605566921560948e-07, + "loss": 1.738, + "step": 30712 + }, + { + "epoch": 9.42694904849601, + "grad_norm": 0.12046913802623749, + "learning_rate": 8.596387117203064e-07, + "loss": 1.7124, + "step": 30713 + }, + { + "epoch": 9.427255985267035, + "grad_norm": 0.1438749134540558, + "learning_rate": 8.587212169183679e-07, + "loss": 1.7014, + "step": 30714 + }, + { + "epoch": 9.42756292203806, + "grad_norm": 0.13070370256900787, + "learning_rate": 8.578042077593551e-07, + "loss": 1.6561, + "step": 30715 + }, + { + "epoch": 9.427869858809085, + "grad_norm": 0.13055887818336487, + "learning_rate": 8.568876842523166e-07, + "loss": 1.6791, + "step": 30716 + }, + { + "epoch": 9.42817679558011, + "grad_norm": 0.13492754101753235, + "learning_rate": 8.559716464063284e-07, + "loss": 1.6803, + "step": 30717 + }, + { + "epoch": 9.428483732351136, + "grad_norm": 0.17521773278713226, + "learning_rate": 8.55056094230422e-07, + "loss": 1.7152, + "step": 30718 + }, + { + "epoch": 9.428790669122161, + "grad_norm": 0.18804030120372772, + "learning_rate": 8.541410277336625e-07, + "loss": 1.7344, + "step": 30719 + }, + { + "epoch": 9.429097605893187, + "grad_norm": 0.14698217809200287, + "learning_rate": 8.532264469250873e-07, + "loss": 1.6978, + "step": 30720 + }, + { + "epoch": 9.42940454266421, + "grad_norm": 0.10534154623746872, + "learning_rate": 
8.523123518137277e-07, + "loss": 1.6536, + "step": 30721 + }, + { + "epoch": 9.429711479435236, + "grad_norm": 0.13445980846881866, + "learning_rate": 8.513987424086323e-07, + "loss": 1.6999, + "step": 30722 + }, + { + "epoch": 9.430018416206261, + "grad_norm": 0.19551974534988403, + "learning_rate": 8.50485618718816e-07, + "loss": 1.7358, + "step": 30723 + }, + { + "epoch": 9.430325352977286, + "grad_norm": 0.13450706005096436, + "learning_rate": 8.495729807533104e-07, + "loss": 1.7157, + "step": 30724 + }, + { + "epoch": 9.430632289748312, + "grad_norm": 0.17215101420879364, + "learning_rate": 8.486608285211306e-07, + "loss": 1.7139, + "step": 30725 + }, + { + "epoch": 9.430939226519337, + "grad_norm": 0.15021352469921112, + "learning_rate": 8.477491620312916e-07, + "loss": 1.7179, + "step": 30726 + }, + { + "epoch": 9.431246163290362, + "grad_norm": 0.13625288009643555, + "learning_rate": 8.468379812928084e-07, + "loss": 1.6991, + "step": 30727 + }, + { + "epoch": 9.431553100061388, + "grad_norm": 0.09747711569070816, + "learning_rate": 8.459272863146794e-07, + "loss": 1.6463, + "step": 30728 + }, + { + "epoch": 9.431860036832413, + "grad_norm": 0.13644148409366608, + "learning_rate": 8.450170771059085e-07, + "loss": 1.7026, + "step": 30729 + }, + { + "epoch": 9.432166973603438, + "grad_norm": 0.12617720663547516, + "learning_rate": 8.441073536754884e-07, + "loss": 1.6705, + "step": 30730 + }, + { + "epoch": 9.432473910374464, + "grad_norm": 0.12123163044452667, + "learning_rate": 8.431981160324065e-07, + "loss": 1.681, + "step": 30731 + }, + { + "epoch": 9.432780847145487, + "grad_norm": 0.18256647884845734, + "learning_rate": 8.422893641856611e-07, + "loss": 1.7564, + "step": 30732 + }, + { + "epoch": 9.433087783916513, + "grad_norm": 0.14204107224941254, + "learning_rate": 8.413810981442171e-07, + "loss": 1.7232, + "step": 30733 + }, + { + "epoch": 9.433394720687538, + "grad_norm": 0.1158083900809288, + "learning_rate": 8.404733179170677e-07, + "loss": 1.665, 
+ "step": 30734 + }, + { + "epoch": 9.433701657458563, + "grad_norm": 0.18204176425933838, + "learning_rate": 8.395660235131608e-07, + "loss": 1.7477, + "step": 30735 + }, + { + "epoch": 9.434008594229589, + "grad_norm": 0.15896224975585938, + "learning_rate": 8.38659214941484e-07, + "loss": 1.8096, + "step": 30736 + }, + { + "epoch": 9.434315531000614, + "grad_norm": 0.10890607535839081, + "learning_rate": 8.377528922109912e-07, + "loss": 1.6713, + "step": 30737 + }, + { + "epoch": 9.43462246777164, + "grad_norm": 0.117277592420578, + "learning_rate": 8.368470553306417e-07, + "loss": 1.7043, + "step": 30738 + }, + { + "epoch": 9.434929404542665, + "grad_norm": 0.12226385623216629, + "learning_rate": 8.359417043093787e-07, + "loss": 1.6709, + "step": 30739 + }, + { + "epoch": 9.43523634131369, + "grad_norm": 0.16085174679756165, + "learning_rate": 8.350368391561614e-07, + "loss": 1.7212, + "step": 30740 + }, + { + "epoch": 9.435543278084715, + "grad_norm": 0.15585030615329742, + "learning_rate": 8.341324598799216e-07, + "loss": 1.743, + "step": 30741 + }, + { + "epoch": 9.43585021485574, + "grad_norm": 0.11419086158275604, + "learning_rate": 8.332285664896078e-07, + "loss": 1.6775, + "step": 30742 + }, + { + "epoch": 9.436157151626764, + "grad_norm": 0.1748945116996765, + "learning_rate": 8.323251589941405e-07, + "loss": 1.7916, + "step": 30743 + }, + { + "epoch": 9.43646408839779, + "grad_norm": 0.14767593145370483, + "learning_rate": 8.314222374024572e-07, + "loss": 1.7165, + "step": 30744 + }, + { + "epoch": 9.436771025168815, + "grad_norm": 0.1396973431110382, + "learning_rate": 8.305198017234783e-07, + "loss": 1.68, + "step": 30745 + }, + { + "epoch": 9.43707796193984, + "grad_norm": 0.14533667266368866, + "learning_rate": 8.296178519661246e-07, + "loss": 1.6742, + "step": 30746 + }, + { + "epoch": 9.437384898710865, + "grad_norm": 0.14526952803134918, + "learning_rate": 8.287163881393001e-07, + "loss": 1.7084, + "step": 30747 + }, + { + "epoch": 
9.43769183548189, + "grad_norm": 0.10500401258468628, + "learning_rate": 8.278154102519198e-07, + "loss": 1.6581, + "step": 30748 + }, + { + "epoch": 9.437998772252916, + "grad_norm": 0.12266987562179565, + "learning_rate": 8.269149183128988e-07, + "loss": 1.7261, + "step": 30749 + }, + { + "epoch": 9.438305709023942, + "grad_norm": 0.11223867535591125, + "learning_rate": 8.260149123311134e-07, + "loss": 1.6873, + "step": 30750 + }, + { + "epoch": 9.438612645794967, + "grad_norm": 0.1576405167579651, + "learning_rate": 8.251153923154842e-07, + "loss": 1.6975, + "step": 30751 + }, + { + "epoch": 9.438919582565992, + "grad_norm": 0.14165537059307098, + "learning_rate": 8.242163582748763e-07, + "loss": 1.6916, + "step": 30752 + }, + { + "epoch": 9.439226519337016, + "grad_norm": 0.11340904235839844, + "learning_rate": 8.233178102181882e-07, + "loss": 1.6931, + "step": 30753 + }, + { + "epoch": 9.439533456108041, + "grad_norm": 0.13339579105377197, + "learning_rate": 8.224197481542962e-07, + "loss": 1.717, + "step": 30754 + }, + { + "epoch": 9.439840392879066, + "grad_norm": 0.19762879610061646, + "learning_rate": 8.215221720920762e-07, + "loss": 1.7263, + "step": 30755 + }, + { + "epoch": 9.440147329650092, + "grad_norm": 0.13339634239673615, + "learning_rate": 8.206250820403993e-07, + "loss": 1.7135, + "step": 30756 + }, + { + "epoch": 9.440454266421117, + "grad_norm": 0.17574037611484528, + "learning_rate": 8.197284780081305e-07, + "loss": 1.704, + "step": 30757 + }, + { + "epoch": 9.440761203192142, + "grad_norm": 0.15657347440719604, + "learning_rate": 8.188323600041293e-07, + "loss": 1.7323, + "step": 30758 + }, + { + "epoch": 9.441068139963168, + "grad_norm": 0.14541512727737427, + "learning_rate": 8.179367280372552e-07, + "loss": 1.6631, + "step": 30759 + }, + { + "epoch": 9.441375076734193, + "grad_norm": 0.13230635225772858, + "learning_rate": 8.170415821163568e-07, + "loss": 1.7423, + "step": 30760 + }, + { + "epoch": 9.441682013505218, + "grad_norm": 
0.11934958398342133, + "learning_rate": 8.161469222502771e-07, + "loss": 1.6604, + "step": 30761 + }, + { + "epoch": 9.441988950276244, + "grad_norm": 0.27107498049736023, + "learning_rate": 8.152527484478645e-07, + "loss": 1.7381, + "step": 30762 + }, + { + "epoch": 9.442295887047269, + "grad_norm": 0.13428844511508942, + "learning_rate": 8.143590607179508e-07, + "loss": 1.7329, + "step": 30763 + }, + { + "epoch": 9.442602823818293, + "grad_norm": 0.1932329535484314, + "learning_rate": 8.134658590693678e-07, + "loss": 1.7865, + "step": 30764 + }, + { + "epoch": 9.442909760589318, + "grad_norm": 0.14267316460609436, + "learning_rate": 8.125731435109419e-07, + "loss": 1.6631, + "step": 30765 + }, + { + "epoch": 9.443216697360343, + "grad_norm": 0.14356668293476105, + "learning_rate": 8.116809140515047e-07, + "loss": 1.7113, + "step": 30766 + }, + { + "epoch": 9.443523634131369, + "grad_norm": 0.13832272589206696, + "learning_rate": 8.107891706998605e-07, + "loss": 1.6945, + "step": 30767 + }, + { + "epoch": 9.443830570902394, + "grad_norm": 0.12506070733070374, + "learning_rate": 8.098979134648355e-07, + "loss": 1.6563, + "step": 30768 + }, + { + "epoch": 9.44413750767342, + "grad_norm": 0.12485837191343307, + "learning_rate": 8.090071423552226e-07, + "loss": 1.6704, + "step": 30769 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.14637434482574463, + "learning_rate": 8.08116857379837e-07, + "loss": 1.696, + "step": 30770 + }, + { + "epoch": 9.44475138121547, + "grad_norm": 0.16358232498168945, + "learning_rate": 8.072270585474773e-07, + "loss": 1.6779, + "step": 30771 + }, + { + "epoch": 9.445058317986495, + "grad_norm": 0.1596413105726242, + "learning_rate": 8.063377458669252e-07, + "loss": 1.7176, + "step": 30772 + }, + { + "epoch": 9.44536525475752, + "grad_norm": 0.20140093564987183, + "learning_rate": 8.054489193469794e-07, + "loss": 1.7194, + "step": 30773 + }, + { + "epoch": 9.445672191528546, + "grad_norm": 0.15662498772144318, + "learning_rate": 
8.045605789964216e-07, + "loss": 1.7523, + "step": 30774 + }, + { + "epoch": 9.44597912829957, + "grad_norm": 0.13946685194969177, + "learning_rate": 8.036727248240339e-07, + "loss": 1.6938, + "step": 30775 + }, + { + "epoch": 9.446286065070595, + "grad_norm": 0.1664799004793167, + "learning_rate": 8.027853568385812e-07, + "loss": 1.7221, + "step": 30776 + }, + { + "epoch": 9.44659300184162, + "grad_norm": 0.11461447924375534, + "learning_rate": 8.018984750488456e-07, + "loss": 1.6839, + "step": 30777 + }, + { + "epoch": 9.446899938612646, + "grad_norm": 0.12960290908813477, + "learning_rate": 8.010120794635812e-07, + "loss": 1.6936, + "step": 30778 + }, + { + "epoch": 9.44720687538367, + "grad_norm": 0.16561156511306763, + "learning_rate": 8.00126170091553e-07, + "loss": 1.693, + "step": 30779 + }, + { + "epoch": 9.447513812154696, + "grad_norm": 0.1481138914823532, + "learning_rate": 7.992407469415152e-07, + "loss": 1.721, + "step": 30780 + }, + { + "epoch": 9.447820748925722, + "grad_norm": 0.15538595616817474, + "learning_rate": 7.983558100222166e-07, + "loss": 1.6943, + "step": 30781 + }, + { + "epoch": 9.448127685696747, + "grad_norm": 0.1411004513502121, + "learning_rate": 7.974713593423999e-07, + "loss": 1.7118, + "step": 30782 + }, + { + "epoch": 9.448434622467772, + "grad_norm": 0.14798758924007416, + "learning_rate": 7.965873949108193e-07, + "loss": 1.7267, + "step": 30783 + }, + { + "epoch": 9.448741559238798, + "grad_norm": 0.12365894019603729, + "learning_rate": 7.957039167361902e-07, + "loss": 1.6921, + "step": 30784 + }, + { + "epoch": 9.449048496009823, + "grad_norm": 0.12739478051662445, + "learning_rate": 7.948209248272609e-07, + "loss": 1.6827, + "step": 30785 + }, + { + "epoch": 9.449355432780846, + "grad_norm": 0.11038246005773544, + "learning_rate": 7.939384191927469e-07, + "loss": 1.6778, + "step": 30786 + }, + { + "epoch": 9.449662369551872, + "grad_norm": 0.14928553998470306, + "learning_rate": 7.930563998413798e-07, + "loss": 1.7197, + 
"step": 30787 + }, + { + "epoch": 9.449969306322897, + "grad_norm": 0.2465045005083084, + "learning_rate": 7.921748667818695e-07, + "loss": 1.751, + "step": 30788 + }, + { + "epoch": 9.450276243093922, + "grad_norm": 0.1846035271883011, + "learning_rate": 7.912938200229259e-07, + "loss": 1.753, + "step": 30789 + }, + { + "epoch": 9.450583179864948, + "grad_norm": 0.13683682680130005, + "learning_rate": 7.904132595732639e-07, + "loss": 1.7112, + "step": 30790 + }, + { + "epoch": 9.450890116635973, + "grad_norm": 0.1144467145204544, + "learning_rate": 7.895331854415766e-07, + "loss": 1.678, + "step": 30791 + }, + { + "epoch": 9.451197053406998, + "grad_norm": 0.11407051235437393, + "learning_rate": 7.886535976365628e-07, + "loss": 1.6573, + "step": 30792 + }, + { + "epoch": 9.451503990178024, + "grad_norm": 0.14853791892528534, + "learning_rate": 7.877744961669209e-07, + "loss": 1.7284, + "step": 30793 + }, + { + "epoch": 9.45181092694905, + "grad_norm": 0.15787862241268158, + "learning_rate": 7.868958810413385e-07, + "loss": 1.7638, + "step": 30794 + }, + { + "epoch": 9.452117863720074, + "grad_norm": 0.1264905035495758, + "learning_rate": 7.86017752268492e-07, + "loss": 1.6968, + "step": 30795 + }, + { + "epoch": 9.452424800491098, + "grad_norm": 0.15339265763759613, + "learning_rate": 7.851401098570632e-07, + "loss": 1.6885, + "step": 30796 + }, + { + "epoch": 9.452731737262123, + "grad_norm": 0.14742697775363922, + "learning_rate": 7.842629538157286e-07, + "loss": 1.7038, + "step": 30797 + }, + { + "epoch": 9.453038674033149, + "grad_norm": 0.16144371032714844, + "learning_rate": 7.833862841531536e-07, + "loss": 1.7374, + "step": 30798 + }, + { + "epoch": 9.453345610804174, + "grad_norm": 0.15689444541931152, + "learning_rate": 7.825101008779979e-07, + "loss": 1.7509, + "step": 30799 + }, + { + "epoch": 9.4536525475752, + "grad_norm": 0.16697221994400024, + "learning_rate": 7.81634403998932e-07, + "loss": 1.7841, + "step": 30800 + }, + { + "epoch": 
9.453959484346225, + "grad_norm": 0.11735955625772476, + "learning_rate": 7.80759193524594e-07, + "loss": 1.6864, + "step": 30801 + }, + { + "epoch": 9.45426642111725, + "grad_norm": 0.13182209432125092, + "learning_rate": 7.798844694636487e-07, + "loss": 1.6834, + "step": 30802 + }, + { + "epoch": 9.454573357888275, + "grad_norm": 0.12708893418312073, + "learning_rate": 7.790102318247283e-07, + "loss": 1.6529, + "step": 30803 + }, + { + "epoch": 9.4548802946593, + "grad_norm": 0.11800631135702133, + "learning_rate": 7.781364806164815e-07, + "loss": 1.7123, + "step": 30804 + }, + { + "epoch": 9.455187231430326, + "grad_norm": 0.2169203758239746, + "learning_rate": 7.772632158475401e-07, + "loss": 1.7522, + "step": 30805 + }, + { + "epoch": 9.455494168201351, + "grad_norm": 0.1831941157579422, + "learning_rate": 7.763904375265307e-07, + "loss": 1.8109, + "step": 30806 + }, + { + "epoch": 9.455801104972375, + "grad_norm": 0.1484314352273941, + "learning_rate": 7.755181456620852e-07, + "loss": 1.7101, + "step": 30807 + }, + { + "epoch": 9.4561080417434, + "grad_norm": 0.10662242770195007, + "learning_rate": 7.74646340262819e-07, + "loss": 1.679, + "step": 30808 + }, + { + "epoch": 9.456414978514426, + "grad_norm": 0.13147766888141632, + "learning_rate": 7.737750213373529e-07, + "loss": 1.6738, + "step": 30809 + }, + { + "epoch": 9.456721915285451, + "grad_norm": 0.14727403223514557, + "learning_rate": 7.729041888942911e-07, + "loss": 1.7048, + "step": 30810 + }, + { + "epoch": 9.457028852056476, + "grad_norm": 0.1278834491968155, + "learning_rate": 7.720338429422436e-07, + "loss": 1.7102, + "step": 30811 + }, + { + "epoch": 9.457335788827502, + "grad_norm": 0.13472500443458557, + "learning_rate": 7.711639834898143e-07, + "loss": 1.7265, + "step": 30812 + }, + { + "epoch": 9.457642725598527, + "grad_norm": 0.1379247009754181, + "learning_rate": 7.702946105455911e-07, + "loss": 1.7381, + "step": 30813 + }, + { + "epoch": 9.457949662369552, + "grad_norm": 
0.13163436949253082, + "learning_rate": 7.694257241181723e-07, + "loss": 1.7077, + "step": 30814 + }, + { + "epoch": 9.458256599140578, + "grad_norm": 0.18956807255744934, + "learning_rate": 7.685573242161459e-07, + "loss": 1.7148, + "step": 30815 + }, + { + "epoch": 9.458563535911603, + "grad_norm": 0.0954909548163414, + "learning_rate": 7.676894108480881e-07, + "loss": 1.625, + "step": 30816 + }, + { + "epoch": 9.458870472682626, + "grad_norm": 0.16598805785179138, + "learning_rate": 7.668219840225866e-07, + "loss": 1.7074, + "step": 30817 + }, + { + "epoch": 9.459177409453652, + "grad_norm": 0.13503910601139069, + "learning_rate": 7.659550437481955e-07, + "loss": 1.6738, + "step": 30818 + }, + { + "epoch": 9.459484346224677, + "grad_norm": 0.15524166822433472, + "learning_rate": 7.650885900335025e-07, + "loss": 1.7235, + "step": 30819 + }, + { + "epoch": 9.459791282995702, + "grad_norm": 0.1390114575624466, + "learning_rate": 7.642226228870563e-07, + "loss": 1.6718, + "step": 30820 + }, + { + "epoch": 9.460098219766728, + "grad_norm": 0.10782946646213531, + "learning_rate": 7.63357142317428e-07, + "loss": 1.6511, + "step": 30821 + }, + { + "epoch": 9.460405156537753, + "grad_norm": 0.29216310381889343, + "learning_rate": 7.624921483331549e-07, + "loss": 1.7298, + "step": 30822 + }, + { + "epoch": 9.460712093308778, + "grad_norm": 0.14348210394382477, + "learning_rate": 7.616276409427969e-07, + "loss": 1.7039, + "step": 30823 + }, + { + "epoch": 9.461019030079804, + "grad_norm": 0.15576337277889252, + "learning_rate": 7.607636201548918e-07, + "loss": 1.7253, + "step": 30824 + }, + { + "epoch": 9.46132596685083, + "grad_norm": 0.12783481180667877, + "learning_rate": 7.599000859779826e-07, + "loss": 1.6927, + "step": 30825 + }, + { + "epoch": 9.461632903621854, + "grad_norm": 0.1323290467262268, + "learning_rate": 7.590370384206014e-07, + "loss": 1.6943, + "step": 30826 + }, + { + "epoch": 9.46193984039288, + "grad_norm": 0.10137525945901871, + "learning_rate": 
7.581744774912747e-07, + "loss": 1.6775, + "step": 30827 + }, + { + "epoch": 9.462246777163903, + "grad_norm": 0.10773646086454391, + "learning_rate": 7.573124031985346e-07, + "loss": 1.6618, + "step": 30828 + }, + { + "epoch": 9.462553713934929, + "grad_norm": 0.12834392488002777, + "learning_rate": 7.564508155508909e-07, + "loss": 1.6934, + "step": 30829 + }, + { + "epoch": 9.462860650705954, + "grad_norm": 0.17545762658119202, + "learning_rate": 7.555897145568646e-07, + "loss": 1.7689, + "step": 30830 + }, + { + "epoch": 9.46316758747698, + "grad_norm": 0.13099749386310577, + "learning_rate": 7.547291002249657e-07, + "loss": 1.6901, + "step": 30831 + }, + { + "epoch": 9.463474524248005, + "grad_norm": 0.14668162167072296, + "learning_rate": 7.538689725636927e-07, + "loss": 1.7553, + "step": 30832 + }, + { + "epoch": 9.46378146101903, + "grad_norm": 0.14195361733436584, + "learning_rate": 7.530093315815557e-07, + "loss": 1.7015, + "step": 30833 + }, + { + "epoch": 9.464088397790055, + "grad_norm": 0.11229286342859268, + "learning_rate": 7.521501772870421e-07, + "loss": 1.6858, + "step": 30834 + }, + { + "epoch": 9.46439533456108, + "grad_norm": 0.15487706661224365, + "learning_rate": 7.512915096886397e-07, + "loss": 1.7377, + "step": 30835 + }, + { + "epoch": 9.464702271332106, + "grad_norm": 0.10888294875621796, + "learning_rate": 7.504333287948529e-07, + "loss": 1.651, + "step": 30836 + }, + { + "epoch": 9.465009208103131, + "grad_norm": 0.11357124894857407, + "learning_rate": 7.495756346141358e-07, + "loss": 1.6881, + "step": 30837 + }, + { + "epoch": 9.465316144874157, + "grad_norm": 0.11690666526556015, + "learning_rate": 7.48718427154993e-07, + "loss": 1.6567, + "step": 30838 + }, + { + "epoch": 9.46562308164518, + "grad_norm": 0.189022496342659, + "learning_rate": 7.478617064258675e-07, + "loss": 1.7489, + "step": 30839 + }, + { + "epoch": 9.465930018416206, + "grad_norm": 0.15130119025707245, + "learning_rate": 7.470054724352527e-07, + "loss": 1.7174, + 
"step": 30840 + }, + { + "epoch": 9.466236955187231, + "grad_norm": 0.13578876852989197, + "learning_rate": 7.461497251915917e-07, + "loss": 1.7101, + "step": 30841 + }, + { + "epoch": 9.466543891958256, + "grad_norm": 0.10819463431835175, + "learning_rate": 7.452944647033499e-07, + "loss": 1.6773, + "step": 30842 + }, + { + "epoch": 9.466850828729282, + "grad_norm": 0.23427242040634155, + "learning_rate": 7.444396909789763e-07, + "loss": 1.7527, + "step": 30843 + }, + { + "epoch": 9.467157765500307, + "grad_norm": 0.16425447165966034, + "learning_rate": 7.435854040269197e-07, + "loss": 1.6726, + "step": 30844 + }, + { + "epoch": 9.467464702271332, + "grad_norm": 0.14800399541854858, + "learning_rate": 7.427316038556231e-07, + "loss": 1.7177, + "step": 30845 + }, + { + "epoch": 9.467771639042358, + "grad_norm": 0.16622939705848694, + "learning_rate": 7.418782904735189e-07, + "loss": 1.7018, + "step": 30846 + }, + { + "epoch": 9.468078575813383, + "grad_norm": 0.1474144607782364, + "learning_rate": 7.410254638890501e-07, + "loss": 1.7278, + "step": 30847 + }, + { + "epoch": 9.468385512584408, + "grad_norm": 0.13024532794952393, + "learning_rate": 7.40173124110638e-07, + "loss": 1.7262, + "step": 30848 + }, + { + "epoch": 9.468692449355434, + "grad_norm": 0.12134112417697906, + "learning_rate": 7.393212711467035e-07, + "loss": 1.722, + "step": 30849 + }, + { + "epoch": 9.468999386126457, + "grad_norm": 0.1322898268699646, + "learning_rate": 7.384699050056731e-07, + "loss": 1.6728, + "step": 30850 + }, + { + "epoch": 9.469306322897483, + "grad_norm": 0.14417654275894165, + "learning_rate": 7.37619025695957e-07, + "loss": 1.667, + "step": 30851 + }, + { + "epoch": 9.469613259668508, + "grad_norm": 0.15618880093097687, + "learning_rate": 7.367686332259538e-07, + "loss": 1.7028, + "step": 30852 + }, + { + "epoch": 9.469920196439533, + "grad_norm": 0.14198319613933563, + "learning_rate": 7.359187276040902e-07, + "loss": 1.6945, + "step": 30853 + }, + { + "epoch": 
9.470227133210559, + "grad_norm": 0.19188794493675232, + "learning_rate": 7.350693088387428e-07, + "loss": 1.7103, + "step": 30854 + }, + { + "epoch": 9.470534069981584, + "grad_norm": 0.11949232220649719, + "learning_rate": 7.342203769383216e-07, + "loss": 1.6757, + "step": 30855 + }, + { + "epoch": 9.47084100675261, + "grad_norm": 0.1419954150915146, + "learning_rate": 7.333719319112031e-07, + "loss": 1.751, + "step": 30856 + }, + { + "epoch": 9.471147943523635, + "grad_norm": 0.13624246418476105, + "learning_rate": 7.325239737657863e-07, + "loss": 1.7212, + "step": 30857 + }, + { + "epoch": 9.47145488029466, + "grad_norm": 0.1910800039768219, + "learning_rate": 7.316765025104422e-07, + "loss": 1.7812, + "step": 30858 + }, + { + "epoch": 9.471761817065685, + "grad_norm": 0.1337525099515915, + "learning_rate": 7.308295181535474e-07, + "loss": 1.7106, + "step": 30859 + }, + { + "epoch": 9.472068753836709, + "grad_norm": 0.1155819520354271, + "learning_rate": 7.299830207034731e-07, + "loss": 1.6483, + "step": 30860 + }, + { + "epoch": 9.472375690607734, + "grad_norm": 0.12981106340885162, + "learning_rate": 7.291370101685846e-07, + "loss": 1.6897, + "step": 30861 + }, + { + "epoch": 9.47268262737876, + "grad_norm": 0.1460549235343933, + "learning_rate": 7.282914865572421e-07, + "loss": 1.7478, + "step": 30862 + }, + { + "epoch": 9.472989564149785, + "grad_norm": 0.14573179185390472, + "learning_rate": 7.274464498778055e-07, + "loss": 1.7013, + "step": 30863 + }, + { + "epoch": 9.47329650092081, + "grad_norm": 0.2089526355266571, + "learning_rate": 7.266019001386182e-07, + "loss": 1.7517, + "step": 30864 + }, + { + "epoch": 9.473603437691835, + "grad_norm": 0.14519059658050537, + "learning_rate": 7.257578373480345e-07, + "loss": 1.7181, + "step": 30865 + }, + { + "epoch": 9.47391037446286, + "grad_norm": 0.17337870597839355, + "learning_rate": 7.249142615143922e-07, + "loss": 1.7488, + "step": 30866 + }, + { + "epoch": 9.474217311233886, + "grad_norm": 
0.1789073348045349, + "learning_rate": 7.240711726460237e-07, + "loss": 1.7748, + "step": 30867 + }, + { + "epoch": 9.474524248004911, + "grad_norm": 0.12607963383197784, + "learning_rate": 7.232285707512664e-07, + "loss": 1.7183, + "step": 30868 + }, + { + "epoch": 9.474831184775937, + "grad_norm": 0.15094679594039917, + "learning_rate": 7.223864558384475e-07, + "loss": 1.7163, + "step": 30869 + }, + { + "epoch": 9.475138121546962, + "grad_norm": 0.12432575970888138, + "learning_rate": 7.215448279158932e-07, + "loss": 1.7252, + "step": 30870 + }, + { + "epoch": 9.475445058317986, + "grad_norm": 0.16342738270759583, + "learning_rate": 7.207036869919082e-07, + "loss": 1.7132, + "step": 30871 + }, + { + "epoch": 9.475751995089011, + "grad_norm": 0.11726677417755127, + "learning_rate": 7.198630330748191e-07, + "loss": 1.6582, + "step": 30872 + }, + { + "epoch": 9.476058931860036, + "grad_norm": 0.13808207213878632, + "learning_rate": 7.190228661729193e-07, + "loss": 1.6936, + "step": 30873 + }, + { + "epoch": 9.476365868631062, + "grad_norm": 0.13612079620361328, + "learning_rate": 7.181831862945298e-07, + "loss": 1.6744, + "step": 30874 + }, + { + "epoch": 9.476672805402087, + "grad_norm": 0.13610509037971497, + "learning_rate": 7.173439934479332e-07, + "loss": 1.7474, + "step": 30875 + }, + { + "epoch": 9.476979742173112, + "grad_norm": 0.17372582852840424, + "learning_rate": 7.165052876414335e-07, + "loss": 1.7328, + "step": 30876 + }, + { + "epoch": 9.477286678944138, + "grad_norm": 0.1300712525844574, + "learning_rate": 7.156670688833078e-07, + "loss": 1.6887, + "step": 30877 + }, + { + "epoch": 9.477593615715163, + "grad_norm": 0.17369040846824646, + "learning_rate": 7.148293371818493e-07, + "loss": 1.7768, + "step": 30878 + }, + { + "epoch": 9.477900552486188, + "grad_norm": 0.15355315804481506, + "learning_rate": 7.139920925453347e-07, + "loss": 1.7148, + "step": 30879 + }, + { + "epoch": 9.478207489257214, + "grad_norm": 0.1690572053194046, + "learning_rate": 
7.131553349820408e-07, + "loss": 1.711, + "step": 30880 + }, + { + "epoch": 9.478514426028239, + "grad_norm": 0.12726818025112152, + "learning_rate": 7.123190645002332e-07, + "loss": 1.6829, + "step": 30881 + }, + { + "epoch": 9.478821362799263, + "grad_norm": 0.12314258515834808, + "learning_rate": 7.114832811081717e-07, + "loss": 1.6579, + "step": 30882 + }, + { + "epoch": 9.479128299570288, + "grad_norm": 0.093282051384449, + "learning_rate": 7.106479848141279e-07, + "loss": 1.6268, + "step": 30883 + }, + { + "epoch": 9.479435236341313, + "grad_norm": 0.14540770649909973, + "learning_rate": 7.098131756263449e-07, + "loss": 1.7037, + "step": 30884 + }, + { + "epoch": 9.479742173112339, + "grad_norm": 0.12486393749713898, + "learning_rate": 7.089788535530828e-07, + "loss": 1.6861, + "step": 30885 + }, + { + "epoch": 9.480049109883364, + "grad_norm": 0.1135348379611969, + "learning_rate": 7.08145018602574e-07, + "loss": 1.6523, + "step": 30886 + }, + { + "epoch": 9.48035604665439, + "grad_norm": 0.18895356357097626, + "learning_rate": 7.073116707830729e-07, + "loss": 1.6879, + "step": 30887 + }, + { + "epoch": 9.480662983425415, + "grad_norm": 0.14413176476955414, + "learning_rate": 7.064788101028063e-07, + "loss": 1.6939, + "step": 30888 + }, + { + "epoch": 9.48096992019644, + "grad_norm": 0.16126643121242523, + "learning_rate": 7.056464365700122e-07, + "loss": 1.7301, + "step": 30889 + }, + { + "epoch": 9.481276856967465, + "grad_norm": 0.1249922662973404, + "learning_rate": 7.048145501929115e-07, + "loss": 1.6933, + "step": 30890 + }, + { + "epoch": 9.48158379373849, + "grad_norm": 0.1359063982963562, + "learning_rate": 7.039831509797202e-07, + "loss": 1.6888, + "step": 30891 + }, + { + "epoch": 9.481890730509516, + "grad_norm": 0.19966992735862732, + "learning_rate": 7.031522389386702e-07, + "loss": 1.7428, + "step": 30892 + }, + { + "epoch": 9.48219766728054, + "grad_norm": 0.10133275389671326, + "learning_rate": 7.023218140779553e-07, + "loss": 1.6776, + 
"step": 30893 + }, + { + "epoch": 9.482504604051565, + "grad_norm": 0.12074444442987442, + "learning_rate": 7.014918764057965e-07, + "loss": 1.732, + "step": 30894 + }, + { + "epoch": 9.48281154082259, + "grad_norm": 0.12305136024951935, + "learning_rate": 7.006624259303873e-07, + "loss": 1.6904, + "step": 30895 + }, + { + "epoch": 9.483118477593615, + "grad_norm": 0.12707793712615967, + "learning_rate": 6.998334626599268e-07, + "loss": 1.6395, + "step": 30896 + }, + { + "epoch": 9.48342541436464, + "grad_norm": 0.22196513414382935, + "learning_rate": 6.990049866026082e-07, + "loss": 1.7651, + "step": 30897 + }, + { + "epoch": 9.483732351135666, + "grad_norm": 0.1324261873960495, + "learning_rate": 6.981769977666197e-07, + "loss": 1.7045, + "step": 30898 + }, + { + "epoch": 9.484039287906691, + "grad_norm": 0.14185984432697296, + "learning_rate": 6.973494961601435e-07, + "loss": 1.713, + "step": 30899 + }, + { + "epoch": 9.484346224677717, + "grad_norm": 0.18500623106956482, + "learning_rate": 6.965224817913507e-07, + "loss": 1.7883, + "step": 30900 + }, + { + "epoch": 9.484653161448742, + "grad_norm": 0.21934804320335388, + "learning_rate": 6.956959546684294e-07, + "loss": 1.7406, + "step": 30901 + }, + { + "epoch": 9.484960098219767, + "grad_norm": 0.10997944325208664, + "learning_rate": 6.948699147995341e-07, + "loss": 1.6881, + "step": 30902 + }, + { + "epoch": 9.485267034990791, + "grad_norm": 0.14439432322978973, + "learning_rate": 6.94044362192825e-07, + "loss": 1.7053, + "step": 30903 + }, + { + "epoch": 9.485573971761816, + "grad_norm": 0.20071901381015778, + "learning_rate": 6.932192968564843e-07, + "loss": 1.7561, + "step": 30904 + }, + { + "epoch": 9.485880908532842, + "grad_norm": 0.1546691358089447, + "learning_rate": 6.92394718798639e-07, + "loss": 1.7001, + "step": 30905 + }, + { + "epoch": 9.486187845303867, + "grad_norm": 0.18300898373126984, + "learning_rate": 6.915706280274547e-07, + "loss": 1.7027, + "step": 30906 + }, + { + "epoch": 
9.486494782074892, + "grad_norm": 0.17844128608703613, + "learning_rate": 6.907470245510639e-07, + "loss": 1.6942, + "step": 30907 + }, + { + "epoch": 9.486801718845918, + "grad_norm": 0.12263448536396027, + "learning_rate": 6.899239083776154e-07, + "loss": 1.6811, + "step": 30908 + }, + { + "epoch": 9.487108655616943, + "grad_norm": 0.10036440938711166, + "learning_rate": 6.891012795152419e-07, + "loss": 1.7091, + "step": 30909 + }, + { + "epoch": 9.487415592387968, + "grad_norm": 0.15316228568553925, + "learning_rate": 6.882791379720699e-07, + "loss": 1.7291, + "step": 30910 + }, + { + "epoch": 9.487722529158994, + "grad_norm": 0.10985010862350464, + "learning_rate": 6.874574837562265e-07, + "loss": 1.6553, + "step": 30911 + }, + { + "epoch": 9.488029465930019, + "grad_norm": 0.1258542537689209, + "learning_rate": 6.866363168758327e-07, + "loss": 1.6887, + "step": 30912 + }, + { + "epoch": 9.488336402701044, + "grad_norm": 0.1341710239648819, + "learning_rate": 6.858156373390045e-07, + "loss": 1.68, + "step": 30913 + }, + { + "epoch": 9.488643339472068, + "grad_norm": 0.12450239062309265, + "learning_rate": 6.849954451538465e-07, + "loss": 1.6523, + "step": 30914 + }, + { + "epoch": 9.488950276243093, + "grad_norm": 0.1216820552945137, + "learning_rate": 6.841757403284687e-07, + "loss": 1.7078, + "step": 30915 + }, + { + "epoch": 9.489257213014119, + "grad_norm": 0.1473001092672348, + "learning_rate": 6.833565228709705e-07, + "loss": 1.6773, + "step": 30916 + }, + { + "epoch": 9.489564149785144, + "grad_norm": 0.14543893933296204, + "learning_rate": 6.825377927894505e-07, + "loss": 1.7538, + "step": 30917 + }, + { + "epoch": 9.48987108655617, + "grad_norm": 0.10436581820249557, + "learning_rate": 6.817195500919915e-07, + "loss": 1.6591, + "step": 30918 + }, + { + "epoch": 9.490178023327195, + "grad_norm": 0.1426854431629181, + "learning_rate": 6.809017947866925e-07, + "loss": 1.7502, + "step": 30919 + }, + { + "epoch": 9.49048496009822, + "grad_norm": 
0.1720554381608963, + "learning_rate": 6.800845268816248e-07, + "loss": 1.7449, + "step": 30920 + }, + { + "epoch": 9.490791896869245, + "grad_norm": 0.16149570047855377, + "learning_rate": 6.792677463848762e-07, + "loss": 1.732, + "step": 30921 + }, + { + "epoch": 9.49109883364027, + "grad_norm": 0.1278751790523529, + "learning_rate": 6.784514533045017e-07, + "loss": 1.6638, + "step": 30922 + }, + { + "epoch": 9.491405770411296, + "grad_norm": 0.16824519634246826, + "learning_rate": 6.77635647648589e-07, + "loss": 1.7073, + "step": 30923 + }, + { + "epoch": 9.491712707182321, + "grad_norm": 0.1375180333852768, + "learning_rate": 6.768203294251818e-07, + "loss": 1.6884, + "step": 30924 + }, + { + "epoch": 9.492019643953345, + "grad_norm": 0.1789846420288086, + "learning_rate": 6.760054986423459e-07, + "loss": 1.7331, + "step": 30925 + }, + { + "epoch": 9.49232658072437, + "grad_norm": 0.17068323493003845, + "learning_rate": 6.751911553081358e-07, + "loss": 1.6998, + "step": 30926 + }, + { + "epoch": 9.492633517495396, + "grad_norm": 0.1423347294330597, + "learning_rate": 6.743772994305952e-07, + "loss": 1.7104, + "step": 30927 + }, + { + "epoch": 9.49294045426642, + "grad_norm": 0.16446225345134735, + "learning_rate": 6.735639310177733e-07, + "loss": 1.7329, + "step": 30928 + }, + { + "epoch": 9.493247391037446, + "grad_norm": 0.12990720570087433, + "learning_rate": 6.727510500776968e-07, + "loss": 1.6933, + "step": 30929 + }, + { + "epoch": 9.493554327808472, + "grad_norm": 0.09939338266849518, + "learning_rate": 6.719386566184093e-07, + "loss": 1.6667, + "step": 30930 + }, + { + "epoch": 9.493861264579497, + "grad_norm": 0.14358317852020264, + "learning_rate": 6.711267506479379e-07, + "loss": 1.7067, + "step": 30931 + }, + { + "epoch": 9.494168201350522, + "grad_norm": 0.15358752012252808, + "learning_rate": 6.703153321743039e-07, + "loss": 1.7861, + "step": 30932 + }, + { + "epoch": 9.494475138121548, + "grad_norm": 0.14822594821453094, + "learning_rate": 
6.695044012055229e-07, + "loss": 1.6869, + "step": 30933 + }, + { + "epoch": 9.494782074892573, + "grad_norm": 0.18011552095413208, + "learning_rate": 6.686939577496165e-07, + "loss": 1.7522, + "step": 30934 + }, + { + "epoch": 9.495089011663598, + "grad_norm": 0.1966308206319809, + "learning_rate": 6.678840018145893e-07, + "loss": 1.7308, + "step": 30935 + }, + { + "epoch": 9.495395948434622, + "grad_norm": 0.19889011979103088, + "learning_rate": 6.670745334084517e-07, + "loss": 1.7796, + "step": 30936 + }, + { + "epoch": 9.495702885205647, + "grad_norm": 0.10640931874513626, + "learning_rate": 6.662655525391859e-07, + "loss": 1.6536, + "step": 30937 + }, + { + "epoch": 9.496009821976672, + "grad_norm": 0.1334729939699173, + "learning_rate": 6.654570592148135e-07, + "loss": 1.7313, + "step": 30938 + }, + { + "epoch": 9.496316758747698, + "grad_norm": 0.1538962870836258, + "learning_rate": 6.646490534433003e-07, + "loss": 1.7212, + "step": 30939 + }, + { + "epoch": 9.496623695518723, + "grad_norm": 0.13990063965320587, + "learning_rate": 6.63841535232651e-07, + "loss": 1.7655, + "step": 30940 + }, + { + "epoch": 9.496930632289748, + "grad_norm": 0.14489619433879852, + "learning_rate": 6.63034504590826e-07, + "loss": 1.7151, + "step": 30941 + }, + { + "epoch": 9.497237569060774, + "grad_norm": 0.14994287490844727, + "learning_rate": 6.622279615258187e-07, + "loss": 1.7778, + "step": 30942 + }, + { + "epoch": 9.497544505831799, + "grad_norm": 0.15099942684173584, + "learning_rate": 6.614219060455895e-07, + "loss": 1.6923, + "step": 30943 + }, + { + "epoch": 9.497851442602824, + "grad_norm": 0.16680224239826202, + "learning_rate": 6.606163381581099e-07, + "loss": 1.6958, + "step": 30944 + }, + { + "epoch": 9.49815837937385, + "grad_norm": 0.1341257095336914, + "learning_rate": 6.598112578713344e-07, + "loss": 1.7436, + "step": 30945 + }, + { + "epoch": 9.498465316144873, + "grad_norm": 0.14608977735042572, + "learning_rate": 6.590066651932237e-07, + "loss": 1.7464, + 
"step": 30946 + }, + { + "epoch": 9.498772252915899, + "grad_norm": 0.22711209952831268, + "learning_rate": 6.582025601317321e-07, + "loss": 1.6592, + "step": 30947 + }, + { + "epoch": 9.499079189686924, + "grad_norm": 0.11007440835237503, + "learning_rate": 6.573989426948035e-07, + "loss": 1.6821, + "step": 30948 + }, + { + "epoch": 9.49938612645795, + "grad_norm": 0.135493203997612, + "learning_rate": 6.56595812890376e-07, + "loss": 1.6916, + "step": 30949 + }, + { + "epoch": 9.499693063228975, + "grad_norm": 0.11300768703222275, + "learning_rate": 6.557931707263875e-07, + "loss": 1.6867, + "step": 30950 + }, + { + "epoch": 9.5, + "grad_norm": 0.14597927033901215, + "learning_rate": 6.549910162107764e-07, + "loss": 1.682, + "step": 30951 + }, + { + "epoch": 9.500306936771025, + "grad_norm": 0.17950420081615448, + "learning_rate": 6.54189349351464e-07, + "loss": 1.7943, + "step": 30952 + }, + { + "epoch": 9.50061387354205, + "grad_norm": 0.1679387390613556, + "learning_rate": 6.533881701563771e-07, + "loss": 1.7104, + "step": 30953 + }, + { + "epoch": 9.500920810313076, + "grad_norm": 0.12778639793395996, + "learning_rate": 6.525874786334263e-07, + "loss": 1.6447, + "step": 30954 + }, + { + "epoch": 9.501227747084101, + "grad_norm": 0.17508088052272797, + "learning_rate": 6.517872747905384e-07, + "loss": 1.7618, + "step": 30955 + }, + { + "epoch": 9.501534683855127, + "grad_norm": 0.1603916436433792, + "learning_rate": 6.509875586356073e-07, + "loss": 1.7083, + "step": 30956 + }, + { + "epoch": 9.50184162062615, + "grad_norm": 0.15757711231708527, + "learning_rate": 6.501883301765432e-07, + "loss": 1.7301, + "step": 30957 + }, + { + "epoch": 9.502148557397176, + "grad_norm": 0.12382685393095016, + "learning_rate": 6.493895894212399e-07, + "loss": 1.6719, + "step": 30958 + }, + { + "epoch": 9.502455494168201, + "grad_norm": 0.16945087909698486, + "learning_rate": 6.485913363775964e-07, + "loss": 1.6606, + "step": 30959 + }, + { + "epoch": 9.502762430939226, + 
"grad_norm": 0.157539501786232, + "learning_rate": 6.477935710534955e-07, + "loss": 1.7092, + "step": 30960 + }, + { + "epoch": 9.503069367710252, + "grad_norm": 0.11866376549005508, + "learning_rate": 6.469962934568308e-07, + "loss": 1.6525, + "step": 30961 + }, + { + "epoch": 9.503376304481277, + "grad_norm": 0.15672917664051056, + "learning_rate": 6.461995035954737e-07, + "loss": 1.7218, + "step": 30962 + }, + { + "epoch": 9.503683241252302, + "grad_norm": 0.10983888059854507, + "learning_rate": 6.454032014772959e-07, + "loss": 1.6658, + "step": 30963 + }, + { + "epoch": 9.503990178023328, + "grad_norm": 0.14017660915851593, + "learning_rate": 6.446073871101744e-07, + "loss": 1.7096, + "step": 30964 + }, + { + "epoch": 9.504297114794353, + "grad_norm": 0.14705055952072144, + "learning_rate": 6.438120605019693e-07, + "loss": 1.7113, + "step": 30965 + }, + { + "epoch": 9.504604051565378, + "grad_norm": 0.13271331787109375, + "learning_rate": 6.430172216605468e-07, + "loss": 1.6483, + "step": 30966 + }, + { + "epoch": 9.504910988336402, + "grad_norm": 0.13414405286312103, + "learning_rate": 6.422228705937505e-07, + "loss": 1.7011, + "step": 30967 + }, + { + "epoch": 9.505217925107427, + "grad_norm": 0.12676768004894257, + "learning_rate": 6.414290073094409e-07, + "loss": 1.6963, + "step": 30968 + }, + { + "epoch": 9.505524861878452, + "grad_norm": 0.1459144800901413, + "learning_rate": 6.406356318154616e-07, + "loss": 1.7426, + "step": 30969 + }, + { + "epoch": 9.505831798649478, + "grad_norm": 0.13834135234355927, + "learning_rate": 6.398427441196509e-07, + "loss": 1.7045, + "step": 30970 + }, + { + "epoch": 9.506138735420503, + "grad_norm": 0.1961667686700821, + "learning_rate": 6.390503442298413e-07, + "loss": 1.7121, + "step": 30971 + }, + { + "epoch": 9.506445672191528, + "grad_norm": 0.10918349772691727, + "learning_rate": 6.382584321538709e-07, + "loss": 1.6482, + "step": 30972 + }, + { + "epoch": 9.506752608962554, + "grad_norm": 0.16137553751468658, + 
"learning_rate": 6.37467007899556e-07, + "loss": 1.7211, + "step": 30973 + }, + { + "epoch": 9.50705954573358, + "grad_norm": 0.14611978828907013, + "learning_rate": 6.366760714747344e-07, + "loss": 1.7762, + "step": 30974 + }, + { + "epoch": 9.507366482504604, + "grad_norm": 0.1840377300977707, + "learning_rate": 6.358856228872057e-07, + "loss": 1.7359, + "step": 30975 + }, + { + "epoch": 9.50767341927563, + "grad_norm": 0.15308772027492523, + "learning_rate": 6.350956621447968e-07, + "loss": 1.6772, + "step": 30976 + }, + { + "epoch": 9.507980356046655, + "grad_norm": 0.09826724231243134, + "learning_rate": 6.34306189255296e-07, + "loss": 1.6529, + "step": 30977 + }, + { + "epoch": 9.50828729281768, + "grad_norm": 0.135554239153862, + "learning_rate": 6.335172042265192e-07, + "loss": 1.6707, + "step": 30978 + }, + { + "epoch": 9.508594229588704, + "grad_norm": 0.13289806246757507, + "learning_rate": 6.327287070662658e-07, + "loss": 1.7411, + "step": 30979 + }, + { + "epoch": 9.50890116635973, + "grad_norm": 0.11493640393018723, + "learning_rate": 6.319406977823128e-07, + "loss": 1.6771, + "step": 30980 + }, + { + "epoch": 9.509208103130755, + "grad_norm": 0.17868508398532867, + "learning_rate": 6.311531763824596e-07, + "loss": 1.7614, + "step": 30981 + }, + { + "epoch": 9.50951503990178, + "grad_norm": 0.1414751559495926, + "learning_rate": 6.303661428744889e-07, + "loss": 1.7023, + "step": 30982 + }, + { + "epoch": 9.509821976672805, + "grad_norm": 0.11903268843889236, + "learning_rate": 6.295795972661777e-07, + "loss": 1.7241, + "step": 30983 + }, + { + "epoch": 9.51012891344383, + "grad_norm": 0.12880147993564606, + "learning_rate": 6.287935395652977e-07, + "loss": 1.6842, + "step": 30984 + }, + { + "epoch": 9.510435850214856, + "grad_norm": 0.11090810596942902, + "learning_rate": 6.280079697796148e-07, + "loss": 1.6561, + "step": 30985 + }, + { + "epoch": 9.510742786985881, + "grad_norm": 0.12199088931083679, + "learning_rate": 6.272228879168951e-07, + 
"loss": 1.6541, + "step": 30986 + }, + { + "epoch": 9.511049723756907, + "grad_norm": 0.19049455225467682, + "learning_rate": 6.264382939848989e-07, + "loss": 1.7782, + "step": 30987 + }, + { + "epoch": 9.511356660527932, + "grad_norm": 0.14614251255989075, + "learning_rate": 6.256541879913813e-07, + "loss": 1.733, + "step": 30988 + }, + { + "epoch": 9.511663597298956, + "grad_norm": 0.13675597310066223, + "learning_rate": 6.24870569944086e-07, + "loss": 1.6957, + "step": 30989 + }, + { + "epoch": 9.511970534069981, + "grad_norm": 0.11168385297060013, + "learning_rate": 6.240874398507513e-07, + "loss": 1.6745, + "step": 30990 + }, + { + "epoch": 9.512277470841006, + "grad_norm": 0.13322143256664276, + "learning_rate": 6.233047977191375e-07, + "loss": 1.6831, + "step": 30991 + }, + { + "epoch": 9.512584407612032, + "grad_norm": 0.16648098826408386, + "learning_rate": 6.225226435569553e-07, + "loss": 1.7878, + "step": 30992 + }, + { + "epoch": 9.512891344383057, + "grad_norm": 0.16310833394527435, + "learning_rate": 6.21740977371954e-07, + "loss": 1.6797, + "step": 30993 + }, + { + "epoch": 9.513198281154082, + "grad_norm": 0.13475677371025085, + "learning_rate": 6.209597991718441e-07, + "loss": 1.7132, + "step": 30994 + }, + { + "epoch": 9.513505217925108, + "grad_norm": 0.1621815413236618, + "learning_rate": 6.201791089643528e-07, + "loss": 1.7452, + "step": 30995 + }, + { + "epoch": 9.513812154696133, + "grad_norm": 0.11439715325832367, + "learning_rate": 6.193989067571959e-07, + "loss": 1.707, + "step": 30996 + }, + { + "epoch": 9.514119091467158, + "grad_norm": 0.182517409324646, + "learning_rate": 6.186191925580786e-07, + "loss": 1.6871, + "step": 30997 + }, + { + "epoch": 9.514426028238184, + "grad_norm": 0.1009940356016159, + "learning_rate": 6.17839966374717e-07, + "loss": 1.6409, + "step": 30998 + }, + { + "epoch": 9.514732965009209, + "grad_norm": 0.22212521731853485, + "learning_rate": 6.170612282147936e-07, + "loss": 1.7206, + "step": 30999 + }, + { + 
"epoch": 9.515039901780233, + "grad_norm": 0.1333693414926529, + "learning_rate": 6.162829780860247e-07, + "loss": 1.6929, + "step": 31000 + }, + { + "epoch": 9.515346838551258, + "grad_norm": 0.12046591937541962, + "learning_rate": 6.155052159960873e-07, + "loss": 1.6484, + "step": 31001 + }, + { + "epoch": 9.515653775322283, + "grad_norm": 0.13430583477020264, + "learning_rate": 6.147279419526753e-07, + "loss": 1.6677, + "step": 31002 + }, + { + "epoch": 9.515960712093309, + "grad_norm": 0.12045972794294357, + "learning_rate": 6.139511559634659e-07, + "loss": 1.7014, + "step": 31003 + }, + { + "epoch": 9.516267648864334, + "grad_norm": 0.1649526059627533, + "learning_rate": 6.131748580361363e-07, + "loss": 1.7326, + "step": 31004 + }, + { + "epoch": 9.51657458563536, + "grad_norm": 0.1313924789428711, + "learning_rate": 6.123990481783636e-07, + "loss": 1.7441, + "step": 31005 + }, + { + "epoch": 9.516881522406385, + "grad_norm": 0.145765021443367, + "learning_rate": 6.116237263978031e-07, + "loss": 1.7383, + "step": 31006 + }, + { + "epoch": 9.51718845917741, + "grad_norm": 0.14247392117977142, + "learning_rate": 6.108488927021261e-07, + "loss": 1.6956, + "step": 31007 + }, + { + "epoch": 9.517495395948435, + "grad_norm": 0.12804681062698364, + "learning_rate": 6.100745470989933e-07, + "loss": 1.6864, + "step": 31008 + }, + { + "epoch": 9.51780233271946, + "grad_norm": 0.15574663877487183, + "learning_rate": 6.093006895960485e-07, + "loss": 1.6709, + "step": 31009 + }, + { + "epoch": 9.518109269490484, + "grad_norm": 0.14249230921268463, + "learning_rate": 6.085273202009467e-07, + "loss": 1.7125, + "step": 31010 + }, + { + "epoch": 9.51841620626151, + "grad_norm": 0.13120415806770325, + "learning_rate": 6.077544389213207e-07, + "loss": 1.6875, + "step": 31011 + }, + { + "epoch": 9.518723143032535, + "grad_norm": 0.11910203844308853, + "learning_rate": 6.069820457648201e-07, + "loss": 1.7113, + "step": 31012 + }, + { + "epoch": 9.51903007980356, + "grad_norm": 
0.13545389473438263, + "learning_rate": 6.062101407390775e-07, + "loss": 1.7356, + "step": 31013 + }, + { + "epoch": 9.519337016574585, + "grad_norm": 0.1885189414024353, + "learning_rate": 6.05438723851709e-07, + "loss": 1.7558, + "step": 31014 + }, + { + "epoch": 9.51964395334561, + "grad_norm": 0.1113700196146965, + "learning_rate": 6.04667795110353e-07, + "loss": 1.6546, + "step": 31015 + }, + { + "epoch": 9.519950890116636, + "grad_norm": 0.18005676567554474, + "learning_rate": 6.038973545226089e-07, + "loss": 1.7657, + "step": 31016 + }, + { + "epoch": 9.520257826887661, + "grad_norm": 0.12435733526945114, + "learning_rate": 6.031274020961152e-07, + "loss": 1.7219, + "step": 31017 + }, + { + "epoch": 9.520564763658687, + "grad_norm": 0.20083987712860107, + "learning_rate": 6.023579378384659e-07, + "loss": 1.7779, + "step": 31018 + }, + { + "epoch": 9.520871700429712, + "grad_norm": 0.15939640998840332, + "learning_rate": 6.015889617572656e-07, + "loss": 1.6895, + "step": 31019 + }, + { + "epoch": 9.521178637200737, + "grad_norm": 0.20790094137191772, + "learning_rate": 6.008204738601198e-07, + "loss": 1.7553, + "step": 31020 + }, + { + "epoch": 9.521485573971761, + "grad_norm": 0.10034120082855225, + "learning_rate": 6.000524741546165e-07, + "loss": 1.6232, + "step": 31021 + }, + { + "epoch": 9.521792510742786, + "grad_norm": 0.11239612102508545, + "learning_rate": 5.992849626483498e-07, + "loss": 1.6838, + "step": 31022 + }, + { + "epoch": 9.522099447513812, + "grad_norm": 0.13167715072631836, + "learning_rate": 5.985179393489083e-07, + "loss": 1.6638, + "step": 31023 + }, + { + "epoch": 9.522406384284837, + "grad_norm": 0.1241912767291069, + "learning_rate": 5.977514042638577e-07, + "loss": 1.6935, + "step": 31024 + }, + { + "epoch": 9.522713321055862, + "grad_norm": 0.125594824552536, + "learning_rate": 5.969853574007922e-07, + "loss": 1.6935, + "step": 31025 + }, + { + "epoch": 9.523020257826888, + "grad_norm": 0.1614350974559784, + "learning_rate": 
5.962197987672668e-07, + "loss": 1.7097, + "step": 31026 + }, + { + "epoch": 9.523327194597913, + "grad_norm": 0.15176361799240112, + "learning_rate": 5.954547283708644e-07, + "loss": 1.6911, + "step": 31027 + }, + { + "epoch": 9.523634131368938, + "grad_norm": 0.09742459654808044, + "learning_rate": 5.946901462191234e-07, + "loss": 1.6243, + "step": 31028 + }, + { + "epoch": 9.523941068139964, + "grad_norm": 0.15997633337974548, + "learning_rate": 5.939260523196155e-07, + "loss": 1.7174, + "step": 31029 + }, + { + "epoch": 9.524248004910989, + "grad_norm": 0.21839283406734467, + "learning_rate": 5.931624466798957e-07, + "loss": 1.8457, + "step": 31030 + }, + { + "epoch": 9.524554941682014, + "grad_norm": 0.16808728873729706, + "learning_rate": 5.923993293074914e-07, + "loss": 1.7274, + "step": 31031 + }, + { + "epoch": 9.524861878453038, + "grad_norm": 0.11654167622327805, + "learning_rate": 5.916367002099688e-07, + "loss": 1.6902, + "step": 31032 + }, + { + "epoch": 9.525168815224063, + "grad_norm": 0.12978383898735046, + "learning_rate": 5.908745593948383e-07, + "loss": 1.7095, + "step": 31033 + }, + { + "epoch": 9.525475751995089, + "grad_norm": 0.13306757807731628, + "learning_rate": 5.901129068696498e-07, + "loss": 1.7218, + "step": 31034 + }, + { + "epoch": 9.525782688766114, + "grad_norm": 0.20930147171020508, + "learning_rate": 5.893517426419304e-07, + "loss": 1.8132, + "step": 31035 + }, + { + "epoch": 9.52608962553714, + "grad_norm": 0.14664147794246674, + "learning_rate": 5.885910667191907e-07, + "loss": 1.6764, + "step": 31036 + }, + { + "epoch": 9.526396562308165, + "grad_norm": 0.20831573009490967, + "learning_rate": 5.878308791089582e-07, + "loss": 1.7434, + "step": 31037 + }, + { + "epoch": 9.52670349907919, + "grad_norm": 0.13942310214042664, + "learning_rate": 5.870711798187433e-07, + "loss": 1.7272, + "step": 31038 + }, + { + "epoch": 9.527010435850215, + "grad_norm": 0.15469035506248474, + "learning_rate": 5.863119688560514e-07, + "loss": 
1.6838, + "step": 31039 + }, + { + "epoch": 9.52731737262124, + "grad_norm": 0.13903473317623138, + "learning_rate": 5.855532462283875e-07, + "loss": 1.7166, + "step": 31040 + }, + { + "epoch": 9.527624309392266, + "grad_norm": 0.12209124863147736, + "learning_rate": 5.847950119432455e-07, + "loss": 1.6259, + "step": 31041 + }, + { + "epoch": 9.527931246163291, + "grad_norm": 0.09797443449497223, + "learning_rate": 5.840372660081251e-07, + "loss": 1.676, + "step": 31042 + }, + { + "epoch": 9.528238182934315, + "grad_norm": 0.14228491485118866, + "learning_rate": 5.83280008430509e-07, + "loss": 1.7104, + "step": 31043 + }, + { + "epoch": 9.52854511970534, + "grad_norm": 0.1535727083683014, + "learning_rate": 5.825232392178914e-07, + "loss": 1.7169, + "step": 31044 + }, + { + "epoch": 9.528852056476365, + "grad_norm": 0.14102879166603088, + "learning_rate": 5.817669583777386e-07, + "loss": 1.7182, + "step": 31045 + }, + { + "epoch": 9.52915899324739, + "grad_norm": 0.17063194513320923, + "learning_rate": 5.810111659175333e-07, + "loss": 1.7164, + "step": 31046 + }, + { + "epoch": 9.529465930018416, + "grad_norm": 0.15687642991542816, + "learning_rate": 5.802558618447418e-07, + "loss": 1.7198, + "step": 31047 + }, + { + "epoch": 9.529772866789441, + "grad_norm": 0.18693117797374725, + "learning_rate": 5.795010461668193e-07, + "loss": 1.7213, + "step": 31048 + }, + { + "epoch": 9.530079803560467, + "grad_norm": 0.14518466591835022, + "learning_rate": 5.787467188912432e-07, + "loss": 1.7147, + "step": 31049 + }, + { + "epoch": 9.530386740331492, + "grad_norm": 0.14564110338687897, + "learning_rate": 5.77992880025452e-07, + "loss": 1.7007, + "step": 31050 + }, + { + "epoch": 9.530693677102517, + "grad_norm": 0.14775414764881134, + "learning_rate": 5.772395295769007e-07, + "loss": 1.6947, + "step": 31051 + }, + { + "epoch": 9.531000613873543, + "grad_norm": 0.18668405711650848, + "learning_rate": 5.76486667553039e-07, + "loss": 1.7567, + "step": 31052 + }, + { + "epoch": 
9.531307550644566, + "grad_norm": 0.12053389847278595, + "learning_rate": 5.757342939613053e-07, + "loss": 1.6763, + "step": 31053 + }, + { + "epoch": 9.531614487415592, + "grad_norm": 0.13077262043952942, + "learning_rate": 5.749824088091382e-07, + "loss": 1.6783, + "step": 31054 + }, + { + "epoch": 9.531921424186617, + "grad_norm": 0.1928776055574417, + "learning_rate": 5.742310121039596e-07, + "loss": 1.7303, + "step": 31055 + }, + { + "epoch": 9.532228360957642, + "grad_norm": 0.13202275335788727, + "learning_rate": 5.734801038531967e-07, + "loss": 1.7008, + "step": 31056 + }, + { + "epoch": 9.532535297728668, + "grad_norm": 0.1478370577096939, + "learning_rate": 5.72729684064277e-07, + "loss": 1.7352, + "step": 31057 + }, + { + "epoch": 9.532842234499693, + "grad_norm": 0.1766318529844284, + "learning_rate": 5.719797527446058e-07, + "loss": 1.7635, + "step": 31058 + }, + { + "epoch": 9.533149171270718, + "grad_norm": 0.13437522947788239, + "learning_rate": 5.712303099016103e-07, + "loss": 1.6607, + "step": 31059 + }, + { + "epoch": 9.533456108041744, + "grad_norm": 0.1521230787038803, + "learning_rate": 5.704813555426847e-07, + "loss": 1.6922, + "step": 31060 + }, + { + "epoch": 9.533763044812769, + "grad_norm": 0.14926433563232422, + "learning_rate": 5.697328896752341e-07, + "loss": 1.7459, + "step": 31061 + }, + { + "epoch": 9.534069981583794, + "grad_norm": 0.14931491017341614, + "learning_rate": 5.689849123066526e-07, + "loss": 1.7042, + "step": 31062 + }, + { + "epoch": 9.53437691835482, + "grad_norm": 0.1489458531141281, + "learning_rate": 5.682374234443344e-07, + "loss": 1.7353, + "step": 31063 + }, + { + "epoch": 9.534683855125843, + "grad_norm": 0.1196800023317337, + "learning_rate": 5.674904230956735e-07, + "loss": 1.6928, + "step": 31064 + }, + { + "epoch": 9.534990791896869, + "grad_norm": 0.11683658510446548, + "learning_rate": 5.667439112680417e-07, + "loss": 1.6829, + "step": 31065 + }, + { + "epoch": 9.535297728667894, + "grad_norm": 
0.14123310148715973, + "learning_rate": 5.659978879688221e-07, + "loss": 1.6971, + "step": 31066 + }, + { + "epoch": 9.53560466543892, + "grad_norm": 0.13458828628063202, + "learning_rate": 5.652523532053811e-07, + "loss": 1.7138, + "step": 31067 + }, + { + "epoch": 9.535911602209945, + "grad_norm": 0.1536986231803894, + "learning_rate": 5.645073069850903e-07, + "loss": 1.7484, + "step": 31068 + }, + { + "epoch": 9.53621853898097, + "grad_norm": 0.12006396800279617, + "learning_rate": 5.637627493153164e-07, + "loss": 1.669, + "step": 31069 + }, + { + "epoch": 9.536525475751995, + "grad_norm": 0.1644553393125534, + "learning_rate": 5.630186802034143e-07, + "loss": 1.6823, + "step": 31070 + }, + { + "epoch": 9.53683241252302, + "grad_norm": 0.12596864998340607, + "learning_rate": 5.622750996567395e-07, + "loss": 1.7111, + "step": 31071 + }, + { + "epoch": 9.537139349294046, + "grad_norm": 0.14411930739879608, + "learning_rate": 5.615320076826358e-07, + "loss": 1.7183, + "step": 31072 + }, + { + "epoch": 9.537446286065071, + "grad_norm": 0.14459045231342316, + "learning_rate": 5.607894042884531e-07, + "loss": 1.7366, + "step": 31073 + }, + { + "epoch": 9.537753222836095, + "grad_norm": 0.12643924355506897, + "learning_rate": 5.600472894815245e-07, + "loss": 1.6678, + "step": 31074 + }, + { + "epoch": 9.53806015960712, + "grad_norm": 0.13994373381137848, + "learning_rate": 5.593056632691829e-07, + "loss": 1.7825, + "step": 31075 + }, + { + "epoch": 9.538367096378146, + "grad_norm": 0.11746983230113983, + "learning_rate": 5.585645256587668e-07, + "loss": 1.688, + "step": 31076 + }, + { + "epoch": 9.53867403314917, + "grad_norm": 0.13083167374134064, + "learning_rate": 5.578238766575871e-07, + "loss": 1.7332, + "step": 31077 + }, + { + "epoch": 9.538980969920196, + "grad_norm": 0.12449757009744644, + "learning_rate": 5.57083716272977e-07, + "loss": 1.7419, + "step": 31078 + }, + { + "epoch": 9.539287906691222, + "grad_norm": 0.11567985266447067, + "learning_rate": 
5.563440445122415e-07, + "loss": 1.6608, + "step": 31079 + }, + { + "epoch": 9.539594843462247, + "grad_norm": 0.10740742087364197, + "learning_rate": 5.55604861382697e-07, + "loss": 1.6866, + "step": 31080 + }, + { + "epoch": 9.539901780233272, + "grad_norm": 0.1555785983800888, + "learning_rate": 5.548661668916489e-07, + "loss": 1.6391, + "step": 31081 + }, + { + "epoch": 9.540208717004298, + "grad_norm": 0.12961047887802124, + "learning_rate": 5.541279610463857e-07, + "loss": 1.6912, + "step": 31082 + }, + { + "epoch": 9.540515653775323, + "grad_norm": 0.17427892982959747, + "learning_rate": 5.533902438542183e-07, + "loss": 1.7527, + "step": 31083 + }, + { + "epoch": 9.540822590546348, + "grad_norm": 0.137424036860466, + "learning_rate": 5.526530153224241e-07, + "loss": 1.7119, + "step": 31084 + }, + { + "epoch": 9.541129527317374, + "grad_norm": 0.15986669063568115, + "learning_rate": 5.519162754582974e-07, + "loss": 1.7379, + "step": 31085 + }, + { + "epoch": 9.541436464088397, + "grad_norm": 0.34904229640960693, + "learning_rate": 5.511800242691157e-07, + "loss": 1.776, + "step": 31086 + }, + { + "epoch": 9.541743400859422, + "grad_norm": 0.10629575699567795, + "learning_rate": 5.504442617621563e-07, + "loss": 1.6572, + "step": 31087 + }, + { + "epoch": 9.542050337630448, + "grad_norm": 0.14238065481185913, + "learning_rate": 5.497089879446915e-07, + "loss": 1.6707, + "step": 31088 + }, + { + "epoch": 9.542357274401473, + "grad_norm": 0.14475369453430176, + "learning_rate": 5.48974202823982e-07, + "loss": 1.718, + "step": 31089 + }, + { + "epoch": 9.542664211172498, + "grad_norm": 0.17306506633758545, + "learning_rate": 5.482399064072996e-07, + "loss": 1.7558, + "step": 31090 + }, + { + "epoch": 9.542971147943524, + "grad_norm": 0.10227597504854202, + "learning_rate": 5.475060987018943e-07, + "loss": 1.6278, + "step": 31091 + }, + { + "epoch": 9.543278084714549, + "grad_norm": 0.15417295694351196, + "learning_rate": 5.467727797150102e-07, + "loss": 1.7579, + 
"step": 31092 + }, + { + "epoch": 9.543585021485574, + "grad_norm": 0.1255696415901184, + "learning_rate": 5.460399494539136e-07, + "loss": 1.7061, + "step": 31093 + }, + { + "epoch": 9.5438919582566, + "grad_norm": 0.14167217910289764, + "learning_rate": 5.453076079258268e-07, + "loss": 1.7164, + "step": 31094 + }, + { + "epoch": 9.544198895027625, + "grad_norm": 0.16300976276397705, + "learning_rate": 5.445757551380048e-07, + "loss": 1.7092, + "step": 31095 + }, + { + "epoch": 9.544505831798649, + "grad_norm": 0.12125522643327713, + "learning_rate": 5.438443910976699e-07, + "loss": 1.697, + "step": 31096 + }, + { + "epoch": 9.544812768569674, + "grad_norm": 0.15089687705039978, + "learning_rate": 5.431135158120493e-07, + "loss": 1.6578, + "step": 31097 + }, + { + "epoch": 9.5451197053407, + "grad_norm": 0.17200914025306702, + "learning_rate": 5.423831292883708e-07, + "loss": 1.689, + "step": 31098 + }, + { + "epoch": 9.545426642111725, + "grad_norm": 0.14511042833328247, + "learning_rate": 5.416532315338508e-07, + "loss": 1.7301, + "step": 31099 + }, + { + "epoch": 9.54573357888275, + "grad_norm": 0.12074702233076096, + "learning_rate": 5.409238225557001e-07, + "loss": 1.6845, + "step": 31100 + }, + { + "epoch": 9.546040515653775, + "grad_norm": 0.12752333283424377, + "learning_rate": 5.401949023611297e-07, + "loss": 1.7021, + "step": 31101 + }, + { + "epoch": 9.5463474524248, + "grad_norm": 0.1448252946138382, + "learning_rate": 5.394664709573394e-07, + "loss": 1.7037, + "step": 31102 + }, + { + "epoch": 9.546654389195826, + "grad_norm": 0.17957226932048798, + "learning_rate": 5.387385283515345e-07, + "loss": 1.7527, + "step": 31103 + }, + { + "epoch": 9.546961325966851, + "grad_norm": 0.13432875275611877, + "learning_rate": 5.380110745509093e-07, + "loss": 1.7244, + "step": 31104 + }, + { + "epoch": 9.547268262737877, + "grad_norm": 0.13721013069152832, + "learning_rate": 5.372841095626413e-07, + "loss": 1.6907, + "step": 31105 + }, + { + "epoch": 
9.547575199508902, + "grad_norm": 0.14336919784545898, + "learning_rate": 5.365576333939304e-07, + "loss": 1.7179, + "step": 31106 + }, + { + "epoch": 9.547882136279926, + "grad_norm": 0.13788890838623047, + "learning_rate": 5.358316460519431e-07, + "loss": 1.7157, + "step": 31107 + }, + { + "epoch": 9.54818907305095, + "grad_norm": 0.15330001711845398, + "learning_rate": 5.351061475438623e-07, + "loss": 1.7515, + "step": 31108 + }, + { + "epoch": 9.548496009821976, + "grad_norm": 0.11875810474157333, + "learning_rate": 5.343811378768492e-07, + "loss": 1.6855, + "step": 31109 + }, + { + "epoch": 9.548802946593002, + "grad_norm": 0.1445886343717575, + "learning_rate": 5.336566170580814e-07, + "loss": 1.7519, + "step": 31110 + }, + { + "epoch": 9.549109883364027, + "grad_norm": 0.2866973578929901, + "learning_rate": 5.329325850947087e-07, + "loss": 1.6697, + "step": 31111 + }, + { + "epoch": 9.549416820135052, + "grad_norm": 0.15357863903045654, + "learning_rate": 5.322090419938919e-07, + "loss": 1.7397, + "step": 31112 + }, + { + "epoch": 9.549723756906078, + "grad_norm": 0.12374851852655411, + "learning_rate": 5.314859877627754e-07, + "loss": 1.7267, + "step": 31113 + }, + { + "epoch": 9.550030693677103, + "grad_norm": 0.12979474663734436, + "learning_rate": 5.307634224085145e-07, + "loss": 1.7158, + "step": 31114 + }, + { + "epoch": 9.550337630448128, + "grad_norm": 0.10462703555822372, + "learning_rate": 5.300413459382425e-07, + "loss": 1.6312, + "step": 31115 + }, + { + "epoch": 9.550644567219154, + "grad_norm": 0.11557597666978836, + "learning_rate": 5.293197583590926e-07, + "loss": 1.6961, + "step": 31116 + }, + { + "epoch": 9.550951503990177, + "grad_norm": 0.13233163952827454, + "learning_rate": 5.285986596782089e-07, + "loss": 1.6665, + "step": 31117 + }, + { + "epoch": 9.551258440761202, + "grad_norm": 0.13464027643203735, + "learning_rate": 5.278780499027025e-07, + "loss": 1.709, + "step": 31118 + }, + { + "epoch": 9.551565377532228, + "grad_norm": 
0.15500648319721222, + "learning_rate": 5.27157929039701e-07, + "loss": 1.7837, + "step": 31119 + }, + { + "epoch": 9.551872314303253, + "grad_norm": 0.10849796235561371, + "learning_rate": 5.264382970963267e-07, + "loss": 1.6777, + "step": 31120 + }, + { + "epoch": 9.552179251074278, + "grad_norm": 0.12520049512386322, + "learning_rate": 5.25719154079679e-07, + "loss": 1.6912, + "step": 31121 + }, + { + "epoch": 9.552486187845304, + "grad_norm": 0.171976700425148, + "learning_rate": 5.250004999968806e-07, + "loss": 1.7431, + "step": 31122 + }, + { + "epoch": 9.55279312461633, + "grad_norm": 0.15759800374507904, + "learning_rate": 5.242823348550197e-07, + "loss": 1.7266, + "step": 31123 + }, + { + "epoch": 9.553100061387354, + "grad_norm": 0.14026059210300446, + "learning_rate": 5.235646586612075e-07, + "loss": 1.6999, + "step": 31124 + }, + { + "epoch": 9.55340699815838, + "grad_norm": 0.16142502427101135, + "learning_rate": 5.228474714225218e-07, + "loss": 1.7189, + "step": 31125 + }, + { + "epoch": 9.553713934929405, + "grad_norm": 0.19895243644714355, + "learning_rate": 5.221307731460567e-07, + "loss": 1.7703, + "step": 31126 + }, + { + "epoch": 9.55402087170043, + "grad_norm": 0.12162072211503983, + "learning_rate": 5.214145638388956e-07, + "loss": 1.6722, + "step": 31127 + }, + { + "epoch": 9.554327808471456, + "grad_norm": 0.15602703392505646, + "learning_rate": 5.206988435081162e-07, + "loss": 1.7385, + "step": 31128 + }, + { + "epoch": 9.55463474524248, + "grad_norm": 0.14179575443267822, + "learning_rate": 5.199836121607959e-07, + "loss": 1.7018, + "step": 31129 + }, + { + "epoch": 9.554941682013505, + "grad_norm": 0.1313495635986328, + "learning_rate": 5.192688698039904e-07, + "loss": 1.6959, + "step": 31130 + }, + { + "epoch": 9.55524861878453, + "grad_norm": 0.10791079699993134, + "learning_rate": 5.185546164447774e-07, + "loss": 1.6555, + "step": 31131 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 0.14998406171798706, + "learning_rate": 
5.178408520902123e-07, + "loss": 1.7, + "step": 31132 + }, + { + "epoch": 9.55586249232658, + "grad_norm": 0.1362425684928894, + "learning_rate": 5.171275767473394e-07, + "loss": 1.6853, + "step": 31133 + }, + { + "epoch": 9.556169429097606, + "grad_norm": 0.1443333774805069, + "learning_rate": 5.164147904232197e-07, + "loss": 1.7404, + "step": 31134 + }, + { + "epoch": 9.556476365868631, + "grad_norm": 0.14398255944252014, + "learning_rate": 5.157024931248866e-07, + "loss": 1.6841, + "step": 31135 + }, + { + "epoch": 9.556783302639657, + "grad_norm": 0.1562454253435135, + "learning_rate": 5.149906848593899e-07, + "loss": 1.7195, + "step": 31136 + }, + { + "epoch": 9.557090239410682, + "grad_norm": 0.10564878582954407, + "learning_rate": 5.142793656337575e-07, + "loss": 1.6851, + "step": 31137 + }, + { + "epoch": 9.557397176181707, + "grad_norm": 0.15394751727581024, + "learning_rate": 5.135685354550223e-07, + "loss": 1.7149, + "step": 31138 + }, + { + "epoch": 9.557704112952731, + "grad_norm": 0.17012141644954681, + "learning_rate": 5.128581943302069e-07, + "loss": 1.7559, + "step": 31139 + }, + { + "epoch": 9.558011049723756, + "grad_norm": 0.14832472801208496, + "learning_rate": 5.121483422663332e-07, + "loss": 1.7165, + "step": 31140 + }, + { + "epoch": 9.558317986494782, + "grad_norm": 0.16663455963134766, + "learning_rate": 5.114389792704177e-07, + "loss": 1.7719, + "step": 31141 + }, + { + "epoch": 9.558624923265807, + "grad_norm": 0.15087881684303284, + "learning_rate": 5.107301053494607e-07, + "loss": 1.673, + "step": 31142 + }, + { + "epoch": 9.558931860036832, + "grad_norm": 0.1716073453426361, + "learning_rate": 5.10021720510484e-07, + "loss": 1.7244, + "step": 31143 + }, + { + "epoch": 9.559238796807858, + "grad_norm": 0.1661565750837326, + "learning_rate": 5.093138247604768e-07, + "loss": 1.6895, + "step": 31144 + }, + { + "epoch": 9.559545733578883, + "grad_norm": 0.14260123670101166, + "learning_rate": 5.086064181064332e-07, + "loss": 1.7166, + 
"step": 31145 + }, + { + "epoch": 9.559852670349908, + "grad_norm": 0.12638737261295319, + "learning_rate": 5.078995005553533e-07, + "loss": 1.6924, + "step": 31146 + }, + { + "epoch": 9.560159607120934, + "grad_norm": 0.1578296571969986, + "learning_rate": 5.071930721142148e-07, + "loss": 1.7215, + "step": 31147 + }, + { + "epoch": 9.560466543891959, + "grad_norm": 0.12237422913312912, + "learning_rate": 5.064871327900067e-07, + "loss": 1.6672, + "step": 31148 + }, + { + "epoch": 9.560773480662984, + "grad_norm": 0.11540009081363678, + "learning_rate": 5.057816825897011e-07, + "loss": 1.6942, + "step": 31149 + }, + { + "epoch": 9.561080417434008, + "grad_norm": 0.11710464954376221, + "learning_rate": 5.050767215202701e-07, + "loss": 1.6721, + "step": 31150 + }, + { + "epoch": 9.561387354205033, + "grad_norm": 0.1241387203335762, + "learning_rate": 5.04372249588686e-07, + "loss": 1.6574, + "step": 31151 + }, + { + "epoch": 9.561694290976058, + "grad_norm": 0.15445421636104584, + "learning_rate": 5.036682668018933e-07, + "loss": 1.6976, + "step": 31152 + }, + { + "epoch": 9.562001227747084, + "grad_norm": 0.15151409804821014, + "learning_rate": 5.029647731668752e-07, + "loss": 1.7067, + "step": 31153 + }, + { + "epoch": 9.56230816451811, + "grad_norm": 0.18623974919319153, + "learning_rate": 5.022617686905596e-07, + "loss": 1.7709, + "step": 31154 + }, + { + "epoch": 9.562615101289135, + "grad_norm": 0.14912709593772888, + "learning_rate": 5.015592533799074e-07, + "loss": 1.6918, + "step": 31155 + }, + { + "epoch": 9.56292203806016, + "grad_norm": 0.13887201249599457, + "learning_rate": 5.008572272418633e-07, + "loss": 1.6851, + "step": 31156 + }, + { + "epoch": 9.563228974831185, + "grad_norm": 0.1401492953300476, + "learning_rate": 5.001556902833548e-07, + "loss": 1.6886, + "step": 31157 + }, + { + "epoch": 9.56353591160221, + "grad_norm": 0.13679155707359314, + "learning_rate": 4.994546425113266e-07, + "loss": 1.7129, + "step": 31158 + }, + { + "epoch": 
9.563842848373236, + "grad_norm": 0.12003178894519806, + "learning_rate": 4.987540839326954e-07, + "loss": 1.7186, + "step": 31159 + }, + { + "epoch": 9.56414978514426, + "grad_norm": 0.12413342297077179, + "learning_rate": 4.980540145543944e-07, + "loss": 1.6818, + "step": 31160 + }, + { + "epoch": 9.564456721915285, + "grad_norm": 0.16514070332050323, + "learning_rate": 4.973544343833347e-07, + "loss": 1.7551, + "step": 31161 + }, + { + "epoch": 9.56476365868631, + "grad_norm": 0.1000957265496254, + "learning_rate": 4.966553434264276e-07, + "loss": 1.6372, + "step": 31162 + }, + { + "epoch": 9.565070595457335, + "grad_norm": 0.16715119779109955, + "learning_rate": 4.959567416906008e-07, + "loss": 1.783, + "step": 31163 + }, + { + "epoch": 9.56537753222836, + "grad_norm": 0.1515718400478363, + "learning_rate": 4.952586291827321e-07, + "loss": 1.7858, + "step": 31164 + }, + { + "epoch": 9.565684468999386, + "grad_norm": 0.14952874183654785, + "learning_rate": 4.945610059097439e-07, + "loss": 1.7515, + "step": 31165 + }, + { + "epoch": 9.565991405770411, + "grad_norm": 0.11136786639690399, + "learning_rate": 4.938638718785138e-07, + "loss": 1.671, + "step": 31166 + }, + { + "epoch": 9.566298342541437, + "grad_norm": 0.10691037774085999, + "learning_rate": 4.931672270959308e-07, + "loss": 1.6479, + "step": 31167 + }, + { + "epoch": 9.566605279312462, + "grad_norm": 0.1559297740459442, + "learning_rate": 4.924710715689007e-07, + "loss": 1.704, + "step": 31168 + }, + { + "epoch": 9.566912216083487, + "grad_norm": 0.13859638571739197, + "learning_rate": 4.917754053042733e-07, + "loss": 1.7035, + "step": 31169 + }, + { + "epoch": 9.567219152854513, + "grad_norm": 0.13970541954040527, + "learning_rate": 4.910802283089544e-07, + "loss": 1.6903, + "step": 31170 + }, + { + "epoch": 9.567526089625538, + "grad_norm": 0.10885283350944519, + "learning_rate": 4.903855405897884e-07, + "loss": 1.669, + "step": 31171 + }, + { + "epoch": 9.567833026396562, + "grad_norm": 
0.13587352633476257, + "learning_rate": 4.896913421536531e-07, + "loss": 1.7033, + "step": 31172 + }, + { + "epoch": 9.568139963167587, + "grad_norm": 0.1579197496175766, + "learning_rate": 4.889976330074042e-07, + "loss": 1.7772, + "step": 31173 + }, + { + "epoch": 9.568446899938612, + "grad_norm": 0.172073096036911, + "learning_rate": 4.883044131579029e-07, + "loss": 1.7545, + "step": 31174 + }, + { + "epoch": 9.568753836709638, + "grad_norm": 0.15477560460567474, + "learning_rate": 4.876116826119992e-07, + "loss": 1.6961, + "step": 31175 + }, + { + "epoch": 9.569060773480663, + "grad_norm": 0.12151028960943222, + "learning_rate": 4.869194413765376e-07, + "loss": 1.6918, + "step": 31176 + }, + { + "epoch": 9.569367710251688, + "grad_norm": 0.11448194086551666, + "learning_rate": 4.862276894583573e-07, + "loss": 1.695, + "step": 31177 + }, + { + "epoch": 9.569674647022714, + "grad_norm": 0.13363254070281982, + "learning_rate": 4.855364268642915e-07, + "loss": 1.6802, + "step": 31178 + }, + { + "epoch": 9.569981583793739, + "grad_norm": 0.13119351863861084, + "learning_rate": 4.848456536011792e-07, + "loss": 1.7026, + "step": 31179 + }, + { + "epoch": 9.570288520564764, + "grad_norm": 0.1255909502506256, + "learning_rate": 4.841553696758483e-07, + "loss": 1.6627, + "step": 31180 + }, + { + "epoch": 9.57059545733579, + "grad_norm": 0.13161277770996094, + "learning_rate": 4.8346557509511e-07, + "loss": 1.6906, + "step": 31181 + }, + { + "epoch": 9.570902394106813, + "grad_norm": 0.15130144357681274, + "learning_rate": 4.827762698657922e-07, + "loss": 1.7056, + "step": 31182 + }, + { + "epoch": 9.571209330877839, + "grad_norm": 0.11054715514183044, + "learning_rate": 4.820874539947007e-07, + "loss": 1.6326, + "step": 31183 + }, + { + "epoch": 9.571516267648864, + "grad_norm": 0.22645193338394165, + "learning_rate": 4.813991274886354e-07, + "loss": 1.7981, + "step": 31184 + }, + { + "epoch": 9.57182320441989, + "grad_norm": 0.09784482419490814, + "learning_rate": 
4.807112903544242e-07, + "loss": 1.6359, + "step": 31185 + }, + { + "epoch": 9.572130141190915, + "grad_norm": 0.1499309092760086, + "learning_rate": 4.80023942598834e-07, + "loss": 1.7026, + "step": 31186 + }, + { + "epoch": 9.57243707796194, + "grad_norm": 0.1763381063938141, + "learning_rate": 4.793370842286815e-07, + "loss": 1.7516, + "step": 31187 + }, + { + "epoch": 9.572744014732965, + "grad_norm": 0.16786764562129974, + "learning_rate": 4.786507152507391e-07, + "loss": 1.7487, + "step": 31188 + }, + { + "epoch": 9.57305095150399, + "grad_norm": 0.1416286677122116, + "learning_rate": 4.779648356717958e-07, + "loss": 1.7016, + "step": 31189 + }, + { + "epoch": 9.573357888275016, + "grad_norm": 0.10985523462295532, + "learning_rate": 4.772794454986296e-07, + "loss": 1.6818, + "step": 31190 + }, + { + "epoch": 9.573664825046041, + "grad_norm": 0.16240783035755157, + "learning_rate": 4.7659454473801825e-07, + "loss": 1.7029, + "step": 31191 + }, + { + "epoch": 9.573971761817067, + "grad_norm": 0.16602420806884766, + "learning_rate": 4.7591013339672306e-07, + "loss": 1.7176, + "step": 31192 + }, + { + "epoch": 9.57427869858809, + "grad_norm": 0.11623486876487732, + "learning_rate": 4.7522621148151093e-07, + "loss": 1.6796, + "step": 31193 + }, + { + "epoch": 9.574585635359115, + "grad_norm": 0.1628381758928299, + "learning_rate": 4.74542778999143e-07, + "loss": 1.7524, + "step": 31194 + }, + { + "epoch": 9.57489257213014, + "grad_norm": 0.2524288296699524, + "learning_rate": 4.738598359563695e-07, + "loss": 1.7632, + "step": 31195 + }, + { + "epoch": 9.575199508901166, + "grad_norm": 0.13695289194583893, + "learning_rate": 4.731773823599406e-07, + "loss": 1.7155, + "step": 31196 + }, + { + "epoch": 9.575506445672191, + "grad_norm": 0.16224917769432068, + "learning_rate": 4.7249541821660637e-07, + "loss": 1.7063, + "step": 31197 + }, + { + "epoch": 9.575813382443217, + "grad_norm": 0.13433055579662323, + "learning_rate": 4.718139435330893e-07, + "loss": 1.708, + 
"step": 31198 + }, + { + "epoch": 9.576120319214242, + "grad_norm": 0.1861371546983719, + "learning_rate": 4.7113295831615054e-07, + "loss": 1.6628, + "step": 31199 + }, + { + "epoch": 9.576427255985267, + "grad_norm": 0.19167616963386536, + "learning_rate": 4.70452462572496e-07, + "loss": 1.7231, + "step": 31200 + }, + { + "epoch": 9.576734192756293, + "grad_norm": 0.13869838416576385, + "learning_rate": 4.6977245630886455e-07, + "loss": 1.6509, + "step": 31201 + }, + { + "epoch": 9.577041129527318, + "grad_norm": 0.14124059677124023, + "learning_rate": 4.690929395319732e-07, + "loss": 1.7077, + "step": 31202 + }, + { + "epoch": 9.577348066298342, + "grad_norm": 0.13248983025550842, + "learning_rate": 4.684139122485331e-07, + "loss": 1.6877, + "step": 31203 + }, + { + "epoch": 9.577655003069367, + "grad_norm": 0.08696278929710388, + "learning_rate": 4.6773537446526125e-07, + "loss": 1.6193, + "step": 31204 + }, + { + "epoch": 9.577961939840392, + "grad_norm": 0.1554766148328781, + "learning_rate": 4.670573261888578e-07, + "loss": 1.7169, + "step": 31205 + }, + { + "epoch": 9.578268876611418, + "grad_norm": 0.13041824102401733, + "learning_rate": 4.663797674260284e-07, + "loss": 1.7088, + "step": 31206 + }, + { + "epoch": 9.578575813382443, + "grad_norm": 0.10228250920772552, + "learning_rate": 4.6570269818346224e-07, + "loss": 1.6636, + "step": 31207 + }, + { + "epoch": 9.578882750153468, + "grad_norm": 0.11005907505750656, + "learning_rate": 4.6502611846785947e-07, + "loss": 1.6668, + "step": 31208 + }, + { + "epoch": 9.579189686924494, + "grad_norm": 0.10490129142999649, + "learning_rate": 4.643500282858981e-07, + "loss": 1.6666, + "step": 31209 + }, + { + "epoch": 9.579496623695519, + "grad_norm": 0.1278064250946045, + "learning_rate": 4.636744276442673e-07, + "loss": 1.6794, + "step": 31210 + }, + { + "epoch": 9.579803560466544, + "grad_norm": 0.1835307478904724, + "learning_rate": 4.6299931654963937e-07, + "loss": 1.7349, + "step": 31211 + }, + { + "epoch": 
9.58011049723757, + "grad_norm": 0.14156827330589294, + "learning_rate": 4.623246950086868e-07, + "loss": 1.667, + "step": 31212 + }, + { + "epoch": 9.580417434008595, + "grad_norm": 0.1438005119562149, + "learning_rate": 4.61650563028071e-07, + "loss": 1.7248, + "step": 31213 + }, + { + "epoch": 9.580724370779619, + "grad_norm": 0.18247459828853607, + "learning_rate": 4.609769206144698e-07, + "loss": 1.8198, + "step": 31214 + }, + { + "epoch": 9.581031307550644, + "grad_norm": 0.12175338715314865, + "learning_rate": 4.6030376777452255e-07, + "loss": 1.6584, + "step": 31215 + }, + { + "epoch": 9.58133824432167, + "grad_norm": 0.1831531524658203, + "learning_rate": 4.5963110451489045e-07, + "loss": 1.709, + "step": 31216 + }, + { + "epoch": 9.581645181092695, + "grad_norm": 0.137215718626976, + "learning_rate": 4.5895893084222377e-07, + "loss": 1.767, + "step": 31217 + }, + { + "epoch": 9.58195211786372, + "grad_norm": 0.15977118909358978, + "learning_rate": 4.5828724676315606e-07, + "loss": 1.7384, + "step": 31218 + }, + { + "epoch": 9.582259054634745, + "grad_norm": 0.12897618114948273, + "learning_rate": 4.576160522843376e-07, + "loss": 1.6501, + "step": 31219 + }, + { + "epoch": 9.58256599140577, + "grad_norm": 0.13793621957302094, + "learning_rate": 4.5694534741239084e-07, + "loss": 1.7039, + "step": 31220 + }, + { + "epoch": 9.582872928176796, + "grad_norm": 0.11358989775180817, + "learning_rate": 4.562751321539549e-07, + "loss": 1.6699, + "step": 31221 + }, + { + "epoch": 9.583179864947821, + "grad_norm": 0.16851121187210083, + "learning_rate": 4.5560540651563565e-07, + "loss": 1.7399, + "step": 31222 + }, + { + "epoch": 9.583486801718847, + "grad_norm": 0.14942096173763275, + "learning_rate": 4.549361705040722e-07, + "loss": 1.6769, + "step": 31223 + }, + { + "epoch": 9.58379373848987, + "grad_norm": 0.13010743260383606, + "learning_rate": 4.542674241258649e-07, + "loss": 1.6713, + "step": 31224 + }, + { + "epoch": 9.584100675260895, + "grad_norm": 
0.10744872689247131, + "learning_rate": 4.5359916738762497e-07, + "loss": 1.6919, + "step": 31225 + }, + { + "epoch": 9.58440761203192, + "grad_norm": 0.14843374490737915, + "learning_rate": 4.5293140029595836e-07, + "loss": 1.6961, + "step": 31226 + }, + { + "epoch": 9.584714548802946, + "grad_norm": 0.12312567979097366, + "learning_rate": 4.522641228574709e-07, + "loss": 1.6818, + "step": 31227 + }, + { + "epoch": 9.585021485573971, + "grad_norm": 0.15777400135993958, + "learning_rate": 4.5159733507874057e-07, + "loss": 1.6914, + "step": 31228 + }, + { + "epoch": 9.585328422344997, + "grad_norm": 0.12530489265918732, + "learning_rate": 4.509310369663733e-07, + "loss": 1.7149, + "step": 31229 + }, + { + "epoch": 9.585635359116022, + "grad_norm": 0.1540595442056656, + "learning_rate": 4.5026522852694155e-07, + "loss": 1.7282, + "step": 31230 + }, + { + "epoch": 9.585942295887047, + "grad_norm": 0.1336304396390915, + "learning_rate": 4.4959990976704005e-07, + "loss": 1.702, + "step": 31231 + }, + { + "epoch": 9.586249232658073, + "grad_norm": 0.23668836057186127, + "learning_rate": 4.4893508069322467e-07, + "loss": 1.7604, + "step": 31232 + }, + { + "epoch": 9.586556169429098, + "grad_norm": 0.14577987790107727, + "learning_rate": 4.482707413120846e-07, + "loss": 1.6841, + "step": 31233 + }, + { + "epoch": 9.586863106200123, + "grad_norm": 0.12077435851097107, + "learning_rate": 4.476068916301701e-07, + "loss": 1.7066, + "step": 31234 + }, + { + "epoch": 9.587170042971149, + "grad_norm": 0.10890510678291321, + "learning_rate": 4.469435316540427e-07, + "loss": 1.6594, + "step": 31235 + }, + { + "epoch": 9.587476979742172, + "grad_norm": 0.1251889169216156, + "learning_rate": 4.462806613902748e-07, + "loss": 1.7127, + "step": 31236 + }, + { + "epoch": 9.587783916513198, + "grad_norm": 0.2560112774372101, + "learning_rate": 4.4561828084540013e-07, + "loss": 1.8146, + "step": 31237 + }, + { + "epoch": 9.588090853284223, + "grad_norm": 0.1295570433139801, + 
"learning_rate": 4.4495639002597455e-07, + "loss": 1.6671, + "step": 31238 + }, + { + "epoch": 9.588397790055248, + "grad_norm": 0.1236012801527977, + "learning_rate": 4.4429498893852617e-07, + "loss": 1.6916, + "step": 31239 + }, + { + "epoch": 9.588704726826274, + "grad_norm": 0.16924844682216644, + "learning_rate": 4.436340775896053e-07, + "loss": 1.6838, + "step": 31240 + }, + { + "epoch": 9.589011663597299, + "grad_norm": 0.1686296910047531, + "learning_rate": 4.4297365598574e-07, + "loss": 1.7578, + "step": 31241 + }, + { + "epoch": 9.589318600368324, + "grad_norm": 0.13647985458374023, + "learning_rate": 4.4231372413345296e-07, + "loss": 1.6968, + "step": 31242 + }, + { + "epoch": 9.58962553713935, + "grad_norm": 0.13135603070259094, + "learning_rate": 4.4165428203927216e-07, + "loss": 1.6594, + "step": 31243 + }, + { + "epoch": 9.589932473910375, + "grad_norm": 0.13832809031009674, + "learning_rate": 4.409953297097036e-07, + "loss": 1.7405, + "step": 31244 + }, + { + "epoch": 9.5902394106814, + "grad_norm": 0.1193947121500969, + "learning_rate": 4.403368671512753e-07, + "loss": 1.6682, + "step": 31245 + }, + { + "epoch": 9.590546347452424, + "grad_norm": 0.11434894800186157, + "learning_rate": 4.3967889437048214e-07, + "loss": 1.6781, + "step": 31246 + }, + { + "epoch": 9.59085328422345, + "grad_norm": 0.14688155055046082, + "learning_rate": 4.3902141137382444e-07, + "loss": 1.7705, + "step": 31247 + }, + { + "epoch": 9.591160220994475, + "grad_norm": 0.13387629389762878, + "learning_rate": 4.383644181678137e-07, + "loss": 1.6999, + "step": 31248 + }, + { + "epoch": 9.5914671577655, + "grad_norm": 0.21924255788326263, + "learning_rate": 4.377079147589336e-07, + "loss": 1.7334, + "step": 31249 + }, + { + "epoch": 9.591774094536525, + "grad_norm": 0.14692620933055878, + "learning_rate": 4.3705190115367335e-07, + "loss": 1.6817, + "step": 31250 + }, + { + "epoch": 9.59208103130755, + "grad_norm": 0.11326060444116592, + "learning_rate": 4.3639637735851115e-07, 
+ "loss": 1.6599, + "step": 31251 + }, + { + "epoch": 9.592387968078576, + "grad_norm": 0.12073694914579391, + "learning_rate": 4.3574134337993066e-07, + "loss": 1.6924, + "step": 31252 + }, + { + "epoch": 9.592694904849601, + "grad_norm": 0.14962032437324524, + "learning_rate": 4.3508679922441566e-07, + "loss": 1.7259, + "step": 31253 + }, + { + "epoch": 9.593001841620627, + "grad_norm": 0.1624862551689148, + "learning_rate": 4.344327448984109e-07, + "loss": 1.7194, + "step": 31254 + }, + { + "epoch": 9.593308778391652, + "grad_norm": 0.12331227213144302, + "learning_rate": 4.3377918040840017e-07, + "loss": 1.6871, + "step": 31255 + }, + { + "epoch": 9.593615715162677, + "grad_norm": 0.17856283485889435, + "learning_rate": 4.3312610576082825e-07, + "loss": 1.7479, + "step": 31256 + }, + { + "epoch": 9.5939226519337, + "grad_norm": 0.097813680768013, + "learning_rate": 4.324735209621622e-07, + "loss": 1.6385, + "step": 31257 + }, + { + "epoch": 9.594229588704726, + "grad_norm": 0.1290784329175949, + "learning_rate": 4.318214260188469e-07, + "loss": 1.6802, + "step": 31258 + }, + { + "epoch": 9.594536525475752, + "grad_norm": 0.1114344522356987, + "learning_rate": 4.3116982093732163e-07, + "loss": 1.6691, + "step": 31259 + }, + { + "epoch": 9.594843462246777, + "grad_norm": 0.12479976564645767, + "learning_rate": 4.305187057240312e-07, + "loss": 1.6891, + "step": 31260 + }, + { + "epoch": 9.595150399017802, + "grad_norm": 0.1734507828950882, + "learning_rate": 4.2986808038540385e-07, + "loss": 1.7337, + "step": 31261 + }, + { + "epoch": 9.595457335788828, + "grad_norm": 0.14148491621017456, + "learning_rate": 4.2921794492787884e-07, + "loss": 1.7019, + "step": 31262 + }, + { + "epoch": 9.595764272559853, + "grad_norm": 0.11479593068361282, + "learning_rate": 4.285682993578788e-07, + "loss": 1.6509, + "step": 31263 + }, + { + "epoch": 9.596071209330878, + "grad_norm": 0.13279953598976135, + "learning_rate": 4.279191436818153e-07, + "loss": 1.692, + "step": 31264 + }, 
+ { + "epoch": 9.596378146101904, + "grad_norm": 0.13242286443710327, + "learning_rate": 4.27270477906111e-07, + "loss": 1.6787, + "step": 31265 + }, + { + "epoch": 9.596685082872929, + "grad_norm": 0.1530013382434845, + "learning_rate": 4.2662230203717737e-07, + "loss": 1.7245, + "step": 31266 + }, + { + "epoch": 9.596992019643952, + "grad_norm": 0.10855519771575928, + "learning_rate": 4.259746160814204e-07, + "loss": 1.6938, + "step": 31267 + }, + { + "epoch": 9.597298956414978, + "grad_norm": 0.16191129386425018, + "learning_rate": 4.253274200452351e-07, + "loss": 1.7375, + "step": 31268 + }, + { + "epoch": 9.597605893186003, + "grad_norm": 0.13151034712791443, + "learning_rate": 4.2468071393501617e-07, + "loss": 1.713, + "step": 31269 + }, + { + "epoch": 9.597912829957028, + "grad_norm": 0.11667583882808685, + "learning_rate": 4.2403449775716977e-07, + "loss": 1.6956, + "step": 31270 + }, + { + "epoch": 9.598219766728054, + "grad_norm": 0.13867171108722687, + "learning_rate": 4.233887715180629e-07, + "loss": 1.7364, + "step": 31271 + }, + { + "epoch": 9.598526703499079, + "grad_norm": 0.09936422109603882, + "learning_rate": 4.2274353522409606e-07, + "loss": 1.6493, + "step": 31272 + }, + { + "epoch": 9.598833640270104, + "grad_norm": 0.1310657113790512, + "learning_rate": 4.2209878888162524e-07, + "loss": 1.6937, + "step": 31273 + }, + { + "epoch": 9.59914057704113, + "grad_norm": 0.1411616951227188, + "learning_rate": 4.214545324970398e-07, + "loss": 1.7071, + "step": 31274 + }, + { + "epoch": 9.599447513812155, + "grad_norm": 0.16063901782035828, + "learning_rate": 4.208107660766958e-07, + "loss": 1.7328, + "step": 31275 + }, + { + "epoch": 9.59975445058318, + "grad_norm": 0.19482840597629547, + "learning_rate": 4.2016748962696027e-07, + "loss": 1.7442, + "step": 31276 + }, + { + "epoch": 9.600061387354206, + "grad_norm": 0.1624516397714615, + "learning_rate": 4.195247031541893e-07, + "loss": 1.7261, + "step": 31277 + }, + { + "epoch": 9.600368324125231, + 
"grad_norm": 0.1904727965593338, + "learning_rate": 4.1888240666473345e-07, + "loss": 1.6999, + "step": 31278 + }, + { + "epoch": 9.600675260896255, + "grad_norm": 0.0954340398311615, + "learning_rate": 4.1824060016494307e-07, + "loss": 1.6738, + "step": 31279 + }, + { + "epoch": 9.60098219766728, + "grad_norm": 0.185276597738266, + "learning_rate": 4.175992836611631e-07, + "loss": 1.7959, + "step": 31280 + }, + { + "epoch": 9.601289134438305, + "grad_norm": 0.13276509940624237, + "learning_rate": 4.1695845715972184e-07, + "loss": 1.6988, + "step": 31281 + }, + { + "epoch": 9.60159607120933, + "grad_norm": 0.145119309425354, + "learning_rate": 4.163181206669642e-07, + "loss": 1.6603, + "step": 31282 + }, + { + "epoch": 9.601903007980356, + "grad_norm": 0.2778591513633728, + "learning_rate": 4.156782741892129e-07, + "loss": 1.8033, + "step": 31283 + }, + { + "epoch": 9.602209944751381, + "grad_norm": 0.12991562485694885, + "learning_rate": 4.150389177327907e-07, + "loss": 1.6833, + "step": 31284 + }, + { + "epoch": 9.602516881522407, + "grad_norm": 0.19052881002426147, + "learning_rate": 4.144000513040147e-07, + "loss": 1.727, + "step": 31285 + }, + { + "epoch": 9.602823818293432, + "grad_norm": 0.180231973528862, + "learning_rate": 4.137616749091966e-07, + "loss": 1.7223, + "step": 31286 + }, + { + "epoch": 9.603130755064457, + "grad_norm": 0.11801919341087341, + "learning_rate": 4.131237885546535e-07, + "loss": 1.6674, + "step": 31287 + }, + { + "epoch": 9.603437691835483, + "grad_norm": 0.1323625147342682, + "learning_rate": 4.1248639224668596e-07, + "loss": 1.6999, + "step": 31288 + }, + { + "epoch": 9.603744628606506, + "grad_norm": 0.16466714441776276, + "learning_rate": 4.1184948599159443e-07, + "loss": 1.6886, + "step": 31289 + }, + { + "epoch": 9.604051565377532, + "grad_norm": 0.16557957231998444, + "learning_rate": 4.112130697956629e-07, + "loss": 1.7503, + "step": 31290 + }, + { + "epoch": 9.604358502148557, + "grad_norm": 0.12221503257751465, + 
"learning_rate": 4.1057714366519173e-07, + "loss": 1.6695, + "step": 31291 + }, + { + "epoch": 9.604665438919582, + "grad_norm": 0.12496510148048401, + "learning_rate": 4.0994170760646487e-07, + "loss": 1.6728, + "step": 31292 + }, + { + "epoch": 9.604972375690608, + "grad_norm": 0.12658068537712097, + "learning_rate": 4.0930676162576063e-07, + "loss": 1.6813, + "step": 31293 + }, + { + "epoch": 9.605279312461633, + "grad_norm": 0.1092144325375557, + "learning_rate": 4.0867230572935176e-07, + "loss": 1.6728, + "step": 31294 + }, + { + "epoch": 9.605586249232658, + "grad_norm": 0.13999344408512115, + "learning_rate": 4.0803833992350547e-07, + "loss": 1.6931, + "step": 31295 + }, + { + "epoch": 9.605893186003684, + "grad_norm": 0.1349373310804367, + "learning_rate": 4.0740486421449455e-07, + "loss": 1.7247, + "step": 31296 + }, + { + "epoch": 9.606200122774709, + "grad_norm": 0.17605085670948029, + "learning_rate": 4.0677187860857503e-07, + "loss": 1.7334, + "step": 31297 + }, + { + "epoch": 9.606507059545734, + "grad_norm": 0.1366586685180664, + "learning_rate": 4.061393831120086e-07, + "loss": 1.7097, + "step": 31298 + }, + { + "epoch": 9.60681399631676, + "grad_norm": 0.11512716114521027, + "learning_rate": 4.0550737773103475e-07, + "loss": 1.6844, + "step": 31299 + }, + { + "epoch": 9.607120933087783, + "grad_norm": 0.1779230386018753, + "learning_rate": 4.04875862471904e-07, + "loss": 1.706, + "step": 31300 + }, + { + "epoch": 9.607427869858808, + "grad_norm": 0.11504211276769638, + "learning_rate": 4.042448373408614e-07, + "loss": 1.6816, + "step": 31301 + }, + { + "epoch": 9.607734806629834, + "grad_norm": 0.17073078453540802, + "learning_rate": 4.036143023441408e-07, + "loss": 1.6883, + "step": 31302 + }, + { + "epoch": 9.60804174340086, + "grad_norm": 0.15582023561000824, + "learning_rate": 4.0298425748797606e-07, + "loss": 1.7218, + "step": 31303 + }, + { + "epoch": 9.608348680171884, + "grad_norm": 0.1295994520187378, + "learning_rate": 
4.0235470277858454e-07, + "loss": 1.6722, + "step": 31304 + }, + { + "epoch": 9.60865561694291, + "grad_norm": 0.11748214811086655, + "learning_rate": 4.0172563822219457e-07, + "loss": 1.7255, + "step": 31305 + }, + { + "epoch": 9.608962553713935, + "grad_norm": 0.15344174206256866, + "learning_rate": 4.010970638250289e-07, + "loss": 1.7227, + "step": 31306 + }, + { + "epoch": 9.60926949048496, + "grad_norm": 0.12453699111938477, + "learning_rate": 4.0046897959328256e-07, + "loss": 1.7004, + "step": 31307 + }, + { + "epoch": 9.609576427255986, + "grad_norm": 0.11904565244913101, + "learning_rate": 3.9984138553318395e-07, + "loss": 1.6949, + "step": 31308 + }, + { + "epoch": 9.609883364027011, + "grad_norm": 0.1816912293434143, + "learning_rate": 3.9921428165091703e-07, + "loss": 1.7344, + "step": 31309 + }, + { + "epoch": 9.610190300798035, + "grad_norm": 0.17511364817619324, + "learning_rate": 3.9858766795268785e-07, + "loss": 1.715, + "step": 31310 + }, + { + "epoch": 9.61049723756906, + "grad_norm": 0.14724890887737274, + "learning_rate": 3.9796154444468604e-07, + "loss": 1.7115, + "step": 31311 + }, + { + "epoch": 9.610804174340085, + "grad_norm": 0.13168582320213318, + "learning_rate": 3.97335911133101e-07, + "loss": 1.6975, + "step": 31312 + }, + { + "epoch": 9.61111111111111, + "grad_norm": 0.10625627636909485, + "learning_rate": 3.967107680241222e-07, + "loss": 1.6475, + "step": 31313 + }, + { + "epoch": 9.611418047882136, + "grad_norm": 0.16010381281375885, + "learning_rate": 3.9608611512391704e-07, + "loss": 1.7232, + "step": 31314 + }, + { + "epoch": 9.611724984653161, + "grad_norm": 0.1410607546567917, + "learning_rate": 3.9546195243865826e-07, + "loss": 1.7167, + "step": 31315 + }, + { + "epoch": 9.612031921424187, + "grad_norm": 0.1656857579946518, + "learning_rate": 3.948382799745243e-07, + "loss": 1.7294, + "step": 31316 + }, + { + "epoch": 9.612338858195212, + "grad_norm": 0.12383712828159332, + "learning_rate": 3.942150977376713e-07, + "loss": 
1.6826, + "step": 31317 + }, + { + "epoch": 9.612645794966237, + "grad_norm": 0.12091368436813354, + "learning_rate": 3.9359240573426105e-07, + "loss": 1.7067, + "step": 31318 + }, + { + "epoch": 9.612952731737263, + "grad_norm": 0.11942148953676224, + "learning_rate": 3.9297020397044416e-07, + "loss": 1.6991, + "step": 31319 + }, + { + "epoch": 9.613259668508288, + "grad_norm": 0.19631130993366241, + "learning_rate": 3.9234849245237126e-07, + "loss": 1.7333, + "step": 31320 + }, + { + "epoch": 9.613566605279313, + "grad_norm": 0.11581625044345856, + "learning_rate": 3.917272711861819e-07, + "loss": 1.6685, + "step": 31321 + }, + { + "epoch": 9.613873542050337, + "grad_norm": 0.1485711932182312, + "learning_rate": 3.9110654017802675e-07, + "loss": 1.7233, + "step": 31322 + }, + { + "epoch": 9.614180478821362, + "grad_norm": 0.16040800511837006, + "learning_rate": 3.9048629943403415e-07, + "loss": 1.7581, + "step": 31323 + }, + { + "epoch": 9.614487415592388, + "grad_norm": 0.18484649062156677, + "learning_rate": 3.8986654896032705e-07, + "loss": 1.7376, + "step": 31324 + }, + { + "epoch": 9.614794352363413, + "grad_norm": 0.11399713158607483, + "learning_rate": 3.892472887630394e-07, + "loss": 1.6831, + "step": 31325 + }, + { + "epoch": 9.615101289134438, + "grad_norm": 0.14001138508319855, + "learning_rate": 3.8862851884828855e-07, + "loss": 1.7201, + "step": 31326 + }, + { + "epoch": 9.615408225905464, + "grad_norm": 0.12577788531780243, + "learning_rate": 3.880102392221863e-07, + "loss": 1.6992, + "step": 31327 + }, + { + "epoch": 9.615715162676489, + "grad_norm": 0.20776085555553436, + "learning_rate": 3.873924498908443e-07, + "loss": 1.6801, + "step": 31328 + }, + { + "epoch": 9.616022099447514, + "grad_norm": 0.1547452211380005, + "learning_rate": 3.867751508603745e-07, + "loss": 1.7288, + "step": 31329 + }, + { + "epoch": 9.61632903621854, + "grad_norm": 0.16533677279949188, + "learning_rate": 3.861583421368664e-07, + "loss": 1.6842, + "step": 31330 + }, + { 
+ "epoch": 9.616635972989565, + "grad_norm": 0.1557091921567917, + "learning_rate": 3.8554202372642623e-07, + "loss": 1.7522, + "step": 31331 + }, + { + "epoch": 9.616942909760589, + "grad_norm": 0.1304699331521988, + "learning_rate": 3.84926195635138e-07, + "loss": 1.6621, + "step": 31332 + }, + { + "epoch": 9.617249846531614, + "grad_norm": 0.2067500501871109, + "learning_rate": 3.8431085786908573e-07, + "loss": 1.7404, + "step": 31333 + }, + { + "epoch": 9.61755678330264, + "grad_norm": 0.15577533841133118, + "learning_rate": 3.83696010434359e-07, + "loss": 1.7357, + "step": 31334 + }, + { + "epoch": 9.617863720073665, + "grad_norm": 0.13889038562774658, + "learning_rate": 3.8308165333703073e-07, + "loss": 1.6994, + "step": 31335 + }, + { + "epoch": 9.61817065684469, + "grad_norm": 0.10292867571115494, + "learning_rate": 3.824677865831683e-07, + "loss": 1.6838, + "step": 31336 + }, + { + "epoch": 9.618477593615715, + "grad_norm": 0.19257314503192902, + "learning_rate": 3.8185441017883905e-07, + "loss": 1.7868, + "step": 31337 + }, + { + "epoch": 9.61878453038674, + "grad_norm": 0.13351574540138245, + "learning_rate": 3.8124152413010486e-07, + "loss": 1.7221, + "step": 31338 + }, + { + "epoch": 9.619091467157766, + "grad_norm": 0.14897382259368896, + "learning_rate": 3.8062912844302746e-07, + "loss": 1.7551, + "step": 31339 + }, + { + "epoch": 9.619398403928791, + "grad_norm": 0.16135838627815247, + "learning_rate": 3.800172231236576e-07, + "loss": 1.7806, + "step": 31340 + }, + { + "epoch": 9.619705340699817, + "grad_norm": 0.11817923933267593, + "learning_rate": 3.794058081780405e-07, + "loss": 1.7063, + "step": 31341 + }, + { + "epoch": 9.620012277470842, + "grad_norm": 0.11679195612668991, + "learning_rate": 3.787948836122157e-07, + "loss": 1.6841, + "step": 31342 + }, + { + "epoch": 9.620319214241865, + "grad_norm": 0.1286752074956894, + "learning_rate": 3.7818444943222287e-07, + "loss": 1.6883, + "step": 31343 + }, + { + "epoch": 9.62062615101289, + 
"grad_norm": 0.28080862760543823, + "learning_rate": 3.775745056441016e-07, + "loss": 1.7409, + "step": 31344 + }, + { + "epoch": 9.620933087783916, + "grad_norm": 0.11734452843666077, + "learning_rate": 3.7696505225386924e-07, + "loss": 1.6682, + "step": 31345 + }, + { + "epoch": 9.621240024554941, + "grad_norm": 0.10224849730730057, + "learning_rate": 3.763560892675544e-07, + "loss": 1.6771, + "step": 31346 + }, + { + "epoch": 9.621546961325967, + "grad_norm": 0.15901216864585876, + "learning_rate": 3.7574761669117443e-07, + "loss": 1.6718, + "step": 31347 + }, + { + "epoch": 9.621853898096992, + "grad_norm": 0.1088409572839737, + "learning_rate": 3.751396345307412e-07, + "loss": 1.6783, + "step": 31348 + }, + { + "epoch": 9.622160834868017, + "grad_norm": 0.1764845997095108, + "learning_rate": 3.7453214279226654e-07, + "loss": 1.7556, + "step": 31349 + }, + { + "epoch": 9.622467771639043, + "grad_norm": 0.11249416321516037, + "learning_rate": 3.739251414817457e-07, + "loss": 1.686, + "step": 31350 + }, + { + "epoch": 9.622774708410068, + "grad_norm": 0.1254713088274002, + "learning_rate": 3.7331863060519055e-07, + "loss": 1.6517, + "step": 31351 + }, + { + "epoch": 9.623081645181093, + "grad_norm": 0.16272024810314178, + "learning_rate": 3.727126101685852e-07, + "loss": 1.7195, + "step": 31352 + }, + { + "epoch": 9.623388581952117, + "grad_norm": 0.1234750747680664, + "learning_rate": 3.721070801779192e-07, + "loss": 1.7122, + "step": 31353 + }, + { + "epoch": 9.623695518723142, + "grad_norm": 0.17801089584827423, + "learning_rate": 3.7150204063918223e-07, + "loss": 1.703, + "step": 31354 + }, + { + "epoch": 9.624002455494168, + "grad_norm": 0.16611720621585846, + "learning_rate": 3.708974915583474e-07, + "loss": 1.7806, + "step": 31355 + }, + { + "epoch": 9.624309392265193, + "grad_norm": 0.18672671914100647, + "learning_rate": 3.702934329413932e-07, + "loss": 1.7299, + "step": 31356 + }, + { + "epoch": 9.624616329036218, + "grad_norm": 0.14166928827762604, + 
"learning_rate": 3.6968986479428705e-07, + "loss": 1.7213, + "step": 31357 + }, + { + "epoch": 9.624923265807244, + "grad_norm": 0.1553429216146469, + "learning_rate": 3.690867871229964e-07, + "loss": 1.7198, + "step": 31358 + }, + { + "epoch": 9.625230202578269, + "grad_norm": 0.12247302383184433, + "learning_rate": 3.6848419993348315e-07, + "loss": 1.7261, + "step": 31359 + }, + { + "epoch": 9.625537139349294, + "grad_norm": 0.11835172772407532, + "learning_rate": 3.6788210323169256e-07, + "loss": 1.6798, + "step": 31360 + }, + { + "epoch": 9.62584407612032, + "grad_norm": 0.13140064477920532, + "learning_rate": 3.67280497023581e-07, + "loss": 1.693, + "step": 31361 + }, + { + "epoch": 9.626151012891345, + "grad_norm": 0.15596047043800354, + "learning_rate": 3.6667938131509925e-07, + "loss": 1.7312, + "step": 31362 + }, + { + "epoch": 9.62645794966237, + "grad_norm": 0.1632358282804489, + "learning_rate": 3.6607875611218146e-07, + "loss": 1.7315, + "step": 31363 + }, + { + "epoch": 9.626764886433394, + "grad_norm": 0.1374986320734024, + "learning_rate": 3.654786214207617e-07, + "loss": 1.6954, + "step": 31364 + }, + { + "epoch": 9.62707182320442, + "grad_norm": 0.154662624001503, + "learning_rate": 3.648789772467742e-07, + "loss": 1.7133, + "step": 31365 + }, + { + "epoch": 9.627378759975445, + "grad_norm": 0.1405872106552124, + "learning_rate": 3.6427982359614753e-07, + "loss": 1.7156, + "step": 31366 + }, + { + "epoch": 9.62768569674647, + "grad_norm": 0.11641019582748413, + "learning_rate": 3.6368116047479914e-07, + "loss": 1.6961, + "step": 31367 + }, + { + "epoch": 9.627992633517495, + "grad_norm": 0.15025056898593903, + "learning_rate": 3.630829878886466e-07, + "loss": 1.6984, + "step": 31368 + }, + { + "epoch": 9.62829957028852, + "grad_norm": 0.1593703031539917, + "learning_rate": 3.6248530584360175e-07, + "loss": 1.7281, + "step": 31369 + }, + { + "epoch": 9.628606507059546, + "grad_norm": 0.16070005297660828, + "learning_rate": 3.6188811434557103e-07, + 
"loss": 1.6851, + "step": 31370 + }, + { + "epoch": 9.628913443830571, + "grad_norm": 0.1515837013721466, + "learning_rate": 3.612914134004552e-07, + "loss": 1.705, + "step": 31371 + }, + { + "epoch": 9.629220380601597, + "grad_norm": 0.21579277515411377, + "learning_rate": 3.606952030141497e-07, + "loss": 1.765, + "step": 31372 + }, + { + "epoch": 9.629527317372622, + "grad_norm": 0.11283712834119797, + "learning_rate": 3.6009948319254973e-07, + "loss": 1.67, + "step": 31373 + }, + { + "epoch": 9.629834254143645, + "grad_norm": 0.10959877073764801, + "learning_rate": 3.5950425394154497e-07, + "loss": 1.6336, + "step": 31374 + }, + { + "epoch": 9.63014119091467, + "grad_norm": 0.15441931784152985, + "learning_rate": 3.5890951526700857e-07, + "loss": 1.7151, + "step": 31375 + }, + { + "epoch": 9.630448127685696, + "grad_norm": 0.10803858935832977, + "learning_rate": 3.583152671748302e-07, + "loss": 1.7114, + "step": 31376 + }, + { + "epoch": 9.630755064456721, + "grad_norm": 0.10860857367515564, + "learning_rate": 3.5772150967086637e-07, + "loss": 1.6905, + "step": 31377 + }, + { + "epoch": 9.631062001227747, + "grad_norm": 0.1574680209159851, + "learning_rate": 3.5712824276100674e-07, + "loss": 1.7139, + "step": 31378 + }, + { + "epoch": 9.631368937998772, + "grad_norm": 0.14044490456581116, + "learning_rate": 3.565354664510967e-07, + "loss": 1.7163, + "step": 31379 + }, + { + "epoch": 9.631675874769797, + "grad_norm": 0.11367516964673996, + "learning_rate": 3.559431807469982e-07, + "loss": 1.714, + "step": 31380 + }, + { + "epoch": 9.631982811540823, + "grad_norm": 0.15081267058849335, + "learning_rate": 3.553513856545676e-07, + "loss": 1.7021, + "step": 31381 + }, + { + "epoch": 9.632289748311848, + "grad_norm": 0.11578520387411118, + "learning_rate": 3.5476008117965586e-07, + "loss": 1.6566, + "step": 31382 + }, + { + "epoch": 9.632596685082873, + "grad_norm": 0.10944022983312607, + "learning_rate": 3.541692673280972e-07, + "loss": 1.6499, + "step": 31383 + }, + 
{ + "epoch": 9.632903621853899, + "grad_norm": 0.18682554364204407, + "learning_rate": 3.5357894410574243e-07, + "loss": 1.8172, + "step": 31384 + }, + { + "epoch": 9.633210558624924, + "grad_norm": 0.14995524287223816, + "learning_rate": 3.5298911151841475e-07, + "loss": 1.7285, + "step": 31385 + }, + { + "epoch": 9.633517495395948, + "grad_norm": 0.13728348910808563, + "learning_rate": 3.523997695719483e-07, + "loss": 1.6732, + "step": 31386 + }, + { + "epoch": 9.633824432166973, + "grad_norm": 0.14575724303722382, + "learning_rate": 3.518109182721718e-07, + "loss": 1.7013, + "step": 31387 + }, + { + "epoch": 9.634131368937998, + "grad_norm": 0.140236034989357, + "learning_rate": 3.512225576248918e-07, + "loss": 1.7609, + "step": 31388 + }, + { + "epoch": 9.634438305709024, + "grad_norm": 0.14315754175186157, + "learning_rate": 3.506346876359368e-07, + "loss": 1.713, + "step": 31389 + }, + { + "epoch": 9.634745242480049, + "grad_norm": 0.17747996747493744, + "learning_rate": 3.500473083111022e-07, + "loss": 1.6988, + "step": 31390 + }, + { + "epoch": 9.635052179251074, + "grad_norm": 0.1338483840227127, + "learning_rate": 3.4946041965621124e-07, + "loss": 1.6602, + "step": 31391 + }, + { + "epoch": 9.6353591160221, + "grad_norm": 0.14221277832984924, + "learning_rate": 3.488740216770481e-07, + "loss": 1.712, + "step": 31392 + }, + { + "epoch": 9.635666052793125, + "grad_norm": 0.18484778702259064, + "learning_rate": 3.482881143794137e-07, + "loss": 1.7008, + "step": 31393 + }, + { + "epoch": 9.63597298956415, + "grad_norm": 0.11398128420114517, + "learning_rate": 3.4770269776909783e-07, + "loss": 1.6884, + "step": 31394 + }, + { + "epoch": 9.636279926335176, + "grad_norm": 0.20213046669960022, + "learning_rate": 3.4711777185188477e-07, + "loss": 1.7539, + "step": 31395 + }, + { + "epoch": 9.6365868631062, + "grad_norm": 0.15737096965312958, + "learning_rate": 3.465333366335588e-07, + "loss": 1.7451, + "step": 31396 + }, + { + "epoch": 9.636893799877225, + 
"grad_norm": 0.18838335573673248, + "learning_rate": 3.459493921198931e-07, + "loss": 1.6942, + "step": 31397 + }, + { + "epoch": 9.63720073664825, + "grad_norm": 0.1837395280599594, + "learning_rate": 3.453659383166552e-07, + "loss": 1.7428, + "step": 31398 + }, + { + "epoch": 9.637507673419275, + "grad_norm": 0.153046116232872, + "learning_rate": 3.4478297522961834e-07, + "loss": 1.6806, + "step": 31399 + }, + { + "epoch": 9.6378146101903, + "grad_norm": 0.16290830075740814, + "learning_rate": 3.44200502864539e-07, + "loss": 1.7669, + "step": 31400 + }, + { + "epoch": 9.638121546961326, + "grad_norm": 0.17401064932346344, + "learning_rate": 3.4361852122717364e-07, + "loss": 1.7221, + "step": 31401 + }, + { + "epoch": 9.638428483732351, + "grad_norm": 0.176009401679039, + "learning_rate": 3.4303703032327325e-07, + "loss": 1.7514, + "step": 31402 + }, + { + "epoch": 9.638735420503377, + "grad_norm": 0.1500163972377777, + "learning_rate": 3.424560301585888e-07, + "loss": 1.7216, + "step": 31403 + }, + { + "epoch": 9.639042357274402, + "grad_norm": 0.10302964597940445, + "learning_rate": 3.418755207388602e-07, + "loss": 1.6702, + "step": 31404 + }, + { + "epoch": 9.639349294045427, + "grad_norm": 0.13488547503948212, + "learning_rate": 3.412955020698216e-07, + "loss": 1.7303, + "step": 31405 + }, + { + "epoch": 9.639656230816453, + "grad_norm": 0.11274787783622742, + "learning_rate": 3.407159741572019e-07, + "loss": 1.6658, + "step": 31406 + }, + { + "epoch": 9.639963167587476, + "grad_norm": 0.17834068834781647, + "learning_rate": 3.401369370067353e-07, + "loss": 1.75, + "step": 31407 + }, + { + "epoch": 9.640270104358502, + "grad_norm": 0.1692495495080948, + "learning_rate": 3.395583906241506e-07, + "loss": 1.7364, + "step": 31408 + }, + { + "epoch": 9.640577041129527, + "grad_norm": 0.1486683338880539, + "learning_rate": 3.3898033501514323e-07, + "loss": 1.7056, + "step": 31409 + }, + { + "epoch": 9.640883977900552, + "grad_norm": 0.1396656632423401, + 
"learning_rate": 3.384027701854531e-07, + "loss": 1.7026, + "step": 31410 + }, + { + "epoch": 9.641190914671578, + "grad_norm": 0.09748127311468124, + "learning_rate": 3.3782569614076444e-07, + "loss": 1.6905, + "step": 31411 + }, + { + "epoch": 9.641497851442603, + "grad_norm": 0.24635939300060272, + "learning_rate": 3.3724911288679494e-07, + "loss": 1.7301, + "step": 31412 + }, + { + "epoch": 9.641804788213628, + "grad_norm": 0.1656247079372406, + "learning_rate": 3.3667302042923453e-07, + "loss": 1.7279, + "step": 31413 + }, + { + "epoch": 9.642111724984654, + "grad_norm": 0.1069309264421463, + "learning_rate": 3.360974187737842e-07, + "loss": 1.666, + "step": 31414 + }, + { + "epoch": 9.642418661755679, + "grad_norm": 0.16244177520275116, + "learning_rate": 3.355223079261227e-07, + "loss": 1.8195, + "step": 31415 + }, + { + "epoch": 9.642725598526704, + "grad_norm": 0.11351195722818375, + "learning_rate": 3.3494768789194554e-07, + "loss": 1.669, + "step": 31416 + }, + { + "epoch": 9.643032535297728, + "grad_norm": 0.20543862879276276, + "learning_rate": 3.3437355867692034e-07, + "loss": 1.7004, + "step": 31417 + }, + { + "epoch": 9.643339472068753, + "grad_norm": 0.12174477428197861, + "learning_rate": 3.337999202867259e-07, + "loss": 1.6945, + "step": 31418 + }, + { + "epoch": 9.643646408839778, + "grad_norm": 0.14274805784225464, + "learning_rate": 3.332267727270355e-07, + "loss": 1.7221, + "step": 31419 + }, + { + "epoch": 9.643953345610804, + "grad_norm": 0.13756579160690308, + "learning_rate": 3.326541160035057e-07, + "loss": 1.6995, + "step": 31420 + }, + { + "epoch": 9.644260282381829, + "grad_norm": 0.1515035182237625, + "learning_rate": 3.320819501217931e-07, + "loss": 1.7469, + "step": 31421 + }, + { + "epoch": 9.644567219152854, + "grad_norm": 0.13177438080310822, + "learning_rate": 3.315102750875654e-07, + "loss": 1.665, + "step": 31422 + }, + { + "epoch": 9.64487415592388, + "grad_norm": 0.13083817064762115, + "learning_rate": 
3.309390909064625e-07, + "loss": 1.7156, + "step": 31423 + }, + { + "epoch": 9.645181092694905, + "grad_norm": 0.16704332828521729, + "learning_rate": 3.303683975841299e-07, + "loss": 1.7149, + "step": 31424 + }, + { + "epoch": 9.64548802946593, + "grad_norm": 0.11540384590625763, + "learning_rate": 3.29798195126213e-07, + "loss": 1.6848, + "step": 31425 + }, + { + "epoch": 9.645794966236956, + "grad_norm": 0.13248707354068756, + "learning_rate": 3.292284835383408e-07, + "loss": 1.6887, + "step": 31426 + }, + { + "epoch": 9.646101903007981, + "grad_norm": 0.14763472974300385, + "learning_rate": 3.2865926282614755e-07, + "loss": 1.7406, + "step": 31427 + }, + { + "epoch": 9.646408839779006, + "grad_norm": 0.17477329075336456, + "learning_rate": 3.2809053299525105e-07, + "loss": 1.7448, + "step": 31428 + }, + { + "epoch": 9.64671577655003, + "grad_norm": 0.2105390578508377, + "learning_rate": 3.275222940512801e-07, + "loss": 1.7695, + "step": 31429 + }, + { + "epoch": 9.647022713321055, + "grad_norm": 0.14712996780872345, + "learning_rate": 3.2695454599985243e-07, + "loss": 1.7161, + "step": 31430 + }, + { + "epoch": 9.64732965009208, + "grad_norm": 0.15937888622283936, + "learning_rate": 3.263872888465691e-07, + "loss": 1.7598, + "step": 31431 + }, + { + "epoch": 9.647636586863106, + "grad_norm": 0.10824455320835114, + "learning_rate": 3.258205225970423e-07, + "loss": 1.6662, + "step": 31432 + }, + { + "epoch": 9.647943523634131, + "grad_norm": 0.12431895732879639, + "learning_rate": 3.2525424725687315e-07, + "loss": 1.6932, + "step": 31433 + }, + { + "epoch": 9.648250460405157, + "grad_norm": 0.14159630239009857, + "learning_rate": 3.246884628316571e-07, + "loss": 1.7091, + "step": 31434 + }, + { + "epoch": 9.648557397176182, + "grad_norm": 0.17578476667404175, + "learning_rate": 3.241231693269842e-07, + "loss": 1.6818, + "step": 31435 + }, + { + "epoch": 9.648864333947207, + "grad_norm": 0.17417892813682556, + "learning_rate": 3.235583667484443e-07, + "loss": 
1.7712, + "step": 31436 + }, + { + "epoch": 9.649171270718233, + "grad_norm": 0.12163690477609634, + "learning_rate": 3.2299405510161087e-07, + "loss": 1.7261, + "step": 31437 + }, + { + "epoch": 9.649478207489258, + "grad_norm": 0.1171955019235611, + "learning_rate": 3.224302343920738e-07, + "loss": 1.6785, + "step": 31438 + }, + { + "epoch": 9.649785144260282, + "grad_norm": 0.11423932015895844, + "learning_rate": 3.2186690462539524e-07, + "loss": 1.7166, + "step": 31439 + }, + { + "epoch": 9.650092081031307, + "grad_norm": 0.16560381650924683, + "learning_rate": 3.213040658071431e-07, + "loss": 1.7179, + "step": 31440 + }, + { + "epoch": 9.650399017802332, + "grad_norm": 0.1309049129486084, + "learning_rate": 3.207417179428851e-07, + "loss": 1.6982, + "step": 31441 + }, + { + "epoch": 9.650705954573358, + "grad_norm": 0.13441912829875946, + "learning_rate": 3.201798610381723e-07, + "loss": 1.7376, + "step": 31442 + }, + { + "epoch": 9.651012891344383, + "grad_norm": 0.10977588593959808, + "learning_rate": 3.1961849509856143e-07, + "loss": 1.7127, + "step": 31443 + }, + { + "epoch": 9.651319828115408, + "grad_norm": 0.11772170662879944, + "learning_rate": 3.190576201296036e-07, + "loss": 1.6816, + "step": 31444 + }, + { + "epoch": 9.651626764886434, + "grad_norm": 0.17650476098060608, + "learning_rate": 3.1849723613683323e-07, + "loss": 1.713, + "step": 31445 + }, + { + "epoch": 9.651933701657459, + "grad_norm": 0.12182165682315826, + "learning_rate": 3.1793734312579037e-07, + "loss": 1.6243, + "step": 31446 + }, + { + "epoch": 9.652240638428484, + "grad_norm": 0.1657133251428604, + "learning_rate": 3.17377941102015e-07, + "loss": 1.7207, + "step": 31447 + }, + { + "epoch": 9.65254757519951, + "grad_norm": 0.15303701162338257, + "learning_rate": 3.1681903007102496e-07, + "loss": 1.7345, + "step": 31448 + }, + { + "epoch": 9.652854511970535, + "grad_norm": 0.17544081807136536, + "learning_rate": 3.162606100383547e-07, + "loss": 1.7176, + "step": 31449 + }, + { + 
"epoch": 9.653161448741558, + "grad_norm": 0.12232106178998947, + "learning_rate": 3.157026810095165e-07, + "loss": 1.7007, + "step": 31450 + }, + { + "epoch": 9.653468385512584, + "grad_norm": 0.12764953076839447, + "learning_rate": 3.1514524299002255e-07, + "loss": 1.7214, + "step": 31451 + }, + { + "epoch": 9.65377532228361, + "grad_norm": 0.19449979066848755, + "learning_rate": 3.1458829598539077e-07, + "loss": 1.721, + "step": 31452 + }, + { + "epoch": 9.654082259054634, + "grad_norm": 0.15264229476451874, + "learning_rate": 3.1403184000111106e-07, + "loss": 1.7486, + "step": 31453 + }, + { + "epoch": 9.65438919582566, + "grad_norm": 0.12420966476202011, + "learning_rate": 3.134758750426958e-07, + "loss": 1.6993, + "step": 31454 + }, + { + "epoch": 9.654696132596685, + "grad_norm": 0.16511085629463196, + "learning_rate": 3.1292040111563503e-07, + "loss": 1.7311, + "step": 31455 + }, + { + "epoch": 9.65500306936771, + "grad_norm": 0.16847728192806244, + "learning_rate": 3.123654182254132e-07, + "loss": 1.7988, + "step": 31456 + }, + { + "epoch": 9.655310006138736, + "grad_norm": 0.1573457270860672, + "learning_rate": 3.118109263775204e-07, + "loss": 1.752, + "step": 31457 + }, + { + "epoch": 9.655616942909761, + "grad_norm": 0.11476359516382217, + "learning_rate": 3.1125692557743555e-07, + "loss": 1.7258, + "step": 31458 + }, + { + "epoch": 9.655923879680786, + "grad_norm": 0.14234037697315216, + "learning_rate": 3.1070341583063767e-07, + "loss": 1.6909, + "step": 31459 + }, + { + "epoch": 9.65623081645181, + "grad_norm": 0.11332587152719498, + "learning_rate": 3.101503971425834e-07, + "loss": 1.641, + "step": 31460 + }, + { + "epoch": 9.656537753222835, + "grad_norm": 0.10850653052330017, + "learning_rate": 3.0959786951875735e-07, + "loss": 1.678, + "step": 31461 + }, + { + "epoch": 9.65684468999386, + "grad_norm": 0.14826613664627075, + "learning_rate": 3.0904583296459953e-07, + "loss": 1.7056, + "step": 31462 + }, + { + "epoch": 9.657151626764886, + 
"grad_norm": 0.10023099184036255, + "learning_rate": 3.084942874855834e-07, + "loss": 1.6457, + "step": 31463 + }, + { + "epoch": 9.657458563535911, + "grad_norm": 0.12071017175912857, + "learning_rate": 3.07943233087149e-07, + "loss": 1.667, + "step": 31464 + }, + { + "epoch": 9.657765500306937, + "grad_norm": 0.13804757595062256, + "learning_rate": 3.07392669774742e-07, + "loss": 1.7054, + "step": 31465 + }, + { + "epoch": 9.658072437077962, + "grad_norm": 0.1364121288061142, + "learning_rate": 3.0684259755380805e-07, + "loss": 1.7159, + "step": 31466 + }, + { + "epoch": 9.658379373848987, + "grad_norm": 0.11550064384937286, + "learning_rate": 3.062930164297817e-07, + "loss": 1.7289, + "step": 31467 + }, + { + "epoch": 9.658686310620013, + "grad_norm": 0.13400794565677643, + "learning_rate": 3.0574392640809744e-07, + "loss": 1.7216, + "step": 31468 + }, + { + "epoch": 9.658993247391038, + "grad_norm": 0.12369029968976974, + "learning_rate": 3.0519532749417876e-07, + "loss": 1.6923, + "step": 31469 + }, + { + "epoch": 9.659300184162063, + "grad_norm": 0.1034984290599823, + "learning_rate": 3.046472196934436e-07, + "loss": 1.6398, + "step": 31470 + }, + { + "epoch": 9.659607120933089, + "grad_norm": 0.14667385816574097, + "learning_rate": 3.040996030113097e-07, + "loss": 1.7016, + "step": 31471 + }, + { + "epoch": 9.659914057704112, + "grad_norm": 0.14836667478084564, + "learning_rate": 3.0355247745319505e-07, + "loss": 1.7003, + "step": 31472 + }, + { + "epoch": 9.660220994475138, + "grad_norm": 0.1664000302553177, + "learning_rate": 3.0300584302450643e-07, + "loss": 1.7092, + "step": 31473 + }, + { + "epoch": 9.660527931246163, + "grad_norm": 0.16207198798656464, + "learning_rate": 3.0245969973063393e-07, + "loss": 1.7219, + "step": 31474 + }, + { + "epoch": 9.660834868017188, + "grad_norm": 0.1146533191204071, + "learning_rate": 3.0191404757698995e-07, + "loss": 1.6975, + "step": 31475 + }, + { + "epoch": 9.661141804788214, + "grad_norm": 0.12706562876701355, + 
"learning_rate": 3.01368886568959e-07, + "loss": 1.7127, + "step": 31476 + }, + { + "epoch": 9.661448741559239, + "grad_norm": 0.16857001185417175, + "learning_rate": 3.0082421671192575e-07, + "loss": 1.7362, + "step": 31477 + }, + { + "epoch": 9.661755678330264, + "grad_norm": 0.11784416437149048, + "learning_rate": 3.002800380112802e-07, + "loss": 1.6776, + "step": 31478 + }, + { + "epoch": 9.66206261510129, + "grad_norm": 0.12407553941011429, + "learning_rate": 2.99736350472396e-07, + "loss": 1.6977, + "step": 31479 + }, + { + "epoch": 9.662369551872315, + "grad_norm": 0.09917214512825012, + "learning_rate": 2.9919315410065205e-07, + "loss": 1.6435, + "step": 31480 + }, + { + "epoch": 9.66267648864334, + "grad_norm": 0.1242169663310051, + "learning_rate": 2.9865044890140524e-07, + "loss": 1.7232, + "step": 31481 + }, + { + "epoch": 9.662983425414364, + "grad_norm": 0.15999211370944977, + "learning_rate": 2.98108234880029e-07, + "loss": 1.7018, + "step": 31482 + }, + { + "epoch": 9.66329036218539, + "grad_norm": 0.14428645372390747, + "learning_rate": 2.9756651204188026e-07, + "loss": 1.7605, + "step": 31483 + }, + { + "epoch": 9.663597298956415, + "grad_norm": 0.11703366786241531, + "learning_rate": 2.970252803923046e-07, + "loss": 1.6428, + "step": 31484 + }, + { + "epoch": 9.66390423572744, + "grad_norm": 0.11491703242063522, + "learning_rate": 2.9648453993666446e-07, + "loss": 1.6709, + "step": 31485 + }, + { + "epoch": 9.664211172498465, + "grad_norm": 0.13316640257835388, + "learning_rate": 2.959442906802945e-07, + "loss": 1.6886, + "step": 31486 + }, + { + "epoch": 9.66451810926949, + "grad_norm": 0.11723330616950989, + "learning_rate": 2.9540453262853486e-07, + "loss": 1.694, + "step": 31487 + }, + { + "epoch": 9.664825046040516, + "grad_norm": 0.20565512776374817, + "learning_rate": 2.948652657867146e-07, + "loss": 1.7406, + "step": 31488 + }, + { + "epoch": 9.665131982811541, + "grad_norm": 0.147149458527565, + "learning_rate": 2.9432649016017387e-07, + 
"loss": 1.7548, + "step": 31489 + }, + { + "epoch": 9.665438919582567, + "grad_norm": 0.15741130709648132, + "learning_rate": 2.937882057542363e-07, + "loss": 1.7005, + "step": 31490 + }, + { + "epoch": 9.665745856353592, + "grad_norm": 0.1358392834663391, + "learning_rate": 2.9325041257421414e-07, + "loss": 1.6991, + "step": 31491 + }, + { + "epoch": 9.666052793124617, + "grad_norm": 0.12195859849452972, + "learning_rate": 2.9271311062541994e-07, + "loss": 1.6764, + "step": 31492 + }, + { + "epoch": 9.66635972989564, + "grad_norm": 0.12507489323616028, + "learning_rate": 2.921762999131772e-07, + "loss": 1.7158, + "step": 31493 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 0.12813931703567505, + "learning_rate": 2.9163998044277606e-07, + "loss": 1.7066, + "step": 31494 + }, + { + "epoch": 9.666973603437691, + "grad_norm": 0.2424009144306183, + "learning_rate": 2.91104152219529e-07, + "loss": 1.7572, + "step": 31495 + }, + { + "epoch": 9.667280540208717, + "grad_norm": 0.21357449889183044, + "learning_rate": 2.905688152487207e-07, + "loss": 1.7907, + "step": 31496 + }, + { + "epoch": 9.667587476979742, + "grad_norm": 0.15474599599838257, + "learning_rate": 2.900339695356524e-07, + "loss": 1.7448, + "step": 31497 + }, + { + "epoch": 9.667894413750767, + "grad_norm": 0.16011640429496765, + "learning_rate": 2.894996150856033e-07, + "loss": 1.7227, + "step": 31498 + }, + { + "epoch": 9.668201350521793, + "grad_norm": 0.1319362372159958, + "learning_rate": 2.8896575190385246e-07, + "loss": 1.6923, + "step": 31499 + }, + { + "epoch": 9.668508287292818, + "grad_norm": 0.11635458469390869, + "learning_rate": 2.8843237999567897e-07, + "loss": 1.6809, + "step": 31500 + }, + { + "epoch": 9.668815224063843, + "grad_norm": 0.13584496080875397, + "learning_rate": 2.8789949936635643e-07, + "loss": 1.6735, + "step": 31501 + }, + { + "epoch": 9.669122160834869, + "grad_norm": 0.16113825142383575, + "learning_rate": 2.873671100211528e-07, + "loss": 1.7549, + "step": 31502 + 
}, + { + "epoch": 9.669429097605892, + "grad_norm": 0.20962439477443695, + "learning_rate": 2.8683521196531394e-07, + "loss": 1.7772, + "step": 31503 + }, + { + "epoch": 9.669736034376918, + "grad_norm": 0.10806034505367279, + "learning_rate": 2.863038052041134e-07, + "loss": 1.6577, + "step": 31504 + }, + { + "epoch": 9.670042971147943, + "grad_norm": 0.12040059268474579, + "learning_rate": 2.857728897427969e-07, + "loss": 1.6706, + "step": 31505 + }, + { + "epoch": 9.670349907918968, + "grad_norm": 0.1953112930059433, + "learning_rate": 2.8524246558661036e-07, + "loss": 1.7453, + "step": 31506 + }, + { + "epoch": 9.670656844689994, + "grad_norm": 0.17382025718688965, + "learning_rate": 2.8471253274079404e-07, + "loss": 1.7831, + "step": 31507 + }, + { + "epoch": 9.670963781461019, + "grad_norm": 0.18416909873485565, + "learning_rate": 2.8418309121058804e-07, + "loss": 1.7017, + "step": 31508 + }, + { + "epoch": 9.671270718232044, + "grad_norm": 0.21286524832248688, + "learning_rate": 2.836541410012272e-07, + "loss": 1.7681, + "step": 31509 + }, + { + "epoch": 9.67157765500307, + "grad_norm": 0.11352343112230301, + "learning_rate": 2.831256821179351e-07, + "loss": 1.7172, + "step": 31510 + }, + { + "epoch": 9.671884591774095, + "grad_norm": 0.14935974776744843, + "learning_rate": 2.825977145659298e-07, + "loss": 1.7043, + "step": 31511 + }, + { + "epoch": 9.67219152854512, + "grad_norm": 0.13719774782657623, + "learning_rate": 2.8207023835044035e-07, + "loss": 1.6705, + "step": 31512 + }, + { + "epoch": 9.672498465316146, + "grad_norm": 0.1506626158952713, + "learning_rate": 2.815432534766738e-07, + "loss": 1.7178, + "step": 31513 + }, + { + "epoch": 9.67280540208717, + "grad_norm": 0.11403869092464447, + "learning_rate": 2.81016759949837e-07, + "loss": 1.6708, + "step": 31514 + }, + { + "epoch": 9.673112338858195, + "grad_norm": 0.13970594108104706, + "learning_rate": 2.804907577751259e-07, + "loss": 1.7231, + "step": 31515 + }, + { + "epoch": 9.67341927562922, + 
"grad_norm": 0.21075570583343506, + "learning_rate": 2.7996524695775296e-07, + "loss": 1.7665, + "step": 31516 + }, + { + "epoch": 9.673726212400245, + "grad_norm": 0.2385234236717224, + "learning_rate": 2.7944022750290844e-07, + "loss": 1.7734, + "step": 31517 + }, + { + "epoch": 9.67403314917127, + "grad_norm": 0.1346839815378189, + "learning_rate": 2.7891569941577155e-07, + "loss": 1.7094, + "step": 31518 + }, + { + "epoch": 9.674340085942296, + "grad_norm": 0.2111053764820099, + "learning_rate": 2.7839166270153814e-07, + "loss": 1.7638, + "step": 31519 + }, + { + "epoch": 9.674647022713321, + "grad_norm": 0.14439715445041656, + "learning_rate": 2.7786811736537633e-07, + "loss": 1.7203, + "step": 31520 + }, + { + "epoch": 9.674953959484347, + "grad_norm": 0.14776118099689484, + "learning_rate": 2.773450634124708e-07, + "loss": 1.7155, + "step": 31521 + }, + { + "epoch": 9.675260896255372, + "grad_norm": 0.1370704621076584, + "learning_rate": 2.768225008479786e-07, + "loss": 1.7356, + "step": 31522 + }, + { + "epoch": 9.675567833026397, + "grad_norm": 0.10558994114398956, + "learning_rate": 2.7630042967707327e-07, + "loss": 1.6784, + "step": 31523 + }, + { + "epoch": 9.675874769797423, + "grad_norm": 0.13506318628787994, + "learning_rate": 2.757788499049063e-07, + "loss": 1.7027, + "step": 31524 + }, + { + "epoch": 9.676181706568446, + "grad_norm": 0.15606056153774261, + "learning_rate": 2.7525776153664585e-07, + "loss": 1.7167, + "step": 31525 + }, + { + "epoch": 9.676488643339471, + "grad_norm": 0.13950656354427338, + "learning_rate": 2.747371645774266e-07, + "loss": 1.7167, + "step": 31526 + }, + { + "epoch": 9.676795580110497, + "grad_norm": 0.11195974797010422, + "learning_rate": 2.742170590324e-07, + "loss": 1.6788, + "step": 31527 + }, + { + "epoch": 9.677102516881522, + "grad_norm": 0.13597041368484497, + "learning_rate": 2.7369744490670093e-07, + "loss": 1.7004, + "step": 31528 + }, + { + "epoch": 9.677409453652547, + "grad_norm": 0.1279800981283188, + 
"learning_rate": 2.731783222054807e-07, + "loss": 1.6619, + "step": 31529 + }, + { + "epoch": 9.677716390423573, + "grad_norm": 0.11803285032510757, + "learning_rate": 2.7265969093384635e-07, + "loss": 1.6938, + "step": 31530 + }, + { + "epoch": 9.678023327194598, + "grad_norm": 0.09654967486858368, + "learning_rate": 2.7214155109694384e-07, + "loss": 1.6319, + "step": 31531 + }, + { + "epoch": 9.678330263965623, + "grad_norm": 0.14024733006954193, + "learning_rate": 2.7162390269988015e-07, + "loss": 1.7084, + "step": 31532 + }, + { + "epoch": 9.678637200736649, + "grad_norm": 0.19366827607154846, + "learning_rate": 2.7110674574777895e-07, + "loss": 1.7398, + "step": 31533 + }, + { + "epoch": 9.678944137507674, + "grad_norm": 0.15786738693714142, + "learning_rate": 2.705900802457473e-07, + "loss": 1.797, + "step": 31534 + }, + { + "epoch": 9.6792510742787, + "grad_norm": 0.1426011621952057, + "learning_rate": 2.7007390619888663e-07, + "loss": 1.7103, + "step": 31535 + }, + { + "epoch": 9.679558011049723, + "grad_norm": 0.1344282180070877, + "learning_rate": 2.695582236123151e-07, + "loss": 1.7135, + "step": 31536 + }, + { + "epoch": 9.679864947820748, + "grad_norm": 0.10107547789812088, + "learning_rate": 2.690430324911064e-07, + "loss": 1.6692, + "step": 31537 + }, + { + "epoch": 9.680171884591774, + "grad_norm": 0.19397902488708496, + "learning_rate": 2.6852833284036205e-07, + "loss": 1.7143, + "step": 31538 + }, + { + "epoch": 9.680478821362799, + "grad_norm": 0.15761269629001617, + "learning_rate": 2.6801412466517794e-07, + "loss": 1.6885, + "step": 31539 + }, + { + "epoch": 9.680785758133824, + "grad_norm": 0.12093541026115417, + "learning_rate": 2.675004079706223e-07, + "loss": 1.6949, + "step": 31540 + }, + { + "epoch": 9.68109269490485, + "grad_norm": 0.2050214260816574, + "learning_rate": 2.6698718276177424e-07, + "loss": 1.7948, + "step": 31541 + }, + { + "epoch": 9.681399631675875, + "grad_norm": 0.1070958599448204, + "learning_rate": 
2.6647444904370766e-07, + "loss": 1.6969, + "step": 31542 + }, + { + "epoch": 9.6817065684469, + "grad_norm": 0.1629544496536255, + "learning_rate": 2.659622068214962e-07, + "loss": 1.6925, + "step": 31543 + }, + { + "epoch": 9.682013505217926, + "grad_norm": 0.15261006355285645, + "learning_rate": 2.6545045610019134e-07, + "loss": 1.7208, + "step": 31544 + }, + { + "epoch": 9.682320441988951, + "grad_norm": 0.2154887616634369, + "learning_rate": 2.649391968848558e-07, + "loss": 1.7614, + "step": 31545 + }, + { + "epoch": 9.682627378759975, + "grad_norm": 0.13233666121959686, + "learning_rate": 2.6442842918054657e-07, + "loss": 1.6904, + "step": 31546 + }, + { + "epoch": 9.682934315531, + "grad_norm": 0.10197919607162476, + "learning_rate": 2.6391815299230404e-07, + "loss": 1.6598, + "step": 31547 + }, + { + "epoch": 9.683241252302025, + "grad_norm": 0.14219482243061066, + "learning_rate": 2.634083683251742e-07, + "loss": 1.7124, + "step": 31548 + }, + { + "epoch": 9.68354818907305, + "grad_norm": 0.1305442601442337, + "learning_rate": 2.62899075184192e-07, + "loss": 1.6668, + "step": 31549 + }, + { + "epoch": 9.683855125844076, + "grad_norm": 0.13593846559524536, + "learning_rate": 2.6239027357439215e-07, + "loss": 1.7191, + "step": 31550 + }, + { + "epoch": 9.684162062615101, + "grad_norm": 0.16108329594135284, + "learning_rate": 2.618819635008041e-07, + "loss": 1.7046, + "step": 31551 + }, + { + "epoch": 9.684468999386127, + "grad_norm": 0.11882323026657104, + "learning_rate": 2.613741449684515e-07, + "loss": 1.6884, + "step": 31552 + }, + { + "epoch": 9.684775936157152, + "grad_norm": 0.15399985015392303, + "learning_rate": 2.6086681798235813e-07, + "loss": 1.7531, + "step": 31553 + }, + { + "epoch": 9.685082872928177, + "grad_norm": 0.14765115082263947, + "learning_rate": 2.6035998254752556e-07, + "loss": 1.7497, + "step": 31554 + }, + { + "epoch": 9.685389809699203, + "grad_norm": 0.13859078288078308, + "learning_rate": 2.5985363866897207e-07, + "loss": 
1.7389, + "step": 31555 + }, + { + "epoch": 9.685696746470228, + "grad_norm": 0.11727506667375565, + "learning_rate": 2.5934778635169355e-07, + "loss": 1.7132, + "step": 31556 + }, + { + "epoch": 9.686003683241251, + "grad_norm": 0.13762840628623962, + "learning_rate": 2.5884242560069715e-07, + "loss": 1.657, + "step": 31557 + }, + { + "epoch": 9.686310620012277, + "grad_norm": 0.107251837849617, + "learning_rate": 2.583375564209789e-07, + "loss": 1.6836, + "step": 31558 + }, + { + "epoch": 9.686617556783302, + "grad_norm": 0.11991941183805466, + "learning_rate": 2.578331788175181e-07, + "loss": 1.6818, + "step": 31559 + }, + { + "epoch": 9.686924493554327, + "grad_norm": 0.17173689603805542, + "learning_rate": 2.5732929279530524e-07, + "loss": 1.7343, + "step": 31560 + }, + { + "epoch": 9.687231430325353, + "grad_norm": 0.1334245204925537, + "learning_rate": 2.568258983593197e-07, + "loss": 1.7081, + "step": 31561 + }, + { + "epoch": 9.687538367096378, + "grad_norm": 0.1360604166984558, + "learning_rate": 2.563229955145352e-07, + "loss": 1.7082, + "step": 31562 + }, + { + "epoch": 9.687845303867404, + "grad_norm": 0.3039763569831848, + "learning_rate": 2.558205842659256e-07, + "loss": 1.7443, + "step": 31563 + }, + { + "epoch": 9.688152240638429, + "grad_norm": 0.17424632608890533, + "learning_rate": 2.55318664618448e-07, + "loss": 1.7217, + "step": 31564 + }, + { + "epoch": 9.688459177409454, + "grad_norm": 0.131890669465065, + "learning_rate": 2.5481723657707066e-07, + "loss": 1.6749, + "step": 31565 + }, + { + "epoch": 9.68876611418048, + "grad_norm": 0.12297450006008148, + "learning_rate": 2.543163001467452e-07, + "loss": 1.6953, + "step": 31566 + }, + { + "epoch": 9.689073050951503, + "grad_norm": 0.1080961599946022, + "learning_rate": 2.5381585533242325e-07, + "loss": 1.6325, + "step": 31567 + }, + { + "epoch": 9.689379987722528, + "grad_norm": 0.13262009620666504, + "learning_rate": 2.533159021390508e-07, + "loss": 1.6726, + "step": 31568 + }, + { + 
"epoch": 9.689686924493554, + "grad_norm": 0.1416144073009491, + "learning_rate": 2.5281644057156826e-07, + "loss": 1.6882, + "step": 31569 + }, + { + "epoch": 9.689993861264579, + "grad_norm": 0.16881975531578064, + "learning_rate": 2.5231747063491076e-07, + "loss": 1.7309, + "step": 31570 + }, + { + "epoch": 9.690300798035604, + "grad_norm": 0.15745007991790771, + "learning_rate": 2.518189923340075e-07, + "loss": 1.7729, + "step": 31571 + }, + { + "epoch": 9.69060773480663, + "grad_norm": 0.10348693281412125, + "learning_rate": 2.513210056737936e-07, + "loss": 1.6627, + "step": 31572 + }, + { + "epoch": 9.690914671577655, + "grad_norm": 0.11885415762662888, + "learning_rate": 2.5082351065917607e-07, + "loss": 1.6765, + "step": 31573 + }, + { + "epoch": 9.69122160834868, + "grad_norm": 0.1581162065267563, + "learning_rate": 2.5032650729508444e-07, + "loss": 1.7229, + "step": 31574 + }, + { + "epoch": 9.691528545119706, + "grad_norm": 0.18366876244544983, + "learning_rate": 2.4982999558642583e-07, + "loss": 1.7569, + "step": 31575 + }, + { + "epoch": 9.691835481890731, + "grad_norm": 0.1248086616396904, + "learning_rate": 2.493339755381074e-07, + "loss": 1.693, + "step": 31576 + }, + { + "epoch": 9.692142418661756, + "grad_norm": 0.10602928698062897, + "learning_rate": 2.4883844715503093e-07, + "loss": 1.6759, + "step": 31577 + }, + { + "epoch": 9.692449355432782, + "grad_norm": 0.12804557383060455, + "learning_rate": 2.4834341044208677e-07, + "loss": 1.6957, + "step": 31578 + }, + { + "epoch": 9.692756292203805, + "grad_norm": 0.14855320751667023, + "learning_rate": 2.4784886540417664e-07, + "loss": 1.7114, + "step": 31579 + }, + { + "epoch": 9.69306322897483, + "grad_norm": 0.10958930104970932, + "learning_rate": 2.47354812046191e-07, + "loss": 1.6565, + "step": 31580 + }, + { + "epoch": 9.693370165745856, + "grad_norm": 0.09138862043619156, + "learning_rate": 2.4686125037299833e-07, + "loss": 1.6219, + "step": 31581 + }, + { + "epoch": 9.693677102516881, + 
"grad_norm": 0.1569548100233078, + "learning_rate": 2.4636818038948906e-07, + "loss": 1.7323, + "step": 31582 + }, + { + "epoch": 9.693984039287907, + "grad_norm": 0.11487089842557907, + "learning_rate": 2.4587560210052593e-07, + "loss": 1.6862, + "step": 31583 + }, + { + "epoch": 9.694290976058932, + "grad_norm": 0.1095786914229393, + "learning_rate": 2.4538351551098293e-07, + "loss": 1.6837, + "step": 31584 + }, + { + "epoch": 9.694597912829957, + "grad_norm": 0.13925141096115112, + "learning_rate": 2.4489192062572277e-07, + "loss": 1.7273, + "step": 31585 + }, + { + "epoch": 9.694904849600983, + "grad_norm": 0.1378251016139984, + "learning_rate": 2.4440081744960264e-07, + "loss": 1.7314, + "step": 31586 + }, + { + "epoch": 9.695211786372008, + "grad_norm": 0.11287980526685715, + "learning_rate": 2.439102059874798e-07, + "loss": 1.6696, + "step": 31587 + }, + { + "epoch": 9.695518723143033, + "grad_norm": 0.13116686046123505, + "learning_rate": 2.4342008624419487e-07, + "loss": 1.7259, + "step": 31588 + }, + { + "epoch": 9.695825659914057, + "grad_norm": 0.1313004493713379, + "learning_rate": 2.4293045822459945e-07, + "loss": 1.6822, + "step": 31589 + }, + { + "epoch": 9.696132596685082, + "grad_norm": 0.10652224719524384, + "learning_rate": 2.4244132193352864e-07, + "loss": 1.6898, + "step": 31590 + }, + { + "epoch": 9.696439533456108, + "grad_norm": 0.16992691159248352, + "learning_rate": 2.4195267737581183e-07, + "loss": 1.6827, + "step": 31591 + }, + { + "epoch": 9.696746470227133, + "grad_norm": 0.13106754422187805, + "learning_rate": 2.414645245562841e-07, + "loss": 1.6914, + "step": 31592 + }, + { + "epoch": 9.697053406998158, + "grad_norm": 0.11900182068347931, + "learning_rate": 2.409768634797749e-07, + "loss": 1.6585, + "step": 31593 + }, + { + "epoch": 9.697360343769184, + "grad_norm": 0.16071198880672455, + "learning_rate": 2.4048969415109147e-07, + "loss": 1.6643, + "step": 31594 + }, + { + "epoch": 9.697667280540209, + "grad_norm": 
0.15770921111106873, + "learning_rate": 2.400030165750522e-07, + "loss": 1.6851, + "step": 31595 + }, + { + "epoch": 9.697974217311234, + "grad_norm": 0.14619939029216766, + "learning_rate": 2.3951683075646994e-07, + "loss": 1.6998, + "step": 31596 + }, + { + "epoch": 9.69828115408226, + "grad_norm": 0.13628968596458435, + "learning_rate": 2.3903113670015186e-07, + "loss": 1.7756, + "step": 31597 + }, + { + "epoch": 9.698588090853285, + "grad_norm": 0.12398962676525116, + "learning_rate": 2.385459344108887e-07, + "loss": 1.6911, + "step": 31598 + }, + { + "epoch": 9.69889502762431, + "grad_norm": 0.10792331397533417, + "learning_rate": 2.3806122389348761e-07, + "loss": 1.6899, + "step": 31599 + }, + { + "epoch": 9.699201964395334, + "grad_norm": 0.18480929732322693, + "learning_rate": 2.3757700515272264e-07, + "loss": 1.7635, + "step": 31600 + }, + { + "epoch": 9.699508901166359, + "grad_norm": 0.15164418518543243, + "learning_rate": 2.3709327819339543e-07, + "loss": 1.7156, + "step": 31601 + }, + { + "epoch": 9.699815837937384, + "grad_norm": 0.11357399821281433, + "learning_rate": 2.3661004302027444e-07, + "loss": 1.6584, + "step": 31602 + }, + { + "epoch": 9.70012277470841, + "grad_norm": 0.12062408030033112, + "learning_rate": 2.3612729963814473e-07, + "loss": 1.6768, + "step": 31603 + }, + { + "epoch": 9.700429711479435, + "grad_norm": 0.12755636870861053, + "learning_rate": 2.3564504805176912e-07, + "loss": 1.7017, + "step": 31604 + }, + { + "epoch": 9.70073664825046, + "grad_norm": 0.09880411624908447, + "learning_rate": 2.3516328826591605e-07, + "loss": 1.679, + "step": 31605 + }, + { + "epoch": 9.701043585021486, + "grad_norm": 0.18026186525821686, + "learning_rate": 2.3468202028535392e-07, + "loss": 1.7464, + "step": 31606 + }, + { + "epoch": 9.701350521792511, + "grad_norm": 0.10250361263751984, + "learning_rate": 2.342012441148289e-07, + "loss": 1.6973, + "step": 31607 + }, + { + "epoch": 9.701657458563536, + "grad_norm": 0.12328560650348663, + 
"learning_rate": 2.337209597590928e-07, + "loss": 1.6532, + "step": 31608 + }, + { + "epoch": 9.701964395334562, + "grad_norm": 0.10625593364238739, + "learning_rate": 2.3324116722289734e-07, + "loss": 1.714, + "step": 31609 + }, + { + "epoch": 9.702271332105585, + "grad_norm": 0.13381624221801758, + "learning_rate": 2.327618665109832e-07, + "loss": 1.7343, + "step": 31610 + }, + { + "epoch": 9.70257826887661, + "grad_norm": 0.14365731179714203, + "learning_rate": 2.3228305762808545e-07, + "loss": 1.6985, + "step": 31611 + }, + { + "epoch": 9.702885205647636, + "grad_norm": 0.12659169733524323, + "learning_rate": 2.3180474057893364e-07, + "loss": 1.682, + "step": 31612 + }, + { + "epoch": 9.703192142418661, + "grad_norm": 0.17809925973415375, + "learning_rate": 2.313269153682629e-07, + "loss": 1.7569, + "step": 31613 + }, + { + "epoch": 9.703499079189687, + "grad_norm": 0.15143713355064392, + "learning_rate": 2.308495820007861e-07, + "loss": 1.6935, + "step": 31614 + }, + { + "epoch": 9.703806015960712, + "grad_norm": 0.1365015059709549, + "learning_rate": 2.303727404812217e-07, + "loss": 1.7068, + "step": 31615 + }, + { + "epoch": 9.704112952731737, + "grad_norm": 0.13594263792037964, + "learning_rate": 2.2989639081428816e-07, + "loss": 1.7117, + "step": 31616 + }, + { + "epoch": 9.704419889502763, + "grad_norm": 0.10336802899837494, + "learning_rate": 2.2942053300468724e-07, + "loss": 1.6819, + "step": 31617 + }, + { + "epoch": 9.704726826273788, + "grad_norm": 0.19912748038768768, + "learning_rate": 2.2894516705713188e-07, + "loss": 1.7343, + "step": 31618 + }, + { + "epoch": 9.705033763044813, + "grad_norm": 0.15230657160282135, + "learning_rate": 2.2847029297630162e-07, + "loss": 1.7344, + "step": 31619 + }, + { + "epoch": 9.705340699815839, + "grad_norm": 0.18917563557624817, + "learning_rate": 2.2799591076690386e-07, + "loss": 1.7307, + "step": 31620 + }, + { + "epoch": 9.705647636586864, + "grad_norm": 0.1437673717737198, + "learning_rate": 
2.2752202043362924e-07, + "loss": 1.7074, + "step": 31621 + }, + { + "epoch": 9.705954573357888, + "grad_norm": 0.14478498697280884, + "learning_rate": 2.2704862198114628e-07, + "loss": 1.7035, + "step": 31622 + }, + { + "epoch": 9.706261510128913, + "grad_norm": 0.1284007877111435, + "learning_rate": 2.265757154141457e-07, + "loss": 1.7298, + "step": 31623 + }, + { + "epoch": 9.706568446899938, + "grad_norm": 0.1506684273481369, + "learning_rate": 2.261033007372959e-07, + "loss": 1.7109, + "step": 31624 + }, + { + "epoch": 9.706875383670964, + "grad_norm": 0.13655513525009155, + "learning_rate": 2.2563137795526545e-07, + "loss": 1.7158, + "step": 31625 + }, + { + "epoch": 9.707182320441989, + "grad_norm": 0.1190224140882492, + "learning_rate": 2.2515994707271725e-07, + "loss": 1.7004, + "step": 31626 + }, + { + "epoch": 9.707489257213014, + "grad_norm": 0.12282036989927292, + "learning_rate": 2.246890080943198e-07, + "loss": 1.6922, + "step": 31627 + }, + { + "epoch": 9.70779619398404, + "grad_norm": 0.12748375535011292, + "learning_rate": 2.2421856102471383e-07, + "loss": 1.7268, + "step": 31628 + }, + { + "epoch": 9.708103130755065, + "grad_norm": 0.12438757717609406, + "learning_rate": 2.2374860586855671e-07, + "loss": 1.6938, + "step": 31629 + }, + { + "epoch": 9.70841006752609, + "grad_norm": 0.11385367810726166, + "learning_rate": 2.2327914263048922e-07, + "loss": 1.6872, + "step": 31630 + }, + { + "epoch": 9.708717004297116, + "grad_norm": 0.13927948474884033, + "learning_rate": 2.2281017131515757e-07, + "loss": 1.7017, + "step": 31631 + }, + { + "epoch": 9.70902394106814, + "grad_norm": 0.15019075572490692, + "learning_rate": 2.2234169192718035e-07, + "loss": 1.7064, + "step": 31632 + }, + { + "epoch": 9.709330877839164, + "grad_norm": 0.12574142217636108, + "learning_rate": 2.2187370447120936e-07, + "loss": 1.7117, + "step": 31633 + }, + { + "epoch": 9.70963781461019, + "grad_norm": 0.13135144114494324, + "learning_rate": 2.2140620895185203e-07, + "loss": 
1.6889, + "step": 31634 + }, + { + "epoch": 9.709944751381215, + "grad_norm": 0.10573926568031311, + "learning_rate": 2.2093920537373803e-07, + "loss": 1.6459, + "step": 31635 + }, + { + "epoch": 9.71025168815224, + "grad_norm": 0.1492786854505539, + "learning_rate": 2.204726937414747e-07, + "loss": 1.725, + "step": 31636 + }, + { + "epoch": 9.710558624923266, + "grad_norm": 0.17757928371429443, + "learning_rate": 2.2000667405968067e-07, + "loss": 1.7061, + "step": 31637 + }, + { + "epoch": 9.710865561694291, + "grad_norm": 0.15048767626285553, + "learning_rate": 2.1954114633295774e-07, + "loss": 1.7005, + "step": 31638 + }, + { + "epoch": 9.711172498465316, + "grad_norm": 0.12468421459197998, + "learning_rate": 2.1907611056590226e-07, + "loss": 1.7114, + "step": 31639 + }, + { + "epoch": 9.711479435236342, + "grad_norm": 0.13750670850276947, + "learning_rate": 2.1861156676312167e-07, + "loss": 1.6938, + "step": 31640 + }, + { + "epoch": 9.711786372007367, + "grad_norm": 0.13854531943798065, + "learning_rate": 2.181475149291956e-07, + "loss": 1.7466, + "step": 31641 + }, + { + "epoch": 9.712093308778392, + "grad_norm": 0.18358713388442993, + "learning_rate": 2.176839550687093e-07, + "loss": 1.7657, + "step": 31642 + }, + { + "epoch": 9.712400245549416, + "grad_norm": 0.0844811424612999, + "learning_rate": 2.1722088718625354e-07, + "loss": 1.6235, + "step": 31643 + }, + { + "epoch": 9.712707182320441, + "grad_norm": 0.12278879433870316, + "learning_rate": 2.167583112863969e-07, + "loss": 1.6819, + "step": 31644 + }, + { + "epoch": 9.713014119091467, + "grad_norm": 0.13768786191940308, + "learning_rate": 2.16296227373719e-07, + "loss": 1.7045, + "step": 31645 + }, + { + "epoch": 9.713321055862492, + "grad_norm": 0.15438923239707947, + "learning_rate": 2.1583463545277739e-07, + "loss": 1.7322, + "step": 31646 + }, + { + "epoch": 9.713627992633517, + "grad_norm": 0.19160570204257965, + "learning_rate": 2.1537353552813498e-07, + "loss": 1.8026, + "step": 31647 + }, + { 
+ "epoch": 9.713934929404543, + "grad_norm": 0.11172829568386078, + "learning_rate": 2.149129276043549e-07, + "loss": 1.6935, + "step": 31648 + }, + { + "epoch": 9.714241866175568, + "grad_norm": 0.16613627970218658, + "learning_rate": 2.1445281168598342e-07, + "loss": 1.6866, + "step": 31649 + }, + { + "epoch": 9.714548802946593, + "grad_norm": 0.12793566286563873, + "learning_rate": 2.1399318777756695e-07, + "loss": 1.6836, + "step": 31650 + }, + { + "epoch": 9.714855739717619, + "grad_norm": 0.13563989102840424, + "learning_rate": 2.1353405588365182e-07, + "loss": 1.6679, + "step": 31651 + }, + { + "epoch": 9.715162676488644, + "grad_norm": 0.15428829193115234, + "learning_rate": 2.1307541600877888e-07, + "loss": 1.6762, + "step": 31652 + }, + { + "epoch": 9.715469613259668, + "grad_norm": 0.14353898167610168, + "learning_rate": 2.1261726815746673e-07, + "loss": 1.6908, + "step": 31653 + }, + { + "epoch": 9.715776550030693, + "grad_norm": 0.12383358925580978, + "learning_rate": 2.1215961233426174e-07, + "loss": 1.6732, + "step": 31654 + }, + { + "epoch": 9.716083486801718, + "grad_norm": 0.14675362408161163, + "learning_rate": 2.117024485436714e-07, + "loss": 1.7162, + "step": 31655 + }, + { + "epoch": 9.716390423572744, + "grad_norm": 0.11572350561618805, + "learning_rate": 2.1124577679021985e-07, + "loss": 1.631, + "step": 31656 + }, + { + "epoch": 9.716697360343769, + "grad_norm": 0.1518344134092331, + "learning_rate": 2.1078959707842015e-07, + "loss": 1.7398, + "step": 31657 + }, + { + "epoch": 9.717004297114794, + "grad_norm": 0.11649021506309509, + "learning_rate": 2.1033390941277985e-07, + "loss": 1.6581, + "step": 31658 + }, + { + "epoch": 9.71731123388582, + "grad_norm": 0.12223310023546219, + "learning_rate": 2.098787137978009e-07, + "loss": 1.7291, + "step": 31659 + }, + { + "epoch": 9.717618170656845, + "grad_norm": 0.15575721859931946, + "learning_rate": 2.094240102379852e-07, + "loss": 1.686, + "step": 31660 + }, + { + "epoch": 9.71792510742787, + 
"grad_norm": 0.10441846400499344, + "learning_rate": 2.0896979873782918e-07, + "loss": 1.6717, + "step": 31661 + }, + { + "epoch": 9.718232044198896, + "grad_norm": 0.13644640147686005, + "learning_rate": 2.0851607930180706e-07, + "loss": 1.715, + "step": 31662 + }, + { + "epoch": 9.718538980969921, + "grad_norm": 0.1860501617193222, + "learning_rate": 2.0806285193442077e-07, + "loss": 1.707, + "step": 31663 + }, + { + "epoch": 9.718845917740946, + "grad_norm": 0.12100571393966675, + "learning_rate": 2.0761011664013897e-07, + "loss": 1.6997, + "step": 31664 + }, + { + "epoch": 9.71915285451197, + "grad_norm": 0.09347312152385712, + "learning_rate": 2.0715787342343584e-07, + "loss": 1.6688, + "step": 31665 + }, + { + "epoch": 9.719459791282995, + "grad_norm": 0.19816496968269348, + "learning_rate": 2.067061222887856e-07, + "loss": 1.713, + "step": 31666 + }, + { + "epoch": 9.71976672805402, + "grad_norm": 0.16399987041950226, + "learning_rate": 2.0625486324065135e-07, + "loss": 1.7973, + "step": 31667 + }, + { + "epoch": 9.720073664825046, + "grad_norm": 0.12751246988773346, + "learning_rate": 2.058040962834906e-07, + "loss": 1.7204, + "step": 31668 + }, + { + "epoch": 9.720380601596071, + "grad_norm": 0.16934554278850555, + "learning_rate": 2.0535382142176096e-07, + "loss": 1.7078, + "step": 31669 + }, + { + "epoch": 9.720687538367097, + "grad_norm": 0.18634845316410065, + "learning_rate": 2.0490403865990325e-07, + "loss": 1.7486, + "step": 31670 + }, + { + "epoch": 9.720994475138122, + "grad_norm": 0.1632041186094284, + "learning_rate": 2.0445474800237508e-07, + "loss": 1.7102, + "step": 31671 + }, + { + "epoch": 9.721301411909147, + "grad_norm": 0.13699625432491302, + "learning_rate": 2.0400594945361172e-07, + "loss": 1.7066, + "step": 31672 + }, + { + "epoch": 9.721608348680173, + "grad_norm": 0.11776915192604065, + "learning_rate": 2.0355764301804858e-07, + "loss": 1.7083, + "step": 31673 + }, + { + "epoch": 9.721915285451198, + "grad_norm": 
0.10446945577859879, + "learning_rate": 2.031098287001154e-07, + "loss": 1.6481, + "step": 31674 + }, + { + "epoch": 9.722222222222221, + "grad_norm": 0.09323536604642868, + "learning_rate": 2.026625065042309e-07, + "loss": 1.6474, + "step": 31675 + }, + { + "epoch": 9.722529158993247, + "grad_norm": 0.11908341199159622, + "learning_rate": 2.022156764348304e-07, + "loss": 1.6824, + "step": 31676 + }, + { + "epoch": 9.722836095764272, + "grad_norm": 0.14512252807617188, + "learning_rate": 2.0176933849631596e-07, + "loss": 1.6755, + "step": 31677 + }, + { + "epoch": 9.723143032535297, + "grad_norm": 0.11505481600761414, + "learning_rate": 2.0132349269311178e-07, + "loss": 1.6893, + "step": 31678 + }, + { + "epoch": 9.723449969306323, + "grad_norm": 0.12112774699926376, + "learning_rate": 2.0087813902960884e-07, + "loss": 1.6801, + "step": 31679 + }, + { + "epoch": 9.723756906077348, + "grad_norm": 0.14546430110931396, + "learning_rate": 2.0043327751022579e-07, + "loss": 1.7217, + "step": 31680 + }, + { + "epoch": 9.724063842848373, + "grad_norm": 0.1829695701599121, + "learning_rate": 1.9998890813934247e-07, + "loss": 1.7101, + "step": 31681 + }, + { + "epoch": 9.724370779619399, + "grad_norm": 0.1490027755498886, + "learning_rate": 1.995450309213609e-07, + "loss": 1.7312, + "step": 31682 + }, + { + "epoch": 9.724677716390424, + "grad_norm": 0.14473678171634674, + "learning_rate": 1.9910164586066093e-07, + "loss": 1.6541, + "step": 31683 + }, + { + "epoch": 9.72498465316145, + "grad_norm": 0.15499809384346008, + "learning_rate": 1.9865875296162793e-07, + "loss": 1.7028, + "step": 31684 + }, + { + "epoch": 9.725291589932475, + "grad_norm": 0.17470933496952057, + "learning_rate": 1.9821635222864176e-07, + "loss": 1.7429, + "step": 31685 + }, + { + "epoch": 9.725598526703498, + "grad_norm": 0.16423273086547852, + "learning_rate": 1.977744436660711e-07, + "loss": 1.6993, + "step": 31686 + }, + { + "epoch": 9.725905463474524, + "grad_norm": 0.1388999968767166, + 
"learning_rate": 1.9733302727827918e-07, + "loss": 1.7257, + "step": 31687 + }, + { + "epoch": 9.726212400245549, + "grad_norm": 0.12693311274051666, + "learning_rate": 1.9689210306963467e-07, + "loss": 1.6897, + "step": 31688 + }, + { + "epoch": 9.726519337016574, + "grad_norm": 0.15273548662662506, + "learning_rate": 1.9645167104449524e-07, + "loss": 1.7357, + "step": 31689 + }, + { + "epoch": 9.7268262737876, + "grad_norm": 0.15459993481636047, + "learning_rate": 1.960117312072074e-07, + "loss": 1.7333, + "step": 31690 + }, + { + "epoch": 9.727133210558625, + "grad_norm": 0.13200953602790833, + "learning_rate": 1.9557228356212875e-07, + "loss": 1.6882, + "step": 31691 + }, + { + "epoch": 9.72744014732965, + "grad_norm": 0.10575802624225616, + "learning_rate": 1.9513332811358919e-07, + "loss": 1.6204, + "step": 31692 + }, + { + "epoch": 9.727747084100676, + "grad_norm": 0.20376244187355042, + "learning_rate": 1.9469486486593525e-07, + "loss": 1.7743, + "step": 31693 + }, + { + "epoch": 9.728054020871701, + "grad_norm": 0.10688602179288864, + "learning_rate": 1.9425689382350232e-07, + "loss": 1.6624, + "step": 31694 + }, + { + "epoch": 9.728360957642726, + "grad_norm": 0.15587441623210907, + "learning_rate": 1.9381941499060918e-07, + "loss": 1.6399, + "step": 31695 + }, + { + "epoch": 9.72866789441375, + "grad_norm": 0.1695834845304489, + "learning_rate": 1.9338242837159126e-07, + "loss": 1.7186, + "step": 31696 + }, + { + "epoch": 9.728974831184775, + "grad_norm": 0.14353398978710175, + "learning_rate": 1.9294593397075623e-07, + "loss": 1.6899, + "step": 31697 + }, + { + "epoch": 9.7292817679558, + "grad_norm": 0.12760649621486664, + "learning_rate": 1.9250993179242284e-07, + "loss": 1.7436, + "step": 31698 + }, + { + "epoch": 9.729588704726826, + "grad_norm": 0.16516657173633575, + "learning_rate": 1.920744218409043e-07, + "loss": 1.7082, + "step": 31699 + }, + { + "epoch": 9.729895641497851, + "grad_norm": 0.10934159904718399, + "learning_rate": 
1.9163940412049165e-07, + "loss": 1.6901, + "step": 31700 + }, + { + "epoch": 9.730202578268877, + "grad_norm": 0.16668133437633514, + "learning_rate": 1.9120487863549807e-07, + "loss": 1.7259, + "step": 31701 + }, + { + "epoch": 9.730509515039902, + "grad_norm": 0.12656927108764648, + "learning_rate": 1.9077084539020908e-07, + "loss": 1.6655, + "step": 31702 + }, + { + "epoch": 9.730816451810927, + "grad_norm": 0.13380050659179688, + "learning_rate": 1.903373043889156e-07, + "loss": 1.7056, + "step": 31703 + }, + { + "epoch": 9.731123388581953, + "grad_norm": 0.19093738496303558, + "learning_rate": 1.8990425563590319e-07, + "loss": 1.7915, + "step": 31704 + }, + { + "epoch": 9.731430325352978, + "grad_norm": 0.14314888417720795, + "learning_rate": 1.8947169913545725e-07, + "loss": 1.6864, + "step": 31705 + }, + { + "epoch": 9.731737262124003, + "grad_norm": 0.12564614415168762, + "learning_rate": 1.8903963489184107e-07, + "loss": 1.6877, + "step": 31706 + }, + { + "epoch": 9.732044198895027, + "grad_norm": 0.15374313294887543, + "learning_rate": 1.8860806290932897e-07, + "loss": 1.6809, + "step": 31707 + }, + { + "epoch": 9.732351135666052, + "grad_norm": 0.16379213333129883, + "learning_rate": 1.8817698319219535e-07, + "loss": 1.7481, + "step": 31708 + }, + { + "epoch": 9.732658072437077, + "grad_norm": 0.24672576785087585, + "learning_rate": 1.8774639574468677e-07, + "loss": 1.7644, + "step": 31709 + }, + { + "epoch": 9.732965009208103, + "grad_norm": 0.13296177983283997, + "learning_rate": 1.8731630057106653e-07, + "loss": 1.7087, + "step": 31710 + }, + { + "epoch": 9.733271945979128, + "grad_norm": 0.12447187304496765, + "learning_rate": 1.868866976755812e-07, + "loss": 1.7066, + "step": 31711 + }, + { + "epoch": 9.733578882750153, + "grad_norm": 0.15150503814220428, + "learning_rate": 1.8645758706247741e-07, + "loss": 1.7114, + "step": 31712 + }, + { + "epoch": 9.733885819521179, + "grad_norm": 0.1416541039943695, + "learning_rate": 1.8602896873599619e-07, + 
"loss": 1.695, + "step": 31713 + }, + { + "epoch": 9.734192756292204, + "grad_norm": 0.18281929194927216, + "learning_rate": 1.8560084270037304e-07, + "loss": 1.6934, + "step": 31714 + }, + { + "epoch": 9.73449969306323, + "grad_norm": 0.16674144566059113, + "learning_rate": 1.8517320895984347e-07, + "loss": 1.7168, + "step": 31715 + }, + { + "epoch": 9.734806629834255, + "grad_norm": 0.12002552300691605, + "learning_rate": 1.8474606751862632e-07, + "loss": 1.6761, + "step": 31716 + }, + { + "epoch": 9.735113566605278, + "grad_norm": 0.12910617887973785, + "learning_rate": 1.84319418380946e-07, + "loss": 1.6985, + "step": 31717 + }, + { + "epoch": 9.735420503376304, + "grad_norm": 0.14131152629852295, + "learning_rate": 1.838932615510214e-07, + "loss": 1.6874, + "step": 31718 + }, + { + "epoch": 9.735727440147329, + "grad_norm": 0.11726522445678711, + "learning_rate": 1.834675970330546e-07, + "loss": 1.6776, + "step": 31719 + }, + { + "epoch": 9.736034376918354, + "grad_norm": 0.13109390437602997, + "learning_rate": 1.830424248312701e-07, + "loss": 1.7053, + "step": 31720 + }, + { + "epoch": 9.73634131368938, + "grad_norm": 0.17507077753543854, + "learning_rate": 1.826177449498534e-07, + "loss": 1.7277, + "step": 31721 + }, + { + "epoch": 9.736648250460405, + "grad_norm": 0.17200970649719238, + "learning_rate": 1.821935573930067e-07, + "loss": 1.6439, + "step": 31722 + }, + { + "epoch": 9.73695518723143, + "grad_norm": 0.11237013339996338, + "learning_rate": 1.8176986216492665e-07, + "loss": 1.6619, + "step": 31723 + }, + { + "epoch": 9.737262124002456, + "grad_norm": 0.1799420714378357, + "learning_rate": 1.8134665926978767e-07, + "loss": 1.7776, + "step": 31724 + }, + { + "epoch": 9.737569060773481, + "grad_norm": 0.13868120312690735, + "learning_rate": 1.809239487117864e-07, + "loss": 1.7625, + "step": 31725 + }, + { + "epoch": 9.737875997544506, + "grad_norm": 0.2101743519306183, + "learning_rate": 1.8050173049509177e-07, + "loss": 1.7365, + "step": 31726 + }, 
+ { + "epoch": 9.738182934315532, + "grad_norm": 0.14918360114097595, + "learning_rate": 1.8008000462388375e-07, + "loss": 1.724, + "step": 31727 + }, + { + "epoch": 9.738489871086557, + "grad_norm": 0.14308972656726837, + "learning_rate": 1.7965877110232565e-07, + "loss": 1.7002, + "step": 31728 + }, + { + "epoch": 9.73879680785758, + "grad_norm": 0.09658967703580856, + "learning_rate": 1.792380299345753e-07, + "loss": 1.659, + "step": 31729 + }, + { + "epoch": 9.739103744628606, + "grad_norm": 0.14846646785736084, + "learning_rate": 1.7881778112479596e-07, + "loss": 1.7108, + "step": 31730 + }, + { + "epoch": 9.739410681399631, + "grad_norm": 0.13030952215194702, + "learning_rate": 1.783980246771455e-07, + "loss": 1.6758, + "step": 31731 + }, + { + "epoch": 9.739717618170657, + "grad_norm": 0.15918081998825073, + "learning_rate": 1.77978760595765e-07, + "loss": 1.7569, + "step": 31732 + }, + { + "epoch": 9.740024554941682, + "grad_norm": 0.15745976567268372, + "learning_rate": 1.7755998888479563e-07, + "loss": 1.7562, + "step": 31733 + }, + { + "epoch": 9.740331491712707, + "grad_norm": 0.21078935265541077, + "learning_rate": 1.7714170954838405e-07, + "loss": 1.8209, + "step": 31734 + }, + { + "epoch": 9.740638428483733, + "grad_norm": 0.14896774291992188, + "learning_rate": 1.767239225906603e-07, + "loss": 1.6878, + "step": 31735 + }, + { + "epoch": 9.740945365254758, + "grad_norm": 0.146200031042099, + "learning_rate": 1.7630662801575438e-07, + "loss": 1.7155, + "step": 31736 + }, + { + "epoch": 9.741252302025783, + "grad_norm": 0.09577162563800812, + "learning_rate": 1.7588982582778523e-07, + "loss": 1.6537, + "step": 31737 + }, + { + "epoch": 9.741559238796809, + "grad_norm": 0.09476766735315323, + "learning_rate": 1.7547351603088292e-07, + "loss": 1.6383, + "step": 31738 + }, + { + "epoch": 9.741866175567832, + "grad_norm": 0.166427344083786, + "learning_rate": 1.7505769862914412e-07, + "loss": 1.7393, + "step": 31739 + }, + { + "epoch": 9.742173112338858, + 
"grad_norm": 0.12341952323913574, + "learning_rate": 1.7464237362669333e-07, + "loss": 1.6721, + "step": 31740 + }, + { + "epoch": 9.742480049109883, + "grad_norm": 0.16770128905773163, + "learning_rate": 1.7422754102763283e-07, + "loss": 1.7883, + "step": 31741 + }, + { + "epoch": 9.742786985880908, + "grad_norm": 0.099067822098732, + "learning_rate": 1.7381320083605935e-07, + "loss": 1.6347, + "step": 31742 + }, + { + "epoch": 9.743093922651934, + "grad_norm": 0.12981869280338287, + "learning_rate": 1.7339935305606404e-07, + "loss": 1.7064, + "step": 31743 + }, + { + "epoch": 9.743400859422959, + "grad_norm": 0.12666809558868408, + "learning_rate": 1.7298599769173806e-07, + "loss": 1.6429, + "step": 31744 + }, + { + "epoch": 9.743707796193984, + "grad_norm": 0.14717376232147217, + "learning_rate": 1.7257313474717817e-07, + "loss": 1.7033, + "step": 31745 + }, + { + "epoch": 9.74401473296501, + "grad_norm": 0.12324973195791245, + "learning_rate": 1.7216076422644777e-07, + "loss": 1.7018, + "step": 31746 + }, + { + "epoch": 9.744321669736035, + "grad_norm": 0.15551744401454926, + "learning_rate": 1.717488861336325e-07, + "loss": 1.7483, + "step": 31747 + }, + { + "epoch": 9.74462860650706, + "grad_norm": 0.1447838693857193, + "learning_rate": 1.7133750047280128e-07, + "loss": 1.709, + "step": 31748 + }, + { + "epoch": 9.744935543278086, + "grad_norm": 0.10742588341236115, + "learning_rate": 1.7092660724801756e-07, + "loss": 1.6927, + "step": 31749 + }, + { + "epoch": 9.745242480049109, + "grad_norm": 0.16011138260364532, + "learning_rate": 1.7051620646333922e-07, + "loss": 1.7005, + "step": 31750 + }, + { + "epoch": 9.745549416820134, + "grad_norm": 0.11633095890283585, + "learning_rate": 1.7010629812282962e-07, + "loss": 1.6779, + "step": 31751 + }, + { + "epoch": 9.74585635359116, + "grad_norm": 0.14908172190189362, + "learning_rate": 1.696968822305356e-07, + "loss": 1.7064, + "step": 31752 + }, + { + "epoch": 9.746163290362185, + "grad_norm": 
0.11630599200725555, + "learning_rate": 1.692879587904983e-07, + "loss": 1.6689, + "step": 31753 + }, + { + "epoch": 9.74647022713321, + "grad_norm": 0.15253309905529022, + "learning_rate": 1.6887952780677008e-07, + "loss": 1.6946, + "step": 31754 + }, + { + "epoch": 9.746777163904236, + "grad_norm": 0.11310866475105286, + "learning_rate": 1.6847158928338103e-07, + "loss": 1.6834, + "step": 31755 + }, + { + "epoch": 9.747084100675261, + "grad_norm": 0.13006237149238586, + "learning_rate": 1.6806414322436127e-07, + "loss": 1.6968, + "step": 31756 + }, + { + "epoch": 9.747391037446286, + "grad_norm": 0.16225489974021912, + "learning_rate": 1.676571896337409e-07, + "loss": 1.7624, + "step": 31757 + }, + { + "epoch": 9.747697974217312, + "grad_norm": 0.16863548755645752, + "learning_rate": 1.672507285155389e-07, + "loss": 1.8187, + "step": 31758 + }, + { + "epoch": 9.748004910988337, + "grad_norm": 0.17859725654125214, + "learning_rate": 1.6684475987377434e-07, + "loss": 1.7323, + "step": 31759 + }, + { + "epoch": 9.74831184775936, + "grad_norm": 0.14921754598617554, + "learning_rate": 1.664392837124551e-07, + "loss": 1.6843, + "step": 31760 + }, + { + "epoch": 9.748618784530386, + "grad_norm": 0.12703189253807068, + "learning_rate": 1.6603430003558906e-07, + "loss": 1.6854, + "step": 31761 + }, + { + "epoch": 9.748925721301411, + "grad_norm": 0.09212498366832733, + "learning_rate": 1.6562980884718414e-07, + "loss": 1.6299, + "step": 31762 + }, + { + "epoch": 9.749232658072437, + "grad_norm": 0.14094288647174835, + "learning_rate": 1.6522581015123718e-07, + "loss": 1.7041, + "step": 31763 + }, + { + "epoch": 9.749539594843462, + "grad_norm": 0.16065463423728943, + "learning_rate": 1.6482230395173382e-07, + "loss": 1.7474, + "step": 31764 + }, + { + "epoch": 9.749846531614487, + "grad_norm": 0.13509607315063477, + "learning_rate": 1.6441929025266533e-07, + "loss": 1.7262, + "step": 31765 + }, + { + "epoch": 9.750153468385513, + "grad_norm": 0.20273075997829437, + 
"learning_rate": 1.6401676905801743e-07, + "loss": 1.7401, + "step": 31766 + }, + { + "epoch": 9.750460405156538, + "grad_norm": 0.14658035337924957, + "learning_rate": 1.6361474037176473e-07, + "loss": 1.766, + "step": 31767 + }, + { + "epoch": 9.750767341927563, + "grad_norm": 0.13443495333194733, + "learning_rate": 1.6321320419788177e-07, + "loss": 1.6687, + "step": 31768 + }, + { + "epoch": 9.751074278698589, + "grad_norm": 0.15590953826904297, + "learning_rate": 1.628121605403321e-07, + "loss": 1.6915, + "step": 31769 + }, + { + "epoch": 9.751381215469614, + "grad_norm": 0.11670281738042831, + "learning_rate": 1.6241160940308476e-07, + "loss": 1.7105, + "step": 31770 + }, + { + "epoch": 9.75168815224064, + "grad_norm": 0.12388762086629868, + "learning_rate": 1.6201155079010322e-07, + "loss": 1.7009, + "step": 31771 + }, + { + "epoch": 9.751995089011663, + "grad_norm": 0.15842701494693756, + "learning_rate": 1.6161198470532878e-07, + "loss": 1.7133, + "step": 31772 + }, + { + "epoch": 9.752302025782688, + "grad_norm": 0.1379842609167099, + "learning_rate": 1.612129111527194e-07, + "loss": 1.7162, + "step": 31773 + }, + { + "epoch": 9.752608962553714, + "grad_norm": 0.1491837501525879, + "learning_rate": 1.6081433013621084e-07, + "loss": 1.7535, + "step": 31774 + }, + { + "epoch": 9.752915899324739, + "grad_norm": 0.17003466188907623, + "learning_rate": 1.6041624165974989e-07, + "loss": 1.746, + "step": 31775 + }, + { + "epoch": 9.753222836095764, + "grad_norm": 0.15136978030204773, + "learning_rate": 1.6001864572726676e-07, + "loss": 1.7365, + "step": 31776 + }, + { + "epoch": 9.75352977286679, + "grad_norm": 0.12367337197065353, + "learning_rate": 1.596215423426861e-07, + "loss": 1.706, + "step": 31777 + }, + { + "epoch": 9.753836709637815, + "grad_norm": 0.16162081062793732, + "learning_rate": 1.5922493150994365e-07, + "loss": 1.7657, + "step": 31778 + }, + { + "epoch": 9.75414364640884, + "grad_norm": 0.11211063712835312, + "learning_rate": 
1.5882881323295294e-07, + "loss": 1.6785, + "step": 31779 + }, + { + "epoch": 9.754450583179866, + "grad_norm": 0.12570419907569885, + "learning_rate": 1.584331875156275e-07, + "loss": 1.6994, + "step": 31780 + }, + { + "epoch": 9.754757519950891, + "grad_norm": 0.11583932489156723, + "learning_rate": 1.5803805436188092e-07, + "loss": 1.6528, + "step": 31781 + }, + { + "epoch": 9.755064456721914, + "grad_norm": 0.1248580664396286, + "learning_rate": 1.5764341377561554e-07, + "loss": 1.7237, + "step": 31782 + }, + { + "epoch": 9.75537139349294, + "grad_norm": 0.1245606392621994, + "learning_rate": 1.572492657607283e-07, + "loss": 1.6775, + "step": 31783 + }, + { + "epoch": 9.755678330263965, + "grad_norm": 0.13548308610916138, + "learning_rate": 1.5685561032111607e-07, + "loss": 1.7073, + "step": 31784 + }, + { + "epoch": 9.75598526703499, + "grad_norm": 0.12355189025402069, + "learning_rate": 1.5646244746067572e-07, + "loss": 1.6977, + "step": 31785 + }, + { + "epoch": 9.756292203806016, + "grad_norm": 0.14441610872745514, + "learning_rate": 1.5606977718328197e-07, + "loss": 1.7283, + "step": 31786 + }, + { + "epoch": 9.756599140577041, + "grad_norm": 0.12972392141819, + "learning_rate": 1.556775994928261e-07, + "loss": 1.7062, + "step": 31787 + }, + { + "epoch": 9.756906077348066, + "grad_norm": 0.1203489899635315, + "learning_rate": 1.552859143931773e-07, + "loss": 1.6927, + "step": 31788 + }, + { + "epoch": 9.757213014119092, + "grad_norm": 0.14732889831066132, + "learning_rate": 1.548947218882102e-07, + "loss": 1.7197, + "step": 31789 + }, + { + "epoch": 9.757519950890117, + "grad_norm": 0.20930984616279602, + "learning_rate": 1.5450402198178283e-07, + "loss": 1.7926, + "step": 31790 + }, + { + "epoch": 9.757826887661142, + "grad_norm": 0.15674839913845062, + "learning_rate": 1.5411381467776986e-07, + "loss": 1.7204, + "step": 31791 + }, + { + "epoch": 9.758133824432168, + "grad_norm": 0.15836498141288757, + "learning_rate": 1.537240999800127e-07, + "loss": 
1.7032, + "step": 31792 + }, + { + "epoch": 9.758440761203191, + "grad_norm": 0.11274401843547821, + "learning_rate": 1.5333487789237488e-07, + "loss": 1.6818, + "step": 31793 + }, + { + "epoch": 9.758747697974217, + "grad_norm": 0.10347522795200348, + "learning_rate": 1.5294614841869226e-07, + "loss": 1.6521, + "step": 31794 + }, + { + "epoch": 9.759054634745242, + "grad_norm": 0.17067036032676697, + "learning_rate": 1.525579115628173e-07, + "loss": 1.716, + "step": 31795 + }, + { + "epoch": 9.759361571516267, + "grad_norm": 0.11318463832139969, + "learning_rate": 1.5217016732858024e-07, + "loss": 1.6751, + "step": 31796 + }, + { + "epoch": 9.759668508287293, + "grad_norm": 0.15316587686538696, + "learning_rate": 1.517829157198114e-07, + "loss": 1.7025, + "step": 31797 + }, + { + "epoch": 9.759975445058318, + "grad_norm": 0.13108935952186584, + "learning_rate": 1.5139615674034658e-07, + "loss": 1.671, + "step": 31798 + }, + { + "epoch": 9.760282381829343, + "grad_norm": 0.09524109214544296, + "learning_rate": 1.5100989039399939e-07, + "loss": 1.6556, + "step": 31799 + }, + { + "epoch": 9.760589318600369, + "grad_norm": 0.13735005259513855, + "learning_rate": 1.5062411668458898e-07, + "loss": 1.6774, + "step": 31800 + }, + { + "epoch": 9.760896255371394, + "grad_norm": 0.09719503670930862, + "learning_rate": 1.50238835615929e-07, + "loss": 1.6379, + "step": 31801 + }, + { + "epoch": 9.76120319214242, + "grad_norm": 0.10058867186307907, + "learning_rate": 1.4985404719182194e-07, + "loss": 1.6724, + "step": 31802 + }, + { + "epoch": 9.761510128913443, + "grad_norm": 0.15335088968276978, + "learning_rate": 1.4946975141608143e-07, + "loss": 1.7056, + "step": 31803 + }, + { + "epoch": 9.761817065684468, + "grad_norm": 0.10160773992538452, + "learning_rate": 1.4908594829249889e-07, + "loss": 1.679, + "step": 31804 + }, + { + "epoch": 9.762124002455494, + "grad_norm": 0.14888420701026917, + "learning_rate": 1.487026378248657e-07, + "loss": 1.7528, + "step": 31805 + }, + { 
+ "epoch": 9.762430939226519, + "grad_norm": 0.15325312316417694, + "learning_rate": 1.483198200169733e-07, + "loss": 1.7099, + "step": 31806 + }, + { + "epoch": 9.762737875997544, + "grad_norm": 0.15677091479301453, + "learning_rate": 1.47937494872602e-07, + "loss": 1.7254, + "step": 31807 + }, + { + "epoch": 9.76304481276857, + "grad_norm": 0.1600019633769989, + "learning_rate": 1.4755566239553209e-07, + "loss": 1.7397, + "step": 31808 + }, + { + "epoch": 9.763351749539595, + "grad_norm": 0.12324594706296921, + "learning_rate": 1.4717432258953834e-07, + "loss": 1.6872, + "step": 31809 + }, + { + "epoch": 9.76365868631062, + "grad_norm": 0.14790895581245422, + "learning_rate": 1.467934754583844e-07, + "loss": 1.7191, + "step": 31810 + }, + { + "epoch": 9.763965623081646, + "grad_norm": 0.17064516246318817, + "learning_rate": 1.464131210058395e-07, + "loss": 1.7568, + "step": 31811 + }, + { + "epoch": 9.764272559852671, + "grad_norm": 0.10418350994586945, + "learning_rate": 1.460332592356617e-07, + "loss": 1.6212, + "step": 31812 + }, + { + "epoch": 9.764579496623696, + "grad_norm": 0.13646866381168365, + "learning_rate": 1.4565389015159803e-07, + "loss": 1.7569, + "step": 31813 + }, + { + "epoch": 9.764886433394722, + "grad_norm": 0.1761285811662674, + "learning_rate": 1.452750137574066e-07, + "loss": 1.7455, + "step": 31814 + }, + { + "epoch": 9.765193370165745, + "grad_norm": 0.13357558846473694, + "learning_rate": 1.4489663005682885e-07, + "loss": 1.7087, + "step": 31815 + }, + { + "epoch": 9.76550030693677, + "grad_norm": 0.13213472068309784, + "learning_rate": 1.445187390536007e-07, + "loss": 1.6868, + "step": 31816 + }, + { + "epoch": 9.765807243707796, + "grad_norm": 0.1197780966758728, + "learning_rate": 1.4414134075146358e-07, + "loss": 1.6629, + "step": 31817 + }, + { + "epoch": 9.766114180478821, + "grad_norm": 0.15487425029277802, + "learning_rate": 1.437644351541423e-07, + "loss": 1.7581, + "step": 31818 + }, + { + "epoch": 9.766421117249847, + 
"grad_norm": 0.1623966544866562, + "learning_rate": 1.4338802226536165e-07, + "loss": 1.7389, + "step": 31819 + }, + { + "epoch": 9.766728054020872, + "grad_norm": 0.14654842019081116, + "learning_rate": 1.4301210208884085e-07, + "loss": 1.7342, + "step": 31820 + }, + { + "epoch": 9.767034990791897, + "grad_norm": 0.10672096908092499, + "learning_rate": 1.4263667462829923e-07, + "loss": 1.6767, + "step": 31821 + }, + { + "epoch": 9.767341927562923, + "grad_norm": 0.15439334511756897, + "learning_rate": 1.4226173988744485e-07, + "loss": 1.7443, + "step": 31822 + }, + { + "epoch": 9.767648864333948, + "grad_norm": 0.15827670693397522, + "learning_rate": 1.4188729786998034e-07, + "loss": 1.7237, + "step": 31823 + }, + { + "epoch": 9.767955801104973, + "grad_norm": 0.19204497337341309, + "learning_rate": 1.4151334857960828e-07, + "loss": 1.7565, + "step": 31824 + }, + { + "epoch": 9.768262737875997, + "grad_norm": 0.1090131625533104, + "learning_rate": 1.4113989202002575e-07, + "loss": 1.6489, + "step": 31825 + }, + { + "epoch": 9.768569674647022, + "grad_norm": 0.09001188725233078, + "learning_rate": 1.4076692819491865e-07, + "loss": 1.6905, + "step": 31826 + }, + { + "epoch": 9.768876611418047, + "grad_norm": 0.16483090817928314, + "learning_rate": 1.4039445710797849e-07, + "loss": 1.7372, + "step": 31827 + }, + { + "epoch": 9.769183548189073, + "grad_norm": 0.1322876214981079, + "learning_rate": 1.4002247876288565e-07, + "loss": 1.698, + "step": 31828 + }, + { + "epoch": 9.769490484960098, + "grad_norm": 0.13790275156497955, + "learning_rate": 1.3965099316331498e-07, + "loss": 1.7089, + "step": 31829 + }, + { + "epoch": 9.769797421731123, + "grad_norm": 0.1355939358472824, + "learning_rate": 1.392800003129302e-07, + "loss": 1.7114, + "step": 31830 + }, + { + "epoch": 9.770104358502149, + "grad_norm": 0.11927379667758942, + "learning_rate": 1.389095002154117e-07, + "loss": 1.6954, + "step": 31831 + }, + { + "epoch": 9.770411295273174, + "grad_norm": 
0.14327041804790497, + "learning_rate": 1.3853949287441215e-07, + "loss": 1.6788, + "step": 31832 + }, + { + "epoch": 9.7707182320442, + "grad_norm": 0.11641334742307663, + "learning_rate": 1.381699782935897e-07, + "loss": 1.6544, + "step": 31833 + }, + { + "epoch": 9.771025168815225, + "grad_norm": 0.1395263820886612, + "learning_rate": 1.3780095647659696e-07, + "loss": 1.7292, + "step": 31834 + }, + { + "epoch": 9.77133210558625, + "grad_norm": 0.09742346405982971, + "learning_rate": 1.3743242742708108e-07, + "loss": 1.6545, + "step": 31835 + }, + { + "epoch": 9.771639042357274, + "grad_norm": 0.1271921843290329, + "learning_rate": 1.3706439114868354e-07, + "loss": 1.6955, + "step": 31836 + }, + { + "epoch": 9.771945979128299, + "grad_norm": 0.1599912941455841, + "learning_rate": 1.366968476450403e-07, + "loss": 1.7292, + "step": 31837 + }, + { + "epoch": 9.772252915899324, + "grad_norm": 0.12538601458072662, + "learning_rate": 1.3632979691978186e-07, + "loss": 1.6533, + "step": 31838 + }, + { + "epoch": 9.77255985267035, + "grad_norm": 0.14297179877758026, + "learning_rate": 1.3596323897654418e-07, + "loss": 1.6559, + "step": 31839 + }, + { + "epoch": 9.772866789441375, + "grad_norm": 0.2182641178369522, + "learning_rate": 1.3559717381894098e-07, + "loss": 1.6973, + "step": 31840 + }, + { + "epoch": 9.7731737262124, + "grad_norm": 0.12279269844293594, + "learning_rate": 1.3523160145059165e-07, + "loss": 1.6714, + "step": 31841 + }, + { + "epoch": 9.773480662983426, + "grad_norm": 0.107692651450634, + "learning_rate": 1.3486652187510994e-07, + "loss": 1.6728, + "step": 31842 + }, + { + "epoch": 9.773787599754451, + "grad_norm": 0.09669972956180573, + "learning_rate": 1.345019350961041e-07, + "loss": 1.6375, + "step": 31843 + }, + { + "epoch": 9.774094536525476, + "grad_norm": 0.2592116594314575, + "learning_rate": 1.341378411171823e-07, + "loss": 1.8469, + "step": 31844 + }, + { + "epoch": 9.774401473296502, + "grad_norm": 0.1268083155155182, + "learning_rate": 
1.337742399419306e-07, + "loss": 1.6961, + "step": 31845 + }, + { + "epoch": 9.774708410067525, + "grad_norm": 0.11143521219491959, + "learning_rate": 1.3341113157395723e-07, + "loss": 1.6951, + "step": 31846 + }, + { + "epoch": 9.77501534683855, + "grad_norm": 0.15122225880622864, + "learning_rate": 1.330485160168371e-07, + "loss": 1.7612, + "step": 31847 + }, + { + "epoch": 9.775322283609576, + "grad_norm": 0.09748775511980057, + "learning_rate": 1.3268639327416177e-07, + "loss": 1.6303, + "step": 31848 + }, + { + "epoch": 9.775629220380601, + "grad_norm": 0.132316455245018, + "learning_rate": 1.3232476334950615e-07, + "loss": 1.6842, + "step": 31849 + }, + { + "epoch": 9.775936157151627, + "grad_norm": 0.13874708116054535, + "learning_rate": 1.319636262464452e-07, + "loss": 1.7458, + "step": 31850 + }, + { + "epoch": 9.776243093922652, + "grad_norm": 0.13404351472854614, + "learning_rate": 1.3160298196854827e-07, + "loss": 1.7135, + "step": 31851 + }, + { + "epoch": 9.776550030693677, + "grad_norm": 0.13872766494750977, + "learning_rate": 1.312428305193847e-07, + "loss": 1.7276, + "step": 31852 + }, + { + "epoch": 9.776856967464703, + "grad_norm": 0.11643758416175842, + "learning_rate": 1.3088317190250165e-07, + "loss": 1.6704, + "step": 31853 + }, + { + "epoch": 9.777163904235728, + "grad_norm": 0.10052239894866943, + "learning_rate": 1.30524006121463e-07, + "loss": 1.6525, + "step": 31854 + }, + { + "epoch": 9.777470841006753, + "grad_norm": 0.1288158893585205, + "learning_rate": 1.3016533317981582e-07, + "loss": 1.7037, + "step": 31855 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 0.17798054218292236, + "learning_rate": 1.2980715308110737e-07, + "loss": 1.7172, + "step": 31856 + }, + { + "epoch": 9.778084714548802, + "grad_norm": 0.19317014515399933, + "learning_rate": 1.294494658288681e-07, + "loss": 1.8052, + "step": 31857 + }, + { + "epoch": 9.778391651319827, + "grad_norm": 0.16400828957557678, + "learning_rate": 1.2909227142664515e-07, + "loss": 
1.7391, + "step": 31858 + }, + { + "epoch": 9.778698588090853, + "grad_norm": 0.17417314648628235, + "learning_rate": 1.2873556987795798e-07, + "loss": 1.7649, + "step": 31859 + }, + { + "epoch": 9.779005524861878, + "grad_norm": 0.1729496717453003, + "learning_rate": 1.2837936118633708e-07, + "loss": 1.7499, + "step": 31860 + }, + { + "epoch": 9.779312461632903, + "grad_norm": 0.14423789083957672, + "learning_rate": 1.2802364535530742e-07, + "loss": 1.6673, + "step": 31861 + }, + { + "epoch": 9.779619398403929, + "grad_norm": 0.15968292951583862, + "learning_rate": 1.2766842238837172e-07, + "loss": 1.662, + "step": 31862 + }, + { + "epoch": 9.779926335174954, + "grad_norm": 0.21190059185028076, + "learning_rate": 1.27313692289055e-07, + "loss": 1.7089, + "step": 31863 + }, + { + "epoch": 9.78023327194598, + "grad_norm": 0.16070419549942017, + "learning_rate": 1.2695945506084884e-07, + "loss": 1.7201, + "step": 31864 + }, + { + "epoch": 9.780540208717005, + "grad_norm": 0.15129558742046356, + "learning_rate": 1.2660571070726157e-07, + "loss": 1.6998, + "step": 31865 + }, + { + "epoch": 9.78084714548803, + "grad_norm": 0.11870043724775314, + "learning_rate": 1.2625245923179042e-07, + "loss": 1.6716, + "step": 31866 + }, + { + "epoch": 9.781154082259054, + "grad_norm": 0.1265040785074234, + "learning_rate": 1.25899700637927e-07, + "loss": 1.7115, + "step": 31867 + }, + { + "epoch": 9.781461019030079, + "grad_norm": 0.15591993927955627, + "learning_rate": 1.2554743492915188e-07, + "loss": 1.741, + "step": 31868 + }, + { + "epoch": 9.781767955801104, + "grad_norm": 0.1468917280435562, + "learning_rate": 1.2519566210895117e-07, + "loss": 1.7176, + "step": 31869 + }, + { + "epoch": 9.78207489257213, + "grad_norm": 0.1019337847828865, + "learning_rate": 1.248443821807943e-07, + "loss": 1.6766, + "step": 31870 + }, + { + "epoch": 9.782381829343155, + "grad_norm": 0.1504385769367218, + "learning_rate": 1.2449359514816183e-07, + "loss": 1.6557, + "step": 31871 + }, + { + 
"epoch": 9.78268876611418, + "grad_norm": 0.11650592088699341, + "learning_rate": 1.2414330101451765e-07, + "loss": 1.6646, + "step": 31872 + }, + { + "epoch": 9.782995702885206, + "grad_norm": 0.13004426658153534, + "learning_rate": 1.2379349978332012e-07, + "loss": 1.6843, + "step": 31873 + }, + { + "epoch": 9.783302639656231, + "grad_norm": 0.1746869832277298, + "learning_rate": 1.234441914580331e-07, + "loss": 1.7148, + "step": 31874 + }, + { + "epoch": 9.783609576427256, + "grad_norm": 0.18265002965927124, + "learning_rate": 1.230953760420983e-07, + "loss": 1.7844, + "step": 31875 + }, + { + "epoch": 9.783916513198282, + "grad_norm": 0.14182110130786896, + "learning_rate": 1.227470535389741e-07, + "loss": 1.7137, + "step": 31876 + }, + { + "epoch": 9.784223449969307, + "grad_norm": 0.12887395918369293, + "learning_rate": 1.2239922395209102e-07, + "loss": 1.7023, + "step": 31877 + }, + { + "epoch": 9.784530386740332, + "grad_norm": 0.15748070180416107, + "learning_rate": 1.2205188728489636e-07, + "loss": 1.7341, + "step": 31878 + }, + { + "epoch": 9.784837323511356, + "grad_norm": 0.13010992109775543, + "learning_rate": 1.2170504354082068e-07, + "loss": 1.6693, + "step": 31879 + }, + { + "epoch": 9.785144260282381, + "grad_norm": 0.15437988936901093, + "learning_rate": 1.2135869272328905e-07, + "loss": 1.6842, + "step": 31880 + }, + { + "epoch": 9.785451197053407, + "grad_norm": 0.12763908505439758, + "learning_rate": 1.2101283483572644e-07, + "loss": 1.6812, + "step": 31881 + }, + { + "epoch": 9.785758133824432, + "grad_norm": 0.1640697419643402, + "learning_rate": 1.206674698815524e-07, + "loss": 1.7035, + "step": 31882 + }, + { + "epoch": 9.786065070595457, + "grad_norm": 0.17316879332065582, + "learning_rate": 1.203225978641753e-07, + "loss": 1.7718, + "step": 31883 + }, + { + "epoch": 9.786372007366483, + "grad_norm": 0.13569143414497375, + "learning_rate": 1.1997821878700355e-07, + "loss": 1.6925, + "step": 31884 + }, + { + "epoch": 9.786678944137508, + 
"grad_norm": 0.12150706350803375, + "learning_rate": 1.1963433265344548e-07, + "loss": 1.6742, + "step": 31885 + }, + { + "epoch": 9.786985880908533, + "grad_norm": 0.120942622423172, + "learning_rate": 1.1929093946689284e-07, + "loss": 1.6946, + "step": 31886 + }, + { + "epoch": 9.787292817679559, + "grad_norm": 0.15821385383605957, + "learning_rate": 1.1894803923074849e-07, + "loss": 1.7163, + "step": 31887 + }, + { + "epoch": 9.787599754450584, + "grad_norm": 0.14717862010002136, + "learning_rate": 1.1860563194839302e-07, + "loss": 1.6732, + "step": 31888 + }, + { + "epoch": 9.787906691221608, + "grad_norm": 0.17104555666446686, + "learning_rate": 1.1826371762321264e-07, + "loss": 1.7943, + "step": 31889 + }, + { + "epoch": 9.788213627992633, + "grad_norm": 0.10379209369421005, + "learning_rate": 1.1792229625858797e-07, + "loss": 1.6595, + "step": 31890 + }, + { + "epoch": 9.788520564763658, + "grad_norm": 0.1118491068482399, + "learning_rate": 1.1758136785788854e-07, + "loss": 1.6865, + "step": 31891 + }, + { + "epoch": 9.788827501534684, + "grad_norm": 0.14659619331359863, + "learning_rate": 1.1724093242448941e-07, + "loss": 1.6714, + "step": 31892 + }, + { + "epoch": 9.789134438305709, + "grad_norm": 0.17299702763557434, + "learning_rate": 1.1690098996175458e-07, + "loss": 1.6933, + "step": 31893 + }, + { + "epoch": 9.789441375076734, + "grad_norm": 0.1982281357049942, + "learning_rate": 1.1656154047303691e-07, + "loss": 1.7074, + "step": 31894 + }, + { + "epoch": 9.78974831184776, + "grad_norm": 0.17668111622333527, + "learning_rate": 1.1622258396170594e-07, + "loss": 1.7077, + "step": 31895 + }, + { + "epoch": 9.790055248618785, + "grad_norm": 0.1569826602935791, + "learning_rate": 1.1588412043109232e-07, + "loss": 1.6985, + "step": 31896 + }, + { + "epoch": 9.79036218538981, + "grad_norm": 0.12177947908639908, + "learning_rate": 1.1554614988454893e-07, + "loss": 1.6546, + "step": 31897 + }, + { + "epoch": 9.790669122160836, + "grad_norm": 
0.1377127766609192, + "learning_rate": 1.1520867232541755e-07, + "loss": 1.6724, + "step": 31898 + }, + { + "epoch": 9.79097605893186, + "grad_norm": 0.13367579877376556, + "learning_rate": 1.1487168775703439e-07, + "loss": 1.715, + "step": 31899 + }, + { + "epoch": 9.791282995702884, + "grad_norm": 0.14254575967788696, + "learning_rate": 1.1453519618273012e-07, + "loss": 1.6968, + "step": 31900 + }, + { + "epoch": 9.79158993247391, + "grad_norm": 0.15228238701820374, + "learning_rate": 1.1419919760582432e-07, + "loss": 1.7052, + "step": 31901 + }, + { + "epoch": 9.791896869244935, + "grad_norm": 0.14899186789989471, + "learning_rate": 1.1386369202964209e-07, + "loss": 1.7289, + "step": 31902 + }, + { + "epoch": 9.79220380601596, + "grad_norm": 0.10609392821788788, + "learning_rate": 1.13528679457503e-07, + "loss": 1.6753, + "step": 31903 + }, + { + "epoch": 9.792510742786986, + "grad_norm": 0.1678643375635147, + "learning_rate": 1.1319415989270443e-07, + "loss": 1.7145, + "step": 31904 + }, + { + "epoch": 9.792817679558011, + "grad_norm": 0.1617528349161148, + "learning_rate": 1.1286013333856594e-07, + "loss": 1.7059, + "step": 31905 + }, + { + "epoch": 9.793124616329036, + "grad_norm": 0.13943657279014587, + "learning_rate": 1.1252659979837932e-07, + "loss": 1.6964, + "step": 31906 + }, + { + "epoch": 9.793431553100062, + "grad_norm": 0.18889422714710236, + "learning_rate": 1.121935592754475e-07, + "loss": 1.7841, + "step": 31907 + }, + { + "epoch": 9.793738489871087, + "grad_norm": 0.1229872852563858, + "learning_rate": 1.118610117730623e-07, + "loss": 1.7406, + "step": 31908 + }, + { + "epoch": 9.794045426642112, + "grad_norm": 0.1400493085384369, + "learning_rate": 1.115289572945044e-07, + "loss": 1.7183, + "step": 31909 + }, + { + "epoch": 9.794352363413136, + "grad_norm": 0.24427293241024017, + "learning_rate": 1.1119739584305456e-07, + "loss": 1.6977, + "step": 31910 + }, + { + "epoch": 9.794659300184161, + "grad_norm": 0.19268591701984406, + 
"learning_rate": 1.1086632742199343e-07, + "loss": 1.7625, + "step": 31911 + }, + { + "epoch": 9.794966236955187, + "grad_norm": 0.10926581919193268, + "learning_rate": 1.105357520345962e-07, + "loss": 1.68, + "step": 31912 + }, + { + "epoch": 9.795273173726212, + "grad_norm": 0.16322609782218933, + "learning_rate": 1.1020566968412138e-07, + "loss": 1.7282, + "step": 31913 + }, + { + "epoch": 9.795580110497237, + "grad_norm": 0.1540069282054901, + "learning_rate": 1.0987608037383857e-07, + "loss": 1.7384, + "step": 31914 + }, + { + "epoch": 9.795887047268263, + "grad_norm": 0.20092691481113434, + "learning_rate": 1.095469841070007e-07, + "loss": 1.7189, + "step": 31915 + }, + { + "epoch": 9.796193984039288, + "grad_norm": 0.1929512470960617, + "learning_rate": 1.0921838088686076e-07, + "loss": 1.7626, + "step": 31916 + }, + { + "epoch": 9.796500920810313, + "grad_norm": 0.17819680273532867, + "learning_rate": 1.0889027071667168e-07, + "loss": 1.6963, + "step": 31917 + }, + { + "epoch": 9.796807857581339, + "grad_norm": 0.10324428975582123, + "learning_rate": 1.0856265359966422e-07, + "loss": 1.6863, + "step": 31918 + }, + { + "epoch": 9.797114794352364, + "grad_norm": 0.17684327065944672, + "learning_rate": 1.0823552953908578e-07, + "loss": 1.7065, + "step": 31919 + }, + { + "epoch": 9.79742173112339, + "grad_norm": 0.11119870841503143, + "learning_rate": 1.079088985381671e-07, + "loss": 1.6706, + "step": 31920 + }, + { + "epoch": 9.797728667894415, + "grad_norm": 0.16475334763526917, + "learning_rate": 1.0758276060013339e-07, + "loss": 1.7312, + "step": 31921 + }, + { + "epoch": 9.798035604665438, + "grad_norm": 0.08758127689361572, + "learning_rate": 1.0725711572820984e-07, + "loss": 1.6422, + "step": 31922 + }, + { + "epoch": 9.798342541436464, + "grad_norm": 0.17832210659980774, + "learning_rate": 1.069319639256161e-07, + "loss": 1.7204, + "step": 31923 + }, + { + "epoch": 9.798649478207489, + "grad_norm": 0.13107214868068695, + "learning_rate": 
1.0660730519556628e-07, + "loss": 1.6636, + "step": 31924 + }, + { + "epoch": 9.798956414978514, + "grad_norm": 0.10268855839967728, + "learning_rate": 1.0628313954126335e-07, + "loss": 1.6844, + "step": 31925 + }, + { + "epoch": 9.79926335174954, + "grad_norm": 0.16402679681777954, + "learning_rate": 1.0595946696591586e-07, + "loss": 1.7028, + "step": 31926 + }, + { + "epoch": 9.799570288520565, + "grad_norm": 0.1430855542421341, + "learning_rate": 1.056362874727157e-07, + "loss": 1.6792, + "step": 31927 + }, + { + "epoch": 9.79987722529159, + "grad_norm": 0.16672997176647186, + "learning_rate": 1.0531360106486587e-07, + "loss": 1.7266, + "step": 31928 + }, + { + "epoch": 9.800184162062616, + "grad_norm": 0.18226337432861328, + "learning_rate": 1.0499140774555272e-07, + "loss": 1.7236, + "step": 31929 + }, + { + "epoch": 9.800491098833641, + "grad_norm": 0.1977282166481018, + "learning_rate": 1.0466970751795701e-07, + "loss": 1.7739, + "step": 31930 + }, + { + "epoch": 9.800798035604666, + "grad_norm": 0.17272333800792694, + "learning_rate": 1.0434850038525956e-07, + "loss": 1.7177, + "step": 31931 + }, + { + "epoch": 9.80110497237569, + "grad_norm": 0.1538221389055252, + "learning_rate": 1.0402778635063004e-07, + "loss": 1.7038, + "step": 31932 + }, + { + "epoch": 9.801411909146715, + "grad_norm": 0.1327001303434372, + "learning_rate": 1.0370756541724924e-07, + "loss": 1.6885, + "step": 31933 + }, + { + "epoch": 9.80171884591774, + "grad_norm": 0.15500570833683014, + "learning_rate": 1.0338783758827575e-07, + "loss": 1.7787, + "step": 31934 + }, + { + "epoch": 9.802025782688766, + "grad_norm": 0.14874790608882904, + "learning_rate": 1.0306860286686815e-07, + "loss": 1.731, + "step": 31935 + }, + { + "epoch": 9.802332719459791, + "grad_norm": 0.13585814833641052, + "learning_rate": 1.0274986125617947e-07, + "loss": 1.6973, + "step": 31936 + }, + { + "epoch": 9.802639656230816, + "grad_norm": 0.15876494348049164, + "learning_rate": 1.0243161275936274e-07, + "loss": 
1.7255, + "step": 31937 + }, + { + "epoch": 9.802946593001842, + "grad_norm": 0.18510127067565918, + "learning_rate": 1.0211385737956546e-07, + "loss": 1.7395, + "step": 31938 + }, + { + "epoch": 9.803253529772867, + "grad_norm": 0.13107381761074066, + "learning_rate": 1.01796595119924e-07, + "loss": 1.6876, + "step": 31939 + }, + { + "epoch": 9.803560466543892, + "grad_norm": 0.10170239210128784, + "learning_rate": 1.0147982598357474e-07, + "loss": 1.7082, + "step": 31940 + }, + { + "epoch": 9.803867403314918, + "grad_norm": 0.15952639281749725, + "learning_rate": 1.0116354997364851e-07, + "loss": 1.718, + "step": 31941 + }, + { + "epoch": 9.804174340085943, + "grad_norm": 0.11146245896816254, + "learning_rate": 1.0084776709327059e-07, + "loss": 1.6555, + "step": 31942 + }, + { + "epoch": 9.804481276856967, + "grad_norm": 0.13348564505577087, + "learning_rate": 1.0053247734556071e-07, + "loss": 1.7032, + "step": 31943 + }, + { + "epoch": 9.804788213627992, + "grad_norm": 0.10820803791284561, + "learning_rate": 1.0021768073363858e-07, + "loss": 1.7091, + "step": 31944 + }, + { + "epoch": 9.805095150399017, + "grad_norm": 0.11644341796636581, + "learning_rate": 9.990337726061283e-08, + "loss": 1.678, + "step": 31945 + }, + { + "epoch": 9.805402087170043, + "grad_norm": 0.1656201332807541, + "learning_rate": 9.958956692958655e-08, + "loss": 1.7609, + "step": 31946 + }, + { + "epoch": 9.805709023941068, + "grad_norm": 0.12365484982728958, + "learning_rate": 9.927624974366279e-08, + "loss": 1.6953, + "step": 31947 + }, + { + "epoch": 9.806015960712093, + "grad_norm": 0.14887237548828125, + "learning_rate": 9.896342570593909e-08, + "loss": 1.6941, + "step": 31948 + }, + { + "epoch": 9.806322897483119, + "grad_norm": 0.14070530235767365, + "learning_rate": 9.86510948195074e-08, + "loss": 1.7096, + "step": 31949 + }, + { + "epoch": 9.806629834254144, + "grad_norm": 0.14970767498016357, + "learning_rate": 9.833925708745418e-08, + "loss": 1.7519, + "step": 31950 + }, + { + 
"epoch": 9.80693677102517, + "grad_norm": 0.1032944917678833, + "learning_rate": 9.802791251286026e-08, + "loss": 1.6524, + "step": 31951 + }, + { + "epoch": 9.807243707796195, + "grad_norm": 0.13888783752918243, + "learning_rate": 9.771706109880652e-08, + "loss": 1.706, + "step": 31952 + }, + { + "epoch": 9.807550644567218, + "grad_norm": 0.16892662644386292, + "learning_rate": 9.740670284835718e-08, + "loss": 1.7865, + "step": 31953 + }, + { + "epoch": 9.807857581338244, + "grad_norm": 0.15382327139377594, + "learning_rate": 9.709683776458755e-08, + "loss": 1.7028, + "step": 31954 + }, + { + "epoch": 9.808164518109269, + "grad_norm": 0.1603674590587616, + "learning_rate": 9.678746585055077e-08, + "loss": 1.7402, + "step": 31955 + }, + { + "epoch": 9.808471454880294, + "grad_norm": 0.13476061820983887, + "learning_rate": 9.647858710931102e-08, + "loss": 1.7446, + "step": 31956 + }, + { + "epoch": 9.80877839165132, + "grad_norm": 0.14579132199287415, + "learning_rate": 9.617020154392142e-08, + "loss": 1.732, + "step": 31957 + }, + { + "epoch": 9.809085328422345, + "grad_norm": 0.13551945984363556, + "learning_rate": 9.586230915742955e-08, + "loss": 1.7255, + "step": 31958 + }, + { + "epoch": 9.80939226519337, + "grad_norm": 0.120823934674263, + "learning_rate": 9.555490995287186e-08, + "loss": 1.6622, + "step": 31959 + }, + { + "epoch": 9.809699201964396, + "grad_norm": 0.15336096286773682, + "learning_rate": 9.524800393329037e-08, + "loss": 1.7586, + "step": 31960 + }, + { + "epoch": 9.810006138735421, + "grad_norm": 0.14215800166130066, + "learning_rate": 9.494159110172151e-08, + "loss": 1.7418, + "step": 31961 + }, + { + "epoch": 9.810313075506446, + "grad_norm": 0.12923559546470642, + "learning_rate": 9.463567146118513e-08, + "loss": 1.697, + "step": 31962 + }, + { + "epoch": 9.810620012277472, + "grad_norm": 0.11306928843259811, + "learning_rate": 9.43302450147121e-08, + "loss": 1.7063, + "step": 31963 + }, + { + "epoch": 9.810926949048497, + "grad_norm": 
0.1500793844461441, + "learning_rate": 9.40253117653167e-08, + "loss": 1.7492, + "step": 31964 + }, + { + "epoch": 9.81123388581952, + "grad_norm": 0.13170574605464935, + "learning_rate": 9.372087171601873e-08, + "loss": 1.73, + "step": 31965 + }, + { + "epoch": 9.811540822590546, + "grad_norm": 0.15149074792861938, + "learning_rate": 9.341692486981579e-08, + "loss": 1.716, + "step": 31966 + }, + { + "epoch": 9.811847759361571, + "grad_norm": 0.10818208009004593, + "learning_rate": 9.311347122972769e-08, + "loss": 1.6936, + "step": 31967 + }, + { + "epoch": 9.812154696132596, + "grad_norm": 0.1262877881526947, + "learning_rate": 9.281051079873537e-08, + "loss": 1.7042, + "step": 31968 + }, + { + "epoch": 9.812461632903622, + "grad_norm": 0.18285219371318817, + "learning_rate": 9.2508043579842e-08, + "loss": 1.6812, + "step": 31969 + }, + { + "epoch": 9.812768569674647, + "grad_norm": 0.30483585596084595, + "learning_rate": 9.220606957603406e-08, + "loss": 1.8422, + "step": 31970 + }, + { + "epoch": 9.813075506445673, + "grad_norm": 0.14469990134239197, + "learning_rate": 9.190458879030362e-08, + "loss": 1.7624, + "step": 31971 + }, + { + "epoch": 9.813382443216698, + "grad_norm": 0.20810872316360474, + "learning_rate": 9.16036012256205e-08, + "loss": 1.7293, + "step": 31972 + }, + { + "epoch": 9.813689379987723, + "grad_norm": 0.16292090713977814, + "learning_rate": 9.130310688496013e-08, + "loss": 1.681, + "step": 31973 + }, + { + "epoch": 9.813996316758749, + "grad_norm": 0.17718647420406342, + "learning_rate": 9.100310577130345e-08, + "loss": 1.728, + "step": 31974 + }, + { + "epoch": 9.814303253529772, + "grad_norm": 0.13324975967407227, + "learning_rate": 9.070359788759808e-08, + "loss": 1.6776, + "step": 31975 + }, + { + "epoch": 9.814610190300797, + "grad_norm": 0.15602141618728638, + "learning_rate": 9.040458323681389e-08, + "loss": 1.6902, + "step": 31976 + }, + { + "epoch": 9.814917127071823, + "grad_norm": 0.12438742071390152, + "learning_rate": 
9.010606182190962e-08, + "loss": 1.7132, + "step": 31977 + }, + { + "epoch": 9.815224063842848, + "grad_norm": 0.15935616195201874, + "learning_rate": 8.980803364582734e-08, + "loss": 1.7432, + "step": 31978 + }, + { + "epoch": 9.815531000613873, + "grad_norm": 0.18075346946716309, + "learning_rate": 8.951049871151474e-08, + "loss": 1.7268, + "step": 31979 + }, + { + "epoch": 9.815837937384899, + "grad_norm": 0.11405523866415024, + "learning_rate": 8.921345702191386e-08, + "loss": 1.6414, + "step": 31980 + }, + { + "epoch": 9.816144874155924, + "grad_norm": 0.12962454557418823, + "learning_rate": 8.891690857995572e-08, + "loss": 1.6943, + "step": 31981 + }, + { + "epoch": 9.81645181092695, + "grad_norm": 0.14757606387138367, + "learning_rate": 8.862085338857685e-08, + "loss": 1.7313, + "step": 31982 + }, + { + "epoch": 9.816758747697975, + "grad_norm": 0.16997574269771576, + "learning_rate": 8.832529145070267e-08, + "loss": 1.7795, + "step": 31983 + }, + { + "epoch": 9.817065684469, + "grad_norm": 0.13103361427783966, + "learning_rate": 8.80302227692531e-08, + "loss": 1.7153, + "step": 31984 + }, + { + "epoch": 9.817372621240025, + "grad_norm": 0.13774408400058746, + "learning_rate": 8.773564734713691e-08, + "loss": 1.7125, + "step": 31985 + }, + { + "epoch": 9.817679558011049, + "grad_norm": 0.10313444584608078, + "learning_rate": 8.744156518727398e-08, + "loss": 1.7061, + "step": 31986 + }, + { + "epoch": 9.817986494782074, + "grad_norm": 0.14256370067596436, + "learning_rate": 8.71479762925731e-08, + "loss": 1.7078, + "step": 31987 + }, + { + "epoch": 9.8182934315531, + "grad_norm": 0.13552837073802948, + "learning_rate": 8.685488066592639e-08, + "loss": 1.7447, + "step": 31988 + }, + { + "epoch": 9.818600368324125, + "grad_norm": 0.1388518065214157, + "learning_rate": 8.656227831023711e-08, + "loss": 1.7332, + "step": 31989 + }, + { + "epoch": 9.81890730509515, + "grad_norm": 0.09268537908792496, + "learning_rate": 8.627016922839182e-08, + "loss": 1.6371, + 
"step": 31990 + }, + { + "epoch": 9.819214241866176, + "grad_norm": 0.10252194851636887, + "learning_rate": 8.597855342328265e-08, + "loss": 1.6794, + "step": 31991 + }, + { + "epoch": 9.819521178637201, + "grad_norm": 0.08967567980289459, + "learning_rate": 8.568743089778509e-08, + "loss": 1.6455, + "step": 31992 + }, + { + "epoch": 9.819828115408226, + "grad_norm": 0.15265701711177826, + "learning_rate": 8.539680165478569e-08, + "loss": 1.7244, + "step": 31993 + }, + { + "epoch": 9.820135052179252, + "grad_norm": 0.16557417809963226, + "learning_rate": 8.510666569714332e-08, + "loss": 1.7676, + "step": 31994 + }, + { + "epoch": 9.820441988950277, + "grad_norm": 0.09994948655366898, + "learning_rate": 8.481702302773897e-08, + "loss": 1.6743, + "step": 31995 + }, + { + "epoch": 9.8207489257213, + "grad_norm": 0.13728035986423492, + "learning_rate": 8.452787364943149e-08, + "loss": 1.6864, + "step": 31996 + }, + { + "epoch": 9.821055862492326, + "grad_norm": 0.21103262901306152, + "learning_rate": 8.423921756506858e-08, + "loss": 1.7673, + "step": 31997 + }, + { + "epoch": 9.821362799263351, + "grad_norm": 0.146772101521492, + "learning_rate": 8.395105477751464e-08, + "loss": 1.7245, + "step": 31998 + }, + { + "epoch": 9.821669736034377, + "grad_norm": 0.1592164784669876, + "learning_rate": 8.366338528961182e-08, + "loss": 1.7612, + "step": 31999 + }, + { + "epoch": 9.821976672805402, + "grad_norm": 0.15586064755916595, + "learning_rate": 8.337620910420229e-08, + "loss": 1.7142, + "step": 32000 + }, + { + "epoch": 9.822283609576427, + "grad_norm": 0.14506274461746216, + "learning_rate": 8.30895262241338e-08, + "loss": 1.7085, + "step": 32001 + }, + { + "epoch": 9.822590546347453, + "grad_norm": 0.11904678493738174, + "learning_rate": 8.280333665222073e-08, + "loss": 1.7024, + "step": 32002 + }, + { + "epoch": 9.822897483118478, + "grad_norm": 0.14538206160068512, + "learning_rate": 8.251764039131083e-08, + "loss": 1.7207, + "step": 32003 + }, + { + "epoch": 
9.823204419889503, + "grad_norm": 0.17649157345294952, + "learning_rate": 8.223243744421849e-08, + "loss": 1.684, + "step": 32004 + }, + { + "epoch": 9.823511356660529, + "grad_norm": 0.13790307939052582, + "learning_rate": 8.194772781375815e-08, + "loss": 1.7083, + "step": 32005 + }, + { + "epoch": 9.823818293431554, + "grad_norm": 0.12401477247476578, + "learning_rate": 8.166351150274976e-08, + "loss": 1.6712, + "step": 32006 + }, + { + "epoch": 9.824125230202577, + "grad_norm": 0.13443689048290253, + "learning_rate": 8.137978851400219e-08, + "loss": 1.7134, + "step": 32007 + }, + { + "epoch": 9.824432166973603, + "grad_norm": 0.11961400508880615, + "learning_rate": 8.109655885031875e-08, + "loss": 1.6478, + "step": 32008 + }, + { + "epoch": 9.824739103744628, + "grad_norm": 0.14795053005218506, + "learning_rate": 8.081382251449721e-08, + "loss": 1.7182, + "step": 32009 + }, + { + "epoch": 9.825046040515653, + "grad_norm": 0.10425613820552826, + "learning_rate": 8.053157950932977e-08, + "loss": 1.6385, + "step": 32010 + }, + { + "epoch": 9.825352977286679, + "grad_norm": 0.11885244399309158, + "learning_rate": 8.024982983760864e-08, + "loss": 1.6764, + "step": 32011 + }, + { + "epoch": 9.825659914057704, + "grad_norm": 0.11422543227672577, + "learning_rate": 7.99685735021205e-08, + "loss": 1.6778, + "step": 32012 + }, + { + "epoch": 9.82596685082873, + "grad_norm": 0.12039226293563843, + "learning_rate": 7.968781050564089e-08, + "loss": 1.6843, + "step": 32013 + }, + { + "epoch": 9.826273787599755, + "grad_norm": 0.13094797730445862, + "learning_rate": 7.940754085094537e-08, + "loss": 1.6757, + "step": 32014 + }, + { + "epoch": 9.82658072437078, + "grad_norm": 0.14221440255641937, + "learning_rate": 7.91277645407984e-08, + "loss": 1.7307, + "step": 32015 + }, + { + "epoch": 9.826887661141805, + "grad_norm": 0.11989296972751617, + "learning_rate": 7.884848157798109e-08, + "loss": 1.6651, + "step": 32016 + }, + { + "epoch": 9.82719459791283, + "grad_norm": 
0.1768631786108017, + "learning_rate": 7.856969196523567e-08, + "loss": 1.7294, + "step": 32017 + }, + { + "epoch": 9.827501534683854, + "grad_norm": 0.1401507407426834, + "learning_rate": 7.829139570532662e-08, + "loss": 1.6958, + "step": 32018 + }, + { + "epoch": 9.82780847145488, + "grad_norm": 0.1531054675579071, + "learning_rate": 7.801359280099618e-08, + "loss": 1.7176, + "step": 32019 + }, + { + "epoch": 9.828115408225905, + "grad_norm": 0.17227032780647278, + "learning_rate": 7.773628325500326e-08, + "loss": 1.6941, + "step": 32020 + }, + { + "epoch": 9.82842234499693, + "grad_norm": 0.15229587256908417, + "learning_rate": 7.745946707007345e-08, + "loss": 1.6899, + "step": 32021 + }, + { + "epoch": 9.828729281767956, + "grad_norm": 0.1732887476682663, + "learning_rate": 7.718314424895457e-08, + "loss": 1.7557, + "step": 32022 + }, + { + "epoch": 9.829036218538981, + "grad_norm": 0.11568398028612137, + "learning_rate": 7.690731479437218e-08, + "loss": 1.7077, + "step": 32023 + }, + { + "epoch": 9.829343155310006, + "grad_norm": 0.12425289303064346, + "learning_rate": 7.663197870905747e-08, + "loss": 1.6748, + "step": 32024 + }, + { + "epoch": 9.829650092081032, + "grad_norm": 0.13480359315872192, + "learning_rate": 7.635713599571936e-08, + "loss": 1.6874, + "step": 32025 + }, + { + "epoch": 9.829957028852057, + "grad_norm": 0.1616349071264267, + "learning_rate": 7.608278665708346e-08, + "loss": 1.7273, + "step": 32026 + }, + { + "epoch": 9.830263965623082, + "grad_norm": 0.15407976508140564, + "learning_rate": 7.58089306958587e-08, + "loss": 1.6929, + "step": 32027 + }, + { + "epoch": 9.830570902394108, + "grad_norm": 0.14456650614738464, + "learning_rate": 7.553556811475404e-08, + "loss": 1.6634, + "step": 32028 + }, + { + "epoch": 9.830877839165131, + "grad_norm": 0.11235690861940384, + "learning_rate": 7.526269891646176e-08, + "loss": 1.6862, + "step": 32029 + }, + { + "epoch": 9.831184775936157, + "grad_norm": 0.11624839901924133, + "learning_rate": 
7.49903231036908e-08, + "loss": 1.6836, + "step": 32030 + }, + { + "epoch": 9.831491712707182, + "grad_norm": 0.1717003732919693, + "learning_rate": 7.471844067912792e-08, + "loss": 1.7182, + "step": 32031 + }, + { + "epoch": 9.831798649478207, + "grad_norm": 0.1300148069858551, + "learning_rate": 7.444705164545429e-08, + "loss": 1.7267, + "step": 32032 + }, + { + "epoch": 9.832105586249233, + "grad_norm": 0.18420568108558655, + "learning_rate": 7.417615600536221e-08, + "loss": 1.7433, + "step": 32033 + }, + { + "epoch": 9.832412523020258, + "grad_norm": 0.16578641533851624, + "learning_rate": 7.390575376152176e-08, + "loss": 1.7013, + "step": 32034 + }, + { + "epoch": 9.832719459791283, + "grad_norm": 0.19031740725040436, + "learning_rate": 7.363584491660858e-08, + "loss": 1.783, + "step": 32035 + }, + { + "epoch": 9.833026396562309, + "grad_norm": 0.14676955342292786, + "learning_rate": 7.336642947328721e-08, + "loss": 1.6847, + "step": 32036 + }, + { + "epoch": 9.833333333333334, + "grad_norm": 0.10915904492139816, + "learning_rate": 7.309750743422217e-08, + "loss": 1.6494, + "step": 32037 + }, + { + "epoch": 9.83364027010436, + "grad_norm": 0.20945672690868378, + "learning_rate": 7.282907880207245e-08, + "loss": 1.6974, + "step": 32038 + }, + { + "epoch": 9.833947206875383, + "grad_norm": 0.12456732988357544, + "learning_rate": 7.25611435794915e-08, + "loss": 1.6804, + "step": 32039 + }, + { + "epoch": 9.834254143646408, + "grad_norm": 0.1883053332567215, + "learning_rate": 7.229370176911609e-08, + "loss": 1.7588, + "step": 32040 + }, + { + "epoch": 9.834561080417433, + "grad_norm": 0.15548336505889893, + "learning_rate": 7.202675337360521e-08, + "loss": 1.7235, + "step": 32041 + }, + { + "epoch": 9.834868017188459, + "grad_norm": 0.12813648581504822, + "learning_rate": 7.176029839558451e-08, + "loss": 1.6852, + "step": 32042 + }, + { + "epoch": 9.835174953959484, + "grad_norm": 0.1417354941368103, + "learning_rate": 7.149433683769635e-08, + "loss": 1.6887, + 
"step": 32043 + }, + { + "epoch": 9.83548189073051, + "grad_norm": 0.16405703127384186, + "learning_rate": 7.12288687025664e-08, + "loss": 1.793, + "step": 32044 + }, + { + "epoch": 9.835788827501535, + "grad_norm": 0.10414276272058487, + "learning_rate": 7.096389399281478e-08, + "loss": 1.6463, + "step": 32045 + }, + { + "epoch": 9.83609576427256, + "grad_norm": 0.1333547830581665, + "learning_rate": 7.069941271106162e-08, + "loss": 1.6769, + "step": 32046 + }, + { + "epoch": 9.836402701043585, + "grad_norm": 0.13679614663124084, + "learning_rate": 7.043542485992149e-08, + "loss": 1.6981, + "step": 32047 + }, + { + "epoch": 9.83670963781461, + "grad_norm": 0.19633722305297852, + "learning_rate": 7.017193044200343e-08, + "loss": 1.7121, + "step": 32048 + }, + { + "epoch": 9.837016574585636, + "grad_norm": 0.1266251504421234, + "learning_rate": 6.99089294599109e-08, + "loss": 1.6858, + "step": 32049 + }, + { + "epoch": 9.83732351135666, + "grad_norm": 0.12430547177791595, + "learning_rate": 6.964642191624182e-08, + "loss": 1.7402, + "step": 32050 + }, + { + "epoch": 9.837630448127685, + "grad_norm": 0.11596968024969101, + "learning_rate": 6.938440781359413e-08, + "loss": 1.6893, + "step": 32051 + }, + { + "epoch": 9.83793738489871, + "grad_norm": 0.1783151626586914, + "learning_rate": 6.912288715455461e-08, + "loss": 1.7444, + "step": 32052 + }, + { + "epoch": 9.838244321669736, + "grad_norm": 0.15675026178359985, + "learning_rate": 6.886185994170458e-08, + "loss": 1.7031, + "step": 32053 + }, + { + "epoch": 9.838551258440761, + "grad_norm": 0.12373685091733932, + "learning_rate": 6.860132617763081e-08, + "loss": 1.6879, + "step": 32054 + }, + { + "epoch": 9.838858195211786, + "grad_norm": 0.11986403167247772, + "learning_rate": 6.834128586490352e-08, + "loss": 1.7276, + "step": 32055 + }, + { + "epoch": 9.839165131982812, + "grad_norm": 0.12817466259002686, + "learning_rate": 6.808173900609838e-08, + "loss": 1.7128, + "step": 32056 + }, + { + "epoch": 
9.839472068753837, + "grad_norm": 0.15844331681728363, + "learning_rate": 6.782268560376892e-08, + "loss": 1.7278, + "step": 32057 + }, + { + "epoch": 9.839779005524862, + "grad_norm": 0.1530577689409256, + "learning_rate": 6.756412566048531e-08, + "loss": 1.7616, + "step": 32058 + }, + { + "epoch": 9.840085942295888, + "grad_norm": 0.12964992225170135, + "learning_rate": 6.730605917879551e-08, + "loss": 1.7037, + "step": 32059 + }, + { + "epoch": 9.840392879066911, + "grad_norm": 0.1531256139278412, + "learning_rate": 6.704848616125858e-08, + "loss": 1.7433, + "step": 32060 + }, + { + "epoch": 9.840699815837937, + "grad_norm": 0.15467914938926697, + "learning_rate": 6.679140661041139e-08, + "loss": 1.737, + "step": 32061 + }, + { + "epoch": 9.841006752608962, + "grad_norm": 0.13379620015621185, + "learning_rate": 6.653482052880189e-08, + "loss": 1.7009, + "step": 32062 + }, + { + "epoch": 9.841313689379987, + "grad_norm": 0.1608572006225586, + "learning_rate": 6.62787279189614e-08, + "loss": 1.7222, + "step": 32063 + }, + { + "epoch": 9.841620626151013, + "grad_norm": 0.10191282629966736, + "learning_rate": 6.60231287834212e-08, + "loss": 1.6493, + "step": 32064 + }, + { + "epoch": 9.841927562922038, + "grad_norm": 0.1067260280251503, + "learning_rate": 6.576802312470709e-08, + "loss": 1.6836, + "step": 32065 + }, + { + "epoch": 9.842234499693063, + "grad_norm": 0.09046047180891037, + "learning_rate": 6.551341094533925e-08, + "loss": 1.6442, + "step": 32066 + }, + { + "epoch": 9.842541436464089, + "grad_norm": 0.16846902668476105, + "learning_rate": 6.525929224783789e-08, + "loss": 1.7371, + "step": 32067 + }, + { + "epoch": 9.842848373235114, + "grad_norm": 0.13322049379348755, + "learning_rate": 6.500566703470657e-08, + "loss": 1.744, + "step": 32068 + }, + { + "epoch": 9.84315531000614, + "grad_norm": 0.11230573058128357, + "learning_rate": 6.475253530846548e-08, + "loss": 1.6508, + "step": 32069 + }, + { + "epoch": 9.843462246777165, + "grad_norm": 
0.14198845624923706, + "learning_rate": 6.449989707160153e-08, + "loss": 1.7364, + "step": 32070 + }, + { + "epoch": 9.84376918354819, + "grad_norm": 0.2092641144990921, + "learning_rate": 6.424775232661828e-08, + "loss": 1.7172, + "step": 32071 + }, + { + "epoch": 9.844076120319214, + "grad_norm": 0.1266733705997467, + "learning_rate": 6.399610107600818e-08, + "loss": 1.6914, + "step": 32072 + }, + { + "epoch": 9.844383057090239, + "grad_norm": 0.2110438197851181, + "learning_rate": 6.374494332225812e-08, + "loss": 1.7657, + "step": 32073 + }, + { + "epoch": 9.844689993861264, + "grad_norm": 0.13018962740898132, + "learning_rate": 6.349427906784944e-08, + "loss": 1.6803, + "step": 32074 + }, + { + "epoch": 9.84499693063229, + "grad_norm": 0.14762617647647858, + "learning_rate": 6.324410831525795e-08, + "loss": 1.73, + "step": 32075 + }, + { + "epoch": 9.845303867403315, + "grad_norm": 0.15824922919273376, + "learning_rate": 6.299443106695945e-08, + "loss": 1.763, + "step": 32076 + }, + { + "epoch": 9.84561080417434, + "grad_norm": 0.11844678223133087, + "learning_rate": 6.27452473254131e-08, + "loss": 1.6935, + "step": 32077 + }, + { + "epoch": 9.845917740945366, + "grad_norm": 0.11517791450023651, + "learning_rate": 6.249655709309465e-08, + "loss": 1.6934, + "step": 32078 + }, + { + "epoch": 9.84622467771639, + "grad_norm": 0.162859708070755, + "learning_rate": 6.224836037244663e-08, + "loss": 1.7281, + "step": 32079 + }, + { + "epoch": 9.846531614487416, + "grad_norm": 0.09734068065881729, + "learning_rate": 6.200065716593373e-08, + "loss": 1.6473, + "step": 32080 + }, + { + "epoch": 9.846838551258442, + "grad_norm": 0.115218386054039, + "learning_rate": 6.175344747600397e-08, + "loss": 1.6662, + "step": 32081 + }, + { + "epoch": 9.847145488029465, + "grad_norm": 0.11634491384029388, + "learning_rate": 6.150673130508877e-08, + "loss": 1.6455, + "step": 32082 + }, + { + "epoch": 9.84745242480049, + "grad_norm": 0.10781900584697723, + "learning_rate": 
6.126050865563615e-08, + "loss": 1.654, + "step": 32083 + }, + { + "epoch": 9.847759361571516, + "grad_norm": 0.14688703417778015, + "learning_rate": 6.101477953008305e-08, + "loss": 1.7057, + "step": 32084 + }, + { + "epoch": 9.848066298342541, + "grad_norm": 0.14795304834842682, + "learning_rate": 6.076954393084421e-08, + "loss": 1.7032, + "step": 32085 + }, + { + "epoch": 9.848373235113566, + "grad_norm": 0.12772250175476074, + "learning_rate": 6.052480186035658e-08, + "loss": 1.7101, + "step": 32086 + }, + { + "epoch": 9.848680171884592, + "grad_norm": 0.14158354699611664, + "learning_rate": 6.028055332102933e-08, + "loss": 1.6748, + "step": 32087 + }, + { + "epoch": 9.848987108655617, + "grad_norm": 0.13286559283733368, + "learning_rate": 6.003679831528275e-08, + "loss": 1.6909, + "step": 32088 + }, + { + "epoch": 9.849294045426642, + "grad_norm": 0.10677133500576019, + "learning_rate": 5.979353684552047e-08, + "loss": 1.6806, + "step": 32089 + }, + { + "epoch": 9.849600982197668, + "grad_norm": 0.09260063618421555, + "learning_rate": 5.955076891415168e-08, + "loss": 1.6604, + "step": 32090 + }, + { + "epoch": 9.849907918968693, + "grad_norm": 0.17723138630390167, + "learning_rate": 5.9308494523574453e-08, + "loss": 1.7538, + "step": 32091 + }, + { + "epoch": 9.850214855739718, + "grad_norm": 0.14554916322231293, + "learning_rate": 5.9066713676181326e-08, + "loss": 1.7595, + "step": 32092 + }, + { + "epoch": 9.850521792510742, + "grad_norm": 0.14164261519908905, + "learning_rate": 5.882542637435928e-08, + "loss": 1.7205, + "step": 32093 + }, + { + "epoch": 9.850828729281767, + "grad_norm": 0.1607130765914917, + "learning_rate": 5.858463262050085e-08, + "loss": 1.7433, + "step": 32094 + }, + { + "epoch": 9.851135666052793, + "grad_norm": 0.10517904162406921, + "learning_rate": 5.834433241697634e-08, + "loss": 1.6795, + "step": 32095 + }, + { + "epoch": 9.851442602823818, + "grad_norm": 0.11845014989376068, + "learning_rate": 5.810452576616721e-08, + "loss": 
1.7099, + "step": 32096 + }, + { + "epoch": 9.851749539594843, + "grad_norm": 0.17924906313419342, + "learning_rate": 5.786521267043821e-08, + "loss": 1.7513, + "step": 32097 + }, + { + "epoch": 9.852056476365869, + "grad_norm": 0.20598645508289337, + "learning_rate": 5.762639313215967e-08, + "loss": 1.7387, + "step": 32098 + }, + { + "epoch": 9.852363413136894, + "grad_norm": 0.18959027528762817, + "learning_rate": 5.738806715369083e-08, + "loss": 1.7327, + "step": 32099 + }, + { + "epoch": 9.85267034990792, + "grad_norm": 0.13945116102695465, + "learning_rate": 5.7150234737379795e-08, + "loss": 1.6933, + "step": 32100 + }, + { + "epoch": 9.852977286678945, + "grad_norm": 0.12638016045093536, + "learning_rate": 5.6912895885585795e-08, + "loss": 1.6602, + "step": 32101 + }, + { + "epoch": 9.85328422344997, + "grad_norm": 0.1453823745250702, + "learning_rate": 5.66760506006514e-08, + "loss": 1.6894, + "step": 32102 + }, + { + "epoch": 9.853591160220994, + "grad_norm": 0.1257086992263794, + "learning_rate": 5.643969888491918e-08, + "loss": 1.6999, + "step": 32103 + }, + { + "epoch": 9.853898096992019, + "grad_norm": 0.1332065314054489, + "learning_rate": 5.6203840740720605e-08, + "loss": 1.7494, + "step": 32104 + }, + { + "epoch": 9.854205033763044, + "grad_norm": 0.10547174513339996, + "learning_rate": 5.596847617038714e-08, + "loss": 1.7052, + "step": 32105 + }, + { + "epoch": 9.85451197053407, + "grad_norm": 0.12532146275043488, + "learning_rate": 5.5733605176250256e-08, + "loss": 1.6831, + "step": 32106 + }, + { + "epoch": 9.854818907305095, + "grad_norm": 0.1575230360031128, + "learning_rate": 5.549922776062477e-08, + "loss": 1.7032, + "step": 32107 + }, + { + "epoch": 9.85512584407612, + "grad_norm": 0.13303294777870178, + "learning_rate": 5.52653439258255e-08, + "loss": 1.6917, + "step": 32108 + }, + { + "epoch": 9.855432780847146, + "grad_norm": 0.10225910693407059, + "learning_rate": 5.50319536741728e-08, + "loss": 1.7108, + "step": 32109 + }, + { + "epoch": 
9.855739717618171, + "grad_norm": 0.11767458915710449, + "learning_rate": 5.479905700796484e-08, + "loss": 1.6651, + "step": 32110 + }, + { + "epoch": 9.856046654389196, + "grad_norm": 0.099602110683918, + "learning_rate": 5.456665392951088e-08, + "loss": 1.6674, + "step": 32111 + }, + { + "epoch": 9.856353591160222, + "grad_norm": 0.11690317094326019, + "learning_rate": 5.433474444109799e-08, + "loss": 1.6533, + "step": 32112 + }, + { + "epoch": 9.856660527931247, + "grad_norm": 0.14385253190994263, + "learning_rate": 5.410332854502431e-08, + "loss": 1.6759, + "step": 32113 + }, + { + "epoch": 9.856967464702272, + "grad_norm": 0.16568076610565186, + "learning_rate": 5.387240624357692e-08, + "loss": 1.7523, + "step": 32114 + }, + { + "epoch": 9.857274401473296, + "grad_norm": 0.1166546419262886, + "learning_rate": 5.364197753903732e-08, + "loss": 1.6936, + "step": 32115 + }, + { + "epoch": 9.857581338244321, + "grad_norm": 0.1372339427471161, + "learning_rate": 5.3412042433681473e-08, + "loss": 1.6941, + "step": 32116 + }, + { + "epoch": 9.857888275015346, + "grad_norm": 0.14886748790740967, + "learning_rate": 5.318260092978533e-08, + "loss": 1.7423, + "step": 32117 + }, + { + "epoch": 9.858195211786372, + "grad_norm": 0.10235906392335892, + "learning_rate": 5.29536530296082e-08, + "loss": 1.6717, + "step": 32118 + }, + { + "epoch": 9.858502148557397, + "grad_norm": 0.13623642921447754, + "learning_rate": 5.2725198735420475e-08, + "loss": 1.6712, + "step": 32119 + }, + { + "epoch": 9.858809085328422, + "grad_norm": 0.14319658279418945, + "learning_rate": 5.249723804948148e-08, + "loss": 1.701, + "step": 32120 + }, + { + "epoch": 9.859116022099448, + "grad_norm": 0.14662912487983704, + "learning_rate": 5.226977097403385e-08, + "loss": 1.6885, + "step": 32121 + }, + { + "epoch": 9.859422958870473, + "grad_norm": 0.1491306722164154, + "learning_rate": 5.204279751133134e-08, + "loss": 1.7074, + "step": 32122 + }, + { + "epoch": 9.859729895641498, + "grad_norm": 
0.1779826581478119, + "learning_rate": 5.181631766362216e-08, + "loss": 1.6887, + "step": 32123 + }, + { + "epoch": 9.860036832412524, + "grad_norm": 0.14555446803569794, + "learning_rate": 5.159033143313785e-08, + "loss": 1.7373, + "step": 32124 + }, + { + "epoch": 9.860343769183547, + "grad_norm": 0.10940683633089066, + "learning_rate": 5.136483882210996e-08, + "loss": 1.6554, + "step": 32125 + }, + { + "epoch": 9.860650705954573, + "grad_norm": 0.14117297530174255, + "learning_rate": 5.1139839832775594e-08, + "loss": 1.7088, + "step": 32126 + }, + { + "epoch": 9.860957642725598, + "grad_norm": 0.11220337450504303, + "learning_rate": 5.091533446734964e-08, + "loss": 1.6698, + "step": 32127 + }, + { + "epoch": 9.861264579496623, + "grad_norm": 0.19136083126068115, + "learning_rate": 5.06913227280581e-08, + "loss": 1.7741, + "step": 32128 + }, + { + "epoch": 9.861571516267649, + "grad_norm": 0.16426582634449005, + "learning_rate": 5.0467804617110317e-08, + "loss": 1.7405, + "step": 32129 + }, + { + "epoch": 9.861878453038674, + "grad_norm": 0.16608738899230957, + "learning_rate": 5.024478013671563e-08, + "loss": 1.7209, + "step": 32130 + }, + { + "epoch": 9.8621853898097, + "grad_norm": 0.224944606423378, + "learning_rate": 5.002224928907229e-08, + "loss": 1.7206, + "step": 32131 + }, + { + "epoch": 9.862492326580725, + "grad_norm": 0.09932999312877655, + "learning_rate": 4.980021207639518e-08, + "loss": 1.6681, + "step": 32132 + }, + { + "epoch": 9.86279926335175, + "grad_norm": 0.11509741097688675, + "learning_rate": 4.95786685008659e-08, + "loss": 1.7207, + "step": 32133 + }, + { + "epoch": 9.863106200122775, + "grad_norm": 0.1009160503745079, + "learning_rate": 4.9357618564671584e-08, + "loss": 1.6293, + "step": 32134 + }, + { + "epoch": 9.8634131368938, + "grad_norm": 0.11737551540136337, + "learning_rate": 4.913706227001047e-08, + "loss": 1.658, + "step": 32135 + }, + { + "epoch": 9.863720073664824, + "grad_norm": 0.1895657181739807, + "learning_rate": 
4.89169996190475e-08, + "loss": 1.7315, + "step": 32136 + }, + { + "epoch": 9.86402701043585, + "grad_norm": 0.12624821066856384, + "learning_rate": 4.869743061396981e-08, + "loss": 1.678, + "step": 32137 + }, + { + "epoch": 9.864333947206875, + "grad_norm": 0.12830981612205505, + "learning_rate": 4.847835525693678e-08, + "loss": 1.7305, + "step": 32138 + }, + { + "epoch": 9.8646408839779, + "grad_norm": 0.1154761090874672, + "learning_rate": 4.82597735501189e-08, + "loss": 1.7024, + "step": 32139 + }, + { + "epoch": 9.864947820748926, + "grad_norm": 0.18320058286190033, + "learning_rate": 4.804168549567556e-08, + "loss": 1.7165, + "step": 32140 + }, + { + "epoch": 9.865254757519951, + "grad_norm": 0.1479901671409607, + "learning_rate": 4.782409109576613e-08, + "loss": 1.7079, + "step": 32141 + }, + { + "epoch": 9.865561694290976, + "grad_norm": 0.11338557302951813, + "learning_rate": 4.760699035253335e-08, + "loss": 1.6924, + "step": 32142 + }, + { + "epoch": 9.865868631062002, + "grad_norm": 0.1415034830570221, + "learning_rate": 4.73903832681255e-08, + "loss": 1.6969, + "step": 32143 + }, + { + "epoch": 9.866175567833027, + "grad_norm": 0.139898419380188, + "learning_rate": 4.7174269844685315e-08, + "loss": 1.6787, + "step": 32144 + }, + { + "epoch": 9.866482504604052, + "grad_norm": 0.16872167587280273, + "learning_rate": 4.695865008434997e-08, + "loss": 1.7279, + "step": 32145 + }, + { + "epoch": 9.866789441375076, + "grad_norm": 0.1443173587322235, + "learning_rate": 4.674352398924553e-08, + "loss": 1.7436, + "step": 32146 + }, + { + "epoch": 9.867096378146101, + "grad_norm": 0.2038755714893341, + "learning_rate": 4.652889156149809e-08, + "loss": 1.7499, + "step": 32147 + }, + { + "epoch": 9.867403314917127, + "grad_norm": 0.11941488832235336, + "learning_rate": 4.6314752803233716e-08, + "loss": 1.6899, + "step": 32148 + }, + { + "epoch": 9.867710251688152, + "grad_norm": 0.1467728614807129, + "learning_rate": 4.610110771656184e-08, + "loss": 1.7133, + 
"step": 32149 + }, + { + "epoch": 9.868017188459177, + "grad_norm": 0.18277500569820404, + "learning_rate": 4.5887956303602985e-08, + "loss": 1.7038, + "step": 32150 + }, + { + "epoch": 9.868324125230203, + "grad_norm": 0.09188520163297653, + "learning_rate": 4.567529856645547e-08, + "loss": 1.6514, + "step": 32151 + }, + { + "epoch": 9.868631062001228, + "grad_norm": 0.1508881002664566, + "learning_rate": 4.546313450722317e-08, + "loss": 1.7354, + "step": 32152 + }, + { + "epoch": 9.868937998772253, + "grad_norm": 0.19286566972732544, + "learning_rate": 4.525146412800441e-08, + "loss": 1.7437, + "step": 32153 + }, + { + "epoch": 9.869244935543279, + "grad_norm": 0.13278965651988983, + "learning_rate": 4.504028743089195e-08, + "loss": 1.7508, + "step": 32154 + }, + { + "epoch": 9.869551872314304, + "grad_norm": 0.17647281289100647, + "learning_rate": 4.4829604417967466e-08, + "loss": 1.7637, + "step": 32155 + }, + { + "epoch": 9.86985880908533, + "grad_norm": 0.12501446902751923, + "learning_rate": 4.461941509131817e-08, + "loss": 1.6799, + "step": 32156 + }, + { + "epoch": 9.870165745856355, + "grad_norm": 0.15084847807884216, + "learning_rate": 4.440971945302019e-08, + "loss": 1.7139, + "step": 32157 + }, + { + "epoch": 9.870472682627378, + "grad_norm": 0.1984490007162094, + "learning_rate": 4.420051750514409e-08, + "loss": 1.7522, + "step": 32158 + }, + { + "epoch": 9.870779619398403, + "grad_norm": 0.15516258776187897, + "learning_rate": 4.399180924975488e-08, + "loss": 1.7365, + "step": 32159 + }, + { + "epoch": 9.871086556169429, + "grad_norm": 0.1323643922805786, + "learning_rate": 4.3783594688923124e-08, + "loss": 1.6581, + "step": 32160 + }, + { + "epoch": 9.871393492940454, + "grad_norm": 0.13200242817401886, + "learning_rate": 4.357587382470274e-08, + "loss": 1.6713, + "step": 32161 + }, + { + "epoch": 9.87170042971148, + "grad_norm": 0.0954132005572319, + "learning_rate": 4.3368646659147635e-08, + "loss": 1.6607, + "step": 32162 + }, + { + "epoch": 
9.872007366482505, + "grad_norm": 0.15339840948581696, + "learning_rate": 4.316191319430063e-08, + "loss": 1.6888, + "step": 32163 + }, + { + "epoch": 9.87231430325353, + "grad_norm": 0.27716484665870667, + "learning_rate": 4.295567343221008e-08, + "loss": 1.7807, + "step": 32164 + }, + { + "epoch": 9.872621240024555, + "grad_norm": 0.1060333251953125, + "learning_rate": 4.2749927374907684e-08, + "loss": 1.7056, + "step": 32165 + }, + { + "epoch": 9.87292817679558, + "grad_norm": 0.16034503281116486, + "learning_rate": 4.2544675024436266e-08, + "loss": 1.717, + "step": 32166 + }, + { + "epoch": 9.873235113566606, + "grad_norm": 0.12173280119895935, + "learning_rate": 4.233991638281642e-08, + "loss": 1.7021, + "step": 32167 + }, + { + "epoch": 9.87354205033763, + "grad_norm": 0.1884598582983017, + "learning_rate": 4.213565145207987e-08, + "loss": 1.6926, + "step": 32168 + }, + { + "epoch": 9.873848987108655, + "grad_norm": 0.12239779531955719, + "learning_rate": 4.193188023423611e-08, + "loss": 1.6969, + "step": 32169 + }, + { + "epoch": 9.87415592387968, + "grad_norm": 0.15470372140407562, + "learning_rate": 4.172860273130019e-08, + "loss": 1.7963, + "step": 32170 + }, + { + "epoch": 9.874462860650706, + "grad_norm": 0.11103082448244095, + "learning_rate": 4.152581894528717e-08, + "loss": 1.6866, + "step": 32171 + }, + { + "epoch": 9.874769797421731, + "grad_norm": 0.14944078028202057, + "learning_rate": 4.132352887819546e-08, + "loss": 1.7383, + "step": 32172 + }, + { + "epoch": 9.875076734192756, + "grad_norm": 0.11603175848722458, + "learning_rate": 4.1121732532029e-08, + "loss": 1.6626, + "step": 32173 + }, + { + "epoch": 9.875383670963782, + "grad_norm": 0.16313737630844116, + "learning_rate": 4.092042990878064e-08, + "loss": 1.7547, + "step": 32174 + }, + { + "epoch": 9.875690607734807, + "grad_norm": 0.10700001567602158, + "learning_rate": 4.0719621010437694e-08, + "loss": 1.6582, + "step": 32175 + }, + { + "epoch": 9.875997544505832, + "grad_norm": 
0.09969279915094376, + "learning_rate": 4.0519305838981894e-08, + "loss": 1.6598, + "step": 32176 + }, + { + "epoch": 9.876304481276858, + "grad_norm": 0.18154063820838928, + "learning_rate": 4.031948439640054e-08, + "loss": 1.6844, + "step": 32177 + }, + { + "epoch": 9.876611418047883, + "grad_norm": 0.10725349187850952, + "learning_rate": 4.012015668466429e-08, + "loss": 1.6638, + "step": 32178 + }, + { + "epoch": 9.876918354818907, + "grad_norm": 0.15481308102607727, + "learning_rate": 3.992132270573823e-08, + "loss": 1.7646, + "step": 32179 + }, + { + "epoch": 9.877225291589932, + "grad_norm": 0.2573716640472412, + "learning_rate": 3.9722982461593005e-08, + "loss": 1.741, + "step": 32180 + }, + { + "epoch": 9.877532228360957, + "grad_norm": 0.14982570707798004, + "learning_rate": 3.952513595419372e-08, + "loss": 1.7354, + "step": 32181 + }, + { + "epoch": 9.877839165131983, + "grad_norm": 0.15668633580207825, + "learning_rate": 3.932778318548325e-08, + "loss": 1.7226, + "step": 32182 + }, + { + "epoch": 9.878146101903008, + "grad_norm": 0.12578873336315155, + "learning_rate": 3.913092415742114e-08, + "loss": 1.7206, + "step": 32183 + }, + { + "epoch": 9.878453038674033, + "grad_norm": 0.12647871673107147, + "learning_rate": 3.8934558871950296e-08, + "loss": 1.7102, + "step": 32184 + }, + { + "epoch": 9.878759975445059, + "grad_norm": 0.14217160642147064, + "learning_rate": 3.8738687331013603e-08, + "loss": 1.6851, + "step": 32185 + }, + { + "epoch": 9.879066912216084, + "grad_norm": 0.12461835891008377, + "learning_rate": 3.8543309536542835e-08, + "loss": 1.6956, + "step": 32186 + }, + { + "epoch": 9.87937384898711, + "grad_norm": 0.11051438748836517, + "learning_rate": 3.8348425490469796e-08, + "loss": 1.6719, + "step": 32187 + }, + { + "epoch": 9.879680785758135, + "grad_norm": 0.11611293256282806, + "learning_rate": 3.815403519472072e-08, + "loss": 1.6766, + "step": 32188 + }, + { + "epoch": 9.879987722529158, + "grad_norm": 0.17132268846035004, + 
"learning_rate": 3.796013865121628e-08, + "loss": 1.7112, + "step": 32189 + }, + { + "epoch": 9.880294659300183, + "grad_norm": 0.13943015038967133, + "learning_rate": 3.776673586187718e-08, + "loss": 1.7382, + "step": 32190 + }, + { + "epoch": 9.880601596071209, + "grad_norm": 0.11459454149007797, + "learning_rate": 3.757382682860744e-08, + "loss": 1.674, + "step": 32191 + }, + { + "epoch": 9.880908532842234, + "grad_norm": 0.1549069583415985, + "learning_rate": 3.738141155331665e-08, + "loss": 1.7275, + "step": 32192 + }, + { + "epoch": 9.88121546961326, + "grad_norm": 0.09938697516918182, + "learning_rate": 3.7189490037908834e-08, + "loss": 1.6483, + "step": 32193 + }, + { + "epoch": 9.881522406384285, + "grad_norm": 0.10582483559846878, + "learning_rate": 3.6998062284276934e-08, + "loss": 1.6667, + "step": 32194 + }, + { + "epoch": 9.88182934315531, + "grad_norm": 0.1391625851392746, + "learning_rate": 3.6807128294319426e-08, + "loss": 1.6919, + "step": 32195 + }, + { + "epoch": 9.882136279926335, + "grad_norm": 0.10145086795091629, + "learning_rate": 3.661668806991259e-08, + "loss": 1.6726, + "step": 32196 + }, + { + "epoch": 9.88244321669736, + "grad_norm": 0.12674877047538757, + "learning_rate": 3.642674161294379e-08, + "loss": 1.693, + "step": 32197 + }, + { + "epoch": 9.882750153468386, + "grad_norm": 0.16183172166347504, + "learning_rate": 3.6237288925294875e-08, + "loss": 1.66, + "step": 32198 + }, + { + "epoch": 9.883057090239411, + "grad_norm": 0.11870484054088593, + "learning_rate": 3.604833000883101e-08, + "loss": 1.6869, + "step": 32199 + }, + { + "epoch": 9.883364027010435, + "grad_norm": 0.149629145860672, + "learning_rate": 3.585986486542292e-08, + "loss": 1.6774, + "step": 32200 + }, + { + "epoch": 9.88367096378146, + "grad_norm": 0.13439494371414185, + "learning_rate": 3.567189349693023e-08, + "loss": 1.7167, + "step": 32201 + }, + { + "epoch": 9.883977900552486, + "grad_norm": 0.10757558792829514, + "learning_rate": 3.5484415905218114e-08, + 
"loss": 1.6832, + "step": 32202 + }, + { + "epoch": 9.884284837323511, + "grad_norm": 0.1354834884405136, + "learning_rate": 3.5297432092129544e-08, + "loss": 1.7285, + "step": 32203 + }, + { + "epoch": 9.884591774094536, + "grad_norm": 0.13512718677520752, + "learning_rate": 3.5110942059518594e-08, + "loss": 1.6989, + "step": 32204 + }, + { + "epoch": 9.884898710865562, + "grad_norm": 0.14214816689491272, + "learning_rate": 3.492494580922823e-08, + "loss": 1.7081, + "step": 32205 + }, + { + "epoch": 9.885205647636587, + "grad_norm": 0.12680695950984955, + "learning_rate": 3.4739443343090315e-08, + "loss": 1.7195, + "step": 32206 + }, + { + "epoch": 9.885512584407612, + "grad_norm": 0.11334585398435593, + "learning_rate": 3.455443466294783e-08, + "loss": 1.6965, + "step": 32207 + }, + { + "epoch": 9.885819521178638, + "grad_norm": 0.15353024005889893, + "learning_rate": 3.4369919770621536e-08, + "loss": 1.7505, + "step": 32208 + }, + { + "epoch": 9.886126457949663, + "grad_norm": 0.14484186470508575, + "learning_rate": 3.4185898667937756e-08, + "loss": 1.7272, + "step": 32209 + }, + { + "epoch": 9.886433394720687, + "grad_norm": 0.1442519873380661, + "learning_rate": 3.400237135671169e-08, + "loss": 1.7159, + "step": 32210 + }, + { + "epoch": 9.886740331491712, + "grad_norm": 0.15484102070331573, + "learning_rate": 3.381933783876412e-08, + "loss": 1.7064, + "step": 32211 + }, + { + "epoch": 9.887047268262737, + "grad_norm": 0.09997449070215225, + "learning_rate": 3.36367981159047e-08, + "loss": 1.6768, + "step": 32212 + }, + { + "epoch": 9.887354205033763, + "grad_norm": 0.1351270228624344, + "learning_rate": 3.3454752189926444e-08, + "loss": 1.7302, + "step": 32213 + }, + { + "epoch": 9.887661141804788, + "grad_norm": 0.12122789025306702, + "learning_rate": 3.327320006263346e-08, + "loss": 1.6963, + "step": 32214 + }, + { + "epoch": 9.887968078575813, + "grad_norm": 0.12483847141265869, + "learning_rate": 3.309214173582431e-08, + "loss": 1.6803, + "step": 32215 + 
}, + { + "epoch": 9.888275015346839, + "grad_norm": 0.13801445066928864, + "learning_rate": 3.2911577211280905e-08, + "loss": 1.7139, + "step": 32216 + }, + { + "epoch": 9.888581952117864, + "grad_norm": 0.19149911403656006, + "learning_rate": 3.273150649079626e-08, + "loss": 1.7526, + "step": 32217 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 0.15660177171230316, + "learning_rate": 3.255192957614672e-08, + "loss": 1.7487, + "step": 32218 + }, + { + "epoch": 9.889195825659915, + "grad_norm": 0.13127101957798004, + "learning_rate": 3.2372846469103104e-08, + "loss": 1.7105, + "step": 32219 + }, + { + "epoch": 9.88950276243094, + "grad_norm": 0.09861146658658981, + "learning_rate": 3.21942571714362e-08, + "loss": 1.6766, + "step": 32220 + }, + { + "epoch": 9.889809699201965, + "grad_norm": 0.14133897423744202, + "learning_rate": 3.201616168491683e-08, + "loss": 1.7335, + "step": 32221 + }, + { + "epoch": 9.890116635972989, + "grad_norm": 0.12263017147779465, + "learning_rate": 3.1838560011299145e-08, + "loss": 1.6699, + "step": 32222 + }, + { + "epoch": 9.890423572744014, + "grad_norm": 0.12454384565353394, + "learning_rate": 3.166145215233729e-08, + "loss": 1.7091, + "step": 32223 + }, + { + "epoch": 9.89073050951504, + "grad_norm": 0.11563286185264587, + "learning_rate": 3.148483810979097e-08, + "loss": 1.6885, + "step": 32224 + }, + { + "epoch": 9.891037446286065, + "grad_norm": 0.18573540449142456, + "learning_rate": 3.1308717885392136e-08, + "loss": 1.7238, + "step": 32225 + }, + { + "epoch": 9.89134438305709, + "grad_norm": 0.16926386952400208, + "learning_rate": 3.113309148088939e-08, + "loss": 1.6886, + "step": 32226 + }, + { + "epoch": 9.891651319828116, + "grad_norm": 0.11649619042873383, + "learning_rate": 3.0957958898020226e-08, + "loss": 1.6767, + "step": 32227 + }, + { + "epoch": 9.89195825659914, + "grad_norm": 0.10409758239984512, + "learning_rate": 3.078332013851104e-08, + "loss": 1.6549, + "step": 32228 + }, + { + "epoch": 9.892265193370166, 
+ "grad_norm": 0.20817892253398895, + "learning_rate": 3.0609175204088234e-08, + "loss": 1.7321, + "step": 32229 + }, + { + "epoch": 9.892572130141192, + "grad_norm": 0.15646634995937347, + "learning_rate": 3.0435524096478207e-08, + "loss": 1.7278, + "step": 32230 + }, + { + "epoch": 9.892879066912217, + "grad_norm": 0.10567045956850052, + "learning_rate": 3.026236681738515e-08, + "loss": 1.6612, + "step": 32231 + }, + { + "epoch": 9.89318600368324, + "grad_norm": 0.13315534591674805, + "learning_rate": 3.0089703368529895e-08, + "loss": 1.7413, + "step": 32232 + }, + { + "epoch": 9.893492940454266, + "grad_norm": 0.11580394208431244, + "learning_rate": 2.9917533751616655e-08, + "loss": 1.6394, + "step": 32233 + }, + { + "epoch": 9.893799877225291, + "grad_norm": 0.12873095273971558, + "learning_rate": 2.974585796834961e-08, + "loss": 1.6906, + "step": 32234 + }, + { + "epoch": 9.894106813996316, + "grad_norm": 0.2108859121799469, + "learning_rate": 2.9574676020421853e-08, + "loss": 1.7822, + "step": 32235 + }, + { + "epoch": 9.894413750767342, + "grad_norm": 0.12524257600307465, + "learning_rate": 2.9403987909520926e-08, + "loss": 1.6733, + "step": 32236 + }, + { + "epoch": 9.894720687538367, + "grad_norm": 0.1606513410806656, + "learning_rate": 2.9233793637345464e-08, + "loss": 1.7441, + "step": 32237 + }, + { + "epoch": 9.895027624309392, + "grad_norm": 0.11567132920026779, + "learning_rate": 2.9064093205566358e-08, + "loss": 1.6924, + "step": 32238 + }, + { + "epoch": 9.895334561080418, + "grad_norm": 0.18729543685913086, + "learning_rate": 2.889488661586559e-08, + "loss": 1.7346, + "step": 32239 + }, + { + "epoch": 9.895641497851443, + "grad_norm": 0.11518693715333939, + "learning_rate": 2.8726173869908502e-08, + "loss": 1.6796, + "step": 32240 + }, + { + "epoch": 9.895948434622468, + "grad_norm": 0.12286285310983658, + "learning_rate": 2.8557954969377078e-08, + "loss": 1.6804, + "step": 32241 + }, + { + "epoch": 9.896255371393494, + "grad_norm": 
0.11524228751659393, + "learning_rate": 2.8390229915919998e-08, + "loss": 1.6649, + "step": 32242 + }, + { + "epoch": 9.896562308164517, + "grad_norm": 0.1211162805557251, + "learning_rate": 2.8222998711202598e-08, + "loss": 1.686, + "step": 32243 + }, + { + "epoch": 9.896869244935543, + "grad_norm": 0.11552423238754272, + "learning_rate": 2.805626135687356e-08, + "loss": 1.6709, + "step": 32244 + }, + { + "epoch": 9.897176181706568, + "grad_norm": 0.15391238033771515, + "learning_rate": 2.7890017854587114e-08, + "loss": 1.7138, + "step": 32245 + }, + { + "epoch": 9.897483118477593, + "grad_norm": 0.15275777876377106, + "learning_rate": 2.772426820597529e-08, + "loss": 1.7328, + "step": 32246 + }, + { + "epoch": 9.897790055248619, + "grad_norm": 0.12468124181032181, + "learning_rate": 2.7559012412681218e-08, + "loss": 1.6861, + "step": 32247 + }, + { + "epoch": 9.898096992019644, + "grad_norm": 0.1204581931233406, + "learning_rate": 2.7394250476342475e-08, + "loss": 1.7059, + "step": 32248 + }, + { + "epoch": 9.89840392879067, + "grad_norm": 0.15671482682228088, + "learning_rate": 2.722998239857999e-08, + "loss": 1.7289, + "step": 32249 + }, + { + "epoch": 9.898710865561695, + "grad_norm": 0.13706350326538086, + "learning_rate": 2.7066208181025786e-08, + "loss": 1.7048, + "step": 32250 + }, + { + "epoch": 9.89901780233272, + "grad_norm": 0.15076833963394165, + "learning_rate": 2.6902927825289694e-08, + "loss": 1.7355, + "step": 32251 + }, + { + "epoch": 9.899324739103745, + "grad_norm": 0.14177745580673218, + "learning_rate": 2.674014133298708e-08, + "loss": 1.6877, + "step": 32252 + }, + { + "epoch": 9.899631675874769, + "grad_norm": 0.1680639237165451, + "learning_rate": 2.6577848705733322e-08, + "loss": 1.6612, + "step": 32253 + }, + { + "epoch": 9.899938612645794, + "grad_norm": 0.13200677931308746, + "learning_rate": 2.641604994512714e-08, + "loss": 1.7169, + "step": 32254 + }, + { + "epoch": 9.90024554941682, + "grad_norm": 0.1324564814567566, + 
"learning_rate": 2.625474505276726e-08, + "loss": 1.6906, + "step": 32255 + }, + { + "epoch": 9.900552486187845, + "grad_norm": 0.11010903865098953, + "learning_rate": 2.6093934030246846e-08, + "loss": 1.6665, + "step": 32256 + }, + { + "epoch": 9.90085942295887, + "grad_norm": 0.1629243791103363, + "learning_rate": 2.5933616879159073e-08, + "loss": 1.7339, + "step": 32257 + }, + { + "epoch": 9.901166359729896, + "grad_norm": 0.10520602762699127, + "learning_rate": 2.5773793601080453e-08, + "loss": 1.649, + "step": 32258 + }, + { + "epoch": 9.901473296500921, + "grad_norm": 0.13441254198551178, + "learning_rate": 2.561446419760416e-08, + "loss": 1.7146, + "step": 32259 + }, + { + "epoch": 9.901780233271946, + "grad_norm": 0.15586842596530914, + "learning_rate": 2.5455628670290054e-08, + "loss": 1.6921, + "step": 32260 + }, + { + "epoch": 9.902087170042972, + "grad_norm": 0.1360539346933365, + "learning_rate": 2.5297287020720206e-08, + "loss": 1.6973, + "step": 32261 + }, + { + "epoch": 9.902394106813997, + "grad_norm": 0.1683451533317566, + "learning_rate": 2.5139439250448927e-08, + "loss": 1.7456, + "step": 32262 + }, + { + "epoch": 9.902701043585022, + "grad_norm": 0.12836389243602753, + "learning_rate": 2.498208536104163e-08, + "loss": 1.6723, + "step": 32263 + }, + { + "epoch": 9.903007980356048, + "grad_norm": 0.14135409891605377, + "learning_rate": 2.482522535405263e-08, + "loss": 1.7411, + "step": 32264 + }, + { + "epoch": 9.903314917127071, + "grad_norm": 0.13020414113998413, + "learning_rate": 2.4668859231036236e-08, + "loss": 1.7077, + "step": 32265 + }, + { + "epoch": 9.903621853898096, + "grad_norm": 0.14027753472328186, + "learning_rate": 2.4512986993530106e-08, + "loss": 1.7259, + "step": 32266 + }, + { + "epoch": 9.903928790669122, + "grad_norm": 0.19937944412231445, + "learning_rate": 2.4357608643077455e-08, + "loss": 1.7673, + "step": 32267 + }, + { + "epoch": 9.904235727440147, + "grad_norm": 0.12452827394008636, + "learning_rate": 
2.4202724181215942e-08, + "loss": 1.6824, + "step": 32268 + }, + { + "epoch": 9.904542664211172, + "grad_norm": 0.15908023715019226, + "learning_rate": 2.4048333609472116e-08, + "loss": 1.7399, + "step": 32269 + }, + { + "epoch": 9.904849600982198, + "grad_norm": 0.1503656804561615, + "learning_rate": 2.3894436929378094e-08, + "loss": 1.7216, + "step": 32270 + }, + { + "epoch": 9.905156537753223, + "grad_norm": 0.12779399752616882, + "learning_rate": 2.3741034142449324e-08, + "loss": 1.7447, + "step": 32271 + }, + { + "epoch": 9.905463474524248, + "grad_norm": 0.15011703968048096, + "learning_rate": 2.3588125250206815e-08, + "loss": 1.709, + "step": 32272 + }, + { + "epoch": 9.905770411295274, + "grad_norm": 0.13510404527187347, + "learning_rate": 2.3435710254154918e-08, + "loss": 1.6968, + "step": 32273 + }, + { + "epoch": 9.9060773480663, + "grad_norm": 0.1107151061296463, + "learning_rate": 2.3283789155803536e-08, + "loss": 1.6723, + "step": 32274 + }, + { + "epoch": 9.906384284837323, + "grad_norm": 0.15149912238121033, + "learning_rate": 2.3132361956657024e-08, + "loss": 1.7406, + "step": 32275 + }, + { + "epoch": 9.906691221608348, + "grad_norm": 0.14119799435138702, + "learning_rate": 2.2981428658208627e-08, + "loss": 1.757, + "step": 32276 + }, + { + "epoch": 9.906998158379373, + "grad_norm": 0.1312095820903778, + "learning_rate": 2.2830989261946045e-08, + "loss": 1.7049, + "step": 32277 + }, + { + "epoch": 9.907305095150399, + "grad_norm": 0.10459209233522415, + "learning_rate": 2.268104376936253e-08, + "loss": 1.6757, + "step": 32278 + }, + { + "epoch": 9.907612031921424, + "grad_norm": 0.16587966680526733, + "learning_rate": 2.253159218194023e-08, + "loss": 1.7207, + "step": 32279 + }, + { + "epoch": 9.90791896869245, + "grad_norm": 0.18351085484027863, + "learning_rate": 2.238263450115019e-08, + "loss": 1.7385, + "step": 32280 + }, + { + "epoch": 9.908225905463475, + "grad_norm": 0.1720595806837082, + "learning_rate": 2.2234170728469005e-08, + "loss": 
1.7432, + "step": 32281 + }, + { + "epoch": 9.9085328422345, + "grad_norm": 0.1220058798789978, + "learning_rate": 2.2086200865362172e-08, + "loss": 1.7197, + "step": 32282 + }, + { + "epoch": 9.908839779005525, + "grad_norm": 0.18978485465049744, + "learning_rate": 2.1938724913295183e-08, + "loss": 1.776, + "step": 32283 + }, + { + "epoch": 9.90914671577655, + "grad_norm": 0.2161943018436432, + "learning_rate": 2.1791742873716882e-08, + "loss": 1.7852, + "step": 32284 + }, + { + "epoch": 9.909453652547576, + "grad_norm": 0.12366054207086563, + "learning_rate": 2.1645254748092757e-08, + "loss": 1.6733, + "step": 32285 + }, + { + "epoch": 9.9097605893186, + "grad_norm": 0.15332402288913727, + "learning_rate": 2.1499260537855002e-08, + "loss": 1.7465, + "step": 32286 + }, + { + "epoch": 9.910067526089625, + "grad_norm": 0.13514944911003113, + "learning_rate": 2.1353760244463562e-08, + "loss": 1.7004, + "step": 32287 + }, + { + "epoch": 9.91037446286065, + "grad_norm": 0.1976264864206314, + "learning_rate": 2.1208753869339516e-08, + "loss": 1.8062, + "step": 32288 + }, + { + "epoch": 9.910681399631676, + "grad_norm": 0.12862536311149597, + "learning_rate": 2.1064241413931707e-08, + "loss": 1.6782, + "step": 32289 + }, + { + "epoch": 9.910988336402701, + "grad_norm": 0.19731375575065613, + "learning_rate": 2.092022287965567e-08, + "loss": 1.7668, + "step": 32290 + }, + { + "epoch": 9.911295273173726, + "grad_norm": 0.11489395052194595, + "learning_rate": 2.0776698267943594e-08, + "loss": 1.6704, + "step": 32291 + }, + { + "epoch": 9.911602209944752, + "grad_norm": 0.15996041893959045, + "learning_rate": 2.0633667580205463e-08, + "loss": 1.7058, + "step": 32292 + }, + { + "epoch": 9.911909146715777, + "grad_norm": 0.12133777141571045, + "learning_rate": 2.049113081786236e-08, + "loss": 1.6644, + "step": 32293 + }, + { + "epoch": 9.912216083486802, + "grad_norm": 0.15602417290210724, + "learning_rate": 2.0349087982318714e-08, + "loss": 1.7097, + "step": 32294 + }, + { + 
"epoch": 9.912523020257828, + "grad_norm": 0.16324558854103088, + "learning_rate": 2.0207539074978966e-08, + "loss": 1.6846, + "step": 32295 + }, + { + "epoch": 9.912829957028851, + "grad_norm": 0.15360431373119354, + "learning_rate": 2.0066484097241988e-08, + "loss": 1.7227, + "step": 32296 + }, + { + "epoch": 9.913136893799877, + "grad_norm": 0.17100133001804352, + "learning_rate": 1.9925923050506667e-08, + "loss": 1.6693, + "step": 32297 + }, + { + "epoch": 9.913443830570902, + "grad_norm": 0.11901558190584183, + "learning_rate": 1.9785855936149677e-08, + "loss": 1.7466, + "step": 32298 + }, + { + "epoch": 9.913750767341927, + "grad_norm": 0.10561197996139526, + "learning_rate": 1.96462827555699e-08, + "loss": 1.6526, + "step": 32299 + }, + { + "epoch": 9.914057704112953, + "grad_norm": 0.10759133845567703, + "learning_rate": 1.9507203510138463e-08, + "loss": 1.6842, + "step": 32300 + }, + { + "epoch": 9.914364640883978, + "grad_norm": 0.13747403025627136, + "learning_rate": 1.9368618201232036e-08, + "loss": 1.7276, + "step": 32301 + }, + { + "epoch": 9.914671577655003, + "grad_norm": 0.18556538224220276, + "learning_rate": 1.92305268302162e-08, + "loss": 1.7007, + "step": 32302 + }, + { + "epoch": 9.914978514426029, + "grad_norm": 0.22288267314434052, + "learning_rate": 1.9092929398462078e-08, + "loss": 1.7438, + "step": 32303 + }, + { + "epoch": 9.915285451197054, + "grad_norm": 0.11585120111703873, + "learning_rate": 1.8955825907324142e-08, + "loss": 1.6834, + "step": 32304 + }, + { + "epoch": 9.91559238796808, + "grad_norm": 0.14063316583633423, + "learning_rate": 1.8819216358156864e-08, + "loss": 1.698, + "step": 32305 + }, + { + "epoch": 9.915899324739105, + "grad_norm": 0.10423889756202698, + "learning_rate": 1.8683100752320272e-08, + "loss": 1.6796, + "step": 32306 + }, + { + "epoch": 9.91620626151013, + "grad_norm": 0.10526315122842789, + "learning_rate": 1.8547479091146626e-08, + "loss": 1.6681, + "step": 32307 + }, + { + "epoch": 9.916513198281153, + 
"grad_norm": 0.12726645171642303, + "learning_rate": 1.8412351375984848e-08, + "loss": 1.6959, + "step": 32308 + }, + { + "epoch": 9.916820135052179, + "grad_norm": 0.13809795677661896, + "learning_rate": 1.827771760816721e-08, + "loss": 1.7263, + "step": 32309 + }, + { + "epoch": 9.917127071823204, + "grad_norm": 0.15422095358371735, + "learning_rate": 1.8143577789020426e-08, + "loss": 1.7651, + "step": 32310 + }, + { + "epoch": 9.91743400859423, + "grad_norm": 0.1087057963013649, + "learning_rate": 1.8009931919876767e-08, + "loss": 1.6589, + "step": 32311 + }, + { + "epoch": 9.917740945365255, + "grad_norm": 0.1274532526731491, + "learning_rate": 1.787678000205739e-08, + "loss": 1.698, + "step": 32312 + }, + { + "epoch": 9.91804788213628, + "grad_norm": 0.14955148100852966, + "learning_rate": 1.774412203687237e-08, + "loss": 1.7187, + "step": 32313 + }, + { + "epoch": 9.918354818907305, + "grad_norm": 0.12892384827136993, + "learning_rate": 1.761195802563731e-08, + "loss": 1.6668, + "step": 32314 + }, + { + "epoch": 9.91866175567833, + "grad_norm": 0.12298917770385742, + "learning_rate": 1.7480287969651178e-08, + "loss": 1.6624, + "step": 32315 + }, + { + "epoch": 9.918968692449356, + "grad_norm": 0.10288118571043015, + "learning_rate": 1.7349111870224032e-08, + "loss": 1.6632, + "step": 32316 + }, + { + "epoch": 9.919275629220381, + "grad_norm": 0.15588083863258362, + "learning_rate": 1.7218429728649287e-08, + "loss": 1.7164, + "step": 32317 + }, + { + "epoch": 9.919582565991405, + "grad_norm": 0.13187600672245026, + "learning_rate": 1.708824154622035e-08, + "loss": 1.6939, + "step": 32318 + }, + { + "epoch": 9.91988950276243, + "grad_norm": 0.1224738210439682, + "learning_rate": 1.695854732421398e-08, + "loss": 1.6847, + "step": 32319 + }, + { + "epoch": 9.920196439533456, + "grad_norm": 0.12615568935871124, + "learning_rate": 1.6829347063923584e-08, + "loss": 1.7332, + "step": 32320 + }, + { + "epoch": 9.920503376304481, + "grad_norm": 0.10515398532152176, + 
"learning_rate": 1.670064076662592e-08, + "loss": 1.6249, + "step": 32321 + }, + { + "epoch": 9.920810313075506, + "grad_norm": 0.11620636284351349, + "learning_rate": 1.657242843358109e-08, + "loss": 1.6856, + "step": 32322 + }, + { + "epoch": 9.921117249846532, + "grad_norm": 0.14267602562904358, + "learning_rate": 1.644471006606585e-08, + "loss": 1.6957, + "step": 32323 + }, + { + "epoch": 9.921424186617557, + "grad_norm": 0.14195942878723145, + "learning_rate": 1.6317485665345855e-08, + "loss": 1.7102, + "step": 32324 + }, + { + "epoch": 9.921731123388582, + "grad_norm": 0.13764344155788422, + "learning_rate": 1.6190755232664556e-08, + "loss": 1.7028, + "step": 32325 + }, + { + "epoch": 9.922038060159608, + "grad_norm": 0.13899104297161102, + "learning_rate": 1.6064518769287605e-08, + "loss": 1.6844, + "step": 32326 + }, + { + "epoch": 9.922344996930633, + "grad_norm": 0.11225128173828125, + "learning_rate": 1.5938776276458457e-08, + "loss": 1.6489, + "step": 32327 + }, + { + "epoch": 9.922651933701658, + "grad_norm": 0.10915616899728775, + "learning_rate": 1.5813527755415e-08, + "loss": 1.6916, + "step": 32328 + }, + { + "epoch": 9.922958870472682, + "grad_norm": 0.15568388998508453, + "learning_rate": 1.56887732074007e-08, + "loss": 1.7295, + "step": 32329 + }, + { + "epoch": 9.923265807243707, + "grad_norm": 0.12068216502666473, + "learning_rate": 1.556451263364789e-08, + "loss": 1.7293, + "step": 32330 + }, + { + "epoch": 9.923572744014733, + "grad_norm": 0.1622546762228012, + "learning_rate": 1.544074603538337e-08, + "loss": 1.7351, + "step": 32331 + }, + { + "epoch": 9.923879680785758, + "grad_norm": 0.10042760521173477, + "learning_rate": 1.5317473413828388e-08, + "loss": 1.6729, + "step": 32332 + }, + { + "epoch": 9.924186617556783, + "grad_norm": 0.15807488560676575, + "learning_rate": 1.5194694770204187e-08, + "loss": 1.7114, + "step": 32333 + }, + { + "epoch": 9.924493554327809, + "grad_norm": 0.1204007938504219, + "learning_rate": 
1.5072410105720914e-08, + "loss": 1.7242, + "step": 32334 + }, + { + "epoch": 9.924800491098834, + "grad_norm": 0.1176806390285492, + "learning_rate": 1.495061942159426e-08, + "loss": 1.6778, + "step": 32335 + }, + { + "epoch": 9.92510742786986, + "grad_norm": 0.2244664430618286, + "learning_rate": 1.4829322719017713e-08, + "loss": 1.7206, + "step": 32336 + }, + { + "epoch": 9.925414364640885, + "grad_norm": 0.11579646915197372, + "learning_rate": 1.4708519999195868e-08, + "loss": 1.6595, + "step": 32337 + }, + { + "epoch": 9.92572130141191, + "grad_norm": 0.08797867596149445, + "learning_rate": 1.4588211263322215e-08, + "loss": 1.6587, + "step": 32338 + }, + { + "epoch": 9.926028238182933, + "grad_norm": 0.118585966527462, + "learning_rate": 1.4468396512584693e-08, + "loss": 1.6704, + "step": 32339 + }, + { + "epoch": 9.926335174953959, + "grad_norm": 0.16289199888706207, + "learning_rate": 1.4349075748171236e-08, + "loss": 1.7494, + "step": 32340 + }, + { + "epoch": 9.926642111724984, + "grad_norm": 0.09592059999704361, + "learning_rate": 1.4230248971253135e-08, + "loss": 1.6613, + "step": 32341 + }, + { + "epoch": 9.92694904849601, + "grad_norm": 0.17101891338825226, + "learning_rate": 1.4111916183012775e-08, + "loss": 1.746, + "step": 32342 + }, + { + "epoch": 9.927255985267035, + "grad_norm": 0.12958920001983643, + "learning_rate": 1.3994077384615889e-08, + "loss": 1.6704, + "step": 32343 + }, + { + "epoch": 9.92756292203806, + "grad_norm": 0.1180882677435875, + "learning_rate": 1.3876732577228212e-08, + "loss": 1.6892, + "step": 32344 + }, + { + "epoch": 9.927869858809085, + "grad_norm": 0.13923440873622894, + "learning_rate": 1.375988176200438e-08, + "loss": 1.7205, + "step": 32345 + }, + { + "epoch": 9.92817679558011, + "grad_norm": 0.11700796335935593, + "learning_rate": 1.3643524940104569e-08, + "loss": 1.6689, + "step": 32346 + }, + { + "epoch": 9.928483732351136, + "grad_norm": 0.14296385645866394, + "learning_rate": 1.3527662112677863e-08, + "loss": 
1.7583, + "step": 32347 + }, + { + "epoch": 9.928790669122161, + "grad_norm": 0.14136703312397003, + "learning_rate": 1.3412293280867794e-08, + "loss": 1.7217, + "step": 32348 + }, + { + "epoch": 9.929097605893187, + "grad_norm": 0.19926518201828003, + "learning_rate": 1.3297418445817889e-08, + "loss": 1.7181, + "step": 32349 + }, + { + "epoch": 9.92940454266421, + "grad_norm": 0.12384761869907379, + "learning_rate": 1.3183037608660576e-08, + "loss": 1.7106, + "step": 32350 + }, + { + "epoch": 9.929711479435236, + "grad_norm": 0.1384219080209732, + "learning_rate": 1.3069150770528282e-08, + "loss": 1.6729, + "step": 32351 + }, + { + "epoch": 9.930018416206261, + "grad_norm": 0.11504645645618439, + "learning_rate": 1.2955757932542333e-08, + "loss": 1.7014, + "step": 32352 + }, + { + "epoch": 9.930325352977286, + "grad_norm": 0.172870472073555, + "learning_rate": 1.2842859095824056e-08, + "loss": 1.778, + "step": 32353 + }, + { + "epoch": 9.930632289748312, + "grad_norm": 0.13782678544521332, + "learning_rate": 1.2730454261494774e-08, + "loss": 1.7095, + "step": 32354 + }, + { + "epoch": 9.930939226519337, + "grad_norm": 0.12346980720758438, + "learning_rate": 1.2618543430659157e-08, + "loss": 1.6769, + "step": 32355 + }, + { + "epoch": 9.931246163290362, + "grad_norm": 0.10613575577735901, + "learning_rate": 1.2507126604427433e-08, + "loss": 1.6659, + "step": 32356 + }, + { + "epoch": 9.931553100061388, + "grad_norm": 0.16232433915138245, + "learning_rate": 1.2396203783898719e-08, + "loss": 1.741, + "step": 32357 + }, + { + "epoch": 9.931860036832413, + "grad_norm": 0.11868718266487122, + "learning_rate": 1.2285774970166586e-08, + "loss": 1.7018, + "step": 32358 + }, + { + "epoch": 9.932166973603438, + "grad_norm": 0.17840202152729034, + "learning_rate": 1.2175840164330155e-08, + "loss": 1.7294, + "step": 32359 + }, + { + "epoch": 9.932473910374462, + "grad_norm": 0.12258690595626831, + "learning_rate": 1.2066399367466342e-08, + "loss": 1.6724, + "step": 32360 + }, 
+ { + "epoch": 9.932780847145487, + "grad_norm": 0.1263471096754074, + "learning_rate": 1.1957452580663164e-08, + "loss": 1.7157, + "step": 32361 + }, + { + "epoch": 9.933087783916513, + "grad_norm": 0.1078755185008049, + "learning_rate": 1.184899980499754e-08, + "loss": 1.671, + "step": 32362 + }, + { + "epoch": 9.933394720687538, + "grad_norm": 0.15112191438674927, + "learning_rate": 1.1741041041535284e-08, + "loss": 1.7683, + "step": 32363 + }, + { + "epoch": 9.933701657458563, + "grad_norm": 0.08500932902097702, + "learning_rate": 1.163357629134776e-08, + "loss": 1.63, + "step": 32364 + }, + { + "epoch": 9.934008594229589, + "grad_norm": 0.14534896612167358, + "learning_rate": 1.152660555549523e-08, + "loss": 1.7348, + "step": 32365 + }, + { + "epoch": 9.934315531000614, + "grad_norm": 0.107171930372715, + "learning_rate": 1.1420128835037958e-08, + "loss": 1.6783, + "step": 32366 + }, + { + "epoch": 9.93462246777164, + "grad_norm": 0.14311735332012177, + "learning_rate": 1.1314146131030656e-08, + "loss": 1.7189, + "step": 32367 + }, + { + "epoch": 9.934929404542665, + "grad_norm": 0.1567717045545578, + "learning_rate": 1.1208657444511384e-08, + "loss": 1.7471, + "step": 32368 + }, + { + "epoch": 9.93523634131369, + "grad_norm": 0.17283129692077637, + "learning_rate": 1.1103662776523749e-08, + "loss": 1.7218, + "step": 32369 + }, + { + "epoch": 9.935543278084715, + "grad_norm": 0.10981162637472153, + "learning_rate": 1.0999162128116913e-08, + "loss": 1.6921, + "step": 32370 + }, + { + "epoch": 9.93585021485574, + "grad_norm": 0.1108628660440445, + "learning_rate": 1.0895155500312281e-08, + "loss": 1.6755, + "step": 32371 + }, + { + "epoch": 9.936157151626764, + "grad_norm": 0.15141257643699646, + "learning_rate": 1.079164289413681e-08, + "loss": 1.7308, + "step": 32372 + }, + { + "epoch": 9.93646408839779, + "grad_norm": 0.2009151577949524, + "learning_rate": 1.0688624310623007e-08, + "loss": 1.7415, + "step": 32373 + }, + { + "epoch": 9.936771025168815, + 
"grad_norm": 0.12966850399971008, + "learning_rate": 1.0586099750786727e-08, + "loss": 1.7394, + "step": 32374 + }, + { + "epoch": 9.93707796193984, + "grad_norm": 0.13342911005020142, + "learning_rate": 1.048406921563272e-08, + "loss": 1.6998, + "step": 32375 + }, + { + "epoch": 9.937384898710865, + "grad_norm": 0.13602954149246216, + "learning_rate": 1.038253270617684e-08, + "loss": 1.6902, + "step": 32376 + }, + { + "epoch": 9.93769183548189, + "grad_norm": 0.09679561108350754, + "learning_rate": 1.0281490223418289e-08, + "loss": 1.6706, + "step": 32377 + }, + { + "epoch": 9.937998772252916, + "grad_norm": 0.1325666606426239, + "learning_rate": 1.0180941768361817e-08, + "loss": 1.677, + "step": 32378 + }, + { + "epoch": 9.938305709023942, + "grad_norm": 0.18245433270931244, + "learning_rate": 1.0080887341995526e-08, + "loss": 1.7808, + "step": 32379 + }, + { + "epoch": 9.938612645794967, + "grad_norm": 0.22659125924110413, + "learning_rate": 9.981326945313063e-09, + "loss": 1.7197, + "step": 32380 + }, + { + "epoch": 9.938919582565992, + "grad_norm": 0.13232065737247467, + "learning_rate": 9.882260579291425e-09, + "loss": 1.7133, + "step": 32381 + }, + { + "epoch": 9.939226519337016, + "grad_norm": 0.3453350365161896, + "learning_rate": 9.783688244913158e-09, + "loss": 1.8062, + "step": 32382 + }, + { + "epoch": 9.939533456108041, + "grad_norm": 0.15529876947402954, + "learning_rate": 9.685609943155261e-09, + "loss": 1.7438, + "step": 32383 + }, + { + "epoch": 9.939840392879066, + "grad_norm": 0.2087012678384781, + "learning_rate": 9.588025674983626e-09, + "loss": 1.7239, + "step": 32384 + }, + { + "epoch": 9.940147329650092, + "grad_norm": 0.14322242140769958, + "learning_rate": 9.490935441358595e-09, + "loss": 1.7341, + "step": 32385 + }, + { + "epoch": 9.940454266421117, + "grad_norm": 0.11070089042186737, + "learning_rate": 9.394339243251615e-09, + "loss": 1.6735, + "step": 32386 + }, + { + "epoch": 9.940761203192142, + "grad_norm": 0.1307358294725418, + 
"learning_rate": 9.298237081606376e-09, + "loss": 1.6458, + "step": 32387 + }, + { + "epoch": 9.941068139963168, + "grad_norm": 0.21708574891090393, + "learning_rate": 9.202628957377668e-09, + "loss": 1.7589, + "step": 32388 + }, + { + "epoch": 9.941375076734193, + "grad_norm": 0.12621861696243286, + "learning_rate": 9.107514871509182e-09, + "loss": 1.7402, + "step": 32389 + }, + { + "epoch": 9.941682013505218, + "grad_norm": 0.13067953288555145, + "learning_rate": 9.012894824939056e-09, + "loss": 1.6896, + "step": 32390 + }, + { + "epoch": 9.941988950276244, + "grad_norm": 0.18594002723693848, + "learning_rate": 8.918768818605427e-09, + "loss": 1.7461, + "step": 32391 + }, + { + "epoch": 9.942295887047269, + "grad_norm": 0.17440444231033325, + "learning_rate": 8.825136853435333e-09, + "loss": 1.6969, + "step": 32392 + }, + { + "epoch": 9.942602823818293, + "grad_norm": 0.12859460711479187, + "learning_rate": 8.731998930361363e-09, + "loss": 1.724, + "step": 32393 + }, + { + "epoch": 9.942909760589318, + "grad_norm": 0.14894992113113403, + "learning_rate": 8.639355050293896e-09, + "loss": 1.6802, + "step": 32394 + }, + { + "epoch": 9.943216697360343, + "grad_norm": 0.16252176463603973, + "learning_rate": 8.54720521415442e-09, + "loss": 1.7391, + "step": 32395 + }, + { + "epoch": 9.943523634131369, + "grad_norm": 0.18194718658924103, + "learning_rate": 8.455549422853315e-09, + "loss": 1.7539, + "step": 32396 + }, + { + "epoch": 9.943830570902394, + "grad_norm": 0.1416047215461731, + "learning_rate": 8.364387677295415e-09, + "loss": 1.7356, + "step": 32397 + }, + { + "epoch": 9.94413750767342, + "grad_norm": 0.1490311175584793, + "learning_rate": 8.27371997838e-09, + "loss": 1.7323, + "step": 32398 + }, + { + "epoch": 9.944444444444445, + "grad_norm": 0.1581144481897354, + "learning_rate": 8.183546327006353e-09, + "loss": 1.6542, + "step": 32399 + }, + { + "epoch": 9.94475138121547, + "grad_norm": 0.16656135022640228, + "learning_rate": 8.09386672406265e-09, + 
"loss": 1.7551, + "step": 32400 + }, + { + "epoch": 9.945058317986495, + "grad_norm": 0.1854424625635147, + "learning_rate": 8.004681170437067e-09, + "loss": 1.7196, + "step": 32401 + }, + { + "epoch": 9.94536525475752, + "grad_norm": 0.15835405886173248, + "learning_rate": 7.915989667006685e-09, + "loss": 1.6704, + "step": 32402 + }, + { + "epoch": 9.945672191528544, + "grad_norm": 0.12599913775920868, + "learning_rate": 7.82779221465968e-09, + "loss": 1.7035, + "step": 32403 + }, + { + "epoch": 9.94597912829957, + "grad_norm": 0.16099520027637482, + "learning_rate": 7.740088814256475e-09, + "loss": 1.6739, + "step": 32404 + }, + { + "epoch": 9.946286065070595, + "grad_norm": 0.12222954630851746, + "learning_rate": 7.652879466663043e-09, + "loss": 1.6991, + "step": 32405 + }, + { + "epoch": 9.94659300184162, + "grad_norm": 0.12297282367944717, + "learning_rate": 7.566164172750911e-09, + "loss": 1.6848, + "step": 32406 + }, + { + "epoch": 9.946899938612646, + "grad_norm": 0.12966325879096985, + "learning_rate": 7.479942933369399e-09, + "loss": 1.6989, + "step": 32407 + }, + { + "epoch": 9.94720687538367, + "grad_norm": 0.12732411921024323, + "learning_rate": 7.394215749367828e-09, + "loss": 1.6812, + "step": 32408 + }, + { + "epoch": 9.947513812154696, + "grad_norm": 0.11722669005393982, + "learning_rate": 7.308982621606619e-09, + "loss": 1.6813, + "step": 32409 + }, + { + "epoch": 9.947820748925722, + "grad_norm": 0.13928887248039246, + "learning_rate": 7.22424355091289e-09, + "loss": 1.6754, + "step": 32410 + }, + { + "epoch": 9.948127685696747, + "grad_norm": 0.1875402331352234, + "learning_rate": 7.139998538135961e-09, + "loss": 1.7694, + "step": 32411 + }, + { + "epoch": 9.948434622467772, + "grad_norm": 0.10383447259664536, + "learning_rate": 7.0562475841029485e-09, + "loss": 1.6683, + "step": 32412 + }, + { + "epoch": 9.948741559238798, + "grad_norm": 0.15648451447486877, + "learning_rate": 6.972990689635417e-09, + "loss": 1.7515, + "step": 32413 + }, + { + 
"epoch": 9.949048496009823, + "grad_norm": 0.15558132529258728, + "learning_rate": 6.890227855571585e-09, + "loss": 1.7311, + "step": 32414 + }, + { + "epoch": 9.949355432780846, + "grad_norm": 0.1698763221502304, + "learning_rate": 6.8079590827163645e-09, + "loss": 1.7688, + "step": 32415 + }, + { + "epoch": 9.949662369551872, + "grad_norm": 0.11423872411251068, + "learning_rate": 6.726184371885769e-09, + "loss": 1.6848, + "step": 32416 + }, + { + "epoch": 9.949969306322897, + "grad_norm": 0.11946321278810501, + "learning_rate": 6.6449037238902615e-09, + "loss": 1.6749, + "step": 32417 + }, + { + "epoch": 9.950276243093922, + "grad_norm": 0.16556024551391602, + "learning_rate": 6.564117139529202e-09, + "loss": 1.7386, + "step": 32418 + }, + { + "epoch": 9.950583179864948, + "grad_norm": 0.13177451491355896, + "learning_rate": 6.483824619607504e-09, + "loss": 1.691, + "step": 32419 + }, + { + "epoch": 9.950890116635973, + "grad_norm": 0.10789786279201508, + "learning_rate": 6.404026164913424e-09, + "loss": 1.6424, + "step": 32420 + }, + { + "epoch": 9.951197053406998, + "grad_norm": 0.1662123203277588, + "learning_rate": 6.32472177623522e-09, + "loss": 1.7511, + "step": 32421 + }, + { + "epoch": 9.951503990178024, + "grad_norm": 0.12785036861896515, + "learning_rate": 6.245911454361153e-09, + "loss": 1.6522, + "step": 32422 + }, + { + "epoch": 9.95181092694905, + "grad_norm": 0.12330711632966995, + "learning_rate": 6.167595200062825e-09, + "loss": 1.6776, + "step": 32423 + }, + { + "epoch": 9.952117863720074, + "grad_norm": 0.09788266569375992, + "learning_rate": 6.089773014122946e-09, + "loss": 1.6613, + "step": 32424 + }, + { + "epoch": 9.952424800491098, + "grad_norm": 0.18258565664291382, + "learning_rate": 6.0124448973075675e-09, + "loss": 1.6977, + "step": 32425 + }, + { + "epoch": 9.952731737262123, + "grad_norm": 0.13971279561519623, + "learning_rate": 5.935610850377194e-09, + "loss": 1.7138, + "step": 32426 + }, + { + "epoch": 9.953038674033149, + 
"grad_norm": 0.1644059717655182, + "learning_rate": 5.859270874092326e-09, + "loss": 1.6821, + "step": 32427 + }, + { + "epoch": 9.953345610804174, + "grad_norm": 0.20486295223236084, + "learning_rate": 5.783424969207918e-09, + "loss": 1.715, + "step": 32428 + }, + { + "epoch": 9.9536525475752, + "grad_norm": 0.16751137375831604, + "learning_rate": 5.7080731364733684e-09, + "loss": 1.75, + "step": 32429 + }, + { + "epoch": 9.953959484346225, + "grad_norm": 0.10631072521209717, + "learning_rate": 5.633215376638079e-09, + "loss": 1.6589, + "step": 32430 + }, + { + "epoch": 9.95426642111725, + "grad_norm": 0.12222696095705032, + "learning_rate": 5.5588516904403475e-09, + "loss": 1.6957, + "step": 32431 + }, + { + "epoch": 9.954573357888275, + "grad_norm": 0.1868433952331543, + "learning_rate": 5.48498207860737e-09, + "loss": 1.7432, + "step": 32432 + }, + { + "epoch": 9.9548802946593, + "grad_norm": 0.12732042372226715, + "learning_rate": 5.411606541877446e-09, + "loss": 1.6819, + "step": 32433 + }, + { + "epoch": 9.955187231430326, + "grad_norm": 0.1370798945426941, + "learning_rate": 5.338725080972218e-09, + "loss": 1.711, + "step": 32434 + }, + { + "epoch": 9.955494168201351, + "grad_norm": 0.13998152315616608, + "learning_rate": 5.266337696607782e-09, + "loss": 1.687, + "step": 32435 + }, + { + "epoch": 9.955801104972375, + "grad_norm": 0.08052362501621246, + "learning_rate": 5.194444389511333e-09, + "loss": 1.629, + "step": 32436 + }, + { + "epoch": 9.9561080417434, + "grad_norm": 0.1393222063779831, + "learning_rate": 5.123045160382311e-09, + "loss": 1.7062, + "step": 32437 + }, + { + "epoch": 9.956414978514426, + "grad_norm": 0.1532362848520279, + "learning_rate": 5.0521400099312614e-09, + "loss": 1.7417, + "step": 32438 + }, + { + "epoch": 9.956721915285451, + "grad_norm": 0.12600642442703247, + "learning_rate": 4.9817289388576215e-09, + "loss": 1.6931, + "step": 32439 + }, + { + "epoch": 9.957028852056476, + "grad_norm": 0.10447245836257935, + 
"learning_rate": 4.911811947860834e-09, + "loss": 1.6598, + "step": 32440 + }, + { + "epoch": 9.957335788827502, + "grad_norm": 0.13217110931873322, + "learning_rate": 4.842389037623684e-09, + "loss": 1.704, + "step": 32441 + }, + { + "epoch": 9.957642725598527, + "grad_norm": 0.13356858491897583, + "learning_rate": 4.773460208840064e-09, + "loss": 1.7246, + "step": 32442 + }, + { + "epoch": 9.957949662369552, + "grad_norm": 0.09947375953197479, + "learning_rate": 4.705025462187207e-09, + "loss": 1.649, + "step": 32443 + }, + { + "epoch": 9.958256599140578, + "grad_norm": 0.12298106402158737, + "learning_rate": 4.63708479834235e-09, + "loss": 1.7012, + "step": 32444 + }, + { + "epoch": 9.958563535911603, + "grad_norm": 0.11899507790803909, + "learning_rate": 4.569638217977179e-09, + "loss": 1.662, + "step": 32445 + }, + { + "epoch": 9.958870472682626, + "grad_norm": 0.10871480405330658, + "learning_rate": 4.502685721757827e-09, + "loss": 1.6718, + "step": 32446 + }, + { + "epoch": 9.959177409453652, + "grad_norm": 0.17709551751613617, + "learning_rate": 4.436227310344876e-09, + "loss": 1.755, + "step": 32447 + }, + { + "epoch": 9.959484346224677, + "grad_norm": 0.1668638288974762, + "learning_rate": 4.3702629843989095e-09, + "loss": 1.7693, + "step": 32448 + }, + { + "epoch": 9.959791282995702, + "grad_norm": 0.1600068211555481, + "learning_rate": 4.304792744569408e-09, + "loss": 1.6912, + "step": 32449 + }, + { + "epoch": 9.960098219766728, + "grad_norm": 0.10293091088533401, + "learning_rate": 4.2398165915003e-09, + "loss": 1.6801, + "step": 32450 + }, + { + "epoch": 9.960405156537753, + "grad_norm": 0.16815342009067535, + "learning_rate": 4.175334525841068e-09, + "loss": 1.7739, + "step": 32451 + }, + { + "epoch": 9.960712093308778, + "grad_norm": 0.08967147767543793, + "learning_rate": 4.111346548218986e-09, + "loss": 1.6412, + "step": 32452 + }, + { + "epoch": 9.961019030079804, + "grad_norm": 0.13091377913951874, + "learning_rate": 4.047852659277984e-09, + 
"loss": 1.7352, + "step": 32453 + }, + { + "epoch": 9.96132596685083, + "grad_norm": 0.1545487344264984, + "learning_rate": 3.9848528596397875e-09, + "loss": 1.6858, + "step": 32454 + }, + { + "epoch": 9.961632903621854, + "grad_norm": 0.1344275325536728, + "learning_rate": 3.9223471499205685e-09, + "loss": 1.7114, + "step": 32455 + }, + { + "epoch": 9.96193984039288, + "grad_norm": 0.13304302096366882, + "learning_rate": 3.860335530747605e-09, + "loss": 1.7032, + "step": 32456 + }, + { + "epoch": 9.962246777163905, + "grad_norm": 0.1125492975115776, + "learning_rate": 3.798818002731519e-09, + "loss": 1.7028, + "step": 32457 + }, + { + "epoch": 9.962553713934929, + "grad_norm": 0.12480182945728302, + "learning_rate": 3.7377945664773815e-09, + "loss": 1.7062, + "step": 32458 + }, + { + "epoch": 9.962860650705954, + "grad_norm": 0.15090548992156982, + "learning_rate": 3.677265222595816e-09, + "loss": 1.7534, + "step": 32459 + }, + { + "epoch": 9.96316758747698, + "grad_norm": 0.15135593712329865, + "learning_rate": 3.61722997167524e-09, + "loss": 1.7628, + "step": 32460 + }, + { + "epoch": 9.963474524248005, + "grad_norm": 0.14519482851028442, + "learning_rate": 3.5576888143096232e-09, + "loss": 1.6912, + "step": 32461 + }, + { + "epoch": 9.96378146101903, + "grad_norm": 0.13425637781620026, + "learning_rate": 3.4986417510929347e-09, + "loss": 1.6583, + "step": 32462 + }, + { + "epoch": 9.964088397790055, + "grad_norm": 0.15377762913703918, + "learning_rate": 3.440088782608042e-09, + "loss": 1.7223, + "step": 32463 + }, + { + "epoch": 9.96439533456108, + "grad_norm": 0.1342972368001938, + "learning_rate": 3.3820299094322604e-09, + "loss": 1.6861, + "step": 32464 + }, + { + "epoch": 9.964702271332106, + "grad_norm": 0.5013613700866699, + "learning_rate": 3.3244651321373555e-09, + "loss": 1.7253, + "step": 32465 + }, + { + "epoch": 9.965009208103131, + "grad_norm": 0.09804642200469971, + "learning_rate": 3.2673944512950915e-09, + "loss": 1.657, + "step": 32466 + }, + { 
+ "epoch": 9.965316144874157, + "grad_norm": 0.12614911794662476, + "learning_rate": 3.210817867471683e-09, + "loss": 1.6729, + "step": 32467 + }, + { + "epoch": 9.96562308164518, + "grad_norm": 0.09813954681158066, + "learning_rate": 3.154735381216689e-09, + "loss": 1.6665, + "step": 32468 + }, + { + "epoch": 9.965930018416206, + "grad_norm": 0.11637084931135178, + "learning_rate": 3.0991469930963244e-09, + "loss": 1.6882, + "step": 32469 + }, + { + "epoch": 9.966236955187231, + "grad_norm": 0.11884592473506927, + "learning_rate": 3.0440527036490474e-09, + "loss": 1.6898, + "step": 32470 + }, + { + "epoch": 9.966543891958256, + "grad_norm": 0.11903903633356094, + "learning_rate": 2.989452513429969e-09, + "loss": 1.7055, + "step": 32471 + }, + { + "epoch": 9.966850828729282, + "grad_norm": 0.11886774003505707, + "learning_rate": 2.9353464229719962e-09, + "loss": 1.681, + "step": 32472 + }, + { + "epoch": 9.967157765500307, + "grad_norm": 0.1473800390958786, + "learning_rate": 2.8817344328080365e-09, + "loss": 1.7348, + "step": 32473 + }, + { + "epoch": 9.967464702271332, + "grad_norm": 0.16137374937534332, + "learning_rate": 2.8286165434709967e-09, + "loss": 1.7044, + "step": 32474 + }, + { + "epoch": 9.967771639042358, + "grad_norm": 0.20311129093170166, + "learning_rate": 2.7759927554882326e-09, + "loss": 1.791, + "step": 32475 + }, + { + "epoch": 9.968078575813383, + "grad_norm": 0.20193049311637878, + "learning_rate": 2.723863069375998e-09, + "loss": 1.7007, + "step": 32476 + }, + { + "epoch": 9.968385512584408, + "grad_norm": 0.208475723862648, + "learning_rate": 2.672227485656098e-09, + "loss": 1.739, + "step": 32477 + }, + { + "epoch": 9.968692449355434, + "grad_norm": 0.1389041393995285, + "learning_rate": 2.6210860048281325e-09, + "loss": 1.6907, + "step": 32478 + }, + { + "epoch": 9.968999386126457, + "grad_norm": 0.13423459231853485, + "learning_rate": 2.5704386274028046e-09, + "loss": 1.6886, + "step": 32479 + }, + { + "epoch": 9.969306322897483, + 
"grad_norm": 0.10988432168960571, + "learning_rate": 2.5202853538797144e-09, + "loss": 1.6567, + "step": 32480 + }, + { + "epoch": 9.969613259668508, + "grad_norm": 0.1565593034029007, + "learning_rate": 2.470626184758462e-09, + "loss": 1.7901, + "step": 32481 + }, + { + "epoch": 9.969920196439533, + "grad_norm": 0.14571799337863922, + "learning_rate": 2.421461120527546e-09, + "loss": 1.7062, + "step": 32482 + }, + { + "epoch": 9.970227133210559, + "grad_norm": 0.11386661976575851, + "learning_rate": 2.3727901616699134e-09, + "loss": 1.6825, + "step": 32483 + }, + { + "epoch": 9.970534069981584, + "grad_norm": 0.11852385848760605, + "learning_rate": 2.3246133086740617e-09, + "loss": 1.6912, + "step": 32484 + }, + { + "epoch": 9.97084100675261, + "grad_norm": 0.10368720442056656, + "learning_rate": 2.276930562006285e-09, + "loss": 1.6892, + "step": 32485 + }, + { + "epoch": 9.971147943523635, + "grad_norm": 0.13299435377120972, + "learning_rate": 2.22974192214398e-09, + "loss": 1.7124, + "step": 32486 + }, + { + "epoch": 9.97145488029466, + "grad_norm": 0.132483571767807, + "learning_rate": 2.1830473895478875e-09, + "loss": 1.6835, + "step": 32487 + }, + { + "epoch": 9.971761817065685, + "grad_norm": 0.11850076913833618, + "learning_rate": 2.1368469646898537e-09, + "loss": 1.6911, + "step": 32488 + }, + { + "epoch": 9.972068753836709, + "grad_norm": 0.15215659141540527, + "learning_rate": 2.091140648013967e-09, + "loss": 1.7441, + "step": 32489 + }, + { + "epoch": 9.972375690607734, + "grad_norm": 0.17135567963123322, + "learning_rate": 2.045928439980971e-09, + "loss": 1.7527, + "step": 32490 + }, + { + "epoch": 9.97268262737876, + "grad_norm": 0.185814768075943, + "learning_rate": 2.0012103410349537e-09, + "loss": 1.748, + "step": 32491 + }, + { + "epoch": 9.972989564149785, + "grad_norm": 0.15464171767234802, + "learning_rate": 1.956986351620005e-09, + "loss": 1.7463, + "step": 32492 + }, + { + "epoch": 9.97329650092081, + "grad_norm": 0.16765901446342468, + 
"learning_rate": 1.9132564721691114e-09, + "loss": 1.7198, + "step": 32493 + }, + { + "epoch": 9.973603437691835, + "grad_norm": 0.14002041518688202, + "learning_rate": 1.8700207031152606e-09, + "loss": 1.7073, + "step": 32494 + }, + { + "epoch": 9.97391037446286, + "grad_norm": 0.1588892936706543, + "learning_rate": 1.8272790448858879e-09, + "loss": 1.7064, + "step": 32495 + }, + { + "epoch": 9.974217311233886, + "grad_norm": 0.12804476916790009, + "learning_rate": 1.7850314979084292e-09, + "loss": 1.7077, + "step": 32496 + }, + { + "epoch": 9.974524248004911, + "grad_norm": 0.1506362110376358, + "learning_rate": 1.7432780625936672e-09, + "loss": 1.7021, + "step": 32497 + }, + { + "epoch": 9.974831184775937, + "grad_norm": 0.11829007416963577, + "learning_rate": 1.7020187393579356e-09, + "loss": 1.6897, + "step": 32498 + }, + { + "epoch": 9.975138121546962, + "grad_norm": 0.11921420693397522, + "learning_rate": 1.6612535286064656e-09, + "loss": 1.6818, + "step": 32499 + }, + { + "epoch": 9.975445058317986, + "grad_norm": 0.14553767442703247, + "learning_rate": 1.6209824307444888e-09, + "loss": 1.6768, + "step": 32500 + }, + { + "epoch": 9.975751995089011, + "grad_norm": 0.13848000764846802, + "learning_rate": 1.581205446166134e-09, + "loss": 1.7069, + "step": 32501 + }, + { + "epoch": 9.976058931860036, + "grad_norm": 0.1786017268896103, + "learning_rate": 1.541922575265531e-09, + "loss": 1.784, + "step": 32502 + }, + { + "epoch": 9.976365868631062, + "grad_norm": 0.1475924253463745, + "learning_rate": 1.5031338184368082e-09, + "loss": 1.716, + "step": 32503 + }, + { + "epoch": 9.976672805402087, + "grad_norm": 0.13834452629089355, + "learning_rate": 1.464839176062993e-09, + "loss": 1.6978, + "step": 32504 + }, + { + "epoch": 9.976979742173112, + "grad_norm": 0.20726680755615234, + "learning_rate": 1.4270386485104591e-09, + "loss": 1.8085, + "step": 32505 + }, + { + "epoch": 9.977286678944138, + "grad_norm": 0.13717865943908691, + "learning_rate": 
1.3897322361677845e-09, + "loss": 1.7291, + "step": 32506 + }, + { + "epoch": 9.977593615715163, + "grad_norm": 0.14107772707939148, + "learning_rate": 1.3529199393902404e-09, + "loss": 1.745, + "step": 32507 + }, + { + "epoch": 9.977900552486188, + "grad_norm": 0.1317019909620285, + "learning_rate": 1.3166017585553026e-09, + "loss": 1.727, + "step": 32508 + }, + { + "epoch": 9.978207489257214, + "grad_norm": 0.1404808610677719, + "learning_rate": 1.2807776940126915e-09, + "loss": 1.758, + "step": 32509 + }, + { + "epoch": 9.978514426028239, + "grad_norm": 0.1774541139602661, + "learning_rate": 1.2454477461176784e-09, + "loss": 1.7361, + "step": 32510 + }, + { + "epoch": 9.978821362799263, + "grad_norm": 0.153715580701828, + "learning_rate": 1.2106119152199835e-09, + "loss": 1.7816, + "step": 32511 + }, + { + "epoch": 9.979128299570288, + "grad_norm": 0.15706269443035126, + "learning_rate": 1.1762702016637762e-09, + "loss": 1.7328, + "step": 32512 + }, + { + "epoch": 9.979435236341313, + "grad_norm": 0.13986578583717346, + "learning_rate": 1.142422605787674e-09, + "loss": 1.7263, + "step": 32513 + }, + { + "epoch": 9.979742173112339, + "grad_norm": 0.12627844512462616, + "learning_rate": 1.1090691279302957e-09, + "loss": 1.6769, + "step": 32514 + }, + { + "epoch": 9.980049109883364, + "grad_norm": 0.1719161719083786, + "learning_rate": 1.0762097684191563e-09, + "loss": 1.7306, + "step": 32515 + }, + { + "epoch": 9.98035604665439, + "grad_norm": 0.11803223192691803, + "learning_rate": 1.043844527576221e-09, + "loss": 1.6714, + "step": 32516 + }, + { + "epoch": 9.980662983425415, + "grad_norm": 0.18038241565227509, + "learning_rate": 1.0119734057234542e-09, + "loss": 1.7246, + "step": 32517 + }, + { + "epoch": 9.98096992019644, + "grad_norm": 0.11659211665391922, + "learning_rate": 9.805964031717186e-10, + "loss": 1.7338, + "step": 32518 + }, + { + "epoch": 9.981276856967465, + "grad_norm": 0.13037735223770142, + "learning_rate": 9.497135202429785e-10, + "loss": 
1.6738, + "step": 32519 + }, + { + "epoch": 9.98158379373849, + "grad_norm": 0.1371074765920639, + "learning_rate": 9.193247572314434e-10, + "loss": 1.7392, + "step": 32520 + }, + { + "epoch": 9.981890730509516, + "grad_norm": 0.11290184408426285, + "learning_rate": 8.894301144368733e-10, + "loss": 1.6831, + "step": 32521 + }, + { + "epoch": 9.98219766728054, + "grad_norm": 0.12521374225616455, + "learning_rate": 8.600295921590285e-10, + "loss": 1.6805, + "step": 32522 + }, + { + "epoch": 9.982504604051565, + "grad_norm": 0.15203195810317993, + "learning_rate": 8.31123190692118e-10, + "loss": 1.6945, + "step": 32523 + }, + { + "epoch": 9.98281154082259, + "grad_norm": 0.153851717710495, + "learning_rate": 8.027109103136976e-10, + "loss": 1.7082, + "step": 32524 + }, + { + "epoch": 9.983118477593615, + "grad_norm": 0.16608181595802307, + "learning_rate": 7.747927513124254e-10, + "loss": 1.6922, + "step": 32525 + }, + { + "epoch": 9.98342541436464, + "grad_norm": 0.12815186381340027, + "learning_rate": 7.473687139547548e-10, + "loss": 1.6925, + "step": 32526 + }, + { + "epoch": 9.983732351135666, + "grad_norm": 0.16700038313865662, + "learning_rate": 7.204387985237926e-10, + "loss": 1.7956, + "step": 32527 + }, + { + "epoch": 9.984039287906691, + "grad_norm": 0.1810149997472763, + "learning_rate": 6.940030052748902e-10, + "loss": 1.7493, + "step": 32528 + }, + { + "epoch": 9.984346224677717, + "grad_norm": 0.14241847395896912, + "learning_rate": 6.680613344745013e-10, + "loss": 1.693, + "step": 32529 + }, + { + "epoch": 9.984653161448742, + "grad_norm": 0.1244758740067482, + "learning_rate": 6.426137863779768e-10, + "loss": 1.7357, + "step": 32530 + }, + { + "epoch": 9.984960098219767, + "grad_norm": 0.12879830598831177, + "learning_rate": 6.176603612351173e-10, + "loss": 1.7195, + "step": 32531 + }, + { + "epoch": 9.985267034990791, + "grad_norm": 0.16025955975055695, + "learning_rate": 5.932010592901715e-10, + "loss": 1.7286, + "step": 32532 + }, + { + "epoch": 
9.985573971761816, + "grad_norm": 0.13949505984783173, + "learning_rate": 5.69235880798491e-10, + "loss": 1.7682, + "step": 32533 + }, + { + "epoch": 9.985880908532842, + "grad_norm": 0.10753419995307922, + "learning_rate": 5.457648259821202e-10, + "loss": 1.6935, + "step": 32534 + }, + { + "epoch": 9.986187845303867, + "grad_norm": 0.1572565734386444, + "learning_rate": 5.22787895074206e-10, + "loss": 1.6922, + "step": 32535 + }, + { + "epoch": 9.986494782074892, + "grad_norm": 0.1194307953119278, + "learning_rate": 5.003050883134464e-10, + "loss": 1.7086, + "step": 32536 + }, + { + "epoch": 9.986801718845918, + "grad_norm": 0.13817691802978516, + "learning_rate": 4.783164059107836e-10, + "loss": 1.7197, + "step": 32537 + }, + { + "epoch": 9.987108655616943, + "grad_norm": 0.12894465029239655, + "learning_rate": 4.5682184808271135e-10, + "loss": 1.699, + "step": 32538 + }, + { + "epoch": 9.987415592387968, + "grad_norm": 0.16827619075775146, + "learning_rate": 4.3582141505127403e-10, + "loss": 1.7492, + "step": 32539 + }, + { + "epoch": 9.987722529158994, + "grad_norm": 0.1456209421157837, + "learning_rate": 4.153151070163119e-10, + "loss": 1.7054, + "step": 32540 + }, + { + "epoch": 9.988029465930019, + "grad_norm": 0.11338894069194794, + "learning_rate": 3.9530292418321624e-10, + "loss": 1.6704, + "step": 32541 + }, + { + "epoch": 9.988336402701044, + "grad_norm": 0.19609282910823822, + "learning_rate": 3.757848667518271e-10, + "loss": 1.7457, + "step": 32542 + }, + { + "epoch": 9.988643339472068, + "grad_norm": 0.10050071775913239, + "learning_rate": 3.5676093491088245e-10, + "loss": 1.6396, + "step": 32543 + }, + { + "epoch": 9.988950276243093, + "grad_norm": 0.13579551875591278, + "learning_rate": 3.3823112884912024e-10, + "loss": 1.7123, + "step": 32544 + }, + { + "epoch": 9.989257213014119, + "grad_norm": 0.10279065370559692, + "learning_rate": 3.201954487552783e-10, + "loss": 1.6712, + "step": 32545 + }, + { + "epoch": 9.989564149785144, + "grad_norm": 
0.1435621976852417, + "learning_rate": 3.0265389479589015e-10, + "loss": 1.6891, + "step": 32546 + }, + { + "epoch": 9.98987108655617, + "grad_norm": 0.14991097152233124, + "learning_rate": 2.8560646715969365e-10, + "loss": 1.7021, + "step": 32547 + }, + { + "epoch": 9.990178023327195, + "grad_norm": 0.08986492455005646, + "learning_rate": 2.6905316600212006e-10, + "loss": 1.636, + "step": 32548 + }, + { + "epoch": 9.99048496009822, + "grad_norm": 0.1458725482225418, + "learning_rate": 2.529939914897028e-10, + "loss": 1.7235, + "step": 32549 + }, + { + "epoch": 9.990791896869245, + "grad_norm": 0.12907643616199493, + "learning_rate": 2.374289437834243e-10, + "loss": 1.6899, + "step": 32550 + }, + { + "epoch": 9.99109883364027, + "grad_norm": 0.12231683731079102, + "learning_rate": 2.2235802303871567e-10, + "loss": 1.775, + "step": 32551 + }, + { + "epoch": 9.991405770411296, + "grad_norm": 0.11932116001844406, + "learning_rate": 2.077812294054571e-10, + "loss": 1.7045, + "step": 32552 + }, + { + "epoch": 9.99171270718232, + "grad_norm": 0.10503232479095459, + "learning_rate": 1.9369856301687528e-10, + "loss": 1.6758, + "step": 32553 + }, + { + "epoch": 9.992019643953345, + "grad_norm": 0.12878039479255676, + "learning_rate": 1.8011002402840148e-10, + "loss": 1.7219, + "step": 32554 + }, + { + "epoch": 9.99232658072437, + "grad_norm": 0.13469719886779785, + "learning_rate": 1.6701561256216025e-10, + "loss": 1.7188, + "step": 32555 + }, + { + "epoch": 9.992633517495396, + "grad_norm": 0.14357881247997284, + "learning_rate": 1.544153287513783e-10, + "loss": 1.7209, + "step": 32556 + }, + { + "epoch": 9.99294045426642, + "grad_norm": 0.11817539483308792, + "learning_rate": 1.4230917271818022e-10, + "loss": 1.6481, + "step": 32557 + }, + { + "epoch": 9.993247391037446, + "grad_norm": 0.13635072112083435, + "learning_rate": 1.306971445846905e-10, + "loss": 1.7076, + "step": 32558 + }, + { + "epoch": 9.993554327808472, + "grad_norm": 0.11874140799045563, + 
"learning_rate": 1.195792444674826e-10, + "loss": 1.6623, + "step": 32559 + }, + { + "epoch": 9.993861264579497, + "grad_norm": 0.15637235343456268, + "learning_rate": 1.0895547247757875e-10, + "loss": 1.7316, + "step": 32560 + }, + { + "epoch": 9.994168201350522, + "grad_norm": 0.11796044558286667, + "learning_rate": 9.882582870934798e-11, + "loss": 1.6645, + "step": 32561 + }, + { + "epoch": 9.994475138121548, + "grad_norm": 0.12259721755981445, + "learning_rate": 8.919031327936367e-11, + "loss": 1.6972, + "step": 32562 + }, + { + "epoch": 9.994782074892573, + "grad_norm": 0.12288567423820496, + "learning_rate": 8.004892626534144e-11, + "loss": 1.6727, + "step": 32563 + }, + { + "epoch": 9.995089011663598, + "grad_norm": 0.13289032876491547, + "learning_rate": 7.140166777275248e-11, + "loss": 1.7406, + "step": 32564 + }, + { + "epoch": 9.995395948434622, + "grad_norm": 0.15300492942333221, + "learning_rate": 6.324853787376128e-11, + "loss": 1.6974, + "step": 32565 + }, + { + "epoch": 9.995702885205647, + "grad_norm": 0.1378776878118515, + "learning_rate": 5.5589536662736805e-11, + "loss": 1.7007, + "step": 32566 + }, + { + "epoch": 9.996009821976672, + "grad_norm": 0.12087785452604294, + "learning_rate": 4.842466420629244e-11, + "loss": 1.681, + "step": 32567 + }, + { + "epoch": 9.996316758747698, + "grad_norm": 0.14573785662651062, + "learning_rate": 4.175392057659266e-11, + "loss": 1.7968, + "step": 32568 + }, + { + "epoch": 9.996623695518723, + "grad_norm": 0.1530621200799942, + "learning_rate": 3.557730583469976e-11, + "loss": 1.6813, + "step": 32569 + }, + { + "epoch": 9.996930632289748, + "grad_norm": 0.17444150149822235, + "learning_rate": 2.989482005277822e-11, + "loss": 1.6974, + "step": 32570 + }, + { + "epoch": 9.997237569060774, + "grad_norm": 0.09050992876291275, + "learning_rate": 2.4706463280788073e-11, + "loss": 1.6643, + "step": 32571 + }, + { + "epoch": 9.997544505831799, + "grad_norm": 0.11656873673200607, + "learning_rate": 
2.001223557424048e-11, + "loss": 1.6748, + "step": 32572 + }, + { + "epoch": 9.997851442602824, + "grad_norm": 0.14353762567043304, + "learning_rate": 1.5812136971993242e-11, + "loss": 1.7003, + "step": 32573 + }, + { + "epoch": 9.99815837937385, + "grad_norm": 0.10622245073318481, + "learning_rate": 1.210616751845528e-11, + "loss": 1.6748, + "step": 32574 + }, + { + "epoch": 9.998465316144873, + "grad_norm": 0.1387772560119629, + "learning_rate": 8.894327252484403e-12, + "loss": 1.7133, + "step": 32575 + }, + { + "epoch": 9.998772252915899, + "grad_norm": 0.11832106858491898, + "learning_rate": 6.1766162018361824e-12, + "loss": 1.6548, + "step": 32576 + }, + { + "epoch": 9.999079189686924, + "grad_norm": 0.14122611284255981, + "learning_rate": 3.953034399817312e-12, + "loss": 1.737, + "step": 32577 + }, + { + "epoch": 9.99938612645795, + "grad_norm": 0.22168445587158203, + "learning_rate": 2.2235818630811368e-12, + "loss": 1.7414, + "step": 32578 + }, + { + "epoch": 9.999693063228975, + "grad_norm": 0.1424793303012848, + "learning_rate": 9.88258608281001e-13, + "loss": 1.6799, + "step": 32579 + }, + { + "epoch": 10.0, + "grad_norm": 0.1208924725651741, + "learning_rate": 2.4706465207025023e-13, + "loss": 1.6843, + "step": 32580 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.213259355550921e+21, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-32580/training_args.bin b/checkpoint-32580/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..b6af1535a0e2165d1952d5e6c3ab94ece29b3d2e --- /dev/null +++ b/checkpoint-32580/training_args.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:e9c7985017961b4019da284385f3d0f4b95f69682383f9cf12749177ae67aa87 +size 7288 diff --git a/checkpoint-32580/zero_to_fp32.py b/checkpoint-32580/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-32580/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-6516/config.json b/checkpoint-6516/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-6516/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-6516/generation_config.json b/checkpoint-6516/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-6516/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-6516/latest b/checkpoint-6516/latest new file mode 100644 index 0000000000000000000000000000000000000000..efd511b6fe7d764318a7fa4882b86f041d735c1d --- /dev/null +++ b/checkpoint-6516/latest @@ -0,0 +1 @@ +global_step6516 \ No newline at end of file diff --git a/checkpoint-6516/model-00001-of-00003.safetensors b/checkpoint-6516/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..95fcc39b717208dd9423922b4ff40a2ac3d9c860 --- /dev/null +++ b/checkpoint-6516/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82e9733b9d94e6bd587c89b261fa39a972ad5066df5b1bdc1279f75268ee1843 +size 4955415870 diff --git a/checkpoint-6516/model-00002-of-00003.safetensors b/checkpoint-6516/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-6516/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d +size 
4971563008 diff --git a/checkpoint-6516/model-00003-of-00003.safetensors b/checkpoint-6516/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3200d079e0d2997a2e171f93cbaeda3ca66581fe --- /dev/null +++ b/checkpoint-6516/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24f85fae836bd0b95d8329784b220fa4699b8013a4e5f881c24a8ea3969c1c76 +size 4180840856 diff --git a/checkpoint-6516/model.safetensors.index.json b/checkpoint-6516/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-6516/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-6516/rng_state_0.pth b/checkpoint-6516/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..41a080f53ce537d6c9ca1ea8dff20cf3cda35b92 --- /dev/null +++ b/checkpoint-6516/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e5a76dcf6a1848d7193d0a64c511f633dd61dd697ff77551d4ad2021d47f861 +size 15984 diff --git a/checkpoint-6516/rng_state_1.pth b/checkpoint-6516/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..34814f26d42c732b3434f50adaafd1d17a0aad44 --- /dev/null +++ b/checkpoint-6516/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80123b3dafc84ba63151a119efefbdc7096b4213278bee4e20034ad011342c55 +size 15984 diff --git a/checkpoint-6516/rng_state_10.pth b/checkpoint-6516/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ba2ca4fb9cd1d265109ace549dff163ebd8fec8 --- /dev/null +++ b/checkpoint-6516/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:71d246867fa214917e8d915a5a512add3fe94a6ab0c5dd18894e7fde17a343a6 +size 15997 diff --git a/checkpoint-6516/rng_state_11.pth b/checkpoint-6516/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..8164e847d5f83567e58f17c6712ab984b8ebf166 --- /dev/null +++ b/checkpoint-6516/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b17d1b74a51d89805580769ed56dccd28905cef39f0174b95690671de73a8eca +size 15997 diff --git a/checkpoint-6516/rng_state_12.pth b/checkpoint-6516/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..19f5a4f26a4912f87db63a3a74a93715b155799e --- /dev/null +++ b/checkpoint-6516/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3301d3a6549bbded0cc4bf08a0e868d228c306e7d0b3570f662d36062df931c +size 15997 diff --git a/checkpoint-6516/rng_state_13.pth b/checkpoint-6516/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..6680f761b352b361b4e9b3c9c86f1ff92526abb5 --- /dev/null +++ b/checkpoint-6516/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:275a9aac323151cb4befb06c365b36f4fd3ef0128d239883e17b8ef9a0ccfe10 +size 15997 diff --git a/checkpoint-6516/rng_state_14.pth b/checkpoint-6516/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6ae7d54353e223fd6c48796bd13ffc812c7b3a --- /dev/null +++ b/checkpoint-6516/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f30d1ca0c7b21fbaff24c34d0dfbfe80f7e72ba1b307b3e81ced25824d58f86 +size 15997 diff --git a/checkpoint-6516/rng_state_15.pth b/checkpoint-6516/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..73f906d5373da89aeda86043d2579cd947020491 --- /dev/null +++ b/checkpoint-6516/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2cb8a2e30fda04f74c99b0d5166d22a1db667a1c07e9f3e5a327dc2abd218f79 +size 15997 diff --git a/checkpoint-6516/rng_state_16.pth b/checkpoint-6516/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b47a19e2cdcd0ec42bb0fa06c03dcd620dd3b9e --- /dev/null +++ b/checkpoint-6516/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1357c999e77072c2b2f5da8d1387892588496aff9659b2a24eb86f00c6104c00 +size 15997 diff --git a/checkpoint-6516/rng_state_17.pth b/checkpoint-6516/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..29c787234ca9610ff6fab8e9db99adb674cd18d5 --- /dev/null +++ b/checkpoint-6516/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b81c09b1dbf0b0739963ad5162b6cd9d1bc90217e79af0ff00a513a62c009723 +size 15997 diff --git a/checkpoint-6516/rng_state_18.pth b/checkpoint-6516/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..899aebeae96a20bed847860fc8d1dd1e81282888 --- /dev/null +++ b/checkpoint-6516/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55efafeaf0f63ab0d194e60fdc96319f2316eb8a0cdd3c29bdbff5a6d3859440 +size 15997 diff --git a/checkpoint-6516/rng_state_19.pth b/checkpoint-6516/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b8a4e17a217ec1c14227c1fbcbd157511762248 --- /dev/null +++ b/checkpoint-6516/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcbb6e8a9819d7f9bf6af217a00ce251cd57c85706e9125822ceb76f4f7ff61b +size 15997 diff --git a/checkpoint-6516/rng_state_2.pth b/checkpoint-6516/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f7063d7e6b72a97d56dea894476b99e9d8077440 --- /dev/null +++ b/checkpoint-6516/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:24bc0b93cdbad6381aa3a04c545d8a551c7fd1517d4a8c0c00df3cf8e9be1925 +size 15984 diff --git a/checkpoint-6516/rng_state_20.pth b/checkpoint-6516/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..f042d6b3dd0beb7f043a18d6c57f66b1b9155cbe --- /dev/null +++ b/checkpoint-6516/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241a5ebc75e3b3ae9c4183078ecab00c2af7c9015f160ba8be19b219181eaf23 +size 15997 diff --git a/checkpoint-6516/rng_state_21.pth b/checkpoint-6516/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..860e155004dfc96d508ba711835d7f00272b428c --- /dev/null +++ b/checkpoint-6516/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98a4e47c965bdda7554f77dd05f706737d1ed1a1e91bf2f134291a004a84a35 +size 15997 diff --git a/checkpoint-6516/rng_state_22.pth b/checkpoint-6516/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..b2a3d6125f5891fc8c24259392aa4b650c687b61 --- /dev/null +++ b/checkpoint-6516/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01770ac6f726d2a5b7b489ccad959862dfb017c03595a077046b42575a67bdaa +size 15997 diff --git a/checkpoint-6516/rng_state_23.pth b/checkpoint-6516/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..f52b3e8840ec0da4a15f56bb9eefb2d8a1fb6c3b --- /dev/null +++ b/checkpoint-6516/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f2c3b1a4c258da6e1c7a0d998e4a3c64eba5839faa231ef469f2145a7c8f505 +size 15997 diff --git a/checkpoint-6516/rng_state_24.pth b/checkpoint-6516/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..2acc4389c09e244606a4a06d180fecaac1f83b97 --- /dev/null +++ b/checkpoint-6516/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7c7341e98dde3db72da00e1fdb386a2fff0d497cb7d422f3b672aefdbaf6e375 +size 15997 diff --git a/checkpoint-6516/rng_state_25.pth b/checkpoint-6516/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..e0c9c81b161c10a901d4944a03cf609dff9e7ca0 --- /dev/null +++ b/checkpoint-6516/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423d54549a66cab4e7776f809c9ec6f519eb151a0373978d5b18beb48b3356d1 +size 15997 diff --git a/checkpoint-6516/rng_state_26.pth b/checkpoint-6516/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0a5dc2a903f27dd5777ce985d15c88266090b44 --- /dev/null +++ b/checkpoint-6516/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2b280aaa0a0faac3ae8a59c47059be35667b00e6a6af07b5ba5676bc7c502d +size 15997 diff --git a/checkpoint-6516/rng_state_27.pth b/checkpoint-6516/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..508f3f6bb7e854d99c666d642ea0a6ae92a233a1 --- /dev/null +++ b/checkpoint-6516/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a01f03b858a3f6354911f1a1034413c632bbaaf0876409f692c5aef959ee1d9 +size 15997 diff --git a/checkpoint-6516/rng_state_28.pth b/checkpoint-6516/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..fab7ed4175c296a3dad513ca558acf4b84db4d49 --- /dev/null +++ b/checkpoint-6516/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8095c068d25ef482f907e18623ce0c94fcd1ecf9d3ab66d2a4c7d491aefdcc07 +size 15997 diff --git a/checkpoint-6516/rng_state_29.pth b/checkpoint-6516/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..9424cd910d17539d795a7c75db9868ecd6179a8a --- /dev/null +++ b/checkpoint-6516/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f4e6e775f0fb26a55e0755092be9f34773a00d6153741d46de02b009e6dd3e5d +size 15997 diff --git a/checkpoint-6516/rng_state_3.pth b/checkpoint-6516/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..25fb3074d3d897b0612dbe773c67625137414801 --- /dev/null +++ b/checkpoint-6516/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e9489b2447a8a8f0cf70d2118051e7ae267adbd8d4f8315ade4997631caee88 +size 15984 diff --git a/checkpoint-6516/rng_state_30.pth b/checkpoint-6516/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..5170a507e19fbffcfb86e446eaeee098dbb252b8 --- /dev/null +++ b/checkpoint-6516/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9586dc2ee15fcd78a14eade9b454d911dd6cf0818fd4ef16b9ac2bf7d8b18aa7 +size 15997 diff --git a/checkpoint-6516/rng_state_31.pth b/checkpoint-6516/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..614bcc57992e21d8e7dce07d426599dd74240327 --- /dev/null +++ b/checkpoint-6516/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71a3b74890496ddfb2705386cb724d108ac0efb867baf6c69e68981e6139b516 +size 15997 diff --git a/checkpoint-6516/rng_state_32.pth b/checkpoint-6516/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5ac2dfcd65d225659067a9c4c512b4746c77384 --- /dev/null +++ b/checkpoint-6516/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa504170f9670818fb97f656c6aec22ea37f4eb4f31e54843b1f84367c0737d +size 15997 diff --git a/checkpoint-6516/rng_state_33.pth b/checkpoint-6516/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b777376097b1b916dd9c54c4766ac225247d309 --- /dev/null +++ b/checkpoint-6516/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9b08a1f84cf0881a38db65e90f2002c80e74a8b5532079ffd840be58bb9363d6 +size 15997 diff --git a/checkpoint-6516/rng_state_34.pth b/checkpoint-6516/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d468410dd20b3de4749a186b3b4ff7dd58fc806 --- /dev/null +++ b/checkpoint-6516/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3575ba340a40d6db40af98a259033ed80a55713e1079680715c2bfa54b059c29 +size 15997 diff --git a/checkpoint-6516/rng_state_35.pth b/checkpoint-6516/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..2413d4ef67160a9cfc788144cd819541673bece0 --- /dev/null +++ b/checkpoint-6516/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:054f9d79b85bb8a41f1cefe8cd637b7b7df7adb498f24c5c08f76eba3c0763c1 +size 15997 diff --git a/checkpoint-6516/rng_state_36.pth b/checkpoint-6516/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1396b98ed9de582fe670b8c8e468a58de47c6e1 --- /dev/null +++ b/checkpoint-6516/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d69371f03cebcd68e776eddf13b8ae7d099b98ce10422e46b09aab01270df2c +size 15997 diff --git a/checkpoint-6516/rng_state_37.pth b/checkpoint-6516/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..6528e7c51a7f25f883d015770572d549250993a7 --- /dev/null +++ b/checkpoint-6516/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75e8e98e23277b73ca71493f868b9f9e1dfd71141c03492dd80ab5bc5576d7d2 +size 15997 diff --git a/checkpoint-6516/rng_state_38.pth b/checkpoint-6516/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c323b93951c6247cf1c7afcb894b9bdc5d49646 --- /dev/null +++ b/checkpoint-6516/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:90e9f5c9131a96e233ae1224fa879baef9b58229575008906d96e39b22f79336 +size 15997 diff --git a/checkpoint-6516/rng_state_39.pth b/checkpoint-6516/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f1ee6eab112991eecf2c3f1579364d3416e670c --- /dev/null +++ b/checkpoint-6516/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88be8398d250019649bfe447b71c78407ae5111c7a965df37c35ed045d624903 +size 15997 diff --git a/checkpoint-6516/rng_state_4.pth b/checkpoint-6516/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e08d7db3774d757f96c8457d6181cdd8f15b9ad0 --- /dev/null +++ b/checkpoint-6516/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:044b028c9726b81dc1b258e0624c431177fef92284c47246ad7062444d6bb939 +size 15984 diff --git a/checkpoint-6516/rng_state_40.pth b/checkpoint-6516/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b3e8dbd661c88b94df5eec627d1a3fd6e76ac1f --- /dev/null +++ b/checkpoint-6516/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b43388d4d75851f9220be3592eae565bf14fb46b5e7d5f3c1a76e53477b0f34 +size 15997 diff --git a/checkpoint-6516/rng_state_41.pth b/checkpoint-6516/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..5cd71ce305f9bc0259fa3f0465acaeaaf96e3d87 --- /dev/null +++ b/checkpoint-6516/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f717ea9d1a9702fd244f02f95787521c1c1258fec9c7a685240430aa92c64bd8 +size 15997 diff --git a/checkpoint-6516/rng_state_42.pth b/checkpoint-6516/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..4151a14ded097c920173405f21d4b8ba08541927 --- /dev/null +++ b/checkpoint-6516/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b4760427e995e8ced6c38090d2048f126ddb50cc15a0fe44f168cb1354440146 +size 15997 diff --git a/checkpoint-6516/rng_state_43.pth b/checkpoint-6516/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ae059fab9ea2dff0da332b43f2da4b6b45445ba --- /dev/null +++ b/checkpoint-6516/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65cf8fee65fea39b2ea454a02958996ecbc3b2ab3bad3a60df579385bcd7ca90 +size 15997 diff --git a/checkpoint-6516/rng_state_44.pth b/checkpoint-6516/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..3451dab3484a1c4cab402764f40ee6eaf9ec20d7 --- /dev/null +++ b/checkpoint-6516/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeaef7fab6171983c9d584ff0085aa40500f690b605605e6385b6e87f3644ad5 +size 15997 diff --git a/checkpoint-6516/rng_state_45.pth b/checkpoint-6516/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ae764ed2ef26056483a5ab810cd7493c6e5abfc --- /dev/null +++ b/checkpoint-6516/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6e3680a2fe49fd39835ff6dce07d04b0c2fcb77ac4781589cec79e650566d9 +size 15997 diff --git a/checkpoint-6516/rng_state_46.pth b/checkpoint-6516/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..51ec0265eeb27db2d9e34d11c587f5442141c36c --- /dev/null +++ b/checkpoint-6516/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6434bce05935dd06f792c22b3a7c8b2c90b27e907d395f3795966d9662f527ed +size 15997 diff --git a/checkpoint-6516/rng_state_47.pth b/checkpoint-6516/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..007728cc566fed0da003cc93da3fcb0cf6794b7c --- /dev/null +++ b/checkpoint-6516/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:26cc88c97dc8c379b7c857d1e810e0435a8543b47f125dc718617cd34ac37948 +size 15997 diff --git a/checkpoint-6516/rng_state_48.pth b/checkpoint-6516/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ebfa66fe781e053d13d69588b897c7a003bc7c4 --- /dev/null +++ b/checkpoint-6516/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80bf370addf67c9b6441b9b2a47cd9aa3e88da1c8afa4ddb596cb469a918c835 +size 15997 diff --git a/checkpoint-6516/rng_state_49.pth b/checkpoint-6516/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..52437bd4b90ce45cbf0b7e72b8e3c0924a403cf8 --- /dev/null +++ b/checkpoint-6516/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86993901308193da83bcbf23ffa83e98cddb1a15e4722f4cb4bc1a4af60de59e +size 15997 diff --git a/checkpoint-6516/rng_state_5.pth b/checkpoint-6516/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8bf482b0d878bc98394452a97f651723945e6101 --- /dev/null +++ b/checkpoint-6516/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba88f23f04834a9509be5479b0c134340fb6417112fa6cd0de4c15221d0f64bd +size 15984 diff --git a/checkpoint-6516/rng_state_50.pth b/checkpoint-6516/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..4aa61d5e8807c70fb68d9d60376abc953517b0d0 --- /dev/null +++ b/checkpoint-6516/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28bc2128fae4d917e11e9fb580ac1ed92650a2e85ca7bf66372a5b56425fcc65 +size 15997 diff --git a/checkpoint-6516/rng_state_51.pth b/checkpoint-6516/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..5caf37d6834d9f5a122a2dab5ceac6cdd58496a5 --- /dev/null +++ b/checkpoint-6516/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:19d546b5fd920878a3eeacf1a896f202732f7e9b8af8dfa6095dda5a71735732 +size 15997 diff --git a/checkpoint-6516/rng_state_52.pth b/checkpoint-6516/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..039ea35cc247198ab4791c79b7be05b6616b387a --- /dev/null +++ b/checkpoint-6516/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fd283ce0759eb855cfc01e0b182f98ff724e59cf31f6d028e2ae1c1c0d8b8fe +size 15997 diff --git a/checkpoint-6516/rng_state_53.pth b/checkpoint-6516/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..a8514ba389bfa6d20a20e6db738181014123f6ca --- /dev/null +++ b/checkpoint-6516/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df81f0dcea06c4b7578794b4b67ff1618a246525cee2fd50c4dc7b12792e0385 +size 15997 diff --git a/checkpoint-6516/rng_state_54.pth b/checkpoint-6516/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..09de39a28426e6e67d6771633c9453a220277143 --- /dev/null +++ b/checkpoint-6516/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e9fb6cd8b6276087f93553a9233a836941b8f99358691ef5e79167c9a35ed79 +size 15997 diff --git a/checkpoint-6516/rng_state_55.pth b/checkpoint-6516/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..d96c18e44ceb71a95223f657455bffd01d97cab3 --- /dev/null +++ b/checkpoint-6516/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc215d21b37557f920522a138b5892a175368dd13ba408b4694001c8b0d85f10 +size 15997 diff --git a/checkpoint-6516/rng_state_56.pth b/checkpoint-6516/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..3184fbccfab4376c5f974568f9e488c6f6284df1 --- /dev/null +++ b/checkpoint-6516/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f7950ecb09e1b8ad2d41db5a9863209268a33704124c60b537e866abde5d9a3f +size 15997 diff --git a/checkpoint-6516/rng_state_57.pth b/checkpoint-6516/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..e579538c689ef70168534e7abc5ee839aa32d592 --- /dev/null +++ b/checkpoint-6516/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c546a2e1aa7b9ee8e02826d94c56194b2bf05b0486b69778beb34152c4c986d1 +size 15997 diff --git a/checkpoint-6516/rng_state_58.pth b/checkpoint-6516/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..1e53df983ab4c6474981bec6ed3d9b707378067e --- /dev/null +++ b/checkpoint-6516/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4eb627ed97cee4398381a5c657ba9fd0ba81d602457fc077ecca1bb6288cad +size 15997 diff --git a/checkpoint-6516/rng_state_59.pth b/checkpoint-6516/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..90c2293917355c06d090a0e687645ae69fcd3fb1 --- /dev/null +++ b/checkpoint-6516/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b0c5f8737d17f48c3235cf901087b0bb33b19db1aab983db42df23d868a34f1 +size 15997 diff --git a/checkpoint-6516/rng_state_6.pth b/checkpoint-6516/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..437df667ba584a59523dfaecd4c3f8125dc2309a --- /dev/null +++ b/checkpoint-6516/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d79c3d2129a75c5e031c768707abfed2fc3873168ef234a64d2e2ac9abc7e5a2 +size 15984 diff --git a/checkpoint-6516/rng_state_60.pth b/checkpoint-6516/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..502fbb7ffcdc9f6c4e28eac5128268f94de0e877 --- /dev/null +++ b/checkpoint-6516/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:29f61d33d0c84c62ea4d3e27a9e74f957ddb1f02b6b13aa9abed069035493fed +size 15997 diff --git a/checkpoint-6516/rng_state_61.pth b/checkpoint-6516/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..201620db1d92ae0aa1796eae6b1ba066cf019215 --- /dev/null +++ b/checkpoint-6516/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96174d1aea9e347672d0017f4ac48551fcf42602eaedc0cbdc55ff3799d56bfe +size 15997 diff --git a/checkpoint-6516/rng_state_62.pth b/checkpoint-6516/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..9aa1aa0f53317635fd354145df51d07fd1222e19 --- /dev/null +++ b/checkpoint-6516/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db38109d57570d43e62cfe5204da625ca8ae06ffd3b15759729101ff7852ce09 +size 15997 diff --git a/checkpoint-6516/rng_state_63.pth b/checkpoint-6516/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..a27d7a5aa6a8f955603736e371df2172cc553178 --- /dev/null +++ b/checkpoint-6516/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1c8dbf6b3d7e94cb68c1ec4918088f404d800e40a340bcd6f6f9e75c32befa +size 15997 diff --git a/checkpoint-6516/rng_state_7.pth b/checkpoint-6516/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c5df8d00073ef7d54fac7a6d8241d90b7f3567a --- /dev/null +++ b/checkpoint-6516/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d596378917102ad94bb321b1edf0b2a1786a677274c2dc08a107d1934cbaca01 +size 15984 diff --git a/checkpoint-6516/rng_state_8.pth b/checkpoint-6516/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..51b26acf6781036d88d21288e78c11d7f17f6e76 --- /dev/null +++ b/checkpoint-6516/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1a8bd0c92f3b9584b9501ac2a7ca6f1c90e0524854cde0442fb4fdb1d6a155ee +size 15984 diff --git a/checkpoint-6516/rng_state_9.pth b/checkpoint-6516/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..96c081d5dca88ab2e2d533915849b1e2bb35fea3 --- /dev/null +++ b/checkpoint-6516/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b5c61ff769f7b371211184834f5e5997d95d7ea0809dbee9d383dac4d520dae +size 15984 diff --git a/checkpoint-6516/scheduler.pt b/checkpoint-6516/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5038ef7c1196adb0602985a51f2e37ae29993e1 --- /dev/null +++ b/checkpoint-6516/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3449c6a777c6c86bbaa253985dd9c004d1fc776c12fcc37dc8abc21e08099fc6 +size 1064 diff --git a/checkpoint-6516/special_tokens_map.json b/checkpoint-6516/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-6516/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-6516/tokenizer.json b/checkpoint-6516/tokenizer.json new 
file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-6516/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-6516/tokenizer_config.json b/checkpoint-6516/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-6516/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": 
"<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": 
"<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "[IMG]", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-6516/trainer_state.json b/checkpoint-6516/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..46daf88fdbb828e5c6f455d3c3e26e66b850ccd9 --- /dev/null +++ b/checkpoint-6516/trainer_state.json @@ -0,0 +1,45646 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + 
"epoch": 2.0, + "eval_steps": 500, + "global_step": 6516, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 0.003990178023327195, + 
"grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 1.2865383625030518, + 
"learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 3.885480572597138e-06, + 
"loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + }, + { + "epoch": 
0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 0.4801484942436218, 
+ "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 7.975460122699386e-06, + 
"loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 92 + }, + { + "epoch": 
0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + "grad_norm": 
0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 0.5069209933280945, + 
"learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 1.3394683026584867e-05, + 
"loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + "step": 145 + }, + { + 
"epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 0.04880294659300184, + 
"grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 1.3880311250686646, + 
"learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.426526169638969e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6516/training_args.bin b/checkpoint-6516/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db7ad91da5423a229826113feb3e9db3ef40c31 --- /dev/null +++ b/checkpoint-6516/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682b697e933b6e2693e5f9af9a0654effab1ca392c8500bf8af0eb089116a263 +size 7288 diff --git a/checkpoint-6516/zero_to_fp32.py b/checkpoint-6516/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-6516/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-9774/config.json b/checkpoint-9774/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a29af639fbf705188c21aae22660a85fee1ca26e --- /dev/null +++ b/checkpoint-9774/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + 
"rope_theta": 500000.0, + "tie_word_embeddings": true, + "tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": false, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/checkpoint-9774/generation_config.json b/checkpoint-9774/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/checkpoint-9774/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-9774/latest b/checkpoint-9774/latest new file mode 100644 index 0000000000000000000000000000000000000000..bf4d4ecf3009a325bf98529770192bef8afa5e8d --- /dev/null +++ b/checkpoint-9774/latest @@ -0,0 +1 @@ +global_step9774 \ No newline at end of file diff --git a/checkpoint-9774/model-00001-of-00003.safetensors b/checkpoint-9774/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b4cb56e710be9c121064bf1e774b3a4d682dbb0 --- /dev/null +++ b/checkpoint-9774/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2947008d660009ed7bd5d80013c8d60b193d282ccb922dde2aeaa63ceb971f7 +size 4955415870 diff --git a/checkpoint-9774/model-00002-of-00003.safetensors b/checkpoint-9774/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/checkpoint-9774/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d +size 
4971563008 diff --git a/checkpoint-9774/model-00003-of-00003.safetensors b/checkpoint-9774/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ea627b5699a5a97bb40ad96ba802e9211d372b4 --- /dev/null +++ b/checkpoint-9774/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aacfe0dd3d99d823b297bf2227f921e19cd05b5ed04cea6f53bcb7ef24a1326 +size 4180840856 diff --git a/checkpoint-9774/model.safetensors.index.json b/checkpoint-9774/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/checkpoint-9774/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": 
"model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/checkpoint-9774/rng_state_0.pth b/checkpoint-9774/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab891beaa0c27c4a907b47ca8a8b66f46797f610 --- /dev/null +++ b/checkpoint-9774/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c0b4830d35636816ca18ae61db865464be30e91cd4d1a0b83d566e8be4a3f8 +size 15984 diff --git a/checkpoint-9774/rng_state_1.pth b/checkpoint-9774/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b33fdb920bf915954e122147312d68d8471ef09 --- /dev/null +++ b/checkpoint-9774/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7daaf1f5c21f4123337618256a55a0c3f232969f40224b8673b8f616df01c188 +size 15984 diff --git a/checkpoint-9774/rng_state_10.pth b/checkpoint-9774/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..df899f8b5e300fc69972bca94602909a2aa5facd --- /dev/null +++ b/checkpoint-9774/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:91263c152fc53d8cfa40a25089255dfc0fcd23d290e9d299b9c53967574c1049 +size 15997 diff --git a/checkpoint-9774/rng_state_11.pth b/checkpoint-9774/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..e58b92422a805f1a6f4385e3fd91b06a03e1095b --- /dev/null +++ b/checkpoint-9774/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f063e046c2f72adcc44729d67d649d6f700081a707a82e8a775631f0ef50c6f +size 15997 diff --git a/checkpoint-9774/rng_state_12.pth b/checkpoint-9774/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..2281618e7d66483e12dbb060405485e00606c33d --- /dev/null +++ b/checkpoint-9774/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41646ee20b32c9a3b4e796984145d1de6508f2c382df8376c2f84fd3455526b7 +size 15997 diff --git a/checkpoint-9774/rng_state_13.pth b/checkpoint-9774/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..71b06651dda829b39b3aad79c6225ccdb98da130 --- /dev/null +++ b/checkpoint-9774/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d863f6cc88d51f725a57e1bf6718bb85f42498ecd5fdec86dd54bc01656d050 +size 15997 diff --git a/checkpoint-9774/rng_state_14.pth b/checkpoint-9774/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef29336ddb8696c41315f3c8b7ca592d8126d9e2 --- /dev/null +++ b/checkpoint-9774/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f8eb87601e77e108a284c839ee8f3b0519cfae59c9fb29c689f0435fc0e49a0 +size 15997 diff --git a/checkpoint-9774/rng_state_15.pth b/checkpoint-9774/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..16e58d238f780152a4144f100f0d958af4df8550 --- /dev/null +++ b/checkpoint-9774/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:32c380dc9b4325dfa9880af0ac4f11f8f6edf6ce4dc014669bd8d1e78fcec22b +size 15997 diff --git a/checkpoint-9774/rng_state_16.pth b/checkpoint-9774/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..a537fe40eedd79f08584c37ee532c42502838781 --- /dev/null +++ b/checkpoint-9774/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc1089a7c430f724417c94f2dd3ff3bdf9a0fa12530691478bcd23fa235df6da +size 15997 diff --git a/checkpoint-9774/rng_state_17.pth b/checkpoint-9774/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..69d7c03acb43540e8937f9a3a40ec066b6232a66 --- /dev/null +++ b/checkpoint-9774/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:564291a0bc1956f25c18f7094c7874d98c3ea59fd18c497b886a7615663cb0ee +size 15997 diff --git a/checkpoint-9774/rng_state_18.pth b/checkpoint-9774/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e0d1642b096556dd6b5ea09a6d95cbfca17d4b3 --- /dev/null +++ b/checkpoint-9774/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39fb2082c52e48468fd8ec8dc0c082378b6adf5adc4eb891edb54f504a846ab6 +size 15997 diff --git a/checkpoint-9774/rng_state_19.pth b/checkpoint-9774/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..f557f45d7fa0179e0d65d93bfe74c3a5393221be --- /dev/null +++ b/checkpoint-9774/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecef7aa57d102b3fe945f2f86ec705f4474d2bb6c383709d5e39f77cf8de798f +size 15997 diff --git a/checkpoint-9774/rng_state_2.pth b/checkpoint-9774/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e5e0f94993a47a6bf40f6eafd5084e9edec19e4 --- /dev/null +++ b/checkpoint-9774/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c9328c2673eac09f3ab538803b80bb82cb785ea4f8a93b3fa98b9e5111827e8f +size 15984 diff --git a/checkpoint-9774/rng_state_20.pth b/checkpoint-9774/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..48e59a2d051143979c4b6b86c50a22dcdf0710cc --- /dev/null +++ b/checkpoint-9774/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3dfb5318a106651aba8532418c7e2c089a5cc948c713139f4fb5098ae0fcf92 +size 15997 diff --git a/checkpoint-9774/rng_state_21.pth b/checkpoint-9774/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..055e6eb8743720f9e2c83fafdfb9ee81d0c2cee1 --- /dev/null +++ b/checkpoint-9774/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0509a226ddf746ba5275f0ef66ee8a86c6291f12e3182d4bb07266f4a0f364cd +size 15997 diff --git a/checkpoint-9774/rng_state_22.pth b/checkpoint-9774/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d0ab6d09f548b793e098ce27d819c419d5cb89b --- /dev/null +++ b/checkpoint-9774/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341a588649289476363bc79716ca7863b8540634ec79f695a2d8bb56ab980f02 +size 15997 diff --git a/checkpoint-9774/rng_state_23.pth b/checkpoint-9774/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2a33439bd60e1c7e9efd6f123545de6785e6609 --- /dev/null +++ b/checkpoint-9774/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc6c17f4676aebdb473ea5d6466cc3d514fbeec678a140e758f5d2fe07e30bfd +size 15997 diff --git a/checkpoint-9774/rng_state_24.pth b/checkpoint-9774/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec5a3d6a8190b2b4e0011f3050a0be3ad4a75e2e --- /dev/null +++ b/checkpoint-9774/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:32baa11538afd4afe3a66ad09ef56fb637d56a953c6202503be37a0e0f4ccf22 +size 15997 diff --git a/checkpoint-9774/rng_state_25.pth b/checkpoint-9774/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..467fa9e6410cb32c852b335941474b51be8dfb44 --- /dev/null +++ b/checkpoint-9774/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa5aa780d94a77833e1881056a694a6a15d9bfde3d1113874f6813649e721fb7 +size 15997 diff --git a/checkpoint-9774/rng_state_26.pth b/checkpoint-9774/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..613a506760e27154076297e62810ab4c53b20e56 --- /dev/null +++ b/checkpoint-9774/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e79720d2a12a19cede417d674f93ea004d375cc49c984971c4fd20fc519a24c1 +size 15997 diff --git a/checkpoint-9774/rng_state_27.pth b/checkpoint-9774/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..064f6facf5ef98d2193427da6814bf93a192a97e --- /dev/null +++ b/checkpoint-9774/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09f009b0c23c663a78f046dc723ce34420032117ad5f0e1565fbca0e53a50f3 +size 15997 diff --git a/checkpoint-9774/rng_state_28.pth b/checkpoint-9774/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..832b37c3a46658b8921fa29335d460b4d107b1cb --- /dev/null +++ b/checkpoint-9774/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9235330f766c6592da010f93f95c634f827da1d250386ba18956ba3c51adaa84 +size 15997 diff --git a/checkpoint-9774/rng_state_29.pth b/checkpoint-9774/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..ffac24511bb0d2a568846628d11d3db71cc2d184 --- /dev/null +++ b/checkpoint-9774/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9c3867030d515d1a990003e9b0f2ac9f9fe0ad0e7644c3b61dc8dbd9312a5403 +size 15997 diff --git a/checkpoint-9774/rng_state_3.pth b/checkpoint-9774/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..420d572d01b68575c670e5b869d2f264427959d8 --- /dev/null +++ b/checkpoint-9774/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b2c2a66da15e5ca5b43e392d514db1b361863837f2778e081e289001850f6c6 +size 15984 diff --git a/checkpoint-9774/rng_state_30.pth b/checkpoint-9774/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..10765f3c70a2b2ab46d580ff0e3be38fadec20bc --- /dev/null +++ b/checkpoint-9774/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b3be37522d9544ed559a6a246801f136130de1345d15bf54ad7318529378af3 +size 15997 diff --git a/checkpoint-9774/rng_state_31.pth b/checkpoint-9774/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e23df3d6b4dba419a76557beff1ef65d7406c62 --- /dev/null +++ b/checkpoint-9774/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2deddaec10746db810ec79d9b455d24c303f9a782976b958326b1778062a47d9 +size 15997 diff --git a/checkpoint-9774/rng_state_32.pth b/checkpoint-9774/rng_state_32.pth new file mode 100644 index 0000000000000000000000000000000000000000..86b25c01557278476d89321274eda7e111441921 --- /dev/null +++ b/checkpoint-9774/rng_state_32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d743a21d2a18596581accac8be7ff2c05b2c0c1eab36ad5a967a9b181fa0721 +size 15997 diff --git a/checkpoint-9774/rng_state_33.pth b/checkpoint-9774/rng_state_33.pth new file mode 100644 index 0000000000000000000000000000000000000000..336973700550a7aaa0cc03a15bac4ac38f10bf61 --- /dev/null +++ b/checkpoint-9774/rng_state_33.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8030ba88467e8c063df31c2002fa5f7c84add43eb269cfcb04702a08dd6a5572 +size 15997 diff --git a/checkpoint-9774/rng_state_34.pth b/checkpoint-9774/rng_state_34.pth new file mode 100644 index 0000000000000000000000000000000000000000..b539f8b754bcb872cb9d2de6cc531020c69dc1c8 --- /dev/null +++ b/checkpoint-9774/rng_state_34.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe1320f06281b54d538802a78505a0e17f449a7bbc7e60d90da0d743132f3aaf +size 15997 diff --git a/checkpoint-9774/rng_state_35.pth b/checkpoint-9774/rng_state_35.pth new file mode 100644 index 0000000000000000000000000000000000000000..6afd72f3e4430a0f157c5d22ec281b644ddf5058 --- /dev/null +++ b/checkpoint-9774/rng_state_35.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfba3ca89d793e9d8256de23d500205cb927d32da87ca8e5b5347eda455221b4 +size 15997 diff --git a/checkpoint-9774/rng_state_36.pth b/checkpoint-9774/rng_state_36.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea964f790756cbea9bff60c90084139f49c3d95a --- /dev/null +++ b/checkpoint-9774/rng_state_36.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b76ab5dbede4bc9b1b738ca963a23c08b037ddafd98565309677817d97afdb53 +size 15997 diff --git a/checkpoint-9774/rng_state_37.pth b/checkpoint-9774/rng_state_37.pth new file mode 100644 index 0000000000000000000000000000000000000000..d58807d8bbc06ee9561c83e85a59643a1dd7b7c8 --- /dev/null +++ b/checkpoint-9774/rng_state_37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:665f264098d2c13d0f7ef6eec5b6a14c30248b929d00c2db7a9707ff4c7cfaad +size 15997 diff --git a/checkpoint-9774/rng_state_38.pth b/checkpoint-9774/rng_state_38.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b77b6499c881d67338b2817316500fddf6a2887 --- /dev/null +++ b/checkpoint-9774/rng_state_38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d18b1de3360754280bb9f5949784a9aa8c632ac47ae09bb58d0e2b6c7e40d3e5 +size 15997 diff --git a/checkpoint-9774/rng_state_39.pth b/checkpoint-9774/rng_state_39.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3c10398063ea578fbc45782273ad42718e76128 --- /dev/null +++ b/checkpoint-9774/rng_state_39.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682fa3392dd8fe70e4f50ea92b1c3039d8e76390baa3e9c4395d0b9a63b5a6b4 +size 15997 diff --git a/checkpoint-9774/rng_state_4.pth b/checkpoint-9774/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..62b3d7a998f913ce011adee55655957e64d74f29 --- /dev/null +++ b/checkpoint-9774/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:421a28ec74a1caf1435fa96228245a001c2f16032122a30e2eea03dc15097025 +size 15984 diff --git a/checkpoint-9774/rng_state_40.pth b/checkpoint-9774/rng_state_40.pth new file mode 100644 index 0000000000000000000000000000000000000000..4250b7d18c0ca49b7201cf6dc55c15b7e46114c5 --- /dev/null +++ b/checkpoint-9774/rng_state_40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad293081b51b60d239b6b90c09ed1582d68370a99a53bce90188ab7fbad893e2 +size 15997 diff --git a/checkpoint-9774/rng_state_41.pth b/checkpoint-9774/rng_state_41.pth new file mode 100644 index 0000000000000000000000000000000000000000..2cdbfa42c4023e0af52a0bbcff7d93cb82f6b39a --- /dev/null +++ b/checkpoint-9774/rng_state_41.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88aa4ae37b7fd9527922b19e5cfad59f3d07737f6b65005c3c112cd12c93bc62 +size 15997 diff --git a/checkpoint-9774/rng_state_42.pth b/checkpoint-9774/rng_state_42.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2a550f0d40e1e61c647b8b57a59a896981f9ef2 --- /dev/null +++ b/checkpoint-9774/rng_state_42.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0bf1137af9c2b9dc5218d81f8639229f185b076b9be53a277b736e6aa5813055 +size 15997 diff --git a/checkpoint-9774/rng_state_43.pth b/checkpoint-9774/rng_state_43.pth new file mode 100644 index 0000000000000000000000000000000000000000..f32309290f215f5c0b4485507874c25c58c60cf5 --- /dev/null +++ b/checkpoint-9774/rng_state_43.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a16389457064df88118f775ec3ebe3448b5915bb4adfbe4bc5cdc6d9dbdad761 +size 15997 diff --git a/checkpoint-9774/rng_state_44.pth b/checkpoint-9774/rng_state_44.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fc20a58cdddb4982b1ea50d3bdc601e83ab8711 --- /dev/null +++ b/checkpoint-9774/rng_state_44.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9ddad4a393bfeb66095ca9a9b00a280b4723bc66d9d32f7d6adc327bc6926f6 +size 15997 diff --git a/checkpoint-9774/rng_state_45.pth b/checkpoint-9774/rng_state_45.pth new file mode 100644 index 0000000000000000000000000000000000000000..53684c948d440694f2e6c7ed83fb9de2a72330aa --- /dev/null +++ b/checkpoint-9774/rng_state_45.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44e602189b37438c3e4355777de9cf2ba09b38c0112c8b0772592d566cf6a46e +size 15997 diff --git a/checkpoint-9774/rng_state_46.pth b/checkpoint-9774/rng_state_46.pth new file mode 100644 index 0000000000000000000000000000000000000000..da860b7626601f965020cf7b221846a51ebaed89 --- /dev/null +++ b/checkpoint-9774/rng_state_46.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0907fe0e70a1eec8d1d847f1b98ea23f82e73a75c768edf80872e3d6f9f84a72 +size 15997 diff --git a/checkpoint-9774/rng_state_47.pth b/checkpoint-9774/rng_state_47.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3ed0139a4f0be907a923459f480bdd7ab8f5479 --- /dev/null +++ b/checkpoint-9774/rng_state_47.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:97d86a2f7ce876b322468b9e8327a9acc3decb67809b5282c0f43f4c686891a6 +size 15997 diff --git a/checkpoint-9774/rng_state_48.pth b/checkpoint-9774/rng_state_48.pth new file mode 100644 index 0000000000000000000000000000000000000000..502196f3d4fe0bde53531ebe6579cd76d9e5534f --- /dev/null +++ b/checkpoint-9774/rng_state_48.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:651b52863254ed6f015a736172158bb42904c41ca71abb77b46e6ea76772f277 +size 15997 diff --git a/checkpoint-9774/rng_state_49.pth b/checkpoint-9774/rng_state_49.pth new file mode 100644 index 0000000000000000000000000000000000000000..8bccfdbe3ecc2255295dbde87562d9b66723b262 --- /dev/null +++ b/checkpoint-9774/rng_state_49.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b2cde4c54bb751c5324cc83912e06f8eb6bbe1842877522b08392721f37caf5 +size 15997 diff --git a/checkpoint-9774/rng_state_5.pth b/checkpoint-9774/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b00ec92f750830c67b0aa343bca30afa821f8c5d --- /dev/null +++ b/checkpoint-9774/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d4e77a78573b0f98f708f07c6c45a8ad97bfb86429872105a7669fec5bd7082 +size 15984 diff --git a/checkpoint-9774/rng_state_50.pth b/checkpoint-9774/rng_state_50.pth new file mode 100644 index 0000000000000000000000000000000000000000..02c2fbd8ad9562f24d8a23bf6bbb4562bedaeec2 --- /dev/null +++ b/checkpoint-9774/rng_state_50.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cf364f230165db817080b3da1b30a6fb1fc9d82e30c751c3f549380461dcd29 +size 15997 diff --git a/checkpoint-9774/rng_state_51.pth b/checkpoint-9774/rng_state_51.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4fe72a8756e4790f25791ed4b6984bb361ee9a9 --- /dev/null +++ b/checkpoint-9774/rng_state_51.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d0a5aec6e3a683f628f126486cefc898cdb6807a1d5b28beeb5ccbe5cc1b283c +size 15997 diff --git a/checkpoint-9774/rng_state_52.pth b/checkpoint-9774/rng_state_52.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed1cbcf37fd0dc536c86365803d76079da5d2b8d --- /dev/null +++ b/checkpoint-9774/rng_state_52.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924589477321ffd8980e329d157721e3edbad727e648de31cfc2752e87681cfe +size 15997 diff --git a/checkpoint-9774/rng_state_53.pth b/checkpoint-9774/rng_state_53.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd2389a907a9944403b97c180dc67878243cf564 --- /dev/null +++ b/checkpoint-9774/rng_state_53.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f67ded476c00765d7d1b4506ab840fdce852bb87c3bc61718d524078959d16da +size 15997 diff --git a/checkpoint-9774/rng_state_54.pth b/checkpoint-9774/rng_state_54.pth new file mode 100644 index 0000000000000000000000000000000000000000..ecf62bee8cfc86d7767e3ec010647fb34797f385 --- /dev/null +++ b/checkpoint-9774/rng_state_54.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:179d2ea01909b82f7241157329387b00c342068d6f17c6e30525c7b5c661482a +size 15997 diff --git a/checkpoint-9774/rng_state_55.pth b/checkpoint-9774/rng_state_55.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb7e0d5eaf345a6e72ff9b05c1be7d3595b76a04 --- /dev/null +++ b/checkpoint-9774/rng_state_55.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72f34dca430f78f78e81349d77ff6bc91e10dd808b6d4cfb2b89f2c7995bb51f +size 15997 diff --git a/checkpoint-9774/rng_state_56.pth b/checkpoint-9774/rng_state_56.pth new file mode 100644 index 0000000000000000000000000000000000000000..707b562a0dd4a976d79cc7ddfb2b58912acb33ed --- /dev/null +++ b/checkpoint-9774/rng_state_56.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fe9d6b31b7623c481a85e5e4b3d89120ed2cbf3f29dcec7c77c72d78b1b7d476 +size 15997 diff --git a/checkpoint-9774/rng_state_57.pth b/checkpoint-9774/rng_state_57.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a1aaf8bb53869c36cebe39f7136d868e3726b2e --- /dev/null +++ b/checkpoint-9774/rng_state_57.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33a1dc3ae46d341e11d753e8441858a451c5d8e9555f0132baeedc0049af5e2c +size 15997 diff --git a/checkpoint-9774/rng_state_58.pth b/checkpoint-9774/rng_state_58.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6bdb9b768d5997f218406e7f0e68fcc6a73f5f7 --- /dev/null +++ b/checkpoint-9774/rng_state_58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51f7e5884f8fbf36ce2814a2e572bb3ef250c6e3670b37ff13c3fb93bb9eca04 +size 15997 diff --git a/checkpoint-9774/rng_state_59.pth b/checkpoint-9774/rng_state_59.pth new file mode 100644 index 0000000000000000000000000000000000000000..e46d7d5a1e5f5063f19b06f4260cd943508c760a --- /dev/null +++ b/checkpoint-9774/rng_state_59.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05230d2b582db11b518f7b7912358fe3083bcf43ce51e91f816371c60ebd24fb +size 15997 diff --git a/checkpoint-9774/rng_state_6.pth b/checkpoint-9774/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7395e714d017324d48a8dd2a42e55759f9558e2 --- /dev/null +++ b/checkpoint-9774/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19475a568160e99e09570e9f9dcbfa39165a671a1f9b4cb9fa79c9c7f1719482 +size 15984 diff --git a/checkpoint-9774/rng_state_60.pth b/checkpoint-9774/rng_state_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..3bcd856ad5e164a817dcd976fde0478a735cb257 --- /dev/null +++ b/checkpoint-9774/rng_state_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6dd1ecdacf6f56fb6135bf9cdaeca2e9208a52f205fcb89dc2f4be52fb7e04ce +size 15997 diff --git a/checkpoint-9774/rng_state_61.pth b/checkpoint-9774/rng_state_61.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d110fee471a68250157e968cf5b285131973a0d --- /dev/null +++ b/checkpoint-9774/rng_state_61.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02ea499cb96ff1945fe655c34e6beb116486494d26045393205ceac89368f850 +size 15997 diff --git a/checkpoint-9774/rng_state_62.pth b/checkpoint-9774/rng_state_62.pth new file mode 100644 index 0000000000000000000000000000000000000000..eaa6216083437a08e67dc95e50ca1ec19e20f133 --- /dev/null +++ b/checkpoint-9774/rng_state_62.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2d82d38c328b732a9fa0e2dc28bf3ecdbeab8857a7f2f1face93b4a0da12e1d +size 15997 diff --git a/checkpoint-9774/rng_state_63.pth b/checkpoint-9774/rng_state_63.pth new file mode 100644 index 0000000000000000000000000000000000000000..8942ffc120454a4ef26417eed9af564a8174ed37 --- /dev/null +++ b/checkpoint-9774/rng_state_63.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90028709f97c4b7b55f7abd6ae6d36b2b9a0b5206bf62b6ad1a2db3873393e58 +size 15997 diff --git a/checkpoint-9774/rng_state_7.pth b/checkpoint-9774/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..a40187d764b84bb6c2e3dc70403bd38bcc3d9686 --- /dev/null +++ b/checkpoint-9774/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b73a3a674bf690e274d594689242d6c92d6368910e0e13a317c0d472e965e3ab +size 15984 diff --git a/checkpoint-9774/rng_state_8.pth b/checkpoint-9774/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6c8284636a97035e3672a57287427ed20ab862b --- /dev/null +++ b/checkpoint-9774/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b21fd1301dca3cac21b549b90f6cc87875d885de7520e5d2ff6b030746235739 +size 15984 diff --git a/checkpoint-9774/rng_state_9.pth b/checkpoint-9774/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d1ac1f28582177f6ea7e41f9d79e76497462a17 --- /dev/null +++ b/checkpoint-9774/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:506a2c0f8ab7f04ccf72e77e37506263639f3893ea96ea10609896788b1be585 +size 15984 diff --git a/checkpoint-9774/scheduler.pt b/checkpoint-9774/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5034bc93b7e5d1921d2e4a19303f1b915ef1c0c9 --- /dev/null +++ b/checkpoint-9774/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:833b3d8a8fd959370535e53e6ce6b2bb9dabed394d484b128b3d53ed25c2ee3c +size 1064 diff --git a/checkpoint-9774/special_tokens_map.json b/checkpoint-9774/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/checkpoint-9774/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-9774/tokenizer.json b/checkpoint-9774/tokenizer.json new 
file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/checkpoint-9774/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/checkpoint-9774/tokenizer_config.json b/checkpoint-9774/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/checkpoint-9774/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": 
"<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": 
"<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "[IMG]", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-9774/trainer_state.json b/checkpoint-9774/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2c530aa0fca2717d38455bf1423ad377efe94247 --- /dev/null +++ b/checkpoint-9774/trainer_state.json @@ -0,0 +1,68452 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + 
"epoch": 3.0, + "eval_steps": 500, + "global_step": 9774, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 0.003990178023327195, + 
"grad_norm": 1.4394340515136719, + "learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 1.2865383625030518, + 
"learning_rate": 2.5562372188139062e-06, + "loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 3.885480572597138e-06, + 
"loss": 6.0661, + "step": 39 + }, + { + "epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + }, + { + "epoch": 
0.016267648864333947, + "grad_norm": 0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 0.4801484942436218, 
+ "learning_rate": 6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 7.975460122699386e-06, + 
"loss": 5.7131, + "step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 92 + }, + { + "epoch": 
0.0285451197053407, + "grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + "grad_norm": 
0.4341397285461426, + "learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 0.5069209933280945, + 
"learning_rate": 1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 1.3394683026584867e-05, + 
"loss": 5.2518, + "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + "step": 145 + }, + { + 
"epoch": 0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 0.04880294659300184, + 
"grad_norm": 1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 1.3880311250686646, + 
"learning_rate": 1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 
1.8813905930470348e-05, + "loss": 4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + 
"step": 198 + }, + { + "epoch": 0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 
0.06507059545733579, + "grad_norm": 2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 
4.704878330230713, + "learning_rate": 2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 
2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + }, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 
+ }, + { + "epoch": 0.07734806629834254, + "grad_norm": 2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 
3.364442825317383, + "learning_rate": 2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 
2.832310838445808e-05, + "loss": 4.0915, + "step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 
291 + }, + { + "epoch": 0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + 
"grad_norm": 4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.6397766985510604e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9774/training_args.bin b/checkpoint-9774/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9db7ad91da5423a229826113feb3e9db3ef40c31 --- /dev/null +++ 
b/checkpoint-9774/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682b697e933b6e2693e5f9af9a0654effab1ca392c8500bf8af0eb089116a263 +size 7288 diff --git a/checkpoint-9774/zero_to_fp32.py b/checkpoint-9774/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-9774/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..615d502b60aa53b2b6b2a193a08b31631f96438b --- /dev/null +++ b/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "freeze_mm_mlp_adapter": false, + "gen_hidden_size": 1792, + "gen_pooling": "early_pool2d_4", + "gen_vision_tower": "eva-clip-E-14-plus", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "image_aspect_ratio": "square", + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "model_type": "llava_llama", + "n_query": 64, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128256, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + 
"tokenizer_model_max_length": 256, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "tune_mm_mlp_adapter": false, + "use_cache": true, + "use_mm_proj": true, + "vision_tower_pretrained": null, + "vocab_size": 128260 +} diff --git a/gen_projector.bin b/gen_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..6c023e365eaed20eac9a74637d59fa6980994422 --- /dev/null +++ b/gen_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7bf359cbb7138122aa5806ccbe833e84aae0afb5819800ec61eff6335937cb +size 888 diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..05892c70fa899883072c585fa444b4aa7175d6bc --- /dev/null +++ b/generation_config.json @@ -0,0 +1,13 @@ +{ + "attn_implementation": "flash_attention_2", + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/mm_projector.bin b/mm_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..ce02f584f4367e779d376aa737c2037d01c298c5 --- /dev/null +++ b/mm_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50851b89e359427a9bd89f3bdbcf5350f84c3fd12251d529dfc7397cdc443fcd +size 13117496 diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..40e286fc97157dac65404b04dbb26baec8c2e3c2 --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9449c8f5827ef7e6fe120565d7012681fa3d7ed957714ca215f7fddbb30ee9 +size 4955415870 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..f1c6f3bf70f8abb1e7ffb233219debc10bc20bfc --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b088e0e2c4fb5916f448522fa5aef361db713e2c2c0ceac534662c8d52e330d +size 4971563008 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc65a5dfde6d99312e188a257227571ac71c350f --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30b6ea9ccdee0d6276c4086d831ffff162f36df75bd46ab1fe988a95097285b8 +size 4180840856 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c911c94f46f802ae304903dd7796da96c28604 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,2358 @@ +{ + "metadata": { + "total_size": 14107506086 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.caption_projection.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn1.to_v.weight": 
"model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.1.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.10.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.10.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.11.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.11.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.12.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.12.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.attn2.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.13.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.13.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.14.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.14.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.15.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.15.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn1.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.16.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.16.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.17.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.17.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1.linear.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.18.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.18.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.ffn_norm2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.19.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.19.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.2.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.attn2.to_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.20.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.20.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.21.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.21.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.22.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.22.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.23.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.23.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.24.norm1_context.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.24.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.linear.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.25.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.25.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.ffn_norm1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.26.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.26.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.27.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.27.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.3.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.3.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.4.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.4.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn1.to_v.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.5.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.5.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.6.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.6.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm1.norm.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.7.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.7.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.gate": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.8.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.8.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn1.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.norm_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_k.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_q.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.attn2.to_v.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.feed_forward.linear_3.weight": "model-00001-of-00003.safetensors", + 
"model.dit.model.layers.9.ffn_norm1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.ffn_norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.gate": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.bias": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.linear.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1.norm.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm1_context.weight": "model-00001-of-00003.safetensors", + "model.dit.model.layers.9.norm2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.norm_out.linear_2.weight": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.bias": "model-00001-of-00003.safetensors", + "model.dit.model.patch_embedder.proj.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.0.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.caption_embedder.1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias": "model-00001-of-00003.safetensors", + "model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight": "model-00001-of-00003.safetensors", + 
"model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.1.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.10.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.11.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.12.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.12.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.13.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.proj.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.14.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.mlp.fc2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.15.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.15.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.16.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.17.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm1.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.18.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.18.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.19.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.attn.qkv.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.2.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm1.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.20.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.20.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.21.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.22.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.23.norm2.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.23.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.24.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.25.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.26.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.27.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.28.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.29.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.3.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.3.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.30.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.31.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.32.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.bias": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.33.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.34.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.bias": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.35.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.35.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm1.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.36.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.37.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.attn.qkv.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.38.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.39.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.4.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.40.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.41.norm2.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.42.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.43.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.44.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.5.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.6.norm2.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias": "model-00003-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight": "model-00003-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.mlp.fc2.weight": 
"model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.7.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.proj.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.8.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.proj.weight": "model-00002-of-00003.safetensors", + 
"model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.attn.qkv.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.mlp.fc2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm1.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.bias": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.blocks.9.norm2.weight": "model-00002-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.cls_token": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.bias": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.patch_embed.proj.weight": "model-00001-of-00003.safetensors", + "model.gen_vision_tower.vision_tower.model.pos_embed": "model-00001-of-00003.safetensors", + "model.latent_queries": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.2.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.bias": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv1.weight": 
"model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.0.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.1.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.2.upsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.conv2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.decoder.up_blocks.3.resnets.2.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.decoder.up_blocks.3.resnets.2.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_in.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_norm_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.conv_out.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.0.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.0.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.bias": 
"model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.1.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.downsamplers.0.conv.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.conv_shortcut.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.down_blocks.2.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.2.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.down_blocks.3.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.attentions.0.group_norm.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.group_norm.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_k.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_out.0.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_q.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.attentions.0.to_v.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.0.norm2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.conv2.bias": "model-00001-of-00003.safetensors", + 
"model.vae.encoder.mid_block.resnets.1.conv2.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm1.weight": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.bias": "model-00001-of-00003.safetensors", + "model.vae.encoder.mid_block.resnets.1.norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": 
"model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", 
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00003.safetensors", + "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00003.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad43db72a0e94321a5a9455dce616c68d1f9673 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,46 @@ +{ + "additional_special_tokens": [ + { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff 
--git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..444d43e1c25d11b63381073024becd006c83d4f6 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fbef9068a1d82c7fafc3fdfd7c717524c8bfbcaea19c14ce4f8a4e616deb57 +size 17210651 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a54102d00c210427fe2da524cea00c5ace13686 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2102 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": 
"<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": 
"<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": 
"<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "[IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, 
+ "single_word": false, + "special": true + }, + "128258": { + "content": "[/IMG]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "[IMG]", + "[/IMG]", + "" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 256, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..527c5b6d438556bcfe6d62bc72a8f195a6e3c9dc --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,228103 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + 
"global_step": 32580, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003069367710251688, + "grad_norm": 1.3492016792297363, + "learning_rate": 0.0, + "loss": 6.5185, + "step": 1 + }, + { + "epoch": 0.0006138735420503376, + "grad_norm": 1.4303781986236572, + "learning_rate": 1.0224948875255626e-07, + "loss": 6.5124, + "step": 2 + }, + { + "epoch": 0.0009208103130755065, + "grad_norm": 1.3981783390045166, + "learning_rate": 2.0449897750511251e-07, + "loss": 6.5204, + "step": 3 + }, + { + "epoch": 0.0012277470841006752, + "grad_norm": 1.3760672807693481, + "learning_rate": 3.0674846625766876e-07, + "loss": 6.502, + "step": 4 + }, + { + "epoch": 0.001534683855125844, + "grad_norm": 1.3704107999801636, + "learning_rate": 4.0899795501022503e-07, + "loss": 6.5021, + "step": 5 + }, + { + "epoch": 0.001841620626151013, + "grad_norm": 1.3109549283981323, + "learning_rate": 5.112474437627812e-07, + "loss": 6.521, + "step": 6 + }, + { + "epoch": 0.002148557397176182, + "grad_norm": 1.475183367729187, + "learning_rate": 6.134969325153375e-07, + "loss": 6.521, + "step": 7 + }, + { + "epoch": 0.0024554941682013503, + "grad_norm": 1.4563297033309937, + "learning_rate": 7.157464212678937e-07, + "loss": 6.5075, + "step": 8 + }, + { + "epoch": 0.0027624309392265192, + "grad_norm": 1.437183141708374, + "learning_rate": 8.179959100204501e-07, + "loss": 6.5135, + "step": 9 + }, + { + "epoch": 0.003069367710251688, + "grad_norm": 1.336928129196167, + "learning_rate": 9.202453987730062e-07, + "loss": 6.5138, + "step": 10 + }, + { + "epoch": 0.003376304481276857, + "grad_norm": 1.3220698833465576, + "learning_rate": 1.0224948875255625e-06, + "loss": 6.5187, + "step": 11 + }, + { + "epoch": 0.003683241252302026, + "grad_norm": 1.3990652561187744, + "learning_rate": 1.1247443762781187e-06, + "loss": 6.5129, + "step": 12 + }, + { + "epoch": 0.003990178023327195, + "grad_norm": 1.4394340515136719, + 
"learning_rate": 1.226993865030675e-06, + "loss": 6.5078, + "step": 13 + }, + { + "epoch": 0.004297114794352364, + "grad_norm": 1.3675259351730347, + "learning_rate": 1.3292433537832312e-06, + "loss": 6.5115, + "step": 14 + }, + { + "epoch": 0.004604051565377533, + "grad_norm": 1.3085063695907593, + "learning_rate": 1.4314928425357874e-06, + "loss": 6.5092, + "step": 15 + }, + { + "epoch": 0.004910988336402701, + "grad_norm": 1.4214227199554443, + "learning_rate": 1.5337423312883435e-06, + "loss": 6.5026, + "step": 16 + }, + { + "epoch": 0.0052179251074278695, + "grad_norm": 1.377146601676941, + "learning_rate": 1.6359918200409001e-06, + "loss": 6.4882, + "step": 17 + }, + { + "epoch": 0.0055248618784530384, + "grad_norm": 1.3461124897003174, + "learning_rate": 1.7382413087934563e-06, + "loss": 6.4935, + "step": 18 + }, + { + "epoch": 0.005831798649478207, + "grad_norm": 1.3161669969558716, + "learning_rate": 1.8404907975460124e-06, + "loss": 6.4795, + "step": 19 + }, + { + "epoch": 0.006138735420503376, + "grad_norm": 1.2915974855422974, + "learning_rate": 1.942740286298569e-06, + "loss": 6.4529, + "step": 20 + }, + { + "epoch": 0.006445672191528545, + "grad_norm": 1.2675414085388184, + "learning_rate": 2.044989775051125e-06, + "loss": 6.454, + "step": 21 + }, + { + "epoch": 0.006752608962553714, + "grad_norm": 1.2769283056259155, + "learning_rate": 2.147239263803681e-06, + "loss": 6.4574, + "step": 22 + }, + { + "epoch": 0.007059545733578883, + "grad_norm": 1.2556813955307007, + "learning_rate": 2.2494887525562373e-06, + "loss": 6.4486, + "step": 23 + }, + { + "epoch": 0.007366482504604052, + "grad_norm": 1.2158268690109253, + "learning_rate": 2.3517382413087935e-06, + "loss": 6.4357, + "step": 24 + }, + { + "epoch": 0.007673419275629221, + "grad_norm": 1.2383767366409302, + "learning_rate": 2.45398773006135e-06, + "loss": 6.4347, + "step": 25 + }, + { + "epoch": 0.00798035604665439, + "grad_norm": 1.2865383625030518, + "learning_rate": 2.5562372188139062e-06, + 
"loss": 6.3611, + "step": 26 + }, + { + "epoch": 0.008287292817679558, + "grad_norm": 1.1501989364624023, + "learning_rate": 2.6584867075664624e-06, + "loss": 6.3247, + "step": 27 + }, + { + "epoch": 0.008594229588704727, + "grad_norm": 1.0971378087997437, + "learning_rate": 2.7607361963190186e-06, + "loss": 6.3078, + "step": 28 + }, + { + "epoch": 0.008901166359729895, + "grad_norm": 1.1365599632263184, + "learning_rate": 2.8629856850715747e-06, + "loss": 6.3211, + "step": 29 + }, + { + "epoch": 0.009208103130755065, + "grad_norm": 1.1228944063186646, + "learning_rate": 2.965235173824131e-06, + "loss": 6.3185, + "step": 30 + }, + { + "epoch": 0.009515039901780233, + "grad_norm": 1.126287579536438, + "learning_rate": 3.067484662576687e-06, + "loss": 6.2845, + "step": 31 + }, + { + "epoch": 0.009821976672805401, + "grad_norm": 1.1070353984832764, + "learning_rate": 3.1697341513292436e-06, + "loss": 6.2855, + "step": 32 + }, + { + "epoch": 0.010128913443830571, + "grad_norm": 1.101291537284851, + "learning_rate": 3.2719836400818002e-06, + "loss": 6.2764, + "step": 33 + }, + { + "epoch": 0.010435850214855739, + "grad_norm": 1.0643113851547241, + "learning_rate": 3.374233128834356e-06, + "loss": 6.2363, + "step": 34 + }, + { + "epoch": 0.010742786985880909, + "grad_norm": 0.9714563488960266, + "learning_rate": 3.4764826175869125e-06, + "loss": 6.1771, + "step": 35 + }, + { + "epoch": 0.011049723756906077, + "grad_norm": 0.8998560309410095, + "learning_rate": 3.5787321063394683e-06, + "loss": 6.1202, + "step": 36 + }, + { + "epoch": 0.011356660527931247, + "grad_norm": 0.8481987714767456, + "learning_rate": 3.680981595092025e-06, + "loss": 6.0954, + "step": 37 + }, + { + "epoch": 0.011663597298956415, + "grad_norm": 0.8124909996986389, + "learning_rate": 3.783231083844581e-06, + "loss": 6.0832, + "step": 38 + }, + { + "epoch": 0.011970534069981584, + "grad_norm": 0.7968178391456604, + "learning_rate": 3.885480572597138e-06, + "loss": 6.0661, + "step": 39 + }, + { + 
"epoch": 0.012277470841006752, + "grad_norm": 0.7714207768440247, + "learning_rate": 3.987730061349693e-06, + "loss": 6.0385, + "step": 40 + }, + { + "epoch": 0.012584407612031922, + "grad_norm": 0.7436742782592773, + "learning_rate": 4.08997955010225e-06, + "loss": 6.0227, + "step": 41 + }, + { + "epoch": 0.01289134438305709, + "grad_norm": 0.7447277307510376, + "learning_rate": 4.192229038854806e-06, + "loss": 6.0208, + "step": 42 + }, + { + "epoch": 0.013198281154082258, + "grad_norm": 0.6983785629272461, + "learning_rate": 4.294478527607362e-06, + "loss": 6.0295, + "step": 43 + }, + { + "epoch": 0.013505217925107428, + "grad_norm": 0.6630908250808716, + "learning_rate": 4.3967280163599184e-06, + "loss": 6.004, + "step": 44 + }, + { + "epoch": 0.013812154696132596, + "grad_norm": 0.6481929421424866, + "learning_rate": 4.498977505112475e-06, + "loss": 5.9986, + "step": 45 + }, + { + "epoch": 0.014119091467157766, + "grad_norm": 0.7187685966491699, + "learning_rate": 4.601226993865031e-06, + "loss": 6.0008, + "step": 46 + }, + { + "epoch": 0.014426028238182934, + "grad_norm": 0.6550983190536499, + "learning_rate": 4.703476482617587e-06, + "loss": 5.9735, + "step": 47 + }, + { + "epoch": 0.014732965009208104, + "grad_norm": 0.6780675649642944, + "learning_rate": 4.805725971370143e-06, + "loss": 5.9568, + "step": 48 + }, + { + "epoch": 0.015039901780233272, + "grad_norm": 0.703427791595459, + "learning_rate": 4.9079754601227e-06, + "loss": 5.961, + "step": 49 + }, + { + "epoch": 0.015346838551258441, + "grad_norm": 0.6507543921470642, + "learning_rate": 5.0102249488752554e-06, + "loss": 5.9557, + "step": 50 + }, + { + "epoch": 0.01565377532228361, + "grad_norm": 0.5959481000900269, + "learning_rate": 5.1124744376278124e-06, + "loss": 5.9391, + "step": 51 + }, + { + "epoch": 0.01596071209330878, + "grad_norm": 0.5798730254173279, + "learning_rate": 5.214723926380368e-06, + "loss": 5.9488, + "step": 52 + }, + { + "epoch": 0.016267648864333947, + "grad_norm": 
0.5932896137237549, + "learning_rate": 5.316973415132925e-06, + "loss": 5.9176, + "step": 53 + }, + { + "epoch": 0.016574585635359115, + "grad_norm": 0.5772561430931091, + "learning_rate": 5.419222903885481e-06, + "loss": 5.9069, + "step": 54 + }, + { + "epoch": 0.016881522406384283, + "grad_norm": 0.5578178763389587, + "learning_rate": 5.521472392638037e-06, + "loss": 5.8924, + "step": 55 + }, + { + "epoch": 0.017188459177409455, + "grad_norm": 0.5458457469940186, + "learning_rate": 5.623721881390593e-06, + "loss": 5.9001, + "step": 56 + }, + { + "epoch": 0.017495395948434623, + "grad_norm": 0.5381231904029846, + "learning_rate": 5.7259713701431494e-06, + "loss": 5.8827, + "step": 57 + }, + { + "epoch": 0.01780233271945979, + "grad_norm": 0.540920615196228, + "learning_rate": 5.828220858895706e-06, + "loss": 5.8763, + "step": 58 + }, + { + "epoch": 0.01810926949048496, + "grad_norm": 0.5378615260124207, + "learning_rate": 5.930470347648262e-06, + "loss": 5.865, + "step": 59 + }, + { + "epoch": 0.01841620626151013, + "grad_norm": 0.5139282941818237, + "learning_rate": 6.032719836400819e-06, + "loss": 5.873, + "step": 60 + }, + { + "epoch": 0.0187231430325353, + "grad_norm": 0.5298904776573181, + "learning_rate": 6.134969325153374e-06, + "loss": 5.861, + "step": 61 + }, + { + "epoch": 0.019030079803560467, + "grad_norm": 0.503131628036499, + "learning_rate": 6.237218813905931e-06, + "loss": 5.844, + "step": 62 + }, + { + "epoch": 0.019337016574585635, + "grad_norm": 0.5133433938026428, + "learning_rate": 6.339468302658487e-06, + "loss": 5.8535, + "step": 63 + }, + { + "epoch": 0.019643953345610803, + "grad_norm": 0.4909187853336334, + "learning_rate": 6.4417177914110434e-06, + "loss": 5.8378, + "step": 64 + }, + { + "epoch": 0.019950890116635974, + "grad_norm": 0.6916642785072327, + "learning_rate": 6.5439672801636004e-06, + "loss": 5.8385, + "step": 65 + }, + { + "epoch": 0.020257826887661142, + "grad_norm": 0.4801484942436218, + "learning_rate": 
6.646216768916155e-06, + "loss": 5.8089, + "step": 66 + }, + { + "epoch": 0.02056476365868631, + "grad_norm": 0.47745251655578613, + "learning_rate": 6.748466257668712e-06, + "loss": 5.8119, + "step": 67 + }, + { + "epoch": 0.020871700429711478, + "grad_norm": 0.4693359136581421, + "learning_rate": 6.850715746421268e-06, + "loss": 5.8038, + "step": 68 + }, + { + "epoch": 0.02117863720073665, + "grad_norm": 0.46996453404426575, + "learning_rate": 6.952965235173825e-06, + "loss": 5.7966, + "step": 69 + }, + { + "epoch": 0.021485573971761818, + "grad_norm": 0.45779168605804443, + "learning_rate": 7.05521472392638e-06, + "loss": 5.7959, + "step": 70 + }, + { + "epoch": 0.021792510742786986, + "grad_norm": 0.49008259177207947, + "learning_rate": 7.1574642126789366e-06, + "loss": 5.7861, + "step": 71 + }, + { + "epoch": 0.022099447513812154, + "grad_norm": 0.44727766513824463, + "learning_rate": 7.259713701431494e-06, + "loss": 5.7716, + "step": 72 + }, + { + "epoch": 0.022406384284837322, + "grad_norm": 0.4392741918563843, + "learning_rate": 7.36196319018405e-06, + "loss": 5.7776, + "step": 73 + }, + { + "epoch": 0.022713321055862493, + "grad_norm": 0.43525391817092896, + "learning_rate": 7.464212678936605e-06, + "loss": 5.7687, + "step": 74 + }, + { + "epoch": 0.02302025782688766, + "grad_norm": 0.4370710253715515, + "learning_rate": 7.566462167689162e-06, + "loss": 5.7504, + "step": 75 + }, + { + "epoch": 0.02332719459791283, + "grad_norm": 0.4349770247936249, + "learning_rate": 7.668711656441718e-06, + "loss": 5.7425, + "step": 76 + }, + { + "epoch": 0.023634131368937997, + "grad_norm": 0.42710933089256287, + "learning_rate": 7.770961145194275e-06, + "loss": 5.7562, + "step": 77 + }, + { + "epoch": 0.02394106813996317, + "grad_norm": 0.42816224694252014, + "learning_rate": 7.87321063394683e-06, + "loss": 5.7301, + "step": 78 + }, + { + "epoch": 0.024248004910988337, + "grad_norm": 0.4183364510536194, + "learning_rate": 7.975460122699386e-06, + "loss": 5.7131, + 
"step": 79 + }, + { + "epoch": 0.024554941682013505, + "grad_norm": 0.4179428517818451, + "learning_rate": 8.077709611451943e-06, + "loss": 5.7057, + "step": 80 + }, + { + "epoch": 0.024861878453038673, + "grad_norm": 0.40880727767944336, + "learning_rate": 8.1799591002045e-06, + "loss": 5.7179, + "step": 81 + }, + { + "epoch": 0.025168815224063844, + "grad_norm": 0.40961235761642456, + "learning_rate": 8.282208588957055e-06, + "loss": 5.7008, + "step": 82 + }, + { + "epoch": 0.025475751995089013, + "grad_norm": 0.46789029240608215, + "learning_rate": 8.384458077709612e-06, + "loss": 5.7071, + "step": 83 + }, + { + "epoch": 0.02578268876611418, + "grad_norm": 0.4776248335838318, + "learning_rate": 8.486707566462168e-06, + "loss": 5.6829, + "step": 84 + }, + { + "epoch": 0.02608962553713935, + "grad_norm": 0.40660589933395386, + "learning_rate": 8.588957055214725e-06, + "loss": 5.6732, + "step": 85 + }, + { + "epoch": 0.026396562308164517, + "grad_norm": 0.3984324038028717, + "learning_rate": 8.69120654396728e-06, + "loss": 5.6777, + "step": 86 + }, + { + "epoch": 0.026703499079189688, + "grad_norm": 0.3972148597240448, + "learning_rate": 8.793456032719837e-06, + "loss": 5.6598, + "step": 87 + }, + { + "epoch": 0.027010435850214856, + "grad_norm": 0.3906182050704956, + "learning_rate": 8.895705521472392e-06, + "loss": 5.6468, + "step": 88 + }, + { + "epoch": 0.027317372621240024, + "grad_norm": 0.38598939776420593, + "learning_rate": 8.99795501022495e-06, + "loss": 5.6452, + "step": 89 + }, + { + "epoch": 0.027624309392265192, + "grad_norm": 0.405943363904953, + "learning_rate": 9.100204498977506e-06, + "loss": 5.6408, + "step": 90 + }, + { + "epoch": 0.027931246163290364, + "grad_norm": 0.3859459161758423, + "learning_rate": 9.202453987730062e-06, + "loss": 5.613, + "step": 91 + }, + { + "epoch": 0.028238182934315532, + "grad_norm": 0.3773545026779175, + "learning_rate": 9.304703476482619e-06, + "loss": 5.6277, + "step": 92 + }, + { + "epoch": 0.0285451197053407, + 
"grad_norm": 0.36915943026542664, + "learning_rate": 9.406952965235174e-06, + "loss": 5.618, + "step": 93 + }, + { + "epoch": 0.028852056476365868, + "grad_norm": 0.3732316792011261, + "learning_rate": 9.509202453987731e-06, + "loss": 5.6066, + "step": 94 + }, + { + "epoch": 0.029158993247391036, + "grad_norm": 0.3670802414417267, + "learning_rate": 9.611451942740286e-06, + "loss": 5.6189, + "step": 95 + }, + { + "epoch": 0.029465930018416207, + "grad_norm": 0.3672202229499817, + "learning_rate": 9.713701431492843e-06, + "loss": 5.6046, + "step": 96 + }, + { + "epoch": 0.029772866789441375, + "grad_norm": 0.3624509871006012, + "learning_rate": 9.8159509202454e-06, + "loss": 5.585, + "step": 97 + }, + { + "epoch": 0.030079803560466543, + "grad_norm": 0.36265870928764343, + "learning_rate": 9.918200408997956e-06, + "loss": 5.5867, + "step": 98 + }, + { + "epoch": 0.03038674033149171, + "grad_norm": 0.3606979548931122, + "learning_rate": 1.0020449897750511e-05, + "loss": 5.5658, + "step": 99 + }, + { + "epoch": 0.030693677102516883, + "grad_norm": 0.36800363659858704, + "learning_rate": 1.0122699386503068e-05, + "loss": 5.5494, + "step": 100 + }, + { + "epoch": 0.03100061387354205, + "grad_norm": 0.3641016483306885, + "learning_rate": 1.0224948875255625e-05, + "loss": 5.5553, + "step": 101 + }, + { + "epoch": 0.03130755064456722, + "grad_norm": 0.36807990074157715, + "learning_rate": 1.032719836400818e-05, + "loss": 5.5315, + "step": 102 + }, + { + "epoch": 0.03161448741559239, + "grad_norm": 0.37071728706359863, + "learning_rate": 1.0429447852760736e-05, + "loss": 5.522, + "step": 103 + }, + { + "epoch": 0.03192142418661756, + "grad_norm": 0.3549076020717621, + "learning_rate": 1.0531697341513293e-05, + "loss": 5.5354, + "step": 104 + }, + { + "epoch": 0.03222836095764273, + "grad_norm": 0.3589537441730499, + "learning_rate": 1.063394683026585e-05, + "loss": 5.534, + "step": 105 + }, + { + "epoch": 0.032535297728667895, + "grad_norm": 0.4341397285461426, + 
"learning_rate": 1.0736196319018407e-05, + "loss": 5.5088, + "step": 106 + }, + { + "epoch": 0.03284223449969306, + "grad_norm": 0.37220680713653564, + "learning_rate": 1.0838445807770962e-05, + "loss": 5.5213, + "step": 107 + }, + { + "epoch": 0.03314917127071823, + "grad_norm": 0.3776145875453949, + "learning_rate": 1.0940695296523517e-05, + "loss": 5.4955, + "step": 108 + }, + { + "epoch": 0.0334561080417434, + "grad_norm": 0.38651829957962036, + "learning_rate": 1.1042944785276074e-05, + "loss": 5.4916, + "step": 109 + }, + { + "epoch": 0.03376304481276857, + "grad_norm": 0.3749970495700836, + "learning_rate": 1.1145194274028631e-05, + "loss": 5.4686, + "step": 110 + }, + { + "epoch": 0.03406998158379374, + "grad_norm": 0.38184404373168945, + "learning_rate": 1.1247443762781187e-05, + "loss": 5.4694, + "step": 111 + }, + { + "epoch": 0.03437691835481891, + "grad_norm": 0.38783952593803406, + "learning_rate": 1.1349693251533742e-05, + "loss": 5.4447, + "step": 112 + }, + { + "epoch": 0.03468385512584408, + "grad_norm": 0.369125097990036, + "learning_rate": 1.1451942740286299e-05, + "loss": 5.4506, + "step": 113 + }, + { + "epoch": 0.034990791896869246, + "grad_norm": 0.3773012161254883, + "learning_rate": 1.1554192229038856e-05, + "loss": 5.4637, + "step": 114 + }, + { + "epoch": 0.035297728667894414, + "grad_norm": 0.47702446579933167, + "learning_rate": 1.1656441717791411e-05, + "loss": 5.4487, + "step": 115 + }, + { + "epoch": 0.03560466543891958, + "grad_norm": 0.5288241505622864, + "learning_rate": 1.1758691206543968e-05, + "loss": 5.4216, + "step": 116 + }, + { + "epoch": 0.03591160220994475, + "grad_norm": 0.49916699528694153, + "learning_rate": 1.1860940695296524e-05, + "loss": 5.4055, + "step": 117 + }, + { + "epoch": 0.03621853898096992, + "grad_norm": 0.5027921795845032, + "learning_rate": 1.196319018404908e-05, + "loss": 5.4141, + "step": 118 + }, + { + "epoch": 0.036525475751995086, + "grad_norm": 0.5069209933280945, + "learning_rate": 
1.2065439672801638e-05, + "loss": 5.4277, + "step": 119 + }, + { + "epoch": 0.03683241252302026, + "grad_norm": 0.5208525657653809, + "learning_rate": 1.2167689161554193e-05, + "loss": 5.4023, + "step": 120 + }, + { + "epoch": 0.03713934929404543, + "grad_norm": 0.7059593796730042, + "learning_rate": 1.2269938650306748e-05, + "loss": 5.3797, + "step": 121 + }, + { + "epoch": 0.0374462860650706, + "grad_norm": 0.71112060546875, + "learning_rate": 1.2372188139059305e-05, + "loss": 5.3619, + "step": 122 + }, + { + "epoch": 0.037753222836095765, + "grad_norm": 0.5095361471176147, + "learning_rate": 1.2474437627811862e-05, + "loss": 5.3667, + "step": 123 + }, + { + "epoch": 0.03806015960712093, + "grad_norm": 0.986062228679657, + "learning_rate": 1.2576687116564418e-05, + "loss": 5.3459, + "step": 124 + }, + { + "epoch": 0.0383670963781461, + "grad_norm": 0.693392813205719, + "learning_rate": 1.2678936605316975e-05, + "loss": 5.3165, + "step": 125 + }, + { + "epoch": 0.03867403314917127, + "grad_norm": 0.7835625410079956, + "learning_rate": 1.278118609406953e-05, + "loss": 5.3205, + "step": 126 + }, + { + "epoch": 0.03898096992019644, + "grad_norm": 0.6314569711685181, + "learning_rate": 1.2883435582822087e-05, + "loss": 5.3287, + "step": 127 + }, + { + "epoch": 0.039287906691221605, + "grad_norm": 0.9079526662826538, + "learning_rate": 1.2985685071574644e-05, + "loss": 5.2935, + "step": 128 + }, + { + "epoch": 0.03959484346224678, + "grad_norm": 0.6998131275177002, + "learning_rate": 1.3087934560327201e-05, + "loss": 5.315, + "step": 129 + }, + { + "epoch": 0.03990178023327195, + "grad_norm": 0.7570182085037231, + "learning_rate": 1.3190184049079754e-05, + "loss": 5.293, + "step": 130 + }, + { + "epoch": 0.040208717004297116, + "grad_norm": 0.6972737908363342, + "learning_rate": 1.329243353783231e-05, + "loss": 5.2863, + "step": 131 + }, + { + "epoch": 0.040515653775322284, + "grad_norm": 0.8841190934181213, + "learning_rate": 1.3394683026584867e-05, + "loss": 5.2518, 
+ "step": 132 + }, + { + "epoch": 0.04082259054634745, + "grad_norm": 0.6792641282081604, + "learning_rate": 1.3496932515337424e-05, + "loss": 5.2386, + "step": 133 + }, + { + "epoch": 0.04112952731737262, + "grad_norm": 0.9234145879745483, + "learning_rate": 1.359918200408998e-05, + "loss": 5.2418, + "step": 134 + }, + { + "epoch": 0.04143646408839779, + "grad_norm": 1.1438226699829102, + "learning_rate": 1.3701431492842536e-05, + "loss": 5.2298, + "step": 135 + }, + { + "epoch": 0.041743400859422956, + "grad_norm": 0.910861074924469, + "learning_rate": 1.3803680981595093e-05, + "loss": 5.2437, + "step": 136 + }, + { + "epoch": 0.042050337630448124, + "grad_norm": 0.8995844721794128, + "learning_rate": 1.390593047034765e-05, + "loss": 5.2456, + "step": 137 + }, + { + "epoch": 0.0423572744014733, + "grad_norm": 0.8543404936790466, + "learning_rate": 1.4008179959100204e-05, + "loss": 5.1888, + "step": 138 + }, + { + "epoch": 0.04266421117249847, + "grad_norm": 0.7565917372703552, + "learning_rate": 1.411042944785276e-05, + "loss": 5.1939, + "step": 139 + }, + { + "epoch": 0.042971147943523635, + "grad_norm": 0.7103878259658813, + "learning_rate": 1.4212678936605318e-05, + "loss": 5.1693, + "step": 140 + }, + { + "epoch": 0.0432780847145488, + "grad_norm": 1.008686900138855, + "learning_rate": 1.4314928425357873e-05, + "loss": 5.1467, + "step": 141 + }, + { + "epoch": 0.04358502148557397, + "grad_norm": 1.1134421825408936, + "learning_rate": 1.441717791411043e-05, + "loss": 5.1695, + "step": 142 + }, + { + "epoch": 0.04389195825659914, + "grad_norm": 0.7418283820152283, + "learning_rate": 1.4519427402862987e-05, + "loss": 5.1556, + "step": 143 + }, + { + "epoch": 0.04419889502762431, + "grad_norm": 1.3332276344299316, + "learning_rate": 1.4621676891615542e-05, + "loss": 5.1736, + "step": 144 + }, + { + "epoch": 0.044505831798649476, + "grad_norm": 0.99709153175354, + "learning_rate": 1.47239263803681e-05, + "loss": 5.1326, + "step": 145 + }, + { + "epoch": 
0.044812768569674644, + "grad_norm": 2.0185158252716064, + "learning_rate": 1.4826175869120657e-05, + "loss": 5.1075, + "step": 146 + }, + { + "epoch": 0.04511970534069982, + "grad_norm": 0.9810693264007568, + "learning_rate": 1.492842535787321e-05, + "loss": 5.1181, + "step": 147 + }, + { + "epoch": 0.04542664211172499, + "grad_norm": 1.3122087717056274, + "learning_rate": 1.5030674846625767e-05, + "loss": 5.1104, + "step": 148 + }, + { + "epoch": 0.045733578882750155, + "grad_norm": 1.230662226676941, + "learning_rate": 1.5132924335378324e-05, + "loss": 5.0721, + "step": 149 + }, + { + "epoch": 0.04604051565377532, + "grad_norm": 0.9584419131278992, + "learning_rate": 1.523517382413088e-05, + "loss": 5.0574, + "step": 150 + }, + { + "epoch": 0.04634745242480049, + "grad_norm": 1.3933353424072266, + "learning_rate": 1.5337423312883436e-05, + "loss": 5.0468, + "step": 151 + }, + { + "epoch": 0.04665438919582566, + "grad_norm": 1.2336134910583496, + "learning_rate": 1.5439672801635993e-05, + "loss": 5.0596, + "step": 152 + }, + { + "epoch": 0.04696132596685083, + "grad_norm": 1.3005256652832031, + "learning_rate": 1.554192229038855e-05, + "loss": 5.0236, + "step": 153 + }, + { + "epoch": 0.047268262737875995, + "grad_norm": 1.2528692483901978, + "learning_rate": 1.5644171779141108e-05, + "loss": 5.0269, + "step": 154 + }, + { + "epoch": 0.04757519950890117, + "grad_norm": 1.0448148250579834, + "learning_rate": 1.574642126789366e-05, + "loss": 5.0338, + "step": 155 + }, + { + "epoch": 0.04788213627992634, + "grad_norm": 1.2372045516967773, + "learning_rate": 1.5848670756646218e-05, + "loss": 4.9544, + "step": 156 + }, + { + "epoch": 0.048189073050951506, + "grad_norm": 1.2700645923614502, + "learning_rate": 1.5950920245398772e-05, + "loss": 4.9723, + "step": 157 + }, + { + "epoch": 0.048496009821976674, + "grad_norm": 1.1283228397369385, + "learning_rate": 1.605316973415133e-05, + "loss": 4.9801, + "step": 158 + }, + { + "epoch": 0.04880294659300184, + "grad_norm": 
1.5563665628433228, + "learning_rate": 1.6155419222903886e-05, + "loss": 4.9118, + "step": 159 + }, + { + "epoch": 0.04910988336402701, + "grad_norm": 1.3759487867355347, + "learning_rate": 1.6257668711656443e-05, + "loss": 4.9552, + "step": 160 + }, + { + "epoch": 0.04941682013505218, + "grad_norm": 1.2167878150939941, + "learning_rate": 1.6359918200409e-05, + "loss": 4.9186, + "step": 161 + }, + { + "epoch": 0.049723756906077346, + "grad_norm": 1.6424930095672607, + "learning_rate": 1.6462167689161557e-05, + "loss": 4.9143, + "step": 162 + }, + { + "epoch": 0.050030693677102514, + "grad_norm": 1.0009948015213013, + "learning_rate": 1.656441717791411e-05, + "loss": 4.8615, + "step": 163 + }, + { + "epoch": 0.05033763044812769, + "grad_norm": 1.8803274631500244, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.8558, + "step": 164 + }, + { + "epoch": 0.05064456721915286, + "grad_norm": 1.1819735765457153, + "learning_rate": 1.6768916155419224e-05, + "loss": 4.8453, + "step": 165 + }, + { + "epoch": 0.050951503990178025, + "grad_norm": 1.9724273681640625, + "learning_rate": 1.6871165644171778e-05, + "loss": 4.8573, + "step": 166 + }, + { + "epoch": 0.05125844076120319, + "grad_norm": 1.4624557495117188, + "learning_rate": 1.6973415132924335e-05, + "loss": 4.8494, + "step": 167 + }, + { + "epoch": 0.05156537753222836, + "grad_norm": 1.4750267267227173, + "learning_rate": 1.7075664621676892e-05, + "loss": 4.8296, + "step": 168 + }, + { + "epoch": 0.05187231430325353, + "grad_norm": 1.3206923007965088, + "learning_rate": 1.717791411042945e-05, + "loss": 4.7834, + "step": 169 + }, + { + "epoch": 0.0521792510742787, + "grad_norm": 1.4332681894302368, + "learning_rate": 1.7280163599182006e-05, + "loss": 4.8008, + "step": 170 + }, + { + "epoch": 0.052486187845303865, + "grad_norm": 1.612804651260376, + "learning_rate": 1.738241308793456e-05, + "loss": 4.7885, + "step": 171 + }, + { + "epoch": 0.05279312461632903, + "grad_norm": 1.3880311250686646, + "learning_rate": 
1.7484662576687117e-05, + "loss": 4.8034, + "step": 172 + }, + { + "epoch": 0.05310006138735421, + "grad_norm": 1.7550631761550903, + "learning_rate": 1.7586912065439674e-05, + "loss": 4.7568, + "step": 173 + }, + { + "epoch": 0.053406998158379376, + "grad_norm": 1.653678297996521, + "learning_rate": 1.768916155419223e-05, + "loss": 4.7294, + "step": 174 + }, + { + "epoch": 0.053713934929404544, + "grad_norm": 1.6094826459884644, + "learning_rate": 1.7791411042944784e-05, + "loss": 4.7409, + "step": 175 + }, + { + "epoch": 0.05402087170042971, + "grad_norm": 1.7453033924102783, + "learning_rate": 1.789366053169734e-05, + "loss": 4.7191, + "step": 176 + }, + { + "epoch": 0.05432780847145488, + "grad_norm": 1.3073794841766357, + "learning_rate": 1.79959100204499e-05, + "loss": 4.7347, + "step": 177 + }, + { + "epoch": 0.05463474524248005, + "grad_norm": 2.096515655517578, + "learning_rate": 1.8098159509202455e-05, + "loss": 4.7396, + "step": 178 + }, + { + "epoch": 0.054941682013505216, + "grad_norm": 1.3826024532318115, + "learning_rate": 1.8200408997955012e-05, + "loss": 4.6988, + "step": 179 + }, + { + "epoch": 0.055248618784530384, + "grad_norm": 1.9290310144424438, + "learning_rate": 1.8302658486707566e-05, + "loss": 4.6653, + "step": 180 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.7404149770736694, + "learning_rate": 1.8404907975460123e-05, + "loss": 4.7102, + "step": 181 + }, + { + "epoch": 0.05586249232658073, + "grad_norm": 1.7535779476165771, + "learning_rate": 1.850715746421268e-05, + "loss": 4.7124, + "step": 182 + }, + { + "epoch": 0.056169429097605895, + "grad_norm": 1.7792351245880127, + "learning_rate": 1.8609406952965237e-05, + "loss": 4.6969, + "step": 183 + }, + { + "epoch": 0.056476365868631064, + "grad_norm": 2.048332452774048, + "learning_rate": 1.8711656441717794e-05, + "loss": 4.6134, + "step": 184 + }, + { + "epoch": 0.05678330263965623, + "grad_norm": 1.9558366537094116, + "learning_rate": 1.8813905930470348e-05, + "loss": 
4.6739, + "step": 185 + }, + { + "epoch": 0.0570902394106814, + "grad_norm": 2.5299644470214844, + "learning_rate": 1.8916155419222905e-05, + "loss": 4.6248, + "step": 186 + }, + { + "epoch": 0.05739717618170657, + "grad_norm": 2.143704891204834, + "learning_rate": 1.9018404907975462e-05, + "loss": 4.6664, + "step": 187 + }, + { + "epoch": 0.057704112952731736, + "grad_norm": 1.925010323524475, + "learning_rate": 1.9120654396728015e-05, + "loss": 4.5657, + "step": 188 + }, + { + "epoch": 0.058011049723756904, + "grad_norm": 1.8223596811294556, + "learning_rate": 1.9222903885480572e-05, + "loss": 4.6124, + "step": 189 + }, + { + "epoch": 0.05831798649478207, + "grad_norm": 1.9519827365875244, + "learning_rate": 1.932515337423313e-05, + "loss": 4.5937, + "step": 190 + }, + { + "epoch": 0.05862492326580725, + "grad_norm": 2.062534809112549, + "learning_rate": 1.9427402862985686e-05, + "loss": 4.6023, + "step": 191 + }, + { + "epoch": 0.058931860036832415, + "grad_norm": 1.8512892723083496, + "learning_rate": 1.9529652351738243e-05, + "loss": 4.5709, + "step": 192 + }, + { + "epoch": 0.05923879680785758, + "grad_norm": 2.7771248817443848, + "learning_rate": 1.96319018404908e-05, + "loss": 4.5902, + "step": 193 + }, + { + "epoch": 0.05954573357888275, + "grad_norm": 1.8911874294281006, + "learning_rate": 1.9734151329243354e-05, + "loss": 4.4973, + "step": 194 + }, + { + "epoch": 0.05985267034990792, + "grad_norm": 2.261096715927124, + "learning_rate": 1.983640081799591e-05, + "loss": 4.5343, + "step": 195 + }, + { + "epoch": 0.06015960712093309, + "grad_norm": 1.833983302116394, + "learning_rate": 1.9938650306748465e-05, + "loss": 4.5604, + "step": 196 + }, + { + "epoch": 0.060466543891958255, + "grad_norm": 2.6909141540527344, + "learning_rate": 2.0040899795501022e-05, + "loss": 4.5411, + "step": 197 + }, + { + "epoch": 0.06077348066298342, + "grad_norm": 1.8085883855819702, + "learning_rate": 2.014314928425358e-05, + "loss": 4.5127, + "step": 198 + }, + { + "epoch": 
0.06108041743400859, + "grad_norm": 3.082063913345337, + "learning_rate": 2.0245398773006136e-05, + "loss": 4.5055, + "step": 199 + }, + { + "epoch": 0.061387354205033766, + "grad_norm": 1.6942392587661743, + "learning_rate": 2.0347648261758693e-05, + "loss": 4.4852, + "step": 200 + }, + { + "epoch": 0.061694290976058934, + "grad_norm": 2.428569793701172, + "learning_rate": 2.044989775051125e-05, + "loss": 4.4876, + "step": 201 + }, + { + "epoch": 0.0620012277470841, + "grad_norm": 2.1669068336486816, + "learning_rate": 2.0552147239263807e-05, + "loss": 4.5156, + "step": 202 + }, + { + "epoch": 0.06230816451810927, + "grad_norm": 1.8558237552642822, + "learning_rate": 2.065439672801636e-05, + "loss": 4.495, + "step": 203 + }, + { + "epoch": 0.06261510128913444, + "grad_norm": 2.86224627494812, + "learning_rate": 2.0756646216768917e-05, + "loss": 4.4881, + "step": 204 + }, + { + "epoch": 0.06292203806015961, + "grad_norm": 2.263230562210083, + "learning_rate": 2.085889570552147e-05, + "loss": 4.4349, + "step": 205 + }, + { + "epoch": 0.06322897483118478, + "grad_norm": 2.533039093017578, + "learning_rate": 2.0961145194274028e-05, + "loss": 4.4921, + "step": 206 + }, + { + "epoch": 0.06353591160220995, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.1063394683026585e-05, + "loss": 4.4581, + "step": 207 + }, + { + "epoch": 0.06384284837323512, + "grad_norm": 1.9801981449127197, + "learning_rate": 2.1165644171779142e-05, + "loss": 4.4646, + "step": 208 + }, + { + "epoch": 0.06414978514426029, + "grad_norm": 2.8499860763549805, + "learning_rate": 2.12678936605317e-05, + "loss": 4.3913, + "step": 209 + }, + { + "epoch": 0.06445672191528545, + "grad_norm": 1.8176993131637573, + "learning_rate": 2.1370143149284256e-05, + "loss": 4.4414, + "step": 210 + }, + { + "epoch": 0.06476365868631062, + "grad_norm": 3.1497061252593994, + "learning_rate": 2.1472392638036813e-05, + "loss": 4.4164, + "step": 211 + }, + { + "epoch": 0.06507059545733579, + "grad_norm": 
2.0509049892425537, + "learning_rate": 2.1574642126789367e-05, + "loss": 4.4198, + "step": 212 + }, + { + "epoch": 0.06537753222836096, + "grad_norm": 2.5346014499664307, + "learning_rate": 2.1676891615541924e-05, + "loss": 4.3628, + "step": 213 + }, + { + "epoch": 0.06568446899938613, + "grad_norm": 2.281947135925293, + "learning_rate": 2.1779141104294477e-05, + "loss": 4.3824, + "step": 214 + }, + { + "epoch": 0.0659914057704113, + "grad_norm": 2.9005074501037598, + "learning_rate": 2.1881390593047034e-05, + "loss": 4.4227, + "step": 215 + }, + { + "epoch": 0.06629834254143646, + "grad_norm": 2.5869741439819336, + "learning_rate": 2.198364008179959e-05, + "loss": 4.4231, + "step": 216 + }, + { + "epoch": 0.06660527931246163, + "grad_norm": 2.339655637741089, + "learning_rate": 2.208588957055215e-05, + "loss": 4.3901, + "step": 217 + }, + { + "epoch": 0.0669122160834868, + "grad_norm": 2.430664539337158, + "learning_rate": 2.2188139059304705e-05, + "loss": 4.3487, + "step": 218 + }, + { + "epoch": 0.06721915285451197, + "grad_norm": 2.1791040897369385, + "learning_rate": 2.2290388548057262e-05, + "loss": 4.3404, + "step": 219 + }, + { + "epoch": 0.06752608962553713, + "grad_norm": 2.7054920196533203, + "learning_rate": 2.239263803680982e-05, + "loss": 4.4186, + "step": 220 + }, + { + "epoch": 0.0678330263965623, + "grad_norm": 2.516566514968872, + "learning_rate": 2.2494887525562373e-05, + "loss": 4.4102, + "step": 221 + }, + { + "epoch": 0.06813996316758748, + "grad_norm": 2.3522324562072754, + "learning_rate": 2.259713701431493e-05, + "loss": 4.4062, + "step": 222 + }, + { + "epoch": 0.06844689993861265, + "grad_norm": 2.557600259780884, + "learning_rate": 2.2699386503067484e-05, + "loss": 4.3711, + "step": 223 + }, + { + "epoch": 0.06875383670963782, + "grad_norm": 2.0590531826019287, + "learning_rate": 2.280163599182004e-05, + "loss": 4.3546, + "step": 224 + }, + { + "epoch": 0.06906077348066299, + "grad_norm": 4.704878330230713, + "learning_rate": 
2.2903885480572598e-05, + "loss": 4.39, + "step": 225 + }, + { + "epoch": 0.06936771025168816, + "grad_norm": 2.237440347671509, + "learning_rate": 2.3006134969325155e-05, + "loss": 4.3425, + "step": 226 + }, + { + "epoch": 0.06967464702271332, + "grad_norm": 3.9394450187683105, + "learning_rate": 2.3108384458077712e-05, + "loss": 4.3641, + "step": 227 + }, + { + "epoch": 0.06998158379373849, + "grad_norm": 2.4857213497161865, + "learning_rate": 2.321063394683027e-05, + "loss": 4.3435, + "step": 228 + }, + { + "epoch": 0.07028852056476366, + "grad_norm": 2.893437147140503, + "learning_rate": 2.3312883435582822e-05, + "loss": 4.329, + "step": 229 + }, + { + "epoch": 0.07059545733578883, + "grad_norm": 2.6498284339904785, + "learning_rate": 2.341513292433538e-05, + "loss": 4.3058, + "step": 230 + }, + { + "epoch": 0.070902394106814, + "grad_norm": 2.4182214736938477, + "learning_rate": 2.3517382413087936e-05, + "loss": 4.3147, + "step": 231 + }, + { + "epoch": 0.07120933087783916, + "grad_norm": 2.532050371170044, + "learning_rate": 2.361963190184049e-05, + "loss": 4.3388, + "step": 232 + }, + { + "epoch": 0.07151626764886433, + "grad_norm": 2.5818533897399902, + "learning_rate": 2.3721881390593047e-05, + "loss": 4.3023, + "step": 233 + }, + { + "epoch": 0.0718232044198895, + "grad_norm": 2.1860098838806152, + "learning_rate": 2.3824130879345604e-05, + "loss": 4.2571, + "step": 234 + }, + { + "epoch": 0.07213014119091467, + "grad_norm": 3.5780131816864014, + "learning_rate": 2.392638036809816e-05, + "loss": 4.3336, + "step": 235 + }, + { + "epoch": 0.07243707796193984, + "grad_norm": 2.24653697013855, + "learning_rate": 2.4028629856850718e-05, + "loss": 4.3013, + "step": 236 + }, + { + "epoch": 0.072744014732965, + "grad_norm": 3.59663987159729, + "learning_rate": 2.4130879345603275e-05, + "loss": 4.3248, + "step": 237 + }, + { + "epoch": 0.07305095150399017, + "grad_norm": 2.818321943283081, + "learning_rate": 2.423312883435583e-05, + "loss": 4.2876, + "step": 238 + 
}, + { + "epoch": 0.07335788827501534, + "grad_norm": 2.457371950149536, + "learning_rate": 2.4335378323108386e-05, + "loss": 4.2584, + "step": 239 + }, + { + "epoch": 0.07366482504604052, + "grad_norm": 3.6243598461151123, + "learning_rate": 2.4437627811860943e-05, + "loss": 4.2786, + "step": 240 + }, + { + "epoch": 0.07397176181706569, + "grad_norm": 2.113060474395752, + "learning_rate": 2.4539877300613496e-05, + "loss": 4.2071, + "step": 241 + }, + { + "epoch": 0.07427869858809086, + "grad_norm": 5.355374813079834, + "learning_rate": 2.4642126789366053e-05, + "loss": 4.2871, + "step": 242 + }, + { + "epoch": 0.07458563535911603, + "grad_norm": 2.4509847164154053, + "learning_rate": 2.474437627811861e-05, + "loss": 4.2073, + "step": 243 + }, + { + "epoch": 0.0748925721301412, + "grad_norm": 3.313793659210205, + "learning_rate": 2.4846625766871167e-05, + "loss": 4.2938, + "step": 244 + }, + { + "epoch": 0.07519950890116636, + "grad_norm": 2.731903553009033, + "learning_rate": 2.4948875255623724e-05, + "loss": 4.2023, + "step": 245 + }, + { + "epoch": 0.07550644567219153, + "grad_norm": 2.6218042373657227, + "learning_rate": 2.505112474437628e-05, + "loss": 4.2492, + "step": 246 + }, + { + "epoch": 0.0758133824432167, + "grad_norm": 3.2865426540374756, + "learning_rate": 2.5153374233128835e-05, + "loss": 4.2358, + "step": 247 + }, + { + "epoch": 0.07612031921424187, + "grad_norm": 2.21870756149292, + "learning_rate": 2.5255623721881395e-05, + "loss": 4.1989, + "step": 248 + }, + { + "epoch": 0.07642725598526703, + "grad_norm": 4.095842361450195, + "learning_rate": 2.535787321063395e-05, + "loss": 4.2484, + "step": 249 + }, + { + "epoch": 0.0767341927562922, + "grad_norm": 2.21420955657959, + "learning_rate": 2.5460122699386503e-05, + "loss": 4.1985, + "step": 250 + }, + { + "epoch": 0.07704112952731737, + "grad_norm": 3.011272668838501, + "learning_rate": 2.556237218813906e-05, + "loss": 4.2182, + "step": 251 + }, + { + "epoch": 0.07734806629834254, + "grad_norm": 
2.930999279022217, + "learning_rate": 2.5664621676891613e-05, + "loss": 4.1985, + "step": 252 + }, + { + "epoch": 0.0776550030693677, + "grad_norm": 2.8528032302856445, + "learning_rate": 2.5766871165644174e-05, + "loss": 4.1859, + "step": 253 + }, + { + "epoch": 0.07796193984039287, + "grad_norm": 3.215587854385376, + "learning_rate": 2.5869120654396727e-05, + "loss": 4.2416, + "step": 254 + }, + { + "epoch": 0.07826887661141804, + "grad_norm": 3.1349990367889404, + "learning_rate": 2.5971370143149288e-05, + "loss": 4.2204, + "step": 255 + }, + { + "epoch": 0.07857581338244321, + "grad_norm": 3.146942377090454, + "learning_rate": 2.607361963190184e-05, + "loss": 4.17, + "step": 256 + }, + { + "epoch": 0.07888275015346839, + "grad_norm": 2.2611942291259766, + "learning_rate": 2.6175869120654402e-05, + "loss": 4.191, + "step": 257 + }, + { + "epoch": 0.07918968692449356, + "grad_norm": 3.434574604034424, + "learning_rate": 2.6278118609406955e-05, + "loss": 4.1854, + "step": 258 + }, + { + "epoch": 0.07949662369551873, + "grad_norm": 2.3132400512695312, + "learning_rate": 2.638036809815951e-05, + "loss": 4.233, + "step": 259 + }, + { + "epoch": 0.0798035604665439, + "grad_norm": 3.2676596641540527, + "learning_rate": 2.6482617586912066e-05, + "loss": 4.1586, + "step": 260 + }, + { + "epoch": 0.08011049723756906, + "grad_norm": 2.6182920932769775, + "learning_rate": 2.658486707566462e-05, + "loss": 4.164, + "step": 261 + }, + { + "epoch": 0.08041743400859423, + "grad_norm": 2.872018814086914, + "learning_rate": 2.668711656441718e-05, + "loss": 4.1642, + "step": 262 + }, + { + "epoch": 0.0807243707796194, + "grad_norm": 3.147237539291382, + "learning_rate": 2.6789366053169734e-05, + "loss": 4.147, + "step": 263 + }, + { + "epoch": 0.08103130755064457, + "grad_norm": 2.363360643386841, + "learning_rate": 2.6891615541922294e-05, + "loss": 4.1388, + "step": 264 + }, + { + "epoch": 0.08133824432166974, + "grad_norm": 3.364442825317383, + "learning_rate": 
2.6993865030674848e-05, + "loss": 4.1678, + "step": 265 + }, + { + "epoch": 0.0816451810926949, + "grad_norm": 2.393705368041992, + "learning_rate": 2.7096114519427408e-05, + "loss": 4.1626, + "step": 266 + }, + { + "epoch": 0.08195211786372007, + "grad_norm": 3.8512558937072754, + "learning_rate": 2.719836400817996e-05, + "loss": 4.1613, + "step": 267 + }, + { + "epoch": 0.08225905463474524, + "grad_norm": 3.0992584228515625, + "learning_rate": 2.7300613496932515e-05, + "loss": 4.1486, + "step": 268 + }, + { + "epoch": 0.08256599140577041, + "grad_norm": 3.481079578399658, + "learning_rate": 2.7402862985685072e-05, + "loss": 4.1772, + "step": 269 + }, + { + "epoch": 0.08287292817679558, + "grad_norm": 3.2167513370513916, + "learning_rate": 2.7505112474437626e-05, + "loss": 4.1253, + "step": 270 + }, + { + "epoch": 0.08317986494782074, + "grad_norm": 2.9698429107666016, + "learning_rate": 2.7607361963190186e-05, + "loss": 4.0897, + "step": 271 + }, + { + "epoch": 0.08348680171884591, + "grad_norm": 3.2549962997436523, + "learning_rate": 2.770961145194274e-05, + "loss": 4.0851, + "step": 272 + }, + { + "epoch": 0.08379373848987108, + "grad_norm": 3.089301824569702, + "learning_rate": 2.78118609406953e-05, + "loss": 4.1378, + "step": 273 + }, + { + "epoch": 0.08410067526089625, + "grad_norm": 3.1799745559692383, + "learning_rate": 2.7914110429447854e-05, + "loss": 4.159, + "step": 274 + }, + { + "epoch": 0.08440761203192143, + "grad_norm": 2.7577199935913086, + "learning_rate": 2.8016359918200408e-05, + "loss": 4.0524, + "step": 275 + }, + { + "epoch": 0.0847145488029466, + "grad_norm": 3.709740161895752, + "learning_rate": 2.8118609406952968e-05, + "loss": 4.0877, + "step": 276 + }, + { + "epoch": 0.08502148557397177, + "grad_norm": 2.930482864379883, + "learning_rate": 2.822085889570552e-05, + "loss": 4.0408, + "step": 277 + }, + { + "epoch": 0.08532842234499693, + "grad_norm": 3.8216278553009033, + "learning_rate": 2.832310838445808e-05, + "loss": 4.0915, + 
"step": 278 + }, + { + "epoch": 0.0856353591160221, + "grad_norm": 2.7614903450012207, + "learning_rate": 2.8425357873210636e-05, + "loss": 4.0793, + "step": 279 + }, + { + "epoch": 0.08594229588704727, + "grad_norm": 4.005281448364258, + "learning_rate": 2.8527607361963193e-05, + "loss": 4.1234, + "step": 280 + }, + { + "epoch": 0.08624923265807244, + "grad_norm": 2.731640338897705, + "learning_rate": 2.8629856850715746e-05, + "loss": 4.1408, + "step": 281 + }, + { + "epoch": 0.0865561694290976, + "grad_norm": 4.439471244812012, + "learning_rate": 2.8732106339468307e-05, + "loss": 4.08, + "step": 282 + }, + { + "epoch": 0.08686310620012277, + "grad_norm": 2.929032564163208, + "learning_rate": 2.883435582822086e-05, + "loss": 4.0521, + "step": 283 + }, + { + "epoch": 0.08717004297114794, + "grad_norm": 3.3943557739257812, + "learning_rate": 2.8936605316973414e-05, + "loss": 4.0936, + "step": 284 + }, + { + "epoch": 0.08747697974217311, + "grad_norm": 2.9899704456329346, + "learning_rate": 2.9038854805725974e-05, + "loss": 4.0985, + "step": 285 + }, + { + "epoch": 0.08778391651319828, + "grad_norm": 2.8169870376586914, + "learning_rate": 2.9141104294478528e-05, + "loss": 4.1044, + "step": 286 + }, + { + "epoch": 0.08809085328422345, + "grad_norm": 4.312693119049072, + "learning_rate": 2.9243353783231085e-05, + "loss": 4.0515, + "step": 287 + }, + { + "epoch": 0.08839779005524862, + "grad_norm": 2.9270846843719482, + "learning_rate": 2.9345603271983642e-05, + "loss": 4.0221, + "step": 288 + }, + { + "epoch": 0.08870472682627378, + "grad_norm": 3.9831974506378174, + "learning_rate": 2.94478527607362e-05, + "loss": 4.0807, + "step": 289 + }, + { + "epoch": 0.08901166359729895, + "grad_norm": 2.721794605255127, + "learning_rate": 2.9550102249488753e-05, + "loss": 4.0732, + "step": 290 + }, + { + "epoch": 0.08931860036832412, + "grad_norm": 4.721047878265381, + "learning_rate": 2.9652351738241313e-05, + "loss": 4.0457, + "step": 291 + }, + { + "epoch": 
0.08962553713934929, + "grad_norm": 2.785738229751587, + "learning_rate": 2.9754601226993867e-05, + "loss": 4.0288, + "step": 292 + }, + { + "epoch": 0.08993247391037447, + "grad_norm": 4.842009544372559, + "learning_rate": 2.985685071574642e-05, + "loss": 4.1193, + "step": 293 + }, + { + "epoch": 0.09023941068139964, + "grad_norm": 2.802044153213501, + "learning_rate": 2.995910020449898e-05, + "loss": 4.0055, + "step": 294 + }, + { + "epoch": 0.0905463474524248, + "grad_norm": 3.7060954570770264, + "learning_rate": 3.0061349693251534e-05, + "loss": 4.0478, + "step": 295 + }, + { + "epoch": 0.09085328422344997, + "grad_norm": 2.8033370971679688, + "learning_rate": 3.0163599182004095e-05, + "loss": 4.0344, + "step": 296 + }, + { + "epoch": 0.09116022099447514, + "grad_norm": 3.148653984069824, + "learning_rate": 3.026584867075665e-05, + "loss": 3.9825, + "step": 297 + }, + { + "epoch": 0.09146715776550031, + "grad_norm": 3.925459384918213, + "learning_rate": 3.0368098159509205e-05, + "loss": 4.0253, + "step": 298 + }, + { + "epoch": 0.09177409453652548, + "grad_norm": 2.8502724170684814, + "learning_rate": 3.047034764826176e-05, + "loss": 4.0192, + "step": 299 + }, + { + "epoch": 0.09208103130755065, + "grad_norm": 3.8444268703460693, + "learning_rate": 3.057259713701431e-05, + "loss": 4.0354, + "step": 300 + }, + { + "epoch": 0.09238796807857581, + "grad_norm": 2.935976982116699, + "learning_rate": 3.067484662576687e-05, + "loss": 4.0397, + "step": 301 + }, + { + "epoch": 0.09269490484960098, + "grad_norm": 2.9375271797180176, + "learning_rate": 3.0777096114519427e-05, + "loss": 3.975, + "step": 302 + }, + { + "epoch": 0.09300184162062615, + "grad_norm": 3.7623329162597656, + "learning_rate": 3.087934560327199e-05, + "loss": 4.0259, + "step": 303 + }, + { + "epoch": 0.09330877839165132, + "grad_norm": 3.1480228900909424, + "learning_rate": 3.098159509202454e-05, + "loss": 3.9676, + "step": 304 + }, + { + "epoch": 0.09361571516267649, + "grad_norm": 
4.572622299194336, + "learning_rate": 3.10838445807771e-05, + "loss": 4.0123, + "step": 305 + }, + { + "epoch": 0.09392265193370165, + "grad_norm": 2.469806671142578, + "learning_rate": 3.1186094069529655e-05, + "loss": 4.012, + "step": 306 + }, + { + "epoch": 0.09422958870472682, + "grad_norm": 5.133090019226074, + "learning_rate": 3.1288343558282215e-05, + "loss": 3.9892, + "step": 307 + }, + { + "epoch": 0.09453652547575199, + "grad_norm": 3.379105567932129, + "learning_rate": 3.139059304703477e-05, + "loss": 4.0286, + "step": 308 + }, + { + "epoch": 0.09484346224677716, + "grad_norm": 3.1413521766662598, + "learning_rate": 3.149284253578732e-05, + "loss": 4.0238, + "step": 309 + }, + { + "epoch": 0.09515039901780234, + "grad_norm": 2.832242250442505, + "learning_rate": 3.159509202453988e-05, + "loss": 3.9955, + "step": 310 + }, + { + "epoch": 0.09545733578882751, + "grad_norm": 4.405134201049805, + "learning_rate": 3.1697341513292436e-05, + "loss": 4.0093, + "step": 311 + }, + { + "epoch": 0.09576427255985268, + "grad_norm": 2.8928587436676025, + "learning_rate": 3.179959100204499e-05, + "loss": 3.9518, + "step": 312 + }, + { + "epoch": 0.09607120933087784, + "grad_norm": 3.8899731636047363, + "learning_rate": 3.1901840490797544e-05, + "loss": 3.9773, + "step": 313 + }, + { + "epoch": 0.09637814610190301, + "grad_norm": 2.768199920654297, + "learning_rate": 3.2004089979550104e-05, + "loss": 3.9671, + "step": 314 + }, + { + "epoch": 0.09668508287292818, + "grad_norm": 3.834092378616333, + "learning_rate": 3.210633946830266e-05, + "loss": 3.9641, + "step": 315 + }, + { + "epoch": 0.09699201964395335, + "grad_norm": 3.566220998764038, + "learning_rate": 3.220858895705521e-05, + "loss": 3.9585, + "step": 316 + }, + { + "epoch": 0.09729895641497852, + "grad_norm": 3.1876113414764404, + "learning_rate": 3.231083844580777e-05, + "loss": 3.9689, + "step": 317 + }, + { + "epoch": 0.09760589318600368, + "grad_norm": 3.122142791748047, + "learning_rate": 
3.2413087934560325e-05, + "loss": 3.9601, + "step": 318 + }, + { + "epoch": 0.09791282995702885, + "grad_norm": 3.825195789337158, + "learning_rate": 3.2515337423312886e-05, + "loss": 3.9413, + "step": 319 + }, + { + "epoch": 0.09821976672805402, + "grad_norm": 3.3126778602600098, + "learning_rate": 3.261758691206544e-05, + "loss": 4.0414, + "step": 320 + }, + { + "epoch": 0.09852670349907919, + "grad_norm": 3.7704360485076904, + "learning_rate": 3.2719836400818e-05, + "loss": 3.9224, + "step": 321 + }, + { + "epoch": 0.09883364027010436, + "grad_norm": 2.997194290161133, + "learning_rate": 3.282208588957055e-05, + "loss": 3.9454, + "step": 322 + }, + { + "epoch": 0.09914057704112952, + "grad_norm": 3.4990131855010986, + "learning_rate": 3.2924335378323114e-05, + "loss": 3.8682, + "step": 323 + }, + { + "epoch": 0.09944751381215469, + "grad_norm": 3.146879196166992, + "learning_rate": 3.302658486707567e-05, + "loss": 3.8863, + "step": 324 + }, + { + "epoch": 0.09975445058317986, + "grad_norm": 4.963291645050049, + "learning_rate": 3.312883435582822e-05, + "loss": 3.9951, + "step": 325 + }, + { + "epoch": 0.10006138735420503, + "grad_norm": 2.4511775970458984, + "learning_rate": 3.323108384458078e-05, + "loss": 3.875, + "step": 326 + }, + { + "epoch": 0.1003683241252302, + "grad_norm": 5.670922756195068, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.0446, + "step": 327 + }, + { + "epoch": 0.10067526089625538, + "grad_norm": 3.54237699508667, + "learning_rate": 3.3435582822085895e-05, + "loss": 3.9877, + "step": 328 + }, + { + "epoch": 0.10098219766728055, + "grad_norm": 2.9059271812438965, + "learning_rate": 3.353783231083845e-05, + "loss": 3.949, + "step": 329 + }, + { + "epoch": 0.10128913443830571, + "grad_norm": 3.870962381362915, + "learning_rate": 3.3640081799591e-05, + "loss": 3.8985, + "step": 330 + }, + { + "epoch": 0.10159607120933088, + "grad_norm": 3.275129556655884, + "learning_rate": 3.3742331288343556e-05, + "loss": 4.0209, + "step": 331 + }, 
+ { + "epoch": 0.10190300798035605, + "grad_norm": 3.040931224822998, + "learning_rate": 3.3844580777096117e-05, + "loss": 3.9938, + "step": 332 + }, + { + "epoch": 0.10220994475138122, + "grad_norm": 4.3355584144592285, + "learning_rate": 3.394683026584867e-05, + "loss": 3.876, + "step": 333 + }, + { + "epoch": 0.10251688152240639, + "grad_norm": 3.0981085300445557, + "learning_rate": 3.4049079754601224e-05, + "loss": 3.9014, + "step": 334 + }, + { + "epoch": 0.10282381829343155, + "grad_norm": 3.2902655601501465, + "learning_rate": 3.4151329243353784e-05, + "loss": 3.9599, + "step": 335 + }, + { + "epoch": 0.10313075506445672, + "grad_norm": 3.496514081954956, + "learning_rate": 3.425357873210634e-05, + "loss": 3.9005, + "step": 336 + }, + { + "epoch": 0.10343769183548189, + "grad_norm": 3.4680685997009277, + "learning_rate": 3.43558282208589e-05, + "loss": 3.8591, + "step": 337 + }, + { + "epoch": 0.10374462860650706, + "grad_norm": 3.3041694164276123, + "learning_rate": 3.445807770961145e-05, + "loss": 3.9566, + "step": 338 + }, + { + "epoch": 0.10405156537753223, + "grad_norm": 3.519709825515747, + "learning_rate": 3.456032719836401e-05, + "loss": 3.9219, + "step": 339 + }, + { + "epoch": 0.1043585021485574, + "grad_norm": 3.932344436645508, + "learning_rate": 3.4662576687116566e-05, + "loss": 3.9155, + "step": 340 + }, + { + "epoch": 0.10466543891958256, + "grad_norm": 3.3109822273254395, + "learning_rate": 3.476482617586912e-05, + "loss": 3.9729, + "step": 341 + }, + { + "epoch": 0.10497237569060773, + "grad_norm": 4.556341648101807, + "learning_rate": 3.486707566462168e-05, + "loss": 3.9459, + "step": 342 + }, + { + "epoch": 0.1052793124616329, + "grad_norm": 2.9105725288391113, + "learning_rate": 3.4969325153374234e-05, + "loss": 3.9384, + "step": 343 + }, + { + "epoch": 0.10558624923265807, + "grad_norm": 3.865682601928711, + "learning_rate": 3.5071574642126794e-05, + "loss": 3.9826, + "step": 344 + }, + { + "epoch": 0.10589318600368323, + "grad_norm": 
2.8606700897216797, + "learning_rate": 3.517382413087935e-05, + "loss": 3.8184, + "step": 345 + }, + { + "epoch": 0.10620012277470842, + "grad_norm": 4.323507785797119, + "learning_rate": 3.527607361963191e-05, + "loss": 3.8772, + "step": 346 + }, + { + "epoch": 0.10650705954573358, + "grad_norm": 2.890390157699585, + "learning_rate": 3.537832310838446e-05, + "loss": 3.8769, + "step": 347 + }, + { + "epoch": 0.10681399631675875, + "grad_norm": 4.008283615112305, + "learning_rate": 3.5480572597137015e-05, + "loss": 3.8796, + "step": 348 + }, + { + "epoch": 0.10712093308778392, + "grad_norm": 3.3605823516845703, + "learning_rate": 3.558282208588957e-05, + "loss": 3.8924, + "step": 349 + }, + { + "epoch": 0.10742786985880909, + "grad_norm": 3.6573123931884766, + "learning_rate": 3.568507157464213e-05, + "loss": 3.812, + "step": 350 + }, + { + "epoch": 0.10773480662983426, + "grad_norm": 3.0771777629852295, + "learning_rate": 3.578732106339468e-05, + "loss": 3.8958, + "step": 351 + }, + { + "epoch": 0.10804174340085942, + "grad_norm": 3.6483314037323, + "learning_rate": 3.5889570552147236e-05, + "loss": 3.8863, + "step": 352 + }, + { + "epoch": 0.10834868017188459, + "grad_norm": 3.1320669651031494, + "learning_rate": 3.59918200408998e-05, + "loss": 3.8194, + "step": 353 + }, + { + "epoch": 0.10865561694290976, + "grad_norm": 3.6510627269744873, + "learning_rate": 3.609406952965235e-05, + "loss": 3.8916, + "step": 354 + }, + { + "epoch": 0.10896255371393493, + "grad_norm": 3.0419273376464844, + "learning_rate": 3.619631901840491e-05, + "loss": 3.7907, + "step": 355 + }, + { + "epoch": 0.1092694904849601, + "grad_norm": 4.519289493560791, + "learning_rate": 3.6298568507157465e-05, + "loss": 3.8902, + "step": 356 + }, + { + "epoch": 0.10957642725598526, + "grad_norm": 2.938493251800537, + "learning_rate": 3.6400817995910025e-05, + "loss": 3.8675, + "step": 357 + }, + { + "epoch": 0.10988336402701043, + "grad_norm": 4.398004531860352, + "learning_rate": 
3.650306748466258e-05, + "loss": 3.9535, + "step": 358 + }, + { + "epoch": 0.1101903007980356, + "grad_norm": 2.9128408432006836, + "learning_rate": 3.660531697341513e-05, + "loss": 3.944, + "step": 359 + }, + { + "epoch": 0.11049723756906077, + "grad_norm": 5.364169597625732, + "learning_rate": 3.670756646216769e-05, + "loss": 3.9289, + "step": 360 + }, + { + "epoch": 0.11080417434008594, + "grad_norm": 2.8434085845947266, + "learning_rate": 3.6809815950920246e-05, + "loss": 3.8204, + "step": 361 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.494234561920166, + "learning_rate": 3.6912065439672807e-05, + "loss": 3.8518, + "step": 362 + }, + { + "epoch": 0.11141804788213629, + "grad_norm": 2.959608554840088, + "learning_rate": 3.701431492842536e-05, + "loss": 3.8365, + "step": 363 + }, + { + "epoch": 0.11172498465316145, + "grad_norm": 3.4115726947784424, + "learning_rate": 3.711656441717792e-05, + "loss": 3.8507, + "step": 364 + }, + { + "epoch": 0.11203192142418662, + "grad_norm": 3.8023531436920166, + "learning_rate": 3.7218813905930474e-05, + "loss": 3.8544, + "step": 365 + }, + { + "epoch": 0.11233885819521179, + "grad_norm": 3.0639398097991943, + "learning_rate": 3.732106339468303e-05, + "loss": 3.8772, + "step": 366 + }, + { + "epoch": 0.11264579496623696, + "grad_norm": 4.241199016571045, + "learning_rate": 3.742331288343559e-05, + "loss": 3.7739, + "step": 367 + }, + { + "epoch": 0.11295273173726213, + "grad_norm": 2.977330446243286, + "learning_rate": 3.752556237218814e-05, + "loss": 3.8376, + "step": 368 + }, + { + "epoch": 0.1132596685082873, + "grad_norm": 4.574001789093018, + "learning_rate": 3.7627811860940696e-05, + "loss": 3.8761, + "step": 369 + }, + { + "epoch": 0.11356660527931246, + "grad_norm": 3.1499617099761963, + "learning_rate": 3.773006134969325e-05, + "loss": 3.8884, + "step": 370 + }, + { + "epoch": 0.11387354205033763, + "grad_norm": 3.81887149810791, + "learning_rate": 3.783231083844581e-05, + "loss": 3.8474, + "step": 371 + 
}, + { + "epoch": 0.1141804788213628, + "grad_norm": 3.424117088317871, + "learning_rate": 3.793456032719836e-05, + "loss": 3.8715, + "step": 372 + }, + { + "epoch": 0.11448741559238797, + "grad_norm": 4.431595325469971, + "learning_rate": 3.8036809815950924e-05, + "loss": 3.8305, + "step": 373 + }, + { + "epoch": 0.11479435236341314, + "grad_norm": 3.1664443016052246, + "learning_rate": 3.813905930470348e-05, + "loss": 3.8203, + "step": 374 + }, + { + "epoch": 0.1151012891344383, + "grad_norm": 4.312273025512695, + "learning_rate": 3.824130879345603e-05, + "loss": 3.8195, + "step": 375 + }, + { + "epoch": 0.11540822590546347, + "grad_norm": 3.0893726348876953, + "learning_rate": 3.834355828220859e-05, + "loss": 3.8248, + "step": 376 + }, + { + "epoch": 0.11571516267648864, + "grad_norm": 4.526726722717285, + "learning_rate": 3.8445807770961145e-05, + "loss": 3.8505, + "step": 377 + }, + { + "epoch": 0.11602209944751381, + "grad_norm": 2.5805325508117676, + "learning_rate": 3.8548057259713705e-05, + "loss": 3.8153, + "step": 378 + }, + { + "epoch": 0.11632903621853898, + "grad_norm": 4.6043381690979, + "learning_rate": 3.865030674846626e-05, + "loss": 3.8248, + "step": 379 + }, + { + "epoch": 0.11663597298956414, + "grad_norm": 3.0713136196136475, + "learning_rate": 3.875255623721882e-05, + "loss": 3.7687, + "step": 380 + }, + { + "epoch": 0.11694290976058933, + "grad_norm": 3.6344685554504395, + "learning_rate": 3.885480572597137e-05, + "loss": 3.8061, + "step": 381 + }, + { + "epoch": 0.1172498465316145, + "grad_norm": 3.6261723041534424, + "learning_rate": 3.895705521472393e-05, + "loss": 3.7939, + "step": 382 + }, + { + "epoch": 0.11755678330263966, + "grad_norm": 3.811779260635376, + "learning_rate": 3.905930470347649e-05, + "loss": 3.7973, + "step": 383 + }, + { + "epoch": 0.11786372007366483, + "grad_norm": 3.741685628890991, + "learning_rate": 3.916155419222904e-05, + "loss": 3.8149, + "step": 384 + }, + { + "epoch": 0.11817065684469, + "grad_norm": 
3.330526351928711, + "learning_rate": 3.92638036809816e-05, + "loss": 3.8058, + "step": 385 + }, + { + "epoch": 0.11847759361571517, + "grad_norm": 3.2102115154266357, + "learning_rate": 3.9366053169734155e-05, + "loss": 3.7199, + "step": 386 + }, + { + "epoch": 0.11878453038674033, + "grad_norm": 3.670474052429199, + "learning_rate": 3.946830265848671e-05, + "loss": 3.8087, + "step": 387 + }, + { + "epoch": 0.1190914671577655, + "grad_norm": 3.218390941619873, + "learning_rate": 3.957055214723926e-05, + "loss": 3.7631, + "step": 388 + }, + { + "epoch": 0.11939840392879067, + "grad_norm": 4.2256693840026855, + "learning_rate": 3.967280163599182e-05, + "loss": 3.7624, + "step": 389 + }, + { + "epoch": 0.11970534069981584, + "grad_norm": 2.86247181892395, + "learning_rate": 3.9775051124744376e-05, + "loss": 3.7638, + "step": 390 + }, + { + "epoch": 0.120012277470841, + "grad_norm": 4.083118915557861, + "learning_rate": 3.987730061349693e-05, + "loss": 3.7581, + "step": 391 + }, + { + "epoch": 0.12031921424186617, + "grad_norm": 2.836794376373291, + "learning_rate": 3.997955010224949e-05, + "loss": 3.7466, + "step": 392 + }, + { + "epoch": 0.12062615101289134, + "grad_norm": 4.071137428283691, + "learning_rate": 4.0081799591002043e-05, + "loss": 3.7836, + "step": 393 + }, + { + "epoch": 0.12093308778391651, + "grad_norm": 3.3141064643859863, + "learning_rate": 4.0184049079754604e-05, + "loss": 3.754, + "step": 394 + }, + { + "epoch": 0.12124002455494168, + "grad_norm": 3.6064393520355225, + "learning_rate": 4.028629856850716e-05, + "loss": 3.8379, + "step": 395 + }, + { + "epoch": 0.12154696132596685, + "grad_norm": 3.7306606769561768, + "learning_rate": 4.038854805725972e-05, + "loss": 3.6848, + "step": 396 + }, + { + "epoch": 0.12185389809699201, + "grad_norm": 3.5877859592437744, + "learning_rate": 4.049079754601227e-05, + "loss": 3.8201, + "step": 397 + }, + { + "epoch": 0.12216083486801718, + "grad_norm": 3.930271625518799, + "learning_rate": 
4.059304703476483e-05, + "loss": 3.7507, + "step": 398 + }, + { + "epoch": 0.12246777163904236, + "grad_norm": 2.974968194961548, + "learning_rate": 4.0695296523517386e-05, + "loss": 3.7545, + "step": 399 + }, + { + "epoch": 0.12277470841006753, + "grad_norm": 4.655934810638428, + "learning_rate": 4.079754601226994e-05, + "loss": 3.8093, + "step": 400 + }, + { + "epoch": 0.1230816451810927, + "grad_norm": 3.201986312866211, + "learning_rate": 4.08997955010225e-05, + "loss": 3.7252, + "step": 401 + }, + { + "epoch": 0.12338858195211787, + "grad_norm": 4.447626113891602, + "learning_rate": 4.100204498977505e-05, + "loss": 3.7132, + "step": 402 + }, + { + "epoch": 0.12369551872314304, + "grad_norm": 2.6518118381500244, + "learning_rate": 4.1104294478527614e-05, + "loss": 3.7637, + "step": 403 + }, + { + "epoch": 0.1240024554941682, + "grad_norm": 5.116448402404785, + "learning_rate": 4.120654396728017e-05, + "loss": 3.6991, + "step": 404 + }, + { + "epoch": 0.12430939226519337, + "grad_norm": 2.7780613899230957, + "learning_rate": 4.130879345603272e-05, + "loss": 3.7555, + "step": 405 + }, + { + "epoch": 0.12461632903621854, + "grad_norm": 4.281010627746582, + "learning_rate": 4.1411042944785274e-05, + "loss": 3.688, + "step": 406 + }, + { + "epoch": 0.12492326580724371, + "grad_norm": 2.851562023162842, + "learning_rate": 4.1513292433537835e-05, + "loss": 3.7557, + "step": 407 + }, + { + "epoch": 0.1252302025782689, + "grad_norm": 4.092229843139648, + "learning_rate": 4.161554192229039e-05, + "loss": 3.7179, + "step": 408 + }, + { + "epoch": 0.12553713934929406, + "grad_norm": 3.410094976425171, + "learning_rate": 4.171779141104294e-05, + "loss": 3.7292, + "step": 409 + }, + { + "epoch": 0.12584407612031923, + "grad_norm": 4.266562461853027, + "learning_rate": 4.18200408997955e-05, + "loss": 3.8204, + "step": 410 + }, + { + "epoch": 0.1261510128913444, + "grad_norm": 2.997642755508423, + "learning_rate": 4.1922290388548056e-05, + "loss": 3.7773, + "step": 411 + }, + 
{ + "epoch": 0.12645794966236956, + "grad_norm": 4.50873327255249, + "learning_rate": 4.2024539877300617e-05, + "loss": 3.7255, + "step": 412 + }, + { + "epoch": 0.12676488643339473, + "grad_norm": 3.65312123298645, + "learning_rate": 4.212678936605317e-05, + "loss": 3.6472, + "step": 413 + }, + { + "epoch": 0.1270718232044199, + "grad_norm": 3.985487222671509, + "learning_rate": 4.222903885480573e-05, + "loss": 3.6915, + "step": 414 + }, + { + "epoch": 0.12737875997544507, + "grad_norm": 3.6020219326019287, + "learning_rate": 4.2331288343558284e-05, + "loss": 3.7299, + "step": 415 + }, + { + "epoch": 0.12768569674647023, + "grad_norm": 3.414529323577881, + "learning_rate": 4.243353783231084e-05, + "loss": 3.7827, + "step": 416 + }, + { + "epoch": 0.1279926335174954, + "grad_norm": 3.537292718887329, + "learning_rate": 4.25357873210634e-05, + "loss": 3.751, + "step": 417 + }, + { + "epoch": 0.12829957028852057, + "grad_norm": 3.5442280769348145, + "learning_rate": 4.263803680981595e-05, + "loss": 3.6828, + "step": 418 + }, + { + "epoch": 0.12860650705954574, + "grad_norm": 3.9816019535064697, + "learning_rate": 4.274028629856851e-05, + "loss": 3.7668, + "step": 419 + }, + { + "epoch": 0.1289134438305709, + "grad_norm": 3.1632657051086426, + "learning_rate": 4.2842535787321066e-05, + "loss": 3.6946, + "step": 420 + }, + { + "epoch": 0.12922038060159607, + "grad_norm": 4.731013298034668, + "learning_rate": 4.2944785276073626e-05, + "loss": 3.7078, + "step": 421 + }, + { + "epoch": 0.12952731737262124, + "grad_norm": 2.7973382472991943, + "learning_rate": 4.304703476482618e-05, + "loss": 3.5934, + "step": 422 + }, + { + "epoch": 0.1298342541436464, + "grad_norm": 4.555461406707764, + "learning_rate": 4.3149284253578733e-05, + "loss": 3.7406, + "step": 423 + }, + { + "epoch": 0.13014119091467158, + "grad_norm": 3.25795841217041, + "learning_rate": 4.3251533742331294e-05, + "loss": 3.6302, + "step": 424 + }, + { + "epoch": 0.13044812768569675, + "grad_norm": 
3.9974427223205566, + "learning_rate": 4.335378323108385e-05, + "loss": 3.6995, + "step": 425 + }, + { + "epoch": 0.13075506445672191, + "grad_norm": 3.4234917163848877, + "learning_rate": 4.34560327198364e-05, + "loss": 3.727, + "step": 426 + }, + { + "epoch": 0.13106200122774708, + "grad_norm": 3.40573787689209, + "learning_rate": 4.3558282208588955e-05, + "loss": 3.6964, + "step": 427 + }, + { + "epoch": 0.13136893799877225, + "grad_norm": 3.6903765201568604, + "learning_rate": 4.3660531697341515e-05, + "loss": 3.7139, + "step": 428 + }, + { + "epoch": 0.13167587476979742, + "grad_norm": 3.3252439498901367, + "learning_rate": 4.376278118609407e-05, + "loss": 3.7221, + "step": 429 + }, + { + "epoch": 0.1319828115408226, + "grad_norm": 3.591610908508301, + "learning_rate": 4.386503067484663e-05, + "loss": 3.6592, + "step": 430 + }, + { + "epoch": 0.13228974831184775, + "grad_norm": 3.584683418273926, + "learning_rate": 4.396728016359918e-05, + "loss": 3.695, + "step": 431 + }, + { + "epoch": 0.13259668508287292, + "grad_norm": 3.5093443393707275, + "learning_rate": 4.4069529652351736e-05, + "loss": 3.6368, + "step": 432 + }, + { + "epoch": 0.1329036218538981, + "grad_norm": 3.5040347576141357, + "learning_rate": 4.41717791411043e-05, + "loss": 3.6463, + "step": 433 + }, + { + "epoch": 0.13321055862492326, + "grad_norm": 3.534536361694336, + "learning_rate": 4.427402862985685e-05, + "loss": 3.681, + "step": 434 + }, + { + "epoch": 0.13351749539594843, + "grad_norm": 4.016106605529785, + "learning_rate": 4.437627811860941e-05, + "loss": 3.7592, + "step": 435 + }, + { + "epoch": 0.1338244321669736, + "grad_norm": 3.4661898612976074, + "learning_rate": 4.4478527607361964e-05, + "loss": 3.6437, + "step": 436 + }, + { + "epoch": 0.13413136893799876, + "grad_norm": 3.917189359664917, + "learning_rate": 4.4580777096114525e-05, + "loss": 3.6809, + "step": 437 + }, + { + "epoch": 0.13443830570902393, + "grad_norm": 3.472147226333618, + "learning_rate": 
4.468302658486708e-05, + "loss": 3.5978, + "step": 438 + }, + { + "epoch": 0.1347452424800491, + "grad_norm": 3.2357044219970703, + "learning_rate": 4.478527607361964e-05, + "loss": 3.6758, + "step": 439 + }, + { + "epoch": 0.13505217925107427, + "grad_norm": 3.8607826232910156, + "learning_rate": 4.488752556237219e-05, + "loss": 3.7155, + "step": 440 + }, + { + "epoch": 0.13535911602209943, + "grad_norm": 3.085242509841919, + "learning_rate": 4.4989775051124746e-05, + "loss": 3.674, + "step": 441 + }, + { + "epoch": 0.1356660527931246, + "grad_norm": 4.0473432540893555, + "learning_rate": 4.5092024539877307e-05, + "loss": 3.6542, + "step": 442 + }, + { + "epoch": 0.1359729895641498, + "grad_norm": 3.4742088317871094, + "learning_rate": 4.519427402862986e-05, + "loss": 3.6226, + "step": 443 + }, + { + "epoch": 0.13627992633517497, + "grad_norm": 3.8838884830474854, + "learning_rate": 4.5296523517382414e-05, + "loss": 3.695, + "step": 444 + }, + { + "epoch": 0.13658686310620013, + "grad_norm": 3.1551895141601562, + "learning_rate": 4.539877300613497e-05, + "loss": 3.6886, + "step": 445 + }, + { + "epoch": 0.1368937998772253, + "grad_norm": 3.6824824810028076, + "learning_rate": 4.550102249488753e-05, + "loss": 3.6397, + "step": 446 + }, + { + "epoch": 0.13720073664825047, + "grad_norm": 3.3671298027038574, + "learning_rate": 4.560327198364008e-05, + "loss": 3.5983, + "step": 447 + }, + { + "epoch": 0.13750767341927564, + "grad_norm": 4.11976957321167, + "learning_rate": 4.570552147239264e-05, + "loss": 3.6371, + "step": 448 + }, + { + "epoch": 0.1378146101903008, + "grad_norm": 3.2035205364227295, + "learning_rate": 4.5807770961145195e-05, + "loss": 3.6097, + "step": 449 + }, + { + "epoch": 0.13812154696132597, + "grad_norm": 4.944174289703369, + "learning_rate": 4.591002044989775e-05, + "loss": 3.6317, + "step": 450 + }, + { + "epoch": 0.13842848373235114, + "grad_norm": 3.0040266513824463, + "learning_rate": 4.601226993865031e-05, + "loss": 3.6407, + "step": 451 + 
}, + { + "epoch": 0.1387354205033763, + "grad_norm": 5.124639511108398, + "learning_rate": 4.611451942740286e-05, + "loss": 3.6539, + "step": 452 + }, + { + "epoch": 0.13904235727440148, + "grad_norm": 2.792884349822998, + "learning_rate": 4.6216768916155423e-05, + "loss": 3.6542, + "step": 453 + }, + { + "epoch": 0.13934929404542665, + "grad_norm": 4.394725799560547, + "learning_rate": 4.631901840490798e-05, + "loss": 3.6811, + "step": 454 + }, + { + "epoch": 0.13965623081645182, + "grad_norm": 3.209400177001953, + "learning_rate": 4.642126789366054e-05, + "loss": 3.6635, + "step": 455 + }, + { + "epoch": 0.13996316758747698, + "grad_norm": 3.6599526405334473, + "learning_rate": 4.652351738241309e-05, + "loss": 3.5732, + "step": 456 + }, + { + "epoch": 0.14027010435850215, + "grad_norm": 3.6527204513549805, + "learning_rate": 4.6625766871165645e-05, + "loss": 3.5979, + "step": 457 + }, + { + "epoch": 0.14057704112952732, + "grad_norm": 3.4562110900878906, + "learning_rate": 4.6728016359918205e-05, + "loss": 3.6761, + "step": 458 + }, + { + "epoch": 0.1408839779005525, + "grad_norm": 3.5935721397399902, + "learning_rate": 4.683026584867076e-05, + "loss": 3.6598, + "step": 459 + }, + { + "epoch": 0.14119091467157766, + "grad_norm": 3.4518251419067383, + "learning_rate": 4.693251533742332e-05, + "loss": 3.5707, + "step": 460 + }, + { + "epoch": 0.14149785144260282, + "grad_norm": 3.3248815536499023, + "learning_rate": 4.703476482617587e-05, + "loss": 3.6949, + "step": 461 + }, + { + "epoch": 0.141804788213628, + "grad_norm": 3.6379971504211426, + "learning_rate": 4.7137014314928426e-05, + "loss": 3.6265, + "step": 462 + }, + { + "epoch": 0.14211172498465316, + "grad_norm": 4.068325996398926, + "learning_rate": 4.723926380368098e-05, + "loss": 3.6096, + "step": 463 + }, + { + "epoch": 0.14241866175567833, + "grad_norm": 3.0870959758758545, + "learning_rate": 4.734151329243354e-05, + "loss": 3.5201, + "step": 464 + }, + { + "epoch": 0.1427255985267035, + "grad_norm": 
4.013638973236084, + "learning_rate": 4.7443762781186094e-05, + "loss": 3.5845, + "step": 465 + }, + { + "epoch": 0.14303253529772866, + "grad_norm": 3.421921968460083, + "learning_rate": 4.754601226993865e-05, + "loss": 3.6718, + "step": 466 + }, + { + "epoch": 0.14333947206875383, + "grad_norm": 3.4814112186431885, + "learning_rate": 4.764826175869121e-05, + "loss": 3.6225, + "step": 467 + }, + { + "epoch": 0.143646408839779, + "grad_norm": 2.9323105812072754, + "learning_rate": 4.775051124744376e-05, + "loss": 3.5881, + "step": 468 + }, + { + "epoch": 0.14395334561080417, + "grad_norm": 3.862344264984131, + "learning_rate": 4.785276073619632e-05, + "loss": 3.6264, + "step": 469 + }, + { + "epoch": 0.14426028238182934, + "grad_norm": 2.950495481491089, + "learning_rate": 4.7955010224948876e-05, + "loss": 3.5891, + "step": 470 + }, + { + "epoch": 0.1445672191528545, + "grad_norm": 4.360744476318359, + "learning_rate": 4.8057259713701436e-05, + "loss": 3.6746, + "step": 471 + }, + { + "epoch": 0.14487415592387967, + "grad_norm": 2.689297914505005, + "learning_rate": 4.815950920245399e-05, + "loss": 3.616, + "step": 472 + }, + { + "epoch": 0.14518109269490484, + "grad_norm": 4.433006286621094, + "learning_rate": 4.826175869120655e-05, + "loss": 3.6259, + "step": 473 + }, + { + "epoch": 0.14548802946593, + "grad_norm": 2.9184467792510986, + "learning_rate": 4.8364008179959104e-05, + "loss": 3.59, + "step": 474 + }, + { + "epoch": 0.14579496623695518, + "grad_norm": 4.472714424133301, + "learning_rate": 4.846625766871166e-05, + "loss": 3.5608, + "step": 475 + }, + { + "epoch": 0.14610190300798034, + "grad_norm": 3.0839431285858154, + "learning_rate": 4.856850715746422e-05, + "loss": 3.6069, + "step": 476 + }, + { + "epoch": 0.1464088397790055, + "grad_norm": 3.8900411128997803, + "learning_rate": 4.867075664621677e-05, + "loss": 3.5387, + "step": 477 + }, + { + "epoch": 0.14671577655003068, + "grad_norm": 3.0446956157684326, + "learning_rate": 4.877300613496933e-05, + 
"loss": 3.5374, + "step": 478 + }, + { + "epoch": 0.14702271332105588, + "grad_norm": 3.805018901824951, + "learning_rate": 4.8875255623721885e-05, + "loss": 3.6032, + "step": 479 + }, + { + "epoch": 0.14732965009208104, + "grad_norm": 2.9937491416931152, + "learning_rate": 4.897750511247444e-05, + "loss": 3.548, + "step": 480 + }, + { + "epoch": 0.1476365868631062, + "grad_norm": 4.103757858276367, + "learning_rate": 4.907975460122699e-05, + "loss": 3.6292, + "step": 481 + }, + { + "epoch": 0.14794352363413138, + "grad_norm": 2.8275530338287354, + "learning_rate": 4.918200408997955e-05, + "loss": 3.5885, + "step": 482 + }, + { + "epoch": 0.14825046040515655, + "grad_norm": 4.104444980621338, + "learning_rate": 4.928425357873211e-05, + "loss": 3.5566, + "step": 483 + }, + { + "epoch": 0.14855739717618172, + "grad_norm": 2.820648670196533, + "learning_rate": 4.938650306748466e-05, + "loss": 3.6576, + "step": 484 + }, + { + "epoch": 0.14886433394720688, + "grad_norm": 4.639568328857422, + "learning_rate": 4.948875255623722e-05, + "loss": 3.583, + "step": 485 + }, + { + "epoch": 0.14917127071823205, + "grad_norm": 2.8675858974456787, + "learning_rate": 4.9591002044989774e-05, + "loss": 3.5982, + "step": 486 + }, + { + "epoch": 0.14947820748925722, + "grad_norm": 4.820484638214111, + "learning_rate": 4.9693251533742335e-05, + "loss": 3.5479, + "step": 487 + }, + { + "epoch": 0.1497851442602824, + "grad_norm": 2.9569075107574463, + "learning_rate": 4.979550102249489e-05, + "loss": 3.5846, + "step": 488 + }, + { + "epoch": 0.15009208103130756, + "grad_norm": 4.402152061462402, + "learning_rate": 4.989775051124745e-05, + "loss": 3.5368, + "step": 489 + }, + { + "epoch": 0.15039901780233272, + "grad_norm": 3.0454704761505127, + "learning_rate": 5e-05, + "loss": 3.5233, + "step": 490 + }, + { + "epoch": 0.1507059545733579, + "grad_norm": 3.564425468444824, + "learning_rate": 5.010224948875256e-05, + "loss": 3.5747, + "step": 491 + }, + { + "epoch": 0.15101289134438306, + 
"grad_norm": 3.2065536975860596, + "learning_rate": 5.020449897750511e-05, + "loss": 3.4803, + "step": 492 + }, + { + "epoch": 0.15131982811540823, + "grad_norm": 4.06170129776001, + "learning_rate": 5.030674846625767e-05, + "loss": 3.5867, + "step": 493 + }, + { + "epoch": 0.1516267648864334, + "grad_norm": 2.937181234359741, + "learning_rate": 5.040899795501023e-05, + "loss": 3.5098, + "step": 494 + }, + { + "epoch": 0.15193370165745856, + "grad_norm": 3.7272653579711914, + "learning_rate": 5.051124744376279e-05, + "loss": 3.5959, + "step": 495 + }, + { + "epoch": 0.15224063842848373, + "grad_norm": 2.8606886863708496, + "learning_rate": 5.061349693251534e-05, + "loss": 3.4881, + "step": 496 + }, + { + "epoch": 0.1525475751995089, + "grad_norm": 3.4861185550689697, + "learning_rate": 5.07157464212679e-05, + "loss": 3.563, + "step": 497 + }, + { + "epoch": 0.15285451197053407, + "grad_norm": 3.1362967491149902, + "learning_rate": 5.081799591002045e-05, + "loss": 3.5564, + "step": 498 + }, + { + "epoch": 0.15316144874155924, + "grad_norm": 3.360508441925049, + "learning_rate": 5.0920245398773005e-05, + "loss": 3.5307, + "step": 499 + }, + { + "epoch": 0.1534683855125844, + "grad_norm": 3.2896840572357178, + "learning_rate": 5.1022494887525566e-05, + "loss": 3.4843, + "step": 500 + }, + { + "epoch": 0.15377532228360957, + "grad_norm": 3.320429801940918, + "learning_rate": 5.112474437627812e-05, + "loss": 3.484, + "step": 501 + }, + { + "epoch": 0.15408225905463474, + "grad_norm": 3.409586191177368, + "learning_rate": 5.122699386503068e-05, + "loss": 3.506, + "step": 502 + }, + { + "epoch": 0.1543891958256599, + "grad_norm": 3.0944409370422363, + "learning_rate": 5.1329243353783227e-05, + "loss": 3.5011, + "step": 503 + }, + { + "epoch": 0.15469613259668508, + "grad_norm": 3.7220418453216553, + "learning_rate": 5.143149284253579e-05, + "loss": 3.5629, + "step": 504 + }, + { + "epoch": 0.15500306936771024, + "grad_norm": 3.217435359954834, + "learning_rate": 
5.153374233128835e-05, + "loss": 3.4957, + "step": 505 + }, + { + "epoch": 0.1553100061387354, + "grad_norm": 4.0457444190979, + "learning_rate": 5.163599182004091e-05, + "loss": 3.5152, + "step": 506 + }, + { + "epoch": 0.15561694290976058, + "grad_norm": 2.9380006790161133, + "learning_rate": 5.1738241308793455e-05, + "loss": 3.5261, + "step": 507 + }, + { + "epoch": 0.15592387968078575, + "grad_norm": 4.134535312652588, + "learning_rate": 5.1840490797546015e-05, + "loss": 3.5622, + "step": 508 + }, + { + "epoch": 0.15623081645181092, + "grad_norm": 2.8209407329559326, + "learning_rate": 5.1942740286298575e-05, + "loss": 3.5335, + "step": 509 + }, + { + "epoch": 0.15653775322283608, + "grad_norm": 4.4260711669921875, + "learning_rate": 5.204498977505112e-05, + "loss": 3.5554, + "step": 510 + }, + { + "epoch": 0.15684468999386125, + "grad_norm": 2.8649590015411377, + "learning_rate": 5.214723926380368e-05, + "loss": 3.4989, + "step": 511 + }, + { + "epoch": 0.15715162676488642, + "grad_norm": 4.0349812507629395, + "learning_rate": 5.224948875255624e-05, + "loss": 3.4883, + "step": 512 + }, + { + "epoch": 0.1574585635359116, + "grad_norm": 2.841923475265503, + "learning_rate": 5.2351738241308803e-05, + "loss": 3.4748, + "step": 513 + }, + { + "epoch": 0.15776550030693678, + "grad_norm": 3.8810653686523438, + "learning_rate": 5.245398773006135e-05, + "loss": 3.5403, + "step": 514 + }, + { + "epoch": 0.15807243707796195, + "grad_norm": 3.0830774307250977, + "learning_rate": 5.255623721881391e-05, + "loss": 3.513, + "step": 515 + }, + { + "epoch": 0.15837937384898712, + "grad_norm": 3.8688604831695557, + "learning_rate": 5.265848670756647e-05, + "loss": 3.5409, + "step": 516 + }, + { + "epoch": 0.1586863106200123, + "grad_norm": 2.854600429534912, + "learning_rate": 5.276073619631902e-05, + "loss": 3.4441, + "step": 517 + }, + { + "epoch": 0.15899324739103746, + "grad_norm": 3.9125611782073975, + "learning_rate": 5.286298568507158e-05, + "loss": 3.4953, + "step": 518 
+ }, + { + "epoch": 0.15930018416206262, + "grad_norm": 2.8626177310943604, + "learning_rate": 5.296523517382413e-05, + "loss": 3.5279, + "step": 519 + }, + { + "epoch": 0.1596071209330878, + "grad_norm": 3.5023677349090576, + "learning_rate": 5.306748466257669e-05, + "loss": 3.4886, + "step": 520 + }, + { + "epoch": 0.15991405770411296, + "grad_norm": 2.960505962371826, + "learning_rate": 5.316973415132924e-05, + "loss": 3.5278, + "step": 521 + }, + { + "epoch": 0.16022099447513813, + "grad_norm": 3.976245164871216, + "learning_rate": 5.32719836400818e-05, + "loss": 3.5236, + "step": 522 + }, + { + "epoch": 0.1605279312461633, + "grad_norm": 3.078248977661133, + "learning_rate": 5.337423312883436e-05, + "loss": 3.5194, + "step": 523 + }, + { + "epoch": 0.16083486801718846, + "grad_norm": 3.7498552799224854, + "learning_rate": 5.347648261758691e-05, + "loss": 3.5315, + "step": 524 + }, + { + "epoch": 0.16114180478821363, + "grad_norm": 2.87638258934021, + "learning_rate": 5.357873210633947e-05, + "loss": 3.434, + "step": 525 + }, + { + "epoch": 0.1614487415592388, + "grad_norm": 3.786454677581787, + "learning_rate": 5.368098159509203e-05, + "loss": 3.4985, + "step": 526 + }, + { + "epoch": 0.16175567833026397, + "grad_norm": 2.915156364440918, + "learning_rate": 5.378323108384459e-05, + "loss": 3.4979, + "step": 527 + }, + { + "epoch": 0.16206261510128914, + "grad_norm": 4.095824718475342, + "learning_rate": 5.3885480572597135e-05, + "loss": 3.4605, + "step": 528 + }, + { + "epoch": 0.1623695518723143, + "grad_norm": 2.793501853942871, + "learning_rate": 5.3987730061349695e-05, + "loss": 3.476, + "step": 529 + }, + { + "epoch": 0.16267648864333947, + "grad_norm": 3.9074480533599854, + "learning_rate": 5.4089979550102256e-05, + "loss": 3.4636, + "step": 530 + }, + { + "epoch": 0.16298342541436464, + "grad_norm": 2.8382515907287598, + "learning_rate": 5.4192229038854816e-05, + "loss": 3.4364, + "step": 531 + }, + { + "epoch": 0.1632903621853898, + "grad_norm": 
3.4670751094818115, + "learning_rate": 5.429447852760736e-05, + "loss": 3.5033, + "step": 532 + }, + { + "epoch": 0.16359729895641498, + "grad_norm": 2.8805580139160156, + "learning_rate": 5.439672801635992e-05, + "loss": 3.471, + "step": 533 + }, + { + "epoch": 0.16390423572744015, + "grad_norm": 3.745434522628784, + "learning_rate": 5.4498977505112484e-05, + "loss": 3.4565, + "step": 534 + }, + { + "epoch": 0.1642111724984653, + "grad_norm": 3.290579319000244, + "learning_rate": 5.460122699386503e-05, + "loss": 3.47, + "step": 535 + }, + { + "epoch": 0.16451810926949048, + "grad_norm": 3.2988481521606445, + "learning_rate": 5.470347648261759e-05, + "loss": 3.3781, + "step": 536 + }, + { + "epoch": 0.16482504604051565, + "grad_norm": 3.3673248291015625, + "learning_rate": 5.4805725971370145e-05, + "loss": 3.4891, + "step": 537 + }, + { + "epoch": 0.16513198281154082, + "grad_norm": 3.1917717456817627, + "learning_rate": 5.4907975460122705e-05, + "loss": 3.4493, + "step": 538 + }, + { + "epoch": 0.16543891958256599, + "grad_norm": 3.3869614601135254, + "learning_rate": 5.501022494887525e-05, + "loss": 3.3954, + "step": 539 + }, + { + "epoch": 0.16574585635359115, + "grad_norm": 2.896742820739746, + "learning_rate": 5.511247443762781e-05, + "loss": 3.4465, + "step": 540 + }, + { + "epoch": 0.16605279312461632, + "grad_norm": 3.771268844604492, + "learning_rate": 5.521472392638037e-05, + "loss": 3.4889, + "step": 541 + }, + { + "epoch": 0.1663597298956415, + "grad_norm": 2.8693349361419678, + "learning_rate": 5.531697341513292e-05, + "loss": 3.3661, + "step": 542 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.093103885650635, + "learning_rate": 5.541922290388548e-05, + "loss": 3.4451, + "step": 543 + }, + { + "epoch": 0.16697360343769183, + "grad_norm": 3.050361394882202, + "learning_rate": 5.552147239263804e-05, + "loss": 3.4203, + "step": 544 + }, + { + "epoch": 0.167280540208717, + "grad_norm": 3.041480302810669, + "learning_rate": 5.56237218813906e-05, 
+ "loss": 3.4173, + "step": 545 + }, + { + "epoch": 0.16758747697974216, + "grad_norm": 3.385680675506592, + "learning_rate": 5.572597137014315e-05, + "loss": 3.4408, + "step": 546 + }, + { + "epoch": 0.16789441375076733, + "grad_norm": 2.88845157623291, + "learning_rate": 5.582822085889571e-05, + "loss": 3.4536, + "step": 547 + }, + { + "epoch": 0.1682013505217925, + "grad_norm": 3.7155961990356445, + "learning_rate": 5.593047034764827e-05, + "loss": 3.4392, + "step": 548 + }, + { + "epoch": 0.1685082872928177, + "grad_norm": 3.4626615047454834, + "learning_rate": 5.6032719836400815e-05, + "loss": 3.4395, + "step": 549 + }, + { + "epoch": 0.16881522406384286, + "grad_norm": 3.182154417037964, + "learning_rate": 5.6134969325153376e-05, + "loss": 3.5239, + "step": 550 + }, + { + "epoch": 0.16912216083486803, + "grad_norm": 3.478602886199951, + "learning_rate": 5.6237218813905936e-05, + "loss": 3.4258, + "step": 551 + }, + { + "epoch": 0.1694290976058932, + "grad_norm": 2.9652369022369385, + "learning_rate": 5.6339468302658496e-05, + "loss": 3.3919, + "step": 552 + }, + { + "epoch": 0.16973603437691837, + "grad_norm": 3.736821413040161, + "learning_rate": 5.644171779141104e-05, + "loss": 3.4491, + "step": 553 + }, + { + "epoch": 0.17004297114794353, + "grad_norm": 2.7791361808776855, + "learning_rate": 5.6543967280163604e-05, + "loss": 3.4748, + "step": 554 + }, + { + "epoch": 0.1703499079189687, + "grad_norm": 4.583637714385986, + "learning_rate": 5.664621676891616e-05, + "loss": 3.4554, + "step": 555 + }, + { + "epoch": 0.17065684468999387, + "grad_norm": 2.8527474403381348, + "learning_rate": 5.674846625766872e-05, + "loss": 3.4327, + "step": 556 + }, + { + "epoch": 0.17096378146101904, + "grad_norm": 4.116163730621338, + "learning_rate": 5.685071574642127e-05, + "loss": 3.4043, + "step": 557 + }, + { + "epoch": 0.1712707182320442, + "grad_norm": 3.0130903720855713, + "learning_rate": 5.6952965235173825e-05, + "loss": 3.4823, + "step": 558 + }, + { + "epoch": 
0.17157765500306937, + "grad_norm": 3.3556432723999023, + "learning_rate": 5.7055214723926385e-05, + "loss": 3.4464, + "step": 559 + }, + { + "epoch": 0.17188459177409454, + "grad_norm": 2.854952573776245, + "learning_rate": 5.715746421267893e-05, + "loss": 3.3768, + "step": 560 + }, + { + "epoch": 0.1721915285451197, + "grad_norm": 3.9891982078552246, + "learning_rate": 5.725971370143149e-05, + "loss": 3.3949, + "step": 561 + }, + { + "epoch": 0.17249846531614488, + "grad_norm": 2.980468511581421, + "learning_rate": 5.736196319018405e-05, + "loss": 3.459, + "step": 562 + }, + { + "epoch": 0.17280540208717005, + "grad_norm": 3.453510284423828, + "learning_rate": 5.7464212678936613e-05, + "loss": 3.4549, + "step": 563 + }, + { + "epoch": 0.1731123388581952, + "grad_norm": 2.8926782608032227, + "learning_rate": 5.756646216768916e-05, + "loss": 3.392, + "step": 564 + }, + { + "epoch": 0.17341927562922038, + "grad_norm": 3.3722894191741943, + "learning_rate": 5.766871165644172e-05, + "loss": 3.4002, + "step": 565 + }, + { + "epoch": 0.17372621240024555, + "grad_norm": 2.8093647956848145, + "learning_rate": 5.777096114519428e-05, + "loss": 3.3862, + "step": 566 + }, + { + "epoch": 0.17403314917127072, + "grad_norm": 4.1722731590271, + "learning_rate": 5.787321063394683e-05, + "loss": 3.3903, + "step": 567 + }, + { + "epoch": 0.17434008594229589, + "grad_norm": 2.778069257736206, + "learning_rate": 5.797546012269939e-05, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.17464702271332105, + "grad_norm": 3.8501908779144287, + "learning_rate": 5.807770961145195e-05, + "loss": 3.4094, + "step": 569 + }, + { + "epoch": 0.17495395948434622, + "grad_norm": 2.5164549350738525, + "learning_rate": 5.817995910020451e-05, + "loss": 3.4343, + "step": 570 + }, + { + "epoch": 0.1752608962553714, + "grad_norm": 4.0673065185546875, + "learning_rate": 5.8282208588957056e-05, + "loss": 3.3993, + "step": 571 + }, + { + "epoch": 0.17556783302639656, + "grad_norm": 2.7882072925567627, + 
"learning_rate": 5.8384458077709616e-05, + "loss": 3.4759, + "step": 572 + }, + { + "epoch": 0.17587476979742173, + "grad_norm": 3.3252487182617188, + "learning_rate": 5.848670756646217e-05, + "loss": 3.3562, + "step": 573 + }, + { + "epoch": 0.1761817065684469, + "grad_norm": 2.7499115467071533, + "learning_rate": 5.8588957055214724e-05, + "loss": 3.3376, + "step": 574 + }, + { + "epoch": 0.17648864333947206, + "grad_norm": 4.061224460601807, + "learning_rate": 5.8691206543967284e-05, + "loss": 3.3521, + "step": 575 + }, + { + "epoch": 0.17679558011049723, + "grad_norm": 3.022193431854248, + "learning_rate": 5.879345603271984e-05, + "loss": 3.3933, + "step": 576 + }, + { + "epoch": 0.1771025168815224, + "grad_norm": 3.2442128658294678, + "learning_rate": 5.88957055214724e-05, + "loss": 3.4531, + "step": 577 + }, + { + "epoch": 0.17740945365254757, + "grad_norm": 2.9524872303009033, + "learning_rate": 5.8997955010224945e-05, + "loss": 3.332, + "step": 578 + }, + { + "epoch": 0.17771639042357273, + "grad_norm": 3.4604902267456055, + "learning_rate": 5.9100204498977505e-05, + "loss": 3.3706, + "step": 579 + }, + { + "epoch": 0.1780233271945979, + "grad_norm": 3.05216646194458, + "learning_rate": 5.9202453987730066e-05, + "loss": 3.463, + "step": 580 + }, + { + "epoch": 0.17833026396562307, + "grad_norm": 3.427311658859253, + "learning_rate": 5.9304703476482626e-05, + "loss": 3.4204, + "step": 581 + }, + { + "epoch": 0.17863720073664824, + "grad_norm": 2.5583856105804443, + "learning_rate": 5.940695296523517e-05, + "loss": 3.4686, + "step": 582 + }, + { + "epoch": 0.1789441375076734, + "grad_norm": 3.85471248626709, + "learning_rate": 5.950920245398773e-05, + "loss": 3.4518, + "step": 583 + }, + { + "epoch": 0.17925107427869857, + "grad_norm": 2.6894235610961914, + "learning_rate": 5.9611451942740294e-05, + "loss": 3.4179, + "step": 584 + }, + { + "epoch": 0.17955801104972377, + "grad_norm": 3.7592904567718506, + "learning_rate": 5.971370143149284e-05, + "loss": 
3.3197, + "step": 585 + }, + { + "epoch": 0.17986494782074894, + "grad_norm": 2.8180313110351562, + "learning_rate": 5.98159509202454e-05, + "loss": 3.4098, + "step": 586 + }, + { + "epoch": 0.1801718845917741, + "grad_norm": 3.5678224563598633, + "learning_rate": 5.991820040899796e-05, + "loss": 3.3644, + "step": 587 + }, + { + "epoch": 0.18047882136279927, + "grad_norm": 2.920607328414917, + "learning_rate": 6.002044989775052e-05, + "loss": 3.4158, + "step": 588 + }, + { + "epoch": 0.18078575813382444, + "grad_norm": 2.9465436935424805, + "learning_rate": 6.012269938650307e-05, + "loss": 3.3369, + "step": 589 + }, + { + "epoch": 0.1810926949048496, + "grad_norm": 3.8760533332824707, + "learning_rate": 6.022494887525563e-05, + "loss": 3.4205, + "step": 590 + }, + { + "epoch": 0.18139963167587478, + "grad_norm": 3.2972259521484375, + "learning_rate": 6.032719836400819e-05, + "loss": 3.3234, + "step": 591 + }, + { + "epoch": 0.18170656844689995, + "grad_norm": 2.8855841159820557, + "learning_rate": 6.0429447852760736e-05, + "loss": 3.4172, + "step": 592 + }, + { + "epoch": 0.18201350521792511, + "grad_norm": 3.3035166263580322, + "learning_rate": 6.05316973415133e-05, + "loss": 3.3235, + "step": 593 + }, + { + "epoch": 0.18232044198895028, + "grad_norm": 2.5975232124328613, + "learning_rate": 6.063394683026585e-05, + "loss": 3.3245, + "step": 594 + }, + { + "epoch": 0.18262737875997545, + "grad_norm": 3.68007755279541, + "learning_rate": 6.073619631901841e-05, + "loss": 3.4348, + "step": 595 + }, + { + "epoch": 0.18293431553100062, + "grad_norm": 2.774419069290161, + "learning_rate": 6.083844580777096e-05, + "loss": 3.2763, + "step": 596 + }, + { + "epoch": 0.1832412523020258, + "grad_norm": 3.686140298843384, + "learning_rate": 6.094069529652352e-05, + "loss": 3.29, + "step": 597 + }, + { + "epoch": 0.18354818907305095, + "grad_norm": 2.71142315864563, + "learning_rate": 6.104294478527609e-05, + "loss": 3.3899, + "step": 598 + }, + { + "epoch": 0.18385512584407612, 
+ "grad_norm": 3.725736141204834, + "learning_rate": 6.114519427402863e-05, + "loss": 3.3844, + "step": 599 + }, + { + "epoch": 0.1841620626151013, + "grad_norm": 2.691237211227417, + "learning_rate": 6.124744376278119e-05, + "loss": 3.3138, + "step": 600 + }, + { + "epoch": 0.18446899938612646, + "grad_norm": 3.467499256134033, + "learning_rate": 6.134969325153375e-05, + "loss": 3.3501, + "step": 601 + }, + { + "epoch": 0.18477593615715163, + "grad_norm": 2.776309013366699, + "learning_rate": 6.14519427402863e-05, + "loss": 3.3278, + "step": 602 + }, + { + "epoch": 0.1850828729281768, + "grad_norm": 3.4674019813537598, + "learning_rate": 6.155419222903885e-05, + "loss": 3.262, + "step": 603 + }, + { + "epoch": 0.18538980969920196, + "grad_norm": 2.8091421127319336, + "learning_rate": 6.165644171779141e-05, + "loss": 3.3296, + "step": 604 + }, + { + "epoch": 0.18569674647022713, + "grad_norm": 3.4938528537750244, + "learning_rate": 6.175869120654397e-05, + "loss": 3.4028, + "step": 605 + }, + { + "epoch": 0.1860036832412523, + "grad_norm": 2.5200188159942627, + "learning_rate": 6.186094069529653e-05, + "loss": 3.3726, + "step": 606 + }, + { + "epoch": 0.18631062001227747, + "grad_norm": 3.6415109634399414, + "learning_rate": 6.196319018404908e-05, + "loss": 3.3539, + "step": 607 + }, + { + "epoch": 0.18661755678330263, + "grad_norm": 2.553532123565674, + "learning_rate": 6.206543967280163e-05, + "loss": 3.2971, + "step": 608 + }, + { + "epoch": 0.1869244935543278, + "grad_norm": 3.7287046909332275, + "learning_rate": 6.21676891615542e-05, + "loss": 3.3987, + "step": 609 + }, + { + "epoch": 0.18723143032535297, + "grad_norm": 2.6285226345062256, + "learning_rate": 6.226993865030674e-05, + "loss": 3.2446, + "step": 610 + }, + { + "epoch": 0.18753836709637814, + "grad_norm": 3.453766107559204, + "learning_rate": 6.237218813905931e-05, + "loss": 3.2644, + "step": 611 + }, + { + "epoch": 0.1878453038674033, + "grad_norm": 2.7924115657806396, + "learning_rate": 
6.247443762781186e-05, + "loss": 3.3056, + "step": 612 + }, + { + "epoch": 0.18815224063842848, + "grad_norm": 3.4854533672332764, + "learning_rate": 6.257668711656443e-05, + "loss": 3.3468, + "step": 613 + }, + { + "epoch": 0.18845917740945364, + "grad_norm": 2.8738653659820557, + "learning_rate": 6.267893660531697e-05, + "loss": 3.3079, + "step": 614 + }, + { + "epoch": 0.1887661141804788, + "grad_norm": 3.496342420578003, + "learning_rate": 6.278118609406954e-05, + "loss": 3.3453, + "step": 615 + }, + { + "epoch": 0.18907305095150398, + "grad_norm": 3.1935245990753174, + "learning_rate": 6.288343558282209e-05, + "loss": 3.303, + "step": 616 + }, + { + "epoch": 0.18937998772252915, + "grad_norm": 2.9726579189300537, + "learning_rate": 6.298568507157464e-05, + "loss": 3.284, + "step": 617 + }, + { + "epoch": 0.18968692449355432, + "grad_norm": 2.8515241146087646, + "learning_rate": 6.30879345603272e-05, + "loss": 3.2748, + "step": 618 + }, + { + "epoch": 0.18999386126457948, + "grad_norm": 3.216681480407715, + "learning_rate": 6.319018404907977e-05, + "loss": 3.2613, + "step": 619 + }, + { + "epoch": 0.19030079803560468, + "grad_norm": 2.9164562225341797, + "learning_rate": 6.329243353783232e-05, + "loss": 3.3234, + "step": 620 + }, + { + "epoch": 0.19060773480662985, + "grad_norm": 2.6724259853363037, + "learning_rate": 6.339468302658487e-05, + "loss": 3.3271, + "step": 621 + }, + { + "epoch": 0.19091467157765502, + "grad_norm": 3.298551082611084, + "learning_rate": 6.349693251533743e-05, + "loss": 3.2715, + "step": 622 + }, + { + "epoch": 0.19122160834868018, + "grad_norm": 2.609632968902588, + "learning_rate": 6.359918200408998e-05, + "loss": 3.2392, + "step": 623 + }, + { + "epoch": 0.19152854511970535, + "grad_norm": 3.6469385623931885, + "learning_rate": 6.370143149284253e-05, + "loss": 3.428, + "step": 624 + }, + { + "epoch": 0.19183548189073052, + "grad_norm": 2.4231622219085693, + "learning_rate": 6.380368098159509e-05, + "loss": 3.3436, + "step": 625 + 
}, + { + "epoch": 0.1921424186617557, + "grad_norm": 3.9182474613189697, + "learning_rate": 6.390593047034765e-05, + "loss": 3.3375, + "step": 626 + }, + { + "epoch": 0.19244935543278086, + "grad_norm": 2.3975942134857178, + "learning_rate": 6.400817995910021e-05, + "loss": 3.2711, + "step": 627 + }, + { + "epoch": 0.19275629220380602, + "grad_norm": 3.061039447784424, + "learning_rate": 6.411042944785276e-05, + "loss": 3.3124, + "step": 628 + }, + { + "epoch": 0.1930632289748312, + "grad_norm": 2.9461817741394043, + "learning_rate": 6.421267893660532e-05, + "loss": 3.2954, + "step": 629 + }, + { + "epoch": 0.19337016574585636, + "grad_norm": 2.6603662967681885, + "learning_rate": 6.431492842535788e-05, + "loss": 3.2138, + "step": 630 + }, + { + "epoch": 0.19367710251688153, + "grad_norm": 3.339444875717163, + "learning_rate": 6.441717791411042e-05, + "loss": 3.2796, + "step": 631 + }, + { + "epoch": 0.1939840392879067, + "grad_norm": 2.59061861038208, + "learning_rate": 6.451942740286299e-05, + "loss": 3.3906, + "step": 632 + }, + { + "epoch": 0.19429097605893186, + "grad_norm": 3.704300880432129, + "learning_rate": 6.462167689161554e-05, + "loss": 3.2604, + "step": 633 + }, + { + "epoch": 0.19459791282995703, + "grad_norm": 3.110203266143799, + "learning_rate": 6.472392638036811e-05, + "loss": 3.3236, + "step": 634 + }, + { + "epoch": 0.1949048496009822, + "grad_norm": 3.016730308532715, + "learning_rate": 6.482617586912065e-05, + "loss": 3.2911, + "step": 635 + }, + { + "epoch": 0.19521178637200737, + "grad_norm": 2.896956205368042, + "learning_rate": 6.492842535787322e-05, + "loss": 3.35, + "step": 636 + }, + { + "epoch": 0.19551872314303254, + "grad_norm": 2.7913663387298584, + "learning_rate": 6.503067484662577e-05, + "loss": 3.3474, + "step": 637 + }, + { + "epoch": 0.1958256599140577, + "grad_norm": 3.285518169403076, + "learning_rate": 6.513292433537832e-05, + "loss": 3.2131, + "step": 638 + }, + { + "epoch": 0.19613259668508287, + "grad_norm": 
2.588491201400757, + "learning_rate": 6.523517382413088e-05, + "loss": 3.2955, + "step": 639 + }, + { + "epoch": 0.19643953345610804, + "grad_norm": 2.9417827129364014, + "learning_rate": 6.533742331288345e-05, + "loss": 3.2917, + "step": 640 + }, + { + "epoch": 0.1967464702271332, + "grad_norm": 3.2209408283233643, + "learning_rate": 6.5439672801636e-05, + "loss": 3.233, + "step": 641 + }, + { + "epoch": 0.19705340699815838, + "grad_norm": 2.8424925804138184, + "learning_rate": 6.554192229038855e-05, + "loss": 3.3194, + "step": 642 + }, + { + "epoch": 0.19736034376918354, + "grad_norm": 2.9005842208862305, + "learning_rate": 6.56441717791411e-05, + "loss": 3.275, + "step": 643 + }, + { + "epoch": 0.1976672805402087, + "grad_norm": 3.0277016162872314, + "learning_rate": 6.574642126789366e-05, + "loss": 3.2881, + "step": 644 + }, + { + "epoch": 0.19797421731123388, + "grad_norm": 2.8932368755340576, + "learning_rate": 6.584867075664623e-05, + "loss": 3.2799, + "step": 645 + }, + { + "epoch": 0.19828115408225905, + "grad_norm": 2.994464635848999, + "learning_rate": 6.595092024539877e-05, + "loss": 3.258, + "step": 646 + }, + { + "epoch": 0.19858809085328422, + "grad_norm": 2.943040132522583, + "learning_rate": 6.605316973415133e-05, + "loss": 3.1994, + "step": 647 + }, + { + "epoch": 0.19889502762430938, + "grad_norm": 2.942765712738037, + "learning_rate": 6.615541922290389e-05, + "loss": 3.1802, + "step": 648 + }, + { + "epoch": 0.19920196439533455, + "grad_norm": 2.8036246299743652, + "learning_rate": 6.625766871165644e-05, + "loss": 3.2426, + "step": 649 + }, + { + "epoch": 0.19950890116635972, + "grad_norm": 2.814507484436035, + "learning_rate": 6.6359918200409e-05, + "loss": 3.2978, + "step": 650 + }, + { + "epoch": 0.1998158379373849, + "grad_norm": 2.8133158683776855, + "learning_rate": 6.646216768916156e-05, + "loss": 3.2435, + "step": 651 + }, + { + "epoch": 0.20012277470841006, + "grad_norm": 2.8596129417419434, + "learning_rate": 6.656441717791412e-05, + 
"loss": 3.2154, + "step": 652 + }, + { + "epoch": 0.20042971147943522, + "grad_norm": 2.663926839828491, + "learning_rate": 6.666666666666667e-05, + "loss": 3.2487, + "step": 653 + }, + { + "epoch": 0.2007366482504604, + "grad_norm": 3.40561580657959, + "learning_rate": 6.676891615541922e-05, + "loss": 3.1509, + "step": 654 + }, + { + "epoch": 0.20104358502148556, + "grad_norm": 2.5786798000335693, + "learning_rate": 6.687116564417179e-05, + "loss": 3.2686, + "step": 655 + }, + { + "epoch": 0.20135052179251076, + "grad_norm": 3.007436752319336, + "learning_rate": 6.697341513292433e-05, + "loss": 3.2543, + "step": 656 + }, + { + "epoch": 0.20165745856353592, + "grad_norm": 2.5966951847076416, + "learning_rate": 6.70756646216769e-05, + "loss": 3.2643, + "step": 657 + }, + { + "epoch": 0.2019643953345611, + "grad_norm": 3.2698333263397217, + "learning_rate": 6.717791411042945e-05, + "loss": 3.2002, + "step": 658 + }, + { + "epoch": 0.20227133210558626, + "grad_norm": 2.513129472732544, + "learning_rate": 6.7280163599182e-05, + "loss": 3.1551, + "step": 659 + }, + { + "epoch": 0.20257826887661143, + "grad_norm": 2.9690299034118652, + "learning_rate": 6.738241308793456e-05, + "loss": 3.3037, + "step": 660 + }, + { + "epoch": 0.2028852056476366, + "grad_norm": 2.6644227504730225, + "learning_rate": 6.748466257668711e-05, + "loss": 3.3225, + "step": 661 + }, + { + "epoch": 0.20319214241866176, + "grad_norm": 2.6990232467651367, + "learning_rate": 6.758691206543968e-05, + "loss": 3.227, + "step": 662 + }, + { + "epoch": 0.20349907918968693, + "grad_norm": 3.6271350383758545, + "learning_rate": 6.768916155419223e-05, + "loss": 3.32, + "step": 663 + }, + { + "epoch": 0.2038060159607121, + "grad_norm": 2.6351428031921387, + "learning_rate": 6.779141104294479e-05, + "loss": 3.2104, + "step": 664 + }, + { + "epoch": 0.20411295273173727, + "grad_norm": 3.980685234069824, + "learning_rate": 6.789366053169734e-05, + "loss": 3.2602, + "step": 665 + }, + { + "epoch": 
0.20441988950276244, + "grad_norm": 2.5207509994506836, + "learning_rate": 6.799591002044991e-05, + "loss": 3.2256, + "step": 666 + }, + { + "epoch": 0.2047268262737876, + "grad_norm": 3.0568666458129883, + "learning_rate": 6.809815950920245e-05, + "loss": 3.2918, + "step": 667 + }, + { + "epoch": 0.20503376304481277, + "grad_norm": 2.6476826667785645, + "learning_rate": 6.820040899795501e-05, + "loss": 3.2745, + "step": 668 + }, + { + "epoch": 0.20534069981583794, + "grad_norm": 3.0413191318511963, + "learning_rate": 6.830265848670757e-05, + "loss": 3.2683, + "step": 669 + }, + { + "epoch": 0.2056476365868631, + "grad_norm": 2.6214709281921387, + "learning_rate": 6.840490797546014e-05, + "loss": 3.1399, + "step": 670 + }, + { + "epoch": 0.20595457335788828, + "grad_norm": 3.0577988624572754, + "learning_rate": 6.850715746421268e-05, + "loss": 3.2131, + "step": 671 + }, + { + "epoch": 0.20626151012891344, + "grad_norm": 2.795365571975708, + "learning_rate": 6.860940695296524e-05, + "loss": 3.1633, + "step": 672 + }, + { + "epoch": 0.2065684468999386, + "grad_norm": 3.3030495643615723, + "learning_rate": 6.87116564417178e-05, + "loss": 3.2036, + "step": 673 + }, + { + "epoch": 0.20687538367096378, + "grad_norm": 2.3182966709136963, + "learning_rate": 6.881390593047035e-05, + "loss": 3.2154, + "step": 674 + }, + { + "epoch": 0.20718232044198895, + "grad_norm": 3.133702039718628, + "learning_rate": 6.89161554192229e-05, + "loss": 3.1828, + "step": 675 + }, + { + "epoch": 0.20748925721301412, + "grad_norm": 2.555358409881592, + "learning_rate": 6.901840490797547e-05, + "loss": 3.1434, + "step": 676 + }, + { + "epoch": 0.20779619398403928, + "grad_norm": 2.990675687789917, + "learning_rate": 6.912065439672802e-05, + "loss": 3.2182, + "step": 677 + }, + { + "epoch": 0.20810313075506445, + "grad_norm": 2.5072035789489746, + "learning_rate": 6.922290388548058e-05, + "loss": 3.2735, + "step": 678 + }, + { + "epoch": 0.20841006752608962, + "grad_norm": 3.311474323272705, + 
"learning_rate": 6.932515337423313e-05, + "loss": 3.2152, + "step": 679 + }, + { + "epoch": 0.2087170042971148, + "grad_norm": 2.7110986709594727, + "learning_rate": 6.942740286298569e-05, + "loss": 3.1633, + "step": 680 + }, + { + "epoch": 0.20902394106813996, + "grad_norm": 2.6963095664978027, + "learning_rate": 6.952965235173824e-05, + "loss": 3.2097, + "step": 681 + }, + { + "epoch": 0.20933087783916512, + "grad_norm": 2.7126448154449463, + "learning_rate": 6.963190184049079e-05, + "loss": 3.232, + "step": 682 + }, + { + "epoch": 0.2096378146101903, + "grad_norm": 2.723257541656494, + "learning_rate": 6.973415132924336e-05, + "loss": 3.1024, + "step": 683 + }, + { + "epoch": 0.20994475138121546, + "grad_norm": 2.985407829284668, + "learning_rate": 6.983640081799591e-05, + "loss": 3.215, + "step": 684 + }, + { + "epoch": 0.21025168815224063, + "grad_norm": 2.4878063201904297, + "learning_rate": 6.993865030674847e-05, + "loss": 3.2543, + "step": 685 + }, + { + "epoch": 0.2105586249232658, + "grad_norm": 3.417191505432129, + "learning_rate": 7.004089979550102e-05, + "loss": 3.217, + "step": 686 + }, + { + "epoch": 0.21086556169429096, + "grad_norm": 2.606513738632202, + "learning_rate": 7.014314928425359e-05, + "loss": 3.1831, + "step": 687 + }, + { + "epoch": 0.21117249846531613, + "grad_norm": 2.777334213256836, + "learning_rate": 7.024539877300614e-05, + "loss": 3.1513, + "step": 688 + }, + { + "epoch": 0.2114794352363413, + "grad_norm": 2.718494415283203, + "learning_rate": 7.03476482617587e-05, + "loss": 3.1695, + "step": 689 + }, + { + "epoch": 0.21178637200736647, + "grad_norm": 3.041794776916504, + "learning_rate": 7.044989775051125e-05, + "loss": 3.2078, + "step": 690 + }, + { + "epoch": 0.21209330877839166, + "grad_norm": 2.6473169326782227, + "learning_rate": 7.055214723926382e-05, + "loss": 3.177, + "step": 691 + }, + { + "epoch": 0.21240024554941683, + "grad_norm": 3.2349517345428467, + "learning_rate": 7.065439672801636e-05, + "loss": 3.2144, + 
"step": 692 + }, + { + "epoch": 0.212707182320442, + "grad_norm": 2.6024651527404785, + "learning_rate": 7.075664621676892e-05, + "loss": 3.2204, + "step": 693 + }, + { + "epoch": 0.21301411909146717, + "grad_norm": 2.9090511798858643, + "learning_rate": 7.085889570552148e-05, + "loss": 3.2473, + "step": 694 + }, + { + "epoch": 0.21332105586249234, + "grad_norm": 3.230525255203247, + "learning_rate": 7.096114519427403e-05, + "loss": 3.2552, + "step": 695 + }, + { + "epoch": 0.2136279926335175, + "grad_norm": 2.2609128952026367, + "learning_rate": 7.106339468302658e-05, + "loss": 3.1302, + "step": 696 + }, + { + "epoch": 0.21393492940454267, + "grad_norm": 3.484372854232788, + "learning_rate": 7.116564417177914e-05, + "loss": 3.1578, + "step": 697 + }, + { + "epoch": 0.21424186617556784, + "grad_norm": 2.130702257156372, + "learning_rate": 7.12678936605317e-05, + "loss": 3.2089, + "step": 698 + }, + { + "epoch": 0.214548802946593, + "grad_norm": 3.0673611164093018, + "learning_rate": 7.137014314928426e-05, + "loss": 3.214, + "step": 699 + }, + { + "epoch": 0.21485573971761818, + "grad_norm": 2.572826862335205, + "learning_rate": 7.147239263803681e-05, + "loss": 3.1824, + "step": 700 + }, + { + "epoch": 0.21516267648864335, + "grad_norm": 2.8327746391296387, + "learning_rate": 7.157464212678937e-05, + "loss": 3.2384, + "step": 701 + }, + { + "epoch": 0.2154696132596685, + "grad_norm": 2.863041877746582, + "learning_rate": 7.167689161554193e-05, + "loss": 3.1102, + "step": 702 + }, + { + "epoch": 0.21577655003069368, + "grad_norm": 2.2519750595092773, + "learning_rate": 7.177914110429447e-05, + "loss": 3.1541, + "step": 703 + }, + { + "epoch": 0.21608348680171885, + "grad_norm": 3.197129011154175, + "learning_rate": 7.188139059304704e-05, + "loss": 3.2407, + "step": 704 + }, + { + "epoch": 0.21639042357274402, + "grad_norm": 2.32582426071167, + "learning_rate": 7.19836400817996e-05, + "loss": 3.1895, + "step": 705 + }, + { + "epoch": 0.21669736034376919, + 
"grad_norm": 3.0128488540649414, + "learning_rate": 7.208588957055215e-05, + "loss": 3.2839, + "step": 706 + }, + { + "epoch": 0.21700429711479435, + "grad_norm": 2.503342390060425, + "learning_rate": 7.21881390593047e-05, + "loss": 3.2093, + "step": 707 + }, + { + "epoch": 0.21731123388581952, + "grad_norm": 2.7540833950042725, + "learning_rate": 7.229038854805727e-05, + "loss": 3.2143, + "step": 708 + }, + { + "epoch": 0.2176181706568447, + "grad_norm": 2.8838772773742676, + "learning_rate": 7.239263803680982e-05, + "loss": 3.2051, + "step": 709 + }, + { + "epoch": 0.21792510742786986, + "grad_norm": 2.7495758533477783, + "learning_rate": 7.249488752556238e-05, + "loss": 3.0701, + "step": 710 + }, + { + "epoch": 0.21823204419889503, + "grad_norm": 2.684539794921875, + "learning_rate": 7.259713701431493e-05, + "loss": 3.1917, + "step": 711 + }, + { + "epoch": 0.2185389809699202, + "grad_norm": 2.8330819606781006, + "learning_rate": 7.26993865030675e-05, + "loss": 3.1685, + "step": 712 + }, + { + "epoch": 0.21884591774094536, + "grad_norm": 2.6974711418151855, + "learning_rate": 7.280163599182005e-05, + "loss": 3.0953, + "step": 713 + }, + { + "epoch": 0.21915285451197053, + "grad_norm": 2.5129306316375732, + "learning_rate": 7.29038854805726e-05, + "loss": 3.1371, + "step": 714 + }, + { + "epoch": 0.2194597912829957, + "grad_norm": 2.7884230613708496, + "learning_rate": 7.300613496932516e-05, + "loss": 3.1386, + "step": 715 + }, + { + "epoch": 0.21976672805402087, + "grad_norm": 2.296306610107422, + "learning_rate": 7.310838445807771e-05, + "loss": 3.1735, + "step": 716 + }, + { + "epoch": 0.22007366482504603, + "grad_norm": 2.777911424636841, + "learning_rate": 7.321063394683026e-05, + "loss": 3.1726, + "step": 717 + }, + { + "epoch": 0.2203806015960712, + "grad_norm": 2.5349695682525635, + "learning_rate": 7.331288343558282e-05, + "loss": 3.1603, + "step": 718 + }, + { + "epoch": 0.22068753836709637, + "grad_norm": 2.415412425994873, + "learning_rate": 
7.341513292433539e-05, + "loss": 3.1378, + "step": 719 + }, + { + "epoch": 0.22099447513812154, + "grad_norm": 2.7188358306884766, + "learning_rate": 7.351738241308794e-05, + "loss": 3.1321, + "step": 720 + }, + { + "epoch": 0.2213014119091467, + "grad_norm": 2.4872183799743652, + "learning_rate": 7.361963190184049e-05, + "loss": 3.1283, + "step": 721 + }, + { + "epoch": 0.22160834868017187, + "grad_norm": 2.454535961151123, + "learning_rate": 7.372188139059305e-05, + "loss": 3.1085, + "step": 722 + }, + { + "epoch": 0.22191528545119704, + "grad_norm": 2.5621426105499268, + "learning_rate": 7.382413087934561e-05, + "loss": 3.1307, + "step": 723 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.513777256011963, + "learning_rate": 7.392638036809815e-05, + "loss": 3.1103, + "step": 724 + }, + { + "epoch": 0.22252915899324738, + "grad_norm": 2.596559762954712, + "learning_rate": 7.402862985685072e-05, + "loss": 3.1563, + "step": 725 + }, + { + "epoch": 0.22283609576427257, + "grad_norm": 2.371487617492676, + "learning_rate": 7.413087934560327e-05, + "loss": 3.1344, + "step": 726 + }, + { + "epoch": 0.22314303253529774, + "grad_norm": 2.7252206802368164, + "learning_rate": 7.423312883435584e-05, + "loss": 3.2139, + "step": 727 + }, + { + "epoch": 0.2234499693063229, + "grad_norm": 2.2834722995758057, + "learning_rate": 7.433537832310838e-05, + "loss": 3.1461, + "step": 728 + }, + { + "epoch": 0.22375690607734808, + "grad_norm": 3.0965540409088135, + "learning_rate": 7.443762781186095e-05, + "loss": 3.1433, + "step": 729 + }, + { + "epoch": 0.22406384284837325, + "grad_norm": 2.351365804672241, + "learning_rate": 7.45398773006135e-05, + "loss": 3.1737, + "step": 730 + }, + { + "epoch": 0.2243707796193984, + "grad_norm": 3.0938596725463867, + "learning_rate": 7.464212678936606e-05, + "loss": 3.1689, + "step": 731 + }, + { + "epoch": 0.22467771639042358, + "grad_norm": 2.415039300918579, + "learning_rate": 7.474437627811861e-05, + "loss": 3.1146, + "step": 732 + }, 
+ { + "epoch": 0.22498465316144875, + "grad_norm": 2.8242318630218506, + "learning_rate": 7.484662576687118e-05, + "loss": 3.0812, + "step": 733 + }, + { + "epoch": 0.22529158993247392, + "grad_norm": 2.4347777366638184, + "learning_rate": 7.494887525562373e-05, + "loss": 3.203, + "step": 734 + }, + { + "epoch": 0.22559852670349909, + "grad_norm": 2.953418016433716, + "learning_rate": 7.505112474437628e-05, + "loss": 3.109, + "step": 735 + }, + { + "epoch": 0.22590546347452425, + "grad_norm": 2.600888252258301, + "learning_rate": 7.515337423312884e-05, + "loss": 3.1859, + "step": 736 + }, + { + "epoch": 0.22621240024554942, + "grad_norm": 2.7484869956970215, + "learning_rate": 7.525562372188139e-05, + "loss": 3.1169, + "step": 737 + }, + { + "epoch": 0.2265193370165746, + "grad_norm": 2.4797677993774414, + "learning_rate": 7.535787321063396e-05, + "loss": 3.0696, + "step": 738 + }, + { + "epoch": 0.22682627378759976, + "grad_norm": 2.641873359680176, + "learning_rate": 7.54601226993865e-05, + "loss": 3.1545, + "step": 739 + }, + { + "epoch": 0.22713321055862493, + "grad_norm": 2.3956825733184814, + "learning_rate": 7.556237218813907e-05, + "loss": 3.1295, + "step": 740 + }, + { + "epoch": 0.2274401473296501, + "grad_norm": 2.8832130432128906, + "learning_rate": 7.566462167689162e-05, + "loss": 3.1119, + "step": 741 + }, + { + "epoch": 0.22774708410067526, + "grad_norm": 2.3001184463500977, + "learning_rate": 7.576687116564417e-05, + "loss": 3.0068, + "step": 742 + }, + { + "epoch": 0.22805402087170043, + "grad_norm": 2.8682122230529785, + "learning_rate": 7.586912065439673e-05, + "loss": 3.0562, + "step": 743 + }, + { + "epoch": 0.2283609576427256, + "grad_norm": 2.2176413536071777, + "learning_rate": 7.59713701431493e-05, + "loss": 3.1395, + "step": 744 + }, + { + "epoch": 0.22866789441375077, + "grad_norm": 3.698274612426758, + "learning_rate": 7.607361963190185e-05, + "loss": 3.209, + "step": 745 + }, + { + "epoch": 0.22897483118477593, + "grad_norm": 
2.141063928604126, + "learning_rate": 7.61758691206544e-05, + "loss": 3.1734, + "step": 746 + }, + { + "epoch": 0.2292817679558011, + "grad_norm": 2.728498697280884, + "learning_rate": 7.627811860940695e-05, + "loss": 3.1498, + "step": 747 + }, + { + "epoch": 0.22958870472682627, + "grad_norm": 2.271678924560547, + "learning_rate": 7.638036809815952e-05, + "loss": 3.1538, + "step": 748 + }, + { + "epoch": 0.22989564149785144, + "grad_norm": 2.6095521450042725, + "learning_rate": 7.648261758691206e-05, + "loss": 3.155, + "step": 749 + }, + { + "epoch": 0.2302025782688766, + "grad_norm": 2.410792112350464, + "learning_rate": 7.658486707566463e-05, + "loss": 3.0478, + "step": 750 + }, + { + "epoch": 0.23050951503990177, + "grad_norm": 2.6980888843536377, + "learning_rate": 7.668711656441718e-05, + "loss": 3.1369, + "step": 751 + }, + { + "epoch": 0.23081645181092694, + "grad_norm": 2.353308916091919, + "learning_rate": 7.678936605316974e-05, + "loss": 3.0052, + "step": 752 + }, + { + "epoch": 0.2311233885819521, + "grad_norm": 2.4530155658721924, + "learning_rate": 7.689161554192229e-05, + "loss": 3.1348, + "step": 753 + }, + { + "epoch": 0.23143032535297728, + "grad_norm": 2.393601894378662, + "learning_rate": 7.699386503067484e-05, + "loss": 2.9941, + "step": 754 + }, + { + "epoch": 0.23173726212400245, + "grad_norm": 2.576876401901245, + "learning_rate": 7.709611451942741e-05, + "loss": 3.114, + "step": 755 + }, + { + "epoch": 0.23204419889502761, + "grad_norm": 2.0420913696289062, + "learning_rate": 7.719836400817996e-05, + "loss": 3.132, + "step": 756 + }, + { + "epoch": 0.23235113566605278, + "grad_norm": 3.0095622539520264, + "learning_rate": 7.730061349693252e-05, + "loss": 3.1763, + "step": 757 + }, + { + "epoch": 0.23265807243707795, + "grad_norm": 2.224005937576294, + "learning_rate": 7.740286298568507e-05, + "loss": 3.0703, + "step": 758 + }, + { + "epoch": 0.23296500920810312, + "grad_norm": 2.7559845447540283, + "learning_rate": 7.750511247443764e-05, + 
"loss": 3.1026, + "step": 759 + }, + { + "epoch": 0.2332719459791283, + "grad_norm": 2.2965753078460693, + "learning_rate": 7.760736196319018e-05, + "loss": 3.0284, + "step": 760 + }, + { + "epoch": 0.23357888275015345, + "grad_norm": 2.374398708343506, + "learning_rate": 7.770961145194275e-05, + "loss": 3.0636, + "step": 761 + }, + { + "epoch": 0.23388581952117865, + "grad_norm": 2.4315314292907715, + "learning_rate": 7.78118609406953e-05, + "loss": 3.0906, + "step": 762 + }, + { + "epoch": 0.23419275629220382, + "grad_norm": 2.5609946250915527, + "learning_rate": 7.791411042944787e-05, + "loss": 3.0692, + "step": 763 + }, + { + "epoch": 0.234499693063229, + "grad_norm": 2.419597864151001, + "learning_rate": 7.80163599182004e-05, + "loss": 3.1934, + "step": 764 + }, + { + "epoch": 0.23480662983425415, + "grad_norm": 3.0499062538146973, + "learning_rate": 7.811860940695297e-05, + "loss": 3.18, + "step": 765 + }, + { + "epoch": 0.23511356660527932, + "grad_norm": 2.464421510696411, + "learning_rate": 7.822085889570553e-05, + "loss": 3.1591, + "step": 766 + }, + { + "epoch": 0.2354205033763045, + "grad_norm": 3.4370174407958984, + "learning_rate": 7.832310838445808e-05, + "loss": 3.1156, + "step": 767 + }, + { + "epoch": 0.23572744014732966, + "grad_norm": 2.207406520843506, + "learning_rate": 7.842535787321063e-05, + "loss": 3.0557, + "step": 768 + }, + { + "epoch": 0.23603437691835483, + "grad_norm": 2.484807014465332, + "learning_rate": 7.85276073619632e-05, + "loss": 3.1003, + "step": 769 + }, + { + "epoch": 0.23634131368938, + "grad_norm": 2.33217716217041, + "learning_rate": 7.862985685071576e-05, + "loss": 3.0707, + "step": 770 + }, + { + "epoch": 0.23664825046040516, + "grad_norm": 2.493717670440674, + "learning_rate": 7.873210633946831e-05, + "loss": 3.127, + "step": 771 + }, + { + "epoch": 0.23695518723143033, + "grad_norm": 2.5824413299560547, + "learning_rate": 7.883435582822086e-05, + "loss": 3.1042, + "step": 772 + }, + { + "epoch": 0.2372621240024555, 
+ "grad_norm": 2.4137654304504395, + "learning_rate": 7.893660531697342e-05, + "loss": 3.136, + "step": 773 + }, + { + "epoch": 0.23756906077348067, + "grad_norm": 2.4657833576202393, + "learning_rate": 7.903885480572597e-05, + "loss": 3.038, + "step": 774 + }, + { + "epoch": 0.23787599754450584, + "grad_norm": 2.426260471343994, + "learning_rate": 7.914110429447852e-05, + "loss": 3.0102, + "step": 775 + }, + { + "epoch": 0.238182934315531, + "grad_norm": 2.4658050537109375, + "learning_rate": 7.924335378323109e-05, + "loss": 3.0645, + "step": 776 + }, + { + "epoch": 0.23848987108655617, + "grad_norm": 2.186267614364624, + "learning_rate": 7.934560327198364e-05, + "loss": 3.0585, + "step": 777 + }, + { + "epoch": 0.23879680785758134, + "grad_norm": 2.8824141025543213, + "learning_rate": 7.94478527607362e-05, + "loss": 3.0796, + "step": 778 + }, + { + "epoch": 0.2391037446286065, + "grad_norm": 1.9940539598464966, + "learning_rate": 7.955010224948875e-05, + "loss": 2.9894, + "step": 779 + }, + { + "epoch": 0.23941068139963168, + "grad_norm": 2.9386861324310303, + "learning_rate": 7.965235173824132e-05, + "loss": 3.1147, + "step": 780 + }, + { + "epoch": 0.23971761817065684, + "grad_norm": 2.241983413696289, + "learning_rate": 7.975460122699386e-05, + "loss": 2.9977, + "step": 781 + }, + { + "epoch": 0.240024554941682, + "grad_norm": 2.4796900749206543, + "learning_rate": 7.985685071574643e-05, + "loss": 3.0507, + "step": 782 + }, + { + "epoch": 0.24033149171270718, + "grad_norm": 2.6178741455078125, + "learning_rate": 7.995910020449898e-05, + "loss": 3.0299, + "step": 783 + }, + { + "epoch": 0.24063842848373235, + "grad_norm": 2.157179594039917, + "learning_rate": 8.006134969325155e-05, + "loss": 3.0419, + "step": 784 + }, + { + "epoch": 0.24094536525475752, + "grad_norm": 2.49029541015625, + "learning_rate": 8.016359918200409e-05, + "loss": 3.0785, + "step": 785 + }, + { + "epoch": 0.24125230202578268, + "grad_norm": 2.254014492034912, + "learning_rate": 
8.026584867075665e-05, + "loss": 3.0009, + "step": 786 + }, + { + "epoch": 0.24155923879680785, + "grad_norm": 2.514465570449829, + "learning_rate": 8.036809815950921e-05, + "loss": 3.0221, + "step": 787 + }, + { + "epoch": 0.24186617556783302, + "grad_norm": 2.309812545776367, + "learning_rate": 8.047034764826176e-05, + "loss": 2.9822, + "step": 788 + }, + { + "epoch": 0.2421731123388582, + "grad_norm": 2.5367796421051025, + "learning_rate": 8.057259713701431e-05, + "loss": 2.966, + "step": 789 + }, + { + "epoch": 0.24248004910988336, + "grad_norm": 2.4668943881988525, + "learning_rate": 8.067484662576688e-05, + "loss": 3.1177, + "step": 790 + }, + { + "epoch": 0.24278698588090852, + "grad_norm": 2.9424917697906494, + "learning_rate": 8.077709611451944e-05, + "loss": 3.078, + "step": 791 + }, + { + "epoch": 0.2430939226519337, + "grad_norm": 2.3068933486938477, + "learning_rate": 8.087934560327199e-05, + "loss": 3.0415, + "step": 792 + }, + { + "epoch": 0.24340085942295886, + "grad_norm": 2.675631284713745, + "learning_rate": 8.098159509202454e-05, + "loss": 3.012, + "step": 793 + }, + { + "epoch": 0.24370779619398403, + "grad_norm": 2.0261662006378174, + "learning_rate": 8.10838445807771e-05, + "loss": 3.0023, + "step": 794 + }, + { + "epoch": 0.2440147329650092, + "grad_norm": 3.32330322265625, + "learning_rate": 8.118609406952966e-05, + "loss": 3.0992, + "step": 795 + }, + { + "epoch": 0.24432166973603436, + "grad_norm": 2.1587088108062744, + "learning_rate": 8.12883435582822e-05, + "loss": 3.0922, + "step": 796 + }, + { + "epoch": 0.24462860650705956, + "grad_norm": 2.639254331588745, + "learning_rate": 8.139059304703477e-05, + "loss": 2.9856, + "step": 797 + }, + { + "epoch": 0.24493554327808473, + "grad_norm": 1.9976975917816162, + "learning_rate": 8.149284253578732e-05, + "loss": 3.0015, + "step": 798 + }, + { + "epoch": 0.2452424800491099, + "grad_norm": 2.763504981994629, + "learning_rate": 8.159509202453988e-05, + "loss": 3.0437, + "step": 799 + }, + { + 
"epoch": 0.24554941682013506, + "grad_norm": 1.9080138206481934, + "learning_rate": 8.169734151329243e-05, + "loss": 3.0009, + "step": 800 + }, + { + "epoch": 0.24585635359116023, + "grad_norm": 3.1276164054870605, + "learning_rate": 8.1799591002045e-05, + "loss": 3.0433, + "step": 801 + }, + { + "epoch": 0.2461632903621854, + "grad_norm": 2.0463218688964844, + "learning_rate": 8.190184049079755e-05, + "loss": 2.988, + "step": 802 + }, + { + "epoch": 0.24647022713321057, + "grad_norm": 2.8476648330688477, + "learning_rate": 8.20040899795501e-05, + "loss": 3.0238, + "step": 803 + }, + { + "epoch": 0.24677716390423574, + "grad_norm": 1.9715898036956787, + "learning_rate": 8.210633946830266e-05, + "loss": 3.0657, + "step": 804 + }, + { + "epoch": 0.2470841006752609, + "grad_norm": 3.369995594024658, + "learning_rate": 8.220858895705523e-05, + "loss": 3.0181, + "step": 805 + }, + { + "epoch": 0.24739103744628607, + "grad_norm": 2.0333900451660156, + "learning_rate": 8.231083844580777e-05, + "loss": 3.0589, + "step": 806 + }, + { + "epoch": 0.24769797421731124, + "grad_norm": 2.5702931880950928, + "learning_rate": 8.241308793456033e-05, + "loss": 2.9908, + "step": 807 + }, + { + "epoch": 0.2480049109883364, + "grad_norm": 2.12131929397583, + "learning_rate": 8.251533742331289e-05, + "loss": 3.0519, + "step": 808 + }, + { + "epoch": 0.24831184775936158, + "grad_norm": 2.5457377433776855, + "learning_rate": 8.261758691206544e-05, + "loss": 3.019, + "step": 809 + }, + { + "epoch": 0.24861878453038674, + "grad_norm": 2.0954740047454834, + "learning_rate": 8.2719836400818e-05, + "loss": 2.9805, + "step": 810 + }, + { + "epoch": 0.2489257213014119, + "grad_norm": 2.2456700801849365, + "learning_rate": 8.282208588957055e-05, + "loss": 3.0627, + "step": 811 + }, + { + "epoch": 0.24923265807243708, + "grad_norm": 2.4453790187835693, + "learning_rate": 8.292433537832312e-05, + "loss": 3.0447, + "step": 812 + }, + { + "epoch": 0.24953959484346225, + "grad_norm": 
2.1835873126983643, + "learning_rate": 8.302658486707567e-05, + "loss": 3.0008, + "step": 813 + }, + { + "epoch": 0.24984653161448742, + "grad_norm": 2.292989492416382, + "learning_rate": 8.312883435582822e-05, + "loss": 2.9175, + "step": 814 + }, + { + "epoch": 0.2501534683855126, + "grad_norm": 2.408888816833496, + "learning_rate": 8.323108384458078e-05, + "loss": 2.9649, + "step": 815 + }, + { + "epoch": 0.2504604051565378, + "grad_norm": 2.1873834133148193, + "learning_rate": 8.333333333333334e-05, + "loss": 2.9812, + "step": 816 + }, + { + "epoch": 0.25076734192756295, + "grad_norm": 2.2599284648895264, + "learning_rate": 8.343558282208588e-05, + "loss": 3.0086, + "step": 817 + }, + { + "epoch": 0.2510742786985881, + "grad_norm": 2.1902761459350586, + "learning_rate": 8.353783231083845e-05, + "loss": 2.9295, + "step": 818 + }, + { + "epoch": 0.2513812154696133, + "grad_norm": 2.4830422401428223, + "learning_rate": 8.3640081799591e-05, + "loss": 2.9808, + "step": 819 + }, + { + "epoch": 0.25168815224063845, + "grad_norm": 2.2274281978607178, + "learning_rate": 8.374233128834357e-05, + "loss": 2.9525, + "step": 820 + }, + { + "epoch": 0.2519950890116636, + "grad_norm": 2.2949111461639404, + "learning_rate": 8.384458077709611e-05, + "loss": 3.0313, + "step": 821 + }, + { + "epoch": 0.2523020257826888, + "grad_norm": 2.2345564365386963, + "learning_rate": 8.394683026584868e-05, + "loss": 2.9024, + "step": 822 + }, + { + "epoch": 0.25260896255371396, + "grad_norm": 2.488744020462036, + "learning_rate": 8.404907975460123e-05, + "loss": 2.9907, + "step": 823 + }, + { + "epoch": 0.2529158993247391, + "grad_norm": 1.9192837476730347, + "learning_rate": 8.415132924335379e-05, + "loss": 2.9792, + "step": 824 + }, + { + "epoch": 0.2532228360957643, + "grad_norm": 2.6426947116851807, + "learning_rate": 8.425357873210634e-05, + "loss": 2.972, + "step": 825 + }, + { + "epoch": 0.25352977286678946, + "grad_norm": 1.9950047731399536, + "learning_rate": 8.435582822085891e-05, + 
"loss": 2.9885, + "step": 826 + }, + { + "epoch": 0.25383670963781463, + "grad_norm": 2.30191969871521, + "learning_rate": 8.445807770961146e-05, + "loss": 2.9358, + "step": 827 + }, + { + "epoch": 0.2541436464088398, + "grad_norm": 2.1111395359039307, + "learning_rate": 8.456032719836401e-05, + "loss": 3.0343, + "step": 828 + }, + { + "epoch": 0.25445058317986496, + "grad_norm": 2.7292258739471436, + "learning_rate": 8.466257668711657e-05, + "loss": 2.9465, + "step": 829 + }, + { + "epoch": 0.25475751995089013, + "grad_norm": 1.9130604267120361, + "learning_rate": 8.476482617586912e-05, + "loss": 2.9443, + "step": 830 + }, + { + "epoch": 0.2550644567219153, + "grad_norm": 2.4240024089813232, + "learning_rate": 8.486707566462168e-05, + "loss": 2.963, + "step": 831 + }, + { + "epoch": 0.25537139349294047, + "grad_norm": 2.062875509262085, + "learning_rate": 8.496932515337423e-05, + "loss": 3.0127, + "step": 832 + }, + { + "epoch": 0.25567833026396564, + "grad_norm": 2.223639726638794, + "learning_rate": 8.50715746421268e-05, + "loss": 2.944, + "step": 833 + }, + { + "epoch": 0.2559852670349908, + "grad_norm": 2.2969272136688232, + "learning_rate": 8.517382413087935e-05, + "loss": 2.9495, + "step": 834 + }, + { + "epoch": 0.256292203806016, + "grad_norm": 2.1343178749084473, + "learning_rate": 8.52760736196319e-05, + "loss": 3.0383, + "step": 835 + }, + { + "epoch": 0.25659914057704114, + "grad_norm": 2.2348313331604004, + "learning_rate": 8.537832310838446e-05, + "loss": 2.9205, + "step": 836 + }, + { + "epoch": 0.2569060773480663, + "grad_norm": 2.2653896808624268, + "learning_rate": 8.548057259713702e-05, + "loss": 2.9699, + "step": 837 + }, + { + "epoch": 0.2572130141190915, + "grad_norm": 2.1332547664642334, + "learning_rate": 8.558282208588958e-05, + "loss": 2.9318, + "step": 838 + }, + { + "epoch": 0.25751995089011664, + "grad_norm": 2.5935778617858887, + "learning_rate": 8.568507157464213e-05, + "loss": 2.9754, + "step": 839 + }, + { + "epoch": 
0.2578268876611418, + "grad_norm": 2.073923110961914, + "learning_rate": 8.578732106339469e-05, + "loss": 3.0396, + "step": 840 + }, + { + "epoch": 0.258133824432167, + "grad_norm": 2.485049247741699, + "learning_rate": 8.588957055214725e-05, + "loss": 2.9297, + "step": 841 + }, + { + "epoch": 0.25844076120319215, + "grad_norm": 1.9425253868103027, + "learning_rate": 8.599182004089979e-05, + "loss": 3.0131, + "step": 842 + }, + { + "epoch": 0.2587476979742173, + "grad_norm": 2.6248724460601807, + "learning_rate": 8.609406952965236e-05, + "loss": 3.0345, + "step": 843 + }, + { + "epoch": 0.2590546347452425, + "grad_norm": 1.9123374223709106, + "learning_rate": 8.619631901840491e-05, + "loss": 3.0259, + "step": 844 + }, + { + "epoch": 0.25936157151626765, + "grad_norm": 2.457913637161255, + "learning_rate": 8.629856850715747e-05, + "loss": 3.0015, + "step": 845 + }, + { + "epoch": 0.2596685082872928, + "grad_norm": 2.0444202423095703, + "learning_rate": 8.640081799591002e-05, + "loss": 2.9663, + "step": 846 + }, + { + "epoch": 0.259975445058318, + "grad_norm": 2.1673583984375, + "learning_rate": 8.650306748466259e-05, + "loss": 3.0646, + "step": 847 + }, + { + "epoch": 0.26028238182934316, + "grad_norm": 2.1198627948760986, + "learning_rate": 8.660531697341514e-05, + "loss": 2.8769, + "step": 848 + }, + { + "epoch": 0.2605893186003683, + "grad_norm": 2.379960775375366, + "learning_rate": 8.67075664621677e-05, + "loss": 2.9637, + "step": 849 + }, + { + "epoch": 0.2608962553713935, + "grad_norm": 2.3954226970672607, + "learning_rate": 8.680981595092025e-05, + "loss": 3.025, + "step": 850 + }, + { + "epoch": 0.26120319214241866, + "grad_norm": 2.254746198654175, + "learning_rate": 8.69120654396728e-05, + "loss": 2.9962, + "step": 851 + }, + { + "epoch": 0.26151012891344383, + "grad_norm": 2.0851991176605225, + "learning_rate": 8.701431492842537e-05, + "loss": 2.9399, + "step": 852 + }, + { + "epoch": 0.261817065684469, + "grad_norm": 2.2800698280334473, + 
"learning_rate": 8.711656441717791e-05, + "loss": 2.9465, + "step": 853 + }, + { + "epoch": 0.26212400245549416, + "grad_norm": 2.3628437519073486, + "learning_rate": 8.721881390593048e-05, + "loss": 3.0298, + "step": 854 + }, + { + "epoch": 0.26243093922651933, + "grad_norm": 1.9642207622528076, + "learning_rate": 8.732106339468303e-05, + "loss": 2.8462, + "step": 855 + }, + { + "epoch": 0.2627378759975445, + "grad_norm": 2.5833423137664795, + "learning_rate": 8.742331288343558e-05, + "loss": 2.9024, + "step": 856 + }, + { + "epoch": 0.26304481276856967, + "grad_norm": 1.7022998332977295, + "learning_rate": 8.752556237218814e-05, + "loss": 2.9948, + "step": 857 + }, + { + "epoch": 0.26335174953959484, + "grad_norm": 3.181725025177002, + "learning_rate": 8.76278118609407e-05, + "loss": 3.0634, + "step": 858 + }, + { + "epoch": 0.26365868631062, + "grad_norm": 1.8931077718734741, + "learning_rate": 8.773006134969326e-05, + "loss": 2.9974, + "step": 859 + }, + { + "epoch": 0.2639656230816452, + "grad_norm": 2.5016703605651855, + "learning_rate": 8.783231083844581e-05, + "loss": 3.0109, + "step": 860 + }, + { + "epoch": 0.26427255985267034, + "grad_norm": 1.810957908630371, + "learning_rate": 8.793456032719837e-05, + "loss": 3.0143, + "step": 861 + }, + { + "epoch": 0.2645794966236955, + "grad_norm": 2.3004086017608643, + "learning_rate": 8.803680981595093e-05, + "loss": 2.9825, + "step": 862 + }, + { + "epoch": 0.2648864333947207, + "grad_norm": 2.23740816116333, + "learning_rate": 8.813905930470347e-05, + "loss": 2.8897, + "step": 863 + }, + { + "epoch": 0.26519337016574585, + "grad_norm": 2.441157579421997, + "learning_rate": 8.824130879345604e-05, + "loss": 2.8966, + "step": 864 + }, + { + "epoch": 0.265500306936771, + "grad_norm": 2.063201665878296, + "learning_rate": 8.83435582822086e-05, + "loss": 2.9468, + "step": 865 + }, + { + "epoch": 0.2658072437077962, + "grad_norm": 2.1484951972961426, + "learning_rate": 8.844580777096115e-05, + "loss": 2.9199, + "step": 
866 + }, + { + "epoch": 0.26611418047882135, + "grad_norm": 2.167827844619751, + "learning_rate": 8.85480572597137e-05, + "loss": 2.9403, + "step": 867 + }, + { + "epoch": 0.2664211172498465, + "grad_norm": 2.193556070327759, + "learning_rate": 8.865030674846625e-05, + "loss": 2.9171, + "step": 868 + }, + { + "epoch": 0.2667280540208717, + "grad_norm": 2.0754151344299316, + "learning_rate": 8.875255623721882e-05, + "loss": 2.9605, + "step": 869 + }, + { + "epoch": 0.26703499079189685, + "grad_norm": 2.1351094245910645, + "learning_rate": 8.885480572597138e-05, + "loss": 2.9272, + "step": 870 + }, + { + "epoch": 0.267341927562922, + "grad_norm": 2.0486347675323486, + "learning_rate": 8.895705521472393e-05, + "loss": 3.0308, + "step": 871 + }, + { + "epoch": 0.2676488643339472, + "grad_norm": 2.3303308486938477, + "learning_rate": 8.905930470347648e-05, + "loss": 2.9061, + "step": 872 + }, + { + "epoch": 0.26795580110497236, + "grad_norm": 1.9345083236694336, + "learning_rate": 8.916155419222905e-05, + "loss": 2.9644, + "step": 873 + }, + { + "epoch": 0.2682627378759975, + "grad_norm": 2.451918601989746, + "learning_rate": 8.926380368098159e-05, + "loss": 2.9536, + "step": 874 + }, + { + "epoch": 0.2685696746470227, + "grad_norm": 1.6964573860168457, + "learning_rate": 8.936605316973416e-05, + "loss": 2.9228, + "step": 875 + }, + { + "epoch": 0.26887661141804786, + "grad_norm": 2.2414000034332275, + "learning_rate": 8.946830265848671e-05, + "loss": 2.9776, + "step": 876 + }, + { + "epoch": 0.26918354818907303, + "grad_norm": 1.725002408027649, + "learning_rate": 8.957055214723928e-05, + "loss": 2.9837, + "step": 877 + }, + { + "epoch": 0.2694904849600982, + "grad_norm": 2.1498587131500244, + "learning_rate": 8.967280163599182e-05, + "loss": 2.8684, + "step": 878 + }, + { + "epoch": 0.26979742173112337, + "grad_norm": 1.814738392829895, + "learning_rate": 8.977505112474438e-05, + "loss": 2.9077, + "step": 879 + }, + { + "epoch": 0.27010435850214853, + "grad_norm": 
2.3086628913879395, + "learning_rate": 8.987730061349694e-05, + "loss": 2.9482, + "step": 880 + }, + { + "epoch": 0.2704112952731737, + "grad_norm": 1.7470855712890625, + "learning_rate": 8.997955010224949e-05, + "loss": 2.9775, + "step": 881 + }, + { + "epoch": 0.27071823204419887, + "grad_norm": 2.2822775840759277, + "learning_rate": 9.008179959100205e-05, + "loss": 3.0004, + "step": 882 + }, + { + "epoch": 0.27102516881522404, + "grad_norm": 1.9530903100967407, + "learning_rate": 9.018404907975461e-05, + "loss": 2.949, + "step": 883 + }, + { + "epoch": 0.2713321055862492, + "grad_norm": 2.0626885890960693, + "learning_rate": 9.028629856850717e-05, + "loss": 2.9184, + "step": 884 + }, + { + "epoch": 0.2716390423572744, + "grad_norm": 2.0040712356567383, + "learning_rate": 9.038854805725972e-05, + "loss": 2.8562, + "step": 885 + }, + { + "epoch": 0.2719459791282996, + "grad_norm": 2.026193141937256, + "learning_rate": 9.049079754601227e-05, + "loss": 2.883, + "step": 886 + }, + { + "epoch": 0.27225291589932477, + "grad_norm": 1.8337095975875854, + "learning_rate": 9.059304703476483e-05, + "loss": 2.8512, + "step": 887 + }, + { + "epoch": 0.27255985267034993, + "grad_norm": 2.1098122596740723, + "learning_rate": 9.069529652351738e-05, + "loss": 2.9024, + "step": 888 + }, + { + "epoch": 0.2728667894413751, + "grad_norm": 2.065650701522827, + "learning_rate": 9.079754601226993e-05, + "loss": 2.9291, + "step": 889 + }, + { + "epoch": 0.27317372621240027, + "grad_norm": 2.204819679260254, + "learning_rate": 9.08997955010225e-05, + "loss": 2.9153, + "step": 890 + }, + { + "epoch": 0.27348066298342544, + "grad_norm": 1.7931475639343262, + "learning_rate": 9.100204498977506e-05, + "loss": 2.9104, + "step": 891 + }, + { + "epoch": 0.2737875997544506, + "grad_norm": 2.4288859367370605, + "learning_rate": 9.110429447852761e-05, + "loss": 2.9974, + "step": 892 + }, + { + "epoch": 0.2740945365254758, + "grad_norm": 2.095872640609741, + "learning_rate": 9.120654396728016e-05, + 
"loss": 2.8446, + "step": 893 + }, + { + "epoch": 0.27440147329650094, + "grad_norm": 2.054410696029663, + "learning_rate": 9.130879345603273e-05, + "loss": 2.9008, + "step": 894 + }, + { + "epoch": 0.2747084100675261, + "grad_norm": 2.1989710330963135, + "learning_rate": 9.141104294478528e-05, + "loss": 2.8808, + "step": 895 + }, + { + "epoch": 0.2750153468385513, + "grad_norm": 2.531081199645996, + "learning_rate": 9.151329243353784e-05, + "loss": 2.8928, + "step": 896 + }, + { + "epoch": 0.27532228360957645, + "grad_norm": 2.010425567626953, + "learning_rate": 9.161554192229039e-05, + "loss": 2.9051, + "step": 897 + }, + { + "epoch": 0.2756292203806016, + "grad_norm": 1.9320241212844849, + "learning_rate": 9.171779141104296e-05, + "loss": 2.8675, + "step": 898 + }, + { + "epoch": 0.2759361571516268, + "grad_norm": 2.2280430793762207, + "learning_rate": 9.18200408997955e-05, + "loss": 2.9082, + "step": 899 + }, + { + "epoch": 0.27624309392265195, + "grad_norm": 1.9172335863113403, + "learning_rate": 9.192229038854807e-05, + "loss": 2.8947, + "step": 900 + }, + { + "epoch": 0.2765500306936771, + "grad_norm": 2.0846056938171387, + "learning_rate": 9.202453987730062e-05, + "loss": 2.9161, + "step": 901 + }, + { + "epoch": 0.2768569674647023, + "grad_norm": 1.875034213066101, + "learning_rate": 9.212678936605317e-05, + "loss": 2.8937, + "step": 902 + }, + { + "epoch": 0.27716390423572745, + "grad_norm": 2.230164051055908, + "learning_rate": 9.222903885480573e-05, + "loss": 2.8396, + "step": 903 + }, + { + "epoch": 0.2774708410067526, + "grad_norm": 1.6204382181167603, + "learning_rate": 9.233128834355828e-05, + "loss": 2.9367, + "step": 904 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.4218156337738037, + "learning_rate": 9.243353783231085e-05, + "loss": 2.9727, + "step": 905 + }, + { + "epoch": 0.27808471454880296, + "grad_norm": 1.7401793003082275, + "learning_rate": 9.25357873210634e-05, + "loss": 2.8957, + "step": 906 + }, + { + "epoch": 
0.2783916513198281, + "grad_norm": 2.2128076553344727, + "learning_rate": 9.263803680981595e-05, + "loss": 2.8725, + "step": 907 + }, + { + "epoch": 0.2786985880908533, + "grad_norm": 2.004179000854492, + "learning_rate": 9.274028629856851e-05, + "loss": 2.8879, + "step": 908 + }, + { + "epoch": 0.27900552486187846, + "grad_norm": 2.198784112930298, + "learning_rate": 9.284253578732107e-05, + "loss": 2.9655, + "step": 909 + }, + { + "epoch": 0.27931246163290363, + "grad_norm": 1.8064004182815552, + "learning_rate": 9.294478527607362e-05, + "loss": 2.7801, + "step": 910 + }, + { + "epoch": 0.2796193984039288, + "grad_norm": 2.1273581981658936, + "learning_rate": 9.304703476482618e-05, + "loss": 2.8615, + "step": 911 + }, + { + "epoch": 0.27992633517495397, + "grad_norm": 1.7843197584152222, + "learning_rate": 9.314928425357874e-05, + "loss": 2.8735, + "step": 912 + }, + { + "epoch": 0.28023327194597913, + "grad_norm": 2.234886884689331, + "learning_rate": 9.325153374233129e-05, + "loss": 2.9444, + "step": 913 + }, + { + "epoch": 0.2805402087170043, + "grad_norm": 2.0565783977508545, + "learning_rate": 9.335378323108384e-05, + "loss": 2.9784, + "step": 914 + }, + { + "epoch": 0.28084714548802947, + "grad_norm": 1.836901068687439, + "learning_rate": 9.345603271983641e-05, + "loss": 2.9217, + "step": 915 + }, + { + "epoch": 0.28115408225905464, + "grad_norm": 2.0981357097625732, + "learning_rate": 9.355828220858896e-05, + "loss": 2.9091, + "step": 916 + }, + { + "epoch": 0.2814610190300798, + "grad_norm": 1.9199821949005127, + "learning_rate": 9.366053169734152e-05, + "loss": 2.8882, + "step": 917 + }, + { + "epoch": 0.281767955801105, + "grad_norm": 1.9928756952285767, + "learning_rate": 9.376278118609407e-05, + "loss": 2.8463, + "step": 918 + }, + { + "epoch": 0.28207489257213014, + "grad_norm": 1.9580156803131104, + "learning_rate": 9.386503067484664e-05, + "loss": 2.7814, + "step": 919 + }, + { + "epoch": 0.2823818293431553, + "grad_norm": 2.016144275665283, + 
"learning_rate": 9.396728016359919e-05, + "loss": 2.8725, + "step": 920 + }, + { + "epoch": 0.2826887661141805, + "grad_norm": 1.967668890953064, + "learning_rate": 9.406952965235175e-05, + "loss": 2.912, + "step": 921 + }, + { + "epoch": 0.28299570288520565, + "grad_norm": 1.8826593160629272, + "learning_rate": 9.41717791411043e-05, + "loss": 2.7885, + "step": 922 + }, + { + "epoch": 0.2833026396562308, + "grad_norm": 2.0615732669830322, + "learning_rate": 9.427402862985685e-05, + "loss": 2.9111, + "step": 923 + }, + { + "epoch": 0.283609576427256, + "grad_norm": 1.7132701873779297, + "learning_rate": 9.43762781186094e-05, + "loss": 2.89, + "step": 924 + }, + { + "epoch": 0.28391651319828115, + "grad_norm": 2.1561272144317627, + "learning_rate": 9.447852760736196e-05, + "loss": 2.8741, + "step": 925 + }, + { + "epoch": 0.2842234499693063, + "grad_norm": 1.727338433265686, + "learning_rate": 9.458077709611453e-05, + "loss": 2.8449, + "step": 926 + }, + { + "epoch": 0.2845303867403315, + "grad_norm": 2.19234299659729, + "learning_rate": 9.468302658486708e-05, + "loss": 2.8499, + "step": 927 + }, + { + "epoch": 0.28483732351135665, + "grad_norm": 1.7370812892913818, + "learning_rate": 9.478527607361963e-05, + "loss": 2.882, + "step": 928 + }, + { + "epoch": 0.2851442602823818, + "grad_norm": 2.0576157569885254, + "learning_rate": 9.488752556237219e-05, + "loss": 2.7869, + "step": 929 + }, + { + "epoch": 0.285451197053407, + "grad_norm": 1.7926486730575562, + "learning_rate": 9.498977505112476e-05, + "loss": 2.906, + "step": 930 + }, + { + "epoch": 0.28575813382443216, + "grad_norm": 1.6877856254577637, + "learning_rate": 9.50920245398773e-05, + "loss": 2.8422, + "step": 931 + }, + { + "epoch": 0.2860650705954573, + "grad_norm": 2.3053178787231445, + "learning_rate": 9.519427402862986e-05, + "loss": 2.9039, + "step": 932 + }, + { + "epoch": 0.2863720073664825, + "grad_norm": 1.7746092081069946, + "learning_rate": 9.529652351738242e-05, + "loss": 2.9082, + "step": 933 
+ }, + { + "epoch": 0.28667894413750766, + "grad_norm": 2.1900086402893066, + "learning_rate": 9.539877300613498e-05, + "loss": 2.8511, + "step": 934 + }, + { + "epoch": 0.28698588090853283, + "grad_norm": 1.781988501548767, + "learning_rate": 9.550102249488752e-05, + "loss": 2.8264, + "step": 935 + }, + { + "epoch": 0.287292817679558, + "grad_norm": 1.845797061920166, + "learning_rate": 9.560327198364009e-05, + "loss": 2.8657, + "step": 936 + }, + { + "epoch": 0.28759975445058317, + "grad_norm": 1.8794586658477783, + "learning_rate": 9.570552147239264e-05, + "loss": 2.8365, + "step": 937 + }, + { + "epoch": 0.28790669122160834, + "grad_norm": 2.078359603881836, + "learning_rate": 9.58077709611452e-05, + "loss": 2.8829, + "step": 938 + }, + { + "epoch": 0.2882136279926335, + "grad_norm": 1.8091285228729248, + "learning_rate": 9.591002044989775e-05, + "loss": 2.8083, + "step": 939 + }, + { + "epoch": 0.28852056476365867, + "grad_norm": 2.0130608081817627, + "learning_rate": 9.601226993865032e-05, + "loss": 2.8922, + "step": 940 + }, + { + "epoch": 0.28882750153468384, + "grad_norm": 1.8504360914230347, + "learning_rate": 9.611451942740287e-05, + "loss": 2.8034, + "step": 941 + }, + { + "epoch": 0.289134438305709, + "grad_norm": 1.860420823097229, + "learning_rate": 9.621676891615543e-05, + "loss": 2.8249, + "step": 942 + }, + { + "epoch": 0.2894413750767342, + "grad_norm": 2.157158374786377, + "learning_rate": 9.631901840490798e-05, + "loss": 2.8629, + "step": 943 + }, + { + "epoch": 0.28974831184775934, + "grad_norm": 1.8066895008087158, + "learning_rate": 9.642126789366053e-05, + "loss": 2.7965, + "step": 944 + }, + { + "epoch": 0.2900552486187845, + "grad_norm": 1.9674500226974487, + "learning_rate": 9.65235173824131e-05, + "loss": 2.8043, + "step": 945 + }, + { + "epoch": 0.2903621853898097, + "grad_norm": 1.7899354696273804, + "learning_rate": 9.662576687116564e-05, + "loss": 2.8803, + "step": 946 + }, + { + "epoch": 0.29066912216083485, + "grad_norm": 
2.220201015472412, + "learning_rate": 9.672801635991821e-05, + "loss": 2.8201, + "step": 947 + }, + { + "epoch": 0.29097605893186, + "grad_norm": 1.76320219039917, + "learning_rate": 9.683026584867076e-05, + "loss": 2.8921, + "step": 948 + }, + { + "epoch": 0.2912829957028852, + "grad_norm": 1.6863081455230713, + "learning_rate": 9.693251533742331e-05, + "loss": 2.8208, + "step": 949 + }, + { + "epoch": 0.29158993247391035, + "grad_norm": 2.1578476428985596, + "learning_rate": 9.703476482617587e-05, + "loss": 2.8972, + "step": 950 + }, + { + "epoch": 0.2918968692449355, + "grad_norm": 1.6925181150436401, + "learning_rate": 9.713701431492844e-05, + "loss": 2.8225, + "step": 951 + }, + { + "epoch": 0.2922038060159607, + "grad_norm": 1.8861147165298462, + "learning_rate": 9.723926380368099e-05, + "loss": 2.8707, + "step": 952 + }, + { + "epoch": 0.29251074278698586, + "grad_norm": 1.5894604921340942, + "learning_rate": 9.734151329243354e-05, + "loss": 2.7576, + "step": 953 + }, + { + "epoch": 0.292817679558011, + "grad_norm": 1.9092673063278198, + "learning_rate": 9.74437627811861e-05, + "loss": 2.8659, + "step": 954 + }, + { + "epoch": 0.2931246163290362, + "grad_norm": 1.8600605726242065, + "learning_rate": 9.754601226993866e-05, + "loss": 2.752, + "step": 955 + }, + { + "epoch": 0.29343155310006136, + "grad_norm": 2.005805015563965, + "learning_rate": 9.76482617586912e-05, + "loss": 2.8511, + "step": 956 + }, + { + "epoch": 0.2937384898710866, + "grad_norm": 1.9485148191452026, + "learning_rate": 9.775051124744377e-05, + "loss": 2.9726, + "step": 957 + }, + { + "epoch": 0.29404542664211175, + "grad_norm": 1.9197280406951904, + "learning_rate": 9.785276073619632e-05, + "loss": 2.7753, + "step": 958 + }, + { + "epoch": 0.2943523634131369, + "grad_norm": 1.6279773712158203, + "learning_rate": 9.795501022494888e-05, + "loss": 2.8855, + "step": 959 + }, + { + "epoch": 0.2946593001841621, + "grad_norm": 2.0233097076416016, + "learning_rate": 9.805725971370143e-05, + 
"loss": 2.749, + "step": 960 + }, + { + "epoch": 0.29496623695518726, + "grad_norm": 1.550295352935791, + "learning_rate": 9.815950920245399e-05, + "loss": 2.7991, + "step": 961 + }, + { + "epoch": 0.2952731737262124, + "grad_norm": 2.3194360733032227, + "learning_rate": 9.826175869120655e-05, + "loss": 2.8208, + "step": 962 + }, + { + "epoch": 0.2955801104972376, + "grad_norm": 1.634867787361145, + "learning_rate": 9.83640081799591e-05, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.29588704726826276, + "grad_norm": 2.1152596473693848, + "learning_rate": 9.846625766871166e-05, + "loss": 2.7667, + "step": 964 + }, + { + "epoch": 0.2961939840392879, + "grad_norm": 1.8927233219146729, + "learning_rate": 9.856850715746421e-05, + "loss": 2.8308, + "step": 965 + }, + { + "epoch": 0.2965009208103131, + "grad_norm": 1.765026330947876, + "learning_rate": 9.867075664621678e-05, + "loss": 2.7546, + "step": 966 + }, + { + "epoch": 0.29680785758133826, + "grad_norm": 1.7491015195846558, + "learning_rate": 9.877300613496932e-05, + "loss": 2.8156, + "step": 967 + }, + { + "epoch": 0.29711479435236343, + "grad_norm": 1.8352077007293701, + "learning_rate": 9.887525562372189e-05, + "loss": 2.8542, + "step": 968 + }, + { + "epoch": 0.2974217311233886, + "grad_norm": 1.8892323970794678, + "learning_rate": 9.897750511247444e-05, + "loss": 2.8216, + "step": 969 + }, + { + "epoch": 0.29772866789441377, + "grad_norm": 1.7171403169631958, + "learning_rate": 9.907975460122701e-05, + "loss": 2.8428, + "step": 970 + }, + { + "epoch": 0.29803560466543894, + "grad_norm": 1.8318040370941162, + "learning_rate": 9.918200408997955e-05, + "loss": 2.7821, + "step": 971 + }, + { + "epoch": 0.2983425414364641, + "grad_norm": 1.5829975605010986, + "learning_rate": 9.928425357873212e-05, + "loss": 2.9091, + "step": 972 + }, + { + "epoch": 0.29864947820748927, + "grad_norm": 1.7248235940933228, + "learning_rate": 9.938650306748467e-05, + "loss": 2.7914, + "step": 973 + }, + { + "epoch": 
0.29895641497851444, + "grad_norm": 1.7741187810897827, + "learning_rate": 9.948875255623722e-05, + "loss": 2.8711, + "step": 974 + }, + { + "epoch": 0.2992633517495396, + "grad_norm": 1.7419151067733765, + "learning_rate": 9.959100204498978e-05, + "loss": 2.8933, + "step": 975 + }, + { + "epoch": 0.2995702885205648, + "grad_norm": 1.6603926420211792, + "learning_rate": 9.969325153374234e-05, + "loss": 2.7138, + "step": 976 + }, + { + "epoch": 0.29987722529158994, + "grad_norm": 1.8423576354980469, + "learning_rate": 9.97955010224949e-05, + "loss": 2.7776, + "step": 977 + }, + { + "epoch": 0.3001841620626151, + "grad_norm": 1.5548568964004517, + "learning_rate": 9.989775051124745e-05, + "loss": 2.8193, + "step": 978 + }, + { + "epoch": 0.3004910988336403, + "grad_norm": 1.711785078048706, + "learning_rate": 0.0001, + "loss": 2.7082, + "step": 979 + }, + { + "epoch": 0.30079803560466545, + "grad_norm": 1.6395221948623657, + "learning_rate": 9.999999975293535e-05, + "loss": 2.7526, + "step": 980 + }, + { + "epoch": 0.3011049723756906, + "grad_norm": 1.829174518585205, + "learning_rate": 9.999999901174139e-05, + "loss": 2.7555, + "step": 981 + }, + { + "epoch": 0.3014119091467158, + "grad_norm": 1.5807569026947021, + "learning_rate": 9.999999777641814e-05, + "loss": 2.848, + "step": 982 + }, + { + "epoch": 0.30171884591774095, + "grad_norm": 2.014803171157837, + "learning_rate": 9.99999960469656e-05, + "loss": 2.8318, + "step": 983 + }, + { + "epoch": 0.3020257826887661, + "grad_norm": 1.4732542037963867, + "learning_rate": 9.99999938233838e-05, + "loss": 2.8143, + "step": 984 + }, + { + "epoch": 0.3023327194597913, + "grad_norm": 2.4888343811035156, + "learning_rate": 9.999999110567275e-05, + "loss": 2.7979, + "step": 985 + }, + { + "epoch": 0.30263965623081646, + "grad_norm": 1.4265737533569336, + "learning_rate": 9.99999878938325e-05, + "loss": 2.7968, + "step": 986 + }, + { + "epoch": 0.3029465930018416, + "grad_norm": 2.0397326946258545, + "learning_rate": 
9.999998418786303e-05, + "loss": 2.7413, + "step": 987 + }, + { + "epoch": 0.3032535297728668, + "grad_norm": 1.6565579175949097, + "learning_rate": 9.999997998776443e-05, + "loss": 2.8249, + "step": 988 + }, + { + "epoch": 0.30356046654389196, + "grad_norm": 1.8470033407211304, + "learning_rate": 9.999997529353673e-05, + "loss": 2.7815, + "step": 989 + }, + { + "epoch": 0.30386740331491713, + "grad_norm": 1.571768045425415, + "learning_rate": 9.999997010517995e-05, + "loss": 2.7202, + "step": 990 + }, + { + "epoch": 0.3041743400859423, + "grad_norm": 1.6217811107635498, + "learning_rate": 9.999996442269417e-05, + "loss": 2.832, + "step": 991 + }, + { + "epoch": 0.30448127685696746, + "grad_norm": 1.745591640472412, + "learning_rate": 9.999995824607943e-05, + "loss": 2.8271, + "step": 992 + }, + { + "epoch": 0.30478821362799263, + "grad_norm": 1.6469355821609497, + "learning_rate": 9.99999515753358e-05, + "loss": 2.7699, + "step": 993 + }, + { + "epoch": 0.3050951503990178, + "grad_norm": 1.733182430267334, + "learning_rate": 9.999994441046334e-05, + "loss": 2.7927, + "step": 994 + }, + { + "epoch": 0.30540208717004297, + "grad_norm": 1.6043230295181274, + "learning_rate": 9.999993675146213e-05, + "loss": 2.7536, + "step": 995 + }, + { + "epoch": 0.30570902394106814, + "grad_norm": 1.8154711723327637, + "learning_rate": 9.999992859833222e-05, + "loss": 2.7795, + "step": 996 + }, + { + "epoch": 0.3060159607120933, + "grad_norm": 1.7553666830062866, + "learning_rate": 9.999991995107374e-05, + "loss": 2.8128, + "step": 997 + }, + { + "epoch": 0.3063228974831185, + "grad_norm": 1.702697992324829, + "learning_rate": 9.999991080968672e-05, + "loss": 2.7234, + "step": 998 + }, + { + "epoch": 0.30662983425414364, + "grad_norm": 1.512619972229004, + "learning_rate": 9.99999011741713e-05, + "loss": 2.7555, + "step": 999 + }, + { + "epoch": 0.3069367710251688, + "grad_norm": 1.735844612121582, + "learning_rate": 9.999989104452753e-05, + "loss": 2.7847, + "step": 1000 + }, + { 
+ "epoch": 0.307243707796194, + "grad_norm": 1.4687904119491577, + "learning_rate": 9.999988042075555e-05, + "loss": 2.8039, + "step": 1001 + }, + { + "epoch": 0.30755064456721914, + "grad_norm": 1.6867917776107788, + "learning_rate": 9.999986930285542e-05, + "loss": 2.7643, + "step": 1002 + }, + { + "epoch": 0.3078575813382443, + "grad_norm": 1.6974400281906128, + "learning_rate": 9.99998576908273e-05, + "loss": 2.7284, + "step": 1003 + }, + { + "epoch": 0.3081645181092695, + "grad_norm": 1.6622353792190552, + "learning_rate": 9.999984558467126e-05, + "loss": 2.8364, + "step": 1004 + }, + { + "epoch": 0.30847145488029465, + "grad_norm": 1.7920496463775635, + "learning_rate": 9.999983298438744e-05, + "loss": 2.7769, + "step": 1005 + }, + { + "epoch": 0.3087783916513198, + "grad_norm": 1.7111997604370117, + "learning_rate": 9.999981988997598e-05, + "loss": 2.7323, + "step": 1006 + }, + { + "epoch": 0.309085328422345, + "grad_norm": 1.6372064352035522, + "learning_rate": 9.9999806301437e-05, + "loss": 2.8128, + "step": 1007 + }, + { + "epoch": 0.30939226519337015, + "grad_norm": 1.841002345085144, + "learning_rate": 9.999979221877061e-05, + "loss": 2.7049, + "step": 1008 + }, + { + "epoch": 0.3096992019643953, + "grad_norm": 1.4474141597747803, + "learning_rate": 9.999977764197697e-05, + "loss": 2.64, + "step": 1009 + }, + { + "epoch": 0.3100061387354205, + "grad_norm": 1.6599560976028442, + "learning_rate": 9.999976257105622e-05, + "loss": 2.7989, + "step": 1010 + }, + { + "epoch": 0.31031307550644566, + "grad_norm": 1.7502890825271606, + "learning_rate": 9.999974700600851e-05, + "loss": 2.7949, + "step": 1011 + }, + { + "epoch": 0.3106200122774708, + "grad_norm": 1.8119313716888428, + "learning_rate": 9.9999730946834e-05, + "loss": 2.7577, + "step": 1012 + }, + { + "epoch": 0.310926949048496, + "grad_norm": 1.4398404359817505, + "learning_rate": 9.999971439353284e-05, + "loss": 2.7369, + "step": 1013 + }, + { + "epoch": 0.31123388581952116, + "grad_norm": 
1.8501840829849243, + "learning_rate": 9.999969734610522e-05, + "loss": 2.6651, + "step": 1014 + }, + { + "epoch": 0.31154082259054633, + "grad_norm": 1.450804352760315, + "learning_rate": 9.999967980455125e-05, + "loss": 2.7231, + "step": 1015 + }, + { + "epoch": 0.3118477593615715, + "grad_norm": 1.9445282220840454, + "learning_rate": 9.999966176887115e-05, + "loss": 2.795, + "step": 1016 + }, + { + "epoch": 0.31215469613259667, + "grad_norm": 1.6361008882522583, + "learning_rate": 9.99996432390651e-05, + "loss": 2.8894, + "step": 1017 + }, + { + "epoch": 0.31246163290362183, + "grad_norm": 2.0804831981658936, + "learning_rate": 9.999962421513325e-05, + "loss": 2.8313, + "step": 1018 + }, + { + "epoch": 0.312768569674647, + "grad_norm": 1.3779852390289307, + "learning_rate": 9.999960469707582e-05, + "loss": 2.6776, + "step": 1019 + }, + { + "epoch": 0.31307550644567217, + "grad_norm": 1.7727700471878052, + "learning_rate": 9.999958468489299e-05, + "loss": 2.8076, + "step": 1020 + }, + { + "epoch": 0.31338244321669734, + "grad_norm": 1.5273795127868652, + "learning_rate": 9.999956417858496e-05, + "loss": 2.7069, + "step": 1021 + }, + { + "epoch": 0.3136893799877225, + "grad_norm": 1.8135402202606201, + "learning_rate": 9.999954317815193e-05, + "loss": 2.7375, + "step": 1022 + }, + { + "epoch": 0.3139963167587477, + "grad_norm": 1.6642818450927734, + "learning_rate": 9.99995216835941e-05, + "loss": 2.8085, + "step": 1023 + }, + { + "epoch": 0.31430325352977284, + "grad_norm": 1.681378722190857, + "learning_rate": 9.999949969491169e-05, + "loss": 2.807, + "step": 1024 + }, + { + "epoch": 0.314610190300798, + "grad_norm": 1.5521160364151, + "learning_rate": 9.999947721210493e-05, + "loss": 2.7266, + "step": 1025 + }, + { + "epoch": 0.3149171270718232, + "grad_norm": 1.486830711364746, + "learning_rate": 9.999945423517403e-05, + "loss": 2.774, + "step": 1026 + }, + { + "epoch": 0.3152240638428484, + "grad_norm": 1.5730900764465332, + "learning_rate": 
9.99994307641192e-05, + "loss": 2.7101, + "step": 1027 + }, + { + "epoch": 0.31553100061387357, + "grad_norm": 1.4835596084594727, + "learning_rate": 9.999940679894071e-05, + "loss": 2.8195, + "step": 1028 + }, + { + "epoch": 0.31583793738489874, + "grad_norm": 1.7885956764221191, + "learning_rate": 9.999938233963877e-05, + "loss": 2.796, + "step": 1029 + }, + { + "epoch": 0.3161448741559239, + "grad_norm": 1.4036259651184082, + "learning_rate": 9.999935738621362e-05, + "loss": 2.7167, + "step": 1030 + }, + { + "epoch": 0.3164518109269491, + "grad_norm": 1.7480512857437134, + "learning_rate": 9.999933193866554e-05, + "loss": 2.6774, + "step": 1031 + }, + { + "epoch": 0.31675874769797424, + "grad_norm": 1.66177499294281, + "learning_rate": 9.999930599699473e-05, + "loss": 2.7635, + "step": 1032 + }, + { + "epoch": 0.3170656844689994, + "grad_norm": 1.5088306665420532, + "learning_rate": 9.999927956120147e-05, + "loss": 2.7284, + "step": 1033 + }, + { + "epoch": 0.3173726212400246, + "grad_norm": 1.6847199201583862, + "learning_rate": 9.999925263128605e-05, + "loss": 2.8287, + "step": 1034 + }, + { + "epoch": 0.31767955801104975, + "grad_norm": 1.6092369556427002, + "learning_rate": 9.999922520724869e-05, + "loss": 2.7189, + "step": 1035 + }, + { + "epoch": 0.3179864947820749, + "grad_norm": 1.41717529296875, + "learning_rate": 9.999919728908969e-05, + "loss": 2.7134, + "step": 1036 + }, + { + "epoch": 0.3182934315531001, + "grad_norm": 1.6256498098373413, + "learning_rate": 9.999916887680931e-05, + "loss": 2.7312, + "step": 1037 + }, + { + "epoch": 0.31860036832412525, + "grad_norm": 1.4934377670288086, + "learning_rate": 9.999913997040784e-05, + "loss": 2.7548, + "step": 1038 + }, + { + "epoch": 0.3189073050951504, + "grad_norm": 1.6037719249725342, + "learning_rate": 9.999911056988557e-05, + "loss": 2.7682, + "step": 1039 + }, + { + "epoch": 0.3192142418661756, + "grad_norm": 1.4746284484863281, + "learning_rate": 9.999908067524277e-05, + "loss": 2.7256, + "step": 
1040 + }, + { + "epoch": 0.31952117863720075, + "grad_norm": 1.4633710384368896, + "learning_rate": 9.999905028647976e-05, + "loss": 2.6779, + "step": 1041 + }, + { + "epoch": 0.3198281154082259, + "grad_norm": 1.6108646392822266, + "learning_rate": 9.999901940359684e-05, + "loss": 2.781, + "step": 1042 + }, + { + "epoch": 0.3201350521792511, + "grad_norm": 1.4130996465682983, + "learning_rate": 9.999898802659428e-05, + "loss": 2.6327, + "step": 1043 + }, + { + "epoch": 0.32044198895027626, + "grad_norm": 2.110307455062866, + "learning_rate": 9.999895615547244e-05, + "loss": 2.7965, + "step": 1044 + }, + { + "epoch": 0.3207489257213014, + "grad_norm": 1.500618815422058, + "learning_rate": 9.99989237902316e-05, + "loss": 2.7874, + "step": 1045 + }, + { + "epoch": 0.3210558624923266, + "grad_norm": 1.577890157699585, + "learning_rate": 9.999889093087207e-05, + "loss": 2.6816, + "step": 1046 + }, + { + "epoch": 0.32136279926335176, + "grad_norm": 1.2820981740951538, + "learning_rate": 9.999885757739422e-05, + "loss": 2.6799, + "step": 1047 + }, + { + "epoch": 0.32166973603437693, + "grad_norm": 1.629936695098877, + "learning_rate": 9.999882372979835e-05, + "loss": 2.6783, + "step": 1048 + }, + { + "epoch": 0.3219766728054021, + "grad_norm": 1.3119972944259644, + "learning_rate": 9.999878938808478e-05, + "loss": 2.6403, + "step": 1049 + }, + { + "epoch": 0.32228360957642727, + "grad_norm": 1.720093846321106, + "learning_rate": 9.999875455225389e-05, + "loss": 2.709, + "step": 1050 + }, + { + "epoch": 0.32259054634745243, + "grad_norm": 1.446273922920227, + "learning_rate": 9.999871922230599e-05, + "loss": 2.6463, + "step": 1051 + }, + { + "epoch": 0.3228974831184776, + "grad_norm": 1.5000908374786377, + "learning_rate": 9.999868339824145e-05, + "loss": 2.7502, + "step": 1052 + }, + { + "epoch": 0.32320441988950277, + "grad_norm": 1.6257869005203247, + "learning_rate": 9.999864708006061e-05, + "loss": 2.6984, + "step": 1053 + }, + { + "epoch": 0.32351135666052794, + 
"grad_norm": 1.509638786315918, + "learning_rate": 9.999861026776384e-05, + "loss": 2.6931, + "step": 1054 + }, + { + "epoch": 0.3238182934315531, + "grad_norm": 1.5305874347686768, + "learning_rate": 9.999857296135149e-05, + "loss": 2.8423, + "step": 1055 + }, + { + "epoch": 0.3241252302025783, + "grad_norm": 1.7664300203323364, + "learning_rate": 9.999853516082394e-05, + "loss": 2.7703, + "step": 1056 + }, + { + "epoch": 0.32443216697360344, + "grad_norm": 1.4633153676986694, + "learning_rate": 9.999849686618157e-05, + "loss": 2.7588, + "step": 1057 + }, + { + "epoch": 0.3247391037446286, + "grad_norm": 1.5177773237228394, + "learning_rate": 9.999845807742473e-05, + "loss": 2.7376, + "step": 1058 + }, + { + "epoch": 0.3250460405156538, + "grad_norm": 1.6122089624404907, + "learning_rate": 9.999841879455383e-05, + "loss": 2.7871, + "step": 1059 + }, + { + "epoch": 0.32535297728667895, + "grad_norm": 1.4331457614898682, + "learning_rate": 9.999837901756926e-05, + "loss": 2.6602, + "step": 1060 + }, + { + "epoch": 0.3256599140577041, + "grad_norm": 1.5714327096939087, + "learning_rate": 9.99983387464714e-05, + "loss": 2.6279, + "step": 1061 + }, + { + "epoch": 0.3259668508287293, + "grad_norm": 1.399731993675232, + "learning_rate": 9.999829798126065e-05, + "loss": 2.7957, + "step": 1062 + }, + { + "epoch": 0.32627378759975445, + "grad_norm": 1.694368839263916, + "learning_rate": 9.999825672193741e-05, + "loss": 2.6859, + "step": 1063 + }, + { + "epoch": 0.3265807243707796, + "grad_norm": 1.2585967779159546, + "learning_rate": 9.99982149685021e-05, + "loss": 2.7964, + "step": 1064 + }, + { + "epoch": 0.3268876611418048, + "grad_norm": 1.802262306213379, + "learning_rate": 9.999817272095512e-05, + "loss": 2.6325, + "step": 1065 + }, + { + "epoch": 0.32719459791282995, + "grad_norm": 1.213222861289978, + "learning_rate": 9.99981299792969e-05, + "loss": 2.718, + "step": 1066 + }, + { + "epoch": 0.3275015346838551, + "grad_norm": 1.5745760202407837, + "learning_rate": 
9.999808674352785e-05, + "loss": 2.8589, + "step": 1067 + }, + { + "epoch": 0.3278084714548803, + "grad_norm": 1.516995906829834, + "learning_rate": 9.999804301364839e-05, + "loss": 2.6691, + "step": 1068 + }, + { + "epoch": 0.32811540822590546, + "grad_norm": 1.4223122596740723, + "learning_rate": 9.999799878965897e-05, + "loss": 2.6899, + "step": 1069 + }, + { + "epoch": 0.3284223449969306, + "grad_norm": 1.4502828121185303, + "learning_rate": 9.999795407156003e-05, + "loss": 2.7801, + "step": 1070 + }, + { + "epoch": 0.3287292817679558, + "grad_norm": 1.4692026376724243, + "learning_rate": 9.999790885935198e-05, + "loss": 2.6869, + "step": 1071 + }, + { + "epoch": 0.32903621853898096, + "grad_norm": 1.4182246923446655, + "learning_rate": 9.999786315303532e-05, + "loss": 2.7802, + "step": 1072 + }, + { + "epoch": 0.32934315531000613, + "grad_norm": 1.781173586845398, + "learning_rate": 9.999781695261046e-05, + "loss": 2.7522, + "step": 1073 + }, + { + "epoch": 0.3296500920810313, + "grad_norm": 1.3958306312561035, + "learning_rate": 9.999777025807786e-05, + "loss": 2.6894, + "step": 1074 + }, + { + "epoch": 0.32995702885205647, + "grad_norm": 1.7938110828399658, + "learning_rate": 9.9997723069438e-05, + "loss": 2.6468, + "step": 1075 + }, + { + "epoch": 0.33026396562308163, + "grad_norm": 1.2314528226852417, + "learning_rate": 9.999767538669134e-05, + "loss": 2.7446, + "step": 1076 + }, + { + "epoch": 0.3305709023941068, + "grad_norm": 1.4881565570831299, + "learning_rate": 9.999762720983835e-05, + "loss": 2.6904, + "step": 1077 + }, + { + "epoch": 0.33087783916513197, + "grad_norm": 1.3903130292892456, + "learning_rate": 9.999757853887948e-05, + "loss": 2.7315, + "step": 1078 + }, + { + "epoch": 0.33118477593615714, + "grad_norm": 1.491129755973816, + "learning_rate": 9.999752937381525e-05, + "loss": 2.7325, + "step": 1079 + }, + { + "epoch": 0.3314917127071823, + "grad_norm": 1.4748190641403198, + "learning_rate": 9.999747971464612e-05, + "loss": 2.7288, + 
"step": 1080 + }, + { + "epoch": 0.3317986494782075, + "grad_norm": 1.5664055347442627, + "learning_rate": 9.99974295613726e-05, + "loss": 2.8225, + "step": 1081 + }, + { + "epoch": 0.33210558624923264, + "grad_norm": 1.4422696828842163, + "learning_rate": 9.999737891399518e-05, + "loss": 2.6537, + "step": 1082 + }, + { + "epoch": 0.3324125230202578, + "grad_norm": 1.397817850112915, + "learning_rate": 9.999732777251436e-05, + "loss": 2.6329, + "step": 1083 + }, + { + "epoch": 0.332719459791283, + "grad_norm": 1.4253548383712769, + "learning_rate": 9.999727613693063e-05, + "loss": 2.7028, + "step": 1084 + }, + { + "epoch": 0.33302639656230815, + "grad_norm": 1.4327688217163086, + "learning_rate": 9.999722400724451e-05, + "loss": 2.6524, + "step": 1085 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2485733032226562, + "learning_rate": 9.999717138345654e-05, + "loss": 2.7278, + "step": 1086 + }, + { + "epoch": 0.3336402701043585, + "grad_norm": 1.536656379699707, + "learning_rate": 9.999711826556719e-05, + "loss": 2.5858, + "step": 1087 + }, + { + "epoch": 0.33394720687538365, + "grad_norm": 1.4210286140441895, + "learning_rate": 9.999706465357703e-05, + "loss": 2.7057, + "step": 1088 + }, + { + "epoch": 0.3342541436464088, + "grad_norm": 1.4605839252471924, + "learning_rate": 9.999701054748657e-05, + "loss": 2.6461, + "step": 1089 + }, + { + "epoch": 0.334561080417434, + "grad_norm": 1.4764037132263184, + "learning_rate": 9.999695594729636e-05, + "loss": 2.608, + "step": 1090 + }, + { + "epoch": 0.33486801718845915, + "grad_norm": 1.630843162536621, + "learning_rate": 9.99969008530069e-05, + "loss": 2.6165, + "step": 1091 + }, + { + "epoch": 0.3351749539594843, + "grad_norm": 1.3693522214889526, + "learning_rate": 9.999684526461879e-05, + "loss": 2.72, + "step": 1092 + }, + { + "epoch": 0.3354818907305095, + "grad_norm": 1.609580636024475, + "learning_rate": 9.999678918213254e-05, + "loss": 2.7602, + "step": 1093 + }, + { + "epoch": 0.33578882750153466, + 
"grad_norm": 1.3815720081329346, + "learning_rate": 9.999673260554872e-05, + "loss": 2.6297, + "step": 1094 + }, + { + "epoch": 0.3360957642725598, + "grad_norm": 1.4511120319366455, + "learning_rate": 9.999667553486787e-05, + "loss": 2.7515, + "step": 1095 + }, + { + "epoch": 0.336402701043585, + "grad_norm": 1.486387848854065, + "learning_rate": 9.999661797009057e-05, + "loss": 2.6839, + "step": 1096 + }, + { + "epoch": 0.33670963781461016, + "grad_norm": 1.239160180091858, + "learning_rate": 9.999655991121739e-05, + "loss": 2.6033, + "step": 1097 + }, + { + "epoch": 0.3370165745856354, + "grad_norm": 1.499598741531372, + "learning_rate": 9.999650135824891e-05, + "loss": 2.5582, + "step": 1098 + }, + { + "epoch": 0.33732351135666055, + "grad_norm": 1.32973051071167, + "learning_rate": 9.999644231118571e-05, + "loss": 2.6253, + "step": 1099 + }, + { + "epoch": 0.3376304481276857, + "grad_norm": 1.4025259017944336, + "learning_rate": 9.999638277002833e-05, + "loss": 2.6199, + "step": 1100 + }, + { + "epoch": 0.3379373848987109, + "grad_norm": 1.3162082433700562, + "learning_rate": 9.999632273477742e-05, + "loss": 2.5528, + "step": 1101 + }, + { + "epoch": 0.33824432166973606, + "grad_norm": 1.5454723834991455, + "learning_rate": 9.999626220543352e-05, + "loss": 2.6724, + "step": 1102 + }, + { + "epoch": 0.3385512584407612, + "grad_norm": 1.45896315574646, + "learning_rate": 9.999620118199727e-05, + "loss": 2.688, + "step": 1103 + }, + { + "epoch": 0.3388581952117864, + "grad_norm": 1.3940998315811157, + "learning_rate": 9.999613966446926e-05, + "loss": 2.6991, + "step": 1104 + }, + { + "epoch": 0.33916513198281156, + "grad_norm": 1.4427480697631836, + "learning_rate": 9.999607765285009e-05, + "loss": 2.6869, + "step": 1105 + }, + { + "epoch": 0.33947206875383673, + "grad_norm": 1.260373830795288, + "learning_rate": 9.999601514714036e-05, + "loss": 2.7011, + "step": 1106 + }, + { + "epoch": 0.3397790055248619, + "grad_norm": 1.5985103845596313, + "learning_rate": 
9.999595214734072e-05, + "loss": 2.599, + "step": 1107 + }, + { + "epoch": 0.34008594229588707, + "grad_norm": 1.1968494653701782, + "learning_rate": 9.999588865345179e-05, + "loss": 2.6346, + "step": 1108 + }, + { + "epoch": 0.34039287906691224, + "grad_norm": 1.4565916061401367, + "learning_rate": 9.999582466547417e-05, + "loss": 2.6303, + "step": 1109 + }, + { + "epoch": 0.3406998158379374, + "grad_norm": 1.2992361783981323, + "learning_rate": 9.999576018340851e-05, + "loss": 2.6121, + "step": 1110 + }, + { + "epoch": 0.34100675260896257, + "grad_norm": 1.402471899986267, + "learning_rate": 9.999569520725543e-05, + "loss": 2.6697, + "step": 1111 + }, + { + "epoch": 0.34131368937998774, + "grad_norm": 1.3006439208984375, + "learning_rate": 9.99956297370156e-05, + "loss": 2.6347, + "step": 1112 + }, + { + "epoch": 0.3416206261510129, + "grad_norm": 1.4235650300979614, + "learning_rate": 9.999556377268966e-05, + "loss": 2.6869, + "step": 1113 + }, + { + "epoch": 0.3419275629220381, + "grad_norm": 1.3288183212280273, + "learning_rate": 9.999549731427824e-05, + "loss": 2.5834, + "step": 1114 + }, + { + "epoch": 0.34223449969306324, + "grad_norm": 1.430736780166626, + "learning_rate": 9.999543036178203e-05, + "loss": 2.6248, + "step": 1115 + }, + { + "epoch": 0.3425414364640884, + "grad_norm": 1.467417597770691, + "learning_rate": 9.999536291520167e-05, + "loss": 2.6563, + "step": 1116 + }, + { + "epoch": 0.3428483732351136, + "grad_norm": 1.3988397121429443, + "learning_rate": 9.999529497453782e-05, + "loss": 2.6634, + "step": 1117 + }, + { + "epoch": 0.34315531000613875, + "grad_norm": 1.2072746753692627, + "learning_rate": 9.999522653979117e-05, + "loss": 2.6129, + "step": 1118 + }, + { + "epoch": 0.3434622467771639, + "grad_norm": 1.5297373533248901, + "learning_rate": 9.999515761096239e-05, + "loss": 2.6359, + "step": 1119 + }, + { + "epoch": 0.3437691835481891, + "grad_norm": 1.2022082805633545, + "learning_rate": 9.999508818805214e-05, + "loss": 2.6934, + 
"step": 1120 + }, + { + "epoch": 0.34407612031921425, + "grad_norm": 1.5655800104141235, + "learning_rate": 9.999501827106114e-05, + "loss": 2.6132, + "step": 1121 + }, + { + "epoch": 0.3443830570902394, + "grad_norm": 1.1639407873153687, + "learning_rate": 9.999494785999007e-05, + "loss": 2.6416, + "step": 1122 + }, + { + "epoch": 0.3446899938612646, + "grad_norm": 1.5784116983413696, + "learning_rate": 9.999487695483962e-05, + "loss": 2.5967, + "step": 1123 + }, + { + "epoch": 0.34499693063228976, + "grad_norm": 1.1812770366668701, + "learning_rate": 9.999480555561049e-05, + "loss": 2.6303, + "step": 1124 + }, + { + "epoch": 0.3453038674033149, + "grad_norm": 1.5105888843536377, + "learning_rate": 9.99947336623034e-05, + "loss": 2.58, + "step": 1125 + }, + { + "epoch": 0.3456108041743401, + "grad_norm": 1.2969506978988647, + "learning_rate": 9.999466127491904e-05, + "loss": 2.6857, + "step": 1126 + }, + { + "epoch": 0.34591774094536526, + "grad_norm": 1.679018259048462, + "learning_rate": 9.999458839345812e-05, + "loss": 2.6304, + "step": 1127 + }, + { + "epoch": 0.3462246777163904, + "grad_norm": 1.2718015909194946, + "learning_rate": 9.99945150179214e-05, + "loss": 2.6929, + "step": 1128 + }, + { + "epoch": 0.3465316144874156, + "grad_norm": 1.5834014415740967, + "learning_rate": 9.999444114830957e-05, + "loss": 2.6477, + "step": 1129 + }, + { + "epoch": 0.34683855125844076, + "grad_norm": 1.1575955152511597, + "learning_rate": 9.999436678462338e-05, + "loss": 2.6908, + "step": 1130 + }, + { + "epoch": 0.34714548802946593, + "grad_norm": 1.6231988668441772, + "learning_rate": 9.999429192686352e-05, + "loss": 2.6741, + "step": 1131 + }, + { + "epoch": 0.3474524248004911, + "grad_norm": 1.1616390943527222, + "learning_rate": 9.99942165750308e-05, + "loss": 2.5977, + "step": 1132 + }, + { + "epoch": 0.34775936157151627, + "grad_norm": 1.6188498735427856, + "learning_rate": 9.999414072912592e-05, + "loss": 2.6776, + "step": 1133 + }, + { + "epoch": 
0.34806629834254144, + "grad_norm": 1.3885529041290283, + "learning_rate": 9.999406438914962e-05, + "loss": 2.7136, + "step": 1134 + }, + { + "epoch": 0.3483732351135666, + "grad_norm": 1.4522851705551147, + "learning_rate": 9.999398755510269e-05, + "loss": 2.6817, + "step": 1135 + }, + { + "epoch": 0.34868017188459177, + "grad_norm": 1.2695082426071167, + "learning_rate": 9.999391022698588e-05, + "loss": 2.6257, + "step": 1136 + }, + { + "epoch": 0.34898710865561694, + "grad_norm": 1.1735594272613525, + "learning_rate": 9.999383240479993e-05, + "loss": 2.5908, + "step": 1137 + }, + { + "epoch": 0.3492940454266421, + "grad_norm": 1.4158523082733154, + "learning_rate": 9.999375408854564e-05, + "loss": 2.572, + "step": 1138 + }, + { + "epoch": 0.3496009821976673, + "grad_norm": 1.1342333555221558, + "learning_rate": 9.999367527822376e-05, + "loss": 2.6918, + "step": 1139 + }, + { + "epoch": 0.34990791896869244, + "grad_norm": 1.4462997913360596, + "learning_rate": 9.999359597383509e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 0.3502148557397176, + "grad_norm": 1.254346251487732, + "learning_rate": 9.99935161753804e-05, + "loss": 2.6426, + "step": 1141 + }, + { + "epoch": 0.3505217925107428, + "grad_norm": 1.5101851224899292, + "learning_rate": 9.999343588286048e-05, + "loss": 2.6261, + "step": 1142 + }, + { + "epoch": 0.35082872928176795, + "grad_norm": 1.2910065650939941, + "learning_rate": 9.999335509627612e-05, + "loss": 2.5587, + "step": 1143 + }, + { + "epoch": 0.3511356660527931, + "grad_norm": 1.4421133995056152, + "learning_rate": 9.999327381562812e-05, + "loss": 2.6812, + "step": 1144 + }, + { + "epoch": 0.3514426028238183, + "grad_norm": 1.3265037536621094, + "learning_rate": 9.999319204091728e-05, + "loss": 2.6506, + "step": 1145 + }, + { + "epoch": 0.35174953959484345, + "grad_norm": 1.346258521080017, + "learning_rate": 9.999310977214443e-05, + "loss": 2.7038, + "step": 1146 + }, + { + "epoch": 0.3520564763658686, + "grad_norm": 
1.3683836460113525, + "learning_rate": 9.999302700931037e-05, + "loss": 2.5823, + "step": 1147 + }, + { + "epoch": 0.3523634131368938, + "grad_norm": 1.3593783378601074, + "learning_rate": 9.99929437524159e-05, + "loss": 2.5705, + "step": 1148 + }, + { + "epoch": 0.35267034990791896, + "grad_norm": 1.4077095985412598, + "learning_rate": 9.999286000146186e-05, + "loss": 2.6259, + "step": 1149 + }, + { + "epoch": 0.3529772866789441, + "grad_norm": 1.3095922470092773, + "learning_rate": 9.99927757564491e-05, + "loss": 2.683, + "step": 1150 + }, + { + "epoch": 0.3532842234499693, + "grad_norm": 1.4188631772994995, + "learning_rate": 9.999269101737841e-05, + "loss": 2.619, + "step": 1151 + }, + { + "epoch": 0.35359116022099446, + "grad_norm": 1.2483123540878296, + "learning_rate": 9.999260578425063e-05, + "loss": 2.6477, + "step": 1152 + }, + { + "epoch": 0.35389809699201963, + "grad_norm": 1.4601099491119385, + "learning_rate": 9.999252005706663e-05, + "loss": 2.5861, + "step": 1153 + }, + { + "epoch": 0.3542050337630448, + "grad_norm": 1.107335090637207, + "learning_rate": 9.999243383582726e-05, + "loss": 2.6308, + "step": 1154 + }, + { + "epoch": 0.35451197053406996, + "grad_norm": 1.60590398311615, + "learning_rate": 9.999234712053334e-05, + "loss": 2.7057, + "step": 1155 + }, + { + "epoch": 0.35481890730509513, + "grad_norm": 1.2256578207015991, + "learning_rate": 9.999225991118575e-05, + "loss": 2.6371, + "step": 1156 + }, + { + "epoch": 0.3551258440761203, + "grad_norm": 1.4451910257339478, + "learning_rate": 9.999217220778535e-05, + "loss": 2.6424, + "step": 1157 + }, + { + "epoch": 0.35543278084714547, + "grad_norm": 1.184781789779663, + "learning_rate": 9.999208401033299e-05, + "loss": 2.6576, + "step": 1158 + }, + { + "epoch": 0.35573971761817064, + "grad_norm": 1.3395711183547974, + "learning_rate": 9.999199531882956e-05, + "loss": 2.6109, + "step": 1159 + }, + { + "epoch": 0.3560466543891958, + "grad_norm": 1.2052571773529053, + "learning_rate": 
9.999190613327594e-05, + "loss": 2.5486, + "step": 1160 + }, + { + "epoch": 0.356353591160221, + "grad_norm": 1.2690850496292114, + "learning_rate": 9.999181645367299e-05, + "loss": 2.6457, + "step": 1161 + }, + { + "epoch": 0.35666052793124614, + "grad_norm": 1.2832787036895752, + "learning_rate": 9.999172628002162e-05, + "loss": 2.6097, + "step": 1162 + }, + { + "epoch": 0.3569674647022713, + "grad_norm": 1.3791579008102417, + "learning_rate": 9.999163561232272e-05, + "loss": 2.7458, + "step": 1163 + }, + { + "epoch": 0.3572744014732965, + "grad_norm": 1.260743498802185, + "learning_rate": 9.999154445057715e-05, + "loss": 2.594, + "step": 1164 + }, + { + "epoch": 0.35758133824432164, + "grad_norm": 1.1595406532287598, + "learning_rate": 9.999145279478585e-05, + "loss": 2.5315, + "step": 1165 + }, + { + "epoch": 0.3578882750153468, + "grad_norm": 1.3424396514892578, + "learning_rate": 9.999136064494972e-05, + "loss": 2.6017, + "step": 1166 + }, + { + "epoch": 0.358195211786372, + "grad_norm": 1.317750334739685, + "learning_rate": 9.999126800106963e-05, + "loss": 2.5787, + "step": 1167 + }, + { + "epoch": 0.35850214855739715, + "grad_norm": 1.104471206665039, + "learning_rate": 9.999117486314657e-05, + "loss": 2.6801, + "step": 1168 + }, + { + "epoch": 0.3588090853284224, + "grad_norm": 1.5555830001831055, + "learning_rate": 9.99910812311814e-05, + "loss": 2.6575, + "step": 1169 + }, + { + "epoch": 0.35911602209944754, + "grad_norm": 1.1883453130722046, + "learning_rate": 9.999098710517507e-05, + "loss": 2.5801, + "step": 1170 + }, + { + "epoch": 0.3594229588704727, + "grad_norm": 1.3885222673416138, + "learning_rate": 9.99908924851285e-05, + "loss": 2.5637, + "step": 1171 + }, + { + "epoch": 0.3597298956414979, + "grad_norm": 1.1860510110855103, + "learning_rate": 9.999079737104262e-05, + "loss": 2.6528, + "step": 1172 + }, + { + "epoch": 0.36003683241252304, + "grad_norm": 1.4319096803665161, + "learning_rate": 9.99907017629184e-05, + "loss": 2.579, + "step": 
1173 + }, + { + "epoch": 0.3603437691835482, + "grad_norm": 1.256819725036621, + "learning_rate": 9.999060566075676e-05, + "loss": 2.5638, + "step": 1174 + }, + { + "epoch": 0.3606507059545734, + "grad_norm": 1.5452641248703003, + "learning_rate": 9.999050906455865e-05, + "loss": 2.6318, + "step": 1175 + }, + { + "epoch": 0.36095764272559855, + "grad_norm": 1.1933847665786743, + "learning_rate": 9.999041197432503e-05, + "loss": 2.5451, + "step": 1176 + }, + { + "epoch": 0.3612645794966237, + "grad_norm": 1.245689034461975, + "learning_rate": 9.999031439005684e-05, + "loss": 2.5452, + "step": 1177 + }, + { + "epoch": 0.3615715162676489, + "grad_norm": 1.2228111028671265, + "learning_rate": 9.99902163117551e-05, + "loss": 2.5856, + "step": 1178 + }, + { + "epoch": 0.36187845303867405, + "grad_norm": 1.3547098636627197, + "learning_rate": 9.999011773942071e-05, + "loss": 2.6604, + "step": 1179 + }, + { + "epoch": 0.3621853898096992, + "grad_norm": 1.25395929813385, + "learning_rate": 9.999001867305469e-05, + "loss": 2.5947, + "step": 1180 + }, + { + "epoch": 0.3624923265807244, + "grad_norm": 1.1676687002182007, + "learning_rate": 9.9989919112658e-05, + "loss": 2.5728, + "step": 1181 + }, + { + "epoch": 0.36279926335174956, + "grad_norm": 1.2076375484466553, + "learning_rate": 9.998981905823163e-05, + "loss": 2.569, + "step": 1182 + }, + { + "epoch": 0.3631062001227747, + "grad_norm": 1.3417900800704956, + "learning_rate": 9.998971850977659e-05, + "loss": 2.5552, + "step": 1183 + }, + { + "epoch": 0.3634131368937999, + "grad_norm": 1.135088324546814, + "learning_rate": 9.998961746729383e-05, + "loss": 2.5883, + "step": 1184 + }, + { + "epoch": 0.36372007366482506, + "grad_norm": 1.3329869508743286, + "learning_rate": 9.998951593078438e-05, + "loss": 2.6398, + "step": 1185 + }, + { + "epoch": 0.36402701043585023, + "grad_norm": 1.1681292057037354, + "learning_rate": 9.998941390024923e-05, + "loss": 2.6082, + "step": 1186 + }, + { + "epoch": 0.3643339472068754, + 
"grad_norm": 1.4083843231201172, + "learning_rate": 9.998931137568939e-05, + "loss": 2.6585, + "step": 1187 + }, + { + "epoch": 0.36464088397790057, + "grad_norm": 1.0879896879196167, + "learning_rate": 9.998920835710587e-05, + "loss": 2.4779, + "step": 1188 + }, + { + "epoch": 0.36494782074892573, + "grad_norm": 1.2977828979492188, + "learning_rate": 9.99891048444997e-05, + "loss": 2.6586, + "step": 1189 + }, + { + "epoch": 0.3652547575199509, + "grad_norm": 1.2552378177642822, + "learning_rate": 9.998900083787188e-05, + "loss": 2.5211, + "step": 1190 + }, + { + "epoch": 0.36556169429097607, + "grad_norm": 1.178227186203003, + "learning_rate": 9.998889633722348e-05, + "loss": 2.5365, + "step": 1191 + }, + { + "epoch": 0.36586863106200124, + "grad_norm": 1.36601722240448, + "learning_rate": 9.99887913425555e-05, + "loss": 2.6108, + "step": 1192 + }, + { + "epoch": 0.3661755678330264, + "grad_norm": 1.1947816610336304, + "learning_rate": 9.998868585386898e-05, + "loss": 2.5269, + "step": 1193 + }, + { + "epoch": 0.3664825046040516, + "grad_norm": 1.3113429546356201, + "learning_rate": 9.998857987116497e-05, + "loss": 2.5241, + "step": 1194 + }, + { + "epoch": 0.36678944137507674, + "grad_norm": 1.1573466062545776, + "learning_rate": 9.99884733944445e-05, + "loss": 2.5772, + "step": 1195 + }, + { + "epoch": 0.3670963781461019, + "grad_norm": 1.3841795921325684, + "learning_rate": 9.998836642370866e-05, + "loss": 2.6254, + "step": 1196 + }, + { + "epoch": 0.3674033149171271, + "grad_norm": 1.3332045078277588, + "learning_rate": 9.998825895895848e-05, + "loss": 2.6846, + "step": 1197 + }, + { + "epoch": 0.36771025168815225, + "grad_norm": 1.1578748226165771, + "learning_rate": 9.9988151000195e-05, + "loss": 2.4717, + "step": 1198 + }, + { + "epoch": 0.3680171884591774, + "grad_norm": 1.1045753955841064, + "learning_rate": 9.998804254741934e-05, + "loss": 2.6433, + "step": 1199 + }, + { + "epoch": 0.3683241252302026, + "grad_norm": 1.3260962963104248, + "learning_rate": 
9.998793360063254e-05, + "loss": 2.6385, + "step": 1200 + }, + { + "epoch": 0.36863106200122775, + "grad_norm": 1.1483805179595947, + "learning_rate": 9.998782415983568e-05, + "loss": 2.6013, + "step": 1201 + }, + { + "epoch": 0.3689379987722529, + "grad_norm": 1.1897181272506714, + "learning_rate": 9.998771422502984e-05, + "loss": 2.485, + "step": 1202 + }, + { + "epoch": 0.3692449355432781, + "grad_norm": 1.2124346494674683, + "learning_rate": 9.99876037962161e-05, + "loss": 2.6271, + "step": 1203 + }, + { + "epoch": 0.36955187231430325, + "grad_norm": 1.2274240255355835, + "learning_rate": 9.998749287339557e-05, + "loss": 2.6072, + "step": 1204 + }, + { + "epoch": 0.3698588090853284, + "grad_norm": 1.2045015096664429, + "learning_rate": 9.998738145656934e-05, + "loss": 2.5567, + "step": 1205 + }, + { + "epoch": 0.3701657458563536, + "grad_norm": 1.187698483467102, + "learning_rate": 9.998726954573852e-05, + "loss": 2.6251, + "step": 1206 + }, + { + "epoch": 0.37047268262737876, + "grad_norm": 1.1760836839675903, + "learning_rate": 9.998715714090419e-05, + "loss": 2.6544, + "step": 1207 + }, + { + "epoch": 0.3707796193984039, + "grad_norm": 1.2181260585784912, + "learning_rate": 9.998704424206746e-05, + "loss": 2.6258, + "step": 1208 + }, + { + "epoch": 0.3710865561694291, + "grad_norm": 1.2106094360351562, + "learning_rate": 9.998693084922947e-05, + "loss": 2.5932, + "step": 1209 + }, + { + "epoch": 0.37139349294045426, + "grad_norm": 1.2973625659942627, + "learning_rate": 9.998681696239133e-05, + "loss": 2.5257, + "step": 1210 + }, + { + "epoch": 0.37170042971147943, + "grad_norm": 1.2477924823760986, + "learning_rate": 9.998670258155417e-05, + "loss": 2.6579, + "step": 1211 + }, + { + "epoch": 0.3720073664825046, + "grad_norm": 1.3301422595977783, + "learning_rate": 9.998658770671913e-05, + "loss": 2.4903, + "step": 1212 + }, + { + "epoch": 0.37231430325352977, + "grad_norm": 1.224321722984314, + "learning_rate": 9.998647233788732e-05, + "loss": 2.5865, + 
"step": 1213 + }, + { + "epoch": 0.37262124002455493, + "grad_norm": 1.3110655546188354, + "learning_rate": 9.99863564750599e-05, + "loss": 2.6134, + "step": 1214 + }, + { + "epoch": 0.3729281767955801, + "grad_norm": 1.2323014736175537, + "learning_rate": 9.998624011823801e-05, + "loss": 2.5892, + "step": 1215 + }, + { + "epoch": 0.37323511356660527, + "grad_norm": 1.0873770713806152, + "learning_rate": 9.998612326742279e-05, + "loss": 2.4897, + "step": 1216 + }, + { + "epoch": 0.37354205033763044, + "grad_norm": 1.2789679765701294, + "learning_rate": 9.998600592261539e-05, + "loss": 2.5603, + "step": 1217 + }, + { + "epoch": 0.3738489871086556, + "grad_norm": 1.1311540603637695, + "learning_rate": 9.998588808381699e-05, + "loss": 2.5327, + "step": 1218 + }, + { + "epoch": 0.3741559238796808, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.998576975102876e-05, + "loss": 2.4789, + "step": 1219 + }, + { + "epoch": 0.37446286065070594, + "grad_norm": 1.1840651035308838, + "learning_rate": 9.998565092425182e-05, + "loss": 2.5026, + "step": 1220 + }, + { + "epoch": 0.3747697974217311, + "grad_norm": 1.3145099878311157, + "learning_rate": 9.998553160348743e-05, + "loss": 2.5424, + "step": 1221 + }, + { + "epoch": 0.3750767341927563, + "grad_norm": 1.2192758321762085, + "learning_rate": 9.998541178873668e-05, + "loss": 2.5556, + "step": 1222 + }, + { + "epoch": 0.37538367096378145, + "grad_norm": 1.1329905986785889, + "learning_rate": 9.99852914800008e-05, + "loss": 2.4624, + "step": 1223 + }, + { + "epoch": 0.3756906077348066, + "grad_norm": 1.2490339279174805, + "learning_rate": 9.9985170677281e-05, + "loss": 2.5016, + "step": 1224 + }, + { + "epoch": 0.3759975445058318, + "grad_norm": 1.1884582042694092, + "learning_rate": 9.998504938057841e-05, + "loss": 2.5345, + "step": 1225 + }, + { + "epoch": 0.37630448127685695, + "grad_norm": 1.2075775861740112, + "learning_rate": 9.998492758989428e-05, + "loss": 2.5206, + "step": 1226 + }, + { + "epoch": 
0.3766114180478821, + "grad_norm": 1.238457441329956, + "learning_rate": 9.99848053052298e-05, + "loss": 2.6748, + "step": 1227 + }, + { + "epoch": 0.3769183548189073, + "grad_norm": 1.3056883811950684, + "learning_rate": 9.998468252658618e-05, + "loss": 2.6146, + "step": 1228 + }, + { + "epoch": 0.37722529158993245, + "grad_norm": 1.191575050354004, + "learning_rate": 9.998455925396461e-05, + "loss": 2.4743, + "step": 1229 + }, + { + "epoch": 0.3775322283609576, + "grad_norm": 1.2834603786468506, + "learning_rate": 9.998443548736635e-05, + "loss": 2.5504, + "step": 1230 + }, + { + "epoch": 0.3778391651319828, + "grad_norm": 1.3023632764816284, + "learning_rate": 9.99843112267926e-05, + "loss": 2.5832, + "step": 1231 + }, + { + "epoch": 0.37814610190300796, + "grad_norm": 1.1219336986541748, + "learning_rate": 9.998418647224458e-05, + "loss": 2.5715, + "step": 1232 + }, + { + "epoch": 0.3784530386740331, + "grad_norm": 1.0666810274124146, + "learning_rate": 9.998406122372354e-05, + "loss": 2.4865, + "step": 1233 + }, + { + "epoch": 0.3787599754450583, + "grad_norm": 1.3699263334274292, + "learning_rate": 9.998393548123072e-05, + "loss": 2.5523, + "step": 1234 + }, + { + "epoch": 0.37906691221608346, + "grad_norm": 1.1383014917373657, + "learning_rate": 9.998380924476733e-05, + "loss": 2.7054, + "step": 1235 + }, + { + "epoch": 0.37937384898710863, + "grad_norm": 1.1304205656051636, + "learning_rate": 9.998368251433465e-05, + "loss": 2.5007, + "step": 1236 + }, + { + "epoch": 0.3796807857581338, + "grad_norm": 1.2220405340194702, + "learning_rate": 9.998355528993394e-05, + "loss": 2.5635, + "step": 1237 + }, + { + "epoch": 0.37998772252915897, + "grad_norm": 1.1126691102981567, + "learning_rate": 9.998342757156642e-05, + "loss": 2.5795, + "step": 1238 + }, + { + "epoch": 0.38029465930018413, + "grad_norm": 1.1675945520401, + "learning_rate": 9.998329935923339e-05, + "loss": 2.564, + "step": 1239 + }, + { + "epoch": 0.38060159607120936, + "grad_norm": 
1.1286569833755493, + "learning_rate": 9.998317065293607e-05, + "loss": 2.5476, + "step": 1240 + }, + { + "epoch": 0.3809085328422345, + "grad_norm": 1.1252213716506958, + "learning_rate": 9.998304145267579e-05, + "loss": 2.5406, + "step": 1241 + }, + { + "epoch": 0.3812154696132597, + "grad_norm": 1.1931700706481934, + "learning_rate": 9.998291175845378e-05, + "loss": 2.5277, + "step": 1242 + }, + { + "epoch": 0.38152240638428486, + "grad_norm": 1.2148306369781494, + "learning_rate": 9.998278157027136e-05, + "loss": 2.5178, + "step": 1243 + }, + { + "epoch": 0.38182934315531003, + "grad_norm": 1.1597660779953003, + "learning_rate": 9.998265088812978e-05, + "loss": 2.5522, + "step": 1244 + }, + { + "epoch": 0.3821362799263352, + "grad_norm": 1.105973243713379, + "learning_rate": 9.998251971203035e-05, + "loss": 2.4558, + "step": 1245 + }, + { + "epoch": 0.38244321669736037, + "grad_norm": 1.1082781553268433, + "learning_rate": 9.998238804197437e-05, + "loss": 2.5504, + "step": 1246 + }, + { + "epoch": 0.38275015346838553, + "grad_norm": 1.2124732732772827, + "learning_rate": 9.998225587796312e-05, + "loss": 2.5536, + "step": 1247 + }, + { + "epoch": 0.3830570902394107, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.998212321999795e-05, + "loss": 2.4837, + "step": 1248 + }, + { + "epoch": 0.38336402701043587, + "grad_norm": 1.353562355041504, + "learning_rate": 9.998199006808014e-05, + "loss": 2.4554, + "step": 1249 + }, + { + "epoch": 0.38367096378146104, + "grad_norm": 1.2103357315063477, + "learning_rate": 9.998185642221098e-05, + "loss": 2.4843, + "step": 1250 + }, + { + "epoch": 0.3839779005524862, + "grad_norm": 1.2572352886199951, + "learning_rate": 9.998172228239185e-05, + "loss": 2.497, + "step": 1251 + }, + { + "epoch": 0.3842848373235114, + "grad_norm": 1.0910226106643677, + "learning_rate": 9.998158764862402e-05, + "loss": 2.577, + "step": 1252 + }, + { + "epoch": 0.38459177409453654, + "grad_norm": 1.2550606727600098, + "learning_rate": 
9.998145252090886e-05, + "loss": 2.5087, + "step": 1253 + }, + { + "epoch": 0.3848987108655617, + "grad_norm": 1.0103787183761597, + "learning_rate": 9.998131689924768e-05, + "loss": 2.5306, + "step": 1254 + }, + { + "epoch": 0.3852056476365869, + "grad_norm": 1.2965941429138184, + "learning_rate": 9.998118078364184e-05, + "loss": 2.5622, + "step": 1255 + }, + { + "epoch": 0.38551258440761205, + "grad_norm": 1.0791535377502441, + "learning_rate": 9.998104417409269e-05, + "loss": 2.5608, + "step": 1256 + }, + { + "epoch": 0.3858195211786372, + "grad_norm": 1.3277596235275269, + "learning_rate": 9.998090707060155e-05, + "loss": 2.5748, + "step": 1257 + }, + { + "epoch": 0.3861264579496624, + "grad_norm": 1.004031777381897, + "learning_rate": 9.99807694731698e-05, + "loss": 2.5532, + "step": 1258 + }, + { + "epoch": 0.38643339472068755, + "grad_norm": 1.4802277088165283, + "learning_rate": 9.998063138179877e-05, + "loss": 2.585, + "step": 1259 + }, + { + "epoch": 0.3867403314917127, + "grad_norm": 1.0821146965026855, + "learning_rate": 9.998049279648987e-05, + "loss": 2.5248, + "step": 1260 + }, + { + "epoch": 0.3870472682627379, + "grad_norm": 1.2902108430862427, + "learning_rate": 9.998035371724443e-05, + "loss": 2.5134, + "step": 1261 + }, + { + "epoch": 0.38735420503376305, + "grad_norm": 1.082943320274353, + "learning_rate": 9.998021414406385e-05, + "loss": 2.5937, + "step": 1262 + }, + { + "epoch": 0.3876611418047882, + "grad_norm": 1.2164193391799927, + "learning_rate": 9.998007407694949e-05, + "loss": 2.5106, + "step": 1263 + }, + { + "epoch": 0.3879680785758134, + "grad_norm": 1.0999115705490112, + "learning_rate": 9.997993351590276e-05, + "loss": 2.5458, + "step": 1264 + }, + { + "epoch": 0.38827501534683856, + "grad_norm": 1.2275537252426147, + "learning_rate": 9.997979246092503e-05, + "loss": 2.5664, + "step": 1265 + }, + { + "epoch": 0.3885819521178637, + "grad_norm": 1.3246204853057861, + "learning_rate": 9.997965091201769e-05, + "loss": 2.5289, + 
"step": 1266 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.2404677867889404, + "learning_rate": 9.997950886918214e-05, + "loss": 2.5302, + "step": 1267 + }, + { + "epoch": 0.38919582565991406, + "grad_norm": 1.0993810892105103, + "learning_rate": 9.99793663324198e-05, + "loss": 2.5085, + "step": 1268 + }, + { + "epoch": 0.38950276243093923, + "grad_norm": 1.3394049406051636, + "learning_rate": 9.997922330173206e-05, + "loss": 2.5882, + "step": 1269 + }, + { + "epoch": 0.3898096992019644, + "grad_norm": 1.1464321613311768, + "learning_rate": 9.997907977712036e-05, + "loss": 2.5211, + "step": 1270 + }, + { + "epoch": 0.39011663597298957, + "grad_norm": 1.1246297359466553, + "learning_rate": 9.997893575858608e-05, + "loss": 2.4204, + "step": 1271 + }, + { + "epoch": 0.39042357274401474, + "grad_norm": 1.1278076171875, + "learning_rate": 9.997879124613067e-05, + "loss": 2.4405, + "step": 1272 + }, + { + "epoch": 0.3907305095150399, + "grad_norm": 1.2284942865371704, + "learning_rate": 9.997864623975555e-05, + "loss": 2.5674, + "step": 1273 + }, + { + "epoch": 0.39103744628606507, + "grad_norm": 1.1243138313293457, + "learning_rate": 9.997850073946215e-05, + "loss": 2.489, + "step": 1274 + }, + { + "epoch": 0.39134438305709024, + "grad_norm": 1.198461890220642, + "learning_rate": 9.997835474525193e-05, + "loss": 2.51, + "step": 1275 + }, + { + "epoch": 0.3916513198281154, + "grad_norm": 1.1643213033676147, + "learning_rate": 9.997820825712629e-05, + "loss": 2.5688, + "step": 1276 + }, + { + "epoch": 0.3919582565991406, + "grad_norm": 1.2107082605361938, + "learning_rate": 9.997806127508671e-05, + "loss": 2.5614, + "step": 1277 + }, + { + "epoch": 0.39226519337016574, + "grad_norm": 1.1856440305709839, + "learning_rate": 9.997791379913464e-05, + "loss": 2.5893, + "step": 1278 + }, + { + "epoch": 0.3925721301411909, + "grad_norm": 1.166395664215088, + "learning_rate": 9.997776582927153e-05, + "loss": 2.539, + "step": 1279 + }, + { + "epoch": 0.3928790669122161, 
+ "grad_norm": 1.1638765335083008, + "learning_rate": 9.997761736549886e-05, + "loss": 2.5384, + "step": 1280 + }, + { + "epoch": 0.39318600368324125, + "grad_norm": 1.107485055923462, + "learning_rate": 9.997746840781806e-05, + "loss": 2.559, + "step": 1281 + }, + { + "epoch": 0.3934929404542664, + "grad_norm": 1.174592137336731, + "learning_rate": 9.997731895623063e-05, + "loss": 2.5132, + "step": 1282 + }, + { + "epoch": 0.3937998772252916, + "grad_norm": 1.0407745838165283, + "learning_rate": 9.997716901073806e-05, + "loss": 2.4871, + "step": 1283 + }, + { + "epoch": 0.39410681399631675, + "grad_norm": 1.059743046760559, + "learning_rate": 9.997701857134179e-05, + "loss": 2.4865, + "step": 1284 + }, + { + "epoch": 0.3944137507673419, + "grad_norm": 1.0606070756912231, + "learning_rate": 9.997686763804335e-05, + "loss": 2.5651, + "step": 1285 + }, + { + "epoch": 0.3947206875383671, + "grad_norm": 1.0753284692764282, + "learning_rate": 9.99767162108442e-05, + "loss": 2.4699, + "step": 1286 + }, + { + "epoch": 0.39502762430939226, + "grad_norm": 1.1155509948730469, + "learning_rate": 9.997656428974585e-05, + "loss": 2.5326, + "step": 1287 + }, + { + "epoch": 0.3953345610804174, + "grad_norm": 1.2243739366531372, + "learning_rate": 9.99764118747498e-05, + "loss": 2.5189, + "step": 1288 + }, + { + "epoch": 0.3956414978514426, + "grad_norm": 1.2526514530181885, + "learning_rate": 9.997625896585757e-05, + "loss": 2.5464, + "step": 1289 + }, + { + "epoch": 0.39594843462246776, + "grad_norm": 1.297153115272522, + "learning_rate": 9.997610556307062e-05, + "loss": 2.5752, + "step": 1290 + }, + { + "epoch": 0.39625537139349293, + "grad_norm": 1.1064956188201904, + "learning_rate": 9.997595166639054e-05, + "loss": 2.5743, + "step": 1291 + }, + { + "epoch": 0.3965623081645181, + "grad_norm": 1.255810022354126, + "learning_rate": 9.997579727581879e-05, + "loss": 2.7087, + "step": 1292 + }, + { + "epoch": 0.39686924493554326, + "grad_norm": 1.4290298223495483, + 
"learning_rate": 9.997564239135692e-05, + "loss": 2.5417, + "step": 1293 + }, + { + "epoch": 0.39717618170656843, + "grad_norm": 1.1937109231948853, + "learning_rate": 9.997548701300648e-05, + "loss": 2.4862, + "step": 1294 + }, + { + "epoch": 0.3974831184775936, + "grad_norm": 1.1707425117492676, + "learning_rate": 9.997533114076897e-05, + "loss": 2.4715, + "step": 1295 + }, + { + "epoch": 0.39779005524861877, + "grad_norm": 1.1248551607131958, + "learning_rate": 9.997517477464596e-05, + "loss": 2.4859, + "step": 1296 + }, + { + "epoch": 0.39809699201964394, + "grad_norm": 1.1656453609466553, + "learning_rate": 9.997501791463897e-05, + "loss": 2.5402, + "step": 1297 + }, + { + "epoch": 0.3984039287906691, + "grad_norm": 0.9916674494743347, + "learning_rate": 9.997486056074956e-05, + "loss": 2.5116, + "step": 1298 + }, + { + "epoch": 0.39871086556169427, + "grad_norm": 1.3229619264602661, + "learning_rate": 9.997470271297928e-05, + "loss": 2.5565, + "step": 1299 + }, + { + "epoch": 0.39901780233271944, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.997454437132971e-05, + "loss": 2.5191, + "step": 1300 + }, + { + "epoch": 0.3993247391037446, + "grad_norm": 1.2117778062820435, + "learning_rate": 9.997438553580241e-05, + "loss": 2.558, + "step": 1301 + }, + { + "epoch": 0.3996316758747698, + "grad_norm": 1.1083563566207886, + "learning_rate": 9.997422620639892e-05, + "loss": 2.4734, + "step": 1302 + }, + { + "epoch": 0.39993861264579494, + "grad_norm": 0.9662174582481384, + "learning_rate": 9.997406638312084e-05, + "loss": 2.4866, + "step": 1303 + }, + { + "epoch": 0.4002455494168201, + "grad_norm": 1.0886632204055786, + "learning_rate": 9.997390606596976e-05, + "loss": 2.5397, + "step": 1304 + }, + { + "epoch": 0.4005524861878453, + "grad_norm": 1.2318742275238037, + "learning_rate": 9.997374525494723e-05, + "loss": 2.6281, + "step": 1305 + }, + { + "epoch": 0.40085942295887045, + "grad_norm": 1.1717815399169922, + "learning_rate": 9.997358395005487e-05, + 
"loss": 2.5202, + "step": 1306 + }, + { + "epoch": 0.4011663597298956, + "grad_norm": 1.0533723831176758, + "learning_rate": 9.997342215129427e-05, + "loss": 2.5096, + "step": 1307 + }, + { + "epoch": 0.4014732965009208, + "grad_norm": 1.0814248323440552, + "learning_rate": 9.997325985866701e-05, + "loss": 2.5513, + "step": 1308 + }, + { + "epoch": 0.40178023327194595, + "grad_norm": 1.078261137008667, + "learning_rate": 9.997309707217472e-05, + "loss": 2.5115, + "step": 1309 + }, + { + "epoch": 0.4020871700429711, + "grad_norm": 1.0834710597991943, + "learning_rate": 9.997293379181897e-05, + "loss": 2.4754, + "step": 1310 + }, + { + "epoch": 0.40239410681399634, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.997277001760142e-05, + "loss": 2.5068, + "step": 1311 + }, + { + "epoch": 0.4027010435850215, + "grad_norm": 1.3008345365524292, + "learning_rate": 9.997260574952366e-05, + "loss": 2.4675, + "step": 1312 + }, + { + "epoch": 0.4030079803560467, + "grad_norm": 1.176858901977539, + "learning_rate": 9.997244098758732e-05, + "loss": 2.4786, + "step": 1313 + }, + { + "epoch": 0.40331491712707185, + "grad_norm": 1.0121303796768188, + "learning_rate": 9.997227573179403e-05, + "loss": 2.476, + "step": 1314 + }, + { + "epoch": 0.403621853898097, + "grad_norm": 1.326298713684082, + "learning_rate": 9.997210998214542e-05, + "loss": 2.4093, + "step": 1315 + }, + { + "epoch": 0.4039287906691222, + "grad_norm": 0.9008898735046387, + "learning_rate": 9.997194373864314e-05, + "loss": 2.4523, + "step": 1316 + }, + { + "epoch": 0.40423572744014735, + "grad_norm": 1.0441854000091553, + "learning_rate": 9.99717770012888e-05, + "loss": 2.5419, + "step": 1317 + }, + { + "epoch": 0.4045426642111725, + "grad_norm": 1.0490028858184814, + "learning_rate": 9.997160977008408e-05, + "loss": 2.4855, + "step": 1318 + }, + { + "epoch": 0.4048496009821977, + "grad_norm": 1.0244388580322266, + "learning_rate": 9.997144204503063e-05, + "loss": 2.4555, + "step": 1319 + }, + { + "epoch": 
0.40515653775322286, + "grad_norm": 1.1217700242996216, + "learning_rate": 9.99712738261301e-05, + "loss": 2.4872, + "step": 1320 + }, + { + "epoch": 0.405463474524248, + "grad_norm": 1.031691551208496, + "learning_rate": 9.997110511338414e-05, + "loss": 2.4094, + "step": 1321 + }, + { + "epoch": 0.4057704112952732, + "grad_norm": 1.1658705472946167, + "learning_rate": 9.997093590679444e-05, + "loss": 2.407, + "step": 1322 + }, + { + "epoch": 0.40607734806629836, + "grad_norm": 1.1527072191238403, + "learning_rate": 9.997076620636266e-05, + "loss": 2.5041, + "step": 1323 + }, + { + "epoch": 0.40638428483732353, + "grad_norm": 1.2039116621017456, + "learning_rate": 9.997059601209049e-05, + "loss": 2.4682, + "step": 1324 + }, + { + "epoch": 0.4066912216083487, + "grad_norm": 1.142160177230835, + "learning_rate": 9.997042532397957e-05, + "loss": 2.4629, + "step": 1325 + }, + { + "epoch": 0.40699815837937386, + "grad_norm": 0.972081184387207, + "learning_rate": 9.997025414203164e-05, + "loss": 2.3941, + "step": 1326 + }, + { + "epoch": 0.40730509515039903, + "grad_norm": 1.0181753635406494, + "learning_rate": 9.99700824662484e-05, + "loss": 2.5649, + "step": 1327 + }, + { + "epoch": 0.4076120319214242, + "grad_norm": 1.145769715309143, + "learning_rate": 9.996991029663148e-05, + "loss": 2.5284, + "step": 1328 + }, + { + "epoch": 0.40791896869244937, + "grad_norm": 1.0604028701782227, + "learning_rate": 9.996973763318262e-05, + "loss": 2.4488, + "step": 1329 + }, + { + "epoch": 0.40822590546347454, + "grad_norm": 1.161383867263794, + "learning_rate": 9.996956447590354e-05, + "loss": 2.6081, + "step": 1330 + }, + { + "epoch": 0.4085328422344997, + "grad_norm": 1.0880714654922485, + "learning_rate": 9.996939082479591e-05, + "loss": 2.4695, + "step": 1331 + }, + { + "epoch": 0.4088397790055249, + "grad_norm": 1.036556601524353, + "learning_rate": 9.99692166798615e-05, + "loss": 2.4428, + "step": 1332 + }, + { + "epoch": 0.40914671577655004, + "grad_norm": 
1.079179286956787, + "learning_rate": 9.996904204110198e-05, + "loss": 2.4543, + "step": 1333 + }, + { + "epoch": 0.4094536525475752, + "grad_norm": 1.0588144063949585, + "learning_rate": 9.996886690851912e-05, + "loss": 2.4755, + "step": 1334 + }, + { + "epoch": 0.4097605893186004, + "grad_norm": 1.0359580516815186, + "learning_rate": 9.996869128211462e-05, + "loss": 2.4933, + "step": 1335 + }, + { + "epoch": 0.41006752608962554, + "grad_norm": 1.0067389011383057, + "learning_rate": 9.996851516189021e-05, + "loss": 2.4291, + "step": 1336 + }, + { + "epoch": 0.4103744628606507, + "grad_norm": 1.0173524618148804, + "learning_rate": 9.996833854784766e-05, + "loss": 2.4856, + "step": 1337 + }, + { + "epoch": 0.4106813996316759, + "grad_norm": 1.0740927457809448, + "learning_rate": 9.99681614399887e-05, + "loss": 2.5248, + "step": 1338 + }, + { + "epoch": 0.41098833640270105, + "grad_norm": 0.9638547301292419, + "learning_rate": 9.99679838383151e-05, + "loss": 2.4777, + "step": 1339 + }, + { + "epoch": 0.4112952731737262, + "grad_norm": 1.0349369049072266, + "learning_rate": 9.996780574282856e-05, + "loss": 2.5188, + "step": 1340 + }, + { + "epoch": 0.4116022099447514, + "grad_norm": 1.099743127822876, + "learning_rate": 9.996762715353089e-05, + "loss": 2.4141, + "step": 1341 + }, + { + "epoch": 0.41190914671577655, + "grad_norm": 1.027178406715393, + "learning_rate": 9.996744807042386e-05, + "loss": 2.5134, + "step": 1342 + }, + { + "epoch": 0.4122160834868017, + "grad_norm": 1.1933472156524658, + "learning_rate": 9.996726849350922e-05, + "loss": 2.4821, + "step": 1343 + }, + { + "epoch": 0.4125230202578269, + "grad_norm": 1.1663923263549805, + "learning_rate": 9.996708842278872e-05, + "loss": 2.4593, + "step": 1344 + }, + { + "epoch": 0.41282995702885206, + "grad_norm": 1.2633854150772095, + "learning_rate": 9.996690785826418e-05, + "loss": 2.5524, + "step": 1345 + }, + { + "epoch": 0.4131368937998772, + "grad_norm": 1.03873610496521, + "learning_rate": 
9.996672679993737e-05, + "loss": 2.5403, + "step": 1346 + }, + { + "epoch": 0.4134438305709024, + "grad_norm": 1.106656789779663, + "learning_rate": 9.996654524781009e-05, + "loss": 2.5172, + "step": 1347 + }, + { + "epoch": 0.41375076734192756, + "grad_norm": 1.015608310699463, + "learning_rate": 9.996636320188411e-05, + "loss": 2.423, + "step": 1348 + }, + { + "epoch": 0.41405770411295273, + "grad_norm": 1.0672087669372559, + "learning_rate": 9.996618066216124e-05, + "loss": 2.4861, + "step": 1349 + }, + { + "epoch": 0.4143646408839779, + "grad_norm": 1.1289842128753662, + "learning_rate": 9.996599762864329e-05, + "loss": 2.3944, + "step": 1350 + }, + { + "epoch": 0.41467157765500307, + "grad_norm": 1.080428957939148, + "learning_rate": 9.996581410133207e-05, + "loss": 2.4563, + "step": 1351 + }, + { + "epoch": 0.41497851442602823, + "grad_norm": 1.257104516029358, + "learning_rate": 9.996563008022939e-05, + "loss": 2.437, + "step": 1352 + }, + { + "epoch": 0.4152854511970534, + "grad_norm": 1.039293646812439, + "learning_rate": 9.996544556533706e-05, + "loss": 2.4654, + "step": 1353 + }, + { + "epoch": 0.41559238796807857, + "grad_norm": 1.0976085662841797, + "learning_rate": 9.996526055665692e-05, + "loss": 2.4755, + "step": 1354 + }, + { + "epoch": 0.41589932473910374, + "grad_norm": 0.937647819519043, + "learning_rate": 9.996507505419078e-05, + "loss": 2.4687, + "step": 1355 + }, + { + "epoch": 0.4162062615101289, + "grad_norm": 1.0461267232894897, + "learning_rate": 9.996488905794047e-05, + "loss": 2.4092, + "step": 1356 + }, + { + "epoch": 0.4165131982811541, + "grad_norm": 1.0510658025741577, + "learning_rate": 9.996470256790787e-05, + "loss": 2.4806, + "step": 1357 + }, + { + "epoch": 0.41682013505217924, + "grad_norm": 1.2323371171951294, + "learning_rate": 9.996451558409478e-05, + "loss": 2.5017, + "step": 1358 + }, + { + "epoch": 0.4171270718232044, + "grad_norm": 0.9880139827728271, + "learning_rate": 9.996432810650307e-05, + "loss": 2.5171, + "step": 
1359 + }, + { + "epoch": 0.4174340085942296, + "grad_norm": 1.2572466135025024, + "learning_rate": 9.996414013513458e-05, + "loss": 2.4285, + "step": 1360 + }, + { + "epoch": 0.41774094536525475, + "grad_norm": 0.8809559941291809, + "learning_rate": 9.996395166999118e-05, + "loss": 2.398, + "step": 1361 + }, + { + "epoch": 0.4180478821362799, + "grad_norm": 0.9389429688453674, + "learning_rate": 9.996376271107471e-05, + "loss": 2.4539, + "step": 1362 + }, + { + "epoch": 0.4183548189073051, + "grad_norm": 0.8821789026260376, + "learning_rate": 9.996357325838705e-05, + "loss": 2.4762, + "step": 1363 + }, + { + "epoch": 0.41866175567833025, + "grad_norm": 1.0148484706878662, + "learning_rate": 9.99633833119301e-05, + "loss": 2.5292, + "step": 1364 + }, + { + "epoch": 0.4189686924493554, + "grad_norm": 0.9861947894096375, + "learning_rate": 9.996319287170569e-05, + "loss": 2.4285, + "step": 1365 + }, + { + "epoch": 0.4192756292203806, + "grad_norm": 1.1907099485397339, + "learning_rate": 9.996300193771573e-05, + "loss": 2.4325, + "step": 1366 + }, + { + "epoch": 0.41958256599140575, + "grad_norm": 1.0746681690216064, + "learning_rate": 9.99628105099621e-05, + "loss": 2.3349, + "step": 1367 + }, + { + "epoch": 0.4198895027624309, + "grad_norm": 1.2040268182754517, + "learning_rate": 9.996261858844669e-05, + "loss": 2.4427, + "step": 1368 + }, + { + "epoch": 0.4201964395334561, + "grad_norm": 1.0487430095672607, + "learning_rate": 9.99624261731714e-05, + "loss": 2.4305, + "step": 1369 + }, + { + "epoch": 0.42050337630448126, + "grad_norm": 1.0047999620437622, + "learning_rate": 9.996223326413812e-05, + "loss": 2.4442, + "step": 1370 + }, + { + "epoch": 0.4208103130755064, + "grad_norm": 1.147078275680542, + "learning_rate": 9.996203986134879e-05, + "loss": 2.5189, + "step": 1371 + }, + { + "epoch": 0.4211172498465316, + "grad_norm": 1.2269455194473267, + "learning_rate": 9.996184596480529e-05, + "loss": 2.3905, + "step": 1372 + }, + { + "epoch": 0.42142418661755676, + 
"grad_norm": 0.9716771245002747, + "learning_rate": 9.996165157450954e-05, + "loss": 2.4246, + "step": 1373 + }, + { + "epoch": 0.42173112338858193, + "grad_norm": 1.0569939613342285, + "learning_rate": 9.996145669046347e-05, + "loss": 2.529, + "step": 1374 + }, + { + "epoch": 0.4220380601596071, + "grad_norm": 1.1145942211151123, + "learning_rate": 9.996126131266899e-05, + "loss": 2.3965, + "step": 1375 + }, + { + "epoch": 0.42234499693063227, + "grad_norm": 0.9990974068641663, + "learning_rate": 9.996106544112805e-05, + "loss": 2.4991, + "step": 1376 + }, + { + "epoch": 0.42265193370165743, + "grad_norm": 0.9536247253417969, + "learning_rate": 9.99608690758426e-05, + "loss": 2.4347, + "step": 1377 + }, + { + "epoch": 0.4229588704726826, + "grad_norm": 1.0053460597991943, + "learning_rate": 9.996067221681452e-05, + "loss": 2.4213, + "step": 1378 + }, + { + "epoch": 0.42326580724370777, + "grad_norm": 1.0727168321609497, + "learning_rate": 9.99604748640458e-05, + "loss": 2.4479, + "step": 1379 + }, + { + "epoch": 0.42357274401473294, + "grad_norm": 1.2539277076721191, + "learning_rate": 9.996027701753841e-05, + "loss": 2.4721, + "step": 1380 + }, + { + "epoch": 0.4238796807857581, + "grad_norm": 1.0348230600357056, + "learning_rate": 9.996007867729427e-05, + "loss": 2.4263, + "step": 1381 + }, + { + "epoch": 0.42418661755678333, + "grad_norm": 1.051802158355713, + "learning_rate": 9.995987984331533e-05, + "loss": 2.4492, + "step": 1382 + }, + { + "epoch": 0.4244935543278085, + "grad_norm": 1.0394505262374878, + "learning_rate": 9.995968051560361e-05, + "loss": 2.4625, + "step": 1383 + }, + { + "epoch": 0.42480049109883367, + "grad_norm": 1.1121852397918701, + "learning_rate": 9.995948069416103e-05, + "loss": 2.4999, + "step": 1384 + }, + { + "epoch": 0.42510742786985883, + "grad_norm": 0.9693613052368164, + "learning_rate": 9.995928037898957e-05, + "loss": 2.4112, + "step": 1385 + }, + { + "epoch": 0.425414364640884, + "grad_norm": 1.1416810750961304, + 
"learning_rate": 9.995907957009123e-05, + "loss": 2.5452, + "step": 1386 + }, + { + "epoch": 0.42572130141190917, + "grad_norm": 1.010640025138855, + "learning_rate": 9.995887826746797e-05, + "loss": 2.412, + "step": 1387 + }, + { + "epoch": 0.42602823818293434, + "grad_norm": 1.0800373554229736, + "learning_rate": 9.99586764711218e-05, + "loss": 2.4451, + "step": 1388 + }, + { + "epoch": 0.4263351749539595, + "grad_norm": 1.058931589126587, + "learning_rate": 9.995847418105471e-05, + "loss": 2.474, + "step": 1389 + }, + { + "epoch": 0.4266421117249847, + "grad_norm": 1.0727131366729736, + "learning_rate": 9.99582713972687e-05, + "loss": 2.468, + "step": 1390 + }, + { + "epoch": 0.42694904849600984, + "grad_norm": 1.0237464904785156, + "learning_rate": 9.995806811976576e-05, + "loss": 2.5208, + "step": 1391 + }, + { + "epoch": 0.427255985267035, + "grad_norm": 1.036582112312317, + "learning_rate": 9.995786434854793e-05, + "loss": 2.4338, + "step": 1392 + }, + { + "epoch": 0.4275629220380602, + "grad_norm": 0.9617817997932434, + "learning_rate": 9.995766008361719e-05, + "loss": 2.4465, + "step": 1393 + }, + { + "epoch": 0.42786985880908535, + "grad_norm": 1.2188911437988281, + "learning_rate": 9.995745532497556e-05, + "loss": 2.5069, + "step": 1394 + }, + { + "epoch": 0.4281767955801105, + "grad_norm": 1.0796585083007812, + "learning_rate": 9.99572500726251e-05, + "loss": 2.4839, + "step": 1395 + }, + { + "epoch": 0.4284837323511357, + "grad_norm": 0.9843130111694336, + "learning_rate": 9.99570443265678e-05, + "loss": 2.4968, + "step": 1396 + }, + { + "epoch": 0.42879066912216085, + "grad_norm": 1.0441415309906006, + "learning_rate": 9.99568380868057e-05, + "loss": 2.4134, + "step": 1397 + }, + { + "epoch": 0.429097605893186, + "grad_norm": 0.9156177639961243, + "learning_rate": 9.995663135334085e-05, + "loss": 2.4891, + "step": 1398 + }, + { + "epoch": 0.4294045426642112, + "grad_norm": 1.1159545183181763, + "learning_rate": 9.995642412617529e-05, + "loss": 2.4507, 
+ "step": 1399 + }, + { + "epoch": 0.42971147943523635, + "grad_norm": 0.8944577574729919, + "learning_rate": 9.995621640531107e-05, + "loss": 2.4465, + "step": 1400 + }, + { + "epoch": 0.4300184162062615, + "grad_norm": 0.9043408036231995, + "learning_rate": 9.995600819075025e-05, + "loss": 2.3726, + "step": 1401 + }, + { + "epoch": 0.4303253529772867, + "grad_norm": 0.9028464555740356, + "learning_rate": 9.995579948249486e-05, + "loss": 2.427, + "step": 1402 + }, + { + "epoch": 0.43063228974831186, + "grad_norm": 0.9497705101966858, + "learning_rate": 9.995559028054699e-05, + "loss": 2.4666, + "step": 1403 + }, + { + "epoch": 0.430939226519337, + "grad_norm": 0.927601158618927, + "learning_rate": 9.995538058490868e-05, + "loss": 2.3679, + "step": 1404 + }, + { + "epoch": 0.4312461632903622, + "grad_norm": 1.050394892692566, + "learning_rate": 9.995517039558204e-05, + "loss": 2.4096, + "step": 1405 + }, + { + "epoch": 0.43155310006138736, + "grad_norm": 1.3011974096298218, + "learning_rate": 9.995495971256911e-05, + "loss": 2.4439, + "step": 1406 + }, + { + "epoch": 0.43186003683241253, + "grad_norm": 1.0740708112716675, + "learning_rate": 9.9954748535872e-05, + "loss": 2.4891, + "step": 1407 + }, + { + "epoch": 0.4321669736034377, + "grad_norm": 1.1132466793060303, + "learning_rate": 9.995453686549279e-05, + "loss": 2.46, + "step": 1408 + }, + { + "epoch": 0.43247391037446287, + "grad_norm": 1.063275933265686, + "learning_rate": 9.995432470143356e-05, + "loss": 2.5035, + "step": 1409 + }, + { + "epoch": 0.43278084714548803, + "grad_norm": 1.065679669380188, + "learning_rate": 9.99541120436964e-05, + "loss": 2.4471, + "step": 1410 + }, + { + "epoch": 0.4330877839165132, + "grad_norm": 1.017587423324585, + "learning_rate": 9.995389889228344e-05, + "loss": 2.4879, + "step": 1411 + }, + { + "epoch": 0.43339472068753837, + "grad_norm": 0.9744442701339722, + "learning_rate": 9.995368524719678e-05, + "loss": 2.3923, + "step": 1412 + }, + { + "epoch": 
0.43370165745856354, + "grad_norm": 0.8916706442832947, + "learning_rate": 9.995347110843851e-05, + "loss": 2.3965, + "step": 1413 + }, + { + "epoch": 0.4340085942295887, + "grad_norm": 0.916221559047699, + "learning_rate": 9.995325647601075e-05, + "loss": 2.4742, + "step": 1414 + }, + { + "epoch": 0.4343155310006139, + "grad_norm": 0.9388782978057861, + "learning_rate": 9.995304134991565e-05, + "loss": 2.453, + "step": 1415 + }, + { + "epoch": 0.43462246777163904, + "grad_norm": 1.057085633277893, + "learning_rate": 9.995282573015532e-05, + "loss": 2.5791, + "step": 1416 + }, + { + "epoch": 0.4349294045426642, + "grad_norm": 1.055145025253296, + "learning_rate": 9.995260961673187e-05, + "loss": 2.3565, + "step": 1417 + }, + { + "epoch": 0.4352363413136894, + "grad_norm": 1.0733528137207031, + "learning_rate": 9.995239300964747e-05, + "loss": 2.5413, + "step": 1418 + }, + { + "epoch": 0.43554327808471455, + "grad_norm": 1.1478198766708374, + "learning_rate": 9.995217590890425e-05, + "loss": 2.4093, + "step": 1419 + }, + { + "epoch": 0.4358502148557397, + "grad_norm": 0.8663081526756287, + "learning_rate": 9.995195831450432e-05, + "loss": 2.3968, + "step": 1420 + }, + { + "epoch": 0.4361571516267649, + "grad_norm": 0.9811860918998718, + "learning_rate": 9.995174022644988e-05, + "loss": 2.3536, + "step": 1421 + }, + { + "epoch": 0.43646408839779005, + "grad_norm": 0.9883477687835693, + "learning_rate": 9.995152164474306e-05, + "loss": 2.5372, + "step": 1422 + }, + { + "epoch": 0.4367710251688152, + "grad_norm": 1.2196532487869263, + "learning_rate": 9.995130256938603e-05, + "loss": 2.429, + "step": 1423 + }, + { + "epoch": 0.4370779619398404, + "grad_norm": 1.000264286994934, + "learning_rate": 9.995108300038096e-05, + "loss": 2.4116, + "step": 1424 + }, + { + "epoch": 0.43738489871086556, + "grad_norm": 1.1259286403656006, + "learning_rate": 9.995086293773e-05, + "loss": 2.4405, + "step": 1425 + }, + { + "epoch": 0.4376918354818907, + "grad_norm": 
0.9334595203399658, + "learning_rate": 9.995064238143533e-05, + "loss": 2.3849, + "step": 1426 + }, + { + "epoch": 0.4379987722529159, + "grad_norm": 0.8880285620689392, + "learning_rate": 9.995042133149914e-05, + "loss": 2.4177, + "step": 1427 + }, + { + "epoch": 0.43830570902394106, + "grad_norm": 0.8823251724243164, + "learning_rate": 9.995019978792362e-05, + "loss": 2.4876, + "step": 1428 + }, + { + "epoch": 0.4386126457949662, + "grad_norm": 0.9289014339447021, + "learning_rate": 9.994997775071094e-05, + "loss": 2.4725, + "step": 1429 + }, + { + "epoch": 0.4389195825659914, + "grad_norm": 0.9100427627563477, + "learning_rate": 9.994975521986329e-05, + "loss": 2.3834, + "step": 1430 + }, + { + "epoch": 0.43922651933701656, + "grad_norm": 0.8956978917121887, + "learning_rate": 9.99495321953829e-05, + "loss": 2.4418, + "step": 1431 + }, + { + "epoch": 0.43953345610804173, + "grad_norm": 1.1248396635055542, + "learning_rate": 9.994930867727195e-05, + "loss": 2.4389, + "step": 1432 + }, + { + "epoch": 0.4398403928790669, + "grad_norm": 0.9285669922828674, + "learning_rate": 9.994908466553266e-05, + "loss": 2.3922, + "step": 1433 + }, + { + "epoch": 0.44014732965009207, + "grad_norm": 0.9604844450950623, + "learning_rate": 9.994886016016723e-05, + "loss": 2.4365, + "step": 1434 + }, + { + "epoch": 0.44045426642111724, + "grad_norm": 1.0534024238586426, + "learning_rate": 9.99486351611779e-05, + "loss": 2.4377, + "step": 1435 + }, + { + "epoch": 0.4407612031921424, + "grad_norm": 1.1028003692626953, + "learning_rate": 9.994840966856686e-05, + "loss": 2.4299, + "step": 1436 + }, + { + "epoch": 0.44106813996316757, + "grad_norm": 1.119832158088684, + "learning_rate": 9.994818368233639e-05, + "loss": 2.4656, + "step": 1437 + }, + { + "epoch": 0.44137507673419274, + "grad_norm": 0.9782878160476685, + "learning_rate": 9.994795720248867e-05, + "loss": 2.3661, + "step": 1438 + }, + { + "epoch": 0.4416820135052179, + "grad_norm": 1.0002741813659668, + "learning_rate": 
9.994773022902597e-05, + "loss": 2.4157, + "step": 1439 + }, + { + "epoch": 0.4419889502762431, + "grad_norm": 1.051486611366272, + "learning_rate": 9.994750276195053e-05, + "loss": 2.452, + "step": 1440 + }, + { + "epoch": 0.44229588704726824, + "grad_norm": 1.0375488996505737, + "learning_rate": 9.994727480126457e-05, + "loss": 2.4406, + "step": 1441 + }, + { + "epoch": 0.4426028238182934, + "grad_norm": 0.9407445192337036, + "learning_rate": 9.99470463469704e-05, + "loss": 2.3434, + "step": 1442 + }, + { + "epoch": 0.4429097605893186, + "grad_norm": 1.0371474027633667, + "learning_rate": 9.994681739907022e-05, + "loss": 2.5094, + "step": 1443 + }, + { + "epoch": 0.44321669736034375, + "grad_norm": 1.057519555091858, + "learning_rate": 9.994658795756632e-05, + "loss": 2.4501, + "step": 1444 + }, + { + "epoch": 0.4435236341313689, + "grad_norm": 0.9340078234672546, + "learning_rate": 9.994635802246097e-05, + "loss": 2.4151, + "step": 1445 + }, + { + "epoch": 0.4438305709023941, + "grad_norm": 0.8906050324440002, + "learning_rate": 9.994612759375644e-05, + "loss": 2.3837, + "step": 1446 + }, + { + "epoch": 0.44413750767341925, + "grad_norm": 0.8349595665931702, + "learning_rate": 9.994589667145497e-05, + "loss": 2.4317, + "step": 1447 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.9362117648124695, + "learning_rate": 9.994566525555891e-05, + "loss": 2.4586, + "step": 1448 + }, + { + "epoch": 0.4447513812154696, + "grad_norm": 0.869215190410614, + "learning_rate": 9.99454333460705e-05, + "loss": 2.4458, + "step": 1449 + }, + { + "epoch": 0.44505831798649476, + "grad_norm": 0.904531717300415, + "learning_rate": 9.994520094299204e-05, + "loss": 2.4198, + "step": 1450 + }, + { + "epoch": 0.4453652547575199, + "grad_norm": 0.9153178930282593, + "learning_rate": 9.994496804632583e-05, + "loss": 2.3718, + "step": 1451 + }, + { + "epoch": 0.44567219152854515, + "grad_norm": 1.0229307413101196, + "learning_rate": 9.994473465607418e-05, + "loss": 2.3787, + "step": 
1452 + }, + { + "epoch": 0.4459791282995703, + "grad_norm": 1.0449415445327759, + "learning_rate": 9.994450077223938e-05, + "loss": 2.4965, + "step": 1453 + }, + { + "epoch": 0.4462860650705955, + "grad_norm": 1.0524135828018188, + "learning_rate": 9.994426639482375e-05, + "loss": 2.3518, + "step": 1454 + }, + { + "epoch": 0.44659300184162065, + "grad_norm": 1.0612086057662964, + "learning_rate": 9.994403152382961e-05, + "loss": 2.4501, + "step": 1455 + }, + { + "epoch": 0.4468999386126458, + "grad_norm": 1.0568779706954956, + "learning_rate": 9.994379615925929e-05, + "loss": 2.3754, + "step": 1456 + }, + { + "epoch": 0.447206875383671, + "grad_norm": 1.0984265804290771, + "learning_rate": 9.994356030111509e-05, + "loss": 2.4318, + "step": 1457 + }, + { + "epoch": 0.44751381215469616, + "grad_norm": 0.9227646589279175, + "learning_rate": 9.994332394939936e-05, + "loss": 2.3928, + "step": 1458 + }, + { + "epoch": 0.4478207489257213, + "grad_norm": 1.0073471069335938, + "learning_rate": 9.994308710411442e-05, + "loss": 2.4203, + "step": 1459 + }, + { + "epoch": 0.4481276856967465, + "grad_norm": 1.1347973346710205, + "learning_rate": 9.994284976526263e-05, + "loss": 2.4991, + "step": 1460 + }, + { + "epoch": 0.44843462246777166, + "grad_norm": 0.9912654757499695, + "learning_rate": 9.994261193284631e-05, + "loss": 2.471, + "step": 1461 + }, + { + "epoch": 0.4487415592387968, + "grad_norm": 1.0599550008773804, + "learning_rate": 9.994237360686784e-05, + "loss": 2.505, + "step": 1462 + }, + { + "epoch": 0.449048496009822, + "grad_norm": 0.9811004996299744, + "learning_rate": 9.994213478732957e-05, + "loss": 2.3868, + "step": 1463 + }, + { + "epoch": 0.44935543278084716, + "grad_norm": 0.8389631509780884, + "learning_rate": 9.994189547423384e-05, + "loss": 2.4766, + "step": 1464 + }, + { + "epoch": 0.44966236955187233, + "grad_norm": 0.8475043773651123, + "learning_rate": 9.994165566758302e-05, + "loss": 2.3666, + "step": 1465 + }, + { + "epoch": 0.4499693063228975, + 
"grad_norm": 0.8922824859619141, + "learning_rate": 9.994141536737951e-05, + "loss": 2.3823, + "step": 1466 + }, + { + "epoch": 0.45027624309392267, + "grad_norm": 1.0286083221435547, + "learning_rate": 9.994117457362564e-05, + "loss": 2.4639, + "step": 1467 + }, + { + "epoch": 0.45058317986494784, + "grad_norm": 1.094282865524292, + "learning_rate": 9.994093328632383e-05, + "loss": 2.3984, + "step": 1468 + }, + { + "epoch": 0.450890116635973, + "grad_norm": 1.0993603467941284, + "learning_rate": 9.994069150547642e-05, + "loss": 2.3719, + "step": 1469 + }, + { + "epoch": 0.45119705340699817, + "grad_norm": 1.0274133682250977, + "learning_rate": 9.994044923108585e-05, + "loss": 2.3644, + "step": 1470 + }, + { + "epoch": 0.45150399017802334, + "grad_norm": 0.8834434747695923, + "learning_rate": 9.994020646315448e-05, + "loss": 2.4955, + "step": 1471 + }, + { + "epoch": 0.4518109269490485, + "grad_norm": 0.8540776968002319, + "learning_rate": 9.993996320168473e-05, + "loss": 2.4292, + "step": 1472 + }, + { + "epoch": 0.4521178637200737, + "grad_norm": 0.8735383749008179, + "learning_rate": 9.993971944667897e-05, + "loss": 2.4343, + "step": 1473 + }, + { + "epoch": 0.45242480049109884, + "grad_norm": 0.976224422454834, + "learning_rate": 9.993947519813965e-05, + "loss": 2.4173, + "step": 1474 + }, + { + "epoch": 0.452731737262124, + "grad_norm": 0.9638139009475708, + "learning_rate": 9.993923045606917e-05, + "loss": 2.4322, + "step": 1475 + }, + { + "epoch": 0.4530386740331492, + "grad_norm": 0.9689927697181702, + "learning_rate": 9.993898522046992e-05, + "loss": 2.4625, + "step": 1476 + }, + { + "epoch": 0.45334561080417435, + "grad_norm": 1.0496052503585815, + "learning_rate": 9.993873949134437e-05, + "loss": 2.4788, + "step": 1477 + }, + { + "epoch": 0.4536525475751995, + "grad_norm": 1.0285090208053589, + "learning_rate": 9.993849326869491e-05, + "loss": 2.4119, + "step": 1478 + }, + { + "epoch": 0.4539594843462247, + "grad_norm": 0.9423730373382568, + 
"learning_rate": 9.993824655252401e-05, + "loss": 2.3919, + "step": 1479 + }, + { + "epoch": 0.45426642111724985, + "grad_norm": 1.0312988758087158, + "learning_rate": 9.993799934283407e-05, + "loss": 2.3829, + "step": 1480 + }, + { + "epoch": 0.454573357888275, + "grad_norm": 1.0985655784606934, + "learning_rate": 9.993775163962755e-05, + "loss": 2.3958, + "step": 1481 + }, + { + "epoch": 0.4548802946593002, + "grad_norm": 0.9346623420715332, + "learning_rate": 9.993750344290691e-05, + "loss": 2.3611, + "step": 1482 + }, + { + "epoch": 0.45518723143032536, + "grad_norm": 1.039681315422058, + "learning_rate": 9.993725475267459e-05, + "loss": 2.3989, + "step": 1483 + }, + { + "epoch": 0.4554941682013505, + "grad_norm": 0.9941854476928711, + "learning_rate": 9.993700556893304e-05, + "loss": 2.3092, + "step": 1484 + }, + { + "epoch": 0.4558011049723757, + "grad_norm": 0.9752130508422852, + "learning_rate": 9.993675589168473e-05, + "loss": 2.3727, + "step": 1485 + }, + { + "epoch": 0.45610804174340086, + "grad_norm": 0.9946039319038391, + "learning_rate": 9.993650572093216e-05, + "loss": 2.4121, + "step": 1486 + }, + { + "epoch": 0.45641497851442603, + "grad_norm": 1.1340489387512207, + "learning_rate": 9.993625505667774e-05, + "loss": 2.4477, + "step": 1487 + }, + { + "epoch": 0.4567219152854512, + "grad_norm": 0.9300981760025024, + "learning_rate": 9.993600389892399e-05, + "loss": 2.4045, + "step": 1488 + }, + { + "epoch": 0.45702885205647636, + "grad_norm": 0.8670973181724548, + "learning_rate": 9.993575224767338e-05, + "loss": 2.3596, + "step": 1489 + }, + { + "epoch": 0.45733578882750153, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.99355001029284e-05, + "loss": 2.4191, + "step": 1490 + }, + { + "epoch": 0.4576427255985267, + "grad_norm": 0.9099079370498657, + "learning_rate": 9.993524746469154e-05, + "loss": 2.4139, + "step": 1491 + }, + { + "epoch": 0.45794966236955187, + "grad_norm": 0.9740153551101685, + "learning_rate": 9.99349943329653e-05, + 
"loss": 2.4269, + "step": 1492 + }, + { + "epoch": 0.45825659914057704, + "grad_norm": 0.9112171530723572, + "learning_rate": 9.993474070775217e-05, + "loss": 2.3575, + "step": 1493 + }, + { + "epoch": 0.4585635359116022, + "grad_norm": 1.124553918838501, + "learning_rate": 9.993448658905466e-05, + "loss": 2.5518, + "step": 1494 + }, + { + "epoch": 0.4588704726826274, + "grad_norm": 1.1732012033462524, + "learning_rate": 9.99342319768753e-05, + "loss": 2.4346, + "step": 1495 + }, + { + "epoch": 0.45917740945365254, + "grad_norm": 0.8880025148391724, + "learning_rate": 9.993397687121659e-05, + "loss": 2.3593, + "step": 1496 + }, + { + "epoch": 0.4594843462246777, + "grad_norm": 0.9916797876358032, + "learning_rate": 9.993372127208105e-05, + "loss": 2.3283, + "step": 1497 + }, + { + "epoch": 0.4597912829957029, + "grad_norm": 0.9372622966766357, + "learning_rate": 9.99334651794712e-05, + "loss": 2.3868, + "step": 1498 + }, + { + "epoch": 0.46009821976672804, + "grad_norm": 1.0630989074707031, + "learning_rate": 9.99332085933896e-05, + "loss": 2.3605, + "step": 1499 + }, + { + "epoch": 0.4604051565377532, + "grad_norm": 1.000473976135254, + "learning_rate": 9.993295151383874e-05, + "loss": 2.3478, + "step": 1500 + }, + { + "epoch": 0.4607120933087784, + "grad_norm": 1.0269688367843628, + "learning_rate": 9.99326939408212e-05, + "loss": 2.4104, + "step": 1501 + }, + { + "epoch": 0.46101903007980355, + "grad_norm": 0.9003174901008606, + "learning_rate": 9.993243587433952e-05, + "loss": 2.3461, + "step": 1502 + }, + { + "epoch": 0.4613259668508287, + "grad_norm": 0.7938058972358704, + "learning_rate": 9.993217731439623e-05, + "loss": 2.3463, + "step": 1503 + }, + { + "epoch": 0.4616329036218539, + "grad_norm": 0.8715407252311707, + "learning_rate": 9.993191826099391e-05, + "loss": 2.3962, + "step": 1504 + }, + { + "epoch": 0.46193984039287905, + "grad_norm": 0.8319756984710693, + "learning_rate": 9.99316587141351e-05, + "loss": 2.342, + "step": 1505 + }, + { + "epoch": 
0.4622467771639042, + "grad_norm": 0.846592903137207, + "learning_rate": 9.993139867382238e-05, + "loss": 2.4064, + "step": 1506 + }, + { + "epoch": 0.4625537139349294, + "grad_norm": 0.8567312955856323, + "learning_rate": 9.99311381400583e-05, + "loss": 2.3603, + "step": 1507 + }, + { + "epoch": 0.46286065070595456, + "grad_norm": 0.8784321546554565, + "learning_rate": 9.993087711284546e-05, + "loss": 2.4031, + "step": 1508 + }, + { + "epoch": 0.4631675874769797, + "grad_norm": 0.838233232498169, + "learning_rate": 9.993061559218641e-05, + "loss": 2.3156, + "step": 1509 + }, + { + "epoch": 0.4634745242480049, + "grad_norm": 0.8804462552070618, + "learning_rate": 9.993035357808376e-05, + "loss": 2.4322, + "step": 1510 + }, + { + "epoch": 0.46378146101903006, + "grad_norm": 1.1055982112884521, + "learning_rate": 9.99300910705401e-05, + "loss": 2.5006, + "step": 1511 + }, + { + "epoch": 0.46408839779005523, + "grad_norm": 0.9872145056724548, + "learning_rate": 9.992982806955799e-05, + "loss": 2.3547, + "step": 1512 + }, + { + "epoch": 0.4643953345610804, + "grad_norm": 1.0710479021072388, + "learning_rate": 9.99295645751401e-05, + "loss": 2.4867, + "step": 1513 + }, + { + "epoch": 0.46470227133210557, + "grad_norm": 0.9858919382095337, + "learning_rate": 9.992930058728894e-05, + "loss": 2.2986, + "step": 1514 + }, + { + "epoch": 0.46500920810313073, + "grad_norm": 0.9031065702438354, + "learning_rate": 9.992903610600719e-05, + "loss": 2.3172, + "step": 1515 + }, + { + "epoch": 0.4653161448741559, + "grad_norm": 0.923160970211029, + "learning_rate": 9.992877113129744e-05, + "loss": 2.4231, + "step": 1516 + }, + { + "epoch": 0.46562308164518107, + "grad_norm": 1.0130947828292847, + "learning_rate": 9.992850566316231e-05, + "loss": 2.3593, + "step": 1517 + }, + { + "epoch": 0.46593001841620624, + "grad_norm": 0.8947033286094666, + "learning_rate": 9.992823970160441e-05, + "loss": 2.3324, + "step": 1518 + }, + { + "epoch": 0.4662369551872314, + "grad_norm": 
0.8819900155067444, + "learning_rate": 9.992797324662639e-05, + "loss": 2.2885, + "step": 1519 + }, + { + "epoch": 0.4665438919582566, + "grad_norm": 0.9434374570846558, + "learning_rate": 9.99277062982309e-05, + "loss": 2.427, + "step": 1520 + }, + { + "epoch": 0.46685082872928174, + "grad_norm": 0.9568646550178528, + "learning_rate": 9.99274388564205e-05, + "loss": 2.4059, + "step": 1521 + }, + { + "epoch": 0.4671577655003069, + "grad_norm": 0.9125105142593384, + "learning_rate": 9.992717092119794e-05, + "loss": 2.3306, + "step": 1522 + }, + { + "epoch": 0.46746470227133213, + "grad_norm": 0.8893206715583801, + "learning_rate": 9.992690249256578e-05, + "loss": 2.4211, + "step": 1523 + }, + { + "epoch": 0.4677716390423573, + "grad_norm": 0.8655402660369873, + "learning_rate": 9.992663357052672e-05, + "loss": 2.3493, + "step": 1524 + }, + { + "epoch": 0.46807857581338247, + "grad_norm": 0.7973037958145142, + "learning_rate": 9.99263641550834e-05, + "loss": 2.4255, + "step": 1525 + }, + { + "epoch": 0.46838551258440764, + "grad_norm": 0.8158934116363525, + "learning_rate": 9.992609424623849e-05, + "loss": 2.3518, + "step": 1526 + }, + { + "epoch": 0.4686924493554328, + "grad_norm": 0.7919436693191528, + "learning_rate": 9.992582384399465e-05, + "loss": 2.3762, + "step": 1527 + }, + { + "epoch": 0.468999386126458, + "grad_norm": 0.911490261554718, + "learning_rate": 9.992555294835455e-05, + "loss": 2.454, + "step": 1528 + }, + { + "epoch": 0.46930632289748314, + "grad_norm": 0.9504674077033997, + "learning_rate": 9.992528155932088e-05, + "loss": 2.3554, + "step": 1529 + }, + { + "epoch": 0.4696132596685083, + "grad_norm": 0.9833991527557373, + "learning_rate": 9.99250096768963e-05, + "loss": 2.4245, + "step": 1530 + }, + { + "epoch": 0.4699201964395335, + "grad_norm": 0.9994687438011169, + "learning_rate": 9.992473730108354e-05, + "loss": 2.3269, + "step": 1531 + }, + { + "epoch": 0.47022713321055865, + "grad_norm": 0.977237343788147, + "learning_rate": 
9.992446443188526e-05, + "loss": 2.3938, + "step": 1532 + }, + { + "epoch": 0.4705340699815838, + "grad_norm": 1.018334150314331, + "learning_rate": 9.992419106930415e-05, + "loss": 2.3076, + "step": 1533 + }, + { + "epoch": 0.470841006752609, + "grad_norm": 0.9752077460289001, + "learning_rate": 9.992391721334293e-05, + "loss": 2.4224, + "step": 1534 + }, + { + "epoch": 0.47114794352363415, + "grad_norm": 0.9457291960716248, + "learning_rate": 9.992364286400428e-05, + "loss": 2.3859, + "step": 1535 + }, + { + "epoch": 0.4714548802946593, + "grad_norm": 0.9112275838851929, + "learning_rate": 9.992336802129096e-05, + "loss": 2.3343, + "step": 1536 + }, + { + "epoch": 0.4717618170656845, + "grad_norm": 0.7701164484024048, + "learning_rate": 9.992309268520563e-05, + "loss": 2.3912, + "step": 1537 + }, + { + "epoch": 0.47206875383670965, + "grad_norm": 0.826822817325592, + "learning_rate": 9.992281685575105e-05, + "loss": 2.3794, + "step": 1538 + }, + { + "epoch": 0.4723756906077348, + "grad_norm": 0.8690019249916077, + "learning_rate": 9.992254053292994e-05, + "loss": 2.3474, + "step": 1539 + }, + { + "epoch": 0.47268262737876, + "grad_norm": 0.935954213142395, + "learning_rate": 9.9922263716745e-05, + "loss": 2.3794, + "step": 1540 + }, + { + "epoch": 0.47298956414978516, + "grad_norm": 1.0606616735458374, + "learning_rate": 9.992198640719901e-05, + "loss": 2.3491, + "step": 1541 + }, + { + "epoch": 0.4732965009208103, + "grad_norm": 1.0020630359649658, + "learning_rate": 9.992170860429469e-05, + "loss": 2.4723, + "step": 1542 + }, + { + "epoch": 0.4736034376918355, + "grad_norm": 0.9738268256187439, + "learning_rate": 9.992143030803476e-05, + "loss": 2.4282, + "step": 1543 + }, + { + "epoch": 0.47391037446286066, + "grad_norm": 1.0320461988449097, + "learning_rate": 9.992115151842203e-05, + "loss": 2.3935, + "step": 1544 + }, + { + "epoch": 0.47421731123388583, + "grad_norm": 0.926980197429657, + "learning_rate": 9.992087223545921e-05, + "loss": 2.4403, + "step": 
1545 + }, + { + "epoch": 0.474524248004911, + "grad_norm": 0.8760805130004883, + "learning_rate": 9.992059245914906e-05, + "loss": 2.3282, + "step": 1546 + }, + { + "epoch": 0.47483118477593617, + "grad_norm": 0.807569146156311, + "learning_rate": 9.992031218949435e-05, + "loss": 2.351, + "step": 1547 + }, + { + "epoch": 0.47513812154696133, + "grad_norm": 0.7491574883460999, + "learning_rate": 9.992003142649788e-05, + "loss": 2.3788, + "step": 1548 + }, + { + "epoch": 0.4754450583179865, + "grad_norm": 0.8402566909790039, + "learning_rate": 9.99197501701624e-05, + "loss": 2.4025, + "step": 1549 + }, + { + "epoch": 0.47575199508901167, + "grad_norm": 0.9501824975013733, + "learning_rate": 9.991946842049067e-05, + "loss": 2.4433, + "step": 1550 + }, + { + "epoch": 0.47605893186003684, + "grad_norm": 1.0070267915725708, + "learning_rate": 9.99191861774855e-05, + "loss": 2.4267, + "step": 1551 + }, + { + "epoch": 0.476365868631062, + "grad_norm": 0.9052779078483582, + "learning_rate": 9.991890344114969e-05, + "loss": 2.37, + "step": 1552 + }, + { + "epoch": 0.4766728054020872, + "grad_norm": 0.9453344345092773, + "learning_rate": 9.9918620211486e-05, + "loss": 2.4687, + "step": 1553 + }, + { + "epoch": 0.47697974217311234, + "grad_norm": 0.9836863875389099, + "learning_rate": 9.991833648849725e-05, + "loss": 2.4005, + "step": 1554 + }, + { + "epoch": 0.4772866789441375, + "grad_norm": 0.856532633304596, + "learning_rate": 9.991805227218624e-05, + "loss": 2.329, + "step": 1555 + }, + { + "epoch": 0.4775936157151627, + "grad_norm": 0.8338705897331238, + "learning_rate": 9.991776756255579e-05, + "loss": 2.3648, + "step": 1556 + }, + { + "epoch": 0.47790055248618785, + "grad_norm": 0.7738644480705261, + "learning_rate": 9.991748235960869e-05, + "loss": 2.2784, + "step": 1557 + }, + { + "epoch": 0.478207489257213, + "grad_norm": 0.7771223783493042, + "learning_rate": 9.991719666334778e-05, + "loss": 2.2747, + "step": 1558 + }, + { + "epoch": 0.4785144260282382, + 
"grad_norm": 0.7564612627029419, + "learning_rate": 9.991691047377588e-05, + "loss": 2.2964, + "step": 1559 + }, + { + "epoch": 0.47882136279926335, + "grad_norm": 0.7877290844917297, + "learning_rate": 9.99166237908958e-05, + "loss": 2.3149, + "step": 1560 + }, + { + "epoch": 0.4791282995702885, + "grad_norm": 0.7967450022697449, + "learning_rate": 9.991633661471039e-05, + "loss": 2.4035, + "step": 1561 + }, + { + "epoch": 0.4794352363413137, + "grad_norm": 0.8993534445762634, + "learning_rate": 9.991604894522248e-05, + "loss": 2.4028, + "step": 1562 + }, + { + "epoch": 0.47974217311233885, + "grad_norm": 0.9135516881942749, + "learning_rate": 9.991576078243494e-05, + "loss": 2.3968, + "step": 1563 + }, + { + "epoch": 0.480049109883364, + "grad_norm": 0.8438525795936584, + "learning_rate": 9.991547212635057e-05, + "loss": 2.3589, + "step": 1564 + }, + { + "epoch": 0.4803560466543892, + "grad_norm": 0.8979686498641968, + "learning_rate": 9.991518297697226e-05, + "loss": 2.3835, + "step": 1565 + }, + { + "epoch": 0.48066298342541436, + "grad_norm": 0.8821539878845215, + "learning_rate": 9.991489333430286e-05, + "loss": 2.3503, + "step": 1566 + }, + { + "epoch": 0.4809699201964395, + "grad_norm": 0.8649077415466309, + "learning_rate": 9.991460319834523e-05, + "loss": 2.3806, + "step": 1567 + }, + { + "epoch": 0.4812768569674647, + "grad_norm": 0.8360965847969055, + "learning_rate": 9.991431256910223e-05, + "loss": 2.3997, + "step": 1568 + }, + { + "epoch": 0.48158379373848986, + "grad_norm": 0.9178828597068787, + "learning_rate": 9.991402144657673e-05, + "loss": 2.3611, + "step": 1569 + }, + { + "epoch": 0.48189073050951503, + "grad_norm": 0.7961607575416565, + "learning_rate": 9.991372983077161e-05, + "loss": 2.3588, + "step": 1570 + }, + { + "epoch": 0.4821976672805402, + "grad_norm": 0.8136993646621704, + "learning_rate": 9.991343772168978e-05, + "loss": 2.3241, + "step": 1571 + }, + { + "epoch": 0.48250460405156537, + "grad_norm": 0.8421273231506348, + 
"learning_rate": 9.991314511933407e-05, + "loss": 2.3493, + "step": 1572 + }, + { + "epoch": 0.48281154082259053, + "grad_norm": 0.774861752986908, + "learning_rate": 9.991285202370743e-05, + "loss": 2.362, + "step": 1573 + }, + { + "epoch": 0.4831184775936157, + "grad_norm": 0.9181589484214783, + "learning_rate": 9.991255843481273e-05, + "loss": 2.443, + "step": 1574 + }, + { + "epoch": 0.48342541436464087, + "grad_norm": 0.873884379863739, + "learning_rate": 9.991226435265286e-05, + "loss": 2.3819, + "step": 1575 + }, + { + "epoch": 0.48373235113566604, + "grad_norm": 0.923200786113739, + "learning_rate": 9.991196977723077e-05, + "loss": 2.4152, + "step": 1576 + }, + { + "epoch": 0.4840392879066912, + "grad_norm": 0.9097923040390015, + "learning_rate": 9.99116747085493e-05, + "loss": 2.4072, + "step": 1577 + }, + { + "epoch": 0.4843462246777164, + "grad_norm": 0.8885805010795593, + "learning_rate": 9.991137914661143e-05, + "loss": 2.3963, + "step": 1578 + }, + { + "epoch": 0.48465316144874154, + "grad_norm": 0.9016655683517456, + "learning_rate": 9.991108309142006e-05, + "loss": 2.4287, + "step": 1579 + }, + { + "epoch": 0.4849600982197667, + "grad_norm": 0.957548201084137, + "learning_rate": 9.99107865429781e-05, + "loss": 2.4306, + "step": 1580 + }, + { + "epoch": 0.4852670349907919, + "grad_norm": 0.9604195356369019, + "learning_rate": 9.99104895012885e-05, + "loss": 2.3721, + "step": 1581 + }, + { + "epoch": 0.48557397176181705, + "grad_norm": 1.0423815250396729, + "learning_rate": 9.991019196635419e-05, + "loss": 2.3847, + "step": 1582 + }, + { + "epoch": 0.4858809085328422, + "grad_norm": 0.9538045525550842, + "learning_rate": 9.990989393817809e-05, + "loss": 2.4307, + "step": 1583 + }, + { + "epoch": 0.4861878453038674, + "grad_norm": 1.0103334188461304, + "learning_rate": 9.990959541676318e-05, + "loss": 2.409, + "step": 1584 + }, + { + "epoch": 0.48649478207489255, + "grad_norm": 1.0780646800994873, + "learning_rate": 9.99092964021124e-05, + "loss": 
2.3314, + "step": 1585 + }, + { + "epoch": 0.4868017188459177, + "grad_norm": 1.0062072277069092, + "learning_rate": 9.99089968942287e-05, + "loss": 2.3922, + "step": 1586 + }, + { + "epoch": 0.4871086556169429, + "grad_norm": 1.0575196743011475, + "learning_rate": 9.990869689311504e-05, + "loss": 2.4156, + "step": 1587 + }, + { + "epoch": 0.48741559238796806, + "grad_norm": 0.9953998923301697, + "learning_rate": 9.990839639877438e-05, + "loss": 2.381, + "step": 1588 + }, + { + "epoch": 0.4877225291589932, + "grad_norm": 0.8848470449447632, + "learning_rate": 9.99080954112097e-05, + "loss": 2.4178, + "step": 1589 + }, + { + "epoch": 0.4880294659300184, + "grad_norm": 0.7849117517471313, + "learning_rate": 9.990779393042397e-05, + "loss": 2.3021, + "step": 1590 + }, + { + "epoch": 0.48833640270104356, + "grad_norm": 0.7611599564552307, + "learning_rate": 9.990749195642016e-05, + "loss": 2.4426, + "step": 1591 + }, + { + "epoch": 0.4886433394720687, + "grad_norm": 0.8361895084381104, + "learning_rate": 9.990718948920127e-05, + "loss": 2.3442, + "step": 1592 + }, + { + "epoch": 0.4889502762430939, + "grad_norm": 0.8249576687812805, + "learning_rate": 9.990688652877028e-05, + "loss": 2.2745, + "step": 1593 + }, + { + "epoch": 0.4892572130141191, + "grad_norm": 0.763889729976654, + "learning_rate": 9.990658307513019e-05, + "loss": 2.3123, + "step": 1594 + }, + { + "epoch": 0.4895641497851443, + "grad_norm": 0.7517281770706177, + "learning_rate": 9.990627912828399e-05, + "loss": 2.3811, + "step": 1595 + }, + { + "epoch": 0.48987108655616945, + "grad_norm": 0.8254112005233765, + "learning_rate": 9.990597468823468e-05, + "loss": 2.4269, + "step": 1596 + }, + { + "epoch": 0.4901780233271946, + "grad_norm": 0.8267236948013306, + "learning_rate": 9.99056697549853e-05, + "loss": 2.354, + "step": 1597 + }, + { + "epoch": 0.4904849600982198, + "grad_norm": 0.8511303067207336, + "learning_rate": 9.990536432853881e-05, + "loss": 2.3755, + "step": 1598 + }, + { + "epoch": 
0.49079189686924496, + "grad_norm": 0.8639636635780334, + "learning_rate": 9.990505840889828e-05, + "loss": 2.3828, + "step": 1599 + }, + { + "epoch": 0.4910988336402701, + "grad_norm": 0.8371795415878296, + "learning_rate": 9.990475199606672e-05, + "loss": 2.4235, + "step": 1600 + }, + { + "epoch": 0.4914057704112953, + "grad_norm": 0.7639186382293701, + "learning_rate": 9.990444509004713e-05, + "loss": 2.3547, + "step": 1601 + }, + { + "epoch": 0.49171270718232046, + "grad_norm": 0.7835492491722107, + "learning_rate": 9.990413769084257e-05, + "loss": 2.2983, + "step": 1602 + }, + { + "epoch": 0.49201964395334563, + "grad_norm": 0.8301565647125244, + "learning_rate": 9.990382979845609e-05, + "loss": 2.4109, + "step": 1603 + }, + { + "epoch": 0.4923265807243708, + "grad_norm": 0.9005976915359497, + "learning_rate": 9.99035214128907e-05, + "loss": 2.3618, + "step": 1604 + }, + { + "epoch": 0.49263351749539597, + "grad_norm": 1.0234936475753784, + "learning_rate": 9.990321253414945e-05, + "loss": 2.4622, + "step": 1605 + }, + { + "epoch": 0.49294045426642114, + "grad_norm": 1.1613819599151611, + "learning_rate": 9.990290316223542e-05, + "loss": 2.3231, + "step": 1606 + }, + { + "epoch": 0.4932473910374463, + "grad_norm": 0.9382983446121216, + "learning_rate": 9.990259329715165e-05, + "loss": 2.357, + "step": 1607 + }, + { + "epoch": 0.49355432780847147, + "grad_norm": 1.0277435779571533, + "learning_rate": 9.990228293890121e-05, + "loss": 2.3497, + "step": 1608 + }, + { + "epoch": 0.49386126457949664, + "grad_norm": 0.9809542894363403, + "learning_rate": 9.990197208748716e-05, + "loss": 2.363, + "step": 1609 + }, + { + "epoch": 0.4941682013505218, + "grad_norm": 1.151412844657898, + "learning_rate": 9.990166074291255e-05, + "loss": 2.4859, + "step": 1610 + }, + { + "epoch": 0.494475138121547, + "grad_norm": 0.9663482308387756, + "learning_rate": 9.990134890518051e-05, + "loss": 2.3848, + "step": 1611 + }, + { + "epoch": 0.49478207489257214, + "grad_norm": 
0.9619266986846924, + "learning_rate": 9.990103657429405e-05, + "loss": 2.3381, + "step": 1612 + }, + { + "epoch": 0.4950890116635973, + "grad_norm": 1.1306475400924683, + "learning_rate": 9.990072375025634e-05, + "loss": 2.3859, + "step": 1613 + }, + { + "epoch": 0.4953959484346225, + "grad_norm": 1.127801537513733, + "learning_rate": 9.990041043307043e-05, + "loss": 2.4259, + "step": 1614 + }, + { + "epoch": 0.49570288520564765, + "grad_norm": 0.9880200624465942, + "learning_rate": 9.990009662273941e-05, + "loss": 2.3629, + "step": 1615 + }, + { + "epoch": 0.4960098219766728, + "grad_norm": 0.940493643283844, + "learning_rate": 9.989978231926636e-05, + "loss": 2.3716, + "step": 1616 + }, + { + "epoch": 0.496316758747698, + "grad_norm": 0.7923702597618103, + "learning_rate": 9.989946752265445e-05, + "loss": 2.3017, + "step": 1617 + }, + { + "epoch": 0.49662369551872315, + "grad_norm": 0.7668408155441284, + "learning_rate": 9.989915223290673e-05, + "loss": 2.3273, + "step": 1618 + }, + { + "epoch": 0.4969306322897483, + "grad_norm": 0.7134098410606384, + "learning_rate": 9.989883645002636e-05, + "loss": 2.302, + "step": 1619 + }, + { + "epoch": 0.4972375690607735, + "grad_norm": 0.6878800392150879, + "learning_rate": 9.989852017401643e-05, + "loss": 2.3047, + "step": 1620 + }, + { + "epoch": 0.49754450583179866, + "grad_norm": 0.8099397420883179, + "learning_rate": 9.989820340488008e-05, + "loss": 2.4747, + "step": 1621 + }, + { + "epoch": 0.4978514426028238, + "grad_norm": 0.9677640795707703, + "learning_rate": 9.989788614262043e-05, + "loss": 2.3347, + "step": 1622 + }, + { + "epoch": 0.498158379373849, + "grad_norm": 0.7592893838882446, + "learning_rate": 9.989756838724064e-05, + "loss": 2.3238, + "step": 1623 + }, + { + "epoch": 0.49846531614487416, + "grad_norm": 0.872529923915863, + "learning_rate": 9.989725013874382e-05, + "loss": 2.4117, + "step": 1624 + }, + { + "epoch": 0.49877225291589933, + "grad_norm": 1.023362159729004, + "learning_rate": 
9.989693139713315e-05, + "loss": 2.3307, + "step": 1625 + }, + { + "epoch": 0.4990791896869245, + "grad_norm": 0.8994693756103516, + "learning_rate": 9.989661216241172e-05, + "loss": 2.3661, + "step": 1626 + }, + { + "epoch": 0.49938612645794966, + "grad_norm": 0.8854429125785828, + "learning_rate": 9.989629243458275e-05, + "loss": 2.311, + "step": 1627 + }, + { + "epoch": 0.49969306322897483, + "grad_norm": 0.8326926231384277, + "learning_rate": 9.989597221364937e-05, + "loss": 2.302, + "step": 1628 + }, + { + "epoch": 0.5, + "grad_norm": 0.8778239488601685, + "learning_rate": 9.989565149961475e-05, + "loss": 2.4653, + "step": 1629 + }, + { + "epoch": 0.5003069367710252, + "grad_norm": 0.9369759559631348, + "learning_rate": 9.989533029248205e-05, + "loss": 2.4165, + "step": 1630 + }, + { + "epoch": 0.5006138735420503, + "grad_norm": 0.8510915637016296, + "learning_rate": 9.989500859225445e-05, + "loss": 2.3345, + "step": 1631 + }, + { + "epoch": 0.5009208103130756, + "grad_norm": 0.787972629070282, + "learning_rate": 9.989468639893513e-05, + "loss": 2.283, + "step": 1632 + }, + { + "epoch": 0.5012277470841007, + "grad_norm": 0.7370568513870239, + "learning_rate": 9.989436371252729e-05, + "loss": 2.2867, + "step": 1633 + }, + { + "epoch": 0.5015346838551259, + "grad_norm": 0.8459502458572388, + "learning_rate": 9.989404053303409e-05, + "loss": 2.2875, + "step": 1634 + }, + { + "epoch": 0.501841620626151, + "grad_norm": 0.9123181700706482, + "learning_rate": 9.989371686045874e-05, + "loss": 2.2653, + "step": 1635 + }, + { + "epoch": 0.5021485573971762, + "grad_norm": 1.1908178329467773, + "learning_rate": 9.989339269480445e-05, + "loss": 2.4849, + "step": 1636 + }, + { + "epoch": 0.5024554941682013, + "grad_norm": 0.8162623643875122, + "learning_rate": 9.989306803607439e-05, + "loss": 2.2409, + "step": 1637 + }, + { + "epoch": 0.5027624309392266, + "grad_norm": 0.9289522171020508, + "learning_rate": 9.98927428842718e-05, + "loss": 2.455, + "step": 1638 + }, + { + 
"epoch": 0.5030693677102517, + "grad_norm": 1.212346076965332, + "learning_rate": 9.989241723939988e-05, + "loss": 2.3461, + "step": 1639 + }, + { + "epoch": 0.5033763044812769, + "grad_norm": 0.8971593976020813, + "learning_rate": 9.989209110146184e-05, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.503683241252302, + "grad_norm": 0.9293156862258911, + "learning_rate": 9.989176447046092e-05, + "loss": 2.3235, + "step": 1641 + }, + { + "epoch": 0.5039901780233272, + "grad_norm": 0.8665596842765808, + "learning_rate": 9.989143734640034e-05, + "loss": 2.4694, + "step": 1642 + }, + { + "epoch": 0.5042971147943524, + "grad_norm": 0.7732648253440857, + "learning_rate": 9.989110972928333e-05, + "loss": 2.1985, + "step": 1643 + }, + { + "epoch": 0.5046040515653776, + "grad_norm": 0.8124692440032959, + "learning_rate": 9.989078161911314e-05, + "loss": 2.315, + "step": 1644 + }, + { + "epoch": 0.5049109883364027, + "grad_norm": 0.8534342050552368, + "learning_rate": 9.989045301589301e-05, + "loss": 2.3491, + "step": 1645 + }, + { + "epoch": 0.5052179251074279, + "grad_norm": 0.8351274132728577, + "learning_rate": 9.989012391962617e-05, + "loss": 2.3416, + "step": 1646 + }, + { + "epoch": 0.505524861878453, + "grad_norm": 0.9143189787864685, + "learning_rate": 9.988979433031588e-05, + "loss": 2.4665, + "step": 1647 + }, + { + "epoch": 0.5058317986494782, + "grad_norm": 0.8978474140167236, + "learning_rate": 9.988946424796542e-05, + "loss": 2.389, + "step": 1648 + }, + { + "epoch": 0.5061387354205034, + "grad_norm": 1.0245648622512817, + "learning_rate": 9.988913367257802e-05, + "loss": 2.3391, + "step": 1649 + }, + { + "epoch": 0.5064456721915286, + "grad_norm": 0.9991573691368103, + "learning_rate": 9.988880260415695e-05, + "loss": 2.405, + "step": 1650 + }, + { + "epoch": 0.5067526089625537, + "grad_norm": 1.042378306388855, + "learning_rate": 9.98884710427055e-05, + "loss": 2.3467, + "step": 1651 + }, + { + "epoch": 0.5070595457335789, + "grad_norm": 
0.9569510817527771, + "learning_rate": 9.988813898822694e-05, + "loss": 2.31, + "step": 1652 + }, + { + "epoch": 0.507366482504604, + "grad_norm": 0.9343158006668091, + "learning_rate": 9.988780644072456e-05, + "loss": 2.3659, + "step": 1653 + }, + { + "epoch": 0.5076734192756293, + "grad_norm": 0.7857093811035156, + "learning_rate": 9.988747340020162e-05, + "loss": 2.3424, + "step": 1654 + }, + { + "epoch": 0.5079803560466544, + "grad_norm": 0.7613041996955872, + "learning_rate": 9.988713986666144e-05, + "loss": 2.2698, + "step": 1655 + }, + { + "epoch": 0.5082872928176796, + "grad_norm": 0.8077516555786133, + "learning_rate": 9.98868058401073e-05, + "loss": 2.3827, + "step": 1656 + }, + { + "epoch": 0.5085942295887047, + "grad_norm": 0.8794304132461548, + "learning_rate": 9.98864713205425e-05, + "loss": 2.3079, + "step": 1657 + }, + { + "epoch": 0.5089011663597299, + "grad_norm": 0.8333674073219299, + "learning_rate": 9.988613630797036e-05, + "loss": 2.3622, + "step": 1658 + }, + { + "epoch": 0.509208103130755, + "grad_norm": 0.9654781222343445, + "learning_rate": 9.988580080239417e-05, + "loss": 2.3979, + "step": 1659 + }, + { + "epoch": 0.5095150399017803, + "grad_norm": 0.9278727769851685, + "learning_rate": 9.988546480381727e-05, + "loss": 2.3728, + "step": 1660 + }, + { + "epoch": 0.5098219766728054, + "grad_norm": 0.7971704006195068, + "learning_rate": 9.988512831224298e-05, + "loss": 2.2983, + "step": 1661 + }, + { + "epoch": 0.5101289134438306, + "grad_norm": 0.8991698026657104, + "learning_rate": 9.988479132767459e-05, + "loss": 2.3992, + "step": 1662 + }, + { + "epoch": 0.5104358502148557, + "grad_norm": 1.0208392143249512, + "learning_rate": 9.988445385011546e-05, + "loss": 2.3847, + "step": 1663 + }, + { + "epoch": 0.5107427869858809, + "grad_norm": 0.878237247467041, + "learning_rate": 9.988411587956891e-05, + "loss": 2.2851, + "step": 1664 + }, + { + "epoch": 0.511049723756906, + "grad_norm": 0.903287410736084, + "learning_rate": 
9.98837774160383e-05, + "loss": 2.4233, + "step": 1665 + }, + { + "epoch": 0.5113566605279313, + "grad_norm": 0.8845674991607666, + "learning_rate": 9.988343845952697e-05, + "loss": 2.2923, + "step": 1666 + }, + { + "epoch": 0.5116635972989564, + "grad_norm": 0.7729392051696777, + "learning_rate": 9.988309901003825e-05, + "loss": 2.3044, + "step": 1667 + }, + { + "epoch": 0.5119705340699816, + "grad_norm": 0.719302237033844, + "learning_rate": 9.988275906757551e-05, + "loss": 2.3207, + "step": 1668 + }, + { + "epoch": 0.5122774708410067, + "grad_norm": 0.7205179333686829, + "learning_rate": 9.988241863214211e-05, + "loss": 2.341, + "step": 1669 + }, + { + "epoch": 0.512584407612032, + "grad_norm": 0.7318145036697388, + "learning_rate": 9.988207770374142e-05, + "loss": 2.3419, + "step": 1670 + }, + { + "epoch": 0.5128913443830571, + "grad_norm": 0.770630955696106, + "learning_rate": 9.98817362823768e-05, + "loss": 2.27, + "step": 1671 + }, + { + "epoch": 0.5131982811540823, + "grad_norm": 0.6485452651977539, + "learning_rate": 9.988139436805162e-05, + "loss": 2.2715, + "step": 1672 + }, + { + "epoch": 0.5135052179251074, + "grad_norm": 0.6990405321121216, + "learning_rate": 9.988105196076925e-05, + "loss": 2.2806, + "step": 1673 + }, + { + "epoch": 0.5138121546961326, + "grad_norm": 0.695818305015564, + "learning_rate": 9.98807090605331e-05, + "loss": 2.3387, + "step": 1674 + }, + { + "epoch": 0.5141190914671577, + "grad_norm": 0.7685426473617554, + "learning_rate": 9.988036566734655e-05, + "loss": 2.2921, + "step": 1675 + }, + { + "epoch": 0.514426028238183, + "grad_norm": 0.6522897481918335, + "learning_rate": 9.988002178121301e-05, + "loss": 2.2507, + "step": 1676 + }, + { + "epoch": 0.5147329650092081, + "grad_norm": 0.7442181706428528, + "learning_rate": 9.987967740213583e-05, + "loss": 2.3292, + "step": 1677 + }, + { + "epoch": 0.5150399017802333, + "grad_norm": 0.8093023300170898, + "learning_rate": 9.987933253011846e-05, + "loss": 2.3384, + "step": 1678 + }, 
+ { + "epoch": 0.5153468385512584, + "grad_norm": 0.8014655113220215, + "learning_rate": 9.987898716516428e-05, + "loss": 2.3619, + "step": 1679 + }, + { + "epoch": 0.5156537753222836, + "grad_norm": 0.8230258822441101, + "learning_rate": 9.987864130727671e-05, + "loss": 2.3242, + "step": 1680 + }, + { + "epoch": 0.5159607120933087, + "grad_norm": 0.9222247004508972, + "learning_rate": 9.987829495645918e-05, + "loss": 2.3907, + "step": 1681 + }, + { + "epoch": 0.516267648864334, + "grad_norm": 0.9293351769447327, + "learning_rate": 9.987794811271511e-05, + "loss": 2.3632, + "step": 1682 + }, + { + "epoch": 0.5165745856353591, + "grad_norm": 0.9555168747901917, + "learning_rate": 9.987760077604791e-05, + "loss": 2.3273, + "step": 1683 + }, + { + "epoch": 0.5168815224063843, + "grad_norm": 0.9839370250701904, + "learning_rate": 9.987725294646102e-05, + "loss": 2.3451, + "step": 1684 + }, + { + "epoch": 0.5171884591774094, + "grad_norm": 1.097970962524414, + "learning_rate": 9.987690462395791e-05, + "loss": 2.308, + "step": 1685 + }, + { + "epoch": 0.5174953959484346, + "grad_norm": 0.9345484972000122, + "learning_rate": 9.987655580854198e-05, + "loss": 2.3051, + "step": 1686 + }, + { + "epoch": 0.5178023327194597, + "grad_norm": 0.8075851798057556, + "learning_rate": 9.987620650021668e-05, + "loss": 2.3005, + "step": 1687 + }, + { + "epoch": 0.518109269490485, + "grad_norm": 0.7287935614585876, + "learning_rate": 9.987585669898549e-05, + "loss": 2.3709, + "step": 1688 + }, + { + "epoch": 0.5184162062615101, + "grad_norm": 0.7611173987388611, + "learning_rate": 9.987550640485184e-05, + "loss": 2.3265, + "step": 1689 + }, + { + "epoch": 0.5187231430325353, + "grad_norm": 0.7932588458061218, + "learning_rate": 9.987515561781921e-05, + "loss": 2.3625, + "step": 1690 + }, + { + "epoch": 0.5190300798035604, + "grad_norm": 0.7837479114532471, + "learning_rate": 9.987480433789106e-05, + "loss": 2.2614, + "step": 1691 + }, + { + "epoch": 0.5193370165745856, + "grad_norm": 
0.905799925327301, + "learning_rate": 9.987445256507085e-05, + "loss": 2.2915, + "step": 1692 + }, + { + "epoch": 0.5196439533456108, + "grad_norm": 0.9417183995246887, + "learning_rate": 9.987410029936208e-05, + "loss": 2.3624, + "step": 1693 + }, + { + "epoch": 0.519950890116636, + "grad_norm": 0.9971327185630798, + "learning_rate": 9.987374754076822e-05, + "loss": 2.3913, + "step": 1694 + }, + { + "epoch": 0.5202578268876611, + "grad_norm": 0.8719072341918945, + "learning_rate": 9.987339428929274e-05, + "loss": 2.3412, + "step": 1695 + }, + { + "epoch": 0.5205647636586863, + "grad_norm": 0.8198116421699524, + "learning_rate": 9.987304054493916e-05, + "loss": 2.333, + "step": 1696 + }, + { + "epoch": 0.5208717004297114, + "grad_norm": 0.7450931668281555, + "learning_rate": 9.987268630771096e-05, + "loss": 2.2817, + "step": 1697 + }, + { + "epoch": 0.5211786372007366, + "grad_norm": 0.6867587566375732, + "learning_rate": 9.987233157761164e-05, + "loss": 2.3456, + "step": 1698 + }, + { + "epoch": 0.5214855739717618, + "grad_norm": 0.7537778615951538, + "learning_rate": 9.987197635464471e-05, + "loss": 2.176, + "step": 1699 + }, + { + "epoch": 0.521792510742787, + "grad_norm": 0.8347577452659607, + "learning_rate": 9.987162063881366e-05, + "loss": 2.3296, + "step": 1700 + }, + { + "epoch": 0.5220994475138122, + "grad_norm": 0.8714643120765686, + "learning_rate": 9.987126443012205e-05, + "loss": 2.3648, + "step": 1701 + }, + { + "epoch": 0.5224063842848373, + "grad_norm": 0.8579849004745483, + "learning_rate": 9.987090772857336e-05, + "loss": 2.4189, + "step": 1702 + }, + { + "epoch": 0.5227133210558625, + "grad_norm": 0.8651238083839417, + "learning_rate": 9.987055053417114e-05, + "loss": 2.3036, + "step": 1703 + }, + { + "epoch": 0.5230202578268877, + "grad_norm": 0.8447873592376709, + "learning_rate": 9.98701928469189e-05, + "loss": 2.3243, + "step": 1704 + }, + { + "epoch": 0.5233271945979129, + "grad_norm": 0.8218941688537598, + "learning_rate": 
9.986983466682019e-05, + "loss": 2.3888, + "step": 1705 + }, + { + "epoch": 0.523634131368938, + "grad_norm": 0.7862920761108398, + "learning_rate": 9.986947599387855e-05, + "loss": 2.335, + "step": 1706 + }, + { + "epoch": 0.5239410681399632, + "grad_norm": 0.8096200227737427, + "learning_rate": 9.986911682809749e-05, + "loss": 2.4034, + "step": 1707 + }, + { + "epoch": 0.5242480049109883, + "grad_norm": 0.8217427730560303, + "learning_rate": 9.986875716948062e-05, + "loss": 2.2659, + "step": 1708 + }, + { + "epoch": 0.5245549416820136, + "grad_norm": 0.7676928043365479, + "learning_rate": 9.986839701803146e-05, + "loss": 2.2736, + "step": 1709 + }, + { + "epoch": 0.5248618784530387, + "grad_norm": 0.7783572673797607, + "learning_rate": 9.986803637375356e-05, + "loss": 2.3611, + "step": 1710 + }, + { + "epoch": 0.5251688152240639, + "grad_norm": 0.7657338380813599, + "learning_rate": 9.98676752366505e-05, + "loss": 2.3573, + "step": 1711 + }, + { + "epoch": 0.525475751995089, + "grad_norm": 0.8946976065635681, + "learning_rate": 9.986731360672585e-05, + "loss": 2.3443, + "step": 1712 + }, + { + "epoch": 0.5257826887661142, + "grad_norm": 0.8047227263450623, + "learning_rate": 9.986695148398318e-05, + "loss": 2.345, + "step": 1713 + }, + { + "epoch": 0.5260896255371393, + "grad_norm": 0.8407939672470093, + "learning_rate": 9.986658886842605e-05, + "loss": 2.2828, + "step": 1714 + }, + { + "epoch": 0.5263965623081646, + "grad_norm": 0.8460215330123901, + "learning_rate": 9.986622576005806e-05, + "loss": 2.2786, + "step": 1715 + }, + { + "epoch": 0.5267034990791897, + "grad_norm": 0.8291949033737183, + "learning_rate": 9.986586215888283e-05, + "loss": 2.3491, + "step": 1716 + }, + { + "epoch": 0.5270104358502149, + "grad_norm": 0.8812628388404846, + "learning_rate": 9.98654980649039e-05, + "loss": 2.3392, + "step": 1717 + }, + { + "epoch": 0.52731737262124, + "grad_norm": 0.8666933178901672, + "learning_rate": 9.98651334781249e-05, + "loss": 2.2585, + "step": 1718 + 
}, + { + "epoch": 0.5276243093922652, + "grad_norm": 0.8393275737762451, + "learning_rate": 9.986476839854941e-05, + "loss": 2.3315, + "step": 1719 + }, + { + "epoch": 0.5279312461632903, + "grad_norm": 0.8431777954101562, + "learning_rate": 9.986440282618105e-05, + "loss": 2.268, + "step": 1720 + }, + { + "epoch": 0.5282381829343156, + "grad_norm": 0.8020747900009155, + "learning_rate": 9.986403676102346e-05, + "loss": 2.2306, + "step": 1721 + }, + { + "epoch": 0.5285451197053407, + "grad_norm": 0.817395806312561, + "learning_rate": 9.986367020308022e-05, + "loss": 2.2914, + "step": 1722 + }, + { + "epoch": 0.5288520564763659, + "grad_norm": 0.8034493327140808, + "learning_rate": 9.986330315235497e-05, + "loss": 2.3598, + "step": 1723 + }, + { + "epoch": 0.529158993247391, + "grad_norm": 0.9001252055168152, + "learning_rate": 9.986293560885131e-05, + "loss": 2.3456, + "step": 1724 + }, + { + "epoch": 0.5294659300184162, + "grad_norm": 0.9782349467277527, + "learning_rate": 9.986256757257293e-05, + "loss": 2.231, + "step": 1725 + }, + { + "epoch": 0.5297728667894414, + "grad_norm": 1.0022578239440918, + "learning_rate": 9.98621990435234e-05, + "loss": 2.3457, + "step": 1726 + }, + { + "epoch": 0.5300798035604666, + "grad_norm": 1.0705206394195557, + "learning_rate": 9.986183002170642e-05, + "loss": 2.2775, + "step": 1727 + }, + { + "epoch": 0.5303867403314917, + "grad_norm": 0.8464064598083496, + "learning_rate": 9.98614605071256e-05, + "loss": 2.4006, + "step": 1728 + }, + { + "epoch": 0.5306936771025169, + "grad_norm": 0.7128132581710815, + "learning_rate": 9.98610904997846e-05, + "loss": 2.3273, + "step": 1729 + }, + { + "epoch": 0.531000613873542, + "grad_norm": 0.8113927245140076, + "learning_rate": 9.986071999968706e-05, + "loss": 2.3467, + "step": 1730 + }, + { + "epoch": 0.5313075506445673, + "grad_norm": 0.9236831665039062, + "learning_rate": 9.986034900683669e-05, + "loss": 2.3815, + "step": 1731 + }, + { + "epoch": 0.5316144874155924, + "grad_norm": 
0.9325668811798096, + "learning_rate": 9.985997752123713e-05, + "loss": 2.3411, + "step": 1732 + }, + { + "epoch": 0.5319214241866176, + "grad_norm": 0.9585117101669312, + "learning_rate": 9.985960554289203e-05, + "loss": 2.3309, + "step": 1733 + }, + { + "epoch": 0.5322283609576427, + "grad_norm": 0.9459986686706543, + "learning_rate": 9.98592330718051e-05, + "loss": 2.3525, + "step": 1734 + }, + { + "epoch": 0.5325352977286679, + "grad_norm": 0.971592366695404, + "learning_rate": 9.985886010797997e-05, + "loss": 2.3665, + "step": 1735 + }, + { + "epoch": 0.532842234499693, + "grad_norm": 0.8533779978752136, + "learning_rate": 9.985848665142039e-05, + "loss": 2.26, + "step": 1736 + }, + { + "epoch": 0.5331491712707183, + "grad_norm": 0.8224228620529175, + "learning_rate": 9.985811270213002e-05, + "loss": 2.3523, + "step": 1737 + }, + { + "epoch": 0.5334561080417434, + "grad_norm": 0.8649810552597046, + "learning_rate": 9.985773826011255e-05, + "loss": 2.3262, + "step": 1738 + }, + { + "epoch": 0.5337630448127686, + "grad_norm": 0.8099339604377747, + "learning_rate": 9.98573633253717e-05, + "loss": 2.3038, + "step": 1739 + }, + { + "epoch": 0.5340699815837937, + "grad_norm": 0.6788219213485718, + "learning_rate": 9.985698789791115e-05, + "loss": 2.3278, + "step": 1740 + }, + { + "epoch": 0.5343769183548189, + "grad_norm": 0.8716040253639221, + "learning_rate": 9.985661197773464e-05, + "loss": 2.2955, + "step": 1741 + }, + { + "epoch": 0.534683855125844, + "grad_norm": 0.8377614617347717, + "learning_rate": 9.985623556484587e-05, + "loss": 2.2801, + "step": 1742 + }, + { + "epoch": 0.5349907918968693, + "grad_norm": 0.8452683091163635, + "learning_rate": 9.985585865924853e-05, + "loss": 2.3313, + "step": 1743 + }, + { + "epoch": 0.5352977286678944, + "grad_norm": 0.8226203918457031, + "learning_rate": 9.98554812609464e-05, + "loss": 2.3464, + "step": 1744 + }, + { + "epoch": 0.5356046654389196, + "grad_norm": 0.7476974725723267, + "learning_rate": 
9.985510336994316e-05, + "loss": 2.3721, + "step": 1745 + }, + { + "epoch": 0.5359116022099447, + "grad_norm": 0.7132230997085571, + "learning_rate": 9.98547249862426e-05, + "loss": 2.2657, + "step": 1746 + }, + { + "epoch": 0.5362185389809699, + "grad_norm": 0.7022002339363098, + "learning_rate": 9.98543461098484e-05, + "loss": 2.2656, + "step": 1747 + }, + { + "epoch": 0.536525475751995, + "grad_norm": 0.7174789309501648, + "learning_rate": 9.985396674076435e-05, + "loss": 2.2914, + "step": 1748 + }, + { + "epoch": 0.5368324125230203, + "grad_norm": 0.78509920835495, + "learning_rate": 9.985358687899417e-05, + "loss": 2.3155, + "step": 1749 + }, + { + "epoch": 0.5371393492940454, + "grad_norm": 0.7670894861221313, + "learning_rate": 9.985320652454162e-05, + "loss": 2.2608, + "step": 1750 + }, + { + "epoch": 0.5374462860650706, + "grad_norm": 0.6196603178977966, + "learning_rate": 9.985282567741047e-05, + "loss": 2.2796, + "step": 1751 + }, + { + "epoch": 0.5377532228360957, + "grad_norm": 0.7119829058647156, + "learning_rate": 9.985244433760448e-05, + "loss": 2.2262, + "step": 1752 + }, + { + "epoch": 0.538060159607121, + "grad_norm": 0.6665359735488892, + "learning_rate": 9.98520625051274e-05, + "loss": 2.2714, + "step": 1753 + }, + { + "epoch": 0.5383670963781461, + "grad_norm": 0.7960934042930603, + "learning_rate": 9.985168017998303e-05, + "loss": 2.3703, + "step": 1754 + }, + { + "epoch": 0.5386740331491713, + "grad_norm": 0.9428521394729614, + "learning_rate": 9.985129736217513e-05, + "loss": 2.3334, + "step": 1755 + }, + { + "epoch": 0.5389809699201964, + "grad_norm": 0.9900842905044556, + "learning_rate": 9.985091405170751e-05, + "loss": 2.2369, + "step": 1756 + }, + { + "epoch": 0.5392879066912216, + "grad_norm": 0.9340593814849854, + "learning_rate": 9.985053024858393e-05, + "loss": 2.4332, + "step": 1757 + }, + { + "epoch": 0.5395948434622467, + "grad_norm": 0.9241896271705627, + "learning_rate": 9.985014595280818e-05, + "loss": 2.3484, + "step": 1758 
+ }, + { + "epoch": 0.539901780233272, + "grad_norm": 0.7724506258964539, + "learning_rate": 9.984976116438408e-05, + "loss": 2.282, + "step": 1759 + }, + { + "epoch": 0.5402087170042971, + "grad_norm": 0.9098101854324341, + "learning_rate": 9.984937588331543e-05, + "loss": 2.3039, + "step": 1760 + }, + { + "epoch": 0.5405156537753223, + "grad_norm": 0.9430370330810547, + "learning_rate": 9.984899010960601e-05, + "loss": 2.2555, + "step": 1761 + }, + { + "epoch": 0.5408225905463474, + "grad_norm": 0.8927021026611328, + "learning_rate": 9.984860384325965e-05, + "loss": 2.3034, + "step": 1762 + }, + { + "epoch": 0.5411295273173726, + "grad_norm": 0.8331896662712097, + "learning_rate": 9.98482170842802e-05, + "loss": 2.3341, + "step": 1763 + }, + { + "epoch": 0.5414364640883977, + "grad_norm": 0.8311246633529663, + "learning_rate": 9.984782983267142e-05, + "loss": 2.3913, + "step": 1764 + }, + { + "epoch": 0.541743400859423, + "grad_norm": 0.7459335923194885, + "learning_rate": 9.98474420884372e-05, + "loss": 2.2912, + "step": 1765 + }, + { + "epoch": 0.5420503376304481, + "grad_norm": 0.84760981798172, + "learning_rate": 9.984705385158131e-05, + "loss": 2.316, + "step": 1766 + }, + { + "epoch": 0.5423572744014733, + "grad_norm": 0.888793408870697, + "learning_rate": 9.984666512210762e-05, + "loss": 2.3452, + "step": 1767 + }, + { + "epoch": 0.5426642111724984, + "grad_norm": 0.7977499961853027, + "learning_rate": 9.984627590001999e-05, + "loss": 2.3325, + "step": 1768 + }, + { + "epoch": 0.5429711479435236, + "grad_norm": 0.8059934377670288, + "learning_rate": 9.984588618532224e-05, + "loss": 2.3347, + "step": 1769 + }, + { + "epoch": 0.5432780847145487, + "grad_norm": 0.8190197348594666, + "learning_rate": 9.984549597801822e-05, + "loss": 2.3446, + "step": 1770 + }, + { + "epoch": 0.543585021485574, + "grad_norm": 0.774773895740509, + "learning_rate": 9.98451052781118e-05, + "loss": 2.2598, + "step": 1771 + }, + { + "epoch": 0.5438919582565992, + "grad_norm": 
0.7341485023498535, + "learning_rate": 9.984471408560682e-05, + "loss": 2.2728, + "step": 1772 + }, + { + "epoch": 0.5441988950276243, + "grad_norm": 0.6881145238876343, + "learning_rate": 9.984432240050719e-05, + "loss": 2.2922, + "step": 1773 + }, + { + "epoch": 0.5445058317986495, + "grad_norm": 0.6896151304244995, + "learning_rate": 9.984393022281673e-05, + "loss": 2.2915, + "step": 1774 + }, + { + "epoch": 0.5448127685696746, + "grad_norm": 0.6902059316635132, + "learning_rate": 9.984353755253932e-05, + "loss": 2.31, + "step": 1775 + }, + { + "epoch": 0.5451197053406999, + "grad_norm": 0.7594140768051147, + "learning_rate": 9.984314438967888e-05, + "loss": 2.3092, + "step": 1776 + }, + { + "epoch": 0.545426642111725, + "grad_norm": 0.8682328462600708, + "learning_rate": 9.984275073423927e-05, + "loss": 2.2851, + "step": 1777 + }, + { + "epoch": 0.5457335788827502, + "grad_norm": 0.8747107982635498, + "learning_rate": 9.98423565862244e-05, + "loss": 2.2927, + "step": 1778 + }, + { + "epoch": 0.5460405156537753, + "grad_norm": 0.9824326038360596, + "learning_rate": 9.984196194563813e-05, + "loss": 2.3622, + "step": 1779 + }, + { + "epoch": 0.5463474524248005, + "grad_norm": 1.0006790161132812, + "learning_rate": 9.984156681248438e-05, + "loss": 2.2531, + "step": 1780 + }, + { + "epoch": 0.5466543891958257, + "grad_norm": 0.9501944184303284, + "learning_rate": 9.984117118676705e-05, + "loss": 2.3902, + "step": 1781 + }, + { + "epoch": 0.5469613259668509, + "grad_norm": 0.7835353016853333, + "learning_rate": 9.984077506849005e-05, + "loss": 2.2754, + "step": 1782 + }, + { + "epoch": 0.547268262737876, + "grad_norm": 0.7310026288032532, + "learning_rate": 9.984037845765732e-05, + "loss": 2.2742, + "step": 1783 + }, + { + "epoch": 0.5475751995089012, + "grad_norm": 0.9469361901283264, + "learning_rate": 9.983998135427275e-05, + "loss": 2.4026, + "step": 1784 + }, + { + "epoch": 0.5478821362799263, + "grad_norm": 1.0639240741729736, + "learning_rate": 
9.983958375834025e-05, + "loss": 2.3522, + "step": 1785 + }, + { + "epoch": 0.5481890730509515, + "grad_norm": 0.7771989703178406, + "learning_rate": 9.983918566986379e-05, + "loss": 2.216, + "step": 1786 + }, + { + "epoch": 0.5484960098219767, + "grad_norm": 0.6809307932853699, + "learning_rate": 9.983878708884728e-05, + "loss": 2.256, + "step": 1787 + }, + { + "epoch": 0.5488029465930019, + "grad_norm": 0.7300165891647339, + "learning_rate": 9.983838801529469e-05, + "loss": 2.3156, + "step": 1788 + }, + { + "epoch": 0.549109883364027, + "grad_norm": 0.8352389335632324, + "learning_rate": 9.98379884492099e-05, + "loss": 2.3344, + "step": 1789 + }, + { + "epoch": 0.5494168201350522, + "grad_norm": 0.830585777759552, + "learning_rate": 9.983758839059692e-05, + "loss": 2.3076, + "step": 1790 + }, + { + "epoch": 0.5497237569060773, + "grad_norm": 0.7384640574455261, + "learning_rate": 9.983718783945968e-05, + "loss": 2.2387, + "step": 1791 + }, + { + "epoch": 0.5500306936771026, + "grad_norm": 0.7133243083953857, + "learning_rate": 9.983678679580213e-05, + "loss": 2.2933, + "step": 1792 + }, + { + "epoch": 0.5503376304481277, + "grad_norm": 0.8462459444999695, + "learning_rate": 9.983638525962823e-05, + "loss": 2.3294, + "step": 1793 + }, + { + "epoch": 0.5506445672191529, + "grad_norm": 0.7841110825538635, + "learning_rate": 9.983598323094199e-05, + "loss": 2.3156, + "step": 1794 + }, + { + "epoch": 0.550951503990178, + "grad_norm": 0.8454114198684692, + "learning_rate": 9.983558070974735e-05, + "loss": 2.2203, + "step": 1795 + }, + { + "epoch": 0.5512584407612032, + "grad_norm": 0.7741531729698181, + "learning_rate": 9.983517769604826e-05, + "loss": 2.2585, + "step": 1796 + }, + { + "epoch": 0.5515653775322283, + "grad_norm": 0.717714250087738, + "learning_rate": 9.983477418984876e-05, + "loss": 2.3127, + "step": 1797 + }, + { + "epoch": 0.5518723143032536, + "grad_norm": 0.7546361088752747, + "learning_rate": 9.983437019115283e-05, + "loss": 2.2591, + "step": 1798 
+ }, + { + "epoch": 0.5521792510742787, + "grad_norm": 0.7947681546211243, + "learning_rate": 9.983396569996442e-05, + "loss": 2.337, + "step": 1799 + }, + { + "epoch": 0.5524861878453039, + "grad_norm": 0.9286270141601562, + "learning_rate": 9.983356071628756e-05, + "loss": 2.371, + "step": 1800 + }, + { + "epoch": 0.552793124616329, + "grad_norm": 1.0236682891845703, + "learning_rate": 9.983315524012625e-05, + "loss": 2.2673, + "step": 1801 + }, + { + "epoch": 0.5531000613873542, + "grad_norm": 1.043534278869629, + "learning_rate": 9.983274927148447e-05, + "loss": 2.3204, + "step": 1802 + }, + { + "epoch": 0.5534069981583793, + "grad_norm": 0.9694257378578186, + "learning_rate": 9.983234281036626e-05, + "loss": 2.2642, + "step": 1803 + }, + { + "epoch": 0.5537139349294046, + "grad_norm": 0.8890992403030396, + "learning_rate": 9.983193585677563e-05, + "loss": 2.2546, + "step": 1804 + }, + { + "epoch": 0.5540208717004297, + "grad_norm": 0.8109140396118164, + "learning_rate": 9.983152841071662e-05, + "loss": 2.3088, + "step": 1805 + }, + { + "epoch": 0.5543278084714549, + "grad_norm": 0.7762413620948792, + "learning_rate": 9.983112047219323e-05, + "loss": 2.2277, + "step": 1806 + }, + { + "epoch": 0.55463474524248, + "grad_norm": 0.7949336767196655, + "learning_rate": 9.983071204120951e-05, + "loss": 2.3004, + "step": 1807 + }, + { + "epoch": 0.5549416820135052, + "grad_norm": 0.9118300080299377, + "learning_rate": 9.983030311776946e-05, + "loss": 2.3986, + "step": 1808 + }, + { + "epoch": 0.5552486187845304, + "grad_norm": 0.874891996383667, + "learning_rate": 9.982989370187717e-05, + "loss": 2.2721, + "step": 1809 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.8089940547943115, + "learning_rate": 9.982948379353667e-05, + "loss": 2.2846, + "step": 1810 + }, + { + "epoch": 0.5558624923265807, + "grad_norm": 0.7407395839691162, + "learning_rate": 9.982907339275198e-05, + "loss": 2.2848, + "step": 1811 + }, + { + "epoch": 0.5561694290976059, + "grad_norm": 
0.7487329244613647, + "learning_rate": 9.982866249952721e-05, + "loss": 2.266, + "step": 1812 + }, + { + "epoch": 0.556476365868631, + "grad_norm": 0.7910557389259338, + "learning_rate": 9.982825111386638e-05, + "loss": 2.2975, + "step": 1813 + }, + { + "epoch": 0.5567833026396563, + "grad_norm": 0.767186164855957, + "learning_rate": 9.982783923577356e-05, + "loss": 2.2867, + "step": 1814 + }, + { + "epoch": 0.5570902394106814, + "grad_norm": 0.7296959757804871, + "learning_rate": 9.982742686525284e-05, + "loss": 2.2167, + "step": 1815 + }, + { + "epoch": 0.5573971761817066, + "grad_norm": 0.6536411643028259, + "learning_rate": 9.982701400230827e-05, + "loss": 2.2278, + "step": 1816 + }, + { + "epoch": 0.5577041129527317, + "grad_norm": 0.7393643260002136, + "learning_rate": 9.982660064694394e-05, + "loss": 2.3275, + "step": 1817 + }, + { + "epoch": 0.5580110497237569, + "grad_norm": 0.7837240099906921, + "learning_rate": 9.982618679916396e-05, + "loss": 2.3516, + "step": 1818 + }, + { + "epoch": 0.558317986494782, + "grad_norm": 0.8186847567558289, + "learning_rate": 9.982577245897238e-05, + "loss": 2.4104, + "step": 1819 + }, + { + "epoch": 0.5586249232658073, + "grad_norm": 0.733651340007782, + "learning_rate": 9.98253576263733e-05, + "loss": 2.2151, + "step": 1820 + }, + { + "epoch": 0.5589318600368324, + "grad_norm": 0.7452411651611328, + "learning_rate": 9.982494230137086e-05, + "loss": 2.3288, + "step": 1821 + }, + { + "epoch": 0.5592387968078576, + "grad_norm": 0.7369456887245178, + "learning_rate": 9.982452648396913e-05, + "loss": 2.3023, + "step": 1822 + }, + { + "epoch": 0.5595457335788827, + "grad_norm": 0.794789731502533, + "learning_rate": 9.982411017417222e-05, + "loss": 2.2774, + "step": 1823 + }, + { + "epoch": 0.5598526703499079, + "grad_norm": 0.7677412033081055, + "learning_rate": 9.982369337198425e-05, + "loss": 2.3213, + "step": 1824 + }, + { + "epoch": 0.560159607120933, + "grad_norm": 0.8195241689682007, + "learning_rate": 
9.982327607740934e-05, + "loss": 2.3721, + "step": 1825 + }, + { + "epoch": 0.5604665438919583, + "grad_norm": 0.867115318775177, + "learning_rate": 9.982285829045162e-05, + "loss": 2.3653, + "step": 1826 + }, + { + "epoch": 0.5607734806629834, + "grad_norm": 0.8519865870475769, + "learning_rate": 9.98224400111152e-05, + "loss": 2.3646, + "step": 1827 + }, + { + "epoch": 0.5610804174340086, + "grad_norm": 0.9408721923828125, + "learning_rate": 9.982202123940425e-05, + "loss": 2.2051, + "step": 1828 + }, + { + "epoch": 0.5613873542050337, + "grad_norm": 0.985325813293457, + "learning_rate": 9.982160197532287e-05, + "loss": 2.3402, + "step": 1829 + }, + { + "epoch": 0.5616942909760589, + "grad_norm": 1.018094539642334, + "learning_rate": 9.982118221887521e-05, + "loss": 2.2712, + "step": 1830 + }, + { + "epoch": 0.562001227747084, + "grad_norm": 0.9246920347213745, + "learning_rate": 9.982076197006543e-05, + "loss": 2.3808, + "step": 1831 + }, + { + "epoch": 0.5623081645181093, + "grad_norm": 0.8519729971885681, + "learning_rate": 9.982034122889768e-05, + "loss": 2.3774, + "step": 1832 + }, + { + "epoch": 0.5626151012891344, + "grad_norm": 0.801567018032074, + "learning_rate": 9.981991999537612e-05, + "loss": 2.2713, + "step": 1833 + }, + { + "epoch": 0.5629220380601596, + "grad_norm": 0.7212518453598022, + "learning_rate": 9.981949826950492e-05, + "loss": 2.1902, + "step": 1834 + }, + { + "epoch": 0.5632289748311847, + "grad_norm": 0.7644798755645752, + "learning_rate": 9.981907605128822e-05, + "loss": 2.2751, + "step": 1835 + }, + { + "epoch": 0.56353591160221, + "grad_norm": 0.7941999435424805, + "learning_rate": 9.981865334073022e-05, + "loss": 2.2991, + "step": 1836 + }, + { + "epoch": 0.5638428483732351, + "grad_norm": 0.7274888753890991, + "learning_rate": 9.981823013783508e-05, + "loss": 2.3536, + "step": 1837 + }, + { + "epoch": 0.5641497851442603, + "grad_norm": 0.845024585723877, + "learning_rate": 9.9817806442607e-05, + "loss": 2.2796, + "step": 1838 + }, 
+ { + "epoch": 0.5644567219152854, + "grad_norm": 0.8225597739219666, + "learning_rate": 9.981738225505015e-05, + "loss": 2.3339, + "step": 1839 + }, + { + "epoch": 0.5647636586863106, + "grad_norm": 0.8456425070762634, + "learning_rate": 9.981695757516873e-05, + "loss": 2.2583, + "step": 1840 + }, + { + "epoch": 0.5650705954573357, + "grad_norm": 1.0066497325897217, + "learning_rate": 9.981653240296695e-05, + "loss": 2.3628, + "step": 1841 + }, + { + "epoch": 0.565377532228361, + "grad_norm": 0.9574379920959473, + "learning_rate": 9.981610673844899e-05, + "loss": 2.306, + "step": 1842 + }, + { + "epoch": 0.5656844689993862, + "grad_norm": 0.7427437901496887, + "learning_rate": 9.981568058161905e-05, + "loss": 2.267, + "step": 1843 + }, + { + "epoch": 0.5659914057704113, + "grad_norm": 0.6984857320785522, + "learning_rate": 9.981525393248138e-05, + "loss": 2.2095, + "step": 1844 + }, + { + "epoch": 0.5662983425414365, + "grad_norm": 0.748062789440155, + "learning_rate": 9.981482679104016e-05, + "loss": 2.211, + "step": 1845 + }, + { + "epoch": 0.5666052793124616, + "grad_norm": 0.7978217005729675, + "learning_rate": 9.981439915729964e-05, + "loss": 2.2437, + "step": 1846 + }, + { + "epoch": 0.5669122160834869, + "grad_norm": 0.807849109172821, + "learning_rate": 9.981397103126401e-05, + "loss": 2.3063, + "step": 1847 + }, + { + "epoch": 0.567219152854512, + "grad_norm": 0.8626619577407837, + "learning_rate": 9.981354241293752e-05, + "loss": 2.3616, + "step": 1848 + }, + { + "epoch": 0.5675260896255372, + "grad_norm": 0.8991526961326599, + "learning_rate": 9.981311330232442e-05, + "loss": 2.2355, + "step": 1849 + }, + { + "epoch": 0.5678330263965623, + "grad_norm": 0.7399953007698059, + "learning_rate": 9.981268369942894e-05, + "loss": 2.2452, + "step": 1850 + }, + { + "epoch": 0.5681399631675875, + "grad_norm": 0.7787104845046997, + "learning_rate": 9.981225360425533e-05, + "loss": 2.4141, + "step": 1851 + }, + { + "epoch": 0.5684468999386126, + "grad_norm": 
0.8570892214775085, + "learning_rate": 9.98118230168078e-05, + "loss": 2.2487, + "step": 1852 + }, + { + "epoch": 0.5687538367096379, + "grad_norm": 0.8277538418769836, + "learning_rate": 9.981139193709068e-05, + "loss": 2.2602, + "step": 1853 + }, + { + "epoch": 0.569060773480663, + "grad_norm": 0.7638106942176819, + "learning_rate": 9.981096036510817e-05, + "loss": 2.2886, + "step": 1854 + }, + { + "epoch": 0.5693677102516882, + "grad_norm": 0.8480616807937622, + "learning_rate": 9.981052830086454e-05, + "loss": 2.2893, + "step": 1855 + }, + { + "epoch": 0.5696746470227133, + "grad_norm": 0.8568599820137024, + "learning_rate": 9.98100957443641e-05, + "loss": 2.3802, + "step": 1856 + }, + { + "epoch": 0.5699815837937385, + "grad_norm": 0.7863987684249878, + "learning_rate": 9.98096626956111e-05, + "loss": 2.2996, + "step": 1857 + }, + { + "epoch": 0.5702885205647636, + "grad_norm": 0.7636334896087646, + "learning_rate": 9.980922915460979e-05, + "loss": 2.2569, + "step": 1858 + }, + { + "epoch": 0.5705954573357889, + "grad_norm": 0.7514677047729492, + "learning_rate": 9.98087951213645e-05, + "loss": 2.3317, + "step": 1859 + }, + { + "epoch": 0.570902394106814, + "grad_norm": 0.717637300491333, + "learning_rate": 9.980836059587951e-05, + "loss": 2.2855, + "step": 1860 + }, + { + "epoch": 0.5712093308778392, + "grad_norm": 0.728518545627594, + "learning_rate": 9.98079255781591e-05, + "loss": 2.3166, + "step": 1861 + }, + { + "epoch": 0.5715162676488643, + "grad_norm": 0.7158043384552002, + "learning_rate": 9.980749006820757e-05, + "loss": 2.2639, + "step": 1862 + }, + { + "epoch": 0.5718232044198895, + "grad_norm": 0.7565107941627502, + "learning_rate": 9.980705406602924e-05, + "loss": 2.2833, + "step": 1863 + }, + { + "epoch": 0.5721301411909147, + "grad_norm": 0.7873388528823853, + "learning_rate": 9.980661757162841e-05, + "loss": 2.201, + "step": 1864 + }, + { + "epoch": 0.5724370779619399, + "grad_norm": 0.7818259596824646, + "learning_rate": 
9.980618058500939e-05, + "loss": 2.242, + "step": 1865 + }, + { + "epoch": 0.572744014732965, + "grad_norm": 0.7464665770530701, + "learning_rate": 9.98057431061765e-05, + "loss": 2.2325, + "step": 1866 + }, + { + "epoch": 0.5730509515039902, + "grad_norm": 0.7778184413909912, + "learning_rate": 9.980530513513406e-05, + "loss": 2.3258, + "step": 1867 + }, + { + "epoch": 0.5733578882750153, + "grad_norm": 0.825661301612854, + "learning_rate": 9.980486667188642e-05, + "loss": 2.3477, + "step": 1868 + }, + { + "epoch": 0.5736648250460405, + "grad_norm": 0.8448848724365234, + "learning_rate": 9.980442771643788e-05, + "loss": 2.3523, + "step": 1869 + }, + { + "epoch": 0.5739717618170657, + "grad_norm": 0.8330404758453369, + "learning_rate": 9.98039882687928e-05, + "loss": 2.2274, + "step": 1870 + }, + { + "epoch": 0.5742786985880909, + "grad_norm": 0.7520943284034729, + "learning_rate": 9.98035483289555e-05, + "loss": 2.2773, + "step": 1871 + }, + { + "epoch": 0.574585635359116, + "grad_norm": 0.8312448263168335, + "learning_rate": 9.980310789693037e-05, + "loss": 2.302, + "step": 1872 + }, + { + "epoch": 0.5748925721301412, + "grad_norm": 0.7383994460105896, + "learning_rate": 9.980266697272173e-05, + "loss": 2.2168, + "step": 1873 + }, + { + "epoch": 0.5751995089011663, + "grad_norm": 0.9612922072410583, + "learning_rate": 9.980222555633394e-05, + "loss": 2.3558, + "step": 1874 + }, + { + "epoch": 0.5755064456721916, + "grad_norm": 0.9921227097511292, + "learning_rate": 9.980178364777136e-05, + "loss": 2.2913, + "step": 1875 + }, + { + "epoch": 0.5758133824432167, + "grad_norm": 0.9152889847755432, + "learning_rate": 9.980134124703837e-05, + "loss": 2.2615, + "step": 1876 + }, + { + "epoch": 0.5761203192142419, + "grad_norm": 0.8090541362762451, + "learning_rate": 9.980089835413936e-05, + "loss": 2.2661, + "step": 1877 + }, + { + "epoch": 0.576427255985267, + "grad_norm": 0.8074322938919067, + "learning_rate": 9.980045496907865e-05, + "loss": 2.3209, + "step": 1878 + 
}, + { + "epoch": 0.5767341927562922, + "grad_norm": 0.784649670124054, + "learning_rate": 9.980001109186065e-05, + "loss": 2.241, + "step": 1879 + }, + { + "epoch": 0.5770411295273173, + "grad_norm": 0.768108069896698, + "learning_rate": 9.979956672248978e-05, + "loss": 2.3333, + "step": 1880 + }, + { + "epoch": 0.5773480662983426, + "grad_norm": 0.798058271408081, + "learning_rate": 9.97991218609704e-05, + "loss": 2.3564, + "step": 1881 + }, + { + "epoch": 0.5776550030693677, + "grad_norm": 0.7606865763664246, + "learning_rate": 9.97986765073069e-05, + "loss": 2.2277, + "step": 1882 + }, + { + "epoch": 0.5779619398403929, + "grad_norm": 0.8320558667182922, + "learning_rate": 9.979823066150369e-05, + "loss": 2.3715, + "step": 1883 + }, + { + "epoch": 0.578268876611418, + "grad_norm": 0.7935798168182373, + "learning_rate": 9.979778432356517e-05, + "loss": 2.2605, + "step": 1884 + }, + { + "epoch": 0.5785758133824432, + "grad_norm": 0.6914796829223633, + "learning_rate": 9.979733749349578e-05, + "loss": 2.2699, + "step": 1885 + }, + { + "epoch": 0.5788827501534684, + "grad_norm": 0.6546899676322937, + "learning_rate": 9.979689017129989e-05, + "loss": 2.1908, + "step": 1886 + }, + { + "epoch": 0.5791896869244936, + "grad_norm": 0.7231267094612122, + "learning_rate": 9.979644235698195e-05, + "loss": 2.2084, + "step": 1887 + }, + { + "epoch": 0.5794966236955187, + "grad_norm": 0.668933093547821, + "learning_rate": 9.979599405054639e-05, + "loss": 2.2722, + "step": 1888 + }, + { + "epoch": 0.5798035604665439, + "grad_norm": 0.678191602230072, + "learning_rate": 9.979554525199763e-05, + "loss": 2.2312, + "step": 1889 + }, + { + "epoch": 0.580110497237569, + "grad_norm": 0.6407462954521179, + "learning_rate": 9.97950959613401e-05, + "loss": 2.2381, + "step": 1890 + }, + { + "epoch": 0.5804174340085942, + "grad_norm": 0.6920403242111206, + "learning_rate": 9.979464617857826e-05, + "loss": 2.2678, + "step": 1891 + }, + { + "epoch": 0.5807243707796194, + "grad_norm": 
0.6907110810279846, + "learning_rate": 9.979419590371651e-05, + "loss": 2.2579, + "step": 1892 + }, + { + "epoch": 0.5810313075506446, + "grad_norm": 0.7683933973312378, + "learning_rate": 9.979374513675935e-05, + "loss": 2.2184, + "step": 1893 + }, + { + "epoch": 0.5813382443216697, + "grad_norm": 0.797286868095398, + "learning_rate": 9.979329387771121e-05, + "loss": 2.2518, + "step": 1894 + }, + { + "epoch": 0.5816451810926949, + "grad_norm": 0.8192877769470215, + "learning_rate": 9.979284212657657e-05, + "loss": 2.2271, + "step": 1895 + }, + { + "epoch": 0.58195211786372, + "grad_norm": 0.7510090470314026, + "learning_rate": 9.979238988335986e-05, + "loss": 2.2864, + "step": 1896 + }, + { + "epoch": 0.5822590546347453, + "grad_norm": 0.7541393041610718, + "learning_rate": 9.979193714806558e-05, + "loss": 2.239, + "step": 1897 + }, + { + "epoch": 0.5825659914057704, + "grad_norm": 0.7353073358535767, + "learning_rate": 9.97914839206982e-05, + "loss": 2.2145, + "step": 1898 + }, + { + "epoch": 0.5828729281767956, + "grad_norm": 0.6813456416130066, + "learning_rate": 9.979103020126218e-05, + "loss": 2.194, + "step": 1899 + }, + { + "epoch": 0.5831798649478207, + "grad_norm": 0.6922066807746887, + "learning_rate": 9.979057598976202e-05, + "loss": 2.2335, + "step": 1900 + }, + { + "epoch": 0.5834868017188459, + "grad_norm": 0.5800344944000244, + "learning_rate": 9.97901212862022e-05, + "loss": 2.2159, + "step": 1901 + }, + { + "epoch": 0.583793738489871, + "grad_norm": 0.5770835280418396, + "learning_rate": 9.978966609058722e-05, + "loss": 2.2217, + "step": 1902 + }, + { + "epoch": 0.5841006752608963, + "grad_norm": 0.6217128038406372, + "learning_rate": 9.978921040292158e-05, + "loss": 2.2703, + "step": 1903 + }, + { + "epoch": 0.5844076120319214, + "grad_norm": 0.6684436798095703, + "learning_rate": 9.97887542232098e-05, + "loss": 2.2747, + "step": 1904 + }, + { + "epoch": 0.5847145488029466, + "grad_norm": 0.6261670589447021, + "learning_rate": 
9.978829755145633e-05, + "loss": 2.2867, + "step": 1905 + }, + { + "epoch": 0.5850214855739717, + "grad_norm": 0.646051824092865, + "learning_rate": 9.978784038766575e-05, + "loss": 2.2493, + "step": 1906 + }, + { + "epoch": 0.5853284223449969, + "grad_norm": 0.6757060885429382, + "learning_rate": 9.978738273184254e-05, + "loss": 2.218, + "step": 1907 + }, + { + "epoch": 0.585635359116022, + "grad_norm": 0.7867937684059143, + "learning_rate": 9.978692458399122e-05, + "loss": 2.3405, + "step": 1908 + }, + { + "epoch": 0.5859422958870473, + "grad_norm": 0.8349789381027222, + "learning_rate": 9.978646594411636e-05, + "loss": 2.3292, + "step": 1909 + }, + { + "epoch": 0.5862492326580724, + "grad_norm": 0.8739562034606934, + "learning_rate": 9.978600681222243e-05, + "loss": 2.2132, + "step": 1910 + }, + { + "epoch": 0.5865561694290976, + "grad_norm": 0.8187520503997803, + "learning_rate": 9.978554718831402e-05, + "loss": 2.3078, + "step": 1911 + }, + { + "epoch": 0.5868631062001227, + "grad_norm": 0.8463271856307983, + "learning_rate": 9.978508707239565e-05, + "loss": 2.1924, + "step": 1912 + }, + { + "epoch": 0.5871700429711479, + "grad_norm": 0.8674206733703613, + "learning_rate": 9.978462646447187e-05, + "loss": 2.2185, + "step": 1913 + }, + { + "epoch": 0.5874769797421732, + "grad_norm": 0.7828893065452576, + "learning_rate": 9.978416536454722e-05, + "loss": 2.3137, + "step": 1914 + }, + { + "epoch": 0.5877839165131983, + "grad_norm": 0.7868914604187012, + "learning_rate": 9.978370377262629e-05, + "loss": 2.2202, + "step": 1915 + }, + { + "epoch": 0.5880908532842235, + "grad_norm": 0.811596155166626, + "learning_rate": 9.97832416887136e-05, + "loss": 2.3463, + "step": 1916 + }, + { + "epoch": 0.5883977900552486, + "grad_norm": 0.9281075596809387, + "learning_rate": 9.978277911281375e-05, + "loss": 2.2394, + "step": 1917 + }, + { + "epoch": 0.5887047268262738, + "grad_norm": 0.8862313628196716, + "learning_rate": 9.978231604493129e-05, + "loss": 2.2456, + "step": 
1918 + }, + { + "epoch": 0.589011663597299, + "grad_norm": 0.8411116600036621, + "learning_rate": 9.978185248507081e-05, + "loss": 2.2409, + "step": 1919 + }, + { + "epoch": 0.5893186003683242, + "grad_norm": 0.8205060958862305, + "learning_rate": 9.978138843323688e-05, + "loss": 2.2468, + "step": 1920 + }, + { + "epoch": 0.5896255371393493, + "grad_norm": 0.8103171586990356, + "learning_rate": 9.97809238894341e-05, + "loss": 2.2979, + "step": 1921 + }, + { + "epoch": 0.5899324739103745, + "grad_norm": 0.7937025427818298, + "learning_rate": 9.978045885366704e-05, + "loss": 2.3582, + "step": 1922 + }, + { + "epoch": 0.5902394106813996, + "grad_norm": 0.7983896136283875, + "learning_rate": 9.977999332594032e-05, + "loss": 2.2725, + "step": 1923 + }, + { + "epoch": 0.5905463474524248, + "grad_norm": 0.8274399042129517, + "learning_rate": 9.977952730625852e-05, + "loss": 2.3091, + "step": 1924 + }, + { + "epoch": 0.59085328422345, + "grad_norm": 0.9385362863540649, + "learning_rate": 9.977906079462627e-05, + "loss": 2.4322, + "step": 1925 + }, + { + "epoch": 0.5911602209944752, + "grad_norm": 0.8405537009239197, + "learning_rate": 9.977859379104814e-05, + "loss": 2.1606, + "step": 1926 + }, + { + "epoch": 0.5914671577655003, + "grad_norm": 0.8082418441772461, + "learning_rate": 9.97781262955288e-05, + "loss": 2.2929, + "step": 1927 + }, + { + "epoch": 0.5917740945365255, + "grad_norm": 0.7444280385971069, + "learning_rate": 9.977765830807283e-05, + "loss": 2.3217, + "step": 1928 + }, + { + "epoch": 0.5920810313075506, + "grad_norm": 0.7369982600212097, + "learning_rate": 9.977718982868485e-05, + "loss": 2.2658, + "step": 1929 + }, + { + "epoch": 0.5923879680785759, + "grad_norm": 0.6842257380485535, + "learning_rate": 9.977672085736951e-05, + "loss": 2.2243, + "step": 1930 + }, + { + "epoch": 0.592694904849601, + "grad_norm": 0.6954882740974426, + "learning_rate": 9.977625139413145e-05, + "loss": 2.2802, + "step": 1931 + }, + { + "epoch": 0.5930018416206262, + 
"grad_norm": 0.749829888343811, + "learning_rate": 9.97757814389753e-05, + "loss": 2.3166, + "step": 1932 + }, + { + "epoch": 0.5933087783916513, + "grad_norm": 0.7725609540939331, + "learning_rate": 9.977531099190569e-05, + "loss": 2.2367, + "step": 1933 + }, + { + "epoch": 0.5936157151626765, + "grad_norm": 0.7467440366744995, + "learning_rate": 9.977484005292728e-05, + "loss": 2.2704, + "step": 1934 + }, + { + "epoch": 0.5939226519337016, + "grad_norm": 0.7104424834251404, + "learning_rate": 9.977436862204475e-05, + "loss": 2.1983, + "step": 1935 + }, + { + "epoch": 0.5942295887047269, + "grad_norm": 0.7562711834907532, + "learning_rate": 9.977389669926272e-05, + "loss": 2.2857, + "step": 1936 + }, + { + "epoch": 0.594536525475752, + "grad_norm": 0.7803298830986023, + "learning_rate": 9.977342428458585e-05, + "loss": 2.3526, + "step": 1937 + }, + { + "epoch": 0.5948434622467772, + "grad_norm": 0.7487826943397522, + "learning_rate": 9.977295137801885e-05, + "loss": 2.2338, + "step": 1938 + }, + { + "epoch": 0.5951503990178023, + "grad_norm": 0.6969291567802429, + "learning_rate": 9.977247797956639e-05, + "loss": 2.2185, + "step": 1939 + }, + { + "epoch": 0.5954573357888275, + "grad_norm": 0.6293052434921265, + "learning_rate": 9.977200408923311e-05, + "loss": 2.2767, + "step": 1940 + }, + { + "epoch": 0.5957642725598526, + "grad_norm": 0.7457680702209473, + "learning_rate": 9.97715297070237e-05, + "loss": 2.2688, + "step": 1941 + }, + { + "epoch": 0.5960712093308779, + "grad_norm": 0.7255130410194397, + "learning_rate": 9.977105483294288e-05, + "loss": 2.2157, + "step": 1942 + }, + { + "epoch": 0.596378146101903, + "grad_norm": 0.739815890789032, + "learning_rate": 9.977057946699532e-05, + "loss": 2.306, + "step": 1943 + }, + { + "epoch": 0.5966850828729282, + "grad_norm": 0.7493855357170105, + "learning_rate": 9.977010360918571e-05, + "loss": 2.1893, + "step": 1944 + }, + { + "epoch": 0.5969920196439533, + "grad_norm": 0.7976173758506775, + "learning_rate": 
9.976962725951878e-05, + "loss": 2.3288, + "step": 1945 + }, + { + "epoch": 0.5972989564149785, + "grad_norm": 0.9487287998199463, + "learning_rate": 9.976915041799921e-05, + "loss": 2.4484, + "step": 1946 + }, + { + "epoch": 0.5976058931860037, + "grad_norm": 0.9866845011711121, + "learning_rate": 9.976867308463174e-05, + "loss": 2.3223, + "step": 1947 + }, + { + "epoch": 0.5979128299570289, + "grad_norm": 0.9258660674095154, + "learning_rate": 9.976819525942107e-05, + "loss": 2.2358, + "step": 1948 + }, + { + "epoch": 0.598219766728054, + "grad_norm": 0.9822832345962524, + "learning_rate": 9.976771694237192e-05, + "loss": 2.2951, + "step": 1949 + }, + { + "epoch": 0.5985267034990792, + "grad_norm": 1.005528450012207, + "learning_rate": 9.976723813348902e-05, + "loss": 2.2604, + "step": 1950 + }, + { + "epoch": 0.5988336402701043, + "grad_norm": 0.8988018035888672, + "learning_rate": 9.976675883277711e-05, + "loss": 2.3419, + "step": 1951 + }, + { + "epoch": 0.5991405770411296, + "grad_norm": 0.7386319041252136, + "learning_rate": 9.976627904024091e-05, + "loss": 2.2357, + "step": 1952 + }, + { + "epoch": 0.5994475138121547, + "grad_norm": 0.7715404033660889, + "learning_rate": 9.976579875588518e-05, + "loss": 2.3482, + "step": 1953 + }, + { + "epoch": 0.5997544505831799, + "grad_norm": 0.7529712319374084, + "learning_rate": 9.976531797971464e-05, + "loss": 2.1735, + "step": 1954 + }, + { + "epoch": 0.600061387354205, + "grad_norm": 0.8589643836021423, + "learning_rate": 9.97648367117341e-05, + "loss": 2.305, + "step": 1955 + }, + { + "epoch": 0.6003683241252302, + "grad_norm": 0.9038915634155273, + "learning_rate": 9.976435495194823e-05, + "loss": 2.2123, + "step": 1956 + }, + { + "epoch": 0.6006752608962553, + "grad_norm": 0.9388678073883057, + "learning_rate": 9.976387270036186e-05, + "loss": 2.1792, + "step": 1957 + }, + { + "epoch": 0.6009821976672806, + "grad_norm": 0.7970952391624451, + "learning_rate": 9.976338995697974e-05, + "loss": 2.2425, + "step": 
1958 + }, + { + "epoch": 0.6012891344383057, + "grad_norm": 0.7219900488853455, + "learning_rate": 9.976290672180662e-05, + "loss": 2.1984, + "step": 1959 + }, + { + "epoch": 0.6015960712093309, + "grad_norm": 0.639715313911438, + "learning_rate": 9.976242299484728e-05, + "loss": 2.2796, + "step": 1960 + }, + { + "epoch": 0.601903007980356, + "grad_norm": 0.6734911799430847, + "learning_rate": 9.976193877610652e-05, + "loss": 2.3066, + "step": 1961 + }, + { + "epoch": 0.6022099447513812, + "grad_norm": 0.8328932523727417, + "learning_rate": 9.976145406558912e-05, + "loss": 2.3958, + "step": 1962 + }, + { + "epoch": 0.6025168815224063, + "grad_norm": 0.9552088379859924, + "learning_rate": 9.976096886329986e-05, + "loss": 2.3246, + "step": 1963 + }, + { + "epoch": 0.6028238182934316, + "grad_norm": 0.8407328128814697, + "learning_rate": 9.976048316924354e-05, + "loss": 2.2922, + "step": 1964 + }, + { + "epoch": 0.6031307550644567, + "grad_norm": 0.6899709105491638, + "learning_rate": 9.975999698342495e-05, + "loss": 2.1808, + "step": 1965 + }, + { + "epoch": 0.6034376918354819, + "grad_norm": 0.8114390969276428, + "learning_rate": 9.975951030584892e-05, + "loss": 2.3516, + "step": 1966 + }, + { + "epoch": 0.603744628606507, + "grad_norm": 0.8071461319923401, + "learning_rate": 9.975902313652024e-05, + "loss": 2.2044, + "step": 1967 + }, + { + "epoch": 0.6040515653775322, + "grad_norm": 0.8767913579940796, + "learning_rate": 9.975853547544372e-05, + "loss": 2.24, + "step": 1968 + }, + { + "epoch": 0.6043585021485574, + "grad_norm": 0.817095935344696, + "learning_rate": 9.975804732262419e-05, + "loss": 2.169, + "step": 1969 + }, + { + "epoch": 0.6046654389195826, + "grad_norm": 0.6818623542785645, + "learning_rate": 9.975755867806648e-05, + "loss": 2.2869, + "step": 1970 + }, + { + "epoch": 0.6049723756906077, + "grad_norm": 0.7248693704605103, + "learning_rate": 9.97570695417754e-05, + "loss": 2.2159, + "step": 1971 + }, + { + "epoch": 0.6052793124616329, + 
"grad_norm": 0.6425455212593079, + "learning_rate": 9.975657991375581e-05, + "loss": 2.2173, + "step": 1972 + }, + { + "epoch": 0.605586249232658, + "grad_norm": 0.6856566071510315, + "learning_rate": 9.975608979401252e-05, + "loss": 2.2994, + "step": 1973 + }, + { + "epoch": 0.6058931860036832, + "grad_norm": 0.6731004118919373, + "learning_rate": 9.97555991825504e-05, + "loss": 2.2286, + "step": 1974 + }, + { + "epoch": 0.6062001227747084, + "grad_norm": 0.7461759448051453, + "learning_rate": 9.975510807937428e-05, + "loss": 2.2057, + "step": 1975 + }, + { + "epoch": 0.6065070595457336, + "grad_norm": 0.7256236672401428, + "learning_rate": 9.975461648448902e-05, + "loss": 2.2686, + "step": 1976 + }, + { + "epoch": 0.6068139963167587, + "grad_norm": 0.7254514098167419, + "learning_rate": 9.975412439789949e-05, + "loss": 2.2748, + "step": 1977 + }, + { + "epoch": 0.6071209330877839, + "grad_norm": 0.7280047535896301, + "learning_rate": 9.975363181961052e-05, + "loss": 2.27, + "step": 1978 + }, + { + "epoch": 0.607427869858809, + "grad_norm": 0.6801813244819641, + "learning_rate": 9.9753138749627e-05, + "loss": 2.2356, + "step": 1979 + }, + { + "epoch": 0.6077348066298343, + "grad_norm": 0.841946005821228, + "learning_rate": 9.975264518795382e-05, + "loss": 2.3887, + "step": 1980 + }, + { + "epoch": 0.6080417434008594, + "grad_norm": 0.9610007405281067, + "learning_rate": 9.975215113459582e-05, + "loss": 2.2857, + "step": 1981 + }, + { + "epoch": 0.6083486801718846, + "grad_norm": 0.8726536631584167, + "learning_rate": 9.975165658955791e-05, + "loss": 2.3137, + "step": 1982 + }, + { + "epoch": 0.6086556169429097, + "grad_norm": 0.9275946021080017, + "learning_rate": 9.975116155284498e-05, + "loss": 2.291, + "step": 1983 + }, + { + "epoch": 0.6089625537139349, + "grad_norm": 0.9045402407646179, + "learning_rate": 9.97506660244619e-05, + "loss": 2.2183, + "step": 1984 + }, + { + "epoch": 0.6092694904849602, + "grad_norm": 0.7913599610328674, + "learning_rate": 
9.975017000441358e-05, + "loss": 2.349, + "step": 1985 + }, + { + "epoch": 0.6095764272559853, + "grad_norm": 0.714824378490448, + "learning_rate": 9.974967349270492e-05, + "loss": 2.2163, + "step": 1986 + }, + { + "epoch": 0.6098833640270105, + "grad_norm": 0.7178559899330139, + "learning_rate": 9.974917648934084e-05, + "loss": 2.2338, + "step": 1987 + }, + { + "epoch": 0.6101903007980356, + "grad_norm": 0.8417280912399292, + "learning_rate": 9.97486789943262e-05, + "loss": 2.1961, + "step": 1988 + }, + { + "epoch": 0.6104972375690608, + "grad_norm": 0.8488532304763794, + "learning_rate": 9.9748181007666e-05, + "loss": 2.2509, + "step": 1989 + }, + { + "epoch": 0.6108041743400859, + "grad_norm": 0.796309769153595, + "learning_rate": 9.974768252936509e-05, + "loss": 2.2948, + "step": 1990 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.7163965702056885, + "learning_rate": 9.974718355942843e-05, + "loss": 2.2136, + "step": 1991 + }, + { + "epoch": 0.6114180478821363, + "grad_norm": 0.6620060205459595, + "learning_rate": 9.974668409786095e-05, + "loss": 2.2442, + "step": 1992 + }, + { + "epoch": 0.6117249846531615, + "grad_norm": 0.6843542456626892, + "learning_rate": 9.974618414466759e-05, + "loss": 2.1972, + "step": 1993 + }, + { + "epoch": 0.6120319214241866, + "grad_norm": 0.699847936630249, + "learning_rate": 9.974568369985327e-05, + "loss": 2.2194, + "step": 1994 + }, + { + "epoch": 0.6123388581952118, + "grad_norm": 0.693384051322937, + "learning_rate": 9.974518276342293e-05, + "loss": 2.2446, + "step": 1995 + }, + { + "epoch": 0.612645794966237, + "grad_norm": 0.6022316813468933, + "learning_rate": 9.974468133538155e-05, + "loss": 2.2037, + "step": 1996 + }, + { + "epoch": 0.6129527317372622, + "grad_norm": 0.6317062377929688, + "learning_rate": 9.974417941573409e-05, + "loss": 2.1855, + "step": 1997 + }, + { + "epoch": 0.6132596685082873, + "grad_norm": 0.7291355133056641, + "learning_rate": 9.974367700448547e-05, + "loss": 2.2179, + "step": 1998 + 
}, + { + "epoch": 0.6135666052793125, + "grad_norm": 0.6776867508888245, + "learning_rate": 9.97431741016407e-05, + "loss": 2.2437, + "step": 1999 + }, + { + "epoch": 0.6138735420503376, + "grad_norm": 0.6598517298698425, + "learning_rate": 9.97426707072047e-05, + "loss": 2.2775, + "step": 2000 + }, + { + "epoch": 0.6141804788213628, + "grad_norm": 0.6681709289550781, + "learning_rate": 9.974216682118249e-05, + "loss": 2.2004, + "step": 2001 + }, + { + "epoch": 0.614487415592388, + "grad_norm": 0.6725168228149414, + "learning_rate": 9.974166244357903e-05, + "loss": 2.2922, + "step": 2002 + }, + { + "epoch": 0.6147943523634132, + "grad_norm": 0.6547908782958984, + "learning_rate": 9.974115757439931e-05, + "loss": 2.2195, + "step": 2003 + }, + { + "epoch": 0.6151012891344383, + "grad_norm": 0.7195348739624023, + "learning_rate": 9.974065221364831e-05, + "loss": 2.2862, + "step": 2004 + }, + { + "epoch": 0.6154082259054635, + "grad_norm": 0.7992655038833618, + "learning_rate": 9.974014636133103e-05, + "loss": 2.3109, + "step": 2005 + }, + { + "epoch": 0.6157151626764886, + "grad_norm": 0.7932934165000916, + "learning_rate": 9.973964001745249e-05, + "loss": 2.2869, + "step": 2006 + }, + { + "epoch": 0.6160220994475138, + "grad_norm": 0.7778924107551575, + "learning_rate": 9.973913318201763e-05, + "loss": 2.2046, + "step": 2007 + }, + { + "epoch": 0.616329036218539, + "grad_norm": 0.7951294183731079, + "learning_rate": 9.973862585503155e-05, + "loss": 2.221, + "step": 2008 + }, + { + "epoch": 0.6166359729895642, + "grad_norm": 0.729552686214447, + "learning_rate": 9.97381180364992e-05, + "loss": 2.2929, + "step": 2009 + }, + { + "epoch": 0.6169429097605893, + "grad_norm": 0.731516420841217, + "learning_rate": 9.973760972642561e-05, + "loss": 2.2673, + "step": 2010 + }, + { + "epoch": 0.6172498465316145, + "grad_norm": 0.6950094103813171, + "learning_rate": 9.973710092481581e-05, + "loss": 2.2029, + "step": 2011 + }, + { + "epoch": 0.6175567833026396, + "grad_norm": 
0.6260825395584106, + "learning_rate": 9.973659163167484e-05, + "loss": 2.3037, + "step": 2012 + }, + { + "epoch": 0.6178637200736649, + "grad_norm": 0.6949467658996582, + "learning_rate": 9.97360818470077e-05, + "loss": 2.2699, + "step": 2013 + }, + { + "epoch": 0.61817065684469, + "grad_norm": 0.7322572469711304, + "learning_rate": 9.973557157081945e-05, + "loss": 2.2921, + "step": 2014 + }, + { + "epoch": 0.6184775936157152, + "grad_norm": 0.8999563455581665, + "learning_rate": 9.973506080311514e-05, + "loss": 2.2499, + "step": 2015 + }, + { + "epoch": 0.6187845303867403, + "grad_norm": 0.9269914031028748, + "learning_rate": 9.973454954389981e-05, + "loss": 2.2676, + "step": 2016 + }, + { + "epoch": 0.6190914671577655, + "grad_norm": 0.8630712628364563, + "learning_rate": 9.973403779317852e-05, + "loss": 2.1379, + "step": 2017 + }, + { + "epoch": 0.6193984039287906, + "grad_norm": 0.8249645233154297, + "learning_rate": 9.97335255509563e-05, + "loss": 2.3109, + "step": 2018 + }, + { + "epoch": 0.6197053406998159, + "grad_norm": 0.7832711338996887, + "learning_rate": 9.973301281723824e-05, + "loss": 2.1316, + "step": 2019 + }, + { + "epoch": 0.620012277470841, + "grad_norm": 0.7502821683883667, + "learning_rate": 9.97324995920294e-05, + "loss": 2.2188, + "step": 2020 + }, + { + "epoch": 0.6203192142418662, + "grad_norm": 0.7804487347602844, + "learning_rate": 9.973198587533483e-05, + "loss": 2.2639, + "step": 2021 + }, + { + "epoch": 0.6206261510128913, + "grad_norm": 0.9198356866836548, + "learning_rate": 9.973147166715963e-05, + "loss": 2.2574, + "step": 2022 + }, + { + "epoch": 0.6209330877839165, + "grad_norm": 0.8792869448661804, + "learning_rate": 9.97309569675089e-05, + "loss": 2.2228, + "step": 2023 + }, + { + "epoch": 0.6212400245549416, + "grad_norm": 0.779772937297821, + "learning_rate": 9.97304417763877e-05, + "loss": 2.2179, + "step": 2024 + }, + { + "epoch": 0.6215469613259669, + "grad_norm": 0.7702100276947021, + "learning_rate": 
9.972992609380111e-05, + "loss": 2.3872, + "step": 2025 + }, + { + "epoch": 0.621853898096992, + "grad_norm": 0.8576669096946716, + "learning_rate": 9.972940991975426e-05, + "loss": 2.2279, + "step": 2026 + }, + { + "epoch": 0.6221608348680172, + "grad_norm": 0.8312802314758301, + "learning_rate": 9.972889325425223e-05, + "loss": 2.3507, + "step": 2027 + }, + { + "epoch": 0.6224677716390423, + "grad_norm": 0.7873719930648804, + "learning_rate": 9.972837609730013e-05, + "loss": 2.2252, + "step": 2028 + }, + { + "epoch": 0.6227747084100675, + "grad_norm": 0.7763897180557251, + "learning_rate": 9.972785844890307e-05, + "loss": 2.2559, + "step": 2029 + }, + { + "epoch": 0.6230816451810927, + "grad_norm": 0.7053700685501099, + "learning_rate": 9.972734030906617e-05, + "loss": 2.2248, + "step": 2030 + }, + { + "epoch": 0.6233885819521179, + "grad_norm": 0.8800643682479858, + "learning_rate": 9.972682167779453e-05, + "loss": 2.3111, + "step": 2031 + }, + { + "epoch": 0.623695518723143, + "grad_norm": 0.7237632274627686, + "learning_rate": 9.97263025550933e-05, + "loss": 2.2255, + "step": 2032 + }, + { + "epoch": 0.6240024554941682, + "grad_norm": 0.7139064073562622, + "learning_rate": 9.97257829409676e-05, + "loss": 2.2065, + "step": 2033 + }, + { + "epoch": 0.6243093922651933, + "grad_norm": 0.6514315009117126, + "learning_rate": 9.972526283542259e-05, + "loss": 2.2176, + "step": 2034 + }, + { + "epoch": 0.6246163290362186, + "grad_norm": 0.726828932762146, + "learning_rate": 9.972474223846337e-05, + "loss": 2.2236, + "step": 2035 + }, + { + "epoch": 0.6249232658072437, + "grad_norm": 0.7121313810348511, + "learning_rate": 9.97242211500951e-05, + "loss": 2.2696, + "step": 2036 + }, + { + "epoch": 0.6252302025782689, + "grad_norm": 0.7203021049499512, + "learning_rate": 9.972369957032293e-05, + "loss": 2.2418, + "step": 2037 + }, + { + "epoch": 0.625537139349294, + "grad_norm": 0.6843051910400391, + "learning_rate": 9.972317749915203e-05, + "loss": 2.2408, + "step": 2038 
+ }, + { + "epoch": 0.6258440761203192, + "grad_norm": 0.6523141264915466, + "learning_rate": 9.972265493658754e-05, + "loss": 2.1693, + "step": 2039 + }, + { + "epoch": 0.6261510128913443, + "grad_norm": 0.6263946294784546, + "learning_rate": 9.972213188263463e-05, + "loss": 2.2477, + "step": 2040 + }, + { + "epoch": 0.6264579496623696, + "grad_norm": 0.6428464651107788, + "learning_rate": 9.972160833729847e-05, + "loss": 2.2131, + "step": 2041 + }, + { + "epoch": 0.6267648864333947, + "grad_norm": 0.6333484649658203, + "learning_rate": 9.972108430058423e-05, + "loss": 2.2806, + "step": 2042 + }, + { + "epoch": 0.6270718232044199, + "grad_norm": 0.7168832421302795, + "learning_rate": 9.97205597724971e-05, + "loss": 2.2468, + "step": 2043 + }, + { + "epoch": 0.627378759975445, + "grad_norm": 0.7522227168083191, + "learning_rate": 9.972003475304226e-05, + "loss": 2.249, + "step": 2044 + }, + { + "epoch": 0.6276856967464702, + "grad_norm": 0.6810066103935242, + "learning_rate": 9.971950924222488e-05, + "loss": 2.1988, + "step": 2045 + }, + { + "epoch": 0.6279926335174953, + "grad_norm": 0.6983187198638916, + "learning_rate": 9.971898324005018e-05, + "loss": 2.2444, + "step": 2046 + }, + { + "epoch": 0.6282995702885206, + "grad_norm": 0.7261439561843872, + "learning_rate": 9.971845674652333e-05, + "loss": 2.1789, + "step": 2047 + }, + { + "epoch": 0.6286065070595457, + "grad_norm": 0.6844322681427002, + "learning_rate": 9.971792976164957e-05, + "loss": 2.2666, + "step": 2048 + }, + { + "epoch": 0.6289134438305709, + "grad_norm": 0.7166746258735657, + "learning_rate": 9.971740228543407e-05, + "loss": 2.3002, + "step": 2049 + }, + { + "epoch": 0.629220380601596, + "grad_norm": 0.7386785745620728, + "learning_rate": 9.971687431788207e-05, + "loss": 2.1798, + "step": 2050 + }, + { + "epoch": 0.6295273173726212, + "grad_norm": 0.6873611211776733, + "learning_rate": 9.971634585899878e-05, + "loss": 2.184, + "step": 2051 + }, + { + "epoch": 0.6298342541436464, + "grad_norm": 
0.8005948066711426, + "learning_rate": 9.971581690878941e-05, + "loss": 2.2778, + "step": 2052 + }, + { + "epoch": 0.6301411909146716, + "grad_norm": 0.8972415924072266, + "learning_rate": 9.971528746725922e-05, + "loss": 2.2822, + "step": 2053 + }, + { + "epoch": 0.6304481276856968, + "grad_norm": 0.7935822010040283, + "learning_rate": 9.97147575344134e-05, + "loss": 2.1732, + "step": 2054 + }, + { + "epoch": 0.6307550644567219, + "grad_norm": 0.7891644239425659, + "learning_rate": 9.971422711025721e-05, + "loss": 2.2765, + "step": 2055 + }, + { + "epoch": 0.6310620012277471, + "grad_norm": 0.7857005000114441, + "learning_rate": 9.971369619479589e-05, + "loss": 2.2386, + "step": 2056 + }, + { + "epoch": 0.6313689379987723, + "grad_norm": 0.6909852623939514, + "learning_rate": 9.97131647880347e-05, + "loss": 2.1251, + "step": 2057 + }, + { + "epoch": 0.6316758747697975, + "grad_norm": 0.6352387070655823, + "learning_rate": 9.971263288997885e-05, + "loss": 2.1883, + "step": 2058 + }, + { + "epoch": 0.6319828115408226, + "grad_norm": 0.5811386704444885, + "learning_rate": 9.971210050063364e-05, + "loss": 2.281, + "step": 2059 + }, + { + "epoch": 0.6322897483118478, + "grad_norm": 0.6227630376815796, + "learning_rate": 9.971156762000432e-05, + "loss": 2.1346, + "step": 2060 + }, + { + "epoch": 0.6325966850828729, + "grad_norm": 0.6628422737121582, + "learning_rate": 9.971103424809616e-05, + "loss": 2.2617, + "step": 2061 + }, + { + "epoch": 0.6329036218538981, + "grad_norm": 0.7212308645248413, + "learning_rate": 9.97105003849144e-05, + "loss": 2.1764, + "step": 2062 + }, + { + "epoch": 0.6332105586249233, + "grad_norm": 0.8368894457817078, + "learning_rate": 9.970996603046435e-05, + "loss": 2.2897, + "step": 2063 + }, + { + "epoch": 0.6335174953959485, + "grad_norm": 0.8797467350959778, + "learning_rate": 9.970943118475129e-05, + "loss": 2.1987, + "step": 2064 + }, + { + "epoch": 0.6338244321669736, + "grad_norm": 0.9241101145744324, + "learning_rate": 
9.970889584778047e-05, + "loss": 2.2759, + "step": 2065 + }, + { + "epoch": 0.6341313689379988, + "grad_norm": 0.8636183142662048, + "learning_rate": 9.970836001955723e-05, + "loss": 2.2188, + "step": 2066 + }, + { + "epoch": 0.6344383057090239, + "grad_norm": 0.8965754508972168, + "learning_rate": 9.970782370008682e-05, + "loss": 2.2845, + "step": 2067 + }, + { + "epoch": 0.6347452424800492, + "grad_norm": 0.9064372777938843, + "learning_rate": 9.970728688937459e-05, + "loss": 2.1787, + "step": 2068 + }, + { + "epoch": 0.6350521792510743, + "grad_norm": 0.7387171387672424, + "learning_rate": 9.970674958742579e-05, + "loss": 2.1805, + "step": 2069 + }, + { + "epoch": 0.6353591160220995, + "grad_norm": 0.6220484972000122, + "learning_rate": 9.970621179424578e-05, + "loss": 2.2762, + "step": 2070 + }, + { + "epoch": 0.6356660527931246, + "grad_norm": 0.6268464922904968, + "learning_rate": 9.970567350983984e-05, + "loss": 2.2491, + "step": 2071 + }, + { + "epoch": 0.6359729895641498, + "grad_norm": 0.6385738253593445, + "learning_rate": 9.97051347342133e-05, + "loss": 2.2126, + "step": 2072 + }, + { + "epoch": 0.6362799263351749, + "grad_norm": 0.7084285020828247, + "learning_rate": 9.970459546737148e-05, + "loss": 2.2364, + "step": 2073 + }, + { + "epoch": 0.6365868631062002, + "grad_norm": 0.6957145929336548, + "learning_rate": 9.97040557093197e-05, + "loss": 2.266, + "step": 2074 + }, + { + "epoch": 0.6368937998772253, + "grad_norm": 0.6037309169769287, + "learning_rate": 9.970351546006334e-05, + "loss": 2.1514, + "step": 2075 + }, + { + "epoch": 0.6372007366482505, + "grad_norm": 0.6342970132827759, + "learning_rate": 9.97029747196077e-05, + "loss": 2.1602, + "step": 2076 + }, + { + "epoch": 0.6375076734192756, + "grad_norm": 0.5793863534927368, + "learning_rate": 9.970243348795812e-05, + "loss": 2.1853, + "step": 2077 + }, + { + "epoch": 0.6378146101903008, + "grad_norm": 0.5420103073120117, + "learning_rate": 9.970189176511997e-05, + "loss": 2.1885, + "step": 
2078 + }, + { + "epoch": 0.638121546961326, + "grad_norm": 0.6713188886642456, + "learning_rate": 9.97013495510986e-05, + "loss": 2.2641, + "step": 2079 + }, + { + "epoch": 0.6384284837323512, + "grad_norm": 0.7410796880722046, + "learning_rate": 9.970080684589935e-05, + "loss": 2.2248, + "step": 2080 + }, + { + "epoch": 0.6387354205033763, + "grad_norm": 0.7138017416000366, + "learning_rate": 9.970026364952761e-05, + "loss": 2.1975, + "step": 2081 + }, + { + "epoch": 0.6390423572744015, + "grad_norm": 0.7553584575653076, + "learning_rate": 9.969971996198873e-05, + "loss": 2.2482, + "step": 2082 + }, + { + "epoch": 0.6393492940454266, + "grad_norm": 0.7082852125167847, + "learning_rate": 9.969917578328808e-05, + "loss": 2.1681, + "step": 2083 + }, + { + "epoch": 0.6396562308164518, + "grad_norm": 0.6190223097801208, + "learning_rate": 9.969863111343105e-05, + "loss": 2.1995, + "step": 2084 + }, + { + "epoch": 0.639963167587477, + "grad_norm": 0.6640429496765137, + "learning_rate": 9.969808595242302e-05, + "loss": 2.2969, + "step": 2085 + }, + { + "epoch": 0.6402701043585022, + "grad_norm": 0.761377215385437, + "learning_rate": 9.969754030026936e-05, + "loss": 2.2412, + "step": 2086 + }, + { + "epoch": 0.6405770411295273, + "grad_norm": 0.7226401567459106, + "learning_rate": 9.969699415697551e-05, + "loss": 2.1852, + "step": 2087 + }, + { + "epoch": 0.6408839779005525, + "grad_norm": 0.6474639177322388, + "learning_rate": 9.969644752254681e-05, + "loss": 2.1867, + "step": 2088 + }, + { + "epoch": 0.6411909146715776, + "grad_norm": 0.6725835800170898, + "learning_rate": 9.96959003969887e-05, + "loss": 2.1962, + "step": 2089 + }, + { + "epoch": 0.6414978514426029, + "grad_norm": 0.6669641733169556, + "learning_rate": 9.969535278030657e-05, + "loss": 2.2045, + "step": 2090 + }, + { + "epoch": 0.641804788213628, + "grad_norm": 0.7604048252105713, + "learning_rate": 9.969480467250583e-05, + "loss": 2.2543, + "step": 2091 + }, + { + "epoch": 0.6421117249846532, + 
"grad_norm": 0.9369953870773315, + "learning_rate": 9.969425607359191e-05, + "loss": 2.2461, + "step": 2092 + }, + { + "epoch": 0.6424186617556783, + "grad_norm": 1.116156816482544, + "learning_rate": 9.969370698357022e-05, + "loss": 2.2447, + "step": 2093 + }, + { + "epoch": 0.6427255985267035, + "grad_norm": 0.9179674983024597, + "learning_rate": 9.96931574024462e-05, + "loss": 2.2164, + "step": 2094 + }, + { + "epoch": 0.6430325352977286, + "grad_norm": 0.7629393339157104, + "learning_rate": 9.969260733022526e-05, + "loss": 2.22, + "step": 2095 + }, + { + "epoch": 0.6433394720687539, + "grad_norm": 0.7152948379516602, + "learning_rate": 9.969205676691286e-05, + "loss": 2.1967, + "step": 2096 + }, + { + "epoch": 0.643646408839779, + "grad_norm": 0.7527763247489929, + "learning_rate": 9.969150571251442e-05, + "loss": 2.2263, + "step": 2097 + }, + { + "epoch": 0.6439533456108042, + "grad_norm": 0.9889422655105591, + "learning_rate": 9.96909541670354e-05, + "loss": 2.2127, + "step": 2098 + }, + { + "epoch": 0.6442602823818293, + "grad_norm": 1.0340619087219238, + "learning_rate": 9.969040213048125e-05, + "loss": 2.2392, + "step": 2099 + }, + { + "epoch": 0.6445672191528545, + "grad_norm": 0.735322892665863, + "learning_rate": 9.968984960285743e-05, + "loss": 2.1351, + "step": 2100 + }, + { + "epoch": 0.6448741559238796, + "grad_norm": 0.6575397849082947, + "learning_rate": 9.968929658416936e-05, + "loss": 2.2481, + "step": 2101 + }, + { + "epoch": 0.6451810926949049, + "grad_norm": 0.6891960501670837, + "learning_rate": 9.968874307442258e-05, + "loss": 2.2164, + "step": 2102 + }, + { + "epoch": 0.64548802946593, + "grad_norm": 0.792298436164856, + "learning_rate": 9.968818907362248e-05, + "loss": 2.1681, + "step": 2103 + }, + { + "epoch": 0.6457949662369552, + "grad_norm": 0.8438142538070679, + "learning_rate": 9.968763458177459e-05, + "loss": 2.2123, + "step": 2104 + }, + { + "epoch": 0.6461019030079803, + "grad_norm": 0.7494921088218689, + "learning_rate": 
9.968707959888436e-05, + "loss": 2.1863, + "step": 2105 + }, + { + "epoch": 0.6464088397790055, + "grad_norm": 0.7049927115440369, + "learning_rate": 9.968652412495731e-05, + "loss": 2.2364, + "step": 2106 + }, + { + "epoch": 0.6467157765500307, + "grad_norm": 0.7586455345153809, + "learning_rate": 9.968596815999889e-05, + "loss": 2.1976, + "step": 2107 + }, + { + "epoch": 0.6470227133210559, + "grad_norm": 0.7762691974639893, + "learning_rate": 9.968541170401462e-05, + "loss": 2.2323, + "step": 2108 + }, + { + "epoch": 0.647329650092081, + "grad_norm": 0.8127642869949341, + "learning_rate": 9.968485475700998e-05, + "loss": 2.1577, + "step": 2109 + }, + { + "epoch": 0.6476365868631062, + "grad_norm": 0.6762635111808777, + "learning_rate": 9.968429731899049e-05, + "loss": 2.1972, + "step": 2110 + }, + { + "epoch": 0.6479435236341313, + "grad_norm": 0.675707995891571, + "learning_rate": 9.968373938996165e-05, + "loss": 2.1932, + "step": 2111 + }, + { + "epoch": 0.6482504604051565, + "grad_norm": 0.6996815204620361, + "learning_rate": 9.968318096992898e-05, + "loss": 2.2695, + "step": 2112 + }, + { + "epoch": 0.6485573971761817, + "grad_norm": 0.8519851565361023, + "learning_rate": 9.968262205889799e-05, + "loss": 2.2662, + "step": 2113 + }, + { + "epoch": 0.6488643339472069, + "grad_norm": 0.7621145844459534, + "learning_rate": 9.968206265687421e-05, + "loss": 2.2888, + "step": 2114 + }, + { + "epoch": 0.649171270718232, + "grad_norm": 0.786609411239624, + "learning_rate": 9.968150276386317e-05, + "loss": 2.3354, + "step": 2115 + }, + { + "epoch": 0.6494782074892572, + "grad_norm": 0.7693428993225098, + "learning_rate": 9.96809423798704e-05, + "loss": 2.1981, + "step": 2116 + }, + { + "epoch": 0.6497851442602823, + "grad_norm": 0.72762131690979, + "learning_rate": 9.968038150490145e-05, + "loss": 2.2387, + "step": 2117 + }, + { + "epoch": 0.6500920810313076, + "grad_norm": 0.737617015838623, + "learning_rate": 9.967982013896184e-05, + "loss": 2.258, + "step": 2118 + 
}, + { + "epoch": 0.6503990178023327, + "grad_norm": 0.7320968508720398, + "learning_rate": 9.967925828205712e-05, + "loss": 2.3248, + "step": 2119 + }, + { + "epoch": 0.6507059545733579, + "grad_norm": 0.7904484868049622, + "learning_rate": 9.967869593419286e-05, + "loss": 2.2121, + "step": 2120 + }, + { + "epoch": 0.651012891344383, + "grad_norm": 0.7519722580909729, + "learning_rate": 9.967813309537461e-05, + "loss": 2.1999, + "step": 2121 + }, + { + "epoch": 0.6513198281154082, + "grad_norm": 0.7201504707336426, + "learning_rate": 9.967756976560793e-05, + "loss": 2.2022, + "step": 2122 + }, + { + "epoch": 0.6516267648864333, + "grad_norm": 0.6134514808654785, + "learning_rate": 9.96770059448984e-05, + "loss": 2.2105, + "step": 2123 + }, + { + "epoch": 0.6519337016574586, + "grad_norm": 0.6086028218269348, + "learning_rate": 9.967644163325156e-05, + "loss": 2.212, + "step": 2124 + }, + { + "epoch": 0.6522406384284838, + "grad_norm": 0.6550475358963013, + "learning_rate": 9.967587683067302e-05, + "loss": 2.181, + "step": 2125 + }, + { + "epoch": 0.6525475751995089, + "grad_norm": 0.7557916045188904, + "learning_rate": 9.967531153716835e-05, + "loss": 2.3194, + "step": 2126 + }, + { + "epoch": 0.6528545119705341, + "grad_norm": 0.8859965801239014, + "learning_rate": 9.967474575274314e-05, + "loss": 2.2104, + "step": 2127 + }, + { + "epoch": 0.6531614487415592, + "grad_norm": 0.8049005270004272, + "learning_rate": 9.967417947740296e-05, + "loss": 2.2949, + "step": 2128 + }, + { + "epoch": 0.6534683855125845, + "grad_norm": 0.708297073841095, + "learning_rate": 9.967361271115343e-05, + "loss": 2.1703, + "step": 2129 + }, + { + "epoch": 0.6537753222836096, + "grad_norm": 0.6764169335365295, + "learning_rate": 9.967304545400016e-05, + "loss": 2.2177, + "step": 2130 + }, + { + "epoch": 0.6540822590546348, + "grad_norm": 0.6987971067428589, + "learning_rate": 9.967247770594872e-05, + "loss": 2.1699, + "step": 2131 + }, + { + "epoch": 0.6543891958256599, + "grad_norm": 
0.7212976217269897, + "learning_rate": 9.967190946700476e-05, + "loss": 2.1217, + "step": 2132 + }, + { + "epoch": 0.6546961325966851, + "grad_norm": 0.6805562973022461, + "learning_rate": 9.967134073717386e-05, + "loss": 2.2295, + "step": 2133 + }, + { + "epoch": 0.6550030693677102, + "grad_norm": 0.665428102016449, + "learning_rate": 9.967077151646167e-05, + "loss": 2.1742, + "step": 2134 + }, + { + "epoch": 0.6553100061387355, + "grad_norm": 0.6691353917121887, + "learning_rate": 9.967020180487378e-05, + "loss": 2.2313, + "step": 2135 + }, + { + "epoch": 0.6556169429097606, + "grad_norm": 0.7095547914505005, + "learning_rate": 9.966963160241587e-05, + "loss": 2.1367, + "step": 2136 + }, + { + "epoch": 0.6559238796807858, + "grad_norm": 0.7050215601921082, + "learning_rate": 9.966906090909353e-05, + "loss": 2.3234, + "step": 2137 + }, + { + "epoch": 0.6562308164518109, + "grad_norm": 0.7592353820800781, + "learning_rate": 9.966848972491245e-05, + "loss": 2.1722, + "step": 2138 + }, + { + "epoch": 0.6565377532228361, + "grad_norm": 0.6520100831985474, + "learning_rate": 9.96679180498782e-05, + "loss": 2.2401, + "step": 2139 + }, + { + "epoch": 0.6568446899938613, + "grad_norm": 0.6650902628898621, + "learning_rate": 9.966734588399651e-05, + "loss": 2.2094, + "step": 2140 + }, + { + "epoch": 0.6571516267648865, + "grad_norm": 0.7236151099205017, + "learning_rate": 9.966677322727299e-05, + "loss": 2.3021, + "step": 2141 + }, + { + "epoch": 0.6574585635359116, + "grad_norm": 0.7160753011703491, + "learning_rate": 9.966620007971327e-05, + "loss": 2.1992, + "step": 2142 + }, + { + "epoch": 0.6577655003069368, + "grad_norm": 0.6761705279350281, + "learning_rate": 9.966562644132309e-05, + "loss": 2.1853, + "step": 2143 + }, + { + "epoch": 0.6580724370779619, + "grad_norm": 0.7017555236816406, + "learning_rate": 9.966505231210806e-05, + "loss": 2.208, + "step": 2144 + }, + { + "epoch": 0.6583793738489871, + "grad_norm": 0.7652586102485657, + "learning_rate": 
9.966447769207387e-05, + "loss": 2.3065, + "step": 2145 + }, + { + "epoch": 0.6586863106200123, + "grad_norm": 0.7148436307907104, + "learning_rate": 9.966390258122621e-05, + "loss": 2.1388, + "step": 2146 + }, + { + "epoch": 0.6589932473910375, + "grad_norm": 0.5885360240936279, + "learning_rate": 9.966332697957076e-05, + "loss": 2.1463, + "step": 2147 + }, + { + "epoch": 0.6593001841620626, + "grad_norm": 0.6800816655158997, + "learning_rate": 9.966275088711321e-05, + "loss": 2.3397, + "step": 2148 + }, + { + "epoch": 0.6596071209330878, + "grad_norm": 0.6856956481933594, + "learning_rate": 9.966217430385925e-05, + "loss": 2.0893, + "step": 2149 + }, + { + "epoch": 0.6599140577041129, + "grad_norm": 0.6302888989448547, + "learning_rate": 9.966159722981456e-05, + "loss": 2.1108, + "step": 2150 + }, + { + "epoch": 0.6602209944751382, + "grad_norm": 0.6145252585411072, + "learning_rate": 9.966101966498486e-05, + "loss": 2.2668, + "step": 2151 + }, + { + "epoch": 0.6605279312461633, + "grad_norm": 0.7258949279785156, + "learning_rate": 9.966044160937586e-05, + "loss": 2.2163, + "step": 2152 + }, + { + "epoch": 0.6608348680171885, + "grad_norm": 0.6809847950935364, + "learning_rate": 9.965986306299327e-05, + "loss": 2.1828, + "step": 2153 + }, + { + "epoch": 0.6611418047882136, + "grad_norm": 0.6673223376274109, + "learning_rate": 9.96592840258428e-05, + "loss": 2.232, + "step": 2154 + }, + { + "epoch": 0.6614487415592388, + "grad_norm": 0.6483572721481323, + "learning_rate": 9.96587044979302e-05, + "loss": 2.199, + "step": 2155 + }, + { + "epoch": 0.6617556783302639, + "grad_norm": 0.6227185726165771, + "learning_rate": 9.965812447926115e-05, + "loss": 2.166, + "step": 2156 + }, + { + "epoch": 0.6620626151012892, + "grad_norm": 0.5982463955879211, + "learning_rate": 9.965754396984142e-05, + "loss": 2.2074, + "step": 2157 + }, + { + "epoch": 0.6623695518723143, + "grad_norm": 0.6357809901237488, + "learning_rate": 9.965696296967673e-05, + "loss": 2.2086, + "step": 
2158 + }, + { + "epoch": 0.6626764886433395, + "grad_norm": 0.5908147692680359, + "learning_rate": 9.965638147877283e-05, + "loss": 2.1103, + "step": 2159 + }, + { + "epoch": 0.6629834254143646, + "grad_norm": 0.591332733631134, + "learning_rate": 9.965579949713545e-05, + "loss": 2.1698, + "step": 2160 + }, + { + "epoch": 0.6632903621853898, + "grad_norm": 0.5748336911201477, + "learning_rate": 9.965521702477038e-05, + "loss": 2.1812, + "step": 2161 + }, + { + "epoch": 0.663597298956415, + "grad_norm": 0.6643908023834229, + "learning_rate": 9.965463406168334e-05, + "loss": 2.2129, + "step": 2162 + }, + { + "epoch": 0.6639042357274402, + "grad_norm": 0.637627124786377, + "learning_rate": 9.965405060788011e-05, + "loss": 2.226, + "step": 2163 + }, + { + "epoch": 0.6642111724984653, + "grad_norm": 0.6170387268066406, + "learning_rate": 9.965346666336644e-05, + "loss": 2.2025, + "step": 2164 + }, + { + "epoch": 0.6645181092694905, + "grad_norm": 0.6038833260536194, + "learning_rate": 9.965288222814812e-05, + "loss": 2.1761, + "step": 2165 + }, + { + "epoch": 0.6648250460405156, + "grad_norm": 0.5705585479736328, + "learning_rate": 9.965229730223092e-05, + "loss": 2.1511, + "step": 2166 + }, + { + "epoch": 0.6651319828115408, + "grad_norm": 0.5994759798049927, + "learning_rate": 9.965171188562059e-05, + "loss": 2.1763, + "step": 2167 + }, + { + "epoch": 0.665438919582566, + "grad_norm": 0.5887313485145569, + "learning_rate": 9.965112597832296e-05, + "loss": 2.2185, + "step": 2168 + }, + { + "epoch": 0.6657458563535912, + "grad_norm": 0.5688689947128296, + "learning_rate": 9.96505395803438e-05, + "loss": 2.2387, + "step": 2169 + }, + { + "epoch": 0.6660527931246163, + "grad_norm": 0.6121554970741272, + "learning_rate": 9.96499526916889e-05, + "loss": 2.1938, + "step": 2170 + }, + { + "epoch": 0.6663597298956415, + "grad_norm": 0.6048038005828857, + "learning_rate": 9.964936531236407e-05, + "loss": 2.197, + "step": 2171 + }, + { + "epoch": 0.6666666666666666, + 
"grad_norm": 0.6442995071411133, + "learning_rate": 9.96487774423751e-05, + "loss": 2.1725, + "step": 2172 + }, + { + "epoch": 0.6669736034376919, + "grad_norm": 0.7136862874031067, + "learning_rate": 9.964818908172783e-05, + "loss": 2.2166, + "step": 2173 + }, + { + "epoch": 0.667280540208717, + "grad_norm": 0.6902804970741272, + "learning_rate": 9.964760023042805e-05, + "loss": 2.2318, + "step": 2174 + }, + { + "epoch": 0.6675874769797422, + "grad_norm": 0.6946488618850708, + "learning_rate": 9.964701088848158e-05, + "loss": 2.177, + "step": 2175 + }, + { + "epoch": 0.6678944137507673, + "grad_norm": 0.6283712983131409, + "learning_rate": 9.964642105589425e-05, + "loss": 2.2227, + "step": 2176 + }, + { + "epoch": 0.6682013505217925, + "grad_norm": 0.5768510103225708, + "learning_rate": 9.96458307326719e-05, + "loss": 2.1559, + "step": 2177 + }, + { + "epoch": 0.6685082872928176, + "grad_norm": 0.6045784950256348, + "learning_rate": 9.964523991882035e-05, + "loss": 2.2018, + "step": 2178 + }, + { + "epoch": 0.6688152240638429, + "grad_norm": 0.5962889790534973, + "learning_rate": 9.964464861434544e-05, + "loss": 2.1898, + "step": 2179 + }, + { + "epoch": 0.669122160834868, + "grad_norm": 0.6611660718917847, + "learning_rate": 9.964405681925301e-05, + "loss": 2.1989, + "step": 2180 + }, + { + "epoch": 0.6694290976058932, + "grad_norm": 0.6764575242996216, + "learning_rate": 9.964346453354891e-05, + "loss": 2.2764, + "step": 2181 + }, + { + "epoch": 0.6697360343769183, + "grad_norm": 0.6795048117637634, + "learning_rate": 9.964287175723899e-05, + "loss": 2.1313, + "step": 2182 + }, + { + "epoch": 0.6700429711479435, + "grad_norm": 0.6697003841400146, + "learning_rate": 9.964227849032914e-05, + "loss": 2.1999, + "step": 2183 + }, + { + "epoch": 0.6703499079189686, + "grad_norm": 0.669682502746582, + "learning_rate": 9.964168473282519e-05, + "loss": 2.202, + "step": 2184 + }, + { + "epoch": 0.6706568446899939, + "grad_norm": 0.6823530793190002, + "learning_rate": 
9.9641090484733e-05, + "loss": 2.2326, + "step": 2185 + }, + { + "epoch": 0.670963781461019, + "grad_norm": 0.7460775971412659, + "learning_rate": 9.964049574605848e-05, + "loss": 2.1594, + "step": 2186 + }, + { + "epoch": 0.6712707182320442, + "grad_norm": 0.8075460195541382, + "learning_rate": 9.963990051680744e-05, + "loss": 2.1506, + "step": 2187 + }, + { + "epoch": 0.6715776550030693, + "grad_norm": 0.8041695356369019, + "learning_rate": 9.963930479698585e-05, + "loss": 2.123, + "step": 2188 + }, + { + "epoch": 0.6718845917740945, + "grad_norm": 0.9129732251167297, + "learning_rate": 9.963870858659955e-05, + "loss": 2.116, + "step": 2189 + }, + { + "epoch": 0.6721915285451197, + "grad_norm": 0.9989685416221619, + "learning_rate": 9.963811188565444e-05, + "loss": 2.3194, + "step": 2190 + }, + { + "epoch": 0.6724984653161449, + "grad_norm": 1.0353670120239258, + "learning_rate": 9.96375146941564e-05, + "loss": 2.113, + "step": 2191 + }, + { + "epoch": 0.67280540208717, + "grad_norm": 0.897750735282898, + "learning_rate": 9.963691701211135e-05, + "loss": 2.1038, + "step": 2192 + }, + { + "epoch": 0.6731123388581952, + "grad_norm": 0.7353916168212891, + "learning_rate": 9.96363188395252e-05, + "loss": 2.2185, + "step": 2193 + }, + { + "epoch": 0.6734192756292203, + "grad_norm": 0.6474063992500305, + "learning_rate": 9.963572017640385e-05, + "loss": 2.2229, + "step": 2194 + }, + { + "epoch": 0.6737262124002455, + "grad_norm": 0.7194583415985107, + "learning_rate": 9.963512102275322e-05, + "loss": 2.2172, + "step": 2195 + }, + { + "epoch": 0.6740331491712708, + "grad_norm": 0.6638131737709045, + "learning_rate": 9.963452137857926e-05, + "loss": 2.2212, + "step": 2196 + }, + { + "epoch": 0.6743400859422959, + "grad_norm": 0.7219048738479614, + "learning_rate": 9.963392124388782e-05, + "loss": 2.3302, + "step": 2197 + }, + { + "epoch": 0.6746470227133211, + "grad_norm": 0.7941164374351501, + "learning_rate": 9.963332061868491e-05, + "loss": 2.2982, + "step": 2198 + }, 
+ { + "epoch": 0.6749539594843462, + "grad_norm": 0.7356888055801392, + "learning_rate": 9.963271950297643e-05, + "loss": 2.1761, + "step": 2199 + }, + { + "epoch": 0.6752608962553714, + "grad_norm": 0.6705774664878845, + "learning_rate": 9.963211789676831e-05, + "loss": 2.2483, + "step": 2200 + }, + { + "epoch": 0.6755678330263966, + "grad_norm": 0.7958056926727295, + "learning_rate": 9.963151580006653e-05, + "loss": 2.2209, + "step": 2201 + }, + { + "epoch": 0.6758747697974218, + "grad_norm": 0.7215412259101868, + "learning_rate": 9.9630913212877e-05, + "loss": 2.1676, + "step": 2202 + }, + { + "epoch": 0.6761817065684469, + "grad_norm": 0.705649197101593, + "learning_rate": 9.963031013520572e-05, + "loss": 2.1855, + "step": 2203 + }, + { + "epoch": 0.6764886433394721, + "grad_norm": 0.7050254344940186, + "learning_rate": 9.962970656705861e-05, + "loss": 2.171, + "step": 2204 + }, + { + "epoch": 0.6767955801104972, + "grad_norm": 0.7163556218147278, + "learning_rate": 9.962910250844167e-05, + "loss": 2.1295, + "step": 2205 + }, + { + "epoch": 0.6771025168815225, + "grad_norm": 0.7195280194282532, + "learning_rate": 9.962849795936083e-05, + "loss": 2.1436, + "step": 2206 + }, + { + "epoch": 0.6774094536525476, + "grad_norm": 0.7356030344963074, + "learning_rate": 9.962789291982208e-05, + "loss": 2.2739, + "step": 2207 + }, + { + "epoch": 0.6777163904235728, + "grad_norm": 0.783649742603302, + "learning_rate": 9.962728738983143e-05, + "loss": 2.2461, + "step": 2208 + }, + { + "epoch": 0.6780233271945979, + "grad_norm": 0.6966754794120789, + "learning_rate": 9.962668136939481e-05, + "loss": 2.1977, + "step": 2209 + }, + { + "epoch": 0.6783302639656231, + "grad_norm": 0.6986487507820129, + "learning_rate": 9.962607485851825e-05, + "loss": 2.1806, + "step": 2210 + }, + { + "epoch": 0.6786372007366482, + "grad_norm": 0.6502536535263062, + "learning_rate": 9.962546785720774e-05, + "loss": 2.174, + "step": 2211 + }, + { + "epoch": 0.6789441375076735, + "grad_norm": 
0.6797144412994385, + "learning_rate": 9.962486036546926e-05, + "loss": 2.2635, + "step": 2212 + }, + { + "epoch": 0.6792510742786986, + "grad_norm": 0.7190150022506714, + "learning_rate": 9.962425238330884e-05, + "loss": 2.2231, + "step": 2213 + }, + { + "epoch": 0.6795580110497238, + "grad_norm": 0.6770560145378113, + "learning_rate": 9.962364391073245e-05, + "loss": 2.1639, + "step": 2214 + }, + { + "epoch": 0.6798649478207489, + "grad_norm": 0.624911904335022, + "learning_rate": 9.962303494774614e-05, + "loss": 2.1754, + "step": 2215 + }, + { + "epoch": 0.6801718845917741, + "grad_norm": 0.7127423286437988, + "learning_rate": 9.96224254943559e-05, + "loss": 2.2047, + "step": 2216 + }, + { + "epoch": 0.6804788213627992, + "grad_norm": 0.6729345321655273, + "learning_rate": 9.962181555056778e-05, + "loss": 2.2245, + "step": 2217 + }, + { + "epoch": 0.6807857581338245, + "grad_norm": 0.7142044901847839, + "learning_rate": 9.96212051163878e-05, + "loss": 2.1827, + "step": 2218 + }, + { + "epoch": 0.6810926949048496, + "grad_norm": 0.686295211315155, + "learning_rate": 9.962059419182196e-05, + "loss": 2.1784, + "step": 2219 + }, + { + "epoch": 0.6813996316758748, + "grad_norm": 0.7207211256027222, + "learning_rate": 9.961998277687634e-05, + "loss": 2.2603, + "step": 2220 + }, + { + "epoch": 0.6817065684468999, + "grad_norm": 0.814552903175354, + "learning_rate": 9.961937087155697e-05, + "loss": 2.2328, + "step": 2221 + }, + { + "epoch": 0.6820135052179251, + "grad_norm": 0.851860761642456, + "learning_rate": 9.96187584758699e-05, + "loss": 2.2334, + "step": 2222 + }, + { + "epoch": 0.6823204419889503, + "grad_norm": 0.9232058525085449, + "learning_rate": 9.961814558982117e-05, + "loss": 2.2259, + "step": 2223 + }, + { + "epoch": 0.6826273787599755, + "grad_norm": 0.8393358588218689, + "learning_rate": 9.961753221341684e-05, + "loss": 2.1347, + "step": 2224 + }, + { + "epoch": 0.6829343155310006, + "grad_norm": 0.7124439477920532, + "learning_rate": 
9.961691834666297e-05, + "loss": 2.195, + "step": 2225 + }, + { + "epoch": 0.6832412523020258, + "grad_norm": 0.644290566444397, + "learning_rate": 9.961630398956565e-05, + "loss": 2.1967, + "step": 2226 + }, + { + "epoch": 0.6835481890730509, + "grad_norm": 0.6896283030509949, + "learning_rate": 9.961568914213092e-05, + "loss": 2.1781, + "step": 2227 + }, + { + "epoch": 0.6838551258440762, + "grad_norm": 0.711643636226654, + "learning_rate": 9.961507380436487e-05, + "loss": 2.1091, + "step": 2228 + }, + { + "epoch": 0.6841620626151013, + "grad_norm": 0.7056689858436584, + "learning_rate": 9.961445797627358e-05, + "loss": 2.1848, + "step": 2229 + }, + { + "epoch": 0.6844689993861265, + "grad_norm": 0.60573410987854, + "learning_rate": 9.961384165786314e-05, + "loss": 2.1156, + "step": 2230 + }, + { + "epoch": 0.6847759361571516, + "grad_norm": 0.5612443089485168, + "learning_rate": 9.961322484913963e-05, + "loss": 2.2311, + "step": 2231 + }, + { + "epoch": 0.6850828729281768, + "grad_norm": 0.6356449723243713, + "learning_rate": 9.961260755010916e-05, + "loss": 2.1945, + "step": 2232 + }, + { + "epoch": 0.6853898096992019, + "grad_norm": 0.7393341660499573, + "learning_rate": 9.961198976077782e-05, + "loss": 2.2743, + "step": 2233 + }, + { + "epoch": 0.6856967464702272, + "grad_norm": 0.7658794522285461, + "learning_rate": 9.961137148115171e-05, + "loss": 2.1729, + "step": 2234 + }, + { + "epoch": 0.6860036832412523, + "grad_norm": 0.790540337562561, + "learning_rate": 9.961075271123697e-05, + "loss": 2.1372, + "step": 2235 + }, + { + "epoch": 0.6863106200122775, + "grad_norm": 0.71295565366745, + "learning_rate": 9.961013345103968e-05, + "loss": 2.1325, + "step": 2236 + }, + { + "epoch": 0.6866175567833026, + "grad_norm": 0.6648302674293518, + "learning_rate": 9.960951370056597e-05, + "loss": 2.1626, + "step": 2237 + }, + { + "epoch": 0.6869244935543278, + "grad_norm": 0.6276865601539612, + "learning_rate": 9.960889345982198e-05, + "loss": 2.1848, + "step": 2238 + 
}, + { + "epoch": 0.6872314303253529, + "grad_norm": 0.6786942481994629, + "learning_rate": 9.960827272881383e-05, + "loss": 2.2402, + "step": 2239 + }, + { + "epoch": 0.6875383670963782, + "grad_norm": 0.7752293348312378, + "learning_rate": 9.960765150754764e-05, + "loss": 2.2187, + "step": 2240 + }, + { + "epoch": 0.6878453038674033, + "grad_norm": 0.7958577871322632, + "learning_rate": 9.960702979602956e-05, + "loss": 2.1995, + "step": 2241 + }, + { + "epoch": 0.6881522406384285, + "grad_norm": 0.7327582240104675, + "learning_rate": 9.960640759426575e-05, + "loss": 2.1709, + "step": 2242 + }, + { + "epoch": 0.6884591774094536, + "grad_norm": 0.7002710103988647, + "learning_rate": 9.960578490226233e-05, + "loss": 2.1966, + "step": 2243 + }, + { + "epoch": 0.6887661141804788, + "grad_norm": 0.6163785457611084, + "learning_rate": 9.960516172002548e-05, + "loss": 2.2012, + "step": 2244 + }, + { + "epoch": 0.689073050951504, + "grad_norm": 0.6808127760887146, + "learning_rate": 9.960453804756134e-05, + "loss": 2.1704, + "step": 2245 + }, + { + "epoch": 0.6893799877225292, + "grad_norm": 0.6571208834648132, + "learning_rate": 9.960391388487609e-05, + "loss": 2.17, + "step": 2246 + }, + { + "epoch": 0.6896869244935543, + "grad_norm": 0.7180834412574768, + "learning_rate": 9.960328923197588e-05, + "loss": 2.229, + "step": 2247 + }, + { + "epoch": 0.6899938612645795, + "grad_norm": 0.7283746600151062, + "learning_rate": 9.96026640888669e-05, + "loss": 2.195, + "step": 2248 + }, + { + "epoch": 0.6903007980356046, + "grad_norm": 0.6808122992515564, + "learning_rate": 9.960203845555531e-05, + "loss": 2.1327, + "step": 2249 + }, + { + "epoch": 0.6906077348066298, + "grad_norm": 0.7105094790458679, + "learning_rate": 9.960141233204731e-05, + "loss": 2.2747, + "step": 2250 + }, + { + "epoch": 0.690914671577655, + "grad_norm": 0.7650291919708252, + "learning_rate": 9.960078571834909e-05, + "loss": 2.2751, + "step": 2251 + }, + { + "epoch": 0.6912216083486802, + "grad_norm": 
0.8347647786140442, + "learning_rate": 9.960015861446684e-05, + "loss": 2.2101, + "step": 2252 + }, + { + "epoch": 0.6915285451197053, + "grad_norm": 0.7774063348770142, + "learning_rate": 9.959953102040672e-05, + "loss": 2.1275, + "step": 2253 + }, + { + "epoch": 0.6918354818907305, + "grad_norm": 0.7466274499893188, + "learning_rate": 9.959890293617497e-05, + "loss": 2.1352, + "step": 2254 + }, + { + "epoch": 0.6921424186617556, + "grad_norm": 0.7451669573783875, + "learning_rate": 9.959827436177781e-05, + "loss": 2.1229, + "step": 2255 + }, + { + "epoch": 0.6924493554327809, + "grad_norm": 0.651746392250061, + "learning_rate": 9.959764529722142e-05, + "loss": 2.1416, + "step": 2256 + }, + { + "epoch": 0.692756292203806, + "grad_norm": 0.6267968416213989, + "learning_rate": 9.959701574251203e-05, + "loss": 2.1346, + "step": 2257 + }, + { + "epoch": 0.6930632289748312, + "grad_norm": 0.6087000966072083, + "learning_rate": 9.959638569765586e-05, + "loss": 2.2136, + "step": 2258 + }, + { + "epoch": 0.6933701657458563, + "grad_norm": 0.6032208204269409, + "learning_rate": 9.959575516265914e-05, + "loss": 2.1211, + "step": 2259 + }, + { + "epoch": 0.6936771025168815, + "grad_norm": 0.83074551820755, + "learning_rate": 9.95951241375281e-05, + "loss": 2.2951, + "step": 2260 + }, + { + "epoch": 0.6939840392879066, + "grad_norm": 0.8564106225967407, + "learning_rate": 9.959449262226897e-05, + "loss": 2.1496, + "step": 2261 + }, + { + "epoch": 0.6942909760589319, + "grad_norm": 0.8558153510093689, + "learning_rate": 9.9593860616888e-05, + "loss": 2.2325, + "step": 2262 + }, + { + "epoch": 0.694597912829957, + "grad_norm": 0.7391008734703064, + "learning_rate": 9.959322812139143e-05, + "loss": 2.1133, + "step": 2263 + }, + { + "epoch": 0.6949048496009822, + "grad_norm": 0.6090536713600159, + "learning_rate": 9.959259513578552e-05, + "loss": 2.1453, + "step": 2264 + }, + { + "epoch": 0.6952117863720073, + "grad_norm": 0.5893986821174622, + "learning_rate": 
9.95919616600765e-05, + "loss": 2.2035, + "step": 2265 + }, + { + "epoch": 0.6955187231430325, + "grad_norm": 0.6274020671844482, + "learning_rate": 9.959132769427065e-05, + "loss": 2.2118, + "step": 2266 + }, + { + "epoch": 0.6958256599140578, + "grad_norm": 0.6287395358085632, + "learning_rate": 9.959069323837424e-05, + "loss": 2.2167, + "step": 2267 + }, + { + "epoch": 0.6961325966850829, + "grad_norm": 0.6281611323356628, + "learning_rate": 9.959005829239354e-05, + "loss": 2.1945, + "step": 2268 + }, + { + "epoch": 0.6964395334561081, + "grad_norm": 0.6422389149665833, + "learning_rate": 9.958942285633481e-05, + "loss": 2.1826, + "step": 2269 + }, + { + "epoch": 0.6967464702271332, + "grad_norm": 0.6461887955665588, + "learning_rate": 9.958878693020434e-05, + "loss": 2.2454, + "step": 2270 + }, + { + "epoch": 0.6970534069981584, + "grad_norm": 0.562102735042572, + "learning_rate": 9.958815051400841e-05, + "loss": 2.1375, + "step": 2271 + }, + { + "epoch": 0.6973603437691835, + "grad_norm": 0.5737003087997437, + "learning_rate": 9.958751360775331e-05, + "loss": 2.2344, + "step": 2272 + }, + { + "epoch": 0.6976672805402088, + "grad_norm": 0.5516494512557983, + "learning_rate": 9.958687621144535e-05, + "loss": 2.249, + "step": 2273 + }, + { + "epoch": 0.6979742173112339, + "grad_norm": 0.7148357629776001, + "learning_rate": 9.958623832509081e-05, + "loss": 2.2383, + "step": 2274 + }, + { + "epoch": 0.6982811540822591, + "grad_norm": 0.7151525020599365, + "learning_rate": 9.958559994869599e-05, + "loss": 2.1697, + "step": 2275 + }, + { + "epoch": 0.6985880908532842, + "grad_norm": 0.6927846670150757, + "learning_rate": 9.958496108226722e-05, + "loss": 2.1534, + "step": 2276 + }, + { + "epoch": 0.6988950276243094, + "grad_norm": 0.811660647392273, + "learning_rate": 9.958432172581079e-05, + "loss": 2.2197, + "step": 2277 + }, + { + "epoch": 0.6992019643953346, + "grad_norm": 0.9680081009864807, + "learning_rate": 9.958368187933305e-05, + "loss": 2.2241, + "step": 
2278 + }, + { + "epoch": 0.6995089011663598, + "grad_norm": 0.9996320605278015, + "learning_rate": 9.958304154284028e-05, + "loss": 2.1598, + "step": 2279 + }, + { + "epoch": 0.6998158379373849, + "grad_norm": 1.008695363998413, + "learning_rate": 9.958240071633884e-05, + "loss": 2.2082, + "step": 2280 + }, + { + "epoch": 0.7001227747084101, + "grad_norm": 0.9931860566139221, + "learning_rate": 9.958175939983506e-05, + "loss": 2.1478, + "step": 2281 + }, + { + "epoch": 0.7004297114794352, + "grad_norm": 0.8637800812721252, + "learning_rate": 9.958111759333528e-05, + "loss": 2.149, + "step": 2282 + }, + { + "epoch": 0.7007366482504604, + "grad_norm": 0.7089012861251831, + "learning_rate": 9.958047529684582e-05, + "loss": 2.1845, + "step": 2283 + }, + { + "epoch": 0.7010435850214856, + "grad_norm": 0.6083673238754272, + "learning_rate": 9.957983251037303e-05, + "loss": 2.1542, + "step": 2284 + }, + { + "epoch": 0.7013505217925108, + "grad_norm": 0.7092905044555664, + "learning_rate": 9.957918923392331e-05, + "loss": 2.2305, + "step": 2285 + }, + { + "epoch": 0.7016574585635359, + "grad_norm": 0.8416675925254822, + "learning_rate": 9.957854546750297e-05, + "loss": 2.2975, + "step": 2286 + }, + { + "epoch": 0.7019643953345611, + "grad_norm": 0.7778663039207458, + "learning_rate": 9.957790121111838e-05, + "loss": 2.2363, + "step": 2287 + }, + { + "epoch": 0.7022713321055862, + "grad_norm": 0.7886617183685303, + "learning_rate": 9.957725646477592e-05, + "loss": 2.1547, + "step": 2288 + }, + { + "epoch": 0.7025782688766115, + "grad_norm": 0.6596038937568665, + "learning_rate": 9.957661122848194e-05, + "loss": 2.1537, + "step": 2289 + }, + { + "epoch": 0.7028852056476366, + "grad_norm": 0.6441544890403748, + "learning_rate": 9.957596550224285e-05, + "loss": 2.1678, + "step": 2290 + }, + { + "epoch": 0.7031921424186618, + "grad_norm": 0.7106116414070129, + "learning_rate": 9.957531928606499e-05, + "loss": 2.2039, + "step": 2291 + }, + { + "epoch": 0.7034990791896869, + 
"grad_norm": 0.6948207020759583, + "learning_rate": 9.957467257995476e-05, + "loss": 2.176, + "step": 2292 + }, + { + "epoch": 0.7038060159607121, + "grad_norm": 0.6834874153137207, + "learning_rate": 9.957402538391859e-05, + "loss": 2.2182, + "step": 2293 + }, + { + "epoch": 0.7041129527317372, + "grad_norm": 0.6246630549430847, + "learning_rate": 9.957337769796282e-05, + "loss": 2.1181, + "step": 2294 + }, + { + "epoch": 0.7044198895027625, + "grad_norm": 0.6421988606452942, + "learning_rate": 9.957272952209389e-05, + "loss": 2.1352, + "step": 2295 + }, + { + "epoch": 0.7047268262737876, + "grad_norm": 0.5955870151519775, + "learning_rate": 9.95720808563182e-05, + "loss": 2.1852, + "step": 2296 + }, + { + "epoch": 0.7050337630448128, + "grad_norm": 0.6961265206336975, + "learning_rate": 9.957143170064214e-05, + "loss": 2.242, + "step": 2297 + }, + { + "epoch": 0.7053406998158379, + "grad_norm": 0.6966063380241394, + "learning_rate": 9.957078205507213e-05, + "loss": 2.1505, + "step": 2298 + }, + { + "epoch": 0.7056476365868631, + "grad_norm": 0.6155996322631836, + "learning_rate": 9.957013191961459e-05, + "loss": 2.1928, + "step": 2299 + }, + { + "epoch": 0.7059545733578882, + "grad_norm": 0.6092718839645386, + "learning_rate": 9.956948129427597e-05, + "loss": 2.138, + "step": 2300 + }, + { + "epoch": 0.7062615101289135, + "grad_norm": 0.645746111869812, + "learning_rate": 9.95688301790627e-05, + "loss": 2.2334, + "step": 2301 + }, + { + "epoch": 0.7065684468999386, + "grad_norm": 0.5959149599075317, + "learning_rate": 9.956817857398116e-05, + "loss": 2.1985, + "step": 2302 + }, + { + "epoch": 0.7068753836709638, + "grad_norm": 0.7127073407173157, + "learning_rate": 9.956752647903785e-05, + "loss": 2.2157, + "step": 2303 + }, + { + "epoch": 0.7071823204419889, + "grad_norm": 0.5589274764060974, + "learning_rate": 9.956687389423917e-05, + "loss": 2.1251, + "step": 2304 + }, + { + "epoch": 0.7074892572130141, + "grad_norm": 0.5502300262451172, + "learning_rate": 
9.95662208195916e-05, + "loss": 2.1344, + "step": 2305 + }, + { + "epoch": 0.7077961939840393, + "grad_norm": 0.6577275991439819, + "learning_rate": 9.95655672551016e-05, + "loss": 2.1646, + "step": 2306 + }, + { + "epoch": 0.7081031307550645, + "grad_norm": 0.6241618394851685, + "learning_rate": 9.956491320077559e-05, + "loss": 2.1153, + "step": 2307 + }, + { + "epoch": 0.7084100675260896, + "grad_norm": 0.5846728086471558, + "learning_rate": 9.956425865662007e-05, + "loss": 2.1477, + "step": 2308 + }, + { + "epoch": 0.7087170042971148, + "grad_norm": 0.6005275249481201, + "learning_rate": 9.95636036226415e-05, + "loss": 2.2034, + "step": 2309 + }, + { + "epoch": 0.7090239410681399, + "grad_norm": 0.6545519828796387, + "learning_rate": 9.956294809884635e-05, + "loss": 2.23, + "step": 2310 + }, + { + "epoch": 0.7093308778391652, + "grad_norm": 0.7513750791549683, + "learning_rate": 9.956229208524108e-05, + "loss": 2.2497, + "step": 2311 + }, + { + "epoch": 0.7096378146101903, + "grad_norm": 0.7308349609375, + "learning_rate": 9.956163558183219e-05, + "loss": 2.166, + "step": 2312 + }, + { + "epoch": 0.7099447513812155, + "grad_norm": 0.6278798580169678, + "learning_rate": 9.956097858862619e-05, + "loss": 2.1994, + "step": 2313 + }, + { + "epoch": 0.7102516881522406, + "grad_norm": 0.6725621223449707, + "learning_rate": 9.956032110562953e-05, + "loss": 2.2212, + "step": 2314 + }, + { + "epoch": 0.7105586249232658, + "grad_norm": 0.7116945385932922, + "learning_rate": 9.955966313284872e-05, + "loss": 2.2033, + "step": 2315 + }, + { + "epoch": 0.7108655616942909, + "grad_norm": 0.5906245112419128, + "learning_rate": 9.95590046702903e-05, + "loss": 2.1419, + "step": 2316 + }, + { + "epoch": 0.7111724984653162, + "grad_norm": 0.6911863684654236, + "learning_rate": 9.955834571796073e-05, + "loss": 2.1697, + "step": 2317 + }, + { + "epoch": 0.7114794352363413, + "grad_norm": 0.600350558757782, + "learning_rate": 9.955768627586655e-05, + "loss": 2.0864, + "step": 2318 + }, 
+ { + "epoch": 0.7117863720073665, + "grad_norm": 0.6246278285980225, + "learning_rate": 9.955702634401427e-05, + "loss": 2.1549, + "step": 2319 + }, + { + "epoch": 0.7120933087783916, + "grad_norm": 0.6530009508132935, + "learning_rate": 9.95563659224104e-05, + "loss": 2.1457, + "step": 2320 + }, + { + "epoch": 0.7124002455494168, + "grad_norm": 0.6566256880760193, + "learning_rate": 9.955570501106148e-05, + "loss": 2.1589, + "step": 2321 + }, + { + "epoch": 0.712707182320442, + "grad_norm": 0.6607041358947754, + "learning_rate": 9.955504360997404e-05, + "loss": 2.1692, + "step": 2322 + }, + { + "epoch": 0.7130141190914672, + "grad_norm": 0.7257810235023499, + "learning_rate": 9.95543817191546e-05, + "loss": 2.2067, + "step": 2323 + }, + { + "epoch": 0.7133210558624923, + "grad_norm": 0.7413349151611328, + "learning_rate": 9.955371933860973e-05, + "loss": 2.1817, + "step": 2324 + }, + { + "epoch": 0.7136279926335175, + "grad_norm": 0.6968317031860352, + "learning_rate": 9.955305646834596e-05, + "loss": 2.2574, + "step": 2325 + }, + { + "epoch": 0.7139349294045426, + "grad_norm": 0.8065732717514038, + "learning_rate": 9.955239310836983e-05, + "loss": 2.1957, + "step": 2326 + }, + { + "epoch": 0.7142418661755678, + "grad_norm": 0.7563133835792542, + "learning_rate": 9.955172925868792e-05, + "loss": 2.2113, + "step": 2327 + }, + { + "epoch": 0.714548802946593, + "grad_norm": 0.6790496110916138, + "learning_rate": 9.955106491930678e-05, + "loss": 2.103, + "step": 2328 + }, + { + "epoch": 0.7148557397176182, + "grad_norm": 0.65167236328125, + "learning_rate": 9.955040009023298e-05, + "loss": 2.1919, + "step": 2329 + }, + { + "epoch": 0.7151626764886433, + "grad_norm": 0.6869332790374756, + "learning_rate": 9.954973477147307e-05, + "loss": 2.2141, + "step": 2330 + }, + { + "epoch": 0.7154696132596685, + "grad_norm": 0.8613699078559875, + "learning_rate": 9.954906896303363e-05, + "loss": 2.1962, + "step": 2331 + }, + { + "epoch": 0.7157765500306936, + "grad_norm": 
0.8827282786369324, + "learning_rate": 9.954840266492127e-05, + "loss": 2.216, + "step": 2332 + }, + { + "epoch": 0.7160834868017188, + "grad_norm": 0.9737905263900757, + "learning_rate": 9.954773587714255e-05, + "loss": 2.2118, + "step": 2333 + }, + { + "epoch": 0.716390423572744, + "grad_norm": 0.9978635311126709, + "learning_rate": 9.954706859970404e-05, + "loss": 2.0998, + "step": 2334 + }, + { + "epoch": 0.7166973603437692, + "grad_norm": 0.8694623112678528, + "learning_rate": 9.954640083261238e-05, + "loss": 2.1533, + "step": 2335 + }, + { + "epoch": 0.7170042971147943, + "grad_norm": 0.641293466091156, + "learning_rate": 9.954573257587415e-05, + "loss": 2.2095, + "step": 2336 + }, + { + "epoch": 0.7173112338858195, + "grad_norm": 0.6289860010147095, + "learning_rate": 9.954506382949594e-05, + "loss": 2.1683, + "step": 2337 + }, + { + "epoch": 0.7176181706568447, + "grad_norm": 0.8292246460914612, + "learning_rate": 9.954439459348437e-05, + "loss": 2.1729, + "step": 2338 + }, + { + "epoch": 0.7179251074278699, + "grad_norm": 0.8990920782089233, + "learning_rate": 9.954372486784605e-05, + "loss": 2.0888, + "step": 2339 + }, + { + "epoch": 0.7182320441988951, + "grad_norm": 0.7905614376068115, + "learning_rate": 9.954305465258762e-05, + "loss": 2.2262, + "step": 2340 + }, + { + "epoch": 0.7185389809699202, + "grad_norm": 0.7142611145973206, + "learning_rate": 9.954238394771567e-05, + "loss": 2.1311, + "step": 2341 + }, + { + "epoch": 0.7188459177409454, + "grad_norm": 0.68161541223526, + "learning_rate": 9.954171275323684e-05, + "loss": 2.2622, + "step": 2342 + }, + { + "epoch": 0.7191528545119705, + "grad_norm": 0.7524895668029785, + "learning_rate": 9.954104106915779e-05, + "loss": 2.1709, + "step": 2343 + }, + { + "epoch": 0.7194597912829958, + "grad_norm": 0.7419885396957397, + "learning_rate": 9.954036889548511e-05, + "loss": 2.1528, + "step": 2344 + }, + { + "epoch": 0.7197667280540209, + "grad_norm": 0.8045634031295776, + "learning_rate": 
9.953969623222547e-05, + "loss": 2.1774, + "step": 2345 + }, + { + "epoch": 0.7200736648250461, + "grad_norm": 0.6680217385292053, + "learning_rate": 9.953902307938554e-05, + "loss": 2.2345, + "step": 2346 + }, + { + "epoch": 0.7203806015960712, + "grad_norm": 0.6900907754898071, + "learning_rate": 9.953834943697193e-05, + "loss": 2.1696, + "step": 2347 + }, + { + "epoch": 0.7206875383670964, + "grad_norm": 0.7231009006500244, + "learning_rate": 9.953767530499132e-05, + "loss": 2.2556, + "step": 2348 + }, + { + "epoch": 0.7209944751381215, + "grad_norm": 0.7766092419624329, + "learning_rate": 9.953700068345036e-05, + "loss": 2.1522, + "step": 2349 + }, + { + "epoch": 0.7213014119091468, + "grad_norm": 0.7361852526664734, + "learning_rate": 9.953632557235574e-05, + "loss": 2.2427, + "step": 2350 + }, + { + "epoch": 0.7216083486801719, + "grad_norm": 0.7170109152793884, + "learning_rate": 9.953564997171411e-05, + "loss": 2.2439, + "step": 2351 + }, + { + "epoch": 0.7219152854511971, + "grad_norm": 0.7192662954330444, + "learning_rate": 9.953497388153214e-05, + "loss": 2.1242, + "step": 2352 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.7363288402557373, + "learning_rate": 9.953429730181653e-05, + "loss": 2.2748, + "step": 2353 + }, + { + "epoch": 0.7225291589932474, + "grad_norm": 0.8516983985900879, + "learning_rate": 9.953362023257397e-05, + "loss": 2.2471, + "step": 2354 + }, + { + "epoch": 0.7228360957642725, + "grad_norm": 0.7928574681282043, + "learning_rate": 9.953294267381114e-05, + "loss": 2.164, + "step": 2355 + }, + { + "epoch": 0.7231430325352978, + "grad_norm": 0.6803320646286011, + "learning_rate": 9.953226462553474e-05, + "loss": 2.1671, + "step": 2356 + }, + { + "epoch": 0.7234499693063229, + "grad_norm": 0.6811994910240173, + "learning_rate": 9.953158608775147e-05, + "loss": 2.1042, + "step": 2357 + }, + { + "epoch": 0.7237569060773481, + "grad_norm": 0.6077840328216553, + "learning_rate": 9.953090706046804e-05, + "loss": 2.2161, + "step": 
2358 + }, + { + "epoch": 0.7240638428483732, + "grad_norm": 0.5938412547111511, + "learning_rate": 9.953022754369114e-05, + "loss": 2.1177, + "step": 2359 + }, + { + "epoch": 0.7243707796193984, + "grad_norm": 0.6752299070358276, + "learning_rate": 9.952954753742751e-05, + "loss": 2.2255, + "step": 2360 + }, + { + "epoch": 0.7246777163904236, + "grad_norm": 0.6745245456695557, + "learning_rate": 9.952886704168387e-05, + "loss": 2.1817, + "step": 2361 + }, + { + "epoch": 0.7249846531614488, + "grad_norm": 0.6645397543907166, + "learning_rate": 9.95281860564669e-05, + "loss": 2.2495, + "step": 2362 + }, + { + "epoch": 0.7252915899324739, + "grad_norm": 0.6758745312690735, + "learning_rate": 9.95275045817834e-05, + "loss": 2.2059, + "step": 2363 + }, + { + "epoch": 0.7255985267034991, + "grad_norm": 0.6584516763687134, + "learning_rate": 9.952682261764006e-05, + "loss": 2.1868, + "step": 2364 + }, + { + "epoch": 0.7259054634745242, + "grad_norm": 0.6335561871528625, + "learning_rate": 9.952614016404363e-05, + "loss": 2.1352, + "step": 2365 + }, + { + "epoch": 0.7262124002455494, + "grad_norm": 0.6656816601753235, + "learning_rate": 9.952545722100087e-05, + "loss": 2.1805, + "step": 2366 + }, + { + "epoch": 0.7265193370165746, + "grad_norm": 0.6262782216072083, + "learning_rate": 9.95247737885185e-05, + "loss": 2.1435, + "step": 2367 + }, + { + "epoch": 0.7268262737875998, + "grad_norm": 0.569795548915863, + "learning_rate": 9.952408986660329e-05, + "loss": 2.1547, + "step": 2368 + }, + { + "epoch": 0.7271332105586249, + "grad_norm": 0.5249118208885193, + "learning_rate": 9.952340545526199e-05, + "loss": 2.1213, + "step": 2369 + }, + { + "epoch": 0.7274401473296501, + "grad_norm": 0.5581740140914917, + "learning_rate": 9.952272055450139e-05, + "loss": 2.1866, + "step": 2370 + }, + { + "epoch": 0.7277470841006752, + "grad_norm": 0.5986969470977783, + "learning_rate": 9.952203516432821e-05, + "loss": 2.143, + "step": 2371 + }, + { + "epoch": 0.7280540208717005, + 
"grad_norm": 0.6426723599433899, + "learning_rate": 9.952134928474926e-05, + "loss": 2.2132, + "step": 2372 + }, + { + "epoch": 0.7283609576427256, + "grad_norm": 0.5856953263282776, + "learning_rate": 9.952066291577133e-05, + "loss": 2.1502, + "step": 2373 + }, + { + "epoch": 0.7286678944137508, + "grad_norm": 0.5420570969581604, + "learning_rate": 9.951997605740117e-05, + "loss": 2.1213, + "step": 2374 + }, + { + "epoch": 0.7289748311847759, + "grad_norm": 0.6201688647270203, + "learning_rate": 9.951928870964558e-05, + "loss": 2.218, + "step": 2375 + }, + { + "epoch": 0.7292817679558011, + "grad_norm": 0.7023850083351135, + "learning_rate": 9.951860087251137e-05, + "loss": 2.2787, + "step": 2376 + }, + { + "epoch": 0.7295887047268262, + "grad_norm": 0.733650803565979, + "learning_rate": 9.951791254600532e-05, + "loss": 2.1861, + "step": 2377 + }, + { + "epoch": 0.7298956414978515, + "grad_norm": 0.7177363038063049, + "learning_rate": 9.951722373013421e-05, + "loss": 2.1905, + "step": 2378 + }, + { + "epoch": 0.7302025782688766, + "grad_norm": 0.7963547706604004, + "learning_rate": 9.95165344249049e-05, + "loss": 2.1842, + "step": 2379 + }, + { + "epoch": 0.7305095150399018, + "grad_norm": 0.8466546535491943, + "learning_rate": 9.951584463032416e-05, + "loss": 2.1661, + "step": 2380 + }, + { + "epoch": 0.7308164518109269, + "grad_norm": 0.7288870811462402, + "learning_rate": 9.951515434639882e-05, + "loss": 2.1153, + "step": 2381 + }, + { + "epoch": 0.7311233885819521, + "grad_norm": 0.6168704032897949, + "learning_rate": 9.951446357313571e-05, + "loss": 2.121, + "step": 2382 + }, + { + "epoch": 0.7314303253529773, + "grad_norm": 0.6534848809242249, + "learning_rate": 9.951377231054166e-05, + "loss": 2.2087, + "step": 2383 + }, + { + "epoch": 0.7317372621240025, + "grad_norm": 0.7872020602226257, + "learning_rate": 9.951308055862347e-05, + "loss": 2.2428, + "step": 2384 + }, + { + "epoch": 0.7320441988950276, + "grad_norm": 0.864799439907074, + "learning_rate": 
9.9512388317388e-05, + "loss": 2.2392, + "step": 2385 + }, + { + "epoch": 0.7323511356660528, + "grad_norm": 0.7365485429763794, + "learning_rate": 9.95116955868421e-05, + "loss": 2.1614, + "step": 2386 + }, + { + "epoch": 0.7326580724370779, + "grad_norm": 0.6509390473365784, + "learning_rate": 9.95110023669926e-05, + "loss": 2.1917, + "step": 2387 + }, + { + "epoch": 0.7329650092081031, + "grad_norm": 0.7660403847694397, + "learning_rate": 9.951030865784635e-05, + "loss": 2.2414, + "step": 2388 + }, + { + "epoch": 0.7332719459791283, + "grad_norm": 0.9997872114181519, + "learning_rate": 9.950961445941022e-05, + "loss": 2.2063, + "step": 2389 + }, + { + "epoch": 0.7335788827501535, + "grad_norm": 1.0113418102264404, + "learning_rate": 9.950891977169106e-05, + "loss": 2.1898, + "step": 2390 + }, + { + "epoch": 0.7338858195211786, + "grad_norm": 0.8849206566810608, + "learning_rate": 9.950822459469573e-05, + "loss": 2.1503, + "step": 2391 + }, + { + "epoch": 0.7341927562922038, + "grad_norm": 0.6561055779457092, + "learning_rate": 9.950752892843112e-05, + "loss": 2.1234, + "step": 2392 + }, + { + "epoch": 0.7344996930632289, + "grad_norm": 0.5568758845329285, + "learning_rate": 9.950683277290407e-05, + "loss": 2.2129, + "step": 2393 + }, + { + "epoch": 0.7348066298342542, + "grad_norm": 0.7019078135490417, + "learning_rate": 9.950613612812149e-05, + "loss": 2.1162, + "step": 2394 + }, + { + "epoch": 0.7351135666052793, + "grad_norm": 0.7633521556854248, + "learning_rate": 9.950543899409026e-05, + "loss": 2.2427, + "step": 2395 + }, + { + "epoch": 0.7354205033763045, + "grad_norm": 0.6743205785751343, + "learning_rate": 9.950474137081726e-05, + "loss": 2.2213, + "step": 2396 + }, + { + "epoch": 0.7357274401473296, + "grad_norm": 0.6008336544036865, + "learning_rate": 9.950404325830941e-05, + "loss": 2.1605, + "step": 2397 + }, + { + "epoch": 0.7360343769183548, + "grad_norm": 0.648760199546814, + "learning_rate": 9.950334465657357e-05, + "loss": 2.2298, + "step": 
2398 + }, + { + "epoch": 0.7363413136893799, + "grad_norm": 0.6996559500694275, + "learning_rate": 9.950264556561667e-05, + "loss": 2.1616, + "step": 2399 + }, + { + "epoch": 0.7366482504604052, + "grad_norm": 0.741629421710968, + "learning_rate": 9.950194598544561e-05, + "loss": 2.2162, + "step": 2400 + }, + { + "epoch": 0.7369551872314303, + "grad_norm": 0.6144673824310303, + "learning_rate": 9.95012459160673e-05, + "loss": 2.15, + "step": 2401 + }, + { + "epoch": 0.7372621240024555, + "grad_norm": 0.5826541781425476, + "learning_rate": 9.950054535748867e-05, + "loss": 2.1792, + "step": 2402 + }, + { + "epoch": 0.7375690607734806, + "grad_norm": 0.6489288806915283, + "learning_rate": 9.949984430971665e-05, + "loss": 2.1703, + "step": 2403 + }, + { + "epoch": 0.7378759975445058, + "grad_norm": 0.6752250790596008, + "learning_rate": 9.949914277275814e-05, + "loss": 2.2561, + "step": 2404 + }, + { + "epoch": 0.738182934315531, + "grad_norm": 0.5570092797279358, + "learning_rate": 9.94984407466201e-05, + "loss": 2.1418, + "step": 2405 + }, + { + "epoch": 0.7384898710865562, + "grad_norm": 0.5966812968254089, + "learning_rate": 9.949773823130944e-05, + "loss": 2.2168, + "step": 2406 + }, + { + "epoch": 0.7387968078575813, + "grad_norm": 0.6253142952919006, + "learning_rate": 9.949703522683314e-05, + "loss": 2.1646, + "step": 2407 + }, + { + "epoch": 0.7391037446286065, + "grad_norm": 0.6673659086227417, + "learning_rate": 9.94963317331981e-05, + "loss": 2.1904, + "step": 2408 + }, + { + "epoch": 0.7394106813996317, + "grad_norm": 0.6243279576301575, + "learning_rate": 9.949562775041133e-05, + "loss": 2.2568, + "step": 2409 + }, + { + "epoch": 0.7397176181706568, + "grad_norm": 0.7014298439025879, + "learning_rate": 9.949492327847973e-05, + "loss": 2.2331, + "step": 2410 + }, + { + "epoch": 0.7400245549416821, + "grad_norm": 0.698403537273407, + "learning_rate": 9.94942183174103e-05, + "loss": 2.1928, + "step": 2411 + }, + { + "epoch": 0.7403314917127072, + 
"grad_norm": 0.6354022026062012, + "learning_rate": 9.949351286721001e-05, + "loss": 2.0975, + "step": 2412 + }, + { + "epoch": 0.7406384284837324, + "grad_norm": 0.595302164554596, + "learning_rate": 9.949280692788579e-05, + "loss": 2.177, + "step": 2413 + }, + { + "epoch": 0.7409453652547575, + "grad_norm": 0.6844484210014343, + "learning_rate": 9.949210049944465e-05, + "loss": 2.1962, + "step": 2414 + }, + { + "epoch": 0.7412523020257827, + "grad_norm": 0.6242616176605225, + "learning_rate": 9.949139358189357e-05, + "loss": 2.2143, + "step": 2415 + }, + { + "epoch": 0.7415592387968079, + "grad_norm": 0.6524595022201538, + "learning_rate": 9.949068617523954e-05, + "loss": 2.1438, + "step": 2416 + }, + { + "epoch": 0.7418661755678331, + "grad_norm": 0.6667510867118835, + "learning_rate": 9.948997827948953e-05, + "loss": 2.2115, + "step": 2417 + }, + { + "epoch": 0.7421731123388582, + "grad_norm": 0.7688906192779541, + "learning_rate": 9.948926989465056e-05, + "loss": 2.1887, + "step": 2418 + }, + { + "epoch": 0.7424800491098834, + "grad_norm": 0.6888165473937988, + "learning_rate": 9.948856102072958e-05, + "loss": 2.1349, + "step": 2419 + }, + { + "epoch": 0.7427869858809085, + "grad_norm": 0.5672495365142822, + "learning_rate": 9.948785165773367e-05, + "loss": 2.1109, + "step": 2420 + }, + { + "epoch": 0.7430939226519337, + "grad_norm": 0.5714489221572876, + "learning_rate": 9.94871418056698e-05, + "loss": 2.1483, + "step": 2421 + }, + { + "epoch": 0.7434008594229589, + "grad_norm": 0.6061533093452454, + "learning_rate": 9.948643146454498e-05, + "loss": 2.211, + "step": 2422 + }, + { + "epoch": 0.7437077961939841, + "grad_norm": 0.6132726073265076, + "learning_rate": 9.948572063436625e-05, + "loss": 2.23, + "step": 2423 + }, + { + "epoch": 0.7440147329650092, + "grad_norm": 0.684301495552063, + "learning_rate": 9.948500931514062e-05, + "loss": 2.129, + "step": 2424 + }, + { + "epoch": 0.7443216697360344, + "grad_norm": 0.6325442790985107, + "learning_rate": 
9.948429750687512e-05, + "loss": 2.129, + "step": 2425 + }, + { + "epoch": 0.7446286065070595, + "grad_norm": 0.6245989203453064, + "learning_rate": 9.948358520957678e-05, + "loss": 2.1999, + "step": 2426 + }, + { + "epoch": 0.7449355432780848, + "grad_norm": 0.6638534069061279, + "learning_rate": 9.948287242325267e-05, + "loss": 2.203, + "step": 2427 + }, + { + "epoch": 0.7452424800491099, + "grad_norm": 0.6121437549591064, + "learning_rate": 9.94821591479098e-05, + "loss": 2.1204, + "step": 2428 + }, + { + "epoch": 0.7455494168201351, + "grad_norm": 0.7919846177101135, + "learning_rate": 9.948144538355522e-05, + "loss": 2.2353, + "step": 2429 + }, + { + "epoch": 0.7458563535911602, + "grad_norm": 0.7246984839439392, + "learning_rate": 9.948073113019602e-05, + "loss": 2.1284, + "step": 2430 + }, + { + "epoch": 0.7461632903621854, + "grad_norm": 0.6120265126228333, + "learning_rate": 9.948001638783921e-05, + "loss": 2.0873, + "step": 2431 + }, + { + "epoch": 0.7464702271332105, + "grad_norm": 0.628588080406189, + "learning_rate": 9.947930115649189e-05, + "loss": 2.1713, + "step": 2432 + }, + { + "epoch": 0.7467771639042358, + "grad_norm": 0.63116854429245, + "learning_rate": 9.947858543616111e-05, + "loss": 2.123, + "step": 2433 + }, + { + "epoch": 0.7470841006752609, + "grad_norm": 0.6533017754554749, + "learning_rate": 9.947786922685394e-05, + "loss": 2.1593, + "step": 2434 + }, + { + "epoch": 0.7473910374462861, + "grad_norm": 0.6854177117347717, + "learning_rate": 9.947715252857749e-05, + "loss": 2.162, + "step": 2435 + }, + { + "epoch": 0.7476979742173112, + "grad_norm": 0.7257967591285706, + "learning_rate": 9.94764353413388e-05, + "loss": 2.2644, + "step": 2436 + }, + { + "epoch": 0.7480049109883364, + "grad_norm": 0.6806700825691223, + "learning_rate": 9.947571766514498e-05, + "loss": 2.0875, + "step": 2437 + }, + { + "epoch": 0.7483118477593615, + "grad_norm": 0.6616181135177612, + "learning_rate": 9.947499950000312e-05, + "loss": 2.1353, + "step": 2438 + 
}, + { + "epoch": 0.7486187845303868, + "grad_norm": 0.7249685525894165, + "learning_rate": 9.947428084592032e-05, + "loss": 2.148, + "step": 2439 + }, + { + "epoch": 0.7489257213014119, + "grad_norm": 0.6372905969619751, + "learning_rate": 9.947356170290369e-05, + "loss": 2.1749, + "step": 2440 + }, + { + "epoch": 0.7492326580724371, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.947284207096031e-05, + "loss": 2.1909, + "step": 2441 + }, + { + "epoch": 0.7495395948434622, + "grad_norm": 0.5830507278442383, + "learning_rate": 9.94721219500973e-05, + "loss": 2.1351, + "step": 2442 + }, + { + "epoch": 0.7498465316144874, + "grad_norm": 0.650262713432312, + "learning_rate": 9.94714013403218e-05, + "loss": 2.2602, + "step": 2443 + }, + { + "epoch": 0.7501534683855126, + "grad_norm": 0.6658717393875122, + "learning_rate": 9.947068024164091e-05, + "loss": 2.0919, + "step": 2444 + }, + { + "epoch": 0.7504604051565378, + "grad_norm": 0.7299105525016785, + "learning_rate": 9.946995865406177e-05, + "loss": 2.2079, + "step": 2445 + }, + { + "epoch": 0.7507673419275629, + "grad_norm": 0.762246310710907, + "learning_rate": 9.946923657759148e-05, + "loss": 2.2225, + "step": 2446 + }, + { + "epoch": 0.7510742786985881, + "grad_norm": 0.7019835710525513, + "learning_rate": 9.946851401223722e-05, + "loss": 2.175, + "step": 2447 + }, + { + "epoch": 0.7513812154696132, + "grad_norm": 0.6214791536331177, + "learning_rate": 9.946779095800611e-05, + "loss": 2.2095, + "step": 2448 + }, + { + "epoch": 0.7516881522406385, + "grad_norm": 0.6380667090415955, + "learning_rate": 9.94670674149053e-05, + "loss": 2.2325, + "step": 2449 + }, + { + "epoch": 0.7519950890116636, + "grad_norm": 0.6175886392593384, + "learning_rate": 9.946634338294191e-05, + "loss": 2.1431, + "step": 2450 + }, + { + "epoch": 0.7523020257826888, + "grad_norm": 0.6642621159553528, + "learning_rate": 9.946561886212315e-05, + "loss": 2.1538, + "step": 2451 + }, + { + "epoch": 0.7526089625537139, + "grad_norm": 
0.7078617215156555, + "learning_rate": 9.946489385245614e-05, + "loss": 2.1544, + "step": 2452 + }, + { + "epoch": 0.7529158993247391, + "grad_norm": 0.6939398050308228, + "learning_rate": 9.946416835394806e-05, + "loss": 2.1131, + "step": 2453 + }, + { + "epoch": 0.7532228360957642, + "grad_norm": 0.7080716490745544, + "learning_rate": 9.946344236660608e-05, + "loss": 2.2135, + "step": 2454 + }, + { + "epoch": 0.7535297728667895, + "grad_norm": 0.7451115250587463, + "learning_rate": 9.946271589043736e-05, + "loss": 2.1475, + "step": 2455 + }, + { + "epoch": 0.7538367096378146, + "grad_norm": 0.6718367338180542, + "learning_rate": 9.946198892544909e-05, + "loss": 2.1853, + "step": 2456 + }, + { + "epoch": 0.7541436464088398, + "grad_norm": 0.7071637511253357, + "learning_rate": 9.946126147164847e-05, + "loss": 2.0981, + "step": 2457 + }, + { + "epoch": 0.7544505831798649, + "grad_norm": 0.6745624542236328, + "learning_rate": 9.946053352904267e-05, + "loss": 2.1914, + "step": 2458 + }, + { + "epoch": 0.7547575199508901, + "grad_norm": 0.7267486453056335, + "learning_rate": 9.945980509763888e-05, + "loss": 2.1091, + "step": 2459 + }, + { + "epoch": 0.7550644567219152, + "grad_norm": 0.6128695607185364, + "learning_rate": 9.94590761774443e-05, + "loss": 2.1721, + "step": 2460 + }, + { + "epoch": 0.7553713934929405, + "grad_norm": 0.6574678421020508, + "learning_rate": 9.945834676846615e-05, + "loss": 2.1609, + "step": 2461 + }, + { + "epoch": 0.7556783302639656, + "grad_norm": 0.6209995150566101, + "learning_rate": 9.945761687071164e-05, + "loss": 2.1889, + "step": 2462 + }, + { + "epoch": 0.7559852670349908, + "grad_norm": 0.7425361275672913, + "learning_rate": 9.945688648418795e-05, + "loss": 2.2189, + "step": 2463 + }, + { + "epoch": 0.7562922038060159, + "grad_norm": 1.0604934692382812, + "learning_rate": 9.945615560890234e-05, + "loss": 2.1858, + "step": 2464 + }, + { + "epoch": 0.7565991405770411, + "grad_norm": 0.7162829041481018, + "learning_rate": 
9.945542424486201e-05, + "loss": 2.101, + "step": 2465 + }, + { + "epoch": 0.7569060773480663, + "grad_norm": 0.6361207962036133, + "learning_rate": 9.945469239207416e-05, + "loss": 2.0836, + "step": 2466 + }, + { + "epoch": 0.7572130141190915, + "grad_norm": 0.5858156085014343, + "learning_rate": 9.945396005054609e-05, + "loss": 2.2059, + "step": 2467 + }, + { + "epoch": 0.7575199508901166, + "grad_norm": 0.7322074174880981, + "learning_rate": 9.945322722028498e-05, + "loss": 2.2295, + "step": 2468 + }, + { + "epoch": 0.7578268876611418, + "grad_norm": 0.775900661945343, + "learning_rate": 9.945249390129811e-05, + "loss": 2.2171, + "step": 2469 + }, + { + "epoch": 0.7581338244321669, + "grad_norm": 0.8801379799842834, + "learning_rate": 9.94517600935927e-05, + "loss": 2.1632, + "step": 2470 + }, + { + "epoch": 0.7584407612031921, + "grad_norm": 0.8258405923843384, + "learning_rate": 9.945102579717602e-05, + "loss": 2.1591, + "step": 2471 + }, + { + "epoch": 0.7587476979742173, + "grad_norm": 0.7472482323646545, + "learning_rate": 9.945029101205532e-05, + "loss": 2.2242, + "step": 2472 + }, + { + "epoch": 0.7590546347452425, + "grad_norm": 0.6594643592834473, + "learning_rate": 9.944955573823785e-05, + "loss": 2.1217, + "step": 2473 + }, + { + "epoch": 0.7593615715162676, + "grad_norm": 0.6547524333000183, + "learning_rate": 9.944881997573088e-05, + "loss": 2.131, + "step": 2474 + }, + { + "epoch": 0.7596685082872928, + "grad_norm": 0.6630129814147949, + "learning_rate": 9.94480837245417e-05, + "loss": 2.1264, + "step": 2475 + }, + { + "epoch": 0.7599754450583179, + "grad_norm": 0.6877384781837463, + "learning_rate": 9.944734698467757e-05, + "loss": 2.2453, + "step": 2476 + }, + { + "epoch": 0.7602823818293432, + "grad_norm": 0.6736158728599548, + "learning_rate": 9.944660975614579e-05, + "loss": 2.1425, + "step": 2477 + }, + { + "epoch": 0.7605893186003683, + "grad_norm": 0.6140786409378052, + "learning_rate": 9.944587203895361e-05, + "loss": 2.1345, + "step": 
2478 + }, + { + "epoch": 0.7608962553713935, + "grad_norm": 0.5515910387039185, + "learning_rate": 9.944513383310837e-05, + "loss": 2.086, + "step": 2479 + }, + { + "epoch": 0.7612031921424187, + "grad_norm": 0.49419671297073364, + "learning_rate": 9.944439513861731e-05, + "loss": 2.1069, + "step": 2480 + }, + { + "epoch": 0.7615101289134438, + "grad_norm": 0.5526577234268188, + "learning_rate": 9.944365595548777e-05, + "loss": 2.1702, + "step": 2481 + }, + { + "epoch": 0.761817065684469, + "grad_norm": 0.5430580973625183, + "learning_rate": 9.944291628372702e-05, + "loss": 2.121, + "step": 2482 + }, + { + "epoch": 0.7621240024554942, + "grad_norm": 0.5333554148674011, + "learning_rate": 9.94421761233424e-05, + "loss": 2.1154, + "step": 2483 + }, + { + "epoch": 0.7624309392265194, + "grad_norm": 0.5856761932373047, + "learning_rate": 9.944143547434124e-05, + "loss": 2.1734, + "step": 2484 + }, + { + "epoch": 0.7627378759975445, + "grad_norm": 0.6619083881378174, + "learning_rate": 9.944069433673082e-05, + "loss": 2.2068, + "step": 2485 + }, + { + "epoch": 0.7630448127685697, + "grad_norm": 0.5791018009185791, + "learning_rate": 9.943995271051849e-05, + "loss": 2.0834, + "step": 2486 + }, + { + "epoch": 0.7633517495395948, + "grad_norm": 0.5942522287368774, + "learning_rate": 9.943921059571155e-05, + "loss": 2.2001, + "step": 2487 + }, + { + "epoch": 0.7636586863106201, + "grad_norm": 0.6285880208015442, + "learning_rate": 9.943846799231738e-05, + "loss": 2.1601, + "step": 2488 + }, + { + "epoch": 0.7639656230816452, + "grad_norm": 0.6337715983390808, + "learning_rate": 9.943772490034326e-05, + "loss": 2.1722, + "step": 2489 + }, + { + "epoch": 0.7642725598526704, + "grad_norm": 0.6912121772766113, + "learning_rate": 9.94369813197966e-05, + "loss": 2.1933, + "step": 2490 + }, + { + "epoch": 0.7645794966236955, + "grad_norm": 0.8028284311294556, + "learning_rate": 9.943623725068469e-05, + "loss": 2.129, + "step": 2491 + }, + { + "epoch": 0.7648864333947207, + 
"grad_norm": 0.8527138233184814, + "learning_rate": 9.943549269301491e-05, + "loss": 2.1846, + "step": 2492 + }, + { + "epoch": 0.7651933701657458, + "grad_norm": 0.8422580361366272, + "learning_rate": 9.943474764679462e-05, + "loss": 2.2958, + "step": 2493 + }, + { + "epoch": 0.7655003069367711, + "grad_norm": 0.7698150873184204, + "learning_rate": 9.943400211203118e-05, + "loss": 2.1415, + "step": 2494 + }, + { + "epoch": 0.7658072437077962, + "grad_norm": 0.6360690593719482, + "learning_rate": 9.943325608873196e-05, + "loss": 2.1188, + "step": 2495 + }, + { + "epoch": 0.7661141804788214, + "grad_norm": 0.6225799918174744, + "learning_rate": 9.943250957690433e-05, + "loss": 2.1006, + "step": 2496 + }, + { + "epoch": 0.7664211172498465, + "grad_norm": 0.6694490909576416, + "learning_rate": 9.943176257655567e-05, + "loss": 2.2455, + "step": 2497 + }, + { + "epoch": 0.7667280540208717, + "grad_norm": 0.6188158988952637, + "learning_rate": 9.943101508769335e-05, + "loss": 2.0853, + "step": 2498 + }, + { + "epoch": 0.7670349907918969, + "grad_norm": 0.5934504866600037, + "learning_rate": 9.943026711032477e-05, + "loss": 2.0718, + "step": 2499 + }, + { + "epoch": 0.7673419275629221, + "grad_norm": 0.6261292695999146, + "learning_rate": 9.942951864445732e-05, + "loss": 2.1747, + "step": 2500 + }, + { + "epoch": 0.7676488643339472, + "grad_norm": 0.5891184210777283, + "learning_rate": 9.94287696900984e-05, + "loss": 2.1637, + "step": 2501 + }, + { + "epoch": 0.7679558011049724, + "grad_norm": 0.5321740508079529, + "learning_rate": 9.94280202472554e-05, + "loss": 2.0717, + "step": 2502 + }, + { + "epoch": 0.7682627378759975, + "grad_norm": 0.5563281178474426, + "learning_rate": 9.942727031593573e-05, + "loss": 2.1654, + "step": 2503 + }, + { + "epoch": 0.7685696746470227, + "grad_norm": 0.5672664046287537, + "learning_rate": 9.942651989614681e-05, + "loss": 2.0853, + "step": 2504 + }, + { + "epoch": 0.7688766114180479, + "grad_norm": 0.5058484077453613, + "learning_rate": 
9.942576898789606e-05, + "loss": 2.0636, + "step": 2505 + }, + { + "epoch": 0.7691835481890731, + "grad_norm": 0.5802470445632935, + "learning_rate": 9.942501759119088e-05, + "loss": 2.0924, + "step": 2506 + }, + { + "epoch": 0.7694904849600982, + "grad_norm": 0.5630003213882446, + "learning_rate": 9.94242657060387e-05, + "loss": 2.1975, + "step": 2507 + }, + { + "epoch": 0.7697974217311234, + "grad_norm": 0.6001835465431213, + "learning_rate": 9.942351333244697e-05, + "loss": 2.1187, + "step": 2508 + }, + { + "epoch": 0.7701043585021485, + "grad_norm": 0.6702088117599487, + "learning_rate": 9.942276047042311e-05, + "loss": 2.1489, + "step": 2509 + }, + { + "epoch": 0.7704112952731738, + "grad_norm": 0.7941808700561523, + "learning_rate": 9.942200711997456e-05, + "loss": 2.1404, + "step": 2510 + }, + { + "epoch": 0.7707182320441989, + "grad_norm": 0.8202539682388306, + "learning_rate": 9.942125328110876e-05, + "loss": 2.1242, + "step": 2511 + }, + { + "epoch": 0.7710251688152241, + "grad_norm": 0.7667655348777771, + "learning_rate": 9.942049895383319e-05, + "loss": 2.118, + "step": 2512 + }, + { + "epoch": 0.7713321055862492, + "grad_norm": 0.6766887307167053, + "learning_rate": 9.941974413815527e-05, + "loss": 2.2632, + "step": 2513 + }, + { + "epoch": 0.7716390423572744, + "grad_norm": 0.5923287272453308, + "learning_rate": 9.941898883408248e-05, + "loss": 2.1096, + "step": 2514 + }, + { + "epoch": 0.7719459791282995, + "grad_norm": 0.8847586512565613, + "learning_rate": 9.941823304162227e-05, + "loss": 2.2629, + "step": 2515 + }, + { + "epoch": 0.7722529158993248, + "grad_norm": 1.2274069786071777, + "learning_rate": 9.941747676078211e-05, + "loss": 2.2493, + "step": 2516 + }, + { + "epoch": 0.7725598526703499, + "grad_norm": 0.8637729287147522, + "learning_rate": 9.94167199915695e-05, + "loss": 2.1545, + "step": 2517 + }, + { + "epoch": 0.7728667894413751, + "grad_norm": 0.7852178812026978, + "learning_rate": 9.941596273399187e-05, + "loss": 2.1984, + "step": 
2518 + }, + { + "epoch": 0.7731737262124002, + "grad_norm": 0.6839576959609985, + "learning_rate": 9.941520498805677e-05, + "loss": 2.1913, + "step": 2519 + }, + { + "epoch": 0.7734806629834254, + "grad_norm": 0.7051649689674377, + "learning_rate": 9.941444675377163e-05, + "loss": 2.1678, + "step": 2520 + }, + { + "epoch": 0.7737875997544506, + "grad_norm": 0.702549159526825, + "learning_rate": 9.941368803114395e-05, + "loss": 2.1426, + "step": 2521 + }, + { + "epoch": 0.7740945365254758, + "grad_norm": 0.6717942953109741, + "learning_rate": 9.941292882018127e-05, + "loss": 2.1873, + "step": 2522 + }, + { + "epoch": 0.7744014732965009, + "grad_norm": 0.6705282926559448, + "learning_rate": 9.941216912089104e-05, + "loss": 2.1363, + "step": 2523 + }, + { + "epoch": 0.7747084100675261, + "grad_norm": 0.5858317017555237, + "learning_rate": 9.941140893328082e-05, + "loss": 2.1019, + "step": 2524 + }, + { + "epoch": 0.7750153468385512, + "grad_norm": 0.6353682279586792, + "learning_rate": 9.941064825735808e-05, + "loss": 2.1765, + "step": 2525 + }, + { + "epoch": 0.7753222836095764, + "grad_norm": 0.6573354601860046, + "learning_rate": 9.940988709313035e-05, + "loss": 2.0636, + "step": 2526 + }, + { + "epoch": 0.7756292203806016, + "grad_norm": 0.6040489077568054, + "learning_rate": 9.940912544060517e-05, + "loss": 2.0902, + "step": 2527 + }, + { + "epoch": 0.7759361571516268, + "grad_norm": 0.7024530172348022, + "learning_rate": 9.940836329979004e-05, + "loss": 2.2198, + "step": 2528 + }, + { + "epoch": 0.7762430939226519, + "grad_norm": 0.6910196542739868, + "learning_rate": 9.940760067069251e-05, + "loss": 2.0546, + "step": 2529 + }, + { + "epoch": 0.7765500306936771, + "grad_norm": 0.6841506361961365, + "learning_rate": 9.940683755332012e-05, + "loss": 2.2159, + "step": 2530 + }, + { + "epoch": 0.7768569674647022, + "grad_norm": 0.6503066420555115, + "learning_rate": 9.940607394768038e-05, + "loss": 2.2156, + "step": 2531 + }, + { + "epoch": 0.7771639042357275, + 
"grad_norm": 0.6512146592140198, + "learning_rate": 9.940530985378089e-05, + "loss": 2.1417, + "step": 2532 + }, + { + "epoch": 0.7774708410067526, + "grad_norm": 0.6234787106513977, + "learning_rate": 9.940454527162914e-05, + "loss": 2.1315, + "step": 2533 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.6279457211494446, + "learning_rate": 9.940378020123273e-05, + "loss": 2.2699, + "step": 2534 + }, + { + "epoch": 0.7780847145488029, + "grad_norm": 0.6793956160545349, + "learning_rate": 9.940301464259921e-05, + "loss": 2.2488, + "step": 2535 + }, + { + "epoch": 0.7783916513198281, + "grad_norm": 0.721234142780304, + "learning_rate": 9.940224859573614e-05, + "loss": 2.1183, + "step": 2536 + }, + { + "epoch": 0.7786985880908532, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.94014820606511e-05, + "loss": 2.0995, + "step": 2537 + }, + { + "epoch": 0.7790055248618785, + "grad_norm": 0.6358578205108643, + "learning_rate": 9.940071503735165e-05, + "loss": 2.2024, + "step": 2538 + }, + { + "epoch": 0.7793124616329036, + "grad_norm": 0.6250868439674377, + "learning_rate": 9.939994752584538e-05, + "loss": 2.1574, + "step": 2539 + }, + { + "epoch": 0.7796193984039288, + "grad_norm": 0.7657763361930847, + "learning_rate": 9.939917952613989e-05, + "loss": 2.2625, + "step": 2540 + }, + { + "epoch": 0.7799263351749539, + "grad_norm": 0.7625400424003601, + "learning_rate": 9.939841103824275e-05, + "loss": 2.1809, + "step": 2541 + }, + { + "epoch": 0.7802332719459791, + "grad_norm": 0.8593107461929321, + "learning_rate": 9.939764206216155e-05, + "loss": 2.2359, + "step": 2542 + }, + { + "epoch": 0.7805402087170042, + "grad_norm": 0.8441007733345032, + "learning_rate": 9.93968725979039e-05, + "loss": 2.1844, + "step": 2543 + }, + { + "epoch": 0.7808471454880295, + "grad_norm": 0.6408470273017883, + "learning_rate": 9.93961026454774e-05, + "loss": 2.1871, + "step": 2544 + }, + { + "epoch": 0.7811540822590546, + "grad_norm": 0.6779976487159729, + "learning_rate": 
9.939533220488966e-05, + "loss": 2.1651, + "step": 2545 + }, + { + "epoch": 0.7814610190300798, + "grad_norm": 0.5885556936264038, + "learning_rate": 9.93945612761483e-05, + "loss": 2.0172, + "step": 2546 + }, + { + "epoch": 0.7817679558011049, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.939378985926094e-05, + "loss": 2.1358, + "step": 2547 + }, + { + "epoch": 0.7820748925721301, + "grad_norm": 0.685183584690094, + "learning_rate": 9.939301795423519e-05, + "loss": 2.1822, + "step": 2548 + }, + { + "epoch": 0.7823818293431553, + "grad_norm": 0.6666997671127319, + "learning_rate": 9.939224556107869e-05, + "loss": 2.288, + "step": 2549 + }, + { + "epoch": 0.7826887661141805, + "grad_norm": 0.6401170492172241, + "learning_rate": 9.939147267979905e-05, + "loss": 2.1038, + "step": 2550 + }, + { + "epoch": 0.7829957028852057, + "grad_norm": 0.645182728767395, + "learning_rate": 9.939069931040396e-05, + "loss": 2.1285, + "step": 2551 + }, + { + "epoch": 0.7833026396562308, + "grad_norm": 0.6795851588249207, + "learning_rate": 9.9389925452901e-05, + "loss": 2.1844, + "step": 2552 + }, + { + "epoch": 0.783609576427256, + "grad_norm": 0.7027488946914673, + "learning_rate": 9.938915110729788e-05, + "loss": 2.1712, + "step": 2553 + }, + { + "epoch": 0.7839165131982812, + "grad_norm": 0.7076524496078491, + "learning_rate": 9.93883762736022e-05, + "loss": 2.1812, + "step": 2554 + }, + { + "epoch": 0.7842234499693064, + "grad_norm": 0.5979459881782532, + "learning_rate": 9.938760095182165e-05, + "loss": 2.0877, + "step": 2555 + }, + { + "epoch": 0.7845303867403315, + "grad_norm": 0.6408665776252747, + "learning_rate": 9.938682514196387e-05, + "loss": 2.191, + "step": 2556 + }, + { + "epoch": 0.7848373235113567, + "grad_norm": 0.6545908451080322, + "learning_rate": 9.938604884403654e-05, + "loss": 2.0933, + "step": 2557 + }, + { + "epoch": 0.7851442602823818, + "grad_norm": 0.7271838784217834, + "learning_rate": 9.938527205804733e-05, + "loss": 2.1804, + "step": 2558 + 
}, + { + "epoch": 0.785451197053407, + "grad_norm": 0.6371840834617615, + "learning_rate": 9.938449478400391e-05, + "loss": 2.1161, + "step": 2559 + }, + { + "epoch": 0.7857581338244322, + "grad_norm": 0.5922467708587646, + "learning_rate": 9.938371702191398e-05, + "loss": 2.0929, + "step": 2560 + }, + { + "epoch": 0.7860650705954574, + "grad_norm": 0.536125898361206, + "learning_rate": 9.938293877178522e-05, + "loss": 2.0815, + "step": 2561 + }, + { + "epoch": 0.7863720073664825, + "grad_norm": 0.6026225090026855, + "learning_rate": 9.93821600336253e-05, + "loss": 2.1719, + "step": 2562 + }, + { + "epoch": 0.7866789441375077, + "grad_norm": 0.584267795085907, + "learning_rate": 9.938138080744192e-05, + "loss": 2.1515, + "step": 2563 + }, + { + "epoch": 0.7869858809085328, + "grad_norm": 0.6616362929344177, + "learning_rate": 9.938060109324281e-05, + "loss": 2.2425, + "step": 2564 + }, + { + "epoch": 0.787292817679558, + "grad_norm": 0.669987678527832, + "learning_rate": 9.937982089103566e-05, + "loss": 2.1883, + "step": 2565 + }, + { + "epoch": 0.7875997544505832, + "grad_norm": 0.6769465208053589, + "learning_rate": 9.937904020082815e-05, + "loss": 2.1508, + "step": 2566 + }, + { + "epoch": 0.7879066912216084, + "grad_norm": 0.5796112418174744, + "learning_rate": 9.937825902262805e-05, + "loss": 2.0925, + "step": 2567 + }, + { + "epoch": 0.7882136279926335, + "grad_norm": 0.5895870923995972, + "learning_rate": 9.937747735644305e-05, + "loss": 2.1002, + "step": 2568 + }, + { + "epoch": 0.7885205647636587, + "grad_norm": 0.5870219469070435, + "learning_rate": 9.937669520228088e-05, + "loss": 2.1189, + "step": 2569 + }, + { + "epoch": 0.7888275015346838, + "grad_norm": 0.6191404461860657, + "learning_rate": 9.937591256014925e-05, + "loss": 2.1783, + "step": 2570 + }, + { + "epoch": 0.7891344383057091, + "grad_norm": 0.6033806204795837, + "learning_rate": 9.937512943005592e-05, + "loss": 2.1507, + "step": 2571 + }, + { + "epoch": 0.7894413750767342, + "grad_norm": 
0.6319470405578613, + "learning_rate": 9.937434581200863e-05, + "loss": 2.2088, + "step": 2572 + }, + { + "epoch": 0.7897483118477594, + "grad_norm": 0.621004581451416, + "learning_rate": 9.93735617060151e-05, + "loss": 2.1523, + "step": 2573 + }, + { + "epoch": 0.7900552486187845, + "grad_norm": 0.6069821715354919, + "learning_rate": 9.937277711208311e-05, + "loss": 2.1437, + "step": 2574 + }, + { + "epoch": 0.7903621853898097, + "grad_norm": 0.6186996102333069, + "learning_rate": 9.937199203022039e-05, + "loss": 2.1541, + "step": 2575 + }, + { + "epoch": 0.7906691221608348, + "grad_norm": 0.6531949639320374, + "learning_rate": 9.937120646043471e-05, + "loss": 2.1928, + "step": 2576 + }, + { + "epoch": 0.7909760589318601, + "grad_norm": 0.5974560379981995, + "learning_rate": 9.937042040273383e-05, + "loss": 2.1814, + "step": 2577 + }, + { + "epoch": 0.7912829957028852, + "grad_norm": 0.59506756067276, + "learning_rate": 9.936963385712552e-05, + "loss": 2.2143, + "step": 2578 + }, + { + "epoch": 0.7915899324739104, + "grad_norm": 0.5878757834434509, + "learning_rate": 9.936884682361755e-05, + "loss": 2.0718, + "step": 2579 + }, + { + "epoch": 0.7918968692449355, + "grad_norm": 0.6318243145942688, + "learning_rate": 9.936805930221769e-05, + "loss": 2.1465, + "step": 2580 + }, + { + "epoch": 0.7922038060159607, + "grad_norm": 0.6474836468696594, + "learning_rate": 9.936727129293376e-05, + "loss": 2.0869, + "step": 2581 + }, + { + "epoch": 0.7925107427869859, + "grad_norm": 0.6589438915252686, + "learning_rate": 9.936648279577349e-05, + "loss": 2.1422, + "step": 2582 + }, + { + "epoch": 0.7928176795580111, + "grad_norm": 0.6935134530067444, + "learning_rate": 9.93656938107447e-05, + "loss": 2.1571, + "step": 2583 + }, + { + "epoch": 0.7931246163290362, + "grad_norm": 0.655430793762207, + "learning_rate": 9.936490433785522e-05, + "loss": 2.1044, + "step": 2584 + }, + { + "epoch": 0.7934315531000614, + "grad_norm": 0.6856111288070679, + "learning_rate": 
9.93641143771128e-05, + "loss": 2.0551, + "step": 2585 + }, + { + "epoch": 0.7937384898710865, + "grad_norm": 0.6783097386360168, + "learning_rate": 9.936332392852527e-05, + "loss": 2.1475, + "step": 2586 + }, + { + "epoch": 0.7940454266421118, + "grad_norm": 0.6746678948402405, + "learning_rate": 9.936253299210045e-05, + "loss": 2.1462, + "step": 2587 + }, + { + "epoch": 0.7943523634131369, + "grad_norm": 0.6854017972946167, + "learning_rate": 9.936174156784614e-05, + "loss": 2.1649, + "step": 2588 + }, + { + "epoch": 0.7946593001841621, + "grad_norm": 0.6740380525588989, + "learning_rate": 9.936094965577017e-05, + "loss": 2.06, + "step": 2589 + }, + { + "epoch": 0.7949662369551872, + "grad_norm": 0.6354179978370667, + "learning_rate": 9.936015725588037e-05, + "loss": 2.1938, + "step": 2590 + }, + { + "epoch": 0.7952731737262124, + "grad_norm": 0.6496716141700745, + "learning_rate": 9.935936436818453e-05, + "loss": 2.089, + "step": 2591 + }, + { + "epoch": 0.7955801104972375, + "grad_norm": 0.5996106266975403, + "learning_rate": 9.935857099269057e-05, + "loss": 2.2254, + "step": 2592 + }, + { + "epoch": 0.7958870472682628, + "grad_norm": 0.5630382895469666, + "learning_rate": 9.935777712940625e-05, + "loss": 2.069, + "step": 2593 + }, + { + "epoch": 0.7961939840392879, + "grad_norm": 0.5480468273162842, + "learning_rate": 9.935698277833946e-05, + "loss": 2.1288, + "step": 2594 + }, + { + "epoch": 0.7965009208103131, + "grad_norm": 0.5127096772193909, + "learning_rate": 9.935618793949803e-05, + "loss": 2.0753, + "step": 2595 + }, + { + "epoch": 0.7968078575813382, + "grad_norm": 0.6451439261436462, + "learning_rate": 9.935539261288983e-05, + "loss": 2.3005, + "step": 2596 + }, + { + "epoch": 0.7971147943523634, + "grad_norm": 0.7047737836837769, + "learning_rate": 9.935459679852271e-05, + "loss": 2.1307, + "step": 2597 + }, + { + "epoch": 0.7974217311233885, + "grad_norm": 0.6382983922958374, + "learning_rate": 9.935380049640454e-05, + "loss": 2.1136, + "step": 
2598 + }, + { + "epoch": 0.7977286678944138, + "grad_norm": 0.7337773442268372, + "learning_rate": 9.935300370654317e-05, + "loss": 2.0719, + "step": 2599 + }, + { + "epoch": 0.7980356046654389, + "grad_norm": 0.7481197118759155, + "learning_rate": 9.935220642894652e-05, + "loss": 2.2263, + "step": 2600 + }, + { + "epoch": 0.7983425414364641, + "grad_norm": 0.7383365631103516, + "learning_rate": 9.93514086636224e-05, + "loss": 2.2207, + "step": 2601 + }, + { + "epoch": 0.7986494782074892, + "grad_norm": 0.800762951374054, + "learning_rate": 9.935061041057876e-05, + "loss": 2.1848, + "step": 2602 + }, + { + "epoch": 0.7989564149785144, + "grad_norm": 0.6972829699516296, + "learning_rate": 9.934981166982346e-05, + "loss": 2.1301, + "step": 2603 + }, + { + "epoch": 0.7992633517495396, + "grad_norm": 0.5842304229736328, + "learning_rate": 9.93490124413644e-05, + "loss": 2.1311, + "step": 2604 + }, + { + "epoch": 0.7995702885205648, + "grad_norm": 0.6070491075515747, + "learning_rate": 9.934821272520946e-05, + "loss": 2.2226, + "step": 2605 + }, + { + "epoch": 0.7998772252915899, + "grad_norm": 0.6141406297683716, + "learning_rate": 9.934741252136656e-05, + "loss": 2.1425, + "step": 2606 + }, + { + "epoch": 0.8001841620626151, + "grad_norm": 0.5515148043632507, + "learning_rate": 9.934661182984363e-05, + "loss": 2.1138, + "step": 2607 + }, + { + "epoch": 0.8004910988336402, + "grad_norm": 0.5819688439369202, + "learning_rate": 9.934581065064854e-05, + "loss": 2.0835, + "step": 2608 + }, + { + "epoch": 0.8007980356046654, + "grad_norm": 0.593979001045227, + "learning_rate": 9.934500898378922e-05, + "loss": 2.2262, + "step": 2609 + }, + { + "epoch": 0.8011049723756906, + "grad_norm": 0.6978363990783691, + "learning_rate": 9.934420682927361e-05, + "loss": 2.1283, + "step": 2610 + }, + { + "epoch": 0.8014119091467158, + "grad_norm": 0.6205853223800659, + "learning_rate": 9.934340418710963e-05, + "loss": 2.1254, + "step": 2611 + }, + { + "epoch": 0.8017188459177409, + 
"grad_norm": 0.5547113418579102, + "learning_rate": 9.93426010573052e-05, + "loss": 2.0895, + "step": 2612 + }, + { + "epoch": 0.8020257826887661, + "grad_norm": 0.5652415156364441, + "learning_rate": 9.934179743986827e-05, + "loss": 2.1496, + "step": 2613 + }, + { + "epoch": 0.8023327194597912, + "grad_norm": 0.5833094120025635, + "learning_rate": 9.934099333480678e-05, + "loss": 2.1159, + "step": 2614 + }, + { + "epoch": 0.8026396562308165, + "grad_norm": 0.5929473638534546, + "learning_rate": 9.934018874212866e-05, + "loss": 2.1512, + "step": 2615 + }, + { + "epoch": 0.8029465930018416, + "grad_norm": 0.6359207630157471, + "learning_rate": 9.93393836618419e-05, + "loss": 2.1384, + "step": 2616 + }, + { + "epoch": 0.8032535297728668, + "grad_norm": 0.5934728384017944, + "learning_rate": 9.933857809395441e-05, + "loss": 2.1087, + "step": 2617 + }, + { + "epoch": 0.8035604665438919, + "grad_norm": 0.5685787796974182, + "learning_rate": 9.933777203847418e-05, + "loss": 2.1521, + "step": 2618 + }, + { + "epoch": 0.8038674033149171, + "grad_norm": 0.6276339292526245, + "learning_rate": 9.933696549540918e-05, + "loss": 2.1151, + "step": 2619 + }, + { + "epoch": 0.8041743400859422, + "grad_norm": 0.6206804513931274, + "learning_rate": 9.933615846476736e-05, + "loss": 2.1872, + "step": 2620 + }, + { + "epoch": 0.8044812768569675, + "grad_norm": 0.6645623445510864, + "learning_rate": 9.933535094655671e-05, + "loss": 2.217, + "step": 2621 + }, + { + "epoch": 0.8047882136279927, + "grad_norm": 0.6639950275421143, + "learning_rate": 9.93345429407852e-05, + "loss": 2.1479, + "step": 2622 + }, + { + "epoch": 0.8050951503990178, + "grad_norm": 0.6284301280975342, + "learning_rate": 9.933373444746081e-05, + "loss": 2.1763, + "step": 2623 + }, + { + "epoch": 0.805402087170043, + "grad_norm": 0.5974198579788208, + "learning_rate": 9.933292546659156e-05, + "loss": 2.1453, + "step": 2624 + }, + { + "epoch": 0.8057090239410681, + "grad_norm": 0.6465814113616943, + "learning_rate": 
9.933211599818541e-05, + "loss": 2.1999, + "step": 2625 + }, + { + "epoch": 0.8060159607120934, + "grad_norm": 0.6099503040313721, + "learning_rate": 9.933130604225038e-05, + "loss": 2.1523, + "step": 2626 + }, + { + "epoch": 0.8063228974831185, + "grad_norm": 0.5749596953392029, + "learning_rate": 9.933049559879448e-05, + "loss": 2.0802, + "step": 2627 + }, + { + "epoch": 0.8066298342541437, + "grad_norm": 0.5541282892227173, + "learning_rate": 9.93296846678257e-05, + "loss": 2.0851, + "step": 2628 + }, + { + "epoch": 0.8069367710251688, + "grad_norm": 0.5884469747543335, + "learning_rate": 9.932887324935207e-05, + "loss": 2.1824, + "step": 2629 + }, + { + "epoch": 0.807243707796194, + "grad_norm": 0.7330854535102844, + "learning_rate": 9.93280613433816e-05, + "loss": 2.1463, + "step": 2630 + }, + { + "epoch": 0.8075506445672191, + "grad_norm": 0.7012677192687988, + "learning_rate": 9.932724894992232e-05, + "loss": 2.0907, + "step": 2631 + }, + { + "epoch": 0.8078575813382444, + "grad_norm": 0.6487980484962463, + "learning_rate": 9.932643606898224e-05, + "loss": 2.2131, + "step": 2632 + }, + { + "epoch": 0.8081645181092695, + "grad_norm": 0.7956567406654358, + "learning_rate": 9.932562270056941e-05, + "loss": 2.2289, + "step": 2633 + }, + { + "epoch": 0.8084714548802947, + "grad_norm": 0.7904889583587646, + "learning_rate": 9.932480884469187e-05, + "loss": 2.195, + "step": 2634 + }, + { + "epoch": 0.8087783916513198, + "grad_norm": 0.8088505864143372, + "learning_rate": 9.932399450135766e-05, + "loss": 2.1199, + "step": 2635 + }, + { + "epoch": 0.809085328422345, + "grad_norm": 0.7557070851325989, + "learning_rate": 9.932317967057483e-05, + "loss": 2.177, + "step": 2636 + }, + { + "epoch": 0.8093922651933702, + "grad_norm": 0.8585113286972046, + "learning_rate": 9.932236435235143e-05, + "loss": 2.2215, + "step": 2637 + }, + { + "epoch": 0.8096992019643954, + "grad_norm": 0.9541242718696594, + "learning_rate": 9.932154854669551e-05, + "loss": 2.0971, + "step": 2638 
+ }, + { + "epoch": 0.8100061387354205, + "grad_norm": 0.9696017503738403, + "learning_rate": 9.932073225361513e-05, + "loss": 2.1723, + "step": 2639 + }, + { + "epoch": 0.8103130755064457, + "grad_norm": 0.9876028895378113, + "learning_rate": 9.931991547311839e-05, + "loss": 2.2266, + "step": 2640 + }, + { + "epoch": 0.8106200122774708, + "grad_norm": 0.9169884324073792, + "learning_rate": 9.931909820521332e-05, + "loss": 2.1453, + "step": 2641 + }, + { + "epoch": 0.810926949048496, + "grad_norm": 0.7645174860954285, + "learning_rate": 9.931828044990801e-05, + "loss": 2.1683, + "step": 2642 + }, + { + "epoch": 0.8112338858195212, + "grad_norm": 0.6733110547065735, + "learning_rate": 9.931746220721056e-05, + "loss": 2.0869, + "step": 2643 + }, + { + "epoch": 0.8115408225905464, + "grad_norm": 0.6033461689949036, + "learning_rate": 9.931664347712904e-05, + "loss": 2.1395, + "step": 2644 + }, + { + "epoch": 0.8118477593615715, + "grad_norm": 0.5953301191329956, + "learning_rate": 9.931582425967154e-05, + "loss": 2.0886, + "step": 2645 + }, + { + "epoch": 0.8121546961325967, + "grad_norm": 0.6587704420089722, + "learning_rate": 9.931500455484616e-05, + "loss": 2.1846, + "step": 2646 + }, + { + "epoch": 0.8124616329036218, + "grad_norm": 0.5837808847427368, + "learning_rate": 9.931418436266101e-05, + "loss": 2.0953, + "step": 2647 + }, + { + "epoch": 0.8127685696746471, + "grad_norm": 0.5593163967132568, + "learning_rate": 9.931336368312417e-05, + "loss": 2.1044, + "step": 2648 + }, + { + "epoch": 0.8130755064456722, + "grad_norm": 0.5758668780326843, + "learning_rate": 9.931254251624378e-05, + "loss": 2.1813, + "step": 2649 + }, + { + "epoch": 0.8133824432166974, + "grad_norm": 0.7128240466117859, + "learning_rate": 9.931172086202793e-05, + "loss": 2.1743, + "step": 2650 + }, + { + "epoch": 0.8136893799877225, + "grad_norm": 0.6214346885681152, + "learning_rate": 9.931089872048476e-05, + "loss": 2.0566, + "step": 2651 + }, + { + "epoch": 0.8139963167587477, + 
"grad_norm": 0.6279975771903992, + "learning_rate": 9.931007609162239e-05, + "loss": 2.1487, + "step": 2652 + }, + { + "epoch": 0.8143032535297728, + "grad_norm": 0.6137428879737854, + "learning_rate": 9.930925297544895e-05, + "loss": 2.1281, + "step": 2653 + }, + { + "epoch": 0.8146101903007981, + "grad_norm": 0.7433622479438782, + "learning_rate": 9.930842937197255e-05, + "loss": 2.2398, + "step": 2654 + }, + { + "epoch": 0.8149171270718232, + "grad_norm": 0.7490934729576111, + "learning_rate": 9.930760528120137e-05, + "loss": 2.0626, + "step": 2655 + }, + { + "epoch": 0.8152240638428484, + "grad_norm": 0.6829020380973816, + "learning_rate": 9.930678070314352e-05, + "loss": 2.0685, + "step": 2656 + }, + { + "epoch": 0.8155310006138735, + "grad_norm": 0.6328942775726318, + "learning_rate": 9.930595563780718e-05, + "loss": 2.1415, + "step": 2657 + }, + { + "epoch": 0.8158379373848987, + "grad_norm": 0.6919183135032654, + "learning_rate": 9.930513008520048e-05, + "loss": 2.1764, + "step": 2658 + }, + { + "epoch": 0.8161448741559238, + "grad_norm": 0.6600683331489563, + "learning_rate": 9.930430404533158e-05, + "loss": 2.2252, + "step": 2659 + }, + { + "epoch": 0.8164518109269491, + "grad_norm": 0.6614112257957458, + "learning_rate": 9.930347751820866e-05, + "loss": 2.0842, + "step": 2660 + }, + { + "epoch": 0.8167587476979742, + "grad_norm": 0.634395182132721, + "learning_rate": 9.930265050383987e-05, + "loss": 2.1784, + "step": 2661 + }, + { + "epoch": 0.8170656844689994, + "grad_norm": 0.6563819050788879, + "learning_rate": 9.930182300223338e-05, + "loss": 2.1845, + "step": 2662 + }, + { + "epoch": 0.8173726212400245, + "grad_norm": 0.7023175954818726, + "learning_rate": 9.93009950133974e-05, + "loss": 2.1913, + "step": 2663 + }, + { + "epoch": 0.8176795580110497, + "grad_norm": 0.6042037010192871, + "learning_rate": 9.930016653734007e-05, + "loss": 2.1624, + "step": 2664 + }, + { + "epoch": 0.8179864947820749, + "grad_norm": 0.5729875564575195, + "learning_rate": 
9.929933757406962e-05, + "loss": 2.0439, + "step": 2665 + }, + { + "epoch": 0.8182934315531001, + "grad_norm": 0.5399687886238098, + "learning_rate": 9.929850812359421e-05, + "loss": 2.1438, + "step": 2666 + }, + { + "epoch": 0.8186003683241252, + "grad_norm": 0.6325745582580566, + "learning_rate": 9.929767818592205e-05, + "loss": 2.1644, + "step": 2667 + }, + { + "epoch": 0.8189073050951504, + "grad_norm": 0.6303146481513977, + "learning_rate": 9.929684776106134e-05, + "loss": 2.1106, + "step": 2668 + }, + { + "epoch": 0.8192142418661755, + "grad_norm": 0.6482712030410767, + "learning_rate": 9.929601684902027e-05, + "loss": 2.0877, + "step": 2669 + }, + { + "epoch": 0.8195211786372008, + "grad_norm": 0.6858036518096924, + "learning_rate": 9.92951854498071e-05, + "loss": 2.1263, + "step": 2670 + }, + { + "epoch": 0.8198281154082259, + "grad_norm": 0.6214284896850586, + "learning_rate": 9.929435356343e-05, + "loss": 2.1516, + "step": 2671 + }, + { + "epoch": 0.8201350521792511, + "grad_norm": 0.5486865639686584, + "learning_rate": 9.92935211898972e-05, + "loss": 2.1199, + "step": 2672 + }, + { + "epoch": 0.8204419889502762, + "grad_norm": 0.62936931848526, + "learning_rate": 9.929268832921693e-05, + "loss": 2.1555, + "step": 2673 + }, + { + "epoch": 0.8207489257213014, + "grad_norm": 0.6402064561843872, + "learning_rate": 9.929185498139744e-05, + "loss": 2.1017, + "step": 2674 + }, + { + "epoch": 0.8210558624923265, + "grad_norm": 0.7254593372344971, + "learning_rate": 9.929102114644693e-05, + "loss": 2.1145, + "step": 2675 + }, + { + "epoch": 0.8213627992633518, + "grad_norm": 0.776472806930542, + "learning_rate": 9.929018682437366e-05, + "loss": 2.2582, + "step": 2676 + }, + { + "epoch": 0.8216697360343769, + "grad_norm": 0.7073757648468018, + "learning_rate": 9.928935201518587e-05, + "loss": 2.1135, + "step": 2677 + }, + { + "epoch": 0.8219766728054021, + "grad_norm": 0.7075079679489136, + "learning_rate": 9.928851671889184e-05, + "loss": 2.128, + "step": 2678 + 
}, + { + "epoch": 0.8222836095764272, + "grad_norm": 0.7937450408935547, + "learning_rate": 9.928768093549979e-05, + "loss": 2.1401, + "step": 2679 + }, + { + "epoch": 0.8225905463474524, + "grad_norm": 0.7523970603942871, + "learning_rate": 9.928684466501797e-05, + "loss": 2.2055, + "step": 2680 + }, + { + "epoch": 0.8228974831184775, + "grad_norm": 0.6644876599311829, + "learning_rate": 9.928600790745466e-05, + "loss": 2.1449, + "step": 2681 + }, + { + "epoch": 0.8232044198895028, + "grad_norm": 0.6054069399833679, + "learning_rate": 9.928517066281816e-05, + "loss": 2.1191, + "step": 2682 + }, + { + "epoch": 0.8235113566605279, + "grad_norm": 0.6610973477363586, + "learning_rate": 9.92843329311167e-05, + "loss": 2.2247, + "step": 2683 + }, + { + "epoch": 0.8238182934315531, + "grad_norm": 0.69968181848526, + "learning_rate": 9.928349471235858e-05, + "loss": 2.149, + "step": 2684 + }, + { + "epoch": 0.8241252302025782, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.928265600655206e-05, + "loss": 2.1906, + "step": 2685 + }, + { + "epoch": 0.8244321669736034, + "grad_norm": 0.6621972918510437, + "learning_rate": 9.928181681370547e-05, + "loss": 2.1259, + "step": 2686 + }, + { + "epoch": 0.8247391037446286, + "grad_norm": 0.6452053785324097, + "learning_rate": 9.928097713382708e-05, + "loss": 2.1301, + "step": 2687 + }, + { + "epoch": 0.8250460405156538, + "grad_norm": 0.6137326955795288, + "learning_rate": 9.928013696692519e-05, + "loss": 2.0942, + "step": 2688 + }, + { + "epoch": 0.8253529772866789, + "grad_norm": 0.6449215412139893, + "learning_rate": 9.92792963130081e-05, + "loss": 2.2135, + "step": 2689 + }, + { + "epoch": 0.8256599140577041, + "grad_norm": 0.5838732123374939, + "learning_rate": 9.927845517208411e-05, + "loss": 2.1161, + "step": 2690 + }, + { + "epoch": 0.8259668508287292, + "grad_norm": 0.6642805337905884, + "learning_rate": 9.927761354416157e-05, + "loss": 2.1228, + "step": 2691 + }, + { + "epoch": 0.8262737875997545, + "grad_norm": 
0.653274416923523, + "learning_rate": 9.927677142924874e-05, + "loss": 2.1777, + "step": 2692 + }, + { + "epoch": 0.8265807243707797, + "grad_norm": 0.6471827030181885, + "learning_rate": 9.927592882735398e-05, + "loss": 2.0756, + "step": 2693 + }, + { + "epoch": 0.8268876611418048, + "grad_norm": 0.6215457916259766, + "learning_rate": 9.927508573848562e-05, + "loss": 2.0691, + "step": 2694 + }, + { + "epoch": 0.82719459791283, + "grad_norm": 0.6343390345573425, + "learning_rate": 9.927424216265198e-05, + "loss": 2.2145, + "step": 2695 + }, + { + "epoch": 0.8275015346838551, + "grad_norm": 0.5296334624290466, + "learning_rate": 9.927339809986138e-05, + "loss": 2.0861, + "step": 2696 + }, + { + "epoch": 0.8278084714548803, + "grad_norm": 0.6457146406173706, + "learning_rate": 9.92725535501222e-05, + "loss": 2.1703, + "step": 2697 + }, + { + "epoch": 0.8281154082259055, + "grad_norm": 0.753579318523407, + "learning_rate": 9.927170851344276e-05, + "loss": 2.1628, + "step": 2698 + }, + { + "epoch": 0.8284223449969307, + "grad_norm": 0.7327163815498352, + "learning_rate": 9.927086298983141e-05, + "loss": 2.105, + "step": 2699 + }, + { + "epoch": 0.8287292817679558, + "grad_norm": 0.7786175608634949, + "learning_rate": 9.927001697929653e-05, + "loss": 2.084, + "step": 2700 + }, + { + "epoch": 0.829036218538981, + "grad_norm": 0.6370857357978821, + "learning_rate": 9.926917048184646e-05, + "loss": 2.0888, + "step": 2701 + }, + { + "epoch": 0.8293431553100061, + "grad_norm": 0.6600006818771362, + "learning_rate": 9.926832349748955e-05, + "loss": 2.148, + "step": 2702 + }, + { + "epoch": 0.8296500920810314, + "grad_norm": 0.6266845464706421, + "learning_rate": 9.926747602623422e-05, + "loss": 2.2182, + "step": 2703 + }, + { + "epoch": 0.8299570288520565, + "grad_norm": 0.588934600353241, + "learning_rate": 9.92666280680888e-05, + "loss": 2.1879, + "step": 2704 + }, + { + "epoch": 0.8302639656230817, + "grad_norm": 0.6467881202697754, + "learning_rate": 
9.926577962306168e-05, + "loss": 2.1082, + "step": 2705 + }, + { + "epoch": 0.8305709023941068, + "grad_norm": 0.6256638765335083, + "learning_rate": 9.926493069116127e-05, + "loss": 2.1007, + "step": 2706 + }, + { + "epoch": 0.830877839165132, + "grad_norm": 0.5710256099700928, + "learning_rate": 9.926408127239592e-05, + "loss": 2.0783, + "step": 2707 + }, + { + "epoch": 0.8311847759361571, + "grad_norm": 0.5836597681045532, + "learning_rate": 9.926323136677405e-05, + "loss": 2.1292, + "step": 2708 + }, + { + "epoch": 0.8314917127071824, + "grad_norm": 0.6420408487319946, + "learning_rate": 9.926238097430405e-05, + "loss": 2.1191, + "step": 2709 + }, + { + "epoch": 0.8317986494782075, + "grad_norm": 0.6192520260810852, + "learning_rate": 9.926153009499433e-05, + "loss": 2.1401, + "step": 2710 + }, + { + "epoch": 0.8321055862492327, + "grad_norm": 0.5986925959587097, + "learning_rate": 9.92606787288533e-05, + "loss": 2.0466, + "step": 2711 + }, + { + "epoch": 0.8324125230202578, + "grad_norm": 0.6386710405349731, + "learning_rate": 9.925982687588937e-05, + "loss": 2.1975, + "step": 2712 + }, + { + "epoch": 0.832719459791283, + "grad_norm": 0.6678250432014465, + "learning_rate": 9.925897453611095e-05, + "loss": 2.1744, + "step": 2713 + }, + { + "epoch": 0.8330263965623081, + "grad_norm": 0.628873348236084, + "learning_rate": 9.925812170952648e-05, + "loss": 2.0901, + "step": 2714 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6365368366241455, + "learning_rate": 9.925726839614438e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.8336402701043585, + "grad_norm": 0.6812825798988342, + "learning_rate": 9.925641459597309e-05, + "loss": 2.1163, + "step": 2716 + }, + { + "epoch": 0.8339472068753837, + "grad_norm": 0.6961301565170288, + "learning_rate": 9.925556030902103e-05, + "loss": 2.1634, + "step": 2717 + }, + { + "epoch": 0.8342541436464088, + "grad_norm": 0.687017023563385, + "learning_rate": 9.925470553529666e-05, + "loss": 2.1921, + "step": 
2718 + }, + { + "epoch": 0.834561080417434, + "grad_norm": 0.6528787612915039, + "learning_rate": 9.925385027480841e-05, + "loss": 2.1148, + "step": 2719 + }, + { + "epoch": 0.8348680171884592, + "grad_norm": 0.6092917323112488, + "learning_rate": 9.925299452756476e-05, + "loss": 2.0154, + "step": 2720 + }, + { + "epoch": 0.8351749539594844, + "grad_norm": 0.6537092328071594, + "learning_rate": 9.925213829357413e-05, + "loss": 2.1775, + "step": 2721 + }, + { + "epoch": 0.8354818907305095, + "grad_norm": 0.6560773849487305, + "learning_rate": 9.925128157284503e-05, + "loss": 2.1628, + "step": 2722 + }, + { + "epoch": 0.8357888275015347, + "grad_norm": 0.5976104140281677, + "learning_rate": 9.925042436538588e-05, + "loss": 2.1527, + "step": 2723 + }, + { + "epoch": 0.8360957642725598, + "grad_norm": 0.6577131152153015, + "learning_rate": 9.924956667120516e-05, + "loss": 2.1449, + "step": 2724 + }, + { + "epoch": 0.836402701043585, + "grad_norm": 0.6574232578277588, + "learning_rate": 9.924870849031136e-05, + "loss": 2.0517, + "step": 2725 + }, + { + "epoch": 0.8367096378146102, + "grad_norm": 0.5988326072692871, + "learning_rate": 9.924784982271297e-05, + "loss": 2.0975, + "step": 2726 + }, + { + "epoch": 0.8370165745856354, + "grad_norm": 0.5970706939697266, + "learning_rate": 9.924699066841845e-05, + "loss": 2.1754, + "step": 2727 + }, + { + "epoch": 0.8373235113566605, + "grad_norm": 0.6547200679779053, + "learning_rate": 9.924613102743632e-05, + "loss": 2.1651, + "step": 2728 + }, + { + "epoch": 0.8376304481276857, + "grad_norm": 0.643358588218689, + "learning_rate": 9.924527089977504e-05, + "loss": 2.1355, + "step": 2729 + }, + { + "epoch": 0.8379373848987108, + "grad_norm": 0.6696504950523376, + "learning_rate": 9.924441028544314e-05, + "loss": 2.1444, + "step": 2730 + }, + { + "epoch": 0.8382443216697361, + "grad_norm": 0.5923263430595398, + "learning_rate": 9.924354918444911e-05, + "loss": 2.1656, + "step": 2731 + }, + { + "epoch": 0.8385512584407612, + 
"grad_norm": 0.6507698893547058, + "learning_rate": 9.924268759680146e-05, + "loss": 2.1172, + "step": 2732 + }, + { + "epoch": 0.8388581952117864, + "grad_norm": 0.6240561008453369, + "learning_rate": 9.924182552250873e-05, + "loss": 2.113, + "step": 2733 + }, + { + "epoch": 0.8391651319828115, + "grad_norm": 0.7350605726242065, + "learning_rate": 9.92409629615794e-05, + "loss": 2.2099, + "step": 2734 + }, + { + "epoch": 0.8394720687538367, + "grad_norm": 0.679027795791626, + "learning_rate": 9.924009991402202e-05, + "loss": 2.1202, + "step": 2735 + }, + { + "epoch": 0.8397790055248618, + "grad_norm": 0.7187801003456116, + "learning_rate": 9.923923637984512e-05, + "loss": 2.1994, + "step": 2736 + }, + { + "epoch": 0.8400859422958871, + "grad_norm": 0.7437569499015808, + "learning_rate": 9.92383723590572e-05, + "loss": 2.1778, + "step": 2737 + }, + { + "epoch": 0.8403928790669122, + "grad_norm": 0.7004902958869934, + "learning_rate": 9.923750785166686e-05, + "loss": 2.1478, + "step": 2738 + }, + { + "epoch": 0.8406998158379374, + "grad_norm": 0.632478654384613, + "learning_rate": 9.923664285768258e-05, + "loss": 2.1785, + "step": 2739 + }, + { + "epoch": 0.8410067526089625, + "grad_norm": 0.6399826407432556, + "learning_rate": 9.923577737711295e-05, + "loss": 2.1708, + "step": 2740 + }, + { + "epoch": 0.8413136893799877, + "grad_norm": 0.649340033531189, + "learning_rate": 9.92349114099665e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.8416206261510129, + "grad_norm": 0.6143749952316284, + "learning_rate": 9.923404495625182e-05, + "loss": 2.0696, + "step": 2742 + }, + { + "epoch": 0.8419275629220381, + "grad_norm": 0.655846357345581, + "learning_rate": 9.923317801597742e-05, + "loss": 2.1163, + "step": 2743 + }, + { + "epoch": 0.8422344996930632, + "grad_norm": 0.588096022605896, + "learning_rate": 9.923231058915192e-05, + "loss": 2.0893, + "step": 2744 + }, + { + "epoch": 0.8425414364640884, + "grad_norm": 0.5445908904075623, + "learning_rate": 
9.923144267578386e-05, + "loss": 2.1223, + "step": 2745 + }, + { + "epoch": 0.8428483732351135, + "grad_norm": 0.5372910499572754, + "learning_rate": 9.923057427588182e-05, + "loss": 2.1386, + "step": 2746 + }, + { + "epoch": 0.8431553100061387, + "grad_norm": 0.5118899345397949, + "learning_rate": 9.922970538945442e-05, + "loss": 2.0532, + "step": 2747 + }, + { + "epoch": 0.8434622467771639, + "grad_norm": 0.5252440571784973, + "learning_rate": 9.922883601651019e-05, + "loss": 2.1679, + "step": 2748 + }, + { + "epoch": 0.8437691835481891, + "grad_norm": 0.5978875160217285, + "learning_rate": 9.922796615705776e-05, + "loss": 2.2054, + "step": 2749 + }, + { + "epoch": 0.8440761203192142, + "grad_norm": 0.5642610788345337, + "learning_rate": 9.922709581110572e-05, + "loss": 2.1886, + "step": 2750 + }, + { + "epoch": 0.8443830570902394, + "grad_norm": 0.6332407593727112, + "learning_rate": 9.922622497866265e-05, + "loss": 2.1618, + "step": 2751 + }, + { + "epoch": 0.8446899938612645, + "grad_norm": 0.6971728801727295, + "learning_rate": 9.922535365973718e-05, + "loss": 2.1011, + "step": 2752 + }, + { + "epoch": 0.8449969306322898, + "grad_norm": 0.6917250156402588, + "learning_rate": 9.922448185433792e-05, + "loss": 2.1408, + "step": 2753 + }, + { + "epoch": 0.8453038674033149, + "grad_norm": 0.748960554599762, + "learning_rate": 9.922360956247348e-05, + "loss": 2.1612, + "step": 2754 + }, + { + "epoch": 0.8456108041743401, + "grad_norm": 0.6739722490310669, + "learning_rate": 9.922273678415245e-05, + "loss": 2.1234, + "step": 2755 + }, + { + "epoch": 0.8459177409453652, + "grad_norm": 0.6310722827911377, + "learning_rate": 9.922186351938351e-05, + "loss": 2.1476, + "step": 2756 + }, + { + "epoch": 0.8462246777163904, + "grad_norm": 0.5992079973220825, + "learning_rate": 9.922098976817527e-05, + "loss": 2.1009, + "step": 2757 + }, + { + "epoch": 0.8465316144874155, + "grad_norm": 0.5697188973426819, + "learning_rate": 9.922011553053637e-05, + "loss": 2.1277, + "step": 
2758 + }, + { + "epoch": 0.8468385512584408, + "grad_norm": 0.7005256414413452, + "learning_rate": 9.921924080647541e-05, + "loss": 2.1592, + "step": 2759 + }, + { + "epoch": 0.8471454880294659, + "grad_norm": 0.7664382457733154, + "learning_rate": 9.921836559600109e-05, + "loss": 2.2328, + "step": 2760 + }, + { + "epoch": 0.8474524248004911, + "grad_norm": 0.8668230772018433, + "learning_rate": 9.921748989912201e-05, + "loss": 2.2285, + "step": 2761 + }, + { + "epoch": 0.8477593615715162, + "grad_norm": 0.9423169493675232, + "learning_rate": 9.921661371584685e-05, + "loss": 2.1172, + "step": 2762 + }, + { + "epoch": 0.8480662983425414, + "grad_norm": 0.8547552824020386, + "learning_rate": 9.921573704618428e-05, + "loss": 2.1426, + "step": 2763 + }, + { + "epoch": 0.8483732351135667, + "grad_norm": 0.7568690776824951, + "learning_rate": 9.921485989014294e-05, + "loss": 2.0861, + "step": 2764 + }, + { + "epoch": 0.8486801718845918, + "grad_norm": 0.6535828709602356, + "learning_rate": 9.92139822477315e-05, + "loss": 2.1705, + "step": 2765 + }, + { + "epoch": 0.848987108655617, + "grad_norm": 0.6099218130111694, + "learning_rate": 9.921310411895867e-05, + "loss": 2.1666, + "step": 2766 + }, + { + "epoch": 0.8492940454266421, + "grad_norm": 0.6315065026283264, + "learning_rate": 9.92122255038331e-05, + "loss": 2.1868, + "step": 2767 + }, + { + "epoch": 0.8496009821976673, + "grad_norm": 0.6861329078674316, + "learning_rate": 9.921134640236344e-05, + "loss": 2.1056, + "step": 2768 + }, + { + "epoch": 0.8499079189686924, + "grad_norm": 0.6357519626617432, + "learning_rate": 9.921046681455844e-05, + "loss": 2.1272, + "step": 2769 + }, + { + "epoch": 0.8502148557397177, + "grad_norm": 0.6245810389518738, + "learning_rate": 9.920958674042676e-05, + "loss": 2.1313, + "step": 2770 + }, + { + "epoch": 0.8505217925107428, + "grad_norm": 0.6087192296981812, + "learning_rate": 9.920870617997709e-05, + "loss": 2.123, + "step": 2771 + }, + { + "epoch": 0.850828729281768, + 
"grad_norm": 0.6384228467941284, + "learning_rate": 9.920782513321814e-05, + "loss": 2.1343, + "step": 2772 + }, + { + "epoch": 0.8511356660527931, + "grad_norm": 0.6143882274627686, + "learning_rate": 9.920694360015863e-05, + "loss": 2.0706, + "step": 2773 + }, + { + "epoch": 0.8514426028238183, + "grad_norm": 0.5561975240707397, + "learning_rate": 9.920606158080725e-05, + "loss": 2.1015, + "step": 2774 + }, + { + "epoch": 0.8517495395948435, + "grad_norm": 0.5434146523475647, + "learning_rate": 9.920517907517275e-05, + "loss": 2.1306, + "step": 2775 + }, + { + "epoch": 0.8520564763658687, + "grad_norm": 0.6028591990470886, + "learning_rate": 9.920429608326382e-05, + "loss": 2.1665, + "step": 2776 + }, + { + "epoch": 0.8523634131368938, + "grad_norm": 0.6491599082946777, + "learning_rate": 9.920341260508918e-05, + "loss": 2.0715, + "step": 2777 + }, + { + "epoch": 0.852670349907919, + "grad_norm": 0.6350167989730835, + "learning_rate": 9.92025286406576e-05, + "loss": 2.1492, + "step": 2778 + }, + { + "epoch": 0.8529772866789441, + "grad_norm": 0.5726897120475769, + "learning_rate": 9.92016441899778e-05, + "loss": 2.1128, + "step": 2779 + }, + { + "epoch": 0.8532842234499693, + "grad_norm": 0.5680630207061768, + "learning_rate": 9.92007592530585e-05, + "loss": 2.0718, + "step": 2780 + }, + { + "epoch": 0.8535911602209945, + "grad_norm": 0.5901346802711487, + "learning_rate": 9.919987382990845e-05, + "loss": 2.0577, + "step": 2781 + }, + { + "epoch": 0.8538980969920197, + "grad_norm": 0.5756994485855103, + "learning_rate": 9.919898792053643e-05, + "loss": 2.106, + "step": 2782 + }, + { + "epoch": 0.8542050337630448, + "grad_norm": 0.5831238031387329, + "learning_rate": 9.919810152495116e-05, + "loss": 2.0507, + "step": 2783 + }, + { + "epoch": 0.85451197053407, + "grad_norm": 0.529931902885437, + "learning_rate": 9.919721464316143e-05, + "loss": 2.0934, + "step": 2784 + }, + { + "epoch": 0.8548189073050951, + "grad_norm": 0.603672981262207, + "learning_rate": 
9.919632727517597e-05, + "loss": 2.164, + "step": 2785 + }, + { + "epoch": 0.8551258440761204, + "grad_norm": 0.5741528868675232, + "learning_rate": 9.919543942100357e-05, + "loss": 2.0948, + "step": 2786 + }, + { + "epoch": 0.8554327808471455, + "grad_norm": 0.5689142942428589, + "learning_rate": 9.919455108065303e-05, + "loss": 2.1572, + "step": 2787 + }, + { + "epoch": 0.8557397176181707, + "grad_norm": 0.5767523646354675, + "learning_rate": 9.919366225413308e-05, + "loss": 2.0528, + "step": 2788 + }, + { + "epoch": 0.8560466543891958, + "grad_norm": 0.6004374623298645, + "learning_rate": 9.919277294145252e-05, + "loss": 2.1078, + "step": 2789 + }, + { + "epoch": 0.856353591160221, + "grad_norm": 0.6199560761451721, + "learning_rate": 9.919188314262017e-05, + "loss": 2.034, + "step": 2790 + }, + { + "epoch": 0.8566605279312461, + "grad_norm": 0.5928464531898499, + "learning_rate": 9.919099285764478e-05, + "loss": 2.1226, + "step": 2791 + }, + { + "epoch": 0.8569674647022714, + "grad_norm": 0.5620111227035522, + "learning_rate": 9.919010208653517e-05, + "loss": 2.1387, + "step": 2792 + }, + { + "epoch": 0.8572744014732965, + "grad_norm": 0.6035314798355103, + "learning_rate": 9.918921082930015e-05, + "loss": 2.0888, + "step": 2793 + }, + { + "epoch": 0.8575813382443217, + "grad_norm": 0.6842171549797058, + "learning_rate": 9.91883190859485e-05, + "loss": 2.15, + "step": 2794 + }, + { + "epoch": 0.8578882750153468, + "grad_norm": 0.7600229978561401, + "learning_rate": 9.918742685648906e-05, + "loss": 2.1776, + "step": 2795 + }, + { + "epoch": 0.858195211786372, + "grad_norm": 0.641504168510437, + "learning_rate": 9.918653414093065e-05, + "loss": 2.086, + "step": 2796 + }, + { + "epoch": 0.8585021485573971, + "grad_norm": 0.6062462329864502, + "learning_rate": 9.918564093928207e-05, + "loss": 2.0772, + "step": 2797 + }, + { + "epoch": 0.8588090853284224, + "grad_norm": 0.5259165167808533, + "learning_rate": 9.918474725155214e-05, + "loss": 2.1034, + "step": 2798 + 
}, + { + "epoch": 0.8591160220994475, + "grad_norm": 0.532511830329895, + "learning_rate": 9.918385307774973e-05, + "loss": 2.103, + "step": 2799 + }, + { + "epoch": 0.8594229588704727, + "grad_norm": 0.5996485352516174, + "learning_rate": 9.918295841788366e-05, + "loss": 2.1698, + "step": 2800 + }, + { + "epoch": 0.8597298956414978, + "grad_norm": 0.5895976424217224, + "learning_rate": 9.918206327196276e-05, + "loss": 2.132, + "step": 2801 + }, + { + "epoch": 0.860036832412523, + "grad_norm": 0.6363179087638855, + "learning_rate": 9.918116763999588e-05, + "loss": 2.0967, + "step": 2802 + }, + { + "epoch": 0.8603437691835482, + "grad_norm": 0.6594113707542419, + "learning_rate": 9.918027152199187e-05, + "loss": 2.1266, + "step": 2803 + }, + { + "epoch": 0.8606507059545734, + "grad_norm": 0.694879412651062, + "learning_rate": 9.917937491795961e-05, + "loss": 2.0694, + "step": 2804 + }, + { + "epoch": 0.8609576427255985, + "grad_norm": 0.6310710906982422, + "learning_rate": 9.917847782790793e-05, + "loss": 2.1546, + "step": 2805 + }, + { + "epoch": 0.8612645794966237, + "grad_norm": 0.6166081428527832, + "learning_rate": 9.917758025184572e-05, + "loss": 2.131, + "step": 2806 + }, + { + "epoch": 0.8615715162676488, + "grad_norm": 0.5857066512107849, + "learning_rate": 9.917668218978182e-05, + "loss": 2.1529, + "step": 2807 + }, + { + "epoch": 0.861878453038674, + "grad_norm": 0.6374151706695557, + "learning_rate": 9.917578364172513e-05, + "loss": 2.151, + "step": 2808 + }, + { + "epoch": 0.8621853898096992, + "grad_norm": 0.6760959625244141, + "learning_rate": 9.917488460768453e-05, + "loss": 2.1955, + "step": 2809 + }, + { + "epoch": 0.8624923265807244, + "grad_norm": 0.6308501362800598, + "learning_rate": 9.917398508766889e-05, + "loss": 2.1449, + "step": 2810 + }, + { + "epoch": 0.8627992633517495, + "grad_norm": 0.615181028842926, + "learning_rate": 9.91730850816871e-05, + "loss": 2.0326, + "step": 2811 + }, + { + "epoch": 0.8631062001227747, + "grad_norm": 
0.6746891736984253, + "learning_rate": 9.917218458974809e-05, + "loss": 2.1472, + "step": 2812 + }, + { + "epoch": 0.8634131368937998, + "grad_norm": 0.6594959497451782, + "learning_rate": 9.91712836118607e-05, + "loss": 2.0879, + "step": 2813 + }, + { + "epoch": 0.8637200736648251, + "grad_norm": 0.6843087077140808, + "learning_rate": 9.91703821480339e-05, + "loss": 2.13, + "step": 2814 + }, + { + "epoch": 0.8640270104358502, + "grad_norm": 0.7513928413391113, + "learning_rate": 9.916948019827653e-05, + "loss": 2.1866, + "step": 2815 + }, + { + "epoch": 0.8643339472068754, + "grad_norm": 0.7352319955825806, + "learning_rate": 9.916857776259755e-05, + "loss": 2.0844, + "step": 2816 + }, + { + "epoch": 0.8646408839779005, + "grad_norm": 0.6901769638061523, + "learning_rate": 9.916767484100587e-05, + "loss": 2.086, + "step": 2817 + }, + { + "epoch": 0.8649478207489257, + "grad_norm": 0.621734619140625, + "learning_rate": 9.91667714335104e-05, + "loss": 2.0764, + "step": 2818 + }, + { + "epoch": 0.8652547575199508, + "grad_norm": 0.5779813528060913, + "learning_rate": 9.916586754012008e-05, + "loss": 2.0568, + "step": 2819 + }, + { + "epoch": 0.8655616942909761, + "grad_norm": 0.566251814365387, + "learning_rate": 9.916496316084385e-05, + "loss": 2.1624, + "step": 2820 + }, + { + "epoch": 0.8658686310620012, + "grad_norm": 0.6039763689041138, + "learning_rate": 9.916405829569062e-05, + "loss": 2.0412, + "step": 2821 + }, + { + "epoch": 0.8661755678330264, + "grad_norm": 0.587469220161438, + "learning_rate": 9.916315294466935e-05, + "loss": 2.1513, + "step": 2822 + }, + { + "epoch": 0.8664825046040515, + "grad_norm": 0.5792883634567261, + "learning_rate": 9.916224710778901e-05, + "loss": 2.055, + "step": 2823 + }, + { + "epoch": 0.8667894413750767, + "grad_norm": 0.5533844232559204, + "learning_rate": 9.916134078505852e-05, + "loss": 2.1237, + "step": 2824 + }, + { + "epoch": 0.8670963781461019, + "grad_norm": 0.6140845417976379, + "learning_rate": 
9.916043397648685e-05, + "loss": 2.1481, + "step": 2825 + }, + { + "epoch": 0.8674033149171271, + "grad_norm": 0.6092365384101868, + "learning_rate": 9.915952668208295e-05, + "loss": 2.1567, + "step": 2826 + }, + { + "epoch": 0.8677102516881522, + "grad_norm": 0.5712884068489075, + "learning_rate": 9.915861890185578e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.8680171884591774, + "grad_norm": 0.5314213633537292, + "learning_rate": 9.915771063581434e-05, + "loss": 2.0408, + "step": 2828 + }, + { + "epoch": 0.8683241252302025, + "grad_norm": 0.5258345007896423, + "learning_rate": 9.915680188396759e-05, + "loss": 2.0968, + "step": 2829 + }, + { + "epoch": 0.8686310620012277, + "grad_norm": 0.6071497797966003, + "learning_rate": 9.915589264632453e-05, + "loss": 2.0924, + "step": 2830 + }, + { + "epoch": 0.8689379987722529, + "grad_norm": 0.6742420792579651, + "learning_rate": 9.915498292289408e-05, + "loss": 2.1276, + "step": 2831 + }, + { + "epoch": 0.8692449355432781, + "grad_norm": 0.7642729878425598, + "learning_rate": 9.915407271368533e-05, + "loss": 2.204, + "step": 2832 + }, + { + "epoch": 0.8695518723143032, + "grad_norm": 0.8024489283561707, + "learning_rate": 9.915316201870718e-05, + "loss": 2.163, + "step": 2833 + }, + { + "epoch": 0.8698588090853284, + "grad_norm": 0.8268367648124695, + "learning_rate": 9.915225083796871e-05, + "loss": 2.117, + "step": 2834 + }, + { + "epoch": 0.8701657458563536, + "grad_norm": 0.7761407494544983, + "learning_rate": 9.915133917147888e-05, + "loss": 2.0727, + "step": 2835 + }, + { + "epoch": 0.8704726826273788, + "grad_norm": 0.7515753507614136, + "learning_rate": 9.91504270192467e-05, + "loss": 2.075, + "step": 2836 + }, + { + "epoch": 0.870779619398404, + "grad_norm": 0.6203973889350891, + "learning_rate": 9.914951438128119e-05, + "loss": 2.1163, + "step": 2837 + }, + { + "epoch": 0.8710865561694291, + "grad_norm": 0.6056976318359375, + "learning_rate": 9.914860125759138e-05, + "loss": 2.1515, + "step": 2838 
+ }, + { + "epoch": 0.8713934929404543, + "grad_norm": 0.6472234725952148, + "learning_rate": 9.914768764818627e-05, + "loss": 2.1618, + "step": 2839 + }, + { + "epoch": 0.8717004297114794, + "grad_norm": 0.5981749892234802, + "learning_rate": 9.914677355307491e-05, + "loss": 2.0763, + "step": 2840 + }, + { + "epoch": 0.8720073664825047, + "grad_norm": 0.5721938014030457, + "learning_rate": 9.914585897226634e-05, + "loss": 2.0916, + "step": 2841 + }, + { + "epoch": 0.8723143032535298, + "grad_norm": 0.6079535484313965, + "learning_rate": 9.914494390576958e-05, + "loss": 2.0767, + "step": 2842 + }, + { + "epoch": 0.872621240024555, + "grad_norm": 0.6684066653251648, + "learning_rate": 9.914402835359368e-05, + "loss": 2.2712, + "step": 2843 + }, + { + "epoch": 0.8729281767955801, + "grad_norm": 0.6992711424827576, + "learning_rate": 9.91431123157477e-05, + "loss": 2.0813, + "step": 2844 + }, + { + "epoch": 0.8732351135666053, + "grad_norm": 0.6585392951965332, + "learning_rate": 9.914219579224065e-05, + "loss": 2.1303, + "step": 2845 + }, + { + "epoch": 0.8735420503376304, + "grad_norm": 0.7267395257949829, + "learning_rate": 9.914127878308164e-05, + "loss": 2.2253, + "step": 2846 + }, + { + "epoch": 0.8738489871086557, + "grad_norm": 0.6764006018638611, + "learning_rate": 9.91403612882797e-05, + "loss": 2.0886, + "step": 2847 + }, + { + "epoch": 0.8741559238796808, + "grad_norm": 0.612808108329773, + "learning_rate": 9.91394433078439e-05, + "loss": 2.0469, + "step": 2848 + }, + { + "epoch": 0.874462860650706, + "grad_norm": 0.5598782896995544, + "learning_rate": 9.913852484178334e-05, + "loss": 2.1745, + "step": 2849 + }, + { + "epoch": 0.8747697974217311, + "grad_norm": 0.6498168706893921, + "learning_rate": 9.913760589010707e-05, + "loss": 2.2657, + "step": 2850 + }, + { + "epoch": 0.8750767341927563, + "grad_norm": 0.6796014904975891, + "learning_rate": 9.913668645282418e-05, + "loss": 2.1056, + "step": 2851 + }, + { + "epoch": 0.8753836709637814, + "grad_norm": 
0.7409440279006958, + "learning_rate": 9.913576652994376e-05, + "loss": 2.1533, + "step": 2852 + }, + { + "epoch": 0.8756906077348067, + "grad_norm": 0.7044464945793152, + "learning_rate": 9.913484612147488e-05, + "loss": 2.2088, + "step": 2853 + }, + { + "epoch": 0.8759975445058318, + "grad_norm": 0.6333544254302979, + "learning_rate": 9.913392522742666e-05, + "loss": 2.132, + "step": 2854 + }, + { + "epoch": 0.876304481276857, + "grad_norm": 0.603382408618927, + "learning_rate": 9.91330038478082e-05, + "loss": 2.0657, + "step": 2855 + }, + { + "epoch": 0.8766114180478821, + "grad_norm": 0.5919856429100037, + "learning_rate": 9.913208198262858e-05, + "loss": 2.0854, + "step": 2856 + }, + { + "epoch": 0.8769183548189073, + "grad_norm": 0.6033365726470947, + "learning_rate": 9.913115963189694e-05, + "loss": 2.0825, + "step": 2857 + }, + { + "epoch": 0.8772252915899325, + "grad_norm": 0.5917964577674866, + "learning_rate": 9.913023679562238e-05, + "loss": 2.1608, + "step": 2858 + }, + { + "epoch": 0.8775322283609577, + "grad_norm": 0.5953360795974731, + "learning_rate": 9.912931347381402e-05, + "loss": 2.1454, + "step": 2859 + }, + { + "epoch": 0.8778391651319828, + "grad_norm": 0.5949352979660034, + "learning_rate": 9.9128389666481e-05, + "loss": 2.1575, + "step": 2860 + }, + { + "epoch": 0.878146101903008, + "grad_norm": 0.5468181371688843, + "learning_rate": 9.912746537363243e-05, + "loss": 2.151, + "step": 2861 + }, + { + "epoch": 0.8784530386740331, + "grad_norm": 0.5476632714271545, + "learning_rate": 9.912654059527746e-05, + "loss": 2.1015, + "step": 2862 + }, + { + "epoch": 0.8787599754450584, + "grad_norm": 0.6881390810012817, + "learning_rate": 9.912561533142521e-05, + "loss": 2.2002, + "step": 2863 + }, + { + "epoch": 0.8790669122160835, + "grad_norm": 0.6663404703140259, + "learning_rate": 9.912468958208486e-05, + "loss": 2.0691, + "step": 2864 + }, + { + "epoch": 0.8793738489871087, + "grad_norm": 0.5739100575447083, + "learning_rate": 
9.91237633472655e-05, + "loss": 2.0852, + "step": 2865 + }, + { + "epoch": 0.8796807857581338, + "grad_norm": 0.5227558016777039, + "learning_rate": 9.912283662697635e-05, + "loss": 2.1144, + "step": 2866 + }, + { + "epoch": 0.879987722529159, + "grad_norm": 0.5626821517944336, + "learning_rate": 9.912190942122652e-05, + "loss": 2.0796, + "step": 2867 + }, + { + "epoch": 0.8802946593001841, + "grad_norm": 0.5367855429649353, + "learning_rate": 9.912098173002518e-05, + "loss": 2.0768, + "step": 2868 + }, + { + "epoch": 0.8806015960712094, + "grad_norm": 0.5285482406616211, + "learning_rate": 9.912005355338152e-05, + "loss": 2.0832, + "step": 2869 + }, + { + "epoch": 0.8809085328422345, + "grad_norm": 0.5384502410888672, + "learning_rate": 9.91191248913047e-05, + "loss": 2.0187, + "step": 2870 + }, + { + "epoch": 0.8812154696132597, + "grad_norm": 0.5099567770957947, + "learning_rate": 9.91181957438039e-05, + "loss": 2.0865, + "step": 2871 + }, + { + "epoch": 0.8815224063842848, + "grad_norm": 0.5513966679573059, + "learning_rate": 9.911726611088831e-05, + "loss": 2.1097, + "step": 2872 + }, + { + "epoch": 0.88182934315531, + "grad_norm": 0.5411790609359741, + "learning_rate": 9.911633599256709e-05, + "loss": 2.0964, + "step": 2873 + }, + { + "epoch": 0.8821362799263351, + "grad_norm": 0.6151100397109985, + "learning_rate": 9.911540538884947e-05, + "loss": 2.1006, + "step": 2874 + }, + { + "epoch": 0.8824432166973604, + "grad_norm": 0.754391610622406, + "learning_rate": 9.911447429974461e-05, + "loss": 2.1493, + "step": 2875 + }, + { + "epoch": 0.8827501534683855, + "grad_norm": 0.7485715746879578, + "learning_rate": 9.911354272526172e-05, + "loss": 2.1136, + "step": 2876 + }, + { + "epoch": 0.8830570902394107, + "grad_norm": 0.6808591485023499, + "learning_rate": 9.911261066541003e-05, + "loss": 2.1238, + "step": 2877 + }, + { + "epoch": 0.8833640270104358, + "grad_norm": 0.5771127343177795, + "learning_rate": 9.911167812019874e-05, + "loss": 2.0846, + "step": 2878 
+ }, + { + "epoch": 0.883670963781461, + "grad_norm": 0.5991767048835754, + "learning_rate": 9.911074508963705e-05, + "loss": 2.1486, + "step": 2879 + }, + { + "epoch": 0.8839779005524862, + "grad_norm": 0.6899440884590149, + "learning_rate": 9.91098115737342e-05, + "loss": 2.1357, + "step": 2880 + }, + { + "epoch": 0.8842848373235114, + "grad_norm": 0.7102574110031128, + "learning_rate": 9.91088775724994e-05, + "loss": 2.1269, + "step": 2881 + }, + { + "epoch": 0.8845917740945365, + "grad_norm": 0.7238754034042358, + "learning_rate": 9.910794308594189e-05, + "loss": 2.0829, + "step": 2882 + }, + { + "epoch": 0.8848987108655617, + "grad_norm": 0.7232441902160645, + "learning_rate": 9.91070081140709e-05, + "loss": 2.1704, + "step": 2883 + }, + { + "epoch": 0.8852056476365868, + "grad_norm": 0.7136173844337463, + "learning_rate": 9.910607265689569e-05, + "loss": 2.1553, + "step": 2884 + }, + { + "epoch": 0.885512584407612, + "grad_norm": 0.6566216945648193, + "learning_rate": 9.910513671442547e-05, + "loss": 2.0856, + "step": 2885 + }, + { + "epoch": 0.8858195211786372, + "grad_norm": 0.5712916851043701, + "learning_rate": 9.910420028666951e-05, + "loss": 2.1399, + "step": 2886 + }, + { + "epoch": 0.8861264579496624, + "grad_norm": 0.727664589881897, + "learning_rate": 9.910326337363707e-05, + "loss": 2.088, + "step": 2887 + }, + { + "epoch": 0.8864333947206875, + "grad_norm": 0.799963653087616, + "learning_rate": 9.91023259753374e-05, + "loss": 2.0984, + "step": 2888 + }, + { + "epoch": 0.8867403314917127, + "grad_norm": 0.9462977051734924, + "learning_rate": 9.910138809177975e-05, + "loss": 2.1262, + "step": 2889 + }, + { + "epoch": 0.8870472682627378, + "grad_norm": 0.9130533933639526, + "learning_rate": 9.910044972297343e-05, + "loss": 2.1967, + "step": 2890 + }, + { + "epoch": 0.887354205033763, + "grad_norm": 0.6971304416656494, + "learning_rate": 9.909951086892767e-05, + "loss": 2.0797, + "step": 2891 + }, + { + "epoch": 0.8876611418047882, + "grad_norm": 
0.5822353363037109, + "learning_rate": 9.909857152965176e-05, + "loss": 2.1152, + "step": 2892 + }, + { + "epoch": 0.8879680785758134, + "grad_norm": 0.5885453820228577, + "learning_rate": 9.9097631705155e-05, + "loss": 2.0323, + "step": 2893 + }, + { + "epoch": 0.8882750153468385, + "grad_norm": 0.6249284744262695, + "learning_rate": 9.909669139544666e-05, + "loss": 2.1076, + "step": 2894 + }, + { + "epoch": 0.8885819521178637, + "grad_norm": 0.6117702722549438, + "learning_rate": 9.909575060053604e-05, + "loss": 2.0608, + "step": 2895 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.560357928276062, + "learning_rate": 9.909480932043245e-05, + "loss": 2.145, + "step": 2896 + }, + { + "epoch": 0.8891958256599141, + "grad_norm": 0.5442607998847961, + "learning_rate": 9.909386755514516e-05, + "loss": 2.1091, + "step": 2897 + }, + { + "epoch": 0.8895027624309392, + "grad_norm": 0.5653077363967896, + "learning_rate": 9.909292530468351e-05, + "loss": 2.1097, + "step": 2898 + }, + { + "epoch": 0.8898096992019644, + "grad_norm": 0.531939685344696, + "learning_rate": 9.909198256905679e-05, + "loss": 2.0866, + "step": 2899 + }, + { + "epoch": 0.8901166359729895, + "grad_norm": 0.6238400340080261, + "learning_rate": 9.909103934827433e-05, + "loss": 2.1421, + "step": 2900 + }, + { + "epoch": 0.8904235727440147, + "grad_norm": 0.5685901045799255, + "learning_rate": 9.909009564234543e-05, + "loss": 2.0019, + "step": 2901 + }, + { + "epoch": 0.8907305095150398, + "grad_norm": 0.5979083180427551, + "learning_rate": 9.908915145127945e-05, + "loss": 2.0891, + "step": 2902 + }, + { + "epoch": 0.8910374462860651, + "grad_norm": 0.5847237706184387, + "learning_rate": 9.90882067750857e-05, + "loss": 2.1165, + "step": 2903 + }, + { + "epoch": 0.8913443830570903, + "grad_norm": 0.6281530261039734, + "learning_rate": 9.908726161377351e-05, + "loss": 2.1396, + "step": 2904 + }, + { + "epoch": 0.8916513198281154, + "grad_norm": 0.5685252547264099, + "learning_rate": 
9.908631596735225e-05, + "loss": 2.0781, + "step": 2905 + }, + { + "epoch": 0.8919582565991406, + "grad_norm": 0.5427065491676331, + "learning_rate": 9.908536983583123e-05, + "loss": 2.1387, + "step": 2906 + }, + { + "epoch": 0.8922651933701657, + "grad_norm": 0.5972270965576172, + "learning_rate": 9.908442321921982e-05, + "loss": 2.0546, + "step": 2907 + }, + { + "epoch": 0.892572130141191, + "grad_norm": 0.562685489654541, + "learning_rate": 9.908347611752735e-05, + "loss": 2.093, + "step": 2908 + }, + { + "epoch": 0.8928790669122161, + "grad_norm": 0.6781734824180603, + "learning_rate": 9.908252853076323e-05, + "loss": 2.1589, + "step": 2909 + }, + { + "epoch": 0.8931860036832413, + "grad_norm": 0.7591540813446045, + "learning_rate": 9.908158045893678e-05, + "loss": 2.164, + "step": 2910 + }, + { + "epoch": 0.8934929404542664, + "grad_norm": 0.7161938548088074, + "learning_rate": 9.908063190205738e-05, + "loss": 2.079, + "step": 2911 + }, + { + "epoch": 0.8937998772252916, + "grad_norm": 0.7338036298751831, + "learning_rate": 9.907968286013442e-05, + "loss": 2.0033, + "step": 2912 + }, + { + "epoch": 0.8941068139963168, + "grad_norm": 0.7641176581382751, + "learning_rate": 9.907873333317727e-05, + "loss": 2.187, + "step": 2913 + }, + { + "epoch": 0.894413750767342, + "grad_norm": 0.6073760390281677, + "learning_rate": 9.90777833211953e-05, + "loss": 2.0589, + "step": 2914 + }, + { + "epoch": 0.8947206875383671, + "grad_norm": 0.49493756890296936, + "learning_rate": 9.907683282419791e-05, + "loss": 2.0555, + "step": 2915 + }, + { + "epoch": 0.8950276243093923, + "grad_norm": 0.6428996920585632, + "learning_rate": 9.907588184219449e-05, + "loss": 2.1083, + "step": 2916 + }, + { + "epoch": 0.8953345610804174, + "grad_norm": 0.6752644777297974, + "learning_rate": 9.907493037519447e-05, + "loss": 2.0987, + "step": 2917 + }, + { + "epoch": 0.8956414978514426, + "grad_norm": 0.5719494223594666, + "learning_rate": 9.907397842320719e-05, + "loss": 2.1735, + "step": 2918 
+ }, + { + "epoch": 0.8959484346224678, + "grad_norm": 0.5799626111984253, + "learning_rate": 9.907302598624211e-05, + "loss": 2.0978, + "step": 2919 + }, + { + "epoch": 0.896255371393493, + "grad_norm": 0.5407500267028809, + "learning_rate": 9.907207306430861e-05, + "loss": 2.0303, + "step": 2920 + }, + { + "epoch": 0.8965623081645181, + "grad_norm": 0.5950884222984314, + "learning_rate": 9.907111965741614e-05, + "loss": 2.0721, + "step": 2921 + }, + { + "epoch": 0.8968692449355433, + "grad_norm": 0.7711441516876221, + "learning_rate": 9.907016576557409e-05, + "loss": 2.1693, + "step": 2922 + }, + { + "epoch": 0.8971761817065684, + "grad_norm": 0.5522177815437317, + "learning_rate": 9.906921138879191e-05, + "loss": 2.1057, + "step": 2923 + }, + { + "epoch": 0.8974831184775937, + "grad_norm": 0.5743894577026367, + "learning_rate": 9.906825652707903e-05, + "loss": 2.119, + "step": 2924 + }, + { + "epoch": 0.8977900552486188, + "grad_norm": 0.5996440649032593, + "learning_rate": 9.906730118044486e-05, + "loss": 2.1251, + "step": 2925 + }, + { + "epoch": 0.898096992019644, + "grad_norm": 0.691302478313446, + "learning_rate": 9.906634534889887e-05, + "loss": 2.1459, + "step": 2926 + }, + { + "epoch": 0.8984039287906691, + "grad_norm": 0.6125866770744324, + "learning_rate": 9.90653890324505e-05, + "loss": 2.0739, + "step": 2927 + }, + { + "epoch": 0.8987108655616943, + "grad_norm": 0.5285681486129761, + "learning_rate": 9.906443223110919e-05, + "loss": 2.0398, + "step": 2928 + }, + { + "epoch": 0.8990178023327194, + "grad_norm": 0.5747935771942139, + "learning_rate": 9.90634749448844e-05, + "loss": 2.0688, + "step": 2929 + }, + { + "epoch": 0.8993247391037447, + "grad_norm": 0.5686646103858948, + "learning_rate": 9.90625171737856e-05, + "loss": 2.1196, + "step": 2930 + }, + { + "epoch": 0.8996316758747698, + "grad_norm": 0.5320247411727905, + "learning_rate": 9.906155891782225e-05, + "loss": 2.1069, + "step": 2931 + }, + { + "epoch": 0.899938612645795, + "grad_norm": 
0.5626047849655151, + "learning_rate": 9.906060017700383e-05, + "loss": 2.1091, + "step": 2932 + }, + { + "epoch": 0.9002455494168201, + "grad_norm": 0.5284978151321411, + "learning_rate": 9.905964095133979e-05, + "loss": 2.036, + "step": 2933 + }, + { + "epoch": 0.9005524861878453, + "grad_norm": 0.5362093448638916, + "learning_rate": 9.905868124083962e-05, + "loss": 2.1273, + "step": 2934 + }, + { + "epoch": 0.9008594229588704, + "grad_norm": 0.5583781599998474, + "learning_rate": 9.90577210455128e-05, + "loss": 2.0871, + "step": 2935 + }, + { + "epoch": 0.9011663597298957, + "grad_norm": 0.5552016496658325, + "learning_rate": 9.905676036536883e-05, + "loss": 2.0785, + "step": 2936 + }, + { + "epoch": 0.9014732965009208, + "grad_norm": 0.6875657439231873, + "learning_rate": 9.905579920041724e-05, + "loss": 2.083, + "step": 2937 + }, + { + "epoch": 0.901780233271946, + "grad_norm": 0.5396340489387512, + "learning_rate": 9.905483755066744e-05, + "loss": 2.0717, + "step": 2938 + }, + { + "epoch": 0.9020871700429711, + "grad_norm": 0.594739556312561, + "learning_rate": 9.9053875416129e-05, + "loss": 2.1305, + "step": 2939 + }, + { + "epoch": 0.9023941068139963, + "grad_norm": 0.6208831667900085, + "learning_rate": 9.905291279681143e-05, + "loss": 2.0034, + "step": 2940 + }, + { + "epoch": 0.9027010435850215, + "grad_norm": 0.5154325366020203, + "learning_rate": 9.90519496927242e-05, + "loss": 2.098, + "step": 2941 + }, + { + "epoch": 0.9030079803560467, + "grad_norm": 0.5217738151550293, + "learning_rate": 9.905098610387687e-05, + "loss": 2.0467, + "step": 2942 + }, + { + "epoch": 0.9033149171270718, + "grad_norm": 0.5623623728752136, + "learning_rate": 9.905002203027894e-05, + "loss": 2.1854, + "step": 2943 + }, + { + "epoch": 0.903621853898097, + "grad_norm": 0.5365456938743591, + "learning_rate": 9.904905747193993e-05, + "loss": 2.1021, + "step": 2944 + }, + { + "epoch": 0.9039287906691221, + "grad_norm": 0.5391906499862671, + "learning_rate": 
9.904809242886941e-05, + "loss": 2.1102, + "step": 2945 + }, + { + "epoch": 0.9042357274401474, + "grad_norm": 0.5439971685409546, + "learning_rate": 9.904712690107687e-05, + "loss": 2.0691, + "step": 2946 + }, + { + "epoch": 0.9045426642111725, + "grad_norm": 0.539383053779602, + "learning_rate": 9.904616088857189e-05, + "loss": 2.0514, + "step": 2947 + }, + { + "epoch": 0.9048496009821977, + "grad_norm": 0.5370060801506042, + "learning_rate": 9.904519439136399e-05, + "loss": 2.1069, + "step": 2948 + }, + { + "epoch": 0.9051565377532228, + "grad_norm": 0.5136541724205017, + "learning_rate": 9.904422740946274e-05, + "loss": 2.0519, + "step": 2949 + }, + { + "epoch": 0.905463474524248, + "grad_norm": 0.4970051348209381, + "learning_rate": 9.904325994287768e-05, + "loss": 2.0624, + "step": 2950 + }, + { + "epoch": 0.9057704112952731, + "grad_norm": 0.5003986954689026, + "learning_rate": 9.90422919916184e-05, + "loss": 2.135, + "step": 2951 + }, + { + "epoch": 0.9060773480662984, + "grad_norm": 0.5559821724891663, + "learning_rate": 9.904132355569443e-05, + "loss": 2.0733, + "step": 2952 + }, + { + "epoch": 0.9063842848373235, + "grad_norm": 0.5450533628463745, + "learning_rate": 9.904035463511537e-05, + "loss": 2.1491, + "step": 2953 + }, + { + "epoch": 0.9066912216083487, + "grad_norm": 0.5789141058921814, + "learning_rate": 9.903938522989076e-05, + "loss": 2.0604, + "step": 2954 + }, + { + "epoch": 0.9069981583793738, + "grad_norm": 0.6327412128448486, + "learning_rate": 9.903841534003023e-05, + "loss": 2.1307, + "step": 2955 + }, + { + "epoch": 0.907305095150399, + "grad_norm": 0.5694023966789246, + "learning_rate": 9.90374449655433e-05, + "loss": 2.1322, + "step": 2956 + }, + { + "epoch": 0.9076120319214241, + "grad_norm": 0.6241337060928345, + "learning_rate": 9.903647410643963e-05, + "loss": 2.1026, + "step": 2957 + }, + { + "epoch": 0.9079189686924494, + "grad_norm": 0.6257766485214233, + "learning_rate": 9.903550276272878e-05, + "loss": 2.0449, + "step": 2958 
+ }, + { + "epoch": 0.9082259054634745, + "grad_norm": 0.708626389503479, + "learning_rate": 9.903453093442032e-05, + "loss": 2.095, + "step": 2959 + }, + { + "epoch": 0.9085328422344997, + "grad_norm": 0.6769086122512817, + "learning_rate": 9.903355862152391e-05, + "loss": 2.0939, + "step": 2960 + }, + { + "epoch": 0.9088397790055248, + "grad_norm": 0.6221890449523926, + "learning_rate": 9.903258582404913e-05, + "loss": 2.1552, + "step": 2961 + }, + { + "epoch": 0.90914671577655, + "grad_norm": 0.7477858662605286, + "learning_rate": 9.903161254200561e-05, + "loss": 2.1155, + "step": 2962 + }, + { + "epoch": 0.9094536525475752, + "grad_norm": 0.665538489818573, + "learning_rate": 9.903063877540294e-05, + "loss": 2.1032, + "step": 2963 + }, + { + "epoch": 0.9097605893186004, + "grad_norm": 0.5973435044288635, + "learning_rate": 9.902966452425076e-05, + "loss": 2.0793, + "step": 2964 + }, + { + "epoch": 0.9100675260896255, + "grad_norm": 0.6544547080993652, + "learning_rate": 9.90286897885587e-05, + "loss": 2.1566, + "step": 2965 + }, + { + "epoch": 0.9103744628606507, + "grad_norm": 0.7162452936172485, + "learning_rate": 9.90277145683364e-05, + "loss": 2.1234, + "step": 2966 + }, + { + "epoch": 0.9106813996316758, + "grad_norm": 0.8400503993034363, + "learning_rate": 9.902673886359349e-05, + "loss": 2.216, + "step": 2967 + }, + { + "epoch": 0.910988336402701, + "grad_norm": 1.0350611209869385, + "learning_rate": 9.902576267433961e-05, + "loss": 2.0785, + "step": 2968 + }, + { + "epoch": 0.9112952731737262, + "grad_norm": 0.9551987051963806, + "learning_rate": 9.90247860005844e-05, + "loss": 2.0652, + "step": 2969 + }, + { + "epoch": 0.9116022099447514, + "grad_norm": 0.839712381362915, + "learning_rate": 9.902380884233751e-05, + "loss": 2.1197, + "step": 2970 + }, + { + "epoch": 0.9119091467157765, + "grad_norm": 0.6588022708892822, + "learning_rate": 9.902283119960863e-05, + "loss": 2.155, + "step": 2971 + }, + { + "epoch": 0.9122160834868017, + "grad_norm": 
0.6532430052757263, + "learning_rate": 9.902185307240739e-05, + "loss": 2.0947, + "step": 2972 + }, + { + "epoch": 0.9125230202578268, + "grad_norm": 0.7890481352806091, + "learning_rate": 9.902087446074346e-05, + "loss": 2.0246, + "step": 2973 + }, + { + "epoch": 0.9128299570288521, + "grad_norm": 0.6234511137008667, + "learning_rate": 9.901989536462652e-05, + "loss": 2.1033, + "step": 2974 + }, + { + "epoch": 0.9131368937998773, + "grad_norm": 0.5875300168991089, + "learning_rate": 9.901891578406623e-05, + "loss": 2.0553, + "step": 2975 + }, + { + "epoch": 0.9134438305709024, + "grad_norm": 0.6868174076080322, + "learning_rate": 9.901793571907231e-05, + "loss": 2.1398, + "step": 2976 + }, + { + "epoch": 0.9137507673419276, + "grad_norm": 0.7423301339149475, + "learning_rate": 9.90169551696544e-05, + "loss": 2.1034, + "step": 2977 + }, + { + "epoch": 0.9140577041129527, + "grad_norm": 0.588916003704071, + "learning_rate": 9.901597413582222e-05, + "loss": 2.078, + "step": 2978 + }, + { + "epoch": 0.914364640883978, + "grad_norm": 0.5895309448242188, + "learning_rate": 9.901499261758544e-05, + "loss": 2.0902, + "step": 2979 + }, + { + "epoch": 0.9146715776550031, + "grad_norm": 0.5403301119804382, + "learning_rate": 9.901401061495379e-05, + "loss": 2.0291, + "step": 2980 + }, + { + "epoch": 0.9149785144260283, + "grad_norm": 0.6102077960968018, + "learning_rate": 9.901302812793696e-05, + "loss": 2.0415, + "step": 2981 + }, + { + "epoch": 0.9152854511970534, + "grad_norm": 0.6728450059890747, + "learning_rate": 9.901204515654465e-05, + "loss": 2.105, + "step": 2982 + }, + { + "epoch": 0.9155923879680786, + "grad_norm": 0.5886163711547852, + "learning_rate": 9.901106170078657e-05, + "loss": 2.0186, + "step": 2983 + }, + { + "epoch": 0.9158993247391037, + "grad_norm": 0.539252758026123, + "learning_rate": 9.901007776067247e-05, + "loss": 2.0604, + "step": 2984 + }, + { + "epoch": 0.916206261510129, + "grad_norm": 0.6169516444206238, + "learning_rate": 
9.900909333621205e-05, + "loss": 2.1257, + "step": 2985 + }, + { + "epoch": 0.9165131982811541, + "grad_norm": 0.5624274015426636, + "learning_rate": 9.900810842741506e-05, + "loss": 2.0325, + "step": 2986 + }, + { + "epoch": 0.9168201350521793, + "grad_norm": 0.5931735634803772, + "learning_rate": 9.900712303429119e-05, + "loss": 2.0815, + "step": 2987 + }, + { + "epoch": 0.9171270718232044, + "grad_norm": 0.5720505714416504, + "learning_rate": 9.900613715685023e-05, + "loss": 2.1261, + "step": 2988 + }, + { + "epoch": 0.9174340085942296, + "grad_norm": 0.5752067565917969, + "learning_rate": 9.900515079510189e-05, + "loss": 2.1402, + "step": 2989 + }, + { + "epoch": 0.9177409453652547, + "grad_norm": 0.5836917757987976, + "learning_rate": 9.900416394905591e-05, + "loss": 2.0523, + "step": 2990 + }, + { + "epoch": 0.91804788213628, + "grad_norm": 0.6408325433731079, + "learning_rate": 9.900317661872209e-05, + "loss": 2.1874, + "step": 2991 + }, + { + "epoch": 0.9183548189073051, + "grad_norm": 0.6188341379165649, + "learning_rate": 9.900218880411013e-05, + "loss": 2.0903, + "step": 2992 + }, + { + "epoch": 0.9186617556783303, + "grad_norm": 0.5740565657615662, + "learning_rate": 9.900120050522985e-05, + "loss": 2.1243, + "step": 2993 + }, + { + "epoch": 0.9189686924493554, + "grad_norm": 0.635638952255249, + "learning_rate": 9.900021172209096e-05, + "loss": 2.089, + "step": 2994 + }, + { + "epoch": 0.9192756292203806, + "grad_norm": 0.5538209676742554, + "learning_rate": 9.899922245470326e-05, + "loss": 2.0489, + "step": 2995 + }, + { + "epoch": 0.9195825659914058, + "grad_norm": 0.5440292954444885, + "learning_rate": 9.899823270307654e-05, + "loss": 2.0534, + "step": 2996 + }, + { + "epoch": 0.919889502762431, + "grad_norm": 0.6203792691230774, + "learning_rate": 9.899724246722055e-05, + "loss": 2.2799, + "step": 2997 + }, + { + "epoch": 0.9201964395334561, + "grad_norm": 0.6299278140068054, + "learning_rate": 9.89962517471451e-05, + "loss": 2.0813, + "step": 2998 
+ }, + { + "epoch": 0.9205033763044813, + "grad_norm": 0.6156774759292603, + "learning_rate": 9.899526054285997e-05, + "loss": 2.1345, + "step": 2999 + }, + { + "epoch": 0.9208103130755064, + "grad_norm": 0.5940032601356506, + "learning_rate": 9.899426885437496e-05, + "loss": 2.133, + "step": 3000 + }, + { + "epoch": 0.9211172498465316, + "grad_norm": 0.6210232377052307, + "learning_rate": 9.899327668169987e-05, + "loss": 2.0275, + "step": 3001 + }, + { + "epoch": 0.9214241866175568, + "grad_norm": 0.5578985214233398, + "learning_rate": 9.89922840248445e-05, + "loss": 2.0806, + "step": 3002 + }, + { + "epoch": 0.921731123388582, + "grad_norm": 0.5264963507652283, + "learning_rate": 9.899129088381866e-05, + "loss": 2.1233, + "step": 3003 + }, + { + "epoch": 0.9220380601596071, + "grad_norm": 0.5414119958877563, + "learning_rate": 9.899029725863218e-05, + "loss": 2.1052, + "step": 3004 + }, + { + "epoch": 0.9223449969306323, + "grad_norm": 0.5933207869529724, + "learning_rate": 9.898930314929486e-05, + "loss": 2.108, + "step": 3005 + }, + { + "epoch": 0.9226519337016574, + "grad_norm": 0.6170317530632019, + "learning_rate": 9.898830855581654e-05, + "loss": 2.0997, + "step": 3006 + }, + { + "epoch": 0.9229588704726827, + "grad_norm": 0.5930282473564148, + "learning_rate": 9.898731347820705e-05, + "loss": 2.0507, + "step": 3007 + }, + { + "epoch": 0.9232658072437078, + "grad_norm": 0.5894142985343933, + "learning_rate": 9.898631791647619e-05, + "loss": 2.0687, + "step": 3008 + }, + { + "epoch": 0.923572744014733, + "grad_norm": 0.6560437083244324, + "learning_rate": 9.898532187063383e-05, + "loss": 2.096, + "step": 3009 + }, + { + "epoch": 0.9238796807857581, + "grad_norm": 0.6083245873451233, + "learning_rate": 9.898432534068983e-05, + "loss": 2.0526, + "step": 3010 + }, + { + "epoch": 0.9241866175567833, + "grad_norm": 0.5152565240859985, + "learning_rate": 9.8983328326654e-05, + "loss": 2.0802, + "step": 3011 + }, + { + "epoch": 0.9244935543278084, + "grad_norm": 
0.6326588988304138, + "learning_rate": 9.89823308285362e-05, + "loss": 2.1246, + "step": 3012 + }, + { + "epoch": 0.9248004910988337, + "grad_norm": 0.6821309328079224, + "learning_rate": 9.898133284634632e-05, + "loss": 2.1106, + "step": 3013 + }, + { + "epoch": 0.9251074278698588, + "grad_norm": 0.6192164421081543, + "learning_rate": 9.898033438009419e-05, + "loss": 2.0475, + "step": 3014 + }, + { + "epoch": 0.925414364640884, + "grad_norm": 0.6112427115440369, + "learning_rate": 9.897933542978967e-05, + "loss": 2.0904, + "step": 3015 + }, + { + "epoch": 0.9257213014119091, + "grad_norm": 0.5729427933692932, + "learning_rate": 9.897833599544268e-05, + "loss": 2.1151, + "step": 3016 + }, + { + "epoch": 0.9260282381829343, + "grad_norm": 0.6200255751609802, + "learning_rate": 9.897733607706305e-05, + "loss": 2.0815, + "step": 3017 + }, + { + "epoch": 0.9263351749539595, + "grad_norm": 0.635920524597168, + "learning_rate": 9.897633567466068e-05, + "loss": 2.0724, + "step": 3018 + }, + { + "epoch": 0.9266421117249847, + "grad_norm": 0.5916038155555725, + "learning_rate": 9.897533478824546e-05, + "loss": 2.1527, + "step": 3019 + }, + { + "epoch": 0.9269490484960098, + "grad_norm": 0.5552941560745239, + "learning_rate": 9.897433341782727e-05, + "loss": 2.0958, + "step": 3020 + }, + { + "epoch": 0.927255985267035, + "grad_norm": 0.562383770942688, + "learning_rate": 9.897333156341602e-05, + "loss": 2.0939, + "step": 3021 + }, + { + "epoch": 0.9275629220380601, + "grad_norm": 0.5227869153022766, + "learning_rate": 9.897232922502158e-05, + "loss": 2.1358, + "step": 3022 + }, + { + "epoch": 0.9278698588090853, + "grad_norm": 0.5671074986457825, + "learning_rate": 9.897132640265391e-05, + "loss": 2.0877, + "step": 3023 + }, + { + "epoch": 0.9281767955801105, + "grad_norm": 0.5176356434822083, + "learning_rate": 9.897032309632287e-05, + "loss": 2.0392, + "step": 3024 + }, + { + "epoch": 0.9284837323511357, + "grad_norm": 0.5160155296325684, + "learning_rate": 
9.89693193060384e-05, + "loss": 2.069, + "step": 3025 + }, + { + "epoch": 0.9287906691221608, + "grad_norm": 0.5034440159797668, + "learning_rate": 9.896831503181042e-05, + "loss": 2.0348, + "step": 3026 + }, + { + "epoch": 0.929097605893186, + "grad_norm": 0.5146151781082153, + "learning_rate": 9.896731027364884e-05, + "loss": 2.0884, + "step": 3027 + }, + { + "epoch": 0.9294045426642111, + "grad_norm": 0.7153071165084839, + "learning_rate": 9.896630503156361e-05, + "loss": 2.2295, + "step": 3028 + }, + { + "epoch": 0.9297114794352364, + "grad_norm": 0.7201753258705139, + "learning_rate": 9.896529930556464e-05, + "loss": 2.1285, + "step": 3029 + }, + { + "epoch": 0.9300184162062615, + "grad_norm": 0.7110029458999634, + "learning_rate": 9.89642930956619e-05, + "loss": 2.1371, + "step": 3030 + }, + { + "epoch": 0.9303253529772867, + "grad_norm": 0.695444643497467, + "learning_rate": 9.896328640186531e-05, + "loss": 2.0698, + "step": 3031 + }, + { + "epoch": 0.9306322897483118, + "grad_norm": 0.6157357096672058, + "learning_rate": 9.896227922418482e-05, + "loss": 2.1294, + "step": 3032 + }, + { + "epoch": 0.930939226519337, + "grad_norm": 0.5473730564117432, + "learning_rate": 9.896127156263039e-05, + "loss": 2.0487, + "step": 3033 + }, + { + "epoch": 0.9312461632903621, + "grad_norm": 0.6400229334831238, + "learning_rate": 9.896026341721198e-05, + "loss": 2.0422, + "step": 3034 + }, + { + "epoch": 0.9315531000613874, + "grad_norm": 0.5046324729919434, + "learning_rate": 9.895925478793955e-05, + "loss": 2.0715, + "step": 3035 + }, + { + "epoch": 0.9318600368324125, + "grad_norm": 0.5316528081893921, + "learning_rate": 9.895824567482307e-05, + "loss": 2.11, + "step": 3036 + }, + { + "epoch": 0.9321669736034377, + "grad_norm": 0.5760478973388672, + "learning_rate": 9.895723607787251e-05, + "loss": 2.0885, + "step": 3037 + }, + { + "epoch": 0.9324739103744628, + "grad_norm": 0.5034705996513367, + "learning_rate": 9.895622599709785e-05, + "loss": 2.0024, + "step": 3038 + 
}, + { + "epoch": 0.932780847145488, + "grad_norm": 0.46088743209838867, + "learning_rate": 9.895521543250906e-05, + "loss": 2.0794, + "step": 3039 + }, + { + "epoch": 0.9330877839165131, + "grad_norm": 0.5219544172286987, + "learning_rate": 9.895420438411616e-05, + "loss": 2.1002, + "step": 3040 + }, + { + "epoch": 0.9333947206875384, + "grad_norm": 0.5363453030586243, + "learning_rate": 9.89531928519291e-05, + "loss": 2.0629, + "step": 3041 + }, + { + "epoch": 0.9337016574585635, + "grad_norm": 0.5860787630081177, + "learning_rate": 9.89521808359579e-05, + "loss": 2.0999, + "step": 3042 + }, + { + "epoch": 0.9340085942295887, + "grad_norm": 0.7155836224555969, + "learning_rate": 9.895116833621255e-05, + "loss": 2.1674, + "step": 3043 + }, + { + "epoch": 0.9343155310006138, + "grad_norm": 0.8029196262359619, + "learning_rate": 9.895015535270307e-05, + "loss": 2.0776, + "step": 3044 + }, + { + "epoch": 0.934622467771639, + "grad_norm": 0.6973832845687866, + "learning_rate": 9.894914188543946e-05, + "loss": 2.0537, + "step": 3045 + }, + { + "epoch": 0.9349294045426643, + "grad_norm": 0.6646706461906433, + "learning_rate": 9.894812793443175e-05, + "loss": 2.0857, + "step": 3046 + }, + { + "epoch": 0.9352363413136894, + "grad_norm": 0.6343888640403748, + "learning_rate": 9.894711349968995e-05, + "loss": 2.0832, + "step": 3047 + }, + { + "epoch": 0.9355432780847146, + "grad_norm": 0.54819256067276, + "learning_rate": 9.894609858122407e-05, + "loss": 2.1576, + "step": 3048 + }, + { + "epoch": 0.9358502148557397, + "grad_norm": 0.6905701160430908, + "learning_rate": 9.894508317904419e-05, + "loss": 2.0685, + "step": 3049 + }, + { + "epoch": 0.9361571516267649, + "grad_norm": 0.605591356754303, + "learning_rate": 9.894406729316028e-05, + "loss": 2.0931, + "step": 3050 + }, + { + "epoch": 0.93646408839779, + "grad_norm": 0.5702943801879883, + "learning_rate": 9.89430509235824e-05, + "loss": 2.1224, + "step": 3051 + }, + { + "epoch": 0.9367710251688153, + "grad_norm": 
0.5855122804641724, + "learning_rate": 9.894203407032064e-05, + "loss": 2.0747, + "step": 3052 + }, + { + "epoch": 0.9370779619398404, + "grad_norm": 0.6002167463302612, + "learning_rate": 9.894101673338498e-05, + "loss": 2.0991, + "step": 3053 + }, + { + "epoch": 0.9373848987108656, + "grad_norm": 0.5914842486381531, + "learning_rate": 9.893999891278553e-05, + "loss": 2.0427, + "step": 3054 + }, + { + "epoch": 0.9376918354818907, + "grad_norm": 0.6283048391342163, + "learning_rate": 9.893898060853232e-05, + "loss": 2.0558, + "step": 3055 + }, + { + "epoch": 0.937998772252916, + "grad_norm": 0.5955209136009216, + "learning_rate": 9.893796182063542e-05, + "loss": 2.1286, + "step": 3056 + }, + { + "epoch": 0.9383057090239411, + "grad_norm": 0.5579878687858582, + "learning_rate": 9.893694254910489e-05, + "loss": 2.0799, + "step": 3057 + }, + { + "epoch": 0.9386126457949663, + "grad_norm": 0.5690281391143799, + "learning_rate": 9.893592279395082e-05, + "loss": 2.0699, + "step": 3058 + }, + { + "epoch": 0.9389195825659914, + "grad_norm": 0.5189259648323059, + "learning_rate": 9.893490255518327e-05, + "loss": 2.0627, + "step": 3059 + }, + { + "epoch": 0.9392265193370166, + "grad_norm": 0.5205439925193787, + "learning_rate": 9.893388183281233e-05, + "loss": 2.0136, + "step": 3060 + }, + { + "epoch": 0.9395334561080417, + "grad_norm": 0.492593914270401, + "learning_rate": 9.89328606268481e-05, + "loss": 2.0799, + "step": 3061 + }, + { + "epoch": 0.939840392879067, + "grad_norm": 0.6511666178703308, + "learning_rate": 9.893183893730067e-05, + "loss": 2.1297, + "step": 3062 + }, + { + "epoch": 0.9401473296500921, + "grad_norm": 0.7640050053596497, + "learning_rate": 9.89308167641801e-05, + "loss": 2.1384, + "step": 3063 + }, + { + "epoch": 0.9404542664211173, + "grad_norm": 0.7526536583900452, + "learning_rate": 9.892979410749654e-05, + "loss": 2.0454, + "step": 3064 + }, + { + "epoch": 0.9407612031921424, + "grad_norm": 0.7140639424324036, + "learning_rate": 
9.892877096726007e-05, + "loss": 2.0219, + "step": 3065 + }, + { + "epoch": 0.9410681399631676, + "grad_norm": 0.6584374308586121, + "learning_rate": 9.89277473434808e-05, + "loss": 2.0943, + "step": 3066 + }, + { + "epoch": 0.9413750767341927, + "grad_norm": 0.5889024138450623, + "learning_rate": 9.892672323616888e-05, + "loss": 2.1088, + "step": 3067 + }, + { + "epoch": 0.941682013505218, + "grad_norm": 0.6196749806404114, + "learning_rate": 9.892569864533438e-05, + "loss": 2.101, + "step": 3068 + }, + { + "epoch": 0.9419889502762431, + "grad_norm": 0.6432211399078369, + "learning_rate": 9.892467357098744e-05, + "loss": 2.0828, + "step": 3069 + }, + { + "epoch": 0.9422958870472683, + "grad_norm": 0.6448069214820862, + "learning_rate": 9.892364801313823e-05, + "loss": 2.1389, + "step": 3070 + }, + { + "epoch": 0.9426028238182934, + "grad_norm": 0.597197949886322, + "learning_rate": 9.892262197179682e-05, + "loss": 2.0902, + "step": 3071 + }, + { + "epoch": 0.9429097605893186, + "grad_norm": 0.625348687171936, + "learning_rate": 9.892159544697341e-05, + "loss": 2.0659, + "step": 3072 + }, + { + "epoch": 0.9432166973603437, + "grad_norm": 0.5109166502952576, + "learning_rate": 9.892056843867812e-05, + "loss": 2.0895, + "step": 3073 + }, + { + "epoch": 0.943523634131369, + "grad_norm": 0.5917959213256836, + "learning_rate": 9.891954094692108e-05, + "loss": 2.0646, + "step": 3074 + }, + { + "epoch": 0.9438305709023941, + "grad_norm": 0.5320633053779602, + "learning_rate": 9.891851297171249e-05, + "loss": 2.107, + "step": 3075 + }, + { + "epoch": 0.9441375076734193, + "grad_norm": 0.5271332263946533, + "learning_rate": 9.891748451306246e-05, + "loss": 2.0984, + "step": 3076 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.5389983057975769, + "learning_rate": 9.89164555709812e-05, + "loss": 2.1097, + "step": 3077 + }, + { + "epoch": 0.9447513812154696, + "grad_norm": 0.5536573529243469, + "learning_rate": 9.891542614547885e-05, + "loss": 2.1271, + "step": 3078 + 
}, + { + "epoch": 0.9450583179864948, + "grad_norm": 0.5481712222099304, + "learning_rate": 9.891439623656558e-05, + "loss": 2.0975, + "step": 3079 + }, + { + "epoch": 0.94536525475752, + "grad_norm": 0.626431941986084, + "learning_rate": 9.891336584425157e-05, + "loss": 2.1561, + "step": 3080 + }, + { + "epoch": 0.9456721915285451, + "grad_norm": 0.7452689409255981, + "learning_rate": 9.891233496854702e-05, + "loss": 2.0791, + "step": 3081 + }, + { + "epoch": 0.9459791282995703, + "grad_norm": 0.9399113059043884, + "learning_rate": 9.89113036094621e-05, + "loss": 2.0706, + "step": 3082 + }, + { + "epoch": 0.9462860650705954, + "grad_norm": 1.0733267068862915, + "learning_rate": 9.891027176700701e-05, + "loss": 2.0705, + "step": 3083 + }, + { + "epoch": 0.9465930018416207, + "grad_norm": 0.7521542906761169, + "learning_rate": 9.890923944119194e-05, + "loss": 2.0862, + "step": 3084 + }, + { + "epoch": 0.9468999386126458, + "grad_norm": 0.5447198152542114, + "learning_rate": 9.890820663202713e-05, + "loss": 2.1047, + "step": 3085 + }, + { + "epoch": 0.947206875383671, + "grad_norm": 0.5733833312988281, + "learning_rate": 9.890717333952273e-05, + "loss": 2.121, + "step": 3086 + }, + { + "epoch": 0.9475138121546961, + "grad_norm": 0.7225440144538879, + "learning_rate": 9.890613956368899e-05, + "loss": 2.0533, + "step": 3087 + }, + { + "epoch": 0.9478207489257213, + "grad_norm": 0.6377096176147461, + "learning_rate": 9.89051053045361e-05, + "loss": 2.07, + "step": 3088 + }, + { + "epoch": 0.9481276856967464, + "grad_norm": 0.556656002998352, + "learning_rate": 9.890407056207432e-05, + "loss": 2.1103, + "step": 3089 + }, + { + "epoch": 0.9484346224677717, + "grad_norm": 0.6807621121406555, + "learning_rate": 9.890303533631382e-05, + "loss": 2.1351, + "step": 3090 + }, + { + "epoch": 0.9487415592387968, + "grad_norm": 0.7187803983688354, + "learning_rate": 9.890199962726487e-05, + "loss": 2.0582, + "step": 3091 + }, + { + "epoch": 0.949048496009822, + "grad_norm": 
0.6201196908950806, + "learning_rate": 9.890096343493771e-05, + "loss": 2.0799, + "step": 3092 + }, + { + "epoch": 0.9493554327808471, + "grad_norm": 0.6258496046066284, + "learning_rate": 9.889992675934257e-05, + "loss": 2.156, + "step": 3093 + }, + { + "epoch": 0.9496623695518723, + "grad_norm": 0.6191570162773132, + "learning_rate": 9.889888960048967e-05, + "loss": 2.0121, + "step": 3094 + }, + { + "epoch": 0.9499693063228974, + "grad_norm": 0.5668848752975464, + "learning_rate": 9.88978519583893e-05, + "loss": 2.0954, + "step": 3095 + }, + { + "epoch": 0.9502762430939227, + "grad_norm": 0.5596859455108643, + "learning_rate": 9.88968138330517e-05, + "loss": 2.1274, + "step": 3096 + }, + { + "epoch": 0.9505831798649478, + "grad_norm": 0.6199706196784973, + "learning_rate": 9.889577522448712e-05, + "loss": 2.0588, + "step": 3097 + }, + { + "epoch": 0.950890116635973, + "grad_norm": 0.5129860639572144, + "learning_rate": 9.889473613270584e-05, + "loss": 2.0722, + "step": 3098 + }, + { + "epoch": 0.9511970534069981, + "grad_norm": 0.513263463973999, + "learning_rate": 9.88936965577181e-05, + "loss": 2.0298, + "step": 3099 + }, + { + "epoch": 0.9515039901780233, + "grad_norm": 0.4870156943798065, + "learning_rate": 9.88926564995342e-05, + "loss": 2.025, + "step": 3100 + }, + { + "epoch": 0.9518109269490485, + "grad_norm": 0.5310595035552979, + "learning_rate": 9.889161595816442e-05, + "loss": 2.0767, + "step": 3101 + }, + { + "epoch": 0.9521178637200737, + "grad_norm": 0.5993812084197998, + "learning_rate": 9.889057493361903e-05, + "loss": 2.1931, + "step": 3102 + }, + { + "epoch": 0.9524248004910988, + "grad_norm": 0.6157637238502502, + "learning_rate": 9.888953342590832e-05, + "loss": 2.0757, + "step": 3103 + }, + { + "epoch": 0.952731737262124, + "grad_norm": 0.6280032992362976, + "learning_rate": 9.88884914350426e-05, + "loss": 2.0042, + "step": 3104 + }, + { + "epoch": 0.9530386740331491, + "grad_norm": 0.6740781664848328, + "learning_rate": 
9.888744896103212e-05, + "loss": 2.0663, + "step": 3105 + }, + { + "epoch": 0.9533456108041743, + "grad_norm": 0.5851804614067078, + "learning_rate": 9.888640600388725e-05, + "loss": 2.0585, + "step": 3106 + }, + { + "epoch": 0.9536525475751995, + "grad_norm": 0.6590312719345093, + "learning_rate": 9.888536256361825e-05, + "loss": 2.0698, + "step": 3107 + }, + { + "epoch": 0.9539594843462247, + "grad_norm": 0.5356595516204834, + "learning_rate": 9.888431864023544e-05, + "loss": 2.1019, + "step": 3108 + }, + { + "epoch": 0.9542664211172498, + "grad_norm": 0.6401084661483765, + "learning_rate": 9.888327423374915e-05, + "loss": 2.1176, + "step": 3109 + }, + { + "epoch": 0.954573357888275, + "grad_norm": 0.6582900285720825, + "learning_rate": 9.888222934416968e-05, + "loss": 2.0375, + "step": 3110 + }, + { + "epoch": 0.9548802946593001, + "grad_norm": 0.6245424151420593, + "learning_rate": 9.888118397150738e-05, + "loss": 1.9913, + "step": 3111 + }, + { + "epoch": 0.9551872314303254, + "grad_norm": 0.5871780514717102, + "learning_rate": 9.888013811577256e-05, + "loss": 2.1434, + "step": 3112 + }, + { + "epoch": 0.9554941682013505, + "grad_norm": 0.6295487284660339, + "learning_rate": 9.887909177697559e-05, + "loss": 2.0805, + "step": 3113 + }, + { + "epoch": 0.9558011049723757, + "grad_norm": 0.5844045877456665, + "learning_rate": 9.887804495512676e-05, + "loss": 2.076, + "step": 3114 + }, + { + "epoch": 0.9561080417434008, + "grad_norm": 0.5581921339035034, + "learning_rate": 9.887699765023645e-05, + "loss": 2.131, + "step": 3115 + }, + { + "epoch": 0.956414978514426, + "grad_norm": 0.6659174561500549, + "learning_rate": 9.8875949862315e-05, + "loss": 2.0759, + "step": 3116 + }, + { + "epoch": 0.9567219152854513, + "grad_norm": 0.5852961540222168, + "learning_rate": 9.887490159137276e-05, + "loss": 2.0486, + "step": 3117 + }, + { + "epoch": 0.9570288520564764, + "grad_norm": 0.6077566146850586, + "learning_rate": 9.887385283742011e-05, + "loss": 2.1132, + "step": 3118 
+ }, + { + "epoch": 0.9573357888275016, + "grad_norm": 0.5991361141204834, + "learning_rate": 9.88728036004674e-05, + "loss": 2.0322, + "step": 3119 + }, + { + "epoch": 0.9576427255985267, + "grad_norm": 0.5832391977310181, + "learning_rate": 9.887175388052499e-05, + "loss": 2.135, + "step": 3120 + }, + { + "epoch": 0.9579496623695519, + "grad_norm": 0.5479732751846313, + "learning_rate": 9.887070367760327e-05, + "loss": 2.1222, + "step": 3121 + }, + { + "epoch": 0.958256599140577, + "grad_norm": 0.5630220770835876, + "learning_rate": 9.88696529917126e-05, + "loss": 2.1247, + "step": 3122 + }, + { + "epoch": 0.9585635359116023, + "grad_norm": 0.7052439451217651, + "learning_rate": 9.88686018228634e-05, + "loss": 2.204, + "step": 3123 + }, + { + "epoch": 0.9588704726826274, + "grad_norm": 0.5995638370513916, + "learning_rate": 9.8867550171066e-05, + "loss": 2.0153, + "step": 3124 + }, + { + "epoch": 0.9591774094536526, + "grad_norm": 0.5689408779144287, + "learning_rate": 9.886649803633086e-05, + "loss": 2.0341, + "step": 3125 + }, + { + "epoch": 0.9594843462246777, + "grad_norm": 0.5247456431388855, + "learning_rate": 9.886544541866832e-05, + "loss": 2.0657, + "step": 3126 + }, + { + "epoch": 0.9597912829957029, + "grad_norm": 0.5596463084220886, + "learning_rate": 9.886439231808882e-05, + "loss": 2.0829, + "step": 3127 + }, + { + "epoch": 0.960098219766728, + "grad_norm": 0.4993874430656433, + "learning_rate": 9.886333873460275e-05, + "loss": 2.0517, + "step": 3128 + }, + { + "epoch": 0.9604051565377533, + "grad_norm": 0.5776910185813904, + "learning_rate": 9.886228466822054e-05, + "loss": 2.0124, + "step": 3129 + }, + { + "epoch": 0.9607120933087784, + "grad_norm": 0.5871354341506958, + "learning_rate": 9.886123011895258e-05, + "loss": 2.0327, + "step": 3130 + }, + { + "epoch": 0.9610190300798036, + "grad_norm": 0.5873207449913025, + "learning_rate": 9.886017508680931e-05, + "loss": 2.0756, + "step": 3131 + }, + { + "epoch": 0.9613259668508287, + "grad_norm": 
0.6422720551490784, + "learning_rate": 9.885911957180113e-05, + "loss": 2.0649, + "step": 3132 + }, + { + "epoch": 0.9616329036218539, + "grad_norm": 0.6040814518928528, + "learning_rate": 9.885806357393853e-05, + "loss": 2.066, + "step": 3133 + }, + { + "epoch": 0.961939840392879, + "grad_norm": 0.6629621982574463, + "learning_rate": 9.885700709323189e-05, + "loss": 2.0824, + "step": 3134 + }, + { + "epoch": 0.9622467771639043, + "grad_norm": 0.572485625743866, + "learning_rate": 9.885595012969168e-05, + "loss": 2.0572, + "step": 3135 + }, + { + "epoch": 0.9625537139349294, + "grad_norm": 0.5050783753395081, + "learning_rate": 9.885489268332833e-05, + "loss": 2.0645, + "step": 3136 + }, + { + "epoch": 0.9628606507059546, + "grad_norm": 0.5744417309761047, + "learning_rate": 9.885383475415229e-05, + "loss": 2.0549, + "step": 3137 + }, + { + "epoch": 0.9631675874769797, + "grad_norm": 0.5604275465011597, + "learning_rate": 9.885277634217403e-05, + "loss": 2.1339, + "step": 3138 + }, + { + "epoch": 0.963474524248005, + "grad_norm": 0.6182584762573242, + "learning_rate": 9.8851717447404e-05, + "loss": 2.0397, + "step": 3139 + }, + { + "epoch": 0.9637814610190301, + "grad_norm": 0.510515570640564, + "learning_rate": 9.885065806985266e-05, + "loss": 1.9761, + "step": 3140 + }, + { + "epoch": 0.9640883977900553, + "grad_norm": 0.4881763756275177, + "learning_rate": 9.884959820953048e-05, + "loss": 2.005, + "step": 3141 + }, + { + "epoch": 0.9643953345610804, + "grad_norm": 0.47206851840019226, + "learning_rate": 9.884853786644794e-05, + "loss": 2.0661, + "step": 3142 + }, + { + "epoch": 0.9647022713321056, + "grad_norm": 0.5691676735877991, + "learning_rate": 9.884747704061552e-05, + "loss": 2.1316, + "step": 3143 + }, + { + "epoch": 0.9650092081031307, + "grad_norm": 0.5338765978813171, + "learning_rate": 9.884641573204372e-05, + "loss": 2.0715, + "step": 3144 + }, + { + "epoch": 0.965316144874156, + "grad_norm": 0.5721597075462341, + "learning_rate": 
9.884535394074299e-05, + "loss": 2.1004, + "step": 3145 + }, + { + "epoch": 0.9656230816451811, + "grad_norm": 0.5269518494606018, + "learning_rate": 9.884429166672384e-05, + "loss": 2.1233, + "step": 3146 + }, + { + "epoch": 0.9659300184162063, + "grad_norm": 0.5264385342597961, + "learning_rate": 9.884322890999678e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.9662369551872314, + "grad_norm": 0.6094604730606079, + "learning_rate": 9.88421656705723e-05, + "loss": 2.1009, + "step": 3148 + }, + { + "epoch": 0.9665438919582566, + "grad_norm": 0.5538906455039978, + "learning_rate": 9.884110194846093e-05, + "loss": 2.0055, + "step": 3149 + }, + { + "epoch": 0.9668508287292817, + "grad_norm": 0.591526985168457, + "learning_rate": 9.884003774367313e-05, + "loss": 2.0655, + "step": 3150 + }, + { + "epoch": 0.967157765500307, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.883897305621948e-05, + "loss": 2.0775, + "step": 3151 + }, + { + "epoch": 0.9674647022713321, + "grad_norm": 0.5074640512466431, + "learning_rate": 9.883790788611045e-05, + "loss": 2.0322, + "step": 3152 + }, + { + "epoch": 0.9677716390423573, + "grad_norm": 0.5111376047134399, + "learning_rate": 9.883684223335661e-05, + "loss": 2.0972, + "step": 3153 + }, + { + "epoch": 0.9680785758133824, + "grad_norm": 0.5187644362449646, + "learning_rate": 9.883577609796846e-05, + "loss": 2.072, + "step": 3154 + }, + { + "epoch": 0.9683855125844076, + "grad_norm": 0.5285201072692871, + "learning_rate": 9.883470947995654e-05, + "loss": 2.0468, + "step": 3155 + }, + { + "epoch": 0.9686924493554327, + "grad_norm": 0.49360916018486023, + "learning_rate": 9.883364237933142e-05, + "loss": 2.07, + "step": 3156 + }, + { + "epoch": 0.968999386126458, + "grad_norm": 0.6359294056892395, + "learning_rate": 9.88325747961036e-05, + "loss": 2.1169, + "step": 3157 + }, + { + "epoch": 0.9693063228974831, + "grad_norm": 0.6274764537811279, + "learning_rate": 9.883150673028367e-05, + "loss": 2.1412, + "step": 3158 
+ }, + { + "epoch": 0.9696132596685083, + "grad_norm": 0.5755917429924011, + "learning_rate": 9.883043818188215e-05, + "loss": 2.0547, + "step": 3159 + }, + { + "epoch": 0.9699201964395334, + "grad_norm": 0.4765770137310028, + "learning_rate": 9.882936915090964e-05, + "loss": 2.02, + "step": 3160 + }, + { + "epoch": 0.9702271332105586, + "grad_norm": 0.5085053443908691, + "learning_rate": 9.882829963737667e-05, + "loss": 2.0355, + "step": 3161 + }, + { + "epoch": 0.9705340699815838, + "grad_norm": 0.49804505705833435, + "learning_rate": 9.882722964129385e-05, + "loss": 2.1274, + "step": 3162 + }, + { + "epoch": 0.970841006752609, + "grad_norm": 0.5575076341629028, + "learning_rate": 9.882615916267171e-05, + "loss": 2.0661, + "step": 3163 + }, + { + "epoch": 0.9711479435236341, + "grad_norm": 0.5678727626800537, + "learning_rate": 9.882508820152084e-05, + "loss": 2.1135, + "step": 3164 + }, + { + "epoch": 0.9714548802946593, + "grad_norm": 0.5505611896514893, + "learning_rate": 9.882401675785185e-05, + "loss": 2.0888, + "step": 3165 + }, + { + "epoch": 0.9717618170656844, + "grad_norm": 0.5224125385284424, + "learning_rate": 9.88229448316753e-05, + "loss": 2.0492, + "step": 3166 + }, + { + "epoch": 0.9720687538367097, + "grad_norm": 0.437215656042099, + "learning_rate": 9.882187242300178e-05, + "loss": 1.9927, + "step": 3167 + }, + { + "epoch": 0.9723756906077348, + "grad_norm": 0.4914848804473877, + "learning_rate": 9.882079953184192e-05, + "loss": 2.0309, + "step": 3168 + }, + { + "epoch": 0.97268262737876, + "grad_norm": 0.4990764260292053, + "learning_rate": 9.88197261582063e-05, + "loss": 2.0408, + "step": 3169 + }, + { + "epoch": 0.9729895641497851, + "grad_norm": 0.5283234715461731, + "learning_rate": 9.881865230210552e-05, + "loss": 2.0627, + "step": 3170 + }, + { + "epoch": 0.9732965009208103, + "grad_norm": 0.5771347284317017, + "learning_rate": 9.88175779635502e-05, + "loss": 2.1591, + "step": 3171 + }, + { + "epoch": 0.9736034376918354, + "grad_norm": 
0.5020268559455872, + "learning_rate": 9.881650314255098e-05, + "loss": 2.0311, + "step": 3172 + }, + { + "epoch": 0.9739103744628607, + "grad_norm": 0.5476529002189636, + "learning_rate": 9.881542783911846e-05, + "loss": 2.1114, + "step": 3173 + }, + { + "epoch": 0.9742173112338858, + "grad_norm": 0.5630559921264648, + "learning_rate": 9.881435205326327e-05, + "loss": 2.0617, + "step": 3174 + }, + { + "epoch": 0.974524248004911, + "grad_norm": 0.5931001305580139, + "learning_rate": 9.881327578499604e-05, + "loss": 2.0376, + "step": 3175 + }, + { + "epoch": 0.9748311847759361, + "grad_norm": 0.6123979091644287, + "learning_rate": 9.881219903432742e-05, + "loss": 2.0995, + "step": 3176 + }, + { + "epoch": 0.9751381215469613, + "grad_norm": 0.6064465641975403, + "learning_rate": 9.881112180126802e-05, + "loss": 2.0533, + "step": 3177 + }, + { + "epoch": 0.9754450583179864, + "grad_norm": 0.6071485877037048, + "learning_rate": 9.881004408582852e-05, + "loss": 2.1007, + "step": 3178 + }, + { + "epoch": 0.9757519950890117, + "grad_norm": 0.6021482944488525, + "learning_rate": 9.880896588801954e-05, + "loss": 2.0528, + "step": 3179 + }, + { + "epoch": 0.9760589318600368, + "grad_norm": 0.5204832553863525, + "learning_rate": 9.880788720785177e-05, + "loss": 2.0489, + "step": 3180 + }, + { + "epoch": 0.976365868631062, + "grad_norm": 0.5347138047218323, + "learning_rate": 9.880680804533585e-05, + "loss": 2.1021, + "step": 3181 + }, + { + "epoch": 0.9766728054020871, + "grad_norm": 0.6318790912628174, + "learning_rate": 9.880572840048243e-05, + "loss": 2.0808, + "step": 3182 + }, + { + "epoch": 0.9769797421731123, + "grad_norm": 0.6978665590286255, + "learning_rate": 9.88046482733022e-05, + "loss": 2.0067, + "step": 3183 + }, + { + "epoch": 0.9772866789441375, + "grad_norm": 0.7986917495727539, + "learning_rate": 9.880356766380582e-05, + "loss": 2.0239, + "step": 3184 + }, + { + "epoch": 0.9775936157151627, + "grad_norm": 0.853898286819458, + "learning_rate": 
9.880248657200402e-05, + "loss": 2.085, + "step": 3185 + }, + { + "epoch": 0.9779005524861878, + "grad_norm": 0.8207793235778809, + "learning_rate": 9.880140499790741e-05, + "loss": 2.0504, + "step": 3186 + }, + { + "epoch": 0.978207489257213, + "grad_norm": 0.7750336527824402, + "learning_rate": 9.880032294152673e-05, + "loss": 2.0962, + "step": 3187 + }, + { + "epoch": 0.9785144260282382, + "grad_norm": 0.7141241431236267, + "learning_rate": 9.879924040287263e-05, + "loss": 2.0655, + "step": 3188 + }, + { + "epoch": 0.9788213627992634, + "grad_norm": 0.6119080781936646, + "learning_rate": 9.879815738195585e-05, + "loss": 2.0611, + "step": 3189 + }, + { + "epoch": 0.9791282995702886, + "grad_norm": 0.5963751673698425, + "learning_rate": 9.879707387878708e-05, + "loss": 2.0978, + "step": 3190 + }, + { + "epoch": 0.9794352363413137, + "grad_norm": 0.5016428828239441, + "learning_rate": 9.879598989337703e-05, + "loss": 2.0323, + "step": 3191 + }, + { + "epoch": 0.9797421731123389, + "grad_norm": 0.5610151290893555, + "learning_rate": 9.87949054257364e-05, + "loss": 2.1362, + "step": 3192 + }, + { + "epoch": 0.980049109883364, + "grad_norm": 0.5687069296836853, + "learning_rate": 9.879382047587591e-05, + "loss": 2.0234, + "step": 3193 + }, + { + "epoch": 0.9803560466543892, + "grad_norm": 0.6210914254188538, + "learning_rate": 9.87927350438063e-05, + "loss": 2.0455, + "step": 3194 + }, + { + "epoch": 0.9806629834254144, + "grad_norm": 0.530215322971344, + "learning_rate": 9.879164912953827e-05, + "loss": 2.0607, + "step": 3195 + }, + { + "epoch": 0.9809699201964396, + "grad_norm": 0.5462486147880554, + "learning_rate": 9.879056273308258e-05, + "loss": 2.1229, + "step": 3196 + }, + { + "epoch": 0.9812768569674647, + "grad_norm": 0.5765405297279358, + "learning_rate": 9.878947585444994e-05, + "loss": 2.0575, + "step": 3197 + }, + { + "epoch": 0.9815837937384899, + "grad_norm": 0.531679630279541, + "learning_rate": 9.878838849365111e-05, + "loss": 2.0208, + "step": 3198 
+ }, + { + "epoch": 0.981890730509515, + "grad_norm": 0.5190781950950623, + "learning_rate": 9.878730065069683e-05, + "loss": 2.0073, + "step": 3199 + }, + { + "epoch": 0.9821976672805403, + "grad_norm": 0.6260761022567749, + "learning_rate": 9.878621232559784e-05, + "loss": 2.1144, + "step": 3200 + }, + { + "epoch": 0.9825046040515654, + "grad_norm": 0.664830207824707, + "learning_rate": 9.878512351836491e-05, + "loss": 2.1423, + "step": 3201 + }, + { + "epoch": 0.9828115408225906, + "grad_norm": 0.7107433676719666, + "learning_rate": 9.878403422900881e-05, + "loss": 2.0851, + "step": 3202 + }, + { + "epoch": 0.9831184775936157, + "grad_norm": 0.7426268458366394, + "learning_rate": 9.878294445754027e-05, + "loss": 2.0637, + "step": 3203 + }, + { + "epoch": 0.9834254143646409, + "grad_norm": 0.7643515467643738, + "learning_rate": 9.878185420397008e-05, + "loss": 2.0623, + "step": 3204 + }, + { + "epoch": 0.983732351135666, + "grad_norm": 0.644257664680481, + "learning_rate": 9.878076346830904e-05, + "loss": 2.103, + "step": 3205 + }, + { + "epoch": 0.9840392879066913, + "grad_norm": 0.5871284008026123, + "learning_rate": 9.877967225056787e-05, + "loss": 2.0695, + "step": 3206 + }, + { + "epoch": 0.9843462246777164, + "grad_norm": 0.6907737851142883, + "learning_rate": 9.877858055075742e-05, + "loss": 2.1148, + "step": 3207 + }, + { + "epoch": 0.9846531614487416, + "grad_norm": 0.6685691475868225, + "learning_rate": 9.877748836888843e-05, + "loss": 2.0356, + "step": 3208 + }, + { + "epoch": 0.9849600982197667, + "grad_norm": 0.797210156917572, + "learning_rate": 9.87763957049717e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.9852670349907919, + "grad_norm": 0.8397588133811951, + "learning_rate": 9.877530255901806e-05, + "loss": 2.0697, + "step": 3210 + }, + { + "epoch": 0.985573971761817, + "grad_norm": 0.6988976001739502, + "learning_rate": 9.877420893103828e-05, + "loss": 2.0676, + "step": 3211 + }, + { + "epoch": 0.9858809085328423, + "grad_norm": 
0.5828577876091003, + "learning_rate": 9.877311482104319e-05, + "loss": 2.0988, + "step": 3212 + }, + { + "epoch": 0.9861878453038674, + "grad_norm": 0.66143798828125, + "learning_rate": 9.877202022904359e-05, + "loss": 2.101, + "step": 3213 + }, + { + "epoch": 0.9864947820748926, + "grad_norm": 0.7351155877113342, + "learning_rate": 9.877092515505028e-05, + "loss": 2.0198, + "step": 3214 + }, + { + "epoch": 0.9868017188459177, + "grad_norm": 0.6817437410354614, + "learning_rate": 9.876982959907413e-05, + "loss": 2.1182, + "step": 3215 + }, + { + "epoch": 0.9871086556169429, + "grad_norm": 0.6640676259994507, + "learning_rate": 9.876873356112592e-05, + "loss": 2.1264, + "step": 3216 + }, + { + "epoch": 0.987415592387968, + "grad_norm": 0.6146695017814636, + "learning_rate": 9.876763704121652e-05, + "loss": 2.0378, + "step": 3217 + }, + { + "epoch": 0.9877225291589933, + "grad_norm": 0.6681298017501831, + "learning_rate": 9.876654003935672e-05, + "loss": 2.1916, + "step": 3218 + }, + { + "epoch": 0.9880294659300184, + "grad_norm": 0.7407983541488647, + "learning_rate": 9.876544255555742e-05, + "loss": 2.0996, + "step": 3219 + }, + { + "epoch": 0.9883364027010436, + "grad_norm": 0.5995208621025085, + "learning_rate": 9.876434458982941e-05, + "loss": 2.0023, + "step": 3220 + }, + { + "epoch": 0.9886433394720687, + "grad_norm": 0.6491377949714661, + "learning_rate": 9.876324614218357e-05, + "loss": 2.129, + "step": 3221 + }, + { + "epoch": 0.988950276243094, + "grad_norm": 0.6356569528579712, + "learning_rate": 9.876214721263074e-05, + "loss": 2.1396, + "step": 3222 + }, + { + "epoch": 0.9892572130141191, + "grad_norm": 0.6149557828903198, + "learning_rate": 9.876104780118182e-05, + "loss": 2.0204, + "step": 3223 + }, + { + "epoch": 0.9895641497851443, + "grad_norm": 0.600841224193573, + "learning_rate": 9.875994790784764e-05, + "loss": 2.0585, + "step": 3224 + }, + { + "epoch": 0.9898710865561694, + "grad_norm": 0.6398041248321533, + "learning_rate": 
9.875884753263906e-05, + "loss": 2.1296, + "step": 3225 + }, + { + "epoch": 0.9901780233271946, + "grad_norm": 0.5978466272354126, + "learning_rate": 9.875774667556697e-05, + "loss": 1.9765, + "step": 3226 + }, + { + "epoch": 0.9904849600982197, + "grad_norm": 0.49499931931495667, + "learning_rate": 9.875664533664227e-05, + "loss": 2.0516, + "step": 3227 + }, + { + "epoch": 0.990791896869245, + "grad_norm": 0.5660768151283264, + "learning_rate": 9.875554351587579e-05, + "loss": 2.0743, + "step": 3228 + }, + { + "epoch": 0.9910988336402701, + "grad_norm": 0.56971275806427, + "learning_rate": 9.875444121327849e-05, + "loss": 2.0794, + "step": 3229 + }, + { + "epoch": 0.9914057704112953, + "grad_norm": 0.5806300044059753, + "learning_rate": 9.87533384288612e-05, + "loss": 2.1636, + "step": 3230 + }, + { + "epoch": 0.9917127071823204, + "grad_norm": 0.5485837459564209, + "learning_rate": 9.875223516263485e-05, + "loss": 2.025, + "step": 3231 + }, + { + "epoch": 0.9920196439533456, + "grad_norm": 0.6353451013565063, + "learning_rate": 9.875113141461034e-05, + "loss": 2.1033, + "step": 3232 + }, + { + "epoch": 0.9923265807243707, + "grad_norm": 0.577608048915863, + "learning_rate": 9.875002718479858e-05, + "loss": 2.1306, + "step": 3233 + }, + { + "epoch": 0.992633517495396, + "grad_norm": 0.5305901765823364, + "learning_rate": 9.874892247321046e-05, + "loss": 2.1123, + "step": 3234 + }, + { + "epoch": 0.9929404542664211, + "grad_norm": 0.5554118752479553, + "learning_rate": 9.874781727985693e-05, + "loss": 2.0524, + "step": 3235 + }, + { + "epoch": 0.9932473910374463, + "grad_norm": 0.48555269837379456, + "learning_rate": 9.87467116047489e-05, + "loss": 2.0699, + "step": 3236 + }, + { + "epoch": 0.9935543278084714, + "grad_norm": 0.578976035118103, + "learning_rate": 9.874560544789729e-05, + "loss": 2.0747, + "step": 3237 + }, + { + "epoch": 0.9938612645794966, + "grad_norm": 0.5508282780647278, + "learning_rate": 9.874449880931304e-05, + "loss": 2.0947, + "step": 3238 
+ }, + { + "epoch": 0.9941682013505218, + "grad_norm": 0.5458595752716064, + "learning_rate": 9.874339168900707e-05, + "loss": 2.0417, + "step": 3239 + }, + { + "epoch": 0.994475138121547, + "grad_norm": 0.5668261647224426, + "learning_rate": 9.874228408699035e-05, + "loss": 2.0948, + "step": 3240 + }, + { + "epoch": 0.9947820748925721, + "grad_norm": 0.6127253174781799, + "learning_rate": 9.87411760032738e-05, + "loss": 2.0904, + "step": 3241 + }, + { + "epoch": 0.9950890116635973, + "grad_norm": 0.5736191868782043, + "learning_rate": 9.874006743786839e-05, + "loss": 2.0637, + "step": 3242 + }, + { + "epoch": 0.9953959484346224, + "grad_norm": 0.574163019657135, + "learning_rate": 9.873895839078507e-05, + "loss": 2.0925, + "step": 3243 + }, + { + "epoch": 0.9957028852056476, + "grad_norm": 0.5660602450370789, + "learning_rate": 9.873784886203478e-05, + "loss": 2.0743, + "step": 3244 + }, + { + "epoch": 0.9960098219766728, + "grad_norm": 0.6037993431091309, + "learning_rate": 9.87367388516285e-05, + "loss": 2.1274, + "step": 3245 + }, + { + "epoch": 0.996316758747698, + "grad_norm": 0.5664488673210144, + "learning_rate": 9.873562835957722e-05, + "loss": 2.0403, + "step": 3246 + }, + { + "epoch": 0.9966236955187231, + "grad_norm": 0.6170254349708557, + "learning_rate": 9.873451738589188e-05, + "loss": 2.0198, + "step": 3247 + }, + { + "epoch": 0.9969306322897483, + "grad_norm": 0.5582032799720764, + "learning_rate": 9.873340593058348e-05, + "loss": 2.1494, + "step": 3248 + }, + { + "epoch": 0.9972375690607734, + "grad_norm": 0.5565598607063293, + "learning_rate": 9.8732293993663e-05, + "loss": 2.1062, + "step": 3249 + }, + { + "epoch": 0.9975445058317987, + "grad_norm": 0.5526474118232727, + "learning_rate": 9.873118157514142e-05, + "loss": 2.1184, + "step": 3250 + }, + { + "epoch": 0.9978514426028238, + "grad_norm": 0.5864302515983582, + "learning_rate": 9.873006867502975e-05, + "loss": 2.1389, + "step": 3251 + }, + { + "epoch": 0.998158379373849, + "grad_norm": 
0.5295118689537048, + "learning_rate": 9.872895529333899e-05, + "loss": 2.05, + "step": 3252 + }, + { + "epoch": 0.9984653161448741, + "grad_norm": 0.553537905216217, + "learning_rate": 9.872784143008012e-05, + "loss": 2.0464, + "step": 3253 + }, + { + "epoch": 0.9987722529158993, + "grad_norm": 0.558159589767456, + "learning_rate": 9.872672708526416e-05, + "loss": 2.1013, + "step": 3254 + }, + { + "epoch": 0.9990791896869244, + "grad_norm": 0.5479860901832581, + "learning_rate": 9.872561225890211e-05, + "loss": 2.0497, + "step": 3255 + }, + { + "epoch": 0.9993861264579497, + "grad_norm": 0.5538234114646912, + "learning_rate": 9.872449695100503e-05, + "loss": 2.1239, + "step": 3256 + }, + { + "epoch": 0.9996930632289748, + "grad_norm": 0.5970771908760071, + "learning_rate": 9.872338116158389e-05, + "loss": 2.0693, + "step": 3257 + }, + { + "epoch": 1.0, + "grad_norm": 0.5118132829666138, + "learning_rate": 9.872226489064975e-05, + "loss": 2.0302, + "step": 3258 + }, + { + "epoch": 1.0003069367710251, + "grad_norm": 0.538902223110199, + "learning_rate": 9.872114813821363e-05, + "loss": 2.0604, + "step": 3259 + }, + { + "epoch": 1.0006138735420504, + "grad_norm": 0.47124916315078735, + "learning_rate": 9.872003090428657e-05, + "loss": 2.054, + "step": 3260 + }, + { + "epoch": 1.0009208103130756, + "grad_norm": 0.5109235048294067, + "learning_rate": 9.87189131888796e-05, + "loss": 2.0107, + "step": 3261 + }, + { + "epoch": 1.0012277470841007, + "grad_norm": 0.5530306696891785, + "learning_rate": 9.871779499200377e-05, + "loss": 2.0914, + "step": 3262 + }, + { + "epoch": 1.0015346838551258, + "grad_norm": 0.6271992325782776, + "learning_rate": 9.871667631367017e-05, + "loss": 1.9855, + "step": 3263 + }, + { + "epoch": 1.0018416206261511, + "grad_norm": 0.5752004384994507, + "learning_rate": 9.871555715388978e-05, + "loss": 2.0689, + "step": 3264 + }, + { + "epoch": 1.0021485573971762, + "grad_norm": 0.6185278296470642, + "learning_rate": 9.871443751267373e-05, + 
"loss": 2.0751, + "step": 3265 + }, + { + "epoch": 1.0024554941682013, + "grad_norm": 0.625248908996582, + "learning_rate": 9.871331739003304e-05, + "loss": 2.102, + "step": 3266 + }, + { + "epoch": 1.0027624309392265, + "grad_norm": 0.6345300078392029, + "learning_rate": 9.87121967859788e-05, + "loss": 2.0898, + "step": 3267 + }, + { + "epoch": 1.0030693677102518, + "grad_norm": 0.6836622953414917, + "learning_rate": 9.871107570052207e-05, + "loss": 2.1348, + "step": 3268 + }, + { + "epoch": 1.003376304481277, + "grad_norm": 0.699739933013916, + "learning_rate": 9.870995413367397e-05, + "loss": 2.0085, + "step": 3269 + }, + { + "epoch": 1.003683241252302, + "grad_norm": 0.650558590888977, + "learning_rate": 9.870883208544553e-05, + "loss": 2.0927, + "step": 3270 + }, + { + "epoch": 1.0039901780233271, + "grad_norm": 0.6837300658226013, + "learning_rate": 9.870770955584785e-05, + "loss": 2.1415, + "step": 3271 + }, + { + "epoch": 1.0042971147943525, + "grad_norm": 0.595761239528656, + "learning_rate": 9.870658654489206e-05, + "loss": 2.0372, + "step": 3272 + }, + { + "epoch": 1.0046040515653776, + "grad_norm": 0.5177203416824341, + "learning_rate": 9.870546305258922e-05, + "loss": 2.053, + "step": 3273 + }, + { + "epoch": 1.0049109883364027, + "grad_norm": 0.5392438173294067, + "learning_rate": 9.870433907895045e-05, + "loss": 2.0886, + "step": 3274 + }, + { + "epoch": 1.0052179251074278, + "grad_norm": 0.594776451587677, + "learning_rate": 9.870321462398686e-05, + "loss": 2.0158, + "step": 3275 + }, + { + "epoch": 1.0055248618784531, + "grad_norm": 0.6363179683685303, + "learning_rate": 9.870208968770955e-05, + "loss": 2.0532, + "step": 3276 + }, + { + "epoch": 1.0058317986494782, + "grad_norm": 0.7506567239761353, + "learning_rate": 9.870096427012965e-05, + "loss": 2.1288, + "step": 3277 + }, + { + "epoch": 1.0061387354205034, + "grad_norm": 0.7155289053916931, + "learning_rate": 9.869983837125828e-05, + "loss": 2.0859, + "step": 3278 + }, + { + "epoch": 
1.0064456721915285, + "grad_norm": 0.7589760422706604, + "learning_rate": 9.869871199110656e-05, + "loss": 2.1668, + "step": 3279 + }, + { + "epoch": 1.0067526089625538, + "grad_norm": 0.6161168217658997, + "learning_rate": 9.869758512968562e-05, + "loss": 2.0421, + "step": 3280 + }, + { + "epoch": 1.007059545733579, + "grad_norm": 0.5722637176513672, + "learning_rate": 9.86964577870066e-05, + "loss": 2.1333, + "step": 3281 + }, + { + "epoch": 1.007366482504604, + "grad_norm": 0.6443020701408386, + "learning_rate": 9.869532996308065e-05, + "loss": 2.0227, + "step": 3282 + }, + { + "epoch": 1.0076734192756291, + "grad_norm": 0.6603342890739441, + "learning_rate": 9.869420165791891e-05, + "loss": 2.0888, + "step": 3283 + }, + { + "epoch": 1.0079803560466545, + "grad_norm": 0.6666482090950012, + "learning_rate": 9.869307287153251e-05, + "loss": 2.0132, + "step": 3284 + }, + { + "epoch": 1.0082872928176796, + "grad_norm": 0.6691575646400452, + "learning_rate": 9.869194360393264e-05, + "loss": 2.0752, + "step": 3285 + }, + { + "epoch": 1.0085942295887047, + "grad_norm": 0.6142565011978149, + "learning_rate": 9.869081385513044e-05, + "loss": 2.0491, + "step": 3286 + }, + { + "epoch": 1.0089011663597298, + "grad_norm": 0.5869930386543274, + "learning_rate": 9.868968362513708e-05, + "loss": 2.1252, + "step": 3287 + }, + { + "epoch": 1.0092081031307552, + "grad_norm": 0.532183825969696, + "learning_rate": 9.868855291396373e-05, + "loss": 2.0589, + "step": 3288 + }, + { + "epoch": 1.0095150399017803, + "grad_norm": 0.616374135017395, + "learning_rate": 9.868742172162156e-05, + "loss": 2.0808, + "step": 3289 + }, + { + "epoch": 1.0098219766728054, + "grad_norm": 0.5750923156738281, + "learning_rate": 9.868629004812176e-05, + "loss": 2.0407, + "step": 3290 + }, + { + "epoch": 1.0101289134438305, + "grad_norm": 0.6161531209945679, + "learning_rate": 9.86851578934755e-05, + "loss": 2.0938, + "step": 3291 + }, + { + "epoch": 1.0104358502148558, + "grad_norm": 0.5369158983230591, 
+ "learning_rate": 9.868402525769397e-05, + "loss": 2.1298, + "step": 3292 + }, + { + "epoch": 1.010742786985881, + "grad_norm": 0.5134824514389038, + "learning_rate": 9.868289214078837e-05, + "loss": 2.0345, + "step": 3293 + }, + { + "epoch": 1.011049723756906, + "grad_norm": 0.4972594082355499, + "learning_rate": 9.868175854276991e-05, + "loss": 2.1264, + "step": 3294 + }, + { + "epoch": 1.0113566605279312, + "grad_norm": 0.5727534890174866, + "learning_rate": 9.868062446364976e-05, + "loss": 2.1668, + "step": 3295 + }, + { + "epoch": 1.0116635972989565, + "grad_norm": 0.6384626030921936, + "learning_rate": 9.867948990343915e-05, + "loss": 2.1125, + "step": 3296 + }, + { + "epoch": 1.0119705340699816, + "grad_norm": 0.7591070532798767, + "learning_rate": 9.867835486214929e-05, + "loss": 2.0975, + "step": 3297 + }, + { + "epoch": 1.0122774708410067, + "grad_norm": 0.7940282821655273, + "learning_rate": 9.86772193397914e-05, + "loss": 2.0107, + "step": 3298 + }, + { + "epoch": 1.0125844076120318, + "grad_norm": 0.6877933144569397, + "learning_rate": 9.86760833363767e-05, + "loss": 2.0684, + "step": 3299 + }, + { + "epoch": 1.0128913443830572, + "grad_norm": 0.5361137986183167, + "learning_rate": 9.867494685191641e-05, + "loss": 2.0426, + "step": 3300 + }, + { + "epoch": 1.0131982811540823, + "grad_norm": 0.5104349851608276, + "learning_rate": 9.867380988642177e-05, + "loss": 2.0849, + "step": 3301 + }, + { + "epoch": 1.0135052179251074, + "grad_norm": 0.6133849024772644, + "learning_rate": 9.867267243990399e-05, + "loss": 2.0789, + "step": 3302 + }, + { + "epoch": 1.0138121546961325, + "grad_norm": 0.6607559323310852, + "learning_rate": 9.867153451237436e-05, + "loss": 2.0978, + "step": 3303 + }, + { + "epoch": 1.0141190914671578, + "grad_norm": 0.6853774189949036, + "learning_rate": 9.867039610384409e-05, + "loss": 2.1612, + "step": 3304 + }, + { + "epoch": 1.014426028238183, + "grad_norm": 0.6326626539230347, + "learning_rate": 9.866925721432442e-05, + "loss": 
2.0887, + "step": 3305 + }, + { + "epoch": 1.014732965009208, + "grad_norm": 0.5483830571174622, + "learning_rate": 9.866811784382665e-05, + "loss": 2.0522, + "step": 3306 + }, + { + "epoch": 1.0150399017802332, + "grad_norm": 0.5980744957923889, + "learning_rate": 9.866697799236201e-05, + "loss": 2.0666, + "step": 3307 + }, + { + "epoch": 1.0153468385512585, + "grad_norm": 0.6047075986862183, + "learning_rate": 9.866583765994177e-05, + "loss": 2.0924, + "step": 3308 + }, + { + "epoch": 1.0156537753222836, + "grad_norm": 0.5932674407958984, + "learning_rate": 9.86646968465772e-05, + "loss": 2.0426, + "step": 3309 + }, + { + "epoch": 1.0159607120933087, + "grad_norm": 0.5349873304367065, + "learning_rate": 9.866355555227957e-05, + "loss": 2.027, + "step": 3310 + }, + { + "epoch": 1.0162676488643339, + "grad_norm": 0.5090891122817993, + "learning_rate": 9.866241377706015e-05, + "loss": 2.0554, + "step": 3311 + }, + { + "epoch": 1.0165745856353592, + "grad_norm": 0.605268120765686, + "learning_rate": 9.866127152093025e-05, + "loss": 2.0788, + "step": 3312 + }, + { + "epoch": 1.0168815224063843, + "grad_norm": 0.6006563305854797, + "learning_rate": 9.866012878390113e-05, + "loss": 2.0154, + "step": 3313 + }, + { + "epoch": 1.0171884591774094, + "grad_norm": 0.6412727236747742, + "learning_rate": 9.865898556598409e-05, + "loss": 2.0948, + "step": 3314 + }, + { + "epoch": 1.0174953959484345, + "grad_norm": 0.512140154838562, + "learning_rate": 9.865784186719046e-05, + "loss": 2.0314, + "step": 3315 + }, + { + "epoch": 1.0178023327194599, + "grad_norm": 0.48285913467407227, + "learning_rate": 9.865669768753151e-05, + "loss": 1.9689, + "step": 3316 + }, + { + "epoch": 1.018109269490485, + "grad_norm": 0.6067737340927124, + "learning_rate": 9.865555302701854e-05, + "loss": 2.1042, + "step": 3317 + }, + { + "epoch": 1.01841620626151, + "grad_norm": 0.6272363662719727, + "learning_rate": 9.865440788566289e-05, + "loss": 2.1092, + "step": 3318 + }, + { + "epoch": 
1.0187231430325352, + "grad_norm": 0.6264182925224304, + "learning_rate": 9.865326226347586e-05, + "loss": 2.0445, + "step": 3319 + }, + { + "epoch": 1.0190300798035605, + "grad_norm": 0.5642834901809692, + "learning_rate": 9.86521161604688e-05, + "loss": 2.1041, + "step": 3320 + }, + { + "epoch": 1.0193370165745856, + "grad_norm": 0.5188324451446533, + "learning_rate": 9.865096957665297e-05, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 1.0196439533456108, + "grad_norm": 0.5204416513442993, + "learning_rate": 9.864982251203976e-05, + "loss": 2.0927, + "step": 3322 + }, + { + "epoch": 1.0199508901166359, + "grad_norm": 0.5845292806625366, + "learning_rate": 9.86486749666405e-05, + "loss": 2.0751, + "step": 3323 + }, + { + "epoch": 1.0202578268876612, + "grad_norm": 0.5514994263648987, + "learning_rate": 9.86475269404665e-05, + "loss": 2.0976, + "step": 3324 + }, + { + "epoch": 1.0205647636586863, + "grad_norm": 0.6578981280326843, + "learning_rate": 9.864637843352915e-05, + "loss": 2.0668, + "step": 3325 + }, + { + "epoch": 1.0208717004297114, + "grad_norm": 0.6396434307098389, + "learning_rate": 9.864522944583976e-05, + "loss": 2.0648, + "step": 3326 + }, + { + "epoch": 1.0211786372007365, + "grad_norm": 0.548759400844574, + "learning_rate": 9.86440799774097e-05, + "loss": 2.0873, + "step": 3327 + }, + { + "epoch": 1.0214855739717619, + "grad_norm": 0.5739279985427856, + "learning_rate": 9.864293002825033e-05, + "loss": 2.0623, + "step": 3328 + }, + { + "epoch": 1.021792510742787, + "grad_norm": 0.5882315039634705, + "learning_rate": 9.864177959837303e-05, + "loss": 2.0399, + "step": 3329 + }, + { + "epoch": 1.022099447513812, + "grad_norm": 0.563359797000885, + "learning_rate": 9.864062868778914e-05, + "loss": 2.0839, + "step": 3330 + }, + { + "epoch": 1.0224063842848374, + "grad_norm": 0.6162607073783875, + "learning_rate": 9.863947729651006e-05, + "loss": 2.0439, + "step": 3331 + }, + { + "epoch": 1.0227133210558625, + "grad_norm": 0.6540365815162659, + 
"learning_rate": 9.863832542454715e-05, + "loss": 2.1234, + "step": 3332 + }, + { + "epoch": 1.0230202578268877, + "grad_norm": 0.6401089429855347, + "learning_rate": 9.86371730719118e-05, + "loss": 2.0418, + "step": 3333 + }, + { + "epoch": 1.0233271945979128, + "grad_norm": 0.6456391215324402, + "learning_rate": 9.86360202386154e-05, + "loss": 2.1191, + "step": 3334 + }, + { + "epoch": 1.023634131368938, + "grad_norm": 0.59992516040802, + "learning_rate": 9.863486692466933e-05, + "loss": 2.0582, + "step": 3335 + }, + { + "epoch": 1.0239410681399632, + "grad_norm": 0.5932520627975464, + "learning_rate": 9.8633713130085e-05, + "loss": 2.1812, + "step": 3336 + }, + { + "epoch": 1.0242480049109883, + "grad_norm": 0.6322866082191467, + "learning_rate": 9.863255885487384e-05, + "loss": 2.1523, + "step": 3337 + }, + { + "epoch": 1.0245549416820134, + "grad_norm": 0.6291313171386719, + "learning_rate": 9.863140409904719e-05, + "loss": 2.0495, + "step": 3338 + }, + { + "epoch": 1.0248618784530388, + "grad_norm": 0.6272565126419067, + "learning_rate": 9.863024886261653e-05, + "loss": 1.9812, + "step": 3339 + }, + { + "epoch": 1.025168815224064, + "grad_norm": 0.6485729217529297, + "learning_rate": 9.862909314559323e-05, + "loss": 2.0826, + "step": 3340 + }, + { + "epoch": 1.025475751995089, + "grad_norm": 0.608239471912384, + "learning_rate": 9.862793694798875e-05, + "loss": 2.0519, + "step": 3341 + }, + { + "epoch": 1.0257826887661141, + "grad_norm": 0.5492779612541199, + "learning_rate": 9.862678026981447e-05, + "loss": 1.9901, + "step": 3342 + }, + { + "epoch": 1.0260896255371394, + "grad_norm": 0.524030327796936, + "learning_rate": 9.862562311108187e-05, + "loss": 2.0695, + "step": 3343 + }, + { + "epoch": 1.0263965623081646, + "grad_norm": 0.6835227608680725, + "learning_rate": 9.862446547180235e-05, + "loss": 2.1312, + "step": 3344 + }, + { + "epoch": 1.0267034990791897, + "grad_norm": 0.6771748065948486, + "learning_rate": 9.862330735198736e-05, + "loss": 2.0566, + 
"step": 3345 + }, + { + "epoch": 1.0270104358502148, + "grad_norm": 0.609993577003479, + "learning_rate": 9.862214875164835e-05, + "loss": 2.1463, + "step": 3346 + }, + { + "epoch": 1.0273173726212401, + "grad_norm": 0.6617777347564697, + "learning_rate": 9.862098967079677e-05, + "loss": 2.0485, + "step": 3347 + }, + { + "epoch": 1.0276243093922652, + "grad_norm": 0.7935113906860352, + "learning_rate": 9.861983010944407e-05, + "loss": 2.0528, + "step": 3348 + }, + { + "epoch": 1.0279312461632903, + "grad_norm": 0.7510255575180054, + "learning_rate": 9.861867006760172e-05, + "loss": 1.9803, + "step": 3349 + }, + { + "epoch": 1.0282381829343155, + "grad_norm": 0.6944519281387329, + "learning_rate": 9.861750954528117e-05, + "loss": 2.0488, + "step": 3350 + }, + { + "epoch": 1.0285451197053408, + "grad_norm": 0.6057126522064209, + "learning_rate": 9.861634854249389e-05, + "loss": 2.1465, + "step": 3351 + }, + { + "epoch": 1.028852056476366, + "grad_norm": 0.6156182289123535, + "learning_rate": 9.861518705925135e-05, + "loss": 2.1227, + "step": 3352 + }, + { + "epoch": 1.029158993247391, + "grad_norm": 0.6016978621482849, + "learning_rate": 9.861402509556506e-05, + "loss": 2.0238, + "step": 3353 + }, + { + "epoch": 1.0294659300184161, + "grad_norm": 0.5987950563430786, + "learning_rate": 9.861286265144648e-05, + "loss": 2.0529, + "step": 3354 + }, + { + "epoch": 1.0297728667894415, + "grad_norm": 0.6011384725570679, + "learning_rate": 9.861169972690707e-05, + "loss": 2.0612, + "step": 3355 + }, + { + "epoch": 1.0300798035604666, + "grad_norm": 0.5217840671539307, + "learning_rate": 9.861053632195838e-05, + "loss": 2.0472, + "step": 3356 + }, + { + "epoch": 1.0303867403314917, + "grad_norm": 0.5202180743217468, + "learning_rate": 9.860937243661186e-05, + "loss": 2.1301, + "step": 3357 + }, + { + "epoch": 1.0306936771025168, + "grad_norm": 0.572290301322937, + "learning_rate": 9.860820807087905e-05, + "loss": 2.0309, + "step": 3358 + }, + { + "epoch": 1.0310006138735421, 
+ "grad_norm": 0.5088694095611572, + "learning_rate": 9.860704322477142e-05, + "loss": 2.0789, + "step": 3359 + }, + { + "epoch": 1.0313075506445673, + "grad_norm": 0.5546056032180786, + "learning_rate": 9.860587789830052e-05, + "loss": 1.9708, + "step": 3360 + }, + { + "epoch": 1.0316144874155924, + "grad_norm": 0.5152996182441711, + "learning_rate": 9.860471209147782e-05, + "loss": 2.0656, + "step": 3361 + }, + { + "epoch": 1.0319214241866175, + "grad_norm": 0.4997018873691559, + "learning_rate": 9.860354580431488e-05, + "loss": 2.1404, + "step": 3362 + }, + { + "epoch": 1.0322283609576428, + "grad_norm": 0.5464209318161011, + "learning_rate": 9.860237903682321e-05, + "loss": 2.0013, + "step": 3363 + }, + { + "epoch": 1.032535297728668, + "grad_norm": 0.4934932589530945, + "learning_rate": 9.860121178901435e-05, + "loss": 2.0873, + "step": 3364 + }, + { + "epoch": 1.032842234499693, + "grad_norm": 0.5755184292793274, + "learning_rate": 9.860004406089982e-05, + "loss": 2.0706, + "step": 3365 + }, + { + "epoch": 1.0331491712707181, + "grad_norm": 0.6155427098274231, + "learning_rate": 9.859887585249117e-05, + "loss": 2.1153, + "step": 3366 + }, + { + "epoch": 1.0334561080417435, + "grad_norm": 0.6251068711280823, + "learning_rate": 9.859770716379995e-05, + "loss": 1.9988, + "step": 3367 + }, + { + "epoch": 1.0337630448127686, + "grad_norm": 0.5652515888214111, + "learning_rate": 9.85965379948377e-05, + "loss": 1.9834, + "step": 3368 + }, + { + "epoch": 1.0340699815837937, + "grad_norm": 0.49031418561935425, + "learning_rate": 9.859536834561599e-05, + "loss": 2.0719, + "step": 3369 + }, + { + "epoch": 1.0343769183548188, + "grad_norm": 0.5014585852622986, + "learning_rate": 9.859419821614635e-05, + "loss": 2.0309, + "step": 3370 + }, + { + "epoch": 1.0346838551258442, + "grad_norm": 0.5657221674919128, + "learning_rate": 9.859302760644036e-05, + "loss": 2.048, + "step": 3371 + }, + { + "epoch": 1.0349907918968693, + "grad_norm": 0.7023506164550781, + 
"learning_rate": 9.85918565165096e-05, + "loss": 2.033, + "step": 3372 + }, + { + "epoch": 1.0352977286678944, + "grad_norm": 0.5712850689888, + "learning_rate": 9.859068494636565e-05, + "loss": 2.1006, + "step": 3373 + }, + { + "epoch": 1.0356046654389195, + "grad_norm": 0.5352653861045837, + "learning_rate": 9.858951289602004e-05, + "loss": 1.9775, + "step": 3374 + }, + { + "epoch": 1.0359116022099448, + "grad_norm": 0.5282073616981506, + "learning_rate": 9.85883403654844e-05, + "loss": 2.0388, + "step": 3375 + }, + { + "epoch": 1.03621853898097, + "grad_norm": 0.6164727210998535, + "learning_rate": 9.85871673547703e-05, + "loss": 2.0758, + "step": 3376 + }, + { + "epoch": 1.036525475751995, + "grad_norm": 0.6034660935401917, + "learning_rate": 9.858599386388933e-05, + "loss": 2.0619, + "step": 3377 + }, + { + "epoch": 1.0368324125230202, + "grad_norm": 0.6129952073097229, + "learning_rate": 9.85848198928531e-05, + "loss": 2.0709, + "step": 3378 + }, + { + "epoch": 1.0371393492940455, + "grad_norm": 0.6287248134613037, + "learning_rate": 9.85836454416732e-05, + "loss": 2.1493, + "step": 3379 + }, + { + "epoch": 1.0374462860650706, + "grad_norm": 0.675419807434082, + "learning_rate": 9.858247051036124e-05, + "loss": 2.0558, + "step": 3380 + }, + { + "epoch": 1.0377532228360957, + "grad_norm": 0.6493481397628784, + "learning_rate": 9.858129509892882e-05, + "loss": 2.2019, + "step": 3381 + }, + { + "epoch": 1.0380601596071208, + "grad_norm": 0.6690036058425903, + "learning_rate": 9.85801192073876e-05, + "loss": 2.0069, + "step": 3382 + }, + { + "epoch": 1.0383670963781462, + "grad_norm": 0.6682954430580139, + "learning_rate": 9.857894283574913e-05, + "loss": 2.0559, + "step": 3383 + }, + { + "epoch": 1.0386740331491713, + "grad_norm": 0.6408236622810364, + "learning_rate": 9.857776598402508e-05, + "loss": 2.0837, + "step": 3384 + }, + { + "epoch": 1.0389809699201964, + "grad_norm": 0.7896385192871094, + "learning_rate": 9.85765886522271e-05, + "loss": 2.1344, + 
"step": 3385 + }, + { + "epoch": 1.0392879066912215, + "grad_norm": 0.7404007911682129, + "learning_rate": 9.857541084036677e-05, + "loss": 2.0937, + "step": 3386 + }, + { + "epoch": 1.0395948434622468, + "grad_norm": 0.6780609488487244, + "learning_rate": 9.857423254845577e-05, + "loss": 2.0279, + "step": 3387 + }, + { + "epoch": 1.039901780233272, + "grad_norm": 0.5989474654197693, + "learning_rate": 9.857305377650574e-05, + "loss": 2.0997, + "step": 3388 + }, + { + "epoch": 1.040208717004297, + "grad_norm": 0.5449484586715698, + "learning_rate": 9.857187452452832e-05, + "loss": 2.0544, + "step": 3389 + }, + { + "epoch": 1.0405156537753222, + "grad_norm": 0.6261779069900513, + "learning_rate": 9.857069479253516e-05, + "loss": 2.024, + "step": 3390 + }, + { + "epoch": 1.0408225905463475, + "grad_norm": 0.6665713787078857, + "learning_rate": 9.856951458053794e-05, + "loss": 2.1139, + "step": 3391 + }, + { + "epoch": 1.0411295273173726, + "grad_norm": 0.5861490964889526, + "learning_rate": 9.856833388854829e-05, + "loss": 2.0087, + "step": 3392 + }, + { + "epoch": 1.0414364640883977, + "grad_norm": 0.5511623620986938, + "learning_rate": 9.856715271657793e-05, + "loss": 2.106, + "step": 3393 + }, + { + "epoch": 1.0417434008594229, + "grad_norm": 0.5450705885887146, + "learning_rate": 9.856597106463848e-05, + "loss": 2.0669, + "step": 3394 + }, + { + "epoch": 1.0420503376304482, + "grad_norm": 0.5172801613807678, + "learning_rate": 9.856478893274163e-05, + "loss": 2.0492, + "step": 3395 + }, + { + "epoch": 1.0423572744014733, + "grad_norm": 0.580157458782196, + "learning_rate": 9.856360632089907e-05, + "loss": 2.0794, + "step": 3396 + }, + { + "epoch": 1.0426642111724984, + "grad_norm": 0.5138662457466125, + "learning_rate": 9.856242322912251e-05, + "loss": 2.0813, + "step": 3397 + }, + { + "epoch": 1.0429711479435237, + "grad_norm": 0.5626689791679382, + "learning_rate": 9.85612396574236e-05, + "loss": 2.071, + "step": 3398 + }, + { + "epoch": 1.0432780847145489, + 
"grad_norm": 0.6069894433021545, + "learning_rate": 9.856005560581407e-05, + "loss": 2.132, + "step": 3399 + }, + { + "epoch": 1.043585021485574, + "grad_norm": 0.547346293926239, + "learning_rate": 9.85588710743056e-05, + "loss": 2.0572, + "step": 3400 + }, + { + "epoch": 1.043891958256599, + "grad_norm": 0.5712311863899231, + "learning_rate": 9.855768606290992e-05, + "loss": 2.0943, + "step": 3401 + }, + { + "epoch": 1.0441988950276242, + "grad_norm": 0.5945014953613281, + "learning_rate": 9.85565005716387e-05, + "loss": 2.1004, + "step": 3402 + }, + { + "epoch": 1.0445058317986495, + "grad_norm": 0.5712563395500183, + "learning_rate": 9.85553146005037e-05, + "loss": 2.0817, + "step": 3403 + }, + { + "epoch": 1.0448127685696746, + "grad_norm": 0.552578866481781, + "learning_rate": 9.855412814951661e-05, + "loss": 2.0514, + "step": 3404 + }, + { + "epoch": 1.0451197053406998, + "grad_norm": 0.5654930472373962, + "learning_rate": 9.855294121868918e-05, + "loss": 2.1342, + "step": 3405 + }, + { + "epoch": 1.045426642111725, + "grad_norm": 0.516094446182251, + "learning_rate": 9.855175380803312e-05, + "loss": 2.01, + "step": 3406 + }, + { + "epoch": 1.0457335788827502, + "grad_norm": 0.5198549628257751, + "learning_rate": 9.855056591756018e-05, + "loss": 2.0423, + "step": 3407 + }, + { + "epoch": 1.0460405156537753, + "grad_norm": 0.45312678813934326, + "learning_rate": 9.854937754728209e-05, + "loss": 1.9767, + "step": 3408 + }, + { + "epoch": 1.0463474524248004, + "grad_norm": 0.4647958278656006, + "learning_rate": 9.854818869721059e-05, + "loss": 2.107, + "step": 3409 + }, + { + "epoch": 1.0466543891958258, + "grad_norm": 0.5034347772598267, + "learning_rate": 9.854699936735742e-05, + "loss": 2.0358, + "step": 3410 + }, + { + "epoch": 1.0469613259668509, + "grad_norm": 0.48189103603363037, + "learning_rate": 9.854580955773435e-05, + "loss": 2.0441, + "step": 3411 + }, + { + "epoch": 1.047268262737876, + "grad_norm": 0.5315099954605103, + "learning_rate": 
9.854461926835316e-05, + "loss": 2.0222, + "step": 3412 + }, + { + "epoch": 1.047575199508901, + "grad_norm": 0.6013970971107483, + "learning_rate": 9.854342849922557e-05, + "loss": 2.09, + "step": 3413 + }, + { + "epoch": 1.0478821362799264, + "grad_norm": 0.7554240226745605, + "learning_rate": 9.854223725036339e-05, + "loss": 2.0411, + "step": 3414 + }, + { + "epoch": 1.0481890730509515, + "grad_norm": 0.7160158157348633, + "learning_rate": 9.854104552177835e-05, + "loss": 2.0858, + "step": 3415 + }, + { + "epoch": 1.0484960098219767, + "grad_norm": 0.5641576051712036, + "learning_rate": 9.853985331348225e-05, + "loss": 2.0287, + "step": 3416 + }, + { + "epoch": 1.0488029465930018, + "grad_norm": 0.5947676301002502, + "learning_rate": 9.853866062548687e-05, + "loss": 2.1177, + "step": 3417 + }, + { + "epoch": 1.049109883364027, + "grad_norm": 0.5780991911888123, + "learning_rate": 9.853746745780401e-05, + "loss": 2.024, + "step": 3418 + }, + { + "epoch": 1.0494168201350522, + "grad_norm": 0.6753053665161133, + "learning_rate": 9.853627381044543e-05, + "loss": 2.1303, + "step": 3419 + }, + { + "epoch": 1.0497237569060773, + "grad_norm": 0.7183442711830139, + "learning_rate": 9.853507968342295e-05, + "loss": 2.0845, + "step": 3420 + }, + { + "epoch": 1.0500306936771024, + "grad_norm": 0.6768840551376343, + "learning_rate": 9.853388507674837e-05, + "loss": 2.0991, + "step": 3421 + }, + { + "epoch": 1.0503376304481278, + "grad_norm": 0.624703049659729, + "learning_rate": 9.85326899904335e-05, + "loss": 2.0952, + "step": 3422 + }, + { + "epoch": 1.050644567219153, + "grad_norm": 0.523289144039154, + "learning_rate": 9.853149442449013e-05, + "loss": 2.0244, + "step": 3423 + }, + { + "epoch": 1.050951503990178, + "grad_norm": 0.4939860701560974, + "learning_rate": 9.853029837893008e-05, + "loss": 2.0312, + "step": 3424 + }, + { + "epoch": 1.0512584407612031, + "grad_norm": 0.5685132145881653, + "learning_rate": 9.852910185376519e-05, + "loss": 2.0863, + "step": 3425 + 
}, + { + "epoch": 1.0515653775322285, + "grad_norm": 0.5713129639625549, + "learning_rate": 9.852790484900725e-05, + "loss": 2.1182, + "step": 3426 + }, + { + "epoch": 1.0518723143032536, + "grad_norm": 0.5626100301742554, + "learning_rate": 9.852670736466813e-05, + "loss": 2.0187, + "step": 3427 + }, + { + "epoch": 1.0521792510742787, + "grad_norm": 0.5129684805870056, + "learning_rate": 9.852550940075965e-05, + "loss": 2.0354, + "step": 3428 + }, + { + "epoch": 1.0524861878453038, + "grad_norm": 0.6123769879341125, + "learning_rate": 9.852431095729361e-05, + "loss": 2.1315, + "step": 3429 + }, + { + "epoch": 1.0527931246163291, + "grad_norm": 0.66834956407547, + "learning_rate": 9.852311203428192e-05, + "loss": 2.1642, + "step": 3430 + }, + { + "epoch": 1.0531000613873542, + "grad_norm": 0.6253052353858948, + "learning_rate": 9.85219126317364e-05, + "loss": 2.0651, + "step": 3431 + }, + { + "epoch": 1.0534069981583793, + "grad_norm": 0.5162510871887207, + "learning_rate": 9.852071274966888e-05, + "loss": 2.0029, + "step": 3432 + }, + { + "epoch": 1.0537139349294045, + "grad_norm": 0.5725626349449158, + "learning_rate": 9.851951238809125e-05, + "loss": 2.0875, + "step": 3433 + }, + { + "epoch": 1.0540208717004298, + "grad_norm": 0.5319885611534119, + "learning_rate": 9.851831154701537e-05, + "loss": 2.0042, + "step": 3434 + }, + { + "epoch": 1.054327808471455, + "grad_norm": 0.5030925273895264, + "learning_rate": 9.851711022645307e-05, + "loss": 1.9805, + "step": 3435 + }, + { + "epoch": 1.05463474524248, + "grad_norm": 0.5786148309707642, + "learning_rate": 9.851590842641627e-05, + "loss": 2.1456, + "step": 3436 + }, + { + "epoch": 1.0549416820135051, + "grad_norm": 0.6246622800827026, + "learning_rate": 9.851470614691682e-05, + "loss": 2.042, + "step": 3437 + }, + { + "epoch": 1.0552486187845305, + "grad_norm": 0.5181210041046143, + "learning_rate": 9.851350338796662e-05, + "loss": 2.0423, + "step": 3438 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 
0.5505120754241943, + "learning_rate": 9.851230014957754e-05, + "loss": 2.0478, + "step": 3439 + }, + { + "epoch": 1.0558624923265807, + "grad_norm": 0.6193632483482361, + "learning_rate": 9.851109643176147e-05, + "loss": 1.9904, + "step": 3440 + }, + { + "epoch": 1.0561694290976058, + "grad_norm": 0.6332803964614868, + "learning_rate": 9.85098922345303e-05, + "loss": 2.0037, + "step": 3441 + }, + { + "epoch": 1.0564763658686311, + "grad_norm": 0.5601481199264526, + "learning_rate": 9.850868755789595e-05, + "loss": 2.141, + "step": 3442 + }, + { + "epoch": 1.0567833026396563, + "grad_norm": 0.588182270526886, + "learning_rate": 9.850748240187033e-05, + "loss": 2.17, + "step": 3443 + }, + { + "epoch": 1.0570902394106814, + "grad_norm": 0.5955865383148193, + "learning_rate": 9.850627676646533e-05, + "loss": 2.1004, + "step": 3444 + }, + { + "epoch": 1.0573971761817065, + "grad_norm": 0.6412670612335205, + "learning_rate": 9.850507065169288e-05, + "loss": 2.0642, + "step": 3445 + }, + { + "epoch": 1.0577041129527318, + "grad_norm": 0.5597305297851562, + "learning_rate": 9.850386405756489e-05, + "loss": 2.0412, + "step": 3446 + }, + { + "epoch": 1.058011049723757, + "grad_norm": 0.5633887052536011, + "learning_rate": 9.850265698409328e-05, + "loss": 1.9976, + "step": 3447 + }, + { + "epoch": 1.058317986494782, + "grad_norm": 0.5924213528633118, + "learning_rate": 9.850144943128998e-05, + "loss": 2.0715, + "step": 3448 + }, + { + "epoch": 1.0586249232658071, + "grad_norm": 0.5968048572540283, + "learning_rate": 9.850024139916694e-05, + "loss": 2.0755, + "step": 3449 + }, + { + "epoch": 1.0589318600368325, + "grad_norm": 0.5745044946670532, + "learning_rate": 9.849903288773609e-05, + "loss": 2.0615, + "step": 3450 + }, + { + "epoch": 1.0592387968078576, + "grad_norm": 0.5154273509979248, + "learning_rate": 9.849782389700936e-05, + "loss": 2.0429, + "step": 3451 + }, + { + "epoch": 1.0595457335788827, + "grad_norm": 0.5307286977767944, + "learning_rate": 
9.849661442699871e-05, + "loss": 2.0788, + "step": 3452 + }, + { + "epoch": 1.0598526703499078, + "grad_norm": 0.5445010662078857, + "learning_rate": 9.84954044777161e-05, + "loss": 2.0598, + "step": 3453 + }, + { + "epoch": 1.0601596071209332, + "grad_norm": 0.5858064889907837, + "learning_rate": 9.849419404917347e-05, + "loss": 2.069, + "step": 3454 + }, + { + "epoch": 1.0604665438919583, + "grad_norm": 0.5906962156295776, + "learning_rate": 9.84929831413828e-05, + "loss": 2.1256, + "step": 3455 + }, + { + "epoch": 1.0607734806629834, + "grad_norm": 0.6632845997810364, + "learning_rate": 9.849177175435605e-05, + "loss": 2.1002, + "step": 3456 + }, + { + "epoch": 1.0610804174340085, + "grad_norm": 0.6352782845497131, + "learning_rate": 9.849055988810518e-05, + "loss": 2.0901, + "step": 3457 + }, + { + "epoch": 1.0613873542050338, + "grad_norm": 0.5406731963157654, + "learning_rate": 9.848934754264218e-05, + "loss": 2.0562, + "step": 3458 + }, + { + "epoch": 1.061694290976059, + "grad_norm": 0.6067590117454529, + "learning_rate": 9.848813471797902e-05, + "loss": 2.0914, + "step": 3459 + }, + { + "epoch": 1.062001227747084, + "grad_norm": 0.5876826047897339, + "learning_rate": 9.84869214141277e-05, + "loss": 2.0065, + "step": 3460 + }, + { + "epoch": 1.0623081645181092, + "grad_norm": 0.611648440361023, + "learning_rate": 9.84857076311002e-05, + "loss": 2.1252, + "step": 3461 + }, + { + "epoch": 1.0626151012891345, + "grad_norm": 0.568358302116394, + "learning_rate": 9.848449336890853e-05, + "loss": 2.0312, + "step": 3462 + }, + { + "epoch": 1.0629220380601596, + "grad_norm": 0.5303518772125244, + "learning_rate": 9.848327862756466e-05, + "loss": 1.9989, + "step": 3463 + }, + { + "epoch": 1.0632289748311847, + "grad_norm": 0.5377182960510254, + "learning_rate": 9.848206340708062e-05, + "loss": 2.0759, + "step": 3464 + }, + { + "epoch": 1.06353591160221, + "grad_norm": 0.5178431868553162, + "learning_rate": 9.848084770746842e-05, + "loss": 2.0613, + "step": 3465 + }, 
+ { + "epoch": 1.0638428483732352, + "grad_norm": 0.4605518877506256, + "learning_rate": 9.847963152874007e-05, + "loss": 1.9961, + "step": 3466 + }, + { + "epoch": 1.0641497851442603, + "grad_norm": 0.5262506604194641, + "learning_rate": 9.847841487090758e-05, + "loss": 2.032, + "step": 3467 + }, + { + "epoch": 1.0644567219152854, + "grad_norm": 0.5210484862327576, + "learning_rate": 9.847719773398298e-05, + "loss": 2.106, + "step": 3468 + }, + { + "epoch": 1.0647636586863105, + "grad_norm": 0.5159584283828735, + "learning_rate": 9.84759801179783e-05, + "loss": 2.07, + "step": 3469 + }, + { + "epoch": 1.0650705954573358, + "grad_norm": 0.5094224810600281, + "learning_rate": 9.847476202290557e-05, + "loss": 2.1379, + "step": 3470 + }, + { + "epoch": 1.065377532228361, + "grad_norm": 0.5180851221084595, + "learning_rate": 9.847354344877684e-05, + "loss": 2.0911, + "step": 3471 + }, + { + "epoch": 1.065684468999386, + "grad_norm": 0.5476199984550476, + "learning_rate": 9.847232439560412e-05, + "loss": 2.0654, + "step": 3472 + }, + { + "epoch": 1.0659914057704114, + "grad_norm": 0.5314182639122009, + "learning_rate": 9.84711048633995e-05, + "loss": 1.9829, + "step": 3473 + }, + { + "epoch": 1.0662983425414365, + "grad_norm": 0.549379825592041, + "learning_rate": 9.8469884852175e-05, + "loss": 2.0876, + "step": 3474 + }, + { + "epoch": 1.0666052793124616, + "grad_norm": 0.6280861496925354, + "learning_rate": 9.84686643619427e-05, + "loss": 2.1026, + "step": 3475 + }, + { + "epoch": 1.0669122160834867, + "grad_norm": 0.5838838219642639, + "learning_rate": 9.846744339271464e-05, + "loss": 2.0553, + "step": 3476 + }, + { + "epoch": 1.0672191528545119, + "grad_norm": 0.6090747117996216, + "learning_rate": 9.84662219445029e-05, + "loss": 2.0983, + "step": 3477 + }, + { + "epoch": 1.0675260896255372, + "grad_norm": 0.515504002571106, + "learning_rate": 9.846500001731955e-05, + "loss": 2.0992, + "step": 3478 + }, + { + "epoch": 1.0678330263965623, + "grad_norm": 
0.5083954930305481, + "learning_rate": 9.846377761117667e-05, + "loss": 1.9851, + "step": 3479 + }, + { + "epoch": 1.0681399631675874, + "grad_norm": 0.5102222561836243, + "learning_rate": 9.846255472608632e-05, + "loss": 2.0553, + "step": 3480 + }, + { + "epoch": 1.0684468999386127, + "grad_norm": 0.5123574137687683, + "learning_rate": 9.846133136206061e-05, + "loss": 2.0382, + "step": 3481 + }, + { + "epoch": 1.0687538367096379, + "grad_norm": 0.5657833814620972, + "learning_rate": 9.84601075191116e-05, + "loss": 2.0735, + "step": 3482 + }, + { + "epoch": 1.069060773480663, + "grad_norm": 0.5460711121559143, + "learning_rate": 9.845888319725143e-05, + "loss": 2.0445, + "step": 3483 + }, + { + "epoch": 1.069367710251688, + "grad_norm": 0.42860034108161926, + "learning_rate": 9.845765839649217e-05, + "loss": 2.0166, + "step": 3484 + }, + { + "epoch": 1.0696746470227134, + "grad_norm": 0.5413190126419067, + "learning_rate": 9.845643311684592e-05, + "loss": 1.9923, + "step": 3485 + }, + { + "epoch": 1.0699815837937385, + "grad_norm": 0.4982166290283203, + "learning_rate": 9.84552073583248e-05, + "loss": 2.0279, + "step": 3486 + }, + { + "epoch": 1.0702885205647636, + "grad_norm": 0.4824393689632416, + "learning_rate": 9.845398112094091e-05, + "loss": 1.9661, + "step": 3487 + }, + { + "epoch": 1.0705954573357888, + "grad_norm": 0.5690898895263672, + "learning_rate": 9.845275440470639e-05, + "loss": 2.0866, + "step": 3488 + }, + { + "epoch": 1.070902394106814, + "grad_norm": 0.6087098717689514, + "learning_rate": 9.845152720963335e-05, + "loss": 2.055, + "step": 3489 + }, + { + "epoch": 1.0712093308778392, + "grad_norm": 0.5754218101501465, + "learning_rate": 9.845029953573392e-05, + "loss": 2.0577, + "step": 3490 + }, + { + "epoch": 1.0715162676488643, + "grad_norm": 0.619746744632721, + "learning_rate": 9.844907138302023e-05, + "loss": 2.0694, + "step": 3491 + }, + { + "epoch": 1.0718232044198894, + "grad_norm": 0.5165389776229858, + "learning_rate": 
9.844784275150442e-05, + "loss": 1.9618, + "step": 3492 + }, + { + "epoch": 1.0721301411909148, + "grad_norm": 0.5098079442977905, + "learning_rate": 9.844661364119863e-05, + "loss": 2.0021, + "step": 3493 + }, + { + "epoch": 1.0724370779619399, + "grad_norm": 0.5978688597679138, + "learning_rate": 9.8445384052115e-05, + "loss": 2.0861, + "step": 3494 + }, + { + "epoch": 1.072744014732965, + "grad_norm": 0.5498695373535156, + "learning_rate": 9.844415398426572e-05, + "loss": 2.095, + "step": 3495 + }, + { + "epoch": 1.07305095150399, + "grad_norm": 0.4890369474887848, + "learning_rate": 9.844292343766289e-05, + "loss": 1.9819, + "step": 3496 + }, + { + "epoch": 1.0733578882750154, + "grad_norm": 0.49551400542259216, + "learning_rate": 9.844169241231871e-05, + "loss": 2.109, + "step": 3497 + }, + { + "epoch": 1.0736648250460405, + "grad_norm": 0.5358633399009705, + "learning_rate": 9.844046090824533e-05, + "loss": 2.0579, + "step": 3498 + }, + { + "epoch": 1.0739717618170657, + "grad_norm": 0.5990919470787048, + "learning_rate": 9.843922892545492e-05, + "loss": 2.1962, + "step": 3499 + }, + { + "epoch": 1.0742786985880908, + "grad_norm": 0.5973169207572937, + "learning_rate": 9.843799646395967e-05, + "loss": 2.0691, + "step": 3500 + }, + { + "epoch": 1.074585635359116, + "grad_norm": 0.5875831246376038, + "learning_rate": 9.843676352377172e-05, + "loss": 2.0807, + "step": 3501 + }, + { + "epoch": 1.0748925721301412, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.84355301049033e-05, + "loss": 2.0694, + "step": 3502 + }, + { + "epoch": 1.0751995089011663, + "grad_norm": 0.7694209814071655, + "learning_rate": 9.843429620736659e-05, + "loss": 2.1504, + "step": 3503 + }, + { + "epoch": 1.0755064456721914, + "grad_norm": 0.7930089831352234, + "learning_rate": 9.843306183117376e-05, + "loss": 2.0635, + "step": 3504 + }, + { + "epoch": 1.0758133824432168, + "grad_norm": 0.6518469452857971, + "learning_rate": 9.843182697633704e-05, + "loss": 2.0395, + "step": 3505 + 
}, + { + "epoch": 1.076120319214242, + "grad_norm": 0.49737605452537537, + "learning_rate": 9.843059164286861e-05, + "loss": 1.9875, + "step": 3506 + }, + { + "epoch": 1.076427255985267, + "grad_norm": 0.5311492085456848, + "learning_rate": 9.84293558307807e-05, + "loss": 2.1331, + "step": 3507 + }, + { + "epoch": 1.0767341927562921, + "grad_norm": 0.6801449656486511, + "learning_rate": 9.842811954008551e-05, + "loss": 2.0991, + "step": 3508 + }, + { + "epoch": 1.0770411295273175, + "grad_norm": 0.5404406189918518, + "learning_rate": 9.842688277079523e-05, + "loss": 2.0482, + "step": 3509 + }, + { + "epoch": 1.0773480662983426, + "grad_norm": 0.6136532425880432, + "learning_rate": 9.842564552292215e-05, + "loss": 2.1016, + "step": 3510 + }, + { + "epoch": 1.0776550030693677, + "grad_norm": 0.5874183773994446, + "learning_rate": 9.842440779647843e-05, + "loss": 2.0495, + "step": 3511 + }, + { + "epoch": 1.0779619398403928, + "grad_norm": 0.4891047775745392, + "learning_rate": 9.842316959147635e-05, + "loss": 2.0592, + "step": 3512 + }, + { + "epoch": 1.0782688766114181, + "grad_norm": 0.5115689635276794, + "learning_rate": 9.84219309079281e-05, + "loss": 2.0084, + "step": 3513 + }, + { + "epoch": 1.0785758133824432, + "grad_norm": 0.5662370324134827, + "learning_rate": 9.842069174584597e-05, + "loss": 2.1134, + "step": 3514 + }, + { + "epoch": 1.0788827501534684, + "grad_norm": 0.6859605312347412, + "learning_rate": 9.841945210524217e-05, + "loss": 2.1144, + "step": 3515 + }, + { + "epoch": 1.0791896869244935, + "grad_norm": 0.8003933429718018, + "learning_rate": 9.841821198612897e-05, + "loss": 2.0353, + "step": 3516 + }, + { + "epoch": 1.0794966236955188, + "grad_norm": 0.8481027483940125, + "learning_rate": 9.841697138851863e-05, + "loss": 2.1012, + "step": 3517 + }, + { + "epoch": 1.079803560466544, + "grad_norm": 0.7234178185462952, + "learning_rate": 9.84157303124234e-05, + "loss": 2.1134, + "step": 3518 + }, + { + "epoch": 1.080110497237569, + "grad_norm": 
0.6129522919654846, + "learning_rate": 9.841448875785553e-05, + "loss": 2.0736, + "step": 3519 + }, + { + "epoch": 1.0804174340085941, + "grad_norm": 0.4983314573764801, + "learning_rate": 9.841324672482732e-05, + "loss": 2.0334, + "step": 3520 + }, + { + "epoch": 1.0807243707796195, + "grad_norm": 0.6069099307060242, + "learning_rate": 9.841200421335101e-05, + "loss": 2.0506, + "step": 3521 + }, + { + "epoch": 1.0810313075506446, + "grad_norm": 0.5841798186302185, + "learning_rate": 9.841076122343893e-05, + "loss": 2.0491, + "step": 3522 + }, + { + "epoch": 1.0813382443216697, + "grad_norm": 0.5629861354827881, + "learning_rate": 9.84095177551033e-05, + "loss": 2.0435, + "step": 3523 + }, + { + "epoch": 1.0816451810926948, + "grad_norm": 0.48676446080207825, + "learning_rate": 9.840827380835646e-05, + "loss": 2.0543, + "step": 3524 + }, + { + "epoch": 1.0819521178637201, + "grad_norm": 0.5119389295578003, + "learning_rate": 9.840702938321069e-05, + "loss": 2.0461, + "step": 3525 + }, + { + "epoch": 1.0822590546347453, + "grad_norm": 0.47259917855262756, + "learning_rate": 9.840578447967827e-05, + "loss": 2.0494, + "step": 3526 + }, + { + "epoch": 1.0825659914057704, + "grad_norm": 0.5083605647087097, + "learning_rate": 9.840453909777153e-05, + "loss": 2.0518, + "step": 3527 + }, + { + "epoch": 1.0828729281767955, + "grad_norm": 0.46149778366088867, + "learning_rate": 9.840329323750276e-05, + "loss": 2.0087, + "step": 3528 + }, + { + "epoch": 1.0831798649478208, + "grad_norm": 0.4698919951915741, + "learning_rate": 9.840204689888427e-05, + "loss": 2.0715, + "step": 3529 + }, + { + "epoch": 1.083486801718846, + "grad_norm": 0.514570951461792, + "learning_rate": 9.840080008192838e-05, + "loss": 2.1067, + "step": 3530 + }, + { + "epoch": 1.083793738489871, + "grad_norm": 0.5938723087310791, + "learning_rate": 9.839955278664743e-05, + "loss": 2.1246, + "step": 3531 + }, + { + "epoch": 1.0841006752608962, + "grad_norm": 0.58525550365448, + "learning_rate": 
9.839830501305372e-05, + "loss": 2.0695, + "step": 3532 + }, + { + "epoch": 1.0844076120319215, + "grad_norm": 0.5693490505218506, + "learning_rate": 9.83970567611596e-05, + "loss": 2.0166, + "step": 3533 + }, + { + "epoch": 1.0847145488029466, + "grad_norm": 0.544964075088501, + "learning_rate": 9.839580803097738e-05, + "loss": 2.0093, + "step": 3534 + }, + { + "epoch": 1.0850214855739717, + "grad_norm": 0.5509639978408813, + "learning_rate": 9.839455882251945e-05, + "loss": 2.0511, + "step": 3535 + }, + { + "epoch": 1.0853284223449968, + "grad_norm": 0.5092516541481018, + "learning_rate": 9.83933091357981e-05, + "loss": 2.0586, + "step": 3536 + }, + { + "epoch": 1.0856353591160222, + "grad_norm": 0.5163968205451965, + "learning_rate": 9.83920589708257e-05, + "loss": 2.0541, + "step": 3537 + }, + { + "epoch": 1.0859422958870473, + "grad_norm": 0.49756479263305664, + "learning_rate": 9.839080832761464e-05, + "loss": 2.0495, + "step": 3538 + }, + { + "epoch": 1.0862492326580724, + "grad_norm": 0.6246916055679321, + "learning_rate": 9.838955720617722e-05, + "loss": 2.2082, + "step": 3539 + }, + { + "epoch": 1.0865561694290977, + "grad_norm": 0.5826153755187988, + "learning_rate": 9.838830560652585e-05, + "loss": 2.0318, + "step": 3540 + }, + { + "epoch": 1.0868631062001228, + "grad_norm": 0.6131548285484314, + "learning_rate": 9.838705352867287e-05, + "loss": 2.1172, + "step": 3541 + }, + { + "epoch": 1.087170042971148, + "grad_norm": 0.7028201818466187, + "learning_rate": 9.838580097263068e-05, + "loss": 2.061, + "step": 3542 + }, + { + "epoch": 1.087476979742173, + "grad_norm": 0.7061073780059814, + "learning_rate": 9.838454793841166e-05, + "loss": 2.0944, + "step": 3543 + }, + { + "epoch": 1.0877839165131982, + "grad_norm": 0.6820229887962341, + "learning_rate": 9.838329442602814e-05, + "loss": 2.072, + "step": 3544 + }, + { + "epoch": 1.0880908532842235, + "grad_norm": 0.5658139586448669, + "learning_rate": 9.838204043549257e-05, + "loss": 2.0499, + "step": 3545 
+ }, + { + "epoch": 1.0883977900552486, + "grad_norm": 0.5714126825332642, + "learning_rate": 9.838078596681731e-05, + "loss": 2.06, + "step": 3546 + }, + { + "epoch": 1.0887047268262737, + "grad_norm": 0.5343610048294067, + "learning_rate": 9.837953102001477e-05, + "loss": 2.0932, + "step": 3547 + }, + { + "epoch": 1.089011663597299, + "grad_norm": 0.5799851417541504, + "learning_rate": 9.837827559509735e-05, + "loss": 2.0615, + "step": 3548 + }, + { + "epoch": 1.0893186003683242, + "grad_norm": 0.5679401159286499, + "learning_rate": 9.837701969207745e-05, + "loss": 2.0161, + "step": 3549 + }, + { + "epoch": 1.0896255371393493, + "grad_norm": 0.5369420647621155, + "learning_rate": 9.83757633109675e-05, + "loss": 2.0066, + "step": 3550 + }, + { + "epoch": 1.0899324739103744, + "grad_norm": 0.5276355147361755, + "learning_rate": 9.837450645177988e-05, + "loss": 2.03, + "step": 3551 + }, + { + "epoch": 1.0902394106813997, + "grad_norm": 0.49717894196510315, + "learning_rate": 9.837324911452705e-05, + "loss": 1.9897, + "step": 3552 + }, + { + "epoch": 1.0905463474524248, + "grad_norm": 0.460783451795578, + "learning_rate": 9.837199129922142e-05, + "loss": 2.089, + "step": 3553 + }, + { + "epoch": 1.09085328422345, + "grad_norm": 0.505473792552948, + "learning_rate": 9.837073300587541e-05, + "loss": 2.035, + "step": 3554 + }, + { + "epoch": 1.091160220994475, + "grad_norm": 0.4588155150413513, + "learning_rate": 9.836947423450147e-05, + "loss": 2.0029, + "step": 3555 + }, + { + "epoch": 1.0914671577655004, + "grad_norm": 0.5151825547218323, + "learning_rate": 9.836821498511203e-05, + "loss": 2.1075, + "step": 3556 + }, + { + "epoch": 1.0917740945365255, + "grad_norm": 0.46669647097587585, + "learning_rate": 9.836695525771955e-05, + "loss": 2.0468, + "step": 3557 + }, + { + "epoch": 1.0920810313075506, + "grad_norm": 0.49291539192199707, + "learning_rate": 9.836569505233647e-05, + "loss": 2.1201, + "step": 3558 + }, + { + "epoch": 1.0923879680785757, + "grad_norm": 
0.49323126673698425, + "learning_rate": 9.836443436897525e-05, + "loss": 1.9796, + "step": 3559 + }, + { + "epoch": 1.092694904849601, + "grad_norm": 0.4784039258956909, + "learning_rate": 9.836317320764832e-05, + "loss": 2.0267, + "step": 3560 + }, + { + "epoch": 1.0930018416206262, + "grad_norm": 0.5402999520301819, + "learning_rate": 9.836191156836818e-05, + "loss": 2.07, + "step": 3561 + }, + { + "epoch": 1.0933087783916513, + "grad_norm": 0.5989857912063599, + "learning_rate": 9.83606494511473e-05, + "loss": 2.0518, + "step": 3562 + }, + { + "epoch": 1.0936157151626764, + "grad_norm": 0.685855507850647, + "learning_rate": 9.835938685599811e-05, + "loss": 2.0632, + "step": 3563 + }, + { + "epoch": 1.0939226519337018, + "grad_norm": 0.7716066837310791, + "learning_rate": 9.835812378293312e-05, + "loss": 2.0758, + "step": 3564 + }, + { + "epoch": 1.0942295887047269, + "grad_norm": 0.6822659969329834, + "learning_rate": 9.835686023196481e-05, + "loss": 2.0077, + "step": 3565 + }, + { + "epoch": 1.094536525475752, + "grad_norm": 0.5031718611717224, + "learning_rate": 9.835559620310566e-05, + "loss": 2.0432, + "step": 3566 + }, + { + "epoch": 1.094843462246777, + "grad_norm": 0.5570902228355408, + "learning_rate": 9.835433169636818e-05, + "loss": 2.1203, + "step": 3567 + }, + { + "epoch": 1.0951503990178024, + "grad_norm": 0.6224993467330933, + "learning_rate": 9.835306671176484e-05, + "loss": 2.0281, + "step": 3568 + }, + { + "epoch": 1.0954573357888275, + "grad_norm": 0.67215895652771, + "learning_rate": 9.835180124930816e-05, + "loss": 2.1158, + "step": 3569 + }, + { + "epoch": 1.0957642725598526, + "grad_norm": 0.5764983892440796, + "learning_rate": 9.835053530901064e-05, + "loss": 1.9735, + "step": 3570 + }, + { + "epoch": 1.0960712093308778, + "grad_norm": 0.48459672927856445, + "learning_rate": 9.834926889088478e-05, + "loss": 2.0074, + "step": 3571 + }, + { + "epoch": 1.096378146101903, + "grad_norm": 0.4789890944957733, + "learning_rate": 
9.834800199494312e-05, + "loss": 1.9942, + "step": 3572 + }, + { + "epoch": 1.0966850828729282, + "grad_norm": 0.5133237838745117, + "learning_rate": 9.834673462119817e-05, + "loss": 2.0204, + "step": 3573 + }, + { + "epoch": 1.0969920196439533, + "grad_norm": 0.638518750667572, + "learning_rate": 9.834546676966244e-05, + "loss": 2.1396, + "step": 3574 + }, + { + "epoch": 1.0972989564149784, + "grad_norm": 0.5471677780151367, + "learning_rate": 9.834419844034848e-05, + "loss": 1.99, + "step": 3575 + }, + { + "epoch": 1.0976058931860038, + "grad_norm": 0.5372926592826843, + "learning_rate": 9.83429296332688e-05, + "loss": 2.0241, + "step": 3576 + }, + { + "epoch": 1.0979128299570289, + "grad_norm": 0.5284983515739441, + "learning_rate": 9.834166034843597e-05, + "loss": 2.0705, + "step": 3577 + }, + { + "epoch": 1.098219766728054, + "grad_norm": 0.5212574601173401, + "learning_rate": 9.834039058586252e-05, + "loss": 2.0648, + "step": 3578 + }, + { + "epoch": 1.098526703499079, + "grad_norm": 0.439454048871994, + "learning_rate": 9.833912034556099e-05, + "loss": 1.9981, + "step": 3579 + }, + { + "epoch": 1.0988336402701044, + "grad_norm": 0.529550313949585, + "learning_rate": 9.833784962754394e-05, + "loss": 2.0092, + "step": 3580 + }, + { + "epoch": 1.0991405770411296, + "grad_norm": 0.5555844902992249, + "learning_rate": 9.833657843182394e-05, + "loss": 2.0457, + "step": 3581 + }, + { + "epoch": 1.0994475138121547, + "grad_norm": 0.56191086769104, + "learning_rate": 9.833530675841352e-05, + "loss": 2.0742, + "step": 3582 + }, + { + "epoch": 1.0997544505831798, + "grad_norm": 0.5119436383247375, + "learning_rate": 9.833403460732529e-05, + "loss": 2.0836, + "step": 3583 + }, + { + "epoch": 1.1000613873542051, + "grad_norm": 0.48049578070640564, + "learning_rate": 9.833276197857179e-05, + "loss": 2.0018, + "step": 3584 + }, + { + "epoch": 1.1003683241252302, + "grad_norm": 0.48501092195510864, + "learning_rate": 9.83314888721656e-05, + "loss": 2.0158, + "step": 3585 + 
}, + { + "epoch": 1.1006752608962553, + "grad_norm": 0.528548538684845, + "learning_rate": 9.833021528811932e-05, + "loss": 2.0327, + "step": 3586 + }, + { + "epoch": 1.1009821976672804, + "grad_norm": 0.5243194699287415, + "learning_rate": 9.832894122644551e-05, + "loss": 1.9874, + "step": 3587 + }, + { + "epoch": 1.1012891344383058, + "grad_norm": 0.46920302510261536, + "learning_rate": 9.832766668715681e-05, + "loss": 2.0487, + "step": 3588 + }, + { + "epoch": 1.101596071209331, + "grad_norm": 0.45994171500205994, + "learning_rate": 9.832639167026575e-05, + "loss": 2.0926, + "step": 3589 + }, + { + "epoch": 1.101903007980356, + "grad_norm": 0.5337465405464172, + "learning_rate": 9.832511617578497e-05, + "loss": 1.9957, + "step": 3590 + }, + { + "epoch": 1.1022099447513811, + "grad_norm": 0.5920217633247375, + "learning_rate": 9.832384020372707e-05, + "loss": 2.0571, + "step": 3591 + }, + { + "epoch": 1.1025168815224065, + "grad_norm": 0.651720404624939, + "learning_rate": 9.832256375410466e-05, + "loss": 2.0382, + "step": 3592 + }, + { + "epoch": 1.1028238182934316, + "grad_norm": 0.6063461899757385, + "learning_rate": 9.832128682693035e-05, + "loss": 1.9932, + "step": 3593 + }, + { + "epoch": 1.1031307550644567, + "grad_norm": 0.5111881494522095, + "learning_rate": 9.832000942221676e-05, + "loss": 1.9821, + "step": 3594 + }, + { + "epoch": 1.1034376918354818, + "grad_norm": 0.5419835448265076, + "learning_rate": 9.831873153997652e-05, + "loss": 2.0535, + "step": 3595 + }, + { + "epoch": 1.1037446286065071, + "grad_norm": 0.5685762763023376, + "learning_rate": 9.831745318022226e-05, + "loss": 2.0715, + "step": 3596 + }, + { + "epoch": 1.1040515653775322, + "grad_norm": 0.6095051765441895, + "learning_rate": 9.831617434296659e-05, + "loss": 2.0382, + "step": 3597 + }, + { + "epoch": 1.1043585021485574, + "grad_norm": 0.548292338848114, + "learning_rate": 9.831489502822217e-05, + "loss": 1.98, + "step": 3598 + }, + { + "epoch": 1.1046654389195825, + "grad_norm": 
0.5056986808776855, + "learning_rate": 9.831361523600165e-05, + "loss": 2.0271, + "step": 3599 + }, + { + "epoch": 1.1049723756906078, + "grad_norm": 0.48790082335472107, + "learning_rate": 9.831233496631767e-05, + "loss": 1.9555, + "step": 3600 + }, + { + "epoch": 1.105279312461633, + "grad_norm": 0.4663766622543335, + "learning_rate": 9.831105421918287e-05, + "loss": 1.9985, + "step": 3601 + }, + { + "epoch": 1.105586249232658, + "grad_norm": 0.4549616277217865, + "learning_rate": 9.83097729946099e-05, + "loss": 2.0543, + "step": 3602 + }, + { + "epoch": 1.1058931860036831, + "grad_norm": 0.46699193120002747, + "learning_rate": 9.830849129261146e-05, + "loss": 2.0395, + "step": 3603 + }, + { + "epoch": 1.1062001227747085, + "grad_norm": 0.4600387215614319, + "learning_rate": 9.830720911320019e-05, + "loss": 2.0155, + "step": 3604 + }, + { + "epoch": 1.1065070595457336, + "grad_norm": 0.4854283034801483, + "learning_rate": 9.830592645638877e-05, + "loss": 2.0698, + "step": 3605 + }, + { + "epoch": 1.1068139963167587, + "grad_norm": 0.5249526500701904, + "learning_rate": 9.830464332218987e-05, + "loss": 2.0842, + "step": 3606 + }, + { + "epoch": 1.107120933087784, + "grad_norm": 0.6377332806587219, + "learning_rate": 9.830335971061616e-05, + "loss": 2.1399, + "step": 3607 + }, + { + "epoch": 1.1074278698588091, + "grad_norm": 0.632194995880127, + "learning_rate": 9.830207562168034e-05, + "loss": 2.1203, + "step": 3608 + }, + { + "epoch": 1.1077348066298343, + "grad_norm": 0.5585857629776001, + "learning_rate": 9.830079105539512e-05, + "loss": 2.0219, + "step": 3609 + }, + { + "epoch": 1.1080417434008594, + "grad_norm": 0.5613297820091248, + "learning_rate": 9.829950601177316e-05, + "loss": 2.0464, + "step": 3610 + }, + { + "epoch": 1.1083486801718845, + "grad_norm": 0.5213276743888855, + "learning_rate": 9.829822049082716e-05, + "loss": 2.0134, + "step": 3611 + }, + { + "epoch": 1.1086556169429098, + "grad_norm": 0.5008644461631775, + "learning_rate": 
9.829693449256984e-05, + "loss": 1.9952, + "step": 3612 + }, + { + "epoch": 1.108962553713935, + "grad_norm": 0.5565455555915833, + "learning_rate": 9.829564801701392e-05, + "loss": 1.9737, + "step": 3613 + }, + { + "epoch": 1.10926949048496, + "grad_norm": 0.6150243878364563, + "learning_rate": 9.82943610641721e-05, + "loss": 2.0414, + "step": 3614 + }, + { + "epoch": 1.1095764272559854, + "grad_norm": 0.6731769442558289, + "learning_rate": 9.829307363405709e-05, + "loss": 2.0262, + "step": 3615 + }, + { + "epoch": 1.1098833640270105, + "grad_norm": 0.5681004524230957, + "learning_rate": 9.829178572668162e-05, + "loss": 2.0303, + "step": 3616 + }, + { + "epoch": 1.1101903007980356, + "grad_norm": 0.4748475253582001, + "learning_rate": 9.829049734205841e-05, + "loss": 1.9756, + "step": 3617 + }, + { + "epoch": 1.1104972375690607, + "grad_norm": 0.4218698740005493, + "learning_rate": 9.82892084802002e-05, + "loss": 2.0243, + "step": 3618 + }, + { + "epoch": 1.1108041743400858, + "grad_norm": 0.47928178310394287, + "learning_rate": 9.828791914111976e-05, + "loss": 2.0368, + "step": 3619 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.5805749297142029, + "learning_rate": 9.828662932482977e-05, + "loss": 2.0071, + "step": 3620 + }, + { + "epoch": 1.1114180478821363, + "grad_norm": 0.5580070614814758, + "learning_rate": 9.828533903134302e-05, + "loss": 1.9568, + "step": 3621 + }, + { + "epoch": 1.1117249846531614, + "grad_norm": 0.572694718837738, + "learning_rate": 9.828404826067224e-05, + "loss": 2.0128, + "step": 3622 + }, + { + "epoch": 1.1120319214241867, + "grad_norm": 0.605338990688324, + "learning_rate": 9.828275701283021e-05, + "loss": 2.0638, + "step": 3623 + }, + { + "epoch": 1.1123388581952118, + "grad_norm": 0.550521969795227, + "learning_rate": 9.828146528782967e-05, + "loss": 2.118, + "step": 3624 + }, + { + "epoch": 1.112645794966237, + "grad_norm": 0.5420751571655273, + "learning_rate": 9.828017308568337e-05, + "loss": 2.0685, + "step": 3625 + 
}, + { + "epoch": 1.112952731737262, + "grad_norm": 0.5761057734489441, + "learning_rate": 9.827888040640414e-05, + "loss": 2.1111, + "step": 3626 + }, + { + "epoch": 1.1132596685082874, + "grad_norm": 0.5724154710769653, + "learning_rate": 9.827758725000468e-05, + "loss": 2.0596, + "step": 3627 + }, + { + "epoch": 1.1135666052793125, + "grad_norm": 0.5120618343353271, + "learning_rate": 9.827629361649783e-05, + "loss": 1.9811, + "step": 3628 + }, + { + "epoch": 1.1138735420503376, + "grad_norm": 0.4449520409107208, + "learning_rate": 9.827499950589633e-05, + "loss": 1.9935, + "step": 3629 + }, + { + "epoch": 1.1141804788213627, + "grad_norm": 0.5478667616844177, + "learning_rate": 9.827370491821302e-05, + "loss": 2.0142, + "step": 3630 + }, + { + "epoch": 1.114487415592388, + "grad_norm": 0.6170383095741272, + "learning_rate": 9.827240985346064e-05, + "loss": 2.0588, + "step": 3631 + }, + { + "epoch": 1.1147943523634132, + "grad_norm": 0.5950221419334412, + "learning_rate": 9.827111431165202e-05, + "loss": 2.0187, + "step": 3632 + }, + { + "epoch": 1.1151012891344383, + "grad_norm": 0.5250533819198608, + "learning_rate": 9.826981829279995e-05, + "loss": 2.0288, + "step": 3633 + }, + { + "epoch": 1.1154082259054634, + "grad_norm": 0.6252482533454895, + "learning_rate": 9.826852179691725e-05, + "loss": 2.1834, + "step": 3634 + }, + { + "epoch": 1.1157151626764887, + "grad_norm": 0.5258986353874207, + "learning_rate": 9.826722482401673e-05, + "loss": 1.9894, + "step": 3635 + }, + { + "epoch": 1.1160220994475138, + "grad_norm": 0.5532206892967224, + "learning_rate": 9.82659273741112e-05, + "loss": 2.013, + "step": 3636 + }, + { + "epoch": 1.116329036218539, + "grad_norm": 0.5178828835487366, + "learning_rate": 9.826462944721349e-05, + "loss": 1.955, + "step": 3637 + }, + { + "epoch": 1.116635972989564, + "grad_norm": 0.5466227531433105, + "learning_rate": 9.826333104333642e-05, + "loss": 2.1073, + "step": 3638 + }, + { + "epoch": 1.1169429097605894, + "grad_norm": 
0.5513507723808289, + "learning_rate": 9.826203216249282e-05, + "loss": 2.0735, + "step": 3639 + }, + { + "epoch": 1.1172498465316145, + "grad_norm": 0.5485204458236694, + "learning_rate": 9.826073280469554e-05, + "loss": 2.0699, + "step": 3640 + }, + { + "epoch": 1.1175567833026396, + "grad_norm": 0.5148037075996399, + "learning_rate": 9.825943296995741e-05, + "loss": 1.9364, + "step": 3641 + }, + { + "epoch": 1.1178637200736647, + "grad_norm": 0.5639125108718872, + "learning_rate": 9.825813265829127e-05, + "loss": 2.078, + "step": 3642 + }, + { + "epoch": 1.11817065684469, + "grad_norm": 0.581631064414978, + "learning_rate": 9.825683186970997e-05, + "loss": 2.0404, + "step": 3643 + }, + { + "epoch": 1.1184775936157152, + "grad_norm": 0.5630286335945129, + "learning_rate": 9.82555306042264e-05, + "loss": 2.0615, + "step": 3644 + }, + { + "epoch": 1.1187845303867403, + "grad_norm": 0.5661062598228455, + "learning_rate": 9.825422886185338e-05, + "loss": 2.0432, + "step": 3645 + }, + { + "epoch": 1.1190914671577654, + "grad_norm": 0.4960556626319885, + "learning_rate": 9.825292664260379e-05, + "loss": 2.0576, + "step": 3646 + }, + { + "epoch": 1.1193984039287908, + "grad_norm": 0.5052362084388733, + "learning_rate": 9.825162394649048e-05, + "loss": 2.0615, + "step": 3647 + }, + { + "epoch": 1.1197053406998159, + "grad_norm": 0.566758930683136, + "learning_rate": 9.825032077352636e-05, + "loss": 2.0821, + "step": 3648 + }, + { + "epoch": 1.120012277470841, + "grad_norm": 0.5705568790435791, + "learning_rate": 9.824901712372429e-05, + "loss": 2.1455, + "step": 3649 + }, + { + "epoch": 1.120319214241866, + "grad_norm": 0.5584011673927307, + "learning_rate": 9.824771299709714e-05, + "loss": 2.0911, + "step": 3650 + }, + { + "epoch": 1.1206261510128914, + "grad_norm": 0.5621497631072998, + "learning_rate": 9.824640839365782e-05, + "loss": 2.1209, + "step": 3651 + }, + { + "epoch": 1.1209330877839165, + "grad_norm": 0.4893646240234375, + "learning_rate": 
9.824510331341921e-05, + "loss": 1.977, + "step": 3652 + }, + { + "epoch": 1.1212400245549416, + "grad_norm": 0.5626688599586487, + "learning_rate": 9.82437977563942e-05, + "loss": 2.1114, + "step": 3653 + }, + { + "epoch": 1.1215469613259668, + "grad_norm": 0.5714966058731079, + "learning_rate": 9.824249172259573e-05, + "loss": 2.021, + "step": 3654 + }, + { + "epoch": 1.121853898096992, + "grad_norm": 0.5190821886062622, + "learning_rate": 9.824118521203666e-05, + "loss": 1.9788, + "step": 3655 + }, + { + "epoch": 1.1221608348680172, + "grad_norm": 0.46421363949775696, + "learning_rate": 9.823987822472994e-05, + "loss": 1.9762, + "step": 3656 + }, + { + "epoch": 1.1224677716390423, + "grad_norm": 0.5071156620979309, + "learning_rate": 9.823857076068846e-05, + "loss": 1.9625, + "step": 3657 + }, + { + "epoch": 1.1227747084100674, + "grad_norm": 0.5762679576873779, + "learning_rate": 9.823726281992515e-05, + "loss": 2.0543, + "step": 3658 + }, + { + "epoch": 1.1230816451810928, + "grad_norm": 0.6275226473808289, + "learning_rate": 9.823595440245294e-05, + "loss": 2.0878, + "step": 3659 + }, + { + "epoch": 1.1233885819521179, + "grad_norm": 0.6893213391304016, + "learning_rate": 9.823464550828476e-05, + "loss": 2.1059, + "step": 3660 + }, + { + "epoch": 1.123695518723143, + "grad_norm": 0.5521993041038513, + "learning_rate": 9.823333613743353e-05, + "loss": 2.035, + "step": 3661 + }, + { + "epoch": 1.124002455494168, + "grad_norm": 0.4918796718120575, + "learning_rate": 9.823202628991221e-05, + "loss": 1.9873, + "step": 3662 + }, + { + "epoch": 1.1243093922651934, + "grad_norm": 0.5177932977676392, + "learning_rate": 9.823071596573373e-05, + "loss": 2.0376, + "step": 3663 + }, + { + "epoch": 1.1246163290362186, + "grad_norm": 0.5337314009666443, + "learning_rate": 9.822940516491106e-05, + "loss": 2.1065, + "step": 3664 + }, + { + "epoch": 1.1249232658072437, + "grad_norm": 0.5179010629653931, + "learning_rate": 9.822809388745713e-05, + "loss": 1.9642, + "step": 3665 
+ }, + { + "epoch": 1.125230202578269, + "grad_norm": 0.5394679307937622, + "learning_rate": 9.82267821333849e-05, + "loss": 2.0275, + "step": 3666 + }, + { + "epoch": 1.1255371393492941, + "grad_norm": 0.582873523235321, + "learning_rate": 9.822546990270735e-05, + "loss": 2.0369, + "step": 3667 + }, + { + "epoch": 1.1258440761203192, + "grad_norm": 0.6595674753189087, + "learning_rate": 9.822415719543745e-05, + "loss": 1.9776, + "step": 3668 + }, + { + "epoch": 1.1261510128913443, + "grad_norm": 0.8103840947151184, + "learning_rate": 9.822284401158814e-05, + "loss": 2.0784, + "step": 3669 + }, + { + "epoch": 1.1264579496623695, + "grad_norm": 0.9062070250511169, + "learning_rate": 9.822153035117245e-05, + "loss": 1.9886, + "step": 3670 + }, + { + "epoch": 1.1267648864333948, + "grad_norm": 0.8718156814575195, + "learning_rate": 9.822021621420333e-05, + "loss": 2.0499, + "step": 3671 + }, + { + "epoch": 1.12707182320442, + "grad_norm": 0.6499583721160889, + "learning_rate": 9.821890160069375e-05, + "loss": 2.0734, + "step": 3672 + }, + { + "epoch": 1.127378759975445, + "grad_norm": 0.4573141932487488, + "learning_rate": 9.821758651065673e-05, + "loss": 2.0306, + "step": 3673 + }, + { + "epoch": 1.1276856967464703, + "grad_norm": 0.6441135406494141, + "learning_rate": 9.821627094410526e-05, + "loss": 2.051, + "step": 3674 + }, + { + "epoch": 1.1279926335174955, + "grad_norm": 0.7201390266418457, + "learning_rate": 9.821495490105235e-05, + "loss": 2.0187, + "step": 3675 + }, + { + "epoch": 1.1282995702885206, + "grad_norm": 0.6751874685287476, + "learning_rate": 9.821363838151099e-05, + "loss": 2.0363, + "step": 3676 + }, + { + "epoch": 1.1286065070595457, + "grad_norm": 0.5435949563980103, + "learning_rate": 9.821232138549419e-05, + "loss": 1.939, + "step": 3677 + }, + { + "epoch": 1.1289134438305708, + "grad_norm": 0.605248212814331, + "learning_rate": 9.821100391301497e-05, + "loss": 2.146, + "step": 3678 + }, + { + "epoch": 1.1292203806015961, + "grad_norm": 
0.6798139810562134, + "learning_rate": 9.820968596408636e-05, + "loss": 2.0423, + "step": 3679 + }, + { + "epoch": 1.1295273173726212, + "grad_norm": 0.6683683395385742, + "learning_rate": 9.820836753872137e-05, + "loss": 1.9768, + "step": 3680 + }, + { + "epoch": 1.1298342541436464, + "grad_norm": 0.578346312046051, + "learning_rate": 9.820704863693304e-05, + "loss": 1.9313, + "step": 3681 + }, + { + "epoch": 1.1301411909146717, + "grad_norm": 0.5639599561691284, + "learning_rate": 9.820572925873441e-05, + "loss": 2.0706, + "step": 3682 + }, + { + "epoch": 1.1304481276856968, + "grad_norm": 0.5749368071556091, + "learning_rate": 9.82044094041385e-05, + "loss": 2.0072, + "step": 3683 + }, + { + "epoch": 1.130755064456722, + "grad_norm": 0.6490229368209839, + "learning_rate": 9.820308907315836e-05, + "loss": 1.9947, + "step": 3684 + }, + { + "epoch": 1.131062001227747, + "grad_norm": 0.6207692623138428, + "learning_rate": 9.820176826580705e-05, + "loss": 2.1426, + "step": 3685 + }, + { + "epoch": 1.1313689379987721, + "grad_norm": 0.6421573162078857, + "learning_rate": 9.82004469820976e-05, + "loss": 2.0558, + "step": 3686 + }, + { + "epoch": 1.1316758747697975, + "grad_norm": 0.5462764501571655, + "learning_rate": 9.81991252220431e-05, + "loss": 2.0072, + "step": 3687 + }, + { + "epoch": 1.1319828115408226, + "grad_norm": 0.49791282415390015, + "learning_rate": 9.819780298565657e-05, + "loss": 1.9949, + "step": 3688 + }, + { + "epoch": 1.1322897483118477, + "grad_norm": 0.5120366215705872, + "learning_rate": 9.819648027295112e-05, + "loss": 2.0503, + "step": 3689 + }, + { + "epoch": 1.132596685082873, + "grad_norm": 0.5118343830108643, + "learning_rate": 9.81951570839398e-05, + "loss": 2.0104, + "step": 3690 + }, + { + "epoch": 1.1329036218538981, + "grad_norm": 0.44520822167396545, + "learning_rate": 9.81938334186357e-05, + "loss": 2.0024, + "step": 3691 + }, + { + "epoch": 1.1332105586249233, + "grad_norm": 0.5505960583686829, + "learning_rate": 
9.819250927705188e-05, + "loss": 2.0924, + "step": 3692 + }, + { + "epoch": 1.1335174953959484, + "grad_norm": 0.5269182920455933, + "learning_rate": 9.819118465920143e-05, + "loss": 2.0553, + "step": 3693 + }, + { + "epoch": 1.1338244321669735, + "grad_norm": 0.4864311218261719, + "learning_rate": 9.818985956509745e-05, + "loss": 2.0405, + "step": 3694 + }, + { + "epoch": 1.1341313689379988, + "grad_norm": 0.515202522277832, + "learning_rate": 9.818853399475304e-05, + "loss": 2.0211, + "step": 3695 + }, + { + "epoch": 1.134438305709024, + "grad_norm": 0.5360483527183533, + "learning_rate": 9.818720794818128e-05, + "loss": 2.1077, + "step": 3696 + }, + { + "epoch": 1.134745242480049, + "grad_norm": 0.5469255447387695, + "learning_rate": 9.818588142539531e-05, + "loss": 1.9538, + "step": 3697 + }, + { + "epoch": 1.1350521792510744, + "grad_norm": 0.5042214393615723, + "learning_rate": 9.818455442640819e-05, + "loss": 2.0477, + "step": 3698 + }, + { + "epoch": 1.1353591160220995, + "grad_norm": 0.5678744316101074, + "learning_rate": 9.81832269512331e-05, + "loss": 2.0871, + "step": 3699 + }, + { + "epoch": 1.1356660527931246, + "grad_norm": 0.5218677520751953, + "learning_rate": 9.818189899988308e-05, + "loss": 2.1014, + "step": 3700 + }, + { + "epoch": 1.1359729895641497, + "grad_norm": 0.5141727924346924, + "learning_rate": 9.818057057237132e-05, + "loss": 2.0385, + "step": 3701 + }, + { + "epoch": 1.136279926335175, + "grad_norm": 0.5288038849830627, + "learning_rate": 9.81792416687109e-05, + "loss": 2.0736, + "step": 3702 + }, + { + "epoch": 1.1365868631062002, + "grad_norm": 0.5533168911933899, + "learning_rate": 9.817791228891499e-05, + "loss": 2.032, + "step": 3703 + }, + { + "epoch": 1.1368937998772253, + "grad_norm": 0.4840674102306366, + "learning_rate": 9.81765824329967e-05, + "loss": 2.027, + "step": 3704 + }, + { + "epoch": 1.1372007366482504, + "grad_norm": 0.5060023069381714, + "learning_rate": 9.817525210096921e-05, + "loss": 2.0561, + "step": 3705 + 
}, + { + "epoch": 1.1375076734192757, + "grad_norm": 0.48830488324165344, + "learning_rate": 9.817392129284561e-05, + "loss": 1.9807, + "step": 3706 + }, + { + "epoch": 1.1378146101903008, + "grad_norm": 0.4644564390182495, + "learning_rate": 9.817259000863911e-05, + "loss": 1.9871, + "step": 3707 + }, + { + "epoch": 1.138121546961326, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.817125824836283e-05, + "loss": 2.0253, + "step": 3708 + }, + { + "epoch": 1.138428483732351, + "grad_norm": 0.5376463532447815, + "learning_rate": 9.816992601202994e-05, + "loss": 2.0693, + "step": 3709 + }, + { + "epoch": 1.1387354205033764, + "grad_norm": 0.49980148673057556, + "learning_rate": 9.816859329965363e-05, + "loss": 2.0123, + "step": 3710 + }, + { + "epoch": 1.1390423572744015, + "grad_norm": 0.5452225208282471, + "learning_rate": 9.816726011124702e-05, + "loss": 2.0725, + "step": 3711 + }, + { + "epoch": 1.1393492940454266, + "grad_norm": 0.5428896546363831, + "learning_rate": 9.816592644682332e-05, + "loss": 2.0446, + "step": 3712 + }, + { + "epoch": 1.1396562308164517, + "grad_norm": 0.5448847413063049, + "learning_rate": 9.816459230639571e-05, + "loss": 2.0262, + "step": 3713 + }, + { + "epoch": 1.139963167587477, + "grad_norm": 0.48574572801589966, + "learning_rate": 9.816325768997736e-05, + "loss": 2.0105, + "step": 3714 + }, + { + "epoch": 1.1402701043585022, + "grad_norm": 0.5566397905349731, + "learning_rate": 9.816192259758147e-05, + "loss": 2.0665, + "step": 3715 + }, + { + "epoch": 1.1405770411295273, + "grad_norm": 0.6098625659942627, + "learning_rate": 9.816058702922124e-05, + "loss": 2.0589, + "step": 3716 + }, + { + "epoch": 1.1408839779005524, + "grad_norm": 0.6118699312210083, + "learning_rate": 9.815925098490985e-05, + "loss": 2.0683, + "step": 3717 + }, + { + "epoch": 1.1411909146715777, + "grad_norm": 0.5213121175765991, + "learning_rate": 9.815791446466053e-05, + "loss": 2.0226, + "step": 3718 + }, + { + "epoch": 1.1414978514426029, + 
"grad_norm": 0.45717960596084595, + "learning_rate": 9.815657746848648e-05, + "loss": 2.0371, + "step": 3719 + }, + { + "epoch": 1.141804788213628, + "grad_norm": 0.4613656997680664, + "learning_rate": 9.815523999640088e-05, + "loss": 2.0702, + "step": 3720 + }, + { + "epoch": 1.142111724984653, + "grad_norm": 0.4527476727962494, + "learning_rate": 9.8153902048417e-05, + "loss": 1.9893, + "step": 3721 + }, + { + "epoch": 1.1424186617556784, + "grad_norm": 0.4524305462837219, + "learning_rate": 9.815256362454801e-05, + "loss": 1.975, + "step": 3722 + }, + { + "epoch": 1.1427255985267035, + "grad_norm": 0.4421180188655853, + "learning_rate": 9.815122472480718e-05, + "loss": 1.9987, + "step": 3723 + }, + { + "epoch": 1.1430325352977286, + "grad_norm": 0.4833788275718689, + "learning_rate": 9.814988534920771e-05, + "loss": 2.0246, + "step": 3724 + }, + { + "epoch": 1.1433394720687537, + "grad_norm": 0.46547624468803406, + "learning_rate": 9.814854549776287e-05, + "loss": 2.0007, + "step": 3725 + }, + { + "epoch": 1.143646408839779, + "grad_norm": 0.43220648169517517, + "learning_rate": 9.814720517048587e-05, + "loss": 1.9845, + "step": 3726 + }, + { + "epoch": 1.1439533456108042, + "grad_norm": 0.473910391330719, + "learning_rate": 9.814586436738998e-05, + "loss": 2.0518, + "step": 3727 + }, + { + "epoch": 1.1442602823818293, + "grad_norm": 0.507354199886322, + "learning_rate": 9.814452308848843e-05, + "loss": 2.0708, + "step": 3728 + }, + { + "epoch": 1.1445672191528544, + "grad_norm": 0.4585053622722626, + "learning_rate": 9.814318133379448e-05, + "loss": 2.0124, + "step": 3729 + }, + { + "epoch": 1.1448741559238798, + "grad_norm": 0.5280457735061646, + "learning_rate": 9.81418391033214e-05, + "loss": 2.0424, + "step": 3730 + }, + { + "epoch": 1.1451810926949049, + "grad_norm": 0.5173056125640869, + "learning_rate": 9.814049639708245e-05, + "loss": 1.9666, + "step": 3731 + }, + { + "epoch": 1.14548802946593, + "grad_norm": 0.5850839018821716, + "learning_rate": 
9.81391532150909e-05, + "loss": 2.0765, + "step": 3732 + }, + { + "epoch": 1.145794966236955, + "grad_norm": 0.5450417995452881, + "learning_rate": 9.813780955736002e-05, + "loss": 2.0696, + "step": 3733 + }, + { + "epoch": 1.1461019030079804, + "grad_norm": 0.4577319622039795, + "learning_rate": 9.81364654239031e-05, + "loss": 2.0493, + "step": 3734 + }, + { + "epoch": 1.1464088397790055, + "grad_norm": 0.5211838483810425, + "learning_rate": 9.813512081473339e-05, + "loss": 2.0578, + "step": 3735 + }, + { + "epoch": 1.1467157765500307, + "grad_norm": 0.6763051152229309, + "learning_rate": 9.813377572986422e-05, + "loss": 2.0859, + "step": 3736 + }, + { + "epoch": 1.1470227133210558, + "grad_norm": 0.8591815233230591, + "learning_rate": 9.813243016930887e-05, + "loss": 1.9743, + "step": 3737 + }, + { + "epoch": 1.147329650092081, + "grad_norm": 0.8573755025863647, + "learning_rate": 9.813108413308063e-05, + "loss": 2.048, + "step": 3738 + }, + { + "epoch": 1.1476365868631062, + "grad_norm": 0.6887713074684143, + "learning_rate": 9.812973762119281e-05, + "loss": 2.0184, + "step": 3739 + }, + { + "epoch": 1.1479435236341313, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.81283906336587e-05, + "loss": 2.0373, + "step": 3740 + }, + { + "epoch": 1.1482504604051567, + "grad_norm": 0.6413923501968384, + "learning_rate": 9.812704317049164e-05, + "loss": 2.067, + "step": 3741 + }, + { + "epoch": 1.1485573971761818, + "grad_norm": 0.8731338381767273, + "learning_rate": 9.812569523170492e-05, + "loss": 1.9996, + "step": 3742 + }, + { + "epoch": 1.1488643339472069, + "grad_norm": 0.8043886423110962, + "learning_rate": 9.812434681731189e-05, + "loss": 2.0464, + "step": 3743 + }, + { + "epoch": 1.149171270718232, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.812299792732584e-05, + "loss": 2.0026, + "step": 3744 + }, + { + "epoch": 1.149478207489257, + "grad_norm": 0.5135432481765747, + "learning_rate": 9.812164856176011e-05, + "loss": 2.0302, + "step": 3745 + 
}, + { + "epoch": 1.1497851442602824, + "grad_norm": 0.6673153638839722, + "learning_rate": 9.812029872062807e-05, + "loss": 2.0435, + "step": 3746 + }, + { + "epoch": 1.1500920810313076, + "grad_norm": 0.6777083873748779, + "learning_rate": 9.811894840394302e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 1.1503990178023327, + "grad_norm": 0.6660524010658264, + "learning_rate": 9.811759761171833e-05, + "loss": 2.0461, + "step": 3748 + }, + { + "epoch": 1.150705954573358, + "grad_norm": 0.6079594492912292, + "learning_rate": 9.811624634396733e-05, + "loss": 2.0708, + "step": 3749 + }, + { + "epoch": 1.1510128913443831, + "grad_norm": 0.5242465734481812, + "learning_rate": 9.811489460070337e-05, + "loss": 2.0513, + "step": 3750 + }, + { + "epoch": 1.1513198281154082, + "grad_norm": 0.7091820240020752, + "learning_rate": 9.811354238193984e-05, + "loss": 2.0356, + "step": 3751 + }, + { + "epoch": 1.1516267648864333, + "grad_norm": 0.6781896948814392, + "learning_rate": 9.811218968769007e-05, + "loss": 2.0693, + "step": 3752 + }, + { + "epoch": 1.1519337016574585, + "grad_norm": 0.6036314368247986, + "learning_rate": 9.811083651796744e-05, + "loss": 2.134, + "step": 3753 + }, + { + "epoch": 1.1522406384284838, + "grad_norm": 0.6173892617225647, + "learning_rate": 9.810948287278534e-05, + "loss": 2.056, + "step": 3754 + }, + { + "epoch": 1.152547575199509, + "grad_norm": 0.4903198182582855, + "learning_rate": 9.810812875215712e-05, + "loss": 2.0037, + "step": 3755 + }, + { + "epoch": 1.152854511970534, + "grad_norm": 0.5527236461639404, + "learning_rate": 9.810677415609619e-05, + "loss": 2.0334, + "step": 3756 + }, + { + "epoch": 1.1531614487415593, + "grad_norm": 0.5342993140220642, + "learning_rate": 9.81054190846159e-05, + "loss": 2.0376, + "step": 3757 + }, + { + "epoch": 1.1534683855125845, + "grad_norm": 0.4860527515411377, + "learning_rate": 9.810406353772968e-05, + "loss": 2.0009, + "step": 3758 + }, + { + "epoch": 1.1537753222836096, + "grad_norm": 
0.49722176790237427, + "learning_rate": 9.810270751545089e-05, + "loss": 2.051, + "step": 3759 + }, + { + "epoch": 1.1540822590546347, + "grad_norm": 0.4714743196964264, + "learning_rate": 9.810135101779296e-05, + "loss": 2.0474, + "step": 3760 + }, + { + "epoch": 1.1543891958256598, + "grad_norm": 0.5183619856834412, + "learning_rate": 9.80999940447693e-05, + "loss": 2.1032, + "step": 3761 + }, + { + "epoch": 1.1546961325966851, + "grad_norm": 0.6118659377098083, + "learning_rate": 9.809863659639328e-05, + "loss": 2.0967, + "step": 3762 + }, + { + "epoch": 1.1550030693677102, + "grad_norm": 0.49166184663772583, + "learning_rate": 9.809727867267838e-05, + "loss": 2.0683, + "step": 3763 + }, + { + "epoch": 1.1553100061387354, + "grad_norm": 0.5190026164054871, + "learning_rate": 9.809592027363795e-05, + "loss": 2.0161, + "step": 3764 + }, + { + "epoch": 1.1556169429097607, + "grad_norm": 0.516914427280426, + "learning_rate": 9.809456139928546e-05, + "loss": 2.0886, + "step": 3765 + }, + { + "epoch": 1.1559238796807858, + "grad_norm": 0.49737948179244995, + "learning_rate": 9.809320204963433e-05, + "loss": 2.0111, + "step": 3766 + }, + { + "epoch": 1.156230816451811, + "grad_norm": 0.44676536321640015, + "learning_rate": 9.809184222469796e-05, + "loss": 2.0571, + "step": 3767 + }, + { + "epoch": 1.156537753222836, + "grad_norm": 0.5008999109268188, + "learning_rate": 9.809048192448983e-05, + "loss": 2.0489, + "step": 3768 + }, + { + "epoch": 1.1568446899938611, + "grad_norm": 0.5116657614707947, + "learning_rate": 9.80891211490234e-05, + "loss": 1.9571, + "step": 3769 + }, + { + "epoch": 1.1571516267648865, + "grad_norm": 0.49909651279449463, + "learning_rate": 9.808775989831207e-05, + "loss": 2.0568, + "step": 3770 + }, + { + "epoch": 1.1574585635359116, + "grad_norm": 0.5186662077903748, + "learning_rate": 9.80863981723693e-05, + "loss": 2.0283, + "step": 3771 + }, + { + "epoch": 1.1577655003069367, + "grad_norm": 0.4974740445613861, + "learning_rate": 
9.808503597120858e-05, + "loss": 1.9525, + "step": 3772 + }, + { + "epoch": 1.158072437077962, + "grad_norm": 0.5369553565979004, + "learning_rate": 9.808367329484333e-05, + "loss": 1.9627, + "step": 3773 + }, + { + "epoch": 1.1583793738489871, + "grad_norm": 0.5084113478660583, + "learning_rate": 9.808231014328704e-05, + "loss": 1.9563, + "step": 3774 + }, + { + "epoch": 1.1586863106200123, + "grad_norm": 0.6059956550598145, + "learning_rate": 9.808094651655319e-05, + "loss": 2.078, + "step": 3775 + }, + { + "epoch": 1.1589932473910374, + "grad_norm": 0.5677124261856079, + "learning_rate": 9.807958241465523e-05, + "loss": 1.9977, + "step": 3776 + }, + { + "epoch": 1.1593001841620627, + "grad_norm": 0.5582616329193115, + "learning_rate": 9.807821783760667e-05, + "loss": 2.0053, + "step": 3777 + }, + { + "epoch": 1.1596071209330878, + "grad_norm": 0.5558032989501953, + "learning_rate": 9.807685278542097e-05, + "loss": 2.0015, + "step": 3778 + }, + { + "epoch": 1.159914057704113, + "grad_norm": 0.553292989730835, + "learning_rate": 9.807548725811165e-05, + "loss": 2.133, + "step": 3779 + }, + { + "epoch": 1.160220994475138, + "grad_norm": 0.5281317234039307, + "learning_rate": 9.807412125569217e-05, + "loss": 2.0018, + "step": 3780 + }, + { + "epoch": 1.1605279312461634, + "grad_norm": 0.45385050773620605, + "learning_rate": 9.807275477817605e-05, + "loss": 1.9986, + "step": 3781 + }, + { + "epoch": 1.1608348680171885, + "grad_norm": 0.5843673944473267, + "learning_rate": 9.80713878255768e-05, + "loss": 2.0653, + "step": 3782 + }, + { + "epoch": 1.1611418047882136, + "grad_norm": 0.6193283796310425, + "learning_rate": 9.807002039790792e-05, + "loss": 1.9646, + "step": 3783 + }, + { + "epoch": 1.1614487415592387, + "grad_norm": 0.5831897258758545, + "learning_rate": 9.806865249518292e-05, + "loss": 1.9708, + "step": 3784 + }, + { + "epoch": 1.161755678330264, + "grad_norm": 0.49771901965141296, + "learning_rate": 9.806728411741533e-05, + "loss": 1.9953, + "step": 3785 
+ }, + { + "epoch": 1.1620626151012892, + "grad_norm": 0.5003515481948853, + "learning_rate": 9.806591526461864e-05, + "loss": 2.0503, + "step": 3786 + }, + { + "epoch": 1.1623695518723143, + "grad_norm": 0.5710052847862244, + "learning_rate": 9.806454593680642e-05, + "loss": 1.9976, + "step": 3787 + }, + { + "epoch": 1.1626764886433394, + "grad_norm": 0.5180788040161133, + "learning_rate": 9.806317613399218e-05, + "loss": 1.9872, + "step": 3788 + }, + { + "epoch": 1.1629834254143647, + "grad_norm": 0.5202008485794067, + "learning_rate": 9.806180585618949e-05, + "loss": 1.9628, + "step": 3789 + }, + { + "epoch": 1.1632903621853898, + "grad_norm": 0.47358211874961853, + "learning_rate": 9.806043510341183e-05, + "loss": 1.9994, + "step": 3790 + }, + { + "epoch": 1.163597298956415, + "grad_norm": 0.4258720278739929, + "learning_rate": 9.80590638756728e-05, + "loss": 1.9547, + "step": 3791 + }, + { + "epoch": 1.16390423572744, + "grad_norm": 0.4487614035606384, + "learning_rate": 9.805769217298593e-05, + "loss": 1.9912, + "step": 3792 + }, + { + "epoch": 1.1642111724984654, + "grad_norm": 0.4970495104789734, + "learning_rate": 9.805631999536477e-05, + "loss": 2.0568, + "step": 3793 + }, + { + "epoch": 1.1645181092694905, + "grad_norm": 0.4535474479198456, + "learning_rate": 9.805494734282289e-05, + "loss": 2.0088, + "step": 3794 + }, + { + "epoch": 1.1648250460405156, + "grad_norm": 0.44582805037498474, + "learning_rate": 9.805357421537385e-05, + "loss": 1.9694, + "step": 3795 + }, + { + "epoch": 1.1651319828115407, + "grad_norm": 0.43872734904289246, + "learning_rate": 9.805220061303125e-05, + "loss": 2.0041, + "step": 3796 + }, + { + "epoch": 1.165438919582566, + "grad_norm": 0.5050458908081055, + "learning_rate": 9.805082653580861e-05, + "loss": 1.9963, + "step": 3797 + }, + { + "epoch": 1.1657458563535912, + "grad_norm": 0.5346884727478027, + "learning_rate": 9.804945198371956e-05, + "loss": 2.0334, + "step": 3798 + }, + { + "epoch": 1.1660527931246163, + 
"grad_norm": 0.5607240796089172, + "learning_rate": 9.804807695677764e-05, + "loss": 2.0474, + "step": 3799 + }, + { + "epoch": 1.1663597298956414, + "grad_norm": 0.5343592166900635, + "learning_rate": 9.804670145499648e-05, + "loss": 2.0542, + "step": 3800 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5195753574371338, + "learning_rate": 9.804532547838964e-05, + "loss": 2.0816, + "step": 3801 + }, + { + "epoch": 1.1669736034376919, + "grad_norm": 0.575821042060852, + "learning_rate": 9.804394902697075e-05, + "loss": 2.0182, + "step": 3802 + }, + { + "epoch": 1.167280540208717, + "grad_norm": 0.6385466456413269, + "learning_rate": 9.804257210075339e-05, + "loss": 2.0519, + "step": 3803 + }, + { + "epoch": 1.167587476979742, + "grad_norm": 0.7202457785606384, + "learning_rate": 9.804119469975117e-05, + "loss": 1.9871, + "step": 3804 + }, + { + "epoch": 1.1678944137507674, + "grad_norm": 0.696793258190155, + "learning_rate": 9.803981682397772e-05, + "loss": 2.1018, + "step": 3805 + }, + { + "epoch": 1.1682013505217925, + "grad_norm": 0.6217656135559082, + "learning_rate": 9.803843847344662e-05, + "loss": 2.1009, + "step": 3806 + }, + { + "epoch": 1.1685082872928176, + "grad_norm": 0.5296351313591003, + "learning_rate": 9.803705964817153e-05, + "loss": 2.1057, + "step": 3807 + }, + { + "epoch": 1.168815224063843, + "grad_norm": 0.5280975699424744, + "learning_rate": 9.803568034816606e-05, + "loss": 2.0019, + "step": 3808 + }, + { + "epoch": 1.169122160834868, + "grad_norm": 0.4981881380081177, + "learning_rate": 9.803430057344385e-05, + "loss": 1.9918, + "step": 3809 + }, + { + "epoch": 1.1694290976058932, + "grad_norm": 0.43662941455841064, + "learning_rate": 9.803292032401852e-05, + "loss": 2.0273, + "step": 3810 + }, + { + "epoch": 1.1697360343769183, + "grad_norm": 0.5039259791374207, + "learning_rate": 9.80315395999037e-05, + "loss": 2.0475, + "step": 3811 + }, + { + "epoch": 1.1700429711479434, + "grad_norm": 0.4330410957336426, + "learning_rate": 
9.803015840111308e-05, + "loss": 1.99, + "step": 3812 + }, + { + "epoch": 1.1703499079189688, + "grad_norm": 0.4603813886642456, + "learning_rate": 9.802877672766026e-05, + "loss": 2.0288, + "step": 3813 + }, + { + "epoch": 1.1706568446899939, + "grad_norm": 0.45815590023994446, + "learning_rate": 9.802739457955894e-05, + "loss": 2.0026, + "step": 3814 + }, + { + "epoch": 1.170963781461019, + "grad_norm": 0.46995803713798523, + "learning_rate": 9.802601195682275e-05, + "loss": 2.0608, + "step": 3815 + }, + { + "epoch": 1.1712707182320443, + "grad_norm": 0.4511576294898987, + "learning_rate": 9.802462885946536e-05, + "loss": 1.9793, + "step": 3816 + }, + { + "epoch": 1.1715776550030694, + "grad_norm": 0.49079468846321106, + "learning_rate": 9.802324528750044e-05, + "loss": 2.0049, + "step": 3817 + }, + { + "epoch": 1.1718845917740945, + "grad_norm": 0.47245466709136963, + "learning_rate": 9.802186124094166e-05, + "loss": 1.9562, + "step": 3818 + }, + { + "epoch": 1.1721915285451197, + "grad_norm": 0.485575795173645, + "learning_rate": 9.80204767198027e-05, + "loss": 2.0212, + "step": 3819 + }, + { + "epoch": 1.1724984653161448, + "grad_norm": 0.5924440622329712, + "learning_rate": 9.801909172409724e-05, + "loss": 1.9875, + "step": 3820 + }, + { + "epoch": 1.17280540208717, + "grad_norm": 0.48908641934394836, + "learning_rate": 9.801770625383899e-05, + "loss": 1.9778, + "step": 3821 + }, + { + "epoch": 1.1731123388581952, + "grad_norm": 0.4372415840625763, + "learning_rate": 9.80163203090416e-05, + "loss": 1.9368, + "step": 3822 + }, + { + "epoch": 1.1734192756292203, + "grad_norm": 0.5811094641685486, + "learning_rate": 9.801493388971881e-05, + "loss": 2.1293, + "step": 3823 + }, + { + "epoch": 1.1737262124002457, + "grad_norm": 0.516983151435852, + "learning_rate": 9.801354699588428e-05, + "loss": 2.039, + "step": 3824 + }, + { + "epoch": 1.1740331491712708, + "grad_norm": 0.53409343957901, + "learning_rate": 9.801215962755175e-05, + "loss": 2.0294, + "step": 3825 
+ }, + { + "epoch": 1.1743400859422959, + "grad_norm": 0.5703202486038208, + "learning_rate": 9.801077178473492e-05, + "loss": 2.0241, + "step": 3826 + }, + { + "epoch": 1.174647022713321, + "grad_norm": 0.49341192841529846, + "learning_rate": 9.80093834674475e-05, + "loss": 1.9092, + "step": 3827 + }, + { + "epoch": 1.174953959484346, + "grad_norm": 0.46960577368736267, + "learning_rate": 9.800799467570321e-05, + "loss": 1.9994, + "step": 3828 + }, + { + "epoch": 1.1752608962553714, + "grad_norm": 0.468108594417572, + "learning_rate": 9.800660540951577e-05, + "loss": 1.9471, + "step": 3829 + }, + { + "epoch": 1.1755678330263966, + "grad_norm": 0.4133259057998657, + "learning_rate": 9.800521566889893e-05, + "loss": 2.0159, + "step": 3830 + }, + { + "epoch": 1.1758747697974217, + "grad_norm": 0.44991979002952576, + "learning_rate": 9.800382545386641e-05, + "loss": 2.0179, + "step": 3831 + }, + { + "epoch": 1.176181706568447, + "grad_norm": 0.43111294507980347, + "learning_rate": 9.800243476443195e-05, + "loss": 2.1092, + "step": 3832 + }, + { + "epoch": 1.1764886433394721, + "grad_norm": 0.4859693944454193, + "learning_rate": 9.800104360060929e-05, + "loss": 2.0134, + "step": 3833 + }, + { + "epoch": 1.1767955801104972, + "grad_norm": 0.474960058927536, + "learning_rate": 9.799965196241219e-05, + "loss": 2.0288, + "step": 3834 + }, + { + "epoch": 1.1771025168815223, + "grad_norm": 0.5269008278846741, + "learning_rate": 9.79982598498544e-05, + "loss": 2.063, + "step": 3835 + }, + { + "epoch": 1.1774094536525475, + "grad_norm": 0.4923003613948822, + "learning_rate": 9.799686726294965e-05, + "loss": 1.9506, + "step": 3836 + }, + { + "epoch": 1.1777163904235728, + "grad_norm": 0.5355561971664429, + "learning_rate": 9.799547420171175e-05, + "loss": 2.0066, + "step": 3837 + }, + { + "epoch": 1.178023327194598, + "grad_norm": 0.6095728874206543, + "learning_rate": 9.799408066615443e-05, + "loss": 1.9799, + "step": 3838 + }, + { + "epoch": 1.178330263965623, + "grad_norm": 
0.5268104672431946, + "learning_rate": 9.799268665629148e-05, + "loss": 2.0409, + "step": 3839 + }, + { + "epoch": 1.1786372007366483, + "grad_norm": 0.4478130340576172, + "learning_rate": 9.799129217213667e-05, + "loss": 1.9521, + "step": 3840 + }, + { + "epoch": 1.1789441375076735, + "grad_norm": 0.4691653847694397, + "learning_rate": 9.798989721370379e-05, + "loss": 2.0432, + "step": 3841 + }, + { + "epoch": 1.1792510742786986, + "grad_norm": 0.5602376461029053, + "learning_rate": 9.798850178100661e-05, + "loss": 2.0557, + "step": 3842 + }, + { + "epoch": 1.1795580110497237, + "grad_norm": 0.5619905591011047, + "learning_rate": 9.798710587405893e-05, + "loss": 2.0258, + "step": 3843 + }, + { + "epoch": 1.179864947820749, + "grad_norm": 0.5845574736595154, + "learning_rate": 9.798570949287454e-05, + "loss": 2.0637, + "step": 3844 + }, + { + "epoch": 1.1801718845917741, + "grad_norm": 0.5339313745498657, + "learning_rate": 9.798431263746725e-05, + "loss": 2.0265, + "step": 3845 + }, + { + "epoch": 1.1804788213627992, + "grad_norm": 0.45720914006233215, + "learning_rate": 9.798291530785086e-05, + "loss": 1.9745, + "step": 3846 + }, + { + "epoch": 1.1807857581338244, + "grad_norm": 0.5121282935142517, + "learning_rate": 9.798151750403917e-05, + "loss": 2.0427, + "step": 3847 + }, + { + "epoch": 1.1810926949048497, + "grad_norm": 0.48100459575653076, + "learning_rate": 9.7980119226046e-05, + "loss": 2.0307, + "step": 3848 + }, + { + "epoch": 1.1813996316758748, + "grad_norm": 0.4424034655094147, + "learning_rate": 9.797872047388517e-05, + "loss": 1.9697, + "step": 3849 + }, + { + "epoch": 1.1817065684469, + "grad_norm": 0.45154938101768494, + "learning_rate": 9.797732124757051e-05, + "loss": 1.9689, + "step": 3850 + }, + { + "epoch": 1.182013505217925, + "grad_norm": 0.4807071387767792, + "learning_rate": 9.797592154711584e-05, + "loss": 1.9616, + "step": 3851 + }, + { + "epoch": 1.1823204419889504, + "grad_norm": 0.5113904476165771, + "learning_rate": 
9.797452137253498e-05, + "loss": 2.0158, + "step": 3852 + }, + { + "epoch": 1.1826273787599755, + "grad_norm": 0.5456753969192505, + "learning_rate": 9.797312072384179e-05, + "loss": 1.977, + "step": 3853 + }, + { + "epoch": 1.1829343155310006, + "grad_norm": 0.5545704364776611, + "learning_rate": 9.797171960105012e-05, + "loss": 2.0622, + "step": 3854 + }, + { + "epoch": 1.1832412523020257, + "grad_norm": 0.651498556137085, + "learning_rate": 9.797031800417377e-05, + "loss": 2.0739, + "step": 3855 + }, + { + "epoch": 1.183548189073051, + "grad_norm": 0.748968780040741, + "learning_rate": 9.796891593322665e-05, + "loss": 2.0713, + "step": 3856 + }, + { + "epoch": 1.1838551258440762, + "grad_norm": 0.8724157214164734, + "learning_rate": 9.796751338822256e-05, + "loss": 2.0224, + "step": 3857 + }, + { + "epoch": 1.1841620626151013, + "grad_norm": 0.8158844709396362, + "learning_rate": 9.796611036917542e-05, + "loss": 2.0165, + "step": 3858 + }, + { + "epoch": 1.1844689993861264, + "grad_norm": 0.6231487989425659, + "learning_rate": 9.796470687609904e-05, + "loss": 1.9607, + "step": 3859 + }, + { + "epoch": 1.1847759361571517, + "grad_norm": 0.49367067217826843, + "learning_rate": 9.796330290900731e-05, + "loss": 2.0074, + "step": 3860 + }, + { + "epoch": 1.1850828729281768, + "grad_norm": 0.5546393990516663, + "learning_rate": 9.796189846791413e-05, + "loss": 1.9688, + "step": 3861 + }, + { + "epoch": 1.185389809699202, + "grad_norm": 0.5880963802337646, + "learning_rate": 9.796049355283333e-05, + "loss": 2.0192, + "step": 3862 + }, + { + "epoch": 1.185696746470227, + "grad_norm": 0.6064910292625427, + "learning_rate": 9.795908816377884e-05, + "loss": 2.0236, + "step": 3863 + }, + { + "epoch": 1.1860036832412524, + "grad_norm": 0.524116575717926, + "learning_rate": 9.795768230076454e-05, + "loss": 2.0315, + "step": 3864 + }, + { + "epoch": 1.1863106200122775, + "grad_norm": 0.449158251285553, + "learning_rate": 9.79562759638043e-05, + "loss": 1.9423, + "step": 3865 + 
}, + { + "epoch": 1.1866175567833026, + "grad_norm": 0.5623016953468323, + "learning_rate": 9.795486915291203e-05, + "loss": 2.096, + "step": 3866 + }, + { + "epoch": 1.1869244935543277, + "grad_norm": 0.6107217073440552, + "learning_rate": 9.795346186810164e-05, + "loss": 1.9994, + "step": 3867 + }, + { + "epoch": 1.187231430325353, + "grad_norm": 0.5559211373329163, + "learning_rate": 9.795205410938704e-05, + "loss": 2.0138, + "step": 3868 + }, + { + "epoch": 1.1875383670963782, + "grad_norm": 0.5022037029266357, + "learning_rate": 9.795064587678212e-05, + "loss": 2.0835, + "step": 3869 + }, + { + "epoch": 1.1878453038674033, + "grad_norm": 0.5760810971260071, + "learning_rate": 9.794923717030082e-05, + "loss": 2.0839, + "step": 3870 + }, + { + "epoch": 1.1881522406384284, + "grad_norm": 0.559018075466156, + "learning_rate": 9.794782798995706e-05, + "loss": 2.0397, + "step": 3871 + }, + { + "epoch": 1.1884591774094537, + "grad_norm": 0.48842501640319824, + "learning_rate": 9.794641833576477e-05, + "loss": 2.022, + "step": 3872 + }, + { + "epoch": 1.1887661141804788, + "grad_norm": 0.47267377376556396, + "learning_rate": 9.794500820773785e-05, + "loss": 1.9677, + "step": 3873 + }, + { + "epoch": 1.189073050951504, + "grad_norm": 0.5107980966567993, + "learning_rate": 9.794359760589026e-05, + "loss": 2.124, + "step": 3874 + }, + { + "epoch": 1.189379987722529, + "grad_norm": 0.4993875026702881, + "learning_rate": 9.794218653023595e-05, + "loss": 1.9528, + "step": 3875 + }, + { + "epoch": 1.1896869244935544, + "grad_norm": 0.49543896317481995, + "learning_rate": 9.794077498078885e-05, + "loss": 2.0257, + "step": 3876 + }, + { + "epoch": 1.1899938612645795, + "grad_norm": 0.5207403302192688, + "learning_rate": 9.79393629575629e-05, + "loss": 2.0853, + "step": 3877 + }, + { + "epoch": 1.1903007980356046, + "grad_norm": 0.44884833693504333, + "learning_rate": 9.793795046057208e-05, + "loss": 1.9366, + "step": 3878 + }, + { + "epoch": 1.1906077348066297, + "grad_norm": 
0.47921934723854065, + "learning_rate": 9.793653748983033e-05, + "loss": 2.0614, + "step": 3879 + }, + { + "epoch": 1.190914671577655, + "grad_norm": 0.5371566414833069, + "learning_rate": 9.793512404535163e-05, + "loss": 2.0433, + "step": 3880 + }, + { + "epoch": 1.1912216083486802, + "grad_norm": 0.48760104179382324, + "learning_rate": 9.793371012714994e-05, + "loss": 2.0061, + "step": 3881 + }, + { + "epoch": 1.1915285451197053, + "grad_norm": 0.47291669249534607, + "learning_rate": 9.793229573523922e-05, + "loss": 2.0661, + "step": 3882 + }, + { + "epoch": 1.1918354818907306, + "grad_norm": 0.5348502397537231, + "learning_rate": 9.793088086963347e-05, + "loss": 2.0131, + "step": 3883 + }, + { + "epoch": 1.1921424186617557, + "grad_norm": 0.6291812062263489, + "learning_rate": 9.792946553034666e-05, + "loss": 2.0312, + "step": 3884 + }, + { + "epoch": 1.1924493554327809, + "grad_norm": 0.5620503425598145, + "learning_rate": 9.792804971739276e-05, + "loss": 2.0429, + "step": 3885 + }, + { + "epoch": 1.192756292203806, + "grad_norm": 0.4984607696533203, + "learning_rate": 9.792663343078581e-05, + "loss": 2.0183, + "step": 3886 + }, + { + "epoch": 1.193063228974831, + "grad_norm": 0.5867961645126343, + "learning_rate": 9.792521667053975e-05, + "loss": 2.0609, + "step": 3887 + }, + { + "epoch": 1.1933701657458564, + "grad_norm": 0.5819169282913208, + "learning_rate": 9.792379943666863e-05, + "loss": 1.9412, + "step": 3888 + }, + { + "epoch": 1.1936771025168815, + "grad_norm": 0.6232548952102661, + "learning_rate": 9.792238172918643e-05, + "loss": 2.0607, + "step": 3889 + }, + { + "epoch": 1.1939840392879066, + "grad_norm": 0.5859619379043579, + "learning_rate": 9.792096354810716e-05, + "loss": 2.0718, + "step": 3890 + }, + { + "epoch": 1.194290976058932, + "grad_norm": 0.47209057211875916, + "learning_rate": 9.791954489344485e-05, + "loss": 1.9872, + "step": 3891 + }, + { + "epoch": 1.194597912829957, + "grad_norm": 0.5183662176132202, + "learning_rate": 
9.79181257652135e-05, + "loss": 2.0782, + "step": 3892 + }, + { + "epoch": 1.1949048496009822, + "grad_norm": 0.551873505115509, + "learning_rate": 9.791670616342715e-05, + "loss": 2.0477, + "step": 3893 + }, + { + "epoch": 1.1952117863720073, + "grad_norm": 0.47254955768585205, + "learning_rate": 9.791528608809984e-05, + "loss": 1.9859, + "step": 3894 + }, + { + "epoch": 1.1955187231430324, + "grad_norm": 0.45482897758483887, + "learning_rate": 9.791386553924556e-05, + "loss": 1.9939, + "step": 3895 + }, + { + "epoch": 1.1958256599140578, + "grad_norm": 0.4687066078186035, + "learning_rate": 9.79124445168784e-05, + "loss": 1.9982, + "step": 3896 + }, + { + "epoch": 1.1961325966850829, + "grad_norm": 0.4855460524559021, + "learning_rate": 9.791102302101236e-05, + "loss": 1.9667, + "step": 3897 + }, + { + "epoch": 1.196439533456108, + "grad_norm": 0.48152467608451843, + "learning_rate": 9.790960105166153e-05, + "loss": 1.9914, + "step": 3898 + }, + { + "epoch": 1.1967464702271333, + "grad_norm": 0.48487406969070435, + "learning_rate": 9.790817860883993e-05, + "loss": 1.9978, + "step": 3899 + }, + { + "epoch": 1.1970534069981584, + "grad_norm": 0.47665563225746155, + "learning_rate": 9.790675569256162e-05, + "loss": 1.9995, + "step": 3900 + }, + { + "epoch": 1.1973603437691835, + "grad_norm": 0.48938530683517456, + "learning_rate": 9.790533230284069e-05, + "loss": 2.0461, + "step": 3901 + }, + { + "epoch": 1.1976672805402087, + "grad_norm": 0.6336411237716675, + "learning_rate": 9.790390843969119e-05, + "loss": 2.0003, + "step": 3902 + }, + { + "epoch": 1.1979742173112338, + "grad_norm": 0.6946616172790527, + "learning_rate": 9.790248410312717e-05, + "loss": 1.9979, + "step": 3903 + }, + { + "epoch": 1.198281154082259, + "grad_norm": 0.7829384803771973, + "learning_rate": 9.790105929316274e-05, + "loss": 2.015, + "step": 3904 + }, + { + "epoch": 1.1985880908532842, + "grad_norm": 0.6874059438705444, + "learning_rate": 9.789963400981197e-05, + "loss": 1.9887, + 
"step": 3905 + }, + { + "epoch": 1.1988950276243093, + "grad_norm": 0.6074720025062561, + "learning_rate": 9.789820825308893e-05, + "loss": 2.0287, + "step": 3906 + }, + { + "epoch": 1.1992019643953347, + "grad_norm": 0.49311673641204834, + "learning_rate": 9.789678202300774e-05, + "loss": 1.9846, + "step": 3907 + }, + { + "epoch": 1.1995089011663598, + "grad_norm": 0.5266487002372742, + "learning_rate": 9.789535531958244e-05, + "loss": 2.017, + "step": 3908 + }, + { + "epoch": 1.1998158379373849, + "grad_norm": 0.6170570850372314, + "learning_rate": 9.789392814282721e-05, + "loss": 2.0615, + "step": 3909 + }, + { + "epoch": 1.20012277470841, + "grad_norm": 0.5820409059524536, + "learning_rate": 9.789250049275609e-05, + "loss": 2.0459, + "step": 3910 + }, + { + "epoch": 1.2004297114794351, + "grad_norm": 0.5220739841461182, + "learning_rate": 9.78910723693832e-05, + "loss": 2.0843, + "step": 3911 + }, + { + "epoch": 1.2007366482504604, + "grad_norm": 0.5884750485420227, + "learning_rate": 9.788964377272267e-05, + "loss": 2.1068, + "step": 3912 + }, + { + "epoch": 1.2010435850214856, + "grad_norm": 0.5634950995445251, + "learning_rate": 9.788821470278861e-05, + "loss": 2.0206, + "step": 3913 + }, + { + "epoch": 1.2013505217925107, + "grad_norm": 0.5219514966011047, + "learning_rate": 9.788678515959517e-05, + "loss": 2.0802, + "step": 3914 + }, + { + "epoch": 1.201657458563536, + "grad_norm": 0.5870078206062317, + "learning_rate": 9.788535514315642e-05, + "loss": 2.0149, + "step": 3915 + }, + { + "epoch": 1.2019643953345611, + "grad_norm": 0.4850577414035797, + "learning_rate": 9.788392465348653e-05, + "loss": 2.0424, + "step": 3916 + }, + { + "epoch": 1.2022713321055862, + "grad_norm": 0.5354881882667542, + "learning_rate": 9.788249369059964e-05, + "loss": 2.0822, + "step": 3917 + }, + { + "epoch": 1.2025782688766113, + "grad_norm": 0.5817529559135437, + "learning_rate": 9.788106225450988e-05, + "loss": 2.0384, + "step": 3918 + }, + { + "epoch": 1.2028852056476367, 
+ "grad_norm": 0.5685575008392334, + "learning_rate": 9.78796303452314e-05, + "loss": 1.9777, + "step": 3919 + }, + { + "epoch": 1.2031921424186618, + "grad_norm": 0.5086472034454346, + "learning_rate": 9.787819796277835e-05, + "loss": 1.9109, + "step": 3920 + }, + { + "epoch": 1.203499079189687, + "grad_norm": 0.45905008912086487, + "learning_rate": 9.787676510716488e-05, + "loss": 1.9945, + "step": 3921 + }, + { + "epoch": 1.203806015960712, + "grad_norm": 0.6052672863006592, + "learning_rate": 9.787533177840516e-05, + "loss": 2.0873, + "step": 3922 + }, + { + "epoch": 1.2041129527317374, + "grad_norm": 0.636320173740387, + "learning_rate": 9.787389797651334e-05, + "loss": 1.954, + "step": 3923 + }, + { + "epoch": 1.2044198895027625, + "grad_norm": 0.5775459408760071, + "learning_rate": 9.78724637015036e-05, + "loss": 1.9632, + "step": 3924 + }, + { + "epoch": 1.2047268262737876, + "grad_norm": 0.4593936502933502, + "learning_rate": 9.787102895339013e-05, + "loss": 1.948, + "step": 3925 + }, + { + "epoch": 1.2050337630448127, + "grad_norm": 0.4568643867969513, + "learning_rate": 9.78695937321871e-05, + "loss": 1.977, + "step": 3926 + }, + { + "epoch": 1.205340699815838, + "grad_norm": 0.6079357266426086, + "learning_rate": 9.786815803790867e-05, + "loss": 1.9738, + "step": 3927 + }, + { + "epoch": 1.2056476365868631, + "grad_norm": 0.5991626977920532, + "learning_rate": 9.786672187056905e-05, + "loss": 1.9603, + "step": 3928 + }, + { + "epoch": 1.2059545733578882, + "grad_norm": 0.4844282865524292, + "learning_rate": 9.786528523018242e-05, + "loss": 1.9739, + "step": 3929 + }, + { + "epoch": 1.2062615101289134, + "grad_norm": 0.43694475293159485, + "learning_rate": 9.786384811676298e-05, + "loss": 1.957, + "step": 3930 + }, + { + "epoch": 1.2065684468999387, + "grad_norm": 0.5742451548576355, + "learning_rate": 9.786241053032496e-05, + "loss": 1.9872, + "step": 3931 + }, + { + "epoch": 1.2068753836709638, + "grad_norm": 0.6246824860572815, + "learning_rate": 
9.786097247088255e-05, + "loss": 2.0747, + "step": 3932 + }, + { + "epoch": 1.207182320441989, + "grad_norm": 0.5364731550216675, + "learning_rate": 9.785953393844996e-05, + "loss": 1.9793, + "step": 3933 + }, + { + "epoch": 1.207489257213014, + "grad_norm": 0.42909273505210876, + "learning_rate": 9.785809493304139e-05, + "loss": 1.9959, + "step": 3934 + }, + { + "epoch": 1.2077961939840394, + "grad_norm": 0.43952879309654236, + "learning_rate": 9.785665545467108e-05, + "loss": 2.0019, + "step": 3935 + }, + { + "epoch": 1.2081031307550645, + "grad_norm": 0.45972180366516113, + "learning_rate": 9.785521550335323e-05, + "loss": 1.9504, + "step": 3936 + }, + { + "epoch": 1.2084100675260896, + "grad_norm": 0.5592246651649475, + "learning_rate": 9.785377507910212e-05, + "loss": 2.0214, + "step": 3937 + }, + { + "epoch": 1.2087170042971147, + "grad_norm": 0.6084285378456116, + "learning_rate": 9.785233418193196e-05, + "loss": 2.08, + "step": 3938 + }, + { + "epoch": 1.20902394106814, + "grad_norm": 0.5370670557022095, + "learning_rate": 9.785089281185698e-05, + "loss": 2.0877, + "step": 3939 + }, + { + "epoch": 1.2093308778391652, + "grad_norm": 0.466501921415329, + "learning_rate": 9.784945096889143e-05, + "loss": 1.9795, + "step": 3940 + }, + { + "epoch": 1.2096378146101903, + "grad_norm": 0.48617517948150635, + "learning_rate": 9.784800865304954e-05, + "loss": 2.0099, + "step": 3941 + }, + { + "epoch": 1.2099447513812154, + "grad_norm": 0.528110921382904, + "learning_rate": 9.78465658643456e-05, + "loss": 2.0597, + "step": 3942 + }, + { + "epoch": 1.2102516881522407, + "grad_norm": 0.47355538606643677, + "learning_rate": 9.784512260279385e-05, + "loss": 2.0145, + "step": 3943 + }, + { + "epoch": 1.2105586249232658, + "grad_norm": 0.46970823407173157, + "learning_rate": 9.784367886840856e-05, + "loss": 2.0533, + "step": 3944 + }, + { + "epoch": 1.210865561694291, + "grad_norm": 0.41206037998199463, + "learning_rate": 9.784223466120399e-05, + "loss": 1.9226, + "step": 
3945 + }, + { + "epoch": 1.211172498465316, + "grad_norm": 0.4298155605792999, + "learning_rate": 9.784078998119442e-05, + "loss": 2.0686, + "step": 3946 + }, + { + "epoch": 1.2114794352363414, + "grad_norm": 0.4616359770298004, + "learning_rate": 9.783934482839412e-05, + "loss": 2.0063, + "step": 3947 + }, + { + "epoch": 1.2117863720073665, + "grad_norm": 0.476726233959198, + "learning_rate": 9.783789920281737e-05, + "loss": 1.9868, + "step": 3948 + }, + { + "epoch": 1.2120933087783916, + "grad_norm": 0.5075610876083374, + "learning_rate": 9.783645310447846e-05, + "loss": 2.1019, + "step": 3949 + }, + { + "epoch": 1.212400245549417, + "grad_norm": 0.49806225299835205, + "learning_rate": 9.78350065333917e-05, + "loss": 2.0503, + "step": 3950 + }, + { + "epoch": 1.212707182320442, + "grad_norm": 0.5278452634811401, + "learning_rate": 9.783355948957134e-05, + "loss": 2.0513, + "step": 3951 + }, + { + "epoch": 1.2130141190914672, + "grad_norm": 0.5634627938270569, + "learning_rate": 9.783211197303174e-05, + "loss": 2.1135, + "step": 3952 + }, + { + "epoch": 1.2133210558624923, + "grad_norm": 0.5152999758720398, + "learning_rate": 9.783066398378715e-05, + "loss": 2.0392, + "step": 3953 + }, + { + "epoch": 1.2136279926335174, + "grad_norm": 0.48095864057540894, + "learning_rate": 9.782921552185191e-05, + "loss": 1.982, + "step": 3954 + }, + { + "epoch": 1.2139349294045427, + "grad_norm": 0.47377893328666687, + "learning_rate": 9.782776658724034e-05, + "loss": 1.9538, + "step": 3955 + }, + { + "epoch": 1.2142418661755678, + "grad_norm": 0.5260181427001953, + "learning_rate": 9.782631717996675e-05, + "loss": 2.1197, + "step": 3956 + }, + { + "epoch": 1.214548802946593, + "grad_norm": 0.5640038251876831, + "learning_rate": 9.782486730004544e-05, + "loss": 2.0338, + "step": 3957 + }, + { + "epoch": 1.2148557397176183, + "grad_norm": 0.5091645121574402, + "learning_rate": 9.782341694749078e-05, + "loss": 1.9921, + "step": 3958 + }, + { + "epoch": 1.2151626764886434, + 
"grad_norm": 0.48285624384880066, + "learning_rate": 9.782196612231706e-05, + "loss": 2.0358, + "step": 3959 + }, + { + "epoch": 1.2154696132596685, + "grad_norm": 0.5013573169708252, + "learning_rate": 9.782051482453867e-05, + "loss": 1.9378, + "step": 3960 + }, + { + "epoch": 1.2157765500306936, + "grad_norm": 0.42000052332878113, + "learning_rate": 9.781906305416991e-05, + "loss": 1.9232, + "step": 3961 + }, + { + "epoch": 1.2160834868017187, + "grad_norm": 0.4651196599006653, + "learning_rate": 9.781761081122514e-05, + "loss": 2.0244, + "step": 3962 + }, + { + "epoch": 1.216390423572744, + "grad_norm": 0.48081469535827637, + "learning_rate": 9.781615809571871e-05, + "loss": 1.938, + "step": 3963 + }, + { + "epoch": 1.2166973603437692, + "grad_norm": 0.4692462086677551, + "learning_rate": 9.7814704907665e-05, + "loss": 1.9592, + "step": 3964 + }, + { + "epoch": 1.2170042971147943, + "grad_norm": 0.5545635223388672, + "learning_rate": 9.781325124707832e-05, + "loss": 2.0882, + "step": 3965 + }, + { + "epoch": 1.2173112338858196, + "grad_norm": 0.47801801562309265, + "learning_rate": 9.78117971139731e-05, + "loss": 2.0127, + "step": 3966 + }, + { + "epoch": 1.2176181706568447, + "grad_norm": 0.4705824851989746, + "learning_rate": 9.781034250836364e-05, + "loss": 2.0659, + "step": 3967 + }, + { + "epoch": 1.2179251074278699, + "grad_norm": 0.4757092297077179, + "learning_rate": 9.78088874302644e-05, + "loss": 1.9177, + "step": 3968 + }, + { + "epoch": 1.218232044198895, + "grad_norm": 0.4563291370868683, + "learning_rate": 9.780743187968968e-05, + "loss": 1.991, + "step": 3969 + }, + { + "epoch": 1.21853898096992, + "grad_norm": 0.4641762375831604, + "learning_rate": 9.78059758566539e-05, + "loss": 2.0357, + "step": 3970 + }, + { + "epoch": 1.2188459177409454, + "grad_norm": 0.510754406452179, + "learning_rate": 9.780451936117145e-05, + "loss": 2.0754, + "step": 3971 + }, + { + "epoch": 1.2191528545119705, + "grad_norm": 0.5595460534095764, + "learning_rate": 
9.780306239325671e-05, + "loss": 2.0449, + "step": 3972 + }, + { + "epoch": 1.2194597912829956, + "grad_norm": 0.5778231620788574, + "learning_rate": 9.780160495292412e-05, + "loss": 2.0187, + "step": 3973 + }, + { + "epoch": 1.219766728054021, + "grad_norm": 0.5098022818565369, + "learning_rate": 9.780014704018803e-05, + "loss": 1.9881, + "step": 3974 + }, + { + "epoch": 1.220073664825046, + "grad_norm": 0.46725937724113464, + "learning_rate": 9.779868865506288e-05, + "loss": 1.9929, + "step": 3975 + }, + { + "epoch": 1.2203806015960712, + "grad_norm": 0.48517540097236633, + "learning_rate": 9.779722979756304e-05, + "loss": 1.9446, + "step": 3976 + }, + { + "epoch": 1.2206875383670963, + "grad_norm": 0.5013269186019897, + "learning_rate": 9.7795770467703e-05, + "loss": 2.0256, + "step": 3977 + }, + { + "epoch": 1.2209944751381214, + "grad_norm": 0.4918982982635498, + "learning_rate": 9.779431066549713e-05, + "loss": 1.9732, + "step": 3978 + }, + { + "epoch": 1.2213014119091468, + "grad_norm": 0.45646655559539795, + "learning_rate": 9.779285039095987e-05, + "loss": 1.9672, + "step": 3979 + }, + { + "epoch": 1.2216083486801719, + "grad_norm": 0.4712901711463928, + "learning_rate": 9.779138964410565e-05, + "loss": 2.0074, + "step": 3980 + }, + { + "epoch": 1.221915285451197, + "grad_norm": 0.4901394844055176, + "learning_rate": 9.77899284249489e-05, + "loss": 2.0073, + "step": 3981 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.579247772693634, + "learning_rate": 9.778846673350407e-05, + "loss": 2.0983, + "step": 3982 + }, + { + "epoch": 1.2225291589932474, + "grad_norm": 0.6108444929122925, + "learning_rate": 9.77870045697856e-05, + "loss": 2.0268, + "step": 3983 + }, + { + "epoch": 1.2228360957642725, + "grad_norm": 0.5592121481895447, + "learning_rate": 9.778554193380796e-05, + "loss": 2.0549, + "step": 3984 + }, + { + "epoch": 1.2231430325352977, + "grad_norm": 0.538088858127594, + "learning_rate": 9.778407882558556e-05, + "loss": 1.9398, + "step": 3985 
+ }, + { + "epoch": 1.223449969306323, + "grad_norm": 0.5928295850753784, + "learning_rate": 9.77826152451329e-05, + "loss": 2.0341, + "step": 3986 + }, + { + "epoch": 1.223756906077348, + "grad_norm": 0.566687822341919, + "learning_rate": 9.778115119246442e-05, + "loss": 2.0629, + "step": 3987 + }, + { + "epoch": 1.2240638428483732, + "grad_norm": 0.7019027471542358, + "learning_rate": 9.777968666759461e-05, + "loss": 1.9979, + "step": 3988 + }, + { + "epoch": 1.2243707796193983, + "grad_norm": 0.7198969721794128, + "learning_rate": 9.777822167053793e-05, + "loss": 1.9898, + "step": 3989 + }, + { + "epoch": 1.2246777163904237, + "grad_norm": 0.6319006085395813, + "learning_rate": 9.777675620130887e-05, + "loss": 1.9591, + "step": 3990 + }, + { + "epoch": 1.2249846531614488, + "grad_norm": 0.5372903347015381, + "learning_rate": 9.777529025992187e-05, + "loss": 1.9605, + "step": 3991 + }, + { + "epoch": 1.225291589932474, + "grad_norm": 0.47436487674713135, + "learning_rate": 9.777382384639147e-05, + "loss": 1.9667, + "step": 3992 + }, + { + "epoch": 1.225598526703499, + "grad_norm": 0.5885797739028931, + "learning_rate": 9.777235696073214e-05, + "loss": 2.0363, + "step": 3993 + }, + { + "epoch": 1.2259054634745243, + "grad_norm": 0.6333138346672058, + "learning_rate": 9.777088960295838e-05, + "loss": 1.9352, + "step": 3994 + }, + { + "epoch": 1.2262124002455494, + "grad_norm": 0.6364251971244812, + "learning_rate": 9.776942177308468e-05, + "loss": 1.9577, + "step": 3995 + }, + { + "epoch": 1.2265193370165746, + "grad_norm": 0.5114668607711792, + "learning_rate": 9.776795347112557e-05, + "loss": 2.0241, + "step": 3996 + }, + { + "epoch": 1.2268262737875997, + "grad_norm": 0.6139995455741882, + "learning_rate": 9.776648469709556e-05, + "loss": 1.9847, + "step": 3997 + }, + { + "epoch": 1.227133210558625, + "grad_norm": 0.6104671955108643, + "learning_rate": 9.776501545100911e-05, + "loss": 1.9311, + "step": 3998 + }, + { + "epoch": 1.2274401473296501, + "grad_norm": 
0.5099297761917114, + "learning_rate": 9.776354573288081e-05, + "loss": 2.0877, + "step": 3999 + }, + { + "epoch": 1.2277470841006752, + "grad_norm": 0.48199233412742615, + "learning_rate": 9.776207554272516e-05, + "loss": 1.9802, + "step": 4000 + }, + { + "epoch": 1.2280540208717003, + "grad_norm": 0.5323067307472229, + "learning_rate": 9.776060488055667e-05, + "loss": 2.0278, + "step": 4001 + }, + { + "epoch": 1.2283609576427257, + "grad_norm": 0.49086472392082214, + "learning_rate": 9.775913374638988e-05, + "loss": 2.0242, + "step": 4002 + }, + { + "epoch": 1.2286678944137508, + "grad_norm": 0.4812946319580078, + "learning_rate": 9.775766214023936e-05, + "loss": 1.9762, + "step": 4003 + }, + { + "epoch": 1.228974831184776, + "grad_norm": 0.44118809700012207, + "learning_rate": 9.775619006211962e-05, + "loss": 1.9242, + "step": 4004 + }, + { + "epoch": 1.229281767955801, + "grad_norm": 0.4507352113723755, + "learning_rate": 9.775471751204522e-05, + "loss": 2.0015, + "step": 4005 + }, + { + "epoch": 1.2295887047268264, + "grad_norm": 0.4620691239833832, + "learning_rate": 9.775324449003072e-05, + "loss": 2.0269, + "step": 4006 + }, + { + "epoch": 1.2298956414978515, + "grad_norm": 0.5053025484085083, + "learning_rate": 9.775177099609065e-05, + "loss": 1.9764, + "step": 4007 + }, + { + "epoch": 1.2302025782688766, + "grad_norm": 0.5113483667373657, + "learning_rate": 9.775029703023961e-05, + "loss": 2.0583, + "step": 4008 + }, + { + "epoch": 1.2305095150399017, + "grad_norm": 0.517400324344635, + "learning_rate": 9.774882259249214e-05, + "loss": 2.0918, + "step": 4009 + }, + { + "epoch": 1.230816451810927, + "grad_norm": 0.5575035214424133, + "learning_rate": 9.774734768286282e-05, + "loss": 2.0573, + "step": 4010 + }, + { + "epoch": 1.2311233885819521, + "grad_norm": 0.5556582808494568, + "learning_rate": 9.774587230136622e-05, + "loss": 1.9612, + "step": 4011 + }, + { + "epoch": 1.2314303253529773, + "grad_norm": 0.541752815246582, + "learning_rate": 
9.774439644801693e-05, + "loss": 2.0165, + "step": 4012 + }, + { + "epoch": 1.2317372621240024, + "grad_norm": 0.46944886445999146, + "learning_rate": 9.774292012282953e-05, + "loss": 2.0068, + "step": 4013 + }, + { + "epoch": 1.2320441988950277, + "grad_norm": 0.5507385730743408, + "learning_rate": 9.77414433258186e-05, + "loss": 2.0092, + "step": 4014 + }, + { + "epoch": 1.2323511356660528, + "grad_norm": 0.550862193107605, + "learning_rate": 9.773996605699875e-05, + "loss": 1.9887, + "step": 4015 + }, + { + "epoch": 1.232658072437078, + "grad_norm": 0.5281004905700684, + "learning_rate": 9.77384883163846e-05, + "loss": 2.0214, + "step": 4016 + }, + { + "epoch": 1.232965009208103, + "grad_norm": 0.5682541131973267, + "learning_rate": 9.77370101039907e-05, + "loss": 2.0021, + "step": 4017 + }, + { + "epoch": 1.2332719459791284, + "grad_norm": 0.5083168745040894, + "learning_rate": 9.77355314198317e-05, + "loss": 1.9589, + "step": 4018 + }, + { + "epoch": 1.2335788827501535, + "grad_norm": 0.48763957619667053, + "learning_rate": 9.773405226392218e-05, + "loss": 1.9517, + "step": 4019 + }, + { + "epoch": 1.2338858195211786, + "grad_norm": 0.4721868634223938, + "learning_rate": 9.77325726362768e-05, + "loss": 1.959, + "step": 4020 + }, + { + "epoch": 1.2341927562922037, + "grad_norm": 0.5072606205940247, + "learning_rate": 9.773109253691016e-05, + "loss": 2.0252, + "step": 4021 + }, + { + "epoch": 1.234499693063229, + "grad_norm": 0.483260840177536, + "learning_rate": 9.772961196583686e-05, + "loss": 2.0205, + "step": 4022 + }, + { + "epoch": 1.2348066298342542, + "grad_norm": 0.4468609392642975, + "learning_rate": 9.772813092307158e-05, + "loss": 2.0182, + "step": 4023 + }, + { + "epoch": 1.2351135666052793, + "grad_norm": 0.4950753152370453, + "learning_rate": 9.772664940862893e-05, + "loss": 2.0276, + "step": 4024 + }, + { + "epoch": 1.2354205033763046, + "grad_norm": 0.45740416646003723, + "learning_rate": 9.772516742252356e-05, + "loss": 1.9519, + "step": 4025 + 
}, + { + "epoch": 1.2357274401473297, + "grad_norm": 0.409072607755661, + "learning_rate": 9.772368496477011e-05, + "loss": 1.9441, + "step": 4026 + }, + { + "epoch": 1.2360343769183548, + "grad_norm": 0.44857287406921387, + "learning_rate": 9.772220203538325e-05, + "loss": 1.9941, + "step": 4027 + }, + { + "epoch": 1.23634131368938, + "grad_norm": 0.4610998034477234, + "learning_rate": 9.77207186343776e-05, + "loss": 1.9855, + "step": 4028 + }, + { + "epoch": 1.236648250460405, + "grad_norm": 0.4809660017490387, + "learning_rate": 9.771923476176784e-05, + "loss": 1.9596, + "step": 4029 + }, + { + "epoch": 1.2369551872314304, + "grad_norm": 0.5011657476425171, + "learning_rate": 9.771775041756865e-05, + "loss": 1.9537, + "step": 4030 + }, + { + "epoch": 1.2372621240024555, + "grad_norm": 0.476001501083374, + "learning_rate": 9.771626560179465e-05, + "loss": 1.9447, + "step": 4031 + }, + { + "epoch": 1.2375690607734806, + "grad_norm": 0.4733816385269165, + "learning_rate": 9.771478031446057e-05, + "loss": 2.08, + "step": 4032 + }, + { + "epoch": 1.237875997544506, + "grad_norm": 0.4763995409011841, + "learning_rate": 9.771329455558108e-05, + "loss": 1.9483, + "step": 4033 + }, + { + "epoch": 1.238182934315531, + "grad_norm": 0.4906281530857086, + "learning_rate": 9.771180832517082e-05, + "loss": 1.9619, + "step": 4034 + }, + { + "epoch": 1.2384898710865562, + "grad_norm": 0.48713672161102295, + "learning_rate": 9.77103216232445e-05, + "loss": 1.9753, + "step": 4035 + }, + { + "epoch": 1.2387968078575813, + "grad_norm": 0.5214180946350098, + "learning_rate": 9.770883444981683e-05, + "loss": 2.0407, + "step": 4036 + }, + { + "epoch": 1.2391037446286064, + "grad_norm": 0.5161129236221313, + "learning_rate": 9.77073468049025e-05, + "loss": 2.0298, + "step": 4037 + }, + { + "epoch": 1.2394106813996317, + "grad_norm": 0.5041607022285461, + "learning_rate": 9.770585868851621e-05, + "loss": 1.9898, + "step": 4038 + }, + { + "epoch": 1.2397176181706568, + "grad_norm": 
0.5076795220375061, + "learning_rate": 9.770437010067264e-05, + "loss": 1.9899, + "step": 4039 + }, + { + "epoch": 1.240024554941682, + "grad_norm": 0.47992074489593506, + "learning_rate": 9.770288104138654e-05, + "loss": 1.9923, + "step": 4040 + }, + { + "epoch": 1.2403314917127073, + "grad_norm": 0.4655405580997467, + "learning_rate": 9.770139151067261e-05, + "loss": 2.0082, + "step": 4041 + }, + { + "epoch": 1.2406384284837324, + "grad_norm": 0.499953031539917, + "learning_rate": 9.769990150854558e-05, + "loss": 2.0412, + "step": 4042 + }, + { + "epoch": 1.2409453652547575, + "grad_norm": 0.5288184285163879, + "learning_rate": 9.769841103502016e-05, + "loss": 2.0163, + "step": 4043 + }, + { + "epoch": 1.2412523020257826, + "grad_norm": 0.6660463809967041, + "learning_rate": 9.769692009011107e-05, + "loss": 2.1644, + "step": 4044 + }, + { + "epoch": 1.2415592387968077, + "grad_norm": 0.7020677328109741, + "learning_rate": 9.769542867383306e-05, + "loss": 1.9921, + "step": 4045 + }, + { + "epoch": 1.241866175567833, + "grad_norm": 0.8394366502761841, + "learning_rate": 9.769393678620089e-05, + "loss": 2.0099, + "step": 4046 + }, + { + "epoch": 1.2421731123388582, + "grad_norm": 0.9541008472442627, + "learning_rate": 9.769244442722927e-05, + "loss": 2.0035, + "step": 4047 + }, + { + "epoch": 1.2424800491098833, + "grad_norm": 0.8454573750495911, + "learning_rate": 9.769095159693296e-05, + "loss": 2.0075, + "step": 4048 + }, + { + "epoch": 1.2427869858809086, + "grad_norm": 0.6634951233863831, + "learning_rate": 9.768945829532672e-05, + "loss": 2.0352, + "step": 4049 + }, + { + "epoch": 1.2430939226519337, + "grad_norm": 0.5453166365623474, + "learning_rate": 9.76879645224253e-05, + "loss": 2.0259, + "step": 4050 + }, + { + "epoch": 1.2434008594229589, + "grad_norm": 0.8018995523452759, + "learning_rate": 9.768647027824344e-05, + "loss": 2.0175, + "step": 4051 + }, + { + "epoch": 1.243707796193984, + "grad_norm": 0.8518994450569153, + "learning_rate": 
9.768497556279596e-05, + "loss": 1.986, + "step": 4052 + }, + { + "epoch": 1.244014732965009, + "grad_norm": 0.670764684677124, + "learning_rate": 9.76834803760976e-05, + "loss": 1.9779, + "step": 4053 + }, + { + "epoch": 1.2443216697360344, + "grad_norm": 0.5042433142662048, + "learning_rate": 9.768198471816312e-05, + "loss": 1.9808, + "step": 4054 + }, + { + "epoch": 1.2446286065070595, + "grad_norm": 0.45487603545188904, + "learning_rate": 9.768048858900733e-05, + "loss": 2.011, + "step": 4055 + }, + { + "epoch": 1.2449355432780846, + "grad_norm": 0.5012104511260986, + "learning_rate": 9.767899198864502e-05, + "loss": 1.9945, + "step": 4056 + }, + { + "epoch": 1.24524248004911, + "grad_norm": 0.6275805234909058, + "learning_rate": 9.767749491709095e-05, + "loss": 2.0397, + "step": 4057 + }, + { + "epoch": 1.245549416820135, + "grad_norm": 0.601513683795929, + "learning_rate": 9.767599737435993e-05, + "loss": 2.0201, + "step": 4058 + }, + { + "epoch": 1.2458563535911602, + "grad_norm": 0.531112551689148, + "learning_rate": 9.767449936046678e-05, + "loss": 2.0449, + "step": 4059 + }, + { + "epoch": 1.2461632903621853, + "grad_norm": 0.48515528440475464, + "learning_rate": 9.767300087542626e-05, + "loss": 2.0318, + "step": 4060 + }, + { + "epoch": 1.2464702271332107, + "grad_norm": 0.49292388558387756, + "learning_rate": 9.767150191925321e-05, + "loss": 2.0004, + "step": 4061 + }, + { + "epoch": 1.2467771639042358, + "grad_norm": 0.6046907901763916, + "learning_rate": 9.767000249196242e-05, + "loss": 2.0141, + "step": 4062 + }, + { + "epoch": 1.2470841006752609, + "grad_norm": 0.5311875939369202, + "learning_rate": 9.766850259356876e-05, + "loss": 1.9909, + "step": 4063 + }, + { + "epoch": 1.247391037446286, + "grad_norm": 0.535664975643158, + "learning_rate": 9.7667002224087e-05, + "loss": 2.07, + "step": 4064 + }, + { + "epoch": 1.2476979742173113, + "grad_norm": 0.594886839389801, + "learning_rate": 9.766550138353199e-05, + "loss": 1.9646, + "step": 4065 + }, + 
{ + "epoch": 1.2480049109883364, + "grad_norm": 0.6726763844490051, + "learning_rate": 9.766400007191856e-05, + "loss": 1.9778, + "step": 4066 + }, + { + "epoch": 1.2483118477593615, + "grad_norm": 0.6045297384262085, + "learning_rate": 9.766249828926154e-05, + "loss": 2.0215, + "step": 4067 + }, + { + "epoch": 1.2486187845303867, + "grad_norm": 0.56207275390625, + "learning_rate": 9.766099603557576e-05, + "loss": 2.0252, + "step": 4068 + }, + { + "epoch": 1.248925721301412, + "grad_norm": 0.6623022556304932, + "learning_rate": 9.765949331087611e-05, + "loss": 1.975, + "step": 4069 + }, + { + "epoch": 1.249232658072437, + "grad_norm": 0.6274738311767578, + "learning_rate": 9.76579901151774e-05, + "loss": 2.037, + "step": 4070 + }, + { + "epoch": 1.2495395948434622, + "grad_norm": 0.5161643028259277, + "learning_rate": 9.76564864484945e-05, + "loss": 1.969, + "step": 4071 + }, + { + "epoch": 1.2498465316144873, + "grad_norm": 0.5624449849128723, + "learning_rate": 9.765498231084227e-05, + "loss": 2.0322, + "step": 4072 + }, + { + "epoch": 1.2501534683855127, + "grad_norm": 0.6198796629905701, + "learning_rate": 9.765347770223556e-05, + "loss": 1.986, + "step": 4073 + }, + { + "epoch": 1.2504604051565378, + "grad_norm": 0.5928165316581726, + "learning_rate": 9.765197262268927e-05, + "loss": 1.9886, + "step": 4074 + }, + { + "epoch": 1.250767341927563, + "grad_norm": 0.476484090089798, + "learning_rate": 9.765046707221825e-05, + "loss": 2.0476, + "step": 4075 + }, + { + "epoch": 1.2510742786985882, + "grad_norm": 0.5001220703125, + "learning_rate": 9.764896105083738e-05, + "loss": 1.9222, + "step": 4076 + }, + { + "epoch": 1.2513812154696133, + "grad_norm": 0.5429214239120483, + "learning_rate": 9.764745455856156e-05, + "loss": 2.0005, + "step": 4077 + }, + { + "epoch": 1.2516881522406385, + "grad_norm": 0.49443748593330383, + "learning_rate": 9.764594759540566e-05, + "loss": 1.9746, + "step": 4078 + }, + { + "epoch": 1.2519950890116636, + "grad_norm": 
0.46963369846343994, + "learning_rate": 9.764444016138458e-05, + "loss": 1.9133, + "step": 4079 + }, + { + "epoch": 1.2523020257826887, + "grad_norm": 0.5112172365188599, + "learning_rate": 9.764293225651324e-05, + "loss": 1.9488, + "step": 4080 + }, + { + "epoch": 1.252608962553714, + "grad_norm": 0.4584117829799652, + "learning_rate": 9.764142388080648e-05, + "loss": 1.9895, + "step": 4081 + }, + { + "epoch": 1.2529158993247391, + "grad_norm": 0.48059090971946716, + "learning_rate": 9.763991503427927e-05, + "loss": 2.0436, + "step": 4082 + }, + { + "epoch": 1.2532228360957642, + "grad_norm": 0.5877810120582581, + "learning_rate": 9.763840571694649e-05, + "loss": 1.97, + "step": 4083 + }, + { + "epoch": 1.2535297728667896, + "grad_norm": 0.5370834469795227, + "learning_rate": 9.763689592882306e-05, + "loss": 2.0369, + "step": 4084 + }, + { + "epoch": 1.2538367096378147, + "grad_norm": 0.5483170747756958, + "learning_rate": 9.763538566992392e-05, + "loss": 2.066, + "step": 4085 + }, + { + "epoch": 1.2541436464088398, + "grad_norm": 0.5209359526634216, + "learning_rate": 9.763387494026396e-05, + "loss": 2.0685, + "step": 4086 + }, + { + "epoch": 1.254450583179865, + "grad_norm": 0.5569130182266235, + "learning_rate": 9.763236373985813e-05, + "loss": 2.0253, + "step": 4087 + }, + { + "epoch": 1.25475751995089, + "grad_norm": 0.48483753204345703, + "learning_rate": 9.763085206872136e-05, + "loss": 1.9851, + "step": 4088 + }, + { + "epoch": 1.2550644567219154, + "grad_norm": 0.4289563000202179, + "learning_rate": 9.76293399268686e-05, + "loss": 1.9374, + "step": 4089 + }, + { + "epoch": 1.2553713934929405, + "grad_norm": 0.4691961109638214, + "learning_rate": 9.762782731431478e-05, + "loss": 1.9588, + "step": 4090 + }, + { + "epoch": 1.2556783302639656, + "grad_norm": 0.49626582860946655, + "learning_rate": 9.762631423107488e-05, + "loss": 1.999, + "step": 4091 + }, + { + "epoch": 1.255985267034991, + "grad_norm": 0.5099872946739197, + "learning_rate": 
9.762480067716381e-05, + "loss": 2.013, + "step": 4092 + }, + { + "epoch": 1.256292203806016, + "grad_norm": 0.47525838017463684, + "learning_rate": 9.762328665259654e-05, + "loss": 1.9953, + "step": 4093 + }, + { + "epoch": 1.2565991405770411, + "grad_norm": 0.4277878999710083, + "learning_rate": 9.762177215738804e-05, + "loss": 1.9623, + "step": 4094 + }, + { + "epoch": 1.2569060773480663, + "grad_norm": 0.46068885922431946, + "learning_rate": 9.762025719155328e-05, + "loss": 2.0012, + "step": 4095 + }, + { + "epoch": 1.2572130141190914, + "grad_norm": 0.4566059410572052, + "learning_rate": 9.761874175510723e-05, + "loss": 1.9666, + "step": 4096 + }, + { + "epoch": 1.2575199508901167, + "grad_norm": 0.44656631350517273, + "learning_rate": 9.761722584806487e-05, + "loss": 1.9912, + "step": 4097 + }, + { + "epoch": 1.2578268876611418, + "grad_norm": 0.5149295330047607, + "learning_rate": 9.761570947044117e-05, + "loss": 1.9876, + "step": 4098 + }, + { + "epoch": 1.258133824432167, + "grad_norm": 0.5265617370605469, + "learning_rate": 9.761419262225111e-05, + "loss": 2.0817, + "step": 4099 + }, + { + "epoch": 1.2584407612031923, + "grad_norm": 0.5015068054199219, + "learning_rate": 9.76126753035097e-05, + "loss": 1.9767, + "step": 4100 + }, + { + "epoch": 1.2587476979742174, + "grad_norm": 0.5178890228271484, + "learning_rate": 9.761115751423192e-05, + "loss": 1.9968, + "step": 4101 + }, + { + "epoch": 1.2590546347452425, + "grad_norm": 0.46565014123916626, + "learning_rate": 9.760963925443279e-05, + "loss": 1.8977, + "step": 4102 + }, + { + "epoch": 1.2593615715162676, + "grad_norm": 0.466398686170578, + "learning_rate": 9.760812052412728e-05, + "loss": 2.0317, + "step": 4103 + }, + { + "epoch": 1.2596685082872927, + "grad_norm": 0.48445576429367065, + "learning_rate": 9.760660132333043e-05, + "loss": 1.9953, + "step": 4104 + }, + { + "epoch": 1.259975445058318, + "grad_norm": 0.5716978907585144, + "learning_rate": 9.760508165205724e-05, + "loss": 2.0468, + "step": 
4105 + }, + { + "epoch": 1.2602823818293432, + "grad_norm": 0.5168376564979553, + "learning_rate": 9.760356151032273e-05, + "loss": 1.9896, + "step": 4106 + }, + { + "epoch": 1.2605893186003683, + "grad_norm": 0.5014469027519226, + "learning_rate": 9.760204089814192e-05, + "loss": 2.0855, + "step": 4107 + }, + { + "epoch": 1.2608962553713936, + "grad_norm": 0.5283352732658386, + "learning_rate": 9.760051981552984e-05, + "loss": 2.0477, + "step": 4108 + }, + { + "epoch": 1.2612031921424187, + "grad_norm": 0.4526209533214569, + "learning_rate": 9.759899826250153e-05, + "loss": 1.9638, + "step": 4109 + }, + { + "epoch": 1.2615101289134438, + "grad_norm": 0.4565027058124542, + "learning_rate": 9.759747623907203e-05, + "loss": 1.9401, + "step": 4110 + }, + { + "epoch": 1.261817065684469, + "grad_norm": 0.48825928568840027, + "learning_rate": 9.759595374525636e-05, + "loss": 1.9721, + "step": 4111 + }, + { + "epoch": 1.262124002455494, + "grad_norm": 0.4922933578491211, + "learning_rate": 9.759443078106958e-05, + "loss": 1.969, + "step": 4112 + }, + { + "epoch": 1.2624309392265194, + "grad_norm": 0.5227758884429932, + "learning_rate": 9.759290734652674e-05, + "loss": 2.0144, + "step": 4113 + }, + { + "epoch": 1.2627378759975445, + "grad_norm": 0.48013919591903687, + "learning_rate": 9.759138344164289e-05, + "loss": 1.9889, + "step": 4114 + }, + { + "epoch": 1.2630448127685696, + "grad_norm": 0.5039379596710205, + "learning_rate": 9.758985906643309e-05, + "loss": 1.9313, + "step": 4115 + }, + { + "epoch": 1.263351749539595, + "grad_norm": 0.5248776078224182, + "learning_rate": 9.758833422091244e-05, + "loss": 2.0091, + "step": 4116 + }, + { + "epoch": 1.26365868631062, + "grad_norm": 0.4788825809955597, + "learning_rate": 9.758680890509595e-05, + "loss": 2.0197, + "step": 4117 + }, + { + "epoch": 1.2639656230816452, + "grad_norm": 0.4926285743713379, + "learning_rate": 9.758528311899873e-05, + "loss": 2.0558, + "step": 4118 + }, + { + "epoch": 1.2642725598526703, + 
"grad_norm": 0.44785842299461365, + "learning_rate": 9.758375686263586e-05, + "loss": 1.9505, + "step": 4119 + }, + { + "epoch": 1.2645794966236954, + "grad_norm": 0.44693484902381897, + "learning_rate": 9.75822301360224e-05, + "loss": 1.9734, + "step": 4120 + }, + { + "epoch": 1.2648864333947207, + "grad_norm": 0.4691752791404724, + "learning_rate": 9.758070293917346e-05, + "loss": 2.0069, + "step": 4121 + }, + { + "epoch": 1.2651933701657458, + "grad_norm": 0.4718364477157593, + "learning_rate": 9.757917527210413e-05, + "loss": 1.9926, + "step": 4122 + }, + { + "epoch": 1.265500306936771, + "grad_norm": 0.47527435421943665, + "learning_rate": 9.757764713482949e-05, + "loss": 2.0304, + "step": 4123 + }, + { + "epoch": 1.2658072437077963, + "grad_norm": 0.5030924677848816, + "learning_rate": 9.757611852736467e-05, + "loss": 2.0281, + "step": 4124 + }, + { + "epoch": 1.2661141804788214, + "grad_norm": 0.5260440707206726, + "learning_rate": 9.757458944972475e-05, + "loss": 1.9952, + "step": 4125 + }, + { + "epoch": 1.2664211172498465, + "grad_norm": 0.5542300939559937, + "learning_rate": 9.757305990192486e-05, + "loss": 1.979, + "step": 4126 + }, + { + "epoch": 1.2667280540208716, + "grad_norm": 0.5589221715927124, + "learning_rate": 9.757152988398011e-05, + "loss": 2.0123, + "step": 4127 + }, + { + "epoch": 1.2670349907918967, + "grad_norm": 0.48933175206184387, + "learning_rate": 9.75699993959056e-05, + "loss": 1.9671, + "step": 4128 + }, + { + "epoch": 1.267341927562922, + "grad_norm": 0.4785501956939697, + "learning_rate": 9.75684684377165e-05, + "loss": 1.9452, + "step": 4129 + }, + { + "epoch": 1.2676488643339472, + "grad_norm": 0.5000367760658264, + "learning_rate": 9.75669370094279e-05, + "loss": 1.9637, + "step": 4130 + }, + { + "epoch": 1.2679558011049723, + "grad_norm": 0.5292743444442749, + "learning_rate": 9.756540511105496e-05, + "loss": 2.0464, + "step": 4131 + }, + { + "epoch": 1.2682627378759976, + "grad_norm": 0.4979592561721802, + "learning_rate": 
9.75638727426128e-05, + "loss": 1.9863, + "step": 4132 + }, + { + "epoch": 1.2685696746470227, + "grad_norm": 0.4681611657142639, + "learning_rate": 9.756233990411656e-05, + "loss": 1.9978, + "step": 4133 + }, + { + "epoch": 1.2688766114180479, + "grad_norm": 0.5034354329109192, + "learning_rate": 9.756080659558142e-05, + "loss": 2.0332, + "step": 4134 + }, + { + "epoch": 1.269183548189073, + "grad_norm": 0.4815942347049713, + "learning_rate": 9.75592728170225e-05, + "loss": 1.9669, + "step": 4135 + }, + { + "epoch": 1.269490484960098, + "grad_norm": 0.49555137753486633, + "learning_rate": 9.755773856845498e-05, + "loss": 1.9774, + "step": 4136 + }, + { + "epoch": 1.2697974217311234, + "grad_norm": 0.5533550381660461, + "learning_rate": 9.755620384989401e-05, + "loss": 2.0236, + "step": 4137 + }, + { + "epoch": 1.2701043585021485, + "grad_norm": 0.49497511982917786, + "learning_rate": 9.755466866135476e-05, + "loss": 1.9266, + "step": 4138 + }, + { + "epoch": 1.2704112952731736, + "grad_norm": 0.5009804964065552, + "learning_rate": 9.755313300285239e-05, + "loss": 1.9463, + "step": 4139 + }, + { + "epoch": 1.270718232044199, + "grad_norm": 0.49870428442955017, + "learning_rate": 9.755159687440209e-05, + "loss": 1.9566, + "step": 4140 + }, + { + "epoch": 1.271025168815224, + "grad_norm": 0.49113500118255615, + "learning_rate": 9.755006027601905e-05, + "loss": 2.0075, + "step": 4141 + }, + { + "epoch": 1.2713321055862492, + "grad_norm": 0.45977187156677246, + "learning_rate": 9.754852320771845e-05, + "loss": 1.9358, + "step": 4142 + }, + { + "epoch": 1.2716390423572743, + "grad_norm": 0.5493664145469666, + "learning_rate": 9.754698566951545e-05, + "loss": 1.9996, + "step": 4143 + }, + { + "epoch": 1.2719459791282997, + "grad_norm": 0.4791078567504883, + "learning_rate": 9.75454476614253e-05, + "loss": 1.9426, + "step": 4144 + }, + { + "epoch": 1.2722529158993248, + "grad_norm": 0.4809282720088959, + "learning_rate": 9.754390918346315e-05, + "loss": 2.0197, + "step": 
4145 + }, + { + "epoch": 1.2725598526703499, + "grad_norm": 0.5380387902259827, + "learning_rate": 9.754237023564423e-05, + "loss": 2.0261, + "step": 4146 + }, + { + "epoch": 1.272866789441375, + "grad_norm": 0.48302608728408813, + "learning_rate": 9.754083081798374e-05, + "loss": 2.0539, + "step": 4147 + }, + { + "epoch": 1.2731737262124003, + "grad_norm": 0.5752124786376953, + "learning_rate": 9.75392909304969e-05, + "loss": 2.0901, + "step": 4148 + }, + { + "epoch": 1.2734806629834254, + "grad_norm": 0.5538807511329651, + "learning_rate": 9.75377505731989e-05, + "loss": 1.9721, + "step": 4149 + }, + { + "epoch": 1.2737875997544506, + "grad_norm": 0.6331756114959717, + "learning_rate": 9.753620974610502e-05, + "loss": 2.0124, + "step": 4150 + }, + { + "epoch": 1.2740945365254759, + "grad_norm": 0.6422140598297119, + "learning_rate": 9.753466844923042e-05, + "loss": 2.0115, + "step": 4151 + }, + { + "epoch": 1.274401473296501, + "grad_norm": 0.6650347113609314, + "learning_rate": 9.753312668259038e-05, + "loss": 1.9735, + "step": 4152 + }, + { + "epoch": 1.274708410067526, + "grad_norm": 0.587230384349823, + "learning_rate": 9.753158444620013e-05, + "loss": 1.9382, + "step": 4153 + }, + { + "epoch": 1.2750153468385512, + "grad_norm": 0.5357664823532104, + "learning_rate": 9.75300417400749e-05, + "loss": 2.0437, + "step": 4154 + }, + { + "epoch": 1.2753222836095763, + "grad_norm": 0.5058115720748901, + "learning_rate": 9.752849856422994e-05, + "loss": 2.0031, + "step": 4155 + }, + { + "epoch": 1.2756292203806017, + "grad_norm": 0.5913745164871216, + "learning_rate": 9.75269549186805e-05, + "loss": 1.9923, + "step": 4156 + }, + { + "epoch": 1.2759361571516268, + "grad_norm": 0.6766920685768127, + "learning_rate": 9.752541080344181e-05, + "loss": 1.9619, + "step": 4157 + }, + { + "epoch": 1.276243093922652, + "grad_norm": 0.606132984161377, + "learning_rate": 9.752386621852919e-05, + "loss": 1.9689, + "step": 4158 + }, + { + "epoch": 1.2765500306936772, + 
"grad_norm": 0.521133542060852, + "learning_rate": 9.752232116395785e-05, + "loss": 1.9602, + "step": 4159 + }, + { + "epoch": 1.2768569674647023, + "grad_norm": 0.45266324281692505, + "learning_rate": 9.75207756397431e-05, + "loss": 2.0032, + "step": 4160 + }, + { + "epoch": 1.2771639042357275, + "grad_norm": 0.5078892707824707, + "learning_rate": 9.751922964590017e-05, + "loss": 2.0656, + "step": 4161 + }, + { + "epoch": 1.2774708410067526, + "grad_norm": 0.5042154788970947, + "learning_rate": 9.751768318244437e-05, + "loss": 1.9356, + "step": 4162 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.5866135954856873, + "learning_rate": 9.751613624939098e-05, + "loss": 1.9655, + "step": 4163 + }, + { + "epoch": 1.278084714548803, + "grad_norm": 0.6038163304328918, + "learning_rate": 9.751458884675527e-05, + "loss": 1.9445, + "step": 4164 + }, + { + "epoch": 1.2783916513198281, + "grad_norm": 0.4938269555568695, + "learning_rate": 9.751304097455254e-05, + "loss": 2.0164, + "step": 4165 + }, + { + "epoch": 1.2786985880908532, + "grad_norm": 0.4289272427558899, + "learning_rate": 9.75114926327981e-05, + "loss": 1.912, + "step": 4166 + }, + { + "epoch": 1.2790055248618786, + "grad_norm": 0.524058997631073, + "learning_rate": 9.750994382150724e-05, + "loss": 1.9279, + "step": 4167 + }, + { + "epoch": 1.2793124616329037, + "grad_norm": 0.6318224668502808, + "learning_rate": 9.750839454069527e-05, + "loss": 1.98, + "step": 4168 + }, + { + "epoch": 1.2796193984039288, + "grad_norm": 0.5709670782089233, + "learning_rate": 9.750684479037749e-05, + "loss": 2.0029, + "step": 4169 + }, + { + "epoch": 1.279926335174954, + "grad_norm": 0.4621523916721344, + "learning_rate": 9.750529457056924e-05, + "loss": 2.0295, + "step": 4170 + }, + { + "epoch": 1.280233271945979, + "grad_norm": 0.5812001824378967, + "learning_rate": 9.750374388128581e-05, + "loss": 2.0839, + "step": 4171 + }, + { + "epoch": 1.2805402087170044, + "grad_norm": 0.6389874219894409, + "learning_rate": 
9.750219272254256e-05, + "loss": 2.0825, + "step": 4172 + }, + { + "epoch": 1.2808471454880295, + "grad_norm": 0.49902382493019104, + "learning_rate": 9.750064109435478e-05, + "loss": 1.8902, + "step": 4173 + }, + { + "epoch": 1.2811540822590546, + "grad_norm": 0.5641525983810425, + "learning_rate": 9.749908899673783e-05, + "loss": 2.0463, + "step": 4174 + }, + { + "epoch": 1.28146101903008, + "grad_norm": 0.5977841019630432, + "learning_rate": 9.749753642970704e-05, + "loss": 2.0253, + "step": 4175 + }, + { + "epoch": 1.281767955801105, + "grad_norm": 0.5438104271888733, + "learning_rate": 9.749598339327777e-05, + "loss": 1.9862, + "step": 4176 + }, + { + "epoch": 1.2820748925721301, + "grad_norm": 0.4542587697505951, + "learning_rate": 9.749442988746535e-05, + "loss": 1.9476, + "step": 4177 + }, + { + "epoch": 1.2823818293431553, + "grad_norm": 0.4900791347026825, + "learning_rate": 9.749287591228513e-05, + "loss": 2.0093, + "step": 4178 + }, + { + "epoch": 1.2826887661141804, + "grad_norm": 0.5837534666061401, + "learning_rate": 9.749132146775247e-05, + "loss": 2.0699, + "step": 4179 + }, + { + "epoch": 1.2829957028852057, + "grad_norm": 0.5315881967544556, + "learning_rate": 9.748976655388274e-05, + "loss": 1.9514, + "step": 4180 + }, + { + "epoch": 1.2833026396562308, + "grad_norm": 0.5284895300865173, + "learning_rate": 9.74882111706913e-05, + "loss": 2.0171, + "step": 4181 + }, + { + "epoch": 1.283609576427256, + "grad_norm": 0.521202802658081, + "learning_rate": 9.748665531819352e-05, + "loss": 2.025, + "step": 4182 + }, + { + "epoch": 1.2839165131982813, + "grad_norm": 0.5437573194503784, + "learning_rate": 9.748509899640479e-05, + "loss": 2.0352, + "step": 4183 + }, + { + "epoch": 1.2842234499693064, + "grad_norm": 0.5394143462181091, + "learning_rate": 9.748354220534048e-05, + "loss": 2.0245, + "step": 4184 + }, + { + "epoch": 1.2845303867403315, + "grad_norm": 0.47468093037605286, + "learning_rate": 9.748198494501597e-05, + "loss": 1.9719, + "step": 
4185 + }, + { + "epoch": 1.2848373235113566, + "grad_norm": 0.5312216877937317, + "learning_rate": 9.748042721544666e-05, + "loss": 2.0111, + "step": 4186 + }, + { + "epoch": 1.2851442602823817, + "grad_norm": 0.525694727897644, + "learning_rate": 9.747886901664794e-05, + "loss": 2.0582, + "step": 4187 + }, + { + "epoch": 1.285451197053407, + "grad_norm": 0.4965955317020416, + "learning_rate": 9.74773103486352e-05, + "loss": 1.9777, + "step": 4188 + }, + { + "epoch": 1.2857581338244322, + "grad_norm": 0.4391513466835022, + "learning_rate": 9.747575121142385e-05, + "loss": 1.9725, + "step": 4189 + }, + { + "epoch": 1.2860650705954573, + "grad_norm": 0.48999011516571045, + "learning_rate": 9.74741916050293e-05, + "loss": 1.953, + "step": 4190 + }, + { + "epoch": 1.2863720073664826, + "grad_norm": 0.5297304391860962, + "learning_rate": 9.747263152946698e-05, + "loss": 2.0484, + "step": 4191 + }, + { + "epoch": 1.2866789441375077, + "grad_norm": 0.4878230690956116, + "learning_rate": 9.747107098475226e-05, + "loss": 2.0423, + "step": 4192 + }, + { + "epoch": 1.2869858809085328, + "grad_norm": 0.538070023059845, + "learning_rate": 9.74695099709006e-05, + "loss": 2.0699, + "step": 4193 + }, + { + "epoch": 1.287292817679558, + "grad_norm": 0.6656436324119568, + "learning_rate": 9.746794848792743e-05, + "loss": 2.0689, + "step": 4194 + }, + { + "epoch": 1.287599754450583, + "grad_norm": 0.6416848301887512, + "learning_rate": 9.746638653584819e-05, + "loss": 1.9796, + "step": 4195 + }, + { + "epoch": 1.2879066912216084, + "grad_norm": 0.5917447805404663, + "learning_rate": 9.746482411467827e-05, + "loss": 2.0324, + "step": 4196 + }, + { + "epoch": 1.2882136279926335, + "grad_norm": 0.5234537124633789, + "learning_rate": 9.746326122443314e-05, + "loss": 2.0468, + "step": 4197 + }, + { + "epoch": 1.2885205647636586, + "grad_norm": 0.4885808229446411, + "learning_rate": 9.746169786512827e-05, + "loss": 1.9619, + "step": 4198 + }, + { + "epoch": 1.288827501534684, + 
"grad_norm": 0.5776945948600769, + "learning_rate": 9.746013403677905e-05, + "loss": 2.0167, + "step": 4199 + }, + { + "epoch": 1.289134438305709, + "grad_norm": 0.5722271203994751, + "learning_rate": 9.745856973940099e-05, + "loss": 1.9751, + "step": 4200 + }, + { + "epoch": 1.2894413750767342, + "grad_norm": 0.49253931641578674, + "learning_rate": 9.745700497300951e-05, + "loss": 1.9821, + "step": 4201 + }, + { + "epoch": 1.2897483118477593, + "grad_norm": 0.4739282727241516, + "learning_rate": 9.74554397376201e-05, + "loss": 1.9926, + "step": 4202 + }, + { + "epoch": 1.2900552486187844, + "grad_norm": 0.5133153200149536, + "learning_rate": 9.745387403324823e-05, + "loss": 1.9655, + "step": 4203 + }, + { + "epoch": 1.2903621853898097, + "grad_norm": 0.48941388726234436, + "learning_rate": 9.745230785990935e-05, + "loss": 1.9401, + "step": 4204 + }, + { + "epoch": 1.2906691221608348, + "grad_norm": 0.5998152494430542, + "learning_rate": 9.745074121761896e-05, + "loss": 2.0223, + "step": 4205 + }, + { + "epoch": 1.29097605893186, + "grad_norm": 0.4423331618309021, + "learning_rate": 9.744917410639253e-05, + "loss": 1.9602, + "step": 4206 + }, + { + "epoch": 1.2912829957028853, + "grad_norm": 0.5387418866157532, + "learning_rate": 9.744760652624553e-05, + "loss": 2.0631, + "step": 4207 + }, + { + "epoch": 1.2915899324739104, + "grad_norm": 0.5992900729179382, + "learning_rate": 9.744603847719352e-05, + "loss": 1.9805, + "step": 4208 + }, + { + "epoch": 1.2918968692449355, + "grad_norm": 0.5033924579620361, + "learning_rate": 9.744446995925192e-05, + "loss": 1.9817, + "step": 4209 + }, + { + "epoch": 1.2922038060159606, + "grad_norm": 0.47493448853492737, + "learning_rate": 9.744290097243624e-05, + "loss": 2.0259, + "step": 4210 + }, + { + "epoch": 1.2925107427869857, + "grad_norm": 0.5161942839622498, + "learning_rate": 9.744133151676203e-05, + "loss": 1.9686, + "step": 4211 + }, + { + "epoch": 1.292817679558011, + "grad_norm": 0.4476351737976074, + "learning_rate": 
9.743976159224477e-05, + "loss": 1.9488, + "step": 4212 + }, + { + "epoch": 1.2931246163290362, + "grad_norm": 0.5168361663818359, + "learning_rate": 9.743819119889999e-05, + "loss": 2.0645, + "step": 4213 + }, + { + "epoch": 1.2934315531000613, + "grad_norm": 0.5098811984062195, + "learning_rate": 9.743662033674319e-05, + "loss": 1.9889, + "step": 4214 + }, + { + "epoch": 1.2937384898710866, + "grad_norm": 0.5559372305870056, + "learning_rate": 9.74350490057899e-05, + "loss": 2.0348, + "step": 4215 + }, + { + "epoch": 1.2940454266421118, + "grad_norm": 0.5274948477745056, + "learning_rate": 9.743347720605566e-05, + "loss": 2.0566, + "step": 4216 + }, + { + "epoch": 1.2943523634131369, + "grad_norm": 0.5009967088699341, + "learning_rate": 9.743190493755601e-05, + "loss": 1.9915, + "step": 4217 + }, + { + "epoch": 1.2946593001841622, + "grad_norm": 0.5365834832191467, + "learning_rate": 9.743033220030646e-05, + "loss": 2.0581, + "step": 4218 + }, + { + "epoch": 1.2949662369551873, + "grad_norm": 0.519478976726532, + "learning_rate": 9.742875899432255e-05, + "loss": 1.9766, + "step": 4219 + }, + { + "epoch": 1.2952731737262124, + "grad_norm": 0.48030364513397217, + "learning_rate": 9.742718531961988e-05, + "loss": 2.0006, + "step": 4220 + }, + { + "epoch": 1.2955801104972375, + "grad_norm": 0.5257472991943359, + "learning_rate": 9.742561117621394e-05, + "loss": 2.0636, + "step": 4221 + }, + { + "epoch": 1.2958870472682626, + "grad_norm": 0.44784319400787354, + "learning_rate": 9.742403656412034e-05, + "loss": 1.9975, + "step": 4222 + }, + { + "epoch": 1.296193984039288, + "grad_norm": 0.4997022747993469, + "learning_rate": 9.742246148335459e-05, + "loss": 2.0167, + "step": 4223 + }, + { + "epoch": 1.296500920810313, + "grad_norm": 0.43378305435180664, + "learning_rate": 9.742088593393228e-05, + "loss": 1.9202, + "step": 4224 + }, + { + "epoch": 1.2968078575813382, + "grad_norm": 0.5256497859954834, + "learning_rate": 9.741930991586899e-05, + "loss": 2.0306, + "step": 
4225 + }, + { + "epoch": 1.2971147943523635, + "grad_norm": 0.5017027258872986, + "learning_rate": 9.741773342918028e-05, + "loss": 2.0124, + "step": 4226 + }, + { + "epoch": 1.2974217311233887, + "grad_norm": 0.5393915176391602, + "learning_rate": 9.741615647388175e-05, + "loss": 2.0255, + "step": 4227 + }, + { + "epoch": 1.2977286678944138, + "grad_norm": 0.48618295788764954, + "learning_rate": 9.741457904998896e-05, + "loss": 1.9863, + "step": 4228 + }, + { + "epoch": 1.2980356046654389, + "grad_norm": 0.48060059547424316, + "learning_rate": 9.741300115751752e-05, + "loss": 2.0787, + "step": 4229 + }, + { + "epoch": 1.298342541436464, + "grad_norm": 0.4966236650943756, + "learning_rate": 9.741142279648298e-05, + "loss": 1.9818, + "step": 4230 + }, + { + "epoch": 1.2986494782074893, + "grad_norm": 0.5178021788597107, + "learning_rate": 9.7409843966901e-05, + "loss": 1.9847, + "step": 4231 + }, + { + "epoch": 1.2989564149785144, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.740826466878716e-05, + "loss": 2.0028, + "step": 4232 + }, + { + "epoch": 1.2992633517495396, + "grad_norm": 0.5972462296485901, + "learning_rate": 9.740668490215705e-05, + "loss": 2.0205, + "step": 4233 + }, + { + "epoch": 1.2995702885205649, + "grad_norm": 0.5929185152053833, + "learning_rate": 9.740510466702629e-05, + "loss": 1.9802, + "step": 4234 + }, + { + "epoch": 1.29987722529159, + "grad_norm": 0.5496684908866882, + "learning_rate": 9.74035239634105e-05, + "loss": 1.9331, + "step": 4235 + }, + { + "epoch": 1.3001841620626151, + "grad_norm": 0.5822622179985046, + "learning_rate": 9.740194279132531e-05, + "loss": 2.1079, + "step": 4236 + }, + { + "epoch": 1.3004910988336402, + "grad_norm": 0.5886369943618774, + "learning_rate": 9.740036115078634e-05, + "loss": 1.9938, + "step": 4237 + }, + { + "epoch": 1.3007980356046653, + "grad_norm": 0.5259171724319458, + "learning_rate": 9.73987790418092e-05, + "loss": 2.0787, + "step": 4238 + }, + { + "epoch": 1.3011049723756907, + 
"grad_norm": 0.6112152934074402, + "learning_rate": 9.739719646440956e-05, + "loss": 2.0488, + "step": 4239 + }, + { + "epoch": 1.3014119091467158, + "grad_norm": 0.5786338448524475, + "learning_rate": 9.739561341860306e-05, + "loss": 1.9917, + "step": 4240 + }, + { + "epoch": 1.301718845917741, + "grad_norm": 0.5099230408668518, + "learning_rate": 9.739402990440531e-05, + "loss": 1.9949, + "step": 4241 + }, + { + "epoch": 1.3020257826887662, + "grad_norm": 0.5040346384048462, + "learning_rate": 9.739244592183198e-05, + "loss": 1.9368, + "step": 4242 + }, + { + "epoch": 1.3023327194597913, + "grad_norm": 0.48172008991241455, + "learning_rate": 9.739086147089871e-05, + "loss": 1.97, + "step": 4243 + }, + { + "epoch": 1.3026396562308165, + "grad_norm": 0.5350810885429382, + "learning_rate": 9.738927655162119e-05, + "loss": 2.0584, + "step": 4244 + }, + { + "epoch": 1.3029465930018416, + "grad_norm": 0.566371738910675, + "learning_rate": 9.738769116401505e-05, + "loss": 2.0138, + "step": 4245 + }, + { + "epoch": 1.3032535297728667, + "grad_norm": 0.5697746872901917, + "learning_rate": 9.738610530809598e-05, + "loss": 2.0319, + "step": 4246 + }, + { + "epoch": 1.303560466543892, + "grad_norm": 0.5186757445335388, + "learning_rate": 9.738451898387964e-05, + "loss": 1.9958, + "step": 4247 + }, + { + "epoch": 1.3038674033149171, + "grad_norm": 0.5318703651428223, + "learning_rate": 9.73829321913817e-05, + "loss": 2.0857, + "step": 4248 + }, + { + "epoch": 1.3041743400859422, + "grad_norm": 0.5013560056686401, + "learning_rate": 9.738134493061786e-05, + "loss": 1.9545, + "step": 4249 + }, + { + "epoch": 1.3044812768569676, + "grad_norm": 0.499009907245636, + "learning_rate": 9.737975720160382e-05, + "loss": 1.9773, + "step": 4250 + }, + { + "epoch": 1.3047882136279927, + "grad_norm": 0.5187140703201294, + "learning_rate": 9.737816900435522e-05, + "loss": 1.9826, + "step": 4251 + }, + { + "epoch": 1.3050951503990178, + "grad_norm": 0.4950683116912842, + "learning_rate": 
9.73765803388878e-05, + "loss": 2.0061, + "step": 4252 + }, + { + "epoch": 1.305402087170043, + "grad_norm": 0.40729087591171265, + "learning_rate": 9.737499120521722e-05, + "loss": 1.9502, + "step": 4253 + }, + { + "epoch": 1.305709023941068, + "grad_norm": 0.4959156811237335, + "learning_rate": 9.737340160335924e-05, + "loss": 2.0975, + "step": 4254 + }, + { + "epoch": 1.3060159607120934, + "grad_norm": 0.5127618312835693, + "learning_rate": 9.737181153332952e-05, + "loss": 2.0098, + "step": 4255 + }, + { + "epoch": 1.3063228974831185, + "grad_norm": 0.45458972454071045, + "learning_rate": 9.737022099514381e-05, + "loss": 1.9475, + "step": 4256 + }, + { + "epoch": 1.3066298342541436, + "grad_norm": 0.5024627447128296, + "learning_rate": 9.736862998881779e-05, + "loss": 2.0682, + "step": 4257 + }, + { + "epoch": 1.306936771025169, + "grad_norm": 0.5217326283454895, + "learning_rate": 9.736703851436722e-05, + "loss": 2.0363, + "step": 4258 + }, + { + "epoch": 1.307243707796194, + "grad_norm": 0.4798679053783417, + "learning_rate": 9.736544657180781e-05, + "loss": 2.0357, + "step": 4259 + }, + { + "epoch": 1.3075506445672191, + "grad_norm": 0.6031736135482788, + "learning_rate": 9.73638541611553e-05, + "loss": 2.0143, + "step": 4260 + }, + { + "epoch": 1.3078575813382443, + "grad_norm": 0.4914969801902771, + "learning_rate": 9.736226128242542e-05, + "loss": 1.9292, + "step": 4261 + }, + { + "epoch": 1.3081645181092694, + "grad_norm": 0.40556418895721436, + "learning_rate": 9.736066793563392e-05, + "loss": 1.9528, + "step": 4262 + }, + { + "epoch": 1.3084714548802947, + "grad_norm": 0.45605841279029846, + "learning_rate": 9.735907412079652e-05, + "loss": 2.0704, + "step": 4263 + }, + { + "epoch": 1.3087783916513198, + "grad_norm": 0.4992324113845825, + "learning_rate": 9.7357479837929e-05, + "loss": 2.0211, + "step": 4264 + }, + { + "epoch": 1.309085328422345, + "grad_norm": 0.4904097020626068, + "learning_rate": 9.735588508704712e-05, + "loss": 1.987, + "step": 4265 
+ }, + { + "epoch": 1.3093922651933703, + "grad_norm": 0.5436086058616638, + "learning_rate": 9.735428986816661e-05, + "loss": 2.0704, + "step": 4266 + }, + { + "epoch": 1.3096992019643954, + "grad_norm": 0.4850294589996338, + "learning_rate": 9.735269418130326e-05, + "loss": 1.9576, + "step": 4267 + }, + { + "epoch": 1.3100061387354205, + "grad_norm": 0.44082164764404297, + "learning_rate": 9.735109802647283e-05, + "loss": 2.0018, + "step": 4268 + }, + { + "epoch": 1.3103130755064456, + "grad_norm": 0.4844531714916229, + "learning_rate": 9.73495014036911e-05, + "loss": 1.9852, + "step": 4269 + }, + { + "epoch": 1.3106200122774707, + "grad_norm": 0.547596275806427, + "learning_rate": 9.734790431297384e-05, + "loss": 2.0632, + "step": 4270 + }, + { + "epoch": 1.310926949048496, + "grad_norm": 0.517882764339447, + "learning_rate": 9.734630675433684e-05, + "loss": 1.9851, + "step": 4271 + }, + { + "epoch": 1.3112338858195212, + "grad_norm": 0.5148623585700989, + "learning_rate": 9.734470872779589e-05, + "loss": 2.0446, + "step": 4272 + }, + { + "epoch": 1.3115408225905463, + "grad_norm": 0.5872887372970581, + "learning_rate": 9.734311023336678e-05, + "loss": 2.0588, + "step": 4273 + }, + { + "epoch": 1.3118477593615716, + "grad_norm": 0.7116255164146423, + "learning_rate": 9.73415112710653e-05, + "loss": 2.0213, + "step": 4274 + }, + { + "epoch": 1.3121546961325967, + "grad_norm": 0.8191964626312256, + "learning_rate": 9.733991184090725e-05, + "loss": 1.9528, + "step": 4275 + }, + { + "epoch": 1.3124616329036218, + "grad_norm": 0.8214605450630188, + "learning_rate": 9.733831194290846e-05, + "loss": 1.9614, + "step": 4276 + }, + { + "epoch": 1.312768569674647, + "grad_norm": 0.7057182788848877, + "learning_rate": 9.733671157708472e-05, + "loss": 2.0767, + "step": 4277 + }, + { + "epoch": 1.313075506445672, + "grad_norm": 0.5114007592201233, + "learning_rate": 9.733511074345185e-05, + "loss": 1.946, + "step": 4278 + }, + { + "epoch": 1.3133824432166974, + "grad_norm": 
0.5347970128059387, + "learning_rate": 9.733350944202566e-05, + "loss": 1.9658, + "step": 4279 + }, + { + "epoch": 1.3136893799877225, + "grad_norm": 0.6962214112281799, + "learning_rate": 9.733190767282202e-05, + "loss": 2.0943, + "step": 4280 + }, + { + "epoch": 1.3139963167587476, + "grad_norm": 0.5942707657814026, + "learning_rate": 9.733030543585668e-05, + "loss": 2.0101, + "step": 4281 + }, + { + "epoch": 1.314303253529773, + "grad_norm": 0.46218639612197876, + "learning_rate": 9.732870273114556e-05, + "loss": 2.0292, + "step": 4282 + }, + { + "epoch": 1.314610190300798, + "grad_norm": 0.5194444060325623, + "learning_rate": 9.732709955870445e-05, + "loss": 2.0666, + "step": 4283 + }, + { + "epoch": 1.3149171270718232, + "grad_norm": 0.5112141370773315, + "learning_rate": 9.732549591854918e-05, + "loss": 2.0205, + "step": 4284 + }, + { + "epoch": 1.3152240638428485, + "grad_norm": 0.5282790660858154, + "learning_rate": 9.732389181069566e-05, + "loss": 2.0704, + "step": 4285 + }, + { + "epoch": 1.3155310006138736, + "grad_norm": 0.4598311185836792, + "learning_rate": 9.732228723515968e-05, + "loss": 1.9485, + "step": 4286 + }, + { + "epoch": 1.3158379373848987, + "grad_norm": 0.4700186550617218, + "learning_rate": 9.732068219195711e-05, + "loss": 2.0329, + "step": 4287 + }, + { + "epoch": 1.3161448741559238, + "grad_norm": 0.4512452781200409, + "learning_rate": 9.731907668110384e-05, + "loss": 1.9829, + "step": 4288 + }, + { + "epoch": 1.316451810926949, + "grad_norm": 0.5053353309631348, + "learning_rate": 9.731747070261572e-05, + "loss": 2.0583, + "step": 4289 + }, + { + "epoch": 1.3167587476979743, + "grad_norm": 0.48143625259399414, + "learning_rate": 9.73158642565086e-05, + "loss": 2.014, + "step": 4290 + }, + { + "epoch": 1.3170656844689994, + "grad_norm": 0.4843716025352478, + "learning_rate": 9.73142573427984e-05, + "loss": 1.9951, + "step": 4291 + }, + { + "epoch": 1.3173726212400245, + "grad_norm": 0.45646217465400696, + "learning_rate": 
9.731264996150098e-05, + "loss": 1.9701, + "step": 4292 + }, + { + "epoch": 1.3176795580110499, + "grad_norm": 0.5176306962966919, + "learning_rate": 9.73110421126322e-05, + "loss": 1.9915, + "step": 4293 + }, + { + "epoch": 1.317986494782075, + "grad_norm": 0.4862259328365326, + "learning_rate": 9.730943379620799e-05, + "loss": 2.0157, + "step": 4294 + }, + { + "epoch": 1.3182934315531, + "grad_norm": 0.4941593110561371, + "learning_rate": 9.730782501224423e-05, + "loss": 2.0164, + "step": 4295 + }, + { + "epoch": 1.3186003683241252, + "grad_norm": 0.46818530559539795, + "learning_rate": 9.73062157607568e-05, + "loss": 1.9749, + "step": 4296 + }, + { + "epoch": 1.3189073050951503, + "grad_norm": 0.41685113310813904, + "learning_rate": 9.730460604176163e-05, + "loss": 1.9443, + "step": 4297 + }, + { + "epoch": 1.3192142418661756, + "grad_norm": 0.40586861968040466, + "learning_rate": 9.73029958552746e-05, + "loss": 1.9227, + "step": 4298 + }, + { + "epoch": 1.3195211786372008, + "grad_norm": 0.3946068286895752, + "learning_rate": 9.730138520131167e-05, + "loss": 1.9073, + "step": 4299 + }, + { + "epoch": 1.3198281154082259, + "grad_norm": 0.3722321093082428, + "learning_rate": 9.729977407988871e-05, + "loss": 1.9299, + "step": 4300 + }, + { + "epoch": 1.3201350521792512, + "grad_norm": 0.39335691928863525, + "learning_rate": 9.729816249102164e-05, + "loss": 1.9673, + "step": 4301 + }, + { + "epoch": 1.3204419889502763, + "grad_norm": 0.4342779815196991, + "learning_rate": 9.729655043472643e-05, + "loss": 2.0704, + "step": 4302 + }, + { + "epoch": 1.3207489257213014, + "grad_norm": 0.46981000900268555, + "learning_rate": 9.729493791101899e-05, + "loss": 2.0593, + "step": 4303 + }, + { + "epoch": 1.3210558624923265, + "grad_norm": 0.4319849908351898, + "learning_rate": 9.729332491991524e-05, + "loss": 1.9378, + "step": 4304 + }, + { + "epoch": 1.3213627992633517, + "grad_norm": 0.4555012285709381, + "learning_rate": 9.729171146143115e-05, + "loss": 1.993, + "step": 
4305 + }, + { + "epoch": 1.321669736034377, + "grad_norm": 0.5122297406196594, + "learning_rate": 9.729009753558262e-05, + "loss": 2.0237, + "step": 4306 + }, + { + "epoch": 1.321976672805402, + "grad_norm": 0.4814549386501312, + "learning_rate": 9.728848314238566e-05, + "loss": 2.0063, + "step": 4307 + }, + { + "epoch": 1.3222836095764272, + "grad_norm": 0.45410022139549255, + "learning_rate": 9.728686828185618e-05, + "loss": 2.0262, + "step": 4308 + }, + { + "epoch": 1.3225905463474525, + "grad_norm": 0.44759154319763184, + "learning_rate": 9.728525295401014e-05, + "loss": 1.9746, + "step": 4309 + }, + { + "epoch": 1.3228974831184777, + "grad_norm": 0.41539889574050903, + "learning_rate": 9.728363715886352e-05, + "loss": 1.9197, + "step": 4310 + }, + { + "epoch": 1.3232044198895028, + "grad_norm": 0.549961268901825, + "learning_rate": 9.72820208964323e-05, + "loss": 2.0168, + "step": 4311 + }, + { + "epoch": 1.3235113566605279, + "grad_norm": 0.6832249164581299, + "learning_rate": 9.728040416673243e-05, + "loss": 1.9711, + "step": 4312 + }, + { + "epoch": 1.323818293431553, + "grad_norm": 0.7458481788635254, + "learning_rate": 9.727878696977988e-05, + "loss": 2.1677, + "step": 4313 + }, + { + "epoch": 1.3241252302025783, + "grad_norm": 0.6268119812011719, + "learning_rate": 9.727716930559066e-05, + "loss": 2.0222, + "step": 4314 + }, + { + "epoch": 1.3244321669736034, + "grad_norm": 0.540987491607666, + "learning_rate": 9.727555117418075e-05, + "loss": 2.0552, + "step": 4315 + }, + { + "epoch": 1.3247391037446286, + "grad_norm": 0.6105024814605713, + "learning_rate": 9.727393257556612e-05, + "loss": 1.9287, + "step": 4316 + }, + { + "epoch": 1.325046040515654, + "grad_norm": 0.594327449798584, + "learning_rate": 9.727231350976277e-05, + "loss": 1.9737, + "step": 4317 + }, + { + "epoch": 1.325352977286679, + "grad_norm": 0.5686312913894653, + "learning_rate": 9.727069397678674e-05, + "loss": 1.988, + "step": 4318 + }, + { + "epoch": 1.3256599140577041, + 
"grad_norm": 0.5335875153541565, + "learning_rate": 9.726907397665399e-05, + "loss": 1.9992, + "step": 4319 + }, + { + "epoch": 1.3259668508287292, + "grad_norm": 0.514209508895874, + "learning_rate": 9.726745350938055e-05, + "loss": 2.0928, + "step": 4320 + }, + { + "epoch": 1.3262737875997543, + "grad_norm": 0.58844393491745, + "learning_rate": 9.726583257498242e-05, + "loss": 1.968, + "step": 4321 + }, + { + "epoch": 1.3265807243707797, + "grad_norm": 0.5247591733932495, + "learning_rate": 9.726421117347563e-05, + "loss": 1.9529, + "step": 4322 + }, + { + "epoch": 1.3268876611418048, + "grad_norm": 0.5057464241981506, + "learning_rate": 9.726258930487622e-05, + "loss": 2.0595, + "step": 4323 + }, + { + "epoch": 1.32719459791283, + "grad_norm": 0.564689040184021, + "learning_rate": 9.726096696920019e-05, + "loss": 1.9974, + "step": 4324 + }, + { + "epoch": 1.3275015346838552, + "grad_norm": 0.5755618214607239, + "learning_rate": 9.725934416646358e-05, + "loss": 1.9949, + "step": 4325 + }, + { + "epoch": 1.3278084714548803, + "grad_norm": 0.5969316959381104, + "learning_rate": 9.725772089668243e-05, + "loss": 1.972, + "step": 4326 + }, + { + "epoch": 1.3281154082259055, + "grad_norm": 0.5776877403259277, + "learning_rate": 9.725609715987278e-05, + "loss": 2.1018, + "step": 4327 + }, + { + "epoch": 1.3284223449969306, + "grad_norm": 0.5471270680427551, + "learning_rate": 9.725447295605071e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 1.3287292817679557, + "grad_norm": 0.49090373516082764, + "learning_rate": 9.725284828523222e-05, + "loss": 1.9651, + "step": 4329 + }, + { + "epoch": 1.329036218538981, + "grad_norm": 0.49420034885406494, + "learning_rate": 9.725122314743337e-05, + "loss": 2.0119, + "step": 4330 + }, + { + "epoch": 1.3293431553100061, + "grad_norm": 0.4841148853302002, + "learning_rate": 9.724959754267027e-05, + "loss": 1.974, + "step": 4331 + }, + { + "epoch": 1.3296500920810312, + "grad_norm": 0.42349007725715637, + "learning_rate": 
9.724797147095893e-05, + "loss": 1.9779, + "step": 4332 + }, + { + "epoch": 1.3299570288520566, + "grad_norm": 0.47239863872528076, + "learning_rate": 9.724634493231545e-05, + "loss": 1.9184, + "step": 4333 + }, + { + "epoch": 1.3302639656230817, + "grad_norm": 0.5583773255348206, + "learning_rate": 9.72447179267559e-05, + "loss": 2.0742, + "step": 4334 + }, + { + "epoch": 1.3305709023941068, + "grad_norm": 0.486937552690506, + "learning_rate": 9.724309045429636e-05, + "loss": 2.0101, + "step": 4335 + }, + { + "epoch": 1.330877839165132, + "grad_norm": 0.42204493284225464, + "learning_rate": 9.724146251495289e-05, + "loss": 1.9564, + "step": 4336 + }, + { + "epoch": 1.331184775936157, + "grad_norm": 0.451628714799881, + "learning_rate": 9.723983410874163e-05, + "loss": 1.9949, + "step": 4337 + }, + { + "epoch": 1.3314917127071824, + "grad_norm": 0.4453491270542145, + "learning_rate": 9.723820523567861e-05, + "loss": 1.9415, + "step": 4338 + }, + { + "epoch": 1.3317986494782075, + "grad_norm": 0.4628424644470215, + "learning_rate": 9.723657589577999e-05, + "loss": 2.0296, + "step": 4339 + }, + { + "epoch": 1.3321055862492326, + "grad_norm": 0.5362148284912109, + "learning_rate": 9.723494608906181e-05, + "loss": 2.0719, + "step": 4340 + }, + { + "epoch": 1.332412523020258, + "grad_norm": 0.45357146859169006, + "learning_rate": 9.723331581554023e-05, + "loss": 1.9107, + "step": 4341 + }, + { + "epoch": 1.332719459791283, + "grad_norm": 0.5042485594749451, + "learning_rate": 9.723168507523133e-05, + "loss": 1.9838, + "step": 4342 + }, + { + "epoch": 1.3330263965623081, + "grad_norm": 0.4797585606575012, + "learning_rate": 9.723005386815123e-05, + "loss": 1.9779, + "step": 4343 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4489155113697052, + "learning_rate": 9.722842219431607e-05, + "loss": 1.9805, + "step": 4344 + }, + { + "epoch": 1.3336402701043584, + "grad_norm": 0.43091216683387756, + "learning_rate": 9.722679005374196e-05, + "loss": 1.9708, + "step": 
4345 + }, + { + "epoch": 1.3339472068753837, + "grad_norm": 0.453937292098999, + "learning_rate": 9.722515744644502e-05, + "loss": 2.0038, + "step": 4346 + }, + { + "epoch": 1.3342541436464088, + "grad_norm": 0.38905346393585205, + "learning_rate": 9.722352437244138e-05, + "loss": 2.0042, + "step": 4347 + }, + { + "epoch": 1.334561080417434, + "grad_norm": 0.46686118841171265, + "learning_rate": 9.722189083174722e-05, + "loss": 2.0733, + "step": 4348 + }, + { + "epoch": 1.3348680171884593, + "grad_norm": 0.42737439274787903, + "learning_rate": 9.722025682437865e-05, + "loss": 1.9572, + "step": 4349 + }, + { + "epoch": 1.3351749539594844, + "grad_norm": 0.3857511878013611, + "learning_rate": 9.721862235035181e-05, + "loss": 1.9288, + "step": 4350 + }, + { + "epoch": 1.3354818907305095, + "grad_norm": 0.42448824644088745, + "learning_rate": 9.721698740968288e-05, + "loss": 1.99, + "step": 4351 + }, + { + "epoch": 1.3357888275015346, + "grad_norm": 0.4753642976284027, + "learning_rate": 9.721535200238802e-05, + "loss": 2.0268, + "step": 4352 + }, + { + "epoch": 1.3360957642725597, + "grad_norm": 0.5248960256576538, + "learning_rate": 9.721371612848336e-05, + "loss": 2.008, + "step": 4353 + }, + { + "epoch": 1.336402701043585, + "grad_norm": 0.5046865344047546, + "learning_rate": 9.721207978798507e-05, + "loss": 1.9248, + "step": 4354 + }, + { + "epoch": 1.3367096378146102, + "grad_norm": 0.48205190896987915, + "learning_rate": 9.721044298090937e-05, + "loss": 1.9895, + "step": 4355 + }, + { + "epoch": 1.3370165745856353, + "grad_norm": 0.46149346232414246, + "learning_rate": 9.720880570727238e-05, + "loss": 2.0001, + "step": 4356 + }, + { + "epoch": 1.3373235113566606, + "grad_norm": 0.6212405562400818, + "learning_rate": 9.72071679670903e-05, + "loss": 2.0772, + "step": 4357 + }, + { + "epoch": 1.3376304481276857, + "grad_norm": 0.6935828924179077, + "learning_rate": 9.720552976037934e-05, + "loss": 1.9865, + "step": 4358 + }, + { + "epoch": 1.3379373848987108, + 
"grad_norm": 0.6850154399871826, + "learning_rate": 9.720389108715564e-05, + "loss": 1.9964, + "step": 4359 + }, + { + "epoch": 1.3382443216697362, + "grad_norm": 0.5925734043121338, + "learning_rate": 9.720225194743544e-05, + "loss": 2.0109, + "step": 4360 + }, + { + "epoch": 1.3385512584407613, + "grad_norm": 0.47503459453582764, + "learning_rate": 9.720061234123492e-05, + "loss": 2.0406, + "step": 4361 + }, + { + "epoch": 1.3388581952117864, + "grad_norm": 0.44226083159446716, + "learning_rate": 9.719897226857026e-05, + "loss": 1.953, + "step": 4362 + }, + { + "epoch": 1.3391651319828115, + "grad_norm": 0.5688608884811401, + "learning_rate": 9.719733172945772e-05, + "loss": 1.9422, + "step": 4363 + }, + { + "epoch": 1.3394720687538366, + "grad_norm": 0.6097545027732849, + "learning_rate": 9.719569072391347e-05, + "loss": 2.0204, + "step": 4364 + }, + { + "epoch": 1.339779005524862, + "grad_norm": 0.44313064217567444, + "learning_rate": 9.719404925195374e-05, + "loss": 1.9458, + "step": 4365 + }, + { + "epoch": 1.340085942295887, + "grad_norm": 0.495632141828537, + "learning_rate": 9.719240731359476e-05, + "loss": 1.9682, + "step": 4366 + }, + { + "epoch": 1.3403928790669122, + "grad_norm": 0.5843736529350281, + "learning_rate": 9.719076490885275e-05, + "loss": 1.9948, + "step": 4367 + }, + { + "epoch": 1.3406998158379375, + "grad_norm": 0.6249645352363586, + "learning_rate": 9.718912203774395e-05, + "loss": 1.9675, + "step": 4368 + }, + { + "epoch": 1.3410067526089626, + "grad_norm": 0.48386043310165405, + "learning_rate": 9.718747870028457e-05, + "loss": 1.9678, + "step": 4369 + }, + { + "epoch": 1.3413136893799877, + "grad_norm": 0.4797835648059845, + "learning_rate": 9.718583489649088e-05, + "loss": 2.0118, + "step": 4370 + }, + { + "epoch": 1.3416206261510129, + "grad_norm": 0.6131169199943542, + "learning_rate": 9.718419062637911e-05, + "loss": 2.0057, + "step": 4371 + }, + { + "epoch": 1.341927562922038, + "grad_norm": 0.6230120062828064, + 
"learning_rate": 9.718254588996552e-05, + "loss": 1.9871, + "step": 4372 + }, + { + "epoch": 1.3422344996930633, + "grad_norm": 0.5323978662490845, + "learning_rate": 9.718090068726633e-05, + "loss": 1.9389, + "step": 4373 + }, + { + "epoch": 1.3425414364640884, + "grad_norm": 0.429446280002594, + "learning_rate": 9.717925501829786e-05, + "loss": 1.9928, + "step": 4374 + }, + { + "epoch": 1.3428483732351135, + "grad_norm": 0.5588231086730957, + "learning_rate": 9.717760888307632e-05, + "loss": 2.0197, + "step": 4375 + }, + { + "epoch": 1.3431553100061389, + "grad_norm": 0.608248770236969, + "learning_rate": 9.7175962281618e-05, + "loss": 1.9486, + "step": 4376 + }, + { + "epoch": 1.343462246777164, + "grad_norm": 0.6100868582725525, + "learning_rate": 9.717431521393918e-05, + "loss": 2.044, + "step": 4377 + }, + { + "epoch": 1.343769183548189, + "grad_norm": 0.5428611636161804, + "learning_rate": 9.717266768005611e-05, + "loss": 2.0078, + "step": 4378 + }, + { + "epoch": 1.3440761203192142, + "grad_norm": 0.4338260889053345, + "learning_rate": 9.71710196799851e-05, + "loss": 1.9206, + "step": 4379 + }, + { + "epoch": 1.3443830570902393, + "grad_norm": 0.4879632294178009, + "learning_rate": 9.716937121374243e-05, + "loss": 1.9852, + "step": 4380 + }, + { + "epoch": 1.3446899938612646, + "grad_norm": 0.5174580216407776, + "learning_rate": 9.716772228134438e-05, + "loss": 1.9328, + "step": 4381 + }, + { + "epoch": 1.3449969306322898, + "grad_norm": 0.4461662173271179, + "learning_rate": 9.716607288280726e-05, + "loss": 1.9653, + "step": 4382 + }, + { + "epoch": 1.3453038674033149, + "grad_norm": 0.49747103452682495, + "learning_rate": 9.716442301814735e-05, + "loss": 1.9904, + "step": 4383 + }, + { + "epoch": 1.3456108041743402, + "grad_norm": 0.5059060454368591, + "learning_rate": 9.716277268738097e-05, + "loss": 1.9408, + "step": 4384 + }, + { + "epoch": 1.3459177409453653, + "grad_norm": 0.47981831431388855, + "learning_rate": 9.716112189052445e-05, + "loss": 
1.9604, + "step": 4385 + }, + { + "epoch": 1.3462246777163904, + "grad_norm": 0.48941048979759216, + "learning_rate": 9.715947062759405e-05, + "loss": 2.0005, + "step": 4386 + }, + { + "epoch": 1.3465316144874155, + "grad_norm": 0.4544732868671417, + "learning_rate": 9.715781889860613e-05, + "loss": 1.9641, + "step": 4387 + }, + { + "epoch": 1.3468385512584407, + "grad_norm": 0.4564060866832733, + "learning_rate": 9.715616670357701e-05, + "loss": 1.8786, + "step": 4388 + }, + { + "epoch": 1.347145488029466, + "grad_norm": 0.4216209352016449, + "learning_rate": 9.715451404252301e-05, + "loss": 1.9402, + "step": 4389 + }, + { + "epoch": 1.347452424800491, + "grad_norm": 0.5024694204330444, + "learning_rate": 9.715286091546046e-05, + "loss": 1.9815, + "step": 4390 + }, + { + "epoch": 1.3477593615715162, + "grad_norm": 0.523953378200531, + "learning_rate": 9.715120732240571e-05, + "loss": 2.008, + "step": 4391 + }, + { + "epoch": 1.3480662983425415, + "grad_norm": 0.5068427920341492, + "learning_rate": 9.714955326337508e-05, + "loss": 1.9984, + "step": 4392 + }, + { + "epoch": 1.3483732351135667, + "grad_norm": 0.4349055290222168, + "learning_rate": 9.714789873838494e-05, + "loss": 1.9576, + "step": 4393 + }, + { + "epoch": 1.3486801718845918, + "grad_norm": 0.4677357077598572, + "learning_rate": 9.714624374745162e-05, + "loss": 2.0491, + "step": 4394 + }, + { + "epoch": 1.3489871086556169, + "grad_norm": 0.5942007899284363, + "learning_rate": 9.71445882905915e-05, + "loss": 1.9951, + "step": 4395 + }, + { + "epoch": 1.349294045426642, + "grad_norm": 0.5354358553886414, + "learning_rate": 9.714293236782092e-05, + "loss": 2.0033, + "step": 4396 + }, + { + "epoch": 1.3496009821976673, + "grad_norm": 0.5081890821456909, + "learning_rate": 9.714127597915625e-05, + "loss": 1.9944, + "step": 4397 + }, + { + "epoch": 1.3499079189686924, + "grad_norm": 0.5279759764671326, + "learning_rate": 9.713961912461386e-05, + "loss": 2.025, + "step": 4398 + }, + { + "epoch": 
1.3502148557397176, + "grad_norm": 0.41777312755584717, + "learning_rate": 9.713796180421012e-05, + "loss": 1.9214, + "step": 4399 + }, + { + "epoch": 1.350521792510743, + "grad_norm": 0.48946598172187805, + "learning_rate": 9.713630401796141e-05, + "loss": 1.9851, + "step": 4400 + }, + { + "epoch": 1.350828729281768, + "grad_norm": 0.45182350277900696, + "learning_rate": 9.713464576588413e-05, + "loss": 1.9825, + "step": 4401 + }, + { + "epoch": 1.3511356660527931, + "grad_norm": 0.4178939461708069, + "learning_rate": 9.713298704799465e-05, + "loss": 1.8944, + "step": 4402 + }, + { + "epoch": 1.3514426028238182, + "grad_norm": 0.4178236424922943, + "learning_rate": 9.713132786430937e-05, + "loss": 1.9884, + "step": 4403 + }, + { + "epoch": 1.3517495395948433, + "grad_norm": 0.45951130986213684, + "learning_rate": 9.712966821484467e-05, + "loss": 2.0786, + "step": 4404 + }, + { + "epoch": 1.3520564763658687, + "grad_norm": 0.4884461760520935, + "learning_rate": 9.712800809961697e-05, + "loss": 2.0494, + "step": 4405 + }, + { + "epoch": 1.3523634131368938, + "grad_norm": 0.5342240929603577, + "learning_rate": 9.712634751864268e-05, + "loss": 2.1068, + "step": 4406 + }, + { + "epoch": 1.352670349907919, + "grad_norm": 0.5503208637237549, + "learning_rate": 9.71246864719382e-05, + "loss": 1.9588, + "step": 4407 + }, + { + "epoch": 1.3529772866789442, + "grad_norm": 0.5576291084289551, + "learning_rate": 9.712302495951994e-05, + "loss": 2.0461, + "step": 4408 + }, + { + "epoch": 1.3532842234499693, + "grad_norm": 0.5063806772232056, + "learning_rate": 9.712136298140433e-05, + "loss": 1.9606, + "step": 4409 + }, + { + "epoch": 1.3535911602209945, + "grad_norm": 0.5391512513160706, + "learning_rate": 9.71197005376078e-05, + "loss": 2.0115, + "step": 4410 + }, + { + "epoch": 1.3538980969920196, + "grad_norm": 0.4934769868850708, + "learning_rate": 9.711803762814676e-05, + "loss": 1.9966, + "step": 4411 + }, + { + "epoch": 1.3542050337630447, + "grad_norm": 
0.4658334255218506, + "learning_rate": 9.711637425303766e-05, + "loss": 1.9477, + "step": 4412 + }, + { + "epoch": 1.35451197053407, + "grad_norm": 0.4407191574573517, + "learning_rate": 9.711471041229693e-05, + "loss": 1.9334, + "step": 4413 + }, + { + "epoch": 1.3548189073050951, + "grad_norm": 0.5043092370033264, + "learning_rate": 9.711304610594104e-05, + "loss": 2.0068, + "step": 4414 + }, + { + "epoch": 1.3551258440761202, + "grad_norm": 0.4502009451389313, + "learning_rate": 9.711138133398639e-05, + "loss": 1.9389, + "step": 4415 + }, + { + "epoch": 1.3554327808471456, + "grad_norm": 0.41863033175468445, + "learning_rate": 9.710971609644945e-05, + "loss": 1.9244, + "step": 4416 + }, + { + "epoch": 1.3557397176181707, + "grad_norm": 0.47590091824531555, + "learning_rate": 9.71080503933467e-05, + "loss": 2.0144, + "step": 4417 + }, + { + "epoch": 1.3560466543891958, + "grad_norm": 0.47155439853668213, + "learning_rate": 9.71063842246946e-05, + "loss": 2.0729, + "step": 4418 + }, + { + "epoch": 1.356353591160221, + "grad_norm": 0.5231152176856995, + "learning_rate": 9.710471759050957e-05, + "loss": 2.0654, + "step": 4419 + }, + { + "epoch": 1.356660527931246, + "grad_norm": 0.5952544212341309, + "learning_rate": 9.710305049080812e-05, + "loss": 1.9983, + "step": 4420 + }, + { + "epoch": 1.3569674647022714, + "grad_norm": 0.4810022711753845, + "learning_rate": 9.710138292560673e-05, + "loss": 1.9725, + "step": 4421 + }, + { + "epoch": 1.3572744014732965, + "grad_norm": 0.553421676158905, + "learning_rate": 9.709971489492185e-05, + "loss": 2.0666, + "step": 4422 + }, + { + "epoch": 1.3575813382443216, + "grad_norm": 0.48790663480758667, + "learning_rate": 9.709804639877001e-05, + "loss": 1.9312, + "step": 4423 + }, + { + "epoch": 1.357888275015347, + "grad_norm": 0.42968273162841797, + "learning_rate": 9.709637743716764e-05, + "loss": 1.9061, + "step": 4424 + }, + { + "epoch": 1.358195211786372, + "grad_norm": 0.40183690190315247, + "learning_rate": 
9.709470801013128e-05, + "loss": 2.0547, + "step": 4425 + }, + { + "epoch": 1.3585021485573971, + "grad_norm": 0.5162881016731262, + "learning_rate": 9.70930381176774e-05, + "loss": 2.0246, + "step": 4426 + }, + { + "epoch": 1.3588090853284225, + "grad_norm": 0.517995297908783, + "learning_rate": 9.709136775982252e-05, + "loss": 2.0029, + "step": 4427 + }, + { + "epoch": 1.3591160220994476, + "grad_norm": 0.47416025400161743, + "learning_rate": 9.708969693658314e-05, + "loss": 1.9517, + "step": 4428 + }, + { + "epoch": 1.3594229588704727, + "grad_norm": 0.4192255437374115, + "learning_rate": 9.708802564797578e-05, + "loss": 1.9138, + "step": 4429 + }, + { + "epoch": 1.3597298956414978, + "grad_norm": 0.4643617868423462, + "learning_rate": 9.708635389401697e-05, + "loss": 1.9753, + "step": 4430 + }, + { + "epoch": 1.360036832412523, + "grad_norm": 0.5007988214492798, + "learning_rate": 9.708468167472317e-05, + "loss": 1.9654, + "step": 4431 + }, + { + "epoch": 1.3603437691835483, + "grad_norm": 0.5188244581222534, + "learning_rate": 9.708300899011098e-05, + "loss": 1.9959, + "step": 4432 + }, + { + "epoch": 1.3606507059545734, + "grad_norm": 0.5209388732910156, + "learning_rate": 9.70813358401969e-05, + "loss": 2.0028, + "step": 4433 + }, + { + "epoch": 1.3609576427255985, + "grad_norm": 0.48829126358032227, + "learning_rate": 9.707966222499745e-05, + "loss": 2.0554, + "step": 4434 + }, + { + "epoch": 1.3612645794966238, + "grad_norm": 0.4373438358306885, + "learning_rate": 9.707798814452919e-05, + "loss": 1.9611, + "step": 4435 + }, + { + "epoch": 1.361571516267649, + "grad_norm": 0.4294830858707428, + "learning_rate": 9.707631359880867e-05, + "loss": 1.9049, + "step": 4436 + }, + { + "epoch": 1.361878453038674, + "grad_norm": 0.46988123655319214, + "learning_rate": 9.70746385878524e-05, + "loss": 1.9221, + "step": 4437 + }, + { + "epoch": 1.3621853898096992, + "grad_norm": 0.4956746995449066, + "learning_rate": 9.707296311167697e-05, + "loss": 1.9215, + "step": 
4438 + }, + { + "epoch": 1.3624923265807243, + "grad_norm": 0.43748801946640015, + "learning_rate": 9.707128717029894e-05, + "loss": 1.9882, + "step": 4439 + }, + { + "epoch": 1.3627992633517496, + "grad_norm": 0.4926415979862213, + "learning_rate": 9.706961076373485e-05, + "loss": 1.9664, + "step": 4440 + }, + { + "epoch": 1.3631062001227747, + "grad_norm": 0.5239415764808655, + "learning_rate": 9.706793389200129e-05, + "loss": 1.9809, + "step": 4441 + }, + { + "epoch": 1.3634131368937998, + "grad_norm": 0.5134629607200623, + "learning_rate": 9.706625655511481e-05, + "loss": 1.9559, + "step": 4442 + }, + { + "epoch": 1.3637200736648252, + "grad_norm": 0.49562570452690125, + "learning_rate": 9.706457875309198e-05, + "loss": 1.9603, + "step": 4443 + }, + { + "epoch": 1.3640270104358503, + "grad_norm": 0.45000702142715454, + "learning_rate": 9.706290048594942e-05, + "loss": 1.9395, + "step": 4444 + }, + { + "epoch": 1.3643339472068754, + "grad_norm": 0.4216759502887726, + "learning_rate": 9.70612217537037e-05, + "loss": 1.8857, + "step": 4445 + }, + { + "epoch": 1.3646408839779005, + "grad_norm": 0.5022158622741699, + "learning_rate": 9.705954255637138e-05, + "loss": 1.9388, + "step": 4446 + }, + { + "epoch": 1.3649478207489256, + "grad_norm": 0.5086642503738403, + "learning_rate": 9.70578628939691e-05, + "loss": 1.9325, + "step": 4447 + }, + { + "epoch": 1.365254757519951, + "grad_norm": 0.4891139566898346, + "learning_rate": 9.705618276651342e-05, + "loss": 1.9068, + "step": 4448 + }, + { + "epoch": 1.365561694290976, + "grad_norm": 0.42479926347732544, + "learning_rate": 9.705450217402096e-05, + "loss": 2.0345, + "step": 4449 + }, + { + "epoch": 1.3658686310620012, + "grad_norm": 0.45347172021865845, + "learning_rate": 9.705282111650834e-05, + "loss": 1.9343, + "step": 4450 + }, + { + "epoch": 1.3661755678330265, + "grad_norm": 0.5443231463432312, + "learning_rate": 9.705113959399217e-05, + "loss": 2.0428, + "step": 4451 + }, + { + "epoch": 1.3664825046040516, + 
"grad_norm": 0.5320110321044922, + "learning_rate": 9.704945760648905e-05, + "loss": 2.0015, + "step": 4452 + }, + { + "epoch": 1.3667894413750767, + "grad_norm": 0.5018410086631775, + "learning_rate": 9.704777515401561e-05, + "loss": 1.9284, + "step": 4453 + }, + { + "epoch": 1.3670963781461019, + "grad_norm": 0.4587440490722656, + "learning_rate": 9.704609223658848e-05, + "loss": 1.8945, + "step": 4454 + }, + { + "epoch": 1.367403314917127, + "grad_norm": 0.4634784758090973, + "learning_rate": 9.70444088542243e-05, + "loss": 1.9564, + "step": 4455 + }, + { + "epoch": 1.3677102516881523, + "grad_norm": 0.43047839403152466, + "learning_rate": 9.70427250069397e-05, + "loss": 2.0417, + "step": 4456 + }, + { + "epoch": 1.3680171884591774, + "grad_norm": 0.46661630272865295, + "learning_rate": 9.70410406947513e-05, + "loss": 2.0563, + "step": 4457 + }, + { + "epoch": 1.3683241252302025, + "grad_norm": 0.46544912457466125, + "learning_rate": 9.703935591767579e-05, + "loss": 2.0115, + "step": 4458 + }, + { + "epoch": 1.3686310620012279, + "grad_norm": 0.466172993183136, + "learning_rate": 9.703767067572977e-05, + "loss": 1.9177, + "step": 4459 + }, + { + "epoch": 1.368937998772253, + "grad_norm": 0.44513949751853943, + "learning_rate": 9.703598496892994e-05, + "loss": 1.9954, + "step": 4460 + }, + { + "epoch": 1.369244935543278, + "grad_norm": 0.4502551257610321, + "learning_rate": 9.703429879729293e-05, + "loss": 1.9155, + "step": 4461 + }, + { + "epoch": 1.3695518723143032, + "grad_norm": 0.4618416726589203, + "learning_rate": 9.703261216083541e-05, + "loss": 2.015, + "step": 4462 + }, + { + "epoch": 1.3698588090853283, + "grad_norm": 0.4691082239151001, + "learning_rate": 9.703092505957405e-05, + "loss": 2.0332, + "step": 4463 + }, + { + "epoch": 1.3701657458563536, + "grad_norm": 0.5674530863761902, + "learning_rate": 9.702923749352553e-05, + "loss": 2.0, + "step": 4464 + }, + { + "epoch": 1.3704726826273788, + "grad_norm": 0.5828661322593689, + "learning_rate": 
9.702754946270651e-05, + "loss": 1.9727, + "step": 4465 + }, + { + "epoch": 1.3707796193984039, + "grad_norm": 0.5861548781394958, + "learning_rate": 9.702586096713369e-05, + "loss": 2.0337, + "step": 4466 + }, + { + "epoch": 1.3710865561694292, + "grad_norm": 0.5607923865318298, + "learning_rate": 9.702417200682374e-05, + "loss": 1.9639, + "step": 4467 + }, + { + "epoch": 1.3713934929404543, + "grad_norm": 0.553827702999115, + "learning_rate": 9.702248258179337e-05, + "loss": 1.9644, + "step": 4468 + }, + { + "epoch": 1.3717004297114794, + "grad_norm": 0.6120470762252808, + "learning_rate": 9.702079269205925e-05, + "loss": 1.9562, + "step": 4469 + }, + { + "epoch": 1.3720073664825045, + "grad_norm": 0.6354473829269409, + "learning_rate": 9.70191023376381e-05, + "loss": 2.0984, + "step": 4470 + }, + { + "epoch": 1.3723143032535297, + "grad_norm": 0.5426626801490784, + "learning_rate": 9.701741151854665e-05, + "loss": 1.9473, + "step": 4471 + }, + { + "epoch": 1.372621240024555, + "grad_norm": 0.5632089376449585, + "learning_rate": 9.701572023480156e-05, + "loss": 2.0167, + "step": 4472 + }, + { + "epoch": 1.37292817679558, + "grad_norm": 0.5315039157867432, + "learning_rate": 9.701402848641957e-05, + "loss": 1.9537, + "step": 4473 + }, + { + "epoch": 1.3732351135666052, + "grad_norm": 0.4552931785583496, + "learning_rate": 9.70123362734174e-05, + "loss": 1.9553, + "step": 4474 + }, + { + "epoch": 1.3735420503376305, + "grad_norm": 0.49282166361808777, + "learning_rate": 9.701064359581176e-05, + "loss": 2.0409, + "step": 4475 + }, + { + "epoch": 1.3738489871086557, + "grad_norm": 0.46548575162887573, + "learning_rate": 9.700895045361939e-05, + "loss": 1.9707, + "step": 4476 + }, + { + "epoch": 1.3741559238796808, + "grad_norm": 0.4619027078151703, + "learning_rate": 9.7007256846857e-05, + "loss": 1.9531, + "step": 4477 + }, + { + "epoch": 1.3744628606507059, + "grad_norm": 0.5122626423835754, + "learning_rate": 9.700556277554138e-05, + "loss": 2.0625, + "step": 4478 
+ }, + { + "epoch": 1.374769797421731, + "grad_norm": 0.487246036529541, + "learning_rate": 9.700386823968922e-05, + "loss": 1.9667, + "step": 4479 + }, + { + "epoch": 1.3750767341927563, + "grad_norm": 0.5093865990638733, + "learning_rate": 9.700217323931729e-05, + "loss": 1.9982, + "step": 4480 + }, + { + "epoch": 1.3753836709637814, + "grad_norm": 0.47049981355667114, + "learning_rate": 9.700047777444232e-05, + "loss": 1.9876, + "step": 4481 + }, + { + "epoch": 1.3756906077348066, + "grad_norm": 0.4997411370277405, + "learning_rate": 9.699878184508109e-05, + "loss": 1.9925, + "step": 4482 + }, + { + "epoch": 1.375997544505832, + "grad_norm": 0.49374327063560486, + "learning_rate": 9.699708545125034e-05, + "loss": 1.9468, + "step": 4483 + }, + { + "epoch": 1.376304481276857, + "grad_norm": 0.44101378321647644, + "learning_rate": 9.699538859296686e-05, + "loss": 2.0577, + "step": 4484 + }, + { + "epoch": 1.3766114180478821, + "grad_norm": 0.47289925813674927, + "learning_rate": 9.699369127024741e-05, + "loss": 1.9611, + "step": 4485 + }, + { + "epoch": 1.3769183548189072, + "grad_norm": 0.4616342782974243, + "learning_rate": 9.699199348310875e-05, + "loss": 2.0196, + "step": 4486 + }, + { + "epoch": 1.3772252915899323, + "grad_norm": 0.45797309279441833, + "learning_rate": 9.699029523156766e-05, + "loss": 2.0168, + "step": 4487 + }, + { + "epoch": 1.3775322283609577, + "grad_norm": 0.5224477648735046, + "learning_rate": 9.698859651564095e-05, + "loss": 2.0312, + "step": 4488 + }, + { + "epoch": 1.3778391651319828, + "grad_norm": 0.4831027388572693, + "learning_rate": 9.698689733534539e-05, + "loss": 2.0084, + "step": 4489 + }, + { + "epoch": 1.378146101903008, + "grad_norm": 0.49492040276527405, + "learning_rate": 9.698519769069774e-05, + "loss": 1.9474, + "step": 4490 + }, + { + "epoch": 1.3784530386740332, + "grad_norm": 0.4911774694919586, + "learning_rate": 9.698349758171486e-05, + "loss": 1.987, + "step": 4491 + }, + { + "epoch": 1.3787599754450584, + 
"grad_norm": 0.5415390729904175, + "learning_rate": 9.69817970084135e-05, + "loss": 1.9927, + "step": 4492 + }, + { + "epoch": 1.3790669122160835, + "grad_norm": 0.6870381832122803, + "learning_rate": 9.698009597081048e-05, + "loss": 2.0348, + "step": 4493 + }, + { + "epoch": 1.3793738489871086, + "grad_norm": 0.6322616934776306, + "learning_rate": 9.697839446892263e-05, + "loss": 2.0119, + "step": 4494 + }, + { + "epoch": 1.3796807857581337, + "grad_norm": 0.5950151681900024, + "learning_rate": 9.697669250276675e-05, + "loss": 2.002, + "step": 4495 + }, + { + "epoch": 1.379987722529159, + "grad_norm": 0.4321151673793793, + "learning_rate": 9.697499007235966e-05, + "loss": 1.9173, + "step": 4496 + }, + { + "epoch": 1.3802946593001841, + "grad_norm": 0.4627344608306885, + "learning_rate": 9.697328717771818e-05, + "loss": 2.0289, + "step": 4497 + }, + { + "epoch": 1.3806015960712092, + "grad_norm": 0.5040726661682129, + "learning_rate": 9.697158381885915e-05, + "loss": 1.9844, + "step": 4498 + }, + { + "epoch": 1.3809085328422346, + "grad_norm": 0.5219398736953735, + "learning_rate": 9.696987999579939e-05, + "loss": 1.9536, + "step": 4499 + }, + { + "epoch": 1.3812154696132597, + "grad_norm": 0.487734317779541, + "learning_rate": 9.696817570855575e-05, + "loss": 1.9655, + "step": 4500 + }, + { + "epoch": 1.3815224063842848, + "grad_norm": 0.40818822383880615, + "learning_rate": 9.696647095714506e-05, + "loss": 1.9524, + "step": 4501 + }, + { + "epoch": 1.3818293431553101, + "grad_norm": 0.41752889752388, + "learning_rate": 9.69647657415842e-05, + "loss": 1.9927, + "step": 4502 + }, + { + "epoch": 1.3821362799263353, + "grad_norm": 0.44540464878082275, + "learning_rate": 9.696306006188998e-05, + "loss": 1.9207, + "step": 4503 + }, + { + "epoch": 1.3824432166973604, + "grad_norm": 0.44818806648254395, + "learning_rate": 9.696135391807927e-05, + "loss": 1.9054, + "step": 4504 + }, + { + "epoch": 1.3827501534683855, + "grad_norm": 0.430758535861969, + "learning_rate": 
9.695964731016896e-05, + "loss": 1.9644, + "step": 4505 + }, + { + "epoch": 1.3830570902394106, + "grad_norm": 0.3787635564804077, + "learning_rate": 9.695794023817586e-05, + "loss": 1.9601, + "step": 4506 + }, + { + "epoch": 1.383364027010436, + "grad_norm": 0.42520588636398315, + "learning_rate": 9.695623270211689e-05, + "loss": 1.9681, + "step": 4507 + }, + { + "epoch": 1.383670963781461, + "grad_norm": 0.39063912630081177, + "learning_rate": 9.69545247020089e-05, + "loss": 2.0323, + "step": 4508 + }, + { + "epoch": 1.3839779005524862, + "grad_norm": 0.41405799984931946, + "learning_rate": 9.695281623786879e-05, + "loss": 1.9239, + "step": 4509 + }, + { + "epoch": 1.3842848373235115, + "grad_norm": 0.4275501072406769, + "learning_rate": 9.695110730971342e-05, + "loss": 1.941, + "step": 4510 + }, + { + "epoch": 1.3845917740945366, + "grad_norm": 0.5254966616630554, + "learning_rate": 9.694939791755968e-05, + "loss": 1.9997, + "step": 4511 + }, + { + "epoch": 1.3848987108655617, + "grad_norm": 0.581857442855835, + "learning_rate": 9.694768806142448e-05, + "loss": 2.0085, + "step": 4512 + }, + { + "epoch": 1.3852056476365868, + "grad_norm": 0.6330662965774536, + "learning_rate": 9.69459777413247e-05, + "loss": 1.9898, + "step": 4513 + }, + { + "epoch": 1.385512584407612, + "grad_norm": 0.693536639213562, + "learning_rate": 9.694426695727727e-05, + "loss": 1.9466, + "step": 4514 + }, + { + "epoch": 1.3858195211786373, + "grad_norm": 0.6494079232215881, + "learning_rate": 9.694255570929906e-05, + "loss": 1.9523, + "step": 4515 + }, + { + "epoch": 1.3861264579496624, + "grad_norm": 0.573515772819519, + "learning_rate": 9.694084399740701e-05, + "loss": 1.9789, + "step": 4516 + }, + { + "epoch": 1.3864333947206875, + "grad_norm": 0.5253448486328125, + "learning_rate": 9.693913182161805e-05, + "loss": 2.0348, + "step": 4517 + }, + { + "epoch": 1.3867403314917128, + "grad_norm": 0.49921590089797974, + "learning_rate": 9.693741918194904e-05, + "loss": 1.9684, + "step": 
4518 + }, + { + "epoch": 1.387047268262738, + "grad_norm": 0.5164174437522888, + "learning_rate": 9.693570607841696e-05, + "loss": 2.0104, + "step": 4519 + }, + { + "epoch": 1.387354205033763, + "grad_norm": 0.5620231032371521, + "learning_rate": 9.693399251103872e-05, + "loss": 1.9969, + "step": 4520 + }, + { + "epoch": 1.3876611418047882, + "grad_norm": 0.495890349149704, + "learning_rate": 9.693227847983126e-05, + "loss": 2.0037, + "step": 4521 + }, + { + "epoch": 1.3879680785758133, + "grad_norm": 0.4942645728588104, + "learning_rate": 9.693056398481151e-05, + "loss": 2.0199, + "step": 4522 + }, + { + "epoch": 1.3882750153468386, + "grad_norm": 0.5366860628128052, + "learning_rate": 9.692884902599643e-05, + "loss": 2.0395, + "step": 4523 + }, + { + "epoch": 1.3885819521178637, + "grad_norm": 0.48179951310157776, + "learning_rate": 9.692713360340295e-05, + "loss": 2.0292, + "step": 4524 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.4709320366382599, + "learning_rate": 9.692541771704804e-05, + "loss": 2.006, + "step": 4525 + }, + { + "epoch": 1.3891958256599142, + "grad_norm": 0.4311975836753845, + "learning_rate": 9.692370136694864e-05, + "loss": 2.0122, + "step": 4526 + }, + { + "epoch": 1.3895027624309393, + "grad_norm": 0.4489841163158417, + "learning_rate": 9.692198455312172e-05, + "loss": 1.9635, + "step": 4527 + }, + { + "epoch": 1.3898096992019644, + "grad_norm": 0.40383243560791016, + "learning_rate": 9.692026727558425e-05, + "loss": 1.9352, + "step": 4528 + }, + { + "epoch": 1.3901166359729895, + "grad_norm": 0.4732677638530731, + "learning_rate": 9.691854953435319e-05, + "loss": 1.9882, + "step": 4529 + }, + { + "epoch": 1.3904235727440146, + "grad_norm": 0.5124688744544983, + "learning_rate": 9.691683132944553e-05, + "loss": 2.0068, + "step": 4530 + }, + { + "epoch": 1.39073050951504, + "grad_norm": 0.4810490906238556, + "learning_rate": 9.691511266087824e-05, + "loss": 2.0163, + "step": 4531 + }, + { + "epoch": 1.391037446286065, + 
"grad_norm": 0.4019710421562195, + "learning_rate": 9.691339352866831e-05, + "loss": 1.8943, + "step": 4532 + }, + { + "epoch": 1.3913443830570902, + "grad_norm": 0.4144287705421448, + "learning_rate": 9.691167393283274e-05, + "loss": 1.9457, + "step": 4533 + }, + { + "epoch": 1.3916513198281155, + "grad_norm": 0.42622655630111694, + "learning_rate": 9.690995387338851e-05, + "loss": 1.9618, + "step": 4534 + }, + { + "epoch": 1.3919582565991406, + "grad_norm": 0.4547794461250305, + "learning_rate": 9.690823335035259e-05, + "loss": 2.0243, + "step": 4535 + }, + { + "epoch": 1.3922651933701657, + "grad_norm": 0.4298909604549408, + "learning_rate": 9.690651236374205e-05, + "loss": 1.9872, + "step": 4536 + }, + { + "epoch": 1.3925721301411909, + "grad_norm": 0.40463829040527344, + "learning_rate": 9.690479091357386e-05, + "loss": 1.9617, + "step": 4537 + }, + { + "epoch": 1.392879066912216, + "grad_norm": 0.441487580537796, + "learning_rate": 9.690306899986502e-05, + "loss": 1.8965, + "step": 4538 + }, + { + "epoch": 1.3931860036832413, + "grad_norm": 0.4713582694530487, + "learning_rate": 9.690134662263256e-05, + "loss": 2.0112, + "step": 4539 + }, + { + "epoch": 1.3934929404542664, + "grad_norm": 0.5772922039031982, + "learning_rate": 9.689962378189351e-05, + "loss": 1.9903, + "step": 4540 + }, + { + "epoch": 1.3937998772252915, + "grad_norm": 0.6658890247344971, + "learning_rate": 9.689790047766489e-05, + "loss": 2.0569, + "step": 4541 + }, + { + "epoch": 1.3941068139963169, + "grad_norm": 0.6710116267204285, + "learning_rate": 9.689617670996372e-05, + "loss": 1.9692, + "step": 4542 + }, + { + "epoch": 1.394413750767342, + "grad_norm": 0.6778390407562256, + "learning_rate": 9.689445247880707e-05, + "loss": 2.0363, + "step": 4543 + }, + { + "epoch": 1.394720687538367, + "grad_norm": 0.6921203136444092, + "learning_rate": 9.689272778421192e-05, + "loss": 2.0104, + "step": 4544 + }, + { + "epoch": 1.3950276243093922, + "grad_norm": 0.48772117495536804, + 
"learning_rate": 9.689100262619537e-05, + "loss": 2.0006, + "step": 4545 + }, + { + "epoch": 1.3953345610804173, + "grad_norm": 0.4956360459327698, + "learning_rate": 9.688927700477445e-05, + "loss": 1.9724, + "step": 4546 + }, + { + "epoch": 1.3956414978514426, + "grad_norm": 0.6304072141647339, + "learning_rate": 9.68875509199662e-05, + "loss": 1.9904, + "step": 4547 + }, + { + "epoch": 1.3959484346224678, + "grad_norm": 0.6372275948524475, + "learning_rate": 9.68858243717877e-05, + "loss": 2.0328, + "step": 4548 + }, + { + "epoch": 1.3962553713934929, + "grad_norm": 0.48642870783805847, + "learning_rate": 9.688409736025601e-05, + "loss": 1.9898, + "step": 4549 + }, + { + "epoch": 1.3965623081645182, + "grad_norm": 0.41096800565719604, + "learning_rate": 9.688236988538817e-05, + "loss": 1.8945, + "step": 4550 + }, + { + "epoch": 1.3968692449355433, + "grad_norm": 0.48746830224990845, + "learning_rate": 9.68806419472013e-05, + "loss": 1.9809, + "step": 4551 + }, + { + "epoch": 1.3971761817065684, + "grad_norm": 0.5296676754951477, + "learning_rate": 9.687891354571242e-05, + "loss": 1.9194, + "step": 4552 + }, + { + "epoch": 1.3974831184775935, + "grad_norm": 0.43177086114883423, + "learning_rate": 9.687718468093865e-05, + "loss": 1.8785, + "step": 4553 + }, + { + "epoch": 1.3977900552486187, + "grad_norm": 0.4617565870285034, + "learning_rate": 9.687545535289705e-05, + "loss": 2.0021, + "step": 4554 + }, + { + "epoch": 1.398096992019644, + "grad_norm": 0.4460168182849884, + "learning_rate": 9.687372556160477e-05, + "loss": 1.9368, + "step": 4555 + }, + { + "epoch": 1.398403928790669, + "grad_norm": 0.5051010847091675, + "learning_rate": 9.687199530707882e-05, + "loss": 2.0321, + "step": 4556 + }, + { + "epoch": 1.3987108655616942, + "grad_norm": 0.5623685717582703, + "learning_rate": 9.687026458933636e-05, + "loss": 2.007, + "step": 4557 + }, + { + "epoch": 1.3990178023327196, + "grad_norm": 0.48149919509887695, + "learning_rate": 9.686853340839446e-05, + "loss": 
1.9346, + "step": 4558 + }, + { + "epoch": 1.3993247391037447, + "grad_norm": 0.4651631712913513, + "learning_rate": 9.686680176427025e-05, + "loss": 1.9603, + "step": 4559 + }, + { + "epoch": 1.3996316758747698, + "grad_norm": 0.5255021452903748, + "learning_rate": 9.686506965698083e-05, + "loss": 2.0206, + "step": 4560 + }, + { + "epoch": 1.3999386126457949, + "grad_norm": 0.5137404799461365, + "learning_rate": 9.686333708654334e-05, + "loss": 1.9736, + "step": 4561 + }, + { + "epoch": 1.40024554941682, + "grad_norm": 0.5037943124771118, + "learning_rate": 9.686160405297487e-05, + "loss": 1.9886, + "step": 4562 + }, + { + "epoch": 1.4005524861878453, + "grad_norm": 0.46424365043640137, + "learning_rate": 9.685987055629256e-05, + "loss": 1.9316, + "step": 4563 + }, + { + "epoch": 1.4008594229588704, + "grad_norm": 0.4839535355567932, + "learning_rate": 9.685813659651355e-05, + "loss": 1.9651, + "step": 4564 + }, + { + "epoch": 1.4011663597298956, + "grad_norm": 0.48972323536872864, + "learning_rate": 9.685640217365497e-05, + "loss": 1.9544, + "step": 4565 + }, + { + "epoch": 1.401473296500921, + "grad_norm": 0.43038102984428406, + "learning_rate": 9.685466728773396e-05, + "loss": 1.9522, + "step": 4566 + }, + { + "epoch": 1.401780233271946, + "grad_norm": 0.5174641013145447, + "learning_rate": 9.685293193876765e-05, + "loss": 2.046, + "step": 4567 + }, + { + "epoch": 1.4020871700429711, + "grad_norm": 0.6731263995170593, + "learning_rate": 9.685119612677323e-05, + "loss": 2.0123, + "step": 4568 + }, + { + "epoch": 1.4023941068139965, + "grad_norm": 0.5863515734672546, + "learning_rate": 9.684945985176782e-05, + "loss": 1.9951, + "step": 4569 + }, + { + "epoch": 1.4027010435850216, + "grad_norm": 0.4479050934314728, + "learning_rate": 9.684772311376859e-05, + "loss": 1.9287, + "step": 4570 + }, + { + "epoch": 1.4030079803560467, + "grad_norm": 0.432740718126297, + "learning_rate": 9.68459859127927e-05, + "loss": 1.955, + "step": 4571 + }, + { + "epoch": 
1.4033149171270718, + "grad_norm": 0.571775496006012, + "learning_rate": 9.684424824885731e-05, + "loss": 1.9519, + "step": 4572 + }, + { + "epoch": 1.403621853898097, + "grad_norm": 0.6454880237579346, + "learning_rate": 9.684251012197963e-05, + "loss": 1.9858, + "step": 4573 + }, + { + "epoch": 1.4039287906691222, + "grad_norm": 0.5274731516838074, + "learning_rate": 9.684077153217677e-05, + "loss": 1.9956, + "step": 4574 + }, + { + "epoch": 1.4042357274401474, + "grad_norm": 0.4459272027015686, + "learning_rate": 9.683903247946597e-05, + "loss": 2.0412, + "step": 4575 + }, + { + "epoch": 1.4045426642111725, + "grad_norm": 0.47089213132858276, + "learning_rate": 9.683729296386441e-05, + "loss": 1.9247, + "step": 4576 + }, + { + "epoch": 1.4048496009821978, + "grad_norm": 0.628490149974823, + "learning_rate": 9.683555298538927e-05, + "loss": 2.1311, + "step": 4577 + }, + { + "epoch": 1.405156537753223, + "grad_norm": 0.5498626232147217, + "learning_rate": 9.683381254405773e-05, + "loss": 1.9538, + "step": 4578 + }, + { + "epoch": 1.405463474524248, + "grad_norm": 0.4556458294391632, + "learning_rate": 9.6832071639887e-05, + "loss": 1.9957, + "step": 4579 + }, + { + "epoch": 1.4057704112952731, + "grad_norm": 0.5684164762496948, + "learning_rate": 9.68303302728943e-05, + "loss": 1.9339, + "step": 4580 + }, + { + "epoch": 1.4060773480662982, + "grad_norm": 0.5723292231559753, + "learning_rate": 9.682858844309682e-05, + "loss": 2.0043, + "step": 4581 + }, + { + "epoch": 1.4063842848373236, + "grad_norm": 0.4734770953655243, + "learning_rate": 9.682684615051178e-05, + "loss": 1.9854, + "step": 4582 + }, + { + "epoch": 1.4066912216083487, + "grad_norm": 0.49376189708709717, + "learning_rate": 9.682510339515642e-05, + "loss": 2.0436, + "step": 4583 + }, + { + "epoch": 1.4069981583793738, + "grad_norm": 0.6263520121574402, + "learning_rate": 9.682336017704793e-05, + "loss": 1.9426, + "step": 4584 + }, + { + "epoch": 1.4073050951503991, + "grad_norm": 0.5852357745170593, 
+ "learning_rate": 9.682161649620355e-05, + "loss": 1.9865, + "step": 4585 + }, + { + "epoch": 1.4076120319214243, + "grad_norm": 0.45548367500305176, + "learning_rate": 9.681987235264052e-05, + "loss": 2.0454, + "step": 4586 + }, + { + "epoch": 1.4079189686924494, + "grad_norm": 0.4961472153663635, + "learning_rate": 9.681812774637607e-05, + "loss": 2.0414, + "step": 4587 + }, + { + "epoch": 1.4082259054634745, + "grad_norm": 0.5739028453826904, + "learning_rate": 9.681638267742741e-05, + "loss": 1.9591, + "step": 4588 + }, + { + "epoch": 1.4085328422344996, + "grad_norm": 0.546283483505249, + "learning_rate": 9.681463714581184e-05, + "loss": 1.9631, + "step": 4589 + }, + { + "epoch": 1.408839779005525, + "grad_norm": 0.4757421910762787, + "learning_rate": 9.681289115154659e-05, + "loss": 1.954, + "step": 4590 + }, + { + "epoch": 1.40914671577655, + "grad_norm": 0.5116898417472839, + "learning_rate": 9.681114469464891e-05, + "loss": 1.9816, + "step": 4591 + }, + { + "epoch": 1.4094536525475752, + "grad_norm": 0.6128544807434082, + "learning_rate": 9.680939777513607e-05, + "loss": 1.9408, + "step": 4592 + }, + { + "epoch": 1.4097605893186005, + "grad_norm": 0.5577036142349243, + "learning_rate": 9.680765039302531e-05, + "loss": 1.906, + "step": 4593 + }, + { + "epoch": 1.4100675260896256, + "grad_norm": 0.4608074128627777, + "learning_rate": 9.680590254833393e-05, + "loss": 1.9421, + "step": 4594 + }, + { + "epoch": 1.4103744628606507, + "grad_norm": 0.4221206307411194, + "learning_rate": 9.680415424107917e-05, + "loss": 1.9596, + "step": 4595 + }, + { + "epoch": 1.4106813996316758, + "grad_norm": 0.4278069734573364, + "learning_rate": 9.680240547127832e-05, + "loss": 1.9718, + "step": 4596 + }, + { + "epoch": 1.410988336402701, + "grad_norm": 0.48608019948005676, + "learning_rate": 9.680065623894869e-05, + "loss": 2.0595, + "step": 4597 + }, + { + "epoch": 1.4112952731737263, + "grad_norm": 0.4559817910194397, + "learning_rate": 9.679890654410753e-05, + "loss": 
1.959, + "step": 4598 + }, + { + "epoch": 1.4116022099447514, + "grad_norm": 0.5122750997543335, + "learning_rate": 9.679715638677216e-05, + "loss": 2.0669, + "step": 4599 + }, + { + "epoch": 1.4119091467157765, + "grad_norm": 0.5203170776367188, + "learning_rate": 9.679540576695985e-05, + "loss": 1.9475, + "step": 4600 + }, + { + "epoch": 1.4122160834868018, + "grad_norm": 0.5420581698417664, + "learning_rate": 9.679365468468791e-05, + "loss": 1.9603, + "step": 4601 + }, + { + "epoch": 1.412523020257827, + "grad_norm": 0.527387261390686, + "learning_rate": 9.679190313997364e-05, + "loss": 1.9172, + "step": 4602 + }, + { + "epoch": 1.412829957028852, + "grad_norm": 0.48417946696281433, + "learning_rate": 9.679015113283438e-05, + "loss": 1.9619, + "step": 4603 + }, + { + "epoch": 1.4131368937998772, + "grad_norm": 0.49174100160598755, + "learning_rate": 9.678839866328742e-05, + "loss": 1.9959, + "step": 4604 + }, + { + "epoch": 1.4134438305709023, + "grad_norm": 0.5096092224121094, + "learning_rate": 9.678664573135006e-05, + "loss": 2.0046, + "step": 4605 + }, + { + "epoch": 1.4137507673419276, + "grad_norm": 0.4536958634853363, + "learning_rate": 9.678489233703965e-05, + "loss": 1.9289, + "step": 4606 + }, + { + "epoch": 1.4140577041129527, + "grad_norm": 0.40438196063041687, + "learning_rate": 9.678313848037353e-05, + "loss": 1.9488, + "step": 4607 + }, + { + "epoch": 1.4143646408839778, + "grad_norm": 0.4447456896305084, + "learning_rate": 9.6781384161369e-05, + "loss": 1.9638, + "step": 4608 + }, + { + "epoch": 1.4146715776550032, + "grad_norm": 0.44451746344566345, + "learning_rate": 9.677962938004342e-05, + "loss": 1.9026, + "step": 4609 + }, + { + "epoch": 1.4149785144260283, + "grad_norm": 0.4262266457080841, + "learning_rate": 9.677787413641412e-05, + "loss": 1.9408, + "step": 4610 + }, + { + "epoch": 1.4152854511970534, + "grad_norm": 0.42755937576293945, + "learning_rate": 9.677611843049845e-05, + "loss": 1.9542, + "step": 4611 + }, + { + "epoch": 
1.4155923879680785, + "grad_norm": 0.43264830112457275, + "learning_rate": 9.677436226231375e-05, + "loss": 2.0244, + "step": 4612 + }, + { + "epoch": 1.4158993247391036, + "grad_norm": 0.4521278142929077, + "learning_rate": 9.67726056318774e-05, + "loss": 2.0343, + "step": 4613 + }, + { + "epoch": 1.416206261510129, + "grad_norm": 0.45257535576820374, + "learning_rate": 9.677084853920675e-05, + "loss": 1.9743, + "step": 4614 + }, + { + "epoch": 1.416513198281154, + "grad_norm": 0.42859771847724915, + "learning_rate": 9.676909098431915e-05, + "loss": 2.0067, + "step": 4615 + }, + { + "epoch": 1.4168201350521792, + "grad_norm": 0.4057050049304962, + "learning_rate": 9.6767332967232e-05, + "loss": 1.9074, + "step": 4616 + }, + { + "epoch": 1.4171270718232045, + "grad_norm": 0.46177807450294495, + "learning_rate": 9.676557448796264e-05, + "loss": 1.9899, + "step": 4617 + }, + { + "epoch": 1.4174340085942296, + "grad_norm": 0.44164395332336426, + "learning_rate": 9.676381554652846e-05, + "loss": 1.9759, + "step": 4618 + }, + { + "epoch": 1.4177409453652547, + "grad_norm": 0.42987993359565735, + "learning_rate": 9.676205614294684e-05, + "loss": 1.8783, + "step": 4619 + }, + { + "epoch": 1.4180478821362799, + "grad_norm": 0.541702389717102, + "learning_rate": 9.67602962772352e-05, + "loss": 2.0099, + "step": 4620 + }, + { + "epoch": 1.418354818907305, + "grad_norm": 0.42173272371292114, + "learning_rate": 9.67585359494109e-05, + "loss": 1.9281, + "step": 4621 + }, + { + "epoch": 1.4186617556783303, + "grad_norm": 0.432476669549942, + "learning_rate": 9.67567751594913e-05, + "loss": 1.9124, + "step": 4622 + }, + { + "epoch": 1.4189686924493554, + "grad_norm": 0.4952125549316406, + "learning_rate": 9.675501390749388e-05, + "loss": 1.973, + "step": 4623 + }, + { + "epoch": 1.4192756292203805, + "grad_norm": 0.5270698070526123, + "learning_rate": 9.6753252193436e-05, + "loss": 2.003, + "step": 4624 + }, + { + "epoch": 1.4195825659914059, + "grad_norm": 0.5735524892807007, + 
"learning_rate": 9.67514900173351e-05, + "loss": 1.9266, + "step": 4625 + }, + { + "epoch": 1.419889502762431, + "grad_norm": 0.508196234703064, + "learning_rate": 9.674972737920855e-05, + "loss": 1.9633, + "step": 4626 + }, + { + "epoch": 1.420196439533456, + "grad_norm": 0.4321250319480896, + "learning_rate": 9.674796427907379e-05, + "loss": 1.9994, + "step": 4627 + }, + { + "epoch": 1.4205033763044812, + "grad_norm": 0.5697643756866455, + "learning_rate": 9.674620071694826e-05, + "loss": 2.0018, + "step": 4628 + }, + { + "epoch": 1.4208103130755063, + "grad_norm": 0.6797513365745544, + "learning_rate": 9.674443669284936e-05, + "loss": 2.0514, + "step": 4629 + }, + { + "epoch": 1.4211172498465316, + "grad_norm": 0.6622742414474487, + "learning_rate": 9.674267220679456e-05, + "loss": 1.9315, + "step": 4630 + }, + { + "epoch": 1.4214241866175568, + "grad_norm": 0.5143589377403259, + "learning_rate": 9.674090725880125e-05, + "loss": 1.9691, + "step": 4631 + }, + { + "epoch": 1.4217311233885819, + "grad_norm": 0.4472220838069916, + "learning_rate": 9.673914184888692e-05, + "loss": 1.9629, + "step": 4632 + }, + { + "epoch": 1.4220380601596072, + "grad_norm": 0.4992378354072571, + "learning_rate": 9.6737375977069e-05, + "loss": 1.9202, + "step": 4633 + }, + { + "epoch": 1.4223449969306323, + "grad_norm": 0.5463345646858215, + "learning_rate": 9.673560964336493e-05, + "loss": 2.0143, + "step": 4634 + }, + { + "epoch": 1.4226519337016574, + "grad_norm": 0.4566437304019928, + "learning_rate": 9.673384284779217e-05, + "loss": 1.8907, + "step": 4635 + }, + { + "epoch": 1.4229588704726825, + "grad_norm": 0.41718652844429016, + "learning_rate": 9.673207559036816e-05, + "loss": 1.8955, + "step": 4636 + }, + { + "epoch": 1.4232658072437077, + "grad_norm": 0.5017329454421997, + "learning_rate": 9.673030787111043e-05, + "loss": 1.9745, + "step": 4637 + }, + { + "epoch": 1.423572744014733, + "grad_norm": 0.48890092968940735, + "learning_rate": 9.67285396900364e-05, + "loss": 
1.9448, + "step": 4638 + }, + { + "epoch": 1.423879680785758, + "grad_norm": 0.4519537687301636, + "learning_rate": 9.672677104716352e-05, + "loss": 1.9572, + "step": 4639 + }, + { + "epoch": 1.4241866175567832, + "grad_norm": 0.4786919355392456, + "learning_rate": 9.672500194250932e-05, + "loss": 2.0212, + "step": 4640 + }, + { + "epoch": 1.4244935543278086, + "grad_norm": 0.4938487112522125, + "learning_rate": 9.672323237609127e-05, + "loss": 1.9842, + "step": 4641 + }, + { + "epoch": 1.4248004910988337, + "grad_norm": 0.5786599516868591, + "learning_rate": 9.672146234792686e-05, + "loss": 1.9575, + "step": 4642 + }, + { + "epoch": 1.4251074278698588, + "grad_norm": 0.5532247424125671, + "learning_rate": 9.671969185803356e-05, + "loss": 1.9972, + "step": 4643 + }, + { + "epoch": 1.4254143646408841, + "grad_norm": 0.5058014988899231, + "learning_rate": 9.671792090642889e-05, + "loss": 2.0042, + "step": 4644 + }, + { + "epoch": 1.4257213014119092, + "grad_norm": 0.46545106172561646, + "learning_rate": 9.671614949313033e-05, + "loss": 1.9853, + "step": 4645 + }, + { + "epoch": 1.4260282381829343, + "grad_norm": 0.47626879811286926, + "learning_rate": 9.671437761815541e-05, + "loss": 1.9725, + "step": 4646 + }, + { + "epoch": 1.4263351749539595, + "grad_norm": 0.4476237893104553, + "learning_rate": 9.671260528152165e-05, + "loss": 1.8876, + "step": 4647 + }, + { + "epoch": 1.4266421117249846, + "grad_norm": 0.4290693700313568, + "learning_rate": 9.671083248324651e-05, + "loss": 1.9766, + "step": 4648 + }, + { + "epoch": 1.42694904849601, + "grad_norm": 0.443131685256958, + "learning_rate": 9.670905922334757e-05, + "loss": 2.0201, + "step": 4649 + }, + { + "epoch": 1.427255985267035, + "grad_norm": 0.5181389451026917, + "learning_rate": 9.670728550184231e-05, + "loss": 2.0013, + "step": 4650 + }, + { + "epoch": 1.4275629220380601, + "grad_norm": 0.48453402519226074, + "learning_rate": 9.670551131874829e-05, + "loss": 1.9536, + "step": 4651 + }, + { + "epoch": 
1.4278698588090855, + "grad_norm": 0.49652302265167236, + "learning_rate": 9.670373667408303e-05, + "loss": 1.9934, + "step": 4652 + }, + { + "epoch": 1.4281767955801106, + "grad_norm": 0.47071191668510437, + "learning_rate": 9.670196156786406e-05, + "loss": 2.0319, + "step": 4653 + }, + { + "epoch": 1.4284837323511357, + "grad_norm": 0.46828708052635193, + "learning_rate": 9.670018600010894e-05, + "loss": 1.9248, + "step": 4654 + }, + { + "epoch": 1.4287906691221608, + "grad_norm": 0.48472490906715393, + "learning_rate": 9.669840997083524e-05, + "loss": 1.9681, + "step": 4655 + }, + { + "epoch": 1.429097605893186, + "grad_norm": 0.48628562688827515, + "learning_rate": 9.669663348006044e-05, + "loss": 1.9818, + "step": 4656 + }, + { + "epoch": 1.4294045426642112, + "grad_norm": 0.40770742297172546, + "learning_rate": 9.669485652780215e-05, + "loss": 1.927, + "step": 4657 + }, + { + "epoch": 1.4297114794352364, + "grad_norm": 0.5005267858505249, + "learning_rate": 9.669307911407794e-05, + "loss": 2.0564, + "step": 4658 + }, + { + "epoch": 1.4300184162062615, + "grad_norm": 0.42432111501693726, + "learning_rate": 9.669130123890533e-05, + "loss": 1.9344, + "step": 4659 + }, + { + "epoch": 1.4303253529772868, + "grad_norm": 0.42347240447998047, + "learning_rate": 9.668952290230192e-05, + "loss": 1.962, + "step": 4660 + }, + { + "epoch": 1.430632289748312, + "grad_norm": 0.4718005955219269, + "learning_rate": 9.668774410428529e-05, + "loss": 2.0081, + "step": 4661 + }, + { + "epoch": 1.430939226519337, + "grad_norm": 0.45922374725341797, + "learning_rate": 9.6685964844873e-05, + "loss": 1.9378, + "step": 4662 + }, + { + "epoch": 1.4312461632903621, + "grad_norm": 0.43764227628707886, + "learning_rate": 9.668418512408263e-05, + "loss": 2.0084, + "step": 4663 + }, + { + "epoch": 1.4315531000613873, + "grad_norm": 0.42079678177833557, + "learning_rate": 9.668240494193179e-05, + "loss": 1.9675, + "step": 4664 + }, + { + "epoch": 1.4318600368324126, + "grad_norm": 
0.4470539093017578, + "learning_rate": 9.668062429843808e-05, + "loss": 1.9781, + "step": 4665 + }, + { + "epoch": 1.4321669736034377, + "grad_norm": 0.4903084337711334, + "learning_rate": 9.667884319361906e-05, + "loss": 1.9612, + "step": 4666 + }, + { + "epoch": 1.4324739103744628, + "grad_norm": 0.4906228482723236, + "learning_rate": 9.667706162749234e-05, + "loss": 2.0115, + "step": 4667 + }, + { + "epoch": 1.4327808471454881, + "grad_norm": 0.4868105351924896, + "learning_rate": 9.667527960007556e-05, + "loss": 1.9648, + "step": 4668 + }, + { + "epoch": 1.4330877839165133, + "grad_norm": 0.5115882754325867, + "learning_rate": 9.667349711138632e-05, + "loss": 2.0366, + "step": 4669 + }, + { + "epoch": 1.4333947206875384, + "grad_norm": 0.47366276383399963, + "learning_rate": 9.66717141614422e-05, + "loss": 1.9467, + "step": 4670 + }, + { + "epoch": 1.4337016574585635, + "grad_norm": 0.6110171675682068, + "learning_rate": 9.666993075026086e-05, + "loss": 1.9272, + "step": 4671 + }, + { + "epoch": 1.4340085942295886, + "grad_norm": 0.5915683507919312, + "learning_rate": 9.66681468778599e-05, + "loss": 2.0444, + "step": 4672 + }, + { + "epoch": 1.434315531000614, + "grad_norm": 0.5783519744873047, + "learning_rate": 9.666636254425697e-05, + "loss": 1.9579, + "step": 4673 + }, + { + "epoch": 1.434622467771639, + "grad_norm": 0.4646502137184143, + "learning_rate": 9.66645777494697e-05, + "loss": 1.9172, + "step": 4674 + }, + { + "epoch": 1.4349294045426642, + "grad_norm": 0.4184744656085968, + "learning_rate": 9.666279249351571e-05, + "loss": 1.9189, + "step": 4675 + }, + { + "epoch": 1.4352363413136895, + "grad_norm": 0.5444575548171997, + "learning_rate": 9.666100677641266e-05, + "loss": 2.045, + "step": 4676 + }, + { + "epoch": 1.4355432780847146, + "grad_norm": 0.5232846140861511, + "learning_rate": 9.665922059817818e-05, + "loss": 2.0059, + "step": 4677 + }, + { + "epoch": 1.4358502148557397, + "grad_norm": 0.439259797334671, + "learning_rate": 
9.665743395882994e-05, + "loss": 1.9164, + "step": 4678 + }, + { + "epoch": 1.4361571516267648, + "grad_norm": 0.405073344707489, + "learning_rate": 9.66556468583856e-05, + "loss": 1.9211, + "step": 4679 + }, + { + "epoch": 1.43646408839779, + "grad_norm": 0.47113174200057983, + "learning_rate": 9.665385929686279e-05, + "loss": 2.0732, + "step": 4680 + }, + { + "epoch": 1.4367710251688153, + "grad_norm": 0.4710143506526947, + "learning_rate": 9.665207127427923e-05, + "loss": 1.9153, + "step": 4681 + }, + { + "epoch": 1.4370779619398404, + "grad_norm": 0.41988152265548706, + "learning_rate": 9.665028279065254e-05, + "loss": 1.9985, + "step": 4682 + }, + { + "epoch": 1.4373848987108655, + "grad_norm": 0.4629889130592346, + "learning_rate": 9.664849384600042e-05, + "loss": 2.0188, + "step": 4683 + }, + { + "epoch": 1.4376918354818908, + "grad_norm": 0.42099106311798096, + "learning_rate": 9.664670444034051e-05, + "loss": 1.8915, + "step": 4684 + }, + { + "epoch": 1.437998772252916, + "grad_norm": 0.4132508337497711, + "learning_rate": 9.664491457369056e-05, + "loss": 1.9842, + "step": 4685 + }, + { + "epoch": 1.438305709023941, + "grad_norm": 0.4019499123096466, + "learning_rate": 9.664312424606822e-05, + "loss": 1.8653, + "step": 4686 + }, + { + "epoch": 1.4386126457949662, + "grad_norm": 0.40366294980049133, + "learning_rate": 9.664133345749118e-05, + "loss": 1.8993, + "step": 4687 + }, + { + "epoch": 1.4389195825659913, + "grad_norm": 0.4391988217830658, + "learning_rate": 9.663954220797715e-05, + "loss": 1.9471, + "step": 4688 + }, + { + "epoch": 1.4392265193370166, + "grad_norm": 0.44109684228897095, + "learning_rate": 9.663775049754382e-05, + "loss": 1.9579, + "step": 4689 + }, + { + "epoch": 1.4395334561080417, + "grad_norm": 0.45682960748672485, + "learning_rate": 9.663595832620891e-05, + "loss": 1.9757, + "step": 4690 + }, + { + "epoch": 1.4398403928790668, + "grad_norm": 0.4106207489967346, + "learning_rate": 9.663416569399013e-05, + "loss": 2.0038, + 
"step": 4691 + }, + { + "epoch": 1.4401473296500922, + "grad_norm": 0.4627512991428375, + "learning_rate": 9.66323726009052e-05, + "loss": 2.0253, + "step": 4692 + }, + { + "epoch": 1.4404542664211173, + "grad_norm": 0.43822941184043884, + "learning_rate": 9.663057904697182e-05, + "loss": 1.9565, + "step": 4693 + }, + { + "epoch": 1.4407612031921424, + "grad_norm": 0.46254315972328186, + "learning_rate": 9.662878503220772e-05, + "loss": 2.0042, + "step": 4694 + }, + { + "epoch": 1.4410681399631675, + "grad_norm": 0.49801671504974365, + "learning_rate": 9.662699055663065e-05, + "loss": 1.9725, + "step": 4695 + }, + { + "epoch": 1.4413750767341926, + "grad_norm": 0.40280646085739136, + "learning_rate": 9.662519562025832e-05, + "loss": 1.9016, + "step": 4696 + }, + { + "epoch": 1.441682013505218, + "grad_norm": 0.4095497131347656, + "learning_rate": 9.662340022310848e-05, + "loss": 2.0054, + "step": 4697 + }, + { + "epoch": 1.441988950276243, + "grad_norm": 0.44916659593582153, + "learning_rate": 9.662160436519889e-05, + "loss": 2.0126, + "step": 4698 + }, + { + "epoch": 1.4422958870472682, + "grad_norm": 0.47450655698776245, + "learning_rate": 9.661980804654725e-05, + "loss": 1.9679, + "step": 4699 + }, + { + "epoch": 1.4426028238182935, + "grad_norm": 0.4454696774482727, + "learning_rate": 9.661801126717136e-05, + "loss": 1.9335, + "step": 4700 + }, + { + "epoch": 1.4429097605893186, + "grad_norm": 0.5009927153587341, + "learning_rate": 9.661621402708896e-05, + "loss": 1.9777, + "step": 4701 + }, + { + "epoch": 1.4432166973603437, + "grad_norm": 0.49912458658218384, + "learning_rate": 9.66144163263178e-05, + "loss": 2.0095, + "step": 4702 + }, + { + "epoch": 1.4435236341313689, + "grad_norm": 0.4477069079875946, + "learning_rate": 9.661261816487568e-05, + "loss": 1.9265, + "step": 4703 + }, + { + "epoch": 1.443830570902394, + "grad_norm": 0.4170798361301422, + "learning_rate": 9.661081954278033e-05, + "loss": 1.9458, + "step": 4704 + }, + { + "epoch": 
1.4441375076734193, + "grad_norm": 0.45160573720932007, + "learning_rate": 9.660902046004953e-05, + "loss": 1.9596, + "step": 4705 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4391551911830902, + "learning_rate": 9.660722091670109e-05, + "loss": 1.9158, + "step": 4706 + }, + { + "epoch": 1.4447513812154695, + "grad_norm": 0.5183218121528625, + "learning_rate": 9.660542091275276e-05, + "loss": 2.0055, + "step": 4707 + }, + { + "epoch": 1.4450583179864949, + "grad_norm": 0.49749481678009033, + "learning_rate": 9.660362044822235e-05, + "loss": 1.9695, + "step": 4708 + }, + { + "epoch": 1.44536525475752, + "grad_norm": 0.4839307963848114, + "learning_rate": 9.660181952312766e-05, + "loss": 1.9447, + "step": 4709 + }, + { + "epoch": 1.445672191528545, + "grad_norm": 0.5218588709831238, + "learning_rate": 9.660001813748647e-05, + "loss": 1.9892, + "step": 4710 + }, + { + "epoch": 1.4459791282995704, + "grad_norm": 0.5628986954689026, + "learning_rate": 9.659821629131658e-05, + "loss": 2.0598, + "step": 4711 + }, + { + "epoch": 1.4462860650705955, + "grad_norm": 0.5226300358772278, + "learning_rate": 9.65964139846358e-05, + "loss": 1.977, + "step": 4712 + }, + { + "epoch": 1.4465930018416207, + "grad_norm": 0.4345463216304779, + "learning_rate": 9.659461121746196e-05, + "loss": 1.9649, + "step": 4713 + }, + { + "epoch": 1.4468999386126458, + "grad_norm": 0.47233885526657104, + "learning_rate": 9.659280798981285e-05, + "loss": 1.9791, + "step": 4714 + }, + { + "epoch": 1.4472068753836709, + "grad_norm": 0.5272542238235474, + "learning_rate": 9.659100430170631e-05, + "loss": 2.0153, + "step": 4715 + }, + { + "epoch": 1.4475138121546962, + "grad_norm": 0.5567492246627808, + "learning_rate": 9.658920015316015e-05, + "loss": 2.0196, + "step": 4716 + }, + { + "epoch": 1.4478207489257213, + "grad_norm": 0.5393046140670776, + "learning_rate": 9.658739554419222e-05, + "loss": 1.9871, + "step": 4717 + }, + { + "epoch": 1.4481276856967464, + "grad_norm": 
0.46408072113990784, + "learning_rate": 9.658559047482034e-05, + "loss": 1.9896, + "step": 4718 + }, + { + "epoch": 1.4484346224677718, + "grad_norm": 0.47001218795776367, + "learning_rate": 9.658378494506234e-05, + "loss": 2.0281, + "step": 4719 + }, + { + "epoch": 1.4487415592387969, + "grad_norm": 0.555749773979187, + "learning_rate": 9.658197895493608e-05, + "loss": 2.0184, + "step": 4720 + }, + { + "epoch": 1.449048496009822, + "grad_norm": 0.6206443905830383, + "learning_rate": 9.65801725044594e-05, + "loss": 1.9788, + "step": 4721 + }, + { + "epoch": 1.449355432780847, + "grad_norm": 0.533336877822876, + "learning_rate": 9.657836559365016e-05, + "loss": 1.9755, + "step": 4722 + }, + { + "epoch": 1.4496623695518722, + "grad_norm": 0.4553185701370239, + "learning_rate": 9.65765582225262e-05, + "loss": 1.9791, + "step": 4723 + }, + { + "epoch": 1.4499693063228976, + "grad_norm": 0.5754305124282837, + "learning_rate": 9.65747503911054e-05, + "loss": 1.9485, + "step": 4724 + }, + { + "epoch": 1.4502762430939227, + "grad_norm": 0.6812698245048523, + "learning_rate": 9.657294209940562e-05, + "loss": 2.0326, + "step": 4725 + }, + { + "epoch": 1.4505831798649478, + "grad_norm": 0.7532522678375244, + "learning_rate": 9.657113334744472e-05, + "loss": 1.9387, + "step": 4726 + }, + { + "epoch": 1.4508901166359731, + "grad_norm": 0.5618684887886047, + "learning_rate": 9.656932413524058e-05, + "loss": 1.9395, + "step": 4727 + }, + { + "epoch": 1.4511970534069982, + "grad_norm": 0.4818387031555176, + "learning_rate": 9.65675144628111e-05, + "loss": 1.9473, + "step": 4728 + }, + { + "epoch": 1.4515039901780233, + "grad_norm": 0.5152607560157776, + "learning_rate": 9.656570433017413e-05, + "loss": 1.894, + "step": 4729 + }, + { + "epoch": 1.4518109269490485, + "grad_norm": 0.5098578333854675, + "learning_rate": 9.656389373734759e-05, + "loss": 1.9519, + "step": 4730 + }, + { + "epoch": 1.4521178637200736, + "grad_norm": 0.5862317681312561, + "learning_rate": 
9.656208268434936e-05, + "loss": 1.9968, + "step": 4731 + }, + { + "epoch": 1.452424800491099, + "grad_norm": 0.501220703125, + "learning_rate": 9.656027117119732e-05, + "loss": 1.993, + "step": 4732 + }, + { + "epoch": 1.452731737262124, + "grad_norm": 0.4974796772003174, + "learning_rate": 9.655845919790943e-05, + "loss": 2.0007, + "step": 4733 + }, + { + "epoch": 1.4530386740331491, + "grad_norm": 0.513671875, + "learning_rate": 9.655664676450351e-05, + "loss": 1.9321, + "step": 4734 + }, + { + "epoch": 1.4533456108041745, + "grad_norm": 0.5111755728721619, + "learning_rate": 9.655483387099756e-05, + "loss": 2.0187, + "step": 4735 + }, + { + "epoch": 1.4536525475751996, + "grad_norm": 0.47103258967399597, + "learning_rate": 9.655302051740942e-05, + "loss": 1.9716, + "step": 4736 + }, + { + "epoch": 1.4539594843462247, + "grad_norm": 0.4526553750038147, + "learning_rate": 9.655120670375707e-05, + "loss": 2.0424, + "step": 4737 + }, + { + "epoch": 1.4542664211172498, + "grad_norm": 0.44393640756607056, + "learning_rate": 9.65493924300584e-05, + "loss": 1.9318, + "step": 4738 + }, + { + "epoch": 1.454573357888275, + "grad_norm": 0.4070759415626526, + "learning_rate": 9.654757769633136e-05, + "loss": 1.9292, + "step": 4739 + }, + { + "epoch": 1.4548802946593002, + "grad_norm": 0.4010253846645355, + "learning_rate": 9.654576250259387e-05, + "loss": 1.9641, + "step": 4740 + }, + { + "epoch": 1.4551872314303254, + "grad_norm": 0.39156264066696167, + "learning_rate": 9.654394684886387e-05, + "loss": 1.9575, + "step": 4741 + }, + { + "epoch": 1.4554941682013505, + "grad_norm": 0.4360155463218689, + "learning_rate": 9.65421307351593e-05, + "loss": 1.9615, + "step": 4742 + }, + { + "epoch": 1.4558011049723758, + "grad_norm": 0.4203348755836487, + "learning_rate": 9.654031416149813e-05, + "loss": 1.9629, + "step": 4743 + }, + { + "epoch": 1.456108041743401, + "grad_norm": 0.42294225096702576, + "learning_rate": 9.653849712789828e-05, + "loss": 1.9756, + "step": 4744 + }, + 
{ + "epoch": 1.456414978514426, + "grad_norm": 0.46253907680511475, + "learning_rate": 9.653667963437775e-05, + "loss": 2.0128, + "step": 4745 + }, + { + "epoch": 1.4567219152854511, + "grad_norm": 0.41743987798690796, + "learning_rate": 9.653486168095446e-05, + "loss": 1.938, + "step": 4746 + }, + { + "epoch": 1.4570288520564763, + "grad_norm": 0.43411263823509216, + "learning_rate": 9.653304326764639e-05, + "loss": 1.9744, + "step": 4747 + }, + { + "epoch": 1.4573357888275016, + "grad_norm": 0.4569607973098755, + "learning_rate": 9.653122439447151e-05, + "loss": 1.9844, + "step": 4748 + }, + { + "epoch": 1.4576427255985267, + "grad_norm": 0.41858115792274475, + "learning_rate": 9.652940506144781e-05, + "loss": 1.9835, + "step": 4749 + }, + { + "epoch": 1.4579496623695518, + "grad_norm": 0.4259703755378723, + "learning_rate": 9.652758526859324e-05, + "loss": 1.9467, + "step": 4750 + }, + { + "epoch": 1.4582565991405771, + "grad_norm": 0.49847620725631714, + "learning_rate": 9.652576501592583e-05, + "loss": 1.989, + "step": 4751 + }, + { + "epoch": 1.4585635359116023, + "grad_norm": 0.5898705720901489, + "learning_rate": 9.652394430346352e-05, + "loss": 1.9896, + "step": 4752 + }, + { + "epoch": 1.4588704726826274, + "grad_norm": 0.6528434157371521, + "learning_rate": 9.652212313122433e-05, + "loss": 1.9814, + "step": 4753 + }, + { + "epoch": 1.4591774094536525, + "grad_norm": 0.5704251527786255, + "learning_rate": 9.652030149922624e-05, + "loss": 1.9735, + "step": 4754 + }, + { + "epoch": 1.4594843462246776, + "grad_norm": 0.4349142014980316, + "learning_rate": 9.651847940748727e-05, + "loss": 1.9923, + "step": 4755 + }, + { + "epoch": 1.459791282995703, + "grad_norm": 0.43891096115112305, + "learning_rate": 9.651665685602542e-05, + "loss": 1.9429, + "step": 4756 + }, + { + "epoch": 1.460098219766728, + "grad_norm": 0.5881633758544922, + "learning_rate": 9.651483384485871e-05, + "loss": 2.0075, + "step": 4757 + }, + { + "epoch": 1.4604051565377532, + "grad_norm": 
0.569064736366272, + "learning_rate": 9.651301037400515e-05, + "loss": 1.9968, + "step": 4758 + }, + { + "epoch": 1.4607120933087785, + "grad_norm": 0.49636805057525635, + "learning_rate": 9.651118644348276e-05, + "loss": 2.0844, + "step": 4759 + }, + { + "epoch": 1.4610190300798036, + "grad_norm": 0.4893283247947693, + "learning_rate": 9.650936205330955e-05, + "loss": 1.9635, + "step": 4760 + }, + { + "epoch": 1.4613259668508287, + "grad_norm": 0.5199632048606873, + "learning_rate": 9.650753720350358e-05, + "loss": 1.8934, + "step": 4761 + }, + { + "epoch": 1.4616329036218538, + "grad_norm": 0.5655859708786011, + "learning_rate": 9.650571189408287e-05, + "loss": 2.0473, + "step": 4762 + }, + { + "epoch": 1.461939840392879, + "grad_norm": 0.5004158020019531, + "learning_rate": 9.650388612506545e-05, + "loss": 1.9388, + "step": 4763 + }, + { + "epoch": 1.4622467771639043, + "grad_norm": 0.5075541734695435, + "learning_rate": 9.650205989646937e-05, + "loss": 2.0362, + "step": 4764 + }, + { + "epoch": 1.4625537139349294, + "grad_norm": 0.52835613489151, + "learning_rate": 9.650023320831267e-05, + "loss": 1.9849, + "step": 4765 + }, + { + "epoch": 1.4628606507059545, + "grad_norm": 0.5208338499069214, + "learning_rate": 9.649840606061342e-05, + "loss": 1.9619, + "step": 4766 + }, + { + "epoch": 1.4631675874769798, + "grad_norm": 0.4954691529273987, + "learning_rate": 9.649657845338966e-05, + "loss": 1.9282, + "step": 4767 + }, + { + "epoch": 1.463474524248005, + "grad_norm": 0.4260660409927368, + "learning_rate": 9.649475038665947e-05, + "loss": 2.0108, + "step": 4768 + }, + { + "epoch": 1.46378146101903, + "grad_norm": 0.4954771101474762, + "learning_rate": 9.64929218604409e-05, + "loss": 1.9995, + "step": 4769 + }, + { + "epoch": 1.4640883977900552, + "grad_norm": 0.6004415154457092, + "learning_rate": 9.649109287475202e-05, + "loss": 1.9816, + "step": 4770 + }, + { + "epoch": 1.4643953345610803, + "grad_norm": 0.6472858190536499, + "learning_rate": 
9.648926342961092e-05, + "loss": 1.927, + "step": 4771 + }, + { + "epoch": 1.4647022713321056, + "grad_norm": 0.5293224453926086, + "learning_rate": 9.648743352503567e-05, + "loss": 1.9082, + "step": 4772 + }, + { + "epoch": 1.4650092081031307, + "grad_norm": 0.4413148760795593, + "learning_rate": 9.648560316104435e-05, + "loss": 1.9368, + "step": 4773 + }, + { + "epoch": 1.4653161448741558, + "grad_norm": 0.4727863371372223, + "learning_rate": 9.648377233765507e-05, + "loss": 1.944, + "step": 4774 + }, + { + "epoch": 1.4656230816451812, + "grad_norm": 0.5681154131889343, + "learning_rate": 9.648194105488589e-05, + "loss": 2.0003, + "step": 4775 + }, + { + "epoch": 1.4659300184162063, + "grad_norm": 0.5893644690513611, + "learning_rate": 9.648010931275493e-05, + "loss": 1.936, + "step": 4776 + }, + { + "epoch": 1.4662369551872314, + "grad_norm": 0.5034298300743103, + "learning_rate": 9.647827711128029e-05, + "loss": 2.0318, + "step": 4777 + }, + { + "epoch": 1.4665438919582565, + "grad_norm": 0.4954885244369507, + "learning_rate": 9.647644445048006e-05, + "loss": 2.0053, + "step": 4778 + }, + { + "epoch": 1.4668508287292816, + "grad_norm": 0.475923627614975, + "learning_rate": 9.647461133037236e-05, + "loss": 1.8911, + "step": 4779 + }, + { + "epoch": 1.467157765500307, + "grad_norm": 0.4725008010864258, + "learning_rate": 9.647277775097534e-05, + "loss": 1.8954, + "step": 4780 + }, + { + "epoch": 1.467464702271332, + "grad_norm": 0.4183707535266876, + "learning_rate": 9.647094371230707e-05, + "loss": 1.9891, + "step": 4781 + }, + { + "epoch": 1.4677716390423572, + "grad_norm": 0.4862513244152069, + "learning_rate": 9.64691092143857e-05, + "loss": 2.0364, + "step": 4782 + }, + { + "epoch": 1.4680785758133825, + "grad_norm": 0.5038082599639893, + "learning_rate": 9.646727425722936e-05, + "loss": 1.9304, + "step": 4783 + }, + { + "epoch": 1.4683855125844076, + "grad_norm": 0.47281327843666077, + "learning_rate": 9.646543884085618e-05, + "loss": 1.9453, + "step": 4784 
+ }, + { + "epoch": 1.4686924493554327, + "grad_norm": 0.42275354266166687, + "learning_rate": 9.646360296528431e-05, + "loss": 1.9434, + "step": 4785 + }, + { + "epoch": 1.468999386126458, + "grad_norm": 0.5757746696472168, + "learning_rate": 9.646176663053185e-05, + "loss": 2.0241, + "step": 4786 + }, + { + "epoch": 1.4693063228974832, + "grad_norm": 0.6757779121398926, + "learning_rate": 9.645992983661701e-05, + "loss": 1.9823, + "step": 4787 + }, + { + "epoch": 1.4696132596685083, + "grad_norm": 0.7052981853485107, + "learning_rate": 9.645809258355792e-05, + "loss": 2.0553, + "step": 4788 + }, + { + "epoch": 1.4699201964395334, + "grad_norm": 0.5630238652229309, + "learning_rate": 9.64562548713727e-05, + "loss": 2.0241, + "step": 4789 + }, + { + "epoch": 1.4702271332105585, + "grad_norm": 0.5034958124160767, + "learning_rate": 9.645441670007955e-05, + "loss": 1.9788, + "step": 4790 + }, + { + "epoch": 1.4705340699815839, + "grad_norm": 0.48978129029273987, + "learning_rate": 9.645257806969663e-05, + "loss": 1.9415, + "step": 4791 + }, + { + "epoch": 1.470841006752609, + "grad_norm": 0.4718508720397949, + "learning_rate": 9.645073898024211e-05, + "loss": 1.9657, + "step": 4792 + }, + { + "epoch": 1.471147943523634, + "grad_norm": 0.5171064734458923, + "learning_rate": 9.644889943173417e-05, + "loss": 1.9311, + "step": 4793 + }, + { + "epoch": 1.4714548802946594, + "grad_norm": 0.4556005597114563, + "learning_rate": 9.644705942419097e-05, + "loss": 1.9093, + "step": 4794 + }, + { + "epoch": 1.4717618170656845, + "grad_norm": 0.44836321473121643, + "learning_rate": 9.64452189576307e-05, + "loss": 1.9715, + "step": 4795 + }, + { + "epoch": 1.4720687538367097, + "grad_norm": 0.5139105916023254, + "learning_rate": 9.644337803207155e-05, + "loss": 1.967, + "step": 4796 + }, + { + "epoch": 1.4723756906077348, + "grad_norm": 0.49145743250846863, + "learning_rate": 9.644153664753173e-05, + "loss": 1.9679, + "step": 4797 + }, + { + "epoch": 1.4726826273787599, + 
"grad_norm": 0.4353790283203125, + "learning_rate": 9.643969480402942e-05, + "loss": 1.9438, + "step": 4798 + }, + { + "epoch": 1.4729895641497852, + "grad_norm": 0.39393118023872375, + "learning_rate": 9.643785250158283e-05, + "loss": 1.91, + "step": 4799 + }, + { + "epoch": 1.4732965009208103, + "grad_norm": 0.4250284731388092, + "learning_rate": 9.643600974021017e-05, + "loss": 1.9315, + "step": 4800 + }, + { + "epoch": 1.4736034376918354, + "grad_norm": 0.40301406383514404, + "learning_rate": 9.643416651992962e-05, + "loss": 1.9344, + "step": 4801 + }, + { + "epoch": 1.4739103744628608, + "grad_norm": 0.4428589940071106, + "learning_rate": 9.643232284075944e-05, + "loss": 1.9767, + "step": 4802 + }, + { + "epoch": 1.4742173112338859, + "grad_norm": 0.5098150372505188, + "learning_rate": 9.643047870271783e-05, + "loss": 2.0471, + "step": 4803 + }, + { + "epoch": 1.474524248004911, + "grad_norm": 0.5230079293251038, + "learning_rate": 9.642863410582302e-05, + "loss": 1.9647, + "step": 4804 + }, + { + "epoch": 1.474831184775936, + "grad_norm": 0.44200628995895386, + "learning_rate": 9.642678905009322e-05, + "loss": 1.9046, + "step": 4805 + }, + { + "epoch": 1.4751381215469612, + "grad_norm": 0.42684751749038696, + "learning_rate": 9.642494353554669e-05, + "loss": 1.82, + "step": 4806 + }, + { + "epoch": 1.4754450583179866, + "grad_norm": 0.3907437324523926, + "learning_rate": 9.642309756220165e-05, + "loss": 1.9257, + "step": 4807 + }, + { + "epoch": 1.4757519950890117, + "grad_norm": 0.43622660636901855, + "learning_rate": 9.642125113007636e-05, + "loss": 1.9319, + "step": 4808 + }, + { + "epoch": 1.4760589318600368, + "grad_norm": 0.4553097188472748, + "learning_rate": 9.641940423918905e-05, + "loss": 1.9699, + "step": 4809 + }, + { + "epoch": 1.4763658686310621, + "grad_norm": 0.48997193574905396, + "learning_rate": 9.641755688955798e-05, + "loss": 1.9843, + "step": 4810 + }, + { + "epoch": 1.4766728054020872, + "grad_norm": 0.5008227825164795, + 
"learning_rate": 9.641570908120141e-05, + "loss": 1.9616, + "step": 4811 + }, + { + "epoch": 1.4769797421731123, + "grad_norm": 0.49788615107536316, + "learning_rate": 9.64138608141376e-05, + "loss": 2.0233, + "step": 4812 + }, + { + "epoch": 1.4772866789441375, + "grad_norm": 0.509159505367279, + "learning_rate": 9.64120120883848e-05, + "loss": 1.9982, + "step": 4813 + }, + { + "epoch": 1.4775936157151626, + "grad_norm": 0.4976164996623993, + "learning_rate": 9.641016290396132e-05, + "loss": 1.9944, + "step": 4814 + }, + { + "epoch": 1.477900552486188, + "grad_norm": 0.4925370514392853, + "learning_rate": 9.640831326088539e-05, + "loss": 1.9547, + "step": 4815 + }, + { + "epoch": 1.478207489257213, + "grad_norm": 0.5058705806732178, + "learning_rate": 9.64064631591753e-05, + "loss": 2.0147, + "step": 4816 + }, + { + "epoch": 1.4785144260282381, + "grad_norm": 0.5614715814590454, + "learning_rate": 9.640461259884937e-05, + "loss": 1.9475, + "step": 4817 + }, + { + "epoch": 1.4788213627992635, + "grad_norm": 0.4417608380317688, + "learning_rate": 9.640276157992582e-05, + "loss": 1.9422, + "step": 4818 + }, + { + "epoch": 1.4791282995702886, + "grad_norm": 0.5124607682228088, + "learning_rate": 9.6400910102423e-05, + "loss": 1.9489, + "step": 4819 + }, + { + "epoch": 1.4794352363413137, + "grad_norm": 0.4931279420852661, + "learning_rate": 9.63990581663592e-05, + "loss": 1.9717, + "step": 4820 + }, + { + "epoch": 1.4797421731123388, + "grad_norm": 0.4716447591781616, + "learning_rate": 9.639720577175271e-05, + "loss": 1.9758, + "step": 4821 + }, + { + "epoch": 1.480049109883364, + "grad_norm": 0.4613695740699768, + "learning_rate": 9.639535291862183e-05, + "loss": 1.8998, + "step": 4822 + }, + { + "epoch": 1.4803560466543892, + "grad_norm": 0.4430600702762604, + "learning_rate": 9.639349960698489e-05, + "loss": 1.9539, + "step": 4823 + }, + { + "epoch": 1.4806629834254144, + "grad_norm": 0.45596009492874146, + "learning_rate": 9.639164583686018e-05, + "loss": 1.9626, 
+ "step": 4824 + }, + { + "epoch": 1.4809699201964395, + "grad_norm": 0.4248705804347992, + "learning_rate": 9.638979160826604e-05, + "loss": 1.9627, + "step": 4825 + }, + { + "epoch": 1.4812768569674648, + "grad_norm": 0.43419960141181946, + "learning_rate": 9.63879369212208e-05, + "loss": 1.9589, + "step": 4826 + }, + { + "epoch": 1.48158379373849, + "grad_norm": 0.4715637266635895, + "learning_rate": 9.638608177574278e-05, + "loss": 1.981, + "step": 4827 + }, + { + "epoch": 1.481890730509515, + "grad_norm": 0.41809993982315063, + "learning_rate": 9.63842261718503e-05, + "loss": 1.9587, + "step": 4828 + }, + { + "epoch": 1.4821976672805401, + "grad_norm": 0.4085060656070709, + "learning_rate": 9.63823701095617e-05, + "loss": 1.9497, + "step": 4829 + }, + { + "epoch": 1.4825046040515653, + "grad_norm": 0.4199173152446747, + "learning_rate": 9.638051358889535e-05, + "loss": 1.9543, + "step": 4830 + }, + { + "epoch": 1.4828115408225906, + "grad_norm": 0.4560040235519409, + "learning_rate": 9.637865660986958e-05, + "loss": 1.9451, + "step": 4831 + }, + { + "epoch": 1.4831184775936157, + "grad_norm": 0.4059405028820038, + "learning_rate": 9.637679917250272e-05, + "loss": 1.9154, + "step": 4832 + }, + { + "epoch": 1.4834254143646408, + "grad_norm": 0.43314236402511597, + "learning_rate": 9.637494127681318e-05, + "loss": 1.9589, + "step": 4833 + }, + { + "epoch": 1.4837323511356661, + "grad_norm": 0.3866138458251953, + "learning_rate": 9.637308292281928e-05, + "loss": 1.9239, + "step": 4834 + }, + { + "epoch": 1.4840392879066913, + "grad_norm": 0.40781381726264954, + "learning_rate": 9.637122411053939e-05, + "loss": 1.9805, + "step": 4835 + }, + { + "epoch": 1.4843462246777164, + "grad_norm": 0.4605334401130676, + "learning_rate": 9.636936483999189e-05, + "loss": 1.9571, + "step": 4836 + }, + { + "epoch": 1.4846531614487415, + "grad_norm": 0.4730539917945862, + "learning_rate": 9.636750511119513e-05, + "loss": 1.9429, + "step": 4837 + }, + { + "epoch": 
1.4849600982197666, + "grad_norm": 0.47973817586898804, + "learning_rate": 9.636564492416753e-05, + "loss": 1.9865, + "step": 4838 + }, + { + "epoch": 1.485267034990792, + "grad_norm": 0.4541794955730438, + "learning_rate": 9.636378427892744e-05, + "loss": 1.9796, + "step": 4839 + }, + { + "epoch": 1.485573971761817, + "grad_norm": 0.4863722026348114, + "learning_rate": 9.636192317549327e-05, + "loss": 1.9581, + "step": 4840 + }, + { + "epoch": 1.4858809085328422, + "grad_norm": 0.4559536278247833, + "learning_rate": 9.636006161388338e-05, + "loss": 1.9444, + "step": 4841 + }, + { + "epoch": 1.4861878453038675, + "grad_norm": 0.4385206401348114, + "learning_rate": 9.63581995941162e-05, + "loss": 1.9323, + "step": 4842 + }, + { + "epoch": 1.4864947820748926, + "grad_norm": 0.48802945017814636, + "learning_rate": 9.635633711621012e-05, + "loss": 1.9643, + "step": 4843 + }, + { + "epoch": 1.4868017188459177, + "grad_norm": 0.4051367938518524, + "learning_rate": 9.635447418018355e-05, + "loss": 1.9342, + "step": 4844 + }, + { + "epoch": 1.4871086556169428, + "grad_norm": 0.46384257078170776, + "learning_rate": 9.63526107860549e-05, + "loss": 1.9656, + "step": 4845 + }, + { + "epoch": 1.487415592387968, + "grad_norm": 0.3950713574886322, + "learning_rate": 9.635074693384257e-05, + "loss": 1.8673, + "step": 4846 + }, + { + "epoch": 1.4877225291589933, + "grad_norm": 0.4694644808769226, + "learning_rate": 9.634888262356501e-05, + "loss": 1.9484, + "step": 4847 + }, + { + "epoch": 1.4880294659300184, + "grad_norm": 0.45068567991256714, + "learning_rate": 9.63470178552406e-05, + "loss": 1.9221, + "step": 4848 + }, + { + "epoch": 1.4883364027010435, + "grad_norm": 0.44717836380004883, + "learning_rate": 9.634515262888781e-05, + "loss": 1.9968, + "step": 4849 + }, + { + "epoch": 1.4886433394720688, + "grad_norm": 0.42189615964889526, + "learning_rate": 9.634328694452506e-05, + "loss": 2.0262, + "step": 4850 + }, + { + "epoch": 1.488950276243094, + "grad_norm": 
0.4895322322845459, + "learning_rate": 9.63414208021708e-05, + "loss": 2.0628, + "step": 4851 + }, + { + "epoch": 1.489257213014119, + "grad_norm": 0.4732883870601654, + "learning_rate": 9.633955420184342e-05, + "loss": 1.9487, + "step": 4852 + }, + { + "epoch": 1.4895641497851444, + "grad_norm": 0.4426051676273346, + "learning_rate": 9.633768714356143e-05, + "loss": 2.0181, + "step": 4853 + }, + { + "epoch": 1.4898710865561695, + "grad_norm": 0.5831739902496338, + "learning_rate": 9.633581962734326e-05, + "loss": 1.9311, + "step": 4854 + }, + { + "epoch": 1.4901780233271946, + "grad_norm": 0.6048587560653687, + "learning_rate": 9.633395165320734e-05, + "loss": 1.9159, + "step": 4855 + }, + { + "epoch": 1.4904849600982197, + "grad_norm": 0.60125732421875, + "learning_rate": 9.633208322117218e-05, + "loss": 1.9732, + "step": 4856 + }, + { + "epoch": 1.4907918968692448, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.63302143312562e-05, + "loss": 1.9101, + "step": 4857 + }, + { + "epoch": 1.4910988336402702, + "grad_norm": 0.4032946228981018, + "learning_rate": 9.632834498347789e-05, + "loss": 1.9097, + "step": 4858 + }, + { + "epoch": 1.4914057704112953, + "grad_norm": 0.400632381439209, + "learning_rate": 9.632647517785571e-05, + "loss": 1.9949, + "step": 4859 + }, + { + "epoch": 1.4917127071823204, + "grad_norm": 0.49766576290130615, + "learning_rate": 9.632460491440818e-05, + "loss": 1.9762, + "step": 4860 + }, + { + "epoch": 1.4920196439533457, + "grad_norm": 0.6273209452629089, + "learning_rate": 9.632273419315372e-05, + "loss": 2.0797, + "step": 4861 + }, + { + "epoch": 1.4923265807243709, + "grad_norm": 0.5848406553268433, + "learning_rate": 9.632086301411087e-05, + "loss": 1.9366, + "step": 4862 + }, + { + "epoch": 1.492633517495396, + "grad_norm": 0.4683595597743988, + "learning_rate": 9.631899137729809e-05, + "loss": 1.9802, + "step": 4863 + }, + { + "epoch": 1.492940454266421, + "grad_norm": 0.43066033720970154, + "learning_rate": 
9.63171192827339e-05, + "loss": 1.9621, + "step": 4864 + }, + { + "epoch": 1.4932473910374462, + "grad_norm": 0.47469422221183777, + "learning_rate": 9.63152467304368e-05, + "loss": 1.9795, + "step": 4865 + }, + { + "epoch": 1.4935543278084715, + "grad_norm": 0.5453927516937256, + "learning_rate": 9.631337372042526e-05, + "loss": 1.9711, + "step": 4866 + }, + { + "epoch": 1.4938612645794966, + "grad_norm": 0.5361614227294922, + "learning_rate": 9.631150025271782e-05, + "loss": 1.9849, + "step": 4867 + }, + { + "epoch": 1.4941682013505218, + "grad_norm": 0.4773578643798828, + "learning_rate": 9.6309626327333e-05, + "loss": 2.065, + "step": 4868 + }, + { + "epoch": 1.494475138121547, + "grad_norm": 0.428091824054718, + "learning_rate": 9.630775194428932e-05, + "loss": 1.9448, + "step": 4869 + }, + { + "epoch": 1.4947820748925722, + "grad_norm": 0.41679108142852783, + "learning_rate": 9.630587710360527e-05, + "loss": 1.9511, + "step": 4870 + }, + { + "epoch": 1.4950890116635973, + "grad_norm": 0.5072546601295471, + "learning_rate": 9.630400180529942e-05, + "loss": 1.9973, + "step": 4871 + }, + { + "epoch": 1.4953959484346224, + "grad_norm": 0.5230575799942017, + "learning_rate": 9.630212604939026e-05, + "loss": 1.9659, + "step": 4872 + }, + { + "epoch": 1.4957028852056475, + "grad_norm": 0.44307753443717957, + "learning_rate": 9.630024983589638e-05, + "loss": 1.9056, + "step": 4873 + }, + { + "epoch": 1.4960098219766729, + "grad_norm": 0.43783196806907654, + "learning_rate": 9.629837316483628e-05, + "loss": 1.9716, + "step": 4874 + }, + { + "epoch": 1.496316758747698, + "grad_norm": 0.4553990960121155, + "learning_rate": 9.629649603622852e-05, + "loss": 2.044, + "step": 4875 + }, + { + "epoch": 1.496623695518723, + "grad_norm": 0.49152833223342896, + "learning_rate": 9.629461845009164e-05, + "loss": 1.948, + "step": 4876 + }, + { + "epoch": 1.4969306322897484, + "grad_norm": 0.4371738135814667, + "learning_rate": 9.629274040644422e-05, + "loss": 1.9497, + "step": 4877 
+ }, + { + "epoch": 1.4972375690607735, + "grad_norm": 0.4973873198032379, + "learning_rate": 9.629086190530482e-05, + "loss": 2.0053, + "step": 4878 + }, + { + "epoch": 1.4975445058317987, + "grad_norm": 0.4250672459602356, + "learning_rate": 9.628898294669197e-05, + "loss": 1.9617, + "step": 4879 + }, + { + "epoch": 1.4978514426028238, + "grad_norm": 0.4514639675617218, + "learning_rate": 9.628710353062427e-05, + "loss": 1.9503, + "step": 4880 + }, + { + "epoch": 1.4981583793738489, + "grad_norm": 0.4960804879665375, + "learning_rate": 9.628522365712027e-05, + "loss": 1.9932, + "step": 4881 + }, + { + "epoch": 1.4984653161448742, + "grad_norm": 0.5604363083839417, + "learning_rate": 9.628334332619857e-05, + "loss": 2.0186, + "step": 4882 + }, + { + "epoch": 1.4987722529158993, + "grad_norm": 0.5125443935394287, + "learning_rate": 9.628146253787776e-05, + "loss": 1.9897, + "step": 4883 + }, + { + "epoch": 1.4990791896869244, + "grad_norm": 0.4029771089553833, + "learning_rate": 9.627958129217639e-05, + "loss": 1.9083, + "step": 4884 + }, + { + "epoch": 1.4993861264579498, + "grad_norm": 0.4608222544193268, + "learning_rate": 9.627769958911308e-05, + "loss": 2.0153, + "step": 4885 + }, + { + "epoch": 1.4996930632289749, + "grad_norm": 0.4253246486186981, + "learning_rate": 9.627581742870641e-05, + "loss": 1.9278, + "step": 4886 + }, + { + "epoch": 1.5, + "grad_norm": 0.4247463047504425, + "learning_rate": 9.6273934810975e-05, + "loss": 1.9456, + "step": 4887 + }, + { + "epoch": 1.5003069367710253, + "grad_norm": 0.44055816531181335, + "learning_rate": 9.627205173593744e-05, + "loss": 2.0225, + "step": 4888 + }, + { + "epoch": 1.5006138735420502, + "grad_norm": 0.47912710905075073, + "learning_rate": 9.627016820361235e-05, + "loss": 1.9716, + "step": 4889 + }, + { + "epoch": 1.5009208103130756, + "grad_norm": 0.47608625888824463, + "learning_rate": 9.626828421401832e-05, + "loss": 1.9444, + "step": 4890 + }, + { + "epoch": 1.5012277470841007, + "grad_norm": 
0.4757349193096161, + "learning_rate": 9.6266399767174e-05, + "loss": 2.0699, + "step": 4891 + }, + { + "epoch": 1.5015346838551258, + "grad_norm": 0.5556650757789612, + "learning_rate": 9.6264514863098e-05, + "loss": 1.99, + "step": 4892 + }, + { + "epoch": 1.5018416206261511, + "grad_norm": 0.5072291493415833, + "learning_rate": 9.626262950180894e-05, + "loss": 1.9435, + "step": 4893 + }, + { + "epoch": 1.5021485573971762, + "grad_norm": 0.47811564803123474, + "learning_rate": 9.626074368332546e-05, + "loss": 1.9399, + "step": 4894 + }, + { + "epoch": 1.5024554941682013, + "grad_norm": 0.4613232910633087, + "learning_rate": 9.62588574076662e-05, + "loss": 1.9259, + "step": 4895 + }, + { + "epoch": 1.5027624309392267, + "grad_norm": 0.4170697331428528, + "learning_rate": 9.62569706748498e-05, + "loss": 1.9319, + "step": 4896 + }, + { + "epoch": 1.5030693677102516, + "grad_norm": 0.4731575548648834, + "learning_rate": 9.62550834848949e-05, + "loss": 1.9862, + "step": 4897 + }, + { + "epoch": 1.503376304481277, + "grad_norm": 0.49881401658058167, + "learning_rate": 9.625319583782016e-05, + "loss": 1.9837, + "step": 4898 + }, + { + "epoch": 1.503683241252302, + "grad_norm": 0.4689660668373108, + "learning_rate": 9.625130773364424e-05, + "loss": 1.9662, + "step": 4899 + }, + { + "epoch": 1.5039901780233271, + "grad_norm": 0.48389768600463867, + "learning_rate": 9.624941917238577e-05, + "loss": 2.0087, + "step": 4900 + }, + { + "epoch": 1.5042971147943525, + "grad_norm": 0.46716609597206116, + "learning_rate": 9.624753015406342e-05, + "loss": 1.9718, + "step": 4901 + }, + { + "epoch": 1.5046040515653776, + "grad_norm": 0.544793963432312, + "learning_rate": 9.62456406786959e-05, + "loss": 1.9878, + "step": 4902 + }, + { + "epoch": 1.5049109883364027, + "grad_norm": 0.44499701261520386, + "learning_rate": 9.624375074630183e-05, + "loss": 1.8849, + "step": 4903 + }, + { + "epoch": 1.505217925107428, + "grad_norm": 0.42464208602905273, + "learning_rate": 
9.624186035689993e-05, + "loss": 1.8995, + "step": 4904 + }, + { + "epoch": 1.505524861878453, + "grad_norm": 0.41650670766830444, + "learning_rate": 9.623996951050885e-05, + "loss": 1.9138, + "step": 4905 + }, + { + "epoch": 1.5058317986494782, + "grad_norm": 0.37955889105796814, + "learning_rate": 9.62380782071473e-05, + "loss": 1.9746, + "step": 4906 + }, + { + "epoch": 1.5061387354205034, + "grad_norm": 0.3799228072166443, + "learning_rate": 9.623618644683394e-05, + "loss": 1.942, + "step": 4907 + }, + { + "epoch": 1.5064456721915285, + "grad_norm": 0.3799766004085541, + "learning_rate": 9.623429422958751e-05, + "loss": 1.9025, + "step": 4908 + }, + { + "epoch": 1.5067526089625538, + "grad_norm": 0.3780234456062317, + "learning_rate": 9.623240155542668e-05, + "loss": 1.9581, + "step": 4909 + }, + { + "epoch": 1.507059545733579, + "grad_norm": 0.36379706859588623, + "learning_rate": 9.623050842437014e-05, + "loss": 1.9299, + "step": 4910 + }, + { + "epoch": 1.507366482504604, + "grad_norm": 0.5230580568313599, + "learning_rate": 9.622861483643663e-05, + "loss": 2.0306, + "step": 4911 + }, + { + "epoch": 1.5076734192756294, + "grad_norm": 0.443945050239563, + "learning_rate": 9.622672079164486e-05, + "loss": 1.9032, + "step": 4912 + }, + { + "epoch": 1.5079803560466543, + "grad_norm": 0.4689701795578003, + "learning_rate": 9.622482629001355e-05, + "loss": 1.9901, + "step": 4913 + }, + { + "epoch": 1.5082872928176796, + "grad_norm": 0.4483632445335388, + "learning_rate": 9.622293133156139e-05, + "loss": 1.948, + "step": 4914 + }, + { + "epoch": 1.5085942295887047, + "grad_norm": 0.4064919948577881, + "learning_rate": 9.622103591630715e-05, + "loss": 1.9487, + "step": 4915 + }, + { + "epoch": 1.5089011663597298, + "grad_norm": 0.44170522689819336, + "learning_rate": 9.621914004426952e-05, + "loss": 1.9929, + "step": 4916 + }, + { + "epoch": 1.5092081031307552, + "grad_norm": 0.45979443192481995, + "learning_rate": 9.621724371546727e-05, + "loss": 1.9428, + "step": 
4917 + }, + { + "epoch": 1.5095150399017803, + "grad_norm": 0.5258452892303467, + "learning_rate": 9.621534692991913e-05, + "loss": 2.0049, + "step": 4918 + }, + { + "epoch": 1.5098219766728054, + "grad_norm": 0.45191919803619385, + "learning_rate": 9.621344968764385e-05, + "loss": 2.0364, + "step": 4919 + }, + { + "epoch": 1.5101289134438307, + "grad_norm": 0.539245069026947, + "learning_rate": 9.621155198866016e-05, + "loss": 2.072, + "step": 4920 + }, + { + "epoch": 1.5104358502148556, + "grad_norm": 0.5410256385803223, + "learning_rate": 9.620965383298684e-05, + "loss": 2.0231, + "step": 4921 + }, + { + "epoch": 1.510742786985881, + "grad_norm": 0.4409741759300232, + "learning_rate": 9.620775522064264e-05, + "loss": 1.9024, + "step": 4922 + }, + { + "epoch": 1.511049723756906, + "grad_norm": 0.4911535680294037, + "learning_rate": 9.620585615164631e-05, + "loss": 2.0057, + "step": 4923 + }, + { + "epoch": 1.5113566605279312, + "grad_norm": 0.48139557242393494, + "learning_rate": 9.620395662601663e-05, + "loss": 2.0175, + "step": 4924 + }, + { + "epoch": 1.5116635972989565, + "grad_norm": 0.5130077004432678, + "learning_rate": 9.620205664377238e-05, + "loss": 1.952, + "step": 4925 + }, + { + "epoch": 1.5119705340699816, + "grad_norm": 0.5428542494773865, + "learning_rate": 9.62001562049323e-05, + "loss": 1.977, + "step": 4926 + }, + { + "epoch": 1.5122774708410067, + "grad_norm": 0.4586256444454193, + "learning_rate": 9.619825530951522e-05, + "loss": 1.9997, + "step": 4927 + }, + { + "epoch": 1.512584407612032, + "grad_norm": 0.3941349387168884, + "learning_rate": 9.61963539575399e-05, + "loss": 1.9174, + "step": 4928 + }, + { + "epoch": 1.512891344383057, + "grad_norm": 0.4396456480026245, + "learning_rate": 9.619445214902511e-05, + "loss": 1.9696, + "step": 4929 + }, + { + "epoch": 1.5131982811540823, + "grad_norm": 0.5413886904716492, + "learning_rate": 9.61925498839897e-05, + "loss": 2.0332, + "step": 4930 + }, + { + "epoch": 1.5135052179251074, + 
"grad_norm": 0.5946230888366699, + "learning_rate": 9.619064716245242e-05, + "loss": 2.0433, + "step": 4931 + }, + { + "epoch": 1.5138121546961325, + "grad_norm": 0.6353569030761719, + "learning_rate": 9.618874398443211e-05, + "loss": 1.9828, + "step": 4932 + }, + { + "epoch": 1.5141190914671578, + "grad_norm": 0.523690938949585, + "learning_rate": 9.618684034994754e-05, + "loss": 1.9024, + "step": 4933 + }, + { + "epoch": 1.514426028238183, + "grad_norm": 0.4437367022037506, + "learning_rate": 9.618493625901754e-05, + "loss": 1.9961, + "step": 4934 + }, + { + "epoch": 1.514732965009208, + "grad_norm": 0.48458734154701233, + "learning_rate": 9.618303171166094e-05, + "loss": 1.9515, + "step": 4935 + }, + { + "epoch": 1.5150399017802334, + "grad_norm": 0.47659310698509216, + "learning_rate": 9.618112670789657e-05, + "loss": 1.9943, + "step": 4936 + }, + { + "epoch": 1.5153468385512583, + "grad_norm": 0.49281415343284607, + "learning_rate": 9.617922124774322e-05, + "loss": 1.9311, + "step": 4937 + }, + { + "epoch": 1.5156537753222836, + "grad_norm": 0.4706041216850281, + "learning_rate": 9.617731533121972e-05, + "loss": 1.9478, + "step": 4938 + }, + { + "epoch": 1.5159607120933087, + "grad_norm": 0.4187149405479431, + "learning_rate": 9.617540895834496e-05, + "loss": 1.9915, + "step": 4939 + }, + { + "epoch": 1.5162676488643339, + "grad_norm": 0.3792540431022644, + "learning_rate": 9.617350212913772e-05, + "loss": 1.8609, + "step": 4940 + }, + { + "epoch": 1.5165745856353592, + "grad_norm": 0.46558165550231934, + "learning_rate": 9.617159484361688e-05, + "loss": 1.9574, + "step": 4941 + }, + { + "epoch": 1.5168815224063843, + "grad_norm": 0.4930344820022583, + "learning_rate": 9.616968710180127e-05, + "loss": 1.9924, + "step": 4942 + }, + { + "epoch": 1.5171884591774094, + "grad_norm": 0.44909337162971497, + "learning_rate": 9.616777890370976e-05, + "loss": 1.9674, + "step": 4943 + }, + { + "epoch": 1.5174953959484347, + "grad_norm": 0.43266600370407104, + 
"learning_rate": 9.616587024936119e-05, + "loss": 1.8899, + "step": 4944 + }, + { + "epoch": 1.5178023327194596, + "grad_norm": 0.43229207396507263, + "learning_rate": 9.616396113877444e-05, + "loss": 1.9671, + "step": 4945 + }, + { + "epoch": 1.518109269490485, + "grad_norm": 0.4609402120113373, + "learning_rate": 9.616205157196837e-05, + "loss": 1.9844, + "step": 4946 + }, + { + "epoch": 1.51841620626151, + "grad_norm": 0.4598314166069031, + "learning_rate": 9.616014154896184e-05, + "loss": 1.985, + "step": 4947 + }, + { + "epoch": 1.5187231430325352, + "grad_norm": 0.4746960997581482, + "learning_rate": 9.615823106977376e-05, + "loss": 2.0199, + "step": 4948 + }, + { + "epoch": 1.5190300798035605, + "grad_norm": 0.47560420632362366, + "learning_rate": 9.615632013442295e-05, + "loss": 1.8864, + "step": 4949 + }, + { + "epoch": 1.5193370165745856, + "grad_norm": 0.447837233543396, + "learning_rate": 9.615440874292835e-05, + "loss": 1.9699, + "step": 4950 + }, + { + "epoch": 1.5196439533456108, + "grad_norm": 0.49653175473213196, + "learning_rate": 9.615249689530883e-05, + "loss": 2.0645, + "step": 4951 + }, + { + "epoch": 1.519950890116636, + "grad_norm": 0.47083014249801636, + "learning_rate": 9.615058459158328e-05, + "loss": 2.01, + "step": 4952 + }, + { + "epoch": 1.520257826887661, + "grad_norm": 0.5299197435379028, + "learning_rate": 9.614867183177061e-05, + "loss": 2.0232, + "step": 4953 + }, + { + "epoch": 1.5205647636586863, + "grad_norm": 0.5005922317504883, + "learning_rate": 9.614675861588971e-05, + "loss": 1.9703, + "step": 4954 + }, + { + "epoch": 1.5208717004297114, + "grad_norm": 0.5131978392601013, + "learning_rate": 9.61448449439595e-05, + "loss": 1.9921, + "step": 4955 + }, + { + "epoch": 1.5211786372007365, + "grad_norm": 0.5278428196907043, + "learning_rate": 9.614293081599889e-05, + "loss": 1.9111, + "step": 4956 + }, + { + "epoch": 1.5214855739717619, + "grad_norm": 0.4914579689502716, + "learning_rate": 9.614101623202678e-05, + "loss": 
2.0398, + "step": 4957 + }, + { + "epoch": 1.521792510742787, + "grad_norm": 0.454863041639328, + "learning_rate": 9.61391011920621e-05, + "loss": 1.9674, + "step": 4958 + }, + { + "epoch": 1.522099447513812, + "grad_norm": 0.464491605758667, + "learning_rate": 9.613718569612379e-05, + "loss": 2.0123, + "step": 4959 + }, + { + "epoch": 1.5224063842848374, + "grad_norm": 0.4252295196056366, + "learning_rate": 9.613526974423078e-05, + "loss": 1.9796, + "step": 4960 + }, + { + "epoch": 1.5227133210558625, + "grad_norm": 0.4643968641757965, + "learning_rate": 9.613335333640199e-05, + "loss": 1.9448, + "step": 4961 + }, + { + "epoch": 1.5230202578268877, + "grad_norm": 0.4204397201538086, + "learning_rate": 9.613143647265635e-05, + "loss": 2.0191, + "step": 4962 + }, + { + "epoch": 1.523327194597913, + "grad_norm": 0.3838767111301422, + "learning_rate": 9.612951915301283e-05, + "loss": 1.9057, + "step": 4963 + }, + { + "epoch": 1.5236341313689379, + "grad_norm": 0.4353863000869751, + "learning_rate": 9.612760137749035e-05, + "loss": 2.0435, + "step": 4964 + }, + { + "epoch": 1.5239410681399632, + "grad_norm": 0.4082738757133484, + "learning_rate": 9.612568314610788e-05, + "loss": 1.9229, + "step": 4965 + }, + { + "epoch": 1.5242480049109883, + "grad_norm": 0.4382591247558594, + "learning_rate": 9.612376445888437e-05, + "loss": 1.9185, + "step": 4966 + }, + { + "epoch": 1.5245549416820134, + "grad_norm": 0.48340749740600586, + "learning_rate": 9.61218453158388e-05, + "loss": 1.9669, + "step": 4967 + }, + { + "epoch": 1.5248618784530388, + "grad_norm": 0.47423556447029114, + "learning_rate": 9.611992571699012e-05, + "loss": 1.9372, + "step": 4968 + }, + { + "epoch": 1.525168815224064, + "grad_norm": 0.4070637822151184, + "learning_rate": 9.611800566235728e-05, + "loss": 2.0201, + "step": 4969 + }, + { + "epoch": 1.525475751995089, + "grad_norm": 0.43758198618888855, + "learning_rate": 9.61160851519593e-05, + "loss": 1.982, + "step": 4970 + }, + { + "epoch": 
1.5257826887661143, + "grad_norm": 0.4724174737930298, + "learning_rate": 9.611416418581513e-05, + "loss": 1.9938, + "step": 4971 + }, + { + "epoch": 1.5260896255371392, + "grad_norm": 0.492405503988266, + "learning_rate": 9.611224276394374e-05, + "loss": 1.9462, + "step": 4972 + }, + { + "epoch": 1.5263965623081646, + "grad_norm": 0.5064161419868469, + "learning_rate": 9.611032088636418e-05, + "loss": 2.0326, + "step": 4973 + }, + { + "epoch": 1.5267034990791897, + "grad_norm": 0.4256031811237335, + "learning_rate": 9.610839855309537e-05, + "loss": 1.8885, + "step": 4974 + }, + { + "epoch": 1.5270104358502148, + "grad_norm": 0.4283316731452942, + "learning_rate": 9.610647576415636e-05, + "loss": 2.005, + "step": 4975 + }, + { + "epoch": 1.5273173726212401, + "grad_norm": 0.44234412908554077, + "learning_rate": 9.610455251956614e-05, + "loss": 1.9626, + "step": 4976 + }, + { + "epoch": 1.5276243093922652, + "grad_norm": 0.4135831594467163, + "learning_rate": 9.610262881934369e-05, + "loss": 1.9529, + "step": 4977 + }, + { + "epoch": 1.5279312461632903, + "grad_norm": 0.48090922832489014, + "learning_rate": 9.610070466350805e-05, + "loss": 2.0239, + "step": 4978 + }, + { + "epoch": 1.5282381829343157, + "grad_norm": 0.4546974301338196, + "learning_rate": 9.609878005207822e-05, + "loss": 1.9556, + "step": 4979 + }, + { + "epoch": 1.5285451197053406, + "grad_norm": 0.4197862148284912, + "learning_rate": 9.609685498507323e-05, + "loss": 1.9117, + "step": 4980 + }, + { + "epoch": 1.528852056476366, + "grad_norm": 0.4376974105834961, + "learning_rate": 9.60949294625121e-05, + "loss": 1.9514, + "step": 4981 + }, + { + "epoch": 1.529158993247391, + "grad_norm": 0.3671407401561737, + "learning_rate": 9.609300348441385e-05, + "loss": 1.9042, + "step": 4982 + }, + { + "epoch": 1.5294659300184161, + "grad_norm": 0.4326031506061554, + "learning_rate": 9.609107705079754e-05, + "loss": 1.9606, + "step": 4983 + }, + { + "epoch": 1.5297728667894415, + "grad_norm": 
0.423308402299881, + "learning_rate": 9.608915016168218e-05, + "loss": 1.9663, + "step": 4984 + }, + { + "epoch": 1.5300798035604666, + "grad_norm": 0.46309906244277954, + "learning_rate": 9.608722281708683e-05, + "loss": 2.0114, + "step": 4985 + }, + { + "epoch": 1.5303867403314917, + "grad_norm": 0.4619913101196289, + "learning_rate": 9.608529501703053e-05, + "loss": 1.9328, + "step": 4986 + }, + { + "epoch": 1.530693677102517, + "grad_norm": 0.4335738718509674, + "learning_rate": 9.608336676153234e-05, + "loss": 1.9069, + "step": 4987 + }, + { + "epoch": 1.531000613873542, + "grad_norm": 0.40606966614723206, + "learning_rate": 9.608143805061129e-05, + "loss": 1.9243, + "step": 4988 + }, + { + "epoch": 1.5313075506445673, + "grad_norm": 0.45613235235214233, + "learning_rate": 9.607950888428649e-05, + "loss": 1.9943, + "step": 4989 + }, + { + "epoch": 1.5316144874155924, + "grad_norm": 0.4905582666397095, + "learning_rate": 9.607757926257696e-05, + "loss": 1.9649, + "step": 4990 + }, + { + "epoch": 1.5319214241866175, + "grad_norm": 0.44312527775764465, + "learning_rate": 9.607564918550179e-05, + "loss": 1.927, + "step": 4991 + }, + { + "epoch": 1.5322283609576428, + "grad_norm": 0.5193700790405273, + "learning_rate": 9.607371865308004e-05, + "loss": 1.9038, + "step": 4992 + }, + { + "epoch": 1.532535297728668, + "grad_norm": 0.5528806447982788, + "learning_rate": 9.607178766533078e-05, + "loss": 1.9194, + "step": 4993 + }, + { + "epoch": 1.532842234499693, + "grad_norm": 0.6561285257339478, + "learning_rate": 9.606985622227314e-05, + "loss": 2.0098, + "step": 4994 + }, + { + "epoch": 1.5331491712707184, + "grad_norm": 0.5642603635787964, + "learning_rate": 9.606792432392617e-05, + "loss": 1.9781, + "step": 4995 + }, + { + "epoch": 1.5334561080417433, + "grad_norm": 0.4974311590194702, + "learning_rate": 9.606599197030896e-05, + "loss": 1.9558, + "step": 4996 + }, + { + "epoch": 1.5337630448127686, + "grad_norm": 0.4324510395526886, + "learning_rate": 
9.606405916144063e-05, + "loss": 1.9749, + "step": 4997 + }, + { + "epoch": 1.5340699815837937, + "grad_norm": 0.45244327187538147, + "learning_rate": 9.606212589734027e-05, + "loss": 1.8902, + "step": 4998 + }, + { + "epoch": 1.5343769183548188, + "grad_norm": 0.5418685078620911, + "learning_rate": 9.606019217802698e-05, + "loss": 1.9766, + "step": 4999 + }, + { + "epoch": 1.5346838551258442, + "grad_norm": 0.48479241132736206, + "learning_rate": 9.605825800351987e-05, + "loss": 1.9949, + "step": 5000 + }, + { + "epoch": 1.5349907918968693, + "grad_norm": 0.4958111643791199, + "learning_rate": 9.605632337383806e-05, + "loss": 1.988, + "step": 5001 + }, + { + "epoch": 1.5352977286678944, + "grad_norm": 0.47347983717918396, + "learning_rate": 9.605438828900067e-05, + "loss": 1.9157, + "step": 5002 + }, + { + "epoch": 1.5356046654389197, + "grad_norm": 0.4018974304199219, + "learning_rate": 9.605245274902684e-05, + "loss": 1.9347, + "step": 5003 + }, + { + "epoch": 1.5359116022099446, + "grad_norm": 0.46161791682243347, + "learning_rate": 9.605051675393565e-05, + "loss": 1.9785, + "step": 5004 + }, + { + "epoch": 1.53621853898097, + "grad_norm": 0.5113234519958496, + "learning_rate": 9.604858030374627e-05, + "loss": 1.9595, + "step": 5005 + }, + { + "epoch": 1.536525475751995, + "grad_norm": 0.6643409132957458, + "learning_rate": 9.604664339847784e-05, + "loss": 2.0395, + "step": 5006 + }, + { + "epoch": 1.5368324125230202, + "grad_norm": 0.6759974360466003, + "learning_rate": 9.604470603814948e-05, + "loss": 1.9058, + "step": 5007 + }, + { + "epoch": 1.5371393492940455, + "grad_norm": 0.5576213598251343, + "learning_rate": 9.604276822278035e-05, + "loss": 1.9326, + "step": 5008 + }, + { + "epoch": 1.5374462860650706, + "grad_norm": 0.4472630023956299, + "learning_rate": 9.60408299523896e-05, + "loss": 1.9553, + "step": 5009 + }, + { + "epoch": 1.5377532228360957, + "grad_norm": 0.48445144295692444, + "learning_rate": 9.603889122699638e-05, + "loss": 2.0136, + 
"step": 5010 + }, + { + "epoch": 1.538060159607121, + "grad_norm": 0.4793097972869873, + "learning_rate": 9.603695204661987e-05, + "loss": 1.9777, + "step": 5011 + }, + { + "epoch": 1.538367096378146, + "grad_norm": 0.5003167390823364, + "learning_rate": 9.60350124112792e-05, + "loss": 1.9672, + "step": 5012 + }, + { + "epoch": 1.5386740331491713, + "grad_norm": 0.5131042003631592, + "learning_rate": 9.603307232099355e-05, + "loss": 2.0058, + "step": 5013 + }, + { + "epoch": 1.5389809699201964, + "grad_norm": 0.4145869314670563, + "learning_rate": 9.603113177578212e-05, + "loss": 1.9332, + "step": 5014 + }, + { + "epoch": 1.5392879066912215, + "grad_norm": 0.4939991235733032, + "learning_rate": 9.602919077566404e-05, + "loss": 1.9967, + "step": 5015 + }, + { + "epoch": 1.5395948434622468, + "grad_norm": 0.4768902361392975, + "learning_rate": 9.602724932065853e-05, + "loss": 1.873, + "step": 5016 + }, + { + "epoch": 1.539901780233272, + "grad_norm": 0.45381611585617065, + "learning_rate": 9.602530741078476e-05, + "loss": 1.9416, + "step": 5017 + }, + { + "epoch": 1.540208717004297, + "grad_norm": 0.43104392290115356, + "learning_rate": 9.602336504606193e-05, + "loss": 1.9566, + "step": 5018 + }, + { + "epoch": 1.5405156537753224, + "grad_norm": 0.5354776978492737, + "learning_rate": 9.602142222650924e-05, + "loss": 1.9939, + "step": 5019 + }, + { + "epoch": 1.5408225905463473, + "grad_norm": 0.5623740553855896, + "learning_rate": 9.601947895214586e-05, + "loss": 1.9622, + "step": 5020 + }, + { + "epoch": 1.5411295273173726, + "grad_norm": 0.5234485268592834, + "learning_rate": 9.601753522299103e-05, + "loss": 1.9636, + "step": 5021 + }, + { + "epoch": 1.5414364640883977, + "grad_norm": 0.416384756565094, + "learning_rate": 9.601559103906396e-05, + "loss": 1.92, + "step": 5022 + }, + { + "epoch": 1.5417434008594229, + "grad_norm": 0.47080478072166443, + "learning_rate": 9.601364640038384e-05, + "loss": 1.9147, + "step": 5023 + }, + { + "epoch": 1.5420503376304482, + 
"grad_norm": 0.527463972568512, + "learning_rate": 9.601170130696988e-05, + "loss": 1.9458, + "step": 5024 + }, + { + "epoch": 1.5423572744014733, + "grad_norm": 0.4761022925376892, + "learning_rate": 9.600975575884134e-05, + "loss": 1.95, + "step": 5025 + }, + { + "epoch": 1.5426642111724984, + "grad_norm": 0.48202264308929443, + "learning_rate": 9.600780975601741e-05, + "loss": 1.9618, + "step": 5026 + }, + { + "epoch": 1.5429711479435237, + "grad_norm": 0.43222522735595703, + "learning_rate": 9.600586329851735e-05, + "loss": 1.9869, + "step": 5027 + }, + { + "epoch": 1.5432780847145486, + "grad_norm": 0.40816691517829895, + "learning_rate": 9.600391638636037e-05, + "loss": 1.991, + "step": 5028 + }, + { + "epoch": 1.543585021485574, + "grad_norm": 0.4365478754043579, + "learning_rate": 9.600196901956572e-05, + "loss": 1.9904, + "step": 5029 + }, + { + "epoch": 1.5438919582565993, + "grad_norm": 0.41411092877388, + "learning_rate": 9.600002119815268e-05, + "loss": 1.9449, + "step": 5030 + }, + { + "epoch": 1.5441988950276242, + "grad_norm": 0.41023650765419006, + "learning_rate": 9.599807292214045e-05, + "loss": 1.9318, + "step": 5031 + }, + { + "epoch": 1.5445058317986495, + "grad_norm": 0.4844631254673004, + "learning_rate": 9.599612419154831e-05, + "loss": 1.9884, + "step": 5032 + }, + { + "epoch": 1.5448127685696746, + "grad_norm": 0.4347037374973297, + "learning_rate": 9.59941750063955e-05, + "loss": 1.8992, + "step": 5033 + }, + { + "epoch": 1.5451197053406998, + "grad_norm": 0.6414445638656616, + "learning_rate": 9.59922253667013e-05, + "loss": 2.0268, + "step": 5034 + }, + { + "epoch": 1.545426642111725, + "grad_norm": 0.6607222557067871, + "learning_rate": 9.599027527248498e-05, + "loss": 2.0116, + "step": 5035 + }, + { + "epoch": 1.5457335788827502, + "grad_norm": 0.6406869292259216, + "learning_rate": 9.59883247237658e-05, + "loss": 1.9256, + "step": 5036 + }, + { + "epoch": 1.5460405156537753, + "grad_norm": 0.5388308167457581, + "learning_rate": 
9.598637372056303e-05, + "loss": 1.906, + "step": 5037 + }, + { + "epoch": 1.5463474524248007, + "grad_norm": 0.42285510897636414, + "learning_rate": 9.598442226289596e-05, + "loss": 1.9137, + "step": 5038 + }, + { + "epoch": 1.5466543891958255, + "grad_norm": 0.5622994303703308, + "learning_rate": 9.598247035078389e-05, + "loss": 1.9825, + "step": 5039 + }, + { + "epoch": 1.5469613259668509, + "grad_norm": 0.7120574116706848, + "learning_rate": 9.59805179842461e-05, + "loss": 1.9467, + "step": 5040 + }, + { + "epoch": 1.547268262737876, + "grad_norm": 0.7050338983535767, + "learning_rate": 9.597856516330187e-05, + "loss": 1.9763, + "step": 5041 + }, + { + "epoch": 1.547575199508901, + "grad_norm": 0.4908922016620636, + "learning_rate": 9.597661188797051e-05, + "loss": 1.9826, + "step": 5042 + }, + { + "epoch": 1.5478821362799264, + "grad_norm": 0.47363361716270447, + "learning_rate": 9.597465815827133e-05, + "loss": 1.9769, + "step": 5043 + }, + { + "epoch": 1.5481890730509515, + "grad_norm": 0.6289864182472229, + "learning_rate": 9.597270397422364e-05, + "loss": 1.9364, + "step": 5044 + }, + { + "epoch": 1.5484960098219767, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.597074933584673e-05, + "loss": 1.949, + "step": 5045 + }, + { + "epoch": 1.548802946593002, + "grad_norm": 0.559152364730835, + "learning_rate": 9.596879424315993e-05, + "loss": 2.0194, + "step": 5046 + }, + { + "epoch": 1.5491098833640269, + "grad_norm": 0.4613901674747467, + "learning_rate": 9.596683869618257e-05, + "loss": 1.9658, + "step": 5047 + }, + { + "epoch": 1.5494168201350522, + "grad_norm": 0.6245483160018921, + "learning_rate": 9.596488269493396e-05, + "loss": 1.9265, + "step": 5048 + }, + { + "epoch": 1.5497237569060773, + "grad_norm": 0.8100824356079102, + "learning_rate": 9.596292623943343e-05, + "loss": 1.9536, + "step": 5049 + }, + { + "epoch": 1.5500306936771024, + "grad_norm": 0.7486092448234558, + "learning_rate": 9.596096932970035e-05, + "loss": 1.9801, + "step": 
5050 + }, + { + "epoch": 1.5503376304481278, + "grad_norm": 0.4803295135498047, + "learning_rate": 9.595901196575401e-05, + "loss": 1.9943, + "step": 5051 + }, + { + "epoch": 1.550644567219153, + "grad_norm": 0.5027125477790833, + "learning_rate": 9.595705414761379e-05, + "loss": 1.9036, + "step": 5052 + }, + { + "epoch": 1.550951503990178, + "grad_norm": 0.5785070657730103, + "learning_rate": 9.595509587529902e-05, + "loss": 1.9489, + "step": 5053 + }, + { + "epoch": 1.5512584407612033, + "grad_norm": 0.6017338633537292, + "learning_rate": 9.595313714882906e-05, + "loss": 1.9964, + "step": 5054 + }, + { + "epoch": 1.5515653775322282, + "grad_norm": 0.5023195147514343, + "learning_rate": 9.595117796822326e-05, + "loss": 1.9778, + "step": 5055 + }, + { + "epoch": 1.5518723143032536, + "grad_norm": 0.4488884508609772, + "learning_rate": 9.594921833350099e-05, + "loss": 2.0141, + "step": 5056 + }, + { + "epoch": 1.5521792510742787, + "grad_norm": 0.47110801935195923, + "learning_rate": 9.59472582446816e-05, + "loss": 1.9294, + "step": 5057 + }, + { + "epoch": 1.5524861878453038, + "grad_norm": 0.5292330980300903, + "learning_rate": 9.594529770178449e-05, + "loss": 2.0427, + "step": 5058 + }, + { + "epoch": 1.5527931246163291, + "grad_norm": 0.522756814956665, + "learning_rate": 9.5943336704829e-05, + "loss": 1.9854, + "step": 5059 + }, + { + "epoch": 1.5531000613873542, + "grad_norm": 0.44659632444381714, + "learning_rate": 9.594137525383455e-05, + "loss": 2.028, + "step": 5060 + }, + { + "epoch": 1.5534069981583793, + "grad_norm": 0.4745616614818573, + "learning_rate": 9.593941334882048e-05, + "loss": 1.9994, + "step": 5061 + }, + { + "epoch": 1.5537139349294047, + "grad_norm": 0.41752973198890686, + "learning_rate": 9.593745098980622e-05, + "loss": 1.9466, + "step": 5062 + }, + { + "epoch": 1.5540208717004296, + "grad_norm": 0.4548248052597046, + "learning_rate": 9.593548817681115e-05, + "loss": 1.9064, + "step": 5063 + }, + { + "epoch": 1.554327808471455, + 
"grad_norm": 0.45780888199806213, + "learning_rate": 9.593352490985464e-05, + "loss": 2.0254, + "step": 5064 + }, + { + "epoch": 1.55463474524248, + "grad_norm": 0.4118718206882477, + "learning_rate": 9.593156118895613e-05, + "loss": 1.9761, + "step": 5065 + }, + { + "epoch": 1.5549416820135051, + "grad_norm": 0.41350236535072327, + "learning_rate": 9.592959701413501e-05, + "loss": 1.9476, + "step": 5066 + }, + { + "epoch": 1.5552486187845305, + "grad_norm": 0.4116091728210449, + "learning_rate": 9.59276323854107e-05, + "loss": 1.9325, + "step": 5067 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.44039735198020935, + "learning_rate": 9.592566730280259e-05, + "loss": 1.9916, + "step": 5068 + }, + { + "epoch": 1.5558624923265807, + "grad_norm": 0.4028816819190979, + "learning_rate": 9.592370176633012e-05, + "loss": 1.916, + "step": 5069 + }, + { + "epoch": 1.556169429097606, + "grad_norm": 0.42046302556991577, + "learning_rate": 9.592173577601271e-05, + "loss": 1.961, + "step": 5070 + }, + { + "epoch": 1.556476365868631, + "grad_norm": 0.3749450147151947, + "learning_rate": 9.591976933186982e-05, + "loss": 1.9279, + "step": 5071 + }, + { + "epoch": 1.5567833026396563, + "grad_norm": 0.3441384434700012, + "learning_rate": 9.591780243392081e-05, + "loss": 1.8967, + "step": 5072 + }, + { + "epoch": 1.5570902394106814, + "grad_norm": 0.4032546877861023, + "learning_rate": 9.59158350821852e-05, + "loss": 1.9912, + "step": 5073 + }, + { + "epoch": 1.5573971761817065, + "grad_norm": 0.44628265500068665, + "learning_rate": 9.591386727668238e-05, + "loss": 2.0539, + "step": 5074 + }, + { + "epoch": 1.5577041129527318, + "grad_norm": 0.43606969714164734, + "learning_rate": 9.59118990174318e-05, + "loss": 1.97, + "step": 5075 + }, + { + "epoch": 1.558011049723757, + "grad_norm": 0.42076775431632996, + "learning_rate": 9.590993030445295e-05, + "loss": 1.962, + "step": 5076 + }, + { + "epoch": 1.558317986494782, + "grad_norm": 0.34569117426872253, + "learning_rate": 
9.590796113776526e-05, + "loss": 1.8815, + "step": 5077 + }, + { + "epoch": 1.5586249232658074, + "grad_norm": 0.3931111693382263, + "learning_rate": 9.590599151738817e-05, + "loss": 1.9016, + "step": 5078 + }, + { + "epoch": 1.5589318600368323, + "grad_norm": 0.3952369689941406, + "learning_rate": 9.590402144334117e-05, + "loss": 1.9277, + "step": 5079 + }, + { + "epoch": 1.5592387968078576, + "grad_norm": 0.3960857689380646, + "learning_rate": 9.590205091564372e-05, + "loss": 1.947, + "step": 5080 + }, + { + "epoch": 1.5595457335788827, + "grad_norm": 0.37946292757987976, + "learning_rate": 9.590007993431532e-05, + "loss": 1.9907, + "step": 5081 + }, + { + "epoch": 1.5598526703499078, + "grad_norm": 0.41619375348091125, + "learning_rate": 9.589810849937541e-05, + "loss": 1.9451, + "step": 5082 + }, + { + "epoch": 1.5601596071209332, + "grad_norm": 0.39266669750213623, + "learning_rate": 9.58961366108435e-05, + "loss": 2.0137, + "step": 5083 + }, + { + "epoch": 1.5604665438919583, + "grad_norm": 0.39510276913642883, + "learning_rate": 9.589416426873907e-05, + "loss": 1.947, + "step": 5084 + }, + { + "epoch": 1.5607734806629834, + "grad_norm": 0.40243181586265564, + "learning_rate": 9.58921914730816e-05, + "loss": 1.8957, + "step": 5085 + }, + { + "epoch": 1.5610804174340087, + "grad_norm": 0.39877578616142273, + "learning_rate": 9.58902182238906e-05, + "loss": 1.9497, + "step": 5086 + }, + { + "epoch": 1.5613873542050336, + "grad_norm": 0.39367151260375977, + "learning_rate": 9.588824452118557e-05, + "loss": 1.9616, + "step": 5087 + }, + { + "epoch": 1.561694290976059, + "grad_norm": 0.35690104961395264, + "learning_rate": 9.5886270364986e-05, + "loss": 1.9108, + "step": 5088 + }, + { + "epoch": 1.562001227747084, + "grad_norm": 0.39512762427330017, + "learning_rate": 9.588429575531141e-05, + "loss": 1.9909, + "step": 5089 + }, + { + "epoch": 1.5623081645181092, + "grad_norm": 0.39253926277160645, + "learning_rate": 9.588232069218132e-05, + "loss": 1.937, + 
"step": 5090 + }, + { + "epoch": 1.5626151012891345, + "grad_norm": 0.37811553478240967, + "learning_rate": 9.588034517561526e-05, + "loss": 1.8918, + "step": 5091 + }, + { + "epoch": 1.5629220380601596, + "grad_norm": 0.38191986083984375, + "learning_rate": 9.587836920563272e-05, + "loss": 1.9149, + "step": 5092 + }, + { + "epoch": 1.5632289748311847, + "grad_norm": 0.3903779089450836, + "learning_rate": 9.587639278225326e-05, + "loss": 1.9714, + "step": 5093 + }, + { + "epoch": 1.56353591160221, + "grad_norm": 0.4467499554157257, + "learning_rate": 9.587441590549639e-05, + "loss": 1.8822, + "step": 5094 + }, + { + "epoch": 1.563842848373235, + "grad_norm": 0.3819296956062317, + "learning_rate": 9.587243857538164e-05, + "loss": 1.9212, + "step": 5095 + }, + { + "epoch": 1.5641497851442603, + "grad_norm": 0.4305097162723541, + "learning_rate": 9.587046079192858e-05, + "loss": 1.9264, + "step": 5096 + }, + { + "epoch": 1.5644567219152854, + "grad_norm": 0.4135383367538452, + "learning_rate": 9.586848255515675e-05, + "loss": 1.9743, + "step": 5097 + }, + { + "epoch": 1.5647636586863105, + "grad_norm": 0.44688066840171814, + "learning_rate": 9.586650386508566e-05, + "loss": 1.8804, + "step": 5098 + }, + { + "epoch": 1.5650705954573358, + "grad_norm": 0.5358461737632751, + "learning_rate": 9.586452472173492e-05, + "loss": 1.9485, + "step": 5099 + }, + { + "epoch": 1.565377532228361, + "grad_norm": 0.5585343837738037, + "learning_rate": 9.586254512512408e-05, + "loss": 2.0901, + "step": 5100 + }, + { + "epoch": 1.565684468999386, + "grad_norm": 0.4682343602180481, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8877, + "step": 5101 + }, + { + "epoch": 1.5659914057704114, + "grad_norm": 0.44076529145240784, + "learning_rate": 9.585858457220026e-05, + "loss": 1.93, + "step": 5102 + }, + { + "epoch": 1.5662983425414365, + "grad_norm": 0.4613071382045746, + "learning_rate": 9.585660361592646e-05, + "loss": 1.9689, + "step": 5103 + }, + { + "epoch": 1.5666052793124616, 
+ "grad_norm": 0.4589289128780365, + "learning_rate": 9.585462220647082e-05, + "loss": 1.8876, + "step": 5104 + }, + { + "epoch": 1.566912216083487, + "grad_norm": 0.3495907485485077, + "learning_rate": 9.585264034385292e-05, + "loss": 1.9013, + "step": 5105 + }, + { + "epoch": 1.5672191528545119, + "grad_norm": 0.42263728380203247, + "learning_rate": 9.585065802809235e-05, + "loss": 1.8886, + "step": 5106 + }, + { + "epoch": 1.5675260896255372, + "grad_norm": 0.4275301694869995, + "learning_rate": 9.584867525920872e-05, + "loss": 1.9865, + "step": 5107 + }, + { + "epoch": 1.5678330263965623, + "grad_norm": 0.4228142201900482, + "learning_rate": 9.584669203722161e-05, + "loss": 1.8573, + "step": 5108 + }, + { + "epoch": 1.5681399631675874, + "grad_norm": 0.4422524571418762, + "learning_rate": 9.58447083621506e-05, + "loss": 1.924, + "step": 5109 + }, + { + "epoch": 1.5684468999386127, + "grad_norm": 0.41540947556495667, + "learning_rate": 9.584272423401532e-05, + "loss": 1.969, + "step": 5110 + }, + { + "epoch": 1.5687538367096379, + "grad_norm": 0.3963775336742401, + "learning_rate": 9.584073965283538e-05, + "loss": 1.9509, + "step": 5111 + }, + { + "epoch": 1.569060773480663, + "grad_norm": 0.41465985774993896, + "learning_rate": 9.583875461863037e-05, + "loss": 1.9393, + "step": 5112 + }, + { + "epoch": 1.5693677102516883, + "grad_norm": 0.4396083652973175, + "learning_rate": 9.583676913141991e-05, + "loss": 1.9872, + "step": 5113 + }, + { + "epoch": 1.5696746470227132, + "grad_norm": 0.4247182607650757, + "learning_rate": 9.583478319122366e-05, + "loss": 1.9807, + "step": 5114 + }, + { + "epoch": 1.5699815837937385, + "grad_norm": 0.3612080216407776, + "learning_rate": 9.583279679806119e-05, + "loss": 1.9563, + "step": 5115 + }, + { + "epoch": 1.5702885205647636, + "grad_norm": 0.40084055066108704, + "learning_rate": 9.583080995195217e-05, + "loss": 1.9099, + "step": 5116 + }, + { + "epoch": 1.5705954573357888, + "grad_norm": 0.432381272315979, + 
"learning_rate": 9.582882265291621e-05, + "loss": 2.0167, + "step": 5117 + }, + { + "epoch": 1.570902394106814, + "grad_norm": 0.45490768551826477, + "learning_rate": 9.5826834900973e-05, + "loss": 1.9179, + "step": 5118 + }, + { + "epoch": 1.5712093308778392, + "grad_norm": 0.39158329367637634, + "learning_rate": 9.582484669614211e-05, + "loss": 1.8716, + "step": 5119 + }, + { + "epoch": 1.5715162676488643, + "grad_norm": 0.45607441663742065, + "learning_rate": 9.582285803844324e-05, + "loss": 1.9631, + "step": 5120 + }, + { + "epoch": 1.5718232044198897, + "grad_norm": 0.42591094970703125, + "learning_rate": 9.582086892789604e-05, + "loss": 1.9809, + "step": 5121 + }, + { + "epoch": 1.5721301411909145, + "grad_norm": 0.46772903203964233, + "learning_rate": 9.581887936452015e-05, + "loss": 1.9991, + "step": 5122 + }, + { + "epoch": 1.5724370779619399, + "grad_norm": 0.4450485408306122, + "learning_rate": 9.581688934833524e-05, + "loss": 1.9471, + "step": 5123 + }, + { + "epoch": 1.572744014732965, + "grad_norm": 0.37539350986480713, + "learning_rate": 9.581489887936097e-05, + "loss": 1.8624, + "step": 5124 + }, + { + "epoch": 1.57305095150399, + "grad_norm": 0.4184030294418335, + "learning_rate": 9.581290795761702e-05, + "loss": 1.9746, + "step": 5125 + }, + { + "epoch": 1.5733578882750154, + "grad_norm": 0.43275317549705505, + "learning_rate": 9.581091658312305e-05, + "loss": 2.0484, + "step": 5126 + }, + { + "epoch": 1.5736648250460405, + "grad_norm": 0.48845502734184265, + "learning_rate": 9.580892475589876e-05, + "loss": 1.9331, + "step": 5127 + }, + { + "epoch": 1.5739717618170657, + "grad_norm": 0.4653528034687042, + "learning_rate": 9.580693247596383e-05, + "loss": 1.8888, + "step": 5128 + }, + { + "epoch": 1.574278698588091, + "grad_norm": 0.4371016323566437, + "learning_rate": 9.580493974333794e-05, + "loss": 1.9004, + "step": 5129 + }, + { + "epoch": 1.5745856353591159, + "grad_norm": 0.4274102747440338, + "learning_rate": 9.580294655804079e-05, + 
"loss": 1.9877, + "step": 5130 + }, + { + "epoch": 1.5748925721301412, + "grad_norm": 0.4053245484828949, + "learning_rate": 9.580095292009208e-05, + "loss": 1.9253, + "step": 5131 + }, + { + "epoch": 1.5751995089011663, + "grad_norm": 0.47868627309799194, + "learning_rate": 9.579895882951151e-05, + "loss": 1.9659, + "step": 5132 + }, + { + "epoch": 1.5755064456721914, + "grad_norm": 0.47420576214790344, + "learning_rate": 9.579696428631877e-05, + "loss": 1.9115, + "step": 5133 + }, + { + "epoch": 1.5758133824432168, + "grad_norm": 0.41192150115966797, + "learning_rate": 9.57949692905336e-05, + "loss": 1.8949, + "step": 5134 + }, + { + "epoch": 1.576120319214242, + "grad_norm": 0.44949471950531006, + "learning_rate": 9.57929738421757e-05, + "loss": 1.9393, + "step": 5135 + }, + { + "epoch": 1.576427255985267, + "grad_norm": 0.38450154662132263, + "learning_rate": 9.57909779412648e-05, + "loss": 1.8399, + "step": 5136 + }, + { + "epoch": 1.5767341927562923, + "grad_norm": 0.43553364276885986, + "learning_rate": 9.57889815878206e-05, + "loss": 1.9477, + "step": 5137 + }, + { + "epoch": 1.5770411295273172, + "grad_norm": 0.4546982944011688, + "learning_rate": 9.578698478186285e-05, + "loss": 1.9169, + "step": 5138 + }, + { + "epoch": 1.5773480662983426, + "grad_norm": 0.47802838683128357, + "learning_rate": 9.57849875234113e-05, + "loss": 1.9204, + "step": 5139 + }, + { + "epoch": 1.5776550030693677, + "grad_norm": 0.3648034930229187, + "learning_rate": 9.578298981248565e-05, + "loss": 1.9157, + "step": 5140 + }, + { + "epoch": 1.5779619398403928, + "grad_norm": 0.41951245069503784, + "learning_rate": 9.578099164910565e-05, + "loss": 1.9171, + "step": 5141 + }, + { + "epoch": 1.5782688766114181, + "grad_norm": 0.5198701620101929, + "learning_rate": 9.577899303329107e-05, + "loss": 1.9786, + "step": 5142 + }, + { + "epoch": 1.5785758133824432, + "grad_norm": 0.45244187116622925, + "learning_rate": 9.577699396506165e-05, + "loss": 2.0044, + "step": 5143 + }, + { + 
"epoch": 1.5788827501534684, + "grad_norm": 0.3874819874763489, + "learning_rate": 9.577499444443715e-05, + "loss": 1.9385, + "step": 5144 + }, + { + "epoch": 1.5791896869244937, + "grad_norm": 0.4578075110912323, + "learning_rate": 9.577299447143733e-05, + "loss": 1.9679, + "step": 5145 + }, + { + "epoch": 1.5794966236955186, + "grad_norm": 0.6001343727111816, + "learning_rate": 9.577099404608192e-05, + "loss": 1.9331, + "step": 5146 + }, + { + "epoch": 1.579803560466544, + "grad_norm": 0.5592501759529114, + "learning_rate": 9.576899316839074e-05, + "loss": 1.8968, + "step": 5147 + }, + { + "epoch": 1.580110497237569, + "grad_norm": 0.4333004951477051, + "learning_rate": 9.576699183838356e-05, + "loss": 2.0378, + "step": 5148 + }, + { + "epoch": 1.5804174340085941, + "grad_norm": 0.40593892335891724, + "learning_rate": 9.576499005608011e-05, + "loss": 1.9878, + "step": 5149 + }, + { + "epoch": 1.5807243707796195, + "grad_norm": 0.4805290400981903, + "learning_rate": 9.576298782150023e-05, + "loss": 1.9897, + "step": 5150 + }, + { + "epoch": 1.5810313075506446, + "grad_norm": 0.4620860517024994, + "learning_rate": 9.576098513466367e-05, + "loss": 1.9808, + "step": 5151 + }, + { + "epoch": 1.5813382443216697, + "grad_norm": 0.47085410356521606, + "learning_rate": 9.575898199559023e-05, + "loss": 1.9526, + "step": 5152 + }, + { + "epoch": 1.581645181092695, + "grad_norm": 0.512971043586731, + "learning_rate": 9.575697840429971e-05, + "loss": 1.9684, + "step": 5153 + }, + { + "epoch": 1.58195211786372, + "grad_norm": 0.5474939346313477, + "learning_rate": 9.575497436081193e-05, + "loss": 2.0052, + "step": 5154 + }, + { + "epoch": 1.5822590546347453, + "grad_norm": 0.6277830004692078, + "learning_rate": 9.575296986514666e-05, + "loss": 2.042, + "step": 5155 + }, + { + "epoch": 1.5825659914057704, + "grad_norm": 0.46941256523132324, + "learning_rate": 9.575096491732372e-05, + "loss": 1.952, + "step": 5156 + }, + { + "epoch": 1.5828729281767955, + "grad_norm": 
0.4948115646839142, + "learning_rate": 9.574895951736294e-05, + "loss": 1.9573, + "step": 5157 + }, + { + "epoch": 1.5831798649478208, + "grad_norm": 0.5677160024642944, + "learning_rate": 9.574695366528411e-05, + "loss": 1.9696, + "step": 5158 + }, + { + "epoch": 1.583486801718846, + "grad_norm": 0.5915918350219727, + "learning_rate": 9.574494736110708e-05, + "loss": 1.9822, + "step": 5159 + }, + { + "epoch": 1.583793738489871, + "grad_norm": 0.556413471698761, + "learning_rate": 9.574294060485168e-05, + "loss": 1.9548, + "step": 5160 + }, + { + "epoch": 1.5841006752608964, + "grad_norm": 0.4706072509288788, + "learning_rate": 9.574093339653772e-05, + "loss": 2.0052, + "step": 5161 + }, + { + "epoch": 1.5844076120319213, + "grad_norm": 0.3931087553501129, + "learning_rate": 9.573892573618505e-05, + "loss": 1.9071, + "step": 5162 + }, + { + "epoch": 1.5847145488029466, + "grad_norm": 0.4590308368206024, + "learning_rate": 9.573691762381349e-05, + "loss": 2.048, + "step": 5163 + }, + { + "epoch": 1.5850214855739717, + "grad_norm": 0.4404078423976898, + "learning_rate": 9.573490905944293e-05, + "loss": 1.9426, + "step": 5164 + }, + { + "epoch": 1.5853284223449968, + "grad_norm": 0.486074298620224, + "learning_rate": 9.573290004309318e-05, + "loss": 1.9937, + "step": 5165 + }, + { + "epoch": 1.5856353591160222, + "grad_norm": 0.4650556445121765, + "learning_rate": 9.57308905747841e-05, + "loss": 1.9821, + "step": 5166 + }, + { + "epoch": 1.5859422958870473, + "grad_norm": 0.48193567991256714, + "learning_rate": 9.572888065453557e-05, + "loss": 2.0143, + "step": 5167 + }, + { + "epoch": 1.5862492326580724, + "grad_norm": 0.43178877234458923, + "learning_rate": 9.572687028236744e-05, + "loss": 2.0066, + "step": 5168 + }, + { + "epoch": 1.5865561694290977, + "grad_norm": 0.5256033539772034, + "learning_rate": 9.572485945829957e-05, + "loss": 2.0431, + "step": 5169 + }, + { + "epoch": 1.5868631062001226, + "grad_norm": 0.4714619517326355, + "learning_rate": 
9.572284818235182e-05, + "loss": 1.9411, + "step": 5170 + }, + { + "epoch": 1.587170042971148, + "grad_norm": 0.4224734902381897, + "learning_rate": 9.572083645454411e-05, + "loss": 1.9648, + "step": 5171 + }, + { + "epoch": 1.5874769797421733, + "grad_norm": 0.45965152978897095, + "learning_rate": 9.571882427489628e-05, + "loss": 1.9241, + "step": 5172 + }, + { + "epoch": 1.5877839165131982, + "grad_norm": 0.459114670753479, + "learning_rate": 9.571681164342825e-05, + "loss": 2.0197, + "step": 5173 + }, + { + "epoch": 1.5880908532842235, + "grad_norm": 0.4278501272201538, + "learning_rate": 9.571479856015988e-05, + "loss": 1.9411, + "step": 5174 + }, + { + "epoch": 1.5883977900552486, + "grad_norm": 0.6875150799751282, + "learning_rate": 9.571278502511107e-05, + "loss": 1.8876, + "step": 5175 + }, + { + "epoch": 1.5887047268262737, + "grad_norm": 0.4596772789955139, + "learning_rate": 9.571077103830174e-05, + "loss": 1.9002, + "step": 5176 + }, + { + "epoch": 1.589011663597299, + "grad_norm": 0.47587937116622925, + "learning_rate": 9.570875659975178e-05, + "loss": 2.0034, + "step": 5177 + }, + { + "epoch": 1.5893186003683242, + "grad_norm": 0.42494842410087585, + "learning_rate": 9.570674170948109e-05, + "loss": 1.9668, + "step": 5178 + }, + { + "epoch": 1.5896255371393493, + "grad_norm": 0.4231310784816742, + "learning_rate": 9.570472636750957e-05, + "loss": 1.9365, + "step": 5179 + }, + { + "epoch": 1.5899324739103746, + "grad_norm": 0.4585247337818146, + "learning_rate": 9.570271057385719e-05, + "loss": 1.9707, + "step": 5180 + }, + { + "epoch": 1.5902394106813995, + "grad_norm": 0.4146895408630371, + "learning_rate": 9.570069432854382e-05, + "loss": 1.9405, + "step": 5181 + }, + { + "epoch": 1.5905463474524248, + "grad_norm": 0.42243605852127075, + "learning_rate": 9.56986776315894e-05, + "loss": 1.8893, + "step": 5182 + }, + { + "epoch": 1.59085328422345, + "grad_norm": 0.44299328327178955, + "learning_rate": 9.569666048301386e-05, + "loss": 1.9596, + "step": 
5183 + }, + { + "epoch": 1.591160220994475, + "grad_norm": 0.4950970709323883, + "learning_rate": 9.569464288283716e-05, + "loss": 1.9066, + "step": 5184 + }, + { + "epoch": 1.5914671577655004, + "grad_norm": 0.4664969742298126, + "learning_rate": 9.569262483107919e-05, + "loss": 1.9485, + "step": 5185 + }, + { + "epoch": 1.5917740945365255, + "grad_norm": 0.5052160024642944, + "learning_rate": 9.569060632775993e-05, + "loss": 1.9189, + "step": 5186 + }, + { + "epoch": 1.5920810313075506, + "grad_norm": 0.4109063446521759, + "learning_rate": 9.568858737289932e-05, + "loss": 1.9236, + "step": 5187 + }, + { + "epoch": 1.592387968078576, + "grad_norm": 0.4078194499015808, + "learning_rate": 9.568656796651731e-05, + "loss": 1.9465, + "step": 5188 + }, + { + "epoch": 1.5926949048496009, + "grad_norm": 0.43199312686920166, + "learning_rate": 9.568454810863385e-05, + "loss": 1.9537, + "step": 5189 + }, + { + "epoch": 1.5930018416206262, + "grad_norm": 0.46389925479888916, + "learning_rate": 9.568252779926891e-05, + "loss": 1.9463, + "step": 5190 + }, + { + "epoch": 1.5933087783916513, + "grad_norm": 0.4130708575248718, + "learning_rate": 9.568050703844247e-05, + "loss": 1.948, + "step": 5191 + }, + { + "epoch": 1.5936157151626764, + "grad_norm": 0.4699256122112274, + "learning_rate": 9.567848582617448e-05, + "loss": 1.957, + "step": 5192 + }, + { + "epoch": 1.5939226519337018, + "grad_norm": 0.41965460777282715, + "learning_rate": 9.56764641624849e-05, + "loss": 1.9622, + "step": 5193 + }, + { + "epoch": 1.5942295887047269, + "grad_norm": 0.4313151240348816, + "learning_rate": 9.567444204739376e-05, + "loss": 1.981, + "step": 5194 + }, + { + "epoch": 1.594536525475752, + "grad_norm": 0.4149332642555237, + "learning_rate": 9.5672419480921e-05, + "loss": 1.9542, + "step": 5195 + }, + { + "epoch": 1.5948434622467773, + "grad_norm": 0.4456483721733093, + "learning_rate": 9.567039646308661e-05, + "loss": 2.0206, + "step": 5196 + }, + { + "epoch": 1.5951503990178022, + 
"grad_norm": 0.46637552976608276, + "learning_rate": 9.56683729939106e-05, + "loss": 2.0264, + "step": 5197 + }, + { + "epoch": 1.5954573357888275, + "grad_norm": 0.4809871315956116, + "learning_rate": 9.566634907341297e-05, + "loss": 1.9113, + "step": 5198 + }, + { + "epoch": 1.5957642725598526, + "grad_norm": 0.5220670104026794, + "learning_rate": 9.566432470161371e-05, + "loss": 1.9806, + "step": 5199 + }, + { + "epoch": 1.5960712093308778, + "grad_norm": 0.5020555853843689, + "learning_rate": 9.566229987853283e-05, + "loss": 1.9925, + "step": 5200 + }, + { + "epoch": 1.596378146101903, + "grad_norm": 0.5481683611869812, + "learning_rate": 9.566027460419034e-05, + "loss": 1.978, + "step": 5201 + }, + { + "epoch": 1.5966850828729282, + "grad_norm": 0.5014147758483887, + "learning_rate": 9.565824887860624e-05, + "loss": 1.9402, + "step": 5202 + }, + { + "epoch": 1.5969920196439533, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.565622270180057e-05, + "loss": 1.9877, + "step": 5203 + }, + { + "epoch": 1.5972989564149787, + "grad_norm": 0.5172939300537109, + "learning_rate": 9.565419607379335e-05, + "loss": 1.9304, + "step": 5204 + }, + { + "epoch": 1.5976058931860035, + "grad_norm": 0.4767214357852936, + "learning_rate": 9.56521689946046e-05, + "loss": 1.9063, + "step": 5205 + }, + { + "epoch": 1.5979128299570289, + "grad_norm": 0.48810651898384094, + "learning_rate": 9.565014146425437e-05, + "loss": 1.9473, + "step": 5206 + }, + { + "epoch": 1.598219766728054, + "grad_norm": 0.4204402565956116, + "learning_rate": 9.564811348276269e-05, + "loss": 1.9562, + "step": 5207 + }, + { + "epoch": 1.598526703499079, + "grad_norm": 0.42679163813591003, + "learning_rate": 9.564608505014958e-05, + "loss": 1.8904, + "step": 5208 + }, + { + "epoch": 1.5988336402701044, + "grad_norm": 0.4240354299545288, + "learning_rate": 9.56440561664351e-05, + "loss": 1.9982, + "step": 5209 + }, + { + "epoch": 1.5991405770411296, + "grad_norm": 0.41588497161865234, + "learning_rate": 
9.564202683163932e-05, + "loss": 1.9904, + "step": 5210 + }, + { + "epoch": 1.5994475138121547, + "grad_norm": 0.486240029335022, + "learning_rate": 9.563999704578226e-05, + "loss": 1.9379, + "step": 5211 + }, + { + "epoch": 1.59975445058318, + "grad_norm": 0.4628448188304901, + "learning_rate": 9.563796680888403e-05, + "loss": 2.0061, + "step": 5212 + }, + { + "epoch": 1.600061387354205, + "grad_norm": 0.4514544606208801, + "learning_rate": 9.563593612096464e-05, + "loss": 1.9692, + "step": 5213 + }, + { + "epoch": 1.6003683241252302, + "grad_norm": 0.3869803845882416, + "learning_rate": 9.563390498204419e-05, + "loss": 1.8801, + "step": 5214 + }, + { + "epoch": 1.6006752608962553, + "grad_norm": 0.47029098868370056, + "learning_rate": 9.563187339214274e-05, + "loss": 2.0457, + "step": 5215 + }, + { + "epoch": 1.6009821976672804, + "grad_norm": 0.49051982164382935, + "learning_rate": 9.562984135128037e-05, + "loss": 1.9121, + "step": 5216 + }, + { + "epoch": 1.6012891344383058, + "grad_norm": 0.5087830424308777, + "learning_rate": 9.562780885947717e-05, + "loss": 1.9165, + "step": 5217 + }, + { + "epoch": 1.601596071209331, + "grad_norm": 0.4597826600074768, + "learning_rate": 9.562577591675322e-05, + "loss": 1.9037, + "step": 5218 + }, + { + "epoch": 1.601903007980356, + "grad_norm": 0.43610528111457825, + "learning_rate": 9.562374252312858e-05, + "loss": 1.8785, + "step": 5219 + }, + { + "epoch": 1.6022099447513813, + "grad_norm": 0.45797282457351685, + "learning_rate": 9.56217086786234e-05, + "loss": 2.0713, + "step": 5220 + }, + { + "epoch": 1.6025168815224062, + "grad_norm": 0.46097078919410706, + "learning_rate": 9.561967438325777e-05, + "loss": 1.9176, + "step": 5221 + }, + { + "epoch": 1.6028238182934316, + "grad_norm": 0.47368288040161133, + "learning_rate": 9.561763963705176e-05, + "loss": 1.9333, + "step": 5222 + }, + { + "epoch": 1.6031307550644567, + "grad_norm": 0.5048179626464844, + "learning_rate": 9.561560444002551e-05, + "loss": 1.9473, + "step": 
5223 + }, + { + "epoch": 1.6034376918354818, + "grad_norm": 0.42069435119628906, + "learning_rate": 9.56135687921991e-05, + "loss": 1.8507, + "step": 5224 + }, + { + "epoch": 1.6037446286065071, + "grad_norm": 0.37166985869407654, + "learning_rate": 9.561153269359269e-05, + "loss": 1.9404, + "step": 5225 + }, + { + "epoch": 1.6040515653775322, + "grad_norm": 0.42752668261528015, + "learning_rate": 9.560949614422637e-05, + "loss": 1.9791, + "step": 5226 + }, + { + "epoch": 1.6043585021485574, + "grad_norm": 0.4334527552127838, + "learning_rate": 9.560745914412029e-05, + "loss": 1.972, + "step": 5227 + }, + { + "epoch": 1.6046654389195827, + "grad_norm": 0.44162631034851074, + "learning_rate": 9.560542169329454e-05, + "loss": 1.9054, + "step": 5228 + }, + { + "epoch": 1.6049723756906076, + "grad_norm": 0.3891509771347046, + "learning_rate": 9.560338379176929e-05, + "loss": 1.9356, + "step": 5229 + }, + { + "epoch": 1.605279312461633, + "grad_norm": 0.3821989893913269, + "learning_rate": 9.56013454395647e-05, + "loss": 1.9197, + "step": 5230 + }, + { + "epoch": 1.605586249232658, + "grad_norm": 0.4338948428630829, + "learning_rate": 9.559930663670084e-05, + "loss": 2.002, + "step": 5231 + }, + { + "epoch": 1.6058931860036831, + "grad_norm": 0.4784114956855774, + "learning_rate": 9.559726738319794e-05, + "loss": 2.0344, + "step": 5232 + }, + { + "epoch": 1.6062001227747085, + "grad_norm": 0.43362441658973694, + "learning_rate": 9.559522767907612e-05, + "loss": 1.9282, + "step": 5233 + }, + { + "epoch": 1.6065070595457336, + "grad_norm": 0.40863800048828125, + "learning_rate": 9.559318752435553e-05, + "loss": 1.8468, + "step": 5234 + }, + { + "epoch": 1.6068139963167587, + "grad_norm": 0.4509727358818054, + "learning_rate": 9.559114691905633e-05, + "loss": 2.0175, + "step": 5235 + }, + { + "epoch": 1.607120933087784, + "grad_norm": 0.4650020897388458, + "learning_rate": 9.55891058631987e-05, + "loss": 1.9946, + "step": 5236 + }, + { + "epoch": 1.607427869858809, + 
"grad_norm": 0.4315911829471588, + "learning_rate": 9.55870643568028e-05, + "loss": 1.9271, + "step": 5237 + }, + { + "epoch": 1.6077348066298343, + "grad_norm": 0.4109809994697571, + "learning_rate": 9.558502239988882e-05, + "loss": 1.9791, + "step": 5238 + }, + { + "epoch": 1.6080417434008594, + "grad_norm": 0.4323776662349701, + "learning_rate": 9.558297999247692e-05, + "loss": 1.9745, + "step": 5239 + }, + { + "epoch": 1.6083486801718845, + "grad_norm": 0.4255007207393646, + "learning_rate": 9.558093713458729e-05, + "loss": 1.96, + "step": 5240 + }, + { + "epoch": 1.6086556169429098, + "grad_norm": 0.4045571982860565, + "learning_rate": 9.557889382624014e-05, + "loss": 1.9148, + "step": 5241 + }, + { + "epoch": 1.608962553713935, + "grad_norm": 0.39663615822792053, + "learning_rate": 9.557685006745564e-05, + "loss": 1.9313, + "step": 5242 + }, + { + "epoch": 1.60926949048496, + "grad_norm": 0.39130523800849915, + "learning_rate": 9.5574805858254e-05, + "loss": 2.0073, + "step": 5243 + }, + { + "epoch": 1.6095764272559854, + "grad_norm": 0.4071548581123352, + "learning_rate": 9.55727611986554e-05, + "loss": 1.9353, + "step": 5244 + }, + { + "epoch": 1.6098833640270105, + "grad_norm": 0.44347357749938965, + "learning_rate": 9.557071608868007e-05, + "loss": 1.9325, + "step": 5245 + }, + { + "epoch": 1.6101903007980356, + "grad_norm": 0.48900067806243896, + "learning_rate": 9.556867052834821e-05, + "loss": 2.0083, + "step": 5246 + }, + { + "epoch": 1.610497237569061, + "grad_norm": 0.44374197721481323, + "learning_rate": 9.556662451768006e-05, + "loss": 2.0143, + "step": 5247 + }, + { + "epoch": 1.6108041743400858, + "grad_norm": 0.385268896818161, + "learning_rate": 9.556457805669581e-05, + "loss": 1.8981, + "step": 5248 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.5355607867240906, + "learning_rate": 9.556253114541569e-05, + "loss": 2.0413, + "step": 5249 + }, + { + "epoch": 1.6114180478821363, + "grad_norm": 0.5672646164894104, + "learning_rate": 
9.556048378385992e-05, + "loss": 1.9429, + "step": 5250 + }, + { + "epoch": 1.6117249846531614, + "grad_norm": 0.46225669980049133, + "learning_rate": 9.555843597204875e-05, + "loss": 1.9883, + "step": 5251 + }, + { + "epoch": 1.6120319214241867, + "grad_norm": 0.43236228823661804, + "learning_rate": 9.555638771000243e-05, + "loss": 1.9641, + "step": 5252 + }, + { + "epoch": 1.6123388581952118, + "grad_norm": 0.4843178987503052, + "learning_rate": 9.555433899774116e-05, + "loss": 1.9224, + "step": 5253 + }, + { + "epoch": 1.612645794966237, + "grad_norm": 0.4693675637245178, + "learning_rate": 9.555228983528523e-05, + "loss": 1.9774, + "step": 5254 + }, + { + "epoch": 1.6129527317372623, + "grad_norm": 0.3968529999256134, + "learning_rate": 9.555024022265487e-05, + "loss": 1.8939, + "step": 5255 + }, + { + "epoch": 1.6132596685082872, + "grad_norm": 0.42781850695610046, + "learning_rate": 9.554819015987033e-05, + "loss": 1.9561, + "step": 5256 + }, + { + "epoch": 1.6135666052793125, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.554613964695189e-05, + "loss": 1.963, + "step": 5257 + }, + { + "epoch": 1.6138735420503376, + "grad_norm": 0.4292888641357422, + "learning_rate": 9.554408868391979e-05, + "loss": 2.0248, + "step": 5258 + }, + { + "epoch": 1.6141804788213627, + "grad_norm": 0.49197763204574585, + "learning_rate": 9.554203727079433e-05, + "loss": 1.9612, + "step": 5259 + }, + { + "epoch": 1.614487415592388, + "grad_norm": 0.45733556151390076, + "learning_rate": 9.553998540759575e-05, + "loss": 1.9093, + "step": 5260 + }, + { + "epoch": 1.6147943523634132, + "grad_norm": 0.4139576256275177, + "learning_rate": 9.553793309434436e-05, + "loss": 1.875, + "step": 5261 + }, + { + "epoch": 1.6151012891344383, + "grad_norm": 0.42295894026756287, + "learning_rate": 9.55358803310604e-05, + "loss": 1.9427, + "step": 5262 + }, + { + "epoch": 1.6154082259054636, + "grad_norm": 0.370761513710022, + "learning_rate": 9.55338271177642e-05, + "loss": 1.932, + "step": 
5263 + }, + { + "epoch": 1.6157151626764885, + "grad_norm": 0.38912683725357056, + "learning_rate": 9.553177345447602e-05, + "loss": 1.9606, + "step": 5264 + }, + { + "epoch": 1.6160220994475138, + "grad_norm": 0.3901510238647461, + "learning_rate": 9.552971934121618e-05, + "loss": 1.9455, + "step": 5265 + }, + { + "epoch": 1.616329036218539, + "grad_norm": 0.4517458975315094, + "learning_rate": 9.552766477800494e-05, + "loss": 1.9291, + "step": 5266 + }, + { + "epoch": 1.616635972989564, + "grad_norm": 0.47282713651657104, + "learning_rate": 9.552560976486266e-05, + "loss": 1.9326, + "step": 5267 + }, + { + "epoch": 1.6169429097605894, + "grad_norm": 0.4741488993167877, + "learning_rate": 9.552355430180961e-05, + "loss": 1.9782, + "step": 5268 + }, + { + "epoch": 1.6172498465316145, + "grad_norm": 0.42634037137031555, + "learning_rate": 9.552149838886612e-05, + "loss": 1.9871, + "step": 5269 + }, + { + "epoch": 1.6175567833026396, + "grad_norm": 0.39007633924484253, + "learning_rate": 9.55194420260525e-05, + "loss": 1.9397, + "step": 5270 + }, + { + "epoch": 1.617863720073665, + "grad_norm": 0.41707170009613037, + "learning_rate": 9.551738521338906e-05, + "loss": 1.8555, + "step": 5271 + }, + { + "epoch": 1.6181706568446899, + "grad_norm": 0.46702343225479126, + "learning_rate": 9.551532795089616e-05, + "loss": 1.9987, + "step": 5272 + }, + { + "epoch": 1.6184775936157152, + "grad_norm": 0.44585564732551575, + "learning_rate": 9.551327023859411e-05, + "loss": 1.8512, + "step": 5273 + }, + { + "epoch": 1.6187845303867403, + "grad_norm": 0.42617684602737427, + "learning_rate": 9.551121207650324e-05, + "loss": 1.9405, + "step": 5274 + }, + { + "epoch": 1.6190914671577654, + "grad_norm": 0.39399340748786926, + "learning_rate": 9.55091534646439e-05, + "loss": 1.9787, + "step": 5275 + }, + { + "epoch": 1.6193984039287908, + "grad_norm": 0.44386324286460876, + "learning_rate": 9.550709440303642e-05, + "loss": 1.9791, + "step": 5276 + }, + { + "epoch": 1.6197053406998159, 
+ "grad_norm": 0.3871287405490875, + "learning_rate": 9.550503489170117e-05, + "loss": 1.9354, + "step": 5277 + }, + { + "epoch": 1.620012277470841, + "grad_norm": 0.4131690263748169, + "learning_rate": 9.550297493065851e-05, + "loss": 1.9709, + "step": 5278 + }, + { + "epoch": 1.6203192142418663, + "grad_norm": 0.3919534683227539, + "learning_rate": 9.550091451992877e-05, + "loss": 1.8997, + "step": 5279 + }, + { + "epoch": 1.6206261510128912, + "grad_norm": 0.40001583099365234, + "learning_rate": 9.54988536595323e-05, + "loss": 1.9006, + "step": 5280 + }, + { + "epoch": 1.6209330877839165, + "grad_norm": 0.44222408533096313, + "learning_rate": 9.549679234948952e-05, + "loss": 2.0033, + "step": 5281 + }, + { + "epoch": 1.6212400245549416, + "grad_norm": 0.4243159592151642, + "learning_rate": 9.549473058982077e-05, + "loss": 1.9582, + "step": 5282 + }, + { + "epoch": 1.6215469613259668, + "grad_norm": 0.411408007144928, + "learning_rate": 9.549266838054641e-05, + "loss": 1.9244, + "step": 5283 + }, + { + "epoch": 1.621853898096992, + "grad_norm": 0.3833782970905304, + "learning_rate": 9.549060572168686e-05, + "loss": 1.9184, + "step": 5284 + }, + { + "epoch": 1.6221608348680172, + "grad_norm": 0.3925926685333252, + "learning_rate": 9.548854261326246e-05, + "loss": 1.9299, + "step": 5285 + }, + { + "epoch": 1.6224677716390423, + "grad_norm": 0.4472656846046448, + "learning_rate": 9.548647905529363e-05, + "loss": 2.0622, + "step": 5286 + }, + { + "epoch": 1.6227747084100677, + "grad_norm": 0.4842108488082886, + "learning_rate": 9.548441504780074e-05, + "loss": 1.9759, + "step": 5287 + }, + { + "epoch": 1.6230816451810925, + "grad_norm": 0.49826517701148987, + "learning_rate": 9.548235059080422e-05, + "loss": 1.9162, + "step": 5288 + }, + { + "epoch": 1.6233885819521179, + "grad_norm": 0.4672689735889435, + "learning_rate": 9.548028568432445e-05, + "loss": 1.9843, + "step": 5289 + }, + { + "epoch": 1.623695518723143, + "grad_norm": 0.48113325238227844, + 
"learning_rate": 9.547822032838182e-05, + "loss": 1.9426, + "step": 5290 + }, + { + "epoch": 1.624002455494168, + "grad_norm": 0.49646374583244324, + "learning_rate": 9.54761545229968e-05, + "loss": 1.908, + "step": 5291 + }, + { + "epoch": 1.6243093922651934, + "grad_norm": 0.42530664801597595, + "learning_rate": 9.547408826818974e-05, + "loss": 1.9189, + "step": 5292 + }, + { + "epoch": 1.6246163290362186, + "grad_norm": 0.592721164226532, + "learning_rate": 9.54720215639811e-05, + "loss": 1.9656, + "step": 5293 + }, + { + "epoch": 1.6249232658072437, + "grad_norm": 0.5530748963356018, + "learning_rate": 9.546995441039127e-05, + "loss": 1.8815, + "step": 5294 + }, + { + "epoch": 1.625230202578269, + "grad_norm": 0.4551030695438385, + "learning_rate": 9.546788680744073e-05, + "loss": 1.9485, + "step": 5295 + }, + { + "epoch": 1.625537139349294, + "grad_norm": 0.42004409432411194, + "learning_rate": 9.546581875514985e-05, + "loss": 1.9903, + "step": 5296 + }, + { + "epoch": 1.6258440761203192, + "grad_norm": 0.5363507270812988, + "learning_rate": 9.546375025353911e-05, + "loss": 1.93, + "step": 5297 + }, + { + "epoch": 1.6261510128913443, + "grad_norm": 0.457795649766922, + "learning_rate": 9.546168130262896e-05, + "loss": 1.9279, + "step": 5298 + }, + { + "epoch": 1.6264579496623695, + "grad_norm": 0.5061174631118774, + "learning_rate": 9.545961190243982e-05, + "loss": 1.9198, + "step": 5299 + }, + { + "epoch": 1.6267648864333948, + "grad_norm": 0.4366548955440521, + "learning_rate": 9.545754205299214e-05, + "loss": 1.9206, + "step": 5300 + }, + { + "epoch": 1.62707182320442, + "grad_norm": 0.361251562833786, + "learning_rate": 9.54554717543064e-05, + "loss": 1.8638, + "step": 5301 + }, + { + "epoch": 1.627378759975445, + "grad_norm": 0.45089036226272583, + "learning_rate": 9.545340100640303e-05, + "loss": 1.9206, + "step": 5302 + }, + { + "epoch": 1.6276856967464703, + "grad_norm": 0.38224726915359497, + "learning_rate": 9.545132980930251e-05, + "loss": 1.9893, + 
"step": 5303 + }, + { + "epoch": 1.6279926335174952, + "grad_norm": 0.43573206663131714, + "learning_rate": 9.544925816302533e-05, + "loss": 1.9358, + "step": 5304 + }, + { + "epoch": 1.6282995702885206, + "grad_norm": 0.5618723630905151, + "learning_rate": 9.544718606759193e-05, + "loss": 1.9745, + "step": 5305 + }, + { + "epoch": 1.6286065070595457, + "grad_norm": 0.517867386341095, + "learning_rate": 9.54451135230228e-05, + "loss": 2.0238, + "step": 5306 + }, + { + "epoch": 1.6289134438305708, + "grad_norm": 0.4745725393295288, + "learning_rate": 9.544304052933842e-05, + "loss": 1.999, + "step": 5307 + }, + { + "epoch": 1.6292203806015961, + "grad_norm": 0.4454270899295807, + "learning_rate": 9.544096708655928e-05, + "loss": 1.9215, + "step": 5308 + }, + { + "epoch": 1.6295273173726212, + "grad_norm": 0.5604696273803711, + "learning_rate": 9.543889319470586e-05, + "loss": 1.8756, + "step": 5309 + }, + { + "epoch": 1.6298342541436464, + "grad_norm": 0.645453155040741, + "learning_rate": 9.543681885379869e-05, + "loss": 1.9177, + "step": 5310 + }, + { + "epoch": 1.6301411909146717, + "grad_norm": 0.7018140554428101, + "learning_rate": 9.543474406385824e-05, + "loss": 1.9231, + "step": 5311 + }, + { + "epoch": 1.6304481276856968, + "grad_norm": 0.691644549369812, + "learning_rate": 9.543266882490501e-05, + "loss": 1.9055, + "step": 5312 + }, + { + "epoch": 1.630755064456722, + "grad_norm": 0.5484849810600281, + "learning_rate": 9.54305931369595e-05, + "loss": 1.8977, + "step": 5313 + }, + { + "epoch": 1.6310620012277472, + "grad_norm": 0.4035104811191559, + "learning_rate": 9.542851700004227e-05, + "loss": 1.9098, + "step": 5314 + }, + { + "epoch": 1.6313689379987721, + "grad_norm": 0.4578574299812317, + "learning_rate": 9.542644041417379e-05, + "loss": 1.9946, + "step": 5315 + }, + { + "epoch": 1.6316758747697975, + "grad_norm": 0.646272599697113, + "learning_rate": 9.542436337937462e-05, + "loss": 1.9489, + "step": 5316 + }, + { + "epoch": 1.6319828115408226, + 
"grad_norm": 0.5796291828155518, + "learning_rate": 9.542228589566524e-05, + "loss": 1.8396, + "step": 5317 + }, + { + "epoch": 1.6322897483118477, + "grad_norm": 0.42690619826316833, + "learning_rate": 9.542020796306623e-05, + "loss": 1.9691, + "step": 5318 + }, + { + "epoch": 1.632596685082873, + "grad_norm": 0.3943910002708435, + "learning_rate": 9.54181295815981e-05, + "loss": 1.8711, + "step": 5319 + }, + { + "epoch": 1.6329036218538981, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.541605075128137e-05, + "loss": 1.8659, + "step": 5320 + }, + { + "epoch": 1.6332105586249233, + "grad_norm": 0.5485807061195374, + "learning_rate": 9.541397147213664e-05, + "loss": 2.031, + "step": 5321 + }, + { + "epoch": 1.6335174953959486, + "grad_norm": 0.40169721841812134, + "learning_rate": 9.541189174418441e-05, + "loss": 1.9346, + "step": 5322 + }, + { + "epoch": 1.6338244321669735, + "grad_norm": 0.3407663106918335, + "learning_rate": 9.540981156744524e-05, + "loss": 1.9238, + "step": 5323 + }, + { + "epoch": 1.6341313689379988, + "grad_norm": 0.4062422513961792, + "learning_rate": 9.540773094193971e-05, + "loss": 1.914, + "step": 5324 + }, + { + "epoch": 1.634438305709024, + "grad_norm": 0.47654685378074646, + "learning_rate": 9.540564986768836e-05, + "loss": 1.8957, + "step": 5325 + }, + { + "epoch": 1.634745242480049, + "grad_norm": 0.4369850754737854, + "learning_rate": 9.540356834471178e-05, + "loss": 1.968, + "step": 5326 + }, + { + "epoch": 1.6350521792510744, + "grad_norm": 0.38868457078933716, + "learning_rate": 9.540148637303052e-05, + "loss": 1.931, + "step": 5327 + }, + { + "epoch": 1.6353591160220995, + "grad_norm": 0.4998358190059662, + "learning_rate": 9.539940395266515e-05, + "loss": 1.9316, + "step": 5328 + }, + { + "epoch": 1.6356660527931246, + "grad_norm": 0.5497372150421143, + "learning_rate": 9.539732108363628e-05, + "loss": 1.9233, + "step": 5329 + }, + { + "epoch": 1.63597298956415, + "grad_norm": 0.5609846115112305, + "learning_rate": 
9.539523776596445e-05, + "loss": 1.898, + "step": 5330 + }, + { + "epoch": 1.6362799263351748, + "grad_norm": 0.44984617829322815, + "learning_rate": 9.539315399967029e-05, + "loss": 2.0103, + "step": 5331 + }, + { + "epoch": 1.6365868631062002, + "grad_norm": 0.41710013151168823, + "learning_rate": 9.539106978477436e-05, + "loss": 1.9008, + "step": 5332 + }, + { + "epoch": 1.6368937998772253, + "grad_norm": 0.44854703545570374, + "learning_rate": 9.53889851212973e-05, + "loss": 1.9591, + "step": 5333 + }, + { + "epoch": 1.6372007366482504, + "grad_norm": 0.4259171485900879, + "learning_rate": 9.538690000925968e-05, + "loss": 1.915, + "step": 5334 + }, + { + "epoch": 1.6375076734192757, + "grad_norm": 0.4444480240345001, + "learning_rate": 9.53848144486821e-05, + "loss": 1.9562, + "step": 5335 + }, + { + "epoch": 1.6378146101903008, + "grad_norm": 0.40078794956207275, + "learning_rate": 9.538272843958518e-05, + "loss": 1.8802, + "step": 5336 + }, + { + "epoch": 1.638121546961326, + "grad_norm": 0.5346726179122925, + "learning_rate": 9.538064198198955e-05, + "loss": 2.0214, + "step": 5337 + }, + { + "epoch": 1.6384284837323513, + "grad_norm": 0.47136780619621277, + "learning_rate": 9.537855507591581e-05, + "loss": 1.9593, + "step": 5338 + }, + { + "epoch": 1.6387354205033762, + "grad_norm": 0.3839198052883148, + "learning_rate": 9.53764677213846e-05, + "loss": 1.9507, + "step": 5339 + }, + { + "epoch": 1.6390423572744015, + "grad_norm": 0.4565586447715759, + "learning_rate": 9.537437991841654e-05, + "loss": 1.9292, + "step": 5340 + }, + { + "epoch": 1.6393492940454266, + "grad_norm": 0.5139011740684509, + "learning_rate": 9.537229166703225e-05, + "loss": 1.9388, + "step": 5341 + }, + { + "epoch": 1.6396562308164517, + "grad_norm": 0.5421571135520935, + "learning_rate": 9.537020296725238e-05, + "loss": 1.9031, + "step": 5342 + }, + { + "epoch": 1.639963167587477, + "grad_norm": 0.4085434675216675, + "learning_rate": 9.536811381909758e-05, + "loss": 1.9167, + "step": 
5343 + }, + { + "epoch": 1.6402701043585022, + "grad_norm": 0.3567824065685272, + "learning_rate": 9.536602422258849e-05, + "loss": 1.89, + "step": 5344 + }, + { + "epoch": 1.6405770411295273, + "grad_norm": 0.5427443385124207, + "learning_rate": 9.536393417774575e-05, + "loss": 2.0036, + "step": 5345 + }, + { + "epoch": 1.6408839779005526, + "grad_norm": 0.5275370478630066, + "learning_rate": 9.536184368459003e-05, + "loss": 1.94, + "step": 5346 + }, + { + "epoch": 1.6411909146715775, + "grad_norm": 0.3916989862918854, + "learning_rate": 9.535975274314198e-05, + "loss": 1.8769, + "step": 5347 + }, + { + "epoch": 1.6414978514426029, + "grad_norm": 0.4200802743434906, + "learning_rate": 9.535766135342228e-05, + "loss": 1.9384, + "step": 5348 + }, + { + "epoch": 1.641804788213628, + "grad_norm": 0.5287195444107056, + "learning_rate": 9.535556951545157e-05, + "loss": 1.9159, + "step": 5349 + }, + { + "epoch": 1.642111724984653, + "grad_norm": 0.5934851765632629, + "learning_rate": 9.535347722925055e-05, + "loss": 1.9927, + "step": 5350 + }, + { + "epoch": 1.6424186617556784, + "grad_norm": 0.49941807985305786, + "learning_rate": 9.535138449483987e-05, + "loss": 1.9124, + "step": 5351 + }, + { + "epoch": 1.6427255985267035, + "grad_norm": 0.41778016090393066, + "learning_rate": 9.534929131224024e-05, + "loss": 1.9468, + "step": 5352 + }, + { + "epoch": 1.6430325352977286, + "grad_norm": 0.5172474384307861, + "learning_rate": 9.534719768147233e-05, + "loss": 1.928, + "step": 5353 + }, + { + "epoch": 1.643339472068754, + "grad_norm": 0.6690294146537781, + "learning_rate": 9.534510360255683e-05, + "loss": 1.9697, + "step": 5354 + }, + { + "epoch": 1.6436464088397789, + "grad_norm": 0.617683470249176, + "learning_rate": 9.534300907551444e-05, + "loss": 1.9529, + "step": 5355 + }, + { + "epoch": 1.6439533456108042, + "grad_norm": 0.40067893266677856, + "learning_rate": 9.534091410036587e-05, + "loss": 1.915, + "step": 5356 + }, + { + "epoch": 1.6442602823818293, + 
"grad_norm": 0.46418440341949463, + "learning_rate": 9.53388186771318e-05, + "loss": 1.9056, + "step": 5357 + }, + { + "epoch": 1.6445672191528544, + "grad_norm": 0.6600098013877869, + "learning_rate": 9.533672280583295e-05, + "loss": 1.9641, + "step": 5358 + }, + { + "epoch": 1.6448741559238798, + "grad_norm": 0.6510347127914429, + "learning_rate": 9.533462648649004e-05, + "loss": 1.916, + "step": 5359 + }, + { + "epoch": 1.6451810926949049, + "grad_norm": 0.5004377365112305, + "learning_rate": 9.533252971912376e-05, + "loss": 1.9584, + "step": 5360 + }, + { + "epoch": 1.64548802946593, + "grad_norm": 0.45522230863571167, + "learning_rate": 9.533043250375488e-05, + "loss": 1.973, + "step": 5361 + }, + { + "epoch": 1.6457949662369553, + "grad_norm": 0.5304180383682251, + "learning_rate": 9.532833484040408e-05, + "loss": 1.8542, + "step": 5362 + }, + { + "epoch": 1.6461019030079802, + "grad_norm": 0.5320406556129456, + "learning_rate": 9.53262367290921e-05, + "loss": 1.9405, + "step": 5363 + }, + { + "epoch": 1.6464088397790055, + "grad_norm": 0.4377361536026001, + "learning_rate": 9.532413816983969e-05, + "loss": 1.9126, + "step": 5364 + }, + { + "epoch": 1.6467157765500307, + "grad_norm": 0.4632298946380615, + "learning_rate": 9.532203916266758e-05, + "loss": 1.9868, + "step": 5365 + }, + { + "epoch": 1.6470227133210558, + "grad_norm": 0.4861730635166168, + "learning_rate": 9.531993970759651e-05, + "loss": 1.895, + "step": 5366 + }, + { + "epoch": 1.647329650092081, + "grad_norm": 0.45012348890304565, + "learning_rate": 9.531783980464726e-05, + "loss": 1.9583, + "step": 5367 + }, + { + "epoch": 1.6476365868631062, + "grad_norm": 0.43772751092910767, + "learning_rate": 9.531573945384053e-05, + "loss": 1.9341, + "step": 5368 + }, + { + "epoch": 1.6479435236341313, + "grad_norm": 0.39253392815589905, + "learning_rate": 9.531363865519711e-05, + "loss": 1.8629, + "step": 5369 + }, + { + "epoch": 1.6482504604051567, + "grad_norm": 0.44614076614379883, + "learning_rate": 
9.531153740873775e-05, + "loss": 1.9508, + "step": 5370 + }, + { + "epoch": 1.6485573971761815, + "grad_norm": 0.4442307949066162, + "learning_rate": 9.530943571448322e-05, + "loss": 1.9624, + "step": 5371 + }, + { + "epoch": 1.6488643339472069, + "grad_norm": 0.44962942600250244, + "learning_rate": 9.53073335724543e-05, + "loss": 1.9315, + "step": 5372 + }, + { + "epoch": 1.649171270718232, + "grad_norm": 0.4903222620487213, + "learning_rate": 9.530523098267173e-05, + "loss": 1.8776, + "step": 5373 + }, + { + "epoch": 1.649478207489257, + "grad_norm": 0.4733131229877472, + "learning_rate": 9.530312794515633e-05, + "loss": 1.958, + "step": 5374 + }, + { + "epoch": 1.6497851442602824, + "grad_norm": 0.4134232997894287, + "learning_rate": 9.530102445992886e-05, + "loss": 1.9184, + "step": 5375 + }, + { + "epoch": 1.6500920810313076, + "grad_norm": 0.43521758913993835, + "learning_rate": 9.529892052701012e-05, + "loss": 1.9383, + "step": 5376 + }, + { + "epoch": 1.6503990178023327, + "grad_norm": 0.5098583102226257, + "learning_rate": 9.52968161464209e-05, + "loss": 1.9596, + "step": 5377 + }, + { + "epoch": 1.650705954573358, + "grad_norm": 0.48421037197113037, + "learning_rate": 9.5294711318182e-05, + "loss": 1.9258, + "step": 5378 + }, + { + "epoch": 1.651012891344383, + "grad_norm": 0.4039461314678192, + "learning_rate": 9.52926060423142e-05, + "loss": 1.9975, + "step": 5379 + }, + { + "epoch": 1.6513198281154082, + "grad_norm": 0.491858571767807, + "learning_rate": 9.529050031883832e-05, + "loss": 1.9564, + "step": 5380 + }, + { + "epoch": 1.6516267648864333, + "grad_norm": 0.45920100808143616, + "learning_rate": 9.528839414777517e-05, + "loss": 1.8513, + "step": 5381 + }, + { + "epoch": 1.6519337016574585, + "grad_norm": 0.4812139868736267, + "learning_rate": 9.528628752914558e-05, + "loss": 1.9638, + "step": 5382 + }, + { + "epoch": 1.6522406384284838, + "grad_norm": 0.38021141290664673, + "learning_rate": 9.528418046297034e-05, + "loss": 1.848, + "step": 5383 
+ }, + { + "epoch": 1.652547575199509, + "grad_norm": 0.438681960105896, + "learning_rate": 9.52820729492703e-05, + "loss": 1.9931, + "step": 5384 + }, + { + "epoch": 1.652854511970534, + "grad_norm": 0.4387293756008148, + "learning_rate": 9.527996498806627e-05, + "loss": 1.9969, + "step": 5385 + }, + { + "epoch": 1.6531614487415593, + "grad_norm": 0.43315380811691284, + "learning_rate": 9.527785657937907e-05, + "loss": 1.9607, + "step": 5386 + }, + { + "epoch": 1.6534683855125845, + "grad_norm": 0.4800446927547455, + "learning_rate": 9.527574772322956e-05, + "loss": 1.9645, + "step": 5387 + }, + { + "epoch": 1.6537753222836096, + "grad_norm": 0.45495909452438354, + "learning_rate": 9.527363841963857e-05, + "loss": 1.8748, + "step": 5388 + }, + { + "epoch": 1.654082259054635, + "grad_norm": 0.4052638113498688, + "learning_rate": 9.527152866862696e-05, + "loss": 1.9491, + "step": 5389 + }, + { + "epoch": 1.6543891958256598, + "grad_norm": 0.44545745849609375, + "learning_rate": 9.526941847021558e-05, + "loss": 1.8938, + "step": 5390 + }, + { + "epoch": 1.6546961325966851, + "grad_norm": 0.5576399564743042, + "learning_rate": 9.526730782442526e-05, + "loss": 1.9656, + "step": 5391 + }, + { + "epoch": 1.6550030693677102, + "grad_norm": 0.5678401589393616, + "learning_rate": 9.526519673127686e-05, + "loss": 1.9914, + "step": 5392 + }, + { + "epoch": 1.6553100061387354, + "grad_norm": 0.4391598701477051, + "learning_rate": 9.526308519079127e-05, + "loss": 1.9452, + "step": 5393 + }, + { + "epoch": 1.6556169429097607, + "grad_norm": 0.4375559091567993, + "learning_rate": 9.526097320298934e-05, + "loss": 1.9335, + "step": 5394 + }, + { + "epoch": 1.6559238796807858, + "grad_norm": 0.4976498782634735, + "learning_rate": 9.525886076789194e-05, + "loss": 2.0065, + "step": 5395 + }, + { + "epoch": 1.656230816451811, + "grad_norm": 0.5966445207595825, + "learning_rate": 9.525674788551996e-05, + "loss": 1.9924, + "step": 5396 + }, + { + "epoch": 1.6565377532228363, + 
"grad_norm": 0.5119359493255615, + "learning_rate": 9.525463455589427e-05, + "loss": 2.0061, + "step": 5397 + }, + { + "epoch": 1.6568446899938611, + "grad_norm": 0.46835067868232727, + "learning_rate": 9.525252077903574e-05, + "loss": 1.9441, + "step": 5398 + }, + { + "epoch": 1.6571516267648865, + "grad_norm": 0.5319140553474426, + "learning_rate": 9.52504065549653e-05, + "loss": 1.9704, + "step": 5399 + }, + { + "epoch": 1.6574585635359116, + "grad_norm": 0.5132572054862976, + "learning_rate": 9.52482918837038e-05, + "loss": 1.9037, + "step": 5400 + }, + { + "epoch": 1.6577655003069367, + "grad_norm": 0.41260987520217896, + "learning_rate": 9.524617676527218e-05, + "loss": 1.9103, + "step": 5401 + }, + { + "epoch": 1.658072437077962, + "grad_norm": 0.41780540347099304, + "learning_rate": 9.524406119969131e-05, + "loss": 1.9419, + "step": 5402 + }, + { + "epoch": 1.6583793738489871, + "grad_norm": 0.42015889286994934, + "learning_rate": 9.524194518698211e-05, + "loss": 1.9143, + "step": 5403 + }, + { + "epoch": 1.6586863106200123, + "grad_norm": 0.4449796676635742, + "learning_rate": 9.523982872716548e-05, + "loss": 1.9794, + "step": 5404 + }, + { + "epoch": 1.6589932473910376, + "grad_norm": 0.4392293393611908, + "learning_rate": 9.523771182026237e-05, + "loss": 1.8687, + "step": 5405 + }, + { + "epoch": 1.6593001841620625, + "grad_norm": 0.49595963954925537, + "learning_rate": 9.523559446629366e-05, + "loss": 2.013, + "step": 5406 + }, + { + "epoch": 1.6596071209330878, + "grad_norm": 0.4456728994846344, + "learning_rate": 9.523347666528029e-05, + "loss": 1.9269, + "step": 5407 + }, + { + "epoch": 1.659914057704113, + "grad_norm": 0.3835284411907196, + "learning_rate": 9.52313584172432e-05, + "loss": 1.9042, + "step": 5408 + }, + { + "epoch": 1.660220994475138, + "grad_norm": 0.39068692922592163, + "learning_rate": 9.522923972220332e-05, + "loss": 1.999, + "step": 5409 + }, + { + "epoch": 1.6605279312461634, + "grad_norm": 0.4522729814052582, + "learning_rate": 
9.522712058018157e-05, + "loss": 1.9546, + "step": 5410 + }, + { + "epoch": 1.6608348680171885, + "grad_norm": 0.3834155201911926, + "learning_rate": 9.522500099119891e-05, + "loss": 1.9184, + "step": 5411 + }, + { + "epoch": 1.6611418047882136, + "grad_norm": 0.36149126291275024, + "learning_rate": 9.522288095527629e-05, + "loss": 1.8973, + "step": 5412 + }, + { + "epoch": 1.661448741559239, + "grad_norm": 0.3502398729324341, + "learning_rate": 9.522076047243464e-05, + "loss": 1.8775, + "step": 5413 + }, + { + "epoch": 1.6617556783302638, + "grad_norm": 0.36552321910858154, + "learning_rate": 9.521863954269495e-05, + "loss": 1.901, + "step": 5414 + }, + { + "epoch": 1.6620626151012892, + "grad_norm": 0.37815216183662415, + "learning_rate": 9.521651816607814e-05, + "loss": 1.9143, + "step": 5415 + }, + { + "epoch": 1.6623695518723143, + "grad_norm": 0.4048994481563568, + "learning_rate": 9.52143963426052e-05, + "loss": 1.9892, + "step": 5416 + }, + { + "epoch": 1.6626764886433394, + "grad_norm": 0.35271233320236206, + "learning_rate": 9.52122740722971e-05, + "loss": 1.9209, + "step": 5417 + }, + { + "epoch": 1.6629834254143647, + "grad_norm": 0.405009925365448, + "learning_rate": 9.521015135517482e-05, + "loss": 1.9583, + "step": 5418 + }, + { + "epoch": 1.6632903621853898, + "grad_norm": 0.4041683077812195, + "learning_rate": 9.520802819125932e-05, + "loss": 1.8937, + "step": 5419 + }, + { + "epoch": 1.663597298956415, + "grad_norm": 0.41353970766067505, + "learning_rate": 9.520590458057157e-05, + "loss": 1.949, + "step": 5420 + }, + { + "epoch": 1.6639042357274403, + "grad_norm": 0.3704569637775421, + "learning_rate": 9.520378052313258e-05, + "loss": 1.9287, + "step": 5421 + }, + { + "epoch": 1.6642111724984652, + "grad_norm": 0.4043133854866028, + "learning_rate": 9.520165601896334e-05, + "loss": 1.9116, + "step": 5422 + }, + { + "epoch": 1.6645181092694905, + "grad_norm": 0.3976849317550659, + "learning_rate": 9.519953106808485e-05, + "loss": 1.9578, + "step": 
5423 + }, + { + "epoch": 1.6648250460405156, + "grad_norm": 0.41225695610046387, + "learning_rate": 9.51974056705181e-05, + "loss": 1.8861, + "step": 5424 + }, + { + "epoch": 1.6651319828115407, + "grad_norm": 0.40096259117126465, + "learning_rate": 9.519527982628409e-05, + "loss": 1.926, + "step": 5425 + }, + { + "epoch": 1.665438919582566, + "grad_norm": 0.4373134970664978, + "learning_rate": 9.519315353540384e-05, + "loss": 1.8761, + "step": 5426 + }, + { + "epoch": 1.6657458563535912, + "grad_norm": 0.3798682689666748, + "learning_rate": 9.519102679789835e-05, + "loss": 1.8655, + "step": 5427 + }, + { + "epoch": 1.6660527931246163, + "grad_norm": 0.3889687955379486, + "learning_rate": 9.518889961378865e-05, + "loss": 1.8928, + "step": 5428 + }, + { + "epoch": 1.6663597298956416, + "grad_norm": 0.39567697048187256, + "learning_rate": 9.518677198309575e-05, + "loss": 1.9193, + "step": 5429 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.37571004033088684, + "learning_rate": 9.51846439058407e-05, + "loss": 1.9653, + "step": 5430 + }, + { + "epoch": 1.6669736034376919, + "grad_norm": 0.36011725664138794, + "learning_rate": 9.518251538204451e-05, + "loss": 1.9202, + "step": 5431 + }, + { + "epoch": 1.667280540208717, + "grad_norm": 0.42314839363098145, + "learning_rate": 9.518038641172822e-05, + "loss": 1.9883, + "step": 5432 + }, + { + "epoch": 1.667587476979742, + "grad_norm": 0.3986029326915741, + "learning_rate": 9.517825699491287e-05, + "loss": 1.9838, + "step": 5433 + }, + { + "epoch": 1.6678944137507674, + "grad_norm": 0.388236939907074, + "learning_rate": 9.517612713161949e-05, + "loss": 1.901, + "step": 5434 + }, + { + "epoch": 1.6682013505217925, + "grad_norm": 0.3849826455116272, + "learning_rate": 9.517399682186917e-05, + "loss": 1.9621, + "step": 5435 + }, + { + "epoch": 1.6685082872928176, + "grad_norm": 0.40182530879974365, + "learning_rate": 9.517186606568292e-05, + "loss": 1.9081, + "step": 5436 + }, + { + "epoch": 1.668815224063843, + 
"grad_norm": 0.4260261654853821, + "learning_rate": 9.516973486308181e-05, + "loss": 1.9701, + "step": 5437 + }, + { + "epoch": 1.6691221608348679, + "grad_norm": 0.4035099744796753, + "learning_rate": 9.516760321408692e-05, + "loss": 1.9269, + "step": 5438 + }, + { + "epoch": 1.6694290976058932, + "grad_norm": 0.42106589674949646, + "learning_rate": 9.51654711187193e-05, + "loss": 1.9026, + "step": 5439 + }, + { + "epoch": 1.6697360343769183, + "grad_norm": 0.4629819989204407, + "learning_rate": 9.516333857700001e-05, + "loss": 1.9128, + "step": 5440 + }, + { + "epoch": 1.6700429711479434, + "grad_norm": 0.3824837803840637, + "learning_rate": 9.516120558895014e-05, + "loss": 1.8861, + "step": 5441 + }, + { + "epoch": 1.6703499079189688, + "grad_norm": 0.37263223528862, + "learning_rate": 9.515907215459076e-05, + "loss": 1.9098, + "step": 5442 + }, + { + "epoch": 1.6706568446899939, + "grad_norm": 0.3980494439601898, + "learning_rate": 9.515693827394299e-05, + "loss": 1.9764, + "step": 5443 + }, + { + "epoch": 1.670963781461019, + "grad_norm": 0.5064507722854614, + "learning_rate": 9.515480394702786e-05, + "loss": 1.9771, + "step": 5444 + }, + { + "epoch": 1.6712707182320443, + "grad_norm": 0.5012909770011902, + "learning_rate": 9.515266917386649e-05, + "loss": 1.9162, + "step": 5445 + }, + { + "epoch": 1.6715776550030692, + "grad_norm": 0.5422279238700867, + "learning_rate": 9.515053395447999e-05, + "loss": 1.8913, + "step": 5446 + }, + { + "epoch": 1.6718845917740945, + "grad_norm": 0.4677022397518158, + "learning_rate": 9.514839828888946e-05, + "loss": 1.9156, + "step": 5447 + }, + { + "epoch": 1.6721915285451197, + "grad_norm": 0.39561185240745544, + "learning_rate": 9.514626217711597e-05, + "loss": 1.9203, + "step": 5448 + }, + { + "epoch": 1.6724984653161448, + "grad_norm": 0.4435743987560272, + "learning_rate": 9.514412561918068e-05, + "loss": 1.953, + "step": 5449 + }, + { + "epoch": 1.67280540208717, + "grad_norm": 0.5383535027503967, + "learning_rate": 
9.514198861510467e-05, + "loss": 1.9662, + "step": 5450 + }, + { + "epoch": 1.6731123388581952, + "grad_norm": 0.4787214696407318, + "learning_rate": 9.513985116490906e-05, + "loss": 1.9278, + "step": 5451 + }, + { + "epoch": 1.6734192756292203, + "grad_norm": 0.40962034463882446, + "learning_rate": 9.513771326861501e-05, + "loss": 1.9267, + "step": 5452 + }, + { + "epoch": 1.6737262124002457, + "grad_norm": 0.43605929613113403, + "learning_rate": 9.513557492624359e-05, + "loss": 1.9537, + "step": 5453 + }, + { + "epoch": 1.6740331491712708, + "grad_norm": 0.46278494596481323, + "learning_rate": 9.513343613781599e-05, + "loss": 1.9383, + "step": 5454 + }, + { + "epoch": 1.6743400859422959, + "grad_norm": 0.4052918255329132, + "learning_rate": 9.513129690335331e-05, + "loss": 1.9289, + "step": 5455 + }, + { + "epoch": 1.6746470227133212, + "grad_norm": 0.37791141867637634, + "learning_rate": 9.51291572228767e-05, + "loss": 1.9185, + "step": 5456 + }, + { + "epoch": 1.674953959484346, + "grad_norm": 0.41135111451148987, + "learning_rate": 9.512701709640731e-05, + "loss": 2.0003, + "step": 5457 + }, + { + "epoch": 1.6752608962553714, + "grad_norm": 0.41175320744514465, + "learning_rate": 9.512487652396629e-05, + "loss": 1.9307, + "step": 5458 + }, + { + "epoch": 1.6755678330263966, + "grad_norm": 0.40061330795288086, + "learning_rate": 9.512273550557478e-05, + "loss": 1.9361, + "step": 5459 + }, + { + "epoch": 1.6758747697974217, + "grad_norm": 0.3938329219818115, + "learning_rate": 9.512059404125397e-05, + "loss": 1.9419, + "step": 5460 + }, + { + "epoch": 1.676181706568447, + "grad_norm": 0.42825883626937866, + "learning_rate": 9.511845213102498e-05, + "loss": 1.9201, + "step": 5461 + }, + { + "epoch": 1.6764886433394721, + "grad_norm": 0.3795798122882843, + "learning_rate": 9.511630977490901e-05, + "loss": 1.9872, + "step": 5462 + }, + { + "epoch": 1.6767955801104972, + "grad_norm": 0.3639005422592163, + "learning_rate": 9.511416697292724e-05, + "loss": 1.9066, + 
"step": 5463 + }, + { + "epoch": 1.6771025168815226, + "grad_norm": 0.4200088381767273, + "learning_rate": 9.511202372510082e-05, + "loss": 1.9928, + "step": 5464 + }, + { + "epoch": 1.6774094536525475, + "grad_norm": 0.436638742685318, + "learning_rate": 9.510988003145092e-05, + "loss": 1.8527, + "step": 5465 + }, + { + "epoch": 1.6777163904235728, + "grad_norm": 0.40901345014572144, + "learning_rate": 9.510773589199877e-05, + "loss": 1.9915, + "step": 5466 + }, + { + "epoch": 1.678023327194598, + "grad_norm": 0.39717167615890503, + "learning_rate": 9.510559130676553e-05, + "loss": 1.9682, + "step": 5467 + }, + { + "epoch": 1.678330263965623, + "grad_norm": 0.37574490904808044, + "learning_rate": 9.510344627577239e-05, + "loss": 1.9641, + "step": 5468 + }, + { + "epoch": 1.6786372007366483, + "grad_norm": 0.36686137318611145, + "learning_rate": 9.510130079904057e-05, + "loss": 1.9082, + "step": 5469 + }, + { + "epoch": 1.6789441375076735, + "grad_norm": 0.37321972846984863, + "learning_rate": 9.509915487659125e-05, + "loss": 1.8911, + "step": 5470 + }, + { + "epoch": 1.6792510742786986, + "grad_norm": 0.3911389112472534, + "learning_rate": 9.509700850844566e-05, + "loss": 1.9721, + "step": 5471 + }, + { + "epoch": 1.679558011049724, + "grad_norm": 0.41182973980903625, + "learning_rate": 9.509486169462499e-05, + "loss": 1.9188, + "step": 5472 + }, + { + "epoch": 1.6798649478207488, + "grad_norm": 0.4141900837421417, + "learning_rate": 9.509271443515047e-05, + "loss": 1.875, + "step": 5473 + }, + { + "epoch": 1.6801718845917741, + "grad_norm": 0.4259745478630066, + "learning_rate": 9.509056673004333e-05, + "loss": 1.9258, + "step": 5474 + }, + { + "epoch": 1.6804788213627992, + "grad_norm": 0.47081178426742554, + "learning_rate": 9.508841857932476e-05, + "loss": 2.0494, + "step": 5475 + }, + { + "epoch": 1.6807857581338244, + "grad_norm": 0.5346465110778809, + "learning_rate": 9.508626998301602e-05, + "loss": 1.9371, + "step": 5476 + }, + { + "epoch": 
1.6810926949048497, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.508412094113832e-05, + "loss": 1.8727, + "step": 5477 + }, + { + "epoch": 1.6813996316758748, + "grad_norm": 0.5262138843536377, + "learning_rate": 9.508197145371294e-05, + "loss": 1.9098, + "step": 5478 + }, + { + "epoch": 1.6817065684469, + "grad_norm": 0.47581788897514343, + "learning_rate": 9.507982152076108e-05, + "loss": 1.9174, + "step": 5479 + }, + { + "epoch": 1.6820135052179253, + "grad_norm": 0.41795024275779724, + "learning_rate": 9.507767114230399e-05, + "loss": 1.9333, + "step": 5480 + }, + { + "epoch": 1.6823204419889501, + "grad_norm": 0.5213392376899719, + "learning_rate": 9.507552031836295e-05, + "loss": 1.9731, + "step": 5481 + }, + { + "epoch": 1.6826273787599755, + "grad_norm": 0.624969482421875, + "learning_rate": 9.507336904895919e-05, + "loss": 1.965, + "step": 5482 + }, + { + "epoch": 1.6829343155310006, + "grad_norm": 0.5719303488731384, + "learning_rate": 9.507121733411397e-05, + "loss": 1.9325, + "step": 5483 + }, + { + "epoch": 1.6832412523020257, + "grad_norm": 0.45429563522338867, + "learning_rate": 9.506906517384858e-05, + "loss": 1.8846, + "step": 5484 + }, + { + "epoch": 1.683548189073051, + "grad_norm": 0.4679521322250366, + "learning_rate": 9.506691256818427e-05, + "loss": 1.9609, + "step": 5485 + }, + { + "epoch": 1.6838551258440762, + "grad_norm": 0.64385986328125, + "learning_rate": 9.50647595171423e-05, + "loss": 1.9138, + "step": 5486 + }, + { + "epoch": 1.6841620626151013, + "grad_norm": 0.6783073544502258, + "learning_rate": 9.506260602074398e-05, + "loss": 2.0252, + "step": 5487 + }, + { + "epoch": 1.6844689993861266, + "grad_norm": 0.6151844263076782, + "learning_rate": 9.506045207901058e-05, + "loss": 2.0077, + "step": 5488 + }, + { + "epoch": 1.6847759361571515, + "grad_norm": 0.43046683073043823, + "learning_rate": 9.505829769196338e-05, + "loss": 1.8945, + "step": 5489 + }, + { + "epoch": 1.6850828729281768, + "grad_norm": 
0.44831258058547974, + "learning_rate": 9.505614285962366e-05, + "loss": 1.9775, + "step": 5490 + }, + { + "epoch": 1.685389809699202, + "grad_norm": 0.4917668402194977, + "learning_rate": 9.505398758201272e-05, + "loss": 1.9115, + "step": 5491 + }, + { + "epoch": 1.685696746470227, + "grad_norm": 0.4595036506652832, + "learning_rate": 9.505183185915187e-05, + "loss": 1.9103, + "step": 5492 + }, + { + "epoch": 1.6860036832412524, + "grad_norm": 0.43335607647895813, + "learning_rate": 9.504967569106243e-05, + "loss": 1.9147, + "step": 5493 + }, + { + "epoch": 1.6863106200122775, + "grad_norm": 0.42885956168174744, + "learning_rate": 9.504751907776567e-05, + "loss": 2.0085, + "step": 5494 + }, + { + "epoch": 1.6866175567833026, + "grad_norm": 0.4121492803096771, + "learning_rate": 9.504536201928295e-05, + "loss": 1.9212, + "step": 5495 + }, + { + "epoch": 1.686924493554328, + "grad_norm": 0.4387015700340271, + "learning_rate": 9.504320451563555e-05, + "loss": 1.9202, + "step": 5496 + }, + { + "epoch": 1.6872314303253528, + "grad_norm": 0.4333394467830658, + "learning_rate": 9.504104656684481e-05, + "loss": 1.9165, + "step": 5497 + }, + { + "epoch": 1.6875383670963782, + "grad_norm": 0.37835901975631714, + "learning_rate": 9.503888817293203e-05, + "loss": 1.9087, + "step": 5498 + }, + { + "epoch": 1.6878453038674033, + "grad_norm": 0.42156684398651123, + "learning_rate": 9.503672933391857e-05, + "loss": 1.8909, + "step": 5499 + }, + { + "epoch": 1.6881522406384284, + "grad_norm": 0.4315885603427887, + "learning_rate": 9.503457004982574e-05, + "loss": 1.8892, + "step": 5500 + }, + { + "epoch": 1.6884591774094537, + "grad_norm": 0.4349892735481262, + "learning_rate": 9.50324103206749e-05, + "loss": 1.9532, + "step": 5501 + }, + { + "epoch": 1.6887661141804788, + "grad_norm": 0.45786523818969727, + "learning_rate": 9.503025014648739e-05, + "loss": 1.9285, + "step": 5502 + }, + { + "epoch": 1.689073050951504, + "grad_norm": 0.36640092730522156, + "learning_rate": 
9.502808952728456e-05, + "loss": 1.9167, + "step": 5503 + }, + { + "epoch": 1.6893799877225293, + "grad_norm": 0.46942031383514404, + "learning_rate": 9.502592846308775e-05, + "loss": 2.08, + "step": 5504 + }, + { + "epoch": 1.6896869244935542, + "grad_norm": 0.44714173674583435, + "learning_rate": 9.502376695391833e-05, + "loss": 1.9618, + "step": 5505 + }, + { + "epoch": 1.6899938612645795, + "grad_norm": 0.4216810464859009, + "learning_rate": 9.502160499979764e-05, + "loss": 1.888, + "step": 5506 + }, + { + "epoch": 1.6903007980356046, + "grad_norm": 0.40471377968788147, + "learning_rate": 9.501944260074709e-05, + "loss": 1.9048, + "step": 5507 + }, + { + "epoch": 1.6906077348066297, + "grad_norm": 0.399309366941452, + "learning_rate": 9.501727975678801e-05, + "loss": 1.8796, + "step": 5508 + }, + { + "epoch": 1.690914671577655, + "grad_norm": 0.36903873085975647, + "learning_rate": 9.501511646794176e-05, + "loss": 1.9607, + "step": 5509 + }, + { + "epoch": 1.6912216083486802, + "grad_norm": 0.40781939029693604, + "learning_rate": 9.501295273422977e-05, + "loss": 1.9328, + "step": 5510 + }, + { + "epoch": 1.6915285451197053, + "grad_norm": 0.38062483072280884, + "learning_rate": 9.50107885556734e-05, + "loss": 1.9552, + "step": 5511 + }, + { + "epoch": 1.6918354818907306, + "grad_norm": 0.4047648012638092, + "learning_rate": 9.500862393229402e-05, + "loss": 1.9503, + "step": 5512 + }, + { + "epoch": 1.6921424186617555, + "grad_norm": 0.3829517066478729, + "learning_rate": 9.500645886411305e-05, + "loss": 1.9034, + "step": 5513 + }, + { + "epoch": 1.6924493554327809, + "grad_norm": 0.3657867908477783, + "learning_rate": 9.500429335115188e-05, + "loss": 1.869, + "step": 5514 + }, + { + "epoch": 1.692756292203806, + "grad_norm": 0.410877525806427, + "learning_rate": 9.50021273934319e-05, + "loss": 1.9824, + "step": 5515 + }, + { + "epoch": 1.693063228974831, + "grad_norm": 0.420682817697525, + "learning_rate": 9.499996099097453e-05, + "loss": 1.969, + "step": 5516 
+ }, + { + "epoch": 1.6933701657458564, + "grad_norm": 0.44578227400779724, + "learning_rate": 9.499779414380115e-05, + "loss": 1.9513, + "step": 5517 + }, + { + "epoch": 1.6936771025168815, + "grad_norm": 0.42710423469543457, + "learning_rate": 9.499562685193319e-05, + "loss": 1.9423, + "step": 5518 + }, + { + "epoch": 1.6939840392879066, + "grad_norm": 0.4503214657306671, + "learning_rate": 9.49934591153921e-05, + "loss": 1.9849, + "step": 5519 + }, + { + "epoch": 1.694290976058932, + "grad_norm": 0.427157998085022, + "learning_rate": 9.499129093419926e-05, + "loss": 1.9502, + "step": 5520 + }, + { + "epoch": 1.6945979128299569, + "grad_norm": 0.4356638491153717, + "learning_rate": 9.498912230837611e-05, + "loss": 1.8593, + "step": 5521 + }, + { + "epoch": 1.6949048496009822, + "grad_norm": 0.3894338309764862, + "learning_rate": 9.498695323794409e-05, + "loss": 1.8857, + "step": 5522 + }, + { + "epoch": 1.6952117863720073, + "grad_norm": 0.4285121262073517, + "learning_rate": 9.498478372292464e-05, + "loss": 1.9774, + "step": 5523 + }, + { + "epoch": 1.6955187231430324, + "grad_norm": 0.4316183924674988, + "learning_rate": 9.498261376333916e-05, + "loss": 1.9067, + "step": 5524 + }, + { + "epoch": 1.6958256599140578, + "grad_norm": 0.3760167956352234, + "learning_rate": 9.498044335920914e-05, + "loss": 1.8375, + "step": 5525 + }, + { + "epoch": 1.6961325966850829, + "grad_norm": 0.4327097237110138, + "learning_rate": 9.497827251055602e-05, + "loss": 1.9333, + "step": 5526 + }, + { + "epoch": 1.696439533456108, + "grad_norm": 0.4169953167438507, + "learning_rate": 9.497610121740126e-05, + "loss": 1.9015, + "step": 5527 + }, + { + "epoch": 1.6967464702271333, + "grad_norm": 0.3915253281593323, + "learning_rate": 9.49739294797663e-05, + "loss": 1.8608, + "step": 5528 + }, + { + "epoch": 1.6970534069981584, + "grad_norm": 0.4071075916290283, + "learning_rate": 9.497175729767259e-05, + "loss": 1.9336, + "step": 5529 + }, + { + "epoch": 1.6973603437691835, + 
"grad_norm": 0.3550303876399994, + "learning_rate": 9.496958467114163e-05, + "loss": 1.8614, + "step": 5530 + }, + { + "epoch": 1.6976672805402089, + "grad_norm": 0.3757273554801941, + "learning_rate": 9.496741160019487e-05, + "loss": 1.9959, + "step": 5531 + }, + { + "epoch": 1.6979742173112338, + "grad_norm": 0.4126262366771698, + "learning_rate": 9.49652380848538e-05, + "loss": 1.935, + "step": 5532 + }, + { + "epoch": 1.698281154082259, + "grad_norm": 0.46366190910339355, + "learning_rate": 9.496306412513988e-05, + "loss": 1.9336, + "step": 5533 + }, + { + "epoch": 1.6985880908532842, + "grad_norm": 0.42553630471229553, + "learning_rate": 9.496088972107463e-05, + "loss": 1.9388, + "step": 5534 + }, + { + "epoch": 1.6988950276243093, + "grad_norm": 0.4060843884944916, + "learning_rate": 9.49587148726795e-05, + "loss": 1.917, + "step": 5535 + }, + { + "epoch": 1.6992019643953347, + "grad_norm": 0.37994736433029175, + "learning_rate": 9.495653957997601e-05, + "loss": 1.9268, + "step": 5536 + }, + { + "epoch": 1.6995089011663598, + "grad_norm": 0.4148559272289276, + "learning_rate": 9.495436384298563e-05, + "loss": 1.8936, + "step": 5537 + }, + { + "epoch": 1.6998158379373849, + "grad_norm": 0.39814767241477966, + "learning_rate": 9.495218766172989e-05, + "loss": 1.9468, + "step": 5538 + }, + { + "epoch": 1.7001227747084102, + "grad_norm": 0.40800294280052185, + "learning_rate": 9.495001103623027e-05, + "loss": 1.9649, + "step": 5539 + }, + { + "epoch": 1.7004297114794351, + "grad_norm": 0.4225989282131195, + "learning_rate": 9.49478339665083e-05, + "loss": 1.987, + "step": 5540 + }, + { + "epoch": 1.7007366482504604, + "grad_norm": 0.4280939996242523, + "learning_rate": 9.494565645258551e-05, + "loss": 2.0487, + "step": 5541 + }, + { + "epoch": 1.7010435850214856, + "grad_norm": 0.44816237688064575, + "learning_rate": 9.494347849448338e-05, + "loss": 1.9112, + "step": 5542 + }, + { + "epoch": 1.7013505217925107, + "grad_norm": 0.424629271030426, + "learning_rate": 
9.494130009222346e-05, + "loss": 1.9284, + "step": 5543 + }, + { + "epoch": 1.701657458563536, + "grad_norm": 0.40010082721710205, + "learning_rate": 9.493912124582727e-05, + "loss": 1.9307, + "step": 5544 + }, + { + "epoch": 1.7019643953345611, + "grad_norm": 0.42541825771331787, + "learning_rate": 9.493694195531633e-05, + "loss": 2.0009, + "step": 5545 + }, + { + "epoch": 1.7022713321055862, + "grad_norm": 0.39693546295166016, + "learning_rate": 9.49347622207122e-05, + "loss": 1.9237, + "step": 5546 + }, + { + "epoch": 1.7025782688766116, + "grad_norm": 0.37853676080703735, + "learning_rate": 9.493258204203644e-05, + "loss": 1.9212, + "step": 5547 + }, + { + "epoch": 1.7028852056476365, + "grad_norm": 0.3856247663497925, + "learning_rate": 9.493040141931054e-05, + "loss": 1.926, + "step": 5548 + }, + { + "epoch": 1.7031921424186618, + "grad_norm": 0.3429555892944336, + "learning_rate": 9.492822035255608e-05, + "loss": 1.8854, + "step": 5549 + }, + { + "epoch": 1.703499079189687, + "grad_norm": 0.3500545620918274, + "learning_rate": 9.49260388417946e-05, + "loss": 1.8627, + "step": 5550 + }, + { + "epoch": 1.703806015960712, + "grad_norm": 0.3461480140686035, + "learning_rate": 9.49238568870477e-05, + "loss": 1.8962, + "step": 5551 + }, + { + "epoch": 1.7041129527317374, + "grad_norm": 0.36311015486717224, + "learning_rate": 9.492167448833691e-05, + "loss": 1.9398, + "step": 5552 + }, + { + "epoch": 1.7044198895027625, + "grad_norm": 0.36770105361938477, + "learning_rate": 9.491949164568379e-05, + "loss": 1.9083, + "step": 5553 + }, + { + "epoch": 1.7047268262737876, + "grad_norm": 0.42491769790649414, + "learning_rate": 9.491730835910993e-05, + "loss": 1.8874, + "step": 5554 + }, + { + "epoch": 1.705033763044813, + "grad_norm": 0.5321764945983887, + "learning_rate": 9.491512462863691e-05, + "loss": 1.9813, + "step": 5555 + }, + { + "epoch": 1.7053406998158378, + "grad_norm": 0.5481576323509216, + "learning_rate": 9.49129404542863e-05, + "loss": 1.8696, + "step": 
5556 + }, + { + "epoch": 1.7056476365868631, + "grad_norm": 0.47720953822135925, + "learning_rate": 9.491075583607969e-05, + "loss": 1.9026, + "step": 5557 + }, + { + "epoch": 1.7059545733578882, + "grad_norm": 0.3976534605026245, + "learning_rate": 9.490857077403865e-05, + "loss": 1.8551, + "step": 5558 + }, + { + "epoch": 1.7062615101289134, + "grad_norm": 0.3744281828403473, + "learning_rate": 9.49063852681848e-05, + "loss": 2.012, + "step": 5559 + }, + { + "epoch": 1.7065684468999387, + "grad_norm": 0.3931918740272522, + "learning_rate": 9.490419931853974e-05, + "loss": 1.845, + "step": 5560 + }, + { + "epoch": 1.7068753836709638, + "grad_norm": 0.5411466956138611, + "learning_rate": 9.490201292512506e-05, + "loss": 2.0225, + "step": 5561 + }, + { + "epoch": 1.707182320441989, + "grad_norm": 0.6602910757064819, + "learning_rate": 9.489982608796237e-05, + "loss": 1.9559, + "step": 5562 + }, + { + "epoch": 1.7074892572130143, + "grad_norm": 0.5455329418182373, + "learning_rate": 9.489763880707329e-05, + "loss": 1.8855, + "step": 5563 + }, + { + "epoch": 1.7077961939840391, + "grad_norm": 0.42309099435806274, + "learning_rate": 9.489545108247941e-05, + "loss": 1.8784, + "step": 5564 + }, + { + "epoch": 1.7081031307550645, + "grad_norm": 0.3817001283168793, + "learning_rate": 9.489326291420239e-05, + "loss": 1.8926, + "step": 5565 + }, + { + "epoch": 1.7084100675260896, + "grad_norm": 0.5077582597732544, + "learning_rate": 9.489107430226381e-05, + "loss": 1.8742, + "step": 5566 + }, + { + "epoch": 1.7087170042971147, + "grad_norm": 0.5634065866470337, + "learning_rate": 9.488888524668533e-05, + "loss": 1.9251, + "step": 5567 + }, + { + "epoch": 1.70902394106814, + "grad_norm": 0.5182891488075256, + "learning_rate": 9.488669574748859e-05, + "loss": 1.9689, + "step": 5568 + }, + { + "epoch": 1.7093308778391652, + "grad_norm": 0.4180498719215393, + "learning_rate": 9.48845058046952e-05, + "loss": 1.9248, + "step": 5569 + }, + { + "epoch": 1.7096378146101903, + 
"grad_norm": 0.4833194315433502, + "learning_rate": 9.488231541832682e-05, + "loss": 2.0115, + "step": 5570 + }, + { + "epoch": 1.7099447513812156, + "grad_norm": 0.46525415778160095, + "learning_rate": 9.488012458840509e-05, + "loss": 1.9108, + "step": 5571 + }, + { + "epoch": 1.7102516881522405, + "grad_norm": 0.5051191449165344, + "learning_rate": 9.487793331495166e-05, + "loss": 1.9055, + "step": 5572 + }, + { + "epoch": 1.7105586249232658, + "grad_norm": 0.4713154137134552, + "learning_rate": 9.48757415979882e-05, + "loss": 1.9104, + "step": 5573 + }, + { + "epoch": 1.710865561694291, + "grad_norm": 0.44901835918426514, + "learning_rate": 9.487354943753635e-05, + "loss": 1.9536, + "step": 5574 + }, + { + "epoch": 1.711172498465316, + "grad_norm": 0.41106006503105164, + "learning_rate": 9.487135683361778e-05, + "loss": 1.9549, + "step": 5575 + }, + { + "epoch": 1.7114794352363414, + "grad_norm": 0.4571320116519928, + "learning_rate": 9.486916378625416e-05, + "loss": 1.859, + "step": 5576 + }, + { + "epoch": 1.7117863720073665, + "grad_norm": 0.4423540532588959, + "learning_rate": 9.486697029546718e-05, + "loss": 1.9621, + "step": 5577 + }, + { + "epoch": 1.7120933087783916, + "grad_norm": 0.44291070103645325, + "learning_rate": 9.48647763612785e-05, + "loss": 1.8567, + "step": 5578 + }, + { + "epoch": 1.712400245549417, + "grad_norm": 0.4374423921108246, + "learning_rate": 9.486258198370981e-05, + "loss": 1.9754, + "step": 5579 + }, + { + "epoch": 1.7127071823204418, + "grad_norm": 0.44008153676986694, + "learning_rate": 9.486038716278277e-05, + "loss": 1.8815, + "step": 5580 + }, + { + "epoch": 1.7130141190914672, + "grad_norm": 0.3571348190307617, + "learning_rate": 9.48581918985191e-05, + "loss": 1.8948, + "step": 5581 + }, + { + "epoch": 1.7133210558624923, + "grad_norm": 0.42260754108428955, + "learning_rate": 9.485599619094049e-05, + "loss": 1.9964, + "step": 5582 + }, + { + "epoch": 1.7136279926335174, + "grad_norm": 0.44568777084350586, + 
"learning_rate": 9.485380004006863e-05, + "loss": 1.9596, + "step": 5583 + }, + { + "epoch": 1.7139349294045427, + "grad_norm": 0.5488269925117493, + "learning_rate": 9.485160344592523e-05, + "loss": 1.9239, + "step": 5584 + }, + { + "epoch": 1.7142418661755678, + "grad_norm": 0.5653155446052551, + "learning_rate": 9.484940640853199e-05, + "loss": 1.9115, + "step": 5585 + }, + { + "epoch": 1.714548802946593, + "grad_norm": 0.4652312099933624, + "learning_rate": 9.484720892791064e-05, + "loss": 1.9973, + "step": 5586 + }, + { + "epoch": 1.7148557397176183, + "grad_norm": 0.41521382331848145, + "learning_rate": 9.484501100408288e-05, + "loss": 1.9395, + "step": 5587 + }, + { + "epoch": 1.7151626764886432, + "grad_norm": 0.46761438250541687, + "learning_rate": 9.484281263707043e-05, + "loss": 1.9465, + "step": 5588 + }, + { + "epoch": 1.7154696132596685, + "grad_norm": 0.46990182995796204, + "learning_rate": 9.484061382689501e-05, + "loss": 1.8969, + "step": 5589 + }, + { + "epoch": 1.7157765500306936, + "grad_norm": 0.44951021671295166, + "learning_rate": 9.48384145735784e-05, + "loss": 1.9925, + "step": 5590 + }, + { + "epoch": 1.7160834868017187, + "grad_norm": 0.4029327630996704, + "learning_rate": 9.483621487714227e-05, + "loss": 1.8574, + "step": 5591 + }, + { + "epoch": 1.716390423572744, + "grad_norm": 0.3501027226448059, + "learning_rate": 9.48340147376084e-05, + "loss": 1.9156, + "step": 5592 + }, + { + "epoch": 1.7166973603437692, + "grad_norm": 0.5058720111846924, + "learning_rate": 9.48318141549985e-05, + "loss": 2.071, + "step": 5593 + }, + { + "epoch": 1.7170042971147943, + "grad_norm": 0.5097518563270569, + "learning_rate": 9.482961312933435e-05, + "loss": 1.9609, + "step": 5594 + }, + { + "epoch": 1.7173112338858196, + "grad_norm": 0.4728573262691498, + "learning_rate": 9.482741166063769e-05, + "loss": 1.9552, + "step": 5595 + }, + { + "epoch": 1.7176181706568447, + "grad_norm": 0.44095897674560547, + "learning_rate": 9.482520974893026e-05, + "loss": 
2.011, + "step": 5596 + }, + { + "epoch": 1.7179251074278699, + "grad_norm": 0.48331573605537415, + "learning_rate": 9.482300739423385e-05, + "loss": 1.9676, + "step": 5597 + }, + { + "epoch": 1.7182320441988952, + "grad_norm": 0.4890894293785095, + "learning_rate": 9.482080459657019e-05, + "loss": 1.9571, + "step": 5598 + }, + { + "epoch": 1.71853898096992, + "grad_norm": 0.4486929476261139, + "learning_rate": 9.481860135596109e-05, + "loss": 1.9205, + "step": 5599 + }, + { + "epoch": 1.7188459177409454, + "grad_norm": 0.44154083728790283, + "learning_rate": 9.48163976724283e-05, + "loss": 1.9995, + "step": 5600 + }, + { + "epoch": 1.7191528545119705, + "grad_norm": 0.4155641496181488, + "learning_rate": 9.481419354599358e-05, + "loss": 1.9192, + "step": 5601 + }, + { + "epoch": 1.7194597912829956, + "grad_norm": 0.453253835439682, + "learning_rate": 9.481198897667875e-05, + "loss": 2.0102, + "step": 5602 + }, + { + "epoch": 1.719766728054021, + "grad_norm": 0.4325653314590454, + "learning_rate": 9.480978396450557e-05, + "loss": 1.8859, + "step": 5603 + }, + { + "epoch": 1.720073664825046, + "grad_norm": 0.4191089868545532, + "learning_rate": 9.480757850949584e-05, + "loss": 2.0007, + "step": 5604 + }, + { + "epoch": 1.7203806015960712, + "grad_norm": 0.4182284474372864, + "learning_rate": 9.480537261167137e-05, + "loss": 1.9374, + "step": 5605 + }, + { + "epoch": 1.7206875383670965, + "grad_norm": 0.4695988893508911, + "learning_rate": 9.480316627105394e-05, + "loss": 1.983, + "step": 5606 + }, + { + "epoch": 1.7209944751381214, + "grad_norm": 0.4668160378932953, + "learning_rate": 9.480095948766536e-05, + "loss": 1.8705, + "step": 5607 + }, + { + "epoch": 1.7213014119091468, + "grad_norm": 0.3689236044883728, + "learning_rate": 9.479875226152744e-05, + "loss": 1.8695, + "step": 5608 + }, + { + "epoch": 1.7216083486801719, + "grad_norm": 0.4206932485103607, + "learning_rate": 9.4796544592662e-05, + "loss": 1.9494, + "step": 5609 + }, + { + "epoch": 
1.721915285451197, + "grad_norm": 0.4420578181743622, + "learning_rate": 9.479433648109083e-05, + "loss": 1.8749, + "step": 5610 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.4243582487106323, + "learning_rate": 9.479212792683579e-05, + "loss": 1.9524, + "step": 5611 + }, + { + "epoch": 1.7225291589932474, + "grad_norm": 0.5053666234016418, + "learning_rate": 9.478991892991868e-05, + "loss": 1.9308, + "step": 5612 + }, + { + "epoch": 1.7228360957642725, + "grad_norm": 0.4365650713443756, + "learning_rate": 9.478770949036136e-05, + "loss": 1.9469, + "step": 5613 + }, + { + "epoch": 1.7231430325352979, + "grad_norm": 0.3916216194629669, + "learning_rate": 9.478549960818561e-05, + "loss": 1.8239, + "step": 5614 + }, + { + "epoch": 1.7234499693063228, + "grad_norm": 0.4051356911659241, + "learning_rate": 9.478328928341334e-05, + "loss": 1.892, + "step": 5615 + }, + { + "epoch": 1.723756906077348, + "grad_norm": 0.36592593789100647, + "learning_rate": 9.478107851606633e-05, + "loss": 1.8763, + "step": 5616 + }, + { + "epoch": 1.7240638428483732, + "grad_norm": 0.45741888880729675, + "learning_rate": 9.477886730616645e-05, + "loss": 1.9502, + "step": 5617 + }, + { + "epoch": 1.7243707796193983, + "grad_norm": 0.38170990347862244, + "learning_rate": 9.477665565373558e-05, + "loss": 1.8568, + "step": 5618 + }, + { + "epoch": 1.7246777163904237, + "grad_norm": 0.4193691313266754, + "learning_rate": 9.477444355879554e-05, + "loss": 1.9553, + "step": 5619 + }, + { + "epoch": 1.7249846531614488, + "grad_norm": 0.39682838320732117, + "learning_rate": 9.477223102136821e-05, + "loss": 1.9474, + "step": 5620 + }, + { + "epoch": 1.725291589932474, + "grad_norm": 0.391544371843338, + "learning_rate": 9.477001804147545e-05, + "loss": 1.9277, + "step": 5621 + }, + { + "epoch": 1.7255985267034992, + "grad_norm": 0.42348888516426086, + "learning_rate": 9.476780461913913e-05, + "loss": 1.8923, + "step": 5622 + }, + { + "epoch": 1.7259054634745241, + "grad_norm": 
0.4393916130065918, + "learning_rate": 9.476559075438114e-05, + "loss": 1.9052, + "step": 5623 + }, + { + "epoch": 1.7262124002455494, + "grad_norm": 0.42631569504737854, + "learning_rate": 9.476337644722333e-05, + "loss": 1.8849, + "step": 5624 + }, + { + "epoch": 1.7265193370165746, + "grad_norm": 0.3514206111431122, + "learning_rate": 9.47611616976876e-05, + "loss": 1.9286, + "step": 5625 + }, + { + "epoch": 1.7268262737875997, + "grad_norm": 0.4104609191417694, + "learning_rate": 9.475894650579582e-05, + "loss": 1.9178, + "step": 5626 + }, + { + "epoch": 1.727133210558625, + "grad_norm": 0.44329676032066345, + "learning_rate": 9.475673087156992e-05, + "loss": 1.9789, + "step": 5627 + }, + { + "epoch": 1.7274401473296501, + "grad_norm": 0.41865840554237366, + "learning_rate": 9.475451479503175e-05, + "loss": 1.9105, + "step": 5628 + }, + { + "epoch": 1.7277470841006752, + "grad_norm": 0.4166790544986725, + "learning_rate": 9.475229827620326e-05, + "loss": 1.9089, + "step": 5629 + }, + { + "epoch": 1.7280540208717006, + "grad_norm": 0.353771448135376, + "learning_rate": 9.475008131510633e-05, + "loss": 1.9081, + "step": 5630 + }, + { + "epoch": 1.7283609576427255, + "grad_norm": 0.385046124458313, + "learning_rate": 9.474786391176284e-05, + "loss": 1.9268, + "step": 5631 + }, + { + "epoch": 1.7286678944137508, + "grad_norm": 0.3956538438796997, + "learning_rate": 9.474564606619474e-05, + "loss": 1.9445, + "step": 5632 + }, + { + "epoch": 1.728974831184776, + "grad_norm": 0.41305112838745117, + "learning_rate": 9.474342777842394e-05, + "loss": 1.9331, + "step": 5633 + }, + { + "epoch": 1.729281767955801, + "grad_norm": 0.39336860179901123, + "learning_rate": 9.474120904847237e-05, + "loss": 1.9792, + "step": 5634 + }, + { + "epoch": 1.7295887047268264, + "grad_norm": 0.41963186860084534, + "learning_rate": 9.473898987636194e-05, + "loss": 1.8719, + "step": 5635 + }, + { + "epoch": 1.7298956414978515, + "grad_norm": 0.4087338149547577, + "learning_rate": 
9.473677026211458e-05, + "loss": 1.9121, + "step": 5636 + }, + { + "epoch": 1.7302025782688766, + "grad_norm": 0.3693830966949463, + "learning_rate": 9.473455020575226e-05, + "loss": 1.9293, + "step": 5637 + }, + { + "epoch": 1.730509515039902, + "grad_norm": 0.40699541568756104, + "learning_rate": 9.473232970729688e-05, + "loss": 1.94, + "step": 5638 + }, + { + "epoch": 1.7308164518109268, + "grad_norm": 0.4222811162471771, + "learning_rate": 9.473010876677041e-05, + "loss": 1.9416, + "step": 5639 + }, + { + "epoch": 1.7311233885819521, + "grad_norm": 0.41459110379219055, + "learning_rate": 9.472788738419477e-05, + "loss": 1.8801, + "step": 5640 + }, + { + "epoch": 1.7314303253529773, + "grad_norm": 0.36970487236976624, + "learning_rate": 9.472566555959195e-05, + "loss": 1.9122, + "step": 5641 + }, + { + "epoch": 1.7317372621240024, + "grad_norm": 0.35511577129364014, + "learning_rate": 9.472344329298388e-05, + "loss": 1.8646, + "step": 5642 + }, + { + "epoch": 1.7320441988950277, + "grad_norm": 0.3511577248573303, + "learning_rate": 9.472122058439252e-05, + "loss": 1.9047, + "step": 5643 + }, + { + "epoch": 1.7323511356660528, + "grad_norm": 0.3421955108642578, + "learning_rate": 9.471899743383986e-05, + "loss": 1.8732, + "step": 5644 + }, + { + "epoch": 1.732658072437078, + "grad_norm": 0.44008341431617737, + "learning_rate": 9.471677384134785e-05, + "loss": 1.8956, + "step": 5645 + }, + { + "epoch": 1.7329650092081033, + "grad_norm": 0.49410128593444824, + "learning_rate": 9.471454980693848e-05, + "loss": 1.9197, + "step": 5646 + }, + { + "epoch": 1.7332719459791281, + "grad_norm": 0.4664965867996216, + "learning_rate": 9.471232533063373e-05, + "loss": 1.8945, + "step": 5647 + }, + { + "epoch": 1.7335788827501535, + "grad_norm": 0.3789248764514923, + "learning_rate": 9.471010041245555e-05, + "loss": 1.9153, + "step": 5648 + }, + { + "epoch": 1.7338858195211786, + "grad_norm": 0.34556612372398376, + "learning_rate": 9.470787505242596e-05, + "loss": 1.9144, + 
"step": 5649 + }, + { + "epoch": 1.7341927562922037, + "grad_norm": 0.3466256856918335, + "learning_rate": 9.470564925056695e-05, + "loss": 1.8837, + "step": 5650 + }, + { + "epoch": 1.734499693063229, + "grad_norm": 0.34612321853637695, + "learning_rate": 9.470342300690051e-05, + "loss": 1.8667, + "step": 5651 + }, + { + "epoch": 1.7348066298342542, + "grad_norm": 0.3648833632469177, + "learning_rate": 9.470119632144864e-05, + "loss": 1.9499, + "step": 5652 + }, + { + "epoch": 1.7351135666052793, + "grad_norm": 0.3600454330444336, + "learning_rate": 9.469896919423334e-05, + "loss": 1.9093, + "step": 5653 + }, + { + "epoch": 1.7354205033763046, + "grad_norm": 0.41487598419189453, + "learning_rate": 9.469674162527664e-05, + "loss": 1.9714, + "step": 5654 + }, + { + "epoch": 1.7357274401473295, + "grad_norm": 0.35980695486068726, + "learning_rate": 9.469451361460053e-05, + "loss": 1.9006, + "step": 5655 + }, + { + "epoch": 1.7360343769183548, + "grad_norm": 0.42676928639411926, + "learning_rate": 9.469228516222705e-05, + "loss": 1.9286, + "step": 5656 + }, + { + "epoch": 1.73634131368938, + "grad_norm": 0.41541969776153564, + "learning_rate": 9.469005626817822e-05, + "loss": 1.9243, + "step": 5657 + }, + { + "epoch": 1.736648250460405, + "grad_norm": 0.4245065152645111, + "learning_rate": 9.468782693247604e-05, + "loss": 1.9427, + "step": 5658 + }, + { + "epoch": 1.7369551872314304, + "grad_norm": 0.46148940920829773, + "learning_rate": 9.468559715514257e-05, + "loss": 2.0201, + "step": 5659 + }, + { + "epoch": 1.7372621240024555, + "grad_norm": 0.47727301716804504, + "learning_rate": 9.468336693619985e-05, + "loss": 1.9792, + "step": 5660 + }, + { + "epoch": 1.7375690607734806, + "grad_norm": 0.4807848036289215, + "learning_rate": 9.46811362756699e-05, + "loss": 1.9036, + "step": 5661 + }, + { + "epoch": 1.737875997544506, + "grad_norm": 0.5129636526107788, + "learning_rate": 9.467890517357477e-05, + "loss": 1.8861, + "step": 5662 + }, + { + "epoch": 
1.7381829343155308, + "grad_norm": 0.467804878950119, + "learning_rate": 9.467667362993651e-05, + "loss": 1.868, + "step": 5663 + }, + { + "epoch": 1.7384898710865562, + "grad_norm": 0.4179893136024475, + "learning_rate": 9.46744416447772e-05, + "loss": 1.9521, + "step": 5664 + }, + { + "epoch": 1.7387968078575813, + "grad_norm": 0.4384612739086151, + "learning_rate": 9.467220921811884e-05, + "loss": 1.9167, + "step": 5665 + }, + { + "epoch": 1.7391037446286064, + "grad_norm": 0.517855703830719, + "learning_rate": 9.466997634998354e-05, + "loss": 1.8919, + "step": 5666 + }, + { + "epoch": 1.7394106813996317, + "grad_norm": 0.4875940978527069, + "learning_rate": 9.466774304039334e-05, + "loss": 1.8774, + "step": 5667 + }, + { + "epoch": 1.7397176181706568, + "grad_norm": 0.44286540150642395, + "learning_rate": 9.466550928937034e-05, + "loss": 1.9696, + "step": 5668 + }, + { + "epoch": 1.740024554941682, + "grad_norm": 0.4092461168766022, + "learning_rate": 9.466327509693658e-05, + "loss": 1.9978, + "step": 5669 + }, + { + "epoch": 1.7403314917127073, + "grad_norm": 0.42797163128852844, + "learning_rate": 9.466104046311418e-05, + "loss": 1.9428, + "step": 5670 + }, + { + "epoch": 1.7406384284837324, + "grad_norm": 0.5174738764762878, + "learning_rate": 9.465880538792518e-05, + "loss": 1.9493, + "step": 5671 + }, + { + "epoch": 1.7409453652547575, + "grad_norm": 0.6263836622238159, + "learning_rate": 9.46565698713917e-05, + "loss": 1.9131, + "step": 5672 + }, + { + "epoch": 1.7412523020257828, + "grad_norm": 0.6452967524528503, + "learning_rate": 9.465433391353582e-05, + "loss": 2.0412, + "step": 5673 + }, + { + "epoch": 1.7415592387968077, + "grad_norm": 0.5004684925079346, + "learning_rate": 9.465209751437964e-05, + "loss": 1.8721, + "step": 5674 + }, + { + "epoch": 1.741866175567833, + "grad_norm": 0.4694507420063019, + "learning_rate": 9.464986067394526e-05, + "loss": 1.9614, + "step": 5675 + }, + { + "epoch": 1.7421731123388582, + "grad_norm": 0.4519532322883606, 
+ "learning_rate": 9.464762339225479e-05, + "loss": 1.9687, + "step": 5676 + }, + { + "epoch": 1.7424800491098833, + "grad_norm": 0.4297941029071808, + "learning_rate": 9.464538566933033e-05, + "loss": 1.965, + "step": 5677 + }, + { + "epoch": 1.7427869858809086, + "grad_norm": 0.4612393081188202, + "learning_rate": 9.464314750519401e-05, + "loss": 1.9651, + "step": 5678 + }, + { + "epoch": 1.7430939226519337, + "grad_norm": 0.394142210483551, + "learning_rate": 9.464090889986794e-05, + "loss": 1.9185, + "step": 5679 + }, + { + "epoch": 1.7434008594229589, + "grad_norm": 0.39999979734420776, + "learning_rate": 9.463866985337424e-05, + "loss": 1.899, + "step": 5680 + }, + { + "epoch": 1.7437077961939842, + "grad_norm": 0.40942859649658203, + "learning_rate": 9.463643036573504e-05, + "loss": 1.9653, + "step": 5681 + }, + { + "epoch": 1.744014732965009, + "grad_norm": 0.4097300171852112, + "learning_rate": 9.463419043697248e-05, + "loss": 1.9944, + "step": 5682 + }, + { + "epoch": 1.7443216697360344, + "grad_norm": 0.41627535223960876, + "learning_rate": 9.463195006710868e-05, + "loss": 1.9156, + "step": 5683 + }, + { + "epoch": 1.7446286065070595, + "grad_norm": 0.3789215385913849, + "learning_rate": 9.46297092561658e-05, + "loss": 1.9262, + "step": 5684 + }, + { + "epoch": 1.7449355432780846, + "grad_norm": 0.4867783188819885, + "learning_rate": 9.462746800416595e-05, + "loss": 1.961, + "step": 5685 + }, + { + "epoch": 1.74524248004911, + "grad_norm": 0.6078580617904663, + "learning_rate": 9.462522631113133e-05, + "loss": 1.9694, + "step": 5686 + }, + { + "epoch": 1.745549416820135, + "grad_norm": 0.558968186378479, + "learning_rate": 9.462298417708406e-05, + "loss": 1.9537, + "step": 5687 + }, + { + "epoch": 1.7458563535911602, + "grad_norm": 0.4677596986293793, + "learning_rate": 9.46207416020463e-05, + "loss": 1.9253, + "step": 5688 + }, + { + "epoch": 1.7461632903621855, + "grad_norm": 0.40353646874427795, + "learning_rate": 9.461849858604023e-05, + "loss": 
1.8992, + "step": 5689 + }, + { + "epoch": 1.7464702271332104, + "grad_norm": 0.3738614618778229, + "learning_rate": 9.4616255129088e-05, + "loss": 1.9109, + "step": 5690 + }, + { + "epoch": 1.7467771639042358, + "grad_norm": 0.4040324091911316, + "learning_rate": 9.461401123121179e-05, + "loss": 1.8981, + "step": 5691 + }, + { + "epoch": 1.7470841006752609, + "grad_norm": 0.44214901328086853, + "learning_rate": 9.461176689243376e-05, + "loss": 1.9244, + "step": 5692 + }, + { + "epoch": 1.747391037446286, + "grad_norm": 0.44187378883361816, + "learning_rate": 9.460952211277611e-05, + "loss": 1.9329, + "step": 5693 + }, + { + "epoch": 1.7476979742173113, + "grad_norm": 0.44287410378456116, + "learning_rate": 9.460727689226102e-05, + "loss": 1.97, + "step": 5694 + }, + { + "epoch": 1.7480049109883364, + "grad_norm": 0.3757341504096985, + "learning_rate": 9.460503123091067e-05, + "loss": 1.8766, + "step": 5695 + }, + { + "epoch": 1.7483118477593615, + "grad_norm": 0.4139314591884613, + "learning_rate": 9.460278512874725e-05, + "loss": 1.902, + "step": 5696 + }, + { + "epoch": 1.7486187845303869, + "grad_norm": 0.37526339292526245, + "learning_rate": 9.460053858579298e-05, + "loss": 1.9325, + "step": 5697 + }, + { + "epoch": 1.7489257213014118, + "grad_norm": 0.3770616948604584, + "learning_rate": 9.459829160207004e-05, + "loss": 1.9437, + "step": 5698 + }, + { + "epoch": 1.749232658072437, + "grad_norm": 0.4069806933403015, + "learning_rate": 9.459604417760064e-05, + "loss": 1.9454, + "step": 5699 + }, + { + "epoch": 1.7495395948434622, + "grad_norm": 0.42822694778442383, + "learning_rate": 9.459379631240699e-05, + "loss": 1.8798, + "step": 5700 + }, + { + "epoch": 1.7498465316144873, + "grad_norm": 0.44075292348861694, + "learning_rate": 9.459154800651131e-05, + "loss": 1.9842, + "step": 5701 + }, + { + "epoch": 1.7501534683855127, + "grad_norm": 0.4151122272014618, + "learning_rate": 9.458929925993583e-05, + "loss": 1.8495, + "step": 5702 + }, + { + "epoch": 
1.7504604051565378, + "grad_norm": 0.41887882351875305, + "learning_rate": 9.458705007270275e-05, + "loss": 1.9611, + "step": 5703 + }, + { + "epoch": 1.750767341927563, + "grad_norm": 0.3976796865463257, + "learning_rate": 9.45848004448343e-05, + "loss": 1.8841, + "step": 5704 + }, + { + "epoch": 1.7510742786985882, + "grad_norm": 0.3783813416957855, + "learning_rate": 9.458255037635272e-05, + "loss": 1.8897, + "step": 5705 + }, + { + "epoch": 1.7513812154696131, + "grad_norm": 0.35153308510780334, + "learning_rate": 9.458029986728026e-05, + "loss": 1.911, + "step": 5706 + }, + { + "epoch": 1.7516881522406385, + "grad_norm": 0.38390985131263733, + "learning_rate": 9.457804891763913e-05, + "loss": 2.0105, + "step": 5707 + }, + { + "epoch": 1.7519950890116636, + "grad_norm": 0.3830740451812744, + "learning_rate": 9.457579752745161e-05, + "loss": 1.9635, + "step": 5708 + }, + { + "epoch": 1.7523020257826887, + "grad_norm": 0.3711417019367218, + "learning_rate": 9.457354569673993e-05, + "loss": 1.8553, + "step": 5709 + }, + { + "epoch": 1.752608962553714, + "grad_norm": 0.3670618236064911, + "learning_rate": 9.457129342552633e-05, + "loss": 1.9044, + "step": 5710 + }, + { + "epoch": 1.7529158993247391, + "grad_norm": 0.398863285779953, + "learning_rate": 9.45690407138331e-05, + "loss": 1.987, + "step": 5711 + }, + { + "epoch": 1.7532228360957642, + "grad_norm": 0.4100732207298279, + "learning_rate": 9.456678756168248e-05, + "loss": 1.8552, + "step": 5712 + }, + { + "epoch": 1.7535297728667896, + "grad_norm": 0.41883236169815063, + "learning_rate": 9.456453396909676e-05, + "loss": 1.9183, + "step": 5713 + }, + { + "epoch": 1.7538367096378145, + "grad_norm": 0.4063440263271332, + "learning_rate": 9.456227993609818e-05, + "loss": 1.8751, + "step": 5714 + }, + { + "epoch": 1.7541436464088398, + "grad_norm": 0.3880515694618225, + "learning_rate": 9.456002546270904e-05, + "loss": 1.9558, + "step": 5715 + }, + { + "epoch": 1.754450583179865, + "grad_norm": 
0.38582444190979004, + "learning_rate": 9.45577705489516e-05, + "loss": 1.9588, + "step": 5716 + }, + { + "epoch": 1.75475751995089, + "grad_norm": 0.3678396940231323, + "learning_rate": 9.455551519484816e-05, + "loss": 1.9108, + "step": 5717 + }, + { + "epoch": 1.7550644567219154, + "grad_norm": 0.3590768277645111, + "learning_rate": 9.455325940042098e-05, + "loss": 1.9027, + "step": 5718 + }, + { + "epoch": 1.7553713934929405, + "grad_norm": 0.4104592204093933, + "learning_rate": 9.455100316569241e-05, + "loss": 1.9099, + "step": 5719 + }, + { + "epoch": 1.7556783302639656, + "grad_norm": 0.3774401843547821, + "learning_rate": 9.45487464906847e-05, + "loss": 1.9098, + "step": 5720 + }, + { + "epoch": 1.755985267034991, + "grad_norm": 0.38464388251304626, + "learning_rate": 9.454648937542019e-05, + "loss": 1.9194, + "step": 5721 + }, + { + "epoch": 1.7562922038060158, + "grad_norm": 0.435131698846817, + "learning_rate": 9.454423181992114e-05, + "loss": 1.9798, + "step": 5722 + }, + { + "epoch": 1.7565991405770411, + "grad_norm": 0.4583236575126648, + "learning_rate": 9.454197382420988e-05, + "loss": 1.9862, + "step": 5723 + }, + { + "epoch": 1.7569060773480663, + "grad_norm": 0.3644738793373108, + "learning_rate": 9.453971538830874e-05, + "loss": 1.8535, + "step": 5724 + }, + { + "epoch": 1.7572130141190914, + "grad_norm": 0.3644218444824219, + "learning_rate": 9.453745651224002e-05, + "loss": 1.8773, + "step": 5725 + }, + { + "epoch": 1.7575199508901167, + "grad_norm": 0.42884743213653564, + "learning_rate": 9.453519719602604e-05, + "loss": 1.882, + "step": 5726 + }, + { + "epoch": 1.7578268876611418, + "grad_norm": 0.41049477458000183, + "learning_rate": 9.453293743968916e-05, + "loss": 1.9133, + "step": 5727 + }, + { + "epoch": 1.758133824432167, + "grad_norm": 0.35882604122161865, + "learning_rate": 9.453067724325169e-05, + "loss": 1.9056, + "step": 5728 + }, + { + "epoch": 1.7584407612031923, + "grad_norm": 0.34516364336013794, + "learning_rate": 
9.452841660673595e-05, + "loss": 1.8894, + "step": 5729 + }, + { + "epoch": 1.7587476979742172, + "grad_norm": 0.41804373264312744, + "learning_rate": 9.45261555301643e-05, + "loss": 1.8798, + "step": 5730 + }, + { + "epoch": 1.7590546347452425, + "grad_norm": 0.48584702610969543, + "learning_rate": 9.45238940135591e-05, + "loss": 1.9353, + "step": 5731 + }, + { + "epoch": 1.7593615715162676, + "grad_norm": 0.5693044662475586, + "learning_rate": 9.452163205694267e-05, + "loss": 1.8813, + "step": 5732 + }, + { + "epoch": 1.7596685082872927, + "grad_norm": 0.6146205067634583, + "learning_rate": 9.451936966033738e-05, + "loss": 1.9993, + "step": 5733 + }, + { + "epoch": 1.759975445058318, + "grad_norm": 0.4658338129520416, + "learning_rate": 9.451710682376558e-05, + "loss": 1.8977, + "step": 5734 + }, + { + "epoch": 1.7602823818293432, + "grad_norm": 0.35184696316719055, + "learning_rate": 9.451484354724964e-05, + "loss": 1.9924, + "step": 5735 + }, + { + "epoch": 1.7605893186003683, + "grad_norm": 0.48720163106918335, + "learning_rate": 9.451257983081194e-05, + "loss": 1.9054, + "step": 5736 + }, + { + "epoch": 1.7608962553713936, + "grad_norm": 0.6268271803855896, + "learning_rate": 9.451031567447482e-05, + "loss": 1.9956, + "step": 5737 + }, + { + "epoch": 1.7612031921424187, + "grad_norm": 0.5384534001350403, + "learning_rate": 9.450805107826068e-05, + "loss": 1.9169, + "step": 5738 + }, + { + "epoch": 1.7615101289134438, + "grad_norm": 0.4011121094226837, + "learning_rate": 9.450578604219188e-05, + "loss": 1.9845, + "step": 5739 + }, + { + "epoch": 1.7618170656844692, + "grad_norm": 0.4422668516635895, + "learning_rate": 9.450352056629082e-05, + "loss": 2.0014, + "step": 5740 + }, + { + "epoch": 1.762124002455494, + "grad_norm": 0.5033303499221802, + "learning_rate": 9.45012546505799e-05, + "loss": 1.9142, + "step": 5741 + }, + { + "epoch": 1.7624309392265194, + "grad_norm": 0.6074427366256714, + "learning_rate": 9.449898829508148e-05, + "loss": 1.9385, + "step": 
5742 + }, + { + "epoch": 1.7627378759975445, + "grad_norm": 0.6405495405197144, + "learning_rate": 9.449672149981799e-05, + "loss": 1.9792, + "step": 5743 + }, + { + "epoch": 1.7630448127685696, + "grad_norm": 0.5432560443878174, + "learning_rate": 9.449445426481182e-05, + "loss": 1.9294, + "step": 5744 + }, + { + "epoch": 1.763351749539595, + "grad_norm": 0.41406089067459106, + "learning_rate": 9.449218659008536e-05, + "loss": 1.9266, + "step": 5745 + }, + { + "epoch": 1.76365868631062, + "grad_norm": 0.41278013586997986, + "learning_rate": 9.448991847566104e-05, + "loss": 1.9448, + "step": 5746 + }, + { + "epoch": 1.7639656230816452, + "grad_norm": 0.4682934582233429, + "learning_rate": 9.448764992156128e-05, + "loss": 1.9836, + "step": 5747 + }, + { + "epoch": 1.7642725598526705, + "grad_norm": 0.47673073410987854, + "learning_rate": 9.448538092780848e-05, + "loss": 2.0229, + "step": 5748 + }, + { + "epoch": 1.7645794966236954, + "grad_norm": 0.3956258296966553, + "learning_rate": 9.448311149442507e-05, + "loss": 1.9871, + "step": 5749 + }, + { + "epoch": 1.7648864333947207, + "grad_norm": 0.39578214287757874, + "learning_rate": 9.448084162143348e-05, + "loss": 1.8991, + "step": 5750 + }, + { + "epoch": 1.7651933701657458, + "grad_norm": 0.42902353405952454, + "learning_rate": 9.447857130885614e-05, + "loss": 1.9925, + "step": 5751 + }, + { + "epoch": 1.765500306936771, + "grad_norm": 0.45643556118011475, + "learning_rate": 9.44763005567155e-05, + "loss": 1.9662, + "step": 5752 + }, + { + "epoch": 1.7658072437077963, + "grad_norm": 0.39291635155677795, + "learning_rate": 9.447402936503398e-05, + "loss": 1.8925, + "step": 5753 + }, + { + "epoch": 1.7661141804788214, + "grad_norm": 0.36709296703338623, + "learning_rate": 9.447175773383404e-05, + "loss": 1.8669, + "step": 5754 + }, + { + "epoch": 1.7664211172498465, + "grad_norm": 0.41586652398109436, + "learning_rate": 9.446948566313812e-05, + "loss": 1.8925, + "step": 5755 + }, + { + "epoch": 1.7667280540208719, 
+ "grad_norm": 0.42532578110694885, + "learning_rate": 9.446721315296867e-05, + "loss": 1.9923, + "step": 5756 + }, + { + "epoch": 1.7670349907918967, + "grad_norm": 0.45310646295547485, + "learning_rate": 9.446494020334817e-05, + "loss": 1.9908, + "step": 5757 + }, + { + "epoch": 1.767341927562922, + "grad_norm": 0.4391445219516754, + "learning_rate": 9.446266681429907e-05, + "loss": 1.9391, + "step": 5758 + }, + { + "epoch": 1.7676488643339472, + "grad_norm": 0.3728313446044922, + "learning_rate": 9.446039298584382e-05, + "loss": 1.9352, + "step": 5759 + }, + { + "epoch": 1.7679558011049723, + "grad_norm": 0.3862408697605133, + "learning_rate": 9.445811871800492e-05, + "loss": 1.9628, + "step": 5760 + }, + { + "epoch": 1.7682627378759976, + "grad_norm": 0.3704443573951721, + "learning_rate": 9.445584401080482e-05, + "loss": 1.9041, + "step": 5761 + }, + { + "epoch": 1.7685696746470227, + "grad_norm": 0.3490816652774811, + "learning_rate": 9.445356886426603e-05, + "loss": 1.9203, + "step": 5762 + }, + { + "epoch": 1.7688766114180479, + "grad_norm": 0.40135613083839417, + "learning_rate": 9.445129327841102e-05, + "loss": 1.9166, + "step": 5763 + }, + { + "epoch": 1.7691835481890732, + "grad_norm": 0.3794950246810913, + "learning_rate": 9.444901725326227e-05, + "loss": 1.8735, + "step": 5764 + }, + { + "epoch": 1.769490484960098, + "grad_norm": 0.3908408284187317, + "learning_rate": 9.444674078884228e-05, + "loss": 1.9044, + "step": 5765 + }, + { + "epoch": 1.7697974217311234, + "grad_norm": 0.45880573987960815, + "learning_rate": 9.444446388517354e-05, + "loss": 1.999, + "step": 5766 + }, + { + "epoch": 1.7701043585021485, + "grad_norm": 0.44833555817604065, + "learning_rate": 9.444218654227856e-05, + "loss": 1.8638, + "step": 5767 + }, + { + "epoch": 1.7704112952731736, + "grad_norm": 0.4608282446861267, + "learning_rate": 9.443990876017985e-05, + "loss": 2.0073, + "step": 5768 + }, + { + "epoch": 1.770718232044199, + "grad_norm": 0.41873493790626526, + 
"learning_rate": 9.44376305388999e-05, + "loss": 1.9337, + "step": 5769 + }, + { + "epoch": 1.771025168815224, + "grad_norm": 0.44395530223846436, + "learning_rate": 9.443535187846125e-05, + "loss": 1.9218, + "step": 5770 + }, + { + "epoch": 1.7713321055862492, + "grad_norm": 0.4347928464412689, + "learning_rate": 9.443307277888641e-05, + "loss": 1.9251, + "step": 5771 + }, + { + "epoch": 1.7716390423572745, + "grad_norm": 0.4892890155315399, + "learning_rate": 9.44307932401979e-05, + "loss": 1.9549, + "step": 5772 + }, + { + "epoch": 1.7719459791282994, + "grad_norm": 0.4234324097633362, + "learning_rate": 9.442851326241826e-05, + "loss": 1.9835, + "step": 5773 + }, + { + "epoch": 1.7722529158993248, + "grad_norm": 0.3614303171634674, + "learning_rate": 9.442623284557e-05, + "loss": 1.8942, + "step": 5774 + }, + { + "epoch": 1.7725598526703499, + "grad_norm": 0.4273429214954376, + "learning_rate": 9.442395198967566e-05, + "loss": 1.9363, + "step": 5775 + }, + { + "epoch": 1.772866789441375, + "grad_norm": 0.5049880146980286, + "learning_rate": 9.44216706947578e-05, + "loss": 1.904, + "step": 5776 + }, + { + "epoch": 1.7731737262124003, + "grad_norm": 0.5713424682617188, + "learning_rate": 9.441938896083895e-05, + "loss": 1.9756, + "step": 5777 + }, + { + "epoch": 1.7734806629834254, + "grad_norm": 0.4836362600326538, + "learning_rate": 9.441710678794166e-05, + "loss": 1.9657, + "step": 5778 + }, + { + "epoch": 1.7737875997544506, + "grad_norm": 0.39967820048332214, + "learning_rate": 9.44148241760885e-05, + "loss": 1.9566, + "step": 5779 + }, + { + "epoch": 1.7740945365254759, + "grad_norm": 0.38304075598716736, + "learning_rate": 9.4412541125302e-05, + "loss": 1.9055, + "step": 5780 + }, + { + "epoch": 1.7744014732965008, + "grad_norm": 0.3932463526725769, + "learning_rate": 9.441025763560474e-05, + "loss": 1.9603, + "step": 5781 + }, + { + "epoch": 1.774708410067526, + "grad_norm": 0.4528409242630005, + "learning_rate": 9.44079737070193e-05, + "loss": 2.0095, + 
"step": 5782 + }, + { + "epoch": 1.7750153468385512, + "grad_norm": 0.42075392603874207, + "learning_rate": 9.440568933956822e-05, + "loss": 1.8818, + "step": 5783 + }, + { + "epoch": 1.7753222836095763, + "grad_norm": 0.4114269018173218, + "learning_rate": 9.44034045332741e-05, + "loss": 1.8524, + "step": 5784 + }, + { + "epoch": 1.7756292203806017, + "grad_norm": 0.4052261412143707, + "learning_rate": 9.44011192881595e-05, + "loss": 1.9759, + "step": 5785 + }, + { + "epoch": 1.7759361571516268, + "grad_norm": 0.3551998436450958, + "learning_rate": 9.439883360424702e-05, + "loss": 1.9534, + "step": 5786 + }, + { + "epoch": 1.776243093922652, + "grad_norm": 0.404109925031662, + "learning_rate": 9.439654748155924e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 1.7765500306936772, + "grad_norm": 0.4092860519886017, + "learning_rate": 9.439426092011875e-05, + "loss": 2.0341, + "step": 5788 + }, + { + "epoch": 1.7768569674647021, + "grad_norm": 0.36132386326789856, + "learning_rate": 9.439197391994819e-05, + "loss": 1.8746, + "step": 5789 + }, + { + "epoch": 1.7771639042357275, + "grad_norm": 0.34845319390296936, + "learning_rate": 9.438968648107009e-05, + "loss": 1.8646, + "step": 5790 + }, + { + "epoch": 1.7774708410067526, + "grad_norm": 0.33360353112220764, + "learning_rate": 9.43873986035071e-05, + "loss": 1.901, + "step": 5791 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.348147988319397, + "learning_rate": 9.438511028728181e-05, + "loss": 1.8703, + "step": 5792 + }, + { + "epoch": 1.778084714548803, + "grad_norm": 0.385662704706192, + "learning_rate": 9.438282153241686e-05, + "loss": 1.9806, + "step": 5793 + }, + { + "epoch": 1.7783916513198281, + "grad_norm": 0.39457234740257263, + "learning_rate": 9.438053233893484e-05, + "loss": 1.9324, + "step": 5794 + }, + { + "epoch": 1.7786985880908532, + "grad_norm": 0.35745853185653687, + "learning_rate": 9.43782427068584e-05, + "loss": 1.9754, + "step": 5795 + }, + { + "epoch": 1.7790055248618786, 
+ "grad_norm": 0.40866991877555847, + "learning_rate": 9.437595263621015e-05, + "loss": 1.959, + "step": 5796 + }, + { + "epoch": 1.7793124616329035, + "grad_norm": 0.3938930630683899, + "learning_rate": 9.437366212701274e-05, + "loss": 1.8746, + "step": 5797 + }, + { + "epoch": 1.7796193984039288, + "grad_norm": 0.36665603518486023, + "learning_rate": 9.437137117928878e-05, + "loss": 1.9209, + "step": 5798 + }, + { + "epoch": 1.779926335174954, + "grad_norm": 0.38514846563339233, + "learning_rate": 9.436907979306092e-05, + "loss": 1.8697, + "step": 5799 + }, + { + "epoch": 1.780233271945979, + "grad_norm": 0.4100898206233978, + "learning_rate": 9.43667879683518e-05, + "loss": 1.9606, + "step": 5800 + }, + { + "epoch": 1.7805402087170044, + "grad_norm": 0.40195250511169434, + "learning_rate": 9.43644957051841e-05, + "loss": 1.918, + "step": 5801 + }, + { + "epoch": 1.7808471454880295, + "grad_norm": 0.3943032920360565, + "learning_rate": 9.436220300358043e-05, + "loss": 1.9394, + "step": 5802 + }, + { + "epoch": 1.7811540822590546, + "grad_norm": 0.4171943664550781, + "learning_rate": 9.435990986356349e-05, + "loss": 1.9773, + "step": 5803 + }, + { + "epoch": 1.78146101903008, + "grad_norm": 0.4278806746006012, + "learning_rate": 9.435761628515589e-05, + "loss": 1.8696, + "step": 5804 + }, + { + "epoch": 1.7817679558011048, + "grad_norm": 0.4659377634525299, + "learning_rate": 9.435532226838036e-05, + "loss": 1.9387, + "step": 5805 + }, + { + "epoch": 1.7820748925721301, + "grad_norm": 0.4428139925003052, + "learning_rate": 9.435302781325952e-05, + "loss": 1.9673, + "step": 5806 + }, + { + "epoch": 1.7823818293431553, + "grad_norm": 0.4488377869129181, + "learning_rate": 9.435073291981607e-05, + "loss": 1.8493, + "step": 5807 + }, + { + "epoch": 1.7826887661141804, + "grad_norm": 0.5337218046188354, + "learning_rate": 9.434843758807268e-05, + "loss": 1.8631, + "step": 5808 + }, + { + "epoch": 1.7829957028852057, + "grad_norm": 0.5479410886764526, + "learning_rate": 
9.434614181805202e-05, + "loss": 1.8548, + "step": 5809 + }, + { + "epoch": 1.7833026396562308, + "grad_norm": 0.5154398679733276, + "learning_rate": 9.434384560977681e-05, + "loss": 1.9558, + "step": 5810 + }, + { + "epoch": 1.783609576427256, + "grad_norm": 0.44863855838775635, + "learning_rate": 9.434154896326974e-05, + "loss": 1.9287, + "step": 5811 + }, + { + "epoch": 1.7839165131982813, + "grad_norm": 0.43923139572143555, + "learning_rate": 9.433925187855348e-05, + "loss": 1.9475, + "step": 5812 + }, + { + "epoch": 1.7842234499693064, + "grad_norm": 0.3602962791919708, + "learning_rate": 9.433695435565073e-05, + "loss": 1.8705, + "step": 5813 + }, + { + "epoch": 1.7845303867403315, + "grad_norm": 0.3956433832645416, + "learning_rate": 9.433465639458423e-05, + "loss": 1.9402, + "step": 5814 + }, + { + "epoch": 1.7848373235113568, + "grad_norm": 0.3382786810398102, + "learning_rate": 9.433235799537666e-05, + "loss": 1.9176, + "step": 5815 + }, + { + "epoch": 1.7851442602823817, + "grad_norm": 0.3681669533252716, + "learning_rate": 9.433005915805076e-05, + "loss": 1.8628, + "step": 5816 + }, + { + "epoch": 1.785451197053407, + "grad_norm": 0.32285505533218384, + "learning_rate": 9.432775988262921e-05, + "loss": 1.8875, + "step": 5817 + }, + { + "epoch": 1.7857581338244322, + "grad_norm": 0.35673508048057556, + "learning_rate": 9.432546016913477e-05, + "loss": 1.925, + "step": 5818 + }, + { + "epoch": 1.7860650705954573, + "grad_norm": 0.363308310508728, + "learning_rate": 9.432316001759015e-05, + "loss": 1.8711, + "step": 5819 + }, + { + "epoch": 1.7863720073664826, + "grad_norm": 0.36789265275001526, + "learning_rate": 9.432085942801808e-05, + "loss": 1.8578, + "step": 5820 + }, + { + "epoch": 1.7866789441375077, + "grad_norm": 0.3791796565055847, + "learning_rate": 9.43185584004413e-05, + "loss": 1.9162, + "step": 5821 + }, + { + "epoch": 1.7869858809085328, + "grad_norm": 0.3819539248943329, + "learning_rate": 9.431625693488256e-05, + "loss": 1.9042, + 
"step": 5822 + }, + { + "epoch": 1.7872928176795582, + "grad_norm": 0.36675095558166504, + "learning_rate": 9.43139550313646e-05, + "loss": 1.9775, + "step": 5823 + }, + { + "epoch": 1.787599754450583, + "grad_norm": 0.40895935893058777, + "learning_rate": 9.431165268991013e-05, + "loss": 1.9249, + "step": 5824 + }, + { + "epoch": 1.7879066912216084, + "grad_norm": 0.3866878151893616, + "learning_rate": 9.430934991054197e-05, + "loss": 1.8706, + "step": 5825 + }, + { + "epoch": 1.7882136279926335, + "grad_norm": 0.4892923831939697, + "learning_rate": 9.430704669328283e-05, + "loss": 1.9177, + "step": 5826 + }, + { + "epoch": 1.7885205647636586, + "grad_norm": 0.46216699481010437, + "learning_rate": 9.430474303815548e-05, + "loss": 1.8606, + "step": 5827 + }, + { + "epoch": 1.788827501534684, + "grad_norm": 0.4253760874271393, + "learning_rate": 9.430243894518271e-05, + "loss": 1.9123, + "step": 5828 + }, + { + "epoch": 1.789134438305709, + "grad_norm": 0.3316090404987335, + "learning_rate": 9.430013441438726e-05, + "loss": 1.9138, + "step": 5829 + }, + { + "epoch": 1.7894413750767342, + "grad_norm": 0.36144545674324036, + "learning_rate": 9.429782944579191e-05, + "loss": 1.8851, + "step": 5830 + }, + { + "epoch": 1.7897483118477595, + "grad_norm": 0.47213298082351685, + "learning_rate": 9.429552403941946e-05, + "loss": 1.9614, + "step": 5831 + }, + { + "epoch": 1.7900552486187844, + "grad_norm": 0.5166186094284058, + "learning_rate": 9.429321819529267e-05, + "loss": 1.9297, + "step": 5832 + }, + { + "epoch": 1.7903621853898097, + "grad_norm": 0.5276393294334412, + "learning_rate": 9.429091191343433e-05, + "loss": 1.8803, + "step": 5833 + }, + { + "epoch": 1.7906691221608348, + "grad_norm": 0.5736613869667053, + "learning_rate": 9.428860519386726e-05, + "loss": 1.9256, + "step": 5834 + }, + { + "epoch": 1.79097605893186, + "grad_norm": 0.6111080050468445, + "learning_rate": 9.428629803661421e-05, + "loss": 1.9624, + "step": 5835 + }, + { + "epoch": 
1.7912829957028853, + "grad_norm": 0.45036107301712036, + "learning_rate": 9.428399044169802e-05, + "loss": 1.8625, + "step": 5836 + }, + { + "epoch": 1.7915899324739104, + "grad_norm": 0.35049325227737427, + "learning_rate": 9.428168240914148e-05, + "loss": 1.8988, + "step": 5837 + }, + { + "epoch": 1.7918968692449355, + "grad_norm": 0.4196048080921173, + "learning_rate": 9.427937393896739e-05, + "loss": 1.8593, + "step": 5838 + }, + { + "epoch": 1.7922038060159609, + "grad_norm": 0.5051491856575012, + "learning_rate": 9.42770650311986e-05, + "loss": 1.9283, + "step": 5839 + }, + { + "epoch": 1.7925107427869857, + "grad_norm": 0.5883297324180603, + "learning_rate": 9.427475568585787e-05, + "loss": 1.9211, + "step": 5840 + }, + { + "epoch": 1.792817679558011, + "grad_norm": 0.54326993227005, + "learning_rate": 9.427244590296807e-05, + "loss": 1.8856, + "step": 5841 + }, + { + "epoch": 1.7931246163290362, + "grad_norm": 0.3963034152984619, + "learning_rate": 9.4270135682552e-05, + "loss": 1.9302, + "step": 5842 + }, + { + "epoch": 1.7934315531000613, + "grad_norm": 0.3804232180118561, + "learning_rate": 9.426782502463251e-05, + "loss": 1.8615, + "step": 5843 + }, + { + "epoch": 1.7937384898710866, + "grad_norm": 0.5173880457878113, + "learning_rate": 9.426551392923244e-05, + "loss": 1.9702, + "step": 5844 + }, + { + "epoch": 1.7940454266421118, + "grad_norm": 0.5509253144264221, + "learning_rate": 9.42632023963746e-05, + "loss": 1.9091, + "step": 5845 + }, + { + "epoch": 1.7943523634131369, + "grad_norm": 0.4918860197067261, + "learning_rate": 9.426089042608186e-05, + "loss": 1.956, + "step": 5846 + }, + { + "epoch": 1.7946593001841622, + "grad_norm": 0.40632131695747375, + "learning_rate": 9.425857801837705e-05, + "loss": 1.978, + "step": 5847 + }, + { + "epoch": 1.794966236955187, + "grad_norm": 0.429643839597702, + "learning_rate": 9.425626517328303e-05, + "loss": 1.9293, + "step": 5848 + }, + { + "epoch": 1.7952731737262124, + "grad_norm": 0.46690109372138977, + 
"learning_rate": 9.425395189082267e-05, + "loss": 1.935, + "step": 5849 + }, + { + "epoch": 1.7955801104972375, + "grad_norm": 0.47745081782341003, + "learning_rate": 9.425163817101881e-05, + "loss": 1.9308, + "step": 5850 + }, + { + "epoch": 1.7958870472682626, + "grad_norm": 0.40971288084983826, + "learning_rate": 9.424932401389433e-05, + "loss": 1.8818, + "step": 5851 + }, + { + "epoch": 1.796193984039288, + "grad_norm": 0.44640809297561646, + "learning_rate": 9.424700941947209e-05, + "loss": 1.9298, + "step": 5852 + }, + { + "epoch": 1.796500920810313, + "grad_norm": 0.4068106412887573, + "learning_rate": 9.424469438777497e-05, + "loss": 1.9176, + "step": 5853 + }, + { + "epoch": 1.7968078575813382, + "grad_norm": 0.39228180050849915, + "learning_rate": 9.424237891882584e-05, + "loss": 1.9822, + "step": 5854 + }, + { + "epoch": 1.7971147943523635, + "grad_norm": 0.4050966203212738, + "learning_rate": 9.424006301264761e-05, + "loss": 2.0092, + "step": 5855 + }, + { + "epoch": 1.7974217311233884, + "grad_norm": 0.4402252733707428, + "learning_rate": 9.423774666926313e-05, + "loss": 1.9686, + "step": 5856 + }, + { + "epoch": 1.7977286678944138, + "grad_norm": 0.4362206757068634, + "learning_rate": 9.423542988869531e-05, + "loss": 1.9472, + "step": 5857 + }, + { + "epoch": 1.7980356046654389, + "grad_norm": 0.4363079369068146, + "learning_rate": 9.423311267096706e-05, + "loss": 1.9046, + "step": 5858 + }, + { + "epoch": 1.798342541436464, + "grad_norm": 0.4619371294975281, + "learning_rate": 9.423079501610123e-05, + "loss": 1.9322, + "step": 5859 + }, + { + "epoch": 1.7986494782074893, + "grad_norm": 0.3747330605983734, + "learning_rate": 9.42284769241208e-05, + "loss": 1.8859, + "step": 5860 + }, + { + "epoch": 1.7989564149785144, + "grad_norm": 0.46349939703941345, + "learning_rate": 9.422615839504863e-05, + "loss": 2.0343, + "step": 5861 + }, + { + "epoch": 1.7992633517495396, + "grad_norm": 0.4081406891345978, + "learning_rate": 9.422383942890762e-05, + "loss": 
1.9261, + "step": 5862 + }, + { + "epoch": 1.7995702885205649, + "grad_norm": 0.4200274348258972, + "learning_rate": 9.42215200257207e-05, + "loss": 1.8922, + "step": 5863 + }, + { + "epoch": 1.7998772252915898, + "grad_norm": 0.4353233277797699, + "learning_rate": 9.421920018551084e-05, + "loss": 1.9263, + "step": 5864 + }, + { + "epoch": 1.8001841620626151, + "grad_norm": 0.43261346220970154, + "learning_rate": 9.42168799083009e-05, + "loss": 1.872, + "step": 5865 + }, + { + "epoch": 1.8004910988336402, + "grad_norm": 0.41588231921195984, + "learning_rate": 9.421455919411385e-05, + "loss": 1.9427, + "step": 5866 + }, + { + "epoch": 1.8007980356046653, + "grad_norm": 0.36490678787231445, + "learning_rate": 9.421223804297261e-05, + "loss": 1.9458, + "step": 5867 + }, + { + "epoch": 1.8011049723756907, + "grad_norm": 0.40656644105911255, + "learning_rate": 9.42099164549001e-05, + "loss": 1.8791, + "step": 5868 + }, + { + "epoch": 1.8014119091467158, + "grad_norm": 0.35529834032058716, + "learning_rate": 9.42075944299193e-05, + "loss": 1.8889, + "step": 5869 + }, + { + "epoch": 1.801718845917741, + "grad_norm": 0.3530628979206085, + "learning_rate": 9.420527196805314e-05, + "loss": 1.9093, + "step": 5870 + }, + { + "epoch": 1.8020257826887662, + "grad_norm": 0.35012003779411316, + "learning_rate": 9.420294906932457e-05, + "loss": 1.84, + "step": 5871 + }, + { + "epoch": 1.8023327194597911, + "grad_norm": 0.37993142008781433, + "learning_rate": 9.420062573375654e-05, + "loss": 1.9943, + "step": 5872 + }, + { + "epoch": 1.8026396562308165, + "grad_norm": 0.34801873564720154, + "learning_rate": 9.419830196137204e-05, + "loss": 1.9092, + "step": 5873 + }, + { + "epoch": 1.8029465930018416, + "grad_norm": 0.3381052017211914, + "learning_rate": 9.4195977752194e-05, + "loss": 1.9212, + "step": 5874 + }, + { + "epoch": 1.8032535297728667, + "grad_norm": 0.3624991476535797, + "learning_rate": 9.419365310624542e-05, + "loss": 1.9491, + "step": 5875 + }, + { + "epoch": 
1.803560466543892, + "grad_norm": 0.3840768337249756, + "learning_rate": 9.419132802354925e-05, + "loss": 1.9531, + "step": 5876 + }, + { + "epoch": 1.8038674033149171, + "grad_norm": 0.377481073141098, + "learning_rate": 9.418900250412846e-05, + "loss": 1.9103, + "step": 5877 + }, + { + "epoch": 1.8041743400859422, + "grad_norm": 0.41462278366088867, + "learning_rate": 9.418667654800606e-05, + "loss": 1.944, + "step": 5878 + }, + { + "epoch": 1.8044812768569676, + "grad_norm": 0.5620705485343933, + "learning_rate": 9.418435015520502e-05, + "loss": 1.9184, + "step": 5879 + }, + { + "epoch": 1.8047882136279927, + "grad_norm": 0.6150699853897095, + "learning_rate": 9.418202332574833e-05, + "loss": 1.8971, + "step": 5880 + }, + { + "epoch": 1.8050951503990178, + "grad_norm": 0.5631645321846008, + "learning_rate": 9.4179696059659e-05, + "loss": 1.9668, + "step": 5881 + }, + { + "epoch": 1.8054020871700431, + "grad_norm": 0.4416831433773041, + "learning_rate": 9.417736835696001e-05, + "loss": 1.8531, + "step": 5882 + }, + { + "epoch": 1.805709023941068, + "grad_norm": 0.37340816855430603, + "learning_rate": 9.417504021767438e-05, + "loss": 1.8928, + "step": 5883 + }, + { + "epoch": 1.8060159607120934, + "grad_norm": 0.46018123626708984, + "learning_rate": 9.41727116418251e-05, + "loss": 1.8943, + "step": 5884 + }, + { + "epoch": 1.8063228974831185, + "grad_norm": 0.3852032721042633, + "learning_rate": 9.41703826294352e-05, + "loss": 1.8927, + "step": 5885 + }, + { + "epoch": 1.8066298342541436, + "grad_norm": 0.36783283948898315, + "learning_rate": 9.41680531805277e-05, + "loss": 1.9255, + "step": 5886 + }, + { + "epoch": 1.806936771025169, + "grad_norm": 0.39950302243232727, + "learning_rate": 9.416572329512559e-05, + "loss": 1.9215, + "step": 5887 + }, + { + "epoch": 1.807243707796194, + "grad_norm": 0.37217068672180176, + "learning_rate": 9.416339297325193e-05, + "loss": 1.8798, + "step": 5888 + }, + { + "epoch": 1.8075506445672191, + "grad_norm": 0.4334213137626648, 
+ "learning_rate": 9.416106221492974e-05, + "loss": 1.9583, + "step": 5889 + }, + { + "epoch": 1.8078575813382445, + "grad_norm": 0.39610370993614197, + "learning_rate": 9.415873102018204e-05, + "loss": 1.9526, + "step": 5890 + }, + { + "epoch": 1.8081645181092694, + "grad_norm": 0.4256335496902466, + "learning_rate": 9.41563993890319e-05, + "loss": 1.9633, + "step": 5891 + }, + { + "epoch": 1.8084714548802947, + "grad_norm": 0.48030543327331543, + "learning_rate": 9.41540673215023e-05, + "loss": 1.8869, + "step": 5892 + }, + { + "epoch": 1.8087783916513198, + "grad_norm": 0.5549675822257996, + "learning_rate": 9.415173481761634e-05, + "loss": 1.9894, + "step": 5893 + }, + { + "epoch": 1.809085328422345, + "grad_norm": 0.5706361532211304, + "learning_rate": 9.414940187739708e-05, + "loss": 1.9721, + "step": 5894 + }, + { + "epoch": 1.8093922651933703, + "grad_norm": 0.4263947606086731, + "learning_rate": 9.414706850086754e-05, + "loss": 1.9408, + "step": 5895 + }, + { + "epoch": 1.8096992019643954, + "grad_norm": 0.3934611976146698, + "learning_rate": 9.414473468805078e-05, + "loss": 1.9444, + "step": 5896 + }, + { + "epoch": 1.8100061387354205, + "grad_norm": 0.4267776608467102, + "learning_rate": 9.41424004389699e-05, + "loss": 1.8774, + "step": 5897 + }, + { + "epoch": 1.8103130755064458, + "grad_norm": 0.46216219663619995, + "learning_rate": 9.414006575364795e-05, + "loss": 1.9648, + "step": 5898 + }, + { + "epoch": 1.8106200122774707, + "grad_norm": 0.4730767607688904, + "learning_rate": 9.413773063210798e-05, + "loss": 1.9528, + "step": 5899 + }, + { + "epoch": 1.810926949048496, + "grad_norm": 0.36383283138275146, + "learning_rate": 9.413539507437308e-05, + "loss": 1.843, + "step": 5900 + }, + { + "epoch": 1.8112338858195212, + "grad_norm": 0.343729168176651, + "learning_rate": 9.413305908046636e-05, + "loss": 1.9101, + "step": 5901 + }, + { + "epoch": 1.8115408225905463, + "grad_norm": 0.3774524927139282, + "learning_rate": 9.413072265041087e-05, + "loss": 
1.8705, + "step": 5902 + }, + { + "epoch": 1.8118477593615716, + "grad_norm": 0.37734711170196533, + "learning_rate": 9.412838578422972e-05, + "loss": 1.868, + "step": 5903 + }, + { + "epoch": 1.8121546961325967, + "grad_norm": 0.3705524206161499, + "learning_rate": 9.4126048481946e-05, + "loss": 1.9587, + "step": 5904 + }, + { + "epoch": 1.8124616329036218, + "grad_norm": 0.45906612277030945, + "learning_rate": 9.41237107435828e-05, + "loss": 1.9872, + "step": 5905 + }, + { + "epoch": 1.8127685696746472, + "grad_norm": 0.5013484954833984, + "learning_rate": 9.412137256916323e-05, + "loss": 1.8692, + "step": 5906 + }, + { + "epoch": 1.813075506445672, + "grad_norm": 0.5123991370201111, + "learning_rate": 9.411903395871038e-05, + "loss": 1.9574, + "step": 5907 + }, + { + "epoch": 1.8133824432166974, + "grad_norm": 0.45425844192504883, + "learning_rate": 9.411669491224739e-05, + "loss": 1.9295, + "step": 5908 + }, + { + "epoch": 1.8136893799877225, + "grad_norm": 0.3939640522003174, + "learning_rate": 9.411435542979736e-05, + "loss": 1.9258, + "step": 5909 + }, + { + "epoch": 1.8139963167587476, + "grad_norm": 0.5032235383987427, + "learning_rate": 9.411201551138342e-05, + "loss": 1.9012, + "step": 5910 + }, + { + "epoch": 1.814303253529773, + "grad_norm": 0.6334826946258545, + "learning_rate": 9.410967515702869e-05, + "loss": 1.9699, + "step": 5911 + }, + { + "epoch": 1.814610190300798, + "grad_norm": 0.56645667552948, + "learning_rate": 9.41073343667563e-05, + "loss": 1.9346, + "step": 5912 + }, + { + "epoch": 1.8149171270718232, + "grad_norm": 0.461668461561203, + "learning_rate": 9.410499314058936e-05, + "loss": 1.9549, + "step": 5913 + }, + { + "epoch": 1.8152240638428485, + "grad_norm": 0.39917534589767456, + "learning_rate": 9.410265147855104e-05, + "loss": 1.9503, + "step": 5914 + }, + { + "epoch": 1.8155310006138734, + "grad_norm": 0.4409043788909912, + "learning_rate": 9.410030938066448e-05, + "loss": 1.897, + "step": 5915 + }, + { + "epoch": 
1.8158379373848987, + "grad_norm": 0.5793384313583374, + "learning_rate": 9.40979668469528e-05, + "loss": 1.9526, + "step": 5916 + }, + { + "epoch": 1.8161448741559238, + "grad_norm": 0.4642924666404724, + "learning_rate": 9.409562387743917e-05, + "loss": 1.8993, + "step": 5917 + }, + { + "epoch": 1.816451810926949, + "grad_norm": 0.3799861669540405, + "learning_rate": 9.409328047214674e-05, + "loss": 1.9412, + "step": 5918 + }, + { + "epoch": 1.8167587476979743, + "grad_norm": 0.40758320689201355, + "learning_rate": 9.409093663109866e-05, + "loss": 1.9908, + "step": 5919 + }, + { + "epoch": 1.8170656844689994, + "grad_norm": 0.41446420550346375, + "learning_rate": 9.40885923543181e-05, + "loss": 1.8711, + "step": 5920 + }, + { + "epoch": 1.8173726212400245, + "grad_norm": 0.4744807183742523, + "learning_rate": 9.408624764182823e-05, + "loss": 2.0297, + "step": 5921 + }, + { + "epoch": 1.8176795580110499, + "grad_norm": 0.43377524614334106, + "learning_rate": 9.408390249365224e-05, + "loss": 1.9613, + "step": 5922 + }, + { + "epoch": 1.8179864947820747, + "grad_norm": 0.38450872898101807, + "learning_rate": 9.408155690981328e-05, + "loss": 1.8716, + "step": 5923 + }, + { + "epoch": 1.8182934315531, + "grad_norm": 0.4989684820175171, + "learning_rate": 9.407921089033452e-05, + "loss": 1.9909, + "step": 5924 + }, + { + "epoch": 1.8186003683241252, + "grad_norm": 0.4137042462825775, + "learning_rate": 9.407686443523918e-05, + "loss": 1.8778, + "step": 5925 + }, + { + "epoch": 1.8189073050951503, + "grad_norm": 0.3816729485988617, + "learning_rate": 9.407451754455042e-05, + "loss": 1.9355, + "step": 5926 + }, + { + "epoch": 1.8192142418661756, + "grad_norm": 0.48876214027404785, + "learning_rate": 9.407217021829145e-05, + "loss": 1.9256, + "step": 5927 + }, + { + "epoch": 1.8195211786372008, + "grad_norm": 0.5273690223693848, + "learning_rate": 9.406982245648547e-05, + "loss": 1.9456, + "step": 5928 + }, + { + "epoch": 1.8198281154082259, + "grad_norm": 
0.4148990511894226, + "learning_rate": 9.406747425915566e-05, + "loss": 1.9184, + "step": 5929 + }, + { + "epoch": 1.8201350521792512, + "grad_norm": 0.4484131634235382, + "learning_rate": 9.406512562632526e-05, + "loss": 1.9305, + "step": 5930 + }, + { + "epoch": 1.820441988950276, + "grad_norm": 0.6036938428878784, + "learning_rate": 9.406277655801744e-05, + "loss": 1.9294, + "step": 5931 + }, + { + "epoch": 1.8207489257213014, + "grad_norm": 0.5399366021156311, + "learning_rate": 9.406042705425543e-05, + "loss": 1.9265, + "step": 5932 + }, + { + "epoch": 1.8210558624923265, + "grad_norm": 0.3591126501560211, + "learning_rate": 9.405807711506249e-05, + "loss": 1.8634, + "step": 5933 + }, + { + "epoch": 1.8213627992633517, + "grad_norm": 0.4474995732307434, + "learning_rate": 9.405572674046179e-05, + "loss": 2.0084, + "step": 5934 + }, + { + "epoch": 1.821669736034377, + "grad_norm": 0.4841657876968384, + "learning_rate": 9.405337593047657e-05, + "loss": 1.8885, + "step": 5935 + }, + { + "epoch": 1.821976672805402, + "grad_norm": 0.4786655008792877, + "learning_rate": 9.405102468513008e-05, + "loss": 1.9273, + "step": 5936 + }, + { + "epoch": 1.8222836095764272, + "grad_norm": 0.4675963521003723, + "learning_rate": 9.404867300444553e-05, + "loss": 1.9267, + "step": 5937 + }, + { + "epoch": 1.8225905463474525, + "grad_norm": 0.40235474705696106, + "learning_rate": 9.404632088844619e-05, + "loss": 2.0208, + "step": 5938 + }, + { + "epoch": 1.8228974831184774, + "grad_norm": 0.40626317262649536, + "learning_rate": 9.404396833715527e-05, + "loss": 1.9079, + "step": 5939 + }, + { + "epoch": 1.8232044198895028, + "grad_norm": 0.4164435565471649, + "learning_rate": 9.404161535059607e-05, + "loss": 1.8818, + "step": 5940 + }, + { + "epoch": 1.8235113566605279, + "grad_norm": 0.44487184286117554, + "learning_rate": 9.40392619287918e-05, + "loss": 1.9184, + "step": 5941 + }, + { + "epoch": 1.823818293431553, + "grad_norm": 0.4009508192539215, + "learning_rate": 
9.403690807176572e-05, + "loss": 1.8814, + "step": 5942 + }, + { + "epoch": 1.8241252302025783, + "grad_norm": 0.3518575429916382, + "learning_rate": 9.403455377954112e-05, + "loss": 1.9319, + "step": 5943 + }, + { + "epoch": 1.8244321669736034, + "grad_norm": 0.36712533235549927, + "learning_rate": 9.403219905214125e-05, + "loss": 1.8609, + "step": 5944 + }, + { + "epoch": 1.8247391037446286, + "grad_norm": 0.3926267623901367, + "learning_rate": 9.402984388958937e-05, + "loss": 1.9328, + "step": 5945 + }, + { + "epoch": 1.825046040515654, + "grad_norm": 0.370781272649765, + "learning_rate": 9.402748829190878e-05, + "loss": 1.9848, + "step": 5946 + }, + { + "epoch": 1.8253529772866788, + "grad_norm": 0.38226625323295593, + "learning_rate": 9.402513225912273e-05, + "loss": 1.8933, + "step": 5947 + }, + { + "epoch": 1.8256599140577041, + "grad_norm": 0.40101101994514465, + "learning_rate": 9.402277579125451e-05, + "loss": 1.9231, + "step": 5948 + }, + { + "epoch": 1.8259668508287292, + "grad_norm": 0.41038060188293457, + "learning_rate": 9.402041888832744e-05, + "loss": 1.9445, + "step": 5949 + }, + { + "epoch": 1.8262737875997543, + "grad_norm": 0.37442395091056824, + "learning_rate": 9.401806155036479e-05, + "loss": 1.9271, + "step": 5950 + }, + { + "epoch": 1.8265807243707797, + "grad_norm": 0.43142926692962646, + "learning_rate": 9.401570377738984e-05, + "loss": 1.9489, + "step": 5951 + }, + { + "epoch": 1.8268876611418048, + "grad_norm": 0.38730981945991516, + "learning_rate": 9.401334556942591e-05, + "loss": 1.8802, + "step": 5952 + }, + { + "epoch": 1.82719459791283, + "grad_norm": 0.34189531207084656, + "learning_rate": 9.40109869264963e-05, + "loss": 1.9116, + "step": 5953 + }, + { + "epoch": 1.8275015346838552, + "grad_norm": 0.3632197678089142, + "learning_rate": 9.400862784862434e-05, + "loss": 1.8456, + "step": 5954 + }, + { + "epoch": 1.8278084714548803, + "grad_norm": 0.4008798599243164, + "learning_rate": 9.400626833583331e-05, + "loss": 1.9984, + 
"step": 5955 + }, + { + "epoch": 1.8281154082259055, + "grad_norm": 0.4087502062320709, + "learning_rate": 9.400390838814655e-05, + "loss": 1.8177, + "step": 5956 + }, + { + "epoch": 1.8284223449969308, + "grad_norm": 0.3753478229045868, + "learning_rate": 9.400154800558737e-05, + "loss": 1.864, + "step": 5957 + }, + { + "epoch": 1.8287292817679557, + "grad_norm": 0.37939608097076416, + "learning_rate": 9.399918718817911e-05, + "loss": 1.9331, + "step": 5958 + }, + { + "epoch": 1.829036218538981, + "grad_norm": 0.41382426023483276, + "learning_rate": 9.399682593594507e-05, + "loss": 1.9014, + "step": 5959 + }, + { + "epoch": 1.8293431553100061, + "grad_norm": 0.46129345893859863, + "learning_rate": 9.399446424890864e-05, + "loss": 1.9591, + "step": 5960 + }, + { + "epoch": 1.8296500920810312, + "grad_norm": 0.487870454788208, + "learning_rate": 9.399210212709312e-05, + "loss": 1.9073, + "step": 5961 + }, + { + "epoch": 1.8299570288520566, + "grad_norm": 0.4693615138530731, + "learning_rate": 9.398973957052185e-05, + "loss": 1.8336, + "step": 5962 + }, + { + "epoch": 1.8302639656230817, + "grad_norm": 0.38947850465774536, + "learning_rate": 9.39873765792182e-05, + "loss": 1.8599, + "step": 5963 + }, + { + "epoch": 1.8305709023941068, + "grad_norm": 0.372242271900177, + "learning_rate": 9.398501315320551e-05, + "loss": 1.9653, + "step": 5964 + }, + { + "epoch": 1.8308778391651321, + "grad_norm": 0.37679895758628845, + "learning_rate": 9.398264929250714e-05, + "loss": 1.8886, + "step": 5965 + }, + { + "epoch": 1.831184775936157, + "grad_norm": 0.347989022731781, + "learning_rate": 9.398028499714645e-05, + "loss": 1.8665, + "step": 5966 + }, + { + "epoch": 1.8314917127071824, + "grad_norm": 0.4297877550125122, + "learning_rate": 9.397792026714681e-05, + "loss": 1.9646, + "step": 5967 + }, + { + "epoch": 1.8317986494782075, + "grad_norm": 0.3698103427886963, + "learning_rate": 9.397555510253158e-05, + "loss": 1.9537, + "step": 5968 + }, + { + "epoch": 
1.8321055862492326, + "grad_norm": 0.3268609941005707, + "learning_rate": 9.397318950332414e-05, + "loss": 1.8679, + "step": 5969 + }, + { + "epoch": 1.832412523020258, + "grad_norm": 0.3487341105937958, + "learning_rate": 9.397082346954788e-05, + "loss": 1.8936, + "step": 5970 + }, + { + "epoch": 1.832719459791283, + "grad_norm": 0.36363741755485535, + "learning_rate": 9.396845700122616e-05, + "loss": 1.8926, + "step": 5971 + }, + { + "epoch": 1.8330263965623081, + "grad_norm": 0.42258647084236145, + "learning_rate": 9.396609009838237e-05, + "loss": 1.9439, + "step": 5972 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4087521433830261, + "learning_rate": 9.396372276103992e-05, + "loss": 1.8868, + "step": 5973 + }, + { + "epoch": 1.8336402701043584, + "grad_norm": 0.41857820749282837, + "learning_rate": 9.396135498922218e-05, + "loss": 1.9824, + "step": 5974 + }, + { + "epoch": 1.8339472068753837, + "grad_norm": 0.44207099080085754, + "learning_rate": 9.395898678295259e-05, + "loss": 1.9183, + "step": 5975 + }, + { + "epoch": 1.8342541436464088, + "grad_norm": 0.38295891880989075, + "learning_rate": 9.39566181422545e-05, + "loss": 1.8882, + "step": 5976 + }, + { + "epoch": 1.834561080417434, + "grad_norm": 0.4440687298774719, + "learning_rate": 9.395424906715136e-05, + "loss": 1.9401, + "step": 5977 + }, + { + "epoch": 1.8348680171884593, + "grad_norm": 0.3867577016353607, + "learning_rate": 9.395187955766655e-05, + "loss": 1.9243, + "step": 5978 + }, + { + "epoch": 1.8351749539594844, + "grad_norm": 0.47536182403564453, + "learning_rate": 9.394950961382354e-05, + "loss": 1.9248, + "step": 5979 + }, + { + "epoch": 1.8354818907305095, + "grad_norm": 0.4071936011314392, + "learning_rate": 9.394713923564569e-05, + "loss": 1.8701, + "step": 5980 + }, + { + "epoch": 1.8357888275015348, + "grad_norm": 0.41844502091407776, + "learning_rate": 9.394476842315645e-05, + "loss": 2.0087, + "step": 5981 + }, + { + "epoch": 1.8360957642725597, + "grad_norm": 
0.40439316630363464, + "learning_rate": 9.394239717637927e-05, + "loss": 1.8945, + "step": 5982 + }, + { + "epoch": 1.836402701043585, + "grad_norm": 0.36738064885139465, + "learning_rate": 9.394002549533754e-05, + "loss": 1.9361, + "step": 5983 + }, + { + "epoch": 1.8367096378146102, + "grad_norm": 0.4733370542526245, + "learning_rate": 9.393765338005476e-05, + "loss": 1.9301, + "step": 5984 + }, + { + "epoch": 1.8370165745856353, + "grad_norm": 0.4467030465602875, + "learning_rate": 9.39352808305543e-05, + "loss": 1.8691, + "step": 5985 + }, + { + "epoch": 1.8373235113566606, + "grad_norm": 0.5276423692703247, + "learning_rate": 9.393290784685967e-05, + "loss": 1.9211, + "step": 5986 + }, + { + "epoch": 1.8376304481276857, + "grad_norm": 0.4791669547557831, + "learning_rate": 9.393053442899428e-05, + "loss": 1.9876, + "step": 5987 + }, + { + "epoch": 1.8379373848987108, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.392816057698159e-05, + "loss": 1.9483, + "step": 5988 + }, + { + "epoch": 1.8382443216697362, + "grad_norm": 0.3979242742061615, + "learning_rate": 9.39257862908451e-05, + "loss": 1.8962, + "step": 5989 + }, + { + "epoch": 1.838551258440761, + "grad_norm": 0.47706472873687744, + "learning_rate": 9.392341157060822e-05, + "loss": 1.9028, + "step": 5990 + }, + { + "epoch": 1.8388581952117864, + "grad_norm": 0.5254244804382324, + "learning_rate": 9.392103641629446e-05, + "loss": 1.9244, + "step": 5991 + }, + { + "epoch": 1.8391651319828115, + "grad_norm": 0.49596595764160156, + "learning_rate": 9.391866082792727e-05, + "loss": 1.8731, + "step": 5992 + }, + { + "epoch": 1.8394720687538366, + "grad_norm": 0.3787136971950531, + "learning_rate": 9.391628480553013e-05, + "loss": 1.9404, + "step": 5993 + }, + { + "epoch": 1.839779005524862, + "grad_norm": 0.3986566960811615, + "learning_rate": 9.391390834912651e-05, + "loss": 1.9319, + "step": 5994 + }, + { + "epoch": 1.840085942295887, + "grad_norm": 0.4466419219970703, + "learning_rate": 
9.391153145873992e-05, + "loss": 1.9755, + "step": 5995 + }, + { + "epoch": 1.8403928790669122, + "grad_norm": 0.43374884128570557, + "learning_rate": 9.390915413439385e-05, + "loss": 1.913, + "step": 5996 + }, + { + "epoch": 1.8406998158379375, + "grad_norm": 0.3897610902786255, + "learning_rate": 9.390677637611176e-05, + "loss": 1.9488, + "step": 5997 + }, + { + "epoch": 1.8410067526089624, + "grad_norm": 0.38407614827156067, + "learning_rate": 9.390439818391718e-05, + "loss": 1.8712, + "step": 5998 + }, + { + "epoch": 1.8413136893799877, + "grad_norm": 0.4159192740917206, + "learning_rate": 9.390201955783362e-05, + "loss": 1.9254, + "step": 5999 + }, + { + "epoch": 1.8416206261510129, + "grad_norm": 0.42220592498779297, + "learning_rate": 9.389964049788455e-05, + "loss": 1.9684, + "step": 6000 + }, + { + "epoch": 1.841927562922038, + "grad_norm": 0.3792029619216919, + "learning_rate": 9.389726100409351e-05, + "loss": 1.9091, + "step": 6001 + }, + { + "epoch": 1.8422344996930633, + "grad_norm": 0.37374788522720337, + "learning_rate": 9.389488107648401e-05, + "loss": 1.9498, + "step": 6002 + }, + { + "epoch": 1.8425414364640884, + "grad_norm": 0.4237084686756134, + "learning_rate": 9.389250071507958e-05, + "loss": 1.9177, + "step": 6003 + }, + { + "epoch": 1.8428483732351135, + "grad_norm": 0.5332993865013123, + "learning_rate": 9.38901199199037e-05, + "loss": 1.8994, + "step": 6004 + }, + { + "epoch": 1.8431553100061389, + "grad_norm": 0.42202335596084595, + "learning_rate": 9.388773869097996e-05, + "loss": 1.8365, + "step": 6005 + }, + { + "epoch": 1.8434622467771637, + "grad_norm": 0.3581100106239319, + "learning_rate": 9.388535702833185e-05, + "loss": 1.8536, + "step": 6006 + }, + { + "epoch": 1.843769183548189, + "grad_norm": 0.3670782446861267, + "learning_rate": 9.388297493198293e-05, + "loss": 1.8965, + "step": 6007 + }, + { + "epoch": 1.8440761203192142, + "grad_norm": 0.39181825518608093, + "learning_rate": 9.38805924019567e-05, + "loss": 1.8674, + 
"step": 6008 + }, + { + "epoch": 1.8443830570902393, + "grad_norm": 0.46757015585899353, + "learning_rate": 9.387820943827676e-05, + "loss": 1.8945, + "step": 6009 + }, + { + "epoch": 1.8446899938612646, + "grad_norm": 0.4656504690647125, + "learning_rate": 9.387582604096664e-05, + "loss": 1.8626, + "step": 6010 + }, + { + "epoch": 1.8449969306322898, + "grad_norm": 0.4699888825416565, + "learning_rate": 9.387344221004988e-05, + "loss": 1.9396, + "step": 6011 + }, + { + "epoch": 1.8453038674033149, + "grad_norm": 0.36591392755508423, + "learning_rate": 9.387105794555006e-05, + "loss": 1.8031, + "step": 6012 + }, + { + "epoch": 1.8456108041743402, + "grad_norm": 0.3563486933708191, + "learning_rate": 9.386867324749073e-05, + "loss": 1.8658, + "step": 6013 + }, + { + "epoch": 1.845917740945365, + "grad_norm": 0.4490883946418762, + "learning_rate": 9.386628811589547e-05, + "loss": 1.9809, + "step": 6014 + }, + { + "epoch": 1.8462246777163904, + "grad_norm": 0.39862295985221863, + "learning_rate": 9.38639025507878e-05, + "loss": 1.9268, + "step": 6015 + }, + { + "epoch": 1.8465316144874155, + "grad_norm": 0.3579883575439453, + "learning_rate": 9.386151655219138e-05, + "loss": 1.8538, + "step": 6016 + }, + { + "epoch": 1.8468385512584407, + "grad_norm": 0.411685973405838, + "learning_rate": 9.385913012012973e-05, + "loss": 1.9034, + "step": 6017 + }, + { + "epoch": 1.847145488029466, + "grad_norm": 0.44486066699028015, + "learning_rate": 9.385674325462643e-05, + "loss": 1.9279, + "step": 6018 + }, + { + "epoch": 1.847452424800491, + "grad_norm": 0.42794153094291687, + "learning_rate": 9.385435595570511e-05, + "loss": 1.9117, + "step": 6019 + }, + { + "epoch": 1.8477593615715162, + "grad_norm": 0.3652110695838928, + "learning_rate": 9.385196822338933e-05, + "loss": 1.9636, + "step": 6020 + }, + { + "epoch": 1.8480662983425415, + "grad_norm": 0.36490142345428467, + "learning_rate": 9.38495800577027e-05, + "loss": 1.9468, + "step": 6021 + }, + { + "epoch": 
1.8483732351135667, + "grad_norm": 0.3946039080619812, + "learning_rate": 9.384719145866882e-05, + "loss": 1.8851, + "step": 6022 + }, + { + "epoch": 1.8486801718845918, + "grad_norm": 0.4236997067928314, + "learning_rate": 9.38448024263113e-05, + "loss": 2.0256, + "step": 6023 + }, + { + "epoch": 1.848987108655617, + "grad_norm": 0.34637942910194397, + "learning_rate": 9.384241296065374e-05, + "loss": 1.9032, + "step": 6024 + }, + { + "epoch": 1.849294045426642, + "grad_norm": 0.4096907079219818, + "learning_rate": 9.384002306171975e-05, + "loss": 1.9762, + "step": 6025 + }, + { + "epoch": 1.8496009821976673, + "grad_norm": 0.38225218653678894, + "learning_rate": 9.383763272953297e-05, + "loss": 2.023, + "step": 6026 + }, + { + "epoch": 1.8499079189686924, + "grad_norm": 0.4297153055667877, + "learning_rate": 9.3835241964117e-05, + "loss": 1.977, + "step": 6027 + }, + { + "epoch": 1.8502148557397176, + "grad_norm": 0.5225360989570618, + "learning_rate": 9.383285076549548e-05, + "loss": 1.919, + "step": 6028 + }, + { + "epoch": 1.850521792510743, + "grad_norm": 0.6799743175506592, + "learning_rate": 9.383045913369205e-05, + "loss": 1.9382, + "step": 6029 + }, + { + "epoch": 1.850828729281768, + "grad_norm": 0.6274817585945129, + "learning_rate": 9.382806706873031e-05, + "loss": 1.9782, + "step": 6030 + }, + { + "epoch": 1.8511356660527931, + "grad_norm": 0.4939708113670349, + "learning_rate": 9.382567457063392e-05, + "loss": 1.8794, + "step": 6031 + }, + { + "epoch": 1.8514426028238185, + "grad_norm": 0.3876135051250458, + "learning_rate": 9.382328163942656e-05, + "loss": 2.0153, + "step": 6032 + }, + { + "epoch": 1.8517495395948433, + "grad_norm": 0.592051088809967, + "learning_rate": 9.38208882751318e-05, + "loss": 1.9277, + "step": 6033 + }, + { + "epoch": 1.8520564763658687, + "grad_norm": 0.660763144493103, + "learning_rate": 9.381849447777337e-05, + "loss": 1.9177, + "step": 6034 + }, + { + "epoch": 1.8523634131368938, + "grad_norm": 0.5823151469230652, + 
"learning_rate": 9.381610024737489e-05, + "loss": 1.9363, + "step": 6035 + }, + { + "epoch": 1.852670349907919, + "grad_norm": 0.39519962668418884, + "learning_rate": 9.381370558396004e-05, + "loss": 1.8627, + "step": 6036 + }, + { + "epoch": 1.8529772866789442, + "grad_norm": 0.44657328724861145, + "learning_rate": 9.381131048755244e-05, + "loss": 1.9075, + "step": 6037 + }, + { + "epoch": 1.8532842234499693, + "grad_norm": 0.540743887424469, + "learning_rate": 9.380891495817581e-05, + "loss": 1.9518, + "step": 6038 + }, + { + "epoch": 1.8535911602209945, + "grad_norm": 0.4388680160045624, + "learning_rate": 9.38065189958538e-05, + "loss": 1.8485, + "step": 6039 + }, + { + "epoch": 1.8538980969920198, + "grad_norm": 0.37645572423934937, + "learning_rate": 9.38041226006101e-05, + "loss": 1.9542, + "step": 6040 + }, + { + "epoch": 1.8542050337630447, + "grad_norm": 0.4405656158924103, + "learning_rate": 9.380172577246837e-05, + "loss": 1.9054, + "step": 6041 + }, + { + "epoch": 1.85451197053407, + "grad_norm": 0.45483505725860596, + "learning_rate": 9.379932851145232e-05, + "loss": 1.9077, + "step": 6042 + }, + { + "epoch": 1.8548189073050951, + "grad_norm": 0.40666261315345764, + "learning_rate": 9.379693081758564e-05, + "loss": 1.9977, + "step": 6043 + }, + { + "epoch": 1.8551258440761202, + "grad_norm": 0.365241140127182, + "learning_rate": 9.379453269089202e-05, + "loss": 1.9047, + "step": 6044 + }, + { + "epoch": 1.8554327808471456, + "grad_norm": 0.40797916054725647, + "learning_rate": 9.379213413139516e-05, + "loss": 1.9621, + "step": 6045 + }, + { + "epoch": 1.8557397176181707, + "grad_norm": 0.4525306820869446, + "learning_rate": 9.378973513911875e-05, + "loss": 1.9479, + "step": 6046 + }, + { + "epoch": 1.8560466543891958, + "grad_norm": 0.45422959327697754, + "learning_rate": 9.378733571408652e-05, + "loss": 1.9754, + "step": 6047 + }, + { + "epoch": 1.8563535911602211, + "grad_norm": 0.381862998008728, + "learning_rate": 9.378493585632217e-05, + "loss": 
1.8542, + "step": 6048 + }, + { + "epoch": 1.856660527931246, + "grad_norm": 0.40489691495895386, + "learning_rate": 9.378253556584944e-05, + "loss": 1.9331, + "step": 6049 + }, + { + "epoch": 1.8569674647022714, + "grad_norm": 0.40347445011138916, + "learning_rate": 9.378013484269201e-05, + "loss": 1.9414, + "step": 6050 + }, + { + "epoch": 1.8572744014732965, + "grad_norm": 0.35401904582977295, + "learning_rate": 9.377773368687363e-05, + "loss": 1.8094, + "step": 6051 + }, + { + "epoch": 1.8575813382443216, + "grad_norm": 0.4061582684516907, + "learning_rate": 9.377533209841805e-05, + "loss": 1.8686, + "step": 6052 + }, + { + "epoch": 1.857888275015347, + "grad_norm": 0.44419318437576294, + "learning_rate": 9.377293007734895e-05, + "loss": 1.929, + "step": 6053 + }, + { + "epoch": 1.858195211786372, + "grad_norm": 0.41038191318511963, + "learning_rate": 9.37705276236901e-05, + "loss": 1.9636, + "step": 6054 + }, + { + "epoch": 1.8585021485573971, + "grad_norm": 0.4431348145008087, + "learning_rate": 9.376812473746526e-05, + "loss": 1.953, + "step": 6055 + }, + { + "epoch": 1.8588090853284225, + "grad_norm": 0.42502057552337646, + "learning_rate": 9.376572141869814e-05, + "loss": 1.95, + "step": 6056 + }, + { + "epoch": 1.8591160220994474, + "grad_norm": 0.40050914883613586, + "learning_rate": 9.376331766741253e-05, + "loss": 1.9507, + "step": 6057 + }, + { + "epoch": 1.8594229588704727, + "grad_norm": 0.3863932490348816, + "learning_rate": 9.376091348363216e-05, + "loss": 1.8746, + "step": 6058 + }, + { + "epoch": 1.8597298956414978, + "grad_norm": 0.37295350432395935, + "learning_rate": 9.375850886738077e-05, + "loss": 1.8778, + "step": 6059 + }, + { + "epoch": 1.860036832412523, + "grad_norm": 0.37965887784957886, + "learning_rate": 9.375610381868217e-05, + "loss": 1.8511, + "step": 6060 + }, + { + "epoch": 1.8603437691835483, + "grad_norm": 0.3740752637386322, + "learning_rate": 9.37536983375601e-05, + "loss": 1.8988, + "step": 6061 + }, + { + "epoch": 
1.8606507059545734, + "grad_norm": 0.40466782450675964, + "learning_rate": 9.375129242403834e-05, + "loss": 1.9195, + "step": 6062 + }, + { + "epoch": 1.8609576427255985, + "grad_norm": 0.3658956289291382, + "learning_rate": 9.374888607814067e-05, + "loss": 1.9598, + "step": 6063 + }, + { + "epoch": 1.8612645794966238, + "grad_norm": 0.3752783238887787, + "learning_rate": 9.374647929989085e-05, + "loss": 1.9791, + "step": 6064 + }, + { + "epoch": 1.8615715162676487, + "grad_norm": 0.408774733543396, + "learning_rate": 9.374407208931268e-05, + "loss": 1.88, + "step": 6065 + }, + { + "epoch": 1.861878453038674, + "grad_norm": 0.3968205749988556, + "learning_rate": 9.374166444642997e-05, + "loss": 1.8755, + "step": 6066 + }, + { + "epoch": 1.8621853898096992, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.373925637126648e-05, + "loss": 1.9296, + "step": 6067 + }, + { + "epoch": 1.8624923265807243, + "grad_norm": 0.34285619854927063, + "learning_rate": 9.373684786384604e-05, + "loss": 2.0149, + "step": 6068 + }, + { + "epoch": 1.8627992633517496, + "grad_norm": 0.38841512799263, + "learning_rate": 9.373443892419242e-05, + "loss": 1.9134, + "step": 6069 + }, + { + "epoch": 1.8631062001227747, + "grad_norm": 0.4744485914707184, + "learning_rate": 9.373202955232943e-05, + "loss": 1.9164, + "step": 6070 + }, + { + "epoch": 1.8634131368937998, + "grad_norm": 0.522659420967102, + "learning_rate": 9.372961974828092e-05, + "loss": 1.9155, + "step": 6071 + }, + { + "epoch": 1.8637200736648252, + "grad_norm": 0.5794001817703247, + "learning_rate": 9.372720951207066e-05, + "loss": 1.9003, + "step": 6072 + }, + { + "epoch": 1.86402701043585, + "grad_norm": 0.5135447978973389, + "learning_rate": 9.372479884372247e-05, + "loss": 1.948, + "step": 6073 + }, + { + "epoch": 1.8643339472068754, + "grad_norm": 0.4060198664665222, + "learning_rate": 9.372238774326021e-05, + "loss": 1.8634, + "step": 6074 + }, + { + "epoch": 1.8646408839779005, + "grad_norm": 0.3880244195461273, + 
"learning_rate": 9.371997621070769e-05, + "loss": 1.8729, + "step": 6075 + }, + { + "epoch": 1.8649478207489256, + "grad_norm": 0.4862929582595825, + "learning_rate": 9.371756424608875e-05, + "loss": 1.9185, + "step": 6076 + }, + { + "epoch": 1.865254757519951, + "grad_norm": 0.4763035476207733, + "learning_rate": 9.371515184942719e-05, + "loss": 1.9696, + "step": 6077 + }, + { + "epoch": 1.865561694290976, + "grad_norm": 0.3552228808403015, + "learning_rate": 9.371273902074689e-05, + "loss": 1.9101, + "step": 6078 + }, + { + "epoch": 1.8658686310620012, + "grad_norm": 0.46329566836357117, + "learning_rate": 9.371032576007168e-05, + "loss": 1.8807, + "step": 6079 + }, + { + "epoch": 1.8661755678330265, + "grad_norm": 0.5176550149917603, + "learning_rate": 9.370791206742541e-05, + "loss": 1.9044, + "step": 6080 + }, + { + "epoch": 1.8664825046040514, + "grad_norm": 0.3929184675216675, + "learning_rate": 9.370549794283194e-05, + "loss": 1.8858, + "step": 6081 + }, + { + "epoch": 1.8667894413750767, + "grad_norm": 0.35135987401008606, + "learning_rate": 9.370308338631511e-05, + "loss": 1.8518, + "step": 6082 + }, + { + "epoch": 1.8670963781461019, + "grad_norm": 0.4229072034358978, + "learning_rate": 9.370066839789881e-05, + "loss": 1.891, + "step": 6083 + }, + { + "epoch": 1.867403314917127, + "grad_norm": 0.4862394630908966, + "learning_rate": 9.369825297760688e-05, + "loss": 1.9058, + "step": 6084 + }, + { + "epoch": 1.8677102516881523, + "grad_norm": 0.4775281548500061, + "learning_rate": 9.369583712546322e-05, + "loss": 1.9738, + "step": 6085 + }, + { + "epoch": 1.8680171884591774, + "grad_norm": 0.3831046521663666, + "learning_rate": 9.369342084149166e-05, + "loss": 1.9516, + "step": 6086 + }, + { + "epoch": 1.8683241252302025, + "grad_norm": 0.3970867395401001, + "learning_rate": 9.369100412571612e-05, + "loss": 2.0158, + "step": 6087 + }, + { + "epoch": 1.8686310620012279, + "grad_norm": 0.41662725806236267, + "learning_rate": 9.368858697816047e-05, + "loss": 
1.86, + "step": 6088 + }, + { + "epoch": 1.8689379987722528, + "grad_norm": 0.44235244393348694, + "learning_rate": 9.36861693988486e-05, + "loss": 1.9257, + "step": 6089 + }, + { + "epoch": 1.869244935543278, + "grad_norm": 0.37863966822624207, + "learning_rate": 9.36837513878044e-05, + "loss": 1.8877, + "step": 6090 + }, + { + "epoch": 1.8695518723143032, + "grad_norm": 0.44757044315338135, + "learning_rate": 9.368133294505175e-05, + "loss": 1.8962, + "step": 6091 + }, + { + "epoch": 1.8698588090853283, + "grad_norm": 0.5299558639526367, + "learning_rate": 9.367891407061458e-05, + "loss": 1.8655, + "step": 6092 + }, + { + "epoch": 1.8701657458563536, + "grad_norm": 0.4899531900882721, + "learning_rate": 9.367649476451678e-05, + "loss": 1.8933, + "step": 6093 + }, + { + "epoch": 1.8704726826273788, + "grad_norm": 0.3883507251739502, + "learning_rate": 9.367407502678224e-05, + "loss": 1.88, + "step": 6094 + }, + { + "epoch": 1.8707796193984039, + "grad_norm": 0.40936750173568726, + "learning_rate": 9.367165485743493e-05, + "loss": 1.8926, + "step": 6095 + }, + { + "epoch": 1.8710865561694292, + "grad_norm": 0.5708447098731995, + "learning_rate": 9.36692342564987e-05, + "loss": 1.9701, + "step": 6096 + }, + { + "epoch": 1.8713934929404543, + "grad_norm": 0.5559602379798889, + "learning_rate": 9.366681322399751e-05, + "loss": 1.8962, + "step": 6097 + }, + { + "epoch": 1.8717004297114794, + "grad_norm": 0.45344826579093933, + "learning_rate": 9.366439175995528e-05, + "loss": 1.9766, + "step": 6098 + }, + { + "epoch": 1.8720073664825048, + "grad_norm": 0.4887133538722992, + "learning_rate": 9.366196986439592e-05, + "loss": 1.8982, + "step": 6099 + }, + { + "epoch": 1.8723143032535297, + "grad_norm": 0.536568284034729, + "learning_rate": 9.365954753734339e-05, + "loss": 1.9506, + "step": 6100 + }, + { + "epoch": 1.872621240024555, + "grad_norm": 0.4792746901512146, + "learning_rate": 9.365712477882162e-05, + "loss": 1.9392, + "step": 6101 + }, + { + "epoch": 
1.87292817679558, + "grad_norm": 0.39836910367012024, + "learning_rate": 9.365470158885458e-05, + "loss": 1.8812, + "step": 6102 + }, + { + "epoch": 1.8732351135666052, + "grad_norm": 0.4263121783733368, + "learning_rate": 9.365227796746617e-05, + "loss": 1.8326, + "step": 6103 + }, + { + "epoch": 1.8735420503376305, + "grad_norm": 0.4158315360546112, + "learning_rate": 9.364985391468038e-05, + "loss": 1.8857, + "step": 6104 + }, + { + "epoch": 1.8738489871086557, + "grad_norm": 0.4384559094905853, + "learning_rate": 9.364742943052112e-05, + "loss": 1.9247, + "step": 6105 + }, + { + "epoch": 1.8741559238796808, + "grad_norm": 0.34221649169921875, + "learning_rate": 9.364500451501242e-05, + "loss": 1.8869, + "step": 6106 + }, + { + "epoch": 1.874462860650706, + "grad_norm": 0.38786688446998596, + "learning_rate": 9.364257916817817e-05, + "loss": 1.8879, + "step": 6107 + }, + { + "epoch": 1.874769797421731, + "grad_norm": 0.39408090710639954, + "learning_rate": 9.364015339004239e-05, + "loss": 1.8832, + "step": 6108 + }, + { + "epoch": 1.8750767341927563, + "grad_norm": 0.33985385298728943, + "learning_rate": 9.363772718062902e-05, + "loss": 1.8823, + "step": 6109 + }, + { + "epoch": 1.8753836709637814, + "grad_norm": 0.35319194197654724, + "learning_rate": 9.363530053996206e-05, + "loss": 1.9205, + "step": 6110 + }, + { + "epoch": 1.8756906077348066, + "grad_norm": 0.3455435335636139, + "learning_rate": 9.36328734680655e-05, + "loss": 1.9028, + "step": 6111 + }, + { + "epoch": 1.875997544505832, + "grad_norm": 0.3689115643501282, + "learning_rate": 9.363044596496329e-05, + "loss": 1.8996, + "step": 6112 + }, + { + "epoch": 1.876304481276857, + "grad_norm": 0.35776960849761963, + "learning_rate": 9.362801803067945e-05, + "loss": 1.9563, + "step": 6113 + }, + { + "epoch": 1.8766114180478821, + "grad_norm": 0.3524370491504669, + "learning_rate": 9.362558966523797e-05, + "loss": 1.9016, + "step": 6114 + }, + { + "epoch": 1.8769183548189075, + "grad_norm": 
0.3725074529647827, + "learning_rate": 9.362316086866283e-05, + "loss": 1.9467, + "step": 6115 + }, + { + "epoch": 1.8772252915899323, + "grad_norm": 0.390055775642395, + "learning_rate": 9.362073164097807e-05, + "loss": 1.9326, + "step": 6116 + }, + { + "epoch": 1.8775322283609577, + "grad_norm": 0.39119964838027954, + "learning_rate": 9.361830198220764e-05, + "loss": 1.8723, + "step": 6117 + }, + { + "epoch": 1.8778391651319828, + "grad_norm": 0.3659103512763977, + "learning_rate": 9.36158718923756e-05, + "loss": 1.835, + "step": 6118 + }, + { + "epoch": 1.878146101903008, + "grad_norm": 0.3360283076763153, + "learning_rate": 9.361344137150597e-05, + "loss": 1.8622, + "step": 6119 + }, + { + "epoch": 1.8784530386740332, + "grad_norm": 0.35440295934677124, + "learning_rate": 9.361101041962272e-05, + "loss": 1.8523, + "step": 6120 + }, + { + "epoch": 1.8787599754450584, + "grad_norm": 1.2606174945831299, + "learning_rate": 9.36085790367499e-05, + "loss": 1.9826, + "step": 6121 + }, + { + "epoch": 1.8790669122160835, + "grad_norm": 0.49294769763946533, + "learning_rate": 9.360614722291157e-05, + "loss": 1.8478, + "step": 6122 + }, + { + "epoch": 1.8793738489871088, + "grad_norm": 0.5642881393432617, + "learning_rate": 9.360371497813172e-05, + "loss": 1.883, + "step": 6123 + }, + { + "epoch": 1.8796807857581337, + "grad_norm": 0.5257276296615601, + "learning_rate": 9.36012823024344e-05, + "loss": 1.8577, + "step": 6124 + }, + { + "epoch": 1.879987722529159, + "grad_norm": 0.36913231015205383, + "learning_rate": 9.359884919584366e-05, + "loss": 1.8934, + "step": 6125 + }, + { + "epoch": 1.8802946593001841, + "grad_norm": 0.43373262882232666, + "learning_rate": 9.359641565838353e-05, + "loss": 1.8354, + "step": 6126 + }, + { + "epoch": 1.8806015960712092, + "grad_norm": 0.5280462503433228, + "learning_rate": 9.359398169007807e-05, + "loss": 1.9446, + "step": 6127 + }, + { + "epoch": 1.8809085328422346, + "grad_norm": 0.4991915225982666, + "learning_rate": 
9.359154729095135e-05, + "loss": 1.9003, + "step": 6128 + }, + { + "epoch": 1.8812154696132597, + "grad_norm": 0.3766331374645233, + "learning_rate": 9.358911246102738e-05, + "loss": 1.9149, + "step": 6129 + }, + { + "epoch": 1.8815224063842848, + "grad_norm": 0.39050692319869995, + "learning_rate": 9.358667720033026e-05, + "loss": 1.8945, + "step": 6130 + }, + { + "epoch": 1.8818293431553101, + "grad_norm": 0.47633904218673706, + "learning_rate": 9.358424150888405e-05, + "loss": 1.8772, + "step": 6131 + }, + { + "epoch": 1.882136279926335, + "grad_norm": 0.46322503685951233, + "learning_rate": 9.358180538671283e-05, + "loss": 1.893, + "step": 6132 + }, + { + "epoch": 1.8824432166973604, + "grad_norm": 0.39437612891197205, + "learning_rate": 9.357936883384066e-05, + "loss": 1.9394, + "step": 6133 + }, + { + "epoch": 1.8827501534683855, + "grad_norm": 0.4534996747970581, + "learning_rate": 9.357693185029162e-05, + "loss": 1.9689, + "step": 6134 + }, + { + "epoch": 1.8830570902394106, + "grad_norm": 0.4408230483531952, + "learning_rate": 9.35744944360898e-05, + "loss": 1.876, + "step": 6135 + }, + { + "epoch": 1.883364027010436, + "grad_norm": 0.5688899755477905, + "learning_rate": 9.35720565912593e-05, + "loss": 2.0153, + "step": 6136 + }, + { + "epoch": 1.883670963781461, + "grad_norm": 0.5005510449409485, + "learning_rate": 9.356961831582418e-05, + "loss": 1.9454, + "step": 6137 + }, + { + "epoch": 1.8839779005524862, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.356717960980856e-05, + "loss": 1.9153, + "step": 6138 + }, + { + "epoch": 1.8842848373235115, + "grad_norm": 0.49053385853767395, + "learning_rate": 9.356474047323653e-05, + "loss": 1.9734, + "step": 6139 + }, + { + "epoch": 1.8845917740945364, + "grad_norm": 0.4828382432460785, + "learning_rate": 9.35623009061322e-05, + "loss": 1.8946, + "step": 6140 + }, + { + "epoch": 1.8848987108655617, + "grad_norm": 0.4389181137084961, + "learning_rate": 9.35598609085197e-05, + "loss": 1.9491, + "step": 
6141 + }, + { + "epoch": 1.8852056476365868, + "grad_norm": 0.4010564982891083, + "learning_rate": 9.35574204804231e-05, + "loss": 1.8786, + "step": 6142 + }, + { + "epoch": 1.885512584407612, + "grad_norm": 0.4038756787776947, + "learning_rate": 9.355497962186657e-05, + "loss": 1.907, + "step": 6143 + }, + { + "epoch": 1.8858195211786373, + "grad_norm": 0.5030881762504578, + "learning_rate": 9.355253833287418e-05, + "loss": 1.8438, + "step": 6144 + }, + { + "epoch": 1.8861264579496624, + "grad_norm": 0.42690956592559814, + "learning_rate": 9.355009661347007e-05, + "loss": 1.8254, + "step": 6145 + }, + { + "epoch": 1.8864333947206875, + "grad_norm": 0.37733983993530273, + "learning_rate": 9.35476544636784e-05, + "loss": 1.9035, + "step": 6146 + }, + { + "epoch": 1.8867403314917128, + "grad_norm": 0.36874648928642273, + "learning_rate": 9.354521188352327e-05, + "loss": 1.885, + "step": 6147 + }, + { + "epoch": 1.8870472682627377, + "grad_norm": 0.36208659410476685, + "learning_rate": 9.354276887302885e-05, + "loss": 1.9416, + "step": 6148 + }, + { + "epoch": 1.887354205033763, + "grad_norm": 0.3952158987522125, + "learning_rate": 9.354032543221926e-05, + "loss": 1.9073, + "step": 6149 + }, + { + "epoch": 1.8876611418047882, + "grad_norm": 0.3603280782699585, + "learning_rate": 9.353788156111864e-05, + "loss": 1.9204, + "step": 6150 + }, + { + "epoch": 1.8879680785758133, + "grad_norm": 0.4325824975967407, + "learning_rate": 9.353543725975118e-05, + "loss": 1.9345, + "step": 6151 + }, + { + "epoch": 1.8882750153468386, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.3532992528141e-05, + "loss": 1.9783, + "step": 6152 + }, + { + "epoch": 1.8885819521178637, + "grad_norm": 0.42317959666252136, + "learning_rate": 9.353054736631228e-05, + "loss": 1.9252, + "step": 6153 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42653194069862366, + "learning_rate": 9.352810177428917e-05, + "loss": 1.9875, + "step": 6154 + }, + { + "epoch": 1.8891958256599142, + 
"grad_norm": 0.49819129705429077, + "learning_rate": 9.352565575209584e-05, + "loss": 1.9334, + "step": 6155 + }, + { + "epoch": 1.889502762430939, + "grad_norm": 0.4481790065765381, + "learning_rate": 9.352320929975646e-05, + "loss": 1.8939, + "step": 6156 + }, + { + "epoch": 1.8898096992019644, + "grad_norm": 0.41602686047554016, + "learning_rate": 9.352076241729524e-05, + "loss": 1.9207, + "step": 6157 + }, + { + "epoch": 1.8901166359729895, + "grad_norm": 0.4516759216785431, + "learning_rate": 9.351831510473633e-05, + "loss": 1.9384, + "step": 6158 + }, + { + "epoch": 1.8904235727440146, + "grad_norm": 0.5030251741409302, + "learning_rate": 9.351586736210391e-05, + "loss": 1.9787, + "step": 6159 + }, + { + "epoch": 1.89073050951504, + "grad_norm": 0.37176215648651123, + "learning_rate": 9.35134191894222e-05, + "loss": 1.8826, + "step": 6160 + }, + { + "epoch": 1.891037446286065, + "grad_norm": 0.3850235939025879, + "learning_rate": 9.351097058671537e-05, + "loss": 1.8689, + "step": 6161 + }, + { + "epoch": 1.8913443830570902, + "grad_norm": 0.3740260601043701, + "learning_rate": 9.350852155400764e-05, + "loss": 1.8624, + "step": 6162 + }, + { + "epoch": 1.8916513198281155, + "grad_norm": 0.386124849319458, + "learning_rate": 9.350607209132318e-05, + "loss": 1.8506, + "step": 6163 + }, + { + "epoch": 1.8919582565991406, + "grad_norm": 0.3743472993373871, + "learning_rate": 9.350362219868623e-05, + "loss": 1.9499, + "step": 6164 + }, + { + "epoch": 1.8922651933701657, + "grad_norm": 0.4257555603981018, + "learning_rate": 9.350117187612097e-05, + "loss": 1.9407, + "step": 6165 + }, + { + "epoch": 1.892572130141191, + "grad_norm": 0.37218552827835083, + "learning_rate": 9.349872112365163e-05, + "loss": 1.8772, + "step": 6166 + }, + { + "epoch": 1.892879066912216, + "grad_norm": 0.3443894386291504, + "learning_rate": 9.349626994130245e-05, + "loss": 1.8846, + "step": 6167 + }, + { + "epoch": 1.8931860036832413, + "grad_norm": 0.33507248759269714, + "learning_rate": 
9.349381832909763e-05, + "loss": 1.9303, + "step": 6168 + }, + { + "epoch": 1.8934929404542664, + "grad_norm": 0.3844592869281769, + "learning_rate": 9.349136628706141e-05, + "loss": 1.9453, + "step": 6169 + }, + { + "epoch": 1.8937998772252915, + "grad_norm": 0.35765793919563293, + "learning_rate": 9.348891381521802e-05, + "loss": 1.8745, + "step": 6170 + }, + { + "epoch": 1.8941068139963169, + "grad_norm": 0.3732185661792755, + "learning_rate": 9.348646091359168e-05, + "loss": 1.9318, + "step": 6171 + }, + { + "epoch": 1.894413750767342, + "grad_norm": 0.3704257607460022, + "learning_rate": 9.348400758220666e-05, + "loss": 1.9285, + "step": 6172 + }, + { + "epoch": 1.894720687538367, + "grad_norm": 0.32159942388534546, + "learning_rate": 9.348155382108717e-05, + "loss": 1.8368, + "step": 6173 + }, + { + "epoch": 1.8950276243093924, + "grad_norm": 0.32755646109580994, + "learning_rate": 9.34790996302575e-05, + "loss": 1.8975, + "step": 6174 + }, + { + "epoch": 1.8953345610804173, + "grad_norm": 0.38797906041145325, + "learning_rate": 9.347664500974186e-05, + "loss": 1.9684, + "step": 6175 + }, + { + "epoch": 1.8956414978514426, + "grad_norm": 0.3870599865913391, + "learning_rate": 9.347418995956456e-05, + "loss": 1.963, + "step": 6176 + }, + { + "epoch": 1.8959484346224678, + "grad_norm": 0.35739025473594666, + "learning_rate": 9.347173447974982e-05, + "loss": 1.8912, + "step": 6177 + }, + { + "epoch": 1.8962553713934929, + "grad_norm": 0.3525852859020233, + "learning_rate": 9.346927857032193e-05, + "loss": 1.8455, + "step": 6178 + }, + { + "epoch": 1.8965623081645182, + "grad_norm": 0.39735934138298035, + "learning_rate": 9.346682223130514e-05, + "loss": 1.8824, + "step": 6179 + }, + { + "epoch": 1.8968692449355433, + "grad_norm": 0.3677692413330078, + "learning_rate": 9.346436546272373e-05, + "loss": 1.8723, + "step": 6180 + }, + { + "epoch": 1.8971761817065684, + "grad_norm": 0.3660476505756378, + "learning_rate": 9.346190826460199e-05, + "loss": 1.9674, + 
"step": 6181 + }, + { + "epoch": 1.8974831184775938, + "grad_norm": 0.4416230022907257, + "learning_rate": 9.34594506369642e-05, + "loss": 1.9309, + "step": 6182 + }, + { + "epoch": 1.8977900552486187, + "grad_norm": 0.39761826395988464, + "learning_rate": 9.345699257983466e-05, + "loss": 1.9408, + "step": 6183 + }, + { + "epoch": 1.898096992019644, + "grad_norm": 0.44419440627098083, + "learning_rate": 9.345453409323763e-05, + "loss": 2.0013, + "step": 6184 + }, + { + "epoch": 1.898403928790669, + "grad_norm": 0.4173676371574402, + "learning_rate": 9.345207517719743e-05, + "loss": 1.8462, + "step": 6185 + }, + { + "epoch": 1.8987108655616942, + "grad_norm": 0.39312002062797546, + "learning_rate": 9.344961583173837e-05, + "loss": 1.8716, + "step": 6186 + }, + { + "epoch": 1.8990178023327196, + "grad_norm": 0.389996737241745, + "learning_rate": 9.344715605688472e-05, + "loss": 1.9331, + "step": 6187 + }, + { + "epoch": 1.8993247391037447, + "grad_norm": 0.4575251340866089, + "learning_rate": 9.34446958526608e-05, + "loss": 1.9408, + "step": 6188 + }, + { + "epoch": 1.8996316758747698, + "grad_norm": 0.425075888633728, + "learning_rate": 9.344223521909097e-05, + "loss": 1.8632, + "step": 6189 + }, + { + "epoch": 1.899938612645795, + "grad_norm": 0.3622394800186157, + "learning_rate": 9.343977415619948e-05, + "loss": 1.8671, + "step": 6190 + }, + { + "epoch": 1.90024554941682, + "grad_norm": 0.38955047726631165, + "learning_rate": 9.343731266401068e-05, + "loss": 1.8955, + "step": 6191 + }, + { + "epoch": 1.9005524861878453, + "grad_norm": 0.40853381156921387, + "learning_rate": 9.34348507425489e-05, + "loss": 1.8477, + "step": 6192 + }, + { + "epoch": 1.9008594229588704, + "grad_norm": 0.36416095495224, + "learning_rate": 9.343238839183848e-05, + "loss": 1.8596, + "step": 6193 + }, + { + "epoch": 1.9011663597298956, + "grad_norm": 0.3371017277240753, + "learning_rate": 9.342992561190374e-05, + "loss": 1.9646, + "step": 6194 + }, + { + "epoch": 1.901473296500921, + 
"grad_norm": 0.3605191111564636, + "learning_rate": 9.3427462402769e-05, + "loss": 1.9165, + "step": 6195 + }, + { + "epoch": 1.901780233271946, + "grad_norm": 0.32952287793159485, + "learning_rate": 9.342499876445863e-05, + "loss": 1.8827, + "step": 6196 + }, + { + "epoch": 1.9020871700429711, + "grad_norm": 0.3627411425113678, + "learning_rate": 9.342253469699698e-05, + "loss": 1.9058, + "step": 6197 + }, + { + "epoch": 1.9023941068139965, + "grad_norm": 0.3830505311489105, + "learning_rate": 9.342007020040839e-05, + "loss": 1.89, + "step": 6198 + }, + { + "epoch": 1.9027010435850213, + "grad_norm": 0.36550065875053406, + "learning_rate": 9.341760527471722e-05, + "loss": 1.9004, + "step": 6199 + }, + { + "epoch": 1.9030079803560467, + "grad_norm": 0.4098506569862366, + "learning_rate": 9.341513991994782e-05, + "loss": 1.8656, + "step": 6200 + }, + { + "epoch": 1.9033149171270718, + "grad_norm": 0.5218825340270996, + "learning_rate": 9.341267413612456e-05, + "loss": 1.9179, + "step": 6201 + }, + { + "epoch": 1.903621853898097, + "grad_norm": 0.6201978921890259, + "learning_rate": 9.34102079232718e-05, + "loss": 1.9485, + "step": 6202 + }, + { + "epoch": 1.9039287906691222, + "grad_norm": 0.597594141960144, + "learning_rate": 9.340774128141395e-05, + "loss": 1.9074, + "step": 6203 + }, + { + "epoch": 1.9042357274401474, + "grad_norm": 0.477268248796463, + "learning_rate": 9.340527421057533e-05, + "loss": 1.9202, + "step": 6204 + }, + { + "epoch": 1.9045426642111725, + "grad_norm": 0.39805278182029724, + "learning_rate": 9.340280671078035e-05, + "loss": 1.8801, + "step": 6205 + }, + { + "epoch": 1.9048496009821978, + "grad_norm": 0.5815454721450806, + "learning_rate": 9.340033878205342e-05, + "loss": 1.8564, + "step": 6206 + }, + { + "epoch": 1.9051565377532227, + "grad_norm": 0.6385661363601685, + "learning_rate": 9.339787042441888e-05, + "loss": 1.8992, + "step": 6207 + }, + { + "epoch": 1.905463474524248, + "grad_norm": 0.5905124545097351, + "learning_rate": 
9.339540163790116e-05, + "loss": 1.9608, + "step": 6208 + }, + { + "epoch": 1.9057704112952731, + "grad_norm": 0.37329113483428955, + "learning_rate": 9.339293242252465e-05, + "loss": 1.9037, + "step": 6209 + }, + { + "epoch": 1.9060773480662982, + "grad_norm": 0.4568968117237091, + "learning_rate": 9.339046277831374e-05, + "loss": 1.8719, + "step": 6210 + }, + { + "epoch": 1.9063842848373236, + "grad_norm": 0.43003782629966736, + "learning_rate": 9.338799270529284e-05, + "loss": 1.8594, + "step": 6211 + }, + { + "epoch": 1.9066912216083487, + "grad_norm": 0.3795240819454193, + "learning_rate": 9.338552220348637e-05, + "loss": 1.8645, + "step": 6212 + }, + { + "epoch": 1.9069981583793738, + "grad_norm": 0.3791581392288208, + "learning_rate": 9.338305127291876e-05, + "loss": 1.9076, + "step": 6213 + }, + { + "epoch": 1.9073050951503991, + "grad_norm": 0.3747733533382416, + "learning_rate": 9.338057991361438e-05, + "loss": 1.8665, + "step": 6214 + }, + { + "epoch": 1.907612031921424, + "grad_norm": 0.3994114100933075, + "learning_rate": 9.337810812559771e-05, + "loss": 1.9202, + "step": 6215 + }, + { + "epoch": 1.9079189686924494, + "grad_norm": 0.3808605670928955, + "learning_rate": 9.337563590889312e-05, + "loss": 1.9272, + "step": 6216 + }, + { + "epoch": 1.9082259054634745, + "grad_norm": 0.3461966812610626, + "learning_rate": 9.33731632635251e-05, + "loss": 1.8621, + "step": 6217 + }, + { + "epoch": 1.9085328422344996, + "grad_norm": 0.37272316217422485, + "learning_rate": 9.337069018951805e-05, + "loss": 1.8996, + "step": 6218 + }, + { + "epoch": 1.908839779005525, + "grad_norm": 0.40319329500198364, + "learning_rate": 9.336821668689642e-05, + "loss": 1.8852, + "step": 6219 + }, + { + "epoch": 1.90914671577655, + "grad_norm": 0.4059053659439087, + "learning_rate": 9.336574275568463e-05, + "loss": 1.9156, + "step": 6220 + }, + { + "epoch": 1.9094536525475752, + "grad_norm": 0.41244640946388245, + "learning_rate": 9.336326839590719e-05, + "loss": 1.9858, + 
"step": 6221 + }, + { + "epoch": 1.9097605893186005, + "grad_norm": 0.38230007886886597, + "learning_rate": 9.336079360758849e-05, + "loss": 1.8756, + "step": 6222 + }, + { + "epoch": 1.9100675260896254, + "grad_norm": 0.3620646297931671, + "learning_rate": 9.335831839075304e-05, + "loss": 1.9305, + "step": 6223 + }, + { + "epoch": 1.9103744628606507, + "grad_norm": 0.3700193166732788, + "learning_rate": 9.335584274542525e-05, + "loss": 1.8544, + "step": 6224 + }, + { + "epoch": 1.9106813996316758, + "grad_norm": 0.36827734112739563, + "learning_rate": 9.335336667162962e-05, + "loss": 1.8658, + "step": 6225 + }, + { + "epoch": 1.910988336402701, + "grad_norm": 0.33878061175346375, + "learning_rate": 9.33508901693906e-05, + "loss": 1.8638, + "step": 6226 + }, + { + "epoch": 1.9112952731737263, + "grad_norm": 0.3522186577320099, + "learning_rate": 9.334841323873269e-05, + "loss": 1.9109, + "step": 6227 + }, + { + "epoch": 1.9116022099447514, + "grad_norm": 0.3552776277065277, + "learning_rate": 9.334593587968035e-05, + "loss": 1.8499, + "step": 6228 + }, + { + "epoch": 1.9119091467157765, + "grad_norm": 0.3232300877571106, + "learning_rate": 9.334345809225805e-05, + "loss": 1.9078, + "step": 6229 + }, + { + "epoch": 1.9122160834868018, + "grad_norm": 0.3500599265098572, + "learning_rate": 9.33409798764903e-05, + "loss": 1.8953, + "step": 6230 + }, + { + "epoch": 1.9125230202578267, + "grad_norm": 0.4011479914188385, + "learning_rate": 9.333850123240159e-05, + "loss": 1.8961, + "step": 6231 + }, + { + "epoch": 1.912829957028852, + "grad_norm": 0.419539213180542, + "learning_rate": 9.333602216001642e-05, + "loss": 1.9381, + "step": 6232 + }, + { + "epoch": 1.9131368937998774, + "grad_norm": 0.364956259727478, + "learning_rate": 9.333354265935926e-05, + "loss": 1.8495, + "step": 6233 + }, + { + "epoch": 1.9134438305709023, + "grad_norm": 0.3322601318359375, + "learning_rate": 9.333106273045464e-05, + "loss": 1.8389, + "step": 6234 + }, + { + "epoch": 1.9137507673419276, 
+ "grad_norm": 0.3706522583961487, + "learning_rate": 9.332858237332705e-05, + "loss": 1.904, + "step": 6235 + }, + { + "epoch": 1.9140577041129527, + "grad_norm": 0.3900963366031647, + "learning_rate": 9.332610158800104e-05, + "loss": 1.8974, + "step": 6236 + }, + { + "epoch": 1.9143646408839778, + "grad_norm": 0.3308334946632385, + "learning_rate": 9.332362037450108e-05, + "loss": 1.959, + "step": 6237 + }, + { + "epoch": 1.9146715776550032, + "grad_norm": 0.37876754999160767, + "learning_rate": 9.332113873285171e-05, + "loss": 1.9187, + "step": 6238 + }, + { + "epoch": 1.9149785144260283, + "grad_norm": 0.3557550609111786, + "learning_rate": 9.331865666307746e-05, + "loss": 1.9351, + "step": 6239 + }, + { + "epoch": 1.9152854511970534, + "grad_norm": 0.3792133927345276, + "learning_rate": 9.331617416520285e-05, + "loss": 1.8488, + "step": 6240 + }, + { + "epoch": 1.9155923879680787, + "grad_norm": 0.40517017245292664, + "learning_rate": 9.331369123925242e-05, + "loss": 1.9311, + "step": 6241 + }, + { + "epoch": 1.9158993247391036, + "grad_norm": 0.34011030197143555, + "learning_rate": 9.331120788525072e-05, + "loss": 1.8606, + "step": 6242 + }, + { + "epoch": 1.916206261510129, + "grad_norm": 0.39949584007263184, + "learning_rate": 9.330872410322227e-05, + "loss": 1.9156, + "step": 6243 + }, + { + "epoch": 1.916513198281154, + "grad_norm": 0.3771394193172455, + "learning_rate": 9.330623989319162e-05, + "loss": 1.8448, + "step": 6244 + }, + { + "epoch": 1.9168201350521792, + "grad_norm": 0.32114169001579285, + "learning_rate": 9.330375525518333e-05, + "loss": 1.8681, + "step": 6245 + }, + { + "epoch": 1.9171270718232045, + "grad_norm": 0.3438408672809601, + "learning_rate": 9.330127018922194e-05, + "loss": 1.8582, + "step": 6246 + }, + { + "epoch": 1.9174340085942296, + "grad_norm": 0.35971906781196594, + "learning_rate": 9.329878469533201e-05, + "loss": 1.9026, + "step": 6247 + }, + { + "epoch": 1.9177409453652547, + "grad_norm": 0.3953855633735657, + 
"learning_rate": 9.329629877353813e-05, + "loss": 1.8837, + "step": 6248 + }, + { + "epoch": 1.91804788213628, + "grad_norm": 0.36541905999183655, + "learning_rate": 9.329381242386485e-05, + "loss": 1.9156, + "step": 6249 + }, + { + "epoch": 1.918354818907305, + "grad_norm": 0.3577594459056854, + "learning_rate": 9.329132564633673e-05, + "loss": 1.8791, + "step": 6250 + }, + { + "epoch": 1.9186617556783303, + "grad_norm": 0.3869122564792633, + "learning_rate": 9.328883844097837e-05, + "loss": 1.9048, + "step": 6251 + }, + { + "epoch": 1.9189686924493554, + "grad_norm": 0.35097724199295044, + "learning_rate": 9.328635080781433e-05, + "loss": 1.9602, + "step": 6252 + }, + { + "epoch": 1.9192756292203805, + "grad_norm": 0.3813062012195587, + "learning_rate": 9.328386274686919e-05, + "loss": 1.9133, + "step": 6253 + }, + { + "epoch": 1.9195825659914059, + "grad_norm": 0.3950280249118805, + "learning_rate": 9.328137425816756e-05, + "loss": 1.9462, + "step": 6254 + }, + { + "epoch": 1.919889502762431, + "grad_norm": 0.41710540652275085, + "learning_rate": 9.327888534173402e-05, + "loss": 1.8616, + "step": 6255 + }, + { + "epoch": 1.920196439533456, + "grad_norm": 0.39998626708984375, + "learning_rate": 9.327639599759318e-05, + "loss": 1.8758, + "step": 6256 + }, + { + "epoch": 1.9205033763044814, + "grad_norm": 0.35425302386283875, + "learning_rate": 9.32739062257696e-05, + "loss": 1.8896, + "step": 6257 + }, + { + "epoch": 1.9208103130755063, + "grad_norm": 0.3487682640552521, + "learning_rate": 9.327141602628793e-05, + "loss": 1.8901, + "step": 6258 + }, + { + "epoch": 1.9211172498465316, + "grad_norm": 0.38767126202583313, + "learning_rate": 9.326892539917277e-05, + "loss": 1.9264, + "step": 6259 + }, + { + "epoch": 1.9214241866175568, + "grad_norm": 0.4265333116054535, + "learning_rate": 9.326643434444872e-05, + "loss": 1.9282, + "step": 6260 + }, + { + "epoch": 1.9217311233885819, + "grad_norm": 0.3386894166469574, + "learning_rate": 9.326394286214042e-05, + "loss": 
1.8167, + "step": 6261 + }, + { + "epoch": 1.9220380601596072, + "grad_norm": 0.3594066798686981, + "learning_rate": 9.326145095227246e-05, + "loss": 1.9293, + "step": 6262 + }, + { + "epoch": 1.9223449969306323, + "grad_norm": 0.4041733741760254, + "learning_rate": 9.32589586148695e-05, + "loss": 2.0066, + "step": 6263 + }, + { + "epoch": 1.9226519337016574, + "grad_norm": 0.45588794350624084, + "learning_rate": 9.325646584995615e-05, + "loss": 1.9485, + "step": 6264 + }, + { + "epoch": 1.9229588704726828, + "grad_norm": 0.42583590745925903, + "learning_rate": 9.325397265755705e-05, + "loss": 1.8973, + "step": 6265 + }, + { + "epoch": 1.9232658072437077, + "grad_norm": 0.38701504468917847, + "learning_rate": 9.325147903769684e-05, + "loss": 1.9624, + "step": 6266 + }, + { + "epoch": 1.923572744014733, + "grad_norm": 0.4298608899116516, + "learning_rate": 9.324898499040017e-05, + "loss": 1.9033, + "step": 6267 + }, + { + "epoch": 1.923879680785758, + "grad_norm": 0.3692619800567627, + "learning_rate": 9.324649051569167e-05, + "loss": 1.973, + "step": 6268 + }, + { + "epoch": 1.9241866175567832, + "grad_norm": 0.40625011920928955, + "learning_rate": 9.324399561359602e-05, + "loss": 1.8629, + "step": 6269 + }, + { + "epoch": 1.9244935543278086, + "grad_norm": 0.43613263964653015, + "learning_rate": 9.324150028413784e-05, + "loss": 1.8928, + "step": 6270 + }, + { + "epoch": 1.9248004910988337, + "grad_norm": 0.4670937657356262, + "learning_rate": 9.323900452734182e-05, + "loss": 1.8809, + "step": 6271 + }, + { + "epoch": 1.9251074278698588, + "grad_norm": 0.43263986706733704, + "learning_rate": 9.323650834323262e-05, + "loss": 1.891, + "step": 6272 + }, + { + "epoch": 1.9254143646408841, + "grad_norm": 0.4253878891468048, + "learning_rate": 9.32340117318349e-05, + "loss": 2.0064, + "step": 6273 + }, + { + "epoch": 1.925721301411909, + "grad_norm": 0.3742302358150482, + "learning_rate": 9.323151469317332e-05, + "loss": 1.9441, + "step": 6274 + }, + { + "epoch": 
1.9260282381829343, + "grad_norm": 0.37415632605552673, + "learning_rate": 9.32290172272726e-05, + "loss": 1.8901, + "step": 6275 + }, + { + "epoch": 1.9263351749539595, + "grad_norm": 0.402935266494751, + "learning_rate": 9.322651933415738e-05, + "loss": 1.9013, + "step": 6276 + }, + { + "epoch": 1.9266421117249846, + "grad_norm": 0.479819118976593, + "learning_rate": 9.322402101385235e-05, + "loss": 1.9713, + "step": 6277 + }, + { + "epoch": 1.92694904849601, + "grad_norm": 0.4472719430923462, + "learning_rate": 9.322152226638222e-05, + "loss": 1.9106, + "step": 6278 + }, + { + "epoch": 1.927255985267035, + "grad_norm": 0.36508920788764954, + "learning_rate": 9.321902309177168e-05, + "loss": 1.8999, + "step": 6279 + }, + { + "epoch": 1.9275629220380601, + "grad_norm": 0.38674476742744446, + "learning_rate": 9.321652349004542e-05, + "loss": 1.8653, + "step": 6280 + }, + { + "epoch": 1.9278698588090855, + "grad_norm": 0.3745587170124054, + "learning_rate": 9.321402346122814e-05, + "loss": 1.8764, + "step": 6281 + }, + { + "epoch": 1.9281767955801103, + "grad_norm": 0.37824445962905884, + "learning_rate": 9.321152300534454e-05, + "loss": 1.8712, + "step": 6282 + }, + { + "epoch": 1.9284837323511357, + "grad_norm": 0.3442685306072235, + "learning_rate": 9.320902212241936e-05, + "loss": 1.8242, + "step": 6283 + }, + { + "epoch": 1.9287906691221608, + "grad_norm": 0.3152186870574951, + "learning_rate": 9.32065208124773e-05, + "loss": 1.9282, + "step": 6284 + }, + { + "epoch": 1.929097605893186, + "grad_norm": 0.35380542278289795, + "learning_rate": 9.320401907554306e-05, + "loss": 1.8783, + "step": 6285 + }, + { + "epoch": 1.9294045426642112, + "grad_norm": 0.3140089511871338, + "learning_rate": 9.320151691164138e-05, + "loss": 1.9174, + "step": 6286 + }, + { + "epoch": 1.9297114794352364, + "grad_norm": 0.33666202425956726, + "learning_rate": 9.3199014320797e-05, + "loss": 1.8926, + "step": 6287 + }, + { + "epoch": 1.9300184162062615, + "grad_norm": 
0.3297472894191742, + "learning_rate": 9.319651130303465e-05, + "loss": 1.8763, + "step": 6288 + }, + { + "epoch": 1.9303253529772868, + "grad_norm": 0.3323235511779785, + "learning_rate": 9.319400785837906e-05, + "loss": 1.9088, + "step": 6289 + }, + { + "epoch": 1.9306322897483117, + "grad_norm": 0.32601413130760193, + "learning_rate": 9.319150398685494e-05, + "loss": 1.8672, + "step": 6290 + }, + { + "epoch": 1.930939226519337, + "grad_norm": 0.35310089588165283, + "learning_rate": 9.318899968848708e-05, + "loss": 1.9492, + "step": 6291 + }, + { + "epoch": 1.9312461632903621, + "grad_norm": 0.3718548119068146, + "learning_rate": 9.31864949633002e-05, + "loss": 1.8692, + "step": 6292 + }, + { + "epoch": 1.9315531000613873, + "grad_norm": 0.42382025718688965, + "learning_rate": 9.318398981131908e-05, + "loss": 1.9693, + "step": 6293 + }, + { + "epoch": 1.9318600368324126, + "grad_norm": 0.5123299360275269, + "learning_rate": 9.318148423256845e-05, + "loss": 2.0117, + "step": 6294 + }, + { + "epoch": 1.9321669736034377, + "grad_norm": 0.4483809769153595, + "learning_rate": 9.317897822707308e-05, + "loss": 1.9165, + "step": 6295 + }, + { + "epoch": 1.9324739103744628, + "grad_norm": 0.4385908544063568, + "learning_rate": 9.317647179485776e-05, + "loss": 1.8869, + "step": 6296 + }, + { + "epoch": 1.9327808471454881, + "grad_norm": 0.42863771319389343, + "learning_rate": 9.317396493594724e-05, + "loss": 1.9484, + "step": 6297 + }, + { + "epoch": 1.933087783916513, + "grad_norm": 0.4130534529685974, + "learning_rate": 9.317145765036627e-05, + "loss": 1.9201, + "step": 6298 + }, + { + "epoch": 1.9333947206875384, + "grad_norm": 0.39024612307548523, + "learning_rate": 9.316894993813965e-05, + "loss": 1.9674, + "step": 6299 + }, + { + "epoch": 1.9337016574585635, + "grad_norm": 0.41060271859169006, + "learning_rate": 9.316644179929219e-05, + "loss": 1.9529, + "step": 6300 + }, + { + "epoch": 1.9340085942295886, + "grad_norm": 0.4302372634410858, + "learning_rate": 
9.316393323384863e-05, + "loss": 1.8998, + "step": 6301 + }, + { + "epoch": 1.934315531000614, + "grad_norm": 0.3739410936832428, + "learning_rate": 9.316142424183379e-05, + "loss": 1.8812, + "step": 6302 + }, + { + "epoch": 1.934622467771639, + "grad_norm": 0.3965891897678375, + "learning_rate": 9.315891482327245e-05, + "loss": 1.8851, + "step": 6303 + }, + { + "epoch": 1.9349294045426642, + "grad_norm": 0.4486664831638336, + "learning_rate": 9.315640497818943e-05, + "loss": 1.9494, + "step": 6304 + }, + { + "epoch": 1.9352363413136895, + "grad_norm": 0.5530070662498474, + "learning_rate": 9.315389470660951e-05, + "loss": 1.9716, + "step": 6305 + }, + { + "epoch": 1.9355432780847146, + "grad_norm": 0.7142495512962341, + "learning_rate": 9.315138400855751e-05, + "loss": 1.947, + "step": 6306 + }, + { + "epoch": 1.9358502148557397, + "grad_norm": 0.7555594444274902, + "learning_rate": 9.314887288405827e-05, + "loss": 1.873, + "step": 6307 + }, + { + "epoch": 1.936157151626765, + "grad_norm": 0.6025232076644897, + "learning_rate": 9.314636133313654e-05, + "loss": 1.9189, + "step": 6308 + }, + { + "epoch": 1.93646408839779, + "grad_norm": 0.3686346113681793, + "learning_rate": 9.314384935581719e-05, + "loss": 1.8461, + "step": 6309 + }, + { + "epoch": 1.9367710251688153, + "grad_norm": 0.46265771985054016, + "learning_rate": 9.314133695212505e-05, + "loss": 1.8955, + "step": 6310 + }, + { + "epoch": 1.9370779619398404, + "grad_norm": 0.7023865580558777, + "learning_rate": 9.313882412208492e-05, + "loss": 1.9378, + "step": 6311 + }, + { + "epoch": 1.9373848987108655, + "grad_norm": 0.7163348197937012, + "learning_rate": 9.313631086572163e-05, + "loss": 1.9278, + "step": 6312 + }, + { + "epoch": 1.9376918354818908, + "grad_norm": 0.4772320091724396, + "learning_rate": 9.313379718306006e-05, + "loss": 1.9215, + "step": 6313 + }, + { + "epoch": 1.937998772252916, + "grad_norm": 0.4934171438217163, + "learning_rate": 9.313128307412501e-05, + "loss": 1.9725, + "step": 6314 
+ }, + { + "epoch": 1.938305709023941, + "grad_norm": 0.5988278985023499, + "learning_rate": 9.312876853894134e-05, + "loss": 1.9238, + "step": 6315 + }, + { + "epoch": 1.9386126457949664, + "grad_norm": 0.5819640159606934, + "learning_rate": 9.31262535775339e-05, + "loss": 1.9228, + "step": 6316 + }, + { + "epoch": 1.9389195825659913, + "grad_norm": 0.49525877833366394, + "learning_rate": 9.312373818992756e-05, + "loss": 1.8939, + "step": 6317 + }, + { + "epoch": 1.9392265193370166, + "grad_norm": 0.3778049647808075, + "learning_rate": 9.312122237614715e-05, + "loss": 1.8709, + "step": 6318 + }, + { + "epoch": 1.9395334561080417, + "grad_norm": 0.48716801404953003, + "learning_rate": 9.311870613621754e-05, + "loss": 1.9014, + "step": 6319 + }, + { + "epoch": 1.9398403928790668, + "grad_norm": 0.47298866510391235, + "learning_rate": 9.311618947016362e-05, + "loss": 1.8686, + "step": 6320 + }, + { + "epoch": 1.9401473296500922, + "grad_norm": 0.3709685206413269, + "learning_rate": 9.311367237801023e-05, + "loss": 1.9531, + "step": 6321 + }, + { + "epoch": 1.9404542664211173, + "grad_norm": 0.3898928761482239, + "learning_rate": 9.311115485978228e-05, + "loss": 1.8806, + "step": 6322 + }, + { + "epoch": 1.9407612031921424, + "grad_norm": 0.43091922998428345, + "learning_rate": 9.310863691550461e-05, + "loss": 1.9278, + "step": 6323 + }, + { + "epoch": 1.9410681399631677, + "grad_norm": 0.3788231909275055, + "learning_rate": 9.310611854520212e-05, + "loss": 1.893, + "step": 6324 + }, + { + "epoch": 1.9413750767341926, + "grad_norm": 0.4471469819545746, + "learning_rate": 9.310359974889972e-05, + "loss": 1.9706, + "step": 6325 + }, + { + "epoch": 1.941682013505218, + "grad_norm": 0.4047459661960602, + "learning_rate": 9.310108052662228e-05, + "loss": 1.8863, + "step": 6326 + }, + { + "epoch": 1.941988950276243, + "grad_norm": 0.4334566593170166, + "learning_rate": 9.309856087839468e-05, + "loss": 1.9543, + "step": 6327 + }, + { + "epoch": 1.9422958870472682, + 
"grad_norm": 0.3828316032886505, + "learning_rate": 9.309604080424185e-05, + "loss": 1.8601, + "step": 6328 + }, + { + "epoch": 1.9426028238182935, + "grad_norm": 0.3702560067176819, + "learning_rate": 9.30935203041887e-05, + "loss": 1.9055, + "step": 6329 + }, + { + "epoch": 1.9429097605893186, + "grad_norm": 0.4922797977924347, + "learning_rate": 9.309099937826011e-05, + "loss": 1.9589, + "step": 6330 + }, + { + "epoch": 1.9432166973603437, + "grad_norm": 0.4073271155357361, + "learning_rate": 9.308847802648102e-05, + "loss": 1.9727, + "step": 6331 + }, + { + "epoch": 1.943523634131369, + "grad_norm": 0.3833904266357422, + "learning_rate": 9.308595624887633e-05, + "loss": 1.8641, + "step": 6332 + }, + { + "epoch": 1.943830570902394, + "grad_norm": 0.44063761830329895, + "learning_rate": 9.308343404547095e-05, + "loss": 1.8996, + "step": 6333 + }, + { + "epoch": 1.9441375076734193, + "grad_norm": 0.4776977300643921, + "learning_rate": 9.308091141628983e-05, + "loss": 1.9353, + "step": 6334 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.39584699273109436, + "learning_rate": 9.307838836135792e-05, + "loss": 1.8521, + "step": 6335 + }, + { + "epoch": 1.9447513812154695, + "grad_norm": 0.3220890760421753, + "learning_rate": 9.30758648807001e-05, + "loss": 1.825, + "step": 6336 + }, + { + "epoch": 1.9450583179864949, + "grad_norm": 0.4301774501800537, + "learning_rate": 9.307334097434133e-05, + "loss": 1.9317, + "step": 6337 + }, + { + "epoch": 1.94536525475752, + "grad_norm": 0.439165323972702, + "learning_rate": 9.307081664230658e-05, + "loss": 1.8669, + "step": 6338 + }, + { + "epoch": 1.945672191528545, + "grad_norm": 0.4185279607772827, + "learning_rate": 9.306829188462076e-05, + "loss": 1.9512, + "step": 6339 + }, + { + "epoch": 1.9459791282995704, + "grad_norm": 0.4089502990245819, + "learning_rate": 9.306576670130885e-05, + "loss": 1.9607, + "step": 6340 + }, + { + "epoch": 1.9462860650705953, + "grad_norm": 0.508836567401886, + "learning_rate": 
9.306324109239578e-05, + "loss": 1.9187, + "step": 6341 + }, + { + "epoch": 1.9465930018416207, + "grad_norm": 0.637534499168396, + "learning_rate": 9.306071505790652e-05, + "loss": 1.8237, + "step": 6342 + }, + { + "epoch": 1.9468999386126458, + "grad_norm": 0.5845112800598145, + "learning_rate": 9.305818859786603e-05, + "loss": 1.8238, + "step": 6343 + }, + { + "epoch": 1.9472068753836709, + "grad_norm": 0.4168374240398407, + "learning_rate": 9.305566171229932e-05, + "loss": 1.9343, + "step": 6344 + }, + { + "epoch": 1.9475138121546962, + "grad_norm": 0.43040701746940613, + "learning_rate": 9.305313440123129e-05, + "loss": 1.8774, + "step": 6345 + }, + { + "epoch": 1.9478207489257213, + "grad_norm": 0.6011641025543213, + "learning_rate": 9.305060666468696e-05, + "loss": 1.89, + "step": 6346 + }, + { + "epoch": 1.9481276856967464, + "grad_norm": 0.5530022382736206, + "learning_rate": 9.304807850269131e-05, + "loss": 2.0006, + "step": 6347 + }, + { + "epoch": 1.9484346224677718, + "grad_norm": 0.3707423210144043, + "learning_rate": 9.30455499152693e-05, + "loss": 1.9116, + "step": 6348 + }, + { + "epoch": 1.9487415592387967, + "grad_norm": 0.5013771653175354, + "learning_rate": 9.304302090244595e-05, + "loss": 1.8902, + "step": 6349 + }, + { + "epoch": 1.949048496009822, + "grad_norm": 0.5873609781265259, + "learning_rate": 9.304049146424623e-05, + "loss": 1.8879, + "step": 6350 + }, + { + "epoch": 1.949355432780847, + "grad_norm": 0.4389801621437073, + "learning_rate": 9.303796160069516e-05, + "loss": 1.9215, + "step": 6351 + }, + { + "epoch": 1.9496623695518722, + "grad_norm": 0.4004434645175934, + "learning_rate": 9.303543131181772e-05, + "loss": 1.9137, + "step": 6352 + }, + { + "epoch": 1.9499693063228976, + "grad_norm": 0.4928852617740631, + "learning_rate": 9.303290059763892e-05, + "loss": 1.9415, + "step": 6353 + }, + { + "epoch": 1.9502762430939227, + "grad_norm": 0.5045879483222961, + "learning_rate": 9.303036945818377e-05, + "loss": 1.8727, + "step": 
6354 + }, + { + "epoch": 1.9505831798649478, + "grad_norm": 0.3434823453426361, + "learning_rate": 9.30278378934773e-05, + "loss": 1.8971, + "step": 6355 + }, + { + "epoch": 1.9508901166359731, + "grad_norm": 0.42980003356933594, + "learning_rate": 9.302530590354452e-05, + "loss": 1.9233, + "step": 6356 + }, + { + "epoch": 1.951197053406998, + "grad_norm": 0.3832406997680664, + "learning_rate": 9.302277348841042e-05, + "loss": 1.9317, + "step": 6357 + }, + { + "epoch": 1.9515039901780233, + "grad_norm": 0.37214264273643494, + "learning_rate": 9.30202406481001e-05, + "loss": 1.9172, + "step": 6358 + }, + { + "epoch": 1.9518109269490485, + "grad_norm": 0.3601585924625397, + "learning_rate": 9.30177073826385e-05, + "loss": 1.9286, + "step": 6359 + }, + { + "epoch": 1.9521178637200736, + "grad_norm": 0.36419349908828735, + "learning_rate": 9.301517369205072e-05, + "loss": 1.8624, + "step": 6360 + }, + { + "epoch": 1.952424800491099, + "grad_norm": 0.3808813691139221, + "learning_rate": 9.30126395763618e-05, + "loss": 1.8656, + "step": 6361 + }, + { + "epoch": 1.952731737262124, + "grad_norm": 0.39045700430870056, + "learning_rate": 9.301010503559675e-05, + "loss": 1.9205, + "step": 6362 + }, + { + "epoch": 1.9530386740331491, + "grad_norm": 0.37281444668769836, + "learning_rate": 9.300757006978065e-05, + "loss": 1.9162, + "step": 6363 + }, + { + "epoch": 1.9533456108041745, + "grad_norm": 0.4525204002857208, + "learning_rate": 9.300503467893851e-05, + "loss": 1.8999, + "step": 6364 + }, + { + "epoch": 1.9536525475751993, + "grad_norm": 0.41406187415122986, + "learning_rate": 9.300249886309542e-05, + "loss": 1.9804, + "step": 6365 + }, + { + "epoch": 1.9539594843462247, + "grad_norm": 0.4125058650970459, + "learning_rate": 9.299996262227644e-05, + "loss": 1.8464, + "step": 6366 + }, + { + "epoch": 1.9542664211172498, + "grad_norm": 0.41582876443862915, + "learning_rate": 9.299742595650663e-05, + "loss": 1.9937, + "step": 6367 + }, + { + "epoch": 1.954573357888275, + 
"grad_norm": 0.4360882639884949, + "learning_rate": 9.299488886581103e-05, + "loss": 1.9064, + "step": 6368 + }, + { + "epoch": 1.9548802946593002, + "grad_norm": 0.38369372487068176, + "learning_rate": 9.299235135021476e-05, + "loss": 1.9202, + "step": 6369 + }, + { + "epoch": 1.9551872314303254, + "grad_norm": 0.34401383996009827, + "learning_rate": 9.298981340974287e-05, + "loss": 1.844, + "step": 6370 + }, + { + "epoch": 1.9554941682013505, + "grad_norm": 0.3434326946735382, + "learning_rate": 9.298727504442044e-05, + "loss": 1.8206, + "step": 6371 + }, + { + "epoch": 1.9558011049723758, + "grad_norm": 0.35966724157333374, + "learning_rate": 9.298473625427257e-05, + "loss": 1.9, + "step": 6372 + }, + { + "epoch": 1.9561080417434007, + "grad_norm": 0.3726016581058502, + "learning_rate": 9.298219703932434e-05, + "loss": 1.9004, + "step": 6373 + }, + { + "epoch": 1.956414978514426, + "grad_norm": 0.3377366364002228, + "learning_rate": 9.297965739960084e-05, + "loss": 1.8747, + "step": 6374 + }, + { + "epoch": 1.9567219152854514, + "grad_norm": 0.36824578046798706, + "learning_rate": 9.297711733512718e-05, + "loss": 1.9059, + "step": 6375 + }, + { + "epoch": 1.9570288520564763, + "grad_norm": 0.3434023857116699, + "learning_rate": 9.297457684592847e-05, + "loss": 1.8624, + "step": 6376 + }, + { + "epoch": 1.9573357888275016, + "grad_norm": 0.36236703395843506, + "learning_rate": 9.297203593202979e-05, + "loss": 1.8558, + "step": 6377 + }, + { + "epoch": 1.9576427255985267, + "grad_norm": 0.3326953947544098, + "learning_rate": 9.296949459345625e-05, + "loss": 1.9189, + "step": 6378 + }, + { + "epoch": 1.9579496623695518, + "grad_norm": 0.3358452022075653, + "learning_rate": 9.2966952830233e-05, + "loss": 1.8601, + "step": 6379 + }, + { + "epoch": 1.9582565991405771, + "grad_norm": 0.36092114448547363, + "learning_rate": 9.296441064238514e-05, + "loss": 1.873, + "step": 6380 + }, + { + "epoch": 1.9585635359116023, + "grad_norm": 0.345683217048645, + "learning_rate": 
9.296186802993778e-05, + "loss": 1.9122, + "step": 6381 + }, + { + "epoch": 1.9588704726826274, + "grad_norm": 0.32488611340522766, + "learning_rate": 9.295932499291606e-05, + "loss": 1.8709, + "step": 6382 + }, + { + "epoch": 1.9591774094536527, + "grad_norm": 0.34276288747787476, + "learning_rate": 9.295678153134512e-05, + "loss": 1.937, + "step": 6383 + }, + { + "epoch": 1.9594843462246776, + "grad_norm": 0.3953622877597809, + "learning_rate": 9.295423764525008e-05, + "loss": 1.9357, + "step": 6384 + }, + { + "epoch": 1.959791282995703, + "grad_norm": 0.37806951999664307, + "learning_rate": 9.29516933346561e-05, + "loss": 1.8813, + "step": 6385 + }, + { + "epoch": 1.960098219766728, + "grad_norm": 0.39551272988319397, + "learning_rate": 9.29491485995883e-05, + "loss": 1.8812, + "step": 6386 + }, + { + "epoch": 1.9604051565377532, + "grad_norm": 0.37042370438575745, + "learning_rate": 9.294660344007184e-05, + "loss": 1.9059, + "step": 6387 + }, + { + "epoch": 1.9607120933087785, + "grad_norm": 0.37503576278686523, + "learning_rate": 9.294405785613187e-05, + "loss": 1.9792, + "step": 6388 + }, + { + "epoch": 1.9610190300798036, + "grad_norm": 0.3515741229057312, + "learning_rate": 9.294151184779355e-05, + "loss": 1.8792, + "step": 6389 + }, + { + "epoch": 1.9613259668508287, + "grad_norm": 0.319890558719635, + "learning_rate": 9.293896541508205e-05, + "loss": 1.9222, + "step": 6390 + }, + { + "epoch": 1.961632903621854, + "grad_norm": 0.3517487645149231, + "learning_rate": 9.293641855802252e-05, + "loss": 1.8751, + "step": 6391 + }, + { + "epoch": 1.961939840392879, + "grad_norm": 0.33269986510276794, + "learning_rate": 9.293387127664012e-05, + "loss": 1.8372, + "step": 6392 + }, + { + "epoch": 1.9622467771639043, + "grad_norm": 0.36048516631126404, + "learning_rate": 9.293132357096007e-05, + "loss": 1.8944, + "step": 6393 + }, + { + "epoch": 1.9625537139349294, + "grad_norm": 0.4329642057418823, + "learning_rate": 9.292877544100751e-05, + "loss": 1.9868, + 
"step": 6394 + }, + { + "epoch": 1.9628606507059545, + "grad_norm": 0.445496529340744, + "learning_rate": 9.292622688680762e-05, + "loss": 1.9885, + "step": 6395 + }, + { + "epoch": 1.9631675874769798, + "grad_norm": 0.3818886876106262, + "learning_rate": 9.292367790838561e-05, + "loss": 1.9515, + "step": 6396 + }, + { + "epoch": 1.963474524248005, + "grad_norm": 0.3800121545791626, + "learning_rate": 9.292112850576664e-05, + "loss": 1.8838, + "step": 6397 + }, + { + "epoch": 1.96378146101903, + "grad_norm": 0.44252321124076843, + "learning_rate": 9.291857867897593e-05, + "loss": 1.9296, + "step": 6398 + }, + { + "epoch": 1.9640883977900554, + "grad_norm": 0.463766485452652, + "learning_rate": 9.291602842803867e-05, + "loss": 1.9164, + "step": 6399 + }, + { + "epoch": 1.9643953345610803, + "grad_norm": 0.4599217474460602, + "learning_rate": 9.291347775298006e-05, + "loss": 1.9277, + "step": 6400 + }, + { + "epoch": 1.9647022713321056, + "grad_norm": 0.371346652507782, + "learning_rate": 9.291092665382532e-05, + "loss": 1.9036, + "step": 6401 + }, + { + "epoch": 1.9650092081031307, + "grad_norm": 0.327197402715683, + "learning_rate": 9.290837513059965e-05, + "loss": 1.8214, + "step": 6402 + }, + { + "epoch": 1.9653161448741558, + "grad_norm": 0.3346688747406006, + "learning_rate": 9.290582318332826e-05, + "loss": 1.8671, + "step": 6403 + }, + { + "epoch": 1.9656230816451812, + "grad_norm": 0.342208594083786, + "learning_rate": 9.290327081203637e-05, + "loss": 1.9143, + "step": 6404 + }, + { + "epoch": 1.9659300184162063, + "grad_norm": 0.3430559039115906, + "learning_rate": 9.290071801674923e-05, + "loss": 1.9135, + "step": 6405 + }, + { + "epoch": 1.9662369551872314, + "grad_norm": 0.3335573971271515, + "learning_rate": 9.289816479749202e-05, + "loss": 1.9011, + "step": 6406 + }, + { + "epoch": 1.9665438919582567, + "grad_norm": 0.3464879095554352, + "learning_rate": 9.289561115429004e-05, + "loss": 1.9061, + "step": 6407 + }, + { + "epoch": 1.9668508287292816, + 
"grad_norm": 0.3513408899307251, + "learning_rate": 9.289305708716847e-05, + "loss": 1.8982, + "step": 6408 + }, + { + "epoch": 1.967157765500307, + "grad_norm": 0.3888663947582245, + "learning_rate": 9.289050259615256e-05, + "loss": 1.9196, + "step": 6409 + }, + { + "epoch": 1.967464702271332, + "grad_norm": 0.3414073884487152, + "learning_rate": 9.288794768126759e-05, + "loss": 1.932, + "step": 6410 + }, + { + "epoch": 1.9677716390423572, + "grad_norm": 0.33067384362220764, + "learning_rate": 9.288539234253876e-05, + "loss": 1.8547, + "step": 6411 + }, + { + "epoch": 1.9680785758133825, + "grad_norm": 0.31827688217163086, + "learning_rate": 9.288283657999135e-05, + "loss": 1.8691, + "step": 6412 + }, + { + "epoch": 1.9683855125844076, + "grad_norm": 0.32259073853492737, + "learning_rate": 9.288028039365062e-05, + "loss": 1.8889, + "step": 6413 + }, + { + "epoch": 1.9686924493554327, + "grad_norm": 0.37552687525749207, + "learning_rate": 9.287772378354182e-05, + "loss": 1.8709, + "step": 6414 + }, + { + "epoch": 1.968999386126458, + "grad_norm": 0.3446151316165924, + "learning_rate": 9.287516674969024e-05, + "loss": 1.8749, + "step": 6415 + }, + { + "epoch": 1.969306322897483, + "grad_norm": 0.3648208975791931, + "learning_rate": 9.287260929212111e-05, + "loss": 1.93, + "step": 6416 + }, + { + "epoch": 1.9696132596685083, + "grad_norm": 0.3430599868297577, + "learning_rate": 9.287005141085974e-05, + "loss": 1.8537, + "step": 6417 + }, + { + "epoch": 1.9699201964395334, + "grad_norm": 0.39110586047172546, + "learning_rate": 9.286749310593139e-05, + "loss": 1.987, + "step": 6418 + }, + { + "epoch": 1.9702271332105585, + "grad_norm": 0.4033393859863281, + "learning_rate": 9.286493437736136e-05, + "loss": 1.9793, + "step": 6419 + }, + { + "epoch": 1.9705340699815839, + "grad_norm": 0.3950151205062866, + "learning_rate": 9.286237522517491e-05, + "loss": 1.8781, + "step": 6420 + }, + { + "epoch": 1.970841006752609, + "grad_norm": 0.4614053964614868, + "learning_rate": 
9.285981564939735e-05, + "loss": 1.9886, + "step": 6421 + }, + { + "epoch": 1.971147943523634, + "grad_norm": 0.4990023076534271, + "learning_rate": 9.285725565005398e-05, + "loss": 1.8957, + "step": 6422 + }, + { + "epoch": 1.9714548802946594, + "grad_norm": 0.501301109790802, + "learning_rate": 9.285469522717008e-05, + "loss": 1.8606, + "step": 6423 + }, + { + "epoch": 1.9717618170656843, + "grad_norm": 0.3820148706436157, + "learning_rate": 9.285213438077097e-05, + "loss": 1.9097, + "step": 6424 + }, + { + "epoch": 1.9720687538367097, + "grad_norm": 0.3959129750728607, + "learning_rate": 9.284957311088193e-05, + "loss": 1.8972, + "step": 6425 + }, + { + "epoch": 1.9723756906077348, + "grad_norm": 0.4914678931236267, + "learning_rate": 9.284701141752831e-05, + "loss": 1.9211, + "step": 6426 + }, + { + "epoch": 1.9726826273787599, + "grad_norm": 0.5992010831832886, + "learning_rate": 9.284444930073542e-05, + "loss": 1.917, + "step": 6427 + }, + { + "epoch": 1.9729895641497852, + "grad_norm": 0.6089407801628113, + "learning_rate": 9.284188676052856e-05, + "loss": 1.9497, + "step": 6428 + }, + { + "epoch": 1.9732965009208103, + "grad_norm": 0.5493173003196716, + "learning_rate": 9.283932379693306e-05, + "loss": 1.9888, + "step": 6429 + }, + { + "epoch": 1.9736034376918354, + "grad_norm": 0.4451984167098999, + "learning_rate": 9.283676040997426e-05, + "loss": 1.892, + "step": 6430 + }, + { + "epoch": 1.9739103744628608, + "grad_norm": 0.35765743255615234, + "learning_rate": 9.283419659967748e-05, + "loss": 1.8768, + "step": 6431 + }, + { + "epoch": 1.9742173112338857, + "grad_norm": 0.36561164259910583, + "learning_rate": 9.283163236606807e-05, + "loss": 1.825, + "step": 6432 + }, + { + "epoch": 1.974524248004911, + "grad_norm": 0.38473913073539734, + "learning_rate": 9.282906770917137e-05, + "loss": 1.9247, + "step": 6433 + }, + { + "epoch": 1.974831184775936, + "grad_norm": 0.324945867061615, + "learning_rate": 9.28265026290127e-05, + "loss": 1.8832, + "step": 6434 
+ }, + { + "epoch": 1.9751381215469612, + "grad_norm": 0.38697487115859985, + "learning_rate": 9.282393712561744e-05, + "loss": 1.9282, + "step": 6435 + }, + { + "epoch": 1.9754450583179866, + "grad_norm": 0.3772333264350891, + "learning_rate": 9.282137119901094e-05, + "loss": 1.8822, + "step": 6436 + }, + { + "epoch": 1.9757519950890117, + "grad_norm": 0.3522745668888092, + "learning_rate": 9.281880484921854e-05, + "loss": 1.9102, + "step": 6437 + }, + { + "epoch": 1.9760589318600368, + "grad_norm": 0.36745330691337585, + "learning_rate": 9.281623807626562e-05, + "loss": 1.8842, + "step": 6438 + }, + { + "epoch": 1.9763658686310621, + "grad_norm": 0.3990548253059387, + "learning_rate": 9.281367088017755e-05, + "loss": 1.9642, + "step": 6439 + }, + { + "epoch": 1.976672805402087, + "grad_norm": 0.3333520293235779, + "learning_rate": 9.281110326097969e-05, + "loss": 1.8541, + "step": 6440 + }, + { + "epoch": 1.9769797421731123, + "grad_norm": 0.3282802700996399, + "learning_rate": 9.280853521869739e-05, + "loss": 1.8416, + "step": 6441 + }, + { + "epoch": 1.9772866789441375, + "grad_norm": 0.3415268361568451, + "learning_rate": 9.280596675335607e-05, + "loss": 1.9009, + "step": 6442 + }, + { + "epoch": 1.9775936157151626, + "grad_norm": 0.3621836006641388, + "learning_rate": 9.28033978649811e-05, + "loss": 1.8584, + "step": 6443 + }, + { + "epoch": 1.977900552486188, + "grad_norm": 0.34778010845184326, + "learning_rate": 9.280082855359786e-05, + "loss": 1.9455, + "step": 6444 + }, + { + "epoch": 1.978207489257213, + "grad_norm": 0.36525633931159973, + "learning_rate": 9.279825881923174e-05, + "loss": 1.9182, + "step": 6445 + }, + { + "epoch": 1.9785144260282381, + "grad_norm": 0.3404203951358795, + "learning_rate": 9.279568866190815e-05, + "loss": 1.8853, + "step": 6446 + }, + { + "epoch": 1.9788213627992635, + "grad_norm": 0.4564785659313202, + "learning_rate": 9.279311808165249e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 1.9791282995702886, + 
"grad_norm": 0.4371441602706909, + "learning_rate": 9.279054707849015e-05, + "loss": 1.9372, + "step": 6448 + }, + { + "epoch": 1.9794352363413137, + "grad_norm": 0.3928726017475128, + "learning_rate": 9.278797565244652e-05, + "loss": 1.882, + "step": 6449 + }, + { + "epoch": 1.979742173112339, + "grad_norm": 0.483331561088562, + "learning_rate": 9.278540380354706e-05, + "loss": 1.9664, + "step": 6450 + }, + { + "epoch": 1.980049109883364, + "grad_norm": 0.39085066318511963, + "learning_rate": 9.278283153181716e-05, + "loss": 1.874, + "step": 6451 + }, + { + "epoch": 1.9803560466543892, + "grad_norm": 0.3549460172653198, + "learning_rate": 9.278025883728224e-05, + "loss": 1.9108, + "step": 6452 + }, + { + "epoch": 1.9806629834254144, + "grad_norm": 0.4260072410106659, + "learning_rate": 9.277768571996772e-05, + "loss": 1.8621, + "step": 6453 + }, + { + "epoch": 1.9809699201964395, + "grad_norm": 0.4531188905239105, + "learning_rate": 9.277511217989904e-05, + "loss": 1.9924, + "step": 6454 + }, + { + "epoch": 1.9812768569674648, + "grad_norm": 0.34916743636131287, + "learning_rate": 9.277253821710165e-05, + "loss": 1.9459, + "step": 6455 + }, + { + "epoch": 1.98158379373849, + "grad_norm": 0.45466169714927673, + "learning_rate": 9.276996383160095e-05, + "loss": 1.9129, + "step": 6456 + }, + { + "epoch": 1.981890730509515, + "grad_norm": 0.4948022663593292, + "learning_rate": 9.27673890234224e-05, + "loss": 1.9362, + "step": 6457 + }, + { + "epoch": 1.9821976672805404, + "grad_norm": 0.43365779519081116, + "learning_rate": 9.276481379259146e-05, + "loss": 1.9323, + "step": 6458 + }, + { + "epoch": 1.9825046040515653, + "grad_norm": 0.5301255583763123, + "learning_rate": 9.276223813913354e-05, + "loss": 1.9611, + "step": 6459 + }, + { + "epoch": 1.9828115408225906, + "grad_norm": 0.4785257577896118, + "learning_rate": 9.275966206307412e-05, + "loss": 1.8945, + "step": 6460 + }, + { + "epoch": 1.9831184775936157, + "grad_norm": 0.4091590940952301, + "learning_rate": 
9.275708556443868e-05, + "loss": 1.9171, + "step": 6461 + }, + { + "epoch": 1.9834254143646408, + "grad_norm": 0.4031025767326355, + "learning_rate": 9.275450864325264e-05, + "loss": 1.9518, + "step": 6462 + }, + { + "epoch": 1.9837323511356661, + "grad_norm": 0.39147642254829407, + "learning_rate": 9.275193129954149e-05, + "loss": 1.8756, + "step": 6463 + }, + { + "epoch": 1.9840392879066913, + "grad_norm": 0.3863523006439209, + "learning_rate": 9.27493535333307e-05, + "loss": 1.8894, + "step": 6464 + }, + { + "epoch": 1.9843462246777164, + "grad_norm": 0.36373165249824524, + "learning_rate": 9.274677534464576e-05, + "loss": 1.8574, + "step": 6465 + }, + { + "epoch": 1.9846531614487417, + "grad_norm": 0.40247389674186707, + "learning_rate": 9.274419673351211e-05, + "loss": 1.832, + "step": 6466 + }, + { + "epoch": 1.9849600982197666, + "grad_norm": 0.3874013125896454, + "learning_rate": 9.274161769995526e-05, + "loss": 1.9079, + "step": 6467 + }, + { + "epoch": 1.985267034990792, + "grad_norm": 0.35506606101989746, + "learning_rate": 9.27390382440007e-05, + "loss": 1.8784, + "step": 6468 + }, + { + "epoch": 1.985573971761817, + "grad_norm": 0.406325101852417, + "learning_rate": 9.273645836567388e-05, + "loss": 1.9822, + "step": 6469 + }, + { + "epoch": 1.9858809085328422, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.273387806500036e-05, + "loss": 1.9334, + "step": 6470 + }, + { + "epoch": 1.9861878453038675, + "grad_norm": 0.4810343384742737, + "learning_rate": 9.273129734200561e-05, + "loss": 1.9598, + "step": 6471 + }, + { + "epoch": 1.9864947820748926, + "grad_norm": 0.4552834630012512, + "learning_rate": 9.272871619671513e-05, + "loss": 1.9504, + "step": 6472 + }, + { + "epoch": 1.9868017188459177, + "grad_norm": 0.38974207639694214, + "learning_rate": 9.272613462915443e-05, + "loss": 1.8811, + "step": 6473 + }, + { + "epoch": 1.987108655616943, + "grad_norm": 0.40983298420906067, + "learning_rate": 9.272355263934902e-05, + "loss": 1.8876, + 
"step": 6474 + }, + { + "epoch": 1.987415592387968, + "grad_norm": 0.3684757947921753, + "learning_rate": 9.272097022732443e-05, + "loss": 1.921, + "step": 6475 + }, + { + "epoch": 1.9877225291589933, + "grad_norm": 0.38384270668029785, + "learning_rate": 9.271838739310618e-05, + "loss": 1.9099, + "step": 6476 + }, + { + "epoch": 1.9880294659300184, + "grad_norm": 0.3783731460571289, + "learning_rate": 9.271580413671976e-05, + "loss": 1.9322, + "step": 6477 + }, + { + "epoch": 1.9883364027010435, + "grad_norm": 0.3686216473579407, + "learning_rate": 9.271322045819076e-05, + "loss": 1.914, + "step": 6478 + }, + { + "epoch": 1.9886433394720688, + "grad_norm": 0.38776305317878723, + "learning_rate": 9.271063635754466e-05, + "loss": 1.9331, + "step": 6479 + }, + { + "epoch": 1.988950276243094, + "grad_norm": 0.35099950432777405, + "learning_rate": 9.270805183480702e-05, + "loss": 1.9837, + "step": 6480 + }, + { + "epoch": 1.989257213014119, + "grad_norm": 0.3736453652381897, + "learning_rate": 9.270546689000339e-05, + "loss": 1.846, + "step": 6481 + }, + { + "epoch": 1.9895641497851444, + "grad_norm": 0.3654848635196686, + "learning_rate": 9.27028815231593e-05, + "loss": 1.8987, + "step": 6482 + }, + { + "epoch": 1.9898710865561693, + "grad_norm": 0.3534870147705078, + "learning_rate": 9.27002957343003e-05, + "loss": 1.868, + "step": 6483 + }, + { + "epoch": 1.9901780233271946, + "grad_norm": 0.3143392503261566, + "learning_rate": 9.269770952345197e-05, + "loss": 1.8042, + "step": 6484 + }, + { + "epoch": 1.9904849600982197, + "grad_norm": 0.37151026725769043, + "learning_rate": 9.269512289063982e-05, + "loss": 1.8392, + "step": 6485 + }, + { + "epoch": 1.9907918968692448, + "grad_norm": 0.39781463146209717, + "learning_rate": 9.269253583588947e-05, + "loss": 1.9911, + "step": 6486 + }, + { + "epoch": 1.9910988336402702, + "grad_norm": 0.44022107124328613, + "learning_rate": 9.268994835922643e-05, + "loss": 1.9644, + "step": 6487 + }, + { + "epoch": 1.9914057704112953, 
+ "grad_norm": 0.4058530628681183, + "learning_rate": 9.268736046067632e-05, + "loss": 1.9062, + "step": 6488 + }, + { + "epoch": 1.9917127071823204, + "grad_norm": 0.3754481077194214, + "learning_rate": 9.268477214026467e-05, + "loss": 1.8278, + "step": 6489 + }, + { + "epoch": 1.9920196439533457, + "grad_norm": 0.318208247423172, + "learning_rate": 9.268218339801711e-05, + "loss": 1.8529, + "step": 6490 + }, + { + "epoch": 1.9923265807243706, + "grad_norm": 0.350777268409729, + "learning_rate": 9.267959423395918e-05, + "loss": 1.9024, + "step": 6491 + }, + { + "epoch": 1.992633517495396, + "grad_norm": 0.3145158588886261, + "learning_rate": 9.26770046481165e-05, + "loss": 1.934, + "step": 6492 + }, + { + "epoch": 1.992940454266421, + "grad_norm": 0.3347548842430115, + "learning_rate": 9.267441464051463e-05, + "loss": 1.8989, + "step": 6493 + }, + { + "epoch": 1.9932473910374462, + "grad_norm": 0.33111512660980225, + "learning_rate": 9.267182421117919e-05, + "loss": 1.8808, + "step": 6494 + }, + { + "epoch": 1.9935543278084715, + "grad_norm": 0.3135010898113251, + "learning_rate": 9.266923336013577e-05, + "loss": 1.895, + "step": 6495 + }, + { + "epoch": 1.9938612645794966, + "grad_norm": 0.3638830780982971, + "learning_rate": 9.266664208740998e-05, + "loss": 1.9331, + "step": 6496 + }, + { + "epoch": 1.9941682013505218, + "grad_norm": 0.3592624068260193, + "learning_rate": 9.266405039302743e-05, + "loss": 1.8963, + "step": 6497 + }, + { + "epoch": 1.994475138121547, + "grad_norm": 0.34216129779815674, + "learning_rate": 9.266145827701371e-05, + "loss": 1.9062, + "step": 6498 + }, + { + "epoch": 1.994782074892572, + "grad_norm": 0.4180343747138977, + "learning_rate": 9.265886573939447e-05, + "loss": 1.9351, + "step": 6499 + }, + { + "epoch": 1.9950890116635973, + "grad_norm": 0.36890342831611633, + "learning_rate": 9.265627278019531e-05, + "loss": 1.9037, + "step": 6500 + }, + { + "epoch": 1.9953959484346224, + "grad_norm": 0.36638152599334717, + "learning_rate": 
9.265367939944188e-05, + "loss": 1.9524, + "step": 6501 + }, + { + "epoch": 1.9957028852056475, + "grad_norm": 0.44918373227119446, + "learning_rate": 9.265108559715976e-05, + "loss": 1.9236, + "step": 6502 + }, + { + "epoch": 1.9960098219766729, + "grad_norm": 0.3805326521396637, + "learning_rate": 9.264849137337462e-05, + "loss": 1.8526, + "step": 6503 + }, + { + "epoch": 1.996316758747698, + "grad_norm": 0.39035212993621826, + "learning_rate": 9.26458967281121e-05, + "loss": 1.8256, + "step": 6504 + }, + { + "epoch": 1.996623695518723, + "grad_norm": 0.330522358417511, + "learning_rate": 9.264330166139783e-05, + "loss": 1.8487, + "step": 6505 + }, + { + "epoch": 1.9969306322897484, + "grad_norm": 0.33569198846817017, + "learning_rate": 9.264070617325746e-05, + "loss": 1.8735, + "step": 6506 + }, + { + "epoch": 1.9972375690607733, + "grad_norm": 0.4121384918689728, + "learning_rate": 9.263811026371664e-05, + "loss": 2.0028, + "step": 6507 + }, + { + "epoch": 1.9975445058317987, + "grad_norm": 0.3419879972934723, + "learning_rate": 9.263551393280103e-05, + "loss": 1.8432, + "step": 6508 + }, + { + "epoch": 1.9978514426028238, + "grad_norm": 0.33369818329811096, + "learning_rate": 9.263291718053626e-05, + "loss": 1.8752, + "step": 6509 + }, + { + "epoch": 1.9981583793738489, + "grad_norm": 0.3580996096134186, + "learning_rate": 9.263032000694804e-05, + "loss": 1.9319, + "step": 6510 + }, + { + "epoch": 1.9984653161448742, + "grad_norm": 0.38216903805732727, + "learning_rate": 9.2627722412062e-05, + "loss": 1.9424, + "step": 6511 + }, + { + "epoch": 1.9987722529158993, + "grad_norm": 0.3836761713027954, + "learning_rate": 9.26251243959038e-05, + "loss": 1.9259, + "step": 6512 + }, + { + "epoch": 1.9990791896869244, + "grad_norm": 0.34978967905044556, + "learning_rate": 9.262252595849917e-05, + "loss": 1.8648, + "step": 6513 + }, + { + "epoch": 1.9993861264579498, + "grad_norm": 0.4190160632133484, + "learning_rate": 9.261992709987375e-05, + "loss": 1.9456, + "step": 
6514 + }, + { + "epoch": 1.9996930632289747, + "grad_norm": 0.38700881600379944, + "learning_rate": 9.261732782005322e-05, + "loss": 1.8768, + "step": 6515 + }, + { + "epoch": 2.0, + "grad_norm": 0.3706338405609131, + "learning_rate": 9.261472811906328e-05, + "loss": 1.9247, + "step": 6516 + }, + { + "epoch": 2.0003069367710253, + "grad_norm": 0.36679908633232117, + "learning_rate": 9.261212799692962e-05, + "loss": 1.8193, + "step": 6517 + }, + { + "epoch": 2.0006138735420502, + "grad_norm": 0.45219072699546814, + "learning_rate": 9.260952745367795e-05, + "loss": 1.9019, + "step": 6518 + }, + { + "epoch": 2.0009208103130756, + "grad_norm": 0.6038491725921631, + "learning_rate": 9.260692648933393e-05, + "loss": 1.8834, + "step": 6519 + }, + { + "epoch": 2.001227747084101, + "grad_norm": 0.5823990106582642, + "learning_rate": 9.260432510392331e-05, + "loss": 1.9066, + "step": 6520 + }, + { + "epoch": 2.001534683855126, + "grad_norm": 0.4731088876724243, + "learning_rate": 9.260172329747178e-05, + "loss": 1.8997, + "step": 6521 + }, + { + "epoch": 2.001841620626151, + "grad_norm": 0.3397974669933319, + "learning_rate": 9.259912107000504e-05, + "loss": 1.9396, + "step": 6522 + }, + { + "epoch": 2.002148557397176, + "grad_norm": 0.374734103679657, + "learning_rate": 9.259651842154882e-05, + "loss": 1.9311, + "step": 6523 + }, + { + "epoch": 2.0024554941682013, + "grad_norm": 0.48218441009521484, + "learning_rate": 9.259391535212884e-05, + "loss": 1.948, + "step": 6524 + }, + { + "epoch": 2.0027624309392267, + "grad_norm": 0.40540626645088196, + "learning_rate": 9.259131186177082e-05, + "loss": 1.8541, + "step": 6525 + }, + { + "epoch": 2.0030693677102516, + "grad_norm": 0.3698440492153168, + "learning_rate": 9.258870795050048e-05, + "loss": 1.9622, + "step": 6526 + }, + { + "epoch": 2.003376304481277, + "grad_norm": 0.35084524750709534, + "learning_rate": 9.258610361834358e-05, + "loss": 1.8882, + "step": 6527 + }, + { + "epoch": 2.0036832412523022, + "grad_norm": 
0.38982072472572327, + "learning_rate": 9.258349886532584e-05, + "loss": 1.9523, + "step": 6528 + }, + { + "epoch": 2.003990178023327, + "grad_norm": 0.3737744390964508, + "learning_rate": 9.258089369147302e-05, + "loss": 1.9091, + "step": 6529 + }, + { + "epoch": 2.0042971147943525, + "grad_norm": 0.36094167828559875, + "learning_rate": 9.257828809681083e-05, + "loss": 1.8711, + "step": 6530 + }, + { + "epoch": 2.0046040515653774, + "grad_norm": 0.3270244896411896, + "learning_rate": 9.257568208136506e-05, + "loss": 1.8738, + "step": 6531 + }, + { + "epoch": 2.0049109883364027, + "grad_norm": 0.3320237100124359, + "learning_rate": 9.257307564516145e-05, + "loss": 1.8889, + "step": 6532 + }, + { + "epoch": 2.005217925107428, + "grad_norm": 0.3091014623641968, + "learning_rate": 9.257046878822573e-05, + "loss": 1.8683, + "step": 6533 + }, + { + "epoch": 2.005524861878453, + "grad_norm": 0.3234712779521942, + "learning_rate": 9.25678615105837e-05, + "loss": 1.8787, + "step": 6534 + }, + { + "epoch": 2.0058317986494782, + "grad_norm": 0.38402292132377625, + "learning_rate": 9.25652538122611e-05, + "loss": 1.9414, + "step": 6535 + }, + { + "epoch": 2.0061387354205036, + "grad_norm": 0.41379863023757935, + "learning_rate": 9.256264569328372e-05, + "loss": 1.9185, + "step": 6536 + }, + { + "epoch": 2.0064456721915285, + "grad_norm": 0.35990384221076965, + "learning_rate": 9.256003715367733e-05, + "loss": 1.8756, + "step": 6537 + }, + { + "epoch": 2.006752608962554, + "grad_norm": 0.3489217460155487, + "learning_rate": 9.25574281934677e-05, + "loss": 1.8984, + "step": 6538 + }, + { + "epoch": 2.0070595457335787, + "grad_norm": 0.326541006565094, + "learning_rate": 9.255481881268064e-05, + "loss": 1.8559, + "step": 6539 + }, + { + "epoch": 2.007366482504604, + "grad_norm": 0.40900397300720215, + "learning_rate": 9.25522090113419e-05, + "loss": 1.8832, + "step": 6540 + }, + { + "epoch": 2.0076734192756294, + "grad_norm": 0.4130956828594208, + "learning_rate": 
9.254959878947731e-05, + "loss": 1.8437, + "step": 6541 + }, + { + "epoch": 2.0079803560466543, + "grad_norm": 0.38869336247444153, + "learning_rate": 9.254698814711263e-05, + "loss": 1.8839, + "step": 6542 + }, + { + "epoch": 2.0082872928176796, + "grad_norm": 0.37832918763160706, + "learning_rate": 9.254437708427368e-05, + "loss": 1.9519, + "step": 6543 + }, + { + "epoch": 2.008594229588705, + "grad_norm": 0.35336560010910034, + "learning_rate": 9.254176560098625e-05, + "loss": 1.8928, + "step": 6544 + }, + { + "epoch": 2.00890116635973, + "grad_norm": 0.347260981798172, + "learning_rate": 9.253915369727617e-05, + "loss": 1.9133, + "step": 6545 + }, + { + "epoch": 2.009208103130755, + "grad_norm": 0.3706999719142914, + "learning_rate": 9.253654137316923e-05, + "loss": 1.9048, + "step": 6546 + }, + { + "epoch": 2.00951503990178, + "grad_norm": 0.40080907940864563, + "learning_rate": 9.253392862869127e-05, + "loss": 1.9169, + "step": 6547 + }, + { + "epoch": 2.0098219766728054, + "grad_norm": 0.3635334074497223, + "learning_rate": 9.253131546386808e-05, + "loss": 1.8623, + "step": 6548 + }, + { + "epoch": 2.0101289134438307, + "grad_norm": 0.32642990350723267, + "learning_rate": 9.252870187872552e-05, + "loss": 1.8624, + "step": 6549 + }, + { + "epoch": 2.0104358502148556, + "grad_norm": 0.32467779517173767, + "learning_rate": 9.25260878732894e-05, + "loss": 1.8867, + "step": 6550 + }, + { + "epoch": 2.010742786985881, + "grad_norm": 0.3496699631214142, + "learning_rate": 9.252347344758553e-05, + "loss": 1.8441, + "step": 6551 + }, + { + "epoch": 2.0110497237569063, + "grad_norm": 0.3624981939792633, + "learning_rate": 9.252085860163981e-05, + "loss": 1.9045, + "step": 6552 + }, + { + "epoch": 2.011356660527931, + "grad_norm": 0.3801099359989166, + "learning_rate": 9.251824333547801e-05, + "loss": 1.9273, + "step": 6553 + }, + { + "epoch": 2.0116635972989565, + "grad_norm": 0.355866402387619, + "learning_rate": 9.251562764912602e-05, + "loss": 1.9032, + "step": 
6554 + }, + { + "epoch": 2.0119705340699814, + "grad_norm": 0.31210052967071533, + "learning_rate": 9.251301154260968e-05, + "loss": 1.8148, + "step": 6555 + }, + { + "epoch": 2.0122774708410067, + "grad_norm": 0.3583676218986511, + "learning_rate": 9.251039501595485e-05, + "loss": 1.9326, + "step": 6556 + }, + { + "epoch": 2.012584407612032, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.250777806918737e-05, + "loss": 1.8968, + "step": 6557 + }, + { + "epoch": 2.012891344383057, + "grad_norm": 0.3403627574443817, + "learning_rate": 9.250516070233311e-05, + "loss": 1.8956, + "step": 6558 + }, + { + "epoch": 2.0131982811540823, + "grad_norm": 0.37752729654312134, + "learning_rate": 9.250254291541796e-05, + "loss": 1.9136, + "step": 6559 + }, + { + "epoch": 2.0135052179251076, + "grad_norm": 0.3661794364452362, + "learning_rate": 9.249992470846774e-05, + "loss": 1.8796, + "step": 6560 + }, + { + "epoch": 2.0138121546961325, + "grad_norm": 0.315603643655777, + "learning_rate": 9.249730608150837e-05, + "loss": 1.8711, + "step": 6561 + }, + { + "epoch": 2.014119091467158, + "grad_norm": 0.3187065124511719, + "learning_rate": 9.249468703456571e-05, + "loss": 1.8611, + "step": 6562 + }, + { + "epoch": 2.0144260282381827, + "grad_norm": 0.3018025755882263, + "learning_rate": 9.249206756766564e-05, + "loss": 1.786, + "step": 6563 + }, + { + "epoch": 2.014732965009208, + "grad_norm": 0.344963401556015, + "learning_rate": 9.248944768083406e-05, + "loss": 1.9428, + "step": 6564 + }, + { + "epoch": 2.0150399017802334, + "grad_norm": 0.29776978492736816, + "learning_rate": 9.248682737409687e-05, + "loss": 1.8089, + "step": 6565 + }, + { + "epoch": 2.0153468385512583, + "grad_norm": 0.348982572555542, + "learning_rate": 9.248420664747992e-05, + "loss": 1.8407, + "step": 6566 + }, + { + "epoch": 2.0156537753222836, + "grad_norm": 0.3413224518299103, + "learning_rate": 9.248158550100915e-05, + "loss": 1.9802, + "step": 6567 + }, + { + "epoch": 2.015960712093309, + 
"grad_norm": 0.3598950505256653, + "learning_rate": 9.247896393471044e-05, + "loss": 1.8882, + "step": 6568 + }, + { + "epoch": 2.016267648864334, + "grad_norm": 0.3609221875667572, + "learning_rate": 9.247634194860974e-05, + "loss": 1.934, + "step": 6569 + }, + { + "epoch": 2.016574585635359, + "grad_norm": 0.3893497586250305, + "learning_rate": 9.247371954273291e-05, + "loss": 1.8808, + "step": 6570 + }, + { + "epoch": 2.016881522406384, + "grad_norm": 0.347417950630188, + "learning_rate": 9.24710967171059e-05, + "loss": 1.863, + "step": 6571 + }, + { + "epoch": 2.0171884591774094, + "grad_norm": 0.35378298163414, + "learning_rate": 9.246847347175461e-05, + "loss": 1.8664, + "step": 6572 + }, + { + "epoch": 2.0174953959484347, + "grad_norm": 0.2819608151912689, + "learning_rate": 9.246584980670499e-05, + "loss": 1.9007, + "step": 6573 + }, + { + "epoch": 2.0178023327194596, + "grad_norm": 0.32445117831230164, + "learning_rate": 9.246322572198293e-05, + "loss": 1.9176, + "step": 6574 + }, + { + "epoch": 2.018109269490485, + "grad_norm": 0.33579203486442566, + "learning_rate": 9.24606012176144e-05, + "loss": 1.8192, + "step": 6575 + }, + { + "epoch": 2.0184162062615103, + "grad_norm": 0.40369588136672974, + "learning_rate": 9.245797629362532e-05, + "loss": 1.8731, + "step": 6576 + }, + { + "epoch": 2.018723143032535, + "grad_norm": 0.34241169691085815, + "learning_rate": 9.245535095004163e-05, + "loss": 1.8555, + "step": 6577 + }, + { + "epoch": 2.0190300798035605, + "grad_norm": 0.3627666234970093, + "learning_rate": 9.245272518688927e-05, + "loss": 1.9212, + "step": 6578 + }, + { + "epoch": 2.0193370165745854, + "grad_norm": 0.3330884873867035, + "learning_rate": 9.245009900419422e-05, + "loss": 1.8727, + "step": 6579 + }, + { + "epoch": 2.0196439533456108, + "grad_norm": 0.3259236514568329, + "learning_rate": 9.244747240198239e-05, + "loss": 1.8471, + "step": 6580 + }, + { + "epoch": 2.019950890116636, + "grad_norm": 0.3715277910232544, + "learning_rate": 
9.244484538027976e-05, + "loss": 1.8925, + "step": 6581 + }, + { + "epoch": 2.020257826887661, + "grad_norm": 0.4752909541130066, + "learning_rate": 9.24422179391123e-05, + "loss": 1.889, + "step": 6582 + }, + { + "epoch": 2.0205647636586863, + "grad_norm": 0.5166791677474976, + "learning_rate": 9.243959007850597e-05, + "loss": 1.8637, + "step": 6583 + }, + { + "epoch": 2.0208717004297116, + "grad_norm": 0.5350266695022583, + "learning_rate": 9.243696179848673e-05, + "loss": 1.8916, + "step": 6584 + }, + { + "epoch": 2.0211786372007365, + "grad_norm": 0.6115607619285583, + "learning_rate": 9.243433309908055e-05, + "loss": 1.8847, + "step": 6585 + }, + { + "epoch": 2.021485573971762, + "grad_norm": 0.5915576219558716, + "learning_rate": 9.243170398031343e-05, + "loss": 1.8889, + "step": 6586 + }, + { + "epoch": 2.021792510742787, + "grad_norm": 0.4547630846500397, + "learning_rate": 9.242907444221134e-05, + "loss": 1.8752, + "step": 6587 + }, + { + "epoch": 2.022099447513812, + "grad_norm": 0.39437413215637207, + "learning_rate": 9.242644448480027e-05, + "loss": 1.9318, + "step": 6588 + }, + { + "epoch": 2.0224063842848374, + "grad_norm": 0.39216291904449463, + "learning_rate": 9.24238141081062e-05, + "loss": 1.8799, + "step": 6589 + }, + { + "epoch": 2.0227133210558623, + "grad_norm": 0.4100605547428131, + "learning_rate": 9.242118331215513e-05, + "loss": 1.9278, + "step": 6590 + }, + { + "epoch": 2.0230202578268877, + "grad_norm": 0.38527074456214905, + "learning_rate": 9.241855209697307e-05, + "loss": 1.9085, + "step": 6591 + }, + { + "epoch": 2.023327194597913, + "grad_norm": 0.39856311678886414, + "learning_rate": 9.241592046258602e-05, + "loss": 1.8057, + "step": 6592 + }, + { + "epoch": 2.023634131368938, + "grad_norm": 0.4070499539375305, + "learning_rate": 9.241328840902e-05, + "loss": 1.8099, + "step": 6593 + }, + { + "epoch": 2.023941068139963, + "grad_norm": 0.40319183468818665, + "learning_rate": 9.241065593630097e-05, + "loss": 1.8654, + "step": 6594 + 
}, + { + "epoch": 2.0242480049109886, + "grad_norm": 0.3788430988788605, + "learning_rate": 9.240802304445499e-05, + "loss": 1.9419, + "step": 6595 + }, + { + "epoch": 2.0245549416820134, + "grad_norm": 0.3656894564628601, + "learning_rate": 9.240538973350809e-05, + "loss": 1.8625, + "step": 6596 + }, + { + "epoch": 2.0248618784530388, + "grad_norm": 0.4384852945804596, + "learning_rate": 9.240275600348625e-05, + "loss": 1.8893, + "step": 6597 + }, + { + "epoch": 2.0251688152240637, + "grad_norm": 0.5054775476455688, + "learning_rate": 9.240012185441554e-05, + "loss": 1.826, + "step": 6598 + }, + { + "epoch": 2.025475751995089, + "grad_norm": 0.4576725959777832, + "learning_rate": 9.239748728632196e-05, + "loss": 1.9319, + "step": 6599 + }, + { + "epoch": 2.0257826887661143, + "grad_norm": 0.40581515431404114, + "learning_rate": 9.239485229923157e-05, + "loss": 1.905, + "step": 6600 + }, + { + "epoch": 2.0260896255371392, + "grad_norm": 0.3168322443962097, + "learning_rate": 9.23922168931704e-05, + "loss": 1.8937, + "step": 6601 + }, + { + "epoch": 2.0263965623081646, + "grad_norm": 0.39211124181747437, + "learning_rate": 9.238958106816449e-05, + "loss": 1.8346, + "step": 6602 + }, + { + "epoch": 2.02670349907919, + "grad_norm": 0.4722496569156647, + "learning_rate": 9.23869448242399e-05, + "loss": 1.933, + "step": 6603 + }, + { + "epoch": 2.027010435850215, + "grad_norm": 0.47029170393943787, + "learning_rate": 9.238430816142268e-05, + "loss": 1.8873, + "step": 6604 + }, + { + "epoch": 2.02731737262124, + "grad_norm": 0.36421555280685425, + "learning_rate": 9.238167107973888e-05, + "loss": 1.8311, + "step": 6605 + }, + { + "epoch": 2.027624309392265, + "grad_norm": 0.36506712436676025, + "learning_rate": 9.237903357921455e-05, + "loss": 1.9025, + "step": 6606 + }, + { + "epoch": 2.0279312461632903, + "grad_norm": 0.5055087208747864, + "learning_rate": 9.237639565987579e-05, + "loss": 1.9138, + "step": 6607 + }, + { + "epoch": 2.0282381829343157, + "grad_norm": 
0.5850993394851685, + "learning_rate": 9.237375732174867e-05, + "loss": 1.869, + "step": 6608 + }, + { + "epoch": 2.0285451197053406, + "grad_norm": 0.5053986310958862, + "learning_rate": 9.237111856485921e-05, + "loss": 1.8196, + "step": 6609 + }, + { + "epoch": 2.028852056476366, + "grad_norm": 0.40635839104652405, + "learning_rate": 9.236847938923354e-05, + "loss": 1.8399, + "step": 6610 + }, + { + "epoch": 2.0291589932473912, + "grad_norm": 0.32075709104537964, + "learning_rate": 9.236583979489771e-05, + "loss": 1.8532, + "step": 6611 + }, + { + "epoch": 2.029465930018416, + "grad_norm": 0.4474230408668518, + "learning_rate": 9.236319978187783e-05, + "loss": 1.8807, + "step": 6612 + }, + { + "epoch": 2.0297728667894415, + "grad_norm": 0.5391832590103149, + "learning_rate": 9.236055935019998e-05, + "loss": 1.8887, + "step": 6613 + }, + { + "epoch": 2.0300798035604664, + "grad_norm": 0.5129361748695374, + "learning_rate": 9.235791849989024e-05, + "loss": 1.8541, + "step": 6614 + }, + { + "epoch": 2.0303867403314917, + "grad_norm": 0.33113735914230347, + "learning_rate": 9.235527723097474e-05, + "loss": 1.8611, + "step": 6615 + }, + { + "epoch": 2.030693677102517, + "grad_norm": 0.3526761531829834, + "learning_rate": 9.235263554347956e-05, + "loss": 1.8436, + "step": 6616 + }, + { + "epoch": 2.031000613873542, + "grad_norm": 0.4380190670490265, + "learning_rate": 9.234999343743081e-05, + "loss": 1.854, + "step": 6617 + }, + { + "epoch": 2.0313075506445673, + "grad_norm": 0.4300559163093567, + "learning_rate": 9.23473509128546e-05, + "loss": 1.919, + "step": 6618 + }, + { + "epoch": 2.0316144874155926, + "grad_norm": 0.3445209860801697, + "learning_rate": 9.234470796977705e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 2.0319214241866175, + "grad_norm": 0.35759109258651733, + "learning_rate": 9.234206460822428e-05, + "loss": 1.9244, + "step": 6620 + }, + { + "epoch": 2.032228360957643, + "grad_norm": 0.432804137468338, + "learning_rate": 
9.23394208282224e-05, + "loss": 1.9312, + "step": 6621 + }, + { + "epoch": 2.0325352977286677, + "grad_norm": 0.446865439414978, + "learning_rate": 9.233677662979756e-05, + "loss": 1.8791, + "step": 6622 + }, + { + "epoch": 2.032842234499693, + "grad_norm": 0.37617436051368713, + "learning_rate": 9.233413201297588e-05, + "loss": 1.8794, + "step": 6623 + }, + { + "epoch": 2.0331491712707184, + "grad_norm": 0.33695775270462036, + "learning_rate": 9.233148697778349e-05, + "loss": 1.8649, + "step": 6624 + }, + { + "epoch": 2.0334561080417433, + "grad_norm": 0.3893069624900818, + "learning_rate": 9.232884152424654e-05, + "loss": 1.899, + "step": 6625 + }, + { + "epoch": 2.0337630448127686, + "grad_norm": 0.38993194699287415, + "learning_rate": 9.232619565239116e-05, + "loss": 1.8994, + "step": 6626 + }, + { + "epoch": 2.034069981583794, + "grad_norm": 0.3725507855415344, + "learning_rate": 9.23235493622435e-05, + "loss": 1.8758, + "step": 6627 + }, + { + "epoch": 2.034376918354819, + "grad_norm": 0.3236019015312195, + "learning_rate": 9.232090265382973e-05, + "loss": 1.9041, + "step": 6628 + }, + { + "epoch": 2.034683855125844, + "grad_norm": 0.3399617671966553, + "learning_rate": 9.231825552717599e-05, + "loss": 1.9081, + "step": 6629 + }, + { + "epoch": 2.034990791896869, + "grad_norm": 0.352096289396286, + "learning_rate": 9.231560798230845e-05, + "loss": 1.9001, + "step": 6630 + }, + { + "epoch": 2.0352977286678944, + "grad_norm": 0.39621952176094055, + "learning_rate": 9.231296001925327e-05, + "loss": 1.9258, + "step": 6631 + }, + { + "epoch": 2.0356046654389197, + "grad_norm": 0.36686012148857117, + "learning_rate": 9.23103116380366e-05, + "loss": 1.9325, + "step": 6632 + }, + { + "epoch": 2.0359116022099446, + "grad_norm": 0.36286696791648865, + "learning_rate": 9.230766283868466e-05, + "loss": 1.9623, + "step": 6633 + }, + { + "epoch": 2.03621853898097, + "grad_norm": 0.34748387336730957, + "learning_rate": 9.230501362122359e-05, + "loss": 1.8326, + "step": 6634 
+ }, + { + "epoch": 2.0365254757519953, + "grad_norm": 0.350993275642395, + "learning_rate": 9.230236398567958e-05, + "loss": 1.8333, + "step": 6635 + }, + { + "epoch": 2.03683241252302, + "grad_norm": 0.3181723356246948, + "learning_rate": 9.229971393207881e-05, + "loss": 1.8852, + "step": 6636 + }, + { + "epoch": 2.0371393492940455, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.229706346044747e-05, + "loss": 1.8833, + "step": 6637 + }, + { + "epoch": 2.0374462860650704, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.229441257081176e-05, + "loss": 1.8546, + "step": 6638 + }, + { + "epoch": 2.0377532228360957, + "grad_norm": 0.3659566342830658, + "learning_rate": 9.229176126319788e-05, + "loss": 1.8687, + "step": 6639 + }, + { + "epoch": 2.038060159607121, + "grad_norm": 0.379779577255249, + "learning_rate": 9.228910953763204e-05, + "loss": 1.9208, + "step": 6640 + }, + { + "epoch": 2.038367096378146, + "grad_norm": 0.4496903121471405, + "learning_rate": 9.228645739414042e-05, + "loss": 1.9471, + "step": 6641 + }, + { + "epoch": 2.0386740331491713, + "grad_norm": 0.37597209215164185, + "learning_rate": 9.228380483274923e-05, + "loss": 1.9047, + "step": 6642 + }, + { + "epoch": 2.0389809699201966, + "grad_norm": 0.3739323019981384, + "learning_rate": 9.228115185348471e-05, + "loss": 1.9697, + "step": 6643 + }, + { + "epoch": 2.0392879066912215, + "grad_norm": 0.3524092435836792, + "learning_rate": 9.227849845637306e-05, + "loss": 1.8716, + "step": 6644 + }, + { + "epoch": 2.039594843462247, + "grad_norm": 0.36939096450805664, + "learning_rate": 9.227584464144051e-05, + "loss": 1.9836, + "step": 6645 + }, + { + "epoch": 2.0399017802332717, + "grad_norm": 0.39015519618988037, + "learning_rate": 9.22731904087133e-05, + "loss": 1.907, + "step": 6646 + }, + { + "epoch": 2.040208717004297, + "grad_norm": 0.3725626468658447, + "learning_rate": 9.227053575821763e-05, + "loss": 1.9483, + "step": 6647 + }, + { + "epoch": 2.0405156537753224, + "grad_norm": 
0.41595613956451416, + "learning_rate": 9.226788068997974e-05, + "loss": 1.9352, + "step": 6648 + }, + { + "epoch": 2.0408225905463473, + "grad_norm": 0.4026443660259247, + "learning_rate": 9.226522520402589e-05, + "loss": 1.9166, + "step": 6649 + }, + { + "epoch": 2.0411295273173726, + "grad_norm": 0.39883533120155334, + "learning_rate": 9.226256930038233e-05, + "loss": 1.8594, + "step": 6650 + }, + { + "epoch": 2.041436464088398, + "grad_norm": 0.35540083050727844, + "learning_rate": 9.225991297907526e-05, + "loss": 1.9065, + "step": 6651 + }, + { + "epoch": 2.041743400859423, + "grad_norm": 0.3799804747104645, + "learning_rate": 9.225725624013097e-05, + "loss": 1.9232, + "step": 6652 + }, + { + "epoch": 2.042050337630448, + "grad_norm": 0.37289959192276, + "learning_rate": 9.225459908357572e-05, + "loss": 1.9679, + "step": 6653 + }, + { + "epoch": 2.042357274401473, + "grad_norm": 0.38069143891334534, + "learning_rate": 9.225194150943574e-05, + "loss": 1.9699, + "step": 6654 + }, + { + "epoch": 2.0426642111724984, + "grad_norm": 0.43708884716033936, + "learning_rate": 9.224928351773731e-05, + "loss": 1.8907, + "step": 6655 + }, + { + "epoch": 2.0429711479435237, + "grad_norm": 0.47203195095062256, + "learning_rate": 9.22466251085067e-05, + "loss": 1.9615, + "step": 6656 + }, + { + "epoch": 2.0432780847145486, + "grad_norm": 0.405129998922348, + "learning_rate": 9.224396628177019e-05, + "loss": 1.9165, + "step": 6657 + }, + { + "epoch": 2.043585021485574, + "grad_norm": 0.33447468280792236, + "learning_rate": 9.224130703755403e-05, + "loss": 1.852, + "step": 6658 + }, + { + "epoch": 2.0438919582565993, + "grad_norm": 0.33780771493911743, + "learning_rate": 9.223864737588453e-05, + "loss": 1.875, + "step": 6659 + }, + { + "epoch": 2.044198895027624, + "grad_norm": 0.37942594289779663, + "learning_rate": 9.223598729678796e-05, + "loss": 1.9115, + "step": 6660 + }, + { + "epoch": 2.0445058317986495, + "grad_norm": 0.3368874192237854, + "learning_rate": 
9.223332680029059e-05, + "loss": 1.822, + "step": 6661 + }, + { + "epoch": 2.044812768569675, + "grad_norm": 0.3029201924800873, + "learning_rate": 9.223066588641873e-05, + "loss": 1.8902, + "step": 6662 + }, + { + "epoch": 2.0451197053406998, + "grad_norm": 0.4605506360530853, + "learning_rate": 9.22280045551987e-05, + "loss": 1.9164, + "step": 6663 + }, + { + "epoch": 2.045426642111725, + "grad_norm": 0.5012617111206055, + "learning_rate": 9.222534280665675e-05, + "loss": 1.8859, + "step": 6664 + }, + { + "epoch": 2.04573357888275, + "grad_norm": 0.5177115797996521, + "learning_rate": 9.222268064081924e-05, + "loss": 1.93, + "step": 6665 + }, + { + "epoch": 2.0460405156537753, + "grad_norm": 0.3966628313064575, + "learning_rate": 9.222001805771244e-05, + "loss": 1.8817, + "step": 6666 + }, + { + "epoch": 2.0463474524248007, + "grad_norm": 0.3670666813850403, + "learning_rate": 9.221735505736269e-05, + "loss": 1.8224, + "step": 6667 + }, + { + "epoch": 2.0466543891958255, + "grad_norm": 0.4584221839904785, + "learning_rate": 9.221469163979628e-05, + "loss": 1.7788, + "step": 6668 + }, + { + "epoch": 2.046961325966851, + "grad_norm": 0.5598693490028381, + "learning_rate": 9.221202780503954e-05, + "loss": 1.9263, + "step": 6669 + }, + { + "epoch": 2.047268262737876, + "grad_norm": 0.44200289249420166, + "learning_rate": 9.22093635531188e-05, + "loss": 1.8455, + "step": 6670 + }, + { + "epoch": 2.047575199508901, + "grad_norm": 0.33257725834846497, + "learning_rate": 9.22066988840604e-05, + "loss": 1.9019, + "step": 6671 + }, + { + "epoch": 2.0478821362799264, + "grad_norm": 0.4716290831565857, + "learning_rate": 9.220403379789066e-05, + "loss": 1.9012, + "step": 6672 + }, + { + "epoch": 2.0481890730509513, + "grad_norm": 0.5600453615188599, + "learning_rate": 9.220136829463591e-05, + "loss": 1.9158, + "step": 6673 + }, + { + "epoch": 2.0484960098219767, + "grad_norm": 0.5345216393470764, + "learning_rate": 9.219870237432252e-05, + "loss": 1.931, + "step": 6674 + }, 
+ { + "epoch": 2.048802946593002, + "grad_norm": 0.36617112159729004, + "learning_rate": 9.219603603697682e-05, + "loss": 1.9019, + "step": 6675 + }, + { + "epoch": 2.049109883364027, + "grad_norm": 0.33677804470062256, + "learning_rate": 9.219336928262514e-05, + "loss": 1.8897, + "step": 6676 + }, + { + "epoch": 2.049416820135052, + "grad_norm": 0.48563066124916077, + "learning_rate": 9.219070211129388e-05, + "loss": 1.9147, + "step": 6677 + }, + { + "epoch": 2.0497237569060776, + "grad_norm": 0.5029729008674622, + "learning_rate": 9.218803452300935e-05, + "loss": 1.8926, + "step": 6678 + }, + { + "epoch": 2.0500306936771024, + "grad_norm": 0.3969452977180481, + "learning_rate": 9.218536651779795e-05, + "loss": 1.9337, + "step": 6679 + }, + { + "epoch": 2.050337630448128, + "grad_norm": 0.37374138832092285, + "learning_rate": 9.218269809568603e-05, + "loss": 1.9147, + "step": 6680 + }, + { + "epoch": 2.0506445672191527, + "grad_norm": 0.416608065366745, + "learning_rate": 9.218002925669996e-05, + "loss": 1.975, + "step": 6681 + }, + { + "epoch": 2.050951503990178, + "grad_norm": 0.35848283767700195, + "learning_rate": 9.217736000086612e-05, + "loss": 1.9194, + "step": 6682 + }, + { + "epoch": 2.0512584407612033, + "grad_norm": 0.3294626772403717, + "learning_rate": 9.217469032821088e-05, + "loss": 1.8541, + "step": 6683 + }, + { + "epoch": 2.0515653775322282, + "grad_norm": 0.4164618253707886, + "learning_rate": 9.217202023876064e-05, + "loss": 1.8999, + "step": 6684 + }, + { + "epoch": 2.0518723143032536, + "grad_norm": 0.4067288935184479, + "learning_rate": 9.216934973254179e-05, + "loss": 1.8609, + "step": 6685 + }, + { + "epoch": 2.052179251074279, + "grad_norm": 0.38743069767951965, + "learning_rate": 9.216667880958069e-05, + "loss": 1.8571, + "step": 6686 + }, + { + "epoch": 2.052486187845304, + "grad_norm": 0.3430919647216797, + "learning_rate": 9.216400746990377e-05, + "loss": 1.9229, + "step": 6687 + }, + { + "epoch": 2.052793124616329, + "grad_norm": 
0.3512028753757477, + "learning_rate": 9.21613357135374e-05, + "loss": 1.9331, + "step": 6688 + }, + { + "epoch": 2.053100061387354, + "grad_norm": 0.3708036541938782, + "learning_rate": 9.215866354050799e-05, + "loss": 1.8499, + "step": 6689 + }, + { + "epoch": 2.0534069981583793, + "grad_norm": 0.39376455545425415, + "learning_rate": 9.215599095084199e-05, + "loss": 1.8531, + "step": 6690 + }, + { + "epoch": 2.0537139349294047, + "grad_norm": 0.3855830430984497, + "learning_rate": 9.215331794456576e-05, + "loss": 1.8597, + "step": 6691 + }, + { + "epoch": 2.0540208717004296, + "grad_norm": 0.3515113592147827, + "learning_rate": 9.215064452170574e-05, + "loss": 1.8776, + "step": 6692 + }, + { + "epoch": 2.054327808471455, + "grad_norm": 0.3165057897567749, + "learning_rate": 9.214797068228833e-05, + "loss": 1.926, + "step": 6693 + }, + { + "epoch": 2.0546347452424802, + "grad_norm": 0.3516407310962677, + "learning_rate": 9.214529642633998e-05, + "loss": 1.9397, + "step": 6694 + }, + { + "epoch": 2.054941682013505, + "grad_norm": 0.36943888664245605, + "learning_rate": 9.214262175388713e-05, + "loss": 1.9114, + "step": 6695 + }, + { + "epoch": 2.0552486187845305, + "grad_norm": 0.3490065634250641, + "learning_rate": 9.213994666495616e-05, + "loss": 1.8637, + "step": 6696 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.30341869592666626, + "learning_rate": 9.213727115957356e-05, + "loss": 1.8525, + "step": 6697 + }, + { + "epoch": 2.0558624923265807, + "grad_norm": 0.3899247646331787, + "learning_rate": 9.213459523776573e-05, + "loss": 2.0578, + "step": 6698 + }, + { + "epoch": 2.056169429097606, + "grad_norm": 0.34904104471206665, + "learning_rate": 9.213191889955915e-05, + "loss": 1.9135, + "step": 6699 + }, + { + "epoch": 2.056476365868631, + "grad_norm": 0.3806450366973877, + "learning_rate": 9.212924214498024e-05, + "loss": 1.9252, + "step": 6700 + }, + { + "epoch": 2.0567833026396563, + "grad_norm": 0.33185848593711853, + "learning_rate": 
9.212656497405547e-05, + "loss": 1.8457, + "step": 6701 + }, + { + "epoch": 2.0570902394106816, + "grad_norm": 0.356717050075531, + "learning_rate": 9.21238873868113e-05, + "loss": 1.9086, + "step": 6702 + }, + { + "epoch": 2.0573971761817065, + "grad_norm": 0.41743260622024536, + "learning_rate": 9.212120938327418e-05, + "loss": 1.9255, + "step": 6703 + }, + { + "epoch": 2.057704112952732, + "grad_norm": 0.3937377631664276, + "learning_rate": 9.211853096347058e-05, + "loss": 1.9529, + "step": 6704 + }, + { + "epoch": 2.0580110497237567, + "grad_norm": 0.43980923295021057, + "learning_rate": 9.211585212742698e-05, + "loss": 1.905, + "step": 6705 + }, + { + "epoch": 2.058317986494782, + "grad_norm": 0.36891186237335205, + "learning_rate": 9.211317287516984e-05, + "loss": 1.8109, + "step": 6706 + }, + { + "epoch": 2.0586249232658074, + "grad_norm": 0.3582547605037689, + "learning_rate": 9.211049320672563e-05, + "loss": 1.9633, + "step": 6707 + }, + { + "epoch": 2.0589318600368323, + "grad_norm": 0.3421446979045868, + "learning_rate": 9.210781312212087e-05, + "loss": 1.8956, + "step": 6708 + }, + { + "epoch": 2.0592387968078576, + "grad_norm": 0.34717023372650146, + "learning_rate": 9.210513262138199e-05, + "loss": 1.837, + "step": 6709 + }, + { + "epoch": 2.059545733578883, + "grad_norm": 0.32769930362701416, + "learning_rate": 9.210245170453553e-05, + "loss": 1.8588, + "step": 6710 + }, + { + "epoch": 2.059852670349908, + "grad_norm": 0.3694380223751068, + "learning_rate": 9.209977037160796e-05, + "loss": 1.9298, + "step": 6711 + }, + { + "epoch": 2.060159607120933, + "grad_norm": 0.38598594069480896, + "learning_rate": 9.209708862262578e-05, + "loss": 1.9011, + "step": 6712 + }, + { + "epoch": 2.060466543891958, + "grad_norm": 0.33520397543907166, + "learning_rate": 9.20944064576155e-05, + "loss": 1.9689, + "step": 6713 + }, + { + "epoch": 2.0607734806629834, + "grad_norm": 0.36898335814476013, + "learning_rate": 9.209172387660363e-05, + "loss": 1.9362, + "step": 
6714 + }, + { + "epoch": 2.0610804174340087, + "grad_norm": 0.3989763855934143, + "learning_rate": 9.208904087961667e-05, + "loss": 1.8875, + "step": 6715 + }, + { + "epoch": 2.0613873542050336, + "grad_norm": 0.38079237937927246, + "learning_rate": 9.208635746668113e-05, + "loss": 1.8645, + "step": 6716 + }, + { + "epoch": 2.061694290976059, + "grad_norm": 0.3853057026863098, + "learning_rate": 9.208367363782355e-05, + "loss": 1.9346, + "step": 6717 + }, + { + "epoch": 2.0620012277470843, + "grad_norm": 0.33557942509651184, + "learning_rate": 9.208098939307044e-05, + "loss": 1.8629, + "step": 6718 + }, + { + "epoch": 2.062308164518109, + "grad_norm": 0.31848183274269104, + "learning_rate": 9.207830473244832e-05, + "loss": 1.7616, + "step": 6719 + }, + { + "epoch": 2.0626151012891345, + "grad_norm": 0.2901391088962555, + "learning_rate": 9.207561965598375e-05, + "loss": 1.8876, + "step": 6720 + }, + { + "epoch": 2.06292203806016, + "grad_norm": 0.33935174345970154, + "learning_rate": 9.207293416370322e-05, + "loss": 1.8407, + "step": 6721 + }, + { + "epoch": 2.0632289748311847, + "grad_norm": 0.3615114390850067, + "learning_rate": 9.207024825563331e-05, + "loss": 1.8378, + "step": 6722 + }, + { + "epoch": 2.06353591160221, + "grad_norm": 0.35903334617614746, + "learning_rate": 9.206756193180053e-05, + "loss": 1.8316, + "step": 6723 + }, + { + "epoch": 2.063842848373235, + "grad_norm": 0.35222968459129333, + "learning_rate": 9.206487519223146e-05, + "loss": 1.8786, + "step": 6724 + }, + { + "epoch": 2.0641497851442603, + "grad_norm": 0.3412967622280121, + "learning_rate": 9.206218803695264e-05, + "loss": 1.8682, + "step": 6725 + }, + { + "epoch": 2.0644567219152856, + "grad_norm": 0.4166354835033417, + "learning_rate": 9.205950046599062e-05, + "loss": 1.8871, + "step": 6726 + }, + { + "epoch": 2.0647636586863105, + "grad_norm": 0.4631161093711853, + "learning_rate": 9.205681247937196e-05, + "loss": 1.9328, + "step": 6727 + }, + { + "epoch": 2.065070595457336, + 
"grad_norm": 0.39197248220443726, + "learning_rate": 9.205412407712325e-05, + "loss": 1.9434, + "step": 6728 + }, + { + "epoch": 2.0653775322283607, + "grad_norm": 0.37939852476119995, + "learning_rate": 9.205143525927103e-05, + "loss": 1.9115, + "step": 6729 + }, + { + "epoch": 2.065684468999386, + "grad_norm": 0.35442814230918884, + "learning_rate": 9.204874602584186e-05, + "loss": 1.9197, + "step": 6730 + }, + { + "epoch": 2.0659914057704114, + "grad_norm": 0.3598809242248535, + "learning_rate": 9.204605637686235e-05, + "loss": 1.8684, + "step": 6731 + }, + { + "epoch": 2.0662983425414363, + "grad_norm": 0.3360415995121002, + "learning_rate": 9.204336631235905e-05, + "loss": 1.8531, + "step": 6732 + }, + { + "epoch": 2.0666052793124616, + "grad_norm": 0.4487619698047638, + "learning_rate": 9.204067583235859e-05, + "loss": 1.8509, + "step": 6733 + }, + { + "epoch": 2.066912216083487, + "grad_norm": 0.37166881561279297, + "learning_rate": 9.203798493688753e-05, + "loss": 1.8826, + "step": 6734 + }, + { + "epoch": 2.067219152854512, + "grad_norm": 0.35294032096862793, + "learning_rate": 9.203529362597244e-05, + "loss": 1.9029, + "step": 6735 + }, + { + "epoch": 2.067526089625537, + "grad_norm": 0.4115317165851593, + "learning_rate": 9.203260189963995e-05, + "loss": 1.9117, + "step": 6736 + }, + { + "epoch": 2.0678330263965625, + "grad_norm": 0.44137999415397644, + "learning_rate": 9.202990975791666e-05, + "loss": 1.8754, + "step": 6737 + }, + { + "epoch": 2.0681399631675874, + "grad_norm": 0.46055081486701965, + "learning_rate": 9.202721720082916e-05, + "loss": 1.8322, + "step": 6738 + }, + { + "epoch": 2.0684468999386127, + "grad_norm": 0.38548141717910767, + "learning_rate": 9.202452422840407e-05, + "loss": 1.8341, + "step": 6739 + }, + { + "epoch": 2.0687538367096376, + "grad_norm": 0.3542765974998474, + "learning_rate": 9.2021830840668e-05, + "loss": 1.9301, + "step": 6740 + }, + { + "epoch": 2.069060773480663, + "grad_norm": 0.35987207293510437, + 
"learning_rate": 9.201913703764755e-05, + "loss": 1.8756, + "step": 6741 + }, + { + "epoch": 2.0693677102516883, + "grad_norm": 0.4297364056110382, + "learning_rate": 9.201644281936938e-05, + "loss": 1.8549, + "step": 6742 + }, + { + "epoch": 2.069674647022713, + "grad_norm": 0.3679873049259186, + "learning_rate": 9.20137481858601e-05, + "loss": 1.8905, + "step": 6743 + }, + { + "epoch": 2.0699815837937385, + "grad_norm": 0.3402685523033142, + "learning_rate": 9.201105313714632e-05, + "loss": 1.8834, + "step": 6744 + }, + { + "epoch": 2.070288520564764, + "grad_norm": 0.40986955165863037, + "learning_rate": 9.200835767325469e-05, + "loss": 1.8861, + "step": 6745 + }, + { + "epoch": 2.0705954573357888, + "grad_norm": 0.4305949807167053, + "learning_rate": 9.200566179421186e-05, + "loss": 1.8977, + "step": 6746 + }, + { + "epoch": 2.070902394106814, + "grad_norm": 0.3948439359664917, + "learning_rate": 9.200296550004446e-05, + "loss": 1.8801, + "step": 6747 + }, + { + "epoch": 2.071209330877839, + "grad_norm": 0.3404015600681305, + "learning_rate": 9.200026879077912e-05, + "loss": 1.8417, + "step": 6748 + }, + { + "epoch": 2.0715162676488643, + "grad_norm": 0.39447101950645447, + "learning_rate": 9.199757166644252e-05, + "loss": 1.9675, + "step": 6749 + }, + { + "epoch": 2.0718232044198897, + "grad_norm": 0.44323647022247314, + "learning_rate": 9.199487412706129e-05, + "loss": 1.9014, + "step": 6750 + }, + { + "epoch": 2.0721301411909145, + "grad_norm": 0.47096556425094604, + "learning_rate": 9.199217617266212e-05, + "loss": 1.8783, + "step": 6751 + }, + { + "epoch": 2.07243707796194, + "grad_norm": 0.42863038182258606, + "learning_rate": 9.198947780327163e-05, + "loss": 1.8369, + "step": 6752 + }, + { + "epoch": 2.072744014732965, + "grad_norm": 0.414079874753952, + "learning_rate": 9.198677901891652e-05, + "loss": 1.9247, + "step": 6753 + }, + { + "epoch": 2.07305095150399, + "grad_norm": 0.3445589542388916, + "learning_rate": 9.198407981962345e-05, + "loss": 
1.8494, + "step": 6754 + }, + { + "epoch": 2.0733578882750154, + "grad_norm": 0.4340321719646454, + "learning_rate": 9.198138020541908e-05, + "loss": 1.904, + "step": 6755 + }, + { + "epoch": 2.0736648250460403, + "grad_norm": 0.55349200963974, + "learning_rate": 9.197868017633013e-05, + "loss": 1.9368, + "step": 6756 + }, + { + "epoch": 2.0739717618170657, + "grad_norm": 0.5893970727920532, + "learning_rate": 9.197597973238326e-05, + "loss": 1.9329, + "step": 6757 + }, + { + "epoch": 2.074278698588091, + "grad_norm": 0.4942009449005127, + "learning_rate": 9.197327887360514e-05, + "loss": 1.7726, + "step": 6758 + }, + { + "epoch": 2.074585635359116, + "grad_norm": 0.36411046981811523, + "learning_rate": 9.197057760002247e-05, + "loss": 1.8214, + "step": 6759 + }, + { + "epoch": 2.074892572130141, + "grad_norm": 0.31520166993141174, + "learning_rate": 9.196787591166198e-05, + "loss": 1.8491, + "step": 6760 + }, + { + "epoch": 2.0751995089011666, + "grad_norm": 0.47392621636390686, + "learning_rate": 9.196517380855032e-05, + "loss": 2.0165, + "step": 6761 + }, + { + "epoch": 2.0755064456721914, + "grad_norm": 0.4768085181713104, + "learning_rate": 9.196247129071423e-05, + "loss": 1.9289, + "step": 6762 + }, + { + "epoch": 2.075813382443217, + "grad_norm": 0.396391361951828, + "learning_rate": 9.195976835818039e-05, + "loss": 1.9521, + "step": 6763 + }, + { + "epoch": 2.0761203192142417, + "grad_norm": 0.4030967950820923, + "learning_rate": 9.195706501097551e-05, + "loss": 1.8386, + "step": 6764 + }, + { + "epoch": 2.076427255985267, + "grad_norm": 0.48308777809143066, + "learning_rate": 9.195436124912635e-05, + "loss": 1.8874, + "step": 6765 + }, + { + "epoch": 2.0767341927562923, + "grad_norm": 0.5232771635055542, + "learning_rate": 9.19516570726596e-05, + "loss": 1.8822, + "step": 6766 + }, + { + "epoch": 2.0770411295273172, + "grad_norm": 0.3607174754142761, + "learning_rate": 9.194895248160198e-05, + "loss": 1.8995, + "step": 6767 + }, + { + "epoch": 
2.0773480662983426, + "grad_norm": 0.4354429841041565, + "learning_rate": 9.194624747598022e-05, + "loss": 1.8629, + "step": 6768 + }, + { + "epoch": 2.077655003069368, + "grad_norm": 0.5405299067497253, + "learning_rate": 9.194354205582107e-05, + "loss": 1.8608, + "step": 6769 + }, + { + "epoch": 2.077961939840393, + "grad_norm": 0.5442025065422058, + "learning_rate": 9.194083622115123e-05, + "loss": 1.885, + "step": 6770 + }, + { + "epoch": 2.078268876611418, + "grad_norm": 0.4160112142562866, + "learning_rate": 9.193812997199749e-05, + "loss": 1.8617, + "step": 6771 + }, + { + "epoch": 2.078575813382443, + "grad_norm": 0.3550199866294861, + "learning_rate": 9.193542330838656e-05, + "loss": 1.9277, + "step": 6772 + }, + { + "epoch": 2.0788827501534684, + "grad_norm": 0.5224893093109131, + "learning_rate": 9.19327162303452e-05, + "loss": 1.7893, + "step": 6773 + }, + { + "epoch": 2.0791896869244937, + "grad_norm": 0.45021727681159973, + "learning_rate": 9.193000873790014e-05, + "loss": 1.8635, + "step": 6774 + }, + { + "epoch": 2.0794966236955186, + "grad_norm": 0.3087892532348633, + "learning_rate": 9.192730083107819e-05, + "loss": 1.842, + "step": 6775 + }, + { + "epoch": 2.079803560466544, + "grad_norm": 0.4304139018058777, + "learning_rate": 9.192459250990606e-05, + "loss": 1.8461, + "step": 6776 + }, + { + "epoch": 2.0801104972375692, + "grad_norm": 0.4388587474822998, + "learning_rate": 9.192188377441054e-05, + "loss": 1.8978, + "step": 6777 + }, + { + "epoch": 2.080417434008594, + "grad_norm": 0.3452616333961487, + "learning_rate": 9.19191746246184e-05, + "loss": 1.8849, + "step": 6778 + }, + { + "epoch": 2.0807243707796195, + "grad_norm": 0.3127618432044983, + "learning_rate": 9.191646506055638e-05, + "loss": 1.8703, + "step": 6779 + }, + { + "epoch": 2.0810313075506444, + "grad_norm": 0.3424977958202362, + "learning_rate": 9.191375508225131e-05, + "loss": 1.8446, + "step": 6780 + }, + { + "epoch": 2.0813382443216697, + "grad_norm": 0.3536671996116638, + 
"learning_rate": 9.191104468972993e-05, + "loss": 1.9079, + "step": 6781 + }, + { + "epoch": 2.081645181092695, + "grad_norm": 0.3689599633216858, + "learning_rate": 9.190833388301905e-05, + "loss": 1.8683, + "step": 6782 + }, + { + "epoch": 2.08195211786372, + "grad_norm": 0.30976906418800354, + "learning_rate": 9.190562266214546e-05, + "loss": 1.89, + "step": 6783 + }, + { + "epoch": 2.0822590546347453, + "grad_norm": 0.34682777523994446, + "learning_rate": 9.190291102713593e-05, + "loss": 1.8384, + "step": 6784 + }, + { + "epoch": 2.0825659914057706, + "grad_norm": 0.4135018587112427, + "learning_rate": 9.190019897801727e-05, + "loss": 1.8878, + "step": 6785 + }, + { + "epoch": 2.0828729281767955, + "grad_norm": 0.4247548580169678, + "learning_rate": 9.189748651481629e-05, + "loss": 1.9244, + "step": 6786 + }, + { + "epoch": 2.083179864947821, + "grad_norm": 0.3961609899997711, + "learning_rate": 9.18947736375598e-05, + "loss": 1.9539, + "step": 6787 + }, + { + "epoch": 2.0834868017188457, + "grad_norm": 0.4174231290817261, + "learning_rate": 9.18920603462746e-05, + "loss": 1.9705, + "step": 6788 + }, + { + "epoch": 2.083793738489871, + "grad_norm": 0.38771605491638184, + "learning_rate": 9.18893466409875e-05, + "loss": 1.9038, + "step": 6789 + }, + { + "epoch": 2.0841006752608964, + "grad_norm": 0.38480475544929504, + "learning_rate": 9.188663252172534e-05, + "loss": 1.8725, + "step": 6790 + }, + { + "epoch": 2.0844076120319213, + "grad_norm": 0.37508267164230347, + "learning_rate": 9.18839179885149e-05, + "loss": 1.8819, + "step": 6791 + }, + { + "epoch": 2.0847145488029466, + "grad_norm": 0.3970893621444702, + "learning_rate": 9.188120304138306e-05, + "loss": 1.9035, + "step": 6792 + }, + { + "epoch": 2.085021485573972, + "grad_norm": 0.42629706859588623, + "learning_rate": 9.18784876803566e-05, + "loss": 1.993, + "step": 6793 + }, + { + "epoch": 2.085328422344997, + "grad_norm": 0.40387317538261414, + "learning_rate": 9.18757719054624e-05, + "loss": 1.8987, 
+ "step": 6794 + }, + { + "epoch": 2.085635359116022, + "grad_norm": 0.40304768085479736, + "learning_rate": 9.187305571672726e-05, + "loss": 1.9017, + "step": 6795 + }, + { + "epoch": 2.0859422958870475, + "grad_norm": 0.34255313873291016, + "learning_rate": 9.187033911417805e-05, + "loss": 1.8406, + "step": 6796 + }, + { + "epoch": 2.0862492326580724, + "grad_norm": 0.34713810682296753, + "learning_rate": 9.18676220978416e-05, + "loss": 1.8773, + "step": 6797 + }, + { + "epoch": 2.0865561694290977, + "grad_norm": 0.3651806712150574, + "learning_rate": 9.186490466774478e-05, + "loss": 1.9158, + "step": 6798 + }, + { + "epoch": 2.0868631062001226, + "grad_norm": 0.3859401047229767, + "learning_rate": 9.186218682391443e-05, + "loss": 1.8488, + "step": 6799 + }, + { + "epoch": 2.087170042971148, + "grad_norm": 0.34309303760528564, + "learning_rate": 9.185946856637742e-05, + "loss": 1.8373, + "step": 6800 + }, + { + "epoch": 2.0874769797421733, + "grad_norm": 0.3597384989261627, + "learning_rate": 9.18567498951606e-05, + "loss": 1.8297, + "step": 6801 + }, + { + "epoch": 2.087783916513198, + "grad_norm": 0.39170950651168823, + "learning_rate": 9.185403081029085e-05, + "loss": 1.9623, + "step": 6802 + }, + { + "epoch": 2.0880908532842235, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.185131131179503e-05, + "loss": 1.8966, + "step": 6803 + }, + { + "epoch": 2.0883977900552484, + "grad_norm": 0.37869709730148315, + "learning_rate": 9.184859139970001e-05, + "loss": 1.9121, + "step": 6804 + }, + { + "epoch": 2.0887047268262737, + "grad_norm": 0.3808143436908722, + "learning_rate": 9.184587107403271e-05, + "loss": 1.918, + "step": 6805 + }, + { + "epoch": 2.089011663597299, + "grad_norm": 0.3864719271659851, + "learning_rate": 9.184315033481996e-05, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 2.089318600368324, + "grad_norm": 0.41121476888656616, + "learning_rate": 9.184042918208869e-05, + "loss": 1.8971, + "step": 6807 + }, + { + "epoch": 
2.0896255371393493, + "grad_norm": 0.33098986744880676, + "learning_rate": 9.183770761586576e-05, + "loss": 1.8497, + "step": 6808 + }, + { + "epoch": 2.0899324739103746, + "grad_norm": 0.336174339056015, + "learning_rate": 9.183498563617809e-05, + "loss": 1.8341, + "step": 6809 + }, + { + "epoch": 2.0902394106813995, + "grad_norm": 0.339040070772171, + "learning_rate": 9.183226324305258e-05, + "loss": 1.9228, + "step": 6810 + }, + { + "epoch": 2.090546347452425, + "grad_norm": 0.395000159740448, + "learning_rate": 9.182954043651613e-05, + "loss": 1.9773, + "step": 6811 + }, + { + "epoch": 2.09085328422345, + "grad_norm": 0.3884550929069519, + "learning_rate": 9.182681721659563e-05, + "loss": 1.9665, + "step": 6812 + }, + { + "epoch": 2.091160220994475, + "grad_norm": 0.38752105832099915, + "learning_rate": 9.182409358331801e-05, + "loss": 1.9337, + "step": 6813 + }, + { + "epoch": 2.0914671577655004, + "grad_norm": 0.3557493984699249, + "learning_rate": 9.182136953671017e-05, + "loss": 1.8506, + "step": 6814 + }, + { + "epoch": 2.0917740945365253, + "grad_norm": 0.36052554845809937, + "learning_rate": 9.181864507679906e-05, + "loss": 1.8336, + "step": 6815 + }, + { + "epoch": 2.0920810313075506, + "grad_norm": 0.3311133086681366, + "learning_rate": 9.181592020361158e-05, + "loss": 1.9121, + "step": 6816 + }, + { + "epoch": 2.092387968078576, + "grad_norm": 0.33922117948532104, + "learning_rate": 9.181319491717468e-05, + "loss": 1.8366, + "step": 6817 + }, + { + "epoch": 2.092694904849601, + "grad_norm": 0.30820000171661377, + "learning_rate": 9.181046921751527e-05, + "loss": 1.8931, + "step": 6818 + }, + { + "epoch": 2.093001841620626, + "grad_norm": 0.327374666929245, + "learning_rate": 9.180774310466031e-05, + "loss": 1.8818, + "step": 6819 + }, + { + "epoch": 2.0933087783916515, + "grad_norm": 0.3244091868400574, + "learning_rate": 9.180501657863672e-05, + "loss": 1.8542, + "step": 6820 + }, + { + "epoch": 2.0936157151626764, + "grad_norm": 0.32823657989501953, 
+ "learning_rate": 9.180228963947144e-05, + "loss": 1.8745, + "step": 6821 + }, + { + "epoch": 2.0939226519337018, + "grad_norm": 0.32869017124176025, + "learning_rate": 9.179956228719144e-05, + "loss": 1.8497, + "step": 6822 + }, + { + "epoch": 2.0942295887047266, + "grad_norm": 0.3624805808067322, + "learning_rate": 9.179683452182369e-05, + "loss": 1.9499, + "step": 6823 + }, + { + "epoch": 2.094536525475752, + "grad_norm": 0.35709038376808167, + "learning_rate": 9.179410634339509e-05, + "loss": 1.8709, + "step": 6824 + }, + { + "epoch": 2.0948434622467773, + "grad_norm": 0.3875027298927307, + "learning_rate": 9.179137775193266e-05, + "loss": 1.883, + "step": 6825 + }, + { + "epoch": 2.095150399017802, + "grad_norm": 0.4203769862651825, + "learning_rate": 9.178864874746333e-05, + "loss": 1.814, + "step": 6826 + }, + { + "epoch": 2.0954573357888275, + "grad_norm": 0.46331214904785156, + "learning_rate": 9.178591933001407e-05, + "loss": 1.9821, + "step": 6827 + }, + { + "epoch": 2.095764272559853, + "grad_norm": 0.4264145791530609, + "learning_rate": 9.178318949961188e-05, + "loss": 1.9249, + "step": 6828 + }, + { + "epoch": 2.0960712093308778, + "grad_norm": 0.3697608709335327, + "learning_rate": 9.178045925628371e-05, + "loss": 2.0052, + "step": 6829 + }, + { + "epoch": 2.096378146101903, + "grad_norm": 0.39582517743110657, + "learning_rate": 9.177772860005656e-05, + "loss": 1.9086, + "step": 6830 + }, + { + "epoch": 2.096685082872928, + "grad_norm": 0.3287788927555084, + "learning_rate": 9.17749975309574e-05, + "loss": 1.8766, + "step": 6831 + }, + { + "epoch": 2.0969920196439533, + "grad_norm": 0.33648282289505005, + "learning_rate": 9.177226604901324e-05, + "loss": 1.933, + "step": 6832 + }, + { + "epoch": 2.0972989564149787, + "grad_norm": 0.34225910902023315, + "learning_rate": 9.176953415425106e-05, + "loss": 1.8801, + "step": 6833 + }, + { + "epoch": 2.0976058931860035, + "grad_norm": 0.35536935925483704, + "learning_rate": 9.176680184669786e-05, + "loss": 
1.9472, + "step": 6834 + }, + { + "epoch": 2.097912829957029, + "grad_norm": 0.39152607321739197, + "learning_rate": 9.176406912638064e-05, + "loss": 1.9502, + "step": 6835 + }, + { + "epoch": 2.098219766728054, + "grad_norm": 0.3812694549560547, + "learning_rate": 9.176133599332643e-05, + "loss": 1.8746, + "step": 6836 + }, + { + "epoch": 2.098526703499079, + "grad_norm": 0.36225396394729614, + "learning_rate": 9.17586024475622e-05, + "loss": 1.8489, + "step": 6837 + }, + { + "epoch": 2.0988336402701044, + "grad_norm": 0.3953205943107605, + "learning_rate": 9.1755868489115e-05, + "loss": 1.8671, + "step": 6838 + }, + { + "epoch": 2.0991405770411293, + "grad_norm": 0.33443906903266907, + "learning_rate": 9.175313411801181e-05, + "loss": 1.8574, + "step": 6839 + }, + { + "epoch": 2.0994475138121547, + "grad_norm": 0.3358154892921448, + "learning_rate": 9.17503993342797e-05, + "loss": 1.8329, + "step": 6840 + }, + { + "epoch": 2.09975445058318, + "grad_norm": 0.45934513211250305, + "learning_rate": 9.174766413794566e-05, + "loss": 1.862, + "step": 6841 + }, + { + "epoch": 2.100061387354205, + "grad_norm": 0.46342480182647705, + "learning_rate": 9.174492852903673e-05, + "loss": 1.8747, + "step": 6842 + }, + { + "epoch": 2.1003683241252302, + "grad_norm": 0.4199588894844055, + "learning_rate": 9.174219250757996e-05, + "loss": 1.9308, + "step": 6843 + }, + { + "epoch": 2.1006752608962556, + "grad_norm": 0.3508588373661041, + "learning_rate": 9.173945607360238e-05, + "loss": 1.8622, + "step": 6844 + }, + { + "epoch": 2.1009821976672804, + "grad_norm": 0.3656609356403351, + "learning_rate": 9.173671922713104e-05, + "loss": 1.899, + "step": 6845 + }, + { + "epoch": 2.101289134438306, + "grad_norm": 0.43374791741371155, + "learning_rate": 9.173398196819295e-05, + "loss": 1.8725, + "step": 6846 + }, + { + "epoch": 2.1015960712093307, + "grad_norm": 0.49730411171913147, + "learning_rate": 9.17312442968152e-05, + "loss": 1.9224, + "step": 6847 + }, + { + "epoch": 
2.101903007980356, + "grad_norm": 0.45392677187919617, + "learning_rate": 9.172850621302484e-05, + "loss": 1.8374, + "step": 6848 + }, + { + "epoch": 2.1022099447513813, + "grad_norm": 0.3507382273674011, + "learning_rate": 9.172576771684892e-05, + "loss": 1.8875, + "step": 6849 + }, + { + "epoch": 2.1025168815224062, + "grad_norm": 0.4124681055545807, + "learning_rate": 9.172302880831451e-05, + "loss": 1.8828, + "step": 6850 + }, + { + "epoch": 2.1028238182934316, + "grad_norm": 0.5120462775230408, + "learning_rate": 9.172028948744867e-05, + "loss": 1.8218, + "step": 6851 + }, + { + "epoch": 2.103130755064457, + "grad_norm": 0.5858038067817688, + "learning_rate": 9.171754975427848e-05, + "loss": 1.8679, + "step": 6852 + }, + { + "epoch": 2.103437691835482, + "grad_norm": 0.5196588039398193, + "learning_rate": 9.171480960883101e-05, + "loss": 1.8885, + "step": 6853 + }, + { + "epoch": 2.103744628606507, + "grad_norm": 0.38581255078315735, + "learning_rate": 9.171206905113335e-05, + "loss": 1.9127, + "step": 6854 + }, + { + "epoch": 2.104051565377532, + "grad_norm": 0.31531259417533875, + "learning_rate": 9.170932808121256e-05, + "loss": 1.84, + "step": 6855 + }, + { + "epoch": 2.1043585021485574, + "grad_norm": 0.4595080018043518, + "learning_rate": 9.170658669909575e-05, + "loss": 1.908, + "step": 6856 + }, + { + "epoch": 2.1046654389195827, + "grad_norm": 0.42485639452934265, + "learning_rate": 9.170384490481001e-05, + "loss": 1.8943, + "step": 6857 + }, + { + "epoch": 2.1049723756906076, + "grad_norm": 0.3465791344642639, + "learning_rate": 9.170110269838243e-05, + "loss": 1.8362, + "step": 6858 + }, + { + "epoch": 2.105279312461633, + "grad_norm": 0.26863181591033936, + "learning_rate": 9.16983600798401e-05, + "loss": 1.856, + "step": 6859 + }, + { + "epoch": 2.1055862492326582, + "grad_norm": 0.33826425671577454, + "learning_rate": 9.169561704921014e-05, + "loss": 1.8148, + "step": 6860 + }, + { + "epoch": 2.105893186003683, + "grad_norm": 0.3657929301261902, 
+ "learning_rate": 9.169287360651967e-05, + "loss": 1.8978, + "step": 6861 + }, + { + "epoch": 2.1062001227747085, + "grad_norm": 0.2963617444038391, + "learning_rate": 9.169012975179579e-05, + "loss": 1.8432, + "step": 6862 + }, + { + "epoch": 2.1065070595457334, + "grad_norm": 0.32966092228889465, + "learning_rate": 9.168738548506559e-05, + "loss": 1.9137, + "step": 6863 + }, + { + "epoch": 2.1068139963167587, + "grad_norm": 0.4043191075325012, + "learning_rate": 9.168464080635622e-05, + "loss": 1.9294, + "step": 6864 + }, + { + "epoch": 2.107120933087784, + "grad_norm": 0.41461876034736633, + "learning_rate": 9.168189571569479e-05, + "loss": 1.8582, + "step": 6865 + }, + { + "epoch": 2.107427869858809, + "grad_norm": 0.34119492769241333, + "learning_rate": 9.167915021310845e-05, + "loss": 1.8245, + "step": 6866 + }, + { + "epoch": 2.1077348066298343, + "grad_norm": 0.3259434401988983, + "learning_rate": 9.167640429862429e-05, + "loss": 1.8962, + "step": 6867 + }, + { + "epoch": 2.1080417434008596, + "grad_norm": 0.3074548840522766, + "learning_rate": 9.167365797226951e-05, + "loss": 1.8617, + "step": 6868 + }, + { + "epoch": 2.1083486801718845, + "grad_norm": 0.40738388895988464, + "learning_rate": 9.167091123407121e-05, + "loss": 1.9701, + "step": 6869 + }, + { + "epoch": 2.10865561694291, + "grad_norm": 0.3931449055671692, + "learning_rate": 9.166816408405653e-05, + "loss": 1.8874, + "step": 6870 + }, + { + "epoch": 2.108962553713935, + "grad_norm": 0.3726460635662079, + "learning_rate": 9.166541652225264e-05, + "loss": 1.9307, + "step": 6871 + }, + { + "epoch": 2.10926949048496, + "grad_norm": 0.36566078662872314, + "learning_rate": 9.166266854868667e-05, + "loss": 1.8782, + "step": 6872 + }, + { + "epoch": 2.1095764272559854, + "grad_norm": 0.33448025584220886, + "learning_rate": 9.16599201633858e-05, + "loss": 1.8007, + "step": 6873 + }, + { + "epoch": 2.1098833640270103, + "grad_norm": 0.4261031150817871, + "learning_rate": 9.165717136637716e-05, + "loss": 
1.9092, + "step": 6874 + }, + { + "epoch": 2.1101903007980356, + "grad_norm": 0.37860241532325745, + "learning_rate": 9.165442215768798e-05, + "loss": 1.8538, + "step": 6875 + }, + { + "epoch": 2.110497237569061, + "grad_norm": 0.35417279601097107, + "learning_rate": 9.165167253734535e-05, + "loss": 1.8859, + "step": 6876 + }, + { + "epoch": 2.110804174340086, + "grad_norm": 0.33357858657836914, + "learning_rate": 9.16489225053765e-05, + "loss": 1.8615, + "step": 6877 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.40441447496414185, + "learning_rate": 9.164617206180856e-05, + "loss": 1.8711, + "step": 6878 + }, + { + "epoch": 2.1114180478821365, + "grad_norm": 0.401530921459198, + "learning_rate": 9.164342120666876e-05, + "loss": 1.8378, + "step": 6879 + }, + { + "epoch": 2.1117249846531614, + "grad_norm": 0.36379504203796387, + "learning_rate": 9.164066993998426e-05, + "loss": 1.87, + "step": 6880 + }, + { + "epoch": 2.1120319214241867, + "grad_norm": 0.36242642998695374, + "learning_rate": 9.163791826178225e-05, + "loss": 1.9041, + "step": 6881 + }, + { + "epoch": 2.1123388581952116, + "grad_norm": 0.34601980447769165, + "learning_rate": 9.163516617208994e-05, + "loss": 1.9248, + "step": 6882 + }, + { + "epoch": 2.112645794966237, + "grad_norm": 0.4664660096168518, + "learning_rate": 9.163241367093451e-05, + "loss": 1.901, + "step": 6883 + }, + { + "epoch": 2.1129527317372623, + "grad_norm": 0.5991809964179993, + "learning_rate": 9.162966075834315e-05, + "loss": 1.9061, + "step": 6884 + }, + { + "epoch": 2.113259668508287, + "grad_norm": 0.5235050320625305, + "learning_rate": 9.16269074343431e-05, + "loss": 1.8958, + "step": 6885 + }, + { + "epoch": 2.1135666052793125, + "grad_norm": 0.39008161425590515, + "learning_rate": 9.162415369896153e-05, + "loss": 1.7935, + "step": 6886 + }, + { + "epoch": 2.113873542050338, + "grad_norm": 0.4212269186973572, + "learning_rate": 9.16213995522257e-05, + "loss": 1.9876, + "step": 6887 + }, + { + "epoch": 
2.1141804788213627, + "grad_norm": 0.44495880603790283, + "learning_rate": 9.161864499416279e-05, + "loss": 1.9011, + "step": 6888 + }, + { + "epoch": 2.114487415592388, + "grad_norm": 0.40533384680747986, + "learning_rate": 9.161589002480006e-05, + "loss": 1.8734, + "step": 6889 + }, + { + "epoch": 2.114794352363413, + "grad_norm": 0.45783132314682007, + "learning_rate": 9.161313464416469e-05, + "loss": 1.9769, + "step": 6890 + }, + { + "epoch": 2.1151012891344383, + "grad_norm": 0.37975600361824036, + "learning_rate": 9.161037885228393e-05, + "loss": 1.8988, + "step": 6891 + }, + { + "epoch": 2.1154082259054636, + "grad_norm": 0.394987553358078, + "learning_rate": 9.160762264918504e-05, + "loss": 1.8076, + "step": 6892 + }, + { + "epoch": 2.1157151626764885, + "grad_norm": 0.4180262088775635, + "learning_rate": 9.160486603489522e-05, + "loss": 1.9497, + "step": 6893 + }, + { + "epoch": 2.116022099447514, + "grad_norm": 0.3917383849620819, + "learning_rate": 9.160210900944173e-05, + "loss": 1.9093, + "step": 6894 + }, + { + "epoch": 2.116329036218539, + "grad_norm": 0.3631739616394043, + "learning_rate": 9.15993515728518e-05, + "loss": 1.8724, + "step": 6895 + }, + { + "epoch": 2.116635972989564, + "grad_norm": 0.3304460942745209, + "learning_rate": 9.159659372515272e-05, + "loss": 1.8291, + "step": 6896 + }, + { + "epoch": 2.1169429097605894, + "grad_norm": 0.38202792406082153, + "learning_rate": 9.159383546637172e-05, + "loss": 1.8919, + "step": 6897 + }, + { + "epoch": 2.1172498465316143, + "grad_norm": 0.39544618129730225, + "learning_rate": 9.159107679653605e-05, + "loss": 1.8748, + "step": 6898 + }, + { + "epoch": 2.1175567833026396, + "grad_norm": 0.44175153970718384, + "learning_rate": 9.158831771567298e-05, + "loss": 1.9063, + "step": 6899 + }, + { + "epoch": 2.117863720073665, + "grad_norm": 0.3696559965610504, + "learning_rate": 9.158555822380979e-05, + "loss": 1.8356, + "step": 6900 + }, + { + "epoch": 2.11817065684469, + "grad_norm": 
0.2917703688144684, + "learning_rate": 9.158279832097372e-05, + "loss": 1.8996, + "step": 6901 + }, + { + "epoch": 2.118477593615715, + "grad_norm": 0.3991266191005707, + "learning_rate": 9.158003800719208e-05, + "loss": 1.8872, + "step": 6902 + }, + { + "epoch": 2.1187845303867405, + "grad_norm": 0.41425880789756775, + "learning_rate": 9.157727728249213e-05, + "loss": 1.845, + "step": 6903 + }, + { + "epoch": 2.1190914671577654, + "grad_norm": 0.33590519428253174, + "learning_rate": 9.157451614690115e-05, + "loss": 1.8779, + "step": 6904 + }, + { + "epoch": 2.1193984039287908, + "grad_norm": 0.34963786602020264, + "learning_rate": 9.157175460044644e-05, + "loss": 1.8846, + "step": 6905 + }, + { + "epoch": 2.1197053406998156, + "grad_norm": 0.3274745047092438, + "learning_rate": 9.156899264315528e-05, + "loss": 1.8859, + "step": 6906 + }, + { + "epoch": 2.120012277470841, + "grad_norm": 0.35821303725242615, + "learning_rate": 9.156623027505498e-05, + "loss": 1.8314, + "step": 6907 + }, + { + "epoch": 2.1203192142418663, + "grad_norm": 0.41185733675956726, + "learning_rate": 9.156346749617283e-05, + "loss": 1.9162, + "step": 6908 + }, + { + "epoch": 2.120626151012891, + "grad_norm": 0.4120326042175293, + "learning_rate": 9.156070430653613e-05, + "loss": 1.8593, + "step": 6909 + }, + { + "epoch": 2.1209330877839165, + "grad_norm": 0.39017269015312195, + "learning_rate": 9.155794070617218e-05, + "loss": 1.9333, + "step": 6910 + }, + { + "epoch": 2.121240024554942, + "grad_norm": 0.3104727864265442, + "learning_rate": 9.155517669510832e-05, + "loss": 1.8274, + "step": 6911 + }, + { + "epoch": 2.1215469613259668, + "grad_norm": 0.38360875844955444, + "learning_rate": 9.155241227337183e-05, + "loss": 1.9013, + "step": 6912 + }, + { + "epoch": 2.121853898096992, + "grad_norm": 0.3752502501010895, + "learning_rate": 9.154964744099006e-05, + "loss": 1.9079, + "step": 6913 + }, + { + "epoch": 2.122160834868017, + "grad_norm": 0.32074928283691406, + "learning_rate": 
9.154688219799033e-05, + "loss": 1.8232, + "step": 6914 + }, + { + "epoch": 2.1224677716390423, + "grad_norm": 0.39559221267700195, + "learning_rate": 9.154411654439993e-05, + "loss": 1.9273, + "step": 6915 + }, + { + "epoch": 2.1227747084100677, + "grad_norm": 0.4010276198387146, + "learning_rate": 9.154135048024623e-05, + "loss": 1.8368, + "step": 6916 + }, + { + "epoch": 2.1230816451810925, + "grad_norm": 0.5745936036109924, + "learning_rate": 9.153858400555658e-05, + "loss": 2.0344, + "step": 6917 + }, + { + "epoch": 2.123388581952118, + "grad_norm": 0.45708227157592773, + "learning_rate": 9.153581712035827e-05, + "loss": 1.9309, + "step": 6918 + }, + { + "epoch": 2.123695518723143, + "grad_norm": 0.43845629692077637, + "learning_rate": 9.153304982467868e-05, + "loss": 1.9213, + "step": 6919 + }, + { + "epoch": 2.124002455494168, + "grad_norm": 0.34456655383110046, + "learning_rate": 9.153028211854516e-05, + "loss": 1.9, + "step": 6920 + }, + { + "epoch": 2.1243093922651934, + "grad_norm": 0.3903563618659973, + "learning_rate": 9.152751400198502e-05, + "loss": 1.8619, + "step": 6921 + }, + { + "epoch": 2.1246163290362183, + "grad_norm": 0.3465174436569214, + "learning_rate": 9.152474547502566e-05, + "loss": 1.8253, + "step": 6922 + }, + { + "epoch": 2.1249232658072437, + "grad_norm": 0.38335317373275757, + "learning_rate": 9.152197653769444e-05, + "loss": 1.8824, + "step": 6923 + }, + { + "epoch": 2.125230202578269, + "grad_norm": 0.3583361506462097, + "learning_rate": 9.15192071900187e-05, + "loss": 1.8749, + "step": 6924 + }, + { + "epoch": 2.125537139349294, + "grad_norm": 0.38249272108078003, + "learning_rate": 9.151643743202582e-05, + "loss": 1.9289, + "step": 6925 + }, + { + "epoch": 2.1258440761203192, + "grad_norm": 0.3972204327583313, + "learning_rate": 9.151366726374318e-05, + "loss": 1.8259, + "step": 6926 + }, + { + "epoch": 2.1261510128913446, + "grad_norm": 0.42475268244743347, + "learning_rate": 9.151089668519814e-05, + "loss": 1.9026, + "step": 
6927 + }, + { + "epoch": 2.1264579496623695, + "grad_norm": 0.39575010538101196, + "learning_rate": 9.15081256964181e-05, + "loss": 1.8835, + "step": 6928 + }, + { + "epoch": 2.126764886433395, + "grad_norm": 0.33592918515205383, + "learning_rate": 9.150535429743041e-05, + "loss": 1.9439, + "step": 6929 + }, + { + "epoch": 2.12707182320442, + "grad_norm": 0.41760140657424927, + "learning_rate": 9.150258248826249e-05, + "loss": 1.9326, + "step": 6930 + }, + { + "epoch": 2.127378759975445, + "grad_norm": 0.4759281575679779, + "learning_rate": 9.149981026894173e-05, + "loss": 1.8443, + "step": 6931 + }, + { + "epoch": 2.1276856967464703, + "grad_norm": 0.4669014513492584, + "learning_rate": 9.149703763949552e-05, + "loss": 1.9254, + "step": 6932 + }, + { + "epoch": 2.1279926335174952, + "grad_norm": 0.3498002588748932, + "learning_rate": 9.149426459995126e-05, + "loss": 1.8814, + "step": 6933 + }, + { + "epoch": 2.1282995702885206, + "grad_norm": 0.332998663187027, + "learning_rate": 9.149149115033637e-05, + "loss": 1.8223, + "step": 6934 + }, + { + "epoch": 2.128606507059546, + "grad_norm": 0.36990395188331604, + "learning_rate": 9.148871729067823e-05, + "loss": 1.917, + "step": 6935 + }, + { + "epoch": 2.128913443830571, + "grad_norm": 0.4807330369949341, + "learning_rate": 9.148594302100426e-05, + "loss": 1.9138, + "step": 6936 + }, + { + "epoch": 2.129220380601596, + "grad_norm": 0.4821743369102478, + "learning_rate": 9.14831683413419e-05, + "loss": 1.9201, + "step": 6937 + }, + { + "epoch": 2.129527317372621, + "grad_norm": 0.45373013615608215, + "learning_rate": 9.148039325171855e-05, + "loss": 1.88, + "step": 6938 + }, + { + "epoch": 2.1298342541436464, + "grad_norm": 0.3712935745716095, + "learning_rate": 9.147761775216166e-05, + "loss": 1.8424, + "step": 6939 + }, + { + "epoch": 2.1301411909146717, + "grad_norm": 0.32493939995765686, + "learning_rate": 9.147484184269862e-05, + "loss": 1.8691, + "step": 6940 + }, + { + "epoch": 2.1304481276856966, + 
"grad_norm": 0.41952449083328247, + "learning_rate": 9.14720655233569e-05, + "loss": 1.8468, + "step": 6941 + }, + { + "epoch": 2.130755064456722, + "grad_norm": 0.4730648398399353, + "learning_rate": 9.14692887941639e-05, + "loss": 2.0333, + "step": 6942 + }, + { + "epoch": 2.1310620012277472, + "grad_norm": 0.3745786249637604, + "learning_rate": 9.14665116551471e-05, + "loss": 1.8835, + "step": 6943 + }, + { + "epoch": 2.131368937998772, + "grad_norm": 0.3747421205043793, + "learning_rate": 9.146373410633392e-05, + "loss": 1.8958, + "step": 6944 + }, + { + "epoch": 2.1316758747697975, + "grad_norm": 0.4383934438228607, + "learning_rate": 9.146095614775182e-05, + "loss": 1.8527, + "step": 6945 + }, + { + "epoch": 2.131982811540823, + "grad_norm": 0.4657299220561981, + "learning_rate": 9.145817777942824e-05, + "loss": 1.9073, + "step": 6946 + }, + { + "epoch": 2.1322897483118477, + "grad_norm": 0.4741605818271637, + "learning_rate": 9.145539900139067e-05, + "loss": 1.8736, + "step": 6947 + }, + { + "epoch": 2.132596685082873, + "grad_norm": 0.4058460295200348, + "learning_rate": 9.145261981366653e-05, + "loss": 1.9365, + "step": 6948 + }, + { + "epoch": 2.132903621853898, + "grad_norm": 0.3430838882923126, + "learning_rate": 9.14498402162833e-05, + "loss": 1.8992, + "step": 6949 + }, + { + "epoch": 2.1332105586249233, + "grad_norm": 0.43009114265441895, + "learning_rate": 9.144706020926847e-05, + "loss": 1.925, + "step": 6950 + }, + { + "epoch": 2.1335174953959486, + "grad_norm": 0.47696158289909363, + "learning_rate": 9.144427979264949e-05, + "loss": 1.858, + "step": 6951 + }, + { + "epoch": 2.1338244321669735, + "grad_norm": 0.4477602243423462, + "learning_rate": 9.144149896645386e-05, + "loss": 1.9042, + "step": 6952 + }, + { + "epoch": 2.134131368937999, + "grad_norm": 0.3736960291862488, + "learning_rate": 9.143871773070903e-05, + "loss": 1.782, + "step": 6953 + }, + { + "epoch": 2.1344383057090237, + "grad_norm": 0.3065558075904846, + "learning_rate": 
9.143593608544251e-05, + "loss": 1.8711, + "step": 6954 + }, + { + "epoch": 2.134745242480049, + "grad_norm": 0.41738569736480713, + "learning_rate": 9.143315403068178e-05, + "loss": 1.8651, + "step": 6955 + }, + { + "epoch": 2.1350521792510744, + "grad_norm": 0.4652978479862213, + "learning_rate": 9.143037156645435e-05, + "loss": 1.8225, + "step": 6956 + }, + { + "epoch": 2.1353591160220993, + "grad_norm": 0.3625001311302185, + "learning_rate": 9.142758869278769e-05, + "loss": 1.9045, + "step": 6957 + }, + { + "epoch": 2.1356660527931246, + "grad_norm": 0.34516090154647827, + "learning_rate": 9.142480540970933e-05, + "loss": 1.8527, + "step": 6958 + }, + { + "epoch": 2.13597298956415, + "grad_norm": 0.36983323097229004, + "learning_rate": 9.142202171724674e-05, + "loss": 1.7911, + "step": 6959 + }, + { + "epoch": 2.136279926335175, + "grad_norm": 0.46084535121917725, + "learning_rate": 9.141923761542748e-05, + "loss": 1.9489, + "step": 6960 + }, + { + "epoch": 2.1365868631062, + "grad_norm": 0.49472227692604065, + "learning_rate": 9.141645310427903e-05, + "loss": 1.9904, + "step": 6961 + }, + { + "epoch": 2.1368937998772255, + "grad_norm": 0.39878135919570923, + "learning_rate": 9.14136681838289e-05, + "loss": 1.8969, + "step": 6962 + }, + { + "epoch": 2.1372007366482504, + "grad_norm": 0.3451174795627594, + "learning_rate": 9.141088285410464e-05, + "loss": 1.9186, + "step": 6963 + }, + { + "epoch": 2.1375076734192757, + "grad_norm": 0.4497967064380646, + "learning_rate": 9.140809711513377e-05, + "loss": 1.8636, + "step": 6964 + }, + { + "epoch": 2.1378146101903006, + "grad_norm": 0.4643685221672058, + "learning_rate": 9.14053109669438e-05, + "loss": 1.8427, + "step": 6965 + }, + { + "epoch": 2.138121546961326, + "grad_norm": 0.3748690187931061, + "learning_rate": 9.140252440956229e-05, + "loss": 1.8529, + "step": 6966 + }, + { + "epoch": 2.1384284837323513, + "grad_norm": 0.3211230933666229, + "learning_rate": 9.139973744301675e-05, + "loss": 1.8849, + "step": 
6967 + }, + { + "epoch": 2.138735420503376, + "grad_norm": 0.41169998049736023, + "learning_rate": 9.139695006733476e-05, + "loss": 1.8535, + "step": 6968 + }, + { + "epoch": 2.1390423572744015, + "grad_norm": 0.48356300592422485, + "learning_rate": 9.139416228254382e-05, + "loss": 1.8182, + "step": 6969 + }, + { + "epoch": 2.139349294045427, + "grad_norm": 0.4596598148345947, + "learning_rate": 9.139137408867153e-05, + "loss": 1.8522, + "step": 6970 + }, + { + "epoch": 2.1396562308164517, + "grad_norm": 0.37168747186660767, + "learning_rate": 9.138858548574543e-05, + "loss": 1.896, + "step": 6971 + }, + { + "epoch": 2.139963167587477, + "grad_norm": 0.34447649121284485, + "learning_rate": 9.138579647379305e-05, + "loss": 1.8473, + "step": 6972 + }, + { + "epoch": 2.140270104358502, + "grad_norm": 0.466169536113739, + "learning_rate": 9.138300705284197e-05, + "loss": 1.9131, + "step": 6973 + }, + { + "epoch": 2.1405770411295273, + "grad_norm": 0.4297258257865906, + "learning_rate": 9.138021722291977e-05, + "loss": 1.9013, + "step": 6974 + }, + { + "epoch": 2.1408839779005526, + "grad_norm": 0.29336342215538025, + "learning_rate": 9.1377426984054e-05, + "loss": 1.8242, + "step": 6975 + }, + { + "epoch": 2.1411909146715775, + "grad_norm": 0.4282750189304352, + "learning_rate": 9.137463633627226e-05, + "loss": 1.9159, + "step": 6976 + }, + { + "epoch": 2.141497851442603, + "grad_norm": 0.6071211099624634, + "learning_rate": 9.13718452796021e-05, + "loss": 1.9105, + "step": 6977 + }, + { + "epoch": 2.141804788213628, + "grad_norm": 0.5837090015411377, + "learning_rate": 9.136905381407113e-05, + "loss": 1.8735, + "step": 6978 + }, + { + "epoch": 2.142111724984653, + "grad_norm": 0.36910486221313477, + "learning_rate": 9.13662619397069e-05, + "loss": 1.9013, + "step": 6979 + }, + { + "epoch": 2.1424186617556784, + "grad_norm": 0.37497541308403015, + "learning_rate": 9.136346965653704e-05, + "loss": 1.8444, + "step": 6980 + }, + { + "epoch": 2.1427255985267033, + 
"grad_norm": 0.508252739906311, + "learning_rate": 9.136067696458911e-05, + "loss": 1.8756, + "step": 6981 + }, + { + "epoch": 2.1430325352977286, + "grad_norm": 0.4045214056968689, + "learning_rate": 9.135788386389077e-05, + "loss": 1.8843, + "step": 6982 + }, + { + "epoch": 2.143339472068754, + "grad_norm": 0.36260777711868286, + "learning_rate": 9.135509035446955e-05, + "loss": 1.9264, + "step": 6983 + }, + { + "epoch": 2.143646408839779, + "grad_norm": 0.4112427234649658, + "learning_rate": 9.135229643635309e-05, + "loss": 1.8843, + "step": 6984 + }, + { + "epoch": 2.143953345610804, + "grad_norm": 0.43893104791641235, + "learning_rate": 9.1349502109569e-05, + "loss": 1.9486, + "step": 6985 + }, + { + "epoch": 2.1442602823818295, + "grad_norm": 0.3942745625972748, + "learning_rate": 9.13467073741449e-05, + "loss": 1.8607, + "step": 6986 + }, + { + "epoch": 2.1445672191528544, + "grad_norm": 0.3920004963874817, + "learning_rate": 9.13439122301084e-05, + "loss": 1.8102, + "step": 6987 + }, + { + "epoch": 2.1448741559238798, + "grad_norm": 0.3774373531341553, + "learning_rate": 9.134111667748712e-05, + "loss": 1.8326, + "step": 6988 + }, + { + "epoch": 2.1451810926949046, + "grad_norm": 0.355228453874588, + "learning_rate": 9.13383207163087e-05, + "loss": 1.895, + "step": 6989 + }, + { + "epoch": 2.14548802946593, + "grad_norm": 0.40284648537635803, + "learning_rate": 9.133552434660077e-05, + "loss": 1.928, + "step": 6990 + }, + { + "epoch": 2.1457949662369553, + "grad_norm": 0.3974910378456116, + "learning_rate": 9.133272756839096e-05, + "loss": 1.8567, + "step": 6991 + }, + { + "epoch": 2.14610190300798, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.13299303817069e-05, + "loss": 1.9125, + "step": 6992 + }, + { + "epoch": 2.1464088397790055, + "grad_norm": 0.36132267117500305, + "learning_rate": 9.132713278657625e-05, + "loss": 1.8395, + "step": 6993 + }, + { + "epoch": 2.146715776550031, + "grad_norm": 0.4648832082748413, + "learning_rate": 
9.132433478302667e-05, + "loss": 1.8877, + "step": 6994 + }, + { + "epoch": 2.1470227133210558, + "grad_norm": 0.5171563625335693, + "learning_rate": 9.132153637108577e-05, + "loss": 1.857, + "step": 6995 + }, + { + "epoch": 2.147329650092081, + "grad_norm": 0.4256175756454468, + "learning_rate": 9.131873755078124e-05, + "loss": 1.8434, + "step": 6996 + }, + { + "epoch": 2.147636586863106, + "grad_norm": 0.3421500623226166, + "learning_rate": 9.131593832214072e-05, + "loss": 1.8747, + "step": 6997 + }, + { + "epoch": 2.1479435236341313, + "grad_norm": 0.3880314230918884, + "learning_rate": 9.131313868519188e-05, + "loss": 1.8592, + "step": 6998 + }, + { + "epoch": 2.1482504604051567, + "grad_norm": 0.41070252656936646, + "learning_rate": 9.131033863996239e-05, + "loss": 1.8746, + "step": 6999 + }, + { + "epoch": 2.1485573971761815, + "grad_norm": 0.3837376534938812, + "learning_rate": 9.130753818647992e-05, + "loss": 1.8722, + "step": 7000 + }, + { + "epoch": 2.148864333947207, + "grad_norm": 0.311184823513031, + "learning_rate": 9.130473732477217e-05, + "loss": 1.8964, + "step": 7001 + }, + { + "epoch": 2.149171270718232, + "grad_norm": 0.3548091948032379, + "learning_rate": 9.130193605486677e-05, + "loss": 1.9235, + "step": 7002 + }, + { + "epoch": 2.149478207489257, + "grad_norm": 0.3509860932826996, + "learning_rate": 9.129913437679143e-05, + "loss": 1.8088, + "step": 7003 + }, + { + "epoch": 2.1497851442602824, + "grad_norm": 0.3301749527454376, + "learning_rate": 9.129633229057384e-05, + "loss": 1.8926, + "step": 7004 + }, + { + "epoch": 2.150092081031308, + "grad_norm": 0.3071286082267761, + "learning_rate": 9.129352979624169e-05, + "loss": 1.8045, + "step": 7005 + }, + { + "epoch": 2.1503990178023327, + "grad_norm": 0.3222786486148834, + "learning_rate": 9.129072689382268e-05, + "loss": 1.877, + "step": 7006 + }, + { + "epoch": 2.150705954573358, + "grad_norm": 0.31817424297332764, + "learning_rate": 9.128792358334451e-05, + "loss": 1.8863, + "step": 7007 + 
}, + { + "epoch": 2.151012891344383, + "grad_norm": 0.29379183053970337, + "learning_rate": 9.128511986483487e-05, + "loss": 1.8339, + "step": 7008 + }, + { + "epoch": 2.1513198281154082, + "grad_norm": 0.3618883788585663, + "learning_rate": 9.128231573832149e-05, + "loss": 1.9521, + "step": 7009 + }, + { + "epoch": 2.1516267648864336, + "grad_norm": 0.3188464045524597, + "learning_rate": 9.127951120383205e-05, + "loss": 1.811, + "step": 7010 + }, + { + "epoch": 2.1519337016574585, + "grad_norm": 0.3257068395614624, + "learning_rate": 9.127670626139431e-05, + "loss": 1.9084, + "step": 7011 + }, + { + "epoch": 2.152240638428484, + "grad_norm": 0.3389057219028473, + "learning_rate": 9.127390091103595e-05, + "loss": 1.9272, + "step": 7012 + }, + { + "epoch": 2.1525475751995087, + "grad_norm": 0.3376730680465698, + "learning_rate": 9.127109515278471e-05, + "loss": 1.8841, + "step": 7013 + }, + { + "epoch": 2.152854511970534, + "grad_norm": 0.3032901883125305, + "learning_rate": 9.126828898666833e-05, + "loss": 1.8057, + "step": 7014 + }, + { + "epoch": 2.1531614487415593, + "grad_norm": 0.32034799456596375, + "learning_rate": 9.126548241271451e-05, + "loss": 1.7988, + "step": 7015 + }, + { + "epoch": 2.1534683855125842, + "grad_norm": 0.31879931688308716, + "learning_rate": 9.126267543095102e-05, + "loss": 1.8932, + "step": 7016 + }, + { + "epoch": 2.1537753222836096, + "grad_norm": 0.3282395005226135, + "learning_rate": 9.125986804140559e-05, + "loss": 1.907, + "step": 7017 + }, + { + "epoch": 2.154082259054635, + "grad_norm": 0.36310696601867676, + "learning_rate": 9.125706024410594e-05, + "loss": 1.9812, + "step": 7018 + }, + { + "epoch": 2.15438919582566, + "grad_norm": 0.39414262771606445, + "learning_rate": 9.125425203907985e-05, + "loss": 1.9112, + "step": 7019 + }, + { + "epoch": 2.154696132596685, + "grad_norm": 0.4457061290740967, + "learning_rate": 9.125144342635508e-05, + "loss": 1.8876, + "step": 7020 + }, + { + "epoch": 2.1550030693677105, + "grad_norm": 
0.4651646316051483, + "learning_rate": 9.124863440595934e-05, + "loss": 1.8283, + "step": 7021 + }, + { + "epoch": 2.1553100061387354, + "grad_norm": 0.4404383897781372, + "learning_rate": 9.124582497792043e-05, + "loss": 1.8646, + "step": 7022 + }, + { + "epoch": 2.1556169429097607, + "grad_norm": 0.3569783866405487, + "learning_rate": 9.124301514226612e-05, + "loss": 1.9603, + "step": 7023 + }, + { + "epoch": 2.1559238796807856, + "grad_norm": 0.3878212571144104, + "learning_rate": 9.124020489902414e-05, + "loss": 1.889, + "step": 7024 + }, + { + "epoch": 2.156230816451811, + "grad_norm": 0.43005698919296265, + "learning_rate": 9.123739424822229e-05, + "loss": 1.9127, + "step": 7025 + }, + { + "epoch": 2.1565377532228363, + "grad_norm": 0.37798774242401123, + "learning_rate": 9.123458318988834e-05, + "loss": 1.8434, + "step": 7026 + }, + { + "epoch": 2.156844689993861, + "grad_norm": 0.38182979822158813, + "learning_rate": 9.123177172405007e-05, + "loss": 1.8905, + "step": 7027 + }, + { + "epoch": 2.1571516267648865, + "grad_norm": 0.4695180058479309, + "learning_rate": 9.122895985073524e-05, + "loss": 1.9035, + "step": 7028 + }, + { + "epoch": 2.1574585635359114, + "grad_norm": 0.37112870812416077, + "learning_rate": 9.12261475699717e-05, + "loss": 1.8497, + "step": 7029 + }, + { + "epoch": 2.1577655003069367, + "grad_norm": 0.36758264899253845, + "learning_rate": 9.122333488178721e-05, + "loss": 1.9015, + "step": 7030 + }, + { + "epoch": 2.158072437077962, + "grad_norm": 0.4691081643104553, + "learning_rate": 9.122052178620953e-05, + "loss": 1.9707, + "step": 7031 + }, + { + "epoch": 2.158379373848987, + "grad_norm": 0.47068753838539124, + "learning_rate": 9.121770828326653e-05, + "loss": 1.9103, + "step": 7032 + }, + { + "epoch": 2.1586863106200123, + "grad_norm": 0.38539063930511475, + "learning_rate": 9.121489437298593e-05, + "loss": 1.7872, + "step": 7033 + }, + { + "epoch": 2.1589932473910376, + "grad_norm": 0.43769749999046326, + "learning_rate": 
9.121208005539563e-05, + "loss": 1.9654, + "step": 7034 + }, + { + "epoch": 2.1593001841620625, + "grad_norm": 0.4770655930042267, + "learning_rate": 9.120926533052338e-05, + "loss": 1.9754, + "step": 7035 + }, + { + "epoch": 2.159607120933088, + "grad_norm": 0.526979386806488, + "learning_rate": 9.120645019839702e-05, + "loss": 1.8833, + "step": 7036 + }, + { + "epoch": 2.159914057704113, + "grad_norm": 0.4734671413898468, + "learning_rate": 9.120363465904438e-05, + "loss": 1.8695, + "step": 7037 + }, + { + "epoch": 2.160220994475138, + "grad_norm": 0.40346798300743103, + "learning_rate": 9.120081871249326e-05, + "loss": 1.9216, + "step": 7038 + }, + { + "epoch": 2.1605279312461634, + "grad_norm": 0.38210105895996094, + "learning_rate": 9.119800235877149e-05, + "loss": 1.9334, + "step": 7039 + }, + { + "epoch": 2.1608348680171883, + "grad_norm": 0.5528677105903625, + "learning_rate": 9.119518559790694e-05, + "loss": 1.8858, + "step": 7040 + }, + { + "epoch": 2.1611418047882136, + "grad_norm": 0.6684148907661438, + "learning_rate": 9.11923684299274e-05, + "loss": 1.9105, + "step": 7041 + }, + { + "epoch": 2.161448741559239, + "grad_norm": 0.4497738778591156, + "learning_rate": 9.118955085486073e-05, + "loss": 1.8789, + "step": 7042 + }, + { + "epoch": 2.161755678330264, + "grad_norm": 0.4440831243991852, + "learning_rate": 9.11867328727348e-05, + "loss": 1.9966, + "step": 7043 + }, + { + "epoch": 2.162062615101289, + "grad_norm": 0.5910835266113281, + "learning_rate": 9.118391448357742e-05, + "loss": 1.8841, + "step": 7044 + }, + { + "epoch": 2.1623695518723145, + "grad_norm": 0.5312752723693848, + "learning_rate": 9.118109568741645e-05, + "loss": 1.8825, + "step": 7045 + }, + { + "epoch": 2.1626764886433394, + "grad_norm": 0.3885713815689087, + "learning_rate": 9.117827648427977e-05, + "loss": 1.8763, + "step": 7046 + }, + { + "epoch": 2.1629834254143647, + "grad_norm": 0.4274894893169403, + "learning_rate": 9.117545687419522e-05, + "loss": 1.8802, + "step": 7047 
+ }, + { + "epoch": 2.1632903621853896, + "grad_norm": 0.3984382748603821, + "learning_rate": 9.117263685719067e-05, + "loss": 1.8319, + "step": 7048 + }, + { + "epoch": 2.163597298956415, + "grad_norm": 0.3687778115272522, + "learning_rate": 9.1169816433294e-05, + "loss": 1.838, + "step": 7049 + }, + { + "epoch": 2.1639042357274403, + "grad_norm": 0.37597915530204773, + "learning_rate": 9.116699560253306e-05, + "loss": 1.8711, + "step": 7050 + }, + { + "epoch": 2.164211172498465, + "grad_norm": 0.41217467188835144, + "learning_rate": 9.116417436493574e-05, + "loss": 1.8552, + "step": 7051 + }, + { + "epoch": 2.1645181092694905, + "grad_norm": 0.3937448263168335, + "learning_rate": 9.116135272052994e-05, + "loss": 1.8548, + "step": 7052 + }, + { + "epoch": 2.164825046040516, + "grad_norm": 0.3545389175415039, + "learning_rate": 9.115853066934351e-05, + "loss": 1.8694, + "step": 7053 + }, + { + "epoch": 2.1651319828115407, + "grad_norm": 0.32625243067741394, + "learning_rate": 9.115570821140436e-05, + "loss": 1.8579, + "step": 7054 + }, + { + "epoch": 2.165438919582566, + "grad_norm": 0.32701975107192993, + "learning_rate": 9.115288534674038e-05, + "loss": 1.8676, + "step": 7055 + }, + { + "epoch": 2.165745856353591, + "grad_norm": 0.39372533559799194, + "learning_rate": 9.115006207537947e-05, + "loss": 1.8895, + "step": 7056 + }, + { + "epoch": 2.1660527931246163, + "grad_norm": 0.3688350021839142, + "learning_rate": 9.114723839734954e-05, + "loss": 1.8742, + "step": 7057 + }, + { + "epoch": 2.1663597298956416, + "grad_norm": 0.35461875796318054, + "learning_rate": 9.114441431267846e-05, + "loss": 1.8723, + "step": 7058 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.3331618010997772, + "learning_rate": 9.114158982139416e-05, + "loss": 1.8514, + "step": 7059 + }, + { + "epoch": 2.166973603437692, + "grad_norm": 0.3313215374946594, + "learning_rate": 9.113876492352458e-05, + "loss": 1.912, + "step": 7060 + }, + { + "epoch": 2.167280540208717, + "grad_norm": 
0.3320949375629425, + "learning_rate": 9.113593961909759e-05, + "loss": 1.8908, + "step": 7061 + }, + { + "epoch": 2.167587476979742, + "grad_norm": 0.3292064070701599, + "learning_rate": 9.113311390814115e-05, + "loss": 1.8702, + "step": 7062 + }, + { + "epoch": 2.1678944137507674, + "grad_norm": 0.33991244435310364, + "learning_rate": 9.113028779068316e-05, + "loss": 1.8503, + "step": 7063 + }, + { + "epoch": 2.1682013505217923, + "grad_norm": 0.3602859377861023, + "learning_rate": 9.112746126675156e-05, + "loss": 1.9185, + "step": 7064 + }, + { + "epoch": 2.1685082872928176, + "grad_norm": 0.3354876637458801, + "learning_rate": 9.112463433637428e-05, + "loss": 1.8857, + "step": 7065 + }, + { + "epoch": 2.168815224063843, + "grad_norm": 0.32364192605018616, + "learning_rate": 9.112180699957926e-05, + "loss": 1.8548, + "step": 7066 + }, + { + "epoch": 2.169122160834868, + "grad_norm": 0.3617163896560669, + "learning_rate": 9.111897925639446e-05, + "loss": 1.9021, + "step": 7067 + }, + { + "epoch": 2.169429097605893, + "grad_norm": 0.3852904438972473, + "learning_rate": 9.111615110684778e-05, + "loss": 1.9331, + "step": 7068 + }, + { + "epoch": 2.1697360343769185, + "grad_norm": 0.332939088344574, + "learning_rate": 9.111332255096721e-05, + "loss": 1.9156, + "step": 7069 + }, + { + "epoch": 2.1700429711479434, + "grad_norm": 0.3386891186237335, + "learning_rate": 9.111049358878067e-05, + "loss": 1.8898, + "step": 7070 + }, + { + "epoch": 2.1703499079189688, + "grad_norm": 0.3559711277484894, + "learning_rate": 9.110766422031617e-05, + "loss": 1.8546, + "step": 7071 + }, + { + "epoch": 2.1706568446899936, + "grad_norm": 0.3440175950527191, + "learning_rate": 9.110483444560162e-05, + "loss": 1.9005, + "step": 7072 + }, + { + "epoch": 2.170963781461019, + "grad_norm": 0.3239493668079376, + "learning_rate": 9.110200426466499e-05, + "loss": 1.9258, + "step": 7073 + }, + { + "epoch": 2.1712707182320443, + "grad_norm": 0.3658723533153534, + "learning_rate": 
9.109917367753428e-05, + "loss": 2.0203, + "step": 7074 + }, + { + "epoch": 2.171577655003069, + "grad_norm": 0.35419905185699463, + "learning_rate": 9.109634268423746e-05, + "loss": 1.8515, + "step": 7075 + }, + { + "epoch": 2.1718845917740945, + "grad_norm": 0.40852081775665283, + "learning_rate": 9.109351128480246e-05, + "loss": 1.8744, + "step": 7076 + }, + { + "epoch": 2.17219152854512, + "grad_norm": 0.3502386212348938, + "learning_rate": 9.109067947925732e-05, + "loss": 1.8785, + "step": 7077 + }, + { + "epoch": 2.1724984653161448, + "grad_norm": 0.42964309453964233, + "learning_rate": 9.108784726763e-05, + "loss": 1.9175, + "step": 7078 + }, + { + "epoch": 2.17280540208717, + "grad_norm": 0.39438319206237793, + "learning_rate": 9.108501464994849e-05, + "loss": 1.9072, + "step": 7079 + }, + { + "epoch": 2.1731123388581954, + "grad_norm": 0.5045785903930664, + "learning_rate": 9.108218162624079e-05, + "loss": 1.9246, + "step": 7080 + }, + { + "epoch": 2.1734192756292203, + "grad_norm": 0.4374946653842926, + "learning_rate": 9.107934819653488e-05, + "loss": 1.8669, + "step": 7081 + }, + { + "epoch": 2.1737262124002457, + "grad_norm": 0.3263556957244873, + "learning_rate": 9.107651436085878e-05, + "loss": 1.8402, + "step": 7082 + }, + { + "epoch": 2.1740331491712706, + "grad_norm": 0.4380986988544464, + "learning_rate": 9.107368011924048e-05, + "loss": 1.8948, + "step": 7083 + }, + { + "epoch": 2.174340085942296, + "grad_norm": 0.4350908696651459, + "learning_rate": 9.1070845471708e-05, + "loss": 1.8717, + "step": 7084 + }, + { + "epoch": 2.174647022713321, + "grad_norm": 0.37809762358665466, + "learning_rate": 9.106801041828936e-05, + "loss": 1.8703, + "step": 7085 + }, + { + "epoch": 2.174953959484346, + "grad_norm": 0.3473457992076874, + "learning_rate": 9.106517495901257e-05, + "loss": 1.8999, + "step": 7086 + }, + { + "epoch": 2.1752608962553714, + "grad_norm": 0.48066645860671997, + "learning_rate": 9.106233909390564e-05, + "loss": 1.8788, + "step": 7087 
+ }, + { + "epoch": 2.1755678330263963, + "grad_norm": 0.5873035788536072, + "learning_rate": 9.105950282299663e-05, + "loss": 1.8879, + "step": 7088 + }, + { + "epoch": 2.1758747697974217, + "grad_norm": 0.47609585523605347, + "learning_rate": 9.105666614631354e-05, + "loss": 1.8813, + "step": 7089 + }, + { + "epoch": 2.176181706568447, + "grad_norm": 0.3845362365245819, + "learning_rate": 9.10538290638844e-05, + "loss": 1.9629, + "step": 7090 + }, + { + "epoch": 2.176488643339472, + "grad_norm": 0.5463572144508362, + "learning_rate": 9.105099157573727e-05, + "loss": 1.9455, + "step": 7091 + }, + { + "epoch": 2.1767955801104972, + "grad_norm": 0.4875337779521942, + "learning_rate": 9.104815368190017e-05, + "loss": 1.9146, + "step": 7092 + }, + { + "epoch": 2.1771025168815226, + "grad_norm": 0.37513965368270874, + "learning_rate": 9.104531538240116e-05, + "loss": 1.8626, + "step": 7093 + }, + { + "epoch": 2.1774094536525475, + "grad_norm": 0.3477539122104645, + "learning_rate": 9.104247667726828e-05, + "loss": 1.878, + "step": 7094 + }, + { + "epoch": 2.177716390423573, + "grad_norm": 0.5122693181037903, + "learning_rate": 9.103963756652961e-05, + "loss": 1.8784, + "step": 7095 + }, + { + "epoch": 2.178023327194598, + "grad_norm": 0.49106159806251526, + "learning_rate": 9.103679805021317e-05, + "loss": 1.8441, + "step": 7096 + }, + { + "epoch": 2.178330263965623, + "grad_norm": 0.3801479637622833, + "learning_rate": 9.103395812834705e-05, + "loss": 1.8986, + "step": 7097 + }, + { + "epoch": 2.1786372007366483, + "grad_norm": 0.3429640233516693, + "learning_rate": 9.10311178009593e-05, + "loss": 1.8806, + "step": 7098 + }, + { + "epoch": 2.1789441375076732, + "grad_norm": 0.36715295910835266, + "learning_rate": 9.102827706807799e-05, + "loss": 1.8215, + "step": 7099 + }, + { + "epoch": 2.1792510742786986, + "grad_norm": 0.37225866317749023, + "learning_rate": 9.10254359297312e-05, + "loss": 1.8851, + "step": 7100 + }, + { + "epoch": 2.179558011049724, + "grad_norm": 
0.3552459180355072, + "learning_rate": 9.102259438594702e-05, + "loss": 1.9345, + "step": 7101 + }, + { + "epoch": 2.179864947820749, + "grad_norm": 0.3876415193080902, + "learning_rate": 9.10197524367535e-05, + "loss": 1.8657, + "step": 7102 + }, + { + "epoch": 2.180171884591774, + "grad_norm": 0.4635472595691681, + "learning_rate": 9.101691008217875e-05, + "loss": 1.8527, + "step": 7103 + }, + { + "epoch": 2.1804788213627995, + "grad_norm": 0.46319296956062317, + "learning_rate": 9.101406732225086e-05, + "loss": 1.869, + "step": 7104 + }, + { + "epoch": 2.1807857581338244, + "grad_norm": 0.36179330945014954, + "learning_rate": 9.101122415699792e-05, + "loss": 1.9157, + "step": 7105 + }, + { + "epoch": 2.1810926949048497, + "grad_norm": 0.30921339988708496, + "learning_rate": 9.100838058644801e-05, + "loss": 1.858, + "step": 7106 + }, + { + "epoch": 2.1813996316758746, + "grad_norm": 0.4568884074687958, + "learning_rate": 9.100553661062925e-05, + "loss": 1.8663, + "step": 7107 + }, + { + "epoch": 2.1817065684469, + "grad_norm": 0.43856412172317505, + "learning_rate": 9.100269222956976e-05, + "loss": 1.8492, + "step": 7108 + }, + { + "epoch": 2.1820135052179253, + "grad_norm": 0.3025546967983246, + "learning_rate": 9.099984744329761e-05, + "loss": 1.8532, + "step": 7109 + }, + { + "epoch": 2.18232044198895, + "grad_norm": 0.38365665078163147, + "learning_rate": 9.099700225184096e-05, + "loss": 1.8883, + "step": 7110 + }, + { + "epoch": 2.1826273787599755, + "grad_norm": 0.4863334596157074, + "learning_rate": 9.099415665522788e-05, + "loss": 1.8682, + "step": 7111 + }, + { + "epoch": 2.182934315531001, + "grad_norm": 0.42789241671562195, + "learning_rate": 9.099131065348653e-05, + "loss": 1.8867, + "step": 7112 + }, + { + "epoch": 2.1832412523020257, + "grad_norm": 0.35933569073677063, + "learning_rate": 9.098846424664504e-05, + "loss": 1.9282, + "step": 7113 + }, + { + "epoch": 2.183548189073051, + "grad_norm": 0.42611026763916016, + "learning_rate": 
9.09856174347315e-05, + "loss": 1.9609, + "step": 7114 + }, + { + "epoch": 2.183855125844076, + "grad_norm": 0.43970558047294617, + "learning_rate": 9.098277021777406e-05, + "loss": 1.823, + "step": 7115 + }, + { + "epoch": 2.1841620626151013, + "grad_norm": 0.36792683601379395, + "learning_rate": 9.097992259580089e-05, + "loss": 1.9231, + "step": 7116 + }, + { + "epoch": 2.1844689993861266, + "grad_norm": 0.3554590344429016, + "learning_rate": 9.097707456884008e-05, + "loss": 1.914, + "step": 7117 + }, + { + "epoch": 2.1847759361571515, + "grad_norm": 0.4271651804447174, + "learning_rate": 9.097422613691982e-05, + "loss": 1.8666, + "step": 7118 + }, + { + "epoch": 2.185082872928177, + "grad_norm": 0.32142770290374756, + "learning_rate": 9.097137730006822e-05, + "loss": 1.7989, + "step": 7119 + }, + { + "epoch": 2.185389809699202, + "grad_norm": 0.33245620131492615, + "learning_rate": 9.096852805831348e-05, + "loss": 1.8536, + "step": 7120 + }, + { + "epoch": 2.185696746470227, + "grad_norm": 0.3480495810508728, + "learning_rate": 9.09656784116837e-05, + "loss": 1.9008, + "step": 7121 + }, + { + "epoch": 2.1860036832412524, + "grad_norm": 0.35290226340293884, + "learning_rate": 9.09628283602071e-05, + "loss": 1.8593, + "step": 7122 + }, + { + "epoch": 2.1863106200122773, + "grad_norm": 0.3084987998008728, + "learning_rate": 9.095997790391183e-05, + "loss": 1.827, + "step": 7123 + }, + { + "epoch": 2.1866175567833026, + "grad_norm": 0.36295285820961, + "learning_rate": 9.095712704282604e-05, + "loss": 1.909, + "step": 7124 + }, + { + "epoch": 2.186924493554328, + "grad_norm": 0.3893873691558838, + "learning_rate": 9.095427577697791e-05, + "loss": 1.9221, + "step": 7125 + }, + { + "epoch": 2.187231430325353, + "grad_norm": 0.3699241578578949, + "learning_rate": 9.095142410639564e-05, + "loss": 1.9352, + "step": 7126 + }, + { + "epoch": 2.187538367096378, + "grad_norm": 0.3384705185890198, + "learning_rate": 9.094857203110738e-05, + "loss": 1.8541, + "step": 7127 + }, 
+ { + "epoch": 2.1878453038674035, + "grad_norm": 0.377687007188797, + "learning_rate": 9.094571955114133e-05, + "loss": 1.8336, + "step": 7128 + }, + { + "epoch": 2.1881522406384284, + "grad_norm": 0.40227916836738586, + "learning_rate": 9.094286666652567e-05, + "loss": 1.9565, + "step": 7129 + }, + { + "epoch": 2.1884591774094537, + "grad_norm": 0.3679705560207367, + "learning_rate": 9.094001337728862e-05, + "loss": 1.8152, + "step": 7130 + }, + { + "epoch": 2.1887661141804786, + "grad_norm": 0.3197132647037506, + "learning_rate": 9.093715968345836e-05, + "loss": 1.9263, + "step": 7131 + }, + { + "epoch": 2.189073050951504, + "grad_norm": 0.3518284559249878, + "learning_rate": 9.09343055850631e-05, + "loss": 1.8675, + "step": 7132 + }, + { + "epoch": 2.1893799877225293, + "grad_norm": 0.3214010000228882, + "learning_rate": 9.093145108213103e-05, + "loss": 1.8991, + "step": 7133 + }, + { + "epoch": 2.189686924493554, + "grad_norm": 0.3563176393508911, + "learning_rate": 9.092859617469037e-05, + "loss": 1.8603, + "step": 7134 + }, + { + "epoch": 2.1899938612645795, + "grad_norm": 0.34053143858909607, + "learning_rate": 9.092574086276933e-05, + "loss": 1.8955, + "step": 7135 + }, + { + "epoch": 2.190300798035605, + "grad_norm": 0.3833705484867096, + "learning_rate": 9.092288514639613e-05, + "loss": 1.8845, + "step": 7136 + }, + { + "epoch": 2.1906077348066297, + "grad_norm": 0.3932427763938904, + "learning_rate": 9.092002902559901e-05, + "loss": 1.8608, + "step": 7137 + }, + { + "epoch": 2.190914671577655, + "grad_norm": 0.332955539226532, + "learning_rate": 9.091717250040617e-05, + "loss": 1.8558, + "step": 7138 + }, + { + "epoch": 2.1912216083486804, + "grad_norm": 0.3149980306625366, + "learning_rate": 9.091431557084584e-05, + "loss": 1.893, + "step": 7139 + }, + { + "epoch": 2.1915285451197053, + "grad_norm": 0.3679150640964508, + "learning_rate": 9.091145823694628e-05, + "loss": 1.9012, + "step": 7140 + }, + { + "epoch": 2.1918354818907306, + "grad_norm": 
0.36836057901382446, + "learning_rate": 9.09086004987357e-05, + "loss": 1.9121, + "step": 7141 + }, + { + "epoch": 2.1921424186617555, + "grad_norm": 0.3581927418708801, + "learning_rate": 9.090574235624237e-05, + "loss": 1.8826, + "step": 7142 + }, + { + "epoch": 2.192449355432781, + "grad_norm": 0.40886545181274414, + "learning_rate": 9.09028838094945e-05, + "loss": 1.8828, + "step": 7143 + }, + { + "epoch": 2.192756292203806, + "grad_norm": 0.32729873061180115, + "learning_rate": 9.090002485852037e-05, + "loss": 1.8827, + "step": 7144 + }, + { + "epoch": 2.193063228974831, + "grad_norm": 0.35304784774780273, + "learning_rate": 9.089716550334819e-05, + "loss": 1.846, + "step": 7145 + }, + { + "epoch": 2.1933701657458564, + "grad_norm": 0.35022708773612976, + "learning_rate": 9.089430574400629e-05, + "loss": 1.9169, + "step": 7146 + }, + { + "epoch": 2.1936771025168813, + "grad_norm": 0.4137697219848633, + "learning_rate": 9.089144558052287e-05, + "loss": 1.9111, + "step": 7147 + }, + { + "epoch": 2.1939840392879066, + "grad_norm": 0.3193536102771759, + "learning_rate": 9.088858501292622e-05, + "loss": 1.8577, + "step": 7148 + }, + { + "epoch": 2.194290976058932, + "grad_norm": 0.35795432329177856, + "learning_rate": 9.08857240412446e-05, + "loss": 1.8645, + "step": 7149 + }, + { + "epoch": 2.194597912829957, + "grad_norm": 0.3626460134983063, + "learning_rate": 9.088286266550632e-05, + "loss": 1.9288, + "step": 7150 + }, + { + "epoch": 2.194904849600982, + "grad_norm": 0.3438000977039337, + "learning_rate": 9.08800008857396e-05, + "loss": 1.9112, + "step": 7151 + }, + { + "epoch": 2.1952117863720075, + "grad_norm": 0.3445241153240204, + "learning_rate": 9.087713870197276e-05, + "loss": 1.8711, + "step": 7152 + }, + { + "epoch": 2.1955187231430324, + "grad_norm": 0.34294596314430237, + "learning_rate": 9.087427611423408e-05, + "loss": 1.9061, + "step": 7153 + }, + { + "epoch": 2.1958256599140578, + "grad_norm": 0.3608735203742981, + "learning_rate": 
9.087141312255184e-05, + "loss": 1.8634, + "step": 7154 + }, + { + "epoch": 2.196132596685083, + "grad_norm": 0.3417772352695465, + "learning_rate": 9.086854972695434e-05, + "loss": 1.9, + "step": 7155 + }, + { + "epoch": 2.196439533456108, + "grad_norm": 0.3516700863838196, + "learning_rate": 9.086568592746988e-05, + "loss": 1.9021, + "step": 7156 + }, + { + "epoch": 2.1967464702271333, + "grad_norm": 0.37481075525283813, + "learning_rate": 9.086282172412677e-05, + "loss": 1.8845, + "step": 7157 + }, + { + "epoch": 2.197053406998158, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.08599571169533e-05, + "loss": 1.8128, + "step": 7158 + }, + { + "epoch": 2.1973603437691835, + "grad_norm": 0.3539934754371643, + "learning_rate": 9.085709210597777e-05, + "loss": 1.857, + "step": 7159 + }, + { + "epoch": 2.197667280540209, + "grad_norm": 0.4345060884952545, + "learning_rate": 9.085422669122851e-05, + "loss": 1.8698, + "step": 7160 + }, + { + "epoch": 2.1979742173112338, + "grad_norm": 0.40369880199432373, + "learning_rate": 9.085136087273386e-05, + "loss": 1.7948, + "step": 7161 + }, + { + "epoch": 2.198281154082259, + "grad_norm": 0.3832145035266876, + "learning_rate": 9.08484946505221e-05, + "loss": 1.8682, + "step": 7162 + }, + { + "epoch": 2.198588090853284, + "grad_norm": 0.2859131097793579, + "learning_rate": 9.084562802462158e-05, + "loss": 1.8123, + "step": 7163 + }, + { + "epoch": 2.1988950276243093, + "grad_norm": 0.3062222898006439, + "learning_rate": 9.084276099506062e-05, + "loss": 1.8448, + "step": 7164 + }, + { + "epoch": 2.1992019643953347, + "grad_norm": 0.3819046914577484, + "learning_rate": 9.083989356186757e-05, + "loss": 1.8661, + "step": 7165 + }, + { + "epoch": 2.1995089011663596, + "grad_norm": 0.5007020235061646, + "learning_rate": 9.083702572507074e-05, + "loss": 1.9144, + "step": 7166 + }, + { + "epoch": 2.199815837937385, + "grad_norm": 0.521885097026825, + "learning_rate": 9.083415748469849e-05, + "loss": 1.8695, + "step": 7167 + }, 
+ { + "epoch": 2.2001227747084102, + "grad_norm": 0.35051268339157104, + "learning_rate": 9.083128884077916e-05, + "loss": 1.9378, + "step": 7168 + }, + { + "epoch": 2.200429711479435, + "grad_norm": 0.40265345573425293, + "learning_rate": 9.082841979334111e-05, + "loss": 1.8902, + "step": 7169 + }, + { + "epoch": 2.2007366482504604, + "grad_norm": 0.506377637386322, + "learning_rate": 9.082555034241267e-05, + "loss": 1.9115, + "step": 7170 + }, + { + "epoch": 2.201043585021486, + "grad_norm": 0.42828384041786194, + "learning_rate": 9.082268048802223e-05, + "loss": 1.8173, + "step": 7171 + }, + { + "epoch": 2.2013505217925107, + "grad_norm": 0.2979312539100647, + "learning_rate": 9.081981023019812e-05, + "loss": 1.8089, + "step": 7172 + }, + { + "epoch": 2.201657458563536, + "grad_norm": 0.3840465843677521, + "learning_rate": 9.081693956896872e-05, + "loss": 1.8557, + "step": 7173 + }, + { + "epoch": 2.201964395334561, + "grad_norm": 0.41454845666885376, + "learning_rate": 9.081406850436241e-05, + "loss": 1.8599, + "step": 7174 + }, + { + "epoch": 2.2022713321055862, + "grad_norm": 0.3305908739566803, + "learning_rate": 9.081119703640756e-05, + "loss": 1.8013, + "step": 7175 + }, + { + "epoch": 2.2025782688766116, + "grad_norm": 0.33649876713752747, + "learning_rate": 9.080832516513252e-05, + "loss": 1.9028, + "step": 7176 + }, + { + "epoch": 2.2028852056476365, + "grad_norm": 0.41247284412384033, + "learning_rate": 9.08054528905657e-05, + "loss": 1.8636, + "step": 7177 + }, + { + "epoch": 2.203192142418662, + "grad_norm": 0.4355279505252838, + "learning_rate": 9.080258021273548e-05, + "loss": 1.8923, + "step": 7178 + }, + { + "epoch": 2.203499079189687, + "grad_norm": 0.34598320722579956, + "learning_rate": 9.079970713167026e-05, + "loss": 1.9187, + "step": 7179 + }, + { + "epoch": 2.203806015960712, + "grad_norm": 0.3560951054096222, + "learning_rate": 9.07968336473984e-05, + "loss": 1.9382, + "step": 7180 + }, + { + "epoch": 2.2041129527317374, + "grad_norm": 
0.3873176872730255, + "learning_rate": 9.079395975994834e-05, + "loss": 1.8377, + "step": 7181 + }, + { + "epoch": 2.2044198895027622, + "grad_norm": 0.38699567317962646, + "learning_rate": 9.079108546934844e-05, + "loss": 1.848, + "step": 7182 + }, + { + "epoch": 2.2047268262737876, + "grad_norm": 0.3658364713191986, + "learning_rate": 9.078821077562712e-05, + "loss": 1.9308, + "step": 7183 + }, + { + "epoch": 2.205033763044813, + "grad_norm": 0.35228830575942993, + "learning_rate": 9.078533567881281e-05, + "loss": 1.8886, + "step": 7184 + }, + { + "epoch": 2.205340699815838, + "grad_norm": 0.4177337884902954, + "learning_rate": 9.07824601789339e-05, + "loss": 1.8695, + "step": 7185 + }, + { + "epoch": 2.205647636586863, + "grad_norm": 0.4778536260128021, + "learning_rate": 9.077958427601882e-05, + "loss": 1.8288, + "step": 7186 + }, + { + "epoch": 2.2059545733578885, + "grad_norm": 0.46544820070266724, + "learning_rate": 9.077670797009599e-05, + "loss": 1.8974, + "step": 7187 + }, + { + "epoch": 2.2062615101289134, + "grad_norm": 0.36188805103302, + "learning_rate": 9.077383126119382e-05, + "loss": 1.8953, + "step": 7188 + }, + { + "epoch": 2.2065684468999387, + "grad_norm": 0.30941206216812134, + "learning_rate": 9.077095414934075e-05, + "loss": 1.8395, + "step": 7189 + }, + { + "epoch": 2.2068753836709636, + "grad_norm": 0.4497200846672058, + "learning_rate": 9.076807663456524e-05, + "loss": 1.8485, + "step": 7190 + }, + { + "epoch": 2.207182320441989, + "grad_norm": 0.4923233985900879, + "learning_rate": 9.076519871689568e-05, + "loss": 1.8233, + "step": 7191 + }, + { + "epoch": 2.2074892572130143, + "grad_norm": 0.32226502895355225, + "learning_rate": 9.076232039636053e-05, + "loss": 1.8563, + "step": 7192 + }, + { + "epoch": 2.207796193984039, + "grad_norm": 0.46719446778297424, + "learning_rate": 9.075944167298824e-05, + "loss": 1.8602, + "step": 7193 + }, + { + "epoch": 2.2081031307550645, + "grad_norm": 0.5534674525260925, + "learning_rate": 
9.075656254680727e-05, + "loss": 1.8804, + "step": 7194 + }, + { + "epoch": 2.20841006752609, + "grad_norm": 0.4895678162574768, + "learning_rate": 9.075368301784606e-05, + "loss": 1.8893, + "step": 7195 + }, + { + "epoch": 2.2087170042971147, + "grad_norm": 0.33137625455856323, + "learning_rate": 9.075080308613306e-05, + "loss": 1.9158, + "step": 7196 + }, + { + "epoch": 2.20902394106814, + "grad_norm": 0.469319611787796, + "learning_rate": 9.074792275169674e-05, + "loss": 1.8628, + "step": 7197 + }, + { + "epoch": 2.209330877839165, + "grad_norm": 0.43872305750846863, + "learning_rate": 9.074504201456556e-05, + "loss": 1.8867, + "step": 7198 + }, + { + "epoch": 2.2096378146101903, + "grad_norm": 0.32900992035865784, + "learning_rate": 9.0742160874768e-05, + "loss": 1.8079, + "step": 7199 + }, + { + "epoch": 2.2099447513812156, + "grad_norm": 0.34231048822402954, + "learning_rate": 9.073927933233253e-05, + "loss": 1.9018, + "step": 7200 + }, + { + "epoch": 2.2102516881522405, + "grad_norm": 0.43461740016937256, + "learning_rate": 9.07363973872876e-05, + "loss": 1.8299, + "step": 7201 + }, + { + "epoch": 2.210558624923266, + "grad_norm": 0.43819913268089294, + "learning_rate": 9.073351503966174e-05, + "loss": 1.8641, + "step": 7202 + }, + { + "epoch": 2.210865561694291, + "grad_norm": 0.330683171749115, + "learning_rate": 9.073063228948339e-05, + "loss": 1.8595, + "step": 7203 + }, + { + "epoch": 2.211172498465316, + "grad_norm": 0.35648414492607117, + "learning_rate": 9.072774913678108e-05, + "loss": 1.8265, + "step": 7204 + }, + { + "epoch": 2.2114794352363414, + "grad_norm": 0.4420771300792694, + "learning_rate": 9.072486558158329e-05, + "loss": 1.902, + "step": 7205 + }, + { + "epoch": 2.2117863720073663, + "grad_norm": 0.41682472825050354, + "learning_rate": 9.072198162391849e-05, + "loss": 1.903, + "step": 7206 + }, + { + "epoch": 2.2120933087783916, + "grad_norm": 0.3194744288921356, + "learning_rate": 9.07190972638152e-05, + "loss": 1.8221, + "step": 7207 + 
}, + { + "epoch": 2.212400245549417, + "grad_norm": 0.35625776648521423, + "learning_rate": 9.071621250130192e-05, + "loss": 1.8737, + "step": 7208 + }, + { + "epoch": 2.212707182320442, + "grad_norm": 0.4136293828487396, + "learning_rate": 9.071332733640716e-05, + "loss": 1.7995, + "step": 7209 + }, + { + "epoch": 2.213014119091467, + "grad_norm": 0.39144495129585266, + "learning_rate": 9.071044176915947e-05, + "loss": 1.8446, + "step": 7210 + }, + { + "epoch": 2.2133210558624925, + "grad_norm": 0.3082813322544098, + "learning_rate": 9.07075557995873e-05, + "loss": 1.7635, + "step": 7211 + }, + { + "epoch": 2.2136279926335174, + "grad_norm": 0.3642291724681854, + "learning_rate": 9.070466942771921e-05, + "loss": 1.9471, + "step": 7212 + }, + { + "epoch": 2.2139349294045427, + "grad_norm": 0.4506807029247284, + "learning_rate": 9.070178265358372e-05, + "loss": 1.8542, + "step": 7213 + }, + { + "epoch": 2.214241866175568, + "grad_norm": 0.5011601448059082, + "learning_rate": 9.069889547720936e-05, + "loss": 1.9135, + "step": 7214 + }, + { + "epoch": 2.214548802946593, + "grad_norm": 0.3946228623390198, + "learning_rate": 9.069600789862467e-05, + "loss": 1.876, + "step": 7215 + }, + { + "epoch": 2.2148557397176183, + "grad_norm": 0.34833815693855286, + "learning_rate": 9.069311991785816e-05, + "loss": 1.8666, + "step": 7216 + }, + { + "epoch": 2.215162676488643, + "grad_norm": 0.43735191226005554, + "learning_rate": 9.069023153493839e-05, + "loss": 1.9238, + "step": 7217 + }, + { + "epoch": 2.2154696132596685, + "grad_norm": 0.5010718107223511, + "learning_rate": 9.06873427498939e-05, + "loss": 1.8724, + "step": 7218 + }, + { + "epoch": 2.215776550030694, + "grad_norm": 0.35850396752357483, + "learning_rate": 9.068445356275326e-05, + "loss": 1.8825, + "step": 7219 + }, + { + "epoch": 2.2160834868017187, + "grad_norm": 0.3528468906879425, + "learning_rate": 9.0681563973545e-05, + "loss": 1.8724, + "step": 7220 + }, + { + "epoch": 2.216390423572744, + "grad_norm": 
0.34725508093833923, + "learning_rate": 9.067867398229767e-05, + "loss": 1.8722, + "step": 7221 + }, + { + "epoch": 2.216697360343769, + "grad_norm": 0.3343757092952728, + "learning_rate": 9.067578358903985e-05, + "loss": 1.8144, + "step": 7222 + }, + { + "epoch": 2.2170042971147943, + "grad_norm": 0.33384087681770325, + "learning_rate": 9.067289279380009e-05, + "loss": 1.832, + "step": 7223 + }, + { + "epoch": 2.2173112338858196, + "grad_norm": 0.3275810778141022, + "learning_rate": 9.067000159660697e-05, + "loss": 1.8819, + "step": 7224 + }, + { + "epoch": 2.2176181706568445, + "grad_norm": 0.405293732881546, + "learning_rate": 9.066710999748904e-05, + "loss": 1.8669, + "step": 7225 + }, + { + "epoch": 2.21792510742787, + "grad_norm": 0.3554569482803345, + "learning_rate": 9.066421799647491e-05, + "loss": 1.8331, + "step": 7226 + }, + { + "epoch": 2.218232044198895, + "grad_norm": 0.3896840810775757, + "learning_rate": 9.066132559359313e-05, + "loss": 1.891, + "step": 7227 + }, + { + "epoch": 2.21853898096992, + "grad_norm": 0.38668718934059143, + "learning_rate": 9.065843278887231e-05, + "loss": 1.9162, + "step": 7228 + }, + { + "epoch": 2.2188459177409454, + "grad_norm": 0.3593392074108124, + "learning_rate": 9.065553958234103e-05, + "loss": 1.866, + "step": 7229 + }, + { + "epoch": 2.2191528545119708, + "grad_norm": 0.3509809076786041, + "learning_rate": 9.065264597402788e-05, + "loss": 1.8979, + "step": 7230 + }, + { + "epoch": 2.2194597912829956, + "grad_norm": 0.35477882623672485, + "learning_rate": 9.064975196396144e-05, + "loss": 1.8425, + "step": 7231 + }, + { + "epoch": 2.219766728054021, + "grad_norm": 0.38763463497161865, + "learning_rate": 9.064685755217033e-05, + "loss": 1.8853, + "step": 7232 + }, + { + "epoch": 2.220073664825046, + "grad_norm": 0.33559930324554443, + "learning_rate": 9.064396273868316e-05, + "loss": 1.8825, + "step": 7233 + }, + { + "epoch": 2.220380601596071, + "grad_norm": 0.3130233585834503, + "learning_rate": 
9.064106752352852e-05, + "loss": 1.8082, + "step": 7234 + }, + { + "epoch": 2.2206875383670965, + "grad_norm": 0.33321285247802734, + "learning_rate": 9.063817190673503e-05, + "loss": 1.8795, + "step": 7235 + }, + { + "epoch": 2.2209944751381214, + "grad_norm": 0.47564151883125305, + "learning_rate": 9.063527588833132e-05, + "loss": 1.9461, + "step": 7236 + }, + { + "epoch": 2.2213014119091468, + "grad_norm": 0.38102859258651733, + "learning_rate": 9.063237946834597e-05, + "loss": 1.8656, + "step": 7237 + }, + { + "epoch": 2.2216083486801717, + "grad_norm": 0.32240456342697144, + "learning_rate": 9.062948264680765e-05, + "loss": 1.8187, + "step": 7238 + }, + { + "epoch": 2.221915285451197, + "grad_norm": 0.2852800190448761, + "learning_rate": 9.062658542374496e-05, + "loss": 1.8172, + "step": 7239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3506350815296173, + "learning_rate": 9.062368779918655e-05, + "loss": 1.8909, + "step": 7240 + }, + { + "epoch": 2.222529158993247, + "grad_norm": 0.29418817162513733, + "learning_rate": 9.062078977316104e-05, + "loss": 1.8078, + "step": 7241 + }, + { + "epoch": 2.2228360957642725, + "grad_norm": 0.31221407651901245, + "learning_rate": 9.061789134569707e-05, + "loss": 1.8813, + "step": 7242 + }, + { + "epoch": 2.223143032535298, + "grad_norm": 0.32314184308052063, + "learning_rate": 9.061499251682331e-05, + "loss": 1.8838, + "step": 7243 + }, + { + "epoch": 2.2234499693063228, + "grad_norm": 0.3329566419124603, + "learning_rate": 9.061209328656838e-05, + "loss": 1.8987, + "step": 7244 + }, + { + "epoch": 2.223756906077348, + "grad_norm": 0.35992133617401123, + "learning_rate": 9.060919365496094e-05, + "loss": 1.9194, + "step": 7245 + }, + { + "epoch": 2.2240638428483734, + "grad_norm": 0.33594536781311035, + "learning_rate": 9.060629362202964e-05, + "loss": 1.8303, + "step": 7246 + }, + { + "epoch": 2.2243707796193983, + "grad_norm": 0.3469938635826111, + "learning_rate": 9.060339318780316e-05, + "loss": 1.905, + 
"step": 7247 + }, + { + "epoch": 2.2246777163904237, + "grad_norm": 0.3989942967891693, + "learning_rate": 9.060049235231015e-05, + "loss": 1.8655, + "step": 7248 + }, + { + "epoch": 2.2249846531614486, + "grad_norm": 0.35004356503486633, + "learning_rate": 9.059759111557926e-05, + "loss": 1.8081, + "step": 7249 + }, + { + "epoch": 2.225291589932474, + "grad_norm": 0.38162320852279663, + "learning_rate": 9.059468947763919e-05, + "loss": 1.9243, + "step": 7250 + }, + { + "epoch": 2.2255985267034992, + "grad_norm": 0.3417564034461975, + "learning_rate": 9.059178743851859e-05, + "loss": 1.8246, + "step": 7251 + }, + { + "epoch": 2.225905463474524, + "grad_norm": 0.39185380935668945, + "learning_rate": 9.058888499824618e-05, + "loss": 1.9235, + "step": 7252 + }, + { + "epoch": 2.2262124002455494, + "grad_norm": 0.5741223096847534, + "learning_rate": 9.058598215685061e-05, + "loss": 1.9104, + "step": 7253 + }, + { + "epoch": 2.226519337016575, + "grad_norm": 0.6595804691314697, + "learning_rate": 9.058307891436057e-05, + "loss": 1.9956, + "step": 7254 + }, + { + "epoch": 2.2268262737875997, + "grad_norm": 0.6249661445617676, + "learning_rate": 9.058017527080476e-05, + "loss": 1.8913, + "step": 7255 + }, + { + "epoch": 2.227133210558625, + "grad_norm": 0.48208609223365784, + "learning_rate": 9.057727122621188e-05, + "loss": 1.9116, + "step": 7256 + }, + { + "epoch": 2.22744014732965, + "grad_norm": 0.37400147318840027, + "learning_rate": 9.057436678061062e-05, + "loss": 1.8828, + "step": 7257 + }, + { + "epoch": 2.2277470841006752, + "grad_norm": 0.40321463346481323, + "learning_rate": 9.057146193402968e-05, + "loss": 1.7984, + "step": 7258 + }, + { + "epoch": 2.2280540208717006, + "grad_norm": 0.43090149760246277, + "learning_rate": 9.056855668649778e-05, + "loss": 1.9135, + "step": 7259 + }, + { + "epoch": 2.2283609576427255, + "grad_norm": 0.3625677525997162, + "learning_rate": 9.056565103804362e-05, + "loss": 1.9005, + "step": 7260 + }, + { + "epoch": 
2.228667894413751, + "grad_norm": 0.3386496901512146, + "learning_rate": 9.056274498869593e-05, + "loss": 1.879, + "step": 7261 + }, + { + "epoch": 2.228974831184776, + "grad_norm": 0.45207980275154114, + "learning_rate": 9.05598385384834e-05, + "loss": 1.8748, + "step": 7262 + }, + { + "epoch": 2.229281767955801, + "grad_norm": 0.38665562868118286, + "learning_rate": 9.055693168743478e-05, + "loss": 1.8828, + "step": 7263 + }, + { + "epoch": 2.2295887047268264, + "grad_norm": 0.3074968159198761, + "learning_rate": 9.05540244355788e-05, + "loss": 1.8443, + "step": 7264 + }, + { + "epoch": 2.2298956414978512, + "grad_norm": 0.36243903636932373, + "learning_rate": 9.055111678294418e-05, + "loss": 1.8681, + "step": 7265 + }, + { + "epoch": 2.2302025782688766, + "grad_norm": 0.4070085287094116, + "learning_rate": 9.054820872955965e-05, + "loss": 1.8643, + "step": 7266 + }, + { + "epoch": 2.230509515039902, + "grad_norm": 0.3784204125404358, + "learning_rate": 9.054530027545396e-05, + "loss": 1.9197, + "step": 7267 + }, + { + "epoch": 2.230816451810927, + "grad_norm": 0.32002586126327515, + "learning_rate": 9.054239142065583e-05, + "loss": 1.9, + "step": 7268 + }, + { + "epoch": 2.231123388581952, + "grad_norm": 0.3701259195804596, + "learning_rate": 9.053948216519405e-05, + "loss": 1.8815, + "step": 7269 + }, + { + "epoch": 2.2314303253529775, + "grad_norm": 0.32927554845809937, + "learning_rate": 9.053657250909734e-05, + "loss": 1.8599, + "step": 7270 + }, + { + "epoch": 2.2317372621240024, + "grad_norm": 0.2915503680706024, + "learning_rate": 9.053366245239445e-05, + "loss": 1.8553, + "step": 7271 + }, + { + "epoch": 2.2320441988950277, + "grad_norm": 0.3347928822040558, + "learning_rate": 9.053075199511416e-05, + "loss": 1.926, + "step": 7272 + }, + { + "epoch": 2.2323511356660526, + "grad_norm": 0.37499183416366577, + "learning_rate": 9.052784113728523e-05, + "loss": 1.8636, + "step": 7273 + }, + { + "epoch": 2.232658072437078, + "grad_norm": 0.38303107023239136, + 
"learning_rate": 9.05249298789364e-05, + "loss": 1.8739, + "step": 7274 + }, + { + "epoch": 2.2329650092081033, + "grad_norm": 0.356942355632782, + "learning_rate": 9.052201822009648e-05, + "loss": 1.8401, + "step": 7275 + }, + { + "epoch": 2.233271945979128, + "grad_norm": 0.3391316533088684, + "learning_rate": 9.051910616079422e-05, + "loss": 1.8954, + "step": 7276 + }, + { + "epoch": 2.2335788827501535, + "grad_norm": 0.3100464344024658, + "learning_rate": 9.051619370105839e-05, + "loss": 1.8726, + "step": 7277 + }, + { + "epoch": 2.233885819521179, + "grad_norm": 0.38745078444480896, + "learning_rate": 9.05132808409178e-05, + "loss": 1.9605, + "step": 7278 + }, + { + "epoch": 2.2341927562922037, + "grad_norm": 0.40631747245788574, + "learning_rate": 9.051036758040123e-05, + "loss": 1.8458, + "step": 7279 + }, + { + "epoch": 2.234499693063229, + "grad_norm": 0.4084717929363251, + "learning_rate": 9.050745391953745e-05, + "loss": 1.8696, + "step": 7280 + }, + { + "epoch": 2.234806629834254, + "grad_norm": 0.4426955282688141, + "learning_rate": 9.050453985835527e-05, + "loss": 1.9063, + "step": 7281 + }, + { + "epoch": 2.2351135666052793, + "grad_norm": 0.37360796332359314, + "learning_rate": 9.05016253968835e-05, + "loss": 1.9299, + "step": 7282 + }, + { + "epoch": 2.2354205033763046, + "grad_norm": 0.34415799379348755, + "learning_rate": 9.049871053515091e-05, + "loss": 1.8877, + "step": 7283 + }, + { + "epoch": 2.2357274401473295, + "grad_norm": 0.3745698928833008, + "learning_rate": 9.049579527318633e-05, + "loss": 1.9272, + "step": 7284 + }, + { + "epoch": 2.236034376918355, + "grad_norm": 0.3293079435825348, + "learning_rate": 9.049287961101857e-05, + "loss": 1.8599, + "step": 7285 + }, + { + "epoch": 2.23634131368938, + "grad_norm": 0.3563106060028076, + "learning_rate": 9.048996354867644e-05, + "loss": 1.938, + "step": 7286 + }, + { + "epoch": 2.236648250460405, + "grad_norm": 0.36354976892471313, + "learning_rate": 9.048704708618876e-05, + "loss": 1.9401, 
+ "step": 7287 + }, + { + "epoch": 2.2369551872314304, + "grad_norm": 0.32659000158309937, + "learning_rate": 9.048413022358434e-05, + "loss": 1.8056, + "step": 7288 + }, + { + "epoch": 2.2372621240024557, + "grad_norm": 0.30486637353897095, + "learning_rate": 9.048121296089202e-05, + "loss": 1.8178, + "step": 7289 + }, + { + "epoch": 2.2375690607734806, + "grad_norm": 0.34506455063819885, + "learning_rate": 9.047829529814063e-05, + "loss": 1.8866, + "step": 7290 + }, + { + "epoch": 2.237875997544506, + "grad_norm": 0.3200983703136444, + "learning_rate": 9.047537723535902e-05, + "loss": 1.8218, + "step": 7291 + }, + { + "epoch": 2.238182934315531, + "grad_norm": 0.33315715193748474, + "learning_rate": 9.047245877257597e-05, + "loss": 1.8939, + "step": 7292 + }, + { + "epoch": 2.238489871086556, + "grad_norm": 0.38259127736091614, + "learning_rate": 9.046953990982039e-05, + "loss": 1.9566, + "step": 7293 + }, + { + "epoch": 2.2387968078575815, + "grad_norm": 0.32880350947380066, + "learning_rate": 9.04666206471211e-05, + "loss": 1.9056, + "step": 7294 + }, + { + "epoch": 2.2391037446286064, + "grad_norm": 0.39114195108413696, + "learning_rate": 9.046370098450692e-05, + "loss": 1.8773, + "step": 7295 + }, + { + "epoch": 2.2394106813996317, + "grad_norm": 0.37625813484191895, + "learning_rate": 9.046078092200675e-05, + "loss": 1.8685, + "step": 7296 + }, + { + "epoch": 2.2397176181706566, + "grad_norm": 0.3604978621006012, + "learning_rate": 9.045786045964942e-05, + "loss": 1.885, + "step": 7297 + }, + { + "epoch": 2.240024554941682, + "grad_norm": 0.32200589776039124, + "learning_rate": 9.045493959746381e-05, + "loss": 1.9146, + "step": 7298 + }, + { + "epoch": 2.2403314917127073, + "grad_norm": 0.3635976314544678, + "learning_rate": 9.045201833547876e-05, + "loss": 1.8597, + "step": 7299 + }, + { + "epoch": 2.240638428483732, + "grad_norm": 0.3326318562030792, + "learning_rate": 9.044909667372317e-05, + "loss": 1.8577, + "step": 7300 + }, + { + "epoch": 
2.2409453652547575, + "grad_norm": 0.32209664583206177, + "learning_rate": 9.044617461222589e-05, + "loss": 1.844, + "step": 7301 + }, + { + "epoch": 2.241252302025783, + "grad_norm": 0.3654637634754181, + "learning_rate": 9.044325215101581e-05, + "loss": 1.8858, + "step": 7302 + }, + { + "epoch": 2.2415592387968077, + "grad_norm": 0.3583166003227234, + "learning_rate": 9.04403292901218e-05, + "loss": 1.8148, + "step": 7303 + }, + { + "epoch": 2.241866175567833, + "grad_norm": 0.3315606117248535, + "learning_rate": 9.043740602957276e-05, + "loss": 1.8504, + "step": 7304 + }, + { + "epoch": 2.2421731123388584, + "grad_norm": 0.36084556579589844, + "learning_rate": 9.043448236939758e-05, + "loss": 1.9167, + "step": 7305 + }, + { + "epoch": 2.2424800491098833, + "grad_norm": 0.43558987975120544, + "learning_rate": 9.043155830962514e-05, + "loss": 1.8937, + "step": 7306 + }, + { + "epoch": 2.2427869858809086, + "grad_norm": 0.455240398645401, + "learning_rate": 9.042863385028433e-05, + "loss": 1.9774, + "step": 7307 + }, + { + "epoch": 2.2430939226519335, + "grad_norm": 0.35868698358535767, + "learning_rate": 9.042570899140408e-05, + "loss": 1.7999, + "step": 7308 + }, + { + "epoch": 2.243400859422959, + "grad_norm": 0.33930447697639465, + "learning_rate": 9.042278373301327e-05, + "loss": 1.965, + "step": 7309 + }, + { + "epoch": 2.243707796193984, + "grad_norm": 0.34124335646629333, + "learning_rate": 9.041985807514082e-05, + "loss": 1.8916, + "step": 7310 + }, + { + "epoch": 2.244014732965009, + "grad_norm": 0.3905695974826813, + "learning_rate": 9.041693201781565e-05, + "loss": 1.9066, + "step": 7311 + }, + { + "epoch": 2.2443216697360344, + "grad_norm": 0.3108711242675781, + "learning_rate": 9.041400556106667e-05, + "loss": 1.8038, + "step": 7312 + }, + { + "epoch": 2.2446286065070598, + "grad_norm": 0.2853390872478485, + "learning_rate": 9.041107870492279e-05, + "loss": 1.8945, + "step": 7313 + }, + { + "epoch": 2.2449355432780846, + "grad_norm": 
0.33351564407348633, + "learning_rate": 9.040815144941295e-05, + "loss": 1.8796, + "step": 7314 + }, + { + "epoch": 2.24524248004911, + "grad_norm": 0.3470609486103058, + "learning_rate": 9.040522379456606e-05, + "loss": 1.8914, + "step": 7315 + }, + { + "epoch": 2.245549416820135, + "grad_norm": 0.3474356532096863, + "learning_rate": 9.040229574041109e-05, + "loss": 1.838, + "step": 7316 + }, + { + "epoch": 2.24585635359116, + "grad_norm": 0.36590397357940674, + "learning_rate": 9.039936728697693e-05, + "loss": 1.86, + "step": 7317 + }, + { + "epoch": 2.2461632903621855, + "grad_norm": 0.35168272256851196, + "learning_rate": 9.039643843429257e-05, + "loss": 1.9337, + "step": 7318 + }, + { + "epoch": 2.2464702271332104, + "grad_norm": 0.3402341604232788, + "learning_rate": 9.039350918238691e-05, + "loss": 1.9291, + "step": 7319 + }, + { + "epoch": 2.2467771639042358, + "grad_norm": 0.3505321443080902, + "learning_rate": 9.03905795312889e-05, + "loss": 1.8252, + "step": 7320 + }, + { + "epoch": 2.247084100675261, + "grad_norm": 0.38366270065307617, + "learning_rate": 9.038764948102754e-05, + "loss": 1.8685, + "step": 7321 + }, + { + "epoch": 2.247391037446286, + "grad_norm": 0.3616010844707489, + "learning_rate": 9.038471903163176e-05, + "loss": 1.8734, + "step": 7322 + }, + { + "epoch": 2.2476979742173113, + "grad_norm": 0.2982875108718872, + "learning_rate": 9.038178818313048e-05, + "loss": 1.824, + "step": 7323 + }, + { + "epoch": 2.248004910988336, + "grad_norm": 0.41936174035072327, + "learning_rate": 9.037885693555273e-05, + "loss": 1.8799, + "step": 7324 + }, + { + "epoch": 2.2483118477593615, + "grad_norm": 0.3460717797279358, + "learning_rate": 9.037592528892744e-05, + "loss": 1.8889, + "step": 7325 + }, + { + "epoch": 2.248618784530387, + "grad_norm": 0.34347018599510193, + "learning_rate": 9.03729932432836e-05, + "loss": 1.8779, + "step": 7326 + }, + { + "epoch": 2.2489257213014118, + "grad_norm": 0.2988032400608063, + "learning_rate": 
9.037006079865016e-05, + "loss": 1.8753, + "step": 7327 + }, + { + "epoch": 2.249232658072437, + "grad_norm": 0.32754310965538025, + "learning_rate": 9.036712795505613e-05, + "loss": 1.8896, + "step": 7328 + }, + { + "epoch": 2.2495395948434624, + "grad_norm": 0.3599032163619995, + "learning_rate": 9.036419471253049e-05, + "loss": 1.8752, + "step": 7329 + }, + { + "epoch": 2.2498465316144873, + "grad_norm": 0.3461225926876068, + "learning_rate": 9.03612610711022e-05, + "loss": 1.8723, + "step": 7330 + }, + { + "epoch": 2.2501534683855127, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.035832703080027e-05, + "loss": 1.8825, + "step": 7331 + }, + { + "epoch": 2.250460405156538, + "grad_norm": 0.35188567638397217, + "learning_rate": 9.035539259165371e-05, + "loss": 1.8832, + "step": 7332 + }, + { + "epoch": 2.250767341927563, + "grad_norm": 0.3496280014514923, + "learning_rate": 9.035245775369151e-05, + "loss": 1.9084, + "step": 7333 + }, + { + "epoch": 2.2510742786985882, + "grad_norm": 0.34936273097991943, + "learning_rate": 9.034952251694266e-05, + "loss": 1.8142, + "step": 7334 + }, + { + "epoch": 2.251381215469613, + "grad_norm": 0.4227045774459839, + "learning_rate": 9.034658688143618e-05, + "loss": 1.9454, + "step": 7335 + }, + { + "epoch": 2.2516881522406385, + "grad_norm": 0.4042366147041321, + "learning_rate": 9.034365084720108e-05, + "loss": 1.8993, + "step": 7336 + }, + { + "epoch": 2.251995089011664, + "grad_norm": 0.392633318901062, + "learning_rate": 9.03407144142664e-05, + "loss": 1.9229, + "step": 7337 + }, + { + "epoch": 2.2523020257826887, + "grad_norm": 0.31304940581321716, + "learning_rate": 9.033777758266111e-05, + "loss": 1.8746, + "step": 7338 + }, + { + "epoch": 2.252608962553714, + "grad_norm": 0.3205752372741699, + "learning_rate": 9.033484035241426e-05, + "loss": 1.8224, + "step": 7339 + }, + { + "epoch": 2.252915899324739, + "grad_norm": 0.32164251804351807, + "learning_rate": 9.033190272355488e-05, + "loss": 1.8164, + "step": 
7340 + }, + { + "epoch": 2.2532228360957642, + "grad_norm": 0.3567545413970947, + "learning_rate": 9.032896469611201e-05, + "loss": 1.8892, + "step": 7341 + }, + { + "epoch": 2.2535297728667896, + "grad_norm": 0.3475800156593323, + "learning_rate": 9.032602627011467e-05, + "loss": 1.8594, + "step": 7342 + }, + { + "epoch": 2.2538367096378145, + "grad_norm": 0.38770994544029236, + "learning_rate": 9.032308744559189e-05, + "loss": 1.8899, + "step": 7343 + }, + { + "epoch": 2.25414364640884, + "grad_norm": 0.3671153783798218, + "learning_rate": 9.032014822257273e-05, + "loss": 1.8795, + "step": 7344 + }, + { + "epoch": 2.254450583179865, + "grad_norm": 0.3415989875793457, + "learning_rate": 9.031720860108623e-05, + "loss": 1.9007, + "step": 7345 + }, + { + "epoch": 2.25475751995089, + "grad_norm": 0.3317084014415741, + "learning_rate": 9.031426858116145e-05, + "loss": 1.8604, + "step": 7346 + }, + { + "epoch": 2.2550644567219154, + "grad_norm": 0.3760251998901367, + "learning_rate": 9.031132816282745e-05, + "loss": 1.9061, + "step": 7347 + }, + { + "epoch": 2.2553713934929407, + "grad_norm": 0.4288908541202545, + "learning_rate": 9.030838734611326e-05, + "loss": 1.8621, + "step": 7348 + }, + { + "epoch": 2.2556783302639656, + "grad_norm": 0.3840491771697998, + "learning_rate": 9.030544613104797e-05, + "loss": 1.8743, + "step": 7349 + }, + { + "epoch": 2.255985267034991, + "grad_norm": 0.32746297121047974, + "learning_rate": 9.030250451766063e-05, + "loss": 1.8813, + "step": 7350 + }, + { + "epoch": 2.256292203806016, + "grad_norm": 0.31266525387763977, + "learning_rate": 9.029956250598032e-05, + "loss": 1.816, + "step": 7351 + }, + { + "epoch": 2.256599140577041, + "grad_norm": 0.34744998812675476, + "learning_rate": 9.029662009603613e-05, + "loss": 1.8728, + "step": 7352 + }, + { + "epoch": 2.2569060773480665, + "grad_norm": 0.36204856634140015, + "learning_rate": 9.029367728785709e-05, + "loss": 1.9331, + "step": 7353 + }, + { + "epoch": 2.2572130141190914, + 
"grad_norm": 0.3839271664619446, + "learning_rate": 9.029073408147234e-05, + "loss": 2.0018, + "step": 7354 + }, + { + "epoch": 2.2575199508901167, + "grad_norm": 0.34844526648521423, + "learning_rate": 9.028779047691094e-05, + "loss": 1.8873, + "step": 7355 + }, + { + "epoch": 2.2578268876611416, + "grad_norm": 0.31876906752586365, + "learning_rate": 9.028484647420196e-05, + "loss": 1.8569, + "step": 7356 + }, + { + "epoch": 2.258133824432167, + "grad_norm": 0.3633274435997009, + "learning_rate": 9.028190207337452e-05, + "loss": 1.8645, + "step": 7357 + }, + { + "epoch": 2.2584407612031923, + "grad_norm": 0.39025530219078064, + "learning_rate": 9.027895727445775e-05, + "loss": 1.911, + "step": 7358 + }, + { + "epoch": 2.258747697974217, + "grad_norm": 0.34168434143066406, + "learning_rate": 9.027601207748067e-05, + "loss": 1.8675, + "step": 7359 + }, + { + "epoch": 2.2590546347452425, + "grad_norm": 0.3539605438709259, + "learning_rate": 9.027306648247245e-05, + "loss": 1.9001, + "step": 7360 + }, + { + "epoch": 2.259361571516268, + "grad_norm": 0.30433401465415955, + "learning_rate": 9.02701204894622e-05, + "loss": 1.8598, + "step": 7361 + }, + { + "epoch": 2.2596685082872927, + "grad_norm": 0.35448700189590454, + "learning_rate": 9.026717409847898e-05, + "loss": 1.8845, + "step": 7362 + }, + { + "epoch": 2.259975445058318, + "grad_norm": 0.34060248732566833, + "learning_rate": 9.026422730955197e-05, + "loss": 1.9322, + "step": 7363 + }, + { + "epoch": 2.2602823818293434, + "grad_norm": 0.3370642364025116, + "learning_rate": 9.026128012271026e-05, + "loss": 1.8356, + "step": 7364 + }, + { + "epoch": 2.2605893186003683, + "grad_norm": 0.3148033022880554, + "learning_rate": 9.025833253798298e-05, + "loss": 1.7723, + "step": 7365 + }, + { + "epoch": 2.2608962553713936, + "grad_norm": 0.3062879145145416, + "learning_rate": 9.025538455539925e-05, + "loss": 1.8548, + "step": 7366 + }, + { + "epoch": 2.2612031921424185, + "grad_norm": 0.3378484547138214, + 
"learning_rate": 9.025243617498825e-05, + "loss": 1.9049, + "step": 7367 + }, + { + "epoch": 2.261510128913444, + "grad_norm": 0.277660608291626, + "learning_rate": 9.024948739677905e-05, + "loss": 1.7833, + "step": 7368 + }, + { + "epoch": 2.261817065684469, + "grad_norm": 0.3986060619354248, + "learning_rate": 9.024653822080083e-05, + "loss": 1.8837, + "step": 7369 + }, + { + "epoch": 2.262124002455494, + "grad_norm": 0.3013289272785187, + "learning_rate": 9.024358864708275e-05, + "loss": 1.8659, + "step": 7370 + }, + { + "epoch": 2.2624309392265194, + "grad_norm": 0.3403053879737854, + "learning_rate": 9.024063867565391e-05, + "loss": 1.8914, + "step": 7371 + }, + { + "epoch": 2.2627378759975443, + "grad_norm": 0.3488257825374603, + "learning_rate": 9.023768830654351e-05, + "loss": 1.8887, + "step": 7372 + }, + { + "epoch": 2.2630448127685696, + "grad_norm": 0.2950255274772644, + "learning_rate": 9.023473753978069e-05, + "loss": 1.8385, + "step": 7373 + }, + { + "epoch": 2.263351749539595, + "grad_norm": 0.35732173919677734, + "learning_rate": 9.023178637539461e-05, + "loss": 1.8769, + "step": 7374 + }, + { + "epoch": 2.26365868631062, + "grad_norm": 0.5403436422348022, + "learning_rate": 9.022883481341445e-05, + "loss": 1.9742, + "step": 7375 + }, + { + "epoch": 2.263965623081645, + "grad_norm": 0.5506799221038818, + "learning_rate": 9.022588285386935e-05, + "loss": 1.8667, + "step": 7376 + }, + { + "epoch": 2.2642725598526705, + "grad_norm": 0.4272395372390747, + "learning_rate": 9.02229304967885e-05, + "loss": 1.8336, + "step": 7377 + }, + { + "epoch": 2.2645794966236954, + "grad_norm": 0.34911462664604187, + "learning_rate": 9.021997774220108e-05, + "loss": 1.8608, + "step": 7378 + }, + { + "epoch": 2.2648864333947207, + "grad_norm": 0.3592715263366699, + "learning_rate": 9.021702459013626e-05, + "loss": 1.925, + "step": 7379 + }, + { + "epoch": 2.265193370165746, + "grad_norm": 0.38482216000556946, + "learning_rate": 9.021407104062323e-05, + "loss": 1.8553, 
+ "step": 7380 + }, + { + "epoch": 2.265500306936771, + "grad_norm": 0.4675584137439728, + "learning_rate": 9.021111709369118e-05, + "loss": 1.9303, + "step": 7381 + }, + { + "epoch": 2.2658072437077963, + "grad_norm": 0.40397754311561584, + "learning_rate": 9.02081627493693e-05, + "loss": 1.9512, + "step": 7382 + }, + { + "epoch": 2.266114180478821, + "grad_norm": 0.3385498821735382, + "learning_rate": 9.02052080076868e-05, + "loss": 1.8314, + "step": 7383 + }, + { + "epoch": 2.2664211172498465, + "grad_norm": 0.40668871998786926, + "learning_rate": 9.020225286867285e-05, + "loss": 1.8658, + "step": 7384 + }, + { + "epoch": 2.266728054020872, + "grad_norm": 0.4566061198711395, + "learning_rate": 9.01992973323567e-05, + "loss": 1.8429, + "step": 7385 + }, + { + "epoch": 2.2670349907918967, + "grad_norm": 0.42283549904823303, + "learning_rate": 9.019634139876752e-05, + "loss": 1.8858, + "step": 7386 + }, + { + "epoch": 2.267341927562922, + "grad_norm": 0.3491251468658447, + "learning_rate": 9.019338506793454e-05, + "loss": 1.8389, + "step": 7387 + }, + { + "epoch": 2.267648864333947, + "grad_norm": 0.33846428990364075, + "learning_rate": 9.019042833988696e-05, + "loss": 1.8309, + "step": 7388 + }, + { + "epoch": 2.2679558011049723, + "grad_norm": 0.39968016743659973, + "learning_rate": 9.0187471214654e-05, + "loss": 1.8591, + "step": 7389 + }, + { + "epoch": 2.2682627378759976, + "grad_norm": 0.39926376938819885, + "learning_rate": 9.018451369226493e-05, + "loss": 1.9341, + "step": 7390 + }, + { + "epoch": 2.2685696746470225, + "grad_norm": 0.41112056374549866, + "learning_rate": 9.018155577274892e-05, + "loss": 1.8856, + "step": 7391 + }, + { + "epoch": 2.268876611418048, + "grad_norm": 0.49490058422088623, + "learning_rate": 9.017859745613521e-05, + "loss": 1.8458, + "step": 7392 + }, + { + "epoch": 2.269183548189073, + "grad_norm": 0.42149874567985535, + "learning_rate": 9.017563874245308e-05, + "loss": 1.862, + "step": 7393 + }, + { + "epoch": 2.269490484960098, 
+ "grad_norm": 0.37284091114997864, + "learning_rate": 9.017267963173173e-05, + "loss": 1.8698, + "step": 7394 + }, + { + "epoch": 2.2697974217311234, + "grad_norm": 0.3743322193622589, + "learning_rate": 9.016972012400041e-05, + "loss": 1.8847, + "step": 7395 + }, + { + "epoch": 2.2701043585021488, + "grad_norm": 0.4327050447463989, + "learning_rate": 9.016676021928838e-05, + "loss": 1.8227, + "step": 7396 + }, + { + "epoch": 2.2704112952731736, + "grad_norm": 0.4334336519241333, + "learning_rate": 9.016379991762487e-05, + "loss": 1.9292, + "step": 7397 + }, + { + "epoch": 2.270718232044199, + "grad_norm": 0.37071630358695984, + "learning_rate": 9.016083921903915e-05, + "loss": 1.8045, + "step": 7398 + }, + { + "epoch": 2.271025168815224, + "grad_norm": 0.32131752371788025, + "learning_rate": 9.015787812356049e-05, + "loss": 1.8697, + "step": 7399 + }, + { + "epoch": 2.271332105586249, + "grad_norm": 0.3604664206504822, + "learning_rate": 9.015491663121813e-05, + "loss": 1.9259, + "step": 7400 + }, + { + "epoch": 2.2716390423572745, + "grad_norm": 0.3364580571651459, + "learning_rate": 9.015195474204136e-05, + "loss": 1.8964, + "step": 7401 + }, + { + "epoch": 2.2719459791282994, + "grad_norm": 0.3141402304172516, + "learning_rate": 9.014899245605944e-05, + "loss": 1.8536, + "step": 7402 + }, + { + "epoch": 2.2722529158993248, + "grad_norm": 0.3387024402618408, + "learning_rate": 9.014602977330162e-05, + "loss": 1.8362, + "step": 7403 + }, + { + "epoch": 2.27255985267035, + "grad_norm": 0.42270272970199585, + "learning_rate": 9.014306669379723e-05, + "loss": 1.8288, + "step": 7404 + }, + { + "epoch": 2.272866789441375, + "grad_norm": 0.4565230906009674, + "learning_rate": 9.01401032175755e-05, + "loss": 1.8573, + "step": 7405 + }, + { + "epoch": 2.2731737262124003, + "grad_norm": 0.38861140608787537, + "learning_rate": 9.013713934466576e-05, + "loss": 1.8778, + "step": 7406 + }, + { + "epoch": 2.2734806629834257, + "grad_norm": 0.31552520394325256, + 
"learning_rate": 9.01341750750973e-05, + "loss": 1.8342, + "step": 7407 + }, + { + "epoch": 2.2737875997544506, + "grad_norm": 0.3771591782569885, + "learning_rate": 9.013121040889938e-05, + "loss": 1.8847, + "step": 7408 + }, + { + "epoch": 2.274094536525476, + "grad_norm": 0.3689042925834656, + "learning_rate": 9.012824534610132e-05, + "loss": 1.9014, + "step": 7409 + }, + { + "epoch": 2.2744014732965008, + "grad_norm": 0.31477800011634827, + "learning_rate": 9.012527988673241e-05, + "loss": 1.8631, + "step": 7410 + }, + { + "epoch": 2.274708410067526, + "grad_norm": 0.3238977789878845, + "learning_rate": 9.012231403082199e-05, + "loss": 1.8319, + "step": 7411 + }, + { + "epoch": 2.2750153468385514, + "grad_norm": 0.3587593138217926, + "learning_rate": 9.011934777839932e-05, + "loss": 1.8982, + "step": 7412 + }, + { + "epoch": 2.2753222836095763, + "grad_norm": 0.35946986079216003, + "learning_rate": 9.011638112949376e-05, + "loss": 1.9206, + "step": 7413 + }, + { + "epoch": 2.2756292203806017, + "grad_norm": 0.3451001048088074, + "learning_rate": 9.01134140841346e-05, + "loss": 1.8122, + "step": 7414 + }, + { + "epoch": 2.2759361571516266, + "grad_norm": 0.3779532313346863, + "learning_rate": 9.011044664235116e-05, + "loss": 1.8851, + "step": 7415 + }, + { + "epoch": 2.276243093922652, + "grad_norm": 0.3812767267227173, + "learning_rate": 9.010747880417279e-05, + "loss": 1.902, + "step": 7416 + }, + { + "epoch": 2.2765500306936772, + "grad_norm": 0.3666127920150757, + "learning_rate": 9.01045105696288e-05, + "loss": 1.8296, + "step": 7417 + }, + { + "epoch": 2.276856967464702, + "grad_norm": 0.3588816225528717, + "learning_rate": 9.010154193874854e-05, + "loss": 1.9023, + "step": 7418 + }, + { + "epoch": 2.2771639042357275, + "grad_norm": 0.37766706943511963, + "learning_rate": 9.009857291156134e-05, + "loss": 1.7996, + "step": 7419 + }, + { + "epoch": 2.277470841006753, + "grad_norm": 0.4222901165485382, + "learning_rate": 9.009560348809654e-05, + "loss": 
1.8802, + "step": 7420 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.39289870858192444, + "learning_rate": 9.009263366838348e-05, + "loss": 1.8988, + "step": 7421 + }, + { + "epoch": 2.278084714548803, + "grad_norm": 0.3670540750026703, + "learning_rate": 9.008966345245152e-05, + "loss": 1.8348, + "step": 7422 + }, + { + "epoch": 2.2783916513198283, + "grad_norm": 0.36671552062034607, + "learning_rate": 9.008669284032998e-05, + "loss": 1.9059, + "step": 7423 + }, + { + "epoch": 2.2786985880908532, + "grad_norm": 0.33226338028907776, + "learning_rate": 9.008372183204827e-05, + "loss": 1.8736, + "step": 7424 + }, + { + "epoch": 2.2790055248618786, + "grad_norm": 0.3424983322620392, + "learning_rate": 9.008075042763573e-05, + "loss": 1.8537, + "step": 7425 + }, + { + "epoch": 2.2793124616329035, + "grad_norm": 0.3336870074272156, + "learning_rate": 9.007777862712172e-05, + "loss": 1.8622, + "step": 7426 + }, + { + "epoch": 2.279619398403929, + "grad_norm": 0.3488881289958954, + "learning_rate": 9.007480643053561e-05, + "loss": 1.88, + "step": 7427 + }, + { + "epoch": 2.279926335174954, + "grad_norm": 0.34159761667251587, + "learning_rate": 9.007183383790676e-05, + "loss": 1.8893, + "step": 7428 + }, + { + "epoch": 2.280233271945979, + "grad_norm": 0.3075805604457855, + "learning_rate": 9.006886084926459e-05, + "loss": 1.8613, + "step": 7429 + }, + { + "epoch": 2.2805402087170044, + "grad_norm": 0.32371413707733154, + "learning_rate": 9.006588746463844e-05, + "loss": 1.909, + "step": 7430 + }, + { + "epoch": 2.2808471454880292, + "grad_norm": 0.34343451261520386, + "learning_rate": 9.006291368405769e-05, + "loss": 1.8696, + "step": 7431 + }, + { + "epoch": 2.2811540822590546, + "grad_norm": 0.34018251299858093, + "learning_rate": 9.005993950755177e-05, + "loss": 1.9155, + "step": 7432 + }, + { + "epoch": 2.28146101903008, + "grad_norm": 0.42582982778549194, + "learning_rate": 9.005696493515003e-05, + "loss": 1.8901, + "step": 7433 + }, + { + "epoch": 
2.281767955801105, + "grad_norm": 0.44168829917907715, + "learning_rate": 9.005398996688188e-05, + "loss": 1.8693, + "step": 7434 + }, + { + "epoch": 2.28207489257213, + "grad_norm": 0.3650555908679962, + "learning_rate": 9.005101460277673e-05, + "loss": 1.8726, + "step": 7435 + }, + { + "epoch": 2.2823818293431555, + "grad_norm": 0.2945705056190491, + "learning_rate": 9.004803884286399e-05, + "loss": 1.8655, + "step": 7436 + }, + { + "epoch": 2.2826887661141804, + "grad_norm": 0.4192120432853699, + "learning_rate": 9.004506268717305e-05, + "loss": 1.9859, + "step": 7437 + }, + { + "epoch": 2.2829957028852057, + "grad_norm": 0.35403937101364136, + "learning_rate": 9.004208613573334e-05, + "loss": 1.785, + "step": 7438 + }, + { + "epoch": 2.283302639656231, + "grad_norm": 0.3038218021392822, + "learning_rate": 9.003910918857426e-05, + "loss": 1.8199, + "step": 7439 + }, + { + "epoch": 2.283609576427256, + "grad_norm": 0.3447442352771759, + "learning_rate": 9.003613184572522e-05, + "loss": 1.882, + "step": 7440 + }, + { + "epoch": 2.2839165131982813, + "grad_norm": 0.32208123803138733, + "learning_rate": 9.003315410721567e-05, + "loss": 1.8326, + "step": 7441 + }, + { + "epoch": 2.284223449969306, + "grad_norm": 0.31731268763542175, + "learning_rate": 9.003017597307504e-05, + "loss": 1.8947, + "step": 7442 + }, + { + "epoch": 2.2845303867403315, + "grad_norm": 0.3491910398006439, + "learning_rate": 9.002719744333273e-05, + "loss": 1.924, + "step": 7443 + }, + { + "epoch": 2.284837323511357, + "grad_norm": 0.32135117053985596, + "learning_rate": 9.00242185180182e-05, + "loss": 1.838, + "step": 7444 + }, + { + "epoch": 2.2851442602823817, + "grad_norm": 0.32201823592185974, + "learning_rate": 9.00212391971609e-05, + "loss": 1.8449, + "step": 7445 + }, + { + "epoch": 2.285451197053407, + "grad_norm": 0.3983609676361084, + "learning_rate": 9.001825948079024e-05, + "loss": 1.8897, + "step": 7446 + }, + { + "epoch": 2.285758133824432, + "grad_norm": 0.4174421727657318, + 
"learning_rate": 9.001527936893568e-05, + "loss": 1.8671, + "step": 7447 + }, + { + "epoch": 2.2860650705954573, + "grad_norm": 0.3456888496875763, + "learning_rate": 9.001229886162668e-05, + "loss": 1.9064, + "step": 7448 + }, + { + "epoch": 2.2863720073664826, + "grad_norm": 0.3092229664325714, + "learning_rate": 9.000931795889269e-05, + "loss": 1.8478, + "step": 7449 + }, + { + "epoch": 2.2866789441375075, + "grad_norm": 0.40093541145324707, + "learning_rate": 9.000633666076317e-05, + "loss": 1.9226, + "step": 7450 + }, + { + "epoch": 2.286985880908533, + "grad_norm": 0.41090336441993713, + "learning_rate": 9.000335496726759e-05, + "loss": 1.8542, + "step": 7451 + }, + { + "epoch": 2.287292817679558, + "grad_norm": 0.48479974269866943, + "learning_rate": 9.00003728784354e-05, + "loss": 1.9217, + "step": 7452 + }, + { + "epoch": 2.287599754450583, + "grad_norm": 0.662677526473999, + "learning_rate": 8.999739039429609e-05, + "loss": 1.957, + "step": 7453 + }, + { + "epoch": 2.2879066912216084, + "grad_norm": 0.6417959928512573, + "learning_rate": 8.999440751487911e-05, + "loss": 1.8273, + "step": 7454 + }, + { + "epoch": 2.2882136279926337, + "grad_norm": 0.5561745762825012, + "learning_rate": 8.999142424021396e-05, + "loss": 1.9081, + "step": 7455 + }, + { + "epoch": 2.2885205647636586, + "grad_norm": 0.3603537976741791, + "learning_rate": 8.998844057033013e-05, + "loss": 1.8256, + "step": 7456 + }, + { + "epoch": 2.288827501534684, + "grad_norm": 0.5149406790733337, + "learning_rate": 8.998545650525707e-05, + "loss": 1.8257, + "step": 7457 + }, + { + "epoch": 2.289134438305709, + "grad_norm": 0.6777750253677368, + "learning_rate": 8.99824720450243e-05, + "loss": 1.8581, + "step": 7458 + }, + { + "epoch": 2.289441375076734, + "grad_norm": 0.6244171857833862, + "learning_rate": 8.997948718966132e-05, + "loss": 1.9195, + "step": 7459 + }, + { + "epoch": 2.2897483118477595, + "grad_norm": 0.3903466463088989, + "learning_rate": 8.99765019391976e-05, + "loss": 1.8996, 
+ "step": 7460 + }, + { + "epoch": 2.2900552486187844, + "grad_norm": 0.4231773614883423, + "learning_rate": 8.997351629366266e-05, + "loss": 1.9447, + "step": 7461 + }, + { + "epoch": 2.2903621853898097, + "grad_norm": 0.5735896825790405, + "learning_rate": 8.997053025308602e-05, + "loss": 1.9082, + "step": 7462 + }, + { + "epoch": 2.2906691221608346, + "grad_norm": 0.5015980005264282, + "learning_rate": 8.996754381749715e-05, + "loss": 1.8744, + "step": 7463 + }, + { + "epoch": 2.29097605893186, + "grad_norm": 0.3385339677333832, + "learning_rate": 8.996455698692558e-05, + "loss": 1.8908, + "step": 7464 + }, + { + "epoch": 2.2912829957028853, + "grad_norm": 0.35323935747146606, + "learning_rate": 8.996156976140086e-05, + "loss": 1.8739, + "step": 7465 + }, + { + "epoch": 2.29158993247391, + "grad_norm": 0.386081725358963, + "learning_rate": 8.995858214095248e-05, + "loss": 1.8734, + "step": 7466 + }, + { + "epoch": 2.2918968692449355, + "grad_norm": 0.32834386825561523, + "learning_rate": 8.995559412560996e-05, + "loss": 1.8849, + "step": 7467 + }, + { + "epoch": 2.292203806015961, + "grad_norm": 0.3868117034435272, + "learning_rate": 8.995260571540284e-05, + "loss": 1.8992, + "step": 7468 + }, + { + "epoch": 2.2925107427869857, + "grad_norm": 0.3869209885597229, + "learning_rate": 8.994961691036066e-05, + "loss": 1.8562, + "step": 7469 + }, + { + "epoch": 2.292817679558011, + "grad_norm": 0.39098650217056274, + "learning_rate": 8.994662771051294e-05, + "loss": 1.9077, + "step": 7470 + }, + { + "epoch": 2.2931246163290364, + "grad_norm": 0.4433341920375824, + "learning_rate": 8.994363811588923e-05, + "loss": 1.9193, + "step": 7471 + }, + { + "epoch": 2.2934315531000613, + "grad_norm": 0.37947940826416016, + "learning_rate": 8.99406481265191e-05, + "loss": 1.8843, + "step": 7472 + }, + { + "epoch": 2.2937384898710866, + "grad_norm": 0.4123954772949219, + "learning_rate": 8.993765774243206e-05, + "loss": 1.8847, + "step": 7473 + }, + { + "epoch": 
2.2940454266421115, + "grad_norm": 0.3863835036754608, + "learning_rate": 8.993466696365768e-05, + "loss": 1.8226, + "step": 7474 + }, + { + "epoch": 2.294352363413137, + "grad_norm": 0.34903961420059204, + "learning_rate": 8.993167579022551e-05, + "loss": 1.9151, + "step": 7475 + }, + { + "epoch": 2.294659300184162, + "grad_norm": 0.439989298582077, + "learning_rate": 8.992868422216512e-05, + "loss": 1.8494, + "step": 7476 + }, + { + "epoch": 2.294966236955187, + "grad_norm": 0.42929476499557495, + "learning_rate": 8.992569225950607e-05, + "loss": 1.8174, + "step": 7477 + }, + { + "epoch": 2.2952731737262124, + "grad_norm": 0.39554497599601746, + "learning_rate": 8.992269990227792e-05, + "loss": 1.8692, + "step": 7478 + }, + { + "epoch": 2.2955801104972378, + "grad_norm": 0.29355254769325256, + "learning_rate": 8.991970715051026e-05, + "loss": 1.8033, + "step": 7479 + }, + { + "epoch": 2.2958870472682626, + "grad_norm": 0.3488605320453644, + "learning_rate": 8.991671400423265e-05, + "loss": 1.8979, + "step": 7480 + }, + { + "epoch": 2.296193984039288, + "grad_norm": 0.34984245896339417, + "learning_rate": 8.991372046347468e-05, + "loss": 1.8931, + "step": 7481 + }, + { + "epoch": 2.2965009208103133, + "grad_norm": 0.29404810070991516, + "learning_rate": 8.991072652826593e-05, + "loss": 1.8626, + "step": 7482 + }, + { + "epoch": 2.296807857581338, + "grad_norm": 0.2838701009750366, + "learning_rate": 8.990773219863598e-05, + "loss": 1.8542, + "step": 7483 + }, + { + "epoch": 2.2971147943523635, + "grad_norm": 0.28008925914764404, + "learning_rate": 8.990473747461444e-05, + "loss": 1.8354, + "step": 7484 + }, + { + "epoch": 2.2974217311233884, + "grad_norm": 0.3046751320362091, + "learning_rate": 8.99017423562309e-05, + "loss": 1.8657, + "step": 7485 + }, + { + "epoch": 2.2977286678944138, + "grad_norm": 0.28220781683921814, + "learning_rate": 8.989874684351494e-05, + "loss": 1.8349, + "step": 7486 + }, + { + "epoch": 2.298035604665439, + "grad_norm": 
0.2665577232837677, + "learning_rate": 8.989575093649619e-05, + "loss": 1.8551, + "step": 7487 + }, + { + "epoch": 2.298342541436464, + "grad_norm": 0.2797924280166626, + "learning_rate": 8.989275463520423e-05, + "loss": 1.8568, + "step": 7488 + }, + { + "epoch": 2.2986494782074893, + "grad_norm": 0.2917410731315613, + "learning_rate": 8.98897579396687e-05, + "loss": 1.843, + "step": 7489 + }, + { + "epoch": 2.298956414978514, + "grad_norm": 0.3014819920063019, + "learning_rate": 8.98867608499192e-05, + "loss": 1.8527, + "step": 7490 + }, + { + "epoch": 2.2992633517495396, + "grad_norm": 0.28019243478775024, + "learning_rate": 8.988376336598537e-05, + "loss": 1.7744, + "step": 7491 + }, + { + "epoch": 2.299570288520565, + "grad_norm": 0.35014277696609497, + "learning_rate": 8.988076548789678e-05, + "loss": 1.9604, + "step": 7492 + }, + { + "epoch": 2.2998772252915898, + "grad_norm": 0.3060695230960846, + "learning_rate": 8.987776721568311e-05, + "loss": 1.8463, + "step": 7493 + }, + { + "epoch": 2.300184162062615, + "grad_norm": 0.29870638251304626, + "learning_rate": 8.987476854937395e-05, + "loss": 1.815, + "step": 7494 + }, + { + "epoch": 2.3004910988336404, + "grad_norm": 0.27395132184028625, + "learning_rate": 8.987176948899898e-05, + "loss": 1.8126, + "step": 7495 + }, + { + "epoch": 2.3007980356046653, + "grad_norm": 0.2982339859008789, + "learning_rate": 8.986877003458781e-05, + "loss": 1.9114, + "step": 7496 + }, + { + "epoch": 2.3011049723756907, + "grad_norm": 0.3113982081413269, + "learning_rate": 8.986577018617008e-05, + "loss": 1.8429, + "step": 7497 + }, + { + "epoch": 2.301411909146716, + "grad_norm": 0.3538585603237152, + "learning_rate": 8.986276994377544e-05, + "loss": 1.9045, + "step": 7498 + }, + { + "epoch": 2.301718845917741, + "grad_norm": 0.37576064467430115, + "learning_rate": 8.985976930743356e-05, + "loss": 1.8955, + "step": 7499 + }, + { + "epoch": 2.3020257826887662, + "grad_norm": 0.3080044388771057, + "learning_rate": 
8.985676827717406e-05, + "loss": 1.7946, + "step": 7500 + }, + { + "epoch": 2.302332719459791, + "grad_norm": 0.33935341238975525, + "learning_rate": 8.985376685302662e-05, + "loss": 1.8817, + "step": 7501 + }, + { + "epoch": 2.3026396562308165, + "grad_norm": 0.3817180395126343, + "learning_rate": 8.98507650350209e-05, + "loss": 1.9178, + "step": 7502 + }, + { + "epoch": 2.302946593001842, + "grad_norm": 0.35170307755470276, + "learning_rate": 8.984776282318657e-05, + "loss": 1.9451, + "step": 7503 + }, + { + "epoch": 2.3032535297728667, + "grad_norm": 0.3451419770717621, + "learning_rate": 8.984476021755329e-05, + "loss": 1.9127, + "step": 7504 + }, + { + "epoch": 2.303560466543892, + "grad_norm": 0.4312259554862976, + "learning_rate": 8.984175721815071e-05, + "loss": 1.8784, + "step": 7505 + }, + { + "epoch": 2.303867403314917, + "grad_norm": 0.4684976041316986, + "learning_rate": 8.983875382500856e-05, + "loss": 1.8782, + "step": 7506 + }, + { + "epoch": 2.3041743400859422, + "grad_norm": 0.4230491518974304, + "learning_rate": 8.983575003815648e-05, + "loss": 1.8769, + "step": 7507 + }, + { + "epoch": 2.3044812768569676, + "grad_norm": 0.32715409994125366, + "learning_rate": 8.983274585762417e-05, + "loss": 1.8535, + "step": 7508 + }, + { + "epoch": 2.3047882136279925, + "grad_norm": 0.3857569396495819, + "learning_rate": 8.982974128344134e-05, + "loss": 1.8689, + "step": 7509 + }, + { + "epoch": 2.305095150399018, + "grad_norm": 0.46266329288482666, + "learning_rate": 8.982673631563766e-05, + "loss": 1.9151, + "step": 7510 + }, + { + "epoch": 2.305402087170043, + "grad_norm": 0.455713152885437, + "learning_rate": 8.98237309542428e-05, + "loss": 1.9304, + "step": 7511 + }, + { + "epoch": 2.305709023941068, + "grad_norm": 0.3413514792919159, + "learning_rate": 8.98207251992865e-05, + "loss": 1.8516, + "step": 7512 + }, + { + "epoch": 2.3060159607120934, + "grad_norm": 0.3705863058567047, + "learning_rate": 8.981771905079846e-05, + "loss": 1.8434, + "step": 7513 
+ }, + { + "epoch": 2.3063228974831187, + "grad_norm": 0.46615147590637207, + "learning_rate": 8.981471250880839e-05, + "loss": 1.9265, + "step": 7514 + }, + { + "epoch": 2.3066298342541436, + "grad_norm": 0.5400925278663635, + "learning_rate": 8.981170557334598e-05, + "loss": 1.9061, + "step": 7515 + }, + { + "epoch": 2.306936771025169, + "grad_norm": 0.40317288041114807, + "learning_rate": 8.980869824444096e-05, + "loss": 1.7916, + "step": 7516 + }, + { + "epoch": 2.307243707796194, + "grad_norm": 0.3522326648235321, + "learning_rate": 8.980569052212307e-05, + "loss": 1.867, + "step": 7517 + }, + { + "epoch": 2.307550644567219, + "grad_norm": 0.5134142637252808, + "learning_rate": 8.9802682406422e-05, + "loss": 1.8406, + "step": 7518 + }, + { + "epoch": 2.3078575813382445, + "grad_norm": 0.5792621970176697, + "learning_rate": 8.97996738973675e-05, + "loss": 1.8467, + "step": 7519 + }, + { + "epoch": 2.3081645181092694, + "grad_norm": 0.424405962228775, + "learning_rate": 8.979666499498928e-05, + "loss": 1.779, + "step": 7520 + }, + { + "epoch": 2.3084714548802947, + "grad_norm": 0.3233562409877777, + "learning_rate": 8.979365569931712e-05, + "loss": 1.9043, + "step": 7521 + }, + { + "epoch": 2.3087783916513196, + "grad_norm": 0.6043062806129456, + "learning_rate": 8.979064601038071e-05, + "loss": 1.9245, + "step": 7522 + }, + { + "epoch": 2.309085328422345, + "grad_norm": 0.6618810892105103, + "learning_rate": 8.978763592820982e-05, + "loss": 1.8601, + "step": 7523 + }, + { + "epoch": 2.3093922651933703, + "grad_norm": 0.44771909713745117, + "learning_rate": 8.978462545283418e-05, + "loss": 1.7836, + "step": 7524 + }, + { + "epoch": 2.309699201964395, + "grad_norm": 0.3473430871963501, + "learning_rate": 8.978161458428356e-05, + "loss": 1.8743, + "step": 7525 + }, + { + "epoch": 2.3100061387354205, + "grad_norm": 0.46158188581466675, + "learning_rate": 8.977860332258772e-05, + "loss": 1.8802, + "step": 7526 + }, + { + "epoch": 2.310313075506446, + "grad_norm": 
0.42034098505973816, + "learning_rate": 8.977559166777639e-05, + "loss": 1.8773, + "step": 7527 + }, + { + "epoch": 2.3106200122774707, + "grad_norm": 0.30994895100593567, + "learning_rate": 8.977257961987936e-05, + "loss": 1.8042, + "step": 7528 + }, + { + "epoch": 2.310926949048496, + "grad_norm": 0.32265907526016235, + "learning_rate": 8.976956717892638e-05, + "loss": 1.8, + "step": 7529 + }, + { + "epoch": 2.3112338858195214, + "grad_norm": 0.3592197000980377, + "learning_rate": 8.976655434494723e-05, + "loss": 1.9053, + "step": 7530 + }, + { + "epoch": 2.3115408225905463, + "grad_norm": 0.36494702100753784, + "learning_rate": 8.97635411179717e-05, + "loss": 1.8982, + "step": 7531 + }, + { + "epoch": 2.3118477593615716, + "grad_norm": 0.3697327971458435, + "learning_rate": 8.976052749802952e-05, + "loss": 1.9446, + "step": 7532 + }, + { + "epoch": 2.3121546961325965, + "grad_norm": 0.5200048089027405, + "learning_rate": 8.975751348515052e-05, + "loss": 1.9429, + "step": 7533 + }, + { + "epoch": 2.312461632903622, + "grad_norm": 0.4033229947090149, + "learning_rate": 8.975449907936446e-05, + "loss": 1.8128, + "step": 7534 + }, + { + "epoch": 2.312768569674647, + "grad_norm": 0.35759851336479187, + "learning_rate": 8.975148428070115e-05, + "loss": 1.8721, + "step": 7535 + }, + { + "epoch": 2.313075506445672, + "grad_norm": 0.4578085243701935, + "learning_rate": 8.974846908919037e-05, + "loss": 1.8397, + "step": 7536 + }, + { + "epoch": 2.3133824432166974, + "grad_norm": 0.4557357132434845, + "learning_rate": 8.974545350486192e-05, + "loss": 1.8726, + "step": 7537 + }, + { + "epoch": 2.3136893799877223, + "grad_norm": 0.3946380615234375, + "learning_rate": 8.974243752774561e-05, + "loss": 1.8662, + "step": 7538 + }, + { + "epoch": 2.3139963167587476, + "grad_norm": 0.29723790287971497, + "learning_rate": 8.973942115787122e-05, + "loss": 1.8215, + "step": 7539 + }, + { + "epoch": 2.314303253529773, + "grad_norm": 0.37225791811943054, + "learning_rate": 
8.973640439526858e-05, + "loss": 1.9422, + "step": 7540 + }, + { + "epoch": 2.314610190300798, + "grad_norm": 0.3359868824481964, + "learning_rate": 8.973338723996751e-05, + "loss": 1.7974, + "step": 7541 + }, + { + "epoch": 2.314917127071823, + "grad_norm": 0.2993139922618866, + "learning_rate": 8.973036969199782e-05, + "loss": 1.8691, + "step": 7542 + }, + { + "epoch": 2.3152240638428485, + "grad_norm": 0.3155567944049835, + "learning_rate": 8.972735175138933e-05, + "loss": 1.857, + "step": 7543 + }, + { + "epoch": 2.3155310006138734, + "grad_norm": 0.315820574760437, + "learning_rate": 8.972433341817188e-05, + "loss": 1.8597, + "step": 7544 + }, + { + "epoch": 2.3158379373848987, + "grad_norm": 0.32500606775283813, + "learning_rate": 8.972131469237526e-05, + "loss": 1.9293, + "step": 7545 + }, + { + "epoch": 2.316144874155924, + "grad_norm": 0.3481442332267761, + "learning_rate": 8.971829557402933e-05, + "loss": 1.8839, + "step": 7546 + }, + { + "epoch": 2.316451810926949, + "grad_norm": 0.3110404312610626, + "learning_rate": 8.971527606316394e-05, + "loss": 1.8717, + "step": 7547 + }, + { + "epoch": 2.3167587476979743, + "grad_norm": 0.319795161485672, + "learning_rate": 8.97122561598089e-05, + "loss": 1.8855, + "step": 7548 + }, + { + "epoch": 2.317065684468999, + "grad_norm": 0.33142411708831787, + "learning_rate": 8.970923586399407e-05, + "loss": 1.863, + "step": 7549 + }, + { + "epoch": 2.3173726212400245, + "grad_norm": 0.348715603351593, + "learning_rate": 8.970621517574929e-05, + "loss": 1.8886, + "step": 7550 + }, + { + "epoch": 2.31767955801105, + "grad_norm": 0.3179607689380646, + "learning_rate": 8.970319409510444e-05, + "loss": 1.8955, + "step": 7551 + }, + { + "epoch": 2.3179864947820747, + "grad_norm": 0.33166465163230896, + "learning_rate": 8.970017262208934e-05, + "loss": 1.8366, + "step": 7552 + }, + { + "epoch": 2.3182934315531, + "grad_norm": 0.30798691511154175, + "learning_rate": 8.969715075673386e-05, + "loss": 1.8437, + "step": 7553 + }, 
+ { + "epoch": 2.3186003683241254, + "grad_norm": 0.292639821767807, + "learning_rate": 8.969412849906788e-05, + "loss": 1.8056, + "step": 7554 + }, + { + "epoch": 2.3189073050951503, + "grad_norm": 0.2972165048122406, + "learning_rate": 8.969110584912125e-05, + "loss": 1.8596, + "step": 7555 + }, + { + "epoch": 2.3192142418661756, + "grad_norm": 0.3346043527126312, + "learning_rate": 8.968808280692385e-05, + "loss": 1.8652, + "step": 7556 + }, + { + "epoch": 2.319521178637201, + "grad_norm": 0.31866857409477234, + "learning_rate": 8.968505937250555e-05, + "loss": 1.9263, + "step": 7557 + }, + { + "epoch": 2.319828115408226, + "grad_norm": 0.3511367440223694, + "learning_rate": 8.968203554589625e-05, + "loss": 1.8615, + "step": 7558 + }, + { + "epoch": 2.320135052179251, + "grad_norm": 0.36077243089675903, + "learning_rate": 8.96790113271258e-05, + "loss": 1.9155, + "step": 7559 + }, + { + "epoch": 2.320441988950276, + "grad_norm": 0.3335363268852234, + "learning_rate": 8.96759867162241e-05, + "loss": 1.8313, + "step": 7560 + }, + { + "epoch": 2.3207489257213014, + "grad_norm": 0.31834676861763, + "learning_rate": 8.967296171322105e-05, + "loss": 1.809, + "step": 7561 + }, + { + "epoch": 2.3210558624923268, + "grad_norm": 0.3629632890224457, + "learning_rate": 8.966993631814655e-05, + "loss": 1.854, + "step": 7562 + }, + { + "epoch": 2.3213627992633517, + "grad_norm": 0.3164220154285431, + "learning_rate": 8.966691053103049e-05, + "loss": 1.8431, + "step": 7563 + }, + { + "epoch": 2.321669736034377, + "grad_norm": 0.408178448677063, + "learning_rate": 8.966388435190276e-05, + "loss": 1.8652, + "step": 7564 + }, + { + "epoch": 2.321976672805402, + "grad_norm": 0.4244436025619507, + "learning_rate": 8.966085778079327e-05, + "loss": 1.8834, + "step": 7565 + }, + { + "epoch": 2.322283609576427, + "grad_norm": 0.44187989830970764, + "learning_rate": 8.965783081773195e-05, + "loss": 1.8822, + "step": 7566 + }, + { + "epoch": 2.3225905463474525, + "grad_norm": 
0.30801042914390564, + "learning_rate": 8.965480346274869e-05, + "loss": 1.8145, + "step": 7567 + }, + { + "epoch": 2.3228974831184774, + "grad_norm": 0.30103740096092224, + "learning_rate": 8.965177571587343e-05, + "loss": 1.8207, + "step": 7568 + }, + { + "epoch": 2.3232044198895028, + "grad_norm": 0.417538046836853, + "learning_rate": 8.964874757713608e-05, + "loss": 1.9213, + "step": 7569 + }, + { + "epoch": 2.323511356660528, + "grad_norm": 0.4238434433937073, + "learning_rate": 8.964571904656656e-05, + "loss": 1.8309, + "step": 7570 + }, + { + "epoch": 2.323818293431553, + "grad_norm": 0.3717726171016693, + "learning_rate": 8.964269012419482e-05, + "loss": 1.8613, + "step": 7571 + }, + { + "epoch": 2.3241252302025783, + "grad_norm": 0.369182288646698, + "learning_rate": 8.963966081005078e-05, + "loss": 1.9232, + "step": 7572 + }, + { + "epoch": 2.3244321669736037, + "grad_norm": 0.40301385521888733, + "learning_rate": 8.963663110416436e-05, + "loss": 1.9509, + "step": 7573 + }, + { + "epoch": 2.3247391037446286, + "grad_norm": 0.3336825966835022, + "learning_rate": 8.963360100656553e-05, + "loss": 1.807, + "step": 7574 + }, + { + "epoch": 2.325046040515654, + "grad_norm": 0.4070039987564087, + "learning_rate": 8.963057051728423e-05, + "loss": 1.9349, + "step": 7575 + }, + { + "epoch": 2.325352977286679, + "grad_norm": 0.34244731068611145, + "learning_rate": 8.96275396363504e-05, + "loss": 1.8378, + "step": 7576 + }, + { + "epoch": 2.325659914057704, + "grad_norm": 0.3408849835395813, + "learning_rate": 8.962450836379401e-05, + "loss": 1.8087, + "step": 7577 + }, + { + "epoch": 2.3259668508287294, + "grad_norm": 0.34224358201026917, + "learning_rate": 8.962147669964498e-05, + "loss": 1.9158, + "step": 7578 + }, + { + "epoch": 2.3262737875997543, + "grad_norm": 0.36177051067352295, + "learning_rate": 8.961844464393332e-05, + "loss": 1.8774, + "step": 7579 + }, + { + "epoch": 2.3265807243707797, + "grad_norm": 0.3000224232673645, + "learning_rate": 
8.961541219668895e-05, + "loss": 1.8092, + "step": 7580 + }, + { + "epoch": 2.3268876611418046, + "grad_norm": 0.34738194942474365, + "learning_rate": 8.961237935794185e-05, + "loss": 1.9107, + "step": 7581 + }, + { + "epoch": 2.32719459791283, + "grad_norm": 0.355585515499115, + "learning_rate": 8.960934612772203e-05, + "loss": 1.8343, + "step": 7582 + }, + { + "epoch": 2.3275015346838552, + "grad_norm": 0.29839828610420227, + "learning_rate": 8.96063125060594e-05, + "loss": 1.8345, + "step": 7583 + }, + { + "epoch": 2.32780847145488, + "grad_norm": 0.3695736229419708, + "learning_rate": 8.960327849298399e-05, + "loss": 1.8763, + "step": 7584 + }, + { + "epoch": 2.3281154082259055, + "grad_norm": 0.38834989070892334, + "learning_rate": 8.960024408852578e-05, + "loss": 1.8732, + "step": 7585 + }, + { + "epoch": 2.328422344996931, + "grad_norm": 0.4515606462955475, + "learning_rate": 8.959720929271474e-05, + "loss": 1.9685, + "step": 7586 + }, + { + "epoch": 2.3287292817679557, + "grad_norm": 0.39115825295448303, + "learning_rate": 8.959417410558087e-05, + "loss": 1.7969, + "step": 7587 + }, + { + "epoch": 2.329036218538981, + "grad_norm": 0.37858307361602783, + "learning_rate": 8.959113852715417e-05, + "loss": 1.9013, + "step": 7588 + }, + { + "epoch": 2.3293431553100064, + "grad_norm": 0.35533010959625244, + "learning_rate": 8.958810255746462e-05, + "loss": 1.8862, + "step": 7589 + }, + { + "epoch": 2.3296500920810312, + "grad_norm": 0.36994054913520813, + "learning_rate": 8.958506619654226e-05, + "loss": 1.9783, + "step": 7590 + }, + { + "epoch": 2.3299570288520566, + "grad_norm": 0.4424416124820709, + "learning_rate": 8.958202944441705e-05, + "loss": 1.9095, + "step": 7591 + }, + { + "epoch": 2.3302639656230815, + "grad_norm": 0.41932111978530884, + "learning_rate": 8.957899230111903e-05, + "loss": 1.8623, + "step": 7592 + }, + { + "epoch": 2.330570902394107, + "grad_norm": 0.4359748363494873, + "learning_rate": 8.957595476667822e-05, + "loss": 1.8917, + "step": 
7593 + }, + { + "epoch": 2.330877839165132, + "grad_norm": 0.362957239151001, + "learning_rate": 8.957291684112463e-05, + "loss": 1.8478, + "step": 7594 + }, + { + "epoch": 2.331184775936157, + "grad_norm": 0.3442717492580414, + "learning_rate": 8.956987852448827e-05, + "loss": 1.862, + "step": 7595 + }, + { + "epoch": 2.3314917127071824, + "grad_norm": 0.33355212211608887, + "learning_rate": 8.956683981679918e-05, + "loss": 1.8319, + "step": 7596 + }, + { + "epoch": 2.3317986494782073, + "grad_norm": 0.36758801341056824, + "learning_rate": 8.95638007180874e-05, + "loss": 1.8989, + "step": 7597 + }, + { + "epoch": 2.3321055862492326, + "grad_norm": 0.3574751019477844, + "learning_rate": 8.956076122838294e-05, + "loss": 1.8304, + "step": 7598 + }, + { + "epoch": 2.332412523020258, + "grad_norm": 0.30615341663360596, + "learning_rate": 8.955772134771585e-05, + "loss": 1.9078, + "step": 7599 + }, + { + "epoch": 2.332719459791283, + "grad_norm": 0.38824397325515747, + "learning_rate": 8.955468107611618e-05, + "loss": 1.8733, + "step": 7600 + }, + { + "epoch": 2.333026396562308, + "grad_norm": 0.40545380115509033, + "learning_rate": 8.955164041361395e-05, + "loss": 1.8264, + "step": 7601 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3104313910007477, + "learning_rate": 8.954859936023925e-05, + "loss": 1.8272, + "step": 7602 + }, + { + "epoch": 2.3336402701043584, + "grad_norm": 0.34795114398002625, + "learning_rate": 8.954555791602211e-05, + "loss": 1.8711, + "step": 7603 + }, + { + "epoch": 2.3339472068753837, + "grad_norm": 0.42790937423706055, + "learning_rate": 8.954251608099257e-05, + "loss": 1.8802, + "step": 7604 + }, + { + "epoch": 2.334254143646409, + "grad_norm": 0.3903054893016815, + "learning_rate": 8.953947385518072e-05, + "loss": 1.8489, + "step": 7605 + }, + { + "epoch": 2.334561080417434, + "grad_norm": 0.35869601368904114, + "learning_rate": 8.953643123861661e-05, + "loss": 1.8565, + "step": 7606 + }, + { + "epoch": 2.3348680171884593, + 
"grad_norm": 0.3960758447647095, + "learning_rate": 8.953338823133033e-05, + "loss": 1.9335, + "step": 7607 + }, + { + "epoch": 2.335174953959484, + "grad_norm": 0.3884136974811554, + "learning_rate": 8.953034483335191e-05, + "loss": 1.887, + "step": 7608 + }, + { + "epoch": 2.3354818907305095, + "grad_norm": 0.3734811246395111, + "learning_rate": 8.952730104471147e-05, + "loss": 1.861, + "step": 7609 + }, + { + "epoch": 2.335788827501535, + "grad_norm": 0.3074554204940796, + "learning_rate": 8.952425686543908e-05, + "loss": 1.8556, + "step": 7610 + }, + { + "epoch": 2.3360957642725597, + "grad_norm": 0.3098750412464142, + "learning_rate": 8.952121229556481e-05, + "loss": 1.8724, + "step": 7611 + }, + { + "epoch": 2.336402701043585, + "grad_norm": 0.3514649569988251, + "learning_rate": 8.951816733511875e-05, + "loss": 1.8023, + "step": 7612 + }, + { + "epoch": 2.33670963781461, + "grad_norm": 0.3275100290775299, + "learning_rate": 8.951512198413101e-05, + "loss": 1.8805, + "step": 7613 + }, + { + "epoch": 2.3370165745856353, + "grad_norm": 0.3380829989910126, + "learning_rate": 8.951207624263165e-05, + "loss": 1.8559, + "step": 7614 + }, + { + "epoch": 2.3373235113566606, + "grad_norm": 0.43179723620414734, + "learning_rate": 8.950903011065082e-05, + "loss": 1.937, + "step": 7615 + }, + { + "epoch": 2.337630448127686, + "grad_norm": 0.4981893002986908, + "learning_rate": 8.950598358821858e-05, + "loss": 1.8828, + "step": 7616 + }, + { + "epoch": 2.337937384898711, + "grad_norm": 0.42164552211761475, + "learning_rate": 8.950293667536506e-05, + "loss": 1.8898, + "step": 7617 + }, + { + "epoch": 2.338244321669736, + "grad_norm": 0.32897287607192993, + "learning_rate": 8.949988937212037e-05, + "loss": 1.9073, + "step": 7618 + }, + { + "epoch": 2.338551258440761, + "grad_norm": 0.38831618428230286, + "learning_rate": 8.949684167851462e-05, + "loss": 1.9694, + "step": 7619 + }, + { + "epoch": 2.3388581952117864, + "grad_norm": 0.3728467524051666, + "learning_rate": 
8.949379359457793e-05, + "loss": 1.8803, + "step": 7620 + }, + { + "epoch": 2.3391651319828117, + "grad_norm": 0.4003579020500183, + "learning_rate": 8.949074512034044e-05, + "loss": 1.9306, + "step": 7621 + }, + { + "epoch": 2.3394720687538366, + "grad_norm": 0.35670751333236694, + "learning_rate": 8.948769625583224e-05, + "loss": 1.9176, + "step": 7622 + }, + { + "epoch": 2.339779005524862, + "grad_norm": 0.3257119357585907, + "learning_rate": 8.948464700108347e-05, + "loss": 1.8781, + "step": 7623 + }, + { + "epoch": 2.340085942295887, + "grad_norm": 0.2840226888656616, + "learning_rate": 8.94815973561243e-05, + "loss": 1.8112, + "step": 7624 + }, + { + "epoch": 2.340392879066912, + "grad_norm": 0.33156147599220276, + "learning_rate": 8.947854732098484e-05, + "loss": 1.8562, + "step": 7625 + }, + { + "epoch": 2.3406998158379375, + "grad_norm": 0.33335328102111816, + "learning_rate": 8.947549689569524e-05, + "loss": 1.8404, + "step": 7626 + }, + { + "epoch": 2.3410067526089624, + "grad_norm": 0.2913919985294342, + "learning_rate": 8.947244608028562e-05, + "loss": 1.83, + "step": 7627 + }, + { + "epoch": 2.3413136893799877, + "grad_norm": 0.32735875248908997, + "learning_rate": 8.946939487478618e-05, + "loss": 1.9047, + "step": 7628 + }, + { + "epoch": 2.341620626151013, + "grad_norm": 0.3421878516674042, + "learning_rate": 8.946634327922703e-05, + "loss": 1.8771, + "step": 7629 + }, + { + "epoch": 2.341927562922038, + "grad_norm": 0.33164483308792114, + "learning_rate": 8.946329129363835e-05, + "loss": 1.8463, + "step": 7630 + }, + { + "epoch": 2.3422344996930633, + "grad_norm": 0.35423099994659424, + "learning_rate": 8.946023891805029e-05, + "loss": 1.9254, + "step": 7631 + }, + { + "epoch": 2.3425414364640886, + "grad_norm": 0.3554958403110504, + "learning_rate": 8.9457186152493e-05, + "loss": 1.8949, + "step": 7632 + }, + { + "epoch": 2.3428483732351135, + "grad_norm": 0.35155919194221497, + "learning_rate": 8.94541329969967e-05, + "loss": 1.8432, + "step": 
7633 + }, + { + "epoch": 2.343155310006139, + "grad_norm": 0.3210476338863373, + "learning_rate": 8.945107945159154e-05, + "loss": 1.8512, + "step": 7634 + }, + { + "epoch": 2.3434622467771637, + "grad_norm": 0.3587365746498108, + "learning_rate": 8.944802551630767e-05, + "loss": 1.8355, + "step": 7635 + }, + { + "epoch": 2.343769183548189, + "grad_norm": 0.41851457953453064, + "learning_rate": 8.94449711911753e-05, + "loss": 1.814, + "step": 7636 + }, + { + "epoch": 2.3440761203192144, + "grad_norm": 0.3516016900539398, + "learning_rate": 8.94419164762246e-05, + "loss": 1.8563, + "step": 7637 + }, + { + "epoch": 2.3443830570902393, + "grad_norm": 0.2917228937149048, + "learning_rate": 8.943886137148576e-05, + "loss": 1.8037, + "step": 7638 + }, + { + "epoch": 2.3446899938612646, + "grad_norm": 0.3597778379917145, + "learning_rate": 8.943580587698899e-05, + "loss": 1.8766, + "step": 7639 + }, + { + "epoch": 2.3449969306322895, + "grad_norm": 0.359642893075943, + "learning_rate": 8.943274999276445e-05, + "loss": 1.8485, + "step": 7640 + }, + { + "epoch": 2.345303867403315, + "grad_norm": 0.3543380796909332, + "learning_rate": 8.942969371884238e-05, + "loss": 1.8853, + "step": 7641 + }, + { + "epoch": 2.34561080417434, + "grad_norm": 0.371267706155777, + "learning_rate": 8.942663705525296e-05, + "loss": 1.869, + "step": 7642 + }, + { + "epoch": 2.345917740945365, + "grad_norm": 0.34073930978775024, + "learning_rate": 8.942358000202642e-05, + "loss": 1.831, + "step": 7643 + }, + { + "epoch": 2.3462246777163904, + "grad_norm": 0.3654492497444153, + "learning_rate": 8.942052255919293e-05, + "loss": 1.8697, + "step": 7644 + }, + { + "epoch": 2.3465316144874158, + "grad_norm": 0.31281957030296326, + "learning_rate": 8.941746472678275e-05, + "loss": 1.7908, + "step": 7645 + }, + { + "epoch": 2.3468385512584407, + "grad_norm": 0.3310844302177429, + "learning_rate": 8.941440650482607e-05, + "loss": 1.8523, + "step": 7646 + }, + { + "epoch": 2.347145488029466, + "grad_norm": 
0.3187454342842102, + "learning_rate": 8.941134789335312e-05, + "loss": 1.8808, + "step": 7647 + }, + { + "epoch": 2.3474524248004913, + "grad_norm": 0.35980424284935, + "learning_rate": 8.940828889239415e-05, + "loss": 1.8713, + "step": 7648 + }, + { + "epoch": 2.347759361571516, + "grad_norm": 0.2960885763168335, + "learning_rate": 8.940522950197935e-05, + "loss": 1.8077, + "step": 7649 + }, + { + "epoch": 2.3480662983425415, + "grad_norm": 0.3056114912033081, + "learning_rate": 8.940216972213897e-05, + "loss": 1.8805, + "step": 7650 + }, + { + "epoch": 2.3483732351135664, + "grad_norm": 0.3047563135623932, + "learning_rate": 8.939910955290328e-05, + "loss": 1.793, + "step": 7651 + }, + { + "epoch": 2.3486801718845918, + "grad_norm": 0.3381251394748688, + "learning_rate": 8.939604899430248e-05, + "loss": 1.8267, + "step": 7652 + }, + { + "epoch": 2.348987108655617, + "grad_norm": 0.36855414509773254, + "learning_rate": 8.939298804636684e-05, + "loss": 1.9386, + "step": 7653 + }, + { + "epoch": 2.349294045426642, + "grad_norm": 0.3742626905441284, + "learning_rate": 8.93899267091266e-05, + "loss": 1.8695, + "step": 7654 + }, + { + "epoch": 2.3496009821976673, + "grad_norm": 0.3170017600059509, + "learning_rate": 8.938686498261201e-05, + "loss": 1.881, + "step": 7655 + }, + { + "epoch": 2.349907918968692, + "grad_norm": 0.2740418016910553, + "learning_rate": 8.938380286685334e-05, + "loss": 1.7992, + "step": 7656 + }, + { + "epoch": 2.3502148557397176, + "grad_norm": 0.3170342743396759, + "learning_rate": 8.938074036188087e-05, + "loss": 1.8281, + "step": 7657 + }, + { + "epoch": 2.350521792510743, + "grad_norm": 0.3487764298915863, + "learning_rate": 8.93776774677248e-05, + "loss": 1.8508, + "step": 7658 + }, + { + "epoch": 2.350828729281768, + "grad_norm": 0.3193725347518921, + "learning_rate": 8.937461418441549e-05, + "loss": 1.802, + "step": 7659 + }, + { + "epoch": 2.351135666052793, + "grad_norm": 0.30621078610420227, + "learning_rate": 8.937155051198312e-05, 
+ "loss": 1.8723, + "step": 7660 + }, + { + "epoch": 2.3514426028238185, + "grad_norm": 0.3154527544975281, + "learning_rate": 8.936848645045803e-05, + "loss": 1.8276, + "step": 7661 + }, + { + "epoch": 2.3517495395948433, + "grad_norm": 0.3809822201728821, + "learning_rate": 8.936542199987048e-05, + "loss": 1.9682, + "step": 7662 + }, + { + "epoch": 2.3520564763658687, + "grad_norm": 0.3817490339279175, + "learning_rate": 8.936235716025076e-05, + "loss": 1.8896, + "step": 7663 + }, + { + "epoch": 2.352363413136894, + "grad_norm": 0.2996097207069397, + "learning_rate": 8.935929193162915e-05, + "loss": 1.7994, + "step": 7664 + }, + { + "epoch": 2.352670349907919, + "grad_norm": 0.30788013339042664, + "learning_rate": 8.935622631403596e-05, + "loss": 1.8243, + "step": 7665 + }, + { + "epoch": 2.3529772866789442, + "grad_norm": 0.331193745136261, + "learning_rate": 8.935316030750145e-05, + "loss": 1.9044, + "step": 7666 + }, + { + "epoch": 2.353284223449969, + "grad_norm": 0.31796711683273315, + "learning_rate": 8.935009391205598e-05, + "loss": 1.8006, + "step": 7667 + }, + { + "epoch": 2.3535911602209945, + "grad_norm": 0.3864014744758606, + "learning_rate": 8.934702712772979e-05, + "loss": 2.0193, + "step": 7668 + }, + { + "epoch": 2.35389809699202, + "grad_norm": 0.3923170566558838, + "learning_rate": 8.934395995455323e-05, + "loss": 1.9418, + "step": 7669 + }, + { + "epoch": 2.3542050337630447, + "grad_norm": 0.3210037052631378, + "learning_rate": 8.934089239255659e-05, + "loss": 1.7964, + "step": 7670 + }, + { + "epoch": 2.35451197053407, + "grad_norm": 0.32465317845344543, + "learning_rate": 8.933782444177019e-05, + "loss": 1.9405, + "step": 7671 + }, + { + "epoch": 2.354818907305095, + "grad_norm": 0.35554173588752747, + "learning_rate": 8.933475610222435e-05, + "loss": 1.8645, + "step": 7672 + }, + { + "epoch": 2.3551258440761202, + "grad_norm": 0.32723551988601685, + "learning_rate": 8.933168737394942e-05, + "loss": 1.8941, + "step": 7673 + }, + { + "epoch": 
2.3554327808471456, + "grad_norm": 0.3295009732246399, + "learning_rate": 8.932861825697567e-05, + "loss": 1.9047, + "step": 7674 + }, + { + "epoch": 2.3557397176181705, + "grad_norm": 0.32315388321876526, + "learning_rate": 8.932554875133348e-05, + "loss": 1.8535, + "step": 7675 + }, + { + "epoch": 2.356046654389196, + "grad_norm": 0.31577154994010925, + "learning_rate": 8.932247885705315e-05, + "loss": 1.8697, + "step": 7676 + }, + { + "epoch": 2.356353591160221, + "grad_norm": 0.31099769473075867, + "learning_rate": 8.931940857416506e-05, + "loss": 1.8377, + "step": 7677 + }, + { + "epoch": 2.356660527931246, + "grad_norm": 0.32998642325401306, + "learning_rate": 8.931633790269954e-05, + "loss": 1.8528, + "step": 7678 + }, + { + "epoch": 2.3569674647022714, + "grad_norm": 0.29609233140945435, + "learning_rate": 8.93132668426869e-05, + "loss": 1.8646, + "step": 7679 + }, + { + "epoch": 2.3572744014732967, + "grad_norm": 0.31335413455963135, + "learning_rate": 8.931019539415752e-05, + "loss": 1.9011, + "step": 7680 + }, + { + "epoch": 2.3575813382443216, + "grad_norm": 0.3441788852214813, + "learning_rate": 8.930712355714174e-05, + "loss": 1.8673, + "step": 7681 + }, + { + "epoch": 2.357888275015347, + "grad_norm": 0.34610918164253235, + "learning_rate": 8.930405133166992e-05, + "loss": 1.8613, + "step": 7682 + }, + { + "epoch": 2.358195211786372, + "grad_norm": 0.31753265857696533, + "learning_rate": 8.930097871777245e-05, + "loss": 1.873, + "step": 7683 + }, + { + "epoch": 2.358502148557397, + "grad_norm": 0.29862073063850403, + "learning_rate": 8.929790571547966e-05, + "loss": 1.8392, + "step": 7684 + }, + { + "epoch": 2.3588090853284225, + "grad_norm": 0.2953017055988312, + "learning_rate": 8.929483232482194e-05, + "loss": 1.8402, + "step": 7685 + }, + { + "epoch": 2.3591160220994474, + "grad_norm": 0.36613956093788147, + "learning_rate": 8.929175854582966e-05, + "loss": 1.8954, + "step": 7686 + }, + { + "epoch": 2.3594229588704727, + "grad_norm": 
0.3867746889591217, + "learning_rate": 8.928868437853319e-05, + "loss": 1.8496, + "step": 7687 + }, + { + "epoch": 2.359729895641498, + "grad_norm": 0.30742913484573364, + "learning_rate": 8.928560982296292e-05, + "loss": 1.82, + "step": 7688 + }, + { + "epoch": 2.360036832412523, + "grad_norm": 0.306905061006546, + "learning_rate": 8.928253487914921e-05, + "loss": 1.8299, + "step": 7689 + }, + { + "epoch": 2.3603437691835483, + "grad_norm": 0.3253326416015625, + "learning_rate": 8.927945954712247e-05, + "loss": 1.896, + "step": 7690 + }, + { + "epoch": 2.3606507059545736, + "grad_norm": 0.3139156699180603, + "learning_rate": 8.927638382691309e-05, + "loss": 1.838, + "step": 7691 + }, + { + "epoch": 2.3609576427255985, + "grad_norm": 0.3865121006965637, + "learning_rate": 8.927330771855147e-05, + "loss": 1.8502, + "step": 7692 + }, + { + "epoch": 2.361264579496624, + "grad_norm": 0.3640300929546356, + "learning_rate": 8.927023122206799e-05, + "loss": 1.8929, + "step": 7693 + }, + { + "epoch": 2.3615715162676487, + "grad_norm": 0.3446909487247467, + "learning_rate": 8.926715433749309e-05, + "loss": 1.864, + "step": 7694 + }, + { + "epoch": 2.361878453038674, + "grad_norm": 0.3086490035057068, + "learning_rate": 8.926407706485713e-05, + "loss": 1.8588, + "step": 7695 + }, + { + "epoch": 2.3621853898096994, + "grad_norm": 0.28351619839668274, + "learning_rate": 8.926099940419057e-05, + "loss": 1.8114, + "step": 7696 + }, + { + "epoch": 2.3624923265807243, + "grad_norm": 0.31882742047309875, + "learning_rate": 8.925792135552379e-05, + "loss": 1.8544, + "step": 7697 + }, + { + "epoch": 2.3627992633517496, + "grad_norm": 0.2691894769668579, + "learning_rate": 8.925484291888723e-05, + "loss": 1.8143, + "step": 7698 + }, + { + "epoch": 2.3631062001227745, + "grad_norm": 0.2815118432044983, + "learning_rate": 8.925176409431129e-05, + "loss": 1.8687, + "step": 7699 + }, + { + "epoch": 2.3634131368938, + "grad_norm": 0.34842196106910706, + "learning_rate": 
8.924868488182643e-05, + "loss": 1.8673, + "step": 7700 + }, + { + "epoch": 2.363720073664825, + "grad_norm": 0.33553025126457214, + "learning_rate": 8.924560528146304e-05, + "loss": 1.8982, + "step": 7701 + }, + { + "epoch": 2.36402701043585, + "grad_norm": 0.30077221989631653, + "learning_rate": 8.924252529325159e-05, + "loss": 1.8155, + "step": 7702 + }, + { + "epoch": 2.3643339472068754, + "grad_norm": 0.3376595079898834, + "learning_rate": 8.923944491722252e-05, + "loss": 1.8871, + "step": 7703 + }, + { + "epoch": 2.3646408839779007, + "grad_norm": 0.3980284333229065, + "learning_rate": 8.923636415340622e-05, + "loss": 1.8414, + "step": 7704 + }, + { + "epoch": 2.3649478207489256, + "grad_norm": 0.4772777259349823, + "learning_rate": 8.92332830018332e-05, + "loss": 1.8393, + "step": 7705 + }, + { + "epoch": 2.365254757519951, + "grad_norm": 0.5061559081077576, + "learning_rate": 8.923020146253387e-05, + "loss": 1.9134, + "step": 7706 + }, + { + "epoch": 2.3655616942909763, + "grad_norm": 0.47147873044013977, + "learning_rate": 8.922711953553871e-05, + "loss": 1.9026, + "step": 7707 + }, + { + "epoch": 2.365868631062001, + "grad_norm": 0.37263748049736023, + "learning_rate": 8.922403722087814e-05, + "loss": 1.8474, + "step": 7708 + }, + { + "epoch": 2.3661755678330265, + "grad_norm": 0.3158501386642456, + "learning_rate": 8.922095451858265e-05, + "loss": 1.8771, + "step": 7709 + }, + { + "epoch": 2.3664825046040514, + "grad_norm": 0.3170566260814667, + "learning_rate": 8.921787142868271e-05, + "loss": 1.8111, + "step": 7710 + }, + { + "epoch": 2.3667894413750767, + "grad_norm": 0.3532208502292633, + "learning_rate": 8.921478795120877e-05, + "loss": 1.8708, + "step": 7711 + }, + { + "epoch": 2.367096378146102, + "grad_norm": 0.3211480379104614, + "learning_rate": 8.921170408619131e-05, + "loss": 1.8487, + "step": 7712 + }, + { + "epoch": 2.367403314917127, + "grad_norm": 0.2806071937084198, + "learning_rate": 8.920861983366083e-05, + "loss": 1.8325, + "step": 
7713 + }, + { + "epoch": 2.3677102516881523, + "grad_norm": 0.30703970789909363, + "learning_rate": 8.920553519364777e-05, + "loss": 1.8364, + "step": 7714 + }, + { + "epoch": 2.368017188459177, + "grad_norm": 0.30848923325538635, + "learning_rate": 8.920245016618263e-05, + "loss": 1.833, + "step": 7715 + }, + { + "epoch": 2.3683241252302025, + "grad_norm": 0.31656739115715027, + "learning_rate": 8.919936475129588e-05, + "loss": 1.8884, + "step": 7716 + }, + { + "epoch": 2.368631062001228, + "grad_norm": 0.2806589603424072, + "learning_rate": 8.919627894901806e-05, + "loss": 1.7779, + "step": 7717 + }, + { + "epoch": 2.3689379987722528, + "grad_norm": 0.2943432629108429, + "learning_rate": 8.919319275937962e-05, + "loss": 1.8741, + "step": 7718 + }, + { + "epoch": 2.369244935543278, + "grad_norm": 0.2870347499847412, + "learning_rate": 8.919010618241111e-05, + "loss": 1.8415, + "step": 7719 + }, + { + "epoch": 2.3695518723143034, + "grad_norm": 0.3224312663078308, + "learning_rate": 8.918701921814297e-05, + "loss": 1.8594, + "step": 7720 + }, + { + "epoch": 2.3698588090853283, + "grad_norm": 0.3007681369781494, + "learning_rate": 8.918393186660575e-05, + "loss": 1.878, + "step": 7721 + }, + { + "epoch": 2.3701657458563536, + "grad_norm": 0.3083780109882355, + "learning_rate": 8.918084412782994e-05, + "loss": 1.9088, + "step": 7722 + }, + { + "epoch": 2.370472682627379, + "grad_norm": 0.30599063634872437, + "learning_rate": 8.917775600184608e-05, + "loss": 1.8743, + "step": 7723 + }, + { + "epoch": 2.370779619398404, + "grad_norm": 0.33503273129463196, + "learning_rate": 8.917466748868466e-05, + "loss": 1.9048, + "step": 7724 + }, + { + "epoch": 2.371086556169429, + "grad_norm": 0.3861919343471527, + "learning_rate": 8.917157858837622e-05, + "loss": 1.9073, + "step": 7725 + }, + { + "epoch": 2.371393492940454, + "grad_norm": 0.395945280790329, + "learning_rate": 8.916848930095128e-05, + "loss": 1.8678, + "step": 7726 + }, + { + "epoch": 2.3717004297114794, + 
"grad_norm": 0.3657386600971222, + "learning_rate": 8.916539962644037e-05, + "loss": 1.9138, + "step": 7727 + }, + { + "epoch": 2.3720073664825048, + "grad_norm": 0.32392752170562744, + "learning_rate": 8.916230956487402e-05, + "loss": 1.803, + "step": 7728 + }, + { + "epoch": 2.3723143032535297, + "grad_norm": 0.406703382730484, + "learning_rate": 8.915921911628278e-05, + "loss": 1.9222, + "step": 7729 + }, + { + "epoch": 2.372621240024555, + "grad_norm": 0.4293023645877838, + "learning_rate": 8.915612828069718e-05, + "loss": 1.8874, + "step": 7730 + }, + { + "epoch": 2.37292817679558, + "grad_norm": 0.45155876874923706, + "learning_rate": 8.915303705814777e-05, + "loss": 1.9059, + "step": 7731 + }, + { + "epoch": 2.373235113566605, + "grad_norm": 0.35105881094932556, + "learning_rate": 8.91499454486651e-05, + "loss": 1.8387, + "step": 7732 + }, + { + "epoch": 2.3735420503376305, + "grad_norm": 0.3197930157184601, + "learning_rate": 8.914685345227973e-05, + "loss": 1.8174, + "step": 7733 + }, + { + "epoch": 2.3738489871086554, + "grad_norm": 0.3610389232635498, + "learning_rate": 8.91437610690222e-05, + "loss": 1.841, + "step": 7734 + }, + { + "epoch": 2.3741559238796808, + "grad_norm": 0.3696954548358917, + "learning_rate": 8.91406682989231e-05, + "loss": 1.8511, + "step": 7735 + }, + { + "epoch": 2.374462860650706, + "grad_norm": 0.3364555239677429, + "learning_rate": 8.913757514201295e-05, + "loss": 1.8382, + "step": 7736 + }, + { + "epoch": 2.374769797421731, + "grad_norm": 0.4600698947906494, + "learning_rate": 8.913448159832236e-05, + "loss": 1.8247, + "step": 7737 + }, + { + "epoch": 2.3750767341927563, + "grad_norm": 0.5877843499183655, + "learning_rate": 8.913138766788187e-05, + "loss": 1.8449, + "step": 7738 + }, + { + "epoch": 2.3753836709637817, + "grad_norm": 0.5380640029907227, + "learning_rate": 8.912829335072208e-05, + "loss": 1.8647, + "step": 7739 + }, + { + "epoch": 2.3756906077348066, + "grad_norm": 0.5100306272506714, + "learning_rate": 
8.912519864687357e-05, + "loss": 1.884, + "step": 7740 + }, + { + "epoch": 2.375997544505832, + "grad_norm": 0.48175910115242004, + "learning_rate": 8.91221035563669e-05, + "loss": 1.8378, + "step": 7741 + }, + { + "epoch": 2.376304481276857, + "grad_norm": 0.3296540081501007, + "learning_rate": 8.911900807923268e-05, + "loss": 1.8036, + "step": 7742 + }, + { + "epoch": 2.376611418047882, + "grad_norm": 0.32398131489753723, + "learning_rate": 8.911591221550149e-05, + "loss": 1.8415, + "step": 7743 + }, + { + "epoch": 2.3769183548189075, + "grad_norm": 0.33934786915779114, + "learning_rate": 8.911281596520393e-05, + "loss": 1.9002, + "step": 7744 + }, + { + "epoch": 2.3772252915899323, + "grad_norm": 0.33059465885162354, + "learning_rate": 8.91097193283706e-05, + "loss": 1.8194, + "step": 7745 + }, + { + "epoch": 2.3775322283609577, + "grad_norm": 0.2908796966075897, + "learning_rate": 8.91066223050321e-05, + "loss": 1.8272, + "step": 7746 + }, + { + "epoch": 2.3778391651319826, + "grad_norm": 0.31551963090896606, + "learning_rate": 8.910352489521904e-05, + "loss": 1.8717, + "step": 7747 + }, + { + "epoch": 2.378146101903008, + "grad_norm": 0.2886766493320465, + "learning_rate": 8.910042709896203e-05, + "loss": 1.8714, + "step": 7748 + }, + { + "epoch": 2.3784530386740332, + "grad_norm": 0.3288721740245819, + "learning_rate": 8.909732891629167e-05, + "loss": 1.9194, + "step": 7749 + }, + { + "epoch": 2.378759975445058, + "grad_norm": 0.42444637417793274, + "learning_rate": 8.90942303472386e-05, + "loss": 1.8871, + "step": 7750 + }, + { + "epoch": 2.3790669122160835, + "grad_norm": 0.3550770580768585, + "learning_rate": 8.909113139183343e-05, + "loss": 1.8639, + "step": 7751 + }, + { + "epoch": 2.379373848987109, + "grad_norm": 0.3291744589805603, + "learning_rate": 8.908803205010679e-05, + "loss": 1.8284, + "step": 7752 + }, + { + "epoch": 2.3796807857581337, + "grad_norm": 0.2803054451942444, + "learning_rate": 8.908493232208928e-05, + "loss": 1.8113, + "step": 
7753 + }, + { + "epoch": 2.379987722529159, + "grad_norm": 0.30959245562553406, + "learning_rate": 8.908183220781158e-05, + "loss": 1.8821, + "step": 7754 + }, + { + "epoch": 2.3802946593001844, + "grad_norm": 0.37838777899742126, + "learning_rate": 8.907873170730431e-05, + "loss": 1.8749, + "step": 7755 + }, + { + "epoch": 2.3806015960712092, + "grad_norm": 0.34625449776649475, + "learning_rate": 8.907563082059813e-05, + "loss": 1.8804, + "step": 7756 + }, + { + "epoch": 2.3809085328422346, + "grad_norm": 0.3966830372810364, + "learning_rate": 8.907252954772364e-05, + "loss": 1.9295, + "step": 7757 + }, + { + "epoch": 2.3812154696132595, + "grad_norm": 0.3144119679927826, + "learning_rate": 8.906942788871151e-05, + "loss": 1.8486, + "step": 7758 + }, + { + "epoch": 2.381522406384285, + "grad_norm": 0.3498438596725464, + "learning_rate": 8.90663258435924e-05, + "loss": 1.8813, + "step": 7759 + }, + { + "epoch": 2.38182934315531, + "grad_norm": 0.32803723216056824, + "learning_rate": 8.906322341239696e-05, + "loss": 1.8282, + "step": 7760 + }, + { + "epoch": 2.382136279926335, + "grad_norm": 0.28600773215293884, + "learning_rate": 8.906012059515585e-05, + "loss": 1.8319, + "step": 7761 + }, + { + "epoch": 2.3824432166973604, + "grad_norm": 0.2743505537509918, + "learning_rate": 8.905701739189973e-05, + "loss": 1.8198, + "step": 7762 + }, + { + "epoch": 2.3827501534683857, + "grad_norm": 0.3011966347694397, + "learning_rate": 8.905391380265929e-05, + "loss": 1.8476, + "step": 7763 + }, + { + "epoch": 2.3830570902394106, + "grad_norm": 0.3022943437099457, + "learning_rate": 8.905080982746516e-05, + "loss": 1.9037, + "step": 7764 + }, + { + "epoch": 2.383364027010436, + "grad_norm": 0.3333243727684021, + "learning_rate": 8.904770546634805e-05, + "loss": 1.8487, + "step": 7765 + }, + { + "epoch": 2.3836709637814613, + "grad_norm": 0.3773072361946106, + "learning_rate": 8.904460071933862e-05, + "loss": 1.8828, + "step": 7766 + }, + { + "epoch": 2.383977900552486, + 
"grad_norm": 0.4382041096687317, + "learning_rate": 8.904149558646756e-05, + "loss": 1.9069, + "step": 7767 + }, + { + "epoch": 2.3842848373235115, + "grad_norm": 0.3963650166988373, + "learning_rate": 8.903839006776557e-05, + "loss": 1.816, + "step": 7768 + }, + { + "epoch": 2.3845917740945364, + "grad_norm": 0.35340386629104614, + "learning_rate": 8.903528416326333e-05, + "loss": 1.8853, + "step": 7769 + }, + { + "epoch": 2.3848987108655617, + "grad_norm": 0.31519120931625366, + "learning_rate": 8.903217787299153e-05, + "loss": 1.8953, + "step": 7770 + }, + { + "epoch": 2.385205647636587, + "grad_norm": 0.41126203536987305, + "learning_rate": 8.902907119698088e-05, + "loss": 1.9494, + "step": 7771 + }, + { + "epoch": 2.385512584407612, + "grad_norm": 0.4488140344619751, + "learning_rate": 8.902596413526205e-05, + "loss": 1.8717, + "step": 7772 + }, + { + "epoch": 2.3858195211786373, + "grad_norm": 0.36129191517829895, + "learning_rate": 8.902285668786578e-05, + "loss": 1.8472, + "step": 7773 + }, + { + "epoch": 2.386126457949662, + "grad_norm": 0.3357439935207367, + "learning_rate": 8.901974885482277e-05, + "loss": 1.8143, + "step": 7774 + }, + { + "epoch": 2.3864333947206875, + "grad_norm": 0.2832469046115875, + "learning_rate": 8.901664063616372e-05, + "loss": 1.7952, + "step": 7775 + }, + { + "epoch": 2.386740331491713, + "grad_norm": 0.31065669655799866, + "learning_rate": 8.901353203191937e-05, + "loss": 1.8651, + "step": 7776 + }, + { + "epoch": 2.3870472682627377, + "grad_norm": 0.2985263764858246, + "learning_rate": 8.901042304212042e-05, + "loss": 1.8106, + "step": 7777 + }, + { + "epoch": 2.387354205033763, + "grad_norm": 0.31606364250183105, + "learning_rate": 8.900731366679761e-05, + "loss": 1.8831, + "step": 7778 + }, + { + "epoch": 2.3876611418047884, + "grad_norm": 0.33167949318885803, + "learning_rate": 8.900420390598166e-05, + "loss": 1.9494, + "step": 7779 + }, + { + "epoch": 2.3879680785758133, + "grad_norm": 0.32814472913742065, + 
"learning_rate": 8.900109375970333e-05, + "loss": 1.8654, + "step": 7780 + }, + { + "epoch": 2.3882750153468386, + "grad_norm": 0.35307401418685913, + "learning_rate": 8.899798322799331e-05, + "loss": 1.904, + "step": 7781 + }, + { + "epoch": 2.388581952117864, + "grad_norm": 0.3936740458011627, + "learning_rate": 8.899487231088236e-05, + "loss": 1.8404, + "step": 7782 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.3675380349159241, + "learning_rate": 8.899176100840124e-05, + "loss": 1.8689, + "step": 7783 + }, + { + "epoch": 2.389195825659914, + "grad_norm": 0.34065911173820496, + "learning_rate": 8.898864932058067e-05, + "loss": 1.8819, + "step": 7784 + }, + { + "epoch": 2.389502762430939, + "grad_norm": 0.31531861424446106, + "learning_rate": 8.898553724745142e-05, + "loss": 1.8379, + "step": 7785 + }, + { + "epoch": 2.3898096992019644, + "grad_norm": 0.33485177159309387, + "learning_rate": 8.898242478904424e-05, + "loss": 1.9206, + "step": 7786 + }, + { + "epoch": 2.3901166359729897, + "grad_norm": 0.33116385340690613, + "learning_rate": 8.897931194538989e-05, + "loss": 1.8744, + "step": 7787 + }, + { + "epoch": 2.3904235727440146, + "grad_norm": 0.33216002583503723, + "learning_rate": 8.897619871651915e-05, + "loss": 1.8794, + "step": 7788 + }, + { + "epoch": 2.39073050951504, + "grad_norm": 0.3246794641017914, + "learning_rate": 8.897308510246273e-05, + "loss": 1.8739, + "step": 7789 + }, + { + "epoch": 2.391037446286065, + "grad_norm": 0.3038793206214905, + "learning_rate": 8.896997110325146e-05, + "loss": 1.8314, + "step": 7790 + }, + { + "epoch": 2.39134438305709, + "grad_norm": 0.35726267099380493, + "learning_rate": 8.896685671891612e-05, + "loss": 1.8764, + "step": 7791 + }, + { + "epoch": 2.3916513198281155, + "grad_norm": 0.421522855758667, + "learning_rate": 8.896374194948744e-05, + "loss": 1.8215, + "step": 7792 + }, + { + "epoch": 2.3919582565991404, + "grad_norm": 0.4456072747707367, + "learning_rate": 8.896062679499621e-05, + "loss": 
1.9146, + "step": 7793 + }, + { + "epoch": 2.3922651933701657, + "grad_norm": 0.33498415350914, + "learning_rate": 8.895751125547325e-05, + "loss": 1.8372, + "step": 7794 + }, + { + "epoch": 2.392572130141191, + "grad_norm": 0.3279598355293274, + "learning_rate": 8.895439533094933e-05, + "loss": 1.8469, + "step": 7795 + }, + { + "epoch": 2.392879066912216, + "grad_norm": 0.4238305687904358, + "learning_rate": 8.895127902145524e-05, + "loss": 1.8259, + "step": 7796 + }, + { + "epoch": 2.3931860036832413, + "grad_norm": 0.473057359457016, + "learning_rate": 8.89481623270218e-05, + "loss": 1.8374, + "step": 7797 + }, + { + "epoch": 2.3934929404542666, + "grad_norm": 0.30914968252182007, + "learning_rate": 8.894504524767976e-05, + "loss": 1.7803, + "step": 7798 + }, + { + "epoch": 2.3937998772252915, + "grad_norm": 0.3433384597301483, + "learning_rate": 8.894192778345996e-05, + "loss": 1.8568, + "step": 7799 + }, + { + "epoch": 2.394106813996317, + "grad_norm": 0.4965706467628479, + "learning_rate": 8.893880993439323e-05, + "loss": 1.8576, + "step": 7800 + }, + { + "epoch": 2.3944137507673418, + "grad_norm": 0.4996519684791565, + "learning_rate": 8.893569170051032e-05, + "loss": 1.788, + "step": 7801 + }, + { + "epoch": 2.394720687538367, + "grad_norm": 0.31231364607810974, + "learning_rate": 8.893257308184212e-05, + "loss": 1.7846, + "step": 7802 + }, + { + "epoch": 2.3950276243093924, + "grad_norm": 0.32845574617385864, + "learning_rate": 8.89294540784194e-05, + "loss": 1.8811, + "step": 7803 + }, + { + "epoch": 2.3953345610804173, + "grad_norm": 0.525324285030365, + "learning_rate": 8.8926334690273e-05, + "loss": 1.8458, + "step": 7804 + }, + { + "epoch": 2.3956414978514426, + "grad_norm": 0.5107213854789734, + "learning_rate": 8.892321491743373e-05, + "loss": 1.8419, + "step": 7805 + }, + { + "epoch": 2.3959484346224675, + "grad_norm": 0.33831658959388733, + "learning_rate": 8.892009475993245e-05, + "loss": 1.811, + "step": 7806 + }, + { + "epoch": 
2.396255371393493, + "grad_norm": 0.3781357407569885, + "learning_rate": 8.891697421779999e-05, + "loss": 1.9385, + "step": 7807 + }, + { + "epoch": 2.396562308164518, + "grad_norm": 0.43507882952690125, + "learning_rate": 8.891385329106717e-05, + "loss": 1.7705, + "step": 7808 + }, + { + "epoch": 2.396869244935543, + "grad_norm": 0.45114290714263916, + "learning_rate": 8.891073197976483e-05, + "loss": 1.8661, + "step": 7809 + }, + { + "epoch": 2.3971761817065684, + "grad_norm": 0.29369547963142395, + "learning_rate": 8.890761028392385e-05, + "loss": 1.873, + "step": 7810 + }, + { + "epoch": 2.3974831184775938, + "grad_norm": 0.3268595337867737, + "learning_rate": 8.890448820357506e-05, + "loss": 1.8461, + "step": 7811 + }, + { + "epoch": 2.3977900552486187, + "grad_norm": 0.4514225423336029, + "learning_rate": 8.890136573874931e-05, + "loss": 1.8458, + "step": 7812 + }, + { + "epoch": 2.398096992019644, + "grad_norm": 0.5288760662078857, + "learning_rate": 8.889824288947745e-05, + "loss": 1.8301, + "step": 7813 + }, + { + "epoch": 2.3984039287906693, + "grad_norm": 0.46517884731292725, + "learning_rate": 8.889511965579038e-05, + "loss": 1.8769, + "step": 7814 + }, + { + "epoch": 2.398710865561694, + "grad_norm": 0.29907044768333435, + "learning_rate": 8.889199603771892e-05, + "loss": 1.7815, + "step": 7815 + }, + { + "epoch": 2.3990178023327196, + "grad_norm": 0.36091622710227966, + "learning_rate": 8.888887203529398e-05, + "loss": 1.8375, + "step": 7816 + }, + { + "epoch": 2.3993247391037444, + "grad_norm": 0.5604190230369568, + "learning_rate": 8.88857476485464e-05, + "loss": 1.9176, + "step": 7817 + }, + { + "epoch": 2.3996316758747698, + "grad_norm": 0.48299452662467957, + "learning_rate": 8.888262287750707e-05, + "loss": 1.8682, + "step": 7818 + }, + { + "epoch": 2.399938612645795, + "grad_norm": 0.32829394936561584, + "learning_rate": 8.887949772220687e-05, + "loss": 1.9143, + "step": 7819 + }, + { + "epoch": 2.40024554941682, + "grad_norm": 
0.401719868183136, + "learning_rate": 8.88763721826767e-05, + "loss": 1.8517, + "step": 7820 + }, + { + "epoch": 2.4005524861878453, + "grad_norm": 0.5205032825469971, + "learning_rate": 8.887324625894741e-05, + "loss": 1.811, + "step": 7821 + }, + { + "epoch": 2.4008594229588702, + "grad_norm": 0.3828800618648529, + "learning_rate": 8.887011995104993e-05, + "loss": 1.8042, + "step": 7822 + }, + { + "epoch": 2.4011663597298956, + "grad_norm": 0.31816062331199646, + "learning_rate": 8.886699325901514e-05, + "loss": 1.8998, + "step": 7823 + }, + { + "epoch": 2.401473296500921, + "grad_norm": 0.36172720789909363, + "learning_rate": 8.886386618287394e-05, + "loss": 1.8689, + "step": 7824 + }, + { + "epoch": 2.401780233271946, + "grad_norm": 0.3582005202770233, + "learning_rate": 8.886073872265725e-05, + "loss": 1.8565, + "step": 7825 + }, + { + "epoch": 2.402087170042971, + "grad_norm": 0.2915255129337311, + "learning_rate": 8.885761087839594e-05, + "loss": 1.8686, + "step": 7826 + }, + { + "epoch": 2.4023941068139965, + "grad_norm": 0.26619917154312134, + "learning_rate": 8.885448265012095e-05, + "loss": 1.7737, + "step": 7827 + }, + { + "epoch": 2.4027010435850213, + "grad_norm": 0.31685733795166016, + "learning_rate": 8.88513540378632e-05, + "loss": 1.9136, + "step": 7828 + }, + { + "epoch": 2.4030079803560467, + "grad_norm": 0.3427450954914093, + "learning_rate": 8.884822504165359e-05, + "loss": 1.8824, + "step": 7829 + }, + { + "epoch": 2.403314917127072, + "grad_norm": 0.3207513689994812, + "learning_rate": 8.884509566152306e-05, + "loss": 1.8332, + "step": 7830 + }, + { + "epoch": 2.403621853898097, + "grad_norm": 0.3301675319671631, + "learning_rate": 8.884196589750251e-05, + "loss": 1.9129, + "step": 7831 + }, + { + "epoch": 2.4039287906691222, + "grad_norm": 0.3232486844062805, + "learning_rate": 8.88388357496229e-05, + "loss": 1.8362, + "step": 7832 + }, + { + "epoch": 2.404235727440147, + "grad_norm": 0.3152230381965637, + "learning_rate": 
8.883570521791514e-05, + "loss": 1.8586, + "step": 7833 + }, + { + "epoch": 2.4045426642111725, + "grad_norm": 0.3204822540283203, + "learning_rate": 8.883257430241019e-05, + "loss": 1.842, + "step": 7834 + }, + { + "epoch": 2.404849600982198, + "grad_norm": 0.28253886103630066, + "learning_rate": 8.882944300313897e-05, + "loss": 1.8521, + "step": 7835 + }, + { + "epoch": 2.4051565377532227, + "grad_norm": 0.37631165981292725, + "learning_rate": 8.882631132013245e-05, + "loss": 1.8838, + "step": 7836 + }, + { + "epoch": 2.405463474524248, + "grad_norm": 0.3606031537055969, + "learning_rate": 8.882317925342157e-05, + "loss": 1.8452, + "step": 7837 + }, + { + "epoch": 2.4057704112952734, + "grad_norm": 0.33793914318084717, + "learning_rate": 8.882004680303726e-05, + "loss": 1.8866, + "step": 7838 + }, + { + "epoch": 2.4060773480662982, + "grad_norm": 0.2714223265647888, + "learning_rate": 8.881691396901048e-05, + "loss": 1.7953, + "step": 7839 + }, + { + "epoch": 2.4063842848373236, + "grad_norm": 0.3588239252567291, + "learning_rate": 8.881378075137224e-05, + "loss": 1.9679, + "step": 7840 + }, + { + "epoch": 2.406691221608349, + "grad_norm": 0.3266383707523346, + "learning_rate": 8.881064715015344e-05, + "loss": 1.8747, + "step": 7841 + }, + { + "epoch": 2.406998158379374, + "grad_norm": 0.3498428761959076, + "learning_rate": 8.88075131653851e-05, + "loss": 1.8882, + "step": 7842 + }, + { + "epoch": 2.407305095150399, + "grad_norm": 0.36646100878715515, + "learning_rate": 8.880437879709815e-05, + "loss": 1.8624, + "step": 7843 + }, + { + "epoch": 2.407612031921424, + "grad_norm": 0.36088457703590393, + "learning_rate": 8.88012440453236e-05, + "loss": 1.8527, + "step": 7844 + }, + { + "epoch": 2.4079189686924494, + "grad_norm": 0.3267477750778198, + "learning_rate": 8.87981089100924e-05, + "loss": 1.8374, + "step": 7845 + }, + { + "epoch": 2.4082259054634747, + "grad_norm": 0.3262403607368469, + "learning_rate": 8.879497339143556e-05, + "loss": 1.8752, + "step": 
7846 + }, + { + "epoch": 2.4085328422344996, + "grad_norm": 0.278877854347229, + "learning_rate": 8.879183748938405e-05, + "loss": 1.8056, + "step": 7847 + }, + { + "epoch": 2.408839779005525, + "grad_norm": 0.35509005188941956, + "learning_rate": 8.878870120396886e-05, + "loss": 1.8555, + "step": 7848 + }, + { + "epoch": 2.40914671577655, + "grad_norm": 0.3621126413345337, + "learning_rate": 8.8785564535221e-05, + "loss": 1.8084, + "step": 7849 + }, + { + "epoch": 2.409453652547575, + "grad_norm": 0.2772746682167053, + "learning_rate": 8.878242748317145e-05, + "loss": 1.8034, + "step": 7850 + }, + { + "epoch": 2.4097605893186005, + "grad_norm": 0.30938875675201416, + "learning_rate": 8.877929004785121e-05, + "loss": 1.8341, + "step": 7851 + }, + { + "epoch": 2.4100675260896254, + "grad_norm": 0.3349369764328003, + "learning_rate": 8.877615222929133e-05, + "loss": 1.8306, + "step": 7852 + }, + { + "epoch": 2.4103744628606507, + "grad_norm": 0.3109685778617859, + "learning_rate": 8.877301402752277e-05, + "loss": 1.7998, + "step": 7853 + }, + { + "epoch": 2.410681399631676, + "grad_norm": 0.3337927460670471, + "learning_rate": 8.876987544257655e-05, + "loss": 1.8766, + "step": 7854 + }, + { + "epoch": 2.410988336402701, + "grad_norm": 0.33891361951828003, + "learning_rate": 8.87667364744837e-05, + "loss": 1.8535, + "step": 7855 + }, + { + "epoch": 2.4112952731737263, + "grad_norm": 0.30946552753448486, + "learning_rate": 8.876359712327524e-05, + "loss": 1.8144, + "step": 7856 + }, + { + "epoch": 2.4116022099447516, + "grad_norm": 0.354981929063797, + "learning_rate": 8.87604573889822e-05, + "loss": 1.9253, + "step": 7857 + }, + { + "epoch": 2.4119091467157765, + "grad_norm": 0.42054516077041626, + "learning_rate": 8.875731727163559e-05, + "loss": 1.9122, + "step": 7858 + }, + { + "epoch": 2.412216083486802, + "grad_norm": 0.37435492873191833, + "learning_rate": 8.875417677126646e-05, + "loss": 1.8639, + "step": 7859 + }, + { + "epoch": 2.4125230202578267, + 
"grad_norm": 0.3742216229438782, + "learning_rate": 8.875103588790584e-05, + "loss": 1.8398, + "step": 7860 + }, + { + "epoch": 2.412829957028852, + "grad_norm": 0.3152104616165161, + "learning_rate": 8.874789462158478e-05, + "loss": 1.8078, + "step": 7861 + }, + { + "epoch": 2.4131368937998774, + "grad_norm": 0.32342761754989624, + "learning_rate": 8.87447529723343e-05, + "loss": 1.8632, + "step": 7862 + }, + { + "epoch": 2.4134438305709023, + "grad_norm": 0.31065210700035095, + "learning_rate": 8.874161094018547e-05, + "loss": 1.845, + "step": 7863 + }, + { + "epoch": 2.4137507673419276, + "grad_norm": 0.31379538774490356, + "learning_rate": 8.873846852516933e-05, + "loss": 1.8184, + "step": 7864 + }, + { + "epoch": 2.4140577041129525, + "grad_norm": 0.29058924317359924, + "learning_rate": 8.873532572731694e-05, + "loss": 1.8671, + "step": 7865 + }, + { + "epoch": 2.414364640883978, + "grad_norm": 0.3024691641330719, + "learning_rate": 8.873218254665936e-05, + "loss": 1.7977, + "step": 7866 + }, + { + "epoch": 2.414671577655003, + "grad_norm": 0.30356913805007935, + "learning_rate": 8.872903898322764e-05, + "loss": 1.8284, + "step": 7867 + }, + { + "epoch": 2.414978514426028, + "grad_norm": 0.29594334959983826, + "learning_rate": 8.872589503705287e-05, + "loss": 1.8651, + "step": 7868 + }, + { + "epoch": 2.4152854511970534, + "grad_norm": 0.2929564118385315, + "learning_rate": 8.872275070816612e-05, + "loss": 1.8671, + "step": 7869 + }, + { + "epoch": 2.4155923879680787, + "grad_norm": 0.30591902136802673, + "learning_rate": 8.871960599659842e-05, + "loss": 1.9341, + "step": 7870 + }, + { + "epoch": 2.4158993247391036, + "grad_norm": 0.3944799304008484, + "learning_rate": 8.87164609023809e-05, + "loss": 1.8947, + "step": 7871 + }, + { + "epoch": 2.416206261510129, + "grad_norm": 0.3568263351917267, + "learning_rate": 8.871331542554461e-05, + "loss": 1.8466, + "step": 7872 + }, + { + "epoch": 2.4165131982811543, + "grad_norm": 0.3182635009288788, + 
"learning_rate": 8.871016956612066e-05, + "loss": 1.8373, + "step": 7873 + }, + { + "epoch": 2.416820135052179, + "grad_norm": 0.31941649317741394, + "learning_rate": 8.870702332414012e-05, + "loss": 1.8356, + "step": 7874 + }, + { + "epoch": 2.4171270718232045, + "grad_norm": 0.3090899586677551, + "learning_rate": 8.870387669963407e-05, + "loss": 1.9308, + "step": 7875 + }, + { + "epoch": 2.4174340085942294, + "grad_norm": 0.3078390955924988, + "learning_rate": 8.870072969263364e-05, + "loss": 1.8521, + "step": 7876 + }, + { + "epoch": 2.4177409453652547, + "grad_norm": 0.29126885533332825, + "learning_rate": 8.869758230316992e-05, + "loss": 1.8091, + "step": 7877 + }, + { + "epoch": 2.41804788213628, + "grad_norm": 0.36473605036735535, + "learning_rate": 8.869443453127402e-05, + "loss": 1.8282, + "step": 7878 + }, + { + "epoch": 2.418354818907305, + "grad_norm": 0.3617660701274872, + "learning_rate": 8.869128637697702e-05, + "loss": 1.8843, + "step": 7879 + }, + { + "epoch": 2.4186617556783303, + "grad_norm": 0.33267220854759216, + "learning_rate": 8.868813784031005e-05, + "loss": 1.8647, + "step": 7880 + }, + { + "epoch": 2.418968692449355, + "grad_norm": 0.29990482330322266, + "learning_rate": 8.868498892130424e-05, + "loss": 1.7697, + "step": 7881 + }, + { + "epoch": 2.4192756292203805, + "grad_norm": 0.3618892431259155, + "learning_rate": 8.868183961999068e-05, + "loss": 1.7699, + "step": 7882 + }, + { + "epoch": 2.419582565991406, + "grad_norm": 0.29534587264060974, + "learning_rate": 8.867868993640051e-05, + "loss": 1.828, + "step": 7883 + }, + { + "epoch": 2.4198895027624308, + "grad_norm": 0.3086758255958557, + "learning_rate": 8.867553987056487e-05, + "loss": 1.8652, + "step": 7884 + }, + { + "epoch": 2.420196439533456, + "grad_norm": 0.3273947834968567, + "learning_rate": 8.867238942251487e-05, + "loss": 1.8553, + "step": 7885 + }, + { + "epoch": 2.4205033763044814, + "grad_norm": 0.3069070279598236, + "learning_rate": 8.866923859228165e-05, + "loss": 
1.8057, + "step": 7886 + }, + { + "epoch": 2.4208103130755063, + "grad_norm": 0.2884439527988434, + "learning_rate": 8.866608737989635e-05, + "loss": 1.8479, + "step": 7887 + }, + { + "epoch": 2.4211172498465316, + "grad_norm": 0.32123002409935, + "learning_rate": 8.866293578539011e-05, + "loss": 1.916, + "step": 7888 + }, + { + "epoch": 2.421424186617557, + "grad_norm": 0.285966157913208, + "learning_rate": 8.865978380879407e-05, + "loss": 1.834, + "step": 7889 + }, + { + "epoch": 2.421731123388582, + "grad_norm": 0.28088799118995667, + "learning_rate": 8.865663145013941e-05, + "loss": 1.7794, + "step": 7890 + }, + { + "epoch": 2.422038060159607, + "grad_norm": 0.31160372495651245, + "learning_rate": 8.865347870945724e-05, + "loss": 1.8584, + "step": 7891 + }, + { + "epoch": 2.422344996930632, + "grad_norm": 0.3121089041233063, + "learning_rate": 8.865032558677874e-05, + "loss": 1.8797, + "step": 7892 + }, + { + "epoch": 2.4226519337016574, + "grad_norm": 0.35856643319129944, + "learning_rate": 8.864717208213506e-05, + "loss": 1.8664, + "step": 7893 + }, + { + "epoch": 2.4229588704726828, + "grad_norm": 0.32826781272888184, + "learning_rate": 8.864401819555739e-05, + "loss": 1.8473, + "step": 7894 + }, + { + "epoch": 2.4232658072437077, + "grad_norm": 0.34450921416282654, + "learning_rate": 8.86408639270769e-05, + "loss": 1.918, + "step": 7895 + }, + { + "epoch": 2.423572744014733, + "grad_norm": 0.39621153473854065, + "learning_rate": 8.86377092767247e-05, + "loss": 1.9411, + "step": 7896 + }, + { + "epoch": 2.423879680785758, + "grad_norm": 0.3765166103839874, + "learning_rate": 8.863455424453204e-05, + "loss": 1.9003, + "step": 7897 + }, + { + "epoch": 2.424186617556783, + "grad_norm": 0.3942621946334839, + "learning_rate": 8.863139883053007e-05, + "loss": 1.9647, + "step": 7898 + }, + { + "epoch": 2.4244935543278086, + "grad_norm": 0.4255806803703308, + "learning_rate": 8.862824303474996e-05, + "loss": 1.9147, + "step": 7899 + }, + { + "epoch": 
2.424800491098834, + "grad_norm": 0.3993197977542877, + "learning_rate": 8.862508685722292e-05, + "loss": 1.8822, + "step": 7900 + }, + { + "epoch": 2.425107427869859, + "grad_norm": 0.3734201490879059, + "learning_rate": 8.862193029798013e-05, + "loss": 1.8745, + "step": 7901 + }, + { + "epoch": 2.425414364640884, + "grad_norm": 0.40955278277397156, + "learning_rate": 8.861877335705279e-05, + "loss": 1.877, + "step": 7902 + }, + { + "epoch": 2.425721301411909, + "grad_norm": 0.3975965678691864, + "learning_rate": 8.861561603447211e-05, + "loss": 1.868, + "step": 7903 + }, + { + "epoch": 2.4260282381829343, + "grad_norm": 0.30194091796875, + "learning_rate": 8.861245833026926e-05, + "loss": 1.7849, + "step": 7904 + }, + { + "epoch": 2.4263351749539597, + "grad_norm": 0.349930077791214, + "learning_rate": 8.860930024447547e-05, + "loss": 1.891, + "step": 7905 + }, + { + "epoch": 2.4266421117249846, + "grad_norm": 0.40644606947898865, + "learning_rate": 8.860614177712196e-05, + "loss": 1.8463, + "step": 7906 + }, + { + "epoch": 2.42694904849601, + "grad_norm": 0.3627426028251648, + "learning_rate": 8.86029829282399e-05, + "loss": 1.8518, + "step": 7907 + }, + { + "epoch": 2.427255985267035, + "grad_norm": 0.4019826054573059, + "learning_rate": 8.859982369786055e-05, + "loss": 1.7997, + "step": 7908 + }, + { + "epoch": 2.42756292203806, + "grad_norm": 0.375589519739151, + "learning_rate": 8.859666408601512e-05, + "loss": 1.9136, + "step": 7909 + }, + { + "epoch": 2.4278698588090855, + "grad_norm": 0.3135814070701599, + "learning_rate": 8.859350409273484e-05, + "loss": 1.8511, + "step": 7910 + }, + { + "epoch": 2.4281767955801103, + "grad_norm": 0.4534473717212677, + "learning_rate": 8.859034371805093e-05, + "loss": 1.9827, + "step": 7911 + }, + { + "epoch": 2.4284837323511357, + "grad_norm": 0.5559772849082947, + "learning_rate": 8.858718296199462e-05, + "loss": 1.8578, + "step": 7912 + }, + { + "epoch": 2.428790669122161, + "grad_norm": 0.4518011212348938, + 
"learning_rate": 8.858402182459715e-05, + "loss": 1.8374, + "step": 7913 + }, + { + "epoch": 2.429097605893186, + "grad_norm": 0.31662946939468384, + "learning_rate": 8.858086030588977e-05, + "loss": 1.8356, + "step": 7914 + }, + { + "epoch": 2.4294045426642112, + "grad_norm": 0.4660717844963074, + "learning_rate": 8.857769840590371e-05, + "loss": 1.7977, + "step": 7915 + }, + { + "epoch": 2.4297114794352366, + "grad_norm": 0.5611162185668945, + "learning_rate": 8.857453612467022e-05, + "loss": 1.8423, + "step": 7916 + }, + { + "epoch": 2.4300184162062615, + "grad_norm": 0.5055921077728271, + "learning_rate": 8.857137346222056e-05, + "loss": 1.8595, + "step": 7917 + }, + { + "epoch": 2.430325352977287, + "grad_norm": 0.3589123487472534, + "learning_rate": 8.856821041858597e-05, + "loss": 1.776, + "step": 7918 + }, + { + "epoch": 2.4306322897483117, + "grad_norm": 0.36849313974380493, + "learning_rate": 8.856504699379773e-05, + "loss": 1.8695, + "step": 7919 + }, + { + "epoch": 2.430939226519337, + "grad_norm": 0.47566625475883484, + "learning_rate": 8.856188318788709e-05, + "loss": 1.8578, + "step": 7920 + }, + { + "epoch": 2.4312461632903624, + "grad_norm": 0.554790735244751, + "learning_rate": 8.855871900088532e-05, + "loss": 1.8406, + "step": 7921 + }, + { + "epoch": 2.4315531000613873, + "grad_norm": 0.4846283197402954, + "learning_rate": 8.855555443282369e-05, + "loss": 1.8475, + "step": 7922 + }, + { + "epoch": 2.4318600368324126, + "grad_norm": 0.35256531834602356, + "learning_rate": 8.855238948373346e-05, + "loss": 1.8594, + "step": 7923 + }, + { + "epoch": 2.4321669736034375, + "grad_norm": 0.3713412880897522, + "learning_rate": 8.854922415364593e-05, + "loss": 1.893, + "step": 7924 + }, + { + "epoch": 2.432473910374463, + "grad_norm": 0.4289644658565521, + "learning_rate": 8.854605844259237e-05, + "loss": 1.8958, + "step": 7925 + }, + { + "epoch": 2.432780847145488, + "grad_norm": 0.4209578335285187, + "learning_rate": 8.854289235060406e-05, + "loss": 
1.8419, + "step": 7926 + }, + { + "epoch": 2.433087783916513, + "grad_norm": 0.41226091980934143, + "learning_rate": 8.853972587771232e-05, + "loss": 1.958, + "step": 7927 + }, + { + "epoch": 2.4333947206875384, + "grad_norm": 0.36133915185928345, + "learning_rate": 8.853655902394841e-05, + "loss": 1.9181, + "step": 7928 + }, + { + "epoch": 2.4337016574585637, + "grad_norm": 0.44178202748298645, + "learning_rate": 8.853339178934363e-05, + "loss": 1.9242, + "step": 7929 + }, + { + "epoch": 2.4340085942295886, + "grad_norm": 0.4537523686885834, + "learning_rate": 8.853022417392929e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 2.434315531000614, + "grad_norm": 0.3214915990829468, + "learning_rate": 8.852705617773669e-05, + "loss": 1.8549, + "step": 7931 + }, + { + "epoch": 2.4346224677716393, + "grad_norm": 0.4621930420398712, + "learning_rate": 8.852388780079714e-05, + "loss": 1.8705, + "step": 7932 + }, + { + "epoch": 2.434929404542664, + "grad_norm": 0.52337646484375, + "learning_rate": 8.852071904314196e-05, + "loss": 1.8381, + "step": 7933 + }, + { + "epoch": 2.4352363413136895, + "grad_norm": 0.3846060633659363, + "learning_rate": 8.851754990480246e-05, + "loss": 1.828, + "step": 7934 + }, + { + "epoch": 2.4355432780847144, + "grad_norm": 0.34233763813972473, + "learning_rate": 8.851438038580994e-05, + "loss": 1.924, + "step": 7935 + }, + { + "epoch": 2.4358502148557397, + "grad_norm": 0.39583292603492737, + "learning_rate": 8.851121048619574e-05, + "loss": 1.8383, + "step": 7936 + }, + { + "epoch": 2.436157151626765, + "grad_norm": 0.3715476393699646, + "learning_rate": 8.850804020599119e-05, + "loss": 1.9251, + "step": 7937 + }, + { + "epoch": 2.43646408839779, + "grad_norm": 0.32089582085609436, + "learning_rate": 8.850486954522762e-05, + "loss": 1.9317, + "step": 7938 + }, + { + "epoch": 2.4367710251688153, + "grad_norm": 0.46823611855506897, + "learning_rate": 8.850169850393634e-05, + "loss": 1.9743, + "step": 7939 + }, + { + "epoch": 
2.43707796193984, + "grad_norm": 0.405205637216568, + "learning_rate": 8.849852708214874e-05, + "loss": 1.8772, + "step": 7940 + }, + { + "epoch": 2.4373848987108655, + "grad_norm": 0.33672770857810974, + "learning_rate": 8.849535527989612e-05, + "loss": 1.8767, + "step": 7941 + }, + { + "epoch": 2.437691835481891, + "grad_norm": 0.38022953271865845, + "learning_rate": 8.849218309720983e-05, + "loss": 1.8882, + "step": 7942 + }, + { + "epoch": 2.4379987722529157, + "grad_norm": 0.4224186837673187, + "learning_rate": 8.848901053412124e-05, + "loss": 1.9016, + "step": 7943 + }, + { + "epoch": 2.438305709023941, + "grad_norm": 0.3890904486179352, + "learning_rate": 8.848583759066167e-05, + "loss": 1.8761, + "step": 7944 + }, + { + "epoch": 2.4386126457949664, + "grad_norm": 0.3747030794620514, + "learning_rate": 8.84826642668625e-05, + "loss": 1.8576, + "step": 7945 + }, + { + "epoch": 2.4389195825659913, + "grad_norm": 0.3317604959011078, + "learning_rate": 8.84794905627551e-05, + "loss": 1.9249, + "step": 7946 + }, + { + "epoch": 2.4392265193370166, + "grad_norm": 0.3294972777366638, + "learning_rate": 8.84763164783708e-05, + "loss": 1.8308, + "step": 7947 + }, + { + "epoch": 2.439533456108042, + "grad_norm": 0.42031124234199524, + "learning_rate": 8.847314201374101e-05, + "loss": 1.7884, + "step": 7948 + }, + { + "epoch": 2.439840392879067, + "grad_norm": 0.4018419682979584, + "learning_rate": 8.846996716889708e-05, + "loss": 1.8334, + "step": 7949 + }, + { + "epoch": 2.440147329650092, + "grad_norm": 0.39541858434677124, + "learning_rate": 8.846679194387036e-05, + "loss": 1.888, + "step": 7950 + }, + { + "epoch": 2.440454266421117, + "grad_norm": 0.34641456604003906, + "learning_rate": 8.846361633869228e-05, + "loss": 1.8521, + "step": 7951 + }, + { + "epoch": 2.4407612031921424, + "grad_norm": 0.42987826466560364, + "learning_rate": 8.846044035339419e-05, + "loss": 1.8789, + "step": 7952 + }, + { + "epoch": 2.4410681399631677, + "grad_norm": 0.3651089072227478, + 
"learning_rate": 8.845726398800749e-05, + "loss": 1.9024, + "step": 7953 + }, + { + "epoch": 2.4413750767341926, + "grad_norm": 0.3024137616157532, + "learning_rate": 8.845408724256356e-05, + "loss": 1.7773, + "step": 7954 + }, + { + "epoch": 2.441682013505218, + "grad_norm": 0.32426944375038147, + "learning_rate": 8.845091011709381e-05, + "loss": 1.7873, + "step": 7955 + }, + { + "epoch": 2.441988950276243, + "grad_norm": 0.34448274970054626, + "learning_rate": 8.844773261162962e-05, + "loss": 1.8854, + "step": 7956 + }, + { + "epoch": 2.442295887047268, + "grad_norm": 0.2942068874835968, + "learning_rate": 8.844455472620241e-05, + "loss": 1.8186, + "step": 7957 + }, + { + "epoch": 2.4426028238182935, + "grad_norm": 0.3849888741970062, + "learning_rate": 8.844137646084358e-05, + "loss": 1.905, + "step": 7958 + }, + { + "epoch": 2.4429097605893184, + "grad_norm": 0.44277897477149963, + "learning_rate": 8.843819781558452e-05, + "loss": 1.8836, + "step": 7959 + }, + { + "epoch": 2.4432166973603437, + "grad_norm": 0.34470248222351074, + "learning_rate": 8.843501879045667e-05, + "loss": 1.9368, + "step": 7960 + }, + { + "epoch": 2.443523634131369, + "grad_norm": 0.29713204503059387, + "learning_rate": 8.843183938549145e-05, + "loss": 1.8562, + "step": 7961 + }, + { + "epoch": 2.443830570902394, + "grad_norm": 0.370623379945755, + "learning_rate": 8.842865960072025e-05, + "loss": 1.8501, + "step": 7962 + }, + { + "epoch": 2.4441375076734193, + "grad_norm": 0.38828277587890625, + "learning_rate": 8.842547943617453e-05, + "loss": 1.884, + "step": 7963 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.294223427772522, + "learning_rate": 8.842229889188566e-05, + "loss": 1.857, + "step": 7964 + }, + { + "epoch": 2.4447513812154695, + "grad_norm": 0.31901589035987854, + "learning_rate": 8.841911796788516e-05, + "loss": 1.8675, + "step": 7965 + }, + { + "epoch": 2.445058317986495, + "grad_norm": 0.3586447834968567, + "learning_rate": 8.84159366642044e-05, + "loss": 
1.86, + "step": 7966 + }, + { + "epoch": 2.4453652547575198, + "grad_norm": 0.30848199129104614, + "learning_rate": 8.841275498087482e-05, + "loss": 1.8153, + "step": 7967 + }, + { + "epoch": 2.445672191528545, + "grad_norm": 0.2694801688194275, + "learning_rate": 8.84095729179279e-05, + "loss": 1.7702, + "step": 7968 + }, + { + "epoch": 2.4459791282995704, + "grad_norm": 0.3068044185638428, + "learning_rate": 8.840639047539507e-05, + "loss": 1.8531, + "step": 7969 + }, + { + "epoch": 2.4462860650705953, + "grad_norm": 0.32885125279426575, + "learning_rate": 8.840320765330776e-05, + "loss": 1.9194, + "step": 7970 + }, + { + "epoch": 2.4465930018416207, + "grad_norm": 0.2949635088443756, + "learning_rate": 8.840002445169746e-05, + "loss": 1.8427, + "step": 7971 + }, + { + "epoch": 2.446899938612646, + "grad_norm": 0.27281275391578674, + "learning_rate": 8.83968408705956e-05, + "loss": 1.8279, + "step": 7972 + }, + { + "epoch": 2.447206875383671, + "grad_norm": 0.3038519620895386, + "learning_rate": 8.839365691003367e-05, + "loss": 1.8629, + "step": 7973 + }, + { + "epoch": 2.447513812154696, + "grad_norm": 0.28468266129493713, + "learning_rate": 8.839047257004311e-05, + "loss": 1.8765, + "step": 7974 + }, + { + "epoch": 2.4478207489257215, + "grad_norm": 0.29807159304618835, + "learning_rate": 8.83872878506554e-05, + "loss": 1.8152, + "step": 7975 + }, + { + "epoch": 2.4481276856967464, + "grad_norm": 0.3005301356315613, + "learning_rate": 8.838410275190201e-05, + "loss": 1.8577, + "step": 7976 + }, + { + "epoch": 2.4484346224677718, + "grad_norm": 0.3068598806858063, + "learning_rate": 8.838091727381442e-05, + "loss": 1.863, + "step": 7977 + }, + { + "epoch": 2.4487415592387967, + "grad_norm": 0.33748000860214233, + "learning_rate": 8.837773141642411e-05, + "loss": 1.7889, + "step": 7978 + }, + { + "epoch": 2.449048496009822, + "grad_norm": 0.344417542219162, + "learning_rate": 8.837454517976256e-05, + "loss": 1.9167, + "step": 7979 + }, + { + "epoch": 
2.4493554327808473, + "grad_norm": 0.29128298163414, + "learning_rate": 8.837135856386127e-05, + "loss": 1.8246, + "step": 7980 + }, + { + "epoch": 2.449662369551872, + "grad_norm": 0.27023759484291077, + "learning_rate": 8.836817156875172e-05, + "loss": 1.8493, + "step": 7981 + }, + { + "epoch": 2.4499693063228976, + "grad_norm": 0.2792586088180542, + "learning_rate": 8.836498419446541e-05, + "loss": 1.8739, + "step": 7982 + }, + { + "epoch": 2.4502762430939224, + "grad_norm": 0.2715211510658264, + "learning_rate": 8.836179644103384e-05, + "loss": 1.8218, + "step": 7983 + }, + { + "epoch": 2.450583179864948, + "grad_norm": 0.273576557636261, + "learning_rate": 8.835860830848851e-05, + "loss": 1.9063, + "step": 7984 + }, + { + "epoch": 2.450890116635973, + "grad_norm": 0.2992589473724365, + "learning_rate": 8.835541979686093e-05, + "loss": 1.8799, + "step": 7985 + }, + { + "epoch": 2.451197053406998, + "grad_norm": 0.3231843411922455, + "learning_rate": 8.835223090618263e-05, + "loss": 1.8956, + "step": 7986 + }, + { + "epoch": 2.4515039901780233, + "grad_norm": 0.31108468770980835, + "learning_rate": 8.834904163648508e-05, + "loss": 1.8371, + "step": 7987 + }, + { + "epoch": 2.4518109269490487, + "grad_norm": 0.26657021045684814, + "learning_rate": 8.834585198779983e-05, + "loss": 1.8384, + "step": 7988 + }, + { + "epoch": 2.4521178637200736, + "grad_norm": 0.32093849778175354, + "learning_rate": 8.83426619601584e-05, + "loss": 1.8603, + "step": 7989 + }, + { + "epoch": 2.452424800491099, + "grad_norm": 0.32942765951156616, + "learning_rate": 8.833947155359231e-05, + "loss": 1.8306, + "step": 7990 + }, + { + "epoch": 2.4527317372621242, + "grad_norm": 0.31677374243736267, + "learning_rate": 8.83362807681331e-05, + "loss": 1.8339, + "step": 7991 + }, + { + "epoch": 2.453038674033149, + "grad_norm": 0.2739655673503876, + "learning_rate": 8.833308960381228e-05, + "loss": 1.8514, + "step": 7992 + }, + { + "epoch": 2.4533456108041745, + "grad_norm": 0.3194214105606079, 
+ "learning_rate": 8.83298980606614e-05, + "loss": 1.8413, + "step": 7993 + }, + { + "epoch": 2.4536525475751993, + "grad_norm": 0.3346202075481415, + "learning_rate": 8.832670613871202e-05, + "loss": 1.8558, + "step": 7994 + }, + { + "epoch": 2.4539594843462247, + "grad_norm": 0.3400736451148987, + "learning_rate": 8.832351383799565e-05, + "loss": 1.8668, + "step": 7995 + }, + { + "epoch": 2.45426642111725, + "grad_norm": 0.2807479202747345, + "learning_rate": 8.832032115854385e-05, + "loss": 1.8361, + "step": 7996 + }, + { + "epoch": 2.454573357888275, + "grad_norm": 0.2977379262447357, + "learning_rate": 8.831712810038817e-05, + "loss": 1.84, + "step": 7997 + }, + { + "epoch": 2.4548802946593002, + "grad_norm": 0.3242948353290558, + "learning_rate": 8.831393466356019e-05, + "loss": 1.9421, + "step": 7998 + }, + { + "epoch": 2.455187231430325, + "grad_norm": 0.3289327025413513, + "learning_rate": 8.831074084809144e-05, + "loss": 1.9348, + "step": 7999 + }, + { + "epoch": 2.4554941682013505, + "grad_norm": 0.3378387987613678, + "learning_rate": 8.830754665401351e-05, + "loss": 1.7871, + "step": 8000 + }, + { + "epoch": 2.455801104972376, + "grad_norm": 0.29627665877342224, + "learning_rate": 8.830435208135794e-05, + "loss": 1.815, + "step": 8001 + }, + { + "epoch": 2.4561080417434007, + "grad_norm": 0.3509432375431061, + "learning_rate": 8.83011571301563e-05, + "loss": 1.9209, + "step": 8002 + }, + { + "epoch": 2.456414978514426, + "grad_norm": 0.3272305130958557, + "learning_rate": 8.829796180044019e-05, + "loss": 1.8437, + "step": 8003 + }, + { + "epoch": 2.4567219152854514, + "grad_norm": 0.33997493982315063, + "learning_rate": 8.829476609224119e-05, + "loss": 1.8827, + "step": 8004 + }, + { + "epoch": 2.4570288520564763, + "grad_norm": 0.30387789011001587, + "learning_rate": 8.829157000559084e-05, + "loss": 1.8427, + "step": 8005 + }, + { + "epoch": 2.4573357888275016, + "grad_norm": 0.30266425013542175, + "learning_rate": 8.828837354052075e-05, + "loss": 
1.8274, + "step": 8006 + }, + { + "epoch": 2.457642725598527, + "grad_norm": 0.365546315908432, + "learning_rate": 8.828517669706254e-05, + "loss": 1.8455, + "step": 8007 + }, + { + "epoch": 2.457949662369552, + "grad_norm": 0.339226633310318, + "learning_rate": 8.828197947524774e-05, + "loss": 1.8665, + "step": 8008 + }, + { + "epoch": 2.458256599140577, + "grad_norm": 0.31167346239089966, + "learning_rate": 8.8278781875108e-05, + "loss": 1.7807, + "step": 8009 + }, + { + "epoch": 2.458563535911602, + "grad_norm": 0.2788028120994568, + "learning_rate": 8.82755838966749e-05, + "loss": 1.8834, + "step": 8010 + }, + { + "epoch": 2.4588704726826274, + "grad_norm": 0.34648752212524414, + "learning_rate": 8.827238553998005e-05, + "loss": 1.8981, + "step": 8011 + }, + { + "epoch": 2.4591774094536527, + "grad_norm": 0.3169974982738495, + "learning_rate": 8.826918680505504e-05, + "loss": 1.81, + "step": 8012 + }, + { + "epoch": 2.4594843462246776, + "grad_norm": 0.46924272179603577, + "learning_rate": 8.826598769193151e-05, + "loss": 1.9016, + "step": 8013 + }, + { + "epoch": 2.459791282995703, + "grad_norm": 0.38437098264694214, + "learning_rate": 8.826278820064106e-05, + "loss": 1.8924, + "step": 8014 + }, + { + "epoch": 2.460098219766728, + "grad_norm": 0.3350604474544525, + "learning_rate": 8.82595883312153e-05, + "loss": 1.8591, + "step": 8015 + }, + { + "epoch": 2.460405156537753, + "grad_norm": 0.3053742051124573, + "learning_rate": 8.825638808368588e-05, + "loss": 1.8114, + "step": 8016 + }, + { + "epoch": 2.4607120933087785, + "grad_norm": 0.29566875100135803, + "learning_rate": 8.82531874580844e-05, + "loss": 1.8055, + "step": 8017 + }, + { + "epoch": 2.4610190300798034, + "grad_norm": 0.3057360053062439, + "learning_rate": 8.824998645444249e-05, + "loss": 1.8268, + "step": 8018 + }, + { + "epoch": 2.4613259668508287, + "grad_norm": 0.27333348989486694, + "learning_rate": 8.82467850727918e-05, + "loss": 1.7876, + "step": 8019 + }, + { + "epoch": 
2.461632903621854, + "grad_norm": 0.29202890396118164, + "learning_rate": 8.824358331316398e-05, + "loss": 1.8488, + "step": 8020 + }, + { + "epoch": 2.461939840392879, + "grad_norm": 0.3640623986721039, + "learning_rate": 8.824038117559064e-05, + "loss": 1.9665, + "step": 8021 + }, + { + "epoch": 2.4622467771639043, + "grad_norm": 0.35411131381988525, + "learning_rate": 8.823717866010344e-05, + "loss": 1.8561, + "step": 8022 + }, + { + "epoch": 2.4625537139349296, + "grad_norm": 0.3695240020751953, + "learning_rate": 8.823397576673403e-05, + "loss": 1.8489, + "step": 8023 + }, + { + "epoch": 2.4628606507059545, + "grad_norm": 0.36554715037345886, + "learning_rate": 8.823077249551406e-05, + "loss": 1.8523, + "step": 8024 + }, + { + "epoch": 2.46316758747698, + "grad_norm": 0.2982638478279114, + "learning_rate": 8.822756884647521e-05, + "loss": 1.8006, + "step": 8025 + }, + { + "epoch": 2.4634745242480047, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.822436481964909e-05, + "loss": 1.8695, + "step": 8026 + }, + { + "epoch": 2.46378146101903, + "grad_norm": 0.46769842505455017, + "learning_rate": 8.82211604150674e-05, + "loss": 1.8509, + "step": 8027 + }, + { + "epoch": 2.4640883977900554, + "grad_norm": 0.5327584743499756, + "learning_rate": 8.82179556327618e-05, + "loss": 1.8642, + "step": 8028 + }, + { + "epoch": 2.4643953345610803, + "grad_norm": 0.5302795767784119, + "learning_rate": 8.821475047276398e-05, + "loss": 1.8645, + "step": 8029 + }, + { + "epoch": 2.4647022713321056, + "grad_norm": 0.43549028038978577, + "learning_rate": 8.821154493510557e-05, + "loss": 1.9193, + "step": 8030 + }, + { + "epoch": 2.4650092081031305, + "grad_norm": 0.3013847768306732, + "learning_rate": 8.82083390198183e-05, + "loss": 1.7819, + "step": 8031 + }, + { + "epoch": 2.465316144874156, + "grad_norm": 0.422325074672699, + "learning_rate": 8.820513272693383e-05, + "loss": 1.9307, + "step": 8032 + }, + { + "epoch": 2.465623081645181, + "grad_norm": 0.4823217988014221, + 
"learning_rate": 8.820192605648383e-05, + "loss": 1.8681, + "step": 8033 + }, + { + "epoch": 2.465930018416206, + "grad_norm": 0.3938382863998413, + "learning_rate": 8.819871900850001e-05, + "loss": 1.8483, + "step": 8034 + }, + { + "epoch": 2.4662369551872314, + "grad_norm": 0.30860164761543274, + "learning_rate": 8.819551158301406e-05, + "loss": 1.8818, + "step": 8035 + }, + { + "epoch": 2.4665438919582567, + "grad_norm": 0.3715503215789795, + "learning_rate": 8.819230378005767e-05, + "loss": 1.8443, + "step": 8036 + }, + { + "epoch": 2.4668508287292816, + "grad_norm": 0.4750272333621979, + "learning_rate": 8.818909559966255e-05, + "loss": 1.8379, + "step": 8037 + }, + { + "epoch": 2.467157765500307, + "grad_norm": 0.4794345796108246, + "learning_rate": 8.818588704186041e-05, + "loss": 1.8585, + "step": 8038 + }, + { + "epoch": 2.4674647022713323, + "grad_norm": 0.33470577001571655, + "learning_rate": 8.818267810668296e-05, + "loss": 1.8231, + "step": 8039 + }, + { + "epoch": 2.467771639042357, + "grad_norm": 0.31480371952056885, + "learning_rate": 8.817946879416191e-05, + "loss": 1.867, + "step": 8040 + }, + { + "epoch": 2.4680785758133825, + "grad_norm": 0.41635531187057495, + "learning_rate": 8.817625910432897e-05, + "loss": 1.9385, + "step": 8041 + }, + { + "epoch": 2.4683855125844074, + "grad_norm": 0.4570399522781372, + "learning_rate": 8.817304903721584e-05, + "loss": 1.7855, + "step": 8042 + }, + { + "epoch": 2.4686924493554327, + "grad_norm": 0.36506229639053345, + "learning_rate": 8.816983859285429e-05, + "loss": 1.808, + "step": 8043 + }, + { + "epoch": 2.468999386126458, + "grad_norm": 0.2650545537471771, + "learning_rate": 8.8166627771276e-05, + "loss": 1.8271, + "step": 8044 + }, + { + "epoch": 2.469306322897483, + "grad_norm": 0.3143758475780487, + "learning_rate": 8.816341657251272e-05, + "loss": 1.9016, + "step": 8045 + }, + { + "epoch": 2.4696132596685083, + "grad_norm": 0.3015407621860504, + "learning_rate": 8.81602049965962e-05, + "loss": 
1.8357, + "step": 8046 + }, + { + "epoch": 2.4699201964395336, + "grad_norm": 0.26860085129737854, + "learning_rate": 8.815699304355819e-05, + "loss": 1.8223, + "step": 8047 + }, + { + "epoch": 2.4702271332105585, + "grad_norm": 0.2852436602115631, + "learning_rate": 8.81537807134304e-05, + "loss": 1.8298, + "step": 8048 + }, + { + "epoch": 2.470534069981584, + "grad_norm": 0.29519692063331604, + "learning_rate": 8.815056800624457e-05, + "loss": 1.863, + "step": 8049 + }, + { + "epoch": 2.470841006752609, + "grad_norm": 0.3163367807865143, + "learning_rate": 8.814735492203247e-05, + "loss": 1.878, + "step": 8050 + }, + { + "epoch": 2.471147943523634, + "grad_norm": 0.2955954968929291, + "learning_rate": 8.814414146082586e-05, + "loss": 1.8657, + "step": 8051 + }, + { + "epoch": 2.4714548802946594, + "grad_norm": 0.2773810029029846, + "learning_rate": 8.814092762265648e-05, + "loss": 1.7626, + "step": 8052 + }, + { + "epoch": 2.4717618170656843, + "grad_norm": 0.33908557891845703, + "learning_rate": 8.813771340755609e-05, + "loss": 1.8902, + "step": 8053 + }, + { + "epoch": 2.4720687538367097, + "grad_norm": 0.3083830773830414, + "learning_rate": 8.81344988155565e-05, + "loss": 1.876, + "step": 8054 + }, + { + "epoch": 2.472375690607735, + "grad_norm": 0.29082754254341125, + "learning_rate": 8.81312838466894e-05, + "loss": 1.8637, + "step": 8055 + }, + { + "epoch": 2.47268262737876, + "grad_norm": 0.3240490257740021, + "learning_rate": 8.81280685009866e-05, + "loss": 1.9096, + "step": 8056 + }, + { + "epoch": 2.472989564149785, + "grad_norm": 0.364561527967453, + "learning_rate": 8.812485277847991e-05, + "loss": 1.9361, + "step": 8057 + }, + { + "epoch": 2.47329650092081, + "grad_norm": 0.3420087695121765, + "learning_rate": 8.812163667920107e-05, + "loss": 1.9014, + "step": 8058 + }, + { + "epoch": 2.4736034376918354, + "grad_norm": 0.3346010148525238, + "learning_rate": 8.811842020318186e-05, + "loss": 1.9195, + "step": 8059 + }, + { + "epoch": 2.4739103744628608, 
+ "grad_norm": 0.2990448772907257, + "learning_rate": 8.811520335045409e-05, + "loss": 1.8866, + "step": 8060 + }, + { + "epoch": 2.4742173112338857, + "grad_norm": 0.3047022223472595, + "learning_rate": 8.811198612104953e-05, + "loss": 1.8226, + "step": 8061 + }, + { + "epoch": 2.474524248004911, + "grad_norm": 0.300020307302475, + "learning_rate": 8.8108768515e-05, + "loss": 1.8496, + "step": 8062 + }, + { + "epoch": 2.4748311847759363, + "grad_norm": 0.31999605894088745, + "learning_rate": 8.810555053233729e-05, + "loss": 1.7853, + "step": 8063 + }, + { + "epoch": 2.4751381215469612, + "grad_norm": 0.3136597275733948, + "learning_rate": 8.810233217309318e-05, + "loss": 1.9317, + "step": 8064 + }, + { + "epoch": 2.4754450583179866, + "grad_norm": 0.3373543322086334, + "learning_rate": 8.809911343729948e-05, + "loss": 1.7827, + "step": 8065 + }, + { + "epoch": 2.475751995089012, + "grad_norm": 0.33876341581344604, + "learning_rate": 8.809589432498804e-05, + "loss": 1.8803, + "step": 8066 + }, + { + "epoch": 2.476058931860037, + "grad_norm": 0.3455486297607422, + "learning_rate": 8.809267483619061e-05, + "loss": 1.8987, + "step": 8067 + }, + { + "epoch": 2.476365868631062, + "grad_norm": 0.34245389699935913, + "learning_rate": 8.808945497093907e-05, + "loss": 1.8948, + "step": 8068 + }, + { + "epoch": 2.476672805402087, + "grad_norm": 0.3200787901878357, + "learning_rate": 8.808623472926521e-05, + "loss": 1.8234, + "step": 8069 + }, + { + "epoch": 2.4769797421731123, + "grad_norm": 0.3244795799255371, + "learning_rate": 8.808301411120083e-05, + "loss": 1.8974, + "step": 8070 + }, + { + "epoch": 2.4772866789441377, + "grad_norm": 0.30235809087753296, + "learning_rate": 8.80797931167778e-05, + "loss": 1.8461, + "step": 8071 + }, + { + "epoch": 2.4775936157151626, + "grad_norm": 0.3719651997089386, + "learning_rate": 8.807657174602792e-05, + "loss": 1.9717, + "step": 8072 + }, + { + "epoch": 2.477900552486188, + "grad_norm": 0.3349135220050812, + "learning_rate": 
8.807334999898307e-05, + "loss": 1.9, + "step": 8073 + }, + { + "epoch": 2.478207489257213, + "grad_norm": 0.28822100162506104, + "learning_rate": 8.807012787567503e-05, + "loss": 1.7606, + "step": 8074 + }, + { + "epoch": 2.478514426028238, + "grad_norm": 0.33698850870132446, + "learning_rate": 8.806690537613568e-05, + "loss": 1.8909, + "step": 8075 + }, + { + "epoch": 2.4788213627992635, + "grad_norm": 0.35167089104652405, + "learning_rate": 8.806368250039687e-05, + "loss": 1.8529, + "step": 8076 + }, + { + "epoch": 2.4791282995702884, + "grad_norm": 0.3142544627189636, + "learning_rate": 8.806045924849044e-05, + "loss": 1.8169, + "step": 8077 + }, + { + "epoch": 2.4794352363413137, + "grad_norm": 0.3489094078540802, + "learning_rate": 8.805723562044824e-05, + "loss": 1.8822, + "step": 8078 + }, + { + "epoch": 2.479742173112339, + "grad_norm": 0.33814284205436707, + "learning_rate": 8.805401161630214e-05, + "loss": 1.7982, + "step": 8079 + }, + { + "epoch": 2.480049109883364, + "grad_norm": 0.26772376894950867, + "learning_rate": 8.805078723608398e-05, + "loss": 1.8354, + "step": 8080 + }, + { + "epoch": 2.4803560466543892, + "grad_norm": 0.3259965777397156, + "learning_rate": 8.804756247982563e-05, + "loss": 1.8292, + "step": 8081 + }, + { + "epoch": 2.4806629834254146, + "grad_norm": 0.32701683044433594, + "learning_rate": 8.804433734755899e-05, + "loss": 1.8339, + "step": 8082 + }, + { + "epoch": 2.4809699201964395, + "grad_norm": 0.3180190324783325, + "learning_rate": 8.804111183931589e-05, + "loss": 1.8839, + "step": 8083 + }, + { + "epoch": 2.481276856967465, + "grad_norm": 0.3318104147911072, + "learning_rate": 8.803788595512824e-05, + "loss": 1.9024, + "step": 8084 + }, + { + "epoch": 2.4815837937384897, + "grad_norm": 0.3849479854106903, + "learning_rate": 8.80346596950279e-05, + "loss": 1.8497, + "step": 8085 + }, + { + "epoch": 2.481890730509515, + "grad_norm": 0.48812124133110046, + "learning_rate": 8.803143305904676e-05, + "loss": 1.799, + "step": 
8086 + }, + { + "epoch": 2.4821976672805404, + "grad_norm": 0.4957241415977478, + "learning_rate": 8.802820604721671e-05, + "loss": 1.8842, + "step": 8087 + }, + { + "epoch": 2.4825046040515653, + "grad_norm": 0.4011611342430115, + "learning_rate": 8.802497865956964e-05, + "loss": 1.8354, + "step": 8088 + }, + { + "epoch": 2.4828115408225906, + "grad_norm": 0.3676159679889679, + "learning_rate": 8.802175089613744e-05, + "loss": 1.8564, + "step": 8089 + }, + { + "epoch": 2.4831184775936155, + "grad_norm": 0.30699628591537476, + "learning_rate": 8.801852275695202e-05, + "loss": 1.8403, + "step": 8090 + }, + { + "epoch": 2.483425414364641, + "grad_norm": 0.4100657105445862, + "learning_rate": 8.801529424204527e-05, + "loss": 1.7885, + "step": 8091 + }, + { + "epoch": 2.483732351135666, + "grad_norm": 0.30880647897720337, + "learning_rate": 8.801206535144909e-05, + "loss": 1.8682, + "step": 8092 + }, + { + "epoch": 2.484039287906691, + "grad_norm": 0.2775783836841583, + "learning_rate": 8.800883608519541e-05, + "loss": 1.8179, + "step": 8093 + }, + { + "epoch": 2.4843462246777164, + "grad_norm": 0.3048902451992035, + "learning_rate": 8.800560644331613e-05, + "loss": 1.8799, + "step": 8094 + }, + { + "epoch": 2.4846531614487417, + "grad_norm": 0.30332526564598083, + "learning_rate": 8.800237642584318e-05, + "loss": 1.8892, + "step": 8095 + }, + { + "epoch": 2.4849600982197666, + "grad_norm": 0.27216237783432007, + "learning_rate": 8.799914603280847e-05, + "loss": 1.7896, + "step": 8096 + }, + { + "epoch": 2.485267034990792, + "grad_norm": 0.28771117329597473, + "learning_rate": 8.799591526424393e-05, + "loss": 1.8593, + "step": 8097 + }, + { + "epoch": 2.4855739717618173, + "grad_norm": 0.2986912429332733, + "learning_rate": 8.799268412018146e-05, + "loss": 1.8205, + "step": 8098 + }, + { + "epoch": 2.485880908532842, + "grad_norm": 0.3072153925895691, + "learning_rate": 8.798945260065306e-05, + "loss": 1.841, + "step": 8099 + }, + { + "epoch": 2.4861878453038675, + 
"grad_norm": 0.33869001269340515, + "learning_rate": 8.798622070569059e-05, + "loss": 1.8353, + "step": 8100 + }, + { + "epoch": 2.4864947820748924, + "grad_norm": 0.3075481951236725, + "learning_rate": 8.798298843532605e-05, + "loss": 1.8824, + "step": 8101 + }, + { + "epoch": 2.4868017188459177, + "grad_norm": 0.2758934795856476, + "learning_rate": 8.797975578959132e-05, + "loss": 1.8068, + "step": 8102 + }, + { + "epoch": 2.487108655616943, + "grad_norm": 0.3065447211265564, + "learning_rate": 8.79765227685184e-05, + "loss": 1.8661, + "step": 8103 + }, + { + "epoch": 2.487415592387968, + "grad_norm": 0.34466415643692017, + "learning_rate": 8.797328937213923e-05, + "loss": 1.8579, + "step": 8104 + }, + { + "epoch": 2.4877225291589933, + "grad_norm": 0.4202970862388611, + "learning_rate": 8.797005560048575e-05, + "loss": 1.8526, + "step": 8105 + }, + { + "epoch": 2.488029465930018, + "grad_norm": 0.35885924100875854, + "learning_rate": 8.796682145358991e-05, + "loss": 1.8194, + "step": 8106 + }, + { + "epoch": 2.4883364027010435, + "grad_norm": 0.3208492696285248, + "learning_rate": 8.796358693148372e-05, + "loss": 1.8379, + "step": 8107 + }, + { + "epoch": 2.488643339472069, + "grad_norm": 0.26514047384262085, + "learning_rate": 8.79603520341991e-05, + "loss": 1.7978, + "step": 8108 + }, + { + "epoch": 2.4889502762430937, + "grad_norm": 0.34550225734710693, + "learning_rate": 8.795711676176803e-05, + "loss": 1.8771, + "step": 8109 + }, + { + "epoch": 2.489257213014119, + "grad_norm": 0.3016511797904968, + "learning_rate": 8.795388111422248e-05, + "loss": 1.8184, + "step": 8110 + }, + { + "epoch": 2.4895641497851444, + "grad_norm": 0.34824177622795105, + "learning_rate": 8.795064509159444e-05, + "loss": 1.8486, + "step": 8111 + }, + { + "epoch": 2.4898710865561693, + "grad_norm": 0.341482013463974, + "learning_rate": 8.794740869391587e-05, + "loss": 1.7872, + "step": 8112 + }, + { + "epoch": 2.4901780233271946, + "grad_norm": 0.3366520404815674, + "learning_rate": 
8.794417192121878e-05, + "loss": 1.838, + "step": 8113 + }, + { + "epoch": 2.49048496009822, + "grad_norm": 0.3168759047985077, + "learning_rate": 8.794093477353514e-05, + "loss": 1.8195, + "step": 8114 + }, + { + "epoch": 2.490791896869245, + "grad_norm": 0.36757516860961914, + "learning_rate": 8.793769725089693e-05, + "loss": 1.8825, + "step": 8115 + }, + { + "epoch": 2.49109883364027, + "grad_norm": 0.3936297297477722, + "learning_rate": 8.793445935333617e-05, + "loss": 1.855, + "step": 8116 + }, + { + "epoch": 2.491405770411295, + "grad_norm": 0.31962448358535767, + "learning_rate": 8.793122108088485e-05, + "loss": 1.8307, + "step": 8117 + }, + { + "epoch": 2.4917127071823204, + "grad_norm": 0.3082095980644226, + "learning_rate": 8.792798243357499e-05, + "loss": 1.8204, + "step": 8118 + }, + { + "epoch": 2.4920196439533457, + "grad_norm": 0.4574470520019531, + "learning_rate": 8.792474341143855e-05, + "loss": 1.8989, + "step": 8119 + }, + { + "epoch": 2.4923265807243706, + "grad_norm": 0.4596022367477417, + "learning_rate": 8.792150401450757e-05, + "loss": 1.8773, + "step": 8120 + }, + { + "epoch": 2.492633517495396, + "grad_norm": 0.32090309262275696, + "learning_rate": 8.791826424281407e-05, + "loss": 1.8621, + "step": 8121 + }, + { + "epoch": 2.4929404542664213, + "grad_norm": 0.3492026925086975, + "learning_rate": 8.791502409639006e-05, + "loss": 1.8887, + "step": 8122 + }, + { + "epoch": 2.493247391037446, + "grad_norm": 0.39859771728515625, + "learning_rate": 8.791178357526754e-05, + "loss": 1.8326, + "step": 8123 + }, + { + "epoch": 2.4935543278084715, + "grad_norm": 0.40439239144325256, + "learning_rate": 8.790854267947857e-05, + "loss": 1.8716, + "step": 8124 + }, + { + "epoch": 2.493861264579497, + "grad_norm": 0.4004671573638916, + "learning_rate": 8.790530140905515e-05, + "loss": 1.8253, + "step": 8125 + }, + { + "epoch": 2.4941682013505218, + "grad_norm": 0.31446993350982666, + "learning_rate": 8.790205976402934e-05, + "loss": 1.8356, + "step": 
8126 + }, + { + "epoch": 2.494475138121547, + "grad_norm": 0.3069862723350525, + "learning_rate": 8.789881774443315e-05, + "loss": 1.8532, + "step": 8127 + }, + { + "epoch": 2.494782074892572, + "grad_norm": 0.3192054033279419, + "learning_rate": 8.789557535029864e-05, + "loss": 1.7991, + "step": 8128 + }, + { + "epoch": 2.4950890116635973, + "grad_norm": 0.30979350209236145, + "learning_rate": 8.789233258165783e-05, + "loss": 1.8874, + "step": 8129 + }, + { + "epoch": 2.4953959484346226, + "grad_norm": 0.3193976879119873, + "learning_rate": 8.788908943854279e-05, + "loss": 1.8218, + "step": 8130 + }, + { + "epoch": 2.4957028852056475, + "grad_norm": 0.3120083808898926, + "learning_rate": 8.788584592098557e-05, + "loss": 1.9542, + "step": 8131 + }, + { + "epoch": 2.496009821976673, + "grad_norm": 0.36913001537323, + "learning_rate": 8.788260202901819e-05, + "loss": 1.8543, + "step": 8132 + }, + { + "epoch": 2.4963167587476978, + "grad_norm": 0.40216776728630066, + "learning_rate": 8.787935776267275e-05, + "loss": 1.8645, + "step": 8133 + }, + { + "epoch": 2.496623695518723, + "grad_norm": 0.3553076684474945, + "learning_rate": 8.78761131219813e-05, + "loss": 1.8881, + "step": 8134 + }, + { + "epoch": 2.4969306322897484, + "grad_norm": 0.2926538288593292, + "learning_rate": 8.787286810697589e-05, + "loss": 1.8419, + "step": 8135 + }, + { + "epoch": 2.4972375690607733, + "grad_norm": 0.3412233293056488, + "learning_rate": 8.78696227176886e-05, + "loss": 1.8766, + "step": 8136 + }, + { + "epoch": 2.4975445058317987, + "grad_norm": 0.30935296416282654, + "learning_rate": 8.78663769541515e-05, + "loss": 1.8002, + "step": 8137 + }, + { + "epoch": 2.497851442602824, + "grad_norm": 0.31171828508377075, + "learning_rate": 8.786313081639666e-05, + "loss": 1.7795, + "step": 8138 + }, + { + "epoch": 2.498158379373849, + "grad_norm": 0.2874031364917755, + "learning_rate": 8.785988430445619e-05, + "loss": 1.8508, + "step": 8139 + }, + { + "epoch": 2.498465316144874, + 
"grad_norm": 0.3126043379306793, + "learning_rate": 8.785663741836215e-05, + "loss": 1.8328, + "step": 8140 + }, + { + "epoch": 2.4987722529158995, + "grad_norm": 0.32581454515457153, + "learning_rate": 8.785339015814662e-05, + "loss": 1.8333, + "step": 8141 + }, + { + "epoch": 2.4990791896869244, + "grad_norm": 0.329745888710022, + "learning_rate": 8.78501425238417e-05, + "loss": 1.8257, + "step": 8142 + }, + { + "epoch": 2.4993861264579498, + "grad_norm": 0.29101938009262085, + "learning_rate": 8.78468945154795e-05, + "loss": 1.8472, + "step": 8143 + }, + { + "epoch": 2.4996930632289747, + "grad_norm": 0.3123742341995239, + "learning_rate": 8.784364613309208e-05, + "loss": 1.9226, + "step": 8144 + }, + { + "epoch": 2.5, + "grad_norm": 0.3330230116844177, + "learning_rate": 8.784039737671159e-05, + "loss": 1.8768, + "step": 8145 + }, + { + "epoch": 2.5003069367710253, + "grad_norm": 0.3147718012332916, + "learning_rate": 8.783714824637011e-05, + "loss": 1.853, + "step": 8146 + }, + { + "epoch": 2.5006138735420502, + "grad_norm": 0.34790241718292236, + "learning_rate": 8.783389874209977e-05, + "loss": 1.8328, + "step": 8147 + }, + { + "epoch": 2.5009208103130756, + "grad_norm": 0.29425308108329773, + "learning_rate": 8.783064886393264e-05, + "loss": 1.8487, + "step": 8148 + }, + { + "epoch": 2.5012277470841005, + "grad_norm": 0.30555078387260437, + "learning_rate": 8.782739861190088e-05, + "loss": 1.8588, + "step": 8149 + }, + { + "epoch": 2.501534683855126, + "grad_norm": 0.29712429642677307, + "learning_rate": 8.78241479860366e-05, + "loss": 1.8056, + "step": 8150 + }, + { + "epoch": 2.501841620626151, + "grad_norm": 0.32512977719306946, + "learning_rate": 8.782089698637191e-05, + "loss": 1.9099, + "step": 8151 + }, + { + "epoch": 2.5021485573971765, + "grad_norm": 0.3660493493080139, + "learning_rate": 8.781764561293895e-05, + "loss": 1.905, + "step": 8152 + }, + { + "epoch": 2.5024554941682013, + "grad_norm": 0.33591583371162415, + "learning_rate": 
8.781439386576984e-05, + "loss": 1.8353, + "step": 8153 + }, + { + "epoch": 2.5027624309392267, + "grad_norm": 0.3774370551109314, + "learning_rate": 8.781114174489673e-05, + "loss": 1.8626, + "step": 8154 + }, + { + "epoch": 2.5030693677102516, + "grad_norm": 0.3628109097480774, + "learning_rate": 8.780788925035178e-05, + "loss": 1.8549, + "step": 8155 + }, + { + "epoch": 2.503376304481277, + "grad_norm": 0.3089732825756073, + "learning_rate": 8.78046363821671e-05, + "loss": 1.835, + "step": 8156 + }, + { + "epoch": 2.5036832412523022, + "grad_norm": 0.3630690574645996, + "learning_rate": 8.780138314037482e-05, + "loss": 1.8308, + "step": 8157 + }, + { + "epoch": 2.503990178023327, + "grad_norm": 0.3658130466938019, + "learning_rate": 8.779812952500714e-05, + "loss": 1.8484, + "step": 8158 + }, + { + "epoch": 2.5042971147943525, + "grad_norm": 0.38401272892951965, + "learning_rate": 8.779487553609617e-05, + "loss": 1.8408, + "step": 8159 + }, + { + "epoch": 2.5046040515653774, + "grad_norm": 0.354514479637146, + "learning_rate": 8.77916211736741e-05, + "loss": 1.8491, + "step": 8160 + }, + { + "epoch": 2.5049109883364027, + "grad_norm": 0.3604681193828583, + "learning_rate": 8.778836643777309e-05, + "loss": 1.8887, + "step": 8161 + }, + { + "epoch": 2.505217925107428, + "grad_norm": 0.3155761957168579, + "learning_rate": 8.778511132842528e-05, + "loss": 1.8066, + "step": 8162 + }, + { + "epoch": 2.505524861878453, + "grad_norm": 0.35986092686653137, + "learning_rate": 8.778185584566286e-05, + "loss": 1.8348, + "step": 8163 + }, + { + "epoch": 2.5058317986494782, + "grad_norm": 0.558273434638977, + "learning_rate": 8.777859998951799e-05, + "loss": 1.9118, + "step": 8164 + }, + { + "epoch": 2.506138735420503, + "grad_norm": 0.6520169377326965, + "learning_rate": 8.777534376002285e-05, + "loss": 1.8747, + "step": 8165 + }, + { + "epoch": 2.5064456721915285, + "grad_norm": 0.5059971213340759, + "learning_rate": 8.777208715720963e-05, + "loss": 1.8218, + "step": 8166 + 
}, + { + "epoch": 2.506752608962554, + "grad_norm": 0.2873745560646057, + "learning_rate": 8.77688301811105e-05, + "loss": 1.8266, + "step": 8167 + }, + { + "epoch": 2.507059545733579, + "grad_norm": 0.4212021827697754, + "learning_rate": 8.776557283175765e-05, + "loss": 1.8553, + "step": 8168 + }, + { + "epoch": 2.507366482504604, + "grad_norm": 0.49324098229408264, + "learning_rate": 8.776231510918328e-05, + "loss": 1.8625, + "step": 8169 + }, + { + "epoch": 2.5076734192756294, + "grad_norm": 0.4414234459400177, + "learning_rate": 8.775905701341959e-05, + "loss": 1.7956, + "step": 8170 + }, + { + "epoch": 2.5079803560466543, + "grad_norm": 0.2691541612148285, + "learning_rate": 8.775579854449876e-05, + "loss": 1.8216, + "step": 8171 + }, + { + "epoch": 2.5082872928176796, + "grad_norm": 0.3366323411464691, + "learning_rate": 8.775253970245299e-05, + "loss": 1.8738, + "step": 8172 + }, + { + "epoch": 2.508594229588705, + "grad_norm": 0.49541351199150085, + "learning_rate": 8.77492804873145e-05, + "loss": 1.8281, + "step": 8173 + }, + { + "epoch": 2.50890116635973, + "grad_norm": 0.584227442741394, + "learning_rate": 8.774602089911548e-05, + "loss": 1.8248, + "step": 8174 + }, + { + "epoch": 2.509208103130755, + "grad_norm": 0.4493597149848938, + "learning_rate": 8.774276093788818e-05, + "loss": 1.8624, + "step": 8175 + }, + { + "epoch": 2.50951503990178, + "grad_norm": 0.29684513807296753, + "learning_rate": 8.77395006036648e-05, + "loss": 1.7806, + "step": 8176 + }, + { + "epoch": 2.5098219766728054, + "grad_norm": 0.38788866996765137, + "learning_rate": 8.773623989647754e-05, + "loss": 1.8334, + "step": 8177 + }, + { + "epoch": 2.5101289134438307, + "grad_norm": 0.44810980558395386, + "learning_rate": 8.773297881635865e-05, + "loss": 1.823, + "step": 8178 + }, + { + "epoch": 2.5104358502148556, + "grad_norm": 0.39918363094329834, + "learning_rate": 8.772971736334032e-05, + "loss": 1.8535, + "step": 8179 + }, + { + "epoch": 2.510742786985881, + "grad_norm": 
0.3454466462135315, + "learning_rate": 8.772645553745484e-05, + "loss": 1.8532, + "step": 8180 + }, + { + "epoch": 2.511049723756906, + "grad_norm": 0.3523466885089874, + "learning_rate": 8.77231933387344e-05, + "loss": 1.8402, + "step": 8181 + }, + { + "epoch": 2.511356660527931, + "grad_norm": 0.41947969794273376, + "learning_rate": 8.771993076721126e-05, + "loss": 1.8509, + "step": 8182 + }, + { + "epoch": 2.5116635972989565, + "grad_norm": 0.43224433064460754, + "learning_rate": 8.771666782291765e-05, + "loss": 1.858, + "step": 8183 + }, + { + "epoch": 2.511970534069982, + "grad_norm": 0.3467538058757782, + "learning_rate": 8.771340450588584e-05, + "loss": 1.8528, + "step": 8184 + }, + { + "epoch": 2.5122774708410067, + "grad_norm": 0.33712685108184814, + "learning_rate": 8.771014081614803e-05, + "loss": 1.8741, + "step": 8185 + }, + { + "epoch": 2.512584407612032, + "grad_norm": 0.4289829134941101, + "learning_rate": 8.770687675373652e-05, + "loss": 1.8252, + "step": 8186 + }, + { + "epoch": 2.512891344383057, + "grad_norm": 0.4774068295955658, + "learning_rate": 8.770361231868356e-05, + "loss": 1.8285, + "step": 8187 + }, + { + "epoch": 2.5131982811540823, + "grad_norm": 0.3455580472946167, + "learning_rate": 8.77003475110214e-05, + "loss": 1.8025, + "step": 8188 + }, + { + "epoch": 2.5135052179251076, + "grad_norm": 0.3050900399684906, + "learning_rate": 8.769708233078231e-05, + "loss": 1.8764, + "step": 8189 + }, + { + "epoch": 2.5138121546961325, + "grad_norm": 0.42384061217308044, + "learning_rate": 8.769381677799855e-05, + "loss": 1.8937, + "step": 8190 + }, + { + "epoch": 2.514119091467158, + "grad_norm": 0.4084749221801758, + "learning_rate": 8.76905508527024e-05, + "loss": 1.8124, + "step": 8191 + }, + { + "epoch": 2.5144260282381827, + "grad_norm": 0.38785848021507263, + "learning_rate": 8.768728455492615e-05, + "loss": 1.8731, + "step": 8192 + }, + { + "epoch": 2.514732965009208, + "grad_norm": 0.28196588158607483, + "learning_rate": 
8.768401788470206e-05, + "loss": 1.809, + "step": 8193 + }, + { + "epoch": 2.5150399017802334, + "grad_norm": 0.3551066815853119, + "learning_rate": 8.76807508420624e-05, + "loss": 1.8955, + "step": 8194 + }, + { + "epoch": 2.5153468385512583, + "grad_norm": 0.4327031373977661, + "learning_rate": 8.76774834270395e-05, + "loss": 1.8651, + "step": 8195 + }, + { + "epoch": 2.5156537753222836, + "grad_norm": 0.3748793303966522, + "learning_rate": 8.76742156396656e-05, + "loss": 1.8158, + "step": 8196 + }, + { + "epoch": 2.5159607120933085, + "grad_norm": 0.32504430413246155, + "learning_rate": 8.767094747997304e-05, + "loss": 1.8598, + "step": 8197 + }, + { + "epoch": 2.516267648864334, + "grad_norm": 0.3639826476573944, + "learning_rate": 8.76676789479941e-05, + "loss": 1.8829, + "step": 8198 + }, + { + "epoch": 2.516574585635359, + "grad_norm": 0.36793577671051025, + "learning_rate": 8.766441004376106e-05, + "loss": 1.8215, + "step": 8199 + }, + { + "epoch": 2.5168815224063845, + "grad_norm": 0.3245735466480255, + "learning_rate": 8.766114076730624e-05, + "loss": 1.8309, + "step": 8200 + }, + { + "epoch": 2.5171884591774094, + "grad_norm": 0.3022485673427582, + "learning_rate": 8.765787111866198e-05, + "loss": 1.8286, + "step": 8201 + }, + { + "epoch": 2.5174953959484347, + "grad_norm": 0.40962809324264526, + "learning_rate": 8.765460109786056e-05, + "loss": 1.8032, + "step": 8202 + }, + { + "epoch": 2.5178023327194596, + "grad_norm": 0.4123937487602234, + "learning_rate": 8.765133070493428e-05, + "loss": 1.9311, + "step": 8203 + }, + { + "epoch": 2.518109269490485, + "grad_norm": 0.30352556705474854, + "learning_rate": 8.764805993991551e-05, + "loss": 1.8197, + "step": 8204 + }, + { + "epoch": 2.5184162062615103, + "grad_norm": 0.3201169967651367, + "learning_rate": 8.764478880283653e-05, + "loss": 1.9355, + "step": 8205 + }, + { + "epoch": 2.518723143032535, + "grad_norm": 0.36343297362327576, + "learning_rate": 8.764151729372969e-05, + "loss": 1.9201, + "step": 
8206 + }, + { + "epoch": 2.5190300798035605, + "grad_norm": 0.3273618817329407, + "learning_rate": 8.763824541262729e-05, + "loss": 1.8195, + "step": 8207 + }, + { + "epoch": 2.5193370165745854, + "grad_norm": 0.30200251936912537, + "learning_rate": 8.76349731595617e-05, + "loss": 1.8094, + "step": 8208 + }, + { + "epoch": 2.5196439533456108, + "grad_norm": 0.3177770674228668, + "learning_rate": 8.763170053456527e-05, + "loss": 1.8519, + "step": 8209 + }, + { + "epoch": 2.519950890116636, + "grad_norm": 0.3206307291984558, + "learning_rate": 8.762842753767031e-05, + "loss": 1.8496, + "step": 8210 + }, + { + "epoch": 2.520257826887661, + "grad_norm": 0.31902456283569336, + "learning_rate": 8.762515416890915e-05, + "loss": 1.9069, + "step": 8211 + }, + { + "epoch": 2.5205647636586863, + "grad_norm": 0.3088377118110657, + "learning_rate": 8.762188042831419e-05, + "loss": 1.8482, + "step": 8212 + }, + { + "epoch": 2.520871700429711, + "grad_norm": 0.3046402931213379, + "learning_rate": 8.761860631591773e-05, + "loss": 1.8241, + "step": 8213 + }, + { + "epoch": 2.5211786372007365, + "grad_norm": 0.291831910610199, + "learning_rate": 8.761533183175217e-05, + "loss": 1.846, + "step": 8214 + }, + { + "epoch": 2.521485573971762, + "grad_norm": 0.3514893054962158, + "learning_rate": 8.761205697584986e-05, + "loss": 1.9, + "step": 8215 + }, + { + "epoch": 2.521792510742787, + "grad_norm": 0.31843090057373047, + "learning_rate": 8.760878174824316e-05, + "loss": 1.78, + "step": 8216 + }, + { + "epoch": 2.522099447513812, + "grad_norm": 0.30090904235839844, + "learning_rate": 8.760550614896443e-05, + "loss": 1.8718, + "step": 8217 + }, + { + "epoch": 2.5224063842848374, + "grad_norm": 0.38502126932144165, + "learning_rate": 8.760223017804604e-05, + "loss": 1.8772, + "step": 8218 + }, + { + "epoch": 2.5227133210558623, + "grad_norm": 0.30862319469451904, + "learning_rate": 8.759895383552037e-05, + "loss": 1.8532, + "step": 8219 + }, + { + "epoch": 2.5230202578268877, + 
"grad_norm": 0.36331596970558167, + "learning_rate": 8.759567712141981e-05, + "loss": 1.8587, + "step": 8220 + }, + { + "epoch": 2.523327194597913, + "grad_norm": 0.3370853662490845, + "learning_rate": 8.759240003577673e-05, + "loss": 1.8065, + "step": 8221 + }, + { + "epoch": 2.523634131368938, + "grad_norm": 0.3047318160533905, + "learning_rate": 8.758912257862351e-05, + "loss": 1.8783, + "step": 8222 + }, + { + "epoch": 2.523941068139963, + "grad_norm": 0.3172069787979126, + "learning_rate": 8.758584474999257e-05, + "loss": 1.7844, + "step": 8223 + }, + { + "epoch": 2.524248004910988, + "grad_norm": 0.3063897490501404, + "learning_rate": 8.758256654991626e-05, + "loss": 1.8642, + "step": 8224 + }, + { + "epoch": 2.5245549416820134, + "grad_norm": 0.2535867393016815, + "learning_rate": 8.757928797842702e-05, + "loss": 1.7784, + "step": 8225 + }, + { + "epoch": 2.5248618784530388, + "grad_norm": 0.27732348442077637, + "learning_rate": 8.757600903555722e-05, + "loss": 1.8223, + "step": 8226 + }, + { + "epoch": 2.525168815224064, + "grad_norm": 0.29819566011428833, + "learning_rate": 8.757272972133927e-05, + "loss": 1.8237, + "step": 8227 + }, + { + "epoch": 2.525475751995089, + "grad_norm": 0.26726382970809937, + "learning_rate": 8.756945003580559e-05, + "loss": 1.8134, + "step": 8228 + }, + { + "epoch": 2.5257826887661143, + "grad_norm": 0.2845614552497864, + "learning_rate": 8.756616997898859e-05, + "loss": 1.8757, + "step": 8229 + }, + { + "epoch": 2.5260896255371392, + "grad_norm": 0.33399102091789246, + "learning_rate": 8.756288955092066e-05, + "loss": 1.9036, + "step": 8230 + }, + { + "epoch": 2.5263965623081646, + "grad_norm": 0.3839001953601837, + "learning_rate": 8.755960875163426e-05, + "loss": 1.8205, + "step": 8231 + }, + { + "epoch": 2.52670349907919, + "grad_norm": 0.3703761696815491, + "learning_rate": 8.75563275811618e-05, + "loss": 1.768, + "step": 8232 + }, + { + "epoch": 2.527010435850215, + "grad_norm": 0.3083760440349579, + "learning_rate": 
8.755304603953568e-05, + "loss": 1.8621, + "step": 8233 + }, + { + "epoch": 2.52731737262124, + "grad_norm": 0.2995334267616272, + "learning_rate": 8.754976412678833e-05, + "loss": 1.8246, + "step": 8234 + }, + { + "epoch": 2.527624309392265, + "grad_norm": 0.3482929766178131, + "learning_rate": 8.754648184295222e-05, + "loss": 1.7982, + "step": 8235 + }, + { + "epoch": 2.5279312461632903, + "grad_norm": 0.37462911009788513, + "learning_rate": 8.754319918805978e-05, + "loss": 1.8458, + "step": 8236 + }, + { + "epoch": 2.5282381829343157, + "grad_norm": 0.3112029433250427, + "learning_rate": 8.753991616214343e-05, + "loss": 1.9116, + "step": 8237 + }, + { + "epoch": 2.5285451197053406, + "grad_norm": 0.309711217880249, + "learning_rate": 8.753663276523563e-05, + "loss": 1.8072, + "step": 8238 + }, + { + "epoch": 2.528852056476366, + "grad_norm": 0.3831833302974701, + "learning_rate": 8.753334899736882e-05, + "loss": 1.8769, + "step": 8239 + }, + { + "epoch": 2.529158993247391, + "grad_norm": 0.30272287130355835, + "learning_rate": 8.753006485857547e-05, + "loss": 1.7874, + "step": 8240 + }, + { + "epoch": 2.529465930018416, + "grad_norm": 0.3613976538181305, + "learning_rate": 8.752678034888801e-05, + "loss": 1.8591, + "step": 8241 + }, + { + "epoch": 2.5297728667894415, + "grad_norm": 0.35976549983024597, + "learning_rate": 8.75234954683389e-05, + "loss": 1.7831, + "step": 8242 + }, + { + "epoch": 2.530079803560467, + "grad_norm": 0.33987951278686523, + "learning_rate": 8.752021021696064e-05, + "loss": 1.7986, + "step": 8243 + }, + { + "epoch": 2.5303867403314917, + "grad_norm": 0.29231634736061096, + "learning_rate": 8.751692459478567e-05, + "loss": 1.8205, + "step": 8244 + }, + { + "epoch": 2.530693677102517, + "grad_norm": 0.3382028341293335, + "learning_rate": 8.751363860184644e-05, + "loss": 1.8403, + "step": 8245 + }, + { + "epoch": 2.531000613873542, + "grad_norm": 0.44643479585647583, + "learning_rate": 8.751035223817546e-05, + "loss": 1.8273, + "step": 
8246 + }, + { + "epoch": 2.5313075506445673, + "grad_norm": 0.4412732720375061, + "learning_rate": 8.750706550380518e-05, + "loss": 1.7935, + "step": 8247 + }, + { + "epoch": 2.5316144874155926, + "grad_norm": 0.3826131820678711, + "learning_rate": 8.750377839876811e-05, + "loss": 1.8622, + "step": 8248 + }, + { + "epoch": 2.5319214241866175, + "grad_norm": 0.27509525418281555, + "learning_rate": 8.750049092309672e-05, + "loss": 1.8359, + "step": 8249 + }, + { + "epoch": 2.532228360957643, + "grad_norm": 0.36282727122306824, + "learning_rate": 8.749720307682348e-05, + "loss": 1.8531, + "step": 8250 + }, + { + "epoch": 2.5325352977286677, + "grad_norm": 0.3730177581310272, + "learning_rate": 8.749391485998091e-05, + "loss": 1.8616, + "step": 8251 + }, + { + "epoch": 2.532842234499693, + "grad_norm": 0.3347858190536499, + "learning_rate": 8.749062627260152e-05, + "loss": 1.8078, + "step": 8252 + }, + { + "epoch": 2.5331491712707184, + "grad_norm": 0.29422396421432495, + "learning_rate": 8.748733731471777e-05, + "loss": 1.8623, + "step": 8253 + }, + { + "epoch": 2.5334561080417433, + "grad_norm": 0.36915895342826843, + "learning_rate": 8.748404798636219e-05, + "loss": 1.8461, + "step": 8254 + }, + { + "epoch": 2.5337630448127686, + "grad_norm": 0.4497677981853485, + "learning_rate": 8.748075828756725e-05, + "loss": 1.8328, + "step": 8255 + }, + { + "epoch": 2.5340699815837935, + "grad_norm": 0.4770478308200836, + "learning_rate": 8.747746821836552e-05, + "loss": 1.8418, + "step": 8256 + }, + { + "epoch": 2.534376918354819, + "grad_norm": 0.39125776290893555, + "learning_rate": 8.747417777878946e-05, + "loss": 1.8044, + "step": 8257 + }, + { + "epoch": 2.534683855125844, + "grad_norm": 0.2976539731025696, + "learning_rate": 8.747088696887163e-05, + "loss": 1.8819, + "step": 8258 + }, + { + "epoch": 2.5349907918968695, + "grad_norm": 0.37511107325553894, + "learning_rate": 8.746759578864452e-05, + "loss": 1.8304, + "step": 8259 + }, + { + "epoch": 2.5352977286678944, + 
"grad_norm": 0.4462794363498688, + "learning_rate": 8.746430423814068e-05, + "loss": 1.8248, + "step": 8260 + }, + { + "epoch": 2.5356046654389197, + "grad_norm": 0.3465537130832672, + "learning_rate": 8.746101231739261e-05, + "loss": 1.7987, + "step": 8261 + }, + { + "epoch": 2.5359116022099446, + "grad_norm": 0.3182581663131714, + "learning_rate": 8.745772002643287e-05, + "loss": 1.8817, + "step": 8262 + }, + { + "epoch": 2.53621853898097, + "grad_norm": 0.43006083369255066, + "learning_rate": 8.745442736529398e-05, + "loss": 1.8003, + "step": 8263 + }, + { + "epoch": 2.5365254757519953, + "grad_norm": 0.45511460304260254, + "learning_rate": 8.745113433400849e-05, + "loss": 1.8735, + "step": 8264 + }, + { + "epoch": 2.53683241252302, + "grad_norm": 0.3625985085964203, + "learning_rate": 8.744784093260894e-05, + "loss": 1.8469, + "step": 8265 + }, + { + "epoch": 2.5371393492940455, + "grad_norm": 0.2977297306060791, + "learning_rate": 8.744454716112787e-05, + "loss": 1.7885, + "step": 8266 + }, + { + "epoch": 2.5374462860650704, + "grad_norm": 0.34910085797309875, + "learning_rate": 8.744125301959785e-05, + "loss": 1.8885, + "step": 8267 + }, + { + "epoch": 2.5377532228360957, + "grad_norm": 0.40707942843437195, + "learning_rate": 8.743795850805141e-05, + "loss": 1.8829, + "step": 8268 + }, + { + "epoch": 2.538060159607121, + "grad_norm": 0.4142697751522064, + "learning_rate": 8.743466362652114e-05, + "loss": 1.903, + "step": 8269 + }, + { + "epoch": 2.538367096378146, + "grad_norm": 0.38610437512397766, + "learning_rate": 8.743136837503958e-05, + "loss": 1.9245, + "step": 8270 + }, + { + "epoch": 2.5386740331491713, + "grad_norm": 0.2940465211868286, + "learning_rate": 8.742807275363928e-05, + "loss": 1.8532, + "step": 8271 + }, + { + "epoch": 2.538980969920196, + "grad_norm": 0.3257673978805542, + "learning_rate": 8.742477676235284e-05, + "loss": 1.8517, + "step": 8272 + }, + { + "epoch": 2.5392879066912215, + "grad_norm": 0.3709326982498169, + "learning_rate": 
8.742148040121282e-05, + "loss": 1.872, + "step": 8273 + }, + { + "epoch": 2.539594843462247, + "grad_norm": 0.3433123826980591, + "learning_rate": 8.741818367025179e-05, + "loss": 1.8717, + "step": 8274 + }, + { + "epoch": 2.539901780233272, + "grad_norm": 0.39426255226135254, + "learning_rate": 8.741488656950234e-05, + "loss": 1.8155, + "step": 8275 + }, + { + "epoch": 2.540208717004297, + "grad_norm": 0.48205071687698364, + "learning_rate": 8.741158909899706e-05, + "loss": 1.8668, + "step": 8276 + }, + { + "epoch": 2.5405156537753224, + "grad_norm": 0.35280337929725647, + "learning_rate": 8.740829125876853e-05, + "loss": 1.7845, + "step": 8277 + }, + { + "epoch": 2.5408225905463473, + "grad_norm": 0.3148525059223175, + "learning_rate": 8.740499304884932e-05, + "loss": 1.8539, + "step": 8278 + }, + { + "epoch": 2.5411295273173726, + "grad_norm": 0.387932687997818, + "learning_rate": 8.740169446927207e-05, + "loss": 1.8514, + "step": 8279 + }, + { + "epoch": 2.541436464088398, + "grad_norm": 0.37375807762145996, + "learning_rate": 8.739839552006934e-05, + "loss": 1.8497, + "step": 8280 + }, + { + "epoch": 2.541743400859423, + "grad_norm": 0.3094288408756256, + "learning_rate": 8.739509620127375e-05, + "loss": 1.8675, + "step": 8281 + }, + { + "epoch": 2.542050337630448, + "grad_norm": 0.36951884627342224, + "learning_rate": 8.73917965129179e-05, + "loss": 1.8533, + "step": 8282 + }, + { + "epoch": 2.542357274401473, + "grad_norm": 0.39360809326171875, + "learning_rate": 8.73884964550344e-05, + "loss": 1.8688, + "step": 8283 + }, + { + "epoch": 2.5426642111724984, + "grad_norm": 0.29781201481819153, + "learning_rate": 8.738519602765586e-05, + "loss": 1.8285, + "step": 8284 + }, + { + "epoch": 2.5429711479435237, + "grad_norm": 0.29476743936538696, + "learning_rate": 8.73818952308149e-05, + "loss": 1.8234, + "step": 8285 + }, + { + "epoch": 2.5432780847145486, + "grad_norm": 0.3660123646259308, + "learning_rate": 8.737859406454416e-05, + "loss": 1.8933, + "step": 
8286 + }, + { + "epoch": 2.543585021485574, + "grad_norm": 0.41587865352630615, + "learning_rate": 8.737529252887621e-05, + "loss": 1.8799, + "step": 8287 + }, + { + "epoch": 2.5438919582565993, + "grad_norm": 0.4183691143989563, + "learning_rate": 8.737199062384374e-05, + "loss": 1.8479, + "step": 8288 + }, + { + "epoch": 2.544198895027624, + "grad_norm": 0.35940057039260864, + "learning_rate": 8.736868834947935e-05, + "loss": 1.8164, + "step": 8289 + }, + { + "epoch": 2.5445058317986495, + "grad_norm": 0.26804691553115845, + "learning_rate": 8.736538570581568e-05, + "loss": 1.8017, + "step": 8290 + }, + { + "epoch": 2.544812768569675, + "grad_norm": 0.34537792205810547, + "learning_rate": 8.736208269288534e-05, + "loss": 1.9002, + "step": 8291 + }, + { + "epoch": 2.5451197053406998, + "grad_norm": 0.4636915624141693, + "learning_rate": 8.735877931072106e-05, + "loss": 1.8207, + "step": 8292 + }, + { + "epoch": 2.545426642111725, + "grad_norm": 0.4897560775279999, + "learning_rate": 8.735547555935537e-05, + "loss": 1.7981, + "step": 8293 + }, + { + "epoch": 2.54573357888275, + "grad_norm": 0.37379372119903564, + "learning_rate": 8.7352171438821e-05, + "loss": 1.8727, + "step": 8294 + }, + { + "epoch": 2.5460405156537753, + "grad_norm": 0.295436292886734, + "learning_rate": 8.734886694915059e-05, + "loss": 1.8321, + "step": 8295 + }, + { + "epoch": 2.5463474524248007, + "grad_norm": 0.40406084060668945, + "learning_rate": 8.734556209037676e-05, + "loss": 1.8666, + "step": 8296 + }, + { + "epoch": 2.5466543891958255, + "grad_norm": 0.3286290466785431, + "learning_rate": 8.734225686253221e-05, + "loss": 1.8574, + "step": 8297 + }, + { + "epoch": 2.546961325966851, + "grad_norm": 0.3200569152832031, + "learning_rate": 8.73389512656496e-05, + "loss": 1.8253, + "step": 8298 + }, + { + "epoch": 2.5472682627378758, + "grad_norm": 0.35550132393836975, + "learning_rate": 8.733564529976157e-05, + "loss": 1.8293, + "step": 8299 + }, + { + "epoch": 2.547575199508901, + 
"grad_norm": 0.3804685175418854, + "learning_rate": 8.733233896490081e-05, + "loss": 1.8689, + "step": 8300 + }, + { + "epoch": 2.5478821362799264, + "grad_norm": 0.34739598631858826, + "learning_rate": 8.73290322611e-05, + "loss": 1.8441, + "step": 8301 + }, + { + "epoch": 2.5481890730509518, + "grad_norm": 0.29757586121559143, + "learning_rate": 8.732572518839182e-05, + "loss": 1.8698, + "step": 8302 + }, + { + "epoch": 2.5484960098219767, + "grad_norm": 0.30403536558151245, + "learning_rate": 8.732241774680895e-05, + "loss": 1.8305, + "step": 8303 + }, + { + "epoch": 2.548802946593002, + "grad_norm": 0.326876699924469, + "learning_rate": 8.731910993638406e-05, + "loss": 1.8514, + "step": 8304 + }, + { + "epoch": 2.549109883364027, + "grad_norm": 0.3108467161655426, + "learning_rate": 8.731580175714986e-05, + "loss": 1.8509, + "step": 8305 + }, + { + "epoch": 2.549416820135052, + "grad_norm": 0.31641489267349243, + "learning_rate": 8.731249320913904e-05, + "loss": 1.9009, + "step": 8306 + }, + { + "epoch": 2.5497237569060776, + "grad_norm": 0.3166131377220154, + "learning_rate": 8.730918429238428e-05, + "loss": 1.8291, + "step": 8307 + }, + { + "epoch": 2.5500306936771024, + "grad_norm": 0.27900195121765137, + "learning_rate": 8.730587500691829e-05, + "loss": 1.856, + "step": 8308 + }, + { + "epoch": 2.550337630448128, + "grad_norm": 0.3000704050064087, + "learning_rate": 8.730256535277379e-05, + "loss": 1.839, + "step": 8309 + }, + { + "epoch": 2.5506445672191527, + "grad_norm": 0.30938518047332764, + "learning_rate": 8.729925532998348e-05, + "loss": 1.929, + "step": 8310 + }, + { + "epoch": 2.550951503990178, + "grad_norm": 0.3687250316143036, + "learning_rate": 8.729594493858007e-05, + "loss": 1.9214, + "step": 8311 + }, + { + "epoch": 2.5512584407612033, + "grad_norm": 0.3302690386772156, + "learning_rate": 8.729263417859625e-05, + "loss": 1.8667, + "step": 8312 + }, + { + "epoch": 2.5515653775322282, + "grad_norm": 0.32535505294799805, + "learning_rate": 
8.728932305006478e-05, + "loss": 1.8298, + "step": 8313 + }, + { + "epoch": 2.5518723143032536, + "grad_norm": 0.3425545394420624, + "learning_rate": 8.728601155301834e-05, + "loss": 1.9479, + "step": 8314 + }, + { + "epoch": 2.5521792510742785, + "grad_norm": 0.29452621936798096, + "learning_rate": 8.72826996874897e-05, + "loss": 1.7963, + "step": 8315 + }, + { + "epoch": 2.552486187845304, + "grad_norm": 0.28749120235443115, + "learning_rate": 8.727938745351156e-05, + "loss": 1.7993, + "step": 8316 + }, + { + "epoch": 2.552793124616329, + "grad_norm": 0.29261404275894165, + "learning_rate": 8.727607485111669e-05, + "loss": 1.8307, + "step": 8317 + }, + { + "epoch": 2.5531000613873545, + "grad_norm": 0.2949221730232239, + "learning_rate": 8.727276188033778e-05, + "loss": 1.7918, + "step": 8318 + }, + { + "epoch": 2.5534069981583793, + "grad_norm": 0.2975117862224579, + "learning_rate": 8.726944854120757e-05, + "loss": 1.8488, + "step": 8319 + }, + { + "epoch": 2.5537139349294047, + "grad_norm": 0.30285659432411194, + "learning_rate": 8.726613483375885e-05, + "loss": 1.8763, + "step": 8320 + }, + { + "epoch": 2.5540208717004296, + "grad_norm": 0.3068414330482483, + "learning_rate": 8.726282075802435e-05, + "loss": 1.8684, + "step": 8321 + }, + { + "epoch": 2.554327808471455, + "grad_norm": 0.3904091715812683, + "learning_rate": 8.72595063140368e-05, + "loss": 1.8643, + "step": 8322 + }, + { + "epoch": 2.5546347452424802, + "grad_norm": 0.443294882774353, + "learning_rate": 8.725619150182897e-05, + "loss": 1.8268, + "step": 8323 + }, + { + "epoch": 2.554941682013505, + "grad_norm": 0.4574877619743347, + "learning_rate": 8.725287632143362e-05, + "loss": 1.8686, + "step": 8324 + }, + { + "epoch": 2.5552486187845305, + "grad_norm": 0.3246860206127167, + "learning_rate": 8.724956077288351e-05, + "loss": 1.8304, + "step": 8325 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.30745935440063477, + "learning_rate": 8.724624485621141e-05, + "loss": 1.8129, + "step": 
8326 + }, + { + "epoch": 2.5558624923265807, + "grad_norm": 0.4026782214641571, + "learning_rate": 8.72429285714501e-05, + "loss": 1.8511, + "step": 8327 + }, + { + "epoch": 2.556169429097606, + "grad_norm": 0.41659530997276306, + "learning_rate": 8.723961191863232e-05, + "loss": 1.891, + "step": 8328 + }, + { + "epoch": 2.556476365868631, + "grad_norm": 0.31792551279067993, + "learning_rate": 8.723629489779088e-05, + "loss": 1.8413, + "step": 8329 + }, + { + "epoch": 2.5567833026396563, + "grad_norm": 0.3168247640132904, + "learning_rate": 8.723297750895856e-05, + "loss": 1.902, + "step": 8330 + }, + { + "epoch": 2.557090239410681, + "grad_norm": 0.27834242582321167, + "learning_rate": 8.72296597521681e-05, + "loss": 1.8185, + "step": 8331 + }, + { + "epoch": 2.5573971761817065, + "grad_norm": 0.2997399568557739, + "learning_rate": 8.722634162745236e-05, + "loss": 1.8389, + "step": 8332 + }, + { + "epoch": 2.557704112952732, + "grad_norm": 0.29116490483283997, + "learning_rate": 8.722302313484407e-05, + "loss": 1.8391, + "step": 8333 + }, + { + "epoch": 2.558011049723757, + "grad_norm": 0.2898460030555725, + "learning_rate": 8.721970427437605e-05, + "loss": 1.8891, + "step": 8334 + }, + { + "epoch": 2.558317986494782, + "grad_norm": 0.3231159746646881, + "learning_rate": 8.721638504608109e-05, + "loss": 1.826, + "step": 8335 + }, + { + "epoch": 2.5586249232658074, + "grad_norm": 0.38665273785591125, + "learning_rate": 8.721306544999203e-05, + "loss": 1.9162, + "step": 8336 + }, + { + "epoch": 2.5589318600368323, + "grad_norm": 0.367824912071228, + "learning_rate": 8.720974548614162e-05, + "loss": 1.8165, + "step": 8337 + }, + { + "epoch": 2.5592387968078576, + "grad_norm": 0.3095315098762512, + "learning_rate": 8.72064251545627e-05, + "loss": 1.8887, + "step": 8338 + }, + { + "epoch": 2.559545733578883, + "grad_norm": 0.316890150308609, + "learning_rate": 8.720310445528807e-05, + "loss": 1.8547, + "step": 8339 + }, + { + "epoch": 2.559852670349908, + "grad_norm": 
0.2962728440761566, + "learning_rate": 8.719978338835057e-05, + "loss": 1.8252, + "step": 8340 + }, + { + "epoch": 2.560159607120933, + "grad_norm": 0.3351762890815735, + "learning_rate": 8.719646195378302e-05, + "loss": 1.8056, + "step": 8341 + }, + { + "epoch": 2.560466543891958, + "grad_norm": 0.2946149706840515, + "learning_rate": 8.719314015161822e-05, + "loss": 1.8219, + "step": 8342 + }, + { + "epoch": 2.5607734806629834, + "grad_norm": 0.30291053652763367, + "learning_rate": 8.718981798188899e-05, + "loss": 1.8161, + "step": 8343 + }, + { + "epoch": 2.5610804174340087, + "grad_norm": 0.30717429518699646, + "learning_rate": 8.71864954446282e-05, + "loss": 1.8763, + "step": 8344 + }, + { + "epoch": 2.5613873542050336, + "grad_norm": 0.28360515832901, + "learning_rate": 8.718317253986866e-05, + "loss": 1.7972, + "step": 8345 + }, + { + "epoch": 2.561694290976059, + "grad_norm": 0.34898701310157776, + "learning_rate": 8.717984926764322e-05, + "loss": 1.8843, + "step": 8346 + }, + { + "epoch": 2.562001227747084, + "grad_norm": 0.2702360451221466, + "learning_rate": 8.717652562798472e-05, + "loss": 1.7917, + "step": 8347 + }, + { + "epoch": 2.562308164518109, + "grad_norm": 0.30566295981407166, + "learning_rate": 8.7173201620926e-05, + "loss": 1.9027, + "step": 8348 + }, + { + "epoch": 2.5626151012891345, + "grad_norm": 0.2882433533668518, + "learning_rate": 8.716987724649991e-05, + "loss": 1.8167, + "step": 8349 + }, + { + "epoch": 2.56292203806016, + "grad_norm": 0.2616370916366577, + "learning_rate": 8.71665525047393e-05, + "loss": 1.7779, + "step": 8350 + }, + { + "epoch": 2.5632289748311847, + "grad_norm": 0.3033899664878845, + "learning_rate": 8.716322739567706e-05, + "loss": 1.9022, + "step": 8351 + }, + { + "epoch": 2.56353591160221, + "grad_norm": 0.30584800243377686, + "learning_rate": 8.7159901919346e-05, + "loss": 1.8808, + "step": 8352 + }, + { + "epoch": 2.563842848373235, + "grad_norm": 0.34650805592536926, + "learning_rate": 8.715657607577903e-05, 
+ "loss": 1.8817, + "step": 8353 + }, + { + "epoch": 2.5641497851442603, + "grad_norm": 0.30568572878837585, + "learning_rate": 8.715324986500898e-05, + "loss": 1.8852, + "step": 8354 + }, + { + "epoch": 2.5644567219152856, + "grad_norm": 0.36174869537353516, + "learning_rate": 8.714992328706875e-05, + "loss": 1.8518, + "step": 8355 + }, + { + "epoch": 2.5647636586863105, + "grad_norm": 0.48538872599601746, + "learning_rate": 8.714659634199119e-05, + "loss": 1.8902, + "step": 8356 + }, + { + "epoch": 2.565070595457336, + "grad_norm": 0.44997766613960266, + "learning_rate": 8.71432690298092e-05, + "loss": 1.8914, + "step": 8357 + }, + { + "epoch": 2.5653775322283607, + "grad_norm": 0.30164965987205505, + "learning_rate": 8.713994135055566e-05, + "loss": 1.826, + "step": 8358 + }, + { + "epoch": 2.565684468999386, + "grad_norm": 0.35495996475219727, + "learning_rate": 8.713661330426345e-05, + "loss": 1.8006, + "step": 8359 + }, + { + "epoch": 2.5659914057704114, + "grad_norm": 0.4141593277454376, + "learning_rate": 8.713328489096545e-05, + "loss": 1.782, + "step": 8360 + }, + { + "epoch": 2.5662983425414367, + "grad_norm": 0.4758378267288208, + "learning_rate": 8.712995611069458e-05, + "loss": 1.8378, + "step": 8361 + }, + { + "epoch": 2.5666052793124616, + "grad_norm": 0.4852865934371948, + "learning_rate": 8.71266269634837e-05, + "loss": 1.8472, + "step": 8362 + }, + { + "epoch": 2.566912216083487, + "grad_norm": 0.43413496017456055, + "learning_rate": 8.712329744936576e-05, + "loss": 1.8118, + "step": 8363 + }, + { + "epoch": 2.567219152854512, + "grad_norm": 0.3100700080394745, + "learning_rate": 8.711996756837361e-05, + "loss": 1.8699, + "step": 8364 + }, + { + "epoch": 2.567526089625537, + "grad_norm": 0.31886258721351624, + "learning_rate": 8.711663732054021e-05, + "loss": 1.8022, + "step": 8365 + }, + { + "epoch": 2.5678330263965625, + "grad_norm": 0.38900697231292725, + "learning_rate": 8.711330670589841e-05, + "loss": 1.8119, + "step": 8366 + }, + { + 
"epoch": 2.5681399631675874, + "grad_norm": 0.4188348650932312, + "learning_rate": 8.710997572448119e-05, + "loss": 1.8561, + "step": 8367 + }, + { + "epoch": 2.5684468999386127, + "grad_norm": 0.3562021255493164, + "learning_rate": 8.710664437632143e-05, + "loss": 1.8605, + "step": 8368 + }, + { + "epoch": 2.5687538367096376, + "grad_norm": 0.3105112910270691, + "learning_rate": 8.710331266145206e-05, + "loss": 1.8122, + "step": 8369 + }, + { + "epoch": 2.569060773480663, + "grad_norm": 0.3209846615791321, + "learning_rate": 8.7099980579906e-05, + "loss": 1.8914, + "step": 8370 + }, + { + "epoch": 2.5693677102516883, + "grad_norm": 0.32560455799102783, + "learning_rate": 8.70966481317162e-05, + "loss": 1.9245, + "step": 8371 + }, + { + "epoch": 2.569674647022713, + "grad_norm": 0.29573267698287964, + "learning_rate": 8.709331531691558e-05, + "loss": 1.8576, + "step": 8372 + }, + { + "epoch": 2.5699815837937385, + "grad_norm": 0.2974778115749359, + "learning_rate": 8.708998213553707e-05, + "loss": 1.8464, + "step": 8373 + }, + { + "epoch": 2.5702885205647634, + "grad_norm": 0.3264322578907013, + "learning_rate": 8.708664858761362e-05, + "loss": 1.8945, + "step": 8374 + }, + { + "epoch": 2.5705954573357888, + "grad_norm": 0.28260353207588196, + "learning_rate": 8.708331467317816e-05, + "loss": 1.8296, + "step": 8375 + }, + { + "epoch": 2.570902394106814, + "grad_norm": 0.2991141676902771, + "learning_rate": 8.707998039226367e-05, + "loss": 1.9227, + "step": 8376 + }, + { + "epoch": 2.5712093308778394, + "grad_norm": 0.28582924604415894, + "learning_rate": 8.707664574490306e-05, + "loss": 1.8465, + "step": 8377 + }, + { + "epoch": 2.5715162676488643, + "grad_norm": 0.2860773205757141, + "learning_rate": 8.707331073112932e-05, + "loss": 1.8403, + "step": 8378 + }, + { + "epoch": 2.5718232044198897, + "grad_norm": 0.31145161390304565, + "learning_rate": 8.70699753509754e-05, + "loss": 1.8775, + "step": 8379 + }, + { + "epoch": 2.5721301411909145, + "grad_norm": 
0.28711119294166565, + "learning_rate": 8.706663960447424e-05, + "loss": 1.8354, + "step": 8380 + }, + { + "epoch": 2.57243707796194, + "grad_norm": 0.2884272634983063, + "learning_rate": 8.706330349165884e-05, + "loss": 1.8772, + "step": 8381 + }, + { + "epoch": 2.572744014732965, + "grad_norm": 0.3581789433956146, + "learning_rate": 8.705996701256214e-05, + "loss": 1.8654, + "step": 8382 + }, + { + "epoch": 2.57305095150399, + "grad_norm": 0.41561809182167053, + "learning_rate": 8.705663016721712e-05, + "loss": 1.9112, + "step": 8383 + }, + { + "epoch": 2.5733578882750154, + "grad_norm": 0.301883727312088, + "learning_rate": 8.705329295565676e-05, + "loss": 1.803, + "step": 8384 + }, + { + "epoch": 2.5736648250460403, + "grad_norm": 0.37060779333114624, + "learning_rate": 8.704995537791405e-05, + "loss": 1.9371, + "step": 8385 + }, + { + "epoch": 2.5739717618170657, + "grad_norm": 0.44705548882484436, + "learning_rate": 8.704661743402195e-05, + "loss": 1.8599, + "step": 8386 + }, + { + "epoch": 2.574278698588091, + "grad_norm": 0.44097039103507996, + "learning_rate": 8.70432791240135e-05, + "loss": 1.8305, + "step": 8387 + }, + { + "epoch": 2.574585635359116, + "grad_norm": 0.3278143107891083, + "learning_rate": 8.703994044792161e-05, + "loss": 1.8817, + "step": 8388 + }, + { + "epoch": 2.574892572130141, + "grad_norm": 0.347153902053833, + "learning_rate": 8.703660140577934e-05, + "loss": 1.8182, + "step": 8389 + }, + { + "epoch": 2.575199508901166, + "grad_norm": 0.4667893052101135, + "learning_rate": 8.703326199761966e-05, + "loss": 1.8354, + "step": 8390 + }, + { + "epoch": 2.5755064456721914, + "grad_norm": 0.4956285059452057, + "learning_rate": 8.702992222347559e-05, + "loss": 1.8284, + "step": 8391 + }, + { + "epoch": 2.575813382443217, + "grad_norm": 0.3489355146884918, + "learning_rate": 8.702658208338012e-05, + "loss": 1.8439, + "step": 8392 + }, + { + "epoch": 2.576120319214242, + "grad_norm": 0.3054865002632141, + "learning_rate": 
8.702324157736625e-05, + "loss": 1.8659, + "step": 8393 + }, + { + "epoch": 2.576427255985267, + "grad_norm": 0.3459004759788513, + "learning_rate": 8.701990070546703e-05, + "loss": 1.8644, + "step": 8394 + }, + { + "epoch": 2.5767341927562923, + "grad_norm": 0.34715306758880615, + "learning_rate": 8.701655946771544e-05, + "loss": 1.8765, + "step": 8395 + }, + { + "epoch": 2.5770411295273172, + "grad_norm": 0.35610535740852356, + "learning_rate": 8.701321786414452e-05, + "loss": 1.886, + "step": 8396 + }, + { + "epoch": 2.5773480662983426, + "grad_norm": 0.34869852662086487, + "learning_rate": 8.700987589478728e-05, + "loss": 1.8858, + "step": 8397 + }, + { + "epoch": 2.577655003069368, + "grad_norm": 0.33508050441741943, + "learning_rate": 8.700653355967675e-05, + "loss": 1.8429, + "step": 8398 + }, + { + "epoch": 2.577961939840393, + "grad_norm": 0.4707668721675873, + "learning_rate": 8.700319085884597e-05, + "loss": 1.8806, + "step": 8399 + }, + { + "epoch": 2.578268876611418, + "grad_norm": 0.5073609948158264, + "learning_rate": 8.699984779232797e-05, + "loss": 1.9252, + "step": 8400 + }, + { + "epoch": 2.578575813382443, + "grad_norm": 0.4120771884918213, + "learning_rate": 8.699650436015578e-05, + "loss": 1.9463, + "step": 8401 + }, + { + "epoch": 2.5788827501534684, + "grad_norm": 0.5639505386352539, + "learning_rate": 8.699316056236246e-05, + "loss": 1.9076, + "step": 8402 + }, + { + "epoch": 2.5791896869244937, + "grad_norm": 0.7611388564109802, + "learning_rate": 8.698981639898106e-05, + "loss": 1.8344, + "step": 8403 + }, + { + "epoch": 2.5794966236955186, + "grad_norm": 0.715629518032074, + "learning_rate": 8.69864718700446e-05, + "loss": 1.7928, + "step": 8404 + }, + { + "epoch": 2.579803560466544, + "grad_norm": 0.4248988926410675, + "learning_rate": 8.698312697558614e-05, + "loss": 1.835, + "step": 8405 + }, + { + "epoch": 2.580110497237569, + "grad_norm": 0.3638152778148651, + "learning_rate": 8.697978171563875e-05, + "loss": 1.8544, + "step": 8406 
+ }, + { + "epoch": 2.580417434008594, + "grad_norm": 0.40734997391700745, + "learning_rate": 8.697643609023547e-05, + "loss": 1.7759, + "step": 8407 + }, + { + "epoch": 2.5807243707796195, + "grad_norm": 0.41469305753707886, + "learning_rate": 8.697309009940939e-05, + "loss": 1.8989, + "step": 8408 + }, + { + "epoch": 2.581031307550645, + "grad_norm": 0.3003403842449188, + "learning_rate": 8.696974374319355e-05, + "loss": 1.8138, + "step": 8409 + }, + { + "epoch": 2.5813382443216697, + "grad_norm": 0.3475555181503296, + "learning_rate": 8.696639702162104e-05, + "loss": 1.8851, + "step": 8410 + }, + { + "epoch": 2.581645181092695, + "grad_norm": 0.3952930271625519, + "learning_rate": 8.696304993472493e-05, + "loss": 1.8421, + "step": 8411 + }, + { + "epoch": 2.58195211786372, + "grad_norm": 0.33059266209602356, + "learning_rate": 8.69597024825383e-05, + "loss": 1.886, + "step": 8412 + }, + { + "epoch": 2.5822590546347453, + "grad_norm": 0.291877806186676, + "learning_rate": 8.695635466509422e-05, + "loss": 1.8001, + "step": 8413 + }, + { + "epoch": 2.5825659914057706, + "grad_norm": 0.3707219064235687, + "learning_rate": 8.69530064824258e-05, + "loss": 1.8419, + "step": 8414 + }, + { + "epoch": 2.5828729281767955, + "grad_norm": 0.4656111001968384, + "learning_rate": 8.694965793456609e-05, + "loss": 1.8925, + "step": 8415 + }, + { + "epoch": 2.583179864947821, + "grad_norm": 0.4284421503543854, + "learning_rate": 8.694630902154821e-05, + "loss": 1.8794, + "step": 8416 + }, + { + "epoch": 2.5834868017188457, + "grad_norm": 0.25311100482940674, + "learning_rate": 8.694295974340525e-05, + "loss": 1.8004, + "step": 8417 + }, + { + "epoch": 2.583793738489871, + "grad_norm": 0.3463805615901947, + "learning_rate": 8.693961010017031e-05, + "loss": 1.8666, + "step": 8418 + }, + { + "epoch": 2.5841006752608964, + "grad_norm": 0.3193957209587097, + "learning_rate": 8.693626009187647e-05, + "loss": 1.8787, + "step": 8419 + }, + { + "epoch": 2.5844076120319213, + "grad_norm": 
0.30919939279556274, + "learning_rate": 8.69329097185569e-05, + "loss": 1.9066, + "step": 8420 + }, + { + "epoch": 2.5847145488029466, + "grad_norm": 0.31369611620903015, + "learning_rate": 8.692955898024464e-05, + "loss": 1.8714, + "step": 8421 + }, + { + "epoch": 2.5850214855739715, + "grad_norm": 0.3191319406032562, + "learning_rate": 8.692620787697284e-05, + "loss": 1.8535, + "step": 8422 + }, + { + "epoch": 2.585328422344997, + "grad_norm": 0.3148418366909027, + "learning_rate": 8.692285640877462e-05, + "loss": 1.8648, + "step": 8423 + }, + { + "epoch": 2.585635359116022, + "grad_norm": 0.28245437145233154, + "learning_rate": 8.691950457568307e-05, + "loss": 1.8574, + "step": 8424 + }, + { + "epoch": 2.5859422958870475, + "grad_norm": 0.28383150696754456, + "learning_rate": 8.691615237773137e-05, + "loss": 1.7993, + "step": 8425 + }, + { + "epoch": 2.5862492326580724, + "grad_norm": 0.30522802472114563, + "learning_rate": 8.691279981495257e-05, + "loss": 1.8809, + "step": 8426 + }, + { + "epoch": 2.5865561694290977, + "grad_norm": 0.2936995327472687, + "learning_rate": 8.690944688737988e-05, + "loss": 1.745, + "step": 8427 + }, + { + "epoch": 2.5868631062001226, + "grad_norm": 0.2923533320426941, + "learning_rate": 8.69060935950464e-05, + "loss": 1.8929, + "step": 8428 + }, + { + "epoch": 2.587170042971148, + "grad_norm": 0.3280770182609558, + "learning_rate": 8.690273993798526e-05, + "loss": 1.8587, + "step": 8429 + }, + { + "epoch": 2.5874769797421733, + "grad_norm": 0.314712792634964, + "learning_rate": 8.689938591622962e-05, + "loss": 1.8569, + "step": 8430 + }, + { + "epoch": 2.587783916513198, + "grad_norm": 0.3230959475040436, + "learning_rate": 8.689603152981263e-05, + "loss": 1.8451, + "step": 8431 + }, + { + "epoch": 2.5880908532842235, + "grad_norm": 0.35917067527770996, + "learning_rate": 8.689267677876742e-05, + "loss": 1.7755, + "step": 8432 + }, + { + "epoch": 2.5883977900552484, + "grad_norm": 0.3590618968009949, + "learning_rate": 
8.688932166312715e-05, + "loss": 1.8236, + "step": 8433 + }, + { + "epoch": 2.5887047268262737, + "grad_norm": 0.29416507482528687, + "learning_rate": 8.6885966182925e-05, + "loss": 1.7852, + "step": 8434 + }, + { + "epoch": 2.589011663597299, + "grad_norm": 0.24230079352855682, + "learning_rate": 8.688261033819409e-05, + "loss": 1.8006, + "step": 8435 + }, + { + "epoch": 2.5893186003683244, + "grad_norm": 0.2519497573375702, + "learning_rate": 8.687925412896762e-05, + "loss": 1.7787, + "step": 8436 + }, + { + "epoch": 2.5896255371393493, + "grad_norm": 0.2794395089149475, + "learning_rate": 8.687589755527874e-05, + "loss": 1.8408, + "step": 8437 + }, + { + "epoch": 2.5899324739103746, + "grad_norm": 0.28811511397361755, + "learning_rate": 8.687254061716063e-05, + "loss": 1.8961, + "step": 8438 + }, + { + "epoch": 2.5902394106813995, + "grad_norm": 0.28127825260162354, + "learning_rate": 8.686918331464647e-05, + "loss": 1.8235, + "step": 8439 + }, + { + "epoch": 2.590546347452425, + "grad_norm": 0.2869607210159302, + "learning_rate": 8.686582564776942e-05, + "loss": 1.8452, + "step": 8440 + }, + { + "epoch": 2.59085328422345, + "grad_norm": 0.36350393295288086, + "learning_rate": 8.686246761656268e-05, + "loss": 1.9262, + "step": 8441 + }, + { + "epoch": 2.591160220994475, + "grad_norm": 0.30231785774230957, + "learning_rate": 8.685910922105942e-05, + "loss": 1.8674, + "step": 8442 + }, + { + "epoch": 2.5914671577655004, + "grad_norm": 0.28321847319602966, + "learning_rate": 8.685575046129285e-05, + "loss": 1.8243, + "step": 8443 + }, + { + "epoch": 2.5917740945365253, + "grad_norm": 0.30235186219215393, + "learning_rate": 8.685239133729615e-05, + "loss": 1.8442, + "step": 8444 + }, + { + "epoch": 2.5920810313075506, + "grad_norm": 0.2684946060180664, + "learning_rate": 8.684903184910252e-05, + "loss": 1.8584, + "step": 8445 + }, + { + "epoch": 2.592387968078576, + "grad_norm": 0.33788567781448364, + "learning_rate": 8.684567199674514e-05, + "loss": 1.8296, + 
"step": 8446 + }, + { + "epoch": 2.592694904849601, + "grad_norm": 0.38110965490341187, + "learning_rate": 8.684231178025726e-05, + "loss": 1.8581, + "step": 8447 + }, + { + "epoch": 2.593001841620626, + "grad_norm": 0.36466923356056213, + "learning_rate": 8.683895119967204e-05, + "loss": 1.8799, + "step": 8448 + }, + { + "epoch": 2.593308778391651, + "grad_norm": 0.3052733838558197, + "learning_rate": 8.683559025502272e-05, + "loss": 1.8834, + "step": 8449 + }, + { + "epoch": 2.5936157151626764, + "grad_norm": 0.31457164883613586, + "learning_rate": 8.683222894634251e-05, + "loss": 1.8635, + "step": 8450 + }, + { + "epoch": 2.5939226519337018, + "grad_norm": 0.46189576387405396, + "learning_rate": 8.682886727366464e-05, + "loss": 1.8852, + "step": 8451 + }, + { + "epoch": 2.594229588704727, + "grad_norm": 0.467640221118927, + "learning_rate": 8.682550523702229e-05, + "loss": 1.8306, + "step": 8452 + }, + { + "epoch": 2.594536525475752, + "grad_norm": 0.3384416699409485, + "learning_rate": 8.682214283644873e-05, + "loss": 1.8298, + "step": 8453 + }, + { + "epoch": 2.5948434622467773, + "grad_norm": 0.2842169404029846, + "learning_rate": 8.681878007197717e-05, + "loss": 1.8091, + "step": 8454 + }, + { + "epoch": 2.595150399017802, + "grad_norm": 0.31266552209854126, + "learning_rate": 8.681541694364084e-05, + "loss": 1.8329, + "step": 8455 + }, + { + "epoch": 2.5954573357888275, + "grad_norm": 0.36803483963012695, + "learning_rate": 8.681205345147298e-05, + "loss": 1.8427, + "step": 8456 + }, + { + "epoch": 2.595764272559853, + "grad_norm": 0.37500229477882385, + "learning_rate": 8.680868959550684e-05, + "loss": 1.8865, + "step": 8457 + }, + { + "epoch": 2.5960712093308778, + "grad_norm": 0.30494266748428345, + "learning_rate": 8.680532537577565e-05, + "loss": 1.8375, + "step": 8458 + }, + { + "epoch": 2.596378146101903, + "grad_norm": 0.38320985436439514, + "learning_rate": 8.680196079231266e-05, + "loss": 1.8762, + "step": 8459 + }, + { + "epoch": 
2.596685082872928, + "grad_norm": 0.48555347323417664, + "learning_rate": 8.679859584515112e-05, + "loss": 1.8558, + "step": 8460 + }, + { + "epoch": 2.5969920196439533, + "grad_norm": 0.3975796401500702, + "learning_rate": 8.67952305343243e-05, + "loss": 1.8265, + "step": 8461 + }, + { + "epoch": 2.5972989564149787, + "grad_norm": 0.3312734365463257, + "learning_rate": 8.679186485986544e-05, + "loss": 1.8346, + "step": 8462 + }, + { + "epoch": 2.5976058931860035, + "grad_norm": 0.37137889862060547, + "learning_rate": 8.67884988218078e-05, + "loss": 1.8894, + "step": 8463 + }, + { + "epoch": 2.597912829957029, + "grad_norm": 0.3645901083946228, + "learning_rate": 8.678513242018467e-05, + "loss": 1.8103, + "step": 8464 + }, + { + "epoch": 2.5982197667280538, + "grad_norm": 0.35010847449302673, + "learning_rate": 8.67817656550293e-05, + "loss": 1.8704, + "step": 8465 + }, + { + "epoch": 2.598526703499079, + "grad_norm": 0.36948931217193604, + "learning_rate": 8.677839852637492e-05, + "loss": 1.8413, + "step": 8466 + }, + { + "epoch": 2.5988336402701044, + "grad_norm": 0.3512018322944641, + "learning_rate": 8.67750310342549e-05, + "loss": 1.8222, + "step": 8467 + }, + { + "epoch": 2.5991405770411298, + "grad_norm": 0.3678590953350067, + "learning_rate": 8.677166317870245e-05, + "loss": 1.852, + "step": 8468 + }, + { + "epoch": 2.5994475138121547, + "grad_norm": 0.46718111634254456, + "learning_rate": 8.676829495975087e-05, + "loss": 1.8459, + "step": 8469 + }, + { + "epoch": 2.59975445058318, + "grad_norm": 0.4580456018447876, + "learning_rate": 8.676492637743345e-05, + "loss": 1.8547, + "step": 8470 + }, + { + "epoch": 2.600061387354205, + "grad_norm": 0.3790566921234131, + "learning_rate": 8.676155743178348e-05, + "loss": 1.8483, + "step": 8471 + }, + { + "epoch": 2.6003683241252302, + "grad_norm": 0.34775233268737793, + "learning_rate": 8.675818812283424e-05, + "loss": 1.9, + "step": 8472 + }, + { + "epoch": 2.6006752608962556, + "grad_norm": 0.4257417619228363, + 
"learning_rate": 8.675481845061906e-05, + "loss": 1.8354, + "step": 8473 + }, + { + "epoch": 2.6009821976672804, + "grad_norm": 0.46964964270591736, + "learning_rate": 8.675144841517122e-05, + "loss": 1.8305, + "step": 8474 + }, + { + "epoch": 2.601289134438306, + "grad_norm": 0.3592812120914459, + "learning_rate": 8.674807801652403e-05, + "loss": 1.778, + "step": 8475 + }, + { + "epoch": 2.6015960712093307, + "grad_norm": 0.3184985816478729, + "learning_rate": 8.674470725471078e-05, + "loss": 1.8706, + "step": 8476 + }, + { + "epoch": 2.601903007980356, + "grad_norm": 0.31306785345077515, + "learning_rate": 8.674133612976481e-05, + "loss": 1.8482, + "step": 8477 + }, + { + "epoch": 2.6022099447513813, + "grad_norm": 0.30568715929985046, + "learning_rate": 8.673796464171939e-05, + "loss": 1.8346, + "step": 8478 + }, + { + "epoch": 2.6025168815224062, + "grad_norm": 0.33701828122138977, + "learning_rate": 8.673459279060791e-05, + "loss": 1.8165, + "step": 8479 + }, + { + "epoch": 2.6028238182934316, + "grad_norm": 0.3153107166290283, + "learning_rate": 8.673122057646364e-05, + "loss": 1.8175, + "step": 8480 + }, + { + "epoch": 2.6031307550644565, + "grad_norm": 0.3428439497947693, + "learning_rate": 8.67278479993199e-05, + "loss": 1.8344, + "step": 8481 + }, + { + "epoch": 2.603437691835482, + "grad_norm": 0.39118432998657227, + "learning_rate": 8.672447505921006e-05, + "loss": 1.7904, + "step": 8482 + }, + { + "epoch": 2.603744628606507, + "grad_norm": 0.3845612108707428, + "learning_rate": 8.672110175616743e-05, + "loss": 1.8442, + "step": 8483 + }, + { + "epoch": 2.6040515653775325, + "grad_norm": 0.3402850329875946, + "learning_rate": 8.671772809022535e-05, + "loss": 1.8578, + "step": 8484 + }, + { + "epoch": 2.6043585021485574, + "grad_norm": 0.30314967036247253, + "learning_rate": 8.671435406141716e-05, + "loss": 1.8235, + "step": 8485 + }, + { + "epoch": 2.6046654389195827, + "grad_norm": 0.29402145743370056, + "learning_rate": 8.67109796697762e-05, + "loss": 
1.8105, + "step": 8486 + }, + { + "epoch": 2.6049723756906076, + "grad_norm": 0.33207419514656067, + "learning_rate": 8.670760491533582e-05, + "loss": 1.9133, + "step": 8487 + }, + { + "epoch": 2.605279312461633, + "grad_norm": 0.3287195861339569, + "learning_rate": 8.670422979812938e-05, + "loss": 1.8344, + "step": 8488 + }, + { + "epoch": 2.6055862492326582, + "grad_norm": 0.37947842478752136, + "learning_rate": 8.670085431819021e-05, + "loss": 1.8504, + "step": 8489 + }, + { + "epoch": 2.605893186003683, + "grad_norm": 0.3688724935054779, + "learning_rate": 8.669747847555171e-05, + "loss": 1.8305, + "step": 8490 + }, + { + "epoch": 2.6062001227747085, + "grad_norm": 0.33962976932525635, + "learning_rate": 8.669410227024721e-05, + "loss": 1.861, + "step": 8491 + }, + { + "epoch": 2.6065070595457334, + "grad_norm": 0.27068057656288147, + "learning_rate": 8.669072570231009e-05, + "loss": 1.7666, + "step": 8492 + }, + { + "epoch": 2.6068139963167587, + "grad_norm": 0.32670122385025024, + "learning_rate": 8.668734877177371e-05, + "loss": 1.8434, + "step": 8493 + }, + { + "epoch": 2.607120933087784, + "grad_norm": 0.37303030490875244, + "learning_rate": 8.668397147867144e-05, + "loss": 1.8326, + "step": 8494 + }, + { + "epoch": 2.607427869858809, + "grad_norm": 0.2860218286514282, + "learning_rate": 8.668059382303666e-05, + "loss": 1.7993, + "step": 8495 + }, + { + "epoch": 2.6077348066298343, + "grad_norm": 0.3480636477470398, + "learning_rate": 8.667721580490278e-05, + "loss": 1.8895, + "step": 8496 + }, + { + "epoch": 2.608041743400859, + "grad_norm": 0.37609198689460754, + "learning_rate": 8.667383742430313e-05, + "loss": 1.8906, + "step": 8497 + }, + { + "epoch": 2.6083486801718845, + "grad_norm": 0.30747851729393005, + "learning_rate": 8.667045868127113e-05, + "loss": 1.8169, + "step": 8498 + }, + { + "epoch": 2.60865561694291, + "grad_norm": 0.3108443021774292, + "learning_rate": 8.666707957584016e-05, + "loss": 1.8296, + "step": 8499 + }, + { + "epoch": 
2.608962553713935, + "grad_norm": 0.36353448033332825, + "learning_rate": 8.666370010804361e-05, + "loss": 1.879, + "step": 8500 + }, + { + "epoch": 2.60926949048496, + "grad_norm": 0.39959096908569336, + "learning_rate": 8.666032027791491e-05, + "loss": 1.8602, + "step": 8501 + }, + { + "epoch": 2.6095764272559854, + "grad_norm": 0.3505500853061676, + "learning_rate": 8.665694008548742e-05, + "loss": 1.861, + "step": 8502 + }, + { + "epoch": 2.6098833640270103, + "grad_norm": 0.3155219852924347, + "learning_rate": 8.665355953079457e-05, + "loss": 1.7911, + "step": 8503 + }, + { + "epoch": 2.6101903007980356, + "grad_norm": 0.2868075668811798, + "learning_rate": 8.665017861386975e-05, + "loss": 1.8023, + "step": 8504 + }, + { + "epoch": 2.610497237569061, + "grad_norm": 0.2890832722187042, + "learning_rate": 8.664679733474641e-05, + "loss": 1.8653, + "step": 8505 + }, + { + "epoch": 2.610804174340086, + "grad_norm": 0.3143366575241089, + "learning_rate": 8.66434156934579e-05, + "loss": 1.8024, + "step": 8506 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.28702911734580994, + "learning_rate": 8.664003369003772e-05, + "loss": 1.8231, + "step": 8507 + }, + { + "epoch": 2.611418047882136, + "grad_norm": 0.37087059020996094, + "learning_rate": 8.663665132451924e-05, + "loss": 1.8565, + "step": 8508 + }, + { + "epoch": 2.6117249846531614, + "grad_norm": 0.29796209931373596, + "learning_rate": 8.663326859693588e-05, + "loss": 1.8188, + "step": 8509 + }, + { + "epoch": 2.6120319214241867, + "grad_norm": 0.31352412700653076, + "learning_rate": 8.66298855073211e-05, + "loss": 1.806, + "step": 8510 + }, + { + "epoch": 2.612338858195212, + "grad_norm": 0.28749167919158936, + "learning_rate": 8.662650205570832e-05, + "loss": 1.8082, + "step": 8511 + }, + { + "epoch": 2.612645794966237, + "grad_norm": 0.26889678835868835, + "learning_rate": 8.662311824213099e-05, + "loss": 1.8211, + "step": 8512 + }, + { + "epoch": 2.6129527317372623, + "grad_norm": 0.2562754154205322, 
+ "learning_rate": 8.661973406662253e-05, + "loss": 1.7519, + "step": 8513 + }, + { + "epoch": 2.613259668508287, + "grad_norm": 0.26967912912368774, + "learning_rate": 8.661634952921639e-05, + "loss": 1.8339, + "step": 8514 + }, + { + "epoch": 2.6135666052793125, + "grad_norm": 0.3468424081802368, + "learning_rate": 8.661296462994602e-05, + "loss": 1.9219, + "step": 8515 + }, + { + "epoch": 2.613873542050338, + "grad_norm": 0.34790560603141785, + "learning_rate": 8.660957936884489e-05, + "loss": 1.9089, + "step": 8516 + }, + { + "epoch": 2.6141804788213627, + "grad_norm": 0.350337952375412, + "learning_rate": 8.660619374594643e-05, + "loss": 1.8228, + "step": 8517 + }, + { + "epoch": 2.614487415592388, + "grad_norm": 0.37077057361602783, + "learning_rate": 8.660280776128411e-05, + "loss": 1.8658, + "step": 8518 + }, + { + "epoch": 2.614794352363413, + "grad_norm": 0.35846221446990967, + "learning_rate": 8.659942141489139e-05, + "loss": 1.8573, + "step": 8519 + }, + { + "epoch": 2.6151012891344383, + "grad_norm": 0.339101642370224, + "learning_rate": 8.659603470680173e-05, + "loss": 1.875, + "step": 8520 + }, + { + "epoch": 2.6154082259054636, + "grad_norm": 0.35074207186698914, + "learning_rate": 8.65926476370486e-05, + "loss": 1.8395, + "step": 8521 + }, + { + "epoch": 2.6157151626764885, + "grad_norm": 0.31544017791748047, + "learning_rate": 8.658926020566551e-05, + "loss": 1.8453, + "step": 8522 + }, + { + "epoch": 2.616022099447514, + "grad_norm": 0.30619683861732483, + "learning_rate": 8.658587241268587e-05, + "loss": 1.775, + "step": 8523 + }, + { + "epoch": 2.6163290362185387, + "grad_norm": 0.29331618547439575, + "learning_rate": 8.658248425814322e-05, + "loss": 1.8068, + "step": 8524 + }, + { + "epoch": 2.616635972989564, + "grad_norm": 0.2824336290359497, + "learning_rate": 8.6579095742071e-05, + "loss": 1.8759, + "step": 8525 + }, + { + "epoch": 2.6169429097605894, + "grad_norm": 0.2697986364364624, + "learning_rate": 8.657570686450271e-05, + "loss": 
1.8295, + "step": 8526 + }, + { + "epoch": 2.6172498465316147, + "grad_norm": 0.3031822144985199, + "learning_rate": 8.657231762547186e-05, + "loss": 1.9205, + "step": 8527 + }, + { + "epoch": 2.6175567833026396, + "grad_norm": 0.2867984473705292, + "learning_rate": 8.656892802501196e-05, + "loss": 1.8638, + "step": 8528 + }, + { + "epoch": 2.617863720073665, + "grad_norm": 0.29799792170524597, + "learning_rate": 8.656553806315644e-05, + "loss": 1.8187, + "step": 8529 + }, + { + "epoch": 2.61817065684469, + "grad_norm": 0.3222150504589081, + "learning_rate": 8.656214773993884e-05, + "loss": 1.8661, + "step": 8530 + }, + { + "epoch": 2.618477593615715, + "grad_norm": 0.35999616980552673, + "learning_rate": 8.655875705539269e-05, + "loss": 1.9155, + "step": 8531 + }, + { + "epoch": 2.6187845303867405, + "grad_norm": 0.36571675539016724, + "learning_rate": 8.655536600955147e-05, + "loss": 1.8536, + "step": 8532 + }, + { + "epoch": 2.6190914671577654, + "grad_norm": 0.29667189717292786, + "learning_rate": 8.655197460244868e-05, + "loss": 1.8208, + "step": 8533 + }, + { + "epoch": 2.6193984039287908, + "grad_norm": 0.3216320276260376, + "learning_rate": 8.654858283411787e-05, + "loss": 1.8613, + "step": 8534 + }, + { + "epoch": 2.6197053406998156, + "grad_norm": 0.28880423307418823, + "learning_rate": 8.654519070459254e-05, + "loss": 1.8547, + "step": 8535 + }, + { + "epoch": 2.620012277470841, + "grad_norm": 0.3130050301551819, + "learning_rate": 8.654179821390621e-05, + "loss": 1.9355, + "step": 8536 + }, + { + "epoch": 2.6203192142418663, + "grad_norm": 0.3151358664035797, + "learning_rate": 8.653840536209241e-05, + "loss": 1.8462, + "step": 8537 + }, + { + "epoch": 2.620626151012891, + "grad_norm": 0.2702169120311737, + "learning_rate": 8.653501214918468e-05, + "loss": 1.7966, + "step": 8538 + }, + { + "epoch": 2.6209330877839165, + "grad_norm": 0.31494441628456116, + "learning_rate": 8.653161857521655e-05, + "loss": 1.7449, + "step": 8539 + }, + { + "epoch": 
2.6212400245549414, + "grad_norm": 0.3219514787197113, + "learning_rate": 8.652822464022154e-05, + "loss": 1.8238, + "step": 8540 + }, + { + "epoch": 2.6215469613259668, + "grad_norm": 0.3237066864967346, + "learning_rate": 8.652483034423322e-05, + "loss": 1.8273, + "step": 8541 + }, + { + "epoch": 2.621853898096992, + "grad_norm": 0.31354910135269165, + "learning_rate": 8.65214356872851e-05, + "loss": 1.8662, + "step": 8542 + }, + { + "epoch": 2.6221608348680174, + "grad_norm": 0.30085036158561707, + "learning_rate": 8.651804066941077e-05, + "loss": 1.8922, + "step": 8543 + }, + { + "epoch": 2.6224677716390423, + "grad_norm": 0.337528258562088, + "learning_rate": 8.651464529064373e-05, + "loss": 1.8234, + "step": 8544 + }, + { + "epoch": 2.6227747084100677, + "grad_norm": 0.33202415704727173, + "learning_rate": 8.65112495510176e-05, + "loss": 1.8331, + "step": 8545 + }, + { + "epoch": 2.6230816451810925, + "grad_norm": 0.3288112282752991, + "learning_rate": 8.650785345056586e-05, + "loss": 1.8129, + "step": 8546 + }, + { + "epoch": 2.623388581952118, + "grad_norm": 0.35483047366142273, + "learning_rate": 8.650445698932214e-05, + "loss": 1.8488, + "step": 8547 + }, + { + "epoch": 2.623695518723143, + "grad_norm": 0.32108932733535767, + "learning_rate": 8.650106016731998e-05, + "loss": 1.8263, + "step": 8548 + }, + { + "epoch": 2.624002455494168, + "grad_norm": 0.2902318239212036, + "learning_rate": 8.649766298459295e-05, + "loss": 1.8352, + "step": 8549 + }, + { + "epoch": 2.6243093922651934, + "grad_norm": 0.29014477133750916, + "learning_rate": 8.64942654411746e-05, + "loss": 1.8568, + "step": 8550 + }, + { + "epoch": 2.6246163290362183, + "grad_norm": 0.3996742367744446, + "learning_rate": 8.649086753709855e-05, + "loss": 1.8928, + "step": 8551 + }, + { + "epoch": 2.6249232658072437, + "grad_norm": 0.3703175187110901, + "learning_rate": 8.648746927239835e-05, + "loss": 1.829, + "step": 8552 + }, + { + "epoch": 2.625230202578269, + "grad_norm": 
0.33802542090415955, + "learning_rate": 8.64840706471076e-05, + "loss": 1.8827, + "step": 8553 + }, + { + "epoch": 2.625537139349294, + "grad_norm": 0.33303168416023254, + "learning_rate": 8.648067166125988e-05, + "loss": 1.8964, + "step": 8554 + }, + { + "epoch": 2.6258440761203192, + "grad_norm": 0.33449646830558777, + "learning_rate": 8.647727231488878e-05, + "loss": 1.8477, + "step": 8555 + }, + { + "epoch": 2.626151012891344, + "grad_norm": 0.3260989189147949, + "learning_rate": 8.647387260802788e-05, + "loss": 1.8623, + "step": 8556 + }, + { + "epoch": 2.6264579496623695, + "grad_norm": 0.2847815752029419, + "learning_rate": 8.647047254071082e-05, + "loss": 1.769, + "step": 8557 + }, + { + "epoch": 2.626764886433395, + "grad_norm": 0.30041372776031494, + "learning_rate": 8.646707211297116e-05, + "loss": 1.8451, + "step": 8558 + }, + { + "epoch": 2.62707182320442, + "grad_norm": 0.3557286560535431, + "learning_rate": 8.646367132484252e-05, + "loss": 1.8233, + "step": 8559 + }, + { + "epoch": 2.627378759975445, + "grad_norm": 0.39471131563186646, + "learning_rate": 8.646027017635851e-05, + "loss": 1.8364, + "step": 8560 + }, + { + "epoch": 2.6276856967464703, + "grad_norm": 0.37501803040504456, + "learning_rate": 8.645686866755273e-05, + "loss": 1.8129, + "step": 8561 + }, + { + "epoch": 2.6279926335174952, + "grad_norm": 0.374553918838501, + "learning_rate": 8.645346679845881e-05, + "loss": 1.9388, + "step": 8562 + }, + { + "epoch": 2.6282995702885206, + "grad_norm": 0.34410929679870605, + "learning_rate": 8.645006456911037e-05, + "loss": 1.8496, + "step": 8563 + }, + { + "epoch": 2.628606507059546, + "grad_norm": 0.28208592534065247, + "learning_rate": 8.644666197954103e-05, + "loss": 1.8405, + "step": 8564 + }, + { + "epoch": 2.628913443830571, + "grad_norm": 0.2913917005062103, + "learning_rate": 8.644325902978441e-05, + "loss": 1.8775, + "step": 8565 + }, + { + "epoch": 2.629220380601596, + "grad_norm": 0.33285796642303467, + "learning_rate": 
8.643985571987414e-05, + "loss": 1.8217, + "step": 8566 + }, + { + "epoch": 2.629527317372621, + "grad_norm": 0.3419492244720459, + "learning_rate": 8.643645204984386e-05, + "loss": 1.8911, + "step": 8567 + }, + { + "epoch": 2.6298342541436464, + "grad_norm": 0.33901095390319824, + "learning_rate": 8.643304801972721e-05, + "loss": 1.8653, + "step": 8568 + }, + { + "epoch": 2.6301411909146717, + "grad_norm": 0.30073773860931396, + "learning_rate": 8.642964362955781e-05, + "loss": 1.7544, + "step": 8569 + }, + { + "epoch": 2.630448127685697, + "grad_norm": 0.3300367593765259, + "learning_rate": 8.642623887936933e-05, + "loss": 1.8764, + "step": 8570 + }, + { + "epoch": 2.630755064456722, + "grad_norm": 0.330671101808548, + "learning_rate": 8.642283376919542e-05, + "loss": 1.8227, + "step": 8571 + }, + { + "epoch": 2.6310620012277472, + "grad_norm": 0.3498590290546417, + "learning_rate": 8.64194282990697e-05, + "loss": 1.8639, + "step": 8572 + }, + { + "epoch": 2.631368937998772, + "grad_norm": 0.33145999908447266, + "learning_rate": 8.641602246902586e-05, + "loss": 1.8442, + "step": 8573 + }, + { + "epoch": 2.6316758747697975, + "grad_norm": 0.29510337114334106, + "learning_rate": 8.641261627909754e-05, + "loss": 1.829, + "step": 8574 + }, + { + "epoch": 2.631982811540823, + "grad_norm": 0.2788131833076477, + "learning_rate": 8.640920972931839e-05, + "loss": 1.7717, + "step": 8575 + }, + { + "epoch": 2.6322897483118477, + "grad_norm": 0.27459269762039185, + "learning_rate": 8.640580281972209e-05, + "loss": 1.7924, + "step": 8576 + }, + { + "epoch": 2.632596685082873, + "grad_norm": 0.3517146110534668, + "learning_rate": 8.640239555034232e-05, + "loss": 1.8921, + "step": 8577 + }, + { + "epoch": 2.632903621853898, + "grad_norm": 0.2852388620376587, + "learning_rate": 8.639898792121273e-05, + "loss": 1.8207, + "step": 8578 + }, + { + "epoch": 2.6332105586249233, + "grad_norm": 0.3164372742176056, + "learning_rate": 8.639557993236702e-05, + "loss": 1.8782, + "step": 
8579 + }, + { + "epoch": 2.6335174953959486, + "grad_norm": 0.43939462304115295, + "learning_rate": 8.639217158383885e-05, + "loss": 1.8345, + "step": 8580 + }, + { + "epoch": 2.6338244321669735, + "grad_norm": 0.45321017503738403, + "learning_rate": 8.63887628756619e-05, + "loss": 1.904, + "step": 8581 + }, + { + "epoch": 2.634131368937999, + "grad_norm": 0.4423905611038208, + "learning_rate": 8.638535380786989e-05, + "loss": 1.8894, + "step": 8582 + }, + { + "epoch": 2.6344383057090237, + "grad_norm": 0.3929237723350525, + "learning_rate": 8.638194438049648e-05, + "loss": 1.8835, + "step": 8583 + }, + { + "epoch": 2.634745242480049, + "grad_norm": 0.3178403973579407, + "learning_rate": 8.637853459357536e-05, + "loss": 1.8125, + "step": 8584 + }, + { + "epoch": 2.6350521792510744, + "grad_norm": 0.3796660602092743, + "learning_rate": 8.637512444714024e-05, + "loss": 1.9376, + "step": 8585 + }, + { + "epoch": 2.6353591160220997, + "grad_norm": 0.34011390805244446, + "learning_rate": 8.637171394122483e-05, + "loss": 1.8339, + "step": 8586 + }, + { + "epoch": 2.6356660527931246, + "grad_norm": 0.3423489034175873, + "learning_rate": 8.636830307586281e-05, + "loss": 1.82, + "step": 8587 + }, + { + "epoch": 2.63597298956415, + "grad_norm": 0.3644867241382599, + "learning_rate": 8.636489185108791e-05, + "loss": 1.811, + "step": 8588 + }, + { + "epoch": 2.636279926335175, + "grad_norm": 0.35383811593055725, + "learning_rate": 8.636148026693384e-05, + "loss": 1.8228, + "step": 8589 + }, + { + "epoch": 2.6365868631062, + "grad_norm": 0.28066012263298035, + "learning_rate": 8.635806832343431e-05, + "loss": 1.7752, + "step": 8590 + }, + { + "epoch": 2.6368937998772255, + "grad_norm": 0.27132275700569153, + "learning_rate": 8.635465602062304e-05, + "loss": 1.8053, + "step": 8591 + }, + { + "epoch": 2.6372007366482504, + "grad_norm": 0.3076920211315155, + "learning_rate": 8.635124335853375e-05, + "loss": 1.77, + "step": 8592 + }, + { + "epoch": 2.6375076734192757, + 
"grad_norm": 0.35130617022514343, + "learning_rate": 8.634783033720015e-05, + "loss": 1.8272, + "step": 8593 + }, + { + "epoch": 2.6378146101903006, + "grad_norm": 0.3805561661720276, + "learning_rate": 8.634441695665601e-05, + "loss": 1.8549, + "step": 8594 + }, + { + "epoch": 2.638121546961326, + "grad_norm": 0.3168867230415344, + "learning_rate": 8.634100321693504e-05, + "loss": 1.9131, + "step": 8595 + }, + { + "epoch": 2.6384284837323513, + "grad_norm": 0.3061029314994812, + "learning_rate": 8.633758911807095e-05, + "loss": 1.84, + "step": 8596 + }, + { + "epoch": 2.638735420503376, + "grad_norm": 0.2766086459159851, + "learning_rate": 8.633417466009752e-05, + "loss": 1.8519, + "step": 8597 + }, + { + "epoch": 2.6390423572744015, + "grad_norm": 0.3250633180141449, + "learning_rate": 8.633075984304849e-05, + "loss": 1.8434, + "step": 8598 + }, + { + "epoch": 2.6393492940454264, + "grad_norm": 0.2819656729698181, + "learning_rate": 8.63273446669576e-05, + "loss": 1.8181, + "step": 8599 + }, + { + "epoch": 2.6396562308164517, + "grad_norm": 0.3506627678871155, + "learning_rate": 8.632392913185859e-05, + "loss": 1.8521, + "step": 8600 + }, + { + "epoch": 2.639963167587477, + "grad_norm": 0.3026714026927948, + "learning_rate": 8.632051323778521e-05, + "loss": 1.8183, + "step": 8601 + }, + { + "epoch": 2.6402701043585024, + "grad_norm": 0.31900104880332947, + "learning_rate": 8.631709698477124e-05, + "loss": 1.8615, + "step": 8602 + }, + { + "epoch": 2.6405770411295273, + "grad_norm": 0.3017260730266571, + "learning_rate": 8.631368037285044e-05, + "loss": 1.837, + "step": 8603 + }, + { + "epoch": 2.6408839779005526, + "grad_norm": 0.29461613297462463, + "learning_rate": 8.631026340205655e-05, + "loss": 1.8398, + "step": 8604 + }, + { + "epoch": 2.6411909146715775, + "grad_norm": 0.3405241370201111, + "learning_rate": 8.630684607242337e-05, + "loss": 1.9241, + "step": 8605 + }, + { + "epoch": 2.641497851442603, + "grad_norm": 0.36280715465545654, + "learning_rate": 
8.630342838398465e-05, + "loss": 1.8319, + "step": 8606 + }, + { + "epoch": 2.641804788213628, + "grad_norm": 0.32274433970451355, + "learning_rate": 8.630001033677414e-05, + "loss": 1.8462, + "step": 8607 + }, + { + "epoch": 2.642111724984653, + "grad_norm": 0.28930720686912537, + "learning_rate": 8.629659193082571e-05, + "loss": 1.8251, + "step": 8608 + }, + { + "epoch": 2.6424186617556784, + "grad_norm": 0.30114278197288513, + "learning_rate": 8.629317316617305e-05, + "loss": 1.8037, + "step": 8609 + }, + { + "epoch": 2.6427255985267033, + "grad_norm": 0.31895074248313904, + "learning_rate": 8.628975404285e-05, + "loss": 1.808, + "step": 8610 + }, + { + "epoch": 2.6430325352977286, + "grad_norm": 0.31819066405296326, + "learning_rate": 8.62863345608903e-05, + "loss": 1.811, + "step": 8611 + }, + { + "epoch": 2.643339472068754, + "grad_norm": 0.3860008716583252, + "learning_rate": 8.628291472032779e-05, + "loss": 1.9041, + "step": 8612 + }, + { + "epoch": 2.643646408839779, + "grad_norm": 0.4598442614078522, + "learning_rate": 8.627949452119626e-05, + "loss": 1.788, + "step": 8613 + }, + { + "epoch": 2.643953345610804, + "grad_norm": 0.4720706641674042, + "learning_rate": 8.62760739635295e-05, + "loss": 1.8436, + "step": 8614 + }, + { + "epoch": 2.644260282381829, + "grad_norm": 0.3894381523132324, + "learning_rate": 8.627265304736131e-05, + "loss": 1.8188, + "step": 8615 + }, + { + "epoch": 2.6445672191528544, + "grad_norm": 0.2819352149963379, + "learning_rate": 8.626923177272551e-05, + "loss": 1.7804, + "step": 8616 + }, + { + "epoch": 2.6448741559238798, + "grad_norm": 0.33847305178642273, + "learning_rate": 8.626581013965588e-05, + "loss": 1.8628, + "step": 8617 + }, + { + "epoch": 2.645181092694905, + "grad_norm": 0.49113303422927856, + "learning_rate": 8.626238814818628e-05, + "loss": 1.821, + "step": 8618 + }, + { + "epoch": 2.64548802946593, + "grad_norm": 0.5562265515327454, + "learning_rate": 8.62589657983505e-05, + "loss": 1.8732, + "step": 8619 + }, 
+ { + "epoch": 2.6457949662369553, + "grad_norm": 0.48525476455688477, + "learning_rate": 8.625554309018237e-05, + "loss": 1.8711, + "step": 8620 + }, + { + "epoch": 2.64610190300798, + "grad_norm": 0.35900986194610596, + "learning_rate": 8.62521200237157e-05, + "loss": 1.8922, + "step": 8621 + }, + { + "epoch": 2.6464088397790055, + "grad_norm": 0.2920636832714081, + "learning_rate": 8.624869659898435e-05, + "loss": 1.8121, + "step": 8622 + }, + { + "epoch": 2.646715776550031, + "grad_norm": 0.3626689314842224, + "learning_rate": 8.624527281602213e-05, + "loss": 1.8231, + "step": 8623 + }, + { + "epoch": 2.6470227133210558, + "grad_norm": 0.37683549523353577, + "learning_rate": 8.624184867486288e-05, + "loss": 1.8648, + "step": 8624 + }, + { + "epoch": 2.647329650092081, + "grad_norm": 0.293865829706192, + "learning_rate": 8.623842417554043e-05, + "loss": 1.8347, + "step": 8625 + }, + { + "epoch": 2.647636586863106, + "grad_norm": 0.28916221857070923, + "learning_rate": 8.623499931808863e-05, + "loss": 1.8337, + "step": 8626 + }, + { + "epoch": 2.6479435236341313, + "grad_norm": 0.439003586769104, + "learning_rate": 8.623157410254134e-05, + "loss": 1.8933, + "step": 8627 + }, + { + "epoch": 2.6482504604051567, + "grad_norm": 0.39125844836235046, + "learning_rate": 8.62281485289324e-05, + "loss": 1.7986, + "step": 8628 + }, + { + "epoch": 2.6485573971761815, + "grad_norm": 0.3968810439109802, + "learning_rate": 8.622472259729566e-05, + "loss": 1.8211, + "step": 8629 + }, + { + "epoch": 2.648864333947207, + "grad_norm": 0.37775713205337524, + "learning_rate": 8.622129630766498e-05, + "loss": 1.8976, + "step": 8630 + }, + { + "epoch": 2.6491712707182318, + "grad_norm": 0.329583078622818, + "learning_rate": 8.621786966007422e-05, + "loss": 1.9164, + "step": 8631 + }, + { + "epoch": 2.649478207489257, + "grad_norm": 0.3499230742454529, + "learning_rate": 8.621444265455725e-05, + "loss": 1.8589, + "step": 8632 + }, + { + "epoch": 2.6497851442602824, + "grad_norm": 
0.504540741443634, + "learning_rate": 8.621101529114792e-05, + "loss": 1.7853, + "step": 8633 + }, + { + "epoch": 2.650092081031308, + "grad_norm": 0.47648704051971436, + "learning_rate": 8.620758756988012e-05, + "loss": 1.865, + "step": 8634 + }, + { + "epoch": 2.6503990178023327, + "grad_norm": 0.3592020869255066, + "learning_rate": 8.62041594907877e-05, + "loss": 1.886, + "step": 8635 + }, + { + "epoch": 2.650705954573358, + "grad_norm": 0.4862852096557617, + "learning_rate": 8.620073105390458e-05, + "loss": 1.8408, + "step": 8636 + }, + { + "epoch": 2.651012891344383, + "grad_norm": 0.5418413877487183, + "learning_rate": 8.619730225926462e-05, + "loss": 1.8715, + "step": 8637 + }, + { + "epoch": 2.6513198281154082, + "grad_norm": 0.4154299795627594, + "learning_rate": 8.619387310690168e-05, + "loss": 1.8879, + "step": 8638 + }, + { + "epoch": 2.6516267648864336, + "grad_norm": 0.3325296938419342, + "learning_rate": 8.619044359684968e-05, + "loss": 1.8422, + "step": 8639 + }, + { + "epoch": 2.6519337016574585, + "grad_norm": 0.4082878828048706, + "learning_rate": 8.61870137291425e-05, + "loss": 1.8375, + "step": 8640 + }, + { + "epoch": 2.652240638428484, + "grad_norm": 0.46948596835136414, + "learning_rate": 8.618358350381406e-05, + "loss": 1.8367, + "step": 8641 + }, + { + "epoch": 2.6525475751995087, + "grad_norm": 0.3770928978919983, + "learning_rate": 8.618015292089823e-05, + "loss": 1.8236, + "step": 8642 + }, + { + "epoch": 2.652854511970534, + "grad_norm": 0.27340826392173767, + "learning_rate": 8.617672198042892e-05, + "loss": 1.8446, + "step": 8643 + }, + { + "epoch": 2.6531614487415593, + "grad_norm": 0.4071608781814575, + "learning_rate": 8.617329068244004e-05, + "loss": 1.8576, + "step": 8644 + }, + { + "epoch": 2.6534683855125847, + "grad_norm": 0.5041884779930115, + "learning_rate": 8.61698590269655e-05, + "loss": 1.9075, + "step": 8645 + }, + { + "epoch": 2.6537753222836096, + "grad_norm": 0.4129817485809326, + "learning_rate": 
8.616642701403921e-05, + "loss": 1.8592, + "step": 8646 + }, + { + "epoch": 2.654082259054635, + "grad_norm": 0.2837994694709778, + "learning_rate": 8.616299464369508e-05, + "loss": 1.8383, + "step": 8647 + }, + { + "epoch": 2.65438919582566, + "grad_norm": 0.3413170278072357, + "learning_rate": 8.615956191596707e-05, + "loss": 1.8083, + "step": 8648 + }, + { + "epoch": 2.654696132596685, + "grad_norm": 0.3661767244338989, + "learning_rate": 8.615612883088907e-05, + "loss": 1.9141, + "step": 8649 + }, + { + "epoch": 2.6550030693677105, + "grad_norm": 0.3209584951400757, + "learning_rate": 8.6152695388495e-05, + "loss": 1.8886, + "step": 8650 + }, + { + "epoch": 2.6553100061387354, + "grad_norm": 0.3161548674106598, + "learning_rate": 8.61492615888188e-05, + "loss": 1.832, + "step": 8651 + }, + { + "epoch": 2.6556169429097607, + "grad_norm": 0.3258545696735382, + "learning_rate": 8.614582743189441e-05, + "loss": 1.8747, + "step": 8652 + }, + { + "epoch": 2.6559238796807856, + "grad_norm": 0.3528682291507721, + "learning_rate": 8.614239291775579e-05, + "loss": 1.9192, + "step": 8653 + }, + { + "epoch": 2.656230816451811, + "grad_norm": 0.3430826961994171, + "learning_rate": 8.613895804643684e-05, + "loss": 1.8601, + "step": 8654 + }, + { + "epoch": 2.6565377532228363, + "grad_norm": 0.3221988379955292, + "learning_rate": 8.613552281797152e-05, + "loss": 1.9218, + "step": 8655 + }, + { + "epoch": 2.656844689993861, + "grad_norm": 0.2917289137840271, + "learning_rate": 8.613208723239379e-05, + "loss": 1.7443, + "step": 8656 + }, + { + "epoch": 2.6571516267648865, + "grad_norm": 0.28350377082824707, + "learning_rate": 8.612865128973762e-05, + "loss": 1.809, + "step": 8657 + }, + { + "epoch": 2.6574585635359114, + "grad_norm": 0.2758159339427948, + "learning_rate": 8.61252149900369e-05, + "loss": 1.8628, + "step": 8658 + }, + { + "epoch": 2.6577655003069367, + "grad_norm": 0.3537377417087555, + "learning_rate": 8.612177833332566e-05, + "loss": 1.8586, + "step": 8659 + }, 
+ { + "epoch": 2.658072437077962, + "grad_norm": 0.38237693905830383, + "learning_rate": 8.611834131963783e-05, + "loss": 1.8869, + "step": 8660 + }, + { + "epoch": 2.6583793738489874, + "grad_norm": 0.30623751878738403, + "learning_rate": 8.611490394900739e-05, + "loss": 1.8508, + "step": 8661 + }, + { + "epoch": 2.6586863106200123, + "grad_norm": 0.2597752809524536, + "learning_rate": 8.611146622146828e-05, + "loss": 1.7931, + "step": 8662 + }, + { + "epoch": 2.6589932473910376, + "grad_norm": 0.2953357696533203, + "learning_rate": 8.61080281370545e-05, + "loss": 1.837, + "step": 8663 + }, + { + "epoch": 2.6593001841620625, + "grad_norm": 0.3018724322319031, + "learning_rate": 8.610458969580003e-05, + "loss": 1.871, + "step": 8664 + }, + { + "epoch": 2.659607120933088, + "grad_norm": 0.36607179045677185, + "learning_rate": 8.610115089773885e-05, + "loss": 1.9453, + "step": 8665 + }, + { + "epoch": 2.659914057704113, + "grad_norm": 0.38754695653915405, + "learning_rate": 8.609771174290493e-05, + "loss": 1.8886, + "step": 8666 + }, + { + "epoch": 2.660220994475138, + "grad_norm": 0.3752847909927368, + "learning_rate": 8.609427223133226e-05, + "loss": 1.8662, + "step": 8667 + }, + { + "epoch": 2.6605279312461634, + "grad_norm": 0.3301216661930084, + "learning_rate": 8.609083236305483e-05, + "loss": 1.8697, + "step": 8668 + }, + { + "epoch": 2.6608348680171883, + "grad_norm": 0.31682586669921875, + "learning_rate": 8.608739213810666e-05, + "loss": 1.8982, + "step": 8669 + }, + { + "epoch": 2.6611418047882136, + "grad_norm": 0.30835145711898804, + "learning_rate": 8.608395155652172e-05, + "loss": 1.8245, + "step": 8670 + }, + { + "epoch": 2.661448741559239, + "grad_norm": 0.32517582178115845, + "learning_rate": 8.608051061833402e-05, + "loss": 1.9117, + "step": 8671 + }, + { + "epoch": 2.661755678330264, + "grad_norm": 0.3120395541191101, + "learning_rate": 8.607706932357757e-05, + "loss": 1.76, + "step": 8672 + }, + { + "epoch": 2.662062615101289, + "grad_norm": 
0.31719091534614563, + "learning_rate": 8.607362767228637e-05, + "loss": 1.8939, + "step": 8673 + }, + { + "epoch": 2.662369551872314, + "grad_norm": 0.28792136907577515, + "learning_rate": 8.607018566449445e-05, + "loss": 1.8403, + "step": 8674 + }, + { + "epoch": 2.6626764886433394, + "grad_norm": 0.28327643871307373, + "learning_rate": 8.606674330023581e-05, + "loss": 1.8204, + "step": 8675 + }, + { + "epoch": 2.6629834254143647, + "grad_norm": 0.29808422923088074, + "learning_rate": 8.606330057954446e-05, + "loss": 1.8325, + "step": 8676 + }, + { + "epoch": 2.66329036218539, + "grad_norm": 0.36162641644477844, + "learning_rate": 8.605985750245446e-05, + "loss": 1.8387, + "step": 8677 + }, + { + "epoch": 2.663597298956415, + "grad_norm": 0.3418589234352112, + "learning_rate": 8.605641406899978e-05, + "loss": 1.8139, + "step": 8678 + }, + { + "epoch": 2.6639042357274403, + "grad_norm": 0.31307870149612427, + "learning_rate": 8.605297027921451e-05, + "loss": 1.8897, + "step": 8679 + }, + { + "epoch": 2.664211172498465, + "grad_norm": 0.36962878704071045, + "learning_rate": 8.604952613313264e-05, + "loss": 1.9233, + "step": 8680 + }, + { + "epoch": 2.6645181092694905, + "grad_norm": 0.3502652049064636, + "learning_rate": 8.604608163078824e-05, + "loss": 1.8218, + "step": 8681 + }, + { + "epoch": 2.664825046040516, + "grad_norm": 0.3703038692474365, + "learning_rate": 8.604263677221533e-05, + "loss": 1.8484, + "step": 8682 + }, + { + "epoch": 2.6651319828115407, + "grad_norm": 0.2609662711620331, + "learning_rate": 8.603919155744796e-05, + "loss": 1.7645, + "step": 8683 + }, + { + "epoch": 2.665438919582566, + "grad_norm": 0.33297231793403625, + "learning_rate": 8.603574598652015e-05, + "loss": 1.8543, + "step": 8684 + }, + { + "epoch": 2.665745856353591, + "grad_norm": 0.28411462903022766, + "learning_rate": 8.603230005946601e-05, + "loss": 1.867, + "step": 8685 + }, + { + "epoch": 2.6660527931246163, + "grad_norm": 0.3209732174873352, + "learning_rate": 
8.602885377631954e-05, + "loss": 1.8886, + "step": 8686 + }, + { + "epoch": 2.6663597298956416, + "grad_norm": 0.35397234559059143, + "learning_rate": 8.602540713711482e-05, + "loss": 1.8965, + "step": 8687 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2925071716308594, + "learning_rate": 8.602196014188593e-05, + "loss": 1.8027, + "step": 8688 + }, + { + "epoch": 2.666973603437692, + "grad_norm": 0.2902941107749939, + "learning_rate": 8.60185127906669e-05, + "loss": 1.8022, + "step": 8689 + }, + { + "epoch": 2.6672805402087167, + "grad_norm": 0.31528550386428833, + "learning_rate": 8.601506508349181e-05, + "loss": 1.8153, + "step": 8690 + }, + { + "epoch": 2.667587476979742, + "grad_norm": 0.32254844903945923, + "learning_rate": 8.601161702039477e-05, + "loss": 1.8199, + "step": 8691 + }, + { + "epoch": 2.6678944137507674, + "grad_norm": 0.2999059855937958, + "learning_rate": 8.600816860140979e-05, + "loss": 1.8404, + "step": 8692 + }, + { + "epoch": 2.6682013505217927, + "grad_norm": 0.32727453112602234, + "learning_rate": 8.6004719826571e-05, + "loss": 1.8148, + "step": 8693 + }, + { + "epoch": 2.6685082872928176, + "grad_norm": 0.3048906624317169, + "learning_rate": 8.600127069591245e-05, + "loss": 1.833, + "step": 8694 + }, + { + "epoch": 2.668815224063843, + "grad_norm": 0.43790102005004883, + "learning_rate": 8.599782120946826e-05, + "loss": 1.8537, + "step": 8695 + }, + { + "epoch": 2.669122160834868, + "grad_norm": 0.38096752762794495, + "learning_rate": 8.59943713672725e-05, + "loss": 1.8094, + "step": 8696 + }, + { + "epoch": 2.669429097605893, + "grad_norm": 0.3065931499004364, + "learning_rate": 8.599092116935927e-05, + "loss": 1.8878, + "step": 8697 + }, + { + "epoch": 2.6697360343769185, + "grad_norm": 0.41807904839515686, + "learning_rate": 8.598747061576264e-05, + "loss": 1.8753, + "step": 8698 + }, + { + "epoch": 2.6700429711479434, + "grad_norm": 0.4906943142414093, + "learning_rate": 8.598401970651676e-05, + "loss": 1.7642, + "step": 
8699 + }, + { + "epoch": 2.6703499079189688, + "grad_norm": 0.37138858437538147, + "learning_rate": 8.598056844165567e-05, + "loss": 1.8191, + "step": 8700 + }, + { + "epoch": 2.6706568446899936, + "grad_norm": 0.2804940938949585, + "learning_rate": 8.597711682121354e-05, + "loss": 1.8238, + "step": 8701 + }, + { + "epoch": 2.670963781461019, + "grad_norm": 0.3853018581867218, + "learning_rate": 8.597366484522445e-05, + "loss": 1.8762, + "step": 8702 + }, + { + "epoch": 2.6712707182320443, + "grad_norm": 0.3066580295562744, + "learning_rate": 8.597021251372253e-05, + "loss": 1.7638, + "step": 8703 + }, + { + "epoch": 2.671577655003069, + "grad_norm": 0.30797824263572693, + "learning_rate": 8.596675982674186e-05, + "loss": 1.8574, + "step": 8704 + }, + { + "epoch": 2.6718845917740945, + "grad_norm": 0.3268548548221588, + "learning_rate": 8.596330678431661e-05, + "loss": 1.9184, + "step": 8705 + }, + { + "epoch": 2.6721915285451194, + "grad_norm": 0.4077534079551697, + "learning_rate": 8.595985338648087e-05, + "loss": 1.8967, + "step": 8706 + }, + { + "epoch": 2.6724984653161448, + "grad_norm": 0.4514889419078827, + "learning_rate": 8.595639963326881e-05, + "loss": 1.8491, + "step": 8707 + }, + { + "epoch": 2.67280540208717, + "grad_norm": 0.39269959926605225, + "learning_rate": 8.59529455247145e-05, + "loss": 1.7865, + "step": 8708 + }, + { + "epoch": 2.6731123388581954, + "grad_norm": 0.3139820694923401, + "learning_rate": 8.594949106085212e-05, + "loss": 1.8007, + "step": 8709 + }, + { + "epoch": 2.6734192756292203, + "grad_norm": 0.3423599600791931, + "learning_rate": 8.59460362417158e-05, + "loss": 1.8389, + "step": 8710 + }, + { + "epoch": 2.6737262124002457, + "grad_norm": 0.3829670548439026, + "learning_rate": 8.594258106733968e-05, + "loss": 1.8355, + "step": 8711 + }, + { + "epoch": 2.6740331491712706, + "grad_norm": 0.34447145462036133, + "learning_rate": 8.593912553775791e-05, + "loss": 1.8595, + "step": 8712 + }, + { + "epoch": 2.674340085942296, + 
"grad_norm": 0.34868502616882324, + "learning_rate": 8.593566965300465e-05, + "loss": 1.9195, + "step": 8713 + }, + { + "epoch": 2.674647022713321, + "grad_norm": 0.4919234812259674, + "learning_rate": 8.593221341311402e-05, + "loss": 1.8321, + "step": 8714 + }, + { + "epoch": 2.674953959484346, + "grad_norm": 0.4413202702999115, + "learning_rate": 8.59287568181202e-05, + "loss": 1.7976, + "step": 8715 + }, + { + "epoch": 2.6752608962553714, + "grad_norm": 0.3395153880119324, + "learning_rate": 8.592529986805736e-05, + "loss": 1.7974, + "step": 8716 + }, + { + "epoch": 2.6755678330263963, + "grad_norm": 0.30407002568244934, + "learning_rate": 8.592184256295965e-05, + "loss": 1.7929, + "step": 8717 + }, + { + "epoch": 2.6758747697974217, + "grad_norm": 0.31925150752067566, + "learning_rate": 8.591838490286121e-05, + "loss": 1.8413, + "step": 8718 + }, + { + "epoch": 2.676181706568447, + "grad_norm": 0.28456512093544006, + "learning_rate": 8.591492688779627e-05, + "loss": 1.8686, + "step": 8719 + }, + { + "epoch": 2.6764886433394723, + "grad_norm": 0.3286445438861847, + "learning_rate": 8.591146851779895e-05, + "loss": 1.8538, + "step": 8720 + }, + { + "epoch": 2.6767955801104972, + "grad_norm": 0.40354880690574646, + "learning_rate": 8.590800979290346e-05, + "loss": 1.8599, + "step": 8721 + }, + { + "epoch": 2.6771025168815226, + "grad_norm": 0.3654378652572632, + "learning_rate": 8.590455071314397e-05, + "loss": 1.8063, + "step": 8722 + }, + { + "epoch": 2.6774094536525475, + "grad_norm": 0.3211844861507416, + "learning_rate": 8.590109127855466e-05, + "loss": 1.8146, + "step": 8723 + }, + { + "epoch": 2.677716390423573, + "grad_norm": 0.30884361267089844, + "learning_rate": 8.589763148916973e-05, + "loss": 1.8725, + "step": 8724 + }, + { + "epoch": 2.678023327194598, + "grad_norm": 0.303095281124115, + "learning_rate": 8.589417134502336e-05, + "loss": 1.8994, + "step": 8725 + }, + { + "epoch": 2.678330263965623, + "grad_norm": 0.3086979389190674, + "learning_rate": 
8.589071084614977e-05, + "loss": 1.7941, + "step": 8726 + }, + { + "epoch": 2.6786372007366483, + "grad_norm": 0.30298081040382385, + "learning_rate": 8.588724999258311e-05, + "loss": 1.8945, + "step": 8727 + }, + { + "epoch": 2.6789441375076732, + "grad_norm": 0.33253392577171326, + "learning_rate": 8.588378878435763e-05, + "loss": 1.8397, + "step": 8728 + }, + { + "epoch": 2.6792510742786986, + "grad_norm": 0.2782913148403168, + "learning_rate": 8.588032722150752e-05, + "loss": 1.8505, + "step": 8729 + }, + { + "epoch": 2.679558011049724, + "grad_norm": 0.3482373058795929, + "learning_rate": 8.587686530406697e-05, + "loss": 1.9144, + "step": 8730 + }, + { + "epoch": 2.679864947820749, + "grad_norm": 0.31985580921173096, + "learning_rate": 8.587340303207021e-05, + "loss": 1.7695, + "step": 8731 + }, + { + "epoch": 2.680171884591774, + "grad_norm": 0.3222995400428772, + "learning_rate": 8.586994040555147e-05, + "loss": 1.8624, + "step": 8732 + }, + { + "epoch": 2.680478821362799, + "grad_norm": 0.28178468346595764, + "learning_rate": 8.586647742454495e-05, + "loss": 1.8036, + "step": 8733 + }, + { + "epoch": 2.6807857581338244, + "grad_norm": 0.27367156744003296, + "learning_rate": 8.586301408908487e-05, + "loss": 1.801, + "step": 8734 + }, + { + "epoch": 2.6810926949048497, + "grad_norm": 0.2696636915206909, + "learning_rate": 8.585955039920547e-05, + "loss": 1.8211, + "step": 8735 + }, + { + "epoch": 2.681399631675875, + "grad_norm": 0.2880568504333496, + "learning_rate": 8.585608635494098e-05, + "loss": 1.8543, + "step": 8736 + }, + { + "epoch": 2.6817065684469, + "grad_norm": 0.28708669543266296, + "learning_rate": 8.585262195632562e-05, + "loss": 1.8311, + "step": 8737 + }, + { + "epoch": 2.6820135052179253, + "grad_norm": 0.2633354663848877, + "learning_rate": 8.584915720339364e-05, + "loss": 1.7815, + "step": 8738 + }, + { + "epoch": 2.68232044198895, + "grad_norm": 0.25772908329963684, + "learning_rate": 8.584569209617928e-05, + "loss": 1.8322, + "step": 
8739 + }, + { + "epoch": 2.6826273787599755, + "grad_norm": 0.2665303647518158, + "learning_rate": 8.584222663471677e-05, + "loss": 1.8456, + "step": 8740 + }, + { + "epoch": 2.682934315531001, + "grad_norm": 0.26330938935279846, + "learning_rate": 8.583876081904038e-05, + "loss": 1.8552, + "step": 8741 + }, + { + "epoch": 2.6832412523020257, + "grad_norm": 0.29758915305137634, + "learning_rate": 8.583529464918434e-05, + "loss": 1.8362, + "step": 8742 + }, + { + "epoch": 2.683548189073051, + "grad_norm": 0.32018154859542847, + "learning_rate": 8.583182812518293e-05, + "loss": 1.8439, + "step": 8743 + }, + { + "epoch": 2.683855125844076, + "grad_norm": 0.33279770612716675, + "learning_rate": 8.582836124707036e-05, + "loss": 1.8629, + "step": 8744 + }, + { + "epoch": 2.6841620626151013, + "grad_norm": 0.40244174003601074, + "learning_rate": 8.582489401488096e-05, + "loss": 1.8221, + "step": 8745 + }, + { + "epoch": 2.6844689993861266, + "grad_norm": 0.3935016393661499, + "learning_rate": 8.582142642864895e-05, + "loss": 1.8564, + "step": 8746 + }, + { + "epoch": 2.6847759361571515, + "grad_norm": 0.3062369227409363, + "learning_rate": 8.58179584884086e-05, + "loss": 1.8587, + "step": 8747 + }, + { + "epoch": 2.685082872928177, + "grad_norm": 0.320422500371933, + "learning_rate": 8.58144901941942e-05, + "loss": 1.8758, + "step": 8748 + }, + { + "epoch": 2.6853898096992017, + "grad_norm": 0.3681413531303406, + "learning_rate": 8.581102154604001e-05, + "loss": 1.7899, + "step": 8749 + }, + { + "epoch": 2.685696746470227, + "grad_norm": 0.37779754400253296, + "learning_rate": 8.580755254398032e-05, + "loss": 1.8584, + "step": 8750 + }, + { + "epoch": 2.6860036832412524, + "grad_norm": 0.34761306643486023, + "learning_rate": 8.58040831880494e-05, + "loss": 1.8656, + "step": 8751 + }, + { + "epoch": 2.6863106200122777, + "grad_norm": 0.2833636403083801, + "learning_rate": 8.580061347828156e-05, + "loss": 1.8043, + "step": 8752 + }, + { + "epoch": 2.6866175567833026, + 
"grad_norm": 0.29990699887275696, + "learning_rate": 8.579714341471106e-05, + "loss": 1.8365, + "step": 8753 + }, + { + "epoch": 2.686924493554328, + "grad_norm": 0.3322729766368866, + "learning_rate": 8.579367299737222e-05, + "loss": 1.8541, + "step": 8754 + }, + { + "epoch": 2.687231430325353, + "grad_norm": 0.31999245285987854, + "learning_rate": 8.579020222629931e-05, + "loss": 1.8405, + "step": 8755 + }, + { + "epoch": 2.687538367096378, + "grad_norm": 0.332714319229126, + "learning_rate": 8.578673110152666e-05, + "loss": 1.9512, + "step": 8756 + }, + { + "epoch": 2.6878453038674035, + "grad_norm": 0.36372992396354675, + "learning_rate": 8.578325962308855e-05, + "loss": 1.8969, + "step": 8757 + }, + { + "epoch": 2.6881522406384284, + "grad_norm": 0.27239182591438293, + "learning_rate": 8.577978779101929e-05, + "loss": 1.7898, + "step": 8758 + }, + { + "epoch": 2.6884591774094537, + "grad_norm": 0.3552536070346832, + "learning_rate": 8.57763156053532e-05, + "loss": 1.8919, + "step": 8759 + }, + { + "epoch": 2.6887661141804786, + "grad_norm": 0.40591174364089966, + "learning_rate": 8.577284306612458e-05, + "loss": 1.8021, + "step": 8760 + }, + { + "epoch": 2.689073050951504, + "grad_norm": 0.37012994289398193, + "learning_rate": 8.576937017336777e-05, + "loss": 1.7803, + "step": 8761 + }, + { + "epoch": 2.6893799877225293, + "grad_norm": 0.33496031165122986, + "learning_rate": 8.576589692711707e-05, + "loss": 1.8573, + "step": 8762 + }, + { + "epoch": 2.689686924493554, + "grad_norm": 0.35000404715538025, + "learning_rate": 8.576242332740683e-05, + "loss": 1.8769, + "step": 8763 + }, + { + "epoch": 2.6899938612645795, + "grad_norm": 0.32730549573898315, + "learning_rate": 8.575894937427135e-05, + "loss": 1.823, + "step": 8764 + }, + { + "epoch": 2.6903007980356044, + "grad_norm": 0.31418806314468384, + "learning_rate": 8.575547506774497e-05, + "loss": 1.7646, + "step": 8765 + }, + { + "epoch": 2.6906077348066297, + "grad_norm": 0.277721107006073, + 
"learning_rate": 8.575200040786205e-05, + "loss": 1.8046, + "step": 8766 + }, + { + "epoch": 2.690914671577655, + "grad_norm": 0.3289557695388794, + "learning_rate": 8.574852539465688e-05, + "loss": 1.8145, + "step": 8767 + }, + { + "epoch": 2.6912216083486804, + "grad_norm": 0.28926602005958557, + "learning_rate": 8.574505002816385e-05, + "loss": 1.7627, + "step": 8768 + }, + { + "epoch": 2.6915285451197053, + "grad_norm": 0.2972332835197449, + "learning_rate": 8.574157430841727e-05, + "loss": 1.8294, + "step": 8769 + }, + { + "epoch": 2.6918354818907306, + "grad_norm": 0.28366953134536743, + "learning_rate": 8.57380982354515e-05, + "loss": 1.8535, + "step": 8770 + }, + { + "epoch": 2.6921424186617555, + "grad_norm": 0.2798771262168884, + "learning_rate": 8.57346218093009e-05, + "loss": 1.8298, + "step": 8771 + }, + { + "epoch": 2.692449355432781, + "grad_norm": 0.2614765465259552, + "learning_rate": 8.573114502999983e-05, + "loss": 1.8555, + "step": 8772 + }, + { + "epoch": 2.692756292203806, + "grad_norm": 0.30653777718544006, + "learning_rate": 8.572766789758265e-05, + "loss": 1.8507, + "step": 8773 + }, + { + "epoch": 2.693063228974831, + "grad_norm": 0.3189094066619873, + "learning_rate": 8.572419041208369e-05, + "loss": 1.8791, + "step": 8774 + }, + { + "epoch": 2.6933701657458564, + "grad_norm": 0.33381524682044983, + "learning_rate": 8.572071257353735e-05, + "loss": 1.8241, + "step": 8775 + }, + { + "epoch": 2.6936771025168813, + "grad_norm": 0.2776879668235779, + "learning_rate": 8.571723438197801e-05, + "loss": 1.7837, + "step": 8776 + }, + { + "epoch": 2.6939840392879066, + "grad_norm": 0.35845425724983215, + "learning_rate": 8.571375583744001e-05, + "loss": 1.8896, + "step": 8777 + }, + { + "epoch": 2.694290976058932, + "grad_norm": 0.28849005699157715, + "learning_rate": 8.571027693995775e-05, + "loss": 1.803, + "step": 8778 + }, + { + "epoch": 2.694597912829957, + "grad_norm": 0.3008786141872406, + "learning_rate": 8.57067976895656e-05, + "loss": 
1.8559, + "step": 8779 + }, + { + "epoch": 2.694904849600982, + "grad_norm": 0.2924736440181732, + "learning_rate": 8.570331808629795e-05, + "loss": 1.8016, + "step": 8780 + }, + { + "epoch": 2.695211786372007, + "grad_norm": 0.2962380051612854, + "learning_rate": 8.569983813018917e-05, + "loss": 1.819, + "step": 8781 + }, + { + "epoch": 2.6955187231430324, + "grad_norm": 0.3141970634460449, + "learning_rate": 8.569635782127367e-05, + "loss": 1.8462, + "step": 8782 + }, + { + "epoch": 2.6958256599140578, + "grad_norm": 0.297061562538147, + "learning_rate": 8.569287715958584e-05, + "loss": 1.855, + "step": 8783 + }, + { + "epoch": 2.696132596685083, + "grad_norm": 0.30669623613357544, + "learning_rate": 8.568939614516009e-05, + "loss": 1.8626, + "step": 8784 + }, + { + "epoch": 2.696439533456108, + "grad_norm": 0.2782025933265686, + "learning_rate": 8.568591477803081e-05, + "loss": 1.8993, + "step": 8785 + }, + { + "epoch": 2.6967464702271333, + "grad_norm": 0.3644821345806122, + "learning_rate": 8.568243305823239e-05, + "loss": 1.8318, + "step": 8786 + }, + { + "epoch": 2.697053406998158, + "grad_norm": 0.4073259234428406, + "learning_rate": 8.567895098579925e-05, + "loss": 1.8963, + "step": 8787 + }, + { + "epoch": 2.6973603437691835, + "grad_norm": 0.40539780259132385, + "learning_rate": 8.567546856076583e-05, + "loss": 1.8644, + "step": 8788 + }, + { + "epoch": 2.697667280540209, + "grad_norm": 0.36739271879196167, + "learning_rate": 8.567198578316648e-05, + "loss": 1.8555, + "step": 8789 + }, + { + "epoch": 2.6979742173112338, + "grad_norm": 0.3339182138442993, + "learning_rate": 8.566850265303568e-05, + "loss": 1.8431, + "step": 8790 + }, + { + "epoch": 2.698281154082259, + "grad_norm": 0.3389740586280823, + "learning_rate": 8.566501917040784e-05, + "loss": 1.8271, + "step": 8791 + }, + { + "epoch": 2.698588090853284, + "grad_norm": 0.33819615840911865, + "learning_rate": 8.566153533531737e-05, + "loss": 1.8504, + "step": 8792 + }, + { + "epoch": 
2.6988950276243093, + "grad_norm": 0.39106276631355286, + "learning_rate": 8.56580511477987e-05, + "loss": 1.7656, + "step": 8793 + }, + { + "epoch": 2.6992019643953347, + "grad_norm": 0.3374726474285126, + "learning_rate": 8.565456660788628e-05, + "loss": 1.8256, + "step": 8794 + }, + { + "epoch": 2.69950890116636, + "grad_norm": 0.33096614480018616, + "learning_rate": 8.565108171561452e-05, + "loss": 1.9486, + "step": 8795 + }, + { + "epoch": 2.699815837937385, + "grad_norm": 0.3202100396156311, + "learning_rate": 8.564759647101788e-05, + "loss": 1.7708, + "step": 8796 + }, + { + "epoch": 2.7001227747084102, + "grad_norm": 0.28830909729003906, + "learning_rate": 8.56441108741308e-05, + "loss": 1.8247, + "step": 8797 + }, + { + "epoch": 2.700429711479435, + "grad_norm": 0.32385459542274475, + "learning_rate": 8.564062492498772e-05, + "loss": 1.8338, + "step": 8798 + }, + { + "epoch": 2.7007366482504604, + "grad_norm": 0.3059900104999542, + "learning_rate": 8.56371386236231e-05, + "loss": 1.8321, + "step": 8799 + }, + { + "epoch": 2.701043585021486, + "grad_norm": 0.2922738492488861, + "learning_rate": 8.563365197007141e-05, + "loss": 1.7734, + "step": 8800 + }, + { + "epoch": 2.7013505217925107, + "grad_norm": 0.32542386651039124, + "learning_rate": 8.563016496436704e-05, + "loss": 1.8696, + "step": 8801 + }, + { + "epoch": 2.701657458563536, + "grad_norm": 0.2830851674079895, + "learning_rate": 8.562667760654452e-05, + "loss": 1.8237, + "step": 8802 + }, + { + "epoch": 2.701964395334561, + "grad_norm": 0.2794142961502075, + "learning_rate": 8.562318989663831e-05, + "loss": 1.8301, + "step": 8803 + }, + { + "epoch": 2.7022713321055862, + "grad_norm": 0.3149101436138153, + "learning_rate": 8.561970183468281e-05, + "loss": 1.8716, + "step": 8804 + }, + { + "epoch": 2.7025782688766116, + "grad_norm": 0.29530593752861023, + "learning_rate": 8.561621342071258e-05, + "loss": 1.9069, + "step": 8805 + }, + { + "epoch": 2.7028852056476365, + "grad_norm": 
0.33965879678726196, + "learning_rate": 8.561272465476204e-05, + "loss": 1.8381, + "step": 8806 + }, + { + "epoch": 2.703192142418662, + "grad_norm": 0.3310995399951935, + "learning_rate": 8.560923553686569e-05, + "loss": 1.9293, + "step": 8807 + }, + { + "epoch": 2.7034990791896867, + "grad_norm": 0.3828842043876648, + "learning_rate": 8.5605746067058e-05, + "loss": 1.8789, + "step": 8808 + }, + { + "epoch": 2.703806015960712, + "grad_norm": 0.3666260242462158, + "learning_rate": 8.560225624537346e-05, + "loss": 1.8622, + "step": 8809 + }, + { + "epoch": 2.7041129527317374, + "grad_norm": 0.36732783913612366, + "learning_rate": 8.559876607184653e-05, + "loss": 1.8177, + "step": 8810 + }, + { + "epoch": 2.7044198895027627, + "grad_norm": 0.35554859042167664, + "learning_rate": 8.559527554651176e-05, + "loss": 1.884, + "step": 8811 + }, + { + "epoch": 2.7047268262737876, + "grad_norm": 0.3118159770965576, + "learning_rate": 8.55917846694036e-05, + "loss": 1.8779, + "step": 8812 + }, + { + "epoch": 2.705033763044813, + "grad_norm": 0.278105765581131, + "learning_rate": 8.558829344055657e-05, + "loss": 1.8513, + "step": 8813 + }, + { + "epoch": 2.705340699815838, + "grad_norm": 0.30809372663497925, + "learning_rate": 8.558480186000517e-05, + "loss": 1.8023, + "step": 8814 + }, + { + "epoch": 2.705647636586863, + "grad_norm": 0.28222522139549255, + "learning_rate": 8.558130992778388e-05, + "loss": 1.8421, + "step": 8815 + }, + { + "epoch": 2.7059545733578885, + "grad_norm": 0.29532718658447266, + "learning_rate": 8.557781764392725e-05, + "loss": 1.8131, + "step": 8816 + }, + { + "epoch": 2.7062615101289134, + "grad_norm": 0.2670072317123413, + "learning_rate": 8.557432500846975e-05, + "loss": 1.7856, + "step": 8817 + }, + { + "epoch": 2.7065684468999387, + "grad_norm": 0.3431483805179596, + "learning_rate": 8.557083202144594e-05, + "loss": 1.8484, + "step": 8818 + }, + { + "epoch": 2.7068753836709636, + "grad_norm": 0.3824561536312103, + "learning_rate": 
8.556733868289033e-05, + "loss": 1.8954, + "step": 8819 + }, + { + "epoch": 2.707182320441989, + "grad_norm": 0.4189379811286926, + "learning_rate": 8.55638449928374e-05, + "loss": 1.7846, + "step": 8820 + }, + { + "epoch": 2.7074892572130143, + "grad_norm": 0.34948450326919556, + "learning_rate": 8.556035095132173e-05, + "loss": 1.7696, + "step": 8821 + }, + { + "epoch": 2.707796193984039, + "grad_norm": 0.2906292676925659, + "learning_rate": 8.555685655837783e-05, + "loss": 1.8359, + "step": 8822 + }, + { + "epoch": 2.7081031307550645, + "grad_norm": 0.2756035029888153, + "learning_rate": 8.555336181404023e-05, + "loss": 1.8684, + "step": 8823 + }, + { + "epoch": 2.7084100675260894, + "grad_norm": 0.3714772164821625, + "learning_rate": 8.554986671834346e-05, + "loss": 1.8833, + "step": 8824 + }, + { + "epoch": 2.7087170042971147, + "grad_norm": 0.41674792766571045, + "learning_rate": 8.554637127132209e-05, + "loss": 1.8272, + "step": 8825 + }, + { + "epoch": 2.70902394106814, + "grad_norm": 0.333915650844574, + "learning_rate": 8.554287547301063e-05, + "loss": 1.8343, + "step": 8826 + }, + { + "epoch": 2.7093308778391654, + "grad_norm": 0.33764639496803284, + "learning_rate": 8.553937932344365e-05, + "loss": 1.812, + "step": 8827 + }, + { + "epoch": 2.7096378146101903, + "grad_norm": 0.4445551931858063, + "learning_rate": 8.553588282265569e-05, + "loss": 1.8386, + "step": 8828 + }, + { + "epoch": 2.7099447513812156, + "grad_norm": 0.43314024806022644, + "learning_rate": 8.553238597068131e-05, + "loss": 1.7727, + "step": 8829 + }, + { + "epoch": 2.7102516881522405, + "grad_norm": 0.364596426486969, + "learning_rate": 8.552888876755506e-05, + "loss": 1.8875, + "step": 8830 + }, + { + "epoch": 2.710558624923266, + "grad_norm": 0.3023224174976349, + "learning_rate": 8.552539121331151e-05, + "loss": 1.8676, + "step": 8831 + }, + { + "epoch": 2.710865561694291, + "grad_norm": 0.3278682231903076, + "learning_rate": 8.552189330798522e-05, + "loss": 1.852, + "step": 8832 
+ }, + { + "epoch": 2.711172498465316, + "grad_norm": 0.34684303402900696, + "learning_rate": 8.551839505161077e-05, + "loss": 1.8449, + "step": 8833 + }, + { + "epoch": 2.7114794352363414, + "grad_norm": 0.3398132920265198, + "learning_rate": 8.551489644422271e-05, + "loss": 1.8493, + "step": 8834 + }, + { + "epoch": 2.7117863720073663, + "grad_norm": 0.2835905849933624, + "learning_rate": 8.551139748585563e-05, + "loss": 1.8283, + "step": 8835 + }, + { + "epoch": 2.7120933087783916, + "grad_norm": 0.30910351872444153, + "learning_rate": 8.55078981765441e-05, + "loss": 1.8429, + "step": 8836 + }, + { + "epoch": 2.712400245549417, + "grad_norm": 0.3802061676979065, + "learning_rate": 8.550439851632272e-05, + "loss": 1.8348, + "step": 8837 + }, + { + "epoch": 2.712707182320442, + "grad_norm": 0.3686448931694031, + "learning_rate": 8.550089850522606e-05, + "loss": 1.8652, + "step": 8838 + }, + { + "epoch": 2.713014119091467, + "grad_norm": 0.2919705808162689, + "learning_rate": 8.549739814328872e-05, + "loss": 1.8318, + "step": 8839 + }, + { + "epoch": 2.713321055862492, + "grad_norm": 0.34780198335647583, + "learning_rate": 8.549389743054527e-05, + "loss": 1.8781, + "step": 8840 + }, + { + "epoch": 2.7136279926335174, + "grad_norm": 0.3955966532230377, + "learning_rate": 8.549039636703034e-05, + "loss": 1.867, + "step": 8841 + }, + { + "epoch": 2.7139349294045427, + "grad_norm": 0.2836689054965973, + "learning_rate": 8.548689495277851e-05, + "loss": 1.7859, + "step": 8842 + }, + { + "epoch": 2.714241866175568, + "grad_norm": 0.369865357875824, + "learning_rate": 8.548339318782436e-05, + "loss": 1.8246, + "step": 8843 + }, + { + "epoch": 2.714548802946593, + "grad_norm": 0.2901081442832947, + "learning_rate": 8.547989107220256e-05, + "loss": 1.7888, + "step": 8844 + }, + { + "epoch": 2.7148557397176183, + "grad_norm": 0.2790970802307129, + "learning_rate": 8.547638860594764e-05, + "loss": 1.8311, + "step": 8845 + }, + { + "epoch": 2.715162676488643, + "grad_norm": 
0.2935783267021179, + "learning_rate": 8.547288578909429e-05, + "loss": 1.857, + "step": 8846 + }, + { + "epoch": 2.7154696132596685, + "grad_norm": 0.27074959874153137, + "learning_rate": 8.546938262167708e-05, + "loss": 1.7457, + "step": 8847 + }, + { + "epoch": 2.715776550030694, + "grad_norm": 0.3042888343334198, + "learning_rate": 8.546587910373063e-05, + "loss": 1.8598, + "step": 8848 + }, + { + "epoch": 2.7160834868017187, + "grad_norm": 0.29088664054870605, + "learning_rate": 8.546237523528958e-05, + "loss": 1.8461, + "step": 8849 + }, + { + "epoch": 2.716390423572744, + "grad_norm": 0.3022211492061615, + "learning_rate": 8.545887101638857e-05, + "loss": 1.8327, + "step": 8850 + }, + { + "epoch": 2.716697360343769, + "grad_norm": 0.30194929242134094, + "learning_rate": 8.545536644706218e-05, + "loss": 1.8331, + "step": 8851 + }, + { + "epoch": 2.7170042971147943, + "grad_norm": 0.31702303886413574, + "learning_rate": 8.54518615273451e-05, + "loss": 1.8576, + "step": 8852 + }, + { + "epoch": 2.7173112338858196, + "grad_norm": 0.30386796593666077, + "learning_rate": 8.544835625727195e-05, + "loss": 1.8278, + "step": 8853 + }, + { + "epoch": 2.717618170656845, + "grad_norm": 0.30670568346977234, + "learning_rate": 8.544485063687735e-05, + "loss": 1.8123, + "step": 8854 + }, + { + "epoch": 2.71792510742787, + "grad_norm": 0.3896371126174927, + "learning_rate": 8.544134466619597e-05, + "loss": 1.8101, + "step": 8855 + }, + { + "epoch": 2.718232044198895, + "grad_norm": 0.4742000699043274, + "learning_rate": 8.543783834526245e-05, + "loss": 1.8402, + "step": 8856 + }, + { + "epoch": 2.71853898096992, + "grad_norm": 0.4234209954738617, + "learning_rate": 8.543433167411143e-05, + "loss": 1.8814, + "step": 8857 + }, + { + "epoch": 2.7188459177409454, + "grad_norm": 0.28478503227233887, + "learning_rate": 8.54308246527776e-05, + "loss": 1.8165, + "step": 8858 + }, + { + "epoch": 2.7191528545119708, + "grad_norm": 0.3534078896045685, + "learning_rate": 
8.542731728129558e-05, + "loss": 1.7947, + "step": 8859 + }, + { + "epoch": 2.7194597912829956, + "grad_norm": 0.5471592545509338, + "learning_rate": 8.542380955970004e-05, + "loss": 1.9073, + "step": 8860 + }, + { + "epoch": 2.719766728054021, + "grad_norm": 0.5037226676940918, + "learning_rate": 8.542030148802566e-05, + "loss": 1.8701, + "step": 8861 + }, + { + "epoch": 2.720073664825046, + "grad_norm": 0.3415449559688568, + "learning_rate": 8.54167930663071e-05, + "loss": 1.827, + "step": 8862 + }, + { + "epoch": 2.720380601596071, + "grad_norm": 0.33516764640808105, + "learning_rate": 8.541328429457903e-05, + "loss": 1.9396, + "step": 8863 + }, + { + "epoch": 2.7206875383670965, + "grad_norm": 0.3934863209724426, + "learning_rate": 8.540977517287612e-05, + "loss": 1.8738, + "step": 8864 + }, + { + "epoch": 2.7209944751381214, + "grad_norm": 0.5137139558792114, + "learning_rate": 8.540626570123307e-05, + "loss": 1.9007, + "step": 8865 + }, + { + "epoch": 2.7213014119091468, + "grad_norm": 0.5846540331840515, + "learning_rate": 8.540275587968453e-05, + "loss": 1.9335, + "step": 8866 + }, + { + "epoch": 2.7216083486801717, + "grad_norm": 0.613388180732727, + "learning_rate": 8.539924570826523e-05, + "loss": 1.8967, + "step": 8867 + }, + { + "epoch": 2.721915285451197, + "grad_norm": 0.4804840087890625, + "learning_rate": 8.539573518700983e-05, + "loss": 1.7712, + "step": 8868 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.34939101338386536, + "learning_rate": 8.539222431595303e-05, + "loss": 1.8578, + "step": 8869 + }, + { + "epoch": 2.7225291589932477, + "grad_norm": 0.4230511486530304, + "learning_rate": 8.538871309512951e-05, + "loss": 1.793, + "step": 8870 + }, + { + "epoch": 2.7228360957642725, + "grad_norm": 0.5383400917053223, + "learning_rate": 8.538520152457402e-05, + "loss": 1.8153, + "step": 8871 + }, + { + "epoch": 2.723143032535298, + "grad_norm": 0.46213194727897644, + "learning_rate": 8.538168960432118e-05, + "loss": 1.9357, + "step": 8872 
+ }, + { + "epoch": 2.7234499693063228, + "grad_norm": 0.3126194477081299, + "learning_rate": 8.537817733440577e-05, + "loss": 1.7954, + "step": 8873 + }, + { + "epoch": 2.723756906077348, + "grad_norm": 0.4018714129924774, + "learning_rate": 8.537466471486248e-05, + "loss": 1.824, + "step": 8874 + }, + { + "epoch": 2.7240638428483734, + "grad_norm": 0.5690213441848755, + "learning_rate": 8.537115174572602e-05, + "loss": 1.7807, + "step": 8875 + }, + { + "epoch": 2.7243707796193983, + "grad_norm": 0.4669814705848694, + "learning_rate": 8.53676384270311e-05, + "loss": 1.7438, + "step": 8876 + }, + { + "epoch": 2.7246777163904237, + "grad_norm": 0.3040566146373749, + "learning_rate": 8.536412475881246e-05, + "loss": 1.8613, + "step": 8877 + }, + { + "epoch": 2.7249846531614486, + "grad_norm": 0.38985559344291687, + "learning_rate": 8.53606107411048e-05, + "loss": 1.816, + "step": 8878 + }, + { + "epoch": 2.725291589932474, + "grad_norm": 0.4417174160480499, + "learning_rate": 8.535709637394285e-05, + "loss": 1.8675, + "step": 8879 + }, + { + "epoch": 2.7255985267034992, + "grad_norm": 0.3254696726799011, + "learning_rate": 8.535358165736138e-05, + "loss": 1.8419, + "step": 8880 + }, + { + "epoch": 2.725905463474524, + "grad_norm": 0.36002370715141296, + "learning_rate": 8.535006659139506e-05, + "loss": 1.9084, + "step": 8881 + }, + { + "epoch": 2.7262124002455494, + "grad_norm": 0.3471790850162506, + "learning_rate": 8.534655117607869e-05, + "loss": 1.8442, + "step": 8882 + }, + { + "epoch": 2.7265193370165743, + "grad_norm": 0.3042849004268646, + "learning_rate": 8.534303541144697e-05, + "loss": 1.8261, + "step": 8883 + }, + { + "epoch": 2.7268262737875997, + "grad_norm": 0.32416659593582153, + "learning_rate": 8.533951929753465e-05, + "loss": 1.8625, + "step": 8884 + }, + { + "epoch": 2.727133210558625, + "grad_norm": 0.32449519634246826, + "learning_rate": 8.53360028343765e-05, + "loss": 1.8653, + "step": 8885 + }, + { + "epoch": 2.7274401473296503, + "grad_norm": 
0.34744054079055786, + "learning_rate": 8.533248602200726e-05, + "loss": 1.8742, + "step": 8886 + }, + { + "epoch": 2.7277470841006752, + "grad_norm": 0.30540695786476135, + "learning_rate": 8.532896886046167e-05, + "loss": 1.8064, + "step": 8887 + }, + { + "epoch": 2.7280540208717006, + "grad_norm": 0.27105677127838135, + "learning_rate": 8.532545134977452e-05, + "loss": 1.7867, + "step": 8888 + }, + { + "epoch": 2.7283609576427255, + "grad_norm": 0.2682685852050781, + "learning_rate": 8.532193348998054e-05, + "loss": 1.8191, + "step": 8889 + }, + { + "epoch": 2.728667894413751, + "grad_norm": 0.33534809947013855, + "learning_rate": 8.531841528111452e-05, + "loss": 1.8758, + "step": 8890 + }, + { + "epoch": 2.728974831184776, + "grad_norm": 0.33555057644844055, + "learning_rate": 8.531489672321122e-05, + "loss": 1.8932, + "step": 8891 + }, + { + "epoch": 2.729281767955801, + "grad_norm": 0.3532167077064514, + "learning_rate": 8.531137781630542e-05, + "loss": 1.8621, + "step": 8892 + }, + { + "epoch": 2.7295887047268264, + "grad_norm": 0.337634414434433, + "learning_rate": 8.530785856043186e-05, + "loss": 1.8618, + "step": 8893 + }, + { + "epoch": 2.7298956414978512, + "grad_norm": 0.28855568170547485, + "learning_rate": 8.530433895562538e-05, + "loss": 1.8248, + "step": 8894 + }, + { + "epoch": 2.7302025782688766, + "grad_norm": 0.3128049373626709, + "learning_rate": 8.530081900192071e-05, + "loss": 1.8071, + "step": 8895 + }, + { + "epoch": 2.730509515039902, + "grad_norm": 0.2949801981449127, + "learning_rate": 8.529729869935265e-05, + "loss": 1.7704, + "step": 8896 + }, + { + "epoch": 2.730816451810927, + "grad_norm": 0.2708294987678528, + "learning_rate": 8.529377804795603e-05, + "loss": 1.8127, + "step": 8897 + }, + { + "epoch": 2.731123388581952, + "grad_norm": 0.300516813993454, + "learning_rate": 8.529025704776559e-05, + "loss": 1.9063, + "step": 8898 + }, + { + "epoch": 2.731430325352977, + "grad_norm": 0.2590954005718231, + "learning_rate": 
8.528673569881613e-05, + "loss": 1.7595, + "step": 8899 + }, + { + "epoch": 2.7317372621240024, + "grad_norm": 0.30067136883735657, + "learning_rate": 8.528321400114248e-05, + "loss": 1.8697, + "step": 8900 + }, + { + "epoch": 2.7320441988950277, + "grad_norm": 0.3289981186389923, + "learning_rate": 8.527969195477943e-05, + "loss": 1.8257, + "step": 8901 + }, + { + "epoch": 2.732351135666053, + "grad_norm": 0.3205581307411194, + "learning_rate": 8.527616955976178e-05, + "loss": 1.9002, + "step": 8902 + }, + { + "epoch": 2.732658072437078, + "grad_norm": 0.30869361758232117, + "learning_rate": 8.527264681612435e-05, + "loss": 1.8239, + "step": 8903 + }, + { + "epoch": 2.7329650092081033, + "grad_norm": 0.3237484097480774, + "learning_rate": 8.526912372390195e-05, + "loss": 1.8879, + "step": 8904 + }, + { + "epoch": 2.733271945979128, + "grad_norm": 0.3172036111354828, + "learning_rate": 8.52656002831294e-05, + "loss": 1.8118, + "step": 8905 + }, + { + "epoch": 2.7335788827501535, + "grad_norm": 0.3326823115348816, + "learning_rate": 8.52620764938415e-05, + "loss": 1.8035, + "step": 8906 + }, + { + "epoch": 2.733885819521179, + "grad_norm": 0.36605212092399597, + "learning_rate": 8.525855235607311e-05, + "loss": 1.8689, + "step": 8907 + }, + { + "epoch": 2.7341927562922037, + "grad_norm": 0.31904828548431396, + "learning_rate": 8.525502786985905e-05, + "loss": 1.8188, + "step": 8908 + }, + { + "epoch": 2.734499693063229, + "grad_norm": 0.2657643258571625, + "learning_rate": 8.525150303523413e-05, + "loss": 1.7471, + "step": 8909 + }, + { + "epoch": 2.734806629834254, + "grad_norm": 0.32748520374298096, + "learning_rate": 8.524797785223318e-05, + "loss": 1.8678, + "step": 8910 + }, + { + "epoch": 2.7351135666052793, + "grad_norm": 0.32576173543930054, + "learning_rate": 8.524445232089107e-05, + "loss": 1.8296, + "step": 8911 + }, + { + "epoch": 2.7354205033763046, + "grad_norm": 0.3028578758239746, + "learning_rate": 8.524092644124261e-05, + "loss": 1.8656, + "step": 
8912 + }, + { + "epoch": 2.7357274401473295, + "grad_norm": 0.29967090487480164, + "learning_rate": 8.523740021332268e-05, + "loss": 1.8206, + "step": 8913 + }, + { + "epoch": 2.736034376918355, + "grad_norm": 0.3042941391468048, + "learning_rate": 8.523387363716611e-05, + "loss": 1.7928, + "step": 8914 + }, + { + "epoch": 2.7363413136893797, + "grad_norm": 0.3278021216392517, + "learning_rate": 8.523034671280772e-05, + "loss": 1.9213, + "step": 8915 + }, + { + "epoch": 2.736648250460405, + "grad_norm": 0.39839017391204834, + "learning_rate": 8.522681944028242e-05, + "loss": 1.8242, + "step": 8916 + }, + { + "epoch": 2.7369551872314304, + "grad_norm": 0.3960748016834259, + "learning_rate": 8.522329181962504e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 2.7372621240024557, + "grad_norm": 0.3250591456890106, + "learning_rate": 8.521976385087044e-05, + "loss": 1.8318, + "step": 8918 + }, + { + "epoch": 2.7375690607734806, + "grad_norm": 0.31731119751930237, + "learning_rate": 8.521623553405349e-05, + "loss": 1.8062, + "step": 8919 + }, + { + "epoch": 2.737875997544506, + "grad_norm": 0.32452264428138733, + "learning_rate": 8.521270686920906e-05, + "loss": 1.8384, + "step": 8920 + }, + { + "epoch": 2.738182934315531, + "grad_norm": 0.2892500162124634, + "learning_rate": 8.520917785637204e-05, + "loss": 1.8128, + "step": 8921 + }, + { + "epoch": 2.738489871086556, + "grad_norm": 0.30028483271598816, + "learning_rate": 8.520564849557726e-05, + "loss": 1.8512, + "step": 8922 + }, + { + "epoch": 2.7387968078575815, + "grad_norm": 0.29927411675453186, + "learning_rate": 8.520211878685964e-05, + "loss": 1.8431, + "step": 8923 + }, + { + "epoch": 2.7391037446286064, + "grad_norm": 0.3426479995250702, + "learning_rate": 8.519858873025405e-05, + "loss": 1.8724, + "step": 8924 + }, + { + "epoch": 2.7394106813996317, + "grad_norm": 0.3795917332172394, + "learning_rate": 8.519505832579538e-05, + "loss": 1.8888, + "step": 8925 + }, + { + "epoch": 2.7397176181706566, + 
"grad_norm": 0.4924582839012146, + "learning_rate": 8.519152757351849e-05, + "loss": 1.7743, + "step": 8926 + }, + { + "epoch": 2.740024554941682, + "grad_norm": 0.43054282665252686, + "learning_rate": 8.518799647345832e-05, + "loss": 1.8556, + "step": 8927 + }, + { + "epoch": 2.7403314917127073, + "grad_norm": 0.37040412425994873, + "learning_rate": 8.518446502564974e-05, + "loss": 1.9162, + "step": 8928 + }, + { + "epoch": 2.7406384284837326, + "grad_norm": 0.38334885239601135, + "learning_rate": 8.518093323012766e-05, + "loss": 1.8078, + "step": 8929 + }, + { + "epoch": 2.7409453652547575, + "grad_norm": 0.409101665019989, + "learning_rate": 8.517740108692698e-05, + "loss": 1.7874, + "step": 8930 + }, + { + "epoch": 2.741252302025783, + "grad_norm": 0.3953499495983124, + "learning_rate": 8.517386859608258e-05, + "loss": 1.8455, + "step": 8931 + }, + { + "epoch": 2.7415592387968077, + "grad_norm": 0.30524972081184387, + "learning_rate": 8.517033575762942e-05, + "loss": 1.822, + "step": 8932 + }, + { + "epoch": 2.741866175567833, + "grad_norm": 0.354086309671402, + "learning_rate": 8.516680257160239e-05, + "loss": 1.859, + "step": 8933 + }, + { + "epoch": 2.7421731123388584, + "grad_norm": 0.4305376410484314, + "learning_rate": 8.516326903803638e-05, + "loss": 1.8918, + "step": 8934 + }, + { + "epoch": 2.7424800491098833, + "grad_norm": 0.590727686882019, + "learning_rate": 8.515973515696635e-05, + "loss": 1.8841, + "step": 8935 + }, + { + "epoch": 2.7427869858809086, + "grad_norm": 0.665314257144928, + "learning_rate": 8.515620092842723e-05, + "loss": 1.8166, + "step": 8936 + }, + { + "epoch": 2.7430939226519335, + "grad_norm": 0.5579181909561157, + "learning_rate": 8.515266635245389e-05, + "loss": 1.8344, + "step": 8937 + }, + { + "epoch": 2.743400859422959, + "grad_norm": 0.3698382079601288, + "learning_rate": 8.514913142908132e-05, + "loss": 1.8445, + "step": 8938 + }, + { + "epoch": 2.743707796193984, + "grad_norm": 0.30882057547569275, + "learning_rate": 
8.514559615834442e-05, + "loss": 1.8443, + "step": 8939 + }, + { + "epoch": 2.744014732965009, + "grad_norm": 0.35821446776390076, + "learning_rate": 8.514206054027815e-05, + "loss": 1.8482, + "step": 8940 + }, + { + "epoch": 2.7443216697360344, + "grad_norm": 0.35552099347114563, + "learning_rate": 8.513852457491744e-05, + "loss": 1.7848, + "step": 8941 + }, + { + "epoch": 2.7446286065070593, + "grad_norm": 0.27788954973220825, + "learning_rate": 8.513498826229722e-05, + "loss": 1.7935, + "step": 8942 + }, + { + "epoch": 2.7449355432780846, + "grad_norm": 0.30653929710388184, + "learning_rate": 8.513145160245246e-05, + "loss": 1.808, + "step": 8943 + }, + { + "epoch": 2.74524248004911, + "grad_norm": 0.34749966859817505, + "learning_rate": 8.512791459541812e-05, + "loss": 1.8498, + "step": 8944 + }, + { + "epoch": 2.7455494168201353, + "grad_norm": 0.362326979637146, + "learning_rate": 8.512437724122912e-05, + "loss": 1.8263, + "step": 8945 + }, + { + "epoch": 2.74585635359116, + "grad_norm": 0.2914038598537445, + "learning_rate": 8.512083953992044e-05, + "loss": 1.834, + "step": 8946 + }, + { + "epoch": 2.7461632903621855, + "grad_norm": 0.31662893295288086, + "learning_rate": 8.511730149152705e-05, + "loss": 1.8157, + "step": 8947 + }, + { + "epoch": 2.7464702271332104, + "grad_norm": 0.38970568776130676, + "learning_rate": 8.51137630960839e-05, + "loss": 1.8764, + "step": 8948 + }, + { + "epoch": 2.7467771639042358, + "grad_norm": 0.3907272517681122, + "learning_rate": 8.511022435362594e-05, + "loss": 1.8665, + "step": 8949 + }, + { + "epoch": 2.747084100675261, + "grad_norm": 0.3315196931362152, + "learning_rate": 8.510668526418819e-05, + "loss": 1.8076, + "step": 8950 + }, + { + "epoch": 2.747391037446286, + "grad_norm": 0.29783520102500916, + "learning_rate": 8.510314582780559e-05, + "loss": 1.8518, + "step": 8951 + }, + { + "epoch": 2.7476979742173113, + "grad_norm": 0.3085685670375824, + "learning_rate": 8.509960604451312e-05, + "loss": 1.8961, + "step": 
8952 + }, + { + "epoch": 2.748004910988336, + "grad_norm": 0.3204992711544037, + "learning_rate": 8.509606591434579e-05, + "loss": 1.8374, + "step": 8953 + }, + { + "epoch": 2.7483118477593615, + "grad_norm": 0.2801276445388794, + "learning_rate": 8.509252543733855e-05, + "loss": 1.8455, + "step": 8954 + }, + { + "epoch": 2.748618784530387, + "grad_norm": 0.26911506056785583, + "learning_rate": 8.508898461352641e-05, + "loss": 1.8093, + "step": 8955 + }, + { + "epoch": 2.7489257213014118, + "grad_norm": 0.30429625511169434, + "learning_rate": 8.508544344294435e-05, + "loss": 1.8526, + "step": 8956 + }, + { + "epoch": 2.749232658072437, + "grad_norm": 0.308403342962265, + "learning_rate": 8.50819019256274e-05, + "loss": 1.7917, + "step": 8957 + }, + { + "epoch": 2.749539594843462, + "grad_norm": 0.3292251229286194, + "learning_rate": 8.507836006161052e-05, + "loss": 1.8206, + "step": 8958 + }, + { + "epoch": 2.7498465316144873, + "grad_norm": 0.30014076828956604, + "learning_rate": 8.507481785092871e-05, + "loss": 1.8136, + "step": 8959 + }, + { + "epoch": 2.7501534683855127, + "grad_norm": 0.2879343032836914, + "learning_rate": 8.5071275293617e-05, + "loss": 1.8476, + "step": 8960 + }, + { + "epoch": 2.750460405156538, + "grad_norm": 0.30646058917045593, + "learning_rate": 8.506773238971039e-05, + "loss": 1.7936, + "step": 8961 + }, + { + "epoch": 2.750767341927563, + "grad_norm": 0.309804230928421, + "learning_rate": 8.506418913924391e-05, + "loss": 1.8076, + "step": 8962 + }, + { + "epoch": 2.7510742786985882, + "grad_norm": 0.27035996317863464, + "learning_rate": 8.506064554225255e-05, + "loss": 1.8169, + "step": 8963 + }, + { + "epoch": 2.751381215469613, + "grad_norm": 0.3185548782348633, + "learning_rate": 8.505710159877134e-05, + "loss": 1.8265, + "step": 8964 + }, + { + "epoch": 2.7516881522406385, + "grad_norm": 0.3806973099708557, + "learning_rate": 8.505355730883532e-05, + "loss": 1.824, + "step": 8965 + }, + { + "epoch": 2.751995089011664, + 
"grad_norm": 0.3206372857093811, + "learning_rate": 8.505001267247949e-05, + "loss": 1.8436, + "step": 8966 + }, + { + "epoch": 2.7523020257826887, + "grad_norm": 0.2957460880279541, + "learning_rate": 8.504646768973889e-05, + "loss": 1.8212, + "step": 8967 + }, + { + "epoch": 2.752608962553714, + "grad_norm": 0.2854628562927246, + "learning_rate": 8.504292236064854e-05, + "loss": 1.862, + "step": 8968 + }, + { + "epoch": 2.752915899324739, + "grad_norm": 0.30056047439575195, + "learning_rate": 8.503937668524351e-05, + "loss": 1.8007, + "step": 8969 + }, + { + "epoch": 2.7532228360957642, + "grad_norm": 0.33884522318840027, + "learning_rate": 8.503583066355883e-05, + "loss": 1.8972, + "step": 8970 + }, + { + "epoch": 2.7535297728667896, + "grad_norm": 0.29358747601509094, + "learning_rate": 8.503228429562951e-05, + "loss": 1.8343, + "step": 8971 + }, + { + "epoch": 2.7538367096378145, + "grad_norm": 0.3650909662246704, + "learning_rate": 8.502873758149063e-05, + "loss": 1.7866, + "step": 8972 + }, + { + "epoch": 2.75414364640884, + "grad_norm": 0.3245839476585388, + "learning_rate": 8.502519052117725e-05, + "loss": 1.8451, + "step": 8973 + }, + { + "epoch": 2.7544505831798647, + "grad_norm": 0.305429071187973, + "learning_rate": 8.502164311472441e-05, + "loss": 1.9277, + "step": 8974 + }, + { + "epoch": 2.75475751995089, + "grad_norm": 0.3520638942718506, + "learning_rate": 8.501809536216716e-05, + "loss": 1.7648, + "step": 8975 + }, + { + "epoch": 2.7550644567219154, + "grad_norm": 0.419918030500412, + "learning_rate": 8.501454726354054e-05, + "loss": 1.7862, + "step": 8976 + }, + { + "epoch": 2.7553713934929407, + "grad_norm": 0.3854345977306366, + "learning_rate": 8.501099881887968e-05, + "loss": 1.8234, + "step": 8977 + }, + { + "epoch": 2.7556783302639656, + "grad_norm": 0.27826064825057983, + "learning_rate": 8.50074500282196e-05, + "loss": 1.7694, + "step": 8978 + }, + { + "epoch": 2.755985267034991, + "grad_norm": 0.3439055383205414, + "learning_rate": 
8.500390089159536e-05, + "loss": 1.8136, + "step": 8979 + }, + { + "epoch": 2.756292203806016, + "grad_norm": 0.3434913754463196, + "learning_rate": 8.500035140904208e-05, + "loss": 1.8053, + "step": 8980 + }, + { + "epoch": 2.756599140577041, + "grad_norm": 0.27551600337028503, + "learning_rate": 8.49968015805948e-05, + "loss": 1.8349, + "step": 8981 + }, + { + "epoch": 2.7569060773480665, + "grad_norm": 0.304706871509552, + "learning_rate": 8.499325140628863e-05, + "loss": 1.8488, + "step": 8982 + }, + { + "epoch": 2.7572130141190914, + "grad_norm": 0.36910584568977356, + "learning_rate": 8.498970088615861e-05, + "loss": 1.8519, + "step": 8983 + }, + { + "epoch": 2.7575199508901167, + "grad_norm": 0.30584999918937683, + "learning_rate": 8.498615002023987e-05, + "loss": 1.8479, + "step": 8984 + }, + { + "epoch": 2.7578268876611416, + "grad_norm": 0.28511542081832886, + "learning_rate": 8.498259880856749e-05, + "loss": 1.8047, + "step": 8985 + }, + { + "epoch": 2.758133824432167, + "grad_norm": 0.28804922103881836, + "learning_rate": 8.497904725117658e-05, + "loss": 1.891, + "step": 8986 + }, + { + "epoch": 2.7584407612031923, + "grad_norm": 0.32592445611953735, + "learning_rate": 8.497549534810221e-05, + "loss": 1.8081, + "step": 8987 + }, + { + "epoch": 2.758747697974217, + "grad_norm": 0.3298552632331848, + "learning_rate": 8.497194309937949e-05, + "loss": 1.8897, + "step": 8988 + }, + { + "epoch": 2.7590546347452425, + "grad_norm": 0.3506438136100769, + "learning_rate": 8.496839050504353e-05, + "loss": 1.9007, + "step": 8989 + }, + { + "epoch": 2.7593615715162674, + "grad_norm": 0.30891793966293335, + "learning_rate": 8.496483756512946e-05, + "loss": 1.8154, + "step": 8990 + }, + { + "epoch": 2.7596685082872927, + "grad_norm": 0.3697068691253662, + "learning_rate": 8.496128427967235e-05, + "loss": 1.8301, + "step": 8991 + }, + { + "epoch": 2.759975445058318, + "grad_norm": 0.3090182840824127, + "learning_rate": 8.495773064870734e-05, + "loss": 1.8443, + "step": 
8992 + }, + { + "epoch": 2.7602823818293434, + "grad_norm": 0.31172695755958557, + "learning_rate": 8.495417667226955e-05, + "loss": 1.8051, + "step": 8993 + }, + { + "epoch": 2.7605893186003683, + "grad_norm": 0.34285077452659607, + "learning_rate": 8.495062235039411e-05, + "loss": 1.8766, + "step": 8994 + }, + { + "epoch": 2.7608962553713936, + "grad_norm": 0.30001118779182434, + "learning_rate": 8.494706768311612e-05, + "loss": 1.8267, + "step": 8995 + }, + { + "epoch": 2.7612031921424185, + "grad_norm": 0.2767544984817505, + "learning_rate": 8.494351267047074e-05, + "loss": 1.8038, + "step": 8996 + }, + { + "epoch": 2.761510128913444, + "grad_norm": 0.2952648401260376, + "learning_rate": 8.493995731249307e-05, + "loss": 1.7863, + "step": 8997 + }, + { + "epoch": 2.761817065684469, + "grad_norm": 0.27491581439971924, + "learning_rate": 8.493640160921828e-05, + "loss": 1.844, + "step": 8998 + }, + { + "epoch": 2.762124002455494, + "grad_norm": 0.2733328938484192, + "learning_rate": 8.493284556068147e-05, + "loss": 1.7909, + "step": 8999 + }, + { + "epoch": 2.7624309392265194, + "grad_norm": 0.3201010525226593, + "learning_rate": 8.492928916691783e-05, + "loss": 1.8827, + "step": 9000 + }, + { + "epoch": 2.7627378759975443, + "grad_norm": 0.293652206659317, + "learning_rate": 8.492573242796244e-05, + "loss": 1.7755, + "step": 9001 + }, + { + "epoch": 2.7630448127685696, + "grad_norm": 0.2862321436405182, + "learning_rate": 8.492217534385053e-05, + "loss": 1.7868, + "step": 9002 + }, + { + "epoch": 2.763351749539595, + "grad_norm": 0.364490270614624, + "learning_rate": 8.491861791461722e-05, + "loss": 1.8276, + "step": 9003 + }, + { + "epoch": 2.7636586863106203, + "grad_norm": 0.4316955506801605, + "learning_rate": 8.491506014029765e-05, + "loss": 1.8727, + "step": 9004 + }, + { + "epoch": 2.763965623081645, + "grad_norm": 0.37957659363746643, + "learning_rate": 8.491150202092697e-05, + "loss": 1.8471, + "step": 9005 + }, + { + "epoch": 2.7642725598526705, + 
"grad_norm": 0.2936808168888092, + "learning_rate": 8.490794355654039e-05, + "loss": 1.7964, + "step": 9006 + }, + { + "epoch": 2.7645794966236954, + "grad_norm": 0.3742556869983673, + "learning_rate": 8.490438474717304e-05, + "loss": 1.8461, + "step": 9007 + }, + { + "epoch": 2.7648864333947207, + "grad_norm": 0.4273780286312103, + "learning_rate": 8.49008255928601e-05, + "loss": 1.7947, + "step": 9008 + }, + { + "epoch": 2.765193370165746, + "grad_norm": 0.35967808961868286, + "learning_rate": 8.489726609363675e-05, + "loss": 1.8125, + "step": 9009 + }, + { + "epoch": 2.765500306936771, + "grad_norm": 0.27607613801956177, + "learning_rate": 8.489370624953817e-05, + "loss": 1.8413, + "step": 9010 + }, + { + "epoch": 2.7658072437077963, + "grad_norm": 0.38287433981895447, + "learning_rate": 8.489014606059952e-05, + "loss": 1.8184, + "step": 9011 + }, + { + "epoch": 2.766114180478821, + "grad_norm": 0.4284100830554962, + "learning_rate": 8.4886585526856e-05, + "loss": 1.7965, + "step": 9012 + }, + { + "epoch": 2.7664211172498465, + "grad_norm": 0.35851627588272095, + "learning_rate": 8.48830246483428e-05, + "loss": 1.8275, + "step": 9013 + }, + { + "epoch": 2.766728054020872, + "grad_norm": 0.30598360300064087, + "learning_rate": 8.487946342509509e-05, + "loss": 1.8383, + "step": 9014 + }, + { + "epoch": 2.7670349907918967, + "grad_norm": 0.30098259449005127, + "learning_rate": 8.487590185714811e-05, + "loss": 1.8229, + "step": 9015 + }, + { + "epoch": 2.767341927562922, + "grad_norm": 0.45887723565101624, + "learning_rate": 8.487233994453701e-05, + "loss": 1.9128, + "step": 9016 + }, + { + "epoch": 2.767648864333947, + "grad_norm": 0.4983403980731964, + "learning_rate": 8.4868777687297e-05, + "loss": 1.8269, + "step": 9017 + }, + { + "epoch": 2.7679558011049723, + "grad_norm": 0.4925507605075836, + "learning_rate": 8.48652150854633e-05, + "loss": 1.9231, + "step": 9018 + }, + { + "epoch": 2.7682627378759976, + "grad_norm": 0.31434112787246704, + "learning_rate": 
8.48616521390711e-05, + "loss": 1.7782, + "step": 9019 + }, + { + "epoch": 2.768569674647023, + "grad_norm": 0.31802332401275635, + "learning_rate": 8.485808884815563e-05, + "loss": 1.8927, + "step": 9020 + }, + { + "epoch": 2.768876611418048, + "grad_norm": 0.4615871012210846, + "learning_rate": 8.485452521275208e-05, + "loss": 1.7866, + "step": 9021 + }, + { + "epoch": 2.769183548189073, + "grad_norm": 0.43722355365753174, + "learning_rate": 8.48509612328957e-05, + "loss": 1.8159, + "step": 9022 + }, + { + "epoch": 2.769490484960098, + "grad_norm": 0.27137285470962524, + "learning_rate": 8.484739690862169e-05, + "loss": 1.7613, + "step": 9023 + }, + { + "epoch": 2.7697974217311234, + "grad_norm": 0.32973676919937134, + "learning_rate": 8.484383223996528e-05, + "loss": 1.8321, + "step": 9024 + }, + { + "epoch": 2.7701043585021488, + "grad_norm": 0.38628003001213074, + "learning_rate": 8.484026722696169e-05, + "loss": 1.8154, + "step": 9025 + }, + { + "epoch": 2.7704112952731736, + "grad_norm": 0.33044543862342834, + "learning_rate": 8.483670186964617e-05, + "loss": 1.857, + "step": 9026 + }, + { + "epoch": 2.770718232044199, + "grad_norm": 0.2778245210647583, + "learning_rate": 8.483313616805393e-05, + "loss": 1.8524, + "step": 9027 + }, + { + "epoch": 2.771025168815224, + "grad_norm": 0.32064709067344666, + "learning_rate": 8.482957012222024e-05, + "loss": 1.8757, + "step": 9028 + }, + { + "epoch": 2.771332105586249, + "grad_norm": 0.29325249791145325, + "learning_rate": 8.48260037321803e-05, + "loss": 1.8504, + "step": 9029 + }, + { + "epoch": 2.7716390423572745, + "grad_norm": 0.308626651763916, + "learning_rate": 8.48224369979694e-05, + "loss": 1.882, + "step": 9030 + }, + { + "epoch": 2.7719459791282994, + "grad_norm": 0.34577706456184387, + "learning_rate": 8.481886991962276e-05, + "loss": 1.8178, + "step": 9031 + }, + { + "epoch": 2.7722529158993248, + "grad_norm": 0.3902320861816406, + "learning_rate": 8.481530249717564e-05, + "loss": 1.9111, + "step": 
9032 + }, + { + "epoch": 2.7725598526703497, + "grad_norm": 0.431540310382843, + "learning_rate": 8.481173473066328e-05, + "loss": 1.8145, + "step": 9033 + }, + { + "epoch": 2.772866789441375, + "grad_norm": 0.3637184798717499, + "learning_rate": 8.480816662012097e-05, + "loss": 1.8298, + "step": 9034 + }, + { + "epoch": 2.7731737262124003, + "grad_norm": 0.3045017123222351, + "learning_rate": 8.480459816558397e-05, + "loss": 1.8099, + "step": 9035 + }, + { + "epoch": 2.7734806629834257, + "grad_norm": 0.4252402186393738, + "learning_rate": 8.48010293670875e-05, + "loss": 1.8125, + "step": 9036 + }, + { + "epoch": 2.7737875997544506, + "grad_norm": 0.37933188676834106, + "learning_rate": 8.479746022466688e-05, + "loss": 1.8162, + "step": 9037 + }, + { + "epoch": 2.774094536525476, + "grad_norm": 0.287536084651947, + "learning_rate": 8.479389073835735e-05, + "loss": 1.8377, + "step": 9038 + }, + { + "epoch": 2.7744014732965008, + "grad_norm": 0.3484840393066406, + "learning_rate": 8.47903209081942e-05, + "loss": 1.8166, + "step": 9039 + }, + { + "epoch": 2.774708410067526, + "grad_norm": 0.4489477872848511, + "learning_rate": 8.478675073421272e-05, + "loss": 1.8618, + "step": 9040 + }, + { + "epoch": 2.7750153468385514, + "grad_norm": 0.3817744553089142, + "learning_rate": 8.478318021644817e-05, + "loss": 1.86, + "step": 9041 + }, + { + "epoch": 2.7753222836095763, + "grad_norm": 0.263468861579895, + "learning_rate": 8.477960935493585e-05, + "loss": 1.7802, + "step": 9042 + }, + { + "epoch": 2.7756292203806017, + "grad_norm": 0.3218925893306732, + "learning_rate": 8.477603814971104e-05, + "loss": 1.8056, + "step": 9043 + }, + { + "epoch": 2.7759361571516266, + "grad_norm": 0.38502782583236694, + "learning_rate": 8.477246660080905e-05, + "loss": 1.8405, + "step": 9044 + }, + { + "epoch": 2.776243093922652, + "grad_norm": 0.3504064381122589, + "learning_rate": 8.476889470826517e-05, + "loss": 1.8606, + "step": 9045 + }, + { + "epoch": 2.7765500306936772, + 
"grad_norm": 0.3007161021232605, + "learning_rate": 8.476532247211468e-05, + "loss": 1.8407, + "step": 9046 + }, + { + "epoch": 2.776856967464702, + "grad_norm": 0.30306726694107056, + "learning_rate": 8.476174989239289e-05, + "loss": 1.8399, + "step": 9047 + }, + { + "epoch": 2.7771639042357275, + "grad_norm": 0.3898545801639557, + "learning_rate": 8.475817696913511e-05, + "loss": 1.8971, + "step": 9048 + }, + { + "epoch": 2.7774708410067523, + "grad_norm": 0.35386478900909424, + "learning_rate": 8.475460370237667e-05, + "loss": 1.8213, + "step": 9049 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.35815873742103577, + "learning_rate": 8.475103009215287e-05, + "loss": 1.9593, + "step": 9050 + }, + { + "epoch": 2.778084714548803, + "grad_norm": 0.28021275997161865, + "learning_rate": 8.474745613849901e-05, + "loss": 1.7767, + "step": 9051 + }, + { + "epoch": 2.7783916513198283, + "grad_norm": 0.3393603563308716, + "learning_rate": 8.474388184145042e-05, + "loss": 1.8484, + "step": 9052 + }, + { + "epoch": 2.7786985880908532, + "grad_norm": 0.30488693714141846, + "learning_rate": 8.474030720104243e-05, + "loss": 1.835, + "step": 9053 + }, + { + "epoch": 2.7790055248618786, + "grad_norm": 0.2839586138725281, + "learning_rate": 8.473673221731037e-05, + "loss": 1.8054, + "step": 9054 + }, + { + "epoch": 2.7793124616329035, + "grad_norm": 0.2718851864337921, + "learning_rate": 8.473315689028955e-05, + "loss": 1.8216, + "step": 9055 + }, + { + "epoch": 2.779619398403929, + "grad_norm": 0.3072827458381653, + "learning_rate": 8.472958122001531e-05, + "loss": 1.8537, + "step": 9056 + }, + { + "epoch": 2.779926335174954, + "grad_norm": 0.36827966570854187, + "learning_rate": 8.472600520652301e-05, + "loss": 1.8174, + "step": 9057 + }, + { + "epoch": 2.780233271945979, + "grad_norm": 0.37436968088150024, + "learning_rate": 8.472242884984797e-05, + "loss": 1.7983, + "step": 9058 + }, + { + "epoch": 2.7805402087170044, + "grad_norm": 0.3039530813694, + "learning_rate": 
8.471885215002554e-05, + "loss": 1.839, + "step": 9059 + }, + { + "epoch": 2.7808471454880292, + "grad_norm": 0.2949865162372589, + "learning_rate": 8.471527510709106e-05, + "loss": 1.8191, + "step": 9060 + }, + { + "epoch": 2.7811540822590546, + "grad_norm": 0.2914051413536072, + "learning_rate": 8.471169772107987e-05, + "loss": 1.8511, + "step": 9061 + }, + { + "epoch": 2.78146101903008, + "grad_norm": 0.29169002175331116, + "learning_rate": 8.470811999202734e-05, + "loss": 1.8242, + "step": 9062 + }, + { + "epoch": 2.781767955801105, + "grad_norm": 0.2862909436225891, + "learning_rate": 8.470454191996884e-05, + "loss": 1.8471, + "step": 9063 + }, + { + "epoch": 2.78207489257213, + "grad_norm": 0.2820829749107361, + "learning_rate": 8.47009635049397e-05, + "loss": 1.8539, + "step": 9064 + }, + { + "epoch": 2.782381829343155, + "grad_norm": 0.2778072655200958, + "learning_rate": 8.469738474697532e-05, + "loss": 1.7999, + "step": 9065 + }, + { + "epoch": 2.7826887661141804, + "grad_norm": 0.35963353514671326, + "learning_rate": 8.469380564611103e-05, + "loss": 1.8589, + "step": 9066 + }, + { + "epoch": 2.7829957028852057, + "grad_norm": 0.29438379406929016, + "learning_rate": 8.469022620238223e-05, + "loss": 1.7898, + "step": 9067 + }, + { + "epoch": 2.783302639656231, + "grad_norm": 0.2766551971435547, + "learning_rate": 8.468664641582428e-05, + "loss": 1.858, + "step": 9068 + }, + { + "epoch": 2.783609576427256, + "grad_norm": 0.29893574118614197, + "learning_rate": 8.468306628647256e-05, + "loss": 1.7859, + "step": 9069 + }, + { + "epoch": 2.7839165131982813, + "grad_norm": 0.2744910717010498, + "learning_rate": 8.467948581436243e-05, + "loss": 1.7803, + "step": 9070 + }, + { + "epoch": 2.784223449969306, + "grad_norm": 0.2405908703804016, + "learning_rate": 8.467590499952931e-05, + "loss": 1.8064, + "step": 9071 + }, + { + "epoch": 2.7845303867403315, + "grad_norm": 0.28585049510002136, + "learning_rate": 8.467232384200858e-05, + "loss": 1.809, + "step": 9072 + 
}, + { + "epoch": 2.784837323511357, + "grad_norm": 0.25816819071769714, + "learning_rate": 8.466874234183562e-05, + "loss": 1.7687, + "step": 9073 + }, + { + "epoch": 2.7851442602823817, + "grad_norm": 0.3135145306587219, + "learning_rate": 8.466516049904582e-05, + "loss": 1.8902, + "step": 9074 + }, + { + "epoch": 2.785451197053407, + "grad_norm": 0.32004159688949585, + "learning_rate": 8.46615783136746e-05, + "loss": 1.8227, + "step": 9075 + }, + { + "epoch": 2.785758133824432, + "grad_norm": 0.2775251567363739, + "learning_rate": 8.465799578575733e-05, + "loss": 1.8293, + "step": 9076 + }, + { + "epoch": 2.7860650705954573, + "grad_norm": 0.3377391993999481, + "learning_rate": 8.465441291532944e-05, + "loss": 1.9096, + "step": 9077 + }, + { + "epoch": 2.7863720073664826, + "grad_norm": 0.322818398475647, + "learning_rate": 8.465082970242634e-05, + "loss": 1.8372, + "step": 9078 + }, + { + "epoch": 2.786678944137508, + "grad_norm": 0.30539727210998535, + "learning_rate": 8.464724614708342e-05, + "loss": 1.8678, + "step": 9079 + }, + { + "epoch": 2.786985880908533, + "grad_norm": 0.3148079216480255, + "learning_rate": 8.464366224933611e-05, + "loss": 1.798, + "step": 9080 + }, + { + "epoch": 2.787292817679558, + "grad_norm": 0.3834371566772461, + "learning_rate": 8.464007800921983e-05, + "loss": 1.7871, + "step": 9081 + }, + { + "epoch": 2.787599754450583, + "grad_norm": 0.360202431678772, + "learning_rate": 8.463649342676998e-05, + "loss": 1.8396, + "step": 9082 + }, + { + "epoch": 2.7879066912216084, + "grad_norm": 0.28360050916671753, + "learning_rate": 8.463290850202201e-05, + "loss": 1.7905, + "step": 9083 + }, + { + "epoch": 2.7882136279926337, + "grad_norm": 0.28087326884269714, + "learning_rate": 8.462932323501134e-05, + "loss": 1.8079, + "step": 9084 + }, + { + "epoch": 2.7885205647636586, + "grad_norm": 0.2725851833820343, + "learning_rate": 8.462573762577339e-05, + "loss": 1.8099, + "step": 9085 + }, + { + "epoch": 2.788827501534684, + "grad_norm": 
0.27776938676834106, + "learning_rate": 8.462215167434363e-05, + "loss": 1.8002, + "step": 9086 + }, + { + "epoch": 2.789134438305709, + "grad_norm": 0.3118545711040497, + "learning_rate": 8.461856538075745e-05, + "loss": 1.8541, + "step": 9087 + }, + { + "epoch": 2.789441375076734, + "grad_norm": 0.29499873518943787, + "learning_rate": 8.461497874505034e-05, + "loss": 1.8667, + "step": 9088 + }, + { + "epoch": 2.7897483118477595, + "grad_norm": 0.31346917152404785, + "learning_rate": 8.46113917672577e-05, + "loss": 1.8737, + "step": 9089 + }, + { + "epoch": 2.7900552486187844, + "grad_norm": 0.30406203866004944, + "learning_rate": 8.460780444741501e-05, + "loss": 1.8467, + "step": 9090 + }, + { + "epoch": 2.7903621853898097, + "grad_norm": 0.28438735008239746, + "learning_rate": 8.46042167855577e-05, + "loss": 1.8008, + "step": 9091 + }, + { + "epoch": 2.7906691221608346, + "grad_norm": 0.29893866181373596, + "learning_rate": 8.460062878172125e-05, + "loss": 1.8498, + "step": 9092 + }, + { + "epoch": 2.79097605893186, + "grad_norm": 0.33810749650001526, + "learning_rate": 8.459704043594112e-05, + "loss": 1.8259, + "step": 9093 + }, + { + "epoch": 2.7912829957028853, + "grad_norm": 0.3726813495159149, + "learning_rate": 8.459345174825273e-05, + "loss": 1.8831, + "step": 9094 + }, + { + "epoch": 2.7915899324739106, + "grad_norm": 0.2983379662036896, + "learning_rate": 8.45898627186916e-05, + "loss": 1.7886, + "step": 9095 + }, + { + "epoch": 2.7918968692449355, + "grad_norm": 0.3235681354999542, + "learning_rate": 8.458627334729316e-05, + "loss": 1.8616, + "step": 9096 + }, + { + "epoch": 2.792203806015961, + "grad_norm": 0.47961094975471497, + "learning_rate": 8.458268363409288e-05, + "loss": 1.8134, + "step": 9097 + }, + { + "epoch": 2.7925107427869857, + "grad_norm": 0.5463281869888306, + "learning_rate": 8.457909357912628e-05, + "loss": 1.8288, + "step": 9098 + }, + { + "epoch": 2.792817679558011, + "grad_norm": 0.5377171635627747, + "learning_rate": 
8.45755031824288e-05, + "loss": 1.8032, + "step": 9099 + }, + { + "epoch": 2.7931246163290364, + "grad_norm": 0.30159178376197815, + "learning_rate": 8.457191244403592e-05, + "loss": 1.7619, + "step": 9100 + }, + { + "epoch": 2.7934315531000613, + "grad_norm": 0.33798086643218994, + "learning_rate": 8.456832136398315e-05, + "loss": 1.839, + "step": 9101 + }, + { + "epoch": 2.7937384898710866, + "grad_norm": 0.5194488167762756, + "learning_rate": 8.456472994230595e-05, + "loss": 1.7908, + "step": 9102 + }, + { + "epoch": 2.7940454266421115, + "grad_norm": 0.49310582876205444, + "learning_rate": 8.456113817903986e-05, + "loss": 1.8471, + "step": 9103 + }, + { + "epoch": 2.794352363413137, + "grad_norm": 0.27490735054016113, + "learning_rate": 8.455754607422032e-05, + "loss": 1.8168, + "step": 9104 + }, + { + "epoch": 2.794659300184162, + "grad_norm": 0.3760504126548767, + "learning_rate": 8.455395362788285e-05, + "loss": 1.8796, + "step": 9105 + }, + { + "epoch": 2.794966236955187, + "grad_norm": 0.4636823534965515, + "learning_rate": 8.455036084006298e-05, + "loss": 1.8001, + "step": 9106 + }, + { + "epoch": 2.7952731737262124, + "grad_norm": 0.38666999340057373, + "learning_rate": 8.454676771079619e-05, + "loss": 1.8396, + "step": 9107 + }, + { + "epoch": 2.7955801104972373, + "grad_norm": 0.2992180585861206, + "learning_rate": 8.454317424011797e-05, + "loss": 1.8298, + "step": 9108 + }, + { + "epoch": 2.7958870472682626, + "grad_norm": 0.3744206428527832, + "learning_rate": 8.453958042806389e-05, + "loss": 1.8396, + "step": 9109 + }, + { + "epoch": 2.796193984039288, + "grad_norm": 0.5117284059524536, + "learning_rate": 8.453598627466941e-05, + "loss": 1.9734, + "step": 9110 + }, + { + "epoch": 2.7965009208103133, + "grad_norm": 0.36792969703674316, + "learning_rate": 8.453239177997008e-05, + "loss": 1.8347, + "step": 9111 + }, + { + "epoch": 2.796807857581338, + "grad_norm": 0.3352719843387604, + "learning_rate": 8.452879694400139e-05, + "loss": 1.7967, + "step": 
9112 + }, + { + "epoch": 2.7971147943523635, + "grad_norm": 0.45745235681533813, + "learning_rate": 8.452520176679893e-05, + "loss": 1.8484, + "step": 9113 + }, + { + "epoch": 2.7974217311233884, + "grad_norm": 0.43958255648612976, + "learning_rate": 8.452160624839816e-05, + "loss": 1.7954, + "step": 9114 + }, + { + "epoch": 2.7977286678944138, + "grad_norm": 0.28715837001800537, + "learning_rate": 8.451801038883467e-05, + "loss": 1.8088, + "step": 9115 + }, + { + "epoch": 2.798035604665439, + "grad_norm": 0.3552972078323364, + "learning_rate": 8.451441418814394e-05, + "loss": 1.7654, + "step": 9116 + }, + { + "epoch": 2.798342541436464, + "grad_norm": 0.5065462589263916, + "learning_rate": 8.451081764636156e-05, + "loss": 1.7841, + "step": 9117 + }, + { + "epoch": 2.7986494782074893, + "grad_norm": 0.48900917172431946, + "learning_rate": 8.450722076352306e-05, + "loss": 1.8709, + "step": 9118 + }, + { + "epoch": 2.798956414978514, + "grad_norm": 0.31420227885246277, + "learning_rate": 8.450362353966395e-05, + "loss": 1.9057, + "step": 9119 + }, + { + "epoch": 2.7992633517495396, + "grad_norm": 0.35886913537979126, + "learning_rate": 8.450002597481982e-05, + "loss": 1.877, + "step": 9120 + }, + { + "epoch": 2.799570288520565, + "grad_norm": 0.3822213113307953, + "learning_rate": 8.449642806902623e-05, + "loss": 1.9171, + "step": 9121 + }, + { + "epoch": 2.7998772252915898, + "grad_norm": 0.3286183476448059, + "learning_rate": 8.449282982231869e-05, + "loss": 1.8342, + "step": 9122 + }, + { + "epoch": 2.800184162062615, + "grad_norm": 0.3498966693878174, + "learning_rate": 8.448923123473282e-05, + "loss": 1.8276, + "step": 9123 + }, + { + "epoch": 2.80049109883364, + "grad_norm": 0.3550187647342682, + "learning_rate": 8.448563230630413e-05, + "loss": 1.8585, + "step": 9124 + }, + { + "epoch": 2.8007980356046653, + "grad_norm": 0.32100117206573486, + "learning_rate": 8.448203303706821e-05, + "loss": 1.8168, + "step": 9125 + }, + { + "epoch": 2.8011049723756907, + 
"grad_norm": 0.3859860301017761, + "learning_rate": 8.447843342706063e-05, + "loss": 1.8941, + "step": 9126 + }, + { + "epoch": 2.801411909146716, + "grad_norm": 0.41674432158470154, + "learning_rate": 8.447483347631697e-05, + "loss": 1.7894, + "step": 9127 + }, + { + "epoch": 2.801718845917741, + "grad_norm": 0.3324837386608124, + "learning_rate": 8.44712331848728e-05, + "loss": 1.8901, + "step": 9128 + }, + { + "epoch": 2.8020257826887662, + "grad_norm": 0.30357789993286133, + "learning_rate": 8.44676325527637e-05, + "loss": 1.8434, + "step": 9129 + }, + { + "epoch": 2.802332719459791, + "grad_norm": 0.3215816617012024, + "learning_rate": 8.446403158002525e-05, + "loss": 1.8291, + "step": 9130 + }, + { + "epoch": 2.8026396562308165, + "grad_norm": 0.26280832290649414, + "learning_rate": 8.446043026669303e-05, + "loss": 1.7934, + "step": 9131 + }, + { + "epoch": 2.802946593001842, + "grad_norm": 0.2963539659976959, + "learning_rate": 8.445682861280265e-05, + "loss": 1.824, + "step": 9132 + }, + { + "epoch": 2.8032535297728667, + "grad_norm": 0.4251864552497864, + "learning_rate": 8.44532266183897e-05, + "loss": 1.9, + "step": 9133 + }, + { + "epoch": 2.803560466543892, + "grad_norm": 0.3920140862464905, + "learning_rate": 8.444962428348978e-05, + "loss": 1.7753, + "step": 9134 + }, + { + "epoch": 2.803867403314917, + "grad_norm": 0.2614890933036804, + "learning_rate": 8.444602160813845e-05, + "loss": 1.844, + "step": 9135 + }, + { + "epoch": 2.8041743400859422, + "grad_norm": 0.3359995484352112, + "learning_rate": 8.444241859237135e-05, + "loss": 1.8636, + "step": 9136 + }, + { + "epoch": 2.8044812768569676, + "grad_norm": 0.34399285912513733, + "learning_rate": 8.44388152362241e-05, + "loss": 1.8304, + "step": 9137 + }, + { + "epoch": 2.804788213627993, + "grad_norm": 0.27815961837768555, + "learning_rate": 8.443521153973228e-05, + "loss": 1.7916, + "step": 9138 + }, + { + "epoch": 2.805095150399018, + "grad_norm": 0.40705251693725586, + "learning_rate": 
8.443160750293152e-05, + "loss": 1.7707, + "step": 9139 + }, + { + "epoch": 2.805402087170043, + "grad_norm": 0.49512532353401184, + "learning_rate": 8.442800312585744e-05, + "loss": 1.866, + "step": 9140 + }, + { + "epoch": 2.805709023941068, + "grad_norm": 0.31373831629753113, + "learning_rate": 8.442439840854565e-05, + "loss": 1.8495, + "step": 9141 + }, + { + "epoch": 2.8060159607120934, + "grad_norm": 0.33470213413238525, + "learning_rate": 8.442079335103177e-05, + "loss": 1.8459, + "step": 9142 + }, + { + "epoch": 2.8063228974831187, + "grad_norm": 0.4092586636543274, + "learning_rate": 8.441718795335145e-05, + "loss": 1.8547, + "step": 9143 + }, + { + "epoch": 2.8066298342541436, + "grad_norm": 0.37220728397369385, + "learning_rate": 8.44135822155403e-05, + "loss": 1.8922, + "step": 9144 + }, + { + "epoch": 2.806936771025169, + "grad_norm": 0.3197399973869324, + "learning_rate": 8.440997613763395e-05, + "loss": 1.872, + "step": 9145 + }, + { + "epoch": 2.807243707796194, + "grad_norm": 0.31258881092071533, + "learning_rate": 8.440636971966805e-05, + "loss": 1.8394, + "step": 9146 + }, + { + "epoch": 2.807550644567219, + "grad_norm": 0.31450721621513367, + "learning_rate": 8.440276296167825e-05, + "loss": 1.8496, + "step": 9147 + }, + { + "epoch": 2.8078575813382445, + "grad_norm": 0.30959805846214294, + "learning_rate": 8.439915586370018e-05, + "loss": 1.8326, + "step": 9148 + }, + { + "epoch": 2.8081645181092694, + "grad_norm": 0.2942456901073456, + "learning_rate": 8.439554842576949e-05, + "loss": 1.8742, + "step": 9149 + }, + { + "epoch": 2.8084714548802947, + "grad_norm": 0.32378795742988586, + "learning_rate": 8.439194064792182e-05, + "loss": 1.7991, + "step": 9150 + }, + { + "epoch": 2.8087783916513196, + "grad_norm": 0.30733996629714966, + "learning_rate": 8.438833253019285e-05, + "loss": 1.8822, + "step": 9151 + }, + { + "epoch": 2.809085328422345, + "grad_norm": 0.29933521151542664, + "learning_rate": 8.438472407261821e-05, + "loss": 1.7785, + 
"step": 9152 + }, + { + "epoch": 2.8093922651933703, + "grad_norm": 0.2992005944252014, + "learning_rate": 8.438111527523358e-05, + "loss": 1.9056, + "step": 9153 + }, + { + "epoch": 2.8096992019643956, + "grad_norm": 0.3074969947338104, + "learning_rate": 8.43775061380746e-05, + "loss": 1.8283, + "step": 9154 + }, + { + "epoch": 2.8100061387354205, + "grad_norm": 0.29843345284461975, + "learning_rate": 8.437389666117699e-05, + "loss": 1.87, + "step": 9155 + }, + { + "epoch": 2.810313075506446, + "grad_norm": 0.2939853072166443, + "learning_rate": 8.437028684457635e-05, + "loss": 1.8657, + "step": 9156 + }, + { + "epoch": 2.8106200122774707, + "grad_norm": 0.292972207069397, + "learning_rate": 8.436667668830841e-05, + "loss": 1.821, + "step": 9157 + }, + { + "epoch": 2.810926949048496, + "grad_norm": 0.298244833946228, + "learning_rate": 8.436306619240882e-05, + "loss": 1.8531, + "step": 9158 + }, + { + "epoch": 2.8112338858195214, + "grad_norm": 0.28567394614219666, + "learning_rate": 8.435945535691328e-05, + "loss": 1.7719, + "step": 9159 + }, + { + "epoch": 2.8115408225905463, + "grad_norm": 0.2876092493534088, + "learning_rate": 8.435584418185745e-05, + "loss": 1.7622, + "step": 9160 + }, + { + "epoch": 2.8118477593615716, + "grad_norm": 0.2656804919242859, + "learning_rate": 8.435223266727704e-05, + "loss": 1.7624, + "step": 9161 + }, + { + "epoch": 2.8121546961325965, + "grad_norm": 0.26690298318862915, + "learning_rate": 8.434862081320774e-05, + "loss": 1.807, + "step": 9162 + }, + { + "epoch": 2.812461632903622, + "grad_norm": 0.3088238537311554, + "learning_rate": 8.434500861968521e-05, + "loss": 1.9214, + "step": 9163 + }, + { + "epoch": 2.812768569674647, + "grad_norm": 0.32310751080513, + "learning_rate": 8.43413960867452e-05, + "loss": 1.8341, + "step": 9164 + }, + { + "epoch": 2.813075506445672, + "grad_norm": 0.3028428554534912, + "learning_rate": 8.433778321442339e-05, + "loss": 1.8316, + "step": 9165 + }, + { + "epoch": 2.8133824432166974, + 
"grad_norm": 0.28363901376724243, + "learning_rate": 8.433417000275545e-05, + "loss": 1.8506, + "step": 9166 + }, + { + "epoch": 2.8136893799877223, + "grad_norm": 0.2976547181606293, + "learning_rate": 8.433055645177714e-05, + "loss": 1.8654, + "step": 9167 + }, + { + "epoch": 2.8139963167587476, + "grad_norm": 0.2945725619792938, + "learning_rate": 8.432694256152414e-05, + "loss": 1.8146, + "step": 9168 + }, + { + "epoch": 2.814303253529773, + "grad_norm": 0.30364149808883667, + "learning_rate": 8.432332833203217e-05, + "loss": 1.8152, + "step": 9169 + }, + { + "epoch": 2.8146101903007983, + "grad_norm": 0.2776038348674774, + "learning_rate": 8.431971376333699e-05, + "loss": 1.7723, + "step": 9170 + }, + { + "epoch": 2.814917127071823, + "grad_norm": 0.41802000999450684, + "learning_rate": 8.431609885547425e-05, + "loss": 1.7909, + "step": 9171 + }, + { + "epoch": 2.8152240638428485, + "grad_norm": 0.400622695684433, + "learning_rate": 8.43124836084797e-05, + "loss": 1.8241, + "step": 9172 + }, + { + "epoch": 2.8155310006138734, + "grad_norm": 0.3760300576686859, + "learning_rate": 8.430886802238908e-05, + "loss": 1.9298, + "step": 9173 + }, + { + "epoch": 2.8158379373848987, + "grad_norm": 0.2944977283477783, + "learning_rate": 8.430525209723813e-05, + "loss": 1.8181, + "step": 9174 + }, + { + "epoch": 2.816144874155924, + "grad_norm": 0.28091785311698914, + "learning_rate": 8.430163583306257e-05, + "loss": 1.8178, + "step": 9175 + }, + { + "epoch": 2.816451810926949, + "grad_norm": 0.33689528703689575, + "learning_rate": 8.429801922989812e-05, + "loss": 1.8195, + "step": 9176 + }, + { + "epoch": 2.8167587476979743, + "grad_norm": 0.3541412055492401, + "learning_rate": 8.429440228778058e-05, + "loss": 1.8951, + "step": 9177 + }, + { + "epoch": 2.817065684468999, + "grad_norm": 0.2846376299858093, + "learning_rate": 8.429078500674564e-05, + "loss": 1.7858, + "step": 9178 + }, + { + "epoch": 2.8173726212400245, + "grad_norm": 0.28097108006477356, + 
"learning_rate": 8.428716738682905e-05, + "loss": 1.8503, + "step": 9179 + }, + { + "epoch": 2.81767955801105, + "grad_norm": 0.354670912027359, + "learning_rate": 8.428354942806658e-05, + "loss": 1.8332, + "step": 9180 + }, + { + "epoch": 2.8179864947820747, + "grad_norm": 0.3589770793914795, + "learning_rate": 8.427993113049397e-05, + "loss": 1.8527, + "step": 9181 + }, + { + "epoch": 2.8182934315531, + "grad_norm": 0.3171144723892212, + "learning_rate": 8.4276312494147e-05, + "loss": 1.789, + "step": 9182 + }, + { + "epoch": 2.818600368324125, + "grad_norm": 0.3540917932987213, + "learning_rate": 8.427269351906143e-05, + "loss": 1.8338, + "step": 9183 + }, + { + "epoch": 2.8189073050951503, + "grad_norm": 0.34149861335754395, + "learning_rate": 8.426907420527302e-05, + "loss": 1.8202, + "step": 9184 + }, + { + "epoch": 2.8192142418661756, + "grad_norm": 0.3035878837108612, + "learning_rate": 8.426545455281751e-05, + "loss": 1.842, + "step": 9185 + }, + { + "epoch": 2.819521178637201, + "grad_norm": 0.29007625579833984, + "learning_rate": 8.426183456173072e-05, + "loss": 1.8486, + "step": 9186 + }, + { + "epoch": 2.819828115408226, + "grad_norm": 0.3066602647304535, + "learning_rate": 8.425821423204837e-05, + "loss": 1.7833, + "step": 9187 + }, + { + "epoch": 2.820135052179251, + "grad_norm": 0.3163747191429138, + "learning_rate": 8.425459356380627e-05, + "loss": 1.8037, + "step": 9188 + }, + { + "epoch": 2.820441988950276, + "grad_norm": 0.3282648026943207, + "learning_rate": 8.425097255704022e-05, + "loss": 1.8476, + "step": 9189 + }, + { + "epoch": 2.8207489257213014, + "grad_norm": 0.3573009669780731, + "learning_rate": 8.424735121178598e-05, + "loss": 1.87, + "step": 9190 + }, + { + "epoch": 2.8210558624923268, + "grad_norm": 0.3480490744113922, + "learning_rate": 8.424372952807933e-05, + "loss": 1.8773, + "step": 9191 + }, + { + "epoch": 2.8213627992633517, + "grad_norm": 0.3296821415424347, + "learning_rate": 8.424010750595608e-05, + "loss": 1.8775, + 
"step": 9192 + }, + { + "epoch": 2.821669736034377, + "grad_norm": 0.33366382122039795, + "learning_rate": 8.423648514545202e-05, + "loss": 1.8064, + "step": 9193 + }, + { + "epoch": 2.821976672805402, + "grad_norm": 0.454303503036499, + "learning_rate": 8.423286244660295e-05, + "loss": 1.9702, + "step": 9194 + }, + { + "epoch": 2.822283609576427, + "grad_norm": 0.361215740442276, + "learning_rate": 8.422923940944466e-05, + "loss": 1.8055, + "step": 9195 + }, + { + "epoch": 2.8225905463474525, + "grad_norm": 0.3678447902202606, + "learning_rate": 8.422561603401297e-05, + "loss": 1.8924, + "step": 9196 + }, + { + "epoch": 2.8228974831184774, + "grad_norm": 0.32999005913734436, + "learning_rate": 8.422199232034369e-05, + "loss": 1.7887, + "step": 9197 + }, + { + "epoch": 2.8232044198895028, + "grad_norm": 0.2811618149280548, + "learning_rate": 8.42183682684726e-05, + "loss": 1.8166, + "step": 9198 + }, + { + "epoch": 2.8235113566605277, + "grad_norm": 0.3178839385509491, + "learning_rate": 8.421474387843555e-05, + "loss": 1.7868, + "step": 9199 + }, + { + "epoch": 2.823818293431553, + "grad_norm": 0.27299264073371887, + "learning_rate": 8.421111915026836e-05, + "loss": 1.816, + "step": 9200 + }, + { + "epoch": 2.8241252302025783, + "grad_norm": 0.3191591203212738, + "learning_rate": 8.420749408400684e-05, + "loss": 1.912, + "step": 9201 + }, + { + "epoch": 2.8244321669736037, + "grad_norm": 0.3638809323310852, + "learning_rate": 8.42038686796868e-05, + "loss": 1.7716, + "step": 9202 + }, + { + "epoch": 2.8247391037446286, + "grad_norm": 0.33573171496391296, + "learning_rate": 8.420024293734407e-05, + "loss": 1.8599, + "step": 9203 + }, + { + "epoch": 2.825046040515654, + "grad_norm": 0.29062843322753906, + "learning_rate": 8.419661685701452e-05, + "loss": 1.7982, + "step": 9204 + }, + { + "epoch": 2.825352977286679, + "grad_norm": 0.27475887537002563, + "learning_rate": 8.419299043873394e-05, + "loss": 1.7763, + "step": 9205 + }, + { + "epoch": 2.825659914057704, + 
"grad_norm": 0.2996850609779358, + "learning_rate": 8.41893636825382e-05, + "loss": 1.7957, + "step": 9206 + }, + { + "epoch": 2.8259668508287294, + "grad_norm": 0.38112908601760864, + "learning_rate": 8.418573658846314e-05, + "loss": 1.8536, + "step": 9207 + }, + { + "epoch": 2.8262737875997543, + "grad_norm": 0.3245584964752197, + "learning_rate": 8.418210915654456e-05, + "loss": 1.8254, + "step": 9208 + }, + { + "epoch": 2.8265807243707797, + "grad_norm": 0.24600234627723694, + "learning_rate": 8.417848138681837e-05, + "loss": 1.825, + "step": 9209 + }, + { + "epoch": 2.8268876611418046, + "grad_norm": 0.3130429685115814, + "learning_rate": 8.417485327932038e-05, + "loss": 1.7954, + "step": 9210 + }, + { + "epoch": 2.82719459791283, + "grad_norm": 0.3218819200992584, + "learning_rate": 8.417122483408647e-05, + "loss": 1.8343, + "step": 9211 + }, + { + "epoch": 2.8275015346838552, + "grad_norm": 0.3020598292350769, + "learning_rate": 8.416759605115248e-05, + "loss": 1.8547, + "step": 9212 + }, + { + "epoch": 2.8278084714548806, + "grad_norm": 0.2685437798500061, + "learning_rate": 8.416396693055429e-05, + "loss": 1.7828, + "step": 9213 + }, + { + "epoch": 2.8281154082259055, + "grad_norm": 0.2990378737449646, + "learning_rate": 8.416033747232775e-05, + "loss": 1.8108, + "step": 9214 + }, + { + "epoch": 2.828422344996931, + "grad_norm": 0.25395238399505615, + "learning_rate": 8.415670767650871e-05, + "loss": 1.786, + "step": 9215 + }, + { + "epoch": 2.8287292817679557, + "grad_norm": 0.3406725823879242, + "learning_rate": 8.41530775431331e-05, + "loss": 1.9015, + "step": 9216 + }, + { + "epoch": 2.829036218538981, + "grad_norm": 0.279859721660614, + "learning_rate": 8.414944707223676e-05, + "loss": 1.8639, + "step": 9217 + }, + { + "epoch": 2.8293431553100064, + "grad_norm": 0.2574310600757599, + "learning_rate": 8.414581626385554e-05, + "loss": 1.7595, + "step": 9218 + }, + { + "epoch": 2.8296500920810312, + "grad_norm": 0.2956291437149048, + "learning_rate": 
8.414218511802537e-05, + "loss": 1.8418, + "step": 9219 + }, + { + "epoch": 2.8299570288520566, + "grad_norm": 0.30965283513069153, + "learning_rate": 8.41385536347821e-05, + "loss": 1.8241, + "step": 9220 + }, + { + "epoch": 2.8302639656230815, + "grad_norm": 0.3125357925891876, + "learning_rate": 8.413492181416166e-05, + "loss": 1.7961, + "step": 9221 + }, + { + "epoch": 2.830570902394107, + "grad_norm": 0.23901188373565674, + "learning_rate": 8.413128965619988e-05, + "loss": 1.8109, + "step": 9222 + }, + { + "epoch": 2.830877839165132, + "grad_norm": 0.26556700468063354, + "learning_rate": 8.412765716093272e-05, + "loss": 1.8756, + "step": 9223 + }, + { + "epoch": 2.831184775936157, + "grad_norm": 0.3080972731113434, + "learning_rate": 8.412402432839604e-05, + "loss": 1.8271, + "step": 9224 + }, + { + "epoch": 2.8314917127071824, + "grad_norm": 0.32894501090049744, + "learning_rate": 8.412039115862573e-05, + "loss": 1.8427, + "step": 9225 + }, + { + "epoch": 2.8317986494782073, + "grad_norm": 0.3136049509048462, + "learning_rate": 8.411675765165774e-05, + "loss": 1.8716, + "step": 9226 + }, + { + "epoch": 2.8321055862492326, + "grad_norm": 0.26859185099601746, + "learning_rate": 8.411312380752795e-05, + "loss": 1.8138, + "step": 9227 + }, + { + "epoch": 2.832412523020258, + "grad_norm": 0.26863718032836914, + "learning_rate": 8.410948962627227e-05, + "loss": 1.8286, + "step": 9228 + }, + { + "epoch": 2.8327194597912833, + "grad_norm": 0.25599852204322815, + "learning_rate": 8.410585510792663e-05, + "loss": 1.8274, + "step": 9229 + }, + { + "epoch": 2.833026396562308, + "grad_norm": 0.22787287831306458, + "learning_rate": 8.410222025252694e-05, + "loss": 1.7961, + "step": 9230 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.22957643866539001, + "learning_rate": 8.409858506010912e-05, + "loss": 1.7763, + "step": 9231 + }, + { + "epoch": 2.8336402701043584, + "grad_norm": 0.2794438302516937, + "learning_rate": 8.409494953070909e-05, + "loss": 1.8552, + 
"step": 9232 + }, + { + "epoch": 2.8339472068753837, + "grad_norm": 0.2755461037158966, + "learning_rate": 8.409131366436279e-05, + "loss": 1.8418, + "step": 9233 + }, + { + "epoch": 2.834254143646409, + "grad_norm": 0.27968719601631165, + "learning_rate": 8.408767746110616e-05, + "loss": 1.8774, + "step": 9234 + }, + { + "epoch": 2.834561080417434, + "grad_norm": 0.3014982044696808, + "learning_rate": 8.408404092097511e-05, + "loss": 1.8886, + "step": 9235 + }, + { + "epoch": 2.8348680171884593, + "grad_norm": 0.3139450252056122, + "learning_rate": 8.408040404400558e-05, + "loss": 1.8119, + "step": 9236 + }, + { + "epoch": 2.835174953959484, + "grad_norm": 0.43578827381134033, + "learning_rate": 8.407676683023353e-05, + "loss": 1.8173, + "step": 9237 + }, + { + "epoch": 2.8354818907305095, + "grad_norm": 0.4939953088760376, + "learning_rate": 8.407312927969489e-05, + "loss": 1.8647, + "step": 9238 + }, + { + "epoch": 2.835788827501535, + "grad_norm": 0.40801018476486206, + "learning_rate": 8.406949139242562e-05, + "loss": 1.8259, + "step": 9239 + }, + { + "epoch": 2.8360957642725597, + "grad_norm": 0.331249862909317, + "learning_rate": 8.406585316846168e-05, + "loss": 1.8727, + "step": 9240 + }, + { + "epoch": 2.836402701043585, + "grad_norm": 0.3368569314479828, + "learning_rate": 8.406221460783901e-05, + "loss": 1.8362, + "step": 9241 + }, + { + "epoch": 2.83670963781461, + "grad_norm": 0.4736326336860657, + "learning_rate": 8.405857571059355e-05, + "loss": 1.9543, + "step": 9242 + }, + { + "epoch": 2.8370165745856353, + "grad_norm": 0.4151712656021118, + "learning_rate": 8.405493647676131e-05, + "loss": 1.8764, + "step": 9243 + }, + { + "epoch": 2.8373235113566606, + "grad_norm": 0.3463367819786072, + "learning_rate": 8.405129690637821e-05, + "loss": 1.8578, + "step": 9244 + }, + { + "epoch": 2.837630448127686, + "grad_norm": 0.28701671957969666, + "learning_rate": 8.404765699948023e-05, + "loss": 1.8201, + "step": 9245 + }, + { + "epoch": 2.837937384898711, + 
"grad_norm": 0.2893613874912262, + "learning_rate": 8.404401675610336e-05, + "loss": 1.7918, + "step": 9246 + }, + { + "epoch": 2.838244321669736, + "grad_norm": 0.29359766840934753, + "learning_rate": 8.404037617628357e-05, + "loss": 1.7919, + "step": 9247 + }, + { + "epoch": 2.838551258440761, + "grad_norm": 0.30147913098335266, + "learning_rate": 8.403673526005682e-05, + "loss": 1.8227, + "step": 9248 + }, + { + "epoch": 2.8388581952117864, + "grad_norm": 0.28443291783332825, + "learning_rate": 8.403309400745908e-05, + "loss": 1.8128, + "step": 9249 + }, + { + "epoch": 2.8391651319828117, + "grad_norm": 0.27890142798423767, + "learning_rate": 8.40294524185264e-05, + "loss": 1.8109, + "step": 9250 + }, + { + "epoch": 2.8394720687538366, + "grad_norm": 0.29900890588760376, + "learning_rate": 8.402581049329471e-05, + "loss": 1.7852, + "step": 9251 + }, + { + "epoch": 2.839779005524862, + "grad_norm": 0.34249019622802734, + "learning_rate": 8.402216823180001e-05, + "loss": 1.8681, + "step": 9252 + }, + { + "epoch": 2.840085942295887, + "grad_norm": 0.3387257754802704, + "learning_rate": 8.40185256340783e-05, + "loss": 1.9171, + "step": 9253 + }, + { + "epoch": 2.840392879066912, + "grad_norm": 0.2831752598285675, + "learning_rate": 8.40148827001656e-05, + "loss": 1.8422, + "step": 9254 + }, + { + "epoch": 2.8406998158379375, + "grad_norm": 0.30895891785621643, + "learning_rate": 8.401123943009788e-05, + "loss": 1.7967, + "step": 9255 + }, + { + "epoch": 2.8410067526089624, + "grad_norm": 0.381154328584671, + "learning_rate": 8.400759582391116e-05, + "loss": 1.8359, + "step": 9256 + }, + { + "epoch": 2.8413136893799877, + "grad_norm": 0.4041622281074524, + "learning_rate": 8.400395188164144e-05, + "loss": 1.8306, + "step": 9257 + }, + { + "epoch": 2.8416206261510126, + "grad_norm": 0.3801247775554657, + "learning_rate": 8.400030760332474e-05, + "loss": 1.8696, + "step": 9258 + }, + { + "epoch": 2.841927562922038, + "grad_norm": 0.27382874488830566, + "learning_rate": 
8.399666298899706e-05, + "loss": 1.8369, + "step": 9259 + }, + { + "epoch": 2.8422344996930633, + "grad_norm": 0.31395214796066284, + "learning_rate": 8.399301803869445e-05, + "loss": 1.8135, + "step": 9260 + }, + { + "epoch": 2.8425414364640886, + "grad_norm": 0.36473774909973145, + "learning_rate": 8.398937275245291e-05, + "loss": 1.8025, + "step": 9261 + }, + { + "epoch": 2.8428483732351135, + "grad_norm": 0.38420331478118896, + "learning_rate": 8.398572713030846e-05, + "loss": 1.7873, + "step": 9262 + }, + { + "epoch": 2.843155310006139, + "grad_norm": 0.2707001566886902, + "learning_rate": 8.398208117229714e-05, + "loss": 1.8071, + "step": 9263 + }, + { + "epoch": 2.8434622467771637, + "grad_norm": 0.3391258418560028, + "learning_rate": 8.397843487845496e-05, + "loss": 1.8186, + "step": 9264 + }, + { + "epoch": 2.843769183548189, + "grad_norm": 0.4473530650138855, + "learning_rate": 8.397478824881799e-05, + "loss": 1.9144, + "step": 9265 + }, + { + "epoch": 2.8440761203192144, + "grad_norm": 0.3141709268093109, + "learning_rate": 8.397114128342224e-05, + "loss": 1.77, + "step": 9266 + }, + { + "epoch": 2.8443830570902393, + "grad_norm": 0.29191854596138, + "learning_rate": 8.396749398230377e-05, + "loss": 1.8645, + "step": 9267 + }, + { + "epoch": 2.8446899938612646, + "grad_norm": 0.4399743676185608, + "learning_rate": 8.39638463454986e-05, + "loss": 1.8261, + "step": 9268 + }, + { + "epoch": 2.8449969306322895, + "grad_norm": 0.4741196036338806, + "learning_rate": 8.396019837304281e-05, + "loss": 1.8566, + "step": 9269 + }, + { + "epoch": 2.845303867403315, + "grad_norm": 0.39640361070632935, + "learning_rate": 8.395655006497243e-05, + "loss": 1.8062, + "step": 9270 + }, + { + "epoch": 2.84561080417434, + "grad_norm": 0.290171355009079, + "learning_rate": 8.39529014213235e-05, + "loss": 1.8463, + "step": 9271 + }, + { + "epoch": 2.845917740945365, + "grad_norm": 0.2773928940296173, + "learning_rate": 8.394925244213212e-05, + "loss": 1.7929, + "step": 9272 + 
}, + { + "epoch": 2.8462246777163904, + "grad_norm": 0.38512173295021057, + "learning_rate": 8.394560312743433e-05, + "loss": 1.8724, + "step": 9273 + }, + { + "epoch": 2.8465316144874153, + "grad_norm": 0.44405680894851685, + "learning_rate": 8.394195347726619e-05, + "loss": 1.8184, + "step": 9274 + }, + { + "epoch": 2.8468385512584407, + "grad_norm": 0.32526880502700806, + "learning_rate": 8.393830349166376e-05, + "loss": 1.8207, + "step": 9275 + }, + { + "epoch": 2.847145488029466, + "grad_norm": 0.2934194803237915, + "learning_rate": 8.393465317066313e-05, + "loss": 1.8023, + "step": 9276 + }, + { + "epoch": 2.8474524248004913, + "grad_norm": 0.43126001954078674, + "learning_rate": 8.393100251430037e-05, + "loss": 1.8283, + "step": 9277 + }, + { + "epoch": 2.847759361571516, + "grad_norm": 0.48253729939460754, + "learning_rate": 8.392735152261157e-05, + "loss": 1.8359, + "step": 9278 + }, + { + "epoch": 2.8480662983425415, + "grad_norm": 0.3736251890659332, + "learning_rate": 8.392370019563279e-05, + "loss": 1.8553, + "step": 9279 + }, + { + "epoch": 2.8483732351135664, + "grad_norm": 0.33329901099205017, + "learning_rate": 8.39200485334001e-05, + "loss": 1.8156, + "step": 9280 + }, + { + "epoch": 2.8486801718845918, + "grad_norm": 0.42538657784461975, + "learning_rate": 8.391639653594963e-05, + "loss": 1.7812, + "step": 9281 + }, + { + "epoch": 2.848987108655617, + "grad_norm": 0.39076727628707886, + "learning_rate": 8.391274420331744e-05, + "loss": 1.8027, + "step": 9282 + }, + { + "epoch": 2.849294045426642, + "grad_norm": 0.3558272123336792, + "learning_rate": 8.390909153553963e-05, + "loss": 1.8448, + "step": 9283 + }, + { + "epoch": 2.8496009821976673, + "grad_norm": 0.26782071590423584, + "learning_rate": 8.390543853265232e-05, + "loss": 1.7995, + "step": 9284 + }, + { + "epoch": 2.849907918968692, + "grad_norm": 0.3449724614620209, + "learning_rate": 8.390178519469158e-05, + "loss": 1.7888, + "step": 9285 + }, + { + "epoch": 2.8502148557397176, + 
"grad_norm": 0.36390578746795654, + "learning_rate": 8.389813152169355e-05, + "loss": 1.8072, + "step": 9286 + }, + { + "epoch": 2.850521792510743, + "grad_norm": 0.31959423422813416, + "learning_rate": 8.389447751369428e-05, + "loss": 1.8513, + "step": 9287 + }, + { + "epoch": 2.8508287292817682, + "grad_norm": 0.2717762589454651, + "learning_rate": 8.389082317072994e-05, + "loss": 1.8457, + "step": 9288 + }, + { + "epoch": 2.851135666052793, + "grad_norm": 0.28937265276908875, + "learning_rate": 8.388716849283662e-05, + "loss": 1.7945, + "step": 9289 + }, + { + "epoch": 2.8514426028238185, + "grad_norm": 0.293079674243927, + "learning_rate": 8.388351348005044e-05, + "loss": 1.7731, + "step": 9290 + }, + { + "epoch": 2.8517495395948433, + "grad_norm": 0.32930463552474976, + "learning_rate": 8.38798581324075e-05, + "loss": 1.9017, + "step": 9291 + }, + { + "epoch": 2.8520564763658687, + "grad_norm": 0.2972584664821625, + "learning_rate": 8.387620244994397e-05, + "loss": 1.861, + "step": 9292 + }, + { + "epoch": 2.852363413136894, + "grad_norm": 0.24732981622219086, + "learning_rate": 8.387254643269595e-05, + "loss": 1.7749, + "step": 9293 + }, + { + "epoch": 2.852670349907919, + "grad_norm": 0.31004419922828674, + "learning_rate": 8.386889008069955e-05, + "loss": 1.7848, + "step": 9294 + }, + { + "epoch": 2.8529772866789442, + "grad_norm": 0.2916278541088104, + "learning_rate": 8.386523339399095e-05, + "loss": 1.8299, + "step": 9295 + }, + { + "epoch": 2.853284223449969, + "grad_norm": 0.3109573423862457, + "learning_rate": 8.386157637260626e-05, + "loss": 1.8072, + "step": 9296 + }, + { + "epoch": 2.8535911602209945, + "grad_norm": 0.26398584246635437, + "learning_rate": 8.385791901658162e-05, + "loss": 1.8157, + "step": 9297 + }, + { + "epoch": 2.85389809699202, + "grad_norm": 0.3289371132850647, + "learning_rate": 8.385426132595317e-05, + "loss": 1.9382, + "step": 9298 + }, + { + "epoch": 2.8542050337630447, + "grad_norm": 0.2946974039077759, + "learning_rate": 
8.38506033007571e-05, + "loss": 1.7893, + "step": 9299 + }, + { + "epoch": 2.85451197053407, + "grad_norm": 0.2909530699253082, + "learning_rate": 8.384694494102949e-05, + "loss": 1.8223, + "step": 9300 + }, + { + "epoch": 2.854818907305095, + "grad_norm": 0.2886645793914795, + "learning_rate": 8.384328624680655e-05, + "loss": 1.8239, + "step": 9301 + }, + { + "epoch": 2.8551258440761202, + "grad_norm": 0.2669137716293335, + "learning_rate": 8.383962721812442e-05, + "loss": 1.8102, + "step": 9302 + }, + { + "epoch": 2.8554327808471456, + "grad_norm": 0.3740660548210144, + "learning_rate": 8.383596785501926e-05, + "loss": 1.9014, + "step": 9303 + }, + { + "epoch": 2.855739717618171, + "grad_norm": 0.3062593638896942, + "learning_rate": 8.383230815752724e-05, + "loss": 1.8071, + "step": 9304 + }, + { + "epoch": 2.856046654389196, + "grad_norm": 0.2509091794490814, + "learning_rate": 8.382864812568452e-05, + "loss": 1.7968, + "step": 9305 + }, + { + "epoch": 2.856353591160221, + "grad_norm": 0.2764138877391815, + "learning_rate": 8.382498775952725e-05, + "loss": 1.7463, + "step": 9306 + }, + { + "epoch": 2.856660527931246, + "grad_norm": 0.3292323350906372, + "learning_rate": 8.382132705909165e-05, + "loss": 1.7888, + "step": 9307 + }, + { + "epoch": 2.8569674647022714, + "grad_norm": 0.3169284462928772, + "learning_rate": 8.381766602441386e-05, + "loss": 1.841, + "step": 9308 + }, + { + "epoch": 2.8572744014732967, + "grad_norm": 0.27665168046951294, + "learning_rate": 8.381400465553007e-05, + "loss": 1.7659, + "step": 9309 + }, + { + "epoch": 2.8575813382443216, + "grad_norm": 0.34908005595207214, + "learning_rate": 8.381034295247647e-05, + "loss": 1.8752, + "step": 9310 + }, + { + "epoch": 2.857888275015347, + "grad_norm": 0.31204238533973694, + "learning_rate": 8.380668091528924e-05, + "loss": 1.8201, + "step": 9311 + }, + { + "epoch": 2.858195211786372, + "grad_norm": 0.2713339328765869, + "learning_rate": 8.380301854400459e-05, + "loss": 1.8002, + "step": 9312 + 
}, + { + "epoch": 2.858502148557397, + "grad_norm": 0.30525076389312744, + "learning_rate": 8.379935583865868e-05, + "loss": 1.8533, + "step": 9313 + }, + { + "epoch": 2.8588090853284225, + "grad_norm": 0.3294430673122406, + "learning_rate": 8.379569279928774e-05, + "loss": 1.8895, + "step": 9314 + }, + { + "epoch": 2.8591160220994474, + "grad_norm": 0.31798750162124634, + "learning_rate": 8.379202942592795e-05, + "loss": 1.8148, + "step": 9315 + }, + { + "epoch": 2.8594229588704727, + "grad_norm": 0.3044969141483307, + "learning_rate": 8.378836571861553e-05, + "loss": 1.8477, + "step": 9316 + }, + { + "epoch": 2.8597298956414976, + "grad_norm": 0.2694118320941925, + "learning_rate": 8.378470167738665e-05, + "loss": 1.7998, + "step": 9317 + }, + { + "epoch": 2.860036832412523, + "grad_norm": 0.2601872980594635, + "learning_rate": 8.378103730227758e-05, + "loss": 1.8118, + "step": 9318 + }, + { + "epoch": 2.8603437691835483, + "grad_norm": 0.28168994188308716, + "learning_rate": 8.377737259332446e-05, + "loss": 1.8048, + "step": 9319 + }, + { + "epoch": 2.8606507059545736, + "grad_norm": 0.3008260428905487, + "learning_rate": 8.377370755056358e-05, + "loss": 1.7743, + "step": 9320 + }, + { + "epoch": 2.8609576427255985, + "grad_norm": 0.2578682601451874, + "learning_rate": 8.37700421740311e-05, + "loss": 1.8011, + "step": 9321 + }, + { + "epoch": 2.861264579496624, + "grad_norm": 0.3051932752132416, + "learning_rate": 8.376637646376329e-05, + "loss": 1.8747, + "step": 9322 + }, + { + "epoch": 2.8615715162676487, + "grad_norm": 0.27534300088882446, + "learning_rate": 8.376271041979636e-05, + "loss": 1.8018, + "step": 9323 + }, + { + "epoch": 2.861878453038674, + "grad_norm": 0.3990626335144043, + "learning_rate": 8.375904404216653e-05, + "loss": 1.9223, + "step": 9324 + }, + { + "epoch": 2.8621853898096994, + "grad_norm": 0.43015196919441223, + "learning_rate": 8.375537733091003e-05, + "loss": 1.8219, + "step": 9325 + }, + { + "epoch": 2.8624923265807243, + 
"grad_norm": 0.4051269590854645, + "learning_rate": 8.37517102860631e-05, + "loss": 1.8057, + "step": 9326 + }, + { + "epoch": 2.8627992633517496, + "grad_norm": 0.31781086325645447, + "learning_rate": 8.3748042907662e-05, + "loss": 1.8374, + "step": 9327 + }, + { + "epoch": 2.8631062001227745, + "grad_norm": 0.3476638197898865, + "learning_rate": 8.374437519574297e-05, + "loss": 1.8679, + "step": 9328 + }, + { + "epoch": 2.8634131368938, + "grad_norm": 0.40497875213623047, + "learning_rate": 8.374070715034224e-05, + "loss": 1.7996, + "step": 9329 + }, + { + "epoch": 2.863720073664825, + "grad_norm": 0.40277308225631714, + "learning_rate": 8.373703877149605e-05, + "loss": 1.8156, + "step": 9330 + }, + { + "epoch": 2.86402701043585, + "grad_norm": 0.3012325167655945, + "learning_rate": 8.373337005924069e-05, + "loss": 1.8765, + "step": 9331 + }, + { + "epoch": 2.8643339472068754, + "grad_norm": 0.3151897192001343, + "learning_rate": 8.372970101361238e-05, + "loss": 1.8395, + "step": 9332 + }, + { + "epoch": 2.8646408839779003, + "grad_norm": 0.33645790815353394, + "learning_rate": 8.372603163464741e-05, + "loss": 1.8587, + "step": 9333 + }, + { + "epoch": 2.8649478207489256, + "grad_norm": 0.29943743348121643, + "learning_rate": 8.3722361922382e-05, + "loss": 1.8007, + "step": 9334 + }, + { + "epoch": 2.865254757519951, + "grad_norm": 0.24727779626846313, + "learning_rate": 8.371869187685248e-05, + "loss": 1.766, + "step": 9335 + }, + { + "epoch": 2.8655616942909763, + "grad_norm": 0.3177282512187958, + "learning_rate": 8.371502149809507e-05, + "loss": 1.7954, + "step": 9336 + }, + { + "epoch": 2.865868631062001, + "grad_norm": 0.3415081202983856, + "learning_rate": 8.371135078614605e-05, + "loss": 1.8036, + "step": 9337 + }, + { + "epoch": 2.8661755678330265, + "grad_norm": 0.3044268488883972, + "learning_rate": 8.37076797410417e-05, + "loss": 1.8196, + "step": 9338 + }, + { + "epoch": 2.8664825046040514, + "grad_norm": 0.24425630271434784, + "learning_rate": 
8.370400836281831e-05, + "loss": 1.8267, + "step": 9339 + }, + { + "epoch": 2.8667894413750767, + "grad_norm": 0.27264806628227234, + "learning_rate": 8.370033665151216e-05, + "loss": 1.8218, + "step": 9340 + }, + { + "epoch": 2.867096378146102, + "grad_norm": 0.275601327419281, + "learning_rate": 8.369666460715953e-05, + "loss": 1.8427, + "step": 9341 + }, + { + "epoch": 2.867403314917127, + "grad_norm": 0.2670573592185974, + "learning_rate": 8.36929922297967e-05, + "loss": 1.8449, + "step": 9342 + }, + { + "epoch": 2.8677102516881523, + "grad_norm": 0.2991434335708618, + "learning_rate": 8.368931951945998e-05, + "loss": 1.8866, + "step": 9343 + }, + { + "epoch": 2.868017188459177, + "grad_norm": 0.2975110411643982, + "learning_rate": 8.368564647618564e-05, + "loss": 1.7992, + "step": 9344 + }, + { + "epoch": 2.8683241252302025, + "grad_norm": 0.30109819769859314, + "learning_rate": 8.368197310001001e-05, + "loss": 1.8402, + "step": 9345 + }, + { + "epoch": 2.868631062001228, + "grad_norm": 0.3303714692592621, + "learning_rate": 8.367829939096938e-05, + "loss": 1.8329, + "step": 9346 + }, + { + "epoch": 2.8689379987722528, + "grad_norm": 0.3697182834148407, + "learning_rate": 8.367462534910007e-05, + "loss": 1.9328, + "step": 9347 + }, + { + "epoch": 2.869244935543278, + "grad_norm": 0.3292355537414551, + "learning_rate": 8.367095097443836e-05, + "loss": 1.8284, + "step": 9348 + }, + { + "epoch": 2.869551872314303, + "grad_norm": 0.30440348386764526, + "learning_rate": 8.366727626702058e-05, + "loss": 1.8891, + "step": 9349 + }, + { + "epoch": 2.8698588090853283, + "grad_norm": 0.28200212121009827, + "learning_rate": 8.366360122688303e-05, + "loss": 1.7931, + "step": 9350 + }, + { + "epoch": 2.8701657458563536, + "grad_norm": 0.3162787854671478, + "learning_rate": 8.365992585406207e-05, + "loss": 1.8033, + "step": 9351 + }, + { + "epoch": 2.870472682627379, + "grad_norm": 0.3326094448566437, + "learning_rate": 8.365625014859399e-05, + "loss": 1.8474, + "step": 
9352 + }, + { + "epoch": 2.870779619398404, + "grad_norm": 0.36957383155822754, + "learning_rate": 8.36525741105151e-05, + "loss": 1.8387, + "step": 9353 + }, + { + "epoch": 2.871086556169429, + "grad_norm": 0.32996198534965515, + "learning_rate": 8.364889773986175e-05, + "loss": 1.9087, + "step": 9354 + }, + { + "epoch": 2.871393492940454, + "grad_norm": 0.3164239227771759, + "learning_rate": 8.36452210366703e-05, + "loss": 1.8735, + "step": 9355 + }, + { + "epoch": 2.8717004297114794, + "grad_norm": 0.411538302898407, + "learning_rate": 8.364154400097702e-05, + "loss": 1.832, + "step": 9356 + }, + { + "epoch": 2.8720073664825048, + "grad_norm": 0.48294687271118164, + "learning_rate": 8.36378666328183e-05, + "loss": 1.7772, + "step": 9357 + }, + { + "epoch": 2.8723143032535297, + "grad_norm": 0.4894202649593353, + "learning_rate": 8.363418893223046e-05, + "loss": 1.8396, + "step": 9358 + }, + { + "epoch": 2.872621240024555, + "grad_norm": 0.3328344225883484, + "learning_rate": 8.363051089924986e-05, + "loss": 1.8264, + "step": 9359 + }, + { + "epoch": 2.87292817679558, + "grad_norm": 0.29800695180892944, + "learning_rate": 8.362683253391284e-05, + "loss": 1.8609, + "step": 9360 + }, + { + "epoch": 2.873235113566605, + "grad_norm": 0.48049718141555786, + "learning_rate": 8.362315383625574e-05, + "loss": 1.8703, + "step": 9361 + }, + { + "epoch": 2.8735420503376305, + "grad_norm": 0.5477426052093506, + "learning_rate": 8.361947480631494e-05, + "loss": 1.8336, + "step": 9362 + }, + { + "epoch": 2.873848987108656, + "grad_norm": 0.42515942454338074, + "learning_rate": 8.361579544412676e-05, + "loss": 1.826, + "step": 9363 + }, + { + "epoch": 2.8741559238796808, + "grad_norm": 0.3049539029598236, + "learning_rate": 8.361211574972762e-05, + "loss": 1.9117, + "step": 9364 + }, + { + "epoch": 2.874462860650706, + "grad_norm": 0.4089799225330353, + "learning_rate": 8.360843572315384e-05, + "loss": 1.8669, + "step": 9365 + }, + { + "epoch": 2.874769797421731, + "grad_norm": 
0.42594894766807556, + "learning_rate": 8.36047553644418e-05, + "loss": 1.8527, + "step": 9366 + }, + { + "epoch": 2.8750767341927563, + "grad_norm": 0.3282840847969055, + "learning_rate": 8.360107467362785e-05, + "loss": 1.833, + "step": 9367 + }, + { + "epoch": 2.8753836709637817, + "grad_norm": 0.26597294211387634, + "learning_rate": 8.359739365074841e-05, + "loss": 1.7735, + "step": 9368 + }, + { + "epoch": 2.8756906077348066, + "grad_norm": 0.33498096466064453, + "learning_rate": 8.359371229583983e-05, + "loss": 1.7923, + "step": 9369 + }, + { + "epoch": 2.875997544505832, + "grad_norm": 0.3046290874481201, + "learning_rate": 8.35900306089385e-05, + "loss": 1.8296, + "step": 9370 + }, + { + "epoch": 2.876304481276857, + "grad_norm": 0.3128269612789154, + "learning_rate": 8.358634859008079e-05, + "loss": 1.8115, + "step": 9371 + }, + { + "epoch": 2.876611418047882, + "grad_norm": 0.3814822733402252, + "learning_rate": 8.358266623930309e-05, + "loss": 1.8454, + "step": 9372 + }, + { + "epoch": 2.8769183548189075, + "grad_norm": 0.42400503158569336, + "learning_rate": 8.35789835566418e-05, + "loss": 1.8162, + "step": 9373 + }, + { + "epoch": 2.8772252915899323, + "grad_norm": 0.3131491243839264, + "learning_rate": 8.357530054213333e-05, + "loss": 1.8281, + "step": 9374 + }, + { + "epoch": 2.8775322283609577, + "grad_norm": 0.2566036581993103, + "learning_rate": 8.357161719581406e-05, + "loss": 1.7751, + "step": 9375 + }, + { + "epoch": 2.8778391651319826, + "grad_norm": 0.3858461081981659, + "learning_rate": 8.356793351772038e-05, + "loss": 1.8558, + "step": 9376 + }, + { + "epoch": 2.878146101903008, + "grad_norm": 0.38664349913597107, + "learning_rate": 8.35642495078887e-05, + "loss": 1.8009, + "step": 9377 + }, + { + "epoch": 2.8784530386740332, + "grad_norm": 0.33365172147750854, + "learning_rate": 8.356056516635545e-05, + "loss": 1.8689, + "step": 9378 + }, + { + "epoch": 2.8787599754450586, + "grad_norm": 0.3602980971336365, + "learning_rate": 
8.355688049315702e-05, + "loss": 1.8397, + "step": 9379 + }, + { + "epoch": 2.8790669122160835, + "grad_norm": 0.4508447051048279, + "learning_rate": 8.355319548832983e-05, + "loss": 1.8163, + "step": 9380 + }, + { + "epoch": 2.879373848987109, + "grad_norm": 0.4433961808681488, + "learning_rate": 8.35495101519103e-05, + "loss": 1.7868, + "step": 9381 + }, + { + "epoch": 2.8796807857581337, + "grad_norm": 0.2754592299461365, + "learning_rate": 8.354582448393483e-05, + "loss": 1.8222, + "step": 9382 + }, + { + "epoch": 2.879987722529159, + "grad_norm": 0.29384344816207886, + "learning_rate": 8.354213848443987e-05, + "loss": 1.7742, + "step": 9383 + }, + { + "epoch": 2.8802946593001844, + "grad_norm": 0.33183756470680237, + "learning_rate": 8.353845215346183e-05, + "loss": 1.8327, + "step": 9384 + }, + { + "epoch": 2.8806015960712092, + "grad_norm": 0.3018858730792999, + "learning_rate": 8.353476549103717e-05, + "loss": 1.8606, + "step": 9385 + }, + { + "epoch": 2.8809085328422346, + "grad_norm": 0.38592803478240967, + "learning_rate": 8.353107849720229e-05, + "loss": 1.8091, + "step": 9386 + }, + { + "epoch": 2.8812154696132595, + "grad_norm": 0.448723703622818, + "learning_rate": 8.352739117199364e-05, + "loss": 1.8537, + "step": 9387 + }, + { + "epoch": 2.881522406384285, + "grad_norm": 0.25959616899490356, + "learning_rate": 8.352370351544765e-05, + "loss": 1.8188, + "step": 9388 + }, + { + "epoch": 2.88182934315531, + "grad_norm": 0.3304184079170227, + "learning_rate": 8.352001552760078e-05, + "loss": 1.8008, + "step": 9389 + }, + { + "epoch": 2.882136279926335, + "grad_norm": 0.3831254541873932, + "learning_rate": 8.351632720848947e-05, + "loss": 1.7636, + "step": 9390 + }, + { + "epoch": 2.8824432166973604, + "grad_norm": 0.3358294665813446, + "learning_rate": 8.351263855815017e-05, + "loss": 1.8375, + "step": 9391 + }, + { + "epoch": 2.8827501534683853, + "grad_norm": 0.31194913387298584, + "learning_rate": 8.350894957661935e-05, + "loss": 1.817, + "step": 
9392 + }, + { + "epoch": 2.8830570902394106, + "grad_norm": 0.4156818687915802, + "learning_rate": 8.350526026393343e-05, + "loss": 1.799, + "step": 9393 + }, + { + "epoch": 2.883364027010436, + "grad_norm": 0.3062533140182495, + "learning_rate": 8.350157062012889e-05, + "loss": 1.8535, + "step": 9394 + }, + { + "epoch": 2.8836709637814613, + "grad_norm": 0.3091447949409485, + "learning_rate": 8.34978806452422e-05, + "loss": 1.839, + "step": 9395 + }, + { + "epoch": 2.883977900552486, + "grad_norm": 0.38731643557548523, + "learning_rate": 8.349419033930981e-05, + "loss": 1.8714, + "step": 9396 + }, + { + "epoch": 2.8842848373235115, + "grad_norm": 0.34655869007110596, + "learning_rate": 8.34904997023682e-05, + "loss": 1.8694, + "step": 9397 + }, + { + "epoch": 2.8845917740945364, + "grad_norm": 0.3094301223754883, + "learning_rate": 8.348680873445386e-05, + "loss": 1.8773, + "step": 9398 + }, + { + "epoch": 2.8848987108655617, + "grad_norm": 0.2954508364200592, + "learning_rate": 8.348311743560325e-05, + "loss": 1.7716, + "step": 9399 + }, + { + "epoch": 2.885205647636587, + "grad_norm": 0.32545948028564453, + "learning_rate": 8.347942580585282e-05, + "loss": 1.871, + "step": 9400 + }, + { + "epoch": 2.885512584407612, + "grad_norm": 0.3251612186431885, + "learning_rate": 8.34757338452391e-05, + "loss": 1.8553, + "step": 9401 + }, + { + "epoch": 2.8858195211786373, + "grad_norm": 0.2610895335674286, + "learning_rate": 8.347204155379856e-05, + "loss": 1.8018, + "step": 9402 + }, + { + "epoch": 2.886126457949662, + "grad_norm": 0.3369129002094269, + "learning_rate": 8.346834893156768e-05, + "loss": 1.8536, + "step": 9403 + }, + { + "epoch": 2.8864333947206875, + "grad_norm": 0.4544060528278351, + "learning_rate": 8.346465597858296e-05, + "loss": 1.8332, + "step": 9404 + }, + { + "epoch": 2.886740331491713, + "grad_norm": 0.45742174983024597, + "learning_rate": 8.346096269488089e-05, + "loss": 1.89, + "step": 9405 + }, + { + "epoch": 2.8870472682627377, + "grad_norm": 
0.3458103537559509, + "learning_rate": 8.345726908049799e-05, + "loss": 1.8902, + "step": 9406 + }, + { + "epoch": 2.887354205033763, + "grad_norm": 0.33266058564186096, + "learning_rate": 8.345357513547074e-05, + "loss": 1.7975, + "step": 9407 + }, + { + "epoch": 2.887661141804788, + "grad_norm": 0.3503437042236328, + "learning_rate": 8.344988085983565e-05, + "loss": 1.8503, + "step": 9408 + }, + { + "epoch": 2.8879680785758133, + "grad_norm": 0.33511486649513245, + "learning_rate": 8.344618625362923e-05, + "loss": 1.8731, + "step": 9409 + }, + { + "epoch": 2.8882750153468386, + "grad_norm": 0.295250803232193, + "learning_rate": 8.344249131688799e-05, + "loss": 1.8557, + "step": 9410 + }, + { + "epoch": 2.888581952117864, + "grad_norm": 0.33287179470062256, + "learning_rate": 8.343879604964846e-05, + "loss": 1.8015, + "step": 9411 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.35169747471809387, + "learning_rate": 8.343510045194715e-05, + "loss": 1.7857, + "step": 9412 + }, + { + "epoch": 2.889195825659914, + "grad_norm": 0.3191360533237457, + "learning_rate": 8.343140452382056e-05, + "loss": 1.8474, + "step": 9413 + }, + { + "epoch": 2.889502762430939, + "grad_norm": 0.27216482162475586, + "learning_rate": 8.342770826530526e-05, + "loss": 1.7941, + "step": 9414 + }, + { + "epoch": 2.8898096992019644, + "grad_norm": 0.32968905568122864, + "learning_rate": 8.342401167643774e-05, + "loss": 1.8568, + "step": 9415 + }, + { + "epoch": 2.8901166359729897, + "grad_norm": 0.37429341673851013, + "learning_rate": 8.342031475725456e-05, + "loss": 1.8995, + "step": 9416 + }, + { + "epoch": 2.8904235727440146, + "grad_norm": 0.3318146765232086, + "learning_rate": 8.341661750779223e-05, + "loss": 1.8886, + "step": 9417 + }, + { + "epoch": 2.89073050951504, + "grad_norm": 0.3208807408809662, + "learning_rate": 8.34129199280873e-05, + "loss": 1.8306, + "step": 9418 + }, + { + "epoch": 2.891037446286065, + "grad_norm": 0.30906134843826294, + "learning_rate": 
8.340922201817632e-05, + "loss": 1.8931, + "step": 9419 + }, + { + "epoch": 2.89134438305709, + "grad_norm": 0.2949373722076416, + "learning_rate": 8.340552377809581e-05, + "loss": 1.8375, + "step": 9420 + }, + { + "epoch": 2.8916513198281155, + "grad_norm": 0.2553368806838989, + "learning_rate": 8.340182520788236e-05, + "loss": 1.7816, + "step": 9421 + }, + { + "epoch": 2.891958256599141, + "grad_norm": 0.26867765188217163, + "learning_rate": 8.339812630757246e-05, + "loss": 1.7721, + "step": 9422 + }, + { + "epoch": 2.8922651933701657, + "grad_norm": 0.3132673501968384, + "learning_rate": 8.339442707720273e-05, + "loss": 1.8412, + "step": 9423 + }, + { + "epoch": 2.892572130141191, + "grad_norm": 0.32028669118881226, + "learning_rate": 8.33907275168097e-05, + "loss": 1.8081, + "step": 9424 + }, + { + "epoch": 2.892879066912216, + "grad_norm": 0.30383285880088806, + "learning_rate": 8.338702762642992e-05, + "loss": 1.8294, + "step": 9425 + }, + { + "epoch": 2.8931860036832413, + "grad_norm": 0.284161239862442, + "learning_rate": 8.338332740609995e-05, + "loss": 1.7788, + "step": 9426 + }, + { + "epoch": 2.8934929404542666, + "grad_norm": 0.26731929183006287, + "learning_rate": 8.337962685585638e-05, + "loss": 1.8244, + "step": 9427 + }, + { + "epoch": 2.8937998772252915, + "grad_norm": 0.2687760889530182, + "learning_rate": 8.337592597573578e-05, + "loss": 1.8104, + "step": 9428 + }, + { + "epoch": 2.894106813996317, + "grad_norm": 0.3097872734069824, + "learning_rate": 8.337222476577472e-05, + "loss": 1.8311, + "step": 9429 + }, + { + "epoch": 2.8944137507673418, + "grad_norm": 0.2915988862514496, + "learning_rate": 8.336852322600977e-05, + "loss": 1.8878, + "step": 9430 + }, + { + "epoch": 2.894720687538367, + "grad_norm": 0.2783167362213135, + "learning_rate": 8.336482135647751e-05, + "loss": 1.829, + "step": 9431 + }, + { + "epoch": 2.8950276243093924, + "grad_norm": 0.27866432070732117, + "learning_rate": 8.336111915721454e-05, + "loss": 1.8881, + "step": 
9432 + }, + { + "epoch": 2.8953345610804173, + "grad_norm": 0.26949164271354675, + "learning_rate": 8.335741662825743e-05, + "loss": 1.7652, + "step": 9433 + }, + { + "epoch": 2.8956414978514426, + "grad_norm": 0.31324130296707153, + "learning_rate": 8.335371376964278e-05, + "loss": 1.8362, + "step": 9434 + }, + { + "epoch": 2.8959484346224675, + "grad_norm": 0.31150999665260315, + "learning_rate": 8.335001058140718e-05, + "loss": 1.8588, + "step": 9435 + }, + { + "epoch": 2.896255371393493, + "grad_norm": 0.30692601203918457, + "learning_rate": 8.334630706358724e-05, + "loss": 1.8473, + "step": 9436 + }, + { + "epoch": 2.896562308164518, + "grad_norm": 0.2764357328414917, + "learning_rate": 8.334260321621954e-05, + "loss": 1.8696, + "step": 9437 + }, + { + "epoch": 2.8968692449355435, + "grad_norm": 0.26108071208000183, + "learning_rate": 8.333889903934069e-05, + "loss": 1.7647, + "step": 9438 + }, + { + "epoch": 2.8971761817065684, + "grad_norm": 0.3382989466190338, + "learning_rate": 8.33351945329873e-05, + "loss": 1.8936, + "step": 9439 + }, + { + "epoch": 2.8974831184775938, + "grad_norm": 0.3121405839920044, + "learning_rate": 8.333148969719598e-05, + "loss": 1.8281, + "step": 9440 + }, + { + "epoch": 2.8977900552486187, + "grad_norm": 0.283149778842926, + "learning_rate": 8.332778453200334e-05, + "loss": 1.8642, + "step": 9441 + }, + { + "epoch": 2.898096992019644, + "grad_norm": 0.4140075445175171, + "learning_rate": 8.332407903744598e-05, + "loss": 1.8553, + "step": 9442 + }, + { + "epoch": 2.8984039287906693, + "grad_norm": 0.4345620274543762, + "learning_rate": 8.332037321356057e-05, + "loss": 1.7879, + "step": 9443 + }, + { + "epoch": 2.898710865561694, + "grad_norm": 0.4103661775588989, + "learning_rate": 8.33166670603837e-05, + "loss": 1.7928, + "step": 9444 + }, + { + "epoch": 2.8990178023327196, + "grad_norm": 0.2874266505241394, + "learning_rate": 8.3312960577952e-05, + "loss": 1.8097, + "step": 9445 + }, + { + "epoch": 2.8993247391037444, + 
"grad_norm": 0.2949487864971161, + "learning_rate": 8.330925376630208e-05, + "loss": 1.8679, + "step": 9446 + }, + { + "epoch": 2.8996316758747698, + "grad_norm": 0.3222406804561615, + "learning_rate": 8.330554662547059e-05, + "loss": 1.8184, + "step": 9447 + }, + { + "epoch": 2.899938612645795, + "grad_norm": 0.32089436054229736, + "learning_rate": 8.330183915549418e-05, + "loss": 1.8798, + "step": 9448 + }, + { + "epoch": 2.90024554941682, + "grad_norm": 0.28950363397598267, + "learning_rate": 8.329813135640947e-05, + "loss": 1.8502, + "step": 9449 + }, + { + "epoch": 2.9005524861878453, + "grad_norm": 0.29070547223091125, + "learning_rate": 8.329442322825312e-05, + "loss": 1.8826, + "step": 9450 + }, + { + "epoch": 2.9008594229588702, + "grad_norm": 0.3030688464641571, + "learning_rate": 8.329071477106175e-05, + "loss": 1.8002, + "step": 9451 + }, + { + "epoch": 2.9011663597298956, + "grad_norm": 0.33711570501327515, + "learning_rate": 8.328700598487203e-05, + "loss": 1.8876, + "step": 9452 + }, + { + "epoch": 2.901473296500921, + "grad_norm": 0.31995612382888794, + "learning_rate": 8.328329686972063e-05, + "loss": 1.7952, + "step": 9453 + }, + { + "epoch": 2.9017802332719462, + "grad_norm": 0.2619616389274597, + "learning_rate": 8.327958742564415e-05, + "loss": 1.7371, + "step": 9454 + }, + { + "epoch": 2.902087170042971, + "grad_norm": 0.3527650535106659, + "learning_rate": 8.32758776526793e-05, + "loss": 1.8385, + "step": 9455 + }, + { + "epoch": 2.9023941068139965, + "grad_norm": 0.3238582909107208, + "learning_rate": 8.327216755086271e-05, + "loss": 1.7955, + "step": 9456 + }, + { + "epoch": 2.9027010435850213, + "grad_norm": 0.2647970914840698, + "learning_rate": 8.326845712023106e-05, + "loss": 1.8639, + "step": 9457 + }, + { + "epoch": 2.9030079803560467, + "grad_norm": 0.3435346186161041, + "learning_rate": 8.326474636082103e-05, + "loss": 1.7831, + "step": 9458 + }, + { + "epoch": 2.903314917127072, + "grad_norm": 0.42539843916893005, + 
"learning_rate": 8.326103527266927e-05, + "loss": 1.8473, + "step": 9459 + }, + { + "epoch": 2.903621853898097, + "grad_norm": 0.3773367404937744, + "learning_rate": 8.325732385581247e-05, + "loss": 1.8993, + "step": 9460 + }, + { + "epoch": 2.9039287906691222, + "grad_norm": 0.2918262183666229, + "learning_rate": 8.32536121102873e-05, + "loss": 1.8198, + "step": 9461 + }, + { + "epoch": 2.904235727440147, + "grad_norm": 0.3997703492641449, + "learning_rate": 8.324990003613044e-05, + "loss": 1.8307, + "step": 9462 + }, + { + "epoch": 2.9045426642111725, + "grad_norm": 0.4593566656112671, + "learning_rate": 8.324618763337858e-05, + "loss": 1.8068, + "step": 9463 + }, + { + "epoch": 2.904849600982198, + "grad_norm": 0.30200180411338806, + "learning_rate": 8.324247490206841e-05, + "loss": 1.7935, + "step": 9464 + }, + { + "epoch": 2.9051565377532227, + "grad_norm": 0.37651970982551575, + "learning_rate": 8.323876184223663e-05, + "loss": 1.9268, + "step": 9465 + }, + { + "epoch": 2.905463474524248, + "grad_norm": 0.465863436460495, + "learning_rate": 8.32350484539199e-05, + "loss": 1.8331, + "step": 9466 + }, + { + "epoch": 2.905770411295273, + "grad_norm": 0.3527480661869049, + "learning_rate": 8.323133473715496e-05, + "loss": 1.899, + "step": 9467 + }, + { + "epoch": 2.9060773480662982, + "grad_norm": 0.30979883670806885, + "learning_rate": 8.32276206919785e-05, + "loss": 1.7578, + "step": 9468 + }, + { + "epoch": 2.9063842848373236, + "grad_norm": 0.5039793252944946, + "learning_rate": 8.322390631842718e-05, + "loss": 1.7822, + "step": 9469 + }, + { + "epoch": 2.906691221608349, + "grad_norm": 0.4683503806591034, + "learning_rate": 8.322019161653777e-05, + "loss": 1.7958, + "step": 9470 + }, + { + "epoch": 2.906998158379374, + "grad_norm": 0.27022865414619446, + "learning_rate": 8.321647658634696e-05, + "loss": 1.838, + "step": 9471 + }, + { + "epoch": 2.907305095150399, + "grad_norm": 0.3253246247768402, + "learning_rate": 8.321276122789146e-05, + "loss": 1.862, + 
"step": 9472 + }, + { + "epoch": 2.907612031921424, + "grad_norm": 0.3654547929763794, + "learning_rate": 8.320904554120798e-05, + "loss": 1.8578, + "step": 9473 + }, + { + "epoch": 2.9079189686924494, + "grad_norm": 0.3140239417552948, + "learning_rate": 8.320532952633325e-05, + "loss": 1.7954, + "step": 9474 + }, + { + "epoch": 2.9082259054634747, + "grad_norm": 0.24541302025318146, + "learning_rate": 8.3201613183304e-05, + "loss": 1.7711, + "step": 9475 + }, + { + "epoch": 2.9085328422344996, + "grad_norm": 0.2538415491580963, + "learning_rate": 8.319789651215692e-05, + "loss": 1.7756, + "step": 9476 + }, + { + "epoch": 2.908839779005525, + "grad_norm": 0.3181871175765991, + "learning_rate": 8.31941795129288e-05, + "loss": 1.7957, + "step": 9477 + }, + { + "epoch": 2.90914671577655, + "grad_norm": 0.3094673752784729, + "learning_rate": 8.319046218565633e-05, + "loss": 1.8897, + "step": 9478 + }, + { + "epoch": 2.909453652547575, + "grad_norm": 0.3004473149776459, + "learning_rate": 8.318674453037626e-05, + "loss": 1.7853, + "step": 9479 + }, + { + "epoch": 2.9097605893186005, + "grad_norm": 0.28673505783081055, + "learning_rate": 8.318302654712532e-05, + "loss": 1.8119, + "step": 9480 + }, + { + "epoch": 2.9100675260896254, + "grad_norm": 0.3177729547023773, + "learning_rate": 8.317930823594027e-05, + "loss": 1.8211, + "step": 9481 + }, + { + "epoch": 2.9103744628606507, + "grad_norm": 0.28347232937812805, + "learning_rate": 8.317558959685786e-05, + "loss": 1.8061, + "step": 9482 + }, + { + "epoch": 2.9106813996316756, + "grad_norm": 0.28247126936912537, + "learning_rate": 8.317187062991482e-05, + "loss": 1.8175, + "step": 9483 + }, + { + "epoch": 2.910988336402701, + "grad_norm": 0.3153017461299896, + "learning_rate": 8.31681513351479e-05, + "loss": 1.8619, + "step": 9484 + }, + { + "epoch": 2.9112952731737263, + "grad_norm": 0.265821635723114, + "learning_rate": 8.316443171259389e-05, + "loss": 1.7783, + "step": 9485 + }, + { + "epoch": 2.9116022099447516, + 
"grad_norm": 0.33247366547584534, + "learning_rate": 8.31607117622895e-05, + "loss": 1.8701, + "step": 9486 + }, + { + "epoch": 2.9119091467157765, + "grad_norm": 0.3343275189399719, + "learning_rate": 8.315699148427154e-05, + "loss": 1.742, + "step": 9487 + }, + { + "epoch": 2.912216083486802, + "grad_norm": 0.3427117168903351, + "learning_rate": 8.315327087857677e-05, + "loss": 1.8382, + "step": 9488 + }, + { + "epoch": 2.9125230202578267, + "grad_norm": 0.2884635925292969, + "learning_rate": 8.31495499452419e-05, + "loss": 1.8378, + "step": 9489 + }, + { + "epoch": 2.912829957028852, + "grad_norm": 0.30335184931755066, + "learning_rate": 8.31458286843038e-05, + "loss": 1.7619, + "step": 9490 + }, + { + "epoch": 2.9131368937998774, + "grad_norm": 0.3224368095397949, + "learning_rate": 8.314210709579916e-05, + "loss": 1.8289, + "step": 9491 + }, + { + "epoch": 2.9134438305709023, + "grad_norm": 0.28016242384910583, + "learning_rate": 8.31383851797648e-05, + "loss": 1.8027, + "step": 9492 + }, + { + "epoch": 2.9137507673419276, + "grad_norm": 0.32091468572616577, + "learning_rate": 8.313466293623749e-05, + "loss": 1.9027, + "step": 9493 + }, + { + "epoch": 2.9140577041129525, + "grad_norm": 0.2809069752693176, + "learning_rate": 8.313094036525403e-05, + "loss": 1.9194, + "step": 9494 + }, + { + "epoch": 2.914364640883978, + "grad_norm": 0.30734366178512573, + "learning_rate": 8.312721746685119e-05, + "loss": 1.8612, + "step": 9495 + }, + { + "epoch": 2.914671577655003, + "grad_norm": 0.25953513383865356, + "learning_rate": 8.312349424106578e-05, + "loss": 1.7593, + "step": 9496 + }, + { + "epoch": 2.9149785144260285, + "grad_norm": 0.27583983540534973, + "learning_rate": 8.311977068793459e-05, + "loss": 1.8138, + "step": 9497 + }, + { + "epoch": 2.9152854511970534, + "grad_norm": 0.30315884947776794, + "learning_rate": 8.31160468074944e-05, + "loss": 1.7704, + "step": 9498 + }, + { + "epoch": 2.9155923879680787, + "grad_norm": 0.321603387594223, + "learning_rate": 
8.311232259978204e-05, + "loss": 1.8055, + "step": 9499 + }, + { + "epoch": 2.9158993247391036, + "grad_norm": 0.27882421016693115, + "learning_rate": 8.310859806483429e-05, + "loss": 1.8257, + "step": 9500 + }, + { + "epoch": 2.916206261510129, + "grad_norm": 0.3095625042915344, + "learning_rate": 8.310487320268795e-05, + "loss": 1.8561, + "step": 9501 + }, + { + "epoch": 2.9165131982811543, + "grad_norm": 0.27503731846809387, + "learning_rate": 8.310114801337988e-05, + "loss": 1.7588, + "step": 9502 + }, + { + "epoch": 2.916820135052179, + "grad_norm": 0.2534404695034027, + "learning_rate": 8.309742249694686e-05, + "loss": 1.7289, + "step": 9503 + }, + { + "epoch": 2.9171270718232045, + "grad_norm": 0.24968849122524261, + "learning_rate": 8.30936966534257e-05, + "loss": 1.7763, + "step": 9504 + }, + { + "epoch": 2.9174340085942294, + "grad_norm": 0.2728060781955719, + "learning_rate": 8.308997048285324e-05, + "loss": 1.7847, + "step": 9505 + }, + { + "epoch": 2.9177409453652547, + "grad_norm": 0.28728193044662476, + "learning_rate": 8.308624398526629e-05, + "loss": 1.7957, + "step": 9506 + }, + { + "epoch": 2.91804788213628, + "grad_norm": 0.3097241520881653, + "learning_rate": 8.308251716070169e-05, + "loss": 1.8141, + "step": 9507 + }, + { + "epoch": 2.918354818907305, + "grad_norm": 0.3570188879966736, + "learning_rate": 8.307879000919628e-05, + "loss": 1.8246, + "step": 9508 + }, + { + "epoch": 2.9186617556783303, + "grad_norm": 0.27077826857566833, + "learning_rate": 8.307506253078685e-05, + "loss": 1.7912, + "step": 9509 + }, + { + "epoch": 2.918968692449355, + "grad_norm": 0.26213565468788147, + "learning_rate": 8.307133472551028e-05, + "loss": 1.8378, + "step": 9510 + }, + { + "epoch": 2.9192756292203805, + "grad_norm": 0.3482845723628998, + "learning_rate": 8.306760659340339e-05, + "loss": 1.8031, + "step": 9511 + }, + { + "epoch": 2.919582565991406, + "grad_norm": 0.3730507791042328, + "learning_rate": 8.306387813450303e-05, + "loss": 1.7404, + "step": 
9512 + }, + { + "epoch": 2.919889502762431, + "grad_norm": 0.2957874536514282, + "learning_rate": 8.306014934884606e-05, + "loss": 1.8623, + "step": 9513 + }, + { + "epoch": 2.920196439533456, + "grad_norm": 0.29137885570526123, + "learning_rate": 8.30564202364693e-05, + "loss": 1.847, + "step": 9514 + }, + { + "epoch": 2.9205033763044814, + "grad_norm": 0.35623642802238464, + "learning_rate": 8.305269079740964e-05, + "loss": 1.8382, + "step": 9515 + }, + { + "epoch": 2.9208103130755063, + "grad_norm": 0.28263330459594727, + "learning_rate": 8.304896103170389e-05, + "loss": 1.7732, + "step": 9516 + }, + { + "epoch": 2.9211172498465316, + "grad_norm": 0.23631221055984497, + "learning_rate": 8.304523093938897e-05, + "loss": 1.7709, + "step": 9517 + }, + { + "epoch": 2.921424186617557, + "grad_norm": 0.25887101888656616, + "learning_rate": 8.304150052050169e-05, + "loss": 1.7966, + "step": 9518 + }, + { + "epoch": 2.921731123388582, + "grad_norm": 0.31445473432540894, + "learning_rate": 8.303776977507894e-05, + "loss": 1.8735, + "step": 9519 + }, + { + "epoch": 2.922038060159607, + "grad_norm": 0.264930784702301, + "learning_rate": 8.303403870315757e-05, + "loss": 1.7983, + "step": 9520 + }, + { + "epoch": 2.922344996930632, + "grad_norm": 0.2664194107055664, + "learning_rate": 8.30303073047745e-05, + "loss": 1.8573, + "step": 9521 + }, + { + "epoch": 2.9226519337016574, + "grad_norm": 0.31645768880844116, + "learning_rate": 8.302657557996656e-05, + "loss": 1.913, + "step": 9522 + }, + { + "epoch": 2.9229588704726828, + "grad_norm": 0.2820858657360077, + "learning_rate": 8.302284352877063e-05, + "loss": 1.8714, + "step": 9523 + }, + { + "epoch": 2.9232658072437077, + "grad_norm": 0.2960543930530548, + "learning_rate": 8.30191111512236e-05, + "loss": 1.8296, + "step": 9524 + }, + { + "epoch": 2.923572744014733, + "grad_norm": 0.319363534450531, + "learning_rate": 8.301537844736237e-05, + "loss": 1.8533, + "step": 9525 + }, + { + "epoch": 2.923879680785758, + 
"grad_norm": 0.28047996759414673, + "learning_rate": 8.301164541722384e-05, + "loss": 1.7415, + "step": 9526 + }, + { + "epoch": 2.924186617556783, + "grad_norm": 0.3106628656387329, + "learning_rate": 8.300791206084486e-05, + "loss": 1.8809, + "step": 9527 + }, + { + "epoch": 2.9244935543278086, + "grad_norm": 0.2650253474712372, + "learning_rate": 8.300417837826235e-05, + "loss": 1.8097, + "step": 9528 + }, + { + "epoch": 2.924800491098834, + "grad_norm": 0.31832796335220337, + "learning_rate": 8.30004443695132e-05, + "loss": 1.881, + "step": 9529 + }, + { + "epoch": 2.925107427869859, + "grad_norm": 0.311018168926239, + "learning_rate": 8.299671003463432e-05, + "loss": 1.8725, + "step": 9530 + }, + { + "epoch": 2.925414364640884, + "grad_norm": 0.3125450909137726, + "learning_rate": 8.299297537366262e-05, + "loss": 1.8159, + "step": 9531 + }, + { + "epoch": 2.925721301411909, + "grad_norm": 0.30022570490837097, + "learning_rate": 8.298924038663498e-05, + "loss": 1.8217, + "step": 9532 + }, + { + "epoch": 2.9260282381829343, + "grad_norm": 0.3061163127422333, + "learning_rate": 8.298550507358836e-05, + "loss": 1.8529, + "step": 9533 + }, + { + "epoch": 2.9263351749539597, + "grad_norm": 0.258891224861145, + "learning_rate": 8.298176943455962e-05, + "loss": 1.8579, + "step": 9534 + }, + { + "epoch": 2.9266421117249846, + "grad_norm": 0.2871147096157074, + "learning_rate": 8.297803346958571e-05, + "loss": 1.8699, + "step": 9535 + }, + { + "epoch": 2.92694904849601, + "grad_norm": 0.3047468066215515, + "learning_rate": 8.297429717870356e-05, + "loss": 1.9165, + "step": 9536 + }, + { + "epoch": 2.927255985267035, + "grad_norm": 0.2852346897125244, + "learning_rate": 8.297056056195005e-05, + "loss": 1.8417, + "step": 9537 + }, + { + "epoch": 2.92756292203806, + "grad_norm": 0.30782654881477356, + "learning_rate": 8.296682361936216e-05, + "loss": 1.835, + "step": 9538 + }, + { + "epoch": 2.9278698588090855, + "grad_norm": 0.44828128814697266, + "learning_rate": 
8.296308635097678e-05, + "loss": 1.8997, + "step": 9539 + }, + { + "epoch": 2.9281767955801103, + "grad_norm": 0.48911961913108826, + "learning_rate": 8.295934875683087e-05, + "loss": 1.8249, + "step": 9540 + }, + { + "epoch": 2.9284837323511357, + "grad_norm": 0.3377256691455841, + "learning_rate": 8.295561083696136e-05, + "loss": 1.757, + "step": 9541 + }, + { + "epoch": 2.9287906691221606, + "grad_norm": 0.29486989974975586, + "learning_rate": 8.295187259140518e-05, + "loss": 1.8282, + "step": 9542 + }, + { + "epoch": 2.929097605893186, + "grad_norm": 0.4291549026966095, + "learning_rate": 8.294813402019927e-05, + "loss": 1.7633, + "step": 9543 + }, + { + "epoch": 2.9294045426642112, + "grad_norm": 0.43153640627861023, + "learning_rate": 8.294439512338061e-05, + "loss": 1.7904, + "step": 9544 + }, + { + "epoch": 2.9297114794352366, + "grad_norm": 0.3454402685165405, + "learning_rate": 8.294065590098611e-05, + "loss": 1.8586, + "step": 9545 + }, + { + "epoch": 2.9300184162062615, + "grad_norm": 0.2709622383117676, + "learning_rate": 8.293691635305276e-05, + "loss": 1.8225, + "step": 9546 + }, + { + "epoch": 2.930325352977287, + "grad_norm": 0.34379467368125916, + "learning_rate": 8.293317647961749e-05, + "loss": 1.9005, + "step": 9547 + }, + { + "epoch": 2.9306322897483117, + "grad_norm": 0.37137365341186523, + "learning_rate": 8.292943628071727e-05, + "loss": 1.829, + "step": 9548 + }, + { + "epoch": 2.930939226519337, + "grad_norm": 0.31634894013404846, + "learning_rate": 8.292569575638905e-05, + "loss": 1.8062, + "step": 9549 + }, + { + "epoch": 2.9312461632903624, + "grad_norm": 0.25719332695007324, + "learning_rate": 8.292195490666981e-05, + "loss": 1.8044, + "step": 9550 + }, + { + "epoch": 2.9315531000613873, + "grad_norm": 0.3341852128505707, + "learning_rate": 8.291821373159652e-05, + "loss": 1.8627, + "step": 9551 + }, + { + "epoch": 2.9318600368324126, + "grad_norm": 0.38499385118484497, + "learning_rate": 8.291447223120614e-05, + "loss": 1.8138, + 
"step": 9552 + }, + { + "epoch": 2.9321669736034375, + "grad_norm": 0.28036460280418396, + "learning_rate": 8.291073040553567e-05, + "loss": 1.7958, + "step": 9553 + }, + { + "epoch": 2.932473910374463, + "grad_norm": 0.30798816680908203, + "learning_rate": 8.290698825462207e-05, + "loss": 1.899, + "step": 9554 + }, + { + "epoch": 2.932780847145488, + "grad_norm": 0.40930941700935364, + "learning_rate": 8.290324577850232e-05, + "loss": 1.841, + "step": 9555 + }, + { + "epoch": 2.933087783916513, + "grad_norm": 0.38794800639152527, + "learning_rate": 8.289950297721341e-05, + "loss": 1.8022, + "step": 9556 + }, + { + "epoch": 2.9333947206875384, + "grad_norm": 0.2716790437698364, + "learning_rate": 8.289575985079232e-05, + "loss": 1.8009, + "step": 9557 + }, + { + "epoch": 2.9337016574585633, + "grad_norm": 0.3063231110572815, + "learning_rate": 8.289201639927605e-05, + "loss": 1.8677, + "step": 9558 + }, + { + "epoch": 2.9340085942295886, + "grad_norm": 0.3279048800468445, + "learning_rate": 8.28882726227016e-05, + "loss": 1.8071, + "step": 9559 + }, + { + "epoch": 2.934315531000614, + "grad_norm": 0.32144758105278015, + "learning_rate": 8.288452852110596e-05, + "loss": 1.8601, + "step": 9560 + }, + { + "epoch": 2.9346224677716393, + "grad_norm": 0.284495085477829, + "learning_rate": 8.288078409452614e-05, + "loss": 1.8358, + "step": 9561 + }, + { + "epoch": 2.934929404542664, + "grad_norm": 0.3779112696647644, + "learning_rate": 8.287703934299915e-05, + "loss": 1.7903, + "step": 9562 + }, + { + "epoch": 2.9352363413136895, + "grad_norm": 0.33851495385169983, + "learning_rate": 8.287329426656197e-05, + "loss": 1.806, + "step": 9563 + }, + { + "epoch": 2.9355432780847144, + "grad_norm": 0.26610738039016724, + "learning_rate": 8.286954886525164e-05, + "loss": 1.7739, + "step": 9564 + }, + { + "epoch": 2.9358502148557397, + "grad_norm": 0.24825556576251984, + "learning_rate": 8.286580313910515e-05, + "loss": 1.7595, + "step": 9565 + }, + { + "epoch": 2.936157151626765, 
+ "grad_norm": 0.28356245160102844, + "learning_rate": 8.286205708815954e-05, + "loss": 1.8497, + "step": 9566 + }, + { + "epoch": 2.93646408839779, + "grad_norm": 0.2974208891391754, + "learning_rate": 8.285831071245182e-05, + "loss": 1.8561, + "step": 9567 + }, + { + "epoch": 2.9367710251688153, + "grad_norm": 0.26718810200691223, + "learning_rate": 8.2854564012019e-05, + "loss": 1.776, + "step": 9568 + }, + { + "epoch": 2.93707796193984, + "grad_norm": 0.30627691745758057, + "learning_rate": 8.285081698689814e-05, + "loss": 1.8141, + "step": 9569 + }, + { + "epoch": 2.9373848987108655, + "grad_norm": 0.33287444710731506, + "learning_rate": 8.284706963712625e-05, + "loss": 1.8727, + "step": 9570 + }, + { + "epoch": 2.937691835481891, + "grad_norm": 0.30571332573890686, + "learning_rate": 8.284332196274036e-05, + "loss": 1.8388, + "step": 9571 + }, + { + "epoch": 2.937998772252916, + "grad_norm": 0.3603699207305908, + "learning_rate": 8.283957396377753e-05, + "loss": 1.8655, + "step": 9572 + }, + { + "epoch": 2.938305709023941, + "grad_norm": 0.2890760898590088, + "learning_rate": 8.283582564027477e-05, + "loss": 1.7919, + "step": 9573 + }, + { + "epoch": 2.9386126457949664, + "grad_norm": 0.34981194138526917, + "learning_rate": 8.283207699226912e-05, + "loss": 1.8542, + "step": 9574 + }, + { + "epoch": 2.9389195825659913, + "grad_norm": 0.43490317463874817, + "learning_rate": 8.282832801979766e-05, + "loss": 1.8109, + "step": 9575 + }, + { + "epoch": 2.9392265193370166, + "grad_norm": 0.4337438941001892, + "learning_rate": 8.282457872289742e-05, + "loss": 1.8856, + "step": 9576 + }, + { + "epoch": 2.939533456108042, + "grad_norm": 0.2723710834980011, + "learning_rate": 8.282082910160544e-05, + "loss": 1.8554, + "step": 9577 + }, + { + "epoch": 2.939840392879067, + "grad_norm": 0.32447734475135803, + "learning_rate": 8.28170791559588e-05, + "loss": 1.8086, + "step": 9578 + }, + { + "epoch": 2.940147329650092, + "grad_norm": 0.3495276868343353, + "learning_rate": 
8.281332888599455e-05, + "loss": 1.785, + "step": 9579 + }, + { + "epoch": 2.940454266421117, + "grad_norm": 0.3324705958366394, + "learning_rate": 8.280957829174975e-05, + "loss": 1.8086, + "step": 9580 + }, + { + "epoch": 2.9407612031921424, + "grad_norm": 0.2633898854255676, + "learning_rate": 8.280582737326146e-05, + "loss": 1.8116, + "step": 9581 + }, + { + "epoch": 2.9410681399631677, + "grad_norm": 0.3109157085418701, + "learning_rate": 8.280207613056676e-05, + "loss": 1.8649, + "step": 9582 + }, + { + "epoch": 2.9413750767341926, + "grad_norm": 0.2772599756717682, + "learning_rate": 8.279832456370273e-05, + "loss": 1.8578, + "step": 9583 + }, + { + "epoch": 2.941682013505218, + "grad_norm": 0.32322654128074646, + "learning_rate": 8.279457267270642e-05, + "loss": 1.8621, + "step": 9584 + }, + { + "epoch": 2.941988950276243, + "grad_norm": 0.3678343594074249, + "learning_rate": 8.279082045761493e-05, + "loss": 1.8819, + "step": 9585 + }, + { + "epoch": 2.942295887047268, + "grad_norm": 0.30976057052612305, + "learning_rate": 8.27870679184653e-05, + "loss": 1.8126, + "step": 9586 + }, + { + "epoch": 2.9426028238182935, + "grad_norm": 0.26715603470802307, + "learning_rate": 8.278331505529469e-05, + "loss": 1.8831, + "step": 9587 + }, + { + "epoch": 2.942909760589319, + "grad_norm": 0.263288289308548, + "learning_rate": 8.277956186814014e-05, + "loss": 1.8057, + "step": 9588 + }, + { + "epoch": 2.9432166973603437, + "grad_norm": 0.29458633065223694, + "learning_rate": 8.277580835703873e-05, + "loss": 1.7307, + "step": 9589 + }, + { + "epoch": 2.943523634131369, + "grad_norm": 0.27819791436195374, + "learning_rate": 8.277205452202759e-05, + "loss": 1.8783, + "step": 9590 + }, + { + "epoch": 2.943830570902394, + "grad_norm": 0.29286056756973267, + "learning_rate": 8.276830036314379e-05, + "loss": 1.8061, + "step": 9591 + }, + { + "epoch": 2.9441375076734193, + "grad_norm": 0.2955230474472046, + "learning_rate": 8.276454588042442e-05, + "loss": 1.8227, + "step": 
9592 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.3070714473724365, + "learning_rate": 8.276079107390663e-05, + "loss": 1.8451, + "step": 9593 + }, + { + "epoch": 2.9447513812154695, + "grad_norm": 0.34235841035842896, + "learning_rate": 8.275703594362749e-05, + "loss": 1.8052, + "step": 9594 + }, + { + "epoch": 2.945058317986495, + "grad_norm": 0.2863236665725708, + "learning_rate": 8.275328048962412e-05, + "loss": 1.8741, + "step": 9595 + }, + { + "epoch": 2.9453652547575198, + "grad_norm": 0.3013235032558441, + "learning_rate": 8.274952471193364e-05, + "loss": 1.8177, + "step": 9596 + }, + { + "epoch": 2.945672191528545, + "grad_norm": 0.2994023561477661, + "learning_rate": 8.274576861059316e-05, + "loss": 1.903, + "step": 9597 + }, + { + "epoch": 2.9459791282995704, + "grad_norm": 0.320049524307251, + "learning_rate": 8.27420121856398e-05, + "loss": 1.882, + "step": 9598 + }, + { + "epoch": 2.9462860650705953, + "grad_norm": 0.2789655029773712, + "learning_rate": 8.273825543711069e-05, + "loss": 1.794, + "step": 9599 + }, + { + "epoch": 2.9465930018416207, + "grad_norm": 0.3148564398288727, + "learning_rate": 8.273449836504294e-05, + "loss": 1.8453, + "step": 9600 + }, + { + "epoch": 2.9468999386126455, + "grad_norm": 0.46754372119903564, + "learning_rate": 8.273074096947371e-05, + "loss": 1.8147, + "step": 9601 + }, + { + "epoch": 2.947206875383671, + "grad_norm": 0.5946900844573975, + "learning_rate": 8.27269832504401e-05, + "loss": 1.8099, + "step": 9602 + }, + { + "epoch": 2.947513812154696, + "grad_norm": 0.4916069507598877, + "learning_rate": 8.272322520797926e-05, + "loss": 1.8315, + "step": 9603 + }, + { + "epoch": 2.9478207489257215, + "grad_norm": 0.30378973484039307, + "learning_rate": 8.271946684212833e-05, + "loss": 1.87, + "step": 9604 + }, + { + "epoch": 2.9481276856967464, + "grad_norm": 0.5197327136993408, + "learning_rate": 8.271570815292447e-05, + "loss": 1.8109, + "step": 9605 + }, + { + "epoch": 2.9484346224677718, + 
"grad_norm": 0.7213841080665588, + "learning_rate": 8.271194914040478e-05, + "loss": 1.8526, + "step": 9606 + }, + { + "epoch": 2.9487415592387967, + "grad_norm": 0.5521572232246399, + "learning_rate": 8.270818980460643e-05, + "loss": 1.7982, + "step": 9607 + }, + { + "epoch": 2.949048496009822, + "grad_norm": 0.3072868287563324, + "learning_rate": 8.27044301455666e-05, + "loss": 1.8708, + "step": 9608 + }, + { + "epoch": 2.9493554327808473, + "grad_norm": 0.5477200746536255, + "learning_rate": 8.270067016332241e-05, + "loss": 1.8708, + "step": 9609 + }, + { + "epoch": 2.949662369551872, + "grad_norm": 0.5991030335426331, + "learning_rate": 8.269690985791104e-05, + "loss": 1.7983, + "step": 9610 + }, + { + "epoch": 2.9499693063228976, + "grad_norm": 0.33343803882598877, + "learning_rate": 8.269314922936964e-05, + "loss": 1.7867, + "step": 9611 + }, + { + "epoch": 2.9502762430939224, + "grad_norm": 0.3671727776527405, + "learning_rate": 8.268938827773538e-05, + "loss": 1.9604, + "step": 9612 + }, + { + "epoch": 2.950583179864948, + "grad_norm": 0.5015503764152527, + "learning_rate": 8.26856270030454e-05, + "loss": 1.8424, + "step": 9613 + }, + { + "epoch": 2.950890116635973, + "grad_norm": 0.4369170367717743, + "learning_rate": 8.268186540533693e-05, + "loss": 1.7915, + "step": 9614 + }, + { + "epoch": 2.951197053406998, + "grad_norm": 0.2739746868610382, + "learning_rate": 8.267810348464709e-05, + "loss": 1.7816, + "step": 9615 + }, + { + "epoch": 2.9515039901780233, + "grad_norm": 0.3660983145236969, + "learning_rate": 8.26743412410131e-05, + "loss": 1.8235, + "step": 9616 + }, + { + "epoch": 2.9518109269490482, + "grad_norm": 0.44442248344421387, + "learning_rate": 8.26705786744721e-05, + "loss": 1.8566, + "step": 9617 + }, + { + "epoch": 2.9521178637200736, + "grad_norm": 0.28847622871398926, + "learning_rate": 8.266681578506129e-05, + "loss": 1.82, + "step": 9618 + }, + { + "epoch": 2.952424800491099, + "grad_norm": 0.32827475666999817, + "learning_rate": 
8.266305257281786e-05, + "loss": 1.8422, + "step": 9619 + }, + { + "epoch": 2.9527317372621242, + "grad_norm": 0.3459654748439789, + "learning_rate": 8.265928903777902e-05, + "loss": 1.7919, + "step": 9620 + }, + { + "epoch": 2.953038674033149, + "grad_norm": 0.31467050313949585, + "learning_rate": 8.265552517998191e-05, + "loss": 1.8178, + "step": 9621 + }, + { + "epoch": 2.9533456108041745, + "grad_norm": 0.2814936935901642, + "learning_rate": 8.265176099946381e-05, + "loss": 1.7823, + "step": 9622 + }, + { + "epoch": 2.9536525475751993, + "grad_norm": 0.36387261748313904, + "learning_rate": 8.264799649626182e-05, + "loss": 1.7861, + "step": 9623 + }, + { + "epoch": 2.9539594843462247, + "grad_norm": 0.3504095673561096, + "learning_rate": 8.264423167041322e-05, + "loss": 1.8216, + "step": 9624 + }, + { + "epoch": 2.95426642111725, + "grad_norm": 0.28199300169944763, + "learning_rate": 8.264046652195519e-05, + "loss": 1.8397, + "step": 9625 + }, + { + "epoch": 2.954573357888275, + "grad_norm": 0.435774028301239, + "learning_rate": 8.263670105092494e-05, + "loss": 1.8316, + "step": 9626 + }, + { + "epoch": 2.9548802946593002, + "grad_norm": 0.37712937593460083, + "learning_rate": 8.263293525735967e-05, + "loss": 1.8089, + "step": 9627 + }, + { + "epoch": 2.955187231430325, + "grad_norm": 0.34833967685699463, + "learning_rate": 8.26291691412966e-05, + "loss": 1.8324, + "step": 9628 + }, + { + "epoch": 2.9554941682013505, + "grad_norm": 0.37515538930892944, + "learning_rate": 8.262540270277297e-05, + "loss": 1.7958, + "step": 9629 + }, + { + "epoch": 2.955801104972376, + "grad_norm": 0.3392273485660553, + "learning_rate": 8.262163594182598e-05, + "loss": 1.8322, + "step": 9630 + }, + { + "epoch": 2.9561080417434007, + "grad_norm": 0.3477925956249237, + "learning_rate": 8.261786885849287e-05, + "loss": 1.8525, + "step": 9631 + }, + { + "epoch": 2.956414978514426, + "grad_norm": 0.35574036836624146, + "learning_rate": 8.261410145281085e-05, + "loss": 1.8148, + "step": 
9632 + }, + { + "epoch": 2.9567219152854514, + "grad_norm": 0.3166620135307312, + "learning_rate": 8.261033372481717e-05, + "loss": 1.7914, + "step": 9633 + }, + { + "epoch": 2.9570288520564763, + "grad_norm": 0.2562217116355896, + "learning_rate": 8.260656567454907e-05, + "loss": 1.7794, + "step": 9634 + }, + { + "epoch": 2.9573357888275016, + "grad_norm": 0.3328792452812195, + "learning_rate": 8.260279730204377e-05, + "loss": 1.8235, + "step": 9635 + }, + { + "epoch": 2.957642725598527, + "grad_norm": 0.33144834637641907, + "learning_rate": 8.259902860733852e-05, + "loss": 1.7668, + "step": 9636 + }, + { + "epoch": 2.957949662369552, + "grad_norm": 0.30557021498680115, + "learning_rate": 8.259525959047056e-05, + "loss": 1.9135, + "step": 9637 + }, + { + "epoch": 2.958256599140577, + "grad_norm": 0.2901468575000763, + "learning_rate": 8.259149025147713e-05, + "loss": 1.8023, + "step": 9638 + }, + { + "epoch": 2.958563535911602, + "grad_norm": 0.35177919268608093, + "learning_rate": 8.25877205903955e-05, + "loss": 1.8541, + "step": 9639 + }, + { + "epoch": 2.9588704726826274, + "grad_norm": 0.2745177447795868, + "learning_rate": 8.258395060726291e-05, + "loss": 1.8103, + "step": 9640 + }, + { + "epoch": 2.9591774094536527, + "grad_norm": 0.29005685448646545, + "learning_rate": 8.258018030211663e-05, + "loss": 1.7587, + "step": 9641 + }, + { + "epoch": 2.9594843462246776, + "grad_norm": 0.27498918771743774, + "learning_rate": 8.257640967499391e-05, + "loss": 1.8052, + "step": 9642 + }, + { + "epoch": 2.959791282995703, + "grad_norm": 0.2689644694328308, + "learning_rate": 8.257263872593202e-05, + "loss": 1.8582, + "step": 9643 + }, + { + "epoch": 2.960098219766728, + "grad_norm": 0.2953707277774811, + "learning_rate": 8.256886745496821e-05, + "loss": 1.7654, + "step": 9644 + }, + { + "epoch": 2.960405156537753, + "grad_norm": 0.2573971450328827, + "learning_rate": 8.256509586213978e-05, + "loss": 1.7819, + "step": 9645 + }, + { + "epoch": 2.9607120933087785, + 
"grad_norm": 0.29667192697525024, + "learning_rate": 8.256132394748398e-05, + "loss": 1.8632, + "step": 9646 + }, + { + "epoch": 2.961019030079804, + "grad_norm": 0.2953830361366272, + "learning_rate": 8.255755171103808e-05, + "loss": 1.8672, + "step": 9647 + }, + { + "epoch": 2.9613259668508287, + "grad_norm": 0.2925500273704529, + "learning_rate": 8.255377915283937e-05, + "loss": 1.8691, + "step": 9648 + }, + { + "epoch": 2.961632903621854, + "grad_norm": 0.32245302200317383, + "learning_rate": 8.255000627292515e-05, + "loss": 1.8701, + "step": 9649 + }, + { + "epoch": 2.961939840392879, + "grad_norm": 0.2671414315700531, + "learning_rate": 8.254623307133268e-05, + "loss": 1.8045, + "step": 9650 + }, + { + "epoch": 2.9622467771639043, + "grad_norm": 0.3135749101638794, + "learning_rate": 8.254245954809928e-05, + "loss": 1.7573, + "step": 9651 + }, + { + "epoch": 2.9625537139349296, + "grad_norm": 0.2604369521141052, + "learning_rate": 8.253868570326218e-05, + "loss": 1.8513, + "step": 9652 + }, + { + "epoch": 2.9628606507059545, + "grad_norm": 0.24657092988491058, + "learning_rate": 8.253491153685875e-05, + "loss": 1.8303, + "step": 9653 + }, + { + "epoch": 2.96316758747698, + "grad_norm": 0.24310527741909027, + "learning_rate": 8.253113704892623e-05, + "loss": 1.7648, + "step": 9654 + }, + { + "epoch": 2.9634745242480047, + "grad_norm": 0.24558408558368683, + "learning_rate": 8.252736223950198e-05, + "loss": 1.7517, + "step": 9655 + }, + { + "epoch": 2.96378146101903, + "grad_norm": 0.2500043511390686, + "learning_rate": 8.252358710862324e-05, + "loss": 1.7588, + "step": 9656 + }, + { + "epoch": 2.9640883977900554, + "grad_norm": 0.2532055079936981, + "learning_rate": 8.251981165632737e-05, + "loss": 1.8414, + "step": 9657 + }, + { + "epoch": 2.9643953345610803, + "grad_norm": 0.2692684829235077, + "learning_rate": 8.251603588265165e-05, + "loss": 1.8701, + "step": 9658 + }, + { + "epoch": 2.9647022713321056, + "grad_norm": 0.2511022984981537, + "learning_rate": 
8.251225978763341e-05, + "loss": 1.8068, + "step": 9659 + }, + { + "epoch": 2.9650092081031305, + "grad_norm": 0.24702081084251404, + "learning_rate": 8.250848337130997e-05, + "loss": 1.7993, + "step": 9660 + }, + { + "epoch": 2.965316144874156, + "grad_norm": 0.26960623264312744, + "learning_rate": 8.250470663371862e-05, + "loss": 1.8269, + "step": 9661 + }, + { + "epoch": 2.965623081645181, + "grad_norm": 0.2651064693927765, + "learning_rate": 8.250092957489673e-05, + "loss": 1.8235, + "step": 9662 + }, + { + "epoch": 2.9659300184162065, + "grad_norm": 0.3117934465408325, + "learning_rate": 8.249715219488158e-05, + "loss": 1.9603, + "step": 9663 + }, + { + "epoch": 2.9662369551872314, + "grad_norm": 0.3244706988334656, + "learning_rate": 8.249337449371055e-05, + "loss": 1.8766, + "step": 9664 + }, + { + "epoch": 2.9665438919582567, + "grad_norm": 0.3071763515472412, + "learning_rate": 8.248959647142094e-05, + "loss": 1.8118, + "step": 9665 + }, + { + "epoch": 2.9668508287292816, + "grad_norm": 0.2575626075267792, + "learning_rate": 8.24858181280501e-05, + "loss": 1.8578, + "step": 9666 + }, + { + "epoch": 2.967157765500307, + "grad_norm": 0.369356244802475, + "learning_rate": 8.248203946363535e-05, + "loss": 1.7831, + "step": 9667 + }, + { + "epoch": 2.9674647022713323, + "grad_norm": 0.317775160074234, + "learning_rate": 8.247826047821405e-05, + "loss": 1.8839, + "step": 9668 + }, + { + "epoch": 2.967771639042357, + "grad_norm": 0.31816980242729187, + "learning_rate": 8.247448117182355e-05, + "loss": 1.8111, + "step": 9669 + }, + { + "epoch": 2.9680785758133825, + "grad_norm": 0.2943781316280365, + "learning_rate": 8.247070154450119e-05, + "loss": 1.848, + "step": 9670 + }, + { + "epoch": 2.9683855125844074, + "grad_norm": 0.28252434730529785, + "learning_rate": 8.246692159628433e-05, + "loss": 1.8601, + "step": 9671 + }, + { + "epoch": 2.9686924493554327, + "grad_norm": 0.29150691628456116, + "learning_rate": 8.246314132721032e-05, + "loss": 1.7738, + "step": 
9672 + }, + { + "epoch": 2.968999386126458, + "grad_norm": 0.3699757754802704, + "learning_rate": 8.245936073731653e-05, + "loss": 1.842, + "step": 9673 + }, + { + "epoch": 2.969306322897483, + "grad_norm": 0.37951794266700745, + "learning_rate": 8.245557982664031e-05, + "loss": 1.8648, + "step": 9674 + }, + { + "epoch": 2.9696132596685083, + "grad_norm": 0.2792273461818695, + "learning_rate": 8.245179859521901e-05, + "loss": 1.889, + "step": 9675 + }, + { + "epoch": 2.969920196439533, + "grad_norm": 0.3405047059059143, + "learning_rate": 8.244801704309002e-05, + "loss": 1.7658, + "step": 9676 + }, + { + "epoch": 2.9702271332105585, + "grad_norm": 0.40138551592826843, + "learning_rate": 8.244423517029072e-05, + "loss": 1.79, + "step": 9677 + }, + { + "epoch": 2.970534069981584, + "grad_norm": 0.42260462045669556, + "learning_rate": 8.244045297685846e-05, + "loss": 1.9248, + "step": 9678 + }, + { + "epoch": 2.970841006752609, + "grad_norm": 0.30391061305999756, + "learning_rate": 8.243667046283063e-05, + "loss": 1.7922, + "step": 9679 + }, + { + "epoch": 2.971147943523634, + "grad_norm": 0.3194752037525177, + "learning_rate": 8.243288762824463e-05, + "loss": 1.8582, + "step": 9680 + }, + { + "epoch": 2.9714548802946594, + "grad_norm": 0.47853100299835205, + "learning_rate": 8.24291044731378e-05, + "loss": 1.8206, + "step": 9681 + }, + { + "epoch": 2.9717618170656843, + "grad_norm": 0.47428956627845764, + "learning_rate": 8.242532099754756e-05, + "loss": 1.8271, + "step": 9682 + }, + { + "epoch": 2.9720687538367097, + "grad_norm": 0.30275169014930725, + "learning_rate": 8.24215372015113e-05, + "loss": 1.8532, + "step": 9683 + }, + { + "epoch": 2.972375690607735, + "grad_norm": 0.31766825914382935, + "learning_rate": 8.24177530850664e-05, + "loss": 1.7751, + "step": 9684 + }, + { + "epoch": 2.97268262737876, + "grad_norm": 0.3738986551761627, + "learning_rate": 8.241396864825026e-05, + "loss": 1.7644, + "step": 9685 + }, + { + "epoch": 2.972989564149785, + 
"grad_norm": 0.2794596254825592, + "learning_rate": 8.24101838911003e-05, + "loss": 1.7445, + "step": 9686 + }, + { + "epoch": 2.97329650092081, + "grad_norm": 0.30008718371391296, + "learning_rate": 8.240639881365388e-05, + "loss": 1.8181, + "step": 9687 + }, + { + "epoch": 2.9736034376918354, + "grad_norm": 0.36667200922966003, + "learning_rate": 8.240261341594846e-05, + "loss": 1.8606, + "step": 9688 + }, + { + "epoch": 2.9739103744628608, + "grad_norm": 0.2943612039089203, + "learning_rate": 8.23988276980214e-05, + "loss": 1.8169, + "step": 9689 + }, + { + "epoch": 2.9742173112338857, + "grad_norm": 0.3499365746974945, + "learning_rate": 8.239504165991015e-05, + "loss": 1.8901, + "step": 9690 + }, + { + "epoch": 2.974524248004911, + "grad_norm": 0.35552978515625, + "learning_rate": 8.239125530165211e-05, + "loss": 1.8266, + "step": 9691 + }, + { + "epoch": 2.974831184775936, + "grad_norm": 0.35415011644363403, + "learning_rate": 8.23874686232847e-05, + "loss": 1.8588, + "step": 9692 + }, + { + "epoch": 2.9751381215469612, + "grad_norm": 0.3237420618534088, + "learning_rate": 8.238368162484533e-05, + "loss": 1.8112, + "step": 9693 + }, + { + "epoch": 2.9754450583179866, + "grad_norm": 0.31672203540802, + "learning_rate": 8.237989430637145e-05, + "loss": 1.7983, + "step": 9694 + }, + { + "epoch": 2.975751995089012, + "grad_norm": 0.2926657795906067, + "learning_rate": 8.237610666790048e-05, + "loss": 1.8137, + "step": 9695 + }, + { + "epoch": 2.976058931860037, + "grad_norm": 0.2924230992794037, + "learning_rate": 8.237231870946983e-05, + "loss": 1.8789, + "step": 9696 + }, + { + "epoch": 2.976365868631062, + "grad_norm": 0.2768077850341797, + "learning_rate": 8.236853043111697e-05, + "loss": 1.8643, + "step": 9697 + }, + { + "epoch": 2.976672805402087, + "grad_norm": 0.24151389300823212, + "learning_rate": 8.23647418328793e-05, + "loss": 1.8245, + "step": 9698 + }, + { + "epoch": 2.9769797421731123, + "grad_norm": 0.24514195322990417, + "learning_rate": 
8.23609529147943e-05, + "loss": 1.761, + "step": 9699 + }, + { + "epoch": 2.9772866789441377, + "grad_norm": 0.2619125545024872, + "learning_rate": 8.235716367689938e-05, + "loss": 1.8445, + "step": 9700 + }, + { + "epoch": 2.9775936157151626, + "grad_norm": 0.2570437490940094, + "learning_rate": 8.235337411923203e-05, + "loss": 1.7881, + "step": 9701 + }, + { + "epoch": 2.977900552486188, + "grad_norm": 0.288775235414505, + "learning_rate": 8.234958424182966e-05, + "loss": 1.8177, + "step": 9702 + }, + { + "epoch": 2.978207489257213, + "grad_norm": 0.3186240792274475, + "learning_rate": 8.234579404472973e-05, + "loss": 1.8438, + "step": 9703 + }, + { + "epoch": 2.978514426028238, + "grad_norm": 0.2520117163658142, + "learning_rate": 8.23420035279697e-05, + "loss": 1.7791, + "step": 9704 + }, + { + "epoch": 2.9788213627992635, + "grad_norm": 0.23164312541484833, + "learning_rate": 8.233821269158706e-05, + "loss": 1.7368, + "step": 9705 + }, + { + "epoch": 2.979128299570289, + "grad_norm": 0.33843451738357544, + "learning_rate": 8.233442153561924e-05, + "loss": 1.8656, + "step": 9706 + }, + { + "epoch": 2.9794352363413137, + "grad_norm": 0.3070257604122162, + "learning_rate": 8.23306300601037e-05, + "loss": 1.7982, + "step": 9707 + }, + { + "epoch": 2.979742173112339, + "grad_norm": 0.29138872027397156, + "learning_rate": 8.232683826507793e-05, + "loss": 1.8227, + "step": 9708 + }, + { + "epoch": 2.980049109883364, + "grad_norm": 0.22698308527469635, + "learning_rate": 8.23230461505794e-05, + "loss": 1.7841, + "step": 9709 + }, + { + "epoch": 2.9803560466543892, + "grad_norm": 0.2597857713699341, + "learning_rate": 8.231925371664559e-05, + "loss": 1.7438, + "step": 9710 + }, + { + "epoch": 2.9806629834254146, + "grad_norm": 0.28672367334365845, + "learning_rate": 8.231546096331395e-05, + "loss": 1.8415, + "step": 9711 + }, + { + "epoch": 2.9809699201964395, + "grad_norm": 0.24295037984848022, + "learning_rate": 8.2311667890622e-05, + "loss": 1.8179, + "step": 9712 + 
}, + { + "epoch": 2.981276856967465, + "grad_norm": 0.24558894336223602, + "learning_rate": 8.23078744986072e-05, + "loss": 1.8092, + "step": 9713 + }, + { + "epoch": 2.9815837937384897, + "grad_norm": 0.2644276022911072, + "learning_rate": 8.230408078730706e-05, + "loss": 1.8214, + "step": 9714 + }, + { + "epoch": 2.981890730509515, + "grad_norm": 0.27007076144218445, + "learning_rate": 8.230028675675907e-05, + "loss": 1.8042, + "step": 9715 + }, + { + "epoch": 2.9821976672805404, + "grad_norm": 0.2729937732219696, + "learning_rate": 8.229649240700069e-05, + "loss": 1.8419, + "step": 9716 + }, + { + "epoch": 2.9825046040515653, + "grad_norm": 0.26545679569244385, + "learning_rate": 8.229269773806945e-05, + "loss": 1.823, + "step": 9717 + }, + { + "epoch": 2.9828115408225906, + "grad_norm": 0.23276878893375397, + "learning_rate": 8.228890275000285e-05, + "loss": 1.7635, + "step": 9718 + }, + { + "epoch": 2.9831184775936155, + "grad_norm": 0.28991779685020447, + "learning_rate": 8.228510744283837e-05, + "loss": 1.8303, + "step": 9719 + }, + { + "epoch": 2.983425414364641, + "grad_norm": 0.2821960151195526, + "learning_rate": 8.228131181661357e-05, + "loss": 1.8246, + "step": 9720 + }, + { + "epoch": 2.983732351135666, + "grad_norm": 0.25588423013687134, + "learning_rate": 8.22775158713659e-05, + "loss": 1.7764, + "step": 9721 + }, + { + "epoch": 2.9840392879066915, + "grad_norm": 0.2694758176803589, + "learning_rate": 8.227371960713289e-05, + "loss": 1.8026, + "step": 9722 + }, + { + "epoch": 2.9843462246777164, + "grad_norm": 0.27571097016334534, + "learning_rate": 8.226992302395209e-05, + "loss": 1.8051, + "step": 9723 + }, + { + "epoch": 2.9846531614487417, + "grad_norm": 0.2940119504928589, + "learning_rate": 8.226612612186099e-05, + "loss": 1.8782, + "step": 9724 + }, + { + "epoch": 2.9849600982197666, + "grad_norm": 0.34924936294555664, + "learning_rate": 8.226232890089711e-05, + "loss": 1.7845, + "step": 9725 + }, + { + "epoch": 2.985267034990792, + 
"grad_norm": 0.30503180623054504, + "learning_rate": 8.2258531361098e-05, + "loss": 1.8345, + "step": 9726 + }, + { + "epoch": 2.9855739717618173, + "grad_norm": 0.2463730275630951, + "learning_rate": 8.225473350250117e-05, + "loss": 1.8188, + "step": 9727 + }, + { + "epoch": 2.985880908532842, + "grad_norm": 0.3514629900455475, + "learning_rate": 8.225093532514417e-05, + "loss": 1.9253, + "step": 9728 + }, + { + "epoch": 2.9861878453038675, + "grad_norm": 0.26462769508361816, + "learning_rate": 8.224713682906449e-05, + "loss": 1.7396, + "step": 9729 + }, + { + "epoch": 2.9864947820748924, + "grad_norm": 0.27125996351242065, + "learning_rate": 8.224333801429973e-05, + "loss": 1.7784, + "step": 9730 + }, + { + "epoch": 2.9868017188459177, + "grad_norm": 0.3083387315273285, + "learning_rate": 8.22395388808874e-05, + "loss": 1.8503, + "step": 9731 + }, + { + "epoch": 2.987108655616943, + "grad_norm": 0.28289708495140076, + "learning_rate": 8.223573942886505e-05, + "loss": 1.8337, + "step": 9732 + }, + { + "epoch": 2.987415592387968, + "grad_norm": 0.3667753040790558, + "learning_rate": 8.223193965827023e-05, + "loss": 1.8213, + "step": 9733 + }, + { + "epoch": 2.9877225291589933, + "grad_norm": 0.3568948805332184, + "learning_rate": 8.222813956914049e-05, + "loss": 1.8337, + "step": 9734 + }, + { + "epoch": 2.988029465930018, + "grad_norm": 0.2883065640926361, + "learning_rate": 8.22243391615134e-05, + "loss": 1.7227, + "step": 9735 + }, + { + "epoch": 2.9883364027010435, + "grad_norm": 0.24940936267375946, + "learning_rate": 8.222053843542648e-05, + "loss": 1.7889, + "step": 9736 + }, + { + "epoch": 2.988643339472069, + "grad_norm": 0.31267982721328735, + "learning_rate": 8.221673739091732e-05, + "loss": 1.8432, + "step": 9737 + }, + { + "epoch": 2.988950276243094, + "grad_norm": 0.3552311658859253, + "learning_rate": 8.221293602802349e-05, + "loss": 1.8569, + "step": 9738 + }, + { + "epoch": 2.989257213014119, + "grad_norm": 0.4149966835975647, + "learning_rate": 
8.220913434678252e-05, + "loss": 1.8052, + "step": 9739 + }, + { + "epoch": 2.9895641497851444, + "grad_norm": 0.282320499420166, + "learning_rate": 8.220533234723204e-05, + "loss": 1.7629, + "step": 9740 + }, + { + "epoch": 2.9898710865561693, + "grad_norm": 0.27737030386924744, + "learning_rate": 8.220153002940958e-05, + "loss": 1.8331, + "step": 9741 + }, + { + "epoch": 2.9901780233271946, + "grad_norm": 0.29296645522117615, + "learning_rate": 8.219772739335272e-05, + "loss": 1.8414, + "step": 9742 + }, + { + "epoch": 2.99048496009822, + "grad_norm": 0.35226449370384216, + "learning_rate": 8.219392443909903e-05, + "loss": 1.8608, + "step": 9743 + }, + { + "epoch": 2.990791896869245, + "grad_norm": 0.3199223577976227, + "learning_rate": 8.219012116668612e-05, + "loss": 1.7868, + "step": 9744 + }, + { + "epoch": 2.99109883364027, + "grad_norm": 0.2904597818851471, + "learning_rate": 8.218631757615159e-05, + "loss": 1.8495, + "step": 9745 + }, + { + "epoch": 2.991405770411295, + "grad_norm": 0.34674009680747986, + "learning_rate": 8.218251366753298e-05, + "loss": 1.8143, + "step": 9746 + }, + { + "epoch": 2.9917127071823204, + "grad_norm": 0.38007479906082153, + "learning_rate": 8.217870944086791e-05, + "loss": 1.8534, + "step": 9747 + }, + { + "epoch": 2.9920196439533457, + "grad_norm": 0.31660130620002747, + "learning_rate": 8.217490489619398e-05, + "loss": 1.7807, + "step": 9748 + }, + { + "epoch": 2.9923265807243706, + "grad_norm": 0.2923539876937866, + "learning_rate": 8.217110003354877e-05, + "loss": 1.8517, + "step": 9749 + }, + { + "epoch": 2.992633517495396, + "grad_norm": 0.31018227338790894, + "learning_rate": 8.21672948529699e-05, + "loss": 1.7998, + "step": 9750 + }, + { + "epoch": 2.992940454266421, + "grad_norm": 0.29448994994163513, + "learning_rate": 8.216348935449496e-05, + "loss": 1.7883, + "step": 9751 + }, + { + "epoch": 2.993247391037446, + "grad_norm": 0.26120781898498535, + "learning_rate": 8.215968353816158e-05, + "loss": 1.7762, + "step": 
9752 + }, + { + "epoch": 2.9935543278084715, + "grad_norm": 0.27784180641174316, + "learning_rate": 8.215587740400735e-05, + "loss": 1.8711, + "step": 9753 + }, + { + "epoch": 2.993861264579497, + "grad_norm": 0.3106052577495575, + "learning_rate": 8.21520709520699e-05, + "loss": 1.8112, + "step": 9754 + }, + { + "epoch": 2.9941682013505218, + "grad_norm": 0.3170885145664215, + "learning_rate": 8.214826418238684e-05, + "loss": 1.8893, + "step": 9755 + }, + { + "epoch": 2.994475138121547, + "grad_norm": 0.2969432473182678, + "learning_rate": 8.214445709499577e-05, + "loss": 1.8628, + "step": 9756 + }, + { + "epoch": 2.994782074892572, + "grad_norm": 0.30484744906425476, + "learning_rate": 8.214064968993436e-05, + "loss": 1.8421, + "step": 9757 + }, + { + "epoch": 2.9950890116635973, + "grad_norm": 0.24819856882095337, + "learning_rate": 8.213684196724019e-05, + "loss": 1.8243, + "step": 9758 + }, + { + "epoch": 2.9953959484346226, + "grad_norm": 0.28566786646842957, + "learning_rate": 8.213303392695092e-05, + "loss": 1.8064, + "step": 9759 + }, + { + "epoch": 2.9957028852056475, + "grad_norm": 0.27742111682891846, + "learning_rate": 8.212922556910418e-05, + "loss": 1.8174, + "step": 9760 + }, + { + "epoch": 2.996009821976673, + "grad_norm": 0.27103090286254883, + "learning_rate": 8.212541689373761e-05, + "loss": 1.761, + "step": 9761 + }, + { + "epoch": 2.9963167587476978, + "grad_norm": 0.27157172560691833, + "learning_rate": 8.212160790088883e-05, + "loss": 1.8893, + "step": 9762 + }, + { + "epoch": 2.996623695518723, + "grad_norm": 0.2742370367050171, + "learning_rate": 8.21177985905955e-05, + "loss": 1.8774, + "step": 9763 + }, + { + "epoch": 2.9969306322897484, + "grad_norm": 0.26467064023017883, + "learning_rate": 8.211398896289524e-05, + "loss": 1.7805, + "step": 9764 + }, + { + "epoch": 2.9972375690607733, + "grad_norm": 0.2622149884700775, + "learning_rate": 8.211017901782574e-05, + "loss": 1.7346, + "step": 9765 + }, + { + "epoch": 2.9975445058317987, + 
"grad_norm": 0.3163202106952667, + "learning_rate": 8.210636875542462e-05, + "loss": 1.8348, + "step": 9766 + }, + { + "epoch": 2.9978514426028235, + "grad_norm": 0.2789528965950012, + "learning_rate": 8.210255817572955e-05, + "loss": 1.7535, + "step": 9767 + }, + { + "epoch": 2.998158379373849, + "grad_norm": 0.25694188475608826, + "learning_rate": 8.209874727877818e-05, + "loss": 1.8731, + "step": 9768 + }, + { + "epoch": 2.998465316144874, + "grad_norm": 0.40298742055892944, + "learning_rate": 8.209493606460818e-05, + "loss": 1.7924, + "step": 9769 + }, + { + "epoch": 2.9987722529158995, + "grad_norm": 0.5090280771255493, + "learning_rate": 8.20911245332572e-05, + "loss": 1.8253, + "step": 9770 + }, + { + "epoch": 2.9990791896869244, + "grad_norm": 0.41809162497520447, + "learning_rate": 8.208731268476293e-05, + "loss": 1.8233, + "step": 9771 + }, + { + "epoch": 2.9993861264579498, + "grad_norm": 0.23141434788703918, + "learning_rate": 8.208350051916303e-05, + "loss": 1.7842, + "step": 9772 + }, + { + "epoch": 2.9996930632289747, + "grad_norm": 0.3174372613430023, + "learning_rate": 8.207968803649517e-05, + "loss": 1.8477, + "step": 9773 + }, + { + "epoch": 3.0, + "grad_norm": 0.41795292496681213, + "learning_rate": 8.207587523679704e-05, + "loss": 1.8407, + "step": 9774 + }, + { + "epoch": 3.0003069367710253, + "grad_norm": 0.43365660309791565, + "learning_rate": 8.20720621201063e-05, + "loss": 1.8074, + "step": 9775 + }, + { + "epoch": 3.0006138735420502, + "grad_norm": 0.461374968290329, + "learning_rate": 8.206824868646064e-05, + "loss": 1.9089, + "step": 9776 + }, + { + "epoch": 3.0009208103130756, + "grad_norm": 0.3747929632663727, + "learning_rate": 8.206443493589776e-05, + "loss": 1.8358, + "step": 9777 + }, + { + "epoch": 3.001227747084101, + "grad_norm": 0.28436774015426636, + "learning_rate": 8.206062086845532e-05, + "loss": 1.8527, + "step": 9778 + }, + { + "epoch": 3.001534683855126, + "grad_norm": 0.33642131090164185, + "learning_rate": 
8.205680648417106e-05, + "loss": 1.8142, + "step": 9779 + }, + { + "epoch": 3.001841620626151, + "grad_norm": 0.4283481240272522, + "learning_rate": 8.205299178308263e-05, + "loss": 1.9006, + "step": 9780 + }, + { + "epoch": 3.002148557397176, + "grad_norm": 0.34405630826950073, + "learning_rate": 8.204917676522777e-05, + "loss": 1.7988, + "step": 9781 + }, + { + "epoch": 3.0024554941682013, + "grad_norm": 0.3161070942878723, + "learning_rate": 8.204536143064414e-05, + "loss": 1.8271, + "step": 9782 + }, + { + "epoch": 3.0027624309392267, + "grad_norm": 0.42518749833106995, + "learning_rate": 8.204154577936946e-05, + "loss": 1.864, + "step": 9783 + }, + { + "epoch": 3.0030693677102516, + "grad_norm": 0.3760852813720703, + "learning_rate": 8.203772981144146e-05, + "loss": 1.8543, + "step": 9784 + }, + { + "epoch": 3.003376304481277, + "grad_norm": 0.32794755697250366, + "learning_rate": 8.203391352689784e-05, + "loss": 1.8776, + "step": 9785 + }, + { + "epoch": 3.0036832412523022, + "grad_norm": 0.3053889274597168, + "learning_rate": 8.20300969257763e-05, + "loss": 1.8064, + "step": 9786 + }, + { + "epoch": 3.003990178023327, + "grad_norm": 0.40283143520355225, + "learning_rate": 8.202628000811456e-05, + "loss": 1.8083, + "step": 9787 + }, + { + "epoch": 3.0042971147943525, + "grad_norm": 0.49270665645599365, + "learning_rate": 8.202246277395038e-05, + "loss": 1.802, + "step": 9788 + }, + { + "epoch": 3.0046040515653774, + "grad_norm": 0.4373023211956024, + "learning_rate": 8.201864522332143e-05, + "loss": 1.8429, + "step": 9789 + }, + { + "epoch": 3.0049109883364027, + "grad_norm": 0.3136310875415802, + "learning_rate": 8.201482735626547e-05, + "loss": 1.8224, + "step": 9790 + }, + { + "epoch": 3.005217925107428, + "grad_norm": 0.3306807279586792, + "learning_rate": 8.201100917282023e-05, + "loss": 1.8463, + "step": 9791 + }, + { + "epoch": 3.005524861878453, + "grad_norm": 0.45082196593284607, + "learning_rate": 8.200719067302342e-05, + "loss": 1.7587, + "step": 
9792 + }, + { + "epoch": 3.0058317986494782, + "grad_norm": 0.49246448278427124, + "learning_rate": 8.20033718569128e-05, + "loss": 1.8245, + "step": 9793 + }, + { + "epoch": 3.0061387354205036, + "grad_norm": 0.3040246367454529, + "learning_rate": 8.199955272452609e-05, + "loss": 1.8309, + "step": 9794 + }, + { + "epoch": 3.0064456721915285, + "grad_norm": 0.3909318149089813, + "learning_rate": 8.199573327590105e-05, + "loss": 1.8187, + "step": 9795 + }, + { + "epoch": 3.006752608962554, + "grad_norm": 0.5753183960914612, + "learning_rate": 8.199191351107543e-05, + "loss": 1.826, + "step": 9796 + }, + { + "epoch": 3.0070595457335787, + "grad_norm": 0.48908689618110657, + "learning_rate": 8.198809343008695e-05, + "loss": 1.8475, + "step": 9797 + }, + { + "epoch": 3.007366482504604, + "grad_norm": 0.31570208072662354, + "learning_rate": 8.198427303297341e-05, + "loss": 1.8046, + "step": 9798 + }, + { + "epoch": 3.0076734192756294, + "grad_norm": 0.39205440878868103, + "learning_rate": 8.198045231977251e-05, + "loss": 1.8413, + "step": 9799 + }, + { + "epoch": 3.0079803560466543, + "grad_norm": 0.5117597579956055, + "learning_rate": 8.197663129052204e-05, + "loss": 1.8184, + "step": 9800 + }, + { + "epoch": 3.0082872928176796, + "grad_norm": 0.3623514175415039, + "learning_rate": 8.197280994525978e-05, + "loss": 1.8292, + "step": 9801 + }, + { + "epoch": 3.008594229588705, + "grad_norm": 0.2826726734638214, + "learning_rate": 8.196898828402344e-05, + "loss": 1.8216, + "step": 9802 + }, + { + "epoch": 3.00890116635973, + "grad_norm": 0.38658398389816284, + "learning_rate": 8.196516630685085e-05, + "loss": 1.867, + "step": 9803 + }, + { + "epoch": 3.009208103130755, + "grad_norm": 0.3371698260307312, + "learning_rate": 8.196134401377973e-05, + "loss": 1.8077, + "step": 9804 + }, + { + "epoch": 3.00951503990178, + "grad_norm": 0.24108785390853882, + "learning_rate": 8.195752140484789e-05, + "loss": 1.7858, + "step": 9805 + }, + { + "epoch": 3.0098219766728054, + 
"grad_norm": 0.34410104155540466, + "learning_rate": 8.195369848009309e-05, + "loss": 1.801, + "step": 9806 + }, + { + "epoch": 3.0101289134438307, + "grad_norm": 0.3412116467952728, + "learning_rate": 8.194987523955311e-05, + "loss": 1.7905, + "step": 9807 + }, + { + "epoch": 3.0104358502148556, + "grad_norm": 0.2473030537366867, + "learning_rate": 8.194605168326573e-05, + "loss": 1.7765, + "step": 9808 + }, + { + "epoch": 3.010742786985881, + "grad_norm": 0.28590065240859985, + "learning_rate": 8.194222781126875e-05, + "loss": 1.7897, + "step": 9809 + }, + { + "epoch": 3.0110497237569063, + "grad_norm": 0.2994272708892822, + "learning_rate": 8.193840362359994e-05, + "loss": 1.7976, + "step": 9810 + }, + { + "epoch": 3.011356660527931, + "grad_norm": 0.2971307635307312, + "learning_rate": 8.193457912029713e-05, + "loss": 1.829, + "step": 9811 + }, + { + "epoch": 3.0116635972989565, + "grad_norm": 0.25149810314178467, + "learning_rate": 8.193075430139809e-05, + "loss": 1.7709, + "step": 9812 + }, + { + "epoch": 3.0119705340699814, + "grad_norm": 0.2561332583427429, + "learning_rate": 8.19269291669406e-05, + "loss": 1.7689, + "step": 9813 + }, + { + "epoch": 3.0122774708410067, + "grad_norm": 0.2658882141113281, + "learning_rate": 8.192310371696249e-05, + "loss": 1.8497, + "step": 9814 + }, + { + "epoch": 3.012584407612032, + "grad_norm": 0.2873780429363251, + "learning_rate": 8.191927795150156e-05, + "loss": 1.8217, + "step": 9815 + }, + { + "epoch": 3.012891344383057, + "grad_norm": 0.2181183248758316, + "learning_rate": 8.191545187059562e-05, + "loss": 1.7261, + "step": 9816 + }, + { + "epoch": 3.0131982811540823, + "grad_norm": 0.2414858490228653, + "learning_rate": 8.191162547428248e-05, + "loss": 1.8035, + "step": 9817 + }, + { + "epoch": 3.0135052179251076, + "grad_norm": 0.2799840271472931, + "learning_rate": 8.190779876259995e-05, + "loss": 1.8279, + "step": 9818 + }, + { + "epoch": 3.0138121546961325, + "grad_norm": 0.2669760584831238, + "learning_rate": 
8.190397173558584e-05, + "loss": 1.8155, + "step": 9819 + }, + { + "epoch": 3.014119091467158, + "grad_norm": 0.28857991099357605, + "learning_rate": 8.1900144393278e-05, + "loss": 1.8479, + "step": 9820 + }, + { + "epoch": 3.0144260282381827, + "grad_norm": 0.30534693598747253, + "learning_rate": 8.189631673571422e-05, + "loss": 1.8609, + "step": 9821 + }, + { + "epoch": 3.014732965009208, + "grad_norm": 0.3238218128681183, + "learning_rate": 8.189248876293236e-05, + "loss": 1.9292, + "step": 9822 + }, + { + "epoch": 3.0150399017802334, + "grad_norm": 0.3000536561012268, + "learning_rate": 8.188866047497022e-05, + "loss": 1.8214, + "step": 9823 + }, + { + "epoch": 3.0153468385512583, + "grad_norm": 0.2960065007209778, + "learning_rate": 8.188483187186565e-05, + "loss": 1.8316, + "step": 9824 + }, + { + "epoch": 3.0156537753222836, + "grad_norm": 0.28609779477119446, + "learning_rate": 8.188100295365648e-05, + "loss": 1.8002, + "step": 9825 + }, + { + "epoch": 3.015960712093309, + "grad_norm": 0.31390634179115295, + "learning_rate": 8.187717372038057e-05, + "loss": 1.8134, + "step": 9826 + }, + { + "epoch": 3.016267648864334, + "grad_norm": 0.28550946712493896, + "learning_rate": 8.187334417207573e-05, + "loss": 1.8359, + "step": 9827 + }, + { + "epoch": 3.016574585635359, + "grad_norm": 0.3085210621356964, + "learning_rate": 8.186951430877982e-05, + "loss": 1.813, + "step": 9828 + }, + { + "epoch": 3.016881522406384, + "grad_norm": 0.3043847978115082, + "learning_rate": 8.18656841305307e-05, + "loss": 1.8222, + "step": 9829 + }, + { + "epoch": 3.0171884591774094, + "grad_norm": 0.32524731755256653, + "learning_rate": 8.18618536373662e-05, + "loss": 1.8258, + "step": 9830 + }, + { + "epoch": 3.0174953959484347, + "grad_norm": 0.2690991461277008, + "learning_rate": 8.18580228293242e-05, + "loss": 1.8492, + "step": 9831 + }, + { + "epoch": 3.0178023327194596, + "grad_norm": 0.34936225414276123, + "learning_rate": 8.185419170644253e-05, + "loss": 1.8363, + "step": 
9832 + }, + { + "epoch": 3.018109269490485, + "grad_norm": 0.3274296820163727, + "learning_rate": 8.185036026875908e-05, + "loss": 1.7789, + "step": 9833 + }, + { + "epoch": 3.0184162062615103, + "grad_norm": 0.2729836106300354, + "learning_rate": 8.184652851631169e-05, + "loss": 1.8264, + "step": 9834 + }, + { + "epoch": 3.018723143032535, + "grad_norm": 0.28682780265808105, + "learning_rate": 8.184269644913826e-05, + "loss": 1.8399, + "step": 9835 + }, + { + "epoch": 3.0190300798035605, + "grad_norm": 0.3224826455116272, + "learning_rate": 8.183886406727662e-05, + "loss": 1.8338, + "step": 9836 + }, + { + "epoch": 3.0193370165745854, + "grad_norm": 0.30945318937301636, + "learning_rate": 8.183503137076467e-05, + "loss": 1.8248, + "step": 9837 + }, + { + "epoch": 3.0196439533456108, + "grad_norm": 0.27580398321151733, + "learning_rate": 8.183119835964029e-05, + "loss": 1.8096, + "step": 9838 + }, + { + "epoch": 3.019950890116636, + "grad_norm": 0.28927183151245117, + "learning_rate": 8.182736503394132e-05, + "loss": 1.825, + "step": 9839 + }, + { + "epoch": 3.020257826887661, + "grad_norm": 0.253000408411026, + "learning_rate": 8.182353139370571e-05, + "loss": 1.7678, + "step": 9840 + }, + { + "epoch": 3.0205647636586863, + "grad_norm": 0.2882022559642792, + "learning_rate": 8.18196974389713e-05, + "loss": 1.8895, + "step": 9841 + }, + { + "epoch": 3.0208717004297116, + "grad_norm": 0.26864609122276306, + "learning_rate": 8.1815863169776e-05, + "loss": 1.7674, + "step": 9842 + }, + { + "epoch": 3.0211786372007365, + "grad_norm": 0.27344849705696106, + "learning_rate": 8.181202858615769e-05, + "loss": 1.8146, + "step": 9843 + }, + { + "epoch": 3.021485573971762, + "grad_norm": 0.31659772992134094, + "learning_rate": 8.180819368815425e-05, + "loss": 1.8485, + "step": 9844 + }, + { + "epoch": 3.021792510742787, + "grad_norm": 0.3163176476955414, + "learning_rate": 8.18043584758036e-05, + "loss": 1.8994, + "step": 9845 + }, + { + "epoch": 3.022099447513812, + 
"grad_norm": 0.2583829462528229, + "learning_rate": 8.180052294914365e-05, + "loss": 1.764, + "step": 9846 + }, + { + "epoch": 3.0224063842848374, + "grad_norm": 0.3006649315357208, + "learning_rate": 8.179668710821227e-05, + "loss": 1.9232, + "step": 9847 + }, + { + "epoch": 3.0227133210558623, + "grad_norm": 0.35702988505363464, + "learning_rate": 8.179285095304741e-05, + "loss": 1.8403, + "step": 9848 + }, + { + "epoch": 3.0230202578268877, + "grad_norm": 0.29699379205703735, + "learning_rate": 8.178901448368697e-05, + "loss": 1.8412, + "step": 9849 + }, + { + "epoch": 3.023327194597913, + "grad_norm": 0.3022700548171997, + "learning_rate": 8.178517770016885e-05, + "loss": 1.8197, + "step": 9850 + }, + { + "epoch": 3.023634131368938, + "grad_norm": 0.2943836748600006, + "learning_rate": 8.178134060253097e-05, + "loss": 1.8127, + "step": 9851 + }, + { + "epoch": 3.023941068139963, + "grad_norm": 0.31290489435195923, + "learning_rate": 8.177750319081126e-05, + "loss": 1.821, + "step": 9852 + }, + { + "epoch": 3.0242480049109886, + "grad_norm": 0.30308374762535095, + "learning_rate": 8.177366546504763e-05, + "loss": 1.8522, + "step": 9853 + }, + { + "epoch": 3.0245549416820134, + "grad_norm": 0.301559716463089, + "learning_rate": 8.176982742527802e-05, + "loss": 1.8758, + "step": 9854 + }, + { + "epoch": 3.0248618784530388, + "grad_norm": 0.33314836025238037, + "learning_rate": 8.176598907154034e-05, + "loss": 1.8178, + "step": 9855 + }, + { + "epoch": 3.0251688152240637, + "grad_norm": 0.3567935526371002, + "learning_rate": 8.176215040387255e-05, + "loss": 1.7847, + "step": 9856 + }, + { + "epoch": 3.025475751995089, + "grad_norm": 0.27716195583343506, + "learning_rate": 8.175831142231258e-05, + "loss": 1.772, + "step": 9857 + }, + { + "epoch": 3.0257826887661143, + "grad_norm": 0.24568212032318115, + "learning_rate": 8.175447212689836e-05, + "loss": 1.8171, + "step": 9858 + }, + { + "epoch": 3.0260896255371392, + "grad_norm": 0.25368261337280273, + 
"learning_rate": 8.175063251766784e-05, + "loss": 1.852, + "step": 9859 + }, + { + "epoch": 3.0263965623081646, + "grad_norm": 0.2509497404098511, + "learning_rate": 8.174679259465894e-05, + "loss": 1.7737, + "step": 9860 + }, + { + "epoch": 3.02670349907919, + "grad_norm": 0.3539343774318695, + "learning_rate": 8.174295235790963e-05, + "loss": 1.8663, + "step": 9861 + }, + { + "epoch": 3.027010435850215, + "grad_norm": 0.36450034379959106, + "learning_rate": 8.173911180745788e-05, + "loss": 1.8179, + "step": 9862 + }, + { + "epoch": 3.02731737262124, + "grad_norm": 0.3550017178058624, + "learning_rate": 8.173527094334162e-05, + "loss": 1.8256, + "step": 9863 + }, + { + "epoch": 3.027624309392265, + "grad_norm": 0.33518701791763306, + "learning_rate": 8.17314297655988e-05, + "loss": 1.7842, + "step": 9864 + }, + { + "epoch": 3.0279312461632903, + "grad_norm": 0.2522886097431183, + "learning_rate": 8.172758827426739e-05, + "loss": 1.7688, + "step": 9865 + }, + { + "epoch": 3.0282381829343157, + "grad_norm": 0.26222914457321167, + "learning_rate": 8.172374646938536e-05, + "loss": 1.8517, + "step": 9866 + }, + { + "epoch": 3.0285451197053406, + "grad_norm": 0.3355788588523865, + "learning_rate": 8.171990435099068e-05, + "loss": 1.9002, + "step": 9867 + }, + { + "epoch": 3.028852056476366, + "grad_norm": 0.32907500863075256, + "learning_rate": 8.171606191912131e-05, + "loss": 1.7801, + "step": 9868 + }, + { + "epoch": 3.0291589932473912, + "grad_norm": 0.29234179854393005, + "learning_rate": 8.171221917381523e-05, + "loss": 1.8055, + "step": 9869 + }, + { + "epoch": 3.029465930018416, + "grad_norm": 0.26374876499176025, + "learning_rate": 8.170837611511041e-05, + "loss": 1.781, + "step": 9870 + }, + { + "epoch": 3.0297728667894415, + "grad_norm": 0.311282217502594, + "learning_rate": 8.170453274304483e-05, + "loss": 1.839, + "step": 9871 + }, + { + "epoch": 3.0300798035604664, + "grad_norm": 0.24225831031799316, + "learning_rate": 8.170068905765648e-05, + "loss": 
1.804, + "step": 9872 + }, + { + "epoch": 3.0303867403314917, + "grad_norm": 0.29383334517478943, + "learning_rate": 8.169684505898335e-05, + "loss": 1.7817, + "step": 9873 + }, + { + "epoch": 3.030693677102517, + "grad_norm": 0.2607928514480591, + "learning_rate": 8.169300074706339e-05, + "loss": 1.8379, + "step": 9874 + }, + { + "epoch": 3.031000613873542, + "grad_norm": 0.283028244972229, + "learning_rate": 8.168915612193464e-05, + "loss": 1.7797, + "step": 9875 + }, + { + "epoch": 3.0313075506445673, + "grad_norm": 0.27675309777259827, + "learning_rate": 8.168531118363508e-05, + "loss": 1.8355, + "step": 9876 + }, + { + "epoch": 3.0316144874155926, + "grad_norm": 0.2598227262496948, + "learning_rate": 8.16814659322027e-05, + "loss": 1.7898, + "step": 9877 + }, + { + "epoch": 3.0319214241866175, + "grad_norm": 0.24715003371238708, + "learning_rate": 8.16776203676755e-05, + "loss": 1.7791, + "step": 9878 + }, + { + "epoch": 3.032228360957643, + "grad_norm": 0.2749374210834503, + "learning_rate": 8.167377449009149e-05, + "loss": 1.8303, + "step": 9879 + }, + { + "epoch": 3.0325352977286677, + "grad_norm": 0.26150834560394287, + "learning_rate": 8.166992829948868e-05, + "loss": 1.8462, + "step": 9880 + }, + { + "epoch": 3.032842234499693, + "grad_norm": 0.3044755160808563, + "learning_rate": 8.166608179590506e-05, + "loss": 1.806, + "step": 9881 + }, + { + "epoch": 3.0331491712707184, + "grad_norm": 0.2949555516242981, + "learning_rate": 8.166223497937868e-05, + "loss": 1.8785, + "step": 9882 + }, + { + "epoch": 3.0334561080417433, + "grad_norm": 0.33206698298454285, + "learning_rate": 8.165838784994752e-05, + "loss": 1.8476, + "step": 9883 + }, + { + "epoch": 3.0337630448127686, + "grad_norm": 0.2720400094985962, + "learning_rate": 8.165454040764962e-05, + "loss": 1.843, + "step": 9884 + }, + { + "epoch": 3.034069981583794, + "grad_norm": 0.29340869188308716, + "learning_rate": 8.1650692652523e-05, + "loss": 1.7761, + "step": 9885 + }, + { + "epoch": 
3.034376918354819, + "grad_norm": 0.35155293345451355, + "learning_rate": 8.16468445846057e-05, + "loss": 1.8887, + "step": 9886 + }, + { + "epoch": 3.034683855125844, + "grad_norm": 0.2688990831375122, + "learning_rate": 8.164299620393571e-05, + "loss": 1.8001, + "step": 9887 + }, + { + "epoch": 3.034990791896869, + "grad_norm": 0.2921253442764282, + "learning_rate": 8.16391475105511e-05, + "loss": 1.7951, + "step": 9888 + }, + { + "epoch": 3.0352977286678944, + "grad_norm": 0.28100699186325073, + "learning_rate": 8.163529850448988e-05, + "loss": 1.8041, + "step": 9889 + }, + { + "epoch": 3.0356046654389197, + "grad_norm": 0.3155081868171692, + "learning_rate": 8.16314491857901e-05, + "loss": 1.8026, + "step": 9890 + }, + { + "epoch": 3.0359116022099446, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.16275995544898e-05, + "loss": 1.8502, + "step": 9891 + }, + { + "epoch": 3.03621853898097, + "grad_norm": 0.2732076644897461, + "learning_rate": 8.162374961062704e-05, + "loss": 1.8424, + "step": 9892 + }, + { + "epoch": 3.0365254757519953, + "grad_norm": 0.2943679690361023, + "learning_rate": 8.161989935423984e-05, + "loss": 1.7635, + "step": 9893 + }, + { + "epoch": 3.03683241252302, + "grad_norm": 0.28894683718681335, + "learning_rate": 8.161604878536626e-05, + "loss": 1.78, + "step": 9894 + }, + { + "epoch": 3.0371393492940455, + "grad_norm": 0.2718082666397095, + "learning_rate": 8.161219790404435e-05, + "loss": 1.7664, + "step": 9895 + }, + { + "epoch": 3.0374462860650704, + "grad_norm": 0.29092124104499817, + "learning_rate": 8.160834671031216e-05, + "loss": 1.8621, + "step": 9896 + }, + { + "epoch": 3.0377532228360957, + "grad_norm": 0.284665584564209, + "learning_rate": 8.160449520420779e-05, + "loss": 1.8607, + "step": 9897 + }, + { + "epoch": 3.038060159607121, + "grad_norm": 0.23676982522010803, + "learning_rate": 8.160064338576925e-05, + "loss": 1.7137, + "step": 9898 + }, + { + "epoch": 3.038367096378146, + "grad_norm": 0.2666932940483093, + 
"learning_rate": 8.159679125503466e-05, + "loss": 1.8038, + "step": 9899 + }, + { + "epoch": 3.0386740331491713, + "grad_norm": 0.36214375495910645, + "learning_rate": 8.159293881204204e-05, + "loss": 1.8902, + "step": 9900 + }, + { + "epoch": 3.0389809699201966, + "grad_norm": 0.30301332473754883, + "learning_rate": 8.158908605682948e-05, + "loss": 1.8456, + "step": 9901 + }, + { + "epoch": 3.0392879066912215, + "grad_norm": 0.32190418243408203, + "learning_rate": 8.158523298943506e-05, + "loss": 1.8246, + "step": 9902 + }, + { + "epoch": 3.039594843462247, + "grad_norm": 0.2938043475151062, + "learning_rate": 8.158137960989685e-05, + "loss": 1.8324, + "step": 9903 + }, + { + "epoch": 3.0399017802332717, + "grad_norm": 0.29493969678878784, + "learning_rate": 8.157752591825294e-05, + "loss": 1.8458, + "step": 9904 + }, + { + "epoch": 3.040208717004297, + "grad_norm": 0.2681889832019806, + "learning_rate": 8.157367191454141e-05, + "loss": 1.889, + "step": 9905 + }, + { + "epoch": 3.0405156537753224, + "grad_norm": 0.3111969232559204, + "learning_rate": 8.156981759880035e-05, + "loss": 1.8966, + "step": 9906 + }, + { + "epoch": 3.0408225905463473, + "grad_norm": 0.345262736082077, + "learning_rate": 8.156596297106784e-05, + "loss": 1.8174, + "step": 9907 + }, + { + "epoch": 3.0411295273173726, + "grad_norm": 0.30156534910202026, + "learning_rate": 8.156210803138199e-05, + "loss": 1.766, + "step": 9908 + }, + { + "epoch": 3.041436464088398, + "grad_norm": 0.28691565990448, + "learning_rate": 8.15582527797809e-05, + "loss": 1.8436, + "step": 9909 + }, + { + "epoch": 3.041743400859423, + "grad_norm": 0.33418282866477966, + "learning_rate": 8.155439721630264e-05, + "loss": 1.8939, + "step": 9910 + }, + { + "epoch": 3.042050337630448, + "grad_norm": 0.25496938824653625, + "learning_rate": 8.155054134098535e-05, + "loss": 1.8368, + "step": 9911 + }, + { + "epoch": 3.042357274401473, + "grad_norm": 0.3806788921356201, + "learning_rate": 8.154668515386711e-05, + "loss": 
1.8635, + "step": 9912 + }, + { + "epoch": 3.0426642111724984, + "grad_norm": 0.42668119072914124, + "learning_rate": 8.154282865498603e-05, + "loss": 1.76, + "step": 9913 + }, + { + "epoch": 3.0429711479435237, + "grad_norm": 0.35945314168930054, + "learning_rate": 8.153897184438024e-05, + "loss": 1.8275, + "step": 9914 + }, + { + "epoch": 3.0432780847145486, + "grad_norm": 0.3225449323654175, + "learning_rate": 8.153511472208784e-05, + "loss": 1.7901, + "step": 9915 + }, + { + "epoch": 3.043585021485574, + "grad_norm": 0.2905425727367401, + "learning_rate": 8.153125728814694e-05, + "loss": 1.8021, + "step": 9916 + }, + { + "epoch": 3.0438919582565993, + "grad_norm": 0.3315529525279999, + "learning_rate": 8.15273995425957e-05, + "loss": 1.8003, + "step": 9917 + }, + { + "epoch": 3.044198895027624, + "grad_norm": 0.30256444215774536, + "learning_rate": 8.152354148547221e-05, + "loss": 1.8243, + "step": 9918 + }, + { + "epoch": 3.0445058317986495, + "grad_norm": 0.2563035190105438, + "learning_rate": 8.15196831168146e-05, + "loss": 1.7877, + "step": 9919 + }, + { + "epoch": 3.044812768569675, + "grad_norm": 0.25705814361572266, + "learning_rate": 8.151582443666101e-05, + "loss": 1.813, + "step": 9920 + }, + { + "epoch": 3.0451197053406998, + "grad_norm": 0.3649071455001831, + "learning_rate": 8.151196544504957e-05, + "loss": 1.8114, + "step": 9921 + }, + { + "epoch": 3.045426642111725, + "grad_norm": 0.4076193571090698, + "learning_rate": 8.150810614201841e-05, + "loss": 1.7869, + "step": 9922 + }, + { + "epoch": 3.04573357888275, + "grad_norm": 0.2951984107494354, + "learning_rate": 8.150424652760569e-05, + "loss": 1.7878, + "step": 9923 + }, + { + "epoch": 3.0460405156537753, + "grad_norm": 0.2243243157863617, + "learning_rate": 8.150038660184955e-05, + "loss": 1.8224, + "step": 9924 + }, + { + "epoch": 3.0463474524248007, + "grad_norm": 0.3295031487941742, + "learning_rate": 8.149652636478811e-05, + "loss": 1.8685, + "step": 9925 + }, + { + "epoch": 
3.0466543891958255, + "grad_norm": 0.2973531186580658, + "learning_rate": 8.149266581645954e-05, + "loss": 1.8082, + "step": 9926 + }, + { + "epoch": 3.046961325966851, + "grad_norm": 0.25648918747901917, + "learning_rate": 8.148880495690199e-05, + "loss": 1.8089, + "step": 9927 + }, + { + "epoch": 3.047268262737876, + "grad_norm": 0.2845752537250519, + "learning_rate": 8.148494378615361e-05, + "loss": 1.8726, + "step": 9928 + }, + { + "epoch": 3.047575199508901, + "grad_norm": 0.2917105555534363, + "learning_rate": 8.148108230425255e-05, + "loss": 1.8035, + "step": 9929 + }, + { + "epoch": 3.0478821362799264, + "grad_norm": 0.2775834798812866, + "learning_rate": 8.1477220511237e-05, + "loss": 1.8545, + "step": 9930 + }, + { + "epoch": 3.0481890730509513, + "grad_norm": 0.3522767424583435, + "learning_rate": 8.14733584071451e-05, + "loss": 1.8261, + "step": 9931 + }, + { + "epoch": 3.0484960098219767, + "grad_norm": 0.3759000599384308, + "learning_rate": 8.146949599201503e-05, + "loss": 1.8405, + "step": 9932 + }, + { + "epoch": 3.048802946593002, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.146563326588496e-05, + "loss": 1.7762, + "step": 9933 + }, + { + "epoch": 3.049109883364027, + "grad_norm": 0.263810932636261, + "learning_rate": 8.146177022879304e-05, + "loss": 1.7546, + "step": 9934 + }, + { + "epoch": 3.049416820135052, + "grad_norm": 0.24064256250858307, + "learning_rate": 8.14579068807775e-05, + "loss": 1.7903, + "step": 9935 + }, + { + "epoch": 3.0497237569060776, + "grad_norm": 0.3144194781780243, + "learning_rate": 8.145404322187645e-05, + "loss": 1.8011, + "step": 9936 + }, + { + "epoch": 3.0500306936771024, + "grad_norm": 0.3362879455089569, + "learning_rate": 8.145017925212812e-05, + "loss": 1.8224, + "step": 9937 + }, + { + "epoch": 3.050337630448128, + "grad_norm": 0.33979395031929016, + "learning_rate": 8.144631497157071e-05, + "loss": 1.8415, + "step": 9938 + }, + { + "epoch": 3.0506445672191527, + "grad_norm": 0.33391237258911133, + 
"learning_rate": 8.144245038024235e-05, + "loss": 1.7983, + "step": 9939 + }, + { + "epoch": 3.050951503990178, + "grad_norm": 0.34034964442253113, + "learning_rate": 8.143858547818128e-05, + "loss": 1.8635, + "step": 9940 + }, + { + "epoch": 3.0512584407612033, + "grad_norm": 0.3472529947757721, + "learning_rate": 8.143472026542569e-05, + "loss": 1.8067, + "step": 9941 + }, + { + "epoch": 3.0515653775322282, + "grad_norm": 0.3369109630584717, + "learning_rate": 8.143085474201376e-05, + "loss": 1.7933, + "step": 9942 + }, + { + "epoch": 3.0518723143032536, + "grad_norm": 0.3055182993412018, + "learning_rate": 8.14269889079837e-05, + "loss": 1.7358, + "step": 9943 + }, + { + "epoch": 3.052179251074279, + "grad_norm": 0.26729708909988403, + "learning_rate": 8.142312276337372e-05, + "loss": 1.8315, + "step": 9944 + }, + { + "epoch": 3.052486187845304, + "grad_norm": 0.3626720607280731, + "learning_rate": 8.141925630822203e-05, + "loss": 1.7593, + "step": 9945 + }, + { + "epoch": 3.052793124616329, + "grad_norm": 0.3673512637615204, + "learning_rate": 8.141538954256683e-05, + "loss": 1.8414, + "step": 9946 + }, + { + "epoch": 3.053100061387354, + "grad_norm": 0.30554768443107605, + "learning_rate": 8.141152246644632e-05, + "loss": 1.7504, + "step": 9947 + }, + { + "epoch": 3.0534069981583793, + "grad_norm": 0.41163405776023865, + "learning_rate": 8.140765507989875e-05, + "loss": 1.8794, + "step": 9948 + }, + { + "epoch": 3.0537139349294047, + "grad_norm": 0.592751145362854, + "learning_rate": 8.140378738296233e-05, + "loss": 1.8538, + "step": 9949 + }, + { + "epoch": 3.0540208717004296, + "grad_norm": 0.483828604221344, + "learning_rate": 8.139991937567527e-05, + "loss": 1.7952, + "step": 9950 + }, + { + "epoch": 3.054327808471455, + "grad_norm": 0.26665306091308594, + "learning_rate": 8.13960510580758e-05, + "loss": 1.8268, + "step": 9951 + }, + { + "epoch": 3.0546347452424802, + "grad_norm": 0.42917072772979736, + "learning_rate": 8.139218243020215e-05, + "loss": 
1.843, + "step": 9952 + }, + { + "epoch": 3.054941682013505, + "grad_norm": 0.47911396622657776, + "learning_rate": 8.138831349209256e-05, + "loss": 1.8223, + "step": 9953 + }, + { + "epoch": 3.0552486187845305, + "grad_norm": 0.4540431797504425, + "learning_rate": 8.138444424378524e-05, + "loss": 1.9198, + "step": 9954 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.29719051718711853, + "learning_rate": 8.138057468531845e-05, + "loss": 1.7873, + "step": 9955 + }, + { + "epoch": 3.0558624923265807, + "grad_norm": 0.35133618116378784, + "learning_rate": 8.137670481673045e-05, + "loss": 1.8459, + "step": 9956 + }, + { + "epoch": 3.056169429097606, + "grad_norm": 0.42896488308906555, + "learning_rate": 8.137283463805945e-05, + "loss": 1.7814, + "step": 9957 + }, + { + "epoch": 3.056476365868631, + "grad_norm": 0.38993972539901733, + "learning_rate": 8.136896414934372e-05, + "loss": 1.7636, + "step": 9958 + }, + { + "epoch": 3.0567833026396563, + "grad_norm": 0.31362372636795044, + "learning_rate": 8.13650933506215e-05, + "loss": 1.8021, + "step": 9959 + }, + { + "epoch": 3.0570902394106816, + "grad_norm": 0.27980196475982666, + "learning_rate": 8.136122224193103e-05, + "loss": 1.8445, + "step": 9960 + }, + { + "epoch": 3.0573971761817065, + "grad_norm": 0.2721461057662964, + "learning_rate": 8.135735082331059e-05, + "loss": 1.7614, + "step": 9961 + }, + { + "epoch": 3.057704112952732, + "grad_norm": 0.25157424807548523, + "learning_rate": 8.135347909479843e-05, + "loss": 1.7598, + "step": 9962 + }, + { + "epoch": 3.0580110497237567, + "grad_norm": 0.25798025727272034, + "learning_rate": 8.13496070564328e-05, + "loss": 1.7823, + "step": 9963 + }, + { + "epoch": 3.058317986494782, + "grad_norm": 0.30775198340415955, + "learning_rate": 8.134573470825199e-05, + "loss": 1.7755, + "step": 9964 + }, + { + "epoch": 3.0586249232658074, + "grad_norm": 0.28916797041893005, + "learning_rate": 8.134186205029426e-05, + "loss": 1.8189, + "step": 9965 + }, + { + "epoch": 
3.0589318600368323, + "grad_norm": 0.2829149067401886, + "learning_rate": 8.133798908259787e-05, + "loss": 1.8546, + "step": 9966 + }, + { + "epoch": 3.0592387968078576, + "grad_norm": 0.2884117662906647, + "learning_rate": 8.13341158052011e-05, + "loss": 1.7705, + "step": 9967 + }, + { + "epoch": 3.059545733578883, + "grad_norm": 0.28311973810195923, + "learning_rate": 8.133024221814225e-05, + "loss": 1.8147, + "step": 9968 + }, + { + "epoch": 3.059852670349908, + "grad_norm": 0.25405213236808777, + "learning_rate": 8.132636832145957e-05, + "loss": 1.7813, + "step": 9969 + }, + { + "epoch": 3.060159607120933, + "grad_norm": 0.3082229793071747, + "learning_rate": 8.132249411519137e-05, + "loss": 1.8536, + "step": 9970 + }, + { + "epoch": 3.060466543891958, + "grad_norm": 0.29918181896209717, + "learning_rate": 8.13186195993759e-05, + "loss": 1.8181, + "step": 9971 + }, + { + "epoch": 3.0607734806629834, + "grad_norm": 0.3025238811969757, + "learning_rate": 8.13147447740515e-05, + "loss": 1.7785, + "step": 9972 + }, + { + "epoch": 3.0610804174340087, + "grad_norm": 0.2798222303390503, + "learning_rate": 8.131086963925643e-05, + "loss": 1.7873, + "step": 9973 + }, + { + "epoch": 3.0613873542050336, + "grad_norm": 0.32636210322380066, + "learning_rate": 8.130699419502898e-05, + "loss": 1.882, + "step": 9974 + }, + { + "epoch": 3.061694290976059, + "grad_norm": 0.27722054719924927, + "learning_rate": 8.130311844140748e-05, + "loss": 1.7788, + "step": 9975 + }, + { + "epoch": 3.0620012277470843, + "grad_norm": 0.289156436920166, + "learning_rate": 8.129924237843023e-05, + "loss": 1.8591, + "step": 9976 + }, + { + "epoch": 3.062308164518109, + "grad_norm": 0.2839665412902832, + "learning_rate": 8.12953660061355e-05, + "loss": 1.8255, + "step": 9977 + }, + { + "epoch": 3.0626151012891345, + "grad_norm": 0.2650148272514343, + "learning_rate": 8.129148932456161e-05, + "loss": 1.8353, + "step": 9978 + }, + { + "epoch": 3.06292203806016, + "grad_norm": 0.2884560227394104, + 
"learning_rate": 8.128761233374691e-05, + "loss": 1.8099, + "step": 9979 + }, + { + "epoch": 3.0632289748311847, + "grad_norm": 0.2610029876232147, + "learning_rate": 8.128373503372967e-05, + "loss": 1.8173, + "step": 9980 + }, + { + "epoch": 3.06353591160221, + "grad_norm": 0.32512393593788147, + "learning_rate": 8.127985742454822e-05, + "loss": 1.8619, + "step": 9981 + }, + { + "epoch": 3.063842848373235, + "grad_norm": 0.3382968604564667, + "learning_rate": 8.127597950624091e-05, + "loss": 1.831, + "step": 9982 + }, + { + "epoch": 3.0641497851442603, + "grad_norm": 0.33773133158683777, + "learning_rate": 8.127210127884602e-05, + "loss": 1.8194, + "step": 9983 + }, + { + "epoch": 3.0644567219152856, + "grad_norm": 0.31642746925354004, + "learning_rate": 8.126822274240188e-05, + "loss": 1.8782, + "step": 9984 + }, + { + "epoch": 3.0647636586863105, + "grad_norm": 0.2476506233215332, + "learning_rate": 8.126434389694686e-05, + "loss": 1.7866, + "step": 9985 + }, + { + "epoch": 3.065070595457336, + "grad_norm": 0.27296319603919983, + "learning_rate": 8.126046474251927e-05, + "loss": 1.8276, + "step": 9986 + }, + { + "epoch": 3.0653775322283607, + "grad_norm": 0.353865385055542, + "learning_rate": 8.125658527915744e-05, + "loss": 1.9525, + "step": 9987 + }, + { + "epoch": 3.065684468999386, + "grad_norm": 0.370256632566452, + "learning_rate": 8.12527055068997e-05, + "loss": 1.8514, + "step": 9988 + }, + { + "epoch": 3.0659914057704114, + "grad_norm": 0.30738842487335205, + "learning_rate": 8.124882542578442e-05, + "loss": 1.8125, + "step": 9989 + }, + { + "epoch": 3.0662983425414363, + "grad_norm": 0.3151233494281769, + "learning_rate": 8.124494503584995e-05, + "loss": 1.8165, + "step": 9990 + }, + { + "epoch": 3.0666052793124616, + "grad_norm": 0.29071590304374695, + "learning_rate": 8.124106433713458e-05, + "loss": 1.7617, + "step": 9991 + }, + { + "epoch": 3.066912216083487, + "grad_norm": 0.2898697853088379, + "learning_rate": 8.123718332967672e-05, + "loss": 
1.7779, + "step": 9992 + }, + { + "epoch": 3.067219152854512, + "grad_norm": 0.26601701974868774, + "learning_rate": 8.123330201351471e-05, + "loss": 1.8307, + "step": 9993 + }, + { + "epoch": 3.067526089625537, + "grad_norm": 0.2622119188308716, + "learning_rate": 8.12294203886869e-05, + "loss": 1.7958, + "step": 9994 + }, + { + "epoch": 3.0678330263965625, + "grad_norm": 0.29709386825561523, + "learning_rate": 8.122553845523166e-05, + "loss": 1.7799, + "step": 9995 + }, + { + "epoch": 3.0681399631675874, + "grad_norm": 0.31267789006233215, + "learning_rate": 8.122165621318733e-05, + "loss": 1.8149, + "step": 9996 + }, + { + "epoch": 3.0684468999386127, + "grad_norm": 0.3076523244380951, + "learning_rate": 8.121777366259232e-05, + "loss": 1.7701, + "step": 9997 + }, + { + "epoch": 3.0687538367096376, + "grad_norm": 0.30096009373664856, + "learning_rate": 8.121389080348496e-05, + "loss": 1.8323, + "step": 9998 + }, + { + "epoch": 3.069060773480663, + "grad_norm": 0.25739142298698425, + "learning_rate": 8.121000763590363e-05, + "loss": 1.8105, + "step": 9999 + }, + { + "epoch": 3.0693677102516883, + "grad_norm": 0.2780844271183014, + "learning_rate": 8.120612415988671e-05, + "loss": 1.8502, + "step": 10000 + }, + { + "epoch": 3.069674647022713, + "grad_norm": 0.3316378593444824, + "learning_rate": 8.120224037547259e-05, + "loss": 1.8244, + "step": 10001 + }, + { + "epoch": 3.0699815837937385, + "grad_norm": 0.261129766702652, + "learning_rate": 8.119835628269964e-05, + "loss": 1.7769, + "step": 10002 + }, + { + "epoch": 3.070288520564764, + "grad_norm": 0.29213985800743103, + "learning_rate": 8.119447188160625e-05, + "loss": 1.7717, + "step": 10003 + }, + { + "epoch": 3.0705954573357888, + "grad_norm": 0.38545623421669006, + "learning_rate": 8.11905871722308e-05, + "loss": 1.8433, + "step": 10004 + }, + { + "epoch": 3.070902394106814, + "grad_norm": 0.3617223799228668, + "learning_rate": 8.118670215461168e-05, + "loss": 1.8172, + "step": 10005 + }, + { + "epoch": 
3.071209330877839, + "grad_norm": 0.3241543769836426, + "learning_rate": 8.11828168287873e-05, + "loss": 1.8325, + "step": 10006 + }, + { + "epoch": 3.0715162676488643, + "grad_norm": 0.3538578152656555, + "learning_rate": 8.117893119479605e-05, + "loss": 1.8188, + "step": 10007 + }, + { + "epoch": 3.0718232044198897, + "grad_norm": 0.3861970603466034, + "learning_rate": 8.117504525267632e-05, + "loss": 1.8518, + "step": 10008 + }, + { + "epoch": 3.0721301411909145, + "grad_norm": 0.35433146357536316, + "learning_rate": 8.117115900246652e-05, + "loss": 1.8601, + "step": 10009 + }, + { + "epoch": 3.07243707796194, + "grad_norm": 0.29796987771987915, + "learning_rate": 8.116727244420507e-05, + "loss": 1.7934, + "step": 10010 + }, + { + "epoch": 3.072744014732965, + "grad_norm": 0.3091779947280884, + "learning_rate": 8.116338557793035e-05, + "loss": 1.8111, + "step": 10011 + }, + { + "epoch": 3.07305095150399, + "grad_norm": 0.2741319537162781, + "learning_rate": 8.11594984036808e-05, + "loss": 1.8079, + "step": 10012 + }, + { + "epoch": 3.0733578882750154, + "grad_norm": 0.28905320167541504, + "learning_rate": 8.115561092149482e-05, + "loss": 1.8475, + "step": 10013 + }, + { + "epoch": 3.0736648250460403, + "grad_norm": 0.2897081673145294, + "learning_rate": 8.115172313141081e-05, + "loss": 1.838, + "step": 10014 + }, + { + "epoch": 3.0739717618170657, + "grad_norm": 0.2620783746242523, + "learning_rate": 8.114783503346725e-05, + "loss": 1.8024, + "step": 10015 + }, + { + "epoch": 3.074278698588091, + "grad_norm": 0.26478636264801025, + "learning_rate": 8.11439466277025e-05, + "loss": 1.8137, + "step": 10016 + }, + { + "epoch": 3.074585635359116, + "grad_norm": 0.2796174883842468, + "learning_rate": 8.114005791415502e-05, + "loss": 1.7976, + "step": 10017 + }, + { + "epoch": 3.074892572130141, + "grad_norm": 0.26813286542892456, + "learning_rate": 8.113616889286325e-05, + "loss": 1.7945, + "step": 10018 + }, + { + "epoch": 3.0751995089011666, + "grad_norm": 
0.2443828582763672, + "learning_rate": 8.11322795638656e-05, + "loss": 1.7829, + "step": 10019 + }, + { + "epoch": 3.0755064456721914, + "grad_norm": 0.2981395423412323, + "learning_rate": 8.112838992720053e-05, + "loss": 1.7928, + "step": 10020 + }, + { + "epoch": 3.075813382443217, + "grad_norm": 0.25605037808418274, + "learning_rate": 8.112449998290644e-05, + "loss": 1.8129, + "step": 10021 + }, + { + "epoch": 3.0761203192142417, + "grad_norm": 0.31180307269096375, + "learning_rate": 8.112060973102181e-05, + "loss": 1.7393, + "step": 10022 + }, + { + "epoch": 3.076427255985267, + "grad_norm": 0.3230421543121338, + "learning_rate": 8.111671917158508e-05, + "loss": 1.818, + "step": 10023 + }, + { + "epoch": 3.0767341927562923, + "grad_norm": 0.3158549964427948, + "learning_rate": 8.111282830463468e-05, + "loss": 1.7582, + "step": 10024 + }, + { + "epoch": 3.0770411295273172, + "grad_norm": 0.24524325132369995, + "learning_rate": 8.110893713020908e-05, + "loss": 1.8215, + "step": 10025 + }, + { + "epoch": 3.0773480662983426, + "grad_norm": 0.2793932259082794, + "learning_rate": 8.110504564834675e-05, + "loss": 1.8551, + "step": 10026 + }, + { + "epoch": 3.077655003069368, + "grad_norm": 0.29629403352737427, + "learning_rate": 8.110115385908612e-05, + "loss": 1.8019, + "step": 10027 + }, + { + "epoch": 3.077961939840393, + "grad_norm": 0.3138490915298462, + "learning_rate": 8.109726176246564e-05, + "loss": 1.8436, + "step": 10028 + }, + { + "epoch": 3.078268876611418, + "grad_norm": 0.29802024364471436, + "learning_rate": 8.10933693585238e-05, + "loss": 1.8158, + "step": 10029 + }, + { + "epoch": 3.078575813382443, + "grad_norm": 0.30785220861434937, + "learning_rate": 8.108947664729907e-05, + "loss": 1.8674, + "step": 10030 + }, + { + "epoch": 3.0788827501534684, + "grad_norm": 0.277662992477417, + "learning_rate": 8.10855836288299e-05, + "loss": 1.8253, + "step": 10031 + }, + { + "epoch": 3.0791896869244937, + "grad_norm": 0.27399590611457825, + "learning_rate": 
8.108169030315477e-05, + "loss": 1.8587, + "step": 10032 + }, + { + "epoch": 3.0794966236955186, + "grad_norm": 0.28398239612579346, + "learning_rate": 8.107779667031217e-05, + "loss": 1.8326, + "step": 10033 + }, + { + "epoch": 3.079803560466544, + "grad_norm": 0.2882741093635559, + "learning_rate": 8.107390273034057e-05, + "loss": 1.785, + "step": 10034 + }, + { + "epoch": 3.0801104972375692, + "grad_norm": 0.271043598651886, + "learning_rate": 8.107000848327843e-05, + "loss": 1.765, + "step": 10035 + }, + { + "epoch": 3.080417434008594, + "grad_norm": 0.2589638829231262, + "learning_rate": 8.106611392916427e-05, + "loss": 1.8136, + "step": 10036 + }, + { + "epoch": 3.0807243707796195, + "grad_norm": 0.3068227469921112, + "learning_rate": 8.106221906803656e-05, + "loss": 1.8034, + "step": 10037 + }, + { + "epoch": 3.0810313075506444, + "grad_norm": 0.2714168131351471, + "learning_rate": 8.105832389993379e-05, + "loss": 1.8007, + "step": 10038 + }, + { + "epoch": 3.0813382443216697, + "grad_norm": 0.2747504711151123, + "learning_rate": 8.105442842489447e-05, + "loss": 1.8135, + "step": 10039 + }, + { + "epoch": 3.081645181092695, + "grad_norm": 0.2719285488128662, + "learning_rate": 8.105053264295708e-05, + "loss": 1.7629, + "step": 10040 + }, + { + "epoch": 3.08195211786372, + "grad_norm": 0.3119582235813141, + "learning_rate": 8.104663655416014e-05, + "loss": 1.7887, + "step": 10041 + }, + { + "epoch": 3.0822590546347453, + "grad_norm": 0.35965192317962646, + "learning_rate": 8.104274015854212e-05, + "loss": 1.8484, + "step": 10042 + }, + { + "epoch": 3.0825659914057706, + "grad_norm": 0.3045980632305145, + "learning_rate": 8.103884345614157e-05, + "loss": 1.8625, + "step": 10043 + }, + { + "epoch": 3.0828729281767955, + "grad_norm": 0.2925138473510742, + "learning_rate": 8.103494644699696e-05, + "loss": 1.9306, + "step": 10044 + }, + { + "epoch": 3.083179864947821, + "grad_norm": 0.2894277274608612, + "learning_rate": 8.103104913114681e-05, + "loss": 1.7796, + 
"step": 10045 + }, + { + "epoch": 3.0834868017188457, + "grad_norm": 0.2776826322078705, + "learning_rate": 8.102715150862967e-05, + "loss": 1.8169, + "step": 10046 + }, + { + "epoch": 3.083793738489871, + "grad_norm": 0.3315230906009674, + "learning_rate": 8.102325357948402e-05, + "loss": 1.8139, + "step": 10047 + }, + { + "epoch": 3.0841006752608964, + "grad_norm": 0.2906761169433594, + "learning_rate": 8.10193553437484e-05, + "loss": 1.8162, + "step": 10048 + }, + { + "epoch": 3.0844076120319213, + "grad_norm": 0.32681339979171753, + "learning_rate": 8.101545680146132e-05, + "loss": 1.8245, + "step": 10049 + }, + { + "epoch": 3.0847145488029466, + "grad_norm": 0.32525795698165894, + "learning_rate": 8.101155795266131e-05, + "loss": 1.8605, + "step": 10050 + }, + { + "epoch": 3.085021485573972, + "grad_norm": 0.31705379486083984, + "learning_rate": 8.100765879738692e-05, + "loss": 1.8214, + "step": 10051 + }, + { + "epoch": 3.085328422344997, + "grad_norm": 0.27772918343544006, + "learning_rate": 8.100375933567668e-05, + "loss": 1.7822, + "step": 10052 + }, + { + "epoch": 3.085635359116022, + "grad_norm": 0.2877809405326843, + "learning_rate": 8.09998595675691e-05, + "loss": 1.7935, + "step": 10053 + }, + { + "epoch": 3.0859422958870475, + "grad_norm": 0.29759806394577026, + "learning_rate": 8.099595949310276e-05, + "loss": 1.8041, + "step": 10054 + }, + { + "epoch": 3.0862492326580724, + "grad_norm": 0.2715320289134979, + "learning_rate": 8.099205911231617e-05, + "loss": 1.7923, + "step": 10055 + }, + { + "epoch": 3.0865561694290977, + "grad_norm": 0.33566340804100037, + "learning_rate": 8.098815842524789e-05, + "loss": 1.7953, + "step": 10056 + }, + { + "epoch": 3.0868631062001226, + "grad_norm": 0.3360871970653534, + "learning_rate": 8.098425743193645e-05, + "loss": 1.8275, + "step": 10057 + }, + { + "epoch": 3.087170042971148, + "grad_norm": 0.2797739803791046, + "learning_rate": 8.098035613242043e-05, + "loss": 1.7597, + "step": 10058 + }, + { + "epoch": 
3.0874769797421733, + "grad_norm": 0.25500187277793884, + "learning_rate": 8.097645452673837e-05, + "loss": 1.8059, + "step": 10059 + }, + { + "epoch": 3.087783916513198, + "grad_norm": 0.28042587637901306, + "learning_rate": 8.097255261492884e-05, + "loss": 1.7954, + "step": 10060 + }, + { + "epoch": 3.0880908532842235, + "grad_norm": 0.3616262376308441, + "learning_rate": 8.096865039703038e-05, + "loss": 1.8605, + "step": 10061 + }, + { + "epoch": 3.0883977900552484, + "grad_norm": 0.3453714847564697, + "learning_rate": 8.096474787308157e-05, + "loss": 1.7643, + "step": 10062 + }, + { + "epoch": 3.0887047268262737, + "grad_norm": 0.3192278742790222, + "learning_rate": 8.096084504312098e-05, + "loss": 1.8415, + "step": 10063 + }, + { + "epoch": 3.089011663597299, + "grad_norm": 0.2714482545852661, + "learning_rate": 8.095694190718715e-05, + "loss": 1.8204, + "step": 10064 + }, + { + "epoch": 3.089318600368324, + "grad_norm": 0.26562005281448364, + "learning_rate": 8.09530384653187e-05, + "loss": 1.7322, + "step": 10065 + }, + { + "epoch": 3.0896255371393493, + "grad_norm": 0.33727800846099854, + "learning_rate": 8.094913471755417e-05, + "loss": 1.8221, + "step": 10066 + }, + { + "epoch": 3.0899324739103746, + "grad_norm": 0.3561044931411743, + "learning_rate": 8.094523066393215e-05, + "loss": 1.8879, + "step": 10067 + }, + { + "epoch": 3.0902394106813995, + "grad_norm": 0.2568742334842682, + "learning_rate": 8.094132630449122e-05, + "loss": 1.8178, + "step": 10068 + }, + { + "epoch": 3.090546347452425, + "grad_norm": 0.4025525450706482, + "learning_rate": 8.093742163926998e-05, + "loss": 1.8186, + "step": 10069 + }, + { + "epoch": 3.09085328422345, + "grad_norm": 0.43863433599472046, + "learning_rate": 8.0933516668307e-05, + "loss": 1.8371, + "step": 10070 + }, + { + "epoch": 3.091160220994475, + "grad_norm": 0.34873950481414795, + "learning_rate": 8.092961139164087e-05, + "loss": 1.8083, + "step": 10071 + }, + { + "epoch": 3.0914671577655004, + "grad_norm": 
0.31433534622192383, + "learning_rate": 8.092570580931021e-05, + "loss": 1.8154, + "step": 10072 + }, + { + "epoch": 3.0917740945365253, + "grad_norm": 0.25523966550827026, + "learning_rate": 8.092179992135358e-05, + "loss": 1.8158, + "step": 10073 + }, + { + "epoch": 3.0920810313075506, + "grad_norm": 0.348469078540802, + "learning_rate": 8.09178937278096e-05, + "loss": 1.8358, + "step": 10074 + }, + { + "epoch": 3.092387968078576, + "grad_norm": 0.33455297350883484, + "learning_rate": 8.091398722871688e-05, + "loss": 1.7779, + "step": 10075 + }, + { + "epoch": 3.092694904849601, + "grad_norm": 0.36544880270957947, + "learning_rate": 8.091008042411403e-05, + "loss": 1.9186, + "step": 10076 + }, + { + "epoch": 3.093001841620626, + "grad_norm": 0.29165831208229065, + "learning_rate": 8.090617331403965e-05, + "loss": 1.8964, + "step": 10077 + }, + { + "epoch": 3.0933087783916515, + "grad_norm": 0.31011059880256653, + "learning_rate": 8.090226589853234e-05, + "loss": 1.8453, + "step": 10078 + }, + { + "epoch": 3.0936157151626764, + "grad_norm": 0.2835703492164612, + "learning_rate": 8.089835817763071e-05, + "loss": 1.7718, + "step": 10079 + }, + { + "epoch": 3.0939226519337018, + "grad_norm": 0.2910583019256592, + "learning_rate": 8.08944501513734e-05, + "loss": 1.7881, + "step": 10080 + }, + { + "epoch": 3.0942295887047266, + "grad_norm": 0.391303688287735, + "learning_rate": 8.089054181979905e-05, + "loss": 1.7915, + "step": 10081 + }, + { + "epoch": 3.094536525475752, + "grad_norm": 0.4119330048561096, + "learning_rate": 8.088663318294623e-05, + "loss": 1.7975, + "step": 10082 + }, + { + "epoch": 3.0948434622467773, + "grad_norm": 0.2980102002620697, + "learning_rate": 8.088272424085361e-05, + "loss": 1.805, + "step": 10083 + }, + { + "epoch": 3.095150399017802, + "grad_norm": 0.3089980483055115, + "learning_rate": 8.087881499355983e-05, + "loss": 1.8265, + "step": 10084 + }, + { + "epoch": 3.0954573357888275, + "grad_norm": 0.3851003348827362, + "learning_rate": 
8.087490544110348e-05, + "loss": 1.8174, + "step": 10085 + }, + { + "epoch": 3.095764272559853, + "grad_norm": 0.42357420921325684, + "learning_rate": 8.08709955835232e-05, + "loss": 1.8083, + "step": 10086 + }, + { + "epoch": 3.0960712093308778, + "grad_norm": 0.291777640581131, + "learning_rate": 8.086708542085768e-05, + "loss": 1.7713, + "step": 10087 + }, + { + "epoch": 3.096378146101903, + "grad_norm": 0.2563805878162384, + "learning_rate": 8.086317495314552e-05, + "loss": 1.7691, + "step": 10088 + }, + { + "epoch": 3.096685082872928, + "grad_norm": 0.3418877422809601, + "learning_rate": 8.085926418042536e-05, + "loss": 1.8547, + "step": 10089 + }, + { + "epoch": 3.0969920196439533, + "grad_norm": 0.3859385550022125, + "learning_rate": 8.085535310273589e-05, + "loss": 1.8226, + "step": 10090 + }, + { + "epoch": 3.0972989564149787, + "grad_norm": 0.3427267372608185, + "learning_rate": 8.085144172011571e-05, + "loss": 1.837, + "step": 10091 + }, + { + "epoch": 3.0976058931860035, + "grad_norm": 0.29290953278541565, + "learning_rate": 8.084753003260352e-05, + "loss": 1.8392, + "step": 10092 + }, + { + "epoch": 3.097912829957029, + "grad_norm": 0.33282020688056946, + "learning_rate": 8.084361804023795e-05, + "loss": 1.8351, + "step": 10093 + }, + { + "epoch": 3.098219766728054, + "grad_norm": 0.3802134394645691, + "learning_rate": 8.083970574305768e-05, + "loss": 1.7467, + "step": 10094 + }, + { + "epoch": 3.098526703499079, + "grad_norm": 0.3142111897468567, + "learning_rate": 8.083579314110135e-05, + "loss": 1.7966, + "step": 10095 + }, + { + "epoch": 3.0988336402701044, + "grad_norm": 0.2956278324127197, + "learning_rate": 8.083188023440765e-05, + "loss": 1.8724, + "step": 10096 + }, + { + "epoch": 3.0991405770411293, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.082796702301522e-05, + "loss": 1.8448, + "step": 10097 + }, + { + "epoch": 3.0994475138121547, + "grad_norm": 0.29358017444610596, + "learning_rate": 8.082405350696276e-05, + "loss": 1.8679, 
+ "step": 10098 + }, + { + "epoch": 3.09975445058318, + "grad_norm": 0.36439722776412964, + "learning_rate": 8.082013968628893e-05, + "loss": 1.8801, + "step": 10099 + }, + { + "epoch": 3.100061387354205, + "grad_norm": 0.3565322458744049, + "learning_rate": 8.081622556103244e-05, + "loss": 1.794, + "step": 10100 + }, + { + "epoch": 3.1003683241252302, + "grad_norm": 0.2841760814189911, + "learning_rate": 8.081231113123191e-05, + "loss": 1.7593, + "step": 10101 + }, + { + "epoch": 3.1006752608962556, + "grad_norm": 0.28589630126953125, + "learning_rate": 8.080839639692608e-05, + "loss": 1.864, + "step": 10102 + }, + { + "epoch": 3.1009821976672804, + "grad_norm": 0.3595057427883148, + "learning_rate": 8.080448135815362e-05, + "loss": 1.8067, + "step": 10103 + }, + { + "epoch": 3.101289134438306, + "grad_norm": 0.3909708261489868, + "learning_rate": 8.080056601495322e-05, + "loss": 1.8601, + "step": 10104 + }, + { + "epoch": 3.1015960712093307, + "grad_norm": 0.35180148482322693, + "learning_rate": 8.079665036736358e-05, + "loss": 1.8328, + "step": 10105 + }, + { + "epoch": 3.101903007980356, + "grad_norm": 0.3065175712108612, + "learning_rate": 8.079273441542338e-05, + "loss": 1.8449, + "step": 10106 + }, + { + "epoch": 3.1022099447513813, + "grad_norm": 0.31358617544174194, + "learning_rate": 8.078881815917134e-05, + "loss": 1.8325, + "step": 10107 + }, + { + "epoch": 3.1025168815224062, + "grad_norm": 0.4737118184566498, + "learning_rate": 8.078490159864614e-05, + "loss": 1.8232, + "step": 10108 + }, + { + "epoch": 3.1028238182934316, + "grad_norm": 0.435148686170578, + "learning_rate": 8.078098473388651e-05, + "loss": 1.8227, + "step": 10109 + }, + { + "epoch": 3.103130755064457, + "grad_norm": 0.3080987334251404, + "learning_rate": 8.077706756493115e-05, + "loss": 1.8072, + "step": 10110 + }, + { + "epoch": 3.103437691835482, + "grad_norm": 0.3225170075893402, + "learning_rate": 8.077315009181876e-05, + "loss": 1.7716, + "step": 10111 + }, + { + "epoch": 
3.103744628606507, + "grad_norm": 0.46642443537712097, + "learning_rate": 8.076923231458808e-05, + "loss": 1.8295, + "step": 10112 + }, + { + "epoch": 3.104051565377532, + "grad_norm": 0.42561766505241394, + "learning_rate": 8.07653142332778e-05, + "loss": 1.8553, + "step": 10113 + }, + { + "epoch": 3.1043585021485574, + "grad_norm": 0.27187541127204895, + "learning_rate": 8.076139584792664e-05, + "loss": 1.7937, + "step": 10114 + }, + { + "epoch": 3.1046654389195827, + "grad_norm": 0.27822238206863403, + "learning_rate": 8.075747715857335e-05, + "loss": 1.8151, + "step": 10115 + }, + { + "epoch": 3.1049723756906076, + "grad_norm": 0.40106478333473206, + "learning_rate": 8.075355816525665e-05, + "loss": 1.8637, + "step": 10116 + }, + { + "epoch": 3.105279312461633, + "grad_norm": 0.33455124497413635, + "learning_rate": 8.074963886801525e-05, + "loss": 1.8543, + "step": 10117 + }, + { + "epoch": 3.1055862492326582, + "grad_norm": 0.32246437668800354, + "learning_rate": 8.07457192668879e-05, + "loss": 1.7907, + "step": 10118 + }, + { + "epoch": 3.105893186003683, + "grad_norm": 0.45360109210014343, + "learning_rate": 8.074179936191332e-05, + "loss": 1.7404, + "step": 10119 + }, + { + "epoch": 3.1062001227747085, + "grad_norm": 0.445916086435318, + "learning_rate": 8.07378791531303e-05, + "loss": 1.778, + "step": 10120 + }, + { + "epoch": 3.1065070595457334, + "grad_norm": 0.28561538457870483, + "learning_rate": 8.073395864057751e-05, + "loss": 1.8723, + "step": 10121 + }, + { + "epoch": 3.1068139963167587, + "grad_norm": 0.3258218467235565, + "learning_rate": 8.073003782429373e-05, + "loss": 1.8106, + "step": 10122 + }, + { + "epoch": 3.107120933087784, + "grad_norm": 0.5459560751914978, + "learning_rate": 8.07261167043177e-05, + "loss": 1.8022, + "step": 10123 + }, + { + "epoch": 3.107427869858809, + "grad_norm": 0.4828549921512604, + "learning_rate": 8.072219528068819e-05, + "loss": 1.7556, + "step": 10124 + }, + { + "epoch": 3.1077348066298343, + "grad_norm": 
0.24075324833393097, + "learning_rate": 8.071827355344393e-05, + "loss": 1.7901, + "step": 10125 + }, + { + "epoch": 3.1080417434008596, + "grad_norm": 0.44677188992500305, + "learning_rate": 8.071435152262367e-05, + "loss": 1.7858, + "step": 10126 + }, + { + "epoch": 3.1083486801718845, + "grad_norm": 0.49862590432167053, + "learning_rate": 8.071042918826622e-05, + "loss": 1.805, + "step": 10127 + }, + { + "epoch": 3.10865561694291, + "grad_norm": 0.30883491039276123, + "learning_rate": 8.07065065504103e-05, + "loss": 1.7693, + "step": 10128 + }, + { + "epoch": 3.108962553713935, + "grad_norm": 0.29583030939102173, + "learning_rate": 8.070258360909467e-05, + "loss": 1.8141, + "step": 10129 + }, + { + "epoch": 3.10926949048496, + "grad_norm": 0.3595346510410309, + "learning_rate": 8.069866036435812e-05, + "loss": 1.8286, + "step": 10130 + }, + { + "epoch": 3.1095764272559854, + "grad_norm": 0.3215504288673401, + "learning_rate": 8.069473681623942e-05, + "loss": 1.8557, + "step": 10131 + }, + { + "epoch": 3.1098833640270103, + "grad_norm": 0.29734939336776733, + "learning_rate": 8.069081296477734e-05, + "loss": 1.7996, + "step": 10132 + }, + { + "epoch": 3.1101903007980356, + "grad_norm": 0.33546003699302673, + "learning_rate": 8.068688881001065e-05, + "loss": 1.8307, + "step": 10133 + }, + { + "epoch": 3.110497237569061, + "grad_norm": 0.3886832296848297, + "learning_rate": 8.068296435197814e-05, + "loss": 1.751, + "step": 10134 + }, + { + "epoch": 3.110804174340086, + "grad_norm": 0.34505394101142883, + "learning_rate": 8.06790395907186e-05, + "loss": 1.7543, + "step": 10135 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.27018141746520996, + "learning_rate": 8.06751145262708e-05, + "loss": 1.8109, + "step": 10136 + }, + { + "epoch": 3.1114180478821365, + "grad_norm": 0.3367149531841278, + "learning_rate": 8.067118915867355e-05, + "loss": 1.8025, + "step": 10137 + }, + { + "epoch": 3.1117249846531614, + "grad_norm": 0.40811091661453247, + "learning_rate": 
8.066726348796562e-05, + "loss": 1.7327, + "step": 10138 + }, + { + "epoch": 3.1120319214241867, + "grad_norm": 0.3511471152305603, + "learning_rate": 8.066333751418583e-05, + "loss": 1.8711, + "step": 10139 + }, + { + "epoch": 3.1123388581952116, + "grad_norm": 0.3112446367740631, + "learning_rate": 8.065941123737295e-05, + "loss": 1.8621, + "step": 10140 + }, + { + "epoch": 3.112645794966237, + "grad_norm": 0.3424238860607147, + "learning_rate": 8.065548465756581e-05, + "loss": 1.8383, + "step": 10141 + }, + { + "epoch": 3.1129527317372623, + "grad_norm": 0.380013108253479, + "learning_rate": 8.06515577748032e-05, + "loss": 1.8121, + "step": 10142 + }, + { + "epoch": 3.113259668508287, + "grad_norm": 0.2650558650493622, + "learning_rate": 8.064763058912393e-05, + "loss": 1.866, + "step": 10143 + }, + { + "epoch": 3.1135666052793125, + "grad_norm": 0.30580762028694153, + "learning_rate": 8.06437031005668e-05, + "loss": 1.7769, + "step": 10144 + }, + { + "epoch": 3.113873542050338, + "grad_norm": 0.29927194118499756, + "learning_rate": 8.063977530917066e-05, + "loss": 1.7897, + "step": 10145 + }, + { + "epoch": 3.1141804788213627, + "grad_norm": 0.24322012066841125, + "learning_rate": 8.063584721497429e-05, + "loss": 1.7968, + "step": 10146 + }, + { + "epoch": 3.114487415592388, + "grad_norm": 0.3082945644855499, + "learning_rate": 8.063191881801651e-05, + "loss": 1.8456, + "step": 10147 + }, + { + "epoch": 3.114794352363413, + "grad_norm": 0.3247329890727997, + "learning_rate": 8.062799011833617e-05, + "loss": 1.7436, + "step": 10148 + }, + { + "epoch": 3.1151012891344383, + "grad_norm": 0.27591946721076965, + "learning_rate": 8.062406111597207e-05, + "loss": 1.7976, + "step": 10149 + }, + { + "epoch": 3.1154082259054636, + "grad_norm": 0.2752058804035187, + "learning_rate": 8.062013181096306e-05, + "loss": 1.7814, + "step": 10150 + }, + { + "epoch": 3.1157151626764885, + "grad_norm": 0.3207196891307831, + "learning_rate": 8.061620220334795e-05, + "loss": 1.7767, 
+ "step": 10151 + }, + { + "epoch": 3.116022099447514, + "grad_norm": 0.2895309627056122, + "learning_rate": 8.061227229316559e-05, + "loss": 1.8588, + "step": 10152 + }, + { + "epoch": 3.116329036218539, + "grad_norm": 0.333843469619751, + "learning_rate": 8.060834208045481e-05, + "loss": 1.7871, + "step": 10153 + }, + { + "epoch": 3.116635972989564, + "grad_norm": 0.43877774477005005, + "learning_rate": 8.060441156525445e-05, + "loss": 1.8165, + "step": 10154 + }, + { + "epoch": 3.1169429097605894, + "grad_norm": 0.35700589418411255, + "learning_rate": 8.060048074760337e-05, + "loss": 1.777, + "step": 10155 + }, + { + "epoch": 3.1172498465316143, + "grad_norm": 0.26124534010887146, + "learning_rate": 8.059654962754039e-05, + "loss": 1.8343, + "step": 10156 + }, + { + "epoch": 3.1175567833026396, + "grad_norm": 0.331444650888443, + "learning_rate": 8.059261820510438e-05, + "loss": 1.9437, + "step": 10157 + }, + { + "epoch": 3.117863720073665, + "grad_norm": 0.31657731533050537, + "learning_rate": 8.058868648033419e-05, + "loss": 1.7621, + "step": 10158 + }, + { + "epoch": 3.11817065684469, + "grad_norm": 0.2785957455635071, + "learning_rate": 8.058475445326867e-05, + "loss": 1.9049, + "step": 10159 + }, + { + "epoch": 3.118477593615715, + "grad_norm": 0.2605743408203125, + "learning_rate": 8.058082212394667e-05, + "loss": 1.7895, + "step": 10160 + }, + { + "epoch": 3.1187845303867405, + "grad_norm": 0.2981378138065338, + "learning_rate": 8.057688949240707e-05, + "loss": 1.8373, + "step": 10161 + }, + { + "epoch": 3.1190914671577654, + "grad_norm": 0.2944273054599762, + "learning_rate": 8.057295655868873e-05, + "loss": 1.8373, + "step": 10162 + }, + { + "epoch": 3.1193984039287908, + "grad_norm": 0.2696721851825714, + "learning_rate": 8.056902332283052e-05, + "loss": 1.8023, + "step": 10163 + }, + { + "epoch": 3.1197053406998156, + "grad_norm": 0.27659857273101807, + "learning_rate": 8.056508978487128e-05, + "loss": 1.8453, + "step": 10164 + }, + { + "epoch": 
3.120012277470841, + "grad_norm": 0.2982441186904907, + "learning_rate": 8.056115594484992e-05, + "loss": 1.9072, + "step": 10165 + }, + { + "epoch": 3.1203192142418663, + "grad_norm": 0.3136404752731323, + "learning_rate": 8.055722180280531e-05, + "loss": 1.8585, + "step": 10166 + }, + { + "epoch": 3.120626151012891, + "grad_norm": 0.2979940176010132, + "learning_rate": 8.055328735877631e-05, + "loss": 1.8699, + "step": 10167 + }, + { + "epoch": 3.1209330877839165, + "grad_norm": 0.2585618793964386, + "learning_rate": 8.054935261280184e-05, + "loss": 1.8323, + "step": 10168 + }, + { + "epoch": 3.121240024554942, + "grad_norm": 0.28734859824180603, + "learning_rate": 8.054541756492075e-05, + "loss": 1.8694, + "step": 10169 + }, + { + "epoch": 3.1215469613259668, + "grad_norm": 0.30582788586616516, + "learning_rate": 8.054148221517193e-05, + "loss": 1.856, + "step": 10170 + }, + { + "epoch": 3.121853898096992, + "grad_norm": 0.3128255009651184, + "learning_rate": 8.053754656359429e-05, + "loss": 1.8329, + "step": 10171 + }, + { + "epoch": 3.122160834868017, + "grad_norm": 0.2845318615436554, + "learning_rate": 8.053361061022671e-05, + "loss": 1.8111, + "step": 10172 + }, + { + "epoch": 3.1224677716390423, + "grad_norm": 0.2994609773159027, + "learning_rate": 8.05296743551081e-05, + "loss": 1.8157, + "step": 10173 + }, + { + "epoch": 3.1227747084100677, + "grad_norm": 0.26397961378097534, + "learning_rate": 8.052573779827737e-05, + "loss": 1.8572, + "step": 10174 + }, + { + "epoch": 3.1230816451810925, + "grad_norm": 0.2911500334739685, + "learning_rate": 8.052180093977339e-05, + "loss": 1.8312, + "step": 10175 + }, + { + "epoch": 3.123388581952118, + "grad_norm": 0.33455008268356323, + "learning_rate": 8.051786377963509e-05, + "loss": 1.8748, + "step": 10176 + }, + { + "epoch": 3.123695518723143, + "grad_norm": 0.3127586841583252, + "learning_rate": 8.051392631790135e-05, + "loss": 1.8224, + "step": 10177 + }, + { + "epoch": 3.124002455494168, + "grad_norm": 
0.2910686433315277, + "learning_rate": 8.050998855461113e-05, + "loss": 1.8557, + "step": 10178 + }, + { + "epoch": 3.1243093922651934, + "grad_norm": 0.2849208414554596, + "learning_rate": 8.050605048980333e-05, + "loss": 1.82, + "step": 10179 + }, + { + "epoch": 3.1246163290362183, + "grad_norm": 0.35189691185951233, + "learning_rate": 8.050211212351683e-05, + "loss": 1.7884, + "step": 10180 + }, + { + "epoch": 3.1249232658072437, + "grad_norm": 0.3641110360622406, + "learning_rate": 8.04981734557906e-05, + "loss": 1.7984, + "step": 10181 + }, + { + "epoch": 3.125230202578269, + "grad_norm": 0.3111717700958252, + "learning_rate": 8.049423448666353e-05, + "loss": 1.8134, + "step": 10182 + }, + { + "epoch": 3.125537139349294, + "grad_norm": 0.2608453631401062, + "learning_rate": 8.049029521617457e-05, + "loss": 1.765, + "step": 10183 + }, + { + "epoch": 3.1258440761203192, + "grad_norm": 0.28779423236846924, + "learning_rate": 8.048635564436265e-05, + "loss": 1.8355, + "step": 10184 + }, + { + "epoch": 3.1261510128913446, + "grad_norm": 0.38227665424346924, + "learning_rate": 8.048241577126668e-05, + "loss": 1.8487, + "step": 10185 + }, + { + "epoch": 3.1264579496623695, + "grad_norm": 0.3603171706199646, + "learning_rate": 8.047847559692562e-05, + "loss": 1.8035, + "step": 10186 + }, + { + "epoch": 3.126764886433395, + "grad_norm": 0.21950066089630127, + "learning_rate": 8.04745351213784e-05, + "loss": 1.7399, + "step": 10187 + }, + { + "epoch": 3.12707182320442, + "grad_norm": 0.2796075642108917, + "learning_rate": 8.047059434466395e-05, + "loss": 1.8229, + "step": 10188 + }, + { + "epoch": 3.127378759975445, + "grad_norm": 0.3382907807826996, + "learning_rate": 8.046665326682125e-05, + "loss": 1.7713, + "step": 10189 + }, + { + "epoch": 3.1276856967464703, + "grad_norm": 0.36472463607788086, + "learning_rate": 8.04627118878892e-05, + "loss": 1.8129, + "step": 10190 + }, + { + "epoch": 3.1279926335174952, + "grad_norm": 0.2971884310245514, + "learning_rate": 
8.045877020790679e-05, + "loss": 1.7894, + "step": 10191 + }, + { + "epoch": 3.1282995702885206, + "grad_norm": 0.2292303442955017, + "learning_rate": 8.045482822691297e-05, + "loss": 1.7637, + "step": 10192 + }, + { + "epoch": 3.128606507059546, + "grad_norm": 0.300750732421875, + "learning_rate": 8.045088594494668e-05, + "loss": 1.7678, + "step": 10193 + }, + { + "epoch": 3.128913443830571, + "grad_norm": 0.3121531009674072, + "learning_rate": 8.044694336204688e-05, + "loss": 1.8651, + "step": 10194 + }, + { + "epoch": 3.129220380601596, + "grad_norm": 0.2456093430519104, + "learning_rate": 8.044300047825254e-05, + "loss": 1.7769, + "step": 10195 + }, + { + "epoch": 3.129527317372621, + "grad_norm": 0.25085800886154175, + "learning_rate": 8.043905729360264e-05, + "loss": 1.7723, + "step": 10196 + }, + { + "epoch": 3.1298342541436464, + "grad_norm": 0.2505287826061249, + "learning_rate": 8.043511380813612e-05, + "loss": 1.7943, + "step": 10197 + }, + { + "epoch": 3.1301411909146717, + "grad_norm": 0.27144530415534973, + "learning_rate": 8.043117002189198e-05, + "loss": 1.8119, + "step": 10198 + }, + { + "epoch": 3.1304481276856966, + "grad_norm": 0.2702989876270294, + "learning_rate": 8.042722593490916e-05, + "loss": 1.8517, + "step": 10199 + }, + { + "epoch": 3.130755064456722, + "grad_norm": 0.2585136890411377, + "learning_rate": 8.042328154722667e-05, + "loss": 1.8382, + "step": 10200 + }, + { + "epoch": 3.1310620012277472, + "grad_norm": 0.26306065917015076, + "learning_rate": 8.041933685888348e-05, + "loss": 1.8211, + "step": 10201 + }, + { + "epoch": 3.131368937998772, + "grad_norm": 0.2208927720785141, + "learning_rate": 8.041539186991858e-05, + "loss": 1.7765, + "step": 10202 + }, + { + "epoch": 3.1316758747697975, + "grad_norm": 0.2756440043449402, + "learning_rate": 8.041144658037095e-05, + "loss": 1.898, + "step": 10203 + }, + { + "epoch": 3.131982811540823, + "grad_norm": 0.29718101024627686, + "learning_rate": 8.040750099027958e-05, + "loss": 1.8226, 
+ "step": 10204 + }, + { + "epoch": 3.1322897483118477, + "grad_norm": 0.3166738748550415, + "learning_rate": 8.040355509968345e-05, + "loss": 1.8129, + "step": 10205 + }, + { + "epoch": 3.132596685082873, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.039960890862158e-05, + "loss": 1.8915, + "step": 10206 + }, + { + "epoch": 3.132903621853898, + "grad_norm": 0.3015006184577942, + "learning_rate": 8.039566241713297e-05, + "loss": 1.8389, + "step": 10207 + }, + { + "epoch": 3.1332105586249233, + "grad_norm": 0.35226619243621826, + "learning_rate": 8.039171562525659e-05, + "loss": 1.7287, + "step": 10208 + }, + { + "epoch": 3.1335174953959486, + "grad_norm": 0.4290136694908142, + "learning_rate": 8.038776853303146e-05, + "loss": 1.8768, + "step": 10209 + }, + { + "epoch": 3.1338244321669735, + "grad_norm": 0.2828960418701172, + "learning_rate": 8.03838211404966e-05, + "loss": 1.7552, + "step": 10210 + }, + { + "epoch": 3.134131368937999, + "grad_norm": 0.3781953752040863, + "learning_rate": 8.0379873447691e-05, + "loss": 1.7812, + "step": 10211 + }, + { + "epoch": 3.1344383057090237, + "grad_norm": 0.4282926023006439, + "learning_rate": 8.037592545465371e-05, + "loss": 1.84, + "step": 10212 + }, + { + "epoch": 3.134745242480049, + "grad_norm": 0.2622411251068115, + "learning_rate": 8.03719771614237e-05, + "loss": 1.8114, + "step": 10213 + }, + { + "epoch": 3.1350521792510744, + "grad_norm": 0.34881457686424255, + "learning_rate": 8.036802856804001e-05, + "loss": 1.7694, + "step": 10214 + }, + { + "epoch": 3.1353591160220993, + "grad_norm": 0.40797632932662964, + "learning_rate": 8.036407967454167e-05, + "loss": 1.7595, + "step": 10215 + }, + { + "epoch": 3.1356660527931246, + "grad_norm": 0.24902814626693726, + "learning_rate": 8.036013048096769e-05, + "loss": 1.8068, + "step": 10216 + }, + { + "epoch": 3.13597298956415, + "grad_norm": 0.3682909607887268, + "learning_rate": 8.035618098735711e-05, + "loss": 1.8519, + "step": 10217 + }, + { + "epoch": 
3.136279926335175, + "grad_norm": 0.6111233234405518, + "learning_rate": 8.035223119374895e-05, + "loss": 1.9254, + "step": 10218 + }, + { + "epoch": 3.1365868631062, + "grad_norm": 0.4793062210083008, + "learning_rate": 8.034828110018227e-05, + "loss": 1.786, + "step": 10219 + }, + { + "epoch": 3.1368937998772255, + "grad_norm": 0.3074932396411896, + "learning_rate": 8.034433070669607e-05, + "loss": 1.8495, + "step": 10220 + }, + { + "epoch": 3.1372007366482504, + "grad_norm": 0.4366479218006134, + "learning_rate": 8.034038001332942e-05, + "loss": 1.8501, + "step": 10221 + }, + { + "epoch": 3.1375076734192757, + "grad_norm": 0.4660070538520813, + "learning_rate": 8.033642902012135e-05, + "loss": 1.8317, + "step": 10222 + }, + { + "epoch": 3.1378146101903006, + "grad_norm": 0.3452899158000946, + "learning_rate": 8.03324777271109e-05, + "loss": 1.8702, + "step": 10223 + }, + { + "epoch": 3.138121546961326, + "grad_norm": 0.3658824563026428, + "learning_rate": 8.032852613433713e-05, + "loss": 1.8754, + "step": 10224 + }, + { + "epoch": 3.1384284837323513, + "grad_norm": 0.3777768909931183, + "learning_rate": 8.03245742418391e-05, + "loss": 1.8613, + "step": 10225 + }, + { + "epoch": 3.138735420503376, + "grad_norm": 0.3873192071914673, + "learning_rate": 8.032062204965582e-05, + "loss": 1.8438, + "step": 10226 + }, + { + "epoch": 3.1390423572744015, + "grad_norm": 0.30686715245246887, + "learning_rate": 8.031666955782641e-05, + "loss": 1.811, + "step": 10227 + }, + { + "epoch": 3.139349294045427, + "grad_norm": 0.2738516330718994, + "learning_rate": 8.03127167663899e-05, + "loss": 1.757, + "step": 10228 + }, + { + "epoch": 3.1396562308164517, + "grad_norm": 0.3093133270740509, + "learning_rate": 8.030876367538536e-05, + "loss": 1.8181, + "step": 10229 + }, + { + "epoch": 3.139963167587477, + "grad_norm": 0.3247159719467163, + "learning_rate": 8.030481028485185e-05, + "loss": 1.7798, + "step": 10230 + }, + { + "epoch": 3.140270104358502, + "grad_norm": 
0.2855088412761688, + "learning_rate": 8.030085659482845e-05, + "loss": 1.825, + "step": 10231 + }, + { + "epoch": 3.1405770411295273, + "grad_norm": 0.2818242907524109, + "learning_rate": 8.02969026053542e-05, + "loss": 1.7737, + "step": 10232 + }, + { + "epoch": 3.1408839779005526, + "grad_norm": 0.27074751257896423, + "learning_rate": 8.029294831646822e-05, + "loss": 1.8306, + "step": 10233 + }, + { + "epoch": 3.1411909146715775, + "grad_norm": 0.29740920662879944, + "learning_rate": 8.028899372820954e-05, + "loss": 1.8157, + "step": 10234 + }, + { + "epoch": 3.141497851442603, + "grad_norm": 0.30743202567100525, + "learning_rate": 8.028503884061731e-05, + "loss": 1.7626, + "step": 10235 + }, + { + "epoch": 3.141804788213628, + "grad_norm": 0.27812567353248596, + "learning_rate": 8.028108365373058e-05, + "loss": 1.7604, + "step": 10236 + }, + { + "epoch": 3.142111724984653, + "grad_norm": 0.26212629675865173, + "learning_rate": 8.027712816758839e-05, + "loss": 1.8161, + "step": 10237 + }, + { + "epoch": 3.1424186617556784, + "grad_norm": 0.3611658811569214, + "learning_rate": 8.02731723822299e-05, + "loss": 1.8283, + "step": 10238 + }, + { + "epoch": 3.1427255985267033, + "grad_norm": 0.31705498695373535, + "learning_rate": 8.026921629769418e-05, + "loss": 1.7986, + "step": 10239 + }, + { + "epoch": 3.1430325352977286, + "grad_norm": 0.25905972719192505, + "learning_rate": 8.026525991402032e-05, + "loss": 1.7926, + "step": 10240 + }, + { + "epoch": 3.143339472068754, + "grad_norm": 0.42376595735549927, + "learning_rate": 8.026130323124741e-05, + "loss": 1.8275, + "step": 10241 + }, + { + "epoch": 3.143646408839779, + "grad_norm": 0.415556401014328, + "learning_rate": 8.025734624941458e-05, + "loss": 1.7938, + "step": 10242 + }, + { + "epoch": 3.143953345610804, + "grad_norm": 0.3558904528617859, + "learning_rate": 8.025338896856091e-05, + "loss": 1.836, + "step": 10243 + }, + { + "epoch": 3.1442602823818295, + "grad_norm": 0.3091062307357788, + "learning_rate": 
8.024943138872553e-05, + "loss": 1.8285, + "step": 10244 + }, + { + "epoch": 3.1445672191528544, + "grad_norm": 0.2620905041694641, + "learning_rate": 8.024547350994753e-05, + "loss": 1.7115, + "step": 10245 + }, + { + "epoch": 3.1448741559238798, + "grad_norm": 0.25716835260391235, + "learning_rate": 8.024151533226604e-05, + "loss": 1.7702, + "step": 10246 + }, + { + "epoch": 3.1451810926949046, + "grad_norm": 0.250844269990921, + "learning_rate": 8.023755685572017e-05, + "loss": 1.7617, + "step": 10247 + }, + { + "epoch": 3.14548802946593, + "grad_norm": 0.23898956179618835, + "learning_rate": 8.023359808034903e-05, + "loss": 1.7872, + "step": 10248 + }, + { + "epoch": 3.1457949662369553, + "grad_norm": 0.2335387021303177, + "learning_rate": 8.022963900619176e-05, + "loss": 1.7656, + "step": 10249 + }, + { + "epoch": 3.14610190300798, + "grad_norm": 0.21822704374790192, + "learning_rate": 8.022567963328749e-05, + "loss": 1.7706, + "step": 10250 + }, + { + "epoch": 3.1464088397790055, + "grad_norm": 0.2627898156642914, + "learning_rate": 8.022171996167531e-05, + "loss": 1.8559, + "step": 10251 + }, + { + "epoch": 3.146715776550031, + "grad_norm": 0.2530064582824707, + "learning_rate": 8.021775999139441e-05, + "loss": 1.788, + "step": 10252 + }, + { + "epoch": 3.1470227133210558, + "grad_norm": 0.2293635457754135, + "learning_rate": 8.021379972248387e-05, + "loss": 1.8129, + "step": 10253 + }, + { + "epoch": 3.147329650092081, + "grad_norm": 0.27753588557243347, + "learning_rate": 8.020983915498286e-05, + "loss": 1.7957, + "step": 10254 + }, + { + "epoch": 3.147636586863106, + "grad_norm": 0.24507668614387512, + "learning_rate": 8.020587828893051e-05, + "loss": 1.7969, + "step": 10255 + }, + { + "epoch": 3.1479435236341313, + "grad_norm": 0.24818891286849976, + "learning_rate": 8.020191712436598e-05, + "loss": 1.8412, + "step": 10256 + }, + { + "epoch": 3.1482504604051567, + "grad_norm": 0.2463149130344391, + "learning_rate": 8.01979556613284e-05, + "loss": 1.8097, 
+ "step": 10257 + }, + { + "epoch": 3.1485573971761815, + "grad_norm": 0.26742151379585266, + "learning_rate": 8.019399389985692e-05, + "loss": 1.8487, + "step": 10258 + }, + { + "epoch": 3.148864333947207, + "grad_norm": 0.3078254461288452, + "learning_rate": 8.01900318399907e-05, + "loss": 1.8189, + "step": 10259 + }, + { + "epoch": 3.149171270718232, + "grad_norm": 0.3819321393966675, + "learning_rate": 8.018606948176887e-05, + "loss": 1.8019, + "step": 10260 + }, + { + "epoch": 3.149478207489257, + "grad_norm": 0.3932126462459564, + "learning_rate": 8.018210682523061e-05, + "loss": 1.787, + "step": 10261 + }, + { + "epoch": 3.1497851442602824, + "grad_norm": 0.2696186900138855, + "learning_rate": 8.017814387041511e-05, + "loss": 1.8345, + "step": 10262 + }, + { + "epoch": 3.150092081031308, + "grad_norm": 0.32631832361221313, + "learning_rate": 8.017418061736149e-05, + "loss": 1.7724, + "step": 10263 + }, + { + "epoch": 3.1503990178023327, + "grad_norm": 0.36187833547592163, + "learning_rate": 8.017021706610893e-05, + "loss": 1.7829, + "step": 10264 + }, + { + "epoch": 3.150705954573358, + "grad_norm": 0.29678142070770264, + "learning_rate": 8.01662532166966e-05, + "loss": 1.7896, + "step": 10265 + }, + { + "epoch": 3.151012891344383, + "grad_norm": 0.2997078001499176, + "learning_rate": 8.016228906916368e-05, + "loss": 1.8401, + "step": 10266 + }, + { + "epoch": 3.1513198281154082, + "grad_norm": 0.4688792824745178, + "learning_rate": 8.015832462354933e-05, + "loss": 1.8263, + "step": 10267 + }, + { + "epoch": 3.1516267648864336, + "grad_norm": 0.42710503935813904, + "learning_rate": 8.015435987989275e-05, + "loss": 1.8233, + "step": 10268 + }, + { + "epoch": 3.1519337016574585, + "grad_norm": 0.2490987628698349, + "learning_rate": 8.01503948382331e-05, + "loss": 1.7792, + "step": 10269 + }, + { + "epoch": 3.152240638428484, + "grad_norm": 0.400836706161499, + "learning_rate": 8.014642949860957e-05, + "loss": 1.8113, + "step": 10270 + }, + { + "epoch": 
3.1525475751995087, + "grad_norm": 0.47995972633361816, + "learning_rate": 8.014246386106138e-05, + "loss": 1.8754, + "step": 10271 + }, + { + "epoch": 3.152854511970534, + "grad_norm": 0.39069879055023193, + "learning_rate": 8.013849792562769e-05, + "loss": 1.8541, + "step": 10272 + }, + { + "epoch": 3.1531614487415593, + "grad_norm": 0.27174463868141174, + "learning_rate": 8.013453169234768e-05, + "loss": 1.8018, + "step": 10273 + }, + { + "epoch": 3.1534683855125842, + "grad_norm": 0.37808045744895935, + "learning_rate": 8.013056516126058e-05, + "loss": 1.8346, + "step": 10274 + }, + { + "epoch": 3.1537753222836096, + "grad_norm": 0.43864908814430237, + "learning_rate": 8.012659833240557e-05, + "loss": 1.7626, + "step": 10275 + }, + { + "epoch": 3.154082259054635, + "grad_norm": 0.3592168688774109, + "learning_rate": 8.012263120582187e-05, + "loss": 1.8261, + "step": 10276 + }, + { + "epoch": 3.15438919582566, + "grad_norm": 0.3056562542915344, + "learning_rate": 8.011866378154866e-05, + "loss": 1.903, + "step": 10277 + }, + { + "epoch": 3.154696132596685, + "grad_norm": 0.2898549735546112, + "learning_rate": 8.011469605962517e-05, + "loss": 1.7781, + "step": 10278 + }, + { + "epoch": 3.1550030693677105, + "grad_norm": 0.3498871624469757, + "learning_rate": 8.011072804009059e-05, + "loss": 1.7571, + "step": 10279 + }, + { + "epoch": 3.1553100061387354, + "grad_norm": 0.3330932557582855, + "learning_rate": 8.010675972298416e-05, + "loss": 1.8298, + "step": 10280 + }, + { + "epoch": 3.1556169429097607, + "grad_norm": 0.2540839910507202, + "learning_rate": 8.010279110834507e-05, + "loss": 1.8327, + "step": 10281 + }, + { + "epoch": 3.1559238796807856, + "grad_norm": 0.3557111322879791, + "learning_rate": 8.009882219621257e-05, + "loss": 1.7611, + "step": 10282 + }, + { + "epoch": 3.156230816451811, + "grad_norm": 0.28293952345848083, + "learning_rate": 8.009485298662584e-05, + "loss": 1.7761, + "step": 10283 + }, + { + "epoch": 3.1565377532228363, + "grad_norm": 
0.27089303731918335, + "learning_rate": 8.009088347962416e-05, + "loss": 1.8081, + "step": 10284 + }, + { + "epoch": 3.156844689993861, + "grad_norm": 0.2689332664012909, + "learning_rate": 8.008691367524673e-05, + "loss": 1.7458, + "step": 10285 + }, + { + "epoch": 3.1571516267648865, + "grad_norm": 0.2495841234922409, + "learning_rate": 8.008294357353278e-05, + "loss": 1.8307, + "step": 10286 + }, + { + "epoch": 3.1574585635359114, + "grad_norm": 0.29242852330207825, + "learning_rate": 8.007897317452156e-05, + "loss": 1.9216, + "step": 10287 + }, + { + "epoch": 3.1577655003069367, + "grad_norm": 0.26574134826660156, + "learning_rate": 8.007500247825229e-05, + "loss": 1.8392, + "step": 10288 + }, + { + "epoch": 3.158072437077962, + "grad_norm": 0.2503872811794281, + "learning_rate": 8.00710314847642e-05, + "loss": 1.7742, + "step": 10289 + }, + { + "epoch": 3.158379373848987, + "grad_norm": 0.25614771246910095, + "learning_rate": 8.006706019409658e-05, + "loss": 1.828, + "step": 10290 + }, + { + "epoch": 3.1586863106200123, + "grad_norm": 0.259369820356369, + "learning_rate": 8.006308860628863e-05, + "loss": 1.8328, + "step": 10291 + }, + { + "epoch": 3.1589932473910376, + "grad_norm": 0.28183647990226746, + "learning_rate": 8.005911672137962e-05, + "loss": 1.8269, + "step": 10292 + }, + { + "epoch": 3.1593001841620625, + "grad_norm": 0.2926514446735382, + "learning_rate": 8.005514453940881e-05, + "loss": 1.8334, + "step": 10293 + }, + { + "epoch": 3.159607120933088, + "grad_norm": 0.34313449263572693, + "learning_rate": 8.005117206041543e-05, + "loss": 1.7866, + "step": 10294 + }, + { + "epoch": 3.159914057704113, + "grad_norm": 0.30971628427505493, + "learning_rate": 8.004719928443875e-05, + "loss": 1.7827, + "step": 10295 + }, + { + "epoch": 3.160220994475138, + "grad_norm": 0.23955371975898743, + "learning_rate": 8.004322621151807e-05, + "loss": 1.7619, + "step": 10296 + }, + { + "epoch": 3.1605279312461634, + "grad_norm": 0.31311795115470886, + 
"learning_rate": 8.003925284169261e-05, + "loss": 1.8247, + "step": 10297 + }, + { + "epoch": 3.1608348680171883, + "grad_norm": 0.3408358097076416, + "learning_rate": 8.003527917500163e-05, + "loss": 1.8146, + "step": 10298 + }, + { + "epoch": 3.1611418047882136, + "grad_norm": 0.3030858337879181, + "learning_rate": 8.003130521148442e-05, + "loss": 1.857, + "step": 10299 + }, + { + "epoch": 3.161448741559239, + "grad_norm": 0.25168511271476746, + "learning_rate": 8.002733095118025e-05, + "loss": 1.8404, + "step": 10300 + }, + { + "epoch": 3.161755678330264, + "grad_norm": 0.2956216335296631, + "learning_rate": 8.002335639412839e-05, + "loss": 1.7352, + "step": 10301 + }, + { + "epoch": 3.162062615101289, + "grad_norm": 0.27791857719421387, + "learning_rate": 8.001938154036814e-05, + "loss": 1.7797, + "step": 10302 + }, + { + "epoch": 3.1623695518723145, + "grad_norm": 0.3106420040130615, + "learning_rate": 8.001540638993876e-05, + "loss": 1.8434, + "step": 10303 + }, + { + "epoch": 3.1626764886433394, + "grad_norm": 0.2940445840358734, + "learning_rate": 8.001143094287954e-05, + "loss": 1.8459, + "step": 10304 + }, + { + "epoch": 3.1629834254143647, + "grad_norm": 0.3857429325580597, + "learning_rate": 8.000745519922977e-05, + "loss": 1.7853, + "step": 10305 + }, + { + "epoch": 3.1632903621853896, + "grad_norm": 0.3585071861743927, + "learning_rate": 8.000347915902874e-05, + "loss": 1.8905, + "step": 10306 + }, + { + "epoch": 3.163597298956415, + "grad_norm": 0.320003867149353, + "learning_rate": 7.999950282231574e-05, + "loss": 1.8397, + "step": 10307 + }, + { + "epoch": 3.1639042357274403, + "grad_norm": 0.24986252188682556, + "learning_rate": 7.999552618913009e-05, + "loss": 1.7916, + "step": 10308 + }, + { + "epoch": 3.164211172498465, + "grad_norm": 0.33077237010002136, + "learning_rate": 7.999154925951104e-05, + "loss": 1.8334, + "step": 10309 + }, + { + "epoch": 3.1645181092694905, + "grad_norm": 0.35700327157974243, + "learning_rate": 
7.998757203349794e-05, + "loss": 1.7773, + "step": 10310 + }, + { + "epoch": 3.164825046040516, + "grad_norm": 0.3095493018627167, + "learning_rate": 7.998359451113007e-05, + "loss": 1.8156, + "step": 10311 + }, + { + "epoch": 3.1651319828115407, + "grad_norm": 0.3004748225212097, + "learning_rate": 7.997961669244673e-05, + "loss": 1.7862, + "step": 10312 + }, + { + "epoch": 3.165438919582566, + "grad_norm": 0.39382806420326233, + "learning_rate": 7.99756385774873e-05, + "loss": 1.764, + "step": 10313 + }, + { + "epoch": 3.165745856353591, + "grad_norm": 0.3109463155269623, + "learning_rate": 7.997166016629099e-05, + "loss": 1.8006, + "step": 10314 + }, + { + "epoch": 3.1660527931246163, + "grad_norm": 0.2896469235420227, + "learning_rate": 7.996768145889717e-05, + "loss": 1.8373, + "step": 10315 + }, + { + "epoch": 3.1663597298956416, + "grad_norm": 0.35024940967559814, + "learning_rate": 7.996370245534517e-05, + "loss": 1.797, + "step": 10316 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.3228827714920044, + "learning_rate": 7.995972315567431e-05, + "loss": 1.7757, + "step": 10317 + }, + { + "epoch": 3.166973603437692, + "grad_norm": 0.27102410793304443, + "learning_rate": 7.995574355992388e-05, + "loss": 1.7786, + "step": 10318 + }, + { + "epoch": 3.167280540208717, + "grad_norm": 0.2556116580963135, + "learning_rate": 7.995176366813325e-05, + "loss": 1.7621, + "step": 10319 + }, + { + "epoch": 3.167587476979742, + "grad_norm": 0.28279444575309753, + "learning_rate": 7.994778348034173e-05, + "loss": 1.7954, + "step": 10320 + }, + { + "epoch": 3.1678944137507674, + "grad_norm": 0.31778639554977417, + "learning_rate": 7.994380299658867e-05, + "loss": 1.7657, + "step": 10321 + }, + { + "epoch": 3.1682013505217923, + "grad_norm": 0.27935469150543213, + "learning_rate": 7.993982221691339e-05, + "loss": 1.7502, + "step": 10322 + }, + { + "epoch": 3.1685082872928176, + "grad_norm": 0.29012617468833923, + "learning_rate": 7.993584114135524e-05, + "loss": 
1.8497, + "step": 10323 + }, + { + "epoch": 3.168815224063843, + "grad_norm": 0.2674056887626648, + "learning_rate": 7.993185976995356e-05, + "loss": 1.7875, + "step": 10324 + }, + { + "epoch": 3.169122160834868, + "grad_norm": 0.2667328417301178, + "learning_rate": 7.992787810274771e-05, + "loss": 1.771, + "step": 10325 + }, + { + "epoch": 3.169429097605893, + "grad_norm": 0.25807151198387146, + "learning_rate": 7.992389613977702e-05, + "loss": 1.7638, + "step": 10326 + }, + { + "epoch": 3.1697360343769185, + "grad_norm": 0.2572930157184601, + "learning_rate": 7.991991388108084e-05, + "loss": 1.8218, + "step": 10327 + }, + { + "epoch": 3.1700429711479434, + "grad_norm": 0.3955067992210388, + "learning_rate": 7.991593132669855e-05, + "loss": 1.8458, + "step": 10328 + }, + { + "epoch": 3.1703499079189688, + "grad_norm": 0.2813466489315033, + "learning_rate": 7.991194847666948e-05, + "loss": 1.8042, + "step": 10329 + }, + { + "epoch": 3.1706568446899936, + "grad_norm": 0.2645012140274048, + "learning_rate": 7.990796533103302e-05, + "loss": 1.8241, + "step": 10330 + }, + { + "epoch": 3.170963781461019, + "grad_norm": 0.28462091088294983, + "learning_rate": 7.99039818898285e-05, + "loss": 1.8853, + "step": 10331 + }, + { + "epoch": 3.1712707182320443, + "grad_norm": 0.2727372944355011, + "learning_rate": 7.98999981530953e-05, + "loss": 1.7564, + "step": 10332 + }, + { + "epoch": 3.171577655003069, + "grad_norm": 0.2658170759677887, + "learning_rate": 7.989601412087281e-05, + "loss": 1.8344, + "step": 10333 + }, + { + "epoch": 3.1718845917740945, + "grad_norm": 0.29713502526283264, + "learning_rate": 7.989202979320039e-05, + "loss": 1.8721, + "step": 10334 + }, + { + "epoch": 3.17219152854512, + "grad_norm": 0.26609495282173157, + "learning_rate": 7.98880451701174e-05, + "loss": 1.7991, + "step": 10335 + }, + { + "epoch": 3.1724984653161448, + "grad_norm": 0.29779741168022156, + "learning_rate": 7.988406025166322e-05, + "loss": 1.8182, + "step": 10336 + }, + { + 
"epoch": 3.17280540208717, + "grad_norm": 0.2771340012550354, + "learning_rate": 7.988007503787724e-05, + "loss": 1.8034, + "step": 10337 + }, + { + "epoch": 3.1731123388581954, + "grad_norm": 0.30510422587394714, + "learning_rate": 7.987608952879886e-05, + "loss": 1.8477, + "step": 10338 + }, + { + "epoch": 3.1734192756292203, + "grad_norm": 0.3097476363182068, + "learning_rate": 7.987210372446745e-05, + "loss": 1.7572, + "step": 10339 + }, + { + "epoch": 3.1737262124002457, + "grad_norm": 0.2553942799568176, + "learning_rate": 7.986811762492239e-05, + "loss": 1.7837, + "step": 10340 + }, + { + "epoch": 3.1740331491712706, + "grad_norm": 0.26546719670295715, + "learning_rate": 7.986413123020312e-05, + "loss": 1.7893, + "step": 10341 + }, + { + "epoch": 3.174340085942296, + "grad_norm": 0.37721553444862366, + "learning_rate": 7.986014454034895e-05, + "loss": 1.8475, + "step": 10342 + }, + { + "epoch": 3.174647022713321, + "grad_norm": 0.3215494453907013, + "learning_rate": 7.985615755539937e-05, + "loss": 1.7806, + "step": 10343 + }, + { + "epoch": 3.174953959484346, + "grad_norm": 0.2662442922592163, + "learning_rate": 7.985217027539373e-05, + "loss": 1.8116, + "step": 10344 + }, + { + "epoch": 3.1752608962553714, + "grad_norm": 0.23334236443042755, + "learning_rate": 7.984818270037145e-05, + "loss": 1.7929, + "step": 10345 + }, + { + "epoch": 3.1755678330263963, + "grad_norm": 0.2873367667198181, + "learning_rate": 7.98441948303719e-05, + "loss": 1.7808, + "step": 10346 + }, + { + "epoch": 3.1758747697974217, + "grad_norm": 0.3623826801776886, + "learning_rate": 7.984020666543458e-05, + "loss": 1.8817, + "step": 10347 + }, + { + "epoch": 3.176181706568447, + "grad_norm": 0.3060589134693146, + "learning_rate": 7.983621820559881e-05, + "loss": 1.796, + "step": 10348 + }, + { + "epoch": 3.176488643339472, + "grad_norm": 0.2396882325410843, + "learning_rate": 7.983222945090407e-05, + "loss": 1.7455, + "step": 10349 + }, + { + "epoch": 3.1767955801104972, + 
"grad_norm": 0.24811476469039917, + "learning_rate": 7.982824040138974e-05, + "loss": 1.7907, + "step": 10350 + }, + { + "epoch": 3.1771025168815226, + "grad_norm": 0.32749706506729126, + "learning_rate": 7.982425105709524e-05, + "loss": 1.8553, + "step": 10351 + }, + { + "epoch": 3.1774094536525475, + "grad_norm": 0.3648095726966858, + "learning_rate": 7.982026141806003e-05, + "loss": 1.8387, + "step": 10352 + }, + { + "epoch": 3.177716390423573, + "grad_norm": 0.2749348282814026, + "learning_rate": 7.981627148432352e-05, + "loss": 1.7676, + "step": 10353 + }, + { + "epoch": 3.178023327194598, + "grad_norm": 0.2735142409801483, + "learning_rate": 7.981228125592513e-05, + "loss": 1.822, + "step": 10354 + }, + { + "epoch": 3.178330263965623, + "grad_norm": 0.28759655356407166, + "learning_rate": 7.98082907329043e-05, + "loss": 1.8113, + "step": 10355 + }, + { + "epoch": 3.1786372007366483, + "grad_norm": 0.33661654591560364, + "learning_rate": 7.980429991530048e-05, + "loss": 1.8036, + "step": 10356 + }, + { + "epoch": 3.1789441375076732, + "grad_norm": 0.2634892761707306, + "learning_rate": 7.98003088031531e-05, + "loss": 1.8323, + "step": 10357 + }, + { + "epoch": 3.1792510742786986, + "grad_norm": 0.25864094495773315, + "learning_rate": 7.979631739650158e-05, + "loss": 1.8199, + "step": 10358 + }, + { + "epoch": 3.179558011049724, + "grad_norm": 0.27368444204330444, + "learning_rate": 7.979232569538541e-05, + "loss": 1.7673, + "step": 10359 + }, + { + "epoch": 3.179864947820749, + "grad_norm": 0.2506616413593292, + "learning_rate": 7.9788333699844e-05, + "loss": 1.7912, + "step": 10360 + }, + { + "epoch": 3.180171884591774, + "grad_norm": 0.2539178133010864, + "learning_rate": 7.978434140991684e-05, + "loss": 1.7934, + "step": 10361 + }, + { + "epoch": 3.1804788213627995, + "grad_norm": 0.2605626881122589, + "learning_rate": 7.978034882564334e-05, + "loss": 1.8031, + "step": 10362 + }, + { + "epoch": 3.1807857581338244, + "grad_norm": 0.2610207796096802, + 
"learning_rate": 7.977635594706299e-05, + "loss": 1.8664, + "step": 10363 + }, + { + "epoch": 3.1810926949048497, + "grad_norm": 0.26164132356643677, + "learning_rate": 7.977236277421523e-05, + "loss": 1.7758, + "step": 10364 + }, + { + "epoch": 3.1813996316758746, + "grad_norm": 0.3122340142726898, + "learning_rate": 7.976836930713953e-05, + "loss": 1.9033, + "step": 10365 + }, + { + "epoch": 3.1817065684469, + "grad_norm": 0.3317202031612396, + "learning_rate": 7.976437554587537e-05, + "loss": 1.7899, + "step": 10366 + }, + { + "epoch": 3.1820135052179253, + "grad_norm": 0.28612568974494934, + "learning_rate": 7.97603814904622e-05, + "loss": 1.8145, + "step": 10367 + }, + { + "epoch": 3.18232044198895, + "grad_norm": 0.349917471408844, + "learning_rate": 7.975638714093949e-05, + "loss": 1.877, + "step": 10368 + }, + { + "epoch": 3.1826273787599755, + "grad_norm": 0.3737771809101105, + "learning_rate": 7.975239249734672e-05, + "loss": 1.8204, + "step": 10369 + }, + { + "epoch": 3.182934315531001, + "grad_norm": 0.3688446879386902, + "learning_rate": 7.974839755972339e-05, + "loss": 1.8487, + "step": 10370 + }, + { + "epoch": 3.1832412523020257, + "grad_norm": 0.2934897541999817, + "learning_rate": 7.974440232810894e-05, + "loss": 1.8243, + "step": 10371 + }, + { + "epoch": 3.183548189073051, + "grad_norm": 0.2596173882484436, + "learning_rate": 7.974040680254287e-05, + "loss": 1.7887, + "step": 10372 + }, + { + "epoch": 3.183855125844076, + "grad_norm": 0.35686594247817993, + "learning_rate": 7.973641098306468e-05, + "loss": 1.8653, + "step": 10373 + }, + { + "epoch": 3.1841620626151013, + "grad_norm": 0.3187713921070099, + "learning_rate": 7.973241486971383e-05, + "loss": 1.8767, + "step": 10374 + }, + { + "epoch": 3.1844689993861266, + "grad_norm": 0.2596273124217987, + "learning_rate": 7.972841846252985e-05, + "loss": 1.8028, + "step": 10375 + }, + { + "epoch": 3.1847759361571515, + "grad_norm": 0.2637474834918976, + "learning_rate": 7.972442176155221e-05, + 
"loss": 1.802, + "step": 10376 + }, + { + "epoch": 3.185082872928177, + "grad_norm": 0.2641126215457916, + "learning_rate": 7.97204247668204e-05, + "loss": 1.7931, + "step": 10377 + }, + { + "epoch": 3.185389809699202, + "grad_norm": 0.25594159960746765, + "learning_rate": 7.971642747837393e-05, + "loss": 1.818, + "step": 10378 + }, + { + "epoch": 3.185696746470227, + "grad_norm": 0.26567938923835754, + "learning_rate": 7.971242989625233e-05, + "loss": 1.8174, + "step": 10379 + }, + { + "epoch": 3.1860036832412524, + "grad_norm": 0.29580214619636536, + "learning_rate": 7.970843202049508e-05, + "loss": 1.869, + "step": 10380 + }, + { + "epoch": 3.1863106200122773, + "grad_norm": 0.2657530605792999, + "learning_rate": 7.970443385114168e-05, + "loss": 1.8352, + "step": 10381 + }, + { + "epoch": 3.1866175567833026, + "grad_norm": 0.2468358278274536, + "learning_rate": 7.970043538823165e-05, + "loss": 1.7851, + "step": 10382 + }, + { + "epoch": 3.186924493554328, + "grad_norm": 0.26464715600013733, + "learning_rate": 7.969643663180451e-05, + "loss": 1.8208, + "step": 10383 + }, + { + "epoch": 3.187231430325353, + "grad_norm": 0.26035723090171814, + "learning_rate": 7.969243758189979e-05, + "loss": 1.8089, + "step": 10384 + }, + { + "epoch": 3.187538367096378, + "grad_norm": 0.2644619941711426, + "learning_rate": 7.968843823855699e-05, + "loss": 1.8379, + "step": 10385 + }, + { + "epoch": 3.1878453038674035, + "grad_norm": 0.25576624274253845, + "learning_rate": 7.968443860181565e-05, + "loss": 1.7932, + "step": 10386 + }, + { + "epoch": 3.1881522406384284, + "grad_norm": 0.24276074767112732, + "learning_rate": 7.968043867171528e-05, + "loss": 1.8037, + "step": 10387 + }, + { + "epoch": 3.1884591774094537, + "grad_norm": 0.27156540751457214, + "learning_rate": 7.967643844829543e-05, + "loss": 1.7998, + "step": 10388 + }, + { + "epoch": 3.1887661141804786, + "grad_norm": 0.2555428743362427, + "learning_rate": 7.96724379315956e-05, + "loss": 1.7612, + "step": 10389 + }, + 
{ + "epoch": 3.189073050951504, + "grad_norm": 0.3358438014984131, + "learning_rate": 7.966843712165537e-05, + "loss": 1.8543, + "step": 10390 + }, + { + "epoch": 3.1893799877225293, + "grad_norm": 0.2799586355686188, + "learning_rate": 7.966443601851424e-05, + "loss": 1.819, + "step": 10391 + }, + { + "epoch": 3.189686924493554, + "grad_norm": 0.2364189177751541, + "learning_rate": 7.966043462221178e-05, + "loss": 1.8537, + "step": 10392 + }, + { + "epoch": 3.1899938612645795, + "grad_norm": 0.23849403858184814, + "learning_rate": 7.96564329327875e-05, + "loss": 1.8125, + "step": 10393 + }, + { + "epoch": 3.190300798035605, + "grad_norm": 0.2371583878993988, + "learning_rate": 7.965243095028098e-05, + "loss": 1.7352, + "step": 10394 + }, + { + "epoch": 3.1906077348066297, + "grad_norm": 0.2584737539291382, + "learning_rate": 7.964842867473176e-05, + "loss": 1.8801, + "step": 10395 + }, + { + "epoch": 3.190914671577655, + "grad_norm": 0.27768051624298096, + "learning_rate": 7.964442610617939e-05, + "loss": 1.8221, + "step": 10396 + }, + { + "epoch": 3.1912216083486804, + "grad_norm": 0.2680891752243042, + "learning_rate": 7.964042324466341e-05, + "loss": 1.8371, + "step": 10397 + }, + { + "epoch": 3.1915285451197053, + "grad_norm": 0.25301921367645264, + "learning_rate": 7.963642009022343e-05, + "loss": 1.7972, + "step": 10398 + }, + { + "epoch": 3.1918354818907306, + "grad_norm": 0.2589731216430664, + "learning_rate": 7.963241664289896e-05, + "loss": 1.8145, + "step": 10399 + }, + { + "epoch": 3.1921424186617555, + "grad_norm": 0.2611297369003296, + "learning_rate": 7.962841290272956e-05, + "loss": 1.8736, + "step": 10400 + }, + { + "epoch": 3.192449355432781, + "grad_norm": 0.2812272906303406, + "learning_rate": 7.962440886975483e-05, + "loss": 1.8116, + "step": 10401 + }, + { + "epoch": 3.192756292203806, + "grad_norm": 0.3261657655239105, + "learning_rate": 7.962040454401434e-05, + "loss": 1.7935, + "step": 10402 + }, + { + "epoch": 3.193063228974831, + 
"grad_norm": 0.3355373442173004, + "learning_rate": 7.961639992554764e-05, + "loss": 1.7957, + "step": 10403 + }, + { + "epoch": 3.1933701657458564, + "grad_norm": 0.2811843156814575, + "learning_rate": 7.961239501439432e-05, + "loss": 1.797, + "step": 10404 + }, + { + "epoch": 3.1936771025168813, + "grad_norm": 0.24933238327503204, + "learning_rate": 7.960838981059395e-05, + "loss": 1.7594, + "step": 10405 + }, + { + "epoch": 3.1939840392879066, + "grad_norm": 0.29110121726989746, + "learning_rate": 7.960438431418613e-05, + "loss": 1.8268, + "step": 10406 + }, + { + "epoch": 3.194290976058932, + "grad_norm": 0.3702283799648285, + "learning_rate": 7.960037852521043e-05, + "loss": 1.7629, + "step": 10407 + }, + { + "epoch": 3.194597912829957, + "grad_norm": 0.33275437355041504, + "learning_rate": 7.959637244370644e-05, + "loss": 1.8507, + "step": 10408 + }, + { + "epoch": 3.194904849600982, + "grad_norm": 0.2691981792449951, + "learning_rate": 7.959236606971375e-05, + "loss": 1.8084, + "step": 10409 + }, + { + "epoch": 3.1952117863720075, + "grad_norm": 0.30108413100242615, + "learning_rate": 7.958835940327194e-05, + "loss": 1.8525, + "step": 10410 + }, + { + "epoch": 3.1955187231430324, + "grad_norm": 0.32112306356430054, + "learning_rate": 7.958435244442064e-05, + "loss": 1.7431, + "step": 10411 + }, + { + "epoch": 3.1958256599140578, + "grad_norm": 0.2795291543006897, + "learning_rate": 7.958034519319942e-05, + "loss": 1.7985, + "step": 10412 + }, + { + "epoch": 3.196132596685083, + "grad_norm": 0.2485792338848114, + "learning_rate": 7.957633764964788e-05, + "loss": 1.7363, + "step": 10413 + }, + { + "epoch": 3.196439533456108, + "grad_norm": 0.3552432358264923, + "learning_rate": 7.957232981380565e-05, + "loss": 1.8174, + "step": 10414 + }, + { + "epoch": 3.1967464702271333, + "grad_norm": 0.3829655051231384, + "learning_rate": 7.956832168571234e-05, + "loss": 1.9249, + "step": 10415 + }, + { + "epoch": 3.197053406998158, + "grad_norm": 0.2498074769973755, + 
"learning_rate": 7.956431326540752e-05, + "loss": 1.8104, + "step": 10416 + }, + { + "epoch": 3.1973603437691835, + "grad_norm": 0.24596504867076874, + "learning_rate": 7.956030455293082e-05, + "loss": 1.8007, + "step": 10417 + }, + { + "epoch": 3.197667280540209, + "grad_norm": 0.2795363664627075, + "learning_rate": 7.95562955483219e-05, + "loss": 1.775, + "step": 10418 + }, + { + "epoch": 3.1979742173112338, + "grad_norm": 0.3581138253211975, + "learning_rate": 7.95522862516203e-05, + "loss": 1.8567, + "step": 10419 + }, + { + "epoch": 3.198281154082259, + "grad_norm": 0.36102500557899475, + "learning_rate": 7.95482766628657e-05, + "loss": 1.8509, + "step": 10420 + }, + { + "epoch": 3.198588090853284, + "grad_norm": 0.4717029929161072, + "learning_rate": 7.954426678209774e-05, + "loss": 1.8218, + "step": 10421 + }, + { + "epoch": 3.1988950276243093, + "grad_norm": 0.3211984932422638, + "learning_rate": 7.9540256609356e-05, + "loss": 1.8696, + "step": 10422 + }, + { + "epoch": 3.1992019643953347, + "grad_norm": 0.30094626545906067, + "learning_rate": 7.953624614468011e-05, + "loss": 1.8714, + "step": 10423 + }, + { + "epoch": 3.1995089011663596, + "grad_norm": 0.267578125, + "learning_rate": 7.953223538810976e-05, + "loss": 1.7903, + "step": 10424 + }, + { + "epoch": 3.199815837937385, + "grad_norm": 0.35577845573425293, + "learning_rate": 7.952822433968453e-05, + "loss": 1.7808, + "step": 10425 + }, + { + "epoch": 3.2001227747084102, + "grad_norm": 0.4117741882801056, + "learning_rate": 7.952421299944408e-05, + "loss": 1.7856, + "step": 10426 + }, + { + "epoch": 3.200429711479435, + "grad_norm": 0.35202035307884216, + "learning_rate": 7.952020136742806e-05, + "loss": 1.8112, + "step": 10427 + }, + { + "epoch": 3.2007366482504604, + "grad_norm": 0.26514917612075806, + "learning_rate": 7.951618944367611e-05, + "loss": 1.828, + "step": 10428 + }, + { + "epoch": 3.201043585021486, + "grad_norm": 0.29219159483909607, + "learning_rate": 7.951217722822786e-05, + "loss": 
1.9366, + "step": 10429 + }, + { + "epoch": 3.2013505217925107, + "grad_norm": 0.2929961383342743, + "learning_rate": 7.950816472112298e-05, + "loss": 1.8006, + "step": 10430 + }, + { + "epoch": 3.201657458563536, + "grad_norm": 0.28339722752571106, + "learning_rate": 7.950415192240114e-05, + "loss": 1.7411, + "step": 10431 + }, + { + "epoch": 3.201964395334561, + "grad_norm": 0.258884996175766, + "learning_rate": 7.950013883210196e-05, + "loss": 1.8153, + "step": 10432 + }, + { + "epoch": 3.2022713321055862, + "grad_norm": 0.3065929114818573, + "learning_rate": 7.949612545026512e-05, + "loss": 1.7918, + "step": 10433 + }, + { + "epoch": 3.2025782688766116, + "grad_norm": 0.289874404668808, + "learning_rate": 7.949211177693029e-05, + "loss": 1.7975, + "step": 10434 + }, + { + "epoch": 3.2028852056476365, + "grad_norm": 0.27025631070137024, + "learning_rate": 7.948809781213711e-05, + "loss": 1.8129, + "step": 10435 + }, + { + "epoch": 3.203192142418662, + "grad_norm": 0.2501074969768524, + "learning_rate": 7.948408355592528e-05, + "loss": 1.7653, + "step": 10436 + }, + { + "epoch": 3.203499079189687, + "grad_norm": 0.30402958393096924, + "learning_rate": 7.948006900833445e-05, + "loss": 1.8311, + "step": 10437 + }, + { + "epoch": 3.203806015960712, + "grad_norm": 0.28783223032951355, + "learning_rate": 7.94760541694043e-05, + "loss": 1.82, + "step": 10438 + }, + { + "epoch": 3.2041129527317374, + "grad_norm": 0.30428317189216614, + "learning_rate": 7.947203903917451e-05, + "loss": 1.8673, + "step": 10439 + }, + { + "epoch": 3.2044198895027622, + "grad_norm": 0.2860367000102997, + "learning_rate": 7.946802361768473e-05, + "loss": 1.824, + "step": 10440 + }, + { + "epoch": 3.2047268262737876, + "grad_norm": 0.2995273172855377, + "learning_rate": 7.946400790497469e-05, + "loss": 1.7342, + "step": 10441 + }, + { + "epoch": 3.205033763044813, + "grad_norm": 0.4374088943004608, + "learning_rate": 7.945999190108407e-05, + "loss": 1.8522, + "step": 10442 + }, + { + "epoch": 
3.205340699815838, + "grad_norm": 0.37659478187561035, + "learning_rate": 7.945597560605252e-05, + "loss": 1.7518, + "step": 10443 + }, + { + "epoch": 3.205647636586863, + "grad_norm": 0.24257932603359222, + "learning_rate": 7.945195901991975e-05, + "loss": 1.7892, + "step": 10444 + }, + { + "epoch": 3.2059545733578885, + "grad_norm": 0.3682694435119629, + "learning_rate": 7.944794214272546e-05, + "loss": 1.7757, + "step": 10445 + }, + { + "epoch": 3.2062615101289134, + "grad_norm": 0.434692919254303, + "learning_rate": 7.944392497450936e-05, + "loss": 1.8207, + "step": 10446 + }, + { + "epoch": 3.2065684468999387, + "grad_norm": 0.3982211947441101, + "learning_rate": 7.943990751531113e-05, + "loss": 1.8303, + "step": 10447 + }, + { + "epoch": 3.2068753836709636, + "grad_norm": 0.2877334654331207, + "learning_rate": 7.943588976517049e-05, + "loss": 1.8495, + "step": 10448 + }, + { + "epoch": 3.207182320441989, + "grad_norm": 0.34589654207229614, + "learning_rate": 7.943187172412712e-05, + "loss": 1.7773, + "step": 10449 + }, + { + "epoch": 3.2074892572130143, + "grad_norm": 0.4727517366409302, + "learning_rate": 7.942785339222074e-05, + "loss": 1.8702, + "step": 10450 + }, + { + "epoch": 3.207796193984039, + "grad_norm": 0.4019354581832886, + "learning_rate": 7.942383476949107e-05, + "loss": 1.8095, + "step": 10451 + }, + { + "epoch": 3.2081031307550645, + "grad_norm": 0.2726243734359741, + "learning_rate": 7.941981585597782e-05, + "loss": 1.7273, + "step": 10452 + }, + { + "epoch": 3.20841006752609, + "grad_norm": 0.2944760024547577, + "learning_rate": 7.941579665172072e-05, + "loss": 1.7507, + "step": 10453 + }, + { + "epoch": 3.2087170042971147, + "grad_norm": 0.3530777096748352, + "learning_rate": 7.941177715675945e-05, + "loss": 1.8434, + "step": 10454 + }, + { + "epoch": 3.20902394106814, + "grad_norm": 0.28612539172172546, + "learning_rate": 7.940775737113378e-05, + "loss": 1.8094, + "step": 10455 + }, + { + "epoch": 3.209330877839165, + "grad_norm": 
0.27006468176841736, + "learning_rate": 7.94037372948834e-05, + "loss": 1.7854, + "step": 10456 + }, + { + "epoch": 3.2096378146101903, + "grad_norm": 0.3027147054672241, + "learning_rate": 7.939971692804806e-05, + "loss": 1.7596, + "step": 10457 + }, + { + "epoch": 3.2099447513812156, + "grad_norm": 0.31999528408050537, + "learning_rate": 7.939569627066749e-05, + "loss": 1.8836, + "step": 10458 + }, + { + "epoch": 3.2102516881522405, + "grad_norm": 0.267600417137146, + "learning_rate": 7.939167532278142e-05, + "loss": 1.8508, + "step": 10459 + }, + { + "epoch": 3.210558624923266, + "grad_norm": 0.3171706795692444, + "learning_rate": 7.938765408442958e-05, + "loss": 1.7507, + "step": 10460 + }, + { + "epoch": 3.210865561694291, + "grad_norm": 0.2955280840396881, + "learning_rate": 7.938363255565171e-05, + "loss": 1.733, + "step": 10461 + }, + { + "epoch": 3.211172498465316, + "grad_norm": 0.3427969217300415, + "learning_rate": 7.937961073648759e-05, + "loss": 1.9208, + "step": 10462 + }, + { + "epoch": 3.2114794352363414, + "grad_norm": 0.28788647055625916, + "learning_rate": 7.937558862697692e-05, + "loss": 1.7723, + "step": 10463 + }, + { + "epoch": 3.2117863720073663, + "grad_norm": 0.26093682646751404, + "learning_rate": 7.937156622715945e-05, + "loss": 1.803, + "step": 10464 + }, + { + "epoch": 3.2120933087783916, + "grad_norm": 0.2791301906108856, + "learning_rate": 7.936754353707497e-05, + "loss": 1.7601, + "step": 10465 + }, + { + "epoch": 3.212400245549417, + "grad_norm": 0.3039831519126892, + "learning_rate": 7.93635205567632e-05, + "loss": 1.7864, + "step": 10466 + }, + { + "epoch": 3.212707182320442, + "grad_norm": 0.28498128056526184, + "learning_rate": 7.935949728626392e-05, + "loss": 1.7745, + "step": 10467 + }, + { + "epoch": 3.213014119091467, + "grad_norm": 0.2908780872821808, + "learning_rate": 7.935547372561687e-05, + "loss": 1.8281, + "step": 10468 + }, + { + "epoch": 3.2133210558624925, + "grad_norm": 0.26148509979248047, + "learning_rate": 
7.935144987486183e-05, + "loss": 1.8545, + "step": 10469 + }, + { + "epoch": 3.2136279926335174, + "grad_norm": 0.2853962481021881, + "learning_rate": 7.934742573403856e-05, + "loss": 1.7765, + "step": 10470 + }, + { + "epoch": 3.2139349294045427, + "grad_norm": 0.26497501134872437, + "learning_rate": 7.934340130318681e-05, + "loss": 1.7472, + "step": 10471 + }, + { + "epoch": 3.214241866175568, + "grad_norm": 0.2806912660598755, + "learning_rate": 7.933937658234638e-05, + "loss": 1.7879, + "step": 10472 + }, + { + "epoch": 3.214548802946593, + "grad_norm": 0.2699974477291107, + "learning_rate": 7.933535157155705e-05, + "loss": 1.7539, + "step": 10473 + }, + { + "epoch": 3.2148557397176183, + "grad_norm": 0.22714731097221375, + "learning_rate": 7.933132627085856e-05, + "loss": 1.7861, + "step": 10474 + }, + { + "epoch": 3.215162676488643, + "grad_norm": 0.291340708732605, + "learning_rate": 7.932730068029072e-05, + "loss": 1.8381, + "step": 10475 + }, + { + "epoch": 3.2154696132596685, + "grad_norm": 0.3257324695587158, + "learning_rate": 7.93232747998933e-05, + "loss": 1.8293, + "step": 10476 + }, + { + "epoch": 3.215776550030694, + "grad_norm": 0.3518911600112915, + "learning_rate": 7.93192486297061e-05, + "loss": 1.853, + "step": 10477 + }, + { + "epoch": 3.2160834868017187, + "grad_norm": 0.27663540840148926, + "learning_rate": 7.93152221697689e-05, + "loss": 1.7831, + "step": 10478 + }, + { + "epoch": 3.216390423572744, + "grad_norm": 0.3153248429298401, + "learning_rate": 7.931119542012149e-05, + "loss": 1.7443, + "step": 10479 + }, + { + "epoch": 3.216697360343769, + "grad_norm": 0.2919597029685974, + "learning_rate": 7.930716838080368e-05, + "loss": 1.8108, + "step": 10480 + }, + { + "epoch": 3.2170042971147943, + "grad_norm": 0.26892516016960144, + "learning_rate": 7.930314105185524e-05, + "loss": 1.7791, + "step": 10481 + }, + { + "epoch": 3.2173112338858196, + "grad_norm": 0.2486005276441574, + "learning_rate": 7.929911343331599e-05, + "loss": 1.8184, + 
"step": 10482 + }, + { + "epoch": 3.2176181706568445, + "grad_norm": 0.260728120803833, + "learning_rate": 7.929508552522571e-05, + "loss": 1.7933, + "step": 10483 + }, + { + "epoch": 3.21792510742787, + "grad_norm": 0.3081948757171631, + "learning_rate": 7.929105732762425e-05, + "loss": 1.7732, + "step": 10484 + }, + { + "epoch": 3.218232044198895, + "grad_norm": 0.3807671368122101, + "learning_rate": 7.928702884055138e-05, + "loss": 1.7652, + "step": 10485 + }, + { + "epoch": 3.21853898096992, + "grad_norm": 0.31637755036354065, + "learning_rate": 7.928300006404692e-05, + "loss": 1.7605, + "step": 10486 + }, + { + "epoch": 3.2188459177409454, + "grad_norm": 0.2812853455543518, + "learning_rate": 7.927897099815071e-05, + "loss": 1.7925, + "step": 10487 + }, + { + "epoch": 3.2191528545119708, + "grad_norm": 0.3472350239753723, + "learning_rate": 7.927494164290253e-05, + "loss": 1.8252, + "step": 10488 + }, + { + "epoch": 3.2194597912829956, + "grad_norm": 0.4202714264392853, + "learning_rate": 7.927091199834222e-05, + "loss": 1.7993, + "step": 10489 + }, + { + "epoch": 3.219766728054021, + "grad_norm": 0.44552353024482727, + "learning_rate": 7.92668820645096e-05, + "loss": 1.8609, + "step": 10490 + }, + { + "epoch": 3.220073664825046, + "grad_norm": 0.38964664936065674, + "learning_rate": 7.926285184144451e-05, + "loss": 1.864, + "step": 10491 + }, + { + "epoch": 3.220380601596071, + "grad_norm": 0.2978462278842926, + "learning_rate": 7.925882132918676e-05, + "loss": 1.7892, + "step": 10492 + }, + { + "epoch": 3.2206875383670965, + "grad_norm": 0.2520316243171692, + "learning_rate": 7.925479052777619e-05, + "loss": 1.7702, + "step": 10493 + }, + { + "epoch": 3.2209944751381214, + "grad_norm": 0.28151068091392517, + "learning_rate": 7.925075943725263e-05, + "loss": 1.7613, + "step": 10494 + }, + { + "epoch": 3.2213014119091468, + "grad_norm": 0.3346099555492401, + "learning_rate": 7.924672805765592e-05, + "loss": 1.894, + "step": 10495 + }, + { + "epoch": 
3.2216083486801717, + "grad_norm": 0.2981362044811249, + "learning_rate": 7.924269638902591e-05, + "loss": 1.8157, + "step": 10496 + }, + { + "epoch": 3.221915285451197, + "grad_norm": 0.2561499774456024, + "learning_rate": 7.923866443140242e-05, + "loss": 1.8259, + "step": 10497 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.26480481028556824, + "learning_rate": 7.923463218482532e-05, + "loss": 1.7856, + "step": 10498 + }, + { + "epoch": 3.222529158993247, + "grad_norm": 0.24103692173957825, + "learning_rate": 7.923059964933446e-05, + "loss": 1.7765, + "step": 10499 + }, + { + "epoch": 3.2228360957642725, + "grad_norm": 0.2399173080921173, + "learning_rate": 7.922656682496967e-05, + "loss": 1.8216, + "step": 10500 + }, + { + "epoch": 3.223143032535298, + "grad_norm": 0.24530018866062164, + "learning_rate": 7.922253371177082e-05, + "loss": 1.8155, + "step": 10501 + }, + { + "epoch": 3.2234499693063228, + "grad_norm": 0.23298653960227966, + "learning_rate": 7.921850030977775e-05, + "loss": 1.7843, + "step": 10502 + }, + { + "epoch": 3.223756906077348, + "grad_norm": 0.3053973317146301, + "learning_rate": 7.921446661903035e-05, + "loss": 1.8113, + "step": 10503 + }, + { + "epoch": 3.2240638428483734, + "grad_norm": 0.261336088180542, + "learning_rate": 7.921043263956847e-05, + "loss": 1.8073, + "step": 10504 + }, + { + "epoch": 3.2243707796193983, + "grad_norm": 0.24877268075942993, + "learning_rate": 7.920639837143195e-05, + "loss": 1.8344, + "step": 10505 + }, + { + "epoch": 3.2246777163904237, + "grad_norm": 0.26784422993659973, + "learning_rate": 7.920236381466071e-05, + "loss": 1.7757, + "step": 10506 + }, + { + "epoch": 3.2249846531614486, + "grad_norm": 0.2672121226787567, + "learning_rate": 7.919832896929458e-05, + "loss": 1.8384, + "step": 10507 + }, + { + "epoch": 3.225291589932474, + "grad_norm": 0.27254921197891235, + "learning_rate": 7.919429383537346e-05, + "loss": 1.8056, + "step": 10508 + }, + { + "epoch": 3.2255985267034992, + "grad_norm": 
0.24467822909355164, + "learning_rate": 7.91902584129372e-05, + "loss": 1.8109, + "step": 10509 + }, + { + "epoch": 3.225905463474524, + "grad_norm": 0.25966358184814453, + "learning_rate": 7.918622270202571e-05, + "loss": 1.82, + "step": 10510 + }, + { + "epoch": 3.2262124002455494, + "grad_norm": 0.28601330518722534, + "learning_rate": 7.918218670267886e-05, + "loss": 1.7266, + "step": 10511 + }, + { + "epoch": 3.226519337016575, + "grad_norm": 0.4017516076564789, + "learning_rate": 7.917815041493653e-05, + "loss": 1.8408, + "step": 10512 + }, + { + "epoch": 3.2268262737875997, + "grad_norm": 0.3995787501335144, + "learning_rate": 7.917411383883862e-05, + "loss": 1.8441, + "step": 10513 + }, + { + "epoch": 3.227133210558625, + "grad_norm": 0.26997458934783936, + "learning_rate": 7.917007697442502e-05, + "loss": 1.8078, + "step": 10514 + }, + { + "epoch": 3.22744014732965, + "grad_norm": 0.34353014826774597, + "learning_rate": 7.916603982173562e-05, + "loss": 1.7523, + "step": 10515 + }, + { + "epoch": 3.2277470841006752, + "grad_norm": 0.39522337913513184, + "learning_rate": 7.916200238081032e-05, + "loss": 1.7532, + "step": 10516 + }, + { + "epoch": 3.2280540208717006, + "grad_norm": 0.4176923334598541, + "learning_rate": 7.915796465168903e-05, + "loss": 1.8895, + "step": 10517 + }, + { + "epoch": 3.2283609576427255, + "grad_norm": 0.30232906341552734, + "learning_rate": 7.915392663441164e-05, + "loss": 1.8223, + "step": 10518 + }, + { + "epoch": 3.228667894413751, + "grad_norm": 0.230951726436615, + "learning_rate": 7.914988832901805e-05, + "loss": 1.7265, + "step": 10519 + }, + { + "epoch": 3.228974831184776, + "grad_norm": 0.26381877064704895, + "learning_rate": 7.914584973554819e-05, + "loss": 1.7858, + "step": 10520 + }, + { + "epoch": 3.229281767955801, + "grad_norm": 0.2500905394554138, + "learning_rate": 7.914181085404194e-05, + "loss": 1.7606, + "step": 10521 + }, + { + "epoch": 3.2295887047268264, + "grad_norm": 0.2585415840148926, + "learning_rate": 
7.913777168453925e-05, + "loss": 1.787, + "step": 10522 + }, + { + "epoch": 3.2298956414978512, + "grad_norm": 0.24236604571342468, + "learning_rate": 7.913373222708001e-05, + "loss": 1.7623, + "step": 10523 + }, + { + "epoch": 3.2302025782688766, + "grad_norm": 0.3113093078136444, + "learning_rate": 7.912969248170416e-05, + "loss": 1.7736, + "step": 10524 + }, + { + "epoch": 3.230509515039902, + "grad_norm": 0.3341342806816101, + "learning_rate": 7.912565244845163e-05, + "loss": 1.8583, + "step": 10525 + }, + { + "epoch": 3.230816451810927, + "grad_norm": 0.2644478678703308, + "learning_rate": 7.912161212736231e-05, + "loss": 1.7891, + "step": 10526 + }, + { + "epoch": 3.231123388581952, + "grad_norm": 0.22916561365127563, + "learning_rate": 7.911757151847616e-05, + "loss": 1.7642, + "step": 10527 + }, + { + "epoch": 3.2314303253529775, + "grad_norm": 0.24204877018928528, + "learning_rate": 7.911353062183309e-05, + "loss": 1.8522, + "step": 10528 + }, + { + "epoch": 3.2317372621240024, + "grad_norm": 0.25339365005493164, + "learning_rate": 7.910948943747307e-05, + "loss": 1.8391, + "step": 10529 + }, + { + "epoch": 3.2320441988950277, + "grad_norm": 0.2652709186077118, + "learning_rate": 7.9105447965436e-05, + "loss": 1.7735, + "step": 10530 + }, + { + "epoch": 3.2323511356660526, + "grad_norm": 0.2711019217967987, + "learning_rate": 7.910140620576183e-05, + "loss": 1.8491, + "step": 10531 + }, + { + "epoch": 3.232658072437078, + "grad_norm": 0.2598389685153961, + "learning_rate": 7.909736415849052e-05, + "loss": 1.8417, + "step": 10532 + }, + { + "epoch": 3.2329650092081033, + "grad_norm": 0.278037428855896, + "learning_rate": 7.9093321823662e-05, + "loss": 1.8774, + "step": 10533 + }, + { + "epoch": 3.233271945979128, + "grad_norm": 0.32015568017959595, + "learning_rate": 7.90892792013162e-05, + "loss": 1.8873, + "step": 10534 + }, + { + "epoch": 3.2335788827501535, + "grad_norm": 0.3098098635673523, + "learning_rate": 7.908523629149312e-05, + "loss": 1.8141, + 
"step": 10535 + }, + { + "epoch": 3.233885819521179, + "grad_norm": 0.3127266764640808, + "learning_rate": 7.908119309423267e-05, + "loss": 1.8587, + "step": 10536 + }, + { + "epoch": 3.2341927562922037, + "grad_norm": 0.3085545301437378, + "learning_rate": 7.907714960957483e-05, + "loss": 1.8544, + "step": 10537 + }, + { + "epoch": 3.234499693063229, + "grad_norm": 0.3051004409790039, + "learning_rate": 7.907310583755956e-05, + "loss": 1.8144, + "step": 10538 + }, + { + "epoch": 3.234806629834254, + "grad_norm": 0.3458186686038971, + "learning_rate": 7.906906177822682e-05, + "loss": 1.8388, + "step": 10539 + }, + { + "epoch": 3.2351135666052793, + "grad_norm": 0.37064439058303833, + "learning_rate": 7.906501743161656e-05, + "loss": 1.7574, + "step": 10540 + }, + { + "epoch": 3.2354205033763046, + "grad_norm": 0.3382316827774048, + "learning_rate": 7.906097279776876e-05, + "loss": 1.8785, + "step": 10541 + }, + { + "epoch": 3.2357274401473295, + "grad_norm": 0.254802942276001, + "learning_rate": 7.905692787672341e-05, + "loss": 1.8276, + "step": 10542 + }, + { + "epoch": 3.236034376918355, + "grad_norm": 0.3362341523170471, + "learning_rate": 7.905288266852047e-05, + "loss": 1.8057, + "step": 10543 + }, + { + "epoch": 3.23634131368938, + "grad_norm": 0.38821661472320557, + "learning_rate": 7.904883717319988e-05, + "loss": 1.7841, + "step": 10544 + }, + { + "epoch": 3.236648250460405, + "grad_norm": 0.33889076113700867, + "learning_rate": 7.90447913908017e-05, + "loss": 1.7892, + "step": 10545 + }, + { + "epoch": 3.2369551872314304, + "grad_norm": 0.2741014361381531, + "learning_rate": 7.904074532136585e-05, + "loss": 1.7611, + "step": 10546 + }, + { + "epoch": 3.2372621240024557, + "grad_norm": 0.28950995206832886, + "learning_rate": 7.903669896493233e-05, + "loss": 1.7963, + "step": 10547 + }, + { + "epoch": 3.2375690607734806, + "grad_norm": 0.30647143721580505, + "learning_rate": 7.903265232154113e-05, + "loss": 1.7522, + "step": 10548 + }, + { + "epoch": 
3.237875997544506, + "grad_norm": 0.30428263545036316, + "learning_rate": 7.902860539123225e-05, + "loss": 1.7383, + "step": 10549 + }, + { + "epoch": 3.238182934315531, + "grad_norm": 0.2357146292924881, + "learning_rate": 7.902455817404569e-05, + "loss": 1.7243, + "step": 10550 + }, + { + "epoch": 3.238489871086556, + "grad_norm": 0.3125104606151581, + "learning_rate": 7.90205106700214e-05, + "loss": 1.8542, + "step": 10551 + }, + { + "epoch": 3.2387968078575815, + "grad_norm": 0.25797244906425476, + "learning_rate": 7.901646287919944e-05, + "loss": 1.8374, + "step": 10552 + }, + { + "epoch": 3.2391037446286064, + "grad_norm": 0.3127591907978058, + "learning_rate": 7.901241480161978e-05, + "loss": 1.9457, + "step": 10553 + }, + { + "epoch": 3.2394106813996317, + "grad_norm": 0.2971835434436798, + "learning_rate": 7.900836643732243e-05, + "loss": 1.7933, + "step": 10554 + }, + { + "epoch": 3.2397176181706566, + "grad_norm": 0.28931814432144165, + "learning_rate": 7.90043177863474e-05, + "loss": 1.8201, + "step": 10555 + }, + { + "epoch": 3.240024554941682, + "grad_norm": 0.3348724842071533, + "learning_rate": 7.90002688487347e-05, + "loss": 1.8718, + "step": 10556 + }, + { + "epoch": 3.2403314917127073, + "grad_norm": 0.28566426038742065, + "learning_rate": 7.899621962452436e-05, + "loss": 1.805, + "step": 10557 + }, + { + "epoch": 3.240638428483732, + "grad_norm": 0.27074119448661804, + "learning_rate": 7.899217011375637e-05, + "loss": 1.842, + "step": 10558 + }, + { + "epoch": 3.2409453652547575, + "grad_norm": 0.27014291286468506, + "learning_rate": 7.898812031647076e-05, + "loss": 1.8156, + "step": 10559 + }, + { + "epoch": 3.241252302025783, + "grad_norm": 0.28087863326072693, + "learning_rate": 7.898407023270756e-05, + "loss": 1.8399, + "step": 10560 + }, + { + "epoch": 3.2415592387968077, + "grad_norm": 0.2641037404537201, + "learning_rate": 7.898001986250679e-05, + "loss": 1.7977, + "step": 10561 + }, + { + "epoch": 3.241866175567833, + "grad_norm": 
0.2843858301639557, + "learning_rate": 7.897596920590848e-05, + "loss": 1.834, + "step": 10562 + }, + { + "epoch": 3.2421731123388584, + "grad_norm": 0.2724611163139343, + "learning_rate": 7.897191826295266e-05, + "loss": 1.7547, + "step": 10563 + }, + { + "epoch": 3.2424800491098833, + "grad_norm": 0.2583858370780945, + "learning_rate": 7.896786703367935e-05, + "loss": 1.7658, + "step": 10564 + }, + { + "epoch": 3.2427869858809086, + "grad_norm": 0.2666650712490082, + "learning_rate": 7.896381551812861e-05, + "loss": 1.8017, + "step": 10565 + }, + { + "epoch": 3.2430939226519335, + "grad_norm": 0.23269347846508026, + "learning_rate": 7.895976371634047e-05, + "loss": 1.8267, + "step": 10566 + }, + { + "epoch": 3.243400859422959, + "grad_norm": 0.27865225076675415, + "learning_rate": 7.895571162835496e-05, + "loss": 1.8093, + "step": 10567 + }, + { + "epoch": 3.243707796193984, + "grad_norm": 0.29445022344589233, + "learning_rate": 7.895165925421216e-05, + "loss": 1.7999, + "step": 10568 + }, + { + "epoch": 3.244014732965009, + "grad_norm": 0.32135528326034546, + "learning_rate": 7.894760659395206e-05, + "loss": 1.8405, + "step": 10569 + }, + { + "epoch": 3.2443216697360344, + "grad_norm": 0.3409091532230377, + "learning_rate": 7.894355364761477e-05, + "loss": 1.7861, + "step": 10570 + }, + { + "epoch": 3.2446286065070598, + "grad_norm": 0.3379025459289551, + "learning_rate": 7.893950041524032e-05, + "loss": 1.8495, + "step": 10571 + }, + { + "epoch": 3.2449355432780846, + "grad_norm": 0.2843063473701477, + "learning_rate": 7.893544689686874e-05, + "loss": 1.7888, + "step": 10572 + }, + { + "epoch": 3.24524248004911, + "grad_norm": 0.2914074957370758, + "learning_rate": 7.893139309254013e-05, + "loss": 1.7866, + "step": 10573 + }, + { + "epoch": 3.245549416820135, + "grad_norm": 0.39855021238327026, + "learning_rate": 7.892733900229454e-05, + "loss": 1.7865, + "step": 10574 + }, + { + "epoch": 3.24585635359116, + "grad_norm": 0.4232102632522583, + "learning_rate": 
7.892328462617203e-05, + "loss": 1.8443, + "step": 10575 + }, + { + "epoch": 3.2461632903621855, + "grad_norm": 0.390794962644577, + "learning_rate": 7.891922996421267e-05, + "loss": 1.8735, + "step": 10576 + }, + { + "epoch": 3.2464702271332104, + "grad_norm": 0.3051595687866211, + "learning_rate": 7.891517501645653e-05, + "loss": 1.8654, + "step": 10577 + }, + { + "epoch": 3.2467771639042358, + "grad_norm": 0.25363096594810486, + "learning_rate": 7.891111978294367e-05, + "loss": 1.7602, + "step": 10578 + }, + { + "epoch": 3.247084100675261, + "grad_norm": 0.29785794019699097, + "learning_rate": 7.890706426371419e-05, + "loss": 1.8242, + "step": 10579 + }, + { + "epoch": 3.247391037446286, + "grad_norm": 0.346162885427475, + "learning_rate": 7.890300845880816e-05, + "loss": 1.8551, + "step": 10580 + }, + { + "epoch": 3.2476979742173113, + "grad_norm": 0.33906155824661255, + "learning_rate": 7.889895236826566e-05, + "loss": 1.765, + "step": 10581 + }, + { + "epoch": 3.248004910988336, + "grad_norm": 0.26083165407180786, + "learning_rate": 7.889489599212676e-05, + "loss": 1.8246, + "step": 10582 + }, + { + "epoch": 3.2483118477593615, + "grad_norm": 0.3042019009590149, + "learning_rate": 7.889083933043157e-05, + "loss": 1.9017, + "step": 10583 + }, + { + "epoch": 3.248618784530387, + "grad_norm": 0.34833577275276184, + "learning_rate": 7.888678238322018e-05, + "loss": 1.7863, + "step": 10584 + }, + { + "epoch": 3.2489257213014118, + "grad_norm": 0.34436655044555664, + "learning_rate": 7.888272515053267e-05, + "loss": 1.7937, + "step": 10585 + }, + { + "epoch": 3.249232658072437, + "grad_norm": 0.2550172507762909, + "learning_rate": 7.887866763240914e-05, + "loss": 1.7615, + "step": 10586 + }, + { + "epoch": 3.2495395948434624, + "grad_norm": 0.3334405720233917, + "learning_rate": 7.88746098288897e-05, + "loss": 1.7465, + "step": 10587 + }, + { + "epoch": 3.2498465316144873, + "grad_norm": 0.4668157696723938, + "learning_rate": 7.887055174001443e-05, + "loss": 
1.7836, + "step": 10588 + }, + { + "epoch": 3.2501534683855127, + "grad_norm": 0.524680495262146, + "learning_rate": 7.886649336582344e-05, + "loss": 1.844, + "step": 10589 + }, + { + "epoch": 3.250460405156538, + "grad_norm": 0.36859074234962463, + "learning_rate": 7.886243470635685e-05, + "loss": 1.8072, + "step": 10590 + }, + { + "epoch": 3.250767341927563, + "grad_norm": 0.32370296120643616, + "learning_rate": 7.885837576165478e-05, + "loss": 1.802, + "step": 10591 + }, + { + "epoch": 3.2510742786985882, + "grad_norm": 0.3506374955177307, + "learning_rate": 7.88543165317573e-05, + "loss": 1.7965, + "step": 10592 + }, + { + "epoch": 3.251381215469613, + "grad_norm": 0.39058688282966614, + "learning_rate": 7.885025701670457e-05, + "loss": 1.7987, + "step": 10593 + }, + { + "epoch": 3.2516881522406385, + "grad_norm": 0.3042154014110565, + "learning_rate": 7.884619721653669e-05, + "loss": 1.8345, + "step": 10594 + }, + { + "epoch": 3.251995089011664, + "grad_norm": 0.2249498963356018, + "learning_rate": 7.884213713129378e-05, + "loss": 1.7796, + "step": 10595 + }, + { + "epoch": 3.2523020257826887, + "grad_norm": 0.2701997458934784, + "learning_rate": 7.883807676101595e-05, + "loss": 1.8027, + "step": 10596 + }, + { + "epoch": 3.252608962553714, + "grad_norm": 0.2574785053730011, + "learning_rate": 7.883401610574336e-05, + "loss": 1.7878, + "step": 10597 + }, + { + "epoch": 3.252915899324739, + "grad_norm": 0.24964739382266998, + "learning_rate": 7.882995516551613e-05, + "loss": 1.7612, + "step": 10598 + }, + { + "epoch": 3.2532228360957642, + "grad_norm": 0.2519865930080414, + "learning_rate": 7.882589394037437e-05, + "loss": 1.7583, + "step": 10599 + }, + { + "epoch": 3.2535297728667896, + "grad_norm": 0.23174463212490082, + "learning_rate": 7.882183243035823e-05, + "loss": 1.7607, + "step": 10600 + }, + { + "epoch": 3.2538367096378145, + "grad_norm": 0.28103554248809814, + "learning_rate": 7.881777063550786e-05, + "loss": 1.904, + "step": 10601 + }, + { + 
"epoch": 3.25414364640884, + "grad_norm": 0.265677809715271, + "learning_rate": 7.881370855586339e-05, + "loss": 1.8169, + "step": 10602 + }, + { + "epoch": 3.254450583179865, + "grad_norm": 0.2539603114128113, + "learning_rate": 7.880964619146493e-05, + "loss": 1.8439, + "step": 10603 + }, + { + "epoch": 3.25475751995089, + "grad_norm": 0.2741886377334595, + "learning_rate": 7.88055835423527e-05, + "loss": 1.8737, + "step": 10604 + }, + { + "epoch": 3.2550644567219154, + "grad_norm": 0.27548348903656006, + "learning_rate": 7.88015206085668e-05, + "loss": 1.8385, + "step": 10605 + }, + { + "epoch": 3.2553713934929407, + "grad_norm": 0.2958502769470215, + "learning_rate": 7.879745739014739e-05, + "loss": 1.8603, + "step": 10606 + }, + { + "epoch": 3.2556783302639656, + "grad_norm": 0.2728644907474518, + "learning_rate": 7.879339388713462e-05, + "loss": 1.8, + "step": 10607 + }, + { + "epoch": 3.255985267034991, + "grad_norm": 0.28718289732933044, + "learning_rate": 7.878933009956866e-05, + "loss": 1.7803, + "step": 10608 + }, + { + "epoch": 3.256292203806016, + "grad_norm": 0.2989691197872162, + "learning_rate": 7.878526602748967e-05, + "loss": 1.8155, + "step": 10609 + }, + { + "epoch": 3.256599140577041, + "grad_norm": 0.24515527486801147, + "learning_rate": 7.87812016709378e-05, + "loss": 1.7623, + "step": 10610 + }, + { + "epoch": 3.2569060773480665, + "grad_norm": 0.29946041107177734, + "learning_rate": 7.877713702995324e-05, + "loss": 1.8097, + "step": 10611 + }, + { + "epoch": 3.2572130141190914, + "grad_norm": 0.2854483723640442, + "learning_rate": 7.877307210457613e-05, + "loss": 1.8088, + "step": 10612 + }, + { + "epoch": 3.2575199508901167, + "grad_norm": 0.27812930941581726, + "learning_rate": 7.876900689484668e-05, + "loss": 1.8151, + "step": 10613 + }, + { + "epoch": 3.2578268876611416, + "grad_norm": 0.2658015787601471, + "learning_rate": 7.876494140080503e-05, + "loss": 1.8314, + "step": 10614 + }, + { + "epoch": 3.258133824432167, + "grad_norm": 
0.28935661911964417, + "learning_rate": 7.876087562249137e-05, + "loss": 1.7948, + "step": 10615 + }, + { + "epoch": 3.2584407612031923, + "grad_norm": 0.27497121691703796, + "learning_rate": 7.875680955994587e-05, + "loss": 1.7964, + "step": 10616 + }, + { + "epoch": 3.258747697974217, + "grad_norm": 0.3313405513763428, + "learning_rate": 7.875274321320873e-05, + "loss": 1.8143, + "step": 10617 + }, + { + "epoch": 3.2590546347452425, + "grad_norm": 0.3217218816280365, + "learning_rate": 7.874867658232013e-05, + "loss": 1.7749, + "step": 10618 + }, + { + "epoch": 3.259361571516268, + "grad_norm": 0.25105544924736023, + "learning_rate": 7.874460966732025e-05, + "loss": 1.7834, + "step": 10619 + }, + { + "epoch": 3.2596685082872927, + "grad_norm": 0.2931382358074188, + "learning_rate": 7.874054246824931e-05, + "loss": 1.8252, + "step": 10620 + }, + { + "epoch": 3.259975445058318, + "grad_norm": 0.2803363502025604, + "learning_rate": 7.873647498514747e-05, + "loss": 1.7527, + "step": 10621 + }, + { + "epoch": 3.2602823818293434, + "grad_norm": 0.29857927560806274, + "learning_rate": 7.873240721805492e-05, + "loss": 1.8085, + "step": 10622 + }, + { + "epoch": 3.2605893186003683, + "grad_norm": 0.24864110350608826, + "learning_rate": 7.872833916701192e-05, + "loss": 1.7509, + "step": 10623 + }, + { + "epoch": 3.2608962553713936, + "grad_norm": 0.24105949699878693, + "learning_rate": 7.872427083205862e-05, + "loss": 1.7871, + "step": 10624 + }, + { + "epoch": 3.2612031921424185, + "grad_norm": 0.2429245114326477, + "learning_rate": 7.872020221323523e-05, + "loss": 1.777, + "step": 10625 + }, + { + "epoch": 3.261510128913444, + "grad_norm": 0.234287828207016, + "learning_rate": 7.871613331058197e-05, + "loss": 1.8001, + "step": 10626 + }, + { + "epoch": 3.261817065684469, + "grad_norm": 0.3463406264781952, + "learning_rate": 7.871206412413905e-05, + "loss": 1.8925, + "step": 10627 + }, + { + "epoch": 3.262124002455494, + "grad_norm": 0.26798921823501587, + 
"learning_rate": 7.87079946539467e-05, + "loss": 1.7963, + "step": 10628 + }, + { + "epoch": 3.2624309392265194, + "grad_norm": 0.28603312373161316, + "learning_rate": 7.87039249000451e-05, + "loss": 1.8308, + "step": 10629 + }, + { + "epoch": 3.2627378759975443, + "grad_norm": 0.2717527747154236, + "learning_rate": 7.86998548624745e-05, + "loss": 1.8246, + "step": 10630 + }, + { + "epoch": 3.2630448127685696, + "grad_norm": 0.32215580344200134, + "learning_rate": 7.86957845412751e-05, + "loss": 1.7278, + "step": 10631 + }, + { + "epoch": 3.263351749539595, + "grad_norm": 0.3578735589981079, + "learning_rate": 7.869171393648717e-05, + "loss": 1.7288, + "step": 10632 + }, + { + "epoch": 3.26365868631062, + "grad_norm": 0.3120707869529724, + "learning_rate": 7.868764304815089e-05, + "loss": 1.7971, + "step": 10633 + }, + { + "epoch": 3.263965623081645, + "grad_norm": 0.27419236302375793, + "learning_rate": 7.86835718763065e-05, + "loss": 1.8529, + "step": 10634 + }, + { + "epoch": 3.2642725598526705, + "grad_norm": 0.3200531601905823, + "learning_rate": 7.867950042099423e-05, + "loss": 1.7892, + "step": 10635 + }, + { + "epoch": 3.2645794966236954, + "grad_norm": 0.325706422328949, + "learning_rate": 7.867542868225435e-05, + "loss": 1.8236, + "step": 10636 + }, + { + "epoch": 3.2648864333947207, + "grad_norm": 0.2950136065483093, + "learning_rate": 7.867135666012707e-05, + "loss": 1.8163, + "step": 10637 + }, + { + "epoch": 3.265193370165746, + "grad_norm": 0.2772117257118225, + "learning_rate": 7.866728435465263e-05, + "loss": 1.8373, + "step": 10638 + }, + { + "epoch": 3.265500306936771, + "grad_norm": 0.2887401580810547, + "learning_rate": 7.866321176587129e-05, + "loss": 1.7756, + "step": 10639 + }, + { + "epoch": 3.2658072437077963, + "grad_norm": 0.3474489152431488, + "learning_rate": 7.865913889382329e-05, + "loss": 1.7539, + "step": 10640 + }, + { + "epoch": 3.266114180478821, + "grad_norm": 0.3433493971824646, + "learning_rate": 7.865506573854888e-05, + 
"loss": 1.7987, + "step": 10641 + }, + { + "epoch": 3.2664211172498465, + "grad_norm": 0.3075394630432129, + "learning_rate": 7.865099230008832e-05, + "loss": 1.7907, + "step": 10642 + }, + { + "epoch": 3.266728054020872, + "grad_norm": 0.24817697703838348, + "learning_rate": 7.864691857848187e-05, + "loss": 1.7941, + "step": 10643 + }, + { + "epoch": 3.2670349907918967, + "grad_norm": 0.290147602558136, + "learning_rate": 7.864284457376976e-05, + "loss": 1.9125, + "step": 10644 + }, + { + "epoch": 3.267341927562922, + "grad_norm": 0.253684937953949, + "learning_rate": 7.863877028599229e-05, + "loss": 1.8084, + "step": 10645 + }, + { + "epoch": 3.267648864333947, + "grad_norm": 0.26349252462387085, + "learning_rate": 7.863469571518969e-05, + "loss": 1.7548, + "step": 10646 + }, + { + "epoch": 3.2679558011049723, + "grad_norm": 0.30568864941596985, + "learning_rate": 7.863062086140224e-05, + "loss": 1.8551, + "step": 10647 + }, + { + "epoch": 3.2682627378759976, + "grad_norm": 0.2866690456867218, + "learning_rate": 7.862654572467024e-05, + "loss": 1.8145, + "step": 10648 + }, + { + "epoch": 3.2685696746470225, + "grad_norm": 0.32022854685783386, + "learning_rate": 7.862247030503391e-05, + "loss": 1.896, + "step": 10649 + }, + { + "epoch": 3.268876611418048, + "grad_norm": 0.25260284543037415, + "learning_rate": 7.861839460253356e-05, + "loss": 1.814, + "step": 10650 + }, + { + "epoch": 3.269183548189073, + "grad_norm": 0.26776066422462463, + "learning_rate": 7.861431861720947e-05, + "loss": 1.7755, + "step": 10651 + }, + { + "epoch": 3.269490484960098, + "grad_norm": 0.26514193415641785, + "learning_rate": 7.861024234910191e-05, + "loss": 1.7606, + "step": 10652 + }, + { + "epoch": 3.2697974217311234, + "grad_norm": 0.27213940024375916, + "learning_rate": 7.860616579825116e-05, + "loss": 1.8074, + "step": 10653 + }, + { + "epoch": 3.2701043585021488, + "grad_norm": 0.29192888736724854, + "learning_rate": 7.860208896469752e-05, + "loss": 1.8436, + "step": 10654 + }, 
+ { + "epoch": 3.2704112952731736, + "grad_norm": 0.3772370219230652, + "learning_rate": 7.859801184848127e-05, + "loss": 1.8096, + "step": 10655 + }, + { + "epoch": 3.270718232044199, + "grad_norm": 0.4574970006942749, + "learning_rate": 7.859393444964269e-05, + "loss": 1.7612, + "step": 10656 + }, + { + "epoch": 3.271025168815224, + "grad_norm": 0.4614393413066864, + "learning_rate": 7.858985676822211e-05, + "loss": 1.8529, + "step": 10657 + }, + { + "epoch": 3.271332105586249, + "grad_norm": 0.33567267656326294, + "learning_rate": 7.85857788042598e-05, + "loss": 1.8391, + "step": 10658 + }, + { + "epoch": 3.2716390423572745, + "grad_norm": 0.2564064860343933, + "learning_rate": 7.858170055779609e-05, + "loss": 1.7621, + "step": 10659 + }, + { + "epoch": 3.2719459791282994, + "grad_norm": 0.26769882440567017, + "learning_rate": 7.857762202887122e-05, + "loss": 1.8145, + "step": 10660 + }, + { + "epoch": 3.2722529158993248, + "grad_norm": 0.262008935213089, + "learning_rate": 7.857354321752558e-05, + "loss": 1.7513, + "step": 10661 + }, + { + "epoch": 3.27255985267035, + "grad_norm": 0.26494377851486206, + "learning_rate": 7.856946412379942e-05, + "loss": 1.8071, + "step": 10662 + }, + { + "epoch": 3.272866789441375, + "grad_norm": 0.25613999366760254, + "learning_rate": 7.856538474773307e-05, + "loss": 1.8775, + "step": 10663 + }, + { + "epoch": 3.2731737262124003, + "grad_norm": 0.24789929389953613, + "learning_rate": 7.856130508936684e-05, + "loss": 1.8055, + "step": 10664 + }, + { + "epoch": 3.2734806629834257, + "grad_norm": 0.29111939668655396, + "learning_rate": 7.855722514874107e-05, + "loss": 1.8114, + "step": 10665 + }, + { + "epoch": 3.2737875997544506, + "grad_norm": 0.30511030554771423, + "learning_rate": 7.855314492589605e-05, + "loss": 1.8131, + "step": 10666 + }, + { + "epoch": 3.274094536525476, + "grad_norm": 0.2545989453792572, + "learning_rate": 7.854906442087212e-05, + "loss": 1.7933, + "step": 10667 + }, + { + "epoch": 3.2744014732965008, + 
"grad_norm": 0.26684823632240295, + "learning_rate": 7.85449836337096e-05, + "loss": 1.7604, + "step": 10668 + }, + { + "epoch": 3.274708410067526, + "grad_norm": 0.5097808837890625, + "learning_rate": 7.854090256444881e-05, + "loss": 1.777, + "step": 10669 + }, + { + "epoch": 3.2750153468385514, + "grad_norm": 0.27828142046928406, + "learning_rate": 7.853682121313011e-05, + "loss": 1.7885, + "step": 10670 + }, + { + "epoch": 3.2753222836095763, + "grad_norm": 0.2925552725791931, + "learning_rate": 7.853273957979381e-05, + "loss": 1.7962, + "step": 10671 + }, + { + "epoch": 3.2756292203806017, + "grad_norm": 0.284574955701828, + "learning_rate": 7.852865766448025e-05, + "loss": 1.8645, + "step": 10672 + }, + { + "epoch": 3.2759361571516266, + "grad_norm": 0.23407664895057678, + "learning_rate": 7.85245754672298e-05, + "loss": 1.7106, + "step": 10673 + }, + { + "epoch": 3.276243093922652, + "grad_norm": 0.2555919885635376, + "learning_rate": 7.852049298808274e-05, + "loss": 1.8237, + "step": 10674 + }, + { + "epoch": 3.2765500306936772, + "grad_norm": 0.26703694462776184, + "learning_rate": 7.851641022707947e-05, + "loss": 1.7844, + "step": 10675 + }, + { + "epoch": 3.276856967464702, + "grad_norm": 0.24889135360717773, + "learning_rate": 7.851232718426033e-05, + "loss": 1.7783, + "step": 10676 + }, + { + "epoch": 3.2771639042357275, + "grad_norm": 0.25770726799964905, + "learning_rate": 7.850824385966564e-05, + "loss": 1.8007, + "step": 10677 + }, + { + "epoch": 3.277470841006753, + "grad_norm": 0.31806984543800354, + "learning_rate": 7.850416025333578e-05, + "loss": 1.8623, + "step": 10678 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.2906930148601532, + "learning_rate": 7.850007636531111e-05, + "loss": 1.8315, + "step": 10679 + }, + { + "epoch": 3.278084714548803, + "grad_norm": 0.2802525460720062, + "learning_rate": 7.849599219563197e-05, + "loss": 1.8488, + "step": 10680 + }, + { + "epoch": 3.2783916513198283, + "grad_norm": 0.26150405406951904, + 
"learning_rate": 7.849190774433874e-05, + "loss": 1.7967, + "step": 10681 + }, + { + "epoch": 3.2786985880908532, + "grad_norm": 0.25863370299339294, + "learning_rate": 7.848782301147178e-05, + "loss": 1.864, + "step": 10682 + }, + { + "epoch": 3.2790055248618786, + "grad_norm": 0.25381043553352356, + "learning_rate": 7.848373799707145e-05, + "loss": 1.8239, + "step": 10683 + }, + { + "epoch": 3.2793124616329035, + "grad_norm": 0.2583387792110443, + "learning_rate": 7.847965270117814e-05, + "loss": 1.8449, + "step": 10684 + }, + { + "epoch": 3.279619398403929, + "grad_norm": 0.30759841203689575, + "learning_rate": 7.84755671238322e-05, + "loss": 1.7992, + "step": 10685 + }, + { + "epoch": 3.279926335174954, + "grad_norm": 0.4316023588180542, + "learning_rate": 7.847148126507402e-05, + "loss": 1.7912, + "step": 10686 + }, + { + "epoch": 3.280233271945979, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.846739512494396e-05, + "loss": 1.8831, + "step": 10687 + }, + { + "epoch": 3.2805402087170044, + "grad_norm": 0.318934828042984, + "learning_rate": 7.846330870348244e-05, + "loss": 1.8411, + "step": 10688 + }, + { + "epoch": 3.2808471454880292, + "grad_norm": 0.27755632996559143, + "learning_rate": 7.84592220007298e-05, + "loss": 1.8763, + "step": 10689 + }, + { + "epoch": 3.2811540822590546, + "grad_norm": 0.33544883131980896, + "learning_rate": 7.845513501672646e-05, + "loss": 1.731, + "step": 10690 + }, + { + "epoch": 3.28146101903008, + "grad_norm": 0.28299057483673096, + "learning_rate": 7.845104775151278e-05, + "loss": 1.813, + "step": 10691 + }, + { + "epoch": 3.281767955801105, + "grad_norm": 0.2761382460594177, + "learning_rate": 7.844696020512918e-05, + "loss": 1.8018, + "step": 10692 + }, + { + "epoch": 3.28207489257213, + "grad_norm": 0.2919033169746399, + "learning_rate": 7.844287237761605e-05, + "loss": 1.793, + "step": 10693 + }, + { + "epoch": 3.2823818293431555, + "grad_norm": 0.32922014594078064, + "learning_rate": 7.843878426901378e-05, + 
"loss": 1.8186, + "step": 10694 + }, + { + "epoch": 3.2826887661141804, + "grad_norm": 0.2818562090396881, + "learning_rate": 7.843469587936279e-05, + "loss": 1.7794, + "step": 10695 + }, + { + "epoch": 3.2829957028852057, + "grad_norm": 0.26414254307746887, + "learning_rate": 7.843060720870345e-05, + "loss": 1.7854, + "step": 10696 + }, + { + "epoch": 3.283302639656231, + "grad_norm": 0.28345760703086853, + "learning_rate": 7.842651825707618e-05, + "loss": 1.7659, + "step": 10697 + }, + { + "epoch": 3.283609576427256, + "grad_norm": 0.3522340655326843, + "learning_rate": 7.842242902452141e-05, + "loss": 1.8427, + "step": 10698 + }, + { + "epoch": 3.2839165131982813, + "grad_norm": 0.2861590087413788, + "learning_rate": 7.841833951107954e-05, + "loss": 1.7539, + "step": 10699 + }, + { + "epoch": 3.284223449969306, + "grad_norm": 0.2596624493598938, + "learning_rate": 7.841424971679099e-05, + "loss": 1.8407, + "step": 10700 + }, + { + "epoch": 3.2845303867403315, + "grad_norm": 0.2847718298435211, + "learning_rate": 7.841015964169616e-05, + "loss": 1.8085, + "step": 10701 + }, + { + "epoch": 3.284837323511357, + "grad_norm": 0.29566115140914917, + "learning_rate": 7.840606928583547e-05, + "loss": 1.7873, + "step": 10702 + }, + { + "epoch": 3.2851442602823817, + "grad_norm": 0.2752111256122589, + "learning_rate": 7.840197864924936e-05, + "loss": 1.8186, + "step": 10703 + }, + { + "epoch": 3.285451197053407, + "grad_norm": 0.2907958924770355, + "learning_rate": 7.839788773197826e-05, + "loss": 1.8081, + "step": 10704 + }, + { + "epoch": 3.285758133824432, + "grad_norm": 0.25808724761009216, + "learning_rate": 7.839379653406258e-05, + "loss": 1.7635, + "step": 10705 + }, + { + "epoch": 3.2860650705954573, + "grad_norm": 0.2732730507850647, + "learning_rate": 7.838970505554277e-05, + "loss": 1.8061, + "step": 10706 + }, + { + "epoch": 3.2863720073664826, + "grad_norm": 0.23820067942142487, + "learning_rate": 7.838561329645923e-05, + "loss": 1.8091, + "step": 10707 + }, 
+ { + "epoch": 3.2866789441375075, + "grad_norm": 0.24179396033287048, + "learning_rate": 7.838152125685245e-05, + "loss": 1.7513, + "step": 10708 + }, + { + "epoch": 3.286985880908533, + "grad_norm": 0.2627546787261963, + "learning_rate": 7.837742893676283e-05, + "loss": 1.8741, + "step": 10709 + }, + { + "epoch": 3.287292817679558, + "grad_norm": 0.2827817499637604, + "learning_rate": 7.837333633623083e-05, + "loss": 1.8387, + "step": 10710 + }, + { + "epoch": 3.287599754450583, + "grad_norm": 0.2666749060153961, + "learning_rate": 7.836924345529688e-05, + "loss": 1.8319, + "step": 10711 + }, + { + "epoch": 3.2879066912216084, + "grad_norm": 0.3403390944004059, + "learning_rate": 7.836515029400145e-05, + "loss": 1.7827, + "step": 10712 + }, + { + "epoch": 3.2882136279926337, + "grad_norm": 0.30646705627441406, + "learning_rate": 7.836105685238497e-05, + "loss": 1.8612, + "step": 10713 + }, + { + "epoch": 3.2885205647636586, + "grad_norm": 0.2580253481864929, + "learning_rate": 7.83569631304879e-05, + "loss": 1.7332, + "step": 10714 + }, + { + "epoch": 3.288827501534684, + "grad_norm": 0.23734542727470398, + "learning_rate": 7.835286912835071e-05, + "loss": 1.7899, + "step": 10715 + }, + { + "epoch": 3.289134438305709, + "grad_norm": 0.2457810491323471, + "learning_rate": 7.834877484601384e-05, + "loss": 1.8059, + "step": 10716 + }, + { + "epoch": 3.289441375076734, + "grad_norm": 0.2558443248271942, + "learning_rate": 7.834468028351778e-05, + "loss": 1.8689, + "step": 10717 + }, + { + "epoch": 3.2897483118477595, + "grad_norm": 0.26596710085868835, + "learning_rate": 7.834058544090298e-05, + "loss": 1.816, + "step": 10718 + }, + { + "epoch": 3.2900552486187844, + "grad_norm": 0.25424903631210327, + "learning_rate": 7.833649031820987e-05, + "loss": 1.7907, + "step": 10719 + }, + { + "epoch": 3.2903621853898097, + "grad_norm": 0.23873139917850494, + "learning_rate": 7.833239491547896e-05, + "loss": 1.7666, + "step": 10720 + }, + { + "epoch": 3.2906691221608346, + 
"grad_norm": 0.23292972147464752, + "learning_rate": 7.832829923275073e-05, + "loss": 1.7674, + "step": 10721 + }, + { + "epoch": 3.29097605893186, + "grad_norm": 0.30133312940597534, + "learning_rate": 7.832420327006566e-05, + "loss": 1.8229, + "step": 10722 + }, + { + "epoch": 3.2912829957028853, + "grad_norm": 0.2882522642612457, + "learning_rate": 7.83201070274642e-05, + "loss": 1.7855, + "step": 10723 + }, + { + "epoch": 3.29158993247391, + "grad_norm": 0.2578088045120239, + "learning_rate": 7.831601050498683e-05, + "loss": 1.7276, + "step": 10724 + }, + { + "epoch": 3.2918968692449355, + "grad_norm": 0.29511600732803345, + "learning_rate": 7.831191370267406e-05, + "loss": 1.8085, + "step": 10725 + }, + { + "epoch": 3.292203806015961, + "grad_norm": 0.29557499289512634, + "learning_rate": 7.830781662056634e-05, + "loss": 1.815, + "step": 10726 + }, + { + "epoch": 3.2925107427869857, + "grad_norm": 0.32722121477127075, + "learning_rate": 7.830371925870422e-05, + "loss": 1.7889, + "step": 10727 + }, + { + "epoch": 3.292817679558011, + "grad_norm": 0.3124488592147827, + "learning_rate": 7.829962161712814e-05, + "loss": 1.8063, + "step": 10728 + }, + { + "epoch": 3.2931246163290364, + "grad_norm": 0.311334490776062, + "learning_rate": 7.829552369587861e-05, + "loss": 1.8852, + "step": 10729 + }, + { + "epoch": 3.2934315531000613, + "grad_norm": 0.28010860085487366, + "learning_rate": 7.829142549499613e-05, + "loss": 1.8274, + "step": 10730 + }, + { + "epoch": 3.2937384898710866, + "grad_norm": 0.3453529477119446, + "learning_rate": 7.828732701452119e-05, + "loss": 1.8618, + "step": 10731 + }, + { + "epoch": 3.2940454266421115, + "grad_norm": 0.2946802079677582, + "learning_rate": 7.828322825449432e-05, + "loss": 1.7123, + "step": 10732 + }, + { + "epoch": 3.294352363413137, + "grad_norm": 0.2467648684978485, + "learning_rate": 7.827912921495601e-05, + "loss": 1.7786, + "step": 10733 + }, + { + "epoch": 3.294659300184162, + "grad_norm": 0.2957034707069397, + 
"learning_rate": 7.827502989594677e-05, + "loss": 1.7817, + "step": 10734 + }, + { + "epoch": 3.294966236955187, + "grad_norm": 0.300905704498291, + "learning_rate": 7.827093029750713e-05, + "loss": 1.7582, + "step": 10735 + }, + { + "epoch": 3.2952731737262124, + "grad_norm": 0.28935131430625916, + "learning_rate": 7.826683041967757e-05, + "loss": 1.7766, + "step": 10736 + }, + { + "epoch": 3.2955801104972378, + "grad_norm": 0.26046010851860046, + "learning_rate": 7.826273026249861e-05, + "loss": 1.8152, + "step": 10737 + }, + { + "epoch": 3.2958870472682626, + "grad_norm": 0.24247924983501434, + "learning_rate": 7.82586298260108e-05, + "loss": 1.8679, + "step": 10738 + }, + { + "epoch": 3.296193984039288, + "grad_norm": 0.25977620482444763, + "learning_rate": 7.825452911025466e-05, + "loss": 1.8108, + "step": 10739 + }, + { + "epoch": 3.2965009208103133, + "grad_norm": 0.2732592821121216, + "learning_rate": 7.825042811527068e-05, + "loss": 1.7355, + "step": 10740 + }, + { + "epoch": 3.296807857581338, + "grad_norm": 0.38407859206199646, + "learning_rate": 7.824632684109941e-05, + "loss": 1.8418, + "step": 10741 + }, + { + "epoch": 3.2971147943523635, + "grad_norm": 0.4239252805709839, + "learning_rate": 7.82422252877814e-05, + "loss": 1.7655, + "step": 10742 + }, + { + "epoch": 3.2974217311233884, + "grad_norm": 0.3810526132583618, + "learning_rate": 7.823812345535716e-05, + "loss": 1.8804, + "step": 10743 + }, + { + "epoch": 3.2977286678944138, + "grad_norm": 0.29939520359039307, + "learning_rate": 7.823402134386722e-05, + "loss": 1.8207, + "step": 10744 + }, + { + "epoch": 3.298035604665439, + "grad_norm": 0.4053972065448761, + "learning_rate": 7.822991895335215e-05, + "loss": 1.7901, + "step": 10745 + }, + { + "epoch": 3.298342541436464, + "grad_norm": 0.4975005090236664, + "learning_rate": 7.822581628385247e-05, + "loss": 1.8344, + "step": 10746 + }, + { + "epoch": 3.2986494782074893, + "grad_norm": 0.4100436270236969, + "learning_rate": 
7.822171333540874e-05, + "loss": 1.7891, + "step": 10747 + }, + { + "epoch": 3.298956414978514, + "grad_norm": 0.2817644476890564, + "learning_rate": 7.821761010806147e-05, + "loss": 1.7895, + "step": 10748 + }, + { + "epoch": 3.2992633517495396, + "grad_norm": 0.332660973072052, + "learning_rate": 7.821350660185125e-05, + "loss": 1.7281, + "step": 10749 + }, + { + "epoch": 3.299570288520565, + "grad_norm": 0.42652732133865356, + "learning_rate": 7.820940281681863e-05, + "loss": 1.7855, + "step": 10750 + }, + { + "epoch": 3.2998772252915898, + "grad_norm": 0.35700714588165283, + "learning_rate": 7.820529875300415e-05, + "loss": 1.8722, + "step": 10751 + }, + { + "epoch": 3.300184162062615, + "grad_norm": 0.25305211544036865, + "learning_rate": 7.820119441044838e-05, + "loss": 1.7696, + "step": 10752 + }, + { + "epoch": 3.3004910988336404, + "grad_norm": 0.280205637216568, + "learning_rate": 7.819708978919188e-05, + "loss": 1.756, + "step": 10753 + }, + { + "epoch": 3.3007980356046653, + "grad_norm": 0.4176226854324341, + "learning_rate": 7.819298488927521e-05, + "loss": 1.7731, + "step": 10754 + }, + { + "epoch": 3.3011049723756907, + "grad_norm": 0.4264865517616272, + "learning_rate": 7.818887971073894e-05, + "loss": 1.7851, + "step": 10755 + }, + { + "epoch": 3.301411909146716, + "grad_norm": 0.2901221215724945, + "learning_rate": 7.818477425362363e-05, + "loss": 1.7356, + "step": 10756 + }, + { + "epoch": 3.301718845917741, + "grad_norm": 0.29583361744880676, + "learning_rate": 7.818066851796986e-05, + "loss": 1.8269, + "step": 10757 + }, + { + "epoch": 3.3020257826887662, + "grad_norm": 0.38592997193336487, + "learning_rate": 7.817656250381821e-05, + "loss": 1.7515, + "step": 10758 + }, + { + "epoch": 3.302332719459791, + "grad_norm": 0.29301533102989197, + "learning_rate": 7.817245621120927e-05, + "loss": 1.7955, + "step": 10759 + }, + { + "epoch": 3.3026396562308165, + "grad_norm": 0.2770880162715912, + "learning_rate": 7.816834964018359e-05, + "loss": 
1.7899, + "step": 10760 + }, + { + "epoch": 3.302946593001842, + "grad_norm": 0.32566413283348083, + "learning_rate": 7.816424279078176e-05, + "loss": 1.74, + "step": 10761 + }, + { + "epoch": 3.3032535297728667, + "grad_norm": 0.3077750504016876, + "learning_rate": 7.81601356630444e-05, + "loss": 1.8123, + "step": 10762 + }, + { + "epoch": 3.303560466543892, + "grad_norm": 0.2826370298862457, + "learning_rate": 7.815602825701206e-05, + "loss": 1.865, + "step": 10763 + }, + { + "epoch": 3.303867403314917, + "grad_norm": 0.31700822710990906, + "learning_rate": 7.815192057272534e-05, + "loss": 1.8021, + "step": 10764 + }, + { + "epoch": 3.3041743400859422, + "grad_norm": 0.33182790875434875, + "learning_rate": 7.814781261022486e-05, + "loss": 1.818, + "step": 10765 + }, + { + "epoch": 3.3044812768569676, + "grad_norm": 0.2720039486885071, + "learning_rate": 7.814370436955118e-05, + "loss": 1.8369, + "step": 10766 + }, + { + "epoch": 3.3047882136279925, + "grad_norm": 0.28134068846702576, + "learning_rate": 7.813959585074493e-05, + "loss": 1.8391, + "step": 10767 + }, + { + "epoch": 3.305095150399018, + "grad_norm": 0.25748828053474426, + "learning_rate": 7.813548705384667e-05, + "loss": 1.7987, + "step": 10768 + }, + { + "epoch": 3.305402087170043, + "grad_norm": 0.26187625527381897, + "learning_rate": 7.813137797889708e-05, + "loss": 1.7645, + "step": 10769 + }, + { + "epoch": 3.305709023941068, + "grad_norm": 0.297262579202652, + "learning_rate": 7.812726862593671e-05, + "loss": 1.771, + "step": 10770 + }, + { + "epoch": 3.3060159607120934, + "grad_norm": 0.2987872064113617, + "learning_rate": 7.812315899500618e-05, + "loss": 1.8115, + "step": 10771 + }, + { + "epoch": 3.3063228974831187, + "grad_norm": 0.31963878870010376, + "learning_rate": 7.81190490861461e-05, + "loss": 1.7685, + "step": 10772 + }, + { + "epoch": 3.3066298342541436, + "grad_norm": 0.27007177472114563, + "learning_rate": 7.81149388993971e-05, + "loss": 1.8272, + "step": 10773 + }, + { + "epoch": 
3.306936771025169, + "grad_norm": 0.26818498969078064, + "learning_rate": 7.811082843479981e-05, + "loss": 1.7894, + "step": 10774 + }, + { + "epoch": 3.307243707796194, + "grad_norm": 0.28857091069221497, + "learning_rate": 7.810671769239483e-05, + "loss": 1.8769, + "step": 10775 + }, + { + "epoch": 3.307550644567219, + "grad_norm": 0.26983144879341125, + "learning_rate": 7.810260667222277e-05, + "loss": 1.796, + "step": 10776 + }, + { + "epoch": 3.3078575813382445, + "grad_norm": 0.2566467225551605, + "learning_rate": 7.809849537432432e-05, + "loss": 1.848, + "step": 10777 + }, + { + "epoch": 3.3081645181092694, + "grad_norm": 0.25607848167419434, + "learning_rate": 7.809438379874005e-05, + "loss": 1.8072, + "step": 10778 + }, + { + "epoch": 3.3084714548802947, + "grad_norm": 0.29158470034599304, + "learning_rate": 7.809027194551059e-05, + "loss": 1.7772, + "step": 10779 + }, + { + "epoch": 3.3087783916513196, + "grad_norm": 0.360897421836853, + "learning_rate": 7.808615981467664e-05, + "loss": 1.8404, + "step": 10780 + }, + { + "epoch": 3.309085328422345, + "grad_norm": 0.31121253967285156, + "learning_rate": 7.808204740627877e-05, + "loss": 1.8137, + "step": 10781 + }, + { + "epoch": 3.3093922651933703, + "grad_norm": 0.2846451699733734, + "learning_rate": 7.807793472035765e-05, + "loss": 1.8367, + "step": 10782 + }, + { + "epoch": 3.309699201964395, + "grad_norm": 0.2711004316806793, + "learning_rate": 7.807382175695393e-05, + "loss": 1.7728, + "step": 10783 + }, + { + "epoch": 3.3100061387354205, + "grad_norm": 0.2693859338760376, + "learning_rate": 7.806970851610824e-05, + "loss": 1.7026, + "step": 10784 + }, + { + "epoch": 3.310313075506446, + "grad_norm": 0.3050517439842224, + "learning_rate": 7.806559499786125e-05, + "loss": 1.8041, + "step": 10785 + }, + { + "epoch": 3.3106200122774707, + "grad_norm": 0.27304747700691223, + "learning_rate": 7.80614812022536e-05, + "loss": 1.8182, + "step": 10786 + }, + { + "epoch": 3.310926949048496, + "grad_norm": 
0.28378555178642273, + "learning_rate": 7.805736712932594e-05, + "loss": 1.8519, + "step": 10787 + }, + { + "epoch": 3.3112338858195214, + "grad_norm": 0.30620133876800537, + "learning_rate": 7.805325277911892e-05, + "loss": 1.8594, + "step": 10788 + }, + { + "epoch": 3.3115408225905463, + "grad_norm": 0.2580169141292572, + "learning_rate": 7.804913815167325e-05, + "loss": 1.7897, + "step": 10789 + }, + { + "epoch": 3.3118477593615716, + "grad_norm": 0.28937023878097534, + "learning_rate": 7.804502324702951e-05, + "loss": 1.8362, + "step": 10790 + }, + { + "epoch": 3.3121546961325965, + "grad_norm": 0.28032705187797546, + "learning_rate": 7.804090806522844e-05, + "loss": 1.8168, + "step": 10791 + }, + { + "epoch": 3.312461632903622, + "grad_norm": 0.33712559938430786, + "learning_rate": 7.803679260631069e-05, + "loss": 1.7489, + "step": 10792 + }, + { + "epoch": 3.312768569674647, + "grad_norm": 0.40536820888519287, + "learning_rate": 7.80326768703169e-05, + "loss": 1.8413, + "step": 10793 + }, + { + "epoch": 3.313075506445672, + "grad_norm": 0.34967559576034546, + "learning_rate": 7.802856085728778e-05, + "loss": 1.8076, + "step": 10794 + }, + { + "epoch": 3.3133824432166974, + "grad_norm": 0.2429870367050171, + "learning_rate": 7.8024444567264e-05, + "loss": 1.8002, + "step": 10795 + }, + { + "epoch": 3.3136893799877223, + "grad_norm": 0.40956684947013855, + "learning_rate": 7.802032800028621e-05, + "loss": 1.8151, + "step": 10796 + }, + { + "epoch": 3.3139963167587476, + "grad_norm": 0.4908781945705414, + "learning_rate": 7.801621115639512e-05, + "loss": 1.8124, + "step": 10797 + }, + { + "epoch": 3.314303253529773, + "grad_norm": 0.3922197222709656, + "learning_rate": 7.801209403563143e-05, + "loss": 1.7911, + "step": 10798 + }, + { + "epoch": 3.314610190300798, + "grad_norm": 0.29467105865478516, + "learning_rate": 7.800797663803578e-05, + "loss": 1.8472, + "step": 10799 + }, + { + "epoch": 3.314917127071823, + "grad_norm": 0.384974867105484, + 
"learning_rate": 7.800385896364891e-05, + "loss": 1.8139, + "step": 10800 + }, + { + "epoch": 3.3152240638428485, + "grad_norm": 0.4605129063129425, + "learning_rate": 7.79997410125115e-05, + "loss": 1.7982, + "step": 10801 + }, + { + "epoch": 3.3155310006138734, + "grad_norm": 0.2982464134693146, + "learning_rate": 7.799562278466423e-05, + "loss": 1.8496, + "step": 10802 + }, + { + "epoch": 3.3158379373848987, + "grad_norm": 0.3101392984390259, + "learning_rate": 7.79915042801478e-05, + "loss": 1.8172, + "step": 10803 + }, + { + "epoch": 3.316144874155924, + "grad_norm": 0.3651282489299774, + "learning_rate": 7.798738549900292e-05, + "loss": 1.7497, + "step": 10804 + }, + { + "epoch": 3.316451810926949, + "grad_norm": 0.28504419326782227, + "learning_rate": 7.79832664412703e-05, + "loss": 1.8027, + "step": 10805 + }, + { + "epoch": 3.3167587476979743, + "grad_norm": 0.28333309292793274, + "learning_rate": 7.797914710699063e-05, + "loss": 1.8121, + "step": 10806 + }, + { + "epoch": 3.317065684468999, + "grad_norm": 0.37549784779548645, + "learning_rate": 7.797502749620462e-05, + "loss": 1.817, + "step": 10807 + }, + { + "epoch": 3.3173726212400245, + "grad_norm": 0.3864210844039917, + "learning_rate": 7.797090760895301e-05, + "loss": 1.852, + "step": 10808 + }, + { + "epoch": 3.31767955801105, + "grad_norm": 0.2422102987766266, + "learning_rate": 7.79667874452765e-05, + "loss": 1.7523, + "step": 10809 + }, + { + "epoch": 3.3179864947820747, + "grad_norm": 0.307892382144928, + "learning_rate": 7.79626670052158e-05, + "loss": 1.7436, + "step": 10810 + }, + { + "epoch": 3.3182934315531, + "grad_norm": 0.29607462882995605, + "learning_rate": 7.795854628881162e-05, + "loss": 1.768, + "step": 10811 + }, + { + "epoch": 3.3186003683241254, + "grad_norm": 0.23334427177906036, + "learning_rate": 7.795442529610471e-05, + "loss": 1.7687, + "step": 10812 + }, + { + "epoch": 3.3189073050951503, + "grad_norm": 0.26257455348968506, + "learning_rate": 7.795030402713578e-05, + 
"loss": 1.8266, + "step": 10813 + }, + { + "epoch": 3.3192142418661756, + "grad_norm": 0.3252788782119751, + "learning_rate": 7.794618248194556e-05, + "loss": 1.8645, + "step": 10814 + }, + { + "epoch": 3.319521178637201, + "grad_norm": 0.3807232975959778, + "learning_rate": 7.79420606605748e-05, + "loss": 1.8154, + "step": 10815 + }, + { + "epoch": 3.319828115408226, + "grad_norm": 0.3395625948905945, + "learning_rate": 7.793793856306422e-05, + "loss": 1.8002, + "step": 10816 + }, + { + "epoch": 3.320135052179251, + "grad_norm": 0.2896415889263153, + "learning_rate": 7.793381618945455e-05, + "loss": 1.8077, + "step": 10817 + }, + { + "epoch": 3.320441988950276, + "grad_norm": 0.27733489871025085, + "learning_rate": 7.792969353978652e-05, + "loss": 1.7976, + "step": 10818 + }, + { + "epoch": 3.3207489257213014, + "grad_norm": 0.36985141038894653, + "learning_rate": 7.79255706141009e-05, + "loss": 1.8724, + "step": 10819 + }, + { + "epoch": 3.3210558624923268, + "grad_norm": 0.37886983156204224, + "learning_rate": 7.792144741243843e-05, + "loss": 1.8249, + "step": 10820 + }, + { + "epoch": 3.3213627992633517, + "grad_norm": 0.3030721843242645, + "learning_rate": 7.791732393483986e-05, + "loss": 1.7975, + "step": 10821 + }, + { + "epoch": 3.321669736034377, + "grad_norm": 0.2637709081172943, + "learning_rate": 7.791320018134592e-05, + "loss": 1.7205, + "step": 10822 + }, + { + "epoch": 3.321976672805402, + "grad_norm": 0.35307520627975464, + "learning_rate": 7.790907615199736e-05, + "loss": 1.8786, + "step": 10823 + }, + { + "epoch": 3.322283609576427, + "grad_norm": 0.3333272635936737, + "learning_rate": 7.790495184683497e-05, + "loss": 1.7715, + "step": 10824 + }, + { + "epoch": 3.3225905463474525, + "grad_norm": 0.2597469091415405, + "learning_rate": 7.790082726589948e-05, + "loss": 1.8379, + "step": 10825 + }, + { + "epoch": 3.3228974831184774, + "grad_norm": 0.34176257252693176, + "learning_rate": 7.789670240923168e-05, + "loss": 1.8305, + "step": 10826 + }, + { 
+ "epoch": 3.3232044198895028, + "grad_norm": 0.37954533100128174, + "learning_rate": 7.789257727687229e-05, + "loss": 1.7728, + "step": 10827 + }, + { + "epoch": 3.323511356660528, + "grad_norm": 0.2840248644351959, + "learning_rate": 7.788845186886212e-05, + "loss": 1.8059, + "step": 10828 + }, + { + "epoch": 3.323818293431553, + "grad_norm": 0.3650275766849518, + "learning_rate": 7.788432618524193e-05, + "loss": 1.8127, + "step": 10829 + }, + { + "epoch": 3.3241252302025783, + "grad_norm": 0.4869692623615265, + "learning_rate": 7.788020022605247e-05, + "loss": 1.833, + "step": 10830 + }, + { + "epoch": 3.3244321669736037, + "grad_norm": 0.3419482707977295, + "learning_rate": 7.787607399133453e-05, + "loss": 1.7812, + "step": 10831 + }, + { + "epoch": 3.3247391037446286, + "grad_norm": 0.27625617384910583, + "learning_rate": 7.787194748112889e-05, + "loss": 1.8513, + "step": 10832 + }, + { + "epoch": 3.325046040515654, + "grad_norm": 0.4287806749343872, + "learning_rate": 7.786782069547633e-05, + "loss": 1.836, + "step": 10833 + }, + { + "epoch": 3.325352977286679, + "grad_norm": 0.4345545172691345, + "learning_rate": 7.786369363441763e-05, + "loss": 1.8027, + "step": 10834 + }, + { + "epoch": 3.325659914057704, + "grad_norm": 0.32976534962654114, + "learning_rate": 7.78595662979936e-05, + "loss": 1.7987, + "step": 10835 + }, + { + "epoch": 3.3259668508287294, + "grad_norm": 0.2677469849586487, + "learning_rate": 7.785543868624498e-05, + "loss": 1.8312, + "step": 10836 + }, + { + "epoch": 3.3262737875997543, + "grad_norm": 0.2547740638256073, + "learning_rate": 7.785131079921259e-05, + "loss": 1.7844, + "step": 10837 + }, + { + "epoch": 3.3265807243707797, + "grad_norm": 0.26755592226982117, + "learning_rate": 7.784718263693725e-05, + "loss": 1.8263, + "step": 10838 + }, + { + "epoch": 3.3268876611418046, + "grad_norm": 0.23884403705596924, + "learning_rate": 7.784305419945969e-05, + "loss": 1.7862, + "step": 10839 + }, + { + "epoch": 3.32719459791283, + 
"grad_norm": 0.2896903157234192, + "learning_rate": 7.783892548682077e-05, + "loss": 1.9138, + "step": 10840 + }, + { + "epoch": 3.3275015346838552, + "grad_norm": 0.3201359510421753, + "learning_rate": 7.783479649906127e-05, + "loss": 1.8382, + "step": 10841 + }, + { + "epoch": 3.32780847145488, + "grad_norm": 0.39285311102867126, + "learning_rate": 7.7830667236222e-05, + "loss": 1.7763, + "step": 10842 + }, + { + "epoch": 3.3281154082259055, + "grad_norm": 0.435007244348526, + "learning_rate": 7.782653769834376e-05, + "loss": 1.8415, + "step": 10843 + }, + { + "epoch": 3.328422344996931, + "grad_norm": 0.34605318307876587, + "learning_rate": 7.782240788546736e-05, + "loss": 1.757, + "step": 10844 + }, + { + "epoch": 3.3287292817679557, + "grad_norm": 0.26830604672431946, + "learning_rate": 7.781827779763362e-05, + "loss": 1.7779, + "step": 10845 + }, + { + "epoch": 3.329036218538981, + "grad_norm": 0.41851529479026794, + "learning_rate": 7.781414743488336e-05, + "loss": 1.8609, + "step": 10846 + }, + { + "epoch": 3.3293431553100064, + "grad_norm": 0.5058079361915588, + "learning_rate": 7.78100167972574e-05, + "loss": 1.8146, + "step": 10847 + }, + { + "epoch": 3.3296500920810312, + "grad_norm": 0.34394967555999756, + "learning_rate": 7.780588588479654e-05, + "loss": 1.8079, + "step": 10848 + }, + { + "epoch": 3.3299570288520566, + "grad_norm": 0.3033885061740875, + "learning_rate": 7.780175469754161e-05, + "loss": 1.8223, + "step": 10849 + }, + { + "epoch": 3.3302639656230815, + "grad_norm": 0.4431045651435852, + "learning_rate": 7.779762323553347e-05, + "loss": 1.8841, + "step": 10850 + }, + { + "epoch": 3.330570902394107, + "grad_norm": 0.3451448976993561, + "learning_rate": 7.77934914988129e-05, + "loss": 1.8092, + "step": 10851 + }, + { + "epoch": 3.330877839165132, + "grad_norm": 0.26580891013145447, + "learning_rate": 7.778935948742077e-05, + "loss": 1.8244, + "step": 10852 + }, + { + "epoch": 3.331184775936157, + "grad_norm": 0.32079070806503296, + 
"learning_rate": 7.778522720139792e-05, + "loss": 1.7816, + "step": 10853 + }, + { + "epoch": 3.3314917127071824, + "grad_norm": 0.35789042711257935, + "learning_rate": 7.778109464078514e-05, + "loss": 1.8211, + "step": 10854 + }, + { + "epoch": 3.3317986494782073, + "grad_norm": 0.2808612585067749, + "learning_rate": 7.77769618056233e-05, + "loss": 1.8387, + "step": 10855 + }, + { + "epoch": 3.3321055862492326, + "grad_norm": 0.24760548770427704, + "learning_rate": 7.777282869595326e-05, + "loss": 1.7795, + "step": 10856 + }, + { + "epoch": 3.332412523020258, + "grad_norm": 0.2840912640094757, + "learning_rate": 7.776869531181583e-05, + "loss": 1.7492, + "step": 10857 + }, + { + "epoch": 3.332719459791283, + "grad_norm": 0.2881413698196411, + "learning_rate": 7.77645616532519e-05, + "loss": 1.8157, + "step": 10858 + }, + { + "epoch": 3.333026396562308, + "grad_norm": 0.2508779764175415, + "learning_rate": 7.776042772030228e-05, + "loss": 1.8196, + "step": 10859 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3307822048664093, + "learning_rate": 7.775629351300785e-05, + "loss": 1.8195, + "step": 10860 + }, + { + "epoch": 3.3336402701043584, + "grad_norm": 0.34392043948173523, + "learning_rate": 7.775215903140946e-05, + "loss": 1.7775, + "step": 10861 + }, + { + "epoch": 3.3339472068753837, + "grad_norm": 0.2594252824783325, + "learning_rate": 7.774802427554796e-05, + "loss": 1.7687, + "step": 10862 + }, + { + "epoch": 3.334254143646409, + "grad_norm": 0.3109053075313568, + "learning_rate": 7.774388924546423e-05, + "loss": 1.7908, + "step": 10863 + }, + { + "epoch": 3.334561080417434, + "grad_norm": 0.4801923930644989, + "learning_rate": 7.773975394119913e-05, + "loss": 1.8316, + "step": 10864 + }, + { + "epoch": 3.3348680171884593, + "grad_norm": 0.4754973351955414, + "learning_rate": 7.77356183627935e-05, + "loss": 1.8015, + "step": 10865 + }, + { + "epoch": 3.335174953959484, + "grad_norm": 0.29624658823013306, + "learning_rate": 7.773148251028825e-05, + 
"loss": 1.8179, + "step": 10866 + }, + { + "epoch": 3.3354818907305095, + "grad_norm": 0.32207581400871277, + "learning_rate": 7.772734638372423e-05, + "loss": 1.799, + "step": 10867 + }, + { + "epoch": 3.335788827501535, + "grad_norm": 0.5227517485618591, + "learning_rate": 7.772320998314233e-05, + "loss": 1.8452, + "step": 10868 + }, + { + "epoch": 3.3360957642725597, + "grad_norm": 0.4081100523471832, + "learning_rate": 7.771907330858341e-05, + "loss": 1.8182, + "step": 10869 + }, + { + "epoch": 3.336402701043585, + "grad_norm": 0.23786653578281403, + "learning_rate": 7.771493636008838e-05, + "loss": 1.7392, + "step": 10870 + }, + { + "epoch": 3.33670963781461, + "grad_norm": 0.37913820147514343, + "learning_rate": 7.771079913769807e-05, + "loss": 1.7559, + "step": 10871 + }, + { + "epoch": 3.3370165745856353, + "grad_norm": 0.4939163625240326, + "learning_rate": 7.770666164145344e-05, + "loss": 1.8076, + "step": 10872 + }, + { + "epoch": 3.3373235113566606, + "grad_norm": 0.3322528302669525, + "learning_rate": 7.770252387139532e-05, + "loss": 1.8045, + "step": 10873 + }, + { + "epoch": 3.337630448127686, + "grad_norm": 0.3685782849788666, + "learning_rate": 7.769838582756461e-05, + "loss": 1.7703, + "step": 10874 + }, + { + "epoch": 3.337937384898711, + "grad_norm": 0.5564271807670593, + "learning_rate": 7.769424751000224e-05, + "loss": 1.7697, + "step": 10875 + }, + { + "epoch": 3.338244321669736, + "grad_norm": 0.38610726594924927, + "learning_rate": 7.769010891874906e-05, + "loss": 1.7944, + "step": 10876 + }, + { + "epoch": 3.338551258440761, + "grad_norm": 0.23838558793067932, + "learning_rate": 7.768597005384602e-05, + "loss": 1.765, + "step": 10877 + }, + { + "epoch": 3.3388581952117864, + "grad_norm": 0.4334571063518524, + "learning_rate": 7.768183091533399e-05, + "loss": 1.7854, + "step": 10878 + }, + { + "epoch": 3.3391651319828117, + "grad_norm": 0.44844719767570496, + "learning_rate": 7.767769150325386e-05, + "loss": 1.7955, + "step": 10879 + }, + { 
+ "epoch": 3.3394720687538366, + "grad_norm": 0.26543378829956055, + "learning_rate": 7.767355181764659e-05, + "loss": 1.8311, + "step": 10880 + }, + { + "epoch": 3.339779005524862, + "grad_norm": 0.39401358366012573, + "learning_rate": 7.766941185855304e-05, + "loss": 1.8264, + "step": 10881 + }, + { + "epoch": 3.340085942295887, + "grad_norm": 0.5476824045181274, + "learning_rate": 7.766527162601416e-05, + "loss": 1.8051, + "step": 10882 + }, + { + "epoch": 3.340392879066912, + "grad_norm": 0.4021138548851013, + "learning_rate": 7.766113112007084e-05, + "loss": 1.7941, + "step": 10883 + }, + { + "epoch": 3.3406998158379375, + "grad_norm": 0.3262040317058563, + "learning_rate": 7.765699034076402e-05, + "loss": 1.8317, + "step": 10884 + }, + { + "epoch": 3.3410067526089624, + "grad_norm": 0.5461146831512451, + "learning_rate": 7.765284928813459e-05, + "loss": 1.833, + "step": 10885 + }, + { + "epoch": 3.3413136893799877, + "grad_norm": 0.5067405700683594, + "learning_rate": 7.764870796222351e-05, + "loss": 1.7862, + "step": 10886 + }, + { + "epoch": 3.341620626151013, + "grad_norm": 0.2731069028377533, + "learning_rate": 7.76445663630717e-05, + "loss": 1.8173, + "step": 10887 + }, + { + "epoch": 3.341927562922038, + "grad_norm": 0.48928195238113403, + "learning_rate": 7.764042449072008e-05, + "loss": 1.7992, + "step": 10888 + }, + { + "epoch": 3.3422344996930633, + "grad_norm": 0.5338504910469055, + "learning_rate": 7.763628234520958e-05, + "loss": 1.7891, + "step": 10889 + }, + { + "epoch": 3.3425414364640886, + "grad_norm": 0.3136523365974426, + "learning_rate": 7.763213992658114e-05, + "loss": 1.8623, + "step": 10890 + }, + { + "epoch": 3.3428483732351135, + "grad_norm": 0.36551395058631897, + "learning_rate": 7.762799723487568e-05, + "loss": 1.8474, + "step": 10891 + }, + { + "epoch": 3.343155310006139, + "grad_norm": 0.35772353410720825, + "learning_rate": 7.762385427013419e-05, + "loss": 1.8625, + "step": 10892 + }, + { + "epoch": 3.3434622467771637, + 
"grad_norm": 0.29944708943367004, + "learning_rate": 7.761971103239755e-05, + "loss": 1.8181, + "step": 10893 + }, + { + "epoch": 3.343769183548189, + "grad_norm": 0.3395330309867859, + "learning_rate": 7.761556752170676e-05, + "loss": 1.7943, + "step": 10894 + }, + { + "epoch": 3.3440761203192144, + "grad_norm": 0.3624265193939209, + "learning_rate": 7.761142373810274e-05, + "loss": 1.8234, + "step": 10895 + }, + { + "epoch": 3.3443830570902393, + "grad_norm": 0.25409621000289917, + "learning_rate": 7.760727968162644e-05, + "loss": 1.7532, + "step": 10896 + }, + { + "epoch": 3.3446899938612646, + "grad_norm": 0.321437805891037, + "learning_rate": 7.760313535231883e-05, + "loss": 1.8808, + "step": 10897 + }, + { + "epoch": 3.3449969306322895, + "grad_norm": 0.2919142544269562, + "learning_rate": 7.759899075022086e-05, + "loss": 1.7677, + "step": 10898 + }, + { + "epoch": 3.345303867403315, + "grad_norm": 0.26515716314315796, + "learning_rate": 7.759484587537346e-05, + "loss": 1.8118, + "step": 10899 + }, + { + "epoch": 3.34561080417434, + "grad_norm": 0.2963240146636963, + "learning_rate": 7.759070072781764e-05, + "loss": 1.8329, + "step": 10900 + }, + { + "epoch": 3.345917740945365, + "grad_norm": 0.3186480700969696, + "learning_rate": 7.758655530759435e-05, + "loss": 1.8013, + "step": 10901 + }, + { + "epoch": 3.3462246777163904, + "grad_norm": 0.256145715713501, + "learning_rate": 7.758240961474454e-05, + "loss": 1.7865, + "step": 10902 + }, + { + "epoch": 3.3465316144874158, + "grad_norm": 0.28951629996299744, + "learning_rate": 7.757826364930921e-05, + "loss": 1.8091, + "step": 10903 + }, + { + "epoch": 3.3468385512584407, + "grad_norm": 0.2692483365535736, + "learning_rate": 7.75741174113293e-05, + "loss": 1.8308, + "step": 10904 + }, + { + "epoch": 3.347145488029466, + "grad_norm": 0.27615389227867126, + "learning_rate": 7.75699709008458e-05, + "loss": 1.7888, + "step": 10905 + }, + { + "epoch": 3.3474524248004913, + "grad_norm": 0.2819034457206726, + 
"learning_rate": 7.75658241178997e-05, + "loss": 1.7624, + "step": 10906 + }, + { + "epoch": 3.347759361571516, + "grad_norm": 0.2627592086791992, + "learning_rate": 7.756167706253196e-05, + "loss": 1.7696, + "step": 10907 + }, + { + "epoch": 3.3480662983425415, + "grad_norm": 0.3528621196746826, + "learning_rate": 7.755752973478356e-05, + "loss": 1.7725, + "step": 10908 + }, + { + "epoch": 3.3483732351135664, + "grad_norm": 0.35949698090553284, + "learning_rate": 7.755338213469552e-05, + "loss": 1.8163, + "step": 10909 + }, + { + "epoch": 3.3486801718845918, + "grad_norm": 0.25142577290534973, + "learning_rate": 7.75492342623088e-05, + "loss": 1.7879, + "step": 10910 + }, + { + "epoch": 3.348987108655617, + "grad_norm": 0.25766023993492126, + "learning_rate": 7.75450861176644e-05, + "loss": 1.8143, + "step": 10911 + }, + { + "epoch": 3.349294045426642, + "grad_norm": 0.2736956477165222, + "learning_rate": 7.754093770080331e-05, + "loss": 1.8907, + "step": 10912 + }, + { + "epoch": 3.3496009821976673, + "grad_norm": 0.23700755834579468, + "learning_rate": 7.753678901176654e-05, + "loss": 1.813, + "step": 10913 + }, + { + "epoch": 3.349907918968692, + "grad_norm": 0.245509073138237, + "learning_rate": 7.753264005059507e-05, + "loss": 1.8019, + "step": 10914 + }, + { + "epoch": 3.3502148557397176, + "grad_norm": 0.232910618185997, + "learning_rate": 7.752849081732993e-05, + "loss": 1.784, + "step": 10915 + }, + { + "epoch": 3.350521792510743, + "grad_norm": 0.22989360988140106, + "learning_rate": 7.75243413120121e-05, + "loss": 1.7597, + "step": 10916 + }, + { + "epoch": 3.350828729281768, + "grad_norm": 0.2093925178050995, + "learning_rate": 7.752019153468258e-05, + "loss": 1.7698, + "step": 10917 + }, + { + "epoch": 3.351135666052793, + "grad_norm": 0.25539630651474, + "learning_rate": 7.751604148538241e-05, + "loss": 1.8287, + "step": 10918 + }, + { + "epoch": 3.3514426028238185, + "grad_norm": 0.2731820046901703, + "learning_rate": 7.75118911641526e-05, + "loss": 
1.8862, + "step": 10919 + }, + { + "epoch": 3.3517495395948433, + "grad_norm": 0.2464541345834732, + "learning_rate": 7.750774057103416e-05, + "loss": 1.8165, + "step": 10920 + }, + { + "epoch": 3.3520564763658687, + "grad_norm": 0.26380276679992676, + "learning_rate": 7.75035897060681e-05, + "loss": 1.78, + "step": 10921 + }, + { + "epoch": 3.352363413136894, + "grad_norm": 0.3080748915672302, + "learning_rate": 7.749943856929542e-05, + "loss": 1.7925, + "step": 10922 + }, + { + "epoch": 3.352670349907919, + "grad_norm": 0.317754864692688, + "learning_rate": 7.74952871607572e-05, + "loss": 1.8248, + "step": 10923 + }, + { + "epoch": 3.3529772866789442, + "grad_norm": 0.2525196373462677, + "learning_rate": 7.749113548049442e-05, + "loss": 1.762, + "step": 10924 + }, + { + "epoch": 3.353284223449969, + "grad_norm": 0.3149549961090088, + "learning_rate": 7.748698352854814e-05, + "loss": 1.8289, + "step": 10925 + }, + { + "epoch": 3.3535911602209945, + "grad_norm": 0.35744383931159973, + "learning_rate": 7.748283130495937e-05, + "loss": 1.8132, + "step": 10926 + }, + { + "epoch": 3.35389809699202, + "grad_norm": 0.28599128127098083, + "learning_rate": 7.747867880976916e-05, + "loss": 1.7351, + "step": 10927 + }, + { + "epoch": 3.3542050337630447, + "grad_norm": 0.24428869783878326, + "learning_rate": 7.747452604301852e-05, + "loss": 1.794, + "step": 10928 + }, + { + "epoch": 3.35451197053407, + "grad_norm": 0.29067808389663696, + "learning_rate": 7.747037300474854e-05, + "loss": 1.8181, + "step": 10929 + }, + { + "epoch": 3.354818907305095, + "grad_norm": 0.32417505979537964, + "learning_rate": 7.746621969500021e-05, + "loss": 1.8338, + "step": 10930 + }, + { + "epoch": 3.3551258440761202, + "grad_norm": 0.29536551237106323, + "learning_rate": 7.746206611381462e-05, + "loss": 1.8732, + "step": 10931 + }, + { + "epoch": 3.3554327808471456, + "grad_norm": 0.3169345259666443, + "learning_rate": 7.745791226123278e-05, + "loss": 1.876, + "step": 10932 + }, + { + "epoch": 
3.3557397176181705, + "grad_norm": 0.2680271565914154, + "learning_rate": 7.745375813729576e-05, + "loss": 1.7347, + "step": 10933 + }, + { + "epoch": 3.356046654389196, + "grad_norm": 0.28339266777038574, + "learning_rate": 7.74496037420446e-05, + "loss": 1.8507, + "step": 10934 + }, + { + "epoch": 3.356353591160221, + "grad_norm": 0.2567409574985504, + "learning_rate": 7.744544907552038e-05, + "loss": 1.8244, + "step": 10935 + }, + { + "epoch": 3.356660527931246, + "grad_norm": 0.266063928604126, + "learning_rate": 7.744129413776416e-05, + "loss": 1.7864, + "step": 10936 + }, + { + "epoch": 3.3569674647022714, + "grad_norm": 0.2490999698638916, + "learning_rate": 7.743713892881696e-05, + "loss": 1.7637, + "step": 10937 + }, + { + "epoch": 3.3572744014732967, + "grad_norm": 0.25857025384902954, + "learning_rate": 7.743298344871988e-05, + "loss": 1.8101, + "step": 10938 + }, + { + "epoch": 3.3575813382443216, + "grad_norm": 0.2549006938934326, + "learning_rate": 7.742882769751398e-05, + "loss": 1.7782, + "step": 10939 + }, + { + "epoch": 3.357888275015347, + "grad_norm": 0.23915350437164307, + "learning_rate": 7.742467167524035e-05, + "loss": 1.7822, + "step": 10940 + }, + { + "epoch": 3.358195211786372, + "grad_norm": 0.25501590967178345, + "learning_rate": 7.742051538194e-05, + "loss": 1.798, + "step": 10941 + }, + { + "epoch": 3.358502148557397, + "grad_norm": 0.29332005977630615, + "learning_rate": 7.741635881765408e-05, + "loss": 1.8334, + "step": 10942 + }, + { + "epoch": 3.3588090853284225, + "grad_norm": 0.28878241777420044, + "learning_rate": 7.741220198242362e-05, + "loss": 1.8266, + "step": 10943 + }, + { + "epoch": 3.3591160220994474, + "grad_norm": 0.3068650960922241, + "learning_rate": 7.740804487628971e-05, + "loss": 1.8562, + "step": 10944 + }, + { + "epoch": 3.3594229588704727, + "grad_norm": 0.2522405683994293, + "learning_rate": 7.740388749929343e-05, + "loss": 1.8001, + "step": 10945 + }, + { + "epoch": 3.359729895641498, + "grad_norm": 
0.3073521554470062, + "learning_rate": 7.739972985147588e-05, + "loss": 1.7454, + "step": 10946 + }, + { + "epoch": 3.360036832412523, + "grad_norm": 0.3018052577972412, + "learning_rate": 7.739557193287815e-05, + "loss": 1.7888, + "step": 10947 + }, + { + "epoch": 3.3603437691835483, + "grad_norm": 0.2738604247570038, + "learning_rate": 7.73914137435413e-05, + "loss": 1.7208, + "step": 10948 + }, + { + "epoch": 3.3606507059545736, + "grad_norm": 0.37699586153030396, + "learning_rate": 7.738725528350646e-05, + "loss": 1.8175, + "step": 10949 + }, + { + "epoch": 3.3609576427255985, + "grad_norm": 0.3479778468608856, + "learning_rate": 7.738309655281471e-05, + "loss": 1.818, + "step": 10950 + }, + { + "epoch": 3.361264579496624, + "grad_norm": 0.24871166050434113, + "learning_rate": 7.737893755150715e-05, + "loss": 1.7046, + "step": 10951 + }, + { + "epoch": 3.3615715162676487, + "grad_norm": 0.45015642046928406, + "learning_rate": 7.737477827962488e-05, + "loss": 1.8517, + "step": 10952 + }, + { + "epoch": 3.361878453038674, + "grad_norm": 0.4149077534675598, + "learning_rate": 7.7370618737209e-05, + "loss": 1.7403, + "step": 10953 + }, + { + "epoch": 3.3621853898096994, + "grad_norm": 0.2556059658527374, + "learning_rate": 7.736645892430064e-05, + "loss": 1.8167, + "step": 10954 + }, + { + "epoch": 3.3624923265807243, + "grad_norm": 0.3153657615184784, + "learning_rate": 7.736229884094088e-05, + "loss": 1.8471, + "step": 10955 + }, + { + "epoch": 3.3627992633517496, + "grad_norm": 0.27943772077560425, + "learning_rate": 7.735813848717084e-05, + "loss": 1.7742, + "step": 10956 + }, + { + "epoch": 3.3631062001227745, + "grad_norm": 0.28270283341407776, + "learning_rate": 7.735397786303164e-05, + "loss": 1.8418, + "step": 10957 + }, + { + "epoch": 3.3634131368938, + "grad_norm": 0.3596261441707611, + "learning_rate": 7.734981696856442e-05, + "loss": 1.8213, + "step": 10958 + }, + { + "epoch": 3.363720073664825, + "grad_norm": 0.3678492307662964, + "learning_rate": 
7.734565580381026e-05, + "loss": 1.806, + "step": 10959 + }, + { + "epoch": 3.36402701043585, + "grad_norm": 0.27758681774139404, + "learning_rate": 7.734149436881031e-05, + "loss": 1.7832, + "step": 10960 + }, + { + "epoch": 3.3643339472068754, + "grad_norm": 0.2821379005908966, + "learning_rate": 7.733733266360568e-05, + "loss": 1.8888, + "step": 10961 + }, + { + "epoch": 3.3646408839779007, + "grad_norm": 0.33676958084106445, + "learning_rate": 7.733317068823751e-05, + "loss": 1.902, + "step": 10962 + }, + { + "epoch": 3.3649478207489256, + "grad_norm": 0.3116114139556885, + "learning_rate": 7.732900844274691e-05, + "loss": 1.8228, + "step": 10963 + }, + { + "epoch": 3.365254757519951, + "grad_norm": 0.3286324143409729, + "learning_rate": 7.732484592717506e-05, + "loss": 1.8707, + "step": 10964 + }, + { + "epoch": 3.3655616942909763, + "grad_norm": 0.2732192873954773, + "learning_rate": 7.732068314156304e-05, + "loss": 1.773, + "step": 10965 + }, + { + "epoch": 3.365868631062001, + "grad_norm": 0.26663896441459656, + "learning_rate": 7.731652008595204e-05, + "loss": 1.7837, + "step": 10966 + }, + { + "epoch": 3.3661755678330265, + "grad_norm": 0.27447745203971863, + "learning_rate": 7.731235676038317e-05, + "loss": 1.9103, + "step": 10967 + }, + { + "epoch": 3.3664825046040514, + "grad_norm": 0.30832916498184204, + "learning_rate": 7.730819316489757e-05, + "loss": 1.7552, + "step": 10968 + }, + { + "epoch": 3.3667894413750767, + "grad_norm": 0.29657161235809326, + "learning_rate": 7.73040292995364e-05, + "loss": 1.7654, + "step": 10969 + }, + { + "epoch": 3.367096378146102, + "grad_norm": 0.30434274673461914, + "learning_rate": 7.729986516434082e-05, + "loss": 1.8646, + "step": 10970 + }, + { + "epoch": 3.367403314917127, + "grad_norm": 0.25926661491394043, + "learning_rate": 7.729570075935198e-05, + "loss": 1.7555, + "step": 10971 + }, + { + "epoch": 3.3677102516881523, + "grad_norm": 0.2775980532169342, + "learning_rate": 7.729153608461102e-05, + "loss": 
1.8427, + "step": 10972 + }, + { + "epoch": 3.368017188459177, + "grad_norm": 0.23915666341781616, + "learning_rate": 7.72873711401591e-05, + "loss": 1.7902, + "step": 10973 + }, + { + "epoch": 3.3683241252302025, + "grad_norm": 0.2603691518306732, + "learning_rate": 7.728320592603737e-05, + "loss": 1.8587, + "step": 10974 + }, + { + "epoch": 3.368631062001228, + "grad_norm": 0.2579508125782013, + "learning_rate": 7.727904044228703e-05, + "loss": 1.7617, + "step": 10975 + }, + { + "epoch": 3.3689379987722528, + "grad_norm": 0.3384297788143158, + "learning_rate": 7.72748746889492e-05, + "loss": 1.8499, + "step": 10976 + }, + { + "epoch": 3.369244935543278, + "grad_norm": 0.36756646633148193, + "learning_rate": 7.727070866606509e-05, + "loss": 1.808, + "step": 10977 + }, + { + "epoch": 3.3695518723143034, + "grad_norm": 0.3212372958660126, + "learning_rate": 7.726654237367587e-05, + "loss": 1.8245, + "step": 10978 + }, + { + "epoch": 3.3698588090853283, + "grad_norm": 0.23782415688037872, + "learning_rate": 7.726237581182267e-05, + "loss": 1.7629, + "step": 10979 + }, + { + "epoch": 3.3701657458563536, + "grad_norm": 0.2782919108867645, + "learning_rate": 7.725820898054669e-05, + "loss": 1.8, + "step": 10980 + }, + { + "epoch": 3.370472682627379, + "grad_norm": 0.2973455488681793, + "learning_rate": 7.725404187988914e-05, + "loss": 1.7949, + "step": 10981 + }, + { + "epoch": 3.370779619398404, + "grad_norm": 0.2875392735004425, + "learning_rate": 7.724987450989114e-05, + "loss": 1.8019, + "step": 10982 + }, + { + "epoch": 3.371086556169429, + "grad_norm": 0.26133236289024353, + "learning_rate": 7.724570687059394e-05, + "loss": 1.7984, + "step": 10983 + }, + { + "epoch": 3.371393492940454, + "grad_norm": 0.2760173976421356, + "learning_rate": 7.724153896203867e-05, + "loss": 1.8082, + "step": 10984 + }, + { + "epoch": 3.3717004297114794, + "grad_norm": 0.26373061537742615, + "learning_rate": 7.723737078426656e-05, + "loss": 1.8408, + "step": 10985 + }, + { + "epoch": 
3.3720073664825048, + "grad_norm": 0.29425618052482605, + "learning_rate": 7.723320233731879e-05, + "loss": 1.7992, + "step": 10986 + }, + { + "epoch": 3.3723143032535297, + "grad_norm": 0.29822099208831787, + "learning_rate": 7.722903362123655e-05, + "loss": 1.8204, + "step": 10987 + }, + { + "epoch": 3.372621240024555, + "grad_norm": 0.25945618748664856, + "learning_rate": 7.722486463606104e-05, + "loss": 1.7376, + "step": 10988 + }, + { + "epoch": 3.37292817679558, + "grad_norm": 0.26367196440696716, + "learning_rate": 7.722069538183345e-05, + "loss": 1.814, + "step": 10989 + }, + { + "epoch": 3.373235113566605, + "grad_norm": 0.25015249848365784, + "learning_rate": 7.7216525858595e-05, + "loss": 1.8199, + "step": 10990 + }, + { + "epoch": 3.3735420503376305, + "grad_norm": 0.3035781681537628, + "learning_rate": 7.72123560663869e-05, + "loss": 1.739, + "step": 10991 + }, + { + "epoch": 3.3738489871086554, + "grad_norm": 0.2847912013530731, + "learning_rate": 7.720818600525033e-05, + "loss": 1.8754, + "step": 10992 + }, + { + "epoch": 3.3741559238796808, + "grad_norm": 0.2533976435661316, + "learning_rate": 7.720401567522653e-05, + "loss": 1.7616, + "step": 10993 + }, + { + "epoch": 3.374462860650706, + "grad_norm": 0.250828355550766, + "learning_rate": 7.719984507635669e-05, + "loss": 1.7973, + "step": 10994 + }, + { + "epoch": 3.374769797421731, + "grad_norm": 0.3019898235797882, + "learning_rate": 7.719567420868206e-05, + "loss": 1.7563, + "step": 10995 + }, + { + "epoch": 3.3750767341927563, + "grad_norm": 0.2703310549259186, + "learning_rate": 7.719150307224382e-05, + "loss": 1.8183, + "step": 10996 + }, + { + "epoch": 3.3753836709637817, + "grad_norm": 0.2434745579957962, + "learning_rate": 7.718733166708321e-05, + "loss": 1.7913, + "step": 10997 + }, + { + "epoch": 3.3756906077348066, + "grad_norm": 0.28036773204803467, + "learning_rate": 7.718315999324146e-05, + "loss": 1.7884, + "step": 10998 + }, + { + "epoch": 3.375997544505832, + "grad_norm": 
0.25123077630996704, + "learning_rate": 7.717898805075978e-05, + "loss": 1.7394, + "step": 10999 + }, + { + "epoch": 3.376304481276857, + "grad_norm": 0.2313947230577469, + "learning_rate": 7.717481583967943e-05, + "loss": 1.7537, + "step": 11000 + }, + { + "epoch": 3.376611418047882, + "grad_norm": 0.27152860164642334, + "learning_rate": 7.71706433600416e-05, + "loss": 1.8596, + "step": 11001 + }, + { + "epoch": 3.3769183548189075, + "grad_norm": 0.32866382598876953, + "learning_rate": 7.716647061188757e-05, + "loss": 1.9007, + "step": 11002 + }, + { + "epoch": 3.3772252915899323, + "grad_norm": 0.2842368185520172, + "learning_rate": 7.716229759525854e-05, + "loss": 1.7781, + "step": 11003 + }, + { + "epoch": 3.3775322283609577, + "grad_norm": 0.30411216616630554, + "learning_rate": 7.715812431019576e-05, + "loss": 1.7403, + "step": 11004 + }, + { + "epoch": 3.3778391651319826, + "grad_norm": 0.31848132610321045, + "learning_rate": 7.71539507567405e-05, + "loss": 1.817, + "step": 11005 + }, + { + "epoch": 3.378146101903008, + "grad_norm": 0.24206148087978363, + "learning_rate": 7.714977693493397e-05, + "loss": 1.7796, + "step": 11006 + }, + { + "epoch": 3.3784530386740332, + "grad_norm": 0.2982998490333557, + "learning_rate": 7.714560284481742e-05, + "loss": 1.7883, + "step": 11007 + }, + { + "epoch": 3.378759975445058, + "grad_norm": 0.24857483804225922, + "learning_rate": 7.714142848643213e-05, + "loss": 1.7447, + "step": 11008 + }, + { + "epoch": 3.3790669122160835, + "grad_norm": 0.2509039044380188, + "learning_rate": 7.713725385981932e-05, + "loss": 1.8362, + "step": 11009 + }, + { + "epoch": 3.379373848987109, + "grad_norm": 0.2759779095649719, + "learning_rate": 7.713307896502027e-05, + "loss": 1.8655, + "step": 11010 + }, + { + "epoch": 3.3796807857581337, + "grad_norm": 0.264776349067688, + "learning_rate": 7.712890380207623e-05, + "loss": 1.8221, + "step": 11011 + }, + { + "epoch": 3.379987722529159, + "grad_norm": 0.2771971821784973, + "learning_rate": 
7.712472837102846e-05, + "loss": 1.6992, + "step": 11012 + }, + { + "epoch": 3.3802946593001844, + "grad_norm": 0.2749316096305847, + "learning_rate": 7.712055267191822e-05, + "loss": 1.8128, + "step": 11013 + }, + { + "epoch": 3.3806015960712092, + "grad_norm": 0.256656289100647, + "learning_rate": 7.71163767047868e-05, + "loss": 1.8382, + "step": 11014 + }, + { + "epoch": 3.3809085328422346, + "grad_norm": 0.27646976709365845, + "learning_rate": 7.711220046967545e-05, + "loss": 1.8321, + "step": 11015 + }, + { + "epoch": 3.3812154696132595, + "grad_norm": 0.3083149194717407, + "learning_rate": 7.710802396662542e-05, + "loss": 1.904, + "step": 11016 + }, + { + "epoch": 3.381522406384285, + "grad_norm": 0.2750856280326843, + "learning_rate": 7.710384719567803e-05, + "loss": 1.7596, + "step": 11017 + }, + { + "epoch": 3.38182934315531, + "grad_norm": 0.3029455244541168, + "learning_rate": 7.709967015687452e-05, + "loss": 1.8542, + "step": 11018 + }, + { + "epoch": 3.382136279926335, + "grad_norm": 0.3144093453884125, + "learning_rate": 7.709549285025622e-05, + "loss": 1.7489, + "step": 11019 + }, + { + "epoch": 3.3824432166973604, + "grad_norm": 0.2675442099571228, + "learning_rate": 7.709131527586433e-05, + "loss": 1.7324, + "step": 11020 + }, + { + "epoch": 3.3827501534683857, + "grad_norm": 0.2906095087528229, + "learning_rate": 7.708713743374021e-05, + "loss": 1.7848, + "step": 11021 + }, + { + "epoch": 3.3830570902394106, + "grad_norm": 0.25141623616218567, + "learning_rate": 7.708295932392513e-05, + "loss": 1.7423, + "step": 11022 + }, + { + "epoch": 3.383364027010436, + "grad_norm": 0.25832003355026245, + "learning_rate": 7.707878094646037e-05, + "loss": 1.7792, + "step": 11023 + }, + { + "epoch": 3.3836709637814613, + "grad_norm": 0.23710070550441742, + "learning_rate": 7.70746023013872e-05, + "loss": 1.7916, + "step": 11024 + }, + { + "epoch": 3.383977900552486, + "grad_norm": 0.286735862493515, + "learning_rate": 7.707042338874697e-05, + "loss": 1.8272, + 
"step": 11025 + }, + { + "epoch": 3.3842848373235115, + "grad_norm": 0.2536577582359314, + "learning_rate": 7.706624420858094e-05, + "loss": 1.7839, + "step": 11026 + }, + { + "epoch": 3.3845917740945364, + "grad_norm": 0.5564702749252319, + "learning_rate": 7.706206476093043e-05, + "loss": 1.7832, + "step": 11027 + }, + { + "epoch": 3.3848987108655617, + "grad_norm": 0.34694772958755493, + "learning_rate": 7.705788504583671e-05, + "loss": 1.8668, + "step": 11028 + }, + { + "epoch": 3.385205647636587, + "grad_norm": 0.30388176441192627, + "learning_rate": 7.705370506334113e-05, + "loss": 1.8244, + "step": 11029 + }, + { + "epoch": 3.385512584407612, + "grad_norm": 0.2998919188976288, + "learning_rate": 7.704952481348497e-05, + "loss": 1.7927, + "step": 11030 + }, + { + "epoch": 3.3858195211786373, + "grad_norm": 0.2714936435222626, + "learning_rate": 7.704534429630955e-05, + "loss": 1.8757, + "step": 11031 + }, + { + "epoch": 3.386126457949662, + "grad_norm": 0.26670241355895996, + "learning_rate": 7.704116351185619e-05, + "loss": 1.8146, + "step": 11032 + }, + { + "epoch": 3.3864333947206875, + "grad_norm": 0.2500552833080292, + "learning_rate": 7.703698246016621e-05, + "loss": 1.7984, + "step": 11033 + }, + { + "epoch": 3.386740331491713, + "grad_norm": 0.2494918406009674, + "learning_rate": 7.703280114128091e-05, + "loss": 1.7433, + "step": 11034 + }, + { + "epoch": 3.3870472682627377, + "grad_norm": 0.25658491253852844, + "learning_rate": 7.702861955524163e-05, + "loss": 1.8487, + "step": 11035 + }, + { + "epoch": 3.387354205033763, + "grad_norm": 0.2871410548686981, + "learning_rate": 7.702443770208969e-05, + "loss": 1.7919, + "step": 11036 + }, + { + "epoch": 3.3876611418047884, + "grad_norm": 0.3347938060760498, + "learning_rate": 7.702025558186643e-05, + "loss": 1.8091, + "step": 11037 + }, + { + "epoch": 3.3879680785758133, + "grad_norm": 0.39016643166542053, + "learning_rate": 7.701607319461315e-05, + "loss": 1.7816, + "step": 11038 + }, + { + "epoch": 
3.3882750153468386, + "grad_norm": 0.3423028290271759, + "learning_rate": 7.701189054037121e-05, + "loss": 1.8454, + "step": 11039 + }, + { + "epoch": 3.388581952117864, + "grad_norm": 0.27592089772224426, + "learning_rate": 7.700770761918192e-05, + "loss": 1.8431, + "step": 11040 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.46047264337539673, + "learning_rate": 7.700352443108665e-05, + "loss": 1.8412, + "step": 11041 + }, + { + "epoch": 3.389195825659914, + "grad_norm": 0.49226754903793335, + "learning_rate": 7.699934097612673e-05, + "loss": 1.8212, + "step": 11042 + }, + { + "epoch": 3.389502762430939, + "grad_norm": 0.3958778381347656, + "learning_rate": 7.699515725434348e-05, + "loss": 1.747, + "step": 11043 + }, + { + "epoch": 3.3898096992019644, + "grad_norm": 0.26097169518470764, + "learning_rate": 7.699097326577827e-05, + "loss": 1.7631, + "step": 11044 + }, + { + "epoch": 3.3901166359729897, + "grad_norm": 0.2922612130641937, + "learning_rate": 7.698678901047245e-05, + "loss": 1.7891, + "step": 11045 + }, + { + "epoch": 3.3904235727440146, + "grad_norm": 0.4195055365562439, + "learning_rate": 7.698260448846734e-05, + "loss": 1.7765, + "step": 11046 + }, + { + "epoch": 3.39073050951504, + "grad_norm": 0.4572988450527191, + "learning_rate": 7.697841969980434e-05, + "loss": 1.8085, + "step": 11047 + }, + { + "epoch": 3.391037446286065, + "grad_norm": 0.38819587230682373, + "learning_rate": 7.697423464452478e-05, + "loss": 1.8854, + "step": 11048 + }, + { + "epoch": 3.39134438305709, + "grad_norm": 0.27421653270721436, + "learning_rate": 7.697004932267003e-05, + "loss": 1.8327, + "step": 11049 + }, + { + "epoch": 3.3916513198281155, + "grad_norm": 0.33559146523475647, + "learning_rate": 7.696586373428142e-05, + "loss": 1.8109, + "step": 11050 + }, + { + "epoch": 3.3919582565991404, + "grad_norm": 0.39438655972480774, + "learning_rate": 7.696167787940037e-05, + "loss": 1.7909, + "step": 11051 + }, + { + "epoch": 3.3922651933701657, + "grad_norm": 
0.3425842523574829, + "learning_rate": 7.695749175806819e-05, + "loss": 1.8571, + "step": 11052 + }, + { + "epoch": 3.392572130141191, + "grad_norm": 0.2860080301761627, + "learning_rate": 7.695330537032628e-05, + "loss": 1.8546, + "step": 11053 + }, + { + "epoch": 3.392879066912216, + "grad_norm": 0.35894665122032166, + "learning_rate": 7.694911871621601e-05, + "loss": 1.7895, + "step": 11054 + }, + { + "epoch": 3.3931860036832413, + "grad_norm": 0.351193904876709, + "learning_rate": 7.694493179577879e-05, + "loss": 1.7453, + "step": 11055 + }, + { + "epoch": 3.3934929404542666, + "grad_norm": 0.24812865257263184, + "learning_rate": 7.694074460905592e-05, + "loss": 1.8131, + "step": 11056 + }, + { + "epoch": 3.3937998772252915, + "grad_norm": 0.38620972633361816, + "learning_rate": 7.693655715608883e-05, + "loss": 1.8346, + "step": 11057 + }, + { + "epoch": 3.394106813996317, + "grad_norm": 0.5005692839622498, + "learning_rate": 7.69323694369189e-05, + "loss": 1.9031, + "step": 11058 + }, + { + "epoch": 3.3944137507673418, + "grad_norm": 0.4321887791156769, + "learning_rate": 7.692818145158751e-05, + "loss": 1.8783, + "step": 11059 + }, + { + "epoch": 3.394720687538367, + "grad_norm": 0.269307017326355, + "learning_rate": 7.692399320013603e-05, + "loss": 1.8075, + "step": 11060 + }, + { + "epoch": 3.3950276243093924, + "grad_norm": 0.2945556342601776, + "learning_rate": 7.69198046826059e-05, + "loss": 1.8366, + "step": 11061 + }, + { + "epoch": 3.3953345610804173, + "grad_norm": 0.30531853437423706, + "learning_rate": 7.691561589903847e-05, + "loss": 1.7665, + "step": 11062 + }, + { + "epoch": 3.3956414978514426, + "grad_norm": 0.25105199217796326, + "learning_rate": 7.691142684947513e-05, + "loss": 1.782, + "step": 11063 + }, + { + "epoch": 3.3959484346224675, + "grad_norm": 0.3373202085494995, + "learning_rate": 7.69072375339573e-05, + "loss": 1.8148, + "step": 11064 + }, + { + "epoch": 3.396255371393493, + "grad_norm": 0.34207093715667725, + "learning_rate": 
7.690304795252638e-05, + "loss": 1.8287, + "step": 11065 + }, + { + "epoch": 3.396562308164518, + "grad_norm": 0.26281681656837463, + "learning_rate": 7.68988581052238e-05, + "loss": 1.8551, + "step": 11066 + }, + { + "epoch": 3.396869244935543, + "grad_norm": 0.3091152608394623, + "learning_rate": 7.689466799209091e-05, + "loss": 1.7689, + "step": 11067 + }, + { + "epoch": 3.3971761817065684, + "grad_norm": 0.37421298027038574, + "learning_rate": 7.689047761316914e-05, + "loss": 1.7908, + "step": 11068 + }, + { + "epoch": 3.3974831184775938, + "grad_norm": 0.3745511770248413, + "learning_rate": 7.688628696849993e-05, + "loss": 1.8408, + "step": 11069 + }, + { + "epoch": 3.3977900552486187, + "grad_norm": 0.3003663122653961, + "learning_rate": 7.688209605812467e-05, + "loss": 1.9109, + "step": 11070 + }, + { + "epoch": 3.398096992019644, + "grad_norm": 0.3437681496143341, + "learning_rate": 7.687790488208478e-05, + "loss": 1.811, + "step": 11071 + }, + { + "epoch": 3.3984039287906693, + "grad_norm": 0.3480641841888428, + "learning_rate": 7.687371344042168e-05, + "loss": 1.8114, + "step": 11072 + }, + { + "epoch": 3.398710865561694, + "grad_norm": 0.24670913815498352, + "learning_rate": 7.686952173317679e-05, + "loss": 1.7959, + "step": 11073 + }, + { + "epoch": 3.3990178023327196, + "grad_norm": 0.2939499020576477, + "learning_rate": 7.686532976039154e-05, + "loss": 1.7518, + "step": 11074 + }, + { + "epoch": 3.3993247391037444, + "grad_norm": 0.3332279622554779, + "learning_rate": 7.686113752210736e-05, + "loss": 1.843, + "step": 11075 + }, + { + "epoch": 3.3996316758747698, + "grad_norm": 0.22967280447483063, + "learning_rate": 7.685694501836566e-05, + "loss": 1.7408, + "step": 11076 + }, + { + "epoch": 3.399938612645795, + "grad_norm": 0.3443470001220703, + "learning_rate": 7.685275224920789e-05, + "loss": 1.8004, + "step": 11077 + }, + { + "epoch": 3.40024554941682, + "grad_norm": 0.3725457489490509, + "learning_rate": 7.684855921467548e-05, + "loss": 1.833, + 
"step": 11078 + }, + { + "epoch": 3.4005524861878453, + "grad_norm": 0.3178638219833374, + "learning_rate": 7.68443659148099e-05, + "loss": 1.8055, + "step": 11079 + }, + { + "epoch": 3.4008594229588702, + "grad_norm": 0.2609167695045471, + "learning_rate": 7.684017234965254e-05, + "loss": 1.7881, + "step": 11080 + }, + { + "epoch": 3.4011663597298956, + "grad_norm": 0.26975762844085693, + "learning_rate": 7.683597851924486e-05, + "loss": 1.8424, + "step": 11081 + }, + { + "epoch": 3.401473296500921, + "grad_norm": 0.266661673784256, + "learning_rate": 7.683178442362832e-05, + "loss": 1.7785, + "step": 11082 + }, + { + "epoch": 3.401780233271946, + "grad_norm": 0.27915671467781067, + "learning_rate": 7.682759006284436e-05, + "loss": 1.8241, + "step": 11083 + }, + { + "epoch": 3.402087170042971, + "grad_norm": 0.25167274475097656, + "learning_rate": 7.682339543693444e-05, + "loss": 1.7637, + "step": 11084 + }, + { + "epoch": 3.4023941068139965, + "grad_norm": 0.2439529299736023, + "learning_rate": 7.681920054593999e-05, + "loss": 1.7796, + "step": 11085 + }, + { + "epoch": 3.4027010435850213, + "grad_norm": 0.26224252581596375, + "learning_rate": 7.681500538990249e-05, + "loss": 1.8018, + "step": 11086 + }, + { + "epoch": 3.4030079803560467, + "grad_norm": 0.25093868374824524, + "learning_rate": 7.681080996886336e-05, + "loss": 1.7664, + "step": 11087 + }, + { + "epoch": 3.403314917127072, + "grad_norm": 0.26393210887908936, + "learning_rate": 7.680661428286413e-05, + "loss": 1.8389, + "step": 11088 + }, + { + "epoch": 3.403621853898097, + "grad_norm": 0.24750283360481262, + "learning_rate": 7.680241833194622e-05, + "loss": 1.8358, + "step": 11089 + }, + { + "epoch": 3.4039287906691222, + "grad_norm": 0.21568982303142548, + "learning_rate": 7.67982221161511e-05, + "loss": 1.7874, + "step": 11090 + }, + { + "epoch": 3.404235727440147, + "grad_norm": 0.24407126009464264, + "learning_rate": 7.679402563552023e-05, + "loss": 1.7753, + "step": 11091 + }, + { + "epoch": 
3.4045426642111725, + "grad_norm": 0.23288260400295258, + "learning_rate": 7.67898288900951e-05, + "loss": 1.8046, + "step": 11092 + }, + { + "epoch": 3.404849600982198, + "grad_norm": 0.2548544108867645, + "learning_rate": 7.678563187991718e-05, + "loss": 1.8778, + "step": 11093 + }, + { + "epoch": 3.4051565377532227, + "grad_norm": 0.24008090794086456, + "learning_rate": 7.678143460502796e-05, + "loss": 1.7912, + "step": 11094 + }, + { + "epoch": 3.405463474524248, + "grad_norm": 0.26085031032562256, + "learning_rate": 7.677723706546889e-05, + "loss": 1.849, + "step": 11095 + }, + { + "epoch": 3.4057704112952734, + "grad_norm": 0.2830932140350342, + "learning_rate": 7.677303926128147e-05, + "loss": 1.8265, + "step": 11096 + }, + { + "epoch": 3.4060773480662982, + "grad_norm": 0.27593597769737244, + "learning_rate": 7.676884119250718e-05, + "loss": 1.8555, + "step": 11097 + }, + { + "epoch": 3.4063842848373236, + "grad_norm": 0.2403372824192047, + "learning_rate": 7.676464285918751e-05, + "loss": 1.7243, + "step": 11098 + }, + { + "epoch": 3.406691221608349, + "grad_norm": 0.28830090165138245, + "learning_rate": 7.676044426136397e-05, + "loss": 1.8108, + "step": 11099 + }, + { + "epoch": 3.406998158379374, + "grad_norm": 0.2918153405189514, + "learning_rate": 7.675624539907802e-05, + "loss": 1.7875, + "step": 11100 + }, + { + "epoch": 3.407305095150399, + "grad_norm": 0.2609013020992279, + "learning_rate": 7.675204627237117e-05, + "loss": 1.778, + "step": 11101 + }, + { + "epoch": 3.407612031921424, + "grad_norm": 0.2714763283729553, + "learning_rate": 7.674784688128494e-05, + "loss": 1.8472, + "step": 11102 + }, + { + "epoch": 3.4079189686924494, + "grad_norm": 0.25857117772102356, + "learning_rate": 7.674364722586078e-05, + "loss": 1.7495, + "step": 11103 + }, + { + "epoch": 3.4082259054634747, + "grad_norm": 0.25485143065452576, + "learning_rate": 7.673944730614023e-05, + "loss": 1.7817, + "step": 11104 + }, + { + "epoch": 3.4085328422344996, + "grad_norm": 
0.2735857665538788, + "learning_rate": 7.67352471221648e-05, + "loss": 1.7522, + "step": 11105 + }, + { + "epoch": 3.408839779005525, + "grad_norm": 0.25079572200775146, + "learning_rate": 7.6731046673976e-05, + "loss": 1.765, + "step": 11106 + }, + { + "epoch": 3.40914671577655, + "grad_norm": 0.3080148696899414, + "learning_rate": 7.672684596161532e-05, + "loss": 1.8305, + "step": 11107 + }, + { + "epoch": 3.409453652547575, + "grad_norm": 0.23771968483924866, + "learning_rate": 7.672264498512427e-05, + "loss": 1.7837, + "step": 11108 + }, + { + "epoch": 3.4097605893186005, + "grad_norm": 0.29941999912261963, + "learning_rate": 7.671844374454437e-05, + "loss": 1.8013, + "step": 11109 + }, + { + "epoch": 3.4100675260896254, + "grad_norm": 0.27871644496917725, + "learning_rate": 7.671424223991717e-05, + "loss": 1.8598, + "step": 11110 + }, + { + "epoch": 3.4103744628606507, + "grad_norm": 0.2751443684101105, + "learning_rate": 7.671004047128416e-05, + "loss": 1.8341, + "step": 11111 + }, + { + "epoch": 3.410681399631676, + "grad_norm": 0.27227312326431274, + "learning_rate": 7.670583843868688e-05, + "loss": 1.81, + "step": 11112 + }, + { + "epoch": 3.410988336402701, + "grad_norm": 0.29617756605148315, + "learning_rate": 7.670163614216685e-05, + "loss": 1.8795, + "step": 11113 + }, + { + "epoch": 3.4112952731737263, + "grad_norm": 0.268920361995697, + "learning_rate": 7.669743358176563e-05, + "loss": 1.7659, + "step": 11114 + }, + { + "epoch": 3.4116022099447516, + "grad_norm": 0.2875109314918518, + "learning_rate": 7.669323075752467e-05, + "loss": 1.8263, + "step": 11115 + }, + { + "epoch": 3.4119091467157765, + "grad_norm": 0.34703585505485535, + "learning_rate": 7.668902766948558e-05, + "loss": 1.7622, + "step": 11116 + }, + { + "epoch": 3.412216083486802, + "grad_norm": 0.3090265393257141, + "learning_rate": 7.668482431768989e-05, + "loss": 1.7381, + "step": 11117 + }, + { + "epoch": 3.4125230202578267, + "grad_norm": 0.2619737684726715, + "learning_rate": 
7.668062070217911e-05, + "loss": 1.8004, + "step": 11118 + }, + { + "epoch": 3.412829957028852, + "grad_norm": 0.289815217256546, + "learning_rate": 7.667641682299482e-05, + "loss": 1.7946, + "step": 11119 + }, + { + "epoch": 3.4131368937998774, + "grad_norm": 0.28732073307037354, + "learning_rate": 7.667221268017852e-05, + "loss": 1.8746, + "step": 11120 + }, + { + "epoch": 3.4134438305709023, + "grad_norm": 0.23232576251029968, + "learning_rate": 7.666800827377178e-05, + "loss": 1.7403, + "step": 11121 + }, + { + "epoch": 3.4137507673419276, + "grad_norm": 0.22903507947921753, + "learning_rate": 7.666380360381616e-05, + "loss": 1.7785, + "step": 11122 + }, + { + "epoch": 3.4140577041129525, + "grad_norm": 0.25023025274276733, + "learning_rate": 7.665959867035321e-05, + "loss": 1.7881, + "step": 11123 + }, + { + "epoch": 3.414364640883978, + "grad_norm": 0.2199166864156723, + "learning_rate": 7.665539347342449e-05, + "loss": 1.7522, + "step": 11124 + }, + { + "epoch": 3.414671577655003, + "grad_norm": 0.2539862394332886, + "learning_rate": 7.665118801307152e-05, + "loss": 1.7964, + "step": 11125 + }, + { + "epoch": 3.414978514426028, + "grad_norm": 0.22670161724090576, + "learning_rate": 7.664698228933591e-05, + "loss": 1.7071, + "step": 11126 + }, + { + "epoch": 3.4152854511970534, + "grad_norm": 0.24827396869659424, + "learning_rate": 7.664277630225919e-05, + "loss": 1.7897, + "step": 11127 + }, + { + "epoch": 3.4155923879680787, + "grad_norm": 0.29391366243362427, + "learning_rate": 7.663857005188296e-05, + "loss": 1.7967, + "step": 11128 + }, + { + "epoch": 3.4158993247391036, + "grad_norm": 0.3201812505722046, + "learning_rate": 7.663436353824874e-05, + "loss": 1.7681, + "step": 11129 + }, + { + "epoch": 3.416206261510129, + "grad_norm": 0.2274552583694458, + "learning_rate": 7.663015676139814e-05, + "loss": 1.7535, + "step": 11130 + }, + { + "epoch": 3.4165131982811543, + "grad_norm": 0.3955044150352478, + "learning_rate": 7.662594972137273e-05, + "loss": 
1.8175, + "step": 11131 + }, + { + "epoch": 3.416820135052179, + "grad_norm": 0.46493569016456604, + "learning_rate": 7.662174241821406e-05, + "loss": 1.7806, + "step": 11132 + }, + { + "epoch": 3.4171270718232045, + "grad_norm": 0.37731611728668213, + "learning_rate": 7.661753485196375e-05, + "loss": 1.7555, + "step": 11133 + }, + { + "epoch": 3.4174340085942294, + "grad_norm": 0.23983556032180786, + "learning_rate": 7.661332702266334e-05, + "loss": 1.7662, + "step": 11134 + }, + { + "epoch": 3.4177409453652547, + "grad_norm": 0.34964314103126526, + "learning_rate": 7.660911893035445e-05, + "loss": 1.7786, + "step": 11135 + }, + { + "epoch": 3.41804788213628, + "grad_norm": 0.44820764660835266, + "learning_rate": 7.660491057507864e-05, + "loss": 1.778, + "step": 11136 + }, + { + "epoch": 3.418354818907305, + "grad_norm": 0.32936233282089233, + "learning_rate": 7.660070195687752e-05, + "loss": 1.8181, + "step": 11137 + }, + { + "epoch": 3.4186617556783303, + "grad_norm": 0.2874850332736969, + "learning_rate": 7.659649307579266e-05, + "loss": 1.8733, + "step": 11138 + }, + { + "epoch": 3.418968692449355, + "grad_norm": 0.46269866824150085, + "learning_rate": 7.659228393186566e-05, + "loss": 1.8566, + "step": 11139 + }, + { + "epoch": 3.4192756292203805, + "grad_norm": 0.5873839855194092, + "learning_rate": 7.658807452513816e-05, + "loss": 1.8317, + "step": 11140 + }, + { + "epoch": 3.419582565991406, + "grad_norm": 0.43150341510772705, + "learning_rate": 7.65838648556517e-05, + "loss": 1.7702, + "step": 11141 + }, + { + "epoch": 3.4198895027624308, + "grad_norm": 0.2803891599178314, + "learning_rate": 7.65796549234479e-05, + "loss": 1.8043, + "step": 11142 + }, + { + "epoch": 3.420196439533456, + "grad_norm": 0.37295013666152954, + "learning_rate": 7.657544472856838e-05, + "loss": 1.7923, + "step": 11143 + }, + { + "epoch": 3.4205033763044814, + "grad_norm": 0.3922573924064636, + "learning_rate": 7.657123427105473e-05, + "loss": 1.8231, + "step": 11144 + }, + { + 
"epoch": 3.4208103130755063, + "grad_norm": 0.27254152297973633, + "learning_rate": 7.656702355094859e-05, + "loss": 1.8168, + "step": 11145 + }, + { + "epoch": 3.4211172498465316, + "grad_norm": 0.28005337715148926, + "learning_rate": 7.656281256829152e-05, + "loss": 1.8047, + "step": 11146 + }, + { + "epoch": 3.421424186617557, + "grad_norm": 0.4369073808193207, + "learning_rate": 7.655860132312519e-05, + "loss": 1.7243, + "step": 11147 + }, + { + "epoch": 3.421731123388582, + "grad_norm": 0.4127553701400757, + "learning_rate": 7.655438981549119e-05, + "loss": 1.8148, + "step": 11148 + }, + { + "epoch": 3.422038060159607, + "grad_norm": 0.3131798207759857, + "learning_rate": 7.655017804543114e-05, + "loss": 1.789, + "step": 11149 + }, + { + "epoch": 3.422344996930632, + "grad_norm": 0.2947194576263428, + "learning_rate": 7.654596601298666e-05, + "loss": 1.8221, + "step": 11150 + }, + { + "epoch": 3.4226519337016574, + "grad_norm": 0.3072497546672821, + "learning_rate": 7.654175371819941e-05, + "loss": 1.7747, + "step": 11151 + }, + { + "epoch": 3.4229588704726828, + "grad_norm": 0.29408320784568787, + "learning_rate": 7.653754116111099e-05, + "loss": 1.9009, + "step": 11152 + }, + { + "epoch": 3.4232658072437077, + "grad_norm": 0.2629215717315674, + "learning_rate": 7.653332834176303e-05, + "loss": 1.7354, + "step": 11153 + }, + { + "epoch": 3.423572744014733, + "grad_norm": 0.2850257456302643, + "learning_rate": 7.652911526019716e-05, + "loss": 1.8422, + "step": 11154 + }, + { + "epoch": 3.423879680785758, + "grad_norm": 0.29787111282348633, + "learning_rate": 7.652490191645503e-05, + "loss": 1.8122, + "step": 11155 + }, + { + "epoch": 3.424186617556783, + "grad_norm": 0.2670947015285492, + "learning_rate": 7.652068831057826e-05, + "loss": 1.7734, + "step": 11156 + }, + { + "epoch": 3.4244935543278086, + "grad_norm": 0.26415133476257324, + "learning_rate": 7.651647444260853e-05, + "loss": 1.7661, + "step": 11157 + }, + { + "epoch": 3.424800491098834, + 
"grad_norm": 0.2614886164665222, + "learning_rate": 7.651226031258745e-05, + "loss": 1.6918, + "step": 11158 + }, + { + "epoch": 3.425107427869859, + "grad_norm": 0.28485649824142456, + "learning_rate": 7.650804592055667e-05, + "loss": 1.7771, + "step": 11159 + }, + { + "epoch": 3.425414364640884, + "grad_norm": 0.26080289483070374, + "learning_rate": 7.650383126655784e-05, + "loss": 1.7637, + "step": 11160 + }, + { + "epoch": 3.425721301411909, + "grad_norm": 0.2503695487976074, + "learning_rate": 7.649961635063261e-05, + "loss": 1.7864, + "step": 11161 + }, + { + "epoch": 3.4260282381829343, + "grad_norm": 0.3165570795536041, + "learning_rate": 7.649540117282263e-05, + "loss": 1.8107, + "step": 11162 + }, + { + "epoch": 3.4263351749539597, + "grad_norm": 0.28411731123924255, + "learning_rate": 7.649118573316959e-05, + "loss": 1.7557, + "step": 11163 + }, + { + "epoch": 3.4266421117249846, + "grad_norm": 0.24469570815563202, + "learning_rate": 7.648697003171512e-05, + "loss": 1.7597, + "step": 11164 + }, + { + "epoch": 3.42694904849601, + "grad_norm": 0.31968292593955994, + "learning_rate": 7.648275406850087e-05, + "loss": 1.7796, + "step": 11165 + }, + { + "epoch": 3.427255985267035, + "grad_norm": 0.24520765244960785, + "learning_rate": 7.647853784356856e-05, + "loss": 1.7931, + "step": 11166 + }, + { + "epoch": 3.42756292203806, + "grad_norm": 0.23946821689605713, + "learning_rate": 7.647432135695977e-05, + "loss": 1.7143, + "step": 11167 + }, + { + "epoch": 3.4278698588090855, + "grad_norm": 0.321455180644989, + "learning_rate": 7.647010460871624e-05, + "loss": 1.8682, + "step": 11168 + }, + { + "epoch": 3.4281767955801103, + "grad_norm": 0.2803197503089905, + "learning_rate": 7.646588759887964e-05, + "loss": 1.8, + "step": 11169 + }, + { + "epoch": 3.4284837323511357, + "grad_norm": 0.2597559988498688, + "learning_rate": 7.64616703274916e-05, + "loss": 1.8027, + "step": 11170 + }, + { + "epoch": 3.428790669122161, + "grad_norm": 0.25055503845214844, + 
"learning_rate": 7.645745279459384e-05, + "loss": 1.7659, + "step": 11171 + }, + { + "epoch": 3.429097605893186, + "grad_norm": 0.34582629799842834, + "learning_rate": 7.645323500022803e-05, + "loss": 1.7868, + "step": 11172 + }, + { + "epoch": 3.4294045426642112, + "grad_norm": 0.32845041155815125, + "learning_rate": 7.644901694443584e-05, + "loss": 1.8247, + "step": 11173 + }, + { + "epoch": 3.4297114794352366, + "grad_norm": 0.2570398449897766, + "learning_rate": 7.644479862725896e-05, + "loss": 1.7802, + "step": 11174 + }, + { + "epoch": 3.4300184162062615, + "grad_norm": 0.23117294907569885, + "learning_rate": 7.644058004873908e-05, + "loss": 1.7575, + "step": 11175 + }, + { + "epoch": 3.430325352977287, + "grad_norm": 0.2417830377817154, + "learning_rate": 7.64363612089179e-05, + "loss": 1.7954, + "step": 11176 + }, + { + "epoch": 3.4306322897483117, + "grad_norm": 0.249378964304924, + "learning_rate": 7.643214210783708e-05, + "loss": 1.8161, + "step": 11177 + }, + { + "epoch": 3.430939226519337, + "grad_norm": 0.24494746327400208, + "learning_rate": 7.642792274553836e-05, + "loss": 1.825, + "step": 11178 + }, + { + "epoch": 3.4312461632903624, + "grad_norm": 0.2663760185241699, + "learning_rate": 7.642370312206342e-05, + "loss": 1.7589, + "step": 11179 + }, + { + "epoch": 3.4315531000613873, + "grad_norm": 0.2819322645664215, + "learning_rate": 7.641948323745395e-05, + "loss": 1.8097, + "step": 11180 + }, + { + "epoch": 3.4318600368324126, + "grad_norm": 0.26917630434036255, + "learning_rate": 7.641526309175166e-05, + "loss": 1.7934, + "step": 11181 + }, + { + "epoch": 3.4321669736034375, + "grad_norm": 0.31618112325668335, + "learning_rate": 7.641104268499826e-05, + "loss": 1.8522, + "step": 11182 + }, + { + "epoch": 3.432473910374463, + "grad_norm": 0.29209139943122864, + "learning_rate": 7.640682201723546e-05, + "loss": 1.7499, + "step": 11183 + }, + { + "epoch": 3.432780847145488, + "grad_norm": 0.24831914901733398, + "learning_rate": 
7.640260108850496e-05, + "loss": 1.7897, + "step": 11184 + }, + { + "epoch": 3.433087783916513, + "grad_norm": 0.2459818720817566, + "learning_rate": 7.639837989884849e-05, + "loss": 1.7604, + "step": 11185 + }, + { + "epoch": 3.4333947206875384, + "grad_norm": 0.27157485485076904, + "learning_rate": 7.639415844830774e-05, + "loss": 1.7776, + "step": 11186 + }, + { + "epoch": 3.4337016574585637, + "grad_norm": 0.3021515905857086, + "learning_rate": 7.638993673692445e-05, + "loss": 1.7771, + "step": 11187 + }, + { + "epoch": 3.4340085942295886, + "grad_norm": 0.2591722309589386, + "learning_rate": 7.638571476474036e-05, + "loss": 1.8333, + "step": 11188 + }, + { + "epoch": 3.434315531000614, + "grad_norm": 0.2255258709192276, + "learning_rate": 7.638149253179717e-05, + "loss": 1.7647, + "step": 11189 + }, + { + "epoch": 3.4346224677716393, + "grad_norm": 0.2585793733596802, + "learning_rate": 7.637727003813658e-05, + "loss": 1.786, + "step": 11190 + }, + { + "epoch": 3.434929404542664, + "grad_norm": 0.23649543523788452, + "learning_rate": 7.637304728380036e-05, + "loss": 1.822, + "step": 11191 + }, + { + "epoch": 3.4352363413136895, + "grad_norm": 0.2610832452774048, + "learning_rate": 7.636882426883023e-05, + "loss": 1.7925, + "step": 11192 + }, + { + "epoch": 3.4355432780847144, + "grad_norm": 0.26230642199516296, + "learning_rate": 7.636460099326793e-05, + "loss": 1.8169, + "step": 11193 + }, + { + "epoch": 3.4358502148557397, + "grad_norm": 0.2800561189651489, + "learning_rate": 7.636037745715518e-05, + "loss": 1.845, + "step": 11194 + }, + { + "epoch": 3.436157151626765, + "grad_norm": 0.27790409326553345, + "learning_rate": 7.635615366053372e-05, + "loss": 1.8141, + "step": 11195 + }, + { + "epoch": 3.43646408839779, + "grad_norm": 0.2894865870475769, + "learning_rate": 7.635192960344533e-05, + "loss": 1.7916, + "step": 11196 + }, + { + "epoch": 3.4367710251688153, + "grad_norm": 0.22310738265514374, + "learning_rate": 7.634770528593171e-05, + "loss": 1.79, + 
"step": 11197 + }, + { + "epoch": 3.43707796193984, + "grad_norm": 0.2837755084037781, + "learning_rate": 7.634348070803463e-05, + "loss": 1.8763, + "step": 11198 + }, + { + "epoch": 3.4373848987108655, + "grad_norm": 0.32488104701042175, + "learning_rate": 7.633925586979583e-05, + "loss": 1.8331, + "step": 11199 + }, + { + "epoch": 3.437691835481891, + "grad_norm": 0.2708779573440552, + "learning_rate": 7.633503077125706e-05, + "loss": 1.761, + "step": 11200 + }, + { + "epoch": 3.4379987722529157, + "grad_norm": 0.23929642140865326, + "learning_rate": 7.633080541246008e-05, + "loss": 1.8217, + "step": 11201 + }, + { + "epoch": 3.438305709023941, + "grad_norm": 0.3213331997394562, + "learning_rate": 7.632657979344667e-05, + "loss": 1.8375, + "step": 11202 + }, + { + "epoch": 3.4386126457949664, + "grad_norm": 0.38420629501342773, + "learning_rate": 7.632235391425854e-05, + "loss": 1.765, + "step": 11203 + }, + { + "epoch": 3.4389195825659913, + "grad_norm": 0.40466073155403137, + "learning_rate": 7.631812777493749e-05, + "loss": 1.8262, + "step": 11204 + }, + { + "epoch": 3.4392265193370166, + "grad_norm": 0.35904639959335327, + "learning_rate": 7.631390137552527e-05, + "loss": 1.894, + "step": 11205 + }, + { + "epoch": 3.439533456108042, + "grad_norm": 0.28880515694618225, + "learning_rate": 7.630967471606368e-05, + "loss": 1.87, + "step": 11206 + }, + { + "epoch": 3.439840392879067, + "grad_norm": 0.2878882884979248, + "learning_rate": 7.630544779659444e-05, + "loss": 1.7841, + "step": 11207 + }, + { + "epoch": 3.440147329650092, + "grad_norm": 0.36002418398857117, + "learning_rate": 7.630122061715935e-05, + "loss": 1.7318, + "step": 11208 + }, + { + "epoch": 3.440454266421117, + "grad_norm": 0.3304644227027893, + "learning_rate": 7.629699317780019e-05, + "loss": 1.8581, + "step": 11209 + }, + { + "epoch": 3.4407612031921424, + "grad_norm": 0.23396331071853638, + "learning_rate": 7.629276547855872e-05, + "loss": 1.7897, + "step": 11210 + }, + { + "epoch": 
3.4410681399631677, + "grad_norm": 0.34914183616638184, + "learning_rate": 7.628853751947674e-05, + "loss": 1.8531, + "step": 11211 + }, + { + "epoch": 3.4413750767341926, + "grad_norm": 0.3700502812862396, + "learning_rate": 7.6284309300596e-05, + "loss": 1.7884, + "step": 11212 + }, + { + "epoch": 3.441682013505218, + "grad_norm": 0.24606801569461823, + "learning_rate": 7.628008082195835e-05, + "loss": 1.7292, + "step": 11213 + }, + { + "epoch": 3.441988950276243, + "grad_norm": 0.26344993710517883, + "learning_rate": 7.627585208360551e-05, + "loss": 1.7832, + "step": 11214 + }, + { + "epoch": 3.442295887047268, + "grad_norm": 0.4034743010997772, + "learning_rate": 7.62716230855793e-05, + "loss": 1.8164, + "step": 11215 + }, + { + "epoch": 3.4426028238182935, + "grad_norm": 0.4508039355278015, + "learning_rate": 7.626739382792152e-05, + "loss": 1.7855, + "step": 11216 + }, + { + "epoch": 3.4429097605893184, + "grad_norm": 0.2963111400604248, + "learning_rate": 7.626316431067395e-05, + "loss": 1.7995, + "step": 11217 + }, + { + "epoch": 3.4432166973603437, + "grad_norm": 0.35248515009880066, + "learning_rate": 7.625893453387841e-05, + "loss": 1.8761, + "step": 11218 + }, + { + "epoch": 3.443523634131369, + "grad_norm": 0.4032224416732788, + "learning_rate": 7.625470449757668e-05, + "loss": 1.7746, + "step": 11219 + }, + { + "epoch": 3.443830570902394, + "grad_norm": 0.3505195081233978, + "learning_rate": 7.625047420181057e-05, + "loss": 1.851, + "step": 11220 + }, + { + "epoch": 3.4441375076734193, + "grad_norm": 0.288968563079834, + "learning_rate": 7.62462436466219e-05, + "loss": 1.8055, + "step": 11221 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.43141910433769226, + "learning_rate": 7.624201283205246e-05, + "loss": 1.816, + "step": 11222 + }, + { + "epoch": 3.4447513812154695, + "grad_norm": 0.46902137994766235, + "learning_rate": 7.623778175814407e-05, + "loss": 1.8478, + "step": 11223 + }, + { + "epoch": 3.445058317986495, + "grad_norm": 
0.3333328366279602, + "learning_rate": 7.623355042493854e-05, + "loss": 1.7949, + "step": 11224 + }, + { + "epoch": 3.4453652547575198, + "grad_norm": 0.2625340521335602, + "learning_rate": 7.622931883247768e-05, + "loss": 1.745, + "step": 11225 + }, + { + "epoch": 3.445672191528545, + "grad_norm": 0.4565848410129547, + "learning_rate": 7.622508698080333e-05, + "loss": 1.796, + "step": 11226 + }, + { + "epoch": 3.4459791282995704, + "grad_norm": 0.4676518738269806, + "learning_rate": 7.622085486995729e-05, + "loss": 1.8115, + "step": 11227 + }, + { + "epoch": 3.4462860650705953, + "grad_norm": 0.3828938603401184, + "learning_rate": 7.62166224999814e-05, + "loss": 1.8758, + "step": 11228 + }, + { + "epoch": 3.4465930018416207, + "grad_norm": 0.2786383628845215, + "learning_rate": 7.621238987091747e-05, + "loss": 1.7616, + "step": 11229 + }, + { + "epoch": 3.446899938612646, + "grad_norm": 0.4442835748195648, + "learning_rate": 7.620815698280734e-05, + "loss": 1.8342, + "step": 11230 + }, + { + "epoch": 3.447206875383671, + "grad_norm": 0.45760586857795715, + "learning_rate": 7.620392383569286e-05, + "loss": 1.8159, + "step": 11231 + }, + { + "epoch": 3.447513812154696, + "grad_norm": 0.2567009925842285, + "learning_rate": 7.619969042961583e-05, + "loss": 1.774, + "step": 11232 + }, + { + "epoch": 3.4478207489257215, + "grad_norm": 0.3720102310180664, + "learning_rate": 7.619545676461812e-05, + "loss": 1.8366, + "step": 11233 + }, + { + "epoch": 3.4481276856967464, + "grad_norm": 0.36436137557029724, + "learning_rate": 7.619122284074154e-05, + "loss": 1.832, + "step": 11234 + }, + { + "epoch": 3.4484346224677718, + "grad_norm": 0.310310959815979, + "learning_rate": 7.618698865802795e-05, + "loss": 1.9023, + "step": 11235 + }, + { + "epoch": 3.4487415592387967, + "grad_norm": 0.2693026661872864, + "learning_rate": 7.618275421651916e-05, + "loss": 1.7696, + "step": 11236 + }, + { + "epoch": 3.449048496009822, + "grad_norm": 0.2942425608634949, + "learning_rate": 
7.61785195162571e-05, + "loss": 1.822, + "step": 11237 + }, + { + "epoch": 3.4493554327808473, + "grad_norm": 0.22454749047756195, + "learning_rate": 7.617428455728353e-05, + "loss": 1.7011, + "step": 11238 + }, + { + "epoch": 3.449662369551872, + "grad_norm": 0.23345038294792175, + "learning_rate": 7.617004933964035e-05, + "loss": 1.7563, + "step": 11239 + }, + { + "epoch": 3.4499693063228976, + "grad_norm": 0.24990662932395935, + "learning_rate": 7.616581386336941e-05, + "loss": 1.8031, + "step": 11240 + }, + { + "epoch": 3.4502762430939224, + "grad_norm": 0.2919348478317261, + "learning_rate": 7.616157812851254e-05, + "loss": 1.7355, + "step": 11241 + }, + { + "epoch": 3.450583179864948, + "grad_norm": 0.2926909327507019, + "learning_rate": 7.615734213511165e-05, + "loss": 1.8341, + "step": 11242 + }, + { + "epoch": 3.450890116635973, + "grad_norm": 0.24316683411598206, + "learning_rate": 7.615310588320855e-05, + "loss": 1.8154, + "step": 11243 + }, + { + "epoch": 3.451197053406998, + "grad_norm": 0.23154498636722565, + "learning_rate": 7.614886937284513e-05, + "loss": 1.7904, + "step": 11244 + }, + { + "epoch": 3.4515039901780233, + "grad_norm": 0.25973939895629883, + "learning_rate": 7.614463260406327e-05, + "loss": 1.7598, + "step": 11245 + }, + { + "epoch": 3.4518109269490487, + "grad_norm": 0.22110119462013245, + "learning_rate": 7.614039557690482e-05, + "loss": 1.7903, + "step": 11246 + }, + { + "epoch": 3.4521178637200736, + "grad_norm": 0.26184993982315063, + "learning_rate": 7.613615829141165e-05, + "loss": 1.748, + "step": 11247 + }, + { + "epoch": 3.452424800491099, + "grad_norm": 0.26128727197647095, + "learning_rate": 7.613192074762565e-05, + "loss": 1.7786, + "step": 11248 + }, + { + "epoch": 3.4527317372621242, + "grad_norm": 0.23230813443660736, + "learning_rate": 7.612768294558871e-05, + "loss": 1.8114, + "step": 11249 + }, + { + "epoch": 3.453038674033149, + "grad_norm": 0.2686540186405182, + "learning_rate": 7.612344488534268e-05, + "loss": 
1.7311, + "step": 11250 + }, + { + "epoch": 3.4533456108041745, + "grad_norm": 0.25553348660469055, + "learning_rate": 7.611920656692946e-05, + "loss": 1.8468, + "step": 11251 + }, + { + "epoch": 3.4536525475751993, + "grad_norm": 0.2639308273792267, + "learning_rate": 7.611496799039092e-05, + "loss": 1.8292, + "step": 11252 + }, + { + "epoch": 3.4539594843462247, + "grad_norm": 0.2468358874320984, + "learning_rate": 7.611072915576895e-05, + "loss": 1.8173, + "step": 11253 + }, + { + "epoch": 3.45426642111725, + "grad_norm": 0.27236035466194153, + "learning_rate": 7.610649006310549e-05, + "loss": 1.8082, + "step": 11254 + }, + { + "epoch": 3.454573357888275, + "grad_norm": 0.2277914434671402, + "learning_rate": 7.610225071244237e-05, + "loss": 1.7483, + "step": 11255 + }, + { + "epoch": 3.4548802946593002, + "grad_norm": 0.2292868196964264, + "learning_rate": 7.60980111038215e-05, + "loss": 1.7716, + "step": 11256 + }, + { + "epoch": 3.455187231430325, + "grad_norm": 0.22116152942180634, + "learning_rate": 7.60937712372848e-05, + "loss": 1.773, + "step": 11257 + }, + { + "epoch": 3.4554941682013505, + "grad_norm": 0.23238304257392883, + "learning_rate": 7.608953111287416e-05, + "loss": 1.7602, + "step": 11258 + }, + { + "epoch": 3.455801104972376, + "grad_norm": 0.2810615003108978, + "learning_rate": 7.608529073063149e-05, + "loss": 1.8781, + "step": 11259 + }, + { + "epoch": 3.4561080417434007, + "grad_norm": 0.2516821324825287, + "learning_rate": 7.608105009059867e-05, + "loss": 1.835, + "step": 11260 + }, + { + "epoch": 3.456414978514426, + "grad_norm": 0.25698330998420715, + "learning_rate": 7.607680919281763e-05, + "loss": 1.7859, + "step": 11261 + }, + { + "epoch": 3.4567219152854514, + "grad_norm": 0.2597602903842926, + "learning_rate": 7.60725680373303e-05, + "loss": 1.8287, + "step": 11262 + }, + { + "epoch": 3.4570288520564763, + "grad_norm": 0.2564091980457306, + "learning_rate": 7.606832662417855e-05, + "loss": 1.8003, + "step": 11263 + }, + { + 
"epoch": 3.4573357888275016, + "grad_norm": 0.2872684597969055, + "learning_rate": 7.606408495340432e-05, + "loss": 1.8242, + "step": 11264 + }, + { + "epoch": 3.457642725598527, + "grad_norm": 0.27513590455055237, + "learning_rate": 7.605984302504952e-05, + "loss": 1.8605, + "step": 11265 + }, + { + "epoch": 3.457949662369552, + "grad_norm": 0.27768459916114807, + "learning_rate": 7.605560083915609e-05, + "loss": 1.7948, + "step": 11266 + }, + { + "epoch": 3.458256599140577, + "grad_norm": 0.23911382257938385, + "learning_rate": 7.605135839576593e-05, + "loss": 1.7575, + "step": 11267 + }, + { + "epoch": 3.458563535911602, + "grad_norm": 0.26773568987846375, + "learning_rate": 7.604711569492098e-05, + "loss": 1.752, + "step": 11268 + }, + { + "epoch": 3.4588704726826274, + "grad_norm": 0.30079394578933716, + "learning_rate": 7.604287273666316e-05, + "loss": 1.8022, + "step": 11269 + }, + { + "epoch": 3.4591774094536527, + "grad_norm": 0.27393853664398193, + "learning_rate": 7.603862952103441e-05, + "loss": 1.8054, + "step": 11270 + }, + { + "epoch": 3.4594843462246776, + "grad_norm": 0.2794870436191559, + "learning_rate": 7.603438604807667e-05, + "loss": 1.808, + "step": 11271 + }, + { + "epoch": 3.459791282995703, + "grad_norm": 0.26482146978378296, + "learning_rate": 7.603014231783185e-05, + "loss": 1.8696, + "step": 11272 + }, + { + "epoch": 3.460098219766728, + "grad_norm": 0.2755354344844818, + "learning_rate": 7.602589833034192e-05, + "loss": 1.8412, + "step": 11273 + }, + { + "epoch": 3.460405156537753, + "grad_norm": 0.2666642367839813, + "learning_rate": 7.602165408564883e-05, + "loss": 1.8333, + "step": 11274 + }, + { + "epoch": 3.4607120933087785, + "grad_norm": 0.26958519220352173, + "learning_rate": 7.601740958379448e-05, + "loss": 1.7943, + "step": 11275 + }, + { + "epoch": 3.4610190300798034, + "grad_norm": 0.2915789783000946, + "learning_rate": 7.601316482482084e-05, + "loss": 1.7519, + "step": 11276 + }, + { + "epoch": 3.4613259668508287, + 
"grad_norm": 0.2456950694322586, + "learning_rate": 7.600891980876985e-05, + "loss": 1.8064, + "step": 11277 + }, + { + "epoch": 3.461632903621854, + "grad_norm": 0.2517867088317871, + "learning_rate": 7.600467453568348e-05, + "loss": 1.7766, + "step": 11278 + }, + { + "epoch": 3.461939840392879, + "grad_norm": 0.24567969143390656, + "learning_rate": 7.600042900560368e-05, + "loss": 1.7331, + "step": 11279 + }, + { + "epoch": 3.4622467771639043, + "grad_norm": 0.23986820876598358, + "learning_rate": 7.599618321857239e-05, + "loss": 1.7477, + "step": 11280 + }, + { + "epoch": 3.4625537139349296, + "grad_norm": 0.2555375397205353, + "learning_rate": 7.599193717463158e-05, + "loss": 1.8154, + "step": 11281 + }, + { + "epoch": 3.4628606507059545, + "grad_norm": 0.2522781193256378, + "learning_rate": 7.598769087382323e-05, + "loss": 1.7821, + "step": 11282 + }, + { + "epoch": 3.46316758747698, + "grad_norm": 0.25631004571914673, + "learning_rate": 7.598344431618926e-05, + "loss": 1.8043, + "step": 11283 + }, + { + "epoch": 3.4634745242480047, + "grad_norm": 0.2611328661441803, + "learning_rate": 7.597919750177168e-05, + "loss": 1.8036, + "step": 11284 + }, + { + "epoch": 3.46378146101903, + "grad_norm": 0.255670428276062, + "learning_rate": 7.597495043061244e-05, + "loss": 1.7375, + "step": 11285 + }, + { + "epoch": 3.4640883977900554, + "grad_norm": 0.2687236964702606, + "learning_rate": 7.597070310275353e-05, + "loss": 1.7496, + "step": 11286 + }, + { + "epoch": 3.4643953345610803, + "grad_norm": 0.2643752992153168, + "learning_rate": 7.596645551823688e-05, + "loss": 1.8444, + "step": 11287 + }, + { + "epoch": 3.4647022713321056, + "grad_norm": 0.2564511299133301, + "learning_rate": 7.596220767710452e-05, + "loss": 1.7557, + "step": 11288 + }, + { + "epoch": 3.4650092081031305, + "grad_norm": 0.2510208487510681, + "learning_rate": 7.59579595793984e-05, + "loss": 1.7234, + "step": 11289 + }, + { + "epoch": 3.465316144874156, + "grad_norm": 0.2765158712863922, + 
"learning_rate": 7.595371122516051e-05, + "loss": 1.8215, + "step": 11290 + }, + { + "epoch": 3.465623081645181, + "grad_norm": 0.28233039379119873, + "learning_rate": 7.594946261443286e-05, + "loss": 1.7752, + "step": 11291 + }, + { + "epoch": 3.465930018416206, + "grad_norm": 0.26971468329429626, + "learning_rate": 7.594521374725735e-05, + "loss": 1.7924, + "step": 11292 + }, + { + "epoch": 3.4662369551872314, + "grad_norm": 0.29425930976867676, + "learning_rate": 7.594096462367608e-05, + "loss": 1.8144, + "step": 11293 + }, + { + "epoch": 3.4665438919582567, + "grad_norm": 0.233150452375412, + "learning_rate": 7.593671524373098e-05, + "loss": 1.7741, + "step": 11294 + }, + { + "epoch": 3.4668508287292816, + "grad_norm": 0.2947762608528137, + "learning_rate": 7.593246560746406e-05, + "loss": 1.8031, + "step": 11295 + }, + { + "epoch": 3.467157765500307, + "grad_norm": 0.250552773475647, + "learning_rate": 7.59282157149173e-05, + "loss": 1.7501, + "step": 11296 + }, + { + "epoch": 3.4674647022713323, + "grad_norm": 0.26091331243515015, + "learning_rate": 7.592396556613274e-05, + "loss": 1.836, + "step": 11297 + }, + { + "epoch": 3.467771639042357, + "grad_norm": 0.28625619411468506, + "learning_rate": 7.591971516115233e-05, + "loss": 1.7555, + "step": 11298 + }, + { + "epoch": 3.4680785758133825, + "grad_norm": 0.2723398804664612, + "learning_rate": 7.591546450001811e-05, + "loss": 1.825, + "step": 11299 + }, + { + "epoch": 3.4683855125844074, + "grad_norm": 0.24289946258068085, + "learning_rate": 7.591121358277211e-05, + "loss": 1.7441, + "step": 11300 + }, + { + "epoch": 3.4686924493554327, + "grad_norm": 0.2706952691078186, + "learning_rate": 7.590696240945629e-05, + "loss": 1.8651, + "step": 11301 + }, + { + "epoch": 3.468999386126458, + "grad_norm": 0.24632862210273743, + "learning_rate": 7.590271098011268e-05, + "loss": 1.8229, + "step": 11302 + }, + { + "epoch": 3.469306322897483, + "grad_norm": 0.29275211691856384, + "learning_rate": 7.58984592947833e-05, 
+ "loss": 1.7591, + "step": 11303 + }, + { + "epoch": 3.4696132596685083, + "grad_norm": 0.29228144884109497, + "learning_rate": 7.589420735351016e-05, + "loss": 1.8395, + "step": 11304 + }, + { + "epoch": 3.4699201964395336, + "grad_norm": 0.28339114785194397, + "learning_rate": 7.588995515633528e-05, + "loss": 1.8543, + "step": 11305 + }, + { + "epoch": 3.4702271332105585, + "grad_norm": 0.2834693193435669, + "learning_rate": 7.588570270330071e-05, + "loss": 1.826, + "step": 11306 + }, + { + "epoch": 3.470534069981584, + "grad_norm": 0.26130759716033936, + "learning_rate": 7.588144999444844e-05, + "loss": 1.7887, + "step": 11307 + }, + { + "epoch": 3.470841006752609, + "grad_norm": 0.29554685950279236, + "learning_rate": 7.587719702982052e-05, + "loss": 1.819, + "step": 11308 + }, + { + "epoch": 3.471147943523634, + "grad_norm": 0.2687968611717224, + "learning_rate": 7.587294380945898e-05, + "loss": 1.7354, + "step": 11309 + }, + { + "epoch": 3.4714548802946594, + "grad_norm": 0.28795287013053894, + "learning_rate": 7.586869033340582e-05, + "loss": 1.8267, + "step": 11310 + }, + { + "epoch": 3.4717618170656843, + "grad_norm": 0.33244553208351135, + "learning_rate": 7.58644366017031e-05, + "loss": 1.86, + "step": 11311 + }, + { + "epoch": 3.4720687538367097, + "grad_norm": 0.2878025472164154, + "learning_rate": 7.586018261439288e-05, + "loss": 1.7587, + "step": 11312 + }, + { + "epoch": 3.472375690607735, + "grad_norm": 0.26856711506843567, + "learning_rate": 7.585592837151716e-05, + "loss": 1.7351, + "step": 11313 + }, + { + "epoch": 3.47268262737876, + "grad_norm": 0.2554367780685425, + "learning_rate": 7.585167387311802e-05, + "loss": 1.7664, + "step": 11314 + }, + { + "epoch": 3.472989564149785, + "grad_norm": 0.3193204700946808, + "learning_rate": 7.584741911923748e-05, + "loss": 1.7487, + "step": 11315 + }, + { + "epoch": 3.47329650092081, + "grad_norm": 0.3227958679199219, + "learning_rate": 7.584316410991759e-05, + "loss": 1.8107, + "step": 11316 + }, + { 
+ "epoch": 3.4736034376918354, + "grad_norm": 0.33891916275024414, + "learning_rate": 7.58389088452004e-05, + "loss": 1.8466, + "step": 11317 + }, + { + "epoch": 3.4739103744628608, + "grad_norm": 0.27050724625587463, + "learning_rate": 7.583465332512797e-05, + "loss": 1.7877, + "step": 11318 + }, + { + "epoch": 3.4742173112338857, + "grad_norm": 0.2935837209224701, + "learning_rate": 7.583039754974235e-05, + "loss": 1.7932, + "step": 11319 + }, + { + "epoch": 3.474524248004911, + "grad_norm": 0.27780550718307495, + "learning_rate": 7.582614151908561e-05, + "loss": 1.8374, + "step": 11320 + }, + { + "epoch": 3.4748311847759363, + "grad_norm": 0.2579033076763153, + "learning_rate": 7.58218852331998e-05, + "loss": 1.7305, + "step": 11321 + }, + { + "epoch": 3.4751381215469612, + "grad_norm": 0.2531716227531433, + "learning_rate": 7.581762869212699e-05, + "loss": 1.8136, + "step": 11322 + }, + { + "epoch": 3.4754450583179866, + "grad_norm": 0.25504544377326965, + "learning_rate": 7.581337189590924e-05, + "loss": 1.787, + "step": 11323 + }, + { + "epoch": 3.475751995089012, + "grad_norm": 0.23659855127334595, + "learning_rate": 7.580911484458861e-05, + "loss": 1.77, + "step": 11324 + }, + { + "epoch": 3.476058931860037, + "grad_norm": 0.22556856274604797, + "learning_rate": 7.580485753820721e-05, + "loss": 1.7808, + "step": 11325 + }, + { + "epoch": 3.476365868631062, + "grad_norm": 0.2860291600227356, + "learning_rate": 7.580059997680705e-05, + "loss": 1.8224, + "step": 11326 + }, + { + "epoch": 3.476672805402087, + "grad_norm": 0.3134596645832062, + "learning_rate": 7.579634216043023e-05, + "loss": 1.8278, + "step": 11327 + }, + { + "epoch": 3.4769797421731123, + "grad_norm": 0.2883087992668152, + "learning_rate": 7.579208408911887e-05, + "loss": 1.7917, + "step": 11328 + }, + { + "epoch": 3.4772866789441377, + "grad_norm": 0.2743333578109741, + "learning_rate": 7.578782576291501e-05, + "loss": 1.8228, + "step": 11329 + }, + { + "epoch": 3.4775936157151626, + 
"grad_norm": 0.25026053190231323, + "learning_rate": 7.578356718186073e-05, + "loss": 1.7717, + "step": 11330 + }, + { + "epoch": 3.477900552486188, + "grad_norm": 0.246905118227005, + "learning_rate": 7.577930834599813e-05, + "loss": 1.7979, + "step": 11331 + }, + { + "epoch": 3.478207489257213, + "grad_norm": 0.24709418416023254, + "learning_rate": 7.577504925536929e-05, + "loss": 1.8111, + "step": 11332 + }, + { + "epoch": 3.478514426028238, + "grad_norm": 0.25685814023017883, + "learning_rate": 7.577078991001632e-05, + "loss": 1.8255, + "step": 11333 + }, + { + "epoch": 3.4788213627992635, + "grad_norm": 0.23937836289405823, + "learning_rate": 7.576653030998129e-05, + "loss": 1.7254, + "step": 11334 + }, + { + "epoch": 3.4791282995702884, + "grad_norm": 0.22638650238513947, + "learning_rate": 7.57622704553063e-05, + "loss": 1.7847, + "step": 11335 + }, + { + "epoch": 3.4794352363413137, + "grad_norm": 0.26083993911743164, + "learning_rate": 7.575801034603347e-05, + "loss": 1.7947, + "step": 11336 + }, + { + "epoch": 3.479742173112339, + "grad_norm": 0.2715466022491455, + "learning_rate": 7.575374998220488e-05, + "loss": 1.848, + "step": 11337 + }, + { + "epoch": 3.480049109883364, + "grad_norm": 0.25554224848747253, + "learning_rate": 7.574948936386262e-05, + "loss": 1.7811, + "step": 11338 + }, + { + "epoch": 3.4803560466543892, + "grad_norm": 0.2689397931098938, + "learning_rate": 7.574522849104882e-05, + "loss": 1.82, + "step": 11339 + }, + { + "epoch": 3.4806629834254146, + "grad_norm": 0.25027474761009216, + "learning_rate": 7.57409673638056e-05, + "loss": 1.775, + "step": 11340 + }, + { + "epoch": 3.4809699201964395, + "grad_norm": 0.2545457184314728, + "learning_rate": 7.573670598217504e-05, + "loss": 1.8056, + "step": 11341 + }, + { + "epoch": 3.481276856967465, + "grad_norm": 0.28404027223587036, + "learning_rate": 7.573244434619928e-05, + "loss": 1.8372, + "step": 11342 + }, + { + "epoch": 3.4815837937384897, + "grad_norm": 0.28046950697898865, + 
"learning_rate": 7.572818245592041e-05, + "loss": 1.7851, + "step": 11343 + }, + { + "epoch": 3.481890730509515, + "grad_norm": 0.23005759716033936, + "learning_rate": 7.572392031138056e-05, + "loss": 1.7059, + "step": 11344 + }, + { + "epoch": 3.4821976672805404, + "grad_norm": 0.2931719124317169, + "learning_rate": 7.571965791262185e-05, + "loss": 1.84, + "step": 11345 + }, + { + "epoch": 3.4825046040515653, + "grad_norm": 0.4399266242980957, + "learning_rate": 7.571539525968642e-05, + "loss": 1.7465, + "step": 11346 + }, + { + "epoch": 3.4828115408225906, + "grad_norm": 0.48957565426826477, + "learning_rate": 7.571113235261638e-05, + "loss": 1.8494, + "step": 11347 + }, + { + "epoch": 3.4831184775936155, + "grad_norm": 0.37828895449638367, + "learning_rate": 7.570686919145385e-05, + "loss": 1.7598, + "step": 11348 + }, + { + "epoch": 3.483425414364641, + "grad_norm": 0.22943973541259766, + "learning_rate": 7.570260577624098e-05, + "loss": 1.7443, + "step": 11349 + }, + { + "epoch": 3.483732351135666, + "grad_norm": 0.3245384991168976, + "learning_rate": 7.569834210701987e-05, + "loss": 1.7232, + "step": 11350 + }, + { + "epoch": 3.484039287906691, + "grad_norm": 0.4419693648815155, + "learning_rate": 7.569407818383271e-05, + "loss": 1.841, + "step": 11351 + }, + { + "epoch": 3.4843462246777164, + "grad_norm": 0.4061864912509918, + "learning_rate": 7.568981400672159e-05, + "loss": 1.8274, + "step": 11352 + }, + { + "epoch": 3.4846531614487417, + "grad_norm": 0.2609417736530304, + "learning_rate": 7.56855495757287e-05, + "loss": 1.8631, + "step": 11353 + }, + { + "epoch": 3.4849600982197666, + "grad_norm": 0.28758567571640015, + "learning_rate": 7.568128489089612e-05, + "loss": 1.8169, + "step": 11354 + }, + { + "epoch": 3.485267034990792, + "grad_norm": 0.40643060207366943, + "learning_rate": 7.567701995226606e-05, + "loss": 1.809, + "step": 11355 + }, + { + "epoch": 3.4855739717618173, + "grad_norm": 0.37649446725845337, + "learning_rate": 7.56727547598806e-05, 
+ "loss": 1.7661, + "step": 11356 + }, + { + "epoch": 3.485880908532842, + "grad_norm": 0.22863779962062836, + "learning_rate": 7.566848931378197e-05, + "loss": 1.808, + "step": 11357 + }, + { + "epoch": 3.4861878453038675, + "grad_norm": 0.4487019181251526, + "learning_rate": 7.566422361401226e-05, + "loss": 1.7627, + "step": 11358 + }, + { + "epoch": 3.4864947820748924, + "grad_norm": 0.4583640694618225, + "learning_rate": 7.565995766061367e-05, + "loss": 1.8186, + "step": 11359 + }, + { + "epoch": 3.4868017188459177, + "grad_norm": 0.27231526374816895, + "learning_rate": 7.565569145362833e-05, + "loss": 1.8465, + "step": 11360 + }, + { + "epoch": 3.487108655616943, + "grad_norm": 0.3877887725830078, + "learning_rate": 7.565142499309841e-05, + "loss": 1.7668, + "step": 11361 + }, + { + "epoch": 3.487415592387968, + "grad_norm": 0.5511242747306824, + "learning_rate": 7.564715827906606e-05, + "loss": 1.8417, + "step": 11362 + }, + { + "epoch": 3.4877225291589933, + "grad_norm": 0.5112231373786926, + "learning_rate": 7.564289131157348e-05, + "loss": 1.8038, + "step": 11363 + }, + { + "epoch": 3.488029465930018, + "grad_norm": 0.279502809047699, + "learning_rate": 7.56386240906628e-05, + "loss": 1.7545, + "step": 11364 + }, + { + "epoch": 3.4883364027010435, + "grad_norm": 0.30080464482307434, + "learning_rate": 7.563435661637623e-05, + "loss": 1.8136, + "step": 11365 + }, + { + "epoch": 3.488643339472069, + "grad_norm": 0.4424717128276825, + "learning_rate": 7.563008888875591e-05, + "loss": 1.7542, + "step": 11366 + }, + { + "epoch": 3.4889502762430937, + "grad_norm": 0.42144715785980225, + "learning_rate": 7.562582090784403e-05, + "loss": 1.8245, + "step": 11367 + }, + { + "epoch": 3.489257213014119, + "grad_norm": 0.2533668875694275, + "learning_rate": 7.562155267368277e-05, + "loss": 1.8654, + "step": 11368 + }, + { + "epoch": 3.4895641497851444, + "grad_norm": 0.3327534794807434, + "learning_rate": 7.56172841863143e-05, + "loss": 1.7882, + "step": 11369 + }, + { 
+ "epoch": 3.4898710865561693, + "grad_norm": 0.44001486897468567, + "learning_rate": 7.561301544578081e-05, + "loss": 1.8397, + "step": 11370 + }, + { + "epoch": 3.4901780233271946, + "grad_norm": 0.2779090106487274, + "learning_rate": 7.56087464521245e-05, + "loss": 1.7398, + "step": 11371 + }, + { + "epoch": 3.49048496009822, + "grad_norm": 0.3018067479133606, + "learning_rate": 7.560447720538755e-05, + "loss": 1.8076, + "step": 11372 + }, + { + "epoch": 3.490791896869245, + "grad_norm": 0.4370935261249542, + "learning_rate": 7.560020770561216e-05, + "loss": 1.8057, + "step": 11373 + }, + { + "epoch": 3.49109883364027, + "grad_norm": 0.2936978042125702, + "learning_rate": 7.559593795284047e-05, + "loss": 1.7726, + "step": 11374 + }, + { + "epoch": 3.491405770411295, + "grad_norm": 0.28825095295906067, + "learning_rate": 7.559166794711476e-05, + "loss": 1.8039, + "step": 11375 + }, + { + "epoch": 3.4917127071823204, + "grad_norm": 0.39334073662757874, + "learning_rate": 7.55873976884772e-05, + "loss": 1.8388, + "step": 11376 + }, + { + "epoch": 3.4920196439533457, + "grad_norm": 0.33880460262298584, + "learning_rate": 7.558312717696995e-05, + "loss": 1.7791, + "step": 11377 + }, + { + "epoch": 3.4923265807243706, + "grad_norm": 0.4433762729167938, + "learning_rate": 7.557885641263524e-05, + "loss": 1.7786, + "step": 11378 + }, + { + "epoch": 3.492633517495396, + "grad_norm": 0.4710264205932617, + "learning_rate": 7.557458539551527e-05, + "loss": 1.7193, + "step": 11379 + }, + { + "epoch": 3.4929404542664213, + "grad_norm": 0.27514326572418213, + "learning_rate": 7.557031412565228e-05, + "loss": 1.823, + "step": 11380 + }, + { + "epoch": 3.493247391037446, + "grad_norm": 0.4681413471698761, + "learning_rate": 7.556604260308846e-05, + "loss": 1.7598, + "step": 11381 + }, + { + "epoch": 3.4935543278084715, + "grad_norm": 0.5032503604888916, + "learning_rate": 7.556177082786602e-05, + "loss": 1.741, + "step": 11382 + }, + { + "epoch": 3.493861264579497, + 
"grad_norm": 0.2677086889743805, + "learning_rate": 7.555749880002716e-05, + "loss": 1.8528, + "step": 11383 + }, + { + "epoch": 3.4941682013505218, + "grad_norm": 0.43870940804481506, + "learning_rate": 7.555322651961414e-05, + "loss": 1.7632, + "step": 11384 + }, + { + "epoch": 3.494475138121547, + "grad_norm": 0.5403209924697876, + "learning_rate": 7.554895398666914e-05, + "loss": 1.8181, + "step": 11385 + }, + { + "epoch": 3.494782074892572, + "grad_norm": 0.2714318335056305, + "learning_rate": 7.554468120123441e-05, + "loss": 1.8151, + "step": 11386 + }, + { + "epoch": 3.4950890116635973, + "grad_norm": 0.49661698937416077, + "learning_rate": 7.554040816335217e-05, + "loss": 1.8116, + "step": 11387 + }, + { + "epoch": 3.4953959484346226, + "grad_norm": 0.49954715371131897, + "learning_rate": 7.553613487306465e-05, + "loss": 1.8841, + "step": 11388 + }, + { + "epoch": 3.4957028852056475, + "grad_norm": 0.28189441561698914, + "learning_rate": 7.553186133041406e-05, + "loss": 1.7834, + "step": 11389 + }, + { + "epoch": 3.496009821976673, + "grad_norm": 0.36029115319252014, + "learning_rate": 7.552758753544267e-05, + "loss": 1.7796, + "step": 11390 + }, + { + "epoch": 3.4963167587476978, + "grad_norm": 0.45023465156555176, + "learning_rate": 7.552331348819268e-05, + "loss": 1.8773, + "step": 11391 + }, + { + "epoch": 3.496623695518723, + "grad_norm": 0.3235788643360138, + "learning_rate": 7.551903918870636e-05, + "loss": 1.7984, + "step": 11392 + }, + { + "epoch": 3.4969306322897484, + "grad_norm": 0.25656190514564514, + "learning_rate": 7.551476463702596e-05, + "loss": 1.8403, + "step": 11393 + }, + { + "epoch": 3.4972375690607733, + "grad_norm": 0.2866458594799042, + "learning_rate": 7.551048983319366e-05, + "loss": 1.7428, + "step": 11394 + }, + { + "epoch": 3.4975445058317987, + "grad_norm": 0.2713877856731415, + "learning_rate": 7.550621477725177e-05, + "loss": 1.8508, + "step": 11395 + }, + { + "epoch": 3.497851442602824, + "grad_norm": 0.27978867292404175, 
+ "learning_rate": 7.55019394692425e-05, + "loss": 1.8049, + "step": 11396 + }, + { + "epoch": 3.498158379373849, + "grad_norm": 0.3275020122528076, + "learning_rate": 7.549766390920814e-05, + "loss": 1.8553, + "step": 11397 + }, + { + "epoch": 3.498465316144874, + "grad_norm": 0.29947492480278015, + "learning_rate": 7.54933880971909e-05, + "loss": 1.7614, + "step": 11398 + }, + { + "epoch": 3.4987722529158995, + "grad_norm": 0.25790849328041077, + "learning_rate": 7.548911203323308e-05, + "loss": 1.8223, + "step": 11399 + }, + { + "epoch": 3.4990791896869244, + "grad_norm": 0.3145451545715332, + "learning_rate": 7.54848357173769e-05, + "loss": 1.7642, + "step": 11400 + }, + { + "epoch": 3.4993861264579498, + "grad_norm": 0.29052913188934326, + "learning_rate": 7.548055914966463e-05, + "loss": 1.7728, + "step": 11401 + }, + { + "epoch": 3.4996930632289747, + "grad_norm": 0.2741037905216217, + "learning_rate": 7.547628233013854e-05, + "loss": 1.7382, + "step": 11402 + }, + { + "epoch": 3.5, + "grad_norm": 0.2562723755836487, + "learning_rate": 7.54720052588409e-05, + "loss": 1.7455, + "step": 11403 + }, + { + "epoch": 3.5003069367710253, + "grad_norm": 0.27649983763694763, + "learning_rate": 7.546772793581398e-05, + "loss": 1.7194, + "step": 11404 + }, + { + "epoch": 3.5006138735420502, + "grad_norm": 0.27290579676628113, + "learning_rate": 7.546345036110004e-05, + "loss": 1.87, + "step": 11405 + }, + { + "epoch": 3.5009208103130756, + "grad_norm": 0.33585605025291443, + "learning_rate": 7.545917253474136e-05, + "loss": 1.7703, + "step": 11406 + }, + { + "epoch": 3.5012277470841005, + "grad_norm": 0.2592691481113434, + "learning_rate": 7.545489445678022e-05, + "loss": 1.7657, + "step": 11407 + }, + { + "epoch": 3.501534683855126, + "grad_norm": 0.3081367015838623, + "learning_rate": 7.545061612725888e-05, + "loss": 1.8067, + "step": 11408 + }, + { + "epoch": 3.501841620626151, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.544633754621965e-05, + "loss": 
1.8009, + "step": 11409 + }, + { + "epoch": 3.5021485573971765, + "grad_norm": 0.28232479095458984, + "learning_rate": 7.54420587137048e-05, + "loss": 1.8124, + "step": 11410 + }, + { + "epoch": 3.5024554941682013, + "grad_norm": 0.24079222977161407, + "learning_rate": 7.54377796297566e-05, + "loss": 1.789, + "step": 11411 + }, + { + "epoch": 3.5027624309392267, + "grad_norm": 0.27347204089164734, + "learning_rate": 7.543350029441737e-05, + "loss": 1.7704, + "step": 11412 + }, + { + "epoch": 3.5030693677102516, + "grad_norm": 0.25545811653137207, + "learning_rate": 7.542922070772935e-05, + "loss": 1.7871, + "step": 11413 + }, + { + "epoch": 3.503376304481277, + "grad_norm": 0.2507263123989105, + "learning_rate": 7.54249408697349e-05, + "loss": 1.8424, + "step": 11414 + }, + { + "epoch": 3.5036832412523022, + "grad_norm": 0.2776084244251251, + "learning_rate": 7.542066078047627e-05, + "loss": 1.8246, + "step": 11415 + }, + { + "epoch": 3.503990178023327, + "grad_norm": 0.32833749055862427, + "learning_rate": 7.541638043999577e-05, + "loss": 1.7785, + "step": 11416 + }, + { + "epoch": 3.5042971147943525, + "grad_norm": 0.258486270904541, + "learning_rate": 7.541209984833571e-05, + "loss": 1.7543, + "step": 11417 + }, + { + "epoch": 3.5046040515653774, + "grad_norm": 0.25825178623199463, + "learning_rate": 7.540781900553837e-05, + "loss": 1.7939, + "step": 11418 + }, + { + "epoch": 3.5049109883364027, + "grad_norm": 0.26980888843536377, + "learning_rate": 7.540353791164606e-05, + "loss": 1.7777, + "step": 11419 + }, + { + "epoch": 3.505217925107428, + "grad_norm": 0.24103333055973053, + "learning_rate": 7.539925656670111e-05, + "loss": 1.7565, + "step": 11420 + }, + { + "epoch": 3.505524861878453, + "grad_norm": 0.25192007422447205, + "learning_rate": 7.539497497074584e-05, + "loss": 1.7696, + "step": 11421 + }, + { + "epoch": 3.5058317986494782, + "grad_norm": 0.218489870429039, + "learning_rate": 7.539069312382252e-05, + "loss": 1.761, + "step": 11422 + }, + { + 
"epoch": 3.506138735420503, + "grad_norm": 0.27533552050590515, + "learning_rate": 7.53864110259735e-05, + "loss": 1.7374, + "step": 11423 + }, + { + "epoch": 3.5064456721915285, + "grad_norm": 0.2603490650653839, + "learning_rate": 7.538212867724108e-05, + "loss": 1.8342, + "step": 11424 + }, + { + "epoch": 3.506752608962554, + "grad_norm": 0.27340635657310486, + "learning_rate": 7.537784607766758e-05, + "loss": 1.8099, + "step": 11425 + }, + { + "epoch": 3.507059545733579, + "grad_norm": 0.25342679023742676, + "learning_rate": 7.537356322729537e-05, + "loss": 1.7949, + "step": 11426 + }, + { + "epoch": 3.507366482504604, + "grad_norm": 0.292819082736969, + "learning_rate": 7.536928012616669e-05, + "loss": 1.9049, + "step": 11427 + }, + { + "epoch": 3.5076734192756294, + "grad_norm": 0.28256532549858093, + "learning_rate": 7.536499677432393e-05, + "loss": 1.8464, + "step": 11428 + }, + { + "epoch": 3.5079803560466543, + "grad_norm": 0.2672989070415497, + "learning_rate": 7.536071317180942e-05, + "loss": 1.8301, + "step": 11429 + }, + { + "epoch": 3.5082872928176796, + "grad_norm": 0.2525518238544464, + "learning_rate": 7.535642931866546e-05, + "loss": 1.8054, + "step": 11430 + }, + { + "epoch": 3.508594229588705, + "grad_norm": 0.2622447609901428, + "learning_rate": 7.535214521493442e-05, + "loss": 1.8293, + "step": 11431 + }, + { + "epoch": 3.50890116635973, + "grad_norm": 0.27057385444641113, + "learning_rate": 7.534786086065859e-05, + "loss": 1.7426, + "step": 11432 + }, + { + "epoch": 3.509208103130755, + "grad_norm": 0.27363866567611694, + "learning_rate": 7.534357625588038e-05, + "loss": 1.7138, + "step": 11433 + }, + { + "epoch": 3.50951503990178, + "grad_norm": 0.3029060363769531, + "learning_rate": 7.533929140064207e-05, + "loss": 1.864, + "step": 11434 + }, + { + "epoch": 3.5098219766728054, + "grad_norm": 0.3144821524620056, + "learning_rate": 7.533500629498604e-05, + "loss": 1.7846, + "step": 11435 + }, + { + "epoch": 3.5101289134438307, + "grad_norm": 
0.44535213708877563, + "learning_rate": 7.533072093895461e-05, + "loss": 1.799, + "step": 11436 + }, + { + "epoch": 3.5104358502148556, + "grad_norm": 0.25344160199165344, + "learning_rate": 7.532643533259017e-05, + "loss": 1.7391, + "step": 11437 + }, + { + "epoch": 3.510742786985881, + "grad_norm": 0.286026269197464, + "learning_rate": 7.532214947593506e-05, + "loss": 1.8436, + "step": 11438 + }, + { + "epoch": 3.511049723756906, + "grad_norm": 0.3317352533340454, + "learning_rate": 7.53178633690316e-05, + "loss": 1.8507, + "step": 11439 + }, + { + "epoch": 3.511356660527931, + "grad_norm": 0.2547265589237213, + "learning_rate": 7.53135770119222e-05, + "loss": 1.7483, + "step": 11440 + }, + { + "epoch": 3.5116635972989565, + "grad_norm": 0.24281835556030273, + "learning_rate": 7.530929040464917e-05, + "loss": 1.759, + "step": 11441 + }, + { + "epoch": 3.511970534069982, + "grad_norm": 0.2935381829738617, + "learning_rate": 7.530500354725491e-05, + "loss": 1.8235, + "step": 11442 + }, + { + "epoch": 3.5122774708410067, + "grad_norm": 0.26642969250679016, + "learning_rate": 7.53007164397818e-05, + "loss": 1.8324, + "step": 11443 + }, + { + "epoch": 3.512584407612032, + "grad_norm": 0.24830882251262665, + "learning_rate": 7.529642908227215e-05, + "loss": 1.8132, + "step": 11444 + }, + { + "epoch": 3.512891344383057, + "grad_norm": 0.3100191056728363, + "learning_rate": 7.529214147476838e-05, + "loss": 1.8453, + "step": 11445 + }, + { + "epoch": 3.5131982811540823, + "grad_norm": 0.27948811650276184, + "learning_rate": 7.528785361731282e-05, + "loss": 1.7792, + "step": 11446 + }, + { + "epoch": 3.5135052179251076, + "grad_norm": 0.26978832483291626, + "learning_rate": 7.528356550994787e-05, + "loss": 1.7857, + "step": 11447 + }, + { + "epoch": 3.5138121546961325, + "grad_norm": 0.30527836084365845, + "learning_rate": 7.527927715271592e-05, + "loss": 1.807, + "step": 11448 + }, + { + "epoch": 3.514119091467158, + "grad_norm": 0.2915664315223694, + "learning_rate": 
7.527498854565934e-05, + "loss": 1.8414, + "step": 11449 + }, + { + "epoch": 3.5144260282381827, + "grad_norm": 0.2854034900665283, + "learning_rate": 7.52706996888205e-05, + "loss": 1.793, + "step": 11450 + }, + { + "epoch": 3.514732965009208, + "grad_norm": 0.30281978845596313, + "learning_rate": 7.52664105822418e-05, + "loss": 1.7896, + "step": 11451 + }, + { + "epoch": 3.5150399017802334, + "grad_norm": 0.3317166566848755, + "learning_rate": 7.526212122596561e-05, + "loss": 1.7776, + "step": 11452 + }, + { + "epoch": 3.5153468385512583, + "grad_norm": 0.3400021195411682, + "learning_rate": 7.525783162003434e-05, + "loss": 1.8411, + "step": 11453 + }, + { + "epoch": 3.5156537753222836, + "grad_norm": 0.25169485807418823, + "learning_rate": 7.525354176449037e-05, + "loss": 1.7871, + "step": 11454 + }, + { + "epoch": 3.5159607120933085, + "grad_norm": 0.3442455530166626, + "learning_rate": 7.52492516593761e-05, + "loss": 1.7644, + "step": 11455 + }, + { + "epoch": 3.516267648864334, + "grad_norm": 0.35644033551216125, + "learning_rate": 7.524496130473394e-05, + "loss": 1.801, + "step": 11456 + }, + { + "epoch": 3.516574585635359, + "grad_norm": 0.3180185854434967, + "learning_rate": 7.524067070060625e-05, + "loss": 1.7897, + "step": 11457 + }, + { + "epoch": 3.5168815224063845, + "grad_norm": 0.2417978048324585, + "learning_rate": 7.523637984703548e-05, + "loss": 1.8527, + "step": 11458 + }, + { + "epoch": 3.5171884591774094, + "grad_norm": 0.29661375284194946, + "learning_rate": 7.5232088744064e-05, + "loss": 1.8276, + "step": 11459 + }, + { + "epoch": 3.5174953959484347, + "grad_norm": 0.2467545121908188, + "learning_rate": 7.522779739173424e-05, + "loss": 1.7819, + "step": 11460 + }, + { + "epoch": 3.5178023327194596, + "grad_norm": 0.26177898049354553, + "learning_rate": 7.522350579008859e-05, + "loss": 1.8017, + "step": 11461 + }, + { + "epoch": 3.518109269490485, + "grad_norm": 0.28740498423576355, + "learning_rate": 7.521921393916948e-05, + "loss": 1.7863, 
+ "step": 11462 + }, + { + "epoch": 3.5184162062615103, + "grad_norm": 0.28685200214385986, + "learning_rate": 7.521492183901932e-05, + "loss": 1.8069, + "step": 11463 + }, + { + "epoch": 3.518723143032535, + "grad_norm": 0.24174338579177856, + "learning_rate": 7.521062948968051e-05, + "loss": 1.7523, + "step": 11464 + }, + { + "epoch": 3.5190300798035605, + "grad_norm": 0.23273243010044098, + "learning_rate": 7.520633689119548e-05, + "loss": 1.7827, + "step": 11465 + }, + { + "epoch": 3.5193370165745854, + "grad_norm": 0.22708217799663544, + "learning_rate": 7.520204404360667e-05, + "loss": 1.7377, + "step": 11466 + }, + { + "epoch": 3.5196439533456108, + "grad_norm": 0.24725353717803955, + "learning_rate": 7.519775094695649e-05, + "loss": 1.7828, + "step": 11467 + }, + { + "epoch": 3.519950890116636, + "grad_norm": 0.23046265542507172, + "learning_rate": 7.519345760128736e-05, + "loss": 1.7427, + "step": 11468 + }, + { + "epoch": 3.520257826887661, + "grad_norm": 0.2618728280067444, + "learning_rate": 7.518916400664171e-05, + "loss": 1.8133, + "step": 11469 + }, + { + "epoch": 3.5205647636586863, + "grad_norm": 0.23232363164424896, + "learning_rate": 7.5184870163062e-05, + "loss": 1.7468, + "step": 11470 + }, + { + "epoch": 3.520871700429711, + "grad_norm": 0.21993626654148102, + "learning_rate": 7.51805760705906e-05, + "loss": 1.7565, + "step": 11471 + }, + { + "epoch": 3.5211786372007365, + "grad_norm": 0.23563124239444733, + "learning_rate": 7.517628172927001e-05, + "loss": 1.7795, + "step": 11472 + }, + { + "epoch": 3.521485573971762, + "grad_norm": 0.24502862989902496, + "learning_rate": 7.517198713914266e-05, + "loss": 1.813, + "step": 11473 + }, + { + "epoch": 3.521792510742787, + "grad_norm": 0.24745969474315643, + "learning_rate": 7.516769230025097e-05, + "loss": 1.7601, + "step": 11474 + }, + { + "epoch": 3.522099447513812, + "grad_norm": 0.27686986327171326, + "learning_rate": 7.516339721263739e-05, + "loss": 1.8121, + "step": 11475 + }, + { + "epoch": 
3.5224063842848374, + "grad_norm": 0.3110332787036896, + "learning_rate": 7.515910187634439e-05, + "loss": 1.7978, + "step": 11476 + }, + { + "epoch": 3.5227133210558623, + "grad_norm": 0.3394792377948761, + "learning_rate": 7.515480629141436e-05, + "loss": 1.8427, + "step": 11477 + }, + { + "epoch": 3.5230202578268877, + "grad_norm": 0.2802537679672241, + "learning_rate": 7.515051045788984e-05, + "loss": 1.7343, + "step": 11478 + }, + { + "epoch": 3.523327194597913, + "grad_norm": 0.23687711358070374, + "learning_rate": 7.514621437581319e-05, + "loss": 1.7786, + "step": 11479 + }, + { + "epoch": 3.523634131368938, + "grad_norm": 0.31114310026168823, + "learning_rate": 7.514191804522693e-05, + "loss": 1.8137, + "step": 11480 + }, + { + "epoch": 3.523941068139963, + "grad_norm": 0.3257891833782196, + "learning_rate": 7.513762146617351e-05, + "loss": 1.8015, + "step": 11481 + }, + { + "epoch": 3.524248004910988, + "grad_norm": 0.24353443086147308, + "learning_rate": 7.513332463869536e-05, + "loss": 1.7485, + "step": 11482 + }, + { + "epoch": 3.5245549416820134, + "grad_norm": 0.29861485958099365, + "learning_rate": 7.512902756283498e-05, + "loss": 1.7993, + "step": 11483 + }, + { + "epoch": 3.5248618784530388, + "grad_norm": 0.40380924940109253, + "learning_rate": 7.51247302386348e-05, + "loss": 1.7664, + "step": 11484 + }, + { + "epoch": 3.525168815224064, + "grad_norm": 0.3365862965583801, + "learning_rate": 7.512043266613733e-05, + "loss": 1.7512, + "step": 11485 + }, + { + "epoch": 3.525475751995089, + "grad_norm": 0.2502824068069458, + "learning_rate": 7.511613484538502e-05, + "loss": 1.8414, + "step": 11486 + }, + { + "epoch": 3.5257826887661143, + "grad_norm": 0.2598603069782257, + "learning_rate": 7.511183677642034e-05, + "loss": 1.7358, + "step": 11487 + }, + { + "epoch": 3.5260896255371392, + "grad_norm": 0.30246880650520325, + "learning_rate": 7.510753845928576e-05, + "loss": 1.791, + "step": 11488 + }, + { + "epoch": 3.5263965623081646, + "grad_norm": 
0.25170832872390747, + "learning_rate": 7.510323989402378e-05, + "loss": 1.7498, + "step": 11489 + }, + { + "epoch": 3.52670349907919, + "grad_norm": 0.2925282418727875, + "learning_rate": 7.509894108067688e-05, + "loss": 1.8413, + "step": 11490 + }, + { + "epoch": 3.527010435850215, + "grad_norm": 0.2643601596355438, + "learning_rate": 7.509464201928752e-05, + "loss": 1.8052, + "step": 11491 + }, + { + "epoch": 3.52731737262124, + "grad_norm": 0.2938917279243469, + "learning_rate": 7.50903427098982e-05, + "loss": 1.7308, + "step": 11492 + }, + { + "epoch": 3.527624309392265, + "grad_norm": 0.2978343367576599, + "learning_rate": 7.508604315255142e-05, + "loss": 1.8147, + "step": 11493 + }, + { + "epoch": 3.5279312461632903, + "grad_norm": 0.2507816255092621, + "learning_rate": 7.508174334728963e-05, + "loss": 1.774, + "step": 11494 + }, + { + "epoch": 3.5282381829343157, + "grad_norm": 0.32971861958503723, + "learning_rate": 7.507744329415538e-05, + "loss": 1.7634, + "step": 11495 + }, + { + "epoch": 3.5285451197053406, + "grad_norm": 0.3149639964103699, + "learning_rate": 7.507314299319113e-05, + "loss": 1.8032, + "step": 11496 + }, + { + "epoch": 3.528852056476366, + "grad_norm": 0.2721364498138428, + "learning_rate": 7.506884244443937e-05, + "loss": 1.7702, + "step": 11497 + }, + { + "epoch": 3.529158993247391, + "grad_norm": 0.29375985264778137, + "learning_rate": 7.506454164794263e-05, + "loss": 1.8673, + "step": 11498 + }, + { + "epoch": 3.529465930018416, + "grad_norm": 0.379944384098053, + "learning_rate": 7.50602406037434e-05, + "loss": 1.883, + "step": 11499 + }, + { + "epoch": 3.5297728667894415, + "grad_norm": 0.4041840136051178, + "learning_rate": 7.505593931188417e-05, + "loss": 1.7998, + "step": 11500 + }, + { + "epoch": 3.530079803560467, + "grad_norm": 0.30013784766197205, + "learning_rate": 7.505163777240747e-05, + "loss": 1.775, + "step": 11501 + }, + { + "epoch": 3.5303867403314917, + "grad_norm": 0.25161153078079224, + "learning_rate": 
7.50473359853558e-05, + "loss": 1.8609, + "step": 11502 + }, + { + "epoch": 3.530693677102517, + "grad_norm": 0.2803831100463867, + "learning_rate": 7.504303395077168e-05, + "loss": 1.8397, + "step": 11503 + }, + { + "epoch": 3.531000613873542, + "grad_norm": 0.26678118109703064, + "learning_rate": 7.503873166869762e-05, + "loss": 1.7877, + "step": 11504 + }, + { + "epoch": 3.5313075506445673, + "grad_norm": 0.24280449748039246, + "learning_rate": 7.503442913917613e-05, + "loss": 1.7891, + "step": 11505 + }, + { + "epoch": 3.5316144874155926, + "grad_norm": 0.26461485028266907, + "learning_rate": 7.503012636224976e-05, + "loss": 1.7993, + "step": 11506 + }, + { + "epoch": 3.5319214241866175, + "grad_norm": 0.27001824975013733, + "learning_rate": 7.502582333796098e-05, + "loss": 1.7719, + "step": 11507 + }, + { + "epoch": 3.532228360957643, + "grad_norm": 0.27585846185684204, + "learning_rate": 7.502152006635237e-05, + "loss": 1.7412, + "step": 11508 + }, + { + "epoch": 3.5325352977286677, + "grad_norm": 0.24896648526191711, + "learning_rate": 7.501721654746643e-05, + "loss": 1.7459, + "step": 11509 + }, + { + "epoch": 3.532842234499693, + "grad_norm": 0.2308502197265625, + "learning_rate": 7.501291278134569e-05, + "loss": 1.7717, + "step": 11510 + }, + { + "epoch": 3.5331491712707184, + "grad_norm": 0.3026069104671478, + "learning_rate": 7.500860876803267e-05, + "loss": 1.8578, + "step": 11511 + }, + { + "epoch": 3.5334561080417433, + "grad_norm": 0.30242082476615906, + "learning_rate": 7.500430450756995e-05, + "loss": 1.7793, + "step": 11512 + }, + { + "epoch": 3.5337630448127686, + "grad_norm": 0.2583339214324951, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8388, + "step": 11513 + }, + { + "epoch": 3.5340699815837935, + "grad_norm": 0.29673871397972107, + "learning_rate": 7.499569524536542e-05, + "loss": 1.7749, + "step": 11514 + }, + { + "epoch": 3.534376918354819, + "grad_norm": 0.35199788212776184, + "learning_rate": 7.499139024370874e-05, + "loss": 
1.7863, + "step": 11515 + }, + { + "epoch": 3.534683855125844, + "grad_norm": 0.25776436924934387, + "learning_rate": 7.498708499507247e-05, + "loss": 1.7568, + "step": 11516 + }, + { + "epoch": 3.5349907918968695, + "grad_norm": 0.26081520318984985, + "learning_rate": 7.498277949949919e-05, + "loss": 1.807, + "step": 11517 + }, + { + "epoch": 3.5352977286678944, + "grad_norm": 0.29247912764549255, + "learning_rate": 7.497847375703145e-05, + "loss": 1.7568, + "step": 11518 + }, + { + "epoch": 3.5356046654389197, + "grad_norm": 0.20964498817920685, + "learning_rate": 7.497416776771178e-05, + "loss": 1.7601, + "step": 11519 + }, + { + "epoch": 3.5359116022099446, + "grad_norm": 0.28739818930625916, + "learning_rate": 7.496986153158273e-05, + "loss": 1.7915, + "step": 11520 + }, + { + "epoch": 3.53621853898097, + "grad_norm": 0.3109932839870453, + "learning_rate": 7.496555504868691e-05, + "loss": 1.8046, + "step": 11521 + }, + { + "epoch": 3.5365254757519953, + "grad_norm": 0.259284108877182, + "learning_rate": 7.496124831906681e-05, + "loss": 1.7595, + "step": 11522 + }, + { + "epoch": 3.53683241252302, + "grad_norm": 0.265909343957901, + "learning_rate": 7.495694134276504e-05, + "loss": 1.8249, + "step": 11523 + }, + { + "epoch": 3.5371393492940455, + "grad_norm": 0.2478799819946289, + "learning_rate": 7.495263411982415e-05, + "loss": 1.8531, + "step": 11524 + }, + { + "epoch": 3.5374462860650704, + "grad_norm": 0.2636432945728302, + "learning_rate": 7.494832665028671e-05, + "loss": 1.8114, + "step": 11525 + }, + { + "epoch": 3.5377532228360957, + "grad_norm": 0.25323864817619324, + "learning_rate": 7.494401893419527e-05, + "loss": 1.8271, + "step": 11526 + }, + { + "epoch": 3.538060159607121, + "grad_norm": 0.2352467179298401, + "learning_rate": 7.493971097159241e-05, + "loss": 1.7524, + "step": 11527 + }, + { + "epoch": 3.538367096378146, + "grad_norm": 0.2788623869419098, + "learning_rate": 7.493540276252072e-05, + "loss": 1.8238, + "step": 11528 + }, + { + 
"epoch": 3.5386740331491713, + "grad_norm": 0.3506326377391815, + "learning_rate": 7.493109430702277e-05, + "loss": 1.8525, + "step": 11529 + }, + { + "epoch": 3.538980969920196, + "grad_norm": 0.3685263395309448, + "learning_rate": 7.492678560514113e-05, + "loss": 1.8497, + "step": 11530 + }, + { + "epoch": 3.5392879066912215, + "grad_norm": 0.32200056314468384, + "learning_rate": 7.492247665691837e-05, + "loss": 1.7587, + "step": 11531 + }, + { + "epoch": 3.539594843462247, + "grad_norm": 0.2800062894821167, + "learning_rate": 7.49181674623971e-05, + "loss": 1.8188, + "step": 11532 + }, + { + "epoch": 3.539901780233272, + "grad_norm": 0.24137580394744873, + "learning_rate": 7.491385802161989e-05, + "loss": 1.7947, + "step": 11533 + }, + { + "epoch": 3.540208717004297, + "grad_norm": 0.21900027990341187, + "learning_rate": 7.490954833462933e-05, + "loss": 1.7722, + "step": 11534 + }, + { + "epoch": 3.5405156537753224, + "grad_norm": 0.25009945034980774, + "learning_rate": 7.490523840146803e-05, + "loss": 1.8173, + "step": 11535 + }, + { + "epoch": 3.5408225905463473, + "grad_norm": 0.2778431475162506, + "learning_rate": 7.490092822217855e-05, + "loss": 1.8368, + "step": 11536 + }, + { + "epoch": 3.5411295273173726, + "grad_norm": 0.2845982611179352, + "learning_rate": 7.48966177968035e-05, + "loss": 1.7539, + "step": 11537 + }, + { + "epoch": 3.541436464088398, + "grad_norm": 0.27480921149253845, + "learning_rate": 7.48923071253855e-05, + "loss": 1.8494, + "step": 11538 + }, + { + "epoch": 3.541743400859423, + "grad_norm": 0.2722087502479553, + "learning_rate": 7.488799620796711e-05, + "loss": 1.8422, + "step": 11539 + }, + { + "epoch": 3.542050337630448, + "grad_norm": 0.2984340190887451, + "learning_rate": 7.488368504459097e-05, + "loss": 1.8042, + "step": 11540 + }, + { + "epoch": 3.542357274401473, + "grad_norm": 0.2405850738286972, + "learning_rate": 7.487937363529966e-05, + "loss": 1.749, + "step": 11541 + }, + { + "epoch": 3.5426642111724984, + "grad_norm": 
0.24816973507404327, + "learning_rate": 7.487506198013579e-05, + "loss": 1.8671, + "step": 11542 + }, + { + "epoch": 3.5429711479435237, + "grad_norm": 0.2796473503112793, + "learning_rate": 7.487075007914199e-05, + "loss": 1.8023, + "step": 11543 + }, + { + "epoch": 3.5432780847145486, + "grad_norm": 0.2600162625312805, + "learning_rate": 7.486643793236086e-05, + "loss": 1.7997, + "step": 11544 + }, + { + "epoch": 3.543585021485574, + "grad_norm": 0.2746226489543915, + "learning_rate": 7.486212553983503e-05, + "loss": 1.7773, + "step": 11545 + }, + { + "epoch": 3.5438919582565993, + "grad_norm": 0.24142079055309296, + "learning_rate": 7.485781290160708e-05, + "loss": 1.791, + "step": 11546 + }, + { + "epoch": 3.544198895027624, + "grad_norm": 0.2472934126853943, + "learning_rate": 7.485350001771966e-05, + "loss": 1.8183, + "step": 11547 + }, + { + "epoch": 3.5445058317986495, + "grad_norm": 0.26891404390335083, + "learning_rate": 7.48491868882154e-05, + "loss": 1.7421, + "step": 11548 + }, + { + "epoch": 3.544812768569675, + "grad_norm": 0.24820464849472046, + "learning_rate": 7.48448735131369e-05, + "loss": 1.7372, + "step": 11549 + }, + { + "epoch": 3.5451197053406998, + "grad_norm": 0.2456594705581665, + "learning_rate": 7.484055989252679e-05, + "loss": 1.7883, + "step": 11550 + }, + { + "epoch": 3.545426642111725, + "grad_norm": 0.32420551776885986, + "learning_rate": 7.48362460264277e-05, + "loss": 1.8363, + "step": 11551 + }, + { + "epoch": 3.54573357888275, + "grad_norm": 0.3187662661075592, + "learning_rate": 7.483193191488229e-05, + "loss": 1.7957, + "step": 11552 + }, + { + "epoch": 3.5460405156537753, + "grad_norm": 0.2845410108566284, + "learning_rate": 7.482761755793316e-05, + "loss": 1.8288, + "step": 11553 + }, + { + "epoch": 3.5463474524248007, + "grad_norm": 0.2816021740436554, + "learning_rate": 7.482330295562298e-05, + "loss": 1.7562, + "step": 11554 + }, + { + "epoch": 3.5466543891958255, + "grad_norm": 0.28938058018684387, + "learning_rate": 
7.481898810799435e-05, + "loss": 1.8139, + "step": 11555 + }, + { + "epoch": 3.546961325966851, + "grad_norm": 0.3305707573890686, + "learning_rate": 7.481467301508995e-05, + "loss": 1.8956, + "step": 11556 + }, + { + "epoch": 3.5472682627378758, + "grad_norm": 0.3890376091003418, + "learning_rate": 7.48103576769524e-05, + "loss": 1.8552, + "step": 11557 + }, + { + "epoch": 3.547575199508901, + "grad_norm": 0.3900652825832367, + "learning_rate": 7.480604209362434e-05, + "loss": 1.7748, + "step": 11558 + }, + { + "epoch": 3.5478821362799264, + "grad_norm": 0.3297326862812042, + "learning_rate": 7.480172626514845e-05, + "loss": 1.8201, + "step": 11559 + }, + { + "epoch": 3.5481890730509518, + "grad_norm": 0.28797218203544617, + "learning_rate": 7.479741019156737e-05, + "loss": 1.7652, + "step": 11560 + }, + { + "epoch": 3.5484960098219767, + "grad_norm": 0.2764691114425659, + "learning_rate": 7.479309387292373e-05, + "loss": 1.7534, + "step": 11561 + }, + { + "epoch": 3.548802946593002, + "grad_norm": 0.25067585706710815, + "learning_rate": 7.47887773092602e-05, + "loss": 1.7849, + "step": 11562 + }, + { + "epoch": 3.549109883364027, + "grad_norm": 0.29966798424720764, + "learning_rate": 7.478446050061947e-05, + "loss": 1.8299, + "step": 11563 + }, + { + "epoch": 3.549416820135052, + "grad_norm": 0.24068406224250793, + "learning_rate": 7.478014344704416e-05, + "loss": 1.8366, + "step": 11564 + }, + { + "epoch": 3.5497237569060776, + "grad_norm": 0.2559303641319275, + "learning_rate": 7.477582614857695e-05, + "loss": 1.7665, + "step": 11565 + }, + { + "epoch": 3.5500306936771024, + "grad_norm": 0.24617858231067657, + "learning_rate": 7.47715086052605e-05, + "loss": 1.8334, + "step": 11566 + }, + { + "epoch": 3.550337630448128, + "grad_norm": 0.2433501034975052, + "learning_rate": 7.476719081713749e-05, + "loss": 1.7963, + "step": 11567 + }, + { + "epoch": 3.5506445672191527, + "grad_norm": 0.2583518326282501, + "learning_rate": 7.476287278425057e-05, + "loss": 1.8311, 
+ "step": 11568 + }, + { + "epoch": 3.550951503990178, + "grad_norm": 0.3232485055923462, + "learning_rate": 7.475855450664244e-05, + "loss": 1.9162, + "step": 11569 + }, + { + "epoch": 3.5512584407612033, + "grad_norm": 0.28247153759002686, + "learning_rate": 7.475423598435576e-05, + "loss": 1.8027, + "step": 11570 + }, + { + "epoch": 3.5515653775322282, + "grad_norm": 0.27201834321022034, + "learning_rate": 7.47499172174332e-05, + "loss": 1.7822, + "step": 11571 + }, + { + "epoch": 3.5518723143032536, + "grad_norm": 0.2408471554517746, + "learning_rate": 7.474559820591748e-05, + "loss": 1.7735, + "step": 11572 + }, + { + "epoch": 3.5521792510742785, + "grad_norm": 0.24187393486499786, + "learning_rate": 7.474127894985124e-05, + "loss": 1.7931, + "step": 11573 + }, + { + "epoch": 3.552486187845304, + "grad_norm": 0.2759699523448944, + "learning_rate": 7.473695944927717e-05, + "loss": 1.8407, + "step": 11574 + }, + { + "epoch": 3.552793124616329, + "grad_norm": 0.2503111958503723, + "learning_rate": 7.473263970423797e-05, + "loss": 1.7613, + "step": 11575 + }, + { + "epoch": 3.5531000613873545, + "grad_norm": 0.24795177578926086, + "learning_rate": 7.472831971477633e-05, + "loss": 1.8221, + "step": 11576 + }, + { + "epoch": 3.5534069981583793, + "grad_norm": 0.23190177977085114, + "learning_rate": 7.472399948093494e-05, + "loss": 1.7541, + "step": 11577 + }, + { + "epoch": 3.5537139349294047, + "grad_norm": 0.24650825560092926, + "learning_rate": 7.471967900275653e-05, + "loss": 1.8002, + "step": 11578 + }, + { + "epoch": 3.5540208717004296, + "grad_norm": 0.256598562002182, + "learning_rate": 7.471535828028372e-05, + "loss": 1.7052, + "step": 11579 + }, + { + "epoch": 3.554327808471455, + "grad_norm": 0.2715381681919098, + "learning_rate": 7.471103731355926e-05, + "loss": 1.7701, + "step": 11580 + }, + { + "epoch": 3.5546347452424802, + "grad_norm": 0.29806044697761536, + "learning_rate": 7.470671610262586e-05, + "loss": 1.7614, + "step": 11581 + }, + { + "epoch": 
3.554941682013505, + "grad_norm": 0.26364314556121826, + "learning_rate": 7.470239464752621e-05, + "loss": 1.7957, + "step": 11582 + }, + { + "epoch": 3.5552486187845305, + "grad_norm": 0.29270800948143005, + "learning_rate": 7.4698072948303e-05, + "loss": 1.8263, + "step": 11583 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.25941839814186096, + "learning_rate": 7.469375100499898e-05, + "loss": 1.8517, + "step": 11584 + }, + { + "epoch": 3.5558624923265807, + "grad_norm": 0.29509237408638, + "learning_rate": 7.468942881765681e-05, + "loss": 1.8643, + "step": 11585 + }, + { + "epoch": 3.556169429097606, + "grad_norm": 0.23090367019176483, + "learning_rate": 7.468510638631926e-05, + "loss": 1.7239, + "step": 11586 + }, + { + "epoch": 3.556476365868631, + "grad_norm": 0.2696724236011505, + "learning_rate": 7.468078371102901e-05, + "loss": 1.848, + "step": 11587 + }, + { + "epoch": 3.5567833026396563, + "grad_norm": 0.2691192626953125, + "learning_rate": 7.46764607918288e-05, + "loss": 1.8194, + "step": 11588 + }, + { + "epoch": 3.557090239410681, + "grad_norm": 0.26616501808166504, + "learning_rate": 7.467213762876131e-05, + "loss": 1.8382, + "step": 11589 + }, + { + "epoch": 3.5573971761817065, + "grad_norm": 0.30629831552505493, + "learning_rate": 7.466781422186933e-05, + "loss": 1.8417, + "step": 11590 + }, + { + "epoch": 3.557704112952732, + "grad_norm": 0.27212417125701904, + "learning_rate": 7.466349057119552e-05, + "loss": 1.7612, + "step": 11591 + }, + { + "epoch": 3.558011049723757, + "grad_norm": 0.2872084379196167, + "learning_rate": 7.465916667678266e-05, + "loss": 1.7998, + "step": 11592 + }, + { + "epoch": 3.558317986494782, + "grad_norm": 0.3017117977142334, + "learning_rate": 7.465484253867348e-05, + "loss": 1.7996, + "step": 11593 + }, + { + "epoch": 3.5586249232658074, + "grad_norm": 0.2707957327365875, + "learning_rate": 7.465051815691066e-05, + "loss": 1.7678, + "step": 11594 + }, + { + "epoch": 3.5589318600368323, + "grad_norm": 
0.28932711482048035, + "learning_rate": 7.464619353153702e-05, + "loss": 1.8576, + "step": 11595 + }, + { + "epoch": 3.5592387968078576, + "grad_norm": 0.2585125267505646, + "learning_rate": 7.464186866259519e-05, + "loss": 1.8678, + "step": 11596 + }, + { + "epoch": 3.559545733578883, + "grad_norm": 0.24386851489543915, + "learning_rate": 7.4637543550128e-05, + "loss": 1.7778, + "step": 11597 + }, + { + "epoch": 3.559852670349908, + "grad_norm": 0.2375860959291458, + "learning_rate": 7.463321819417817e-05, + "loss": 1.8096, + "step": 11598 + }, + { + "epoch": 3.560159607120933, + "grad_norm": 0.2341299206018448, + "learning_rate": 7.462889259478842e-05, + "loss": 1.7191, + "step": 11599 + }, + { + "epoch": 3.560466543891958, + "grad_norm": 0.2510595917701721, + "learning_rate": 7.462456675200154e-05, + "loss": 1.7763, + "step": 11600 + }, + { + "epoch": 3.5607734806629834, + "grad_norm": 0.2554674744606018, + "learning_rate": 7.462024066586025e-05, + "loss": 1.7578, + "step": 11601 + }, + { + "epoch": 3.5610804174340087, + "grad_norm": 0.25040730834007263, + "learning_rate": 7.46159143364073e-05, + "loss": 1.8194, + "step": 11602 + }, + { + "epoch": 3.5613873542050336, + "grad_norm": 0.24294932186603546, + "learning_rate": 7.461158776368547e-05, + "loss": 1.8063, + "step": 11603 + }, + { + "epoch": 3.561694290976059, + "grad_norm": 0.2388325333595276, + "learning_rate": 7.46072609477375e-05, + "loss": 1.7942, + "step": 11604 + }, + { + "epoch": 3.562001227747084, + "grad_norm": 0.2569502890110016, + "learning_rate": 7.460293388860615e-05, + "loss": 1.7824, + "step": 11605 + }, + { + "epoch": 3.562308164518109, + "grad_norm": 0.24004346132278442, + "learning_rate": 7.45986065863342e-05, + "loss": 1.8676, + "step": 11606 + }, + { + "epoch": 3.5626151012891345, + "grad_norm": 0.25446319580078125, + "learning_rate": 7.45942790409644e-05, + "loss": 1.7726, + "step": 11607 + }, + { + "epoch": 3.56292203806016, + "grad_norm": 0.26257482171058655, + "learning_rate": 
7.458995125253951e-05, + "loss": 1.779, + "step": 11608 + }, + { + "epoch": 3.5632289748311847, + "grad_norm": 0.27703070640563965, + "learning_rate": 7.458562322110231e-05, + "loss": 1.8247, + "step": 11609 + }, + { + "epoch": 3.56353591160221, + "grad_norm": 0.25478535890579224, + "learning_rate": 7.458129494669556e-05, + "loss": 1.7794, + "step": 11610 + }, + { + "epoch": 3.563842848373235, + "grad_norm": 0.26173365116119385, + "learning_rate": 7.457696642936207e-05, + "loss": 1.758, + "step": 11611 + }, + { + "epoch": 3.5641497851442603, + "grad_norm": 0.25077274441719055, + "learning_rate": 7.45726376691446e-05, + "loss": 1.8234, + "step": 11612 + }, + { + "epoch": 3.5644567219152856, + "grad_norm": 0.2591109275817871, + "learning_rate": 7.456830866608589e-05, + "loss": 1.7723, + "step": 11613 + }, + { + "epoch": 3.5647636586863105, + "grad_norm": 0.2653447091579437, + "learning_rate": 7.456397942022877e-05, + "loss": 1.7839, + "step": 11614 + }, + { + "epoch": 3.565070595457336, + "grad_norm": 0.3203454911708832, + "learning_rate": 7.455964993161601e-05, + "loss": 1.8548, + "step": 11615 + }, + { + "epoch": 3.5653775322283607, + "grad_norm": 0.3041793704032898, + "learning_rate": 7.455532020029039e-05, + "loss": 1.7925, + "step": 11616 + }, + { + "epoch": 3.565684468999386, + "grad_norm": 0.26066139340400696, + "learning_rate": 7.45509902262947e-05, + "loss": 1.7905, + "step": 11617 + }, + { + "epoch": 3.5659914057704114, + "grad_norm": 0.2483314871788025, + "learning_rate": 7.454666000967174e-05, + "loss": 1.7658, + "step": 11618 + }, + { + "epoch": 3.5662983425414367, + "grad_norm": 0.24285900592803955, + "learning_rate": 7.45423295504643e-05, + "loss": 1.7575, + "step": 11619 + }, + { + "epoch": 3.5666052793124616, + "grad_norm": 0.27231669425964355, + "learning_rate": 7.453799884871517e-05, + "loss": 1.8389, + "step": 11620 + }, + { + "epoch": 3.566912216083487, + "grad_norm": 0.24324406683444977, + "learning_rate": 7.453366790446717e-05, + "loss": 
1.7775, + "step": 11621 + }, + { + "epoch": 3.567219152854512, + "grad_norm": 0.2724440097808838, + "learning_rate": 7.452933671776305e-05, + "loss": 1.8135, + "step": 11622 + }, + { + "epoch": 3.567526089625537, + "grad_norm": 0.22207655012607574, + "learning_rate": 7.452500528864568e-05, + "loss": 1.722, + "step": 11623 + }, + { + "epoch": 3.5678330263965625, + "grad_norm": 0.25650298595428467, + "learning_rate": 7.452067361715782e-05, + "loss": 1.7813, + "step": 11624 + }, + { + "epoch": 3.5681399631675874, + "grad_norm": 0.2582200765609741, + "learning_rate": 7.45163417033423e-05, + "loss": 1.8253, + "step": 11625 + }, + { + "epoch": 3.5684468999386127, + "grad_norm": 0.29545384645462036, + "learning_rate": 7.451200954724188e-05, + "loss": 1.8108, + "step": 11626 + }, + { + "epoch": 3.5687538367096376, + "grad_norm": 0.30457428097724915, + "learning_rate": 7.450767714889946e-05, + "loss": 1.8257, + "step": 11627 + }, + { + "epoch": 3.569060773480663, + "grad_norm": 0.2955166697502136, + "learning_rate": 7.450334450835781e-05, + "loss": 1.8172, + "step": 11628 + }, + { + "epoch": 3.5693677102516883, + "grad_norm": 0.2793857753276825, + "learning_rate": 7.449901162565974e-05, + "loss": 1.8493, + "step": 11629 + }, + { + "epoch": 3.569674647022713, + "grad_norm": 0.27154335379600525, + "learning_rate": 7.449467850084808e-05, + "loss": 1.8306, + "step": 11630 + }, + { + "epoch": 3.5699815837937385, + "grad_norm": 0.22336189448833466, + "learning_rate": 7.449034513396564e-05, + "loss": 1.7435, + "step": 11631 + }, + { + "epoch": 3.5702885205647634, + "grad_norm": 0.22799183428287506, + "learning_rate": 7.448601152505526e-05, + "loss": 1.7818, + "step": 11632 + }, + { + "epoch": 3.5705954573357888, + "grad_norm": 0.26670658588409424, + "learning_rate": 7.448167767415976e-05, + "loss": 1.7777, + "step": 11633 + }, + { + "epoch": 3.570902394106814, + "grad_norm": 0.2848666310310364, + "learning_rate": 7.447734358132196e-05, + "loss": 1.7572, + "step": 11634 + }, + { + 
"epoch": 3.5712093308778394, + "grad_norm": 0.26843544840812683, + "learning_rate": 7.447300924658473e-05, + "loss": 1.7642, + "step": 11635 + }, + { + "epoch": 3.5715162676488643, + "grad_norm": 0.24666404724121094, + "learning_rate": 7.446867466999087e-05, + "loss": 1.7533, + "step": 11636 + }, + { + "epoch": 3.5718232044198897, + "grad_norm": 0.31111210584640503, + "learning_rate": 7.44643398515832e-05, + "loss": 1.7875, + "step": 11637 + }, + { + "epoch": 3.5721301411909145, + "grad_norm": 0.3157108724117279, + "learning_rate": 7.446000479140462e-05, + "loss": 1.7879, + "step": 11638 + }, + { + "epoch": 3.57243707796194, + "grad_norm": 0.2935558259487152, + "learning_rate": 7.445566948949792e-05, + "loss": 1.7819, + "step": 11639 + }, + { + "epoch": 3.572744014732965, + "grad_norm": 0.2265472710132599, + "learning_rate": 7.445133394590597e-05, + "loss": 1.7518, + "step": 11640 + }, + { + "epoch": 3.57305095150399, + "grad_norm": 0.2564176023006439, + "learning_rate": 7.444699816067159e-05, + "loss": 1.7281, + "step": 11641 + }, + { + "epoch": 3.5733578882750154, + "grad_norm": 0.27933555841445923, + "learning_rate": 7.444266213383766e-05, + "loss": 1.7852, + "step": 11642 + }, + { + "epoch": 3.5736648250460403, + "grad_norm": 0.29105356335639954, + "learning_rate": 7.4438325865447e-05, + "loss": 1.8056, + "step": 11643 + }, + { + "epoch": 3.5739717618170657, + "grad_norm": 0.27665549516677856, + "learning_rate": 7.443398935554249e-05, + "loss": 1.7249, + "step": 11644 + }, + { + "epoch": 3.574278698588091, + "grad_norm": 0.21899232268333435, + "learning_rate": 7.442965260416698e-05, + "loss": 1.7689, + "step": 11645 + }, + { + "epoch": 3.574585635359116, + "grad_norm": 0.3250672221183777, + "learning_rate": 7.442531561136333e-05, + "loss": 1.8058, + "step": 11646 + }, + { + "epoch": 3.574892572130141, + "grad_norm": 0.42442524433135986, + "learning_rate": 7.442097837717438e-05, + "loss": 1.7887, + "step": 11647 + }, + { + "epoch": 3.575199508901166, + 
"grad_norm": 0.33108964562416077, + "learning_rate": 7.441664090164302e-05, + "loss": 1.7628, + "step": 11648 + }, + { + "epoch": 3.5755064456721914, + "grad_norm": 0.23050357401371002, + "learning_rate": 7.44123031848121e-05, + "loss": 1.8121, + "step": 11649 + }, + { + "epoch": 3.575813382443217, + "grad_norm": 0.29251593351364136, + "learning_rate": 7.440796522672448e-05, + "loss": 1.8051, + "step": 11650 + }, + { + "epoch": 3.576120319214242, + "grad_norm": 0.3764750063419342, + "learning_rate": 7.440362702742305e-05, + "loss": 1.9002, + "step": 11651 + }, + { + "epoch": 3.576427255985267, + "grad_norm": 0.3751949071884155, + "learning_rate": 7.439928858695069e-05, + "loss": 1.821, + "step": 11652 + }, + { + "epoch": 3.5767341927562923, + "grad_norm": 0.268476665019989, + "learning_rate": 7.439494990535024e-05, + "loss": 1.8241, + "step": 11653 + }, + { + "epoch": 3.5770411295273172, + "grad_norm": 0.3072795271873474, + "learning_rate": 7.439061098266459e-05, + "loss": 1.8169, + "step": 11654 + }, + { + "epoch": 3.5773480662983426, + "grad_norm": 0.4948901832103729, + "learning_rate": 7.438627181893664e-05, + "loss": 1.7706, + "step": 11655 + }, + { + "epoch": 3.577655003069368, + "grad_norm": 0.5892601013183594, + "learning_rate": 7.438193241420926e-05, + "loss": 1.7631, + "step": 11656 + }, + { + "epoch": 3.577961939840393, + "grad_norm": 0.4599401652812958, + "learning_rate": 7.437759276852533e-05, + "loss": 1.7471, + "step": 11657 + }, + { + "epoch": 3.578268876611418, + "grad_norm": 0.2545170783996582, + "learning_rate": 7.437325288192773e-05, + "loss": 1.7945, + "step": 11658 + }, + { + "epoch": 3.578575813382443, + "grad_norm": 0.3136496841907501, + "learning_rate": 7.436891275445938e-05, + "loss": 1.828, + "step": 11659 + }, + { + "epoch": 3.5788827501534684, + "grad_norm": 0.3631688058376312, + "learning_rate": 7.436457238616313e-05, + "loss": 1.8302, + "step": 11660 + }, + { + "epoch": 3.5791896869244937, + "grad_norm": 0.3097386658191681, + 
"learning_rate": 7.436023177708192e-05, + "loss": 1.8397, + "step": 11661 + }, + { + "epoch": 3.5794966236955186, + "grad_norm": 0.20948798954486847, + "learning_rate": 7.43558909272586e-05, + "loss": 1.7844, + "step": 11662 + }, + { + "epoch": 3.579803560466544, + "grad_norm": 0.24327392876148224, + "learning_rate": 7.43515498367361e-05, + "loss": 1.7827, + "step": 11663 + }, + { + "epoch": 3.580110497237569, + "grad_norm": 0.25268325209617615, + "learning_rate": 7.434720850555731e-05, + "loss": 1.8224, + "step": 11664 + }, + { + "epoch": 3.580417434008594, + "grad_norm": 0.24883607029914856, + "learning_rate": 7.434286693376513e-05, + "loss": 1.8189, + "step": 11665 + }, + { + "epoch": 3.5807243707796195, + "grad_norm": 0.2942518889904022, + "learning_rate": 7.433852512140248e-05, + "loss": 1.8325, + "step": 11666 + }, + { + "epoch": 3.581031307550645, + "grad_norm": 0.3556186556816101, + "learning_rate": 7.433418306851225e-05, + "loss": 1.7511, + "step": 11667 + }, + { + "epoch": 3.5813382443216697, + "grad_norm": 0.421220600605011, + "learning_rate": 7.432984077513738e-05, + "loss": 1.8081, + "step": 11668 + }, + { + "epoch": 3.581645181092695, + "grad_norm": 0.3338243067264557, + "learning_rate": 7.432549824132074e-05, + "loss": 1.8274, + "step": 11669 + }, + { + "epoch": 3.58195211786372, + "grad_norm": 0.25091543793678284, + "learning_rate": 7.432115546710528e-05, + "loss": 1.7637, + "step": 11670 + }, + { + "epoch": 3.5822590546347453, + "grad_norm": 0.29870370030403137, + "learning_rate": 7.431681245253389e-05, + "loss": 1.8036, + "step": 11671 + }, + { + "epoch": 3.5825659914057706, + "grad_norm": 0.2682137191295624, + "learning_rate": 7.431246919764953e-05, + "loss": 1.8252, + "step": 11672 + }, + { + "epoch": 3.5828729281767955, + "grad_norm": 0.28790801763534546, + "learning_rate": 7.430812570249508e-05, + "loss": 1.7713, + "step": 11673 + }, + { + "epoch": 3.583179864947821, + "grad_norm": 0.26357609033584595, + "learning_rate": 7.43037819671135e-05, 
+ "loss": 1.8388, + "step": 11674 + }, + { + "epoch": 3.5834868017188457, + "grad_norm": 0.2505483031272888, + "learning_rate": 7.42994379915477e-05, + "loss": 1.7722, + "step": 11675 + }, + { + "epoch": 3.583793738489871, + "grad_norm": 0.2535844147205353, + "learning_rate": 7.42950937758406e-05, + "loss": 1.756, + "step": 11676 + }, + { + "epoch": 3.5841006752608964, + "grad_norm": 0.23045027256011963, + "learning_rate": 7.429074932003515e-05, + "loss": 1.791, + "step": 11677 + }, + { + "epoch": 3.5844076120319213, + "grad_norm": 0.22525762021541595, + "learning_rate": 7.428640462417428e-05, + "loss": 1.7234, + "step": 11678 + }, + { + "epoch": 3.5847145488029466, + "grad_norm": 0.2402270883321762, + "learning_rate": 7.428205968830094e-05, + "loss": 1.845, + "step": 11679 + }, + { + "epoch": 3.5850214855739715, + "grad_norm": 0.24909646809101105, + "learning_rate": 7.427771451245802e-05, + "loss": 1.8537, + "step": 11680 + }, + { + "epoch": 3.585328422344997, + "grad_norm": 0.25813063979148865, + "learning_rate": 7.427336909668853e-05, + "loss": 1.7353, + "step": 11681 + }, + { + "epoch": 3.585635359116022, + "grad_norm": 0.26073768734931946, + "learning_rate": 7.426902344103534e-05, + "loss": 1.8142, + "step": 11682 + }, + { + "epoch": 3.5859422958870475, + "grad_norm": 0.2498280256986618, + "learning_rate": 7.426467754554147e-05, + "loss": 1.7996, + "step": 11683 + }, + { + "epoch": 3.5862492326580724, + "grad_norm": 0.3131188154220581, + "learning_rate": 7.426033141024981e-05, + "loss": 1.7793, + "step": 11684 + }, + { + "epoch": 3.5865561694290977, + "grad_norm": 0.24118199944496155, + "learning_rate": 7.425598503520337e-05, + "loss": 1.8249, + "step": 11685 + }, + { + "epoch": 3.5868631062001226, + "grad_norm": 0.2791197597980499, + "learning_rate": 7.425163842044504e-05, + "loss": 1.7966, + "step": 11686 + }, + { + "epoch": 3.587170042971148, + "grad_norm": 0.2298576384782791, + "learning_rate": 7.424729156601781e-05, + "loss": 1.7224, + "step": 11687 + }, 
+ { + "epoch": 3.5874769797421733, + "grad_norm": 0.23113438487052917, + "learning_rate": 7.424294447196462e-05, + "loss": 1.7641, + "step": 11688 + }, + { + "epoch": 3.587783916513198, + "grad_norm": 0.3064495027065277, + "learning_rate": 7.423859713832847e-05, + "loss": 1.8688, + "step": 11689 + }, + { + "epoch": 3.5880908532842235, + "grad_norm": 0.22847676277160645, + "learning_rate": 7.423424956515228e-05, + "loss": 1.7513, + "step": 11690 + }, + { + "epoch": 3.5883977900552484, + "grad_norm": 0.2797350585460663, + "learning_rate": 7.422990175247905e-05, + "loss": 1.8268, + "step": 11691 + }, + { + "epoch": 3.5887047268262737, + "grad_norm": 0.2753821313381195, + "learning_rate": 7.422555370035171e-05, + "loss": 1.7313, + "step": 11692 + }, + { + "epoch": 3.589011663597299, + "grad_norm": 0.2981179654598236, + "learning_rate": 7.422120540881326e-05, + "loss": 1.8455, + "step": 11693 + }, + { + "epoch": 3.5893186003683244, + "grad_norm": 0.33028867840766907, + "learning_rate": 7.421685687790667e-05, + "loss": 1.8397, + "step": 11694 + }, + { + "epoch": 3.5896255371393493, + "grad_norm": 0.409173846244812, + "learning_rate": 7.421250810767487e-05, + "loss": 1.8088, + "step": 11695 + }, + { + "epoch": 3.5899324739103746, + "grad_norm": 0.4118194878101349, + "learning_rate": 7.42081590981609e-05, + "loss": 1.7719, + "step": 11696 + }, + { + "epoch": 3.5902394106813995, + "grad_norm": 0.34716179966926575, + "learning_rate": 7.420380984940773e-05, + "loss": 1.8063, + "step": 11697 + }, + { + "epoch": 3.590546347452425, + "grad_norm": 0.27763083577156067, + "learning_rate": 7.419946036145829e-05, + "loss": 1.7777, + "step": 11698 + }, + { + "epoch": 3.59085328422345, + "grad_norm": 0.3175280690193176, + "learning_rate": 7.419511063435562e-05, + "loss": 1.697, + "step": 11699 + }, + { + "epoch": 3.591160220994475, + "grad_norm": 0.3151503801345825, + "learning_rate": 7.419076066814268e-05, + "loss": 1.8067, + "step": 11700 + }, + { + "epoch": 3.5914671577655004, + 
"grad_norm": 0.26914867758750916, + "learning_rate": 7.418641046286245e-05, + "loss": 1.7797, + "step": 11701 + }, + { + "epoch": 3.5917740945365253, + "grad_norm": 0.27231964468955994, + "learning_rate": 7.418206001855797e-05, + "loss": 1.7931, + "step": 11702 + }, + { + "epoch": 3.5920810313075506, + "grad_norm": 0.3352177143096924, + "learning_rate": 7.417770933527217e-05, + "loss": 1.9187, + "step": 11703 + }, + { + "epoch": 3.592387968078576, + "grad_norm": 0.3510081470012665, + "learning_rate": 7.417335841304808e-05, + "loss": 1.7889, + "step": 11704 + }, + { + "epoch": 3.592694904849601, + "grad_norm": 0.24949313700199127, + "learning_rate": 7.41690072519287e-05, + "loss": 1.7683, + "step": 11705 + }, + { + "epoch": 3.593001841620626, + "grad_norm": 0.28442221879959106, + "learning_rate": 7.416465585195702e-05, + "loss": 1.7889, + "step": 11706 + }, + { + "epoch": 3.593308778391651, + "grad_norm": 0.3355824649333954, + "learning_rate": 7.416030421317605e-05, + "loss": 1.7637, + "step": 11707 + }, + { + "epoch": 3.5936157151626764, + "grad_norm": 0.33569446206092834, + "learning_rate": 7.415595233562878e-05, + "loss": 1.919, + "step": 11708 + }, + { + "epoch": 3.5939226519337018, + "grad_norm": 0.2488354742527008, + "learning_rate": 7.415160021935825e-05, + "loss": 1.8424, + "step": 11709 + }, + { + "epoch": 3.594229588704727, + "grad_norm": 0.2701130509376526, + "learning_rate": 7.414724786440746e-05, + "loss": 1.7586, + "step": 11710 + }, + { + "epoch": 3.594536525475752, + "grad_norm": 0.26289790868759155, + "learning_rate": 7.414289527081939e-05, + "loss": 1.7975, + "step": 11711 + }, + { + "epoch": 3.5948434622467773, + "grad_norm": 0.25382301211357117, + "learning_rate": 7.413854243863707e-05, + "loss": 1.7393, + "step": 11712 + }, + { + "epoch": 3.595150399017802, + "grad_norm": 0.28282979130744934, + "learning_rate": 7.413418936790357e-05, + "loss": 1.8048, + "step": 11713 + }, + { + "epoch": 3.5954573357888275, + "grad_norm": 0.28001347184181213, + 
"learning_rate": 7.412983605866183e-05, + "loss": 1.7864, + "step": 11714 + }, + { + "epoch": 3.595764272559853, + "grad_norm": 0.26107707619667053, + "learning_rate": 7.412548251095491e-05, + "loss": 1.8016, + "step": 11715 + }, + { + "epoch": 3.5960712093308778, + "grad_norm": 0.2518761456012726, + "learning_rate": 7.412112872482583e-05, + "loss": 1.7565, + "step": 11716 + }, + { + "epoch": 3.596378146101903, + "grad_norm": 0.25911152362823486, + "learning_rate": 7.411677470031762e-05, + "loss": 1.8333, + "step": 11717 + }, + { + "epoch": 3.596685082872928, + "grad_norm": 0.3411506414413452, + "learning_rate": 7.41124204374733e-05, + "loss": 1.8027, + "step": 11718 + }, + { + "epoch": 3.5969920196439533, + "grad_norm": 0.28535547852516174, + "learning_rate": 7.410806593633593e-05, + "loss": 1.7596, + "step": 11719 + }, + { + "epoch": 3.5972989564149787, + "grad_norm": 0.24665530025959015, + "learning_rate": 7.410371119694852e-05, + "loss": 1.7777, + "step": 11720 + }, + { + "epoch": 3.5976058931860035, + "grad_norm": 0.29162275791168213, + "learning_rate": 7.40993562193541e-05, + "loss": 1.795, + "step": 11721 + }, + { + "epoch": 3.597912829957029, + "grad_norm": 0.2712220549583435, + "learning_rate": 7.409500100359573e-05, + "loss": 1.824, + "step": 11722 + }, + { + "epoch": 3.5982197667280538, + "grad_norm": 0.239755779504776, + "learning_rate": 7.40906455497164e-05, + "loss": 1.7534, + "step": 11723 + }, + { + "epoch": 3.598526703499079, + "grad_norm": 0.26056957244873047, + "learning_rate": 7.408628985775922e-05, + "loss": 1.757, + "step": 11724 + }, + { + "epoch": 3.5988336402701044, + "grad_norm": 0.3230258822441101, + "learning_rate": 7.40819339277672e-05, + "loss": 1.8684, + "step": 11725 + }, + { + "epoch": 3.5991405770411298, + "grad_norm": 0.26070696115493774, + "learning_rate": 7.407757775978339e-05, + "loss": 1.7868, + "step": 11726 + }, + { + "epoch": 3.5994475138121547, + "grad_norm": 0.24940893054008484, + "learning_rate": 7.407322135385085e-05, + 
"loss": 1.8391, + "step": 11727 + }, + { + "epoch": 3.59975445058318, + "grad_norm": 0.2717723250389099, + "learning_rate": 7.406886471001263e-05, + "loss": 1.7567, + "step": 11728 + }, + { + "epoch": 3.600061387354205, + "grad_norm": 0.2328445315361023, + "learning_rate": 7.406450782831177e-05, + "loss": 1.7761, + "step": 11729 + }, + { + "epoch": 3.6003683241252302, + "grad_norm": 0.2740287184715271, + "learning_rate": 7.406015070879136e-05, + "loss": 1.8599, + "step": 11730 + }, + { + "epoch": 3.6006752608962556, + "grad_norm": 0.2930558919906616, + "learning_rate": 7.405579335149441e-05, + "loss": 1.852, + "step": 11731 + }, + { + "epoch": 3.6009821976672804, + "grad_norm": 0.30175161361694336, + "learning_rate": 7.405143575646403e-05, + "loss": 1.8861, + "step": 11732 + }, + { + "epoch": 3.601289134438306, + "grad_norm": 0.2617531418800354, + "learning_rate": 7.404707792374328e-05, + "loss": 1.7598, + "step": 11733 + }, + { + "epoch": 3.6015960712093307, + "grad_norm": 0.25384122133255005, + "learning_rate": 7.404271985337517e-05, + "loss": 1.7634, + "step": 11734 + }, + { + "epoch": 3.601903007980356, + "grad_norm": 0.31706711649894714, + "learning_rate": 7.403836154540284e-05, + "loss": 1.8125, + "step": 11735 + }, + { + "epoch": 3.6022099447513813, + "grad_norm": 0.299662709236145, + "learning_rate": 7.403400299986932e-05, + "loss": 1.748, + "step": 11736 + }, + { + "epoch": 3.6025168815224062, + "grad_norm": 0.23828944563865662, + "learning_rate": 7.40296442168177e-05, + "loss": 1.7473, + "step": 11737 + }, + { + "epoch": 3.6028238182934316, + "grad_norm": 0.22611604630947113, + "learning_rate": 7.402528519629106e-05, + "loss": 1.7519, + "step": 11738 + }, + { + "epoch": 3.6031307550644565, + "grad_norm": 0.28498536348342896, + "learning_rate": 7.402092593833246e-05, + "loss": 1.7792, + "step": 11739 + }, + { + "epoch": 3.603437691835482, + "grad_norm": 0.2404283881187439, + "learning_rate": 7.4016566442985e-05, + "loss": 1.7434, + "step": 11740 + }, + { + 
"epoch": 3.603744628606507, + "grad_norm": 0.2291589230298996, + "learning_rate": 7.401220671029173e-05, + "loss": 1.7623, + "step": 11741 + }, + { + "epoch": 3.6040515653775325, + "grad_norm": 0.23962698876857758, + "learning_rate": 7.400784674029578e-05, + "loss": 1.7232, + "step": 11742 + }, + { + "epoch": 3.6043585021485574, + "grad_norm": 0.3015185594558716, + "learning_rate": 7.400348653304022e-05, + "loss": 1.7808, + "step": 11743 + }, + { + "epoch": 3.6046654389195827, + "grad_norm": 0.30623099207878113, + "learning_rate": 7.399912608856813e-05, + "loss": 1.8518, + "step": 11744 + }, + { + "epoch": 3.6049723756906076, + "grad_norm": 0.2698235511779785, + "learning_rate": 7.39947654069226e-05, + "loss": 1.7829, + "step": 11745 + }, + { + "epoch": 3.605279312461633, + "grad_norm": 0.2195274829864502, + "learning_rate": 7.399040448814674e-05, + "loss": 1.7709, + "step": 11746 + }, + { + "epoch": 3.6055862492326582, + "grad_norm": 0.22962357103824615, + "learning_rate": 7.398604333228366e-05, + "loss": 1.7482, + "step": 11747 + }, + { + "epoch": 3.605893186003683, + "grad_norm": 0.2403932511806488, + "learning_rate": 7.398168193937642e-05, + "loss": 1.8063, + "step": 11748 + }, + { + "epoch": 3.6062001227747085, + "grad_norm": 0.23542718589305878, + "learning_rate": 7.397732030946816e-05, + "loss": 1.7599, + "step": 11749 + }, + { + "epoch": 3.6065070595457334, + "grad_norm": 0.2462490350008011, + "learning_rate": 7.397295844260195e-05, + "loss": 1.8183, + "step": 11750 + }, + { + "epoch": 3.6068139963167587, + "grad_norm": 0.21428349614143372, + "learning_rate": 7.396859633882091e-05, + "loss": 1.6944, + "step": 11751 + }, + { + "epoch": 3.607120933087784, + "grad_norm": 0.21240907907485962, + "learning_rate": 7.396423399816817e-05, + "loss": 1.7795, + "step": 11752 + }, + { + "epoch": 3.607427869858809, + "grad_norm": 0.23413677513599396, + "learning_rate": 7.395987142068682e-05, + "loss": 1.8015, + "step": 11753 + }, + { + "epoch": 3.6077348066298343, + 
"grad_norm": 0.26724907755851746, + "learning_rate": 7.395550860641998e-05, + "loss": 1.8174, + "step": 11754 + }, + { + "epoch": 3.608041743400859, + "grad_norm": 0.22077679634094238, + "learning_rate": 7.395114555541077e-05, + "loss": 1.7929, + "step": 11755 + }, + { + "epoch": 3.6083486801718845, + "grad_norm": 0.2475263774394989, + "learning_rate": 7.394678226770228e-05, + "loss": 1.7744, + "step": 11756 + }, + { + "epoch": 3.60865561694291, + "grad_norm": 0.22579342126846313, + "learning_rate": 7.394241874333764e-05, + "loss": 1.79, + "step": 11757 + }, + { + "epoch": 3.608962553713935, + "grad_norm": 0.26798152923583984, + "learning_rate": 7.393805498236001e-05, + "loss": 1.8087, + "step": 11758 + }, + { + "epoch": 3.60926949048496, + "grad_norm": 0.2755621373653412, + "learning_rate": 7.393369098481248e-05, + "loss": 1.7834, + "step": 11759 + }, + { + "epoch": 3.6095764272559854, + "grad_norm": 0.2741812467575073, + "learning_rate": 7.39293267507382e-05, + "loss": 1.7948, + "step": 11760 + }, + { + "epoch": 3.6098833640270103, + "grad_norm": 0.2378924936056137, + "learning_rate": 7.392496228018028e-05, + "loss": 1.8317, + "step": 11761 + }, + { + "epoch": 3.6101903007980356, + "grad_norm": 0.2628132700920105, + "learning_rate": 7.392059757318187e-05, + "loss": 1.8123, + "step": 11762 + }, + { + "epoch": 3.610497237569061, + "grad_norm": 0.2613002359867096, + "learning_rate": 7.391623262978607e-05, + "loss": 1.795, + "step": 11763 + }, + { + "epoch": 3.610804174340086, + "grad_norm": 0.27272161841392517, + "learning_rate": 7.391186745003608e-05, + "loss": 1.7808, + "step": 11764 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.21366162598133087, + "learning_rate": 7.390750203397497e-05, + "loss": 1.77, + "step": 11765 + }, + { + "epoch": 3.611418047882136, + "grad_norm": 0.25559261441230774, + "learning_rate": 7.390313638164593e-05, + "loss": 1.8442, + "step": 11766 + }, + { + "epoch": 3.6117249846531614, + "grad_norm": 0.23794838786125183, + 
"learning_rate": 7.389877049309207e-05, + "loss": 1.8237, + "step": 11767 + }, + { + "epoch": 3.6120319214241867, + "grad_norm": 0.2690154016017914, + "learning_rate": 7.389440436835656e-05, + "loss": 1.8194, + "step": 11768 + }, + { + "epoch": 3.612338858195212, + "grad_norm": 0.26148009300231934, + "learning_rate": 7.389003800748254e-05, + "loss": 1.7862, + "step": 11769 + }, + { + "epoch": 3.612645794966237, + "grad_norm": 0.26414936780929565, + "learning_rate": 7.388567141051315e-05, + "loss": 1.7815, + "step": 11770 + }, + { + "epoch": 3.6129527317372623, + "grad_norm": 0.24473857879638672, + "learning_rate": 7.388130457749157e-05, + "loss": 1.801, + "step": 11771 + }, + { + "epoch": 3.613259668508287, + "grad_norm": 0.24356001615524292, + "learning_rate": 7.387693750846094e-05, + "loss": 1.8031, + "step": 11772 + }, + { + "epoch": 3.6135666052793125, + "grad_norm": 0.26716411113739014, + "learning_rate": 7.387257020346441e-05, + "loss": 1.7999, + "step": 11773 + }, + { + "epoch": 3.613873542050338, + "grad_norm": 0.2730760872364044, + "learning_rate": 7.386820266254516e-05, + "loss": 1.8079, + "step": 11774 + }, + { + "epoch": 3.6141804788213627, + "grad_norm": 0.2570728361606598, + "learning_rate": 7.386383488574635e-05, + "loss": 1.7374, + "step": 11775 + }, + { + "epoch": 3.614487415592388, + "grad_norm": 0.24992883205413818, + "learning_rate": 7.385946687311112e-05, + "loss": 1.8432, + "step": 11776 + }, + { + "epoch": 3.614794352363413, + "grad_norm": 0.28632259368896484, + "learning_rate": 7.385509862468266e-05, + "loss": 1.8014, + "step": 11777 + }, + { + "epoch": 3.6151012891344383, + "grad_norm": 0.257303923368454, + "learning_rate": 7.385073014050412e-05, + "loss": 1.8166, + "step": 11778 + }, + { + "epoch": 3.6154082259054636, + "grad_norm": 0.2791872024536133, + "learning_rate": 7.38463614206187e-05, + "loss": 1.7865, + "step": 11779 + }, + { + "epoch": 3.6157151626764885, + "grad_norm": 0.25708603858947754, + "learning_rate": 
7.384199246506956e-05, + "loss": 1.807, + "step": 11780 + }, + { + "epoch": 3.616022099447514, + "grad_norm": 0.28693172335624695, + "learning_rate": 7.383762327389988e-05, + "loss": 1.8049, + "step": 11781 + }, + { + "epoch": 3.6163290362185387, + "grad_norm": 0.2731167674064636, + "learning_rate": 7.383325384715283e-05, + "loss": 1.8937, + "step": 11782 + }, + { + "epoch": 3.616635972989564, + "grad_norm": 0.26151663064956665, + "learning_rate": 7.38288841848716e-05, + "loss": 1.8288, + "step": 11783 + }, + { + "epoch": 3.6169429097605894, + "grad_norm": 0.2732257843017578, + "learning_rate": 7.382451428709936e-05, + "loss": 1.7668, + "step": 11784 + }, + { + "epoch": 3.6172498465316147, + "grad_norm": 0.2747575640678406, + "learning_rate": 7.38201441538793e-05, + "loss": 1.7991, + "step": 11785 + }, + { + "epoch": 3.6175567833026396, + "grad_norm": 0.2884783446788788, + "learning_rate": 7.381577378525462e-05, + "loss": 1.7798, + "step": 11786 + }, + { + "epoch": 3.617863720073665, + "grad_norm": 0.2716344892978668, + "learning_rate": 7.381140318126851e-05, + "loss": 1.7923, + "step": 11787 + }, + { + "epoch": 3.61817065684469, + "grad_norm": 0.3007747232913971, + "learning_rate": 7.380703234196416e-05, + "loss": 1.8397, + "step": 11788 + }, + { + "epoch": 3.618477593615715, + "grad_norm": 0.39218056201934814, + "learning_rate": 7.380266126738476e-05, + "loss": 1.8517, + "step": 11789 + }, + { + "epoch": 3.6187845303867405, + "grad_norm": 0.43425866961479187, + "learning_rate": 7.379828995757351e-05, + "loss": 1.7518, + "step": 11790 + }, + { + "epoch": 3.6190914671577654, + "grad_norm": 0.34399518370628357, + "learning_rate": 7.37939184125736e-05, + "loss": 1.7607, + "step": 11791 + }, + { + "epoch": 3.6193984039287908, + "grad_norm": 0.23124302923679352, + "learning_rate": 7.378954663242825e-05, + "loss": 1.7898, + "step": 11792 + }, + { + "epoch": 3.6197053406998156, + "grad_norm": 0.32839757204055786, + "learning_rate": 7.378517461718066e-05, + "loss": 
1.7472, + "step": 11793 + }, + { + "epoch": 3.620012277470841, + "grad_norm": 0.38583460450172424, + "learning_rate": 7.378080236687403e-05, + "loss": 1.7947, + "step": 11794 + }, + { + "epoch": 3.6203192142418663, + "grad_norm": 0.4622896909713745, + "learning_rate": 7.377642988155157e-05, + "loss": 1.9023, + "step": 11795 + }, + { + "epoch": 3.620626151012891, + "grad_norm": 0.3783189058303833, + "learning_rate": 7.37720571612565e-05, + "loss": 1.7813, + "step": 11796 + }, + { + "epoch": 3.6209330877839165, + "grad_norm": 0.3468814790248871, + "learning_rate": 7.376768420603204e-05, + "loss": 1.7509, + "step": 11797 + }, + { + "epoch": 3.6212400245549414, + "grad_norm": 0.2602507174015045, + "learning_rate": 7.376331101592138e-05, + "loss": 1.8158, + "step": 11798 + }, + { + "epoch": 3.6215469613259668, + "grad_norm": 0.28337883949279785, + "learning_rate": 7.375893759096775e-05, + "loss": 1.7755, + "step": 11799 + }, + { + "epoch": 3.621853898096992, + "grad_norm": 0.3644609749317169, + "learning_rate": 7.375456393121437e-05, + "loss": 1.8193, + "step": 11800 + }, + { + "epoch": 3.6221608348680174, + "grad_norm": 0.338211327791214, + "learning_rate": 7.375019003670448e-05, + "loss": 1.821, + "step": 11801 + }, + { + "epoch": 3.6224677716390423, + "grad_norm": 0.23850654065608978, + "learning_rate": 7.374581590748129e-05, + "loss": 1.7317, + "step": 11802 + }, + { + "epoch": 3.6227747084100677, + "grad_norm": 0.3496716618537903, + "learning_rate": 7.374144154358801e-05, + "loss": 1.8361, + "step": 11803 + }, + { + "epoch": 3.6230816451810925, + "grad_norm": 0.5585216283798218, + "learning_rate": 7.37370669450679e-05, + "loss": 1.7667, + "step": 11804 + }, + { + "epoch": 3.623388581952118, + "grad_norm": 0.4578089714050293, + "learning_rate": 7.373269211196418e-05, + "loss": 1.8051, + "step": 11805 + }, + { + "epoch": 3.623695518723143, + "grad_norm": 0.28195759654045105, + "learning_rate": 7.37283170443201e-05, + "loss": 1.7823, + "step": 11806 + }, + { + 
"epoch": 3.624002455494168, + "grad_norm": 0.4066108465194702, + "learning_rate": 7.372394174217887e-05, + "loss": 1.7819, + "step": 11807 + }, + { + "epoch": 3.6243093922651934, + "grad_norm": 0.5368703007698059, + "learning_rate": 7.371956620558375e-05, + "loss": 1.8121, + "step": 11808 + }, + { + "epoch": 3.6246163290362183, + "grad_norm": 0.36627063155174255, + "learning_rate": 7.371519043457795e-05, + "loss": 1.7944, + "step": 11809 + }, + { + "epoch": 3.6249232658072437, + "grad_norm": 0.3100780248641968, + "learning_rate": 7.371081442920476e-05, + "loss": 1.783, + "step": 11810 + }, + { + "epoch": 3.625230202578269, + "grad_norm": 0.3277178704738617, + "learning_rate": 7.370643818950741e-05, + "loss": 1.8105, + "step": 11811 + }, + { + "epoch": 3.625537139349294, + "grad_norm": 0.3887772560119629, + "learning_rate": 7.370206171552914e-05, + "loss": 1.8136, + "step": 11812 + }, + { + "epoch": 3.6258440761203192, + "grad_norm": 0.2770824134349823, + "learning_rate": 7.36976850073132e-05, + "loss": 1.7852, + "step": 11813 + }, + { + "epoch": 3.626151012891344, + "grad_norm": 0.26357728242874146, + "learning_rate": 7.369330806490284e-05, + "loss": 1.7621, + "step": 11814 + }, + { + "epoch": 3.6264579496623695, + "grad_norm": 0.3387344181537628, + "learning_rate": 7.368893088834135e-05, + "loss": 1.7785, + "step": 11815 + }, + { + "epoch": 3.626764886433395, + "grad_norm": 0.35155174136161804, + "learning_rate": 7.368455347767193e-05, + "loss": 1.8081, + "step": 11816 + }, + { + "epoch": 3.62707182320442, + "grad_norm": 0.2855289876461029, + "learning_rate": 7.368017583293788e-05, + "loss": 1.8245, + "step": 11817 + }, + { + "epoch": 3.627378759975445, + "grad_norm": 0.28462162613868713, + "learning_rate": 7.367579795418245e-05, + "loss": 1.8066, + "step": 11818 + }, + { + "epoch": 3.6276856967464703, + "grad_norm": 0.40696555376052856, + "learning_rate": 7.367141984144891e-05, + "loss": 1.8897, + "step": 11819 + }, + { + "epoch": 3.6279926335174952, + 
"grad_norm": 0.472782701253891, + "learning_rate": 7.366704149478054e-05, + "loss": 1.8071, + "step": 11820 + }, + { + "epoch": 3.6282995702885206, + "grad_norm": 0.27022916078567505, + "learning_rate": 7.366266291422057e-05, + "loss": 1.8574, + "step": 11821 + }, + { + "epoch": 3.628606507059546, + "grad_norm": 0.4207148253917694, + "learning_rate": 7.365828409981231e-05, + "loss": 1.7759, + "step": 11822 + }, + { + "epoch": 3.628913443830571, + "grad_norm": 0.42866072058677673, + "learning_rate": 7.365390505159902e-05, + "loss": 1.7366, + "step": 11823 + }, + { + "epoch": 3.629220380601596, + "grad_norm": 0.28288859128952026, + "learning_rate": 7.364952576962398e-05, + "loss": 1.8591, + "step": 11824 + }, + { + "epoch": 3.629527317372621, + "grad_norm": 0.30544906854629517, + "learning_rate": 7.364514625393045e-05, + "loss": 1.7965, + "step": 11825 + }, + { + "epoch": 3.6298342541436464, + "grad_norm": 0.3251616954803467, + "learning_rate": 7.364076650456173e-05, + "loss": 1.8197, + "step": 11826 + }, + { + "epoch": 3.6301411909146717, + "grad_norm": 0.3133888840675354, + "learning_rate": 7.363638652156109e-05, + "loss": 1.7978, + "step": 11827 + }, + { + "epoch": 3.630448127685697, + "grad_norm": 0.29004594683647156, + "learning_rate": 7.363200630497185e-05, + "loss": 1.8035, + "step": 11828 + }, + { + "epoch": 3.630755064456722, + "grad_norm": 0.2781279683113098, + "learning_rate": 7.362762585483725e-05, + "loss": 1.8462, + "step": 11829 + }, + { + "epoch": 3.6310620012277472, + "grad_norm": 0.29003822803497314, + "learning_rate": 7.362324517120063e-05, + "loss": 1.7952, + "step": 11830 + }, + { + "epoch": 3.631368937998772, + "grad_norm": 0.2510940134525299, + "learning_rate": 7.361886425410524e-05, + "loss": 1.7645, + "step": 11831 + }, + { + "epoch": 3.6316758747697975, + "grad_norm": 0.23798540234565735, + "learning_rate": 7.361448310359438e-05, + "loss": 1.7329, + "step": 11832 + }, + { + "epoch": 3.631982811540823, + "grad_norm": 0.2711278796195984, + 
"learning_rate": 7.361010171971137e-05, + "loss": 1.8245, + "step": 11833 + }, + { + "epoch": 3.6322897483118477, + "grad_norm": 0.2895669639110565, + "learning_rate": 7.360572010249949e-05, + "loss": 1.7668, + "step": 11834 + }, + { + "epoch": 3.632596685082873, + "grad_norm": 0.2216273844242096, + "learning_rate": 7.360133825200205e-05, + "loss": 1.8164, + "step": 11835 + }, + { + "epoch": 3.632903621853898, + "grad_norm": 0.3075082302093506, + "learning_rate": 7.359695616826236e-05, + "loss": 1.8159, + "step": 11836 + }, + { + "epoch": 3.6332105586249233, + "grad_norm": 0.3208801746368408, + "learning_rate": 7.35925738513237e-05, + "loss": 1.8385, + "step": 11837 + }, + { + "epoch": 3.6335174953959486, + "grad_norm": 0.272517591714859, + "learning_rate": 7.35881913012294e-05, + "loss": 1.7653, + "step": 11838 + }, + { + "epoch": 3.6338244321669735, + "grad_norm": 0.23105360567569733, + "learning_rate": 7.358380851802277e-05, + "loss": 1.7697, + "step": 11839 + }, + { + "epoch": 3.634131368937999, + "grad_norm": 0.2643153667449951, + "learning_rate": 7.357942550174714e-05, + "loss": 1.7885, + "step": 11840 + }, + { + "epoch": 3.6344383057090237, + "grad_norm": 0.22643202543258667, + "learning_rate": 7.357504225244579e-05, + "loss": 1.746, + "step": 11841 + }, + { + "epoch": 3.634745242480049, + "grad_norm": 0.27782970666885376, + "learning_rate": 7.357065877016207e-05, + "loss": 1.794, + "step": 11842 + }, + { + "epoch": 3.6350521792510744, + "grad_norm": 0.3035561740398407, + "learning_rate": 7.356627505493925e-05, + "loss": 1.7892, + "step": 11843 + }, + { + "epoch": 3.6353591160220997, + "grad_norm": 0.31859731674194336, + "learning_rate": 7.356189110682072e-05, + "loss": 1.7636, + "step": 11844 + }, + { + "epoch": 3.6356660527931246, + "grad_norm": 0.2960890233516693, + "learning_rate": 7.355750692584977e-05, + "loss": 1.8294, + "step": 11845 + }, + { + "epoch": 3.63597298956415, + "grad_norm": 0.2544194459915161, + "learning_rate": 7.355312251206972e-05, + 
"loss": 1.7603, + "step": 11846 + }, + { + "epoch": 3.636279926335175, + "grad_norm": 0.27864789962768555, + "learning_rate": 7.354873786552391e-05, + "loss": 1.7917, + "step": 11847 + }, + { + "epoch": 3.6365868631062, + "grad_norm": 0.32552552223205566, + "learning_rate": 7.354435298625568e-05, + "loss": 1.7769, + "step": 11848 + }, + { + "epoch": 3.6368937998772255, + "grad_norm": 0.25094640254974365, + "learning_rate": 7.353996787430833e-05, + "loss": 1.8371, + "step": 11849 + }, + { + "epoch": 3.6372007366482504, + "grad_norm": 0.26656433939933777, + "learning_rate": 7.353558252972524e-05, + "loss": 1.7686, + "step": 11850 + }, + { + "epoch": 3.6375076734192757, + "grad_norm": 0.3023635745048523, + "learning_rate": 7.353119695254973e-05, + "loss": 1.7892, + "step": 11851 + }, + { + "epoch": 3.6378146101903006, + "grad_norm": 0.2822463810443878, + "learning_rate": 7.352681114282514e-05, + "loss": 1.8221, + "step": 11852 + }, + { + "epoch": 3.638121546961326, + "grad_norm": 0.31159496307373047, + "learning_rate": 7.35224251005948e-05, + "loss": 1.803, + "step": 11853 + }, + { + "epoch": 3.6384284837323513, + "grad_norm": 0.3133087158203125, + "learning_rate": 7.351803882590207e-05, + "loss": 1.744, + "step": 11854 + }, + { + "epoch": 3.638735420503376, + "grad_norm": 0.3050002455711365, + "learning_rate": 7.351365231879029e-05, + "loss": 1.7522, + "step": 11855 + }, + { + "epoch": 3.6390423572744015, + "grad_norm": 0.2729037404060364, + "learning_rate": 7.350926557930283e-05, + "loss": 1.7629, + "step": 11856 + }, + { + "epoch": 3.6393492940454264, + "grad_norm": 0.3181995153427124, + "learning_rate": 7.350487860748303e-05, + "loss": 1.7603, + "step": 11857 + }, + { + "epoch": 3.6396562308164517, + "grad_norm": 0.352651447057724, + "learning_rate": 7.350049140337423e-05, + "loss": 1.8177, + "step": 11858 + }, + { + "epoch": 3.639963167587477, + "grad_norm": 0.22935177385807037, + "learning_rate": 7.349610396701981e-05, + "loss": 1.7421, + "step": 11859 + }, + { 
+ "epoch": 3.6402701043585024, + "grad_norm": 0.26442599296569824, + "learning_rate": 7.349171629846312e-05, + "loss": 1.8026, + "step": 11860 + }, + { + "epoch": 3.6405770411295273, + "grad_norm": 0.25357648730278015, + "learning_rate": 7.348732839774751e-05, + "loss": 1.788, + "step": 11861 + }, + { + "epoch": 3.6408839779005526, + "grad_norm": 0.26959577202796936, + "learning_rate": 7.348294026491635e-05, + "loss": 1.884, + "step": 11862 + }, + { + "epoch": 3.6411909146715775, + "grad_norm": 0.2243001013994217, + "learning_rate": 7.347855190001304e-05, + "loss": 1.7765, + "step": 11863 + }, + { + "epoch": 3.641497851442603, + "grad_norm": 0.2480708807706833, + "learning_rate": 7.34741633030809e-05, + "loss": 1.7597, + "step": 11864 + }, + { + "epoch": 3.641804788213628, + "grad_norm": 0.22512994706630707, + "learning_rate": 7.346977447416332e-05, + "loss": 1.7647, + "step": 11865 + }, + { + "epoch": 3.642111724984653, + "grad_norm": 0.24961981177330017, + "learning_rate": 7.346538541330368e-05, + "loss": 1.8178, + "step": 11866 + }, + { + "epoch": 3.6424186617556784, + "grad_norm": 0.320896714925766, + "learning_rate": 7.346099612054533e-05, + "loss": 1.85, + "step": 11867 + }, + { + "epoch": 3.6427255985267033, + "grad_norm": 0.3420880436897278, + "learning_rate": 7.345660659593167e-05, + "loss": 1.8661, + "step": 11868 + }, + { + "epoch": 3.6430325352977286, + "grad_norm": 0.2675844132900238, + "learning_rate": 7.34522168395061e-05, + "loss": 1.8177, + "step": 11869 + }, + { + "epoch": 3.643339472068754, + "grad_norm": 0.23993943631649017, + "learning_rate": 7.344782685131195e-05, + "loss": 1.7365, + "step": 11870 + }, + { + "epoch": 3.643646408839779, + "grad_norm": 0.21805813908576965, + "learning_rate": 7.344343663139264e-05, + "loss": 1.7813, + "step": 11871 + }, + { + "epoch": 3.643953345610804, + "grad_norm": 0.24334421753883362, + "learning_rate": 7.343904617979154e-05, + "loss": 1.7763, + "step": 11872 + }, + { + "epoch": 3.644260282381829, + 
"grad_norm": 0.22768431901931763, + "learning_rate": 7.343465549655206e-05, + "loss": 1.7817, + "step": 11873 + }, + { + "epoch": 3.6445672191528544, + "grad_norm": 0.23828962445259094, + "learning_rate": 7.343026458171757e-05, + "loss": 1.8391, + "step": 11874 + }, + { + "epoch": 3.6448741559238798, + "grad_norm": 0.24838197231292725, + "learning_rate": 7.342587343533149e-05, + "loss": 1.759, + "step": 11875 + }, + { + "epoch": 3.645181092694905, + "grad_norm": 0.22732019424438477, + "learning_rate": 7.342148205743718e-05, + "loss": 1.7348, + "step": 11876 + }, + { + "epoch": 3.64548802946593, + "grad_norm": 0.25106775760650635, + "learning_rate": 7.341709044807807e-05, + "loss": 1.8121, + "step": 11877 + }, + { + "epoch": 3.6457949662369553, + "grad_norm": 0.28532838821411133, + "learning_rate": 7.341269860729753e-05, + "loss": 1.7147, + "step": 11878 + }, + { + "epoch": 3.64610190300798, + "grad_norm": 0.3041890859603882, + "learning_rate": 7.340830653513899e-05, + "loss": 1.7666, + "step": 11879 + }, + { + "epoch": 3.6464088397790055, + "grad_norm": 0.3142147958278656, + "learning_rate": 7.340391423164585e-05, + "loss": 1.8707, + "step": 11880 + }, + { + "epoch": 3.646715776550031, + "grad_norm": 0.28531381487846375, + "learning_rate": 7.339952169686151e-05, + "loss": 1.7961, + "step": 11881 + }, + { + "epoch": 3.6470227133210558, + "grad_norm": 0.33779671788215637, + "learning_rate": 7.339512893082938e-05, + "loss": 1.7428, + "step": 11882 + }, + { + "epoch": 3.647329650092081, + "grad_norm": 0.29611849784851074, + "learning_rate": 7.339073593359287e-05, + "loss": 1.8803, + "step": 11883 + }, + { + "epoch": 3.647636586863106, + "grad_norm": 0.31248557567596436, + "learning_rate": 7.33863427051954e-05, + "loss": 1.7868, + "step": 11884 + }, + { + "epoch": 3.6479435236341313, + "grad_norm": 0.42829564213752747, + "learning_rate": 7.338194924568039e-05, + "loss": 1.8558, + "step": 11885 + }, + { + "epoch": 3.6482504604051567, + "grad_norm": 0.431023508310318, + 
"learning_rate": 7.337755555509126e-05, + "loss": 1.7565, + "step": 11886 + }, + { + "epoch": 3.6485573971761815, + "grad_norm": 0.2917975187301636, + "learning_rate": 7.33731616334714e-05, + "loss": 1.8067, + "step": 11887 + }, + { + "epoch": 3.648864333947207, + "grad_norm": 0.3072175085544586, + "learning_rate": 7.336876748086427e-05, + "loss": 1.782, + "step": 11888 + }, + { + "epoch": 3.6491712707182318, + "grad_norm": 0.33658862113952637, + "learning_rate": 7.336437309731327e-05, + "loss": 1.8007, + "step": 11889 + }, + { + "epoch": 3.649478207489257, + "grad_norm": 0.23774033784866333, + "learning_rate": 7.335997848286185e-05, + "loss": 1.7606, + "step": 11890 + }, + { + "epoch": 3.6497851442602824, + "grad_norm": 0.3373236358165741, + "learning_rate": 7.335558363755344e-05, + "loss": 1.7335, + "step": 11891 + }, + { + "epoch": 3.650092081031308, + "grad_norm": 0.3906517028808594, + "learning_rate": 7.335118856143145e-05, + "loss": 1.7974, + "step": 11892 + }, + { + "epoch": 3.6503990178023327, + "grad_norm": 0.37715303897857666, + "learning_rate": 7.334679325453934e-05, + "loss": 1.8875, + "step": 11893 + }, + { + "epoch": 3.650705954573358, + "grad_norm": 0.278540700674057, + "learning_rate": 7.334239771692053e-05, + "loss": 1.8165, + "step": 11894 + }, + { + "epoch": 3.651012891344383, + "grad_norm": 0.24434895813465118, + "learning_rate": 7.333800194861845e-05, + "loss": 1.7756, + "step": 11895 + }, + { + "epoch": 3.6513198281154082, + "grad_norm": 0.25057271122932434, + "learning_rate": 7.333360594967658e-05, + "loss": 1.7932, + "step": 11896 + }, + { + "epoch": 3.6516267648864336, + "grad_norm": 0.3277342617511749, + "learning_rate": 7.332920972013833e-05, + "loss": 1.7781, + "step": 11897 + }, + { + "epoch": 3.6519337016574585, + "grad_norm": 0.2754829525947571, + "learning_rate": 7.332481326004715e-05, + "loss": 1.7916, + "step": 11898 + }, + { + "epoch": 3.652240638428484, + "grad_norm": 0.24490588903427124, + "learning_rate": 7.332041656944651e-05, 
+ "loss": 1.7904, + "step": 11899 + }, + { + "epoch": 3.6525475751995087, + "grad_norm": 0.3176959455013275, + "learning_rate": 7.331601964837982e-05, + "loss": 1.7379, + "step": 11900 + }, + { + "epoch": 3.652854511970534, + "grad_norm": 0.3435784876346588, + "learning_rate": 7.331162249689057e-05, + "loss": 1.7635, + "step": 11901 + }, + { + "epoch": 3.6531614487415593, + "grad_norm": 0.335697740316391, + "learning_rate": 7.330722511502221e-05, + "loss": 1.7903, + "step": 11902 + }, + { + "epoch": 3.6534683855125847, + "grad_norm": 0.2748894691467285, + "learning_rate": 7.330282750281819e-05, + "loss": 1.8259, + "step": 11903 + }, + { + "epoch": 3.6537753222836096, + "grad_norm": 0.36754751205444336, + "learning_rate": 7.329842966032197e-05, + "loss": 1.7728, + "step": 11904 + }, + { + "epoch": 3.654082259054635, + "grad_norm": 0.4355713129043579, + "learning_rate": 7.3294031587577e-05, + "loss": 1.7447, + "step": 11905 + }, + { + "epoch": 3.65438919582566, + "grad_norm": 0.3967476487159729, + "learning_rate": 7.328963328462677e-05, + "loss": 1.8299, + "step": 11906 + }, + { + "epoch": 3.654696132596685, + "grad_norm": 0.23805755376815796, + "learning_rate": 7.328523475151472e-05, + "loss": 1.7631, + "step": 11907 + }, + { + "epoch": 3.6550030693677105, + "grad_norm": 0.40350377559661865, + "learning_rate": 7.328083598828435e-05, + "loss": 1.8693, + "step": 11908 + }, + { + "epoch": 3.6553100061387354, + "grad_norm": 0.4743673801422119, + "learning_rate": 7.32764369949791e-05, + "loss": 1.7887, + "step": 11909 + }, + { + "epoch": 3.6556169429097607, + "grad_norm": 0.33830127120018005, + "learning_rate": 7.327203777164246e-05, + "loss": 1.7527, + "step": 11910 + }, + { + "epoch": 3.6559238796807856, + "grad_norm": 0.2465003877878189, + "learning_rate": 7.326763831831791e-05, + "loss": 1.7898, + "step": 11911 + }, + { + "epoch": 3.656230816451811, + "grad_norm": 0.31647852063179016, + "learning_rate": 7.326323863504892e-05, + "loss": 1.8056, + "step": 11912 + }, + 
{ + "epoch": 3.6565377532228363, + "grad_norm": 0.31436124444007874, + "learning_rate": 7.325883872187896e-05, + "loss": 1.7972, + "step": 11913 + }, + { + "epoch": 3.656844689993861, + "grad_norm": 0.260405957698822, + "learning_rate": 7.325443857885153e-05, + "loss": 1.8109, + "step": 11914 + }, + { + "epoch": 3.6571516267648865, + "grad_norm": 0.29312583804130554, + "learning_rate": 7.325003820601011e-05, + "loss": 1.8947, + "step": 11915 + }, + { + "epoch": 3.6574585635359114, + "grad_norm": 0.2641582190990448, + "learning_rate": 7.324563760339819e-05, + "loss": 1.7737, + "step": 11916 + }, + { + "epoch": 3.6577655003069367, + "grad_norm": 0.2338121086359024, + "learning_rate": 7.324123677105923e-05, + "loss": 1.7462, + "step": 11917 + }, + { + "epoch": 3.658072437077962, + "grad_norm": 0.27877378463745117, + "learning_rate": 7.323683570903676e-05, + "loss": 1.8371, + "step": 11918 + }, + { + "epoch": 3.6583793738489874, + "grad_norm": 0.24238766729831696, + "learning_rate": 7.323243441737427e-05, + "loss": 1.7304, + "step": 11919 + }, + { + "epoch": 3.6586863106200123, + "grad_norm": 0.2349759042263031, + "learning_rate": 7.322803289611525e-05, + "loss": 1.7422, + "step": 11920 + }, + { + "epoch": 3.6589932473910376, + "grad_norm": 0.2254217565059662, + "learning_rate": 7.322363114530318e-05, + "loss": 1.7296, + "step": 11921 + }, + { + "epoch": 3.6593001841620625, + "grad_norm": 0.24533270299434662, + "learning_rate": 7.321922916498158e-05, + "loss": 1.7834, + "step": 11922 + }, + { + "epoch": 3.659607120933088, + "grad_norm": 0.24993161857128143, + "learning_rate": 7.321482695519393e-05, + "loss": 1.8502, + "step": 11923 + }, + { + "epoch": 3.659914057704113, + "grad_norm": 0.2540178894996643, + "learning_rate": 7.321042451598378e-05, + "loss": 1.8372, + "step": 11924 + }, + { + "epoch": 3.660220994475138, + "grad_norm": 0.2241390198469162, + "learning_rate": 7.32060218473946e-05, + "loss": 1.7619, + "step": 11925 + }, + { + "epoch": 3.6605279312461634, + 
"grad_norm": 0.2137840837240219, + "learning_rate": 7.32016189494699e-05, + "loss": 1.751, + "step": 11926 + }, + { + "epoch": 3.6608348680171883, + "grad_norm": 0.2596585154533386, + "learning_rate": 7.319721582225323e-05, + "loss": 1.7773, + "step": 11927 + }, + { + "epoch": 3.6611418047882136, + "grad_norm": 0.24898354709148407, + "learning_rate": 7.319281246578806e-05, + "loss": 1.7347, + "step": 11928 + }, + { + "epoch": 3.661448741559239, + "grad_norm": 0.26553863286972046, + "learning_rate": 7.31884088801179e-05, + "loss": 1.7812, + "step": 11929 + }, + { + "epoch": 3.661755678330264, + "grad_norm": 0.2494438737630844, + "learning_rate": 7.318400506528633e-05, + "loss": 1.7554, + "step": 11930 + }, + { + "epoch": 3.662062615101289, + "grad_norm": 0.2794995903968811, + "learning_rate": 7.317960102133682e-05, + "loss": 1.7495, + "step": 11931 + }, + { + "epoch": 3.662369551872314, + "grad_norm": 0.2843860983848572, + "learning_rate": 7.317519674831293e-05, + "loss": 1.7734, + "step": 11932 + }, + { + "epoch": 3.6626764886433394, + "grad_norm": 0.28261128067970276, + "learning_rate": 7.317079224625813e-05, + "loss": 1.7794, + "step": 11933 + }, + { + "epoch": 3.6629834254143647, + "grad_norm": 0.2552426755428314, + "learning_rate": 7.316638751521599e-05, + "loss": 1.8397, + "step": 11934 + }, + { + "epoch": 3.66329036218539, + "grad_norm": 0.4140608608722687, + "learning_rate": 7.316198255523002e-05, + "loss": 1.848, + "step": 11935 + }, + { + "epoch": 3.663597298956415, + "grad_norm": 0.3709854483604431, + "learning_rate": 7.315757736634377e-05, + "loss": 1.8489, + "step": 11936 + }, + { + "epoch": 3.6639042357274403, + "grad_norm": 0.23637300729751587, + "learning_rate": 7.315317194860078e-05, + "loss": 1.7549, + "step": 11937 + }, + { + "epoch": 3.664211172498465, + "grad_norm": 0.32884421944618225, + "learning_rate": 7.314876630204456e-05, + "loss": 1.8061, + "step": 11938 + }, + { + "epoch": 3.6645181092694905, + "grad_norm": 0.33354130387306213, + 
"learning_rate": 7.314436042671867e-05, + "loss": 1.8346, + "step": 11939 + }, + { + "epoch": 3.664825046040516, + "grad_norm": 0.25776317715644836, + "learning_rate": 7.313995432266663e-05, + "loss": 1.8598, + "step": 11940 + }, + { + "epoch": 3.6651319828115407, + "grad_norm": 0.2910402715206146, + "learning_rate": 7.313554798993202e-05, + "loss": 1.7613, + "step": 11941 + }, + { + "epoch": 3.665438919582566, + "grad_norm": 0.3487538695335388, + "learning_rate": 7.313114142855836e-05, + "loss": 1.8105, + "step": 11942 + }, + { + "epoch": 3.665745856353591, + "grad_norm": 0.27271291613578796, + "learning_rate": 7.312673463858918e-05, + "loss": 1.8107, + "step": 11943 + }, + { + "epoch": 3.6660527931246163, + "grad_norm": 0.2613036632537842, + "learning_rate": 7.312232762006809e-05, + "loss": 1.7871, + "step": 11944 + }, + { + "epoch": 3.6663597298956416, + "grad_norm": 0.30594903230667114, + "learning_rate": 7.311792037303859e-05, + "loss": 1.8043, + "step": 11945 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.3960847854614258, + "learning_rate": 7.311351289754425e-05, + "loss": 1.8434, + "step": 11946 + }, + { + "epoch": 3.666973603437692, + "grad_norm": 0.33369311690330505, + "learning_rate": 7.310910519362861e-05, + "loss": 1.7496, + "step": 11947 + }, + { + "epoch": 3.6672805402087167, + "grad_norm": 0.29852384328842163, + "learning_rate": 7.310469726133528e-05, + "loss": 1.858, + "step": 11948 + }, + { + "epoch": 3.667587476979742, + "grad_norm": 0.2610527276992798, + "learning_rate": 7.310028910070777e-05, + "loss": 1.7642, + "step": 11949 + }, + { + "epoch": 3.6678944137507674, + "grad_norm": 0.3606704771518707, + "learning_rate": 7.309588071178967e-05, + "loss": 1.845, + "step": 11950 + }, + { + "epoch": 3.6682013505217927, + "grad_norm": 0.3157273828983307, + "learning_rate": 7.309147209462454e-05, + "loss": 1.7864, + "step": 11951 + }, + { + "epoch": 3.6685082872928176, + "grad_norm": 0.23907925188541412, + "learning_rate": 
7.308706324925594e-05, + "loss": 1.8363, + "step": 11952 + }, + { + "epoch": 3.668815224063843, + "grad_norm": 0.3365088999271393, + "learning_rate": 7.308265417572747e-05, + "loss": 1.8755, + "step": 11953 + }, + { + "epoch": 3.669122160834868, + "grad_norm": 0.29404979944229126, + "learning_rate": 7.307824487408266e-05, + "loss": 1.8128, + "step": 11954 + }, + { + "epoch": 3.669429097605893, + "grad_norm": 0.2689574658870697, + "learning_rate": 7.307383534436511e-05, + "loss": 1.8072, + "step": 11955 + }, + { + "epoch": 3.6697360343769185, + "grad_norm": 0.28394198417663574, + "learning_rate": 7.306942558661841e-05, + "loss": 1.7919, + "step": 11956 + }, + { + "epoch": 3.6700429711479434, + "grad_norm": 0.2594783902168274, + "learning_rate": 7.306501560088612e-05, + "loss": 1.7467, + "step": 11957 + }, + { + "epoch": 3.6703499079189688, + "grad_norm": 0.24765191972255707, + "learning_rate": 7.30606053872118e-05, + "loss": 1.7876, + "step": 11958 + }, + { + "epoch": 3.6706568446899936, + "grad_norm": 0.22157172858715057, + "learning_rate": 7.305619494563909e-05, + "loss": 1.7802, + "step": 11959 + }, + { + "epoch": 3.670963781461019, + "grad_norm": 0.270151287317276, + "learning_rate": 7.305178427621155e-05, + "loss": 1.7723, + "step": 11960 + }, + { + "epoch": 3.6712707182320443, + "grad_norm": 0.3163939118385315, + "learning_rate": 7.304737337897277e-05, + "loss": 1.8488, + "step": 11961 + }, + { + "epoch": 3.671577655003069, + "grad_norm": 0.2605706453323364, + "learning_rate": 7.304296225396632e-05, + "loss": 1.7442, + "step": 11962 + }, + { + "epoch": 3.6718845917740945, + "grad_norm": 0.31179291009902954, + "learning_rate": 7.303855090123582e-05, + "loss": 1.831, + "step": 11963 + }, + { + "epoch": 3.6721915285451194, + "grad_norm": 0.33365359902381897, + "learning_rate": 7.303413932082483e-05, + "loss": 1.8376, + "step": 11964 + }, + { + "epoch": 3.6724984653161448, + "grad_norm": 0.2952130138874054, + "learning_rate": 7.302972751277701e-05, + "loss": 
1.7733, + "step": 11965 + }, + { + "epoch": 3.67280540208717, + "grad_norm": 0.24270877242088318, + "learning_rate": 7.302531547713592e-05, + "loss": 1.8367, + "step": 11966 + }, + { + "epoch": 3.6731123388581954, + "grad_norm": 0.34315919876098633, + "learning_rate": 7.302090321394517e-05, + "loss": 1.7901, + "step": 11967 + }, + { + "epoch": 3.6734192756292203, + "grad_norm": 0.33511418104171753, + "learning_rate": 7.301649072324834e-05, + "loss": 1.7929, + "step": 11968 + }, + { + "epoch": 3.6737262124002457, + "grad_norm": 0.22397933900356293, + "learning_rate": 7.301207800508907e-05, + "loss": 1.7533, + "step": 11969 + }, + { + "epoch": 3.6740331491712706, + "grad_norm": 0.2882738411426544, + "learning_rate": 7.300766505951095e-05, + "loss": 1.8071, + "step": 11970 + }, + { + "epoch": 3.674340085942296, + "grad_norm": 0.242112398147583, + "learning_rate": 7.300325188655761e-05, + "loss": 1.7739, + "step": 11971 + }, + { + "epoch": 3.674647022713321, + "grad_norm": 0.27754491567611694, + "learning_rate": 7.299883848627265e-05, + "loss": 1.8295, + "step": 11972 + }, + { + "epoch": 3.674953959484346, + "grad_norm": 0.2787899076938629, + "learning_rate": 7.29944248586997e-05, + "loss": 1.7682, + "step": 11973 + }, + { + "epoch": 3.6752608962553714, + "grad_norm": 0.24448934197425842, + "learning_rate": 7.299001100388234e-05, + "loss": 1.7826, + "step": 11974 + }, + { + "epoch": 3.6755678330263963, + "grad_norm": 0.37869495153427124, + "learning_rate": 7.298559692186421e-05, + "loss": 1.8582, + "step": 11975 + }, + { + "epoch": 3.6758747697974217, + "grad_norm": 0.3299996256828308, + "learning_rate": 7.298118261268897e-05, + "loss": 1.7716, + "step": 11976 + }, + { + "epoch": 3.676181706568447, + "grad_norm": 0.278891384601593, + "learning_rate": 7.29767680764002e-05, + "loss": 1.879, + "step": 11977 + }, + { + "epoch": 3.6764886433394723, + "grad_norm": 0.29326459765434265, + "learning_rate": 7.297235331304155e-05, + "loss": 1.804, + "step": 11978 + }, + { + 
"epoch": 3.6767955801104972, + "grad_norm": 0.2697092592716217, + "learning_rate": 7.296793832265663e-05, + "loss": 1.7842, + "step": 11979 + }, + { + "epoch": 3.6771025168815226, + "grad_norm": 0.3045118749141693, + "learning_rate": 7.296352310528909e-05, + "loss": 1.7959, + "step": 11980 + }, + { + "epoch": 3.6774094536525475, + "grad_norm": 0.278647780418396, + "learning_rate": 7.295910766098252e-05, + "loss": 1.7907, + "step": 11981 + }, + { + "epoch": 3.677716390423573, + "grad_norm": 0.2370275855064392, + "learning_rate": 7.295469198978063e-05, + "loss": 1.757, + "step": 11982 + }, + { + "epoch": 3.678023327194598, + "grad_norm": 0.3061021566390991, + "learning_rate": 7.295027609172702e-05, + "loss": 1.7927, + "step": 11983 + }, + { + "epoch": 3.678330263965623, + "grad_norm": 0.2844544053077698, + "learning_rate": 7.294585996686532e-05, + "loss": 1.7705, + "step": 11984 + }, + { + "epoch": 3.6786372007366483, + "grad_norm": 0.31121113896369934, + "learning_rate": 7.29414436152392e-05, + "loss": 1.783, + "step": 11985 + }, + { + "epoch": 3.6789441375076732, + "grad_norm": 0.2566785514354706, + "learning_rate": 7.293702703689225e-05, + "loss": 1.7781, + "step": 11986 + }, + { + "epoch": 3.6792510742786986, + "grad_norm": 0.22176961600780487, + "learning_rate": 7.293261023186818e-05, + "loss": 1.7302, + "step": 11987 + }, + { + "epoch": 3.679558011049724, + "grad_norm": 0.21547441184520721, + "learning_rate": 7.292819320021062e-05, + "loss": 1.7666, + "step": 11988 + }, + { + "epoch": 3.679864947820749, + "grad_norm": 0.26309674978256226, + "learning_rate": 7.29237759419632e-05, + "loss": 1.7817, + "step": 11989 + }, + { + "epoch": 3.680171884591774, + "grad_norm": 0.2558063864707947, + "learning_rate": 7.29193584571696e-05, + "loss": 1.8257, + "step": 11990 + }, + { + "epoch": 3.680478821362799, + "grad_norm": 0.24516844749450684, + "learning_rate": 7.291494074587347e-05, + "loss": 1.7803, + "step": 11991 + }, + { + "epoch": 3.6807857581338244, + "grad_norm": 
0.22891047596931458, + "learning_rate": 7.291052280811843e-05, + "loss": 1.7977, + "step": 11992 + }, + { + "epoch": 3.6810926949048497, + "grad_norm": 0.2776026129722595, + "learning_rate": 7.290610464394822e-05, + "loss": 1.8486, + "step": 11993 + }, + { + "epoch": 3.681399631675875, + "grad_norm": 0.31472426652908325, + "learning_rate": 7.290168625340644e-05, + "loss": 1.7841, + "step": 11994 + }, + { + "epoch": 3.6817065684469, + "grad_norm": 0.3459274470806122, + "learning_rate": 7.289726763653677e-05, + "loss": 1.7458, + "step": 11995 + }, + { + "epoch": 3.6820135052179253, + "grad_norm": 0.23645849525928497, + "learning_rate": 7.289284879338289e-05, + "loss": 1.781, + "step": 11996 + }, + { + "epoch": 3.68232044198895, + "grad_norm": 0.3257114291191101, + "learning_rate": 7.288842972398845e-05, + "loss": 1.8269, + "step": 11997 + }, + { + "epoch": 3.6826273787599755, + "grad_norm": 0.5450126528739929, + "learning_rate": 7.288401042839713e-05, + "loss": 1.8342, + "step": 11998 + }, + { + "epoch": 3.682934315531001, + "grad_norm": 0.5080512762069702, + "learning_rate": 7.287959090665262e-05, + "loss": 1.8097, + "step": 11999 + }, + { + "epoch": 3.6832412523020257, + "grad_norm": 0.3005252480506897, + "learning_rate": 7.287517115879858e-05, + "loss": 1.8271, + "step": 12000 + }, + { + "epoch": 3.683548189073051, + "grad_norm": 0.2760924994945526, + "learning_rate": 7.287075118487869e-05, + "loss": 1.8267, + "step": 12001 + }, + { + "epoch": 3.683855125844076, + "grad_norm": 0.3475865423679352, + "learning_rate": 7.286633098493663e-05, + "loss": 1.785, + "step": 12002 + }, + { + "epoch": 3.6841620626151013, + "grad_norm": 0.2905690670013428, + "learning_rate": 7.286191055901608e-05, + "loss": 1.8283, + "step": 12003 + }, + { + "epoch": 3.6844689993861266, + "grad_norm": 0.23666246235370636, + "learning_rate": 7.285748990716072e-05, + "loss": 1.7665, + "step": 12004 + }, + { + "epoch": 3.6847759361571515, + "grad_norm": 0.32329514622688293, + "learning_rate": 
7.285306902941427e-05, + "loss": 1.7267, + "step": 12005 + }, + { + "epoch": 3.685082872928177, + "grad_norm": 0.32345879077911377, + "learning_rate": 7.28486479258204e-05, + "loss": 1.7529, + "step": 12006 + }, + { + "epoch": 3.6853898096992017, + "grad_norm": 0.2727855443954468, + "learning_rate": 7.284422659642279e-05, + "loss": 1.8279, + "step": 12007 + }, + { + "epoch": 3.685696746470227, + "grad_norm": 0.37847277522087097, + "learning_rate": 7.283980504126513e-05, + "loss": 1.7809, + "step": 12008 + }, + { + "epoch": 3.6860036832412524, + "grad_norm": 0.44694215059280396, + "learning_rate": 7.283538326039113e-05, + "loss": 1.8184, + "step": 12009 + }, + { + "epoch": 3.6863106200122777, + "grad_norm": 0.2868261933326721, + "learning_rate": 7.28309612538445e-05, + "loss": 1.7461, + "step": 12010 + }, + { + "epoch": 3.6866175567833026, + "grad_norm": 0.2601351737976074, + "learning_rate": 7.282653902166894e-05, + "loss": 1.8011, + "step": 12011 + }, + { + "epoch": 3.686924493554328, + "grad_norm": 0.328185498714447, + "learning_rate": 7.282211656390813e-05, + "loss": 1.7934, + "step": 12012 + }, + { + "epoch": 3.687231430325353, + "grad_norm": 0.2712559103965759, + "learning_rate": 7.281769388060578e-05, + "loss": 1.7566, + "step": 12013 + }, + { + "epoch": 3.687538367096378, + "grad_norm": 0.2725805938243866, + "learning_rate": 7.281327097180562e-05, + "loss": 1.8024, + "step": 12014 + }, + { + "epoch": 3.6878453038674035, + "grad_norm": 0.37282630801200867, + "learning_rate": 7.280884783755133e-05, + "loss": 1.7624, + "step": 12015 + }, + { + "epoch": 3.6881522406384284, + "grad_norm": 0.36519256234169006, + "learning_rate": 7.280442447788664e-05, + "loss": 1.8691, + "step": 12016 + }, + { + "epoch": 3.6884591774094537, + "grad_norm": 0.21699345111846924, + "learning_rate": 7.280000089285528e-05, + "loss": 1.7308, + "step": 12017 + }, + { + "epoch": 3.6887661141804786, + "grad_norm": 0.3159945011138916, + "learning_rate": 7.279557708250094e-05, + "loss": 
1.8144, + "step": 12018 + }, + { + "epoch": 3.689073050951504, + "grad_norm": 0.2927449643611908, + "learning_rate": 7.279115304686735e-05, + "loss": 1.7746, + "step": 12019 + }, + { + "epoch": 3.6893799877225293, + "grad_norm": 0.279208242893219, + "learning_rate": 7.278672878599819e-05, + "loss": 1.7678, + "step": 12020 + }, + { + "epoch": 3.689686924493554, + "grad_norm": 0.40005648136138916, + "learning_rate": 7.278230429993725e-05, + "loss": 1.7876, + "step": 12021 + }, + { + "epoch": 3.6899938612645795, + "grad_norm": 0.3444392681121826, + "learning_rate": 7.277787958872824e-05, + "loss": 1.7591, + "step": 12022 + }, + { + "epoch": 3.6903007980356044, + "grad_norm": 0.21841467916965485, + "learning_rate": 7.277345465241485e-05, + "loss": 1.785, + "step": 12023 + }, + { + "epoch": 3.6906077348066297, + "grad_norm": 0.32463181018829346, + "learning_rate": 7.276902949104084e-05, + "loss": 1.8164, + "step": 12024 + }, + { + "epoch": 3.690914671577655, + "grad_norm": 0.36221247911453247, + "learning_rate": 7.276460410464994e-05, + "loss": 1.7529, + "step": 12025 + }, + { + "epoch": 3.6912216083486804, + "grad_norm": 0.24451927840709686, + "learning_rate": 7.276017849328588e-05, + "loss": 1.8031, + "step": 12026 + }, + { + "epoch": 3.6915285451197053, + "grad_norm": 0.3055694103240967, + "learning_rate": 7.275575265699239e-05, + "loss": 1.8158, + "step": 12027 + }, + { + "epoch": 3.6918354818907306, + "grad_norm": 0.4315083622932434, + "learning_rate": 7.27513265958132e-05, + "loss": 1.8322, + "step": 12028 + }, + { + "epoch": 3.6921424186617555, + "grad_norm": 0.3391095697879791, + "learning_rate": 7.274690030979209e-05, + "loss": 1.8214, + "step": 12029 + }, + { + "epoch": 3.692449355432781, + "grad_norm": 0.22714883089065552, + "learning_rate": 7.274247379897277e-05, + "loss": 1.7312, + "step": 12030 + }, + { + "epoch": 3.692756292203806, + "grad_norm": 0.24982765316963196, + "learning_rate": 7.273804706339899e-05, + "loss": 1.738, + "step": 12031 + }, + { + 
"epoch": 3.693063228974831, + "grad_norm": 0.32509860396385193, + "learning_rate": 7.273362010311451e-05, + "loss": 1.7773, + "step": 12032 + }, + { + "epoch": 3.6933701657458564, + "grad_norm": 0.2643086612224579, + "learning_rate": 7.272919291816307e-05, + "loss": 1.7545, + "step": 12033 + }, + { + "epoch": 3.6936771025168813, + "grad_norm": 0.2568800747394562, + "learning_rate": 7.272476550858842e-05, + "loss": 1.8055, + "step": 12034 + }, + { + "epoch": 3.6939840392879066, + "grad_norm": 0.27418240904808044, + "learning_rate": 7.272033787443433e-05, + "loss": 1.7769, + "step": 12035 + }, + { + "epoch": 3.694290976058932, + "grad_norm": 0.2459677755832672, + "learning_rate": 7.271591001574453e-05, + "loss": 1.7971, + "step": 12036 + }, + { + "epoch": 3.694597912829957, + "grad_norm": 0.22349393367767334, + "learning_rate": 7.27114819325628e-05, + "loss": 1.7791, + "step": 12037 + }, + { + "epoch": 3.694904849600982, + "grad_norm": 0.25321197509765625, + "learning_rate": 7.270705362493288e-05, + "loss": 1.7475, + "step": 12038 + }, + { + "epoch": 3.695211786372007, + "grad_norm": 0.2585916519165039, + "learning_rate": 7.270262509289855e-05, + "loss": 1.7801, + "step": 12039 + }, + { + "epoch": 3.6955187231430324, + "grad_norm": 0.2673574686050415, + "learning_rate": 7.269819633650359e-05, + "loss": 1.7578, + "step": 12040 + }, + { + "epoch": 3.6958256599140578, + "grad_norm": 0.2509469985961914, + "learning_rate": 7.269376735579175e-05, + "loss": 1.7994, + "step": 12041 + }, + { + "epoch": 3.696132596685083, + "grad_norm": 0.28527703881263733, + "learning_rate": 7.268933815080679e-05, + "loss": 1.7752, + "step": 12042 + }, + { + "epoch": 3.696439533456108, + "grad_norm": 0.22716578841209412, + "learning_rate": 7.268490872159248e-05, + "loss": 1.7186, + "step": 12043 + }, + { + "epoch": 3.6967464702271333, + "grad_norm": 0.24888403713703156, + "learning_rate": 7.268047906819262e-05, + "loss": 1.7882, + "step": 12044 + }, + { + "epoch": 3.697053406998158, + 
"grad_norm": 0.28976112604141235, + "learning_rate": 7.267604919065096e-05, + "loss": 1.7655, + "step": 12045 + }, + { + "epoch": 3.6973603437691835, + "grad_norm": 0.24668502807617188, + "learning_rate": 7.267161908901131e-05, + "loss": 1.8051, + "step": 12046 + }, + { + "epoch": 3.697667280540209, + "grad_norm": 0.2464776188135147, + "learning_rate": 7.266718876331742e-05, + "loss": 1.809, + "step": 12047 + }, + { + "epoch": 3.6979742173112338, + "grad_norm": 0.27648577094078064, + "learning_rate": 7.266275821361309e-05, + "loss": 1.7869, + "step": 12048 + }, + { + "epoch": 3.698281154082259, + "grad_norm": 0.26427242159843445, + "learning_rate": 7.26583274399421e-05, + "loss": 1.7681, + "step": 12049 + }, + { + "epoch": 3.698588090853284, + "grad_norm": 0.24595285952091217, + "learning_rate": 7.265389644234823e-05, + "loss": 1.7209, + "step": 12050 + }, + { + "epoch": 3.6988950276243093, + "grad_norm": 0.32514405250549316, + "learning_rate": 7.26494652208753e-05, + "loss": 1.8702, + "step": 12051 + }, + { + "epoch": 3.6992019643953347, + "grad_norm": 0.24512936174869537, + "learning_rate": 7.264503377556705e-05, + "loss": 1.784, + "step": 12052 + }, + { + "epoch": 3.69950890116636, + "grad_norm": 0.28698310256004333, + "learning_rate": 7.264060210646733e-05, + "loss": 1.905, + "step": 12053 + }, + { + "epoch": 3.699815837937385, + "grad_norm": 0.2995007336139679, + "learning_rate": 7.263617021361989e-05, + "loss": 1.7822, + "step": 12054 + }, + { + "epoch": 3.7001227747084102, + "grad_norm": 0.25869423151016235, + "learning_rate": 7.263173809706855e-05, + "loss": 1.7988, + "step": 12055 + }, + { + "epoch": 3.700429711479435, + "grad_norm": 0.350918710231781, + "learning_rate": 7.262730575685711e-05, + "loss": 1.9504, + "step": 12056 + }, + { + "epoch": 3.7007366482504604, + "grad_norm": 0.3407665491104126, + "learning_rate": 7.262287319302937e-05, + "loss": 1.8506, + "step": 12057 + }, + { + "epoch": 3.701043585021486, + "grad_norm": 0.3039441704750061, + 
"learning_rate": 7.261844040562915e-05, + "loss": 1.7841, + "step": 12058 + }, + { + "epoch": 3.7013505217925107, + "grad_norm": 0.23483428359031677, + "learning_rate": 7.261400739470023e-05, + "loss": 1.7899, + "step": 12059 + }, + { + "epoch": 3.701657458563536, + "grad_norm": 0.30779507756233215, + "learning_rate": 7.260957416028645e-05, + "loss": 1.8131, + "step": 12060 + }, + { + "epoch": 3.701964395334561, + "grad_norm": 0.29901376366615295, + "learning_rate": 7.26051407024316e-05, + "loss": 1.7861, + "step": 12061 + }, + { + "epoch": 3.7022713321055862, + "grad_norm": 0.30058762431144714, + "learning_rate": 7.260070702117949e-05, + "loss": 1.7485, + "step": 12062 + }, + { + "epoch": 3.7025782688766116, + "grad_norm": 0.24523651599884033, + "learning_rate": 7.259627311657396e-05, + "loss": 1.772, + "step": 12063 + }, + { + "epoch": 3.7028852056476365, + "grad_norm": 0.24375474452972412, + "learning_rate": 7.259183898865882e-05, + "loss": 1.7848, + "step": 12064 + }, + { + "epoch": 3.703192142418662, + "grad_norm": 0.2562403380870819, + "learning_rate": 7.258740463747788e-05, + "loss": 1.7447, + "step": 12065 + }, + { + "epoch": 3.7034990791896867, + "grad_norm": 0.265229195356369, + "learning_rate": 7.258297006307496e-05, + "loss": 1.8111, + "step": 12066 + }, + { + "epoch": 3.703806015960712, + "grad_norm": 0.2836552858352661, + "learning_rate": 7.25785352654939e-05, + "loss": 1.7952, + "step": 12067 + }, + { + "epoch": 3.7041129527317374, + "grad_norm": 0.3269572854042053, + "learning_rate": 7.257410024477852e-05, + "loss": 1.8604, + "step": 12068 + }, + { + "epoch": 3.7044198895027627, + "grad_norm": 0.2391490638256073, + "learning_rate": 7.256966500097264e-05, + "loss": 1.7417, + "step": 12069 + }, + { + "epoch": 3.7047268262737876, + "grad_norm": 0.2610675096511841, + "learning_rate": 7.256522953412011e-05, + "loss": 1.7712, + "step": 12070 + }, + { + "epoch": 3.705033763044813, + "grad_norm": 0.24954774975776672, + "learning_rate": 
7.256079384426477e-05, + "loss": 1.7506, + "step": 12071 + }, + { + "epoch": 3.705340699815838, + "grad_norm": 0.2603892385959625, + "learning_rate": 7.255635793145042e-05, + "loss": 1.8105, + "step": 12072 + }, + { + "epoch": 3.705647636586863, + "grad_norm": 0.32728591561317444, + "learning_rate": 7.255192179572092e-05, + "loss": 1.8448, + "step": 12073 + }, + { + "epoch": 3.7059545733578885, + "grad_norm": 0.4559340178966522, + "learning_rate": 7.254748543712013e-05, + "loss": 1.7232, + "step": 12074 + }, + { + "epoch": 3.7062615101289134, + "grad_norm": 0.36526206135749817, + "learning_rate": 7.254304885569186e-05, + "loss": 1.7874, + "step": 12075 + }, + { + "epoch": 3.7065684468999387, + "grad_norm": 0.21606837213039398, + "learning_rate": 7.253861205147998e-05, + "loss": 1.7266, + "step": 12076 + }, + { + "epoch": 3.7068753836709636, + "grad_norm": 0.3629585802555084, + "learning_rate": 7.253417502452831e-05, + "loss": 1.7722, + "step": 12077 + }, + { + "epoch": 3.707182320441989, + "grad_norm": 0.4224923551082611, + "learning_rate": 7.252973777488072e-05, + "loss": 1.7369, + "step": 12078 + }, + { + "epoch": 3.7074892572130143, + "grad_norm": 0.32245784997940063, + "learning_rate": 7.252530030258106e-05, + "loss": 1.7836, + "step": 12079 + }, + { + "epoch": 3.707796193984039, + "grad_norm": 0.29909494519233704, + "learning_rate": 7.252086260767317e-05, + "loss": 1.8718, + "step": 12080 + }, + { + "epoch": 3.7081031307550645, + "grad_norm": 0.21995799243450165, + "learning_rate": 7.251642469020093e-05, + "loss": 1.7103, + "step": 12081 + }, + { + "epoch": 3.7084100675260894, + "grad_norm": 0.2737572193145752, + "learning_rate": 7.251198655020818e-05, + "loss": 1.7787, + "step": 12082 + }, + { + "epoch": 3.7087170042971147, + "grad_norm": 0.22417058050632477, + "learning_rate": 7.250754818773879e-05, + "loss": 1.7782, + "step": 12083 + }, + { + "epoch": 3.70902394106814, + "grad_norm": 0.3350662887096405, + "learning_rate": 7.25031096028366e-05, + "loss": 
1.8193, + "step": 12084 + }, + { + "epoch": 3.7093308778391654, + "grad_norm": 0.3199101686477661, + "learning_rate": 7.24986707955455e-05, + "loss": 1.831, + "step": 12085 + }, + { + "epoch": 3.7096378146101903, + "grad_norm": 0.2513977289199829, + "learning_rate": 7.249423176590936e-05, + "loss": 1.8288, + "step": 12086 + }, + { + "epoch": 3.7099447513812156, + "grad_norm": 0.30411866307258606, + "learning_rate": 7.248979251397203e-05, + "loss": 1.7837, + "step": 12087 + }, + { + "epoch": 3.7102516881522405, + "grad_norm": 0.30755332112312317, + "learning_rate": 7.248535303977738e-05, + "loss": 1.8016, + "step": 12088 + }, + { + "epoch": 3.710558624923266, + "grad_norm": 0.25746986269950867, + "learning_rate": 7.248091334336929e-05, + "loss": 1.8014, + "step": 12089 + }, + { + "epoch": 3.710865561694291, + "grad_norm": 0.3327447772026062, + "learning_rate": 7.247647342479164e-05, + "loss": 1.752, + "step": 12090 + }, + { + "epoch": 3.711172498465316, + "grad_norm": 0.3101816475391388, + "learning_rate": 7.247203328408832e-05, + "loss": 1.7867, + "step": 12091 + }, + { + "epoch": 3.7114794352363414, + "grad_norm": 0.2168906182050705, + "learning_rate": 7.246759292130318e-05, + "loss": 1.7452, + "step": 12092 + }, + { + "epoch": 3.7117863720073663, + "grad_norm": 0.34260258078575134, + "learning_rate": 7.246315233648013e-05, + "loss": 1.8156, + "step": 12093 + }, + { + "epoch": 3.7120933087783916, + "grad_norm": 0.2730714976787567, + "learning_rate": 7.245871152966303e-05, + "loss": 1.7429, + "step": 12094 + }, + { + "epoch": 3.712400245549417, + "grad_norm": 0.2560936212539673, + "learning_rate": 7.245427050089578e-05, + "loss": 1.7969, + "step": 12095 + }, + { + "epoch": 3.712707182320442, + "grad_norm": 0.27510303258895874, + "learning_rate": 7.244982925022228e-05, + "loss": 1.7981, + "step": 12096 + }, + { + "epoch": 3.713014119091467, + "grad_norm": 0.29171642661094666, + "learning_rate": 7.24453877776864e-05, + "loss": 1.7913, + "step": 12097 + }, + { + 
"epoch": 3.713321055862492, + "grad_norm": 0.26431843638420105, + "learning_rate": 7.244094608333206e-05, + "loss": 1.8262, + "step": 12098 + }, + { + "epoch": 3.7136279926335174, + "grad_norm": 0.30747905373573303, + "learning_rate": 7.243650416720311e-05, + "loss": 1.7951, + "step": 12099 + }, + { + "epoch": 3.7139349294045427, + "grad_norm": 0.346443772315979, + "learning_rate": 7.24320620293435e-05, + "loss": 1.7677, + "step": 12100 + }, + { + "epoch": 3.714241866175568, + "grad_norm": 0.2910652458667755, + "learning_rate": 7.242761966979709e-05, + "loss": 1.7887, + "step": 12101 + }, + { + "epoch": 3.714548802946593, + "grad_norm": 0.22342006862163544, + "learning_rate": 7.24231770886078e-05, + "loss": 1.7678, + "step": 12102 + }, + { + "epoch": 3.7148557397176183, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.241873428581954e-05, + "loss": 1.7436, + "step": 12103 + }, + { + "epoch": 3.715162676488643, + "grad_norm": 0.23542635142803192, + "learning_rate": 7.24142912614762e-05, + "loss": 1.7942, + "step": 12104 + }, + { + "epoch": 3.7154696132596685, + "grad_norm": 0.22476384043693542, + "learning_rate": 7.240984801562169e-05, + "loss": 1.8235, + "step": 12105 + }, + { + "epoch": 3.715776550030694, + "grad_norm": 0.25123465061187744, + "learning_rate": 7.240540454829992e-05, + "loss": 1.8112, + "step": 12106 + }, + { + "epoch": 3.7160834868017187, + "grad_norm": 0.27230000495910645, + "learning_rate": 7.240096085955483e-05, + "loss": 1.8312, + "step": 12107 + }, + { + "epoch": 3.716390423572744, + "grad_norm": 0.2722976803779602, + "learning_rate": 7.239651694943031e-05, + "loss": 1.8368, + "step": 12108 + }, + { + "epoch": 3.716697360343769, + "grad_norm": 0.264138400554657, + "learning_rate": 7.239207281797028e-05, + "loss": 1.8206, + "step": 12109 + }, + { + "epoch": 3.7170042971147943, + "grad_norm": 0.28813931345939636, + "learning_rate": 7.238762846521866e-05, + "loss": 1.7391, + "step": 12110 + }, + { + "epoch": 3.7173112338858196, + 
"grad_norm": 0.2319631576538086, + "learning_rate": 7.238318389121939e-05, + "loss": 1.7574, + "step": 12111 + }, + { + "epoch": 3.717618170656845, + "grad_norm": 0.2507809102535248, + "learning_rate": 7.237873909601635e-05, + "loss": 1.7359, + "step": 12112 + }, + { + "epoch": 3.71792510742787, + "grad_norm": 0.2717304825782776, + "learning_rate": 7.237429407965351e-05, + "loss": 1.774, + "step": 12113 + }, + { + "epoch": 3.718232044198895, + "grad_norm": 0.2619280517101288, + "learning_rate": 7.236984884217478e-05, + "loss": 1.8083, + "step": 12114 + }, + { + "epoch": 3.71853898096992, + "grad_norm": 0.22268806397914886, + "learning_rate": 7.23654033836241e-05, + "loss": 1.7436, + "step": 12115 + }, + { + "epoch": 3.7188459177409454, + "grad_norm": 0.2341407984495163, + "learning_rate": 7.236095770404539e-05, + "loss": 1.7807, + "step": 12116 + }, + { + "epoch": 3.7191528545119708, + "grad_norm": 0.23519712686538696, + "learning_rate": 7.235651180348258e-05, + "loss": 1.8051, + "step": 12117 + }, + { + "epoch": 3.7194597912829956, + "grad_norm": 0.2391074150800705, + "learning_rate": 7.235206568197963e-05, + "loss": 1.8377, + "step": 12118 + }, + { + "epoch": 3.719766728054021, + "grad_norm": 0.26821592450141907, + "learning_rate": 7.234761933958045e-05, + "loss": 1.8586, + "step": 12119 + }, + { + "epoch": 3.720073664825046, + "grad_norm": 0.24971134960651398, + "learning_rate": 7.234317277632902e-05, + "loss": 1.8404, + "step": 12120 + }, + { + "epoch": 3.720380601596071, + "grad_norm": 0.20817919075489044, + "learning_rate": 7.233872599226926e-05, + "loss": 1.7204, + "step": 12121 + }, + { + "epoch": 3.7206875383670965, + "grad_norm": 0.29301291704177856, + "learning_rate": 7.233427898744509e-05, + "loss": 1.8528, + "step": 12122 + }, + { + "epoch": 3.7209944751381214, + "grad_norm": 0.22214651107788086, + "learning_rate": 7.23298317619005e-05, + "loss": 1.748, + "step": 12123 + }, + { + "epoch": 3.7213014119091468, + "grad_norm": 0.2511044442653656, + 
"learning_rate": 7.232538431567941e-05, + "loss": 1.8146, + "step": 12124 + }, + { + "epoch": 3.7216083486801717, + "grad_norm": 0.26976367831230164, + "learning_rate": 7.232093664882581e-05, + "loss": 1.8483, + "step": 12125 + }, + { + "epoch": 3.721915285451197, + "grad_norm": 0.2538089156150818, + "learning_rate": 7.231648876138361e-05, + "loss": 1.8097, + "step": 12126 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.2353016883134842, + "learning_rate": 7.231204065339682e-05, + "loss": 1.737, + "step": 12127 + }, + { + "epoch": 3.7225291589932477, + "grad_norm": 0.3205147981643677, + "learning_rate": 7.230759232490935e-05, + "loss": 1.8116, + "step": 12128 + }, + { + "epoch": 3.7228360957642725, + "grad_norm": 0.39056599140167236, + "learning_rate": 7.230314377596516e-05, + "loss": 1.7785, + "step": 12129 + }, + { + "epoch": 3.723143032535298, + "grad_norm": 0.3846863806247711, + "learning_rate": 7.229869500660825e-05, + "loss": 1.738, + "step": 12130 + }, + { + "epoch": 3.7234499693063228, + "grad_norm": 0.24412120878696442, + "learning_rate": 7.229424601688256e-05, + "loss": 1.7351, + "step": 12131 + }, + { + "epoch": 3.723756906077348, + "grad_norm": 0.2978009581565857, + "learning_rate": 7.228979680683206e-05, + "loss": 1.8037, + "step": 12132 + }, + { + "epoch": 3.7240638428483734, + "grad_norm": 0.33787262439727783, + "learning_rate": 7.228534737650074e-05, + "loss": 1.8421, + "step": 12133 + }, + { + "epoch": 3.7243707796193983, + "grad_norm": 0.2536921203136444, + "learning_rate": 7.228089772593254e-05, + "loss": 1.7472, + "step": 12134 + }, + { + "epoch": 3.7246777163904237, + "grad_norm": 0.24103601276874542, + "learning_rate": 7.227644785517144e-05, + "loss": 1.8011, + "step": 12135 + }, + { + "epoch": 3.7249846531614486, + "grad_norm": 0.3653033375740051, + "learning_rate": 7.227199776426146e-05, + "loss": 1.8018, + "step": 12136 + }, + { + "epoch": 3.725291589932474, + "grad_norm": 0.35728752613067627, + "learning_rate": 
7.226754745324652e-05, + "loss": 1.7684, + "step": 12137 + }, + { + "epoch": 3.7255985267034992, + "grad_norm": 0.262018620967865, + "learning_rate": 7.226309692217063e-05, + "loss": 1.8124, + "step": 12138 + }, + { + "epoch": 3.725905463474524, + "grad_norm": 0.3467118442058563, + "learning_rate": 7.225864617107776e-05, + "loss": 1.8761, + "step": 12139 + }, + { + "epoch": 3.7262124002455494, + "grad_norm": 0.4365626871585846, + "learning_rate": 7.22541952000119e-05, + "loss": 1.7159, + "step": 12140 + }, + { + "epoch": 3.7265193370165743, + "grad_norm": 0.2819811999797821, + "learning_rate": 7.224974400901705e-05, + "loss": 1.8051, + "step": 12141 + }, + { + "epoch": 3.7268262737875997, + "grad_norm": 0.39062437415122986, + "learning_rate": 7.224529259813719e-05, + "loss": 1.8517, + "step": 12142 + }, + { + "epoch": 3.727133210558625, + "grad_norm": 0.4383927285671234, + "learning_rate": 7.22408409674163e-05, + "loss": 1.8295, + "step": 12143 + }, + { + "epoch": 3.7274401473296503, + "grad_norm": 0.3043094575405121, + "learning_rate": 7.223638911689839e-05, + "loss": 1.7653, + "step": 12144 + }, + { + "epoch": 3.7277470841006752, + "grad_norm": 0.25198984146118164, + "learning_rate": 7.223193704662746e-05, + "loss": 1.7561, + "step": 12145 + }, + { + "epoch": 3.7280540208717006, + "grad_norm": 0.353565514087677, + "learning_rate": 7.222748475664749e-05, + "loss": 1.8077, + "step": 12146 + }, + { + "epoch": 3.7283609576427255, + "grad_norm": 0.39757224917411804, + "learning_rate": 7.222303224700248e-05, + "loss": 1.7622, + "step": 12147 + }, + { + "epoch": 3.728667894413751, + "grad_norm": 0.35595703125, + "learning_rate": 7.221857951773644e-05, + "loss": 1.8436, + "step": 12148 + }, + { + "epoch": 3.728974831184776, + "grad_norm": 0.2469715029001236, + "learning_rate": 7.221412656889338e-05, + "loss": 1.8531, + "step": 12149 + }, + { + "epoch": 3.729281767955801, + "grad_norm": 0.35324424505233765, + "learning_rate": 7.22096734005173e-05, + "loss": 1.7361, + 
"step": 12150 + }, + { + "epoch": 3.7295887047268264, + "grad_norm": 0.3783365488052368, + "learning_rate": 7.220522001265223e-05, + "loss": 1.7459, + "step": 12151 + }, + { + "epoch": 3.7298956414978512, + "grad_norm": 0.27526360750198364, + "learning_rate": 7.220076640534212e-05, + "loss": 1.8867, + "step": 12152 + }, + { + "epoch": 3.7302025782688766, + "grad_norm": 0.30863118171691895, + "learning_rate": 7.219631257863105e-05, + "loss": 1.7363, + "step": 12153 + }, + { + "epoch": 3.730509515039902, + "grad_norm": 0.38505107164382935, + "learning_rate": 7.219185853256301e-05, + "loss": 1.764, + "step": 12154 + }, + { + "epoch": 3.730816451810927, + "grad_norm": 0.2925978899002075, + "learning_rate": 7.218740426718202e-05, + "loss": 1.7693, + "step": 12155 + }, + { + "epoch": 3.731123388581952, + "grad_norm": 0.24510078132152557, + "learning_rate": 7.218294978253209e-05, + "loss": 1.8089, + "step": 12156 + }, + { + "epoch": 3.731430325352977, + "grad_norm": 0.33029109239578247, + "learning_rate": 7.217849507865724e-05, + "loss": 1.6885, + "step": 12157 + }, + { + "epoch": 3.7317372621240024, + "grad_norm": 0.333970308303833, + "learning_rate": 7.217404015560149e-05, + "loss": 1.8132, + "step": 12158 + }, + { + "epoch": 3.7320441988950277, + "grad_norm": 0.2467660754919052, + "learning_rate": 7.216958501340891e-05, + "loss": 1.8021, + "step": 12159 + }, + { + "epoch": 3.732351135666053, + "grad_norm": 0.2701449990272522, + "learning_rate": 7.216512965212348e-05, + "loss": 1.7006, + "step": 12160 + }, + { + "epoch": 3.732658072437078, + "grad_norm": 0.2784138023853302, + "learning_rate": 7.216067407178926e-05, + "loss": 1.7616, + "step": 12161 + }, + { + "epoch": 3.7329650092081033, + "grad_norm": 0.2082870900630951, + "learning_rate": 7.215621827245026e-05, + "loss": 1.7391, + "step": 12162 + }, + { + "epoch": 3.733271945979128, + "grad_norm": 0.2477869987487793, + "learning_rate": 7.215176225415053e-05, + "loss": 1.7761, + "step": 12163 + }, + { + "epoch": 
3.7335788827501535, + "grad_norm": 0.28395572304725647, + "learning_rate": 7.21473060169341e-05, + "loss": 1.8181, + "step": 12164 + }, + { + "epoch": 3.733885819521179, + "grad_norm": 0.20430058240890503, + "learning_rate": 7.2142849560845e-05, + "loss": 1.7035, + "step": 12165 + }, + { + "epoch": 3.7341927562922037, + "grad_norm": 0.30061420798301697, + "learning_rate": 7.21383928859273e-05, + "loss": 1.7703, + "step": 12166 + }, + { + "epoch": 3.734499693063229, + "grad_norm": 0.33865803480148315, + "learning_rate": 7.2133935992225e-05, + "loss": 1.8204, + "step": 12167 + }, + { + "epoch": 3.734806629834254, + "grad_norm": 0.29172980785369873, + "learning_rate": 7.212947887978221e-05, + "loss": 1.739, + "step": 12168 + }, + { + "epoch": 3.7351135666052793, + "grad_norm": 0.2799396812915802, + "learning_rate": 7.212502154864291e-05, + "loss": 1.8503, + "step": 12169 + }, + { + "epoch": 3.7354205033763046, + "grad_norm": 0.2945539355278015, + "learning_rate": 7.212056399885118e-05, + "loss": 1.7523, + "step": 12170 + }, + { + "epoch": 3.7357274401473295, + "grad_norm": 0.2395290732383728, + "learning_rate": 7.211610623045108e-05, + "loss": 1.7728, + "step": 12171 + }, + { + "epoch": 3.736034376918355, + "grad_norm": 0.24369286000728607, + "learning_rate": 7.211164824348667e-05, + "loss": 1.7725, + "step": 12172 + }, + { + "epoch": 3.7363413136893797, + "grad_norm": 0.3272435963153839, + "learning_rate": 7.210719003800197e-05, + "loss": 1.8531, + "step": 12173 + }, + { + "epoch": 3.736648250460405, + "grad_norm": 0.23954182863235474, + "learning_rate": 7.210273161404107e-05, + "loss": 1.7807, + "step": 12174 + }, + { + "epoch": 3.7369551872314304, + "grad_norm": 0.24547603726387024, + "learning_rate": 7.209827297164801e-05, + "loss": 1.8481, + "step": 12175 + }, + { + "epoch": 3.7372621240024557, + "grad_norm": 0.26926249265670776, + "learning_rate": 7.209381411086687e-05, + "loss": 1.7496, + "step": 12176 + }, + { + "epoch": 3.7375690607734806, + "grad_norm": 
0.22948235273361206, + "learning_rate": 7.208935503174172e-05, + "loss": 1.7681, + "step": 12177 + }, + { + "epoch": 3.737875997544506, + "grad_norm": 0.2697654664516449, + "learning_rate": 7.20848957343166e-05, + "loss": 1.789, + "step": 12178 + }, + { + "epoch": 3.738182934315531, + "grad_norm": 0.235344797372818, + "learning_rate": 7.208043621863562e-05, + "loss": 1.8309, + "step": 12179 + }, + { + "epoch": 3.738489871086556, + "grad_norm": 0.2688879072666168, + "learning_rate": 7.20759764847428e-05, + "loss": 1.7898, + "step": 12180 + }, + { + "epoch": 3.7387968078575815, + "grad_norm": 0.26818978786468506, + "learning_rate": 7.207151653268226e-05, + "loss": 1.7882, + "step": 12181 + }, + { + "epoch": 3.7391037446286064, + "grad_norm": 0.2612875998020172, + "learning_rate": 7.206705636249804e-05, + "loss": 1.7352, + "step": 12182 + }, + { + "epoch": 3.7394106813996317, + "grad_norm": 0.22547565400600433, + "learning_rate": 7.206259597423425e-05, + "loss": 1.733, + "step": 12183 + }, + { + "epoch": 3.7397176181706566, + "grad_norm": 0.24645474553108215, + "learning_rate": 7.205813536793495e-05, + "loss": 1.8064, + "step": 12184 + }, + { + "epoch": 3.740024554941682, + "grad_norm": 0.25879329442977905, + "learning_rate": 7.205367454364424e-05, + "loss": 1.8134, + "step": 12185 + }, + { + "epoch": 3.7403314917127073, + "grad_norm": 0.22420097887516022, + "learning_rate": 7.204921350140617e-05, + "loss": 1.7819, + "step": 12186 + }, + { + "epoch": 3.7406384284837326, + "grad_norm": 0.2569858431816101, + "learning_rate": 7.204475224126487e-05, + "loss": 1.784, + "step": 12187 + }, + { + "epoch": 3.7409453652547575, + "grad_norm": 0.23769912123680115, + "learning_rate": 7.20402907632644e-05, + "loss": 1.7853, + "step": 12188 + }, + { + "epoch": 3.741252302025783, + "grad_norm": 0.26935988664627075, + "learning_rate": 7.203582906744885e-05, + "loss": 1.806, + "step": 12189 + }, + { + "epoch": 3.7415592387968077, + "grad_norm": 0.2544274628162384, + "learning_rate": 
7.203136715386233e-05, + "loss": 1.7988, + "step": 12190 + }, + { + "epoch": 3.741866175567833, + "grad_norm": 0.22665882110595703, + "learning_rate": 7.202690502254892e-05, + "loss": 1.7798, + "step": 12191 + }, + { + "epoch": 3.7421731123388584, + "grad_norm": 0.24512888491153717, + "learning_rate": 7.202244267355273e-05, + "loss": 1.816, + "step": 12192 + }, + { + "epoch": 3.7424800491098833, + "grad_norm": 0.2408553808927536, + "learning_rate": 7.201798010691785e-05, + "loss": 1.7417, + "step": 12193 + }, + { + "epoch": 3.7427869858809086, + "grad_norm": 0.23142600059509277, + "learning_rate": 7.201351732268838e-05, + "loss": 1.7771, + "step": 12194 + }, + { + "epoch": 3.7430939226519335, + "grad_norm": 0.245071142911911, + "learning_rate": 7.200905432090844e-05, + "loss": 1.7556, + "step": 12195 + }, + { + "epoch": 3.743400859422959, + "grad_norm": 0.2623934745788574, + "learning_rate": 7.200459110162211e-05, + "loss": 1.8042, + "step": 12196 + }, + { + "epoch": 3.743707796193984, + "grad_norm": 0.2531217038631439, + "learning_rate": 7.200012766487353e-05, + "loss": 1.7709, + "step": 12197 + }, + { + "epoch": 3.744014732965009, + "grad_norm": 0.23839864134788513, + "learning_rate": 7.19956640107068e-05, + "loss": 1.8202, + "step": 12198 + }, + { + "epoch": 3.7443216697360344, + "grad_norm": 0.2342260777950287, + "learning_rate": 7.1991200139166e-05, + "loss": 1.827, + "step": 12199 + }, + { + "epoch": 3.7446286065070593, + "grad_norm": 0.25511276721954346, + "learning_rate": 7.198673605029528e-05, + "loss": 1.7766, + "step": 12200 + }, + { + "epoch": 3.7449355432780846, + "grad_norm": 0.27601274847984314, + "learning_rate": 7.198227174413876e-05, + "loss": 1.7716, + "step": 12201 + }, + { + "epoch": 3.74524248004911, + "grad_norm": 0.3027385175228119, + "learning_rate": 7.197780722074056e-05, + "loss": 1.8007, + "step": 12202 + }, + { + "epoch": 3.7455494168201353, + "grad_norm": 0.31242382526397705, + "learning_rate": 7.197334248014477e-05, + "loss": 1.8089, 
+ "step": 12203 + }, + { + "epoch": 3.74585635359116, + "grad_norm": 0.3673859238624573, + "learning_rate": 7.196887752239551e-05, + "loss": 1.8017, + "step": 12204 + }, + { + "epoch": 3.7461632903621855, + "grad_norm": 0.3152726888656616, + "learning_rate": 7.196441234753695e-05, + "loss": 1.7108, + "step": 12205 + }, + { + "epoch": 3.7464702271332104, + "grad_norm": 0.2606927156448364, + "learning_rate": 7.195994695561319e-05, + "loss": 1.8066, + "step": 12206 + }, + { + "epoch": 3.7467771639042358, + "grad_norm": 0.37624871730804443, + "learning_rate": 7.195548134666836e-05, + "loss": 1.725, + "step": 12207 + }, + { + "epoch": 3.747084100675261, + "grad_norm": 0.4138187766075134, + "learning_rate": 7.195101552074658e-05, + "loss": 1.7838, + "step": 12208 + }, + { + "epoch": 3.747391037446286, + "grad_norm": 0.3668459951877594, + "learning_rate": 7.194654947789204e-05, + "loss": 1.7575, + "step": 12209 + }, + { + "epoch": 3.7476979742173113, + "grad_norm": 0.27947792410850525, + "learning_rate": 7.19420832181488e-05, + "loss": 1.792, + "step": 12210 + }, + { + "epoch": 3.748004910988336, + "grad_norm": 0.2507692873477936, + "learning_rate": 7.193761674156103e-05, + "loss": 1.7752, + "step": 12211 + }, + { + "epoch": 3.7483118477593615, + "grad_norm": 0.3209949731826782, + "learning_rate": 7.193315004817289e-05, + "loss": 1.8491, + "step": 12212 + }, + { + "epoch": 3.748618784530387, + "grad_norm": 0.32883042097091675, + "learning_rate": 7.192868313802849e-05, + "loss": 1.8135, + "step": 12213 + }, + { + "epoch": 3.7489257213014118, + "grad_norm": 0.2450616955757141, + "learning_rate": 7.192421601117201e-05, + "loss": 1.7722, + "step": 12214 + }, + { + "epoch": 3.749232658072437, + "grad_norm": 0.2545110285282135, + "learning_rate": 7.191974866764757e-05, + "loss": 1.7866, + "step": 12215 + }, + { + "epoch": 3.749539594843462, + "grad_norm": 0.264017790555954, + "learning_rate": 7.191528110749932e-05, + "loss": 1.778, + "step": 12216 + }, + { + "epoch": 
3.7498465316144873, + "grad_norm": 0.3156309425830841, + "learning_rate": 7.191081333077142e-05, + "loss": 1.7917, + "step": 12217 + }, + { + "epoch": 3.7501534683855127, + "grad_norm": 0.3578774631023407, + "learning_rate": 7.190634533750802e-05, + "loss": 1.8468, + "step": 12218 + }, + { + "epoch": 3.750460405156538, + "grad_norm": 0.30735981464385986, + "learning_rate": 7.19018771277533e-05, + "loss": 1.7502, + "step": 12219 + }, + { + "epoch": 3.750767341927563, + "grad_norm": 0.22870220243930817, + "learning_rate": 7.189740870155135e-05, + "loss": 1.7686, + "step": 12220 + }, + { + "epoch": 3.7510742786985882, + "grad_norm": 0.30297720432281494, + "learning_rate": 7.18929400589464e-05, + "loss": 1.826, + "step": 12221 + }, + { + "epoch": 3.751381215469613, + "grad_norm": 0.2735389173030853, + "learning_rate": 7.188847119998257e-05, + "loss": 1.8142, + "step": 12222 + }, + { + "epoch": 3.7516881522406385, + "grad_norm": 0.2823885679244995, + "learning_rate": 7.188400212470405e-05, + "loss": 1.8028, + "step": 12223 + }, + { + "epoch": 3.751995089011664, + "grad_norm": 0.4184139370918274, + "learning_rate": 7.187953283315499e-05, + "loss": 1.8467, + "step": 12224 + }, + { + "epoch": 3.7523020257826887, + "grad_norm": 0.3559226095676422, + "learning_rate": 7.187506332537957e-05, + "loss": 1.7416, + "step": 12225 + }, + { + "epoch": 3.752608962553714, + "grad_norm": 0.26055800914764404, + "learning_rate": 7.187059360142194e-05, + "loss": 1.8309, + "step": 12226 + }, + { + "epoch": 3.752915899324739, + "grad_norm": 0.28032660484313965, + "learning_rate": 7.186612366132629e-05, + "loss": 1.7926, + "step": 12227 + }, + { + "epoch": 3.7532228360957642, + "grad_norm": 0.26229965686798096, + "learning_rate": 7.18616535051368e-05, + "loss": 1.7368, + "step": 12228 + }, + { + "epoch": 3.7535297728667896, + "grad_norm": 0.2779417634010315, + "learning_rate": 7.185718313289763e-05, + "loss": 1.8418, + "step": 12229 + }, + { + "epoch": 3.7538367096378145, + "grad_norm": 
0.26164770126342773, + "learning_rate": 7.185271254465295e-05, + "loss": 1.7511, + "step": 12230 + }, + { + "epoch": 3.75414364640884, + "grad_norm": 0.30725157260894775, + "learning_rate": 7.184824174044698e-05, + "loss": 1.7661, + "step": 12231 + }, + { + "epoch": 3.7544505831798647, + "grad_norm": 0.33111417293548584, + "learning_rate": 7.184377072032386e-05, + "loss": 1.7341, + "step": 12232 + }, + { + "epoch": 3.75475751995089, + "grad_norm": 0.23978343605995178, + "learning_rate": 7.183929948432779e-05, + "loss": 1.7151, + "step": 12233 + }, + { + "epoch": 3.7550644567219154, + "grad_norm": 0.3057664632797241, + "learning_rate": 7.183482803250299e-05, + "loss": 1.8446, + "step": 12234 + }, + { + "epoch": 3.7553713934929407, + "grad_norm": 0.2629055678844452, + "learning_rate": 7.18303563648936e-05, + "loss": 1.7415, + "step": 12235 + }, + { + "epoch": 3.7556783302639656, + "grad_norm": 0.22703498601913452, + "learning_rate": 7.182588448154386e-05, + "loss": 1.8188, + "step": 12236 + }, + { + "epoch": 3.755985267034991, + "grad_norm": 0.3014034032821655, + "learning_rate": 7.182141238249792e-05, + "loss": 1.8634, + "step": 12237 + }, + { + "epoch": 3.756292203806016, + "grad_norm": 0.28859084844589233, + "learning_rate": 7.181694006779998e-05, + "loss": 1.7509, + "step": 12238 + }, + { + "epoch": 3.756599140577041, + "grad_norm": 0.293720543384552, + "learning_rate": 7.181246753749426e-05, + "loss": 1.777, + "step": 12239 + }, + { + "epoch": 3.7569060773480665, + "grad_norm": 0.2374580055475235, + "learning_rate": 7.180799479162496e-05, + "loss": 1.7492, + "step": 12240 + }, + { + "epoch": 3.7572130141190914, + "grad_norm": 0.30106452107429504, + "learning_rate": 7.180352183023627e-05, + "loss": 1.7538, + "step": 12241 + }, + { + "epoch": 3.7575199508901167, + "grad_norm": 0.3504682183265686, + "learning_rate": 7.179904865337238e-05, + "loss": 1.7477, + "step": 12242 + }, + { + "epoch": 3.7578268876611416, + "grad_norm": 0.2901679575443268, + "learning_rate": 
7.179457526107754e-05, + "loss": 1.9412, + "step": 12243 + }, + { + "epoch": 3.758133824432167, + "grad_norm": 0.37690606713294983, + "learning_rate": 7.179010165339591e-05, + "loss": 1.8222, + "step": 12244 + }, + { + "epoch": 3.7584407612031923, + "grad_norm": 0.45126965641975403, + "learning_rate": 7.178562783037172e-05, + "loss": 1.8563, + "step": 12245 + }, + { + "epoch": 3.758747697974217, + "grad_norm": 0.2747548818588257, + "learning_rate": 7.178115379204921e-05, + "loss": 1.7179, + "step": 12246 + }, + { + "epoch": 3.7590546347452425, + "grad_norm": 0.43243977427482605, + "learning_rate": 7.177667953847257e-05, + "loss": 1.8157, + "step": 12247 + }, + { + "epoch": 3.7593615715162674, + "grad_norm": 0.529448390007019, + "learning_rate": 7.177220506968602e-05, + "loss": 1.8113, + "step": 12248 + }, + { + "epoch": 3.7596685082872927, + "grad_norm": 0.3099314868450165, + "learning_rate": 7.176773038573377e-05, + "loss": 1.7833, + "step": 12249 + }, + { + "epoch": 3.759975445058318, + "grad_norm": 0.3111872375011444, + "learning_rate": 7.176325548666004e-05, + "loss": 1.7965, + "step": 12250 + }, + { + "epoch": 3.7602823818293434, + "grad_norm": 0.38437551259994507, + "learning_rate": 7.175878037250907e-05, + "loss": 1.7822, + "step": 12251 + }, + { + "epoch": 3.7605893186003683, + "grad_norm": 0.33643704652786255, + "learning_rate": 7.175430504332509e-05, + "loss": 1.7839, + "step": 12252 + }, + { + "epoch": 3.7608962553713936, + "grad_norm": 0.24705304205417633, + "learning_rate": 7.174982949915232e-05, + "loss": 1.8302, + "step": 12253 + }, + { + "epoch": 3.7612031921424185, + "grad_norm": 0.3615458309650421, + "learning_rate": 7.174535374003497e-05, + "loss": 1.7963, + "step": 12254 + }, + { + "epoch": 3.761510128913444, + "grad_norm": 0.36486589908599854, + "learning_rate": 7.17408777660173e-05, + "loss": 1.7933, + "step": 12255 + }, + { + "epoch": 3.761817065684469, + "grad_norm": 0.2566867172718048, + "learning_rate": 7.173640157714352e-05, + "loss": 
1.7254, + "step": 12256 + }, + { + "epoch": 3.762124002455494, + "grad_norm": 0.2602523863315582, + "learning_rate": 7.17319251734579e-05, + "loss": 1.7357, + "step": 12257 + }, + { + "epoch": 3.7624309392265194, + "grad_norm": 0.3626105785369873, + "learning_rate": 7.172744855500464e-05, + "loss": 1.7971, + "step": 12258 + }, + { + "epoch": 3.7627378759975443, + "grad_norm": 0.36327603459358215, + "learning_rate": 7.172297172182802e-05, + "loss": 1.7819, + "step": 12259 + }, + { + "epoch": 3.7630448127685696, + "grad_norm": 0.25935736298561096, + "learning_rate": 7.171849467397224e-05, + "loss": 1.8112, + "step": 12260 + }, + { + "epoch": 3.763351749539595, + "grad_norm": 0.2779700756072998, + "learning_rate": 7.171401741148156e-05, + "loss": 1.786, + "step": 12261 + }, + { + "epoch": 3.7636586863106203, + "grad_norm": 0.3089013695716858, + "learning_rate": 7.170953993440025e-05, + "loss": 1.7808, + "step": 12262 + }, + { + "epoch": 3.763965623081645, + "grad_norm": 0.2562308609485626, + "learning_rate": 7.170506224277253e-05, + "loss": 1.8207, + "step": 12263 + }, + { + "epoch": 3.7642725598526705, + "grad_norm": 0.2907634973526001, + "learning_rate": 7.170058433664268e-05, + "loss": 1.7638, + "step": 12264 + }, + { + "epoch": 3.7645794966236954, + "grad_norm": 0.30341312289237976, + "learning_rate": 7.169610621605493e-05, + "loss": 1.7827, + "step": 12265 + }, + { + "epoch": 3.7648864333947207, + "grad_norm": 0.27091866731643677, + "learning_rate": 7.169162788105353e-05, + "loss": 1.786, + "step": 12266 + }, + { + "epoch": 3.765193370165746, + "grad_norm": 0.234042689204216, + "learning_rate": 7.168714933168277e-05, + "loss": 1.7638, + "step": 12267 + }, + { + "epoch": 3.765500306936771, + "grad_norm": 0.2477465271949768, + "learning_rate": 7.168267056798686e-05, + "loss": 1.7275, + "step": 12268 + }, + { + "epoch": 3.7658072437077963, + "grad_norm": 0.25578543543815613, + "learning_rate": 7.167819159001012e-05, + "loss": 1.7831, + "step": 12269 + }, + { + 
"epoch": 3.766114180478821, + "grad_norm": 0.26629674434661865, + "learning_rate": 7.167371239779678e-05, + "loss": 1.7866, + "step": 12270 + }, + { + "epoch": 3.7664211172498465, + "grad_norm": 0.31350967288017273, + "learning_rate": 7.16692329913911e-05, + "loss": 1.7755, + "step": 12271 + }, + { + "epoch": 3.766728054020872, + "grad_norm": 0.2670116126537323, + "learning_rate": 7.166475337083735e-05, + "loss": 1.7524, + "step": 12272 + }, + { + "epoch": 3.7670349907918967, + "grad_norm": 0.26503682136535645, + "learning_rate": 7.166027353617983e-05, + "loss": 1.7867, + "step": 12273 + }, + { + "epoch": 3.767341927562922, + "grad_norm": 0.3674192428588867, + "learning_rate": 7.165579348746278e-05, + "loss": 1.7604, + "step": 12274 + }, + { + "epoch": 3.767648864333947, + "grad_norm": 0.4120824337005615, + "learning_rate": 7.16513132247305e-05, + "loss": 1.7905, + "step": 12275 + }, + { + "epoch": 3.7679558011049723, + "grad_norm": 0.29074826836586, + "learning_rate": 7.164683274802723e-05, + "loss": 1.7539, + "step": 12276 + }, + { + "epoch": 3.7682627378759976, + "grad_norm": 0.22223204374313354, + "learning_rate": 7.164235205739729e-05, + "loss": 1.755, + "step": 12277 + }, + { + "epoch": 3.768569674647023, + "grad_norm": 0.23997461795806885, + "learning_rate": 7.163787115288494e-05, + "loss": 1.8024, + "step": 12278 + }, + { + "epoch": 3.768876611418048, + "grad_norm": 0.2556418776512146, + "learning_rate": 7.163339003453445e-05, + "loss": 1.7717, + "step": 12279 + }, + { + "epoch": 3.769183548189073, + "grad_norm": 0.3107141852378845, + "learning_rate": 7.162890870239013e-05, + "loss": 1.8257, + "step": 12280 + }, + { + "epoch": 3.769490484960098, + "grad_norm": 0.35293644666671753, + "learning_rate": 7.162442715649627e-05, + "loss": 1.7855, + "step": 12281 + }, + { + "epoch": 3.7697974217311234, + "grad_norm": 0.25989311933517456, + "learning_rate": 7.161994539689713e-05, + "loss": 1.7816, + "step": 12282 + }, + { + "epoch": 3.7701043585021488, + 
"grad_norm": 0.25615137815475464, + "learning_rate": 7.161546342363701e-05, + "loss": 1.7738, + "step": 12283 + }, + { + "epoch": 3.7704112952731736, + "grad_norm": 0.29345229268074036, + "learning_rate": 7.161098123676023e-05, + "loss": 1.8496, + "step": 12284 + }, + { + "epoch": 3.770718232044199, + "grad_norm": 0.2975969612598419, + "learning_rate": 7.160649883631105e-05, + "loss": 1.7342, + "step": 12285 + }, + { + "epoch": 3.771025168815224, + "grad_norm": 0.28458064794540405, + "learning_rate": 7.16020162223338e-05, + "loss": 1.8253, + "step": 12286 + }, + { + "epoch": 3.771332105586249, + "grad_norm": 0.2798703908920288, + "learning_rate": 7.159753339487276e-05, + "loss": 1.746, + "step": 12287 + }, + { + "epoch": 3.7716390423572745, + "grad_norm": 0.380044549703598, + "learning_rate": 7.159305035397223e-05, + "loss": 1.769, + "step": 12288 + }, + { + "epoch": 3.7719459791282994, + "grad_norm": 0.28760263323783875, + "learning_rate": 7.158856709967654e-05, + "loss": 1.7466, + "step": 12289 + }, + { + "epoch": 3.7722529158993248, + "grad_norm": 0.23314130306243896, + "learning_rate": 7.158408363202996e-05, + "loss": 1.7545, + "step": 12290 + }, + { + "epoch": 3.7725598526703497, + "grad_norm": 0.2864209711551666, + "learning_rate": 7.15795999510768e-05, + "loss": 1.7549, + "step": 12291 + }, + { + "epoch": 3.772866789441375, + "grad_norm": 0.2605510354042053, + "learning_rate": 7.15751160568614e-05, + "loss": 1.7684, + "step": 12292 + }, + { + "epoch": 3.7731737262124003, + "grad_norm": 0.2475409358739853, + "learning_rate": 7.157063194942806e-05, + "loss": 1.7841, + "step": 12293 + }, + { + "epoch": 3.7734806629834257, + "grad_norm": 0.22479289770126343, + "learning_rate": 7.15661476288211e-05, + "loss": 1.7592, + "step": 12294 + }, + { + "epoch": 3.7737875997544506, + "grad_norm": 0.22076937556266785, + "learning_rate": 7.156166309508482e-05, + "loss": 1.7853, + "step": 12295 + }, + { + "epoch": 3.774094536525476, + "grad_norm": 0.26082465052604675, + 
"learning_rate": 7.155717834826353e-05, + "loss": 1.7828, + "step": 12296 + }, + { + "epoch": 3.7744014732965008, + "grad_norm": 0.24771755933761597, + "learning_rate": 7.15526933884016e-05, + "loss": 1.758, + "step": 12297 + }, + { + "epoch": 3.774708410067526, + "grad_norm": 0.23806311190128326, + "learning_rate": 7.15482082155433e-05, + "loss": 1.7237, + "step": 12298 + }, + { + "epoch": 3.7750153468385514, + "grad_norm": 0.24822844564914703, + "learning_rate": 7.154372282973299e-05, + "loss": 1.7828, + "step": 12299 + }, + { + "epoch": 3.7753222836095763, + "grad_norm": 0.24423740804195404, + "learning_rate": 7.153923723101496e-05, + "loss": 1.8014, + "step": 12300 + }, + { + "epoch": 3.7756292203806017, + "grad_norm": 0.24966634809970856, + "learning_rate": 7.15347514194336e-05, + "loss": 1.8005, + "step": 12301 + }, + { + "epoch": 3.7759361571516266, + "grad_norm": 0.2549348473548889, + "learning_rate": 7.153026539503317e-05, + "loss": 1.8473, + "step": 12302 + }, + { + "epoch": 3.776243093922652, + "grad_norm": 0.23709465563297272, + "learning_rate": 7.152577915785807e-05, + "loss": 1.8031, + "step": 12303 + }, + { + "epoch": 3.7765500306936772, + "grad_norm": 0.28554168343544006, + "learning_rate": 7.152129270795258e-05, + "loss": 1.7836, + "step": 12304 + }, + { + "epoch": 3.776856967464702, + "grad_norm": 0.2568756639957428, + "learning_rate": 7.151680604536107e-05, + "loss": 1.7345, + "step": 12305 + }, + { + "epoch": 3.7771639042357275, + "grad_norm": 0.23883797228336334, + "learning_rate": 7.151231917012787e-05, + "loss": 1.7342, + "step": 12306 + }, + { + "epoch": 3.7774708410067523, + "grad_norm": 0.24026677012443542, + "learning_rate": 7.150783208229732e-05, + "loss": 1.8156, + "step": 12307 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.25756222009658813, + "learning_rate": 7.150334478191376e-05, + "loss": 1.8204, + "step": 12308 + }, + { + "epoch": 3.778084714548803, + "grad_norm": 0.24917428195476532, + "learning_rate": 
7.149885726902156e-05, + "loss": 1.7867, + "step": 12309 + }, + { + "epoch": 3.7783916513198283, + "grad_norm": 0.26269277930259705, + "learning_rate": 7.149436954366504e-05, + "loss": 1.8233, + "step": 12310 + }, + { + "epoch": 3.7786985880908532, + "grad_norm": 0.2502293586730957, + "learning_rate": 7.148988160588857e-05, + "loss": 1.8329, + "step": 12311 + }, + { + "epoch": 3.7790055248618786, + "grad_norm": 0.24845796823501587, + "learning_rate": 7.14853934557365e-05, + "loss": 1.7936, + "step": 12312 + }, + { + "epoch": 3.7793124616329035, + "grad_norm": 0.2453537881374359, + "learning_rate": 7.148090509325315e-05, + "loss": 1.8149, + "step": 12313 + }, + { + "epoch": 3.779619398403929, + "grad_norm": 0.2336922138929367, + "learning_rate": 7.147641651848293e-05, + "loss": 1.7826, + "step": 12314 + }, + { + "epoch": 3.779926335174954, + "grad_norm": 0.25542667508125305, + "learning_rate": 7.147192773147017e-05, + "loss": 1.801, + "step": 12315 + }, + { + "epoch": 3.780233271945979, + "grad_norm": 0.2301866114139557, + "learning_rate": 7.146743873225923e-05, + "loss": 1.7302, + "step": 12316 + }, + { + "epoch": 3.7805402087170044, + "grad_norm": 0.25821468234062195, + "learning_rate": 7.14629495208945e-05, + "loss": 1.7704, + "step": 12317 + }, + { + "epoch": 3.7808471454880292, + "grad_norm": 0.22537970542907715, + "learning_rate": 7.145846009742029e-05, + "loss": 1.7281, + "step": 12318 + }, + { + "epoch": 3.7811540822590546, + "grad_norm": 0.2565869688987732, + "learning_rate": 7.145397046188102e-05, + "loss": 1.8077, + "step": 12319 + }, + { + "epoch": 3.78146101903008, + "grad_norm": 0.2588396966457367, + "learning_rate": 7.144948061432105e-05, + "loss": 1.7438, + "step": 12320 + }, + { + "epoch": 3.781767955801105, + "grad_norm": 0.2538135349750519, + "learning_rate": 7.144499055478472e-05, + "loss": 1.8253, + "step": 12321 + }, + { + "epoch": 3.78207489257213, + "grad_norm": 0.2272680401802063, + "learning_rate": 7.144050028331644e-05, + "loss": 1.7408, + 
"step": 12322 + }, + { + "epoch": 3.782381829343155, + "grad_norm": 0.25010406970977783, + "learning_rate": 7.143600979996055e-05, + "loss": 1.8219, + "step": 12323 + }, + { + "epoch": 3.7826887661141804, + "grad_norm": 0.2560291290283203, + "learning_rate": 7.143151910476144e-05, + "loss": 1.7734, + "step": 12324 + }, + { + "epoch": 3.7829957028852057, + "grad_norm": 0.24927431344985962, + "learning_rate": 7.142702819776352e-05, + "loss": 1.7682, + "step": 12325 + }, + { + "epoch": 3.783302639656231, + "grad_norm": 0.2501368224620819, + "learning_rate": 7.142253707901114e-05, + "loss": 1.818, + "step": 12326 + }, + { + "epoch": 3.783609576427256, + "grad_norm": 0.3132917284965515, + "learning_rate": 7.141804574854871e-05, + "loss": 1.7793, + "step": 12327 + }, + { + "epoch": 3.7839165131982813, + "grad_norm": 0.24229925870895386, + "learning_rate": 7.141355420642057e-05, + "loss": 1.7585, + "step": 12328 + }, + { + "epoch": 3.784223449969306, + "grad_norm": 0.22612906992435455, + "learning_rate": 7.140906245267116e-05, + "loss": 1.7374, + "step": 12329 + }, + { + "epoch": 3.7845303867403315, + "grad_norm": 0.26354333758354187, + "learning_rate": 7.140457048734482e-05, + "loss": 1.7751, + "step": 12330 + }, + { + "epoch": 3.784837323511357, + "grad_norm": 0.21500451862812042, + "learning_rate": 7.140007831048599e-05, + "loss": 1.7827, + "step": 12331 + }, + { + "epoch": 3.7851442602823817, + "grad_norm": 0.2826332151889801, + "learning_rate": 7.139558592213904e-05, + "loss": 1.7522, + "step": 12332 + }, + { + "epoch": 3.785451197053407, + "grad_norm": 0.3217725455760956, + "learning_rate": 7.139109332234837e-05, + "loss": 1.8758, + "step": 12333 + }, + { + "epoch": 3.785758133824432, + "grad_norm": 0.26934614777565, + "learning_rate": 7.138660051115837e-05, + "loss": 1.8322, + "step": 12334 + }, + { + "epoch": 3.7860650705954573, + "grad_norm": 0.2653827667236328, + "learning_rate": 7.138210748861346e-05, + "loss": 1.7651, + "step": 12335 + }, + { + "epoch": 
3.7863720073664826, + "grad_norm": 0.30470311641693115, + "learning_rate": 7.137761425475802e-05, + "loss": 1.855, + "step": 12336 + }, + { + "epoch": 3.786678944137508, + "grad_norm": 0.2558726370334625, + "learning_rate": 7.137312080963647e-05, + "loss": 1.7174, + "step": 12337 + }, + { + "epoch": 3.786985880908533, + "grad_norm": 0.24025602638721466, + "learning_rate": 7.136862715329322e-05, + "loss": 1.7565, + "step": 12338 + }, + { + "epoch": 3.787292817679558, + "grad_norm": 0.34205392003059387, + "learning_rate": 7.136413328577267e-05, + "loss": 1.8116, + "step": 12339 + }, + { + "epoch": 3.787599754450583, + "grad_norm": 0.4069152772426605, + "learning_rate": 7.135963920711923e-05, + "loss": 1.7662, + "step": 12340 + }, + { + "epoch": 3.7879066912216084, + "grad_norm": 0.3915627598762512, + "learning_rate": 7.13551449173773e-05, + "loss": 1.81, + "step": 12341 + }, + { + "epoch": 3.7882136279926337, + "grad_norm": 0.27136507630348206, + "learning_rate": 7.135065041659134e-05, + "loss": 1.7845, + "step": 12342 + }, + { + "epoch": 3.7885205647636586, + "grad_norm": 0.2924078106880188, + "learning_rate": 7.134615570480572e-05, + "loss": 1.8606, + "step": 12343 + }, + { + "epoch": 3.788827501534684, + "grad_norm": 0.35581526160240173, + "learning_rate": 7.134166078206488e-05, + "loss": 1.7785, + "step": 12344 + }, + { + "epoch": 3.789134438305709, + "grad_norm": 0.3003756105899811, + "learning_rate": 7.133716564841324e-05, + "loss": 1.7321, + "step": 12345 + }, + { + "epoch": 3.789441375076734, + "grad_norm": 0.2586000859737396, + "learning_rate": 7.133267030389524e-05, + "loss": 1.7889, + "step": 12346 + }, + { + "epoch": 3.7897483118477595, + "grad_norm": 0.28053075075149536, + "learning_rate": 7.132817474855527e-05, + "loss": 1.8216, + "step": 12347 + }, + { + "epoch": 3.7900552486187844, + "grad_norm": 0.3064870834350586, + "learning_rate": 7.132367898243777e-05, + "loss": 1.7528, + "step": 12348 + }, + { + "epoch": 3.7903621853898097, + "grad_norm": 
0.3045158386230469, + "learning_rate": 7.131918300558719e-05, + "loss": 1.8251, + "step": 12349 + }, + { + "epoch": 3.7906691221608346, + "grad_norm": 0.2438485324382782, + "learning_rate": 7.131468681804794e-05, + "loss": 1.7505, + "step": 12350 + }, + { + "epoch": 3.79097605893186, + "grad_norm": 0.24239958822727203, + "learning_rate": 7.131019041986447e-05, + "loss": 1.7544, + "step": 12351 + }, + { + "epoch": 3.7912829957028853, + "grad_norm": 0.24632441997528076, + "learning_rate": 7.130569381108121e-05, + "loss": 1.7485, + "step": 12352 + }, + { + "epoch": 3.7915899324739106, + "grad_norm": 0.22553624212741852, + "learning_rate": 7.13011969917426e-05, + "loss": 1.803, + "step": 12353 + }, + { + "epoch": 3.7918968692449355, + "grad_norm": 0.2164420485496521, + "learning_rate": 7.129669996189306e-05, + "loss": 1.7307, + "step": 12354 + }, + { + "epoch": 3.792203806015961, + "grad_norm": 0.25104281306266785, + "learning_rate": 7.129220272157705e-05, + "loss": 1.8154, + "step": 12355 + }, + { + "epoch": 3.7925107427869857, + "grad_norm": 0.25533202290534973, + "learning_rate": 7.128770527083903e-05, + "loss": 1.8046, + "step": 12356 + }, + { + "epoch": 3.792817679558011, + "grad_norm": 0.24428130686283112, + "learning_rate": 7.128320760972341e-05, + "loss": 1.7984, + "step": 12357 + }, + { + "epoch": 3.7931246163290364, + "grad_norm": 0.2366408109664917, + "learning_rate": 7.127870973827467e-05, + "loss": 1.7781, + "step": 12358 + }, + { + "epoch": 3.7934315531000613, + "grad_norm": 0.2558888792991638, + "learning_rate": 7.127421165653722e-05, + "loss": 1.7858, + "step": 12359 + }, + { + "epoch": 3.7937384898710866, + "grad_norm": 0.25825443863868713, + "learning_rate": 7.126971336455558e-05, + "loss": 1.8292, + "step": 12360 + }, + { + "epoch": 3.7940454266421115, + "grad_norm": 0.2554624080657959, + "learning_rate": 7.126521486237415e-05, + "loss": 1.822, + "step": 12361 + }, + { + "epoch": 3.794352363413137, + "grad_norm": 0.3030763268470764, + 
"learning_rate": 7.126071615003742e-05, + "loss": 1.8261, + "step": 12362 + }, + { + "epoch": 3.794659300184162, + "grad_norm": 0.3047907054424286, + "learning_rate": 7.125621722758981e-05, + "loss": 1.8419, + "step": 12363 + }, + { + "epoch": 3.794966236955187, + "grad_norm": 0.27782654762268066, + "learning_rate": 7.12517180950758e-05, + "loss": 1.7959, + "step": 12364 + }, + { + "epoch": 3.7952731737262124, + "grad_norm": 0.24526572227478027, + "learning_rate": 7.124721875253986e-05, + "loss": 1.7313, + "step": 12365 + }, + { + "epoch": 3.7955801104972373, + "grad_norm": 0.23718179762363434, + "learning_rate": 7.124271920002646e-05, + "loss": 1.7479, + "step": 12366 + }, + { + "epoch": 3.7958870472682626, + "grad_norm": 0.2880019247531891, + "learning_rate": 7.123821943758004e-05, + "loss": 1.7792, + "step": 12367 + }, + { + "epoch": 3.796193984039288, + "grad_norm": 0.28923723101615906, + "learning_rate": 7.123371946524511e-05, + "loss": 1.7474, + "step": 12368 + }, + { + "epoch": 3.7965009208103133, + "grad_norm": 0.2281525880098343, + "learning_rate": 7.122921928306612e-05, + "loss": 1.8106, + "step": 12369 + }, + { + "epoch": 3.796807857581338, + "grad_norm": 0.34825438261032104, + "learning_rate": 7.122471889108752e-05, + "loss": 1.8076, + "step": 12370 + }, + { + "epoch": 3.7971147943523635, + "grad_norm": 0.41145995259284973, + "learning_rate": 7.122021828935382e-05, + "loss": 1.7692, + "step": 12371 + }, + { + "epoch": 3.7974217311233884, + "grad_norm": 0.31711262464523315, + "learning_rate": 7.12157174779095e-05, + "loss": 1.8101, + "step": 12372 + }, + { + "epoch": 3.7977286678944138, + "grad_norm": 0.3044308125972748, + "learning_rate": 7.1211216456799e-05, + "loss": 1.8238, + "step": 12373 + }, + { + "epoch": 3.798035604665439, + "grad_norm": 0.3750055134296417, + "learning_rate": 7.120671522606683e-05, + "loss": 1.7323, + "step": 12374 + }, + { + "epoch": 3.798342541436464, + "grad_norm": 0.38852599263191223, + "learning_rate": 
7.120221378575749e-05, + "loss": 1.8402, + "step": 12375 + }, + { + "epoch": 3.7986494782074893, + "grad_norm": 0.3430371582508087, + "learning_rate": 7.119771213591541e-05, + "loss": 1.8369, + "step": 12376 + }, + { + "epoch": 3.798956414978514, + "grad_norm": 0.4787428677082062, + "learning_rate": 7.119321027658515e-05, + "loss": 1.7977, + "step": 12377 + }, + { + "epoch": 3.7992633517495396, + "grad_norm": 0.4263977110385895, + "learning_rate": 7.118870820781114e-05, + "loss": 1.8208, + "step": 12378 + }, + { + "epoch": 3.799570288520565, + "grad_norm": 0.28649669885635376, + "learning_rate": 7.118420592963793e-05, + "loss": 1.773, + "step": 12379 + }, + { + "epoch": 3.7998772252915898, + "grad_norm": 0.26070261001586914, + "learning_rate": 7.117970344210996e-05, + "loss": 1.6866, + "step": 12380 + }, + { + "epoch": 3.800184162062615, + "grad_norm": 0.30127593874931335, + "learning_rate": 7.117520074527173e-05, + "loss": 1.7208, + "step": 12381 + }, + { + "epoch": 3.80049109883364, + "grad_norm": 0.23639258742332458, + "learning_rate": 7.117069783916777e-05, + "loss": 1.7504, + "step": 12382 + }, + { + "epoch": 3.8007980356046653, + "grad_norm": 0.2852858901023865, + "learning_rate": 7.116619472384256e-05, + "loss": 1.7954, + "step": 12383 + }, + { + "epoch": 3.8011049723756907, + "grad_norm": 0.2673225998878479, + "learning_rate": 7.116169139934063e-05, + "loss": 1.7562, + "step": 12384 + }, + { + "epoch": 3.801411909146716, + "grad_norm": 0.21615394949913025, + "learning_rate": 7.115718786570644e-05, + "loss": 1.7126, + "step": 12385 + }, + { + "epoch": 3.801718845917741, + "grad_norm": 0.2165435254573822, + "learning_rate": 7.115268412298453e-05, + "loss": 1.7171, + "step": 12386 + }, + { + "epoch": 3.8020257826887662, + "grad_norm": 0.280564546585083, + "learning_rate": 7.114818017121939e-05, + "loss": 1.7711, + "step": 12387 + }, + { + "epoch": 3.802332719459791, + "grad_norm": 0.3023521304130554, + "learning_rate": 7.114367601045555e-05, + "loss": 1.7538, 
+ "step": 12388 + }, + { + "epoch": 3.8026396562308165, + "grad_norm": 0.27252480387687683, + "learning_rate": 7.11391716407375e-05, + "loss": 1.7604, + "step": 12389 + }, + { + "epoch": 3.802946593001842, + "grad_norm": 0.2122909128665924, + "learning_rate": 7.113466706210976e-05, + "loss": 1.716, + "step": 12390 + }, + { + "epoch": 3.8032535297728667, + "grad_norm": 0.30141574144363403, + "learning_rate": 7.113016227461686e-05, + "loss": 1.7636, + "step": 12391 + }, + { + "epoch": 3.803560466543892, + "grad_norm": 0.33359697461128235, + "learning_rate": 7.112565727830331e-05, + "loss": 1.7805, + "step": 12392 + }, + { + "epoch": 3.803867403314917, + "grad_norm": 0.3161376714706421, + "learning_rate": 7.112115207321364e-05, + "loss": 1.7974, + "step": 12393 + }, + { + "epoch": 3.8041743400859422, + "grad_norm": 0.29028698801994324, + "learning_rate": 7.111664665939235e-05, + "loss": 1.83, + "step": 12394 + }, + { + "epoch": 3.8044812768569676, + "grad_norm": 0.38829556107521057, + "learning_rate": 7.1112141036884e-05, + "loss": 1.8684, + "step": 12395 + }, + { + "epoch": 3.804788213627993, + "grad_norm": 0.4118283987045288, + "learning_rate": 7.110763520573309e-05, + "loss": 1.7812, + "step": 12396 + }, + { + "epoch": 3.805095150399018, + "grad_norm": 0.3907717168331146, + "learning_rate": 7.110312916598416e-05, + "loss": 1.7789, + "step": 12397 + }, + { + "epoch": 3.805402087170043, + "grad_norm": 0.2768644690513611, + "learning_rate": 7.109862291768173e-05, + "loss": 1.8575, + "step": 12398 + }, + { + "epoch": 3.805709023941068, + "grad_norm": 0.3234006464481354, + "learning_rate": 7.109411646087035e-05, + "loss": 1.7485, + "step": 12399 + }, + { + "epoch": 3.8060159607120934, + "grad_norm": 0.415475994348526, + "learning_rate": 7.108960979559454e-05, + "loss": 1.7363, + "step": 12400 + }, + { + "epoch": 3.8063228974831187, + "grad_norm": 0.38654613494873047, + "learning_rate": 7.108510292189884e-05, + "loss": 1.7907, + "step": 12401 + }, + { + "epoch": 
3.8066298342541436, + "grad_norm": 0.2541481852531433, + "learning_rate": 7.10805958398278e-05, + "loss": 1.8458, + "step": 12402 + }, + { + "epoch": 3.806936771025169, + "grad_norm": 0.32562851905822754, + "learning_rate": 7.107608854942597e-05, + "loss": 1.7989, + "step": 12403 + }, + { + "epoch": 3.807243707796194, + "grad_norm": 0.3628395199775696, + "learning_rate": 7.107158105073786e-05, + "loss": 1.8044, + "step": 12404 + }, + { + "epoch": 3.807550644567219, + "grad_norm": 0.3363969027996063, + "learning_rate": 7.106707334380805e-05, + "loss": 1.8078, + "step": 12405 + }, + { + "epoch": 3.8078575813382445, + "grad_norm": 0.2853989601135254, + "learning_rate": 7.106256542868108e-05, + "loss": 1.7913, + "step": 12406 + }, + { + "epoch": 3.8081645181092694, + "grad_norm": 0.33455806970596313, + "learning_rate": 7.105805730540148e-05, + "loss": 1.7252, + "step": 12407 + }, + { + "epoch": 3.8084714548802947, + "grad_norm": 0.28103405237197876, + "learning_rate": 7.105354897401382e-05, + "loss": 1.6942, + "step": 12408 + }, + { + "epoch": 3.8087783916513196, + "grad_norm": 0.23230718076229095, + "learning_rate": 7.104904043456264e-05, + "loss": 1.7723, + "step": 12409 + }, + { + "epoch": 3.809085328422345, + "grad_norm": 0.2883053421974182, + "learning_rate": 7.104453168709251e-05, + "loss": 1.8015, + "step": 12410 + }, + { + "epoch": 3.8093922651933703, + "grad_norm": 0.28462252020835876, + "learning_rate": 7.104002273164798e-05, + "loss": 1.791, + "step": 12411 + }, + { + "epoch": 3.8096992019643956, + "grad_norm": 0.3004699647426605, + "learning_rate": 7.103551356827363e-05, + "loss": 1.8401, + "step": 12412 + }, + { + "epoch": 3.8100061387354205, + "grad_norm": 0.2546156048774719, + "learning_rate": 7.1031004197014e-05, + "loss": 1.7645, + "step": 12413 + }, + { + "epoch": 3.810313075506446, + "grad_norm": 0.24532915651798248, + "learning_rate": 7.102649461791364e-05, + "loss": 1.8, + "step": 12414 + }, + { + "epoch": 3.8106200122774707, + "grad_norm": 
0.2432405799627304, + "learning_rate": 7.102198483101716e-05, + "loss": 1.7957, + "step": 12415 + }, + { + "epoch": 3.810926949048496, + "grad_norm": 0.24405215680599213, + "learning_rate": 7.101747483636908e-05, + "loss": 1.79, + "step": 12416 + }, + { + "epoch": 3.8112338858195214, + "grad_norm": 0.29519838094711304, + "learning_rate": 7.101296463401401e-05, + "loss": 1.8087, + "step": 12417 + }, + { + "epoch": 3.8115408225905463, + "grad_norm": 0.28205612301826477, + "learning_rate": 7.100845422399652e-05, + "loss": 1.7897, + "step": 12418 + }, + { + "epoch": 3.8118477593615716, + "grad_norm": 0.25014567375183105, + "learning_rate": 7.100394360636115e-05, + "loss": 1.7574, + "step": 12419 + }, + { + "epoch": 3.8121546961325965, + "grad_norm": 0.3133499026298523, + "learning_rate": 7.099943278115251e-05, + "loss": 1.7957, + "step": 12420 + }, + { + "epoch": 3.812461632903622, + "grad_norm": 0.3706473708152771, + "learning_rate": 7.099492174841516e-05, + "loss": 1.8519, + "step": 12421 + }, + { + "epoch": 3.812768569674647, + "grad_norm": 0.30085715651512146, + "learning_rate": 7.09904105081937e-05, + "loss": 1.778, + "step": 12422 + }, + { + "epoch": 3.813075506445672, + "grad_norm": 0.23897981643676758, + "learning_rate": 7.09858990605327e-05, + "loss": 1.7289, + "step": 12423 + }, + { + "epoch": 3.8133824432166974, + "grad_norm": 0.30046290159225464, + "learning_rate": 7.098138740547673e-05, + "loss": 1.8838, + "step": 12424 + }, + { + "epoch": 3.8136893799877223, + "grad_norm": 0.32126328349113464, + "learning_rate": 7.097687554307041e-05, + "loss": 1.7916, + "step": 12425 + }, + { + "epoch": 3.8139963167587476, + "grad_norm": 0.2922256886959076, + "learning_rate": 7.097236347335829e-05, + "loss": 1.8305, + "step": 12426 + }, + { + "epoch": 3.814303253529773, + "grad_norm": 0.2772706151008606, + "learning_rate": 7.0967851196385e-05, + "loss": 1.7694, + "step": 12427 + }, + { + "epoch": 3.8146101903007983, + "grad_norm": 0.25763455033302307, + "learning_rate": 
7.096333871219511e-05, + "loss": 1.8716, + "step": 12428 + }, + { + "epoch": 3.814917127071823, + "grad_norm": 0.2631739377975464, + "learning_rate": 7.095882602083322e-05, + "loss": 1.7771, + "step": 12429 + }, + { + "epoch": 3.8152240638428485, + "grad_norm": 0.29229632019996643, + "learning_rate": 7.095431312234392e-05, + "loss": 1.7865, + "step": 12430 + }, + { + "epoch": 3.8155310006138734, + "grad_norm": 0.2672729790210724, + "learning_rate": 7.094980001677181e-05, + "loss": 1.7848, + "step": 12431 + }, + { + "epoch": 3.8158379373848987, + "grad_norm": 0.2388373166322708, + "learning_rate": 7.094528670416152e-05, + "loss": 1.75, + "step": 12432 + }, + { + "epoch": 3.816144874155924, + "grad_norm": 0.2385305017232895, + "learning_rate": 7.094077318455762e-05, + "loss": 1.748, + "step": 12433 + }, + { + "epoch": 3.816451810926949, + "grad_norm": 0.25421401858329773, + "learning_rate": 7.093625945800471e-05, + "loss": 1.779, + "step": 12434 + }, + { + "epoch": 3.8167587476979743, + "grad_norm": 0.2785158157348633, + "learning_rate": 7.093174552454743e-05, + "loss": 1.8295, + "step": 12435 + }, + { + "epoch": 3.817065684468999, + "grad_norm": 0.2907472252845764, + "learning_rate": 7.092723138423036e-05, + "loss": 1.8216, + "step": 12436 + }, + { + "epoch": 3.8173726212400245, + "grad_norm": 0.253955215215683, + "learning_rate": 7.092271703709814e-05, + "loss": 1.8394, + "step": 12437 + }, + { + "epoch": 3.81767955801105, + "grad_norm": 0.32139912247657776, + "learning_rate": 7.091820248319537e-05, + "loss": 1.8634, + "step": 12438 + }, + { + "epoch": 3.8179864947820747, + "grad_norm": 0.25890466570854187, + "learning_rate": 7.091368772256664e-05, + "loss": 1.7336, + "step": 12439 + }, + { + "epoch": 3.8182934315531, + "grad_norm": 0.2823775112628937, + "learning_rate": 7.090917275525661e-05, + "loss": 1.7927, + "step": 12440 + }, + { + "epoch": 3.818600368324125, + "grad_norm": 0.28739333152770996, + "learning_rate": 7.090465758130988e-05, + "loss": 1.7807, + 
"step": 12441 + }, + { + "epoch": 3.8189073050951503, + "grad_norm": 0.36823949217796326, + "learning_rate": 7.090014220077106e-05, + "loss": 1.7288, + "step": 12442 + }, + { + "epoch": 3.8192142418661756, + "grad_norm": 0.3061312735080719, + "learning_rate": 7.089562661368479e-05, + "loss": 1.8039, + "step": 12443 + }, + { + "epoch": 3.819521178637201, + "grad_norm": 0.25867924094200134, + "learning_rate": 7.089111082009569e-05, + "loss": 1.7678, + "step": 12444 + }, + { + "epoch": 3.819828115408226, + "grad_norm": 0.26834985613822937, + "learning_rate": 7.088659482004837e-05, + "loss": 1.7592, + "step": 12445 + }, + { + "epoch": 3.820135052179251, + "grad_norm": 0.25608211755752563, + "learning_rate": 7.08820786135875e-05, + "loss": 1.7622, + "step": 12446 + }, + { + "epoch": 3.820441988950276, + "grad_norm": 0.2512456774711609, + "learning_rate": 7.087756220075769e-05, + "loss": 1.7648, + "step": 12447 + }, + { + "epoch": 3.8207489257213014, + "grad_norm": 0.2434878647327423, + "learning_rate": 7.087304558160355e-05, + "loss": 1.7435, + "step": 12448 + }, + { + "epoch": 3.8210558624923268, + "grad_norm": 0.26456570625305176, + "learning_rate": 7.086852875616978e-05, + "loss": 1.7342, + "step": 12449 + }, + { + "epoch": 3.8213627992633517, + "grad_norm": 0.2958984971046448, + "learning_rate": 7.086401172450095e-05, + "loss": 1.8532, + "step": 12450 + }, + { + "epoch": 3.821669736034377, + "grad_norm": 0.25939157605171204, + "learning_rate": 7.085949448664172e-05, + "loss": 1.7746, + "step": 12451 + }, + { + "epoch": 3.821976672805402, + "grad_norm": 0.2210223525762558, + "learning_rate": 7.085497704263675e-05, + "loss": 1.7745, + "step": 12452 + }, + { + "epoch": 3.822283609576427, + "grad_norm": 0.2409319430589676, + "learning_rate": 7.085045939253068e-05, + "loss": 1.7981, + "step": 12453 + }, + { + "epoch": 3.8225905463474525, + "grad_norm": 0.26331812143325806, + "learning_rate": 7.084594153636815e-05, + "loss": 1.8163, + "step": 12454 + }, + { + "epoch": 
3.8228974831184774, + "grad_norm": 0.2613828480243683, + "learning_rate": 7.08414234741938e-05, + "loss": 1.8362, + "step": 12455 + }, + { + "epoch": 3.8232044198895028, + "grad_norm": 0.3139529228210449, + "learning_rate": 7.083690520605228e-05, + "loss": 1.8247, + "step": 12456 + }, + { + "epoch": 3.8235113566605277, + "grad_norm": 0.2958570718765259, + "learning_rate": 7.083238673198826e-05, + "loss": 1.8011, + "step": 12457 + }, + { + "epoch": 3.823818293431553, + "grad_norm": 0.2517626881599426, + "learning_rate": 7.082786805204639e-05, + "loss": 1.7353, + "step": 12458 + }, + { + "epoch": 3.8241252302025783, + "grad_norm": 0.2443888783454895, + "learning_rate": 7.082334916627132e-05, + "loss": 1.7916, + "step": 12459 + }, + { + "epoch": 3.8244321669736037, + "grad_norm": 0.283514142036438, + "learning_rate": 7.08188300747077e-05, + "loss": 1.8048, + "step": 12460 + }, + { + "epoch": 3.8247391037446286, + "grad_norm": 0.24775351583957672, + "learning_rate": 7.08143107774002e-05, + "loss": 1.8145, + "step": 12461 + }, + { + "epoch": 3.825046040515654, + "grad_norm": 0.27904003858566284, + "learning_rate": 7.080979127439347e-05, + "loss": 1.8003, + "step": 12462 + }, + { + "epoch": 3.825352977286679, + "grad_norm": 0.24997512996196747, + "learning_rate": 7.08052715657322e-05, + "loss": 1.7962, + "step": 12463 + }, + { + "epoch": 3.825659914057704, + "grad_norm": 0.25874343514442444, + "learning_rate": 7.080075165146104e-05, + "loss": 1.7861, + "step": 12464 + }, + { + "epoch": 3.8259668508287294, + "grad_norm": 0.2964434027671814, + "learning_rate": 7.079623153162467e-05, + "loss": 1.7618, + "step": 12465 + }, + { + "epoch": 3.8262737875997543, + "grad_norm": 0.26403337717056274, + "learning_rate": 7.079171120626774e-05, + "loss": 1.8016, + "step": 12466 + }, + { + "epoch": 3.8265807243707797, + "grad_norm": 0.28369295597076416, + "learning_rate": 7.078719067543494e-05, + "loss": 1.7517, + "step": 12467 + }, + { + "epoch": 3.8268876611418046, + "grad_norm": 
0.254312127828598, + "learning_rate": 7.078266993917093e-05, + "loss": 1.8085, + "step": 12468 + }, + { + "epoch": 3.82719459791283, + "grad_norm": 0.24992622435092926, + "learning_rate": 7.077814899752038e-05, + "loss": 1.7657, + "step": 12469 + }, + { + "epoch": 3.8275015346838552, + "grad_norm": 0.26485762000083923, + "learning_rate": 7.077362785052802e-05, + "loss": 1.7303, + "step": 12470 + }, + { + "epoch": 3.8278084714548806, + "grad_norm": 0.29864901304244995, + "learning_rate": 7.076910649823846e-05, + "loss": 1.7734, + "step": 12471 + }, + { + "epoch": 3.8281154082259055, + "grad_norm": 0.2973599433898926, + "learning_rate": 7.076458494069644e-05, + "loss": 1.8055, + "step": 12472 + }, + { + "epoch": 3.828422344996931, + "grad_norm": 0.2150362730026245, + "learning_rate": 7.07600631779466e-05, + "loss": 1.7377, + "step": 12473 + }, + { + "epoch": 3.8287292817679557, + "grad_norm": 0.26443010568618774, + "learning_rate": 7.075554121003367e-05, + "loss": 1.837, + "step": 12474 + }, + { + "epoch": 3.829036218538981, + "grad_norm": 0.27365007996559143, + "learning_rate": 7.075101903700231e-05, + "loss": 1.7784, + "step": 12475 + }, + { + "epoch": 3.8293431553100064, + "grad_norm": 0.22037263214588165, + "learning_rate": 7.074649665889721e-05, + "loss": 1.8182, + "step": 12476 + }, + { + "epoch": 3.8296500920810312, + "grad_norm": 0.29614946246147156, + "learning_rate": 7.074197407576308e-05, + "loss": 1.7993, + "step": 12477 + }, + { + "epoch": 3.8299570288520566, + "grad_norm": 0.25135520100593567, + "learning_rate": 7.07374512876446e-05, + "loss": 1.8211, + "step": 12478 + }, + { + "epoch": 3.8302639656230815, + "grad_norm": 0.2711503207683563, + "learning_rate": 7.073292829458645e-05, + "loss": 1.8274, + "step": 12479 + }, + { + "epoch": 3.830570902394107, + "grad_norm": 0.38659265637397766, + "learning_rate": 7.072840509663338e-05, + "loss": 1.796, + "step": 12480 + }, + { + "epoch": 3.830877839165132, + "grad_norm": 0.39382728934288025, + 
"learning_rate": 7.072388169383005e-05, + "loss": 1.8439, + "step": 12481 + }, + { + "epoch": 3.831184775936157, + "grad_norm": 0.27570033073425293, + "learning_rate": 7.071935808622118e-05, + "loss": 1.8155, + "step": 12482 + }, + { + "epoch": 3.8314917127071824, + "grad_norm": 0.29054465889930725, + "learning_rate": 7.071483427385147e-05, + "loss": 1.754, + "step": 12483 + }, + { + "epoch": 3.8317986494782073, + "grad_norm": 0.4138031303882599, + "learning_rate": 7.071031025676562e-05, + "loss": 1.7686, + "step": 12484 + }, + { + "epoch": 3.8321055862492326, + "grad_norm": 0.3447251617908478, + "learning_rate": 7.070578603500833e-05, + "loss": 1.8135, + "step": 12485 + }, + { + "epoch": 3.832412523020258, + "grad_norm": 0.265115886926651, + "learning_rate": 7.070126160862436e-05, + "loss": 1.803, + "step": 12486 + }, + { + "epoch": 3.8327194597912833, + "grad_norm": 0.4288817346096039, + "learning_rate": 7.069673697765837e-05, + "loss": 1.7814, + "step": 12487 + }, + { + "epoch": 3.833026396562308, + "grad_norm": 0.4890103340148926, + "learning_rate": 7.06922121421551e-05, + "loss": 1.8318, + "step": 12488 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.3676142990589142, + "learning_rate": 7.068768710215928e-05, + "loss": 1.7792, + "step": 12489 + }, + { + "epoch": 3.8336402701043584, + "grad_norm": 0.23254090547561646, + "learning_rate": 7.068316185771557e-05, + "loss": 1.7154, + "step": 12490 + }, + { + "epoch": 3.8339472068753837, + "grad_norm": 0.35014036297798157, + "learning_rate": 7.067863640886876e-05, + "loss": 1.7031, + "step": 12491 + }, + { + "epoch": 3.834254143646409, + "grad_norm": 0.32155317068099976, + "learning_rate": 7.067411075566353e-05, + "loss": 1.7692, + "step": 12492 + }, + { + "epoch": 3.834561080417434, + "grad_norm": 0.260772705078125, + "learning_rate": 7.066958489814463e-05, + "loss": 1.7488, + "step": 12493 + }, + { + "epoch": 3.8348680171884593, + "grad_norm": 0.2624910771846771, + "learning_rate": 7.066505883635678e-05, + 
"loss": 1.7436, + "step": 12494 + }, + { + "epoch": 3.835174953959484, + "grad_norm": 0.2782299220561981, + "learning_rate": 7.066053257034471e-05, + "loss": 1.8219, + "step": 12495 + }, + { + "epoch": 3.8354818907305095, + "grad_norm": 0.2749497890472412, + "learning_rate": 7.065600610015312e-05, + "loss": 1.8068, + "step": 12496 + }, + { + "epoch": 3.835788827501535, + "grad_norm": 0.2730359733104706, + "learning_rate": 7.06514794258268e-05, + "loss": 1.7588, + "step": 12497 + }, + { + "epoch": 3.8360957642725597, + "grad_norm": 0.3606291711330414, + "learning_rate": 7.064695254741044e-05, + "loss": 1.8509, + "step": 12498 + }, + { + "epoch": 3.836402701043585, + "grad_norm": 0.23282989859580994, + "learning_rate": 7.064242546494879e-05, + "loss": 1.7444, + "step": 12499 + }, + { + "epoch": 3.83670963781461, + "grad_norm": 0.2554507255554199, + "learning_rate": 7.06378981784866e-05, + "loss": 1.7486, + "step": 12500 + }, + { + "epoch": 3.8370165745856353, + "grad_norm": 0.2916143834590912, + "learning_rate": 7.06333706880686e-05, + "loss": 1.8035, + "step": 12501 + }, + { + "epoch": 3.8373235113566606, + "grad_norm": 0.23719090223312378, + "learning_rate": 7.062884299373955e-05, + "loss": 1.7896, + "step": 12502 + }, + { + "epoch": 3.837630448127686, + "grad_norm": 0.2596152126789093, + "learning_rate": 7.062431509554417e-05, + "loss": 1.7944, + "step": 12503 + }, + { + "epoch": 3.837937384898711, + "grad_norm": 0.29140764474868774, + "learning_rate": 7.061978699352723e-05, + "loss": 1.7988, + "step": 12504 + }, + { + "epoch": 3.838244321669736, + "grad_norm": 0.3421068489551544, + "learning_rate": 7.061525868773347e-05, + "loss": 1.751, + "step": 12505 + }, + { + "epoch": 3.838551258440761, + "grad_norm": 0.2705349624156952, + "learning_rate": 7.061073017820764e-05, + "loss": 1.7578, + "step": 12506 + }, + { + "epoch": 3.8388581952117864, + "grad_norm": 0.2403286248445511, + "learning_rate": 7.060620146499448e-05, + "loss": 1.8422, + "step": 12507 + }, + { + 
"epoch": 3.8391651319828117, + "grad_norm": 0.3860442042350769, + "learning_rate": 7.060167254813876e-05, + "loss": 1.8168, + "step": 12508 + }, + { + "epoch": 3.8394720687538366, + "grad_norm": 0.4729512631893158, + "learning_rate": 7.059714342768526e-05, + "loss": 1.7786, + "step": 12509 + }, + { + "epoch": 3.839779005524862, + "grad_norm": 0.3522968888282776, + "learning_rate": 7.059261410367871e-05, + "loss": 1.8749, + "step": 12510 + }, + { + "epoch": 3.840085942295887, + "grad_norm": 0.28071436285972595, + "learning_rate": 7.058808457616386e-05, + "loss": 1.7959, + "step": 12511 + }, + { + "epoch": 3.840392879066912, + "grad_norm": 0.4356439411640167, + "learning_rate": 7.05835548451855e-05, + "loss": 1.8045, + "step": 12512 + }, + { + "epoch": 3.8406998158379375, + "grad_norm": 0.4051562249660492, + "learning_rate": 7.057902491078839e-05, + "loss": 1.7909, + "step": 12513 + }, + { + "epoch": 3.8410067526089624, + "grad_norm": 0.2817205488681793, + "learning_rate": 7.057449477301728e-05, + "loss": 1.8736, + "step": 12514 + }, + { + "epoch": 3.8413136893799877, + "grad_norm": 0.33369559049606323, + "learning_rate": 7.056996443191697e-05, + "loss": 1.7799, + "step": 12515 + }, + { + "epoch": 3.8416206261510126, + "grad_norm": 0.369954913854599, + "learning_rate": 7.056543388753221e-05, + "loss": 1.795, + "step": 12516 + }, + { + "epoch": 3.841927562922038, + "grad_norm": 0.289474755525589, + "learning_rate": 7.056090313990778e-05, + "loss": 1.786, + "step": 12517 + }, + { + "epoch": 3.8422344996930633, + "grad_norm": 0.2431849092245102, + "learning_rate": 7.055637218908845e-05, + "loss": 1.7363, + "step": 12518 + }, + { + "epoch": 3.8425414364640886, + "grad_norm": 0.3736060857772827, + "learning_rate": 7.0551841035119e-05, + "loss": 1.8234, + "step": 12519 + }, + { + "epoch": 3.8428483732351135, + "grad_norm": 0.34008854627609253, + "learning_rate": 7.054730967804422e-05, + "loss": 1.8001, + "step": 12520 + }, + { + "epoch": 3.843155310006139, + "grad_norm": 
0.24852876365184784, + "learning_rate": 7.054277811790887e-05, + "loss": 1.8298, + "step": 12521 + }, + { + "epoch": 3.8434622467771637, + "grad_norm": 0.3491046726703644, + "learning_rate": 7.053824635475777e-05, + "loss": 1.7336, + "step": 12522 + }, + { + "epoch": 3.843769183548189, + "grad_norm": 0.38757824897766113, + "learning_rate": 7.053371438863566e-05, + "loss": 1.8241, + "step": 12523 + }, + { + "epoch": 3.8440761203192144, + "grad_norm": 0.2607647180557251, + "learning_rate": 7.052918221958735e-05, + "loss": 1.7813, + "step": 12524 + }, + { + "epoch": 3.8443830570902393, + "grad_norm": 0.25634410977363586, + "learning_rate": 7.052464984765764e-05, + "loss": 1.7836, + "step": 12525 + }, + { + "epoch": 3.8446899938612646, + "grad_norm": 0.3113503158092499, + "learning_rate": 7.052011727289129e-05, + "loss": 1.8477, + "step": 12526 + }, + { + "epoch": 3.8449969306322895, + "grad_norm": 0.2852596044540405, + "learning_rate": 7.051558449533313e-05, + "loss": 1.7607, + "step": 12527 + }, + { + "epoch": 3.845303867403315, + "grad_norm": 0.24841541051864624, + "learning_rate": 7.051105151502795e-05, + "loss": 1.8109, + "step": 12528 + }, + { + "epoch": 3.84561080417434, + "grad_norm": 0.2231549620628357, + "learning_rate": 7.050651833202053e-05, + "loss": 1.7245, + "step": 12529 + }, + { + "epoch": 3.845917740945365, + "grad_norm": 0.21975892782211304, + "learning_rate": 7.050198494635566e-05, + "loss": 1.7512, + "step": 12530 + }, + { + "epoch": 3.8462246777163904, + "grad_norm": 0.2546280324459076, + "learning_rate": 7.049745135807816e-05, + "loss": 1.8003, + "step": 12531 + }, + { + "epoch": 3.8465316144874153, + "grad_norm": 0.21507929265499115, + "learning_rate": 7.049291756723284e-05, + "loss": 1.7616, + "step": 12532 + }, + { + "epoch": 3.8468385512584407, + "grad_norm": 0.24927987158298492, + "learning_rate": 7.04883835738645e-05, + "loss": 1.7519, + "step": 12533 + }, + { + "epoch": 3.847145488029466, + "grad_norm": 0.24988602101802826, + 
"learning_rate": 7.048384937801793e-05, + "loss": 1.7966, + "step": 12534 + }, + { + "epoch": 3.8474524248004913, + "grad_norm": 0.24039845168590546, + "learning_rate": 7.047931497973798e-05, + "loss": 1.7834, + "step": 12535 + }, + { + "epoch": 3.847759361571516, + "grad_norm": 0.22826696932315826, + "learning_rate": 7.047478037906943e-05, + "loss": 1.7334, + "step": 12536 + }, + { + "epoch": 3.8480662983425415, + "grad_norm": 0.22260744869709015, + "learning_rate": 7.047024557605708e-05, + "loss": 1.787, + "step": 12537 + }, + { + "epoch": 3.8483732351135664, + "grad_norm": 0.2457917332649231, + "learning_rate": 7.046571057074578e-05, + "loss": 1.7865, + "step": 12538 + }, + { + "epoch": 3.8486801718845918, + "grad_norm": 0.23952928185462952, + "learning_rate": 7.046117536318035e-05, + "loss": 1.7764, + "step": 12539 + }, + { + "epoch": 3.848987108655617, + "grad_norm": 0.22186748683452606, + "learning_rate": 7.045663995340557e-05, + "loss": 1.7917, + "step": 12540 + }, + { + "epoch": 3.849294045426642, + "grad_norm": 0.24234962463378906, + "learning_rate": 7.045210434146629e-05, + "loss": 1.7697, + "step": 12541 + }, + { + "epoch": 3.8496009821976673, + "grad_norm": 0.2510770857334137, + "learning_rate": 7.044756852740732e-05, + "loss": 1.8012, + "step": 12542 + }, + { + "epoch": 3.849907918968692, + "grad_norm": 0.24910703301429749, + "learning_rate": 7.044303251127349e-05, + "loss": 1.831, + "step": 12543 + }, + { + "epoch": 3.8502148557397176, + "grad_norm": 0.3159966468811035, + "learning_rate": 7.043849629310964e-05, + "loss": 1.8029, + "step": 12544 + }, + { + "epoch": 3.850521792510743, + "grad_norm": 0.3155403733253479, + "learning_rate": 7.04339598729606e-05, + "loss": 1.7429, + "step": 12545 + }, + { + "epoch": 3.8508287292817682, + "grad_norm": 0.3037515878677368, + "learning_rate": 7.042942325087117e-05, + "loss": 1.8186, + "step": 12546 + }, + { + "epoch": 3.851135666052793, + "grad_norm": 0.2319766730070114, + "learning_rate": 
7.042488642688621e-05, + "loss": 1.7853, + "step": 12547 + }, + { + "epoch": 3.8514426028238185, + "grad_norm": 0.23911969363689423, + "learning_rate": 7.042034940105055e-05, + "loss": 1.8314, + "step": 12548 + }, + { + "epoch": 3.8517495395948433, + "grad_norm": 0.2541846036911011, + "learning_rate": 7.041581217340905e-05, + "loss": 1.8289, + "step": 12549 + }, + { + "epoch": 3.8520564763658687, + "grad_norm": 0.22234943509101868, + "learning_rate": 7.04112747440065e-05, + "loss": 1.7847, + "step": 12550 + }, + { + "epoch": 3.852363413136894, + "grad_norm": 0.2747870981693268, + "learning_rate": 7.04067371128878e-05, + "loss": 1.7875, + "step": 12551 + }, + { + "epoch": 3.852670349907919, + "grad_norm": 0.28589147329330444, + "learning_rate": 7.040219928009775e-05, + "loss": 1.7289, + "step": 12552 + }, + { + "epoch": 3.8529772866789442, + "grad_norm": 0.21180351078510284, + "learning_rate": 7.039766124568119e-05, + "loss": 1.7611, + "step": 12553 + }, + { + "epoch": 3.853284223449969, + "grad_norm": 0.27751782536506653, + "learning_rate": 7.0393123009683e-05, + "loss": 1.7481, + "step": 12554 + }, + { + "epoch": 3.8535911602209945, + "grad_norm": 0.32883307337760925, + "learning_rate": 7.038858457214802e-05, + "loss": 1.7271, + "step": 12555 + }, + { + "epoch": 3.85389809699202, + "grad_norm": 0.30965641140937805, + "learning_rate": 7.03840459331211e-05, + "loss": 1.81, + "step": 12556 + }, + { + "epoch": 3.8542050337630447, + "grad_norm": 0.25184348225593567, + "learning_rate": 7.037950709264709e-05, + "loss": 1.7642, + "step": 12557 + }, + { + "epoch": 3.85451197053407, + "grad_norm": 0.2376822829246521, + "learning_rate": 7.037496805077084e-05, + "loss": 1.7774, + "step": 12558 + }, + { + "epoch": 3.854818907305095, + "grad_norm": 0.2395993024110794, + "learning_rate": 7.03704288075372e-05, + "loss": 1.8397, + "step": 12559 + }, + { + "epoch": 3.8551258440761202, + "grad_norm": 0.26460394263267517, + "learning_rate": 7.036588936299107e-05, + "loss": 1.7472, + 
"step": 12560 + }, + { + "epoch": 3.8554327808471456, + "grad_norm": 0.34742459654808044, + "learning_rate": 7.036134971717725e-05, + "loss": 1.8003, + "step": 12561 + }, + { + "epoch": 3.855739717618171, + "grad_norm": 0.2829316556453705, + "learning_rate": 7.035680987014068e-05, + "loss": 1.7765, + "step": 12562 + }, + { + "epoch": 3.856046654389196, + "grad_norm": 0.3087223172187805, + "learning_rate": 7.035226982192615e-05, + "loss": 1.8462, + "step": 12563 + }, + { + "epoch": 3.856353591160221, + "grad_norm": 0.2806380093097687, + "learning_rate": 7.034772957257858e-05, + "loss": 1.7704, + "step": 12564 + }, + { + "epoch": 3.856660527931246, + "grad_norm": 0.25598087906837463, + "learning_rate": 7.03431891221428e-05, + "loss": 1.7843, + "step": 12565 + }, + { + "epoch": 3.8569674647022714, + "grad_norm": 0.30833700299263, + "learning_rate": 7.033864847066373e-05, + "loss": 1.8404, + "step": 12566 + }, + { + "epoch": 3.8572744014732967, + "grad_norm": 0.29562532901763916, + "learning_rate": 7.03341076181862e-05, + "loss": 1.8044, + "step": 12567 + }, + { + "epoch": 3.8575813382443216, + "grad_norm": 0.2901719808578491, + "learning_rate": 7.03295665647551e-05, + "loss": 1.7789, + "step": 12568 + }, + { + "epoch": 3.857888275015347, + "grad_norm": 0.25453686714172363, + "learning_rate": 7.03250253104153e-05, + "loss": 1.6792, + "step": 12569 + }, + { + "epoch": 3.858195211786372, + "grad_norm": 0.26009416580200195, + "learning_rate": 7.03204838552117e-05, + "loss": 1.7835, + "step": 12570 + }, + { + "epoch": 3.858502148557397, + "grad_norm": 0.28074127435684204, + "learning_rate": 7.031594219918916e-05, + "loss": 1.7932, + "step": 12571 + }, + { + "epoch": 3.8588090853284225, + "grad_norm": 0.3341725170612335, + "learning_rate": 7.031140034239258e-05, + "loss": 1.7439, + "step": 12572 + }, + { + "epoch": 3.8591160220994474, + "grad_norm": 0.28142449259757996, + "learning_rate": 7.030685828486684e-05, + "loss": 1.8263, + "step": 12573 + }, + { + "epoch": 
3.8594229588704727, + "grad_norm": 0.2571438252925873, + "learning_rate": 7.030231602665681e-05, + "loss": 1.7628, + "step": 12574 + }, + { + "epoch": 3.8597298956414976, + "grad_norm": 0.3079041838645935, + "learning_rate": 7.029777356780741e-05, + "loss": 1.7879, + "step": 12575 + }, + { + "epoch": 3.860036832412523, + "grad_norm": 0.2605433464050293, + "learning_rate": 7.029323090836349e-05, + "loss": 1.7841, + "step": 12576 + }, + { + "epoch": 3.8603437691835483, + "grad_norm": 0.24069640040397644, + "learning_rate": 7.028868804836999e-05, + "loss": 1.7939, + "step": 12577 + }, + { + "epoch": 3.8606507059545736, + "grad_norm": 0.26801639795303345, + "learning_rate": 7.028414498787177e-05, + "loss": 1.8082, + "step": 12578 + }, + { + "epoch": 3.8609576427255985, + "grad_norm": 0.28828585147857666, + "learning_rate": 7.027960172691375e-05, + "loss": 1.8094, + "step": 12579 + }, + { + "epoch": 3.861264579496624, + "grad_norm": 0.22927051782608032, + "learning_rate": 7.027505826554082e-05, + "loss": 1.7758, + "step": 12580 + }, + { + "epoch": 3.8615715162676487, + "grad_norm": 0.25755998492240906, + "learning_rate": 7.027051460379788e-05, + "loss": 1.8429, + "step": 12581 + }, + { + "epoch": 3.861878453038674, + "grad_norm": 0.23636581003665924, + "learning_rate": 7.026597074172982e-05, + "loss": 1.7662, + "step": 12582 + }, + { + "epoch": 3.8621853898096994, + "grad_norm": 0.22599349915981293, + "learning_rate": 7.026142667938156e-05, + "loss": 1.7199, + "step": 12583 + }, + { + "epoch": 3.8624923265807243, + "grad_norm": 0.2504875659942627, + "learning_rate": 7.025688241679802e-05, + "loss": 1.8473, + "step": 12584 + }, + { + "epoch": 3.8627992633517496, + "grad_norm": 0.3012976348400116, + "learning_rate": 7.025233795402408e-05, + "loss": 1.8715, + "step": 12585 + }, + { + "epoch": 3.8631062001227745, + "grad_norm": 0.31703677773475647, + "learning_rate": 7.024779329110469e-05, + "loss": 1.8143, + "step": 12586 + }, + { + "epoch": 3.8634131368938, + "grad_norm": 
0.27287593483924866, + "learning_rate": 7.024324842808472e-05, + "loss": 1.7227, + "step": 12587 + }, + { + "epoch": 3.863720073664825, + "grad_norm": 0.24663801491260529, + "learning_rate": 7.02387033650091e-05, + "loss": 1.7529, + "step": 12588 + }, + { + "epoch": 3.86402701043585, + "grad_norm": 0.26127147674560547, + "learning_rate": 7.023415810192277e-05, + "loss": 1.7629, + "step": 12589 + }, + { + "epoch": 3.8643339472068754, + "grad_norm": 0.3457142114639282, + "learning_rate": 7.022961263887062e-05, + "loss": 1.8212, + "step": 12590 + }, + { + "epoch": 3.8646408839779003, + "grad_norm": 0.3296070694923401, + "learning_rate": 7.022506697589759e-05, + "loss": 1.7907, + "step": 12591 + }, + { + "epoch": 3.8649478207489256, + "grad_norm": 0.29474303126335144, + "learning_rate": 7.022052111304858e-05, + "loss": 1.7866, + "step": 12592 + }, + { + "epoch": 3.865254757519951, + "grad_norm": 0.2535403072834015, + "learning_rate": 7.021597505036852e-05, + "loss": 1.7607, + "step": 12593 + }, + { + "epoch": 3.8655616942909763, + "grad_norm": 0.26691222190856934, + "learning_rate": 7.021142878790237e-05, + "loss": 1.8063, + "step": 12594 + }, + { + "epoch": 3.865868631062001, + "grad_norm": 0.2784755229949951, + "learning_rate": 7.020688232569502e-05, + "loss": 1.8065, + "step": 12595 + }, + { + "epoch": 3.8661755678330265, + "grad_norm": 0.23714317381381989, + "learning_rate": 7.020233566379142e-05, + "loss": 1.8317, + "step": 12596 + }, + { + "epoch": 3.8664825046040514, + "grad_norm": 0.25010553002357483, + "learning_rate": 7.019778880223649e-05, + "loss": 1.8493, + "step": 12597 + }, + { + "epoch": 3.8667894413750767, + "grad_norm": 0.2798489034175873, + "learning_rate": 7.01932417410752e-05, + "loss": 1.8134, + "step": 12598 + }, + { + "epoch": 3.867096378146102, + "grad_norm": 0.26199260354042053, + "learning_rate": 7.018869448035243e-05, + "loss": 1.6931, + "step": 12599 + }, + { + "epoch": 3.867403314917127, + "grad_norm": 0.24582891166210175, + 
"learning_rate": 7.018414702011314e-05, + "loss": 1.8076, + "step": 12600 + }, + { + "epoch": 3.8677102516881523, + "grad_norm": 0.25493237376213074, + "learning_rate": 7.01795993604023e-05, + "loss": 1.7851, + "step": 12601 + }, + { + "epoch": 3.868017188459177, + "grad_norm": 0.2607674300670624, + "learning_rate": 7.017505150126483e-05, + "loss": 1.7285, + "step": 12602 + }, + { + "epoch": 3.8683241252302025, + "grad_norm": 0.23629581928253174, + "learning_rate": 7.017050344274568e-05, + "loss": 1.8254, + "step": 12603 + }, + { + "epoch": 3.868631062001228, + "grad_norm": 0.3129318058490753, + "learning_rate": 7.016595518488979e-05, + "loss": 1.7914, + "step": 12604 + }, + { + "epoch": 3.8689379987722528, + "grad_norm": 0.3178271949291229, + "learning_rate": 7.01614067277421e-05, + "loss": 1.8139, + "step": 12605 + }, + { + "epoch": 3.869244935543278, + "grad_norm": 0.3230711817741394, + "learning_rate": 7.015685807134757e-05, + "loss": 1.8203, + "step": 12606 + }, + { + "epoch": 3.869551872314303, + "grad_norm": 0.26339825987815857, + "learning_rate": 7.015230921575118e-05, + "loss": 1.8022, + "step": 12607 + }, + { + "epoch": 3.8698588090853283, + "grad_norm": 0.25337356328964233, + "learning_rate": 7.014776016099785e-05, + "loss": 1.7779, + "step": 12608 + }, + { + "epoch": 3.8701657458563536, + "grad_norm": 0.2506195306777954, + "learning_rate": 7.014321090713253e-05, + "loss": 1.7858, + "step": 12609 + }, + { + "epoch": 3.870472682627379, + "grad_norm": 0.26249951124191284, + "learning_rate": 7.013866145420021e-05, + "loss": 1.8051, + "step": 12610 + }, + { + "epoch": 3.870779619398404, + "grad_norm": 0.25666534900665283, + "learning_rate": 7.013411180224581e-05, + "loss": 1.7945, + "step": 12611 + }, + { + "epoch": 3.871086556169429, + "grad_norm": 0.23901648819446564, + "learning_rate": 7.012956195131433e-05, + "loss": 1.7844, + "step": 12612 + }, + { + "epoch": 3.871393492940454, + "grad_norm": 0.26814451813697815, + "learning_rate": 
7.012501190145071e-05, + "loss": 1.7713, + "step": 12613 + }, + { + "epoch": 3.8717004297114794, + "grad_norm": 0.28377315402030945, + "learning_rate": 7.012046165269995e-05, + "loss": 1.7866, + "step": 12614 + }, + { + "epoch": 3.8720073664825048, + "grad_norm": 0.2751680612564087, + "learning_rate": 7.011591120510699e-05, + "loss": 1.7215, + "step": 12615 + }, + { + "epoch": 3.8723143032535297, + "grad_norm": 0.21988113224506378, + "learning_rate": 7.011136055871679e-05, + "loss": 1.8009, + "step": 12616 + }, + { + "epoch": 3.872621240024555, + "grad_norm": 0.26462143659591675, + "learning_rate": 7.010680971357434e-05, + "loss": 1.7618, + "step": 12617 + }, + { + "epoch": 3.87292817679558, + "grad_norm": 0.29054632782936096, + "learning_rate": 7.010225866972462e-05, + "loss": 1.7549, + "step": 12618 + }, + { + "epoch": 3.873235113566605, + "grad_norm": 0.31341224908828735, + "learning_rate": 7.00977074272126e-05, + "loss": 1.8827, + "step": 12619 + }, + { + "epoch": 3.8735420503376305, + "grad_norm": 0.24252115190029144, + "learning_rate": 7.009315598608324e-05, + "loss": 1.7544, + "step": 12620 + }, + { + "epoch": 3.873848987108656, + "grad_norm": 0.30036893486976624, + "learning_rate": 7.008860434638154e-05, + "loss": 1.7465, + "step": 12621 + }, + { + "epoch": 3.8741559238796808, + "grad_norm": 0.3217438757419586, + "learning_rate": 7.00840525081525e-05, + "loss": 1.72, + "step": 12622 + }, + { + "epoch": 3.874462860650706, + "grad_norm": 0.22507290542125702, + "learning_rate": 7.007950047144105e-05, + "loss": 1.7177, + "step": 12623 + }, + { + "epoch": 3.874769797421731, + "grad_norm": 0.3014441728591919, + "learning_rate": 7.007494823629224e-05, + "loss": 1.7502, + "step": 12624 + }, + { + "epoch": 3.8750767341927563, + "grad_norm": 0.3836904466152191, + "learning_rate": 7.0070395802751e-05, + "loss": 1.7971, + "step": 12625 + }, + { + "epoch": 3.8753836709637817, + "grad_norm": 0.33565691113471985, + "learning_rate": 7.006584317086235e-05, + "loss": 1.7439, 
+ "step": 12626 + }, + { + "epoch": 3.8756906077348066, + "grad_norm": 0.2292134314775467, + "learning_rate": 7.006129034067128e-05, + "loss": 1.7998, + "step": 12627 + }, + { + "epoch": 3.875997544505832, + "grad_norm": 0.26385873556137085, + "learning_rate": 7.005673731222277e-05, + "loss": 1.7914, + "step": 12628 + }, + { + "epoch": 3.876304481276857, + "grad_norm": 0.2854950428009033, + "learning_rate": 7.005218408556184e-05, + "loss": 1.7761, + "step": 12629 + }, + { + "epoch": 3.876611418047882, + "grad_norm": 0.34260645508766174, + "learning_rate": 7.004763066073348e-05, + "loss": 1.8015, + "step": 12630 + }, + { + "epoch": 3.8769183548189075, + "grad_norm": 0.3223683834075928, + "learning_rate": 7.004307703778267e-05, + "loss": 1.7453, + "step": 12631 + }, + { + "epoch": 3.8772252915899323, + "grad_norm": 0.24715089797973633, + "learning_rate": 7.003852321675442e-05, + "loss": 1.7813, + "step": 12632 + }, + { + "epoch": 3.8775322283609577, + "grad_norm": 0.22822390496730804, + "learning_rate": 7.003396919769377e-05, + "loss": 1.7982, + "step": 12633 + }, + { + "epoch": 3.8778391651319826, + "grad_norm": 0.24125081300735474, + "learning_rate": 7.002941498064565e-05, + "loss": 1.8606, + "step": 12634 + }, + { + "epoch": 3.878146101903008, + "grad_norm": 0.23512506484985352, + "learning_rate": 7.002486056565513e-05, + "loss": 1.7469, + "step": 12635 + }, + { + "epoch": 3.8784530386740332, + "grad_norm": 0.2908322215080261, + "learning_rate": 7.00203059527672e-05, + "loss": 1.796, + "step": 12636 + }, + { + "epoch": 3.8787599754450586, + "grad_norm": 0.22931252419948578, + "learning_rate": 7.001575114202689e-05, + "loss": 1.7482, + "step": 12637 + }, + { + "epoch": 3.8790669122160835, + "grad_norm": 0.22574284672737122, + "learning_rate": 7.001119613347917e-05, + "loss": 1.7698, + "step": 12638 + }, + { + "epoch": 3.879373848987109, + "grad_norm": 0.23129726946353912, + "learning_rate": 7.000664092716909e-05, + "loss": 1.776, + "step": 12639 + }, + { + "epoch": 
3.8796807857581337, + "grad_norm": 0.2763366401195526, + "learning_rate": 7.000208552314165e-05, + "loss": 1.7814, + "step": 12640 + }, + { + "epoch": 3.879987722529159, + "grad_norm": 0.29870158433914185, + "learning_rate": 6.99975299214419e-05, + "loss": 1.7467, + "step": 12641 + }, + { + "epoch": 3.8802946593001844, + "grad_norm": 0.33574381470680237, + "learning_rate": 6.999297412211484e-05, + "loss": 1.8159, + "step": 12642 + }, + { + "epoch": 3.8806015960712092, + "grad_norm": 0.30309897661209106, + "learning_rate": 6.998841812520547e-05, + "loss": 1.8454, + "step": 12643 + }, + { + "epoch": 3.8809085328422346, + "grad_norm": 0.27399247884750366, + "learning_rate": 6.998386193075886e-05, + "loss": 1.7956, + "step": 12644 + }, + { + "epoch": 3.8812154696132595, + "grad_norm": 0.28649580478668213, + "learning_rate": 6.997930553881998e-05, + "loss": 1.8308, + "step": 12645 + }, + { + "epoch": 3.881522406384285, + "grad_norm": 0.2716052532196045, + "learning_rate": 6.997474894943392e-05, + "loss": 1.7698, + "step": 12646 + }, + { + "epoch": 3.88182934315531, + "grad_norm": 0.21380536258220673, + "learning_rate": 6.997019216264567e-05, + "loss": 1.7028, + "step": 12647 + }, + { + "epoch": 3.882136279926335, + "grad_norm": 0.25262731313705444, + "learning_rate": 6.996563517850028e-05, + "loss": 1.8236, + "step": 12648 + }, + { + "epoch": 3.8824432166973604, + "grad_norm": 0.21150052547454834, + "learning_rate": 6.996107799704277e-05, + "loss": 1.7437, + "step": 12649 + }, + { + "epoch": 3.8827501534683853, + "grad_norm": 0.2614554464817047, + "learning_rate": 6.995652061831821e-05, + "loss": 1.7575, + "step": 12650 + }, + { + "epoch": 3.8830570902394106, + "grad_norm": 0.214684396982193, + "learning_rate": 6.995196304237159e-05, + "loss": 1.8195, + "step": 12651 + }, + { + "epoch": 3.883364027010436, + "grad_norm": 0.2226872444152832, + "learning_rate": 6.994740526924798e-05, + "loss": 1.7556, + "step": 12652 + }, + { + "epoch": 3.8836709637814613, + "grad_norm": 
0.22270764410495758, + "learning_rate": 6.994284729899246e-05, + "loss": 1.7536, + "step": 12653 + }, + { + "epoch": 3.883977900552486, + "grad_norm": 0.20683564245700836, + "learning_rate": 6.993828913165e-05, + "loss": 1.7728, + "step": 12654 + }, + { + "epoch": 3.8842848373235115, + "grad_norm": 0.23667018115520477, + "learning_rate": 6.993373076726568e-05, + "loss": 1.7819, + "step": 12655 + }, + { + "epoch": 3.8845917740945364, + "grad_norm": 0.2265234887599945, + "learning_rate": 6.992917220588455e-05, + "loss": 1.7502, + "step": 12656 + }, + { + "epoch": 3.8848987108655617, + "grad_norm": 0.24490754306316376, + "learning_rate": 6.992461344755168e-05, + "loss": 1.7513, + "step": 12657 + }, + { + "epoch": 3.885205647636587, + "grad_norm": 0.23001348972320557, + "learning_rate": 6.992005449231208e-05, + "loss": 1.733, + "step": 12658 + }, + { + "epoch": 3.885512584407612, + "grad_norm": 0.25424695014953613, + "learning_rate": 6.991549534021084e-05, + "loss": 1.7621, + "step": 12659 + }, + { + "epoch": 3.8858195211786373, + "grad_norm": 0.25552862882614136, + "learning_rate": 6.991093599129299e-05, + "loss": 1.7974, + "step": 12660 + }, + { + "epoch": 3.886126457949662, + "grad_norm": 0.26876959204673767, + "learning_rate": 6.99063764456036e-05, + "loss": 1.7924, + "step": 12661 + }, + { + "epoch": 3.8864333947206875, + "grad_norm": 0.2754429578781128, + "learning_rate": 6.990181670318772e-05, + "loss": 1.7981, + "step": 12662 + }, + { + "epoch": 3.886740331491713, + "grad_norm": 0.281818687915802, + "learning_rate": 6.989725676409044e-05, + "loss": 1.7328, + "step": 12663 + }, + { + "epoch": 3.8870472682627377, + "grad_norm": 0.21676552295684814, + "learning_rate": 6.989269662835681e-05, + "loss": 1.7376, + "step": 12664 + }, + { + "epoch": 3.887354205033763, + "grad_norm": 0.276115745306015, + "learning_rate": 6.98881362960319e-05, + "loss": 1.7784, + "step": 12665 + }, + { + "epoch": 3.887661141804788, + "grad_norm": 0.2806364893913269, + "learning_rate": 
6.988357576716075e-05, + "loss": 1.8078, + "step": 12666 + }, + { + "epoch": 3.8879680785758133, + "grad_norm": 0.27620184421539307, + "learning_rate": 6.987901504178845e-05, + "loss": 1.8115, + "step": 12667 + }, + { + "epoch": 3.8882750153468386, + "grad_norm": 0.23845402896404266, + "learning_rate": 6.987445411996009e-05, + "loss": 1.7485, + "step": 12668 + }, + { + "epoch": 3.888581952117864, + "grad_norm": 0.25063586235046387, + "learning_rate": 6.986989300172071e-05, + "loss": 1.7663, + "step": 12669 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.2417975515127182, + "learning_rate": 6.98653316871154e-05, + "loss": 1.7562, + "step": 12670 + }, + { + "epoch": 3.889195825659914, + "grad_norm": 0.24952733516693115, + "learning_rate": 6.986077017618923e-05, + "loss": 1.8063, + "step": 12671 + }, + { + "epoch": 3.889502762430939, + "grad_norm": 0.25847554206848145, + "learning_rate": 6.985620846898732e-05, + "loss": 1.7722, + "step": 12672 + }, + { + "epoch": 3.8898096992019644, + "grad_norm": 0.23762650787830353, + "learning_rate": 6.985164656555471e-05, + "loss": 1.8368, + "step": 12673 + }, + { + "epoch": 3.8901166359729897, + "grad_norm": 0.25346314907073975, + "learning_rate": 6.984708446593648e-05, + "loss": 1.7957, + "step": 12674 + }, + { + "epoch": 3.8904235727440146, + "grad_norm": 0.2466745674610138, + "learning_rate": 6.984252217017774e-05, + "loss": 1.8286, + "step": 12675 + }, + { + "epoch": 3.89073050951504, + "grad_norm": 0.25413215160369873, + "learning_rate": 6.983795967832356e-05, + "loss": 1.7711, + "step": 12676 + }, + { + "epoch": 3.891037446286065, + "grad_norm": 0.2315925806760788, + "learning_rate": 6.983339699041903e-05, + "loss": 1.7546, + "step": 12677 + }, + { + "epoch": 3.89134438305709, + "grad_norm": 0.26473405957221985, + "learning_rate": 6.982883410650925e-05, + "loss": 1.7563, + "step": 12678 + }, + { + "epoch": 3.8916513198281155, + "grad_norm": 0.24176491796970367, + "learning_rate": 6.982427102663932e-05, + "loss": 
1.7734, + "step": 12679 + }, + { + "epoch": 3.891958256599141, + "grad_norm": 0.25444844365119934, + "learning_rate": 6.98197077508543e-05, + "loss": 1.803, + "step": 12680 + }, + { + "epoch": 3.8922651933701657, + "grad_norm": 0.25234144926071167, + "learning_rate": 6.981514427919933e-05, + "loss": 1.8099, + "step": 12681 + }, + { + "epoch": 3.892572130141191, + "grad_norm": 0.2571142315864563, + "learning_rate": 6.98105806117195e-05, + "loss": 1.8618, + "step": 12682 + }, + { + "epoch": 3.892879066912216, + "grad_norm": 0.21235275268554688, + "learning_rate": 6.980601674845988e-05, + "loss": 1.7121, + "step": 12683 + }, + { + "epoch": 3.8931860036832413, + "grad_norm": 0.27078527212142944, + "learning_rate": 6.98014526894656e-05, + "loss": 1.8103, + "step": 12684 + }, + { + "epoch": 3.8934929404542666, + "grad_norm": 0.3198096454143524, + "learning_rate": 6.979688843478176e-05, + "loss": 1.7529, + "step": 12685 + }, + { + "epoch": 3.8937998772252915, + "grad_norm": 0.3170493245124817, + "learning_rate": 6.979232398445345e-05, + "loss": 1.7629, + "step": 12686 + }, + { + "epoch": 3.894106813996317, + "grad_norm": 0.2495265007019043, + "learning_rate": 6.978775933852582e-05, + "loss": 1.7407, + "step": 12687 + }, + { + "epoch": 3.8944137507673418, + "grad_norm": 0.24570141732692719, + "learning_rate": 6.978319449704395e-05, + "loss": 1.7688, + "step": 12688 + }, + { + "epoch": 3.894720687538367, + "grad_norm": 0.23956388235092163, + "learning_rate": 6.977862946005295e-05, + "loss": 1.7115, + "step": 12689 + }, + { + "epoch": 3.8950276243093924, + "grad_norm": 0.21548940241336823, + "learning_rate": 6.977406422759793e-05, + "loss": 1.7611, + "step": 12690 + }, + { + "epoch": 3.8953345610804173, + "grad_norm": 0.25797295570373535, + "learning_rate": 6.976949879972403e-05, + "loss": 1.7688, + "step": 12691 + }, + { + "epoch": 3.8956414978514426, + "grad_norm": 0.28257784247398376, + "learning_rate": 6.976493317647636e-05, + "loss": 1.7517, + "step": 12692 + }, + { + 
"epoch": 3.8959484346224675, + "grad_norm": 0.23828580975532532, + "learning_rate": 6.976036735790004e-05, + "loss": 1.7877, + "step": 12693 + }, + { + "epoch": 3.896255371393493, + "grad_norm": 0.22915001213550568, + "learning_rate": 6.975580134404017e-05, + "loss": 1.7741, + "step": 12694 + }, + { + "epoch": 3.896562308164518, + "grad_norm": 0.22975030541419983, + "learning_rate": 6.97512351349419e-05, + "loss": 1.772, + "step": 12695 + }, + { + "epoch": 3.8968692449355435, + "grad_norm": 0.29515185952186584, + "learning_rate": 6.974666873065034e-05, + "loss": 1.8001, + "step": 12696 + }, + { + "epoch": 3.8971761817065684, + "grad_norm": 0.26904794573783875, + "learning_rate": 6.974210213121064e-05, + "loss": 1.7069, + "step": 12697 + }, + { + "epoch": 3.8974831184775938, + "grad_norm": 0.2549479603767395, + "learning_rate": 6.97375353366679e-05, + "loss": 1.7419, + "step": 12698 + }, + { + "epoch": 3.8977900552486187, + "grad_norm": 0.23750101029872894, + "learning_rate": 6.973296834706729e-05, + "loss": 1.7815, + "step": 12699 + }, + { + "epoch": 3.898096992019644, + "grad_norm": 0.23529762029647827, + "learning_rate": 6.972840116245389e-05, + "loss": 1.8139, + "step": 12700 + }, + { + "epoch": 3.8984039287906693, + "grad_norm": 0.3212098777294159, + "learning_rate": 6.97238337828729e-05, + "loss": 1.7507, + "step": 12701 + }, + { + "epoch": 3.898710865561694, + "grad_norm": 0.3167687952518463, + "learning_rate": 6.971926620836941e-05, + "loss": 1.8062, + "step": 12702 + }, + { + "epoch": 3.8990178023327196, + "grad_norm": 0.31298309564590454, + "learning_rate": 6.971469843898855e-05, + "loss": 1.8127, + "step": 12703 + }, + { + "epoch": 3.8993247391037444, + "grad_norm": 0.2537378668785095, + "learning_rate": 6.971013047477551e-05, + "loss": 1.7675, + "step": 12704 + }, + { + "epoch": 3.8996316758747698, + "grad_norm": 0.24292805790901184, + "learning_rate": 6.97055623157754e-05, + "loss": 1.8004, + "step": 12705 + }, + { + "epoch": 3.899938612645795, + 
"grad_norm": 0.2929537296295166, + "learning_rate": 6.970099396203338e-05, + "loss": 1.7963, + "step": 12706 + }, + { + "epoch": 3.90024554941682, + "grad_norm": 0.30531612038612366, + "learning_rate": 6.969642541359459e-05, + "loss": 1.7347, + "step": 12707 + }, + { + "epoch": 3.9005524861878453, + "grad_norm": 0.3138202726840973, + "learning_rate": 6.969185667050417e-05, + "loss": 1.7987, + "step": 12708 + }, + { + "epoch": 3.9008594229588702, + "grad_norm": 0.2366247922182083, + "learning_rate": 6.96872877328073e-05, + "loss": 1.7671, + "step": 12709 + }, + { + "epoch": 3.9011663597298956, + "grad_norm": 0.26251721382141113, + "learning_rate": 6.96827186005491e-05, + "loss": 1.7657, + "step": 12710 + }, + { + "epoch": 3.901473296500921, + "grad_norm": 0.32497119903564453, + "learning_rate": 6.967814927377474e-05, + "loss": 1.7873, + "step": 12711 + }, + { + "epoch": 3.9017802332719462, + "grad_norm": 0.3290228843688965, + "learning_rate": 6.967357975252939e-05, + "loss": 1.8076, + "step": 12712 + }, + { + "epoch": 3.902087170042971, + "grad_norm": 0.2737300992012024, + "learning_rate": 6.966901003685817e-05, + "loss": 1.7405, + "step": 12713 + }, + { + "epoch": 3.9023941068139965, + "grad_norm": 0.25465309619903564, + "learning_rate": 6.966444012680626e-05, + "loss": 1.8063, + "step": 12714 + }, + { + "epoch": 3.9027010435850213, + "grad_norm": 0.2397255003452301, + "learning_rate": 6.965987002241885e-05, + "loss": 1.8079, + "step": 12715 + }, + { + "epoch": 3.9030079803560467, + "grad_norm": 0.23115718364715576, + "learning_rate": 6.965529972374108e-05, + "loss": 1.8032, + "step": 12716 + }, + { + "epoch": 3.903314917127072, + "grad_norm": 0.2536461055278778, + "learning_rate": 6.96507292308181e-05, + "loss": 1.7477, + "step": 12717 + }, + { + "epoch": 3.903621853898097, + "grad_norm": 0.27151185274124146, + "learning_rate": 6.96461585436951e-05, + "loss": 1.75, + "step": 12718 + }, + { + "epoch": 3.9039287906691222, + "grad_norm": 0.26894113421440125, + 
"learning_rate": 6.964158766241726e-05, + "loss": 1.7816, + "step": 12719 + }, + { + "epoch": 3.904235727440147, + "grad_norm": 0.23541375994682312, + "learning_rate": 6.963701658702972e-05, + "loss": 1.7991, + "step": 12720 + }, + { + "epoch": 3.9045426642111725, + "grad_norm": 0.22142915427684784, + "learning_rate": 6.96324453175777e-05, + "loss": 1.7245, + "step": 12721 + }, + { + "epoch": 3.904849600982198, + "grad_norm": 0.32864269614219666, + "learning_rate": 6.962787385410632e-05, + "loss": 1.7631, + "step": 12722 + }, + { + "epoch": 3.9051565377532227, + "grad_norm": 0.23657776415348053, + "learning_rate": 6.96233021966608e-05, + "loss": 1.8081, + "step": 12723 + }, + { + "epoch": 3.905463474524248, + "grad_norm": 0.24790632724761963, + "learning_rate": 6.961873034528629e-05, + "loss": 1.7193, + "step": 12724 + }, + { + "epoch": 3.905770411295273, + "grad_norm": 0.2517886459827423, + "learning_rate": 6.961415830002801e-05, + "loss": 1.7785, + "step": 12725 + }, + { + "epoch": 3.9060773480662982, + "grad_norm": 0.2340923547744751, + "learning_rate": 6.960958606093113e-05, + "loss": 1.7632, + "step": 12726 + }, + { + "epoch": 3.9063842848373236, + "grad_norm": 0.23260441422462463, + "learning_rate": 6.960501362804079e-05, + "loss": 1.7865, + "step": 12727 + }, + { + "epoch": 3.906691221608349, + "grad_norm": 0.22616329789161682, + "learning_rate": 6.960044100140224e-05, + "loss": 1.7851, + "step": 12728 + }, + { + "epoch": 3.906998158379374, + "grad_norm": 0.2849951982498169, + "learning_rate": 6.959586818106064e-05, + "loss": 1.8618, + "step": 12729 + }, + { + "epoch": 3.907305095150399, + "grad_norm": 0.3279374837875366, + "learning_rate": 6.95912951670612e-05, + "loss": 1.8563, + "step": 12730 + }, + { + "epoch": 3.907612031921424, + "grad_norm": 0.24359555542469025, + "learning_rate": 6.958672195944906e-05, + "loss": 1.7604, + "step": 12731 + }, + { + "epoch": 3.9079189686924494, + "grad_norm": 0.30881935358047485, + "learning_rate": 
6.958214855826947e-05, + "loss": 1.8463, + "step": 12732 + }, + { + "epoch": 3.9082259054634747, + "grad_norm": 0.25361543893814087, + "learning_rate": 6.957757496356763e-05, + "loss": 1.7831, + "step": 12733 + }, + { + "epoch": 3.9085328422344996, + "grad_norm": 0.26763513684272766, + "learning_rate": 6.957300117538869e-05, + "loss": 1.8383, + "step": 12734 + }, + { + "epoch": 3.908839779005525, + "grad_norm": 0.2238057255744934, + "learning_rate": 6.95684271937779e-05, + "loss": 1.7702, + "step": 12735 + }, + { + "epoch": 3.90914671577655, + "grad_norm": 0.22110232710838318, + "learning_rate": 6.956385301878045e-05, + "loss": 1.7931, + "step": 12736 + }, + { + "epoch": 3.909453652547575, + "grad_norm": 0.23765070736408234, + "learning_rate": 6.955927865044152e-05, + "loss": 1.7212, + "step": 12737 + }, + { + "epoch": 3.9097605893186005, + "grad_norm": 0.22324508428573608, + "learning_rate": 6.955470408880633e-05, + "loss": 1.7161, + "step": 12738 + }, + { + "epoch": 3.9100675260896254, + "grad_norm": 0.22485347092151642, + "learning_rate": 6.955012933392012e-05, + "loss": 1.7374, + "step": 12739 + }, + { + "epoch": 3.9103744628606507, + "grad_norm": 0.28046715259552, + "learning_rate": 6.954555438582806e-05, + "loss": 1.9264, + "step": 12740 + }, + { + "epoch": 3.9106813996316756, + "grad_norm": 0.26391276717185974, + "learning_rate": 6.954097924457536e-05, + "loss": 1.7343, + "step": 12741 + }, + { + "epoch": 3.910988336402701, + "grad_norm": 0.29596614837646484, + "learning_rate": 6.953640391020726e-05, + "loss": 1.8111, + "step": 12742 + }, + { + "epoch": 3.9112952731737263, + "grad_norm": 0.2709808051586151, + "learning_rate": 6.953182838276896e-05, + "loss": 1.7776, + "step": 12743 + }, + { + "epoch": 3.9116022099447516, + "grad_norm": 0.2585100531578064, + "learning_rate": 6.952725266230571e-05, + "loss": 1.7774, + "step": 12744 + }, + { + "epoch": 3.9119091467157765, + "grad_norm": 0.26490530371665955, + "learning_rate": 6.952267674886268e-05, + "loss": 
1.78, + "step": 12745 + }, + { + "epoch": 3.912216083486802, + "grad_norm": 0.23654767870903015, + "learning_rate": 6.951810064248512e-05, + "loss": 1.8263, + "step": 12746 + }, + { + "epoch": 3.9125230202578267, + "grad_norm": 0.2495296597480774, + "learning_rate": 6.951352434321826e-05, + "loss": 1.787, + "step": 12747 + }, + { + "epoch": 3.912829957028852, + "grad_norm": 0.24038313329219818, + "learning_rate": 6.950894785110728e-05, + "loss": 1.774, + "step": 12748 + }, + { + "epoch": 3.9131368937998774, + "grad_norm": 0.23738732933998108, + "learning_rate": 6.950437116619749e-05, + "loss": 1.7401, + "step": 12749 + }, + { + "epoch": 3.9134438305709023, + "grad_norm": 0.28192025423049927, + "learning_rate": 6.949979428853405e-05, + "loss": 1.8416, + "step": 12750 + }, + { + "epoch": 3.9137507673419276, + "grad_norm": 0.30579057335853577, + "learning_rate": 6.949521721816221e-05, + "loss": 1.7404, + "step": 12751 + }, + { + "epoch": 3.9140577041129525, + "grad_norm": 0.23972894251346588, + "learning_rate": 6.949063995512721e-05, + "loss": 1.7543, + "step": 12752 + }, + { + "epoch": 3.914364640883978, + "grad_norm": 0.2837793231010437, + "learning_rate": 6.94860624994743e-05, + "loss": 1.7779, + "step": 12753 + }, + { + "epoch": 3.914671577655003, + "grad_norm": 0.3344916105270386, + "learning_rate": 6.948148485124868e-05, + "loss": 1.7803, + "step": 12754 + }, + { + "epoch": 3.9149785144260285, + "grad_norm": 0.24271291494369507, + "learning_rate": 6.94769070104956e-05, + "loss": 1.7362, + "step": 12755 + }, + { + "epoch": 3.9152854511970534, + "grad_norm": 0.25299304723739624, + "learning_rate": 6.947232897726031e-05, + "loss": 1.7685, + "step": 12756 + }, + { + "epoch": 3.9155923879680787, + "grad_norm": 0.24766205251216888, + "learning_rate": 6.946775075158807e-05, + "loss": 1.829, + "step": 12757 + }, + { + "epoch": 3.9158993247391036, + "grad_norm": 0.2508428692817688, + "learning_rate": 6.94631723335241e-05, + "loss": 1.809, + "step": 12758 + }, + { + 
"epoch": 3.916206261510129, + "grad_norm": 0.2172096222639084, + "learning_rate": 6.945859372311365e-05, + "loss": 1.7376, + "step": 12759 + }, + { + "epoch": 3.9165131982811543, + "grad_norm": 0.28976425528526306, + "learning_rate": 6.945401492040198e-05, + "loss": 1.8229, + "step": 12760 + }, + { + "epoch": 3.916820135052179, + "grad_norm": 0.3528063893318176, + "learning_rate": 6.944943592543432e-05, + "loss": 1.7559, + "step": 12761 + }, + { + "epoch": 3.9171270718232045, + "grad_norm": 0.46312370896339417, + "learning_rate": 6.944485673825595e-05, + "loss": 1.7664, + "step": 12762 + }, + { + "epoch": 3.9174340085942294, + "grad_norm": 0.4466164708137512, + "learning_rate": 6.94402773589121e-05, + "loss": 1.7833, + "step": 12763 + }, + { + "epoch": 3.9177409453652547, + "grad_norm": 0.2637740969657898, + "learning_rate": 6.943569778744804e-05, + "loss": 1.818, + "step": 12764 + }, + { + "epoch": 3.91804788213628, + "grad_norm": 0.37515267729759216, + "learning_rate": 6.943111802390901e-05, + "loss": 1.7898, + "step": 12765 + }, + { + "epoch": 3.918354818907305, + "grad_norm": 0.45146289467811584, + "learning_rate": 6.942653806834029e-05, + "loss": 1.7797, + "step": 12766 + }, + { + "epoch": 3.9186617556783303, + "grad_norm": 0.2809859812259674, + "learning_rate": 6.942195792078712e-05, + "loss": 1.7836, + "step": 12767 + }, + { + "epoch": 3.918968692449355, + "grad_norm": 0.3606306314468384, + "learning_rate": 6.94173775812948e-05, + "loss": 1.7657, + "step": 12768 + }, + { + "epoch": 3.9192756292203805, + "grad_norm": 0.49528738856315613, + "learning_rate": 6.941279704990857e-05, + "loss": 1.7628, + "step": 12769 + }, + { + "epoch": 3.919582565991406, + "grad_norm": 0.3484322428703308, + "learning_rate": 6.940821632667371e-05, + "loss": 1.7939, + "step": 12770 + }, + { + "epoch": 3.919889502762431, + "grad_norm": 0.2479606419801712, + "learning_rate": 6.940363541163546e-05, + "loss": 1.813, + "step": 12771 + }, + { + "epoch": 3.920196439533456, + "grad_norm": 
0.3491765558719635, + "learning_rate": 6.939905430483911e-05, + "loss": 1.7338, + "step": 12772 + }, + { + "epoch": 3.9205033763044814, + "grad_norm": 0.291810005903244, + "learning_rate": 6.939447300632995e-05, + "loss": 1.7445, + "step": 12773 + }, + { + "epoch": 3.9208103130755063, + "grad_norm": 0.2467527985572815, + "learning_rate": 6.938989151615324e-05, + "loss": 1.8462, + "step": 12774 + }, + { + "epoch": 3.9211172498465316, + "grad_norm": 0.35656824707984924, + "learning_rate": 6.938530983435426e-05, + "loss": 1.7751, + "step": 12775 + }, + { + "epoch": 3.921424186617557, + "grad_norm": 0.31269776821136475, + "learning_rate": 6.938072796097828e-05, + "loss": 1.7714, + "step": 12776 + }, + { + "epoch": 3.921731123388582, + "grad_norm": 0.2082831859588623, + "learning_rate": 6.937614589607058e-05, + "loss": 1.7263, + "step": 12777 + }, + { + "epoch": 3.922038060159607, + "grad_norm": 0.27583765983581543, + "learning_rate": 6.937156363967646e-05, + "loss": 1.6822, + "step": 12778 + }, + { + "epoch": 3.922344996930632, + "grad_norm": 0.32773876190185547, + "learning_rate": 6.93669811918412e-05, + "loss": 1.7792, + "step": 12779 + }, + { + "epoch": 3.9226519337016574, + "grad_norm": 0.2583121657371521, + "learning_rate": 6.936239855261007e-05, + "loss": 1.7812, + "step": 12780 + }, + { + "epoch": 3.9229588704726828, + "grad_norm": 0.245570570230484, + "learning_rate": 6.935781572202836e-05, + "loss": 1.7252, + "step": 12781 + }, + { + "epoch": 3.9232658072437077, + "grad_norm": 0.2379419505596161, + "learning_rate": 6.935323270014138e-05, + "loss": 1.7485, + "step": 12782 + }, + { + "epoch": 3.923572744014733, + "grad_norm": 0.2239784598350525, + "learning_rate": 6.934864948699439e-05, + "loss": 1.7444, + "step": 12783 + }, + { + "epoch": 3.923879680785758, + "grad_norm": 0.2366618812084198, + "learning_rate": 6.934406608263274e-05, + "loss": 1.777, + "step": 12784 + }, + { + "epoch": 3.924186617556783, + "grad_norm": 0.22583791613578796, + "learning_rate": 
6.933948248710169e-05, + "loss": 1.7291, + "step": 12785 + }, + { + "epoch": 3.9244935543278086, + "grad_norm": 0.24141047894954681, + "learning_rate": 6.933489870044651e-05, + "loss": 1.7748, + "step": 12786 + }, + { + "epoch": 3.924800491098834, + "grad_norm": 0.2389962524175644, + "learning_rate": 6.933031472271255e-05, + "loss": 1.7957, + "step": 12787 + }, + { + "epoch": 3.925107427869859, + "grad_norm": 0.25230300426483154, + "learning_rate": 6.932573055394509e-05, + "loss": 1.7621, + "step": 12788 + }, + { + "epoch": 3.925414364640884, + "grad_norm": 0.23894043266773224, + "learning_rate": 6.932114619418941e-05, + "loss": 1.7285, + "step": 12789 + }, + { + "epoch": 3.925721301411909, + "grad_norm": 0.2650291919708252, + "learning_rate": 6.931656164349086e-05, + "loss": 1.7613, + "step": 12790 + }, + { + "epoch": 3.9260282381829343, + "grad_norm": 0.20616789162158966, + "learning_rate": 6.931197690189472e-05, + "loss": 1.7505, + "step": 12791 + }, + { + "epoch": 3.9263351749539597, + "grad_norm": 0.23915675282478333, + "learning_rate": 6.930739196944633e-05, + "loss": 1.7477, + "step": 12792 + }, + { + "epoch": 3.9266421117249846, + "grad_norm": 0.2522687613964081, + "learning_rate": 6.930280684619094e-05, + "loss": 1.8, + "step": 12793 + }, + { + "epoch": 3.92694904849601, + "grad_norm": 0.264167845249176, + "learning_rate": 6.929822153217391e-05, + "loss": 1.7516, + "step": 12794 + }, + { + "epoch": 3.927255985267035, + "grad_norm": 0.21358054876327515, + "learning_rate": 6.929363602744054e-05, + "loss": 1.7207, + "step": 12795 + }, + { + "epoch": 3.92756292203806, + "grad_norm": 0.25632721185684204, + "learning_rate": 6.928905033203617e-05, + "loss": 1.7446, + "step": 12796 + }, + { + "epoch": 3.9278698588090855, + "grad_norm": 0.2717185318470001, + "learning_rate": 6.928446444600608e-05, + "loss": 1.8555, + "step": 12797 + }, + { + "epoch": 3.9281767955801103, + "grad_norm": 0.2871767282485962, + "learning_rate": 6.927987836939561e-05, + "loss": 1.7861, + 
"step": 12798 + }, + { + "epoch": 3.9284837323511357, + "grad_norm": 0.282507061958313, + "learning_rate": 6.927529210225009e-05, + "loss": 1.7683, + "step": 12799 + }, + { + "epoch": 3.9287906691221606, + "grad_norm": 0.24870644509792328, + "learning_rate": 6.927070564461482e-05, + "loss": 1.7355, + "step": 12800 + }, + { + "epoch": 3.929097605893186, + "grad_norm": 0.2093631625175476, + "learning_rate": 6.926611899653516e-05, + "loss": 1.7691, + "step": 12801 + }, + { + "epoch": 3.9294045426642112, + "grad_norm": 0.34258076548576355, + "learning_rate": 6.926153215805642e-05, + "loss": 1.8398, + "step": 12802 + }, + { + "epoch": 3.9297114794352366, + "grad_norm": 0.39179500937461853, + "learning_rate": 6.925694512922391e-05, + "loss": 1.8229, + "step": 12803 + }, + { + "epoch": 3.9300184162062615, + "grad_norm": 0.36814743280410767, + "learning_rate": 6.9252357910083e-05, + "loss": 1.7759, + "step": 12804 + }, + { + "epoch": 3.930325352977287, + "grad_norm": 0.2659403085708618, + "learning_rate": 6.924777050067902e-05, + "loss": 1.7553, + "step": 12805 + }, + { + "epoch": 3.9306322897483117, + "grad_norm": 0.20617491006851196, + "learning_rate": 6.924318290105724e-05, + "loss": 1.7398, + "step": 12806 + }, + { + "epoch": 3.930939226519337, + "grad_norm": 0.23730522394180298, + "learning_rate": 6.923859511126309e-05, + "loss": 1.699, + "step": 12807 + }, + { + "epoch": 3.9312461632903624, + "grad_norm": 0.24865423142910004, + "learning_rate": 6.923400713134184e-05, + "loss": 1.7801, + "step": 12808 + }, + { + "epoch": 3.9315531000613873, + "grad_norm": 0.2495356798171997, + "learning_rate": 6.92294189613389e-05, + "loss": 1.803, + "step": 12809 + }, + { + "epoch": 3.9318600368324126, + "grad_norm": 0.24223244190216064, + "learning_rate": 6.922483060129955e-05, + "loss": 1.751, + "step": 12810 + }, + { + "epoch": 3.9321669736034375, + "grad_norm": 0.2541450262069702, + "learning_rate": 6.922024205126913e-05, + "loss": 1.7721, + "step": 12811 + }, + { + "epoch": 
3.932473910374463, + "grad_norm": 0.24528831243515015, + "learning_rate": 6.921565331129304e-05, + "loss": 1.792, + "step": 12812 + }, + { + "epoch": 3.932780847145488, + "grad_norm": 0.22789500653743744, + "learning_rate": 6.921106438141659e-05, + "loss": 1.8455, + "step": 12813 + }, + { + "epoch": 3.933087783916513, + "grad_norm": 0.26267170906066895, + "learning_rate": 6.920647526168515e-05, + "loss": 1.7254, + "step": 12814 + }, + { + "epoch": 3.9333947206875384, + "grad_norm": 0.23044808208942413, + "learning_rate": 6.920188595214406e-05, + "loss": 1.7217, + "step": 12815 + }, + { + "epoch": 3.9337016574585633, + "grad_norm": 0.2304011732339859, + "learning_rate": 6.919729645283867e-05, + "loss": 1.8121, + "step": 12816 + }, + { + "epoch": 3.9340085942295886, + "grad_norm": 0.21516792476177216, + "learning_rate": 6.919270676381435e-05, + "loss": 1.7305, + "step": 12817 + }, + { + "epoch": 3.934315531000614, + "grad_norm": 0.24698840081691742, + "learning_rate": 6.918811688511646e-05, + "loss": 1.7967, + "step": 12818 + }, + { + "epoch": 3.9346224677716393, + "grad_norm": 0.23132537305355072, + "learning_rate": 6.918352681679035e-05, + "loss": 1.7439, + "step": 12819 + }, + { + "epoch": 3.934929404542664, + "grad_norm": 0.2597793936729431, + "learning_rate": 6.917893655888139e-05, + "loss": 1.7882, + "step": 12820 + }, + { + "epoch": 3.9352363413136895, + "grad_norm": 0.23946607112884521, + "learning_rate": 6.917434611143493e-05, + "loss": 1.7991, + "step": 12821 + }, + { + "epoch": 3.9355432780847144, + "grad_norm": 0.25808244943618774, + "learning_rate": 6.916975547449634e-05, + "loss": 1.845, + "step": 12822 + }, + { + "epoch": 3.9358502148557397, + "grad_norm": 0.26082557439804077, + "learning_rate": 6.9165164648111e-05, + "loss": 1.7562, + "step": 12823 + }, + { + "epoch": 3.936157151626765, + "grad_norm": 0.24810053408145905, + "learning_rate": 6.916057363232425e-05, + "loss": 1.778, + "step": 12824 + }, + { + "epoch": 3.93646408839779, + "grad_norm": 
0.24168157577514648, + "learning_rate": 6.91559824271815e-05, + "loss": 1.7628, + "step": 12825 + }, + { + "epoch": 3.9367710251688153, + "grad_norm": 0.23800434172153473, + "learning_rate": 6.91513910327281e-05, + "loss": 1.8063, + "step": 12826 + }, + { + "epoch": 3.93707796193984, + "grad_norm": 0.23055073618888855, + "learning_rate": 6.914679944900944e-05, + "loss": 1.749, + "step": 12827 + }, + { + "epoch": 3.9373848987108655, + "grad_norm": 0.22455987334251404, + "learning_rate": 6.914220767607088e-05, + "loss": 1.7471, + "step": 12828 + }, + { + "epoch": 3.937691835481891, + "grad_norm": 0.21808378398418427, + "learning_rate": 6.913761571395778e-05, + "loss": 1.7503, + "step": 12829 + }, + { + "epoch": 3.937998772252916, + "grad_norm": 0.23136213421821594, + "learning_rate": 6.913302356271556e-05, + "loss": 1.752, + "step": 12830 + }, + { + "epoch": 3.938305709023941, + "grad_norm": 0.29579970240592957, + "learning_rate": 6.912843122238959e-05, + "loss": 1.8028, + "step": 12831 + }, + { + "epoch": 3.9386126457949664, + "grad_norm": 0.28578072786331177, + "learning_rate": 6.912383869302526e-05, + "loss": 1.8183, + "step": 12832 + }, + { + "epoch": 3.9389195825659913, + "grad_norm": 0.2616737186908722, + "learning_rate": 6.911924597466793e-05, + "loss": 1.8366, + "step": 12833 + }, + { + "epoch": 3.9392265193370166, + "grad_norm": 0.29275768995285034, + "learning_rate": 6.911465306736302e-05, + "loss": 1.731, + "step": 12834 + }, + { + "epoch": 3.939533456108042, + "grad_norm": 0.3300873041152954, + "learning_rate": 6.91100599711559e-05, + "loss": 1.8713, + "step": 12835 + }, + { + "epoch": 3.939840392879067, + "grad_norm": 0.2744643986225128, + "learning_rate": 6.910546668609195e-05, + "loss": 1.8479, + "step": 12836 + }, + { + "epoch": 3.940147329650092, + "grad_norm": 0.25248417258262634, + "learning_rate": 6.91008732122166e-05, + "loss": 1.7962, + "step": 12837 + }, + { + "epoch": 3.940454266421117, + "grad_norm": 0.3068546652793884, + "learning_rate": 
6.909627954957521e-05, + "loss": 1.759, + "step": 12838 + }, + { + "epoch": 3.9407612031921424, + "grad_norm": 0.3273559808731079, + "learning_rate": 6.909168569821321e-05, + "loss": 1.814, + "step": 12839 + }, + { + "epoch": 3.9410681399631677, + "grad_norm": 0.31192758679389954, + "learning_rate": 6.908709165817597e-05, + "loss": 1.7906, + "step": 12840 + }, + { + "epoch": 3.9413750767341926, + "grad_norm": 0.24487090110778809, + "learning_rate": 6.90824974295089e-05, + "loss": 1.8238, + "step": 12841 + }, + { + "epoch": 3.941682013505218, + "grad_norm": 0.24863721430301666, + "learning_rate": 6.907790301225743e-05, + "loss": 1.7651, + "step": 12842 + }, + { + "epoch": 3.941988950276243, + "grad_norm": 0.26555630564689636, + "learning_rate": 6.907330840646693e-05, + "loss": 1.8268, + "step": 12843 + }, + { + "epoch": 3.942295887047268, + "grad_norm": 0.2439817190170288, + "learning_rate": 6.906871361218281e-05, + "loss": 1.7291, + "step": 12844 + }, + { + "epoch": 3.9426028238182935, + "grad_norm": 0.2410304993391037, + "learning_rate": 6.906411862945048e-05, + "loss": 1.712, + "step": 12845 + }, + { + "epoch": 3.942909760589319, + "grad_norm": 0.28575149178504944, + "learning_rate": 6.905952345831537e-05, + "loss": 1.7269, + "step": 12846 + }, + { + "epoch": 3.9432166973603437, + "grad_norm": 0.3055815100669861, + "learning_rate": 6.905492809882286e-05, + "loss": 1.7234, + "step": 12847 + }, + { + "epoch": 3.943523634131369, + "grad_norm": 0.2762533724308014, + "learning_rate": 6.905033255101839e-05, + "loss": 1.7768, + "step": 12848 + }, + { + "epoch": 3.943830570902394, + "grad_norm": 0.22819125652313232, + "learning_rate": 6.904573681494738e-05, + "loss": 1.7416, + "step": 12849 + }, + { + "epoch": 3.9441375076734193, + "grad_norm": 0.21664194762706757, + "learning_rate": 6.904114089065523e-05, + "loss": 1.7506, + "step": 12850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.21935151517391205, + "learning_rate": 6.903654477818735e-05, + "loss": 
1.7522, + "step": 12851 + }, + { + "epoch": 3.9447513812154695, + "grad_norm": 0.2204175442457199, + "learning_rate": 6.903194847758918e-05, + "loss": 1.7753, + "step": 12852 + }, + { + "epoch": 3.945058317986495, + "grad_norm": 0.23130151629447937, + "learning_rate": 6.902735198890615e-05, + "loss": 1.7743, + "step": 12853 + }, + { + "epoch": 3.9453652547575198, + "grad_norm": 0.2548399567604065, + "learning_rate": 6.902275531218368e-05, + "loss": 1.8373, + "step": 12854 + }, + { + "epoch": 3.945672191528545, + "grad_norm": 0.2905479371547699, + "learning_rate": 6.901815844746718e-05, + "loss": 1.8336, + "step": 12855 + }, + { + "epoch": 3.9459791282995704, + "grad_norm": 0.2698945105075836, + "learning_rate": 6.90135613948021e-05, + "loss": 1.7498, + "step": 12856 + }, + { + "epoch": 3.9462860650705953, + "grad_norm": 0.24966828525066376, + "learning_rate": 6.900896415423387e-05, + "loss": 1.7664, + "step": 12857 + }, + { + "epoch": 3.9465930018416207, + "grad_norm": 0.23272784054279327, + "learning_rate": 6.90043667258079e-05, + "loss": 1.7742, + "step": 12858 + }, + { + "epoch": 3.9468999386126455, + "grad_norm": 0.2277698516845703, + "learning_rate": 6.899976910956965e-05, + "loss": 1.7465, + "step": 12859 + }, + { + "epoch": 3.947206875383671, + "grad_norm": 0.2376442402601242, + "learning_rate": 6.899517130556454e-05, + "loss": 1.7995, + "step": 12860 + }, + { + "epoch": 3.947513812154696, + "grad_norm": 0.25591593980789185, + "learning_rate": 6.899057331383802e-05, + "loss": 1.8017, + "step": 12861 + }, + { + "epoch": 3.9478207489257215, + "grad_norm": 0.2715262472629547, + "learning_rate": 6.898597513443551e-05, + "loss": 1.7967, + "step": 12862 + }, + { + "epoch": 3.9481276856967464, + "grad_norm": 0.20916256308555603, + "learning_rate": 6.898137676740246e-05, + "loss": 1.7711, + "step": 12863 + }, + { + "epoch": 3.9484346224677718, + "grad_norm": 0.2570229768753052, + "learning_rate": 6.897677821278435e-05, + "loss": 1.833, + "step": 12864 + }, + { + 
"epoch": 3.9487415592387967, + "grad_norm": 0.26343438029289246, + "learning_rate": 6.897217947062657e-05, + "loss": 1.7625, + "step": 12865 + }, + { + "epoch": 3.949048496009822, + "grad_norm": 0.23407024145126343, + "learning_rate": 6.896758054097459e-05, + "loss": 1.7211, + "step": 12866 + }, + { + "epoch": 3.9493554327808473, + "grad_norm": 0.2554715573787689, + "learning_rate": 6.896298142387387e-05, + "loss": 1.8548, + "step": 12867 + }, + { + "epoch": 3.949662369551872, + "grad_norm": 0.24143370985984802, + "learning_rate": 6.895838211936986e-05, + "loss": 1.7635, + "step": 12868 + }, + { + "epoch": 3.9499693063228976, + "grad_norm": 0.24634715914726257, + "learning_rate": 6.8953782627508e-05, + "loss": 1.8012, + "step": 12869 + }, + { + "epoch": 3.9502762430939224, + "grad_norm": 0.22740426659584045, + "learning_rate": 6.894918294833375e-05, + "loss": 1.7294, + "step": 12870 + }, + { + "epoch": 3.950583179864948, + "grad_norm": 0.2651631832122803, + "learning_rate": 6.894458308189257e-05, + "loss": 1.8289, + "step": 12871 + }, + { + "epoch": 3.950890116635973, + "grad_norm": 0.28693267703056335, + "learning_rate": 6.893998302822991e-05, + "loss": 1.8462, + "step": 12872 + }, + { + "epoch": 3.951197053406998, + "grad_norm": 0.26584213972091675, + "learning_rate": 6.893538278739125e-05, + "loss": 1.7621, + "step": 12873 + }, + { + "epoch": 3.9515039901780233, + "grad_norm": 0.29970669746398926, + "learning_rate": 6.893078235942203e-05, + "loss": 1.7659, + "step": 12874 + }, + { + "epoch": 3.9518109269490482, + "grad_norm": 0.2271152138710022, + "learning_rate": 6.892618174436771e-05, + "loss": 1.7151, + "step": 12875 + }, + { + "epoch": 3.9521178637200736, + "grad_norm": 0.24783682823181152, + "learning_rate": 6.892158094227379e-05, + "loss": 1.761, + "step": 12876 + }, + { + "epoch": 3.952424800491099, + "grad_norm": 0.2371140718460083, + "learning_rate": 6.891697995318573e-05, + "loss": 1.7557, + "step": 12877 + }, + { + "epoch": 3.9527317372621242, + 
"grad_norm": 0.29708394408226013, + "learning_rate": 6.891237877714896e-05, + "loss": 1.8629, + "step": 12878 + }, + { + "epoch": 3.953038674033149, + "grad_norm": 0.2724219262599945, + "learning_rate": 6.890777741420899e-05, + "loss": 1.7378, + "step": 12879 + }, + { + "epoch": 3.9533456108041745, + "grad_norm": 0.2227276861667633, + "learning_rate": 6.890317586441126e-05, + "loss": 1.6989, + "step": 12880 + }, + { + "epoch": 3.9536525475751993, + "grad_norm": 0.2546161413192749, + "learning_rate": 6.889857412780128e-05, + "loss": 1.8688, + "step": 12881 + }, + { + "epoch": 3.9539594843462247, + "grad_norm": 0.24882884323596954, + "learning_rate": 6.889397220442452e-05, + "loss": 1.8137, + "step": 12882 + }, + { + "epoch": 3.95426642111725, + "grad_norm": 0.2549113929271698, + "learning_rate": 6.888937009432644e-05, + "loss": 1.8366, + "step": 12883 + }, + { + "epoch": 3.954573357888275, + "grad_norm": 0.30032673478126526, + "learning_rate": 6.888476779755255e-05, + "loss": 1.8267, + "step": 12884 + }, + { + "epoch": 3.9548802946593002, + "grad_norm": 0.2887294292449951, + "learning_rate": 6.888016531414832e-05, + "loss": 1.8295, + "step": 12885 + }, + { + "epoch": 3.955187231430325, + "grad_norm": 0.2947406470775604, + "learning_rate": 6.88755626441592e-05, + "loss": 1.7713, + "step": 12886 + }, + { + "epoch": 3.9554941682013505, + "grad_norm": 0.2967108190059662, + "learning_rate": 6.887095978763072e-05, + "loss": 1.7636, + "step": 12887 + }, + { + "epoch": 3.955801104972376, + "grad_norm": 0.2495311200618744, + "learning_rate": 6.886635674460836e-05, + "loss": 1.8148, + "step": 12888 + }, + { + "epoch": 3.9561080417434007, + "grad_norm": 0.23367099463939667, + "learning_rate": 6.88617535151376e-05, + "loss": 1.7353, + "step": 12889 + }, + { + "epoch": 3.956414978514426, + "grad_norm": 0.36790570616722107, + "learning_rate": 6.885715009926395e-05, + "loss": 1.7853, + "step": 12890 + }, + { + "epoch": 3.9567219152854514, + "grad_norm": 0.5013020038604736, + 
"learning_rate": 6.885254649703287e-05, + "loss": 1.7923, + "step": 12891 + }, + { + "epoch": 3.9570288520564763, + "grad_norm": 0.4446276128292084, + "learning_rate": 6.884794270848988e-05, + "loss": 1.7504, + "step": 12892 + }, + { + "epoch": 3.9573357888275016, + "grad_norm": 0.2478526383638382, + "learning_rate": 6.88433387336805e-05, + "loss": 1.7629, + "step": 12893 + }, + { + "epoch": 3.957642725598527, + "grad_norm": 0.30111798644065857, + "learning_rate": 6.883873457265019e-05, + "loss": 1.8291, + "step": 12894 + }, + { + "epoch": 3.957949662369552, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.883413022544445e-05, + "loss": 1.7919, + "step": 12895 + }, + { + "epoch": 3.958256599140577, + "grad_norm": 0.2895318269729614, + "learning_rate": 6.882952569210881e-05, + "loss": 1.7467, + "step": 12896 + }, + { + "epoch": 3.958563535911602, + "grad_norm": 0.30391454696655273, + "learning_rate": 6.882492097268873e-05, + "loss": 1.8145, + "step": 12897 + }, + { + "epoch": 3.9588704726826274, + "grad_norm": 0.5033623576164246, + "learning_rate": 6.882031606722977e-05, + "loss": 1.8231, + "step": 12898 + }, + { + "epoch": 3.9591774094536527, + "grad_norm": 0.5351777672767639, + "learning_rate": 6.881571097577742e-05, + "loss": 1.807, + "step": 12899 + }, + { + "epoch": 3.9594843462246776, + "grad_norm": 0.35540491342544556, + "learning_rate": 6.881110569837719e-05, + "loss": 1.7626, + "step": 12900 + }, + { + "epoch": 3.959791282995703, + "grad_norm": 0.22447600960731506, + "learning_rate": 6.880650023507457e-05, + "loss": 1.7392, + "step": 12901 + }, + { + "epoch": 3.960098219766728, + "grad_norm": 0.44619202613830566, + "learning_rate": 6.88018945859151e-05, + "loss": 1.8138, + "step": 12902 + }, + { + "epoch": 3.960405156537753, + "grad_norm": 0.41381633281707764, + "learning_rate": 6.879728875094428e-05, + "loss": 1.7676, + "step": 12903 + }, + { + "epoch": 3.9607120933087785, + "grad_norm": 0.2601528465747833, + "learning_rate": 6.879268273020764e-05, 
+ "loss": 1.8406, + "step": 12904 + }, + { + "epoch": 3.961019030079804, + "grad_norm": 0.3309035003185272, + "learning_rate": 6.878807652375071e-05, + "loss": 1.7673, + "step": 12905 + }, + { + "epoch": 3.9613259668508287, + "grad_norm": 0.5281669497489929, + "learning_rate": 6.878347013161899e-05, + "loss": 1.7686, + "step": 12906 + }, + { + "epoch": 3.961632903621854, + "grad_norm": 0.5397645831108093, + "learning_rate": 6.8778863553858e-05, + "loss": 1.8575, + "step": 12907 + }, + { + "epoch": 3.961939840392879, + "grad_norm": 0.329485684633255, + "learning_rate": 6.877425679051327e-05, + "loss": 1.8185, + "step": 12908 + }, + { + "epoch": 3.9622467771639043, + "grad_norm": 0.3012789487838745, + "learning_rate": 6.876964984163034e-05, + "loss": 1.7962, + "step": 12909 + }, + { + "epoch": 3.9625537139349296, + "grad_norm": 0.5596817135810852, + "learning_rate": 6.876504270725472e-05, + "loss": 1.7972, + "step": 12910 + }, + { + "epoch": 3.9628606507059545, + "grad_norm": 0.5374729633331299, + "learning_rate": 6.876043538743197e-05, + "loss": 1.7863, + "step": 12911 + }, + { + "epoch": 3.96316758747698, + "grad_norm": 0.24617290496826172, + "learning_rate": 6.875582788220757e-05, + "loss": 1.7555, + "step": 12912 + }, + { + "epoch": 3.9634745242480047, + "grad_norm": 0.3493972420692444, + "learning_rate": 6.875122019162712e-05, + "loss": 1.8595, + "step": 12913 + }, + { + "epoch": 3.96378146101903, + "grad_norm": 0.4293089807033539, + "learning_rate": 6.874661231573609e-05, + "loss": 1.7647, + "step": 12914 + }, + { + "epoch": 3.9640883977900554, + "grad_norm": 0.30602574348449707, + "learning_rate": 6.874200425458006e-05, + "loss": 1.7122, + "step": 12915 + }, + { + "epoch": 3.9643953345610803, + "grad_norm": 0.22776013612747192, + "learning_rate": 6.873739600820457e-05, + "loss": 1.7136, + "step": 12916 + }, + { + "epoch": 3.9647022713321056, + "grad_norm": 0.3727327585220337, + "learning_rate": 6.873278757665513e-05, + "loss": 1.8314, + "step": 12917 + }, + { 
+ "epoch": 3.9650092081031305, + "grad_norm": 0.35110536217689514, + "learning_rate": 6.872817895997733e-05, + "loss": 1.7506, + "step": 12918 + }, + { + "epoch": 3.965316144874156, + "grad_norm": 0.275560587644577, + "learning_rate": 6.872357015821666e-05, + "loss": 1.7865, + "step": 12919 + }, + { + "epoch": 3.965623081645181, + "grad_norm": 0.2686980366706848, + "learning_rate": 6.871896117141873e-05, + "loss": 1.8431, + "step": 12920 + }, + { + "epoch": 3.9659300184162065, + "grad_norm": 0.3299664556980133, + "learning_rate": 6.871435199962901e-05, + "loss": 1.7988, + "step": 12921 + }, + { + "epoch": 3.9662369551872314, + "grad_norm": 0.2833637297153473, + "learning_rate": 6.870974264289313e-05, + "loss": 1.6993, + "step": 12922 + }, + { + "epoch": 3.9665438919582567, + "grad_norm": 0.25062620639801025, + "learning_rate": 6.870513310125659e-05, + "loss": 1.7814, + "step": 12923 + }, + { + "epoch": 3.9668508287292816, + "grad_norm": 0.26609909534454346, + "learning_rate": 6.870052337476498e-05, + "loss": 1.7871, + "step": 12924 + }, + { + "epoch": 3.967157765500307, + "grad_norm": 0.22760890424251556, + "learning_rate": 6.869591346346382e-05, + "loss": 1.7941, + "step": 12925 + }, + { + "epoch": 3.9674647022713323, + "grad_norm": 0.2845582067966461, + "learning_rate": 6.869130336739869e-05, + "loss": 1.8215, + "step": 12926 + }, + { + "epoch": 3.967771639042357, + "grad_norm": 0.254948228597641, + "learning_rate": 6.868669308661514e-05, + "loss": 1.7515, + "step": 12927 + }, + { + "epoch": 3.9680785758133825, + "grad_norm": 0.2372167855501175, + "learning_rate": 6.868208262115875e-05, + "loss": 1.7524, + "step": 12928 + }, + { + "epoch": 3.9683855125844074, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.867747197107506e-05, + "loss": 1.8139, + "step": 12929 + }, + { + "epoch": 3.9686924493554327, + "grad_norm": 0.2617839276790619, + "learning_rate": 6.867286113640965e-05, + "loss": 1.7388, + "step": 12930 + }, + { + "epoch": 3.968999386126458, + 
"grad_norm": 0.22749558091163635, + "learning_rate": 6.866825011720807e-05, + "loss": 1.7421, + "step": 12931 + }, + { + "epoch": 3.969306322897483, + "grad_norm": 0.27737462520599365, + "learning_rate": 6.86636389135159e-05, + "loss": 1.7977, + "step": 12932 + }, + { + "epoch": 3.9696132596685083, + "grad_norm": 0.3331063985824585, + "learning_rate": 6.865902752537871e-05, + "loss": 1.7925, + "step": 12933 + }, + { + "epoch": 3.969920196439533, + "grad_norm": 0.24229519069194794, + "learning_rate": 6.86544159528421e-05, + "loss": 1.7782, + "step": 12934 + }, + { + "epoch": 3.9702271332105585, + "grad_norm": 0.29494860768318176, + "learning_rate": 6.86498041959516e-05, + "loss": 1.7713, + "step": 12935 + }, + { + "epoch": 3.970534069981584, + "grad_norm": 0.26064008474349976, + "learning_rate": 6.86451922547528e-05, + "loss": 1.7161, + "step": 12936 + }, + { + "epoch": 3.970841006752609, + "grad_norm": 0.2656785547733307, + "learning_rate": 6.864058012929129e-05, + "loss": 1.8154, + "step": 12937 + }, + { + "epoch": 3.971147943523634, + "grad_norm": 0.21170997619628906, + "learning_rate": 6.863596781961263e-05, + "loss": 1.7614, + "step": 12938 + }, + { + "epoch": 3.9714548802946594, + "grad_norm": 0.21709072589874268, + "learning_rate": 6.863135532576241e-05, + "loss": 1.7896, + "step": 12939 + }, + { + "epoch": 3.9717618170656843, + "grad_norm": 0.2361367791891098, + "learning_rate": 6.862674264778623e-05, + "loss": 1.7775, + "step": 12940 + }, + { + "epoch": 3.9720687538367097, + "grad_norm": 0.22042550146579742, + "learning_rate": 6.862212978572967e-05, + "loss": 1.7781, + "step": 12941 + }, + { + "epoch": 3.972375690607735, + "grad_norm": 0.2535422146320343, + "learning_rate": 6.86175167396383e-05, + "loss": 1.7665, + "step": 12942 + }, + { + "epoch": 3.97268262737876, + "grad_norm": 0.23741906881332397, + "learning_rate": 6.861290350955771e-05, + "loss": 1.7829, + "step": 12943 + }, + { + "epoch": 3.972989564149785, + "grad_norm": 0.23789910972118378, + 
"learning_rate": 6.860829009553351e-05, + "loss": 1.7745, + "step": 12944 + }, + { + "epoch": 3.97329650092081, + "grad_norm": 0.26867765188217163, + "learning_rate": 6.860367649761127e-05, + "loss": 1.7239, + "step": 12945 + }, + { + "epoch": 3.9736034376918354, + "grad_norm": 0.3211663067340851, + "learning_rate": 6.85990627158366e-05, + "loss": 1.7976, + "step": 12946 + }, + { + "epoch": 3.9739103744628608, + "grad_norm": 0.26177310943603516, + "learning_rate": 6.85944487502551e-05, + "loss": 1.7446, + "step": 12947 + }, + { + "epoch": 3.9742173112338857, + "grad_norm": 0.23622745275497437, + "learning_rate": 6.858983460091234e-05, + "loss": 1.7824, + "step": 12948 + }, + { + "epoch": 3.974524248004911, + "grad_norm": 0.24372988939285278, + "learning_rate": 6.858522026785395e-05, + "loss": 1.8014, + "step": 12949 + }, + { + "epoch": 3.974831184775936, + "grad_norm": 0.2566998600959778, + "learning_rate": 6.85806057511255e-05, + "loss": 1.742, + "step": 12950 + }, + { + "epoch": 3.9751381215469612, + "grad_norm": 0.24418365955352783, + "learning_rate": 6.857599105077264e-05, + "loss": 1.7331, + "step": 12951 + }, + { + "epoch": 3.9754450583179866, + "grad_norm": 0.2260327935218811, + "learning_rate": 6.857137616684094e-05, + "loss": 1.7173, + "step": 12952 + }, + { + "epoch": 3.975751995089012, + "grad_norm": 0.277044415473938, + "learning_rate": 6.856676109937602e-05, + "loss": 1.7255, + "step": 12953 + }, + { + "epoch": 3.976058931860037, + "grad_norm": 0.228300079703331, + "learning_rate": 6.856214584842348e-05, + "loss": 1.7796, + "step": 12954 + }, + { + "epoch": 3.976365868631062, + "grad_norm": 0.2246638983488083, + "learning_rate": 6.855753041402893e-05, + "loss": 1.7458, + "step": 12955 + }, + { + "epoch": 3.976672805402087, + "grad_norm": 0.22235621511936188, + "learning_rate": 6.855291479623799e-05, + "loss": 1.7585, + "step": 12956 + }, + { + "epoch": 3.9769797421731123, + "grad_norm": 0.23710694909095764, + "learning_rate": 6.854829899509627e-05, + 
"loss": 1.767, + "step": 12957 + }, + { + "epoch": 3.9772866789441377, + "grad_norm": 0.2527346611022949, + "learning_rate": 6.854368301064939e-05, + "loss": 1.828, + "step": 12958 + }, + { + "epoch": 3.9775936157151626, + "grad_norm": 0.25032514333724976, + "learning_rate": 6.853906684294298e-05, + "loss": 1.8533, + "step": 12959 + }, + { + "epoch": 3.977900552486188, + "grad_norm": 0.2346320003271103, + "learning_rate": 6.853445049202262e-05, + "loss": 1.8046, + "step": 12960 + }, + { + "epoch": 3.978207489257213, + "grad_norm": 0.22576460242271423, + "learning_rate": 6.852983395793398e-05, + "loss": 1.7502, + "step": 12961 + }, + { + "epoch": 3.978514426028238, + "grad_norm": 0.2230147123336792, + "learning_rate": 6.852521724072266e-05, + "loss": 1.7362, + "step": 12962 + }, + { + "epoch": 3.9788213627992635, + "grad_norm": 0.2339705526828766, + "learning_rate": 6.852060034043425e-05, + "loss": 1.763, + "step": 12963 + }, + { + "epoch": 3.979128299570289, + "grad_norm": 0.24511271715164185, + "learning_rate": 6.851598325711446e-05, + "loss": 1.7988, + "step": 12964 + }, + { + "epoch": 3.9794352363413137, + "grad_norm": 0.2927285134792328, + "learning_rate": 6.851136599080885e-05, + "loss": 1.8346, + "step": 12965 + }, + { + "epoch": 3.979742173112339, + "grad_norm": 0.2593212425708771, + "learning_rate": 6.850674854156305e-05, + "loss": 1.7368, + "step": 12966 + }, + { + "epoch": 3.980049109883364, + "grad_norm": 0.3013291656970978, + "learning_rate": 6.850213090942275e-05, + "loss": 1.7911, + "step": 12967 + }, + { + "epoch": 3.9803560466543892, + "grad_norm": 0.3420047163963318, + "learning_rate": 6.849751309443352e-05, + "loss": 1.7899, + "step": 12968 + }, + { + "epoch": 3.9806629834254146, + "grad_norm": 0.2901746928691864, + "learning_rate": 6.849289509664105e-05, + "loss": 1.8244, + "step": 12969 + }, + { + "epoch": 3.9809699201964395, + "grad_norm": 0.2389298677444458, + "learning_rate": 6.848827691609093e-05, + "loss": 1.7116, + "step": 12970 + }, + { + 
"epoch": 3.981276856967465, + "grad_norm": 0.3153960704803467, + "learning_rate": 6.848365855282882e-05, + "loss": 1.7665, + "step": 12971 + }, + { + "epoch": 3.9815837937384897, + "grad_norm": 0.3162175118923187, + "learning_rate": 6.847904000690036e-05, + "loss": 1.7722, + "step": 12972 + }, + { + "epoch": 3.981890730509515, + "grad_norm": 0.27458643913269043, + "learning_rate": 6.847442127835122e-05, + "loss": 1.8095, + "step": 12973 + }, + { + "epoch": 3.9821976672805404, + "grad_norm": 0.22330710291862488, + "learning_rate": 6.846980236722699e-05, + "loss": 1.7179, + "step": 12974 + }, + { + "epoch": 3.9825046040515653, + "grad_norm": 0.2940923869609833, + "learning_rate": 6.846518327357339e-05, + "loss": 1.7363, + "step": 12975 + }, + { + "epoch": 3.9828115408225906, + "grad_norm": 0.26479849219322205, + "learning_rate": 6.846056399743599e-05, + "loss": 1.7788, + "step": 12976 + }, + { + "epoch": 3.9831184775936155, + "grad_norm": 0.24145057797431946, + "learning_rate": 6.845594453886048e-05, + "loss": 1.7825, + "step": 12977 + }, + { + "epoch": 3.983425414364641, + "grad_norm": 0.2795869708061218, + "learning_rate": 6.845132489789252e-05, + "loss": 1.7705, + "step": 12978 + }, + { + "epoch": 3.983732351135666, + "grad_norm": 0.3117202818393707, + "learning_rate": 6.844670507457776e-05, + "loss": 1.8183, + "step": 12979 + }, + { + "epoch": 3.9840392879066915, + "grad_norm": 0.2666899263858795, + "learning_rate": 6.844208506896184e-05, + "loss": 1.7434, + "step": 12980 + }, + { + "epoch": 3.9843462246777164, + "grad_norm": 0.24682332575321198, + "learning_rate": 6.843746488109042e-05, + "loss": 1.751, + "step": 12981 + }, + { + "epoch": 3.9846531614487417, + "grad_norm": 0.2558208703994751, + "learning_rate": 6.843284451100916e-05, + "loss": 1.7983, + "step": 12982 + }, + { + "epoch": 3.9849600982197666, + "grad_norm": 0.4236481189727783, + "learning_rate": 6.842822395876374e-05, + "loss": 1.8584, + "step": 12983 + }, + { + "epoch": 3.985267034990792, + 
"grad_norm": 0.4931485950946808, + "learning_rate": 6.84236032243998e-05, + "loss": 1.7617, + "step": 12984 + }, + { + "epoch": 3.9855739717618173, + "grad_norm": 0.37793654203414917, + "learning_rate": 6.841898230796302e-05, + "loss": 1.7411, + "step": 12985 + }, + { + "epoch": 3.985880908532842, + "grad_norm": 0.2093842774629593, + "learning_rate": 6.841436120949906e-05, + "loss": 1.772, + "step": 12986 + }, + { + "epoch": 3.9861878453038675, + "grad_norm": 0.4065552055835724, + "learning_rate": 6.840973992905359e-05, + "loss": 1.7675, + "step": 12987 + }, + { + "epoch": 3.9864947820748924, + "grad_norm": 0.5334183573722839, + "learning_rate": 6.840511846667228e-05, + "loss": 1.7872, + "step": 12988 + }, + { + "epoch": 3.9868017188459177, + "grad_norm": 0.378974974155426, + "learning_rate": 6.84004968224008e-05, + "loss": 1.8288, + "step": 12989 + }, + { + "epoch": 3.987108655616943, + "grad_norm": 0.22518309950828552, + "learning_rate": 6.839587499628483e-05, + "loss": 1.7715, + "step": 12990 + }, + { + "epoch": 3.987415592387968, + "grad_norm": 0.4270850718021393, + "learning_rate": 6.839125298837003e-05, + "loss": 1.7797, + "step": 12991 + }, + { + "epoch": 3.9877225291589933, + "grad_norm": 0.4629896879196167, + "learning_rate": 6.838663079870211e-05, + "loss": 1.7936, + "step": 12992 + }, + { + "epoch": 3.988029465930018, + "grad_norm": 0.29273948073387146, + "learning_rate": 6.838200842732672e-05, + "loss": 1.8264, + "step": 12993 + }, + { + "epoch": 3.9883364027010435, + "grad_norm": 0.31575852632522583, + "learning_rate": 6.837738587428954e-05, + "loss": 1.8043, + "step": 12994 + }, + { + "epoch": 3.988643339472069, + "grad_norm": 0.40602433681488037, + "learning_rate": 6.837276313963627e-05, + "loss": 1.7409, + "step": 12995 + }, + { + "epoch": 3.988950276243094, + "grad_norm": 0.23413142561912537, + "learning_rate": 6.836814022341259e-05, + "loss": 1.8585, + "step": 12996 + }, + { + "epoch": 3.989257213014119, + "grad_norm": 0.3518814444541931, + 
"learning_rate": 6.836351712566416e-05, + "loss": 1.7768, + "step": 12997 + }, + { + "epoch": 3.9895641497851444, + "grad_norm": 0.3811505436897278, + "learning_rate": 6.83588938464367e-05, + "loss": 1.7738, + "step": 12998 + }, + { + "epoch": 3.9898710865561693, + "grad_norm": 0.2516780197620392, + "learning_rate": 6.835427038577589e-05, + "loss": 1.7351, + "step": 12999 + }, + { + "epoch": 3.9901780233271946, + "grad_norm": 0.23704510927200317, + "learning_rate": 6.834964674372744e-05, + "loss": 1.7907, + "step": 13000 + }, + { + "epoch": 3.99048496009822, + "grad_norm": 0.2890201807022095, + "learning_rate": 6.8345022920337e-05, + "loss": 1.9546, + "step": 13001 + }, + { + "epoch": 3.990791896869245, + "grad_norm": 0.2678101360797882, + "learning_rate": 6.834039891565031e-05, + "loss": 1.7338, + "step": 13002 + }, + { + "epoch": 3.99109883364027, + "grad_norm": 0.31726256012916565, + "learning_rate": 6.833577472971304e-05, + "loss": 1.8464, + "step": 13003 + }, + { + "epoch": 3.991405770411295, + "grad_norm": 0.28112682700157166, + "learning_rate": 6.83311503625709e-05, + "loss": 1.7427, + "step": 13004 + }, + { + "epoch": 3.9917127071823204, + "grad_norm": 0.2651563584804535, + "learning_rate": 6.832652581426958e-05, + "loss": 1.8117, + "step": 13005 + }, + { + "epoch": 3.9920196439533457, + "grad_norm": 0.3095388114452362, + "learning_rate": 6.83219010848548e-05, + "loss": 1.8286, + "step": 13006 + }, + { + "epoch": 3.9923265807243706, + "grad_norm": 0.24704942107200623, + "learning_rate": 6.831727617437225e-05, + "loss": 1.77, + "step": 13007 + }, + { + "epoch": 3.992633517495396, + "grad_norm": 0.24868519604206085, + "learning_rate": 6.831265108286764e-05, + "loss": 1.8129, + "step": 13008 + }, + { + "epoch": 3.992940454266421, + "grad_norm": 0.26511049270629883, + "learning_rate": 6.830802581038669e-05, + "loss": 1.7539, + "step": 13009 + }, + { + "epoch": 3.993247391037446, + "grad_norm": 0.2823421061038971, + "learning_rate": 6.830340035697508e-05, + 
"loss": 1.8068, + "step": 13010 + }, + { + "epoch": 3.9935543278084715, + "grad_norm": 0.28526121377944946, + "learning_rate": 6.829877472267856e-05, + "loss": 1.764, + "step": 13011 + }, + { + "epoch": 3.993861264579497, + "grad_norm": 0.2576456069946289, + "learning_rate": 6.829414890754281e-05, + "loss": 1.728, + "step": 13012 + }, + { + "epoch": 3.9941682013505218, + "grad_norm": 0.27154842019081116, + "learning_rate": 6.828952291161356e-05, + "loss": 1.797, + "step": 13013 + }, + { + "epoch": 3.994475138121547, + "grad_norm": 0.3129710555076599, + "learning_rate": 6.828489673493652e-05, + "loss": 1.769, + "step": 13014 + }, + { + "epoch": 3.994782074892572, + "grad_norm": 0.40118902921676636, + "learning_rate": 6.828027037755742e-05, + "loss": 1.8029, + "step": 13015 + }, + { + "epoch": 3.9950890116635973, + "grad_norm": 0.33228442072868347, + "learning_rate": 6.827564383952197e-05, + "loss": 1.7295, + "step": 13016 + }, + { + "epoch": 3.9953959484346226, + "grad_norm": 0.218771830201149, + "learning_rate": 6.827101712087591e-05, + "loss": 1.7693, + "step": 13017 + }, + { + "epoch": 3.9957028852056475, + "grad_norm": 0.31354373693466187, + "learning_rate": 6.826639022166492e-05, + "loss": 1.743, + "step": 13018 + }, + { + "epoch": 3.996009821976673, + "grad_norm": 0.3584701418876648, + "learning_rate": 6.826176314193478e-05, + "loss": 1.7597, + "step": 13019 + }, + { + "epoch": 3.9963167587476978, + "grad_norm": 0.2692064344882965, + "learning_rate": 6.82571358817312e-05, + "loss": 1.7871, + "step": 13020 + }, + { + "epoch": 3.996623695518723, + "grad_norm": 0.3064020276069641, + "learning_rate": 6.825250844109987e-05, + "loss": 1.7858, + "step": 13021 + }, + { + "epoch": 3.9969306322897484, + "grad_norm": 0.29913413524627686, + "learning_rate": 6.824788082008657e-05, + "loss": 1.7773, + "step": 13022 + }, + { + "epoch": 3.9972375690607733, + "grad_norm": 0.2682165801525116, + "learning_rate": 6.824325301873703e-05, + "loss": 1.8321, + "step": 13023 + }, + { + 
"epoch": 3.9975445058317987, + "grad_norm": 0.3274376690387726, + "learning_rate": 6.823862503709694e-05, + "loss": 1.8514, + "step": 13024 + }, + { + "epoch": 3.9978514426028235, + "grad_norm": 0.29828041791915894, + "learning_rate": 6.823399687521211e-05, + "loss": 1.7923, + "step": 13025 + }, + { + "epoch": 3.998158379373849, + "grad_norm": 0.22339288890361786, + "learning_rate": 6.82293685331282e-05, + "loss": 1.756, + "step": 13026 + }, + { + "epoch": 3.998465316144874, + "grad_norm": 0.2254658192396164, + "learning_rate": 6.8224740010891e-05, + "loss": 1.7392, + "step": 13027 + }, + { + "epoch": 3.9987722529158995, + "grad_norm": 0.24932752549648285, + "learning_rate": 6.822011130854624e-05, + "loss": 1.7538, + "step": 13028 + }, + { + "epoch": 3.9990791896869244, + "grad_norm": 0.21429690718650818, + "learning_rate": 6.821548242613966e-05, + "loss": 1.7746, + "step": 13029 + }, + { + "epoch": 3.9993861264579498, + "grad_norm": 0.25503116846084595, + "learning_rate": 6.8210853363717e-05, + "loss": 1.814, + "step": 13030 + }, + { + "epoch": 3.9996930632289747, + "grad_norm": 0.23168155550956726, + "learning_rate": 6.820622412132402e-05, + "loss": 1.769, + "step": 13031 + }, + { + "epoch": 4.0, + "grad_norm": 0.2252223789691925, + "learning_rate": 6.820159469900645e-05, + "loss": 1.7782, + "step": 13032 + }, + { + "epoch": 4.000306936771025, + "grad_norm": 0.1996588408946991, + "learning_rate": 6.819696509681007e-05, + "loss": 1.6839, + "step": 13033 + }, + { + "epoch": 4.000613873542051, + "grad_norm": 0.22297053039073944, + "learning_rate": 6.81923353147806e-05, + "loss": 1.7767, + "step": 13034 + }, + { + "epoch": 4.000920810313075, + "grad_norm": 0.25867611169815063, + "learning_rate": 6.818770535296381e-05, + "loss": 1.8623, + "step": 13035 + }, + { + "epoch": 4.0012277470841005, + "grad_norm": 0.2173648178577423, + "learning_rate": 6.818307521140547e-05, + "loss": 1.8034, + "step": 13036 + }, + { + "epoch": 4.001534683855126, + "grad_norm": 
0.23634609580039978, + "learning_rate": 6.81784448901513e-05, + "loss": 1.7503, + "step": 13037 + }, + { + "epoch": 4.001841620626151, + "grad_norm": 0.2626810073852539, + "learning_rate": 6.81738143892471e-05, + "loss": 1.8116, + "step": 13038 + }, + { + "epoch": 4.0021485573971765, + "grad_norm": 0.27888983488082886, + "learning_rate": 6.816918370873861e-05, + "loss": 1.8032, + "step": 13039 + }, + { + "epoch": 4.002455494168202, + "grad_norm": 0.275038480758667, + "learning_rate": 6.816455284867162e-05, + "loss": 1.7445, + "step": 13040 + }, + { + "epoch": 4.002762430939226, + "grad_norm": 0.3475828170776367, + "learning_rate": 6.815992180909184e-05, + "loss": 1.7404, + "step": 13041 + }, + { + "epoch": 4.003069367710252, + "grad_norm": 0.27314287424087524, + "learning_rate": 6.815529059004507e-05, + "loss": 1.8333, + "step": 13042 + }, + { + "epoch": 4.003376304481277, + "grad_norm": 0.34846973419189453, + "learning_rate": 6.815065919157709e-05, + "loss": 1.7921, + "step": 13043 + }, + { + "epoch": 4.003683241252302, + "grad_norm": 0.4191788136959076, + "learning_rate": 6.814602761373365e-05, + "loss": 1.8018, + "step": 13044 + }, + { + "epoch": 4.003990178023328, + "grad_norm": 0.2655608057975769, + "learning_rate": 6.814139585656055e-05, + "loss": 1.7638, + "step": 13045 + }, + { + "epoch": 4.004297114794352, + "grad_norm": 0.25938618183135986, + "learning_rate": 6.813676392010353e-05, + "loss": 1.794, + "step": 13046 + }, + { + "epoch": 4.004604051565377, + "grad_norm": 0.3464813828468323, + "learning_rate": 6.813213180440837e-05, + "loss": 1.8662, + "step": 13047 + }, + { + "epoch": 4.004910988336403, + "grad_norm": 0.30185338854789734, + "learning_rate": 6.812749950952087e-05, + "loss": 1.8029, + "step": 13048 + }, + { + "epoch": 4.005217925107428, + "grad_norm": 0.23291908204555511, + "learning_rate": 6.812286703548678e-05, + "loss": 1.7365, + "step": 13049 + }, + { + "epoch": 4.005524861878453, + "grad_norm": 0.3542841374874115, + "learning_rate": 
6.811823438235189e-05, + "loss": 1.8674, + "step": 13050 + }, + { + "epoch": 4.005831798649478, + "grad_norm": 0.2914685606956482, + "learning_rate": 6.811360155016202e-05, + "loss": 1.8306, + "step": 13051 + }, + { + "epoch": 4.006138735420503, + "grad_norm": 0.24888737499713898, + "learning_rate": 6.810896853896289e-05, + "loss": 1.7767, + "step": 13052 + }, + { + "epoch": 4.0064456721915285, + "grad_norm": 0.2977537512779236, + "learning_rate": 6.810433534880033e-05, + "loss": 1.8227, + "step": 13053 + }, + { + "epoch": 4.006752608962554, + "grad_norm": 0.3367510735988617, + "learning_rate": 6.809970197972013e-05, + "loss": 1.734, + "step": 13054 + }, + { + "epoch": 4.007059545733579, + "grad_norm": 0.28098800778388977, + "learning_rate": 6.809506843176806e-05, + "loss": 1.7032, + "step": 13055 + }, + { + "epoch": 4.0073664825046045, + "grad_norm": 0.24016784131526947, + "learning_rate": 6.809043470498991e-05, + "loss": 1.7863, + "step": 13056 + }, + { + "epoch": 4.007673419275629, + "grad_norm": 0.2883957624435425, + "learning_rate": 6.808580079943148e-05, + "loss": 1.7342, + "step": 13057 + }, + { + "epoch": 4.007980356046654, + "grad_norm": 0.3069116473197937, + "learning_rate": 6.808116671513856e-05, + "loss": 1.8544, + "step": 13058 + }, + { + "epoch": 4.00828729281768, + "grad_norm": 0.24113236367702484, + "learning_rate": 6.807653245215697e-05, + "loss": 1.7692, + "step": 13059 + }, + { + "epoch": 4.008594229588705, + "grad_norm": 0.2651619017124176, + "learning_rate": 6.807189801053249e-05, + "loss": 1.8096, + "step": 13060 + }, + { + "epoch": 4.00890116635973, + "grad_norm": 0.2636481523513794, + "learning_rate": 6.806726339031092e-05, + "loss": 1.8062, + "step": 13061 + }, + { + "epoch": 4.009208103130755, + "grad_norm": 0.22691169381141663, + "learning_rate": 6.806262859153807e-05, + "loss": 1.7001, + "step": 13062 + }, + { + "epoch": 4.00951503990178, + "grad_norm": 0.23288170993328094, + "learning_rate": 6.805799361425972e-05, + "loss": 1.7508, + 
"step": 13063 + }, + { + "epoch": 4.009821976672805, + "grad_norm": 0.243272602558136, + "learning_rate": 6.80533584585217e-05, + "loss": 1.7797, + "step": 13064 + }, + { + "epoch": 4.010128913443831, + "grad_norm": 0.24594646692276, + "learning_rate": 6.80487231243698e-05, + "loss": 1.7894, + "step": 13065 + }, + { + "epoch": 4.010435850214856, + "grad_norm": 0.21726086735725403, + "learning_rate": 6.804408761184986e-05, + "loss": 1.7472, + "step": 13066 + }, + { + "epoch": 4.0107427869858805, + "grad_norm": 0.2262321561574936, + "learning_rate": 6.803945192100767e-05, + "loss": 1.7563, + "step": 13067 + }, + { + "epoch": 4.011049723756906, + "grad_norm": 0.2449522763490677, + "learning_rate": 6.803481605188903e-05, + "loss": 1.7282, + "step": 13068 + }, + { + "epoch": 4.011356660527931, + "grad_norm": 0.2281760573387146, + "learning_rate": 6.803018000453975e-05, + "loss": 1.8191, + "step": 13069 + }, + { + "epoch": 4.0116635972989565, + "grad_norm": 0.3039850890636444, + "learning_rate": 6.80255437790057e-05, + "loss": 1.8258, + "step": 13070 + }, + { + "epoch": 4.011970534069982, + "grad_norm": 0.3978467881679535, + "learning_rate": 6.802090737533264e-05, + "loss": 1.7338, + "step": 13071 + }, + { + "epoch": 4.012277470841007, + "grad_norm": 0.29175812005996704, + "learning_rate": 6.801627079356641e-05, + "loss": 1.7754, + "step": 13072 + }, + { + "epoch": 4.012584407612032, + "grad_norm": 0.24228449165821075, + "learning_rate": 6.801163403375285e-05, + "loss": 1.7624, + "step": 13073 + }, + { + "epoch": 4.012891344383057, + "grad_norm": 0.34527531266212463, + "learning_rate": 6.800699709593776e-05, + "loss": 1.87, + "step": 13074 + }, + { + "epoch": 4.013198281154082, + "grad_norm": 0.1995161920785904, + "learning_rate": 6.800235998016696e-05, + "loss": 1.7253, + "step": 13075 + }, + { + "epoch": 4.013505217925108, + "grad_norm": 0.3509151339530945, + "learning_rate": 6.799772268648628e-05, + "loss": 1.8013, + "step": 13076 + }, + { + "epoch": 
4.013812154696133, + "grad_norm": 0.38569679856300354, + "learning_rate": 6.799308521494156e-05, + "loss": 1.7761, + "step": 13077 + }, + { + "epoch": 4.014119091467157, + "grad_norm": 0.2636256814002991, + "learning_rate": 6.798844756557865e-05, + "loss": 1.8101, + "step": 13078 + }, + { + "epoch": 4.014426028238183, + "grad_norm": 0.2570696473121643, + "learning_rate": 6.798380973844335e-05, + "loss": 1.7561, + "step": 13079 + }, + { + "epoch": 4.014732965009208, + "grad_norm": 0.38540002703666687, + "learning_rate": 6.797917173358148e-05, + "loss": 1.7893, + "step": 13080 + }, + { + "epoch": 4.015039901780233, + "grad_norm": 0.2974525988101959, + "learning_rate": 6.79745335510389e-05, + "loss": 1.8331, + "step": 13081 + }, + { + "epoch": 4.015346838551259, + "grad_norm": 0.2563362419605255, + "learning_rate": 6.796989519086146e-05, + "loss": 1.7784, + "step": 13082 + }, + { + "epoch": 4.015653775322283, + "grad_norm": 0.37037795782089233, + "learning_rate": 6.7965256653095e-05, + "loss": 1.7947, + "step": 13083 + }, + { + "epoch": 4.0159607120933085, + "grad_norm": 0.4145336449146271, + "learning_rate": 6.796061793778531e-05, + "loss": 1.7633, + "step": 13084 + }, + { + "epoch": 4.016267648864334, + "grad_norm": 0.32278406620025635, + "learning_rate": 6.795597904497828e-05, + "loss": 1.7827, + "step": 13085 + }, + { + "epoch": 4.016574585635359, + "grad_norm": 0.26466837525367737, + "learning_rate": 6.795133997471974e-05, + "loss": 1.7441, + "step": 13086 + }, + { + "epoch": 4.0168815224063845, + "grad_norm": 0.3212043344974518, + "learning_rate": 6.794670072705553e-05, + "loss": 1.7602, + "step": 13087 + }, + { + "epoch": 4.01718845917741, + "grad_norm": 0.3054736852645874, + "learning_rate": 6.79420613020315e-05, + "loss": 1.7417, + "step": 13088 + }, + { + "epoch": 4.017495395948434, + "grad_norm": 0.22281476855278015, + "learning_rate": 6.793742169969351e-05, + "loss": 1.7675, + "step": 13089 + }, + { + "epoch": 4.01780233271946, + "grad_norm": 
0.32630839943885803, + "learning_rate": 6.793278192008742e-05, + "loss": 1.8409, + "step": 13090 + }, + { + "epoch": 4.018109269490485, + "grad_norm": 0.2658778429031372, + "learning_rate": 6.792814196325905e-05, + "loss": 1.7718, + "step": 13091 + }, + { + "epoch": 4.01841620626151, + "grad_norm": 0.24016901850700378, + "learning_rate": 6.792350182925429e-05, + "loss": 1.8393, + "step": 13092 + }, + { + "epoch": 4.018723143032536, + "grad_norm": 0.2882223427295685, + "learning_rate": 6.791886151811897e-05, + "loss": 1.7497, + "step": 13093 + }, + { + "epoch": 4.01903007980356, + "grad_norm": 0.24340751767158508, + "learning_rate": 6.791422102989895e-05, + "loss": 1.72, + "step": 13094 + }, + { + "epoch": 4.019337016574585, + "grad_norm": 0.235665962100029, + "learning_rate": 6.79095803646401e-05, + "loss": 1.7269, + "step": 13095 + }, + { + "epoch": 4.019643953345611, + "grad_norm": 0.32772955298423767, + "learning_rate": 6.79049395223883e-05, + "loss": 1.7916, + "step": 13096 + }, + { + "epoch": 4.019950890116636, + "grad_norm": 0.3189625144004822, + "learning_rate": 6.790029850318938e-05, + "loss": 1.7571, + "step": 13097 + }, + { + "epoch": 4.020257826887661, + "grad_norm": 0.2211185097694397, + "learning_rate": 6.789565730708921e-05, + "loss": 1.793, + "step": 13098 + }, + { + "epoch": 4.020564763658686, + "grad_norm": 0.2840392291545868, + "learning_rate": 6.789101593413367e-05, + "loss": 1.7434, + "step": 13099 + }, + { + "epoch": 4.020871700429711, + "grad_norm": 0.27857357263565063, + "learning_rate": 6.788637438436863e-05, + "loss": 1.742, + "step": 13100 + }, + { + "epoch": 4.0211786372007365, + "grad_norm": 0.314628005027771, + "learning_rate": 6.788173265783996e-05, + "loss": 1.7881, + "step": 13101 + }, + { + "epoch": 4.021485573971762, + "grad_norm": 0.2994774580001831, + "learning_rate": 6.787709075459352e-05, + "loss": 1.7741, + "step": 13102 + }, + { + "epoch": 4.021792510742787, + "grad_norm": 0.3256312310695648, + "learning_rate": 
6.787244867467519e-05, + "loss": 1.7758, + "step": 13103 + }, + { + "epoch": 4.0220994475138125, + "grad_norm": 0.2332412451505661, + "learning_rate": 6.786780641813083e-05, + "loss": 1.7654, + "step": 13104 + }, + { + "epoch": 4.022406384284837, + "grad_norm": 0.23226258158683777, + "learning_rate": 6.786316398500636e-05, + "loss": 1.7605, + "step": 13105 + }, + { + "epoch": 4.022713321055862, + "grad_norm": 0.24631965160369873, + "learning_rate": 6.785852137534763e-05, + "loss": 1.7469, + "step": 13106 + }, + { + "epoch": 4.023020257826888, + "grad_norm": 0.1969226449728012, + "learning_rate": 6.785387858920051e-05, + "loss": 1.8151, + "step": 13107 + }, + { + "epoch": 4.023327194597913, + "grad_norm": 0.22769485414028168, + "learning_rate": 6.784923562661091e-05, + "loss": 1.7024, + "step": 13108 + }, + { + "epoch": 4.023634131368938, + "grad_norm": 0.2174670249223709, + "learning_rate": 6.78445924876247e-05, + "loss": 1.8094, + "step": 13109 + }, + { + "epoch": 4.023941068139963, + "grad_norm": 0.2606858015060425, + "learning_rate": 6.783994917228775e-05, + "loss": 1.8043, + "step": 13110 + }, + { + "epoch": 4.024248004910988, + "grad_norm": 0.24721349775791168, + "learning_rate": 6.783530568064599e-05, + "loss": 1.842, + "step": 13111 + }, + { + "epoch": 4.024554941682013, + "grad_norm": 0.2353603094816208, + "learning_rate": 6.783066201274529e-05, + "loss": 1.76, + "step": 13112 + }, + { + "epoch": 4.024861878453039, + "grad_norm": 0.22285830974578857, + "learning_rate": 6.782601816863153e-05, + "loss": 1.8014, + "step": 13113 + }, + { + "epoch": 4.025168815224064, + "grad_norm": 0.2482440173625946, + "learning_rate": 6.782137414835061e-05, + "loss": 1.7552, + "step": 13114 + }, + { + "epoch": 4.0254757519950894, + "grad_norm": 0.19926191866397858, + "learning_rate": 6.781672995194842e-05, + "loss": 1.7549, + "step": 13115 + }, + { + "epoch": 4.025782688766114, + "grad_norm": 0.2342877984046936, + "learning_rate": 6.781208557947086e-05, + "loss": 1.8622, + 
"step": 13116 + }, + { + "epoch": 4.026089625537139, + "grad_norm": 0.24096547067165375, + "learning_rate": 6.780744103096382e-05, + "loss": 1.7795, + "step": 13117 + }, + { + "epoch": 4.026396562308165, + "grad_norm": 0.23714657127857208, + "learning_rate": 6.780279630647322e-05, + "loss": 1.799, + "step": 13118 + }, + { + "epoch": 4.02670349907919, + "grad_norm": 0.28252026438713074, + "learning_rate": 6.779815140604496e-05, + "loss": 1.7573, + "step": 13119 + }, + { + "epoch": 4.027010435850215, + "grad_norm": 0.28028404712677, + "learning_rate": 6.779350632972493e-05, + "loss": 1.8103, + "step": 13120 + }, + { + "epoch": 4.02731737262124, + "grad_norm": 0.21088312566280365, + "learning_rate": 6.778886107755904e-05, + "loss": 1.7169, + "step": 13121 + }, + { + "epoch": 4.027624309392265, + "grad_norm": 0.22282038629055023, + "learning_rate": 6.77842156495932e-05, + "loss": 1.7206, + "step": 13122 + }, + { + "epoch": 4.02793124616329, + "grad_norm": 0.3281327784061432, + "learning_rate": 6.777957004587331e-05, + "loss": 1.8664, + "step": 13123 + }, + { + "epoch": 4.028238182934316, + "grad_norm": 0.29496827721595764, + "learning_rate": 6.77749242664453e-05, + "loss": 1.7532, + "step": 13124 + }, + { + "epoch": 4.028545119705341, + "grad_norm": 0.25299328565597534, + "learning_rate": 6.777027831135508e-05, + "loss": 1.7836, + "step": 13125 + }, + { + "epoch": 4.0288520564763655, + "grad_norm": 0.3000280559062958, + "learning_rate": 6.776563218064854e-05, + "loss": 1.8079, + "step": 13126 + }, + { + "epoch": 4.029158993247391, + "grad_norm": 0.3613673448562622, + "learning_rate": 6.77609858743716e-05, + "loss": 1.7931, + "step": 13127 + }, + { + "epoch": 4.029465930018416, + "grad_norm": 0.25613468885421753, + "learning_rate": 6.77563393925702e-05, + "loss": 1.7522, + "step": 13128 + }, + { + "epoch": 4.0297728667894415, + "grad_norm": 0.24391578137874603, + "learning_rate": 6.775169273529026e-05, + "loss": 1.818, + "step": 13129 + }, + { + "epoch": 
4.030079803560467, + "grad_norm": 0.2806173264980316, + "learning_rate": 6.774704590257768e-05, + "loss": 1.7349, + "step": 13130 + }, + { + "epoch": 4.030386740331492, + "grad_norm": 0.22214172780513763, + "learning_rate": 6.774239889447838e-05, + "loss": 1.759, + "step": 13131 + }, + { + "epoch": 4.030693677102517, + "grad_norm": 0.27285513281822205, + "learning_rate": 6.773775171103828e-05, + "loss": 1.742, + "step": 13132 + }, + { + "epoch": 4.031000613873542, + "grad_norm": 0.22302402555942535, + "learning_rate": 6.773310435230334e-05, + "loss": 1.7277, + "step": 13133 + }, + { + "epoch": 4.031307550644567, + "grad_norm": 0.2350187450647354, + "learning_rate": 6.772845681831947e-05, + "loss": 1.8648, + "step": 13134 + }, + { + "epoch": 4.031614487415593, + "grad_norm": 0.2665547728538513, + "learning_rate": 6.772380910913261e-05, + "loss": 1.776, + "step": 13135 + }, + { + "epoch": 4.031921424186618, + "grad_norm": 0.30652403831481934, + "learning_rate": 6.771916122478867e-05, + "loss": 1.7884, + "step": 13136 + }, + { + "epoch": 4.032228360957642, + "grad_norm": 0.29372814297676086, + "learning_rate": 6.771451316533359e-05, + "loss": 1.8203, + "step": 13137 + }, + { + "epoch": 4.032535297728668, + "grad_norm": 0.2244873046875, + "learning_rate": 6.770986493081329e-05, + "loss": 1.7869, + "step": 13138 + }, + { + "epoch": 4.032842234499693, + "grad_norm": 0.25075265765190125, + "learning_rate": 6.770521652127375e-05, + "loss": 1.772, + "step": 13139 + }, + { + "epoch": 4.033149171270718, + "grad_norm": 0.28118211030960083, + "learning_rate": 6.770056793676087e-05, + "loss": 1.7922, + "step": 13140 + }, + { + "epoch": 4.033456108041744, + "grad_norm": 0.25199100375175476, + "learning_rate": 6.769591917732062e-05, + "loss": 1.7526, + "step": 13141 + }, + { + "epoch": 4.033763044812768, + "grad_norm": 0.2920379638671875, + "learning_rate": 6.769127024299892e-05, + "loss": 1.8365, + "step": 13142 + }, + { + "epoch": 4.0340699815837935, + "grad_norm": 
0.23018018901348114, + "learning_rate": 6.768662113384171e-05, + "loss": 1.7411, + "step": 13143 + }, + { + "epoch": 4.034376918354819, + "grad_norm": 0.23253841698169708, + "learning_rate": 6.768197184989494e-05, + "loss": 1.7921, + "step": 13144 + }, + { + "epoch": 4.034683855125844, + "grad_norm": 0.22618864476680756, + "learning_rate": 6.767732239120456e-05, + "loss": 1.7421, + "step": 13145 + }, + { + "epoch": 4.0349907918968695, + "grad_norm": 0.24552187323570251, + "learning_rate": 6.767267275781655e-05, + "loss": 1.7299, + "step": 13146 + }, + { + "epoch": 4.035297728667895, + "grad_norm": 0.22562766075134277, + "learning_rate": 6.76680229497768e-05, + "loss": 1.766, + "step": 13147 + }, + { + "epoch": 4.035604665438919, + "grad_norm": 0.28718629479408264, + "learning_rate": 6.76633729671313e-05, + "loss": 1.7366, + "step": 13148 + }, + { + "epoch": 4.035911602209945, + "grad_norm": 0.38769885897636414, + "learning_rate": 6.765872280992598e-05, + "loss": 1.8244, + "step": 13149 + }, + { + "epoch": 4.03621853898097, + "grad_norm": 0.4232725501060486, + "learning_rate": 6.765407247820683e-05, + "loss": 1.8244, + "step": 13150 + }, + { + "epoch": 4.036525475751995, + "grad_norm": 0.2771088778972626, + "learning_rate": 6.764942197201977e-05, + "loss": 1.7863, + "step": 13151 + }, + { + "epoch": 4.036832412523021, + "grad_norm": 0.2917862832546234, + "learning_rate": 6.76447712914108e-05, + "loss": 1.791, + "step": 13152 + }, + { + "epoch": 4.037139349294045, + "grad_norm": 0.37355467677116394, + "learning_rate": 6.764012043642584e-05, + "loss": 1.74, + "step": 13153 + }, + { + "epoch": 4.03744628606507, + "grad_norm": 0.35664018988609314, + "learning_rate": 6.763546940711089e-05, + "loss": 1.7734, + "step": 13154 + }, + { + "epoch": 4.037753222836096, + "grad_norm": 0.2335754930973053, + "learning_rate": 6.763081820351188e-05, + "loss": 1.7765, + "step": 13155 + }, + { + "epoch": 4.038060159607121, + "grad_norm": 0.2825562357902527, + "learning_rate": 
6.762616682567478e-05, + "loss": 1.7867, + "step": 13156 + }, + { + "epoch": 4.038367096378146, + "grad_norm": 0.3103202283382416, + "learning_rate": 6.762151527364559e-05, + "loss": 1.7331, + "step": 13157 + }, + { + "epoch": 4.038674033149171, + "grad_norm": 0.2897353172302246, + "learning_rate": 6.761686354747025e-05, + "loss": 1.7638, + "step": 13158 + }, + { + "epoch": 4.038980969920196, + "grad_norm": 0.21260851621627808, + "learning_rate": 6.761221164719474e-05, + "loss": 1.7302, + "step": 13159 + }, + { + "epoch": 4.0392879066912215, + "grad_norm": 0.2878021001815796, + "learning_rate": 6.760755957286503e-05, + "loss": 1.7368, + "step": 13160 + }, + { + "epoch": 4.039594843462247, + "grad_norm": 0.2785978317260742, + "learning_rate": 6.76029073245271e-05, + "loss": 1.7258, + "step": 13161 + }, + { + "epoch": 4.039901780233272, + "grad_norm": 0.1963953971862793, + "learning_rate": 6.759825490222692e-05, + "loss": 1.755, + "step": 13162 + }, + { + "epoch": 4.0402087170042975, + "grad_norm": 0.26776790618896484, + "learning_rate": 6.759360230601047e-05, + "loss": 1.7676, + "step": 13163 + }, + { + "epoch": 4.040515653775322, + "grad_norm": 0.2751332223415375, + "learning_rate": 6.758894953592373e-05, + "loss": 1.7313, + "step": 13164 + }, + { + "epoch": 4.040822590546347, + "grad_norm": 0.2339213341474533, + "learning_rate": 6.758429659201269e-05, + "loss": 1.714, + "step": 13165 + }, + { + "epoch": 4.041129527317373, + "grad_norm": 0.2624664008617401, + "learning_rate": 6.75796434743233e-05, + "loss": 1.8296, + "step": 13166 + }, + { + "epoch": 4.041436464088398, + "grad_norm": 0.40156883001327515, + "learning_rate": 6.757499018290159e-05, + "loss": 1.8228, + "step": 13167 + }, + { + "epoch": 4.041743400859423, + "grad_norm": 0.32976576685905457, + "learning_rate": 6.757033671779352e-05, + "loss": 1.7403, + "step": 13168 + }, + { + "epoch": 4.042050337630448, + "grad_norm": 0.2343887835741043, + "learning_rate": 6.756568307904508e-05, + "loss": 1.7837, + 
"step": 13169 + }, + { + "epoch": 4.042357274401473, + "grad_norm": 0.36174145340919495, + "learning_rate": 6.756102926670227e-05, + "loss": 1.7291, + "step": 13170 + }, + { + "epoch": 4.042664211172498, + "grad_norm": 0.3324793577194214, + "learning_rate": 6.755637528081108e-05, + "loss": 1.7414, + "step": 13171 + }, + { + "epoch": 4.042971147943524, + "grad_norm": 0.21945348381996155, + "learning_rate": 6.75517211214175e-05, + "loss": 1.7762, + "step": 13172 + }, + { + "epoch": 4.043278084714549, + "grad_norm": 0.31069812178611755, + "learning_rate": 6.75470667885675e-05, + "loss": 1.7666, + "step": 13173 + }, + { + "epoch": 4.043585021485574, + "grad_norm": 0.3931153118610382, + "learning_rate": 6.754241228230713e-05, + "loss": 1.7871, + "step": 13174 + }, + { + "epoch": 4.043891958256599, + "grad_norm": 0.25559595227241516, + "learning_rate": 6.753775760268234e-05, + "loss": 1.7916, + "step": 13175 + }, + { + "epoch": 4.044198895027624, + "grad_norm": 0.3686937391757965, + "learning_rate": 6.753310274973917e-05, + "loss": 1.7642, + "step": 13176 + }, + { + "epoch": 4.0445058317986495, + "grad_norm": 0.4793247580528259, + "learning_rate": 6.75284477235236e-05, + "loss": 1.739, + "step": 13177 + }, + { + "epoch": 4.044812768569675, + "grad_norm": 0.36179354786872864, + "learning_rate": 6.752379252408164e-05, + "loss": 1.7993, + "step": 13178 + }, + { + "epoch": 4.0451197053407, + "grad_norm": 0.22559234499931335, + "learning_rate": 6.751913715145926e-05, + "loss": 1.7401, + "step": 13179 + }, + { + "epoch": 4.045426642111725, + "grad_norm": 0.29058873653411865, + "learning_rate": 6.751448160570253e-05, + "loss": 1.8089, + "step": 13180 + }, + { + "epoch": 4.04573357888275, + "grad_norm": 0.3069808781147003, + "learning_rate": 6.750982588685742e-05, + "loss": 1.7587, + "step": 13181 + }, + { + "epoch": 4.046040515653775, + "grad_norm": 0.2292155921459198, + "learning_rate": 6.750516999496994e-05, + "loss": 1.7429, + "step": 13182 + }, + { + "epoch": 
4.046347452424801, + "grad_norm": 0.2520677149295807, + "learning_rate": 6.750051393008612e-05, + "loss": 1.7842, + "step": 13183 + }, + { + "epoch": 4.046654389195826, + "grad_norm": 0.32546502351760864, + "learning_rate": 6.749585769225194e-05, + "loss": 1.8057, + "step": 13184 + }, + { + "epoch": 4.04696132596685, + "grad_norm": 0.27634644508361816, + "learning_rate": 6.749120128151346e-05, + "loss": 1.7708, + "step": 13185 + }, + { + "epoch": 4.047268262737876, + "grad_norm": 0.2546750009059906, + "learning_rate": 6.748654469791668e-05, + "loss": 1.8744, + "step": 13186 + }, + { + "epoch": 4.047575199508901, + "grad_norm": 0.43873605132102966, + "learning_rate": 6.748188794150761e-05, + "loss": 1.8573, + "step": 13187 + }, + { + "epoch": 4.047882136279926, + "grad_norm": 0.45526960492134094, + "learning_rate": 6.747723101233227e-05, + "loss": 1.7761, + "step": 13188 + }, + { + "epoch": 4.048189073050952, + "grad_norm": 0.24995557963848114, + "learning_rate": 6.74725739104367e-05, + "loss": 1.7679, + "step": 13189 + }, + { + "epoch": 4.048496009821977, + "grad_norm": 0.3203068971633911, + "learning_rate": 6.74679166358669e-05, + "loss": 1.7772, + "step": 13190 + }, + { + "epoch": 4.0488029465930016, + "grad_norm": 0.37020671367645264, + "learning_rate": 6.746325918866893e-05, + "loss": 1.8002, + "step": 13191 + }, + { + "epoch": 4.049109883364027, + "grad_norm": 0.2543959319591522, + "learning_rate": 6.745860156888878e-05, + "loss": 1.8057, + "step": 13192 + }, + { + "epoch": 4.049416820135052, + "grad_norm": 0.2566509246826172, + "learning_rate": 6.74539437765725e-05, + "loss": 1.7853, + "step": 13193 + }, + { + "epoch": 4.0497237569060776, + "grad_norm": 0.2545804977416992, + "learning_rate": 6.744928581176612e-05, + "loss": 1.8136, + "step": 13194 + }, + { + "epoch": 4.050030693677103, + "grad_norm": 0.24307197332382202, + "learning_rate": 6.744462767451568e-05, + "loss": 1.7919, + "step": 13195 + }, + { + "epoch": 4.050337630448127, + "grad_norm": 
0.24427616596221924, + "learning_rate": 6.743996936486719e-05, + "loss": 1.8037, + "step": 13196 + }, + { + "epoch": 4.050644567219153, + "grad_norm": 0.2154439389705658, + "learning_rate": 6.743531088286673e-05, + "loss": 1.7088, + "step": 13197 + }, + { + "epoch": 4.050951503990178, + "grad_norm": 0.22251558303833008, + "learning_rate": 6.743065222856027e-05, + "loss": 1.7512, + "step": 13198 + }, + { + "epoch": 4.051258440761203, + "grad_norm": 0.2373272329568863, + "learning_rate": 6.74259934019939e-05, + "loss": 1.8056, + "step": 13199 + }, + { + "epoch": 4.051565377532229, + "grad_norm": 0.23308727145195007, + "learning_rate": 6.742133440321366e-05, + "loss": 1.731, + "step": 13200 + }, + { + "epoch": 4.051872314303253, + "grad_norm": 0.2438805252313614, + "learning_rate": 6.741667523226557e-05, + "loss": 1.7938, + "step": 13201 + }, + { + "epoch": 4.0521792510742785, + "grad_norm": 0.22354702651500702, + "learning_rate": 6.741201588919569e-05, + "loss": 1.762, + "step": 13202 + }, + { + "epoch": 4.052486187845304, + "grad_norm": 0.2505488097667694, + "learning_rate": 6.740735637405006e-05, + "loss": 1.7627, + "step": 13203 + }, + { + "epoch": 4.052793124616329, + "grad_norm": 0.21378709375858307, + "learning_rate": 6.740269668687474e-05, + "loss": 1.7598, + "step": 13204 + }, + { + "epoch": 4.0531000613873545, + "grad_norm": 0.24863660335540771, + "learning_rate": 6.739803682771577e-05, + "loss": 1.7665, + "step": 13205 + }, + { + "epoch": 4.05340699815838, + "grad_norm": 0.3041808605194092, + "learning_rate": 6.739337679661921e-05, + "loss": 1.7909, + "step": 13206 + }, + { + "epoch": 4.053713934929404, + "grad_norm": 0.2745797634124756, + "learning_rate": 6.738871659363109e-05, + "loss": 1.7547, + "step": 13207 + }, + { + "epoch": 4.05402087170043, + "grad_norm": 0.2610073387622833, + "learning_rate": 6.738405621879748e-05, + "loss": 1.7723, + "step": 13208 + }, + { + "epoch": 4.054327808471455, + "grad_norm": 0.22728075087070465, + "learning_rate": 
6.737939567216446e-05, + "loss": 1.7865, + "step": 13209 + }, + { + "epoch": 4.05463474524248, + "grad_norm": 0.2877669930458069, + "learning_rate": 6.737473495377804e-05, + "loss": 1.8352, + "step": 13210 + }, + { + "epoch": 4.054941682013506, + "grad_norm": 0.35316282510757446, + "learning_rate": 6.737007406368432e-05, + "loss": 1.8202, + "step": 13211 + }, + { + "epoch": 4.05524861878453, + "grad_norm": 0.34625691175460815, + "learning_rate": 6.736541300192936e-05, + "loss": 1.8456, + "step": 13212 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.2432134598493576, + "learning_rate": 6.736075176855917e-05, + "loss": 1.8237, + "step": 13213 + }, + { + "epoch": 4.055862492326581, + "grad_norm": 0.27446529269218445, + "learning_rate": 6.735609036361989e-05, + "loss": 1.71, + "step": 13214 + }, + { + "epoch": 4.056169429097606, + "grad_norm": 0.2870408892631531, + "learning_rate": 6.735142878715754e-05, + "loss": 1.7473, + "step": 13215 + }, + { + "epoch": 4.056476365868631, + "grad_norm": 0.22249078750610352, + "learning_rate": 6.734676703921822e-05, + "loss": 1.7462, + "step": 13216 + }, + { + "epoch": 4.056783302639656, + "grad_norm": 0.25519105792045593, + "learning_rate": 6.734210511984796e-05, + "loss": 1.7022, + "step": 13217 + }, + { + "epoch": 4.057090239410681, + "grad_norm": 0.3366561830043793, + "learning_rate": 6.733744302909285e-05, + "loss": 1.787, + "step": 13218 + }, + { + "epoch": 4.0573971761817065, + "grad_norm": 0.2443208247423172, + "learning_rate": 6.733278076699897e-05, + "loss": 1.8048, + "step": 13219 + }, + { + "epoch": 4.057704112952732, + "grad_norm": 0.2893153131008148, + "learning_rate": 6.73281183336124e-05, + "loss": 1.7805, + "step": 13220 + }, + { + "epoch": 4.058011049723757, + "grad_norm": 0.3178043067455292, + "learning_rate": 6.73234557289792e-05, + "loss": 1.8264, + "step": 13221 + }, + { + "epoch": 4.0583179864947825, + "grad_norm": 0.27355703711509705, + "learning_rate": 6.731879295314546e-05, + "loss": 1.8427, + 
"step": 13222 + }, + { + "epoch": 4.058624923265807, + "grad_norm": 0.32180166244506836, + "learning_rate": 6.731413000615726e-05, + "loss": 1.7332, + "step": 13223 + }, + { + "epoch": 4.058931860036832, + "grad_norm": 0.3736574351787567, + "learning_rate": 6.730946688806067e-05, + "loss": 1.7447, + "step": 13224 + }, + { + "epoch": 4.059238796807858, + "grad_norm": 0.2526068687438965, + "learning_rate": 6.73048035989018e-05, + "loss": 1.8104, + "step": 13225 + }, + { + "epoch": 4.059545733578883, + "grad_norm": 0.29076167941093445, + "learning_rate": 6.73001401387267e-05, + "loss": 1.7977, + "step": 13226 + }, + { + "epoch": 4.059852670349908, + "grad_norm": 0.37963762879371643, + "learning_rate": 6.729547650758148e-05, + "loss": 1.8336, + "step": 13227 + }, + { + "epoch": 4.060159607120933, + "grad_norm": 0.31584078073501587, + "learning_rate": 6.729081270551222e-05, + "loss": 1.7843, + "step": 13228 + }, + { + "epoch": 4.060466543891958, + "grad_norm": 0.22793468832969666, + "learning_rate": 6.728614873256502e-05, + "loss": 1.7444, + "step": 13229 + }, + { + "epoch": 4.060773480662983, + "grad_norm": 0.3114435076713562, + "learning_rate": 6.728148458878596e-05, + "loss": 1.8012, + "step": 13230 + }, + { + "epoch": 4.061080417434009, + "grad_norm": 0.29843854904174805, + "learning_rate": 6.727682027422116e-05, + "loss": 1.8014, + "step": 13231 + }, + { + "epoch": 4.061387354205034, + "grad_norm": 0.22745616734027863, + "learning_rate": 6.727215578891668e-05, + "loss": 1.7303, + "step": 13232 + }, + { + "epoch": 4.0616942909760585, + "grad_norm": 0.2701241970062256, + "learning_rate": 6.726749113291864e-05, + "loss": 1.7665, + "step": 13233 + }, + { + "epoch": 4.062001227747084, + "grad_norm": 0.29304635524749756, + "learning_rate": 6.726282630627313e-05, + "loss": 1.875, + "step": 13234 + }, + { + "epoch": 4.062308164518109, + "grad_norm": 0.21467708051204681, + "learning_rate": 6.725816130902625e-05, + "loss": 1.7442, + "step": 13235 + }, + { + "epoch": 
4.0626151012891345, + "grad_norm": 0.23517470061779022, + "learning_rate": 6.72534961412241e-05, + "loss": 1.7154, + "step": 13236 + }, + { + "epoch": 4.06292203806016, + "grad_norm": 0.21483808755874634, + "learning_rate": 6.724883080291278e-05, + "loss": 1.7162, + "step": 13237 + }, + { + "epoch": 4.063228974831185, + "grad_norm": 0.2274744212627411, + "learning_rate": 6.724416529413843e-05, + "loss": 1.8066, + "step": 13238 + }, + { + "epoch": 4.06353591160221, + "grad_norm": 0.24682378768920898, + "learning_rate": 6.723949961494712e-05, + "loss": 1.7905, + "step": 13239 + }, + { + "epoch": 4.063842848373235, + "grad_norm": 0.2516227066516876, + "learning_rate": 6.723483376538498e-05, + "loss": 1.7693, + "step": 13240 + }, + { + "epoch": 4.06414978514426, + "grad_norm": 0.22076398134231567, + "learning_rate": 6.723016774549808e-05, + "loss": 1.7357, + "step": 13241 + }, + { + "epoch": 4.064456721915286, + "grad_norm": 0.20741026103496552, + "learning_rate": 6.722550155533258e-05, + "loss": 1.8082, + "step": 13242 + }, + { + "epoch": 4.064763658686311, + "grad_norm": 0.2074010819196701, + "learning_rate": 6.722083519493458e-05, + "loss": 1.71, + "step": 13243 + }, + { + "epoch": 4.065070595457335, + "grad_norm": 0.2661527991294861, + "learning_rate": 6.72161686643502e-05, + "loss": 1.7448, + "step": 13244 + }, + { + "epoch": 4.065377532228361, + "grad_norm": 0.2877216935157776, + "learning_rate": 6.721150196362555e-05, + "loss": 1.7574, + "step": 13245 + }, + { + "epoch": 4.065684468999386, + "grad_norm": 0.2520955801010132, + "learning_rate": 6.720683509280675e-05, + "loss": 1.7717, + "step": 13246 + }, + { + "epoch": 4.065991405770411, + "grad_norm": 0.2219560444355011, + "learning_rate": 6.72021680519399e-05, + "loss": 1.7355, + "step": 13247 + }, + { + "epoch": 4.066298342541437, + "grad_norm": 0.24671706557273865, + "learning_rate": 6.719750084107117e-05, + "loss": 1.8204, + "step": 13248 + }, + { + "epoch": 4.066605279312462, + "grad_norm": 
0.24512135982513428, + "learning_rate": 6.719283346024664e-05, + "loss": 1.826, + "step": 13249 + }, + { + "epoch": 4.0669122160834865, + "grad_norm": 0.24370841681957245, + "learning_rate": 6.718816590951247e-05, + "loss": 1.8322, + "step": 13250 + }, + { + "epoch": 4.067219152854512, + "grad_norm": 0.2312363088130951, + "learning_rate": 6.718349818891475e-05, + "loss": 1.7621, + "step": 13251 + }, + { + "epoch": 4.067526089625537, + "grad_norm": 0.2500494420528412, + "learning_rate": 6.717883029849965e-05, + "loss": 1.829, + "step": 13252 + }, + { + "epoch": 4.0678330263965625, + "grad_norm": 0.29882633686065674, + "learning_rate": 6.717416223831324e-05, + "loss": 1.799, + "step": 13253 + }, + { + "epoch": 4.068139963167588, + "grad_norm": 0.21962928771972656, + "learning_rate": 6.716949400840172e-05, + "loss": 1.7714, + "step": 13254 + }, + { + "epoch": 4.068446899938612, + "grad_norm": 0.25544899702072144, + "learning_rate": 6.716482560881121e-05, + "loss": 1.7911, + "step": 13255 + }, + { + "epoch": 4.068753836709638, + "grad_norm": 0.24865686893463135, + "learning_rate": 6.716015703958781e-05, + "loss": 1.7107, + "step": 13256 + }, + { + "epoch": 4.069060773480663, + "grad_norm": 0.22669239342212677, + "learning_rate": 6.715548830077769e-05, + "loss": 1.8503, + "step": 13257 + }, + { + "epoch": 4.069367710251688, + "grad_norm": 0.2973819077014923, + "learning_rate": 6.715081939242698e-05, + "loss": 1.7859, + "step": 13258 + }, + { + "epoch": 4.069674647022714, + "grad_norm": 0.3178746700286865, + "learning_rate": 6.714615031458181e-05, + "loss": 1.7705, + "step": 13259 + }, + { + "epoch": 4.069981583793738, + "grad_norm": 0.20452535152435303, + "learning_rate": 6.714148106728835e-05, + "loss": 1.7386, + "step": 13260 + }, + { + "epoch": 4.070288520564763, + "grad_norm": 0.30288320779800415, + "learning_rate": 6.713681165059271e-05, + "loss": 1.7823, + "step": 13261 + }, + { + "epoch": 4.070595457335789, + "grad_norm": 0.30014416575431824, + "learning_rate": 
6.713214206454107e-05, + "loss": 1.7626, + "step": 13262 + }, + { + "epoch": 4.070902394106814, + "grad_norm": 0.25144243240356445, + "learning_rate": 6.712747230917956e-05, + "loss": 1.8359, + "step": 13263 + }, + { + "epoch": 4.071209330877839, + "grad_norm": 0.308148592710495, + "learning_rate": 6.712280238455432e-05, + "loss": 1.7226, + "step": 13264 + }, + { + "epoch": 4.071516267648865, + "grad_norm": 0.2704198658466339, + "learning_rate": 6.711813229071151e-05, + "loss": 1.7982, + "step": 13265 + }, + { + "epoch": 4.071823204419889, + "grad_norm": 0.3928656280040741, + "learning_rate": 6.711346202769729e-05, + "loss": 1.7987, + "step": 13266 + }, + { + "epoch": 4.0721301411909145, + "grad_norm": 0.3603350520133972, + "learning_rate": 6.71087915955578e-05, + "loss": 1.7963, + "step": 13267 + }, + { + "epoch": 4.07243707796194, + "grad_norm": 0.2673214077949524, + "learning_rate": 6.710412099433921e-05, + "loss": 1.8011, + "step": 13268 + }, + { + "epoch": 4.072744014732965, + "grad_norm": 0.2523653209209442, + "learning_rate": 6.709945022408768e-05, + "loss": 1.755, + "step": 13269 + }, + { + "epoch": 4.0730509515039905, + "grad_norm": 0.3818903863430023, + "learning_rate": 6.709477928484934e-05, + "loss": 1.7968, + "step": 13270 + }, + { + "epoch": 4.073357888275015, + "grad_norm": 0.31509929895401, + "learning_rate": 6.709010817667039e-05, + "loss": 1.744, + "step": 13271 + }, + { + "epoch": 4.07366482504604, + "grad_norm": 0.21875518560409546, + "learning_rate": 6.708543689959697e-05, + "loss": 1.7511, + "step": 13272 + }, + { + "epoch": 4.073971761817066, + "grad_norm": 0.25381338596343994, + "learning_rate": 6.708076545367523e-05, + "loss": 1.7523, + "step": 13273 + }, + { + "epoch": 4.074278698588091, + "grad_norm": 0.24193842709064484, + "learning_rate": 6.707609383895137e-05, + "loss": 1.7713, + "step": 13274 + }, + { + "epoch": 4.074585635359116, + "grad_norm": 0.21972359716892242, + "learning_rate": 6.707142205547154e-05, + "loss": 1.7329, + "step": 
13275 + }, + { + "epoch": 4.074892572130141, + "grad_norm": 0.22188499569892883, + "learning_rate": 6.706675010328192e-05, + "loss": 1.7507, + "step": 13276 + }, + { + "epoch": 4.075199508901166, + "grad_norm": 0.23344436287879944, + "learning_rate": 6.706207798242865e-05, + "loss": 1.771, + "step": 13277 + }, + { + "epoch": 4.0755064456721914, + "grad_norm": 0.3008805513381958, + "learning_rate": 6.705740569295795e-05, + "loss": 1.775, + "step": 13278 + }, + { + "epoch": 4.075813382443217, + "grad_norm": 0.31407982110977173, + "learning_rate": 6.705273323491595e-05, + "loss": 1.7625, + "step": 13279 + }, + { + "epoch": 4.076120319214242, + "grad_norm": 0.2430381178855896, + "learning_rate": 6.704806060834886e-05, + "loss": 1.7706, + "step": 13280 + }, + { + "epoch": 4.0764272559852675, + "grad_norm": 0.23250171542167664, + "learning_rate": 6.704338781330284e-05, + "loss": 1.7977, + "step": 13281 + }, + { + "epoch": 4.076734192756292, + "grad_norm": 0.22073723375797272, + "learning_rate": 6.703871484982407e-05, + "loss": 1.7686, + "step": 13282 + }, + { + "epoch": 4.077041129527317, + "grad_norm": 0.24987035989761353, + "learning_rate": 6.703404171795874e-05, + "loss": 1.736, + "step": 13283 + }, + { + "epoch": 4.077348066298343, + "grad_norm": 0.2697623670101166, + "learning_rate": 6.702936841775301e-05, + "loss": 1.8367, + "step": 13284 + }, + { + "epoch": 4.077655003069368, + "grad_norm": 0.21592749655246735, + "learning_rate": 6.702469494925309e-05, + "loss": 1.7467, + "step": 13285 + }, + { + "epoch": 4.077961939840393, + "grad_norm": 0.2612052261829376, + "learning_rate": 6.702002131250515e-05, + "loss": 1.7689, + "step": 13286 + }, + { + "epoch": 4.078268876611418, + "grad_norm": 0.3004797697067261, + "learning_rate": 6.701534750755539e-05, + "loss": 1.7586, + "step": 13287 + }, + { + "epoch": 4.078575813382443, + "grad_norm": 0.24615366756916046, + "learning_rate": 6.701067353444998e-05, + "loss": 1.7636, + "step": 13288 + }, + { + "epoch": 
4.078882750153468, + "grad_norm": 0.23401159048080444, + "learning_rate": 6.700599939323515e-05, + "loss": 1.8015, + "step": 13289 + }, + { + "epoch": 4.079189686924494, + "grad_norm": 0.24546295404434204, + "learning_rate": 6.700132508395705e-05, + "loss": 1.7606, + "step": 13290 + }, + { + "epoch": 4.079496623695519, + "grad_norm": 0.24664412438869476, + "learning_rate": 6.69966506066619e-05, + "loss": 1.7994, + "step": 13291 + }, + { + "epoch": 4.0798035604665435, + "grad_norm": 0.2780163288116455, + "learning_rate": 6.699197596139587e-05, + "loss": 1.7972, + "step": 13292 + }, + { + "epoch": 4.080110497237569, + "grad_norm": 0.2554188668727875, + "learning_rate": 6.698730114820517e-05, + "loss": 1.7928, + "step": 13293 + }, + { + "epoch": 4.080417434008594, + "grad_norm": 0.2471141666173935, + "learning_rate": 6.698262616713602e-05, + "loss": 1.7948, + "step": 13294 + }, + { + "epoch": 4.0807243707796195, + "grad_norm": 0.2556581199169159, + "learning_rate": 6.697795101823461e-05, + "loss": 1.7942, + "step": 13295 + }, + { + "epoch": 4.081031307550645, + "grad_norm": 0.24462421238422394, + "learning_rate": 6.697327570154712e-05, + "loss": 1.7336, + "step": 13296 + }, + { + "epoch": 4.08133824432167, + "grad_norm": 0.22378689050674438, + "learning_rate": 6.696860021711978e-05, + "loss": 1.7703, + "step": 13297 + }, + { + "epoch": 4.081645181092695, + "grad_norm": 0.23949933052062988, + "learning_rate": 6.69639245649988e-05, + "loss": 1.7651, + "step": 13298 + }, + { + "epoch": 4.08195211786372, + "grad_norm": 0.27751216292381287, + "learning_rate": 6.695924874523035e-05, + "loss": 1.7866, + "step": 13299 + }, + { + "epoch": 4.082259054634745, + "grad_norm": 0.22700226306915283, + "learning_rate": 6.695457275786068e-05, + "loss": 1.79, + "step": 13300 + }, + { + "epoch": 4.082565991405771, + "grad_norm": 0.2138090431690216, + "learning_rate": 6.694989660293598e-05, + "loss": 1.7882, + "step": 13301 + }, + { + "epoch": 4.082872928176796, + "grad_norm": 
0.2963469326496124, + "learning_rate": 6.694522028050246e-05, + "loss": 1.8779, + "step": 13302 + }, + { + "epoch": 4.08317986494782, + "grad_norm": 0.31833669543266296, + "learning_rate": 6.694054379060634e-05, + "loss": 1.7923, + "step": 13303 + }, + { + "epoch": 4.083486801718846, + "grad_norm": 0.27751585841178894, + "learning_rate": 6.693586713329385e-05, + "loss": 1.7557, + "step": 13304 + }, + { + "epoch": 4.083793738489871, + "grad_norm": 0.23790816962718964, + "learning_rate": 6.69311903086112e-05, + "loss": 1.7587, + "step": 13305 + }, + { + "epoch": 4.084100675260896, + "grad_norm": 0.24153777956962585, + "learning_rate": 6.692651331660458e-05, + "loss": 1.7573, + "step": 13306 + }, + { + "epoch": 4.084407612031922, + "grad_norm": 0.26607179641723633, + "learning_rate": 6.692183615732025e-05, + "loss": 1.7823, + "step": 13307 + }, + { + "epoch": 4.084714548802946, + "grad_norm": 0.26670268177986145, + "learning_rate": 6.691715883080442e-05, + "loss": 1.784, + "step": 13308 + }, + { + "epoch": 4.0850214855739715, + "grad_norm": 0.25980666279792786, + "learning_rate": 6.69124813371033e-05, + "loss": 1.797, + "step": 13309 + }, + { + "epoch": 4.085328422344997, + "grad_norm": 0.2805597484111786, + "learning_rate": 6.690780367626314e-05, + "loss": 1.8298, + "step": 13310 + }, + { + "epoch": 4.085635359116022, + "grad_norm": 0.27198413014411926, + "learning_rate": 6.690312584833012e-05, + "loss": 1.8104, + "step": 13311 + }, + { + "epoch": 4.0859422958870475, + "grad_norm": 0.2619116008281708, + "learning_rate": 6.689844785335054e-05, + "loss": 1.771, + "step": 13312 + }, + { + "epoch": 4.086249232658073, + "grad_norm": 0.22647863626480103, + "learning_rate": 6.689376969137057e-05, + "loss": 1.8114, + "step": 13313 + }, + { + "epoch": 4.086556169429097, + "grad_norm": 1.469475507736206, + "learning_rate": 6.68890913624365e-05, + "loss": 1.8796, + "step": 13314 + }, + { + "epoch": 4.086863106200123, + "grad_norm": 0.4577515423297882, + "learning_rate": 
6.68844128665945e-05, + "loss": 1.716, + "step": 13315 + }, + { + "epoch": 4.087170042971148, + "grad_norm": 0.5830543637275696, + "learning_rate": 6.687973420389085e-05, + "loss": 1.7692, + "step": 13316 + }, + { + "epoch": 4.087476979742173, + "grad_norm": 0.4404197037220001, + "learning_rate": 6.687505537437178e-05, + "loss": 1.7909, + "step": 13317 + }, + { + "epoch": 4.087783916513199, + "grad_norm": 0.31379908323287964, + "learning_rate": 6.68703763780835e-05, + "loss": 1.7957, + "step": 13318 + }, + { + "epoch": 4.088090853284223, + "grad_norm": 0.49588730931282043, + "learning_rate": 6.686569721507229e-05, + "loss": 1.7126, + "step": 13319 + }, + { + "epoch": 4.088397790055248, + "grad_norm": 0.3690234124660492, + "learning_rate": 6.686101788538437e-05, + "loss": 1.8233, + "step": 13320 + }, + { + "epoch": 4.088704726826274, + "grad_norm": 0.337310254573822, + "learning_rate": 6.685633838906598e-05, + "loss": 1.6886, + "step": 13321 + }, + { + "epoch": 4.089011663597299, + "grad_norm": 0.5164821147918701, + "learning_rate": 6.685165872616337e-05, + "loss": 1.7967, + "step": 13322 + }, + { + "epoch": 4.089318600368324, + "grad_norm": 0.36501309275627136, + "learning_rate": 6.68469788967228e-05, + "loss": 1.755, + "step": 13323 + }, + { + "epoch": 4.08962553713935, + "grad_norm": 0.35017216205596924, + "learning_rate": 6.684229890079052e-05, + "loss": 1.7595, + "step": 13324 + }, + { + "epoch": 4.089932473910374, + "grad_norm": 0.5622650980949402, + "learning_rate": 6.683761873841277e-05, + "loss": 1.7841, + "step": 13325 + }, + { + "epoch": 4.0902394106813995, + "grad_norm": 0.47010260820388794, + "learning_rate": 6.683293840963578e-05, + "loss": 1.7537, + "step": 13326 + }, + { + "epoch": 4.090546347452425, + "grad_norm": 0.25515374541282654, + "learning_rate": 6.682825791450584e-05, + "loss": 1.7692, + "step": 13327 + }, + { + "epoch": 4.09085328422345, + "grad_norm": 0.5063003897666931, + "learning_rate": 6.682357725306919e-05, + "loss": 1.7454, + "step": 
13328 + }, + { + "epoch": 4.0911602209944755, + "grad_norm": 0.4197622835636139, + "learning_rate": 6.681889642537209e-05, + "loss": 1.7792, + "step": 13329 + }, + { + "epoch": 4.0914671577655, + "grad_norm": 0.24038295447826385, + "learning_rate": 6.68142154314608e-05, + "loss": 1.7631, + "step": 13330 + }, + { + "epoch": 4.091774094536525, + "grad_norm": 0.42108532786369324, + "learning_rate": 6.680953427138159e-05, + "loss": 1.7784, + "step": 13331 + }, + { + "epoch": 4.092081031307551, + "grad_norm": 0.33729633688926697, + "learning_rate": 6.68048529451807e-05, + "loss": 1.8057, + "step": 13332 + }, + { + "epoch": 4.092387968078576, + "grad_norm": 0.31847241520881653, + "learning_rate": 6.68001714529044e-05, + "loss": 1.7375, + "step": 13333 + }, + { + "epoch": 4.092694904849601, + "grad_norm": 0.45276644825935364, + "learning_rate": 6.679548979459896e-05, + "loss": 1.7507, + "step": 13334 + }, + { + "epoch": 4.093001841620626, + "grad_norm": 0.3781665861606598, + "learning_rate": 6.679080797031065e-05, + "loss": 1.7718, + "step": 13335 + }, + { + "epoch": 4.093308778391651, + "grad_norm": 0.25868359208106995, + "learning_rate": 6.678612598008573e-05, + "loss": 1.8105, + "step": 13336 + }, + { + "epoch": 4.093615715162676, + "grad_norm": 0.32834702730178833, + "learning_rate": 6.678144382397048e-05, + "loss": 1.7883, + "step": 13337 + }, + { + "epoch": 4.093922651933702, + "grad_norm": 0.2830568253993988, + "learning_rate": 6.677676150201116e-05, + "loss": 1.7994, + "step": 13338 + }, + { + "epoch": 4.094229588704727, + "grad_norm": 0.219541534781456, + "learning_rate": 6.677207901425405e-05, + "loss": 1.7344, + "step": 13339 + }, + { + "epoch": 4.094536525475752, + "grad_norm": 0.2557326555252075, + "learning_rate": 6.676739636074542e-05, + "loss": 1.7734, + "step": 13340 + }, + { + "epoch": 4.094843462246777, + "grad_norm": 0.2741365432739258, + "learning_rate": 6.676271354153156e-05, + "loss": 1.7912, + "step": 13341 + }, + { + "epoch": 4.095150399017802, + 
"grad_norm": 0.31258970499038696, + "learning_rate": 6.675803055665874e-05, + "loss": 1.7798, + "step": 13342 + }, + { + "epoch": 4.0954573357888275, + "grad_norm": 0.30181947350502014, + "learning_rate": 6.675334740617322e-05, + "loss": 1.7746, + "step": 13343 + }, + { + "epoch": 4.095764272559853, + "grad_norm": 0.3000102937221527, + "learning_rate": 6.674866409012133e-05, + "loss": 1.7842, + "step": 13344 + }, + { + "epoch": 4.096071209330878, + "grad_norm": 0.22871005535125732, + "learning_rate": 6.674398060854931e-05, + "loss": 1.7473, + "step": 13345 + }, + { + "epoch": 4.096378146101903, + "grad_norm": 0.2700810432434082, + "learning_rate": 6.673929696150346e-05, + "loss": 1.7862, + "step": 13346 + }, + { + "epoch": 4.096685082872928, + "grad_norm": 0.27537551522254944, + "learning_rate": 6.673461314903007e-05, + "loss": 1.7843, + "step": 13347 + }, + { + "epoch": 4.096992019643953, + "grad_norm": 0.23700574040412903, + "learning_rate": 6.672992917117542e-05, + "loss": 1.765, + "step": 13348 + }, + { + "epoch": 4.097298956414979, + "grad_norm": 0.23331589996814728, + "learning_rate": 6.672524502798583e-05, + "loss": 1.7894, + "step": 13349 + }, + { + "epoch": 4.097605893186004, + "grad_norm": 0.28591978549957275, + "learning_rate": 6.672056071950753e-05, + "loss": 1.7736, + "step": 13350 + }, + { + "epoch": 4.097912829957028, + "grad_norm": 0.3000452518463135, + "learning_rate": 6.671587624578685e-05, + "loss": 1.7635, + "step": 13351 + }, + { + "epoch": 4.098219766728054, + "grad_norm": 0.21877998113632202, + "learning_rate": 6.67111916068701e-05, + "loss": 1.7225, + "step": 13352 + }, + { + "epoch": 4.098526703499079, + "grad_norm": 0.2598817050457001, + "learning_rate": 6.670650680280358e-05, + "loss": 1.6874, + "step": 13353 + }, + { + "epoch": 4.098833640270104, + "grad_norm": 0.3063203692436218, + "learning_rate": 6.670182183363353e-05, + "loss": 1.7821, + "step": 13354 + }, + { + "epoch": 4.09914057704113, + "grad_norm": 0.2328508347272873, + 
"learning_rate": 6.66971366994063e-05, + "loss": 1.788, + "step": 13355 + }, + { + "epoch": 4.099447513812155, + "grad_norm": 0.33936765789985657, + "learning_rate": 6.669245140016817e-05, + "loss": 1.8159, + "step": 13356 + }, + { + "epoch": 4.0997544505831796, + "grad_norm": 0.27464553713798523, + "learning_rate": 6.668776593596546e-05, + "loss": 1.7371, + "step": 13357 + }, + { + "epoch": 4.100061387354205, + "grad_norm": 0.24255812168121338, + "learning_rate": 6.668308030684447e-05, + "loss": 1.7993, + "step": 13358 + }, + { + "epoch": 4.10036832412523, + "grad_norm": 0.27203628420829773, + "learning_rate": 6.667839451285149e-05, + "loss": 1.8253, + "step": 13359 + }, + { + "epoch": 4.100675260896256, + "grad_norm": 0.2503862679004669, + "learning_rate": 6.667370855403286e-05, + "loss": 1.7927, + "step": 13360 + }, + { + "epoch": 4.100982197667281, + "grad_norm": 0.2616904377937317, + "learning_rate": 6.666902243043486e-05, + "loss": 1.8226, + "step": 13361 + }, + { + "epoch": 4.101289134438305, + "grad_norm": 0.26707521080970764, + "learning_rate": 6.666433614210379e-05, + "loss": 1.8485, + "step": 13362 + }, + { + "epoch": 4.101596071209331, + "grad_norm": 0.2427528202533722, + "learning_rate": 6.6659649689086e-05, + "loss": 1.7387, + "step": 13363 + }, + { + "epoch": 4.101903007980356, + "grad_norm": 0.2319549173116684, + "learning_rate": 6.66549630714278e-05, + "loss": 1.7396, + "step": 13364 + }, + { + "epoch": 4.102209944751381, + "grad_norm": 0.2248002141714096, + "learning_rate": 6.665027628917548e-05, + "loss": 1.7817, + "step": 13365 + }, + { + "epoch": 4.102516881522407, + "grad_norm": 0.21929535269737244, + "learning_rate": 6.664558934237538e-05, + "loss": 1.7478, + "step": 13366 + }, + { + "epoch": 4.102823818293431, + "grad_norm": 0.21144583821296692, + "learning_rate": 6.66409022310738e-05, + "loss": 1.7602, + "step": 13367 + }, + { + "epoch": 4.1031307550644565, + "grad_norm": 0.21984660625457764, + "learning_rate": 6.663621495531707e-05, + 
"loss": 1.7541, + "step": 13368 + }, + { + "epoch": 4.103437691835482, + "grad_norm": 0.2075357735157013, + "learning_rate": 6.663152751515152e-05, + "loss": 1.7362, + "step": 13369 + }, + { + "epoch": 4.103744628606507, + "grad_norm": 0.23316961526870728, + "learning_rate": 6.662683991062347e-05, + "loss": 1.8273, + "step": 13370 + }, + { + "epoch": 4.1040515653775325, + "grad_norm": 0.23142337799072266, + "learning_rate": 6.662215214177922e-05, + "loss": 1.7543, + "step": 13371 + }, + { + "epoch": 4.104358502148558, + "grad_norm": 0.24335260689258575, + "learning_rate": 6.661746420866515e-05, + "loss": 1.8328, + "step": 13372 + }, + { + "epoch": 4.104665438919582, + "grad_norm": 0.2440192997455597, + "learning_rate": 6.661277611132753e-05, + "loss": 1.8114, + "step": 13373 + }, + { + "epoch": 4.104972375690608, + "grad_norm": 0.252808541059494, + "learning_rate": 6.660808784981273e-05, + "loss": 1.8556, + "step": 13374 + }, + { + "epoch": 4.105279312461633, + "grad_norm": 0.24564477801322937, + "learning_rate": 6.660339942416708e-05, + "loss": 1.8231, + "step": 13375 + }, + { + "epoch": 4.105586249232658, + "grad_norm": 0.2371874898672104, + "learning_rate": 6.65987108344369e-05, + "loss": 1.7763, + "step": 13376 + }, + { + "epoch": 4.105893186003684, + "grad_norm": 0.22882802784442902, + "learning_rate": 6.659402208066854e-05, + "loss": 1.7388, + "step": 13377 + }, + { + "epoch": 4.106200122774708, + "grad_norm": 0.24857540428638458, + "learning_rate": 6.658933316290832e-05, + "loss": 1.7735, + "step": 13378 + }, + { + "epoch": 4.106507059545733, + "grad_norm": 0.22574029862880707, + "learning_rate": 6.658464408120257e-05, + "loss": 1.7403, + "step": 13379 + }, + { + "epoch": 4.106813996316759, + "grad_norm": 0.24944272637367249, + "learning_rate": 6.657995483559767e-05, + "loss": 1.7827, + "step": 13380 + }, + { + "epoch": 4.107120933087784, + "grad_norm": 0.27386224269866943, + "learning_rate": 6.657526542613992e-05, + "loss": 1.7673, + "step": 13381 + }, + { 
+ "epoch": 4.107427869858809, + "grad_norm": 0.29222097992897034, + "learning_rate": 6.65705758528757e-05, + "loss": 1.7958, + "step": 13382 + }, + { + "epoch": 4.107734806629834, + "grad_norm": 0.2471150904893875, + "learning_rate": 6.656588611585133e-05, + "loss": 1.7706, + "step": 13383 + }, + { + "epoch": 4.108041743400859, + "grad_norm": 0.289316862821579, + "learning_rate": 6.656119621511317e-05, + "loss": 1.7828, + "step": 13384 + }, + { + "epoch": 4.1083486801718845, + "grad_norm": 0.36710497736930847, + "learning_rate": 6.655650615070756e-05, + "loss": 1.712, + "step": 13385 + }, + { + "epoch": 4.10865561694291, + "grad_norm": 0.2999880611896515, + "learning_rate": 6.655181592268084e-05, + "loss": 1.7711, + "step": 13386 + }, + { + "epoch": 4.108962553713935, + "grad_norm": 0.332011342048645, + "learning_rate": 6.654712553107939e-05, + "loss": 1.907, + "step": 13387 + }, + { + "epoch": 4.1092694904849605, + "grad_norm": 0.43125995993614197, + "learning_rate": 6.654243497594953e-05, + "loss": 1.7819, + "step": 13388 + }, + { + "epoch": 4.109576427255985, + "grad_norm": 0.33719149231910706, + "learning_rate": 6.653774425733765e-05, + "loss": 1.797, + "step": 13389 + }, + { + "epoch": 4.10988336402701, + "grad_norm": 0.23091599345207214, + "learning_rate": 6.653305337529006e-05, + "loss": 1.7384, + "step": 13390 + }, + { + "epoch": 4.110190300798036, + "grad_norm": 0.4283982515335083, + "learning_rate": 6.652836232985317e-05, + "loss": 1.8284, + "step": 13391 + }, + { + "epoch": 4.110497237569061, + "grad_norm": 0.43575870990753174, + "learning_rate": 6.652367112107332e-05, + "loss": 1.7235, + "step": 13392 + }, + { + "epoch": 4.110804174340086, + "grad_norm": 0.246877059340477, + "learning_rate": 6.651897974899685e-05, + "loss": 1.7174, + "step": 13393 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.36063629388809204, + "learning_rate": 6.651428821367015e-05, + "loss": 1.8064, + "step": 13394 + }, + { + "epoch": 4.111418047882136, + "grad_norm": 
0.4454420804977417, + "learning_rate": 6.650959651513957e-05, + "loss": 1.7575, + "step": 13395 + }, + { + "epoch": 4.111724984653161, + "grad_norm": 0.2788856327533722, + "learning_rate": 6.650490465345149e-05, + "loss": 1.7696, + "step": 13396 + }, + { + "epoch": 4.112031921424187, + "grad_norm": 0.40281879901885986, + "learning_rate": 6.650021262865225e-05, + "loss": 1.8368, + "step": 13397 + }, + { + "epoch": 4.112338858195212, + "grad_norm": 0.5151103138923645, + "learning_rate": 6.649552044078825e-05, + "loss": 1.8224, + "step": 13398 + }, + { + "epoch": 4.112645794966237, + "grad_norm": 0.29390639066696167, + "learning_rate": 6.649082808990586e-05, + "loss": 1.7846, + "step": 13399 + }, + { + "epoch": 4.112952731737262, + "grad_norm": 0.3061942458152771, + "learning_rate": 6.648613557605142e-05, + "loss": 1.7954, + "step": 13400 + }, + { + "epoch": 4.113259668508287, + "grad_norm": 0.47628748416900635, + "learning_rate": 6.648144289927132e-05, + "loss": 1.7782, + "step": 13401 + }, + { + "epoch": 4.1135666052793125, + "grad_norm": 0.4299588203430176, + "learning_rate": 6.647675005961197e-05, + "loss": 1.7459, + "step": 13402 + }, + { + "epoch": 4.113873542050338, + "grad_norm": 0.24556589126586914, + "learning_rate": 6.64720570571197e-05, + "loss": 1.753, + "step": 13403 + }, + { + "epoch": 4.114180478821363, + "grad_norm": 0.29620522260665894, + "learning_rate": 6.646736389184092e-05, + "loss": 1.773, + "step": 13404 + }, + { + "epoch": 4.114487415592388, + "grad_norm": 0.37710070610046387, + "learning_rate": 6.646267056382199e-05, + "loss": 1.8389, + "step": 13405 + }, + { + "epoch": 4.114794352363413, + "grad_norm": 0.2562984824180603, + "learning_rate": 6.64579770731093e-05, + "loss": 1.7905, + "step": 13406 + }, + { + "epoch": 4.115101289134438, + "grad_norm": 0.3999946713447571, + "learning_rate": 6.645328341974924e-05, + "loss": 1.7734, + "step": 13407 + }, + { + "epoch": 4.115408225905464, + "grad_norm": 0.36087217926979065, + "learning_rate": 
6.644858960378817e-05, + "loss": 1.801, + "step": 13408 + }, + { + "epoch": 4.115715162676489, + "grad_norm": 0.2520254850387573, + "learning_rate": 6.644389562527251e-05, + "loss": 1.7394, + "step": 13409 + }, + { + "epoch": 4.116022099447513, + "grad_norm": 0.4321835935115814, + "learning_rate": 6.643920148424864e-05, + "loss": 1.8091, + "step": 13410 + }, + { + "epoch": 4.116329036218539, + "grad_norm": 0.40900173783302307, + "learning_rate": 6.643450718076294e-05, + "loss": 1.8198, + "step": 13411 + }, + { + "epoch": 4.116635972989564, + "grad_norm": 0.23693956434726715, + "learning_rate": 6.642981271486182e-05, + "loss": 1.6807, + "step": 13412 + }, + { + "epoch": 4.116942909760589, + "grad_norm": 0.33526891469955444, + "learning_rate": 6.642511808659164e-05, + "loss": 1.8673, + "step": 13413 + }, + { + "epoch": 4.117249846531615, + "grad_norm": 0.4037325382232666, + "learning_rate": 6.642042329599883e-05, + "loss": 1.743, + "step": 13414 + }, + { + "epoch": 4.11755678330264, + "grad_norm": 0.25629740953445435, + "learning_rate": 6.641572834312975e-05, + "loss": 1.6904, + "step": 13415 + }, + { + "epoch": 4.1178637200736645, + "grad_norm": 0.29203253984451294, + "learning_rate": 6.641103322803087e-05, + "loss": 1.7811, + "step": 13416 + }, + { + "epoch": 4.11817065684469, + "grad_norm": 0.423926442861557, + "learning_rate": 6.64063379507485e-05, + "loss": 1.7341, + "step": 13417 + }, + { + "epoch": 4.118477593615715, + "grad_norm": 0.29561251401901245, + "learning_rate": 6.64016425113291e-05, + "loss": 1.7915, + "step": 13418 + }, + { + "epoch": 4.1187845303867405, + "grad_norm": 0.2536832094192505, + "learning_rate": 6.639694690981903e-05, + "loss": 1.7628, + "step": 13419 + }, + { + "epoch": 4.119091467157766, + "grad_norm": 0.2931392192840576, + "learning_rate": 6.639225114626475e-05, + "loss": 1.7877, + "step": 13420 + }, + { + "epoch": 4.11939840392879, + "grad_norm": 0.2219499796628952, + "learning_rate": 6.638755522071263e-05, + "loss": 1.7183, + 
"step": 13421 + }, + { + "epoch": 4.119705340699816, + "grad_norm": 0.2951931953430176, + "learning_rate": 6.638285913320908e-05, + "loss": 1.7983, + "step": 13422 + }, + { + "epoch": 4.120012277470841, + "grad_norm": 0.3495960533618927, + "learning_rate": 6.63781628838005e-05, + "loss": 1.7531, + "step": 13423 + }, + { + "epoch": 4.120319214241866, + "grad_norm": 0.2389262616634369, + "learning_rate": 6.637346647253333e-05, + "loss": 1.7454, + "step": 13424 + }, + { + "epoch": 4.120626151012892, + "grad_norm": 0.28729167580604553, + "learning_rate": 6.636876989945395e-05, + "loss": 1.8105, + "step": 13425 + }, + { + "epoch": 4.120933087783916, + "grad_norm": 0.2620082199573517, + "learning_rate": 6.636407316460882e-05, + "loss": 1.7948, + "step": 13426 + }, + { + "epoch": 4.121240024554941, + "grad_norm": 0.2694189250469208, + "learning_rate": 6.635937626804432e-05, + "loss": 1.809, + "step": 13427 + }, + { + "epoch": 4.121546961325967, + "grad_norm": 0.2660866379737854, + "learning_rate": 6.635467920980687e-05, + "loss": 1.7431, + "step": 13428 + }, + { + "epoch": 4.121853898096992, + "grad_norm": 0.2579907774925232, + "learning_rate": 6.634998198994289e-05, + "loss": 1.7941, + "step": 13429 + }, + { + "epoch": 4.122160834868017, + "grad_norm": 0.28349989652633667, + "learning_rate": 6.634528460849881e-05, + "loss": 1.8142, + "step": 13430 + }, + { + "epoch": 4.122467771639043, + "grad_norm": 0.28716522455215454, + "learning_rate": 6.634058706552104e-05, + "loss": 1.7496, + "step": 13431 + }, + { + "epoch": 4.122774708410067, + "grad_norm": 0.23228077590465546, + "learning_rate": 6.633588936105601e-05, + "loss": 1.7399, + "step": 13432 + }, + { + "epoch": 4.1230816451810925, + "grad_norm": 0.3649841248989105, + "learning_rate": 6.633119149515017e-05, + "loss": 1.7696, + "step": 13433 + }, + { + "epoch": 4.123388581952118, + "grad_norm": 0.2757830321788788, + "learning_rate": 6.632649346784992e-05, + "loss": 1.8329, + "step": 13434 + }, + { + "epoch": 
4.123695518723143, + "grad_norm": 0.28163692355155945, + "learning_rate": 6.632179527920167e-05, + "loss": 1.7761, + "step": 13435 + }, + { + "epoch": 4.1240024554941686, + "grad_norm": 0.3453187048435211, + "learning_rate": 6.631709692925188e-05, + "loss": 1.7843, + "step": 13436 + }, + { + "epoch": 4.124309392265193, + "grad_norm": 0.2792697250843048, + "learning_rate": 6.631239841804698e-05, + "loss": 1.7889, + "step": 13437 + }, + { + "epoch": 4.124616329036218, + "grad_norm": 0.21881693601608276, + "learning_rate": 6.630769974563339e-05, + "loss": 1.8015, + "step": 13438 + }, + { + "epoch": 4.124923265807244, + "grad_norm": 0.4464910328388214, + "learning_rate": 6.630300091205756e-05, + "loss": 1.7851, + "step": 13439 + }, + { + "epoch": 4.125230202578269, + "grad_norm": 0.40191107988357544, + "learning_rate": 6.629830191736591e-05, + "loss": 1.8608, + "step": 13440 + }, + { + "epoch": 4.125537139349294, + "grad_norm": 0.2809060513973236, + "learning_rate": 6.62936027616049e-05, + "loss": 1.7374, + "step": 13441 + }, + { + "epoch": 4.12584407612032, + "grad_norm": 0.24980643391609192, + "learning_rate": 6.628890344482095e-05, + "loss": 1.8152, + "step": 13442 + }, + { + "epoch": 4.126151012891344, + "grad_norm": 0.24538342654705048, + "learning_rate": 6.62842039670605e-05, + "loss": 1.7687, + "step": 13443 + }, + { + "epoch": 4.1264579496623695, + "grad_norm": 0.24684634804725647, + "learning_rate": 6.627950432837002e-05, + "loss": 1.787, + "step": 13444 + }, + { + "epoch": 4.126764886433395, + "grad_norm": 0.22724607586860657, + "learning_rate": 6.627480452879593e-05, + "loss": 1.7871, + "step": 13445 + }, + { + "epoch": 4.12707182320442, + "grad_norm": 0.24724406003952026, + "learning_rate": 6.627010456838469e-05, + "loss": 1.7524, + "step": 13446 + }, + { + "epoch": 4.1273787599754455, + "grad_norm": 0.24219536781311035, + "learning_rate": 6.626540444718274e-05, + "loss": 1.7754, + "step": 13447 + }, + { + "epoch": 4.12768569674647, + "grad_norm": 
0.24857915937900543, + "learning_rate": 6.626070416523652e-05, + "loss": 1.7839, + "step": 13448 + }, + { + "epoch": 4.127992633517495, + "grad_norm": 0.2639105021953583, + "learning_rate": 6.625600372259248e-05, + "loss": 1.7546, + "step": 13449 + }, + { + "epoch": 4.128299570288521, + "grad_norm": 0.23598137497901917, + "learning_rate": 6.62513031192971e-05, + "loss": 1.7957, + "step": 13450 + }, + { + "epoch": 4.128606507059546, + "grad_norm": 0.3038909137248993, + "learning_rate": 6.624660235539682e-05, + "loss": 1.8117, + "step": 13451 + }, + { + "epoch": 4.128913443830571, + "grad_norm": 0.27671241760253906, + "learning_rate": 6.624190143093809e-05, + "loss": 1.729, + "step": 13452 + }, + { + "epoch": 4.129220380601596, + "grad_norm": 0.24638360738754272, + "learning_rate": 6.623720034596735e-05, + "loss": 1.7414, + "step": 13453 + }, + { + "epoch": 4.129527317372621, + "grad_norm": 0.24073924124240875, + "learning_rate": 6.623249910053111e-05, + "loss": 1.8046, + "step": 13454 + }, + { + "epoch": 4.129834254143646, + "grad_norm": 0.29734376072883606, + "learning_rate": 6.622779769467578e-05, + "loss": 1.8336, + "step": 13455 + }, + { + "epoch": 4.130141190914672, + "grad_norm": 0.23182810842990875, + "learning_rate": 6.622309612844785e-05, + "loss": 1.7742, + "step": 13456 + }, + { + "epoch": 4.130448127685697, + "grad_norm": 0.2179390788078308, + "learning_rate": 6.621839440189378e-05, + "loss": 1.7656, + "step": 13457 + }, + { + "epoch": 4.1307550644567215, + "grad_norm": 0.21389013528823853, + "learning_rate": 6.621369251506002e-05, + "loss": 1.7504, + "step": 13458 + }, + { + "epoch": 4.131062001227747, + "grad_norm": 0.22306203842163086, + "learning_rate": 6.620899046799305e-05, + "loss": 1.7573, + "step": 13459 + }, + { + "epoch": 4.131368937998772, + "grad_norm": 0.2699708938598633, + "learning_rate": 6.620428826073934e-05, + "loss": 1.7419, + "step": 13460 + }, + { + "epoch": 4.1316758747697975, + "grad_norm": 0.34087565541267395, + "learning_rate": 
6.619958589334534e-05, + "loss": 1.7545, + "step": 13461 + }, + { + "epoch": 4.131982811540823, + "grad_norm": 0.2934977412223816, + "learning_rate": 6.619488336585755e-05, + "loss": 1.7611, + "step": 13462 + }, + { + "epoch": 4.132289748311848, + "grad_norm": 0.22545567154884338, + "learning_rate": 6.619018067832243e-05, + "loss": 1.7562, + "step": 13463 + }, + { + "epoch": 4.132596685082873, + "grad_norm": 0.23334743082523346, + "learning_rate": 6.618547783078647e-05, + "loss": 1.7784, + "step": 13464 + }, + { + "epoch": 4.132903621853898, + "grad_norm": 0.22466403245925903, + "learning_rate": 6.618077482329612e-05, + "loss": 1.7277, + "step": 13465 + }, + { + "epoch": 4.133210558624923, + "grad_norm": 0.23504197597503662, + "learning_rate": 6.617607165589785e-05, + "loss": 1.7983, + "step": 13466 + }, + { + "epoch": 4.133517495395949, + "grad_norm": 0.2500833570957184, + "learning_rate": 6.617136832863819e-05, + "loss": 1.7826, + "step": 13467 + }, + { + "epoch": 4.133824432166974, + "grad_norm": 0.22398658096790314, + "learning_rate": 6.616666484156357e-05, + "loss": 1.7281, + "step": 13468 + }, + { + "epoch": 4.134131368937998, + "grad_norm": 0.2537873089313507, + "learning_rate": 6.616196119472052e-05, + "loss": 1.7598, + "step": 13469 + }, + { + "epoch": 4.134438305709024, + "grad_norm": 0.26881173253059387, + "learning_rate": 6.615725738815546e-05, + "loss": 1.8161, + "step": 13470 + }, + { + "epoch": 4.134745242480049, + "grad_norm": 0.3311346471309662, + "learning_rate": 6.615255342191492e-05, + "loss": 1.7954, + "step": 13471 + }, + { + "epoch": 4.135052179251074, + "grad_norm": 0.2562953233718872, + "learning_rate": 6.614784929604539e-05, + "loss": 1.7284, + "step": 13472 + }, + { + "epoch": 4.1353591160221, + "grad_norm": 0.2563154101371765, + "learning_rate": 6.614314501059334e-05, + "loss": 1.7995, + "step": 13473 + }, + { + "epoch": 4.135666052793125, + "grad_norm": 0.24861161410808563, + "learning_rate": 6.613844056560527e-05, + "loss": 1.7589, + 
"step": 13474 + }, + { + "epoch": 4.1359729895641495, + "grad_norm": 0.23815487325191498, + "learning_rate": 6.613373596112769e-05, + "loss": 1.6906, + "step": 13475 + }, + { + "epoch": 4.136279926335175, + "grad_norm": 0.25394049286842346, + "learning_rate": 6.612903119720705e-05, + "loss": 1.781, + "step": 13476 + }, + { + "epoch": 4.1365868631062, + "grad_norm": 0.24501466751098633, + "learning_rate": 6.612432627388988e-05, + "loss": 1.797, + "step": 13477 + }, + { + "epoch": 4.1368937998772255, + "grad_norm": 0.24909707903862, + "learning_rate": 6.611962119122267e-05, + "loss": 1.7643, + "step": 13478 + }, + { + "epoch": 4.137200736648251, + "grad_norm": 0.24954476952552795, + "learning_rate": 6.611491594925192e-05, + "loss": 1.8219, + "step": 13479 + }, + { + "epoch": 4.137507673419275, + "grad_norm": 0.30572372674942017, + "learning_rate": 6.611021054802411e-05, + "loss": 1.8039, + "step": 13480 + }, + { + "epoch": 4.137814610190301, + "grad_norm": 0.27466365694999695, + "learning_rate": 6.610550498758577e-05, + "loss": 1.6945, + "step": 13481 + }, + { + "epoch": 4.138121546961326, + "grad_norm": 0.2614271640777588, + "learning_rate": 6.610079926798339e-05, + "loss": 1.8648, + "step": 13482 + }, + { + "epoch": 4.138428483732351, + "grad_norm": 0.23645827174186707, + "learning_rate": 6.609609338926346e-05, + "loss": 1.7424, + "step": 13483 + }, + { + "epoch": 4.138735420503377, + "grad_norm": 0.24473626911640167, + "learning_rate": 6.609138735147253e-05, + "loss": 1.8036, + "step": 13484 + }, + { + "epoch": 4.139042357274401, + "grad_norm": 0.2472417950630188, + "learning_rate": 6.608668115465706e-05, + "loss": 1.794, + "step": 13485 + }, + { + "epoch": 4.139349294045426, + "grad_norm": 0.25330284237861633, + "learning_rate": 6.608197479886358e-05, + "loss": 1.8052, + "step": 13486 + }, + { + "epoch": 4.139656230816452, + "grad_norm": 0.24279309809207916, + "learning_rate": 6.60772682841386e-05, + "loss": 1.7375, + "step": 13487 + }, + { + "epoch": 
4.139963167587477, + "grad_norm": 0.22319461405277252, + "learning_rate": 6.607256161052862e-05, + "loss": 1.7696, + "step": 13488 + }, + { + "epoch": 4.140270104358502, + "grad_norm": 0.25261563062667847, + "learning_rate": 6.606785477808017e-05, + "loss": 1.7646, + "step": 13489 + }, + { + "epoch": 4.140577041129528, + "grad_norm": 0.3127744793891907, + "learning_rate": 6.606314778683977e-05, + "loss": 1.7899, + "step": 13490 + }, + { + "epoch": 4.140883977900552, + "grad_norm": 0.3550816774368286, + "learning_rate": 6.605844063685392e-05, + "loss": 1.7971, + "step": 13491 + }, + { + "epoch": 4.1411909146715775, + "grad_norm": 0.20977813005447388, + "learning_rate": 6.605373332816916e-05, + "loss": 1.7416, + "step": 13492 + }, + { + "epoch": 4.141497851442603, + "grad_norm": 0.26593849062919617, + "learning_rate": 6.6049025860832e-05, + "loss": 1.7586, + "step": 13493 + }, + { + "epoch": 4.141804788213628, + "grad_norm": 0.2452937364578247, + "learning_rate": 6.604431823488893e-05, + "loss": 1.757, + "step": 13494 + }, + { + "epoch": 4.1421117249846535, + "grad_norm": 0.21029168367385864, + "learning_rate": 6.603961045038652e-05, + "loss": 1.7665, + "step": 13495 + }, + { + "epoch": 4.142418661755678, + "grad_norm": 0.2396312952041626, + "learning_rate": 6.603490250737128e-05, + "loss": 1.7609, + "step": 13496 + }, + { + "epoch": 4.142725598526703, + "grad_norm": 0.23266808688640594, + "learning_rate": 6.603019440588975e-05, + "loss": 1.7893, + "step": 13497 + }, + { + "epoch": 4.143032535297729, + "grad_norm": 0.25235217809677124, + "learning_rate": 6.602548614598842e-05, + "loss": 1.7465, + "step": 13498 + }, + { + "epoch": 4.143339472068754, + "grad_norm": 0.22944024205207825, + "learning_rate": 6.602077772771386e-05, + "loss": 1.7052, + "step": 13499 + }, + { + "epoch": 4.143646408839779, + "grad_norm": 0.2116660475730896, + "learning_rate": 6.601606915111257e-05, + "loss": 1.7042, + "step": 13500 + }, + { + "epoch": 4.143953345610804, + "grad_norm": 
0.21777184307575226, + "learning_rate": 6.601136041623111e-05, + "loss": 1.7938, + "step": 13501 + }, + { + "epoch": 4.144260282381829, + "grad_norm": 0.23663075268268585, + "learning_rate": 6.600665152311601e-05, + "loss": 1.7475, + "step": 13502 + }, + { + "epoch": 4.144567219152854, + "grad_norm": 0.20644642412662506, + "learning_rate": 6.600194247181377e-05, + "loss": 1.7992, + "step": 13503 + }, + { + "epoch": 4.14487415592388, + "grad_norm": 0.21479010581970215, + "learning_rate": 6.599723326237098e-05, + "loss": 1.7877, + "step": 13504 + }, + { + "epoch": 4.145181092694905, + "grad_norm": 0.2266562283039093, + "learning_rate": 6.599252389483413e-05, + "loss": 1.8097, + "step": 13505 + }, + { + "epoch": 4.14548802946593, + "grad_norm": 0.2053738683462143, + "learning_rate": 6.59878143692498e-05, + "loss": 1.6878, + "step": 13506 + }, + { + "epoch": 4.145794966236955, + "grad_norm": 0.19583995640277863, + "learning_rate": 6.598310468566452e-05, + "loss": 1.7547, + "step": 13507 + }, + { + "epoch": 4.14610190300798, + "grad_norm": 0.23421542346477509, + "learning_rate": 6.597839484412484e-05, + "loss": 1.7926, + "step": 13508 + }, + { + "epoch": 4.1464088397790055, + "grad_norm": 0.24575260281562805, + "learning_rate": 6.597368484467728e-05, + "loss": 1.7311, + "step": 13509 + }, + { + "epoch": 4.146715776550031, + "grad_norm": 0.27519574761390686, + "learning_rate": 6.596897468736842e-05, + "loss": 1.7858, + "step": 13510 + }, + { + "epoch": 4.147022713321056, + "grad_norm": 0.26434022188186646, + "learning_rate": 6.596426437224477e-05, + "loss": 1.7387, + "step": 13511 + }, + { + "epoch": 4.147329650092081, + "grad_norm": 0.2192772775888443, + "learning_rate": 6.595955389935291e-05, + "loss": 1.7565, + "step": 13512 + }, + { + "epoch": 4.147636586863106, + "grad_norm": 0.21047350764274597, + "learning_rate": 6.595484326873938e-05, + "loss": 1.7234, + "step": 13513 + }, + { + "epoch": 4.147943523634131, + "grad_norm": 0.22838951647281647, + "learning_rate": 
6.595013248045075e-05, + "loss": 1.8205, + "step": 13514 + }, + { + "epoch": 4.148250460405157, + "grad_norm": 0.3467923402786255, + "learning_rate": 6.594542153453356e-05, + "loss": 1.7973, + "step": 13515 + }, + { + "epoch": 4.148557397176182, + "grad_norm": 0.241237074136734, + "learning_rate": 6.594071043103438e-05, + "loss": 1.7764, + "step": 13516 + }, + { + "epoch": 4.148864333947207, + "grad_norm": 0.22543516755104065, + "learning_rate": 6.593599916999973e-05, + "loss": 1.7528, + "step": 13517 + }, + { + "epoch": 4.149171270718232, + "grad_norm": 0.24590276181697845, + "learning_rate": 6.593128775147623e-05, + "loss": 1.7422, + "step": 13518 + }, + { + "epoch": 4.149478207489257, + "grad_norm": 0.2434391975402832, + "learning_rate": 6.592657617551038e-05, + "loss": 1.7523, + "step": 13519 + }, + { + "epoch": 4.149785144260282, + "grad_norm": 0.23169009387493134, + "learning_rate": 6.592186444214877e-05, + "loss": 1.8158, + "step": 13520 + }, + { + "epoch": 4.150092081031308, + "grad_norm": 0.2217840999364853, + "learning_rate": 6.591715255143798e-05, + "loss": 1.7487, + "step": 13521 + }, + { + "epoch": 4.150399017802333, + "grad_norm": 0.2405092418193817, + "learning_rate": 6.591244050342454e-05, + "loss": 1.7726, + "step": 13522 + }, + { + "epoch": 4.150705954573358, + "grad_norm": 0.29432612657546997, + "learning_rate": 6.590772829815504e-05, + "loss": 1.7841, + "step": 13523 + }, + { + "epoch": 4.151012891344383, + "grad_norm": 0.2708737850189209, + "learning_rate": 6.590301593567605e-05, + "loss": 1.8551, + "step": 13524 + }, + { + "epoch": 4.151319828115408, + "grad_norm": 0.26643216609954834, + "learning_rate": 6.589830341603413e-05, + "loss": 1.7697, + "step": 13525 + }, + { + "epoch": 4.151626764886434, + "grad_norm": 0.3672652840614319, + "learning_rate": 6.589359073927587e-05, + "loss": 1.8292, + "step": 13526 + }, + { + "epoch": 4.151933701657459, + "grad_norm": 0.2413325160741806, + "learning_rate": 6.588887790544782e-05, + "loss": 1.7514, + 
"step": 13527 + }, + { + "epoch": 4.152240638428483, + "grad_norm": 0.3248155117034912, + "learning_rate": 6.588416491459657e-05, + "loss": 1.7437, + "step": 13528 + }, + { + "epoch": 4.152547575199509, + "grad_norm": 0.40951836109161377, + "learning_rate": 6.587945176676869e-05, + "loss": 1.7779, + "step": 13529 + }, + { + "epoch": 4.152854511970534, + "grad_norm": 0.23874351382255554, + "learning_rate": 6.587473846201075e-05, + "loss": 1.8343, + "step": 13530 + }, + { + "epoch": 4.153161448741559, + "grad_norm": 0.4535207450389862, + "learning_rate": 6.587002500036936e-05, + "loss": 1.8301, + "step": 13531 + }, + { + "epoch": 4.153468385512585, + "grad_norm": 0.458003968000412, + "learning_rate": 6.586531138189108e-05, + "loss": 1.7053, + "step": 13532 + }, + { + "epoch": 4.153775322283609, + "grad_norm": 0.24350887537002563, + "learning_rate": 6.586059760662248e-05, + "loss": 1.7642, + "step": 13533 + }, + { + "epoch": 4.1540822590546345, + "grad_norm": 0.46951553225517273, + "learning_rate": 6.585588367461017e-05, + "loss": 1.7345, + "step": 13534 + }, + { + "epoch": 4.15438919582566, + "grad_norm": 0.5524527430534363, + "learning_rate": 6.585116958590072e-05, + "loss": 1.7677, + "step": 13535 + }, + { + "epoch": 4.154696132596685, + "grad_norm": 0.2887112498283386, + "learning_rate": 6.584645534054072e-05, + "loss": 1.7704, + "step": 13536 + }, + { + "epoch": 4.1550030693677105, + "grad_norm": 0.36243724822998047, + "learning_rate": 6.584174093857675e-05, + "loss": 1.8133, + "step": 13537 + }, + { + "epoch": 4.155310006138736, + "grad_norm": 0.3869550824165344, + "learning_rate": 6.583702638005543e-05, + "loss": 1.7253, + "step": 13538 + }, + { + "epoch": 4.15561694290976, + "grad_norm": 0.25859662890434265, + "learning_rate": 6.583231166502333e-05, + "loss": 1.7683, + "step": 13539 + }, + { + "epoch": 4.155923879680786, + "grad_norm": 0.3011144995689392, + "learning_rate": 6.582759679352704e-05, + "loss": 1.7139, + "step": 13540 + }, + { + "epoch": 
4.156230816451811, + "grad_norm": 0.38033372163772583, + "learning_rate": 6.582288176561316e-05, + "loss": 1.8182, + "step": 13541 + }, + { + "epoch": 4.156537753222836, + "grad_norm": 0.2224060595035553, + "learning_rate": 6.581816658132829e-05, + "loss": 1.7527, + "step": 13542 + }, + { + "epoch": 4.156844689993862, + "grad_norm": 0.4147234261035919, + "learning_rate": 6.581345124071903e-05, + "loss": 1.7339, + "step": 13543 + }, + { + "epoch": 4.157151626764886, + "grad_norm": 0.45334625244140625, + "learning_rate": 6.580873574383198e-05, + "loss": 1.8166, + "step": 13544 + }, + { + "epoch": 4.157458563535911, + "grad_norm": 0.3050530254840851, + "learning_rate": 6.580402009071372e-05, + "loss": 1.7967, + "step": 13545 + }, + { + "epoch": 4.157765500306937, + "grad_norm": 0.25901293754577637, + "learning_rate": 6.579930428141088e-05, + "loss": 1.7806, + "step": 13546 + }, + { + "epoch": 4.158072437077962, + "grad_norm": 0.3142934739589691, + "learning_rate": 6.579458831597006e-05, + "loss": 1.7724, + "step": 13547 + }, + { + "epoch": 4.158379373848987, + "grad_norm": 0.23943179845809937, + "learning_rate": 6.578987219443787e-05, + "loss": 1.7515, + "step": 13548 + }, + { + "epoch": 4.158686310620013, + "grad_norm": 0.2838635742664337, + "learning_rate": 6.578515591686089e-05, + "loss": 1.7707, + "step": 13549 + }, + { + "epoch": 4.158993247391037, + "grad_norm": 0.3064457178115845, + "learning_rate": 6.578043948328575e-05, + "loss": 1.7839, + "step": 13550 + }, + { + "epoch": 4.1593001841620625, + "grad_norm": 0.2311718463897705, + "learning_rate": 6.577572289375907e-05, + "loss": 1.8298, + "step": 13551 + }, + { + "epoch": 4.159607120933088, + "grad_norm": 0.35726481676101685, + "learning_rate": 6.577100614832743e-05, + "loss": 1.811, + "step": 13552 + }, + { + "epoch": 4.159914057704113, + "grad_norm": 0.3176140785217285, + "learning_rate": 6.576628924703749e-05, + "loss": 1.732, + "step": 13553 + }, + { + "epoch": 4.1602209944751385, + "grad_norm": 
0.2325647473335266, + "learning_rate": 6.576157218993582e-05, + "loss": 1.827, + "step": 13554 + }, + { + "epoch": 4.160527931246163, + "grad_norm": 0.32260453701019287, + "learning_rate": 6.575685497706905e-05, + "loss": 1.8218, + "step": 13555 + }, + { + "epoch": 4.160834868017188, + "grad_norm": 0.2638537287712097, + "learning_rate": 6.575213760848382e-05, + "loss": 1.7091, + "step": 13556 + }, + { + "epoch": 4.161141804788214, + "grad_norm": 0.2501799762248993, + "learning_rate": 6.574742008422671e-05, + "loss": 1.7707, + "step": 13557 + }, + { + "epoch": 4.161448741559239, + "grad_norm": 0.3212645649909973, + "learning_rate": 6.574270240434439e-05, + "loss": 1.7541, + "step": 13558 + }, + { + "epoch": 4.161755678330264, + "grad_norm": 0.25915586948394775, + "learning_rate": 6.573798456888345e-05, + "loss": 1.7597, + "step": 13559 + }, + { + "epoch": 4.162062615101289, + "grad_norm": 0.2538192868232727, + "learning_rate": 6.573326657789052e-05, + "loss": 1.8507, + "step": 13560 + }, + { + "epoch": 4.162369551872314, + "grad_norm": 0.2542131543159485, + "learning_rate": 6.572854843141223e-05, + "loss": 1.782, + "step": 13561 + }, + { + "epoch": 4.162676488643339, + "grad_norm": 0.26163414120674133, + "learning_rate": 6.572383012949521e-05, + "loss": 1.8482, + "step": 13562 + }, + { + "epoch": 4.162983425414365, + "grad_norm": 0.2566238343715668, + "learning_rate": 6.571911167218608e-05, + "loss": 1.7284, + "step": 13563 + }, + { + "epoch": 4.16329036218539, + "grad_norm": 0.28413113951683044, + "learning_rate": 6.571439305953147e-05, + "loss": 1.7473, + "step": 13564 + }, + { + "epoch": 4.163597298956415, + "grad_norm": 0.20399242639541626, + "learning_rate": 6.570967429157802e-05, + "loss": 1.6942, + "step": 13565 + }, + { + "epoch": 4.16390423572744, + "grad_norm": 0.256104439496994, + "learning_rate": 6.570495536837235e-05, + "loss": 1.7346, + "step": 13566 + }, + { + "epoch": 4.164211172498465, + "grad_norm": 0.350909560918808, + "learning_rate": 
6.570023628996112e-05, + "loss": 1.8284, + "step": 13567 + }, + { + "epoch": 4.1645181092694905, + "grad_norm": 0.23500367999076843, + "learning_rate": 6.569551705639096e-05, + "loss": 1.7504, + "step": 13568 + }, + { + "epoch": 4.164825046040516, + "grad_norm": 0.26683783531188965, + "learning_rate": 6.569079766770849e-05, + "loss": 1.7293, + "step": 13569 + }, + { + "epoch": 4.165131982811541, + "grad_norm": 0.3145855963230133, + "learning_rate": 6.568607812396037e-05, + "loss": 1.8171, + "step": 13570 + }, + { + "epoch": 4.165438919582566, + "grad_norm": 0.2354860156774521, + "learning_rate": 6.568135842519324e-05, + "loss": 1.7555, + "step": 13571 + }, + { + "epoch": 4.165745856353591, + "grad_norm": 0.2893243730068207, + "learning_rate": 6.56766385714537e-05, + "loss": 1.7636, + "step": 13572 + }, + { + "epoch": 4.166052793124616, + "grad_norm": 0.20707663893699646, + "learning_rate": 6.567191856278846e-05, + "loss": 1.7239, + "step": 13573 + }, + { + "epoch": 4.166359729895642, + "grad_norm": 0.34200331568717957, + "learning_rate": 6.566719839924412e-05, + "loss": 1.7848, + "step": 13574 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.23326615989208221, + "learning_rate": 6.566247808086734e-05, + "loss": 1.7447, + "step": 13575 + }, + { + "epoch": 4.166973603437691, + "grad_norm": 0.22375629842281342, + "learning_rate": 6.565775760770479e-05, + "loss": 1.7429, + "step": 13576 + }, + { + "epoch": 4.167280540208717, + "grad_norm": 0.2412862777709961, + "learning_rate": 6.565303697980308e-05, + "loss": 1.7671, + "step": 13577 + }, + { + "epoch": 4.167587476979742, + "grad_norm": 0.2482215315103531, + "learning_rate": 6.56483161972089e-05, + "loss": 1.812, + "step": 13578 + }, + { + "epoch": 4.167894413750767, + "grad_norm": 0.2252974659204483, + "learning_rate": 6.564359525996889e-05, + "loss": 1.8173, + "step": 13579 + }, + { + "epoch": 4.168201350521793, + "grad_norm": 0.23497292399406433, + "learning_rate": 6.563887416812969e-05, + "loss": 1.7945, + 
"step": 13580 + }, + { + "epoch": 4.168508287292818, + "grad_norm": 0.24911245703697205, + "learning_rate": 6.563415292173796e-05, + "loss": 1.7516, + "step": 13581 + }, + { + "epoch": 4.1688152240638425, + "grad_norm": 0.20920930802822113, + "learning_rate": 6.562943152084039e-05, + "loss": 1.765, + "step": 13582 + }, + { + "epoch": 4.169122160834868, + "grad_norm": 0.26001816987991333, + "learning_rate": 6.562470996548361e-05, + "loss": 1.7504, + "step": 13583 + }, + { + "epoch": 4.169429097605893, + "grad_norm": 0.2504529058933258, + "learning_rate": 6.561998825571429e-05, + "loss": 1.7689, + "step": 13584 + }, + { + "epoch": 4.1697360343769185, + "grad_norm": 0.2210187464952469, + "learning_rate": 6.561526639157908e-05, + "loss": 1.752, + "step": 13585 + }, + { + "epoch": 4.170042971147944, + "grad_norm": 0.26323240995407104, + "learning_rate": 6.561054437312467e-05, + "loss": 1.8104, + "step": 13586 + }, + { + "epoch": 4.170349907918968, + "grad_norm": 0.20436744391918182, + "learning_rate": 6.560582220039771e-05, + "loss": 1.7281, + "step": 13587 + }, + { + "epoch": 4.170656844689994, + "grad_norm": 0.2053878903388977, + "learning_rate": 6.560109987344487e-05, + "loss": 1.7192, + "step": 13588 + }, + { + "epoch": 4.170963781461019, + "grad_norm": 0.2416568547487259, + "learning_rate": 6.559637739231281e-05, + "loss": 1.7679, + "step": 13589 + }, + { + "epoch": 4.171270718232044, + "grad_norm": 0.23847989737987518, + "learning_rate": 6.55916547570482e-05, + "loss": 1.7182, + "step": 13590 + }, + { + "epoch": 4.17157765500307, + "grad_norm": 0.2057785540819168, + "learning_rate": 6.558693196769772e-05, + "loss": 1.816, + "step": 13591 + }, + { + "epoch": 4.171884591774095, + "grad_norm": 0.2270805537700653, + "learning_rate": 6.558220902430804e-05, + "loss": 1.7091, + "step": 13592 + }, + { + "epoch": 4.172191528545119, + "grad_norm": 0.22143644094467163, + "learning_rate": 6.557748592692585e-05, + "loss": 1.7446, + "step": 13593 + }, + { + "epoch": 
4.172498465316145, + "grad_norm": 0.2032770961523056, + "learning_rate": 6.557276267559781e-05, + "loss": 1.7501, + "step": 13594 + }, + { + "epoch": 4.17280540208717, + "grad_norm": 0.20851244032382965, + "learning_rate": 6.55680392703706e-05, + "loss": 1.8283, + "step": 13595 + }, + { + "epoch": 4.173112338858195, + "grad_norm": 0.2603934109210968, + "learning_rate": 6.55633157112909e-05, + "loss": 1.8523, + "step": 13596 + }, + { + "epoch": 4.173419275629221, + "grad_norm": 0.2232515811920166, + "learning_rate": 6.55585919984054e-05, + "loss": 1.7803, + "step": 13597 + }, + { + "epoch": 4.173726212400245, + "grad_norm": 0.2541115880012512, + "learning_rate": 6.555386813176075e-05, + "loss": 1.7407, + "step": 13598 + }, + { + "epoch": 4.1740331491712706, + "grad_norm": 0.3044603765010834, + "learning_rate": 6.55491441114037e-05, + "loss": 1.8257, + "step": 13599 + }, + { + "epoch": 4.174340085942296, + "grad_norm": 0.29227301478385925, + "learning_rate": 6.554441993738086e-05, + "loss": 1.7998, + "step": 13600 + }, + { + "epoch": 4.174647022713321, + "grad_norm": 0.25166594982147217, + "learning_rate": 6.553969560973896e-05, + "loss": 1.8258, + "step": 13601 + }, + { + "epoch": 4.1749539594843466, + "grad_norm": 0.22973991930484772, + "learning_rate": 6.55349711285247e-05, + "loss": 1.7871, + "step": 13602 + }, + { + "epoch": 4.175260896255371, + "grad_norm": 0.2615009844303131, + "learning_rate": 6.553024649378473e-05, + "loss": 1.7572, + "step": 13603 + }, + { + "epoch": 4.175567833026396, + "grad_norm": 0.24145473539829254, + "learning_rate": 6.552552170556576e-05, + "loss": 1.7546, + "step": 13604 + }, + { + "epoch": 4.175874769797422, + "grad_norm": 0.21989156305789948, + "learning_rate": 6.55207967639145e-05, + "loss": 1.6939, + "step": 13605 + }, + { + "epoch": 4.176181706568447, + "grad_norm": 0.206025168299675, + "learning_rate": 6.551607166887761e-05, + "loss": 1.7531, + "step": 13606 + }, + { + "epoch": 4.176488643339472, + "grad_norm": 
0.2175903469324112, + "learning_rate": 6.551134642050181e-05, + "loss": 1.7631, + "step": 13607 + }, + { + "epoch": 4.176795580110497, + "grad_norm": 0.23259282112121582, + "learning_rate": 6.550662101883379e-05, + "loss": 1.7773, + "step": 13608 + }, + { + "epoch": 4.177102516881522, + "grad_norm": 0.23955227434635162, + "learning_rate": 6.550189546392025e-05, + "loss": 1.7321, + "step": 13609 + }, + { + "epoch": 4.1774094536525475, + "grad_norm": 0.23614998161792755, + "learning_rate": 6.549716975580792e-05, + "loss": 1.7855, + "step": 13610 + }, + { + "epoch": 4.177716390423573, + "grad_norm": 0.2274426817893982, + "learning_rate": 6.549244389454345e-05, + "loss": 1.7778, + "step": 13611 + }, + { + "epoch": 4.178023327194598, + "grad_norm": 0.2204308807849884, + "learning_rate": 6.548771788017358e-05, + "loss": 1.7175, + "step": 13612 + }, + { + "epoch": 4.1783302639656235, + "grad_norm": 0.2283930778503418, + "learning_rate": 6.548299171274501e-05, + "loss": 1.8081, + "step": 13613 + }, + { + "epoch": 4.178637200736648, + "grad_norm": 0.25433486700057983, + "learning_rate": 6.547826539230442e-05, + "loss": 1.8009, + "step": 13614 + }, + { + "epoch": 4.178944137507673, + "grad_norm": 0.24452579021453857, + "learning_rate": 6.547353891889856e-05, + "loss": 1.7244, + "step": 13615 + }, + { + "epoch": 4.179251074278699, + "grad_norm": 0.20611275732517242, + "learning_rate": 6.546881229257411e-05, + "loss": 1.7566, + "step": 13616 + }, + { + "epoch": 4.179558011049724, + "grad_norm": 0.24557232856750488, + "learning_rate": 6.546408551337779e-05, + "loss": 1.7638, + "step": 13617 + }, + { + "epoch": 4.179864947820749, + "grad_norm": 0.2158801257610321, + "learning_rate": 6.545935858135631e-05, + "loss": 1.7659, + "step": 13618 + }, + { + "epoch": 4.180171884591774, + "grad_norm": 0.23800688982009888, + "learning_rate": 6.54546314965564e-05, + "loss": 1.7468, + "step": 13619 + }, + { + "epoch": 4.180478821362799, + "grad_norm": 0.2504122853279114, + "learning_rate": 
6.544990425902476e-05, + "loss": 1.7682, + "step": 13620 + }, + { + "epoch": 4.180785758133824, + "grad_norm": 0.21556814014911652, + "learning_rate": 6.54451768688081e-05, + "loss": 1.772, + "step": 13621 + }, + { + "epoch": 4.18109269490485, + "grad_norm": 0.23404552042484283, + "learning_rate": 6.544044932595315e-05, + "loss": 1.7844, + "step": 13622 + }, + { + "epoch": 4.181399631675875, + "grad_norm": 0.22129055857658386, + "learning_rate": 6.543572163050664e-05, + "loss": 1.7725, + "step": 13623 + }, + { + "epoch": 4.1817065684469, + "grad_norm": 0.2533521354198456, + "learning_rate": 6.543099378251528e-05, + "loss": 1.7908, + "step": 13624 + }, + { + "epoch": 4.182013505217925, + "grad_norm": 0.2905815541744232, + "learning_rate": 6.542626578202579e-05, + "loss": 1.7913, + "step": 13625 + }, + { + "epoch": 4.18232044198895, + "grad_norm": 0.3330783247947693, + "learning_rate": 6.54215376290849e-05, + "loss": 1.8374, + "step": 13626 + }, + { + "epoch": 4.1826273787599755, + "grad_norm": 0.29268717765808105, + "learning_rate": 6.541680932373933e-05, + "loss": 1.8714, + "step": 13627 + }, + { + "epoch": 4.182934315531001, + "grad_norm": 0.2820781171321869, + "learning_rate": 6.541208086603584e-05, + "loss": 1.8089, + "step": 13628 + }, + { + "epoch": 4.183241252302026, + "grad_norm": 0.3062323033809662, + "learning_rate": 6.54073522560211e-05, + "loss": 1.7307, + "step": 13629 + }, + { + "epoch": 4.183548189073051, + "grad_norm": 0.3010510504245758, + "learning_rate": 6.54026234937419e-05, + "loss": 1.7523, + "step": 13630 + }, + { + "epoch": 4.183855125844076, + "grad_norm": 0.21932095289230347, + "learning_rate": 6.539789457924493e-05, + "loss": 1.737, + "step": 13631 + }, + { + "epoch": 4.184162062615101, + "grad_norm": 0.2710212469100952, + "learning_rate": 6.539316551257695e-05, + "loss": 1.7228, + "step": 13632 + }, + { + "epoch": 4.184468999386127, + "grad_norm": 0.2885816991329193, + "learning_rate": 6.538843629378469e-05, + "loss": 1.8734, + "step": 
13633 + }, + { + "epoch": 4.184775936157152, + "grad_norm": 0.2621026635169983, + "learning_rate": 6.538370692291487e-05, + "loss": 1.7884, + "step": 13634 + }, + { + "epoch": 4.185082872928176, + "grad_norm": 0.30503126978874207, + "learning_rate": 6.537897740001426e-05, + "loss": 1.7833, + "step": 13635 + }, + { + "epoch": 4.185389809699202, + "grad_norm": 0.29491373896598816, + "learning_rate": 6.537424772512955e-05, + "loss": 1.7894, + "step": 13636 + }, + { + "epoch": 4.185696746470227, + "grad_norm": 0.24423296749591827, + "learning_rate": 6.536951789830754e-05, + "loss": 1.7409, + "step": 13637 + }, + { + "epoch": 4.186003683241252, + "grad_norm": 0.2184748351573944, + "learning_rate": 6.536478791959495e-05, + "loss": 1.747, + "step": 13638 + }, + { + "epoch": 4.186310620012278, + "grad_norm": 0.2348455935716629, + "learning_rate": 6.53600577890385e-05, + "loss": 1.7422, + "step": 13639 + }, + { + "epoch": 4.186617556783303, + "grad_norm": 0.2554566264152527, + "learning_rate": 6.535532750668497e-05, + "loss": 1.7623, + "step": 13640 + }, + { + "epoch": 4.1869244935543275, + "grad_norm": 0.26424553990364075, + "learning_rate": 6.535059707258109e-05, + "loss": 1.8408, + "step": 13641 + }, + { + "epoch": 4.187231430325353, + "grad_norm": 0.35363274812698364, + "learning_rate": 6.534586648677361e-05, + "loss": 1.7435, + "step": 13642 + }, + { + "epoch": 4.187538367096378, + "grad_norm": 0.3225265443325043, + "learning_rate": 6.534113574930926e-05, + "loss": 1.7181, + "step": 13643 + }, + { + "epoch": 4.1878453038674035, + "grad_norm": 0.23529650270938873, + "learning_rate": 6.533640486023485e-05, + "loss": 1.7712, + "step": 13644 + }, + { + "epoch": 4.188152240638429, + "grad_norm": 0.3490132987499237, + "learning_rate": 6.53316738195971e-05, + "loss": 1.7329, + "step": 13645 + }, + { + "epoch": 4.188459177409453, + "grad_norm": 0.3759285509586334, + "learning_rate": 6.532694262744274e-05, + "loss": 1.802, + "step": 13646 + }, + { + "epoch": 4.188766114180479, 
+ "grad_norm": 0.27383577823638916, + "learning_rate": 6.532221128381858e-05, + "loss": 1.801, + "step": 13647 + }, + { + "epoch": 4.189073050951504, + "grad_norm": 0.23240652680397034, + "learning_rate": 6.531747978877132e-05, + "loss": 1.8415, + "step": 13648 + }, + { + "epoch": 4.189379987722529, + "grad_norm": 0.3302704989910126, + "learning_rate": 6.531274814234773e-05, + "loss": 1.7765, + "step": 13649 + }, + { + "epoch": 4.189686924493555, + "grad_norm": 0.3209368586540222, + "learning_rate": 6.530801634459463e-05, + "loss": 1.6935, + "step": 13650 + }, + { + "epoch": 4.189993861264579, + "grad_norm": 0.26643648743629456, + "learning_rate": 6.530328439555872e-05, + "loss": 1.8159, + "step": 13651 + }, + { + "epoch": 4.190300798035604, + "grad_norm": 0.22594431042671204, + "learning_rate": 6.529855229528679e-05, + "loss": 1.7764, + "step": 13652 + }, + { + "epoch": 4.19060773480663, + "grad_norm": 0.3288109302520752, + "learning_rate": 6.529382004382561e-05, + "loss": 1.7963, + "step": 13653 + }, + { + "epoch": 4.190914671577655, + "grad_norm": 0.3067106604576111, + "learning_rate": 6.528908764122191e-05, + "loss": 1.7564, + "step": 13654 + }, + { + "epoch": 4.19122160834868, + "grad_norm": 0.23437078297138214, + "learning_rate": 6.528435508752249e-05, + "loss": 1.759, + "step": 13655 + }, + { + "epoch": 4.191528545119706, + "grad_norm": 0.30662333965301514, + "learning_rate": 6.527962238277413e-05, + "loss": 1.7549, + "step": 13656 + }, + { + "epoch": 4.19183548189073, + "grad_norm": 0.3545009195804596, + "learning_rate": 6.527488952702356e-05, + "loss": 1.7761, + "step": 13657 + }, + { + "epoch": 4.1921424186617555, + "grad_norm": 0.2509438991546631, + "learning_rate": 6.52701565203176e-05, + "loss": 1.7162, + "step": 13658 + }, + { + "epoch": 4.192449355432781, + "grad_norm": 0.24423806369304657, + "learning_rate": 6.5265423362703e-05, + "loss": 1.735, + "step": 13659 + }, + { + "epoch": 4.192756292203806, + "grad_norm": 0.37365156412124634, + 
"learning_rate": 6.526069005422654e-05, + "loss": 1.7697, + "step": 13660 + }, + { + "epoch": 4.1930632289748315, + "grad_norm": 0.4025731682777405, + "learning_rate": 6.525595659493499e-05, + "loss": 1.7931, + "step": 13661 + }, + { + "epoch": 4.193370165745856, + "grad_norm": 0.31360915303230286, + "learning_rate": 6.525122298487514e-05, + "loss": 1.8014, + "step": 13662 + }, + { + "epoch": 4.193677102516881, + "grad_norm": 0.2480524778366089, + "learning_rate": 6.524648922409376e-05, + "loss": 1.7753, + "step": 13663 + }, + { + "epoch": 4.193984039287907, + "grad_norm": 0.33740919828414917, + "learning_rate": 6.524175531263765e-05, + "loss": 1.7296, + "step": 13664 + }, + { + "epoch": 4.194290976058932, + "grad_norm": 0.26871639490127563, + "learning_rate": 6.523702125055358e-05, + "loss": 1.7113, + "step": 13665 + }, + { + "epoch": 4.194597912829957, + "grad_norm": 0.2687455415725708, + "learning_rate": 6.52322870378883e-05, + "loss": 1.7645, + "step": 13666 + }, + { + "epoch": 4.194904849600983, + "grad_norm": 0.4207400679588318, + "learning_rate": 6.522755267468868e-05, + "loss": 1.7758, + "step": 13667 + }, + { + "epoch": 4.195211786372007, + "grad_norm": 0.36043494939804077, + "learning_rate": 6.522281816100142e-05, + "loss": 1.7433, + "step": 13668 + }, + { + "epoch": 4.195518723143032, + "grad_norm": 0.2515890598297119, + "learning_rate": 6.52180834968734e-05, + "loss": 1.7646, + "step": 13669 + }, + { + "epoch": 4.195825659914058, + "grad_norm": 0.2871458828449249, + "learning_rate": 6.521334868235132e-05, + "loss": 1.8147, + "step": 13670 + }, + { + "epoch": 4.196132596685083, + "grad_norm": 0.28454354405403137, + "learning_rate": 6.5208613717482e-05, + "loss": 1.8576, + "step": 13671 + }, + { + "epoch": 4.196439533456108, + "grad_norm": 0.2520541548728943, + "learning_rate": 6.520387860231227e-05, + "loss": 1.7513, + "step": 13672 + }, + { + "epoch": 4.196746470227133, + "grad_norm": 0.22782307863235474, + "learning_rate": 6.51991433368889e-05, + 
"loss": 1.7737, + "step": 13673 + }, + { + "epoch": 4.197053406998158, + "grad_norm": 0.2451259195804596, + "learning_rate": 6.519440792125869e-05, + "loss": 1.7483, + "step": 13674 + }, + { + "epoch": 4.1973603437691835, + "grad_norm": 0.21915963292121887, + "learning_rate": 6.518967235546841e-05, + "loss": 1.718, + "step": 13675 + }, + { + "epoch": 4.197667280540209, + "grad_norm": 0.23005805909633636, + "learning_rate": 6.51849366395649e-05, + "loss": 1.7786, + "step": 13676 + }, + { + "epoch": 4.197974217311234, + "grad_norm": 0.25039517879486084, + "learning_rate": 6.518020077359494e-05, + "loss": 1.7785, + "step": 13677 + }, + { + "epoch": 4.198281154082259, + "grad_norm": 0.26631081104278564, + "learning_rate": 6.517546475760535e-05, + "loss": 1.7921, + "step": 13678 + }, + { + "epoch": 4.198588090853284, + "grad_norm": 0.2220793515443802, + "learning_rate": 6.517072859164292e-05, + "loss": 1.7696, + "step": 13679 + }, + { + "epoch": 4.198895027624309, + "grad_norm": 0.24681030213832855, + "learning_rate": 6.516599227575446e-05, + "loss": 1.7702, + "step": 13680 + }, + { + "epoch": 4.199201964395335, + "grad_norm": 0.2421828955411911, + "learning_rate": 6.516125580998678e-05, + "loss": 1.8058, + "step": 13681 + }, + { + "epoch": 4.19950890116636, + "grad_norm": 0.2170087695121765, + "learning_rate": 6.515651919438667e-05, + "loss": 1.7271, + "step": 13682 + }, + { + "epoch": 4.199815837937384, + "grad_norm": 0.23383566737174988, + "learning_rate": 6.515178242900096e-05, + "loss": 1.7515, + "step": 13683 + }, + { + "epoch": 4.20012277470841, + "grad_norm": 0.2522997558116913, + "learning_rate": 6.514704551387645e-05, + "loss": 1.7619, + "step": 13684 + }, + { + "epoch": 4.200429711479435, + "grad_norm": 0.20973703265190125, + "learning_rate": 6.514230844905995e-05, + "loss": 1.7326, + "step": 13685 + }, + { + "epoch": 4.2007366482504604, + "grad_norm": 0.2308073341846466, + "learning_rate": 6.513757123459832e-05, + "loss": 1.811, + "step": 13686 + }, + { + 
"epoch": 4.201043585021486, + "grad_norm": 0.21751229465007782, + "learning_rate": 6.51328338705383e-05, + "loss": 1.7795, + "step": 13687 + }, + { + "epoch": 4.201350521792511, + "grad_norm": 0.2357407957315445, + "learning_rate": 6.512809635692675e-05, + "loss": 1.8069, + "step": 13688 + }, + { + "epoch": 4.201657458563536, + "grad_norm": 0.32245033979415894, + "learning_rate": 6.51233586938105e-05, + "loss": 1.8179, + "step": 13689 + }, + { + "epoch": 4.201964395334561, + "grad_norm": 0.22740167379379272, + "learning_rate": 6.511862088123635e-05, + "loss": 1.7482, + "step": 13690 + }, + { + "epoch": 4.202271332105586, + "grad_norm": 0.26880496740341187, + "learning_rate": 6.511388291925114e-05, + "loss": 1.7919, + "step": 13691 + }, + { + "epoch": 4.202578268876612, + "grad_norm": 0.2261822521686554, + "learning_rate": 6.510914480790166e-05, + "loss": 1.7543, + "step": 13692 + }, + { + "epoch": 4.202885205647637, + "grad_norm": 0.2635782063007355, + "learning_rate": 6.510440654723477e-05, + "loss": 1.7874, + "step": 13693 + }, + { + "epoch": 4.203192142418661, + "grad_norm": 0.2505982518196106, + "learning_rate": 6.509966813729726e-05, + "loss": 1.8016, + "step": 13694 + }, + { + "epoch": 4.203499079189687, + "grad_norm": 0.23177236318588257, + "learning_rate": 6.5094929578136e-05, + "loss": 1.7582, + "step": 13695 + }, + { + "epoch": 4.203806015960712, + "grad_norm": 0.2315056324005127, + "learning_rate": 6.509019086979779e-05, + "loss": 1.7418, + "step": 13696 + }, + { + "epoch": 4.204112952731737, + "grad_norm": 0.25565484166145325, + "learning_rate": 6.508545201232947e-05, + "loss": 1.7476, + "step": 13697 + }, + { + "epoch": 4.204419889502763, + "grad_norm": 0.29210081696510315, + "learning_rate": 6.508071300577787e-05, + "loss": 1.8397, + "step": 13698 + }, + { + "epoch": 4.204726826273788, + "grad_norm": 0.2830582559108734, + "learning_rate": 6.507597385018984e-05, + "loss": 1.834, + "step": 13699 + }, + { + "epoch": 4.2050337630448125, + "grad_norm": 
0.23013398051261902, + "learning_rate": 6.507123454561217e-05, + "loss": 1.7593, + "step": 13700 + }, + { + "epoch": 4.205340699815838, + "grad_norm": 0.21970276534557343, + "learning_rate": 6.506649509209174e-05, + "loss": 1.754, + "step": 13701 + }, + { + "epoch": 4.205647636586863, + "grad_norm": 0.32052233815193176, + "learning_rate": 6.50617554896754e-05, + "loss": 1.7531, + "step": 13702 + }, + { + "epoch": 4.2059545733578885, + "grad_norm": 0.2597332000732422, + "learning_rate": 6.505701573840995e-05, + "loss": 1.7836, + "step": 13703 + }, + { + "epoch": 4.206261510128914, + "grad_norm": 0.22070355713367462, + "learning_rate": 6.505227583834224e-05, + "loss": 1.7225, + "step": 13704 + }, + { + "epoch": 4.206568446899938, + "grad_norm": 0.27219358086586, + "learning_rate": 6.50475357895191e-05, + "loss": 1.8215, + "step": 13705 + }, + { + "epoch": 4.206875383670964, + "grad_norm": 0.32541659474372864, + "learning_rate": 6.504279559198741e-05, + "loss": 1.7786, + "step": 13706 + }, + { + "epoch": 4.207182320441989, + "grad_norm": 0.25871729850769043, + "learning_rate": 6.5038055245794e-05, + "loss": 1.7621, + "step": 13707 + }, + { + "epoch": 4.207489257213014, + "grad_norm": 0.2190464735031128, + "learning_rate": 6.50333147509857e-05, + "loss": 1.7612, + "step": 13708 + }, + { + "epoch": 4.20779619398404, + "grad_norm": 0.19565832614898682, + "learning_rate": 6.50285741076094e-05, + "loss": 1.7581, + "step": 13709 + }, + { + "epoch": 4.208103130755064, + "grad_norm": 0.1889251321554184, + "learning_rate": 6.50238333157119e-05, + "loss": 1.7611, + "step": 13710 + }, + { + "epoch": 4.208410067526089, + "grad_norm": 0.2013053596019745, + "learning_rate": 6.501909237534008e-05, + "loss": 1.7393, + "step": 13711 + }, + { + "epoch": 4.208717004297115, + "grad_norm": 0.1899433434009552, + "learning_rate": 6.501435128654077e-05, + "loss": 1.7122, + "step": 13712 + }, + { + "epoch": 4.20902394106814, + "grad_norm": 0.19337882101535797, + "learning_rate": 
6.500961004936085e-05, + "loss": 1.7538, + "step": 13713 + }, + { + "epoch": 4.209330877839165, + "grad_norm": 0.20419920980930328, + "learning_rate": 6.500486866384718e-05, + "loss": 1.728, + "step": 13714 + }, + { + "epoch": 4.209637814610191, + "grad_norm": 0.20615679025650024, + "learning_rate": 6.50001271300466e-05, + "loss": 1.7843, + "step": 13715 + }, + { + "epoch": 4.209944751381215, + "grad_norm": 0.22178977727890015, + "learning_rate": 6.499538544800596e-05, + "loss": 1.7751, + "step": 13716 + }, + { + "epoch": 4.2102516881522405, + "grad_norm": 0.23703891038894653, + "learning_rate": 6.499064361777214e-05, + "loss": 1.7304, + "step": 13717 + }, + { + "epoch": 4.210558624923266, + "grad_norm": 0.2785723805427551, + "learning_rate": 6.498590163939198e-05, + "loss": 1.802, + "step": 13718 + }, + { + "epoch": 4.210865561694291, + "grad_norm": 0.23277060687541962, + "learning_rate": 6.498115951291237e-05, + "loss": 1.7316, + "step": 13719 + }, + { + "epoch": 4.2111724984653165, + "grad_norm": 0.22289474308490753, + "learning_rate": 6.497641723838017e-05, + "loss": 1.8469, + "step": 13720 + }, + { + "epoch": 4.211479435236341, + "grad_norm": 0.2715846002101898, + "learning_rate": 6.497167481584221e-05, + "loss": 1.7919, + "step": 13721 + }, + { + "epoch": 4.211786372007366, + "grad_norm": 0.29262226819992065, + "learning_rate": 6.49669322453454e-05, + "loss": 1.8379, + "step": 13722 + }, + { + "epoch": 4.212093308778392, + "grad_norm": 0.29136186838150024, + "learning_rate": 6.49621895269366e-05, + "loss": 1.789, + "step": 13723 + }, + { + "epoch": 4.212400245549417, + "grad_norm": 0.25110194087028503, + "learning_rate": 6.495744666066266e-05, + "loss": 1.7574, + "step": 13724 + }, + { + "epoch": 4.212707182320442, + "grad_norm": 0.2301366776227951, + "learning_rate": 6.495270364657048e-05, + "loss": 1.7637, + "step": 13725 + }, + { + "epoch": 4.213014119091467, + "grad_norm": 0.2556478977203369, + "learning_rate": 6.49479604847069e-05, + "loss": 1.7975, + 
"step": 13726 + }, + { + "epoch": 4.213321055862492, + "grad_norm": 0.2645667493343353, + "learning_rate": 6.494321717511884e-05, + "loss": 1.7594, + "step": 13727 + }, + { + "epoch": 4.213627992633517, + "grad_norm": 0.23664188385009766, + "learning_rate": 6.493847371785312e-05, + "loss": 1.7963, + "step": 13728 + }, + { + "epoch": 4.213934929404543, + "grad_norm": 0.2947930693626404, + "learning_rate": 6.493373011295665e-05, + "loss": 1.7477, + "step": 13729 + }, + { + "epoch": 4.214241866175568, + "grad_norm": 0.34598737955093384, + "learning_rate": 6.492898636047631e-05, + "loss": 1.7014, + "step": 13730 + }, + { + "epoch": 4.214548802946593, + "grad_norm": 0.24406935274600983, + "learning_rate": 6.4924242460459e-05, + "loss": 1.7436, + "step": 13731 + }, + { + "epoch": 4.214855739717618, + "grad_norm": 0.27176225185394287, + "learning_rate": 6.491949841295156e-05, + "loss": 1.8429, + "step": 13732 + }, + { + "epoch": 4.215162676488643, + "grad_norm": 0.2506968080997467, + "learning_rate": 6.491475421800089e-05, + "loss": 1.7519, + "step": 13733 + }, + { + "epoch": 4.2154696132596685, + "grad_norm": 0.2240980863571167, + "learning_rate": 6.491000987565387e-05, + "loss": 1.7595, + "step": 13734 + }, + { + "epoch": 4.215776550030694, + "grad_norm": 0.23201732337474823, + "learning_rate": 6.490526538595741e-05, + "loss": 1.7466, + "step": 13735 + }, + { + "epoch": 4.216083486801719, + "grad_norm": 0.24624750018119812, + "learning_rate": 6.490052074895836e-05, + "loss": 1.7364, + "step": 13736 + }, + { + "epoch": 4.216390423572744, + "grad_norm": 0.22936980426311493, + "learning_rate": 6.489577596470366e-05, + "loss": 1.7095, + "step": 13737 + }, + { + "epoch": 4.216697360343769, + "grad_norm": 0.2106638103723526, + "learning_rate": 6.489103103324016e-05, + "loss": 1.7387, + "step": 13738 + }, + { + "epoch": 4.217004297114794, + "grad_norm": 0.2936140298843384, + "learning_rate": 6.488628595461477e-05, + "loss": 1.9129, + "step": 13739 + }, + { + "epoch": 
4.21731123388582, + "grad_norm": 0.21871696412563324, + "learning_rate": 6.488154072887435e-05, + "loss": 1.7489, + "step": 13740 + }, + { + "epoch": 4.217618170656845, + "grad_norm": 0.25941070914268494, + "learning_rate": 6.487679535606583e-05, + "loss": 1.7788, + "step": 13741 + }, + { + "epoch": 4.21792510742787, + "grad_norm": 0.2540862560272217, + "learning_rate": 6.487204983623612e-05, + "loss": 1.8074, + "step": 13742 + }, + { + "epoch": 4.218232044198895, + "grad_norm": 0.25180327892303467, + "learning_rate": 6.486730416943207e-05, + "loss": 1.7503, + "step": 13743 + }, + { + "epoch": 4.21853898096992, + "grad_norm": 0.26625585556030273, + "learning_rate": 6.486255835570063e-05, + "loss": 1.8149, + "step": 13744 + }, + { + "epoch": 4.218845917740945, + "grad_norm": 0.3023914396762848, + "learning_rate": 6.485781239508867e-05, + "loss": 1.8599, + "step": 13745 + }, + { + "epoch": 4.219152854511971, + "grad_norm": 0.2683780789375305, + "learning_rate": 6.48530662876431e-05, + "loss": 1.7911, + "step": 13746 + }, + { + "epoch": 4.219459791282996, + "grad_norm": 0.20747442543506622, + "learning_rate": 6.484832003341081e-05, + "loss": 1.7343, + "step": 13747 + }, + { + "epoch": 4.2197667280540205, + "grad_norm": 0.29284465312957764, + "learning_rate": 6.484357363243873e-05, + "loss": 1.7917, + "step": 13748 + }, + { + "epoch": 4.220073664825046, + "grad_norm": 0.24303840100765228, + "learning_rate": 6.483882708477376e-05, + "loss": 1.7921, + "step": 13749 + }, + { + "epoch": 4.220380601596071, + "grad_norm": 0.26253026723861694, + "learning_rate": 6.48340803904628e-05, + "loss": 1.7971, + "step": 13750 + }, + { + "epoch": 4.2206875383670965, + "grad_norm": 0.23888511955738068, + "learning_rate": 6.482933354955275e-05, + "loss": 1.7967, + "step": 13751 + }, + { + "epoch": 4.220994475138122, + "grad_norm": 0.24966883659362793, + "learning_rate": 6.482458656209054e-05, + "loss": 1.7924, + "step": 13752 + }, + { + "epoch": 4.221301411909146, + "grad_norm": 
0.26556864380836487, + "learning_rate": 6.481983942812309e-05, + "loss": 1.8608, + "step": 13753 + }, + { + "epoch": 4.221608348680172, + "grad_norm": 0.29064711928367615, + "learning_rate": 6.48150921476973e-05, + "loss": 1.7785, + "step": 13754 + }, + { + "epoch": 4.221915285451197, + "grad_norm": 0.30876123905181885, + "learning_rate": 6.481034472086008e-05, + "loss": 1.8287, + "step": 13755 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.2622467875480652, + "learning_rate": 6.480559714765835e-05, + "loss": 1.8336, + "step": 13756 + }, + { + "epoch": 4.222529158993248, + "grad_norm": 0.2502644956111908, + "learning_rate": 6.480084942813902e-05, + "loss": 1.7803, + "step": 13757 + }, + { + "epoch": 4.222836095764273, + "grad_norm": 0.2879922688007355, + "learning_rate": 6.479610156234903e-05, + "loss": 1.7544, + "step": 13758 + }, + { + "epoch": 4.223143032535297, + "grad_norm": 0.2831384241580963, + "learning_rate": 6.47913535503353e-05, + "loss": 1.887, + "step": 13759 + }, + { + "epoch": 4.223449969306323, + "grad_norm": 0.3221064805984497, + "learning_rate": 6.478660539214474e-05, + "loss": 1.7455, + "step": 13760 + }, + { + "epoch": 4.223756906077348, + "grad_norm": 0.4231930673122406, + "learning_rate": 6.478185708782427e-05, + "loss": 1.8209, + "step": 13761 + }, + { + "epoch": 4.224063842848373, + "grad_norm": 0.34327802062034607, + "learning_rate": 6.477710863742083e-05, + "loss": 1.7754, + "step": 13762 + }, + { + "epoch": 4.224370779619399, + "grad_norm": 0.21713349223136902, + "learning_rate": 6.477236004098135e-05, + "loss": 1.7576, + "step": 13763 + }, + { + "epoch": 4.224677716390423, + "grad_norm": 0.3262602388858795, + "learning_rate": 6.476761129855275e-05, + "loss": 1.7772, + "step": 13764 + }, + { + "epoch": 4.2249846531614486, + "grad_norm": 0.3231413662433624, + "learning_rate": 6.476286241018195e-05, + "loss": 1.7821, + "step": 13765 + }, + { + "epoch": 4.225291589932474, + "grad_norm": 0.2440098226070404, + "learning_rate": 
6.475811337591588e-05, + "loss": 1.7684, + "step": 13766 + }, + { + "epoch": 4.225598526703499, + "grad_norm": 0.329949289560318, + "learning_rate": 6.475336419580151e-05, + "loss": 1.8564, + "step": 13767 + }, + { + "epoch": 4.225905463474525, + "grad_norm": 0.3567483425140381, + "learning_rate": 6.474861486988574e-05, + "loss": 1.7625, + "step": 13768 + }, + { + "epoch": 4.226212400245549, + "grad_norm": 0.25257283449172974, + "learning_rate": 6.47438653982155e-05, + "loss": 1.823, + "step": 13769 + }, + { + "epoch": 4.226519337016574, + "grad_norm": 0.31542617082595825, + "learning_rate": 6.473911578083776e-05, + "loss": 1.7817, + "step": 13770 + }, + { + "epoch": 4.2268262737876, + "grad_norm": 0.29670149087905884, + "learning_rate": 6.473436601779944e-05, + "loss": 1.7493, + "step": 13771 + }, + { + "epoch": 4.227133210558625, + "grad_norm": 0.2635453939437866, + "learning_rate": 6.472961610914745e-05, + "loss": 1.792, + "step": 13772 + }, + { + "epoch": 4.22744014732965, + "grad_norm": 0.25017979741096497, + "learning_rate": 6.472486605492878e-05, + "loss": 1.7183, + "step": 13773 + }, + { + "epoch": 4.227747084100676, + "grad_norm": 0.3766646087169647, + "learning_rate": 6.472011585519034e-05, + "loss": 1.8039, + "step": 13774 + }, + { + "epoch": 4.2280540208717, + "grad_norm": 0.29860204458236694, + "learning_rate": 6.47153655099791e-05, + "loss": 1.8016, + "step": 13775 + }, + { + "epoch": 4.2283609576427255, + "grad_norm": 0.2540898323059082, + "learning_rate": 6.4710615019342e-05, + "loss": 1.8481, + "step": 13776 + }, + { + "epoch": 4.228667894413751, + "grad_norm": 0.3677786886692047, + "learning_rate": 6.470586438332597e-05, + "loss": 1.7663, + "step": 13777 + }, + { + "epoch": 4.228974831184776, + "grad_norm": 0.35693466663360596, + "learning_rate": 6.470111360197797e-05, + "loss": 1.7733, + "step": 13778 + }, + { + "epoch": 4.2292817679558015, + "grad_norm": 0.23747926950454712, + "learning_rate": 6.469636267534496e-05, + "loss": 1.7938, + "step": 
13779 + }, + { + "epoch": 4.229588704726826, + "grad_norm": 0.32890695333480835, + "learning_rate": 6.469161160347386e-05, + "loss": 1.7233, + "step": 13780 + }, + { + "epoch": 4.229895641497851, + "grad_norm": 0.3437706530094147, + "learning_rate": 6.468686038641164e-05, + "loss": 1.7716, + "step": 13781 + }, + { + "epoch": 4.230202578268877, + "grad_norm": 0.23452162742614746, + "learning_rate": 6.468210902420527e-05, + "loss": 1.764, + "step": 13782 + }, + { + "epoch": 4.230509515039902, + "grad_norm": 0.3205265402793884, + "learning_rate": 6.46773575169017e-05, + "loss": 1.7464, + "step": 13783 + }, + { + "epoch": 4.230816451810927, + "grad_norm": 0.4234732985496521, + "learning_rate": 6.467260586454787e-05, + "loss": 1.7786, + "step": 13784 + }, + { + "epoch": 4.231123388581952, + "grad_norm": 0.2484128773212433, + "learning_rate": 6.466785406719076e-05, + "loss": 1.8125, + "step": 13785 + }, + { + "epoch": 4.231430325352977, + "grad_norm": 0.3696556091308594, + "learning_rate": 6.46631021248773e-05, + "loss": 1.7974, + "step": 13786 + }, + { + "epoch": 4.231737262124002, + "grad_norm": 0.4251437485218048, + "learning_rate": 6.465835003765449e-05, + "loss": 1.7486, + "step": 13787 + }, + { + "epoch": 4.232044198895028, + "grad_norm": 0.2507621943950653, + "learning_rate": 6.465359780556927e-05, + "loss": 1.829, + "step": 13788 + }, + { + "epoch": 4.232351135666053, + "grad_norm": 0.2911818325519562, + "learning_rate": 6.464884542866861e-05, + "loss": 1.7401, + "step": 13789 + }, + { + "epoch": 4.232658072437078, + "grad_norm": 0.35354506969451904, + "learning_rate": 6.464409290699946e-05, + "loss": 1.7848, + "step": 13790 + }, + { + "epoch": 4.232965009208103, + "grad_norm": 0.2659081518650055, + "learning_rate": 6.46393402406088e-05, + "loss": 1.7408, + "step": 13791 + }, + { + "epoch": 4.233271945979128, + "grad_norm": 0.22676481306552887, + "learning_rate": 6.46345874295436e-05, + "loss": 1.7542, + "step": 13792 + }, + { + "epoch": 4.2335788827501535, + 
"grad_norm": 0.2549789845943451, + "learning_rate": 6.462983447385085e-05, + "loss": 1.8095, + "step": 13793 + }, + { + "epoch": 4.233885819521179, + "grad_norm": 0.2157238870859146, + "learning_rate": 6.462508137357748e-05, + "loss": 1.7529, + "step": 13794 + }, + { + "epoch": 4.234192756292204, + "grad_norm": 0.2494724988937378, + "learning_rate": 6.46203281287705e-05, + "loss": 1.7839, + "step": 13795 + }, + { + "epoch": 4.234499693063229, + "grad_norm": 0.29560065269470215, + "learning_rate": 6.461557473947685e-05, + "loss": 1.7239, + "step": 13796 + }, + { + "epoch": 4.234806629834254, + "grad_norm": 0.23693916201591492, + "learning_rate": 6.461082120574354e-05, + "loss": 1.8074, + "step": 13797 + }, + { + "epoch": 4.235113566605279, + "grad_norm": 0.2538869082927704, + "learning_rate": 6.460606752761752e-05, + "loss": 1.8319, + "step": 13798 + }, + { + "epoch": 4.235420503376305, + "grad_norm": 0.3186401426792145, + "learning_rate": 6.460131370514578e-05, + "loss": 1.7877, + "step": 13799 + }, + { + "epoch": 4.23572744014733, + "grad_norm": 0.2473619133234024, + "learning_rate": 6.45965597383753e-05, + "loss": 1.8323, + "step": 13800 + }, + { + "epoch": 4.236034376918354, + "grad_norm": 0.32806503772735596, + "learning_rate": 6.459180562735307e-05, + "loss": 1.744, + "step": 13801 + }, + { + "epoch": 4.23634131368938, + "grad_norm": 0.3975784480571747, + "learning_rate": 6.458705137212606e-05, + "loss": 1.7216, + "step": 13802 + }, + { + "epoch": 4.236648250460405, + "grad_norm": 0.2946135997772217, + "learning_rate": 6.458229697274125e-05, + "loss": 1.8781, + "step": 13803 + }, + { + "epoch": 4.23695518723143, + "grad_norm": 0.25109192728996277, + "learning_rate": 6.457754242924565e-05, + "loss": 1.7458, + "step": 13804 + }, + { + "epoch": 4.237262124002456, + "grad_norm": 0.2763883173465729, + "learning_rate": 6.457278774168623e-05, + "loss": 1.7612, + "step": 13805 + }, + { + "epoch": 4.237569060773481, + "grad_norm": 0.22427856922149658, + 
"learning_rate": 6.456803291010996e-05, + "loss": 1.8049, + "step": 13806 + }, + { + "epoch": 4.2378759975445055, + "grad_norm": 0.28295788168907166, + "learning_rate": 6.456327793456387e-05, + "loss": 1.7608, + "step": 13807 + }, + { + "epoch": 4.238182934315531, + "grad_norm": 0.27857527136802673, + "learning_rate": 6.455852281509493e-05, + "loss": 1.7281, + "step": 13808 + }, + { + "epoch": 4.238489871086556, + "grad_norm": 0.24014849960803986, + "learning_rate": 6.455376755175012e-05, + "loss": 1.7247, + "step": 13809 + }, + { + "epoch": 4.2387968078575815, + "grad_norm": 0.25149038434028625, + "learning_rate": 6.454901214457646e-05, + "loss": 1.8575, + "step": 13810 + }, + { + "epoch": 4.239103744628607, + "grad_norm": 0.32072681188583374, + "learning_rate": 6.454425659362093e-05, + "loss": 1.7421, + "step": 13811 + }, + { + "epoch": 4.239410681399631, + "grad_norm": 0.28418242931365967, + "learning_rate": 6.453950089893054e-05, + "loss": 1.7031, + "step": 13812 + }, + { + "epoch": 4.239717618170657, + "grad_norm": 0.23725132644176483, + "learning_rate": 6.453474506055228e-05, + "loss": 1.7901, + "step": 13813 + }, + { + "epoch": 4.240024554941682, + "grad_norm": 0.3056317865848541, + "learning_rate": 6.452998907853315e-05, + "loss": 1.7414, + "step": 13814 + }, + { + "epoch": 4.240331491712707, + "grad_norm": 0.3111891448497772, + "learning_rate": 6.452523295292013e-05, + "loss": 1.7532, + "step": 13815 + }, + { + "epoch": 4.240638428483733, + "grad_norm": 0.2126779705286026, + "learning_rate": 6.452047668376027e-05, + "loss": 1.6779, + "step": 13816 + }, + { + "epoch": 4.240945365254758, + "grad_norm": 0.26660779118537903, + "learning_rate": 6.451572027110054e-05, + "loss": 1.7162, + "step": 13817 + }, + { + "epoch": 4.241252302025782, + "grad_norm": 0.25901922583580017, + "learning_rate": 6.451096371498794e-05, + "loss": 1.7784, + "step": 13818 + }, + { + "epoch": 4.241559238796808, + "grad_norm": 0.24091807007789612, + "learning_rate": 
6.450620701546953e-05, + "loss": 1.7928, + "step": 13819 + }, + { + "epoch": 4.241866175567833, + "grad_norm": 0.25097009539604187, + "learning_rate": 6.450145017259225e-05, + "loss": 1.761, + "step": 13820 + }, + { + "epoch": 4.242173112338858, + "grad_norm": 0.22978942096233368, + "learning_rate": 6.449669318640315e-05, + "loss": 1.7891, + "step": 13821 + }, + { + "epoch": 4.242480049109884, + "grad_norm": 0.27255937457084656, + "learning_rate": 6.449193605694923e-05, + "loss": 1.7964, + "step": 13822 + }, + { + "epoch": 4.242786985880908, + "grad_norm": 0.2210773378610611, + "learning_rate": 6.44871787842775e-05, + "loss": 1.7628, + "step": 13823 + }, + { + "epoch": 4.2430939226519335, + "grad_norm": 0.25784751772880554, + "learning_rate": 6.448242136843497e-05, + "loss": 1.7596, + "step": 13824 + }, + { + "epoch": 4.243400859422959, + "grad_norm": 0.23475486040115356, + "learning_rate": 6.447766380946868e-05, + "loss": 1.8174, + "step": 13825 + }, + { + "epoch": 4.243707796193984, + "grad_norm": 0.2567705512046814, + "learning_rate": 6.447290610742561e-05, + "loss": 1.737, + "step": 13826 + }, + { + "epoch": 4.2440147329650095, + "grad_norm": 0.23973144590854645, + "learning_rate": 6.446814826235281e-05, + "loss": 1.7881, + "step": 13827 + }, + { + "epoch": 4.244321669736034, + "grad_norm": 0.25584739446640015, + "learning_rate": 6.446339027429729e-05, + "loss": 1.7673, + "step": 13828 + }, + { + "epoch": 4.244628606507059, + "grad_norm": 0.2653748393058777, + "learning_rate": 6.445863214330608e-05, + "loss": 1.7443, + "step": 13829 + }, + { + "epoch": 4.244935543278085, + "grad_norm": 0.2492038607597351, + "learning_rate": 6.445387386942619e-05, + "loss": 1.7223, + "step": 13830 + }, + { + "epoch": 4.24524248004911, + "grad_norm": 0.2282228320837021, + "learning_rate": 6.444911545270464e-05, + "loss": 1.7577, + "step": 13831 + }, + { + "epoch": 4.245549416820135, + "grad_norm": 0.2411092072725296, + "learning_rate": 6.444435689318845e-05, + "loss": 1.7324, + 
"step": 13832 + }, + { + "epoch": 4.245856353591161, + "grad_norm": 0.21557089686393738, + "learning_rate": 6.443959819092468e-05, + "loss": 1.7355, + "step": 13833 + }, + { + "epoch": 4.246163290362185, + "grad_norm": 0.2500394880771637, + "learning_rate": 6.443483934596033e-05, + "loss": 1.775, + "step": 13834 + }, + { + "epoch": 4.24647022713321, + "grad_norm": 0.24135248363018036, + "learning_rate": 6.443008035834244e-05, + "loss": 1.7885, + "step": 13835 + }, + { + "epoch": 4.246777163904236, + "grad_norm": 0.22860904037952423, + "learning_rate": 6.442532122811803e-05, + "loss": 1.7891, + "step": 13836 + }, + { + "epoch": 4.247084100675261, + "grad_norm": 0.2277665138244629, + "learning_rate": 6.442056195533415e-05, + "loss": 1.7583, + "step": 13837 + }, + { + "epoch": 4.247391037446286, + "grad_norm": 0.22822454571723938, + "learning_rate": 6.441580254003782e-05, + "loss": 1.7777, + "step": 13838 + }, + { + "epoch": 4.247697974217311, + "grad_norm": 0.24274896085262299, + "learning_rate": 6.441104298227608e-05, + "loss": 1.7537, + "step": 13839 + }, + { + "epoch": 4.248004910988336, + "grad_norm": 0.25080999732017517, + "learning_rate": 6.440628328209598e-05, + "loss": 1.7537, + "step": 13840 + }, + { + "epoch": 4.2483118477593615, + "grad_norm": 0.22409579157829285, + "learning_rate": 6.440152343954453e-05, + "loss": 1.7652, + "step": 13841 + }, + { + "epoch": 4.248618784530387, + "grad_norm": 0.24028798937797546, + "learning_rate": 6.439676345466877e-05, + "loss": 1.7512, + "step": 13842 + }, + { + "epoch": 4.248925721301412, + "grad_norm": 0.28739503026008606, + "learning_rate": 6.439200332751576e-05, + "loss": 1.8034, + "step": 13843 + }, + { + "epoch": 4.249232658072437, + "grad_norm": 0.2244807928800583, + "learning_rate": 6.438724305813255e-05, + "loss": 1.7243, + "step": 13844 + }, + { + "epoch": 4.249539594843462, + "grad_norm": 0.24478118121623993, + "learning_rate": 6.438248264656618e-05, + "loss": 1.7754, + "step": 13845 + }, + { + "epoch": 
4.249846531614487, + "grad_norm": 0.25554370880126953, + "learning_rate": 6.437772209286368e-05, + "loss": 1.7845, + "step": 13846 + }, + { + "epoch": 4.250153468385513, + "grad_norm": 0.24478472769260406, + "learning_rate": 6.43729613970721e-05, + "loss": 1.7954, + "step": 13847 + }, + { + "epoch": 4.250460405156538, + "grad_norm": 0.22287282347679138, + "learning_rate": 6.436820055923849e-05, + "loss": 1.7379, + "step": 13848 + }, + { + "epoch": 4.250767341927563, + "grad_norm": 0.2810569703578949, + "learning_rate": 6.43634395794099e-05, + "loss": 1.8492, + "step": 13849 + }, + { + "epoch": 4.251074278698588, + "grad_norm": 0.2544163465499878, + "learning_rate": 6.435867845763337e-05, + "loss": 1.7846, + "step": 13850 + }, + { + "epoch": 4.251381215469613, + "grad_norm": 0.27879175543785095, + "learning_rate": 6.435391719395598e-05, + "loss": 1.767, + "step": 13851 + }, + { + "epoch": 4.2516881522406385, + "grad_norm": 0.2876715362071991, + "learning_rate": 6.434915578842477e-05, + "loss": 1.8048, + "step": 13852 + }, + { + "epoch": 4.251995089011664, + "grad_norm": 0.27844297885894775, + "learning_rate": 6.434439424108678e-05, + "loss": 1.7472, + "step": 13853 + }, + { + "epoch": 4.252302025782689, + "grad_norm": 0.2417020946741104, + "learning_rate": 6.43396325519891e-05, + "loss": 1.8481, + "step": 13854 + }, + { + "epoch": 4.252608962553714, + "grad_norm": 0.23828522861003876, + "learning_rate": 6.433487072117874e-05, + "loss": 1.7536, + "step": 13855 + }, + { + "epoch": 4.252915899324739, + "grad_norm": 0.22304333746433258, + "learning_rate": 6.43301087487028e-05, + "loss": 1.741, + "step": 13856 + }, + { + "epoch": 4.253222836095764, + "grad_norm": 0.27089163661003113, + "learning_rate": 6.432534663460832e-05, + "loss": 1.7974, + "step": 13857 + }, + { + "epoch": 4.25352977286679, + "grad_norm": 0.2439592182636261, + "learning_rate": 6.432058437894237e-05, + "loss": 1.7713, + "step": 13858 + }, + { + "epoch": 4.253836709637815, + "grad_norm": 
0.2368553727865219, + "learning_rate": 6.431582198175203e-05, + "loss": 1.6915, + "step": 13859 + }, + { + "epoch": 4.25414364640884, + "grad_norm": 0.25248441100120544, + "learning_rate": 6.431105944308431e-05, + "loss": 1.7286, + "step": 13860 + }, + { + "epoch": 4.254450583179865, + "grad_norm": 0.20928484201431274, + "learning_rate": 6.430629676298634e-05, + "loss": 1.79, + "step": 13861 + }, + { + "epoch": 4.25475751995089, + "grad_norm": 0.25262540578842163, + "learning_rate": 6.430153394150514e-05, + "loss": 1.7443, + "step": 13862 + }, + { + "epoch": 4.255064456721915, + "grad_norm": 0.27508237957954407, + "learning_rate": 6.429677097868783e-05, + "loss": 1.8207, + "step": 13863 + }, + { + "epoch": 4.255371393492941, + "grad_norm": 0.28129303455352783, + "learning_rate": 6.429200787458141e-05, + "loss": 1.7589, + "step": 13864 + }, + { + "epoch": 4.255678330263966, + "grad_norm": 0.3205658495426178, + "learning_rate": 6.428724462923302e-05, + "loss": 1.8037, + "step": 13865 + }, + { + "epoch": 4.2559852670349905, + "grad_norm": 0.24048078060150146, + "learning_rate": 6.428248124268969e-05, + "loss": 1.7303, + "step": 13866 + }, + { + "epoch": 4.256292203806016, + "grad_norm": 0.24742475152015686, + "learning_rate": 6.427771771499852e-05, + "loss": 1.7753, + "step": 13867 + }, + { + "epoch": 4.256599140577041, + "grad_norm": 0.3082354962825775, + "learning_rate": 6.427295404620656e-05, + "loss": 1.7275, + "step": 13868 + }, + { + "epoch": 4.2569060773480665, + "grad_norm": 0.23319822549819946, + "learning_rate": 6.426819023636093e-05, + "loss": 1.7562, + "step": 13869 + }, + { + "epoch": 4.257213014119092, + "grad_norm": 0.2611405551433563, + "learning_rate": 6.426342628550866e-05, + "loss": 1.7417, + "step": 13870 + }, + { + "epoch": 4.257519950890116, + "grad_norm": 0.2577543258666992, + "learning_rate": 6.425866219369686e-05, + "loss": 1.6906, + "step": 13871 + }, + { + "epoch": 4.257826887661142, + "grad_norm": 0.31353357434272766, + "learning_rate": 
6.42538979609726e-05, + "loss": 1.7155, + "step": 13872 + }, + { + "epoch": 4.258133824432167, + "grad_norm": 0.23280073702335358, + "learning_rate": 6.424913358738296e-05, + "loss": 1.7576, + "step": 13873 + }, + { + "epoch": 4.258440761203192, + "grad_norm": 0.24087542295455933, + "learning_rate": 6.424436907297504e-05, + "loss": 1.7622, + "step": 13874 + }, + { + "epoch": 4.258747697974218, + "grad_norm": 0.3146509826183319, + "learning_rate": 6.42396044177959e-05, + "loss": 1.769, + "step": 13875 + }, + { + "epoch": 4.259054634745242, + "grad_norm": 0.2645811438560486, + "learning_rate": 6.423483962189268e-05, + "loss": 1.7713, + "step": 13876 + }, + { + "epoch": 4.259361571516267, + "grad_norm": 0.2166455090045929, + "learning_rate": 6.423007468531238e-05, + "loss": 1.7705, + "step": 13877 + }, + { + "epoch": 4.259668508287293, + "grad_norm": 0.29142528772354126, + "learning_rate": 6.422530960810217e-05, + "loss": 1.7725, + "step": 13878 + }, + { + "epoch": 4.259975445058318, + "grad_norm": 0.28777652978897095, + "learning_rate": 6.422054439030911e-05, + "loss": 1.7853, + "step": 13879 + }, + { + "epoch": 4.260282381829343, + "grad_norm": 0.2285117357969284, + "learning_rate": 6.42157790319803e-05, + "loss": 1.7034, + "step": 13880 + }, + { + "epoch": 4.260589318600369, + "grad_norm": 0.32407644391059875, + "learning_rate": 6.421101353316282e-05, + "loss": 1.7858, + "step": 13881 + }, + { + "epoch": 4.260896255371393, + "grad_norm": 0.4803469777107239, + "learning_rate": 6.420624789390378e-05, + "loss": 1.7337, + "step": 13882 + }, + { + "epoch": 4.2612031921424185, + "grad_norm": 0.4245823919773102, + "learning_rate": 6.420148211425027e-05, + "loss": 1.8024, + "step": 13883 + }, + { + "epoch": 4.261510128913444, + "grad_norm": 0.22298674285411835, + "learning_rate": 6.419671619424938e-05, + "loss": 1.7129, + "step": 13884 + }, + { + "epoch": 4.261817065684469, + "grad_norm": 0.46955862641334534, + "learning_rate": 6.419195013394824e-05, + "loss": 1.7151, + 
"step": 13885 + }, + { + "epoch": 4.2621240024554945, + "grad_norm": 0.4809224009513855, + "learning_rate": 6.418718393339392e-05, + "loss": 1.7697, + "step": 13886 + }, + { + "epoch": 4.262430939226519, + "grad_norm": 0.2741130292415619, + "learning_rate": 6.418241759263353e-05, + "loss": 1.8133, + "step": 13887 + }, + { + "epoch": 4.262737875997544, + "grad_norm": 0.3673117756843567, + "learning_rate": 6.417765111171419e-05, + "loss": 1.7424, + "step": 13888 + }, + { + "epoch": 4.26304481276857, + "grad_norm": 0.4609327018260956, + "learning_rate": 6.417288449068299e-05, + "loss": 1.741, + "step": 13889 + }, + { + "epoch": 4.263351749539595, + "grad_norm": 0.2929460406303406, + "learning_rate": 6.416811772958702e-05, + "loss": 1.8385, + "step": 13890 + }, + { + "epoch": 4.26365868631062, + "grad_norm": 0.2727305293083191, + "learning_rate": 6.416335082847342e-05, + "loss": 1.794, + "step": 13891 + }, + { + "epoch": 4.263965623081646, + "grad_norm": 0.26089411973953247, + "learning_rate": 6.41585837873893e-05, + "loss": 1.7907, + "step": 13892 + }, + { + "epoch": 4.26427255985267, + "grad_norm": 0.24655573070049286, + "learning_rate": 6.415381660638174e-05, + "loss": 1.7481, + "step": 13893 + }, + { + "epoch": 4.264579496623695, + "grad_norm": 0.4186919629573822, + "learning_rate": 6.414904928549787e-05, + "loss": 1.8021, + "step": 13894 + }, + { + "epoch": 4.264886433394721, + "grad_norm": 0.38188236951828003, + "learning_rate": 6.414428182478478e-05, + "loss": 1.75, + "step": 13895 + }, + { + "epoch": 4.265193370165746, + "grad_norm": 0.23686440289020538, + "learning_rate": 6.413951422428963e-05, + "loss": 1.7882, + "step": 13896 + }, + { + "epoch": 4.265500306936771, + "grad_norm": 0.35963737964630127, + "learning_rate": 6.413474648405952e-05, + "loss": 1.7427, + "step": 13897 + }, + { + "epoch": 4.265807243707796, + "grad_norm": 0.38558289408683777, + "learning_rate": 6.412997860414155e-05, + "loss": 1.7622, + "step": 13898 + }, + { + "epoch": 
4.266114180478821, + "grad_norm": 0.2311459481716156, + "learning_rate": 6.412521058458285e-05, + "loss": 1.7894, + "step": 13899 + }, + { + "epoch": 4.2664211172498465, + "grad_norm": 0.2647818624973297, + "learning_rate": 6.412044242543054e-05, + "loss": 1.7399, + "step": 13900 + }, + { + "epoch": 4.266728054020872, + "grad_norm": 0.3174133002758026, + "learning_rate": 6.411567412673174e-05, + "loss": 1.7552, + "step": 13901 + }, + { + "epoch": 4.267034990791897, + "grad_norm": 0.25207316875457764, + "learning_rate": 6.411090568853358e-05, + "loss": 1.7876, + "step": 13902 + }, + { + "epoch": 4.267341927562922, + "grad_norm": 0.24549202620983124, + "learning_rate": 6.410613711088317e-05, + "loss": 1.8554, + "step": 13903 + }, + { + "epoch": 4.267648864333947, + "grad_norm": 0.26293641328811646, + "learning_rate": 6.410136839382765e-05, + "loss": 1.8553, + "step": 13904 + }, + { + "epoch": 4.267955801104972, + "grad_norm": 0.20258362591266632, + "learning_rate": 6.409659953741416e-05, + "loss": 1.7205, + "step": 13905 + }, + { + "epoch": 4.268262737875998, + "grad_norm": 0.24885907769203186, + "learning_rate": 6.409183054168979e-05, + "loss": 1.7718, + "step": 13906 + }, + { + "epoch": 4.268569674647023, + "grad_norm": 0.22737209498882294, + "learning_rate": 6.408706140670169e-05, + "loss": 1.7228, + "step": 13907 + }, + { + "epoch": 4.268876611418047, + "grad_norm": 0.2201235145330429, + "learning_rate": 6.4082292132497e-05, + "loss": 1.7451, + "step": 13908 + }, + { + "epoch": 4.269183548189073, + "grad_norm": 0.24108454585075378, + "learning_rate": 6.407752271912285e-05, + "loss": 1.7531, + "step": 13909 + }, + { + "epoch": 4.269490484960098, + "grad_norm": 0.21723641455173492, + "learning_rate": 6.407275316662636e-05, + "loss": 1.7139, + "step": 13910 + }, + { + "epoch": 4.269797421731123, + "grad_norm": 0.22557848691940308, + "learning_rate": 6.406798347505469e-05, + "loss": 1.7633, + "step": 13911 + }, + { + "epoch": 4.270104358502149, + "grad_norm": 
0.24664700031280518, + "learning_rate": 6.406321364445494e-05, + "loss": 1.7854, + "step": 13912 + }, + { + "epoch": 4.270411295273174, + "grad_norm": 0.2599056661128998, + "learning_rate": 6.405844367487428e-05, + "loss": 1.7662, + "step": 13913 + }, + { + "epoch": 4.2707182320441985, + "grad_norm": 0.2378663718700409, + "learning_rate": 6.405367356635982e-05, + "loss": 1.7477, + "step": 13914 + }, + { + "epoch": 4.271025168815224, + "grad_norm": 0.27158626914024353, + "learning_rate": 6.404890331895876e-05, + "loss": 1.7426, + "step": 13915 + }, + { + "epoch": 4.271332105586249, + "grad_norm": 0.28585317730903625, + "learning_rate": 6.404413293271818e-05, + "loss": 1.7492, + "step": 13916 + }, + { + "epoch": 4.2716390423572745, + "grad_norm": 0.2321750968694687, + "learning_rate": 6.403936240768526e-05, + "loss": 1.8594, + "step": 13917 + }, + { + "epoch": 4.2719459791283, + "grad_norm": 0.25824111700057983, + "learning_rate": 6.40345917439071e-05, + "loss": 1.7622, + "step": 13918 + }, + { + "epoch": 4.272252915899324, + "grad_norm": 0.24641194939613342, + "learning_rate": 6.40298209414309e-05, + "loss": 1.7519, + "step": 13919 + }, + { + "epoch": 4.27255985267035, + "grad_norm": 0.2132398933172226, + "learning_rate": 6.40250500003038e-05, + "loss": 1.7339, + "step": 13920 + }, + { + "epoch": 4.272866789441375, + "grad_norm": 0.22630736231803894, + "learning_rate": 6.402027892057292e-05, + "loss": 1.7396, + "step": 13921 + }, + { + "epoch": 4.2731737262124, + "grad_norm": 0.295163631439209, + "learning_rate": 6.401550770228543e-05, + "loss": 1.8063, + "step": 13922 + }, + { + "epoch": 4.273480662983426, + "grad_norm": 0.2722746729850769, + "learning_rate": 6.401073634548848e-05, + "loss": 1.7775, + "step": 13923 + }, + { + "epoch": 4.273787599754451, + "grad_norm": 0.23201976716518402, + "learning_rate": 6.400596485022922e-05, + "loss": 1.7755, + "step": 13924 + }, + { + "epoch": 4.274094536525475, + "grad_norm": 0.23880761861801147, + "learning_rate": 
6.40011932165548e-05, + "loss": 1.778, + "step": 13925 + }, + { + "epoch": 4.274401473296501, + "grad_norm": 0.22305625677108765, + "learning_rate": 6.399642144451239e-05, + "loss": 1.761, + "step": 13926 + }, + { + "epoch": 4.274708410067526, + "grad_norm": 0.21874886751174927, + "learning_rate": 6.399164953414914e-05, + "loss": 1.7148, + "step": 13927 + }, + { + "epoch": 4.2750153468385514, + "grad_norm": 0.2003604918718338, + "learning_rate": 6.398687748551221e-05, + "loss": 1.8049, + "step": 13928 + }, + { + "epoch": 4.275322283609577, + "grad_norm": 0.2443511188030243, + "learning_rate": 6.398210529864875e-05, + "loss": 1.782, + "step": 13929 + }, + { + "epoch": 4.275629220380601, + "grad_norm": 0.2297198623418808, + "learning_rate": 6.397733297360594e-05, + "loss": 1.7682, + "step": 13930 + }, + { + "epoch": 4.275936157151627, + "grad_norm": 0.23474562168121338, + "learning_rate": 6.39725605104309e-05, + "loss": 1.7809, + "step": 13931 + }, + { + "epoch": 4.276243093922652, + "grad_norm": 0.25908544659614563, + "learning_rate": 6.396778790917087e-05, + "loss": 1.7343, + "step": 13932 + }, + { + "epoch": 4.276550030693677, + "grad_norm": 0.2440379112958908, + "learning_rate": 6.396301516987295e-05, + "loss": 1.786, + "step": 13933 + }, + { + "epoch": 4.276856967464703, + "grad_norm": 0.26185858249664307, + "learning_rate": 6.395824229258435e-05, + "loss": 1.7863, + "step": 13934 + }, + { + "epoch": 4.277163904235728, + "grad_norm": 0.24470919370651245, + "learning_rate": 6.39534692773522e-05, + "loss": 1.7774, + "step": 13935 + }, + { + "epoch": 4.277470841006752, + "grad_norm": 0.2612632215023041, + "learning_rate": 6.39486961242237e-05, + "loss": 1.7536, + "step": 13936 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.26870301365852356, + "learning_rate": 6.3943922833246e-05, + "loss": 1.8177, + "step": 13937 + }, + { + "epoch": 4.278084714548803, + "grad_norm": 0.24445784091949463, + "learning_rate": 6.393914940446628e-05, + "loss": 1.7539, + "step": 
13938 + }, + { + "epoch": 4.278391651319828, + "grad_norm": 0.2622319757938385, + "learning_rate": 6.393437583793174e-05, + "loss": 1.8252, + "step": 13939 + }, + { + "epoch": 4.278698588090854, + "grad_norm": 0.2586652636528015, + "learning_rate": 6.39296021336895e-05, + "loss": 1.7975, + "step": 13940 + }, + { + "epoch": 4.279005524861878, + "grad_norm": 0.19488228857517242, + "learning_rate": 6.392482829178678e-05, + "loss": 1.7678, + "step": 13941 + }, + { + "epoch": 4.2793124616329035, + "grad_norm": 0.23956604301929474, + "learning_rate": 6.392005431227074e-05, + "loss": 1.7444, + "step": 13942 + }, + { + "epoch": 4.279619398403929, + "grad_norm": 0.24195842444896698, + "learning_rate": 6.391528019518857e-05, + "loss": 1.8116, + "step": 13943 + }, + { + "epoch": 4.279926335174954, + "grad_norm": 0.21479523181915283, + "learning_rate": 6.391050594058746e-05, + "loss": 1.7351, + "step": 13944 + }, + { + "epoch": 4.2802332719459795, + "grad_norm": 0.2309941202402115, + "learning_rate": 6.390573154851456e-05, + "loss": 1.8245, + "step": 13945 + }, + { + "epoch": 4.280540208717004, + "grad_norm": 0.2375536412000656, + "learning_rate": 6.390095701901706e-05, + "loss": 1.7921, + "step": 13946 + }, + { + "epoch": 4.280847145488029, + "grad_norm": 0.25518664717674255, + "learning_rate": 6.389618235214216e-05, + "loss": 1.7549, + "step": 13947 + }, + { + "epoch": 4.281154082259055, + "grad_norm": 0.2579016089439392, + "learning_rate": 6.389140754793705e-05, + "loss": 1.7637, + "step": 13948 + }, + { + "epoch": 4.28146101903008, + "grad_norm": 0.25350916385650635, + "learning_rate": 6.388663260644892e-05, + "loss": 1.746, + "step": 13949 + }, + { + "epoch": 4.281767955801105, + "grad_norm": 0.2994026839733124, + "learning_rate": 6.388185752772493e-05, + "loss": 1.8196, + "step": 13950 + }, + { + "epoch": 4.28207489257213, + "grad_norm": 0.29938533902168274, + "learning_rate": 6.387708231181229e-05, + "loss": 1.7187, + "step": 13951 + }, + { + "epoch": 4.282381829343155, 
+ "grad_norm": 0.23865137994289398, + "learning_rate": 6.387230695875819e-05, + "loss": 1.7317, + "step": 13952 + }, + { + "epoch": 4.28268876611418, + "grad_norm": 0.23812857270240784, + "learning_rate": 6.386753146860982e-05, + "loss": 1.7536, + "step": 13953 + }, + { + "epoch": 4.282995702885206, + "grad_norm": 0.3395650088787079, + "learning_rate": 6.386275584141438e-05, + "loss": 1.7932, + "step": 13954 + }, + { + "epoch": 4.283302639656231, + "grad_norm": 0.38207507133483887, + "learning_rate": 6.385798007721906e-05, + "loss": 1.8196, + "step": 13955 + }, + { + "epoch": 4.283609576427256, + "grad_norm": 0.32960978150367737, + "learning_rate": 6.385320417607107e-05, + "loss": 1.7898, + "step": 13956 + }, + { + "epoch": 4.283916513198281, + "grad_norm": 0.22978928685188293, + "learning_rate": 6.384842813801757e-05, + "loss": 1.7835, + "step": 13957 + }, + { + "epoch": 4.284223449969306, + "grad_norm": 0.24607588350772858, + "learning_rate": 6.38436519631058e-05, + "loss": 1.7829, + "step": 13958 + }, + { + "epoch": 4.2845303867403315, + "grad_norm": 0.2770270109176636, + "learning_rate": 6.383887565138295e-05, + "loss": 1.7294, + "step": 13959 + }, + { + "epoch": 4.284837323511357, + "grad_norm": 0.27644863724708557, + "learning_rate": 6.383409920289622e-05, + "loss": 1.829, + "step": 13960 + }, + { + "epoch": 4.285144260282382, + "grad_norm": 0.3870919942855835, + "learning_rate": 6.382932261769282e-05, + "loss": 1.8146, + "step": 13961 + }, + { + "epoch": 4.285451197053407, + "grad_norm": 0.3562348186969757, + "learning_rate": 6.382454589581994e-05, + "loss": 1.8225, + "step": 13962 + }, + { + "epoch": 4.285758133824432, + "grad_norm": 0.28444886207580566, + "learning_rate": 6.38197690373248e-05, + "loss": 1.7734, + "step": 13963 + }, + { + "epoch": 4.286065070595457, + "grad_norm": 0.27935758233070374, + "learning_rate": 6.381499204225459e-05, + "loss": 1.7402, + "step": 13964 + }, + { + "epoch": 4.286372007366483, + "grad_norm": 0.34188997745513916, + 
"learning_rate": 6.381021491065653e-05, + "loss": 1.7661, + "step": 13965 + }, + { + "epoch": 4.286678944137508, + "grad_norm": 0.28648918867111206, + "learning_rate": 6.380543764257785e-05, + "loss": 1.8312, + "step": 13966 + }, + { + "epoch": 4.286985880908533, + "grad_norm": 0.2733290493488312, + "learning_rate": 6.380066023806572e-05, + "loss": 1.7505, + "step": 13967 + }, + { + "epoch": 4.287292817679558, + "grad_norm": 0.3344273865222931, + "learning_rate": 6.37958826971674e-05, + "loss": 1.8392, + "step": 13968 + }, + { + "epoch": 4.287599754450583, + "grad_norm": 0.2655799090862274, + "learning_rate": 6.379110501993006e-05, + "loss": 1.7575, + "step": 13969 + }, + { + "epoch": 4.287906691221608, + "grad_norm": 0.2569151818752289, + "learning_rate": 6.378632720640095e-05, + "loss": 1.6619, + "step": 13970 + }, + { + "epoch": 4.288213627992634, + "grad_norm": 0.2477198988199234, + "learning_rate": 6.378154925662727e-05, + "loss": 1.7532, + "step": 13971 + }, + { + "epoch": 4.288520564763659, + "grad_norm": 0.2867630422115326, + "learning_rate": 6.377677117065624e-05, + "loss": 1.7725, + "step": 13972 + }, + { + "epoch": 4.2888275015346835, + "grad_norm": 0.28316137194633484, + "learning_rate": 6.37719929485351e-05, + "loss": 1.7628, + "step": 13973 + }, + { + "epoch": 4.289134438305709, + "grad_norm": 0.2934304475784302, + "learning_rate": 6.376721459031106e-05, + "loss": 1.7346, + "step": 13974 + }, + { + "epoch": 4.289441375076734, + "grad_norm": 0.22847147285938263, + "learning_rate": 6.376243609603129e-05, + "loss": 1.7409, + "step": 13975 + }, + { + "epoch": 4.2897483118477595, + "grad_norm": 0.360441118478775, + "learning_rate": 6.375765746574311e-05, + "loss": 1.808, + "step": 13976 + }, + { + "epoch": 4.290055248618785, + "grad_norm": 0.2750907242298126, + "learning_rate": 6.375287869949367e-05, + "loss": 1.8046, + "step": 13977 + }, + { + "epoch": 4.290362185389809, + "grad_norm": 0.26193201541900635, + "learning_rate": 6.374809979733022e-05, + 
"loss": 1.7097, + "step": 13978 + }, + { + "epoch": 4.290669122160835, + "grad_norm": 0.3282175064086914, + "learning_rate": 6.37433207593e-05, + "loss": 1.7924, + "step": 13979 + }, + { + "epoch": 4.29097605893186, + "grad_norm": 0.2845167815685272, + "learning_rate": 6.373854158545021e-05, + "loss": 1.7663, + "step": 13980 + }, + { + "epoch": 4.291282995702885, + "grad_norm": 0.21816621720790863, + "learning_rate": 6.37337622758281e-05, + "loss": 1.7368, + "step": 13981 + }, + { + "epoch": 4.291589932473911, + "grad_norm": 0.264272540807724, + "learning_rate": 6.372898283048094e-05, + "loss": 1.7377, + "step": 13982 + }, + { + "epoch": 4.291896869244935, + "grad_norm": 0.2182006686925888, + "learning_rate": 6.37242032494559e-05, + "loss": 1.8107, + "step": 13983 + }, + { + "epoch": 4.29220380601596, + "grad_norm": 0.26856422424316406, + "learning_rate": 6.371942353280023e-05, + "loss": 1.7708, + "step": 13984 + }, + { + "epoch": 4.292510742786986, + "grad_norm": 0.3025323748588562, + "learning_rate": 6.37146436805612e-05, + "loss": 1.7768, + "step": 13985 + }, + { + "epoch": 4.292817679558011, + "grad_norm": 0.2949144244194031, + "learning_rate": 6.3709863692786e-05, + "loss": 1.7848, + "step": 13986 + }, + { + "epoch": 4.293124616329036, + "grad_norm": 0.20670418441295624, + "learning_rate": 6.370508356952188e-05, + "loss": 1.7367, + "step": 13987 + }, + { + "epoch": 4.293431553100062, + "grad_norm": 0.2453860342502594, + "learning_rate": 6.370030331081611e-05, + "loss": 1.7246, + "step": 13988 + }, + { + "epoch": 4.293738489871086, + "grad_norm": 0.3413507044315338, + "learning_rate": 6.369552291671592e-05, + "loss": 1.7829, + "step": 13989 + }, + { + "epoch": 4.2940454266421115, + "grad_norm": 0.28352782130241394, + "learning_rate": 6.369074238726856e-05, + "loss": 1.7755, + "step": 13990 + }, + { + "epoch": 4.294352363413137, + "grad_norm": 0.21408751606941223, + "learning_rate": 6.368596172252124e-05, + "loss": 1.7292, + "step": 13991 + }, + { + "epoch": 
4.294659300184162, + "grad_norm": 0.28372085094451904, + "learning_rate": 6.36811809225212e-05, + "loss": 1.8197, + "step": 13992 + }, + { + "epoch": 4.2949662369551875, + "grad_norm": 0.2400829792022705, + "learning_rate": 6.367639998731573e-05, + "loss": 1.7559, + "step": 13993 + }, + { + "epoch": 4.295273173726212, + "grad_norm": 0.22853593528270721, + "learning_rate": 6.367161891695207e-05, + "loss": 1.8116, + "step": 13994 + }, + { + "epoch": 4.295580110497237, + "grad_norm": 0.22098208963871002, + "learning_rate": 6.366683771147745e-05, + "loss": 1.7269, + "step": 13995 + }, + { + "epoch": 4.295887047268263, + "grad_norm": 0.22293934226036072, + "learning_rate": 6.366205637093914e-05, + "loss": 1.7944, + "step": 13996 + }, + { + "epoch": 4.296193984039288, + "grad_norm": 0.26120004057884216, + "learning_rate": 6.365727489538437e-05, + "loss": 1.7581, + "step": 13997 + }, + { + "epoch": 4.296500920810313, + "grad_norm": 0.2568937838077545, + "learning_rate": 6.365249328486041e-05, + "loss": 1.7356, + "step": 13998 + }, + { + "epoch": 4.296807857581339, + "grad_norm": 0.2419043630361557, + "learning_rate": 6.364771153941449e-05, + "loss": 1.8127, + "step": 13999 + }, + { + "epoch": 4.297114794352363, + "grad_norm": 0.2521972060203552, + "learning_rate": 6.364292965909391e-05, + "loss": 1.7445, + "step": 14000 + }, + { + "epoch": 4.297421731123388, + "grad_norm": 0.3269292414188385, + "learning_rate": 6.363814764394589e-05, + "loss": 1.7835, + "step": 14001 + }, + { + "epoch": 4.297728667894414, + "grad_norm": 0.258405864238739, + "learning_rate": 6.36333654940177e-05, + "loss": 1.7407, + "step": 14002 + }, + { + "epoch": 4.298035604665439, + "grad_norm": 0.21527236700057983, + "learning_rate": 6.362858320935662e-05, + "loss": 1.7729, + "step": 14003 + }, + { + "epoch": 4.298342541436464, + "grad_norm": 0.25343602895736694, + "learning_rate": 6.362380079000988e-05, + "loss": 1.8087, + "step": 14004 + }, + { + "epoch": 4.298649478207489, + "grad_norm": 
0.26110637187957764, + "learning_rate": 6.361901823602474e-05, + "loss": 1.813, + "step": 14005 + }, + { + "epoch": 4.298956414978514, + "grad_norm": 0.26749926805496216, + "learning_rate": 6.361423554744851e-05, + "loss": 1.8193, + "step": 14006 + }, + { + "epoch": 4.2992633517495396, + "grad_norm": 0.22357676923274994, + "learning_rate": 6.360945272432841e-05, + "loss": 1.7498, + "step": 14007 + }, + { + "epoch": 4.299570288520565, + "grad_norm": 0.2367832362651825, + "learning_rate": 6.360466976671172e-05, + "loss": 1.7843, + "step": 14008 + }, + { + "epoch": 4.29987722529159, + "grad_norm": 0.23594366014003754, + "learning_rate": 6.35998866746457e-05, + "loss": 1.7442, + "step": 14009 + }, + { + "epoch": 4.300184162062616, + "grad_norm": 0.2660543918609619, + "learning_rate": 6.359510344817765e-05, + "loss": 1.7557, + "step": 14010 + }, + { + "epoch": 4.30049109883364, + "grad_norm": 0.191593199968338, + "learning_rate": 6.359032008735481e-05, + "loss": 1.7988, + "step": 14011 + }, + { + "epoch": 4.300798035604665, + "grad_norm": 0.2755490243434906, + "learning_rate": 6.358553659222447e-05, + "loss": 1.7551, + "step": 14012 + }, + { + "epoch": 4.301104972375691, + "grad_norm": 0.2900530993938446, + "learning_rate": 6.358075296283387e-05, + "loss": 1.7523, + "step": 14013 + }, + { + "epoch": 4.301411909146716, + "grad_norm": 0.22242774069309235, + "learning_rate": 6.357596919923033e-05, + "loss": 1.7626, + "step": 14014 + }, + { + "epoch": 4.301718845917741, + "grad_norm": 0.26636210083961487, + "learning_rate": 6.357118530146108e-05, + "loss": 1.7855, + "step": 14015 + }, + { + "epoch": 4.302025782688766, + "grad_norm": 0.3055269718170166, + "learning_rate": 6.356640126957344e-05, + "loss": 1.7528, + "step": 14016 + }, + { + "epoch": 4.302332719459791, + "grad_norm": 0.29695719480514526, + "learning_rate": 6.356161710361468e-05, + "loss": 1.7482, + "step": 14017 + }, + { + "epoch": 4.3026396562308165, + "grad_norm": 0.2369711697101593, + "learning_rate": 
6.355683280363207e-05, + "loss": 1.7635, + "step": 14018 + }, + { + "epoch": 4.302946593001842, + "grad_norm": 0.26681363582611084, + "learning_rate": 6.35520483696729e-05, + "loss": 1.8814, + "step": 14019 + }, + { + "epoch": 4.303253529772867, + "grad_norm": 0.2623308598995209, + "learning_rate": 6.354726380178442e-05, + "loss": 1.8645, + "step": 14020 + }, + { + "epoch": 4.303560466543892, + "grad_norm": 0.23326413333415985, + "learning_rate": 6.354247910001394e-05, + "loss": 1.8093, + "step": 14021 + }, + { + "epoch": 4.303867403314917, + "grad_norm": 0.3037295639514923, + "learning_rate": 6.353769426440875e-05, + "loss": 1.8556, + "step": 14022 + }, + { + "epoch": 4.304174340085942, + "grad_norm": 0.23624882102012634, + "learning_rate": 6.353290929501616e-05, + "loss": 1.803, + "step": 14023 + }, + { + "epoch": 4.304481276856968, + "grad_norm": 0.22106927633285522, + "learning_rate": 6.35281241918834e-05, + "loss": 1.7133, + "step": 14024 + }, + { + "epoch": 4.304788213627993, + "grad_norm": 0.2374040186405182, + "learning_rate": 6.352333895505778e-05, + "loss": 1.8127, + "step": 14025 + }, + { + "epoch": 4.305095150399017, + "grad_norm": 0.2782450318336487, + "learning_rate": 6.35185535845866e-05, + "loss": 1.8613, + "step": 14026 + }, + { + "epoch": 4.305402087170043, + "grad_norm": 0.2527763843536377, + "learning_rate": 6.351376808051717e-05, + "loss": 1.7533, + "step": 14027 + }, + { + "epoch": 4.305709023941068, + "grad_norm": 0.2462318390607834, + "learning_rate": 6.350898244289675e-05, + "loss": 1.8075, + "step": 14028 + }, + { + "epoch": 4.306015960712093, + "grad_norm": 0.2646189332008362, + "learning_rate": 6.350419667177265e-05, + "loss": 1.8261, + "step": 14029 + }, + { + "epoch": 4.306322897483119, + "grad_norm": 0.24918611347675323, + "learning_rate": 6.349941076719218e-05, + "loss": 1.7542, + "step": 14030 + }, + { + "epoch": 4.306629834254144, + "grad_norm": 0.22440841794013977, + "learning_rate": 6.349462472920259e-05, + "loss": 1.7897, + 
"step": 14031 + }, + { + "epoch": 4.3069367710251685, + "grad_norm": 0.28614330291748047, + "learning_rate": 6.348983855785121e-05, + "loss": 1.88, + "step": 14032 + }, + { + "epoch": 4.307243707796194, + "grad_norm": 0.25015848875045776, + "learning_rate": 6.348505225318535e-05, + "loss": 1.8008, + "step": 14033 + }, + { + "epoch": 4.307550644567219, + "grad_norm": 0.2468707263469696, + "learning_rate": 6.34802658152523e-05, + "loss": 1.8025, + "step": 14034 + }, + { + "epoch": 4.3078575813382445, + "grad_norm": 0.30504748225212097, + "learning_rate": 6.347547924409937e-05, + "loss": 1.8765, + "step": 14035 + }, + { + "epoch": 4.30816451810927, + "grad_norm": 0.35419392585754395, + "learning_rate": 6.347069253977385e-05, + "loss": 1.7807, + "step": 14036 + }, + { + "epoch": 4.308471454880294, + "grad_norm": 0.33683931827545166, + "learning_rate": 6.346590570232305e-05, + "loss": 1.7244, + "step": 14037 + }, + { + "epoch": 4.30877839165132, + "grad_norm": 0.3339467942714691, + "learning_rate": 6.346111873179427e-05, + "loss": 1.7642, + "step": 14038 + }, + { + "epoch": 4.309085328422345, + "grad_norm": 0.2369392216205597, + "learning_rate": 6.345633162823484e-05, + "loss": 1.7127, + "step": 14039 + }, + { + "epoch": 4.30939226519337, + "grad_norm": 0.26469686627388, + "learning_rate": 6.345154439169206e-05, + "loss": 1.7235, + "step": 14040 + }, + { + "epoch": 4.309699201964396, + "grad_norm": 0.2737344205379486, + "learning_rate": 6.344675702221321e-05, + "loss": 1.783, + "step": 14041 + }, + { + "epoch": 4.310006138735421, + "grad_norm": 0.2381773442029953, + "learning_rate": 6.344196951984565e-05, + "loss": 1.7172, + "step": 14042 + }, + { + "epoch": 4.310313075506445, + "grad_norm": 0.28199076652526855, + "learning_rate": 6.343718188463663e-05, + "loss": 1.8315, + "step": 14043 + }, + { + "epoch": 4.310620012277471, + "grad_norm": 0.24378590285778046, + "learning_rate": 6.343239411663353e-05, + "loss": 1.7828, + "step": 14044 + }, + { + "epoch": 
4.310926949048496, + "grad_norm": 0.26343944668769836, + "learning_rate": 6.342760621588365e-05, + "loss": 1.7679, + "step": 14045 + }, + { + "epoch": 4.311233885819521, + "grad_norm": 0.23703521490097046, + "learning_rate": 6.342281818243427e-05, + "loss": 1.7885, + "step": 14046 + }, + { + "epoch": 4.311540822590547, + "grad_norm": 0.2230173498392105, + "learning_rate": 6.341803001633276e-05, + "loss": 1.767, + "step": 14047 + }, + { + "epoch": 4.311847759361571, + "grad_norm": 0.249002143740654, + "learning_rate": 6.34132417176264e-05, + "loss": 1.8032, + "step": 14048 + }, + { + "epoch": 4.3121546961325965, + "grad_norm": 0.2383791208267212, + "learning_rate": 6.34084532863625e-05, + "loss": 1.7558, + "step": 14049 + }, + { + "epoch": 4.312461632903622, + "grad_norm": 0.2783047556877136, + "learning_rate": 6.340366472258843e-05, + "loss": 1.8389, + "step": 14050 + }, + { + "epoch": 4.312768569674647, + "grad_norm": 0.2654891312122345, + "learning_rate": 6.339887602635148e-05, + "loss": 1.7989, + "step": 14051 + }, + { + "epoch": 4.3130755064456725, + "grad_norm": 0.2638411521911621, + "learning_rate": 6.3394087197699e-05, + "loss": 1.8707, + "step": 14052 + }, + { + "epoch": 4.313382443216697, + "grad_norm": 0.3026179075241089, + "learning_rate": 6.338929823667829e-05, + "loss": 1.7892, + "step": 14053 + }, + { + "epoch": 4.313689379987722, + "grad_norm": 0.27496880292892456, + "learning_rate": 6.338450914333668e-05, + "loss": 1.7398, + "step": 14054 + }, + { + "epoch": 4.313996316758748, + "grad_norm": 0.2601073086261749, + "learning_rate": 6.337971991772151e-05, + "loss": 1.7646, + "step": 14055 + }, + { + "epoch": 4.314303253529773, + "grad_norm": 0.2061719298362732, + "learning_rate": 6.337493055988011e-05, + "loss": 1.7372, + "step": 14056 + }, + { + "epoch": 4.314610190300798, + "grad_norm": 0.23722340166568756, + "learning_rate": 6.337014106985981e-05, + "loss": 1.7457, + "step": 14057 + }, + { + "epoch": 4.314917127071823, + "grad_norm": 
0.2729428708553314, + "learning_rate": 6.336535144770793e-05, + "loss": 1.8423, + "step": 14058 + }, + { + "epoch": 4.315224063842848, + "grad_norm": 0.23520450294017792, + "learning_rate": 6.336056169347182e-05, + "loss": 1.8124, + "step": 14059 + }, + { + "epoch": 4.315531000613873, + "grad_norm": 0.25142738223075867, + "learning_rate": 6.33557718071988e-05, + "loss": 1.7285, + "step": 14060 + }, + { + "epoch": 4.315837937384899, + "grad_norm": 0.24833035469055176, + "learning_rate": 6.335098178893621e-05, + "loss": 1.766, + "step": 14061 + }, + { + "epoch": 4.316144874155924, + "grad_norm": 0.2406177669763565, + "learning_rate": 6.334619163873141e-05, + "loss": 1.8824, + "step": 14062 + }, + { + "epoch": 4.316451810926949, + "grad_norm": 0.23077574372291565, + "learning_rate": 6.334140135663172e-05, + "loss": 1.7589, + "step": 14063 + }, + { + "epoch": 4.316758747697974, + "grad_norm": 0.20476560294628143, + "learning_rate": 6.333661094268448e-05, + "loss": 1.7331, + "step": 14064 + }, + { + "epoch": 4.317065684468999, + "grad_norm": 0.207991823554039, + "learning_rate": 6.333182039693704e-05, + "loss": 1.6876, + "step": 14065 + }, + { + "epoch": 4.3173726212400245, + "grad_norm": 0.20813052356243134, + "learning_rate": 6.332702971943671e-05, + "loss": 1.775, + "step": 14066 + }, + { + "epoch": 4.31767955801105, + "grad_norm": 0.2470991462469101, + "learning_rate": 6.332223891023087e-05, + "loss": 1.7673, + "step": 14067 + }, + { + "epoch": 4.317986494782075, + "grad_norm": 0.23855723440647125, + "learning_rate": 6.331744796936687e-05, + "loss": 1.7842, + "step": 14068 + }, + { + "epoch": 4.3182934315531, + "grad_norm": 0.21852652728557587, + "learning_rate": 6.331265689689204e-05, + "loss": 1.7727, + "step": 14069 + }, + { + "epoch": 4.318600368324125, + "grad_norm": 0.284496545791626, + "learning_rate": 6.330786569285374e-05, + "loss": 1.8248, + "step": 14070 + }, + { + "epoch": 4.31890730509515, + "grad_norm": 0.21709981560707092, + "learning_rate": 
6.33030743572993e-05, + "loss": 1.7547, + "step": 14071 + }, + { + "epoch": 4.319214241866176, + "grad_norm": 0.24209457635879517, + "learning_rate": 6.329828289027608e-05, + "loss": 1.7695, + "step": 14072 + }, + { + "epoch": 4.319521178637201, + "grad_norm": 0.24869373440742493, + "learning_rate": 6.329349129183144e-05, + "loss": 1.8204, + "step": 14073 + }, + { + "epoch": 4.319828115408226, + "grad_norm": 0.21702703833580017, + "learning_rate": 6.328869956201274e-05, + "loss": 1.779, + "step": 14074 + }, + { + "epoch": 4.320135052179251, + "grad_norm": 0.22993850708007812, + "learning_rate": 6.328390770086731e-05, + "loss": 1.7935, + "step": 14075 + }, + { + "epoch": 4.320441988950276, + "grad_norm": 0.23491734266281128, + "learning_rate": 6.327911570844252e-05, + "loss": 1.7261, + "step": 14076 + }, + { + "epoch": 4.320748925721301, + "grad_norm": 0.2479303777217865, + "learning_rate": 6.327432358478571e-05, + "loss": 1.7683, + "step": 14077 + }, + { + "epoch": 4.321055862492327, + "grad_norm": 0.24261580407619476, + "learning_rate": 6.326953132994427e-05, + "loss": 1.7147, + "step": 14078 + }, + { + "epoch": 4.321362799263352, + "grad_norm": 0.24627646803855896, + "learning_rate": 6.326473894396553e-05, + "loss": 1.7976, + "step": 14079 + }, + { + "epoch": 4.3216697360343765, + "grad_norm": 0.269149512052536, + "learning_rate": 6.325994642689688e-05, + "loss": 1.7247, + "step": 14080 + }, + { + "epoch": 4.321976672805402, + "grad_norm": 0.4162158966064453, + "learning_rate": 6.325515377878566e-05, + "loss": 1.7485, + "step": 14081 + }, + { + "epoch": 4.322283609576427, + "grad_norm": 0.366459459066391, + "learning_rate": 6.325036099967925e-05, + "loss": 1.7286, + "step": 14082 + }, + { + "epoch": 4.3225905463474525, + "grad_norm": 0.2465270757675171, + "learning_rate": 6.324556808962499e-05, + "loss": 1.8097, + "step": 14083 + }, + { + "epoch": 4.322897483118478, + "grad_norm": 0.2911076843738556, + "learning_rate": 6.324077504867026e-05, + "loss": 1.7979, + 
"step": 14084 + }, + { + "epoch": 4.323204419889503, + "grad_norm": 0.33455169200897217, + "learning_rate": 6.323598187686245e-05, + "loss": 1.7988, + "step": 14085 + }, + { + "epoch": 4.323511356660528, + "grad_norm": 0.25020337104797363, + "learning_rate": 6.32311885742489e-05, + "loss": 1.7184, + "step": 14086 + }, + { + "epoch": 4.323818293431553, + "grad_norm": 0.23941513895988464, + "learning_rate": 6.322639514087699e-05, + "loss": 1.7672, + "step": 14087 + }, + { + "epoch": 4.324125230202578, + "grad_norm": 0.35258981585502625, + "learning_rate": 6.32216015767941e-05, + "loss": 1.7571, + "step": 14088 + }, + { + "epoch": 4.324432166973604, + "grad_norm": 0.2854993939399719, + "learning_rate": 6.321680788204758e-05, + "loss": 1.8096, + "step": 14089 + }, + { + "epoch": 4.324739103744629, + "grad_norm": 0.24422863125801086, + "learning_rate": 6.321201405668482e-05, + "loss": 1.778, + "step": 14090 + }, + { + "epoch": 4.3250460405156534, + "grad_norm": 0.36629122495651245, + "learning_rate": 6.320722010075321e-05, + "loss": 1.716, + "step": 14091 + }, + { + "epoch": 4.325352977286679, + "grad_norm": 0.37115517258644104, + "learning_rate": 6.32024260143001e-05, + "loss": 1.77, + "step": 14092 + }, + { + "epoch": 4.325659914057704, + "grad_norm": 0.21540327370166779, + "learning_rate": 6.319763179737288e-05, + "loss": 1.7529, + "step": 14093 + }, + { + "epoch": 4.3259668508287294, + "grad_norm": 0.2573898732662201, + "learning_rate": 6.319283745001892e-05, + "loss": 1.8101, + "step": 14094 + }, + { + "epoch": 4.326273787599755, + "grad_norm": 0.29481247067451477, + "learning_rate": 6.31880429722856e-05, + "loss": 1.7459, + "step": 14095 + }, + { + "epoch": 4.326580724370779, + "grad_norm": 0.23474647104740143, + "learning_rate": 6.318324836422031e-05, + "loss": 1.786, + "step": 14096 + }, + { + "epoch": 4.326887661141805, + "grad_norm": 0.2884673476219177, + "learning_rate": 6.317845362587045e-05, + "loss": 1.8123, + "step": 14097 + }, + { + "epoch": 
4.32719459791283, + "grad_norm": 0.39008447527885437, + "learning_rate": 6.317365875728338e-05, + "loss": 1.7729, + "step": 14098 + }, + { + "epoch": 4.327501534683855, + "grad_norm": 0.30568063259124756, + "learning_rate": 6.316886375850651e-05, + "loss": 1.7088, + "step": 14099 + }, + { + "epoch": 4.327808471454881, + "grad_norm": 0.2538018524646759, + "learning_rate": 6.316406862958718e-05, + "loss": 1.8028, + "step": 14100 + }, + { + "epoch": 4.328115408225905, + "grad_norm": 0.3815068006515503, + "learning_rate": 6.315927337057281e-05, + "loss": 1.7143, + "step": 14101 + }, + { + "epoch": 4.32842234499693, + "grad_norm": 0.3813243508338928, + "learning_rate": 6.31544779815108e-05, + "loss": 1.7072, + "step": 14102 + }, + { + "epoch": 4.328729281767956, + "grad_norm": 0.22438868880271912, + "learning_rate": 6.314968246244852e-05, + "loss": 1.7445, + "step": 14103 + }, + { + "epoch": 4.329036218538981, + "grad_norm": 0.3818886876106262, + "learning_rate": 6.314488681343337e-05, + "loss": 1.8292, + "step": 14104 + }, + { + "epoch": 4.329343155310006, + "grad_norm": 0.4376567006111145, + "learning_rate": 6.314009103451277e-05, + "loss": 1.8224, + "step": 14105 + }, + { + "epoch": 4.329650092081032, + "grad_norm": 0.2741515636444092, + "learning_rate": 6.313529512573406e-05, + "loss": 1.8078, + "step": 14106 + }, + { + "epoch": 4.329957028852056, + "grad_norm": 0.264343798160553, + "learning_rate": 6.313049908714467e-05, + "loss": 1.7314, + "step": 14107 + }, + { + "epoch": 4.3302639656230815, + "grad_norm": 0.3601943552494049, + "learning_rate": 6.312570291879201e-05, + "loss": 1.7351, + "step": 14108 + }, + { + "epoch": 4.330570902394107, + "grad_norm": 0.2931751012802124, + "learning_rate": 6.312090662072345e-05, + "loss": 1.8117, + "step": 14109 + }, + { + "epoch": 4.330877839165132, + "grad_norm": 0.27670225501060486, + "learning_rate": 6.31161101929864e-05, + "loss": 1.7707, + "step": 14110 + }, + { + "epoch": 4.3311847759361575, + "grad_norm": 
0.33669596910476685, + "learning_rate": 6.311131363562825e-05, + "loss": 1.7337, + "step": 14111 + }, + { + "epoch": 4.331491712707182, + "grad_norm": 0.232634037733078, + "learning_rate": 6.310651694869643e-05, + "loss": 1.7372, + "step": 14112 + }, + { + "epoch": 4.331798649478207, + "grad_norm": 0.28611311316490173, + "learning_rate": 6.310172013223832e-05, + "loss": 1.6977, + "step": 14113 + }, + { + "epoch": 4.332105586249233, + "grad_norm": 0.30207201838493347, + "learning_rate": 6.309692318630132e-05, + "loss": 1.7765, + "step": 14114 + }, + { + "epoch": 4.332412523020258, + "grad_norm": 0.20757484436035156, + "learning_rate": 6.309212611093287e-05, + "loss": 1.697, + "step": 14115 + }, + { + "epoch": 4.332719459791283, + "grad_norm": 0.31472963094711304, + "learning_rate": 6.308732890618034e-05, + "loss": 1.7757, + "step": 14116 + }, + { + "epoch": 4.333026396562309, + "grad_norm": 0.37042325735092163, + "learning_rate": 6.308253157209117e-05, + "loss": 1.7745, + "step": 14117 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.25001442432403564, + "learning_rate": 6.307773410871275e-05, + "loss": 1.7461, + "step": 14118 + }, + { + "epoch": 4.333640270104358, + "grad_norm": 0.2691943347454071, + "learning_rate": 6.307293651609248e-05, + "loss": 1.7539, + "step": 14119 + }, + { + "epoch": 4.333947206875384, + "grad_norm": 0.30845868587493896, + "learning_rate": 6.306813879427782e-05, + "loss": 1.7559, + "step": 14120 + }, + { + "epoch": 4.334254143646409, + "grad_norm": 0.2244730293750763, + "learning_rate": 6.306334094331613e-05, + "loss": 1.7609, + "step": 14121 + }, + { + "epoch": 4.334561080417434, + "grad_norm": 0.32132062315940857, + "learning_rate": 6.305854296325485e-05, + "loss": 1.7837, + "step": 14122 + }, + { + "epoch": 4.334868017188459, + "grad_norm": 0.3762948513031006, + "learning_rate": 6.30537448541414e-05, + "loss": 1.7631, + "step": 14123 + }, + { + "epoch": 4.335174953959484, + "grad_norm": 0.24174273014068604, + "learning_rate": 
6.30489466160232e-05, + "loss": 1.7532, + "step": 14124 + }, + { + "epoch": 4.3354818907305095, + "grad_norm": 0.23468497395515442, + "learning_rate": 6.304414824894765e-05, + "loss": 1.7731, + "step": 14125 + }, + { + "epoch": 4.335788827501535, + "grad_norm": 0.29086077213287354, + "learning_rate": 6.303934975296218e-05, + "loss": 1.7668, + "step": 14126 + }, + { + "epoch": 4.33609576427256, + "grad_norm": 0.2889879643917084, + "learning_rate": 6.303455112811422e-05, + "loss": 1.8188, + "step": 14127 + }, + { + "epoch": 4.336402701043585, + "grad_norm": 0.2335619181394577, + "learning_rate": 6.302975237445119e-05, + "loss": 1.7944, + "step": 14128 + }, + { + "epoch": 4.33670963781461, + "grad_norm": 0.29027310013771057, + "learning_rate": 6.302495349202051e-05, + "loss": 1.7771, + "step": 14129 + }, + { + "epoch": 4.337016574585635, + "grad_norm": 0.31961241364479065, + "learning_rate": 6.302015448086959e-05, + "loss": 1.8187, + "step": 14130 + }, + { + "epoch": 4.337323511356661, + "grad_norm": 0.26015788316726685, + "learning_rate": 6.301535534104587e-05, + "loss": 1.7819, + "step": 14131 + }, + { + "epoch": 4.337630448127686, + "grad_norm": 0.2440631091594696, + "learning_rate": 6.30105560725968e-05, + "loss": 1.7127, + "step": 14132 + }, + { + "epoch": 4.337937384898711, + "grad_norm": 0.304441899061203, + "learning_rate": 6.300575667556979e-05, + "loss": 1.7619, + "step": 14133 + }, + { + "epoch": 4.338244321669736, + "grad_norm": 0.3085228204727173, + "learning_rate": 6.300095715001226e-05, + "loss": 1.8287, + "step": 14134 + }, + { + "epoch": 4.338551258440761, + "grad_norm": 0.2863372564315796, + "learning_rate": 6.299615749597165e-05, + "loss": 1.8068, + "step": 14135 + }, + { + "epoch": 4.338858195211786, + "grad_norm": 0.25255265831947327, + "learning_rate": 6.299135771349537e-05, + "loss": 1.7506, + "step": 14136 + }, + { + "epoch": 4.339165131982812, + "grad_norm": 0.30224961042404175, + "learning_rate": 6.298655780263092e-05, + "loss": 1.7292, + 
"step": 14137 + }, + { + "epoch": 4.339472068753837, + "grad_norm": 0.24222104251384735, + "learning_rate": 6.298175776342567e-05, + "loss": 1.7616, + "step": 14138 + }, + { + "epoch": 4.3397790055248615, + "grad_norm": 0.3236368000507355, + "learning_rate": 6.29769575959271e-05, + "loss": 1.787, + "step": 14139 + }, + { + "epoch": 4.340085942295887, + "grad_norm": 0.26049408316612244, + "learning_rate": 6.297215730018261e-05, + "loss": 1.7108, + "step": 14140 + }, + { + "epoch": 4.340392879066912, + "grad_norm": 0.22833532094955444, + "learning_rate": 6.296735687623967e-05, + "loss": 1.7661, + "step": 14141 + }, + { + "epoch": 4.3406998158379375, + "grad_norm": 0.28397905826568604, + "learning_rate": 6.296255632414571e-05, + "loss": 1.7163, + "step": 14142 + }, + { + "epoch": 4.341006752608963, + "grad_norm": 0.3072611093521118, + "learning_rate": 6.295775564394817e-05, + "loss": 1.857, + "step": 14143 + }, + { + "epoch": 4.341313689379987, + "grad_norm": 0.22901058197021484, + "learning_rate": 6.295295483569448e-05, + "loss": 1.7325, + "step": 14144 + }, + { + "epoch": 4.341620626151013, + "grad_norm": 0.27433091402053833, + "learning_rate": 6.294815389943212e-05, + "loss": 1.8229, + "step": 14145 + }, + { + "epoch": 4.341927562922038, + "grad_norm": 0.2635616958141327, + "learning_rate": 6.29433528352085e-05, + "loss": 1.7585, + "step": 14146 + }, + { + "epoch": 4.342234499693063, + "grad_norm": 0.29129260778427124, + "learning_rate": 6.293855164307108e-05, + "loss": 1.8294, + "step": 14147 + }, + { + "epoch": 4.342541436464089, + "grad_norm": 0.3429001569747925, + "learning_rate": 6.293375032306731e-05, + "loss": 1.7725, + "step": 14148 + }, + { + "epoch": 4.342848373235114, + "grad_norm": 0.22407259047031403, + "learning_rate": 6.292894887524464e-05, + "loss": 1.7018, + "step": 14149 + }, + { + "epoch": 4.343155310006138, + "grad_norm": 0.3319321274757385, + "learning_rate": 6.292414729965053e-05, + "loss": 1.8472, + "step": 14150 + }, + { + "epoch": 
4.343462246777164, + "grad_norm": 0.42744341492652893, + "learning_rate": 6.291934559633241e-05, + "loss": 1.8118, + "step": 14151 + }, + { + "epoch": 4.343769183548189, + "grad_norm": 0.24572840332984924, + "learning_rate": 6.291454376533774e-05, + "loss": 1.7184, + "step": 14152 + }, + { + "epoch": 4.344076120319214, + "grad_norm": 0.2485980987548828, + "learning_rate": 6.290974180671397e-05, + "loss": 1.7649, + "step": 14153 + }, + { + "epoch": 4.34438305709024, + "grad_norm": 0.3911706209182739, + "learning_rate": 6.29049397205086e-05, + "loss": 1.8105, + "step": 14154 + }, + { + "epoch": 4.344689993861264, + "grad_norm": 0.3008342981338501, + "learning_rate": 6.290013750676902e-05, + "loss": 1.7671, + "step": 14155 + }, + { + "epoch": 4.3449969306322895, + "grad_norm": 0.2072051614522934, + "learning_rate": 6.289533516554274e-05, + "loss": 1.7406, + "step": 14156 + }, + { + "epoch": 4.345303867403315, + "grad_norm": 0.3047312796115875, + "learning_rate": 6.289053269687719e-05, + "loss": 1.8133, + "step": 14157 + }, + { + "epoch": 4.34561080417434, + "grad_norm": 0.28260552883148193, + "learning_rate": 6.288573010081984e-05, + "loss": 1.7253, + "step": 14158 + }, + { + "epoch": 4.3459177409453655, + "grad_norm": 0.2474137246608734, + "learning_rate": 6.288092737741815e-05, + "loss": 1.822, + "step": 14159 + }, + { + "epoch": 4.346224677716391, + "grad_norm": 0.23717878758907318, + "learning_rate": 6.287612452671961e-05, + "loss": 1.7826, + "step": 14160 + }, + { + "epoch": 4.346531614487415, + "grad_norm": 0.2646107077598572, + "learning_rate": 6.287132154877163e-05, + "loss": 1.8118, + "step": 14161 + }, + { + "epoch": 4.346838551258441, + "grad_norm": 0.22026480734348297, + "learning_rate": 6.286651844362172e-05, + "loss": 1.7767, + "step": 14162 + }, + { + "epoch": 4.347145488029466, + "grad_norm": 0.2692350447177887, + "learning_rate": 6.286171521131733e-05, + "loss": 1.8718, + "step": 14163 + }, + { + "epoch": 4.347452424800491, + "grad_norm": 
0.2749998867511749, + "learning_rate": 6.285691185190592e-05, + "loss": 1.7689, + "step": 14164 + }, + { + "epoch": 4.347759361571517, + "grad_norm": 0.24552448093891144, + "learning_rate": 6.2852108365435e-05, + "loss": 1.8049, + "step": 14165 + }, + { + "epoch": 4.348066298342541, + "grad_norm": 0.20530807971954346, + "learning_rate": 6.2847304751952e-05, + "loss": 1.7606, + "step": 14166 + }, + { + "epoch": 4.348373235113566, + "grad_norm": 0.23396088182926178, + "learning_rate": 6.28425010115044e-05, + "loss": 1.7482, + "step": 14167 + }, + { + "epoch": 4.348680171884592, + "grad_norm": 0.20512452721595764, + "learning_rate": 6.283769714413968e-05, + "loss": 1.6976, + "step": 14168 + }, + { + "epoch": 4.348987108655617, + "grad_norm": 0.20287172496318817, + "learning_rate": 6.283289314990531e-05, + "loss": 1.7439, + "step": 14169 + }, + { + "epoch": 4.349294045426642, + "grad_norm": 0.2193746268749237, + "learning_rate": 6.282808902884876e-05, + "loss": 1.763, + "step": 14170 + }, + { + "epoch": 4.349600982197667, + "grad_norm": 0.20415273308753967, + "learning_rate": 6.282328478101753e-05, + "loss": 1.7025, + "step": 14171 + }, + { + "epoch": 4.349907918968692, + "grad_norm": 0.19286803901195526, + "learning_rate": 6.281848040645907e-05, + "loss": 1.7529, + "step": 14172 + }, + { + "epoch": 4.350214855739718, + "grad_norm": 0.20908218622207642, + "learning_rate": 6.281367590522088e-05, + "loss": 1.7896, + "step": 14173 + }, + { + "epoch": 4.350521792510743, + "grad_norm": 0.2599989175796509, + "learning_rate": 6.280887127735045e-05, + "loss": 1.764, + "step": 14174 + }, + { + "epoch": 4.350828729281768, + "grad_norm": 0.23955710232257843, + "learning_rate": 6.280406652289523e-05, + "loss": 1.7321, + "step": 14175 + }, + { + "epoch": 4.351135666052793, + "grad_norm": 0.2311990112066269, + "learning_rate": 6.279926164190272e-05, + "loss": 1.7338, + "step": 14176 + }, + { + "epoch": 4.351442602823818, + "grad_norm": 0.2599658966064453, + "learning_rate": 
6.27944566344204e-05, + "loss": 1.7444, + "step": 14177 + }, + { + "epoch": 4.351749539594843, + "grad_norm": 0.23079386353492737, + "learning_rate": 6.278965150049579e-05, + "loss": 1.7011, + "step": 14178 + }, + { + "epoch": 4.352056476365869, + "grad_norm": 0.24844171106815338, + "learning_rate": 6.278484624017631e-05, + "loss": 1.7298, + "step": 14179 + }, + { + "epoch": 4.352363413136894, + "grad_norm": 0.24839860200881958, + "learning_rate": 6.27800408535095e-05, + "loss": 1.7717, + "step": 14180 + }, + { + "epoch": 4.352670349907919, + "grad_norm": 0.2652966380119324, + "learning_rate": 6.277523534054284e-05, + "loss": 1.7759, + "step": 14181 + }, + { + "epoch": 4.352977286678944, + "grad_norm": 0.2787603735923767, + "learning_rate": 6.277042970132381e-05, + "loss": 1.8981, + "step": 14182 + }, + { + "epoch": 4.353284223449969, + "grad_norm": 0.2535475194454193, + "learning_rate": 6.276562393589991e-05, + "loss": 1.7538, + "step": 14183 + }, + { + "epoch": 4.3535911602209945, + "grad_norm": 0.3210967183113098, + "learning_rate": 6.276081804431863e-05, + "loss": 1.7087, + "step": 14184 + }, + { + "epoch": 4.35389809699202, + "grad_norm": 0.29936519265174866, + "learning_rate": 6.275601202662749e-05, + "loss": 1.7647, + "step": 14185 + }, + { + "epoch": 4.354205033763045, + "grad_norm": 0.21980762481689453, + "learning_rate": 6.275120588287394e-05, + "loss": 1.7759, + "step": 14186 + }, + { + "epoch": 4.35451197053407, + "grad_norm": 0.26833051443099976, + "learning_rate": 6.274639961310549e-05, + "loss": 1.7648, + "step": 14187 + }, + { + "epoch": 4.354818907305095, + "grad_norm": 0.27998095750808716, + "learning_rate": 6.274159321736966e-05, + "loss": 1.746, + "step": 14188 + }, + { + "epoch": 4.35512584407612, + "grad_norm": 0.21354494988918304, + "learning_rate": 6.273678669571395e-05, + "loss": 1.7417, + "step": 14189 + }, + { + "epoch": 4.355432780847146, + "grad_norm": 0.2295297235250473, + "learning_rate": 6.273198004818583e-05, + "loss": 1.7805, + 
"step": 14190 + }, + { + "epoch": 4.355739717618171, + "grad_norm": 0.2416422963142395, + "learning_rate": 6.272717327483283e-05, + "loss": 1.73, + "step": 14191 + }, + { + "epoch": 4.356046654389196, + "grad_norm": 0.2685304880142212, + "learning_rate": 6.272236637570244e-05, + "loss": 1.7936, + "step": 14192 + }, + { + "epoch": 4.356353591160221, + "grad_norm": 0.32481294870376587, + "learning_rate": 6.271755935084218e-05, + "loss": 1.7192, + "step": 14193 + }, + { + "epoch": 4.356660527931246, + "grad_norm": 0.2428581267595291, + "learning_rate": 6.271275220029954e-05, + "loss": 1.7428, + "step": 14194 + }, + { + "epoch": 4.356967464702271, + "grad_norm": 0.2266654521226883, + "learning_rate": 6.270794492412203e-05, + "loss": 1.7266, + "step": 14195 + }, + { + "epoch": 4.357274401473297, + "grad_norm": 0.25062093138694763, + "learning_rate": 6.270313752235716e-05, + "loss": 1.7476, + "step": 14196 + }, + { + "epoch": 4.357581338244322, + "grad_norm": 0.24085770547389984, + "learning_rate": 6.269832999505244e-05, + "loss": 1.7981, + "step": 14197 + }, + { + "epoch": 4.3578882750153465, + "grad_norm": 0.27035796642303467, + "learning_rate": 6.269352234225536e-05, + "loss": 1.8867, + "step": 14198 + }, + { + "epoch": 4.358195211786372, + "grad_norm": 0.22464458644390106, + "learning_rate": 6.268871456401348e-05, + "loss": 1.7514, + "step": 14199 + }, + { + "epoch": 4.358502148557397, + "grad_norm": 0.22485734522342682, + "learning_rate": 6.268390666037427e-05, + "loss": 1.7558, + "step": 14200 + }, + { + "epoch": 4.3588090853284225, + "grad_norm": 0.2052135169506073, + "learning_rate": 6.267909863138527e-05, + "loss": 1.7453, + "step": 14201 + }, + { + "epoch": 4.359116022099448, + "grad_norm": 0.2130763679742813, + "learning_rate": 6.267429047709397e-05, + "loss": 1.7712, + "step": 14202 + }, + { + "epoch": 4.359422958870473, + "grad_norm": 0.23146997392177582, + "learning_rate": 6.266948219754793e-05, + "loss": 1.6978, + "step": 14203 + }, + { + "epoch": 
4.359729895641498, + "grad_norm": 0.21657225489616394, + "learning_rate": 6.266467379279463e-05, + "loss": 1.7641, + "step": 14204 + }, + { + "epoch": 4.360036832412523, + "grad_norm": 0.2598700523376465, + "learning_rate": 6.265986526288158e-05, + "loss": 1.7956, + "step": 14205 + }, + { + "epoch": 4.360343769183548, + "grad_norm": 0.23497453331947327, + "learning_rate": 6.265505660785633e-05, + "loss": 1.7835, + "step": 14206 + }, + { + "epoch": 4.360650705954574, + "grad_norm": 0.2491760104894638, + "learning_rate": 6.265024782776641e-05, + "loss": 1.8454, + "step": 14207 + }, + { + "epoch": 4.360957642725599, + "grad_norm": 0.224884033203125, + "learning_rate": 6.264543892265932e-05, + "loss": 1.8383, + "step": 14208 + }, + { + "epoch": 4.361264579496623, + "grad_norm": 0.24057646095752716, + "learning_rate": 6.264062989258259e-05, + "loss": 1.7437, + "step": 14209 + }, + { + "epoch": 4.361571516267649, + "grad_norm": 0.24661841988563538, + "learning_rate": 6.263582073758374e-05, + "loss": 1.8151, + "step": 14210 + }, + { + "epoch": 4.361878453038674, + "grad_norm": 0.24618980288505554, + "learning_rate": 6.263101145771031e-05, + "loss": 1.7955, + "step": 14211 + }, + { + "epoch": 4.362185389809699, + "grad_norm": 0.2615448236465454, + "learning_rate": 6.262620205300981e-05, + "loss": 1.7819, + "step": 14212 + }, + { + "epoch": 4.362492326580725, + "grad_norm": 0.3528309464454651, + "learning_rate": 6.26213925235298e-05, + "loss": 1.7723, + "step": 14213 + }, + { + "epoch": 4.362799263351749, + "grad_norm": 0.3099561035633087, + "learning_rate": 6.261658286931779e-05, + "loss": 1.7361, + "step": 14214 + }, + { + "epoch": 4.3631062001227745, + "grad_norm": 0.23693235218524933, + "learning_rate": 6.26117730904213e-05, + "loss": 1.8117, + "step": 14215 + }, + { + "epoch": 4.3634131368938, + "grad_norm": 0.4164150655269623, + "learning_rate": 6.260696318688786e-05, + "loss": 1.7908, + "step": 14216 + }, + { + "epoch": 4.363720073664825, + "grad_norm": 
0.39376336336135864, + "learning_rate": 6.260215315876506e-05, + "loss": 1.7832, + "step": 14217 + }, + { + "epoch": 4.3640270104358505, + "grad_norm": 0.24071799218654633, + "learning_rate": 6.259734300610037e-05, + "loss": 1.7569, + "step": 14218 + }, + { + "epoch": 4.364333947206875, + "grad_norm": 0.4305122494697571, + "learning_rate": 6.259253272894136e-05, + "loss": 1.7974, + "step": 14219 + }, + { + "epoch": 4.3646408839779, + "grad_norm": 0.3023197054862976, + "learning_rate": 6.258772232733556e-05, + "loss": 1.7589, + "step": 14220 + }, + { + "epoch": 4.364947820748926, + "grad_norm": 0.23253366351127625, + "learning_rate": 6.258291180133052e-05, + "loss": 1.7138, + "step": 14221 + }, + { + "epoch": 4.365254757519951, + "grad_norm": 0.41141277551651, + "learning_rate": 6.257810115097376e-05, + "loss": 1.7608, + "step": 14222 + }, + { + "epoch": 4.365561694290976, + "grad_norm": 0.3308235704898834, + "learning_rate": 6.257329037631284e-05, + "loss": 1.8006, + "step": 14223 + }, + { + "epoch": 4.365868631062002, + "grad_norm": 0.2635105848312378, + "learning_rate": 6.256847947739528e-05, + "loss": 1.7275, + "step": 14224 + }, + { + "epoch": 4.366175567833026, + "grad_norm": 0.45886602997779846, + "learning_rate": 6.256366845426864e-05, + "loss": 1.7701, + "step": 14225 + }, + { + "epoch": 4.366482504604051, + "grad_norm": 0.48503565788269043, + "learning_rate": 6.255885730698049e-05, + "loss": 1.7409, + "step": 14226 + }, + { + "epoch": 4.366789441375077, + "grad_norm": 0.26727184653282166, + "learning_rate": 6.255404603557833e-05, + "loss": 1.7288, + "step": 14227 + }, + { + "epoch": 4.367096378146102, + "grad_norm": 0.3343912363052368, + "learning_rate": 6.254923464010974e-05, + "loss": 1.764, + "step": 14228 + }, + { + "epoch": 4.367403314917127, + "grad_norm": 0.40050622820854187, + "learning_rate": 6.254442312062224e-05, + "loss": 1.7653, + "step": 14229 + }, + { + "epoch": 4.367710251688152, + "grad_norm": 0.23941144347190857, + "learning_rate": 
6.253961147716341e-05, + "loss": 1.6886, + "step": 14230 + }, + { + "epoch": 4.368017188459177, + "grad_norm": 0.25737255811691284, + "learning_rate": 6.253479970978079e-05, + "loss": 1.8047, + "step": 14231 + }, + { + "epoch": 4.3683241252302025, + "grad_norm": 0.28780993819236755, + "learning_rate": 6.252998781852192e-05, + "loss": 1.7453, + "step": 14232 + }, + { + "epoch": 4.368631062001228, + "grad_norm": 0.2362327128648758, + "learning_rate": 6.252517580343438e-05, + "loss": 1.7963, + "step": 14233 + }, + { + "epoch": 4.368937998772253, + "grad_norm": 0.263013631105423, + "learning_rate": 6.252036366456571e-05, + "loss": 1.7837, + "step": 14234 + }, + { + "epoch": 4.3692449355432785, + "grad_norm": 0.27674412727355957, + "learning_rate": 6.251555140196347e-05, + "loss": 1.767, + "step": 14235 + }, + { + "epoch": 4.369551872314303, + "grad_norm": 0.2360621690750122, + "learning_rate": 6.251073901567522e-05, + "loss": 1.7806, + "step": 14236 + }, + { + "epoch": 4.369858809085328, + "grad_norm": 0.2568018138408661, + "learning_rate": 6.25059265057485e-05, + "loss": 1.7672, + "step": 14237 + }, + { + "epoch": 4.370165745856354, + "grad_norm": 0.2512381374835968, + "learning_rate": 6.25011138722309e-05, + "loss": 1.7506, + "step": 14238 + }, + { + "epoch": 4.370472682627379, + "grad_norm": 0.21587291359901428, + "learning_rate": 6.249630111516994e-05, + "loss": 1.7336, + "step": 14239 + }, + { + "epoch": 4.370779619398404, + "grad_norm": 0.21791933476924896, + "learning_rate": 6.249148823461323e-05, + "loss": 1.7588, + "step": 14240 + }, + { + "epoch": 4.371086556169429, + "grad_norm": 0.23061512410640717, + "learning_rate": 6.248667523060831e-05, + "loss": 1.742, + "step": 14241 + }, + { + "epoch": 4.371393492940454, + "grad_norm": 0.2007007598876953, + "learning_rate": 6.248186210320274e-05, + "loss": 1.7227, + "step": 14242 + }, + { + "epoch": 4.371700429711479, + "grad_norm": 0.2564350366592407, + "learning_rate": 6.247704885244411e-05, + "loss": 1.7529, + 
"step": 14243 + }, + { + "epoch": 4.372007366482505, + "grad_norm": 0.21880537271499634, + "learning_rate": 6.247223547837995e-05, + "loss": 1.7828, + "step": 14244 + }, + { + "epoch": 4.37231430325353, + "grad_norm": 0.26154282689094543, + "learning_rate": 6.246742198105785e-05, + "loss": 1.7895, + "step": 14245 + }, + { + "epoch": 4.3726212400245545, + "grad_norm": 0.2652645707130432, + "learning_rate": 6.24626083605254e-05, + "loss": 1.8038, + "step": 14246 + }, + { + "epoch": 4.37292817679558, + "grad_norm": 0.21463751792907715, + "learning_rate": 6.245779461683013e-05, + "loss": 1.7139, + "step": 14247 + }, + { + "epoch": 4.373235113566605, + "grad_norm": 0.21285851299762726, + "learning_rate": 6.245298075001961e-05, + "loss": 1.7686, + "step": 14248 + }, + { + "epoch": 4.3735420503376305, + "grad_norm": 0.258602499961853, + "learning_rate": 6.244816676014149e-05, + "loss": 1.8518, + "step": 14249 + }, + { + "epoch": 4.373848987108656, + "grad_norm": 0.25747501850128174, + "learning_rate": 6.244335264724323e-05, + "loss": 1.8019, + "step": 14250 + }, + { + "epoch": 4.37415592387968, + "grad_norm": 0.24678784608840942, + "learning_rate": 6.243853841137251e-05, + "loss": 1.7846, + "step": 14251 + }, + { + "epoch": 4.374462860650706, + "grad_norm": 0.31382107734680176, + "learning_rate": 6.243372405257685e-05, + "loss": 1.8389, + "step": 14252 + }, + { + "epoch": 4.374769797421731, + "grad_norm": 0.30522868037223816, + "learning_rate": 6.242890957090383e-05, + "loss": 1.8057, + "step": 14253 + }, + { + "epoch": 4.375076734192756, + "grad_norm": 0.2449347972869873, + "learning_rate": 6.242409496640106e-05, + "loss": 1.7144, + "step": 14254 + }, + { + "epoch": 4.375383670963782, + "grad_norm": 0.3193594217300415, + "learning_rate": 6.241928023911609e-05, + "loss": 1.7404, + "step": 14255 + }, + { + "epoch": 4.375690607734807, + "grad_norm": 0.23948179185390472, + "learning_rate": 6.241446538909651e-05, + "loss": 1.7338, + "step": 14256 + }, + { + "epoch": 
4.3759975445058314, + "grad_norm": 0.35325706005096436, + "learning_rate": 6.240965041638991e-05, + "loss": 1.7673, + "step": 14257 + }, + { + "epoch": 4.376304481276857, + "grad_norm": 0.38753262162208557, + "learning_rate": 6.240483532104387e-05, + "loss": 1.769, + "step": 14258 + }, + { + "epoch": 4.376611418047882, + "grad_norm": 0.2749052941799164, + "learning_rate": 6.2400020103106e-05, + "loss": 1.8086, + "step": 14259 + }, + { + "epoch": 4.3769183548189075, + "grad_norm": 0.2553126811981201, + "learning_rate": 6.239520476262384e-05, + "loss": 1.7733, + "step": 14260 + }, + { + "epoch": 4.377225291589933, + "grad_norm": 0.2854517698287964, + "learning_rate": 6.2390389299645e-05, + "loss": 1.7926, + "step": 14261 + }, + { + "epoch": 4.377532228360957, + "grad_norm": 0.24617259204387665, + "learning_rate": 6.238557371421708e-05, + "loss": 1.7297, + "step": 14262 + }, + { + "epoch": 4.377839165131983, + "grad_norm": 0.2555331289768219, + "learning_rate": 6.238075800638765e-05, + "loss": 1.7566, + "step": 14263 + }, + { + "epoch": 4.378146101903008, + "grad_norm": 0.31666773557662964, + "learning_rate": 6.237594217620432e-05, + "loss": 1.8003, + "step": 14264 + }, + { + "epoch": 4.378453038674033, + "grad_norm": 0.24166476726531982, + "learning_rate": 6.237112622371468e-05, + "loss": 1.7425, + "step": 14265 + }, + { + "epoch": 4.378759975445059, + "grad_norm": 0.21237102150917053, + "learning_rate": 6.236631014896633e-05, + "loss": 1.73, + "step": 14266 + }, + { + "epoch": 4.379066912216084, + "grad_norm": 0.2739151120185852, + "learning_rate": 6.236149395200683e-05, + "loss": 1.7113, + "step": 14267 + }, + { + "epoch": 4.379373848987108, + "grad_norm": 0.23700746893882751, + "learning_rate": 6.23566776328838e-05, + "loss": 1.7256, + "step": 14268 + }, + { + "epoch": 4.379680785758134, + "grad_norm": 0.22366748750209808, + "learning_rate": 6.235186119164485e-05, + "loss": 1.7981, + "step": 14269 + }, + { + "epoch": 4.379987722529159, + "grad_norm": 
0.28440114855766296, + "learning_rate": 6.234704462833758e-05, + "loss": 1.8087, + "step": 14270 + }, + { + "epoch": 4.380294659300184, + "grad_norm": 0.2706616520881653, + "learning_rate": 6.234222794300957e-05, + "loss": 1.7502, + "step": 14271 + }, + { + "epoch": 4.38060159607121, + "grad_norm": 0.21666266024112701, + "learning_rate": 6.233741113570843e-05, + "loss": 1.7639, + "step": 14272 + }, + { + "epoch": 4.380908532842234, + "grad_norm": 0.26790255308151245, + "learning_rate": 6.233259420648175e-05, + "loss": 1.796, + "step": 14273 + }, + { + "epoch": 4.3812154696132595, + "grad_norm": 0.22233673930168152, + "learning_rate": 6.232777715537715e-05, + "loss": 1.7661, + "step": 14274 + }, + { + "epoch": 4.381522406384285, + "grad_norm": 0.3277546763420105, + "learning_rate": 6.232295998244223e-05, + "loss": 1.7932, + "step": 14275 + }, + { + "epoch": 4.38182934315531, + "grad_norm": 0.2907596826553345, + "learning_rate": 6.231814268772463e-05, + "loss": 1.7103, + "step": 14276 + }, + { + "epoch": 4.3821362799263355, + "grad_norm": 0.2318384349346161, + "learning_rate": 6.231332527127188e-05, + "loss": 1.7351, + "step": 14277 + }, + { + "epoch": 4.382443216697361, + "grad_norm": 0.32904061675071716, + "learning_rate": 6.230850773313163e-05, + "loss": 1.7967, + "step": 14278 + }, + { + "epoch": 4.382750153468385, + "grad_norm": 0.2455490082502365, + "learning_rate": 6.230369007335153e-05, + "loss": 1.7474, + "step": 14279 + }, + { + "epoch": 4.383057090239411, + "grad_norm": 0.23648180067539215, + "learning_rate": 6.229887229197913e-05, + "loss": 1.7106, + "step": 14280 + }, + { + "epoch": 4.383364027010436, + "grad_norm": 0.29552599787712097, + "learning_rate": 6.229405438906207e-05, + "loss": 1.7765, + "step": 14281 + }, + { + "epoch": 4.383670963781461, + "grad_norm": 0.2094641923904419, + "learning_rate": 6.228923636464796e-05, + "loss": 1.7105, + "step": 14282 + }, + { + "epoch": 4.383977900552487, + "grad_norm": 0.24632154405117035, + "learning_rate": 
6.228441821878441e-05, + "loss": 1.7913, + "step": 14283 + }, + { + "epoch": 4.384284837323511, + "grad_norm": 0.28114691376686096, + "learning_rate": 6.227959995151904e-05, + "loss": 1.7456, + "step": 14284 + }, + { + "epoch": 4.384591774094536, + "grad_norm": 0.24226875603199005, + "learning_rate": 6.227478156289946e-05, + "loss": 1.797, + "step": 14285 + }, + { + "epoch": 4.384898710865562, + "grad_norm": 0.2526854872703552, + "learning_rate": 6.22699630529733e-05, + "loss": 1.7155, + "step": 14286 + }, + { + "epoch": 4.385205647636587, + "grad_norm": 0.312916100025177, + "learning_rate": 6.226514442178818e-05, + "loss": 1.7808, + "step": 14287 + }, + { + "epoch": 4.385512584407612, + "grad_norm": 0.23087100684642792, + "learning_rate": 6.22603256693917e-05, + "loss": 1.7543, + "step": 14288 + }, + { + "epoch": 4.385819521178637, + "grad_norm": 0.3042476177215576, + "learning_rate": 6.22555067958315e-05, + "loss": 1.747, + "step": 14289 + }, + { + "epoch": 4.386126457949662, + "grad_norm": 0.2604007422924042, + "learning_rate": 6.225068780115522e-05, + "loss": 1.7262, + "step": 14290 + }, + { + "epoch": 4.3864333947206875, + "grad_norm": 0.2200118750333786, + "learning_rate": 6.224586868541044e-05, + "loss": 1.75, + "step": 14291 + }, + { + "epoch": 4.386740331491713, + "grad_norm": 0.3452017307281494, + "learning_rate": 6.224104944864481e-05, + "loss": 1.7598, + "step": 14292 + }, + { + "epoch": 4.387047268262738, + "grad_norm": 0.3169453740119934, + "learning_rate": 6.223623009090597e-05, + "loss": 1.7939, + "step": 14293 + }, + { + "epoch": 4.387354205033763, + "grad_norm": 0.23640502989292145, + "learning_rate": 6.223141061224151e-05, + "loss": 1.8005, + "step": 14294 + }, + { + "epoch": 4.387661141804788, + "grad_norm": 0.26212456822395325, + "learning_rate": 6.22265910126991e-05, + "loss": 1.7951, + "step": 14295 + }, + { + "epoch": 4.387968078575813, + "grad_norm": 0.2687644362449646, + "learning_rate": 6.222177129232634e-05, + "loss": 1.7674, + "step": 
14296 + }, + { + "epoch": 4.388275015346839, + "grad_norm": 0.2553202211856842, + "learning_rate": 6.221695145117086e-05, + "loss": 1.8142, + "step": 14297 + }, + { + "epoch": 4.388581952117864, + "grad_norm": 0.3317619264125824, + "learning_rate": 6.221213148928034e-05, + "loss": 1.7884, + "step": 14298 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.3059331476688385, + "learning_rate": 6.220731140670235e-05, + "loss": 1.7377, + "step": 14299 + }, + { + "epoch": 4.389195825659914, + "grad_norm": 0.21544015407562256, + "learning_rate": 6.220249120348457e-05, + "loss": 1.6818, + "step": 14300 + }, + { + "epoch": 4.389502762430939, + "grad_norm": 0.3112640380859375, + "learning_rate": 6.219767087967461e-05, + "loss": 1.72, + "step": 14301 + }, + { + "epoch": 4.389809699201964, + "grad_norm": 0.2572654187679291, + "learning_rate": 6.219285043532011e-05, + "loss": 1.793, + "step": 14302 + }, + { + "epoch": 4.39011663597299, + "grad_norm": 0.2621476948261261, + "learning_rate": 6.218802987046874e-05, + "loss": 1.8301, + "step": 14303 + }, + { + "epoch": 4.390423572744015, + "grad_norm": 0.2592658996582031, + "learning_rate": 6.218320918516809e-05, + "loss": 1.7219, + "step": 14304 + }, + { + "epoch": 4.3907305095150395, + "grad_norm": 0.25503265857696533, + "learning_rate": 6.217838837946584e-05, + "loss": 1.8149, + "step": 14305 + }, + { + "epoch": 4.391037446286065, + "grad_norm": 0.21944166719913483, + "learning_rate": 6.217356745340962e-05, + "loss": 1.7174, + "step": 14306 + }, + { + "epoch": 4.39134438305709, + "grad_norm": 0.2937396466732025, + "learning_rate": 6.216874640704707e-05, + "loss": 1.8562, + "step": 14307 + }, + { + "epoch": 4.3916513198281155, + "grad_norm": 0.22520211338996887, + "learning_rate": 6.216392524042581e-05, + "loss": 1.7701, + "step": 14308 + }, + { + "epoch": 4.391958256599141, + "grad_norm": 0.24397830665111542, + "learning_rate": 6.215910395359355e-05, + "loss": 1.7794, + "step": 14309 + }, + { + "epoch": 4.392265193370166, + 
"grad_norm": 0.2867623567581177, + "learning_rate": 6.215428254659788e-05, + "loss": 1.7275, + "step": 14310 + }, + { + "epoch": 4.392572130141191, + "grad_norm": 0.2632426917552948, + "learning_rate": 6.214946101948648e-05, + "loss": 1.7919, + "step": 14311 + }, + { + "epoch": 4.392879066912216, + "grad_norm": 0.23146092891693115, + "learning_rate": 6.214463937230696e-05, + "loss": 1.744, + "step": 14312 + }, + { + "epoch": 4.393186003683241, + "grad_norm": 0.21877676248550415, + "learning_rate": 6.213981760510701e-05, + "loss": 1.7577, + "step": 14313 + }, + { + "epoch": 4.393492940454267, + "grad_norm": 0.2320399284362793, + "learning_rate": 6.213499571793426e-05, + "loss": 1.7864, + "step": 14314 + }, + { + "epoch": 4.393799877225292, + "grad_norm": 0.2951548993587494, + "learning_rate": 6.213017371083638e-05, + "loss": 1.8257, + "step": 14315 + }, + { + "epoch": 4.394106813996316, + "grad_norm": 0.26062941551208496, + "learning_rate": 6.212535158386102e-05, + "loss": 1.7448, + "step": 14316 + }, + { + "epoch": 4.394413750767342, + "grad_norm": 0.24760986864566803, + "learning_rate": 6.21205293370558e-05, + "loss": 1.7902, + "step": 14317 + }, + { + "epoch": 4.394720687538367, + "grad_norm": 0.2686399221420288, + "learning_rate": 6.211570697046844e-05, + "loss": 1.8209, + "step": 14318 + }, + { + "epoch": 4.395027624309392, + "grad_norm": 0.2599134147167206, + "learning_rate": 6.211088448414653e-05, + "loss": 1.8231, + "step": 14319 + }, + { + "epoch": 4.395334561080418, + "grad_norm": 0.254044771194458, + "learning_rate": 6.210606187813778e-05, + "loss": 1.806, + "step": 14320 + }, + { + "epoch": 4.395641497851442, + "grad_norm": 0.262229323387146, + "learning_rate": 6.210123915248982e-05, + "loss": 1.7857, + "step": 14321 + }, + { + "epoch": 4.3959484346224675, + "grad_norm": 0.2849259078502655, + "learning_rate": 6.209641630725033e-05, + "loss": 1.8005, + "step": 14322 + }, + { + "epoch": 4.396255371393493, + "grad_norm": 0.35480254888534546, + 
"learning_rate": 6.209159334246697e-05, + "loss": 1.8189, + "step": 14323 + }, + { + "epoch": 4.396562308164518, + "grad_norm": 0.2599184215068817, + "learning_rate": 6.20867702581874e-05, + "loss": 1.7384, + "step": 14324 + }, + { + "epoch": 4.3968692449355435, + "grad_norm": 0.23994222283363342, + "learning_rate": 6.208194705445926e-05, + "loss": 1.7566, + "step": 14325 + }, + { + "epoch": 4.397176181706568, + "grad_norm": 0.24361753463745117, + "learning_rate": 6.207712373133024e-05, + "loss": 1.6965, + "step": 14326 + }, + { + "epoch": 4.397483118477593, + "grad_norm": 0.23925161361694336, + "learning_rate": 6.207230028884803e-05, + "loss": 1.7596, + "step": 14327 + }, + { + "epoch": 4.397790055248619, + "grad_norm": 0.24365897476673126, + "learning_rate": 6.206747672706025e-05, + "loss": 1.7951, + "step": 14328 + }, + { + "epoch": 4.398096992019644, + "grad_norm": 0.25245413184165955, + "learning_rate": 6.206265304601461e-05, + "loss": 1.8086, + "step": 14329 + }, + { + "epoch": 4.398403928790669, + "grad_norm": 0.24272513389587402, + "learning_rate": 6.205782924575874e-05, + "loss": 1.8148, + "step": 14330 + }, + { + "epoch": 4.398710865561695, + "grad_norm": 0.21299590170383453, + "learning_rate": 6.205300532634036e-05, + "loss": 1.7666, + "step": 14331 + }, + { + "epoch": 4.399017802332719, + "grad_norm": 0.23543189465999603, + "learning_rate": 6.20481812878071e-05, + "loss": 1.7629, + "step": 14332 + }, + { + "epoch": 4.399324739103744, + "grad_norm": 0.2284495085477829, + "learning_rate": 6.204335713020665e-05, + "loss": 1.768, + "step": 14333 + }, + { + "epoch": 4.39963167587477, + "grad_norm": 0.23158542811870575, + "learning_rate": 6.20385328535867e-05, + "loss": 1.7761, + "step": 14334 + }, + { + "epoch": 4.399938612645795, + "grad_norm": 0.2378150224685669, + "learning_rate": 6.20337084579949e-05, + "loss": 1.8483, + "step": 14335 + }, + { + "epoch": 4.4002455494168204, + "grad_norm": 0.2407436966896057, + "learning_rate": 6.202888394347892e-05, + 
"loss": 1.7364, + "step": 14336 + }, + { + "epoch": 4.400552486187845, + "grad_norm": 0.256259560585022, + "learning_rate": 6.202405931008649e-05, + "loss": 1.7376, + "step": 14337 + }, + { + "epoch": 4.40085942295887, + "grad_norm": 0.29293057322502136, + "learning_rate": 6.201923455786524e-05, + "loss": 1.7493, + "step": 14338 + }, + { + "epoch": 4.401166359729896, + "grad_norm": 0.24025334417819977, + "learning_rate": 6.201440968686288e-05, + "loss": 1.7522, + "step": 14339 + }, + { + "epoch": 4.401473296500921, + "grad_norm": 0.3215656280517578, + "learning_rate": 6.200958469712708e-05, + "loss": 1.7748, + "step": 14340 + }, + { + "epoch": 4.401780233271946, + "grad_norm": 0.43553170561790466, + "learning_rate": 6.200475958870553e-05, + "loss": 1.771, + "step": 14341 + }, + { + "epoch": 4.402087170042972, + "grad_norm": 0.3112131953239441, + "learning_rate": 6.19999343616459e-05, + "loss": 1.7655, + "step": 14342 + }, + { + "epoch": 4.402394106813996, + "grad_norm": 0.25197842717170715, + "learning_rate": 6.199510901599589e-05, + "loss": 1.7214, + "step": 14343 + }, + { + "epoch": 4.402701043585021, + "grad_norm": 0.33227142691612244, + "learning_rate": 6.19902835518032e-05, + "loss": 1.7332, + "step": 14344 + }, + { + "epoch": 4.403007980356047, + "grad_norm": 0.27962982654571533, + "learning_rate": 6.198545796911548e-05, + "loss": 1.6943, + "step": 14345 + }, + { + "epoch": 4.403314917127072, + "grad_norm": 0.24374182522296906, + "learning_rate": 6.198063226798044e-05, + "loss": 1.7222, + "step": 14346 + }, + { + "epoch": 4.403621853898097, + "grad_norm": 0.3101944625377655, + "learning_rate": 6.197580644844576e-05, + "loss": 1.7113, + "step": 14347 + }, + { + "epoch": 4.403928790669122, + "grad_norm": 0.25919321179389954, + "learning_rate": 6.197098051055916e-05, + "loss": 1.71, + "step": 14348 + }, + { + "epoch": 4.404235727440147, + "grad_norm": 0.23140330612659454, + "learning_rate": 6.19661544543683e-05, + "loss": 1.7472, + "step": 14349 + }, + { + 
"epoch": 4.4045426642111725, + "grad_norm": 0.3274286687374115, + "learning_rate": 6.19613282799209e-05, + "loss": 1.7093, + "step": 14350 + }, + { + "epoch": 4.404849600982198, + "grad_norm": 0.3187442123889923, + "learning_rate": 6.195650198726464e-05, + "loss": 1.7488, + "step": 14351 + }, + { + "epoch": 4.405156537753223, + "grad_norm": 0.20547433197498322, + "learning_rate": 6.195167557644722e-05, + "loss": 1.7295, + "step": 14352 + }, + { + "epoch": 4.4054634745242485, + "grad_norm": 0.2623414993286133, + "learning_rate": 6.194684904751633e-05, + "loss": 1.8258, + "step": 14353 + }, + { + "epoch": 4.405770411295273, + "grad_norm": 0.2468457818031311, + "learning_rate": 6.194202240051967e-05, + "loss": 1.6957, + "step": 14354 + }, + { + "epoch": 4.406077348066298, + "grad_norm": 0.2082364559173584, + "learning_rate": 6.193719563550496e-05, + "loss": 1.7596, + "step": 14355 + }, + { + "epoch": 4.406384284837324, + "grad_norm": 0.27072983980178833, + "learning_rate": 6.193236875251988e-05, + "loss": 1.7341, + "step": 14356 + }, + { + "epoch": 4.406691221608349, + "grad_norm": 0.2630362808704376, + "learning_rate": 6.192754175161215e-05, + "loss": 1.7664, + "step": 14357 + }, + { + "epoch": 4.406998158379374, + "grad_norm": 0.25400006771087646, + "learning_rate": 6.192271463282944e-05, + "loss": 1.7582, + "step": 14358 + }, + { + "epoch": 4.407305095150399, + "grad_norm": 0.22256311774253845, + "learning_rate": 6.191788739621949e-05, + "loss": 1.7389, + "step": 14359 + }, + { + "epoch": 4.407612031921424, + "grad_norm": 0.2160387486219406, + "learning_rate": 6.191306004182999e-05, + "loss": 1.7051, + "step": 14360 + }, + { + "epoch": 4.407918968692449, + "grad_norm": 0.20665684342384338, + "learning_rate": 6.190823256970865e-05, + "loss": 1.7606, + "step": 14361 + }, + { + "epoch": 4.408225905463475, + "grad_norm": 0.2173188328742981, + "learning_rate": 6.190340497990318e-05, + "loss": 1.7944, + "step": 14362 + }, + { + "epoch": 4.4085328422345, + "grad_norm": 
0.189287930727005, + "learning_rate": 6.189857727246127e-05, + "loss": 1.7283, + "step": 14363 + }, + { + "epoch": 4.4088397790055245, + "grad_norm": 0.2531645596027374, + "learning_rate": 6.189374944743065e-05, + "loss": 1.7554, + "step": 14364 + }, + { + "epoch": 4.40914671577655, + "grad_norm": 0.25439125299453735, + "learning_rate": 6.188892150485903e-05, + "loss": 1.8032, + "step": 14365 + }, + { + "epoch": 4.409453652547575, + "grad_norm": 0.20938685536384583, + "learning_rate": 6.188409344479412e-05, + "loss": 1.7385, + "step": 14366 + }, + { + "epoch": 4.4097605893186005, + "grad_norm": 0.20471477508544922, + "learning_rate": 6.187926526728364e-05, + "loss": 1.7487, + "step": 14367 + }, + { + "epoch": 4.410067526089626, + "grad_norm": 0.2381851226091385, + "learning_rate": 6.187443697237529e-05, + "loss": 1.7443, + "step": 14368 + }, + { + "epoch": 4.41037446286065, + "grad_norm": 0.21584098041057587, + "learning_rate": 6.18696085601168e-05, + "loss": 1.7818, + "step": 14369 + }, + { + "epoch": 4.410681399631676, + "grad_norm": 0.2575368583202362, + "learning_rate": 6.186478003055587e-05, + "loss": 1.8204, + "step": 14370 + }, + { + "epoch": 4.410988336402701, + "grad_norm": 0.21133238077163696, + "learning_rate": 6.185995138374024e-05, + "loss": 1.7274, + "step": 14371 + }, + { + "epoch": 4.411295273173726, + "grad_norm": 0.24918322265148163, + "learning_rate": 6.18551226197176e-05, + "loss": 1.8021, + "step": 14372 + }, + { + "epoch": 4.411602209944752, + "grad_norm": 0.2253655642271042, + "learning_rate": 6.185029373853572e-05, + "loss": 1.7308, + "step": 14373 + }, + { + "epoch": 4.411909146715777, + "grad_norm": 0.20098713040351868, + "learning_rate": 6.184546474024226e-05, + "loss": 1.7549, + "step": 14374 + }, + { + "epoch": 4.412216083486801, + "grad_norm": 0.25612789392471313, + "learning_rate": 6.1840635624885e-05, + "loss": 1.8305, + "step": 14375 + }, + { + "epoch": 4.412523020257827, + "grad_norm": 0.24287539720535278, + "learning_rate": 
6.183580639251164e-05, + "loss": 1.7339, + "step": 14376 + }, + { + "epoch": 4.412829957028852, + "grad_norm": 0.2304944545030594, + "learning_rate": 6.183097704316988e-05, + "loss": 1.7023, + "step": 14377 + }, + { + "epoch": 4.413136893799877, + "grad_norm": 0.21911773085594177, + "learning_rate": 6.18261475769075e-05, + "loss": 1.7305, + "step": 14378 + }, + { + "epoch": 4.413443830570903, + "grad_norm": 0.24207864701747894, + "learning_rate": 6.182131799377217e-05, + "loss": 1.7318, + "step": 14379 + }, + { + "epoch": 4.413750767341927, + "grad_norm": 0.2551634609699249, + "learning_rate": 6.181648829381165e-05, + "loss": 1.8101, + "step": 14380 + }, + { + "epoch": 4.4140577041129525, + "grad_norm": 0.4114011526107788, + "learning_rate": 6.181165847707368e-05, + "loss": 1.772, + "step": 14381 + }, + { + "epoch": 4.414364640883978, + "grad_norm": 0.4592796862125397, + "learning_rate": 6.180682854360598e-05, + "loss": 1.7359, + "step": 14382 + }, + { + "epoch": 4.414671577655003, + "grad_norm": 0.2599259614944458, + "learning_rate": 6.180199849345627e-05, + "loss": 1.7028, + "step": 14383 + }, + { + "epoch": 4.4149785144260285, + "grad_norm": 0.3489506244659424, + "learning_rate": 6.17971683266723e-05, + "loss": 1.8252, + "step": 14384 + }, + { + "epoch": 4.415285451197054, + "grad_norm": 0.44563809037208557, + "learning_rate": 6.179233804330179e-05, + "loss": 1.6894, + "step": 14385 + }, + { + "epoch": 4.415592387968078, + "grad_norm": 0.2596888542175293, + "learning_rate": 6.17875076433925e-05, + "loss": 1.8141, + "step": 14386 + }, + { + "epoch": 4.415899324739104, + "grad_norm": 0.3560626804828644, + "learning_rate": 6.178267712699213e-05, + "loss": 1.7764, + "step": 14387 + }, + { + "epoch": 4.416206261510129, + "grad_norm": 0.3746717572212219, + "learning_rate": 6.177784649414843e-05, + "loss": 1.7528, + "step": 14388 + }, + { + "epoch": 4.416513198281154, + "grad_norm": 0.23248885571956635, + "learning_rate": 6.177301574490918e-05, + "loss": 1.7148, + 
"step": 14389 + }, + { + "epoch": 4.41682013505218, + "grad_norm": 0.26936978101730347, + "learning_rate": 6.176818487932208e-05, + "loss": 1.7199, + "step": 14390 + }, + { + "epoch": 4.417127071823204, + "grad_norm": 0.3102504014968872, + "learning_rate": 6.176335389743486e-05, + "loss": 1.6886, + "step": 14391 + }, + { + "epoch": 4.417434008594229, + "grad_norm": 0.24406832456588745, + "learning_rate": 6.175852279929531e-05, + "loss": 1.7766, + "step": 14392 + }, + { + "epoch": 4.417740945365255, + "grad_norm": 0.271158903837204, + "learning_rate": 6.175369158495112e-05, + "loss": 1.8099, + "step": 14393 + }, + { + "epoch": 4.41804788213628, + "grad_norm": 0.343667209148407, + "learning_rate": 6.174886025445008e-05, + "loss": 1.779, + "step": 14394 + }, + { + "epoch": 4.418354818907305, + "grad_norm": 0.37423139810562134, + "learning_rate": 6.17440288078399e-05, + "loss": 1.7796, + "step": 14395 + }, + { + "epoch": 4.41866175567833, + "grad_norm": 0.3152335286140442, + "learning_rate": 6.173919724516836e-05, + "loss": 1.7388, + "step": 14396 + }, + { + "epoch": 4.418968692449355, + "grad_norm": 0.21467824280261993, + "learning_rate": 6.173436556648319e-05, + "loss": 1.7689, + "step": 14397 + }, + { + "epoch": 4.4192756292203805, + "grad_norm": 0.2861369848251343, + "learning_rate": 6.172953377183213e-05, + "loss": 1.819, + "step": 14398 + }, + { + "epoch": 4.419582565991406, + "grad_norm": 0.34777504205703735, + "learning_rate": 6.172470186126295e-05, + "loss": 1.7444, + "step": 14399 + }, + { + "epoch": 4.419889502762431, + "grad_norm": 0.2728833854198456, + "learning_rate": 6.171986983482339e-05, + "loss": 1.7637, + "step": 14400 + }, + { + "epoch": 4.420196439533456, + "grad_norm": 0.2593914270401001, + "learning_rate": 6.17150376925612e-05, + "loss": 1.8196, + "step": 14401 + }, + { + "epoch": 4.420503376304481, + "grad_norm": 0.29425305128097534, + "learning_rate": 6.171020543452416e-05, + "loss": 1.7511, + "step": 14402 + }, + { + "epoch": 
4.420810313075506, + "grad_norm": 0.2587110102176666, + "learning_rate": 6.170537306076e-05, + "loss": 1.8085, + "step": 14403 + }, + { + "epoch": 4.421117249846532, + "grad_norm": 0.22442933917045593, + "learning_rate": 6.170054057131648e-05, + "loss": 1.8023, + "step": 14404 + }, + { + "epoch": 4.421424186617557, + "grad_norm": 0.23302629590034485, + "learning_rate": 6.169570796624136e-05, + "loss": 1.7995, + "step": 14405 + }, + { + "epoch": 4.421731123388582, + "grad_norm": 0.2295885682106018, + "learning_rate": 6.169087524558239e-05, + "loss": 1.7948, + "step": 14406 + }, + { + "epoch": 4.422038060159607, + "grad_norm": 0.2161262482404709, + "learning_rate": 6.168604240938735e-05, + "loss": 1.7159, + "step": 14407 + }, + { + "epoch": 4.422344996930632, + "grad_norm": 0.20746205747127533, + "learning_rate": 6.1681209457704e-05, + "loss": 1.7703, + "step": 14408 + }, + { + "epoch": 4.422651933701657, + "grad_norm": 0.25677376985549927, + "learning_rate": 6.167637639058006e-05, + "loss": 1.7819, + "step": 14409 + }, + { + "epoch": 4.422958870472683, + "grad_norm": 0.226568341255188, + "learning_rate": 6.167154320806336e-05, + "loss": 1.7661, + "step": 14410 + }, + { + "epoch": 4.423265807243708, + "grad_norm": 0.22997824847698212, + "learning_rate": 6.166670991020162e-05, + "loss": 1.7364, + "step": 14411 + }, + { + "epoch": 4.4235727440147325, + "grad_norm": 0.2528770864009857, + "learning_rate": 6.166187649704261e-05, + "loss": 1.8505, + "step": 14412 + }, + { + "epoch": 4.423879680785758, + "grad_norm": 0.27278614044189453, + "learning_rate": 6.165704296863409e-05, + "loss": 1.7855, + "step": 14413 + }, + { + "epoch": 4.424186617556783, + "grad_norm": 0.23086364567279816, + "learning_rate": 6.165220932502385e-05, + "loss": 1.7489, + "step": 14414 + }, + { + "epoch": 4.4244935543278086, + "grad_norm": 0.2570587396621704, + "learning_rate": 6.164737556625965e-05, + "loss": 1.8008, + "step": 14415 + }, + { + "epoch": 4.424800491098834, + "grad_norm": 
0.2637264132499695, + "learning_rate": 6.164254169238923e-05, + "loss": 1.7563, + "step": 14416 + }, + { + "epoch": 4.425107427869859, + "grad_norm": 0.23046623170375824, + "learning_rate": 6.163770770346043e-05, + "loss": 1.7433, + "step": 14417 + }, + { + "epoch": 4.425414364640884, + "grad_norm": 0.2531467080116272, + "learning_rate": 6.163287359952095e-05, + "loss": 1.8122, + "step": 14418 + }, + { + "epoch": 4.425721301411909, + "grad_norm": 0.26507216691970825, + "learning_rate": 6.162803938061861e-05, + "loss": 1.7019, + "step": 14419 + }, + { + "epoch": 4.426028238182934, + "grad_norm": 0.229641854763031, + "learning_rate": 6.162320504680117e-05, + "loss": 1.7518, + "step": 14420 + }, + { + "epoch": 4.42633517495396, + "grad_norm": 0.22777152061462402, + "learning_rate": 6.161837059811641e-05, + "loss": 1.8094, + "step": 14421 + }, + { + "epoch": 4.426642111724985, + "grad_norm": 0.22121338546276093, + "learning_rate": 6.161353603461209e-05, + "loss": 1.7204, + "step": 14422 + }, + { + "epoch": 4.4269490484960095, + "grad_norm": 0.21914128959178925, + "learning_rate": 6.1608701356336e-05, + "loss": 1.7554, + "step": 14423 + }, + { + "epoch": 4.427255985267035, + "grad_norm": 0.22649390995502472, + "learning_rate": 6.160386656333593e-05, + "loss": 1.8058, + "step": 14424 + }, + { + "epoch": 4.42756292203806, + "grad_norm": 0.24529023468494415, + "learning_rate": 6.159903165565964e-05, + "loss": 1.7302, + "step": 14425 + }, + { + "epoch": 4.4278698588090855, + "grad_norm": 0.2726481854915619, + "learning_rate": 6.159419663335492e-05, + "loss": 1.825, + "step": 14426 + }, + { + "epoch": 4.428176795580111, + "grad_norm": 0.2772440016269684, + "learning_rate": 6.158936149646957e-05, + "loss": 1.7322, + "step": 14427 + }, + { + "epoch": 4.428483732351136, + "grad_norm": 0.29778853058815, + "learning_rate": 6.158452624505135e-05, + "loss": 1.7421, + "step": 14428 + }, + { + "epoch": 4.428790669122161, + "grad_norm": 0.21327480673789978, + "learning_rate": 
6.157969087914804e-05, + "loss": 1.7269, + "step": 14429 + }, + { + "epoch": 4.429097605893186, + "grad_norm": 0.2718868851661682, + "learning_rate": 6.157485539880744e-05, + "loss": 1.7817, + "step": 14430 + }, + { + "epoch": 4.429404542664211, + "grad_norm": 0.32242509722709656, + "learning_rate": 6.157001980407735e-05, + "loss": 1.7115, + "step": 14431 + }, + { + "epoch": 4.429711479435237, + "grad_norm": 0.2931978106498718, + "learning_rate": 6.156518409500553e-05, + "loss": 1.7822, + "step": 14432 + }, + { + "epoch": 4.430018416206262, + "grad_norm": 0.229528546333313, + "learning_rate": 6.156034827163977e-05, + "loss": 1.7623, + "step": 14433 + }, + { + "epoch": 4.430325352977286, + "grad_norm": 0.28702354431152344, + "learning_rate": 6.15555123340279e-05, + "loss": 1.8101, + "step": 14434 + }, + { + "epoch": 4.430632289748312, + "grad_norm": 0.27162131667137146, + "learning_rate": 6.155067628221766e-05, + "loss": 1.7525, + "step": 14435 + }, + { + "epoch": 4.430939226519337, + "grad_norm": 0.24290388822555542, + "learning_rate": 6.154584011625688e-05, + "loss": 1.8701, + "step": 14436 + }, + { + "epoch": 4.431246163290362, + "grad_norm": 0.3055405020713806, + "learning_rate": 6.154100383619334e-05, + "loss": 1.8659, + "step": 14437 + }, + { + "epoch": 4.431553100061388, + "grad_norm": 0.24528950452804565, + "learning_rate": 6.153616744207483e-05, + "loss": 1.8493, + "step": 14438 + }, + { + "epoch": 4.431860036832412, + "grad_norm": 0.2611897587776184, + "learning_rate": 6.153133093394917e-05, + "loss": 1.7905, + "step": 14439 + }, + { + "epoch": 4.4321669736034375, + "grad_norm": 0.2172730267047882, + "learning_rate": 6.15264943118641e-05, + "loss": 1.7087, + "step": 14440 + }, + { + "epoch": 4.432473910374463, + "grad_norm": 0.2320949286222458, + "learning_rate": 6.152165757586749e-05, + "loss": 1.7473, + "step": 14441 + }, + { + "epoch": 4.432780847145488, + "grad_norm": 0.2602086365222931, + "learning_rate": 6.15168207260071e-05, + "loss": 1.7365, + 
"step": 14442 + }, + { + "epoch": 4.4330877839165135, + "grad_norm": 0.25193190574645996, + "learning_rate": 6.151198376233074e-05, + "loss": 1.8205, + "step": 14443 + }, + { + "epoch": 4.433394720687538, + "grad_norm": 0.2894204556941986, + "learning_rate": 6.150714668488621e-05, + "loss": 1.7759, + "step": 14444 + }, + { + "epoch": 4.433701657458563, + "grad_norm": 0.24150310456752777, + "learning_rate": 6.150230949372131e-05, + "loss": 1.8415, + "step": 14445 + }, + { + "epoch": 4.434008594229589, + "grad_norm": 0.23475918173789978, + "learning_rate": 6.149747218888384e-05, + "loss": 1.7487, + "step": 14446 + }, + { + "epoch": 4.434315531000614, + "grad_norm": 0.29425546526908875, + "learning_rate": 6.149263477042162e-05, + "loss": 1.7538, + "step": 14447 + }, + { + "epoch": 4.434622467771639, + "grad_norm": 0.26241615414619446, + "learning_rate": 6.148779723838244e-05, + "loss": 1.7564, + "step": 14448 + }, + { + "epoch": 4.434929404542665, + "grad_norm": 0.23195287585258484, + "learning_rate": 6.148295959281411e-05, + "loss": 1.837, + "step": 14449 + }, + { + "epoch": 4.435236341313689, + "grad_norm": 0.34972792863845825, + "learning_rate": 6.147812183376445e-05, + "loss": 1.7632, + "step": 14450 + }, + { + "epoch": 4.435543278084714, + "grad_norm": 0.3536125719547272, + "learning_rate": 6.147328396128126e-05, + "loss": 1.8372, + "step": 14451 + }, + { + "epoch": 4.43585021485574, + "grad_norm": 0.2086079865694046, + "learning_rate": 6.146844597541235e-05, + "loss": 1.7014, + "step": 14452 + }, + { + "epoch": 4.436157151626765, + "grad_norm": 0.25547802448272705, + "learning_rate": 6.146360787620554e-05, + "loss": 1.7544, + "step": 14453 + }, + { + "epoch": 4.43646408839779, + "grad_norm": 0.26176998019218445, + "learning_rate": 6.145876966370864e-05, + "loss": 1.7617, + "step": 14454 + }, + { + "epoch": 4.436771025168815, + "grad_norm": 0.2672959566116333, + "learning_rate": 6.145393133796946e-05, + "loss": 1.8178, + "step": 14455 + }, + { + "epoch": 
4.43707796193984, + "grad_norm": 0.23373909294605255, + "learning_rate": 6.144909289903582e-05, + "loss": 1.7295, + "step": 14456 + }, + { + "epoch": 4.4373848987108655, + "grad_norm": 0.2369835078716278, + "learning_rate": 6.144425434695551e-05, + "loss": 1.8097, + "step": 14457 + }, + { + "epoch": 4.437691835481891, + "grad_norm": 0.25528979301452637, + "learning_rate": 6.14394156817764e-05, + "loss": 1.7523, + "step": 14458 + }, + { + "epoch": 4.437998772252916, + "grad_norm": 0.2541787624359131, + "learning_rate": 6.143457690354626e-05, + "loss": 1.7606, + "step": 14459 + }, + { + "epoch": 4.4383057090239415, + "grad_norm": 0.2032637745141983, + "learning_rate": 6.142973801231295e-05, + "loss": 1.7967, + "step": 14460 + }, + { + "epoch": 4.438612645794966, + "grad_norm": 0.2413996160030365, + "learning_rate": 6.142489900812426e-05, + "loss": 1.7688, + "step": 14461 + }, + { + "epoch": 4.438919582565991, + "grad_norm": 0.43451038002967834, + "learning_rate": 6.142005989102803e-05, + "loss": 1.8269, + "step": 14462 + }, + { + "epoch": 4.439226519337017, + "grad_norm": 0.23981481790542603, + "learning_rate": 6.141522066107206e-05, + "loss": 1.7628, + "step": 14463 + }, + { + "epoch": 4.439533456108042, + "grad_norm": 0.25396493077278137, + "learning_rate": 6.14103813183042e-05, + "loss": 1.7913, + "step": 14464 + }, + { + "epoch": 4.439840392879067, + "grad_norm": 0.2567536532878876, + "learning_rate": 6.140554186277225e-05, + "loss": 1.7612, + "step": 14465 + }, + { + "epoch": 4.440147329650092, + "grad_norm": 0.2201337069272995, + "learning_rate": 6.140070229452406e-05, + "loss": 1.7541, + "step": 14466 + }, + { + "epoch": 4.440454266421117, + "grad_norm": 0.24202953279018402, + "learning_rate": 6.139586261360746e-05, + "loss": 1.777, + "step": 14467 + }, + { + "epoch": 4.440761203192142, + "grad_norm": 0.23891687393188477, + "learning_rate": 6.139102282007024e-05, + "loss": 1.7509, + "step": 14468 + }, + { + "epoch": 4.441068139963168, + "grad_norm": 
0.21132555603981018, + "learning_rate": 6.138618291396026e-05, + "loss": 1.7362, + "step": 14469 + }, + { + "epoch": 4.441375076734193, + "grad_norm": 0.2731861472129822, + "learning_rate": 6.138134289532536e-05, + "loss": 1.8063, + "step": 14470 + }, + { + "epoch": 4.4416820135052175, + "grad_norm": 0.29503315687179565, + "learning_rate": 6.137650276421336e-05, + "loss": 1.7193, + "step": 14471 + }, + { + "epoch": 4.441988950276243, + "grad_norm": 0.2778526544570923, + "learning_rate": 6.137166252067208e-05, + "loss": 1.7507, + "step": 14472 + }, + { + "epoch": 4.442295887047268, + "grad_norm": 0.2907710075378418, + "learning_rate": 6.136682216474938e-05, + "loss": 1.7939, + "step": 14473 + }, + { + "epoch": 4.4426028238182935, + "grad_norm": 0.4133768379688263, + "learning_rate": 6.136198169649306e-05, + "loss": 1.8012, + "step": 14474 + }, + { + "epoch": 4.442909760589319, + "grad_norm": 0.2505052983760834, + "learning_rate": 6.135714111595099e-05, + "loss": 1.8426, + "step": 14475 + }, + { + "epoch": 4.443216697360343, + "grad_norm": 0.3884379267692566, + "learning_rate": 6.135230042317099e-05, + "loss": 1.7383, + "step": 14476 + }, + { + "epoch": 4.443523634131369, + "grad_norm": 0.42902377247810364, + "learning_rate": 6.134745961820091e-05, + "loss": 1.732, + "step": 14477 + }, + { + "epoch": 4.443830570902394, + "grad_norm": 0.21782708168029785, + "learning_rate": 6.134261870108858e-05, + "loss": 1.7369, + "step": 14478 + }, + { + "epoch": 4.444137507673419, + "grad_norm": 0.4160648286342621, + "learning_rate": 6.133777767188186e-05, + "loss": 1.8083, + "step": 14479 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.5057216882705688, + "learning_rate": 6.133293653062856e-05, + "loss": 1.8971, + "step": 14480 + }, + { + "epoch": 4.44475138121547, + "grad_norm": 0.2189750075340271, + "learning_rate": 6.132809527737654e-05, + "loss": 1.7508, + "step": 14481 + }, + { + "epoch": 4.445058317986494, + "grad_norm": 0.4415782392024994, + "learning_rate": 
6.132325391217364e-05, + "loss": 1.8548, + "step": 14482 + }, + { + "epoch": 4.44536525475752, + "grad_norm": 0.3907296359539032, + "learning_rate": 6.13184124350677e-05, + "loss": 1.7879, + "step": 14483 + }, + { + "epoch": 4.445672191528545, + "grad_norm": 0.24117955565452576, + "learning_rate": 6.131357084610659e-05, + "loss": 1.7227, + "step": 14484 + }, + { + "epoch": 4.44597912829957, + "grad_norm": 0.3083679974079132, + "learning_rate": 6.130872914533815e-05, + "loss": 1.7505, + "step": 14485 + }, + { + "epoch": 4.446286065070596, + "grad_norm": 0.27730658650398254, + "learning_rate": 6.13038873328102e-05, + "loss": 1.7485, + "step": 14486 + }, + { + "epoch": 4.44659300184162, + "grad_norm": 0.28548410534858704, + "learning_rate": 6.12990454085706e-05, + "loss": 1.8145, + "step": 14487 + }, + { + "epoch": 4.4468999386126455, + "grad_norm": 0.24743106961250305, + "learning_rate": 6.129420337266724e-05, + "loss": 1.7131, + "step": 14488 + }, + { + "epoch": 4.447206875383671, + "grad_norm": 0.2899693250656128, + "learning_rate": 6.128936122514794e-05, + "loss": 1.8567, + "step": 14489 + }, + { + "epoch": 4.447513812154696, + "grad_norm": 0.259916752576828, + "learning_rate": 6.128451896606053e-05, + "loss": 1.7563, + "step": 14490 + }, + { + "epoch": 4.4478207489257215, + "grad_norm": 0.21112586557865143, + "learning_rate": 6.12796765954529e-05, + "loss": 1.6975, + "step": 14491 + }, + { + "epoch": 4.448127685696747, + "grad_norm": 0.2890239953994751, + "learning_rate": 6.12748341133729e-05, + "loss": 1.7904, + "step": 14492 + }, + { + "epoch": 4.448434622467771, + "grad_norm": 0.23394012451171875, + "learning_rate": 6.126999151986839e-05, + "loss": 1.7559, + "step": 14493 + }, + { + "epoch": 4.448741559238797, + "grad_norm": 0.3492949903011322, + "learning_rate": 6.12651488149872e-05, + "loss": 1.7734, + "step": 14494 + }, + { + "epoch": 4.449048496009822, + "grad_norm": 0.48309218883514404, + "learning_rate": 6.126030599877723e-05, + "loss": 1.7798, + "step": 
14495 + }, + { + "epoch": 4.449355432780847, + "grad_norm": 0.341146320104599, + "learning_rate": 6.12554630712863e-05, + "loss": 1.7921, + "step": 14496 + }, + { + "epoch": 4.449662369551873, + "grad_norm": 0.223160982131958, + "learning_rate": 6.125062003256229e-05, + "loss": 1.7784, + "step": 14497 + }, + { + "epoch": 4.449969306322897, + "grad_norm": 0.32664811611175537, + "learning_rate": 6.124577688265306e-05, + "loss": 1.7353, + "step": 14498 + }, + { + "epoch": 4.4502762430939224, + "grad_norm": 0.215936541557312, + "learning_rate": 6.124093362160646e-05, + "loss": 1.68, + "step": 14499 + }, + { + "epoch": 4.450583179864948, + "grad_norm": 0.26081225275993347, + "learning_rate": 6.123609024947038e-05, + "loss": 1.7107, + "step": 14500 + }, + { + "epoch": 4.450890116635973, + "grad_norm": 0.3124069571495056, + "learning_rate": 6.123124676629267e-05, + "loss": 1.7338, + "step": 14501 + }, + { + "epoch": 4.4511970534069984, + "grad_norm": 0.23125620186328888, + "learning_rate": 6.122640317212118e-05, + "loss": 1.7842, + "step": 14502 + }, + { + "epoch": 4.451503990178024, + "grad_norm": 0.27065595984458923, + "learning_rate": 6.122155946700381e-05, + "loss": 1.7284, + "step": 14503 + }, + { + "epoch": 4.451810926949048, + "grad_norm": 0.4677436053752899, + "learning_rate": 6.121671565098841e-05, + "loss": 1.8156, + "step": 14504 + }, + { + "epoch": 4.452117863720074, + "grad_norm": 0.36325082182884216, + "learning_rate": 6.121187172412285e-05, + "loss": 1.7875, + "step": 14505 + }, + { + "epoch": 4.452424800491099, + "grad_norm": 0.23409567773342133, + "learning_rate": 6.1207027686455e-05, + "loss": 1.7421, + "step": 14506 + }, + { + "epoch": 4.452731737262124, + "grad_norm": 0.36919257044792175, + "learning_rate": 6.120218353803273e-05, + "loss": 1.7545, + "step": 14507 + }, + { + "epoch": 4.45303867403315, + "grad_norm": 0.318452388048172, + "learning_rate": 6.119733927890393e-05, + "loss": 1.7179, + "step": 14508 + }, + { + "epoch": 4.453345610804174, + 
"grad_norm": 0.21279768645763397, + "learning_rate": 6.119249490911643e-05, + "loss": 1.7534, + "step": 14509 + }, + { + "epoch": 4.453652547575199, + "grad_norm": 0.30565473437309265, + "learning_rate": 6.118765042871816e-05, + "loss": 1.7962, + "step": 14510 + }, + { + "epoch": 4.453959484346225, + "grad_norm": 0.2608480453491211, + "learning_rate": 6.118280583775697e-05, + "loss": 1.7336, + "step": 14511 + }, + { + "epoch": 4.45426642111725, + "grad_norm": 0.22978845238685608, + "learning_rate": 6.117796113628075e-05, + "loss": 1.8244, + "step": 14512 + }, + { + "epoch": 4.454573357888275, + "grad_norm": 0.26357781887054443, + "learning_rate": 6.117311632433735e-05, + "loss": 1.7425, + "step": 14513 + }, + { + "epoch": 4.4548802946593, + "grad_norm": 0.22127102315425873, + "learning_rate": 6.116827140197467e-05, + "loss": 1.7679, + "step": 14514 + }, + { + "epoch": 4.455187231430325, + "grad_norm": 0.2876584231853485, + "learning_rate": 6.116342636924058e-05, + "loss": 1.8104, + "step": 14515 + }, + { + "epoch": 4.4554941682013505, + "grad_norm": 0.28290677070617676, + "learning_rate": 6.115858122618297e-05, + "loss": 1.7485, + "step": 14516 + }, + { + "epoch": 4.455801104972376, + "grad_norm": 0.21914640069007874, + "learning_rate": 6.115373597284974e-05, + "loss": 1.7736, + "step": 14517 + }, + { + "epoch": 4.456108041743401, + "grad_norm": 0.2603909969329834, + "learning_rate": 6.114889060928873e-05, + "loss": 1.7446, + "step": 14518 + }, + { + "epoch": 4.456414978514426, + "grad_norm": 0.2157236635684967, + "learning_rate": 6.114404513554784e-05, + "loss": 1.7594, + "step": 14519 + }, + { + "epoch": 4.456721915285451, + "grad_norm": 0.27622368931770325, + "learning_rate": 6.113919955167499e-05, + "loss": 1.8154, + "step": 14520 + }, + { + "epoch": 4.457028852056476, + "grad_norm": 0.27298516035079956, + "learning_rate": 6.113435385771803e-05, + "loss": 1.7458, + "step": 14521 + }, + { + "epoch": 4.457335788827502, + "grad_norm": 0.22220586240291595, + 
"learning_rate": 6.112950805372485e-05, + "loss": 1.7102, + "step": 14522 + }, + { + "epoch": 4.457642725598527, + "grad_norm": 0.19480876624584198, + "learning_rate": 6.112466213974336e-05, + "loss": 1.7696, + "step": 14523 + }, + { + "epoch": 4.457949662369552, + "grad_norm": 0.24261653423309326, + "learning_rate": 6.111981611582144e-05, + "loss": 1.8193, + "step": 14524 + }, + { + "epoch": 4.458256599140577, + "grad_norm": 0.2502967417240143, + "learning_rate": 6.111496998200697e-05, + "loss": 1.7701, + "step": 14525 + }, + { + "epoch": 4.458563535911602, + "grad_norm": 0.25764599442481995, + "learning_rate": 6.111012373834786e-05, + "loss": 1.8055, + "step": 14526 + }, + { + "epoch": 4.458870472682627, + "grad_norm": 0.24085427820682526, + "learning_rate": 6.110527738489198e-05, + "loss": 1.7592, + "step": 14527 + }, + { + "epoch": 4.459177409453653, + "grad_norm": 0.2469809502363205, + "learning_rate": 6.110043092168727e-05, + "loss": 1.6977, + "step": 14528 + }, + { + "epoch": 4.459484346224678, + "grad_norm": 0.21888838708400726, + "learning_rate": 6.109558434878159e-05, + "loss": 1.777, + "step": 14529 + }, + { + "epoch": 4.4597912829957025, + "grad_norm": 0.2094014585018158, + "learning_rate": 6.109073766622281e-05, + "loss": 1.7041, + "step": 14530 + }, + { + "epoch": 4.460098219766728, + "grad_norm": 0.23801055550575256, + "learning_rate": 6.108589087405888e-05, + "loss": 1.8392, + "step": 14531 + }, + { + "epoch": 4.460405156537753, + "grad_norm": 0.2164965718984604, + "learning_rate": 6.108104397233769e-05, + "loss": 1.7643, + "step": 14532 + }, + { + "epoch": 4.4607120933087785, + "grad_norm": 0.21322336792945862, + "learning_rate": 6.107619696110712e-05, + "loss": 1.7063, + "step": 14533 + }, + { + "epoch": 4.461019030079804, + "grad_norm": 0.29019200801849365, + "learning_rate": 6.107134984041507e-05, + "loss": 1.8254, + "step": 14534 + }, + { + "epoch": 4.461325966850829, + "grad_norm": 0.2765025496482849, + "learning_rate": 6.106650261030947e-05, 
+ "loss": 1.7609, + "step": 14535 + }, + { + "epoch": 4.461632903621854, + "grad_norm": 0.20879749953746796, + "learning_rate": 6.106165527083818e-05, + "loss": 1.7387, + "step": 14536 + }, + { + "epoch": 4.461939840392879, + "grad_norm": 0.22295843064785004, + "learning_rate": 6.105680782204913e-05, + "loss": 1.7691, + "step": 14537 + }, + { + "epoch": 4.462246777163904, + "grad_norm": 0.23502351343631744, + "learning_rate": 6.105196026399025e-05, + "loss": 1.7335, + "step": 14538 + }, + { + "epoch": 4.46255371393493, + "grad_norm": 0.22143007814884186, + "learning_rate": 6.104711259670941e-05, + "loss": 1.7338, + "step": 14539 + }, + { + "epoch": 4.462860650705955, + "grad_norm": 0.22361041605472565, + "learning_rate": 6.104226482025453e-05, + "loss": 1.7033, + "step": 14540 + }, + { + "epoch": 4.463167587476979, + "grad_norm": 0.27104905247688293, + "learning_rate": 6.10374169346735e-05, + "loss": 1.7926, + "step": 14541 + }, + { + "epoch": 4.463474524248005, + "grad_norm": 0.23564264178276062, + "learning_rate": 6.103256894001427e-05, + "loss": 1.7522, + "step": 14542 + }, + { + "epoch": 4.46378146101903, + "grad_norm": 0.2585970163345337, + "learning_rate": 6.102772083632471e-05, + "loss": 1.7755, + "step": 14543 + }, + { + "epoch": 4.464088397790055, + "grad_norm": 0.358634889125824, + "learning_rate": 6.102287262365276e-05, + "loss": 1.8092, + "step": 14544 + }, + { + "epoch": 4.464395334561081, + "grad_norm": 0.2862946689128876, + "learning_rate": 6.1018024302046314e-05, + "loss": 1.7051, + "step": 14545 + }, + { + "epoch": 4.464702271332105, + "grad_norm": 0.21907158195972443, + "learning_rate": 6.101317587155331e-05, + "loss": 1.7882, + "step": 14546 + }, + { + "epoch": 4.4650092081031305, + "grad_norm": 0.24268488585948944, + "learning_rate": 6.100832733222164e-05, + "loss": 1.7756, + "step": 14547 + }, + { + "epoch": 4.465316144874156, + "grad_norm": 0.2350744605064392, + "learning_rate": 6.1003478684099214e-05, + "loss": 1.7483, + "step": 14548 + }, + 
{ + "epoch": 4.465623081645181, + "grad_norm": 0.22902250289916992, + "learning_rate": 6.099862992723397e-05, + "loss": 1.7687, + "step": 14549 + }, + { + "epoch": 4.4659300184162065, + "grad_norm": 0.23590944707393646, + "learning_rate": 6.099378106167382e-05, + "loss": 1.8481, + "step": 14550 + }, + { + "epoch": 4.466236955187231, + "grad_norm": 0.23644296824932098, + "learning_rate": 6.098893208746668e-05, + "loss": 1.7422, + "step": 14551 + }, + { + "epoch": 4.466543891958256, + "grad_norm": 0.23782360553741455, + "learning_rate": 6.0984083004660475e-05, + "loss": 1.7852, + "step": 14552 + }, + { + "epoch": 4.466850828729282, + "grad_norm": 0.2546575665473938, + "learning_rate": 6.097923381330313e-05, + "loss": 1.8483, + "step": 14553 + }, + { + "epoch": 4.467157765500307, + "grad_norm": 0.2555409371852875, + "learning_rate": 6.097438451344254e-05, + "loss": 1.7887, + "step": 14554 + }, + { + "epoch": 4.467464702271332, + "grad_norm": 0.28074198961257935, + "learning_rate": 6.0969535105126664e-05, + "loss": 1.7521, + "step": 14555 + }, + { + "epoch": 4.467771639042358, + "grad_norm": 0.22622554004192352, + "learning_rate": 6.096468558840341e-05, + "loss": 1.8088, + "step": 14556 + }, + { + "epoch": 4.468078575813382, + "grad_norm": 0.302749902009964, + "learning_rate": 6.095983596332071e-05, + "loss": 1.8192, + "step": 14557 + }, + { + "epoch": 4.468385512584407, + "grad_norm": 0.27925750613212585, + "learning_rate": 6.0954986229926494e-05, + "loss": 1.8453, + "step": 14558 + }, + { + "epoch": 4.468692449355433, + "grad_norm": 0.2246330976486206, + "learning_rate": 6.095013638826868e-05, + "loss": 1.744, + "step": 14559 + }, + { + "epoch": 4.468999386126458, + "grad_norm": 0.26677101850509644, + "learning_rate": 6.094528643839518e-05, + "loss": 1.708, + "step": 14560 + }, + { + "epoch": 4.469306322897483, + "grad_norm": 0.23684042692184448, + "learning_rate": 6.094043638035396e-05, + "loss": 1.713, + "step": 14561 + }, + { + "epoch": 4.469613259668508, + 
"grad_norm": 0.2470075935125351, + "learning_rate": 6.093558621419294e-05, + "loss": 1.8096, + "step": 14562 + }, + { + "epoch": 4.469920196439533, + "grad_norm": 0.2775517702102661, + "learning_rate": 6.093073593996005e-05, + "loss": 1.697, + "step": 14563 + }, + { + "epoch": 4.4702271332105585, + "grad_norm": 0.21053175628185272, + "learning_rate": 6.092588555770322e-05, + "loss": 1.6894, + "step": 14564 + }, + { + "epoch": 4.470534069981584, + "grad_norm": 0.2555869221687317, + "learning_rate": 6.0921035067470366e-05, + "loss": 1.7051, + "step": 14565 + }, + { + "epoch": 4.470841006752609, + "grad_norm": 0.34468984603881836, + "learning_rate": 6.0916184469309454e-05, + "loss": 1.7317, + "step": 14566 + }, + { + "epoch": 4.4711479435236345, + "grad_norm": 0.2517752945423126, + "learning_rate": 6.0911333763268407e-05, + "loss": 1.7524, + "step": 14567 + }, + { + "epoch": 4.471454880294659, + "grad_norm": 0.2749727666378021, + "learning_rate": 6.090648294939517e-05, + "loss": 1.7045, + "step": 14568 + }, + { + "epoch": 4.471761817065684, + "grad_norm": 0.36250773072242737, + "learning_rate": 6.0901632027737673e-05, + "loss": 1.7196, + "step": 14569 + }, + { + "epoch": 4.47206875383671, + "grad_norm": 0.2317698448896408, + "learning_rate": 6.089678099834386e-05, + "loss": 1.7318, + "step": 14570 + }, + { + "epoch": 4.472375690607735, + "grad_norm": 0.2863345444202423, + "learning_rate": 6.089192986126166e-05, + "loss": 1.7798, + "step": 14571 + }, + { + "epoch": 4.47268262737876, + "grad_norm": 0.3493366241455078, + "learning_rate": 6.088707861653904e-05, + "loss": 1.7749, + "step": 14572 + }, + { + "epoch": 4.472989564149785, + "grad_norm": 0.25718605518341064, + "learning_rate": 6.0882227264223924e-05, + "loss": 1.7683, + "step": 14573 + }, + { + "epoch": 4.47329650092081, + "grad_norm": 0.2320062816143036, + "learning_rate": 6.087737580436426e-05, + "loss": 1.8296, + "step": 14574 + }, + { + "epoch": 4.473603437691835, + "grad_norm": 0.29071560502052307, + 
"learning_rate": 6.087252423700799e-05, + "loss": 1.7428, + "step": 14575 + }, + { + "epoch": 4.473910374462861, + "grad_norm": 0.24233707785606384, + "learning_rate": 6.086767256220306e-05, + "loss": 1.7332, + "step": 14576 + }, + { + "epoch": 4.474217311233886, + "grad_norm": 0.228043332695961, + "learning_rate": 6.086282077999742e-05, + "loss": 1.7697, + "step": 14577 + }, + { + "epoch": 4.474524248004911, + "grad_norm": 0.29154402017593384, + "learning_rate": 6.085796889043902e-05, + "loss": 1.8043, + "step": 14578 + }, + { + "epoch": 4.474831184775936, + "grad_norm": 0.30543211102485657, + "learning_rate": 6.0853116893575814e-05, + "loss": 1.7665, + "step": 14579 + }, + { + "epoch": 4.475138121546961, + "grad_norm": 0.22792959213256836, + "learning_rate": 6.0848264789455754e-05, + "loss": 1.729, + "step": 14580 + }, + { + "epoch": 4.475445058317987, + "grad_norm": 0.2615707218647003, + "learning_rate": 6.084341257812677e-05, + "loss": 1.7438, + "step": 14581 + }, + { + "epoch": 4.475751995089012, + "grad_norm": 0.23342981934547424, + "learning_rate": 6.083856025963681e-05, + "loss": 1.7158, + "step": 14582 + }, + { + "epoch": 4.476058931860037, + "grad_norm": 0.22279240190982819, + "learning_rate": 6.083370783403387e-05, + "loss": 1.7413, + "step": 14583 + }, + { + "epoch": 4.476365868631062, + "grad_norm": 0.28867462277412415, + "learning_rate": 6.082885530136587e-05, + "loss": 1.7932, + "step": 14584 + }, + { + "epoch": 4.476672805402087, + "grad_norm": 0.2947152256965637, + "learning_rate": 6.082400266168078e-05, + "loss": 1.8986, + "step": 14585 + }, + { + "epoch": 4.476979742173112, + "grad_norm": 0.2948935627937317, + "learning_rate": 6.0819149915026555e-05, + "loss": 1.9134, + "step": 14586 + }, + { + "epoch": 4.477286678944138, + "grad_norm": 0.4436163902282715, + "learning_rate": 6.081429706145114e-05, + "loss": 1.7616, + "step": 14587 + }, + { + "epoch": 4.477593615715163, + "grad_norm": 0.4879693388938904, + "learning_rate": 6.080944410100249e-05, + 
"loss": 1.8155, + "step": 14588 + }, + { + "epoch": 4.4779005524861875, + "grad_norm": 0.29742667078971863, + "learning_rate": 6.08045910337286e-05, + "loss": 1.7428, + "step": 14589 + }, + { + "epoch": 4.478207489257213, + "grad_norm": 0.2994751036167145, + "learning_rate": 6.0799737859677395e-05, + "loss": 1.7764, + "step": 14590 + }, + { + "epoch": 4.478514426028238, + "grad_norm": 0.46379905939102173, + "learning_rate": 6.079488457889686e-05, + "loss": 1.7289, + "step": 14591 + }, + { + "epoch": 4.4788213627992635, + "grad_norm": 0.3511717617511749, + "learning_rate": 6.0790031191434946e-05, + "loss": 1.7658, + "step": 14592 + }, + { + "epoch": 4.479128299570289, + "grad_norm": 0.22678083181381226, + "learning_rate": 6.0785177697339626e-05, + "loss": 1.7973, + "step": 14593 + }, + { + "epoch": 4.479435236341313, + "grad_norm": 0.31201767921447754, + "learning_rate": 6.0780324096658837e-05, + "loss": 1.7542, + "step": 14594 + }, + { + "epoch": 4.479742173112339, + "grad_norm": 0.23759113252162933, + "learning_rate": 6.077547038944058e-05, + "loss": 1.7191, + "step": 14595 + }, + { + "epoch": 4.480049109883364, + "grad_norm": 0.25801756978034973, + "learning_rate": 6.077061657573282e-05, + "loss": 1.8229, + "step": 14596 + }, + { + "epoch": 4.480356046654389, + "grad_norm": 0.3435722887516022, + "learning_rate": 6.0765762655583514e-05, + "loss": 1.7633, + "step": 14597 + }, + { + "epoch": 4.480662983425415, + "grad_norm": 0.2710443437099457, + "learning_rate": 6.076090862904063e-05, + "loss": 1.8126, + "step": 14598 + }, + { + "epoch": 4.48096992019644, + "grad_norm": 0.25750285387039185, + "learning_rate": 6.075605449615212e-05, + "loss": 1.7382, + "step": 14599 + }, + { + "epoch": 4.481276856967464, + "grad_norm": 0.3638051152229309, + "learning_rate": 6.075120025696598e-05, + "loss": 1.8191, + "step": 14600 + }, + { + "epoch": 4.48158379373849, + "grad_norm": 0.24185293912887573, + "learning_rate": 6.074634591153019e-05, + "loss": 1.7637, + "step": 14601 + }, 
+ { + "epoch": 4.481890730509515, + "grad_norm": 0.317283570766449, + "learning_rate": 6.0741491459892707e-05, + "loss": 1.7805, + "step": 14602 + }, + { + "epoch": 4.48219766728054, + "grad_norm": 0.33884385228157043, + "learning_rate": 6.073663690210151e-05, + "loss": 1.7719, + "step": 14603 + }, + { + "epoch": 4.482504604051566, + "grad_norm": 0.2554258704185486, + "learning_rate": 6.073178223820457e-05, + "loss": 1.836, + "step": 14604 + }, + { + "epoch": 4.48281154082259, + "grad_norm": 0.3363535702228546, + "learning_rate": 6.072692746824987e-05, + "loss": 1.8249, + "step": 14605 + }, + { + "epoch": 4.4831184775936155, + "grad_norm": 0.36090195178985596, + "learning_rate": 6.072207259228537e-05, + "loss": 1.733, + "step": 14606 + }, + { + "epoch": 4.483425414364641, + "grad_norm": 0.21928483247756958, + "learning_rate": 6.071721761035909e-05, + "loss": 1.7413, + "step": 14607 + }, + { + "epoch": 4.483732351135666, + "grad_norm": 0.4256608486175537, + "learning_rate": 6.071236252251897e-05, + "loss": 1.7585, + "step": 14608 + }, + { + "epoch": 4.4840392879066915, + "grad_norm": 0.41980308294296265, + "learning_rate": 6.0707507328813007e-05, + "loss": 1.7584, + "step": 14609 + }, + { + "epoch": 4.484346224677717, + "grad_norm": 0.200295090675354, + "learning_rate": 6.0702652029289186e-05, + "loss": 1.7492, + "step": 14610 + }, + { + "epoch": 4.484653161448741, + "grad_norm": 0.41847771406173706, + "learning_rate": 6.069779662399549e-05, + "loss": 1.8101, + "step": 14611 + }, + { + "epoch": 4.484960098219767, + "grad_norm": 0.4846353530883789, + "learning_rate": 6.069294111297987e-05, + "loss": 1.8227, + "step": 14612 + }, + { + "epoch": 4.485267034990792, + "grad_norm": 0.23216098546981812, + "learning_rate": 6.068808549629036e-05, + "loss": 1.6811, + "step": 14613 + }, + { + "epoch": 4.485573971761817, + "grad_norm": 0.34903186559677124, + "learning_rate": 6.0683229773974934e-05, + "loss": 1.6858, + "step": 14614 + }, + { + "epoch": 4.485880908532843, + 
"grad_norm": 0.4349122941493988, + "learning_rate": 6.0678373946081556e-05, + "loss": 1.7704, + "step": 14615 + }, + { + "epoch": 4.486187845303867, + "grad_norm": 0.25738775730133057, + "learning_rate": 6.067351801265824e-05, + "loss": 1.7487, + "step": 14616 + }, + { + "epoch": 4.486494782074892, + "grad_norm": 0.3052736818790436, + "learning_rate": 6.0668661973752936e-05, + "loss": 1.7528, + "step": 14617 + }, + { + "epoch": 4.486801718845918, + "grad_norm": 0.3400498628616333, + "learning_rate": 6.066380582941368e-05, + "loss": 1.7414, + "step": 14618 + }, + { + "epoch": 4.487108655616943, + "grad_norm": 0.28251948952674866, + "learning_rate": 6.065894957968845e-05, + "loss": 1.8078, + "step": 14619 + }, + { + "epoch": 4.487415592387968, + "grad_norm": 0.26907965540885925, + "learning_rate": 6.0654093224625216e-05, + "loss": 1.8143, + "step": 14620 + }, + { + "epoch": 4.487722529158993, + "grad_norm": 0.2821955978870392, + "learning_rate": 6.064923676427201e-05, + "loss": 1.7163, + "step": 14621 + }, + { + "epoch": 4.488029465930018, + "grad_norm": 0.2223028987646103, + "learning_rate": 6.0644380198676786e-05, + "loss": 1.704, + "step": 14622 + }, + { + "epoch": 4.4883364027010435, + "grad_norm": 0.25243067741394043, + "learning_rate": 6.063952352788755e-05, + "loss": 1.7236, + "step": 14623 + }, + { + "epoch": 4.488643339472069, + "grad_norm": 0.30026015639305115, + "learning_rate": 6.063466675195233e-05, + "loss": 1.7575, + "step": 14624 + }, + { + "epoch": 4.488950276243094, + "grad_norm": 0.2055491805076599, + "learning_rate": 6.0629809870919085e-05, + "loss": 1.7294, + "step": 14625 + }, + { + "epoch": 4.4892572130141195, + "grad_norm": 0.2507593035697937, + "learning_rate": 6.0624952884835836e-05, + "loss": 1.762, + "step": 14626 + }, + { + "epoch": 4.489564149785144, + "grad_norm": 0.21385909616947174, + "learning_rate": 6.0620095793750576e-05, + "loss": 1.7396, + "step": 14627 + }, + { + "epoch": 4.489871086556169, + "grad_norm": 0.21926651895046234, + 
"learning_rate": 6.06152385977113e-05, + "loss": 1.7863, + "step": 14628 + }, + { + "epoch": 4.490178023327195, + "grad_norm": 0.21950845420360565, + "learning_rate": 6.0610381296766016e-05, + "loss": 1.7576, + "step": 14629 + }, + { + "epoch": 4.49048496009822, + "grad_norm": 0.2030971795320511, + "learning_rate": 6.0605523890962736e-05, + "loss": 1.7069, + "step": 14630 + }, + { + "epoch": 4.490791896869245, + "grad_norm": 0.23991432785987854, + "learning_rate": 6.0600666380349436e-05, + "loss": 1.7598, + "step": 14631 + }, + { + "epoch": 4.49109883364027, + "grad_norm": 0.23766861855983734, + "learning_rate": 6.059580876497415e-05, + "loss": 1.7687, + "step": 14632 + }, + { + "epoch": 4.491405770411295, + "grad_norm": 0.2361454963684082, + "learning_rate": 6.059095104488487e-05, + "loss": 1.7883, + "step": 14633 + }, + { + "epoch": 4.49171270718232, + "grad_norm": 0.3128328323364258, + "learning_rate": 6.058609322012958e-05, + "loss": 1.8087, + "step": 14634 + }, + { + "epoch": 4.492019643953346, + "grad_norm": 0.2958957850933075, + "learning_rate": 6.0581235290756335e-05, + "loss": 1.782, + "step": 14635 + }, + { + "epoch": 4.492326580724371, + "grad_norm": 0.2197243571281433, + "learning_rate": 6.057637725681312e-05, + "loss": 1.7408, + "step": 14636 + }, + { + "epoch": 4.4926335174953955, + "grad_norm": 0.22227831184864044, + "learning_rate": 6.0571519118347944e-05, + "loss": 1.734, + "step": 14637 + }, + { + "epoch": 4.492940454266421, + "grad_norm": 0.2784527540206909, + "learning_rate": 6.056666087540882e-05, + "loss": 1.8017, + "step": 14638 + }, + { + "epoch": 4.493247391037446, + "grad_norm": 0.21929821372032166, + "learning_rate": 6.056180252804377e-05, + "loss": 1.7271, + "step": 14639 + }, + { + "epoch": 4.4935543278084715, + "grad_norm": 0.2156134843826294, + "learning_rate": 6.055694407630077e-05, + "loss": 1.8082, + "step": 14640 + }, + { + "epoch": 4.493861264579497, + "grad_norm": 0.22672387957572937, + "learning_rate": 6.0552085520227875e-05, + 
"loss": 1.7506, + "step": 14641 + }, + { + "epoch": 4.494168201350522, + "grad_norm": 0.228785440325737, + "learning_rate": 6.0547226859873086e-05, + "loss": 1.7023, + "step": 14642 + }, + { + "epoch": 4.494475138121547, + "grad_norm": 0.19483685493469238, + "learning_rate": 6.054236809528443e-05, + "loss": 1.6879, + "step": 14643 + }, + { + "epoch": 4.494782074892572, + "grad_norm": 0.24911309778690338, + "learning_rate": 6.0537509226509904e-05, + "loss": 1.7856, + "step": 14644 + }, + { + "epoch": 4.495089011663597, + "grad_norm": 0.24811938405036926, + "learning_rate": 6.053265025359753e-05, + "loss": 1.7581, + "step": 14645 + }, + { + "epoch": 4.495395948434623, + "grad_norm": 0.2487260401248932, + "learning_rate": 6.052779117659534e-05, + "loss": 1.7536, + "step": 14646 + }, + { + "epoch": 4.495702885205648, + "grad_norm": 0.2594854235649109, + "learning_rate": 6.052293199555136e-05, + "loss": 1.7822, + "step": 14647 + }, + { + "epoch": 4.496009821976672, + "grad_norm": 0.22837325930595398, + "learning_rate": 6.051807271051359e-05, + "loss": 1.7542, + "step": 14648 + }, + { + "epoch": 4.496316758747698, + "grad_norm": 0.23106649518013, + "learning_rate": 6.051321332153005e-05, + "loss": 1.7758, + "step": 14649 + }, + { + "epoch": 4.496623695518723, + "grad_norm": 0.29424673318862915, + "learning_rate": 6.050835382864878e-05, + "loss": 1.8335, + "step": 14650 + }, + { + "epoch": 4.496930632289748, + "grad_norm": 0.28297343850135803, + "learning_rate": 6.050349423191779e-05, + "loss": 1.7711, + "step": 14651 + }, + { + "epoch": 4.497237569060774, + "grad_norm": 0.2001795768737793, + "learning_rate": 6.049863453138511e-05, + "loss": 1.7008, + "step": 14652 + }, + { + "epoch": 4.497544505831799, + "grad_norm": 0.35177022218704224, + "learning_rate": 6.04937747270988e-05, + "loss": 1.7763, + "step": 14653 + }, + { + "epoch": 4.4978514426028235, + "grad_norm": 0.28870898485183716, + "learning_rate": 6.0488914819106835e-05, + "loss": 1.7373, + "step": 14654 + }, + { 
+ "epoch": 4.498158379373849, + "grad_norm": 0.23962664604187012, + "learning_rate": 6.048405480745727e-05, + "loss": 1.7278, + "step": 14655 + }, + { + "epoch": 4.498465316144874, + "grad_norm": 0.324505478143692, + "learning_rate": 6.047919469219813e-05, + "loss": 1.7674, + "step": 14656 + }, + { + "epoch": 4.4987722529158995, + "grad_norm": 0.38313817977905273, + "learning_rate": 6.047433447337744e-05, + "loss": 1.789, + "step": 14657 + }, + { + "epoch": 4.499079189686925, + "grad_norm": 0.2101358324289322, + "learning_rate": 6.046947415104324e-05, + "loss": 1.7331, + "step": 14658 + }, + { + "epoch": 4.499386126457949, + "grad_norm": 0.3388524353504181, + "learning_rate": 6.046461372524357e-05, + "loss": 1.8467, + "step": 14659 + }, + { + "epoch": 4.499693063228975, + "grad_norm": 0.3360123634338379, + "learning_rate": 6.045975319602645e-05, + "loss": 1.8427, + "step": 14660 + }, + { + "epoch": 4.5, + "grad_norm": 0.27596545219421387, + "learning_rate": 6.0454892563439914e-05, + "loss": 1.7768, + "step": 14661 + }, + { + "epoch": 4.500306936771025, + "grad_norm": 0.2580861747264862, + "learning_rate": 6.0450031827532e-05, + "loss": 1.763, + "step": 14662 + }, + { + "epoch": 4.500613873542051, + "grad_norm": 0.3521091938018799, + "learning_rate": 6.044517098835074e-05, + "loss": 1.7118, + "step": 14663 + }, + { + "epoch": 4.500920810313076, + "grad_norm": 0.29412439465522766, + "learning_rate": 6.0440310045944204e-05, + "loss": 1.7252, + "step": 14664 + }, + { + "epoch": 4.5012277470841005, + "grad_norm": 0.23845252394676208, + "learning_rate": 6.043544900036039e-05, + "loss": 1.7622, + "step": 14665 + }, + { + "epoch": 4.501534683855126, + "grad_norm": 0.22957031428813934, + "learning_rate": 6.043058785164736e-05, + "loss": 1.7527, + "step": 14666 + }, + { + "epoch": 4.501841620626151, + "grad_norm": 0.2564462721347809, + "learning_rate": 6.042572659985314e-05, + "loss": 1.801, + "step": 14667 + }, + { + "epoch": 4.5021485573971765, + "grad_norm": 
0.22588051855564117, + "learning_rate": 6.042086524502576e-05, + "loss": 1.7387, + "step": 14668 + }, + { + "epoch": 4.502455494168201, + "grad_norm": 0.2609740197658539, + "learning_rate": 6.0416003787213306e-05, + "loss": 1.7615, + "step": 14669 + }, + { + "epoch": 4.502762430939226, + "grad_norm": 0.2535521984100342, + "learning_rate": 6.041114222646379e-05, + "loss": 1.7398, + "step": 14670 + }, + { + "epoch": 4.503069367710252, + "grad_norm": 0.2512127757072449, + "learning_rate": 6.040628056282527e-05, + "loss": 1.7679, + "step": 14671 + }, + { + "epoch": 4.503376304481277, + "grad_norm": 0.2438639998435974, + "learning_rate": 6.0401418796345774e-05, + "loss": 1.7, + "step": 14672 + }, + { + "epoch": 4.503683241252302, + "grad_norm": 0.23428042232990265, + "learning_rate": 6.0396556927073376e-05, + "loss": 1.7748, + "step": 14673 + }, + { + "epoch": 4.503990178023328, + "grad_norm": 0.22894345223903656, + "learning_rate": 6.03916949550561e-05, + "loss": 1.7881, + "step": 14674 + }, + { + "epoch": 4.504297114794352, + "grad_norm": 0.24813716113567352, + "learning_rate": 6.0386832880342006e-05, + "loss": 1.7676, + "step": 14675 + }, + { + "epoch": 4.504604051565377, + "grad_norm": 0.23448842763900757, + "learning_rate": 6.038197070297914e-05, + "loss": 1.7828, + "step": 14676 + }, + { + "epoch": 4.504910988336403, + "grad_norm": 0.25302332639694214, + "learning_rate": 6.037710842301556e-05, + "loss": 1.8061, + "step": 14677 + }, + { + "epoch": 4.505217925107428, + "grad_norm": 0.2411813735961914, + "learning_rate": 6.0372246040499305e-05, + "loss": 1.6901, + "step": 14678 + }, + { + "epoch": 4.505524861878453, + "grad_norm": 0.3154819905757904, + "learning_rate": 6.036738355547844e-05, + "loss": 1.7472, + "step": 14679 + }, + { + "epoch": 4.505831798649478, + "grad_norm": 0.2935639023780823, + "learning_rate": 6.0362520968001014e-05, + "loss": 1.7508, + "step": 14680 + }, + { + "epoch": 4.506138735420503, + "grad_norm": 0.27064070105552673, + "learning_rate": 
6.035765827811508e-05, + "loss": 1.8133, + "step": 14681 + }, + { + "epoch": 4.5064456721915285, + "grad_norm": 0.23748525977134705, + "learning_rate": 6.03527954858687e-05, + "loss": 1.7742, + "step": 14682 + }, + { + "epoch": 4.506752608962554, + "grad_norm": 0.216410830616951, + "learning_rate": 6.034793259130992e-05, + "loss": 1.7448, + "step": 14683 + }, + { + "epoch": 4.507059545733579, + "grad_norm": 0.23339977860450745, + "learning_rate": 6.034306959448681e-05, + "loss": 1.7437, + "step": 14684 + }, + { + "epoch": 4.5073664825046045, + "grad_norm": 0.23951120674610138, + "learning_rate": 6.0338206495447414e-05, + "loss": 1.7535, + "step": 14685 + }, + { + "epoch": 4.507673419275629, + "grad_norm": 0.22137518227100372, + "learning_rate": 6.0333343294239816e-05, + "loss": 1.7537, + "step": 14686 + }, + { + "epoch": 4.507980356046654, + "grad_norm": 0.2550075054168701, + "learning_rate": 6.032847999091206e-05, + "loss": 1.8069, + "step": 14687 + }, + { + "epoch": 4.50828729281768, + "grad_norm": 0.2166420966386795, + "learning_rate": 6.032361658551221e-05, + "loss": 1.7746, + "step": 14688 + }, + { + "epoch": 4.508594229588705, + "grad_norm": 0.21926096081733704, + "learning_rate": 6.031875307808833e-05, + "loss": 1.7848, + "step": 14689 + }, + { + "epoch": 4.50890116635973, + "grad_norm": 0.27769652009010315, + "learning_rate": 6.031388946868848e-05, + "loss": 1.7563, + "step": 14690 + }, + { + "epoch": 4.509208103130755, + "grad_norm": 0.23417410254478455, + "learning_rate": 6.030902575736074e-05, + "loss": 1.7475, + "step": 14691 + }, + { + "epoch": 4.50951503990178, + "grad_norm": 0.25454118847846985, + "learning_rate": 6.030416194415314e-05, + "loss": 1.7416, + "step": 14692 + }, + { + "epoch": 4.509821976672805, + "grad_norm": 0.3118220567703247, + "learning_rate": 6.029929802911379e-05, + "loss": 1.8001, + "step": 14693 + }, + { + "epoch": 4.510128913443831, + "grad_norm": 0.2338017225265503, + "learning_rate": 6.029443401229075e-05, + "loss": 1.7243, + 
"step": 14694 + }, + { + "epoch": 4.510435850214856, + "grad_norm": 0.2490454763174057, + "learning_rate": 6.028956989373207e-05, + "loss": 1.7866, + "step": 14695 + }, + { + "epoch": 4.510742786985881, + "grad_norm": 0.2579275369644165, + "learning_rate": 6.028470567348582e-05, + "loss": 1.7594, + "step": 14696 + }, + { + "epoch": 4.511049723756906, + "grad_norm": 0.23982174694538116, + "learning_rate": 6.0279841351600094e-05, + "loss": 1.7444, + "step": 14697 + }, + { + "epoch": 4.511356660527931, + "grad_norm": 0.2160159945487976, + "learning_rate": 6.027497692812295e-05, + "loss": 1.7002, + "step": 14698 + }, + { + "epoch": 4.5116635972989565, + "grad_norm": 0.24604511260986328, + "learning_rate": 6.0270112403102455e-05, + "loss": 1.7654, + "step": 14699 + }, + { + "epoch": 4.511970534069982, + "grad_norm": 0.21978263556957245, + "learning_rate": 6.026524777658669e-05, + "loss": 1.7278, + "step": 14700 + }, + { + "epoch": 4.512277470841006, + "grad_norm": 0.2814212441444397, + "learning_rate": 6.026038304862373e-05, + "loss": 1.7743, + "step": 14701 + }, + { + "epoch": 4.512584407612032, + "grad_norm": 0.23798944056034088, + "learning_rate": 6.025551821926165e-05, + "loss": 1.7348, + "step": 14702 + }, + { + "epoch": 4.512891344383057, + "grad_norm": 0.22415988147258759, + "learning_rate": 6.025065328854853e-05, + "loss": 1.7973, + "step": 14703 + }, + { + "epoch": 4.513198281154082, + "grad_norm": 0.34614792466163635, + "learning_rate": 6.0245788256532445e-05, + "loss": 1.7263, + "step": 14704 + }, + { + "epoch": 4.513505217925108, + "grad_norm": 0.333918958902359, + "learning_rate": 6.0240923123261485e-05, + "loss": 1.7305, + "step": 14705 + }, + { + "epoch": 4.513812154696133, + "grad_norm": 0.22231793403625488, + "learning_rate": 6.02360578887837e-05, + "loss": 1.806, + "step": 14706 + }, + { + "epoch": 4.514119091467157, + "grad_norm": 0.23323194682598114, + "learning_rate": 6.023119255314721e-05, + "loss": 1.7076, + "step": 14707 + }, + { + "epoch": 
4.514426028238183, + "grad_norm": 0.26695477962493896, + "learning_rate": 6.022632711640007e-05, + "loss": 1.775, + "step": 14708 + }, + { + "epoch": 4.514732965009208, + "grad_norm": 0.21446476876735687, + "learning_rate": 6.0221461578590364e-05, + "loss": 1.7524, + "step": 14709 + }, + { + "epoch": 4.515039901780233, + "grad_norm": 0.2677358090877533, + "learning_rate": 6.0216595939766204e-05, + "loss": 1.7513, + "step": 14710 + }, + { + "epoch": 4.515346838551259, + "grad_norm": 0.28648239374160767, + "learning_rate": 6.021173019997565e-05, + "loss": 1.7249, + "step": 14711 + }, + { + "epoch": 4.515653775322283, + "grad_norm": 0.2178548276424408, + "learning_rate": 6.020686435926678e-05, + "loss": 1.7502, + "step": 14712 + }, + { + "epoch": 4.5159607120933085, + "grad_norm": 0.3391740024089813, + "learning_rate": 6.02019984176877e-05, + "loss": 1.6828, + "step": 14713 + }, + { + "epoch": 4.516267648864334, + "grad_norm": 0.25222229957580566, + "learning_rate": 6.01971323752865e-05, + "loss": 1.6982, + "step": 14714 + }, + { + "epoch": 4.516574585635359, + "grad_norm": 0.28776636719703674, + "learning_rate": 6.019226623211125e-05, + "loss": 1.8595, + "step": 14715 + }, + { + "epoch": 4.5168815224063845, + "grad_norm": 0.3240084648132324, + "learning_rate": 6.018739998821006e-05, + "loss": 1.7461, + "step": 14716 + }, + { + "epoch": 4.51718845917741, + "grad_norm": 0.26735052466392517, + "learning_rate": 6.0182533643631015e-05, + "loss": 1.7955, + "step": 14717 + }, + { + "epoch": 4.517495395948434, + "grad_norm": 0.24573692679405212, + "learning_rate": 6.017766719842219e-05, + "loss": 1.7441, + "step": 14718 + }, + { + "epoch": 4.51780233271946, + "grad_norm": 0.27401313185691833, + "learning_rate": 6.01728006526317e-05, + "loss": 1.7399, + "step": 14719 + }, + { + "epoch": 4.518109269490485, + "grad_norm": 0.23578806221485138, + "learning_rate": 6.016793400630763e-05, + "loss": 1.7936, + "step": 14720 + }, + { + "epoch": 4.51841620626151, + "grad_norm": 
0.27763426303863525, + "learning_rate": 6.0163067259498074e-05, + "loss": 1.7263, + "step": 14721 + }, + { + "epoch": 4.518723143032536, + "grad_norm": 0.27102044224739075, + "learning_rate": 6.015820041225113e-05, + "loss": 1.7085, + "step": 14722 + }, + { + "epoch": 4.51903007980356, + "grad_norm": 0.2046152651309967, + "learning_rate": 6.01533334646149e-05, + "loss": 1.7602, + "step": 14723 + }, + { + "epoch": 4.519337016574585, + "grad_norm": 0.2645253837108612, + "learning_rate": 6.0148466416637484e-05, + "loss": 1.7729, + "step": 14724 + }, + { + "epoch": 4.519643953345611, + "grad_norm": 0.27467650175094604, + "learning_rate": 6.014359926836697e-05, + "loss": 1.7834, + "step": 14725 + }, + { + "epoch": 4.519950890116636, + "grad_norm": 0.30357635021209717, + "learning_rate": 6.013873201985145e-05, + "loss": 1.8685, + "step": 14726 + }, + { + "epoch": 4.520257826887661, + "grad_norm": 0.22923336923122406, + "learning_rate": 6.013386467113905e-05, + "loss": 1.7531, + "step": 14727 + }, + { + "epoch": 4.520564763658687, + "grad_norm": 0.2792156934738159, + "learning_rate": 6.012899722227786e-05, + "loss": 1.7927, + "step": 14728 + }, + { + "epoch": 4.520871700429711, + "grad_norm": 0.286161869764328, + "learning_rate": 6.012412967331598e-05, + "loss": 1.77, + "step": 14729 + }, + { + "epoch": 4.5211786372007365, + "grad_norm": 0.23964659869670868, + "learning_rate": 6.011926202430151e-05, + "loss": 1.7873, + "step": 14730 + }, + { + "epoch": 4.521485573971762, + "grad_norm": 0.2250162959098816, + "learning_rate": 6.011439427528258e-05, + "loss": 1.741, + "step": 14731 + }, + { + "epoch": 4.521792510742787, + "grad_norm": 0.2797175347805023, + "learning_rate": 6.010952642630726e-05, + "loss": 1.7482, + "step": 14732 + }, + { + "epoch": 4.5220994475138125, + "grad_norm": 0.22159560024738312, + "learning_rate": 6.010465847742368e-05, + "loss": 1.7591, + "step": 14733 + }, + { + "epoch": 4.522406384284837, + "grad_norm": 0.26638463139533997, + "learning_rate": 
6.009979042867995e-05, + "loss": 1.8564, + "step": 14734 + }, + { + "epoch": 4.522713321055862, + "grad_norm": 0.2972821891307831, + "learning_rate": 6.009492228012416e-05, + "loss": 1.7569, + "step": 14735 + }, + { + "epoch": 4.523020257826888, + "grad_norm": 0.28108885884284973, + "learning_rate": 6.0090054031804444e-05, + "loss": 1.7256, + "step": 14736 + }, + { + "epoch": 4.523327194597913, + "grad_norm": 0.22359851002693176, + "learning_rate": 6.008518568376888e-05, + "loss": 1.7342, + "step": 14737 + }, + { + "epoch": 4.523634131368938, + "grad_norm": 0.2620728015899658, + "learning_rate": 6.008031723606562e-05, + "loss": 1.7703, + "step": 14738 + }, + { + "epoch": 4.523941068139964, + "grad_norm": 0.2641485333442688, + "learning_rate": 6.007544868874274e-05, + "loss": 1.6944, + "step": 14739 + }, + { + "epoch": 4.524248004910988, + "grad_norm": 0.24957752227783203, + "learning_rate": 6.007058004184839e-05, + "loss": 1.7746, + "step": 14740 + }, + { + "epoch": 4.524554941682013, + "grad_norm": 0.29830998182296753, + "learning_rate": 6.006571129543065e-05, + "loss": 1.7718, + "step": 14741 + }, + { + "epoch": 4.524861878453039, + "grad_norm": 0.32740798592567444, + "learning_rate": 6.006084244953766e-05, + "loss": 1.8194, + "step": 14742 + }, + { + "epoch": 4.525168815224064, + "grad_norm": 0.2614956796169281, + "learning_rate": 6.005597350421751e-05, + "loss": 1.7078, + "step": 14743 + }, + { + "epoch": 4.525475751995089, + "grad_norm": 0.23940515518188477, + "learning_rate": 6.005110445951836e-05, + "loss": 1.7488, + "step": 14744 + }, + { + "epoch": 4.525782688766114, + "grad_norm": 0.25485914945602417, + "learning_rate": 6.004623531548829e-05, + "loss": 1.7705, + "step": 14745 + }, + { + "epoch": 4.526089625537139, + "grad_norm": 0.213532954454422, + "learning_rate": 6.0041366072175445e-05, + "loss": 1.7501, + "step": 14746 + }, + { + "epoch": 4.526396562308165, + "grad_norm": 0.2420104295015335, + "learning_rate": 6.003649672962792e-05, + "loss": 1.717, + 
"step": 14747 + }, + { + "epoch": 4.52670349907919, + "grad_norm": 0.26179102063179016, + "learning_rate": 6.0031627287893865e-05, + "loss": 1.7665, + "step": 14748 + }, + { + "epoch": 4.527010435850215, + "grad_norm": 0.22032082080841064, + "learning_rate": 6.002675774702139e-05, + "loss": 1.7555, + "step": 14749 + }, + { + "epoch": 4.52731737262124, + "grad_norm": 0.23915240168571472, + "learning_rate": 6.002188810705861e-05, + "loss": 1.8219, + "step": 14750 + }, + { + "epoch": 4.527624309392265, + "grad_norm": 0.2275150567293167, + "learning_rate": 6.0017018368053665e-05, + "loss": 1.7418, + "step": 14751 + }, + { + "epoch": 4.52793124616329, + "grad_norm": 0.2349669486284256, + "learning_rate": 6.001214853005467e-05, + "loss": 1.7814, + "step": 14752 + }, + { + "epoch": 4.528238182934316, + "grad_norm": 0.29985731840133667, + "learning_rate": 6.000727859310975e-05, + "loss": 1.7109, + "step": 14753 + }, + { + "epoch": 4.528545119705341, + "grad_norm": 0.27282044291496277, + "learning_rate": 6.0002408557267044e-05, + "loss": 1.7806, + "step": 14754 + }, + { + "epoch": 4.5288520564763655, + "grad_norm": 0.20906320214271545, + "learning_rate": 5.9997538422574675e-05, + "loss": 1.7221, + "step": 14755 + }, + { + "epoch": 4.529158993247391, + "grad_norm": 0.24553455412387848, + "learning_rate": 5.999266818908076e-05, + "loss": 1.793, + "step": 14756 + }, + { + "epoch": 4.529465930018416, + "grad_norm": 0.29730647802352905, + "learning_rate": 5.998779785683345e-05, + "loss": 1.7597, + "step": 14757 + }, + { + "epoch": 4.5297728667894415, + "grad_norm": 0.28297582268714905, + "learning_rate": 5.998292742588087e-05, + "loss": 1.7459, + "step": 14758 + }, + { + "epoch": 4.530079803560467, + "grad_norm": 0.21853844821453094, + "learning_rate": 5.997805689627115e-05, + "loss": 1.7234, + "step": 14759 + }, + { + "epoch": 4.530386740331492, + "grad_norm": 0.2997361421585083, + "learning_rate": 5.997318626805242e-05, + "loss": 1.7294, + "step": 14760 + }, + { + "epoch": 
4.530693677102517, + "grad_norm": 0.3298671543598175, + "learning_rate": 5.9968315541272804e-05, + "loss": 1.7837, + "step": 14761 + }, + { + "epoch": 4.531000613873542, + "grad_norm": 0.22812490165233612, + "learning_rate": 5.996344471598047e-05, + "loss": 1.7509, + "step": 14762 + }, + { + "epoch": 4.531307550644567, + "grad_norm": 0.3179669678211212, + "learning_rate": 5.995857379222354e-05, + "loss": 1.8354, + "step": 14763 + }, + { + "epoch": 4.531614487415593, + "grad_norm": 0.3072827458381653, + "learning_rate": 5.9953702770050135e-05, + "loss": 1.8051, + "step": 14764 + }, + { + "epoch": 4.531921424186618, + "grad_norm": 0.19386722147464752, + "learning_rate": 5.994883164950841e-05, + "loss": 1.7093, + "step": 14765 + }, + { + "epoch": 4.532228360957642, + "grad_norm": 0.2380950152873993, + "learning_rate": 5.99439604306465e-05, + "loss": 1.7547, + "step": 14766 + }, + { + "epoch": 4.532535297728668, + "grad_norm": 0.32604947686195374, + "learning_rate": 5.993908911351254e-05, + "loss": 1.8708, + "step": 14767 + }, + { + "epoch": 4.532842234499693, + "grad_norm": 0.2436954528093338, + "learning_rate": 5.993421769815468e-05, + "loss": 1.7272, + "step": 14768 + }, + { + "epoch": 4.533149171270718, + "grad_norm": 0.2470337301492691, + "learning_rate": 5.992934618462105e-05, + "loss": 1.7242, + "step": 14769 + }, + { + "epoch": 4.533456108041744, + "grad_norm": 0.25720325112342834, + "learning_rate": 5.992447457295981e-05, + "loss": 1.7219, + "step": 14770 + }, + { + "epoch": 4.533763044812769, + "grad_norm": 0.2518918812274933, + "learning_rate": 5.991960286321909e-05, + "loss": 1.7916, + "step": 14771 + }, + { + "epoch": 4.5340699815837935, + "grad_norm": 0.2561487853527069, + "learning_rate": 5.9914731055447037e-05, + "loss": 1.7695, + "step": 14772 + }, + { + "epoch": 4.534376918354819, + "grad_norm": 0.25361356139183044, + "learning_rate": 5.9909859149691804e-05, + "loss": 1.7464, + "step": 14773 + }, + { + "epoch": 4.534683855125844, + "grad_norm": 
0.22827522456645966, + "learning_rate": 5.9904987146001545e-05, + "loss": 1.7288, + "step": 14774 + }, + { + "epoch": 4.5349907918968695, + "grad_norm": 0.2417261302471161, + "learning_rate": 5.9900115044424385e-05, + "loss": 1.7311, + "step": 14775 + }, + { + "epoch": 4.535297728667894, + "grad_norm": 0.20756755769252777, + "learning_rate": 5.9895242845008495e-05, + "loss": 1.7799, + "step": 14776 + }, + { + "epoch": 4.535604665438919, + "grad_norm": 0.21999207139015198, + "learning_rate": 5.989037054780201e-05, + "loss": 1.7782, + "step": 14777 + }, + { + "epoch": 4.535911602209945, + "grad_norm": 0.22863444685935974, + "learning_rate": 5.988549815285308e-05, + "loss": 1.7869, + "step": 14778 + }, + { + "epoch": 4.53621853898097, + "grad_norm": 0.23033374547958374, + "learning_rate": 5.988062566020987e-05, + "loss": 1.7328, + "step": 14779 + }, + { + "epoch": 4.536525475751995, + "grad_norm": 0.21903404593467712, + "learning_rate": 5.987575306992053e-05, + "loss": 1.7689, + "step": 14780 + }, + { + "epoch": 4.536832412523021, + "grad_norm": 0.2433948963880539, + "learning_rate": 5.98708803820332e-05, + "loss": 1.7647, + "step": 14781 + }, + { + "epoch": 4.537139349294045, + "grad_norm": 0.2564239799976349, + "learning_rate": 5.986600759659606e-05, + "loss": 1.7958, + "step": 14782 + }, + { + "epoch": 4.53744628606507, + "grad_norm": 0.24009190499782562, + "learning_rate": 5.9861134713657244e-05, + "loss": 1.7511, + "step": 14783 + }, + { + "epoch": 4.537753222836096, + "grad_norm": 0.2578975558280945, + "learning_rate": 5.985626173326491e-05, + "loss": 1.8285, + "step": 14784 + }, + { + "epoch": 4.538060159607121, + "grad_norm": 0.24334335327148438, + "learning_rate": 5.9851388655467225e-05, + "loss": 1.7391, + "step": 14785 + }, + { + "epoch": 4.538367096378146, + "grad_norm": 0.26446983218193054, + "learning_rate": 5.9846515480312335e-05, + "loss": 1.8232, + "step": 14786 + }, + { + "epoch": 4.538674033149171, + "grad_norm": 0.3125670850276947, + 
"learning_rate": 5.9841642207848415e-05, + "loss": 1.7202, + "step": 14787 + }, + { + "epoch": 4.538980969920196, + "grad_norm": 0.2524511218070984, + "learning_rate": 5.983676883812361e-05, + "loss": 1.7653, + "step": 14788 + }, + { + "epoch": 4.5392879066912215, + "grad_norm": 0.3693946897983551, + "learning_rate": 5.98318953711861e-05, + "loss": 1.7457, + "step": 14789 + }, + { + "epoch": 4.539594843462247, + "grad_norm": 0.32625386118888855, + "learning_rate": 5.9827021807084026e-05, + "loss": 1.784, + "step": 14790 + }, + { + "epoch": 4.539901780233272, + "grad_norm": 0.24243168532848358, + "learning_rate": 5.9822148145865574e-05, + "loss": 1.7651, + "step": 14791 + }, + { + "epoch": 4.5402087170042975, + "grad_norm": 0.2950129210948944, + "learning_rate": 5.9817274387578895e-05, + "loss": 1.7316, + "step": 14792 + }, + { + "epoch": 4.540515653775322, + "grad_norm": 0.29455235600471497, + "learning_rate": 5.981240053227216e-05, + "loss": 1.7504, + "step": 14793 + }, + { + "epoch": 4.540822590546347, + "grad_norm": 0.23161925375461578, + "learning_rate": 5.980752657999352e-05, + "loss": 1.7663, + "step": 14794 + }, + { + "epoch": 4.541129527317373, + "grad_norm": 0.2725144922733307, + "learning_rate": 5.980265253079116e-05, + "loss": 1.765, + "step": 14795 + }, + { + "epoch": 4.541436464088398, + "grad_norm": 0.30911222100257874, + "learning_rate": 5.979777838471324e-05, + "loss": 1.7888, + "step": 14796 + }, + { + "epoch": 4.541743400859423, + "grad_norm": 0.2818063497543335, + "learning_rate": 5.979290414180794e-05, + "loss": 1.8047, + "step": 14797 + }, + { + "epoch": 4.542050337630448, + "grad_norm": 0.23335030674934387, + "learning_rate": 5.978802980212341e-05, + "loss": 1.8205, + "step": 14798 + }, + { + "epoch": 4.542357274401473, + "grad_norm": 0.24228201806545258, + "learning_rate": 5.9783155365707855e-05, + "loss": 1.7774, + "step": 14799 + }, + { + "epoch": 4.542664211172498, + "grad_norm": 0.2410847544670105, + "learning_rate": 5.97782808326094e-05, 
+ "loss": 1.6959, + "step": 14800 + }, + { + "epoch": 4.542971147943524, + "grad_norm": 0.24812567234039307, + "learning_rate": 5.9773406202876245e-05, + "loss": 1.8158, + "step": 14801 + }, + { + "epoch": 4.543278084714549, + "grad_norm": 0.2606147229671478, + "learning_rate": 5.9768531476556566e-05, + "loss": 1.7478, + "step": 14802 + }, + { + "epoch": 4.543585021485574, + "grad_norm": 0.24853013455867767, + "learning_rate": 5.976365665369854e-05, + "loss": 1.8158, + "step": 14803 + }, + { + "epoch": 4.543891958256599, + "grad_norm": 0.2320917695760727, + "learning_rate": 5.9758781734350334e-05, + "loss": 1.7812, + "step": 14804 + }, + { + "epoch": 4.544198895027624, + "grad_norm": 0.3460223376750946, + "learning_rate": 5.9753906718560127e-05, + "loss": 1.7562, + "step": 14805 + }, + { + "epoch": 4.5445058317986495, + "grad_norm": 0.2941136658191681, + "learning_rate": 5.9749031606376086e-05, + "loss": 1.7562, + "step": 14806 + }, + { + "epoch": 4.544812768569675, + "grad_norm": 0.2371312975883484, + "learning_rate": 5.9744156397846404e-05, + "loss": 1.7793, + "step": 14807 + }, + { + "epoch": 4.5451197053407, + "grad_norm": 0.2885094881057739, + "learning_rate": 5.973928109301926e-05, + "loss": 1.7564, + "step": 14808 + }, + { + "epoch": 4.545426642111725, + "grad_norm": 0.2369023859500885, + "learning_rate": 5.973440569194284e-05, + "loss": 1.7862, + "step": 14809 + }, + { + "epoch": 4.54573357888275, + "grad_norm": 0.26628994941711426, + "learning_rate": 5.972953019466531e-05, + "loss": 1.7828, + "step": 14810 + }, + { + "epoch": 4.546040515653775, + "grad_norm": 0.3091031610965729, + "learning_rate": 5.9724654601234864e-05, + "loss": 1.7623, + "step": 14811 + }, + { + "epoch": 4.546347452424801, + "grad_norm": 0.24652205407619476, + "learning_rate": 5.971977891169966e-05, + "loss": 1.6982, + "step": 14812 + }, + { + "epoch": 4.546654389195826, + "grad_norm": 0.21779046952724457, + "learning_rate": 5.971490312610793e-05, + "loss": 1.7363, + "step": 14813 + }, 
+ { + "epoch": 4.546961325966851, + "grad_norm": 0.24130751192569733, + "learning_rate": 5.971002724450783e-05, + "loss": 1.7014, + "step": 14814 + }, + { + "epoch": 4.547268262737876, + "grad_norm": 0.21868734061717987, + "learning_rate": 5.9705151266947534e-05, + "loss": 1.7872, + "step": 14815 + }, + { + "epoch": 4.547575199508901, + "grad_norm": 0.257376492023468, + "learning_rate": 5.9700275193475275e-05, + "loss": 1.75, + "step": 14816 + }, + { + "epoch": 4.547882136279926, + "grad_norm": 0.3182791769504547, + "learning_rate": 5.9695399024139174e-05, + "loss": 1.7965, + "step": 14817 + }, + { + "epoch": 4.548189073050952, + "grad_norm": 0.25553280115127563, + "learning_rate": 5.969052275898748e-05, + "loss": 1.8394, + "step": 14818 + }, + { + "epoch": 4.548496009821976, + "grad_norm": 0.2810833752155304, + "learning_rate": 5.9685646398068354e-05, + "loss": 1.704, + "step": 14819 + }, + { + "epoch": 4.5488029465930016, + "grad_norm": 0.21320512890815735, + "learning_rate": 5.9680769941429993e-05, + "loss": 1.7248, + "step": 14820 + }, + { + "epoch": 4.549109883364027, + "grad_norm": 0.3159593939781189, + "learning_rate": 5.96758933891206e-05, + "loss": 1.7885, + "step": 14821 + }, + { + "epoch": 4.549416820135052, + "grad_norm": 0.21894599497318268, + "learning_rate": 5.967101674118834e-05, + "loss": 1.7388, + "step": 14822 + }, + { + "epoch": 4.5497237569060776, + "grad_norm": 0.24804852902889252, + "learning_rate": 5.9666139997681424e-05, + "loss": 1.7631, + "step": 14823 + }, + { + "epoch": 4.550030693677103, + "grad_norm": 0.2678423523902893, + "learning_rate": 5.966126315864806e-05, + "loss": 1.7631, + "step": 14824 + }, + { + "epoch": 4.550337630448127, + "grad_norm": 0.229649156332016, + "learning_rate": 5.9656386224136426e-05, + "loss": 1.7292, + "step": 14825 + }, + { + "epoch": 4.550644567219153, + "grad_norm": 0.25248458981513977, + "learning_rate": 5.965150919419473e-05, + "loss": 1.8, + "step": 14826 + }, + { + "epoch": 4.550951503990178, + 
"grad_norm": 0.2583169937133789, + "learning_rate": 5.964663206887116e-05, + "loss": 1.7641, + "step": 14827 + }, + { + "epoch": 4.551258440761203, + "grad_norm": 0.21465209126472473, + "learning_rate": 5.964175484821392e-05, + "loss": 1.7475, + "step": 14828 + }, + { + "epoch": 4.551565377532229, + "grad_norm": 0.28028783202171326, + "learning_rate": 5.963687753227118e-05, + "loss": 1.7649, + "step": 14829 + }, + { + "epoch": 4.551872314303253, + "grad_norm": 0.30248284339904785, + "learning_rate": 5.9632000121091194e-05, + "loss": 1.6969, + "step": 14830 + }, + { + "epoch": 4.5521792510742785, + "grad_norm": 0.24335962533950806, + "learning_rate": 5.962712261472213e-05, + "loss": 1.7295, + "step": 14831 + }, + { + "epoch": 4.552486187845304, + "grad_norm": 0.21014504134655, + "learning_rate": 5.9622245013212206e-05, + "loss": 1.7508, + "step": 14832 + }, + { + "epoch": 4.552793124616329, + "grad_norm": 0.24892041087150574, + "learning_rate": 5.961736731660963e-05, + "loss": 1.7317, + "step": 14833 + }, + { + "epoch": 4.5531000613873545, + "grad_norm": 0.2159881740808487, + "learning_rate": 5.9612489524962556e-05, + "loss": 1.7114, + "step": 14834 + }, + { + "epoch": 4.55340699815838, + "grad_norm": 0.2952292263507843, + "learning_rate": 5.960761163831925e-05, + "loss": 1.8226, + "step": 14835 + }, + { + "epoch": 4.553713934929404, + "grad_norm": 0.3019000291824341, + "learning_rate": 5.9602733656727895e-05, + "loss": 1.7391, + "step": 14836 + }, + { + "epoch": 4.55402087170043, + "grad_norm": 0.2273966521024704, + "learning_rate": 5.9597855580236696e-05, + "loss": 1.7718, + "step": 14837 + }, + { + "epoch": 4.554327808471455, + "grad_norm": 0.2462005764245987, + "learning_rate": 5.959297740889386e-05, + "loss": 1.8428, + "step": 14838 + }, + { + "epoch": 4.55463474524248, + "grad_norm": 0.2773323059082031, + "learning_rate": 5.95880991427476e-05, + "loss": 1.6878, + "step": 14839 + }, + { + "epoch": 4.554941682013506, + "grad_norm": 0.26519861817359924, + 
"learning_rate": 5.958322078184611e-05, + "loss": 1.737, + "step": 14840 + }, + { + "epoch": 4.55524861878453, + "grad_norm": 0.20157647132873535, + "learning_rate": 5.9578342326237626e-05, + "loss": 1.7164, + "step": 14841 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.21715669333934784, + "learning_rate": 5.957346377597035e-05, + "loss": 1.705, + "step": 14842 + }, + { + "epoch": 4.555862492326581, + "grad_norm": 0.3056442439556122, + "learning_rate": 5.95685851310925e-05, + "loss": 1.7672, + "step": 14843 + }, + { + "epoch": 4.556169429097606, + "grad_norm": 0.24832262098789215, + "learning_rate": 5.956370639165228e-05, + "loss": 1.7305, + "step": 14844 + }, + { + "epoch": 4.556476365868631, + "grad_norm": 0.25814661383628845, + "learning_rate": 5.955882755769791e-05, + "loss": 1.7562, + "step": 14845 + }, + { + "epoch": 4.556783302639657, + "grad_norm": 0.38242629170417786, + "learning_rate": 5.95539486292776e-05, + "loss": 1.7077, + "step": 14846 + }, + { + "epoch": 4.557090239410681, + "grad_norm": 0.2901807427406311, + "learning_rate": 5.954906960643956e-05, + "loss": 1.7233, + "step": 14847 + }, + { + "epoch": 4.5573971761817065, + "grad_norm": 0.22636106610298157, + "learning_rate": 5.954419048923202e-05, + "loss": 1.777, + "step": 14848 + }, + { + "epoch": 4.557704112952732, + "grad_norm": 0.32392850518226624, + "learning_rate": 5.953931127770321e-05, + "loss": 1.7477, + "step": 14849 + }, + { + "epoch": 4.558011049723757, + "grad_norm": 0.3403460681438446, + "learning_rate": 5.953443197190134e-05, + "loss": 1.7712, + "step": 14850 + }, + { + "epoch": 4.558317986494782, + "grad_norm": 0.22923234105110168, + "learning_rate": 5.95295525718746e-05, + "loss": 1.8154, + "step": 14851 + }, + { + "epoch": 4.558624923265807, + "grad_norm": 0.25152841210365295, + "learning_rate": 5.952467307767124e-05, + "loss": 1.7091, + "step": 14852 + }, + { + "epoch": 4.558931860036832, + "grad_norm": 0.27743563055992126, + "learning_rate": 5.951979348933949e-05, + 
"loss": 1.7621, + "step": 14853 + }, + { + "epoch": 4.559238796807858, + "grad_norm": 0.25809308886528015, + "learning_rate": 5.951491380692756e-05, + "loss": 1.7669, + "step": 14854 + }, + { + "epoch": 4.559545733578883, + "grad_norm": 0.24863946437835693, + "learning_rate": 5.9510034030483676e-05, + "loss": 1.7354, + "step": 14855 + }, + { + "epoch": 4.559852670349908, + "grad_norm": 0.2896040380001068, + "learning_rate": 5.9505154160056066e-05, + "loss": 1.7878, + "step": 14856 + }, + { + "epoch": 4.560159607120933, + "grad_norm": 0.23814482986927032, + "learning_rate": 5.950027419569294e-05, + "loss": 1.7781, + "step": 14857 + }, + { + "epoch": 4.560466543891958, + "grad_norm": 0.2531175911426544, + "learning_rate": 5.949539413744253e-05, + "loss": 1.762, + "step": 14858 + }, + { + "epoch": 4.560773480662983, + "grad_norm": 0.2541767656803131, + "learning_rate": 5.949051398535308e-05, + "loss": 1.7722, + "step": 14859 + }, + { + "epoch": 4.561080417434009, + "grad_norm": 0.25216221809387207, + "learning_rate": 5.948563373947281e-05, + "loss": 1.754, + "step": 14860 + }, + { + "epoch": 4.561387354205034, + "grad_norm": 0.24421775341033936, + "learning_rate": 5.948075339984994e-05, + "loss": 1.7976, + "step": 14861 + }, + { + "epoch": 4.5616942909760585, + "grad_norm": 0.24435418844223022, + "learning_rate": 5.947587296653272e-05, + "loss": 1.79, + "step": 14862 + }, + { + "epoch": 4.562001227747084, + "grad_norm": 0.24471627175807953, + "learning_rate": 5.947099243956936e-05, + "loss": 1.755, + "step": 14863 + }, + { + "epoch": 4.562308164518109, + "grad_norm": 0.2762158215045929, + "learning_rate": 5.9466111819008096e-05, + "loss": 1.7695, + "step": 14864 + }, + { + "epoch": 4.5626151012891345, + "grad_norm": 0.23841319978237152, + "learning_rate": 5.9461231104897174e-05, + "loss": 1.7302, + "step": 14865 + }, + { + "epoch": 4.56292203806016, + "grad_norm": 0.260231077671051, + "learning_rate": 5.9456350297284826e-05, + "loss": 1.7917, + "step": 14866 + }, + { 
+ "epoch": 4.563228974831185, + "grad_norm": 0.2752247452735901, + "learning_rate": 5.945146939621929e-05, + "loss": 1.7953, + "step": 14867 + }, + { + "epoch": 4.56353591160221, + "grad_norm": 0.28760650753974915, + "learning_rate": 5.944658840174878e-05, + "loss": 1.8582, + "step": 14868 + }, + { + "epoch": 4.563842848373235, + "grad_norm": 0.24311676621437073, + "learning_rate": 5.944170731392153e-05, + "loss": 1.8006, + "step": 14869 + }, + { + "epoch": 4.56414978514426, + "grad_norm": 0.2692974805831909, + "learning_rate": 5.943682613278583e-05, + "loss": 1.6984, + "step": 14870 + }, + { + "epoch": 4.564456721915286, + "grad_norm": 0.2784348726272583, + "learning_rate": 5.943194485838985e-05, + "loss": 1.8082, + "step": 14871 + }, + { + "epoch": 4.564763658686311, + "grad_norm": 0.2557264268398285, + "learning_rate": 5.9427063490781885e-05, + "loss": 1.7715, + "step": 14872 + }, + { + "epoch": 4.565070595457335, + "grad_norm": 0.3738742470741272, + "learning_rate": 5.942218203001015e-05, + "loss": 1.7549, + "step": 14873 + }, + { + "epoch": 4.565377532228361, + "grad_norm": 0.2424495816230774, + "learning_rate": 5.941730047612288e-05, + "loss": 1.7388, + "step": 14874 + }, + { + "epoch": 4.565684468999386, + "grad_norm": 0.27020737528800964, + "learning_rate": 5.941241882916833e-05, + "loss": 1.752, + "step": 14875 + }, + { + "epoch": 4.565991405770411, + "grad_norm": 0.3763764798641205, + "learning_rate": 5.940753708919474e-05, + "loss": 1.7918, + "step": 14876 + }, + { + "epoch": 4.566298342541437, + "grad_norm": 0.26782163977622986, + "learning_rate": 5.940265525625036e-05, + "loss": 1.7244, + "step": 14877 + }, + { + "epoch": 4.566605279312462, + "grad_norm": 0.24978911876678467, + "learning_rate": 5.9397773330383434e-05, + "loss": 1.7706, + "step": 14878 + }, + { + "epoch": 4.5669122160834865, + "grad_norm": 0.32905304431915283, + "learning_rate": 5.93928913116422e-05, + "loss": 1.7381, + "step": 14879 + }, + { + "epoch": 4.567219152854512, + "grad_norm": 
0.2196444720029831, + "learning_rate": 5.93880092000749e-05, + "loss": 1.7605, + "step": 14880 + }, + { + "epoch": 4.567526089625537, + "grad_norm": 0.3156622350215912, + "learning_rate": 5.9383126995729786e-05, + "loss": 1.9181, + "step": 14881 + }, + { + "epoch": 4.5678330263965625, + "grad_norm": 0.2895203232765198, + "learning_rate": 5.937824469865513e-05, + "loss": 1.7967, + "step": 14882 + }, + { + "epoch": 4.568139963167588, + "grad_norm": 0.24854810535907745, + "learning_rate": 5.937336230889916e-05, + "loss": 1.7332, + "step": 14883 + }, + { + "epoch": 4.568446899938612, + "grad_norm": 0.3417081832885742, + "learning_rate": 5.936847982651013e-05, + "loss": 1.7525, + "step": 14884 + }, + { + "epoch": 4.568753836709638, + "grad_norm": 0.2874949276447296, + "learning_rate": 5.936359725153629e-05, + "loss": 1.7659, + "step": 14885 + }, + { + "epoch": 4.569060773480663, + "grad_norm": 0.25031307339668274, + "learning_rate": 5.935871458402588e-05, + "loss": 1.8061, + "step": 14886 + }, + { + "epoch": 4.569367710251688, + "grad_norm": 0.27047309279441833, + "learning_rate": 5.935383182402717e-05, + "loss": 1.7318, + "step": 14887 + }, + { + "epoch": 4.569674647022714, + "grad_norm": 0.2642819881439209, + "learning_rate": 5.9348948971588425e-05, + "loss": 1.849, + "step": 14888 + }, + { + "epoch": 4.569981583793739, + "grad_norm": 0.2452307790517807, + "learning_rate": 5.9344066026757886e-05, + "loss": 1.7491, + "step": 14889 + }, + { + "epoch": 4.570288520564763, + "grad_norm": 0.24055036902427673, + "learning_rate": 5.9339182989583795e-05, + "loss": 1.7573, + "step": 14890 + }, + { + "epoch": 4.570595457335789, + "grad_norm": 0.23036183416843414, + "learning_rate": 5.933429986011444e-05, + "loss": 1.7841, + "step": 14891 + }, + { + "epoch": 4.570902394106814, + "grad_norm": 0.27987608313560486, + "learning_rate": 5.932941663839805e-05, + "loss": 1.7835, + "step": 14892 + }, + { + "epoch": 4.571209330877839, + "grad_norm": 0.31747013330459595, + "learning_rate": 
5.93245333244829e-05, + "loss": 1.7905, + "step": 14893 + }, + { + "epoch": 4.571516267648864, + "grad_norm": 0.24841344356536865, + "learning_rate": 5.931964991841725e-05, + "loss": 1.8003, + "step": 14894 + }, + { + "epoch": 4.571823204419889, + "grad_norm": 0.2416950911283493, + "learning_rate": 5.9314766420249356e-05, + "loss": 1.7787, + "step": 14895 + }, + { + "epoch": 4.5721301411909145, + "grad_norm": 0.2322494238615036, + "learning_rate": 5.930988283002748e-05, + "loss": 1.8153, + "step": 14896 + }, + { + "epoch": 4.57243707796194, + "grad_norm": 0.22629016637802124, + "learning_rate": 5.930499914779989e-05, + "loss": 1.6743, + "step": 14897 + }, + { + "epoch": 4.572744014732965, + "grad_norm": 0.21481508016586304, + "learning_rate": 5.930011537361483e-05, + "loss": 1.7301, + "step": 14898 + }, + { + "epoch": 4.5730509515039905, + "grad_norm": 0.1993340700864792, + "learning_rate": 5.9295231507520586e-05, + "loss": 1.6796, + "step": 14899 + }, + { + "epoch": 4.573357888275015, + "grad_norm": 0.21681822836399078, + "learning_rate": 5.929034754956543e-05, + "loss": 1.7333, + "step": 14900 + }, + { + "epoch": 4.57366482504604, + "grad_norm": 0.23105305433273315, + "learning_rate": 5.928546349979761e-05, + "loss": 1.8207, + "step": 14901 + }, + { + "epoch": 4.573971761817066, + "grad_norm": 0.24656468629837036, + "learning_rate": 5.9280579358265384e-05, + "loss": 1.7805, + "step": 14902 + }, + { + "epoch": 4.574278698588091, + "grad_norm": 0.28564780950546265, + "learning_rate": 5.927569512501704e-05, + "loss": 1.7224, + "step": 14903 + }, + { + "epoch": 4.574585635359116, + "grad_norm": 0.26030251383781433, + "learning_rate": 5.927081080010084e-05, + "loss": 1.7417, + "step": 14904 + }, + { + "epoch": 4.574892572130141, + "grad_norm": 0.21427087485790253, + "learning_rate": 5.926592638356505e-05, + "loss": 1.7239, + "step": 14905 + }, + { + "epoch": 4.575199508901166, + "grad_norm": 0.2351662665605545, + "learning_rate": 5.9261041875457956e-05, + "loss": 
1.7711, + "step": 14906 + }, + { + "epoch": 4.5755064456721914, + "grad_norm": 0.27335020899772644, + "learning_rate": 5.925615727582781e-05, + "loss": 1.7496, + "step": 14907 + }, + { + "epoch": 4.575813382443217, + "grad_norm": 0.27849945425987244, + "learning_rate": 5.925127258472289e-05, + "loss": 1.7576, + "step": 14908 + }, + { + "epoch": 4.576120319214242, + "grad_norm": 0.27859339118003845, + "learning_rate": 5.924638780219147e-05, + "loss": 1.8076, + "step": 14909 + }, + { + "epoch": 4.5764272559852675, + "grad_norm": 0.24664369225502014, + "learning_rate": 5.9241502928281836e-05, + "loss": 1.7657, + "step": 14910 + }, + { + "epoch": 4.576734192756292, + "grad_norm": 0.29881149530410767, + "learning_rate": 5.923661796304224e-05, + "loss": 1.7611, + "step": 14911 + }, + { + "epoch": 4.577041129527317, + "grad_norm": 0.2672356367111206, + "learning_rate": 5.9231732906520984e-05, + "loss": 1.7605, + "step": 14912 + }, + { + "epoch": 4.577348066298343, + "grad_norm": 0.24282832443714142, + "learning_rate": 5.9226847758766336e-05, + "loss": 1.7037, + "step": 14913 + }, + { + "epoch": 4.577655003069368, + "grad_norm": 0.3822915852069855, + "learning_rate": 5.922196251982656e-05, + "loss": 1.7609, + "step": 14914 + }, + { + "epoch": 4.577961939840393, + "grad_norm": 0.30721214413642883, + "learning_rate": 5.921707718974994e-05, + "loss": 1.7398, + "step": 14915 + }, + { + "epoch": 4.578268876611418, + "grad_norm": 0.235477477312088, + "learning_rate": 5.921219176858477e-05, + "loss": 1.6869, + "step": 14916 + }, + { + "epoch": 4.578575813382443, + "grad_norm": 0.3752216100692749, + "learning_rate": 5.920730625637934e-05, + "loss": 1.7296, + "step": 14917 + }, + { + "epoch": 4.578882750153468, + "grad_norm": 0.36901310086250305, + "learning_rate": 5.920242065318189e-05, + "loss": 1.7405, + "step": 14918 + }, + { + "epoch": 4.579189686924494, + "grad_norm": 0.2308608740568161, + "learning_rate": 5.9197534959040725e-05, + "loss": 1.7953, + "step": 14919 + }, + { + 
"epoch": 4.579496623695519, + "grad_norm": 0.3286738991737366, + "learning_rate": 5.919264917400412e-05, + "loss": 1.7669, + "step": 14920 + }, + { + "epoch": 4.579803560466544, + "grad_norm": 0.3944021165370941, + "learning_rate": 5.918776329812039e-05, + "loss": 1.7165, + "step": 14921 + }, + { + "epoch": 4.580110497237569, + "grad_norm": 0.22054845094680786, + "learning_rate": 5.9182877331437795e-05, + "loss": 1.7739, + "step": 14922 + }, + { + "epoch": 4.580417434008594, + "grad_norm": 0.3467540740966797, + "learning_rate": 5.9177991274004605e-05, + "loss": 1.7713, + "step": 14923 + }, + { + "epoch": 4.5807243707796195, + "grad_norm": 0.4313695728778839, + "learning_rate": 5.917310512586914e-05, + "loss": 1.7654, + "step": 14924 + }, + { + "epoch": 4.581031307550645, + "grad_norm": 0.2723502814769745, + "learning_rate": 5.9168218887079685e-05, + "loss": 1.7314, + "step": 14925 + }, + { + "epoch": 4.581338244321669, + "grad_norm": 0.2641250789165497, + "learning_rate": 5.9163332557684504e-05, + "loss": 1.7303, + "step": 14926 + }, + { + "epoch": 4.581645181092695, + "grad_norm": 0.3780760169029236, + "learning_rate": 5.915844613773189e-05, + "loss": 1.7748, + "step": 14927 + }, + { + "epoch": 4.58195211786372, + "grad_norm": 0.23379632830619812, + "learning_rate": 5.915355962727015e-05, + "loss": 1.7482, + "step": 14928 + }, + { + "epoch": 4.582259054634745, + "grad_norm": 0.35227084159851074, + "learning_rate": 5.914867302634758e-05, + "loss": 1.8198, + "step": 14929 + }, + { + "epoch": 4.582565991405771, + "grad_norm": 0.34348124265670776, + "learning_rate": 5.914378633501245e-05, + "loss": 1.8364, + "step": 14930 + }, + { + "epoch": 4.582872928176796, + "grad_norm": 0.2446804940700531, + "learning_rate": 5.9138899553313066e-05, + "loss": 1.7779, + "step": 14931 + }, + { + "epoch": 4.58317986494782, + "grad_norm": 0.23893557488918304, + "learning_rate": 5.913401268129772e-05, + "loss": 1.7582, + "step": 14932 + }, + { + "epoch": 4.583486801718846, + 
"grad_norm": 0.3046814203262329, + "learning_rate": 5.912912571901471e-05, + "loss": 1.6871, + "step": 14933 + }, + { + "epoch": 4.583793738489871, + "grad_norm": 0.2232733964920044, + "learning_rate": 5.912423866651233e-05, + "loss": 1.7269, + "step": 14934 + }, + { + "epoch": 4.584100675260896, + "grad_norm": 0.18664126098155975, + "learning_rate": 5.911935152383888e-05, + "loss": 1.7155, + "step": 14935 + }, + { + "epoch": 4.584407612031922, + "grad_norm": 0.2573263347148895, + "learning_rate": 5.911446429104265e-05, + "loss": 1.7901, + "step": 14936 + }, + { + "epoch": 4.584714548802946, + "grad_norm": 0.2382393181324005, + "learning_rate": 5.910957696817194e-05, + "loss": 1.7407, + "step": 14937 + }, + { + "epoch": 4.5850214855739715, + "grad_norm": 0.28363972902297974, + "learning_rate": 5.910468955527504e-05, + "loss": 1.7971, + "step": 14938 + }, + { + "epoch": 4.585328422344997, + "grad_norm": 0.3173120617866516, + "learning_rate": 5.909980205240027e-05, + "loss": 1.744, + "step": 14939 + }, + { + "epoch": 4.585635359116022, + "grad_norm": 0.2281302511692047, + "learning_rate": 5.909491445959592e-05, + "loss": 1.6976, + "step": 14940 + }, + { + "epoch": 4.5859422958870475, + "grad_norm": 0.24962912499904633, + "learning_rate": 5.9090026776910304e-05, + "loss": 1.7979, + "step": 14941 + }, + { + "epoch": 4.586249232658073, + "grad_norm": 0.22330854833126068, + "learning_rate": 5.908513900439171e-05, + "loss": 1.7854, + "step": 14942 + }, + { + "epoch": 4.586556169429097, + "grad_norm": 0.20861582458019257, + "learning_rate": 5.908025114208845e-05, + "loss": 1.7133, + "step": 14943 + }, + { + "epoch": 4.586863106200123, + "grad_norm": 0.21838510036468506, + "learning_rate": 5.90753631900488e-05, + "loss": 1.6919, + "step": 14944 + }, + { + "epoch": 4.587170042971148, + "grad_norm": 0.252798467874527, + "learning_rate": 5.907047514832112e-05, + "loss": 1.838, + "step": 14945 + }, + { + "epoch": 4.587476979742173, + "grad_norm": 0.326893150806427, + 
"learning_rate": 5.906558701695369e-05, + "loss": 1.7303, + "step": 14946 + }, + { + "epoch": 4.587783916513199, + "grad_norm": 0.36489585041999817, + "learning_rate": 5.9060698795994804e-05, + "loss": 1.7631, + "step": 14947 + }, + { + "epoch": 4.588090853284223, + "grad_norm": 0.27491649985313416, + "learning_rate": 5.905581048549279e-05, + "loss": 1.7773, + "step": 14948 + }, + { + "epoch": 4.588397790055248, + "grad_norm": 0.2334890067577362, + "learning_rate": 5.905092208549595e-05, + "loss": 1.7254, + "step": 14949 + }, + { + "epoch": 4.588704726826274, + "grad_norm": 0.24383895099163055, + "learning_rate": 5.904603359605257e-05, + "loss": 1.7496, + "step": 14950 + }, + { + "epoch": 4.589011663597299, + "grad_norm": 0.2144637256860733, + "learning_rate": 5.904114501721102e-05, + "loss": 1.7028, + "step": 14951 + }, + { + "epoch": 4.589318600368324, + "grad_norm": 0.19675977528095245, + "learning_rate": 5.9036256349019555e-05, + "loss": 1.7548, + "step": 14952 + }, + { + "epoch": 4.58962553713935, + "grad_norm": 0.23712843656539917, + "learning_rate": 5.903136759152652e-05, + "loss": 1.7722, + "step": 14953 + }, + { + "epoch": 4.589932473910374, + "grad_norm": 0.20307733118534088, + "learning_rate": 5.902647874478021e-05, + "loss": 1.7177, + "step": 14954 + }, + { + "epoch": 4.5902394106813995, + "grad_norm": 0.21767669916152954, + "learning_rate": 5.9021589808828936e-05, + "loss": 1.7963, + "step": 14955 + }, + { + "epoch": 4.590546347452425, + "grad_norm": 0.2056351602077484, + "learning_rate": 5.9016700783721036e-05, + "loss": 1.7439, + "step": 14956 + }, + { + "epoch": 4.59085328422345, + "grad_norm": 0.20480911433696747, + "learning_rate": 5.90118116695048e-05, + "loss": 1.7122, + "step": 14957 + }, + { + "epoch": 4.5911602209944755, + "grad_norm": 0.24091731011867523, + "learning_rate": 5.900692246622858e-05, + "loss": 1.7862, + "step": 14958 + }, + { + "epoch": 4.5914671577655, + "grad_norm": 0.20246434211730957, + "learning_rate": 
5.900203317394066e-05, + "loss": 1.6895, + "step": 14959 + }, + { + "epoch": 4.591774094536525, + "grad_norm": 0.23771630227565765, + "learning_rate": 5.899714379268938e-05, + "loss": 1.7794, + "step": 14960 + }, + { + "epoch": 4.592081031307551, + "grad_norm": 0.2638718783855438, + "learning_rate": 5.899225432252303e-05, + "loss": 1.8059, + "step": 14961 + }, + { + "epoch": 4.592387968078576, + "grad_norm": 0.24251408874988556, + "learning_rate": 5.898736476348997e-05, + "loss": 1.8063, + "step": 14962 + }, + { + "epoch": 4.592694904849601, + "grad_norm": 0.2487735152244568, + "learning_rate": 5.8982475115638515e-05, + "loss": 1.7615, + "step": 14963 + }, + { + "epoch": 4.593001841620627, + "grad_norm": 0.23507241904735565, + "learning_rate": 5.897758537901696e-05, + "loss": 1.7496, + "step": 14964 + }, + { + "epoch": 4.593308778391651, + "grad_norm": 0.22354768216609955, + "learning_rate": 5.897269555367365e-05, + "loss": 1.7293, + "step": 14965 + }, + { + "epoch": 4.593615715162676, + "grad_norm": 0.2711353003978729, + "learning_rate": 5.89678056396569e-05, + "loss": 1.8127, + "step": 14966 + }, + { + "epoch": 4.593922651933702, + "grad_norm": 0.30061110854148865, + "learning_rate": 5.8962915637015036e-05, + "loss": 1.7653, + "step": 14967 + }, + { + "epoch": 4.594229588704727, + "grad_norm": 0.24577318131923676, + "learning_rate": 5.895802554579639e-05, + "loss": 1.7888, + "step": 14968 + }, + { + "epoch": 4.5945365254757515, + "grad_norm": 0.25568944215774536, + "learning_rate": 5.895313536604929e-05, + "loss": 1.7912, + "step": 14969 + }, + { + "epoch": 4.594843462246777, + "grad_norm": 0.2710168957710266, + "learning_rate": 5.894824509782206e-05, + "loss": 1.7681, + "step": 14970 + }, + { + "epoch": 4.595150399017802, + "grad_norm": 0.24056777358055115, + "learning_rate": 5.894335474116303e-05, + "loss": 1.7729, + "step": 14971 + }, + { + "epoch": 4.5954573357888275, + "grad_norm": 0.21956710517406464, + "learning_rate": 5.89384642961205e-05, + "loss": 
1.7576, + "step": 14972 + }, + { + "epoch": 4.595764272559853, + "grad_norm": 0.27499106526374817, + "learning_rate": 5.893357376274284e-05, + "loss": 1.7909, + "step": 14973 + }, + { + "epoch": 4.596071209330878, + "grad_norm": 0.28581273555755615, + "learning_rate": 5.8928683141078376e-05, + "loss": 1.7592, + "step": 14974 + }, + { + "epoch": 4.596378146101903, + "grad_norm": 0.23218442499637604, + "learning_rate": 5.892379243117543e-05, + "loss": 1.7142, + "step": 14975 + }, + { + "epoch": 4.596685082872928, + "grad_norm": 0.34015771746635437, + "learning_rate": 5.891890163308234e-05, + "loss": 1.7457, + "step": 14976 + }, + { + "epoch": 4.596992019643953, + "grad_norm": 0.2630012333393097, + "learning_rate": 5.8914010746847435e-05, + "loss": 1.7612, + "step": 14977 + }, + { + "epoch": 4.597298956414979, + "grad_norm": 0.2265843003988266, + "learning_rate": 5.890911977251904e-05, + "loss": 1.7272, + "step": 14978 + }, + { + "epoch": 4.597605893186004, + "grad_norm": 0.22325244545936584, + "learning_rate": 5.8904228710145505e-05, + "loss": 1.7447, + "step": 14979 + }, + { + "epoch": 4.597912829957028, + "grad_norm": 0.23512716591358185, + "learning_rate": 5.889933755977517e-05, + "loss": 1.7123, + "step": 14980 + }, + { + "epoch": 4.598219766728054, + "grad_norm": 0.22534869611263275, + "learning_rate": 5.8894446321456365e-05, + "loss": 1.785, + "step": 14981 + }, + { + "epoch": 4.598526703499079, + "grad_norm": 0.2447836697101593, + "learning_rate": 5.888955499523743e-05, + "loss": 1.7154, + "step": 14982 + }, + { + "epoch": 4.598833640270104, + "grad_norm": 0.2451140582561493, + "learning_rate": 5.88846635811667e-05, + "loss": 1.7494, + "step": 14983 + }, + { + "epoch": 4.59914057704113, + "grad_norm": 0.2253585308790207, + "learning_rate": 5.8879772079292504e-05, + "loss": 1.7591, + "step": 14984 + }, + { + "epoch": 4.599447513812155, + "grad_norm": 0.21714572608470917, + "learning_rate": 5.887488048966322e-05, + "loss": 1.7314, + "step": 14985 + }, + { + 
"epoch": 4.5997544505831796, + "grad_norm": 0.24897411465644836, + "learning_rate": 5.8869988812327145e-05, + "loss": 1.776, + "step": 14986 + }, + { + "epoch": 4.600061387354205, + "grad_norm": 0.22575093805789948, + "learning_rate": 5.8865097047332653e-05, + "loss": 1.7168, + "step": 14987 + }, + { + "epoch": 4.60036832412523, + "grad_norm": 0.22857412695884705, + "learning_rate": 5.886020519472808e-05, + "loss": 1.8262, + "step": 14988 + }, + { + "epoch": 4.600675260896256, + "grad_norm": 0.22741298377513885, + "learning_rate": 5.885531325456174e-05, + "loss": 1.6732, + "step": 14989 + }, + { + "epoch": 4.600982197667281, + "grad_norm": 0.2229645550251007, + "learning_rate": 5.885042122688202e-05, + "loss": 1.7384, + "step": 14990 + }, + { + "epoch": 4.601289134438305, + "grad_norm": 0.22609494626522064, + "learning_rate": 5.884552911173726e-05, + "loss": 1.714, + "step": 14991 + }, + { + "epoch": 4.601596071209331, + "grad_norm": 0.2629149854183197, + "learning_rate": 5.884063690917578e-05, + "loss": 1.8133, + "step": 14992 + }, + { + "epoch": 4.601903007980356, + "grad_norm": 0.220725417137146, + "learning_rate": 5.883574461924597e-05, + "loss": 1.6898, + "step": 14993 + }, + { + "epoch": 4.602209944751381, + "grad_norm": 0.207612082362175, + "learning_rate": 5.8830852241996135e-05, + "loss": 1.7302, + "step": 14994 + }, + { + "epoch": 4.602516881522407, + "grad_norm": 0.22418084740638733, + "learning_rate": 5.8825959777474625e-05, + "loss": 1.763, + "step": 14995 + }, + { + "epoch": 4.602823818293432, + "grad_norm": 0.30606865882873535, + "learning_rate": 5.882106722572983e-05, + "loss": 1.7657, + "step": 14996 + }, + { + "epoch": 4.6031307550644565, + "grad_norm": 0.2947966456413269, + "learning_rate": 5.881617458681008e-05, + "loss": 1.7796, + "step": 14997 + }, + { + "epoch": 4.603437691835482, + "grad_norm": 0.23430216312408447, + "learning_rate": 5.881128186076372e-05, + "loss": 1.78, + "step": 14998 + }, + { + "epoch": 4.603744628606507, + "grad_norm": 
0.28081849217414856, + "learning_rate": 5.880638904763911e-05, + "loss": 1.6791, + "step": 14999 + }, + { + "epoch": 4.6040515653775325, + "grad_norm": 0.25459226965904236, + "learning_rate": 5.88014961474846e-05, + "loss": 1.8064, + "step": 15000 + }, + { + "epoch": 4.604358502148557, + "grad_norm": 0.2358713001012802, + "learning_rate": 5.879660316034854e-05, + "loss": 1.763, + "step": 15001 + }, + { + "epoch": 4.604665438919582, + "grad_norm": 0.32954758405685425, + "learning_rate": 5.879171008627931e-05, + "loss": 1.7462, + "step": 15002 + }, + { + "epoch": 4.604972375690608, + "grad_norm": 0.2588615417480469, + "learning_rate": 5.878681692532523e-05, + "loss": 1.7771, + "step": 15003 + }, + { + "epoch": 4.605279312461633, + "grad_norm": 0.21216195821762085, + "learning_rate": 5.878192367753468e-05, + "loss": 1.7128, + "step": 15004 + }, + { + "epoch": 4.605586249232658, + "grad_norm": 0.26849040389060974, + "learning_rate": 5.8777030342956016e-05, + "loss": 1.7048, + "step": 15005 + }, + { + "epoch": 4.605893186003684, + "grad_norm": 0.22343295812606812, + "learning_rate": 5.877213692163759e-05, + "loss": 1.7695, + "step": 15006 + }, + { + "epoch": 4.606200122774708, + "grad_norm": 0.2794288694858551, + "learning_rate": 5.876724341362776e-05, + "loss": 1.7856, + "step": 15007 + }, + { + "epoch": 4.606507059545733, + "grad_norm": 0.3525427579879761, + "learning_rate": 5.8762349818974905e-05, + "loss": 1.7807, + "step": 15008 + }, + { + "epoch": 4.606813996316759, + "grad_norm": 0.25886499881744385, + "learning_rate": 5.875745613772736e-05, + "loss": 1.7818, + "step": 15009 + }, + { + "epoch": 4.607120933087784, + "grad_norm": 0.24822987616062164, + "learning_rate": 5.8752562369933515e-05, + "loss": 1.7369, + "step": 15010 + }, + { + "epoch": 4.607427869858809, + "grad_norm": 0.26067355275154114, + "learning_rate": 5.874766851564171e-05, + "loss": 1.7056, + "step": 15011 + }, + { + "epoch": 4.607734806629834, + "grad_norm": 0.2869747579097748, + "learning_rate": 
5.874277457490033e-05, + "loss": 1.7284, + "step": 15012 + }, + { + "epoch": 4.608041743400859, + "grad_norm": 0.23153580725193024, + "learning_rate": 5.87378805477577e-05, + "loss": 1.7331, + "step": 15013 + }, + { + "epoch": 4.6083486801718845, + "grad_norm": 0.29307299852371216, + "learning_rate": 5.873298643426223e-05, + "loss": 1.7376, + "step": 15014 + }, + { + "epoch": 4.60865561694291, + "grad_norm": 0.25638771057128906, + "learning_rate": 5.872809223446227e-05, + "loss": 1.7585, + "step": 15015 + }, + { + "epoch": 4.608962553713935, + "grad_norm": 0.2272702306509018, + "learning_rate": 5.872319794840618e-05, + "loss": 1.7482, + "step": 15016 + }, + { + "epoch": 4.6092694904849605, + "grad_norm": 0.2579486072063446, + "learning_rate": 5.8718303576142356e-05, + "loss": 1.778, + "step": 15017 + }, + { + "epoch": 4.609576427255985, + "grad_norm": 0.2216452956199646, + "learning_rate": 5.871340911771912e-05, + "loss": 1.7517, + "step": 15018 + }, + { + "epoch": 4.60988336402701, + "grad_norm": 0.22628961503505707, + "learning_rate": 5.870851457318488e-05, + "loss": 1.7579, + "step": 15019 + }, + { + "epoch": 4.610190300798036, + "grad_norm": 0.31018149852752686, + "learning_rate": 5.8703619942588e-05, + "loss": 1.7911, + "step": 15020 + }, + { + "epoch": 4.610497237569061, + "grad_norm": 0.2618122100830078, + "learning_rate": 5.869872522597683e-05, + "loss": 1.8121, + "step": 15021 + }, + { + "epoch": 4.610804174340086, + "grad_norm": 0.26085740327835083, + "learning_rate": 5.869383042339978e-05, + "loss": 1.7952, + "step": 15022 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.25237780809402466, + "learning_rate": 5.86889355349052e-05, + "loss": 1.7575, + "step": 15023 + }, + { + "epoch": 4.611418047882136, + "grad_norm": 0.27550897002220154, + "learning_rate": 5.868404056054144e-05, + "loss": 1.7816, + "step": 15024 + }, + { + "epoch": 4.611724984653161, + "grad_norm": 0.2458692342042923, + "learning_rate": 5.8679145500356926e-05, + "loss": 1.7783, + 
"step": 15025 + }, + { + "epoch": 4.612031921424187, + "grad_norm": 0.25606176257133484, + "learning_rate": 5.867425035439999e-05, + "loss": 1.7863, + "step": 15026 + }, + { + "epoch": 4.612338858195212, + "grad_norm": 0.3206995725631714, + "learning_rate": 5.866935512271905e-05, + "loss": 1.7468, + "step": 15027 + }, + { + "epoch": 4.612645794966237, + "grad_norm": 0.2754824459552765, + "learning_rate": 5.866445980536245e-05, + "loss": 1.793, + "step": 15028 + }, + { + "epoch": 4.612952731737262, + "grad_norm": 0.25168612599372864, + "learning_rate": 5.865956440237859e-05, + "loss": 1.7252, + "step": 15029 + }, + { + "epoch": 4.613259668508287, + "grad_norm": 0.3226735293865204, + "learning_rate": 5.8654668913815815e-05, + "loss": 1.7291, + "step": 15030 + }, + { + "epoch": 4.6135666052793125, + "grad_norm": 0.2580295503139496, + "learning_rate": 5.864977333972255e-05, + "loss": 1.7622, + "step": 15031 + }, + { + "epoch": 4.613873542050338, + "grad_norm": 0.21486075222492218, + "learning_rate": 5.864487768014715e-05, + "loss": 1.7662, + "step": 15032 + }, + { + "epoch": 4.614180478821363, + "grad_norm": 0.2331690639257431, + "learning_rate": 5.8639981935137996e-05, + "loss": 1.7389, + "step": 15033 + }, + { + "epoch": 4.614487415592388, + "grad_norm": 0.2573511302471161, + "learning_rate": 5.863508610474348e-05, + "loss": 1.7699, + "step": 15034 + }, + { + "epoch": 4.614794352363413, + "grad_norm": 0.2260694056749344, + "learning_rate": 5.863019018901199e-05, + "loss": 1.7784, + "step": 15035 + }, + { + "epoch": 4.615101289134438, + "grad_norm": 0.2283065915107727, + "learning_rate": 5.8625294187991895e-05, + "loss": 1.7061, + "step": 15036 + }, + { + "epoch": 4.615408225905464, + "grad_norm": 0.24772310256958008, + "learning_rate": 5.862039810173159e-05, + "loss": 1.7568, + "step": 15037 + }, + { + "epoch": 4.615715162676489, + "grad_norm": 0.2515513002872467, + "learning_rate": 5.861550193027945e-05, + "loss": 1.7445, + "step": 15038 + }, + { + "epoch": 
4.616022099447514, + "grad_norm": 0.26472151279449463, + "learning_rate": 5.8610605673683885e-05, + "loss": 1.7735, + "step": 15039 + }, + { + "epoch": 4.616329036218539, + "grad_norm": 0.24053528904914856, + "learning_rate": 5.8605709331993254e-05, + "loss": 1.8009, + "step": 15040 + }, + { + "epoch": 4.616635972989564, + "grad_norm": 0.25125381350517273, + "learning_rate": 5.860081290525596e-05, + "loss": 1.7712, + "step": 15041 + }, + { + "epoch": 4.616942909760589, + "grad_norm": 0.23056018352508545, + "learning_rate": 5.85959163935204e-05, + "loss": 1.7684, + "step": 15042 + }, + { + "epoch": 4.617249846531615, + "grad_norm": 0.2533007562160492, + "learning_rate": 5.859101979683494e-05, + "loss": 1.7793, + "step": 15043 + }, + { + "epoch": 4.617556783302639, + "grad_norm": 0.21007375419139862, + "learning_rate": 5.8586123115248e-05, + "loss": 1.7484, + "step": 15044 + }, + { + "epoch": 4.6178637200736645, + "grad_norm": 0.21329566836357117, + "learning_rate": 5.858122634880797e-05, + "loss": 1.7763, + "step": 15045 + }, + { + "epoch": 4.61817065684469, + "grad_norm": 0.2362898588180542, + "learning_rate": 5.857632949756322e-05, + "loss": 1.7484, + "step": 15046 + }, + { + "epoch": 4.618477593615715, + "grad_norm": 0.2168794423341751, + "learning_rate": 5.857143256156214e-05, + "loss": 1.7752, + "step": 15047 + }, + { + "epoch": 4.6187845303867405, + "grad_norm": 0.24761471152305603, + "learning_rate": 5.856653554085316e-05, + "loss": 1.7793, + "step": 15048 + }, + { + "epoch": 4.619091467157766, + "grad_norm": 0.23202158510684967, + "learning_rate": 5.856163843548466e-05, + "loss": 1.6862, + "step": 15049 + }, + { + "epoch": 4.61939840392879, + "grad_norm": 0.23868000507354736, + "learning_rate": 5.855674124550501e-05, + "loss": 1.8075, + "step": 15050 + }, + { + "epoch": 4.619705340699816, + "grad_norm": 0.3063114583492279, + "learning_rate": 5.855184397096265e-05, + "loss": 1.8051, + "step": 15051 + }, + { + "epoch": 4.620012277470841, + "grad_norm": 
0.22672493755817413, + "learning_rate": 5.854694661190594e-05, + "loss": 1.7478, + "step": 15052 + }, + { + "epoch": 4.620319214241866, + "grad_norm": 0.3403559923171997, + "learning_rate": 5.8542049168383296e-05, + "loss": 1.765, + "step": 15053 + }, + { + "epoch": 4.620626151012892, + "grad_norm": 0.33852189779281616, + "learning_rate": 5.853715164044312e-05, + "loss": 1.7602, + "step": 15054 + }, + { + "epoch": 4.620933087783916, + "grad_norm": 0.25166940689086914, + "learning_rate": 5.85322540281338e-05, + "loss": 1.7584, + "step": 15055 + }, + { + "epoch": 4.621240024554941, + "grad_norm": 0.3417987823486328, + "learning_rate": 5.8527356331503757e-05, + "loss": 1.8491, + "step": 15056 + }, + { + "epoch": 4.621546961325967, + "grad_norm": 0.3286994397640228, + "learning_rate": 5.852245855060138e-05, + "loss": 1.7146, + "step": 15057 + }, + { + "epoch": 4.621853898096992, + "grad_norm": 0.24394257366657257, + "learning_rate": 5.851756068547505e-05, + "loss": 1.8762, + "step": 15058 + }, + { + "epoch": 4.622160834868017, + "grad_norm": 0.34945347905158997, + "learning_rate": 5.851266273617321e-05, + "loss": 1.8086, + "step": 15059 + }, + { + "epoch": 4.622467771639043, + "grad_norm": 0.30189210176467896, + "learning_rate": 5.850776470274425e-05, + "loss": 1.7366, + "step": 15060 + }, + { + "epoch": 4.622774708410067, + "grad_norm": 0.24050579965114594, + "learning_rate": 5.850286658523657e-05, + "loss": 1.7599, + "step": 15061 + }, + { + "epoch": 4.6230816451810925, + "grad_norm": 0.33650726079940796, + "learning_rate": 5.849796838369857e-05, + "loss": 1.7343, + "step": 15062 + }, + { + "epoch": 4.623388581952118, + "grad_norm": 0.2855902910232544, + "learning_rate": 5.849307009817868e-05, + "loss": 1.7325, + "step": 15063 + }, + { + "epoch": 4.623695518723143, + "grad_norm": 0.2562592923641205, + "learning_rate": 5.8488171728725275e-05, + "loss": 1.7772, + "step": 15064 + }, + { + "epoch": 4.6240024554941686, + "grad_norm": 0.23494984209537506, + 
"learning_rate": 5.84832732753868e-05, + "loss": 1.7263, + "step": 15065 + }, + { + "epoch": 4.624309392265193, + "grad_norm": 0.23248226940631866, + "learning_rate": 5.847837473821164e-05, + "loss": 1.7441, + "step": 15066 + }, + { + "epoch": 4.624616329036218, + "grad_norm": 0.2291254848241806, + "learning_rate": 5.847347611724821e-05, + "loss": 1.7742, + "step": 15067 + }, + { + "epoch": 4.624923265807244, + "grad_norm": 0.28305280208587646, + "learning_rate": 5.8468577412544925e-05, + "loss": 1.8224, + "step": 15068 + }, + { + "epoch": 4.625230202578269, + "grad_norm": 0.25531691312789917, + "learning_rate": 5.84636786241502e-05, + "loss": 1.7458, + "step": 15069 + }, + { + "epoch": 4.625537139349294, + "grad_norm": 0.2363462746143341, + "learning_rate": 5.845877975211242e-05, + "loss": 1.7977, + "step": 15070 + }, + { + "epoch": 4.62584407612032, + "grad_norm": 0.2707001864910126, + "learning_rate": 5.845388079648004e-05, + "loss": 1.774, + "step": 15071 + }, + { + "epoch": 4.626151012891344, + "grad_norm": 0.22281844913959503, + "learning_rate": 5.844898175730146e-05, + "loss": 1.7888, + "step": 15072 + }, + { + "epoch": 4.6264579496623695, + "grad_norm": 0.24809995293617249, + "learning_rate": 5.8444082634625086e-05, + "loss": 1.7895, + "step": 15073 + }, + { + "epoch": 4.626764886433395, + "grad_norm": 0.2842096984386444, + "learning_rate": 5.843918342849933e-05, + "loss": 1.7323, + "step": 15074 + }, + { + "epoch": 4.62707182320442, + "grad_norm": 0.21343614161014557, + "learning_rate": 5.843428413897261e-05, + "loss": 1.7298, + "step": 15075 + }, + { + "epoch": 4.627378759975445, + "grad_norm": 0.2420526146888733, + "learning_rate": 5.842938476609336e-05, + "loss": 1.778, + "step": 15076 + }, + { + "epoch": 4.62768569674647, + "grad_norm": 0.22202003002166748, + "learning_rate": 5.842448530990999e-05, + "loss": 1.779, + "step": 15077 + }, + { + "epoch": 4.627992633517495, + "grad_norm": 0.26784011721611023, + "learning_rate": 5.841958577047092e-05, + 
"loss": 1.799, + "step": 15078 + }, + { + "epoch": 4.628299570288521, + "grad_norm": 0.3230212926864624, + "learning_rate": 5.841468614782457e-05, + "loss": 1.7789, + "step": 15079 + }, + { + "epoch": 4.628606507059546, + "grad_norm": 0.24062715470790863, + "learning_rate": 5.840978644201935e-05, + "loss": 1.7697, + "step": 15080 + }, + { + "epoch": 4.628913443830571, + "grad_norm": 0.2882130444049835, + "learning_rate": 5.84048866531037e-05, + "loss": 1.7946, + "step": 15081 + }, + { + "epoch": 4.629220380601596, + "grad_norm": 0.3145603537559509, + "learning_rate": 5.839998678112602e-05, + "loss": 1.7116, + "step": 15082 + }, + { + "epoch": 4.629527317372621, + "grad_norm": 0.270997017621994, + "learning_rate": 5.839508682613477e-05, + "loss": 1.8281, + "step": 15083 + }, + { + "epoch": 4.629834254143646, + "grad_norm": 0.27299395203590393, + "learning_rate": 5.839018678817834e-05, + "loss": 1.8233, + "step": 15084 + }, + { + "epoch": 4.630141190914672, + "grad_norm": 0.2684478461742401, + "learning_rate": 5.838528666730517e-05, + "loss": 1.8111, + "step": 15085 + }, + { + "epoch": 4.630448127685697, + "grad_norm": 0.2365201860666275, + "learning_rate": 5.838038646356367e-05, + "loss": 1.7475, + "step": 15086 + }, + { + "epoch": 4.6307550644567215, + "grad_norm": 0.2661258280277252, + "learning_rate": 5.8375486177002305e-05, + "loss": 1.748, + "step": 15087 + }, + { + "epoch": 4.631062001227747, + "grad_norm": 0.2865012586116791, + "learning_rate": 5.8370585807669455e-05, + "loss": 1.7525, + "step": 15088 + }, + { + "epoch": 4.631368937998772, + "grad_norm": 0.2445172518491745, + "learning_rate": 5.836568535561358e-05, + "loss": 1.7278, + "step": 15089 + }, + { + "epoch": 4.6316758747697975, + "grad_norm": 0.28192558884620667, + "learning_rate": 5.8360784820883083e-05, + "loss": 1.7371, + "step": 15090 + }, + { + "epoch": 4.631982811540823, + "grad_norm": 0.38927358388900757, + "learning_rate": 5.835588420352642e-05, + "loss": 1.8088, + "step": 15091 + }, + { + 
"epoch": 4.632289748311848, + "grad_norm": 0.3409229516983032, + "learning_rate": 5.8350983503592025e-05, + "loss": 1.8011, + "step": 15092 + }, + { + "epoch": 4.632596685082873, + "grad_norm": 0.2464994341135025, + "learning_rate": 5.8346082721128294e-05, + "loss": 1.8354, + "step": 15093 + }, + { + "epoch": 4.632903621853898, + "grad_norm": 0.38765814900398254, + "learning_rate": 5.834118185618369e-05, + "loss": 1.7811, + "step": 15094 + }, + { + "epoch": 4.633210558624923, + "grad_norm": 0.42435070872306824, + "learning_rate": 5.833628090880664e-05, + "loss": 1.7855, + "step": 15095 + }, + { + "epoch": 4.633517495395949, + "grad_norm": 0.244876891374588, + "learning_rate": 5.833137987904558e-05, + "loss": 1.7494, + "step": 15096 + }, + { + "epoch": 4.633824432166974, + "grad_norm": 0.30353477597236633, + "learning_rate": 5.8326478766948934e-05, + "loss": 1.7772, + "step": 15097 + }, + { + "epoch": 4.634131368937998, + "grad_norm": 0.38839244842529297, + "learning_rate": 5.8321577572565146e-05, + "loss": 1.7689, + "step": 15098 + }, + { + "epoch": 4.634438305709024, + "grad_norm": 0.357129842042923, + "learning_rate": 5.8316676295942644e-05, + "loss": 1.7777, + "step": 15099 + }, + { + "epoch": 4.634745242480049, + "grad_norm": 0.23458799719810486, + "learning_rate": 5.831177493712988e-05, + "loss": 1.7544, + "step": 15100 + }, + { + "epoch": 4.635052179251074, + "grad_norm": 0.23751308023929596, + "learning_rate": 5.830687349617529e-05, + "loss": 1.7491, + "step": 15101 + }, + { + "epoch": 4.6353591160221, + "grad_norm": 0.31978943943977356, + "learning_rate": 5.83019719731273e-05, + "loss": 1.7439, + "step": 15102 + }, + { + "epoch": 4.635666052793125, + "grad_norm": 0.2751142084598541, + "learning_rate": 5.829707036803438e-05, + "loss": 1.8598, + "step": 15103 + }, + { + "epoch": 4.6359729895641495, + "grad_norm": 0.23670406639575958, + "learning_rate": 5.8292168680944914e-05, + "loss": 1.7629, + "step": 15104 + }, + { + "epoch": 4.636279926335175, + 
"grad_norm": 0.2447349727153778, + "learning_rate": 5.828726691190739e-05, + "loss": 1.7606, + "step": 15105 + }, + { + "epoch": 4.6365868631062, + "grad_norm": 0.2739902436733246, + "learning_rate": 5.828236506097023e-05, + "loss": 1.707, + "step": 15106 + }, + { + "epoch": 4.6368937998772255, + "grad_norm": 0.2050863653421402, + "learning_rate": 5.82774631281819e-05, + "loss": 1.7235, + "step": 15107 + }, + { + "epoch": 4.637200736648251, + "grad_norm": 0.3005560338497162, + "learning_rate": 5.827256111359082e-05, + "loss": 1.7785, + "step": 15108 + }, + { + "epoch": 4.637507673419275, + "grad_norm": 0.27168264985084534, + "learning_rate": 5.8267659017245434e-05, + "loss": 1.7844, + "step": 15109 + }, + { + "epoch": 4.637814610190301, + "grad_norm": 0.2965840995311737, + "learning_rate": 5.82627568391942e-05, + "loss": 1.7631, + "step": 15110 + }, + { + "epoch": 4.638121546961326, + "grad_norm": 0.3114408552646637, + "learning_rate": 5.825785457948556e-05, + "loss": 1.77, + "step": 15111 + }, + { + "epoch": 4.638428483732351, + "grad_norm": 0.2638910114765167, + "learning_rate": 5.825295223816796e-05, + "loss": 1.9183, + "step": 15112 + }, + { + "epoch": 4.638735420503377, + "grad_norm": 0.3293665051460266, + "learning_rate": 5.824804981528986e-05, + "loss": 1.6779, + "step": 15113 + }, + { + "epoch": 4.639042357274402, + "grad_norm": 0.28586456179618835, + "learning_rate": 5.824314731089968e-05, + "loss": 1.7905, + "step": 15114 + }, + { + "epoch": 4.639349294045426, + "grad_norm": 0.2254554182291031, + "learning_rate": 5.8238244725045906e-05, + "loss": 1.7602, + "step": 15115 + }, + { + "epoch": 4.639656230816452, + "grad_norm": 0.2770406901836395, + "learning_rate": 5.823334205777695e-05, + "loss": 1.7789, + "step": 15116 + }, + { + "epoch": 4.639963167587477, + "grad_norm": 0.2867025136947632, + "learning_rate": 5.822843930914129e-05, + "loss": 1.7408, + "step": 15117 + }, + { + "epoch": 4.640270104358502, + "grad_norm": 0.23486989736557007, + 
"learning_rate": 5.822353647918737e-05, + "loss": 1.7489, + "step": 15118 + }, + { + "epoch": 4.640577041129527, + "grad_norm": 0.2274324595928192, + "learning_rate": 5.821863356796367e-05, + "loss": 1.768, + "step": 15119 + }, + { + "epoch": 4.640883977900552, + "grad_norm": 0.25032591819763184, + "learning_rate": 5.821373057551858e-05, + "loss": 1.7602, + "step": 15120 + }, + { + "epoch": 4.6411909146715775, + "grad_norm": 0.22332963347434998, + "learning_rate": 5.820882750190059e-05, + "loss": 1.756, + "step": 15121 + }, + { + "epoch": 4.641497851442603, + "grad_norm": 0.24975591897964478, + "learning_rate": 5.820392434715817e-05, + "loss": 1.6963, + "step": 15122 + }, + { + "epoch": 4.641804788213628, + "grad_norm": 0.27892687916755676, + "learning_rate": 5.819902111133976e-05, + "loss": 1.8295, + "step": 15123 + }, + { + "epoch": 4.6421117249846535, + "grad_norm": 0.23914897441864014, + "learning_rate": 5.819411779449381e-05, + "loss": 1.7636, + "step": 15124 + }, + { + "epoch": 4.642418661755678, + "grad_norm": 0.2349565476179123, + "learning_rate": 5.818921439666879e-05, + "loss": 1.7823, + "step": 15125 + }, + { + "epoch": 4.642725598526703, + "grad_norm": 0.2075800597667694, + "learning_rate": 5.818431091791315e-05, + "loss": 1.7282, + "step": 15126 + }, + { + "epoch": 4.643032535297729, + "grad_norm": 0.19781073927879333, + "learning_rate": 5.817940735827535e-05, + "loss": 1.7598, + "step": 15127 + }, + { + "epoch": 4.643339472068754, + "grad_norm": 0.21997439861297607, + "learning_rate": 5.8174503717803866e-05, + "loss": 1.766, + "step": 15128 + }, + { + "epoch": 4.643646408839779, + "grad_norm": 0.23971444368362427, + "learning_rate": 5.816959999654713e-05, + "loss": 1.7824, + "step": 15129 + }, + { + "epoch": 4.643953345610804, + "grad_norm": 0.23357853293418884, + "learning_rate": 5.816469619455363e-05, + "loss": 1.7353, + "step": 15130 + }, + { + "epoch": 4.644260282381829, + "grad_norm": 0.22030897438526154, + "learning_rate": 5.815979231187181e-05, 
+ "loss": 1.7413, + "step": 15131 + }, + { + "epoch": 4.644567219152854, + "grad_norm": 0.2322571873664856, + "learning_rate": 5.815488834855014e-05, + "loss": 1.7305, + "step": 15132 + }, + { + "epoch": 4.64487415592388, + "grad_norm": 0.25256821513175964, + "learning_rate": 5.814998430463709e-05, + "loss": 1.7533, + "step": 15133 + }, + { + "epoch": 4.645181092694905, + "grad_norm": 0.248504638671875, + "learning_rate": 5.81450801801811e-05, + "loss": 1.7345, + "step": 15134 + }, + { + "epoch": 4.64548802946593, + "grad_norm": 0.22850964963436127, + "learning_rate": 5.8140175975230673e-05, + "loss": 1.8308, + "step": 15135 + }, + { + "epoch": 4.645794966236955, + "grad_norm": 0.3517951965332031, + "learning_rate": 5.813527168983426e-05, + "loss": 1.811, + "step": 15136 + }, + { + "epoch": 4.64610190300798, + "grad_norm": 0.32132068276405334, + "learning_rate": 5.813036732404031e-05, + "loss": 1.7584, + "step": 15137 + }, + { + "epoch": 4.6464088397790055, + "grad_norm": 0.2349396049976349, + "learning_rate": 5.812546287789731e-05, + "loss": 1.7762, + "step": 15138 + }, + { + "epoch": 4.646715776550031, + "grad_norm": 0.23519493639469147, + "learning_rate": 5.812055835145372e-05, + "loss": 1.7428, + "step": 15139 + }, + { + "epoch": 4.647022713321056, + "grad_norm": 0.29277852177619934, + "learning_rate": 5.8115653744758016e-05, + "loss": 1.7599, + "step": 15140 + }, + { + "epoch": 4.647329650092081, + "grad_norm": 0.2347593754529953, + "learning_rate": 5.811074905785867e-05, + "loss": 1.7401, + "step": 15141 + }, + { + "epoch": 4.647636586863106, + "grad_norm": 0.23080264031887054, + "learning_rate": 5.8105844290804147e-05, + "loss": 1.7705, + "step": 15142 + }, + { + "epoch": 4.647943523634131, + "grad_norm": 0.24686801433563232, + "learning_rate": 5.810093944364291e-05, + "loss": 1.7409, + "step": 15143 + }, + { + "epoch": 4.648250460405157, + "grad_norm": 0.24098120629787445, + "learning_rate": 5.809603451642344e-05, + "loss": 1.7893, + "step": 15144 + }, + { 
+ "epoch": 4.648557397176182, + "grad_norm": 0.23020638525485992, + "learning_rate": 5.809112950919422e-05, + "loss": 1.7589, + "step": 15145 + }, + { + "epoch": 4.648864333947207, + "grad_norm": 0.3036736249923706, + "learning_rate": 5.808622442200371e-05, + "loss": 1.7964, + "step": 15146 + }, + { + "epoch": 4.649171270718232, + "grad_norm": 0.2965635657310486, + "learning_rate": 5.808131925490039e-05, + "loss": 1.7986, + "step": 15147 + }, + { + "epoch": 4.649478207489257, + "grad_norm": 0.22241640090942383, + "learning_rate": 5.8076414007932745e-05, + "loss": 1.749, + "step": 15148 + }, + { + "epoch": 4.649785144260282, + "grad_norm": 0.20304246246814728, + "learning_rate": 5.8071508681149246e-05, + "loss": 1.7374, + "step": 15149 + }, + { + "epoch": 4.650092081031308, + "grad_norm": 0.19534410536289215, + "learning_rate": 5.806660327459834e-05, + "loss": 1.7087, + "step": 15150 + }, + { + "epoch": 4.650399017802332, + "grad_norm": 0.2151753008365631, + "learning_rate": 5.806169778832856e-05, + "loss": 1.7409, + "step": 15151 + }, + { + "epoch": 4.650705954573358, + "grad_norm": 0.2180301696062088, + "learning_rate": 5.805679222238836e-05, + "loss": 1.7522, + "step": 15152 + }, + { + "epoch": 4.651012891344383, + "grad_norm": 0.19917607307434082, + "learning_rate": 5.8051886576826205e-05, + "loss": 1.768, + "step": 15153 + }, + { + "epoch": 4.651319828115408, + "grad_norm": 0.2312052994966507, + "learning_rate": 5.804698085169059e-05, + "loss": 1.7799, + "step": 15154 + }, + { + "epoch": 4.651626764886434, + "grad_norm": 0.21541514992713928, + "learning_rate": 5.804207504702999e-05, + "loss": 1.7595, + "step": 15155 + }, + { + "epoch": 4.651933701657459, + "grad_norm": 0.2029450386762619, + "learning_rate": 5.803716916289289e-05, + "loss": 1.7727, + "step": 15156 + }, + { + "epoch": 4.652240638428484, + "grad_norm": 0.21796850860118866, + "learning_rate": 5.8032263199327787e-05, + "loss": 1.7445, + "step": 15157 + }, + { + "epoch": 4.652547575199509, + 
"grad_norm": 0.20309078693389893, + "learning_rate": 5.802735715638314e-05, + "loss": 1.6971, + "step": 15158 + }, + { + "epoch": 4.652854511970534, + "grad_norm": 0.21270112693309784, + "learning_rate": 5.802245103410745e-05, + "loss": 1.7162, + "step": 15159 + }, + { + "epoch": 4.653161448741559, + "grad_norm": 0.25357750058174133, + "learning_rate": 5.8017544832549184e-05, + "loss": 1.7534, + "step": 15160 + }, + { + "epoch": 4.653468385512585, + "grad_norm": 0.24015015363693237, + "learning_rate": 5.8012638551756847e-05, + "loss": 1.7639, + "step": 15161 + }, + { + "epoch": 4.653775322283609, + "grad_norm": 0.20507018268108368, + "learning_rate": 5.800773219177893e-05, + "loss": 1.7293, + "step": 15162 + }, + { + "epoch": 4.6540822590546345, + "grad_norm": 0.23399868607521057, + "learning_rate": 5.800282575266389e-05, + "loss": 1.8286, + "step": 15163 + }, + { + "epoch": 4.65438919582566, + "grad_norm": 0.27126726508140564, + "learning_rate": 5.799791923446025e-05, + "loss": 1.8028, + "step": 15164 + }, + { + "epoch": 4.654696132596685, + "grad_norm": 0.23644569516181946, + "learning_rate": 5.7993012637216494e-05, + "loss": 1.7138, + "step": 15165 + }, + { + "epoch": 4.6550030693677105, + "grad_norm": 0.21557916700839996, + "learning_rate": 5.7988105960981086e-05, + "loss": 1.7703, + "step": 15166 + }, + { + "epoch": 4.655310006138736, + "grad_norm": 0.22030150890350342, + "learning_rate": 5.798319920580254e-05, + "loss": 1.7282, + "step": 15167 + }, + { + "epoch": 4.65561694290976, + "grad_norm": 0.2092939168214798, + "learning_rate": 5.7978292371729325e-05, + "loss": 1.7853, + "step": 15168 + }, + { + "epoch": 4.655923879680786, + "grad_norm": 0.21643707156181335, + "learning_rate": 5.797338545880997e-05, + "loss": 1.7582, + "step": 15169 + }, + { + "epoch": 4.656230816451811, + "grad_norm": 0.3064669668674469, + "learning_rate": 5.796847846709294e-05, + "loss": 1.8139, + "step": 15170 + }, + { + "epoch": 4.656537753222836, + "grad_norm": 0.3060479760169983, 
+ "learning_rate": 5.796357139662674e-05, + "loss": 1.7356, + "step": 15171 + }, + { + "epoch": 4.656844689993862, + "grad_norm": 0.23546656966209412, + "learning_rate": 5.7958664247459835e-05, + "loss": 1.7937, + "step": 15172 + }, + { + "epoch": 4.657151626764886, + "grad_norm": 0.2890888750553131, + "learning_rate": 5.795375701964077e-05, + "loss": 1.7305, + "step": 15173 + }, + { + "epoch": 4.657458563535911, + "grad_norm": 0.27948084473609924, + "learning_rate": 5.794884971321801e-05, + "loss": 1.7428, + "step": 15174 + }, + { + "epoch": 4.657765500306937, + "grad_norm": 0.2354089468717575, + "learning_rate": 5.794394232824007e-05, + "loss": 1.7622, + "step": 15175 + }, + { + "epoch": 4.658072437077962, + "grad_norm": 0.3271159827709198, + "learning_rate": 5.793903486475541e-05, + "loss": 1.7826, + "step": 15176 + }, + { + "epoch": 4.658379373848987, + "grad_norm": 0.3561338782310486, + "learning_rate": 5.793412732281257e-05, + "loss": 1.7698, + "step": 15177 + }, + { + "epoch": 4.658686310620013, + "grad_norm": 0.2913050949573517, + "learning_rate": 5.7929219702460035e-05, + "loss": 1.8156, + "step": 15178 + }, + { + "epoch": 4.658993247391037, + "grad_norm": 0.2345089465379715, + "learning_rate": 5.7924312003746294e-05, + "loss": 1.7859, + "step": 15179 + }, + { + "epoch": 4.6593001841620625, + "grad_norm": 0.3018132150173187, + "learning_rate": 5.7919404226719865e-05, + "loss": 1.7622, + "step": 15180 + }, + { + "epoch": 4.659607120933088, + "grad_norm": 0.29134172201156616, + "learning_rate": 5.791449637142924e-05, + "loss": 1.7287, + "step": 15181 + }, + { + "epoch": 4.659914057704113, + "grad_norm": 0.24126321077346802, + "learning_rate": 5.7909588437922924e-05, + "loss": 1.7969, + "step": 15182 + }, + { + "epoch": 4.6602209944751385, + "grad_norm": 0.27053284645080566, + "learning_rate": 5.7904680426249415e-05, + "loss": 1.7399, + "step": 15183 + }, + { + "epoch": 4.660527931246163, + "grad_norm": 0.2636512219905853, + "learning_rate": 
5.789977233645722e-05, + "loss": 1.7615, + "step": 15184 + }, + { + "epoch": 4.660834868017188, + "grad_norm": 0.2263207584619522, + "learning_rate": 5.789486416859484e-05, + "loss": 1.7668, + "step": 15185 + }, + { + "epoch": 4.661141804788214, + "grad_norm": 0.25387826561927795, + "learning_rate": 5.78899559227108e-05, + "loss": 1.7594, + "step": 15186 + }, + { + "epoch": 4.661448741559239, + "grad_norm": 0.2268977165222168, + "learning_rate": 5.7885047598853596e-05, + "loss": 1.75, + "step": 15187 + }, + { + "epoch": 4.661755678330264, + "grad_norm": 0.29093095660209656, + "learning_rate": 5.788013919707172e-05, + "loss": 1.7291, + "step": 15188 + }, + { + "epoch": 4.66206261510129, + "grad_norm": 0.26578736305236816, + "learning_rate": 5.7875230717413684e-05, + "loss": 1.7276, + "step": 15189 + }, + { + "epoch": 4.662369551872314, + "grad_norm": 0.2548983097076416, + "learning_rate": 5.7870322159928e-05, + "loss": 1.755, + "step": 15190 + }, + { + "epoch": 4.662676488643339, + "grad_norm": 0.2246701419353485, + "learning_rate": 5.7865413524663184e-05, + "loss": 1.751, + "step": 15191 + }, + { + "epoch": 4.662983425414365, + "grad_norm": 0.3069002032279968, + "learning_rate": 5.7860504811667747e-05, + "loss": 1.7522, + "step": 15192 + }, + { + "epoch": 4.66329036218539, + "grad_norm": 0.3081241250038147, + "learning_rate": 5.7855596020990186e-05, + "loss": 1.7152, + "step": 15193 + }, + { + "epoch": 4.6635972989564145, + "grad_norm": 0.29006731510162354, + "learning_rate": 5.7850687152679026e-05, + "loss": 1.8471, + "step": 15194 + }, + { + "epoch": 4.66390423572744, + "grad_norm": 0.24131664633750916, + "learning_rate": 5.7845778206782786e-05, + "loss": 1.763, + "step": 15195 + }, + { + "epoch": 4.664211172498465, + "grad_norm": 0.21808001399040222, + "learning_rate": 5.784086918334994e-05, + "loss": 1.6989, + "step": 15196 + }, + { + "epoch": 4.6645181092694905, + "grad_norm": 0.2413240373134613, + "learning_rate": 5.783596008242904e-05, + "loss": 1.7869, + 
"step": 15197 + }, + { + "epoch": 4.664825046040516, + "grad_norm": 0.23310934007167816, + "learning_rate": 5.7831050904068594e-05, + "loss": 1.8017, + "step": 15198 + }, + { + "epoch": 4.665131982811541, + "grad_norm": 0.2577926814556122, + "learning_rate": 5.7826141648317125e-05, + "loss": 1.6938, + "step": 15199 + }, + { + "epoch": 4.665438919582566, + "grad_norm": 0.22523443400859833, + "learning_rate": 5.782123231522312e-05, + "loss": 1.8104, + "step": 15200 + }, + { + "epoch": 4.665745856353591, + "grad_norm": 0.23603026568889618, + "learning_rate": 5.781632290483512e-05, + "loss": 1.7484, + "step": 15201 + }, + { + "epoch": 4.666052793124616, + "grad_norm": 0.23195989429950714, + "learning_rate": 5.781141341720162e-05, + "loss": 1.7786, + "step": 15202 + }, + { + "epoch": 4.666359729895642, + "grad_norm": 0.21838274598121643, + "learning_rate": 5.780650385237118e-05, + "loss": 1.7509, + "step": 15203 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.26656514406204224, + "learning_rate": 5.780159421039229e-05, + "loss": 1.7875, + "step": 15204 + }, + { + "epoch": 4.666973603437691, + "grad_norm": 0.2293243706226349, + "learning_rate": 5.7796684491313456e-05, + "loss": 1.7518, + "step": 15205 + }, + { + "epoch": 4.667280540208717, + "grad_norm": 0.24190817773342133, + "learning_rate": 5.779177469518323e-05, + "loss": 1.7593, + "step": 15206 + }, + { + "epoch": 4.667587476979742, + "grad_norm": 0.31113871932029724, + "learning_rate": 5.77868648220501e-05, + "loss": 1.7911, + "step": 15207 + }, + { + "epoch": 4.667894413750767, + "grad_norm": 0.2875262498855591, + "learning_rate": 5.778195487196263e-05, + "loss": 1.7871, + "step": 15208 + }, + { + "epoch": 4.668201350521793, + "grad_norm": 0.2172149419784546, + "learning_rate": 5.777704484496931e-05, + "loss": 1.7592, + "step": 15209 + }, + { + "epoch": 4.668508287292818, + "grad_norm": 0.3282458186149597, + "learning_rate": 5.7772134741118675e-05, + "loss": 1.7687, + "step": 15210 + }, + { + "epoch": 
4.6688152240638425, + "grad_norm": 0.36963000893592834, + "learning_rate": 5.7767224560459255e-05, + "loss": 1.812, + "step": 15211 + }, + { + "epoch": 4.669122160834868, + "grad_norm": 0.22387740015983582, + "learning_rate": 5.776231430303957e-05, + "loss": 1.7449, + "step": 15212 + }, + { + "epoch": 4.669429097605893, + "grad_norm": 0.21468734741210938, + "learning_rate": 5.775740396890813e-05, + "loss": 1.716, + "step": 15213 + }, + { + "epoch": 4.6697360343769185, + "grad_norm": 0.2478475719690323, + "learning_rate": 5.7752493558113486e-05, + "loss": 1.7182, + "step": 15214 + }, + { + "epoch": 4.670042971147944, + "grad_norm": 0.20924845337867737, + "learning_rate": 5.774758307070416e-05, + "loss": 1.784, + "step": 15215 + }, + { + "epoch": 4.670349907918968, + "grad_norm": 0.2933209538459778, + "learning_rate": 5.774267250672868e-05, + "loss": 1.8375, + "step": 15216 + }, + { + "epoch": 4.670656844689994, + "grad_norm": 0.2744538486003876, + "learning_rate": 5.7737761866235565e-05, + "loss": 1.7019, + "step": 15217 + }, + { + "epoch": 4.670963781461019, + "grad_norm": 0.20991720259189606, + "learning_rate": 5.773285114927336e-05, + "loss": 1.7189, + "step": 15218 + }, + { + "epoch": 4.671270718232044, + "grad_norm": 0.2873254716396332, + "learning_rate": 5.772794035589057e-05, + "loss": 1.7492, + "step": 15219 + }, + { + "epoch": 4.67157765500307, + "grad_norm": 0.2781519591808319, + "learning_rate": 5.772302948613576e-05, + "loss": 1.7342, + "step": 15220 + }, + { + "epoch": 4.671884591774095, + "grad_norm": 0.23288768529891968, + "learning_rate": 5.7718118540057455e-05, + "loss": 1.7245, + "step": 15221 + }, + { + "epoch": 4.672191528545119, + "grad_norm": 0.40817564725875854, + "learning_rate": 5.771320751770417e-05, + "loss": 1.7659, + "step": 15222 + }, + { + "epoch": 4.672498465316145, + "grad_norm": 0.45521771907806396, + "learning_rate": 5.770829641912444e-05, + "loss": 1.7875, + "step": 15223 + }, + { + "epoch": 4.67280540208717, + "grad_norm": 
0.22353248298168182, + "learning_rate": 5.77033852443668e-05, + "loss": 1.7098, + "step": 15224 + }, + { + "epoch": 4.673112338858195, + "grad_norm": 0.4066791534423828, + "learning_rate": 5.769847399347981e-05, + "loss": 1.7277, + "step": 15225 + }, + { + "epoch": 4.67341927562922, + "grad_norm": 0.4299545884132385, + "learning_rate": 5.769356266651198e-05, + "loss": 1.7777, + "step": 15226 + }, + { + "epoch": 4.673726212400245, + "grad_norm": 0.21037638187408447, + "learning_rate": 5.768865126351186e-05, + "loss": 1.7263, + "step": 15227 + }, + { + "epoch": 4.6740331491712706, + "grad_norm": 0.3390437066555023, + "learning_rate": 5.768373978452798e-05, + "loss": 1.7457, + "step": 15228 + }, + { + "epoch": 4.674340085942296, + "grad_norm": 0.40003323554992676, + "learning_rate": 5.767882822960887e-05, + "loss": 1.8137, + "step": 15229 + }, + { + "epoch": 4.674647022713321, + "grad_norm": 0.2212848961353302, + "learning_rate": 5.767391659880308e-05, + "loss": 1.7131, + "step": 15230 + }, + { + "epoch": 4.6749539594843466, + "grad_norm": 0.30634984374046326, + "learning_rate": 5.766900489215915e-05, + "loss": 1.7775, + "step": 15231 + }, + { + "epoch": 4.675260896255372, + "grad_norm": 0.31412798166275024, + "learning_rate": 5.766409310972563e-05, + "loss": 1.7383, + "step": 15232 + }, + { + "epoch": 4.675567833026396, + "grad_norm": 0.21125225722789764, + "learning_rate": 5.7659181251551045e-05, + "loss": 1.8046, + "step": 15233 + }, + { + "epoch": 4.675874769797422, + "grad_norm": 0.3234494924545288, + "learning_rate": 5.765426931768394e-05, + "loss": 1.7838, + "step": 15234 + }, + { + "epoch": 4.676181706568447, + "grad_norm": 0.2668779194355011, + "learning_rate": 5.764935730817286e-05, + "loss": 1.7464, + "step": 15235 + }, + { + "epoch": 4.676488643339472, + "grad_norm": 0.22423583269119263, + "learning_rate": 5.764444522306633e-05, + "loss": 1.7165, + "step": 15236 + }, + { + "epoch": 4.676795580110497, + "grad_norm": 0.29066675901412964, + "learning_rate": 
5.7639533062412945e-05, + "loss": 1.75, + "step": 15237 + }, + { + "epoch": 4.677102516881522, + "grad_norm": 0.2963598370552063, + "learning_rate": 5.76346208262612e-05, + "loss": 1.8168, + "step": 15238 + }, + { + "epoch": 4.6774094536525475, + "grad_norm": 0.21484358608722687, + "learning_rate": 5.7629708514659655e-05, + "loss": 1.71, + "step": 15239 + }, + { + "epoch": 4.677716390423573, + "grad_norm": 0.20657925307750702, + "learning_rate": 5.762479612765686e-05, + "loss": 1.7239, + "step": 15240 + }, + { + "epoch": 4.678023327194598, + "grad_norm": 0.21336235105991364, + "learning_rate": 5.761988366530136e-05, + "loss": 1.7952, + "step": 15241 + }, + { + "epoch": 4.6783302639656235, + "grad_norm": 0.24156586825847626, + "learning_rate": 5.7614971127641696e-05, + "loss": 1.7709, + "step": 15242 + }, + { + "epoch": 4.678637200736648, + "grad_norm": 0.2633824944496155, + "learning_rate": 5.761005851472643e-05, + "loss": 1.7404, + "step": 15243 + }, + { + "epoch": 4.678944137507673, + "grad_norm": 0.23302829265594482, + "learning_rate": 5.760514582660411e-05, + "loss": 1.7006, + "step": 15244 + }, + { + "epoch": 4.679251074278699, + "grad_norm": 0.22404874861240387, + "learning_rate": 5.7600233063323283e-05, + "loss": 1.7731, + "step": 15245 + }, + { + "epoch": 4.679558011049724, + "grad_norm": 0.23217839002609253, + "learning_rate": 5.7595320224932495e-05, + "loss": 1.7452, + "step": 15246 + }, + { + "epoch": 4.679864947820749, + "grad_norm": 0.23131491243839264, + "learning_rate": 5.7590407311480296e-05, + "loss": 1.7547, + "step": 15247 + }, + { + "epoch": 4.680171884591774, + "grad_norm": 0.21907350420951843, + "learning_rate": 5.7585494323015245e-05, + "loss": 1.7556, + "step": 15248 + }, + { + "epoch": 4.680478821362799, + "grad_norm": 0.22416768968105316, + "learning_rate": 5.7580581259585895e-05, + "loss": 1.7783, + "step": 15249 + }, + { + "epoch": 4.680785758133824, + "grad_norm": 0.20203055441379547, + "learning_rate": 5.75756681212408e-05, + "loss": 
1.7285, + "step": 15250 + }, + { + "epoch": 4.68109269490485, + "grad_norm": 0.27838602662086487, + "learning_rate": 5.75707549080285e-05, + "loss": 1.7489, + "step": 15251 + }, + { + "epoch": 4.681399631675875, + "grad_norm": 0.2415023297071457, + "learning_rate": 5.7565841619997586e-05, + "loss": 1.7453, + "step": 15252 + }, + { + "epoch": 4.6817065684469, + "grad_norm": 0.22986920177936554, + "learning_rate": 5.756092825719658e-05, + "loss": 1.7315, + "step": 15253 + }, + { + "epoch": 4.682013505217925, + "grad_norm": 0.2427850216627121, + "learning_rate": 5.755601481967404e-05, + "loss": 1.772, + "step": 15254 + }, + { + "epoch": 4.68232044198895, + "grad_norm": 0.24556589126586914, + "learning_rate": 5.755110130747854e-05, + "loss": 1.7475, + "step": 15255 + }, + { + "epoch": 4.6826273787599755, + "grad_norm": 0.25252529978752136, + "learning_rate": 5.754618772065864e-05, + "loss": 1.7152, + "step": 15256 + }, + { + "epoch": 4.682934315531001, + "grad_norm": 0.24599005281925201, + "learning_rate": 5.754127405926287e-05, + "loss": 1.7911, + "step": 15257 + }, + { + "epoch": 4.683241252302026, + "grad_norm": 0.18961480259895325, + "learning_rate": 5.7536360323339836e-05, + "loss": 1.681, + "step": 15258 + }, + { + "epoch": 4.683548189073051, + "grad_norm": 0.24372327327728271, + "learning_rate": 5.7531446512938035e-05, + "loss": 1.7771, + "step": 15259 + }, + { + "epoch": 4.683855125844076, + "grad_norm": 0.23239269852638245, + "learning_rate": 5.752653262810609e-05, + "loss": 1.7502, + "step": 15260 + }, + { + "epoch": 4.684162062615101, + "grad_norm": 0.25076135993003845, + "learning_rate": 5.752161866889254e-05, + "loss": 1.7974, + "step": 15261 + }, + { + "epoch": 4.684468999386127, + "grad_norm": 0.2703748941421509, + "learning_rate": 5.7516704635345945e-05, + "loss": 1.7245, + "step": 15262 + }, + { + "epoch": 4.684775936157152, + "grad_norm": 0.19247616827487946, + "learning_rate": 5.751179052751487e-05, + "loss": 1.7105, + "step": 15263 + }, + { + 
"epoch": 4.685082872928177, + "grad_norm": 0.23166817426681519, + "learning_rate": 5.750687634544787e-05, + "loss": 1.8026, + "step": 15264 + }, + { + "epoch": 4.685389809699202, + "grad_norm": 0.22434166073799133, + "learning_rate": 5.7501962089193507e-05, + "loss": 1.7779, + "step": 15265 + }, + { + "epoch": 4.685696746470227, + "grad_norm": 0.190699502825737, + "learning_rate": 5.749704775880037e-05, + "loss": 1.726, + "step": 15266 + }, + { + "epoch": 4.686003683241252, + "grad_norm": 0.22995290160179138, + "learning_rate": 5.749213335431702e-05, + "loss": 1.7495, + "step": 15267 + }, + { + "epoch": 4.686310620012278, + "grad_norm": 0.2712057828903198, + "learning_rate": 5.7487218875792016e-05, + "loss": 1.7862, + "step": 15268 + }, + { + "epoch": 4.686617556783302, + "grad_norm": 0.2524562180042267, + "learning_rate": 5.7482304323273913e-05, + "loss": 1.7092, + "step": 15269 + }, + { + "epoch": 4.6869244935543275, + "grad_norm": 0.23810559511184692, + "learning_rate": 5.747738969681131e-05, + "loss": 1.8049, + "step": 15270 + }, + { + "epoch": 4.687231430325353, + "grad_norm": 0.25521910190582275, + "learning_rate": 5.747247499645275e-05, + "loss": 1.8124, + "step": 15271 + }, + { + "epoch": 4.687538367096378, + "grad_norm": 0.27797845005989075, + "learning_rate": 5.746756022224682e-05, + "loss": 1.7694, + "step": 15272 + }, + { + "epoch": 4.6878453038674035, + "grad_norm": 0.23849260807037354, + "learning_rate": 5.746264537424208e-05, + "loss": 1.7771, + "step": 15273 + }, + { + "epoch": 4.688152240638429, + "grad_norm": 0.24368882179260254, + "learning_rate": 5.74577304524871e-05, + "loss": 1.8143, + "step": 15274 + }, + { + "epoch": 4.688459177409453, + "grad_norm": 0.2712198793888092, + "learning_rate": 5.745281545703045e-05, + "loss": 1.7683, + "step": 15275 + }, + { + "epoch": 4.688766114180479, + "grad_norm": 0.30913081765174866, + "learning_rate": 5.7447900387920716e-05, + "loss": 1.7111, + "step": 15276 + }, + { + "epoch": 4.689073050951504, + 
"grad_norm": 0.22123363614082336, + "learning_rate": 5.744298524520646e-05, + "loss": 1.7466, + "step": 15277 + }, + { + "epoch": 4.689379987722529, + "grad_norm": 0.32836318016052246, + "learning_rate": 5.743807002893628e-05, + "loss": 1.8083, + "step": 15278 + }, + { + "epoch": 4.689686924493555, + "grad_norm": 0.33319979906082153, + "learning_rate": 5.743315473915871e-05, + "loss": 1.7122, + "step": 15279 + }, + { + "epoch": 4.689993861264579, + "grad_norm": 0.252163290977478, + "learning_rate": 5.742823937592236e-05, + "loss": 1.7599, + "step": 15280 + }, + { + "epoch": 4.690300798035604, + "grad_norm": 0.23248571157455444, + "learning_rate": 5.7423323939275797e-05, + "loss": 1.7791, + "step": 15281 + }, + { + "epoch": 4.69060773480663, + "grad_norm": 0.27024057507514954, + "learning_rate": 5.741840842926759e-05, + "loss": 1.7608, + "step": 15282 + }, + { + "epoch": 4.690914671577655, + "grad_norm": 0.21888256072998047, + "learning_rate": 5.7413492845946326e-05, + "loss": 1.7407, + "step": 15283 + }, + { + "epoch": 4.69122160834868, + "grad_norm": 0.2574782073497772, + "learning_rate": 5.740857718936058e-05, + "loss": 1.707, + "step": 15284 + }, + { + "epoch": 4.691528545119706, + "grad_norm": 0.2541569769382477, + "learning_rate": 5.740366145955893e-05, + "loss": 1.7301, + "step": 15285 + }, + { + "epoch": 4.69183548189073, + "grad_norm": 0.23484647274017334, + "learning_rate": 5.7398745656589955e-05, + "loss": 1.772, + "step": 15286 + }, + { + "epoch": 4.6921424186617555, + "grad_norm": 0.2827093005180359, + "learning_rate": 5.739382978050225e-05, + "loss": 1.7745, + "step": 15287 + }, + { + "epoch": 4.692449355432781, + "grad_norm": 0.300387978553772, + "learning_rate": 5.738891383134437e-05, + "loss": 1.7966, + "step": 15288 + }, + { + "epoch": 4.692756292203806, + "grad_norm": 0.2414523959159851, + "learning_rate": 5.7383997809164926e-05, + "loss": 1.7355, + "step": 15289 + }, + { + "epoch": 4.6930632289748315, + "grad_norm": 0.21221841871738434, + 
"learning_rate": 5.737908171401248e-05, + "loss": 1.7935, + "step": 15290 + }, + { + "epoch": 4.693370165745856, + "grad_norm": 0.23488084971904755, + "learning_rate": 5.737416554593563e-05, + "loss": 1.7447, + "step": 15291 + }, + { + "epoch": 4.693677102516881, + "grad_norm": 0.26176631450653076, + "learning_rate": 5.7369249304982954e-05, + "loss": 1.769, + "step": 15292 + }, + { + "epoch": 4.693984039287907, + "grad_norm": 0.23060615360736847, + "learning_rate": 5.736433299120303e-05, + "loss": 1.7344, + "step": 15293 + }, + { + "epoch": 4.694290976058932, + "grad_norm": 0.2536846399307251, + "learning_rate": 5.7359416604644456e-05, + "loss": 1.7862, + "step": 15294 + }, + { + "epoch": 4.694597912829957, + "grad_norm": 0.23221342265605927, + "learning_rate": 5.735450014535581e-05, + "loss": 1.743, + "step": 15295 + }, + { + "epoch": 4.694904849600983, + "grad_norm": 0.25320062041282654, + "learning_rate": 5.734958361338568e-05, + "loss": 1.8001, + "step": 15296 + }, + { + "epoch": 4.695211786372007, + "grad_norm": 0.23132461309432983, + "learning_rate": 5.734466700878267e-05, + "loss": 1.7676, + "step": 15297 + }, + { + "epoch": 4.695518723143032, + "grad_norm": 0.2222728580236435, + "learning_rate": 5.7339750331595346e-05, + "loss": 1.7267, + "step": 15298 + }, + { + "epoch": 4.695825659914058, + "grad_norm": 0.2505118250846863, + "learning_rate": 5.733483358187231e-05, + "loss": 1.7467, + "step": 15299 + }, + { + "epoch": 4.696132596685083, + "grad_norm": 0.23609887063503265, + "learning_rate": 5.732991675966214e-05, + "loss": 1.7319, + "step": 15300 + }, + { + "epoch": 4.696439533456108, + "grad_norm": 0.2939738631248474, + "learning_rate": 5.732499986501345e-05, + "loss": 1.8676, + "step": 15301 + }, + { + "epoch": 4.696746470227133, + "grad_norm": 0.29868564009666443, + "learning_rate": 5.7320082897974814e-05, + "loss": 1.7541, + "step": 15302 + }, + { + "epoch": 4.697053406998158, + "grad_norm": 0.2366383820772171, + "learning_rate": 5.731516585859482e-05, 
+ "loss": 1.7531, + "step": 15303 + }, + { + "epoch": 4.6973603437691835, + "grad_norm": 0.2721317410469055, + "learning_rate": 5.731024874692208e-05, + "loss": 1.7444, + "step": 15304 + }, + { + "epoch": 4.697667280540209, + "grad_norm": 0.24925900995731354, + "learning_rate": 5.730533156300517e-05, + "loss": 1.7716, + "step": 15305 + }, + { + "epoch": 4.697974217311234, + "grad_norm": 0.23012754321098328, + "learning_rate": 5.7300414306892704e-05, + "loss": 1.7211, + "step": 15306 + }, + { + "epoch": 4.6982811540822595, + "grad_norm": 0.21274085342884064, + "learning_rate": 5.7295496978633254e-05, + "loss": 1.7853, + "step": 15307 + }, + { + "epoch": 4.698588090853284, + "grad_norm": 0.21799001097679138, + "learning_rate": 5.729057957827544e-05, + "loss": 1.7505, + "step": 15308 + }, + { + "epoch": 4.698895027624309, + "grad_norm": 0.22365793585777283, + "learning_rate": 5.728566210586783e-05, + "loss": 1.7934, + "step": 15309 + }, + { + "epoch": 4.699201964395335, + "grad_norm": 0.23325085639953613, + "learning_rate": 5.728074456145903e-05, + "loss": 1.7354, + "step": 15310 + }, + { + "epoch": 4.69950890116636, + "grad_norm": 0.2175164669752121, + "learning_rate": 5.7275826945097654e-05, + "loss": 1.7541, + "step": 15311 + }, + { + "epoch": 4.699815837937384, + "grad_norm": 0.24657388031482697, + "learning_rate": 5.727090925683231e-05, + "loss": 1.814, + "step": 15312 + }, + { + "epoch": 4.70012277470841, + "grad_norm": 0.2437550574541092, + "learning_rate": 5.726599149671156e-05, + "loss": 1.7234, + "step": 15313 + }, + { + "epoch": 4.700429711479435, + "grad_norm": 0.21053487062454224, + "learning_rate": 5.726107366478402e-05, + "loss": 1.7788, + "step": 15314 + }, + { + "epoch": 4.7007366482504604, + "grad_norm": 0.2007097452878952, + "learning_rate": 5.725615576109831e-05, + "loss": 1.7453, + "step": 15315 + }, + { + "epoch": 4.701043585021486, + "grad_norm": 0.19331564009189606, + "learning_rate": 5.725123778570299e-05, + "loss": 1.7142, + "step": 15316 + 
}, + { + "epoch": 4.701350521792511, + "grad_norm": 0.24291567504405975, + "learning_rate": 5.7246319738646706e-05, + "loss": 1.8081, + "step": 15317 + }, + { + "epoch": 4.701657458563536, + "grad_norm": 0.21423695981502533, + "learning_rate": 5.724140161997804e-05, + "loss": 1.7021, + "step": 15318 + }, + { + "epoch": 4.701964395334561, + "grad_norm": 0.20857618749141693, + "learning_rate": 5.72364834297456e-05, + "loss": 1.7447, + "step": 15319 + }, + { + "epoch": 4.702271332105586, + "grad_norm": 0.2547401487827301, + "learning_rate": 5.7231565167998e-05, + "loss": 1.7505, + "step": 15320 + }, + { + "epoch": 4.702578268876612, + "grad_norm": 0.2729472219944, + "learning_rate": 5.7226646834783825e-05, + "loss": 1.7974, + "step": 15321 + }, + { + "epoch": 4.702885205647637, + "grad_norm": 0.23258371651172638, + "learning_rate": 5.722172843015169e-05, + "loss": 1.7562, + "step": 15322 + }, + { + "epoch": 4.703192142418661, + "grad_norm": 0.23399893939495087, + "learning_rate": 5.72168099541502e-05, + "loss": 1.7674, + "step": 15323 + }, + { + "epoch": 4.703499079189687, + "grad_norm": 0.2678206264972687, + "learning_rate": 5.721189140682797e-05, + "loss": 1.7331, + "step": 15324 + }, + { + "epoch": 4.703806015960712, + "grad_norm": 0.19472146034240723, + "learning_rate": 5.7206972788233593e-05, + "loss": 1.7003, + "step": 15325 + }, + { + "epoch": 4.704112952731737, + "grad_norm": 0.2199394404888153, + "learning_rate": 5.72020540984157e-05, + "loss": 1.7072, + "step": 15326 + }, + { + "epoch": 4.704419889502763, + "grad_norm": 0.219175323843956, + "learning_rate": 5.719713533742287e-05, + "loss": 1.7591, + "step": 15327 + }, + { + "epoch": 4.704726826273788, + "grad_norm": 0.21127547323703766, + "learning_rate": 5.719221650530374e-05, + "loss": 1.8059, + "step": 15328 + }, + { + "epoch": 4.7050337630448125, + "grad_norm": 0.22189834713935852, + "learning_rate": 5.7187297602106905e-05, + "loss": 1.7529, + "step": 15329 + }, + { + "epoch": 4.705340699815838, + 
"grad_norm": 0.19945195317268372, + "learning_rate": 5.7182378627881e-05, + "loss": 1.7133, + "step": 15330 + }, + { + "epoch": 4.705647636586863, + "grad_norm": 0.2177499681711197, + "learning_rate": 5.7177459582674595e-05, + "loss": 1.7451, + "step": 15331 + }, + { + "epoch": 4.7059545733578885, + "grad_norm": 0.19489440321922302, + "learning_rate": 5.717254046653635e-05, + "loss": 1.7499, + "step": 15332 + }, + { + "epoch": 4.706261510128914, + "grad_norm": 0.21366968750953674, + "learning_rate": 5.716762127951485e-05, + "loss": 1.7683, + "step": 15333 + }, + { + "epoch": 4.706568446899938, + "grad_norm": 0.2894177734851837, + "learning_rate": 5.71627020216587e-05, + "loss": 1.8235, + "step": 15334 + }, + { + "epoch": 4.706875383670964, + "grad_norm": 0.22175677120685577, + "learning_rate": 5.7157782693016534e-05, + "loss": 1.7421, + "step": 15335 + }, + { + "epoch": 4.707182320441989, + "grad_norm": 0.23653541505336761, + "learning_rate": 5.715286329363698e-05, + "loss": 1.6937, + "step": 15336 + }, + { + "epoch": 4.707489257213014, + "grad_norm": 0.3015746772289276, + "learning_rate": 5.714794382356863e-05, + "loss": 1.7159, + "step": 15337 + }, + { + "epoch": 4.70779619398404, + "grad_norm": 0.24045881628990173, + "learning_rate": 5.714302428286011e-05, + "loss": 1.7263, + "step": 15338 + }, + { + "epoch": 4.708103130755065, + "grad_norm": 0.19836920499801636, + "learning_rate": 5.7138104671560035e-05, + "loss": 1.7604, + "step": 15339 + }, + { + "epoch": 4.708410067526089, + "grad_norm": 0.2430238276720047, + "learning_rate": 5.7133184989717036e-05, + "loss": 1.7147, + "step": 15340 + }, + { + "epoch": 4.708717004297115, + "grad_norm": 0.19388417899608612, + "learning_rate": 5.712826523737971e-05, + "loss": 1.7153, + "step": 15341 + }, + { + "epoch": 4.70902394106814, + "grad_norm": 0.19648151099681854, + "learning_rate": 5.7123345414596694e-05, + "loss": 1.7373, + "step": 15342 + }, + { + "epoch": 4.709330877839165, + "grad_norm": 0.20326325297355652, + 
"learning_rate": 5.711842552141661e-05, + "loss": 1.7012, + "step": 15343 + }, + { + "epoch": 4.70963781461019, + "grad_norm": 0.20798304677009583, + "learning_rate": 5.711350555788806e-05, + "loss": 1.7134, + "step": 15344 + }, + { + "epoch": 4.709944751381215, + "grad_norm": 0.29318806529045105, + "learning_rate": 5.7108585524059674e-05, + "loss": 1.7661, + "step": 15345 + }, + { + "epoch": 4.7102516881522405, + "grad_norm": 0.273318350315094, + "learning_rate": 5.710366541998009e-05, + "loss": 1.7329, + "step": 15346 + }, + { + "epoch": 4.710558624923266, + "grad_norm": 0.2306031584739685, + "learning_rate": 5.7098745245697925e-05, + "loss": 1.8152, + "step": 15347 + }, + { + "epoch": 4.710865561694291, + "grad_norm": 0.27630630135536194, + "learning_rate": 5.709382500126179e-05, + "loss": 1.7955, + "step": 15348 + }, + { + "epoch": 4.7111724984653165, + "grad_norm": 0.2366025298833847, + "learning_rate": 5.7088904686720326e-05, + "loss": 1.7943, + "step": 15349 + }, + { + "epoch": 4.711479435236341, + "grad_norm": 0.24196656048297882, + "learning_rate": 5.708398430212215e-05, + "loss": 1.698, + "step": 15350 + }, + { + "epoch": 4.711786372007366, + "grad_norm": 0.2770058512687683, + "learning_rate": 5.707906384751588e-05, + "loss": 1.7618, + "step": 15351 + }, + { + "epoch": 4.712093308778392, + "grad_norm": 0.20432323217391968, + "learning_rate": 5.7074143322950157e-05, + "loss": 1.7422, + "step": 15352 + }, + { + "epoch": 4.712400245549417, + "grad_norm": 0.25543150305747986, + "learning_rate": 5.70692227284736e-05, + "loss": 1.7744, + "step": 15353 + }, + { + "epoch": 4.712707182320442, + "grad_norm": 0.24315913021564484, + "learning_rate": 5.7064302064134855e-05, + "loss": 1.7127, + "step": 15354 + }, + { + "epoch": 4.713014119091467, + "grad_norm": 0.23636099696159363, + "learning_rate": 5.705938132998252e-05, + "loss": 1.7725, + "step": 15355 + }, + { + "epoch": 4.713321055862492, + "grad_norm": 0.26809820532798767, + "learning_rate": 
5.705446052606526e-05, + "loss": 1.8338, + "step": 15356 + }, + { + "epoch": 4.713627992633517, + "grad_norm": 0.24969002604484558, + "learning_rate": 5.704953965243167e-05, + "loss": 1.8225, + "step": 15357 + }, + { + "epoch": 4.713934929404543, + "grad_norm": 0.23189692199230194, + "learning_rate": 5.70446187091304e-05, + "loss": 1.7901, + "step": 15358 + }, + { + "epoch": 4.714241866175568, + "grad_norm": 0.22373750805854797, + "learning_rate": 5.703969769621008e-05, + "loss": 1.6919, + "step": 15359 + }, + { + "epoch": 4.714548802946593, + "grad_norm": 0.23963531851768494, + "learning_rate": 5.703477661371934e-05, + "loss": 1.7806, + "step": 15360 + }, + { + "epoch": 4.714855739717618, + "grad_norm": 0.20365150272846222, + "learning_rate": 5.702985546170683e-05, + "loss": 1.7207, + "step": 15361 + }, + { + "epoch": 4.715162676488643, + "grad_norm": 0.245658278465271, + "learning_rate": 5.702493424022114e-05, + "loss": 1.7589, + "step": 15362 + }, + { + "epoch": 4.7154696132596685, + "grad_norm": 0.22633756697177887, + "learning_rate": 5.702001294931094e-05, + "loss": 1.7893, + "step": 15363 + }, + { + "epoch": 4.715776550030694, + "grad_norm": 0.21587726473808289, + "learning_rate": 5.701509158902487e-05, + "loss": 1.8095, + "step": 15364 + }, + { + "epoch": 4.716083486801719, + "grad_norm": 0.22553963959217072, + "learning_rate": 5.701017015941155e-05, + "loss": 1.7419, + "step": 15365 + }, + { + "epoch": 4.716390423572744, + "grad_norm": 0.2276087999343872, + "learning_rate": 5.700524866051962e-05, + "loss": 1.7052, + "step": 15366 + }, + { + "epoch": 4.716697360343769, + "grad_norm": 0.22236761450767517, + "learning_rate": 5.700032709239771e-05, + "loss": 1.8612, + "step": 15367 + }, + { + "epoch": 4.717004297114794, + "grad_norm": 0.22816185653209686, + "learning_rate": 5.6995405455094465e-05, + "loss": 1.78, + "step": 15368 + }, + { + "epoch": 4.71731123388582, + "grad_norm": 0.21597479283809662, + "learning_rate": 5.6990483748658516e-05, + "loss": 1.8276, 
+ "step": 15369 + }, + { + "epoch": 4.717618170656845, + "grad_norm": 0.22209586203098297, + "learning_rate": 5.6985561973138533e-05, + "loss": 1.74, + "step": 15370 + }, + { + "epoch": 4.71792510742787, + "grad_norm": 0.24249997735023499, + "learning_rate": 5.6980640128583116e-05, + "loss": 1.8035, + "step": 15371 + }, + { + "epoch": 4.718232044198895, + "grad_norm": 0.23326106369495392, + "learning_rate": 5.6975718215040943e-05, + "loss": 1.7969, + "step": 15372 + }, + { + "epoch": 4.71853898096992, + "grad_norm": 0.215044766664505, + "learning_rate": 5.6970796232560596e-05, + "loss": 1.7345, + "step": 15373 + }, + { + "epoch": 4.718845917740945, + "grad_norm": 0.20231883227825165, + "learning_rate": 5.696587418119078e-05, + "loss": 1.7231, + "step": 15374 + }, + { + "epoch": 4.719152854511971, + "grad_norm": 0.2136038839817047, + "learning_rate": 5.696095206098011e-05, + "loss": 1.7421, + "step": 15375 + }, + { + "epoch": 4.719459791282996, + "grad_norm": 0.2662335932254791, + "learning_rate": 5.6956029871977235e-05, + "loss": 1.7518, + "step": 15376 + }, + { + "epoch": 4.7197667280540205, + "grad_norm": 0.25649648904800415, + "learning_rate": 5.6951107614230783e-05, + "loss": 1.8314, + "step": 15377 + }, + { + "epoch": 4.720073664825046, + "grad_norm": 0.21995560824871063, + "learning_rate": 5.6946185287789425e-05, + "loss": 1.7511, + "step": 15378 + }, + { + "epoch": 4.720380601596071, + "grad_norm": 0.3388935923576355, + "learning_rate": 5.694126289270177e-05, + "loss": 1.7975, + "step": 15379 + }, + { + "epoch": 4.7206875383670965, + "grad_norm": 0.32886409759521484, + "learning_rate": 5.693634042901651e-05, + "loss": 1.7153, + "step": 15380 + }, + { + "epoch": 4.720994475138122, + "grad_norm": 0.21727977693080902, + "learning_rate": 5.693141789678226e-05, + "loss": 1.7095, + "step": 15381 + }, + { + "epoch": 4.721301411909147, + "grad_norm": 0.2680833041667938, + "learning_rate": 5.6926495296047675e-05, + "loss": 1.696, + "step": 15382 + }, + { + "epoch": 
4.721608348680172, + "grad_norm": 0.2645499110221863, + "learning_rate": 5.692157262686141e-05, + "loss": 1.6889, + "step": 15383 + }, + { + "epoch": 4.721915285451197, + "grad_norm": 0.20362348854541779, + "learning_rate": 5.69166498892721e-05, + "loss": 1.7303, + "step": 15384 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.24259062111377716, + "learning_rate": 5.691172708332839e-05, + "loss": 1.7684, + "step": 15385 + }, + { + "epoch": 4.722529158993248, + "grad_norm": 0.24204276502132416, + "learning_rate": 5.690680420907897e-05, + "loss": 1.7728, + "step": 15386 + }, + { + "epoch": 4.722836095764272, + "grad_norm": 0.3038320243358612, + "learning_rate": 5.690188126657244e-05, + "loss": 1.7573, + "step": 15387 + }, + { + "epoch": 4.723143032535297, + "grad_norm": 0.24619868397712708, + "learning_rate": 5.689695825585749e-05, + "loss": 1.754, + "step": 15388 + }, + { + "epoch": 4.723449969306323, + "grad_norm": 0.19441325962543488, + "learning_rate": 5.689203517698276e-05, + "loss": 1.726, + "step": 15389 + }, + { + "epoch": 4.723756906077348, + "grad_norm": 0.2874276340007782, + "learning_rate": 5.688711202999688e-05, + "loss": 1.7704, + "step": 15390 + }, + { + "epoch": 4.724063842848373, + "grad_norm": 0.24488390982151031, + "learning_rate": 5.6882188814948535e-05, + "loss": 1.7477, + "step": 15391 + }, + { + "epoch": 4.724370779619399, + "grad_norm": 0.22674018144607544, + "learning_rate": 5.687726553188636e-05, + "loss": 1.7287, + "step": 15392 + }, + { + "epoch": 4.724677716390423, + "grad_norm": 0.2653258442878723, + "learning_rate": 5.687234218085902e-05, + "loss": 1.7415, + "step": 15393 + }, + { + "epoch": 4.7249846531614486, + "grad_norm": 0.20345374941825867, + "learning_rate": 5.686741876191516e-05, + "loss": 1.764, + "step": 15394 + }, + { + "epoch": 4.725291589932474, + "grad_norm": 0.23193977773189545, + "learning_rate": 5.686249527510345e-05, + "loss": 1.7557, + "step": 15395 + }, + { + "epoch": 4.725598526703499, + "grad_norm": 
0.26426708698272705, + "learning_rate": 5.685757172047253e-05, + "loss": 1.7708, + "step": 15396 + }, + { + "epoch": 4.725905463474525, + "grad_norm": 0.21377156674861908, + "learning_rate": 5.685264809807107e-05, + "loss": 1.6921, + "step": 15397 + }, + { + "epoch": 4.726212400245549, + "grad_norm": 0.21628457307815552, + "learning_rate": 5.684772440794773e-05, + "loss": 1.72, + "step": 15398 + }, + { + "epoch": 4.726519337016574, + "grad_norm": 0.19200581312179565, + "learning_rate": 5.684280065015116e-05, + "loss": 1.7311, + "step": 15399 + }, + { + "epoch": 4.7268262737876, + "grad_norm": 0.22227540612220764, + "learning_rate": 5.683787682473003e-05, + "loss": 1.7451, + "step": 15400 + }, + { + "epoch": 4.727133210558625, + "grad_norm": 0.18053604662418365, + "learning_rate": 5.683295293173299e-05, + "loss": 1.6816, + "step": 15401 + }, + { + "epoch": 4.72744014732965, + "grad_norm": 0.19827169179916382, + "learning_rate": 5.682802897120869e-05, + "loss": 1.7315, + "step": 15402 + }, + { + "epoch": 4.727747084100676, + "grad_norm": 0.2768021821975708, + "learning_rate": 5.682310494320582e-05, + "loss": 1.7714, + "step": 15403 + }, + { + "epoch": 4.7280540208717, + "grad_norm": 0.2613474428653717, + "learning_rate": 5.6818180847773027e-05, + "loss": 1.7332, + "step": 15404 + }, + { + "epoch": 4.7283609576427255, + "grad_norm": 0.21546787023544312, + "learning_rate": 5.681325668495898e-05, + "loss": 1.771, + "step": 15405 + }, + { + "epoch": 4.728667894413751, + "grad_norm": 0.24442137777805328, + "learning_rate": 5.680833245481234e-05, + "loss": 1.7296, + "step": 15406 + }, + { + "epoch": 4.728974831184776, + "grad_norm": 0.2622109055519104, + "learning_rate": 5.680340815738175e-05, + "loss": 1.7778, + "step": 15407 + }, + { + "epoch": 4.7292817679558015, + "grad_norm": 0.22379513084888458, + "learning_rate": 5.6798483792715904e-05, + "loss": 1.7953, + "step": 15408 + }, + { + "epoch": 4.729588704726826, + "grad_norm": 0.21901065111160278, + "learning_rate": 
5.679355936086346e-05, + "loss": 1.7287, + "step": 15409 + }, + { + "epoch": 4.729895641497851, + "grad_norm": 0.3023792505264282, + "learning_rate": 5.6788634861873066e-05, + "loss": 1.7851, + "step": 15410 + }, + { + "epoch": 4.730202578268877, + "grad_norm": 0.23882482945919037, + "learning_rate": 5.678371029579342e-05, + "loss": 1.7621, + "step": 15411 + }, + { + "epoch": 4.730509515039902, + "grad_norm": 0.2661043703556061, + "learning_rate": 5.6778785662673175e-05, + "loss": 1.7453, + "step": 15412 + }, + { + "epoch": 4.730816451810927, + "grad_norm": 0.330208957195282, + "learning_rate": 5.677386096256099e-05, + "loss": 1.761, + "step": 15413 + }, + { + "epoch": 4.731123388581953, + "grad_norm": 0.2686570882797241, + "learning_rate": 5.676893619550552e-05, + "loss": 1.7539, + "step": 15414 + }, + { + "epoch": 4.731430325352977, + "grad_norm": 0.24308046698570251, + "learning_rate": 5.676401136155548e-05, + "loss": 1.7345, + "step": 15415 + }, + { + "epoch": 4.731737262124002, + "grad_norm": 0.4137137830257416, + "learning_rate": 5.67590864607595e-05, + "loss": 1.7688, + "step": 15416 + }, + { + "epoch": 4.732044198895028, + "grad_norm": 0.32161539793014526, + "learning_rate": 5.675416149316628e-05, + "loss": 1.7881, + "step": 15417 + }, + { + "epoch": 4.732351135666053, + "grad_norm": 0.2336999475955963, + "learning_rate": 5.674923645882447e-05, + "loss": 1.755, + "step": 15418 + }, + { + "epoch": 4.7326580724370775, + "grad_norm": 0.32781684398651123, + "learning_rate": 5.6744311357782754e-05, + "loss": 1.8062, + "step": 15419 + }, + { + "epoch": 4.732965009208103, + "grad_norm": 0.2475704401731491, + "learning_rate": 5.6739386190089795e-05, + "loss": 1.725, + "step": 15420 + }, + { + "epoch": 4.733271945979128, + "grad_norm": 0.26295650005340576, + "learning_rate": 5.673446095579427e-05, + "loss": 1.7673, + "step": 15421 + }, + { + "epoch": 4.7335788827501535, + "grad_norm": 0.3454873859882355, + "learning_rate": 5.6729535654944864e-05, + "loss": 1.7523, + 
"step": 15422 + }, + { + "epoch": 4.733885819521179, + "grad_norm": 0.2306666374206543, + "learning_rate": 5.672461028759024e-05, + "loss": 1.7085, + "step": 15423 + }, + { + "epoch": 4.734192756292204, + "grad_norm": 0.30825871229171753, + "learning_rate": 5.671968485377908e-05, + "loss": 1.7642, + "step": 15424 + }, + { + "epoch": 4.734499693063229, + "grad_norm": 0.42611342668533325, + "learning_rate": 5.6714759353560045e-05, + "loss": 1.7832, + "step": 15425 + }, + { + "epoch": 4.734806629834254, + "grad_norm": 0.29502514004707336, + "learning_rate": 5.670983378698182e-05, + "loss": 1.8153, + "step": 15426 + }, + { + "epoch": 4.735113566605279, + "grad_norm": 0.28416305780410767, + "learning_rate": 5.6704908154093096e-05, + "loss": 1.756, + "step": 15427 + }, + { + "epoch": 4.735420503376305, + "grad_norm": 0.43111103773117065, + "learning_rate": 5.6699982454942534e-05, + "loss": 1.7797, + "step": 15428 + }, + { + "epoch": 4.73572744014733, + "grad_norm": 0.27667397260665894, + "learning_rate": 5.669505668957882e-05, + "loss": 1.7316, + "step": 15429 + }, + { + "epoch": 4.736034376918354, + "grad_norm": 0.3045295774936676, + "learning_rate": 5.669013085805063e-05, + "loss": 1.7591, + "step": 15430 + }, + { + "epoch": 4.73634131368938, + "grad_norm": 0.4494635760784149, + "learning_rate": 5.6685204960406635e-05, + "loss": 1.8295, + "step": 15431 + }, + { + "epoch": 4.736648250460405, + "grad_norm": 0.2951449453830719, + "learning_rate": 5.6680278996695544e-05, + "loss": 1.7857, + "step": 15432 + }, + { + "epoch": 4.73695518723143, + "grad_norm": 0.2714167535305023, + "learning_rate": 5.6675352966966014e-05, + "loss": 1.816, + "step": 15433 + }, + { + "epoch": 4.737262124002456, + "grad_norm": 0.32701000571250916, + "learning_rate": 5.667042687126673e-05, + "loss": 1.7637, + "step": 15434 + }, + { + "epoch": 4.737569060773481, + "grad_norm": 0.2466556429862976, + "learning_rate": 5.666550070964638e-05, + "loss": 1.7805, + "step": 15435 + }, + { + "epoch": 
4.7378759975445055, + "grad_norm": 0.3283855617046356, + "learning_rate": 5.666057448215365e-05, + "loss": 1.786, + "step": 15436 + }, + { + "epoch": 4.738182934315531, + "grad_norm": 0.35860660672187805, + "learning_rate": 5.6655648188837205e-05, + "loss": 1.8309, + "step": 15437 + }, + { + "epoch": 4.738489871086556, + "grad_norm": 0.22293898463249207, + "learning_rate": 5.665072182974576e-05, + "loss": 1.7317, + "step": 15438 + }, + { + "epoch": 4.7387968078575815, + "grad_norm": 0.3155089020729065, + "learning_rate": 5.664579540492798e-05, + "loss": 1.7202, + "step": 15439 + }, + { + "epoch": 4.739103744628607, + "grad_norm": 0.28723904490470886, + "learning_rate": 5.6640868914432566e-05, + "loss": 1.7788, + "step": 15440 + }, + { + "epoch": 4.739410681399631, + "grad_norm": 0.2461984008550644, + "learning_rate": 5.6635942358308183e-05, + "loss": 1.8504, + "step": 15441 + }, + { + "epoch": 4.739717618170657, + "grad_norm": 0.2503122091293335, + "learning_rate": 5.663101573660351e-05, + "loss": 1.7375, + "step": 15442 + }, + { + "epoch": 4.740024554941682, + "grad_norm": 0.24925372004508972, + "learning_rate": 5.662608904936727e-05, + "loss": 1.7152, + "step": 15443 + }, + { + "epoch": 4.740331491712707, + "grad_norm": 0.2734573483467102, + "learning_rate": 5.662116229664813e-05, + "loss": 1.7476, + "step": 15444 + }, + { + "epoch": 4.740638428483733, + "grad_norm": 0.38122060894966125, + "learning_rate": 5.661623547849479e-05, + "loss": 1.7682, + "step": 15445 + }, + { + "epoch": 4.740945365254758, + "grad_norm": 0.3786417245864868, + "learning_rate": 5.661130859495593e-05, + "loss": 1.7446, + "step": 15446 + }, + { + "epoch": 4.741252302025782, + "grad_norm": 0.22618255019187927, + "learning_rate": 5.6606381646080244e-05, + "loss": 1.7427, + "step": 15447 + }, + { + "epoch": 4.741559238796808, + "grad_norm": 0.3000899851322174, + "learning_rate": 5.6601454631916405e-05, + "loss": 1.7087, + "step": 15448 + }, + { + "epoch": 4.741866175567833, + "grad_norm": 
0.36542513966560364, + "learning_rate": 5.659652755251315e-05, + "loss": 1.7985, + "step": 15449 + }, + { + "epoch": 4.742173112338858, + "grad_norm": 0.23550496995449066, + "learning_rate": 5.659160040791912e-05, + "loss": 1.8163, + "step": 15450 + }, + { + "epoch": 4.742480049109884, + "grad_norm": 0.25615251064300537, + "learning_rate": 5.658667319818305e-05, + "loss": 1.7372, + "step": 15451 + }, + { + "epoch": 4.742786985880908, + "grad_norm": 0.28744083642959595, + "learning_rate": 5.6581745923353615e-05, + "loss": 1.7193, + "step": 15452 + }, + { + "epoch": 4.7430939226519335, + "grad_norm": 0.2500229775905609, + "learning_rate": 5.65768185834795e-05, + "loss": 1.7263, + "step": 15453 + }, + { + "epoch": 4.743400859422959, + "grad_norm": 0.21520425379276276, + "learning_rate": 5.6571891178609394e-05, + "loss": 1.7337, + "step": 15454 + }, + { + "epoch": 4.743707796193984, + "grad_norm": 0.212506502866745, + "learning_rate": 5.656696370879202e-05, + "loss": 1.7672, + "step": 15455 + }, + { + "epoch": 4.7440147329650095, + "grad_norm": 0.21143417060375214, + "learning_rate": 5.656203617407607e-05, + "loss": 1.7189, + "step": 15456 + }, + { + "epoch": 4.744321669736035, + "grad_norm": 0.18320922553539276, + "learning_rate": 5.6557108574510243e-05, + "loss": 1.7521, + "step": 15457 + }, + { + "epoch": 4.744628606507059, + "grad_norm": 0.19202999770641327, + "learning_rate": 5.655218091014321e-05, + "loss": 1.6756, + "step": 15458 + }, + { + "epoch": 4.744935543278085, + "grad_norm": 0.2152331918478012, + "learning_rate": 5.654725318102367e-05, + "loss": 1.7653, + "step": 15459 + }, + { + "epoch": 4.74524248004911, + "grad_norm": 0.24565903842449188, + "learning_rate": 5.6542325387200354e-05, + "loss": 1.7654, + "step": 15460 + }, + { + "epoch": 4.745549416820135, + "grad_norm": 0.2504819333553314, + "learning_rate": 5.653739752872195e-05, + "loss": 1.7073, + "step": 15461 + }, + { + "epoch": 4.74585635359116, + "grad_norm": 0.19258706271648407, + 
"learning_rate": 5.653246960563714e-05, + "loss": 1.7106, + "step": 15462 + }, + { + "epoch": 4.746163290362185, + "grad_norm": 0.22961968183517456, + "learning_rate": 5.652754161799465e-05, + "loss": 1.7868, + "step": 15463 + }, + { + "epoch": 4.74647022713321, + "grad_norm": 0.2763231098651886, + "learning_rate": 5.652261356584315e-05, + "loss": 1.7714, + "step": 15464 + }, + { + "epoch": 4.746777163904236, + "grad_norm": 0.23866096138954163, + "learning_rate": 5.651768544923136e-05, + "loss": 1.7537, + "step": 15465 + }, + { + "epoch": 4.747084100675261, + "grad_norm": 0.21851976215839386, + "learning_rate": 5.6512757268207997e-05, + "loss": 1.8109, + "step": 15466 + }, + { + "epoch": 4.747391037446286, + "grad_norm": 0.22249393165111542, + "learning_rate": 5.6507829022821745e-05, + "loss": 1.7357, + "step": 15467 + }, + { + "epoch": 4.747697974217311, + "grad_norm": 0.20202289521694183, + "learning_rate": 5.650290071312131e-05, + "loss": 1.7867, + "step": 15468 + }, + { + "epoch": 4.748004910988336, + "grad_norm": 0.20618727803230286, + "learning_rate": 5.649797233915539e-05, + "loss": 1.6904, + "step": 15469 + }, + { + "epoch": 4.7483118477593615, + "grad_norm": 0.25609052181243896, + "learning_rate": 5.649304390097272e-05, + "loss": 1.7287, + "step": 15470 + }, + { + "epoch": 4.748618784530387, + "grad_norm": 0.22966544330120087, + "learning_rate": 5.648811539862195e-05, + "loss": 1.7384, + "step": 15471 + }, + { + "epoch": 4.748925721301412, + "grad_norm": 0.24070143699645996, + "learning_rate": 5.6483186832151856e-05, + "loss": 1.7625, + "step": 15472 + }, + { + "epoch": 4.749232658072437, + "grad_norm": 0.22642426192760468, + "learning_rate": 5.647825820161109e-05, + "loss": 1.7291, + "step": 15473 + }, + { + "epoch": 4.749539594843462, + "grad_norm": 0.23255646228790283, + "learning_rate": 5.64733295070484e-05, + "loss": 1.8076, + "step": 15474 + }, + { + "epoch": 4.749846531614487, + "grad_norm": 0.20902042090892792, + "learning_rate": 
5.646840074851246e-05, + "loss": 1.6627, + "step": 15475 + }, + { + "epoch": 4.750153468385513, + "grad_norm": 0.21608836948871613, + "learning_rate": 5.646347192605198e-05, + "loss": 1.7458, + "step": 15476 + }, + { + "epoch": 4.750460405156538, + "grad_norm": 0.22368495166301727, + "learning_rate": 5.6458543039715694e-05, + "loss": 1.7601, + "step": 15477 + }, + { + "epoch": 4.750767341927563, + "grad_norm": 0.30586308240890503, + "learning_rate": 5.645361408955231e-05, + "loss": 1.8389, + "step": 15478 + }, + { + "epoch": 4.751074278698588, + "grad_norm": 0.25122150778770447, + "learning_rate": 5.644868507561052e-05, + "loss": 1.7509, + "step": 15479 + }, + { + "epoch": 4.751381215469613, + "grad_norm": 0.28435763716697693, + "learning_rate": 5.644375599793904e-05, + "loss": 1.7723, + "step": 15480 + }, + { + "epoch": 4.7516881522406385, + "grad_norm": 0.3111409842967987, + "learning_rate": 5.643882685658659e-05, + "loss": 1.7973, + "step": 15481 + }, + { + "epoch": 4.751995089011664, + "grad_norm": 0.3108380138874054, + "learning_rate": 5.6433897651601874e-05, + "loss": 1.8126, + "step": 15482 + }, + { + "epoch": 4.752302025782689, + "grad_norm": 0.25894731283187866, + "learning_rate": 5.642896838303362e-05, + "loss": 1.7849, + "step": 15483 + }, + { + "epoch": 4.752608962553714, + "grad_norm": 0.39321839809417725, + "learning_rate": 5.642403905093052e-05, + "loss": 1.7583, + "step": 15484 + }, + { + "epoch": 4.752915899324739, + "grad_norm": 0.3206121027469635, + "learning_rate": 5.6419109655341315e-05, + "loss": 1.8061, + "step": 15485 + }, + { + "epoch": 4.753222836095764, + "grad_norm": 0.2817624807357788, + "learning_rate": 5.64141801963147e-05, + "loss": 1.8252, + "step": 15486 + }, + { + "epoch": 4.75352977286679, + "grad_norm": 0.3344736397266388, + "learning_rate": 5.6409250673899405e-05, + "loss": 1.6975, + "step": 15487 + }, + { + "epoch": 4.753836709637815, + "grad_norm": 0.21873882412910461, + "learning_rate": 5.640432108814413e-05, + "loss": 
1.7126, + "step": 15488 + }, + { + "epoch": 4.75414364640884, + "grad_norm": 0.3317199945449829, + "learning_rate": 5.639939143909758e-05, + "loss": 1.7826, + "step": 15489 + }, + { + "epoch": 4.754450583179865, + "grad_norm": 0.34901630878448486, + "learning_rate": 5.639446172680854e-05, + "loss": 1.7411, + "step": 15490 + }, + { + "epoch": 4.75475751995089, + "grad_norm": 0.24015867710113525, + "learning_rate": 5.6389531951325645e-05, + "loss": 1.7514, + "step": 15491 + }, + { + "epoch": 4.755064456721915, + "grad_norm": 0.28364554047584534, + "learning_rate": 5.6384602112697674e-05, + "loss": 1.7569, + "step": 15492 + }, + { + "epoch": 4.755371393492941, + "grad_norm": 0.3561246693134308, + "learning_rate": 5.637967221097329e-05, + "loss": 1.7212, + "step": 15493 + }, + { + "epoch": 4.755678330263965, + "grad_norm": 0.3383684456348419, + "learning_rate": 5.637474224620126e-05, + "loss": 1.6866, + "step": 15494 + }, + { + "epoch": 4.7559852670349905, + "grad_norm": 0.2399235963821411, + "learning_rate": 5.63698122184303e-05, + "loss": 1.7609, + "step": 15495 + }, + { + "epoch": 4.756292203806016, + "grad_norm": 0.38559645414352417, + "learning_rate": 5.636488212770912e-05, + "loss": 1.7509, + "step": 15496 + }, + { + "epoch": 4.756599140577041, + "grad_norm": 0.365005224943161, + "learning_rate": 5.635995197408645e-05, + "loss": 1.7894, + "step": 15497 + }, + { + "epoch": 4.7569060773480665, + "grad_norm": 0.21254757046699524, + "learning_rate": 5.635502175761099e-05, + "loss": 1.6969, + "step": 15498 + }, + { + "epoch": 4.757213014119092, + "grad_norm": 0.42865821719169617, + "learning_rate": 5.635009147833149e-05, + "loss": 1.7989, + "step": 15499 + }, + { + "epoch": 4.757519950890116, + "grad_norm": 0.35717228055000305, + "learning_rate": 5.634516113629665e-05, + "loss": 1.7338, + "step": 15500 + }, + { + "epoch": 4.757826887661142, + "grad_norm": 0.21582463383674622, + "learning_rate": 5.634023073155523e-05, + "loss": 1.7429, + "step": 15501 + }, + { + 
"epoch": 4.758133824432167, + "grad_norm": 0.3376842141151428, + "learning_rate": 5.633530026415592e-05, + "loss": 1.7703, + "step": 15502 + }, + { + "epoch": 4.758440761203192, + "grad_norm": 0.2760981023311615, + "learning_rate": 5.633036973414747e-05, + "loss": 1.7389, + "step": 15503 + }, + { + "epoch": 4.758747697974218, + "grad_norm": 0.3808997571468353, + "learning_rate": 5.63254391415786e-05, + "loss": 1.7513, + "step": 15504 + }, + { + "epoch": 4.759054634745242, + "grad_norm": 0.5152496695518494, + "learning_rate": 5.6320508486498014e-05, + "loss": 1.7376, + "step": 15505 + }, + { + "epoch": 4.759361571516267, + "grad_norm": 0.33983346819877625, + "learning_rate": 5.6315577768954464e-05, + "loss": 1.7209, + "step": 15506 + }, + { + "epoch": 4.759668508287293, + "grad_norm": 0.27064043283462524, + "learning_rate": 5.631064698899669e-05, + "loss": 1.7808, + "step": 15507 + }, + { + "epoch": 4.759975445058318, + "grad_norm": 0.3659237027168274, + "learning_rate": 5.630571614667339e-05, + "loss": 1.7706, + "step": 15508 + }, + { + "epoch": 4.760282381829343, + "grad_norm": 0.246379554271698, + "learning_rate": 5.63007852420333e-05, + "loss": 1.7425, + "step": 15509 + }, + { + "epoch": 4.760589318600369, + "grad_norm": 0.2683795392513275, + "learning_rate": 5.629585427512518e-05, + "loss": 1.7332, + "step": 15510 + }, + { + "epoch": 4.760896255371393, + "grad_norm": 0.32626205682754517, + "learning_rate": 5.6290923245997704e-05, + "loss": 1.786, + "step": 15511 + }, + { + "epoch": 4.7612031921424185, + "grad_norm": 0.23723098635673523, + "learning_rate": 5.6285992154699666e-05, + "loss": 1.7305, + "step": 15512 + }, + { + "epoch": 4.761510128913444, + "grad_norm": 0.26316091418266296, + "learning_rate": 5.628106100127976e-05, + "loss": 1.7804, + "step": 15513 + }, + { + "epoch": 4.761817065684469, + "grad_norm": 0.24376356601715088, + "learning_rate": 5.6276129785786726e-05, + "loss": 1.738, + "step": 15514 + }, + { + "epoch": 4.7621240024554945, + 
"grad_norm": 0.27778422832489014, + "learning_rate": 5.627119850826931e-05, + "loss": 1.7444, + "step": 15515 + }, + { + "epoch": 4.762430939226519, + "grad_norm": 0.3134306073188782, + "learning_rate": 5.6266267168776224e-05, + "loss": 1.7696, + "step": 15516 + }, + { + "epoch": 4.762737875997544, + "grad_norm": 0.2354283481836319, + "learning_rate": 5.6261335767356195e-05, + "loss": 1.799, + "step": 15517 + }, + { + "epoch": 4.76304481276857, + "grad_norm": 0.26902756094932556, + "learning_rate": 5.6256404304058e-05, + "loss": 1.7091, + "step": 15518 + }, + { + "epoch": 4.763351749539595, + "grad_norm": 0.2760716676712036, + "learning_rate": 5.6251472778930345e-05, + "loss": 1.742, + "step": 15519 + }, + { + "epoch": 4.76365868631062, + "grad_norm": 0.2138829231262207, + "learning_rate": 5.624654119202197e-05, + "loss": 1.7093, + "step": 15520 + }, + { + "epoch": 4.763965623081646, + "grad_norm": 0.31404614448547363, + "learning_rate": 5.624160954338162e-05, + "loss": 1.7467, + "step": 15521 + }, + { + "epoch": 4.76427255985267, + "grad_norm": 0.24810083210468292, + "learning_rate": 5.623667783305803e-05, + "loss": 1.745, + "step": 15522 + }, + { + "epoch": 4.764579496623695, + "grad_norm": 0.23674242198467255, + "learning_rate": 5.6231746061099913e-05, + "loss": 1.7662, + "step": 15523 + }, + { + "epoch": 4.764886433394721, + "grad_norm": 0.264230877161026, + "learning_rate": 5.622681422755606e-05, + "loss": 1.7627, + "step": 15524 + }, + { + "epoch": 4.765193370165746, + "grad_norm": 0.2982041537761688, + "learning_rate": 5.6221882332475165e-05, + "loss": 1.7558, + "step": 15525 + }, + { + "epoch": 4.765500306936771, + "grad_norm": 0.29215967655181885, + "learning_rate": 5.6216950375905975e-05, + "loss": 1.7981, + "step": 15526 + }, + { + "epoch": 4.765807243707796, + "grad_norm": 0.20014487206935883, + "learning_rate": 5.6212018357897244e-05, + "loss": 1.7113, + "step": 15527 + }, + { + "epoch": 4.766114180478821, + "grad_norm": 0.22359825670719147, + 
"learning_rate": 5.620708627849769e-05, + "loss": 1.7356, + "step": 15528 + }, + { + "epoch": 4.7664211172498465, + "grad_norm": 0.2254783809185028, + "learning_rate": 5.620215413775609e-05, + "loss": 1.7397, + "step": 15529 + }, + { + "epoch": 4.766728054020872, + "grad_norm": 0.2827560305595398, + "learning_rate": 5.619722193572117e-05, + "loss": 1.732, + "step": 15530 + }, + { + "epoch": 4.767034990791897, + "grad_norm": 0.22591307759284973, + "learning_rate": 5.619228967244165e-05, + "loss": 1.7713, + "step": 15531 + }, + { + "epoch": 4.7673419275629225, + "grad_norm": 0.25872737169265747, + "learning_rate": 5.618735734796632e-05, + "loss": 1.7291, + "step": 15532 + }, + { + "epoch": 4.767648864333947, + "grad_norm": 0.24515275657176971, + "learning_rate": 5.6182424962343884e-05, + "loss": 1.8079, + "step": 15533 + }, + { + "epoch": 4.767955801104972, + "grad_norm": 0.2456643134355545, + "learning_rate": 5.617749251562309e-05, + "loss": 1.7082, + "step": 15534 + }, + { + "epoch": 4.768262737875998, + "grad_norm": 0.21684220433235168, + "learning_rate": 5.6172560007852716e-05, + "loss": 1.7563, + "step": 15535 + }, + { + "epoch": 4.768569674647023, + "grad_norm": 0.2141445428133011, + "learning_rate": 5.616762743908147e-05, + "loss": 1.7115, + "step": 15536 + }, + { + "epoch": 4.768876611418047, + "grad_norm": 0.22502638399600983, + "learning_rate": 5.616269480935812e-05, + "loss": 1.723, + "step": 15537 + }, + { + "epoch": 4.769183548189073, + "grad_norm": 0.23387989401817322, + "learning_rate": 5.6157762118731416e-05, + "loss": 1.7775, + "step": 15538 + }, + { + "epoch": 4.769490484960098, + "grad_norm": 0.19615057110786438, + "learning_rate": 5.6152829367250096e-05, + "loss": 1.7696, + "step": 15539 + }, + { + "epoch": 4.769797421731123, + "grad_norm": 0.2408154010772705, + "learning_rate": 5.614789655496289e-05, + "loss": 1.7758, + "step": 15540 + }, + { + "epoch": 4.770104358502149, + "grad_norm": 0.20994634926319122, + "learning_rate": 
5.614296368191859e-05, + "loss": 1.6935, + "step": 15541 + }, + { + "epoch": 4.770411295273174, + "grad_norm": 0.24135129153728485, + "learning_rate": 5.613803074816591e-05, + "loss": 1.7644, + "step": 15542 + }, + { + "epoch": 4.7707182320441985, + "grad_norm": 0.2380143105983734, + "learning_rate": 5.6133097753753625e-05, + "loss": 1.741, + "step": 15543 + }, + { + "epoch": 4.771025168815224, + "grad_norm": 0.30300623178482056, + "learning_rate": 5.6128164698730465e-05, + "loss": 1.7935, + "step": 15544 + }, + { + "epoch": 4.771332105586249, + "grad_norm": 0.2620760500431061, + "learning_rate": 5.612323158314519e-05, + "loss": 1.7436, + "step": 15545 + }, + { + "epoch": 4.7716390423572745, + "grad_norm": 0.3791491389274597, + "learning_rate": 5.6118298407046544e-05, + "loss": 1.7503, + "step": 15546 + }, + { + "epoch": 4.7719459791283, + "grad_norm": 0.3830909729003906, + "learning_rate": 5.61133651704833e-05, + "loss": 1.7651, + "step": 15547 + }, + { + "epoch": 4.772252915899324, + "grad_norm": 0.26680612564086914, + "learning_rate": 5.610843187350419e-05, + "loss": 1.8075, + "step": 15548 + }, + { + "epoch": 4.77255985267035, + "grad_norm": 0.38018953800201416, + "learning_rate": 5.610349851615798e-05, + "loss": 1.8301, + "step": 15549 + }, + { + "epoch": 4.772866789441375, + "grad_norm": 0.4514484107494354, + "learning_rate": 5.6098565098493414e-05, + "loss": 1.7709, + "step": 15550 + }, + { + "epoch": 4.7731737262124, + "grad_norm": 0.28267863392829895, + "learning_rate": 5.6093631620559254e-05, + "loss": 1.8087, + "step": 15551 + }, + { + "epoch": 4.773480662983426, + "grad_norm": 0.22541162371635437, + "learning_rate": 5.6088698082404256e-05, + "loss": 1.7457, + "step": 15552 + }, + { + "epoch": 4.773787599754451, + "grad_norm": 0.3012544512748718, + "learning_rate": 5.608376448407718e-05, + "loss": 1.7454, + "step": 15553 + }, + { + "epoch": 4.774094536525475, + "grad_norm": 0.2460169941186905, + "learning_rate": 5.607883082562677e-05, + "loss": 1.8237, + 
"step": 15554 + }, + { + "epoch": 4.774401473296501, + "grad_norm": 0.2918507158756256, + "learning_rate": 5.6073897107101804e-05, + "loss": 1.7416, + "step": 15555 + }, + { + "epoch": 4.774708410067526, + "grad_norm": 0.3104710280895233, + "learning_rate": 5.6068963328551016e-05, + "loss": 1.8162, + "step": 15556 + }, + { + "epoch": 4.7750153468385514, + "grad_norm": 0.2576459050178528, + "learning_rate": 5.606402949002317e-05, + "loss": 1.7732, + "step": 15557 + }, + { + "epoch": 4.775322283609577, + "grad_norm": 0.2373739629983902, + "learning_rate": 5.605909559156706e-05, + "loss": 1.7812, + "step": 15558 + }, + { + "epoch": 4.775629220380601, + "grad_norm": 0.30436694622039795, + "learning_rate": 5.6054161633231385e-05, + "loss": 1.7606, + "step": 15559 + }, + { + "epoch": 4.775936157151627, + "grad_norm": 0.3058558702468872, + "learning_rate": 5.604922761506495e-05, + "loss": 1.8384, + "step": 15560 + }, + { + "epoch": 4.776243093922652, + "grad_norm": 0.26421624422073364, + "learning_rate": 5.6044293537116496e-05, + "loss": 1.8041, + "step": 15561 + }, + { + "epoch": 4.776550030693677, + "grad_norm": 0.4945085346698761, + "learning_rate": 5.603935939943479e-05, + "loss": 1.7522, + "step": 15562 + }, + { + "epoch": 4.776856967464703, + "grad_norm": 0.41049134731292725, + "learning_rate": 5.6034425202068595e-05, + "loss": 1.7471, + "step": 15563 + }, + { + "epoch": 4.777163904235728, + "grad_norm": 0.22972853481769562, + "learning_rate": 5.602949094506668e-05, + "loss": 1.7041, + "step": 15564 + }, + { + "epoch": 4.777470841006752, + "grad_norm": 0.37373700737953186, + "learning_rate": 5.6024556628477785e-05, + "loss": 1.7811, + "step": 15565 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.3603375554084778, + "learning_rate": 5.6019622252350714e-05, + "loss": 1.8396, + "step": 15566 + }, + { + "epoch": 4.778084714548803, + "grad_norm": 0.2085956335067749, + "learning_rate": 5.601468781673419e-05, + "loss": 1.7453, + "step": 15567 + }, + { + "epoch": 
4.778391651319828, + "grad_norm": 0.28871405124664307, + "learning_rate": 5.6009753321677e-05, + "loss": 1.7135, + "step": 15568 + }, + { + "epoch": 4.778698588090853, + "grad_norm": 0.2378411591053009, + "learning_rate": 5.600481876722791e-05, + "loss": 1.77, + "step": 15569 + }, + { + "epoch": 4.779005524861878, + "grad_norm": 0.2902696430683136, + "learning_rate": 5.599988415343567e-05, + "loss": 1.7416, + "step": 15570 + }, + { + "epoch": 4.7793124616329035, + "grad_norm": 0.36155447363853455, + "learning_rate": 5.5994949480349066e-05, + "loss": 1.7095, + "step": 15571 + }, + { + "epoch": 4.779619398403929, + "grad_norm": 0.24867403507232666, + "learning_rate": 5.599001474801686e-05, + "loss": 1.8063, + "step": 15572 + }, + { + "epoch": 4.779926335174954, + "grad_norm": 0.24853186309337616, + "learning_rate": 5.5985079956487815e-05, + "loss": 1.7537, + "step": 15573 + }, + { + "epoch": 4.7802332719459795, + "grad_norm": 0.31984636187553406, + "learning_rate": 5.598014510581071e-05, + "loss": 1.7888, + "step": 15574 + }, + { + "epoch": 4.780540208717004, + "grad_norm": 0.23907123506069183, + "learning_rate": 5.597521019603429e-05, + "loss": 1.7157, + "step": 15575 + }, + { + "epoch": 4.780847145488029, + "grad_norm": 0.25759413838386536, + "learning_rate": 5.597027522720736e-05, + "loss": 1.7579, + "step": 15576 + }, + { + "epoch": 4.781154082259055, + "grad_norm": 0.34123921394348145, + "learning_rate": 5.5965340199378654e-05, + "loss": 1.838, + "step": 15577 + }, + { + "epoch": 4.78146101903008, + "grad_norm": 0.2769980728626251, + "learning_rate": 5.596040511259697e-05, + "loss": 1.7889, + "step": 15578 + }, + { + "epoch": 4.781767955801105, + "grad_norm": 0.21936915814876556, + "learning_rate": 5.5955469966911066e-05, + "loss": 1.7434, + "step": 15579 + }, + { + "epoch": 4.78207489257213, + "grad_norm": 0.27583181858062744, + "learning_rate": 5.59505347623697e-05, + "loss": 1.7229, + "step": 15580 + }, + { + "epoch": 4.782381829343155, + "grad_norm": 
0.24246171116828918, + "learning_rate": 5.594559949902168e-05, + "loss": 1.7368, + "step": 15581 + }, + { + "epoch": 4.78268876611418, + "grad_norm": 0.22705630958080292, + "learning_rate": 5.594066417691576e-05, + "loss": 1.7261, + "step": 15582 + }, + { + "epoch": 4.782995702885206, + "grad_norm": 0.23308728635311127, + "learning_rate": 5.593572879610072e-05, + "loss": 1.7451, + "step": 15583 + }, + { + "epoch": 4.783302639656231, + "grad_norm": 0.21654267609119415, + "learning_rate": 5.5930793356625324e-05, + "loss": 1.7133, + "step": 15584 + }, + { + "epoch": 4.783609576427256, + "grad_norm": 0.22884133458137512, + "learning_rate": 5.5925857858538347e-05, + "loss": 1.6899, + "step": 15585 + }, + { + "epoch": 4.783916513198281, + "grad_norm": 0.2396838665008545, + "learning_rate": 5.5920922301888555e-05, + "loss": 1.7837, + "step": 15586 + }, + { + "epoch": 4.784223449969306, + "grad_norm": 0.22941450774669647, + "learning_rate": 5.5915986686724765e-05, + "loss": 1.7443, + "step": 15587 + }, + { + "epoch": 4.7845303867403315, + "grad_norm": 0.23992502689361572, + "learning_rate": 5.591105101309572e-05, + "loss": 1.8054, + "step": 15588 + }, + { + "epoch": 4.784837323511357, + "grad_norm": 0.2540588974952698, + "learning_rate": 5.59061152810502e-05, + "loss": 1.855, + "step": 15589 + }, + { + "epoch": 4.785144260282382, + "grad_norm": 0.22691720724105835, + "learning_rate": 5.590117949063699e-05, + "loss": 1.7441, + "step": 15590 + }, + { + "epoch": 4.785451197053407, + "grad_norm": 0.23691289126873016, + "learning_rate": 5.5896243641904864e-05, + "loss": 1.8156, + "step": 15591 + }, + { + "epoch": 4.785758133824432, + "grad_norm": 0.2749332785606384, + "learning_rate": 5.589130773490261e-05, + "loss": 1.8157, + "step": 15592 + }, + { + "epoch": 4.786065070595457, + "grad_norm": 0.2435624748468399, + "learning_rate": 5.588637176967899e-05, + "loss": 1.7473, + "step": 15593 + }, + { + "epoch": 4.786372007366483, + "grad_norm": 0.22931383550167084, + 
"learning_rate": 5.5881435746282795e-05, + "loss": 1.7652, + "step": 15594 + }, + { + "epoch": 4.786678944137508, + "grad_norm": 0.23916593194007874, + "learning_rate": 5.587649966476282e-05, + "loss": 1.7415, + "step": 15595 + }, + { + "epoch": 4.786985880908533, + "grad_norm": 0.23483172059059143, + "learning_rate": 5.5871563525167814e-05, + "loss": 1.7308, + "step": 15596 + }, + { + "epoch": 4.787292817679558, + "grad_norm": 0.24850021302700043, + "learning_rate": 5.586662732754656e-05, + "loss": 1.8294, + "step": 15597 + }, + { + "epoch": 4.787599754450583, + "grad_norm": 0.2439260333776474, + "learning_rate": 5.586169107194788e-05, + "loss": 1.7599, + "step": 15598 + }, + { + "epoch": 4.787906691221608, + "grad_norm": 0.22379007935523987, + "learning_rate": 5.585675475842054e-05, + "loss": 1.7278, + "step": 15599 + }, + { + "epoch": 4.788213627992634, + "grad_norm": 0.2633908689022064, + "learning_rate": 5.58518183870133e-05, + "loss": 1.7318, + "step": 15600 + }, + { + "epoch": 4.788520564763659, + "grad_norm": 0.20992474257946014, + "learning_rate": 5.584688195777497e-05, + "loss": 1.7003, + "step": 15601 + }, + { + "epoch": 4.7888275015346835, + "grad_norm": 0.2460084706544876, + "learning_rate": 5.584194547075432e-05, + "loss": 1.78, + "step": 15602 + }, + { + "epoch": 4.789134438305709, + "grad_norm": 0.23955418169498444, + "learning_rate": 5.583700892600013e-05, + "loss": 1.7953, + "step": 15603 + }, + { + "epoch": 4.789441375076734, + "grad_norm": 0.2495713233947754, + "learning_rate": 5.583207232356121e-05, + "loss": 1.7874, + "step": 15604 + }, + { + "epoch": 4.7897483118477595, + "grad_norm": 0.22878028452396393, + "learning_rate": 5.5827135663486344e-05, + "loss": 1.7961, + "step": 15605 + }, + { + "epoch": 4.790055248618785, + "grad_norm": 0.2299363762140274, + "learning_rate": 5.582219894582429e-05, + "loss": 1.7497, + "step": 15606 + }, + { + "epoch": 4.79036218538981, + "grad_norm": 0.22896108031272888, + "learning_rate": 5.5817262170623865e-05, 
+ "loss": 1.7543, + "step": 15607 + }, + { + "epoch": 4.790669122160835, + "grad_norm": 0.2150495946407318, + "learning_rate": 5.581232533793383e-05, + "loss": 1.8034, + "step": 15608 + }, + { + "epoch": 4.79097605893186, + "grad_norm": 0.21317999064922333, + "learning_rate": 5.580738844780301e-05, + "loss": 1.7482, + "step": 15609 + }, + { + "epoch": 4.791282995702885, + "grad_norm": 0.21904391050338745, + "learning_rate": 5.580245150028016e-05, + "loss": 1.7647, + "step": 15610 + }, + { + "epoch": 4.791589932473911, + "grad_norm": 0.2026481032371521, + "learning_rate": 5.5797514495414095e-05, + "loss": 1.6997, + "step": 15611 + }, + { + "epoch": 4.791896869244935, + "grad_norm": 0.22508487105369568, + "learning_rate": 5.579257743325359e-05, + "loss": 1.8258, + "step": 15612 + }, + { + "epoch": 4.79220380601596, + "grad_norm": 0.2801211178302765, + "learning_rate": 5.5787640313847435e-05, + "loss": 1.6991, + "step": 15613 + }, + { + "epoch": 4.792510742786986, + "grad_norm": 0.2696724236011505, + "learning_rate": 5.578270313724442e-05, + "loss": 1.7339, + "step": 15614 + }, + { + "epoch": 4.792817679558011, + "grad_norm": 0.2909143269062042, + "learning_rate": 5.577776590349334e-05, + "loss": 1.8481, + "step": 15615 + }, + { + "epoch": 4.793124616329036, + "grad_norm": 0.21682757139205933, + "learning_rate": 5.5772828612643005e-05, + "loss": 1.759, + "step": 15616 + }, + { + "epoch": 4.793431553100062, + "grad_norm": 0.23074059188365936, + "learning_rate": 5.576789126474219e-05, + "loss": 1.7652, + "step": 15617 + }, + { + "epoch": 4.793738489871086, + "grad_norm": 0.24018999934196472, + "learning_rate": 5.576295385983969e-05, + "loss": 1.7986, + "step": 15618 + }, + { + "epoch": 4.7940454266421115, + "grad_norm": 0.23987948894500732, + "learning_rate": 5.575801639798431e-05, + "loss": 1.779, + "step": 15619 + }, + { + "epoch": 4.794352363413137, + "grad_norm": 0.2138533890247345, + "learning_rate": 5.575307887922482e-05, + "loss": 1.7097, + "step": 15620 + }, + { 
+ "epoch": 4.794659300184162, + "grad_norm": 0.1995106190443039, + "learning_rate": 5.5748141303610044e-05, + "loss": 1.6924, + "step": 15621 + }, + { + "epoch": 4.7949662369551875, + "grad_norm": 0.23547641932964325, + "learning_rate": 5.574320367118877e-05, + "loss": 1.8492, + "step": 15622 + }, + { + "epoch": 4.795273173726212, + "grad_norm": 0.22931239008903503, + "learning_rate": 5.5738265982009794e-05, + "loss": 1.8054, + "step": 15623 + }, + { + "epoch": 4.795580110497237, + "grad_norm": 0.19957222044467926, + "learning_rate": 5.573332823612191e-05, + "loss": 1.7464, + "step": 15624 + }, + { + "epoch": 4.795887047268263, + "grad_norm": 0.1990327090024948, + "learning_rate": 5.5728390433573905e-05, + "loss": 1.7438, + "step": 15625 + }, + { + "epoch": 4.796193984039288, + "grad_norm": 0.22276802361011505, + "learning_rate": 5.572345257441459e-05, + "loss": 1.7674, + "step": 15626 + }, + { + "epoch": 4.796500920810313, + "grad_norm": 0.2109617441892624, + "learning_rate": 5.571851465869277e-05, + "loss": 1.7577, + "step": 15627 + }, + { + "epoch": 4.796807857581339, + "grad_norm": 0.22917217016220093, + "learning_rate": 5.5713576686457234e-05, + "loss": 1.7478, + "step": 15628 + }, + { + "epoch": 4.797114794352363, + "grad_norm": 0.21016938984394073, + "learning_rate": 5.570863865775678e-05, + "loss": 1.8078, + "step": 15629 + }, + { + "epoch": 4.797421731123388, + "grad_norm": 0.22478216886520386, + "learning_rate": 5.5703700572640215e-05, + "loss": 1.7621, + "step": 15630 + }, + { + "epoch": 4.797728667894414, + "grad_norm": 0.26899904012680054, + "learning_rate": 5.569876243115634e-05, + "loss": 1.8065, + "step": 15631 + }, + { + "epoch": 4.798035604665439, + "grad_norm": 0.23187808692455292, + "learning_rate": 5.569382423335394e-05, + "loss": 1.7337, + "step": 15632 + }, + { + "epoch": 4.798342541436464, + "grad_norm": 0.2264855057001114, + "learning_rate": 5.568888597928185e-05, + "loss": 1.7879, + "step": 15633 + }, + { + "epoch": 4.798649478207489, + 
"grad_norm": 0.244137242436409, + "learning_rate": 5.568394766898886e-05, + "loss": 1.8307, + "step": 15634 + }, + { + "epoch": 4.798956414978514, + "grad_norm": 0.2400583177804947, + "learning_rate": 5.5679009302523744e-05, + "loss": 1.76, + "step": 15635 + }, + { + "epoch": 4.7992633517495396, + "grad_norm": 0.2324059158563614, + "learning_rate": 5.5674070879935347e-05, + "loss": 1.7594, + "step": 15636 + }, + { + "epoch": 4.799570288520565, + "grad_norm": 0.21753786504268646, + "learning_rate": 5.566913240127244e-05, + "loss": 1.7568, + "step": 15637 + }, + { + "epoch": 4.79987722529159, + "grad_norm": 0.21557624638080597, + "learning_rate": 5.566419386658386e-05, + "loss": 1.7733, + "step": 15638 + }, + { + "epoch": 4.800184162062616, + "grad_norm": 0.22795113921165466, + "learning_rate": 5.565925527591839e-05, + "loss": 1.7624, + "step": 15639 + }, + { + "epoch": 4.80049109883364, + "grad_norm": 0.23035180568695068, + "learning_rate": 5.565431662932484e-05, + "loss": 1.7436, + "step": 15640 + }, + { + "epoch": 4.800798035604665, + "grad_norm": 0.2569425404071808, + "learning_rate": 5.564937792685203e-05, + "loss": 1.7027, + "step": 15641 + }, + { + "epoch": 4.801104972375691, + "grad_norm": 0.20544980466365814, + "learning_rate": 5.564443916854875e-05, + "loss": 1.7125, + "step": 15642 + }, + { + "epoch": 4.801411909146716, + "grad_norm": 0.25040850043296814, + "learning_rate": 5.5639500354463815e-05, + "loss": 1.7646, + "step": 15643 + }, + { + "epoch": 4.8017188459177405, + "grad_norm": 0.1991344839334488, + "learning_rate": 5.563456148464602e-05, + "loss": 1.7206, + "step": 15644 + }, + { + "epoch": 4.802025782688766, + "grad_norm": 0.236537903547287, + "learning_rate": 5.56296225591442e-05, + "loss": 1.7288, + "step": 15645 + }, + { + "epoch": 4.802332719459791, + "grad_norm": 0.253619521856308, + "learning_rate": 5.562468357800714e-05, + "loss": 1.7347, + "step": 15646 + }, + { + "epoch": 4.8026396562308165, + "grad_norm": 0.22038741409778595, + 
"learning_rate": 5.561974454128367e-05, + "loss": 1.7854, + "step": 15647 + }, + { + "epoch": 4.802946593001842, + "grad_norm": 0.24848157167434692, + "learning_rate": 5.5614805449022576e-05, + "loss": 1.6904, + "step": 15648 + }, + { + "epoch": 4.803253529772867, + "grad_norm": 0.28735271096229553, + "learning_rate": 5.56098663012727e-05, + "loss": 1.7476, + "step": 15649 + }, + { + "epoch": 4.803560466543892, + "grad_norm": 0.2658432722091675, + "learning_rate": 5.5604927098082825e-05, + "loss": 1.7314, + "step": 15650 + }, + { + "epoch": 4.803867403314917, + "grad_norm": 0.20409154891967773, + "learning_rate": 5.559998783950179e-05, + "loss": 1.7698, + "step": 15651 + }, + { + "epoch": 4.804174340085942, + "grad_norm": 0.21932728588581085, + "learning_rate": 5.5595048525578384e-05, + "loss": 1.7808, + "step": 15652 + }, + { + "epoch": 4.804481276856968, + "grad_norm": 0.2549879848957062, + "learning_rate": 5.559010915636143e-05, + "loss": 1.8294, + "step": 15653 + }, + { + "epoch": 4.804788213627993, + "grad_norm": 0.2002289742231369, + "learning_rate": 5.5585169731899736e-05, + "loss": 1.732, + "step": 15654 + }, + { + "epoch": 4.805095150399017, + "grad_norm": 0.19988931715488434, + "learning_rate": 5.558023025224212e-05, + "loss": 1.7482, + "step": 15655 + }, + { + "epoch": 4.805402087170043, + "grad_norm": 0.21265259385108948, + "learning_rate": 5.55752907174374e-05, + "loss": 1.8003, + "step": 15656 + }, + { + "epoch": 4.805709023941068, + "grad_norm": 0.22365640103816986, + "learning_rate": 5.5570351127534395e-05, + "loss": 1.7536, + "step": 15657 + }, + { + "epoch": 4.806015960712093, + "grad_norm": 0.25516408681869507, + "learning_rate": 5.556541148258192e-05, + "loss": 1.7648, + "step": 15658 + }, + { + "epoch": 4.806322897483119, + "grad_norm": 0.24870765209197998, + "learning_rate": 5.5560471782628775e-05, + "loss": 1.7793, + "step": 15659 + }, + { + "epoch": 4.806629834254144, + "grad_norm": 0.22119416296482086, + "learning_rate": 
5.555553202772379e-05, + "loss": 1.7464, + "step": 15660 + }, + { + "epoch": 4.8069367710251685, + "grad_norm": 0.2781904637813568, + "learning_rate": 5.555059221791579e-05, + "loss": 1.7537, + "step": 15661 + }, + { + "epoch": 4.807243707796194, + "grad_norm": 0.2433774471282959, + "learning_rate": 5.5545652353253574e-05, + "loss": 1.74, + "step": 15662 + }, + { + "epoch": 4.807550644567219, + "grad_norm": 0.19932180643081665, + "learning_rate": 5.554071243378598e-05, + "loss": 1.75, + "step": 15663 + }, + { + "epoch": 4.8078575813382445, + "grad_norm": 0.2428865283727646, + "learning_rate": 5.553577245956182e-05, + "loss": 1.7198, + "step": 15664 + }, + { + "epoch": 4.80816451810927, + "grad_norm": 0.2914198338985443, + "learning_rate": 5.553083243062991e-05, + "loss": 1.7544, + "step": 15665 + }, + { + "epoch": 4.808471454880294, + "grad_norm": 0.2274291068315506, + "learning_rate": 5.5525892347039056e-05, + "loss": 1.8213, + "step": 15666 + }, + { + "epoch": 4.80877839165132, + "grad_norm": 0.23662471771240234, + "learning_rate": 5.552095220883811e-05, + "loss": 1.8025, + "step": 15667 + }, + { + "epoch": 4.809085328422345, + "grad_norm": 0.23062555491924286, + "learning_rate": 5.551601201607587e-05, + "loss": 1.7109, + "step": 15668 + }, + { + "epoch": 4.80939226519337, + "grad_norm": 0.19986943900585175, + "learning_rate": 5.551107176880117e-05, + "loss": 1.7442, + "step": 15669 + }, + { + "epoch": 4.809699201964396, + "grad_norm": 0.2545560300350189, + "learning_rate": 5.5506131467062836e-05, + "loss": 1.7609, + "step": 15670 + }, + { + "epoch": 4.810006138735421, + "grad_norm": 0.253296434879303, + "learning_rate": 5.550119111090968e-05, + "loss": 1.7307, + "step": 15671 + }, + { + "epoch": 4.810313075506445, + "grad_norm": 0.19617940485477448, + "learning_rate": 5.549625070039052e-05, + "loss": 1.7507, + "step": 15672 + }, + { + "epoch": 4.810620012277471, + "grad_norm": 0.2525297999382019, + "learning_rate": 5.5491310235554193e-05, + "loss": 1.8021, + 
"step": 15673 + }, + { + "epoch": 4.810926949048496, + "grad_norm": 0.20537389814853668, + "learning_rate": 5.548636971644953e-05, + "loss": 1.7432, + "step": 15674 + }, + { + "epoch": 4.811233885819521, + "grad_norm": 0.19924211502075195, + "learning_rate": 5.548142914312533e-05, + "loss": 1.7741, + "step": 15675 + }, + { + "epoch": 4.811540822590547, + "grad_norm": 0.21121448278427124, + "learning_rate": 5.547648851563046e-05, + "loss": 1.7198, + "step": 15676 + }, + { + "epoch": 4.811847759361571, + "grad_norm": 0.23504914343357086, + "learning_rate": 5.547154783401369e-05, + "loss": 1.7173, + "step": 15677 + }, + { + "epoch": 4.8121546961325965, + "grad_norm": 0.2362392097711563, + "learning_rate": 5.54666070983239e-05, + "loss": 1.7752, + "step": 15678 + }, + { + "epoch": 4.812461632903622, + "grad_norm": 0.2524966895580292, + "learning_rate": 5.5461666308609886e-05, + "loss": 1.7943, + "step": 15679 + }, + { + "epoch": 4.812768569674647, + "grad_norm": 0.2250952422618866, + "learning_rate": 5.5456725464920476e-05, + "loss": 1.7606, + "step": 15680 + }, + { + "epoch": 4.8130755064456725, + "grad_norm": 0.21753156185150146, + "learning_rate": 5.5451784567304524e-05, + "loss": 1.7846, + "step": 15681 + }, + { + "epoch": 4.813382443216698, + "grad_norm": 0.220795676112175, + "learning_rate": 5.5446843615810825e-05, + "loss": 1.7422, + "step": 15682 + }, + { + "epoch": 4.813689379987722, + "grad_norm": 0.23597733676433563, + "learning_rate": 5.544190261048823e-05, + "loss": 1.7818, + "step": 15683 + }, + { + "epoch": 4.813996316758748, + "grad_norm": 0.2625976502895355, + "learning_rate": 5.543696155138557e-05, + "loss": 1.7796, + "step": 15684 + }, + { + "epoch": 4.814303253529773, + "grad_norm": 0.20515871047973633, + "learning_rate": 5.5432020438551656e-05, + "loss": 1.7096, + "step": 15685 + }, + { + "epoch": 4.814610190300798, + "grad_norm": 0.19353924691677094, + "learning_rate": 5.542707927203536e-05, + "loss": 1.7541, + "step": 15686 + }, + { + "epoch": 
4.814917127071823, + "grad_norm": 0.21998172998428345, + "learning_rate": 5.5422138051885454e-05, + "loss": 1.7696, + "step": 15687 + }, + { + "epoch": 4.815224063842848, + "grad_norm": 0.27576857805252075, + "learning_rate": 5.5417196778150816e-05, + "loss": 1.7491, + "step": 15688 + }, + { + "epoch": 4.815531000613873, + "grad_norm": 0.28202036023139954, + "learning_rate": 5.5412255450880254e-05, + "loss": 1.8615, + "step": 15689 + }, + { + "epoch": 4.815837937384899, + "grad_norm": 0.29632845520973206, + "learning_rate": 5.540731407012263e-05, + "loss": 1.7698, + "step": 15690 + }, + { + "epoch": 4.816144874155924, + "grad_norm": 0.35393890738487244, + "learning_rate": 5.540237263592675e-05, + "loss": 1.7924, + "step": 15691 + }, + { + "epoch": 4.816451810926949, + "grad_norm": 0.23756493628025055, + "learning_rate": 5.5397431148341447e-05, + "loss": 1.8301, + "step": 15692 + }, + { + "epoch": 4.816758747697974, + "grad_norm": 0.310153603553772, + "learning_rate": 5.53924896074156e-05, + "loss": 1.8162, + "step": 15693 + }, + { + "epoch": 4.817065684468999, + "grad_norm": 0.3355565369129181, + "learning_rate": 5.538754801319797e-05, + "loss": 1.7738, + "step": 15694 + }, + { + "epoch": 4.8173726212400245, + "grad_norm": 0.2360079288482666, + "learning_rate": 5.5382606365737446e-05, + "loss": 1.6883, + "step": 15695 + }, + { + "epoch": 4.81767955801105, + "grad_norm": 0.2932819724082947, + "learning_rate": 5.537766466508286e-05, + "loss": 1.8045, + "step": 15696 + }, + { + "epoch": 4.817986494782075, + "grad_norm": 0.31298181414604187, + "learning_rate": 5.537272291128304e-05, + "loss": 1.7516, + "step": 15697 + }, + { + "epoch": 4.8182934315531, + "grad_norm": 0.22871924936771393, + "learning_rate": 5.5367781104386806e-05, + "loss": 1.7386, + "step": 15698 + }, + { + "epoch": 4.818600368324125, + "grad_norm": 0.27097782492637634, + "learning_rate": 5.5362839244443034e-05, + "loss": 1.733, + "step": 15699 + }, + { + "epoch": 4.81890730509515, + "grad_norm": 
0.23296736180782318, + "learning_rate": 5.535789733150052e-05, + "loss": 1.7735, + "step": 15700 + }, + { + "epoch": 4.819214241866176, + "grad_norm": 0.22650237381458282, + "learning_rate": 5.5352955365608125e-05, + "loss": 1.7443, + "step": 15701 + }, + { + "epoch": 4.819521178637201, + "grad_norm": 0.25525161623954773, + "learning_rate": 5.534801334681471e-05, + "loss": 1.7379, + "step": 15702 + }, + { + "epoch": 4.819828115408226, + "grad_norm": 0.2249457836151123, + "learning_rate": 5.534307127516908e-05, + "loss": 1.7393, + "step": 15703 + }, + { + "epoch": 4.820135052179251, + "grad_norm": 0.1995566338300705, + "learning_rate": 5.5338129150720084e-05, + "loss": 1.7411, + "step": 15704 + }, + { + "epoch": 4.820441988950276, + "grad_norm": 0.250851035118103, + "learning_rate": 5.533318697351657e-05, + "loss": 1.7801, + "step": 15705 + }, + { + "epoch": 4.820748925721301, + "grad_norm": 0.3175830543041229, + "learning_rate": 5.532824474360737e-05, + "loss": 1.7553, + "step": 15706 + }, + { + "epoch": 4.821055862492327, + "grad_norm": 0.22842039167881012, + "learning_rate": 5.532330246104134e-05, + "loss": 1.7489, + "step": 15707 + }, + { + "epoch": 4.821362799263352, + "grad_norm": 0.21125485002994537, + "learning_rate": 5.531836012586732e-05, + "loss": 1.7543, + "step": 15708 + }, + { + "epoch": 4.8216697360343765, + "grad_norm": 0.33028700947761536, + "learning_rate": 5.531341773813414e-05, + "loss": 1.8237, + "step": 15709 + }, + { + "epoch": 4.821976672805402, + "grad_norm": 0.324564129114151, + "learning_rate": 5.530847529789067e-05, + "loss": 1.7288, + "step": 15710 + }, + { + "epoch": 4.822283609576427, + "grad_norm": 0.3299528956413269, + "learning_rate": 5.530353280518571e-05, + "loss": 1.7536, + "step": 15711 + }, + { + "epoch": 4.8225905463474525, + "grad_norm": 0.3535030782222748, + "learning_rate": 5.5298590260068136e-05, + "loss": 1.7941, + "step": 15712 + }, + { + "epoch": 4.822897483118478, + "grad_norm": 0.2627669870853424, + "learning_rate": 
5.5293647662586804e-05, + "loss": 1.7638, + "step": 15713 + }, + { + "epoch": 4.823204419889503, + "grad_norm": 0.25569450855255127, + "learning_rate": 5.5288705012790535e-05, + "loss": 1.7396, + "step": 15714 + }, + { + "epoch": 4.823511356660528, + "grad_norm": 0.26099520921707153, + "learning_rate": 5.528376231072817e-05, + "loss": 1.7415, + "step": 15715 + }, + { + "epoch": 4.823818293431553, + "grad_norm": 0.31833693385124207, + "learning_rate": 5.527881955644858e-05, + "loss": 1.7683, + "step": 15716 + }, + { + "epoch": 4.824125230202578, + "grad_norm": 0.2753448188304901, + "learning_rate": 5.5273876750000594e-05, + "loss": 1.6653, + "step": 15717 + }, + { + "epoch": 4.824432166973604, + "grad_norm": 0.23816895484924316, + "learning_rate": 5.526893389143307e-05, + "loss": 1.7575, + "step": 15718 + }, + { + "epoch": 4.824739103744628, + "grad_norm": 0.25376051664352417, + "learning_rate": 5.5263990980794856e-05, + "loss": 1.755, + "step": 15719 + }, + { + "epoch": 4.8250460405156534, + "grad_norm": 0.2483726590871811, + "learning_rate": 5.52590480181348e-05, + "loss": 1.7566, + "step": 15720 + }, + { + "epoch": 4.825352977286679, + "grad_norm": 0.2073517143726349, + "learning_rate": 5.5254105003501746e-05, + "loss": 1.7069, + "step": 15721 + }, + { + "epoch": 4.825659914057704, + "grad_norm": 0.3166659474372864, + "learning_rate": 5.524916193694455e-05, + "loss": 1.7012, + "step": 15722 + }, + { + "epoch": 4.8259668508287294, + "grad_norm": 0.24518641829490662, + "learning_rate": 5.524421881851205e-05, + "loss": 1.7027, + "step": 15723 + }, + { + "epoch": 4.826273787599755, + "grad_norm": 0.23137906193733215, + "learning_rate": 5.523927564825311e-05, + "loss": 1.746, + "step": 15724 + }, + { + "epoch": 4.82658072437078, + "grad_norm": 0.27937051653862, + "learning_rate": 5.5234332426216586e-05, + "loss": 1.7064, + "step": 15725 + }, + { + "epoch": 4.826887661141805, + "grad_norm": 0.26408496499061584, + "learning_rate": 5.522938915245131e-05, + "loss": 
1.6598, + "step": 15726 + }, + { + "epoch": 4.82719459791283, + "grad_norm": 0.22269997000694275, + "learning_rate": 5.5224445827006164e-05, + "loss": 1.7166, + "step": 15727 + }, + { + "epoch": 4.827501534683855, + "grad_norm": 0.22687453031539917, + "learning_rate": 5.5219502449929964e-05, + "loss": 1.7156, + "step": 15728 + }, + { + "epoch": 4.827808471454881, + "grad_norm": 0.26355600357055664, + "learning_rate": 5.5214559021271585e-05, + "loss": 1.8016, + "step": 15729 + }, + { + "epoch": 4.828115408225905, + "grad_norm": 0.30103012919425964, + "learning_rate": 5.520961554107987e-05, + "loss": 1.7856, + "step": 15730 + }, + { + "epoch": 4.82842234499693, + "grad_norm": 0.22604018449783325, + "learning_rate": 5.520467200940369e-05, + "loss": 1.813, + "step": 15731 + }, + { + "epoch": 4.828729281767956, + "grad_norm": 0.25435203313827515, + "learning_rate": 5.51997284262919e-05, + "loss": 1.7511, + "step": 15732 + }, + { + "epoch": 4.829036218538981, + "grad_norm": 0.2740691304206848, + "learning_rate": 5.519478479179333e-05, + "loss": 1.7326, + "step": 15733 + }, + { + "epoch": 4.829343155310006, + "grad_norm": 0.19710861146450043, + "learning_rate": 5.5189841105956866e-05, + "loss": 1.7581, + "step": 15734 + }, + { + "epoch": 4.829650092081032, + "grad_norm": 0.2315293401479721, + "learning_rate": 5.518489736883132e-05, + "loss": 1.6796, + "step": 15735 + }, + { + "epoch": 4.829957028852056, + "grad_norm": 0.2465476542711258, + "learning_rate": 5.51799535804656e-05, + "loss": 1.7276, + "step": 15736 + }, + { + "epoch": 4.8302639656230815, + "grad_norm": 0.20438486337661743, + "learning_rate": 5.5175009740908546e-05, + "loss": 1.7188, + "step": 15737 + }, + { + "epoch": 4.830570902394107, + "grad_norm": 0.24328351020812988, + "learning_rate": 5.5170065850209016e-05, + "loss": 1.7165, + "step": 15738 + }, + { + "epoch": 4.830877839165132, + "grad_norm": 0.22486837208271027, + "learning_rate": 5.516512190841586e-05, + "loss": 1.7369, + "step": 15739 + }, + { + 
"epoch": 4.8311847759361575, + "grad_norm": 0.2065822333097458, + "learning_rate": 5.5160177915577934e-05, + "loss": 1.7125, + "step": 15740 + }, + { + "epoch": 4.831491712707182, + "grad_norm": 0.21223095059394836, + "learning_rate": 5.5155233871744104e-05, + "loss": 1.7319, + "step": 15741 + }, + { + "epoch": 4.831798649478207, + "grad_norm": 0.25712934136390686, + "learning_rate": 5.515028977696325e-05, + "loss": 1.7847, + "step": 15742 + }, + { + "epoch": 4.832105586249233, + "grad_norm": 0.21289978921413422, + "learning_rate": 5.5145345631284215e-05, + "loss": 1.7629, + "step": 15743 + }, + { + "epoch": 4.832412523020258, + "grad_norm": 0.22347134351730347, + "learning_rate": 5.514040143475585e-05, + "loss": 1.7491, + "step": 15744 + }, + { + "epoch": 4.832719459791283, + "grad_norm": 0.20660510659217834, + "learning_rate": 5.513545718742702e-05, + "loss": 1.7377, + "step": 15745 + }, + { + "epoch": 4.833026396562309, + "grad_norm": 0.21612273156642914, + "learning_rate": 5.513051288934658e-05, + "loss": 1.7973, + "step": 15746 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.22515933215618134, + "learning_rate": 5.512556854056342e-05, + "loss": 1.7774, + "step": 15747 + }, + { + "epoch": 4.833640270104358, + "grad_norm": 0.21075554192066193, + "learning_rate": 5.512062414112639e-05, + "loss": 1.7741, + "step": 15748 + }, + { + "epoch": 4.833947206875384, + "grad_norm": 0.2203720659017563, + "learning_rate": 5.511567969108436e-05, + "loss": 1.7902, + "step": 15749 + }, + { + "epoch": 4.834254143646409, + "grad_norm": 0.20247167348861694, + "learning_rate": 5.511073519048616e-05, + "loss": 1.7084, + "step": 15750 + }, + { + "epoch": 4.834561080417434, + "grad_norm": 0.247711181640625, + "learning_rate": 5.5105790639380695e-05, + "loss": 1.8465, + "step": 15751 + }, + { + "epoch": 4.834868017188459, + "grad_norm": 0.22866854071617126, + "learning_rate": 5.51008460378168e-05, + "loss": 1.7252, + "step": 15752 + }, + { + "epoch": 4.835174953959484, + 
"grad_norm": 0.2335643470287323, + "learning_rate": 5.5095901385843374e-05, + "loss": 1.703, + "step": 15753 + }, + { + "epoch": 4.8354818907305095, + "grad_norm": 0.20874348282814026, + "learning_rate": 5.509095668350926e-05, + "loss": 1.7114, + "step": 15754 + }, + { + "epoch": 4.835788827501535, + "grad_norm": 0.19156917929649353, + "learning_rate": 5.5086011930863314e-05, + "loss": 1.6975, + "step": 15755 + }, + { + "epoch": 4.83609576427256, + "grad_norm": 0.23480524122714996, + "learning_rate": 5.508106712795443e-05, + "loss": 1.8291, + "step": 15756 + }, + { + "epoch": 4.8364027010435855, + "grad_norm": 0.20430417358875275, + "learning_rate": 5.5076122274831454e-05, + "loss": 1.7605, + "step": 15757 + }, + { + "epoch": 4.83670963781461, + "grad_norm": 0.26790598034858704, + "learning_rate": 5.5071177371543256e-05, + "loss": 1.7541, + "step": 15758 + }, + { + "epoch": 4.837016574585635, + "grad_norm": 0.3339289724826813, + "learning_rate": 5.506623241813873e-05, + "loss": 1.7566, + "step": 15759 + }, + { + "epoch": 4.837323511356661, + "grad_norm": 0.30528193712234497, + "learning_rate": 5.5061287414666726e-05, + "loss": 1.7371, + "step": 15760 + }, + { + "epoch": 4.837630448127686, + "grad_norm": 0.21059657633304596, + "learning_rate": 5.5056342361176114e-05, + "loss": 1.7599, + "step": 15761 + }, + { + "epoch": 4.83793738489871, + "grad_norm": 0.27918973565101624, + "learning_rate": 5.5051397257715756e-05, + "loss": 1.7485, + "step": 15762 + }, + { + "epoch": 4.838244321669736, + "grad_norm": 0.23147793114185333, + "learning_rate": 5.5046452104334514e-05, + "loss": 1.7121, + "step": 15763 + }, + { + "epoch": 4.838551258440761, + "grad_norm": 0.22028742730617523, + "learning_rate": 5.5041506901081294e-05, + "loss": 1.803, + "step": 15764 + }, + { + "epoch": 4.838858195211786, + "grad_norm": 0.22840891778469086, + "learning_rate": 5.5036561648004946e-05, + "loss": 1.7555, + "step": 15765 + }, + { + "epoch": 4.839165131982812, + "grad_norm": 
0.2610893249511719, + "learning_rate": 5.503161634515433e-05, + "loss": 1.7873, + "step": 15766 + }, + { + "epoch": 4.839472068753837, + "grad_norm": 0.2530003786087036, + "learning_rate": 5.502667099257836e-05, + "loss": 1.7604, + "step": 15767 + }, + { + "epoch": 4.8397790055248615, + "grad_norm": 0.20120400190353394, + "learning_rate": 5.5021725590325854e-05, + "loss": 1.7476, + "step": 15768 + }, + { + "epoch": 4.840085942295887, + "grad_norm": 0.2189723700284958, + "learning_rate": 5.501678013844571e-05, + "loss": 1.7174, + "step": 15769 + }, + { + "epoch": 4.840392879066912, + "grad_norm": 0.2511899173259735, + "learning_rate": 5.501183463698683e-05, + "loss": 1.7589, + "step": 15770 + }, + { + "epoch": 4.8406998158379375, + "grad_norm": 0.24899333715438843, + "learning_rate": 5.5006889085998035e-05, + "loss": 1.7253, + "step": 15771 + }, + { + "epoch": 4.841006752608963, + "grad_norm": 0.21223559975624084, + "learning_rate": 5.5001943485528254e-05, + "loss": 1.6949, + "step": 15772 + }, + { + "epoch": 4.841313689379987, + "grad_norm": 0.21394596993923187, + "learning_rate": 5.499699783562632e-05, + "loss": 1.7827, + "step": 15773 + }, + { + "epoch": 4.841620626151013, + "grad_norm": 0.2379613220691681, + "learning_rate": 5.4992052136341134e-05, + "loss": 1.7968, + "step": 15774 + }, + { + "epoch": 4.841927562922038, + "grad_norm": 0.23748385906219482, + "learning_rate": 5.498710638772154e-05, + "loss": 1.797, + "step": 15775 + }, + { + "epoch": 4.842234499693063, + "grad_norm": 0.2502206265926361, + "learning_rate": 5.498216058981646e-05, + "loss": 1.7292, + "step": 15776 + }, + { + "epoch": 4.842541436464089, + "grad_norm": 0.23613516986370087, + "learning_rate": 5.497721474267475e-05, + "loss": 1.7353, + "step": 15777 + }, + { + "epoch": 4.842848373235114, + "grad_norm": 0.25274696946144104, + "learning_rate": 5.497226884634527e-05, + "loss": 1.7782, + "step": 15778 + }, + { + "epoch": 4.843155310006138, + "grad_norm": 0.19574183225631714, + 
"learning_rate": 5.496732290087694e-05, + "loss": 1.6926, + "step": 15779 + }, + { + "epoch": 4.843462246777164, + "grad_norm": 0.21040405333042145, + "learning_rate": 5.496237690631858e-05, + "loss": 1.7235, + "step": 15780 + }, + { + "epoch": 4.843769183548189, + "grad_norm": 0.22499679028987885, + "learning_rate": 5.495743086271913e-05, + "loss": 1.7889, + "step": 15781 + }, + { + "epoch": 4.844076120319214, + "grad_norm": 0.24623246490955353, + "learning_rate": 5.4952484770127433e-05, + "loss": 1.7357, + "step": 15782 + }, + { + "epoch": 4.84438305709024, + "grad_norm": 0.21706275641918182, + "learning_rate": 5.494753862859238e-05, + "loss": 1.7349, + "step": 15783 + }, + { + "epoch": 4.844689993861264, + "grad_norm": 0.20705166459083557, + "learning_rate": 5.4942592438162855e-05, + "loss": 1.7047, + "step": 15784 + }, + { + "epoch": 4.8449969306322895, + "grad_norm": 0.21216751635074615, + "learning_rate": 5.493764619888773e-05, + "loss": 1.7335, + "step": 15785 + }, + { + "epoch": 4.845303867403315, + "grad_norm": 0.2945895195007324, + "learning_rate": 5.493269991081588e-05, + "loss": 1.838, + "step": 15786 + }, + { + "epoch": 4.84561080417434, + "grad_norm": 0.22013652324676514, + "learning_rate": 5.492775357399621e-05, + "loss": 1.7541, + "step": 15787 + }, + { + "epoch": 4.8459177409453655, + "grad_norm": 0.25428512692451477, + "learning_rate": 5.4922807188477585e-05, + "loss": 1.7405, + "step": 15788 + }, + { + "epoch": 4.846224677716391, + "grad_norm": 0.23189012706279755, + "learning_rate": 5.49178607543089e-05, + "loss": 1.8075, + "step": 15789 + }, + { + "epoch": 4.846531614487415, + "grad_norm": 0.21637389063835144, + "learning_rate": 5.491291427153904e-05, + "loss": 1.7229, + "step": 15790 + }, + { + "epoch": 4.846838551258441, + "grad_norm": 0.20628009736537933, + "learning_rate": 5.490796774021687e-05, + "loss": 1.7605, + "step": 15791 + }, + { + "epoch": 4.847145488029466, + "grad_norm": 0.20845308899879456, + "learning_rate": 
5.4903021160391276e-05, + "loss": 1.7864, + "step": 15792 + }, + { + "epoch": 4.847452424800491, + "grad_norm": 0.20367322862148285, + "learning_rate": 5.4898074532111164e-05, + "loss": 1.733, + "step": 15793 + }, + { + "epoch": 4.847759361571516, + "grad_norm": 0.2066505253314972, + "learning_rate": 5.489312785542543e-05, + "loss": 1.7113, + "step": 15794 + }, + { + "epoch": 4.848066298342541, + "grad_norm": 0.23874987661838531, + "learning_rate": 5.488818113038292e-05, + "loss": 1.7735, + "step": 15795 + }, + { + "epoch": 4.848373235113566, + "grad_norm": 0.26583850383758545, + "learning_rate": 5.488323435703254e-05, + "loss": 1.8019, + "step": 15796 + }, + { + "epoch": 4.848680171884592, + "grad_norm": 0.25207552313804626, + "learning_rate": 5.487828753542317e-05, + "loss": 1.7491, + "step": 15797 + }, + { + "epoch": 4.848987108655617, + "grad_norm": 0.23065905272960663, + "learning_rate": 5.48733406656037e-05, + "loss": 1.7451, + "step": 15798 + }, + { + "epoch": 4.849294045426642, + "grad_norm": 0.26914483308792114, + "learning_rate": 5.486839374762304e-05, + "loss": 1.7553, + "step": 15799 + }, + { + "epoch": 4.849600982197668, + "grad_norm": 0.2509605884552002, + "learning_rate": 5.4863446781530046e-05, + "loss": 1.7124, + "step": 15800 + }, + { + "epoch": 4.849907918968692, + "grad_norm": 0.2618432343006134, + "learning_rate": 5.485849976737362e-05, + "loss": 1.7368, + "step": 15801 + }, + { + "epoch": 4.850214855739718, + "grad_norm": 0.46875160932540894, + "learning_rate": 5.485355270520266e-05, + "loss": 1.7883, + "step": 15802 + }, + { + "epoch": 4.850521792510743, + "grad_norm": 0.37585484981536865, + "learning_rate": 5.4848605595066025e-05, + "loss": 1.7894, + "step": 15803 + }, + { + "epoch": 4.850828729281768, + "grad_norm": 0.2244408279657364, + "learning_rate": 5.4843658437012646e-05, + "loss": 1.7394, + "step": 15804 + }, + { + "epoch": 4.851135666052793, + "grad_norm": 0.4061773419380188, + "learning_rate": 5.48387112310914e-05, + "loss": 
1.7703, + "step": 15805 + }, + { + "epoch": 4.851442602823818, + "grad_norm": 0.35925009846687317, + "learning_rate": 5.483376397735117e-05, + "loss": 1.7798, + "step": 15806 + }, + { + "epoch": 4.851749539594843, + "grad_norm": 0.23050184547901154, + "learning_rate": 5.482881667584084e-05, + "loss": 1.7984, + "step": 15807 + }, + { + "epoch": 4.852056476365869, + "grad_norm": 0.37308645248413086, + "learning_rate": 5.4823869326609335e-05, + "loss": 1.6747, + "step": 15808 + }, + { + "epoch": 4.852363413136894, + "grad_norm": 0.29826754331588745, + "learning_rate": 5.481892192970551e-05, + "loss": 1.7432, + "step": 15809 + }, + { + "epoch": 4.852670349907919, + "grad_norm": 0.23652370274066925, + "learning_rate": 5.4813974485178266e-05, + "loss": 1.7557, + "step": 15810 + }, + { + "epoch": 4.852977286678944, + "grad_norm": 0.40549808740615845, + "learning_rate": 5.4809026993076526e-05, + "loss": 1.7317, + "step": 15811 + }, + { + "epoch": 4.853284223449969, + "grad_norm": 0.3367961347103119, + "learning_rate": 5.4804079453449156e-05, + "loss": 1.7648, + "step": 15812 + }, + { + "epoch": 4.8535911602209945, + "grad_norm": 0.21629661321640015, + "learning_rate": 5.4799131866345055e-05, + "loss": 1.7986, + "step": 15813 + }, + { + "epoch": 4.85389809699202, + "grad_norm": 0.26381492614746094, + "learning_rate": 5.4794184231813105e-05, + "loss": 1.7401, + "step": 15814 + }, + { + "epoch": 4.854205033763045, + "grad_norm": 0.22319363057613373, + "learning_rate": 5.478923654990223e-05, + "loss": 1.7773, + "step": 15815 + }, + { + "epoch": 4.85451197053407, + "grad_norm": 0.2547159492969513, + "learning_rate": 5.4784288820661326e-05, + "loss": 1.8194, + "step": 15816 + }, + { + "epoch": 4.854818907305095, + "grad_norm": 0.29574522376060486, + "learning_rate": 5.477934104413925e-05, + "loss": 1.7351, + "step": 15817 + }, + { + "epoch": 4.85512584407612, + "grad_norm": 0.17389361560344696, + "learning_rate": 5.4774393220384945e-05, + "loss": 1.6957, + "step": 15818 + }, + { 
+ "epoch": 4.855432780847146, + "grad_norm": 0.23746751248836517, + "learning_rate": 5.476944534944728e-05, + "loss": 1.7713, + "step": 15819 + }, + { + "epoch": 4.855739717618171, + "grad_norm": 0.182356595993042, + "learning_rate": 5.476449743137516e-05, + "loss": 1.7144, + "step": 15820 + }, + { + "epoch": 4.856046654389196, + "grad_norm": 0.23716382682323456, + "learning_rate": 5.4759549466217475e-05, + "loss": 1.7451, + "step": 15821 + }, + { + "epoch": 4.856353591160221, + "grad_norm": 0.316806823015213, + "learning_rate": 5.475460145402313e-05, + "loss": 1.7823, + "step": 15822 + }, + { + "epoch": 4.856660527931246, + "grad_norm": 0.2333129197359085, + "learning_rate": 5.474965339484105e-05, + "loss": 1.7788, + "step": 15823 + }, + { + "epoch": 4.856967464702271, + "grad_norm": 0.21180212497711182, + "learning_rate": 5.47447052887201e-05, + "loss": 1.7513, + "step": 15824 + }, + { + "epoch": 4.857274401473297, + "grad_norm": 0.22641299664974213, + "learning_rate": 5.473975713570919e-05, + "loss": 1.7514, + "step": 15825 + }, + { + "epoch": 4.857581338244322, + "grad_norm": 0.3179668188095093, + "learning_rate": 5.473480893585723e-05, + "loss": 1.7939, + "step": 15826 + }, + { + "epoch": 4.8578882750153465, + "grad_norm": 0.27463147044181824, + "learning_rate": 5.472986068921309e-05, + "loss": 1.7487, + "step": 15827 + }, + { + "epoch": 4.858195211786372, + "grad_norm": 0.18621626496315002, + "learning_rate": 5.472491239582572e-05, + "loss": 1.7155, + "step": 15828 + }, + { + "epoch": 4.858502148557397, + "grad_norm": 0.2437327802181244, + "learning_rate": 5.471996405574399e-05, + "loss": 1.7586, + "step": 15829 + }, + { + "epoch": 4.8588090853284225, + "grad_norm": 0.26658934354782104, + "learning_rate": 5.47150156690168e-05, + "loss": 1.7331, + "step": 15830 + }, + { + "epoch": 4.859116022099448, + "grad_norm": 0.2257174700498581, + "learning_rate": 5.471006723569308e-05, + "loss": 1.7556, + "step": 15831 + }, + { + "epoch": 4.859422958870473, + 
"grad_norm": 0.25434550642967224, + "learning_rate": 5.470511875582168e-05, + "loss": 1.7196, + "step": 15832 + }, + { + "epoch": 4.859729895641498, + "grad_norm": 0.2251453697681427, + "learning_rate": 5.470017022945156e-05, + "loss": 1.7174, + "step": 15833 + }, + { + "epoch": 4.860036832412523, + "grad_norm": 0.2757972180843353, + "learning_rate": 5.469522165663161e-05, + "loss": 1.7701, + "step": 15834 + }, + { + "epoch": 4.860343769183548, + "grad_norm": 0.2771994173526764, + "learning_rate": 5.469027303741072e-05, + "loss": 1.8085, + "step": 15835 + }, + { + "epoch": 4.860650705954574, + "grad_norm": 0.23825454711914062, + "learning_rate": 5.468532437183781e-05, + "loss": 1.733, + "step": 15836 + }, + { + "epoch": 4.860957642725598, + "grad_norm": 0.18100066483020782, + "learning_rate": 5.468037565996177e-05, + "loss": 1.7012, + "step": 15837 + }, + { + "epoch": 4.861264579496623, + "grad_norm": 0.22552812099456787, + "learning_rate": 5.4675426901831506e-05, + "loss": 1.728, + "step": 15838 + }, + { + "epoch": 4.861571516267649, + "grad_norm": 0.2505643665790558, + "learning_rate": 5.467047809749595e-05, + "loss": 1.7219, + "step": 15839 + }, + { + "epoch": 4.861878453038674, + "grad_norm": 0.25920796394348145, + "learning_rate": 5.4665529247003975e-05, + "loss": 1.7945, + "step": 15840 + }, + { + "epoch": 4.862185389809699, + "grad_norm": 0.23549394309520721, + "learning_rate": 5.466058035040452e-05, + "loss": 1.7904, + "step": 15841 + }, + { + "epoch": 4.862492326580725, + "grad_norm": 0.26510992646217346, + "learning_rate": 5.465563140774648e-05, + "loss": 1.8051, + "step": 15842 + }, + { + "epoch": 4.862799263351749, + "grad_norm": 0.19175390899181366, + "learning_rate": 5.465068241907876e-05, + "loss": 1.6799, + "step": 15843 + }, + { + "epoch": 4.8631062001227745, + "grad_norm": 0.2588976323604584, + "learning_rate": 5.464573338445025e-05, + "loss": 1.7394, + "step": 15844 + }, + { + "epoch": 4.8634131368938, + "grad_norm": 0.28729483485221863, + 
"learning_rate": 5.464078430390991e-05, + "loss": 1.797, + "step": 15845 + }, + { + "epoch": 4.863720073664825, + "grad_norm": 0.21302445232868195, + "learning_rate": 5.463583517750661e-05, + "loss": 1.7303, + "step": 15846 + }, + { + "epoch": 4.8640270104358505, + "grad_norm": 0.2407636195421219, + "learning_rate": 5.463088600528926e-05, + "loss": 1.7175, + "step": 15847 + }, + { + "epoch": 4.864333947206875, + "grad_norm": 0.25653502345085144, + "learning_rate": 5.4625936787306784e-05, + "loss": 1.6996, + "step": 15848 + }, + { + "epoch": 4.8646408839779, + "grad_norm": 0.2100832760334015, + "learning_rate": 5.462098752360809e-05, + "loss": 1.7416, + "step": 15849 + }, + { + "epoch": 4.864947820748926, + "grad_norm": 0.2785186469554901, + "learning_rate": 5.461603821424208e-05, + "loss": 1.74, + "step": 15850 + }, + { + "epoch": 4.865254757519951, + "grad_norm": 0.2896614968776703, + "learning_rate": 5.4611088859257696e-05, + "loss": 1.7436, + "step": 15851 + }, + { + "epoch": 4.865561694290976, + "grad_norm": 0.18890418112277985, + "learning_rate": 5.460613945870382e-05, + "loss": 1.7093, + "step": 15852 + }, + { + "epoch": 4.865868631062002, + "grad_norm": 0.27681079506874084, + "learning_rate": 5.4601190012629364e-05, + "loss": 1.8772, + "step": 15853 + }, + { + "epoch": 4.866175567833026, + "grad_norm": 0.24658115208148956, + "learning_rate": 5.4596240521083265e-05, + "loss": 1.776, + "step": 15854 + }, + { + "epoch": 4.866482504604051, + "grad_norm": 0.21958144009113312, + "learning_rate": 5.459129098411441e-05, + "loss": 1.7503, + "step": 15855 + }, + { + "epoch": 4.866789441375077, + "grad_norm": 0.2778300642967224, + "learning_rate": 5.458634140177174e-05, + "loss": 1.8194, + "step": 15856 + }, + { + "epoch": 4.867096378146102, + "grad_norm": 0.28673580288887024, + "learning_rate": 5.458139177410414e-05, + "loss": 1.8033, + "step": 15857 + }, + { + "epoch": 4.867403314917127, + "grad_norm": 0.24472850561141968, + "learning_rate": 5.457644210116055e-05, + 
"loss": 1.7304, + "step": 15858 + }, + { + "epoch": 4.867710251688152, + "grad_norm": 0.24581189453601837, + "learning_rate": 5.4571492382989886e-05, + "loss": 1.7443, + "step": 15859 + }, + { + "epoch": 4.868017188459177, + "grad_norm": 0.22296221554279327, + "learning_rate": 5.4566542619641045e-05, + "loss": 1.7201, + "step": 15860 + }, + { + "epoch": 4.8683241252302025, + "grad_norm": 0.2378673404455185, + "learning_rate": 5.456159281116295e-05, + "loss": 1.7893, + "step": 15861 + }, + { + "epoch": 4.868631062001228, + "grad_norm": 0.3320823907852173, + "learning_rate": 5.4556642957604534e-05, + "loss": 1.7944, + "step": 15862 + }, + { + "epoch": 4.868937998772253, + "grad_norm": 0.3303453326225281, + "learning_rate": 5.45516930590147e-05, + "loss": 1.7267, + "step": 15863 + }, + { + "epoch": 4.8692449355432785, + "grad_norm": 0.223227858543396, + "learning_rate": 5.454674311544235e-05, + "loss": 1.7477, + "step": 15864 + }, + { + "epoch": 4.869551872314303, + "grad_norm": 0.3012549579143524, + "learning_rate": 5.454179312693643e-05, + "loss": 1.731, + "step": 15865 + }, + { + "epoch": 4.869858809085328, + "grad_norm": 0.3780311942100525, + "learning_rate": 5.453684309354585e-05, + "loss": 1.7296, + "step": 15866 + }, + { + "epoch": 4.870165745856354, + "grad_norm": 0.2753889262676239, + "learning_rate": 5.4531893015319526e-05, + "loss": 1.8024, + "step": 15867 + }, + { + "epoch": 4.870472682627379, + "grad_norm": 0.2270934134721756, + "learning_rate": 5.452694289230639e-05, + "loss": 1.7095, + "step": 15868 + }, + { + "epoch": 4.870779619398404, + "grad_norm": 0.2621576488018036, + "learning_rate": 5.452199272455534e-05, + "loss": 1.75, + "step": 15869 + }, + { + "epoch": 4.871086556169429, + "grad_norm": 0.22175776958465576, + "learning_rate": 5.45170425121153e-05, + "loss": 1.7658, + "step": 15870 + }, + { + "epoch": 4.871393492940454, + "grad_norm": 0.2038736790418625, + "learning_rate": 5.451209225503521e-05, + "loss": 1.6916, + "step": 15871 + }, + { + 
"epoch": 4.871700429711479, + "grad_norm": 0.2493467777967453, + "learning_rate": 5.450714195336397e-05, + "loss": 1.7408, + "step": 15872 + }, + { + "epoch": 4.872007366482505, + "grad_norm": 0.1966754049062729, + "learning_rate": 5.450219160715052e-05, + "loss": 1.7379, + "step": 15873 + }, + { + "epoch": 4.87231430325353, + "grad_norm": 0.23193517327308655, + "learning_rate": 5.4497241216443775e-05, + "loss": 1.7736, + "step": 15874 + }, + { + "epoch": 4.872621240024555, + "grad_norm": 0.2164391279220581, + "learning_rate": 5.4492290781292646e-05, + "loss": 1.7618, + "step": 15875 + }, + { + "epoch": 4.87292817679558, + "grad_norm": 0.286460816860199, + "learning_rate": 5.448734030174607e-05, + "loss": 1.7745, + "step": 15876 + }, + { + "epoch": 4.873235113566605, + "grad_norm": 0.3454538881778717, + "learning_rate": 5.448238977785298e-05, + "loss": 1.7605, + "step": 15877 + }, + { + "epoch": 4.8735420503376305, + "grad_norm": 0.26775062084198, + "learning_rate": 5.447743920966227e-05, + "loss": 1.7263, + "step": 15878 + }, + { + "epoch": 4.873848987108656, + "grad_norm": 0.2644907832145691, + "learning_rate": 5.447248859722289e-05, + "loss": 1.8489, + "step": 15879 + }, + { + "epoch": 4.87415592387968, + "grad_norm": 0.21646654605865479, + "learning_rate": 5.446753794058376e-05, + "loss": 1.7605, + "step": 15880 + }, + { + "epoch": 4.874462860650706, + "grad_norm": 0.23431318998336792, + "learning_rate": 5.446258723979381e-05, + "loss": 1.7209, + "step": 15881 + }, + { + "epoch": 4.874769797421731, + "grad_norm": 0.24665607511997223, + "learning_rate": 5.4457636494901934e-05, + "loss": 1.813, + "step": 15882 + }, + { + "epoch": 4.875076734192756, + "grad_norm": 0.26269975304603577, + "learning_rate": 5.445268570595708e-05, + "loss": 1.8255, + "step": 15883 + }, + { + "epoch": 4.875383670963782, + "grad_norm": 0.2722402811050415, + "learning_rate": 5.444773487300819e-05, + "loss": 1.7795, + "step": 15884 + }, + { + "epoch": 4.875690607734807, + "grad_norm": 
0.3235624134540558, + "learning_rate": 5.444278399610417e-05, + "loss": 1.7804, + "step": 15885 + }, + { + "epoch": 4.8759975445058314, + "grad_norm": 0.2647583782672882, + "learning_rate": 5.4437833075293964e-05, + "loss": 1.7359, + "step": 15886 + }, + { + "epoch": 4.876304481276857, + "grad_norm": 0.272370845079422, + "learning_rate": 5.443288211062649e-05, + "loss": 1.7605, + "step": 15887 + }, + { + "epoch": 4.876611418047882, + "grad_norm": 0.3147594630718231, + "learning_rate": 5.4427931102150675e-05, + "loss": 1.7118, + "step": 15888 + }, + { + "epoch": 4.8769183548189075, + "grad_norm": 0.22751441597938538, + "learning_rate": 5.442298004991544e-05, + "loss": 1.723, + "step": 15889 + }, + { + "epoch": 4.877225291589933, + "grad_norm": 0.2121521681547165, + "learning_rate": 5.441802895396972e-05, + "loss": 1.7485, + "step": 15890 + }, + { + "epoch": 4.877532228360957, + "grad_norm": 0.25370222330093384, + "learning_rate": 5.4413077814362466e-05, + "loss": 1.8064, + "step": 15891 + }, + { + "epoch": 4.877839165131983, + "grad_norm": 0.19492633640766144, + "learning_rate": 5.440812663114259e-05, + "loss": 1.6773, + "step": 15892 + }, + { + "epoch": 4.878146101903008, + "grad_norm": 0.2101750522851944, + "learning_rate": 5.440317540435901e-05, + "loss": 1.7215, + "step": 15893 + }, + { + "epoch": 4.878453038674033, + "grad_norm": 0.21150651574134827, + "learning_rate": 5.439822413406068e-05, + "loss": 1.7875, + "step": 15894 + }, + { + "epoch": 4.878759975445059, + "grad_norm": 0.21008379757404327, + "learning_rate": 5.439327282029651e-05, + "loss": 1.7108, + "step": 15895 + }, + { + "epoch": 4.879066912216084, + "grad_norm": 0.22885502874851227, + "learning_rate": 5.4388321463115453e-05, + "loss": 1.7899, + "step": 15896 + }, + { + "epoch": 4.879373848987108, + "grad_norm": 0.24868059158325195, + "learning_rate": 5.4383370062566444e-05, + "loss": 1.7368, + "step": 15897 + }, + { + "epoch": 4.879680785758134, + "grad_norm": 0.27225378155708313, + 
"learning_rate": 5.437841861869838e-05, + "loss": 1.7623, + "step": 15898 + }, + { + "epoch": 4.879987722529159, + "grad_norm": 0.23353120684623718, + "learning_rate": 5.437346713156023e-05, + "loss": 1.7908, + "step": 15899 + }, + { + "epoch": 4.880294659300184, + "grad_norm": 0.19032470881938934, + "learning_rate": 5.436851560120091e-05, + "loss": 1.7511, + "step": 15900 + }, + { + "epoch": 4.88060159607121, + "grad_norm": 0.23714862763881683, + "learning_rate": 5.4363564027669345e-05, + "loss": 1.7197, + "step": 15901 + }, + { + "epoch": 4.880908532842234, + "grad_norm": 0.24897022545337677, + "learning_rate": 5.4358612411014495e-05, + "loss": 1.7822, + "step": 15902 + }, + { + "epoch": 4.8812154696132595, + "grad_norm": 0.21433588862419128, + "learning_rate": 5.435366075128528e-05, + "loss": 1.7928, + "step": 15903 + }, + { + "epoch": 4.881522406384285, + "grad_norm": 0.30019649863243103, + "learning_rate": 5.4348709048530646e-05, + "loss": 1.8067, + "step": 15904 + }, + { + "epoch": 4.88182934315531, + "grad_norm": 0.20227669179439545, + "learning_rate": 5.4343757302799515e-05, + "loss": 1.7254, + "step": 15905 + }, + { + "epoch": 4.8821362799263355, + "grad_norm": 0.23447728157043457, + "learning_rate": 5.4338805514140836e-05, + "loss": 1.7314, + "step": 15906 + }, + { + "epoch": 4.882443216697361, + "grad_norm": 0.29545050859451294, + "learning_rate": 5.4333853682603506e-05, + "loss": 1.7659, + "step": 15907 + }, + { + "epoch": 4.882750153468385, + "grad_norm": 0.245390385389328, + "learning_rate": 5.432890180823652e-05, + "loss": 1.7264, + "step": 15908 + }, + { + "epoch": 4.883057090239411, + "grad_norm": 0.209987074136734, + "learning_rate": 5.432394989108879e-05, + "loss": 1.7174, + "step": 15909 + }, + { + "epoch": 4.883364027010436, + "grad_norm": 0.2402341365814209, + "learning_rate": 5.431899793120925e-05, + "loss": 1.7512, + "step": 15910 + }, + { + "epoch": 4.883670963781461, + "grad_norm": 0.26227688789367676, + "learning_rate": 
5.431404592864684e-05, + "loss": 1.7697, + "step": 15911 + }, + { + "epoch": 4.883977900552486, + "grad_norm": 0.2556503117084503, + "learning_rate": 5.4309093883450504e-05, + "loss": 1.8191, + "step": 15912 + }, + { + "epoch": 4.884284837323511, + "grad_norm": 0.24766884744167328, + "learning_rate": 5.4304141795669174e-05, + "loss": 1.7574, + "step": 15913 + }, + { + "epoch": 4.884591774094536, + "grad_norm": 0.19925951957702637, + "learning_rate": 5.429918966535179e-05, + "loss": 1.7249, + "step": 15914 + }, + { + "epoch": 4.884898710865562, + "grad_norm": 0.1899442970752716, + "learning_rate": 5.4294237492547294e-05, + "loss": 1.7446, + "step": 15915 + }, + { + "epoch": 4.885205647636587, + "grad_norm": 0.25900956988334656, + "learning_rate": 5.4289285277304636e-05, + "loss": 1.725, + "step": 15916 + }, + { + "epoch": 4.885512584407612, + "grad_norm": 0.2537781000137329, + "learning_rate": 5.428433301967274e-05, + "loss": 1.7861, + "step": 15917 + }, + { + "epoch": 4.885819521178637, + "grad_norm": 0.26432034373283386, + "learning_rate": 5.427938071970054e-05, + "loss": 1.7538, + "step": 15918 + }, + { + "epoch": 4.886126457949662, + "grad_norm": 0.22722363471984863, + "learning_rate": 5.4274428377437e-05, + "loss": 1.7631, + "step": 15919 + }, + { + "epoch": 4.8864333947206875, + "grad_norm": 0.24846172332763672, + "learning_rate": 5.426947599293106e-05, + "loss": 1.7833, + "step": 15920 + }, + { + "epoch": 4.886740331491713, + "grad_norm": 0.24821995198726654, + "learning_rate": 5.426452356623165e-05, + "loss": 1.7638, + "step": 15921 + }, + { + "epoch": 4.887047268262738, + "grad_norm": 0.2796781063079834, + "learning_rate": 5.425957109738773e-05, + "loss": 1.6982, + "step": 15922 + }, + { + "epoch": 4.887354205033763, + "grad_norm": 0.2875385284423828, + "learning_rate": 5.425461858644821e-05, + "loss": 1.7172, + "step": 15923 + }, + { + "epoch": 4.887661141804788, + "grad_norm": 0.21614491939544678, + "learning_rate": 5.424966603346207e-05, + "loss": 
1.7521, + "step": 15924 + }, + { + "epoch": 4.887968078575813, + "grad_norm": 0.22944390773773193, + "learning_rate": 5.4244713438478235e-05, + "loss": 1.772, + "step": 15925 + }, + { + "epoch": 4.888275015346839, + "grad_norm": 0.21566039323806763, + "learning_rate": 5.423976080154566e-05, + "loss": 1.734, + "step": 15926 + }, + { + "epoch": 4.888581952117864, + "grad_norm": 0.4253925383090973, + "learning_rate": 5.4234808122713275e-05, + "loss": 1.8017, + "step": 15927 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.239146426320076, + "learning_rate": 5.422985540203004e-05, + "loss": 1.7229, + "step": 15928 + }, + { + "epoch": 4.889195825659914, + "grad_norm": 0.2344054877758026, + "learning_rate": 5.42249026395449e-05, + "loss": 1.7111, + "step": 15929 + }, + { + "epoch": 4.889502762430939, + "grad_norm": 0.21717922389507294, + "learning_rate": 5.421994983530679e-05, + "loss": 1.7427, + "step": 15930 + }, + { + "epoch": 4.889809699201964, + "grad_norm": 0.26895472407341003, + "learning_rate": 5.421499698936466e-05, + "loss": 1.8402, + "step": 15931 + }, + { + "epoch": 4.89011663597299, + "grad_norm": 0.25761866569519043, + "learning_rate": 5.421004410176746e-05, + "loss": 1.7822, + "step": 15932 + }, + { + "epoch": 4.890423572744015, + "grad_norm": 0.24465128779411316, + "learning_rate": 5.420509117256415e-05, + "loss": 1.8074, + "step": 15933 + }, + { + "epoch": 4.8907305095150395, + "grad_norm": 0.2527398467063904, + "learning_rate": 5.4200138201803655e-05, + "loss": 1.7522, + "step": 15934 + }, + { + "epoch": 4.891037446286065, + "grad_norm": 0.23118112981319427, + "learning_rate": 5.4195185189534916e-05, + "loss": 1.7394, + "step": 15935 + }, + { + "epoch": 4.89134438305709, + "grad_norm": 0.2054537534713745, + "learning_rate": 5.419023213580691e-05, + "loss": 1.7096, + "step": 15936 + }, + { + "epoch": 4.8916513198281155, + "grad_norm": 0.2929638922214508, + "learning_rate": 5.418527904066858e-05, + "loss": 1.8733, + "step": 15937 + }, + { + 
"epoch": 4.891958256599141, + "grad_norm": 0.2957170009613037, + "learning_rate": 5.418032590416886e-05, + "loss": 1.7201, + "step": 15938 + }, + { + "epoch": 4.892265193370166, + "grad_norm": 0.2520081698894501, + "learning_rate": 5.417537272635672e-05, + "loss": 1.7034, + "step": 15939 + }, + { + "epoch": 4.892572130141191, + "grad_norm": 0.25217053294181824, + "learning_rate": 5.41704195072811e-05, + "loss": 1.8538, + "step": 15940 + }, + { + "epoch": 4.892879066912216, + "grad_norm": 0.23605379462242126, + "learning_rate": 5.416546624699093e-05, + "loss": 1.724, + "step": 15941 + }, + { + "epoch": 4.893186003683241, + "grad_norm": 0.321750283241272, + "learning_rate": 5.416051294553519e-05, + "loss": 1.806, + "step": 15942 + }, + { + "epoch": 4.893492940454267, + "grad_norm": 0.23800241947174072, + "learning_rate": 5.415555960296284e-05, + "loss": 1.7578, + "step": 15943 + }, + { + "epoch": 4.893799877225292, + "grad_norm": 0.3423094153404236, + "learning_rate": 5.4150606219322796e-05, + "loss": 1.7324, + "step": 15944 + }, + { + "epoch": 4.894106813996316, + "grad_norm": 0.453074187040329, + "learning_rate": 5.414565279466404e-05, + "loss": 1.7268, + "step": 15945 + }, + { + "epoch": 4.894413750767342, + "grad_norm": 0.21972697973251343, + "learning_rate": 5.4140699329035504e-05, + "loss": 1.6547, + "step": 15946 + }, + { + "epoch": 4.894720687538367, + "grad_norm": 0.32876282930374146, + "learning_rate": 5.413574582248616e-05, + "loss": 1.7527, + "step": 15947 + }, + { + "epoch": 4.895027624309392, + "grad_norm": 0.34035229682922363, + "learning_rate": 5.413079227506494e-05, + "loss": 1.7636, + "step": 15948 + }, + { + "epoch": 4.895334561080418, + "grad_norm": 0.2410411536693573, + "learning_rate": 5.412583868682082e-05, + "loss": 1.8114, + "step": 15949 + }, + { + "epoch": 4.895641497851443, + "grad_norm": 0.2787366211414337, + "learning_rate": 5.412088505780274e-05, + "loss": 1.7393, + "step": 15950 + }, + { + "epoch": 4.8959484346224675, + "grad_norm": 
0.23288428783416748, + "learning_rate": 5.411593138805966e-05, + "loss": 1.7413, + "step": 15951 + }, + { + "epoch": 4.896255371393493, + "grad_norm": 0.26302778720855713, + "learning_rate": 5.411097767764053e-05, + "loss": 1.7372, + "step": 15952 + }, + { + "epoch": 4.896562308164518, + "grad_norm": 0.31638020277023315, + "learning_rate": 5.410602392659431e-05, + "loss": 1.8114, + "step": 15953 + }, + { + "epoch": 4.8968692449355435, + "grad_norm": 0.23361825942993164, + "learning_rate": 5.410107013496996e-05, + "loss": 1.7592, + "step": 15954 + }, + { + "epoch": 4.897176181706568, + "grad_norm": 0.19887785613536835, + "learning_rate": 5.409611630281642e-05, + "loss": 1.7509, + "step": 15955 + }, + { + "epoch": 4.897483118477593, + "grad_norm": 0.22396783530712128, + "learning_rate": 5.409116243018266e-05, + "loss": 1.6841, + "step": 15956 + }, + { + "epoch": 4.897790055248619, + "grad_norm": 0.20397686958312988, + "learning_rate": 5.4086208517117645e-05, + "loss": 1.7427, + "step": 15957 + }, + { + "epoch": 4.898096992019644, + "grad_norm": 0.20848311483860016, + "learning_rate": 5.4081254563670314e-05, + "loss": 1.713, + "step": 15958 + }, + { + "epoch": 4.898403928790669, + "grad_norm": 0.2739275395870209, + "learning_rate": 5.407630056988964e-05, + "loss": 1.7673, + "step": 15959 + }, + { + "epoch": 4.898710865561695, + "grad_norm": 0.21485929191112518, + "learning_rate": 5.407134653582456e-05, + "loss": 1.7347, + "step": 15960 + }, + { + "epoch": 4.899017802332719, + "grad_norm": 0.26980286836624146, + "learning_rate": 5.406639246152406e-05, + "loss": 1.7158, + "step": 15961 + }, + { + "epoch": 4.899324739103744, + "grad_norm": 0.22327515482902527, + "learning_rate": 5.4061438347037084e-05, + "loss": 1.7387, + "step": 15962 + }, + { + "epoch": 4.89963167587477, + "grad_norm": 0.2542823553085327, + "learning_rate": 5.4056484192412603e-05, + "loss": 1.7826, + "step": 15963 + }, + { + "epoch": 4.899938612645795, + "grad_norm": 0.3248840868473053, + 
"learning_rate": 5.405152999769956e-05, + "loss": 1.7878, + "step": 15964 + }, + { + "epoch": 4.9002455494168204, + "grad_norm": 0.21210803091526031, + "learning_rate": 5.404657576294691e-05, + "loss": 1.7378, + "step": 15965 + }, + { + "epoch": 4.900552486187845, + "grad_norm": 0.25679782032966614, + "learning_rate": 5.404162148820365e-05, + "loss": 1.7493, + "step": 15966 + }, + { + "epoch": 4.90085942295887, + "grad_norm": 0.36698678135871887, + "learning_rate": 5.4036667173518704e-05, + "loss": 1.7662, + "step": 15967 + }, + { + "epoch": 4.901166359729896, + "grad_norm": 0.3396874964237213, + "learning_rate": 5.403171281894105e-05, + "loss": 1.7618, + "step": 15968 + }, + { + "epoch": 4.901473296500921, + "grad_norm": 0.2792030870914459, + "learning_rate": 5.402675842451964e-05, + "loss": 1.7858, + "step": 15969 + }, + { + "epoch": 4.901780233271946, + "grad_norm": 0.24499626457691193, + "learning_rate": 5.4021803990303454e-05, + "loss": 1.7503, + "step": 15970 + }, + { + "epoch": 4.902087170042972, + "grad_norm": 0.29185110330581665, + "learning_rate": 5.401684951634144e-05, + "loss": 1.7536, + "step": 15971 + }, + { + "epoch": 4.902394106813996, + "grad_norm": 0.2480020374059677, + "learning_rate": 5.401189500268256e-05, + "loss": 1.7877, + "step": 15972 + }, + { + "epoch": 4.902701043585021, + "grad_norm": 0.3302663564682007, + "learning_rate": 5.400694044937579e-05, + "loss": 1.8693, + "step": 15973 + }, + { + "epoch": 4.903007980356047, + "grad_norm": 0.2500915825366974, + "learning_rate": 5.400198585647008e-05, + "loss": 1.7489, + "step": 15974 + }, + { + "epoch": 4.903314917127072, + "grad_norm": 0.25079864263534546, + "learning_rate": 5.399703122401441e-05, + "loss": 1.7965, + "step": 15975 + }, + { + "epoch": 4.903621853898097, + "grad_norm": 0.2643207907676697, + "learning_rate": 5.399207655205771e-05, + "loss": 1.7696, + "step": 15976 + }, + { + "epoch": 4.903928790669122, + "grad_norm": 0.23719522356987, + "learning_rate": 5.398712184064899e-05, + 
"loss": 1.7608, + "step": 15977 + }, + { + "epoch": 4.904235727440147, + "grad_norm": 0.25226888060569763, + "learning_rate": 5.3982167089837184e-05, + "loss": 1.8055, + "step": 15978 + }, + { + "epoch": 4.9045426642111725, + "grad_norm": 0.21601852774620056, + "learning_rate": 5.39772122996713e-05, + "loss": 1.7553, + "step": 15979 + }, + { + "epoch": 4.904849600982198, + "grad_norm": 0.20275430381298065, + "learning_rate": 5.397225747020023e-05, + "loss": 1.7221, + "step": 15980 + }, + { + "epoch": 4.905156537753223, + "grad_norm": 0.24815937876701355, + "learning_rate": 5.3967302601473e-05, + "loss": 1.8098, + "step": 15981 + }, + { + "epoch": 4.9054634745242485, + "grad_norm": 0.2193612903356552, + "learning_rate": 5.3962347693538575e-05, + "loss": 1.7116, + "step": 15982 + }, + { + "epoch": 4.905770411295273, + "grad_norm": 0.21409118175506592, + "learning_rate": 5.395739274644589e-05, + "loss": 1.7503, + "step": 15983 + }, + { + "epoch": 4.906077348066298, + "grad_norm": 0.20907564461231232, + "learning_rate": 5.3952437760243935e-05, + "loss": 1.7518, + "step": 15984 + }, + { + "epoch": 4.906384284837324, + "grad_norm": 0.21193571388721466, + "learning_rate": 5.394748273498168e-05, + "loss": 1.6905, + "step": 15985 + }, + { + "epoch": 4.906691221608349, + "grad_norm": 0.19729891419410706, + "learning_rate": 5.394252767070808e-05, + "loss": 1.7398, + "step": 15986 + }, + { + "epoch": 4.906998158379373, + "grad_norm": 0.2654789686203003, + "learning_rate": 5.393757256747211e-05, + "loss": 1.7931, + "step": 15987 + }, + { + "epoch": 4.907305095150399, + "grad_norm": 0.2627345025539398, + "learning_rate": 5.3932617425322726e-05, + "loss": 1.8174, + "step": 15988 + }, + { + "epoch": 4.907612031921424, + "grad_norm": 0.27162298560142517, + "learning_rate": 5.392766224430894e-05, + "loss": 1.8015, + "step": 15989 + }, + { + "epoch": 4.907918968692449, + "grad_norm": 0.24248667061328888, + "learning_rate": 5.3922707024479676e-05, + "loss": 1.7457, + "step": 15990 + 
}, + { + "epoch": 4.908225905463475, + "grad_norm": 0.24715331196784973, + "learning_rate": 5.391775176588393e-05, + "loss": 1.7724, + "step": 15991 + }, + { + "epoch": 4.9085328422345, + "grad_norm": 0.26335644721984863, + "learning_rate": 5.3912796468570656e-05, + "loss": 1.7183, + "step": 15992 + }, + { + "epoch": 4.9088397790055245, + "grad_norm": 0.23459944128990173, + "learning_rate": 5.3907841132588843e-05, + "loss": 1.7245, + "step": 15993 + }, + { + "epoch": 4.90914671577655, + "grad_norm": 0.21779637038707733, + "learning_rate": 5.3902885757987444e-05, + "loss": 1.7485, + "step": 15994 + }, + { + "epoch": 4.909453652547575, + "grad_norm": 0.227664977312088, + "learning_rate": 5.389793034481545e-05, + "loss": 1.7418, + "step": 15995 + }, + { + "epoch": 4.9097605893186005, + "grad_norm": 0.26230278611183167, + "learning_rate": 5.389297489312183e-05, + "loss": 1.7619, + "step": 15996 + }, + { + "epoch": 4.910067526089626, + "grad_norm": 0.22563579678535461, + "learning_rate": 5.388801940295555e-05, + "loss": 1.7168, + "step": 15997 + }, + { + "epoch": 4.91037446286065, + "grad_norm": 0.24829435348510742, + "learning_rate": 5.388306387436556e-05, + "loss": 1.7422, + "step": 15998 + }, + { + "epoch": 4.910681399631676, + "grad_norm": 0.24395976960659027, + "learning_rate": 5.387810830740088e-05, + "loss": 1.7783, + "step": 15999 + }, + { + "epoch": 4.910988336402701, + "grad_norm": 0.2189297378063202, + "learning_rate": 5.387315270211044e-05, + "loss": 1.7885, + "step": 16000 + }, + { + "epoch": 4.911295273173726, + "grad_norm": 0.21750971674919128, + "learning_rate": 5.386819705854324e-05, + "loss": 1.7659, + "step": 16001 + }, + { + "epoch": 4.911602209944752, + "grad_norm": 0.21907657384872437, + "learning_rate": 5.386324137674826e-05, + "loss": 1.789, + "step": 16002 + }, + { + "epoch": 4.911909146715777, + "grad_norm": 0.18778781592845917, + "learning_rate": 5.3858285656774465e-05, + "loss": 1.7151, + "step": 16003 + }, + { + "epoch": 4.912216083486801, + 
"grad_norm": 0.24217712879180908, + "learning_rate": 5.385332989867082e-05, + "loss": 1.8108, + "step": 16004 + }, + { + "epoch": 4.912523020257827, + "grad_norm": 0.27637016773223877, + "learning_rate": 5.384837410248632e-05, + "loss": 1.8368, + "step": 16005 + }, + { + "epoch": 4.912829957028852, + "grad_norm": 0.22366084158420563, + "learning_rate": 5.3843418268269926e-05, + "loss": 1.7351, + "step": 16006 + }, + { + "epoch": 4.913136893799877, + "grad_norm": 0.2742357552051544, + "learning_rate": 5.383846239607062e-05, + "loss": 1.7599, + "step": 16007 + }, + { + "epoch": 4.913443830570903, + "grad_norm": 0.2288598269224167, + "learning_rate": 5.383350648593738e-05, + "loss": 1.7056, + "step": 16008 + }, + { + "epoch": 4.913750767341927, + "grad_norm": 0.23319020867347717, + "learning_rate": 5.382855053791919e-05, + "loss": 1.7356, + "step": 16009 + }, + { + "epoch": 4.9140577041129525, + "grad_norm": 0.2232198268175125, + "learning_rate": 5.382359455206499e-05, + "loss": 1.7375, + "step": 16010 + }, + { + "epoch": 4.914364640883978, + "grad_norm": 0.24420048296451569, + "learning_rate": 5.381863852842381e-05, + "loss": 1.8287, + "step": 16011 + }, + { + "epoch": 4.914671577655003, + "grad_norm": 0.22653080523014069, + "learning_rate": 5.381368246704461e-05, + "loss": 1.7137, + "step": 16012 + }, + { + "epoch": 4.9149785144260285, + "grad_norm": 0.20439405739307404, + "learning_rate": 5.380872636797637e-05, + "loss": 1.7688, + "step": 16013 + }, + { + "epoch": 4.915285451197054, + "grad_norm": 0.2602155804634094, + "learning_rate": 5.380377023126806e-05, + "loss": 1.7875, + "step": 16014 + }, + { + "epoch": 4.915592387968078, + "grad_norm": 0.2757892608642578, + "learning_rate": 5.3798814056968647e-05, + "loss": 1.7446, + "step": 16015 + }, + { + "epoch": 4.915899324739104, + "grad_norm": 0.25938209891319275, + "learning_rate": 5.379385784512714e-05, + "loss": 1.6997, + "step": 16016 + }, + { + "epoch": 4.916206261510129, + "grad_norm": 0.2056962549686432, + 
"learning_rate": 5.37889015957925e-05, + "loss": 1.6961, + "step": 16017 + }, + { + "epoch": 4.916513198281154, + "grad_norm": 0.24388402700424194, + "learning_rate": 5.3783945309013714e-05, + "loss": 1.712, + "step": 16018 + }, + { + "epoch": 4.91682013505218, + "grad_norm": 0.2381993532180786, + "learning_rate": 5.3778988984839775e-05, + "loss": 1.7444, + "step": 16019 + }, + { + "epoch": 4.917127071823204, + "grad_norm": 0.20201562345027924, + "learning_rate": 5.377403262331964e-05, + "loss": 1.7254, + "step": 16020 + }, + { + "epoch": 4.917434008594229, + "grad_norm": 0.24019409716129303, + "learning_rate": 5.376907622450229e-05, + "loss": 1.684, + "step": 16021 + }, + { + "epoch": 4.917740945365255, + "grad_norm": 0.2441694289445877, + "learning_rate": 5.376411978843674e-05, + "loss": 1.7334, + "step": 16022 + }, + { + "epoch": 4.91804788213628, + "grad_norm": 0.23866300284862518, + "learning_rate": 5.3759163315171945e-05, + "loss": 1.7258, + "step": 16023 + }, + { + "epoch": 4.918354818907305, + "grad_norm": 0.28068670630455017, + "learning_rate": 5.375420680475689e-05, + "loss": 1.8049, + "step": 16024 + }, + { + "epoch": 4.918661755678331, + "grad_norm": 0.2956274151802063, + "learning_rate": 5.3749250257240566e-05, + "loss": 1.8544, + "step": 16025 + }, + { + "epoch": 4.918968692449355, + "grad_norm": 0.1971627175807953, + "learning_rate": 5.374429367267196e-05, + "loss": 1.7314, + "step": 16026 + }, + { + "epoch": 4.9192756292203805, + "grad_norm": 0.28565749526023865, + "learning_rate": 5.373933705110004e-05, + "loss": 1.7587, + "step": 16027 + }, + { + "epoch": 4.919582565991406, + "grad_norm": 0.3087369501590729, + "learning_rate": 5.37343803925738e-05, + "loss": 1.7708, + "step": 16028 + }, + { + "epoch": 4.919889502762431, + "grad_norm": 0.22460010647773743, + "learning_rate": 5.372942369714223e-05, + "loss": 1.7401, + "step": 16029 + }, + { + "epoch": 4.920196439533456, + "grad_norm": 0.29492735862731934, + "learning_rate": 5.3724466964854326e-05, + 
"loss": 1.7033, + "step": 16030 + }, + { + "epoch": 4.920503376304481, + "grad_norm": 0.24452674388885498, + "learning_rate": 5.371951019575904e-05, + "loss": 1.7688, + "step": 16031 + }, + { + "epoch": 4.920810313075506, + "grad_norm": 0.24686957895755768, + "learning_rate": 5.3714553389905366e-05, + "loss": 1.7463, + "step": 16032 + }, + { + "epoch": 4.921117249846532, + "grad_norm": 0.23661597073078156, + "learning_rate": 5.37095965473423e-05, + "loss": 1.7256, + "step": 16033 + }, + { + "epoch": 4.921424186617557, + "grad_norm": 0.22861288487911224, + "learning_rate": 5.370463966811884e-05, + "loss": 1.7722, + "step": 16034 + }, + { + "epoch": 4.921731123388582, + "grad_norm": 0.2453136146068573, + "learning_rate": 5.3699682752283944e-05, + "loss": 1.7343, + "step": 16035 + }, + { + "epoch": 4.922038060159607, + "grad_norm": 0.25267064571380615, + "learning_rate": 5.369472579988663e-05, + "loss": 1.7817, + "step": 16036 + }, + { + "epoch": 4.922344996930632, + "grad_norm": 0.25301575660705566, + "learning_rate": 5.368976881097586e-05, + "loss": 1.8146, + "step": 16037 + }, + { + "epoch": 4.922651933701657, + "grad_norm": 0.23579831421375275, + "learning_rate": 5.368481178560062e-05, + "loss": 1.8089, + "step": 16038 + }, + { + "epoch": 4.922958870472683, + "grad_norm": 0.2181949019432068, + "learning_rate": 5.367985472380993e-05, + "loss": 1.7689, + "step": 16039 + }, + { + "epoch": 4.923265807243708, + "grad_norm": 0.24622827768325806, + "learning_rate": 5.367489762565276e-05, + "loss": 1.791, + "step": 16040 + }, + { + "epoch": 4.9235727440147325, + "grad_norm": 0.2545134723186493, + "learning_rate": 5.3669940491178084e-05, + "loss": 1.738, + "step": 16041 + }, + { + "epoch": 4.923879680785758, + "grad_norm": 0.258139431476593, + "learning_rate": 5.366498332043491e-05, + "loss": 1.8303, + "step": 16042 + }, + { + "epoch": 4.924186617556783, + "grad_norm": 0.23804105818271637, + "learning_rate": 5.366002611347223e-05, + "loss": 1.751, + "step": 16043 + }, + { 
+ "epoch": 4.9244935543278086, + "grad_norm": 0.2354477345943451, + "learning_rate": 5.365506887033901e-05, + "loss": 1.7911, + "step": 16044 + }, + { + "epoch": 4.924800491098834, + "grad_norm": 0.22212550044059753, + "learning_rate": 5.3650111591084276e-05, + "loss": 1.7439, + "step": 16045 + }, + { + "epoch": 4.925107427869859, + "grad_norm": 0.23621168732643127, + "learning_rate": 5.3645154275756984e-05, + "loss": 1.7339, + "step": 16046 + }, + { + "epoch": 4.925414364640884, + "grad_norm": 0.2163209468126297, + "learning_rate": 5.364019692440616e-05, + "loss": 1.7247, + "step": 16047 + }, + { + "epoch": 4.925721301411909, + "grad_norm": 0.21352291107177734, + "learning_rate": 5.3635239537080774e-05, + "loss": 1.7431, + "step": 16048 + }, + { + "epoch": 4.926028238182934, + "grad_norm": 0.3170754909515381, + "learning_rate": 5.36302821138298e-05, + "loss": 1.8075, + "step": 16049 + }, + { + "epoch": 4.92633517495396, + "grad_norm": 0.27073633670806885, + "learning_rate": 5.362532465470226e-05, + "loss": 1.7209, + "step": 16050 + }, + { + "epoch": 4.926642111724985, + "grad_norm": 0.2677803039550781, + "learning_rate": 5.362036715974714e-05, + "loss": 1.7454, + "step": 16051 + }, + { + "epoch": 4.9269490484960095, + "grad_norm": 0.3555704355239868, + "learning_rate": 5.3615409629013436e-05, + "loss": 1.7737, + "step": 16052 + }, + { + "epoch": 4.927255985267035, + "grad_norm": 0.2819947302341461, + "learning_rate": 5.3610452062550124e-05, + "loss": 1.7588, + "step": 16053 + }, + { + "epoch": 4.92756292203806, + "grad_norm": 0.26638996601104736, + "learning_rate": 5.360549446040621e-05, + "loss": 1.8078, + "step": 16054 + }, + { + "epoch": 4.9278698588090855, + "grad_norm": 0.37828773260116577, + "learning_rate": 5.360053682263069e-05, + "loss": 1.7527, + "step": 16055 + }, + { + "epoch": 4.928176795580111, + "grad_norm": 0.35836395621299744, + "learning_rate": 5.359557914927254e-05, + "loss": 1.7199, + "step": 16056 + }, + { + "epoch": 4.928483732351136, + 
"grad_norm": 0.2720802128314972, + "learning_rate": 5.359062144038078e-05, + "loss": 1.7598, + "step": 16057 + }, + { + "epoch": 4.928790669122161, + "grad_norm": 0.36662939190864563, + "learning_rate": 5.358566369600441e-05, + "loss": 1.7199, + "step": 16058 + }, + { + "epoch": 4.929097605893186, + "grad_norm": 0.42243221402168274, + "learning_rate": 5.3580705916192395e-05, + "loss": 1.7584, + "step": 16059 + }, + { + "epoch": 4.929404542664211, + "grad_norm": 0.21667765080928802, + "learning_rate": 5.357574810099375e-05, + "loss": 1.7608, + "step": 16060 + }, + { + "epoch": 4.929711479435237, + "grad_norm": 0.48101645708084106, + "learning_rate": 5.3570790250457456e-05, + "loss": 1.8157, + "step": 16061 + }, + { + "epoch": 4.930018416206261, + "grad_norm": 0.5289245843887329, + "learning_rate": 5.356583236463253e-05, + "loss": 1.7173, + "step": 16062 + }, + { + "epoch": 4.930325352977286, + "grad_norm": 0.21454930305480957, + "learning_rate": 5.356087444356795e-05, + "loss": 1.7399, + "step": 16063 + }, + { + "epoch": 4.930632289748312, + "grad_norm": 0.5648324489593506, + "learning_rate": 5.355591648731274e-05, + "loss": 1.7814, + "step": 16064 + }, + { + "epoch": 4.930939226519337, + "grad_norm": 0.5669483542442322, + "learning_rate": 5.355095849591587e-05, + "loss": 1.7769, + "step": 16065 + }, + { + "epoch": 4.931246163290362, + "grad_norm": 0.33108505606651306, + "learning_rate": 5.354600046942635e-05, + "loss": 1.7704, + "step": 16066 + }, + { + "epoch": 4.931553100061388, + "grad_norm": 0.31149306893348694, + "learning_rate": 5.3541042407893164e-05, + "loss": 1.7631, + "step": 16067 + }, + { + "epoch": 4.931860036832412, + "grad_norm": 0.30377596616744995, + "learning_rate": 5.353608431136532e-05, + "loss": 1.7888, + "step": 16068 + }, + { + "epoch": 4.9321669736034375, + "grad_norm": 0.25041452050209045, + "learning_rate": 5.3531126179891825e-05, + "loss": 1.7507, + "step": 16069 + }, + { + "epoch": 4.932473910374463, + "grad_norm": 0.33900725841522217, + 
"learning_rate": 5.352616801352167e-05, + "loss": 1.7365, + "step": 16070 + }, + { + "epoch": 4.932780847145488, + "grad_norm": 0.23939846456050873, + "learning_rate": 5.352120981230386e-05, + "loss": 1.7934, + "step": 16071 + }, + { + "epoch": 4.9330877839165135, + "grad_norm": 0.2419881969690323, + "learning_rate": 5.351625157628739e-05, + "loss": 1.7555, + "step": 16072 + }, + { + "epoch": 4.933394720687538, + "grad_norm": 0.3517596423625946, + "learning_rate": 5.351129330552125e-05, + "loss": 1.7102, + "step": 16073 + }, + { + "epoch": 4.933701657458563, + "grad_norm": 0.2660250663757324, + "learning_rate": 5.350633500005446e-05, + "loss": 1.7692, + "step": 16074 + }, + { + "epoch": 4.934008594229589, + "grad_norm": 0.20726454257965088, + "learning_rate": 5.350137665993601e-05, + "loss": 1.718, + "step": 16075 + }, + { + "epoch": 4.934315531000614, + "grad_norm": 0.28218522667884827, + "learning_rate": 5.3496418285214914e-05, + "loss": 1.8402, + "step": 16076 + }, + { + "epoch": 4.934622467771639, + "grad_norm": 0.2142515480518341, + "learning_rate": 5.349145987594015e-05, + "loss": 1.7571, + "step": 16077 + }, + { + "epoch": 4.934929404542665, + "grad_norm": 0.2777026891708374, + "learning_rate": 5.348650143216074e-05, + "loss": 1.7617, + "step": 16078 + }, + { + "epoch": 4.935236341313689, + "grad_norm": 0.24057620763778687, + "learning_rate": 5.348154295392567e-05, + "loss": 1.7149, + "step": 16079 + }, + { + "epoch": 4.935543278084714, + "grad_norm": 0.22220350801944733, + "learning_rate": 5.3476584441283964e-05, + "loss": 1.7402, + "step": 16080 + }, + { + "epoch": 4.93585021485574, + "grad_norm": 0.2451290488243103, + "learning_rate": 5.347162589428462e-05, + "loss": 1.7004, + "step": 16081 + }, + { + "epoch": 4.936157151626765, + "grad_norm": 0.25621771812438965, + "learning_rate": 5.3466667312976625e-05, + "loss": 1.7765, + "step": 16082 + }, + { + "epoch": 4.93646408839779, + "grad_norm": 0.217393159866333, + "learning_rate": 5.346170869740899e-05, + 
"loss": 1.7695, + "step": 16083 + }, + { + "epoch": 4.936771025168815, + "grad_norm": 0.21248537302017212, + "learning_rate": 5.345675004763071e-05, + "loss": 1.7277, + "step": 16084 + }, + { + "epoch": 4.93707796193984, + "grad_norm": 0.19431474804878235, + "learning_rate": 5.3451791363690805e-05, + "loss": 1.7352, + "step": 16085 + }, + { + "epoch": 4.9373848987108655, + "grad_norm": 0.20233909785747528, + "learning_rate": 5.344683264563829e-05, + "loss": 1.71, + "step": 16086 + }, + { + "epoch": 4.937691835481891, + "grad_norm": 0.2199622094631195, + "learning_rate": 5.344187389352214e-05, + "loss": 1.7443, + "step": 16087 + }, + { + "epoch": 4.937998772252916, + "grad_norm": 0.23495158553123474, + "learning_rate": 5.343691510739138e-05, + "loss": 1.7758, + "step": 16088 + }, + { + "epoch": 4.9383057090239415, + "grad_norm": 0.228348970413208, + "learning_rate": 5.3431956287295015e-05, + "loss": 1.7645, + "step": 16089 + }, + { + "epoch": 4.938612645794966, + "grad_norm": 0.2337537258863449, + "learning_rate": 5.342699743328203e-05, + "loss": 1.7353, + "step": 16090 + }, + { + "epoch": 4.938919582565991, + "grad_norm": 0.1899309754371643, + "learning_rate": 5.3422038545401454e-05, + "loss": 1.6907, + "step": 16091 + }, + { + "epoch": 4.939226519337017, + "grad_norm": 0.2479192316532135, + "learning_rate": 5.341707962370229e-05, + "loss": 1.7961, + "step": 16092 + }, + { + "epoch": 4.939533456108042, + "grad_norm": 0.2444314956665039, + "learning_rate": 5.341212066823355e-05, + "loss": 1.7768, + "step": 16093 + }, + { + "epoch": 4.939840392879067, + "grad_norm": 0.2123393714427948, + "learning_rate": 5.340716167904423e-05, + "loss": 1.7617, + "step": 16094 + }, + { + "epoch": 4.940147329650092, + "grad_norm": 0.20779116451740265, + "learning_rate": 5.340220265618334e-05, + "loss": 1.6951, + "step": 16095 + }, + { + "epoch": 4.940454266421117, + "grad_norm": 0.22189265489578247, + "learning_rate": 5.3397243599699884e-05, + "loss": 1.8368, + "step": 16096 + }, + { 
+ "epoch": 4.940761203192142, + "grad_norm": 0.22316497564315796, + "learning_rate": 5.3392284509642875e-05, + "loss": 1.7096, + "step": 16097 + }, + { + "epoch": 4.941068139963168, + "grad_norm": 0.20406664907932281, + "learning_rate": 5.3387325386061346e-05, + "loss": 1.7269, + "step": 16098 + }, + { + "epoch": 4.941375076734193, + "grad_norm": 0.263007789850235, + "learning_rate": 5.338236622900427e-05, + "loss": 1.7663, + "step": 16099 + }, + { + "epoch": 4.941682013505218, + "grad_norm": 0.24388311803340912, + "learning_rate": 5.3377407038520654e-05, + "loss": 1.7113, + "step": 16100 + }, + { + "epoch": 4.941988950276243, + "grad_norm": 0.21918313205242157, + "learning_rate": 5.3372447814659524e-05, + "loss": 1.775, + "step": 16101 + }, + { + "epoch": 4.942295887047268, + "grad_norm": 0.30842962861061096, + "learning_rate": 5.336748855746989e-05, + "loss": 1.8229, + "step": 16102 + }, + { + "epoch": 4.9426028238182935, + "grad_norm": 0.2875657379627228, + "learning_rate": 5.336252926700077e-05, + "loss": 1.7377, + "step": 16103 + }, + { + "epoch": 4.942909760589319, + "grad_norm": 0.23411425948143005, + "learning_rate": 5.3357569943301156e-05, + "loss": 1.754, + "step": 16104 + }, + { + "epoch": 4.943216697360343, + "grad_norm": 0.29758864641189575, + "learning_rate": 5.335261058642007e-05, + "loss": 1.7471, + "step": 16105 + }, + { + "epoch": 4.943523634131369, + "grad_norm": 0.31761085987091064, + "learning_rate": 5.3347651196406534e-05, + "loss": 1.7658, + "step": 16106 + }, + { + "epoch": 4.943830570902394, + "grad_norm": 0.2487023025751114, + "learning_rate": 5.334269177330952e-05, + "loss": 1.786, + "step": 16107 + }, + { + "epoch": 4.944137507673419, + "grad_norm": 0.23954913020133972, + "learning_rate": 5.333773231717808e-05, + "loss": 1.8486, + "step": 16108 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.24893096089363098, + "learning_rate": 5.3332772828061214e-05, + "loss": 1.7927, + "step": 16109 + }, + { + "epoch": 4.94475138121547, + 
"grad_norm": 0.28653839230537415, + "learning_rate": 5.332781330600795e-05, + "loss": 1.8331, + "step": 16110 + }, + { + "epoch": 4.945058317986494, + "grad_norm": 0.2597404718399048, + "learning_rate": 5.332285375106726e-05, + "loss": 1.7128, + "step": 16111 + }, + { + "epoch": 4.94536525475752, + "grad_norm": 0.23813198506832123, + "learning_rate": 5.3317894163288196e-05, + "loss": 1.7483, + "step": 16112 + }, + { + "epoch": 4.945672191528545, + "grad_norm": 0.2545793652534485, + "learning_rate": 5.331293454271974e-05, + "loss": 1.7987, + "step": 16113 + }, + { + "epoch": 4.94597912829957, + "grad_norm": 0.2453712821006775, + "learning_rate": 5.330797488941095e-05, + "loss": 1.7376, + "step": 16114 + }, + { + "epoch": 4.946286065070596, + "grad_norm": 0.20583751797676086, + "learning_rate": 5.33030152034108e-05, + "loss": 1.7038, + "step": 16115 + }, + { + "epoch": 4.94659300184162, + "grad_norm": 0.22557811439037323, + "learning_rate": 5.3298055484768313e-05, + "loss": 1.6999, + "step": 16116 + }, + { + "epoch": 4.9468999386126455, + "grad_norm": 0.23163801431655884, + "learning_rate": 5.329309573353252e-05, + "loss": 1.7575, + "step": 16117 + }, + { + "epoch": 4.947206875383671, + "grad_norm": 0.3560176491737366, + "learning_rate": 5.3288135949752394e-05, + "loss": 1.8494, + "step": 16118 + }, + { + "epoch": 4.947513812154696, + "grad_norm": 0.306379109621048, + "learning_rate": 5.328317613347701e-05, + "loss": 1.7229, + "step": 16119 + }, + { + "epoch": 4.9478207489257215, + "grad_norm": 0.24428823590278625, + "learning_rate": 5.3278216284755344e-05, + "loss": 1.7939, + "step": 16120 + }, + { + "epoch": 4.948127685696747, + "grad_norm": 0.22251521050930023, + "learning_rate": 5.327325640363643e-05, + "loss": 1.7624, + "step": 16121 + }, + { + "epoch": 4.948434622467771, + "grad_norm": 0.23310889303684235, + "learning_rate": 5.326829649016928e-05, + "loss": 1.7727, + "step": 16122 + }, + { + "epoch": 4.948741559238797, + "grad_norm": 0.22457881271839142, + 
"learning_rate": 5.326333654440291e-05, + "loss": 1.7602, + "step": 16123 + }, + { + "epoch": 4.949048496009822, + "grad_norm": 0.24032343924045563, + "learning_rate": 5.325837656638631e-05, + "loss": 1.7591, + "step": 16124 + }, + { + "epoch": 4.949355432780847, + "grad_norm": 0.25082892179489136, + "learning_rate": 5.3253416556168546e-05, + "loss": 1.7745, + "step": 16125 + }, + { + "epoch": 4.949662369551873, + "grad_norm": 0.22859038412570953, + "learning_rate": 5.3248456513798615e-05, + "loss": 1.7475, + "step": 16126 + }, + { + "epoch": 4.949969306322897, + "grad_norm": 0.27282553911209106, + "learning_rate": 5.3243496439325525e-05, + "loss": 1.7438, + "step": 16127 + }, + { + "epoch": 4.9502762430939224, + "grad_norm": 0.23622353374958038, + "learning_rate": 5.3238536332798303e-05, + "loss": 1.7625, + "step": 16128 + }, + { + "epoch": 4.950583179864948, + "grad_norm": 0.28060024976730347, + "learning_rate": 5.3233576194265975e-05, + "loss": 1.8028, + "step": 16129 + }, + { + "epoch": 4.950890116635973, + "grad_norm": 0.33281829953193665, + "learning_rate": 5.322861602377755e-05, + "loss": 1.7163, + "step": 16130 + }, + { + "epoch": 4.9511970534069984, + "grad_norm": 0.26457497477531433, + "learning_rate": 5.322365582138203e-05, + "loss": 1.7347, + "step": 16131 + }, + { + "epoch": 4.951503990178024, + "grad_norm": 0.21651674807071686, + "learning_rate": 5.3218695587128476e-05, + "loss": 1.7123, + "step": 16132 + }, + { + "epoch": 4.951810926949048, + "grad_norm": 0.2299882024526596, + "learning_rate": 5.3213735321065885e-05, + "loss": 1.775, + "step": 16133 + }, + { + "epoch": 4.952117863720074, + "grad_norm": 0.2252396047115326, + "learning_rate": 5.3208775023243265e-05, + "loss": 1.7598, + "step": 16134 + }, + { + "epoch": 4.952424800491099, + "grad_norm": 0.2263660430908203, + "learning_rate": 5.3203814693709655e-05, + "loss": 1.7519, + "step": 16135 + }, + { + "epoch": 4.952731737262124, + "grad_norm": 0.2425432950258255, + "learning_rate": 
5.3198854332514056e-05, + "loss": 1.7769, + "step": 16136 + }, + { + "epoch": 4.953038674033149, + "grad_norm": 0.22624996304512024, + "learning_rate": 5.319389393970553e-05, + "loss": 1.7686, + "step": 16137 + }, + { + "epoch": 4.953345610804174, + "grad_norm": 0.2240568846464157, + "learning_rate": 5.318893351533306e-05, + "loss": 1.7795, + "step": 16138 + }, + { + "epoch": 4.953652547575199, + "grad_norm": 0.21708132326602936, + "learning_rate": 5.318397305944568e-05, + "loss": 1.7348, + "step": 16139 + }, + { + "epoch": 4.953959484346225, + "grad_norm": 0.2263328731060028, + "learning_rate": 5.3179012572092415e-05, + "loss": 1.7645, + "step": 16140 + }, + { + "epoch": 4.95426642111725, + "grad_norm": 0.2541986107826233, + "learning_rate": 5.3174052053322274e-05, + "loss": 1.723, + "step": 16141 + }, + { + "epoch": 4.954573357888275, + "grad_norm": 0.25829461216926575, + "learning_rate": 5.316909150318429e-05, + "loss": 1.7469, + "step": 16142 + }, + { + "epoch": 4.9548802946593, + "grad_norm": 0.21251125633716583, + "learning_rate": 5.3164130921727494e-05, + "loss": 1.7699, + "step": 16143 + }, + { + "epoch": 4.955187231430325, + "grad_norm": 0.29195618629455566, + "learning_rate": 5.315917030900091e-05, + "loss": 1.7373, + "step": 16144 + }, + { + "epoch": 4.9554941682013505, + "grad_norm": 0.29457888007164, + "learning_rate": 5.315420966505355e-05, + "loss": 1.7202, + "step": 16145 + }, + { + "epoch": 4.955801104972376, + "grad_norm": 0.19679461419582367, + "learning_rate": 5.314924898993443e-05, + "loss": 1.75, + "step": 16146 + }, + { + "epoch": 4.956108041743401, + "grad_norm": 0.287955105304718, + "learning_rate": 5.314428828369259e-05, + "loss": 1.7385, + "step": 16147 + }, + { + "epoch": 4.956414978514426, + "grad_norm": 0.3081825375556946, + "learning_rate": 5.313932754637706e-05, + "loss": 1.7558, + "step": 16148 + }, + { + "epoch": 4.956721915285451, + "grad_norm": 0.25226521492004395, + "learning_rate": 5.3134366778036846e-05, + "loss": 1.8407, + 
"step": 16149 + }, + { + "epoch": 4.957028852056476, + "grad_norm": 0.43601852655410767, + "learning_rate": 5.3129405978720984e-05, + "loss": 1.7762, + "step": 16150 + }, + { + "epoch": 4.957335788827502, + "grad_norm": 0.3630274832248688, + "learning_rate": 5.31244451484785e-05, + "loss": 1.7802, + "step": 16151 + }, + { + "epoch": 4.957642725598527, + "grad_norm": 0.21337948739528656, + "learning_rate": 5.311948428735841e-05, + "loss": 1.7107, + "step": 16152 + }, + { + "epoch": 4.957949662369552, + "grad_norm": 0.38581085205078125, + "learning_rate": 5.311452339540974e-05, + "loss": 1.7583, + "step": 16153 + }, + { + "epoch": 4.958256599140577, + "grad_norm": 0.28447309136390686, + "learning_rate": 5.310956247268154e-05, + "loss": 1.6992, + "step": 16154 + }, + { + "epoch": 4.958563535911602, + "grad_norm": 0.24510730803012848, + "learning_rate": 5.310460151922283e-05, + "loss": 1.7059, + "step": 16155 + }, + { + "epoch": 4.958870472682627, + "grad_norm": 0.41670146584510803, + "learning_rate": 5.309964053508262e-05, + "loss": 1.7191, + "step": 16156 + }, + { + "epoch": 4.959177409453653, + "grad_norm": 0.3123849034309387, + "learning_rate": 5.309467952030993e-05, + "loss": 1.7161, + "step": 16157 + }, + { + "epoch": 4.959484346224678, + "grad_norm": 0.2275281697511673, + "learning_rate": 5.308971847495382e-05, + "loss": 1.722, + "step": 16158 + }, + { + "epoch": 4.9597912829957025, + "grad_norm": 0.40216436982154846, + "learning_rate": 5.308475739906329e-05, + "loss": 1.7477, + "step": 16159 + }, + { + "epoch": 4.960098219766728, + "grad_norm": 0.259981244802475, + "learning_rate": 5.307979629268739e-05, + "loss": 1.7384, + "step": 16160 + }, + { + "epoch": 4.960405156537753, + "grad_norm": 0.22969573736190796, + "learning_rate": 5.3074835155875134e-05, + "loss": 1.7328, + "step": 16161 + }, + { + "epoch": 4.9607120933087785, + "grad_norm": 0.2773746848106384, + "learning_rate": 5.3069873988675556e-05, + "loss": 1.7333, + "step": 16162 + }, + { + "epoch": 
4.961019030079804, + "grad_norm": 0.2764189541339874, + "learning_rate": 5.306491279113768e-05, + "loss": 1.7956, + "step": 16163 + }, + { + "epoch": 4.961325966850829, + "grad_norm": 0.3640958070755005, + "learning_rate": 5.305995156331054e-05, + "loss": 1.7464, + "step": 16164 + }, + { + "epoch": 4.961632903621854, + "grad_norm": 0.3573450446128845, + "learning_rate": 5.305499030524317e-05, + "loss": 1.75, + "step": 16165 + }, + { + "epoch": 4.961939840392879, + "grad_norm": 0.24313980340957642, + "learning_rate": 5.305002901698459e-05, + "loss": 1.7505, + "step": 16166 + }, + { + "epoch": 4.962246777163904, + "grad_norm": 0.3417615592479706, + "learning_rate": 5.304506769858384e-05, + "loss": 1.7387, + "step": 16167 + }, + { + "epoch": 4.96255371393493, + "grad_norm": 0.23209623992443085, + "learning_rate": 5.304010635008995e-05, + "loss": 1.7111, + "step": 16168 + }, + { + "epoch": 4.962860650705955, + "grad_norm": 0.2994776666164398, + "learning_rate": 5.3035144971551944e-05, + "loss": 1.75, + "step": 16169 + }, + { + "epoch": 4.963167587476979, + "grad_norm": 0.3147084712982178, + "learning_rate": 5.303018356301884e-05, + "loss": 1.7598, + "step": 16170 + }, + { + "epoch": 4.963474524248005, + "grad_norm": 0.20136526226997375, + "learning_rate": 5.30252221245397e-05, + "loss": 1.7217, + "step": 16171 + }, + { + "epoch": 4.96378146101903, + "grad_norm": 0.3308684229850769, + "learning_rate": 5.302026065616355e-05, + "loss": 1.7554, + "step": 16172 + }, + { + "epoch": 4.964088397790055, + "grad_norm": 0.22890877723693848, + "learning_rate": 5.30152991579394e-05, + "loss": 1.7598, + "step": 16173 + }, + { + "epoch": 4.964395334561081, + "grad_norm": 0.3036035895347595, + "learning_rate": 5.301033762991631e-05, + "loss": 1.758, + "step": 16174 + }, + { + "epoch": 4.964702271332106, + "grad_norm": 0.2983579933643341, + "learning_rate": 5.300537607214329e-05, + "loss": 1.8132, + "step": 16175 + }, + { + "epoch": 4.9650092081031305, + "grad_norm": 
0.21401815116405487, + "learning_rate": 5.300041448466937e-05, + "loss": 1.7179, + "step": 16176 + }, + { + "epoch": 4.965316144874156, + "grad_norm": 0.2939651608467102, + "learning_rate": 5.2995452867543606e-05, + "loss": 1.7928, + "step": 16177 + }, + { + "epoch": 4.965623081645181, + "grad_norm": 0.24803484976291656, + "learning_rate": 5.2990491220815034e-05, + "loss": 1.7366, + "step": 16178 + }, + { + "epoch": 4.9659300184162065, + "grad_norm": 0.1999569535255432, + "learning_rate": 5.2985529544532656e-05, + "loss": 1.6691, + "step": 16179 + }, + { + "epoch": 4.966236955187231, + "grad_norm": 0.22315269708633423, + "learning_rate": 5.298056783874553e-05, + "loss": 1.7693, + "step": 16180 + }, + { + "epoch": 4.966543891958256, + "grad_norm": 0.22688794136047363, + "learning_rate": 5.2975606103502694e-05, + "loss": 1.8401, + "step": 16181 + }, + { + "epoch": 4.966850828729282, + "grad_norm": 0.2592024505138397, + "learning_rate": 5.297064433885317e-05, + "loss": 1.8054, + "step": 16182 + }, + { + "epoch": 4.967157765500307, + "grad_norm": 0.2508920133113861, + "learning_rate": 5.2965682544846e-05, + "loss": 1.766, + "step": 16183 + }, + { + "epoch": 4.967464702271332, + "grad_norm": 0.22318799793720245, + "learning_rate": 5.296072072153022e-05, + "loss": 1.751, + "step": 16184 + }, + { + "epoch": 4.967771639042358, + "grad_norm": 0.2348448485136032, + "learning_rate": 5.2955758868954855e-05, + "loss": 1.7844, + "step": 16185 + }, + { + "epoch": 4.968078575813382, + "grad_norm": 0.23294343054294586, + "learning_rate": 5.295079698716895e-05, + "loss": 1.7685, + "step": 16186 + }, + { + "epoch": 4.968385512584407, + "grad_norm": 0.20854508876800537, + "learning_rate": 5.2945835076221526e-05, + "loss": 1.6914, + "step": 16187 + }, + { + "epoch": 4.968692449355433, + "grad_norm": 0.21952031552791595, + "learning_rate": 5.294087313616165e-05, + "loss": 1.7121, + "step": 16188 + }, + { + "epoch": 4.968999386126458, + "grad_norm": 0.24097788333892822, + 
"learning_rate": 5.2935911167038346e-05, + "loss": 1.7712, + "step": 16189 + }, + { + "epoch": 4.969306322897483, + "grad_norm": 0.24433603882789612, + "learning_rate": 5.293094916890063e-05, + "loss": 1.7608, + "step": 16190 + }, + { + "epoch": 4.969613259668508, + "grad_norm": 0.22209061682224274, + "learning_rate": 5.292598714179757e-05, + "loss": 1.7563, + "step": 16191 + }, + { + "epoch": 4.969920196439533, + "grad_norm": 0.24291595816612244, + "learning_rate": 5.29210250857782e-05, + "loss": 1.7765, + "step": 16192 + }, + { + "epoch": 4.9702271332105585, + "grad_norm": 0.3143673837184906, + "learning_rate": 5.291606300089151e-05, + "loss": 1.7945, + "step": 16193 + }, + { + "epoch": 4.970534069981584, + "grad_norm": 0.22693613171577454, + "learning_rate": 5.291110088718661e-05, + "loss": 1.7411, + "step": 16194 + }, + { + "epoch": 4.970841006752609, + "grad_norm": 0.2271365374326706, + "learning_rate": 5.2906138744712494e-05, + "loss": 1.7754, + "step": 16195 + }, + { + "epoch": 4.9711479435236345, + "grad_norm": 0.2428499162197113, + "learning_rate": 5.290117657351822e-05, + "loss": 1.8007, + "step": 16196 + }, + { + "epoch": 4.971454880294659, + "grad_norm": 0.21862711012363434, + "learning_rate": 5.289621437365281e-05, + "loss": 1.7484, + "step": 16197 + }, + { + "epoch": 4.971761817065684, + "grad_norm": 0.26744964718818665, + "learning_rate": 5.2891252145165315e-05, + "loss": 1.7759, + "step": 16198 + }, + { + "epoch": 4.97206875383671, + "grad_norm": 0.2608526647090912, + "learning_rate": 5.288628988810477e-05, + "loss": 1.8527, + "step": 16199 + }, + { + "epoch": 4.972375690607735, + "grad_norm": 0.2245805710554123, + "learning_rate": 5.2881327602520216e-05, + "loss": 1.7773, + "step": 16200 + }, + { + "epoch": 4.97268262737876, + "grad_norm": 0.22023041546344757, + "learning_rate": 5.2876365288460694e-05, + "loss": 1.7101, + "step": 16201 + }, + { + "epoch": 4.972989564149785, + "grad_norm": 0.22034525871276855, + "learning_rate": 
5.287140294597525e-05, + "loss": 1.7672, + "step": 16202 + }, + { + "epoch": 4.97329650092081, + "grad_norm": 0.23101158440113068, + "learning_rate": 5.286644057511292e-05, + "loss": 1.741, + "step": 16203 + }, + { + "epoch": 4.973603437691835, + "grad_norm": 0.23050430417060852, + "learning_rate": 5.286147817592273e-05, + "loss": 1.7727, + "step": 16204 + }, + { + "epoch": 4.973910374462861, + "grad_norm": 0.21803520619869232, + "learning_rate": 5.285651574845374e-05, + "loss": 1.7353, + "step": 16205 + }, + { + "epoch": 4.974217311233886, + "grad_norm": 0.22252169251441956, + "learning_rate": 5.2851553292754995e-05, + "loss": 1.7658, + "step": 16206 + }, + { + "epoch": 4.974524248004911, + "grad_norm": 0.22458864748477936, + "learning_rate": 5.284659080887552e-05, + "loss": 1.7157, + "step": 16207 + }, + { + "epoch": 4.974831184775936, + "grad_norm": 0.20769210159778595, + "learning_rate": 5.2841628296864376e-05, + "loss": 1.7731, + "step": 16208 + }, + { + "epoch": 4.975138121546961, + "grad_norm": 0.1952340304851532, + "learning_rate": 5.283666575677059e-05, + "loss": 1.6907, + "step": 16209 + }, + { + "epoch": 4.975445058317987, + "grad_norm": 0.21943804621696472, + "learning_rate": 5.28317031886432e-05, + "loss": 1.8007, + "step": 16210 + }, + { + "epoch": 4.975751995089012, + "grad_norm": 0.21987493336200714, + "learning_rate": 5.2826740592531276e-05, + "loss": 1.7205, + "step": 16211 + }, + { + "epoch": 4.976058931860036, + "grad_norm": 0.2076522558927536, + "learning_rate": 5.2821777968483845e-05, + "loss": 1.7063, + "step": 16212 + }, + { + "epoch": 4.976365868631062, + "grad_norm": 0.19126583635807037, + "learning_rate": 5.281681531654994e-05, + "loss": 1.7118, + "step": 16213 + }, + { + "epoch": 4.976672805402087, + "grad_norm": 0.22308050096035004, + "learning_rate": 5.2811852636778625e-05, + "loss": 1.7565, + "step": 16214 + }, + { + "epoch": 4.976979742173112, + "grad_norm": 0.23187528550624847, + "learning_rate": 5.280688992921893e-05, + "loss": 
1.8261, + "step": 16215 + }, + { + "epoch": 4.977286678944138, + "grad_norm": 0.21373791992664337, + "learning_rate": 5.28019271939199e-05, + "loss": 1.6974, + "step": 16216 + }, + { + "epoch": 4.977593615715163, + "grad_norm": 0.21647346019744873, + "learning_rate": 5.2796964430930585e-05, + "loss": 1.7967, + "step": 16217 + }, + { + "epoch": 4.9779005524861875, + "grad_norm": 0.2231660932302475, + "learning_rate": 5.279200164030002e-05, + "loss": 1.7495, + "step": 16218 + }, + { + "epoch": 4.978207489257213, + "grad_norm": 0.2810545563697815, + "learning_rate": 5.278703882207728e-05, + "loss": 1.875, + "step": 16219 + }, + { + "epoch": 4.978514426028238, + "grad_norm": 0.298984557390213, + "learning_rate": 5.2782075976311374e-05, + "loss": 1.7494, + "step": 16220 + }, + { + "epoch": 4.9788213627992635, + "grad_norm": 0.2530893385410309, + "learning_rate": 5.2777113103051365e-05, + "loss": 1.7594, + "step": 16221 + }, + { + "epoch": 4.979128299570289, + "grad_norm": 0.26165664196014404, + "learning_rate": 5.277215020234629e-05, + "loss": 1.7543, + "step": 16222 + }, + { + "epoch": 4.979435236341313, + "grad_norm": 0.25115957856178284, + "learning_rate": 5.276718727424521e-05, + "loss": 1.7925, + "step": 16223 + }, + { + "epoch": 4.979742173112339, + "grad_norm": 0.22134126722812653, + "learning_rate": 5.276222431879716e-05, + "loss": 1.8359, + "step": 16224 + }, + { + "epoch": 4.980049109883364, + "grad_norm": 0.24447613954544067, + "learning_rate": 5.275726133605119e-05, + "loss": 1.7693, + "step": 16225 + }, + { + "epoch": 4.980356046654389, + "grad_norm": 0.23025095462799072, + "learning_rate": 5.275229832605635e-05, + "loss": 1.7911, + "step": 16226 + }, + { + "epoch": 4.980662983425415, + "grad_norm": 0.23424232006072998, + "learning_rate": 5.2747335288861686e-05, + "loss": 1.7628, + "step": 16227 + }, + { + "epoch": 4.98096992019644, + "grad_norm": 0.24598535895347595, + "learning_rate": 5.2742372224516235e-05, + "loss": 1.7651, + "step": 16228 + }, + { + 
"epoch": 4.981276856967464, + "grad_norm": 0.262893944978714, + "learning_rate": 5.273740913306906e-05, + "loss": 1.7282, + "step": 16229 + }, + { + "epoch": 4.98158379373849, + "grad_norm": 0.21981783211231232, + "learning_rate": 5.2732446014569207e-05, + "loss": 1.7448, + "step": 16230 + }, + { + "epoch": 4.981890730509515, + "grad_norm": 0.24244973063468933, + "learning_rate": 5.272748286906573e-05, + "loss": 1.7216, + "step": 16231 + }, + { + "epoch": 4.98219766728054, + "grad_norm": 0.2365221232175827, + "learning_rate": 5.272251969660766e-05, + "loss": 1.7227, + "step": 16232 + }, + { + "epoch": 4.982504604051566, + "grad_norm": 0.2081129401922226, + "learning_rate": 5.271755649724405e-05, + "loss": 1.7184, + "step": 16233 + }, + { + "epoch": 4.98281154082259, + "grad_norm": 0.2256374955177307, + "learning_rate": 5.271259327102395e-05, + "loss": 1.7412, + "step": 16234 + }, + { + "epoch": 4.9831184775936155, + "grad_norm": 0.23727381229400635, + "learning_rate": 5.270763001799643e-05, + "loss": 1.8095, + "step": 16235 + }, + { + "epoch": 4.983425414364641, + "grad_norm": 0.21498435735702515, + "learning_rate": 5.2702666738210504e-05, + "loss": 1.744, + "step": 16236 + }, + { + "epoch": 4.983732351135666, + "grad_norm": 0.24772173166275024, + "learning_rate": 5.269770343171525e-05, + "loss": 1.741, + "step": 16237 + }, + { + "epoch": 4.9840392879066915, + "grad_norm": 0.2835623621940613, + "learning_rate": 5.269274009855971e-05, + "loss": 1.7765, + "step": 16238 + }, + { + "epoch": 4.984346224677717, + "grad_norm": 0.2570044696331024, + "learning_rate": 5.2687776738792926e-05, + "loss": 1.8206, + "step": 16239 + }, + { + "epoch": 4.984653161448741, + "grad_norm": 0.21549640595912933, + "learning_rate": 5.268281335246397e-05, + "loss": 1.7022, + "step": 16240 + }, + { + "epoch": 4.984960098219767, + "grad_norm": 0.23158684372901917, + "learning_rate": 5.267784993962187e-05, + "loss": 1.7882, + "step": 16241 + }, + { + "epoch": 4.985267034990792, + "grad_norm": 
0.22778423130512238, + "learning_rate": 5.26728865003157e-05, + "loss": 1.7358, + "step": 16242 + }, + { + "epoch": 4.985573971761817, + "grad_norm": 0.23197145760059357, + "learning_rate": 5.266792303459449e-05, + "loss": 1.7687, + "step": 16243 + }, + { + "epoch": 4.985880908532843, + "grad_norm": 0.19270172715187073, + "learning_rate": 5.26629595425073e-05, + "loss": 1.6999, + "step": 16244 + }, + { + "epoch": 4.986187845303867, + "grad_norm": 0.25262632966041565, + "learning_rate": 5.2657996024103175e-05, + "loss": 1.7536, + "step": 16245 + }, + { + "epoch": 4.986494782074892, + "grad_norm": 0.18620926141738892, + "learning_rate": 5.2653032479431185e-05, + "loss": 1.7033, + "step": 16246 + }, + { + "epoch": 4.986801718845918, + "grad_norm": 0.19537273049354553, + "learning_rate": 5.2648068908540374e-05, + "loss": 1.7457, + "step": 16247 + }, + { + "epoch": 4.987108655616943, + "grad_norm": 0.19447599351406097, + "learning_rate": 5.26431053114798e-05, + "loss": 1.7053, + "step": 16248 + }, + { + "epoch": 4.987415592387968, + "grad_norm": 0.20431137084960938, + "learning_rate": 5.263814168829852e-05, + "loss": 1.7695, + "step": 16249 + }, + { + "epoch": 4.987722529158994, + "grad_norm": 0.21123024821281433, + "learning_rate": 5.263317803904554e-05, + "loss": 1.7666, + "step": 16250 + }, + { + "epoch": 4.988029465930018, + "grad_norm": 0.21279335021972656, + "learning_rate": 5.262821436376998e-05, + "loss": 1.7231, + "step": 16251 + }, + { + "epoch": 4.9883364027010435, + "grad_norm": 0.22504910826683044, + "learning_rate": 5.262325066252085e-05, + "loss": 1.7657, + "step": 16252 + }, + { + "epoch": 4.988643339472069, + "grad_norm": 0.23505981266498566, + "learning_rate": 5.261828693534723e-05, + "loss": 1.7576, + "step": 16253 + }, + { + "epoch": 4.988950276243094, + "grad_norm": 0.21553601324558258, + "learning_rate": 5.261332318229817e-05, + "loss": 1.7782, + "step": 16254 + }, + { + "epoch": 4.989257213014119, + "grad_norm": 0.29189521074295044, + 
"learning_rate": 5.26083594034227e-05, + "loss": 1.7664, + "step": 16255 + }, + { + "epoch": 4.989564149785144, + "grad_norm": 0.38108906149864197, + "learning_rate": 5.26033955987699e-05, + "loss": 1.8573, + "step": 16256 + }, + { + "epoch": 4.989871086556169, + "grad_norm": 0.30329224467277527, + "learning_rate": 5.2598431768388824e-05, + "loss": 1.7584, + "step": 16257 + }, + { + "epoch": 4.990178023327195, + "grad_norm": 0.2437417358160019, + "learning_rate": 5.259346791232852e-05, + "loss": 1.7352, + "step": 16258 + }, + { + "epoch": 4.99048496009822, + "grad_norm": 0.3601737320423126, + "learning_rate": 5.258850403063804e-05, + "loss": 1.7206, + "step": 16259 + }, + { + "epoch": 4.990791896869245, + "grad_norm": 0.20259195566177368, + "learning_rate": 5.258354012336646e-05, + "loss": 1.7403, + "step": 16260 + }, + { + "epoch": 4.99109883364027, + "grad_norm": 0.38022148609161377, + "learning_rate": 5.257857619056281e-05, + "loss": 1.7783, + "step": 16261 + }, + { + "epoch": 4.991405770411295, + "grad_norm": 0.30131712555885315, + "learning_rate": 5.257361223227615e-05, + "loss": 1.7826, + "step": 16262 + }, + { + "epoch": 4.99171270718232, + "grad_norm": 0.24159663915634155, + "learning_rate": 5.2568648248555565e-05, + "loss": 1.7792, + "step": 16263 + }, + { + "epoch": 4.992019643953346, + "grad_norm": 0.4641213119029999, + "learning_rate": 5.2563684239450084e-05, + "loss": 1.7432, + "step": 16264 + }, + { + "epoch": 4.992326580724371, + "grad_norm": 0.3526865541934967, + "learning_rate": 5.255872020500877e-05, + "loss": 1.7736, + "step": 16265 + }, + { + "epoch": 4.9926335174953955, + "grad_norm": 0.2396051585674286, + "learning_rate": 5.255375614528071e-05, + "loss": 1.7505, + "step": 16266 + }, + { + "epoch": 4.992940454266421, + "grad_norm": 0.320987343788147, + "learning_rate": 5.25487920603149e-05, + "loss": 1.8229, + "step": 16267 + }, + { + "epoch": 4.993247391037446, + "grad_norm": 0.24689678847789764, + "learning_rate": 5.254382795016044e-05, + 
"loss": 1.7011, + "step": 16268 + }, + { + "epoch": 4.9935543278084715, + "grad_norm": 0.2407137155532837, + "learning_rate": 5.253886381486639e-05, + "loss": 1.741, + "step": 16269 + }, + { + "epoch": 4.993861264579497, + "grad_norm": 0.3677252531051636, + "learning_rate": 5.25338996544818e-05, + "loss": 1.7792, + "step": 16270 + }, + { + "epoch": 4.994168201350522, + "grad_norm": 0.25096553564071655, + "learning_rate": 5.252893546905573e-05, + "loss": 1.7523, + "step": 16271 + }, + { + "epoch": 4.994475138121547, + "grad_norm": 0.2966327965259552, + "learning_rate": 5.252397125863723e-05, + "loss": 1.7114, + "step": 16272 + }, + { + "epoch": 4.994782074892572, + "grad_norm": 0.36577650904655457, + "learning_rate": 5.2519007023275356e-05, + "loss": 1.7609, + "step": 16273 + }, + { + "epoch": 4.995089011663597, + "grad_norm": 0.2450687140226364, + "learning_rate": 5.25140427630192e-05, + "loss": 1.7452, + "step": 16274 + }, + { + "epoch": 4.995395948434623, + "grad_norm": 0.20782120525836945, + "learning_rate": 5.250907847791778e-05, + "loss": 1.7109, + "step": 16275 + }, + { + "epoch": 4.995702885205648, + "grad_norm": 0.2423330545425415, + "learning_rate": 5.25041141680202e-05, + "loss": 1.7234, + "step": 16276 + }, + { + "epoch": 4.996009821976672, + "grad_norm": 0.20855975151062012, + "learning_rate": 5.2499149833375484e-05, + "loss": 1.7734, + "step": 16277 + }, + { + "epoch": 4.996316758747698, + "grad_norm": 0.24400894343852997, + "learning_rate": 5.24941854740327e-05, + "loss": 1.7566, + "step": 16278 + }, + { + "epoch": 4.996623695518723, + "grad_norm": 0.4378018379211426, + "learning_rate": 5.2489221090040906e-05, + "loss": 1.7536, + "step": 16279 + }, + { + "epoch": 4.996930632289748, + "grad_norm": 0.20726722478866577, + "learning_rate": 5.248425668144918e-05, + "loss": 1.8008, + "step": 16280 + }, + { + "epoch": 4.997237569060774, + "grad_norm": 0.2506333589553833, + "learning_rate": 5.247929224830658e-05, + "loss": 1.7404, + "step": 16281 + }, + { + 
"epoch": 4.997544505831799, + "grad_norm": 0.24178004264831543, + "learning_rate": 5.247432779066216e-05, + "loss": 1.7517, + "step": 16282 + }, + { + "epoch": 4.9978514426028235, + "grad_norm": 0.2500220835208893, + "learning_rate": 5.246936330856499e-05, + "loss": 1.7705, + "step": 16283 + }, + { + "epoch": 4.998158379373849, + "grad_norm": 0.30043718218803406, + "learning_rate": 5.24643988020641e-05, + "loss": 1.8118, + "step": 16284 + }, + { + "epoch": 4.998465316144874, + "grad_norm": 0.284805566072464, + "learning_rate": 5.245943427120859e-05, + "loss": 1.7968, + "step": 16285 + }, + { + "epoch": 4.9987722529158995, + "grad_norm": 0.3652406632900238, + "learning_rate": 5.245446971604751e-05, + "loss": 1.7785, + "step": 16286 + }, + { + "epoch": 4.999079189686924, + "grad_norm": 0.24879656732082367, + "learning_rate": 5.244950513662992e-05, + "loss": 1.734, + "step": 16287 + }, + { + "epoch": 4.999386126457949, + "grad_norm": 0.2374224215745926, + "learning_rate": 5.244454053300488e-05, + "loss": 1.7394, + "step": 16288 + }, + { + "epoch": 4.999693063228975, + "grad_norm": 0.27090463042259216, + "learning_rate": 5.243957590522147e-05, + "loss": 1.7529, + "step": 16289 + }, + { + "epoch": 5.0, + "grad_norm": 0.23060791194438934, + "learning_rate": 5.243461125332873e-05, + "loss": 1.7599, + "step": 16290 + }, + { + "epoch": 5.000306936771025, + "grad_norm": 0.21159487962722778, + "learning_rate": 5.242964657737572e-05, + "loss": 1.747, + "step": 16291 + }, + { + "epoch": 5.000613873542051, + "grad_norm": 0.21556304395198822, + "learning_rate": 5.242468187741154e-05, + "loss": 1.7653, + "step": 16292 + }, + { + "epoch": 5.000920810313075, + "grad_norm": 0.2569669783115387, + "learning_rate": 5.241971715348524e-05, + "loss": 1.7284, + "step": 16293 + }, + { + "epoch": 5.0012277470841005, + "grad_norm": 0.2827381491661072, + "learning_rate": 5.241475240564586e-05, + "loss": 1.7765, + "step": 16294 + }, + { + "epoch": 5.001534683855126, + "grad_norm": 
0.22498267889022827, + "learning_rate": 5.240978763394249e-05, + "loss": 1.729, + "step": 16295 + }, + { + "epoch": 5.001841620626151, + "grad_norm": 0.23975814878940582, + "learning_rate": 5.240482283842418e-05, + "loss": 1.7968, + "step": 16296 + }, + { + "epoch": 5.0021485573971765, + "grad_norm": 0.20811420679092407, + "learning_rate": 5.239985801914e-05, + "loss": 1.6931, + "step": 16297 + }, + { + "epoch": 5.002455494168202, + "grad_norm": 0.22985060513019562, + "learning_rate": 5.2394893176139014e-05, + "loss": 1.7724, + "step": 16298 + }, + { + "epoch": 5.002762430939226, + "grad_norm": 0.22867995500564575, + "learning_rate": 5.2389928309470305e-05, + "loss": 1.7179, + "step": 16299 + }, + { + "epoch": 5.003069367710252, + "grad_norm": 0.2543974220752716, + "learning_rate": 5.238496341918293e-05, + "loss": 1.7859, + "step": 16300 + }, + { + "epoch": 5.003376304481277, + "grad_norm": 0.226583793759346, + "learning_rate": 5.237999850532592e-05, + "loss": 1.7567, + "step": 16301 + }, + { + "epoch": 5.003683241252302, + "grad_norm": 0.21744728088378906, + "learning_rate": 5.237503356794838e-05, + "loss": 1.7345, + "step": 16302 + }, + { + "epoch": 5.003990178023328, + "grad_norm": 0.25915467739105225, + "learning_rate": 5.2370068607099373e-05, + "loss": 1.7179, + "step": 16303 + }, + { + "epoch": 5.004297114794352, + "grad_norm": 0.20572461187839508, + "learning_rate": 5.236510362282796e-05, + "loss": 1.7211, + "step": 16304 + }, + { + "epoch": 5.004604051565377, + "grad_norm": 0.2821461856365204, + "learning_rate": 5.236013861518321e-05, + "loss": 1.7894, + "step": 16305 + }, + { + "epoch": 5.004910988336403, + "grad_norm": 0.22273759543895721, + "learning_rate": 5.235517358421417e-05, + "loss": 1.7919, + "step": 16306 + }, + { + "epoch": 5.005217925107428, + "grad_norm": 0.23875468969345093, + "learning_rate": 5.2350208529969935e-05, + "loss": 1.7558, + "step": 16307 + }, + { + "epoch": 5.005524861878453, + "grad_norm": 0.24673783779144287, + "learning_rate": 
5.234524345249955e-05, + "loss": 1.7705, + "step": 16308 + }, + { + "epoch": 5.005831798649478, + "grad_norm": 0.21992872655391693, + "learning_rate": 5.234027835185211e-05, + "loss": 1.7059, + "step": 16309 + }, + { + "epoch": 5.006138735420503, + "grad_norm": 0.19214966893196106, + "learning_rate": 5.233531322807667e-05, + "loss": 1.6647, + "step": 16310 + }, + { + "epoch": 5.0064456721915285, + "grad_norm": 0.18525120615959167, + "learning_rate": 5.233034808122228e-05, + "loss": 1.719, + "step": 16311 + }, + { + "epoch": 5.006752608962554, + "grad_norm": 0.25996243953704834, + "learning_rate": 5.232538291133804e-05, + "loss": 1.7227, + "step": 16312 + }, + { + "epoch": 5.007059545733579, + "grad_norm": 0.2163757085800171, + "learning_rate": 5.232041771847299e-05, + "loss": 1.6962, + "step": 16313 + }, + { + "epoch": 5.0073664825046045, + "grad_norm": 0.23484158515930176, + "learning_rate": 5.231545250267621e-05, + "loss": 1.7816, + "step": 16314 + }, + { + "epoch": 5.007673419275629, + "grad_norm": 0.2188636213541031, + "learning_rate": 5.2310487263996776e-05, + "loss": 1.7477, + "step": 16315 + }, + { + "epoch": 5.007980356046654, + "grad_norm": 0.1950213611125946, + "learning_rate": 5.230552200248377e-05, + "loss": 1.7165, + "step": 16316 + }, + { + "epoch": 5.00828729281768, + "grad_norm": 0.25340089201927185, + "learning_rate": 5.230055671818623e-05, + "loss": 1.7764, + "step": 16317 + }, + { + "epoch": 5.008594229588705, + "grad_norm": 0.23749271035194397, + "learning_rate": 5.2295591411153245e-05, + "loss": 1.7193, + "step": 16318 + }, + { + "epoch": 5.00890116635973, + "grad_norm": 0.2317294180393219, + "learning_rate": 5.229062608143387e-05, + "loss": 1.7607, + "step": 16319 + }, + { + "epoch": 5.009208103130755, + "grad_norm": 0.2751505672931671, + "learning_rate": 5.228566072907719e-05, + "loss": 1.7562, + "step": 16320 + }, + { + "epoch": 5.00951503990178, + "grad_norm": 0.29476025700569153, + "learning_rate": 5.2280695354132267e-05, + "loss": 1.687, 
+ "step": 16321 + }, + { + "epoch": 5.009821976672805, + "grad_norm": 0.20734120905399323, + "learning_rate": 5.227572995664819e-05, + "loss": 1.7608, + "step": 16322 + }, + { + "epoch": 5.010128913443831, + "grad_norm": 0.2537878155708313, + "learning_rate": 5.227076453667401e-05, + "loss": 1.7947, + "step": 16323 + }, + { + "epoch": 5.010435850214856, + "grad_norm": 0.23516076803207397, + "learning_rate": 5.2265799094258796e-05, + "loss": 1.7545, + "step": 16324 + }, + { + "epoch": 5.0107427869858805, + "grad_norm": 0.2581529915332794, + "learning_rate": 5.226083362945162e-05, + "loss": 1.7529, + "step": 16325 + }, + { + "epoch": 5.011049723756906, + "grad_norm": 0.2982035279273987, + "learning_rate": 5.225586814230158e-05, + "loss": 1.74, + "step": 16326 + }, + { + "epoch": 5.011356660527931, + "grad_norm": 0.2773981988430023, + "learning_rate": 5.225090263285772e-05, + "loss": 1.7562, + "step": 16327 + }, + { + "epoch": 5.0116635972989565, + "grad_norm": 0.19992689788341522, + "learning_rate": 5.2245937101169116e-05, + "loss": 1.6896, + "step": 16328 + }, + { + "epoch": 5.011970534069982, + "grad_norm": 0.2913428246974945, + "learning_rate": 5.224097154728486e-05, + "loss": 1.7574, + "step": 16329 + }, + { + "epoch": 5.012277470841007, + "grad_norm": 0.23173104226589203, + "learning_rate": 5.2236005971254e-05, + "loss": 1.6954, + "step": 16330 + }, + { + "epoch": 5.012584407612032, + "grad_norm": 0.2019525170326233, + "learning_rate": 5.2231040373125614e-05, + "loss": 1.7711, + "step": 16331 + }, + { + "epoch": 5.012891344383057, + "grad_norm": 0.29070746898651123, + "learning_rate": 5.222607475294878e-05, + "loss": 1.8201, + "step": 16332 + }, + { + "epoch": 5.013198281154082, + "grad_norm": 0.22005079686641693, + "learning_rate": 5.222110911077258e-05, + "loss": 1.7421, + "step": 16333 + }, + { + "epoch": 5.013505217925108, + "grad_norm": 0.24422192573547363, + "learning_rate": 5.2216143446646085e-05, + "loss": 1.7074, + "step": 16334 + }, + { + "epoch": 
5.013812154696133, + "grad_norm": 0.2417927384376526, + "learning_rate": 5.221117776061836e-05, + "loss": 1.7726, + "step": 16335 + }, + { + "epoch": 5.014119091467157, + "grad_norm": 0.245828777551651, + "learning_rate": 5.2206212052738454e-05, + "loss": 1.7932, + "step": 16336 + }, + { + "epoch": 5.014426028238183, + "grad_norm": 0.24054239690303802, + "learning_rate": 5.220124632305548e-05, + "loss": 1.727, + "step": 16337 + }, + { + "epoch": 5.014732965009208, + "grad_norm": 0.2572494149208069, + "learning_rate": 5.21962805716185e-05, + "loss": 1.7234, + "step": 16338 + }, + { + "epoch": 5.015039901780233, + "grad_norm": 0.33624622225761414, + "learning_rate": 5.2191314798476595e-05, + "loss": 1.7499, + "step": 16339 + }, + { + "epoch": 5.015346838551259, + "grad_norm": 0.22321413457393646, + "learning_rate": 5.218634900367883e-05, + "loss": 1.7155, + "step": 16340 + }, + { + "epoch": 5.015653775322283, + "grad_norm": 0.26709917187690735, + "learning_rate": 5.218138318727429e-05, + "loss": 1.8346, + "step": 16341 + }, + { + "epoch": 5.0159607120933085, + "grad_norm": 0.27600952982902527, + "learning_rate": 5.217641734931202e-05, + "loss": 1.789, + "step": 16342 + }, + { + "epoch": 5.016267648864334, + "grad_norm": 0.21392405033111572, + "learning_rate": 5.217145148984114e-05, + "loss": 1.7266, + "step": 16343 + }, + { + "epoch": 5.016574585635359, + "grad_norm": 0.3215450942516327, + "learning_rate": 5.2166485608910696e-05, + "loss": 1.7453, + "step": 16344 + }, + { + "epoch": 5.0168815224063845, + "grad_norm": 0.22328032553195953, + "learning_rate": 5.2161519706569776e-05, + "loss": 1.7209, + "step": 16345 + }, + { + "epoch": 5.01718845917741, + "grad_norm": 0.2438887059688568, + "learning_rate": 5.215655378286744e-05, + "loss": 1.7289, + "step": 16346 + }, + { + "epoch": 5.017495395948434, + "grad_norm": 0.30078747868537903, + "learning_rate": 5.2151587837852786e-05, + "loss": 1.7483, + "step": 16347 + }, + { + "epoch": 5.01780233271946, + "grad_norm": 
0.21723167598247528, + "learning_rate": 5.214662187157488e-05, + "loss": 1.7654, + "step": 16348 + }, + { + "epoch": 5.018109269490485, + "grad_norm": 0.26358669996261597, + "learning_rate": 5.2141655884082784e-05, + "loss": 1.7563, + "step": 16349 + }, + { + "epoch": 5.01841620626151, + "grad_norm": 0.24285505712032318, + "learning_rate": 5.2136689875425615e-05, + "loss": 1.7377, + "step": 16350 + }, + { + "epoch": 5.018723143032536, + "grad_norm": 0.2401108294725418, + "learning_rate": 5.2131723845652416e-05, + "loss": 1.7445, + "step": 16351 + }, + { + "epoch": 5.01903007980356, + "grad_norm": 0.3347793519496918, + "learning_rate": 5.212675779481226e-05, + "loss": 1.7872, + "step": 16352 + }, + { + "epoch": 5.019337016574585, + "grad_norm": 0.306728720664978, + "learning_rate": 5.212179172295424e-05, + "loss": 1.8051, + "step": 16353 + }, + { + "epoch": 5.019643953345611, + "grad_norm": 0.22297725081443787, + "learning_rate": 5.211682563012743e-05, + "loss": 1.7082, + "step": 16354 + }, + { + "epoch": 5.019950890116636, + "grad_norm": 0.24047277867794037, + "learning_rate": 5.211185951638091e-05, + "loss": 1.7024, + "step": 16355 + }, + { + "epoch": 5.020257826887661, + "grad_norm": 0.19570080935955048, + "learning_rate": 5.210689338176377e-05, + "loss": 1.6947, + "step": 16356 + }, + { + "epoch": 5.020564763658686, + "grad_norm": 0.2024889886379242, + "learning_rate": 5.2101927226325066e-05, + "loss": 1.7168, + "step": 16357 + }, + { + "epoch": 5.020871700429711, + "grad_norm": 0.23546278476715088, + "learning_rate": 5.209696105011388e-05, + "loss": 1.7697, + "step": 16358 + }, + { + "epoch": 5.0211786372007365, + "grad_norm": 0.21003498136997223, + "learning_rate": 5.209199485317928e-05, + "loss": 1.7198, + "step": 16359 + }, + { + "epoch": 5.021485573971762, + "grad_norm": 0.21375493705272675, + "learning_rate": 5.208702863557039e-05, + "loss": 1.7689, + "step": 16360 + }, + { + "epoch": 5.021792510742787, + "grad_norm": 0.21549762785434723, + 
"learning_rate": 5.2082062397336254e-05, + "loss": 1.6936, + "step": 16361 + }, + { + "epoch": 5.0220994475138125, + "grad_norm": 0.22633691132068634, + "learning_rate": 5.207709613852595e-05, + "loss": 1.7512, + "step": 16362 + }, + { + "epoch": 5.022406384284837, + "grad_norm": 0.21888238191604614, + "learning_rate": 5.2072129859188566e-05, + "loss": 1.7082, + "step": 16363 + }, + { + "epoch": 5.022713321055862, + "grad_norm": 0.2416619062423706, + "learning_rate": 5.206716355937318e-05, + "loss": 1.7938, + "step": 16364 + }, + { + "epoch": 5.023020257826888, + "grad_norm": 0.22451527416706085, + "learning_rate": 5.206219723912886e-05, + "loss": 1.7372, + "step": 16365 + }, + { + "epoch": 5.023327194597913, + "grad_norm": 0.19698494672775269, + "learning_rate": 5.2057230898504716e-05, + "loss": 1.7205, + "step": 16366 + }, + { + "epoch": 5.023634131368938, + "grad_norm": 0.2441127747297287, + "learning_rate": 5.205226453754982e-05, + "loss": 1.7625, + "step": 16367 + }, + { + "epoch": 5.023941068139963, + "grad_norm": 0.21940121054649353, + "learning_rate": 5.204729815631323e-05, + "loss": 1.7985, + "step": 16368 + }, + { + "epoch": 5.024248004910988, + "grad_norm": 0.21751399338245392, + "learning_rate": 5.204233175484403e-05, + "loss": 1.7759, + "step": 16369 + }, + { + "epoch": 5.024554941682013, + "grad_norm": 0.20261377096176147, + "learning_rate": 5.2037365333191315e-05, + "loss": 1.746, + "step": 16370 + }, + { + "epoch": 5.024861878453039, + "grad_norm": 0.2628774046897888, + "learning_rate": 5.2032398891404166e-05, + "loss": 1.8178, + "step": 16371 + }, + { + "epoch": 5.025168815224064, + "grad_norm": 0.20626378059387207, + "learning_rate": 5.2027432429531665e-05, + "loss": 1.7456, + "step": 16372 + }, + { + "epoch": 5.0254757519950894, + "grad_norm": 0.25548869371414185, + "learning_rate": 5.2022465947622876e-05, + "loss": 1.8098, + "step": 16373 + }, + { + "epoch": 5.025782688766114, + "grad_norm": 0.1978374719619751, + "learning_rate": 
5.20174994457269e-05, + "loss": 1.685, + "step": 16374 + }, + { + "epoch": 5.026089625537139, + "grad_norm": 0.2708980143070221, + "learning_rate": 5.201253292389282e-05, + "loss": 1.7464, + "step": 16375 + }, + { + "epoch": 5.026396562308165, + "grad_norm": 0.2730494737625122, + "learning_rate": 5.2007566382169706e-05, + "loss": 1.7391, + "step": 16376 + }, + { + "epoch": 5.02670349907919, + "grad_norm": 0.243557408452034, + "learning_rate": 5.2002599820606624e-05, + "loss": 1.7439, + "step": 16377 + }, + { + "epoch": 5.027010435850215, + "grad_norm": 0.2208259105682373, + "learning_rate": 5.19976332392527e-05, + "loss": 1.7612, + "step": 16378 + }, + { + "epoch": 5.02731737262124, + "grad_norm": 0.21288715302944183, + "learning_rate": 5.199266663815698e-05, + "loss": 1.7546, + "step": 16379 + }, + { + "epoch": 5.027624309392265, + "grad_norm": 0.2106054425239563, + "learning_rate": 5.198770001736857e-05, + "loss": 1.7281, + "step": 16380 + }, + { + "epoch": 5.02793124616329, + "grad_norm": 0.2247164249420166, + "learning_rate": 5.198273337693654e-05, + "loss": 1.8405, + "step": 16381 + }, + { + "epoch": 5.028238182934316, + "grad_norm": 0.21713724732398987, + "learning_rate": 5.197776671690998e-05, + "loss": 1.7333, + "step": 16382 + }, + { + "epoch": 5.028545119705341, + "grad_norm": 0.24063727259635925, + "learning_rate": 5.1972800037337956e-05, + "loss": 1.7608, + "step": 16383 + }, + { + "epoch": 5.0288520564763655, + "grad_norm": 0.22022177278995514, + "learning_rate": 5.196783333826959e-05, + "loss": 1.7045, + "step": 16384 + }, + { + "epoch": 5.029158993247391, + "grad_norm": 0.21348948776721954, + "learning_rate": 5.1962866619753927e-05, + "loss": 1.7516, + "step": 16385 + }, + { + "epoch": 5.029465930018416, + "grad_norm": 0.289315789937973, + "learning_rate": 5.195789988184007e-05, + "loss": 1.8555, + "step": 16386 + }, + { + "epoch": 5.0297728667894415, + "grad_norm": 0.30966848134994507, + "learning_rate": 5.19529331245771e-05, + "loss": 1.7245, + 
"step": 16387 + }, + { + "epoch": 5.030079803560467, + "grad_norm": 0.24625633656978607, + "learning_rate": 5.194796634801409e-05, + "loss": 1.7788, + "step": 16388 + }, + { + "epoch": 5.030386740331492, + "grad_norm": 0.25937986373901367, + "learning_rate": 5.1942999552200136e-05, + "loss": 1.7655, + "step": 16389 + }, + { + "epoch": 5.030693677102517, + "grad_norm": 0.3056741952896118, + "learning_rate": 5.1938032737184325e-05, + "loss": 1.7167, + "step": 16390 + }, + { + "epoch": 5.031000613873542, + "grad_norm": 0.29773563146591187, + "learning_rate": 5.1933065903015743e-05, + "loss": 1.7247, + "step": 16391 + }, + { + "epoch": 5.031307550644567, + "grad_norm": 0.26433971524238586, + "learning_rate": 5.192809904974347e-05, + "loss": 1.7779, + "step": 16392 + }, + { + "epoch": 5.031614487415593, + "grad_norm": 0.3308073580265045, + "learning_rate": 5.192313217741659e-05, + "loss": 1.7782, + "step": 16393 + }, + { + "epoch": 5.031921424186618, + "grad_norm": 0.2584165632724762, + "learning_rate": 5.1918165286084176e-05, + "loss": 1.7812, + "step": 16394 + }, + { + "epoch": 5.032228360957642, + "grad_norm": 0.31678953766822815, + "learning_rate": 5.1913198375795346e-05, + "loss": 1.7341, + "step": 16395 + }, + { + "epoch": 5.032535297728668, + "grad_norm": 0.3527325391769409, + "learning_rate": 5.190823144659916e-05, + "loss": 1.7844, + "step": 16396 + }, + { + "epoch": 5.032842234499693, + "grad_norm": 0.29233935475349426, + "learning_rate": 5.1903264498544724e-05, + "loss": 1.7993, + "step": 16397 + }, + { + "epoch": 5.033149171270718, + "grad_norm": 0.24549467861652374, + "learning_rate": 5.1898297531681106e-05, + "loss": 1.7294, + "step": 16398 + }, + { + "epoch": 5.033456108041744, + "grad_norm": 0.3446930944919586, + "learning_rate": 5.18933305460574e-05, + "loss": 1.6818, + "step": 16399 + }, + { + "epoch": 5.033763044812768, + "grad_norm": 0.2628229856491089, + "learning_rate": 5.188836354172268e-05, + "loss": 1.7867, + "step": 16400 + }, + { + "epoch": 
5.0340699815837935, + "grad_norm": 0.26548629999160767, + "learning_rate": 5.188339651872607e-05, + "loss": 1.7448, + "step": 16401 + }, + { + "epoch": 5.034376918354819, + "grad_norm": 0.29242032766342163, + "learning_rate": 5.187842947711662e-05, + "loss": 1.7103, + "step": 16402 + }, + { + "epoch": 5.034683855125844, + "grad_norm": 0.2515408992767334, + "learning_rate": 5.187346241694343e-05, + "loss": 1.7865, + "step": 16403 + }, + { + "epoch": 5.0349907918968695, + "grad_norm": 0.2253103256225586, + "learning_rate": 5.186849533825559e-05, + "loss": 1.6993, + "step": 16404 + }, + { + "epoch": 5.035297728667895, + "grad_norm": 0.2743360102176666, + "learning_rate": 5.1863528241102154e-05, + "loss": 1.7532, + "step": 16405 + }, + { + "epoch": 5.035604665438919, + "grad_norm": 0.22807851433753967, + "learning_rate": 5.185856112553227e-05, + "loss": 1.7873, + "step": 16406 + }, + { + "epoch": 5.035911602209945, + "grad_norm": 0.23719090223312378, + "learning_rate": 5.1853593991594985e-05, + "loss": 1.7555, + "step": 16407 + }, + { + "epoch": 5.03621853898097, + "grad_norm": 0.2964477241039276, + "learning_rate": 5.184862683933941e-05, + "loss": 1.7204, + "step": 16408 + }, + { + "epoch": 5.036525475751995, + "grad_norm": 0.23717865347862244, + "learning_rate": 5.18436596688146e-05, + "loss": 1.7239, + "step": 16409 + }, + { + "epoch": 5.036832412523021, + "grad_norm": 0.22650085389614105, + "learning_rate": 5.1838692480069686e-05, + "loss": 1.7148, + "step": 16410 + }, + { + "epoch": 5.037139349294045, + "grad_norm": 0.25606781244277954, + "learning_rate": 5.183372527315371e-05, + "loss": 1.7916, + "step": 16411 + }, + { + "epoch": 5.03744628606507, + "grad_norm": 0.22266390919685364, + "learning_rate": 5.182875804811581e-05, + "loss": 1.7481, + "step": 16412 + }, + { + "epoch": 5.037753222836096, + "grad_norm": 0.23481780290603638, + "learning_rate": 5.1823790805005045e-05, + "loss": 1.8014, + "step": 16413 + }, + { + "epoch": 5.038060159607121, + "grad_norm": 
0.2629338800907135, + "learning_rate": 5.1818823543870506e-05, + "loss": 1.81, + "step": 16414 + }, + { + "epoch": 5.038367096378146, + "grad_norm": 0.22891482710838318, + "learning_rate": 5.18138562647613e-05, + "loss": 1.757, + "step": 16415 + }, + { + "epoch": 5.038674033149171, + "grad_norm": 0.2666641175746918, + "learning_rate": 5.180888896772649e-05, + "loss": 1.7457, + "step": 16416 + }, + { + "epoch": 5.038980969920196, + "grad_norm": 0.37610310316085815, + "learning_rate": 5.180392165281517e-05, + "loss": 1.8214, + "step": 16417 + }, + { + "epoch": 5.0392879066912215, + "grad_norm": 0.2521277964115143, + "learning_rate": 5.1798954320076455e-05, + "loss": 1.7731, + "step": 16418 + }, + { + "epoch": 5.039594843462247, + "grad_norm": 0.25097090005874634, + "learning_rate": 5.1793986969559415e-05, + "loss": 1.8029, + "step": 16419 + }, + { + "epoch": 5.039901780233272, + "grad_norm": 0.2946726381778717, + "learning_rate": 5.178901960131315e-05, + "loss": 1.7483, + "step": 16420 + }, + { + "epoch": 5.0402087170042975, + "grad_norm": 0.24240419268608093, + "learning_rate": 5.1784052215386736e-05, + "loss": 1.731, + "step": 16421 + }, + { + "epoch": 5.040515653775322, + "grad_norm": 0.2403198480606079, + "learning_rate": 5.177908481182926e-05, + "loss": 1.722, + "step": 16422 + }, + { + "epoch": 5.040822590546347, + "grad_norm": 0.3451874554157257, + "learning_rate": 5.177411739068985e-05, + "loss": 1.7562, + "step": 16423 + }, + { + "epoch": 5.041129527317373, + "grad_norm": 0.3244951069355011, + "learning_rate": 5.176914995201756e-05, + "loss": 1.7321, + "step": 16424 + }, + { + "epoch": 5.041436464088398, + "grad_norm": 0.2346230000257492, + "learning_rate": 5.176418249586149e-05, + "loss": 1.7839, + "step": 16425 + }, + { + "epoch": 5.041743400859423, + "grad_norm": 0.357022225856781, + "learning_rate": 5.1759215022270744e-05, + "loss": 1.7776, + "step": 16426 + }, + { + "epoch": 5.042050337630448, + "grad_norm": 0.259007066488266, + "learning_rate": 
5.17542475312944e-05, + "loss": 1.7544, + "step": 16427 + }, + { + "epoch": 5.042357274401473, + "grad_norm": 0.2516533136367798, + "learning_rate": 5.174928002298154e-05, + "loss": 1.7269, + "step": 16428 + }, + { + "epoch": 5.042664211172498, + "grad_norm": 0.3393619954586029, + "learning_rate": 5.174431249738129e-05, + "loss": 1.7487, + "step": 16429 + }, + { + "epoch": 5.042971147943524, + "grad_norm": 0.2730594873428345, + "learning_rate": 5.1739344954542714e-05, + "loss": 1.7468, + "step": 16430 + }, + { + "epoch": 5.043278084714549, + "grad_norm": 0.21233965456485748, + "learning_rate": 5.1734377394514914e-05, + "loss": 1.783, + "step": 16431 + }, + { + "epoch": 5.043585021485574, + "grad_norm": 0.3460896909236908, + "learning_rate": 5.1729409817346974e-05, + "loss": 1.7497, + "step": 16432 + }, + { + "epoch": 5.043891958256599, + "grad_norm": 0.31918221712112427, + "learning_rate": 5.1724442223088e-05, + "loss": 1.7834, + "step": 16433 + }, + { + "epoch": 5.044198895027624, + "grad_norm": 0.23016802966594696, + "learning_rate": 5.171947461178706e-05, + "loss": 1.7348, + "step": 16434 + }, + { + "epoch": 5.0445058317986495, + "grad_norm": 0.35758304595947266, + "learning_rate": 5.171450698349329e-05, + "loss": 1.7734, + "step": 16435 + }, + { + "epoch": 5.044812768569675, + "grad_norm": 0.279725581407547, + "learning_rate": 5.170953933825574e-05, + "loss": 1.7283, + "step": 16436 + }, + { + "epoch": 5.0451197053407, + "grad_norm": 0.23965120315551758, + "learning_rate": 5.170457167612354e-05, + "loss": 1.7606, + "step": 16437 + }, + { + "epoch": 5.045426642111725, + "grad_norm": 0.28026309609413147, + "learning_rate": 5.169960399714574e-05, + "loss": 1.7872, + "step": 16438 + }, + { + "epoch": 5.04573357888275, + "grad_norm": 0.3262448012828827, + "learning_rate": 5.169463630137146e-05, + "loss": 1.8654, + "step": 16439 + }, + { + "epoch": 5.046040515653775, + "grad_norm": 0.4249584674835205, + "learning_rate": 5.168966858884979e-05, + "loss": 1.7244, + 
"step": 16440 + }, + { + "epoch": 5.046347452424801, + "grad_norm": 0.3385370969772339, + "learning_rate": 5.168470085962984e-05, + "loss": 1.7745, + "step": 16441 + }, + { + "epoch": 5.046654389195826, + "grad_norm": 0.2321811318397522, + "learning_rate": 5.1679733113760675e-05, + "loss": 1.8093, + "step": 16442 + }, + { + "epoch": 5.04696132596685, + "grad_norm": 0.3426755368709564, + "learning_rate": 5.167476535129141e-05, + "loss": 1.7752, + "step": 16443 + }, + { + "epoch": 5.047268262737876, + "grad_norm": 0.27672505378723145, + "learning_rate": 5.166979757227114e-05, + "loss": 1.7619, + "step": 16444 + }, + { + "epoch": 5.047575199508901, + "grad_norm": 0.4111184775829315, + "learning_rate": 5.1664829776748925e-05, + "loss": 1.7672, + "step": 16445 + }, + { + "epoch": 5.047882136279926, + "grad_norm": 0.40139874815940857, + "learning_rate": 5.1659861964773905e-05, + "loss": 1.7753, + "step": 16446 + }, + { + "epoch": 5.048189073050952, + "grad_norm": 0.28931725025177, + "learning_rate": 5.165489413639516e-05, + "loss": 1.7607, + "step": 16447 + }, + { + "epoch": 5.048496009821977, + "grad_norm": 0.297538161277771, + "learning_rate": 5.1649926291661775e-05, + "loss": 1.7661, + "step": 16448 + }, + { + "epoch": 5.0488029465930016, + "grad_norm": 0.4299027621746063, + "learning_rate": 5.1644958430622846e-05, + "loss": 1.6998, + "step": 16449 + }, + { + "epoch": 5.049109883364027, + "grad_norm": 0.2554767429828644, + "learning_rate": 5.163999055332749e-05, + "loss": 1.7716, + "step": 16450 + }, + { + "epoch": 5.049416820135052, + "grad_norm": 0.3561006486415863, + "learning_rate": 5.163502265982477e-05, + "loss": 1.7493, + "step": 16451 + }, + { + "epoch": 5.0497237569060776, + "grad_norm": 0.3839687407016754, + "learning_rate": 5.1630054750163806e-05, + "loss": 1.7314, + "step": 16452 + }, + { + "epoch": 5.050030693677103, + "grad_norm": 0.20022284984588623, + "learning_rate": 5.1625086824393684e-05, + "loss": 1.6992, + "step": 16453 + }, + { + "epoch": 
5.050337630448127, + "grad_norm": 0.36830398440361023, + "learning_rate": 5.162011888256349e-05, + "loss": 1.7339, + "step": 16454 + }, + { + "epoch": 5.050644567219153, + "grad_norm": 0.31947389245033264, + "learning_rate": 5.161515092472236e-05, + "loss": 1.7254, + "step": 16455 + }, + { + "epoch": 5.050951503990178, + "grad_norm": 0.2779252827167511, + "learning_rate": 5.161018295091933e-05, + "loss": 1.7941, + "step": 16456 + }, + { + "epoch": 5.051258440761203, + "grad_norm": 0.3796578347682953, + "learning_rate": 5.160521496120354e-05, + "loss": 1.7389, + "step": 16457 + }, + { + "epoch": 5.051565377532229, + "grad_norm": 0.23569442331790924, + "learning_rate": 5.1600246955624076e-05, + "loss": 1.7149, + "step": 16458 + }, + { + "epoch": 5.051872314303253, + "grad_norm": 0.27342507243156433, + "learning_rate": 5.159527893423004e-05, + "loss": 1.699, + "step": 16459 + }, + { + "epoch": 5.0521792510742785, + "grad_norm": 0.2877296209335327, + "learning_rate": 5.159031089707052e-05, + "loss": 1.7668, + "step": 16460 + }, + { + "epoch": 5.052486187845304, + "grad_norm": 0.21482446789741516, + "learning_rate": 5.1585342844194605e-05, + "loss": 1.7132, + "step": 16461 + }, + { + "epoch": 5.052793124616329, + "grad_norm": 0.23588669300079346, + "learning_rate": 5.158037477565142e-05, + "loss": 1.7267, + "step": 16462 + }, + { + "epoch": 5.0531000613873545, + "grad_norm": 0.20188623666763306, + "learning_rate": 5.157540669149003e-05, + "loss": 1.7486, + "step": 16463 + }, + { + "epoch": 5.05340699815838, + "grad_norm": 0.2012643963098526, + "learning_rate": 5.157043859175955e-05, + "loss": 1.718, + "step": 16464 + }, + { + "epoch": 5.053713934929404, + "grad_norm": 0.23133818805217743, + "learning_rate": 5.156547047650908e-05, + "loss": 1.7892, + "step": 16465 + }, + { + "epoch": 5.05402087170043, + "grad_norm": 0.2524542510509491, + "learning_rate": 5.156050234578771e-05, + "loss": 1.8034, + "step": 16466 + }, + { + "epoch": 5.054327808471455, + "grad_norm": 
0.20992529392242432, + "learning_rate": 5.155553419964454e-05, + "loss": 1.7158, + "step": 16467 + }, + { + "epoch": 5.05463474524248, + "grad_norm": 0.23815447092056274, + "learning_rate": 5.155056603812868e-05, + "loss": 1.7632, + "step": 16468 + }, + { + "epoch": 5.054941682013506, + "grad_norm": 0.3306051790714264, + "learning_rate": 5.1545597861289205e-05, + "loss": 1.7719, + "step": 16469 + }, + { + "epoch": 5.05524861878453, + "grad_norm": 0.287541925907135, + "learning_rate": 5.154062966917523e-05, + "loss": 1.7092, + "step": 16470 + }, + { + "epoch": 5.055555555555555, + "grad_norm": 0.28186658024787903, + "learning_rate": 5.153566146183586e-05, + "loss": 1.8548, + "step": 16471 + }, + { + "epoch": 5.055862492326581, + "grad_norm": 0.3511136472225189, + "learning_rate": 5.153069323932017e-05, + "loss": 1.8029, + "step": 16472 + }, + { + "epoch": 5.056169429097606, + "grad_norm": 0.32083824276924133, + "learning_rate": 5.152572500167728e-05, + "loss": 1.7321, + "step": 16473 + }, + { + "epoch": 5.056476365868631, + "grad_norm": 0.22571051120758057, + "learning_rate": 5.1520756748956265e-05, + "loss": 1.7218, + "step": 16474 + }, + { + "epoch": 5.056783302639656, + "grad_norm": 0.2902646064758301, + "learning_rate": 5.151578848120626e-05, + "loss": 1.7231, + "step": 16475 + }, + { + "epoch": 5.057090239410681, + "grad_norm": 0.20447610318660736, + "learning_rate": 5.1510820198476336e-05, + "loss": 1.6998, + "step": 16476 + }, + { + "epoch": 5.0573971761817065, + "grad_norm": 0.29436638951301575, + "learning_rate": 5.1505851900815606e-05, + "loss": 1.6793, + "step": 16477 + }, + { + "epoch": 5.057704112952732, + "grad_norm": 0.29718565940856934, + "learning_rate": 5.1500883588273164e-05, + "loss": 1.8322, + "step": 16478 + }, + { + "epoch": 5.058011049723757, + "grad_norm": 0.23530519008636475, + "learning_rate": 5.149591526089811e-05, + "loss": 1.7408, + "step": 16479 + }, + { + "epoch": 5.0583179864947825, + "grad_norm": 0.30735042691230774, + 
"learning_rate": 5.1490946918739536e-05, + "loss": 1.7454, + "step": 16480 + }, + { + "epoch": 5.058624923265807, + "grad_norm": 0.26151445508003235, + "learning_rate": 5.148597856184656e-05, + "loss": 1.7728, + "step": 16481 + }, + { + "epoch": 5.058931860036832, + "grad_norm": 0.2657756209373474, + "learning_rate": 5.1481010190268263e-05, + "loss": 1.7905, + "step": 16482 + }, + { + "epoch": 5.059238796807858, + "grad_norm": 0.25418251752853394, + "learning_rate": 5.147604180405376e-05, + "loss": 1.7676, + "step": 16483 + }, + { + "epoch": 5.059545733578883, + "grad_norm": 0.25486254692077637, + "learning_rate": 5.1471073403252154e-05, + "loss": 1.8347, + "step": 16484 + }, + { + "epoch": 5.059852670349908, + "grad_norm": 0.22693100571632385, + "learning_rate": 5.146610498791255e-05, + "loss": 1.7308, + "step": 16485 + }, + { + "epoch": 5.060159607120933, + "grad_norm": 0.22056837379932404, + "learning_rate": 5.146113655808401e-05, + "loss": 1.7158, + "step": 16486 + }, + { + "epoch": 5.060466543891958, + "grad_norm": 0.221246138215065, + "learning_rate": 5.1456168113815685e-05, + "loss": 1.6985, + "step": 16487 + }, + { + "epoch": 5.060773480662983, + "grad_norm": 0.2149408906698227, + "learning_rate": 5.145119965515664e-05, + "loss": 1.716, + "step": 16488 + }, + { + "epoch": 5.061080417434009, + "grad_norm": 0.23958513140678406, + "learning_rate": 5.144623118215599e-05, + "loss": 1.8092, + "step": 16489 + }, + { + "epoch": 5.061387354205034, + "grad_norm": 0.2870621085166931, + "learning_rate": 5.1441262694862836e-05, + "loss": 1.75, + "step": 16490 + }, + { + "epoch": 5.0616942909760585, + "grad_norm": 0.26755061745643616, + "learning_rate": 5.1436294193326276e-05, + "loss": 1.7848, + "step": 16491 + }, + { + "epoch": 5.062001227747084, + "grad_norm": 0.2434249073266983, + "learning_rate": 5.143132567759542e-05, + "loss": 1.7487, + "step": 16492 + }, + { + "epoch": 5.062308164518109, + "grad_norm": 0.3044668138027191, + "learning_rate": 5.142635714771936e-05, 
+ "loss": 1.741, + "step": 16493 + }, + { + "epoch": 5.0626151012891345, + "grad_norm": 0.2166958749294281, + "learning_rate": 5.142138860374721e-05, + "loss": 1.7232, + "step": 16494 + }, + { + "epoch": 5.06292203806016, + "grad_norm": 0.34558552503585815, + "learning_rate": 5.141642004572806e-05, + "loss": 1.7663, + "step": 16495 + }, + { + "epoch": 5.063228974831185, + "grad_norm": 0.330751895904541, + "learning_rate": 5.141145147371102e-05, + "loss": 1.6818, + "step": 16496 + }, + { + "epoch": 5.06353591160221, + "grad_norm": 0.21613973379135132, + "learning_rate": 5.140648288774518e-05, + "loss": 1.7914, + "step": 16497 + }, + { + "epoch": 5.063842848373235, + "grad_norm": 0.32759732007980347, + "learning_rate": 5.140151428787966e-05, + "loss": 1.7543, + "step": 16498 + }, + { + "epoch": 5.06414978514426, + "grad_norm": 0.3180293142795563, + "learning_rate": 5.1396545674163556e-05, + "loss": 1.8163, + "step": 16499 + }, + { + "epoch": 5.064456721915286, + "grad_norm": 0.19757944345474243, + "learning_rate": 5.1391577046645964e-05, + "loss": 1.71, + "step": 16500 + }, + { + "epoch": 5.064763658686311, + "grad_norm": 0.253366619348526, + "learning_rate": 5.1386608405376005e-05, + "loss": 1.7266, + "step": 16501 + }, + { + "epoch": 5.065070595457335, + "grad_norm": 0.24577608704566956, + "learning_rate": 5.1381639750402754e-05, + "loss": 1.7218, + "step": 16502 + }, + { + "epoch": 5.065377532228361, + "grad_norm": 0.22847014665603638, + "learning_rate": 5.137667108177533e-05, + "loss": 1.8025, + "step": 16503 + }, + { + "epoch": 5.065684468999386, + "grad_norm": 0.2089833766222, + "learning_rate": 5.137170239954284e-05, + "loss": 1.8032, + "step": 16504 + }, + { + "epoch": 5.065991405770411, + "grad_norm": 0.21528512239456177, + "learning_rate": 5.136673370375439e-05, + "loss": 1.7227, + "step": 16505 + }, + { + "epoch": 5.066298342541437, + "grad_norm": 0.2099117785692215, + "learning_rate": 5.1361764994459074e-05, + "loss": 1.7176, + "step": 16506 + }, + { + 
"epoch": 5.066605279312462, + "grad_norm": 0.2140430212020874, + "learning_rate": 5.135679627170599e-05, + "loss": 1.8195, + "step": 16507 + }, + { + "epoch": 5.0669122160834865, + "grad_norm": 0.20253533124923706, + "learning_rate": 5.135182753554424e-05, + "loss": 1.7284, + "step": 16508 + }, + { + "epoch": 5.067219152854512, + "grad_norm": 0.19945639371871948, + "learning_rate": 5.134685878602295e-05, + "loss": 1.6915, + "step": 16509 + }, + { + "epoch": 5.067526089625537, + "grad_norm": 0.20138494670391083, + "learning_rate": 5.1341890023191216e-05, + "loss": 1.7856, + "step": 16510 + }, + { + "epoch": 5.0678330263965625, + "grad_norm": 0.22124232351779938, + "learning_rate": 5.1336921247098136e-05, + "loss": 1.7674, + "step": 16511 + }, + { + "epoch": 5.068139963167588, + "grad_norm": 0.21564216911792755, + "learning_rate": 5.133195245779282e-05, + "loss": 1.6998, + "step": 16512 + }, + { + "epoch": 5.068446899938612, + "grad_norm": 0.21836799383163452, + "learning_rate": 5.1326983655324365e-05, + "loss": 1.7468, + "step": 16513 + }, + { + "epoch": 5.068753836709638, + "grad_norm": 0.2412201464176178, + "learning_rate": 5.132201483974187e-05, + "loss": 1.7433, + "step": 16514 + }, + { + "epoch": 5.069060773480663, + "grad_norm": 0.262054979801178, + "learning_rate": 5.131704601109446e-05, + "loss": 1.8315, + "step": 16515 + }, + { + "epoch": 5.069367710251688, + "grad_norm": 0.21573080122470856, + "learning_rate": 5.1312077169431225e-05, + "loss": 1.7668, + "step": 16516 + }, + { + "epoch": 5.069674647022714, + "grad_norm": 0.21407057344913483, + "learning_rate": 5.130710831480129e-05, + "loss": 1.7486, + "step": 16517 + }, + { + "epoch": 5.069981583793738, + "grad_norm": 0.2128407508134842, + "learning_rate": 5.130213944725373e-05, + "loss": 1.7618, + "step": 16518 + }, + { + "epoch": 5.070288520564763, + "grad_norm": 0.2034141719341278, + "learning_rate": 5.129717056683767e-05, + "loss": 1.726, + "step": 16519 + }, + { + "epoch": 5.070595457335789, + 
"grad_norm": 0.21474458277225494, + "learning_rate": 5.1292201673602205e-05, + "loss": 1.7883, + "step": 16520 + }, + { + "epoch": 5.070902394106814, + "grad_norm": 0.2102673202753067, + "learning_rate": 5.128723276759645e-05, + "loss": 1.7826, + "step": 16521 + }, + { + "epoch": 5.071209330877839, + "grad_norm": 0.21342496573925018, + "learning_rate": 5.1282263848869505e-05, + "loss": 1.7561, + "step": 16522 + }, + { + "epoch": 5.071516267648865, + "grad_norm": 0.21749620139598846, + "learning_rate": 5.1277294917470474e-05, + "loss": 1.7814, + "step": 16523 + }, + { + "epoch": 5.071823204419889, + "grad_norm": 0.20006774365901947, + "learning_rate": 5.1272325973448476e-05, + "loss": 1.6965, + "step": 16524 + }, + { + "epoch": 5.0721301411909145, + "grad_norm": 0.20878590643405914, + "learning_rate": 5.1267357016852593e-05, + "loss": 1.7426, + "step": 16525 + }, + { + "epoch": 5.07243707796194, + "grad_norm": 0.21824820339679718, + "learning_rate": 5.1262388047731946e-05, + "loss": 1.7704, + "step": 16526 + }, + { + "epoch": 5.072744014732965, + "grad_norm": 0.1992526650428772, + "learning_rate": 5.125741906613565e-05, + "loss": 1.7874, + "step": 16527 + }, + { + "epoch": 5.0730509515039905, + "grad_norm": 0.21028028428554535, + "learning_rate": 5.12524500721128e-05, + "loss": 1.7483, + "step": 16528 + }, + { + "epoch": 5.073357888275015, + "grad_norm": 0.21840833127498627, + "learning_rate": 5.12474810657125e-05, + "loss": 1.7763, + "step": 16529 + }, + { + "epoch": 5.07366482504604, + "grad_norm": 0.249269038438797, + "learning_rate": 5.124251204698387e-05, + "loss": 1.7451, + "step": 16530 + }, + { + "epoch": 5.073971761817066, + "grad_norm": 0.2176963835954666, + "learning_rate": 5.1237543015975986e-05, + "loss": 1.7079, + "step": 16531 + }, + { + "epoch": 5.074278698588091, + "grad_norm": 0.20284616947174072, + "learning_rate": 5.1232573972738e-05, + "loss": 1.7235, + "step": 16532 + }, + { + "epoch": 5.074585635359116, + "grad_norm": 0.20140530169010162, + 
"learning_rate": 5.1227604917318984e-05, + "loss": 1.7014, + "step": 16533 + }, + { + "epoch": 5.074892572130141, + "grad_norm": 0.2407023161649704, + "learning_rate": 5.1222635849768066e-05, + "loss": 1.7493, + "step": 16534 + }, + { + "epoch": 5.075199508901166, + "grad_norm": 0.2013770490884781, + "learning_rate": 5.121766677013433e-05, + "loss": 1.7601, + "step": 16535 + }, + { + "epoch": 5.0755064456721914, + "grad_norm": 0.23889221251010895, + "learning_rate": 5.1212697678466916e-05, + "loss": 1.7282, + "step": 16536 + }, + { + "epoch": 5.075813382443217, + "grad_norm": 0.2411198765039444, + "learning_rate": 5.120772857481489e-05, + "loss": 1.8138, + "step": 16537 + }, + { + "epoch": 5.076120319214242, + "grad_norm": 0.24521365761756897, + "learning_rate": 5.12027594592274e-05, + "loss": 1.7659, + "step": 16538 + }, + { + "epoch": 5.0764272559852675, + "grad_norm": 0.2841372787952423, + "learning_rate": 5.119779033175354e-05, + "loss": 1.7973, + "step": 16539 + }, + { + "epoch": 5.076734192756292, + "grad_norm": 0.21796928346157074, + "learning_rate": 5.1192821192442395e-05, + "loss": 1.6985, + "step": 16540 + }, + { + "epoch": 5.077041129527317, + "grad_norm": 0.2244848757982254, + "learning_rate": 5.118785204134311e-05, + "loss": 1.7413, + "step": 16541 + }, + { + "epoch": 5.077348066298343, + "grad_norm": 0.22581063210964203, + "learning_rate": 5.1182882878504766e-05, + "loss": 1.7706, + "step": 16542 + }, + { + "epoch": 5.077655003069368, + "grad_norm": 0.24478016793727875, + "learning_rate": 5.117791370397647e-05, + "loss": 1.7628, + "step": 16543 + }, + { + "epoch": 5.077961939840393, + "grad_norm": 0.31270188093185425, + "learning_rate": 5.117294451780734e-05, + "loss": 1.8254, + "step": 16544 + }, + { + "epoch": 5.078268876611418, + "grad_norm": 0.3547368049621582, + "learning_rate": 5.11679753200465e-05, + "loss": 1.781, + "step": 16545 + }, + { + "epoch": 5.078575813382443, + "grad_norm": 0.24920180439949036, + "learning_rate": 
5.116300611074304e-05, + "loss": 1.7748, + "step": 16546 + }, + { + "epoch": 5.078882750153468, + "grad_norm": 0.2368776649236679, + "learning_rate": 5.115803688994607e-05, + "loss": 1.7459, + "step": 16547 + }, + { + "epoch": 5.079189686924494, + "grad_norm": 0.28341975808143616, + "learning_rate": 5.115306765770471e-05, + "loss": 1.6694, + "step": 16548 + }, + { + "epoch": 5.079496623695519, + "grad_norm": 0.2521432936191559, + "learning_rate": 5.114809841406804e-05, + "loss": 1.7544, + "step": 16549 + }, + { + "epoch": 5.0798035604665435, + "grad_norm": 0.21199844777584076, + "learning_rate": 5.11431291590852e-05, + "loss": 1.7215, + "step": 16550 + }, + { + "epoch": 5.080110497237569, + "grad_norm": 0.25157347321510315, + "learning_rate": 5.113815989280528e-05, + "loss": 1.8021, + "step": 16551 + }, + { + "epoch": 5.080417434008594, + "grad_norm": 0.2284129559993744, + "learning_rate": 5.1133190615277414e-05, + "loss": 1.7125, + "step": 16552 + }, + { + "epoch": 5.0807243707796195, + "grad_norm": 0.2297726720571518, + "learning_rate": 5.11282213265507e-05, + "loss": 1.7602, + "step": 16553 + }, + { + "epoch": 5.081031307550645, + "grad_norm": 0.22392617166042328, + "learning_rate": 5.112325202667421e-05, + "loss": 1.7251, + "step": 16554 + }, + { + "epoch": 5.08133824432167, + "grad_norm": 0.22406147420406342, + "learning_rate": 5.11182827156971e-05, + "loss": 1.7232, + "step": 16555 + }, + { + "epoch": 5.081645181092695, + "grad_norm": 0.2547284960746765, + "learning_rate": 5.111331339366846e-05, + "loss": 1.7335, + "step": 16556 + }, + { + "epoch": 5.08195211786372, + "grad_norm": 0.216146782040596, + "learning_rate": 5.1108344060637415e-05, + "loss": 1.7469, + "step": 16557 + }, + { + "epoch": 5.082259054634745, + "grad_norm": 0.1926967352628708, + "learning_rate": 5.110337471665306e-05, + "loss": 1.7492, + "step": 16558 + }, + { + "epoch": 5.082565991405771, + "grad_norm": 0.30311331152915955, + "learning_rate": 5.109840536176451e-05, + "loss": 1.8129, + 
"step": 16559 + }, + { + "epoch": 5.082872928176796, + "grad_norm": 0.24273787438869476, + "learning_rate": 5.109343599602087e-05, + "loss": 1.7206, + "step": 16560 + }, + { + "epoch": 5.08317986494782, + "grad_norm": 0.22736592590808868, + "learning_rate": 5.1088466619471255e-05, + "loss": 1.732, + "step": 16561 + }, + { + "epoch": 5.083486801718846, + "grad_norm": 0.21457640826702118, + "learning_rate": 5.1083497232164777e-05, + "loss": 1.726, + "step": 16562 + }, + { + "epoch": 5.083793738489871, + "grad_norm": 0.20968590676784515, + "learning_rate": 5.107852783415055e-05, + "loss": 1.8095, + "step": 16563 + }, + { + "epoch": 5.084100675260896, + "grad_norm": 0.2846728265285492, + "learning_rate": 5.107355842547768e-05, + "loss": 1.7524, + "step": 16564 + }, + { + "epoch": 5.084407612031922, + "grad_norm": 0.21162885427474976, + "learning_rate": 5.106858900619526e-05, + "loss": 1.753, + "step": 16565 + }, + { + "epoch": 5.084714548802946, + "grad_norm": 0.24349012970924377, + "learning_rate": 5.106361957635242e-05, + "loss": 1.7003, + "step": 16566 + }, + { + "epoch": 5.0850214855739715, + "grad_norm": 0.24532537162303925, + "learning_rate": 5.105865013599828e-05, + "loss": 1.7818, + "step": 16567 + }, + { + "epoch": 5.085328422344997, + "grad_norm": 0.22788558900356293, + "learning_rate": 5.1053680685181926e-05, + "loss": 1.7291, + "step": 16568 + }, + { + "epoch": 5.085635359116022, + "grad_norm": 0.22402508556842804, + "learning_rate": 5.10487112239525e-05, + "loss": 1.8292, + "step": 16569 + }, + { + "epoch": 5.0859422958870475, + "grad_norm": 0.2396162748336792, + "learning_rate": 5.1043741752359085e-05, + "loss": 1.7441, + "step": 16570 + }, + { + "epoch": 5.086249232658073, + "grad_norm": 0.22364887595176697, + "learning_rate": 5.1038772270450796e-05, + "loss": 1.7356, + "step": 16571 + }, + { + "epoch": 5.086556169429097, + "grad_norm": 0.20385414361953735, + "learning_rate": 5.103380277827676e-05, + "loss": 1.774, + "step": 16572 + }, + { + "epoch": 
5.086863106200123, + "grad_norm": 0.2050715535879135, + "learning_rate": 5.102883327588608e-05, + "loss": 1.7217, + "step": 16573 + }, + { + "epoch": 5.087170042971148, + "grad_norm": 0.23750410974025726, + "learning_rate": 5.102386376332786e-05, + "loss": 1.7605, + "step": 16574 + }, + { + "epoch": 5.087476979742173, + "grad_norm": 0.24313338100910187, + "learning_rate": 5.101889424065122e-05, + "loss": 1.7498, + "step": 16575 + }, + { + "epoch": 5.087783916513199, + "grad_norm": 0.22145850956439972, + "learning_rate": 5.101392470790527e-05, + "loss": 1.7827, + "step": 16576 + }, + { + "epoch": 5.088090853284223, + "grad_norm": 0.23073779046535492, + "learning_rate": 5.100895516513912e-05, + "loss": 1.7722, + "step": 16577 + }, + { + "epoch": 5.088397790055248, + "grad_norm": 0.2112295925617218, + "learning_rate": 5.100398561240188e-05, + "loss": 1.7755, + "step": 16578 + }, + { + "epoch": 5.088704726826274, + "grad_norm": 0.23263800144195557, + "learning_rate": 5.0999016049742675e-05, + "loss": 1.7593, + "step": 16579 + }, + { + "epoch": 5.089011663597299, + "grad_norm": 0.23011381924152374, + "learning_rate": 5.09940464772106e-05, + "loss": 1.704, + "step": 16580 + }, + { + "epoch": 5.089318600368324, + "grad_norm": 0.1930779367685318, + "learning_rate": 5.0989076894854785e-05, + "loss": 1.7038, + "step": 16581 + }, + { + "epoch": 5.08962553713935, + "grad_norm": 0.2100505381822586, + "learning_rate": 5.098410730272433e-05, + "loss": 1.7671, + "step": 16582 + }, + { + "epoch": 5.089932473910374, + "grad_norm": 0.1919277459383011, + "learning_rate": 5.097913770086833e-05, + "loss": 1.651, + "step": 16583 + }, + { + "epoch": 5.0902394106813995, + "grad_norm": 0.23310615122318268, + "learning_rate": 5.097416808933594e-05, + "loss": 1.8294, + "step": 16584 + }, + { + "epoch": 5.090546347452425, + "grad_norm": 0.26191771030426025, + "learning_rate": 5.096919846817624e-05, + "loss": 1.7522, + "step": 16585 + }, + { + "epoch": 5.09085328422345, + "grad_norm": 
0.2508419156074524, + "learning_rate": 5.096422883743835e-05, + "loss": 1.8025, + "step": 16586 + }, + { + "epoch": 5.0911602209944755, + "grad_norm": 0.23192499577999115, + "learning_rate": 5.0959259197171414e-05, + "loss": 1.7885, + "step": 16587 + }, + { + "epoch": 5.0914671577655, + "grad_norm": 0.2164602279663086, + "learning_rate": 5.095428954742448e-05, + "loss": 1.7299, + "step": 16588 + }, + { + "epoch": 5.091774094536525, + "grad_norm": 0.21431668102741241, + "learning_rate": 5.094931988824671e-05, + "loss": 1.7122, + "step": 16589 + }, + { + "epoch": 5.092081031307551, + "grad_norm": 0.20563583076000214, + "learning_rate": 5.094435021968722e-05, + "loss": 1.7118, + "step": 16590 + }, + { + "epoch": 5.092387968078576, + "grad_norm": 0.20916326344013214, + "learning_rate": 5.093938054179509e-05, + "loss": 1.7639, + "step": 16591 + }, + { + "epoch": 5.092694904849601, + "grad_norm": 0.21197481453418732, + "learning_rate": 5.0934410854619454e-05, + "loss": 1.7357, + "step": 16592 + }, + { + "epoch": 5.093001841620626, + "grad_norm": 0.21085995435714722, + "learning_rate": 5.092944115820942e-05, + "loss": 1.6921, + "step": 16593 + }, + { + "epoch": 5.093308778391651, + "grad_norm": 0.2608145773410797, + "learning_rate": 5.09244714526141e-05, + "loss": 1.7541, + "step": 16594 + }, + { + "epoch": 5.093615715162676, + "grad_norm": 0.2138587087392807, + "learning_rate": 5.0919501737882624e-05, + "loss": 1.727, + "step": 16595 + }, + { + "epoch": 5.093922651933702, + "grad_norm": 0.230251282453537, + "learning_rate": 5.0914532014064084e-05, + "loss": 1.7828, + "step": 16596 + }, + { + "epoch": 5.094229588704727, + "grad_norm": 0.2162851244211197, + "learning_rate": 5.0909562281207614e-05, + "loss": 1.6905, + "step": 16597 + }, + { + "epoch": 5.094536525475752, + "grad_norm": 0.20637664198875427, + "learning_rate": 5.090459253936231e-05, + "loss": 1.7484, + "step": 16598 + }, + { + "epoch": 5.094843462246777, + "grad_norm": 0.19427815079689026, + "learning_rate": 
5.089962278857728e-05, + "loss": 1.7379, + "step": 16599 + }, + { + "epoch": 5.095150399017802, + "grad_norm": 0.1877593845129013, + "learning_rate": 5.089465302890165e-05, + "loss": 1.7017, + "step": 16600 + }, + { + "epoch": 5.0954573357888275, + "grad_norm": 0.19219037890434265, + "learning_rate": 5.0889683260384543e-05, + "loss": 1.7379, + "step": 16601 + }, + { + "epoch": 5.095764272559853, + "grad_norm": 0.19855685532093048, + "learning_rate": 5.088471348307507e-05, + "loss": 1.7171, + "step": 16602 + }, + { + "epoch": 5.096071209330878, + "grad_norm": 0.19119660556316376, + "learning_rate": 5.087974369702235e-05, + "loss": 1.6912, + "step": 16603 + }, + { + "epoch": 5.096378146101903, + "grad_norm": 0.2102670818567276, + "learning_rate": 5.0874773902275476e-05, + "loss": 1.6825, + "step": 16604 + }, + { + "epoch": 5.096685082872928, + "grad_norm": 0.2120765596628189, + "learning_rate": 5.0869804098883564e-05, + "loss": 1.7055, + "step": 16605 + }, + { + "epoch": 5.096992019643953, + "grad_norm": 0.25874772667884827, + "learning_rate": 5.0864834286895745e-05, + "loss": 1.7193, + "step": 16606 + }, + { + "epoch": 5.097298956414979, + "grad_norm": 0.20822012424468994, + "learning_rate": 5.085986446636113e-05, + "loss": 1.6748, + "step": 16607 + }, + { + "epoch": 5.097605893186004, + "grad_norm": 0.21364718675613403, + "learning_rate": 5.085489463732883e-05, + "loss": 1.7762, + "step": 16608 + }, + { + "epoch": 5.097912829957028, + "grad_norm": 0.21961788833141327, + "learning_rate": 5.084992479984796e-05, + "loss": 1.7243, + "step": 16609 + }, + { + "epoch": 5.098219766728054, + "grad_norm": 0.22056026756763458, + "learning_rate": 5.0844954953967624e-05, + "loss": 1.6983, + "step": 16610 + }, + { + "epoch": 5.098526703499079, + "grad_norm": 0.21347738802433014, + "learning_rate": 5.083998509973695e-05, + "loss": 1.7319, + "step": 16611 + }, + { + "epoch": 5.098833640270104, + "grad_norm": 0.23593664169311523, + "learning_rate": 5.083501523720506e-05, + "loss": 
1.7121, + "step": 16612 + }, + { + "epoch": 5.09914057704113, + "grad_norm": 0.2088623344898224, + "learning_rate": 5.0830045366421055e-05, + "loss": 1.72, + "step": 16613 + }, + { + "epoch": 5.099447513812155, + "grad_norm": 0.2293832004070282, + "learning_rate": 5.082507548743406e-05, + "loss": 1.7548, + "step": 16614 + }, + { + "epoch": 5.0997544505831796, + "grad_norm": 0.2509057819843292, + "learning_rate": 5.082010560029319e-05, + "loss": 1.7729, + "step": 16615 + }, + { + "epoch": 5.100061387354205, + "grad_norm": 0.1925390362739563, + "learning_rate": 5.081513570504755e-05, + "loss": 1.7109, + "step": 16616 + }, + { + "epoch": 5.10036832412523, + "grad_norm": 0.20876559615135193, + "learning_rate": 5.081016580174626e-05, + "loss": 1.7031, + "step": 16617 + }, + { + "epoch": 5.100675260896256, + "grad_norm": 0.2038683146238327, + "learning_rate": 5.080519589043842e-05, + "loss": 1.7489, + "step": 16618 + }, + { + "epoch": 5.100982197667281, + "grad_norm": 0.25018224120140076, + "learning_rate": 5.080022597117318e-05, + "loss": 1.7884, + "step": 16619 + }, + { + "epoch": 5.101289134438305, + "grad_norm": 0.24430342018604279, + "learning_rate": 5.079525604399965e-05, + "loss": 1.7558, + "step": 16620 + }, + { + "epoch": 5.101596071209331, + "grad_norm": 0.22151432931423187, + "learning_rate": 5.079028610896692e-05, + "loss": 1.7543, + "step": 16621 + }, + { + "epoch": 5.101903007980356, + "grad_norm": 0.2313055694103241, + "learning_rate": 5.0785316166124107e-05, + "loss": 1.7755, + "step": 16622 + }, + { + "epoch": 5.102209944751381, + "grad_norm": 0.27405816316604614, + "learning_rate": 5.0780346215520355e-05, + "loss": 1.7006, + "step": 16623 + }, + { + "epoch": 5.102516881522407, + "grad_norm": 0.2209920734167099, + "learning_rate": 5.077537625720476e-05, + "loss": 1.6877, + "step": 16624 + }, + { + "epoch": 5.102823818293431, + "grad_norm": 0.20993784070014954, + "learning_rate": 5.077040629122645e-05, + "loss": 1.7558, + "step": 16625 + }, + { + "epoch": 
5.1031307550644565, + "grad_norm": 0.25554344058036804, + "learning_rate": 5.076543631763453e-05, + "loss": 1.7142, + "step": 16626 + }, + { + "epoch": 5.103437691835482, + "grad_norm": 0.28980588912963867, + "learning_rate": 5.0760466336478116e-05, + "loss": 1.7632, + "step": 16627 + }, + { + "epoch": 5.103744628606507, + "grad_norm": 0.20144744217395782, + "learning_rate": 5.075549634780633e-05, + "loss": 1.7472, + "step": 16628 + }, + { + "epoch": 5.1040515653775325, + "grad_norm": 0.30335596203804016, + "learning_rate": 5.075052635166827e-05, + "loss": 1.7283, + "step": 16629 + }, + { + "epoch": 5.104358502148558, + "grad_norm": 0.3014097213745117, + "learning_rate": 5.074555634811309e-05, + "loss": 1.7273, + "step": 16630 + }, + { + "epoch": 5.104665438919582, + "grad_norm": 0.20123563706874847, + "learning_rate": 5.074058633718988e-05, + "loss": 1.7119, + "step": 16631 + }, + { + "epoch": 5.104972375690608, + "grad_norm": 0.3375137746334076, + "learning_rate": 5.073561631894776e-05, + "loss": 1.7594, + "step": 16632 + }, + { + "epoch": 5.105279312461633, + "grad_norm": 0.3471776247024536, + "learning_rate": 5.0730646293435846e-05, + "loss": 1.729, + "step": 16633 + }, + { + "epoch": 5.105586249232658, + "grad_norm": 0.26405471563339233, + "learning_rate": 5.072567626070327e-05, + "loss": 1.7472, + "step": 16634 + }, + { + "epoch": 5.105893186003684, + "grad_norm": 0.2339334636926651, + "learning_rate": 5.072070622079911e-05, + "loss": 1.7285, + "step": 16635 + }, + { + "epoch": 5.106200122774708, + "grad_norm": 0.26267752051353455, + "learning_rate": 5.0715736173772534e-05, + "loss": 1.7171, + "step": 16636 + }, + { + "epoch": 5.106507059545733, + "grad_norm": 0.22254765033721924, + "learning_rate": 5.0710766119672626e-05, + "loss": 1.7702, + "step": 16637 + }, + { + "epoch": 5.106813996316759, + "grad_norm": 0.2457888424396515, + "learning_rate": 5.070579605854852e-05, + "loss": 1.7987, + "step": 16638 + }, + { + "epoch": 5.107120933087784, + "grad_norm": 
0.24500930309295654, + "learning_rate": 5.070082599044931e-05, + "loss": 1.8103, + "step": 16639 + }, + { + "epoch": 5.107427869858809, + "grad_norm": 0.24446405470371246, + "learning_rate": 5.0695855915424116e-05, + "loss": 1.7058, + "step": 16640 + }, + { + "epoch": 5.107734806629834, + "grad_norm": 0.22352534532546997, + "learning_rate": 5.0690885833522086e-05, + "loss": 1.7503, + "step": 16641 + }, + { + "epoch": 5.108041743400859, + "grad_norm": 0.2308795005083084, + "learning_rate": 5.068591574479231e-05, + "loss": 1.8064, + "step": 16642 + }, + { + "epoch": 5.1083486801718845, + "grad_norm": 0.23804180324077606, + "learning_rate": 5.068094564928392e-05, + "loss": 1.7603, + "step": 16643 + }, + { + "epoch": 5.10865561694291, + "grad_norm": 0.1956508308649063, + "learning_rate": 5.0675975547046016e-05, + "loss": 1.7448, + "step": 16644 + }, + { + "epoch": 5.108962553713935, + "grad_norm": 0.24438725411891937, + "learning_rate": 5.067100543812773e-05, + "loss": 1.7706, + "step": 16645 + }, + { + "epoch": 5.1092694904849605, + "grad_norm": 0.26129621267318726, + "learning_rate": 5.066603532257817e-05, + "loss": 1.7321, + "step": 16646 + }, + { + "epoch": 5.109576427255985, + "grad_norm": 0.2024240493774414, + "learning_rate": 5.066106520044646e-05, + "loss": 1.7033, + "step": 16647 + }, + { + "epoch": 5.10988336402701, + "grad_norm": 0.2096802294254303, + "learning_rate": 5.0656095071781716e-05, + "loss": 1.716, + "step": 16648 + }, + { + "epoch": 5.110190300798036, + "grad_norm": 0.20643317699432373, + "learning_rate": 5.0651124936633054e-05, + "loss": 1.7473, + "step": 16649 + }, + { + "epoch": 5.110497237569061, + "grad_norm": 0.2268853783607483, + "learning_rate": 5.0646154795049604e-05, + "loss": 1.7844, + "step": 16650 + }, + { + "epoch": 5.110804174340086, + "grad_norm": 0.20215095579624176, + "learning_rate": 5.064118464708046e-05, + "loss": 1.7138, + "step": 16651 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.19411569833755493, + 
"learning_rate": 5.063621449277476e-05, + "loss": 1.7526, + "step": 16652 + }, + { + "epoch": 5.111418047882136, + "grad_norm": 0.20199783146381378, + "learning_rate": 5.063124433218161e-05, + "loss": 1.806, + "step": 16653 + }, + { + "epoch": 5.111724984653161, + "grad_norm": 0.23351836204528809, + "learning_rate": 5.0626274165350165e-05, + "loss": 1.7529, + "step": 16654 + }, + { + "epoch": 5.112031921424187, + "grad_norm": 0.21098989248275757, + "learning_rate": 5.062130399232948e-05, + "loss": 1.7647, + "step": 16655 + }, + { + "epoch": 5.112338858195212, + "grad_norm": 0.21959169209003448, + "learning_rate": 5.0616333813168714e-05, + "loss": 1.7462, + "step": 16656 + }, + { + "epoch": 5.112645794966237, + "grad_norm": 0.21173696219921112, + "learning_rate": 5.061136362791696e-05, + "loss": 1.7413, + "step": 16657 + }, + { + "epoch": 5.112952731737262, + "grad_norm": 0.22357577085494995, + "learning_rate": 5.0606393436623365e-05, + "loss": 1.7163, + "step": 16658 + }, + { + "epoch": 5.113259668508287, + "grad_norm": 0.24364936351776123, + "learning_rate": 5.060142323933704e-05, + "loss": 1.8139, + "step": 16659 + }, + { + "epoch": 5.1135666052793125, + "grad_norm": 0.21646073460578918, + "learning_rate": 5.05964530361071e-05, + "loss": 1.741, + "step": 16660 + }, + { + "epoch": 5.113873542050338, + "grad_norm": 0.24261775612831116, + "learning_rate": 5.059148282698265e-05, + "loss": 1.7162, + "step": 16661 + }, + { + "epoch": 5.114180478821363, + "grad_norm": 0.22883281111717224, + "learning_rate": 5.058651261201283e-05, + "loss": 1.7342, + "step": 16662 + }, + { + "epoch": 5.114487415592388, + "grad_norm": 0.2616727352142334, + "learning_rate": 5.058154239124674e-05, + "loss": 1.8054, + "step": 16663 + }, + { + "epoch": 5.114794352363413, + "grad_norm": 0.21293358504772186, + "learning_rate": 5.0576572164733505e-05, + "loss": 1.742, + "step": 16664 + }, + { + "epoch": 5.115101289134438, + "grad_norm": 0.20037685334682465, + "learning_rate": 
5.057160193252225e-05, + "loss": 1.7518, + "step": 16665 + }, + { + "epoch": 5.115408225905464, + "grad_norm": 0.19102689623832703, + "learning_rate": 5.056663169466209e-05, + "loss": 1.6892, + "step": 16666 + }, + { + "epoch": 5.115715162676489, + "grad_norm": 0.22261591255664825, + "learning_rate": 5.056166145120216e-05, + "loss": 1.7744, + "step": 16667 + }, + { + "epoch": 5.116022099447513, + "grad_norm": 0.23966702818870544, + "learning_rate": 5.055669120219154e-05, + "loss": 1.7786, + "step": 16668 + }, + { + "epoch": 5.116329036218539, + "grad_norm": 0.22008271515369415, + "learning_rate": 5.055172094767937e-05, + "loss": 1.7501, + "step": 16669 + }, + { + "epoch": 5.116635972989564, + "grad_norm": 0.21643415093421936, + "learning_rate": 5.054675068771478e-05, + "loss": 1.7548, + "step": 16670 + }, + { + "epoch": 5.116942909760589, + "grad_norm": 0.24661116302013397, + "learning_rate": 5.0541780422346894e-05, + "loss": 1.8117, + "step": 16671 + }, + { + "epoch": 5.117249846531615, + "grad_norm": 0.21393093466758728, + "learning_rate": 5.05368101516248e-05, + "loss": 1.7341, + "step": 16672 + }, + { + "epoch": 5.11755678330264, + "grad_norm": 0.30949896574020386, + "learning_rate": 5.053183987559763e-05, + "loss": 1.7703, + "step": 16673 + }, + { + "epoch": 5.1178637200736645, + "grad_norm": 0.22236786782741547, + "learning_rate": 5.052686959431451e-05, + "loss": 1.719, + "step": 16674 + }, + { + "epoch": 5.11817065684469, + "grad_norm": 0.26826921105384827, + "learning_rate": 5.052189930782455e-05, + "loss": 1.741, + "step": 16675 + }, + { + "epoch": 5.118477593615715, + "grad_norm": 0.2608947455883026, + "learning_rate": 5.051692901617688e-05, + "loss": 1.7062, + "step": 16676 + }, + { + "epoch": 5.1187845303867405, + "grad_norm": 0.20709002017974854, + "learning_rate": 5.051195871942063e-05, + "loss": 1.703, + "step": 16677 + }, + { + "epoch": 5.119091467157766, + "grad_norm": 0.18957734107971191, + "learning_rate": 5.0506988417604885e-05, + "loss": 1.762, 
+ "step": 16678 + }, + { + "epoch": 5.11939840392879, + "grad_norm": 0.21578781306743622, + "learning_rate": 5.050201811077879e-05, + "loss": 1.7167, + "step": 16679 + }, + { + "epoch": 5.119705340699816, + "grad_norm": 0.2253631353378296, + "learning_rate": 5.049704779899145e-05, + "loss": 1.7374, + "step": 16680 + }, + { + "epoch": 5.120012277470841, + "grad_norm": 0.1977664828300476, + "learning_rate": 5.049207748229199e-05, + "loss": 1.7399, + "step": 16681 + }, + { + "epoch": 5.120319214241866, + "grad_norm": 0.2964428663253784, + "learning_rate": 5.048710716072954e-05, + "loss": 1.8359, + "step": 16682 + }, + { + "epoch": 5.120626151012892, + "grad_norm": 0.24788637459278107, + "learning_rate": 5.0482136834353224e-05, + "loss": 1.7593, + "step": 16683 + }, + { + "epoch": 5.120933087783916, + "grad_norm": 0.21537743508815765, + "learning_rate": 5.0477166503212135e-05, + "loss": 1.7472, + "step": 16684 + }, + { + "epoch": 5.121240024554941, + "grad_norm": 0.2055196613073349, + "learning_rate": 5.047219616735541e-05, + "loss": 1.7106, + "step": 16685 + }, + { + "epoch": 5.121546961325967, + "grad_norm": 0.19770687818527222, + "learning_rate": 5.046722582683215e-05, + "loss": 1.6887, + "step": 16686 + }, + { + "epoch": 5.121853898096992, + "grad_norm": 0.20407389104366302, + "learning_rate": 5.046225548169151e-05, + "loss": 1.7412, + "step": 16687 + }, + { + "epoch": 5.122160834868017, + "grad_norm": 0.20153474807739258, + "learning_rate": 5.045728513198259e-05, + "loss": 1.7643, + "step": 16688 + }, + { + "epoch": 5.122467771639043, + "grad_norm": 0.18737752735614777, + "learning_rate": 5.045231477775452e-05, + "loss": 1.763, + "step": 16689 + }, + { + "epoch": 5.122774708410067, + "grad_norm": 0.19790658354759216, + "learning_rate": 5.0447344419056385e-05, + "loss": 1.7446, + "step": 16690 + }, + { + "epoch": 5.1230816451810925, + "grad_norm": 0.21496973931789398, + "learning_rate": 5.0442374055937336e-05, + "loss": 1.7756, + "step": 16691 + }, + { + "epoch": 
5.123388581952118, + "grad_norm": 0.19318655133247375, + "learning_rate": 5.043740368844649e-05, + "loss": 1.7687, + "step": 16692 + }, + { + "epoch": 5.123695518723143, + "grad_norm": 0.2237338423728943, + "learning_rate": 5.0432433316632976e-05, + "loss": 1.7258, + "step": 16693 + }, + { + "epoch": 5.1240024554941686, + "grad_norm": 0.2257162630558014, + "learning_rate": 5.042746294054589e-05, + "loss": 1.7462, + "step": 16694 + }, + { + "epoch": 5.124309392265193, + "grad_norm": 0.25666359066963196, + "learning_rate": 5.0422492560234366e-05, + "loss": 1.7318, + "step": 16695 + }, + { + "epoch": 5.124616329036218, + "grad_norm": 0.2615324556827545, + "learning_rate": 5.0417522175747536e-05, + "loss": 1.7533, + "step": 16696 + }, + { + "epoch": 5.124923265807244, + "grad_norm": 0.2372874766588211, + "learning_rate": 5.0412551787134475e-05, + "loss": 1.7361, + "step": 16697 + }, + { + "epoch": 5.125230202578269, + "grad_norm": 0.25976815819740295, + "learning_rate": 5.040758139444436e-05, + "loss": 1.7542, + "step": 16698 + }, + { + "epoch": 5.125537139349294, + "grad_norm": 0.36173003911972046, + "learning_rate": 5.040261099772629e-05, + "loss": 1.7421, + "step": 16699 + }, + { + "epoch": 5.12584407612032, + "grad_norm": 0.2767728269100189, + "learning_rate": 5.039764059702937e-05, + "loss": 1.7341, + "step": 16700 + }, + { + "epoch": 5.126151012891344, + "grad_norm": 0.20185241103172302, + "learning_rate": 5.039267019240275e-05, + "loss": 1.7068, + "step": 16701 + }, + { + "epoch": 5.1264579496623695, + "grad_norm": 0.26872581243515015, + "learning_rate": 5.0387699783895514e-05, + "loss": 1.7404, + "step": 16702 + }, + { + "epoch": 5.126764886433395, + "grad_norm": 0.2867858111858368, + "learning_rate": 5.038272937155682e-05, + "loss": 1.7702, + "step": 16703 + }, + { + "epoch": 5.12707182320442, + "grad_norm": 0.20939521491527557, + "learning_rate": 5.037775895543574e-05, + "loss": 1.7653, + "step": 16704 + }, + { + "epoch": 5.1273787599754455, + "grad_norm": 
0.2674047648906708, + "learning_rate": 5.037278853558146e-05, + "loss": 1.701, + "step": 16705 + }, + { + "epoch": 5.12768569674647, + "grad_norm": 0.20776906609535217, + "learning_rate": 5.036781811204304e-05, + "loss": 1.7476, + "step": 16706 + }, + { + "epoch": 5.127992633517495, + "grad_norm": 0.2695952355861664, + "learning_rate": 5.036284768486964e-05, + "loss": 1.7206, + "step": 16707 + }, + { + "epoch": 5.128299570288521, + "grad_norm": 0.30661383271217346, + "learning_rate": 5.0357877254110363e-05, + "loss": 1.72, + "step": 16708 + }, + { + "epoch": 5.128606507059546, + "grad_norm": 0.2527785003185272, + "learning_rate": 5.0352906819814316e-05, + "loss": 1.6936, + "step": 16709 + }, + { + "epoch": 5.128913443830571, + "grad_norm": 0.23000696301460266, + "learning_rate": 5.034793638203066e-05, + "loss": 1.7634, + "step": 16710 + }, + { + "epoch": 5.129220380601596, + "grad_norm": 0.33594760298728943, + "learning_rate": 5.0342965940808486e-05, + "loss": 1.6952, + "step": 16711 + }, + { + "epoch": 5.129527317372621, + "grad_norm": 0.22834168374538422, + "learning_rate": 5.033799549619692e-05, + "loss": 1.7537, + "step": 16712 + }, + { + "epoch": 5.129834254143646, + "grad_norm": 0.26585114002227783, + "learning_rate": 5.033302504824509e-05, + "loss": 1.7554, + "step": 16713 + }, + { + "epoch": 5.130141190914672, + "grad_norm": 0.25632211565971375, + "learning_rate": 5.032805459700211e-05, + "loss": 1.8141, + "step": 16714 + }, + { + "epoch": 5.130448127685697, + "grad_norm": 0.256523996591568, + "learning_rate": 5.0323084142517084e-05, + "loss": 1.777, + "step": 16715 + }, + { + "epoch": 5.1307550644567215, + "grad_norm": 0.31409457325935364, + "learning_rate": 5.0318113684839166e-05, + "loss": 1.7414, + "step": 16716 + }, + { + "epoch": 5.131062001227747, + "grad_norm": 0.21156816184520721, + "learning_rate": 5.0313143224017455e-05, + "loss": 1.7397, + "step": 16717 + }, + { + "epoch": 5.131368937998772, + "grad_norm": 0.23596547544002533, + "learning_rate": 
5.030817276010109e-05, + "loss": 1.752, + "step": 16718 + }, + { + "epoch": 5.1316758747697975, + "grad_norm": 0.2587638199329376, + "learning_rate": 5.0303202293139186e-05, + "loss": 1.7645, + "step": 16719 + }, + { + "epoch": 5.131982811540823, + "grad_norm": 0.2006666213274002, + "learning_rate": 5.029823182318084e-05, + "loss": 1.7009, + "step": 16720 + }, + { + "epoch": 5.132289748311848, + "grad_norm": 0.3075694739818573, + "learning_rate": 5.029326135027521e-05, + "loss": 1.749, + "step": 16721 + }, + { + "epoch": 5.132596685082873, + "grad_norm": 0.3116205334663391, + "learning_rate": 5.028829087447139e-05, + "loss": 1.7458, + "step": 16722 + }, + { + "epoch": 5.132903621853898, + "grad_norm": 0.17925913631916046, + "learning_rate": 5.028332039581851e-05, + "loss": 1.6502, + "step": 16723 + }, + { + "epoch": 5.133210558624923, + "grad_norm": 0.21779952943325043, + "learning_rate": 5.0278349914365694e-05, + "loss": 1.7656, + "step": 16724 + }, + { + "epoch": 5.133517495395949, + "grad_norm": 0.20085318386554718, + "learning_rate": 5.027337943016207e-05, + "loss": 1.7662, + "step": 16725 + }, + { + "epoch": 5.133824432166974, + "grad_norm": 0.19975553452968597, + "learning_rate": 5.026840894325673e-05, + "loss": 1.7392, + "step": 16726 + }, + { + "epoch": 5.134131368937998, + "grad_norm": 0.20610745251178741, + "learning_rate": 5.026343845369883e-05, + "loss": 1.7221, + "step": 16727 + }, + { + "epoch": 5.134438305709024, + "grad_norm": 0.21451768279075623, + "learning_rate": 5.025846796153747e-05, + "loss": 1.8381, + "step": 16728 + }, + { + "epoch": 5.134745242480049, + "grad_norm": 0.19518613815307617, + "learning_rate": 5.0253497466821786e-05, + "loss": 1.7483, + "step": 16729 + }, + { + "epoch": 5.135052179251074, + "grad_norm": 0.24284996092319489, + "learning_rate": 5.024852696960088e-05, + "loss": 1.7895, + "step": 16730 + }, + { + "epoch": 5.1353591160221, + "grad_norm": 0.23962461948394775, + "learning_rate": 5.0243556469923905e-05, + "loss": 
1.8468, + "step": 16731 + }, + { + "epoch": 5.135666052793125, + "grad_norm": 0.20455054938793182, + "learning_rate": 5.023858596783993e-05, + "loss": 1.6973, + "step": 16732 + }, + { + "epoch": 5.1359729895641495, + "grad_norm": 0.20629842579364777, + "learning_rate": 5.023361546339813e-05, + "loss": 1.7608, + "step": 16733 + }, + { + "epoch": 5.136279926335175, + "grad_norm": 0.19375818967819214, + "learning_rate": 5.0228644956647606e-05, + "loss": 1.7327, + "step": 16734 + }, + { + "epoch": 5.1365868631062, + "grad_norm": 0.20960548520088196, + "learning_rate": 5.022367444763748e-05, + "loss": 1.7227, + "step": 16735 + }, + { + "epoch": 5.1368937998772255, + "grad_norm": 0.24732786417007446, + "learning_rate": 5.021870393641687e-05, + "loss": 1.8144, + "step": 16736 + }, + { + "epoch": 5.137200736648251, + "grad_norm": 0.22190099954605103, + "learning_rate": 5.021373342303489e-05, + "loss": 1.705, + "step": 16737 + }, + { + "epoch": 5.137507673419275, + "grad_norm": 0.2091664969921112, + "learning_rate": 5.020876290754069e-05, + "loss": 1.7926, + "step": 16738 + }, + { + "epoch": 5.137814610190301, + "grad_norm": 0.22298938035964966, + "learning_rate": 5.020379238998335e-05, + "loss": 1.7782, + "step": 16739 + }, + { + "epoch": 5.138121546961326, + "grad_norm": 0.20843006670475006, + "learning_rate": 5.019882187041203e-05, + "loss": 1.7245, + "step": 16740 + }, + { + "epoch": 5.138428483732351, + "grad_norm": 0.23383544385433197, + "learning_rate": 5.019385134887583e-05, + "loss": 1.6834, + "step": 16741 + }, + { + "epoch": 5.138735420503377, + "grad_norm": 0.3015683889389038, + "learning_rate": 5.018888082542388e-05, + "loss": 1.7636, + "step": 16742 + }, + { + "epoch": 5.139042357274401, + "grad_norm": 0.2253810614347458, + "learning_rate": 5.0183910300105284e-05, + "loss": 1.7375, + "step": 16743 + }, + { + "epoch": 5.139349294045426, + "grad_norm": 0.2064623087644577, + "learning_rate": 5.01789397729692e-05, + "loss": 1.7683, + "step": 16744 + }, + { + 
"epoch": 5.139656230816452, + "grad_norm": 0.2106693685054779, + "learning_rate": 5.0173969244064724e-05, + "loss": 1.7432, + "step": 16745 + }, + { + "epoch": 5.139963167587477, + "grad_norm": 0.19944638013839722, + "learning_rate": 5.016899871344097e-05, + "loss": 1.701, + "step": 16746 + }, + { + "epoch": 5.140270104358502, + "grad_norm": 0.23210744559764862, + "learning_rate": 5.016402818114708e-05, + "loss": 1.8008, + "step": 16747 + }, + { + "epoch": 5.140577041129528, + "grad_norm": 0.26014089584350586, + "learning_rate": 5.015905764723217e-05, + "loss": 1.7131, + "step": 16748 + }, + { + "epoch": 5.140883977900552, + "grad_norm": 0.25526607036590576, + "learning_rate": 5.015408711174535e-05, + "loss": 1.7525, + "step": 16749 + }, + { + "epoch": 5.1411909146715775, + "grad_norm": 0.2092386782169342, + "learning_rate": 5.0149116574735756e-05, + "loss": 1.7502, + "step": 16750 + }, + { + "epoch": 5.141497851442603, + "grad_norm": 0.21560105681419373, + "learning_rate": 5.01441460362525e-05, + "loss": 1.7903, + "step": 16751 + }, + { + "epoch": 5.141804788213628, + "grad_norm": 0.23538467288017273, + "learning_rate": 5.013917549634471e-05, + "loss": 1.6995, + "step": 16752 + }, + { + "epoch": 5.1421117249846535, + "grad_norm": 0.26545262336730957, + "learning_rate": 5.0134204955061526e-05, + "loss": 1.7511, + "step": 16753 + }, + { + "epoch": 5.142418661755678, + "grad_norm": 0.23030948638916016, + "learning_rate": 5.012923441245203e-05, + "loss": 1.7271, + "step": 16754 + }, + { + "epoch": 5.142725598526703, + "grad_norm": 0.22395408153533936, + "learning_rate": 5.012426386856537e-05, + "loss": 1.7273, + "step": 16755 + }, + { + "epoch": 5.143032535297729, + "grad_norm": 0.21355997025966644, + "learning_rate": 5.011929332345066e-05, + "loss": 1.7347, + "step": 16756 + }, + { + "epoch": 5.143339472068754, + "grad_norm": 0.2355809509754181, + "learning_rate": 5.011432277715702e-05, + "loss": 1.8289, + "step": 16757 + }, + { + "epoch": 5.143646408839779, + 
"grad_norm": 0.24319802224636078, + "learning_rate": 5.0109352229733584e-05, + "loss": 1.7621, + "step": 16758 + }, + { + "epoch": 5.143953345610804, + "grad_norm": 0.2591453492641449, + "learning_rate": 5.010438168122946e-05, + "loss": 1.8043, + "step": 16759 + }, + { + "epoch": 5.144260282381829, + "grad_norm": 0.22595751285552979, + "learning_rate": 5.009941113169376e-05, + "loss": 1.8137, + "step": 16760 + }, + { + "epoch": 5.144567219152854, + "grad_norm": 0.220921128988266, + "learning_rate": 5.009444058117564e-05, + "loss": 1.7105, + "step": 16761 + }, + { + "epoch": 5.14487415592388, + "grad_norm": 0.25713789463043213, + "learning_rate": 5.0089470029724195e-05, + "loss": 1.8184, + "step": 16762 + }, + { + "epoch": 5.145181092694905, + "grad_norm": 0.19849328696727753, + "learning_rate": 5.008449947738856e-05, + "loss": 1.7331, + "step": 16763 + }, + { + "epoch": 5.14548802946593, + "grad_norm": 0.2073405385017395, + "learning_rate": 5.007952892421785e-05, + "loss": 1.7053, + "step": 16764 + }, + { + "epoch": 5.145794966236955, + "grad_norm": 0.22307951748371124, + "learning_rate": 5.007455837026119e-05, + "loss": 1.7724, + "step": 16765 + }, + { + "epoch": 5.14610190300798, + "grad_norm": 0.22160649299621582, + "learning_rate": 5.006958781556769e-05, + "loss": 1.7191, + "step": 16766 + }, + { + "epoch": 5.1464088397790055, + "grad_norm": 0.2202252298593521, + "learning_rate": 5.0064617260186487e-05, + "loss": 1.7339, + "step": 16767 + }, + { + "epoch": 5.146715776550031, + "grad_norm": 0.23693829774856567, + "learning_rate": 5.005964670416671e-05, + "loss": 1.7143, + "step": 16768 + }, + { + "epoch": 5.147022713321056, + "grad_norm": 0.22675764560699463, + "learning_rate": 5.005467614755746e-05, + "loss": 1.7913, + "step": 16769 + }, + { + "epoch": 5.147329650092081, + "grad_norm": 0.21288467943668365, + "learning_rate": 5.0049705590407866e-05, + "loss": 1.7581, + "step": 16770 + }, + { + "epoch": 5.147636586863106, + "grad_norm": 0.216839998960495, + 
"learning_rate": 5.0044735032767064e-05, + "loss": 1.7305, + "step": 16771 + }, + { + "epoch": 5.147943523634131, + "grad_norm": 0.2111063450574875, + "learning_rate": 5.003976447468416e-05, + "loss": 1.7444, + "step": 16772 + }, + { + "epoch": 5.148250460405157, + "grad_norm": 0.2536773085594177, + "learning_rate": 5.003479391620827e-05, + "loss": 1.6952, + "step": 16773 + }, + { + "epoch": 5.148557397176182, + "grad_norm": 0.23585477471351624, + "learning_rate": 5.002982335738854e-05, + "loss": 1.6921, + "step": 16774 + }, + { + "epoch": 5.148864333947207, + "grad_norm": 0.1927027702331543, + "learning_rate": 5.002485279827407e-05, + "loss": 1.7781, + "step": 16775 + }, + { + "epoch": 5.149171270718232, + "grad_norm": 0.22545355558395386, + "learning_rate": 5.001988223891399e-05, + "loss": 1.7582, + "step": 16776 + }, + { + "epoch": 5.149478207489257, + "grad_norm": 0.20837660133838654, + "learning_rate": 5.001491167935741e-05, + "loss": 1.7379, + "step": 16777 + }, + { + "epoch": 5.149785144260282, + "grad_norm": 0.20510734617710114, + "learning_rate": 5.000994111965348e-05, + "loss": 1.7568, + "step": 16778 + }, + { + "epoch": 5.150092081031308, + "grad_norm": 0.2629711329936981, + "learning_rate": 5.00049705598513e-05, + "loss": 1.7613, + "step": 16779 + }, + { + "epoch": 5.150399017802333, + "grad_norm": 0.2390555888414383, + "learning_rate": 5e-05, + "loss": 1.7099, + "step": 16780 + }, + { + "epoch": 5.150705954573358, + "grad_norm": 0.19643893837928772, + "learning_rate": 4.9995029440148715e-05, + "loss": 1.7012, + "step": 16781 + }, + { + "epoch": 5.151012891344383, + "grad_norm": 0.1881607472896576, + "learning_rate": 4.999005888034653e-05, + "loss": 1.705, + "step": 16782 + }, + { + "epoch": 5.151319828115408, + "grad_norm": 0.3219485282897949, + "learning_rate": 4.99850883206426e-05, + "loss": 1.8089, + "step": 16783 + }, + { + "epoch": 5.151626764886434, + "grad_norm": 0.22285562753677368, + "learning_rate": 4.998011776108602e-05, + "loss": 1.7343, + 
"step": 16784 + }, + { + "epoch": 5.151933701657459, + "grad_norm": 0.1981910616159439, + "learning_rate": 4.9975147201725955e-05, + "loss": 1.6939, + "step": 16785 + }, + { + "epoch": 5.152240638428483, + "grad_norm": 0.2338661551475525, + "learning_rate": 4.997017664261148e-05, + "loss": 1.6833, + "step": 16786 + }, + { + "epoch": 5.152547575199509, + "grad_norm": 0.2613268792629242, + "learning_rate": 4.996520608379175e-05, + "loss": 1.7251, + "step": 16787 + }, + { + "epoch": 5.152854511970534, + "grad_norm": 0.26063668727874756, + "learning_rate": 4.996023552531586e-05, + "loss": 1.8444, + "step": 16788 + }, + { + "epoch": 5.153161448741559, + "grad_norm": 0.2711321711540222, + "learning_rate": 4.9955264967232954e-05, + "loss": 1.7257, + "step": 16789 + }, + { + "epoch": 5.153468385512585, + "grad_norm": 0.30134227871894836, + "learning_rate": 4.995029440959213e-05, + "loss": 1.7599, + "step": 16790 + }, + { + "epoch": 5.153775322283609, + "grad_norm": 0.22983741760253906, + "learning_rate": 4.994532385244255e-05, + "loss": 1.7944, + "step": 16791 + }, + { + "epoch": 5.1540822590546345, + "grad_norm": 0.2992973327636719, + "learning_rate": 4.994035329583329e-05, + "loss": 1.7507, + "step": 16792 + }, + { + "epoch": 5.15438919582566, + "grad_norm": 0.2659669518470764, + "learning_rate": 4.993538273981352e-05, + "loss": 1.7246, + "step": 16793 + }, + { + "epoch": 5.154696132596685, + "grad_norm": 0.24235470592975616, + "learning_rate": 4.9930412184432315e-05, + "loss": 1.8378, + "step": 16794 + }, + { + "epoch": 5.1550030693677105, + "grad_norm": 0.30005061626434326, + "learning_rate": 4.992544162973882e-05, + "loss": 1.7526, + "step": 16795 + }, + { + "epoch": 5.155310006138736, + "grad_norm": 0.2183740884065628, + "learning_rate": 4.992047107578215e-05, + "loss": 1.7197, + "step": 16796 + }, + { + "epoch": 5.15561694290976, + "grad_norm": 0.35874706506729126, + "learning_rate": 4.991550052261145e-05, + "loss": 1.8196, + "step": 16797 + }, + { + "epoch": 
5.155923879680786, + "grad_norm": 0.42146921157836914, + "learning_rate": 4.991052997027583e-05, + "loss": 1.7165, + "step": 16798 + }, + { + "epoch": 5.156230816451811, + "grad_norm": 0.2738321125507355, + "learning_rate": 4.990555941882437e-05, + "loss": 1.7042, + "step": 16799 + }, + { + "epoch": 5.156537753222836, + "grad_norm": 0.26304566860198975, + "learning_rate": 4.990058886830625e-05, + "loss": 1.7551, + "step": 16800 + }, + { + "epoch": 5.156844689993862, + "grad_norm": 0.4301520586013794, + "learning_rate": 4.9895618318770556e-05, + "loss": 1.7219, + "step": 16801 + }, + { + "epoch": 5.157151626764886, + "grad_norm": 0.3316499590873718, + "learning_rate": 4.989064777026644e-05, + "loss": 1.8034, + "step": 16802 + }, + { + "epoch": 5.157458563535911, + "grad_norm": 0.30105581879615784, + "learning_rate": 4.9885677222842984e-05, + "loss": 1.7022, + "step": 16803 + }, + { + "epoch": 5.157765500306937, + "grad_norm": 0.3830905854701996, + "learning_rate": 4.988070667654937e-05, + "loss": 1.7898, + "step": 16804 + }, + { + "epoch": 5.158072437077962, + "grad_norm": 0.2204640656709671, + "learning_rate": 4.9875736131434644e-05, + "loss": 1.7081, + "step": 16805 + }, + { + "epoch": 5.158379373848987, + "grad_norm": 0.3620772063732147, + "learning_rate": 4.9870765587547976e-05, + "loss": 1.7345, + "step": 16806 + }, + { + "epoch": 5.158686310620013, + "grad_norm": 0.3268207907676697, + "learning_rate": 4.986579504493848e-05, + "loss": 1.7364, + "step": 16807 + }, + { + "epoch": 5.158993247391037, + "grad_norm": 0.2499808967113495, + "learning_rate": 4.986082450365529e-05, + "loss": 1.7836, + "step": 16808 + }, + { + "epoch": 5.1593001841620625, + "grad_norm": 0.3696226477622986, + "learning_rate": 4.98558539637475e-05, + "loss": 1.8094, + "step": 16809 + }, + { + "epoch": 5.159607120933088, + "grad_norm": 0.3239068388938904, + "learning_rate": 4.9850883425264256e-05, + "loss": 1.7448, + "step": 16810 + }, + { + "epoch": 5.159914057704113, + "grad_norm": 
0.19875772297382355, + "learning_rate": 4.9845912888254655e-05, + "loss": 1.6945, + "step": 16811 + }, + { + "epoch": 5.1602209944751385, + "grad_norm": 0.3952203691005707, + "learning_rate": 4.984094235276784e-05, + "loss": 1.8457, + "step": 16812 + }, + { + "epoch": 5.160527931246163, + "grad_norm": 0.3052334785461426, + "learning_rate": 4.9835971818852916e-05, + "loss": 1.7371, + "step": 16813 + }, + { + "epoch": 5.160834868017188, + "grad_norm": 0.2874486446380615, + "learning_rate": 4.983100128655904e-05, + "loss": 1.7194, + "step": 16814 + }, + { + "epoch": 5.161141804788214, + "grad_norm": 0.39117491245269775, + "learning_rate": 4.98260307559353e-05, + "loss": 1.7919, + "step": 16815 + }, + { + "epoch": 5.161448741559239, + "grad_norm": 0.2532150149345398, + "learning_rate": 4.982106022703081e-05, + "loss": 1.8103, + "step": 16816 + }, + { + "epoch": 5.161755678330264, + "grad_norm": 0.3545167148113251, + "learning_rate": 4.981608969989473e-05, + "loss": 1.8093, + "step": 16817 + }, + { + "epoch": 5.162062615101289, + "grad_norm": 0.397806316614151, + "learning_rate": 4.981111917457613e-05, + "loss": 1.7885, + "step": 16818 + }, + { + "epoch": 5.162369551872314, + "grad_norm": 0.2523536682128906, + "learning_rate": 4.980614865112419e-05, + "loss": 1.797, + "step": 16819 + }, + { + "epoch": 5.162676488643339, + "grad_norm": 0.3666839301586151, + "learning_rate": 4.980117812958798e-05, + "loss": 1.7859, + "step": 16820 + }, + { + "epoch": 5.162983425414365, + "grad_norm": 0.3392138183116913, + "learning_rate": 4.9796207610016664e-05, + "loss": 1.7717, + "step": 16821 + }, + { + "epoch": 5.16329036218539, + "grad_norm": 0.21040666103363037, + "learning_rate": 4.9791237092459325e-05, + "loss": 1.7447, + "step": 16822 + }, + { + "epoch": 5.163597298956415, + "grad_norm": 0.3140225112438202, + "learning_rate": 4.978626657696512e-05, + "loss": 1.7405, + "step": 16823 + }, + { + "epoch": 5.16390423572744, + "grad_norm": 0.23963581025600433, + "learning_rate": 
4.978129606358313e-05, + "loss": 1.7041, + "step": 16824 + }, + { + "epoch": 5.164211172498465, + "grad_norm": 0.32476937770843506, + "learning_rate": 4.977632555236253e-05, + "loss": 1.736, + "step": 16825 + }, + { + "epoch": 5.1645181092694905, + "grad_norm": 0.4362463653087616, + "learning_rate": 4.977135504335239e-05, + "loss": 1.7657, + "step": 16826 + }, + { + "epoch": 5.164825046040516, + "grad_norm": 0.26118260622024536, + "learning_rate": 4.976638453660188e-05, + "loss": 1.7339, + "step": 16827 + }, + { + "epoch": 5.165131982811541, + "grad_norm": 0.27284330129623413, + "learning_rate": 4.9761414032160065e-05, + "loss": 1.8086, + "step": 16828 + }, + { + "epoch": 5.165438919582566, + "grad_norm": 0.2942579388618469, + "learning_rate": 4.975644353007611e-05, + "loss": 1.7869, + "step": 16829 + }, + { + "epoch": 5.165745856353591, + "grad_norm": 0.23257993161678314, + "learning_rate": 4.975147303039912e-05, + "loss": 1.8048, + "step": 16830 + }, + { + "epoch": 5.166052793124616, + "grad_norm": 0.28638842701911926, + "learning_rate": 4.9746502533178225e-05, + "loss": 1.7744, + "step": 16831 + }, + { + "epoch": 5.166359729895642, + "grad_norm": 0.21571335196495056, + "learning_rate": 4.974153203846255e-05, + "loss": 1.7842, + "step": 16832 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.268883615732193, + "learning_rate": 4.9736561546301185e-05, + "loss": 1.7194, + "step": 16833 + }, + { + "epoch": 5.166973603437691, + "grad_norm": 0.22934168577194214, + "learning_rate": 4.9731591056743285e-05, + "loss": 1.757, + "step": 16834 + }, + { + "epoch": 5.167280540208717, + "grad_norm": 0.26321718096733093, + "learning_rate": 4.9726620569837946e-05, + "loss": 1.7675, + "step": 16835 + }, + { + "epoch": 5.167587476979742, + "grad_norm": 0.2893882393836975, + "learning_rate": 4.9721650085634325e-05, + "loss": 1.7134, + "step": 16836 + }, + { + "epoch": 5.167894413750767, + "grad_norm": 0.24130617082118988, + "learning_rate": 4.97166796041815e-05, + "loss": 
1.7119, + "step": 16837 + }, + { + "epoch": 5.168201350521793, + "grad_norm": 0.23614190518856049, + "learning_rate": 4.9711709125528635e-05, + "loss": 1.7556, + "step": 16838 + }, + { + "epoch": 5.168508287292818, + "grad_norm": 0.2031065821647644, + "learning_rate": 4.97067386497248e-05, + "loss": 1.7678, + "step": 16839 + }, + { + "epoch": 5.1688152240638425, + "grad_norm": 0.30695948004722595, + "learning_rate": 4.970176817681917e-05, + "loss": 1.7907, + "step": 16840 + }, + { + "epoch": 5.169122160834868, + "grad_norm": 0.31256723403930664, + "learning_rate": 4.969679770686082e-05, + "loss": 1.7448, + "step": 16841 + }, + { + "epoch": 5.169429097605893, + "grad_norm": 0.24183644354343414, + "learning_rate": 4.969182723989892e-05, + "loss": 1.7259, + "step": 16842 + }, + { + "epoch": 5.1697360343769185, + "grad_norm": 0.22440548241138458, + "learning_rate": 4.9686856775982536e-05, + "loss": 1.7949, + "step": 16843 + }, + { + "epoch": 5.170042971147944, + "grad_norm": 0.29006195068359375, + "learning_rate": 4.9681886315160846e-05, + "loss": 1.7128, + "step": 16844 + }, + { + "epoch": 5.170349907918968, + "grad_norm": 0.2189658135175705, + "learning_rate": 4.967691585748292e-05, + "loss": 1.7375, + "step": 16845 + }, + { + "epoch": 5.170656844689994, + "grad_norm": 0.289909690618515, + "learning_rate": 4.967194540299791e-05, + "loss": 1.779, + "step": 16846 + }, + { + "epoch": 5.170963781461019, + "grad_norm": 0.28279590606689453, + "learning_rate": 4.966697495175492e-05, + "loss": 1.7368, + "step": 16847 + }, + { + "epoch": 5.171270718232044, + "grad_norm": 0.2056259959936142, + "learning_rate": 4.966200450380309e-05, + "loss": 1.7548, + "step": 16848 + }, + { + "epoch": 5.17157765500307, + "grad_norm": 0.2607482969760895, + "learning_rate": 4.965703405919154e-05, + "loss": 1.7178, + "step": 16849 + }, + { + "epoch": 5.171884591774095, + "grad_norm": 0.26085609197616577, + "learning_rate": 4.965206361796935e-05, + "loss": 1.751, + "step": 16850 + }, + { + 
"epoch": 5.172191528545119, + "grad_norm": 0.17960335314273834, + "learning_rate": 4.964709318018569e-05, + "loss": 1.6932, + "step": 16851 + }, + { + "epoch": 5.172498465316145, + "grad_norm": 0.2617340385913849, + "learning_rate": 4.964212274588965e-05, + "loss": 1.7753, + "step": 16852 + }, + { + "epoch": 5.17280540208717, + "grad_norm": 0.2454555630683899, + "learning_rate": 4.9637152315130383e-05, + "loss": 1.7587, + "step": 16853 + }, + { + "epoch": 5.173112338858195, + "grad_norm": 0.19221605360507965, + "learning_rate": 4.963218188795696e-05, + "loss": 1.7337, + "step": 16854 + }, + { + "epoch": 5.173419275629221, + "grad_norm": 0.24314738810062408, + "learning_rate": 4.9627211464418565e-05, + "loss": 1.725, + "step": 16855 + }, + { + "epoch": 5.173726212400245, + "grad_norm": 0.2533986568450928, + "learning_rate": 4.962224104456426e-05, + "loss": 1.7502, + "step": 16856 + }, + { + "epoch": 5.1740331491712706, + "grad_norm": 0.21800079941749573, + "learning_rate": 4.9617270628443195e-05, + "loss": 1.7622, + "step": 16857 + }, + { + "epoch": 5.174340085942296, + "grad_norm": 0.22742362320423126, + "learning_rate": 4.96123002161045e-05, + "loss": 1.7078, + "step": 16858 + }, + { + "epoch": 5.174647022713321, + "grad_norm": 0.22729982435703278, + "learning_rate": 4.960732980759727e-05, + "loss": 1.8349, + "step": 16859 + }, + { + "epoch": 5.1749539594843466, + "grad_norm": 0.28869518637657166, + "learning_rate": 4.9602359402970625e-05, + "loss": 1.8932, + "step": 16860 + }, + { + "epoch": 5.175260896255371, + "grad_norm": 0.21931354701519012, + "learning_rate": 4.9597389002273725e-05, + "loss": 1.6989, + "step": 16861 + }, + { + "epoch": 5.175567833026396, + "grad_norm": 0.2130192667245865, + "learning_rate": 4.959241860555564e-05, + "loss": 1.752, + "step": 16862 + }, + { + "epoch": 5.175874769797422, + "grad_norm": 0.21272781491279602, + "learning_rate": 4.958744821286553e-05, + "loss": 1.7402, + "step": 16863 + }, + { + "epoch": 5.176181706568447, + 
"grad_norm": 0.20279285311698914, + "learning_rate": 4.958247782425248e-05, + "loss": 1.7103, + "step": 16864 + }, + { + "epoch": 5.176488643339472, + "grad_norm": 0.23561790585517883, + "learning_rate": 4.957750743976564e-05, + "loss": 1.7742, + "step": 16865 + }, + { + "epoch": 5.176795580110497, + "grad_norm": 0.27608510851860046, + "learning_rate": 4.957253705945413e-05, + "loss": 1.7505, + "step": 16866 + }, + { + "epoch": 5.177102516881522, + "grad_norm": 0.20624001324176788, + "learning_rate": 4.956756668336704e-05, + "loss": 1.7032, + "step": 16867 + }, + { + "epoch": 5.1774094536525475, + "grad_norm": 0.23743939399719238, + "learning_rate": 4.956259631155352e-05, + "loss": 1.7469, + "step": 16868 + }, + { + "epoch": 5.177716390423573, + "grad_norm": 0.27421119809150696, + "learning_rate": 4.9557625944062675e-05, + "loss": 1.7028, + "step": 16869 + }, + { + "epoch": 5.178023327194598, + "grad_norm": 0.23788046836853027, + "learning_rate": 4.955265558094363e-05, + "loss": 1.7468, + "step": 16870 + }, + { + "epoch": 5.1783302639656235, + "grad_norm": 0.24712958931922913, + "learning_rate": 4.95476852222455e-05, + "loss": 1.7348, + "step": 16871 + }, + { + "epoch": 5.178637200736648, + "grad_norm": 0.21558570861816406, + "learning_rate": 4.9542714868017424e-05, + "loss": 1.7599, + "step": 16872 + }, + { + "epoch": 5.178944137507673, + "grad_norm": 0.2561664283275604, + "learning_rate": 4.953774451830849e-05, + "loss": 1.7673, + "step": 16873 + }, + { + "epoch": 5.179251074278699, + "grad_norm": 0.19761815667152405, + "learning_rate": 4.953277417316786e-05, + "loss": 1.743, + "step": 16874 + }, + { + "epoch": 5.179558011049724, + "grad_norm": 0.24140769243240356, + "learning_rate": 4.95278038326446e-05, + "loss": 1.8229, + "step": 16875 + }, + { + "epoch": 5.179864947820749, + "grad_norm": 0.21686211228370667, + "learning_rate": 4.9522833496787876e-05, + "loss": 1.7914, + "step": 16876 + }, + { + "epoch": 5.180171884591774, + "grad_norm": 0.2537819743156433, + 
"learning_rate": 4.951786316564678e-05, + "loss": 1.7532, + "step": 16877 + }, + { + "epoch": 5.180478821362799, + "grad_norm": 0.24567632377147675, + "learning_rate": 4.951289283927046e-05, + "loss": 1.7528, + "step": 16878 + }, + { + "epoch": 5.180785758133824, + "grad_norm": 0.1958467960357666, + "learning_rate": 4.9507922517708e-05, + "loss": 1.6922, + "step": 16879 + }, + { + "epoch": 5.18109269490485, + "grad_norm": 0.2012091726064682, + "learning_rate": 4.950295220100857e-05, + "loss": 1.7509, + "step": 16880 + }, + { + "epoch": 5.181399631675875, + "grad_norm": 0.2416311800479889, + "learning_rate": 4.9497981889221226e-05, + "loss": 1.7341, + "step": 16881 + }, + { + "epoch": 5.1817065684469, + "grad_norm": 0.21407842636108398, + "learning_rate": 4.949301158239513e-05, + "loss": 1.7493, + "step": 16882 + }, + { + "epoch": 5.182013505217925, + "grad_norm": 0.2354930192232132, + "learning_rate": 4.94880412805794e-05, + "loss": 1.7726, + "step": 16883 + }, + { + "epoch": 5.18232044198895, + "grad_norm": 0.2168428748846054, + "learning_rate": 4.948307098382313e-05, + "loss": 1.77, + "step": 16884 + }, + { + "epoch": 5.1826273787599755, + "grad_norm": 0.19605880975723267, + "learning_rate": 4.947810069217547e-05, + "loss": 1.7292, + "step": 16885 + }, + { + "epoch": 5.182934315531001, + "grad_norm": 0.23066702485084534, + "learning_rate": 4.947313040568551e-05, + "loss": 1.7265, + "step": 16886 + }, + { + "epoch": 5.183241252302026, + "grad_norm": 0.20139534771442413, + "learning_rate": 4.9468160124402386e-05, + "loss": 1.7443, + "step": 16887 + }, + { + "epoch": 5.183548189073051, + "grad_norm": 0.25097572803497314, + "learning_rate": 4.946318984837521e-05, + "loss": 1.7537, + "step": 16888 + }, + { + "epoch": 5.183855125844076, + "grad_norm": 0.26215067505836487, + "learning_rate": 4.945821957765313e-05, + "loss": 1.8397, + "step": 16889 + }, + { + "epoch": 5.184162062615101, + "grad_norm": 0.22072140872478485, + "learning_rate": 4.9453249312285215e-05, + 
"loss": 1.7052, + "step": 16890 + }, + { + "epoch": 5.184468999386127, + "grad_norm": 0.20372305810451508, + "learning_rate": 4.944827905232064e-05, + "loss": 1.7228, + "step": 16891 + }, + { + "epoch": 5.184775936157152, + "grad_norm": 0.20383495092391968, + "learning_rate": 4.944330879780847e-05, + "loss": 1.7063, + "step": 16892 + }, + { + "epoch": 5.185082872928176, + "grad_norm": 0.1903693675994873, + "learning_rate": 4.943833854879786e-05, + "loss": 1.6435, + "step": 16893 + }, + { + "epoch": 5.185389809699202, + "grad_norm": 0.20357775688171387, + "learning_rate": 4.94333683053379e-05, + "loss": 1.7485, + "step": 16894 + }, + { + "epoch": 5.185696746470227, + "grad_norm": 0.24776104092597961, + "learning_rate": 4.942839806747775e-05, + "loss": 1.718, + "step": 16895 + }, + { + "epoch": 5.186003683241252, + "grad_norm": 0.2455051839351654, + "learning_rate": 4.942342783526649e-05, + "loss": 1.7124, + "step": 16896 + }, + { + "epoch": 5.186310620012278, + "grad_norm": 0.2102014273405075, + "learning_rate": 4.941845760875328e-05, + "loss": 1.7584, + "step": 16897 + }, + { + "epoch": 5.186617556783303, + "grad_norm": 0.2177651822566986, + "learning_rate": 4.941348738798718e-05, + "loss": 1.7019, + "step": 16898 + }, + { + "epoch": 5.1869244935543275, + "grad_norm": 0.21296697854995728, + "learning_rate": 4.9408517173017355e-05, + "loss": 1.7299, + "step": 16899 + }, + { + "epoch": 5.187231430325353, + "grad_norm": 0.23485495150089264, + "learning_rate": 4.940354696389292e-05, + "loss": 1.7271, + "step": 16900 + }, + { + "epoch": 5.187538367096378, + "grad_norm": 0.27287766337394714, + "learning_rate": 4.939857676066297e-05, + "loss": 1.7601, + "step": 16901 + }, + { + "epoch": 5.1878453038674035, + "grad_norm": 0.2060246467590332, + "learning_rate": 4.939360656337665e-05, + "loss": 1.7064, + "step": 16902 + }, + { + "epoch": 5.188152240638429, + "grad_norm": 0.25422418117523193, + "learning_rate": 4.938863637208305e-05, + "loss": 1.7423, + "step": 16903 + }, + { 
+ "epoch": 5.188459177409453, + "grad_norm": 0.2798483669757843, + "learning_rate": 4.9383666186831304e-05, + "loss": 1.7132, + "step": 16904 + }, + { + "epoch": 5.188766114180479, + "grad_norm": 0.23505693674087524, + "learning_rate": 4.9378696007670525e-05, + "loss": 1.7759, + "step": 16905 + }, + { + "epoch": 5.189073050951504, + "grad_norm": 0.23761989176273346, + "learning_rate": 4.937372583464987e-05, + "loss": 1.7076, + "step": 16906 + }, + { + "epoch": 5.189379987722529, + "grad_norm": 0.3005945086479187, + "learning_rate": 4.9368755667818385e-05, + "loss": 1.6957, + "step": 16907 + }, + { + "epoch": 5.189686924493555, + "grad_norm": 0.2502881586551666, + "learning_rate": 4.936378550722525e-05, + "loss": 1.7352, + "step": 16908 + }, + { + "epoch": 5.189993861264579, + "grad_norm": 0.24194179475307465, + "learning_rate": 4.9358815352919544e-05, + "loss": 1.738, + "step": 16909 + }, + { + "epoch": 5.190300798035604, + "grad_norm": 0.27478742599487305, + "learning_rate": 4.935384520495041e-05, + "loss": 1.7118, + "step": 16910 + }, + { + "epoch": 5.19060773480663, + "grad_norm": 0.22327560186386108, + "learning_rate": 4.9348875063366944e-05, + "loss": 1.7697, + "step": 16911 + }, + { + "epoch": 5.190914671577655, + "grad_norm": 0.21844418346881866, + "learning_rate": 4.9343904928218295e-05, + "loss": 1.7733, + "step": 16912 + }, + { + "epoch": 5.19122160834868, + "grad_norm": 0.25267866253852844, + "learning_rate": 4.933893479955354e-05, + "loss": 1.7313, + "step": 16913 + }, + { + "epoch": 5.191528545119706, + "grad_norm": 0.22045068442821503, + "learning_rate": 4.933396467742185e-05, + "loss": 1.7856, + "step": 16914 + }, + { + "epoch": 5.19183548189073, + "grad_norm": 0.22642305493354797, + "learning_rate": 4.932899456187229e-05, + "loss": 1.7326, + "step": 16915 + }, + { + "epoch": 5.1921424186617555, + "grad_norm": 0.20601733028888702, + "learning_rate": 4.9324024452953995e-05, + "loss": 1.7743, + "step": 16916 + }, + { + "epoch": 5.192449355432781, + 
"grad_norm": 0.25580671429634094, + "learning_rate": 4.931905435071611e-05, + "loss": 1.7705, + "step": 16917 + }, + { + "epoch": 5.192756292203806, + "grad_norm": 0.38173142075538635, + "learning_rate": 4.9314084255207706e-05, + "loss": 1.7504, + "step": 16918 + }, + { + "epoch": 5.1930632289748315, + "grad_norm": 0.2254420667886734, + "learning_rate": 4.930911416647794e-05, + "loss": 1.7344, + "step": 16919 + }, + { + "epoch": 5.193370165745856, + "grad_norm": 0.2354312688112259, + "learning_rate": 4.9304144084575896e-05, + "loss": 1.7607, + "step": 16920 + }, + { + "epoch": 5.193677102516881, + "grad_norm": 0.23879510164260864, + "learning_rate": 4.9299174009550716e-05, + "loss": 1.683, + "step": 16921 + }, + { + "epoch": 5.193984039287907, + "grad_norm": 0.228669211268425, + "learning_rate": 4.9294203941451494e-05, + "loss": 1.7776, + "step": 16922 + }, + { + "epoch": 5.194290976058932, + "grad_norm": 0.2266843616962433, + "learning_rate": 4.928923388032739e-05, + "loss": 1.7563, + "step": 16923 + }, + { + "epoch": 5.194597912829957, + "grad_norm": 0.2581404745578766, + "learning_rate": 4.928426382622747e-05, + "loss": 1.8112, + "step": 16924 + }, + { + "epoch": 5.194904849600983, + "grad_norm": 0.25179803371429443, + "learning_rate": 4.92792937792009e-05, + "loss": 1.7661, + "step": 16925 + }, + { + "epoch": 5.195211786372007, + "grad_norm": 0.23408514261245728, + "learning_rate": 4.9274323739296746e-05, + "loss": 1.7618, + "step": 16926 + }, + { + "epoch": 5.195518723143032, + "grad_norm": 0.23110872507095337, + "learning_rate": 4.926935370656416e-05, + "loss": 1.6945, + "step": 16927 + }, + { + "epoch": 5.195825659914058, + "grad_norm": 0.2863025665283203, + "learning_rate": 4.926438368105224e-05, + "loss": 1.8659, + "step": 16928 + }, + { + "epoch": 5.196132596685083, + "grad_norm": 0.2156454175710678, + "learning_rate": 4.925941366281013e-05, + "loss": 1.7281, + "step": 16929 + }, + { + "epoch": 5.196439533456108, + "grad_norm": 0.2338300198316574, + 
"learning_rate": 4.925444365188691e-05, + "loss": 1.7271, + "step": 16930 + }, + { + "epoch": 5.196746470227133, + "grad_norm": 0.21434102952480316, + "learning_rate": 4.924947364833173e-05, + "loss": 1.7342, + "step": 16931 + }, + { + "epoch": 5.197053406998158, + "grad_norm": 0.21619778871536255, + "learning_rate": 4.924450365219369e-05, + "loss": 1.7493, + "step": 16932 + }, + { + "epoch": 5.1973603437691835, + "grad_norm": 0.24532032012939453, + "learning_rate": 4.9239533663521896e-05, + "loss": 1.7707, + "step": 16933 + }, + { + "epoch": 5.197667280540209, + "grad_norm": 0.21795547008514404, + "learning_rate": 4.923456368236549e-05, + "loss": 1.7642, + "step": 16934 + }, + { + "epoch": 5.197974217311234, + "grad_norm": 0.2070101797580719, + "learning_rate": 4.922959370877356e-05, + "loss": 1.7377, + "step": 16935 + }, + { + "epoch": 5.198281154082259, + "grad_norm": 0.22546489536762238, + "learning_rate": 4.9224623742795256e-05, + "loss": 1.7766, + "step": 16936 + }, + { + "epoch": 5.198588090853284, + "grad_norm": 0.20723624527454376, + "learning_rate": 4.921965378447965e-05, + "loss": 1.7316, + "step": 16937 + }, + { + "epoch": 5.198895027624309, + "grad_norm": 0.21870547533035278, + "learning_rate": 4.9214683833875905e-05, + "loss": 1.7653, + "step": 16938 + }, + { + "epoch": 5.199201964395335, + "grad_norm": 0.19606490433216095, + "learning_rate": 4.920971389103309e-05, + "loss": 1.7181, + "step": 16939 + }, + { + "epoch": 5.19950890116636, + "grad_norm": 0.18372730910778046, + "learning_rate": 4.920474395600037e-05, + "loss": 1.7041, + "step": 16940 + }, + { + "epoch": 5.199815837937384, + "grad_norm": 0.22051765024662018, + "learning_rate": 4.919977402882682e-05, + "loss": 1.7172, + "step": 16941 + }, + { + "epoch": 5.20012277470841, + "grad_norm": 0.2135835587978363, + "learning_rate": 4.919480410956159e-05, + "loss": 1.6918, + "step": 16942 + }, + { + "epoch": 5.200429711479435, + "grad_norm": 0.19619768857955933, + "learning_rate": 
4.918983419825376e-05, + "loss": 1.7005, + "step": 16943 + }, + { + "epoch": 5.2007366482504604, + "grad_norm": 0.22726574540138245, + "learning_rate": 4.918486429495246e-05, + "loss": 1.6775, + "step": 16944 + }, + { + "epoch": 5.201043585021486, + "grad_norm": 0.21471361815929413, + "learning_rate": 4.9179894399706815e-05, + "loss": 1.7102, + "step": 16945 + }, + { + "epoch": 5.201350521792511, + "grad_norm": 0.20113740861415863, + "learning_rate": 4.917492451256595e-05, + "loss": 1.7548, + "step": 16946 + }, + { + "epoch": 5.201657458563536, + "grad_norm": 0.2337827831506729, + "learning_rate": 4.916995463357894e-05, + "loss": 1.818, + "step": 16947 + }, + { + "epoch": 5.201964395334561, + "grad_norm": 0.2649554908275604, + "learning_rate": 4.9164984762794955e-05, + "loss": 1.7784, + "step": 16948 + }, + { + "epoch": 5.202271332105586, + "grad_norm": 0.2297617793083191, + "learning_rate": 4.916001490026306e-05, + "loss": 1.7484, + "step": 16949 + }, + { + "epoch": 5.202578268876612, + "grad_norm": 0.20791979134082794, + "learning_rate": 4.915504504603238e-05, + "loss": 1.7164, + "step": 16950 + }, + { + "epoch": 5.202885205647637, + "grad_norm": 0.21769596636295319, + "learning_rate": 4.915007520015207e-05, + "loss": 1.7783, + "step": 16951 + }, + { + "epoch": 5.203192142418661, + "grad_norm": 0.21038469672203064, + "learning_rate": 4.914510536267118e-05, + "loss": 1.6863, + "step": 16952 + }, + { + "epoch": 5.203499079189687, + "grad_norm": 0.20725449919700623, + "learning_rate": 4.914013553363889e-05, + "loss": 1.6855, + "step": 16953 + }, + { + "epoch": 5.203806015960712, + "grad_norm": 0.23879854381084442, + "learning_rate": 4.9135165713104266e-05, + "loss": 1.6986, + "step": 16954 + }, + { + "epoch": 5.204112952731737, + "grad_norm": 0.20515915751457214, + "learning_rate": 4.913019590111645e-05, + "loss": 1.6912, + "step": 16955 + }, + { + "epoch": 5.204419889502763, + "grad_norm": 0.2252528965473175, + "learning_rate": 4.912522609772453e-05, + "loss": 
1.6974, + "step": 16956 + }, + { + "epoch": 5.204726826273788, + "grad_norm": 0.1946130096912384, + "learning_rate": 4.9120256302977665e-05, + "loss": 1.7009, + "step": 16957 + }, + { + "epoch": 5.2050337630448125, + "grad_norm": 0.21323645114898682, + "learning_rate": 4.9115286516924925e-05, + "loss": 1.7746, + "step": 16958 + }, + { + "epoch": 5.205340699815838, + "grad_norm": 0.20721712708473206, + "learning_rate": 4.911031673961546e-05, + "loss": 1.7103, + "step": 16959 + }, + { + "epoch": 5.205647636586863, + "grad_norm": 0.19630689918994904, + "learning_rate": 4.910534697109834e-05, + "loss": 1.7042, + "step": 16960 + }, + { + "epoch": 5.2059545733578885, + "grad_norm": 0.2036786526441574, + "learning_rate": 4.910037721142273e-05, + "loss": 1.7713, + "step": 16961 + }, + { + "epoch": 5.206261510128914, + "grad_norm": 0.20518352091312408, + "learning_rate": 4.9095407460637696e-05, + "loss": 1.7456, + "step": 16962 + }, + { + "epoch": 5.206568446899938, + "grad_norm": 0.199858620762825, + "learning_rate": 4.9090437718792404e-05, + "loss": 1.7598, + "step": 16963 + }, + { + "epoch": 5.206875383670964, + "grad_norm": 0.22860252857208252, + "learning_rate": 4.9085467985935914e-05, + "loss": 1.7947, + "step": 16964 + }, + { + "epoch": 5.207182320441989, + "grad_norm": 0.22179929912090302, + "learning_rate": 4.9080498262117395e-05, + "loss": 1.7537, + "step": 16965 + }, + { + "epoch": 5.207489257213014, + "grad_norm": 0.24737581610679626, + "learning_rate": 4.9075528547385906e-05, + "loss": 1.7932, + "step": 16966 + }, + { + "epoch": 5.20779619398404, + "grad_norm": 0.2653762400150299, + "learning_rate": 4.907055884179059e-05, + "loss": 1.7683, + "step": 16967 + }, + { + "epoch": 5.208103130755064, + "grad_norm": 0.2891876697540283, + "learning_rate": 4.9065589145380564e-05, + "loss": 1.7867, + "step": 16968 + }, + { + "epoch": 5.208410067526089, + "grad_norm": 0.23162086308002472, + "learning_rate": 4.906061945820492e-05, + "loss": 1.7981, + "step": 16969 + }, + { 
+ "epoch": 5.208717004297115, + "grad_norm": 0.2746187150478363, + "learning_rate": 4.9055649780312805e-05, + "loss": 1.7215, + "step": 16970 + }, + { + "epoch": 5.20902394106814, + "grad_norm": 0.3217853605747223, + "learning_rate": 4.905068011175329e-05, + "loss": 1.8027, + "step": 16971 + }, + { + "epoch": 5.209330877839165, + "grad_norm": 0.21517686545848846, + "learning_rate": 4.904571045257553e-05, + "loss": 1.7055, + "step": 16972 + }, + { + "epoch": 5.209637814610191, + "grad_norm": 0.23613709211349487, + "learning_rate": 4.90407408028286e-05, + "loss": 1.751, + "step": 16973 + }, + { + "epoch": 5.209944751381215, + "grad_norm": 0.35093945264816284, + "learning_rate": 4.903577116256165e-05, + "loss": 1.7749, + "step": 16974 + }, + { + "epoch": 5.2102516881522405, + "grad_norm": 0.3289217948913574, + "learning_rate": 4.903080153182376e-05, + "loss": 1.7722, + "step": 16975 + }, + { + "epoch": 5.210558624923266, + "grad_norm": 0.29387256503105164, + "learning_rate": 4.9025831910664074e-05, + "loss": 1.8121, + "step": 16976 + }, + { + "epoch": 5.210865561694291, + "grad_norm": 0.44418805837631226, + "learning_rate": 4.9020862299131664e-05, + "loss": 1.7744, + "step": 16977 + }, + { + "epoch": 5.2111724984653165, + "grad_norm": 0.39242252707481384, + "learning_rate": 4.901589269727568e-05, + "loss": 1.7183, + "step": 16978 + }, + { + "epoch": 5.211479435236341, + "grad_norm": 0.2028690129518509, + "learning_rate": 4.901092310514522e-05, + "loss": 1.7101, + "step": 16979 + }, + { + "epoch": 5.211786372007366, + "grad_norm": 0.4025843143463135, + "learning_rate": 4.900595352278941e-05, + "loss": 1.7545, + "step": 16980 + }, + { + "epoch": 5.212093308778392, + "grad_norm": 0.284568727016449, + "learning_rate": 4.900098395025733e-05, + "loss": 1.7758, + "step": 16981 + }, + { + "epoch": 5.212400245549417, + "grad_norm": 0.2527516484260559, + "learning_rate": 4.899601438759813e-05, + "loss": 1.695, + "step": 16982 + }, + { + "epoch": 5.212707182320442, + 
"grad_norm": 0.3063630759716034, + "learning_rate": 4.89910448348609e-05, + "loss": 1.714, + "step": 16983 + }, + { + "epoch": 5.213014119091467, + "grad_norm": 0.22754468023777008, + "learning_rate": 4.898607529209474e-05, + "loss": 1.8315, + "step": 16984 + }, + { + "epoch": 5.213321055862492, + "grad_norm": 0.29594969749450684, + "learning_rate": 4.89811057593488e-05, + "loss": 1.6669, + "step": 16985 + }, + { + "epoch": 5.213627992633517, + "grad_norm": 0.21486569941043854, + "learning_rate": 4.897613623667215e-05, + "loss": 1.7425, + "step": 16986 + }, + { + "epoch": 5.213934929404543, + "grad_norm": 0.30908775329589844, + "learning_rate": 4.897116672411395e-05, + "loss": 1.7915, + "step": 16987 + }, + { + "epoch": 5.214241866175568, + "grad_norm": 0.23515601456165314, + "learning_rate": 4.896619722172325e-05, + "loss": 1.7226, + "step": 16988 + }, + { + "epoch": 5.214548802946593, + "grad_norm": 0.2847287952899933, + "learning_rate": 4.8961227729549215e-05, + "loss": 1.7641, + "step": 16989 + }, + { + "epoch": 5.214855739717618, + "grad_norm": 0.2986287772655487, + "learning_rate": 4.895625824764092e-05, + "loss": 1.8025, + "step": 16990 + }, + { + "epoch": 5.215162676488643, + "grad_norm": 0.23454971611499786, + "learning_rate": 4.8951288776047514e-05, + "loss": 1.7057, + "step": 16991 + }, + { + "epoch": 5.2154696132596685, + "grad_norm": 0.2578633725643158, + "learning_rate": 4.894631931481807e-05, + "loss": 1.7267, + "step": 16992 + }, + { + "epoch": 5.215776550030694, + "grad_norm": 0.29975566267967224, + "learning_rate": 4.894134986400174e-05, + "loss": 1.7452, + "step": 16993 + }, + { + "epoch": 5.216083486801719, + "grad_norm": 0.22313638031482697, + "learning_rate": 4.893638042364758e-05, + "loss": 1.6917, + "step": 16994 + }, + { + "epoch": 5.216390423572744, + "grad_norm": 0.258297860622406, + "learning_rate": 4.893141099380475e-05, + "loss": 1.7816, + "step": 16995 + }, + { + "epoch": 5.216697360343769, + "grad_norm": 0.2656872272491455, + 
"learning_rate": 4.892644157452233e-05, + "loss": 1.7248, + "step": 16996 + }, + { + "epoch": 5.217004297114794, + "grad_norm": 0.20239698886871338, + "learning_rate": 4.8921472165849464e-05, + "loss": 1.7629, + "step": 16997 + }, + { + "epoch": 5.21731123388582, + "grad_norm": 0.2575492262840271, + "learning_rate": 4.891650276783523e-05, + "loss": 1.719, + "step": 16998 + }, + { + "epoch": 5.217618170656845, + "grad_norm": 0.27563637495040894, + "learning_rate": 4.8911533380528756e-05, + "loss": 1.718, + "step": 16999 + }, + { + "epoch": 5.21792510742787, + "grad_norm": 0.1969723105430603, + "learning_rate": 4.890656400397915e-05, + "loss": 1.7557, + "step": 17000 + }, + { + "epoch": 5.218232044198895, + "grad_norm": 0.24336831271648407, + "learning_rate": 4.89015946382355e-05, + "loss": 1.6861, + "step": 17001 + }, + { + "epoch": 5.21853898096992, + "grad_norm": 0.2804388403892517, + "learning_rate": 4.889662528334696e-05, + "loss": 1.7411, + "step": 17002 + }, + { + "epoch": 5.218845917740945, + "grad_norm": 0.21116352081298828, + "learning_rate": 4.8891655939362596e-05, + "loss": 1.7135, + "step": 17003 + }, + { + "epoch": 5.219152854511971, + "grad_norm": 0.21042904257774353, + "learning_rate": 4.8886686606331556e-05, + "loss": 1.7224, + "step": 17004 + }, + { + "epoch": 5.219459791282996, + "grad_norm": 0.22463755309581757, + "learning_rate": 4.888171728430291e-05, + "loss": 1.8272, + "step": 17005 + }, + { + "epoch": 5.2197667280540205, + "grad_norm": 0.25604158639907837, + "learning_rate": 4.8876747973325805e-05, + "loss": 1.674, + "step": 17006 + }, + { + "epoch": 5.220073664825046, + "grad_norm": 0.3108421564102173, + "learning_rate": 4.887177867344932e-05, + "loss": 1.761, + "step": 17007 + }, + { + "epoch": 5.220380601596071, + "grad_norm": 0.25135359168052673, + "learning_rate": 4.88668093847226e-05, + "loss": 1.7455, + "step": 17008 + }, + { + "epoch": 5.2206875383670965, + "grad_norm": 0.24508307874202728, + "learning_rate": 4.886184010719471e-05, + 
"loss": 1.7632, + "step": 17009 + }, + { + "epoch": 5.220994475138122, + "grad_norm": 0.26777148246765137, + "learning_rate": 4.8856870840914816e-05, + "loss": 1.7814, + "step": 17010 + }, + { + "epoch": 5.221301411909146, + "grad_norm": 0.22404739260673523, + "learning_rate": 4.8851901585931967e-05, + "loss": 1.7441, + "step": 17011 + }, + { + "epoch": 5.221608348680172, + "grad_norm": 0.2406606674194336, + "learning_rate": 4.884693234229531e-05, + "loss": 1.7789, + "step": 17012 + }, + { + "epoch": 5.221915285451197, + "grad_norm": 0.27320384979248047, + "learning_rate": 4.884196311005394e-05, + "loss": 1.8046, + "step": 17013 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.3393586277961731, + "learning_rate": 4.8836993889256965e-05, + "loss": 1.7155, + "step": 17014 + }, + { + "epoch": 5.222529158993248, + "grad_norm": 0.3069504499435425, + "learning_rate": 4.88320246799535e-05, + "loss": 1.6985, + "step": 17015 + }, + { + "epoch": 5.222836095764273, + "grad_norm": 0.22184616327285767, + "learning_rate": 4.8827055482192664e-05, + "loss": 1.7996, + "step": 17016 + }, + { + "epoch": 5.223143032535297, + "grad_norm": 0.2791864573955536, + "learning_rate": 4.8822086296023544e-05, + "loss": 1.7223, + "step": 17017 + }, + { + "epoch": 5.223449969306323, + "grad_norm": 0.259726345539093, + "learning_rate": 4.8817117121495245e-05, + "loss": 1.7481, + "step": 17018 + }, + { + "epoch": 5.223756906077348, + "grad_norm": 0.19968681037425995, + "learning_rate": 4.8812147958656916e-05, + "loss": 1.702, + "step": 17019 + }, + { + "epoch": 5.224063842848373, + "grad_norm": 0.20161856710910797, + "learning_rate": 4.8807178807557616e-05, + "loss": 1.6689, + "step": 17020 + }, + { + "epoch": 5.224370779619399, + "grad_norm": 0.2365240454673767, + "learning_rate": 4.880220966824649e-05, + "loss": 1.7742, + "step": 17021 + }, + { + "epoch": 5.224677716390423, + "grad_norm": 0.20116381347179413, + "learning_rate": 4.879724054077261e-05, + "loss": 1.7584, + "step": 17022 + }, 
+ { + "epoch": 5.2249846531614486, + "grad_norm": 0.22845037281513214, + "learning_rate": 4.879227142518511e-05, + "loss": 1.7794, + "step": 17023 + }, + { + "epoch": 5.225291589932474, + "grad_norm": 0.251724511384964, + "learning_rate": 4.87873023215331e-05, + "loss": 1.7722, + "step": 17024 + }, + { + "epoch": 5.225598526703499, + "grad_norm": 0.206145241856575, + "learning_rate": 4.878233322986568e-05, + "loss": 1.7452, + "step": 17025 + }, + { + "epoch": 5.225905463474525, + "grad_norm": 0.24065247178077698, + "learning_rate": 4.877736415023194e-05, + "loss": 1.8144, + "step": 17026 + }, + { + "epoch": 5.226212400245549, + "grad_norm": 0.2255484163761139, + "learning_rate": 4.877239508268103e-05, + "loss": 1.706, + "step": 17027 + }, + { + "epoch": 5.226519337016574, + "grad_norm": 0.21035850048065186, + "learning_rate": 4.8767426027262e-05, + "loss": 1.7167, + "step": 17028 + }, + { + "epoch": 5.2268262737876, + "grad_norm": 0.19618964195251465, + "learning_rate": 4.8762456984024025e-05, + "loss": 1.7063, + "step": 17029 + }, + { + "epoch": 5.227133210558625, + "grad_norm": 0.19595398008823395, + "learning_rate": 4.875748795301614e-05, + "loss": 1.7452, + "step": 17030 + }, + { + "epoch": 5.22744014732965, + "grad_norm": 0.22870996594429016, + "learning_rate": 4.8752518934287506e-05, + "loss": 1.8169, + "step": 17031 + }, + { + "epoch": 5.227747084100676, + "grad_norm": 0.24048443138599396, + "learning_rate": 4.87475499278872e-05, + "loss": 1.6988, + "step": 17032 + }, + { + "epoch": 5.2280540208717, + "grad_norm": 0.24177183210849762, + "learning_rate": 4.8742580933864356e-05, + "loss": 1.77, + "step": 17033 + }, + { + "epoch": 5.2283609576427255, + "grad_norm": 0.2023085057735443, + "learning_rate": 4.873761195226806e-05, + "loss": 1.7, + "step": 17034 + }, + { + "epoch": 5.228667894413751, + "grad_norm": 0.2614101767539978, + "learning_rate": 4.873264298314742e-05, + "loss": 1.767, + "step": 17035 + }, + { + "epoch": 5.228974831184776, + "grad_norm": 
0.19607602059841156, + "learning_rate": 4.872767402655154e-05, + "loss": 1.7391, + "step": 17036 + }, + { + "epoch": 5.2292817679558015, + "grad_norm": 0.2053994983434677, + "learning_rate": 4.872270508252953e-05, + "loss": 1.7155, + "step": 17037 + }, + { + "epoch": 5.229588704726826, + "grad_norm": 0.18256273865699768, + "learning_rate": 4.871773615113051e-05, + "loss": 1.6999, + "step": 17038 + }, + { + "epoch": 5.229895641497851, + "grad_norm": 0.21956393122673035, + "learning_rate": 4.871276723240356e-05, + "loss": 1.7946, + "step": 17039 + }, + { + "epoch": 5.230202578268877, + "grad_norm": 0.23779109120368958, + "learning_rate": 4.870779832639781e-05, + "loss": 1.8063, + "step": 17040 + }, + { + "epoch": 5.230509515039902, + "grad_norm": 0.21662941575050354, + "learning_rate": 4.8702829433162346e-05, + "loss": 1.7276, + "step": 17041 + }, + { + "epoch": 5.230816451810927, + "grad_norm": 0.21578755974769592, + "learning_rate": 4.869786055274628e-05, + "loss": 1.7577, + "step": 17042 + }, + { + "epoch": 5.231123388581952, + "grad_norm": 0.23229347169399261, + "learning_rate": 4.8692891685198715e-05, + "loss": 1.7884, + "step": 17043 + }, + { + "epoch": 5.231430325352977, + "grad_norm": 0.2302366942167282, + "learning_rate": 4.868792283056878e-05, + "loss": 1.7823, + "step": 17044 + }, + { + "epoch": 5.231737262124002, + "grad_norm": 0.2181033343076706, + "learning_rate": 4.868295398890554e-05, + "loss": 1.7027, + "step": 17045 + }, + { + "epoch": 5.232044198895028, + "grad_norm": 0.20863409340381622, + "learning_rate": 4.8677985160258135e-05, + "loss": 1.7247, + "step": 17046 + }, + { + "epoch": 5.232351135666053, + "grad_norm": 0.2242976278066635, + "learning_rate": 4.867301634467564e-05, + "loss": 1.7799, + "step": 17047 + }, + { + "epoch": 5.232658072437078, + "grad_norm": 0.19934964179992676, + "learning_rate": 4.866804754220719e-05, + "loss": 1.6973, + "step": 17048 + }, + { + "epoch": 5.232965009208103, + "grad_norm": 0.22056198120117188, + 
"learning_rate": 4.8663078752901855e-05, + "loss": 1.7677, + "step": 17049 + }, + { + "epoch": 5.233271945979128, + "grad_norm": 0.2303200513124466, + "learning_rate": 4.865810997680879e-05, + "loss": 1.7517, + "step": 17050 + }, + { + "epoch": 5.2335788827501535, + "grad_norm": 0.21193410456180573, + "learning_rate": 4.8653141213977066e-05, + "loss": 1.7478, + "step": 17051 + }, + { + "epoch": 5.233885819521179, + "grad_norm": 0.18498395383358002, + "learning_rate": 4.864817246445577e-05, + "loss": 1.6891, + "step": 17052 + }, + { + "epoch": 5.234192756292204, + "grad_norm": 0.22879233956336975, + "learning_rate": 4.8643203728294036e-05, + "loss": 1.7166, + "step": 17053 + }, + { + "epoch": 5.234499693063229, + "grad_norm": 0.2128525823354721, + "learning_rate": 4.8638235005540944e-05, + "loss": 1.7993, + "step": 17054 + }, + { + "epoch": 5.234806629834254, + "grad_norm": 0.21245025098323822, + "learning_rate": 4.8633266296245634e-05, + "loss": 1.7436, + "step": 17055 + }, + { + "epoch": 5.235113566605279, + "grad_norm": 0.20301629602909088, + "learning_rate": 4.8628297600457165e-05, + "loss": 1.7774, + "step": 17056 + }, + { + "epoch": 5.235420503376305, + "grad_norm": 0.23251961171627045, + "learning_rate": 4.8623328918224687e-05, + "loss": 1.7897, + "step": 17057 + }, + { + "epoch": 5.23572744014733, + "grad_norm": 0.2272956669330597, + "learning_rate": 4.861836024959726e-05, + "loss": 1.7668, + "step": 17058 + }, + { + "epoch": 5.236034376918354, + "grad_norm": 0.20540569722652435, + "learning_rate": 4.8613391594624013e-05, + "loss": 1.7549, + "step": 17059 + }, + { + "epoch": 5.23634131368938, + "grad_norm": 0.20306967198848724, + "learning_rate": 4.8608422953354034e-05, + "loss": 1.6993, + "step": 17060 + }, + { + "epoch": 5.236648250460405, + "grad_norm": 0.19415293633937836, + "learning_rate": 4.8603454325836455e-05, + "loss": 1.7313, + "step": 17061 + }, + { + "epoch": 5.23695518723143, + "grad_norm": 0.2058337777853012, + "learning_rate": 
4.859848571212034e-05, + "loss": 1.7994, + "step": 17062 + }, + { + "epoch": 5.237262124002456, + "grad_norm": 0.24489709734916687, + "learning_rate": 4.859351711225483e-05, + "loss": 1.7555, + "step": 17063 + }, + { + "epoch": 5.237569060773481, + "grad_norm": 0.22589795291423798, + "learning_rate": 4.858854852628899e-05, + "loss": 1.7136, + "step": 17064 + }, + { + "epoch": 5.2378759975445055, + "grad_norm": 0.21404492855072021, + "learning_rate": 4.858357995427195e-05, + "loss": 1.7598, + "step": 17065 + }, + { + "epoch": 5.238182934315531, + "grad_norm": 0.24936965107917786, + "learning_rate": 4.8578611396252786e-05, + "loss": 1.8027, + "step": 17066 + }, + { + "epoch": 5.238489871086556, + "grad_norm": 0.23391515016555786, + "learning_rate": 4.857364285228065e-05, + "loss": 1.7704, + "step": 17067 + }, + { + "epoch": 5.2387968078575815, + "grad_norm": 0.22633357346057892, + "learning_rate": 4.85686743224046e-05, + "loss": 1.7075, + "step": 17068 + }, + { + "epoch": 5.239103744628607, + "grad_norm": 0.221492201089859, + "learning_rate": 4.8563705806673736e-05, + "loss": 1.7755, + "step": 17069 + }, + { + "epoch": 5.239410681399631, + "grad_norm": 0.2381046712398529, + "learning_rate": 4.855873730513719e-05, + "loss": 1.7971, + "step": 17070 + }, + { + "epoch": 5.239717618170657, + "grad_norm": 0.21930988132953644, + "learning_rate": 4.855376881784402e-05, + "loss": 1.7295, + "step": 17071 + }, + { + "epoch": 5.240024554941682, + "grad_norm": 0.20897921919822693, + "learning_rate": 4.854880034484339e-05, + "loss": 1.7796, + "step": 17072 + }, + { + "epoch": 5.240331491712707, + "grad_norm": 0.26616254448890686, + "learning_rate": 4.8543831886184334e-05, + "loss": 1.7095, + "step": 17073 + }, + { + "epoch": 5.240638428483733, + "grad_norm": 0.19513870775699615, + "learning_rate": 4.853886344191601e-05, + "loss": 1.7181, + "step": 17074 + }, + { + "epoch": 5.240945365254758, + "grad_norm": 0.23476530611515045, + "learning_rate": 4.853389501208747e-05, + "loss": 
1.7928, + "step": 17075 + }, + { + "epoch": 5.241252302025782, + "grad_norm": 0.18197014927864075, + "learning_rate": 4.852892659674785e-05, + "loss": 1.6888, + "step": 17076 + }, + { + "epoch": 5.241559238796808, + "grad_norm": 0.20317208766937256, + "learning_rate": 4.852395819594623e-05, + "loss": 1.7828, + "step": 17077 + }, + { + "epoch": 5.241866175567833, + "grad_norm": 0.1953772008419037, + "learning_rate": 4.851898980973175e-05, + "loss": 1.7394, + "step": 17078 + }, + { + "epoch": 5.242173112338858, + "grad_norm": 0.19714407622814178, + "learning_rate": 4.851402143815345e-05, + "loss": 1.7261, + "step": 17079 + }, + { + "epoch": 5.242480049109884, + "grad_norm": 0.2196008861064911, + "learning_rate": 4.850905308126048e-05, + "loss": 1.7387, + "step": 17080 + }, + { + "epoch": 5.242786985880908, + "grad_norm": 0.2337818443775177, + "learning_rate": 4.85040847391019e-05, + "loss": 1.7448, + "step": 17081 + }, + { + "epoch": 5.2430939226519335, + "grad_norm": 0.20940040051937103, + "learning_rate": 4.849911641172685e-05, + "loss": 1.7354, + "step": 17082 + }, + { + "epoch": 5.243400859422959, + "grad_norm": 0.2242170125246048, + "learning_rate": 4.849414809918439e-05, + "loss": 1.7325, + "step": 17083 + }, + { + "epoch": 5.243707796193984, + "grad_norm": 0.2322687953710556, + "learning_rate": 4.8489179801523675e-05, + "loss": 1.7557, + "step": 17084 + }, + { + "epoch": 5.2440147329650095, + "grad_norm": 0.20303767919540405, + "learning_rate": 4.8484211518793764e-05, + "loss": 1.7063, + "step": 17085 + }, + { + "epoch": 5.244321669736034, + "grad_norm": 0.2446853369474411, + "learning_rate": 4.8479243251043746e-05, + "loss": 1.7587, + "step": 17086 + }, + { + "epoch": 5.244628606507059, + "grad_norm": 0.22901636362075806, + "learning_rate": 4.8474274998322735e-05, + "loss": 1.7992, + "step": 17087 + }, + { + "epoch": 5.244935543278085, + "grad_norm": 0.29676303267478943, + "learning_rate": 4.846930676067984e-05, + "loss": 1.7688, + "step": 17088 + }, + { + 
"epoch": 5.24524248004911, + "grad_norm": 0.24160240590572357, + "learning_rate": 4.846433853816416e-05, + "loss": 1.7367, + "step": 17089 + }, + { + "epoch": 5.245549416820135, + "grad_norm": 0.2097402662038803, + "learning_rate": 4.8459370330824774e-05, + "loss": 1.721, + "step": 17090 + }, + { + "epoch": 5.245856353591161, + "grad_norm": 0.26451143622398376, + "learning_rate": 4.8454402138710814e-05, + "loss": 1.7707, + "step": 17091 + }, + { + "epoch": 5.246163290362185, + "grad_norm": 0.30428358912467957, + "learning_rate": 4.844943396187133e-05, + "loss": 1.7232, + "step": 17092 + }, + { + "epoch": 5.24647022713321, + "grad_norm": 0.24332918226718903, + "learning_rate": 4.8444465800355466e-05, + "loss": 1.8215, + "step": 17093 + }, + { + "epoch": 5.246777163904236, + "grad_norm": 0.292703777551651, + "learning_rate": 4.843949765421229e-05, + "loss": 1.7199, + "step": 17094 + }, + { + "epoch": 5.247084100675261, + "grad_norm": 0.2458789199590683, + "learning_rate": 4.843452952349094e-05, + "loss": 1.7615, + "step": 17095 + }, + { + "epoch": 5.247391037446286, + "grad_norm": 0.22538037598133087, + "learning_rate": 4.842956140824045e-05, + "loss": 1.7279, + "step": 17096 + }, + { + "epoch": 5.247697974217311, + "grad_norm": 0.2959176003932953, + "learning_rate": 4.842459330850999e-05, + "loss": 1.767, + "step": 17097 + }, + { + "epoch": 5.248004910988336, + "grad_norm": 0.26158571243286133, + "learning_rate": 4.84196252243486e-05, + "loss": 1.7387, + "step": 17098 + }, + { + "epoch": 5.2483118477593615, + "grad_norm": 0.22855687141418457, + "learning_rate": 4.84146571558054e-05, + "loss": 1.7497, + "step": 17099 + }, + { + "epoch": 5.248618784530387, + "grad_norm": 0.22470593452453613, + "learning_rate": 4.840968910292949e-05, + "loss": 1.7705, + "step": 17100 + }, + { + "epoch": 5.248925721301412, + "grad_norm": 0.24680538475513458, + "learning_rate": 4.840472106576998e-05, + "loss": 1.7426, + "step": 17101 + }, + { + "epoch": 5.249232658072437, + "grad_norm": 
0.23919185996055603, + "learning_rate": 4.839975304437594e-05, + "loss": 1.78, + "step": 17102 + }, + { + "epoch": 5.249539594843462, + "grad_norm": 0.24717366695404053, + "learning_rate": 4.839478503879647e-05, + "loss": 1.7373, + "step": 17103 + }, + { + "epoch": 5.249846531614487, + "grad_norm": 0.20463785529136658, + "learning_rate": 4.838981704908068e-05, + "loss": 1.702, + "step": 17104 + }, + { + "epoch": 5.250153468385513, + "grad_norm": 0.19791419804096222, + "learning_rate": 4.838484907527766e-05, + "loss": 1.746, + "step": 17105 + }, + { + "epoch": 5.250460405156538, + "grad_norm": 0.26169353723526, + "learning_rate": 4.837988111743652e-05, + "loss": 1.7227, + "step": 17106 + }, + { + "epoch": 5.250767341927563, + "grad_norm": 0.23545648157596588, + "learning_rate": 4.837491317560633e-05, + "loss": 1.7104, + "step": 17107 + }, + { + "epoch": 5.251074278698588, + "grad_norm": 0.21569804847240448, + "learning_rate": 4.836994524983622e-05, + "loss": 1.7883, + "step": 17108 + }, + { + "epoch": 5.251381215469613, + "grad_norm": 0.2730300724506378, + "learning_rate": 4.836497734017524e-05, + "loss": 1.7105, + "step": 17109 + }, + { + "epoch": 5.2516881522406385, + "grad_norm": 0.2834697663784027, + "learning_rate": 4.836000944667253e-05, + "loss": 1.8041, + "step": 17110 + }, + { + "epoch": 5.251995089011664, + "grad_norm": 0.31536951661109924, + "learning_rate": 4.835504156937715e-05, + "loss": 1.7708, + "step": 17111 + }, + { + "epoch": 5.252302025782689, + "grad_norm": 0.3830285668373108, + "learning_rate": 4.835007370833824e-05, + "loss": 1.7464, + "step": 17112 + }, + { + "epoch": 5.252608962553714, + "grad_norm": 0.23248349130153656, + "learning_rate": 4.834510586360485e-05, + "loss": 1.7274, + "step": 17113 + }, + { + "epoch": 5.252915899324739, + "grad_norm": 0.4755091071128845, + "learning_rate": 4.834013803522611e-05, + "loss": 1.7853, + "step": 17114 + }, + { + "epoch": 5.253222836095764, + "grad_norm": 0.4267823398113251, + "learning_rate": 
4.8335170223251073e-05, + "loss": 1.7424, + "step": 17115 + }, + { + "epoch": 5.25352977286679, + "grad_norm": 0.17621731758117676, + "learning_rate": 4.8330202427728876e-05, + "loss": 1.7415, + "step": 17116 + }, + { + "epoch": 5.253836709637815, + "grad_norm": 0.37484630942344666, + "learning_rate": 4.832523464870859e-05, + "loss": 1.7357, + "step": 17117 + }, + { + "epoch": 5.25414364640884, + "grad_norm": 0.27773791551589966, + "learning_rate": 4.832026688623933e-05, + "loss": 1.717, + "step": 17118 + }, + { + "epoch": 5.254450583179865, + "grad_norm": 0.31190845370292664, + "learning_rate": 4.8315299140370183e-05, + "loss": 1.7226, + "step": 17119 + }, + { + "epoch": 5.25475751995089, + "grad_norm": 0.4321303367614746, + "learning_rate": 4.8310331411150215e-05, + "loss": 1.8003, + "step": 17120 + }, + { + "epoch": 5.255064456721915, + "grad_norm": 0.31622835993766785, + "learning_rate": 4.830536369862855e-05, + "loss": 1.8462, + "step": 17121 + }, + { + "epoch": 5.255371393492941, + "grad_norm": 0.2144850194454193, + "learning_rate": 4.830039600285427e-05, + "loss": 1.8153, + "step": 17122 + }, + { + "epoch": 5.255678330263966, + "grad_norm": 0.3107511103153229, + "learning_rate": 4.829542832387649e-05, + "loss": 1.7271, + "step": 17123 + }, + { + "epoch": 5.2559852670349905, + "grad_norm": 0.24607159197330475, + "learning_rate": 4.8290460661744265e-05, + "loss": 1.7946, + "step": 17124 + }, + { + "epoch": 5.256292203806016, + "grad_norm": 0.226362943649292, + "learning_rate": 4.828549301650673e-05, + "loss": 1.7338, + "step": 17125 + }, + { + "epoch": 5.256599140577041, + "grad_norm": 0.29993724822998047, + "learning_rate": 4.828052538821294e-05, + "loss": 1.8, + "step": 17126 + }, + { + "epoch": 5.2569060773480665, + "grad_norm": 0.25639984011650085, + "learning_rate": 4.8275557776912014e-05, + "loss": 1.8009, + "step": 17127 + }, + { + "epoch": 5.257213014119092, + "grad_norm": 0.2308105081319809, + "learning_rate": 4.8270590182653024e-05, + "loss": 1.7468, 
+ "step": 17128 + }, + { + "epoch": 5.257519950890116, + "grad_norm": 0.27337542176246643, + "learning_rate": 4.82656226054851e-05, + "loss": 1.7725, + "step": 17129 + }, + { + "epoch": 5.257826887661142, + "grad_norm": 0.24848094582557678, + "learning_rate": 4.826065504545729e-05, + "loss": 1.8084, + "step": 17130 + }, + { + "epoch": 5.258133824432167, + "grad_norm": 0.35026392340660095, + "learning_rate": 4.825568750261872e-05, + "loss": 1.7705, + "step": 17131 + }, + { + "epoch": 5.258440761203192, + "grad_norm": 0.3207968473434448, + "learning_rate": 4.825071997701846e-05, + "loss": 1.7329, + "step": 17132 + }, + { + "epoch": 5.258747697974218, + "grad_norm": 0.20949263870716095, + "learning_rate": 4.8245752468705614e-05, + "loss": 1.7658, + "step": 17133 + }, + { + "epoch": 5.259054634745242, + "grad_norm": 0.3158881366252899, + "learning_rate": 4.824078497772926e-05, + "loss": 1.7249, + "step": 17134 + }, + { + "epoch": 5.259361571516267, + "grad_norm": 0.2283414602279663, + "learning_rate": 4.823581750413852e-05, + "loss": 1.7177, + "step": 17135 + }, + { + "epoch": 5.259668508287293, + "grad_norm": 0.24753578007221222, + "learning_rate": 4.823085004798247e-05, + "loss": 1.7232, + "step": 17136 + }, + { + "epoch": 5.259975445058318, + "grad_norm": 0.20381587743759155, + "learning_rate": 4.822588260931017e-05, + "loss": 1.7049, + "step": 17137 + }, + { + "epoch": 5.260282381829343, + "grad_norm": 0.21220643818378448, + "learning_rate": 4.8220915188170746e-05, + "loss": 1.7221, + "step": 17138 + }, + { + "epoch": 5.260589318600369, + "grad_norm": 0.19324758648872375, + "learning_rate": 4.8215947784613276e-05, + "loss": 1.7168, + "step": 17139 + }, + { + "epoch": 5.260896255371393, + "grad_norm": 0.26500338315963745, + "learning_rate": 4.821098039868688e-05, + "loss": 1.7627, + "step": 17140 + }, + { + "epoch": 5.2612031921424185, + "grad_norm": 0.19597655534744263, + "learning_rate": 4.82060130304406e-05, + "loss": 1.7214, + "step": 17141 + }, + { + "epoch": 
5.261510128913444, + "grad_norm": 0.2105483114719391, + "learning_rate": 4.820104567992357e-05, + "loss": 1.6742, + "step": 17142 + }, + { + "epoch": 5.261817065684469, + "grad_norm": 0.20020028948783875, + "learning_rate": 4.8196078347184837e-05, + "loss": 1.7721, + "step": 17143 + }, + { + "epoch": 5.2621240024554945, + "grad_norm": 0.2313549965620041, + "learning_rate": 4.819111103227353e-05, + "loss": 1.7644, + "step": 17144 + }, + { + "epoch": 5.262430939226519, + "grad_norm": 0.31893789768218994, + "learning_rate": 4.818614373523871e-05, + "loss": 1.747, + "step": 17145 + }, + { + "epoch": 5.262737875997544, + "grad_norm": 0.2531197667121887, + "learning_rate": 4.8181176456129505e-05, + "loss": 1.7713, + "step": 17146 + }, + { + "epoch": 5.26304481276857, + "grad_norm": 0.2063976377248764, + "learning_rate": 4.817620919499496e-05, + "loss": 1.7254, + "step": 17147 + }, + { + "epoch": 5.263351749539595, + "grad_norm": 0.22220590710639954, + "learning_rate": 4.8171241951884204e-05, + "loss": 1.7345, + "step": 17148 + }, + { + "epoch": 5.26365868631062, + "grad_norm": 0.24240384995937347, + "learning_rate": 4.8166274726846286e-05, + "loss": 1.7302, + "step": 17149 + }, + { + "epoch": 5.263965623081646, + "grad_norm": 0.215829998254776, + "learning_rate": 4.8161307519930326e-05, + "loss": 1.7725, + "step": 17150 + }, + { + "epoch": 5.26427255985267, + "grad_norm": 0.2697906494140625, + "learning_rate": 4.815634033118541e-05, + "loss": 1.7156, + "step": 17151 + }, + { + "epoch": 5.264579496623695, + "grad_norm": 0.21649456024169922, + "learning_rate": 4.815137316066061e-05, + "loss": 1.745, + "step": 17152 + }, + { + "epoch": 5.264886433394721, + "grad_norm": 0.22773787379264832, + "learning_rate": 4.8146406008405033e-05, + "loss": 1.7592, + "step": 17153 + }, + { + "epoch": 5.265193370165746, + "grad_norm": 0.2920280396938324, + "learning_rate": 4.8141438874467745e-05, + "loss": 1.8301, + "step": 17154 + }, + { + "epoch": 5.265500306936771, + "grad_norm": 
0.23919162154197693, + "learning_rate": 4.813647175889785e-05, + "loss": 1.7687, + "step": 17155 + }, + { + "epoch": 5.265807243707796, + "grad_norm": 0.24617896974086761, + "learning_rate": 4.8131504661744425e-05, + "loss": 1.8279, + "step": 17156 + }, + { + "epoch": 5.266114180478821, + "grad_norm": 0.22756172716617584, + "learning_rate": 4.812653758305659e-05, + "loss": 1.7595, + "step": 17157 + }, + { + "epoch": 5.2664211172498465, + "grad_norm": 0.22939376533031464, + "learning_rate": 4.812157052288339e-05, + "loss": 1.7445, + "step": 17158 + }, + { + "epoch": 5.266728054020872, + "grad_norm": 0.21021319925785065, + "learning_rate": 4.811660348127395e-05, + "loss": 1.7875, + "step": 17159 + }, + { + "epoch": 5.267034990791897, + "grad_norm": 0.2271810919046402, + "learning_rate": 4.811163645827732e-05, + "loss": 1.74, + "step": 17160 + }, + { + "epoch": 5.267341927562922, + "grad_norm": 0.238374263048172, + "learning_rate": 4.81066694539426e-05, + "loss": 1.7717, + "step": 17161 + }, + { + "epoch": 5.267648864333947, + "grad_norm": 0.20655091106891632, + "learning_rate": 4.8101702468318885e-05, + "loss": 1.7447, + "step": 17162 + }, + { + "epoch": 5.267955801104972, + "grad_norm": 0.24652259051799774, + "learning_rate": 4.809673550145528e-05, + "loss": 1.7755, + "step": 17163 + }, + { + "epoch": 5.268262737875998, + "grad_norm": 0.20256781578063965, + "learning_rate": 4.809176855340083e-05, + "loss": 1.7689, + "step": 17164 + }, + { + "epoch": 5.268569674647023, + "grad_norm": 0.27023112773895264, + "learning_rate": 4.8086801624204665e-05, + "loss": 1.8364, + "step": 17165 + }, + { + "epoch": 5.268876611418047, + "grad_norm": 0.251638799905777, + "learning_rate": 4.808183471391582e-05, + "loss": 1.7924, + "step": 17166 + }, + { + "epoch": 5.269183548189073, + "grad_norm": 0.22897782921791077, + "learning_rate": 4.807686782258342e-05, + "loss": 1.7378, + "step": 17167 + }, + { + "epoch": 5.269490484960098, + "grad_norm": 0.19141456484794617, + "learning_rate": 
4.807190095025655e-05, + "loss": 1.6911, + "step": 17168 + }, + { + "epoch": 5.269797421731123, + "grad_norm": 0.19960568845272064, + "learning_rate": 4.806693409698427e-05, + "loss": 1.71, + "step": 17169 + }, + { + "epoch": 5.270104358502149, + "grad_norm": 0.23332087695598602, + "learning_rate": 4.8061967262815694e-05, + "loss": 1.7993, + "step": 17170 + }, + { + "epoch": 5.270411295273174, + "grad_norm": 0.24831432104110718, + "learning_rate": 4.8057000447799876e-05, + "loss": 1.7459, + "step": 17171 + }, + { + "epoch": 5.2707182320441985, + "grad_norm": 0.24735838174819946, + "learning_rate": 4.805203365198593e-05, + "loss": 1.7751, + "step": 17172 + }, + { + "epoch": 5.271025168815224, + "grad_norm": 0.32630103826522827, + "learning_rate": 4.804706687542291e-05, + "loss": 1.7885, + "step": 17173 + }, + { + "epoch": 5.271332105586249, + "grad_norm": 0.29055842757225037, + "learning_rate": 4.804210011815995e-05, + "loss": 1.6819, + "step": 17174 + }, + { + "epoch": 5.2716390423572745, + "grad_norm": 0.22968806326389313, + "learning_rate": 4.803713338024608e-05, + "loss": 1.8146, + "step": 17175 + }, + { + "epoch": 5.2719459791283, + "grad_norm": 0.23430144786834717, + "learning_rate": 4.8032166661730434e-05, + "loss": 1.7401, + "step": 17176 + }, + { + "epoch": 5.272252915899324, + "grad_norm": 0.26312723755836487, + "learning_rate": 4.802719996266204e-05, + "loss": 1.8319, + "step": 17177 + }, + { + "epoch": 5.27255985267035, + "grad_norm": 0.23715369403362274, + "learning_rate": 4.802223328309003e-05, + "loss": 1.8014, + "step": 17178 + }, + { + "epoch": 5.272866789441375, + "grad_norm": 0.23943877220153809, + "learning_rate": 4.801726662306347e-05, + "loss": 1.7181, + "step": 17179 + }, + { + "epoch": 5.2731737262124, + "grad_norm": 0.2366543412208557, + "learning_rate": 4.8012299982631435e-05, + "loss": 1.6685, + "step": 17180 + }, + { + "epoch": 5.273480662983426, + "grad_norm": 0.20688587427139282, + "learning_rate": 4.8007333361843016e-05, + "loss": 
1.7089, + "step": 17181 + }, + { + "epoch": 5.273787599754451, + "grad_norm": 0.2069951444864273, + "learning_rate": 4.8002366760747314e-05, + "loss": 1.7447, + "step": 17182 + }, + { + "epoch": 5.274094536525475, + "grad_norm": 0.26072344183921814, + "learning_rate": 4.7997400179393374e-05, + "loss": 1.7346, + "step": 17183 + }, + { + "epoch": 5.274401473296501, + "grad_norm": 0.2397938072681427, + "learning_rate": 4.799243361783031e-05, + "loss": 1.7556, + "step": 17184 + }, + { + "epoch": 5.274708410067526, + "grad_norm": 0.23606348037719727, + "learning_rate": 4.798746707610721e-05, + "loss": 1.732, + "step": 17185 + }, + { + "epoch": 5.2750153468385514, + "grad_norm": 0.21078252792358398, + "learning_rate": 4.798250055427311e-05, + "loss": 1.7571, + "step": 17186 + }, + { + "epoch": 5.275322283609577, + "grad_norm": 0.21331414580345154, + "learning_rate": 4.797753405237714e-05, + "loss": 1.732, + "step": 17187 + }, + { + "epoch": 5.275629220380601, + "grad_norm": 0.23700307309627533, + "learning_rate": 4.7972567570468354e-05, + "loss": 1.7354, + "step": 17188 + }, + { + "epoch": 5.275936157151627, + "grad_norm": 0.20519722998142242, + "learning_rate": 4.7967601108595845e-05, + "loss": 1.7435, + "step": 17189 + }, + { + "epoch": 5.276243093922652, + "grad_norm": 0.22358302772045135, + "learning_rate": 4.79626346668087e-05, + "loss": 1.7891, + "step": 17190 + }, + { + "epoch": 5.276550030693677, + "grad_norm": 0.2434413880109787, + "learning_rate": 4.795766824515598e-05, + "loss": 1.814, + "step": 17191 + }, + { + "epoch": 5.276856967464703, + "grad_norm": 0.2198423594236374, + "learning_rate": 4.795270184368678e-05, + "loss": 1.7212, + "step": 17192 + }, + { + "epoch": 5.277163904235728, + "grad_norm": 0.23587806522846222, + "learning_rate": 4.7947735462450205e-05, + "loss": 1.8337, + "step": 17193 + }, + { + "epoch": 5.277470841006752, + "grad_norm": 0.234666645526886, + "learning_rate": 4.794276910149528e-05, + "loss": 1.7548, + "step": 17194 + }, + { + 
"epoch": 5.277777777777778, + "grad_norm": 0.23363247513771057, + "learning_rate": 4.793780276087115e-05, + "loss": 1.7587, + "step": 17195 + }, + { + "epoch": 5.278084714548803, + "grad_norm": 0.23191119730472565, + "learning_rate": 4.793283644062683e-05, + "loss": 1.7691, + "step": 17196 + }, + { + "epoch": 5.278391651319828, + "grad_norm": 0.2363097071647644, + "learning_rate": 4.7927870140811445e-05, + "loss": 1.8139, + "step": 17197 + }, + { + "epoch": 5.278698588090854, + "grad_norm": 0.2852413058280945, + "learning_rate": 4.7922903861474056e-05, + "loss": 1.7905, + "step": 17198 + }, + { + "epoch": 5.279005524861878, + "grad_norm": 0.23633842170238495, + "learning_rate": 4.7917937602663764e-05, + "loss": 1.8014, + "step": 17199 + }, + { + "epoch": 5.2793124616329035, + "grad_norm": 0.27007919549942017, + "learning_rate": 4.791297136442961e-05, + "loss": 1.7242, + "step": 17200 + }, + { + "epoch": 5.279619398403929, + "grad_norm": 0.29482147097587585, + "learning_rate": 4.790800514682072e-05, + "loss": 1.7154, + "step": 17201 + }, + { + "epoch": 5.279926335174954, + "grad_norm": 0.27772340178489685, + "learning_rate": 4.790303894988614e-05, + "loss": 1.7771, + "step": 17202 + }, + { + "epoch": 5.2802332719459795, + "grad_norm": 0.21761848032474518, + "learning_rate": 4.789807277367495e-05, + "loss": 1.6983, + "step": 17203 + }, + { + "epoch": 5.280540208717004, + "grad_norm": 0.22621290385723114, + "learning_rate": 4.789310661823626e-05, + "loss": 1.7667, + "step": 17204 + }, + { + "epoch": 5.280847145488029, + "grad_norm": 0.2284683883190155, + "learning_rate": 4.7888140483619095e-05, + "loss": 1.7419, + "step": 17205 + }, + { + "epoch": 5.281154082259055, + "grad_norm": 0.20145639777183533, + "learning_rate": 4.788317436987259e-05, + "loss": 1.7068, + "step": 17206 + }, + { + "epoch": 5.28146101903008, + "grad_norm": 0.23146072030067444, + "learning_rate": 4.7878208277045775e-05, + "loss": 1.7195, + "step": 17207 + }, + { + "epoch": 5.281767955801105, + 
"grad_norm": 0.24014149606227875, + "learning_rate": 4.787324220518776e-05, + "loss": 1.8148, + "step": 17208 + }, + { + "epoch": 5.28207489257213, + "grad_norm": 0.21067874133586884, + "learning_rate": 4.7868276154347595e-05, + "loss": 1.7754, + "step": 17209 + }, + { + "epoch": 5.282381829343155, + "grad_norm": 0.2313496321439743, + "learning_rate": 4.786331012457441e-05, + "loss": 1.7693, + "step": 17210 + }, + { + "epoch": 5.28268876611418, + "grad_norm": 0.24190983176231384, + "learning_rate": 4.7858344115917214e-05, + "loss": 1.7342, + "step": 17211 + }, + { + "epoch": 5.282995702885206, + "grad_norm": 0.24541905522346497, + "learning_rate": 4.785337812842514e-05, + "loss": 1.7721, + "step": 17212 + }, + { + "epoch": 5.283302639656231, + "grad_norm": 0.21989032626152039, + "learning_rate": 4.784841216214722e-05, + "loss": 1.7522, + "step": 17213 + }, + { + "epoch": 5.283609576427256, + "grad_norm": 0.20637241005897522, + "learning_rate": 4.784344621713256e-05, + "loss": 1.7418, + "step": 17214 + }, + { + "epoch": 5.283916513198281, + "grad_norm": 0.22538220882415771, + "learning_rate": 4.783848029343023e-05, + "loss": 1.8287, + "step": 17215 + }, + { + "epoch": 5.284223449969306, + "grad_norm": 0.24478071928024292, + "learning_rate": 4.7833514391089315e-05, + "loss": 1.7419, + "step": 17216 + }, + { + "epoch": 5.2845303867403315, + "grad_norm": 0.22707650065422058, + "learning_rate": 4.782854851015886e-05, + "loss": 1.7831, + "step": 17217 + }, + { + "epoch": 5.284837323511357, + "grad_norm": 0.2843529284000397, + "learning_rate": 4.7823582650687984e-05, + "loss": 1.7704, + "step": 17218 + }, + { + "epoch": 5.285144260282382, + "grad_norm": 0.21647678315639496, + "learning_rate": 4.781861681272573e-05, + "loss": 1.7514, + "step": 17219 + }, + { + "epoch": 5.285451197053407, + "grad_norm": 0.2279205620288849, + "learning_rate": 4.781365099632117e-05, + "loss": 1.6803, + "step": 17220 + }, + { + "epoch": 5.285758133824432, + "grad_norm": 0.2287401556968689, + 
"learning_rate": 4.7808685201523417e-05, + "loss": 1.7278, + "step": 17221 + }, + { + "epoch": 5.286065070595457, + "grad_norm": 0.2103174477815628, + "learning_rate": 4.78037194283815e-05, + "loss": 1.7667, + "step": 17222 + }, + { + "epoch": 5.286372007366483, + "grad_norm": 0.24339279532432556, + "learning_rate": 4.7798753676944536e-05, + "loss": 1.7828, + "step": 17223 + }, + { + "epoch": 5.286678944137508, + "grad_norm": 0.2343035340309143, + "learning_rate": 4.779378794726156e-05, + "loss": 1.7277, + "step": 17224 + }, + { + "epoch": 5.286985880908533, + "grad_norm": 0.22456331551074982, + "learning_rate": 4.778882223938167e-05, + "loss": 1.756, + "step": 17225 + }, + { + "epoch": 5.287292817679558, + "grad_norm": 0.2211158126592636, + "learning_rate": 4.778385655335392e-05, + "loss": 1.7733, + "step": 17226 + }, + { + "epoch": 5.287599754450583, + "grad_norm": 0.2731948792934418, + "learning_rate": 4.777889088922743e-05, + "loss": 1.787, + "step": 17227 + }, + { + "epoch": 5.287906691221608, + "grad_norm": 0.19578024744987488, + "learning_rate": 4.7773925247051215e-05, + "loss": 1.7474, + "step": 17228 + }, + { + "epoch": 5.288213627992634, + "grad_norm": 0.277332067489624, + "learning_rate": 4.77689596268744e-05, + "loss": 1.7432, + "step": 17229 + }, + { + "epoch": 5.288520564763659, + "grad_norm": 0.2979765832424164, + "learning_rate": 4.7763994028746003e-05, + "loss": 1.8198, + "step": 17230 + }, + { + "epoch": 5.2888275015346835, + "grad_norm": 0.23176288604736328, + "learning_rate": 4.775902845271515e-05, + "loss": 1.7317, + "step": 17231 + }, + { + "epoch": 5.289134438305709, + "grad_norm": 0.35821911692619324, + "learning_rate": 4.7754062898830876e-05, + "loss": 1.7287, + "step": 17232 + }, + { + "epoch": 5.289441375076734, + "grad_norm": 0.2881525158882141, + "learning_rate": 4.7749097367142296e-05, + "loss": 1.7391, + "step": 17233 + }, + { + "epoch": 5.2897483118477595, + "grad_norm": 0.22021767497062683, + "learning_rate": 4.774413185769842e-05, 
+ "loss": 1.7462, + "step": 17234 + }, + { + "epoch": 5.290055248618785, + "grad_norm": 0.3286842703819275, + "learning_rate": 4.7739166370548385e-05, + "loss": 1.7749, + "step": 17235 + }, + { + "epoch": 5.290362185389809, + "grad_norm": 0.3298519253730774, + "learning_rate": 4.773420090574122e-05, + "loss": 1.7548, + "step": 17236 + }, + { + "epoch": 5.290669122160835, + "grad_norm": 0.20910575985908508, + "learning_rate": 4.7729235463326005e-05, + "loss": 1.7308, + "step": 17237 + }, + { + "epoch": 5.29097605893186, + "grad_norm": 0.3324633240699768, + "learning_rate": 4.7724270043351835e-05, + "loss": 1.7328, + "step": 17238 + }, + { + "epoch": 5.291282995702885, + "grad_norm": 0.21235628426074982, + "learning_rate": 4.771930464586774e-05, + "loss": 1.7186, + "step": 17239 + }, + { + "epoch": 5.291589932473911, + "grad_norm": 0.2971087694168091, + "learning_rate": 4.771433927092283e-05, + "loss": 1.7947, + "step": 17240 + }, + { + "epoch": 5.291896869244935, + "grad_norm": 0.3637695908546448, + "learning_rate": 4.770937391856614e-05, + "loss": 1.7753, + "step": 17241 + }, + { + "epoch": 5.29220380601596, + "grad_norm": 0.2503713369369507, + "learning_rate": 4.770440858884678e-05, + "loss": 1.684, + "step": 17242 + }, + { + "epoch": 5.292510742786986, + "grad_norm": 0.25510790944099426, + "learning_rate": 4.7699443281813774e-05, + "loss": 1.7517, + "step": 17243 + }, + { + "epoch": 5.292817679558011, + "grad_norm": 0.3189590871334076, + "learning_rate": 4.7694477997516244e-05, + "loss": 1.7488, + "step": 17244 + }, + { + "epoch": 5.293124616329036, + "grad_norm": 0.2807229161262512, + "learning_rate": 4.7689512736003215e-05, + "loss": 1.7962, + "step": 17245 + }, + { + "epoch": 5.293431553100062, + "grad_norm": 0.2166406810283661, + "learning_rate": 4.76845474973238e-05, + "loss": 1.7423, + "step": 17246 + }, + { + "epoch": 5.293738489871086, + "grad_norm": 0.29000815749168396, + "learning_rate": 4.767958228152702e-05, + "loss": 1.7508, + "step": 17247 + }, + { 
+ "epoch": 5.2940454266421115, + "grad_norm": 0.19301612675189972, + "learning_rate": 4.767461708866198e-05, + "loss": 1.7223, + "step": 17248 + }, + { + "epoch": 5.294352363413137, + "grad_norm": 0.2828899323940277, + "learning_rate": 4.766965191877772e-05, + "loss": 1.8139, + "step": 17249 + }, + { + "epoch": 5.294659300184162, + "grad_norm": 0.32610374689102173, + "learning_rate": 4.766468677192335e-05, + "loss": 1.7744, + "step": 17250 + }, + { + "epoch": 5.2949662369551875, + "grad_norm": 0.2175719439983368, + "learning_rate": 4.7659721648147895e-05, + "loss": 1.7345, + "step": 17251 + }, + { + "epoch": 5.295273173726212, + "grad_norm": 0.24777816236019135, + "learning_rate": 4.7654756547500457e-05, + "loss": 1.7382, + "step": 17252 + }, + { + "epoch": 5.295580110497237, + "grad_norm": 0.25927749276161194, + "learning_rate": 4.764979147003008e-05, + "loss": 1.7625, + "step": 17253 + }, + { + "epoch": 5.295887047268263, + "grad_norm": 0.2271798849105835, + "learning_rate": 4.7644826415785834e-05, + "loss": 1.6928, + "step": 17254 + }, + { + "epoch": 5.296193984039288, + "grad_norm": 0.30804958939552307, + "learning_rate": 4.763986138481682e-05, + "loss": 1.743, + "step": 17255 + }, + { + "epoch": 5.296500920810313, + "grad_norm": 0.2247130572795868, + "learning_rate": 4.763489637717205e-05, + "loss": 1.7593, + "step": 17256 + }, + { + "epoch": 5.296807857581339, + "grad_norm": 0.22203052043914795, + "learning_rate": 4.7629931392900645e-05, + "loss": 1.6923, + "step": 17257 + }, + { + "epoch": 5.297114794352363, + "grad_norm": 0.23044714331626892, + "learning_rate": 4.7624966432051624e-05, + "loss": 1.7676, + "step": 17258 + }, + { + "epoch": 5.297421731123388, + "grad_norm": 0.2824070155620575, + "learning_rate": 4.7620001494674096e-05, + "loss": 1.8272, + "step": 17259 + }, + { + "epoch": 5.297728667894414, + "grad_norm": 0.27077800035476685, + "learning_rate": 4.761503658081709e-05, + "loss": 1.8106, + "step": 17260 + }, + { + "epoch": 5.298035604665439, + 
"grad_norm": 0.2333833873271942, + "learning_rate": 4.7610071690529706e-05, + "loss": 1.6841, + "step": 17261 + }, + { + "epoch": 5.298342541436464, + "grad_norm": 0.2542032301425934, + "learning_rate": 4.760510682386098e-05, + "loss": 1.7656, + "step": 17262 + }, + { + "epoch": 5.298649478207489, + "grad_norm": 0.30680081248283386, + "learning_rate": 4.760014198086002e-05, + "loss": 1.7443, + "step": 17263 + }, + { + "epoch": 5.298956414978514, + "grad_norm": 0.21580225229263306, + "learning_rate": 4.759517716157583e-05, + "loss": 1.7907, + "step": 17264 + }, + { + "epoch": 5.2992633517495396, + "grad_norm": 0.2644323408603668, + "learning_rate": 4.7590212366057516e-05, + "loss": 1.6835, + "step": 17265 + }, + { + "epoch": 5.299570288520565, + "grad_norm": 0.23600110411643982, + "learning_rate": 4.758524759435414e-05, + "loss": 1.7481, + "step": 17266 + }, + { + "epoch": 5.29987722529159, + "grad_norm": 0.23825959861278534, + "learning_rate": 4.758028284651477e-05, + "loss": 1.7267, + "step": 17267 + }, + { + "epoch": 5.300184162062616, + "grad_norm": 0.2659476101398468, + "learning_rate": 4.757531812258845e-05, + "loss": 1.7303, + "step": 17268 + }, + { + "epoch": 5.30049109883364, + "grad_norm": 0.30770114064216614, + "learning_rate": 4.757035342262428e-05, + "loss": 1.7636, + "step": 17269 + }, + { + "epoch": 5.300798035604665, + "grad_norm": 0.27921241521835327, + "learning_rate": 4.756538874667129e-05, + "loss": 1.7736, + "step": 17270 + }, + { + "epoch": 5.301104972375691, + "grad_norm": 0.2518016993999481, + "learning_rate": 4.756042409477855e-05, + "loss": 1.7942, + "step": 17271 + }, + { + "epoch": 5.301411909146716, + "grad_norm": 0.2678029537200928, + "learning_rate": 4.755545946699514e-05, + "loss": 1.7179, + "step": 17272 + }, + { + "epoch": 5.301718845917741, + "grad_norm": 0.3082284927368164, + "learning_rate": 4.7550494863370094e-05, + "loss": 1.7282, + "step": 17273 + }, + { + "epoch": 5.302025782688766, + "grad_norm": 0.23269952833652496, + 
"learning_rate": 4.754553028395251e-05, + "loss": 1.755, + "step": 17274 + }, + { + "epoch": 5.302332719459791, + "grad_norm": 0.2273751199245453, + "learning_rate": 4.754056572879142e-05, + "loss": 1.7661, + "step": 17275 + }, + { + "epoch": 5.3026396562308165, + "grad_norm": 0.2175082415342331, + "learning_rate": 4.7535601197935915e-05, + "loss": 1.7034, + "step": 17276 + }, + { + "epoch": 5.302946593001842, + "grad_norm": 0.20551301538944244, + "learning_rate": 4.753063669143503e-05, + "loss": 1.7329, + "step": 17277 + }, + { + "epoch": 5.303253529772867, + "grad_norm": 0.2350638061761856, + "learning_rate": 4.752567220933785e-05, + "loss": 1.8361, + "step": 17278 + }, + { + "epoch": 5.303560466543892, + "grad_norm": 0.20268140733242035, + "learning_rate": 4.752070775169342e-05, + "loss": 1.6736, + "step": 17279 + }, + { + "epoch": 5.303867403314917, + "grad_norm": 0.1891544908285141, + "learning_rate": 4.7515743318550823e-05, + "loss": 1.7241, + "step": 17280 + }, + { + "epoch": 5.304174340085942, + "grad_norm": 0.22900860011577606, + "learning_rate": 4.751077890995909e-05, + "loss": 1.7321, + "step": 17281 + }, + { + "epoch": 5.304481276856968, + "grad_norm": 0.25827866792678833, + "learning_rate": 4.7505814525967304e-05, + "loss": 1.8021, + "step": 17282 + }, + { + "epoch": 5.304788213627993, + "grad_norm": 0.22459273040294647, + "learning_rate": 4.7500850166624514e-05, + "loss": 1.7845, + "step": 17283 + }, + { + "epoch": 5.305095150399017, + "grad_norm": 0.23737964034080505, + "learning_rate": 4.7495885831979816e-05, + "loss": 1.7274, + "step": 17284 + }, + { + "epoch": 5.305402087170043, + "grad_norm": 0.2267502397298813, + "learning_rate": 4.749092152208221e-05, + "loss": 1.7747, + "step": 17285 + }, + { + "epoch": 5.305709023941068, + "grad_norm": 0.31811007857322693, + "learning_rate": 4.748595723698081e-05, + "loss": 1.7852, + "step": 17286 + }, + { + "epoch": 5.306015960712093, + "grad_norm": 0.42865583300590515, + "learning_rate": 
4.7480992976724655e-05, + "loss": 1.7711, + "step": 17287 + }, + { + "epoch": 5.306322897483119, + "grad_norm": 0.3211027979850769, + "learning_rate": 4.747602874136278e-05, + "loss": 1.7813, + "step": 17288 + }, + { + "epoch": 5.306629834254144, + "grad_norm": 0.22552837431430817, + "learning_rate": 4.7471064530944295e-05, + "loss": 1.7407, + "step": 17289 + }, + { + "epoch": 5.3069367710251685, + "grad_norm": 0.3119906485080719, + "learning_rate": 4.746610034551821e-05, + "loss": 1.7255, + "step": 17290 + }, + { + "epoch": 5.307243707796194, + "grad_norm": 0.26405754685401917, + "learning_rate": 4.7461136185133623e-05, + "loss": 1.6945, + "step": 17291 + }, + { + "epoch": 5.307550644567219, + "grad_norm": 0.21759621798992157, + "learning_rate": 4.7456172049839566e-05, + "loss": 1.7319, + "step": 17292 + }, + { + "epoch": 5.3078575813382445, + "grad_norm": 0.26193925738334656, + "learning_rate": 4.745120793968511e-05, + "loss": 1.7508, + "step": 17293 + }, + { + "epoch": 5.30816451810927, + "grad_norm": 0.2549780011177063, + "learning_rate": 4.74462438547193e-05, + "loss": 1.7153, + "step": 17294 + }, + { + "epoch": 5.308471454880294, + "grad_norm": 0.21164020895957947, + "learning_rate": 4.7441279794991235e-05, + "loss": 1.7315, + "step": 17295 + }, + { + "epoch": 5.30877839165132, + "grad_norm": 0.20548345148563385, + "learning_rate": 4.7436315760549914e-05, + "loss": 1.68, + "step": 17296 + }, + { + "epoch": 5.309085328422345, + "grad_norm": 0.23997166752815247, + "learning_rate": 4.7431351751444446e-05, + "loss": 1.8528, + "step": 17297 + }, + { + "epoch": 5.30939226519337, + "grad_norm": 0.2639109194278717, + "learning_rate": 4.7426387767723845e-05, + "loss": 1.8041, + "step": 17298 + }, + { + "epoch": 5.309699201964396, + "grad_norm": 0.2285986840724945, + "learning_rate": 4.7421423809437196e-05, + "loss": 1.8188, + "step": 17299 + }, + { + "epoch": 5.310006138735421, + "grad_norm": 0.22183369100093842, + "learning_rate": 4.741645987663355e-05, + "loss": 
1.7581, + "step": 17300 + }, + { + "epoch": 5.310313075506445, + "grad_norm": 0.22716040909290314, + "learning_rate": 4.741149596936197e-05, + "loss": 1.7438, + "step": 17301 + }, + { + "epoch": 5.310620012277471, + "grad_norm": 0.24641327559947968, + "learning_rate": 4.740653208767148e-05, + "loss": 1.761, + "step": 17302 + }, + { + "epoch": 5.310926949048496, + "grad_norm": 0.28470689058303833, + "learning_rate": 4.7401568231611194e-05, + "loss": 1.7512, + "step": 17303 + }, + { + "epoch": 5.311233885819521, + "grad_norm": 0.23279942572116852, + "learning_rate": 4.739660440123012e-05, + "loss": 1.7797, + "step": 17304 + }, + { + "epoch": 5.311540822590547, + "grad_norm": 0.26397696137428284, + "learning_rate": 4.739164059657731e-05, + "loss": 1.748, + "step": 17305 + }, + { + "epoch": 5.311847759361571, + "grad_norm": 0.25072020292282104, + "learning_rate": 4.7386676817701856e-05, + "loss": 1.7571, + "step": 17306 + }, + { + "epoch": 5.3121546961325965, + "grad_norm": 0.20815810561180115, + "learning_rate": 4.7381713064652774e-05, + "loss": 1.7566, + "step": 17307 + }, + { + "epoch": 5.312461632903622, + "grad_norm": 0.23104289174079895, + "learning_rate": 4.7376749337479174e-05, + "loss": 1.7308, + "step": 17308 + }, + { + "epoch": 5.312768569674647, + "grad_norm": 0.21978867053985596, + "learning_rate": 4.737178563623004e-05, + "loss": 1.7997, + "step": 17309 + }, + { + "epoch": 5.3130755064456725, + "grad_norm": 0.34588614106178284, + "learning_rate": 4.736682196095447e-05, + "loss": 1.8414, + "step": 17310 + }, + { + "epoch": 5.313382443216697, + "grad_norm": 0.3475342094898224, + "learning_rate": 4.73618583117015e-05, + "loss": 1.7823, + "step": 17311 + }, + { + "epoch": 5.313689379987722, + "grad_norm": 0.1965305358171463, + "learning_rate": 4.7356894688520215e-05, + "loss": 1.7597, + "step": 17312 + }, + { + "epoch": 5.313996316758748, + "grad_norm": 0.3035048246383667, + "learning_rate": 4.7351931091459624e-05, + "loss": 1.6803, + "step": 17313 + }, + { + 
"epoch": 5.314303253529773, + "grad_norm": 0.27722910046577454, + "learning_rate": 4.7346967520568827e-05, + "loss": 1.7472, + "step": 17314 + }, + { + "epoch": 5.314610190300798, + "grad_norm": 0.21481415629386902, + "learning_rate": 4.734200397589682e-05, + "loss": 1.7319, + "step": 17315 + }, + { + "epoch": 5.314917127071823, + "grad_norm": 0.2570357918739319, + "learning_rate": 4.733704045749271e-05, + "loss": 1.7392, + "step": 17316 + }, + { + "epoch": 5.315224063842848, + "grad_norm": 0.2404400259256363, + "learning_rate": 4.733207696540551e-05, + "loss": 1.7231, + "step": 17317 + }, + { + "epoch": 5.315531000613873, + "grad_norm": 0.222911074757576, + "learning_rate": 4.732711349968432e-05, + "loss": 1.7584, + "step": 17318 + }, + { + "epoch": 5.315837937384899, + "grad_norm": 0.22908064723014832, + "learning_rate": 4.732215006037813e-05, + "loss": 1.7242, + "step": 17319 + }, + { + "epoch": 5.316144874155924, + "grad_norm": 0.2432398796081543, + "learning_rate": 4.7317186647536044e-05, + "loss": 1.7056, + "step": 17320 + }, + { + "epoch": 5.316451810926949, + "grad_norm": 0.1994420737028122, + "learning_rate": 4.7312223261207086e-05, + "loss": 1.6667, + "step": 17321 + }, + { + "epoch": 5.316758747697974, + "grad_norm": 0.22314350306987762, + "learning_rate": 4.73072599014403e-05, + "loss": 1.7945, + "step": 17322 + }, + { + "epoch": 5.317065684468999, + "grad_norm": 0.2309068888425827, + "learning_rate": 4.730229656828477e-05, + "loss": 1.7099, + "step": 17323 + }, + { + "epoch": 5.3173726212400245, + "grad_norm": 0.22388015687465668, + "learning_rate": 4.729733326178951e-05, + "loss": 1.7053, + "step": 17324 + }, + { + "epoch": 5.31767955801105, + "grad_norm": 0.20203040540218353, + "learning_rate": 4.72923699820036e-05, + "loss": 1.6992, + "step": 17325 + }, + { + "epoch": 5.317986494782075, + "grad_norm": 0.24416297674179077, + "learning_rate": 4.728740672897606e-05, + "loss": 1.7455, + "step": 17326 + }, + { + "epoch": 5.3182934315531, + "grad_norm": 
0.2501862049102783, + "learning_rate": 4.728244350275597e-05, + "loss": 1.7609, + "step": 17327 + }, + { + "epoch": 5.318600368324125, + "grad_norm": 0.21482665836811066, + "learning_rate": 4.727748030339235e-05, + "loss": 1.7614, + "step": 17328 + }, + { + "epoch": 5.31890730509515, + "grad_norm": 0.2241419404745102, + "learning_rate": 4.727251713093429e-05, + "loss": 1.736, + "step": 17329 + }, + { + "epoch": 5.319214241866176, + "grad_norm": 0.1757260262966156, + "learning_rate": 4.726755398543079e-05, + "loss": 1.6646, + "step": 17330 + }, + { + "epoch": 5.319521178637201, + "grad_norm": 0.18697243928909302, + "learning_rate": 4.726259086693095e-05, + "loss": 1.7512, + "step": 17331 + }, + { + "epoch": 5.319828115408226, + "grad_norm": 0.22584228217601776, + "learning_rate": 4.725762777548376e-05, + "loss": 1.7439, + "step": 17332 + }, + { + "epoch": 5.320135052179251, + "grad_norm": 0.18673470616340637, + "learning_rate": 4.725266471113832e-05, + "loss": 1.7007, + "step": 17333 + }, + { + "epoch": 5.320441988950276, + "grad_norm": 0.23030288517475128, + "learning_rate": 4.7247701673943656e-05, + "loss": 1.8021, + "step": 17334 + }, + { + "epoch": 5.320748925721301, + "grad_norm": 0.19333480298519135, + "learning_rate": 4.7242738663948813e-05, + "loss": 1.6659, + "step": 17335 + }, + { + "epoch": 5.321055862492327, + "grad_norm": 0.278097003698349, + "learning_rate": 4.723777568120284e-05, + "loss": 1.7302, + "step": 17336 + }, + { + "epoch": 5.321362799263352, + "grad_norm": 0.2146742343902588, + "learning_rate": 4.72328127257548e-05, + "loss": 1.7644, + "step": 17337 + }, + { + "epoch": 5.3216697360343765, + "grad_norm": 0.25582969188690186, + "learning_rate": 4.722784979765372e-05, + "loss": 1.7872, + "step": 17338 + }, + { + "epoch": 5.321976672805402, + "grad_norm": 0.20411577820777893, + "learning_rate": 4.722288689694864e-05, + "loss": 1.7167, + "step": 17339 + }, + { + "epoch": 5.322283609576427, + "grad_norm": 0.20894703269004822, + "learning_rate": 
4.7217924023688645e-05, + "loss": 1.7526, + "step": 17340 + }, + { + "epoch": 5.3225905463474525, + "grad_norm": 0.20197831094264984, + "learning_rate": 4.721296117792273e-05, + "loss": 1.711, + "step": 17341 + }, + { + "epoch": 5.322897483118478, + "grad_norm": 0.20490549504756927, + "learning_rate": 4.720799835969999e-05, + "loss": 1.7303, + "step": 17342 + }, + { + "epoch": 5.323204419889503, + "grad_norm": 0.20666229724884033, + "learning_rate": 4.720303556906943e-05, + "loss": 1.6738, + "step": 17343 + }, + { + "epoch": 5.323511356660528, + "grad_norm": 0.21899856626987457, + "learning_rate": 4.719807280608011e-05, + "loss": 1.7632, + "step": 17344 + }, + { + "epoch": 5.323818293431553, + "grad_norm": 0.2310410887002945, + "learning_rate": 4.719311007078108e-05, + "loss": 1.7568, + "step": 17345 + }, + { + "epoch": 5.324125230202578, + "grad_norm": 0.20057427883148193, + "learning_rate": 4.7188147363221394e-05, + "loss": 1.6716, + "step": 17346 + }, + { + "epoch": 5.324432166973604, + "grad_norm": 0.21361050009727478, + "learning_rate": 4.718318468345006e-05, + "loss": 1.7224, + "step": 17347 + }, + { + "epoch": 5.324739103744629, + "grad_norm": 0.28389376401901245, + "learning_rate": 4.7178222031516173e-05, + "loss": 1.8519, + "step": 17348 + }, + { + "epoch": 5.3250460405156534, + "grad_norm": 0.2094416618347168, + "learning_rate": 4.717325940746872e-05, + "loss": 1.7763, + "step": 17349 + }, + { + "epoch": 5.325352977286679, + "grad_norm": 0.2263312190771103, + "learning_rate": 4.716829681135681e-05, + "loss": 1.7961, + "step": 17350 + }, + { + "epoch": 5.325659914057704, + "grad_norm": 0.2685631811618805, + "learning_rate": 4.7163334243229417e-05, + "loss": 1.7763, + "step": 17351 + }, + { + "epoch": 5.3259668508287294, + "grad_norm": 0.2029418647289276, + "learning_rate": 4.7158371703135636e-05, + "loss": 1.7662, + "step": 17352 + }, + { + "epoch": 5.326273787599755, + "grad_norm": 0.3109094798564911, + "learning_rate": 4.715340919112447e-05, + "loss": 
1.7064, + "step": 17353 + }, + { + "epoch": 5.326580724370779, + "grad_norm": 0.24679912626743317, + "learning_rate": 4.714844670724502e-05, + "loss": 1.6903, + "step": 17354 + }, + { + "epoch": 5.326887661141805, + "grad_norm": 0.2004890739917755, + "learning_rate": 4.714348425154627e-05, + "loss": 1.7242, + "step": 17355 + }, + { + "epoch": 5.32719459791283, + "grad_norm": 0.27442196011543274, + "learning_rate": 4.7138521824077284e-05, + "loss": 1.826, + "step": 17356 + }, + { + "epoch": 5.327501534683855, + "grad_norm": 0.19933666288852692, + "learning_rate": 4.713355942488711e-05, + "loss": 1.748, + "step": 17357 + }, + { + "epoch": 5.327808471454881, + "grad_norm": 0.2306378185749054, + "learning_rate": 4.712859705402476e-05, + "loss": 1.7426, + "step": 17358 + }, + { + "epoch": 5.328115408225905, + "grad_norm": 0.22484014928340912, + "learning_rate": 4.7123634711539324e-05, + "loss": 1.7355, + "step": 17359 + }, + { + "epoch": 5.32842234499693, + "grad_norm": 0.2501749098300934, + "learning_rate": 4.711867239747979e-05, + "loss": 1.7502, + "step": 17360 + }, + { + "epoch": 5.328729281767956, + "grad_norm": 0.1940663903951645, + "learning_rate": 4.711371011189525e-05, + "loss": 1.7423, + "step": 17361 + }, + { + "epoch": 5.329036218538981, + "grad_norm": 0.28115448355674744, + "learning_rate": 4.71087478548347e-05, + "loss": 1.7134, + "step": 17362 + }, + { + "epoch": 5.329343155310006, + "grad_norm": 0.29717928171157837, + "learning_rate": 4.71037856263472e-05, + "loss": 1.8145, + "step": 17363 + }, + { + "epoch": 5.329650092081032, + "grad_norm": 0.24278375506401062, + "learning_rate": 4.709882342648179e-05, + "loss": 1.689, + "step": 17364 + }, + { + "epoch": 5.329957028852056, + "grad_norm": 0.26382890343666077, + "learning_rate": 4.709386125528751e-05, + "loss": 1.801, + "step": 17365 + }, + { + "epoch": 5.3302639656230815, + "grad_norm": 0.237087219953537, + "learning_rate": 4.708889911281339e-05, + "loss": 1.7019, + "step": 17366 + }, + { + "epoch": 
5.330570902394107, + "grad_norm": 0.21994253993034363, + "learning_rate": 4.7083936999108494e-05, + "loss": 1.707, + "step": 17367 + }, + { + "epoch": 5.330877839165132, + "grad_norm": 0.3028903901576996, + "learning_rate": 4.707897491422182e-05, + "loss": 1.7992, + "step": 17368 + }, + { + "epoch": 5.3311847759361575, + "grad_norm": 0.24991434812545776, + "learning_rate": 4.7074012858202435e-05, + "loss": 1.7894, + "step": 17369 + }, + { + "epoch": 5.331491712707182, + "grad_norm": 0.20631250739097595, + "learning_rate": 4.706905083109936e-05, + "loss": 1.6816, + "step": 17370 + }, + { + "epoch": 5.331798649478207, + "grad_norm": 0.23300573229789734, + "learning_rate": 4.7064088832961666e-05, + "loss": 1.7101, + "step": 17371 + }, + { + "epoch": 5.332105586249233, + "grad_norm": 0.22331316769123077, + "learning_rate": 4.705912686383837e-05, + "loss": 1.861, + "step": 17372 + }, + { + "epoch": 5.332412523020258, + "grad_norm": 0.204593226313591, + "learning_rate": 4.7054164923778485e-05, + "loss": 1.7062, + "step": 17373 + }, + { + "epoch": 5.332719459791283, + "grad_norm": 0.22207681834697723, + "learning_rate": 4.704920301283107e-05, + "loss": 1.7546, + "step": 17374 + }, + { + "epoch": 5.333026396562309, + "grad_norm": 0.2508530020713806, + "learning_rate": 4.7044241131045157e-05, + "loss": 1.7881, + "step": 17375 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.26084616780281067, + "learning_rate": 4.7039279278469804e-05, + "loss": 1.7292, + "step": 17376 + }, + { + "epoch": 5.333640270104358, + "grad_norm": 0.2122940719127655, + "learning_rate": 4.7034317455154006e-05, + "loss": 1.7493, + "step": 17377 + }, + { + "epoch": 5.333947206875384, + "grad_norm": 0.2627449333667755, + "learning_rate": 4.702935566114685e-05, + "loss": 1.759, + "step": 17378 + }, + { + "epoch": 5.334254143646409, + "grad_norm": 0.20637977123260498, + "learning_rate": 4.702439389649732e-05, + "loss": 1.8043, + "step": 17379 + }, + { + "epoch": 5.334561080417434, + "grad_norm": 
0.28783395886421204, + "learning_rate": 4.701943216125447e-05, + "loss": 1.7256, + "step": 17380 + }, + { + "epoch": 5.334868017188459, + "grad_norm": 0.21130618453025818, + "learning_rate": 4.701447045546734e-05, + "loss": 1.7161, + "step": 17381 + }, + { + "epoch": 5.335174953959484, + "grad_norm": 0.2793416678905487, + "learning_rate": 4.7009508779184984e-05, + "loss": 1.7659, + "step": 17382 + }, + { + "epoch": 5.3354818907305095, + "grad_norm": 0.3088020384311676, + "learning_rate": 4.700454713245639e-05, + "loss": 1.6877, + "step": 17383 + }, + { + "epoch": 5.335788827501535, + "grad_norm": 0.19697681069374084, + "learning_rate": 4.6999585515330646e-05, + "loss": 1.7111, + "step": 17384 + }, + { + "epoch": 5.33609576427256, + "grad_norm": 0.29234182834625244, + "learning_rate": 4.699462392785673e-05, + "loss": 1.7136, + "step": 17385 + }, + { + "epoch": 5.336402701043585, + "grad_norm": 0.2593611776828766, + "learning_rate": 4.698966237008371e-05, + "loss": 1.7531, + "step": 17386 + }, + { + "epoch": 5.33670963781461, + "grad_norm": 0.20024444162845612, + "learning_rate": 4.6984700842060604e-05, + "loss": 1.7035, + "step": 17387 + }, + { + "epoch": 5.337016574585635, + "grad_norm": 0.2929787039756775, + "learning_rate": 4.697973934383647e-05, + "loss": 1.7212, + "step": 17388 + }, + { + "epoch": 5.337323511356661, + "grad_norm": 0.2425665408372879, + "learning_rate": 4.697477787546032e-05, + "loss": 1.7191, + "step": 17389 + }, + { + "epoch": 5.337630448127686, + "grad_norm": 0.19175556302070618, + "learning_rate": 4.6969816436981176e-05, + "loss": 1.7291, + "step": 17390 + }, + { + "epoch": 5.337937384898711, + "grad_norm": 0.2602384686470032, + "learning_rate": 4.696485502844809e-05, + "loss": 1.7035, + "step": 17391 + }, + { + "epoch": 5.338244321669736, + "grad_norm": 0.19117408990859985, + "learning_rate": 4.695989364991006e-05, + "loss": 1.707, + "step": 17392 + }, + { + "epoch": 5.338551258440761, + "grad_norm": 0.31086108088493347, + "learning_rate": 
4.6954932301416174e-05, + "loss": 1.7397, + "step": 17393 + }, + { + "epoch": 5.338858195211786, + "grad_norm": 0.27402472496032715, + "learning_rate": 4.694997098301542e-05, + "loss": 1.7144, + "step": 17394 + }, + { + "epoch": 5.339165131982812, + "grad_norm": 0.20345155894756317, + "learning_rate": 4.694500969475685e-05, + "loss": 1.7492, + "step": 17395 + }, + { + "epoch": 5.339472068753837, + "grad_norm": 0.23786045610904694, + "learning_rate": 4.694004843668947e-05, + "loss": 1.7781, + "step": 17396 + }, + { + "epoch": 5.3397790055248615, + "grad_norm": 0.19747424125671387, + "learning_rate": 4.6935087208862335e-05, + "loss": 1.7353, + "step": 17397 + }, + { + "epoch": 5.340085942295887, + "grad_norm": 0.224543035030365, + "learning_rate": 4.693012601132445e-05, + "loss": 1.7229, + "step": 17398 + }, + { + "epoch": 5.340392879066912, + "grad_norm": 0.20840135216712952, + "learning_rate": 4.692516484412488e-05, + "loss": 1.7557, + "step": 17399 + }, + { + "epoch": 5.3406998158379375, + "grad_norm": 0.21019098162651062, + "learning_rate": 4.692020370731261e-05, + "loss": 1.7793, + "step": 17400 + }, + { + "epoch": 5.341006752608963, + "grad_norm": 0.20540091395378113, + "learning_rate": 4.691524260093672e-05, + "loss": 1.6925, + "step": 17401 + }, + { + "epoch": 5.341313689379987, + "grad_norm": 0.2414131462574005, + "learning_rate": 4.691028152504619e-05, + "loss": 1.7706, + "step": 17402 + }, + { + "epoch": 5.341620626151013, + "grad_norm": 0.19627155363559723, + "learning_rate": 4.6905320479690073e-05, + "loss": 1.6356, + "step": 17403 + }, + { + "epoch": 5.341927562922038, + "grad_norm": 0.20978952944278717, + "learning_rate": 4.690035946491741e-05, + "loss": 1.7487, + "step": 17404 + }, + { + "epoch": 5.342234499693063, + "grad_norm": 0.2524566054344177, + "learning_rate": 4.689539848077719e-05, + "loss": 1.7713, + "step": 17405 + }, + { + "epoch": 5.342541436464089, + "grad_norm": 0.1967654973268509, + "learning_rate": 4.689043752731847e-05, + "loss": 
1.7358, + "step": 17406 + }, + { + "epoch": 5.342848373235114, + "grad_norm": 0.2085377424955368, + "learning_rate": 4.688547660459026e-05, + "loss": 1.7104, + "step": 17407 + }, + { + "epoch": 5.343155310006138, + "grad_norm": 0.21294310688972473, + "learning_rate": 4.688051571264161e-05, + "loss": 1.7349, + "step": 17408 + }, + { + "epoch": 5.343462246777164, + "grad_norm": 0.23702891170978546, + "learning_rate": 4.6875554851521514e-05, + "loss": 1.8048, + "step": 17409 + }, + { + "epoch": 5.343769183548189, + "grad_norm": 0.2513964772224426, + "learning_rate": 4.687059402127904e-05, + "loss": 1.6669, + "step": 17410 + }, + { + "epoch": 5.344076120319214, + "grad_norm": 0.259540855884552, + "learning_rate": 4.6865633221963165e-05, + "loss": 1.7763, + "step": 17411 + }, + { + "epoch": 5.34438305709024, + "grad_norm": 0.28354617953300476, + "learning_rate": 4.6860672453622966e-05, + "loss": 1.7912, + "step": 17412 + }, + { + "epoch": 5.344689993861264, + "grad_norm": 0.2503860592842102, + "learning_rate": 4.685571171630742e-05, + "loss": 1.6817, + "step": 17413 + }, + { + "epoch": 5.3449969306322895, + "grad_norm": 0.2317555695772171, + "learning_rate": 4.685075101006558e-05, + "loss": 1.7652, + "step": 17414 + }, + { + "epoch": 5.345303867403315, + "grad_norm": 0.23333363234996796, + "learning_rate": 4.684579033494646e-05, + "loss": 1.722, + "step": 17415 + }, + { + "epoch": 5.34561080417434, + "grad_norm": 0.22507359087467194, + "learning_rate": 4.6840829690999104e-05, + "loss": 1.7522, + "step": 17416 + }, + { + "epoch": 5.3459177409453655, + "grad_norm": 0.2298288643360138, + "learning_rate": 4.6835869078272504e-05, + "loss": 1.7425, + "step": 17417 + }, + { + "epoch": 5.346224677716391, + "grad_norm": 0.2829224765300751, + "learning_rate": 4.683090849681572e-05, + "loss": 1.7798, + "step": 17418 + }, + { + "epoch": 5.346531614487415, + "grad_norm": 0.18153807520866394, + "learning_rate": 4.682594794667773e-05, + "loss": 1.6846, + "step": 17419 + }, + { + 
"epoch": 5.346838551258441, + "grad_norm": 0.24153028428554535, + "learning_rate": 4.6820987427907596e-05, + "loss": 1.7474, + "step": 17420 + }, + { + "epoch": 5.347145488029466, + "grad_norm": 0.2529772222042084, + "learning_rate": 4.681602694055434e-05, + "loss": 1.7465, + "step": 17421 + }, + { + "epoch": 5.347452424800491, + "grad_norm": 0.20414131879806519, + "learning_rate": 4.681106648466696e-05, + "loss": 1.7704, + "step": 17422 + }, + { + "epoch": 5.347759361571517, + "grad_norm": 0.27280452847480774, + "learning_rate": 4.68061060602945e-05, + "loss": 1.791, + "step": 17423 + }, + { + "epoch": 5.348066298342541, + "grad_norm": 0.20767468214035034, + "learning_rate": 4.680114566748595e-05, + "loss": 1.7744, + "step": 17424 + }, + { + "epoch": 5.348373235113566, + "grad_norm": 0.2661697566509247, + "learning_rate": 4.679618530629036e-05, + "loss": 1.7999, + "step": 17425 + }, + { + "epoch": 5.348680171884592, + "grad_norm": 0.23666872084140778, + "learning_rate": 4.679122497675674e-05, + "loss": 1.7204, + "step": 17426 + }, + { + "epoch": 5.348987108655617, + "grad_norm": 0.2688015401363373, + "learning_rate": 4.678626467893414e-05, + "loss": 1.7619, + "step": 17427 + }, + { + "epoch": 5.349294045426642, + "grad_norm": 0.23924420773983002, + "learning_rate": 4.678130441287153e-05, + "loss": 1.7754, + "step": 17428 + }, + { + "epoch": 5.349600982197667, + "grad_norm": 0.25724148750305176, + "learning_rate": 4.677634417861798e-05, + "loss": 1.761, + "step": 17429 + }, + { + "epoch": 5.349907918968692, + "grad_norm": 0.2633780241012573, + "learning_rate": 4.6771383976222464e-05, + "loss": 1.8705, + "step": 17430 + }, + { + "epoch": 5.350214855739718, + "grad_norm": 0.24774575233459473, + "learning_rate": 4.6766423805734036e-05, + "loss": 1.7127, + "step": 17431 + }, + { + "epoch": 5.350521792510743, + "grad_norm": 0.29887545108795166, + "learning_rate": 4.6761463667201695e-05, + "loss": 1.7651, + "step": 17432 + }, + { + "epoch": 5.350828729281768, + 
"grad_norm": 0.2231605499982834, + "learning_rate": 4.6756503560674486e-05, + "loss": 1.7636, + "step": 17433 + }, + { + "epoch": 5.351135666052793, + "grad_norm": 0.27977073192596436, + "learning_rate": 4.675154348620139e-05, + "loss": 1.7108, + "step": 17434 + }, + { + "epoch": 5.351442602823818, + "grad_norm": 0.26866039633750916, + "learning_rate": 4.674658344383146e-05, + "loss": 1.7593, + "step": 17435 + }, + { + "epoch": 5.351749539594843, + "grad_norm": 0.2154620885848999, + "learning_rate": 4.6741623433613685e-05, + "loss": 1.7536, + "step": 17436 + }, + { + "epoch": 5.352056476365869, + "grad_norm": 0.276656836271286, + "learning_rate": 4.673666345559711e-05, + "loss": 1.803, + "step": 17437 + }, + { + "epoch": 5.352363413136894, + "grad_norm": 0.22247640788555145, + "learning_rate": 4.6731703509830744e-05, + "loss": 1.7273, + "step": 17438 + }, + { + "epoch": 5.352670349907919, + "grad_norm": 0.2399090677499771, + "learning_rate": 4.6726743596363574e-05, + "loss": 1.7708, + "step": 17439 + }, + { + "epoch": 5.352977286678944, + "grad_norm": 0.2550101578235626, + "learning_rate": 4.6721783715244674e-05, + "loss": 1.7016, + "step": 17440 + }, + { + "epoch": 5.353284223449969, + "grad_norm": 0.19929546117782593, + "learning_rate": 4.6716823866523e-05, + "loss": 1.7417, + "step": 17441 + }, + { + "epoch": 5.3535911602209945, + "grad_norm": 0.2496672421693802, + "learning_rate": 4.671186405024761e-05, + "loss": 1.72, + "step": 17442 + }, + { + "epoch": 5.35389809699202, + "grad_norm": 0.19827665388584137, + "learning_rate": 4.67069042664675e-05, + "loss": 1.7515, + "step": 17443 + }, + { + "epoch": 5.354205033763045, + "grad_norm": 0.2528775930404663, + "learning_rate": 4.670194451523171e-05, + "loss": 1.7429, + "step": 17444 + }, + { + "epoch": 5.35451197053407, + "grad_norm": 0.19569729268550873, + "learning_rate": 4.6696984796589215e-05, + "loss": 1.7314, + "step": 17445 + }, + { + "epoch": 5.354818907305095, + "grad_norm": 0.21892370283603668, + 
"learning_rate": 4.669202511058908e-05, + "loss": 1.7331, + "step": 17446 + }, + { + "epoch": 5.35512584407612, + "grad_norm": 0.21609409153461456, + "learning_rate": 4.668706545728026e-05, + "loss": 1.7267, + "step": 17447 + }, + { + "epoch": 5.355432780847146, + "grad_norm": 0.2631370425224304, + "learning_rate": 4.668210583671182e-05, + "loss": 1.7513, + "step": 17448 + }, + { + "epoch": 5.355739717618171, + "grad_norm": 0.31327441334724426, + "learning_rate": 4.667714624893274e-05, + "loss": 1.7936, + "step": 17449 + }, + { + "epoch": 5.356046654389196, + "grad_norm": 0.21602430939674377, + "learning_rate": 4.667218669399207e-05, + "loss": 1.7387, + "step": 17450 + }, + { + "epoch": 5.356353591160221, + "grad_norm": 0.2895040214061737, + "learning_rate": 4.6667227171938784e-05, + "loss": 1.7293, + "step": 17451 + }, + { + "epoch": 5.356660527931246, + "grad_norm": 0.35150307416915894, + "learning_rate": 4.666226768282193e-05, + "loss": 1.8215, + "step": 17452 + }, + { + "epoch": 5.356967464702271, + "grad_norm": 0.19034281373023987, + "learning_rate": 4.665730822669048e-05, + "loss": 1.702, + "step": 17453 + }, + { + "epoch": 5.357274401473297, + "grad_norm": 0.25586241483688354, + "learning_rate": 4.6652348803593484e-05, + "loss": 1.7809, + "step": 17454 + }, + { + "epoch": 5.357581338244322, + "grad_norm": 0.23919305205345154, + "learning_rate": 4.6647389413579944e-05, + "loss": 1.7555, + "step": 17455 + }, + { + "epoch": 5.3578882750153465, + "grad_norm": 0.22707165777683258, + "learning_rate": 4.664243005669885e-05, + "loss": 1.7633, + "step": 17456 + }, + { + "epoch": 5.358195211786372, + "grad_norm": 0.20666839182376862, + "learning_rate": 4.663747073299925e-05, + "loss": 1.6522, + "step": 17457 + }, + { + "epoch": 5.358502148557397, + "grad_norm": 0.20557542145252228, + "learning_rate": 4.663251144253012e-05, + "loss": 1.73, + "step": 17458 + }, + { + "epoch": 5.3588090853284225, + "grad_norm": 0.22375571727752686, + "learning_rate": 
4.662755218534049e-05, + "loss": 1.7189, + "step": 17459 + }, + { + "epoch": 5.359116022099448, + "grad_norm": 0.261393278837204, + "learning_rate": 4.662259296147936e-05, + "loss": 1.6863, + "step": 17460 + }, + { + "epoch": 5.359422958870473, + "grad_norm": 0.2279379516839981, + "learning_rate": 4.6617633770995764e-05, + "loss": 1.7332, + "step": 17461 + }, + { + "epoch": 5.359729895641498, + "grad_norm": 0.2194606065750122, + "learning_rate": 4.6612674613938666e-05, + "loss": 1.7324, + "step": 17462 + }, + { + "epoch": 5.360036832412523, + "grad_norm": 0.27714410424232483, + "learning_rate": 4.660771549035713e-05, + "loss": 1.7386, + "step": 17463 + }, + { + "epoch": 5.360343769183548, + "grad_norm": 0.2118787169456482, + "learning_rate": 4.660275640030012e-05, + "loss": 1.7587, + "step": 17464 + }, + { + "epoch": 5.360650705954574, + "grad_norm": 0.2546979784965515, + "learning_rate": 4.6597797343816665e-05, + "loss": 1.7756, + "step": 17465 + }, + { + "epoch": 5.360957642725599, + "grad_norm": 0.194237619638443, + "learning_rate": 4.659283832095577e-05, + "loss": 1.7351, + "step": 17466 + }, + { + "epoch": 5.361264579496623, + "grad_norm": 0.23448583483695984, + "learning_rate": 4.658787933176646e-05, + "loss": 1.7051, + "step": 17467 + }, + { + "epoch": 5.361571516267649, + "grad_norm": 0.22796298563480377, + "learning_rate": 4.65829203762977e-05, + "loss": 1.7395, + "step": 17468 + }, + { + "epoch": 5.361878453038674, + "grad_norm": 0.22674904763698578, + "learning_rate": 4.657796145459855e-05, + "loss": 1.714, + "step": 17469 + }, + { + "epoch": 5.362185389809699, + "grad_norm": 0.2697311341762543, + "learning_rate": 4.657300256671797e-05, + "loss": 1.8271, + "step": 17470 + }, + { + "epoch": 5.362492326580725, + "grad_norm": 0.28040480613708496, + "learning_rate": 4.6568043712705004e-05, + "loss": 1.8192, + "step": 17471 + }, + { + "epoch": 5.362799263351749, + "grad_norm": 0.21100232005119324, + "learning_rate": 4.6563084892608644e-05, + "loss": 1.7285, + 
"step": 17472 + }, + { + "epoch": 5.3631062001227745, + "grad_norm": 0.23545897006988525, + "learning_rate": 4.655812610647787e-05, + "loss": 1.7302, + "step": 17473 + }, + { + "epoch": 5.3634131368938, + "grad_norm": 0.23278315365314484, + "learning_rate": 4.655316735436174e-05, + "loss": 1.7749, + "step": 17474 + }, + { + "epoch": 5.363720073664825, + "grad_norm": 0.333763986825943, + "learning_rate": 4.65482086363092e-05, + "loss": 1.7393, + "step": 17475 + }, + { + "epoch": 5.3640270104358505, + "grad_norm": 0.2743878662586212, + "learning_rate": 4.6543249952369306e-05, + "loss": 1.7274, + "step": 17476 + }, + { + "epoch": 5.364333947206875, + "grad_norm": 0.234402596950531, + "learning_rate": 4.6538291302591024e-05, + "loss": 1.7848, + "step": 17477 + }, + { + "epoch": 5.3646408839779, + "grad_norm": 0.29100897908210754, + "learning_rate": 4.65333326870234e-05, + "loss": 1.7698, + "step": 17478 + }, + { + "epoch": 5.364947820748926, + "grad_norm": 0.24178378283977509, + "learning_rate": 4.652837410571539e-05, + "loss": 1.8142, + "step": 17479 + }, + { + "epoch": 5.365254757519951, + "grad_norm": 0.4189155101776123, + "learning_rate": 4.652341555871605e-05, + "loss": 1.7435, + "step": 17480 + }, + { + "epoch": 5.365561694290976, + "grad_norm": 0.40106773376464844, + "learning_rate": 4.651845704607433e-05, + "loss": 1.837, + "step": 17481 + }, + { + "epoch": 5.365868631062002, + "grad_norm": 0.24127443134784698, + "learning_rate": 4.651349856783927e-05, + "loss": 1.7257, + "step": 17482 + }, + { + "epoch": 5.366175567833026, + "grad_norm": 0.412812739610672, + "learning_rate": 4.650854012405985e-05, + "loss": 1.762, + "step": 17483 + }, + { + "epoch": 5.366482504604051, + "grad_norm": 0.2636469602584839, + "learning_rate": 4.65035817147851e-05, + "loss": 1.7995, + "step": 17484 + }, + { + "epoch": 5.366789441375077, + "grad_norm": 0.282186895608902, + "learning_rate": 4.649862334006399e-05, + "loss": 1.75, + "step": 17485 + }, + { + "epoch": 5.367096378146102, + 
"grad_norm": 0.3280154764652252, + "learning_rate": 4.649366499994555e-05, + "loss": 1.7668, + "step": 17486 + }, + { + "epoch": 5.367403314917127, + "grad_norm": 0.24608035385608673, + "learning_rate": 4.648870669447875e-05, + "loss": 1.8332, + "step": 17487 + }, + { + "epoch": 5.367710251688152, + "grad_norm": 0.21927174925804138, + "learning_rate": 4.648374842371262e-05, + "loss": 1.7365, + "step": 17488 + }, + { + "epoch": 5.368017188459177, + "grad_norm": 0.2658425569534302, + "learning_rate": 4.6478790187696164e-05, + "loss": 1.841, + "step": 17489 + }, + { + "epoch": 5.3683241252302025, + "grad_norm": 0.2302858531475067, + "learning_rate": 4.647383198647834e-05, + "loss": 1.7882, + "step": 17490 + }, + { + "epoch": 5.368631062001228, + "grad_norm": 0.2562740743160248, + "learning_rate": 4.64688738201082e-05, + "loss": 1.7188, + "step": 17491 + }, + { + "epoch": 5.368937998772253, + "grad_norm": 0.28140220046043396, + "learning_rate": 4.646391568863469e-05, + "loss": 1.7482, + "step": 17492 + }, + { + "epoch": 5.3692449355432785, + "grad_norm": 0.21040008962154388, + "learning_rate": 4.6458957592106855e-05, + "loss": 1.7695, + "step": 17493 + }, + { + "epoch": 5.369551872314303, + "grad_norm": 0.25322291254997253, + "learning_rate": 4.645399953057367e-05, + "loss": 1.7127, + "step": 17494 + }, + { + "epoch": 5.369858809085328, + "grad_norm": 0.2239738404750824, + "learning_rate": 4.644904150408415e-05, + "loss": 1.7376, + "step": 17495 + }, + { + "epoch": 5.370165745856354, + "grad_norm": 0.21432901918888092, + "learning_rate": 4.644408351268727e-05, + "loss": 1.7156, + "step": 17496 + }, + { + "epoch": 5.370472682627379, + "grad_norm": 0.3057272732257843, + "learning_rate": 4.643912555643205e-05, + "loss": 1.7706, + "step": 17497 + }, + { + "epoch": 5.370779619398404, + "grad_norm": 0.2826928496360779, + "learning_rate": 4.643416763536748e-05, + "loss": 1.8298, + "step": 17498 + }, + { + "epoch": 5.371086556169429, + "grad_norm": 0.2395278513431549, + 
"learning_rate": 4.642920974954255e-05, + "loss": 1.7357, + "step": 17499 + }, + { + "epoch": 5.371393492940454, + "grad_norm": 0.21004743874073029, + "learning_rate": 4.642425189900626e-05, + "loss": 1.7263, + "step": 17500 + }, + { + "epoch": 5.371700429711479, + "grad_norm": 0.23981697857379913, + "learning_rate": 4.641929408380761e-05, + "loss": 1.7341, + "step": 17501 + }, + { + "epoch": 5.372007366482505, + "grad_norm": 0.1984727531671524, + "learning_rate": 4.641433630399559e-05, + "loss": 1.7133, + "step": 17502 + }, + { + "epoch": 5.37231430325353, + "grad_norm": 0.22153446078300476, + "learning_rate": 4.640937855961922e-05, + "loss": 1.8028, + "step": 17503 + }, + { + "epoch": 5.3726212400245545, + "grad_norm": 0.24257974326610565, + "learning_rate": 4.6404420850727455e-05, + "loss": 1.7842, + "step": 17504 + }, + { + "epoch": 5.37292817679558, + "grad_norm": 0.19444705545902252, + "learning_rate": 4.6399463177369316e-05, + "loss": 1.7296, + "step": 17505 + }, + { + "epoch": 5.373235113566605, + "grad_norm": 0.2068849354982376, + "learning_rate": 4.6394505539593806e-05, + "loss": 1.6949, + "step": 17506 + }, + { + "epoch": 5.3735420503376305, + "grad_norm": 0.21762309968471527, + "learning_rate": 4.638954793744989e-05, + "loss": 1.7556, + "step": 17507 + }, + { + "epoch": 5.373848987108656, + "grad_norm": 0.20791584253311157, + "learning_rate": 4.638459037098659e-05, + "loss": 1.7442, + "step": 17508 + }, + { + "epoch": 5.37415592387968, + "grad_norm": 0.27774497866630554, + "learning_rate": 4.6379632840252875e-05, + "loss": 1.7834, + "step": 17509 + }, + { + "epoch": 5.374462860650706, + "grad_norm": 0.24211421608924866, + "learning_rate": 4.637467534529775e-05, + "loss": 1.819, + "step": 17510 + }, + { + "epoch": 5.374769797421731, + "grad_norm": 0.24857789278030396, + "learning_rate": 4.636971788617022e-05, + "loss": 1.7483, + "step": 17511 + }, + { + "epoch": 5.375076734192756, + "grad_norm": 0.25142937898635864, + "learning_rate": 
4.636476046291925e-05, + "loss": 1.7405, + "step": 17512 + }, + { + "epoch": 5.375383670963782, + "grad_norm": 0.25860801339149475, + "learning_rate": 4.6359803075593846e-05, + "loss": 1.7821, + "step": 17513 + }, + { + "epoch": 5.375690607734807, + "grad_norm": 0.25223109126091003, + "learning_rate": 4.635484572424302e-05, + "loss": 1.738, + "step": 17514 + }, + { + "epoch": 5.3759975445058314, + "grad_norm": 0.22931768000125885, + "learning_rate": 4.634988840891573e-05, + "loss": 1.7717, + "step": 17515 + }, + { + "epoch": 5.376304481276857, + "grad_norm": 0.21371231973171234, + "learning_rate": 4.6344931129661e-05, + "loss": 1.7741, + "step": 17516 + }, + { + "epoch": 5.376611418047882, + "grad_norm": 0.2653632164001465, + "learning_rate": 4.633997388652778e-05, + "loss": 1.7548, + "step": 17517 + }, + { + "epoch": 5.3769183548189075, + "grad_norm": 0.2559951841831207, + "learning_rate": 4.6335016679565094e-05, + "loss": 1.7833, + "step": 17518 + }, + { + "epoch": 5.377225291589933, + "grad_norm": 0.22560031712055206, + "learning_rate": 4.6330059508821914e-05, + "loss": 1.6929, + "step": 17519 + }, + { + "epoch": 5.377532228360957, + "grad_norm": 0.3084852695465088, + "learning_rate": 4.6325102374347255e-05, + "loss": 1.8107, + "step": 17520 + }, + { + "epoch": 5.377839165131983, + "grad_norm": 0.3329267203807831, + "learning_rate": 4.632014527619007e-05, + "loss": 1.6791, + "step": 17521 + }, + { + "epoch": 5.378146101903008, + "grad_norm": 0.26274019479751587, + "learning_rate": 4.631518821439939e-05, + "loss": 1.7187, + "step": 17522 + }, + { + "epoch": 5.378453038674033, + "grad_norm": 0.3769492208957672, + "learning_rate": 4.6310231189024165e-05, + "loss": 1.8366, + "step": 17523 + }, + { + "epoch": 5.378759975445059, + "grad_norm": 0.2503921687602997, + "learning_rate": 4.6305274200113385e-05, + "loss": 1.7281, + "step": 17524 + }, + { + "epoch": 5.379066912216084, + "grad_norm": 0.26305708289146423, + "learning_rate": 4.6300317247716074e-05, + "loss": 
1.7231, + "step": 17525 + }, + { + "epoch": 5.379373848987108, + "grad_norm": 0.31899142265319824, + "learning_rate": 4.629536033188118e-05, + "loss": 1.8025, + "step": 17526 + }, + { + "epoch": 5.379680785758134, + "grad_norm": 0.21400104463100433, + "learning_rate": 4.629040345265772e-05, + "loss": 1.7481, + "step": 17527 + }, + { + "epoch": 5.379987722529159, + "grad_norm": 0.23147371411323547, + "learning_rate": 4.628544661009465e-05, + "loss": 1.7049, + "step": 17528 + }, + { + "epoch": 5.380294659300184, + "grad_norm": 0.21156759560108185, + "learning_rate": 4.628048980424099e-05, + "loss": 1.806, + "step": 17529 + }, + { + "epoch": 5.38060159607121, + "grad_norm": 0.22061556577682495, + "learning_rate": 4.6275533035145685e-05, + "loss": 1.7606, + "step": 17530 + }, + { + "epoch": 5.380908532842234, + "grad_norm": 0.23379987478256226, + "learning_rate": 4.6270576302857774e-05, + "loss": 1.7874, + "step": 17531 + }, + { + "epoch": 5.3812154696132595, + "grad_norm": 0.24738669395446777, + "learning_rate": 4.62656196074262e-05, + "loss": 1.7611, + "step": 17532 + }, + { + "epoch": 5.381522406384285, + "grad_norm": 0.19738905131816864, + "learning_rate": 4.6260662948899974e-05, + "loss": 1.7375, + "step": 17533 + }, + { + "epoch": 5.38182934315531, + "grad_norm": 0.2327810823917389, + "learning_rate": 4.6255706327328044e-05, + "loss": 1.7188, + "step": 17534 + }, + { + "epoch": 5.3821362799263355, + "grad_norm": 0.18944145739078522, + "learning_rate": 4.625074974275944e-05, + "loss": 1.6672, + "step": 17535 + }, + { + "epoch": 5.382443216697361, + "grad_norm": 0.20943734049797058, + "learning_rate": 4.624579319524311e-05, + "loss": 1.7238, + "step": 17536 + }, + { + "epoch": 5.382750153468385, + "grad_norm": 0.2060960829257965, + "learning_rate": 4.6240836684828074e-05, + "loss": 1.744, + "step": 17537 + }, + { + "epoch": 5.383057090239411, + "grad_norm": 0.19089816510677338, + "learning_rate": 4.6235880211563264e-05, + "loss": 1.6884, + "step": 17538 + }, + { + 
"epoch": 5.383364027010436, + "grad_norm": 0.22362665832042694, + "learning_rate": 4.623092377549772e-05, + "loss": 1.7076, + "step": 17539 + }, + { + "epoch": 5.383670963781461, + "grad_norm": 0.19429968297481537, + "learning_rate": 4.622596737668039e-05, + "loss": 1.7315, + "step": 17540 + }, + { + "epoch": 5.383977900552487, + "grad_norm": 0.20481903851032257, + "learning_rate": 4.622101101516024e-05, + "loss": 1.711, + "step": 17541 + }, + { + "epoch": 5.384284837323511, + "grad_norm": 0.19181163609027863, + "learning_rate": 4.6216054690986304e-05, + "loss": 1.6879, + "step": 17542 + }, + { + "epoch": 5.384591774094536, + "grad_norm": 0.23105846345424652, + "learning_rate": 4.6211098404207514e-05, + "loss": 1.7797, + "step": 17543 + }, + { + "epoch": 5.384898710865562, + "grad_norm": 0.2742008864879608, + "learning_rate": 4.6206142154872886e-05, + "loss": 1.7404, + "step": 17544 + }, + { + "epoch": 5.385205647636587, + "grad_norm": 0.2256750613451004, + "learning_rate": 4.6201185943031365e-05, + "loss": 1.7616, + "step": 17545 + }, + { + "epoch": 5.385512584407612, + "grad_norm": 0.23230868577957153, + "learning_rate": 4.6196229768731964e-05, + "loss": 1.7457, + "step": 17546 + }, + { + "epoch": 5.385819521178637, + "grad_norm": 0.2200126200914383, + "learning_rate": 4.6191273632023634e-05, + "loss": 1.7835, + "step": 17547 + }, + { + "epoch": 5.386126457949662, + "grad_norm": 0.21903863549232483, + "learning_rate": 4.6186317532955395e-05, + "loss": 1.7315, + "step": 17548 + }, + { + "epoch": 5.3864333947206875, + "grad_norm": 0.1915556788444519, + "learning_rate": 4.6181361471576186e-05, + "loss": 1.6786, + "step": 17549 + }, + { + "epoch": 5.386740331491713, + "grad_norm": 0.20177799463272095, + "learning_rate": 4.617640544793501e-05, + "loss": 1.7453, + "step": 17550 + }, + { + "epoch": 5.387047268262738, + "grad_norm": 0.2598256766796112, + "learning_rate": 4.617144946208083e-05, + "loss": 1.7931, + "step": 17551 + }, + { + "epoch": 5.387354205033763, + 
"grad_norm": 0.2357153594493866, + "learning_rate": 4.616649351406263e-05, + "loss": 1.7932, + "step": 17552 + }, + { + "epoch": 5.387661141804788, + "grad_norm": 0.2228964865207672, + "learning_rate": 4.616153760392938e-05, + "loss": 1.7725, + "step": 17553 + }, + { + "epoch": 5.387968078575813, + "grad_norm": 0.20811811089515686, + "learning_rate": 4.6156581731730085e-05, + "loss": 1.744, + "step": 17554 + }, + { + "epoch": 5.388275015346839, + "grad_norm": 0.20008429884910583, + "learning_rate": 4.615162589751369e-05, + "loss": 1.6973, + "step": 17555 + }, + { + "epoch": 5.388581952117864, + "grad_norm": 0.20487523078918457, + "learning_rate": 4.614667010132919e-05, + "loss": 1.7712, + "step": 17556 + }, + { + "epoch": 5.388888888888889, + "grad_norm": 0.21279677748680115, + "learning_rate": 4.6141714343225554e-05, + "loss": 1.7783, + "step": 17557 + }, + { + "epoch": 5.389195825659914, + "grad_norm": 0.28035736083984375, + "learning_rate": 4.613675862325174e-05, + "loss": 1.767, + "step": 17558 + }, + { + "epoch": 5.389502762430939, + "grad_norm": 0.27426794171333313, + "learning_rate": 4.613180294145677e-05, + "loss": 1.7909, + "step": 17559 + }, + { + "epoch": 5.389809699201964, + "grad_norm": 0.22420327365398407, + "learning_rate": 4.612684729788957e-05, + "loss": 1.6902, + "step": 17560 + }, + { + "epoch": 5.39011663597299, + "grad_norm": 0.19799382984638214, + "learning_rate": 4.612189169259915e-05, + "loss": 1.7276, + "step": 17561 + }, + { + "epoch": 5.390423572744015, + "grad_norm": 0.2508823573589325, + "learning_rate": 4.611693612563445e-05, + "loss": 1.7445, + "step": 17562 + }, + { + "epoch": 5.3907305095150395, + "grad_norm": 0.20835694670677185, + "learning_rate": 4.611198059704448e-05, + "loss": 1.696, + "step": 17563 + }, + { + "epoch": 5.391037446286065, + "grad_norm": 0.22136010229587555, + "learning_rate": 4.6107025106878176e-05, + "loss": 1.7701, + "step": 17564 + }, + { + "epoch": 5.39134438305709, + "grad_norm": 0.23835612833499908, + 
"learning_rate": 4.610206965518456e-05, + "loss": 1.7494, + "step": 17565 + }, + { + "epoch": 5.3916513198281155, + "grad_norm": 0.26142916083335876, + "learning_rate": 4.6097114242012554e-05, + "loss": 1.7616, + "step": 17566 + }, + { + "epoch": 5.391958256599141, + "grad_norm": 0.3366851806640625, + "learning_rate": 4.6092158867411175e-05, + "loss": 1.7409, + "step": 17567 + }, + { + "epoch": 5.392265193370166, + "grad_norm": 0.2592991292476654, + "learning_rate": 4.608720353142935e-05, + "loss": 1.7469, + "step": 17568 + }, + { + "epoch": 5.392572130141191, + "grad_norm": 0.25810322165489197, + "learning_rate": 4.608224823411608e-05, + "loss": 1.7345, + "step": 17569 + }, + { + "epoch": 5.392879066912216, + "grad_norm": 0.26776888966560364, + "learning_rate": 4.607729297552032e-05, + "loss": 1.7698, + "step": 17570 + }, + { + "epoch": 5.393186003683241, + "grad_norm": 0.21023939549922943, + "learning_rate": 4.607233775569107e-05, + "loss": 1.7681, + "step": 17571 + }, + { + "epoch": 5.393492940454267, + "grad_norm": 0.24452096223831177, + "learning_rate": 4.6067382574677265e-05, + "loss": 1.8154, + "step": 17572 + }, + { + "epoch": 5.393799877225292, + "grad_norm": 0.27084338665008545, + "learning_rate": 4.606242743252791e-05, + "loss": 1.7106, + "step": 17573 + }, + { + "epoch": 5.394106813996316, + "grad_norm": 0.24783825874328613, + "learning_rate": 4.605747232929195e-05, + "loss": 1.713, + "step": 17574 + }, + { + "epoch": 5.394413750767342, + "grad_norm": 0.2528151869773865, + "learning_rate": 4.6052517265018333e-05, + "loss": 1.8475, + "step": 17575 + }, + { + "epoch": 5.394720687538367, + "grad_norm": 0.24361065030097961, + "learning_rate": 4.604756223975609e-05, + "loss": 1.7414, + "step": 17576 + }, + { + "epoch": 5.395027624309392, + "grad_norm": 0.2751234769821167, + "learning_rate": 4.604260725355412e-05, + "loss": 1.7603, + "step": 17577 + }, + { + "epoch": 5.395334561080418, + "grad_norm": 0.23183637857437134, + "learning_rate": 
4.603765230646146e-05, + "loss": 1.7053, + "step": 17578 + }, + { + "epoch": 5.395641497851442, + "grad_norm": 0.27462145686149597, + "learning_rate": 4.6032697398527005e-05, + "loss": 1.746, + "step": 17579 + }, + { + "epoch": 5.3959484346224675, + "grad_norm": 0.3665321171283722, + "learning_rate": 4.602774252979978e-05, + "loss": 1.6883, + "step": 17580 + }, + { + "epoch": 5.396255371393493, + "grad_norm": 0.22438424825668335, + "learning_rate": 4.602278770032872e-05, + "loss": 1.7473, + "step": 17581 + }, + { + "epoch": 5.396562308164518, + "grad_norm": 0.38713687658309937, + "learning_rate": 4.601783291016282e-05, + "loss": 1.7993, + "step": 17582 + }, + { + "epoch": 5.3968692449355435, + "grad_norm": 0.3399868905544281, + "learning_rate": 4.6012878159351015e-05, + "loss": 1.7709, + "step": 17583 + }, + { + "epoch": 5.397176181706568, + "grad_norm": 0.21916119754314423, + "learning_rate": 4.60079234479423e-05, + "loss": 1.7351, + "step": 17584 + }, + { + "epoch": 5.397483118477593, + "grad_norm": 0.3796394467353821, + "learning_rate": 4.600296877598561e-05, + "loss": 1.7534, + "step": 17585 + }, + { + "epoch": 5.397790055248619, + "grad_norm": 0.27824562788009644, + "learning_rate": 4.599801414352993e-05, + "loss": 1.6962, + "step": 17586 + }, + { + "epoch": 5.398096992019644, + "grad_norm": 0.21037112176418304, + "learning_rate": 4.599305955062421e-05, + "loss": 1.7062, + "step": 17587 + }, + { + "epoch": 5.398403928790669, + "grad_norm": 0.3373035192489624, + "learning_rate": 4.598810499731745e-05, + "loss": 1.8263, + "step": 17588 + }, + { + "epoch": 5.398710865561695, + "grad_norm": 0.2560507357120514, + "learning_rate": 4.5983150483658564e-05, + "loss": 1.7232, + "step": 17589 + }, + { + "epoch": 5.399017802332719, + "grad_norm": 0.23010993003845215, + "learning_rate": 4.5978196009696564e-05, + "loss": 1.805, + "step": 17590 + }, + { + "epoch": 5.399324739103744, + "grad_norm": 0.32955634593963623, + "learning_rate": 4.597324157548037e-05, + "loss": 
1.7018, + "step": 17591 + }, + { + "epoch": 5.39963167587477, + "grad_norm": 0.2534363865852356, + "learning_rate": 4.5968287181058953e-05, + "loss": 1.6919, + "step": 17592 + }, + { + "epoch": 5.399938612645795, + "grad_norm": 0.23179130256175995, + "learning_rate": 4.5963332826481314e-05, + "loss": 1.7237, + "step": 17593 + }, + { + "epoch": 5.4002455494168204, + "grad_norm": 0.37712663412094116, + "learning_rate": 4.5958378511796365e-05, + "loss": 1.7694, + "step": 17594 + }, + { + "epoch": 5.400552486187845, + "grad_norm": 0.21228717267513275, + "learning_rate": 4.59534242370531e-05, + "loss": 1.7528, + "step": 17595 + }, + { + "epoch": 5.40085942295887, + "grad_norm": 0.2818812429904938, + "learning_rate": 4.5948470002300454e-05, + "loss": 1.8214, + "step": 17596 + }, + { + "epoch": 5.401166359729896, + "grad_norm": 0.24916675686836243, + "learning_rate": 4.5943515807587415e-05, + "loss": 1.7792, + "step": 17597 + }, + { + "epoch": 5.401473296500921, + "grad_norm": 0.2096913456916809, + "learning_rate": 4.593856165296291e-05, + "loss": 1.6983, + "step": 17598 + }, + { + "epoch": 5.401780233271946, + "grad_norm": 0.271124005317688, + "learning_rate": 4.593360753847595e-05, + "loss": 1.7534, + "step": 17599 + }, + { + "epoch": 5.402087170042972, + "grad_norm": 0.24798092246055603, + "learning_rate": 4.5928653464175435e-05, + "loss": 1.7783, + "step": 17600 + }, + { + "epoch": 5.402394106813996, + "grad_norm": 0.3531748056411743, + "learning_rate": 4.592369943011038e-05, + "loss": 1.7834, + "step": 17601 + }, + { + "epoch": 5.402701043585021, + "grad_norm": 0.29650232195854187, + "learning_rate": 4.591874543632969e-05, + "loss": 1.7186, + "step": 17602 + }, + { + "epoch": 5.403007980356047, + "grad_norm": 0.25578248500823975, + "learning_rate": 4.591379148288236e-05, + "loss": 1.7849, + "step": 17603 + }, + { + "epoch": 5.403314917127072, + "grad_norm": 0.3790532946586609, + "learning_rate": 4.590883756981733e-05, + "loss": 1.7192, + "step": 17604 + }, + { + 
"epoch": 5.403621853898097, + "grad_norm": 0.23684249818325043, + "learning_rate": 4.590388369718359e-05, + "loss": 1.7171, + "step": 17605 + }, + { + "epoch": 5.403928790669122, + "grad_norm": 0.267702579498291, + "learning_rate": 4.589892986503005e-05, + "loss": 1.7181, + "step": 17606 + }, + { + "epoch": 5.404235727440147, + "grad_norm": 0.29105648398399353, + "learning_rate": 4.5893976073405704e-05, + "loss": 1.7395, + "step": 17607 + }, + { + "epoch": 5.4045426642111725, + "grad_norm": 0.2266589254140854, + "learning_rate": 4.588902232235949e-05, + "loss": 1.7244, + "step": 17608 + }, + { + "epoch": 5.404849600982198, + "grad_norm": 0.24065524339675903, + "learning_rate": 4.588406861194035e-05, + "loss": 1.7398, + "step": 17609 + }, + { + "epoch": 5.405156537753223, + "grad_norm": 0.23166650533676147, + "learning_rate": 4.587911494219728e-05, + "loss": 1.7592, + "step": 17610 + }, + { + "epoch": 5.4054634745242485, + "grad_norm": 0.19882038235664368, + "learning_rate": 4.5874161313179186e-05, + "loss": 1.7087, + "step": 17611 + }, + { + "epoch": 5.405770411295273, + "grad_norm": 0.2688273787498474, + "learning_rate": 4.5869207724935076e-05, + "loss": 1.7791, + "step": 17612 + }, + { + "epoch": 5.406077348066298, + "grad_norm": 0.1970982402563095, + "learning_rate": 4.5864254177513855e-05, + "loss": 1.7079, + "step": 17613 + }, + { + "epoch": 5.406384284837324, + "grad_norm": 0.2531265318393707, + "learning_rate": 4.585930067096451e-05, + "loss": 1.716, + "step": 17614 + }, + { + "epoch": 5.406691221608349, + "grad_norm": 0.2610352337360382, + "learning_rate": 4.585434720533596e-05, + "loss": 1.7133, + "step": 17615 + }, + { + "epoch": 5.406998158379374, + "grad_norm": 0.2420870065689087, + "learning_rate": 4.5849393780677216e-05, + "loss": 1.7044, + "step": 17616 + }, + { + "epoch": 5.407305095150399, + "grad_norm": 0.24078647792339325, + "learning_rate": 4.584444039703717e-05, + "loss": 1.7486, + "step": 17617 + }, + { + "epoch": 5.407612031921424, + 
"grad_norm": 0.19324539601802826, + "learning_rate": 4.583948705446481e-05, + "loss": 1.7439, + "step": 17618 + }, + { + "epoch": 5.407918968692449, + "grad_norm": 0.2311750054359436, + "learning_rate": 4.5834533753009065e-05, + "loss": 1.7794, + "step": 17619 + }, + { + "epoch": 5.408225905463475, + "grad_norm": 0.2554466128349304, + "learning_rate": 4.5829580492718914e-05, + "loss": 1.7146, + "step": 17620 + }, + { + "epoch": 5.4085328422345, + "grad_norm": 0.2679688334465027, + "learning_rate": 4.582462727364328e-05, + "loss": 1.7677, + "step": 17621 + }, + { + "epoch": 5.4088397790055245, + "grad_norm": 0.19292913377285004, + "learning_rate": 4.5819674095831146e-05, + "loss": 1.7544, + "step": 17622 + }, + { + "epoch": 5.40914671577655, + "grad_norm": 0.2146623730659485, + "learning_rate": 4.5814720959331425e-05, + "loss": 1.7182, + "step": 17623 + }, + { + "epoch": 5.409453652547575, + "grad_norm": 0.23098216950893402, + "learning_rate": 4.5809767864193096e-05, + "loss": 1.6844, + "step": 17624 + }, + { + "epoch": 5.4097605893186005, + "grad_norm": 0.22482910752296448, + "learning_rate": 4.5804814810465096e-05, + "loss": 1.7921, + "step": 17625 + }, + { + "epoch": 5.410067526089626, + "grad_norm": 0.22098569571971893, + "learning_rate": 4.579986179819636e-05, + "loss": 1.7419, + "step": 17626 + }, + { + "epoch": 5.41037446286065, + "grad_norm": 0.2131706178188324, + "learning_rate": 4.579490882743588e-05, + "loss": 1.7587, + "step": 17627 + }, + { + "epoch": 5.410681399631676, + "grad_norm": 0.22448734939098358, + "learning_rate": 4.578995589823254e-05, + "loss": 1.6959, + "step": 17628 + }, + { + "epoch": 5.410988336402701, + "grad_norm": 0.22372964024543762, + "learning_rate": 4.578500301063536e-05, + "loss": 1.7462, + "step": 17629 + }, + { + "epoch": 5.411295273173726, + "grad_norm": 0.22140730917453766, + "learning_rate": 4.578005016469322e-05, + "loss": 1.8348, + "step": 17630 + }, + { + "epoch": 5.411602209944752, + "grad_norm": 0.21697622537612915, + 
"learning_rate": 4.577509736045511e-05, + "loss": 1.7634, + "step": 17631 + }, + { + "epoch": 5.411909146715777, + "grad_norm": 0.2044363021850586, + "learning_rate": 4.5770144597969954e-05, + "loss": 1.7095, + "step": 17632 + }, + { + "epoch": 5.412216083486801, + "grad_norm": 0.1910451501607895, + "learning_rate": 4.576519187728674e-05, + "loss": 1.7022, + "step": 17633 + }, + { + "epoch": 5.412523020257827, + "grad_norm": 0.21787554025650024, + "learning_rate": 4.576023919845434e-05, + "loss": 1.7206, + "step": 17634 + }, + { + "epoch": 5.412829957028852, + "grad_norm": 0.2363428920507431, + "learning_rate": 4.575528656152178e-05, + "loss": 1.8052, + "step": 17635 + }, + { + "epoch": 5.413136893799877, + "grad_norm": 0.22830195724964142, + "learning_rate": 4.575033396653793e-05, + "loss": 1.7432, + "step": 17636 + }, + { + "epoch": 5.413443830570903, + "grad_norm": 0.24867239594459534, + "learning_rate": 4.5745381413551794e-05, + "loss": 1.7011, + "step": 17637 + }, + { + "epoch": 5.413750767341927, + "grad_norm": 0.19329775869846344, + "learning_rate": 4.574042890261228e-05, + "loss": 1.7749, + "step": 17638 + }, + { + "epoch": 5.4140577041129525, + "grad_norm": 0.22917115688323975, + "learning_rate": 4.573547643376836e-05, + "loss": 1.7478, + "step": 17639 + }, + { + "epoch": 5.414364640883978, + "grad_norm": 0.23882724344730377, + "learning_rate": 4.573052400706894e-05, + "loss": 1.7396, + "step": 17640 + }, + { + "epoch": 5.414671577655003, + "grad_norm": 0.19127070903778076, + "learning_rate": 4.572557162256301e-05, + "loss": 1.6791, + "step": 17641 + }, + { + "epoch": 5.4149785144260285, + "grad_norm": 0.18385560810565948, + "learning_rate": 4.5720619280299475e-05, + "loss": 1.7288, + "step": 17642 + }, + { + "epoch": 5.415285451197054, + "grad_norm": 0.19845189154148102, + "learning_rate": 4.571566698032728e-05, + "loss": 1.7525, + "step": 17643 + }, + { + "epoch": 5.415592387968078, + "grad_norm": 0.18987210094928741, + "learning_rate": 
4.571071472269539e-05, + "loss": 1.7253, + "step": 17644 + }, + { + "epoch": 5.415899324739104, + "grad_norm": 0.18257199227809906, + "learning_rate": 4.570576250745271e-05, + "loss": 1.7051, + "step": 17645 + }, + { + "epoch": 5.416206261510129, + "grad_norm": 0.22803467512130737, + "learning_rate": 4.570081033464823e-05, + "loss": 1.7478, + "step": 17646 + }, + { + "epoch": 5.416513198281154, + "grad_norm": 0.18763841688632965, + "learning_rate": 4.569585820433084e-05, + "loss": 1.7316, + "step": 17647 + }, + { + "epoch": 5.41682013505218, + "grad_norm": 0.23974654078483582, + "learning_rate": 4.56909061165495e-05, + "loss": 1.7566, + "step": 17648 + }, + { + "epoch": 5.417127071823204, + "grad_norm": 0.24336253106594086, + "learning_rate": 4.568595407135315e-05, + "loss": 1.7468, + "step": 17649 + }, + { + "epoch": 5.417434008594229, + "grad_norm": 0.23891226947307587, + "learning_rate": 4.5681002068790755e-05, + "loss": 1.7201, + "step": 17650 + }, + { + "epoch": 5.417740945365255, + "grad_norm": 0.19209685921669006, + "learning_rate": 4.56760501089112e-05, + "loss": 1.713, + "step": 17651 + }, + { + "epoch": 5.41804788213628, + "grad_norm": 0.2407880276441574, + "learning_rate": 4.567109819176349e-05, + "loss": 1.7073, + "step": 17652 + }, + { + "epoch": 5.418354818907305, + "grad_norm": 0.2385055273771286, + "learning_rate": 4.5666146317396485e-05, + "loss": 1.7387, + "step": 17653 + }, + { + "epoch": 5.41866175567833, + "grad_norm": 0.22068475186824799, + "learning_rate": 4.566119448585918e-05, + "loss": 1.7116, + "step": 17654 + }, + { + "epoch": 5.418968692449355, + "grad_norm": 0.318375825881958, + "learning_rate": 4.5656242697200496e-05, + "loss": 1.7659, + "step": 17655 + }, + { + "epoch": 5.4192756292203805, + "grad_norm": 0.25311973690986633, + "learning_rate": 4.5651290951469366e-05, + "loss": 1.7814, + "step": 17656 + }, + { + "epoch": 5.419582565991406, + "grad_norm": 0.18701443076133728, + "learning_rate": 4.5646339248714735e-05, + "loss": 1.6993, 
+ "step": 17657 + }, + { + "epoch": 5.419889502762431, + "grad_norm": 0.2964496314525604, + "learning_rate": 4.5641387588985516e-05, + "loss": 1.8254, + "step": 17658 + }, + { + "epoch": 5.420196439533456, + "grad_norm": 0.19447220861911774, + "learning_rate": 4.563643597233067e-05, + "loss": 1.7208, + "step": 17659 + }, + { + "epoch": 5.420503376304481, + "grad_norm": 0.21666039526462555, + "learning_rate": 4.5631484398799105e-05, + "loss": 1.6695, + "step": 17660 + }, + { + "epoch": 5.420810313075506, + "grad_norm": 0.23104412853717804, + "learning_rate": 4.5626532868439796e-05, + "loss": 1.7449, + "step": 17661 + }, + { + "epoch": 5.421117249846532, + "grad_norm": 0.20463459193706512, + "learning_rate": 4.562158138130163e-05, + "loss": 1.6714, + "step": 17662 + }, + { + "epoch": 5.421424186617557, + "grad_norm": 0.21948079764842987, + "learning_rate": 4.561662993743359e-05, + "loss": 1.6957, + "step": 17663 + }, + { + "epoch": 5.421731123388582, + "grad_norm": 0.2672746777534485, + "learning_rate": 4.561167853688455e-05, + "loss": 1.7137, + "step": 17664 + }, + { + "epoch": 5.422038060159607, + "grad_norm": 0.2652325928211212, + "learning_rate": 4.5606727179703493e-05, + "loss": 1.7943, + "step": 17665 + }, + { + "epoch": 5.422344996930632, + "grad_norm": 0.17761313915252686, + "learning_rate": 4.560177586593933e-05, + "loss": 1.7072, + "step": 17666 + }, + { + "epoch": 5.422651933701657, + "grad_norm": 0.24759770929813385, + "learning_rate": 4.5596824595641e-05, + "loss": 1.7807, + "step": 17667 + }, + { + "epoch": 5.422958870472683, + "grad_norm": 0.22191929817199707, + "learning_rate": 4.5591873368857416e-05, + "loss": 1.7668, + "step": 17668 + }, + { + "epoch": 5.423265807243708, + "grad_norm": 0.21293842792510986, + "learning_rate": 4.5586922185637546e-05, + "loss": 1.7304, + "step": 17669 + }, + { + "epoch": 5.4235727440147325, + "grad_norm": 0.2646051049232483, + "learning_rate": 4.5581971046030277e-05, + "loss": 1.7258, + "step": 17670 + }, + { + 
"epoch": 5.423879680785758, + "grad_norm": 0.1894550621509552, + "learning_rate": 4.5577019950084574e-05, + "loss": 1.7066, + "step": 17671 + }, + { + "epoch": 5.424186617556783, + "grad_norm": 0.2533467710018158, + "learning_rate": 4.557206889784934e-05, + "loss": 1.7668, + "step": 17672 + }, + { + "epoch": 5.4244935543278086, + "grad_norm": 0.1972150355577469, + "learning_rate": 4.556711788937352e-05, + "loss": 1.7306, + "step": 17673 + }, + { + "epoch": 5.424800491098834, + "grad_norm": 0.2726735472679138, + "learning_rate": 4.5562166924706054e-05, + "loss": 1.7281, + "step": 17674 + }, + { + "epoch": 5.425107427869859, + "grad_norm": 0.2244454175233841, + "learning_rate": 4.555721600389584e-05, + "loss": 1.7461, + "step": 17675 + }, + { + "epoch": 5.425414364640884, + "grad_norm": 0.19486510753631592, + "learning_rate": 4.555226512699182e-05, + "loss": 1.7361, + "step": 17676 + }, + { + "epoch": 5.425721301411909, + "grad_norm": 0.18128283321857452, + "learning_rate": 4.554731429404293e-05, + "loss": 1.7637, + "step": 17677 + }, + { + "epoch": 5.426028238182934, + "grad_norm": 0.24709749221801758, + "learning_rate": 4.5542363505098084e-05, + "loss": 1.7928, + "step": 17678 + }, + { + "epoch": 5.42633517495396, + "grad_norm": 0.2236633151769638, + "learning_rate": 4.553741276020621e-05, + "loss": 1.8262, + "step": 17679 + }, + { + "epoch": 5.426642111724985, + "grad_norm": 0.2592087984085083, + "learning_rate": 4.553246205941626e-05, + "loss": 1.675, + "step": 17680 + }, + { + "epoch": 5.4269490484960095, + "grad_norm": 0.27751871943473816, + "learning_rate": 4.552751140277712e-05, + "loss": 1.7344, + "step": 17681 + }, + { + "epoch": 5.427255985267035, + "grad_norm": 0.23752287030220032, + "learning_rate": 4.5522560790337746e-05, + "loss": 1.7748, + "step": 17682 + }, + { + "epoch": 5.42756292203806, + "grad_norm": 0.3259925842285156, + "learning_rate": 4.5517610222147035e-05, + "loss": 1.7855, + "step": 17683 + }, + { + "epoch": 5.4278698588090855, + 
"grad_norm": 0.2579646706581116, + "learning_rate": 4.551265969825394e-05, + "loss": 1.7978, + "step": 17684 + }, + { + "epoch": 5.428176795580111, + "grad_norm": 0.3217744827270508, + "learning_rate": 4.550770921870735e-05, + "loss": 1.7793, + "step": 17685 + }, + { + "epoch": 5.428483732351136, + "grad_norm": 0.2930903434753418, + "learning_rate": 4.550275878355624e-05, + "loss": 1.7226, + "step": 17686 + }, + { + "epoch": 5.428790669122161, + "grad_norm": 0.1982879489660263, + "learning_rate": 4.549780839284948e-05, + "loss": 1.6841, + "step": 17687 + }, + { + "epoch": 5.429097605893186, + "grad_norm": 0.20843900740146637, + "learning_rate": 4.5492858046636046e-05, + "loss": 1.7201, + "step": 17688 + }, + { + "epoch": 5.429404542664211, + "grad_norm": 0.23116534948349, + "learning_rate": 4.5487907744964794e-05, + "loss": 1.7565, + "step": 17689 + }, + { + "epoch": 5.429711479435237, + "grad_norm": 0.19177772104740143, + "learning_rate": 4.548295748788471e-05, + "loss": 1.7479, + "step": 17690 + }, + { + "epoch": 5.430018416206262, + "grad_norm": 0.22261449694633484, + "learning_rate": 4.547800727544469e-05, + "loss": 1.7785, + "step": 17691 + }, + { + "epoch": 5.430325352977286, + "grad_norm": 0.20073406398296356, + "learning_rate": 4.547305710769363e-05, + "loss": 1.741, + "step": 17692 + }, + { + "epoch": 5.430632289748312, + "grad_norm": 0.21662208437919617, + "learning_rate": 4.546810698468049e-05, + "loss": 1.7269, + "step": 17693 + }, + { + "epoch": 5.430939226519337, + "grad_norm": 0.19540879130363464, + "learning_rate": 4.546315690645416e-05, + "loss": 1.7141, + "step": 17694 + }, + { + "epoch": 5.431246163290362, + "grad_norm": 0.20063656568527222, + "learning_rate": 4.545820687306358e-05, + "loss": 1.7244, + "step": 17695 + }, + { + "epoch": 5.431553100061388, + "grad_norm": 0.2172660082578659, + "learning_rate": 4.545325688455765e-05, + "loss": 1.7172, + "step": 17696 + }, + { + "epoch": 5.431860036832412, + "grad_norm": 0.2480388581752777, + 
"learning_rate": 4.5448306940985326e-05, + "loss": 1.6994, + "step": 17697 + }, + { + "epoch": 5.4321669736034375, + "grad_norm": 0.22499477863311768, + "learning_rate": 4.544335704239547e-05, + "loss": 1.7405, + "step": 17698 + }, + { + "epoch": 5.432473910374463, + "grad_norm": 0.20655590295791626, + "learning_rate": 4.5438407188837065e-05, + "loss": 1.6867, + "step": 17699 + }, + { + "epoch": 5.432780847145488, + "grad_norm": 0.2045906037092209, + "learning_rate": 4.543345738035896e-05, + "loss": 1.7752, + "step": 17700 + }, + { + "epoch": 5.4330877839165135, + "grad_norm": 0.2092052847146988, + "learning_rate": 4.542850761701013e-05, + "loss": 1.7389, + "step": 17701 + }, + { + "epoch": 5.433394720687538, + "grad_norm": 0.1943730264902115, + "learning_rate": 4.5423557898839446e-05, + "loss": 1.7276, + "step": 17702 + }, + { + "epoch": 5.433701657458563, + "grad_norm": 0.23487289249897003, + "learning_rate": 4.541860822589587e-05, + "loss": 1.8119, + "step": 17703 + }, + { + "epoch": 5.434008594229589, + "grad_norm": 0.204689159989357, + "learning_rate": 4.541365859822827e-05, + "loss": 1.7865, + "step": 17704 + }, + { + "epoch": 5.434315531000614, + "grad_norm": 0.20850931107997894, + "learning_rate": 4.5408709015885604e-05, + "loss": 1.7733, + "step": 17705 + }, + { + "epoch": 5.434622467771639, + "grad_norm": 0.18685877323150635, + "learning_rate": 4.540375947891675e-05, + "loss": 1.7526, + "step": 17706 + }, + { + "epoch": 5.434929404542665, + "grad_norm": 0.2009890079498291, + "learning_rate": 4.539880998737064e-05, + "loss": 1.6904, + "step": 17707 + }, + { + "epoch": 5.435236341313689, + "grad_norm": 0.16602718830108643, + "learning_rate": 4.5393860541296205e-05, + "loss": 1.689, + "step": 17708 + }, + { + "epoch": 5.435543278084714, + "grad_norm": 0.24318818747997284, + "learning_rate": 4.5388911140742315e-05, + "loss": 1.7993, + "step": 17709 + }, + { + "epoch": 5.43585021485574, + "grad_norm": 0.24094417691230774, + "learning_rate": 
4.538396178575793e-05, + "loss": 1.7235, + "step": 17710 + }, + { + "epoch": 5.436157151626765, + "grad_norm": 0.20361751317977905, + "learning_rate": 4.537901247639192e-05, + "loss": 1.7198, + "step": 17711 + }, + { + "epoch": 5.43646408839779, + "grad_norm": 0.2563718259334564, + "learning_rate": 4.537406321269323e-05, + "loss": 1.795, + "step": 17712 + }, + { + "epoch": 5.436771025168815, + "grad_norm": 0.29895591735839844, + "learning_rate": 4.536911399471075e-05, + "loss": 1.7515, + "step": 17713 + }, + { + "epoch": 5.43707796193984, + "grad_norm": 0.22535841166973114, + "learning_rate": 4.536416482249342e-05, + "loss": 1.6998, + "step": 17714 + }, + { + "epoch": 5.4373848987108655, + "grad_norm": 0.26025068759918213, + "learning_rate": 4.53592156960901e-05, + "loss": 1.7821, + "step": 17715 + }, + { + "epoch": 5.437691835481891, + "grad_norm": 0.3473168611526489, + "learning_rate": 4.535426661554975e-05, + "loss": 1.7035, + "step": 17716 + }, + { + "epoch": 5.437998772252916, + "grad_norm": 0.22207199037075043, + "learning_rate": 4.534931758092126e-05, + "loss": 1.7485, + "step": 17717 + }, + { + "epoch": 5.4383057090239415, + "grad_norm": 0.26839709281921387, + "learning_rate": 4.534436859225353e-05, + "loss": 1.7272, + "step": 17718 + }, + { + "epoch": 5.438612645794966, + "grad_norm": 0.37715891003608704, + "learning_rate": 4.5339419649595476e-05, + "loss": 1.7254, + "step": 17719 + }, + { + "epoch": 5.438919582565991, + "grad_norm": 0.21485768258571625, + "learning_rate": 4.533447075299603e-05, + "loss": 1.7349, + "step": 17720 + }, + { + "epoch": 5.439226519337017, + "grad_norm": 0.29502415657043457, + "learning_rate": 4.5329521902504055e-05, + "loss": 1.7325, + "step": 17721 + }, + { + "epoch": 5.439533456108042, + "grad_norm": 0.29448410868644714, + "learning_rate": 4.5324573098168505e-05, + "loss": 1.768, + "step": 17722 + }, + { + "epoch": 5.439840392879067, + "grad_norm": 0.1892058402299881, + "learning_rate": 4.5319624340038244e-05, + "loss": 
1.6866, + "step": 17723 + }, + { + "epoch": 5.440147329650092, + "grad_norm": 0.3365040123462677, + "learning_rate": 4.531467562816221e-05, + "loss": 1.7662, + "step": 17724 + }, + { + "epoch": 5.440454266421117, + "grad_norm": 0.2960789203643799, + "learning_rate": 4.53097269625893e-05, + "loss": 1.746, + "step": 17725 + }, + { + "epoch": 5.440761203192142, + "grad_norm": 0.21623700857162476, + "learning_rate": 4.530477834336841e-05, + "loss": 1.7619, + "step": 17726 + }, + { + "epoch": 5.441068139963168, + "grad_norm": 0.29010120034217834, + "learning_rate": 4.5299829770548456e-05, + "loss": 1.717, + "step": 17727 + }, + { + "epoch": 5.441375076734193, + "grad_norm": 0.18467605113983154, + "learning_rate": 4.529488124417833e-05, + "loss": 1.6938, + "step": 17728 + }, + { + "epoch": 5.4416820135052175, + "grad_norm": 0.2875411808490753, + "learning_rate": 4.528993276430695e-05, + "loss": 1.7633, + "step": 17729 + }, + { + "epoch": 5.441988950276243, + "grad_norm": 0.24252675473690033, + "learning_rate": 4.528498433098321e-05, + "loss": 1.6477, + "step": 17730 + }, + { + "epoch": 5.442295887047268, + "grad_norm": 0.18885886669158936, + "learning_rate": 4.5280035944256035e-05, + "loss": 1.7241, + "step": 17731 + }, + { + "epoch": 5.4426028238182935, + "grad_norm": 0.2594204246997833, + "learning_rate": 4.527508760417429e-05, + "loss": 1.6697, + "step": 17732 + }, + { + "epoch": 5.442909760589319, + "grad_norm": 0.23796287178993225, + "learning_rate": 4.527013931078692e-05, + "loss": 1.7035, + "step": 17733 + }, + { + "epoch": 5.443216697360343, + "grad_norm": 0.2591552436351776, + "learning_rate": 4.5265191064142787e-05, + "loss": 1.8014, + "step": 17734 + }, + { + "epoch": 5.443523634131369, + "grad_norm": 0.3316073417663574, + "learning_rate": 4.526024286429082e-05, + "loss": 1.752, + "step": 17735 + }, + { + "epoch": 5.443830570902394, + "grad_norm": 0.2409597635269165, + "learning_rate": 4.52552947112799e-05, + "loss": 1.7662, + "step": 17736 + }, + { + "epoch": 
5.444137507673419, + "grad_norm": 0.2896713614463806, + "learning_rate": 4.5250346605158964e-05, + "loss": 1.7168, + "step": 17737 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.30870527029037476, + "learning_rate": 4.524539854597686e-05, + "loss": 1.704, + "step": 17738 + }, + { + "epoch": 5.44475138121547, + "grad_norm": 0.2476932406425476, + "learning_rate": 4.524045053378254e-05, + "loss": 1.7649, + "step": 17739 + }, + { + "epoch": 5.445058317986494, + "grad_norm": 0.2937077283859253, + "learning_rate": 4.5235502568624855e-05, + "loss": 1.7028, + "step": 17740 + }, + { + "epoch": 5.44536525475752, + "grad_norm": 0.22881117463111877, + "learning_rate": 4.523055465055273e-05, + "loss": 1.7539, + "step": 17741 + }, + { + "epoch": 5.445672191528545, + "grad_norm": 0.2551842927932739, + "learning_rate": 4.522560677961508e-05, + "loss": 1.7601, + "step": 17742 + }, + { + "epoch": 5.44597912829957, + "grad_norm": 0.27533504366874695, + "learning_rate": 4.5220658955860754e-05, + "loss": 1.7695, + "step": 17743 + }, + { + "epoch": 5.446286065070596, + "grad_norm": 0.23387418687343597, + "learning_rate": 4.5215711179338706e-05, + "loss": 1.7218, + "step": 17744 + }, + { + "epoch": 5.44659300184162, + "grad_norm": 0.37932485342025757, + "learning_rate": 4.521076345009777e-05, + "loss": 1.7685, + "step": 17745 + }, + { + "epoch": 5.4468999386126455, + "grad_norm": 0.2668898105621338, + "learning_rate": 4.520581576818691e-05, + "loss": 1.7217, + "step": 17746 + }, + { + "epoch": 5.447206875383671, + "grad_norm": 0.2417856752872467, + "learning_rate": 4.520086813365496e-05, + "loss": 1.692, + "step": 17747 + }, + { + "epoch": 5.447513812154696, + "grad_norm": 0.3170008063316345, + "learning_rate": 4.519592054655086e-05, + "loss": 1.7565, + "step": 17748 + }, + { + "epoch": 5.4478207489257215, + "grad_norm": 0.20711660385131836, + "learning_rate": 4.519097300692348e-05, + "loss": 1.6708, + "step": 17749 + }, + { + "epoch": 5.448127685696747, + "grad_norm": 
0.2196272760629654, + "learning_rate": 4.5186025514821746e-05, + "loss": 1.7335, + "step": 17750 + }, + { + "epoch": 5.448434622467771, + "grad_norm": 0.27563074231147766, + "learning_rate": 4.5181078070294505e-05, + "loss": 1.7383, + "step": 17751 + }, + { + "epoch": 5.448741559238797, + "grad_norm": 0.185418501496315, + "learning_rate": 4.517613067339068e-05, + "loss": 1.6841, + "step": 17752 + }, + { + "epoch": 5.449048496009822, + "grad_norm": 0.26787856221199036, + "learning_rate": 4.517118332415915e-05, + "loss": 1.7733, + "step": 17753 + }, + { + "epoch": 5.449355432780847, + "grad_norm": 0.22114823758602142, + "learning_rate": 4.516623602264885e-05, + "loss": 1.7153, + "step": 17754 + }, + { + "epoch": 5.449662369551873, + "grad_norm": 0.23090483248233795, + "learning_rate": 4.51612887689086e-05, + "loss": 1.7063, + "step": 17755 + }, + { + "epoch": 5.449969306322897, + "grad_norm": 0.3227362632751465, + "learning_rate": 4.515634156298736e-05, + "loss": 1.7528, + "step": 17756 + }, + { + "epoch": 5.4502762430939224, + "grad_norm": 0.24202494323253632, + "learning_rate": 4.515139440493397e-05, + "loss": 1.8119, + "step": 17757 + }, + { + "epoch": 5.450583179864948, + "grad_norm": 0.3778383731842041, + "learning_rate": 4.5146447294797356e-05, + "loss": 1.7589, + "step": 17758 + }, + { + "epoch": 5.450890116635973, + "grad_norm": 0.3726772964000702, + "learning_rate": 4.51415002326264e-05, + "loss": 1.7095, + "step": 17759 + }, + { + "epoch": 5.4511970534069984, + "grad_norm": 0.2424323409795761, + "learning_rate": 4.5136553218469966e-05, + "loss": 1.7374, + "step": 17760 + }, + { + "epoch": 5.451503990178024, + "grad_norm": 0.4347550570964813, + "learning_rate": 4.513160625237699e-05, + "loss": 1.8339, + "step": 17761 + }, + { + "epoch": 5.451810926949048, + "grad_norm": 0.2556018829345703, + "learning_rate": 4.512665933439631e-05, + "loss": 1.7024, + "step": 17762 + }, + { + "epoch": 5.452117863720074, + "grad_norm": 0.36380240321159363, + "learning_rate": 
4.512171246457685e-05, + "loss": 1.7706, + "step": 17763 + }, + { + "epoch": 5.452424800491099, + "grad_norm": 0.42120790481567383, + "learning_rate": 4.5116765642967476e-05, + "loss": 1.7609, + "step": 17764 + }, + { + "epoch": 5.452731737262124, + "grad_norm": 0.20573028922080994, + "learning_rate": 4.51118188696171e-05, + "loss": 1.7521, + "step": 17765 + }, + { + "epoch": 5.45303867403315, + "grad_norm": 0.39001402258872986, + "learning_rate": 4.510687214457458e-05, + "loss": 1.7097, + "step": 17766 + }, + { + "epoch": 5.453345610804174, + "grad_norm": 0.2778739333152771, + "learning_rate": 4.510192546788884e-05, + "loss": 1.7677, + "step": 17767 + }, + { + "epoch": 5.453652547575199, + "grad_norm": 0.2500934600830078, + "learning_rate": 4.509697883960872e-05, + "loss": 1.7322, + "step": 17768 + }, + { + "epoch": 5.453959484346225, + "grad_norm": 0.23733557760715485, + "learning_rate": 4.509203225978314e-05, + "loss": 1.7426, + "step": 17769 + }, + { + "epoch": 5.45426642111725, + "grad_norm": 0.20033739507198334, + "learning_rate": 4.508708572846096e-05, + "loss": 1.7093, + "step": 17770 + }, + { + "epoch": 5.454573357888275, + "grad_norm": 0.202667698264122, + "learning_rate": 4.508213924569111e-05, + "loss": 1.6807, + "step": 17771 + }, + { + "epoch": 5.4548802946593, + "grad_norm": 0.1980566531419754, + "learning_rate": 4.507719281152241e-05, + "loss": 1.7102, + "step": 17772 + }, + { + "epoch": 5.455187231430325, + "grad_norm": 0.20612162351608276, + "learning_rate": 4.507224642600381e-05, + "loss": 1.7692, + "step": 17773 + }, + { + "epoch": 5.4554941682013505, + "grad_norm": 0.22859175503253937, + "learning_rate": 4.506730008918412e-05, + "loss": 1.7887, + "step": 17774 + }, + { + "epoch": 5.455801104972376, + "grad_norm": 0.19720709323883057, + "learning_rate": 4.5062353801112285e-05, + "loss": 1.7557, + "step": 17775 + }, + { + "epoch": 5.456108041743401, + "grad_norm": 0.23289217054843903, + "learning_rate": 4.505740756183717e-05, + "loss": 1.7023, + 
"step": 17776 + }, + { + "epoch": 5.456414978514426, + "grad_norm": 0.2120361477136612, + "learning_rate": 4.505246137140763e-05, + "loss": 1.7249, + "step": 17777 + }, + { + "epoch": 5.456721915285451, + "grad_norm": 0.2094341218471527, + "learning_rate": 4.504751522987259e-05, + "loss": 1.7586, + "step": 17778 + }, + { + "epoch": 5.457028852056476, + "grad_norm": 0.22361092269420624, + "learning_rate": 4.504256913728088e-05, + "loss": 1.737, + "step": 17779 + }, + { + "epoch": 5.457335788827502, + "grad_norm": 0.2100353240966797, + "learning_rate": 4.5037623093681424e-05, + "loss": 1.704, + "step": 17780 + }, + { + "epoch": 5.457642725598527, + "grad_norm": 0.20550231635570526, + "learning_rate": 4.503267709912308e-05, + "loss": 1.7732, + "step": 17781 + }, + { + "epoch": 5.457949662369552, + "grad_norm": 0.22843749821186066, + "learning_rate": 4.502773115365474e-05, + "loss": 1.6916, + "step": 17782 + }, + { + "epoch": 5.458256599140577, + "grad_norm": 0.2351907640695572, + "learning_rate": 4.502278525732526e-05, + "loss": 1.8043, + "step": 17783 + }, + { + "epoch": 5.458563535911602, + "grad_norm": 0.271028071641922, + "learning_rate": 4.501783941018355e-05, + "loss": 1.7665, + "step": 17784 + }, + { + "epoch": 5.458870472682627, + "grad_norm": 0.1974802166223526, + "learning_rate": 4.501289361227846e-05, + "loss": 1.718, + "step": 17785 + }, + { + "epoch": 5.459177409453653, + "grad_norm": 0.23726068437099457, + "learning_rate": 4.5007947863658884e-05, + "loss": 1.7507, + "step": 17786 + }, + { + "epoch": 5.459484346224678, + "grad_norm": 0.2112259715795517, + "learning_rate": 4.5003002164373684e-05, + "loss": 1.8116, + "step": 17787 + }, + { + "epoch": 5.4597912829957025, + "grad_norm": 0.2676105201244354, + "learning_rate": 4.4998056514471764e-05, + "loss": 1.7013, + "step": 17788 + }, + { + "epoch": 5.460098219766728, + "grad_norm": 0.2735576033592224, + "learning_rate": 4.4993110914001956e-05, + "loss": 1.7516, + "step": 17789 + }, + { + "epoch": 
5.460405156537753, + "grad_norm": 0.1925152987241745, + "learning_rate": 4.498816536301319e-05, + "loss": 1.7018, + "step": 17790 + }, + { + "epoch": 5.4607120933087785, + "grad_norm": 0.25037717819213867, + "learning_rate": 4.498321986155429e-05, + "loss": 1.7207, + "step": 17791 + }, + { + "epoch": 5.461019030079804, + "grad_norm": 0.20481008291244507, + "learning_rate": 4.497827440967415e-05, + "loss": 1.6988, + "step": 17792 + }, + { + "epoch": 5.461325966850829, + "grad_norm": 0.19434049725532532, + "learning_rate": 4.4973329007421673e-05, + "loss": 1.7363, + "step": 17793 + }, + { + "epoch": 5.461632903621854, + "grad_norm": 0.21797434985637665, + "learning_rate": 4.496838365484567e-05, + "loss": 1.7218, + "step": 17794 + }, + { + "epoch": 5.461939840392879, + "grad_norm": 0.18477453291416168, + "learning_rate": 4.496343835199508e-05, + "loss": 1.7204, + "step": 17795 + }, + { + "epoch": 5.462246777163904, + "grad_norm": 0.21657803654670715, + "learning_rate": 4.495849309891872e-05, + "loss": 1.7671, + "step": 17796 + }, + { + "epoch": 5.46255371393493, + "grad_norm": 0.21027342975139618, + "learning_rate": 4.495354789566549e-05, + "loss": 1.7424, + "step": 17797 + }, + { + "epoch": 5.462860650705955, + "grad_norm": 0.2016189992427826, + "learning_rate": 4.4948602742284256e-05, + "loss": 1.7706, + "step": 17798 + }, + { + "epoch": 5.463167587476979, + "grad_norm": 0.2155935913324356, + "learning_rate": 4.494365763882391e-05, + "loss": 1.7314, + "step": 17799 + }, + { + "epoch": 5.463474524248005, + "grad_norm": 0.22079701721668243, + "learning_rate": 4.493871258533328e-05, + "loss": 1.7938, + "step": 17800 + }, + { + "epoch": 5.46378146101903, + "grad_norm": 0.1907699704170227, + "learning_rate": 4.4933767581861283e-05, + "loss": 1.6958, + "step": 17801 + }, + { + "epoch": 5.464088397790055, + "grad_norm": 0.2784879207611084, + "learning_rate": 4.4928822628456735e-05, + "loss": 1.7285, + "step": 17802 + }, + { + "epoch": 5.464395334561081, + "grad_norm": 
0.29470255970954895, + "learning_rate": 4.492387772516855e-05, + "loss": 1.7363, + "step": 17803 + }, + { + "epoch": 5.464702271332105, + "grad_norm": 0.21387436985969543, + "learning_rate": 4.4918932872045575e-05, + "loss": 1.7414, + "step": 17804 + }, + { + "epoch": 5.4650092081031305, + "grad_norm": 0.3102552890777588, + "learning_rate": 4.49139880691367e-05, + "loss": 1.7359, + "step": 17805 + }, + { + "epoch": 5.465316144874156, + "grad_norm": 0.2312939465045929, + "learning_rate": 4.490904331649075e-05, + "loss": 1.7609, + "step": 17806 + }, + { + "epoch": 5.465623081645181, + "grad_norm": 0.323913037776947, + "learning_rate": 4.4904098614156645e-05, + "loss": 1.7693, + "step": 17807 + }, + { + "epoch": 5.4659300184162065, + "grad_norm": 0.2975599467754364, + "learning_rate": 4.48991539621832e-05, + "loss": 1.7506, + "step": 17808 + }, + { + "epoch": 5.466236955187231, + "grad_norm": 0.24702571332454681, + "learning_rate": 4.4894209360619316e-05, + "loss": 1.8258, + "step": 17809 + }, + { + "epoch": 5.466543891958256, + "grad_norm": 0.29016581177711487, + "learning_rate": 4.488926480951386e-05, + "loss": 1.7096, + "step": 17810 + }, + { + "epoch": 5.466850828729282, + "grad_norm": 0.2194555252790451, + "learning_rate": 4.488432030891566e-05, + "loss": 1.788, + "step": 17811 + }, + { + "epoch": 5.467157765500307, + "grad_norm": 0.2504041790962219, + "learning_rate": 4.487937585887363e-05, + "loss": 1.7672, + "step": 17812 + }, + { + "epoch": 5.467464702271332, + "grad_norm": 0.2362445741891861, + "learning_rate": 4.487443145943659e-05, + "loss": 1.7426, + "step": 17813 + }, + { + "epoch": 5.467771639042358, + "grad_norm": 0.20075896382331848, + "learning_rate": 4.486948711065343e-05, + "loss": 1.7406, + "step": 17814 + }, + { + "epoch": 5.468078575813382, + "grad_norm": 0.2219153791666031, + "learning_rate": 4.486454281257299e-05, + "loss": 1.683, + "step": 17815 + }, + { + "epoch": 5.468385512584407, + "grad_norm": 0.22551953792572021, + "learning_rate": 
4.4859598565244176e-05, + "loss": 1.7896, + "step": 17816 + }, + { + "epoch": 5.468692449355433, + "grad_norm": 0.2385476976633072, + "learning_rate": 4.48546543687158e-05, + "loss": 1.7799, + "step": 17817 + }, + { + "epoch": 5.468999386126458, + "grad_norm": 0.24263370037078857, + "learning_rate": 4.4849710223036764e-05, + "loss": 1.682, + "step": 17818 + }, + { + "epoch": 5.469306322897483, + "grad_norm": 0.24301160871982574, + "learning_rate": 4.484476612825589e-05, + "loss": 1.8121, + "step": 17819 + }, + { + "epoch": 5.469613259668508, + "grad_norm": 0.2516932487487793, + "learning_rate": 4.483982208442207e-05, + "loss": 1.7344, + "step": 17820 + }, + { + "epoch": 5.469920196439533, + "grad_norm": 0.24309395253658295, + "learning_rate": 4.4834878091584156e-05, + "loss": 1.7746, + "step": 17821 + }, + { + "epoch": 5.4702271332105585, + "grad_norm": 0.24711866676807404, + "learning_rate": 4.4829934149790996e-05, + "loss": 1.7887, + "step": 17822 + }, + { + "epoch": 5.470534069981584, + "grad_norm": 0.2923797369003296, + "learning_rate": 4.4824990259091445e-05, + "loss": 1.7017, + "step": 17823 + }, + { + "epoch": 5.470841006752609, + "grad_norm": 0.21658629179000854, + "learning_rate": 4.482004641953441e-05, + "loss": 1.725, + "step": 17824 + }, + { + "epoch": 5.4711479435236345, + "grad_norm": 0.233424574136734, + "learning_rate": 4.481510263116868e-05, + "loss": 1.74, + "step": 17825 + }, + { + "epoch": 5.471454880294659, + "grad_norm": 0.28997600078582764, + "learning_rate": 4.481015889404315e-05, + "loss": 1.8418, + "step": 17826 + }, + { + "epoch": 5.471761817065684, + "grad_norm": 0.2245558649301529, + "learning_rate": 4.480521520820669e-05, + "loss": 1.7519, + "step": 17827 + }, + { + "epoch": 5.47206875383671, + "grad_norm": 0.21008887887001038, + "learning_rate": 4.480027157370812e-05, + "loss": 1.6977, + "step": 17828 + }, + { + "epoch": 5.472375690607735, + "grad_norm": 0.1990261971950531, + "learning_rate": 4.479532799059633e-05, + "loss": 1.7004, + 
"step": 17829 + }, + { + "epoch": 5.47268262737876, + "grad_norm": 0.2354540079832077, + "learning_rate": 4.479038445892014e-05, + "loss": 1.7755, + "step": 17830 + }, + { + "epoch": 5.472989564149785, + "grad_norm": 0.21904973685741425, + "learning_rate": 4.478544097872843e-05, + "loss": 1.8328, + "step": 17831 + }, + { + "epoch": 5.47329650092081, + "grad_norm": 0.21188503503799438, + "learning_rate": 4.4780497550070055e-05, + "loss": 1.7105, + "step": 17832 + }, + { + "epoch": 5.473603437691835, + "grad_norm": 0.2196870595216751, + "learning_rate": 4.477555417299386e-05, + "loss": 1.7261, + "step": 17833 + }, + { + "epoch": 5.473910374462861, + "grad_norm": 0.24522331357002258, + "learning_rate": 4.477061084754869e-05, + "loss": 1.8101, + "step": 17834 + }, + { + "epoch": 5.474217311233886, + "grad_norm": 0.24073927104473114, + "learning_rate": 4.476566757378343e-05, + "loss": 1.8295, + "step": 17835 + }, + { + "epoch": 5.474524248004911, + "grad_norm": 0.3724605143070221, + "learning_rate": 4.476072435174689e-05, + "loss": 1.7785, + "step": 17836 + }, + { + "epoch": 5.474831184775936, + "grad_norm": 0.25552257895469666, + "learning_rate": 4.475578118148797e-05, + "loss": 1.6978, + "step": 17837 + }, + { + "epoch": 5.475138121546961, + "grad_norm": 0.22402255237102509, + "learning_rate": 4.475083806305546e-05, + "loss": 1.697, + "step": 17838 + }, + { + "epoch": 5.475445058317987, + "grad_norm": 0.25869324803352356, + "learning_rate": 4.474589499649826e-05, + "loss": 1.7026, + "step": 17839 + }, + { + "epoch": 5.475751995089012, + "grad_norm": 0.249742329120636, + "learning_rate": 4.47409519818652e-05, + "loss": 1.7738, + "step": 17840 + }, + { + "epoch": 5.476058931860037, + "grad_norm": 0.28722140192985535, + "learning_rate": 4.473600901920515e-05, + "loss": 1.7555, + "step": 17841 + }, + { + "epoch": 5.476365868631062, + "grad_norm": 0.250964879989624, + "learning_rate": 4.4731066108566926e-05, + "loss": 1.6951, + "step": 17842 + }, + { + "epoch": 
5.476672805402087, + "grad_norm": 0.20562006533145905, + "learning_rate": 4.472612324999942e-05, + "loss": 1.7109, + "step": 17843 + }, + { + "epoch": 5.476979742173112, + "grad_norm": 0.26964858174324036, + "learning_rate": 4.472118044355144e-05, + "loss": 1.7468, + "step": 17844 + }, + { + "epoch": 5.477286678944138, + "grad_norm": 0.25700438022613525, + "learning_rate": 4.471623768927184e-05, + "loss": 1.7046, + "step": 17845 + }, + { + "epoch": 5.477593615715163, + "grad_norm": 0.2152809500694275, + "learning_rate": 4.47112949872095e-05, + "loss": 1.7464, + "step": 17846 + }, + { + "epoch": 5.4779005524861875, + "grad_norm": 0.26429688930511475, + "learning_rate": 4.470635233741321e-05, + "loss": 1.7629, + "step": 17847 + }, + { + "epoch": 5.478207489257213, + "grad_norm": 0.18546637892723083, + "learning_rate": 4.470140973993188e-05, + "loss": 1.7143, + "step": 17848 + }, + { + "epoch": 5.478514426028238, + "grad_norm": 0.1927761435508728, + "learning_rate": 4.46964671948143e-05, + "loss": 1.6919, + "step": 17849 + }, + { + "epoch": 5.4788213627992635, + "grad_norm": 0.21581199765205383, + "learning_rate": 4.469152470210935e-05, + "loss": 1.7596, + "step": 17850 + }, + { + "epoch": 5.479128299570289, + "grad_norm": 0.20244133472442627, + "learning_rate": 4.468658226186586e-05, + "loss": 1.7372, + "step": 17851 + }, + { + "epoch": 5.479435236341313, + "grad_norm": 0.2467198520898819, + "learning_rate": 4.468163987413269e-05, + "loss": 1.7361, + "step": 17852 + }, + { + "epoch": 5.479742173112339, + "grad_norm": 0.22134411334991455, + "learning_rate": 4.467669753895866e-05, + "loss": 1.7276, + "step": 17853 + }, + { + "epoch": 5.480049109883364, + "grad_norm": 0.1953750103712082, + "learning_rate": 4.4671755256392636e-05, + "loss": 1.6931, + "step": 17854 + }, + { + "epoch": 5.480356046654389, + "grad_norm": 0.21492068469524384, + "learning_rate": 4.466681302648343e-05, + "loss": 1.7437, + "step": 17855 + }, + { + "epoch": 5.480662983425415, + "grad_norm": 
0.24377848207950592, + "learning_rate": 4.466187084927993e-05, + "loss": 1.7869, + "step": 17856 + }, + { + "epoch": 5.48096992019644, + "grad_norm": 0.23674219846725464, + "learning_rate": 4.465692872483093e-05, + "loss": 1.8142, + "step": 17857 + }, + { + "epoch": 5.481276856967464, + "grad_norm": 0.25036486983299255, + "learning_rate": 4.4651986653185304e-05, + "loss": 1.8075, + "step": 17858 + }, + { + "epoch": 5.48158379373849, + "grad_norm": 0.32649150490760803, + "learning_rate": 4.4647044634391867e-05, + "loss": 1.7177, + "step": 17859 + }, + { + "epoch": 5.481890730509515, + "grad_norm": 0.20300604403018951, + "learning_rate": 4.46421026684995e-05, + "loss": 1.6912, + "step": 17860 + }, + { + "epoch": 5.48219766728054, + "grad_norm": 0.24630679190158844, + "learning_rate": 4.4637160755557e-05, + "loss": 1.8312, + "step": 17861 + }, + { + "epoch": 5.482504604051566, + "grad_norm": 0.2263093739748001, + "learning_rate": 4.46322188956132e-05, + "loss": 1.7214, + "step": 17862 + }, + { + "epoch": 5.48281154082259, + "grad_norm": 0.22949177026748657, + "learning_rate": 4.462727708871699e-05, + "loss": 1.6882, + "step": 17863 + }, + { + "epoch": 5.4831184775936155, + "grad_norm": 0.23389381170272827, + "learning_rate": 4.4622335334917156e-05, + "loss": 1.7613, + "step": 17864 + }, + { + "epoch": 5.483425414364641, + "grad_norm": 0.2259683907032013, + "learning_rate": 4.461739363426257e-05, + "loss": 1.7021, + "step": 17865 + }, + { + "epoch": 5.483732351135666, + "grad_norm": 0.3213486969470978, + "learning_rate": 4.4612451986802036e-05, + "loss": 1.7469, + "step": 17866 + }, + { + "epoch": 5.4840392879066915, + "grad_norm": 0.3415670096874237, + "learning_rate": 4.4607510392584426e-05, + "loss": 1.7605, + "step": 17867 + }, + { + "epoch": 5.484346224677717, + "grad_norm": 0.2079494297504425, + "learning_rate": 4.460256885165855e-05, + "loss": 1.7832, + "step": 17868 + }, + { + "epoch": 5.484653161448741, + "grad_norm": 0.30334988236427307, + "learning_rate": 
4.459762736407327e-05, + "loss": 1.6825, + "step": 17869 + }, + { + "epoch": 5.484960098219767, + "grad_norm": 0.22320730984210968, + "learning_rate": 4.4592685929877374e-05, + "loss": 1.7452, + "step": 17870 + }, + { + "epoch": 5.485267034990792, + "grad_norm": 0.25325682759284973, + "learning_rate": 4.458774454911975e-05, + "loss": 1.7359, + "step": 17871 + }, + { + "epoch": 5.485573971761817, + "grad_norm": 0.305501788854599, + "learning_rate": 4.458280322184919e-05, + "loss": 1.7161, + "step": 17872 + }, + { + "epoch": 5.485880908532843, + "grad_norm": 0.19486182928085327, + "learning_rate": 4.457786194811455e-05, + "loss": 1.7097, + "step": 17873 + }, + { + "epoch": 5.486187845303867, + "grad_norm": 0.3306363821029663, + "learning_rate": 4.457292072796465e-05, + "loss": 1.7653, + "step": 17874 + }, + { + "epoch": 5.486494782074892, + "grad_norm": 0.25172874331474304, + "learning_rate": 4.456797956144835e-05, + "loss": 1.7289, + "step": 17875 + }, + { + "epoch": 5.486801718845918, + "grad_norm": 0.24508661031723022, + "learning_rate": 4.456303844861444e-05, + "loss": 1.7255, + "step": 17876 + }, + { + "epoch": 5.487108655616943, + "grad_norm": 0.3043360114097595, + "learning_rate": 4.455809738951178e-05, + "loss": 1.7852, + "step": 17877 + }, + { + "epoch": 5.487415592387968, + "grad_norm": 0.22181758284568787, + "learning_rate": 4.4553156384189186e-05, + "loss": 1.7887, + "step": 17878 + }, + { + "epoch": 5.487722529158993, + "grad_norm": 0.2174321413040161, + "learning_rate": 4.454821543269549e-05, + "loss": 1.7024, + "step": 17879 + }, + { + "epoch": 5.488029465930018, + "grad_norm": 0.19634750485420227, + "learning_rate": 4.4543274535079535e-05, + "loss": 1.7451, + "step": 17880 + }, + { + "epoch": 5.4883364027010435, + "grad_norm": 0.20481908321380615, + "learning_rate": 4.4538333691390125e-05, + "loss": 1.7068, + "step": 17881 + }, + { + "epoch": 5.488643339472069, + "grad_norm": 0.2025458663702011, + "learning_rate": 4.453339290167612e-05, + "loss": 
1.72, + "step": 17882 + }, + { + "epoch": 5.488950276243094, + "grad_norm": 0.21013019979000092, + "learning_rate": 4.452845216598632e-05, + "loss": 1.7113, + "step": 17883 + }, + { + "epoch": 5.4892572130141195, + "grad_norm": 0.2057499885559082, + "learning_rate": 4.452351148436956e-05, + "loss": 1.7007, + "step": 17884 + }, + { + "epoch": 5.489564149785144, + "grad_norm": 0.19957664608955383, + "learning_rate": 4.4518570856874666e-05, + "loss": 1.6999, + "step": 17885 + }, + { + "epoch": 5.489871086556169, + "grad_norm": 0.22609412670135498, + "learning_rate": 4.451363028355048e-05, + "loss": 1.8124, + "step": 17886 + }, + { + "epoch": 5.490178023327195, + "grad_norm": 0.27350863814353943, + "learning_rate": 4.4508689764445805e-05, + "loss": 1.8042, + "step": 17887 + }, + { + "epoch": 5.49048496009822, + "grad_norm": 0.23416854441165924, + "learning_rate": 4.450374929960949e-05, + "loss": 1.7607, + "step": 17888 + }, + { + "epoch": 5.490791896869245, + "grad_norm": 0.2891421318054199, + "learning_rate": 4.449880888909033e-05, + "loss": 1.7419, + "step": 17889 + }, + { + "epoch": 5.49109883364027, + "grad_norm": 0.2458745837211609, + "learning_rate": 4.449386853293717e-05, + "loss": 1.7234, + "step": 17890 + }, + { + "epoch": 5.491405770411295, + "grad_norm": 0.23390449583530426, + "learning_rate": 4.4488928231198826e-05, + "loss": 1.7482, + "step": 17891 + }, + { + "epoch": 5.49171270718232, + "grad_norm": 0.3509657084941864, + "learning_rate": 4.448398798392414e-05, + "loss": 1.7639, + "step": 17892 + }, + { + "epoch": 5.492019643953346, + "grad_norm": 0.2487955242395401, + "learning_rate": 4.4479047791161916e-05, + "loss": 1.7163, + "step": 17893 + }, + { + "epoch": 5.492326580724371, + "grad_norm": 0.22630274295806885, + "learning_rate": 4.4474107652960956e-05, + "loss": 1.7449, + "step": 17894 + }, + { + "epoch": 5.4926335174953955, + "grad_norm": 0.25909537076950073, + "learning_rate": 4.446916756937012e-05, + "loss": 1.7396, + "step": 17895 + }, + { + 
"epoch": 5.492940454266421, + "grad_norm": 0.29732683300971985, + "learning_rate": 4.446422754043819e-05, + "loss": 1.8109, + "step": 17896 + }, + { + "epoch": 5.493247391037446, + "grad_norm": 0.22436772286891937, + "learning_rate": 4.4459287566214035e-05, + "loss": 1.7657, + "step": 17897 + }, + { + "epoch": 5.4935543278084715, + "grad_norm": 0.24584892392158508, + "learning_rate": 4.445434764674643e-05, + "loss": 1.73, + "step": 17898 + }, + { + "epoch": 5.493861264579497, + "grad_norm": 0.27446454763412476, + "learning_rate": 4.444940778208423e-05, + "loss": 1.7428, + "step": 17899 + }, + { + "epoch": 5.494168201350522, + "grad_norm": 0.20442110300064087, + "learning_rate": 4.4444467972276215e-05, + "loss": 1.6911, + "step": 17900 + }, + { + "epoch": 5.494475138121547, + "grad_norm": 0.23089268803596497, + "learning_rate": 4.4439528217371236e-05, + "loss": 1.7192, + "step": 17901 + }, + { + "epoch": 5.494782074892572, + "grad_norm": 0.19402450323104858, + "learning_rate": 4.443458851741808e-05, + "loss": 1.7304, + "step": 17902 + }, + { + "epoch": 5.495089011663597, + "grad_norm": 0.2310219705104828, + "learning_rate": 4.442964887246561e-05, + "loss": 1.6963, + "step": 17903 + }, + { + "epoch": 5.495395948434623, + "grad_norm": 0.25573140382766724, + "learning_rate": 4.44247092825626e-05, + "loss": 1.7781, + "step": 17904 + }, + { + "epoch": 5.495702885205648, + "grad_norm": 0.20298753678798676, + "learning_rate": 4.4419769747757894e-05, + "loss": 1.763, + "step": 17905 + }, + { + "epoch": 5.496009821976672, + "grad_norm": 0.22243307530879974, + "learning_rate": 4.441483026810027e-05, + "loss": 1.7345, + "step": 17906 + }, + { + "epoch": 5.496316758747698, + "grad_norm": 0.19801411032676697, + "learning_rate": 4.4409890843638584e-05, + "loss": 1.7504, + "step": 17907 + }, + { + "epoch": 5.496623695518723, + "grad_norm": 0.2804374396800995, + "learning_rate": 4.440495147442162e-05, + "loss": 1.7985, + "step": 17908 + }, + { + "epoch": 5.496930632289748, + 
"grad_norm": 0.21824021637439728, + "learning_rate": 4.440001216049822e-05, + "loss": 1.6703, + "step": 17909 + }, + { + "epoch": 5.497237569060774, + "grad_norm": 0.23335935175418854, + "learning_rate": 4.439507290191719e-05, + "loss": 1.7426, + "step": 17910 + }, + { + "epoch": 5.497544505831799, + "grad_norm": 0.2093769609928131, + "learning_rate": 4.4390133698727315e-05, + "loss": 1.7178, + "step": 17911 + }, + { + "epoch": 5.4978514426028235, + "grad_norm": 0.18354324996471405, + "learning_rate": 4.438519455097743e-05, + "loss": 1.6849, + "step": 17912 + }, + { + "epoch": 5.498158379373849, + "grad_norm": 0.26826491951942444, + "learning_rate": 4.438025545871633e-05, + "loss": 1.7804, + "step": 17913 + }, + { + "epoch": 5.498465316144874, + "grad_norm": 0.29171738028526306, + "learning_rate": 4.437531642199288e-05, + "loss": 1.764, + "step": 17914 + }, + { + "epoch": 5.4987722529158995, + "grad_norm": 0.17870590090751648, + "learning_rate": 4.437037744085581e-05, + "loss": 1.6789, + "step": 17915 + }, + { + "epoch": 5.499079189686925, + "grad_norm": 0.25412192940711975, + "learning_rate": 4.4365438515354e-05, + "loss": 1.7536, + "step": 17916 + }, + { + "epoch": 5.499386126457949, + "grad_norm": 0.24465163052082062, + "learning_rate": 4.4360499645536203e-05, + "loss": 1.7582, + "step": 17917 + }, + { + "epoch": 5.499693063228975, + "grad_norm": 0.21248452365398407, + "learning_rate": 4.4355560831451264e-05, + "loss": 1.7209, + "step": 17918 + }, + { + "epoch": 5.5, + "grad_norm": 0.21018685400485992, + "learning_rate": 4.435062207314797e-05, + "loss": 1.7461, + "step": 17919 + }, + { + "epoch": 5.500306936771025, + "grad_norm": 0.1880551278591156, + "learning_rate": 4.434568337067517e-05, + "loss": 1.6818, + "step": 17920 + }, + { + "epoch": 5.500613873542051, + "grad_norm": 0.2224894016981125, + "learning_rate": 4.434074472408161e-05, + "loss": 1.8211, + "step": 17921 + }, + { + "epoch": 5.500920810313076, + "grad_norm": 0.19419749081134796, + 
"learning_rate": 4.433580613341615e-05, + "loss": 1.7625, + "step": 17922 + }, + { + "epoch": 5.5012277470841005, + "grad_norm": 0.2167430967092514, + "learning_rate": 4.433086759872756e-05, + "loss": 1.745, + "step": 17923 + }, + { + "epoch": 5.501534683855126, + "grad_norm": 0.1926383525133133, + "learning_rate": 4.4325929120064665e-05, + "loss": 1.7353, + "step": 17924 + }, + { + "epoch": 5.501841620626151, + "grad_norm": 0.22943224012851715, + "learning_rate": 4.432099069747625e-05, + "loss": 1.6903, + "step": 17925 + }, + { + "epoch": 5.5021485573971765, + "grad_norm": 0.18218693137168884, + "learning_rate": 4.431605233101116e-05, + "loss": 1.742, + "step": 17926 + }, + { + "epoch": 5.502455494168201, + "grad_norm": 0.2660788893699646, + "learning_rate": 4.431111402071817e-05, + "loss": 1.7208, + "step": 17927 + }, + { + "epoch": 5.502762430939226, + "grad_norm": 0.20015788078308105, + "learning_rate": 4.430617576664606e-05, + "loss": 1.721, + "step": 17928 + }, + { + "epoch": 5.503069367710252, + "grad_norm": 0.20011179149150848, + "learning_rate": 4.430123756884368e-05, + "loss": 1.7488, + "step": 17929 + }, + { + "epoch": 5.503376304481277, + "grad_norm": 0.22541452944278717, + "learning_rate": 4.429629942735979e-05, + "loss": 1.7997, + "step": 17930 + }, + { + "epoch": 5.503683241252302, + "grad_norm": 0.21067193150520325, + "learning_rate": 4.4291361342243236e-05, + "loss": 1.6652, + "step": 17931 + }, + { + "epoch": 5.503990178023328, + "grad_norm": 0.38401395082473755, + "learning_rate": 4.428642331354278e-05, + "loss": 1.815, + "step": 17932 + }, + { + "epoch": 5.504297114794352, + "grad_norm": 0.22600100934505463, + "learning_rate": 4.428148534130725e-05, + "loss": 1.7593, + "step": 17933 + }, + { + "epoch": 5.504604051565377, + "grad_norm": 0.21340666711330414, + "learning_rate": 4.427654742558542e-05, + "loss": 1.7447, + "step": 17934 + }, + { + "epoch": 5.504910988336403, + "grad_norm": 0.20676501095294952, + "learning_rate": 4.427160956642611e-05, 
+ "loss": 1.7174, + "step": 17935 + }, + { + "epoch": 5.505217925107428, + "grad_norm": 0.2374252825975418, + "learning_rate": 4.42666717638781e-05, + "loss": 1.703, + "step": 17936 + }, + { + "epoch": 5.505524861878453, + "grad_norm": 0.20975756645202637, + "learning_rate": 4.426173401799022e-05, + "loss": 1.7076, + "step": 17937 + }, + { + "epoch": 5.505831798649478, + "grad_norm": 0.23778517544269562, + "learning_rate": 4.4256796328811226e-05, + "loss": 1.7647, + "step": 17938 + }, + { + "epoch": 5.506138735420503, + "grad_norm": 0.2088557481765747, + "learning_rate": 4.425185869638996e-05, + "loss": 1.764, + "step": 17939 + }, + { + "epoch": 5.5064456721915285, + "grad_norm": 0.26953455805778503, + "learning_rate": 4.424692112077518e-05, + "loss": 1.7351, + "step": 17940 + }, + { + "epoch": 5.506752608962554, + "grad_norm": 0.2762589454650879, + "learning_rate": 4.42419836020157e-05, + "loss": 1.7051, + "step": 17941 + }, + { + "epoch": 5.507059545733579, + "grad_norm": 0.19611702859401703, + "learning_rate": 4.4237046140160306e-05, + "loss": 1.7445, + "step": 17942 + }, + { + "epoch": 5.5073664825046045, + "grad_norm": 0.2708270251750946, + "learning_rate": 4.4232108735257824e-05, + "loss": 1.7284, + "step": 17943 + }, + { + "epoch": 5.507673419275629, + "grad_norm": 0.24194146692752838, + "learning_rate": 4.422717138735701e-05, + "loss": 1.7302, + "step": 17944 + }, + { + "epoch": 5.507980356046654, + "grad_norm": 0.21558286249637604, + "learning_rate": 4.422223409650666e-05, + "loss": 1.7435, + "step": 17945 + }, + { + "epoch": 5.50828729281768, + "grad_norm": 0.1842707246541977, + "learning_rate": 4.4217296862755597e-05, + "loss": 1.6579, + "step": 17946 + }, + { + "epoch": 5.508594229588705, + "grad_norm": 0.20211941003799438, + "learning_rate": 4.4212359686152576e-05, + "loss": 1.8017, + "step": 17947 + }, + { + "epoch": 5.50890116635973, + "grad_norm": 0.23749016225337982, + "learning_rate": 4.420742256674644e-05, + "loss": 1.6721, + "step": 17948 + }, + 
{ + "epoch": 5.509208103130755, + "grad_norm": 0.2076852172613144, + "learning_rate": 4.420248550458592e-05, + "loss": 1.7102, + "step": 17949 + }, + { + "epoch": 5.50951503990178, + "grad_norm": 0.2599447965621948, + "learning_rate": 4.419754849971986e-05, + "loss": 1.7819, + "step": 17950 + }, + { + "epoch": 5.509821976672805, + "grad_norm": 0.2017187476158142, + "learning_rate": 4.4192611552197e-05, + "loss": 1.6812, + "step": 17951 + }, + { + "epoch": 5.510128913443831, + "grad_norm": 0.21972116827964783, + "learning_rate": 4.418767466206617e-05, + "loss": 1.7122, + "step": 17952 + }, + { + "epoch": 5.510435850214856, + "grad_norm": 0.21750569343566895, + "learning_rate": 4.418273782937613e-05, + "loss": 1.7285, + "step": 17953 + }, + { + "epoch": 5.510742786985881, + "grad_norm": 0.19349125027656555, + "learning_rate": 4.417780105417572e-05, + "loss": 1.7383, + "step": 17954 + }, + { + "epoch": 5.511049723756906, + "grad_norm": 0.2094268798828125, + "learning_rate": 4.417286433651366e-05, + "loss": 1.7107, + "step": 17955 + }, + { + "epoch": 5.511356660527931, + "grad_norm": 0.2684331238269806, + "learning_rate": 4.41679276764388e-05, + "loss": 1.7336, + "step": 17956 + }, + { + "epoch": 5.5116635972989565, + "grad_norm": 0.27616915106773376, + "learning_rate": 4.416299107399987e-05, + "loss": 1.7439, + "step": 17957 + }, + { + "epoch": 5.511970534069982, + "grad_norm": 0.23874540627002716, + "learning_rate": 4.415805452924569e-05, + "loss": 1.7979, + "step": 17958 + }, + { + "epoch": 5.512277470841006, + "grad_norm": 0.21870921552181244, + "learning_rate": 4.415311804222503e-05, + "loss": 1.6674, + "step": 17959 + }, + { + "epoch": 5.512584407612032, + "grad_norm": 0.23042429983615875, + "learning_rate": 4.414818161298671e-05, + "loss": 1.7588, + "step": 17960 + }, + { + "epoch": 5.512891344383057, + "grad_norm": 0.2957153916358948, + "learning_rate": 4.4143245241579486e-05, + "loss": 1.8412, + "step": 17961 + }, + { + "epoch": 5.513198281154082, + 
"grad_norm": 0.28292644023895264, + "learning_rate": 4.413830892805213e-05, + "loss": 1.7915, + "step": 17962 + }, + { + "epoch": 5.513505217925108, + "grad_norm": 0.26526281237602234, + "learning_rate": 4.413337267245344e-05, + "loss": 1.7199, + "step": 17963 + }, + { + "epoch": 5.513812154696133, + "grad_norm": 0.41243693232536316, + "learning_rate": 4.4128436474832204e-05, + "loss": 1.7419, + "step": 17964 + }, + { + "epoch": 5.514119091467157, + "grad_norm": 0.2747771739959717, + "learning_rate": 4.4123500335237214e-05, + "loss": 1.7449, + "step": 17965 + }, + { + "epoch": 5.514426028238183, + "grad_norm": 0.25944122672080994, + "learning_rate": 4.4118564253717216e-05, + "loss": 1.7667, + "step": 17966 + }, + { + "epoch": 5.514732965009208, + "grad_norm": 0.32558533549308777, + "learning_rate": 4.411362823032103e-05, + "loss": 1.7292, + "step": 17967 + }, + { + "epoch": 5.515039901780233, + "grad_norm": 0.20190958678722382, + "learning_rate": 4.4108692265097404e-05, + "loss": 1.7529, + "step": 17968 + }, + { + "epoch": 5.515346838551259, + "grad_norm": 0.35485807061195374, + "learning_rate": 4.410375635809514e-05, + "loss": 1.7335, + "step": 17969 + }, + { + "epoch": 5.515653775322283, + "grad_norm": 0.2670159935951233, + "learning_rate": 4.409882050936301e-05, + "loss": 1.6789, + "step": 17970 + }, + { + "epoch": 5.5159607120933085, + "grad_norm": 0.19106578826904297, + "learning_rate": 4.409388471894981e-05, + "loss": 1.708, + "step": 17971 + }, + { + "epoch": 5.516267648864334, + "grad_norm": 0.2707268297672272, + "learning_rate": 4.4088948986904286e-05, + "loss": 1.7917, + "step": 17972 + }, + { + "epoch": 5.516574585635359, + "grad_norm": 0.2329230159521103, + "learning_rate": 4.408401331327525e-05, + "loss": 1.7378, + "step": 17973 + }, + { + "epoch": 5.5168815224063845, + "grad_norm": 0.22164998948574066, + "learning_rate": 4.4079077698111436e-05, + "loss": 1.7287, + "step": 17974 + }, + { + "epoch": 5.51718845917741, + "grad_norm": 0.25895699858665466, 
+ "learning_rate": 4.4074142141461665e-05, + "loss": 1.7158, + "step": 17975 + }, + { + "epoch": 5.517495395948434, + "grad_norm": 0.2617860436439514, + "learning_rate": 4.4069206643374695e-05, + "loss": 1.7767, + "step": 17976 + }, + { + "epoch": 5.51780233271946, + "grad_norm": 0.20443588495254517, + "learning_rate": 4.40642712038993e-05, + "loss": 1.7371, + "step": 17977 + }, + { + "epoch": 5.518109269490485, + "grad_norm": 0.26251545548439026, + "learning_rate": 4.4059335823084266e-05, + "loss": 1.8154, + "step": 17978 + }, + { + "epoch": 5.51841620626151, + "grad_norm": 0.2315993458032608, + "learning_rate": 4.405440050097833e-05, + "loss": 1.7426, + "step": 17979 + }, + { + "epoch": 5.518723143032536, + "grad_norm": 0.19467706978321075, + "learning_rate": 4.404946523763031e-05, + "loss": 1.7418, + "step": 17980 + }, + { + "epoch": 5.51903007980356, + "grad_norm": 0.2387837916612625, + "learning_rate": 4.4044530033088946e-05, + "loss": 1.7648, + "step": 17981 + }, + { + "epoch": 5.519337016574585, + "grad_norm": 0.21097531914710999, + "learning_rate": 4.403959488740306e-05, + "loss": 1.7198, + "step": 17982 + }, + { + "epoch": 5.519643953345611, + "grad_norm": 0.22303247451782227, + "learning_rate": 4.403465980062136e-05, + "loss": 1.7679, + "step": 17983 + }, + { + "epoch": 5.519950890116636, + "grad_norm": 0.19705620408058167, + "learning_rate": 4.4029724772792666e-05, + "loss": 1.7747, + "step": 17984 + }, + { + "epoch": 5.520257826887661, + "grad_norm": 0.20864570140838623, + "learning_rate": 4.4024789803965715e-05, + "loss": 1.6797, + "step": 17985 + }, + { + "epoch": 5.520564763658687, + "grad_norm": 0.1917724758386612, + "learning_rate": 4.401985489418931e-05, + "loss": 1.7246, + "step": 17986 + }, + { + "epoch": 5.520871700429711, + "grad_norm": 0.25668975710868835, + "learning_rate": 4.401492004351219e-05, + "loss": 1.7245, + "step": 17987 + }, + { + "epoch": 5.5211786372007365, + "grad_norm": 0.22576093673706055, + "learning_rate": 
4.4009985251983146e-05, + "loss": 1.6766, + "step": 17988 + }, + { + "epoch": 5.521485573971762, + "grad_norm": 0.18614664673805237, + "learning_rate": 4.400505051965093e-05, + "loss": 1.7379, + "step": 17989 + }, + { + "epoch": 5.521792510742787, + "grad_norm": 0.21472783386707306, + "learning_rate": 4.4000115846564335e-05, + "loss": 1.7203, + "step": 17990 + }, + { + "epoch": 5.5220994475138125, + "grad_norm": 0.201142817735672, + "learning_rate": 4.39951812327721e-05, + "loss": 1.7049, + "step": 17991 + }, + { + "epoch": 5.522406384284837, + "grad_norm": 0.193614661693573, + "learning_rate": 4.3990246678323e-05, + "loss": 1.6938, + "step": 17992 + }, + { + "epoch": 5.522713321055862, + "grad_norm": 0.23343239724636078, + "learning_rate": 4.398531218326582e-05, + "loss": 1.744, + "step": 17993 + }, + { + "epoch": 5.523020257826888, + "grad_norm": 0.26271605491638184, + "learning_rate": 4.3980377747649305e-05, + "loss": 1.7458, + "step": 17994 + }, + { + "epoch": 5.523327194597913, + "grad_norm": 0.2048577219247818, + "learning_rate": 4.397544337152223e-05, + "loss": 1.763, + "step": 17995 + }, + { + "epoch": 5.523634131368938, + "grad_norm": 0.27748194336891174, + "learning_rate": 4.397050905493334e-05, + "loss": 1.7346, + "step": 17996 + }, + { + "epoch": 5.523941068139964, + "grad_norm": 0.3040253520011902, + "learning_rate": 4.3965574797931417e-05, + "loss": 1.7396, + "step": 17997 + }, + { + "epoch": 5.524248004910988, + "grad_norm": 0.3310317397117615, + "learning_rate": 4.396064060056523e-05, + "loss": 1.8094, + "step": 17998 + }, + { + "epoch": 5.524554941682013, + "grad_norm": 0.21845392882823944, + "learning_rate": 4.395570646288352e-05, + "loss": 1.7013, + "step": 17999 + }, + { + "epoch": 5.524861878453039, + "grad_norm": 0.319876492023468, + "learning_rate": 4.395077238493506e-05, + "loss": 1.7985, + "step": 18000 + }, + { + "epoch": 5.525168815224064, + "grad_norm": 0.28261950612068176, + "learning_rate": 4.394583836676863e-05, + "loss": 1.7979, + 
"step": 18001 + }, + { + "epoch": 5.525475751995089, + "grad_norm": 0.20874030888080597, + "learning_rate": 4.394090440843296e-05, + "loss": 1.7363, + "step": 18002 + }, + { + "epoch": 5.525782688766114, + "grad_norm": 0.28587406873703003, + "learning_rate": 4.393597050997684e-05, + "loss": 1.6787, + "step": 18003 + }, + { + "epoch": 5.526089625537139, + "grad_norm": 0.2719021439552307, + "learning_rate": 4.393103667144899e-05, + "loss": 1.7625, + "step": 18004 + }, + { + "epoch": 5.526396562308165, + "grad_norm": 0.22485414147377014, + "learning_rate": 4.392610289289821e-05, + "loss": 1.6847, + "step": 18005 + }, + { + "epoch": 5.52670349907919, + "grad_norm": 0.3500347435474396, + "learning_rate": 4.392116917437322e-05, + "loss": 1.7244, + "step": 18006 + }, + { + "epoch": 5.527010435850215, + "grad_norm": 0.26308783888816833, + "learning_rate": 4.3916235515922836e-05, + "loss": 1.7738, + "step": 18007 + }, + { + "epoch": 5.52731737262124, + "grad_norm": 0.27030646800994873, + "learning_rate": 4.391130191759574e-05, + "loss": 1.7149, + "step": 18008 + }, + { + "epoch": 5.527624309392265, + "grad_norm": 0.4137318730354309, + "learning_rate": 4.390636837944076e-05, + "loss": 1.7581, + "step": 18009 + }, + { + "epoch": 5.52793124616329, + "grad_norm": 0.2462068647146225, + "learning_rate": 4.390143490150659e-05, + "loss": 1.7767, + "step": 18010 + }, + { + "epoch": 5.528238182934316, + "grad_norm": 0.27424392104148865, + "learning_rate": 4.3896501483842036e-05, + "loss": 1.7701, + "step": 18011 + }, + { + "epoch": 5.528545119705341, + "grad_norm": 0.31268683075904846, + "learning_rate": 4.389156812649583e-05, + "loss": 1.7342, + "step": 18012 + }, + { + "epoch": 5.5288520564763655, + "grad_norm": 0.20428471267223358, + "learning_rate": 4.388663482951671e-05, + "loss": 1.7083, + "step": 18013 + }, + { + "epoch": 5.529158993247391, + "grad_norm": 0.322344034910202, + "learning_rate": 4.3881701592953475e-05, + "loss": 1.7423, + "step": 18014 + }, + { + "epoch": 
5.529465930018416, + "grad_norm": 0.2267894744873047, + "learning_rate": 4.387676841685483e-05, + "loss": 1.7309, + "step": 18015 + }, + { + "epoch": 5.5297728667894415, + "grad_norm": 0.23041954636573792, + "learning_rate": 4.387183530126955e-05, + "loss": 1.7352, + "step": 18016 + }, + { + "epoch": 5.530079803560467, + "grad_norm": 0.31139662861824036, + "learning_rate": 4.386690224624638e-05, + "loss": 1.7223, + "step": 18017 + }, + { + "epoch": 5.530386740331492, + "grad_norm": 0.20144063234329224, + "learning_rate": 4.38619692518341e-05, + "loss": 1.7607, + "step": 18018 + }, + { + "epoch": 5.530693677102517, + "grad_norm": 0.23812296986579895, + "learning_rate": 4.385703631808142e-05, + "loss": 1.7599, + "step": 18019 + }, + { + "epoch": 5.531000613873542, + "grad_norm": 0.2442231923341751, + "learning_rate": 4.385210344503712e-05, + "loss": 1.7094, + "step": 18020 + }, + { + "epoch": 5.531307550644567, + "grad_norm": 0.19497406482696533, + "learning_rate": 4.384717063274992e-05, + "loss": 1.7686, + "step": 18021 + }, + { + "epoch": 5.531614487415593, + "grad_norm": 0.29085835814476013, + "learning_rate": 4.38422378812686e-05, + "loss": 1.7454, + "step": 18022 + }, + { + "epoch": 5.531921424186618, + "grad_norm": 0.2701610028743744, + "learning_rate": 4.3837305190641876e-05, + "loss": 1.7376, + "step": 18023 + }, + { + "epoch": 5.532228360957642, + "grad_norm": 0.21232132613658905, + "learning_rate": 4.383237256091854e-05, + "loss": 1.7773, + "step": 18024 + }, + { + "epoch": 5.532535297728668, + "grad_norm": 0.24131610989570618, + "learning_rate": 4.382743999214729e-05, + "loss": 1.7899, + "step": 18025 + }, + { + "epoch": 5.532842234499693, + "grad_norm": 0.2752540409564972, + "learning_rate": 4.382250748437692e-05, + "loss": 1.7603, + "step": 18026 + }, + { + "epoch": 5.533149171270718, + "grad_norm": 0.2007865607738495, + "learning_rate": 4.381757503765613e-05, + "loss": 1.7553, + "step": 18027 + }, + { + "epoch": 5.533456108041744, + "grad_norm": 
0.23768723011016846, + "learning_rate": 4.38126426520337e-05, + "loss": 1.757, + "step": 18028 + }, + { + "epoch": 5.533763044812769, + "grad_norm": 0.22198502719402313, + "learning_rate": 4.3807710327558366e-05, + "loss": 1.7578, + "step": 18029 + }, + { + "epoch": 5.5340699815837935, + "grad_norm": 0.22432352602481842, + "learning_rate": 4.380277806427885e-05, + "loss": 1.75, + "step": 18030 + }, + { + "epoch": 5.534376918354819, + "grad_norm": 0.23029591143131256, + "learning_rate": 4.379784586224394e-05, + "loss": 1.7829, + "step": 18031 + }, + { + "epoch": 5.534683855125844, + "grad_norm": 0.23901896178722382, + "learning_rate": 4.379291372150232e-05, + "loss": 1.7461, + "step": 18032 + }, + { + "epoch": 5.5349907918968695, + "grad_norm": 0.20958681404590607, + "learning_rate": 4.378798164210278e-05, + "loss": 1.7224, + "step": 18033 + }, + { + "epoch": 5.535297728667894, + "grad_norm": 0.21619680523872375, + "learning_rate": 4.3783049624094036e-05, + "loss": 1.7605, + "step": 18034 + }, + { + "epoch": 5.535604665438919, + "grad_norm": 0.22988620400428772, + "learning_rate": 4.3778117667524867e-05, + "loss": 1.7668, + "step": 18035 + }, + { + "epoch": 5.535911602209945, + "grad_norm": 0.20107243955135345, + "learning_rate": 4.377318577244395e-05, + "loss": 1.7932, + "step": 18036 + }, + { + "epoch": 5.53621853898097, + "grad_norm": 0.25803956389427185, + "learning_rate": 4.376825393890009e-05, + "loss": 1.7409, + "step": 18037 + }, + { + "epoch": 5.536525475751995, + "grad_norm": 0.34292399883270264, + "learning_rate": 4.376332216694198e-05, + "loss": 1.8554, + "step": 18038 + }, + { + "epoch": 5.536832412523021, + "grad_norm": 0.23147790133953094, + "learning_rate": 4.375839045661839e-05, + "loss": 1.7918, + "step": 18039 + }, + { + "epoch": 5.537139349294045, + "grad_norm": 0.2387644350528717, + "learning_rate": 4.375345880797802e-05, + "loss": 1.7391, + "step": 18040 + }, + { + "epoch": 5.53744628606507, + "grad_norm": 0.21463727951049805, + 
"learning_rate": 4.374852722106966e-05, + "loss": 1.6812, + "step": 18041 + }, + { + "epoch": 5.537753222836096, + "grad_norm": 0.21994563937187195, + "learning_rate": 4.3743595695941994e-05, + "loss": 1.7727, + "step": 18042 + }, + { + "epoch": 5.538060159607121, + "grad_norm": 0.21102699637413025, + "learning_rate": 4.373866423264381e-05, + "loss": 1.7854, + "step": 18043 + }, + { + "epoch": 5.538367096378146, + "grad_norm": 0.21742786467075348, + "learning_rate": 4.3733732831223794e-05, + "loss": 1.7352, + "step": 18044 + }, + { + "epoch": 5.538674033149171, + "grad_norm": 0.20080791413784027, + "learning_rate": 4.372880149173071e-05, + "loss": 1.7264, + "step": 18045 + }, + { + "epoch": 5.538980969920196, + "grad_norm": 0.21027569472789764, + "learning_rate": 4.372387021421329e-05, + "loss": 1.766, + "step": 18046 + }, + { + "epoch": 5.5392879066912215, + "grad_norm": 0.22870683670043945, + "learning_rate": 4.371893899872025e-05, + "loss": 1.7746, + "step": 18047 + }, + { + "epoch": 5.539594843462247, + "grad_norm": 0.21248690783977509, + "learning_rate": 4.371400784530036e-05, + "loss": 1.7447, + "step": 18048 + }, + { + "epoch": 5.539901780233272, + "grad_norm": 0.23059454560279846, + "learning_rate": 4.37090767540023e-05, + "loss": 1.7827, + "step": 18049 + }, + { + "epoch": 5.5402087170042975, + "grad_norm": 0.2519036531448364, + "learning_rate": 4.370414572487485e-05, + "loss": 1.7984, + "step": 18050 + }, + { + "epoch": 5.540515653775322, + "grad_norm": 0.23621398210525513, + "learning_rate": 4.36992147579667e-05, + "loss": 1.7517, + "step": 18051 + }, + { + "epoch": 5.540822590546347, + "grad_norm": 0.24267609417438507, + "learning_rate": 4.3694283853326625e-05, + "loss": 1.8285, + "step": 18052 + }, + { + "epoch": 5.541129527317373, + "grad_norm": 0.23209960758686066, + "learning_rate": 4.368935301100332e-05, + "loss": 1.7765, + "step": 18053 + }, + { + "epoch": 5.541436464088398, + "grad_norm": 0.21277187764644623, + "learning_rate": 
4.368442223104555e-05, + "loss": 1.7182, + "step": 18054 + }, + { + "epoch": 5.541743400859423, + "grad_norm": 0.20821616053581238, + "learning_rate": 4.367949151350199e-05, + "loss": 1.6766, + "step": 18055 + }, + { + "epoch": 5.542050337630448, + "grad_norm": 0.23019999265670776, + "learning_rate": 4.3674560858421414e-05, + "loss": 1.7438, + "step": 18056 + }, + { + "epoch": 5.542357274401473, + "grad_norm": 0.21547134220600128, + "learning_rate": 4.366963026585253e-05, + "loss": 1.7003, + "step": 18057 + }, + { + "epoch": 5.542664211172498, + "grad_norm": 0.22454513609409332, + "learning_rate": 4.3664699735844084e-05, + "loss": 1.7072, + "step": 18058 + }, + { + "epoch": 5.542971147943524, + "grad_norm": 0.22228482365608215, + "learning_rate": 4.365976926844477e-05, + "loss": 1.7557, + "step": 18059 + }, + { + "epoch": 5.543278084714549, + "grad_norm": 0.25762560963630676, + "learning_rate": 4.365483886370335e-05, + "loss": 1.7751, + "step": 18060 + }, + { + "epoch": 5.543585021485574, + "grad_norm": 0.2086205631494522, + "learning_rate": 4.3649908521668516e-05, + "loss": 1.7399, + "step": 18061 + }, + { + "epoch": 5.543891958256599, + "grad_norm": 0.2759089767932892, + "learning_rate": 4.3644978242389014e-05, + "loss": 1.7503, + "step": 18062 + }, + { + "epoch": 5.544198895027624, + "grad_norm": 0.2235182225704193, + "learning_rate": 4.364004802591358e-05, + "loss": 1.7313, + "step": 18063 + }, + { + "epoch": 5.5445058317986495, + "grad_norm": 0.23074570298194885, + "learning_rate": 4.3635117872290885e-05, + "loss": 1.7649, + "step": 18064 + }, + { + "epoch": 5.544812768569675, + "grad_norm": 0.24929538369178772, + "learning_rate": 4.363018778156972e-05, + "loss": 1.732, + "step": 18065 + }, + { + "epoch": 5.5451197053407, + "grad_norm": 0.26422035694122314, + "learning_rate": 4.362525775379874e-05, + "loss": 1.7276, + "step": 18066 + }, + { + "epoch": 5.545426642111725, + "grad_norm": 0.3160388767719269, + "learning_rate": 4.362032778902672e-05, + "loss": 
1.7777, + "step": 18067 + }, + { + "epoch": 5.54573357888275, + "grad_norm": 0.20791196823120117, + "learning_rate": 4.3615397887302345e-05, + "loss": 1.7058, + "step": 18068 + }, + { + "epoch": 5.546040515653775, + "grad_norm": 0.31438156962394714, + "learning_rate": 4.361046804867437e-05, + "loss": 1.8102, + "step": 18069 + }, + { + "epoch": 5.546347452424801, + "grad_norm": 0.3008113205432892, + "learning_rate": 4.3605538273191475e-05, + "loss": 1.7297, + "step": 18070 + }, + { + "epoch": 5.546654389195826, + "grad_norm": 0.21147282421588898, + "learning_rate": 4.3600608560902425e-05, + "loss": 1.776, + "step": 18071 + }, + { + "epoch": 5.546961325966851, + "grad_norm": 0.25202393531799316, + "learning_rate": 4.3595678911855884e-05, + "loss": 1.7273, + "step": 18072 + }, + { + "epoch": 5.547268262737876, + "grad_norm": 0.18881210684776306, + "learning_rate": 4.3590749326100614e-05, + "loss": 1.7026, + "step": 18073 + }, + { + "epoch": 5.547575199508901, + "grad_norm": 0.25075671076774597, + "learning_rate": 4.3585819803685295e-05, + "loss": 1.7694, + "step": 18074 + }, + { + "epoch": 5.547882136279926, + "grad_norm": 0.2625887989997864, + "learning_rate": 4.358089034465869e-05, + "loss": 1.7338, + "step": 18075 + }, + { + "epoch": 5.548189073050952, + "grad_norm": 0.27278679609298706, + "learning_rate": 4.357596094906947e-05, + "loss": 1.7684, + "step": 18076 + }, + { + "epoch": 5.548496009821976, + "grad_norm": 0.283964604139328, + "learning_rate": 4.3571031616966396e-05, + "loss": 1.7539, + "step": 18077 + }, + { + "epoch": 5.5488029465930016, + "grad_norm": 0.2702009975910187, + "learning_rate": 4.3566102348398124e-05, + "loss": 1.8064, + "step": 18078 + }, + { + "epoch": 5.549109883364027, + "grad_norm": 0.449733167886734, + "learning_rate": 4.356117314341342e-05, + "loss": 1.7258, + "step": 18079 + }, + { + "epoch": 5.549416820135052, + "grad_norm": 0.3199995160102844, + "learning_rate": 4.3556244002060975e-05, + "loss": 1.7526, + "step": 18080 + }, + { + 
"epoch": 5.5497237569060776, + "grad_norm": 0.2803747355937958, + "learning_rate": 4.3551314924389494e-05, + "loss": 1.764, + "step": 18081 + }, + { + "epoch": 5.550030693677103, + "grad_norm": 0.28995978832244873, + "learning_rate": 4.3546385910447715e-05, + "loss": 1.7617, + "step": 18082 + }, + { + "epoch": 5.550337630448127, + "grad_norm": 0.24313311278820038, + "learning_rate": 4.354145696028431e-05, + "loss": 1.7515, + "step": 18083 + }, + { + "epoch": 5.550644567219153, + "grad_norm": 0.2668032944202423, + "learning_rate": 4.3536528073948025e-05, + "loss": 1.743, + "step": 18084 + }, + { + "epoch": 5.550951503990178, + "grad_norm": 0.22831310331821442, + "learning_rate": 4.353159925148755e-05, + "loss": 1.7971, + "step": 18085 + }, + { + "epoch": 5.551258440761203, + "grad_norm": 0.22047942876815796, + "learning_rate": 4.352667049295162e-05, + "loss": 1.6983, + "step": 18086 + }, + { + "epoch": 5.551565377532229, + "grad_norm": 0.22895069420337677, + "learning_rate": 4.35217417983889e-05, + "loss": 1.7866, + "step": 18087 + }, + { + "epoch": 5.551872314303253, + "grad_norm": 0.19946368038654327, + "learning_rate": 4.3516813167848156e-05, + "loss": 1.7129, + "step": 18088 + }, + { + "epoch": 5.5521792510742785, + "grad_norm": 0.21508903801441193, + "learning_rate": 4.351188460137804e-05, + "loss": 1.7154, + "step": 18089 + }, + { + "epoch": 5.552486187845304, + "grad_norm": 0.24813953042030334, + "learning_rate": 4.3506956099027294e-05, + "loss": 1.8326, + "step": 18090 + }, + { + "epoch": 5.552793124616329, + "grad_norm": 0.21306444704532623, + "learning_rate": 4.35020276608446e-05, + "loss": 1.7651, + "step": 18091 + }, + { + "epoch": 5.5531000613873545, + "grad_norm": 0.22041217982769012, + "learning_rate": 4.34970992868787e-05, + "loss": 1.6852, + "step": 18092 + }, + { + "epoch": 5.55340699815838, + "grad_norm": 0.21699896454811096, + "learning_rate": 4.349217097717826e-05, + "loss": 1.7524, + "step": 18093 + }, + { + "epoch": 5.553713934929404, + 
"grad_norm": 0.23086662590503693, + "learning_rate": 4.3487242731792015e-05, + "loss": 1.7441, + "step": 18094 + }, + { + "epoch": 5.55402087170043, + "grad_norm": 0.21898184716701508, + "learning_rate": 4.348231455076864e-05, + "loss": 1.7131, + "step": 18095 + }, + { + "epoch": 5.554327808471455, + "grad_norm": 0.17392560839653015, + "learning_rate": 4.3477386434156854e-05, + "loss": 1.7049, + "step": 18096 + }, + { + "epoch": 5.55463474524248, + "grad_norm": 0.1984172910451889, + "learning_rate": 4.3472458382005374e-05, + "loss": 1.7136, + "step": 18097 + }, + { + "epoch": 5.554941682013506, + "grad_norm": 0.19227837026119232, + "learning_rate": 4.3467530394362866e-05, + "loss": 1.7468, + "step": 18098 + }, + { + "epoch": 5.55524861878453, + "grad_norm": 0.2307087779045105, + "learning_rate": 4.346260247127807e-05, + "loss": 1.7004, + "step": 18099 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.21496252715587616, + "learning_rate": 4.345767461279965e-05, + "loss": 1.7508, + "step": 18100 + }, + { + "epoch": 5.555862492326581, + "grad_norm": 0.21119998395442963, + "learning_rate": 4.3452746818976333e-05, + "loss": 1.7965, + "step": 18101 + }, + { + "epoch": 5.556169429097606, + "grad_norm": 0.2416355311870575, + "learning_rate": 4.34478190898568e-05, + "loss": 1.7006, + "step": 18102 + }, + { + "epoch": 5.556476365868631, + "grad_norm": 0.2009642869234085, + "learning_rate": 4.344289142548978e-05, + "loss": 1.7567, + "step": 18103 + }, + { + "epoch": 5.556783302639657, + "grad_norm": 0.2387058436870575, + "learning_rate": 4.343796382592393e-05, + "loss": 1.7898, + "step": 18104 + }, + { + "epoch": 5.557090239410681, + "grad_norm": 0.19835951924324036, + "learning_rate": 4.343303629120798e-05, + "loss": 1.7888, + "step": 18105 + }, + { + "epoch": 5.5573971761817065, + "grad_norm": 0.23324637115001678, + "learning_rate": 4.3428108821390604e-05, + "loss": 1.7923, + "step": 18106 + }, + { + "epoch": 5.557704112952732, + "grad_norm": 0.22334477305412292, + 
"learning_rate": 4.342318141652052e-05, + "loss": 1.7234, + "step": 18107 + }, + { + "epoch": 5.558011049723757, + "grad_norm": 0.20220427215099335, + "learning_rate": 4.341825407664639e-05, + "loss": 1.7639, + "step": 18108 + }, + { + "epoch": 5.558317986494782, + "grad_norm": 0.23658546805381775, + "learning_rate": 4.3413326801816964e-05, + "loss": 1.7505, + "step": 18109 + }, + { + "epoch": 5.558624923265807, + "grad_norm": 0.21157726645469666, + "learning_rate": 4.3408399592080875e-05, + "loss": 1.7655, + "step": 18110 + }, + { + "epoch": 5.558931860036832, + "grad_norm": 0.2139829397201538, + "learning_rate": 4.340347244748687e-05, + "loss": 1.767, + "step": 18111 + }, + { + "epoch": 5.559238796807858, + "grad_norm": 0.17811299860477448, + "learning_rate": 4.339854536808359e-05, + "loss": 1.6629, + "step": 18112 + }, + { + "epoch": 5.559545733578883, + "grad_norm": 0.2005898356437683, + "learning_rate": 4.339361835391977e-05, + "loss": 1.7269, + "step": 18113 + }, + { + "epoch": 5.559852670349908, + "grad_norm": 0.21514086425304413, + "learning_rate": 4.338869140504409e-05, + "loss": 1.7806, + "step": 18114 + }, + { + "epoch": 5.560159607120933, + "grad_norm": 0.23163840174674988, + "learning_rate": 4.338376452150522e-05, + "loss": 1.7259, + "step": 18115 + }, + { + "epoch": 5.560466543891958, + "grad_norm": 0.23657509684562683, + "learning_rate": 4.337883770335189e-05, + "loss": 1.7778, + "step": 18116 + }, + { + "epoch": 5.560773480662983, + "grad_norm": 0.20135201513767242, + "learning_rate": 4.337391095063274e-05, + "loss": 1.7359, + "step": 18117 + }, + { + "epoch": 5.561080417434009, + "grad_norm": 0.22871774435043335, + "learning_rate": 4.33689842633965e-05, + "loss": 1.7658, + "step": 18118 + }, + { + "epoch": 5.561387354205034, + "grad_norm": 0.21755221486091614, + "learning_rate": 4.3364057641691835e-05, + "loss": 1.7408, + "step": 18119 + }, + { + "epoch": 5.5616942909760585, + "grad_norm": 0.215267151594162, + "learning_rate": 
4.335913108556746e-05, + "loss": 1.7175, + "step": 18120 + }, + { + "epoch": 5.562001227747084, + "grad_norm": 0.25724974274635315, + "learning_rate": 4.335420459507202e-05, + "loss": 1.7197, + "step": 18121 + }, + { + "epoch": 5.562308164518109, + "grad_norm": 0.25375521183013916, + "learning_rate": 4.3349278170254254e-05, + "loss": 1.7251, + "step": 18122 + }, + { + "epoch": 5.5626151012891345, + "grad_norm": 0.24768905341625214, + "learning_rate": 4.334435181116279e-05, + "loss": 1.7405, + "step": 18123 + }, + { + "epoch": 5.56292203806016, + "grad_norm": 0.21281081438064575, + "learning_rate": 4.333942551784636e-05, + "loss": 1.7131, + "step": 18124 + }, + { + "epoch": 5.563228974831185, + "grad_norm": 0.2129398137331009, + "learning_rate": 4.333449929035361e-05, + "loss": 1.7049, + "step": 18125 + }, + { + "epoch": 5.56353591160221, + "grad_norm": 0.24582397937774658, + "learning_rate": 4.332957312873328e-05, + "loss": 1.7205, + "step": 18126 + }, + { + "epoch": 5.563842848373235, + "grad_norm": 0.21282973885536194, + "learning_rate": 4.332464703303399e-05, + "loss": 1.7655, + "step": 18127 + }, + { + "epoch": 5.56414978514426, + "grad_norm": 0.2302251160144806, + "learning_rate": 4.331972100330447e-05, + "loss": 1.7597, + "step": 18128 + }, + { + "epoch": 5.564456721915286, + "grad_norm": 0.23453226685523987, + "learning_rate": 4.331479503959336e-05, + "loss": 1.7028, + "step": 18129 + }, + { + "epoch": 5.564763658686311, + "grad_norm": 0.19723562896251678, + "learning_rate": 4.330986914194938e-05, + "loss": 1.7101, + "step": 18130 + }, + { + "epoch": 5.565070595457335, + "grad_norm": 0.22021643817424774, + "learning_rate": 4.33049433104212e-05, + "loss": 1.7123, + "step": 18131 + }, + { + "epoch": 5.565377532228361, + "grad_norm": 0.25540977716445923, + "learning_rate": 4.3300017545057484e-05, + "loss": 1.7392, + "step": 18132 + }, + { + "epoch": 5.565684468999386, + "grad_norm": 0.23482176661491394, + "learning_rate": 4.329509184590693e-05, + "loss": 
1.7175, + "step": 18133 + }, + { + "epoch": 5.565991405770411, + "grad_norm": 0.19537311792373657, + "learning_rate": 4.329016621301819e-05, + "loss": 1.7583, + "step": 18134 + }, + { + "epoch": 5.566298342541437, + "grad_norm": 0.21828842163085938, + "learning_rate": 4.328524064643997e-05, + "loss": 1.7411, + "step": 18135 + }, + { + "epoch": 5.566605279312462, + "grad_norm": 0.24589122831821442, + "learning_rate": 4.328031514622093e-05, + "loss": 1.7769, + "step": 18136 + }, + { + "epoch": 5.5669122160834865, + "grad_norm": 0.20964545011520386, + "learning_rate": 4.327538971240978e-05, + "loss": 1.7743, + "step": 18137 + }, + { + "epoch": 5.567219152854512, + "grad_norm": 0.2210713028907776, + "learning_rate": 4.327046434505514e-05, + "loss": 1.7671, + "step": 18138 + }, + { + "epoch": 5.567526089625537, + "grad_norm": 0.21382687985897064, + "learning_rate": 4.3265539044205736e-05, + "loss": 1.793, + "step": 18139 + }, + { + "epoch": 5.5678330263965625, + "grad_norm": 0.23289678990840912, + "learning_rate": 4.326061380991021e-05, + "loss": 1.738, + "step": 18140 + }, + { + "epoch": 5.568139963167588, + "grad_norm": 0.23789258301258087, + "learning_rate": 4.325568864221725e-05, + "loss": 1.8315, + "step": 18141 + }, + { + "epoch": 5.568446899938612, + "grad_norm": 0.1925022453069687, + "learning_rate": 4.325076354117554e-05, + "loss": 1.6956, + "step": 18142 + }, + { + "epoch": 5.568753836709638, + "grad_norm": 0.22522561252117157, + "learning_rate": 4.324583850683373e-05, + "loss": 1.7957, + "step": 18143 + }, + { + "epoch": 5.569060773480663, + "grad_norm": 0.2787671387195587, + "learning_rate": 4.324091353924049e-05, + "loss": 1.7325, + "step": 18144 + }, + { + "epoch": 5.569367710251688, + "grad_norm": 0.2723194658756256, + "learning_rate": 4.3235988638444536e-05, + "loss": 1.7668, + "step": 18145 + }, + { + "epoch": 5.569674647022714, + "grad_norm": 0.2241704910993576, + "learning_rate": 4.3231063804494484e-05, + "loss": 1.7977, + "step": 18146 + }, + { + 
"epoch": 5.569981583793739, + "grad_norm": 0.2627747356891632, + "learning_rate": 4.322613903743903e-05, + "loss": 1.6775, + "step": 18147 + }, + { + "epoch": 5.570288520564763, + "grad_norm": 0.2644255757331848, + "learning_rate": 4.322121433732686e-05, + "loss": 1.7404, + "step": 18148 + }, + { + "epoch": 5.570595457335789, + "grad_norm": 0.2386743575334549, + "learning_rate": 4.321628970420659e-05, + "loss": 1.7386, + "step": 18149 + }, + { + "epoch": 5.570902394106814, + "grad_norm": 0.22444583475589752, + "learning_rate": 4.3211365138126945e-05, + "loss": 1.7482, + "step": 18150 + }, + { + "epoch": 5.571209330877839, + "grad_norm": 0.21770013868808746, + "learning_rate": 4.3206440639136554e-05, + "loss": 1.7322, + "step": 18151 + }, + { + "epoch": 5.571516267648864, + "grad_norm": 0.22356587648391724, + "learning_rate": 4.320151620728411e-05, + "loss": 1.751, + "step": 18152 + }, + { + "epoch": 5.571823204419889, + "grad_norm": 0.2040669322013855, + "learning_rate": 4.319659184261826e-05, + "loss": 1.712, + "step": 18153 + }, + { + "epoch": 5.5721301411909145, + "grad_norm": 0.20951713621616364, + "learning_rate": 4.319166754518768e-05, + "loss": 1.7308, + "step": 18154 + }, + { + "epoch": 5.57243707796194, + "grad_norm": 0.186195969581604, + "learning_rate": 4.3186743315041025e-05, + "loss": 1.7133, + "step": 18155 + }, + { + "epoch": 5.572744014732965, + "grad_norm": 0.2098865509033203, + "learning_rate": 4.318181915222698e-05, + "loss": 1.7645, + "step": 18156 + }, + { + "epoch": 5.5730509515039905, + "grad_norm": 0.20552097260951996, + "learning_rate": 4.317689505679418e-05, + "loss": 1.7156, + "step": 18157 + }, + { + "epoch": 5.573357888275015, + "grad_norm": 0.22506964206695557, + "learning_rate": 4.3171971028791314e-05, + "loss": 1.7192, + "step": 18158 + }, + { + "epoch": 5.57366482504604, + "grad_norm": 0.2296760082244873, + "learning_rate": 4.316704706826702e-05, + "loss": 1.7534, + "step": 18159 + }, + { + "epoch": 5.573971761817066, + "grad_norm": 
0.20140253007411957, + "learning_rate": 4.316212317526998e-05, + "loss": 1.6906, + "step": 18160 + }, + { + "epoch": 5.574278698588091, + "grad_norm": 0.23313316702842712, + "learning_rate": 4.315719934984884e-05, + "loss": 1.6929, + "step": 18161 + }, + { + "epoch": 5.574585635359116, + "grad_norm": 0.23398169875144958, + "learning_rate": 4.315227559205228e-05, + "loss": 1.7254, + "step": 18162 + }, + { + "epoch": 5.574892572130141, + "grad_norm": 0.20836731791496277, + "learning_rate": 4.314735190192894e-05, + "loss": 1.7335, + "step": 18163 + }, + { + "epoch": 5.575199508901166, + "grad_norm": 0.19899079203605652, + "learning_rate": 4.3142428279527485e-05, + "loss": 1.69, + "step": 18164 + }, + { + "epoch": 5.5755064456721914, + "grad_norm": 0.24623680114746094, + "learning_rate": 4.313750472489657e-05, + "loss": 1.7413, + "step": 18165 + }, + { + "epoch": 5.575813382443217, + "grad_norm": 0.2432616949081421, + "learning_rate": 4.313258123808484e-05, + "loss": 1.7426, + "step": 18166 + }, + { + "epoch": 5.576120319214242, + "grad_norm": 0.22773970663547516, + "learning_rate": 4.3127657819141006e-05, + "loss": 1.7986, + "step": 18167 + }, + { + "epoch": 5.5764272559852675, + "grad_norm": 0.19891540706157684, + "learning_rate": 4.312273446811366e-05, + "loss": 1.7007, + "step": 18168 + }, + { + "epoch": 5.576734192756292, + "grad_norm": 0.23402714729309082, + "learning_rate": 4.311781118505149e-05, + "loss": 1.7774, + "step": 18169 + }, + { + "epoch": 5.577041129527317, + "grad_norm": 0.2248220294713974, + "learning_rate": 4.3112887970003134e-05, + "loss": 1.7079, + "step": 18170 + }, + { + "epoch": 5.577348066298343, + "grad_norm": 0.20901209115982056, + "learning_rate": 4.310796482301726e-05, + "loss": 1.7336, + "step": 18171 + }, + { + "epoch": 5.577655003069368, + "grad_norm": 0.21872754395008087, + "learning_rate": 4.3103041744142516e-05, + "loss": 1.7742, + "step": 18172 + }, + { + "epoch": 5.577961939840393, + "grad_norm": 0.2567403018474579, + 
"learning_rate": 4.309811873342757e-05, + "loss": 1.7894, + "step": 18173 + }, + { + "epoch": 5.578268876611418, + "grad_norm": 0.219998300075531, + "learning_rate": 4.3093195790921035e-05, + "loss": 1.7283, + "step": 18174 + }, + { + "epoch": 5.578575813382443, + "grad_norm": 0.1944747269153595, + "learning_rate": 4.3088272916671614e-05, + "loss": 1.7129, + "step": 18175 + }, + { + "epoch": 5.578882750153468, + "grad_norm": 0.19492141902446747, + "learning_rate": 4.308335011072791e-05, + "loss": 1.7286, + "step": 18176 + }, + { + "epoch": 5.579189686924494, + "grad_norm": 0.22383002936840057, + "learning_rate": 4.3078427373138604e-05, + "loss": 1.733, + "step": 18177 + }, + { + "epoch": 5.579496623695519, + "grad_norm": 0.20238643884658813, + "learning_rate": 4.307350470395232e-05, + "loss": 1.7522, + "step": 18178 + }, + { + "epoch": 5.579803560466544, + "grad_norm": 0.21456125378608704, + "learning_rate": 4.3068582103217755e-05, + "loss": 1.7298, + "step": 18179 + }, + { + "epoch": 5.580110497237569, + "grad_norm": 0.28084230422973633, + "learning_rate": 4.3063659570983514e-05, + "loss": 1.7805, + "step": 18180 + }, + { + "epoch": 5.580417434008594, + "grad_norm": 0.21319706737995148, + "learning_rate": 4.305873710729824e-05, + "loss": 1.6801, + "step": 18181 + }, + { + "epoch": 5.5807243707796195, + "grad_norm": 0.2279660850763321, + "learning_rate": 4.30538147122106e-05, + "loss": 1.752, + "step": 18182 + }, + { + "epoch": 5.581031307550645, + "grad_norm": 0.1958594173192978, + "learning_rate": 4.304889238576922e-05, + "loss": 1.7487, + "step": 18183 + }, + { + "epoch": 5.581338244321669, + "grad_norm": 0.19484321773052216, + "learning_rate": 4.304397012802279e-05, + "loss": 1.7222, + "step": 18184 + }, + { + "epoch": 5.581645181092695, + "grad_norm": 0.19863305985927582, + "learning_rate": 4.3039047939019906e-05, + "loss": 1.7296, + "step": 18185 + }, + { + "epoch": 5.58195211786372, + "grad_norm": 0.18674087524414062, + "learning_rate": 
4.303412581880924e-05, + "loss": 1.6753, + "step": 18186 + }, + { + "epoch": 5.582259054634745, + "grad_norm": 0.22263208031654358, + "learning_rate": 4.302920376743941e-05, + "loss": 1.7431, + "step": 18187 + }, + { + "epoch": 5.582565991405771, + "grad_norm": 0.1926872879266739, + "learning_rate": 4.302428178495909e-05, + "loss": 1.7662, + "step": 18188 + }, + { + "epoch": 5.582872928176796, + "grad_norm": 0.23190459609031677, + "learning_rate": 4.301935987141689e-05, + "loss": 1.7271, + "step": 18189 + }, + { + "epoch": 5.58317986494782, + "grad_norm": 0.30057230591773987, + "learning_rate": 4.301443802686148e-05, + "loss": 1.7957, + "step": 18190 + }, + { + "epoch": 5.583486801718846, + "grad_norm": 0.2520695626735687, + "learning_rate": 4.3009516251341475e-05, + "loss": 1.7501, + "step": 18191 + }, + { + "epoch": 5.583793738489871, + "grad_norm": 0.19143317639827728, + "learning_rate": 4.300459454490555e-05, + "loss": 1.7091, + "step": 18192 + }, + { + "epoch": 5.584100675260896, + "grad_norm": 0.2064475119113922, + "learning_rate": 4.299967290760229e-05, + "loss": 1.6849, + "step": 18193 + }, + { + "epoch": 5.584407612031922, + "grad_norm": 0.3093598484992981, + "learning_rate": 4.299475133948039e-05, + "loss": 1.8479, + "step": 18194 + }, + { + "epoch": 5.584714548802946, + "grad_norm": 0.2875300943851471, + "learning_rate": 4.298982984058845e-05, + "loss": 1.7296, + "step": 18195 + }, + { + "epoch": 5.5850214855739715, + "grad_norm": 0.33194443583488464, + "learning_rate": 4.298490841097514e-05, + "loss": 1.7668, + "step": 18196 + }, + { + "epoch": 5.585328422344997, + "grad_norm": 0.20940829813480377, + "learning_rate": 4.297998705068908e-05, + "loss": 1.7316, + "step": 18197 + }, + { + "epoch": 5.585635359116022, + "grad_norm": 0.32381999492645264, + "learning_rate": 4.297506575977887e-05, + "loss": 1.7212, + "step": 18198 + }, + { + "epoch": 5.5859422958870475, + "grad_norm": 0.31585511565208435, + "learning_rate": 4.29701445382932e-05, + "loss": 1.7695, 
+ "step": 18199 + }, + { + "epoch": 5.586249232658073, + "grad_norm": 0.2272588014602661, + "learning_rate": 4.2965223386280664e-05, + "loss": 1.7105, + "step": 18200 + }, + { + "epoch": 5.586556169429097, + "grad_norm": 0.2949761152267456, + "learning_rate": 4.296030230378993e-05, + "loss": 1.803, + "step": 18201 + }, + { + "epoch": 5.586863106200123, + "grad_norm": 0.20512579381465912, + "learning_rate": 4.29553812908696e-05, + "loss": 1.759, + "step": 18202 + }, + { + "epoch": 5.587170042971148, + "grad_norm": 0.21143598854541779, + "learning_rate": 4.295046034756835e-05, + "loss": 1.7286, + "step": 18203 + }, + { + "epoch": 5.587476979742173, + "grad_norm": 0.22148001194000244, + "learning_rate": 4.294553947393476e-05, + "loss": 1.7258, + "step": 18204 + }, + { + "epoch": 5.587783916513199, + "grad_norm": 0.17245957255363464, + "learning_rate": 4.2940618670017484e-05, + "loss": 1.6863, + "step": 18205 + }, + { + "epoch": 5.588090853284223, + "grad_norm": 0.20260390639305115, + "learning_rate": 4.293569793586515e-05, + "loss": 1.6866, + "step": 18206 + }, + { + "epoch": 5.588397790055248, + "grad_norm": 0.20671936869621277, + "learning_rate": 4.293077727152641e-05, + "loss": 1.7849, + "step": 18207 + }, + { + "epoch": 5.588704726826274, + "grad_norm": 0.21415838599205017, + "learning_rate": 4.292585667704984e-05, + "loss": 1.7279, + "step": 18208 + }, + { + "epoch": 5.589011663597299, + "grad_norm": 0.18668091297149658, + "learning_rate": 4.2920936152484134e-05, + "loss": 1.7087, + "step": 18209 + }, + { + "epoch": 5.589318600368324, + "grad_norm": 0.2253870815038681, + "learning_rate": 4.291601569787786e-05, + "loss": 1.769, + "step": 18210 + }, + { + "epoch": 5.58962553713935, + "grad_norm": 0.22426939010620117, + "learning_rate": 4.291109531327968e-05, + "loss": 1.7382, + "step": 18211 + }, + { + "epoch": 5.589932473910374, + "grad_norm": 0.21552452445030212, + "learning_rate": 4.29061749987382e-05, + "loss": 1.7316, + "step": 18212 + }, + { + "epoch": 
5.5902394106813995, + "grad_norm": 0.2337147295475006, + "learning_rate": 4.290125475430209e-05, + "loss": 1.7836, + "step": 18213 + }, + { + "epoch": 5.590546347452425, + "grad_norm": 0.21780124306678772, + "learning_rate": 4.289633458001992e-05, + "loss": 1.6923, + "step": 18214 + }, + { + "epoch": 5.59085328422345, + "grad_norm": 0.20009608566761017, + "learning_rate": 4.289141447594033e-05, + "loss": 1.719, + "step": 18215 + }, + { + "epoch": 5.5911602209944755, + "grad_norm": 0.18165744841098785, + "learning_rate": 4.288649444211196e-05, + "loss": 1.6825, + "step": 18216 + }, + { + "epoch": 5.5914671577655, + "grad_norm": 0.2244826704263687, + "learning_rate": 4.288157447858341e-05, + "loss": 1.7323, + "step": 18217 + }, + { + "epoch": 5.591774094536525, + "grad_norm": 0.16875946521759033, + "learning_rate": 4.2876654585403325e-05, + "loss": 1.6787, + "step": 18218 + }, + { + "epoch": 5.592081031307551, + "grad_norm": 0.19244243204593658, + "learning_rate": 4.28717347626203e-05, + "loss": 1.7225, + "step": 18219 + }, + { + "epoch": 5.592387968078576, + "grad_norm": 0.21081633865833282, + "learning_rate": 4.286681501028299e-05, + "loss": 1.7063, + "step": 18220 + }, + { + "epoch": 5.592694904849601, + "grad_norm": 0.20926406979560852, + "learning_rate": 4.286189532843997e-05, + "loss": 1.7307, + "step": 18221 + }, + { + "epoch": 5.593001841620627, + "grad_norm": 0.20258775353431702, + "learning_rate": 4.28569757171399e-05, + "loss": 1.6917, + "step": 18222 + }, + { + "epoch": 5.593308778391651, + "grad_norm": 0.21956230700016022, + "learning_rate": 4.285205617643137e-05, + "loss": 1.7127, + "step": 18223 + }, + { + "epoch": 5.593615715162676, + "grad_norm": 0.2071436047554016, + "learning_rate": 4.284713670636303e-05, + "loss": 1.7487, + "step": 18224 + }, + { + "epoch": 5.593922651933702, + "grad_norm": 0.2002478390932083, + "learning_rate": 4.2842217306983464e-05, + "loss": 1.6544, + "step": 18225 + }, + { + "epoch": 5.594229588704727, + "grad_norm": 
0.20691382884979248, + "learning_rate": 4.283729797834132e-05, + "loss": 1.768, + "step": 18226 + }, + { + "epoch": 5.5945365254757515, + "grad_norm": 0.18423563241958618, + "learning_rate": 4.283237872048517e-05, + "loss": 1.7563, + "step": 18227 + }, + { + "epoch": 5.594843462246777, + "grad_norm": 0.23055453598499298, + "learning_rate": 4.2827459533463665e-05, + "loss": 1.8083, + "step": 18228 + }, + { + "epoch": 5.595150399017802, + "grad_norm": 0.20735648274421692, + "learning_rate": 4.2822540417325396e-05, + "loss": 1.7761, + "step": 18229 + }, + { + "epoch": 5.5954573357888275, + "grad_norm": 0.2919909656047821, + "learning_rate": 4.281762137211902e-05, + "loss": 1.7836, + "step": 18230 + }, + { + "epoch": 5.595764272559853, + "grad_norm": 0.22636881470680237, + "learning_rate": 4.2812702397893113e-05, + "loss": 1.7389, + "step": 18231 + }, + { + "epoch": 5.596071209330878, + "grad_norm": 0.23788630962371826, + "learning_rate": 4.280778349469627e-05, + "loss": 1.7536, + "step": 18232 + }, + { + "epoch": 5.596378146101903, + "grad_norm": 0.22089426219463348, + "learning_rate": 4.280286466257715e-05, + "loss": 1.7584, + "step": 18233 + }, + { + "epoch": 5.596685082872928, + "grad_norm": 0.20486171543598175, + "learning_rate": 4.279794590158431e-05, + "loss": 1.7182, + "step": 18234 + }, + { + "epoch": 5.596992019643953, + "grad_norm": 0.2343701422214508, + "learning_rate": 4.2793027211766425e-05, + "loss": 1.751, + "step": 18235 + }, + { + "epoch": 5.597298956414979, + "grad_norm": 0.21734023094177246, + "learning_rate": 4.2788108593172036e-05, + "loss": 1.7084, + "step": 18236 + }, + { + "epoch": 5.597605893186004, + "grad_norm": 0.20593903958797455, + "learning_rate": 4.278319004584982e-05, + "loss": 1.6805, + "step": 18237 + }, + { + "epoch": 5.597912829957028, + "grad_norm": 0.20877878367900848, + "learning_rate": 4.2778271569848324e-05, + "loss": 1.7011, + "step": 18238 + }, + { + "epoch": 5.598219766728054, + "grad_norm": 0.23915995657444, + 
"learning_rate": 4.277335316521619e-05, + "loss": 1.732, + "step": 18239 + }, + { + "epoch": 5.598526703499079, + "grad_norm": 0.24310529232025146, + "learning_rate": 4.2768434832002004e-05, + "loss": 1.7859, + "step": 18240 + }, + { + "epoch": 5.598833640270104, + "grad_norm": 0.23189407587051392, + "learning_rate": 4.27635165702544e-05, + "loss": 1.7237, + "step": 18241 + }, + { + "epoch": 5.59914057704113, + "grad_norm": 0.2708875834941864, + "learning_rate": 4.275859838002195e-05, + "loss": 1.7046, + "step": 18242 + }, + { + "epoch": 5.599447513812155, + "grad_norm": 0.23692840337753296, + "learning_rate": 4.27536802613533e-05, + "loss": 1.8556, + "step": 18243 + }, + { + "epoch": 5.5997544505831796, + "grad_norm": 0.28285983204841614, + "learning_rate": 4.274876221429701e-05, + "loss": 1.6734, + "step": 18244 + }, + { + "epoch": 5.600061387354205, + "grad_norm": 0.20602203905582428, + "learning_rate": 4.27438442389017e-05, + "loss": 1.7113, + "step": 18245 + }, + { + "epoch": 5.60036832412523, + "grad_norm": 0.19719314575195312, + "learning_rate": 4.273892633521598e-05, + "loss": 1.7229, + "step": 18246 + }, + { + "epoch": 5.600675260896256, + "grad_norm": 0.2396705001592636, + "learning_rate": 4.273400850328846e-05, + "loss": 1.6986, + "step": 18247 + }, + { + "epoch": 5.600982197667281, + "grad_norm": 0.1974172443151474, + "learning_rate": 4.2729090743167724e-05, + "loss": 1.7445, + "step": 18248 + }, + { + "epoch": 5.601289134438305, + "grad_norm": 0.2193709760904312, + "learning_rate": 4.272417305490235e-05, + "loss": 1.7657, + "step": 18249 + }, + { + "epoch": 5.601596071209331, + "grad_norm": 0.24138681590557098, + "learning_rate": 4.271925543854098e-05, + "loss": 1.7388, + "step": 18250 + }, + { + "epoch": 5.601903007980356, + "grad_norm": 0.19056223332881927, + "learning_rate": 4.271433789413219e-05, + "loss": 1.6897, + "step": 18251 + }, + { + "epoch": 5.602209944751381, + "grad_norm": 0.20533505082130432, + "learning_rate": 4.270942042172459e-05, + 
"loss": 1.7222, + "step": 18252 + }, + { + "epoch": 5.602516881522407, + "grad_norm": 0.20570224523544312, + "learning_rate": 4.270450302136675e-05, + "loss": 1.8089, + "step": 18253 + }, + { + "epoch": 5.602823818293432, + "grad_norm": 0.2822209298610687, + "learning_rate": 4.269958569310732e-05, + "loss": 1.7523, + "step": 18254 + }, + { + "epoch": 5.6031307550644565, + "grad_norm": 0.2994859218597412, + "learning_rate": 4.269466843699484e-05, + "loss": 1.7538, + "step": 18255 + }, + { + "epoch": 5.603437691835482, + "grad_norm": 0.24851159751415253, + "learning_rate": 4.2689751253077925e-05, + "loss": 1.8162, + "step": 18256 + }, + { + "epoch": 5.603744628606507, + "grad_norm": 0.20387138426303864, + "learning_rate": 4.268483414140517e-05, + "loss": 1.6803, + "step": 18257 + }, + { + "epoch": 5.6040515653775325, + "grad_norm": 0.21620385348796844, + "learning_rate": 4.2679917102025204e-05, + "loss": 1.7236, + "step": 18258 + }, + { + "epoch": 5.604358502148557, + "grad_norm": 0.1925734579563141, + "learning_rate": 4.267500013498655e-05, + "loss": 1.7295, + "step": 18259 + }, + { + "epoch": 5.604665438919582, + "grad_norm": 0.22216086089611053, + "learning_rate": 4.267008324033787e-05, + "loss": 1.6844, + "step": 18260 + }, + { + "epoch": 5.604972375690608, + "grad_norm": 0.20293502509593964, + "learning_rate": 4.26651664181277e-05, + "loss": 1.7065, + "step": 18261 + }, + { + "epoch": 5.605279312461633, + "grad_norm": 0.21269507706165314, + "learning_rate": 4.266024966840466e-05, + "loss": 1.7573, + "step": 18262 + }, + { + "epoch": 5.605586249232658, + "grad_norm": 0.23574227094650269, + "learning_rate": 4.2655332991217334e-05, + "loss": 1.7625, + "step": 18263 + }, + { + "epoch": 5.605893186003684, + "grad_norm": 0.1875103861093521, + "learning_rate": 4.265041638661433e-05, + "loss": 1.7266, + "step": 18264 + }, + { + "epoch": 5.606200122774708, + "grad_norm": 0.20348483324050903, + "learning_rate": 4.264549985464421e-05, + "loss": 1.731, + "step": 18265 + }, 
+ { + "epoch": 5.606507059545733, + "grad_norm": 0.2345927655696869, + "learning_rate": 4.264058339535556e-05, + "loss": 1.7809, + "step": 18266 + }, + { + "epoch": 5.606813996316759, + "grad_norm": 0.21142496168613434, + "learning_rate": 4.2635667008796985e-05, + "loss": 1.7362, + "step": 18267 + }, + { + "epoch": 5.607120933087784, + "grad_norm": 0.19670210778713226, + "learning_rate": 4.263075069501705e-05, + "loss": 1.7029, + "step": 18268 + }, + { + "epoch": 5.607427869858809, + "grad_norm": 0.20985090732574463, + "learning_rate": 4.262583445406439e-05, + "loss": 1.7478, + "step": 18269 + }, + { + "epoch": 5.607734806629834, + "grad_norm": 0.20972272753715515, + "learning_rate": 4.262091828598752e-05, + "loss": 1.7561, + "step": 18270 + }, + { + "epoch": 5.608041743400859, + "grad_norm": 0.20006676018238068, + "learning_rate": 4.261600219083509e-05, + "loss": 1.7584, + "step": 18271 + }, + { + "epoch": 5.6083486801718845, + "grad_norm": 0.21590086817741394, + "learning_rate": 4.2611086168655635e-05, + "loss": 1.7405, + "step": 18272 + }, + { + "epoch": 5.60865561694291, + "grad_norm": 0.19330906867980957, + "learning_rate": 4.260617021949776e-05, + "loss": 1.6797, + "step": 18273 + }, + { + "epoch": 5.608962553713935, + "grad_norm": 0.1955050528049469, + "learning_rate": 4.260125434341004e-05, + "loss": 1.7174, + "step": 18274 + }, + { + "epoch": 5.6092694904849605, + "grad_norm": 0.2117784321308136, + "learning_rate": 4.2596338540441086e-05, + "loss": 1.743, + "step": 18275 + }, + { + "epoch": 5.609576427255985, + "grad_norm": 0.21788950264453888, + "learning_rate": 4.2591422810639425e-05, + "loss": 1.7603, + "step": 18276 + }, + { + "epoch": 5.60988336402701, + "grad_norm": 0.2092670351266861, + "learning_rate": 4.258650715405369e-05, + "loss": 1.7379, + "step": 18277 + }, + { + "epoch": 5.610190300798036, + "grad_norm": 0.1941552758216858, + "learning_rate": 4.2581591570732414e-05, + "loss": 1.7547, + "step": 18278 + }, + { + "epoch": 5.610497237569061, + 
"grad_norm": 0.21306751668453217, + "learning_rate": 4.2576676060724215e-05, + "loss": 1.7284, + "step": 18279 + }, + { + "epoch": 5.610804174340086, + "grad_norm": 0.18618693947792053, + "learning_rate": 4.2571760624077635e-05, + "loss": 1.7268, + "step": 18280 + }, + { + "epoch": 5.611111111111111, + "grad_norm": 0.21530354022979736, + "learning_rate": 4.256684526084129e-05, + "loss": 1.7036, + "step": 18281 + }, + { + "epoch": 5.611418047882136, + "grad_norm": 0.23363792896270752, + "learning_rate": 4.256192997106375e-05, + "loss": 1.7797, + "step": 18282 + }, + { + "epoch": 5.611724984653161, + "grad_norm": 0.1786416620016098, + "learning_rate": 4.2557014754793544e-05, + "loss": 1.7008, + "step": 18283 + }, + { + "epoch": 5.612031921424187, + "grad_norm": 0.2042730301618576, + "learning_rate": 4.25520996120793e-05, + "loss": 1.7667, + "step": 18284 + }, + { + "epoch": 5.612338858195212, + "grad_norm": 0.2275264412164688, + "learning_rate": 4.2547184542969554e-05, + "loss": 1.8277, + "step": 18285 + }, + { + "epoch": 5.612645794966237, + "grad_norm": 0.21252553164958954, + "learning_rate": 4.2542269547512925e-05, + "loss": 1.7272, + "step": 18286 + }, + { + "epoch": 5.612952731737262, + "grad_norm": 0.20384398102760315, + "learning_rate": 4.2537354625757934e-05, + "loss": 1.6707, + "step": 18287 + }, + { + "epoch": 5.613259668508287, + "grad_norm": 0.19805553555488586, + "learning_rate": 4.253243977775321e-05, + "loss": 1.7443, + "step": 18288 + }, + { + "epoch": 5.6135666052793125, + "grad_norm": 0.20447707176208496, + "learning_rate": 4.2527525003547256e-05, + "loss": 1.7392, + "step": 18289 + }, + { + "epoch": 5.613873542050338, + "grad_norm": 0.21025662124156952, + "learning_rate": 4.25226103031887e-05, + "loss": 1.7856, + "step": 18290 + }, + { + "epoch": 5.614180478821363, + "grad_norm": 0.2131013125181198, + "learning_rate": 4.2517695676726085e-05, + "loss": 1.7521, + "step": 18291 + }, + { + "epoch": 5.614487415592388, + "grad_norm": 0.2511558532714844, 
+ "learning_rate": 4.2512781124208e-05, + "loss": 1.6873, + "step": 18292 + }, + { + "epoch": 5.614794352363413, + "grad_norm": 0.19668610394001007, + "learning_rate": 4.2507866645682984e-05, + "loss": 1.6808, + "step": 18293 + }, + { + "epoch": 5.615101289134438, + "grad_norm": 0.22313621640205383, + "learning_rate": 4.2502952241199637e-05, + "loss": 1.7794, + "step": 18294 + }, + { + "epoch": 5.615408225905464, + "grad_norm": 0.2053089439868927, + "learning_rate": 4.249803791080649e-05, + "loss": 1.7405, + "step": 18295 + }, + { + "epoch": 5.615715162676489, + "grad_norm": 0.2052931934595108, + "learning_rate": 4.249312365455215e-05, + "loss": 1.6698, + "step": 18296 + }, + { + "epoch": 5.616022099447514, + "grad_norm": 0.223783478140831, + "learning_rate": 4.248820947248515e-05, + "loss": 1.7696, + "step": 18297 + }, + { + "epoch": 5.616329036218539, + "grad_norm": 0.3424001932144165, + "learning_rate": 4.248329536465407e-05, + "loss": 1.7724, + "step": 18298 + }, + { + "epoch": 5.616635972989564, + "grad_norm": 0.25015103816986084, + "learning_rate": 4.247838133110749e-05, + "loss": 1.7188, + "step": 18299 + }, + { + "epoch": 5.616942909760589, + "grad_norm": 0.239765465259552, + "learning_rate": 4.247346737189392e-05, + "loss": 1.695, + "step": 18300 + }, + { + "epoch": 5.617249846531615, + "grad_norm": 0.42259401082992554, + "learning_rate": 4.246855348706197e-05, + "loss": 1.6882, + "step": 18301 + }, + { + "epoch": 5.617556783302639, + "grad_norm": 0.2985959053039551, + "learning_rate": 4.246363967666018e-05, + "loss": 1.7236, + "step": 18302 + }, + { + "epoch": 5.6178637200736645, + "grad_norm": 0.22437956929206848, + "learning_rate": 4.245872594073714e-05, + "loss": 1.7158, + "step": 18303 + }, + { + "epoch": 5.61817065684469, + "grad_norm": 0.3165835440158844, + "learning_rate": 4.245381227934138e-05, + "loss": 1.7543, + "step": 18304 + }, + { + "epoch": 5.618477593615715, + "grad_norm": 0.2565564513206482, + "learning_rate": 4.244889869252148e-05, + 
"loss": 1.7863, + "step": 18305 + }, + { + "epoch": 5.6187845303867405, + "grad_norm": 0.25741446018218994, + "learning_rate": 4.244398518032597e-05, + "loss": 1.721, + "step": 18306 + }, + { + "epoch": 5.619091467157766, + "grad_norm": 0.26492297649383545, + "learning_rate": 4.2439071742803435e-05, + "loss": 1.7697, + "step": 18307 + }, + { + "epoch": 5.61939840392879, + "grad_norm": 0.2086823433637619, + "learning_rate": 4.243415838000243e-05, + "loss": 1.7072, + "step": 18308 + }, + { + "epoch": 5.619705340699816, + "grad_norm": 0.26784422993659973, + "learning_rate": 4.24292450919715e-05, + "loss": 1.7826, + "step": 18309 + }, + { + "epoch": 5.620012277470841, + "grad_norm": 0.21774251759052277, + "learning_rate": 4.242433187875921e-05, + "loss": 1.7204, + "step": 18310 + }, + { + "epoch": 5.620319214241866, + "grad_norm": 0.29547446966171265, + "learning_rate": 4.241941874041412e-05, + "loss": 1.7303, + "step": 18311 + }, + { + "epoch": 5.620626151012892, + "grad_norm": 0.20278988778591156, + "learning_rate": 4.241450567698476e-05, + "loss": 1.692, + "step": 18312 + }, + { + "epoch": 5.620933087783916, + "grad_norm": 0.2084289938211441, + "learning_rate": 4.240959268851971e-05, + "loss": 1.7069, + "step": 18313 + }, + { + "epoch": 5.621240024554941, + "grad_norm": 0.19901904463768005, + "learning_rate": 4.240467977506752e-05, + "loss": 1.6798, + "step": 18314 + }, + { + "epoch": 5.621546961325967, + "grad_norm": 0.24629411101341248, + "learning_rate": 4.2399766936676735e-05, + "loss": 1.775, + "step": 18315 + }, + { + "epoch": 5.621853898096992, + "grad_norm": 0.2532403767108917, + "learning_rate": 4.239485417339591e-05, + "loss": 1.7669, + "step": 18316 + }, + { + "epoch": 5.622160834868017, + "grad_norm": 0.22495722770690918, + "learning_rate": 4.2389941485273576e-05, + "loss": 1.7772, + "step": 18317 + }, + { + "epoch": 5.622467771639043, + "grad_norm": 0.2789733111858368, + "learning_rate": 4.2385028872358316e-05, + "loss": 1.751, + "step": 18318 + }, + { 
+ "epoch": 5.622774708410067, + "grad_norm": 0.2266954481601715, + "learning_rate": 4.238011633469866e-05, + "loss": 1.7213, + "step": 18319 + }, + { + "epoch": 5.6230816451810925, + "grad_norm": 0.2163502722978592, + "learning_rate": 4.237520387234316e-05, + "loss": 1.7781, + "step": 18320 + }, + { + "epoch": 5.623388581952118, + "grad_norm": 0.25249144434928894, + "learning_rate": 4.237029148534036e-05, + "loss": 1.7293, + "step": 18321 + }, + { + "epoch": 5.623695518723143, + "grad_norm": 0.2320011854171753, + "learning_rate": 4.2365379173738826e-05, + "loss": 1.7909, + "step": 18322 + }, + { + "epoch": 5.6240024554941686, + "grad_norm": 0.22074681520462036, + "learning_rate": 4.2360466937587074e-05, + "loss": 1.743, + "step": 18323 + }, + { + "epoch": 5.624309392265193, + "grad_norm": 0.20864775776863098, + "learning_rate": 4.235555477693368e-05, + "loss": 1.726, + "step": 18324 + }, + { + "epoch": 5.624616329036218, + "grad_norm": 0.24547792971134186, + "learning_rate": 4.235064269182716e-05, + "loss": 1.7646, + "step": 18325 + }, + { + "epoch": 5.624923265807244, + "grad_norm": 0.29965806007385254, + "learning_rate": 4.234573068231607e-05, + "loss": 1.7789, + "step": 18326 + }, + { + "epoch": 5.625230202578269, + "grad_norm": 0.20844583213329315, + "learning_rate": 4.234081874844896e-05, + "loss": 1.7007, + "step": 18327 + }, + { + "epoch": 5.625537139349294, + "grad_norm": 0.2455398142337799, + "learning_rate": 4.2335906890274385e-05, + "loss": 1.7094, + "step": 18328 + }, + { + "epoch": 5.62584407612032, + "grad_norm": 0.17839518189430237, + "learning_rate": 4.233099510784085e-05, + "loss": 1.6849, + "step": 18329 + }, + { + "epoch": 5.626151012891344, + "grad_norm": 0.20219004154205322, + "learning_rate": 4.232608340119693e-05, + "loss": 1.716, + "step": 18330 + }, + { + "epoch": 5.6264579496623695, + "grad_norm": 0.23570619523525238, + "learning_rate": 4.232117177039114e-05, + "loss": 1.7622, + "step": 18331 + }, + { + "epoch": 5.626764886433395, + 
"grad_norm": 0.23534397780895233, + "learning_rate": 4.231626021547204e-05, + "loss": 1.7758, + "step": 18332 + }, + { + "epoch": 5.62707182320442, + "grad_norm": 0.2177352011203766, + "learning_rate": 4.231134873648817e-05, + "loss": 1.7102, + "step": 18333 + }, + { + "epoch": 5.627378759975445, + "grad_norm": 0.22886058688163757, + "learning_rate": 4.230643733348803e-05, + "loss": 1.7766, + "step": 18334 + }, + { + "epoch": 5.62768569674647, + "grad_norm": 0.20723696053028107, + "learning_rate": 4.2301526006520215e-05, + "loss": 1.7287, + "step": 18335 + }, + { + "epoch": 5.627992633517495, + "grad_norm": 0.18612104654312134, + "learning_rate": 4.229661475563321e-05, + "loss": 1.7255, + "step": 18336 + }, + { + "epoch": 5.628299570288521, + "grad_norm": 0.26456236839294434, + "learning_rate": 4.229170358087558e-05, + "loss": 1.7388, + "step": 18337 + }, + { + "epoch": 5.628606507059546, + "grad_norm": 0.25253555178642273, + "learning_rate": 4.2286792482295845e-05, + "loss": 1.7031, + "step": 18338 + }, + { + "epoch": 5.628913443830571, + "grad_norm": 0.23093348741531372, + "learning_rate": 4.228188145994257e-05, + "loss": 1.8032, + "step": 18339 + }, + { + "epoch": 5.629220380601596, + "grad_norm": 0.24142487347126007, + "learning_rate": 4.227697051386424e-05, + "loss": 1.6621, + "step": 18340 + }, + { + "epoch": 5.629527317372621, + "grad_norm": 0.2883392572402954, + "learning_rate": 4.227205964410944e-05, + "loss": 1.7125, + "step": 18341 + }, + { + "epoch": 5.629834254143646, + "grad_norm": 0.22670713067054749, + "learning_rate": 4.226714885072665e-05, + "loss": 1.7659, + "step": 18342 + }, + { + "epoch": 5.630141190914672, + "grad_norm": 0.2795337438583374, + "learning_rate": 4.226223813376444e-05, + "loss": 1.7559, + "step": 18343 + }, + { + "epoch": 5.630448127685697, + "grad_norm": 0.2513083219528198, + "learning_rate": 4.225732749327132e-05, + "loss": 1.6969, + "step": 18344 + }, + { + "epoch": 5.6307550644567215, + "grad_norm": 0.24588467180728912, + 
"learning_rate": 4.225241692929585e-05, + "loss": 1.7724, + "step": 18345 + }, + { + "epoch": 5.631062001227747, + "grad_norm": 0.41726353764533997, + "learning_rate": 4.224750644188651e-05, + "loss": 1.7308, + "step": 18346 + }, + { + "epoch": 5.631368937998772, + "grad_norm": 0.2512385845184326, + "learning_rate": 4.2242596031091886e-05, + "loss": 1.7068, + "step": 18347 + }, + { + "epoch": 5.6316758747697975, + "grad_norm": 0.3077464997768402, + "learning_rate": 4.223768569696044e-05, + "loss": 1.7383, + "step": 18348 + }, + { + "epoch": 5.631982811540823, + "grad_norm": 0.3460720479488373, + "learning_rate": 4.2232775439540756e-05, + "loss": 1.7317, + "step": 18349 + }, + { + "epoch": 5.632289748311848, + "grad_norm": 0.24827539920806885, + "learning_rate": 4.222786525888134e-05, + "loss": 1.6871, + "step": 18350 + }, + { + "epoch": 5.632596685082873, + "grad_norm": 0.24851584434509277, + "learning_rate": 4.22229551550307e-05, + "loss": 1.7058, + "step": 18351 + }, + { + "epoch": 5.632903621853898, + "grad_norm": 0.31132519245147705, + "learning_rate": 4.2218045128037396e-05, + "loss": 1.7523, + "step": 18352 + }, + { + "epoch": 5.633210558624923, + "grad_norm": 0.3104027807712555, + "learning_rate": 4.2213135177949906e-05, + "loss": 1.7669, + "step": 18353 + }, + { + "epoch": 5.633517495395949, + "grad_norm": 0.31351104378700256, + "learning_rate": 4.2208225304816795e-05, + "loss": 1.7031, + "step": 18354 + }, + { + "epoch": 5.633824432166974, + "grad_norm": 0.3217851221561432, + "learning_rate": 4.2203315508686555e-05, + "loss": 1.7694, + "step": 18355 + }, + { + "epoch": 5.634131368937998, + "grad_norm": 0.22287796437740326, + "learning_rate": 4.2198405789607745e-05, + "loss": 1.7742, + "step": 18356 + }, + { + "epoch": 5.634438305709024, + "grad_norm": 0.20288340747356415, + "learning_rate": 4.219349614762883e-05, + "loss": 1.7113, + "step": 18357 + }, + { + "epoch": 5.634745242480049, + "grad_norm": 0.19823449850082397, + "learning_rate": 
4.218858658279839e-05, + "loss": 1.7433, + "step": 18358 + }, + { + "epoch": 5.635052179251074, + "grad_norm": 0.2756347358226776, + "learning_rate": 4.2183677095164895e-05, + "loss": 1.8278, + "step": 18359 + }, + { + "epoch": 5.6353591160221, + "grad_norm": 0.2303706556558609, + "learning_rate": 4.2178767684776895e-05, + "loss": 1.6943, + "step": 18360 + }, + { + "epoch": 5.635666052793125, + "grad_norm": 0.25089216232299805, + "learning_rate": 4.217385835168288e-05, + "loss": 1.6562, + "step": 18361 + }, + { + "epoch": 5.6359729895641495, + "grad_norm": 0.3013486862182617, + "learning_rate": 4.216894909593141e-05, + "loss": 1.7323, + "step": 18362 + }, + { + "epoch": 5.636279926335175, + "grad_norm": 0.19471928477287292, + "learning_rate": 4.2164039917570956e-05, + "loss": 1.7301, + "step": 18363 + }, + { + "epoch": 5.6365868631062, + "grad_norm": 0.3257733881473541, + "learning_rate": 4.2159130816650075e-05, + "loss": 1.7522, + "step": 18364 + }, + { + "epoch": 5.6368937998772255, + "grad_norm": 0.3065868020057678, + "learning_rate": 4.215422179321723e-05, + "loss": 1.7077, + "step": 18365 + }, + { + "epoch": 5.637200736648251, + "grad_norm": 0.20643819868564606, + "learning_rate": 4.214931284732098e-05, + "loss": 1.8033, + "step": 18366 + }, + { + "epoch": 5.637507673419275, + "grad_norm": 0.23551981151103973, + "learning_rate": 4.2144403979009826e-05, + "loss": 1.7391, + "step": 18367 + }, + { + "epoch": 5.637814610190301, + "grad_norm": 0.20602314174175262, + "learning_rate": 4.2139495188332265e-05, + "loss": 1.7593, + "step": 18368 + }, + { + "epoch": 5.638121546961326, + "grad_norm": 0.27911239862442017, + "learning_rate": 4.2134586475336834e-05, + "loss": 1.7212, + "step": 18369 + }, + { + "epoch": 5.638428483732351, + "grad_norm": 0.2700496017932892, + "learning_rate": 4.212967784007201e-05, + "loss": 1.7755, + "step": 18370 + }, + { + "epoch": 5.638735420503377, + "grad_norm": 0.24988985061645508, + "learning_rate": 4.2124769282586334e-05, + "loss": 
1.7364, + "step": 18371 + }, + { + "epoch": 5.639042357274402, + "grad_norm": 0.20491284132003784, + "learning_rate": 4.211986080292829e-05, + "loss": 1.7477, + "step": 18372 + }, + { + "epoch": 5.639349294045426, + "grad_norm": 0.24953459203243256, + "learning_rate": 4.211495240114643e-05, + "loss": 1.7712, + "step": 18373 + }, + { + "epoch": 5.639656230816452, + "grad_norm": 0.2028491199016571, + "learning_rate": 4.2110044077289204e-05, + "loss": 1.701, + "step": 18374 + }, + { + "epoch": 5.639963167587477, + "grad_norm": 0.22320568561553955, + "learning_rate": 4.210513583140517e-05, + "loss": 1.7818, + "step": 18375 + }, + { + "epoch": 5.640270104358502, + "grad_norm": 0.22680947184562683, + "learning_rate": 4.210022766354278e-05, + "loss": 1.7631, + "step": 18376 + }, + { + "epoch": 5.640577041129527, + "grad_norm": 0.20724014937877655, + "learning_rate": 4.2095319573750596e-05, + "loss": 1.7757, + "step": 18377 + }, + { + "epoch": 5.640883977900552, + "grad_norm": 0.21785953640937805, + "learning_rate": 4.209041156207708e-05, + "loss": 1.7161, + "step": 18378 + }, + { + "epoch": 5.6411909146715775, + "grad_norm": 0.21751803159713745, + "learning_rate": 4.208550362857078e-05, + "loss": 1.7449, + "step": 18379 + }, + { + "epoch": 5.641497851442603, + "grad_norm": 0.1765962839126587, + "learning_rate": 4.208059577328014e-05, + "loss": 1.7191, + "step": 18380 + }, + { + "epoch": 5.641804788213628, + "grad_norm": 0.22720913589000702, + "learning_rate": 4.2075687996253724e-05, + "loss": 1.7037, + "step": 18381 + }, + { + "epoch": 5.6421117249846535, + "grad_norm": 0.23589655756950378, + "learning_rate": 4.2070780297539976e-05, + "loss": 1.8147, + "step": 18382 + }, + { + "epoch": 5.642418661755678, + "grad_norm": 0.21187056601047516, + "learning_rate": 4.2065872677187435e-05, + "loss": 1.7655, + "step": 18383 + }, + { + "epoch": 5.642725598526703, + "grad_norm": 0.24153946340084076, + "learning_rate": 4.2060965135244606e-05, + "loss": 1.7841, + "step": 18384 + }, + 
{ + "epoch": 5.643032535297729, + "grad_norm": 0.2059229612350464, + "learning_rate": 4.205605767175995e-05, + "loss": 1.6718, + "step": 18385 + }, + { + "epoch": 5.643339472068754, + "grad_norm": 0.20235973596572876, + "learning_rate": 4.205115028678201e-05, + "loss": 1.6931, + "step": 18386 + }, + { + "epoch": 5.643646408839779, + "grad_norm": 0.25149911642074585, + "learning_rate": 4.204624298035924e-05, + "loss": 1.7465, + "step": 18387 + }, + { + "epoch": 5.643953345610804, + "grad_norm": 0.2050812691450119, + "learning_rate": 4.204133575254017e-05, + "loss": 1.7147, + "step": 18388 + }, + { + "epoch": 5.644260282381829, + "grad_norm": 0.20906420052051544, + "learning_rate": 4.2036428603373274e-05, + "loss": 1.6762, + "step": 18389 + }, + { + "epoch": 5.644567219152854, + "grad_norm": 0.20150595903396606, + "learning_rate": 4.2031521532907075e-05, + "loss": 1.678, + "step": 18390 + }, + { + "epoch": 5.64487415592388, + "grad_norm": 0.2141568511724472, + "learning_rate": 4.202661454119004e-05, + "loss": 1.7274, + "step": 18391 + }, + { + "epoch": 5.645181092694905, + "grad_norm": 0.2641741931438446, + "learning_rate": 4.202170762827069e-05, + "loss": 1.7975, + "step": 18392 + }, + { + "epoch": 5.64548802946593, + "grad_norm": 0.22928468883037567, + "learning_rate": 4.201680079419747e-05, + "loss": 1.7687, + "step": 18393 + }, + { + "epoch": 5.645794966236955, + "grad_norm": 0.22713731229305267, + "learning_rate": 4.2011894039018925e-05, + "loss": 1.7475, + "step": 18394 + }, + { + "epoch": 5.64610190300798, + "grad_norm": 0.25602981448173523, + "learning_rate": 4.200698736278351e-05, + "loss": 1.7356, + "step": 18395 + }, + { + "epoch": 5.6464088397790055, + "grad_norm": 0.2619759738445282, + "learning_rate": 4.200208076553975e-05, + "loss": 1.7334, + "step": 18396 + }, + { + "epoch": 5.646715776550031, + "grad_norm": 0.24756783246994019, + "learning_rate": 4.19971742473361e-05, + "loss": 1.7253, + "step": 18397 + }, + { + "epoch": 5.647022713321056, + 
"grad_norm": 0.2068249136209488, + "learning_rate": 4.199226780822109e-05, + "loss": 1.7246, + "step": 18398 + }, + { + "epoch": 5.647329650092081, + "grad_norm": 0.23219087719917297, + "learning_rate": 4.1987361448243165e-05, + "loss": 1.7388, + "step": 18399 + }, + { + "epoch": 5.647636586863106, + "grad_norm": 0.2051403522491455, + "learning_rate": 4.198245516745082e-05, + "loss": 1.7775, + "step": 18400 + }, + { + "epoch": 5.647943523634131, + "grad_norm": 0.26408639550209045, + "learning_rate": 4.1977548965892575e-05, + "loss": 1.8069, + "step": 18401 + }, + { + "epoch": 5.648250460405157, + "grad_norm": 0.2104891538619995, + "learning_rate": 4.197264284361687e-05, + "loss": 1.7335, + "step": 18402 + }, + { + "epoch": 5.648557397176182, + "grad_norm": 0.23963849246501923, + "learning_rate": 4.196773680067224e-05, + "loss": 1.7254, + "step": 18403 + }, + { + "epoch": 5.648864333947207, + "grad_norm": 0.2770128846168518, + "learning_rate": 4.1962830837107117e-05, + "loss": 1.7848, + "step": 18404 + }, + { + "epoch": 5.649171270718232, + "grad_norm": 0.23342710733413696, + "learning_rate": 4.195792495297002e-05, + "loss": 1.7818, + "step": 18405 + }, + { + "epoch": 5.649478207489257, + "grad_norm": 0.23835061490535736, + "learning_rate": 4.195301914830941e-05, + "loss": 1.7453, + "step": 18406 + }, + { + "epoch": 5.649785144260282, + "grad_norm": 0.21896767616271973, + "learning_rate": 4.194811342317381e-05, + "loss": 1.7205, + "step": 18407 + }, + { + "epoch": 5.650092081031308, + "grad_norm": 0.20222818851470947, + "learning_rate": 4.1943207777611646e-05, + "loss": 1.6833, + "step": 18408 + }, + { + "epoch": 5.650399017802332, + "grad_norm": 0.2182089239358902, + "learning_rate": 4.193830221167146e-05, + "loss": 1.7296, + "step": 18409 + }, + { + "epoch": 5.650705954573358, + "grad_norm": 0.19981688261032104, + "learning_rate": 4.1933396725401655e-05, + "loss": 1.7327, + "step": 18410 + }, + { + "epoch": 5.651012891344383, + "grad_norm": 0.23925067484378815, + 
"learning_rate": 4.192849131885077e-05, + "loss": 1.7545, + "step": 18411 + }, + { + "epoch": 5.651319828115408, + "grad_norm": 0.21967993676662445, + "learning_rate": 4.192358599206725e-05, + "loss": 1.6973, + "step": 18412 + }, + { + "epoch": 5.651626764886434, + "grad_norm": 0.2273840606212616, + "learning_rate": 4.1918680745099614e-05, + "loss": 1.8229, + "step": 18413 + }, + { + "epoch": 5.651933701657459, + "grad_norm": 0.26950231194496155, + "learning_rate": 4.1913775577996286e-05, + "loss": 1.7666, + "step": 18414 + }, + { + "epoch": 5.652240638428484, + "grad_norm": 0.26608848571777344, + "learning_rate": 4.190887049080579e-05, + "loss": 1.8279, + "step": 18415 + }, + { + "epoch": 5.652547575199509, + "grad_norm": 0.20856785774230957, + "learning_rate": 4.190396548357658e-05, + "loss": 1.7224, + "step": 18416 + }, + { + "epoch": 5.652854511970534, + "grad_norm": 0.2894255816936493, + "learning_rate": 4.18990605563571e-05, + "loss": 1.7308, + "step": 18417 + }, + { + "epoch": 5.653161448741559, + "grad_norm": 0.2047591209411621, + "learning_rate": 4.189415570919588e-05, + "loss": 1.758, + "step": 18418 + }, + { + "epoch": 5.653468385512585, + "grad_norm": 0.37161269783973694, + "learning_rate": 4.1889250942141346e-05, + "loss": 1.7926, + "step": 18419 + }, + { + "epoch": 5.653775322283609, + "grad_norm": 0.37338340282440186, + "learning_rate": 4.1884346255242e-05, + "loss": 1.7491, + "step": 18420 + }, + { + "epoch": 5.6540822590546345, + "grad_norm": 0.24279838800430298, + "learning_rate": 4.187944164854629e-05, + "loss": 1.7103, + "step": 18421 + }, + { + "epoch": 5.65438919582566, + "grad_norm": 0.219639852643013, + "learning_rate": 4.18745371221027e-05, + "loss": 1.7824, + "step": 18422 + }, + { + "epoch": 5.654696132596685, + "grad_norm": 0.22248409688472748, + "learning_rate": 4.186963267595969e-05, + "loss": 1.8098, + "step": 18423 + }, + { + "epoch": 5.6550030693677105, + "grad_norm": 0.2115657478570938, + "learning_rate": 4.1864728310165755e-05, + 
"loss": 1.72, + "step": 18424 + }, + { + "epoch": 5.655310006138736, + "grad_norm": 0.19723005592823029, + "learning_rate": 4.1859824024769325e-05, + "loss": 1.6818, + "step": 18425 + }, + { + "epoch": 5.65561694290976, + "grad_norm": 0.1828317642211914, + "learning_rate": 4.185491981981891e-05, + "loss": 1.7243, + "step": 18426 + }, + { + "epoch": 5.655923879680786, + "grad_norm": 0.271781861782074, + "learning_rate": 4.185001569536292e-05, + "loss": 1.7688, + "step": 18427 + }, + { + "epoch": 5.656230816451811, + "grad_norm": 0.3140811324119568, + "learning_rate": 4.184511165144986e-05, + "loss": 1.7319, + "step": 18428 + }, + { + "epoch": 5.656537753222836, + "grad_norm": 0.20013047754764557, + "learning_rate": 4.184020768812818e-05, + "loss": 1.7104, + "step": 18429 + }, + { + "epoch": 5.656844689993862, + "grad_norm": 0.2615044414997101, + "learning_rate": 4.183530380544638e-05, + "loss": 1.7314, + "step": 18430 + }, + { + "epoch": 5.657151626764886, + "grad_norm": 0.2645856440067291, + "learning_rate": 4.183040000345287e-05, + "loss": 1.7431, + "step": 18431 + }, + { + "epoch": 5.657458563535911, + "grad_norm": 0.1916145384311676, + "learning_rate": 4.182549628219615e-05, + "loss": 1.7013, + "step": 18432 + }, + { + "epoch": 5.657765500306937, + "grad_norm": 0.2647114396095276, + "learning_rate": 4.182059264172466e-05, + "loss": 1.7278, + "step": 18433 + }, + { + "epoch": 5.658072437077962, + "grad_norm": 0.20201756060123444, + "learning_rate": 4.1815689082086854e-05, + "loss": 1.7065, + "step": 18434 + }, + { + "epoch": 5.658379373848987, + "grad_norm": 0.23892022669315338, + "learning_rate": 4.181078560333123e-05, + "loss": 1.7365, + "step": 18435 + }, + { + "epoch": 5.658686310620013, + "grad_norm": 0.3125975728034973, + "learning_rate": 4.18058822055062e-05, + "loss": 1.7152, + "step": 18436 + }, + { + "epoch": 5.658993247391037, + "grad_norm": 0.18924804031848907, + "learning_rate": 4.180097888866027e-05, + "loss": 1.7763, + "step": 18437 + }, + { + 
"epoch": 5.6593001841620625, + "grad_norm": 0.28476929664611816, + "learning_rate": 4.1796075652841845e-05, + "loss": 1.7517, + "step": 18438 + }, + { + "epoch": 5.659607120933088, + "grad_norm": 0.30616337060928345, + "learning_rate": 4.1791172498099416e-05, + "loss": 1.7446, + "step": 18439 + }, + { + "epoch": 5.659914057704113, + "grad_norm": 0.3219330608844757, + "learning_rate": 4.1786269424481426e-05, + "loss": 1.8374, + "step": 18440 + }, + { + "epoch": 5.6602209944751385, + "grad_norm": 0.34074151515960693, + "learning_rate": 4.1781366432036364e-05, + "loss": 1.7915, + "step": 18441 + }, + { + "epoch": 5.660527931246163, + "grad_norm": 0.2321610003709793, + "learning_rate": 4.177646352081263e-05, + "loss": 1.7361, + "step": 18442 + }, + { + "epoch": 5.660834868017188, + "grad_norm": 0.34283575415611267, + "learning_rate": 4.1771560690858716e-05, + "loss": 1.6859, + "step": 18443 + }, + { + "epoch": 5.661141804788214, + "grad_norm": 0.32274290919303894, + "learning_rate": 4.1766657942223055e-05, + "loss": 1.7376, + "step": 18444 + }, + { + "epoch": 5.661448741559239, + "grad_norm": 0.23960906267166138, + "learning_rate": 4.1761755274954105e-05, + "loss": 1.7198, + "step": 18445 + }, + { + "epoch": 5.661755678330264, + "grad_norm": 0.2622305154800415, + "learning_rate": 4.175685268910031e-05, + "loss": 1.6997, + "step": 18446 + }, + { + "epoch": 5.66206261510129, + "grad_norm": 0.19836951792240143, + "learning_rate": 4.1751950184710157e-05, + "loss": 1.6612, + "step": 18447 + }, + { + "epoch": 5.662369551872314, + "grad_norm": 0.29541507363319397, + "learning_rate": 4.174704776183204e-05, + "loss": 1.7606, + "step": 18448 + }, + { + "epoch": 5.662676488643339, + "grad_norm": 0.21632203459739685, + "learning_rate": 4.174214542051445e-05, + "loss": 1.7108, + "step": 18449 + }, + { + "epoch": 5.662983425414365, + "grad_norm": 0.2851164638996124, + "learning_rate": 4.173724316080582e-05, + "loss": 1.747, + "step": 18450 + }, + { + "epoch": 5.66329036218539, + 
"grad_norm": 0.30293309688568115, + "learning_rate": 4.173234098275458e-05, + "loss": 1.7549, + "step": 18451 + }, + { + "epoch": 5.6635972989564145, + "grad_norm": 0.2131963074207306, + "learning_rate": 4.172743888640921e-05, + "loss": 1.7804, + "step": 18452 + }, + { + "epoch": 5.66390423572744, + "grad_norm": 0.234910249710083, + "learning_rate": 4.172253687181812e-05, + "loss": 1.7149, + "step": 18453 + }, + { + "epoch": 5.664211172498465, + "grad_norm": 0.21238654851913452, + "learning_rate": 4.171763493902979e-05, + "loss": 1.7272, + "step": 18454 + }, + { + "epoch": 5.6645181092694905, + "grad_norm": 0.20571236312389374, + "learning_rate": 4.171273308809263e-05, + "loss": 1.713, + "step": 18455 + }, + { + "epoch": 5.664825046040516, + "grad_norm": 0.24867361783981323, + "learning_rate": 4.1707831319055104e-05, + "loss": 1.682, + "step": 18456 + }, + { + "epoch": 5.665131982811541, + "grad_norm": 0.20556440949440002, + "learning_rate": 4.170292963196564e-05, + "loss": 1.7126, + "step": 18457 + }, + { + "epoch": 5.665438919582566, + "grad_norm": 0.26431065797805786, + "learning_rate": 4.169802802687271e-05, + "loss": 1.8142, + "step": 18458 + }, + { + "epoch": 5.665745856353591, + "grad_norm": 0.26041486859321594, + "learning_rate": 4.169312650382471e-05, + "loss": 1.7206, + "step": 18459 + }, + { + "epoch": 5.666052793124616, + "grad_norm": 0.2190525084733963, + "learning_rate": 4.1688225062870126e-05, + "loss": 1.787, + "step": 18460 + }, + { + "epoch": 5.666359729895642, + "grad_norm": 0.24726425111293793, + "learning_rate": 4.1683323704057354e-05, + "loss": 1.7677, + "step": 18461 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.22206442058086395, + "learning_rate": 4.167842242743486e-05, + "loss": 1.73, + "step": 18462 + }, + { + "epoch": 5.666973603437691, + "grad_norm": 0.22501195967197418, + "learning_rate": 4.167352123305108e-05, + "loss": 1.7213, + "step": 18463 + }, + { + "epoch": 5.667280540208717, + "grad_norm": 0.26164770126342773, + 
"learning_rate": 4.166862012095443e-05, + "loss": 1.7839, + "step": 18464 + }, + { + "epoch": 5.667587476979742, + "grad_norm": 0.19480809569358826, + "learning_rate": 4.166371909119336e-05, + "loss": 1.7562, + "step": 18465 + }, + { + "epoch": 5.667894413750767, + "grad_norm": 0.26677292585372925, + "learning_rate": 4.165881814381632e-05, + "loss": 1.776, + "step": 18466 + }, + { + "epoch": 5.668201350521793, + "grad_norm": 0.22019581496715546, + "learning_rate": 4.165391727887172e-05, + "loss": 1.7575, + "step": 18467 + }, + { + "epoch": 5.668508287292818, + "grad_norm": 0.23851899802684784, + "learning_rate": 4.1649016496407986e-05, + "loss": 1.7346, + "step": 18468 + }, + { + "epoch": 5.6688152240638425, + "grad_norm": 0.3118130564689636, + "learning_rate": 4.1644115796473596e-05, + "loss": 1.7808, + "step": 18469 + }, + { + "epoch": 5.669122160834868, + "grad_norm": 0.22783879935741425, + "learning_rate": 4.163921517911692e-05, + "loss": 1.831, + "step": 18470 + }, + { + "epoch": 5.669429097605893, + "grad_norm": 0.2203773707151413, + "learning_rate": 4.163431464438645e-05, + "loss": 1.7034, + "step": 18471 + }, + { + "epoch": 5.6697360343769185, + "grad_norm": 0.21838103234767914, + "learning_rate": 4.162941419233056e-05, + "loss": 1.7553, + "step": 18472 + }, + { + "epoch": 5.670042971147944, + "grad_norm": 0.18453563749790192, + "learning_rate": 4.162451382299771e-05, + "loss": 1.7139, + "step": 18473 + }, + { + "epoch": 5.670349907918968, + "grad_norm": 0.25308313965797424, + "learning_rate": 4.161961353643633e-05, + "loss": 1.7291, + "step": 18474 + }, + { + "epoch": 5.670656844689994, + "grad_norm": 0.2528827488422394, + "learning_rate": 4.1614713332694845e-05, + "loss": 1.781, + "step": 18475 + }, + { + "epoch": 5.670963781461019, + "grad_norm": 0.24774135649204254, + "learning_rate": 4.160981321182166e-05, + "loss": 1.7808, + "step": 18476 + }, + { + "epoch": 5.671270718232044, + "grad_norm": 0.25225830078125, + "learning_rate": 4.160491317386524e-05, 
+ "loss": 1.739, + "step": 18477 + }, + { + "epoch": 5.67157765500307, + "grad_norm": 0.2095808982849121, + "learning_rate": 4.160001321887397e-05, + "loss": 1.7242, + "step": 18478 + }, + { + "epoch": 5.671884591774095, + "grad_norm": 0.23906216025352478, + "learning_rate": 4.159511334689631e-05, + "loss": 1.7071, + "step": 18479 + }, + { + "epoch": 5.672191528545119, + "grad_norm": 0.21851155161857605, + "learning_rate": 4.159021355798065e-05, + "loss": 1.7171, + "step": 18480 + }, + { + "epoch": 5.672498465316145, + "grad_norm": 0.2005140632390976, + "learning_rate": 4.158531385217544e-05, + "loss": 1.7483, + "step": 18481 + }, + { + "epoch": 5.67280540208717, + "grad_norm": 0.2230832278728485, + "learning_rate": 4.1580414229529074e-05, + "loss": 1.7386, + "step": 18482 + }, + { + "epoch": 5.673112338858195, + "grad_norm": 0.22402967512607574, + "learning_rate": 4.1575514690090014e-05, + "loss": 1.7989, + "step": 18483 + }, + { + "epoch": 5.67341927562922, + "grad_norm": 0.20350080728530884, + "learning_rate": 4.157061523390665e-05, + "loss": 1.6856, + "step": 18484 + }, + { + "epoch": 5.673726212400245, + "grad_norm": 0.2039422243833542, + "learning_rate": 4.15657158610274e-05, + "loss": 1.7262, + "step": 18485 + }, + { + "epoch": 5.6740331491712706, + "grad_norm": 0.20411522686481476, + "learning_rate": 4.156081657150069e-05, + "loss": 1.738, + "step": 18486 + }, + { + "epoch": 5.674340085942296, + "grad_norm": 0.2693086862564087, + "learning_rate": 4.155591736537493e-05, + "loss": 1.731, + "step": 18487 + }, + { + "epoch": 5.674647022713321, + "grad_norm": 0.20745019614696503, + "learning_rate": 4.1551018242698567e-05, + "loss": 1.7138, + "step": 18488 + }, + { + "epoch": 5.6749539594843466, + "grad_norm": 0.22033964097499847, + "learning_rate": 4.1546119203519964e-05, + "loss": 1.8144, + "step": 18489 + }, + { + "epoch": 5.675260896255372, + "grad_norm": 0.22859029471874237, + "learning_rate": 4.154122024788759e-05, + "loss": 1.6724, + "step": 18490 + }, + { 
+ "epoch": 5.675567833026396, + "grad_norm": 0.2226465791463852, + "learning_rate": 4.153632137584982e-05, + "loss": 1.731, + "step": 18491 + }, + { + "epoch": 5.675874769797422, + "grad_norm": 0.19657716155052185, + "learning_rate": 4.1531422587455086e-05, + "loss": 1.6937, + "step": 18492 + }, + { + "epoch": 5.676181706568447, + "grad_norm": 0.23167578876018524, + "learning_rate": 4.152652388275179e-05, + "loss": 1.7444, + "step": 18493 + }, + { + "epoch": 5.676488643339472, + "grad_norm": 0.24468563497066498, + "learning_rate": 4.1521625261788374e-05, + "loss": 1.7173, + "step": 18494 + }, + { + "epoch": 5.676795580110497, + "grad_norm": 0.27125802636146545, + "learning_rate": 4.1516726724613206e-05, + "loss": 1.7424, + "step": 18495 + }, + { + "epoch": 5.677102516881522, + "grad_norm": 0.23816901445388794, + "learning_rate": 4.151182827127473e-05, + "loss": 1.6911, + "step": 18496 + }, + { + "epoch": 5.6774094536525475, + "grad_norm": 0.26058733463287354, + "learning_rate": 4.150692990182133e-05, + "loss": 1.7142, + "step": 18497 + }, + { + "epoch": 5.677716390423573, + "grad_norm": 0.20207929611206055, + "learning_rate": 4.150203161630143e-05, + "loss": 1.7506, + "step": 18498 + }, + { + "epoch": 5.678023327194598, + "grad_norm": 0.259857714176178, + "learning_rate": 4.1497133414763435e-05, + "loss": 1.7181, + "step": 18499 + }, + { + "epoch": 5.6783302639656235, + "grad_norm": 0.2607496380805969, + "learning_rate": 4.149223529725577e-05, + "loss": 1.7829, + "step": 18500 + }, + { + "epoch": 5.678637200736648, + "grad_norm": 0.23265719413757324, + "learning_rate": 4.148733726382681e-05, + "loss": 1.7028, + "step": 18501 + }, + { + "epoch": 5.678944137507673, + "grad_norm": 0.26610276103019714, + "learning_rate": 4.1482439314524964e-05, + "loss": 1.8604, + "step": 18502 + }, + { + "epoch": 5.679251074278699, + "grad_norm": 0.24022582173347473, + "learning_rate": 4.147754144939865e-05, + "loss": 1.7142, + "step": 18503 + }, + { + "epoch": 5.679558011049724, + 
"grad_norm": 0.2849755585193634, + "learning_rate": 4.1472643668496255e-05, + "loss": 1.6956, + "step": 18504 + }, + { + "epoch": 5.679864947820749, + "grad_norm": 0.24330341815948486, + "learning_rate": 4.1467745971866216e-05, + "loss": 1.7617, + "step": 18505 + }, + { + "epoch": 5.680171884591774, + "grad_norm": 0.21072770655155182, + "learning_rate": 4.146284835955689e-05, + "loss": 1.6999, + "step": 18506 + }, + { + "epoch": 5.680478821362799, + "grad_norm": 0.1971336454153061, + "learning_rate": 4.145795083161673e-05, + "loss": 1.6756, + "step": 18507 + }, + { + "epoch": 5.680785758133824, + "grad_norm": 0.18576614558696747, + "learning_rate": 4.1453053388094073e-05, + "loss": 1.6885, + "step": 18508 + }, + { + "epoch": 5.68109269490485, + "grad_norm": 0.21335965394973755, + "learning_rate": 4.144815602903737e-05, + "loss": 1.7278, + "step": 18509 + }, + { + "epoch": 5.681399631675875, + "grad_norm": 0.21756233274936676, + "learning_rate": 4.1443258754494986e-05, + "loss": 1.7549, + "step": 18510 + }, + { + "epoch": 5.6817065684469, + "grad_norm": 0.2214142084121704, + "learning_rate": 4.143836156451536e-05, + "loss": 1.6654, + "step": 18511 + }, + { + "epoch": 5.682013505217925, + "grad_norm": 0.2230863869190216, + "learning_rate": 4.143346445914684e-05, + "loss": 1.7286, + "step": 18512 + }, + { + "epoch": 5.68232044198895, + "grad_norm": 0.2283746749162674, + "learning_rate": 4.142856743843787e-05, + "loss": 1.7652, + "step": 18513 + }, + { + "epoch": 5.6826273787599755, + "grad_norm": 0.20059749484062195, + "learning_rate": 4.142367050243679e-05, + "loss": 1.6854, + "step": 18514 + }, + { + "epoch": 5.682934315531001, + "grad_norm": 0.17887794971466064, + "learning_rate": 4.141877365119204e-05, + "loss": 1.6975, + "step": 18515 + }, + { + "epoch": 5.683241252302026, + "grad_norm": 0.21266087889671326, + "learning_rate": 4.141387688475199e-05, + "loss": 1.7361, + "step": 18516 + }, + { + "epoch": 5.683548189073051, + "grad_norm": 0.20075422525405884, + 
"learning_rate": 4.140898020316506e-05, + "loss": 1.7496, + "step": 18517 + }, + { + "epoch": 5.683855125844076, + "grad_norm": 0.21430443227291107, + "learning_rate": 4.140408360647963e-05, + "loss": 1.7481, + "step": 18518 + }, + { + "epoch": 5.684162062615101, + "grad_norm": 0.1951984018087387, + "learning_rate": 4.139918709474405e-05, + "loss": 1.713, + "step": 18519 + }, + { + "epoch": 5.684468999386127, + "grad_norm": 0.21636274456977844, + "learning_rate": 4.1394290668006764e-05, + "loss": 1.8169, + "step": 18520 + }, + { + "epoch": 5.684775936157152, + "grad_norm": 0.21003715693950653, + "learning_rate": 4.138939432631613e-05, + "loss": 1.7453, + "step": 18521 + }, + { + "epoch": 5.685082872928177, + "grad_norm": 0.23559699952602386, + "learning_rate": 4.138449806972057e-05, + "loss": 1.7534, + "step": 18522 + }, + { + "epoch": 5.685389809699202, + "grad_norm": 0.23322029411792755, + "learning_rate": 4.137960189826843e-05, + "loss": 1.7535, + "step": 18523 + }, + { + "epoch": 5.685696746470227, + "grad_norm": 0.1998462826013565, + "learning_rate": 4.137470581200813e-05, + "loss": 1.7025, + "step": 18524 + }, + { + "epoch": 5.686003683241252, + "grad_norm": 0.22321350872516632, + "learning_rate": 4.1369809810988025e-05, + "loss": 1.7666, + "step": 18525 + }, + { + "epoch": 5.686310620012278, + "grad_norm": 0.20851604640483856, + "learning_rate": 4.136491389525653e-05, + "loss": 1.6958, + "step": 18526 + }, + { + "epoch": 5.686617556783302, + "grad_norm": 0.21494868397712708, + "learning_rate": 4.136001806486201e-05, + "loss": 1.7703, + "step": 18527 + }, + { + "epoch": 5.6869244935543275, + "grad_norm": 0.19872798025608063, + "learning_rate": 4.135512231985287e-05, + "loss": 1.7451, + "step": 18528 + }, + { + "epoch": 5.687231430325353, + "grad_norm": 0.2424371987581253, + "learning_rate": 4.1350226660277456e-05, + "loss": 1.8153, + "step": 18529 + }, + { + "epoch": 5.687538367096378, + "grad_norm": 0.20388297736644745, + "learning_rate": 
4.1345331086184196e-05, + "loss": 1.6882, + "step": 18530 + }, + { + "epoch": 5.6878453038674035, + "grad_norm": 0.22662605345249176, + "learning_rate": 4.134043559762143e-05, + "loss": 1.7532, + "step": 18531 + }, + { + "epoch": 5.688152240638429, + "grad_norm": 0.2281452864408493, + "learning_rate": 4.133554019463756e-05, + "loss": 1.769, + "step": 18532 + }, + { + "epoch": 5.688459177409453, + "grad_norm": 0.2303505390882492, + "learning_rate": 4.1330644877280955e-05, + "loss": 1.7176, + "step": 18533 + }, + { + "epoch": 5.688766114180479, + "grad_norm": 0.24411743879318237, + "learning_rate": 4.132574964560001e-05, + "loss": 1.7557, + "step": 18534 + }, + { + "epoch": 5.689073050951504, + "grad_norm": 0.2674088776111603, + "learning_rate": 4.13208544996431e-05, + "loss": 1.6997, + "step": 18535 + }, + { + "epoch": 5.689379987722529, + "grad_norm": 0.22232958674430847, + "learning_rate": 4.1315959439458565e-05, + "loss": 1.7731, + "step": 18536 + }, + { + "epoch": 5.689686924493555, + "grad_norm": 0.23894453048706055, + "learning_rate": 4.131106446509483e-05, + "loss": 1.7454, + "step": 18537 + }, + { + "epoch": 5.689993861264579, + "grad_norm": 0.19710026681423187, + "learning_rate": 4.1306169576600226e-05, + "loss": 1.6872, + "step": 18538 + }, + { + "epoch": 5.690300798035604, + "grad_norm": 0.1879546344280243, + "learning_rate": 4.130127477402318e-05, + "loss": 1.6929, + "step": 18539 + }, + { + "epoch": 5.69060773480663, + "grad_norm": 0.1964653730392456, + "learning_rate": 4.129638005741201e-05, + "loss": 1.7778, + "step": 18540 + }, + { + "epoch": 5.690914671577655, + "grad_norm": 0.20161493122577667, + "learning_rate": 4.129148542681513e-05, + "loss": 1.7388, + "step": 18541 + }, + { + "epoch": 5.69122160834868, + "grad_norm": 0.26742830872535706, + "learning_rate": 4.1286590882280886e-05, + "loss": 1.7472, + "step": 18542 + }, + { + "epoch": 5.691528545119706, + "grad_norm": 0.2613312900066376, + "learning_rate": 4.128169642385766e-05, + "loss": 1.7656, 
+ "step": 18543 + }, + { + "epoch": 5.69183548189073, + "grad_norm": 0.17979474365711212, + "learning_rate": 4.127680205159381e-05, + "loss": 1.6992, + "step": 18544 + }, + { + "epoch": 5.6921424186617555, + "grad_norm": 0.23575037717819214, + "learning_rate": 4.1271907765537745e-05, + "loss": 1.7399, + "step": 18545 + }, + { + "epoch": 5.692449355432781, + "grad_norm": 0.19461458921432495, + "learning_rate": 4.126701356573777e-05, + "loss": 1.709, + "step": 18546 + }, + { + "epoch": 5.692756292203806, + "grad_norm": 0.19715365767478943, + "learning_rate": 4.1262119452242306e-05, + "loss": 1.7634, + "step": 18547 + }, + { + "epoch": 5.6930632289748315, + "grad_norm": 0.21454904973506927, + "learning_rate": 4.125722542509969e-05, + "loss": 1.7663, + "step": 18548 + }, + { + "epoch": 5.693370165745856, + "grad_norm": 0.19884896278381348, + "learning_rate": 4.12523314843583e-05, + "loss": 1.7618, + "step": 18549 + }, + { + "epoch": 5.693677102516881, + "grad_norm": 0.2080020159482956, + "learning_rate": 4.124743763006648e-05, + "loss": 1.7379, + "step": 18550 + }, + { + "epoch": 5.693984039287907, + "grad_norm": 0.18780875205993652, + "learning_rate": 4.124254386227264e-05, + "loss": 1.7036, + "step": 18551 + }, + { + "epoch": 5.694290976058932, + "grad_norm": 0.2114439308643341, + "learning_rate": 4.123765018102512e-05, + "loss": 1.6873, + "step": 18552 + }, + { + "epoch": 5.694597912829957, + "grad_norm": 0.1712789535522461, + "learning_rate": 4.123275658637225e-05, + "loss": 1.6772, + "step": 18553 + }, + { + "epoch": 5.694904849600983, + "grad_norm": 0.2435859888792038, + "learning_rate": 4.122786307836243e-05, + "loss": 1.7946, + "step": 18554 + }, + { + "epoch": 5.695211786372007, + "grad_norm": 0.20587889850139618, + "learning_rate": 4.122296965704399e-05, + "loss": 1.7459, + "step": 18555 + }, + { + "epoch": 5.695518723143032, + "grad_norm": 0.2183443009853363, + "learning_rate": 4.121807632246534e-05, + "loss": 1.7036, + "step": 18556 + }, + { + "epoch": 
5.695825659914058, + "grad_norm": 0.19276869297027588, + "learning_rate": 4.121318307467478e-05, + "loss": 1.7371, + "step": 18557 + }, + { + "epoch": 5.696132596685083, + "grad_norm": 0.19815512001514435, + "learning_rate": 4.120828991372072e-05, + "loss": 1.7038, + "step": 18558 + }, + { + "epoch": 5.696439533456108, + "grad_norm": 0.18509675562381744, + "learning_rate": 4.120339683965146e-05, + "loss": 1.6936, + "step": 18559 + }, + { + "epoch": 5.696746470227133, + "grad_norm": 0.2296193689107895, + "learning_rate": 4.1198503852515416e-05, + "loss": 1.7626, + "step": 18560 + }, + { + "epoch": 5.697053406998158, + "grad_norm": 0.2064799964427948, + "learning_rate": 4.11936109523609e-05, + "loss": 1.7387, + "step": 18561 + }, + { + "epoch": 5.6973603437691835, + "grad_norm": 0.20171360671520233, + "learning_rate": 4.1188718139236296e-05, + "loss": 1.7372, + "step": 18562 + }, + { + "epoch": 5.697667280540209, + "grad_norm": 0.19421936571598053, + "learning_rate": 4.118382541318993e-05, + "loss": 1.7187, + "step": 18563 + }, + { + "epoch": 5.697974217311234, + "grad_norm": 0.22517532110214233, + "learning_rate": 4.117893277427018e-05, + "loss": 1.7503, + "step": 18564 + }, + { + "epoch": 5.6982811540822595, + "grad_norm": 0.2293393909931183, + "learning_rate": 4.1174040222525366e-05, + "loss": 1.7174, + "step": 18565 + }, + { + "epoch": 5.698588090853284, + "grad_norm": 0.24003073573112488, + "learning_rate": 4.1169147758003876e-05, + "loss": 1.7829, + "step": 18566 + }, + { + "epoch": 5.698895027624309, + "grad_norm": 0.21476133167743683, + "learning_rate": 4.1164255380754034e-05, + "loss": 1.7906, + "step": 18567 + }, + { + "epoch": 5.699201964395335, + "grad_norm": 0.21347576379776, + "learning_rate": 4.115936309082422e-05, + "loss": 1.6986, + "step": 18568 + }, + { + "epoch": 5.69950890116636, + "grad_norm": 0.22650402784347534, + "learning_rate": 4.115447088826276e-05, + "loss": 1.7949, + "step": 18569 + }, + { + "epoch": 5.699815837937384, + "grad_norm": 
0.25815197825431824, + "learning_rate": 4.114957877311799e-05, + "loss": 1.7499, + "step": 18570 + }, + { + "epoch": 5.70012277470841, + "grad_norm": 0.22644442319869995, + "learning_rate": 4.1144686745438265e-05, + "loss": 1.7689, + "step": 18571 + }, + { + "epoch": 5.700429711479435, + "grad_norm": 0.241188645362854, + "learning_rate": 4.113979480527194e-05, + "loss": 1.7341, + "step": 18572 + }, + { + "epoch": 5.7007366482504604, + "grad_norm": 0.20984862744808197, + "learning_rate": 4.1134902952667365e-05, + "loss": 1.7091, + "step": 18573 + }, + { + "epoch": 5.701043585021486, + "grad_norm": 0.25150877237319946, + "learning_rate": 4.113001118767286e-05, + "loss": 1.723, + "step": 18574 + }, + { + "epoch": 5.701350521792511, + "grad_norm": 0.21693028509616852, + "learning_rate": 4.1125119510336804e-05, + "loss": 1.7483, + "step": 18575 + }, + { + "epoch": 5.701657458563536, + "grad_norm": 0.2620212733745575, + "learning_rate": 4.11202279207075e-05, + "loss": 1.8159, + "step": 18576 + }, + { + "epoch": 5.701964395334561, + "grad_norm": 0.18722239136695862, + "learning_rate": 4.111533641883332e-05, + "loss": 1.7197, + "step": 18577 + }, + { + "epoch": 5.702271332105586, + "grad_norm": 0.21321091055870056, + "learning_rate": 4.111044500476258e-05, + "loss": 1.7408, + "step": 18578 + }, + { + "epoch": 5.702578268876612, + "grad_norm": 0.24459265172481537, + "learning_rate": 4.110555367854365e-05, + "loss": 1.8304, + "step": 18579 + }, + { + "epoch": 5.702885205647637, + "grad_norm": 0.24987100064754486, + "learning_rate": 4.110066244022483e-05, + "loss": 1.7051, + "step": 18580 + }, + { + "epoch": 5.703192142418661, + "grad_norm": 0.19059090316295624, + "learning_rate": 4.1095771289854506e-05, + "loss": 1.7489, + "step": 18581 + }, + { + "epoch": 5.703499079189687, + "grad_norm": 0.23020480573177338, + "learning_rate": 4.1090880227480966e-05, + "loss": 1.7101, + "step": 18582 + }, + { + "epoch": 5.703806015960712, + "grad_norm": 0.18733634054660797, + 
"learning_rate": 4.108598925315258e-05, + "loss": 1.7116, + "step": 18583 + }, + { + "epoch": 5.704112952731737, + "grad_norm": 0.1959095001220703, + "learning_rate": 4.108109836691766e-05, + "loss": 1.7283, + "step": 18584 + }, + { + "epoch": 5.704419889502763, + "grad_norm": 0.22685091197490692, + "learning_rate": 4.107620756882457e-05, + "loss": 1.7588, + "step": 18585 + }, + { + "epoch": 5.704726826273788, + "grad_norm": 0.1998603790998459, + "learning_rate": 4.107131685892164e-05, + "loss": 1.7071, + "step": 18586 + }, + { + "epoch": 5.7050337630448125, + "grad_norm": 0.2018733024597168, + "learning_rate": 4.106642623725717e-05, + "loss": 1.6782, + "step": 18587 + }, + { + "epoch": 5.705340699815838, + "grad_norm": 0.21826615929603577, + "learning_rate": 4.106153570387951e-05, + "loss": 1.736, + "step": 18588 + }, + { + "epoch": 5.705647636586863, + "grad_norm": 0.20197603106498718, + "learning_rate": 4.105664525883699e-05, + "loss": 1.6921, + "step": 18589 + }, + { + "epoch": 5.7059545733578885, + "grad_norm": 0.20943905413150787, + "learning_rate": 4.105175490217796e-05, + "loss": 1.665, + "step": 18590 + }, + { + "epoch": 5.706261510128914, + "grad_norm": 0.202060267329216, + "learning_rate": 4.104686463395071e-05, + "loss": 1.714, + "step": 18591 + }, + { + "epoch": 5.706568446899938, + "grad_norm": 0.220698744058609, + "learning_rate": 4.1041974454203623e-05, + "loss": 1.8076, + "step": 18592 + }, + { + "epoch": 5.706875383670964, + "grad_norm": 0.21536946296691895, + "learning_rate": 4.103708436298497e-05, + "loss": 1.6801, + "step": 18593 + }, + { + "epoch": 5.707182320441989, + "grad_norm": 0.21442468464374542, + "learning_rate": 4.103219436034311e-05, + "loss": 1.6921, + "step": 18594 + }, + { + "epoch": 5.707489257213014, + "grad_norm": 0.2047559767961502, + "learning_rate": 4.1027304446326356e-05, + "loss": 1.7861, + "step": 18595 + }, + { + "epoch": 5.70779619398404, + "grad_norm": 0.20304669439792633, + "learning_rate": 4.102241462098305e-05, + 
"loss": 1.7751, + "step": 18596 + }, + { + "epoch": 5.708103130755065, + "grad_norm": 0.18702620267868042, + "learning_rate": 4.101752488436149e-05, + "loss": 1.6951, + "step": 18597 + }, + { + "epoch": 5.708410067526089, + "grad_norm": 0.1821923404932022, + "learning_rate": 4.1012635236510034e-05, + "loss": 1.711, + "step": 18598 + }, + { + "epoch": 5.708717004297115, + "grad_norm": 0.19422096014022827, + "learning_rate": 4.100774567747696e-05, + "loss": 1.7202, + "step": 18599 + }, + { + "epoch": 5.70902394106814, + "grad_norm": 0.20800530910491943, + "learning_rate": 4.100285620731063e-05, + "loss": 1.7403, + "step": 18600 + }, + { + "epoch": 5.709330877839165, + "grad_norm": 0.221746027469635, + "learning_rate": 4.099796682605934e-05, + "loss": 1.7769, + "step": 18601 + }, + { + "epoch": 5.70963781461019, + "grad_norm": 0.19284313917160034, + "learning_rate": 4.099307753377143e-05, + "loss": 1.692, + "step": 18602 + }, + { + "epoch": 5.709944751381215, + "grad_norm": 0.17635129392147064, + "learning_rate": 4.0988188330495216e-05, + "loss": 1.7212, + "step": 18603 + }, + { + "epoch": 5.7102516881522405, + "grad_norm": 0.17728061974048615, + "learning_rate": 4.098329921627898e-05, + "loss": 1.7217, + "step": 18604 + }, + { + "epoch": 5.710558624923266, + "grad_norm": 0.19998152554035187, + "learning_rate": 4.097841019117108e-05, + "loss": 1.7583, + "step": 18605 + }, + { + "epoch": 5.710865561694291, + "grad_norm": 0.18840095400810242, + "learning_rate": 4.09735212552198e-05, + "loss": 1.7353, + "step": 18606 + }, + { + "epoch": 5.7111724984653165, + "grad_norm": 0.2528367042541504, + "learning_rate": 4.09686324084735e-05, + "loss": 1.7576, + "step": 18607 + }, + { + "epoch": 5.711479435236341, + "grad_norm": 0.27240338921546936, + "learning_rate": 4.096374365098045e-05, + "loss": 1.7303, + "step": 18608 + }, + { + "epoch": 5.711786372007366, + "grad_norm": 0.20187151432037354, + "learning_rate": 4.0958854982789e-05, + "loss": 1.7599, + "step": 18609 + }, + { + 
"epoch": 5.712093308778392, + "grad_norm": 0.24890528619289398, + "learning_rate": 4.095396640394742e-05, + "loss": 1.7737, + "step": 18610 + }, + { + "epoch": 5.712400245549417, + "grad_norm": 0.21524454653263092, + "learning_rate": 4.094907791450406e-05, + "loss": 1.7704, + "step": 18611 + }, + { + "epoch": 5.712707182320442, + "grad_norm": 0.20070379972457886, + "learning_rate": 4.094418951450721e-05, + "loss": 1.7358, + "step": 18612 + }, + { + "epoch": 5.713014119091467, + "grad_norm": 0.2252196967601776, + "learning_rate": 4.09393012040052e-05, + "loss": 1.7262, + "step": 18613 + }, + { + "epoch": 5.713321055862492, + "grad_norm": 0.19511987268924713, + "learning_rate": 4.093441298304631e-05, + "loss": 1.7146, + "step": 18614 + }, + { + "epoch": 5.713627992633517, + "grad_norm": 0.2047072798013687, + "learning_rate": 4.092952485167888e-05, + "loss": 1.7864, + "step": 18615 + }, + { + "epoch": 5.713934929404543, + "grad_norm": 0.21794871985912323, + "learning_rate": 4.092463680995119e-05, + "loss": 1.7759, + "step": 18616 + }, + { + "epoch": 5.714241866175568, + "grad_norm": 0.23863841593265533, + "learning_rate": 4.0919748857911566e-05, + "loss": 1.7207, + "step": 18617 + }, + { + "epoch": 5.714548802946593, + "grad_norm": 0.19706958532333374, + "learning_rate": 4.09148609956083e-05, + "loss": 1.7247, + "step": 18618 + }, + { + "epoch": 5.714855739717618, + "grad_norm": 0.23663771152496338, + "learning_rate": 4.090997322308971e-05, + "loss": 1.7929, + "step": 18619 + }, + { + "epoch": 5.715162676488643, + "grad_norm": 0.23079079389572144, + "learning_rate": 4.09050855404041e-05, + "loss": 1.763, + "step": 18620 + }, + { + "epoch": 5.7154696132596685, + "grad_norm": 0.23883379995822906, + "learning_rate": 4.0900197947599736e-05, + "loss": 1.7995, + "step": 18621 + }, + { + "epoch": 5.715776550030694, + "grad_norm": 0.2125123143196106, + "learning_rate": 4.0895310444724974e-05, + "loss": 1.8045, + "step": 18622 + }, + { + "epoch": 5.716083486801719, + 
"grad_norm": 0.21062424778938293, + "learning_rate": 4.0890423031828076e-05, + "loss": 1.7348, + "step": 18623 + }, + { + "epoch": 5.716390423572744, + "grad_norm": 0.24079614877700806, + "learning_rate": 4.088553570895737e-05, + "loss": 1.7462, + "step": 18624 + }, + { + "epoch": 5.716697360343769, + "grad_norm": 0.2120666354894638, + "learning_rate": 4.088064847616113e-05, + "loss": 1.7235, + "step": 18625 + }, + { + "epoch": 5.717004297114794, + "grad_norm": 0.19663050770759583, + "learning_rate": 4.0875761333487685e-05, + "loss": 1.6743, + "step": 18626 + }, + { + "epoch": 5.71731123388582, + "grad_norm": 0.24010685086250305, + "learning_rate": 4.0870874280985295e-05, + "loss": 1.6742, + "step": 18627 + }, + { + "epoch": 5.717618170656845, + "grad_norm": 0.22140294313430786, + "learning_rate": 4.086598731870228e-05, + "loss": 1.7601, + "step": 18628 + }, + { + "epoch": 5.71792510742787, + "grad_norm": 0.2876693308353424, + "learning_rate": 4.086110044668694e-05, + "loss": 1.7601, + "step": 18629 + }, + { + "epoch": 5.718232044198895, + "grad_norm": 0.3103853464126587, + "learning_rate": 4.085621366498756e-05, + "loss": 1.6824, + "step": 18630 + }, + { + "epoch": 5.71853898096992, + "grad_norm": 0.18194396793842316, + "learning_rate": 4.0851326973652424e-05, + "loss": 1.6976, + "step": 18631 + }, + { + "epoch": 5.718845917740945, + "grad_norm": 0.28400903940200806, + "learning_rate": 4.0846440372729854e-05, + "loss": 1.7352, + "step": 18632 + }, + { + "epoch": 5.719152854511971, + "grad_norm": 0.23753583431243896, + "learning_rate": 4.084155386226811e-05, + "loss": 1.7418, + "step": 18633 + }, + { + "epoch": 5.719459791282996, + "grad_norm": 0.215620756149292, + "learning_rate": 4.0836667442315514e-05, + "loss": 1.7602, + "step": 18634 + }, + { + "epoch": 5.7197667280540205, + "grad_norm": 0.21057941019535065, + "learning_rate": 4.083178111292034e-05, + "loss": 1.6818, + "step": 18635 + }, + { + "epoch": 5.720073664825046, + "grad_norm": 0.2169445902109146, + 
"learning_rate": 4.0826894874130863e-05, + "loss": 1.7942, + "step": 18636 + }, + { + "epoch": 5.720380601596071, + "grad_norm": 0.2779453992843628, + "learning_rate": 4.082200872599541e-05, + "loss": 1.7432, + "step": 18637 + }, + { + "epoch": 5.7206875383670965, + "grad_norm": 0.22556698322296143, + "learning_rate": 4.0817122668562224e-05, + "loss": 1.7748, + "step": 18638 + }, + { + "epoch": 5.720994475138122, + "grad_norm": 0.2570365071296692, + "learning_rate": 4.081223670187962e-05, + "loss": 1.7314, + "step": 18639 + }, + { + "epoch": 5.721301411909147, + "grad_norm": 0.266176700592041, + "learning_rate": 4.080735082599588e-05, + "loss": 1.689, + "step": 18640 + }, + { + "epoch": 5.721608348680172, + "grad_norm": 0.20190037786960602, + "learning_rate": 4.080246504095929e-05, + "loss": 1.7467, + "step": 18641 + }, + { + "epoch": 5.721915285451197, + "grad_norm": 0.2498215138912201, + "learning_rate": 4.079757934681813e-05, + "loss": 1.7063, + "step": 18642 + }, + { + "epoch": 5.722222222222222, + "grad_norm": 0.25594204664230347, + "learning_rate": 4.0792693743620695e-05, + "loss": 1.7096, + "step": 18643 + }, + { + "epoch": 5.722529158993248, + "grad_norm": 0.22674626111984253, + "learning_rate": 4.0787808231415233e-05, + "loss": 1.715, + "step": 18644 + }, + { + "epoch": 5.722836095764272, + "grad_norm": 0.267140656709671, + "learning_rate": 4.078292281025007e-05, + "loss": 1.7747, + "step": 18645 + }, + { + "epoch": 5.723143032535297, + "grad_norm": 0.21161147952079773, + "learning_rate": 4.077803748017345e-05, + "loss": 1.7312, + "step": 18646 + }, + { + "epoch": 5.723449969306323, + "grad_norm": 0.2580260634422302, + "learning_rate": 4.077315224123368e-05, + "loss": 1.7246, + "step": 18647 + }, + { + "epoch": 5.723756906077348, + "grad_norm": 0.23766927421092987, + "learning_rate": 4.076826709347902e-05, + "loss": 1.7147, + "step": 18648 + }, + { + "epoch": 5.724063842848373, + "grad_norm": 0.22764286398887634, + "learning_rate": 4.076338203695776e-05, + 
"loss": 1.7034, + "step": 18649 + }, + { + "epoch": 5.724370779619399, + "grad_norm": 0.28205159306526184, + "learning_rate": 4.075849707171817e-05, + "loss": 1.7472, + "step": 18650 + }, + { + "epoch": 5.724677716390423, + "grad_norm": 0.2091183066368103, + "learning_rate": 4.075361219780854e-05, + "loss": 1.7693, + "step": 18651 + }, + { + "epoch": 5.7249846531614486, + "grad_norm": 0.29513829946517944, + "learning_rate": 4.074872741527713e-05, + "loss": 1.7286, + "step": 18652 + }, + { + "epoch": 5.725291589932474, + "grad_norm": 0.226357102394104, + "learning_rate": 4.07438427241722e-05, + "loss": 1.7658, + "step": 18653 + }, + { + "epoch": 5.725598526703499, + "grad_norm": 0.23732580244541168, + "learning_rate": 4.073895812454207e-05, + "loss": 1.7591, + "step": 18654 + }, + { + "epoch": 5.725905463474525, + "grad_norm": 0.2835488021373749, + "learning_rate": 4.0734073616434956e-05, + "loss": 1.757, + "step": 18655 + }, + { + "epoch": 5.726212400245549, + "grad_norm": 0.1986306756734848, + "learning_rate": 4.0729189199899186e-05, + "loss": 1.714, + "step": 18656 + }, + { + "epoch": 5.726519337016574, + "grad_norm": 0.25071820616722107, + "learning_rate": 4.072430487498298e-05, + "loss": 1.7334, + "step": 18657 + }, + { + "epoch": 5.7268262737876, + "grad_norm": 0.19989889860153198, + "learning_rate": 4.0719420641734634e-05, + "loss": 1.7472, + "step": 18658 + }, + { + "epoch": 5.727133210558625, + "grad_norm": 0.30006101727485657, + "learning_rate": 4.071453650020241e-05, + "loss": 1.7846, + "step": 18659 + }, + { + "epoch": 5.72744014732965, + "grad_norm": 0.19856922328472137, + "learning_rate": 4.070965245043459e-05, + "loss": 1.6965, + "step": 18660 + }, + { + "epoch": 5.727747084100676, + "grad_norm": 0.20139823853969574, + "learning_rate": 4.070476849247941e-05, + "loss": 1.7265, + "step": 18661 + }, + { + "epoch": 5.7280540208717, + "grad_norm": 0.21507953107357025, + "learning_rate": 4.0699884626385184e-05, + "loss": 1.762, + "step": 18662 + }, + { + 
"epoch": 5.7283609576427255, + "grad_norm": 0.1885843127965927, + "learning_rate": 4.069500085220013e-05, + "loss": 1.6721, + "step": 18663 + }, + { + "epoch": 5.728667894413751, + "grad_norm": 0.2076897919178009, + "learning_rate": 4.069011716997253e-05, + "loss": 1.7399, + "step": 18664 + }, + { + "epoch": 5.728974831184776, + "grad_norm": 0.21482045948505402, + "learning_rate": 4.068523357975065e-05, + "loss": 1.7105, + "step": 18665 + }, + { + "epoch": 5.7292817679558015, + "grad_norm": 0.20438800752162933, + "learning_rate": 4.0680350081582765e-05, + "loss": 1.7408, + "step": 18666 + }, + { + "epoch": 5.729588704726826, + "grad_norm": 0.2137845903635025, + "learning_rate": 4.0675466675517104e-05, + "loss": 1.7814, + "step": 18667 + }, + { + "epoch": 5.729895641497851, + "grad_norm": 0.23009657859802246, + "learning_rate": 4.067058336160197e-05, + "loss": 1.7311, + "step": 18668 + }, + { + "epoch": 5.730202578268877, + "grad_norm": 0.20602397620677948, + "learning_rate": 4.066570013988558e-05, + "loss": 1.741, + "step": 18669 + }, + { + "epoch": 5.730509515039902, + "grad_norm": 0.24884814023971558, + "learning_rate": 4.066081701041621e-05, + "loss": 1.7222, + "step": 18670 + }, + { + "epoch": 5.730816451810927, + "grad_norm": 0.17906342446804047, + "learning_rate": 4.065593397324214e-05, + "loss": 1.6879, + "step": 18671 + }, + { + "epoch": 5.731123388581953, + "grad_norm": 0.20345427095890045, + "learning_rate": 4.0651051028411586e-05, + "loss": 1.7713, + "step": 18672 + }, + { + "epoch": 5.731430325352977, + "grad_norm": 0.21115002036094666, + "learning_rate": 4.0646168175972846e-05, + "loss": 1.7666, + "step": 18673 + }, + { + "epoch": 5.731737262124002, + "grad_norm": 0.22189734876155853, + "learning_rate": 4.064128541597413e-05, + "loss": 1.6989, + "step": 18674 + }, + { + "epoch": 5.732044198895028, + "grad_norm": 0.24036027491092682, + "learning_rate": 4.063640274846373e-05, + "loss": 1.707, + "step": 18675 + }, + { + "epoch": 5.732351135666053, + 
"grad_norm": 0.23091022670269012, + "learning_rate": 4.063152017348988e-05, + "loss": 1.7072, + "step": 18676 + }, + { + "epoch": 5.7326580724370775, + "grad_norm": 0.3142668306827545, + "learning_rate": 4.062663769110085e-05, + "loss": 1.7641, + "step": 18677 + }, + { + "epoch": 5.732965009208103, + "grad_norm": 0.2634848356246948, + "learning_rate": 4.0621755301344875e-05, + "loss": 1.7007, + "step": 18678 + }, + { + "epoch": 5.733271945979128, + "grad_norm": 0.21296904981136322, + "learning_rate": 4.061687300427022e-05, + "loss": 1.7201, + "step": 18679 + }, + { + "epoch": 5.7335788827501535, + "grad_norm": 0.24943144619464874, + "learning_rate": 4.0611990799925104e-05, + "loss": 1.7186, + "step": 18680 + }, + { + "epoch": 5.733885819521179, + "grad_norm": 0.2574152946472168, + "learning_rate": 4.060710868835781e-05, + "loss": 1.8671, + "step": 18681 + }, + { + "epoch": 5.734192756292204, + "grad_norm": 0.26023826003074646, + "learning_rate": 4.0602226669616564e-05, + "loss": 1.7618, + "step": 18682 + }, + { + "epoch": 5.734499693063229, + "grad_norm": 0.21078336238861084, + "learning_rate": 4.0597344743749645e-05, + "loss": 1.7548, + "step": 18683 + }, + { + "epoch": 5.734806629834254, + "grad_norm": 0.2195056676864624, + "learning_rate": 4.059246291080525e-05, + "loss": 1.6843, + "step": 18684 + }, + { + "epoch": 5.735113566605279, + "grad_norm": 0.20719893276691437, + "learning_rate": 4.058758117083168e-05, + "loss": 1.692, + "step": 18685 + }, + { + "epoch": 5.735420503376305, + "grad_norm": 0.23012077808380127, + "learning_rate": 4.058269952387713e-05, + "loss": 1.7072, + "step": 18686 + }, + { + "epoch": 5.73572744014733, + "grad_norm": 0.18598411977291107, + "learning_rate": 4.057781796998986e-05, + "loss": 1.6983, + "step": 18687 + }, + { + "epoch": 5.736034376918354, + "grad_norm": 0.20211926102638245, + "learning_rate": 4.057293650921813e-05, + "loss": 1.6818, + "step": 18688 + }, + { + "epoch": 5.73634131368938, + "grad_norm": 0.1957080215215683, + 
"learning_rate": 4.056805514161015e-05, + "loss": 1.7154, + "step": 18689 + }, + { + "epoch": 5.736648250460405, + "grad_norm": 0.23581798374652863, + "learning_rate": 4.0563173867214196e-05, + "loss": 1.7724, + "step": 18690 + }, + { + "epoch": 5.73695518723143, + "grad_norm": 0.22706671059131622, + "learning_rate": 4.055829268607847e-05, + "loss": 1.7387, + "step": 18691 + }, + { + "epoch": 5.737262124002456, + "grad_norm": 0.20050427317619324, + "learning_rate": 4.055341159825124e-05, + "loss": 1.7585, + "step": 18692 + }, + { + "epoch": 5.737569060773481, + "grad_norm": 0.18666231632232666, + "learning_rate": 4.054853060378072e-05, + "loss": 1.6996, + "step": 18693 + }, + { + "epoch": 5.7378759975445055, + "grad_norm": 0.23018911480903625, + "learning_rate": 4.0543649702715186e-05, + "loss": 1.7167, + "step": 18694 + }, + { + "epoch": 5.738182934315531, + "grad_norm": 0.21207039058208466, + "learning_rate": 4.053876889510282e-05, + "loss": 1.7539, + "step": 18695 + }, + { + "epoch": 5.738489871086556, + "grad_norm": 0.22042523324489594, + "learning_rate": 4.0533888180991915e-05, + "loss": 1.8145, + "step": 18696 + }, + { + "epoch": 5.7387968078575815, + "grad_norm": 0.20705139636993408, + "learning_rate": 4.0529007560430646e-05, + "loss": 1.7612, + "step": 18697 + }, + { + "epoch": 5.739103744628607, + "grad_norm": 0.20673857629299164, + "learning_rate": 4.052412703346729e-05, + "loss": 1.7338, + "step": 18698 + }, + { + "epoch": 5.739410681399631, + "grad_norm": 0.20742641389369965, + "learning_rate": 4.051924660015005e-05, + "loss": 1.7497, + "step": 18699 + }, + { + "epoch": 5.739717618170657, + "grad_norm": 0.22352617979049683, + "learning_rate": 4.05143662605272e-05, + "loss": 1.7568, + "step": 18700 + }, + { + "epoch": 5.740024554941682, + "grad_norm": 0.20306691527366638, + "learning_rate": 4.050948601464692e-05, + "loss": 1.7416, + "step": 18701 + }, + { + "epoch": 5.740331491712707, + "grad_norm": 0.22972522675991058, + "learning_rate": 
4.050460586255748e-05, + "loss": 1.7907, + "step": 18702 + }, + { + "epoch": 5.740638428483733, + "grad_norm": 0.2056068629026413, + "learning_rate": 4.0499725804307084e-05, + "loss": 1.7584, + "step": 18703 + }, + { + "epoch": 5.740945365254758, + "grad_norm": 0.2150508463382721, + "learning_rate": 4.049484583994395e-05, + "loss": 1.7695, + "step": 18704 + }, + { + "epoch": 5.741252302025782, + "grad_norm": 0.20274797081947327, + "learning_rate": 4.048996596951634e-05, + "loss": 1.7398, + "step": 18705 + }, + { + "epoch": 5.741559238796808, + "grad_norm": 0.20521290600299835, + "learning_rate": 4.0485086193072444e-05, + "loss": 1.7529, + "step": 18706 + }, + { + "epoch": 5.741866175567833, + "grad_norm": 0.22344307601451874, + "learning_rate": 4.0480206510660527e-05, + "loss": 1.6729, + "step": 18707 + }, + { + "epoch": 5.742173112338858, + "grad_norm": 0.20007841289043427, + "learning_rate": 4.047532692232876e-05, + "loss": 1.7004, + "step": 18708 + }, + { + "epoch": 5.742480049109884, + "grad_norm": 0.2455853819847107, + "learning_rate": 4.047044742812541e-05, + "loss": 1.7324, + "step": 18709 + }, + { + "epoch": 5.742786985880908, + "grad_norm": 0.29901546239852905, + "learning_rate": 4.046556802809867e-05, + "loss": 1.7138, + "step": 18710 + }, + { + "epoch": 5.7430939226519335, + "grad_norm": 0.19636842608451843, + "learning_rate": 4.04606887222968e-05, + "loss": 1.7098, + "step": 18711 + }, + { + "epoch": 5.743400859422959, + "grad_norm": 0.24916070699691772, + "learning_rate": 4.045580951076797e-05, + "loss": 1.7073, + "step": 18712 + }, + { + "epoch": 5.743707796193984, + "grad_norm": 0.2122841477394104, + "learning_rate": 4.0450930393560453e-05, + "loss": 1.7608, + "step": 18713 + }, + { + "epoch": 5.7440147329650095, + "grad_norm": 0.25119176506996155, + "learning_rate": 4.044605137072241e-05, + "loss": 1.7528, + "step": 18714 + }, + { + "epoch": 5.744321669736035, + "grad_norm": 0.2128097116947174, + "learning_rate": 4.0441172442302104e-05, + "loss": 
1.6834, + "step": 18715 + }, + { + "epoch": 5.744628606507059, + "grad_norm": 0.1771443784236908, + "learning_rate": 4.043629360834772e-05, + "loss": 1.6699, + "step": 18716 + }, + { + "epoch": 5.744935543278085, + "grad_norm": 0.2360549122095108, + "learning_rate": 4.043141486890751e-05, + "loss": 1.7704, + "step": 18717 + }, + { + "epoch": 5.74524248004911, + "grad_norm": 0.22453519701957703, + "learning_rate": 4.0426536224029645e-05, + "loss": 1.7305, + "step": 18718 + }, + { + "epoch": 5.745549416820135, + "grad_norm": 0.2170165628194809, + "learning_rate": 4.042165767376238e-05, + "loss": 1.7859, + "step": 18719 + }, + { + "epoch": 5.74585635359116, + "grad_norm": 0.233921617269516, + "learning_rate": 4.0416779218153896e-05, + "loss": 1.7622, + "step": 18720 + }, + { + "epoch": 5.746163290362185, + "grad_norm": 0.2698482871055603, + "learning_rate": 4.041190085725242e-05, + "loss": 1.7419, + "step": 18721 + }, + { + "epoch": 5.74647022713321, + "grad_norm": 0.28437280654907227, + "learning_rate": 4.0407022591106165e-05, + "loss": 1.7242, + "step": 18722 + }, + { + "epoch": 5.746777163904236, + "grad_norm": 0.2087356448173523, + "learning_rate": 4.040214441976332e-05, + "loss": 1.747, + "step": 18723 + }, + { + "epoch": 5.747084100675261, + "grad_norm": 0.2028181403875351, + "learning_rate": 4.039726634327213e-05, + "loss": 1.7843, + "step": 18724 + }, + { + "epoch": 5.747391037446286, + "grad_norm": 0.18513897061347961, + "learning_rate": 4.039238836168076e-05, + "loss": 1.692, + "step": 18725 + }, + { + "epoch": 5.747697974217311, + "grad_norm": 0.2308989316225052, + "learning_rate": 4.038751047503745e-05, + "loss": 1.6625, + "step": 18726 + }, + { + "epoch": 5.748004910988336, + "grad_norm": 0.23922030627727509, + "learning_rate": 4.0382632683390386e-05, + "loss": 1.7407, + "step": 18727 + }, + { + "epoch": 5.7483118477593615, + "grad_norm": 0.17225340008735657, + "learning_rate": 4.0377754986787806e-05, + "loss": 1.6888, + "step": 18728 + }, + { + "epoch": 
5.748618784530387, + "grad_norm": 0.1898551732301712, + "learning_rate": 4.037287738527786e-05, + "loss": 1.6931, + "step": 18729 + }, + { + "epoch": 5.748925721301412, + "grad_norm": 0.22900012135505676, + "learning_rate": 4.036799987890881e-05, + "loss": 1.751, + "step": 18730 + }, + { + "epoch": 5.749232658072437, + "grad_norm": 0.21106193959712982, + "learning_rate": 4.0363122467728815e-05, + "loss": 1.6919, + "step": 18731 + }, + { + "epoch": 5.749539594843462, + "grad_norm": 0.19944290816783905, + "learning_rate": 4.03582451517861e-05, + "loss": 1.7232, + "step": 18732 + }, + { + "epoch": 5.749846531614487, + "grad_norm": 0.1833256036043167, + "learning_rate": 4.035336793112885e-05, + "loss": 1.7199, + "step": 18733 + }, + { + "epoch": 5.750153468385513, + "grad_norm": 0.2596902847290039, + "learning_rate": 4.0348490805805287e-05, + "loss": 1.7386, + "step": 18734 + }, + { + "epoch": 5.750460405156538, + "grad_norm": 0.23708637058734894, + "learning_rate": 4.034361377586357e-05, + "loss": 1.7697, + "step": 18735 + }, + { + "epoch": 5.750767341927563, + "grad_norm": 0.20476554334163666, + "learning_rate": 4.033873684135195e-05, + "loss": 1.7804, + "step": 18736 + }, + { + "epoch": 5.751074278698588, + "grad_norm": 0.2625868320465088, + "learning_rate": 4.033386000231858e-05, + "loss": 1.7046, + "step": 18737 + }, + { + "epoch": 5.751381215469613, + "grad_norm": 0.23011820018291473, + "learning_rate": 4.032898325881166e-05, + "loss": 1.7758, + "step": 18738 + }, + { + "epoch": 5.7516881522406385, + "grad_norm": 0.23972748219966888, + "learning_rate": 4.032410661087943e-05, + "loss": 1.7165, + "step": 18739 + }, + { + "epoch": 5.751995089011664, + "grad_norm": 0.2241208404302597, + "learning_rate": 4.031923005857001e-05, + "loss": 1.713, + "step": 18740 + }, + { + "epoch": 5.752302025782689, + "grad_norm": 0.22316952049732208, + "learning_rate": 4.0314353601931665e-05, + "loss": 1.7655, + "step": 18741 + }, + { + "epoch": 5.752608962553714, + "grad_norm": 
0.2177707403898239, + "learning_rate": 4.030947724101253e-05, + "loss": 1.7517, + "step": 18742 + }, + { + "epoch": 5.752915899324739, + "grad_norm": 0.21731823682785034, + "learning_rate": 4.030460097586083e-05, + "loss": 1.718, + "step": 18743 + }, + { + "epoch": 5.753222836095764, + "grad_norm": 0.1700165718793869, + "learning_rate": 4.0299724806524744e-05, + "loss": 1.6536, + "step": 18744 + }, + { + "epoch": 5.75352977286679, + "grad_norm": 0.21920062601566315, + "learning_rate": 4.029484873305247e-05, + "loss": 1.7298, + "step": 18745 + }, + { + "epoch": 5.753836709637815, + "grad_norm": 0.22648905217647552, + "learning_rate": 4.028997275549218e-05, + "loss": 1.7878, + "step": 18746 + }, + { + "epoch": 5.75414364640884, + "grad_norm": 0.19443005323410034, + "learning_rate": 4.028509687389208e-05, + "loss": 1.7582, + "step": 18747 + }, + { + "epoch": 5.754450583179865, + "grad_norm": 0.21973860263824463, + "learning_rate": 4.028022108830034e-05, + "loss": 1.8215, + "step": 18748 + }, + { + "epoch": 5.75475751995089, + "grad_norm": 0.2215481847524643, + "learning_rate": 4.0275345398765155e-05, + "loss": 1.7092, + "step": 18749 + }, + { + "epoch": 5.755064456721915, + "grad_norm": 0.18789733946323395, + "learning_rate": 4.0270469805334696e-05, + "loss": 1.7089, + "step": 18750 + }, + { + "epoch": 5.755371393492941, + "grad_norm": 0.2423657774925232, + "learning_rate": 4.0265594308057175e-05, + "loss": 1.7412, + "step": 18751 + }, + { + "epoch": 5.755678330263965, + "grad_norm": 0.22020475566387177, + "learning_rate": 4.026071890698074e-05, + "loss": 1.7644, + "step": 18752 + }, + { + "epoch": 5.7559852670349905, + "grad_norm": 0.31772032380104065, + "learning_rate": 4.025584360215361e-05, + "loss": 1.7326, + "step": 18753 + }, + { + "epoch": 5.756292203806016, + "grad_norm": 0.23786257207393646, + "learning_rate": 4.025096839362393e-05, + "loss": 1.7652, + "step": 18754 + }, + { + "epoch": 5.756599140577041, + "grad_norm": 0.24288083612918854, + "learning_rate": 
4.024609328143989e-05, + "loss": 1.6797, + "step": 18755 + }, + { + "epoch": 5.7569060773480665, + "grad_norm": 0.30519670248031616, + "learning_rate": 4.024121826564969e-05, + "loss": 1.7442, + "step": 18756 + }, + { + "epoch": 5.757213014119092, + "grad_norm": 0.218281090259552, + "learning_rate": 4.023634334630147e-05, + "loss": 1.7498, + "step": 18757 + }, + { + "epoch": 5.757519950890116, + "grad_norm": 0.215846985578537, + "learning_rate": 4.023146852344345e-05, + "loss": 1.7728, + "step": 18758 + }, + { + "epoch": 5.757826887661142, + "grad_norm": 0.2883944511413574, + "learning_rate": 4.022659379712376e-05, + "loss": 1.8098, + "step": 18759 + }, + { + "epoch": 5.758133824432167, + "grad_norm": 0.25141629576683044, + "learning_rate": 4.022171916739062e-05, + "loss": 1.6574, + "step": 18760 + }, + { + "epoch": 5.758440761203192, + "grad_norm": 0.22118757665157318, + "learning_rate": 4.021684463429216e-05, + "loss": 1.7542, + "step": 18761 + }, + { + "epoch": 5.758747697974218, + "grad_norm": 0.2437646985054016, + "learning_rate": 4.02119701978766e-05, + "loss": 1.7182, + "step": 18762 + }, + { + "epoch": 5.759054634745242, + "grad_norm": 0.24247203767299652, + "learning_rate": 4.020709585819206e-05, + "loss": 1.7134, + "step": 18763 + }, + { + "epoch": 5.759361571516267, + "grad_norm": 0.208528533577919, + "learning_rate": 4.020222161528677e-05, + "loss": 1.6966, + "step": 18764 + }, + { + "epoch": 5.759668508287293, + "grad_norm": 0.19645826518535614, + "learning_rate": 4.0197347469208843e-05, + "loss": 1.7261, + "step": 18765 + }, + { + "epoch": 5.759975445058318, + "grad_norm": 0.20066291093826294, + "learning_rate": 4.019247342000648e-05, + "loss": 1.7197, + "step": 18766 + }, + { + "epoch": 5.760282381829343, + "grad_norm": 0.25344669818878174, + "learning_rate": 4.0187599467727845e-05, + "loss": 1.7957, + "step": 18767 + }, + { + "epoch": 5.760589318600369, + "grad_norm": 0.1917620301246643, + "learning_rate": 4.018272561242111e-05, + "loss": 1.6868, + 
"step": 18768 + }, + { + "epoch": 5.760896255371393, + "grad_norm": 0.21996566653251648, + "learning_rate": 4.0177851854134424e-05, + "loss": 1.7128, + "step": 18769 + }, + { + "epoch": 5.7612031921424185, + "grad_norm": 0.23226283490657806, + "learning_rate": 4.017297819291598e-05, + "loss": 1.7079, + "step": 18770 + }, + { + "epoch": 5.761510128913444, + "grad_norm": 0.30606213212013245, + "learning_rate": 4.016810462881391e-05, + "loss": 1.8087, + "step": 18771 + }, + { + "epoch": 5.761817065684469, + "grad_norm": 0.2171698361635208, + "learning_rate": 4.016323116187639e-05, + "loss": 1.7377, + "step": 18772 + }, + { + "epoch": 5.7621240024554945, + "grad_norm": 0.24234412610530853, + "learning_rate": 4.01583577921516e-05, + "loss": 1.734, + "step": 18773 + }, + { + "epoch": 5.762430939226519, + "grad_norm": 0.2648961544036865, + "learning_rate": 4.015348451968767e-05, + "loss": 1.7423, + "step": 18774 + }, + { + "epoch": 5.762737875997544, + "grad_norm": 0.18316571414470673, + "learning_rate": 4.01486113445328e-05, + "loss": 1.6708, + "step": 18775 + }, + { + "epoch": 5.76304481276857, + "grad_norm": 0.241583451628685, + "learning_rate": 4.0143738266735104e-05, + "loss": 1.708, + "step": 18776 + }, + { + "epoch": 5.763351749539595, + "grad_norm": 0.2268480360507965, + "learning_rate": 4.0138865286342775e-05, + "loss": 1.7106, + "step": 18777 + }, + { + "epoch": 5.76365868631062, + "grad_norm": 0.2038748860359192, + "learning_rate": 4.0133992403403944e-05, + "loss": 1.7349, + "step": 18778 + }, + { + "epoch": 5.763965623081646, + "grad_norm": 0.24422483146190643, + "learning_rate": 4.0129119617966805e-05, + "loss": 1.659, + "step": 18779 + }, + { + "epoch": 5.76427255985267, + "grad_norm": 0.19925715029239655, + "learning_rate": 4.0124246930079476e-05, + "loss": 1.6983, + "step": 18780 + }, + { + "epoch": 5.764579496623695, + "grad_norm": 0.29671359062194824, + "learning_rate": 4.0119374339790136e-05, + "loss": 1.7188, + "step": 18781 + }, + { + "epoch": 
5.764886433394721, + "grad_norm": 0.2752140760421753, + "learning_rate": 4.011450184714692e-05, + "loss": 1.738, + "step": 18782 + }, + { + "epoch": 5.765193370165746, + "grad_norm": 0.2112676352262497, + "learning_rate": 4.0109629452198e-05, + "loss": 1.7529, + "step": 18783 + }, + { + "epoch": 5.765500306936771, + "grad_norm": 0.2091330885887146, + "learning_rate": 4.010475715499151e-05, + "loss": 1.6771, + "step": 18784 + }, + { + "epoch": 5.765807243707796, + "grad_norm": 0.26556238532066345, + "learning_rate": 4.009988495557562e-05, + "loss": 1.7721, + "step": 18785 + }, + { + "epoch": 5.766114180478821, + "grad_norm": 0.20728638768196106, + "learning_rate": 4.009501285399846e-05, + "loss": 1.6893, + "step": 18786 + }, + { + "epoch": 5.7664211172498465, + "grad_norm": 0.213730126619339, + "learning_rate": 4.00901408503082e-05, + "loss": 1.704, + "step": 18787 + }, + { + "epoch": 5.766728054020872, + "grad_norm": 0.21422363817691803, + "learning_rate": 4.0085268944552975e-05, + "loss": 1.7571, + "step": 18788 + }, + { + "epoch": 5.767034990791897, + "grad_norm": 0.20936815440654755, + "learning_rate": 4.0080397136780915e-05, + "loss": 1.7423, + "step": 18789 + }, + { + "epoch": 5.7673419275629225, + "grad_norm": 0.26223674416542053, + "learning_rate": 4.007552542704021e-05, + "loss": 1.7687, + "step": 18790 + }, + { + "epoch": 5.767648864333947, + "grad_norm": 0.3524645268917084, + "learning_rate": 4.0070653815378954e-05, + "loss": 1.7754, + "step": 18791 + }, + { + "epoch": 5.767955801104972, + "grad_norm": 0.20238324999809265, + "learning_rate": 4.006578230184534e-05, + "loss": 1.7043, + "step": 18792 + }, + { + "epoch": 5.768262737875998, + "grad_norm": 0.2739984393119812, + "learning_rate": 4.006091088648747e-05, + "loss": 1.7596, + "step": 18793 + }, + { + "epoch": 5.768569674647023, + "grad_norm": 0.29209306836128235, + "learning_rate": 4.0056039569353515e-05, + "loss": 1.6857, + "step": 18794 + }, + { + "epoch": 5.768876611418047, + "grad_norm": 
0.21838447451591492, + "learning_rate": 4.005116835049161e-05, + "loss": 1.7531, + "step": 18795 + }, + { + "epoch": 5.769183548189073, + "grad_norm": 0.21940091252326965, + "learning_rate": 4.0046297229949884e-05, + "loss": 1.7363, + "step": 18796 + }, + { + "epoch": 5.769490484960098, + "grad_norm": 0.22679758071899414, + "learning_rate": 4.004142620777647e-05, + "loss": 1.7586, + "step": 18797 + }, + { + "epoch": 5.769797421731123, + "grad_norm": 0.23782022297382355, + "learning_rate": 4.003655528401954e-05, + "loss": 1.7154, + "step": 18798 + }, + { + "epoch": 5.770104358502149, + "grad_norm": 0.20452092587947845, + "learning_rate": 4.0031684458727194e-05, + "loss": 1.7078, + "step": 18799 + }, + { + "epoch": 5.770411295273174, + "grad_norm": 0.22733618319034576, + "learning_rate": 4.0026813731947594e-05, + "loss": 1.6989, + "step": 18800 + }, + { + "epoch": 5.7707182320441985, + "grad_norm": 0.2322154939174652, + "learning_rate": 4.002194310372886e-05, + "loss": 1.7508, + "step": 18801 + }, + { + "epoch": 5.771025168815224, + "grad_norm": 0.24573352932929993, + "learning_rate": 4.001707257411914e-05, + "loss": 1.7245, + "step": 18802 + }, + { + "epoch": 5.771332105586249, + "grad_norm": 0.19692079722881317, + "learning_rate": 4.001220214316655e-05, + "loss": 1.7116, + "step": 18803 + }, + { + "epoch": 5.7716390423572745, + "grad_norm": 0.20525199174880981, + "learning_rate": 4.000733181091925e-05, + "loss": 1.7503, + "step": 18804 + }, + { + "epoch": 5.7719459791283, + "grad_norm": 0.2097626030445099, + "learning_rate": 4.0002461577425344e-05, + "loss": 1.8204, + "step": 18805 + }, + { + "epoch": 5.772252915899324, + "grad_norm": 0.23059608042240143, + "learning_rate": 3.9997591442732975e-05, + "loss": 1.7747, + "step": 18806 + }, + { + "epoch": 5.77255985267035, + "grad_norm": 0.22085745632648468, + "learning_rate": 3.9992721406890265e-05, + "loss": 1.7579, + "step": 18807 + }, + { + "epoch": 5.772866789441375, + "grad_norm": 0.21529869735240936, + 
"learning_rate": 3.9987851469945334e-05, + "loss": 1.711, + "step": 18808 + }, + { + "epoch": 5.7731737262124, + "grad_norm": 0.20563572645187378, + "learning_rate": 3.998298163194636e-05, + "loss": 1.761, + "step": 18809 + }, + { + "epoch": 5.773480662983426, + "grad_norm": 0.2081122100353241, + "learning_rate": 3.9978111892941394e-05, + "loss": 1.7112, + "step": 18810 + }, + { + "epoch": 5.773787599754451, + "grad_norm": 0.2373751550912857, + "learning_rate": 3.9973242252978635e-05, + "loss": 1.7726, + "step": 18811 + }, + { + "epoch": 5.774094536525475, + "grad_norm": 0.2742944359779358, + "learning_rate": 3.996837271210615e-05, + "loss": 1.7743, + "step": 18812 + }, + { + "epoch": 5.774401473296501, + "grad_norm": 0.20724992454051971, + "learning_rate": 3.996350327037208e-05, + "loss": 1.7052, + "step": 18813 + }, + { + "epoch": 5.774708410067526, + "grad_norm": 0.22324968874454498, + "learning_rate": 3.995863392782456e-05, + "loss": 1.7865, + "step": 18814 + }, + { + "epoch": 5.7750153468385514, + "grad_norm": 0.22314245998859406, + "learning_rate": 3.995376468451172e-05, + "loss": 1.7705, + "step": 18815 + }, + { + "epoch": 5.775322283609577, + "grad_norm": 0.20793841779232025, + "learning_rate": 3.994889554048165e-05, + "loss": 1.739, + "step": 18816 + }, + { + "epoch": 5.775629220380601, + "grad_norm": 0.20117145776748657, + "learning_rate": 3.994402649578249e-05, + "loss": 1.7256, + "step": 18817 + }, + { + "epoch": 5.775936157151627, + "grad_norm": 0.24406170845031738, + "learning_rate": 3.993915755046235e-05, + "loss": 1.8015, + "step": 18818 + }, + { + "epoch": 5.776243093922652, + "grad_norm": 0.20912545919418335, + "learning_rate": 3.993428870456935e-05, + "loss": 1.7038, + "step": 18819 + }, + { + "epoch": 5.776550030693677, + "grad_norm": 0.2587272822856903, + "learning_rate": 3.992941995815162e-05, + "loss": 1.7918, + "step": 18820 + }, + { + "epoch": 5.776856967464703, + "grad_norm": 0.2996658980846405, + "learning_rate": 3.9924551311257266e-05, + 
"loss": 1.7513, + "step": 18821 + }, + { + "epoch": 5.777163904235728, + "grad_norm": 0.24603547155857086, + "learning_rate": 3.991968276393441e-05, + "loss": 1.7329, + "step": 18822 + }, + { + "epoch": 5.777470841006752, + "grad_norm": 0.2321038693189621, + "learning_rate": 3.991481431623113e-05, + "loss": 1.7406, + "step": 18823 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.3397100269794464, + "learning_rate": 3.990994596819558e-05, + "loss": 1.8129, + "step": 18824 + }, + { + "epoch": 5.778084714548803, + "grad_norm": 0.2807735800743103, + "learning_rate": 3.990507771987584e-05, + "loss": 1.7579, + "step": 18825 + }, + { + "epoch": 5.778391651319828, + "grad_norm": 0.1952899694442749, + "learning_rate": 3.990020957132007e-05, + "loss": 1.7153, + "step": 18826 + }, + { + "epoch": 5.778698588090853, + "grad_norm": 0.28998714685440063, + "learning_rate": 3.989534152257632e-05, + "loss": 1.7844, + "step": 18827 + }, + { + "epoch": 5.779005524861878, + "grad_norm": 0.20929136872291565, + "learning_rate": 3.989047357369275e-05, + "loss": 1.7499, + "step": 18828 + }, + { + "epoch": 5.7793124616329035, + "grad_norm": 0.31144043803215027, + "learning_rate": 3.9885605724717436e-05, + "loss": 1.7745, + "step": 18829 + }, + { + "epoch": 5.779619398403929, + "grad_norm": 0.22598792612552643, + "learning_rate": 3.988073797569849e-05, + "loss": 1.7226, + "step": 18830 + }, + { + "epoch": 5.779926335174954, + "grad_norm": 0.1971752643585205, + "learning_rate": 3.987587032668402e-05, + "loss": 1.7033, + "step": 18831 + }, + { + "epoch": 5.7802332719459795, + "grad_norm": 0.221087247133255, + "learning_rate": 3.9871002777722156e-05, + "loss": 1.7281, + "step": 18832 + }, + { + "epoch": 5.780540208717004, + "grad_norm": 0.21678583323955536, + "learning_rate": 3.986613532886095e-05, + "loss": 1.7207, + "step": 18833 + }, + { + "epoch": 5.780847145488029, + "grad_norm": 0.2511122226715088, + "learning_rate": 3.9861267980148566e-05, + "loss": 1.7091, + "step": 18834 + }, + 
{ + "epoch": 5.781154082259055, + "grad_norm": 0.2883855104446411, + "learning_rate": 3.985640073163304e-05, + "loss": 1.7963, + "step": 18835 + }, + { + "epoch": 5.78146101903008, + "grad_norm": 0.21786242723464966, + "learning_rate": 3.985153358336253e-05, + "loss": 1.6883, + "step": 18836 + }, + { + "epoch": 5.781767955801105, + "grad_norm": 0.18529155850410461, + "learning_rate": 3.98466665353851e-05, + "loss": 1.7194, + "step": 18837 + }, + { + "epoch": 5.78207489257213, + "grad_norm": 0.20535743236541748, + "learning_rate": 3.984179958774888e-05, + "loss": 1.6943, + "step": 18838 + }, + { + "epoch": 5.782381829343155, + "grad_norm": 0.19377392530441284, + "learning_rate": 3.983693274050195e-05, + "loss": 1.6732, + "step": 18839 + }, + { + "epoch": 5.78268876611418, + "grad_norm": 0.22373615205287933, + "learning_rate": 3.983206599369239e-05, + "loss": 1.7668, + "step": 18840 + }, + { + "epoch": 5.782995702885206, + "grad_norm": 0.2132388800382614, + "learning_rate": 3.982719934736832e-05, + "loss": 1.7155, + "step": 18841 + }, + { + "epoch": 5.783302639656231, + "grad_norm": 0.24871744215488434, + "learning_rate": 3.982233280157782e-05, + "loss": 1.7232, + "step": 18842 + }, + { + "epoch": 5.783609576427256, + "grad_norm": 0.1861848086118698, + "learning_rate": 3.981746635636902e-05, + "loss": 1.707, + "step": 18843 + }, + { + "epoch": 5.783916513198281, + "grad_norm": 0.21882779896259308, + "learning_rate": 3.981260001178995e-05, + "loss": 1.7165, + "step": 18844 + }, + { + "epoch": 5.784223449969306, + "grad_norm": 0.22144648432731628, + "learning_rate": 3.980773376788877e-05, + "loss": 1.7799, + "step": 18845 + }, + { + "epoch": 5.7845303867403315, + "grad_norm": 0.210894376039505, + "learning_rate": 3.980286762471351e-05, + "loss": 1.7539, + "step": 18846 + }, + { + "epoch": 5.784837323511357, + "grad_norm": 0.20435640215873718, + "learning_rate": 3.9798001582312305e-05, + "loss": 1.6736, + "step": 18847 + }, + { + "epoch": 5.785144260282382, + 
"grad_norm": 0.18998762965202332, + "learning_rate": 3.979313564073322e-05, + "loss": 1.7045, + "step": 18848 + }, + { + "epoch": 5.785451197053407, + "grad_norm": 0.19869361817836761, + "learning_rate": 3.978826980002437e-05, + "loss": 1.7444, + "step": 18849 + }, + { + "epoch": 5.785758133824432, + "grad_norm": 0.2175174504518509, + "learning_rate": 3.97834040602338e-05, + "loss": 1.7565, + "step": 18850 + }, + { + "epoch": 5.786065070595457, + "grad_norm": 0.22726793587207794, + "learning_rate": 3.977853842140964e-05, + "loss": 1.713, + "step": 18851 + }, + { + "epoch": 5.786372007366483, + "grad_norm": 0.26518720388412476, + "learning_rate": 3.9773672883599934e-05, + "loss": 1.6892, + "step": 18852 + }, + { + "epoch": 5.786678944137508, + "grad_norm": 0.20721858739852905, + "learning_rate": 3.97688074468528e-05, + "loss": 1.724, + "step": 18853 + }, + { + "epoch": 5.786985880908533, + "grad_norm": 0.22739483416080475, + "learning_rate": 3.976394211121629e-05, + "loss": 1.762, + "step": 18854 + }, + { + "epoch": 5.787292817679558, + "grad_norm": 0.21918894350528717, + "learning_rate": 3.975907687673853e-05, + "loss": 1.6812, + "step": 18855 + }, + { + "epoch": 5.787599754450583, + "grad_norm": 0.20931273698806763, + "learning_rate": 3.9754211743467574e-05, + "loss": 1.6874, + "step": 18856 + }, + { + "epoch": 5.787906691221608, + "grad_norm": 0.2015041708946228, + "learning_rate": 3.974934671145148e-05, + "loss": 1.7248, + "step": 18857 + }, + { + "epoch": 5.788213627992634, + "grad_norm": 0.21632663905620575, + "learning_rate": 3.974448178073836e-05, + "loss": 1.7313, + "step": 18858 + }, + { + "epoch": 5.788520564763659, + "grad_norm": 0.18995213508605957, + "learning_rate": 3.973961695137627e-05, + "loss": 1.6761, + "step": 18859 + }, + { + "epoch": 5.7888275015346835, + "grad_norm": 0.18678395450115204, + "learning_rate": 3.973475222341333e-05, + "loss": 1.7082, + "step": 18860 + }, + { + "epoch": 5.789134438305709, + "grad_norm": 0.1889343559741974, + 
"learning_rate": 3.972988759689756e-05, + "loss": 1.7296, + "step": 18861 + }, + { + "epoch": 5.789441375076734, + "grad_norm": 0.20196790993213654, + "learning_rate": 3.9725023071877074e-05, + "loss": 1.6876, + "step": 18862 + }, + { + "epoch": 5.7897483118477595, + "grad_norm": 0.198349729180336, + "learning_rate": 3.972015864839992e-05, + "loss": 1.6826, + "step": 18863 + }, + { + "epoch": 5.790055248618785, + "grad_norm": 0.21323837339878082, + "learning_rate": 3.9715294326514185e-05, + "loss": 1.7444, + "step": 18864 + }, + { + "epoch": 5.79036218538981, + "grad_norm": 0.18581731617450714, + "learning_rate": 3.9710430106267934e-05, + "loss": 1.7731, + "step": 18865 + }, + { + "epoch": 5.790669122160835, + "grad_norm": 0.21925146877765656, + "learning_rate": 3.970556598770927e-05, + "loss": 1.7505, + "step": 18866 + }, + { + "epoch": 5.79097605893186, + "grad_norm": 0.20773115754127502, + "learning_rate": 3.970070197088621e-05, + "loss": 1.7408, + "step": 18867 + }, + { + "epoch": 5.791282995702885, + "grad_norm": 0.1805189698934555, + "learning_rate": 3.9695838055846865e-05, + "loss": 1.6871, + "step": 18868 + }, + { + "epoch": 5.791589932473911, + "grad_norm": 0.24685314297676086, + "learning_rate": 3.969097424263928e-05, + "loss": 1.7186, + "step": 18869 + }, + { + "epoch": 5.791896869244935, + "grad_norm": 0.18801769614219666, + "learning_rate": 3.9686110531311526e-05, + "loss": 1.7196, + "step": 18870 + }, + { + "epoch": 5.79220380601596, + "grad_norm": 0.22717779874801636, + "learning_rate": 3.968124692191168e-05, + "loss": 1.7309, + "step": 18871 + }, + { + "epoch": 5.792510742786986, + "grad_norm": 0.23058642446994781, + "learning_rate": 3.9676383414487806e-05, + "loss": 1.6993, + "step": 18872 + }, + { + "epoch": 5.792817679558011, + "grad_norm": 0.24307532608509064, + "learning_rate": 3.967152000908796e-05, + "loss": 1.6986, + "step": 18873 + }, + { + "epoch": 5.793124616329036, + "grad_norm": 0.3032459318637848, + "learning_rate": 
3.9666656705760195e-05, + "loss": 1.677, + "step": 18874 + }, + { + "epoch": 5.793431553100062, + "grad_norm": 0.22669538855552673, + "learning_rate": 3.966179350455259e-05, + "loss": 1.7361, + "step": 18875 + }, + { + "epoch": 5.793738489871086, + "grad_norm": 0.27729150652885437, + "learning_rate": 3.96569304055132e-05, + "loss": 1.746, + "step": 18876 + }, + { + "epoch": 5.7940454266421115, + "grad_norm": 0.3422098755836487, + "learning_rate": 3.96520674086901e-05, + "loss": 1.783, + "step": 18877 + }, + { + "epoch": 5.794352363413137, + "grad_norm": 0.2114052176475525, + "learning_rate": 3.964720451413131e-05, + "loss": 1.7127, + "step": 18878 + }, + { + "epoch": 5.794659300184162, + "grad_norm": 0.22928549349308014, + "learning_rate": 3.964234172188494e-05, + "loss": 1.6579, + "step": 18879 + }, + { + "epoch": 5.7949662369551875, + "grad_norm": 0.24813635647296906, + "learning_rate": 3.9637479031999e-05, + "loss": 1.728, + "step": 18880 + }, + { + "epoch": 5.795273173726212, + "grad_norm": 0.19779744744300842, + "learning_rate": 3.963261644452158e-05, + "loss": 1.7338, + "step": 18881 + }, + { + "epoch": 5.795580110497237, + "grad_norm": 0.2424263060092926, + "learning_rate": 3.96277539595007e-05, + "loss": 1.7762, + "step": 18882 + }, + { + "epoch": 5.795887047268263, + "grad_norm": 0.24621224403381348, + "learning_rate": 3.9622891576984456e-05, + "loss": 1.7746, + "step": 18883 + }, + { + "epoch": 5.796193984039288, + "grad_norm": 0.1973372846841812, + "learning_rate": 3.961802929702086e-05, + "loss": 1.7243, + "step": 18884 + }, + { + "epoch": 5.796500920810313, + "grad_norm": 0.22170570492744446, + "learning_rate": 3.961316711965801e-05, + "loss": 1.764, + "step": 18885 + }, + { + "epoch": 5.796807857581339, + "grad_norm": 0.22319282591342926, + "learning_rate": 3.9608305044943906e-05, + "loss": 1.6795, + "step": 18886 + }, + { + "epoch": 5.797114794352363, + "grad_norm": 0.20000022649765015, + "learning_rate": 3.9603443072926635e-05, + "loss": 1.7587, + 
"step": 18887 + }, + { + "epoch": 5.797421731123388, + "grad_norm": 0.25041815638542175, + "learning_rate": 3.959858120365424e-05, + "loss": 1.7631, + "step": 18888 + }, + { + "epoch": 5.797728667894414, + "grad_norm": 0.23383729159832, + "learning_rate": 3.959371943717474e-05, + "loss": 1.741, + "step": 18889 + }, + { + "epoch": 5.798035604665439, + "grad_norm": 0.18609663844108582, + "learning_rate": 3.958885777353623e-05, + "loss": 1.6981, + "step": 18890 + }, + { + "epoch": 5.798342541436464, + "grad_norm": 0.29523593187332153, + "learning_rate": 3.9583996212786706e-05, + "loss": 1.8018, + "step": 18891 + }, + { + "epoch": 5.798649478207489, + "grad_norm": 0.20356589555740356, + "learning_rate": 3.9579134754974244e-05, + "loss": 1.7157, + "step": 18892 + }, + { + "epoch": 5.798956414978514, + "grad_norm": 0.2901862561702728, + "learning_rate": 3.957427340014688e-05, + "loss": 1.7249, + "step": 18893 + }, + { + "epoch": 5.7992633517495396, + "grad_norm": 0.24768278002738953, + "learning_rate": 3.956941214835267e-05, + "loss": 1.6894, + "step": 18894 + }, + { + "epoch": 5.799570288520565, + "grad_norm": 0.2417999804019928, + "learning_rate": 3.956455099963962e-05, + "loss": 1.7203, + "step": 18895 + }, + { + "epoch": 5.79987722529159, + "grad_norm": 0.2889639437198639, + "learning_rate": 3.9559689954055814e-05, + "loss": 1.7531, + "step": 18896 + }, + { + "epoch": 5.800184162062616, + "grad_norm": 0.21204611659049988, + "learning_rate": 3.955482901164926e-05, + "loss": 1.7521, + "step": 18897 + }, + { + "epoch": 5.80049109883364, + "grad_norm": 0.2961438298225403, + "learning_rate": 3.954996817246801e-05, + "loss": 1.8102, + "step": 18898 + }, + { + "epoch": 5.800798035604665, + "grad_norm": 0.36562761664390564, + "learning_rate": 3.9545107436560084e-05, + "loss": 1.6722, + "step": 18899 + }, + { + "epoch": 5.801104972375691, + "grad_norm": 0.22423696517944336, + "learning_rate": 3.954024680397357e-05, + "loss": 1.7101, + "step": 18900 + }, + { + "epoch": 
5.801411909146716, + "grad_norm": 0.3122335970401764, + "learning_rate": 3.953538627475644e-05, + "loss": 1.7314, + "step": 18901 + }, + { + "epoch": 5.8017188459177405, + "grad_norm": 0.39004257321357727, + "learning_rate": 3.953052584895677e-05, + "loss": 1.762, + "step": 18902 + }, + { + "epoch": 5.802025782688766, + "grad_norm": 0.1827487200498581, + "learning_rate": 3.952566552662256e-05, + "loss": 1.6935, + "step": 18903 + }, + { + "epoch": 5.802332719459791, + "grad_norm": 0.3025164306163788, + "learning_rate": 3.952080530780188e-05, + "loss": 1.7448, + "step": 18904 + }, + { + "epoch": 5.8026396562308165, + "grad_norm": 0.2313300520181656, + "learning_rate": 3.9515945192542754e-05, + "loss": 1.7686, + "step": 18905 + }, + { + "epoch": 5.802946593001842, + "grad_norm": 0.3501042425632477, + "learning_rate": 3.9511085180893184e-05, + "loss": 1.775, + "step": 18906 + }, + { + "epoch": 5.803253529772867, + "grad_norm": 0.4111124873161316, + "learning_rate": 3.950622527290123e-05, + "loss": 1.7561, + "step": 18907 + }, + { + "epoch": 5.803560466543892, + "grad_norm": 0.20877736806869507, + "learning_rate": 3.950136546861489e-05, + "loss": 1.7356, + "step": 18908 + }, + { + "epoch": 5.803867403314917, + "grad_norm": 0.33404025435447693, + "learning_rate": 3.949650576808222e-05, + "loss": 1.7289, + "step": 18909 + }, + { + "epoch": 5.804174340085942, + "grad_norm": 0.2183927446603775, + "learning_rate": 3.9491646171351234e-05, + "loss": 1.7136, + "step": 18910 + }, + { + "epoch": 5.804481276856968, + "grad_norm": 0.27149543166160583, + "learning_rate": 3.948678667846997e-05, + "loss": 1.7516, + "step": 18911 + }, + { + "epoch": 5.804788213627993, + "grad_norm": 0.2369886338710785, + "learning_rate": 3.948192728948643e-05, + "loss": 1.6767, + "step": 18912 + }, + { + "epoch": 5.805095150399017, + "grad_norm": 0.20671069622039795, + "learning_rate": 3.947706800444867e-05, + "loss": 1.7831, + "step": 18913 + }, + { + "epoch": 5.805402087170043, + "grad_norm": 
0.23622260987758636, + "learning_rate": 3.9472208823404665e-05, + "loss": 1.7121, + "step": 18914 + }, + { + "epoch": 5.805709023941068, + "grad_norm": 0.21099595725536346, + "learning_rate": 3.946734974640247e-05, + "loss": 1.7137, + "step": 18915 + }, + { + "epoch": 5.806015960712093, + "grad_norm": 0.2205580472946167, + "learning_rate": 3.9462490773490094e-05, + "loss": 1.713, + "step": 18916 + }, + { + "epoch": 5.806322897483119, + "grad_norm": 0.20183326303958893, + "learning_rate": 3.9457631904715584e-05, + "loss": 1.7316, + "step": 18917 + }, + { + "epoch": 5.806629834254144, + "grad_norm": 0.27381497621536255, + "learning_rate": 3.9452773140126906e-05, + "loss": 1.7577, + "step": 18918 + }, + { + "epoch": 5.8069367710251685, + "grad_norm": 0.29962384700775146, + "learning_rate": 3.944791447977214e-05, + "loss": 1.7579, + "step": 18919 + }, + { + "epoch": 5.807243707796194, + "grad_norm": 0.22385326027870178, + "learning_rate": 3.944305592369923e-05, + "loss": 1.7795, + "step": 18920 + }, + { + "epoch": 5.807550644567219, + "grad_norm": 0.2954902648925781, + "learning_rate": 3.943819747195625e-05, + "loss": 1.6655, + "step": 18921 + }, + { + "epoch": 5.8078575813382445, + "grad_norm": 0.18947024643421173, + "learning_rate": 3.94333391245912e-05, + "loss": 1.6803, + "step": 18922 + }, + { + "epoch": 5.80816451810927, + "grad_norm": 0.26797959208488464, + "learning_rate": 3.942848088165206e-05, + "loss": 1.7671, + "step": 18923 + }, + { + "epoch": 5.808471454880294, + "grad_norm": 0.23453201353549957, + "learning_rate": 3.94236227431869e-05, + "loss": 1.7472, + "step": 18924 + }, + { + "epoch": 5.80877839165132, + "grad_norm": 0.24471673369407654, + "learning_rate": 3.941876470924367e-05, + "loss": 1.7482, + "step": 18925 + }, + { + "epoch": 5.809085328422345, + "grad_norm": 0.22249098122119904, + "learning_rate": 3.9413906779870426e-05, + "loss": 1.6794, + "step": 18926 + }, + { + "epoch": 5.80939226519337, + "grad_norm": 0.1985001564025879, + 
"learning_rate": 3.9409048955115144e-05, + "loss": 1.7278, + "step": 18927 + }, + { + "epoch": 5.809699201964396, + "grad_norm": 0.22482000291347504, + "learning_rate": 3.940419123502587e-05, + "loss": 1.7658, + "step": 18928 + }, + { + "epoch": 5.810006138735421, + "grad_norm": 0.18513578176498413, + "learning_rate": 3.939933361965057e-05, + "loss": 1.7154, + "step": 18929 + }, + { + "epoch": 5.810313075506445, + "grad_norm": 0.1984710991382599, + "learning_rate": 3.939447610903729e-05, + "loss": 1.7324, + "step": 18930 + }, + { + "epoch": 5.810620012277471, + "grad_norm": 0.26089081168174744, + "learning_rate": 3.938961870323399e-05, + "loss": 1.774, + "step": 18931 + }, + { + "epoch": 5.810926949048496, + "grad_norm": 0.2059585452079773, + "learning_rate": 3.9384761402288706e-05, + "loss": 1.7059, + "step": 18932 + }, + { + "epoch": 5.811233885819521, + "grad_norm": 0.1887979656457901, + "learning_rate": 3.937990420624942e-05, + "loss": 1.6829, + "step": 18933 + }, + { + "epoch": 5.811540822590547, + "grad_norm": 0.2589145600795746, + "learning_rate": 3.937504711516417e-05, + "loss": 1.7301, + "step": 18934 + }, + { + "epoch": 5.811847759361571, + "grad_norm": 0.209516704082489, + "learning_rate": 3.9370190129080907e-05, + "loss": 1.7716, + "step": 18935 + }, + { + "epoch": 5.8121546961325965, + "grad_norm": 0.3321632146835327, + "learning_rate": 3.936533324804768e-05, + "loss": 1.7754, + "step": 18936 + }, + { + "epoch": 5.812461632903622, + "grad_norm": 0.236944317817688, + "learning_rate": 3.9360476472112446e-05, + "loss": 1.7546, + "step": 18937 + }, + { + "epoch": 5.812768569674647, + "grad_norm": 0.29667431116104126, + "learning_rate": 3.9355619801323226e-05, + "loss": 1.7712, + "step": 18938 + }, + { + "epoch": 5.8130755064456725, + "grad_norm": 0.3071129620075226, + "learning_rate": 3.935076323572802e-05, + "loss": 1.7351, + "step": 18939 + }, + { + "epoch": 5.813382443216698, + "grad_norm": 0.22747032344341278, + "learning_rate": 3.934590677537479e-05, 
+ "loss": 1.7788, + "step": 18940 + }, + { + "epoch": 5.813689379987722, + "grad_norm": 0.2575854957103729, + "learning_rate": 3.934105042031158e-05, + "loss": 1.705, + "step": 18941 + }, + { + "epoch": 5.813996316758748, + "grad_norm": 0.2561504542827606, + "learning_rate": 3.9336194170586325e-05, + "loss": 1.7309, + "step": 18942 + }, + { + "epoch": 5.814303253529773, + "grad_norm": 0.21570482850074768, + "learning_rate": 3.933133802624707e-05, + "loss": 1.7408, + "step": 18943 + }, + { + "epoch": 5.814610190300798, + "grad_norm": 0.29227179288864136, + "learning_rate": 3.932648198734177e-05, + "loss": 1.7415, + "step": 18944 + }, + { + "epoch": 5.814917127071823, + "grad_norm": 0.17847758531570435, + "learning_rate": 3.9321626053918456e-05, + "loss": 1.7926, + "step": 18945 + }, + { + "epoch": 5.815224063842848, + "grad_norm": 0.24604015052318573, + "learning_rate": 3.931677022602507e-05, + "loss": 1.7519, + "step": 18946 + }, + { + "epoch": 5.815531000613873, + "grad_norm": 0.23843185603618622, + "learning_rate": 3.931191450370965e-05, + "loss": 1.7206, + "step": 18947 + }, + { + "epoch": 5.815837937384899, + "grad_norm": 0.23431400954723358, + "learning_rate": 3.9307058887020126e-05, + "loss": 1.7743, + "step": 18948 + }, + { + "epoch": 5.816144874155924, + "grad_norm": 0.23685097694396973, + "learning_rate": 3.9302203376004525e-05, + "loss": 1.7485, + "step": 18949 + }, + { + "epoch": 5.816451810926949, + "grad_norm": 0.2129819542169571, + "learning_rate": 3.929734797071082e-05, + "loss": 1.6897, + "step": 18950 + }, + { + "epoch": 5.816758747697974, + "grad_norm": 0.24736030399799347, + "learning_rate": 3.9292492671187e-05, + "loss": 1.7292, + "step": 18951 + }, + { + "epoch": 5.817065684468999, + "grad_norm": 0.28659793734550476, + "learning_rate": 3.9287637477481025e-05, + "loss": 1.6772, + "step": 18952 + }, + { + "epoch": 5.8173726212400245, + "grad_norm": 0.22304075956344604, + "learning_rate": 3.928278238964092e-05, + "loss": 1.7991, + "step": 18953 + 
}, + { + "epoch": 5.81767955801105, + "grad_norm": 0.25354304909706116, + "learning_rate": 3.927792740771462e-05, + "loss": 1.7407, + "step": 18954 + }, + { + "epoch": 5.817986494782075, + "grad_norm": 0.3014552593231201, + "learning_rate": 3.927307253175014e-05, + "loss": 1.7714, + "step": 18955 + }, + { + "epoch": 5.8182934315531, + "grad_norm": 0.20537856221199036, + "learning_rate": 3.926821776179545e-05, + "loss": 1.6992, + "step": 18956 + }, + { + "epoch": 5.818600368324125, + "grad_norm": 0.29656440019607544, + "learning_rate": 3.92633630978985e-05, + "loss": 1.7476, + "step": 18957 + }, + { + "epoch": 5.81890730509515, + "grad_norm": 0.20956869423389435, + "learning_rate": 3.925850854010732e-05, + "loss": 1.808, + "step": 18958 + }, + { + "epoch": 5.819214241866176, + "grad_norm": 0.29395633935928345, + "learning_rate": 3.925365408846983e-05, + "loss": 1.7787, + "step": 18959 + }, + { + "epoch": 5.819521178637201, + "grad_norm": 0.31101030111312866, + "learning_rate": 3.9248799743034025e-05, + "loss": 1.7685, + "step": 18960 + }, + { + "epoch": 5.819828115408226, + "grad_norm": 0.2109794020652771, + "learning_rate": 3.9243945503847894e-05, + "loss": 1.7307, + "step": 18961 + }, + { + "epoch": 5.820135052179251, + "grad_norm": 0.2503393292427063, + "learning_rate": 3.9239091370959405e-05, + "loss": 1.763, + "step": 18962 + }, + { + "epoch": 5.820441988950276, + "grad_norm": 0.21757015585899353, + "learning_rate": 3.92342373444165e-05, + "loss": 1.7862, + "step": 18963 + }, + { + "epoch": 5.820748925721301, + "grad_norm": 0.22108088433742523, + "learning_rate": 3.9229383424267197e-05, + "loss": 1.6845, + "step": 18964 + }, + { + "epoch": 5.821055862492327, + "grad_norm": 0.20059655606746674, + "learning_rate": 3.922452961055941e-05, + "loss": 1.7523, + "step": 18965 + }, + { + "epoch": 5.821362799263352, + "grad_norm": 0.22009585797786713, + "learning_rate": 3.921967590334117e-05, + "loss": 1.7802, + "step": 18966 + }, + { + "epoch": 5.8216697360343765, + 
"grad_norm": 0.22554142773151398, + "learning_rate": 3.9214822302660386e-05, + "loss": 1.7911, + "step": 18967 + }, + { + "epoch": 5.821976672805402, + "grad_norm": 0.23434770107269287, + "learning_rate": 3.920996880856506e-05, + "loss": 1.6755, + "step": 18968 + }, + { + "epoch": 5.822283609576427, + "grad_norm": 0.2162926346063614, + "learning_rate": 3.920511542110314e-05, + "loss": 1.7145, + "step": 18969 + }, + { + "epoch": 5.8225905463474525, + "grad_norm": 0.18654806911945343, + "learning_rate": 3.9200262140322616e-05, + "loss": 1.7076, + "step": 18970 + }, + { + "epoch": 5.822897483118478, + "grad_norm": 0.22357499599456787, + "learning_rate": 3.9195408966271404e-05, + "loss": 1.791, + "step": 18971 + }, + { + "epoch": 5.823204419889503, + "grad_norm": 0.21073313057422638, + "learning_rate": 3.919055589899752e-05, + "loss": 1.7976, + "step": 18972 + }, + { + "epoch": 5.823511356660528, + "grad_norm": 0.21481956541538239, + "learning_rate": 3.9185702938548886e-05, + "loss": 1.7468, + "step": 18973 + }, + { + "epoch": 5.823818293431553, + "grad_norm": 0.22051872313022614, + "learning_rate": 3.9180850084973464e-05, + "loss": 1.7201, + "step": 18974 + }, + { + "epoch": 5.824125230202578, + "grad_norm": 0.24410493671894073, + "learning_rate": 3.917599733831924e-05, + "loss": 1.7774, + "step": 18975 + }, + { + "epoch": 5.824432166973604, + "grad_norm": 0.19711458683013916, + "learning_rate": 3.917114469863414e-05, + "loss": 1.7907, + "step": 18976 + }, + { + "epoch": 5.824739103744628, + "grad_norm": 0.2045203000307083, + "learning_rate": 3.9166292165966155e-05, + "loss": 1.7105, + "step": 18977 + }, + { + "epoch": 5.8250460405156534, + "grad_norm": 0.21570880711078644, + "learning_rate": 3.9161439740363196e-05, + "loss": 1.7312, + "step": 18978 + }, + { + "epoch": 5.825352977286679, + "grad_norm": 0.21203923225402832, + "learning_rate": 3.915658742187325e-05, + "loss": 1.7869, + "step": 18979 + }, + { + "epoch": 5.825659914057704, + "grad_norm": 
0.26233312487602234, + "learning_rate": 3.915173521054426e-05, + "loss": 1.7453, + "step": 18980 + }, + { + "epoch": 5.8259668508287294, + "grad_norm": 0.23792949318885803, + "learning_rate": 3.91468831064242e-05, + "loss": 1.6886, + "step": 18981 + }, + { + "epoch": 5.826273787599755, + "grad_norm": 0.20325250923633575, + "learning_rate": 3.914203110956098e-05, + "loss": 1.7538, + "step": 18982 + }, + { + "epoch": 5.82658072437078, + "grad_norm": 0.28146329522132874, + "learning_rate": 3.9137179220002596e-05, + "loss": 1.7674, + "step": 18983 + }, + { + "epoch": 5.826887661141805, + "grad_norm": 0.2319503277540207, + "learning_rate": 3.9132327437796946e-05, + "loss": 1.7864, + "step": 18984 + }, + { + "epoch": 5.82719459791283, + "grad_norm": 0.22653794288635254, + "learning_rate": 3.9127475762992025e-05, + "loss": 1.7424, + "step": 18985 + }, + { + "epoch": 5.827501534683855, + "grad_norm": 0.26855236291885376, + "learning_rate": 3.912262419563574e-05, + "loss": 1.762, + "step": 18986 + }, + { + "epoch": 5.827808471454881, + "grad_norm": 0.18356221914291382, + "learning_rate": 3.9117772735776095e-05, + "loss": 1.7199, + "step": 18987 + }, + { + "epoch": 5.828115408225905, + "grad_norm": 0.2802455425262451, + "learning_rate": 3.911292138346096e-05, + "loss": 1.7142, + "step": 18988 + }, + { + "epoch": 5.82842234499693, + "grad_norm": 0.2638777494430542, + "learning_rate": 3.910807013873835e-05, + "loss": 1.6759, + "step": 18989 + }, + { + "epoch": 5.828729281767956, + "grad_norm": 0.18397162854671478, + "learning_rate": 3.910321900165615e-05, + "loss": 1.693, + "step": 18990 + }, + { + "epoch": 5.829036218538981, + "grad_norm": 0.20967607200145721, + "learning_rate": 3.909836797226233e-05, + "loss": 1.6908, + "step": 18991 + }, + { + "epoch": 5.829343155310006, + "grad_norm": 0.21123014390468597, + "learning_rate": 3.909351705060485e-05, + "loss": 1.7875, + "step": 18992 + }, + { + "epoch": 5.829650092081032, + "grad_norm": 0.1988777220249176, + "learning_rate": 
3.90886662367316e-05, + "loss": 1.7254, + "step": 18993 + }, + { + "epoch": 5.829957028852056, + "grad_norm": 0.17793473601341248, + "learning_rate": 3.9083815530690564e-05, + "loss": 1.7233, + "step": 18994 + }, + { + "epoch": 5.8302639656230815, + "grad_norm": 0.2289644330739975, + "learning_rate": 3.9078964932529645e-05, + "loss": 1.7739, + "step": 18995 + }, + { + "epoch": 5.830570902394107, + "grad_norm": 0.18145552277565002, + "learning_rate": 3.9074114442296804e-05, + "loss": 1.6989, + "step": 18996 + }, + { + "epoch": 5.830877839165132, + "grad_norm": 0.1941588670015335, + "learning_rate": 3.9069264060039956e-05, + "loss": 1.6981, + "step": 18997 + }, + { + "epoch": 5.8311847759361575, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.9064413785807075e-05, + "loss": 1.7163, + "step": 18998 + }, + { + "epoch": 5.831491712707182, + "grad_norm": 0.19494447112083435, + "learning_rate": 3.905956361964604e-05, + "loss": 1.7481, + "step": 18999 + }, + { + "epoch": 5.831798649478207, + "grad_norm": 0.2127624899148941, + "learning_rate": 3.9054713561604826e-05, + "loss": 1.7494, + "step": 19000 + }, + { + "epoch": 5.832105586249233, + "grad_norm": 0.20107653737068176, + "learning_rate": 3.9049863611731334e-05, + "loss": 1.7483, + "step": 19001 + }, + { + "epoch": 5.832412523020258, + "grad_norm": 0.22574639320373535, + "learning_rate": 3.904501377007352e-05, + "loss": 1.8184, + "step": 19002 + }, + { + "epoch": 5.832719459791283, + "grad_norm": 0.20027579367160797, + "learning_rate": 3.9040164036679285e-05, + "loss": 1.6995, + "step": 19003 + }, + { + "epoch": 5.833026396562309, + "grad_norm": 0.21599887311458588, + "learning_rate": 3.90353144115966e-05, + "loss": 1.7487, + "step": 19004 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.21122781932353973, + "learning_rate": 3.9030464894873334e-05, + "loss": 1.7332, + "step": 19005 + }, + { + "epoch": 5.833640270104358, + "grad_norm": 0.19006453454494476, + "learning_rate": 3.902561548655747e-05, + 
"loss": 1.688, + "step": 19006 + }, + { + "epoch": 5.833947206875384, + "grad_norm": 0.22979344427585602, + "learning_rate": 3.9020766186696895e-05, + "loss": 1.7495, + "step": 19007 + }, + { + "epoch": 5.834254143646409, + "grad_norm": 0.18405365943908691, + "learning_rate": 3.901591699533953e-05, + "loss": 1.7395, + "step": 19008 + }, + { + "epoch": 5.834561080417434, + "grad_norm": 0.26198676228523254, + "learning_rate": 3.901106791253334e-05, + "loss": 1.8286, + "step": 19009 + }, + { + "epoch": 5.834868017188459, + "grad_norm": 0.2535797357559204, + "learning_rate": 3.900621893832619e-05, + "loss": 1.757, + "step": 19010 + }, + { + "epoch": 5.835174953959484, + "grad_norm": 0.24599581956863403, + "learning_rate": 3.900137007276605e-05, + "loss": 1.7266, + "step": 19011 + }, + { + "epoch": 5.8354818907305095, + "grad_norm": 0.25688427686691284, + "learning_rate": 3.8996521315900805e-05, + "loss": 1.7255, + "step": 19012 + }, + { + "epoch": 5.835788827501535, + "grad_norm": 0.24668128788471222, + "learning_rate": 3.8991672667778385e-05, + "loss": 1.737, + "step": 19013 + }, + { + "epoch": 5.83609576427256, + "grad_norm": 0.28365740180015564, + "learning_rate": 3.8986824128446695e-05, + "loss": 1.7129, + "step": 19014 + }, + { + "epoch": 5.8364027010435855, + "grad_norm": 0.2543952465057373, + "learning_rate": 3.89819756979537e-05, + "loss": 1.7249, + "step": 19015 + }, + { + "epoch": 5.83670963781461, + "grad_norm": 0.2868666350841522, + "learning_rate": 3.8977127376347245e-05, + "loss": 1.6985, + "step": 19016 + }, + { + "epoch": 5.837016574585635, + "grad_norm": 0.3818367123603821, + "learning_rate": 3.897227916367531e-05, + "loss": 1.6954, + "step": 19017 + }, + { + "epoch": 5.837323511356661, + "grad_norm": 0.20922113955020905, + "learning_rate": 3.896743105998574e-05, + "loss": 1.7571, + "step": 19018 + }, + { + "epoch": 5.837630448127686, + "grad_norm": 0.3669843375682831, + "learning_rate": 3.89625830653265e-05, + "loss": 1.8041, + "step": 19019 + }, + { 
+ "epoch": 5.83793738489871, + "grad_norm": 0.2889872193336487, + "learning_rate": 3.895773517974548e-05, + "loss": 1.7775, + "step": 19020 + }, + { + "epoch": 5.838244321669736, + "grad_norm": 0.22619491815567017, + "learning_rate": 3.89528874032906e-05, + "loss": 1.7019, + "step": 19021 + }, + { + "epoch": 5.838551258440761, + "grad_norm": 0.4169046878814697, + "learning_rate": 3.894803973600976e-05, + "loss": 1.8282, + "step": 19022 + }, + { + "epoch": 5.838858195211786, + "grad_norm": 0.2567043900489807, + "learning_rate": 3.894319217795087e-05, + "loss": 1.733, + "step": 19023 + }, + { + "epoch": 5.839165131982812, + "grad_norm": 0.2435060739517212, + "learning_rate": 3.8938344729161834e-05, + "loss": 1.7208, + "step": 19024 + }, + { + "epoch": 5.839472068753837, + "grad_norm": 0.2941838204860687, + "learning_rate": 3.893349738969055e-05, + "loss": 1.7202, + "step": 19025 + }, + { + "epoch": 5.8397790055248615, + "grad_norm": 0.23542317748069763, + "learning_rate": 3.892865015958495e-05, + "loss": 1.7571, + "step": 19026 + }, + { + "epoch": 5.840085942295887, + "grad_norm": 0.3248259723186493, + "learning_rate": 3.8923803038892897e-05, + "loss": 1.7118, + "step": 19027 + }, + { + "epoch": 5.840392879066912, + "grad_norm": 0.24359026551246643, + "learning_rate": 3.891895602766234e-05, + "loss": 1.8126, + "step": 19028 + }, + { + "epoch": 5.8406998158379375, + "grad_norm": 0.3053695559501648, + "learning_rate": 3.8914109125941126e-05, + "loss": 1.6632, + "step": 19029 + }, + { + "epoch": 5.841006752608963, + "grad_norm": 0.3194943368434906, + "learning_rate": 3.8909262333777195e-05, + "loss": 1.8432, + "step": 19030 + }, + { + "epoch": 5.841313689379987, + "grad_norm": 0.23532693088054657, + "learning_rate": 3.8904415651218426e-05, + "loss": 1.716, + "step": 19031 + }, + { + "epoch": 5.841620626151013, + "grad_norm": 0.2941347062587738, + "learning_rate": 3.889956907831275e-05, + "loss": 1.7737, + "step": 19032 + }, + { + "epoch": 5.841927562922038, + 
"grad_norm": 0.2265428602695465, + "learning_rate": 3.889472261510801e-05, + "loss": 1.7111, + "step": 19033 + }, + { + "epoch": 5.842234499693063, + "grad_norm": 0.3023710548877716, + "learning_rate": 3.888987626165216e-05, + "loss": 1.7845, + "step": 19034 + }, + { + "epoch": 5.842541436464089, + "grad_norm": 0.2855348289012909, + "learning_rate": 3.8885030017993026e-05, + "loss": 1.8009, + "step": 19035 + }, + { + "epoch": 5.842848373235114, + "grad_norm": 0.23046357929706573, + "learning_rate": 3.888018388417857e-05, + "loss": 1.8225, + "step": 19036 + }, + { + "epoch": 5.843155310006138, + "grad_norm": 0.23732341825962067, + "learning_rate": 3.8875337860256634e-05, + "loss": 1.7542, + "step": 19037 + }, + { + "epoch": 5.843462246777164, + "grad_norm": 0.18987004458904266, + "learning_rate": 3.887049194627516e-05, + "loss": 1.7327, + "step": 19038 + }, + { + "epoch": 5.843769183548189, + "grad_norm": 0.21539908647537231, + "learning_rate": 3.8865646142281974e-05, + "loss": 1.715, + "step": 19039 + }, + { + "epoch": 5.844076120319214, + "grad_norm": 0.2991954982280731, + "learning_rate": 3.8860800448325024e-05, + "loss": 1.7728, + "step": 19040 + }, + { + "epoch": 5.84438305709024, + "grad_norm": 0.19066409766674042, + "learning_rate": 3.885595486445216e-05, + "loss": 1.7128, + "step": 19041 + }, + { + "epoch": 5.844689993861264, + "grad_norm": 0.21643762290477753, + "learning_rate": 3.885110939071128e-05, + "loss": 1.7584, + "step": 19042 + }, + { + "epoch": 5.8449969306322895, + "grad_norm": 0.20227304100990295, + "learning_rate": 3.884626402715029e-05, + "loss": 1.7053, + "step": 19043 + }, + { + "epoch": 5.845303867403315, + "grad_norm": 0.20429107546806335, + "learning_rate": 3.884141877381703e-05, + "loss": 1.761, + "step": 19044 + }, + { + "epoch": 5.84561080417434, + "grad_norm": 0.1873873621225357, + "learning_rate": 3.8836573630759435e-05, + "loss": 1.7251, + "step": 19045 + }, + { + "epoch": 5.8459177409453655, + "grad_norm": 0.18025323748588562, + 
"learning_rate": 3.883172859802534e-05, + "loss": 1.6696, + "step": 19046 + }, + { + "epoch": 5.846224677716391, + "grad_norm": 0.22011777758598328, + "learning_rate": 3.8826883675662664e-05, + "loss": 1.7148, + "step": 19047 + }, + { + "epoch": 5.846531614487415, + "grad_norm": 0.17827673256397247, + "learning_rate": 3.882203886371925e-05, + "loss": 1.69, + "step": 19048 + }, + { + "epoch": 5.846838551258441, + "grad_norm": 0.200766459107399, + "learning_rate": 3.881719416224303e-05, + "loss": 1.7773, + "step": 19049 + }, + { + "epoch": 5.847145488029466, + "grad_norm": 0.22770950198173523, + "learning_rate": 3.8812349571281834e-05, + "loss": 1.7156, + "step": 19050 + }, + { + "epoch": 5.847452424800491, + "grad_norm": 0.19483895599842072, + "learning_rate": 3.880750509088357e-05, + "loss": 1.7304, + "step": 19051 + }, + { + "epoch": 5.847759361571516, + "grad_norm": 0.1988774836063385, + "learning_rate": 3.8802660721096086e-05, + "loss": 1.7428, + "step": 19052 + }, + { + "epoch": 5.848066298342541, + "grad_norm": 0.19881510734558105, + "learning_rate": 3.879781646196727e-05, + "loss": 1.7268, + "step": 19053 + }, + { + "epoch": 5.848373235113566, + "grad_norm": 0.21257543563842773, + "learning_rate": 3.8792972313545e-05, + "loss": 1.7532, + "step": 19054 + }, + { + "epoch": 5.848680171884592, + "grad_norm": 0.21000613272190094, + "learning_rate": 3.878812827587716e-05, + "loss": 1.7782, + "step": 19055 + }, + { + "epoch": 5.848987108655617, + "grad_norm": 0.2136746346950531, + "learning_rate": 3.878328434901159e-05, + "loss": 1.6875, + "step": 19056 + }, + { + "epoch": 5.849294045426642, + "grad_norm": 0.20291505753993988, + "learning_rate": 3.8778440532996204e-05, + "loss": 1.74, + "step": 19057 + }, + { + "epoch": 5.849600982197668, + "grad_norm": 0.22568103671073914, + "learning_rate": 3.877359682787883e-05, + "loss": 1.7074, + "step": 19058 + }, + { + "epoch": 5.849907918968692, + "grad_norm": 0.24398963153362274, + "learning_rate": 3.876875323370734e-05, + 
"loss": 1.6825, + "step": 19059 + }, + { + "epoch": 5.850214855739718, + "grad_norm": 0.19684453308582306, + "learning_rate": 3.876390975052964e-05, + "loss": 1.7143, + "step": 19060 + }, + { + "epoch": 5.850521792510743, + "grad_norm": 0.2786783277988434, + "learning_rate": 3.8759066378393544e-05, + "loss": 1.8339, + "step": 19061 + }, + { + "epoch": 5.850828729281768, + "grad_norm": 0.1977633833885193, + "learning_rate": 3.875422311734697e-05, + "loss": 1.742, + "step": 19062 + }, + { + "epoch": 5.851135666052793, + "grad_norm": 0.260643869638443, + "learning_rate": 3.874937996743772e-05, + "loss": 1.7728, + "step": 19063 + }, + { + "epoch": 5.851442602823818, + "grad_norm": 0.20998433232307434, + "learning_rate": 3.874453692871372e-05, + "loss": 1.768, + "step": 19064 + }, + { + "epoch": 5.851749539594843, + "grad_norm": 0.2603224217891693, + "learning_rate": 3.873969400122278e-05, + "loss": 1.8015, + "step": 19065 + }, + { + "epoch": 5.852056476365869, + "grad_norm": 0.24428118765354156, + "learning_rate": 3.87348511850128e-05, + "loss": 1.8133, + "step": 19066 + }, + { + "epoch": 5.852363413136894, + "grad_norm": 0.19380085170269012, + "learning_rate": 3.873000848013161e-05, + "loss": 1.7331, + "step": 19067 + }, + { + "epoch": 5.852670349907919, + "grad_norm": 0.20088011026382446, + "learning_rate": 3.87251658866271e-05, + "loss": 1.7501, + "step": 19068 + }, + { + "epoch": 5.852977286678944, + "grad_norm": 0.21920672059059143, + "learning_rate": 3.8720323404547095e-05, + "loss": 1.6848, + "step": 19069 + }, + { + "epoch": 5.853284223449969, + "grad_norm": 0.21692565083503723, + "learning_rate": 3.871548103393947e-05, + "loss": 1.7132, + "step": 19070 + }, + { + "epoch": 5.8535911602209945, + "grad_norm": 0.19463133811950684, + "learning_rate": 3.871063877485207e-05, + "loss": 1.7263, + "step": 19071 + }, + { + "epoch": 5.85389809699202, + "grad_norm": 0.21563300490379333, + "learning_rate": 3.870579662733277e-05, + "loss": 1.7271, + "step": 19072 + }, + { + 
"epoch": 5.854205033763045, + "grad_norm": 0.19901902973651886, + "learning_rate": 3.870095459142939e-05, + "loss": 1.7153, + "step": 19073 + }, + { + "epoch": 5.85451197053407, + "grad_norm": 0.2053879052400589, + "learning_rate": 3.869611266718982e-05, + "loss": 1.7769, + "step": 19074 + }, + { + "epoch": 5.854818907305095, + "grad_norm": 0.18877504765987396, + "learning_rate": 3.869127085466188e-05, + "loss": 1.7427, + "step": 19075 + }, + { + "epoch": 5.85512584407612, + "grad_norm": 0.2000892460346222, + "learning_rate": 3.8686429153893414e-05, + "loss": 1.7245, + "step": 19076 + }, + { + "epoch": 5.855432780847146, + "grad_norm": 0.23791030049324036, + "learning_rate": 3.868158756493231e-05, + "loss": 1.7128, + "step": 19077 + }, + { + "epoch": 5.855739717618171, + "grad_norm": 0.20807631313800812, + "learning_rate": 3.8676746087826374e-05, + "loss": 1.7235, + "step": 19078 + }, + { + "epoch": 5.856046654389196, + "grad_norm": 0.2603290379047394, + "learning_rate": 3.867190472262349e-05, + "loss": 1.7272, + "step": 19079 + }, + { + "epoch": 5.856353591160221, + "grad_norm": 0.25234153866767883, + "learning_rate": 3.8667063469371456e-05, + "loss": 1.7818, + "step": 19080 + }, + { + "epoch": 5.856660527931246, + "grad_norm": 0.20621159672737122, + "learning_rate": 3.866222232811816e-05, + "loss": 1.7318, + "step": 19081 + }, + { + "epoch": 5.856967464702271, + "grad_norm": 0.19565562903881073, + "learning_rate": 3.865738129891141e-05, + "loss": 1.6364, + "step": 19082 + }, + { + "epoch": 5.857274401473297, + "grad_norm": 0.2090953141450882, + "learning_rate": 3.86525403817991e-05, + "loss": 1.7763, + "step": 19083 + }, + { + "epoch": 5.857581338244322, + "grad_norm": 0.21286322176456451, + "learning_rate": 3.864769957682901e-05, + "loss": 1.7652, + "step": 19084 + }, + { + "epoch": 5.8578882750153465, + "grad_norm": 0.20606130361557007, + "learning_rate": 3.864285888404902e-05, + "loss": 1.7267, + "step": 19085 + }, + { + "epoch": 5.858195211786372, + 
"grad_norm": 0.18837152421474457, + "learning_rate": 3.863801830350694e-05, + "loss": 1.7013, + "step": 19086 + }, + { + "epoch": 5.858502148557397, + "grad_norm": 0.19374001026153564, + "learning_rate": 3.8633177835250636e-05, + "loss": 1.7462, + "step": 19087 + }, + { + "epoch": 5.8588090853284225, + "grad_norm": 0.19090552628040314, + "learning_rate": 3.8628337479327914e-05, + "loss": 1.7321, + "step": 19088 + }, + { + "epoch": 5.859116022099448, + "grad_norm": 0.19487829506397247, + "learning_rate": 3.8623497235786656e-05, + "loss": 1.7323, + "step": 19089 + }, + { + "epoch": 5.859422958870473, + "grad_norm": 0.23836077749729156, + "learning_rate": 3.861865710467464e-05, + "loss": 1.7277, + "step": 19090 + }, + { + "epoch": 5.859729895641498, + "grad_norm": 0.22283829748630524, + "learning_rate": 3.861381708603974e-05, + "loss": 1.7521, + "step": 19091 + }, + { + "epoch": 5.860036832412523, + "grad_norm": 0.2094828337430954, + "learning_rate": 3.8608977179929774e-05, + "loss": 1.763, + "step": 19092 + }, + { + "epoch": 5.860343769183548, + "grad_norm": 0.30857667326927185, + "learning_rate": 3.860413738639256e-05, + "loss": 1.7112, + "step": 19093 + }, + { + "epoch": 5.860650705954574, + "grad_norm": 0.22634989023208618, + "learning_rate": 3.8599297705475954e-05, + "loss": 1.7076, + "step": 19094 + }, + { + "epoch": 5.860957642725598, + "grad_norm": 0.20488132536411285, + "learning_rate": 3.8594458137227757e-05, + "loss": 1.6821, + "step": 19095 + }, + { + "epoch": 5.861264579496623, + "grad_norm": 0.22760719060897827, + "learning_rate": 3.8589618681695826e-05, + "loss": 1.6981, + "step": 19096 + }, + { + "epoch": 5.861571516267649, + "grad_norm": 0.21168997883796692, + "learning_rate": 3.858477933892795e-05, + "loss": 1.7396, + "step": 19097 + }, + { + "epoch": 5.861878453038674, + "grad_norm": 0.24725143611431122, + "learning_rate": 3.8579940108971984e-05, + "loss": 1.791, + "step": 19098 + }, + { + "epoch": 5.862185389809699, + "grad_norm": 
0.2245369702577591, + "learning_rate": 3.857510099187573e-05, + "loss": 1.7643, + "step": 19099 + }, + { + "epoch": 5.862492326580725, + "grad_norm": 0.20065639913082123, + "learning_rate": 3.8570261987687056e-05, + "loss": 1.715, + "step": 19100 + }, + { + "epoch": 5.862799263351749, + "grad_norm": 0.1857454925775528, + "learning_rate": 3.856542309645373e-05, + "loss": 1.6833, + "step": 19101 + }, + { + "epoch": 5.8631062001227745, + "grad_norm": 0.18816804885864258, + "learning_rate": 3.856058431822361e-05, + "loss": 1.7049, + "step": 19102 + }, + { + "epoch": 5.8634131368938, + "grad_norm": 0.2861626148223877, + "learning_rate": 3.855574565304448e-05, + "loss": 1.8275, + "step": 19103 + }, + { + "epoch": 5.863720073664825, + "grad_norm": 0.19937226176261902, + "learning_rate": 3.8550907100964196e-05, + "loss": 1.7137, + "step": 19104 + }, + { + "epoch": 5.8640270104358505, + "grad_norm": 0.2040586620569229, + "learning_rate": 3.854606866203055e-05, + "loss": 1.725, + "step": 19105 + }, + { + "epoch": 5.864333947206875, + "grad_norm": 0.21082650125026703, + "learning_rate": 3.854123033629137e-05, + "loss": 1.7143, + "step": 19106 + }, + { + "epoch": 5.8646408839779, + "grad_norm": 0.1977517306804657, + "learning_rate": 3.853639212379446e-05, + "loss": 1.7482, + "step": 19107 + }, + { + "epoch": 5.864947820748926, + "grad_norm": 0.2272191196680069, + "learning_rate": 3.8531554024587655e-05, + "loss": 1.7678, + "step": 19108 + }, + { + "epoch": 5.865254757519951, + "grad_norm": 0.22765736281871796, + "learning_rate": 3.852671603871876e-05, + "loss": 1.7721, + "step": 19109 + }, + { + "epoch": 5.865561694290976, + "grad_norm": 0.20707197487354279, + "learning_rate": 3.852187816623556e-05, + "loss": 1.7509, + "step": 19110 + }, + { + "epoch": 5.865868631062002, + "grad_norm": 0.2699931561946869, + "learning_rate": 3.851704040718591e-05, + "loss": 1.6845, + "step": 19111 + }, + { + "epoch": 5.866175567833026, + "grad_norm": 0.24394196271896362, + "learning_rate": 
3.8512202761617575e-05, + "loss": 1.6895, + "step": 19112 + }, + { + "epoch": 5.866482504604051, + "grad_norm": 0.21921835839748383, + "learning_rate": 3.850736522957841e-05, + "loss": 1.7739, + "step": 19113 + }, + { + "epoch": 5.866789441375077, + "grad_norm": 0.2268306314945221, + "learning_rate": 3.8502527811116175e-05, + "loss": 1.7773, + "step": 19114 + }, + { + "epoch": 5.867096378146102, + "grad_norm": 0.2165728509426117, + "learning_rate": 3.84976905062787e-05, + "loss": 1.7567, + "step": 19115 + }, + { + "epoch": 5.867403314917127, + "grad_norm": 0.188106968998909, + "learning_rate": 3.8492853315113804e-05, + "loss": 1.7209, + "step": 19116 + }, + { + "epoch": 5.867710251688152, + "grad_norm": 0.20750530064105988, + "learning_rate": 3.848801623766927e-05, + "loss": 1.6999, + "step": 19117 + }, + { + "epoch": 5.868017188459177, + "grad_norm": 0.2475438266992569, + "learning_rate": 3.84831792739929e-05, + "loss": 1.7535, + "step": 19118 + }, + { + "epoch": 5.8683241252302025, + "grad_norm": 0.23291872441768646, + "learning_rate": 3.847834242413252e-05, + "loss": 1.7137, + "step": 19119 + }, + { + "epoch": 5.868631062001228, + "grad_norm": 0.18381048738956451, + "learning_rate": 3.847350568813589e-05, + "loss": 1.7657, + "step": 19120 + }, + { + "epoch": 5.868937998772253, + "grad_norm": 0.19330385327339172, + "learning_rate": 3.8468669066050845e-05, + "loss": 1.7109, + "step": 19121 + }, + { + "epoch": 5.8692449355432785, + "grad_norm": 0.22503000497817993, + "learning_rate": 3.846383255792517e-05, + "loss": 1.7668, + "step": 19122 + }, + { + "epoch": 5.869551872314303, + "grad_norm": 0.2147306352853775, + "learning_rate": 3.845899616380667e-05, + "loss": 1.74, + "step": 19123 + }, + { + "epoch": 5.869858809085328, + "grad_norm": 0.18493011593818665, + "learning_rate": 3.845415988374312e-05, + "loss": 1.7066, + "step": 19124 + }, + { + "epoch": 5.870165745856354, + "grad_norm": 0.28276753425598145, + "learning_rate": 3.844932371778235e-05, + "loss": 1.7925, 
+ "step": 19125 + }, + { + "epoch": 5.870472682627379, + "grad_norm": 0.23486676812171936, + "learning_rate": 3.844448766597212e-05, + "loss": 1.8216, + "step": 19126 + }, + { + "epoch": 5.870779619398404, + "grad_norm": 0.24370723962783813, + "learning_rate": 3.843965172836024e-05, + "loss": 1.709, + "step": 19127 + }, + { + "epoch": 5.871086556169429, + "grad_norm": 0.22540852427482605, + "learning_rate": 3.843481590499449e-05, + "loss": 1.7608, + "step": 19128 + }, + { + "epoch": 5.871393492940454, + "grad_norm": 0.20578467845916748, + "learning_rate": 3.8429980195922666e-05, + "loss": 1.7288, + "step": 19129 + }, + { + "epoch": 5.871700429711479, + "grad_norm": 0.265325129032135, + "learning_rate": 3.842514460119258e-05, + "loss": 1.7711, + "step": 19130 + }, + { + "epoch": 5.872007366482505, + "grad_norm": 0.20076121389865875, + "learning_rate": 3.842030912085197e-05, + "loss": 1.6764, + "step": 19131 + }, + { + "epoch": 5.87231430325353, + "grad_norm": 0.23941899836063385, + "learning_rate": 3.841547375494868e-05, + "loss": 1.8157, + "step": 19132 + }, + { + "epoch": 5.872621240024555, + "grad_norm": 0.23184041678905487, + "learning_rate": 3.841063850353044e-05, + "loss": 1.6948, + "step": 19133 + }, + { + "epoch": 5.87292817679558, + "grad_norm": 0.20299546420574188, + "learning_rate": 3.840580336664508e-05, + "loss": 1.7812, + "step": 19134 + }, + { + "epoch": 5.873235113566605, + "grad_norm": 0.24654673039913177, + "learning_rate": 3.840096834434036e-05, + "loss": 1.7999, + "step": 19135 + }, + { + "epoch": 5.8735420503376305, + "grad_norm": 0.21144285798072815, + "learning_rate": 3.8396133436664085e-05, + "loss": 1.7033, + "step": 19136 + }, + { + "epoch": 5.873848987108656, + "grad_norm": 0.22186708450317383, + "learning_rate": 3.8391298643663997e-05, + "loss": 1.7292, + "step": 19137 + }, + { + "epoch": 5.87415592387968, + "grad_norm": 0.21017275750637054, + "learning_rate": 3.838646396538793e-05, + "loss": 1.6989, + "step": 19138 + }, + { + "epoch": 
5.874462860650706, + "grad_norm": 0.19430704414844513, + "learning_rate": 3.83816294018836e-05, + "loss": 1.7446, + "step": 19139 + }, + { + "epoch": 5.874769797421731, + "grad_norm": 0.25048547983169556, + "learning_rate": 3.8376794953198836e-05, + "loss": 1.7358, + "step": 19140 + }, + { + "epoch": 5.875076734192756, + "grad_norm": 0.21869583427906036, + "learning_rate": 3.8371960619381406e-05, + "loss": 1.7017, + "step": 19141 + }, + { + "epoch": 5.875383670963782, + "grad_norm": 0.2053002119064331, + "learning_rate": 3.836712640047905e-05, + "loss": 1.7077, + "step": 19142 + }, + { + "epoch": 5.875690607734807, + "grad_norm": 0.2222425490617752, + "learning_rate": 3.83622922965396e-05, + "loss": 1.7259, + "step": 19143 + }, + { + "epoch": 5.8759975445058314, + "grad_norm": 0.20682495832443237, + "learning_rate": 3.8357458307610774e-05, + "loss": 1.7597, + "step": 19144 + }, + { + "epoch": 5.876304481276857, + "grad_norm": 0.2001802772283554, + "learning_rate": 3.835262443374038e-05, + "loss": 1.7546, + "step": 19145 + }, + { + "epoch": 5.876611418047882, + "grad_norm": 0.20499882102012634, + "learning_rate": 3.8347790674976166e-05, + "loss": 1.6741, + "step": 19146 + }, + { + "epoch": 5.8769183548189075, + "grad_norm": 0.17830348014831543, + "learning_rate": 3.834295703136593e-05, + "loss": 1.7067, + "step": 19147 + }, + { + "epoch": 5.877225291589933, + "grad_norm": 0.25055429339408875, + "learning_rate": 3.833812350295741e-05, + "loss": 1.753, + "step": 19148 + }, + { + "epoch": 5.877532228360957, + "grad_norm": 0.19037213921546936, + "learning_rate": 3.8333290089798415e-05, + "loss": 1.7336, + "step": 19149 + }, + { + "epoch": 5.877839165131983, + "grad_norm": 0.18041233718395233, + "learning_rate": 3.8328456791936656e-05, + "loss": 1.7172, + "step": 19150 + }, + { + "epoch": 5.878146101903008, + "grad_norm": 0.21531802415847778, + "learning_rate": 3.832362360941994e-05, + "loss": 1.7328, + "step": 19151 + }, + { + "epoch": 5.878453038674033, + "grad_norm": 
0.23101283609867096, + "learning_rate": 3.831879054229601e-05, + "loss": 1.7548, + "step": 19152 + }, + { + "epoch": 5.878759975445059, + "grad_norm": 0.19029635190963745, + "learning_rate": 3.831395759061266e-05, + "loss": 1.6852, + "step": 19153 + }, + { + "epoch": 5.879066912216084, + "grad_norm": 0.20305602252483368, + "learning_rate": 3.830912475441761e-05, + "loss": 1.6982, + "step": 19154 + }, + { + "epoch": 5.879373848987108, + "grad_norm": 0.19752593338489532, + "learning_rate": 3.830429203375866e-05, + "loss": 1.7726, + "step": 19155 + }, + { + "epoch": 5.879680785758134, + "grad_norm": 0.2109406590461731, + "learning_rate": 3.8299459428683526e-05, + "loss": 1.7629, + "step": 19156 + }, + { + "epoch": 5.879987722529159, + "grad_norm": 0.19448740780353546, + "learning_rate": 3.829462693924001e-05, + "loss": 1.6981, + "step": 19157 + }, + { + "epoch": 5.880294659300184, + "grad_norm": 0.19344154000282288, + "learning_rate": 3.828979456547586e-05, + "loss": 1.6822, + "step": 19158 + }, + { + "epoch": 5.88060159607121, + "grad_norm": 0.24466145038604736, + "learning_rate": 3.82849623074388e-05, + "loss": 1.7575, + "step": 19159 + }, + { + "epoch": 5.880908532842234, + "grad_norm": 0.20174476504325867, + "learning_rate": 3.828013016517663e-05, + "loss": 1.7267, + "step": 19160 + }, + { + "epoch": 5.8812154696132595, + "grad_norm": 0.23560820519924164, + "learning_rate": 3.827529813873706e-05, + "loss": 1.7125, + "step": 19161 + }, + { + "epoch": 5.881522406384285, + "grad_norm": 0.18118280172348022, + "learning_rate": 3.827046622816789e-05, + "loss": 1.7436, + "step": 19162 + }, + { + "epoch": 5.88182934315531, + "grad_norm": 0.27250152826309204, + "learning_rate": 3.8265634433516824e-05, + "loss": 1.7249, + "step": 19163 + }, + { + "epoch": 5.8821362799263355, + "grad_norm": 0.23510734736919403, + "learning_rate": 3.826080275483166e-05, + "loss": 1.7502, + "step": 19164 + }, + { + "epoch": 5.882443216697361, + "grad_norm": 0.22708909213542938, + 
"learning_rate": 3.82559711921601e-05, + "loss": 1.7478, + "step": 19165 + }, + { + "epoch": 5.882750153468385, + "grad_norm": 0.292584627866745, + "learning_rate": 3.825113974554995e-05, + "loss": 1.6757, + "step": 19166 + }, + { + "epoch": 5.883057090239411, + "grad_norm": 0.22186334431171417, + "learning_rate": 3.8246308415048884e-05, + "loss": 1.7061, + "step": 19167 + }, + { + "epoch": 5.883364027010436, + "grad_norm": 0.23995520174503326, + "learning_rate": 3.8241477200704714e-05, + "loss": 1.6962, + "step": 19168 + }, + { + "epoch": 5.883670963781461, + "grad_norm": 0.25545260310173035, + "learning_rate": 3.823664610256513e-05, + "loss": 1.7582, + "step": 19169 + }, + { + "epoch": 5.883977900552486, + "grad_norm": 0.2209167629480362, + "learning_rate": 3.823181512067794e-05, + "loss": 1.7212, + "step": 19170 + }, + { + "epoch": 5.884284837323511, + "grad_norm": 0.24626508355140686, + "learning_rate": 3.8226984255090824e-05, + "loss": 1.7356, + "step": 19171 + }, + { + "epoch": 5.884591774094536, + "grad_norm": 0.22982320189476013, + "learning_rate": 3.822215350585157e-05, + "loss": 1.7516, + "step": 19172 + }, + { + "epoch": 5.884898710865562, + "grad_norm": 0.19458627700805664, + "learning_rate": 3.8217322873007874e-05, + "loss": 1.7097, + "step": 19173 + }, + { + "epoch": 5.885205647636587, + "grad_norm": 0.2030913233757019, + "learning_rate": 3.8212492356607524e-05, + "loss": 1.7273, + "step": 19174 + }, + { + "epoch": 5.885512584407612, + "grad_norm": 0.20174767076969147, + "learning_rate": 3.820766195669823e-05, + "loss": 1.7167, + "step": 19175 + }, + { + "epoch": 5.885819521178637, + "grad_norm": 0.22572553157806396, + "learning_rate": 3.820283167332772e-05, + "loss": 1.8034, + "step": 19176 + }, + { + "epoch": 5.886126457949662, + "grad_norm": 0.24423041939735413, + "learning_rate": 3.819800150654376e-05, + "loss": 1.7188, + "step": 19177 + }, + { + "epoch": 5.8864333947206875, + "grad_norm": 0.20805509388446808, + "learning_rate": 
3.819317145639404e-05, + "loss": 1.7252, + "step": 19178 + }, + { + "epoch": 5.886740331491713, + "grad_norm": 0.2731400728225708, + "learning_rate": 3.8188341522926334e-05, + "loss": 1.7778, + "step": 19179 + }, + { + "epoch": 5.887047268262738, + "grad_norm": 0.2604491412639618, + "learning_rate": 3.818351170618835e-05, + "loss": 1.7524, + "step": 19180 + }, + { + "epoch": 5.887354205033763, + "grad_norm": 0.20043112337589264, + "learning_rate": 3.817868200622785e-05, + "loss": 1.7176, + "step": 19181 + }, + { + "epoch": 5.887661141804788, + "grad_norm": 0.2224988341331482, + "learning_rate": 3.817385242309253e-05, + "loss": 1.7267, + "step": 19182 + }, + { + "epoch": 5.887968078575813, + "grad_norm": 0.24603894352912903, + "learning_rate": 3.8169022956830135e-05, + "loss": 1.716, + "step": 19183 + }, + { + "epoch": 5.888275015346839, + "grad_norm": 0.19959969818592072, + "learning_rate": 3.816419360748839e-05, + "loss": 1.7461, + "step": 19184 + }, + { + "epoch": 5.888581952117864, + "grad_norm": 0.21907947957515717, + "learning_rate": 3.815936437511501e-05, + "loss": 1.6982, + "step": 19185 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.1920289248228073, + "learning_rate": 3.8154535259757735e-05, + "loss": 1.7213, + "step": 19186 + }, + { + "epoch": 5.889195825659914, + "grad_norm": 0.21930737793445587, + "learning_rate": 3.81497062614643e-05, + "loss": 1.7389, + "step": 19187 + }, + { + "epoch": 5.889502762430939, + "grad_norm": 0.1972137838602066, + "learning_rate": 3.814487738028239e-05, + "loss": 1.7317, + "step": 19188 + }, + { + "epoch": 5.889809699201964, + "grad_norm": 0.20000529289245605, + "learning_rate": 3.8140048616259785e-05, + "loss": 1.7148, + "step": 19189 + }, + { + "epoch": 5.89011663597299, + "grad_norm": 0.18828663229942322, + "learning_rate": 3.8135219969444135e-05, + "loss": 1.725, + "step": 19190 + }, + { + "epoch": 5.890423572744015, + "grad_norm": 0.2237224131822586, + "learning_rate": 3.8130391439883216e-05, + "loss": 1.7252, 
+ "step": 19191 + }, + { + "epoch": 5.8907305095150395, + "grad_norm": 0.19954712688922882, + "learning_rate": 3.812556302762473e-05, + "loss": 1.7071, + "step": 19192 + }, + { + "epoch": 5.891037446286065, + "grad_norm": 0.23509685695171356, + "learning_rate": 3.812073473271637e-05, + "loss": 1.7603, + "step": 19193 + }, + { + "epoch": 5.89134438305709, + "grad_norm": 0.28477707505226135, + "learning_rate": 3.81159065552059e-05, + "loss": 1.8193, + "step": 19194 + }, + { + "epoch": 5.8916513198281155, + "grad_norm": 0.1936045140028, + "learning_rate": 3.811107849514098e-05, + "loss": 1.7438, + "step": 19195 + }, + { + "epoch": 5.891958256599141, + "grad_norm": 0.288253515958786, + "learning_rate": 3.810625055256936e-05, + "loss": 1.8042, + "step": 19196 + }, + { + "epoch": 5.892265193370166, + "grad_norm": 0.19256485998630524, + "learning_rate": 3.810142272753873e-05, + "loss": 1.6997, + "step": 19197 + }, + { + "epoch": 5.892572130141191, + "grad_norm": 0.2823546826839447, + "learning_rate": 3.809659502009684e-05, + "loss": 1.7133, + "step": 19198 + }, + { + "epoch": 5.892879066912216, + "grad_norm": 0.25116851925849915, + "learning_rate": 3.809176743029136e-05, + "loss": 1.7402, + "step": 19199 + }, + { + "epoch": 5.893186003683241, + "grad_norm": 0.19840675592422485, + "learning_rate": 3.808693995817003e-05, + "loss": 1.7009, + "step": 19200 + }, + { + "epoch": 5.893492940454267, + "grad_norm": 0.2703700363636017, + "learning_rate": 3.808211260378051e-05, + "loss": 1.741, + "step": 19201 + }, + { + "epoch": 5.893799877225292, + "grad_norm": 0.25683698058128357, + "learning_rate": 3.807728536717056e-05, + "loss": 1.7431, + "step": 19202 + }, + { + "epoch": 5.894106813996316, + "grad_norm": 0.19033822417259216, + "learning_rate": 3.8072458248387855e-05, + "loss": 1.7423, + "step": 19203 + }, + { + "epoch": 5.894413750767342, + "grad_norm": 0.2771024703979492, + "learning_rate": 3.806763124748012e-05, + "loss": 1.7376, + "step": 19204 + }, + { + "epoch": 
5.894720687538367, + "grad_norm": 0.30265524983406067, + "learning_rate": 3.806280436449504e-05, + "loss": 1.7124, + "step": 19205 + }, + { + "epoch": 5.895027624309392, + "grad_norm": 0.21838776767253876, + "learning_rate": 3.805797759948033e-05, + "loss": 1.7319, + "step": 19206 + }, + { + "epoch": 5.895334561080418, + "grad_norm": 0.22244395315647125, + "learning_rate": 3.805315095248368e-05, + "loss": 1.7034, + "step": 19207 + }, + { + "epoch": 5.895641497851443, + "grad_norm": 0.20621941983699799, + "learning_rate": 3.8048324423552786e-05, + "loss": 1.7231, + "step": 19208 + }, + { + "epoch": 5.8959484346224675, + "grad_norm": 0.23735111951828003, + "learning_rate": 3.804349801273538e-05, + "loss": 1.7484, + "step": 19209 + }, + { + "epoch": 5.896255371393493, + "grad_norm": 0.33221447467803955, + "learning_rate": 3.803867172007911e-05, + "loss": 1.7782, + "step": 19210 + }, + { + "epoch": 5.896562308164518, + "grad_norm": 0.20859810709953308, + "learning_rate": 3.803384554563172e-05, + "loss": 1.688, + "step": 19211 + }, + { + "epoch": 5.8968692449355435, + "grad_norm": 0.25731268525123596, + "learning_rate": 3.8029019489440855e-05, + "loss": 1.7463, + "step": 19212 + }, + { + "epoch": 5.897176181706568, + "grad_norm": 0.26556700468063354, + "learning_rate": 3.802419355155425e-05, + "loss": 1.7251, + "step": 19213 + }, + { + "epoch": 5.897483118477593, + "grad_norm": 0.20397205650806427, + "learning_rate": 3.801936773201957e-05, + "loss": 1.6785, + "step": 19214 + }, + { + "epoch": 5.897790055248619, + "grad_norm": 0.2198234349489212, + "learning_rate": 3.8014542030884544e-05, + "loss": 1.7608, + "step": 19215 + }, + { + "epoch": 5.898096992019644, + "grad_norm": 0.22619546949863434, + "learning_rate": 3.800971644819681e-05, + "loss": 1.8034, + "step": 19216 + }, + { + "epoch": 5.898403928790669, + "grad_norm": 0.22074444591999054, + "learning_rate": 3.800489098400412e-05, + "loss": 1.777, + "step": 19217 + }, + { + "epoch": 5.898710865561695, + "grad_norm": 
0.2555946707725525, + "learning_rate": 3.80000656383541e-05, + "loss": 1.7578, + "step": 19218 + }, + { + "epoch": 5.899017802332719, + "grad_norm": 0.2130863517522812, + "learning_rate": 3.7995240411294474e-05, + "loss": 1.7312, + "step": 19219 + }, + { + "epoch": 5.899324739103744, + "grad_norm": 0.2574099898338318, + "learning_rate": 3.799041530287291e-05, + "loss": 1.7509, + "step": 19220 + }, + { + "epoch": 5.89963167587477, + "grad_norm": 0.2556573152542114, + "learning_rate": 3.798559031313712e-05, + "loss": 1.7624, + "step": 19221 + }, + { + "epoch": 5.899938612645795, + "grad_norm": 0.19909335672855377, + "learning_rate": 3.798076544213475e-05, + "loss": 1.7466, + "step": 19222 + }, + { + "epoch": 5.9002455494168204, + "grad_norm": 0.19832594692707062, + "learning_rate": 3.7975940689913526e-05, + "loss": 1.6896, + "step": 19223 + }, + { + "epoch": 5.900552486187845, + "grad_norm": 0.18473665416240692, + "learning_rate": 3.7971116056521076e-05, + "loss": 1.7167, + "step": 19224 + }, + { + "epoch": 5.90085942295887, + "grad_norm": 0.21106892824172974, + "learning_rate": 3.796629154200512e-05, + "loss": 1.8071, + "step": 19225 + }, + { + "epoch": 5.901166359729896, + "grad_norm": 0.20903728902339935, + "learning_rate": 3.796146714641333e-05, + "loss": 1.6946, + "step": 19226 + }, + { + "epoch": 5.901473296500921, + "grad_norm": 0.21518728137016296, + "learning_rate": 3.795664286979336e-05, + "loss": 1.6899, + "step": 19227 + }, + { + "epoch": 5.901780233271946, + "grad_norm": 0.1948135644197464, + "learning_rate": 3.7951818712192926e-05, + "loss": 1.7568, + "step": 19228 + }, + { + "epoch": 5.902087170042972, + "grad_norm": 0.2222091257572174, + "learning_rate": 3.7946994673659667e-05, + "loss": 1.8118, + "step": 19229 + }, + { + "epoch": 5.902394106813996, + "grad_norm": 0.2173513025045395, + "learning_rate": 3.794217075424127e-05, + "loss": 1.7194, + "step": 19230 + }, + { + "epoch": 5.902701043585021, + "grad_norm": 0.2026323676109314, + "learning_rate": 
3.79373469539854e-05, + "loss": 1.6944, + "step": 19231 + }, + { + "epoch": 5.903007980356047, + "grad_norm": 0.22178098559379578, + "learning_rate": 3.7932523272939765e-05, + "loss": 1.7328, + "step": 19232 + }, + { + "epoch": 5.903314917127072, + "grad_norm": 0.22846719622612, + "learning_rate": 3.792769971115198e-05, + "loss": 1.8065, + "step": 19233 + }, + { + "epoch": 5.903621853898097, + "grad_norm": 0.2086053490638733, + "learning_rate": 3.792287626866977e-05, + "loss": 1.7511, + "step": 19234 + }, + { + "epoch": 5.903928790669122, + "grad_norm": 0.22444705665111542, + "learning_rate": 3.791805294554075e-05, + "loss": 1.742, + "step": 19235 + }, + { + "epoch": 5.904235727440147, + "grad_norm": 0.24630236625671387, + "learning_rate": 3.7913229741812625e-05, + "loss": 1.7531, + "step": 19236 + }, + { + "epoch": 5.9045426642111725, + "grad_norm": 0.2618274986743927, + "learning_rate": 3.7908406657533036e-05, + "loss": 1.7387, + "step": 19237 + }, + { + "epoch": 5.904849600982198, + "grad_norm": 0.25871509313583374, + "learning_rate": 3.790358369274968e-05, + "loss": 1.7822, + "step": 19238 + }, + { + "epoch": 5.905156537753223, + "grad_norm": 0.22675062716007233, + "learning_rate": 3.789876084751018e-05, + "loss": 1.7788, + "step": 19239 + }, + { + "epoch": 5.9054634745242485, + "grad_norm": 0.26623663306236267, + "learning_rate": 3.789393812186224e-05, + "loss": 1.7092, + "step": 19240 + }, + { + "epoch": 5.905770411295273, + "grad_norm": 0.19448868930339813, + "learning_rate": 3.788911551585348e-05, + "loss": 1.7164, + "step": 19241 + }, + { + "epoch": 5.906077348066298, + "grad_norm": 0.22451938688755035, + "learning_rate": 3.788429302953158e-05, + "loss": 1.667, + "step": 19242 + }, + { + "epoch": 5.906384284837324, + "grad_norm": 0.2323608547449112, + "learning_rate": 3.7879470662944214e-05, + "loss": 1.7992, + "step": 19243 + }, + { + "epoch": 5.906691221608349, + "grad_norm": 0.2508258819580078, + "learning_rate": 3.7874648416139e-05, + "loss": 1.7681, + 
"step": 19244 + }, + { + "epoch": 5.906998158379373, + "grad_norm": 0.22333547472953796, + "learning_rate": 3.786982628916364e-05, + "loss": 1.7006, + "step": 19245 + }, + { + "epoch": 5.907305095150399, + "grad_norm": 0.19816327095031738, + "learning_rate": 3.786500428206575e-05, + "loss": 1.7458, + "step": 19246 + }, + { + "epoch": 5.907612031921424, + "grad_norm": 0.2047683447599411, + "learning_rate": 3.7860182394893006e-05, + "loss": 1.7385, + "step": 19247 + }, + { + "epoch": 5.907918968692449, + "grad_norm": 0.2124621719121933, + "learning_rate": 3.785536062769304e-05, + "loss": 1.7373, + "step": 19248 + }, + { + "epoch": 5.908225905463475, + "grad_norm": 0.200453981757164, + "learning_rate": 3.785053898051355e-05, + "loss": 1.7754, + "step": 19249 + }, + { + "epoch": 5.9085328422345, + "grad_norm": 0.19543224573135376, + "learning_rate": 3.784571745340212e-05, + "loss": 1.724, + "step": 19250 + }, + { + "epoch": 5.9088397790055245, + "grad_norm": 0.17079658806324005, + "learning_rate": 3.784089604640647e-05, + "loss": 1.6843, + "step": 19251 + }, + { + "epoch": 5.90914671577655, + "grad_norm": 0.22792236506938934, + "learning_rate": 3.783607475957418e-05, + "loss": 1.7442, + "step": 19252 + }, + { + "epoch": 5.909453652547575, + "grad_norm": 0.20699752867221832, + "learning_rate": 3.783125359295294e-05, + "loss": 1.7868, + "step": 19253 + }, + { + "epoch": 5.9097605893186005, + "grad_norm": 0.2156144678592682, + "learning_rate": 3.782643254659038e-05, + "loss": 1.7443, + "step": 19254 + }, + { + "epoch": 5.910067526089626, + "grad_norm": 0.2021300345659256, + "learning_rate": 3.782161162053417e-05, + "loss": 1.7749, + "step": 19255 + }, + { + "epoch": 5.91037446286065, + "grad_norm": 0.17613129317760468, + "learning_rate": 3.7816790814831905e-05, + "loss": 1.7001, + "step": 19256 + }, + { + "epoch": 5.910681399631676, + "grad_norm": 0.18911564350128174, + "learning_rate": 3.781197012953128e-05, + "loss": 1.6817, + "step": 19257 + }, + { + "epoch": 
5.910988336402701, + "grad_norm": 0.18920689821243286, + "learning_rate": 3.780714956467989e-05, + "loss": 1.7554, + "step": 19258 + }, + { + "epoch": 5.911295273173726, + "grad_norm": 0.22030571103096008, + "learning_rate": 3.7802329120325396e-05, + "loss": 1.7554, + "step": 19259 + }, + { + "epoch": 5.911602209944752, + "grad_norm": 0.21164962649345398, + "learning_rate": 3.779750879651545e-05, + "loss": 1.74, + "step": 19260 + }, + { + "epoch": 5.911909146715777, + "grad_norm": 0.2205103188753128, + "learning_rate": 3.779268859329766e-05, + "loss": 1.7424, + "step": 19261 + }, + { + "epoch": 5.912216083486801, + "grad_norm": 0.19262658059597015, + "learning_rate": 3.7787868510719685e-05, + "loss": 1.7157, + "step": 19262 + }, + { + "epoch": 5.912523020257827, + "grad_norm": 0.19583287835121155, + "learning_rate": 3.778304854882914e-05, + "loss": 1.7343, + "step": 19263 + }, + { + "epoch": 5.912829957028852, + "grad_norm": 0.18275529146194458, + "learning_rate": 3.777822870767368e-05, + "loss": 1.6938, + "step": 19264 + }, + { + "epoch": 5.913136893799877, + "grad_norm": 0.21268916130065918, + "learning_rate": 3.7773408987300914e-05, + "loss": 1.7546, + "step": 19265 + }, + { + "epoch": 5.913443830570903, + "grad_norm": 0.20878887176513672, + "learning_rate": 3.77685893877585e-05, + "loss": 1.8109, + "step": 19266 + }, + { + "epoch": 5.913750767341927, + "grad_norm": 0.2326175421476364, + "learning_rate": 3.776376990909404e-05, + "loss": 1.7248, + "step": 19267 + }, + { + "epoch": 5.9140577041129525, + "grad_norm": 0.28189611434936523, + "learning_rate": 3.7758950551355204e-05, + "loss": 1.7796, + "step": 19268 + }, + { + "epoch": 5.914364640883978, + "grad_norm": 0.1922682821750641, + "learning_rate": 3.775413131458957e-05, + "loss": 1.7096, + "step": 19269 + }, + { + "epoch": 5.914671577655003, + "grad_norm": 0.2839193642139435, + "learning_rate": 3.774931219884479e-05, + "loss": 1.7341, + "step": 19270 + }, + { + "epoch": 5.9149785144260285, + "grad_norm": 
0.2075256109237671, + "learning_rate": 3.7744493204168495e-05, + "loss": 1.7565, + "step": 19271 + }, + { + "epoch": 5.915285451197054, + "grad_norm": 0.2780497372150421, + "learning_rate": 3.7739674330608306e-05, + "loss": 1.7186, + "step": 19272 + }, + { + "epoch": 5.915592387968078, + "grad_norm": 0.26129212975502014, + "learning_rate": 3.773485557821182e-05, + "loss": 1.8468, + "step": 19273 + }, + { + "epoch": 5.915899324739104, + "grad_norm": 0.3299194276332855, + "learning_rate": 3.773003694702671e-05, + "loss": 1.7705, + "step": 19274 + }, + { + "epoch": 5.916206261510129, + "grad_norm": 0.3011106848716736, + "learning_rate": 3.772521843710054e-05, + "loss": 1.748, + "step": 19275 + }, + { + "epoch": 5.916513198281154, + "grad_norm": 0.21370603144168854, + "learning_rate": 3.7720400048480966e-05, + "loss": 1.7709, + "step": 19276 + }, + { + "epoch": 5.91682013505218, + "grad_norm": 0.29374879598617554, + "learning_rate": 3.771558178121561e-05, + "loss": 1.6948, + "step": 19277 + }, + { + "epoch": 5.917127071823204, + "grad_norm": 0.2545807659626007, + "learning_rate": 3.771076363535205e-05, + "loss": 1.7974, + "step": 19278 + }, + { + "epoch": 5.917434008594229, + "grad_norm": 0.24210263788700104, + "learning_rate": 3.7705945610937954e-05, + "loss": 1.7438, + "step": 19279 + }, + { + "epoch": 5.917740945365255, + "grad_norm": 0.26224827766418457, + "learning_rate": 3.770112770802088e-05, + "loss": 1.7294, + "step": 19280 + }, + { + "epoch": 5.91804788213628, + "grad_norm": 0.23358991742134094, + "learning_rate": 3.7696309926648486e-05, + "loss": 1.7973, + "step": 19281 + }, + { + "epoch": 5.918354818907305, + "grad_norm": 0.3466563820838928, + "learning_rate": 3.769149226686837e-05, + "loss": 1.784, + "step": 19282 + }, + { + "epoch": 5.918661755678331, + "grad_norm": 0.2416994869709015, + "learning_rate": 3.768667472872814e-05, + "loss": 1.6957, + "step": 19283 + }, + { + "epoch": 5.918968692449355, + "grad_norm": 0.2285085767507553, + "learning_rate": 
3.768185731227539e-05, + "loss": 1.71, + "step": 19284 + }, + { + "epoch": 5.9192756292203805, + "grad_norm": 0.2566430866718292, + "learning_rate": 3.7677040017557775e-05, + "loss": 1.792, + "step": 19285 + }, + { + "epoch": 5.919582565991406, + "grad_norm": 0.21566689014434814, + "learning_rate": 3.767222284462285e-05, + "loss": 1.8085, + "step": 19286 + }, + { + "epoch": 5.919889502762431, + "grad_norm": 0.24078889191150665, + "learning_rate": 3.7667405793518264e-05, + "loss": 1.7221, + "step": 19287 + }, + { + "epoch": 5.920196439533456, + "grad_norm": 0.22127531468868256, + "learning_rate": 3.7662588864291584e-05, + "loss": 1.7173, + "step": 19288 + }, + { + "epoch": 5.920503376304481, + "grad_norm": 0.18165946006774902, + "learning_rate": 3.765777205699045e-05, + "loss": 1.7518, + "step": 19289 + }, + { + "epoch": 5.920810313075506, + "grad_norm": 0.2569290101528168, + "learning_rate": 3.765295537166242e-05, + "loss": 1.7716, + "step": 19290 + }, + { + "epoch": 5.921117249846532, + "grad_norm": 0.19010202586650848, + "learning_rate": 3.764813880835515e-05, + "loss": 1.7146, + "step": 19291 + }, + { + "epoch": 5.921424186617557, + "grad_norm": 0.2882116436958313, + "learning_rate": 3.7643322367116195e-05, + "loss": 1.7677, + "step": 19292 + }, + { + "epoch": 5.921731123388582, + "grad_norm": 0.30711185932159424, + "learning_rate": 3.763850604799319e-05, + "loss": 1.7506, + "step": 19293 + }, + { + "epoch": 5.922038060159607, + "grad_norm": 0.19295164942741394, + "learning_rate": 3.76336898510337e-05, + "loss": 1.715, + "step": 19294 + }, + { + "epoch": 5.922344996930632, + "grad_norm": 0.24849168956279755, + "learning_rate": 3.762887377628533e-05, + "loss": 1.6807, + "step": 19295 + }, + { + "epoch": 5.922651933701657, + "grad_norm": 0.23573634028434753, + "learning_rate": 3.7624057823795696e-05, + "loss": 1.7363, + "step": 19296 + }, + { + "epoch": 5.922958870472683, + "grad_norm": 0.24384267628192902, + "learning_rate": 3.761924199361235e-05, + "loss": 
1.726, + "step": 19297 + }, + { + "epoch": 5.923265807243708, + "grad_norm": 0.2589210271835327, + "learning_rate": 3.761442628578294e-05, + "loss": 1.7771, + "step": 19298 + }, + { + "epoch": 5.9235727440147325, + "grad_norm": 0.23527951538562775, + "learning_rate": 3.760961070035501e-05, + "loss": 1.6561, + "step": 19299 + }, + { + "epoch": 5.923879680785758, + "grad_norm": 0.20286870002746582, + "learning_rate": 3.7604795237376175e-05, + "loss": 1.7464, + "step": 19300 + }, + { + "epoch": 5.924186617556783, + "grad_norm": 0.22705033421516418, + "learning_rate": 3.759997989689401e-05, + "loss": 1.7814, + "step": 19301 + }, + { + "epoch": 5.9244935543278086, + "grad_norm": 0.21780981123447418, + "learning_rate": 3.7595164678956135e-05, + "loss": 1.7601, + "step": 19302 + }, + { + "epoch": 5.924800491098834, + "grad_norm": 0.2030021697282791, + "learning_rate": 3.759034958361009e-05, + "loss": 1.7222, + "step": 19303 + }, + { + "epoch": 5.925107427869859, + "grad_norm": 0.22956500947475433, + "learning_rate": 3.758553461090351e-05, + "loss": 1.674, + "step": 19304 + }, + { + "epoch": 5.925414364640884, + "grad_norm": 0.2368287444114685, + "learning_rate": 3.758071976088392e-05, + "loss": 1.7483, + "step": 19305 + }, + { + "epoch": 5.925721301411909, + "grad_norm": 0.22852632403373718, + "learning_rate": 3.757590503359896e-05, + "loss": 1.7561, + "step": 19306 + }, + { + "epoch": 5.926028238182934, + "grad_norm": 0.21657361090183258, + "learning_rate": 3.757109042909617e-05, + "loss": 1.7814, + "step": 19307 + }, + { + "epoch": 5.92633517495396, + "grad_norm": 0.21996551752090454, + "learning_rate": 3.756627594742317e-05, + "loss": 1.732, + "step": 19308 + }, + { + "epoch": 5.926642111724985, + "grad_norm": 0.23319712281227112, + "learning_rate": 3.75614615886275e-05, + "loss": 1.6807, + "step": 19309 + }, + { + "epoch": 5.9269490484960095, + "grad_norm": 0.17926698923110962, + "learning_rate": 3.755664735275677e-05, + "loss": 1.6925, + "step": 19310 + }, + { + 
"epoch": 5.927255985267035, + "grad_norm": 0.18986931443214417, + "learning_rate": 3.755183323985855e-05, + "loss": 1.7002, + "step": 19311 + }, + { + "epoch": 5.92756292203806, + "grad_norm": 0.18753086030483246, + "learning_rate": 3.7547019249980385e-05, + "loss": 1.695, + "step": 19312 + }, + { + "epoch": 5.9278698588090855, + "grad_norm": 0.21354973316192627, + "learning_rate": 3.7542205383169904e-05, + "loss": 1.6629, + "step": 19313 + }, + { + "epoch": 5.928176795580111, + "grad_norm": 0.19713245332241058, + "learning_rate": 3.753739163947463e-05, + "loss": 1.707, + "step": 19314 + }, + { + "epoch": 5.928483732351136, + "grad_norm": 0.2122458517551422, + "learning_rate": 3.753257801894217e-05, + "loss": 1.7309, + "step": 19315 + }, + { + "epoch": 5.928790669122161, + "grad_norm": 0.20360666513442993, + "learning_rate": 3.7527764521620065e-05, + "loss": 1.6861, + "step": 19316 + }, + { + "epoch": 5.929097605893186, + "grad_norm": 0.2652932405471802, + "learning_rate": 3.752295114755592e-05, + "loss": 1.7662, + "step": 19317 + }, + { + "epoch": 5.929404542664211, + "grad_norm": 0.18292152881622314, + "learning_rate": 3.751813789679726e-05, + "loss": 1.6691, + "step": 19318 + }, + { + "epoch": 5.929711479435237, + "grad_norm": 0.25630465149879456, + "learning_rate": 3.75133247693917e-05, + "loss": 1.7647, + "step": 19319 + }, + { + "epoch": 5.930018416206261, + "grad_norm": 0.2463291883468628, + "learning_rate": 3.750851176538677e-05, + "loss": 1.7252, + "step": 19320 + }, + { + "epoch": 5.930325352977286, + "grad_norm": 0.19977931678295135, + "learning_rate": 3.750369888483007e-05, + "loss": 1.7694, + "step": 19321 + }, + { + "epoch": 5.930632289748312, + "grad_norm": 0.19523118436336517, + "learning_rate": 3.7498886127769116e-05, + "loss": 1.7095, + "step": 19322 + }, + { + "epoch": 5.930939226519337, + "grad_norm": 0.19273912906646729, + "learning_rate": 3.749407349425151e-05, + "loss": 1.7009, + "step": 19323 + }, + { + "epoch": 5.931246163290362, + 
"grad_norm": 0.2419402152299881, + "learning_rate": 3.748926098432479e-05, + "loss": 1.7167, + "step": 19324 + }, + { + "epoch": 5.931553100061388, + "grad_norm": 0.22429771721363068, + "learning_rate": 3.7484448598036534e-05, + "loss": 1.6957, + "step": 19325 + }, + { + "epoch": 5.931860036832412, + "grad_norm": 0.23211807012557983, + "learning_rate": 3.747963633543429e-05, + "loss": 1.767, + "step": 19326 + }, + { + "epoch": 5.9321669736034375, + "grad_norm": 0.23204533755779266, + "learning_rate": 3.7474824196565625e-05, + "loss": 1.7405, + "step": 19327 + }, + { + "epoch": 5.932473910374463, + "grad_norm": 0.24068887531757355, + "learning_rate": 3.747001218147809e-05, + "loss": 1.7539, + "step": 19328 + }, + { + "epoch": 5.932780847145488, + "grad_norm": 0.18140049278736115, + "learning_rate": 3.746520029021922e-05, + "loss": 1.6956, + "step": 19329 + }, + { + "epoch": 5.9330877839165135, + "grad_norm": 0.28421929478645325, + "learning_rate": 3.746038852283661e-05, + "loss": 1.8539, + "step": 19330 + }, + { + "epoch": 5.933394720687538, + "grad_norm": 0.21984805166721344, + "learning_rate": 3.745557687937777e-05, + "loss": 1.7469, + "step": 19331 + }, + { + "epoch": 5.933701657458563, + "grad_norm": 0.2500358819961548, + "learning_rate": 3.7450765359890294e-05, + "loss": 1.7184, + "step": 19332 + }, + { + "epoch": 5.934008594229589, + "grad_norm": 0.2608816623687744, + "learning_rate": 3.744595396442169e-05, + "loss": 1.6825, + "step": 19333 + }, + { + "epoch": 5.934315531000614, + "grad_norm": 0.20359274744987488, + "learning_rate": 3.7441142693019526e-05, + "loss": 1.7535, + "step": 19334 + }, + { + "epoch": 5.934622467771639, + "grad_norm": 0.24795760214328766, + "learning_rate": 3.743633154573135e-05, + "loss": 1.7829, + "step": 19335 + }, + { + "epoch": 5.934929404542665, + "grad_norm": 0.20762503147125244, + "learning_rate": 3.7431520522604736e-05, + "loss": 1.7657, + "step": 19336 + }, + { + "epoch": 5.935236341313689, + "grad_norm": 0.24349527060985565, 
+ "learning_rate": 3.7426709623687174e-05, + "loss": 1.7037, + "step": 19337 + }, + { + "epoch": 5.935543278084714, + "grad_norm": 0.2138780951499939, + "learning_rate": 3.742189884902626e-05, + "loss": 1.7302, + "step": 19338 + }, + { + "epoch": 5.93585021485574, + "grad_norm": 0.24776574969291687, + "learning_rate": 3.741708819866949e-05, + "loss": 1.7293, + "step": 19339 + }, + { + "epoch": 5.936157151626765, + "grad_norm": 0.297888845205307, + "learning_rate": 3.7412277672664444e-05, + "loss": 1.8341, + "step": 19340 + }, + { + "epoch": 5.93646408839779, + "grad_norm": 0.2811104953289032, + "learning_rate": 3.740746727105864e-05, + "loss": 1.7188, + "step": 19341 + }, + { + "epoch": 5.936771025168815, + "grad_norm": 0.37908127903938293, + "learning_rate": 3.740265699389964e-05, + "loss": 1.765, + "step": 19342 + }, + { + "epoch": 5.93707796193984, + "grad_norm": 0.24403691291809082, + "learning_rate": 3.739784684123495e-05, + "loss": 1.6897, + "step": 19343 + }, + { + "epoch": 5.9373848987108655, + "grad_norm": 0.2393181174993515, + "learning_rate": 3.7393036813112135e-05, + "loss": 1.6843, + "step": 19344 + }, + { + "epoch": 5.937691835481891, + "grad_norm": 0.2927580177783966, + "learning_rate": 3.738822690957872e-05, + "loss": 1.6946, + "step": 19345 + }, + { + "epoch": 5.937998772252916, + "grad_norm": 0.23423373699188232, + "learning_rate": 3.738341713068223e-05, + "loss": 1.7409, + "step": 19346 + }, + { + "epoch": 5.9383057090239415, + "grad_norm": 0.2544272840023041, + "learning_rate": 3.7378607476470216e-05, + "loss": 1.698, + "step": 19347 + }, + { + "epoch": 5.938612645794966, + "grad_norm": 0.2120404839515686, + "learning_rate": 3.737379794699019e-05, + "loss": 1.7412, + "step": 19348 + }, + { + "epoch": 5.938919582565991, + "grad_norm": 0.2076033353805542, + "learning_rate": 3.736898854228971e-05, + "loss": 1.752, + "step": 19349 + }, + { + "epoch": 5.939226519337017, + "grad_norm": 0.20122376084327698, + "learning_rate": 3.736417926241627e-05, + 
"loss": 1.6741, + "step": 19350 + }, + { + "epoch": 5.939533456108042, + "grad_norm": 0.1856858730316162, + "learning_rate": 3.735937010741742e-05, + "loss": 1.6959, + "step": 19351 + }, + { + "epoch": 5.939840392879067, + "grad_norm": 0.22192558646202087, + "learning_rate": 3.7354561077340684e-05, + "loss": 1.7597, + "step": 19352 + }, + { + "epoch": 5.940147329650092, + "grad_norm": 0.2653545141220093, + "learning_rate": 3.73497521722336e-05, + "loss": 1.7324, + "step": 19353 + }, + { + "epoch": 5.940454266421117, + "grad_norm": 0.1975676715373993, + "learning_rate": 3.734494339214366e-05, + "loss": 1.6852, + "step": 19354 + }, + { + "epoch": 5.940761203192142, + "grad_norm": 0.26949796080589294, + "learning_rate": 3.734013473711843e-05, + "loss": 1.7695, + "step": 19355 + }, + { + "epoch": 5.941068139963168, + "grad_norm": 0.2272176742553711, + "learning_rate": 3.733532620720539e-05, + "loss": 1.745, + "step": 19356 + }, + { + "epoch": 5.941375076734193, + "grad_norm": 0.25740066170692444, + "learning_rate": 3.733051780245208e-05, + "loss": 1.7701, + "step": 19357 + }, + { + "epoch": 5.941682013505218, + "grad_norm": 0.1910635381937027, + "learning_rate": 3.732570952290602e-05, + "loss": 1.7276, + "step": 19358 + }, + { + "epoch": 5.941988950276243, + "grad_norm": 0.24896447360515594, + "learning_rate": 3.732090136861474e-05, + "loss": 1.7717, + "step": 19359 + }, + { + "epoch": 5.942295887047268, + "grad_norm": 0.20696721971035004, + "learning_rate": 3.731609333962572e-05, + "loss": 1.7053, + "step": 19360 + }, + { + "epoch": 5.9426028238182935, + "grad_norm": 0.18822510540485382, + "learning_rate": 3.731128543598653e-05, + "loss": 1.6869, + "step": 19361 + }, + { + "epoch": 5.942909760589319, + "grad_norm": 0.20757299661636353, + "learning_rate": 3.730647765774464e-05, + "loss": 1.7214, + "step": 19362 + }, + { + "epoch": 5.943216697360343, + "grad_norm": 0.21238471567630768, + "learning_rate": 3.7301670004947574e-05, + "loss": 1.6953, + "step": 19363 + }, + { 
+ "epoch": 5.943523634131369, + "grad_norm": 0.19326119124889374, + "learning_rate": 3.729686247764286e-05, + "loss": 1.7224, + "step": 19364 + }, + { + "epoch": 5.943830570902394, + "grad_norm": 0.17631326615810394, + "learning_rate": 3.729205507587798e-05, + "loss": 1.6471, + "step": 19365 + }, + { + "epoch": 5.944137507673419, + "grad_norm": 0.1741493195295334, + "learning_rate": 3.728724779970048e-05, + "loss": 1.7169, + "step": 19366 + }, + { + "epoch": 5.944444444444445, + "grad_norm": 0.18203428387641907, + "learning_rate": 3.728244064915782e-05, + "loss": 1.7301, + "step": 19367 + }, + { + "epoch": 5.94475138121547, + "grad_norm": 0.2063162475824356, + "learning_rate": 3.727763362429756e-05, + "loss": 1.7274, + "step": 19368 + }, + { + "epoch": 5.945058317986494, + "grad_norm": 0.17239537835121155, + "learning_rate": 3.7272826725167164e-05, + "loss": 1.7194, + "step": 19369 + }, + { + "epoch": 5.94536525475752, + "grad_norm": 0.1910972148180008, + "learning_rate": 3.726801995181418e-05, + "loss": 1.7017, + "step": 19370 + }, + { + "epoch": 5.945672191528545, + "grad_norm": 0.18822111189365387, + "learning_rate": 3.726321330428606e-05, + "loss": 1.723, + "step": 19371 + }, + { + "epoch": 5.94597912829957, + "grad_norm": 0.19680333137512207, + "learning_rate": 3.725840678263035e-05, + "loss": 1.685, + "step": 19372 + }, + { + "epoch": 5.946286065070596, + "grad_norm": 0.19016215205192566, + "learning_rate": 3.725360038689451e-05, + "loss": 1.7148, + "step": 19373 + }, + { + "epoch": 5.94659300184162, + "grad_norm": 0.1992037147283554, + "learning_rate": 3.7248794117126075e-05, + "loss": 1.7278, + "step": 19374 + }, + { + "epoch": 5.9468999386126455, + "grad_norm": 0.1892910748720169, + "learning_rate": 3.724398797337252e-05, + "loss": 1.7093, + "step": 19375 + }, + { + "epoch": 5.947206875383671, + "grad_norm": 0.23379561305046082, + "learning_rate": 3.723918195568137e-05, + "loss": 1.768, + "step": 19376 + }, + { + "epoch": 5.947513812154696, + "grad_norm": 
0.1986081600189209, + "learning_rate": 3.7234376064100104e-05, + "loss": 1.719, + "step": 19377 + }, + { + "epoch": 5.9478207489257215, + "grad_norm": 0.20901642739772797, + "learning_rate": 3.7229570298676195e-05, + "loss": 1.7066, + "step": 19378 + }, + { + "epoch": 5.948127685696747, + "grad_norm": 0.2102847546339035, + "learning_rate": 3.722476465945718e-05, + "loss": 1.7354, + "step": 19379 + }, + { + "epoch": 5.948434622467771, + "grad_norm": 0.1857316792011261, + "learning_rate": 3.72199591464905e-05, + "loss": 1.7159, + "step": 19380 + }, + { + "epoch": 5.948741559238797, + "grad_norm": 0.3045661151409149, + "learning_rate": 3.721515375982371e-05, + "loss": 1.8782, + "step": 19381 + }, + { + "epoch": 5.949048496009822, + "grad_norm": 0.24114711582660675, + "learning_rate": 3.7210348499504236e-05, + "loss": 1.6819, + "step": 19382 + }, + { + "epoch": 5.949355432780847, + "grad_norm": 0.20186996459960938, + "learning_rate": 3.720554336557961e-05, + "loss": 1.8028, + "step": 19383 + }, + { + "epoch": 5.949662369551873, + "grad_norm": 0.25385335087776184, + "learning_rate": 3.7200738358097295e-05, + "loss": 1.7278, + "step": 19384 + }, + { + "epoch": 5.949969306322897, + "grad_norm": 0.23390468955039978, + "learning_rate": 3.719593347710478e-05, + "loss": 1.7775, + "step": 19385 + }, + { + "epoch": 5.9502762430939224, + "grad_norm": 0.22577936947345734, + "learning_rate": 3.719112872264956e-05, + "loss": 1.7567, + "step": 19386 + }, + { + "epoch": 5.950583179864948, + "grad_norm": 0.2540932297706604, + "learning_rate": 3.718632409477912e-05, + "loss": 1.6749, + "step": 19387 + }, + { + "epoch": 5.950890116635973, + "grad_norm": 0.1994820535182953, + "learning_rate": 3.718151959354093e-05, + "loss": 1.6809, + "step": 19388 + }, + { + "epoch": 5.9511970534069984, + "grad_norm": 0.27669432759284973, + "learning_rate": 3.717671521898249e-05, + "loss": 1.7633, + "step": 19389 + }, + { + "epoch": 5.951503990178024, + "grad_norm": 0.2533062994480133, + 
"learning_rate": 3.717191097115125e-05, + "loss": 1.7536, + "step": 19390 + }, + { + "epoch": 5.951810926949048, + "grad_norm": 0.22249148786067963, + "learning_rate": 3.716710685009471e-05, + "loss": 1.7325, + "step": 19391 + }, + { + "epoch": 5.952117863720074, + "grad_norm": 0.3085922598838806, + "learning_rate": 3.716230285586033e-05, + "loss": 1.7046, + "step": 19392 + }, + { + "epoch": 5.952424800491099, + "grad_norm": 0.2591574192047119, + "learning_rate": 3.715749898849562e-05, + "loss": 1.7165, + "step": 19393 + }, + { + "epoch": 5.952731737262124, + "grad_norm": 0.24586348235607147, + "learning_rate": 3.715269524804803e-05, + "loss": 1.749, + "step": 19394 + }, + { + "epoch": 5.953038674033149, + "grad_norm": 0.3424640893936157, + "learning_rate": 3.714789163456502e-05, + "loss": 1.7143, + "step": 19395 + }, + { + "epoch": 5.953345610804174, + "grad_norm": 0.24856910109519958, + "learning_rate": 3.714308814809408e-05, + "loss": 1.868, + "step": 19396 + }, + { + "epoch": 5.953652547575199, + "grad_norm": 0.2758113145828247, + "learning_rate": 3.7138284788682676e-05, + "loss": 1.6722, + "step": 19397 + }, + { + "epoch": 5.953959484346225, + "grad_norm": 0.25981786847114563, + "learning_rate": 3.71334815563783e-05, + "loss": 1.764, + "step": 19398 + }, + { + "epoch": 5.95426642111725, + "grad_norm": 0.27885568141937256, + "learning_rate": 3.7128678451228385e-05, + "loss": 1.7422, + "step": 19399 + }, + { + "epoch": 5.954573357888275, + "grad_norm": 0.2909421920776367, + "learning_rate": 3.712387547328042e-05, + "loss": 1.7862, + "step": 19400 + }, + { + "epoch": 5.9548802946593, + "grad_norm": 0.2288074642419815, + "learning_rate": 3.711907262258185e-05, + "loss": 1.7054, + "step": 19401 + }, + { + "epoch": 5.955187231430325, + "grad_norm": 0.2986883819103241, + "learning_rate": 3.711426989918017e-05, + "loss": 1.7555, + "step": 19402 + }, + { + "epoch": 5.9554941682013505, + "grad_norm": 0.23201194405555725, + "learning_rate": 3.710946730312281e-05, + 
"loss": 1.8186, + "step": 19403 + }, + { + "epoch": 5.955801104972376, + "grad_norm": 0.2609403431415558, + "learning_rate": 3.710466483445728e-05, + "loss": 1.7743, + "step": 19404 + }, + { + "epoch": 5.956108041743401, + "grad_norm": 0.31131741404533386, + "learning_rate": 3.709986249323098e-05, + "loss": 1.7938, + "step": 19405 + }, + { + "epoch": 5.956414978514426, + "grad_norm": 0.20544753968715668, + "learning_rate": 3.7095060279491424e-05, + "loss": 1.7278, + "step": 19406 + }, + { + "epoch": 5.956721915285451, + "grad_norm": 0.3063479959964752, + "learning_rate": 3.709025819328602e-05, + "loss": 1.7544, + "step": 19407 + }, + { + "epoch": 5.957028852056476, + "grad_norm": 0.34868693351745605, + "learning_rate": 3.708545623466227e-05, + "loss": 1.7536, + "step": 19408 + }, + { + "epoch": 5.957335788827502, + "grad_norm": 0.20847822725772858, + "learning_rate": 3.70806544036676e-05, + "loss": 1.7003, + "step": 19409 + }, + { + "epoch": 5.957642725598527, + "grad_norm": 0.3250095844268799, + "learning_rate": 3.707585270034949e-05, + "loss": 1.6815, + "step": 19410 + }, + { + "epoch": 5.957949662369552, + "grad_norm": 0.24854284524917603, + "learning_rate": 3.707105112475539e-05, + "loss": 1.7665, + "step": 19411 + }, + { + "epoch": 5.958256599140577, + "grad_norm": 0.2921455502510071, + "learning_rate": 3.706624967693271e-05, + "loss": 1.7039, + "step": 19412 + }, + { + "epoch": 5.958563535911602, + "grad_norm": 0.2659071385860443, + "learning_rate": 3.706144835692894e-05, + "loss": 1.7641, + "step": 19413 + }, + { + "epoch": 5.958870472682627, + "grad_norm": 0.30329519510269165, + "learning_rate": 3.7056647164791516e-05, + "loss": 1.7962, + "step": 19414 + }, + { + "epoch": 5.959177409453653, + "grad_norm": 0.4023756682872772, + "learning_rate": 3.7051846100567906e-05, + "loss": 1.7624, + "step": 19415 + }, + { + "epoch": 5.959484346224678, + "grad_norm": 0.24528828263282776, + "learning_rate": 3.704704516430553e-05, + "loss": 1.8156, + "step": 19416 + }, + { 
+ "epoch": 5.9597912829957025, + "grad_norm": 0.46833130717277527, + "learning_rate": 3.704224435605186e-05, + "loss": 1.798, + "step": 19417 + }, + { + "epoch": 5.960098219766728, + "grad_norm": 0.26952674984931946, + "learning_rate": 3.70374436758543e-05, + "loss": 1.743, + "step": 19418 + }, + { + "epoch": 5.960405156537753, + "grad_norm": 0.3126155734062195, + "learning_rate": 3.703264312376034e-05, + "loss": 1.8003, + "step": 19419 + }, + { + "epoch": 5.9607120933087785, + "grad_norm": 0.2833348512649536, + "learning_rate": 3.702784269981738e-05, + "loss": 1.7524, + "step": 19420 + }, + { + "epoch": 5.961019030079804, + "grad_norm": 0.25425654649734497, + "learning_rate": 3.7023042404072916e-05, + "loss": 1.7241, + "step": 19421 + }, + { + "epoch": 5.961325966850829, + "grad_norm": 0.29460933804512024, + "learning_rate": 3.701824223657433e-05, + "loss": 1.676, + "step": 19422 + }, + { + "epoch": 5.961632903621854, + "grad_norm": 0.21040670573711395, + "learning_rate": 3.7013442197369094e-05, + "loss": 1.71, + "step": 19423 + }, + { + "epoch": 5.961939840392879, + "grad_norm": 0.3200007379055023, + "learning_rate": 3.7008642286504624e-05, + "loss": 1.7108, + "step": 19424 + }, + { + "epoch": 5.962246777163904, + "grad_norm": 0.20397430658340454, + "learning_rate": 3.7003842504028366e-05, + "loss": 1.7472, + "step": 19425 + }, + { + "epoch": 5.96255371393493, + "grad_norm": 0.24811354279518127, + "learning_rate": 3.699904284998776e-05, + "loss": 1.7116, + "step": 19426 + }, + { + "epoch": 5.962860650705955, + "grad_norm": 0.20980580151081085, + "learning_rate": 3.699424332443023e-05, + "loss": 1.786, + "step": 19427 + }, + { + "epoch": 5.963167587476979, + "grad_norm": 0.1967400163412094, + "learning_rate": 3.698944392740322e-05, + "loss": 1.7141, + "step": 19428 + }, + { + "epoch": 5.963474524248005, + "grad_norm": 0.21907822787761688, + "learning_rate": 3.698464465895414e-05, + "loss": 1.6983, + "step": 19429 + }, + { + "epoch": 5.96378146101903, + 
"grad_norm": 0.19938960671424866, + "learning_rate": 3.697984551913043e-05, + "loss": 1.6811, + "step": 19430 + }, + { + "epoch": 5.964088397790055, + "grad_norm": 0.22280220687389374, + "learning_rate": 3.6975046507979506e-05, + "loss": 1.6838, + "step": 19431 + }, + { + "epoch": 5.964395334561081, + "grad_norm": 0.2530672550201416, + "learning_rate": 3.697024762554883e-05, + "loss": 1.8116, + "step": 19432 + }, + { + "epoch": 5.964702271332106, + "grad_norm": 0.21853135526180267, + "learning_rate": 3.696544887188579e-05, + "loss": 1.692, + "step": 19433 + }, + { + "epoch": 5.9650092081031305, + "grad_norm": 0.18738535046577454, + "learning_rate": 3.696065024703783e-05, + "loss": 1.6971, + "step": 19434 + }, + { + "epoch": 5.965316144874156, + "grad_norm": 0.21199190616607666, + "learning_rate": 3.695585175105236e-05, + "loss": 1.7526, + "step": 19435 + }, + { + "epoch": 5.965623081645181, + "grad_norm": 0.22184251248836517, + "learning_rate": 3.695105338397681e-05, + "loss": 1.8075, + "step": 19436 + }, + { + "epoch": 5.9659300184162065, + "grad_norm": 0.20191644132137299, + "learning_rate": 3.6946255145858605e-05, + "loss": 1.7427, + "step": 19437 + }, + { + "epoch": 5.966236955187231, + "grad_norm": 0.2113640457391739, + "learning_rate": 3.694145703674515e-05, + "loss": 1.7556, + "step": 19438 + }, + { + "epoch": 5.966543891958256, + "grad_norm": 0.21834735572338104, + "learning_rate": 3.693665905668387e-05, + "loss": 1.7673, + "step": 19439 + }, + { + "epoch": 5.966850828729282, + "grad_norm": 0.2260274887084961, + "learning_rate": 3.6931861205722197e-05, + "loss": 1.8168, + "step": 19440 + }, + { + "epoch": 5.967157765500307, + "grad_norm": 0.24090524017810822, + "learning_rate": 3.692706348390751e-05, + "loss": 1.821, + "step": 19441 + }, + { + "epoch": 5.967464702271332, + "grad_norm": 0.27469882369041443, + "learning_rate": 3.6922265891287256e-05, + "loss": 1.7114, + "step": 19442 + }, + { + "epoch": 5.967771639042358, + "grad_norm": 0.23479801416397095, + 
"learning_rate": 3.6917468427908833e-05, + "loss": 1.7334, + "step": 19443 + }, + { + "epoch": 5.968078575813382, + "grad_norm": 0.21109704673290253, + "learning_rate": 3.6912671093819663e-05, + "loss": 1.7047, + "step": 19444 + }, + { + "epoch": 5.968385512584407, + "grad_norm": 0.21141986548900604, + "learning_rate": 3.690787388906715e-05, + "loss": 1.6868, + "step": 19445 + }, + { + "epoch": 5.968692449355433, + "grad_norm": 0.21836397051811218, + "learning_rate": 3.690307681369868e-05, + "loss": 1.6923, + "step": 19446 + }, + { + "epoch": 5.968999386126458, + "grad_norm": 0.21733662486076355, + "learning_rate": 3.6898279867761695e-05, + "loss": 1.7699, + "step": 19447 + }, + { + "epoch": 5.969306322897483, + "grad_norm": 0.19220437109470367, + "learning_rate": 3.689348305130359e-05, + "loss": 1.7002, + "step": 19448 + }, + { + "epoch": 5.969613259668508, + "grad_norm": 0.22644726932048798, + "learning_rate": 3.688868636437176e-05, + "loss": 1.7024, + "step": 19449 + }, + { + "epoch": 5.969920196439533, + "grad_norm": 0.1832779198884964, + "learning_rate": 3.688388980701361e-05, + "loss": 1.699, + "step": 19450 + }, + { + "epoch": 5.9702271332105585, + "grad_norm": 0.20793284475803375, + "learning_rate": 3.687909337927658e-05, + "loss": 1.7557, + "step": 19451 + }, + { + "epoch": 5.970534069981584, + "grad_norm": 0.19485175609588623, + "learning_rate": 3.6874297081207995e-05, + "loss": 1.7641, + "step": 19452 + }, + { + "epoch": 5.970841006752609, + "grad_norm": 0.20980949699878693, + "learning_rate": 3.686950091285534e-05, + "loss": 1.7542, + "step": 19453 + }, + { + "epoch": 5.9711479435236345, + "grad_norm": 0.24902600049972534, + "learning_rate": 3.686470487426594e-05, + "loss": 1.7342, + "step": 19454 + }, + { + "epoch": 5.971454880294659, + "grad_norm": 0.20191124081611633, + "learning_rate": 3.685990896548724e-05, + "loss": 1.6844, + "step": 19455 + }, + { + "epoch": 5.971761817065684, + "grad_norm": 0.23217806220054626, + "learning_rate": 
3.685511318656662e-05, + "loss": 1.7054, + "step": 19456 + }, + { + "epoch": 5.97206875383671, + "grad_norm": 0.23383383452892303, + "learning_rate": 3.6850317537551484e-05, + "loss": 1.6903, + "step": 19457 + }, + { + "epoch": 5.972375690607735, + "grad_norm": 0.2147756665945053, + "learning_rate": 3.6845522018489196e-05, + "loss": 1.736, + "step": 19458 + }, + { + "epoch": 5.97268262737876, + "grad_norm": 0.23864400386810303, + "learning_rate": 3.68407266294272e-05, + "loss": 1.7483, + "step": 19459 + }, + { + "epoch": 5.972989564149785, + "grad_norm": 0.18702742457389832, + "learning_rate": 3.6835931370412836e-05, + "loss": 1.6874, + "step": 19460 + }, + { + "epoch": 5.97329650092081, + "grad_norm": 0.2167401760816574, + "learning_rate": 3.683113624149351e-05, + "loss": 1.652, + "step": 19461 + }, + { + "epoch": 5.973603437691835, + "grad_norm": 0.17105139791965485, + "learning_rate": 3.6826341242716636e-05, + "loss": 1.7029, + "step": 19462 + }, + { + "epoch": 5.973910374462861, + "grad_norm": 0.2189798206090927, + "learning_rate": 3.682154637412956e-05, + "loss": 1.7203, + "step": 19463 + }, + { + "epoch": 5.974217311233886, + "grad_norm": 0.17864444851875305, + "learning_rate": 3.68167516357797e-05, + "loss": 1.7176, + "step": 19464 + }, + { + "epoch": 5.974524248004911, + "grad_norm": 0.22356030344963074, + "learning_rate": 3.681195702771442e-05, + "loss": 1.7492, + "step": 19465 + }, + { + "epoch": 5.974831184775936, + "grad_norm": 0.19020728766918182, + "learning_rate": 3.68071625499811e-05, + "loss": 1.6925, + "step": 19466 + }, + { + "epoch": 5.975138121546961, + "grad_norm": 0.19092151522636414, + "learning_rate": 3.680236820262714e-05, + "loss": 1.7253, + "step": 19467 + }, + { + "epoch": 5.975445058317987, + "grad_norm": 0.20842085778713226, + "learning_rate": 3.6797573985699926e-05, + "loss": 1.7251, + "step": 19468 + }, + { + "epoch": 5.975751995089012, + "grad_norm": 0.2245844155550003, + "learning_rate": 3.6792779899246796e-05, + "loss": 1.7351, + 
"step": 19469 + }, + { + "epoch": 5.976058931860036, + "grad_norm": 0.18867328763008118, + "learning_rate": 3.678798594331519e-05, + "loss": 1.6646, + "step": 19470 + }, + { + "epoch": 5.976365868631062, + "grad_norm": 0.2892500162124634, + "learning_rate": 3.678319211795242e-05, + "loss": 1.7146, + "step": 19471 + }, + { + "epoch": 5.976672805402087, + "grad_norm": 0.22490514814853668, + "learning_rate": 3.677839842320591e-05, + "loss": 1.7147, + "step": 19472 + }, + { + "epoch": 5.976979742173112, + "grad_norm": 0.296724796295166, + "learning_rate": 3.677360485912301e-05, + "loss": 1.7714, + "step": 19473 + }, + { + "epoch": 5.977286678944138, + "grad_norm": 0.2784444987773895, + "learning_rate": 3.676881142575111e-05, + "loss": 1.7198, + "step": 19474 + }, + { + "epoch": 5.977593615715163, + "grad_norm": 0.20270293951034546, + "learning_rate": 3.676401812313755e-05, + "loss": 1.7336, + "step": 19475 + }, + { + "epoch": 5.9779005524861875, + "grad_norm": 0.23352907598018646, + "learning_rate": 3.6759224951329745e-05, + "loss": 1.7428, + "step": 19476 + }, + { + "epoch": 5.978207489257213, + "grad_norm": 0.1892426460981369, + "learning_rate": 3.675443191037502e-05, + "loss": 1.6636, + "step": 19477 + }, + { + "epoch": 5.978514426028238, + "grad_norm": 0.22216783463954926, + "learning_rate": 3.6749639000320766e-05, + "loss": 1.7446, + "step": 19478 + }, + { + "epoch": 5.9788213627992635, + "grad_norm": 0.19465389847755432, + "learning_rate": 3.6744846221214364e-05, + "loss": 1.7403, + "step": 19479 + }, + { + "epoch": 5.979128299570289, + "grad_norm": 0.1918177455663681, + "learning_rate": 3.674005357310314e-05, + "loss": 1.6974, + "step": 19480 + }, + { + "epoch": 5.979435236341313, + "grad_norm": 0.19065791368484497, + "learning_rate": 3.673526105603449e-05, + "loss": 1.7299, + "step": 19481 + }, + { + "epoch": 5.979742173112339, + "grad_norm": 0.24036844074726105, + "learning_rate": 3.673046867005575e-05, + "loss": 1.7441, + "step": 19482 + }, + { + "epoch": 
5.980049109883364, + "grad_norm": 0.22352568805217743, + "learning_rate": 3.6725676415214305e-05, + "loss": 1.7556, + "step": 19483 + }, + { + "epoch": 5.980356046654389, + "grad_norm": 0.2492935210466385, + "learning_rate": 3.67208842915575e-05, + "loss": 1.6833, + "step": 19484 + }, + { + "epoch": 5.980662983425415, + "grad_norm": 0.2554415762424469, + "learning_rate": 3.671609229913272e-05, + "loss": 1.7426, + "step": 19485 + }, + { + "epoch": 5.98096992019644, + "grad_norm": 0.24076475203037262, + "learning_rate": 3.671130043798728e-05, + "loss": 1.7362, + "step": 19486 + }, + { + "epoch": 5.981276856967464, + "grad_norm": 0.24297118186950684, + "learning_rate": 3.670650870816858e-05, + "loss": 1.7493, + "step": 19487 + }, + { + "epoch": 5.98158379373849, + "grad_norm": 0.19533030688762665, + "learning_rate": 3.6701717109723924e-05, + "loss": 1.7397, + "step": 19488 + }, + { + "epoch": 5.981890730509515, + "grad_norm": 0.24731193482875824, + "learning_rate": 3.669692564270071e-05, + "loss": 1.7483, + "step": 19489 + }, + { + "epoch": 5.98219766728054, + "grad_norm": 0.23274390399456024, + "learning_rate": 3.669213430714626e-05, + "loss": 1.7677, + "step": 19490 + }, + { + "epoch": 5.982504604051566, + "grad_norm": 0.180234894156456, + "learning_rate": 3.668734310310796e-05, + "loss": 1.7065, + "step": 19491 + }, + { + "epoch": 5.98281154082259, + "grad_norm": 0.19045281410217285, + "learning_rate": 3.6682552030633125e-05, + "loss": 1.7089, + "step": 19492 + }, + { + "epoch": 5.9831184775936155, + "grad_norm": 0.17261318862438202, + "learning_rate": 3.667776108976914e-05, + "loss": 1.7227, + "step": 19493 + }, + { + "epoch": 5.983425414364641, + "grad_norm": 0.2156316339969635, + "learning_rate": 3.667297028056329e-05, + "loss": 1.7025, + "step": 19494 + }, + { + "epoch": 5.983732351135666, + "grad_norm": 0.22288112342357635, + "learning_rate": 3.666817960306298e-05, + "loss": 1.7123, + "step": 19495 + }, + { + "epoch": 5.9840392879066915, + "grad_norm": 
0.21983082592487335, + "learning_rate": 3.6663389057315543e-05, + "loss": 1.7688, + "step": 19496 + }, + { + "epoch": 5.984346224677717, + "grad_norm": 0.1804746687412262, + "learning_rate": 3.665859864336829e-05, + "loss": 1.759, + "step": 19497 + }, + { + "epoch": 5.984653161448741, + "grad_norm": 0.22762230038642883, + "learning_rate": 3.6653808361268605e-05, + "loss": 1.8128, + "step": 19498 + }, + { + "epoch": 5.984960098219767, + "grad_norm": 0.21779340505599976, + "learning_rate": 3.664901821106379e-05, + "loss": 1.7316, + "step": 19499 + }, + { + "epoch": 5.985267034990792, + "grad_norm": 0.18899449706077576, + "learning_rate": 3.664422819280121e-05, + "loss": 1.7535, + "step": 19500 + }, + { + "epoch": 5.985573971761817, + "grad_norm": 0.22799427807331085, + "learning_rate": 3.663943830652819e-05, + "loss": 1.7626, + "step": 19501 + }, + { + "epoch": 5.985880908532843, + "grad_norm": 0.19936929643154144, + "learning_rate": 3.6634648552292086e-05, + "loss": 1.6887, + "step": 19502 + }, + { + "epoch": 5.986187845303867, + "grad_norm": 0.22482532262802124, + "learning_rate": 3.6629858930140206e-05, + "loss": 1.6867, + "step": 19503 + }, + { + "epoch": 5.986494782074892, + "grad_norm": 0.23543842136859894, + "learning_rate": 3.662506944011991e-05, + "loss": 1.7715, + "step": 19504 + }, + { + "epoch": 5.986801718845918, + "grad_norm": 0.230603888630867, + "learning_rate": 3.6620280082278495e-05, + "loss": 1.7514, + "step": 19505 + }, + { + "epoch": 5.987108655616943, + "grad_norm": 0.26767033338546753, + "learning_rate": 3.6615490856663334e-05, + "loss": 1.6862, + "step": 19506 + }, + { + "epoch": 5.987415592387968, + "grad_norm": 0.18282492458820343, + "learning_rate": 3.661070176332172e-05, + "loss": 1.6569, + "step": 19507 + }, + { + "epoch": 5.987722529158994, + "grad_norm": 0.255426824092865, + "learning_rate": 3.6605912802301016e-05, + "loss": 1.7623, + "step": 19508 + }, + { + "epoch": 5.988029465930018, + "grad_norm": 0.25026118755340576, + 
"learning_rate": 3.6601123973648524e-05, + "loss": 1.6907, + "step": 19509 + }, + { + "epoch": 5.9883364027010435, + "grad_norm": 0.19193407893180847, + "learning_rate": 3.659633527741159e-05, + "loss": 1.7647, + "step": 19510 + }, + { + "epoch": 5.988643339472069, + "grad_norm": 0.25562727451324463, + "learning_rate": 3.6591546713637506e-05, + "loss": 1.6806, + "step": 19511 + }, + { + "epoch": 5.988950276243094, + "grad_norm": 0.2296016663312912, + "learning_rate": 3.6586758282373624e-05, + "loss": 1.7747, + "step": 19512 + }, + { + "epoch": 5.989257213014119, + "grad_norm": 0.22875753045082092, + "learning_rate": 3.6581969983667275e-05, + "loss": 1.7847, + "step": 19513 + }, + { + "epoch": 5.989564149785144, + "grad_norm": 0.24469317495822906, + "learning_rate": 3.6577181817565736e-05, + "loss": 1.6784, + "step": 19514 + }, + { + "epoch": 5.989871086556169, + "grad_norm": 0.22855928540229797, + "learning_rate": 3.657239378411638e-05, + "loss": 1.788, + "step": 19515 + }, + { + "epoch": 5.990178023327195, + "grad_norm": 0.28745612502098083, + "learning_rate": 3.656760588336647e-05, + "loss": 1.6836, + "step": 19516 + }, + { + "epoch": 5.99048496009822, + "grad_norm": 0.18221193552017212, + "learning_rate": 3.656281811536337e-05, + "loss": 1.6687, + "step": 19517 + }, + { + "epoch": 5.990791896869245, + "grad_norm": 0.2556660771369934, + "learning_rate": 3.655803048015437e-05, + "loss": 1.7351, + "step": 19518 + }, + { + "epoch": 5.99109883364027, + "grad_norm": 0.18791422247886658, + "learning_rate": 3.6553242977786803e-05, + "loss": 1.6749, + "step": 19519 + }, + { + "epoch": 5.991405770411295, + "grad_norm": 0.28149592876434326, + "learning_rate": 3.654845560830796e-05, + "loss": 1.7333, + "step": 19520 + }, + { + "epoch": 5.99171270718232, + "grad_norm": 0.24631322920322418, + "learning_rate": 3.654366837176517e-05, + "loss": 1.7672, + "step": 19521 + }, + { + "epoch": 5.992019643953346, + "grad_norm": 0.22054782509803772, + "learning_rate": 
3.653888126820573e-05, + "loss": 1.7499, + "step": 19522 + }, + { + "epoch": 5.992326580724371, + "grad_norm": 0.23334862291812897, + "learning_rate": 3.653409429767696e-05, + "loss": 1.7133, + "step": 19523 + }, + { + "epoch": 5.9926335174953955, + "grad_norm": 0.19809292256832123, + "learning_rate": 3.6529307460226145e-05, + "loss": 1.6965, + "step": 19524 + }, + { + "epoch": 5.992940454266421, + "grad_norm": 0.23769772052764893, + "learning_rate": 3.652452075590064e-05, + "loss": 1.699, + "step": 19525 + }, + { + "epoch": 5.993247391037446, + "grad_norm": 0.19045031070709229, + "learning_rate": 3.6519734184747686e-05, + "loss": 1.7043, + "step": 19526 + }, + { + "epoch": 5.9935543278084715, + "grad_norm": 0.20795129239559174, + "learning_rate": 3.651494774681465e-05, + "loss": 1.7159, + "step": 19527 + }, + { + "epoch": 5.993861264579497, + "grad_norm": 0.1933370679616928, + "learning_rate": 3.651016144214878e-05, + "loss": 1.6999, + "step": 19528 + }, + { + "epoch": 5.994168201350522, + "grad_norm": 0.18360544741153717, + "learning_rate": 3.650537527079742e-05, + "loss": 1.7525, + "step": 19529 + }, + { + "epoch": 5.994475138121547, + "grad_norm": 0.21080785989761353, + "learning_rate": 3.650058923280786e-05, + "loss": 1.6832, + "step": 19530 + }, + { + "epoch": 5.994782074892572, + "grad_norm": 0.19701606035232544, + "learning_rate": 3.649580332822736e-05, + "loss": 1.7104, + "step": 19531 + }, + { + "epoch": 5.995089011663597, + "grad_norm": 0.24208703637123108, + "learning_rate": 3.6491017557103266e-05, + "loss": 1.726, + "step": 19532 + }, + { + "epoch": 5.995395948434623, + "grad_norm": 0.25981345772743225, + "learning_rate": 3.648623191948284e-05, + "loss": 1.7644, + "step": 19533 + }, + { + "epoch": 5.995702885205648, + "grad_norm": 0.24137455224990845, + "learning_rate": 3.64814464154134e-05, + "loss": 1.7354, + "step": 19534 + }, + { + "epoch": 5.996009821976672, + "grad_norm": 0.2140759378671646, + "learning_rate": 3.647666104494222e-05, + "loss": 
1.7244, + "step": 19535 + }, + { + "epoch": 5.996316758747698, + "grad_norm": 0.2801622748374939, + "learning_rate": 3.647187580811663e-05, + "loss": 1.6996, + "step": 19536 + }, + { + "epoch": 5.996623695518723, + "grad_norm": 0.21048817038536072, + "learning_rate": 3.6467090704983856e-05, + "loss": 1.7378, + "step": 19537 + }, + { + "epoch": 5.996930632289748, + "grad_norm": 0.2935819625854492, + "learning_rate": 3.6462305735591254e-05, + "loss": 1.7066, + "step": 19538 + }, + { + "epoch": 5.997237569060774, + "grad_norm": 0.22473880648612976, + "learning_rate": 3.645752089998606e-05, + "loss": 1.7539, + "step": 19539 + }, + { + "epoch": 5.997544505831799, + "grad_norm": 0.20606113970279694, + "learning_rate": 3.6452736198215585e-05, + "loss": 1.7338, + "step": 19540 + }, + { + "epoch": 5.9978514426028235, + "grad_norm": 0.2702842950820923, + "learning_rate": 3.6447951630327116e-05, + "loss": 1.7171, + "step": 19541 + }, + { + "epoch": 5.998158379373849, + "grad_norm": 0.19971637427806854, + "learning_rate": 3.6443167196367946e-05, + "loss": 1.7132, + "step": 19542 + }, + { + "epoch": 5.998465316144874, + "grad_norm": 0.2352653592824936, + "learning_rate": 3.643838289638531e-05, + "loss": 1.787, + "step": 19543 + }, + { + "epoch": 5.9987722529158995, + "grad_norm": 0.2324669510126114, + "learning_rate": 3.643359873042656e-05, + "loss": 1.7039, + "step": 19544 + }, + { + "epoch": 5.999079189686924, + "grad_norm": 0.1935029774904251, + "learning_rate": 3.6428814698538914e-05, + "loss": 1.6846, + "step": 19545 + }, + { + "epoch": 5.999386126457949, + "grad_norm": 0.18433111906051636, + "learning_rate": 3.642403080076968e-05, + "loss": 1.7018, + "step": 19546 + }, + { + "epoch": 5.999693063228975, + "grad_norm": 0.19364693760871887, + "learning_rate": 3.6419247037166146e-05, + "loss": 1.6901, + "step": 19547 + }, + { + "epoch": 6.0, + "grad_norm": 0.23718556761741638, + "learning_rate": 3.641446340777556e-05, + "loss": 1.7743, + "step": 19548 + }, + { + "epoch": 
6.000306936771025, + "grad_norm": 0.23907634615898132, + "learning_rate": 3.640967991264521e-05, + "loss": 1.8225, + "step": 19549 + }, + { + "epoch": 6.000613873542051, + "grad_norm": 0.18895737826824188, + "learning_rate": 3.6404896551822365e-05, + "loss": 1.7004, + "step": 19550 + }, + { + "epoch": 6.000920810313075, + "grad_norm": 0.20192188024520874, + "learning_rate": 3.64001133253543e-05, + "loss": 1.7304, + "step": 19551 + }, + { + "epoch": 6.0012277470841005, + "grad_norm": 0.1961488425731659, + "learning_rate": 3.6395330233288285e-05, + "loss": 1.6839, + "step": 19552 + }, + { + "epoch": 6.001534683855126, + "grad_norm": 0.271635502576828, + "learning_rate": 3.639054727567161e-05, + "loss": 1.8182, + "step": 19553 + }, + { + "epoch": 6.001841620626151, + "grad_norm": 0.20838679373264313, + "learning_rate": 3.63857644525515e-05, + "loss": 1.7688, + "step": 19554 + }, + { + "epoch": 6.0021485573971765, + "grad_norm": 0.23661796748638153, + "learning_rate": 3.6380981763975266e-05, + "loss": 1.6785, + "step": 19555 + }, + { + "epoch": 6.002455494168202, + "grad_norm": 0.1728433072566986, + "learning_rate": 3.637619920999013e-05, + "loss": 1.6648, + "step": 19556 + }, + { + "epoch": 6.002762430939226, + "grad_norm": 0.2845853269100189, + "learning_rate": 3.6371416790643395e-05, + "loss": 1.7592, + "step": 19557 + }, + { + "epoch": 6.003069367710252, + "grad_norm": 0.3246566951274872, + "learning_rate": 3.636663450598229e-05, + "loss": 1.7045, + "step": 19558 + }, + { + "epoch": 6.003376304481277, + "grad_norm": 0.21857120096683502, + "learning_rate": 3.636185235605412e-05, + "loss": 1.756, + "step": 19559 + }, + { + "epoch": 6.003683241252302, + "grad_norm": 0.3583754599094391, + "learning_rate": 3.63570703409061e-05, + "loss": 1.6828, + "step": 19560 + }, + { + "epoch": 6.003990178023328, + "grad_norm": 0.25527241826057434, + "learning_rate": 3.635228846058552e-05, + "loss": 1.7611, + "step": 19561 + }, + { + "epoch": 6.004297114794352, + "grad_norm": 
0.29662930965423584, + "learning_rate": 3.6347506715139604e-05, + "loss": 1.747, + "step": 19562 + }, + { + "epoch": 6.004604051565377, + "grad_norm": 0.2588978707790375, + "learning_rate": 3.634272510461564e-05, + "loss": 1.7153, + "step": 19563 + }, + { + "epoch": 6.004910988336403, + "grad_norm": 0.23874366283416748, + "learning_rate": 3.633794362906089e-05, + "loss": 1.7285, + "step": 19564 + }, + { + "epoch": 6.005217925107428, + "grad_norm": 0.2898634374141693, + "learning_rate": 3.633316228852256e-05, + "loss": 1.7539, + "step": 19565 + }, + { + "epoch": 6.005524861878453, + "grad_norm": 0.2578127682209015, + "learning_rate": 3.6328381083047946e-05, + "loss": 1.7504, + "step": 19566 + }, + { + "epoch": 6.005831798649478, + "grad_norm": 0.3094595968723297, + "learning_rate": 3.632360001268427e-05, + "loss": 1.7076, + "step": 19567 + }, + { + "epoch": 6.006138735420503, + "grad_norm": 0.27825623750686646, + "learning_rate": 3.63188190774788e-05, + "loss": 1.7651, + "step": 19568 + }, + { + "epoch": 6.0064456721915285, + "grad_norm": 0.27732032537460327, + "learning_rate": 3.631403827747878e-05, + "loss": 1.7209, + "step": 19569 + }, + { + "epoch": 6.006752608962554, + "grad_norm": 0.36446672677993774, + "learning_rate": 3.6309257612731475e-05, + "loss": 1.7191, + "step": 19570 + }, + { + "epoch": 6.007059545733579, + "grad_norm": 0.19071432948112488, + "learning_rate": 3.6304477083284076e-05, + "loss": 1.6981, + "step": 19571 + }, + { + "epoch": 6.0073664825046045, + "grad_norm": 0.40523234009742737, + "learning_rate": 3.6299696689183895e-05, + "loss": 1.7259, + "step": 19572 + }, + { + "epoch": 6.007673419275629, + "grad_norm": 0.30279576778411865, + "learning_rate": 3.6294916430478116e-05, + "loss": 1.8017, + "step": 19573 + }, + { + "epoch": 6.007980356046654, + "grad_norm": 0.2944689989089966, + "learning_rate": 3.629013630721402e-05, + "loss": 1.7347, + "step": 19574 + }, + { + "epoch": 6.00828729281768, + "grad_norm": 0.3557213246822357, + 
"learning_rate": 3.6285356319438814e-05, + "loss": 1.7308, + "step": 19575 + }, + { + "epoch": 6.008594229588705, + "grad_norm": 0.19888661801815033, + "learning_rate": 3.628057646719978e-05, + "loss": 1.7571, + "step": 19576 + }, + { + "epoch": 6.00890116635973, + "grad_norm": 0.34002986550331116, + "learning_rate": 3.627579675054411e-05, + "loss": 1.7417, + "step": 19577 + }, + { + "epoch": 6.009208103130755, + "grad_norm": 0.2756921350955963, + "learning_rate": 3.627101716951908e-05, + "loss": 1.7351, + "step": 19578 + }, + { + "epoch": 6.00951503990178, + "grad_norm": 0.3520946502685547, + "learning_rate": 3.6266237724171885e-05, + "loss": 1.7056, + "step": 19579 + }, + { + "epoch": 6.009821976672805, + "grad_norm": 0.3673728406429291, + "learning_rate": 3.6261458414549786e-05, + "loss": 1.6388, + "step": 19580 + }, + { + "epoch": 6.010128913443831, + "grad_norm": 0.2247757613658905, + "learning_rate": 3.625667924070003e-05, + "loss": 1.7772, + "step": 19581 + }, + { + "epoch": 6.010435850214856, + "grad_norm": 0.4387452006340027, + "learning_rate": 3.6251900202669795e-05, + "loss": 1.7629, + "step": 19582 + }, + { + "epoch": 6.0107427869858805, + "grad_norm": 0.23595796525478363, + "learning_rate": 3.624712130050636e-05, + "loss": 1.8044, + "step": 19583 + }, + { + "epoch": 6.011049723756906, + "grad_norm": 0.31198835372924805, + "learning_rate": 3.624234253425691e-05, + "loss": 1.7623, + "step": 19584 + }, + { + "epoch": 6.011356660527931, + "grad_norm": 0.25283896923065186, + "learning_rate": 3.6237563903968705e-05, + "loss": 1.7771, + "step": 19585 + }, + { + "epoch": 6.0116635972989565, + "grad_norm": 0.2595483064651489, + "learning_rate": 3.6232785409688954e-05, + "loss": 1.7405, + "step": 19586 + }, + { + "epoch": 6.011970534069982, + "grad_norm": 0.302273690700531, + "learning_rate": 3.622800705146491e-05, + "loss": 1.7236, + "step": 19587 + }, + { + "epoch": 6.012277470841007, + "grad_norm": 0.20444928109645844, + "learning_rate": 
3.622322882934375e-05, + "loss": 1.6863, + "step": 19588 + }, + { + "epoch": 6.012584407612032, + "grad_norm": 0.2682531774044037, + "learning_rate": 3.621845074337273e-05, + "loss": 1.752, + "step": 19589 + }, + { + "epoch": 6.012891344383057, + "grad_norm": 0.25617173314094543, + "learning_rate": 3.621367279359905e-05, + "loss": 1.7496, + "step": 19590 + }, + { + "epoch": 6.013198281154082, + "grad_norm": 0.24514207243919373, + "learning_rate": 3.620889498006994e-05, + "loss": 1.6568, + "step": 19591 + }, + { + "epoch": 6.013505217925108, + "grad_norm": 0.2799128293991089, + "learning_rate": 3.6204117302832616e-05, + "loss": 1.7284, + "step": 19592 + }, + { + "epoch": 6.013812154696133, + "grad_norm": 0.2025543451309204, + "learning_rate": 3.619933976193428e-05, + "loss": 1.7172, + "step": 19593 + }, + { + "epoch": 6.014119091467157, + "grad_norm": 0.24697700142860413, + "learning_rate": 3.619456235742216e-05, + "loss": 1.7316, + "step": 19594 + }, + { + "epoch": 6.014426028238183, + "grad_norm": 0.2518150210380554, + "learning_rate": 3.618978508934348e-05, + "loss": 1.8183, + "step": 19595 + }, + { + "epoch": 6.014732965009208, + "grad_norm": 0.165326327085495, + "learning_rate": 3.618500795774542e-05, + "loss": 1.665, + "step": 19596 + }, + { + "epoch": 6.015039901780233, + "grad_norm": 0.19158180058002472, + "learning_rate": 3.6180230962675216e-05, + "loss": 1.7232, + "step": 19597 + }, + { + "epoch": 6.015346838551259, + "grad_norm": 0.19456413388252258, + "learning_rate": 3.6175454104180086e-05, + "loss": 1.7153, + "step": 19598 + }, + { + "epoch": 6.015653775322283, + "grad_norm": 0.233373761177063, + "learning_rate": 3.6170677382307195e-05, + "loss": 1.7914, + "step": 19599 + }, + { + "epoch": 6.0159607120933085, + "grad_norm": 0.18567882478237152, + "learning_rate": 3.6165900797103796e-05, + "loss": 1.6793, + "step": 19600 + }, + { + "epoch": 6.016267648864334, + "grad_norm": 0.2119273990392685, + "learning_rate": 3.616112434861706e-05, + "loss": 1.689, + 
"step": 19601 + }, + { + "epoch": 6.016574585635359, + "grad_norm": 0.1915217787027359, + "learning_rate": 3.61563480368942e-05, + "loss": 1.6835, + "step": 19602 + }, + { + "epoch": 6.0168815224063845, + "grad_norm": 0.24824760854244232, + "learning_rate": 3.615157186198244e-05, + "loss": 1.8411, + "step": 19603 + }, + { + "epoch": 6.01718845917741, + "grad_norm": 0.2198900282382965, + "learning_rate": 3.6146795823928955e-05, + "loss": 1.7311, + "step": 19604 + }, + { + "epoch": 6.017495395948434, + "grad_norm": 0.22993668913841248, + "learning_rate": 3.614201992278095e-05, + "loss": 1.7249, + "step": 19605 + }, + { + "epoch": 6.01780233271946, + "grad_norm": 0.20677974820137024, + "learning_rate": 3.613724415858564e-05, + "loss": 1.7137, + "step": 19606 + }, + { + "epoch": 6.018109269490485, + "grad_norm": 0.1844938099384308, + "learning_rate": 3.6132468531390184e-05, + "loss": 1.6512, + "step": 19607 + }, + { + "epoch": 6.01841620626151, + "grad_norm": 0.224154993891716, + "learning_rate": 3.6127693041241815e-05, + "loss": 1.7116, + "step": 19608 + }, + { + "epoch": 6.018723143032536, + "grad_norm": 0.17322199046611786, + "learning_rate": 3.612291768818772e-05, + "loss": 1.6743, + "step": 19609 + }, + { + "epoch": 6.01903007980356, + "grad_norm": 0.24451903998851776, + "learning_rate": 3.611814247227508e-05, + "loss": 1.8332, + "step": 19610 + }, + { + "epoch": 6.019337016574585, + "grad_norm": 0.1911642849445343, + "learning_rate": 3.611336739355109e-05, + "loss": 1.707, + "step": 19611 + }, + { + "epoch": 6.019643953345611, + "grad_norm": 0.20917518436908722, + "learning_rate": 3.6108592452062954e-05, + "loss": 1.7328, + "step": 19612 + }, + { + "epoch": 6.019950890116636, + "grad_norm": 0.2314450889825821, + "learning_rate": 3.610381764785784e-05, + "loss": 1.7575, + "step": 19613 + }, + { + "epoch": 6.020257826887661, + "grad_norm": 0.20701734721660614, + "learning_rate": 3.609904298098296e-05, + "loss": 1.6958, + "step": 19614 + }, + { + "epoch": 
6.020564763658686, + "grad_norm": 0.2494465857744217, + "learning_rate": 3.609426845148547e-05, + "loss": 1.706, + "step": 19615 + }, + { + "epoch": 6.020871700429711, + "grad_norm": 0.25842729210853577, + "learning_rate": 3.608949405941256e-05, + "loss": 1.7667, + "step": 19616 + }, + { + "epoch": 6.0211786372007365, + "grad_norm": 0.19831863045692444, + "learning_rate": 3.608471980481145e-05, + "loss": 1.7135, + "step": 19617 + }, + { + "epoch": 6.021485573971762, + "grad_norm": 0.21611735224723816, + "learning_rate": 3.607994568772927e-05, + "loss": 1.7416, + "step": 19618 + }, + { + "epoch": 6.021792510742787, + "grad_norm": 0.2356715202331543, + "learning_rate": 3.607517170821324e-05, + "loss": 1.7696, + "step": 19619 + }, + { + "epoch": 6.0220994475138125, + "grad_norm": 0.24737675487995148, + "learning_rate": 3.6070397866310514e-05, + "loss": 1.7189, + "step": 19620 + }, + { + "epoch": 6.022406384284837, + "grad_norm": 0.19260701537132263, + "learning_rate": 3.6065624162068284e-05, + "loss": 1.7292, + "step": 19621 + }, + { + "epoch": 6.022713321055862, + "grad_norm": 0.29366952180862427, + "learning_rate": 3.6060850595533716e-05, + "loss": 1.7875, + "step": 19622 + }, + { + "epoch": 6.023020257826888, + "grad_norm": 0.2038174718618393, + "learning_rate": 3.605607716675401e-05, + "loss": 1.6777, + "step": 19623 + }, + { + "epoch": 6.023327194597913, + "grad_norm": 0.28923583030700684, + "learning_rate": 3.605130387577631e-05, + "loss": 1.7175, + "step": 19624 + }, + { + "epoch": 6.023634131368938, + "grad_norm": 0.3004317283630371, + "learning_rate": 3.6046530722647816e-05, + "loss": 1.8059, + "step": 19625 + }, + { + "epoch": 6.023941068139963, + "grad_norm": 0.19832390546798706, + "learning_rate": 3.6041757707415666e-05, + "loss": 1.7197, + "step": 19626 + }, + { + "epoch": 6.024248004910988, + "grad_norm": 0.2782927453517914, + "learning_rate": 3.6036984830127054e-05, + "loss": 1.6563, + "step": 19627 + }, + { + "epoch": 6.024554941682013, + "grad_norm": 
0.20395785570144653, + "learning_rate": 3.603221209082913e-05, + "loss": 1.6972, + "step": 19628 + }, + { + "epoch": 6.024861878453039, + "grad_norm": 0.26302096247673035, + "learning_rate": 3.60274394895691e-05, + "loss": 1.7348, + "step": 19629 + }, + { + "epoch": 6.025168815224064, + "grad_norm": 0.26376327872276306, + "learning_rate": 3.6022667026394095e-05, + "loss": 1.7183, + "step": 19630 + }, + { + "epoch": 6.0254757519950894, + "grad_norm": 0.20590877532958984, + "learning_rate": 3.601789470135127e-05, + "loss": 1.7114, + "step": 19631 + }, + { + "epoch": 6.025782688766114, + "grad_norm": 0.2873607277870178, + "learning_rate": 3.6013122514487815e-05, + "loss": 1.7598, + "step": 19632 + }, + { + "epoch": 6.026089625537139, + "grad_norm": 0.24324963986873627, + "learning_rate": 3.600835046585087e-05, + "loss": 1.8844, + "step": 19633 + }, + { + "epoch": 6.026396562308165, + "grad_norm": 0.27910730242729187, + "learning_rate": 3.6003578555487624e-05, + "loss": 1.8598, + "step": 19634 + }, + { + "epoch": 6.02670349907919, + "grad_norm": 0.22766844928264618, + "learning_rate": 3.59988067834452e-05, + "loss": 1.7281, + "step": 19635 + }, + { + "epoch": 6.027010435850215, + "grad_norm": 0.2390190064907074, + "learning_rate": 3.5994035149770804e-05, + "loss": 1.7355, + "step": 19636 + }, + { + "epoch": 6.02731737262124, + "grad_norm": 0.23422548174858093, + "learning_rate": 3.598926365451153e-05, + "loss": 1.7226, + "step": 19637 + }, + { + "epoch": 6.027624309392265, + "grad_norm": 0.20240288972854614, + "learning_rate": 3.598449229771458e-05, + "loss": 1.7523, + "step": 19638 + }, + { + "epoch": 6.02793124616329, + "grad_norm": 0.26388832926750183, + "learning_rate": 3.597972107942708e-05, + "loss": 1.7003, + "step": 19639 + }, + { + "epoch": 6.028238182934316, + "grad_norm": 0.19814053177833557, + "learning_rate": 3.597494999969622e-05, + "loss": 1.7087, + "step": 19640 + }, + { + "epoch": 6.028545119705341, + "grad_norm": 0.2779136896133423, + "learning_rate": 
3.5970179058569095e-05, + "loss": 1.7581, + "step": 19641 + }, + { + "epoch": 6.0288520564763655, + "grad_norm": 0.220394566655159, + "learning_rate": 3.5965408256092905e-05, + "loss": 1.7236, + "step": 19642 + }, + { + "epoch": 6.029158993247391, + "grad_norm": 0.28568828105926514, + "learning_rate": 3.596063759231476e-05, + "loss": 1.7933, + "step": 19643 + }, + { + "epoch": 6.029465930018416, + "grad_norm": 0.19509564340114594, + "learning_rate": 3.595586706728183e-05, + "loss": 1.6803, + "step": 19644 + }, + { + "epoch": 6.0297728667894415, + "grad_norm": 0.30855104327201843, + "learning_rate": 3.595109668104124e-05, + "loss": 1.7345, + "step": 19645 + }, + { + "epoch": 6.030079803560467, + "grad_norm": 0.24195496737957, + "learning_rate": 3.5946326433640174e-05, + "loss": 1.7493, + "step": 19646 + }, + { + "epoch": 6.030386740331492, + "grad_norm": 0.28324684500694275, + "learning_rate": 3.5941556325125744e-05, + "loss": 1.7959, + "step": 19647 + }, + { + "epoch": 6.030693677102517, + "grad_norm": 0.25351646542549133, + "learning_rate": 3.593678635554508e-05, + "loss": 1.7298, + "step": 19648 + }, + { + "epoch": 6.031000613873542, + "grad_norm": 0.2608177959918976, + "learning_rate": 3.593201652494534e-05, + "loss": 1.7072, + "step": 19649 + }, + { + "epoch": 6.031307550644567, + "grad_norm": 0.3182333707809448, + "learning_rate": 3.592724683337365e-05, + "loss": 1.6976, + "step": 19650 + }, + { + "epoch": 6.031614487415593, + "grad_norm": 0.19296859204769135, + "learning_rate": 3.592247728087717e-05, + "loss": 1.6879, + "step": 19651 + }, + { + "epoch": 6.031921424186618, + "grad_norm": 0.3927764594554901, + "learning_rate": 3.591770786750301e-05, + "loss": 1.6824, + "step": 19652 + }, + { + "epoch": 6.032228360957642, + "grad_norm": 0.23609496653079987, + "learning_rate": 3.591293859329833e-05, + "loss": 1.7224, + "step": 19653 + }, + { + "epoch": 6.032535297728668, + "grad_norm": 0.40787333250045776, + "learning_rate": 3.590816945831023e-05, + "loss": 
1.7206, + "step": 19654 + }, + { + "epoch": 6.032842234499693, + "grad_norm": 0.31101885437965393, + "learning_rate": 3.590340046258586e-05, + "loss": 1.7446, + "step": 19655 + }, + { + "epoch": 6.033149171270718, + "grad_norm": 0.19401656091213226, + "learning_rate": 3.589863160617235e-05, + "loss": 1.6778, + "step": 19656 + }, + { + "epoch": 6.033456108041744, + "grad_norm": 0.3309115469455719, + "learning_rate": 3.589386288911684e-05, + "loss": 1.7196, + "step": 19657 + }, + { + "epoch": 6.033763044812768, + "grad_norm": 0.22281408309936523, + "learning_rate": 3.588909431146643e-05, + "loss": 1.7122, + "step": 19658 + }, + { + "epoch": 6.0340699815837935, + "grad_norm": 0.2903781831264496, + "learning_rate": 3.5884325873268275e-05, + "loss": 1.7428, + "step": 19659 + }, + { + "epoch": 6.034376918354819, + "grad_norm": 0.2529856562614441, + "learning_rate": 3.587955757456947e-05, + "loss": 1.7075, + "step": 19660 + }, + { + "epoch": 6.034683855125844, + "grad_norm": 0.2445102334022522, + "learning_rate": 3.587478941541716e-05, + "loss": 1.6631, + "step": 19661 + }, + { + "epoch": 6.0349907918968695, + "grad_norm": 0.31834688782691956, + "learning_rate": 3.5870021395858454e-05, + "loss": 1.7009, + "step": 19662 + }, + { + "epoch": 6.035297728667895, + "grad_norm": 0.20666317641735077, + "learning_rate": 3.5865253515940496e-05, + "loss": 1.7252, + "step": 19663 + }, + { + "epoch": 6.035604665438919, + "grad_norm": 0.3070019483566284, + "learning_rate": 3.586048577571039e-05, + "loss": 1.7139, + "step": 19664 + }, + { + "epoch": 6.035911602209945, + "grad_norm": 0.22463096678256989, + "learning_rate": 3.585571817521522e-05, + "loss": 1.7574, + "step": 19665 + }, + { + "epoch": 6.03621853898097, + "grad_norm": 0.25405722856521606, + "learning_rate": 3.585095071450216e-05, + "loss": 1.7135, + "step": 19666 + }, + { + "epoch": 6.036525475751995, + "grad_norm": 0.24543432891368866, + "learning_rate": 3.584618339361828e-05, + "loss": 1.7312, + "step": 19667 + }, + { + 
"epoch": 6.036832412523021, + "grad_norm": 0.2454189658164978, + "learning_rate": 3.584141621261073e-05, + "loss": 1.7905, + "step": 19668 + }, + { + "epoch": 6.037139349294045, + "grad_norm": 0.2163272649049759, + "learning_rate": 3.583664917152658e-05, + "loss": 1.7042, + "step": 19669 + }, + { + "epoch": 6.03744628606507, + "grad_norm": 0.2088690549135208, + "learning_rate": 3.5831882270412994e-05, + "loss": 1.7905, + "step": 19670 + }, + { + "epoch": 6.037753222836096, + "grad_norm": 0.26145869493484497, + "learning_rate": 3.5827115509317024e-05, + "loss": 1.7487, + "step": 19671 + }, + { + "epoch": 6.038060159607121, + "grad_norm": 0.20306496322155, + "learning_rate": 3.582234888828582e-05, + "loss": 1.7103, + "step": 19672 + }, + { + "epoch": 6.038367096378146, + "grad_norm": 0.2504192292690277, + "learning_rate": 3.5817582407366454e-05, + "loss": 1.7397, + "step": 19673 + }, + { + "epoch": 6.038674033149171, + "grad_norm": 0.22803208231925964, + "learning_rate": 3.5812816066606084e-05, + "loss": 1.7105, + "step": 19674 + }, + { + "epoch": 6.038980969920196, + "grad_norm": 0.24963071942329407, + "learning_rate": 3.580804986605176e-05, + "loss": 1.734, + "step": 19675 + }, + { + "epoch": 6.0392879066912215, + "grad_norm": 0.2468494027853012, + "learning_rate": 3.580328380575062e-05, + "loss": 1.6866, + "step": 19676 + }, + { + "epoch": 6.039594843462247, + "grad_norm": 0.17628586292266846, + "learning_rate": 3.579851788574973e-05, + "loss": 1.7106, + "step": 19677 + }, + { + "epoch": 6.039901780233272, + "grad_norm": 0.23965299129486084, + "learning_rate": 3.579375210609622e-05, + "loss": 1.7675, + "step": 19678 + }, + { + "epoch": 6.0402087170042975, + "grad_norm": 0.19638453423976898, + "learning_rate": 3.5788986466837175e-05, + "loss": 1.7242, + "step": 19679 + }, + { + "epoch": 6.040515653775322, + "grad_norm": 0.2602851092815399, + "learning_rate": 3.578422096801971e-05, + "loss": 1.7287, + "step": 19680 + }, + { + "epoch": 6.040822590546347, + 
"grad_norm": 0.25868186354637146, + "learning_rate": 3.577945560969091e-05, + "loss": 1.7604, + "step": 19681 + }, + { + "epoch": 6.041129527317373, + "grad_norm": 0.1996527463197708, + "learning_rate": 3.577469039189784e-05, + "loss": 1.7469, + "step": 19682 + }, + { + "epoch": 6.041436464088398, + "grad_norm": 0.29909980297088623, + "learning_rate": 3.576992531468763e-05, + "loss": 1.682, + "step": 19683 + }, + { + "epoch": 6.041743400859423, + "grad_norm": 0.20064286887645721, + "learning_rate": 3.576516037810734e-05, + "loss": 1.7125, + "step": 19684 + }, + { + "epoch": 6.042050337630448, + "grad_norm": 0.2134515345096588, + "learning_rate": 3.576039558220411e-05, + "loss": 1.7371, + "step": 19685 + }, + { + "epoch": 6.042357274401473, + "grad_norm": 0.20365437865257263, + "learning_rate": 3.575563092702497e-05, + "loss": 1.7446, + "step": 19686 + }, + { + "epoch": 6.042664211172498, + "grad_norm": 0.24526065587997437, + "learning_rate": 3.5750866412617054e-05, + "loss": 1.759, + "step": 19687 + }, + { + "epoch": 6.042971147943524, + "grad_norm": 0.24521295726299286, + "learning_rate": 3.5746102039027414e-05, + "loss": 1.7589, + "step": 19688 + }, + { + "epoch": 6.043278084714549, + "grad_norm": 0.2151515632867813, + "learning_rate": 3.5741337806303155e-05, + "loss": 1.761, + "step": 19689 + }, + { + "epoch": 6.043585021485574, + "grad_norm": 0.25733521580696106, + "learning_rate": 3.573657371449134e-05, + "loss": 1.7171, + "step": 19690 + }, + { + "epoch": 6.043891958256599, + "grad_norm": 0.18520839512348175, + "learning_rate": 3.5731809763639084e-05, + "loss": 1.6691, + "step": 19691 + }, + { + "epoch": 6.044198895027624, + "grad_norm": 0.24617944657802582, + "learning_rate": 3.572704595379342e-05, + "loss": 1.7869, + "step": 19692 + }, + { + "epoch": 6.0445058317986495, + "grad_norm": 0.20246629416942596, + "learning_rate": 3.5722282285001493e-05, + "loss": 1.7667, + "step": 19693 + }, + { + "epoch": 6.044812768569675, + "grad_norm": 0.21190209686756134, + 
"learning_rate": 3.5717518757310305e-05, + "loss": 1.6839, + "step": 19694 + }, + { + "epoch": 6.0451197053407, + "grad_norm": 0.19021087884902954, + "learning_rate": 3.571275537076699e-05, + "loss": 1.7023, + "step": 19695 + }, + { + "epoch": 6.045426642111725, + "grad_norm": 0.1793040931224823, + "learning_rate": 3.570799212541858e-05, + "loss": 1.7022, + "step": 19696 + }, + { + "epoch": 6.04573357888275, + "grad_norm": 0.19105301797389984, + "learning_rate": 3.570322902131219e-05, + "loss": 1.7151, + "step": 19697 + }, + { + "epoch": 6.046040515653775, + "grad_norm": 0.22083842754364014, + "learning_rate": 3.569846605849487e-05, + "loss": 1.7097, + "step": 19698 + }, + { + "epoch": 6.046347452424801, + "grad_norm": 0.2607622444629669, + "learning_rate": 3.569370323701368e-05, + "loss": 1.7508, + "step": 19699 + }, + { + "epoch": 6.046654389195826, + "grad_norm": 0.22349929809570312, + "learning_rate": 3.56889405569157e-05, + "loss": 1.7131, + "step": 19700 + }, + { + "epoch": 6.04696132596685, + "grad_norm": 0.19442661106586456, + "learning_rate": 3.5684178018247996e-05, + "loss": 1.7476, + "step": 19701 + }, + { + "epoch": 6.047268262737876, + "grad_norm": 0.2002776861190796, + "learning_rate": 3.5679415621057646e-05, + "loss": 1.7982, + "step": 19702 + }, + { + "epoch": 6.047575199508901, + "grad_norm": 0.21558646857738495, + "learning_rate": 3.567465336539169e-05, + "loss": 1.7231, + "step": 19703 + }, + { + "epoch": 6.047882136279926, + "grad_norm": 0.20468449592590332, + "learning_rate": 3.5669891251297224e-05, + "loss": 1.6426, + "step": 19704 + }, + { + "epoch": 6.048189073050952, + "grad_norm": 0.23098553717136383, + "learning_rate": 3.566512927882127e-05, + "loss": 1.7763, + "step": 19705 + }, + { + "epoch": 6.048496009821977, + "grad_norm": 0.22959274053573608, + "learning_rate": 3.566036744801092e-05, + "loss": 1.7663, + "step": 19706 + }, + { + "epoch": 6.0488029465930016, + "grad_norm": 0.18519435822963715, + "learning_rate": 
3.5655605758913215e-05, + "loss": 1.6995, + "step": 19707 + }, + { + "epoch": 6.049109883364027, + "grad_norm": 0.2529381513595581, + "learning_rate": 3.565084421157524e-05, + "loss": 1.754, + "step": 19708 + }, + { + "epoch": 6.049416820135052, + "grad_norm": 0.2208617776632309, + "learning_rate": 3.5646082806044015e-05, + "loss": 1.6939, + "step": 19709 + }, + { + "epoch": 6.0497237569060776, + "grad_norm": 0.18433862924575806, + "learning_rate": 3.564132154236663e-05, + "loss": 1.7145, + "step": 19710 + }, + { + "epoch": 6.050030693677103, + "grad_norm": 0.1963127702474594, + "learning_rate": 3.563656042059011e-05, + "loss": 1.7101, + "step": 19711 + }, + { + "epoch": 6.050337630448127, + "grad_norm": 0.19860461354255676, + "learning_rate": 3.5631799440761526e-05, + "loss": 1.7218, + "step": 19712 + }, + { + "epoch": 6.050644567219153, + "grad_norm": 0.19304174184799194, + "learning_rate": 3.5627038602927905e-05, + "loss": 1.7575, + "step": 19713 + }, + { + "epoch": 6.050951503990178, + "grad_norm": 0.20402809977531433, + "learning_rate": 3.5622277907136335e-05, + "loss": 1.7438, + "step": 19714 + }, + { + "epoch": 6.051258440761203, + "grad_norm": 0.20821911096572876, + "learning_rate": 3.5617517353433844e-05, + "loss": 1.7381, + "step": 19715 + }, + { + "epoch": 6.051565377532229, + "grad_norm": 0.24375931918621063, + "learning_rate": 3.561275694186745e-05, + "loss": 1.8377, + "step": 19716 + }, + { + "epoch": 6.051872314303253, + "grad_norm": 0.19745339453220367, + "learning_rate": 3.560799667248424e-05, + "loss": 1.6839, + "step": 19717 + }, + { + "epoch": 6.0521792510742785, + "grad_norm": 0.2039431631565094, + "learning_rate": 3.560323654533124e-05, + "loss": 1.692, + "step": 19718 + }, + { + "epoch": 6.052486187845304, + "grad_norm": 0.23229047656059265, + "learning_rate": 3.559847656045551e-05, + "loss": 1.7408, + "step": 19719 + }, + { + "epoch": 6.052793124616329, + "grad_norm": 0.20387259125709534, + "learning_rate": 3.559371671790404e-05, + "loss": 
1.7215, + "step": 19720 + }, + { + "epoch": 6.0531000613873545, + "grad_norm": 0.23960062861442566, + "learning_rate": 3.5588957017723944e-05, + "loss": 1.8048, + "step": 19721 + }, + { + "epoch": 6.05340699815838, + "grad_norm": 0.1979944109916687, + "learning_rate": 3.5584197459962196e-05, + "loss": 1.7307, + "step": 19722 + }, + { + "epoch": 6.053713934929404, + "grad_norm": 0.21914203464984894, + "learning_rate": 3.557943804466586e-05, + "loss": 1.6999, + "step": 19723 + }, + { + "epoch": 6.05402087170043, + "grad_norm": 0.22338175773620605, + "learning_rate": 3.557467877188197e-05, + "loss": 1.6977, + "step": 19724 + }, + { + "epoch": 6.054327808471455, + "grad_norm": 0.2692863643169403, + "learning_rate": 3.5569919641657576e-05, + "loss": 1.7664, + "step": 19725 + }, + { + "epoch": 6.05463474524248, + "grad_norm": 0.2882823944091797, + "learning_rate": 3.5565160654039675e-05, + "loss": 1.6943, + "step": 19726 + }, + { + "epoch": 6.054941682013506, + "grad_norm": 0.2114996612071991, + "learning_rate": 3.5560401809075336e-05, + "loss": 1.7426, + "step": 19727 + }, + { + "epoch": 6.05524861878453, + "grad_norm": 0.19616106152534485, + "learning_rate": 3.5555643106811546e-05, + "loss": 1.6616, + "step": 19728 + }, + { + "epoch": 6.055555555555555, + "grad_norm": 0.241346076130867, + "learning_rate": 3.555088454729537e-05, + "loss": 1.7423, + "step": 19729 + }, + { + "epoch": 6.055862492326581, + "grad_norm": 0.24495846033096313, + "learning_rate": 3.554612613057381e-05, + "loss": 1.7699, + "step": 19730 + }, + { + "epoch": 6.056169429097606, + "grad_norm": 0.233306422829628, + "learning_rate": 3.554136785669393e-05, + "loss": 1.7201, + "step": 19731 + }, + { + "epoch": 6.056476365868631, + "grad_norm": 0.23820927739143372, + "learning_rate": 3.553660972570272e-05, + "loss": 1.7694, + "step": 19732 + }, + { + "epoch": 6.056783302639656, + "grad_norm": 0.20664167404174805, + "learning_rate": 3.553185173764719e-05, + "loss": 1.7151, + "step": 19733 + }, + { + 
"epoch": 6.057090239410681, + "grad_norm": 0.22572578489780426, + "learning_rate": 3.5527093892574394e-05, + "loss": 1.7715, + "step": 19734 + }, + { + "epoch": 6.0573971761817065, + "grad_norm": 0.18554186820983887, + "learning_rate": 3.552233619053133e-05, + "loss": 1.7481, + "step": 19735 + }, + { + "epoch": 6.057704112952732, + "grad_norm": 0.2434636950492859, + "learning_rate": 3.551757863156504e-05, + "loss": 1.7992, + "step": 19736 + }, + { + "epoch": 6.058011049723757, + "grad_norm": 0.1949392408132553, + "learning_rate": 3.5512821215722514e-05, + "loss": 1.7439, + "step": 19737 + }, + { + "epoch": 6.0583179864947825, + "grad_norm": 0.2696731686592102, + "learning_rate": 3.55080639430508e-05, + "loss": 1.7092, + "step": 19738 + }, + { + "epoch": 6.058624923265807, + "grad_norm": 0.1963263303041458, + "learning_rate": 3.550330681359686e-05, + "loss": 1.6726, + "step": 19739 + }, + { + "epoch": 6.058931860036832, + "grad_norm": 0.20115122199058533, + "learning_rate": 3.549854982740776e-05, + "loss": 1.7459, + "step": 19740 + }, + { + "epoch": 6.059238796807858, + "grad_norm": 0.21378284692764282, + "learning_rate": 3.549379298453048e-05, + "loss": 1.7028, + "step": 19741 + }, + { + "epoch": 6.059545733578883, + "grad_norm": 0.21954336762428284, + "learning_rate": 3.5489036285012055e-05, + "loss": 1.7209, + "step": 19742 + }, + { + "epoch": 6.059852670349908, + "grad_norm": 0.20117704570293427, + "learning_rate": 3.548427972889946e-05, + "loss": 1.7273, + "step": 19743 + }, + { + "epoch": 6.060159607120933, + "grad_norm": 0.23786263167858124, + "learning_rate": 3.5479523316239745e-05, + "loss": 1.7519, + "step": 19744 + }, + { + "epoch": 6.060466543891958, + "grad_norm": 0.17704391479492188, + "learning_rate": 3.5474767047079864e-05, + "loss": 1.6644, + "step": 19745 + }, + { + "epoch": 6.060773480662983, + "grad_norm": 0.1883699744939804, + "learning_rate": 3.547001092146687e-05, + "loss": 1.6586, + "step": 19746 + }, + { + "epoch": 6.061080417434009, + 
"grad_norm": 0.19101519882678986, + "learning_rate": 3.546525493944773e-05, + "loss": 1.7575, + "step": 19747 + }, + { + "epoch": 6.061387354205034, + "grad_norm": 0.1924263834953308, + "learning_rate": 3.546049910106947e-05, + "loss": 1.743, + "step": 19748 + }, + { + "epoch": 6.0616942909760585, + "grad_norm": 0.1853020042181015, + "learning_rate": 3.5455743406379084e-05, + "loss": 1.7466, + "step": 19749 + }, + { + "epoch": 6.062001227747084, + "grad_norm": 0.21322499215602875, + "learning_rate": 3.545098785542355e-05, + "loss": 1.7625, + "step": 19750 + }, + { + "epoch": 6.062308164518109, + "grad_norm": 0.1567271500825882, + "learning_rate": 3.544623244824989e-05, + "loss": 1.6531, + "step": 19751 + }, + { + "epoch": 6.0626151012891345, + "grad_norm": 0.2125476449728012, + "learning_rate": 3.544147718490508e-05, + "loss": 1.7547, + "step": 19752 + }, + { + "epoch": 6.06292203806016, + "grad_norm": 0.19470059871673584, + "learning_rate": 3.543672206543615e-05, + "loss": 1.7327, + "step": 19753 + }, + { + "epoch": 6.063228974831185, + "grad_norm": 0.1690339744091034, + "learning_rate": 3.543196708989004e-05, + "loss": 1.6621, + "step": 19754 + }, + { + "epoch": 6.06353591160221, + "grad_norm": 0.17322230339050293, + "learning_rate": 3.54272122583138e-05, + "loss": 1.7018, + "step": 19755 + }, + { + "epoch": 6.063842848373235, + "grad_norm": 0.22174575924873352, + "learning_rate": 3.5422457570754365e-05, + "loss": 1.724, + "step": 19756 + }, + { + "epoch": 6.06414978514426, + "grad_norm": 0.20233364403247833, + "learning_rate": 3.541770302725875e-05, + "loss": 1.6518, + "step": 19757 + }, + { + "epoch": 6.064456721915286, + "grad_norm": 0.1585279405117035, + "learning_rate": 3.541294862787395e-05, + "loss": 1.6985, + "step": 19758 + }, + { + "epoch": 6.064763658686311, + "grad_norm": 0.2180105745792389, + "learning_rate": 3.540819437264694e-05, + "loss": 1.6728, + "step": 19759 + }, + { + "epoch": 6.065070595457335, + "grad_norm": 0.2295975238084793, + 
"learning_rate": 3.5403440261624696e-05, + "loss": 1.7566, + "step": 19760 + }, + { + "epoch": 6.065377532228361, + "grad_norm": 0.17460396885871887, + "learning_rate": 3.5398686294854234e-05, + "loss": 1.6977, + "step": 19761 + }, + { + "epoch": 6.065684468999386, + "grad_norm": 0.20828662812709808, + "learning_rate": 3.539393247238249e-05, + "loss": 1.7789, + "step": 19762 + }, + { + "epoch": 6.065991405770411, + "grad_norm": 0.2273385375738144, + "learning_rate": 3.5389178794256476e-05, + "loss": 1.7316, + "step": 19763 + }, + { + "epoch": 6.066298342541437, + "grad_norm": 0.2332257330417633, + "learning_rate": 3.538442526052316e-05, + "loss": 1.7355, + "step": 19764 + }, + { + "epoch": 6.066605279312462, + "grad_norm": 0.17953866720199585, + "learning_rate": 3.537967187122952e-05, + "loss": 1.7107, + "step": 19765 + }, + { + "epoch": 6.0669122160834865, + "grad_norm": 0.2334052473306656, + "learning_rate": 3.537491862642254e-05, + "loss": 1.7572, + "step": 19766 + }, + { + "epoch": 6.067219152854512, + "grad_norm": 0.2427968829870224, + "learning_rate": 3.5370165526149165e-05, + "loss": 1.7254, + "step": 19767 + }, + { + "epoch": 6.067526089625537, + "grad_norm": 0.2701692283153534, + "learning_rate": 3.53654125704564e-05, + "loss": 1.7525, + "step": 19768 + }, + { + "epoch": 6.0678330263965625, + "grad_norm": 0.3775569796562195, + "learning_rate": 3.536065975939121e-05, + "loss": 1.7516, + "step": 19769 + }, + { + "epoch": 6.068139963167588, + "grad_norm": 0.18971984088420868, + "learning_rate": 3.535590709300056e-05, + "loss": 1.6777, + "step": 19770 + }, + { + "epoch": 6.068446899938612, + "grad_norm": 0.2710094749927521, + "learning_rate": 3.535115457133141e-05, + "loss": 1.7612, + "step": 19771 + }, + { + "epoch": 6.068753836709638, + "grad_norm": 0.19414621591567993, + "learning_rate": 3.534640219443075e-05, + "loss": 1.6795, + "step": 19772 + }, + { + "epoch": 6.069060773480663, + "grad_norm": 0.2384893298149109, + "learning_rate": 3.534164996234552e-05, 
+ "loss": 1.7869, + "step": 19773 + }, + { + "epoch": 6.069367710251688, + "grad_norm": 0.2206166833639145, + "learning_rate": 3.533689787512271e-05, + "loss": 1.7332, + "step": 19774 + }, + { + "epoch": 6.069674647022714, + "grad_norm": 0.19740800559520721, + "learning_rate": 3.533214593280926e-05, + "loss": 1.6744, + "step": 19775 + }, + { + "epoch": 6.069981583793738, + "grad_norm": 0.2098212093114853, + "learning_rate": 3.532739413545214e-05, + "loss": 1.731, + "step": 19776 + }, + { + "epoch": 6.070288520564763, + "grad_norm": 0.2508943974971771, + "learning_rate": 3.5322642483098304e-05, + "loss": 1.7682, + "step": 19777 + }, + { + "epoch": 6.070595457335789, + "grad_norm": 0.22202368080615997, + "learning_rate": 3.531789097579474e-05, + "loss": 1.6965, + "step": 19778 + }, + { + "epoch": 6.070902394106814, + "grad_norm": 0.19276803731918335, + "learning_rate": 3.5313139613588355e-05, + "loss": 1.6855, + "step": 19779 + }, + { + "epoch": 6.071209330877839, + "grad_norm": 0.23910140991210938, + "learning_rate": 3.530838839652616e-05, + "loss": 1.8099, + "step": 19780 + }, + { + "epoch": 6.071516267648865, + "grad_norm": 0.19440437853336334, + "learning_rate": 3.530363732465506e-05, + "loss": 1.67, + "step": 19781 + }, + { + "epoch": 6.071823204419889, + "grad_norm": 0.1954154074192047, + "learning_rate": 3.529888639802204e-05, + "loss": 1.7154, + "step": 19782 + }, + { + "epoch": 6.0721301411909145, + "grad_norm": 0.20836392045021057, + "learning_rate": 3.529413561667405e-05, + "loss": 1.7451, + "step": 19783 + }, + { + "epoch": 6.07243707796194, + "grad_norm": 0.20521731674671173, + "learning_rate": 3.5289384980658016e-05, + "loss": 1.7008, + "step": 19784 + }, + { + "epoch": 6.072744014732965, + "grad_norm": 0.22885540127754211, + "learning_rate": 3.528463449002092e-05, + "loss": 1.7605, + "step": 19785 + }, + { + "epoch": 6.0730509515039905, + "grad_norm": 0.27740219235420227, + "learning_rate": 3.5279884144809664e-05, + "loss": 1.7816, + "step": 19786 + }, 
+ { + "epoch": 6.073357888275015, + "grad_norm": 0.24747557938098907, + "learning_rate": 3.527513394507124e-05, + "loss": 1.7207, + "step": 19787 + }, + { + "epoch": 6.07366482504604, + "grad_norm": 0.20127782225608826, + "learning_rate": 3.527038389085256e-05, + "loss": 1.702, + "step": 19788 + }, + { + "epoch": 6.073971761817066, + "grad_norm": 0.20683316886425018, + "learning_rate": 3.5265633982200595e-05, + "loss": 1.7022, + "step": 19789 + }, + { + "epoch": 6.074278698588091, + "grad_norm": 0.17829765379428864, + "learning_rate": 3.5260884219162256e-05, + "loss": 1.7099, + "step": 19790 + }, + { + "epoch": 6.074585635359116, + "grad_norm": 0.256964772939682, + "learning_rate": 3.525613460178452e-05, + "loss": 1.7226, + "step": 19791 + }, + { + "epoch": 6.074892572130141, + "grad_norm": 0.22840122878551483, + "learning_rate": 3.525138513011428e-05, + "loss": 1.7738, + "step": 19792 + }, + { + "epoch": 6.075199508901166, + "grad_norm": 0.18988655507564545, + "learning_rate": 3.52466358041985e-05, + "loss": 1.6775, + "step": 19793 + }, + { + "epoch": 6.0755064456721914, + "grad_norm": 0.21857139468193054, + "learning_rate": 3.524188662408411e-05, + "loss": 1.7596, + "step": 19794 + }, + { + "epoch": 6.075813382443217, + "grad_norm": 0.22910535335540771, + "learning_rate": 3.523713758981807e-05, + "loss": 1.7969, + "step": 19795 + }, + { + "epoch": 6.076120319214242, + "grad_norm": 0.20885716378688812, + "learning_rate": 3.523238870144726e-05, + "loss": 1.7407, + "step": 19796 + }, + { + "epoch": 6.0764272559852675, + "grad_norm": 0.2056209295988083, + "learning_rate": 3.5227639959018666e-05, + "loss": 1.759, + "step": 19797 + }, + { + "epoch": 6.076734192756292, + "grad_norm": 0.17485356330871582, + "learning_rate": 3.522289136257917e-05, + "loss": 1.6988, + "step": 19798 + }, + { + "epoch": 6.077041129527317, + "grad_norm": 0.2103404402732849, + "learning_rate": 3.521814291217573e-05, + "loss": 1.766, + "step": 19799 + }, + { + "epoch": 6.077348066298343, + 
"grad_norm": 0.21852105855941772, + "learning_rate": 3.521339460785528e-05, + "loss": 1.7435, + "step": 19800 + }, + { + "epoch": 6.077655003069368, + "grad_norm": 0.21578362584114075, + "learning_rate": 3.520864644966471e-05, + "loss": 1.7281, + "step": 19801 + }, + { + "epoch": 6.077961939840393, + "grad_norm": 0.20405036211013794, + "learning_rate": 3.520389843765099e-05, + "loss": 1.7367, + "step": 19802 + }, + { + "epoch": 6.078268876611418, + "grad_norm": 0.2578286826610565, + "learning_rate": 3.5199150571860996e-05, + "loss": 1.7625, + "step": 19803 + }, + { + "epoch": 6.078575813382443, + "grad_norm": 0.240324467420578, + "learning_rate": 3.519440285234168e-05, + "loss": 1.6979, + "step": 19804 + }, + { + "epoch": 6.078882750153468, + "grad_norm": 0.220765620470047, + "learning_rate": 3.5189655279139935e-05, + "loss": 1.7679, + "step": 19805 + }, + { + "epoch": 6.079189686924494, + "grad_norm": 0.2731996774673462, + "learning_rate": 3.518490785230273e-05, + "loss": 1.6723, + "step": 19806 + }, + { + "epoch": 6.079496623695519, + "grad_norm": 0.2593478262424469, + "learning_rate": 3.518016057187692e-05, + "loss": 1.7232, + "step": 19807 + }, + { + "epoch": 6.0798035604665435, + "grad_norm": 0.34642404317855835, + "learning_rate": 3.517541343790947e-05, + "loss": 1.8265, + "step": 19808 + }, + { + "epoch": 6.080110497237569, + "grad_norm": 0.3187299370765686, + "learning_rate": 3.5170666450447255e-05, + "loss": 1.6847, + "step": 19809 + }, + { + "epoch": 6.080417434008594, + "grad_norm": 0.20413202047348022, + "learning_rate": 3.5165919609537215e-05, + "loss": 1.6533, + "step": 19810 + }, + { + "epoch": 6.0807243707796195, + "grad_norm": 0.2753545343875885, + "learning_rate": 3.516117291522625e-05, + "loss": 1.7491, + "step": 19811 + }, + { + "epoch": 6.081031307550645, + "grad_norm": 0.20174793899059296, + "learning_rate": 3.515642636756128e-05, + "loss": 1.6902, + "step": 19812 + }, + { + "epoch": 6.08133824432167, + "grad_norm": 0.22567492723464966, + 
"learning_rate": 3.515167996658919e-05, + "loss": 1.7165, + "step": 19813 + }, + { + "epoch": 6.081645181092695, + "grad_norm": 0.2115732729434967, + "learning_rate": 3.514693371235692e-05, + "loss": 1.6888, + "step": 19814 + }, + { + "epoch": 6.08195211786372, + "grad_norm": 0.2141808122396469, + "learning_rate": 3.514218760491134e-05, + "loss": 1.7152, + "step": 19815 + }, + { + "epoch": 6.082259054634745, + "grad_norm": 0.19767558574676514, + "learning_rate": 3.513744164429938e-05, + "loss": 1.6926, + "step": 19816 + }, + { + "epoch": 6.082565991405771, + "grad_norm": 0.20220023393630981, + "learning_rate": 3.5132695830567944e-05, + "loss": 1.6727, + "step": 19817 + }, + { + "epoch": 6.082872928176796, + "grad_norm": 0.19589759409427643, + "learning_rate": 3.5127950163763896e-05, + "loss": 1.7545, + "step": 19818 + }, + { + "epoch": 6.08317986494782, + "grad_norm": 0.21303611993789673, + "learning_rate": 3.512320464393418e-05, + "loss": 1.753, + "step": 19819 + }, + { + "epoch": 6.083486801718846, + "grad_norm": 0.19438377022743225, + "learning_rate": 3.511845927112566e-05, + "loss": 1.7022, + "step": 19820 + }, + { + "epoch": 6.083793738489871, + "grad_norm": 0.21282976865768433, + "learning_rate": 3.511371404538526e-05, + "loss": 1.7099, + "step": 19821 + }, + { + "epoch": 6.084100675260896, + "grad_norm": 0.1874496042728424, + "learning_rate": 3.5108968966759846e-05, + "loss": 1.7033, + "step": 19822 + }, + { + "epoch": 6.084407612031922, + "grad_norm": 0.21199075877666473, + "learning_rate": 3.510422403529636e-05, + "loss": 1.7088, + "step": 19823 + }, + { + "epoch": 6.084714548802946, + "grad_norm": 0.21847110986709595, + "learning_rate": 3.5099479251041634e-05, + "loss": 1.7395, + "step": 19824 + }, + { + "epoch": 6.0850214855739715, + "grad_norm": 0.201395645737648, + "learning_rate": 3.509473461404261e-05, + "loss": 1.7522, + "step": 19825 + }, + { + "epoch": 6.085328422344997, + "grad_norm": 0.19637656211853027, + "learning_rate": 
3.5089990124346135e-05, + "loss": 1.6774, + "step": 19826 + }, + { + "epoch": 6.085635359116022, + "grad_norm": 0.25918442010879517, + "learning_rate": 3.5085245781999124e-05, + "loss": 1.7704, + "step": 19827 + }, + { + "epoch": 6.0859422958870475, + "grad_norm": 0.21271947026252747, + "learning_rate": 3.508050158704844e-05, + "loss": 1.6902, + "step": 19828 + }, + { + "epoch": 6.086249232658073, + "grad_norm": 0.2065698802471161, + "learning_rate": 3.5075757539541024e-05, + "loss": 1.7945, + "step": 19829 + }, + { + "epoch": 6.086556169429097, + "grad_norm": 0.20247824490070343, + "learning_rate": 3.5071013639523684e-05, + "loss": 1.7532, + "step": 19830 + }, + { + "epoch": 6.086863106200123, + "grad_norm": 0.19705431163311005, + "learning_rate": 3.506626988704336e-05, + "loss": 1.6353, + "step": 19831 + }, + { + "epoch": 6.087170042971148, + "grad_norm": 0.20158523321151733, + "learning_rate": 3.5061526282146886e-05, + "loss": 1.6596, + "step": 19832 + }, + { + "epoch": 6.087476979742173, + "grad_norm": 0.19492848217487335, + "learning_rate": 3.505678282488118e-05, + "loss": 1.7107, + "step": 19833 + }, + { + "epoch": 6.087783916513199, + "grad_norm": 0.2403736114501953, + "learning_rate": 3.505203951529312e-05, + "loss": 1.7456, + "step": 19834 + }, + { + "epoch": 6.088090853284223, + "grad_norm": 0.25649771094322205, + "learning_rate": 3.504729635342954e-05, + "loss": 1.7513, + "step": 19835 + }, + { + "epoch": 6.088397790055248, + "grad_norm": 0.20172113180160522, + "learning_rate": 3.504255333933736e-05, + "loss": 1.7737, + "step": 19836 + }, + { + "epoch": 6.088704726826274, + "grad_norm": 0.2715936303138733, + "learning_rate": 3.5037810473063414e-05, + "loss": 1.759, + "step": 19837 + }, + { + "epoch": 6.089011663597299, + "grad_norm": 0.23145076632499695, + "learning_rate": 3.503306775465461e-05, + "loss": 1.7811, + "step": 19838 + }, + { + "epoch": 6.089318600368324, + "grad_norm": 0.1953691691160202, + "learning_rate": 3.502832518415778e-05, + "loss": 
1.752, + "step": 19839 + }, + { + "epoch": 6.08962553713935, + "grad_norm": 0.1927584707736969, + "learning_rate": 3.502358276161986e-05, + "loss": 1.6865, + "step": 19840 + }, + { + "epoch": 6.089932473910374, + "grad_norm": 0.19294732809066772, + "learning_rate": 3.501884048708763e-05, + "loss": 1.6838, + "step": 19841 + }, + { + "epoch": 6.0902394106813995, + "grad_norm": 0.23351021111011505, + "learning_rate": 3.501409836060803e-05, + "loss": 1.8029, + "step": 19842 + }, + { + "epoch": 6.090546347452425, + "grad_norm": 0.21615718305110931, + "learning_rate": 3.5009356382227877e-05, + "loss": 1.7441, + "step": 19843 + }, + { + "epoch": 6.09085328422345, + "grad_norm": 0.19091549515724182, + "learning_rate": 3.500461455199405e-05, + "loss": 1.7056, + "step": 19844 + }, + { + "epoch": 6.0911602209944755, + "grad_norm": 0.21189090609550476, + "learning_rate": 3.499987286995341e-05, + "loss": 1.6853, + "step": 19845 + }, + { + "epoch": 6.0914671577655, + "grad_norm": 0.22545887529850006, + "learning_rate": 3.499513133615283e-05, + "loss": 1.7854, + "step": 19846 + }, + { + "epoch": 6.091774094536525, + "grad_norm": 0.21960650384426117, + "learning_rate": 3.4990389950639144e-05, + "loss": 1.7558, + "step": 19847 + }, + { + "epoch": 6.092081031307551, + "grad_norm": 0.20825782418251038, + "learning_rate": 3.4985648713459244e-05, + "loss": 1.7103, + "step": 19848 + }, + { + "epoch": 6.092387968078576, + "grad_norm": 0.20886415243148804, + "learning_rate": 3.498090762465993e-05, + "loss": 1.6897, + "step": 19849 + }, + { + "epoch": 6.092694904849601, + "grad_norm": 0.19306892156600952, + "learning_rate": 3.4976166684288115e-05, + "loss": 1.7506, + "step": 19850 + }, + { + "epoch": 6.093001841620626, + "grad_norm": 0.2178204357624054, + "learning_rate": 3.497142589239063e-05, + "loss": 1.6774, + "step": 19851 + }, + { + "epoch": 6.093308778391651, + "grad_norm": 0.1914307177066803, + "learning_rate": 3.4966685249014294e-05, + "loss": 1.7182, + "step": 19852 + }, + { + 
"epoch": 6.093615715162676, + "grad_norm": 0.22006092965602875, + "learning_rate": 3.496194475420602e-05, + "loss": 1.7209, + "step": 19853 + }, + { + "epoch": 6.093922651933702, + "grad_norm": 0.20621439814567566, + "learning_rate": 3.49572044080126e-05, + "loss": 1.7403, + "step": 19854 + }, + { + "epoch": 6.094229588704727, + "grad_norm": 0.24079272150993347, + "learning_rate": 3.495246421048091e-05, + "loss": 1.7619, + "step": 19855 + }, + { + "epoch": 6.094536525475752, + "grad_norm": 0.19073884189128876, + "learning_rate": 3.494772416165777e-05, + "loss": 1.6677, + "step": 19856 + }, + { + "epoch": 6.094843462246777, + "grad_norm": 0.18217229843139648, + "learning_rate": 3.494298426159007e-05, + "loss": 1.7162, + "step": 19857 + }, + { + "epoch": 6.095150399017802, + "grad_norm": 0.21901506185531616, + "learning_rate": 3.493824451032461e-05, + "loss": 1.7173, + "step": 19858 + }, + { + "epoch": 6.0954573357888275, + "grad_norm": 0.22156217694282532, + "learning_rate": 3.493350490790826e-05, + "loss": 1.8029, + "step": 19859 + }, + { + "epoch": 6.095764272559853, + "grad_norm": 0.1663675606250763, + "learning_rate": 3.4928765454387824e-05, + "loss": 1.7306, + "step": 19860 + }, + { + "epoch": 6.096071209330878, + "grad_norm": 0.19684657454490662, + "learning_rate": 3.4924026149810175e-05, + "loss": 1.6944, + "step": 19861 + }, + { + "epoch": 6.096378146101903, + "grad_norm": 0.19163468480110168, + "learning_rate": 3.4919286994222125e-05, + "loss": 1.7331, + "step": 19862 + }, + { + "epoch": 6.096685082872928, + "grad_norm": 0.20134083926677704, + "learning_rate": 3.491454798767054e-05, + "loss": 1.7365, + "step": 19863 + }, + { + "epoch": 6.096992019643953, + "grad_norm": 0.23877696692943573, + "learning_rate": 3.490980913020221e-05, + "loss": 1.753, + "step": 19864 + }, + { + "epoch": 6.097298956414979, + "grad_norm": 0.207699254155159, + "learning_rate": 3.490507042186402e-05, + "loss": 1.6835, + "step": 19865 + }, + { + "epoch": 6.097605893186004, + 
"grad_norm": 0.20608612895011902, + "learning_rate": 3.490033186270274e-05, + "loss": 1.7379, + "step": 19866 + }, + { + "epoch": 6.097912829957028, + "grad_norm": 0.25086313486099243, + "learning_rate": 3.489559345276524e-05, + "loss": 1.7692, + "step": 19867 + }, + { + "epoch": 6.098219766728054, + "grad_norm": 0.22025549411773682, + "learning_rate": 3.489085519209836e-05, + "loss": 1.6579, + "step": 19868 + }, + { + "epoch": 6.098526703499079, + "grad_norm": 0.23805730044841766, + "learning_rate": 3.4886117080748875e-05, + "loss": 1.7695, + "step": 19869 + }, + { + "epoch": 6.098833640270104, + "grad_norm": 0.23271869122982025, + "learning_rate": 3.4881379118763666e-05, + "loss": 1.7268, + "step": 19870 + }, + { + "epoch": 6.09914057704113, + "grad_norm": 0.21795618534088135, + "learning_rate": 3.4876641306189505e-05, + "loss": 1.6996, + "step": 19871 + }, + { + "epoch": 6.099447513812155, + "grad_norm": 0.22064761817455292, + "learning_rate": 3.487190364307326e-05, + "loss": 1.7032, + "step": 19872 + }, + { + "epoch": 6.0997544505831796, + "grad_norm": 0.23834183812141418, + "learning_rate": 3.4867166129461706e-05, + "loss": 1.6942, + "step": 19873 + }, + { + "epoch": 6.100061387354205, + "grad_norm": 0.21143686771392822, + "learning_rate": 3.486242876540171e-05, + "loss": 1.6904, + "step": 19874 + }, + { + "epoch": 6.10036832412523, + "grad_norm": 0.18099969625473022, + "learning_rate": 3.485769155094004e-05, + "loss": 1.6669, + "step": 19875 + }, + { + "epoch": 6.100675260896256, + "grad_norm": 0.25324884057044983, + "learning_rate": 3.4852954486123566e-05, + "loss": 1.7878, + "step": 19876 + }, + { + "epoch": 6.100982197667281, + "grad_norm": 0.2252139449119568, + "learning_rate": 3.4848217570999055e-05, + "loss": 1.7674, + "step": 19877 + }, + { + "epoch": 6.101289134438305, + "grad_norm": 0.19629882276058197, + "learning_rate": 3.4843480805613346e-05, + "loss": 1.6898, + "step": 19878 + }, + { + "epoch": 6.101596071209331, + "grad_norm": 
0.1858786642551422, + "learning_rate": 3.483874419001323e-05, + "loss": 1.6856, + "step": 19879 + }, + { + "epoch": 6.101903007980356, + "grad_norm": 0.1842946857213974, + "learning_rate": 3.483400772424555e-05, + "loss": 1.7229, + "step": 19880 + }, + { + "epoch": 6.102209944751381, + "grad_norm": 0.18981511890888214, + "learning_rate": 3.482927140835708e-05, + "loss": 1.75, + "step": 19881 + }, + { + "epoch": 6.102516881522407, + "grad_norm": 0.19914525747299194, + "learning_rate": 3.482453524239466e-05, + "loss": 1.7702, + "step": 19882 + }, + { + "epoch": 6.102823818293431, + "grad_norm": 0.1960345208644867, + "learning_rate": 3.481979922640507e-05, + "loss": 1.7189, + "step": 19883 + }, + { + "epoch": 6.1031307550644565, + "grad_norm": 0.20309221744537354, + "learning_rate": 3.48150633604351e-05, + "loss": 1.7888, + "step": 19884 + }, + { + "epoch": 6.103437691835482, + "grad_norm": 0.20090891420841217, + "learning_rate": 3.48103276445316e-05, + "loss": 1.8017, + "step": 19885 + }, + { + "epoch": 6.103744628606507, + "grad_norm": 0.22500385344028473, + "learning_rate": 3.480559207874133e-05, + "loss": 1.7061, + "step": 19886 + }, + { + "epoch": 6.1040515653775325, + "grad_norm": 0.22594885528087616, + "learning_rate": 3.480085666311113e-05, + "loss": 1.7659, + "step": 19887 + }, + { + "epoch": 6.104358502148558, + "grad_norm": 0.2769651710987091, + "learning_rate": 3.479612139768774e-05, + "loss": 1.7668, + "step": 19888 + }, + { + "epoch": 6.104665438919582, + "grad_norm": 0.24251700937747955, + "learning_rate": 3.4791386282518e-05, + "loss": 1.8068, + "step": 19889 + }, + { + "epoch": 6.104972375690608, + "grad_norm": 0.23325790464878082, + "learning_rate": 3.478665131764869e-05, + "loss": 1.7116, + "step": 19890 + }, + { + "epoch": 6.105279312461633, + "grad_norm": 0.19998812675476074, + "learning_rate": 3.478191650312663e-05, + "loss": 1.7116, + "step": 19891 + }, + { + "epoch": 6.105586249232658, + "grad_norm": 0.20933640003204346, + "learning_rate": 
3.4777181838998566e-05, + "loss": 1.7138, + "step": 19892 + }, + { + "epoch": 6.105893186003684, + "grad_norm": 0.24344035983085632, + "learning_rate": 3.477244732531134e-05, + "loss": 1.784, + "step": 19893 + }, + { + "epoch": 6.106200122774708, + "grad_norm": 0.2220575362443924, + "learning_rate": 3.4767712962111686e-05, + "loss": 1.7479, + "step": 19894 + }, + { + "epoch": 6.106507059545733, + "grad_norm": 0.2222832590341568, + "learning_rate": 3.476297874944644e-05, + "loss": 1.7278, + "step": 19895 + }, + { + "epoch": 6.106813996316759, + "grad_norm": 0.222265362739563, + "learning_rate": 3.4758244687362353e-05, + "loss": 1.7321, + "step": 19896 + }, + { + "epoch": 6.107120933087784, + "grad_norm": 0.2921304702758789, + "learning_rate": 3.475351077590625e-05, + "loss": 1.7848, + "step": 19897 + }, + { + "epoch": 6.107427869858809, + "grad_norm": 0.21015208959579468, + "learning_rate": 3.4748777015124856e-05, + "loss": 1.7987, + "step": 19898 + }, + { + "epoch": 6.107734806629834, + "grad_norm": 0.19510969519615173, + "learning_rate": 3.474404340506502e-05, + "loss": 1.7317, + "step": 19899 + }, + { + "epoch": 6.108041743400859, + "grad_norm": 0.21978609263896942, + "learning_rate": 3.473930994577348e-05, + "loss": 1.6943, + "step": 19900 + }, + { + "epoch": 6.1083486801718845, + "grad_norm": 0.1793510913848877, + "learning_rate": 3.4734576637297004e-05, + "loss": 1.6659, + "step": 19901 + }, + { + "epoch": 6.10865561694291, + "grad_norm": 0.2029319554567337, + "learning_rate": 3.4729843479682414e-05, + "loss": 1.7127, + "step": 19902 + }, + { + "epoch": 6.108962553713935, + "grad_norm": 0.2001914530992508, + "learning_rate": 3.472511047297644e-05, + "loss": 1.691, + "step": 19903 + }, + { + "epoch": 6.1092694904849605, + "grad_norm": 0.2194693237543106, + "learning_rate": 3.47203776172259e-05, + "loss": 1.7181, + "step": 19904 + }, + { + "epoch": 6.109576427255985, + "grad_norm": 0.1865277737379074, + "learning_rate": 3.4715644912477515e-05, + "loss": 1.6786, 
+ "step": 19905 + }, + { + "epoch": 6.10988336402701, + "grad_norm": 0.20574906468391418, + "learning_rate": 3.471091235877811e-05, + "loss": 1.7681, + "step": 19906 + }, + { + "epoch": 6.110190300798036, + "grad_norm": 0.21072493493556976, + "learning_rate": 3.470617995617441e-05, + "loss": 1.7494, + "step": 19907 + }, + { + "epoch": 6.110497237569061, + "grad_norm": 0.2411658763885498, + "learning_rate": 3.470144770471323e-05, + "loss": 1.7183, + "step": 19908 + }, + { + "epoch": 6.110804174340086, + "grad_norm": 0.19782759249210358, + "learning_rate": 3.4696715604441285e-05, + "loss": 1.6823, + "step": 19909 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.315026193857193, + "learning_rate": 3.469198365540539e-05, + "loss": 1.691, + "step": 19910 + }, + { + "epoch": 6.111418047882136, + "grad_norm": 0.19840773940086365, + "learning_rate": 3.468725185765226e-05, + "loss": 1.7413, + "step": 19911 + }, + { + "epoch": 6.111724984653161, + "grad_norm": 0.1813160926103592, + "learning_rate": 3.46825202112287e-05, + "loss": 1.7095, + "step": 19912 + }, + { + "epoch": 6.112031921424187, + "grad_norm": 0.21025459468364716, + "learning_rate": 3.467778871618145e-05, + "loss": 1.7783, + "step": 19913 + }, + { + "epoch": 6.112338858195212, + "grad_norm": 0.20088298618793488, + "learning_rate": 3.4673057372557265e-05, + "loss": 1.7671, + "step": 19914 + }, + { + "epoch": 6.112645794966237, + "grad_norm": 0.21919472515583038, + "learning_rate": 3.466832618040291e-05, + "loss": 1.7052, + "step": 19915 + }, + { + "epoch": 6.112952731737262, + "grad_norm": 0.19135436415672302, + "learning_rate": 3.466359513976516e-05, + "loss": 1.7862, + "step": 19916 + }, + { + "epoch": 6.113259668508287, + "grad_norm": 0.19943594932556152, + "learning_rate": 3.465886425069074e-05, + "loss": 1.6926, + "step": 19917 + }, + { + "epoch": 6.1135666052793125, + "grad_norm": 0.19390980899333954, + "learning_rate": 3.46541335132264e-05, + "loss": 1.761, + "step": 19918 + }, + { + "epoch": 
6.113873542050338, + "grad_norm": 0.22745995223522186, + "learning_rate": 3.4649402927418935e-05, + "loss": 1.7147, + "step": 19919 + }, + { + "epoch": 6.114180478821363, + "grad_norm": 0.17792920768260956, + "learning_rate": 3.4644672493315045e-05, + "loss": 1.6946, + "step": 19920 + }, + { + "epoch": 6.114487415592388, + "grad_norm": 0.2009986788034439, + "learning_rate": 3.463994221096152e-05, + "loss": 1.6977, + "step": 19921 + }, + { + "epoch": 6.114794352363413, + "grad_norm": 0.2448386251926422, + "learning_rate": 3.4635212080405066e-05, + "loss": 1.7169, + "step": 19922 + }, + { + "epoch": 6.115101289134438, + "grad_norm": 0.21506112813949585, + "learning_rate": 3.463048210169247e-05, + "loss": 1.6632, + "step": 19923 + }, + { + "epoch": 6.115408225905464, + "grad_norm": 0.1805233359336853, + "learning_rate": 3.462575227487045e-05, + "loss": 1.6742, + "step": 19924 + }, + { + "epoch": 6.115715162676489, + "grad_norm": 0.20023848116397858, + "learning_rate": 3.4621022599985766e-05, + "loss": 1.7106, + "step": 19925 + }, + { + "epoch": 6.116022099447513, + "grad_norm": 0.20388077199459076, + "learning_rate": 3.461629307708513e-05, + "loss": 1.7065, + "step": 19926 + }, + { + "epoch": 6.116329036218539, + "grad_norm": 0.23886005580425262, + "learning_rate": 3.461156370621533e-05, + "loss": 1.7177, + "step": 19927 + }, + { + "epoch": 6.116635972989564, + "grad_norm": 0.2054048627614975, + "learning_rate": 3.460683448742306e-05, + "loss": 1.6773, + "step": 19928 + }, + { + "epoch": 6.116942909760589, + "grad_norm": 0.1909634917974472, + "learning_rate": 3.460210542075508e-05, + "loss": 1.7562, + "step": 19929 + }, + { + "epoch": 6.117249846531615, + "grad_norm": 0.20221595466136932, + "learning_rate": 3.459737650625812e-05, + "loss": 1.7948, + "step": 19930 + }, + { + "epoch": 6.11755678330264, + "grad_norm": 0.25445356965065, + "learning_rate": 3.459264774397891e-05, + "loss": 1.7964, + "step": 19931 + }, + { + "epoch": 6.1178637200736645, + "grad_norm": 
0.2227735072374344, + "learning_rate": 3.4587919133964176e-05, + "loss": 1.7833, + "step": 19932 + }, + { + "epoch": 6.11817065684469, + "grad_norm": 0.20591853559017181, + "learning_rate": 3.458319067626068e-05, + "loss": 1.7535, + "step": 19933 + }, + { + "epoch": 6.118477593615715, + "grad_norm": 0.22087402641773224, + "learning_rate": 3.4578462370915115e-05, + "loss": 1.7228, + "step": 19934 + }, + { + "epoch": 6.1187845303867405, + "grad_norm": 0.234156996011734, + "learning_rate": 3.457373421797423e-05, + "loss": 1.7167, + "step": 19935 + }, + { + "epoch": 6.119091467157766, + "grad_norm": 0.209685817360878, + "learning_rate": 3.4569006217484746e-05, + "loss": 1.6633, + "step": 19936 + }, + { + "epoch": 6.11939840392879, + "grad_norm": 0.18499237298965454, + "learning_rate": 3.4564278369493366e-05, + "loss": 1.6769, + "step": 19937 + }, + { + "epoch": 6.119705340699816, + "grad_norm": 0.2600767910480499, + "learning_rate": 3.455955067404686e-05, + "loss": 1.7788, + "step": 19938 + }, + { + "epoch": 6.120012277470841, + "grad_norm": 0.21499377489089966, + "learning_rate": 3.455482313119191e-05, + "loss": 1.789, + "step": 19939 + }, + { + "epoch": 6.120319214241866, + "grad_norm": 0.19618432223796844, + "learning_rate": 3.455009574097527e-05, + "loss": 1.7162, + "step": 19940 + }, + { + "epoch": 6.120626151012892, + "grad_norm": 0.23219916224479675, + "learning_rate": 3.4545368503443616e-05, + "loss": 1.7871, + "step": 19941 + }, + { + "epoch": 6.120933087783916, + "grad_norm": 0.22315794229507446, + "learning_rate": 3.45406414186437e-05, + "loss": 1.6944, + "step": 19942 + }, + { + "epoch": 6.121240024554941, + "grad_norm": 0.22536693513393402, + "learning_rate": 3.453591448662221e-05, + "loss": 1.7727, + "step": 19943 + }, + { + "epoch": 6.121546961325967, + "grad_norm": 0.21811100840568542, + "learning_rate": 3.45311877074259e-05, + "loss": 1.7037, + "step": 19944 + }, + { + "epoch": 6.121853898096992, + "grad_norm": 0.1957094967365265, + "learning_rate": 
3.452646108110145e-05, + "loss": 1.7734, + "step": 19945 + }, + { + "epoch": 6.122160834868017, + "grad_norm": 0.185706228017807, + "learning_rate": 3.452173460769559e-05, + "loss": 1.6715, + "step": 19946 + }, + { + "epoch": 6.122467771639043, + "grad_norm": 0.21081562340259552, + "learning_rate": 3.4517008287255005e-05, + "loss": 1.7798, + "step": 19947 + }, + { + "epoch": 6.122774708410067, + "grad_norm": 0.24175535142421722, + "learning_rate": 3.451228211982642e-05, + "loss": 1.7111, + "step": 19948 + }, + { + "epoch": 6.1230816451810925, + "grad_norm": 0.244124636054039, + "learning_rate": 3.450755610545654e-05, + "loss": 1.7263, + "step": 19949 + }, + { + "epoch": 6.123388581952118, + "grad_norm": 0.21109984815120697, + "learning_rate": 3.45028302441921e-05, + "loss": 1.7556, + "step": 19950 + }, + { + "epoch": 6.123695518723143, + "grad_norm": 0.21721722185611725, + "learning_rate": 3.449810453607976e-05, + "loss": 1.7416, + "step": 19951 + }, + { + "epoch": 6.1240024554941686, + "grad_norm": 0.18695317208766937, + "learning_rate": 3.4493378981166216e-05, + "loss": 1.7128, + "step": 19952 + }, + { + "epoch": 6.124309392265193, + "grad_norm": 0.19175554811954498, + "learning_rate": 3.4488653579498206e-05, + "loss": 1.7014, + "step": 19953 + }, + { + "epoch": 6.124616329036218, + "grad_norm": 0.22297006845474243, + "learning_rate": 3.4483928331122405e-05, + "loss": 1.7231, + "step": 19954 + }, + { + "epoch": 6.124923265807244, + "grad_norm": 0.2407974898815155, + "learning_rate": 3.447920323608553e-05, + "loss": 1.7354, + "step": 19955 + }, + { + "epoch": 6.125230202578269, + "grad_norm": 0.19767232239246368, + "learning_rate": 3.447447829443425e-05, + "loss": 1.7487, + "step": 19956 + }, + { + "epoch": 6.125537139349294, + "grad_norm": 0.20033477246761322, + "learning_rate": 3.446975350621529e-05, + "loss": 1.7232, + "step": 19957 + }, + { + "epoch": 6.12584407612032, + "grad_norm": 0.20310243964195251, + "learning_rate": 3.446502887147532e-05, + "loss": 
1.6946, + "step": 19958 + }, + { + "epoch": 6.126151012891344, + "grad_norm": 0.2322724461555481, + "learning_rate": 3.446030439026104e-05, + "loss": 1.7071, + "step": 19959 + }, + { + "epoch": 6.1264579496623695, + "grad_norm": 0.24134255945682526, + "learning_rate": 3.445558006261914e-05, + "loss": 1.7259, + "step": 19960 + }, + { + "epoch": 6.126764886433395, + "grad_norm": 0.22821731865406036, + "learning_rate": 3.445085588859632e-05, + "loss": 1.7488, + "step": 19961 + }, + { + "epoch": 6.12707182320442, + "grad_norm": 0.258241206407547, + "learning_rate": 3.444613186823924e-05, + "loss": 1.7403, + "step": 19962 + }, + { + "epoch": 6.1273787599754455, + "grad_norm": 0.18758481740951538, + "learning_rate": 3.4441408001594625e-05, + "loss": 1.7079, + "step": 19963 + }, + { + "epoch": 6.12768569674647, + "grad_norm": 0.24032682180404663, + "learning_rate": 3.443668428870911e-05, + "loss": 1.7377, + "step": 19964 + }, + { + "epoch": 6.127992633517495, + "grad_norm": 0.24468545615673065, + "learning_rate": 3.4431960729629406e-05, + "loss": 1.7724, + "step": 19965 + }, + { + "epoch": 6.128299570288521, + "grad_norm": 0.23840154707431793, + "learning_rate": 3.4427237324402197e-05, + "loss": 1.7813, + "step": 19966 + }, + { + "epoch": 6.128606507059546, + "grad_norm": 0.2476109117269516, + "learning_rate": 3.4422514073074165e-05, + "loss": 1.7578, + "step": 19967 + }, + { + "epoch": 6.128913443830571, + "grad_norm": 0.2109041064977646, + "learning_rate": 3.4417790975691974e-05, + "loss": 1.6917, + "step": 19968 + }, + { + "epoch": 6.129220380601596, + "grad_norm": 0.21841584146022797, + "learning_rate": 3.4413068032302296e-05, + "loss": 1.7511, + "step": 19969 + }, + { + "epoch": 6.129527317372621, + "grad_norm": 0.2111930102109909, + "learning_rate": 3.440834524295182e-05, + "loss": 1.7194, + "step": 19970 + }, + { + "epoch": 6.129834254143646, + "grad_norm": 0.21868006885051727, + "learning_rate": 3.440362260768721e-05, + "loss": 1.7933, + "step": 19971 + }, + { + 
"epoch": 6.130141190914672, + "grad_norm": 0.19846780598163605, + "learning_rate": 3.439890012655516e-05, + "loss": 1.6985, + "step": 19972 + }, + { + "epoch": 6.130448127685697, + "grad_norm": 0.218460813164711, + "learning_rate": 3.439417779960231e-05, + "loss": 1.7205, + "step": 19973 + }, + { + "epoch": 6.1307550644567215, + "grad_norm": 0.22504402697086334, + "learning_rate": 3.438945562687535e-05, + "loss": 1.7437, + "step": 19974 + }, + { + "epoch": 6.131062001227747, + "grad_norm": 0.35414671897888184, + "learning_rate": 3.438473360842093e-05, + "loss": 1.7641, + "step": 19975 + }, + { + "epoch": 6.131368937998772, + "grad_norm": 0.21090710163116455, + "learning_rate": 3.4380011744285726e-05, + "loss": 1.6817, + "step": 19976 + }, + { + "epoch": 6.1316758747697975, + "grad_norm": 0.19118748605251312, + "learning_rate": 3.437529003451639e-05, + "loss": 1.694, + "step": 19977 + }, + { + "epoch": 6.131982811540823, + "grad_norm": 0.2341139018535614, + "learning_rate": 3.437056847915962e-05, + "loss": 1.781, + "step": 19978 + }, + { + "epoch": 6.132289748311848, + "grad_norm": 0.19120962917804718, + "learning_rate": 3.4365847078262033e-05, + "loss": 1.6974, + "step": 19979 + }, + { + "epoch": 6.132596685082873, + "grad_norm": 0.1998066008090973, + "learning_rate": 3.436112583187033e-05, + "loss": 1.6933, + "step": 19980 + }, + { + "epoch": 6.132903621853898, + "grad_norm": 0.19839663803577423, + "learning_rate": 3.4356404740031123e-05, + "loss": 1.6867, + "step": 19981 + }, + { + "epoch": 6.133210558624923, + "grad_norm": 0.19892877340316772, + "learning_rate": 3.4351683802791114e-05, + "loss": 1.7349, + "step": 19982 + }, + { + "epoch": 6.133517495395949, + "grad_norm": 0.23215502500534058, + "learning_rate": 3.434696302019692e-05, + "loss": 1.7411, + "step": 19983 + }, + { + "epoch": 6.133824432166974, + "grad_norm": 0.21246971189975739, + "learning_rate": 3.4342242392295225e-05, + "loss": 1.6918, + "step": 19984 + }, + { + "epoch": 6.134131368937998, + 
"grad_norm": 0.18585935235023499, + "learning_rate": 3.4337521919132675e-05, + "loss": 1.71, + "step": 19985 + }, + { + "epoch": 6.134438305709024, + "grad_norm": 0.24194715917110443, + "learning_rate": 3.4332801600755896e-05, + "loss": 1.7314, + "step": 19986 + }, + { + "epoch": 6.134745242480049, + "grad_norm": 0.19925665855407715, + "learning_rate": 3.432808143721156e-05, + "loss": 1.7425, + "step": 19987 + }, + { + "epoch": 6.135052179251074, + "grad_norm": 0.22253449261188507, + "learning_rate": 3.43233614285463e-05, + "loss": 1.702, + "step": 19988 + }, + { + "epoch": 6.1353591160221, + "grad_norm": 0.22180478274822235, + "learning_rate": 3.4318641574806796e-05, + "loss": 1.6659, + "step": 19989 + }, + { + "epoch": 6.135666052793125, + "grad_norm": 0.19818264245986938, + "learning_rate": 3.431392187603964e-05, + "loss": 1.8057, + "step": 19990 + }, + { + "epoch": 6.1359729895641495, + "grad_norm": 0.34630170464515686, + "learning_rate": 3.4309202332291526e-05, + "loss": 1.7233, + "step": 19991 + }, + { + "epoch": 6.136279926335175, + "grad_norm": 0.2633006274700165, + "learning_rate": 3.430448294360905e-05, + "loss": 1.7421, + "step": 19992 + }, + { + "epoch": 6.1365868631062, + "grad_norm": 0.1976388394832611, + "learning_rate": 3.429976371003888e-05, + "loss": 1.7474, + "step": 19993 + }, + { + "epoch": 6.1368937998772255, + "grad_norm": 0.2386583834886551, + "learning_rate": 3.429504463162764e-05, + "loss": 1.7026, + "step": 19994 + }, + { + "epoch": 6.137200736648251, + "grad_norm": 0.20853812992572784, + "learning_rate": 3.4290325708422e-05, + "loss": 1.7846, + "step": 19995 + }, + { + "epoch": 6.137507673419275, + "grad_norm": 0.24667194485664368, + "learning_rate": 3.428560694046854e-05, + "loss": 1.6446, + "step": 19996 + }, + { + "epoch": 6.137814610190301, + "grad_norm": 0.24396342039108276, + "learning_rate": 3.428088832781394e-05, + "loss": 1.7368, + "step": 19997 + }, + { + "epoch": 6.138121546961326, + "grad_norm": 0.1958172619342804, + 
"learning_rate": 3.4276169870504804e-05, + "loss": 1.7197, + "step": 19998 + }, + { + "epoch": 6.138428483732351, + "grad_norm": 0.21487464010715485, + "learning_rate": 3.427145156858778e-05, + "loss": 1.7318, + "step": 19999 + }, + { + "epoch": 6.138735420503377, + "grad_norm": 0.2152775675058365, + "learning_rate": 3.4266733422109476e-05, + "loss": 1.7924, + "step": 20000 + }, + { + "epoch": 6.139042357274401, + "grad_norm": 0.17151346802711487, + "learning_rate": 3.426201543111656e-05, + "loss": 1.6915, + "step": 20001 + }, + { + "epoch": 6.139349294045426, + "grad_norm": 0.22197338938713074, + "learning_rate": 3.425729759565563e-05, + "loss": 1.8028, + "step": 20002 + }, + { + "epoch": 6.139656230816452, + "grad_norm": 0.23111973702907562, + "learning_rate": 3.42525799157733e-05, + "loss": 1.7515, + "step": 20003 + }, + { + "epoch": 6.139963167587477, + "grad_norm": 0.2829805314540863, + "learning_rate": 3.42478623915162e-05, + "loss": 1.8379, + "step": 20004 + }, + { + "epoch": 6.140270104358502, + "grad_norm": 0.23467600345611572, + "learning_rate": 3.424314502293096e-05, + "loss": 1.7755, + "step": 20005 + }, + { + "epoch": 6.140577041129528, + "grad_norm": 0.2047930657863617, + "learning_rate": 3.42384278100642e-05, + "loss": 1.7198, + "step": 20006 + }, + { + "epoch": 6.140883977900552, + "grad_norm": 0.1893673986196518, + "learning_rate": 3.423371075296253e-05, + "loss": 1.7318, + "step": 20007 + }, + { + "epoch": 6.1411909146715775, + "grad_norm": 0.21514710783958435, + "learning_rate": 3.422899385167259e-05, + "loss": 1.7499, + "step": 20008 + }, + { + "epoch": 6.141497851442603, + "grad_norm": 0.20030297338962555, + "learning_rate": 3.422427710624095e-05, + "loss": 1.7109, + "step": 20009 + }, + { + "epoch": 6.141804788213628, + "grad_norm": 0.23581266403198242, + "learning_rate": 3.421956051671426e-05, + "loss": 1.7834, + "step": 20010 + }, + { + "epoch": 6.1421117249846535, + "grad_norm": 0.22492484748363495, + "learning_rate": 3.421484408313911e-05, 
+ "loss": 1.785, + "step": 20011 + }, + { + "epoch": 6.142418661755678, + "grad_norm": 0.34137019515037537, + "learning_rate": 3.421012780556215e-05, + "loss": 1.8101, + "step": 20012 + }, + { + "epoch": 6.142725598526703, + "grad_norm": 0.28489169478416443, + "learning_rate": 3.420541168402994e-05, + "loss": 1.7945, + "step": 20013 + }, + { + "epoch": 6.143032535297729, + "grad_norm": 0.259362131357193, + "learning_rate": 3.420069571858913e-05, + "loss": 1.7011, + "step": 20014 + }, + { + "epoch": 6.143339472068754, + "grad_norm": 0.3628309667110443, + "learning_rate": 3.419597990928628e-05, + "loss": 1.8273, + "step": 20015 + }, + { + "epoch": 6.143646408839779, + "grad_norm": 0.22306841611862183, + "learning_rate": 3.419126425616803e-05, + "loss": 1.7447, + "step": 20016 + }, + { + "epoch": 6.143953345610804, + "grad_norm": 0.36336812376976013, + "learning_rate": 3.4186548759280964e-05, + "loss": 1.7076, + "step": 20017 + }, + { + "epoch": 6.144260282381829, + "grad_norm": 0.23167413473129272, + "learning_rate": 3.418183341867172e-05, + "loss": 1.6924, + "step": 20018 + }, + { + "epoch": 6.144567219152854, + "grad_norm": 0.2541113495826721, + "learning_rate": 3.417711823438686e-05, + "loss": 1.755, + "step": 20019 + }, + { + "epoch": 6.14487415592388, + "grad_norm": 0.3733784854412079, + "learning_rate": 3.4172403206472975e-05, + "loss": 1.7087, + "step": 20020 + }, + { + "epoch": 6.145181092694905, + "grad_norm": 0.1940508335828781, + "learning_rate": 3.416768833497669e-05, + "loss": 1.717, + "step": 20021 + }, + { + "epoch": 6.14548802946593, + "grad_norm": 0.2707524001598358, + "learning_rate": 3.416297361994457e-05, + "loss": 1.7422, + "step": 20022 + }, + { + "epoch": 6.145794966236955, + "grad_norm": 0.25535452365875244, + "learning_rate": 3.415825906142326e-05, + "loss": 1.6915, + "step": 20023 + }, + { + "epoch": 6.14610190300798, + "grad_norm": 0.24094220995903015, + "learning_rate": 3.415354465945929e-05, + "loss": 1.7192, + "step": 20024 + }, + { + 
"epoch": 6.1464088397790055, + "grad_norm": 0.28329676389694214, + "learning_rate": 3.4148830414099306e-05, + "loss": 1.7272, + "step": 20025 + }, + { + "epoch": 6.146715776550031, + "grad_norm": 0.217180535197258, + "learning_rate": 3.414411632538984e-05, + "loss": 1.7195, + "step": 20026 + }, + { + "epoch": 6.147022713321056, + "grad_norm": 0.22693867981433868, + "learning_rate": 3.413940239337753e-05, + "loss": 1.6889, + "step": 20027 + }, + { + "epoch": 6.147329650092081, + "grad_norm": 0.30376315116882324, + "learning_rate": 3.413468861810892e-05, + "loss": 1.7741, + "step": 20028 + }, + { + "epoch": 6.147636586863106, + "grad_norm": 0.1928185671567917, + "learning_rate": 3.412997499963065e-05, + "loss": 1.6986, + "step": 20029 + }, + { + "epoch": 6.147943523634131, + "grad_norm": 0.260929137468338, + "learning_rate": 3.412526153798924e-05, + "loss": 1.7044, + "step": 20030 + }, + { + "epoch": 6.148250460405157, + "grad_norm": 0.23274847865104675, + "learning_rate": 3.4120548233231326e-05, + "loss": 1.7626, + "step": 20031 + }, + { + "epoch": 6.148557397176182, + "grad_norm": 0.2389308512210846, + "learning_rate": 3.411583508540344e-05, + "loss": 1.71, + "step": 20032 + }, + { + "epoch": 6.148864333947207, + "grad_norm": 0.2745562195777893, + "learning_rate": 3.411112209455219e-05, + "loss": 1.7144, + "step": 20033 + }, + { + "epoch": 6.149171270718232, + "grad_norm": 0.2369096428155899, + "learning_rate": 3.4106409260724135e-05, + "loss": 1.7879, + "step": 20034 + }, + { + "epoch": 6.149478207489257, + "grad_norm": 0.3103141486644745, + "learning_rate": 3.4101696583965874e-05, + "loss": 1.7862, + "step": 20035 + }, + { + "epoch": 6.149785144260282, + "grad_norm": 0.18625277280807495, + "learning_rate": 3.409698406432397e-05, + "loss": 1.7717, + "step": 20036 + }, + { + "epoch": 6.150092081031308, + "grad_norm": 0.2539508640766144, + "learning_rate": 3.409227170184497e-05, + "loss": 1.7023, + "step": 20037 + }, + { + "epoch": 6.150399017802333, + "grad_norm": 
0.2185351699590683, + "learning_rate": 3.4087559496575474e-05, + "loss": 1.7283, + "step": 20038 + }, + { + "epoch": 6.150705954573358, + "grad_norm": 0.21225227415561676, + "learning_rate": 3.408284744856204e-05, + "loss": 1.7055, + "step": 20039 + }, + { + "epoch": 6.151012891344383, + "grad_norm": 0.23623189330101013, + "learning_rate": 3.407813555785125e-05, + "loss": 1.6862, + "step": 20040 + }, + { + "epoch": 6.151319828115408, + "grad_norm": 0.19061312079429626, + "learning_rate": 3.4073423824489634e-05, + "loss": 1.7501, + "step": 20041 + }, + { + "epoch": 6.151626764886434, + "grad_norm": 0.22176402807235718, + "learning_rate": 3.4068712248523804e-05, + "loss": 1.7417, + "step": 20042 + }, + { + "epoch": 6.151933701657459, + "grad_norm": 0.20093770325183868, + "learning_rate": 3.406400083000028e-05, + "loss": 1.7283, + "step": 20043 + }, + { + "epoch": 6.152240638428483, + "grad_norm": 0.21968910098075867, + "learning_rate": 3.4059289568965635e-05, + "loss": 1.7187, + "step": 20044 + }, + { + "epoch": 6.152547575199509, + "grad_norm": 0.19038841128349304, + "learning_rate": 3.4054578465466435e-05, + "loss": 1.7131, + "step": 20045 + }, + { + "epoch": 6.152854511970534, + "grad_norm": 0.2239457368850708, + "learning_rate": 3.404986751954925e-05, + "loss": 1.7643, + "step": 20046 + }, + { + "epoch": 6.153161448741559, + "grad_norm": 0.2357017546892166, + "learning_rate": 3.404515673126061e-05, + "loss": 1.7196, + "step": 20047 + }, + { + "epoch": 6.153468385512585, + "grad_norm": 0.2633310556411743, + "learning_rate": 3.4040446100647104e-05, + "loss": 1.7613, + "step": 20048 + }, + { + "epoch": 6.153775322283609, + "grad_norm": 0.28470975160598755, + "learning_rate": 3.403573562775524e-05, + "loss": 1.7564, + "step": 20049 + }, + { + "epoch": 6.1540822590546345, + "grad_norm": 0.37435805797576904, + "learning_rate": 3.40310253126316e-05, + "loss": 1.8365, + "step": 20050 + }, + { + "epoch": 6.15438919582566, + "grad_norm": 0.1706259697675705, + 
"learning_rate": 3.402631515532272e-05, + "loss": 1.7373, + "step": 20051 + }, + { + "epoch": 6.154696132596685, + "grad_norm": 0.30885928869247437, + "learning_rate": 3.402160515587518e-05, + "loss": 1.7152, + "step": 20052 + }, + { + "epoch": 6.1550030693677105, + "grad_norm": 0.21448500454425812, + "learning_rate": 3.40168953143355e-05, + "loss": 1.7463, + "step": 20053 + }, + { + "epoch": 6.155310006138736, + "grad_norm": 0.23774586617946625, + "learning_rate": 3.4012185630750204e-05, + "loss": 1.7268, + "step": 20054 + }, + { + "epoch": 6.15561694290976, + "grad_norm": 0.1943385899066925, + "learning_rate": 3.400747610516588e-05, + "loss": 1.6578, + "step": 20055 + }, + { + "epoch": 6.155923879680786, + "grad_norm": 0.27488210797309875, + "learning_rate": 3.400276673762903e-05, + "loss": 1.8204, + "step": 20056 + }, + { + "epoch": 6.156230816451811, + "grad_norm": 0.1871461570262909, + "learning_rate": 3.3998057528186244e-05, + "loss": 1.6775, + "step": 20057 + }, + { + "epoch": 6.156537753222836, + "grad_norm": 0.23566775023937225, + "learning_rate": 3.399334847688401e-05, + "loss": 1.7089, + "step": 20058 + }, + { + "epoch": 6.156844689993862, + "grad_norm": 0.26842471957206726, + "learning_rate": 3.398863958376891e-05, + "loss": 1.7554, + "step": 20059 + }, + { + "epoch": 6.157151626764886, + "grad_norm": 0.19267809391021729, + "learning_rate": 3.3983930848887435e-05, + "loss": 1.6709, + "step": 20060 + }, + { + "epoch": 6.157458563535911, + "grad_norm": 0.21130084991455078, + "learning_rate": 3.3979222272286156e-05, + "loss": 1.7312, + "step": 20061 + }, + { + "epoch": 6.157765500306937, + "grad_norm": 0.2322172224521637, + "learning_rate": 3.397451385401158e-05, + "loss": 1.8069, + "step": 20062 + }, + { + "epoch": 6.158072437077962, + "grad_norm": 0.21852418780326843, + "learning_rate": 3.396980559411027e-05, + "loss": 1.715, + "step": 20063 + }, + { + "epoch": 6.158379373848987, + "grad_norm": 0.21385829150676727, + "learning_rate": 
3.3965097492628714e-05, + "loss": 1.6804, + "step": 20064 + }, + { + "epoch": 6.158686310620013, + "grad_norm": 0.21639080345630646, + "learning_rate": 3.3960389549613494e-05, + "loss": 1.655, + "step": 20065 + }, + { + "epoch": 6.158993247391037, + "grad_norm": 0.19219942390918732, + "learning_rate": 3.395568176511107e-05, + "loss": 1.7325, + "step": 20066 + }, + { + "epoch": 6.1593001841620625, + "grad_norm": 0.21853557229042053, + "learning_rate": 3.3950974139168024e-05, + "loss": 1.7204, + "step": 20067 + }, + { + "epoch": 6.159607120933088, + "grad_norm": 0.24144381284713745, + "learning_rate": 3.3946266671830854e-05, + "loss": 1.754, + "step": 20068 + }, + { + "epoch": 6.159914057704113, + "grad_norm": 0.2014230340719223, + "learning_rate": 3.394155936314609e-05, + "loss": 1.6905, + "step": 20069 + }, + { + "epoch": 6.1602209944751385, + "grad_norm": 0.26940762996673584, + "learning_rate": 3.393685221316025e-05, + "loss": 1.729, + "step": 20070 + }, + { + "epoch": 6.160527931246163, + "grad_norm": 0.1937808394432068, + "learning_rate": 3.3932145221919843e-05, + "loss": 1.7492, + "step": 20071 + }, + { + "epoch": 6.160834868017188, + "grad_norm": 0.2586243450641632, + "learning_rate": 3.39274383894714e-05, + "loss": 1.7706, + "step": 20072 + }, + { + "epoch": 6.161141804788214, + "grad_norm": 0.21995361149311066, + "learning_rate": 3.3922731715861416e-05, + "loss": 1.7716, + "step": 20073 + }, + { + "epoch": 6.161448741559239, + "grad_norm": 0.22915497422218323, + "learning_rate": 3.391802520113645e-05, + "loss": 1.716, + "step": 20074 + }, + { + "epoch": 6.161755678330264, + "grad_norm": 0.24317315220832825, + "learning_rate": 3.3913318845342956e-05, + "loss": 1.7392, + "step": 20075 + }, + { + "epoch": 6.162062615101289, + "grad_norm": 0.20439307391643524, + "learning_rate": 3.390861264852749e-05, + "loss": 1.7076, + "step": 20076 + }, + { + "epoch": 6.162369551872314, + "grad_norm": 0.2197176069021225, + "learning_rate": 3.3903906610736534e-05, + "loss": 
1.7334, + "step": 20077 + }, + { + "epoch": 6.162676488643339, + "grad_norm": 0.21651993691921234, + "learning_rate": 3.389920073201662e-05, + "loss": 1.7651, + "step": 20078 + }, + { + "epoch": 6.162983425414365, + "grad_norm": 0.1999540627002716, + "learning_rate": 3.389449501241424e-05, + "loss": 1.7031, + "step": 20079 + }, + { + "epoch": 6.16329036218539, + "grad_norm": 0.21965044736862183, + "learning_rate": 3.38897894519759e-05, + "loss": 1.7243, + "step": 20080 + }, + { + "epoch": 6.163597298956415, + "grad_norm": 0.20127563178539276, + "learning_rate": 3.388508405074808e-05, + "loss": 1.693, + "step": 20081 + }, + { + "epoch": 6.16390423572744, + "grad_norm": 0.2143397182226181, + "learning_rate": 3.3880378808777336e-05, + "loss": 1.7304, + "step": 20082 + }, + { + "epoch": 6.164211172498465, + "grad_norm": 0.23116083443164825, + "learning_rate": 3.387567372611012e-05, + "loss": 1.7558, + "step": 20083 + }, + { + "epoch": 6.1645181092694905, + "grad_norm": 0.25513985753059387, + "learning_rate": 3.3870968802792946e-05, + "loss": 1.7169, + "step": 20084 + }, + { + "epoch": 6.164825046040516, + "grad_norm": 0.20549121499061584, + "learning_rate": 3.386626403887232e-05, + "loss": 1.7147, + "step": 20085 + }, + { + "epoch": 6.165131982811541, + "grad_norm": 0.2850625514984131, + "learning_rate": 3.386155943439473e-05, + "loss": 1.7865, + "step": 20086 + }, + { + "epoch": 6.165438919582566, + "grad_norm": 0.2689895033836365, + "learning_rate": 3.3856854989406675e-05, + "loss": 1.7576, + "step": 20087 + }, + { + "epoch": 6.165745856353591, + "grad_norm": 0.21677634119987488, + "learning_rate": 3.385215070395462e-05, + "loss": 1.7186, + "step": 20088 + }, + { + "epoch": 6.166052793124616, + "grad_norm": 0.19525155425071716, + "learning_rate": 3.384744657808509e-05, + "loss": 1.6713, + "step": 20089 + }, + { + "epoch": 6.166359729895642, + "grad_norm": 0.23097296059131622, + "learning_rate": 3.3842742611844555e-05, + "loss": 1.6975, + "step": 20090 + }, + { + 
"epoch": 6.166666666666667, + "grad_norm": 0.22210827469825745, + "learning_rate": 3.3838038805279516e-05, + "loss": 1.733, + "step": 20091 + }, + { + "epoch": 6.166973603437691, + "grad_norm": 0.3336607813835144, + "learning_rate": 3.383333515843643e-05, + "loss": 1.7441, + "step": 20092 + }, + { + "epoch": 6.167280540208717, + "grad_norm": 0.25274014472961426, + "learning_rate": 3.382863167136183e-05, + "loss": 1.7235, + "step": 20093 + }, + { + "epoch": 6.167587476979742, + "grad_norm": 0.3228790760040283, + "learning_rate": 3.3823928344102144e-05, + "loss": 1.8096, + "step": 20094 + }, + { + "epoch": 6.167894413750767, + "grad_norm": 0.34542208909988403, + "learning_rate": 3.381922517670389e-05, + "loss": 1.7431, + "step": 20095 + }, + { + "epoch": 6.168201350521793, + "grad_norm": 0.1921117901802063, + "learning_rate": 3.381452216921355e-05, + "loss": 1.787, + "step": 20096 + }, + { + "epoch": 6.168508287292818, + "grad_norm": 0.29019802808761597, + "learning_rate": 3.380981932167757e-05, + "loss": 1.7122, + "step": 20097 + }, + { + "epoch": 6.1688152240638425, + "grad_norm": 0.17999929189682007, + "learning_rate": 3.380511663414244e-05, + "loss": 1.7153, + "step": 20098 + }, + { + "epoch": 6.169122160834868, + "grad_norm": 0.2641841471195221, + "learning_rate": 3.380041410665466e-05, + "loss": 1.7317, + "step": 20099 + }, + { + "epoch": 6.169429097605893, + "grad_norm": 0.25492918491363525, + "learning_rate": 3.379571173926067e-05, + "loss": 1.6975, + "step": 20100 + }, + { + "epoch": 6.1697360343769185, + "grad_norm": 0.2554764151573181, + "learning_rate": 3.379100953200697e-05, + "loss": 1.7539, + "step": 20101 + }, + { + "epoch": 6.170042971147944, + "grad_norm": 0.2339072823524475, + "learning_rate": 3.378630748493999e-05, + "loss": 1.6871, + "step": 20102 + }, + { + "epoch": 6.170349907918968, + "grad_norm": 0.19663162529468536, + "learning_rate": 3.3781605598106236e-05, + "loss": 1.7419, + "step": 20103 + }, + { + "epoch": 6.170656844689994, + 
"grad_norm": 0.2479846328496933, + "learning_rate": 3.3776903871552166e-05, + "loss": 1.7849, + "step": 20104 + }, + { + "epoch": 6.170963781461019, + "grad_norm": 0.18630735576152802, + "learning_rate": 3.377220230532423e-05, + "loss": 1.7412, + "step": 20105 + }, + { + "epoch": 6.171270718232044, + "grad_norm": 0.2211095094680786, + "learning_rate": 3.376750089946892e-05, + "loss": 1.7445, + "step": 20106 + }, + { + "epoch": 6.17157765500307, + "grad_norm": 0.20783299207687378, + "learning_rate": 3.3762799654032653e-05, + "loss": 1.7346, + "step": 20107 + }, + { + "epoch": 6.171884591774095, + "grad_norm": 0.18022862076759338, + "learning_rate": 3.3758098569061934e-05, + "loss": 1.7083, + "step": 20108 + }, + { + "epoch": 6.172191528545119, + "grad_norm": 0.23707088828086853, + "learning_rate": 3.375339764460319e-05, + "loss": 1.8542, + "step": 20109 + }, + { + "epoch": 6.172498465316145, + "grad_norm": 0.2289234846830368, + "learning_rate": 3.3748696880702913e-05, + "loss": 1.7564, + "step": 20110 + }, + { + "epoch": 6.17280540208717, + "grad_norm": 0.28396767377853394, + "learning_rate": 3.374399627740752e-05, + "loss": 1.7349, + "step": 20111 + }, + { + "epoch": 6.173112338858195, + "grad_norm": 0.20154817402362823, + "learning_rate": 3.373929583476351e-05, + "loss": 1.7356, + "step": 20112 + }, + { + "epoch": 6.173419275629221, + "grad_norm": 0.22590605914592743, + "learning_rate": 3.373459555281728e-05, + "loss": 1.7291, + "step": 20113 + }, + { + "epoch": 6.173726212400245, + "grad_norm": 0.2145034223794937, + "learning_rate": 3.372989543161532e-05, + "loss": 1.7544, + "step": 20114 + }, + { + "epoch": 6.1740331491712706, + "grad_norm": 0.26797109842300415, + "learning_rate": 3.372519547120407e-05, + "loss": 1.743, + "step": 20115 + }, + { + "epoch": 6.174340085942296, + "grad_norm": 0.2795363664627075, + "learning_rate": 3.372049567162999e-05, + "loss": 1.7278, + "step": 20116 + }, + { + "epoch": 6.174647022713321, + "grad_norm": 0.21436716616153717, + 
"learning_rate": 3.3715796032939494e-05, + "loss": 1.7306, + "step": 20117 + }, + { + "epoch": 6.1749539594843466, + "grad_norm": 0.2593919336795807, + "learning_rate": 3.3711096555179064e-05, + "loss": 1.7323, + "step": 20118 + }, + { + "epoch": 6.175260896255371, + "grad_norm": 0.19639115035533905, + "learning_rate": 3.3706397238395124e-05, + "loss": 1.7444, + "step": 20119 + }, + { + "epoch": 6.175567833026396, + "grad_norm": 0.23408278822898865, + "learning_rate": 3.370169808263409e-05, + "loss": 1.7461, + "step": 20120 + }, + { + "epoch": 6.175874769797422, + "grad_norm": 0.21200022101402283, + "learning_rate": 3.369699908794246e-05, + "loss": 1.7588, + "step": 20121 + }, + { + "epoch": 6.176181706568447, + "grad_norm": 0.17609953880310059, + "learning_rate": 3.369230025436662e-05, + "loss": 1.6608, + "step": 20122 + }, + { + "epoch": 6.176488643339472, + "grad_norm": 0.19895964860916138, + "learning_rate": 3.3687601581953046e-05, + "loss": 1.729, + "step": 20123 + }, + { + "epoch": 6.176795580110497, + "grad_norm": 0.22833310067653656, + "learning_rate": 3.368290307074814e-05, + "loss": 1.7148, + "step": 20124 + }, + { + "epoch": 6.177102516881522, + "grad_norm": 0.1847219169139862, + "learning_rate": 3.367820472079835e-05, + "loss": 1.6894, + "step": 20125 + }, + { + "epoch": 6.1774094536525475, + "grad_norm": 0.20269884169101715, + "learning_rate": 3.36735065321501e-05, + "loss": 1.794, + "step": 20126 + }, + { + "epoch": 6.177716390423573, + "grad_norm": 0.19277122616767883, + "learning_rate": 3.3668808504849845e-05, + "loss": 1.6936, + "step": 20127 + }, + { + "epoch": 6.178023327194598, + "grad_norm": 0.23804394900798798, + "learning_rate": 3.3664110638943985e-05, + "loss": 1.746, + "step": 20128 + }, + { + "epoch": 6.1783302639656235, + "grad_norm": 0.20946018397808075, + "learning_rate": 3.365941293447897e-05, + "loss": 1.6952, + "step": 20129 + }, + { + "epoch": 6.178637200736648, + "grad_norm": 0.21680596470832825, + "learning_rate": 
3.36547153915012e-05, + "loss": 1.7709, + "step": 20130 + }, + { + "epoch": 6.178944137507673, + "grad_norm": 0.22549709677696228, + "learning_rate": 3.365001801005712e-05, + "loss": 1.6814, + "step": 20131 + }, + { + "epoch": 6.179251074278699, + "grad_norm": 0.20660072565078735, + "learning_rate": 3.3645320790193136e-05, + "loss": 1.6992, + "step": 20132 + }, + { + "epoch": 6.179558011049724, + "grad_norm": 0.23697195947170258, + "learning_rate": 3.36406237319557e-05, + "loss": 1.7325, + "step": 20133 + }, + { + "epoch": 6.179864947820749, + "grad_norm": 0.20847748219966888, + "learning_rate": 3.363592683539118e-05, + "loss": 1.7066, + "step": 20134 + }, + { + "epoch": 6.180171884591774, + "grad_norm": 0.24317312240600586, + "learning_rate": 3.363123010054605e-05, + "loss": 1.7259, + "step": 20135 + }, + { + "epoch": 6.180478821362799, + "grad_norm": 0.22137925028800964, + "learning_rate": 3.3626533527466686e-05, + "loss": 1.7492, + "step": 20136 + }, + { + "epoch": 6.180785758133824, + "grad_norm": 0.23857460916042328, + "learning_rate": 3.362183711619951e-05, + "loss": 1.6671, + "step": 20137 + }, + { + "epoch": 6.18109269490485, + "grad_norm": 0.20017468929290771, + "learning_rate": 3.361714086679095e-05, + "loss": 1.7151, + "step": 20138 + }, + { + "epoch": 6.181399631675875, + "grad_norm": 0.21566617488861084, + "learning_rate": 3.361244477928739e-05, + "loss": 1.7659, + "step": 20139 + }, + { + "epoch": 6.1817065684469, + "grad_norm": 0.21695555746555328, + "learning_rate": 3.360774885373528e-05, + "loss": 1.7463, + "step": 20140 + }, + { + "epoch": 6.182013505217925, + "grad_norm": 0.19326116144657135, + "learning_rate": 3.360305309018098e-05, + "loss": 1.7182, + "step": 20141 + }, + { + "epoch": 6.18232044198895, + "grad_norm": 0.2135429084300995, + "learning_rate": 3.359835748867093e-05, + "loss": 1.8001, + "step": 20142 + }, + { + "epoch": 6.1826273787599755, + "grad_norm": 0.20097343623638153, + "learning_rate": 3.359366204925151e-05, + "loss": 1.7442, 
+ "step": 20143 + }, + { + "epoch": 6.182934315531001, + "grad_norm": 0.212847501039505, + "learning_rate": 3.358896677196916e-05, + "loss": 1.7418, + "step": 20144 + }, + { + "epoch": 6.183241252302026, + "grad_norm": 0.18414677679538727, + "learning_rate": 3.358427165687024e-05, + "loss": 1.6813, + "step": 20145 + }, + { + "epoch": 6.183548189073051, + "grad_norm": 0.23170427978038788, + "learning_rate": 3.357957670400119e-05, + "loss": 1.7722, + "step": 20146 + }, + { + "epoch": 6.183855125844076, + "grad_norm": 0.28952550888061523, + "learning_rate": 3.357488191340837e-05, + "loss": 1.7785, + "step": 20147 + }, + { + "epoch": 6.184162062615101, + "grad_norm": 0.2126605361700058, + "learning_rate": 3.35701872851382e-05, + "loss": 1.7064, + "step": 20148 + }, + { + "epoch": 6.184468999386127, + "grad_norm": 0.2376919537782669, + "learning_rate": 3.356549281923706e-05, + "loss": 1.7322, + "step": 20149 + }, + { + "epoch": 6.184775936157152, + "grad_norm": 0.24168729782104492, + "learning_rate": 3.3560798515751375e-05, + "loss": 1.7296, + "step": 20150 + }, + { + "epoch": 6.185082872928176, + "grad_norm": 0.19746467471122742, + "learning_rate": 3.355610437472749e-05, + "loss": 1.7816, + "step": 20151 + }, + { + "epoch": 6.185389809699202, + "grad_norm": 0.2399774193763733, + "learning_rate": 3.3551410396211844e-05, + "loss": 1.7309, + "step": 20152 + }, + { + "epoch": 6.185696746470227, + "grad_norm": 0.20560777187347412, + "learning_rate": 3.3546716580250785e-05, + "loss": 1.7134, + "step": 20153 + }, + { + "epoch": 6.186003683241252, + "grad_norm": 0.22640523314476013, + "learning_rate": 3.354202292689072e-05, + "loss": 1.7572, + "step": 20154 + }, + { + "epoch": 6.186310620012278, + "grad_norm": 0.20796974003314972, + "learning_rate": 3.353732943617803e-05, + "loss": 1.6897, + "step": 20155 + }, + { + "epoch": 6.186617556783303, + "grad_norm": 0.19902797043323517, + "learning_rate": 3.35326361081591e-05, + "loss": 1.6836, + "step": 20156 + }, + { + "epoch": 
6.1869244935543275, + "grad_norm": 0.30999818444252014, + "learning_rate": 3.352794294288032e-05, + "loss": 1.7704, + "step": 20157 + }, + { + "epoch": 6.187231430325353, + "grad_norm": 0.20634675025939941, + "learning_rate": 3.3523249940388045e-05, + "loss": 1.7599, + "step": 20158 + }, + { + "epoch": 6.187538367096378, + "grad_norm": 0.25650453567504883, + "learning_rate": 3.3518557100728674e-05, + "loss": 1.7441, + "step": 20159 + }, + { + "epoch": 6.1878453038674035, + "grad_norm": 0.2400079369544983, + "learning_rate": 3.351386442394858e-05, + "loss": 1.6836, + "step": 20160 + }, + { + "epoch": 6.188152240638429, + "grad_norm": 0.23734217882156372, + "learning_rate": 3.350917191009416e-05, + "loss": 1.7, + "step": 20161 + }, + { + "epoch": 6.188459177409453, + "grad_norm": 0.29579323530197144, + "learning_rate": 3.3504479559211755e-05, + "loss": 1.71, + "step": 20162 + }, + { + "epoch": 6.188766114180479, + "grad_norm": 0.18999184668064117, + "learning_rate": 3.349978737134776e-05, + "loss": 1.7396, + "step": 20163 + }, + { + "epoch": 6.189073050951504, + "grad_norm": 0.26760223507881165, + "learning_rate": 3.3495095346548525e-05, + "loss": 1.7846, + "step": 20164 + }, + { + "epoch": 6.189379987722529, + "grad_norm": 0.18416397273540497, + "learning_rate": 3.349040348486044e-05, + "loss": 1.6911, + "step": 20165 + }, + { + "epoch": 6.189686924493555, + "grad_norm": 0.23761679232120514, + "learning_rate": 3.348571178632986e-05, + "loss": 1.6776, + "step": 20166 + }, + { + "epoch": 6.189993861264579, + "grad_norm": 0.2056473195552826, + "learning_rate": 3.348102025100316e-05, + "loss": 1.697, + "step": 20167 + }, + { + "epoch": 6.190300798035604, + "grad_norm": 0.23916250467300415, + "learning_rate": 3.3476328878926685e-05, + "loss": 1.7943, + "step": 20168 + }, + { + "epoch": 6.19060773480663, + "grad_norm": 0.2205415964126587, + "learning_rate": 3.347163767014684e-05, + "loss": 1.8037, + "step": 20169 + }, + { + "epoch": 6.190914671577655, + "grad_norm": 
0.28907346725463867, + "learning_rate": 3.346694662470995e-05, + "loss": 1.6875, + "step": 20170 + }, + { + "epoch": 6.19122160834868, + "grad_norm": 0.2382480502128601, + "learning_rate": 3.3462255742662364e-05, + "loss": 1.7116, + "step": 20171 + }, + { + "epoch": 6.191528545119706, + "grad_norm": 0.25309205055236816, + "learning_rate": 3.3457565024050485e-05, + "loss": 1.7584, + "step": 20172 + }, + { + "epoch": 6.19183548189073, + "grad_norm": 0.3959091901779175, + "learning_rate": 3.3452874468920626e-05, + "loss": 1.7054, + "step": 20173 + }, + { + "epoch": 6.1921424186617555, + "grad_norm": 0.22697016596794128, + "learning_rate": 3.344818407731918e-05, + "loss": 1.7373, + "step": 20174 + }, + { + "epoch": 6.192449355432781, + "grad_norm": 0.298178493976593, + "learning_rate": 3.3443493849292465e-05, + "loss": 1.7192, + "step": 20175 + }, + { + "epoch": 6.192756292203806, + "grad_norm": 0.2742854058742523, + "learning_rate": 3.343880378488685e-05, + "loss": 1.7538, + "step": 20176 + }, + { + "epoch": 6.1930632289748315, + "grad_norm": 0.23367546498775482, + "learning_rate": 3.343411388414867e-05, + "loss": 1.694, + "step": 20177 + }, + { + "epoch": 6.193370165745856, + "grad_norm": 0.2932305932044983, + "learning_rate": 3.342942414712431e-05, + "loss": 1.7291, + "step": 20178 + }, + { + "epoch": 6.193677102516881, + "grad_norm": 0.24306413531303406, + "learning_rate": 3.342473457386007e-05, + "loss": 1.6959, + "step": 20179 + }, + { + "epoch": 6.193984039287907, + "grad_norm": 0.30828577280044556, + "learning_rate": 3.3420045164402344e-05, + "loss": 1.6848, + "step": 20180 + }, + { + "epoch": 6.194290976058932, + "grad_norm": 0.18766994774341583, + "learning_rate": 3.341535591879743e-05, + "loss": 1.7261, + "step": 20181 + }, + { + "epoch": 6.194597912829957, + "grad_norm": 0.300778329372406, + "learning_rate": 3.3410666837091696e-05, + "loss": 1.7539, + "step": 20182 + }, + { + "epoch": 6.194904849600983, + "grad_norm": 0.20148977637290955, + "learning_rate": 
3.340597791933147e-05, + "loss": 1.7496, + "step": 20183 + }, + { + "epoch": 6.195211786372007, + "grad_norm": 0.2746329605579376, + "learning_rate": 3.340128916556311e-05, + "loss": 1.6458, + "step": 20184 + }, + { + "epoch": 6.195518723143032, + "grad_norm": 0.2715265452861786, + "learning_rate": 3.339660057583292e-05, + "loss": 1.7799, + "step": 20185 + }, + { + "epoch": 6.195825659914058, + "grad_norm": 0.2145555317401886, + "learning_rate": 3.339191215018728e-05, + "loss": 1.6854, + "step": 20186 + }, + { + "epoch": 6.196132596685083, + "grad_norm": 0.3018960654735565, + "learning_rate": 3.338722388867248e-05, + "loss": 1.7569, + "step": 20187 + }, + { + "epoch": 6.196439533456108, + "grad_norm": 0.24876931309700012, + "learning_rate": 3.338253579133487e-05, + "loss": 1.7434, + "step": 20188 + }, + { + "epoch": 6.196746470227133, + "grad_norm": 0.3609273433685303, + "learning_rate": 3.337784785822079e-05, + "loss": 1.737, + "step": 20189 + }, + { + "epoch": 6.197053406998158, + "grad_norm": 0.21586830914020538, + "learning_rate": 3.337316008937655e-05, + "loss": 1.7553, + "step": 20190 + }, + { + "epoch": 6.1973603437691835, + "grad_norm": 0.23542988300323486, + "learning_rate": 3.3368472484848504e-05, + "loss": 1.7174, + "step": 20191 + }, + { + "epoch": 6.197667280540209, + "grad_norm": 0.19861294329166412, + "learning_rate": 3.336378504468294e-05, + "loss": 1.7268, + "step": 20192 + }, + { + "epoch": 6.197974217311234, + "grad_norm": 0.26865682005882263, + "learning_rate": 3.335909776892622e-05, + "loss": 1.7656, + "step": 20193 + }, + { + "epoch": 6.198281154082259, + "grad_norm": 0.343078076839447, + "learning_rate": 3.3354410657624624e-05, + "loss": 1.734, + "step": 20194 + }, + { + "epoch": 6.198588090853284, + "grad_norm": 0.21613667905330658, + "learning_rate": 3.334972371082453e-05, + "loss": 1.7777, + "step": 20195 + }, + { + "epoch": 6.198895027624309, + "grad_norm": 0.22268854081630707, + "learning_rate": 3.3345036928572207e-05, + "loss": 1.667, + 
"step": 20196 + }, + { + "epoch": 6.199201964395335, + "grad_norm": 0.22870087623596191, + "learning_rate": 3.3340350310914e-05, + "loss": 1.7532, + "step": 20197 + }, + { + "epoch": 6.19950890116636, + "grad_norm": 0.1969831883907318, + "learning_rate": 3.3335663857896205e-05, + "loss": 1.7821, + "step": 20198 + }, + { + "epoch": 6.199815837937384, + "grad_norm": 0.20414133369922638, + "learning_rate": 3.3330977569565154e-05, + "loss": 1.7449, + "step": 20199 + }, + { + "epoch": 6.20012277470841, + "grad_norm": 0.21947748959064484, + "learning_rate": 3.332629144596714e-05, + "loss": 1.6888, + "step": 20200 + }, + { + "epoch": 6.200429711479435, + "grad_norm": 0.20943035185337067, + "learning_rate": 3.332160548714851e-05, + "loss": 1.7278, + "step": 20201 + }, + { + "epoch": 6.2007366482504604, + "grad_norm": 0.22410117089748383, + "learning_rate": 3.331691969315553e-05, + "loss": 1.721, + "step": 20202 + }, + { + "epoch": 6.201043585021486, + "grad_norm": 0.21422281861305237, + "learning_rate": 3.3312234064034555e-05, + "loss": 1.7199, + "step": 20203 + }, + { + "epoch": 6.201350521792511, + "grad_norm": 0.21021418273448944, + "learning_rate": 3.330754859983184e-05, + "loss": 1.7972, + "step": 20204 + }, + { + "epoch": 6.201657458563536, + "grad_norm": 0.21155185997486115, + "learning_rate": 3.330286330059371e-05, + "loss": 1.7463, + "step": 20205 + }, + { + "epoch": 6.201964395334561, + "grad_norm": 0.20241162180900574, + "learning_rate": 3.329817816636649e-05, + "loss": 1.7804, + "step": 20206 + }, + { + "epoch": 6.202271332105586, + "grad_norm": 0.19882376492023468, + "learning_rate": 3.329349319719644e-05, + "loss": 1.7564, + "step": 20207 + }, + { + "epoch": 6.202578268876612, + "grad_norm": 0.20528686046600342, + "learning_rate": 3.328880839312991e-05, + "loss": 1.751, + "step": 20208 + }, + { + "epoch": 6.202885205647637, + "grad_norm": 0.2708488404750824, + "learning_rate": 3.328412375421315e-05, + "loss": 1.8008, + "step": 20209 + }, + { + "epoch": 
6.203192142418661, + "grad_norm": 0.1986229121685028, + "learning_rate": 3.3279439280492486e-05, + "loss": 1.6833, + "step": 20210 + }, + { + "epoch": 6.203499079189687, + "grad_norm": 0.2700355350971222, + "learning_rate": 3.3274754972014186e-05, + "loss": 1.8071, + "step": 20211 + }, + { + "epoch": 6.203806015960712, + "grad_norm": 0.23060421645641327, + "learning_rate": 3.327007082882458e-05, + "loss": 1.6856, + "step": 20212 + }, + { + "epoch": 6.204112952731737, + "grad_norm": 0.20798510313034058, + "learning_rate": 3.3265386850969926e-05, + "loss": 1.7421, + "step": 20213 + }, + { + "epoch": 6.204419889502763, + "grad_norm": 0.21828265488147736, + "learning_rate": 3.3260703038496556e-05, + "loss": 1.7212, + "step": 20214 + }, + { + "epoch": 6.204726826273788, + "grad_norm": 0.1965378224849701, + "learning_rate": 3.325601939145069e-05, + "loss": 1.6987, + "step": 20215 + }, + { + "epoch": 6.2050337630448125, + "grad_norm": 0.23897121846675873, + "learning_rate": 3.325133590987868e-05, + "loss": 1.7501, + "step": 20216 + }, + { + "epoch": 6.205340699815838, + "grad_norm": 0.18647781014442444, + "learning_rate": 3.324665259382676e-05, + "loss": 1.688, + "step": 20217 + }, + { + "epoch": 6.205647636586863, + "grad_norm": 0.19906121492385864, + "learning_rate": 3.324196944334127e-05, + "loss": 1.749, + "step": 20218 + }, + { + "epoch": 6.2059545733578885, + "grad_norm": 0.2061154991388321, + "learning_rate": 3.3237286458468444e-05, + "loss": 1.757, + "step": 20219 + }, + { + "epoch": 6.206261510128914, + "grad_norm": 0.19410182535648346, + "learning_rate": 3.323260363925459e-05, + "loss": 1.6826, + "step": 20220 + }, + { + "epoch": 6.206568446899938, + "grad_norm": 0.2017979919910431, + "learning_rate": 3.322792098574597e-05, + "loss": 1.7568, + "step": 20221 + }, + { + "epoch": 6.206875383670964, + "grad_norm": 0.19491736590862274, + "learning_rate": 3.322323849798885e-05, + "loss": 1.7082, + "step": 20222 + }, + { + "epoch": 6.207182320441989, + "grad_norm": 
0.19826333224773407, + "learning_rate": 3.321855617602954e-05, + "loss": 1.7654, + "step": 20223 + }, + { + "epoch": 6.207489257213014, + "grad_norm": 0.18185383081436157, + "learning_rate": 3.321387401991428e-05, + "loss": 1.6826, + "step": 20224 + }, + { + "epoch": 6.20779619398404, + "grad_norm": 0.22402678430080414, + "learning_rate": 3.320919202968937e-05, + "loss": 1.795, + "step": 20225 + }, + { + "epoch": 6.208103130755064, + "grad_norm": 0.201541468501091, + "learning_rate": 3.320451020540105e-05, + "loss": 1.6838, + "step": 20226 + }, + { + "epoch": 6.208410067526089, + "grad_norm": 0.25479504466056824, + "learning_rate": 3.3199828547095616e-05, + "loss": 1.7881, + "step": 20227 + }, + { + "epoch": 6.208717004297115, + "grad_norm": 0.2057993859052658, + "learning_rate": 3.31951470548193e-05, + "loss": 1.737, + "step": 20228 + }, + { + "epoch": 6.20902394106814, + "grad_norm": 0.183469757437706, + "learning_rate": 3.319046572861842e-05, + "loss": 1.6989, + "step": 20229 + }, + { + "epoch": 6.209330877839165, + "grad_norm": 0.21723738312721252, + "learning_rate": 3.318578456853919e-05, + "loss": 1.7537, + "step": 20230 + }, + { + "epoch": 6.209637814610191, + "grad_norm": 0.21919457614421844, + "learning_rate": 3.318110357462791e-05, + "loss": 1.7444, + "step": 20231 + }, + { + "epoch": 6.209944751381215, + "grad_norm": 0.17009909451007843, + "learning_rate": 3.317642274693081e-05, + "loss": 1.6885, + "step": 20232 + }, + { + "epoch": 6.2102516881522405, + "grad_norm": 0.19625195860862732, + "learning_rate": 3.317174208549416e-05, + "loss": 1.7255, + "step": 20233 + }, + { + "epoch": 6.210558624923266, + "grad_norm": 0.2131364941596985, + "learning_rate": 3.316706159036422e-05, + "loss": 1.7047, + "step": 20234 + }, + { + "epoch": 6.210865561694291, + "grad_norm": 0.18454425036907196, + "learning_rate": 3.316238126158725e-05, + "loss": 1.7536, + "step": 20235 + }, + { + "epoch": 6.2111724984653165, + "grad_norm": 0.2124820202589035, + "learning_rate": 
3.3157701099209485e-05, + "loss": 1.7456, + "step": 20236 + }, + { + "epoch": 6.211479435236341, + "grad_norm": 0.1929594725370407, + "learning_rate": 3.3153021103277206e-05, + "loss": 1.7118, + "step": 20237 + }, + { + "epoch": 6.211786372007366, + "grad_norm": 0.19876480102539062, + "learning_rate": 3.314834127383664e-05, + "loss": 1.6855, + "step": 20238 + }, + { + "epoch": 6.212093308778392, + "grad_norm": 0.18902665376663208, + "learning_rate": 3.314366161093403e-05, + "loss": 1.7052, + "step": 20239 + }, + { + "epoch": 6.212400245549417, + "grad_norm": 0.1859758198261261, + "learning_rate": 3.313898211461566e-05, + "loss": 1.7277, + "step": 20240 + }, + { + "epoch": 6.212707182320442, + "grad_norm": 0.2160472422838211, + "learning_rate": 3.313430278492773e-05, + "loss": 1.6787, + "step": 20241 + }, + { + "epoch": 6.213014119091467, + "grad_norm": 0.24482262134552002, + "learning_rate": 3.312962362191652e-05, + "loss": 1.7439, + "step": 20242 + }, + { + "epoch": 6.213321055862492, + "grad_norm": 0.2343531847000122, + "learning_rate": 3.312494462562824e-05, + "loss": 1.7981, + "step": 20243 + }, + { + "epoch": 6.213627992633517, + "grad_norm": 0.2385960817337036, + "learning_rate": 3.3120265796109163e-05, + "loss": 1.7144, + "step": 20244 + }, + { + "epoch": 6.213934929404543, + "grad_norm": 0.21878042817115784, + "learning_rate": 3.3115587133405503e-05, + "loss": 1.7057, + "step": 20245 + }, + { + "epoch": 6.214241866175568, + "grad_norm": 0.23426075279712677, + "learning_rate": 3.311090863756351e-05, + "loss": 1.7372, + "step": 20246 + }, + { + "epoch": 6.214548802946593, + "grad_norm": 0.2369524985551834, + "learning_rate": 3.310623030862942e-05, + "loss": 1.7502, + "step": 20247 + }, + { + "epoch": 6.214855739717618, + "grad_norm": 0.31635788083076477, + "learning_rate": 3.3101552146649474e-05, + "loss": 1.7616, + "step": 20248 + }, + { + "epoch": 6.215162676488643, + "grad_norm": 0.2312999814748764, + "learning_rate": 3.309687415166986e-05, + "loss": 
1.6991, + "step": 20249 + }, + { + "epoch": 6.2154696132596685, + "grad_norm": 0.23423358798027039, + "learning_rate": 3.309219632373688e-05, + "loss": 1.7737, + "step": 20250 + }, + { + "epoch": 6.215776550030694, + "grad_norm": 0.28763437271118164, + "learning_rate": 3.308751866289671e-05, + "loss": 1.7822, + "step": 20251 + }, + { + "epoch": 6.216083486801719, + "grad_norm": 0.20754525065422058, + "learning_rate": 3.30828411691956e-05, + "loss": 1.7427, + "step": 20252 + }, + { + "epoch": 6.216390423572744, + "grad_norm": 0.31858858466148376, + "learning_rate": 3.307816384267975e-05, + "loss": 1.7384, + "step": 20253 + }, + { + "epoch": 6.216697360343769, + "grad_norm": 0.21968062222003937, + "learning_rate": 3.307348668339543e-05, + "loss": 1.6896, + "step": 20254 + }, + { + "epoch": 6.217004297114794, + "grad_norm": 0.21643556654453278, + "learning_rate": 3.306880969138882e-05, + "loss": 1.7353, + "step": 20255 + }, + { + "epoch": 6.21731123388582, + "grad_norm": 0.22141097486019135, + "learning_rate": 3.306413286670616e-05, + "loss": 1.7254, + "step": 20256 + }, + { + "epoch": 6.217618170656845, + "grad_norm": 0.17666983604431152, + "learning_rate": 3.305945620939367e-05, + "loss": 1.7198, + "step": 20257 + }, + { + "epoch": 6.21792510742787, + "grad_norm": 0.25182467699050903, + "learning_rate": 3.3054779719497544e-05, + "loss": 1.7562, + "step": 20258 + }, + { + "epoch": 6.218232044198895, + "grad_norm": 0.23481281101703644, + "learning_rate": 3.305010339706404e-05, + "loss": 1.8293, + "step": 20259 + }, + { + "epoch": 6.21853898096992, + "grad_norm": 0.23981143534183502, + "learning_rate": 3.304542724213933e-05, + "loss": 1.7619, + "step": 20260 + }, + { + "epoch": 6.218845917740945, + "grad_norm": 0.2388351708650589, + "learning_rate": 3.3040751254769665e-05, + "loss": 1.7471, + "step": 20261 + }, + { + "epoch": 6.219152854511971, + "grad_norm": 0.2039698362350464, + "learning_rate": 3.3036075435001216e-05, + "loss": 1.6893, + "step": 20262 + }, + { + 
"epoch": 6.219459791282996, + "grad_norm": 0.218357652425766, + "learning_rate": 3.3031399782880224e-05, + "loss": 1.753, + "step": 20263 + }, + { + "epoch": 6.2197667280540205, + "grad_norm": 0.25466734170913696, + "learning_rate": 3.302672429845288e-05, + "loss": 1.7496, + "step": 20264 + }, + { + "epoch": 6.220073664825046, + "grad_norm": 0.1853330284357071, + "learning_rate": 3.302204898176541e-05, + "loss": 1.7779, + "step": 20265 + }, + { + "epoch": 6.220380601596071, + "grad_norm": 0.24044091999530792, + "learning_rate": 3.3017373832863976e-05, + "loss": 1.8226, + "step": 20266 + }, + { + "epoch": 6.2206875383670965, + "grad_norm": 0.2209070324897766, + "learning_rate": 3.3012698851794835e-05, + "loss": 1.7069, + "step": 20267 + }, + { + "epoch": 6.220994475138122, + "grad_norm": 0.2775282561779022, + "learning_rate": 3.3008024038604135e-05, + "loss": 1.7048, + "step": 20268 + }, + { + "epoch": 6.221301411909146, + "grad_norm": 0.22873717546463013, + "learning_rate": 3.3003349393338116e-05, + "loss": 1.7956, + "step": 20269 + }, + { + "epoch": 6.221608348680172, + "grad_norm": 0.27883464097976685, + "learning_rate": 3.2998674916042946e-05, + "loss": 1.6955, + "step": 20270 + }, + { + "epoch": 6.221915285451197, + "grad_norm": 0.2383071482181549, + "learning_rate": 3.2994000606764865e-05, + "loss": 1.7645, + "step": 20271 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.26280200481414795, + "learning_rate": 3.298932646555003e-05, + "loss": 1.7854, + "step": 20272 + }, + { + "epoch": 6.222529158993248, + "grad_norm": 0.2387673407793045, + "learning_rate": 3.2984652492444625e-05, + "loss": 1.679, + "step": 20273 + }, + { + "epoch": 6.222836095764273, + "grad_norm": 0.2136983871459961, + "learning_rate": 3.297997868749486e-05, + "loss": 1.7313, + "step": 20274 + }, + { + "epoch": 6.223143032535297, + "grad_norm": 0.2629627585411072, + "learning_rate": 3.297530505074692e-05, + "loss": 1.7452, + "step": 20275 + }, + { + "epoch": 6.223449969306323, + 
"grad_norm": 0.22018705308437347, + "learning_rate": 3.2970631582247e-05, + "loss": 1.7368, + "step": 20276 + }, + { + "epoch": 6.223756906077348, + "grad_norm": 0.19277356564998627, + "learning_rate": 3.296595828204128e-05, + "loss": 1.7084, + "step": 20277 + }, + { + "epoch": 6.224063842848373, + "grad_norm": 0.18806682527065277, + "learning_rate": 3.2961285150175944e-05, + "loss": 1.6576, + "step": 20278 + }, + { + "epoch": 6.224370779619399, + "grad_norm": 0.2019709348678589, + "learning_rate": 3.295661218669717e-05, + "loss": 1.7594, + "step": 20279 + }, + { + "epoch": 6.224677716390423, + "grad_norm": 0.19662119448184967, + "learning_rate": 3.295193939165114e-05, + "loss": 1.6946, + "step": 20280 + }, + { + "epoch": 6.2249846531614486, + "grad_norm": 0.1880662590265274, + "learning_rate": 3.294726676508404e-05, + "loss": 1.7232, + "step": 20281 + }, + { + "epoch": 6.225291589932474, + "grad_norm": 0.23242273926734924, + "learning_rate": 3.294259430704206e-05, + "loss": 1.7331, + "step": 20282 + }, + { + "epoch": 6.225598526703499, + "grad_norm": 0.19915202260017395, + "learning_rate": 3.293792201757134e-05, + "loss": 1.7844, + "step": 20283 + }, + { + "epoch": 6.225905463474525, + "grad_norm": 0.1845373958349228, + "learning_rate": 3.2933249896718097e-05, + "loss": 1.6803, + "step": 20284 + }, + { + "epoch": 6.226212400245549, + "grad_norm": 0.19340910017490387, + "learning_rate": 3.292857794452846e-05, + "loss": 1.6929, + "step": 20285 + }, + { + "epoch": 6.226519337016574, + "grad_norm": 0.21429216861724854, + "learning_rate": 3.292390616104863e-05, + "loss": 1.6833, + "step": 20286 + }, + { + "epoch": 6.2268262737876, + "grad_norm": 0.2267037034034729, + "learning_rate": 3.291923454632476e-05, + "loss": 1.7271, + "step": 20287 + }, + { + "epoch": 6.227133210558625, + "grad_norm": 0.23121988773345947, + "learning_rate": 3.2914563100403054e-05, + "loss": 1.8443, + "step": 20288 + }, + { + "epoch": 6.22744014732965, + "grad_norm": 0.20980899035930634, + 
"learning_rate": 3.290989182332964e-05, + "loss": 1.6907, + "step": 20289 + }, + { + "epoch": 6.227747084100676, + "grad_norm": 0.28162500262260437, + "learning_rate": 3.290522071515067e-05, + "loss": 1.7497, + "step": 20290 + }, + { + "epoch": 6.2280540208717, + "grad_norm": 0.2163640707731247, + "learning_rate": 3.290054977591234e-05, + "loss": 1.736, + "step": 20291 + }, + { + "epoch": 6.2283609576427255, + "grad_norm": 0.19144479930400848, + "learning_rate": 3.289587900566079e-05, + "loss": 1.7222, + "step": 20292 + }, + { + "epoch": 6.228667894413751, + "grad_norm": 0.24952897429466248, + "learning_rate": 3.2891208404442216e-05, + "loss": 1.7095, + "step": 20293 + }, + { + "epoch": 6.228974831184776, + "grad_norm": 0.19421981275081635, + "learning_rate": 3.288653797230272e-05, + "loss": 1.7231, + "step": 20294 + }, + { + "epoch": 6.2292817679558015, + "grad_norm": 0.22837944328784943, + "learning_rate": 3.288186770928851e-05, + "loss": 1.7404, + "step": 20295 + }, + { + "epoch": 6.229588704726826, + "grad_norm": 0.2292151004076004, + "learning_rate": 3.2877197615445685e-05, + "loss": 1.6999, + "step": 20296 + }, + { + "epoch": 6.229895641497851, + "grad_norm": 0.18376365303993225, + "learning_rate": 3.2872527690820456e-05, + "loss": 1.681, + "step": 20297 + }, + { + "epoch": 6.230202578268877, + "grad_norm": 0.21331918239593506, + "learning_rate": 3.286785793545893e-05, + "loss": 1.7362, + "step": 20298 + }, + { + "epoch": 6.230509515039902, + "grad_norm": 0.21247150003910065, + "learning_rate": 3.286318834940729e-05, + "loss": 1.7816, + "step": 20299 + }, + { + "epoch": 6.230816451810927, + "grad_norm": 0.19166043400764465, + "learning_rate": 3.285851893271165e-05, + "loss": 1.7209, + "step": 20300 + }, + { + "epoch": 6.231123388581952, + "grad_norm": 0.2139919251203537, + "learning_rate": 3.2853849685418195e-05, + "loss": 1.6946, + "step": 20301 + }, + { + "epoch": 6.231430325352977, + "grad_norm": 0.20296575129032135, + "learning_rate": 
3.284918060757303e-05, + "loss": 1.6829, + "step": 20302 + }, + { + "epoch": 6.231737262124002, + "grad_norm": 0.2465996891260147, + "learning_rate": 3.2844511699222314e-05, + "loss": 1.751, + "step": 20303 + }, + { + "epoch": 6.232044198895028, + "grad_norm": 0.23327109217643738, + "learning_rate": 3.283984296041219e-05, + "loss": 1.736, + "step": 20304 + }, + { + "epoch": 6.232351135666053, + "grad_norm": 0.24316997826099396, + "learning_rate": 3.2835174391188806e-05, + "loss": 1.7187, + "step": 20305 + }, + { + "epoch": 6.232658072437078, + "grad_norm": 0.25280308723449707, + "learning_rate": 3.2830505991598294e-05, + "loss": 1.7087, + "step": 20306 + }, + { + "epoch": 6.232965009208103, + "grad_norm": 0.19143202900886536, + "learning_rate": 3.282583776168676e-05, + "loss": 1.674, + "step": 20307 + }, + { + "epoch": 6.233271945979128, + "grad_norm": 0.2667979598045349, + "learning_rate": 3.282116970150038e-05, + "loss": 1.7978, + "step": 20308 + }, + { + "epoch": 6.2335788827501535, + "grad_norm": 0.18397411704063416, + "learning_rate": 3.281650181108526e-05, + "loss": 1.7669, + "step": 20309 + }, + { + "epoch": 6.233885819521179, + "grad_norm": 0.2842588722705841, + "learning_rate": 3.281183409048756e-05, + "loss": 1.8238, + "step": 20310 + }, + { + "epoch": 6.234192756292204, + "grad_norm": 0.20290467143058777, + "learning_rate": 3.280716653975336e-05, + "loss": 1.7317, + "step": 20311 + }, + { + "epoch": 6.234499693063229, + "grad_norm": 0.224524587392807, + "learning_rate": 3.280249915892885e-05, + "loss": 1.8166, + "step": 20312 + }, + { + "epoch": 6.234806629834254, + "grad_norm": 0.28204405307769775, + "learning_rate": 3.2797831948060096e-05, + "loss": 1.7435, + "step": 20313 + }, + { + "epoch": 6.235113566605279, + "grad_norm": 0.2101798951625824, + "learning_rate": 3.2793164907193264e-05, + "loss": 1.6747, + "step": 20314 + }, + { + "epoch": 6.235420503376305, + "grad_norm": 0.1961289346218109, + "learning_rate": 3.278849803637445e-05, + "loss": 1.7131, 
+ "step": 20315 + }, + { + "epoch": 6.23572744014733, + "grad_norm": 0.30541354417800903, + "learning_rate": 3.27838313356498e-05, + "loss": 1.8036, + "step": 20316 + }, + { + "epoch": 6.236034376918354, + "grad_norm": 0.21517200767993927, + "learning_rate": 3.277916480506541e-05, + "loss": 1.7684, + "step": 20317 + }, + { + "epoch": 6.23634131368938, + "grad_norm": 0.22871750593185425, + "learning_rate": 3.2774498444667426e-05, + "loss": 1.7545, + "step": 20318 + }, + { + "epoch": 6.236648250460405, + "grad_norm": 0.24596424400806427, + "learning_rate": 3.276983225450192e-05, + "loss": 1.6705, + "step": 20319 + }, + { + "epoch": 6.23695518723143, + "grad_norm": 0.19123119115829468, + "learning_rate": 3.2765166234615044e-05, + "loss": 1.7402, + "step": 20320 + }, + { + "epoch": 6.237262124002456, + "grad_norm": 0.25287121534347534, + "learning_rate": 3.276050038505288e-05, + "loss": 1.741, + "step": 20321 + }, + { + "epoch": 6.237569060773481, + "grad_norm": 0.19741536676883698, + "learning_rate": 3.275583470586158e-05, + "loss": 1.736, + "step": 20322 + }, + { + "epoch": 6.2378759975445055, + "grad_norm": 0.24529922008514404, + "learning_rate": 3.275116919708723e-05, + "loss": 1.6696, + "step": 20323 + }, + { + "epoch": 6.238182934315531, + "grad_norm": 0.25428420305252075, + "learning_rate": 3.274650385877591e-05, + "loss": 1.696, + "step": 20324 + }, + { + "epoch": 6.238489871086556, + "grad_norm": 0.19502994418144226, + "learning_rate": 3.274183869097377e-05, + "loss": 1.6976, + "step": 20325 + }, + { + "epoch": 6.2387968078575815, + "grad_norm": 0.23710335791110992, + "learning_rate": 3.273717369372688e-05, + "loss": 1.7395, + "step": 20326 + }, + { + "epoch": 6.239103744628607, + "grad_norm": 0.20904341340065002, + "learning_rate": 3.273250886708138e-05, + "loss": 1.7455, + "step": 20327 + }, + { + "epoch": 6.239410681399631, + "grad_norm": 0.2112383097410202, + "learning_rate": 3.272784421108332e-05, + "loss": 1.7401, + "step": 20328 + }, + { + "epoch": 
6.239717618170657, + "grad_norm": 0.2310914695262909, + "learning_rate": 3.272317972577886e-05, + "loss": 1.8049, + "step": 20329 + }, + { + "epoch": 6.240024554941682, + "grad_norm": 0.18222108483314514, + "learning_rate": 3.271851541121404e-05, + "loss": 1.7119, + "step": 20330 + }, + { + "epoch": 6.240331491712707, + "grad_norm": 0.18739092350006104, + "learning_rate": 3.2713851267434984e-05, + "loss": 1.744, + "step": 20331 + }, + { + "epoch": 6.240638428483733, + "grad_norm": 0.17722012102603912, + "learning_rate": 3.2709187294487775e-05, + "loss": 1.7054, + "step": 20332 + }, + { + "epoch": 6.240945365254758, + "grad_norm": 0.18650192022323608, + "learning_rate": 3.270452349241854e-05, + "loss": 1.7272, + "step": 20333 + }, + { + "epoch": 6.241252302025782, + "grad_norm": 0.2004886120557785, + "learning_rate": 3.269985986127331e-05, + "loss": 1.6777, + "step": 20334 + }, + { + "epoch": 6.241559238796808, + "grad_norm": 0.1855446845293045, + "learning_rate": 3.269519640109823e-05, + "loss": 1.6823, + "step": 20335 + }, + { + "epoch": 6.241866175567833, + "grad_norm": 0.1950632780790329, + "learning_rate": 3.269053311193934e-05, + "loss": 1.7052, + "step": 20336 + }, + { + "epoch": 6.242173112338858, + "grad_norm": 0.19386698305606842, + "learning_rate": 3.268586999384276e-05, + "loss": 1.7431, + "step": 20337 + }, + { + "epoch": 6.242480049109884, + "grad_norm": 0.2266446053981781, + "learning_rate": 3.268120704685454e-05, + "loss": 1.735, + "step": 20338 + }, + { + "epoch": 6.242786985880908, + "grad_norm": 0.24133828282356262, + "learning_rate": 3.2676544271020814e-05, + "loss": 1.7707, + "step": 20339 + }, + { + "epoch": 6.2430939226519335, + "grad_norm": 0.22397162020206451, + "learning_rate": 3.267188166638763e-05, + "loss": 1.6943, + "step": 20340 + }, + { + "epoch": 6.243400859422959, + "grad_norm": 0.1614205688238144, + "learning_rate": 3.266721923300104e-05, + "loss": 1.6801, + "step": 20341 + }, + { + "epoch": 6.243707796193984, + "grad_norm": 
0.22376522421836853, + "learning_rate": 3.2662556970907166e-05, + "loss": 1.6933, + "step": 20342 + }, + { + "epoch": 6.2440147329650095, + "grad_norm": 0.18614265322685242, + "learning_rate": 3.265789488015205e-05, + "loss": 1.7396, + "step": 20343 + }, + { + "epoch": 6.244321669736034, + "grad_norm": 0.2385358214378357, + "learning_rate": 3.265323296078181e-05, + "loss": 1.7782, + "step": 20344 + }, + { + "epoch": 6.244628606507059, + "grad_norm": 0.24316444993019104, + "learning_rate": 3.264857121284246e-05, + "loss": 1.7443, + "step": 20345 + }, + { + "epoch": 6.244935543278085, + "grad_norm": 0.184532031416893, + "learning_rate": 3.264390963638012e-05, + "loss": 1.7603, + "step": 20346 + }, + { + "epoch": 6.24524248004911, + "grad_norm": 0.2018461376428604, + "learning_rate": 3.2639248231440825e-05, + "loss": 1.7289, + "step": 20347 + }, + { + "epoch": 6.245549416820135, + "grad_norm": 0.23732338845729828, + "learning_rate": 3.263458699807066e-05, + "loss": 1.7924, + "step": 20348 + }, + { + "epoch": 6.245856353591161, + "grad_norm": 0.19645710289478302, + "learning_rate": 3.2629925936315674e-05, + "loss": 1.6855, + "step": 20349 + }, + { + "epoch": 6.246163290362185, + "grad_norm": 0.20730608701705933, + "learning_rate": 3.262526504622196e-05, + "loss": 1.7238, + "step": 20350 + }, + { + "epoch": 6.24647022713321, + "grad_norm": 0.21139587461948395, + "learning_rate": 3.2620604327835545e-05, + "loss": 1.7173, + "step": 20351 + }, + { + "epoch": 6.246777163904236, + "grad_norm": 0.22644877433776855, + "learning_rate": 3.261594378120252e-05, + "loss": 1.7976, + "step": 20352 + }, + { + "epoch": 6.247084100675261, + "grad_norm": 0.23719535768032074, + "learning_rate": 3.2611283406368906e-05, + "loss": 1.7549, + "step": 20353 + }, + { + "epoch": 6.247391037446286, + "grad_norm": 0.2046387791633606, + "learning_rate": 3.2606623203380807e-05, + "loss": 1.7343, + "step": 20354 + }, + { + "epoch": 6.247697974217311, + "grad_norm": 0.19325366616249084, + 
"learning_rate": 3.260196317228422e-05, + "loss": 1.7352, + "step": 20355 + }, + { + "epoch": 6.248004910988336, + "grad_norm": 0.2315458059310913, + "learning_rate": 3.259730331312526e-05, + "loss": 1.7838, + "step": 20356 + }, + { + "epoch": 6.2483118477593615, + "grad_norm": 0.24549536406993866, + "learning_rate": 3.2592643625949956e-05, + "loss": 1.7418, + "step": 20357 + }, + { + "epoch": 6.248618784530387, + "grad_norm": 0.2702246606349945, + "learning_rate": 3.258798411080432e-05, + "loss": 1.7651, + "step": 20358 + }, + { + "epoch": 6.248925721301412, + "grad_norm": 0.20515258610248566, + "learning_rate": 3.2583324767734444e-05, + "loss": 1.6866, + "step": 20359 + }, + { + "epoch": 6.249232658072437, + "grad_norm": 0.2696690261363983, + "learning_rate": 3.257866559678635e-05, + "loss": 1.7446, + "step": 20360 + }, + { + "epoch": 6.249539594843462, + "grad_norm": 0.19707174599170685, + "learning_rate": 3.2574006598006114e-05, + "loss": 1.6835, + "step": 20361 + }, + { + "epoch": 6.249846531614487, + "grad_norm": 0.23478952050209045, + "learning_rate": 3.256934777143974e-05, + "loss": 1.7344, + "step": 20362 + }, + { + "epoch": 6.250153468385513, + "grad_norm": 0.24214082956314087, + "learning_rate": 3.2564689117133306e-05, + "loss": 1.722, + "step": 20363 + }, + { + "epoch": 6.250460405156538, + "grad_norm": 0.18361221253871918, + "learning_rate": 3.256003063513281e-05, + "loss": 1.7336, + "step": 20364 + }, + { + "epoch": 6.250767341927563, + "grad_norm": 0.18548928201198578, + "learning_rate": 3.255537232548433e-05, + "loss": 1.6586, + "step": 20365 + }, + { + "epoch": 6.251074278698588, + "grad_norm": 0.2121812105178833, + "learning_rate": 3.2550714188233874e-05, + "loss": 1.7273, + "step": 20366 + }, + { + "epoch": 6.251381215469613, + "grad_norm": 0.2351878583431244, + "learning_rate": 3.25460562234275e-05, + "loss": 1.7101, + "step": 20367 + }, + { + "epoch": 6.2516881522406385, + "grad_norm": 0.20723144710063934, + "learning_rate": 
3.2541398431111216e-05, + "loss": 1.7042, + "step": 20368 + }, + { + "epoch": 6.251995089011664, + "grad_norm": 0.19093643128871918, + "learning_rate": 3.2536740811331084e-05, + "loss": 1.7585, + "step": 20369 + }, + { + "epoch": 6.252302025782689, + "grad_norm": 0.27191361784935, + "learning_rate": 3.2532083364133094e-05, + "loss": 1.7734, + "step": 20370 + }, + { + "epoch": 6.252608962553714, + "grad_norm": 0.21019349992275238, + "learning_rate": 3.2527426089563306e-05, + "loss": 1.7015, + "step": 20371 + }, + { + "epoch": 6.252915899324739, + "grad_norm": 0.2300454080104828, + "learning_rate": 3.2522768987667744e-05, + "loss": 1.7311, + "step": 20372 + }, + { + "epoch": 6.253222836095764, + "grad_norm": 0.24723999202251434, + "learning_rate": 3.25181120584924e-05, + "loss": 1.674, + "step": 20373 + }, + { + "epoch": 6.25352977286679, + "grad_norm": 0.20302192866802216, + "learning_rate": 3.251345530208335e-05, + "loss": 1.6999, + "step": 20374 + }, + { + "epoch": 6.253836709637815, + "grad_norm": 0.25393861532211304, + "learning_rate": 3.250879871848655e-05, + "loss": 1.6761, + "step": 20375 + }, + { + "epoch": 6.25414364640884, + "grad_norm": 0.1879536211490631, + "learning_rate": 3.2504142307748064e-05, + "loss": 1.7233, + "step": 20376 + }, + { + "epoch": 6.254450583179865, + "grad_norm": 0.22197771072387695, + "learning_rate": 3.24994860699139e-05, + "loss": 1.6994, + "step": 20377 + }, + { + "epoch": 6.25475751995089, + "grad_norm": 0.24946242570877075, + "learning_rate": 3.249483000503008e-05, + "loss": 1.8488, + "step": 20378 + }, + { + "epoch": 6.255064456721915, + "grad_norm": 0.25218987464904785, + "learning_rate": 3.2490174113142594e-05, + "loss": 1.7947, + "step": 20379 + }, + { + "epoch": 6.255371393492941, + "grad_norm": 0.23970970511436462, + "learning_rate": 3.248551839429749e-05, + "loss": 1.785, + "step": 20380 + }, + { + "epoch": 6.255678330263966, + "grad_norm": 0.243649423122406, + "learning_rate": 3.248086284854074e-05, + "loss": 1.8089, + 
"step": 20381 + }, + { + "epoch": 6.2559852670349905, + "grad_norm": 0.18813125789165497, + "learning_rate": 3.247620747591838e-05, + "loss": 1.6892, + "step": 20382 + }, + { + "epoch": 6.256292203806016, + "grad_norm": 0.2495514154434204, + "learning_rate": 3.2471552276476404e-05, + "loss": 1.7573, + "step": 20383 + }, + { + "epoch": 6.256599140577041, + "grad_norm": 0.200107604265213, + "learning_rate": 3.2466897250260835e-05, + "loss": 1.7292, + "step": 20384 + }, + { + "epoch": 6.2569060773480665, + "grad_norm": 0.25782206654548645, + "learning_rate": 3.246224239731765e-05, + "loss": 1.8533, + "step": 20385 + }, + { + "epoch": 6.257213014119092, + "grad_norm": 0.1966158151626587, + "learning_rate": 3.245758771769288e-05, + "loss": 1.648, + "step": 20386 + }, + { + "epoch": 6.257519950890116, + "grad_norm": 0.23248116672039032, + "learning_rate": 3.245293321143249e-05, + "loss": 1.7277, + "step": 20387 + }, + { + "epoch": 6.257826887661142, + "grad_norm": 0.26347780227661133, + "learning_rate": 3.244827887858251e-05, + "loss": 1.7429, + "step": 20388 + }, + { + "epoch": 6.258133824432167, + "grad_norm": 0.20794285833835602, + "learning_rate": 3.244362471918894e-05, + "loss": 1.7358, + "step": 20389 + }, + { + "epoch": 6.258440761203192, + "grad_norm": 0.200898677110672, + "learning_rate": 3.243897073329774e-05, + "loss": 1.6661, + "step": 20390 + }, + { + "epoch": 6.258747697974218, + "grad_norm": 0.20945283770561218, + "learning_rate": 3.2434316920954935e-05, + "loss": 1.7036, + "step": 20391 + }, + { + "epoch": 6.259054634745242, + "grad_norm": 0.3154161274433136, + "learning_rate": 3.242966328220649e-05, + "loss": 1.8174, + "step": 20392 + }, + { + "epoch": 6.259361571516267, + "grad_norm": 0.19321799278259277, + "learning_rate": 3.242500981709843e-05, + "loss": 1.6823, + "step": 20393 + }, + { + "epoch": 6.259668508287293, + "grad_norm": 0.22610130906105042, + "learning_rate": 3.2420356525676696e-05, + "loss": 1.6865, + "step": 20394 + }, + { + "epoch": 
6.259975445058318, + "grad_norm": 0.19190505146980286, + "learning_rate": 3.241570340798734e-05, + "loss": 1.6663, + "step": 20395 + }, + { + "epoch": 6.260282381829343, + "grad_norm": 0.21956418454647064, + "learning_rate": 3.2411050464076276e-05, + "loss": 1.7279, + "step": 20396 + }, + { + "epoch": 6.260589318600369, + "grad_norm": 0.2448553591966629, + "learning_rate": 3.240639769398956e-05, + "loss": 1.7438, + "step": 20397 + }, + { + "epoch": 6.260896255371393, + "grad_norm": 0.19194214046001434, + "learning_rate": 3.2401745097773096e-05, + "loss": 1.7429, + "step": 20398 + }, + { + "epoch": 6.2612031921424185, + "grad_norm": 0.2567521333694458, + "learning_rate": 3.239709267547291e-05, + "loss": 1.7051, + "step": 20399 + }, + { + "epoch": 6.261510128913444, + "grad_norm": 0.18335886299610138, + "learning_rate": 3.239244042713498e-05, + "loss": 1.6828, + "step": 20400 + }, + { + "epoch": 6.261817065684469, + "grad_norm": 0.20112362504005432, + "learning_rate": 3.238778835280527e-05, + "loss": 1.6887, + "step": 20401 + }, + { + "epoch": 6.2621240024554945, + "grad_norm": 0.17095179855823517, + "learning_rate": 3.238313645252975e-05, + "loss": 1.7202, + "step": 20402 + }, + { + "epoch": 6.262430939226519, + "grad_norm": 0.24681979417800903, + "learning_rate": 3.237848472635442e-05, + "loss": 1.7196, + "step": 20403 + }, + { + "epoch": 6.262737875997544, + "grad_norm": 0.2022300660610199, + "learning_rate": 3.237383317432522e-05, + "loss": 1.7265, + "step": 20404 + }, + { + "epoch": 6.26304481276857, + "grad_norm": 0.2900621294975281, + "learning_rate": 3.236918179648813e-05, + "loss": 1.7051, + "step": 20405 + }, + { + "epoch": 6.263351749539595, + "grad_norm": 0.37675586342811584, + "learning_rate": 3.2364530592889135e-05, + "loss": 1.7747, + "step": 20406 + }, + { + "epoch": 6.26365868631062, + "grad_norm": 0.19033703207969666, + "learning_rate": 3.235987956357416e-05, + "loss": 1.7529, + "step": 20407 + }, + { + "epoch": 6.263965623081646, + "grad_norm": 
0.2877013385295868, + "learning_rate": 3.235522870858922e-05, + "loss": 1.6942, + "step": 20408 + }, + { + "epoch": 6.26427255985267, + "grad_norm": 0.22717125713825226, + "learning_rate": 3.235057802798023e-05, + "loss": 1.7302, + "step": 20409 + }, + { + "epoch": 6.264579496623695, + "grad_norm": 0.2571920156478882, + "learning_rate": 3.2345927521793185e-05, + "loss": 1.6782, + "step": 20410 + }, + { + "epoch": 6.264886433394721, + "grad_norm": 0.43085625767707825, + "learning_rate": 3.234127719007403e-05, + "loss": 1.7946, + "step": 20411 + }, + { + "epoch": 6.265193370165746, + "grad_norm": 0.19355928897857666, + "learning_rate": 3.2336627032868726e-05, + "loss": 1.7288, + "step": 20412 + }, + { + "epoch": 6.265500306936771, + "grad_norm": 0.24871474504470825, + "learning_rate": 3.233197705022322e-05, + "loss": 1.6862, + "step": 20413 + }, + { + "epoch": 6.265807243707796, + "grad_norm": 0.26919320225715637, + "learning_rate": 3.232732724218348e-05, + "loss": 1.8061, + "step": 20414 + }, + { + "epoch": 6.266114180478821, + "grad_norm": 0.21714363992214203, + "learning_rate": 3.2322677608795436e-05, + "loss": 1.7036, + "step": 20415 + }, + { + "epoch": 6.2664211172498465, + "grad_norm": 0.24496719241142273, + "learning_rate": 3.231802815010506e-05, + "loss": 1.7334, + "step": 20416 + }, + { + "epoch": 6.266728054020872, + "grad_norm": 0.22501519322395325, + "learning_rate": 3.231337886615831e-05, + "loss": 1.7545, + "step": 20417 + }, + { + "epoch": 6.267034990791897, + "grad_norm": 0.2683655917644501, + "learning_rate": 3.23087297570011e-05, + "loss": 1.7235, + "step": 20418 + }, + { + "epoch": 6.267341927562922, + "grad_norm": 0.23341359198093414, + "learning_rate": 3.230408082267938e-05, + "loss": 1.7389, + "step": 20419 + }, + { + "epoch": 6.267648864333947, + "grad_norm": 0.2914128601551056, + "learning_rate": 3.229943206323913e-05, + "loss": 1.7223, + "step": 20420 + }, + { + "epoch": 6.267955801104972, + "grad_norm": 0.2072528451681137, + "learning_rate": 
3.229478347872625e-05, + "loss": 1.7422, + "step": 20421 + }, + { + "epoch": 6.268262737875998, + "grad_norm": 0.22678662836551666, + "learning_rate": 3.229013506918671e-05, + "loss": 1.6973, + "step": 20422 + }, + { + "epoch": 6.268569674647023, + "grad_norm": 0.1928883194923401, + "learning_rate": 3.228548683466643e-05, + "loss": 1.7235, + "step": 20423 + }, + { + "epoch": 6.268876611418047, + "grad_norm": 0.2402963638305664, + "learning_rate": 3.2280838775211345e-05, + "loss": 1.7587, + "step": 20424 + }, + { + "epoch": 6.269183548189073, + "grad_norm": 0.20416294038295746, + "learning_rate": 3.227619089086742e-05, + "loss": 1.7591, + "step": 20425 + }, + { + "epoch": 6.269490484960098, + "grad_norm": 0.20308947563171387, + "learning_rate": 3.227154318168053e-05, + "loss": 1.7264, + "step": 20426 + }, + { + "epoch": 6.269797421731123, + "grad_norm": 0.18733863532543182, + "learning_rate": 3.226689564769667e-05, + "loss": 1.6943, + "step": 20427 + }, + { + "epoch": 6.270104358502149, + "grad_norm": 0.183793842792511, + "learning_rate": 3.226224828896173e-05, + "loss": 1.7082, + "step": 20428 + }, + { + "epoch": 6.270411295273174, + "grad_norm": 0.20471547544002533, + "learning_rate": 3.225760110552165e-05, + "loss": 1.7352, + "step": 20429 + }, + { + "epoch": 6.2707182320441985, + "grad_norm": 0.23386713862419128, + "learning_rate": 3.225295409742234e-05, + "loss": 1.7666, + "step": 20430 + }, + { + "epoch": 6.271025168815224, + "grad_norm": 0.2024994194507599, + "learning_rate": 3.224830726470976e-05, + "loss": 1.6573, + "step": 20431 + }, + { + "epoch": 6.271332105586249, + "grad_norm": 0.2352776825428009, + "learning_rate": 3.2243660607429805e-05, + "loss": 1.7884, + "step": 20432 + }, + { + "epoch": 6.2716390423572745, + "grad_norm": 0.19755585491657257, + "learning_rate": 3.223901412562841e-05, + "loss": 1.6964, + "step": 20433 + }, + { + "epoch": 6.2719459791283, + "grad_norm": 0.25833839178085327, + "learning_rate": 3.223436781935148e-05, + "loss": 1.715, 
+ "step": 20434 + }, + { + "epoch": 6.272252915899324, + "grad_norm": 0.2110220193862915, + "learning_rate": 3.222972168864493e-05, + "loss": 1.7617, + "step": 20435 + }, + { + "epoch": 6.27255985267035, + "grad_norm": 0.23262515664100647, + "learning_rate": 3.2225075733554685e-05, + "loss": 1.7616, + "step": 20436 + }, + { + "epoch": 6.272866789441375, + "grad_norm": 0.1926576942205429, + "learning_rate": 3.222042995412669e-05, + "loss": 1.6956, + "step": 20437 + }, + { + "epoch": 6.2731737262124, + "grad_norm": 0.20662757754325867, + "learning_rate": 3.22157843504068e-05, + "loss": 1.703, + "step": 20438 + }, + { + "epoch": 6.273480662983426, + "grad_norm": 0.22137406468391418, + "learning_rate": 3.2211138922440975e-05, + "loss": 1.6961, + "step": 20439 + }, + { + "epoch": 6.273787599754451, + "grad_norm": 0.25777003169059753, + "learning_rate": 3.2206493670275086e-05, + "loss": 1.704, + "step": 20440 + }, + { + "epoch": 6.274094536525475, + "grad_norm": 0.20540094375610352, + "learning_rate": 3.2201848593955046e-05, + "loss": 1.6759, + "step": 20441 + }, + { + "epoch": 6.274401473296501, + "grad_norm": 0.2447255402803421, + "learning_rate": 3.21972036935268e-05, + "loss": 1.7379, + "step": 20442 + }, + { + "epoch": 6.274708410067526, + "grad_norm": 0.2017194777727127, + "learning_rate": 3.219255896903619e-05, + "loss": 1.6518, + "step": 20443 + }, + { + "epoch": 6.2750153468385514, + "grad_norm": 0.22742003202438354, + "learning_rate": 3.2187914420529174e-05, + "loss": 1.7568, + "step": 20444 + }, + { + "epoch": 6.275322283609577, + "grad_norm": 0.2065356969833374, + "learning_rate": 3.218327004805161e-05, + "loss": 1.643, + "step": 20445 + }, + { + "epoch": 6.275629220380601, + "grad_norm": 0.18083053827285767, + "learning_rate": 3.217862585164942e-05, + "loss": 1.77, + "step": 20446 + }, + { + "epoch": 6.275936157151627, + "grad_norm": 0.2175968736410141, + "learning_rate": 3.2173981831368484e-05, + "loss": 1.738, + "step": 20447 + }, + { + "epoch": 
6.276243093922652, + "grad_norm": 0.17635080218315125, + "learning_rate": 3.216933798725473e-05, + "loss": 1.7109, + "step": 20448 + }, + { + "epoch": 6.276550030693677, + "grad_norm": 0.22289423644542694, + "learning_rate": 3.216469431935401e-05, + "loss": 1.7853, + "step": 20449 + }, + { + "epoch": 6.276856967464703, + "grad_norm": 0.21214549243450165, + "learning_rate": 3.216005082771225e-05, + "loss": 1.8196, + "step": 20450 + }, + { + "epoch": 6.277163904235728, + "grad_norm": 0.21992212533950806, + "learning_rate": 3.215540751237531e-05, + "loss": 1.7445, + "step": 20451 + }, + { + "epoch": 6.277470841006752, + "grad_norm": 0.16256563365459442, + "learning_rate": 3.2150764373389096e-05, + "loss": 1.6582, + "step": 20452 + }, + { + "epoch": 6.277777777777778, + "grad_norm": 0.1885976791381836, + "learning_rate": 3.214612141079949e-05, + "loss": 1.7491, + "step": 20453 + }, + { + "epoch": 6.278084714548803, + "grad_norm": 0.24101774394512177, + "learning_rate": 3.2141478624652386e-05, + "loss": 1.7476, + "step": 20454 + }, + { + "epoch": 6.278391651319828, + "grad_norm": 0.23378998041152954, + "learning_rate": 3.213683601499364e-05, + "loss": 1.7575, + "step": 20455 + }, + { + "epoch": 6.278698588090854, + "grad_norm": 0.2032867670059204, + "learning_rate": 3.213219358186917e-05, + "loss": 1.6999, + "step": 20456 + }, + { + "epoch": 6.279005524861878, + "grad_norm": 0.21332181990146637, + "learning_rate": 3.2127551325324836e-05, + "loss": 1.6634, + "step": 20457 + }, + { + "epoch": 6.2793124616329035, + "grad_norm": 0.23767098784446716, + "learning_rate": 3.2122909245406494e-05, + "loss": 1.8023, + "step": 20458 + }, + { + "epoch": 6.279619398403929, + "grad_norm": 0.19987638294696808, + "learning_rate": 3.211826734216007e-05, + "loss": 1.6848, + "step": 20459 + }, + { + "epoch": 6.279926335174954, + "grad_norm": 0.22169579565525055, + "learning_rate": 3.2113625615631385e-05, + "loss": 1.7599, + "step": 20460 + }, + { + "epoch": 6.2802332719459795, + 
"grad_norm": 0.1768191009759903, + "learning_rate": 3.210898406586634e-05, + "loss": 1.6894, + "step": 20461 + }, + { + "epoch": 6.280540208717004, + "grad_norm": 0.1923041045665741, + "learning_rate": 3.21043426929108e-05, + "loss": 1.7379, + "step": 20462 + }, + { + "epoch": 6.280847145488029, + "grad_norm": 0.1836252212524414, + "learning_rate": 3.2099701496810644e-05, + "loss": 1.6748, + "step": 20463 + }, + { + "epoch": 6.281154082259055, + "grad_norm": 0.2203192561864853, + "learning_rate": 3.2095060477611705e-05, + "loss": 1.6969, + "step": 20464 + }, + { + "epoch": 6.28146101903008, + "grad_norm": 0.25511759519577026, + "learning_rate": 3.20904196353599e-05, + "loss": 1.7806, + "step": 20465 + }, + { + "epoch": 6.281767955801105, + "grad_norm": 0.19464822113513947, + "learning_rate": 3.208577897010106e-05, + "loss": 1.6784, + "step": 20466 + }, + { + "epoch": 6.28207489257213, + "grad_norm": 0.1949714869260788, + "learning_rate": 3.208113848188105e-05, + "loss": 1.713, + "step": 20467 + }, + { + "epoch": 6.282381829343155, + "grad_norm": 0.22094127535820007, + "learning_rate": 3.207649817074572e-05, + "loss": 1.7397, + "step": 20468 + }, + { + "epoch": 6.28268876611418, + "grad_norm": 0.22343899309635162, + "learning_rate": 3.2071858036740954e-05, + "loss": 1.717, + "step": 20469 + }, + { + "epoch": 6.282995702885206, + "grad_norm": 0.20854893326759338, + "learning_rate": 3.2067218079912584e-05, + "loss": 1.7255, + "step": 20470 + }, + { + "epoch": 6.283302639656231, + "grad_norm": 0.21306286752223969, + "learning_rate": 3.206257830030649e-05, + "loss": 1.7251, + "step": 20471 + }, + { + "epoch": 6.283609576427256, + "grad_norm": 0.24995777010917664, + "learning_rate": 3.20579386979685e-05, + "loss": 1.7892, + "step": 20472 + }, + { + "epoch": 6.283916513198281, + "grad_norm": 0.23720023036003113, + "learning_rate": 3.2053299272944486e-05, + "loss": 1.7843, + "step": 20473 + }, + { + "epoch": 6.284223449969306, + "grad_norm": 0.2042113095521927, + 
"learning_rate": 3.204866002528029e-05, + "loss": 1.7318, + "step": 20474 + }, + { + "epoch": 6.2845303867403315, + "grad_norm": 0.22996367514133453, + "learning_rate": 3.2044020955021735e-05, + "loss": 1.6875, + "step": 20475 + }, + { + "epoch": 6.284837323511357, + "grad_norm": 0.187749981880188, + "learning_rate": 3.203938206221471e-05, + "loss": 1.7297, + "step": 20476 + }, + { + "epoch": 6.285144260282382, + "grad_norm": 0.18279509246349335, + "learning_rate": 3.2034743346905025e-05, + "loss": 1.6858, + "step": 20477 + }, + { + "epoch": 6.285451197053407, + "grad_norm": 0.1871512532234192, + "learning_rate": 3.203010480913855e-05, + "loss": 1.7224, + "step": 20478 + }, + { + "epoch": 6.285758133824432, + "grad_norm": 0.17732922732830048, + "learning_rate": 3.202546644896109e-05, + "loss": 1.6872, + "step": 20479 + }, + { + "epoch": 6.286065070595457, + "grad_norm": 0.21146097779273987, + "learning_rate": 3.2020828266418527e-05, + "loss": 1.797, + "step": 20480 + }, + { + "epoch": 6.286372007366483, + "grad_norm": 0.18914340436458588, + "learning_rate": 3.201619026155666e-05, + "loss": 1.7149, + "step": 20481 + }, + { + "epoch": 6.286678944137508, + "grad_norm": 0.20919133722782135, + "learning_rate": 3.2011552434421364e-05, + "loss": 1.7803, + "step": 20482 + }, + { + "epoch": 6.286985880908533, + "grad_norm": 0.17882505059242249, + "learning_rate": 3.200691478505843e-05, + "loss": 1.757, + "step": 20483 + }, + { + "epoch": 6.287292817679558, + "grad_norm": 0.1850014477968216, + "learning_rate": 3.200227731351373e-05, + "loss": 1.7006, + "step": 20484 + }, + { + "epoch": 6.287599754450583, + "grad_norm": 0.19999323785305023, + "learning_rate": 3.1997640019833056e-05, + "loss": 1.702, + "step": 20485 + }, + { + "epoch": 6.287906691221608, + "grad_norm": 0.20464713871479034, + "learning_rate": 3.1993002904062255e-05, + "loss": 1.7272, + "step": 20486 + }, + { + "epoch": 6.288213627992634, + "grad_norm": 0.2105564922094345, + "learning_rate": 
3.1988365966247154e-05, + "loss": 1.7062, + "step": 20487 + }, + { + "epoch": 6.288520564763659, + "grad_norm": 0.26322871446609497, + "learning_rate": 3.198372920643359e-05, + "loss": 1.7309, + "step": 20488 + }, + { + "epoch": 6.2888275015346835, + "grad_norm": 0.22787201404571533, + "learning_rate": 3.197909262466736e-05, + "loss": 1.7797, + "step": 20489 + }, + { + "epoch": 6.289134438305709, + "grad_norm": 0.21409621834754944, + "learning_rate": 3.1974456220994314e-05, + "loss": 1.8211, + "step": 20490 + }, + { + "epoch": 6.289441375076734, + "grad_norm": 0.2241450846195221, + "learning_rate": 3.196981999546025e-05, + "loss": 1.7255, + "step": 20491 + }, + { + "epoch": 6.2897483118477595, + "grad_norm": 0.23141883313655853, + "learning_rate": 3.1965183948110985e-05, + "loss": 1.7695, + "step": 20492 + }, + { + "epoch": 6.290055248618785, + "grad_norm": 0.209358349442482, + "learning_rate": 3.196054807899236e-05, + "loss": 1.6808, + "step": 20493 + }, + { + "epoch": 6.290362185389809, + "grad_norm": 0.20730538666248322, + "learning_rate": 3.195591238815015e-05, + "loss": 1.6847, + "step": 20494 + }, + { + "epoch": 6.290669122160835, + "grad_norm": 0.2568998634815216, + "learning_rate": 3.195127687563021e-05, + "loss": 1.664, + "step": 20495 + }, + { + "epoch": 6.29097605893186, + "grad_norm": 0.238932803273201, + "learning_rate": 3.1946641541478316e-05, + "loss": 1.7166, + "step": 20496 + }, + { + "epoch": 6.291282995702885, + "grad_norm": 0.235393688082695, + "learning_rate": 3.19420063857403e-05, + "loss": 1.6572, + "step": 20497 + }, + { + "epoch": 6.291589932473911, + "grad_norm": 0.2888807952404022, + "learning_rate": 3.1937371408461944e-05, + "loss": 1.7484, + "step": 20498 + }, + { + "epoch": 6.291896869244935, + "grad_norm": 0.18588709831237793, + "learning_rate": 3.1932736609689096e-05, + "loss": 1.7027, + "step": 20499 + }, + { + "epoch": 6.29220380601596, + "grad_norm": 0.3065604865550995, + "learning_rate": 3.1928101989467514e-05, + "loss": 1.8051, 
+ "step": 20500 + }, + { + "epoch": 6.292510742786986, + "grad_norm": 0.2480497658252716, + "learning_rate": 3.192346754784304e-05, + "loss": 1.7749, + "step": 20501 + }, + { + "epoch": 6.292817679558011, + "grad_norm": 0.268686443567276, + "learning_rate": 3.1918833284861436e-05, + "loss": 1.7062, + "step": 20502 + }, + { + "epoch": 6.293124616329036, + "grad_norm": 0.337510883808136, + "learning_rate": 3.191419920056853e-05, + "loss": 1.745, + "step": 20503 + }, + { + "epoch": 6.293431553100062, + "grad_norm": 0.18532821536064148, + "learning_rate": 3.190956529501009e-05, + "loss": 1.7098, + "step": 20504 + }, + { + "epoch": 6.293738489871086, + "grad_norm": 0.27805468440055847, + "learning_rate": 3.1904931568231956e-05, + "loss": 1.7252, + "step": 20505 + }, + { + "epoch": 6.2940454266421115, + "grad_norm": 0.22137443721294403, + "learning_rate": 3.190029802027987e-05, + "loss": 1.7595, + "step": 20506 + }, + { + "epoch": 6.294352363413137, + "grad_norm": 0.23159445822238922, + "learning_rate": 3.189566465119968e-05, + "loss": 1.7503, + "step": 20507 + }, + { + "epoch": 6.294659300184162, + "grad_norm": 0.2089100182056427, + "learning_rate": 3.189103146103712e-05, + "loss": 1.7021, + "step": 20508 + }, + { + "epoch": 6.2949662369551875, + "grad_norm": 0.1985119879245758, + "learning_rate": 3.1886398449838e-05, + "loss": 1.7468, + "step": 20509 + }, + { + "epoch": 6.295273173726212, + "grad_norm": 0.18612028658390045, + "learning_rate": 3.188176561764812e-05, + "loss": 1.6657, + "step": 20510 + }, + { + "epoch": 6.295580110497237, + "grad_norm": 0.22453728318214417, + "learning_rate": 3.1877132964513226e-05, + "loss": 1.7223, + "step": 20511 + }, + { + "epoch": 6.295887047268263, + "grad_norm": 0.270304799079895, + "learning_rate": 3.187250049047916e-05, + "loss": 1.7548, + "step": 20512 + }, + { + "epoch": 6.296193984039288, + "grad_norm": 0.19762152433395386, + "learning_rate": 3.1867868195591643e-05, + "loss": 1.6945, + "step": 20513 + }, + { + "epoch": 
6.296500920810313, + "grad_norm": 0.25173795223236084, + "learning_rate": 3.1863236079896486e-05, + "loss": 1.7303, + "step": 20514 + }, + { + "epoch": 6.296807857581339, + "grad_norm": 0.2073308676481247, + "learning_rate": 3.185860414343945e-05, + "loss": 1.7327, + "step": 20515 + }, + { + "epoch": 6.297114794352363, + "grad_norm": 0.24174070358276367, + "learning_rate": 3.185397238626635e-05, + "loss": 1.7577, + "step": 20516 + }, + { + "epoch": 6.297421731123388, + "grad_norm": 0.1950366348028183, + "learning_rate": 3.1849340808422905e-05, + "loss": 1.7137, + "step": 20517 + }, + { + "epoch": 6.297728667894414, + "grad_norm": 0.23416653275489807, + "learning_rate": 3.1844709409954936e-05, + "loss": 1.7547, + "step": 20518 + }, + { + "epoch": 6.298035604665439, + "grad_norm": 0.1939592808485031, + "learning_rate": 3.184007819090817e-05, + "loss": 1.7215, + "step": 20519 + }, + { + "epoch": 6.298342541436464, + "grad_norm": 0.21807245910167694, + "learning_rate": 3.1835447151328405e-05, + "loss": 1.7021, + "step": 20520 + }, + { + "epoch": 6.298649478207489, + "grad_norm": 0.21653762459754944, + "learning_rate": 3.183081629126138e-05, + "loss": 1.7426, + "step": 20521 + }, + { + "epoch": 6.298956414978514, + "grad_norm": 0.20749153196811676, + "learning_rate": 3.18261856107529e-05, + "loss": 1.7302, + "step": 20522 + }, + { + "epoch": 6.2992633517495396, + "grad_norm": 0.23450545966625214, + "learning_rate": 3.182155510984869e-05, + "loss": 1.7414, + "step": 20523 + }, + { + "epoch": 6.299570288520565, + "grad_norm": 0.17081578075885773, + "learning_rate": 3.181692478859455e-05, + "loss": 1.7017, + "step": 20524 + }, + { + "epoch": 6.29987722529159, + "grad_norm": 0.20244698226451874, + "learning_rate": 3.18122946470362e-05, + "loss": 1.6765, + "step": 20525 + }, + { + "epoch": 6.300184162062616, + "grad_norm": 0.20153406262397766, + "learning_rate": 3.180766468521941e-05, + "loss": 1.7437, + "step": 20526 + }, + { + "epoch": 6.30049109883364, + "grad_norm": 
0.21135647594928741, + "learning_rate": 3.180303490318996e-05, + "loss": 1.7202, + "step": 20527 + }, + { + "epoch": 6.300798035604665, + "grad_norm": 0.20342735946178436, + "learning_rate": 3.1798405300993555e-05, + "loss": 1.7268, + "step": 20528 + }, + { + "epoch": 6.301104972375691, + "grad_norm": 0.21153734624385834, + "learning_rate": 3.1793775878676e-05, + "loss": 1.7455, + "step": 20529 + }, + { + "epoch": 6.301411909146716, + "grad_norm": 0.2197744995355606, + "learning_rate": 3.1789146636283015e-05, + "loss": 1.7876, + "step": 20530 + }, + { + "epoch": 6.301718845917741, + "grad_norm": 0.2236124575138092, + "learning_rate": 3.1784517573860356e-05, + "loss": 1.7454, + "step": 20531 + }, + { + "epoch": 6.302025782688766, + "grad_norm": 0.22071333229541779, + "learning_rate": 3.177988869145376e-05, + "loss": 1.7197, + "step": 20532 + }, + { + "epoch": 6.302332719459791, + "grad_norm": 0.20137591660022736, + "learning_rate": 3.177525998910901e-05, + "loss": 1.7153, + "step": 20533 + }, + { + "epoch": 6.3026396562308165, + "grad_norm": 0.18981720507144928, + "learning_rate": 3.17706314668718e-05, + "loss": 1.6948, + "step": 20534 + }, + { + "epoch": 6.302946593001842, + "grad_norm": 0.20803335309028625, + "learning_rate": 3.176600312478791e-05, + "loss": 1.7454, + "step": 20535 + }, + { + "epoch": 6.303253529772867, + "grad_norm": 0.2224191278219223, + "learning_rate": 3.176137496290305e-05, + "loss": 1.708, + "step": 20536 + }, + { + "epoch": 6.303560466543892, + "grad_norm": 0.21110501885414124, + "learning_rate": 3.175674698126298e-05, + "loss": 1.6976, + "step": 20537 + }, + { + "epoch": 6.303867403314917, + "grad_norm": 0.19902437925338745, + "learning_rate": 3.175211917991342e-05, + "loss": 1.7246, + "step": 20538 + }, + { + "epoch": 6.304174340085942, + "grad_norm": 0.1930927336215973, + "learning_rate": 3.174749155890013e-05, + "loss": 1.7849, + "step": 20539 + }, + { + "epoch": 6.304481276856968, + "grad_norm": 0.19350691139698029, + "learning_rate": 
3.174286411826881e-05, + "loss": 1.7441, + "step": 20540 + }, + { + "epoch": 6.304788213627993, + "grad_norm": 0.18532924354076385, + "learning_rate": 3.173823685806523e-05, + "loss": 1.6675, + "step": 20541 + }, + { + "epoch": 6.305095150399017, + "grad_norm": 0.18890263140201569, + "learning_rate": 3.173360977833508e-05, + "loss": 1.7889, + "step": 20542 + }, + { + "epoch": 6.305402087170043, + "grad_norm": 0.20418904721736908, + "learning_rate": 3.17289828791241e-05, + "loss": 1.8298, + "step": 20543 + }, + { + "epoch": 6.305709023941068, + "grad_norm": 0.2298857718706131, + "learning_rate": 3.172435616047804e-05, + "loss": 1.7889, + "step": 20544 + }, + { + "epoch": 6.306015960712093, + "grad_norm": 0.20661889016628265, + "learning_rate": 3.171972962244258e-05, + "loss": 1.74, + "step": 20545 + }, + { + "epoch": 6.306322897483119, + "grad_norm": 0.17712774872779846, + "learning_rate": 3.1715103265063496e-05, + "loss": 1.72, + "step": 20546 + }, + { + "epoch": 6.306629834254144, + "grad_norm": 0.16776354610919952, + "learning_rate": 3.1710477088386456e-05, + "loss": 1.6715, + "step": 20547 + }, + { + "epoch": 6.3069367710251685, + "grad_norm": 0.21919682621955872, + "learning_rate": 3.170585109245721e-05, + "loss": 1.7232, + "step": 20548 + }, + { + "epoch": 6.307243707796194, + "grad_norm": 0.2026829719543457, + "learning_rate": 3.170122527732144e-05, + "loss": 1.7551, + "step": 20549 + }, + { + "epoch": 6.307550644567219, + "grad_norm": 0.18783780932426453, + "learning_rate": 3.169659964302493e-05, + "loss": 1.7024, + "step": 20550 + }, + { + "epoch": 6.3078575813382445, + "grad_norm": 0.2058420479297638, + "learning_rate": 3.1691974189613316e-05, + "loss": 1.7006, + "step": 20551 + }, + { + "epoch": 6.30816451810927, + "grad_norm": 0.21351832151412964, + "learning_rate": 3.168734891713237e-05, + "loss": 1.7586, + "step": 20552 + }, + { + "epoch": 6.308471454880294, + "grad_norm": 0.19816654920578003, + "learning_rate": 3.168272382562776e-05, + "loss": 1.7532, 
+ "step": 20553 + }, + { + "epoch": 6.30877839165132, + "grad_norm": 0.18253186345100403, + "learning_rate": 3.16780989151452e-05, + "loss": 1.7413, + "step": 20554 + }, + { + "epoch": 6.309085328422345, + "grad_norm": 0.23097483813762665, + "learning_rate": 3.167347418573042e-05, + "loss": 1.7355, + "step": 20555 + }, + { + "epoch": 6.30939226519337, + "grad_norm": 0.1984725296497345, + "learning_rate": 3.166884963742911e-05, + "loss": 1.6754, + "step": 20556 + }, + { + "epoch": 6.309699201964396, + "grad_norm": 0.2385166734457016, + "learning_rate": 3.166422527028696e-05, + "loss": 1.7322, + "step": 20557 + }, + { + "epoch": 6.310006138735421, + "grad_norm": 0.23216524720191956, + "learning_rate": 3.165960108434971e-05, + "loss": 1.7426, + "step": 20558 + }, + { + "epoch": 6.310313075506445, + "grad_norm": 0.22017790377140045, + "learning_rate": 3.165497707966301e-05, + "loss": 1.6977, + "step": 20559 + }, + { + "epoch": 6.310620012277471, + "grad_norm": 0.2934584617614746, + "learning_rate": 3.165035325627257e-05, + "loss": 1.7252, + "step": 20560 + }, + { + "epoch": 6.310926949048496, + "grad_norm": 0.21830198168754578, + "learning_rate": 3.1645729614224126e-05, + "loss": 1.781, + "step": 20561 + }, + { + "epoch": 6.311233885819521, + "grad_norm": 0.3082836866378784, + "learning_rate": 3.1641106153563306e-05, + "loss": 1.8015, + "step": 20562 + }, + { + "epoch": 6.311540822590547, + "grad_norm": 0.22441358864307404, + "learning_rate": 3.163648287433586e-05, + "loss": 1.8058, + "step": 20563 + }, + { + "epoch": 6.311847759361571, + "grad_norm": 0.36623889207839966, + "learning_rate": 3.163185977658744e-05, + "loss": 1.7092, + "step": 20564 + }, + { + "epoch": 6.3121546961325965, + "grad_norm": 0.22231145203113556, + "learning_rate": 3.1627236860363755e-05, + "loss": 1.6432, + "step": 20565 + }, + { + "epoch": 6.312461632903622, + "grad_norm": 0.25871971249580383, + "learning_rate": 3.162261412571047e-05, + "loss": 1.7156, + "step": 20566 + }, + { + "epoch": 
6.312768569674647, + "grad_norm": 0.24574241042137146, + "learning_rate": 3.16179915726733e-05, + "loss": 1.7977, + "step": 20567 + }, + { + "epoch": 6.3130755064456725, + "grad_norm": 0.197379007935524, + "learning_rate": 3.1613369201297895e-05, + "loss": 1.6966, + "step": 20568 + }, + { + "epoch": 6.313382443216697, + "grad_norm": 0.2149469256401062, + "learning_rate": 3.1608747011629975e-05, + "loss": 1.7385, + "step": 20569 + }, + { + "epoch": 6.313689379987722, + "grad_norm": 0.21942345798015594, + "learning_rate": 3.1604125003715174e-05, + "loss": 1.7369, + "step": 20570 + }, + { + "epoch": 6.313996316758748, + "grad_norm": 0.20977036654949188, + "learning_rate": 3.1599503177599197e-05, + "loss": 1.7429, + "step": 20571 + }, + { + "epoch": 6.314303253529773, + "grad_norm": 0.20113405585289001, + "learning_rate": 3.159488153332772e-05, + "loss": 1.7163, + "step": 20572 + }, + { + "epoch": 6.314610190300798, + "grad_norm": 0.22031868994235992, + "learning_rate": 3.1590260070946414e-05, + "loss": 1.7085, + "step": 20573 + }, + { + "epoch": 6.314917127071823, + "grad_norm": 0.24137777090072632, + "learning_rate": 3.158563879050094e-05, + "loss": 1.7169, + "step": 20574 + }, + { + "epoch": 6.315224063842848, + "grad_norm": 0.20265905559062958, + "learning_rate": 3.1581017692036985e-05, + "loss": 1.7466, + "step": 20575 + }, + { + "epoch": 6.315531000613873, + "grad_norm": 0.2997782528400421, + "learning_rate": 3.1576396775600206e-05, + "loss": 1.7287, + "step": 20576 + }, + { + "epoch": 6.315837937384899, + "grad_norm": 0.19672340154647827, + "learning_rate": 3.157177604123628e-05, + "loss": 1.7121, + "step": 20577 + }, + { + "epoch": 6.316144874155924, + "grad_norm": 0.26618507504463196, + "learning_rate": 3.156715548899085e-05, + "loss": 1.6958, + "step": 20578 + }, + { + "epoch": 6.316451810926949, + "grad_norm": 0.18854503333568573, + "learning_rate": 3.156253511890959e-05, + "loss": 1.7751, + "step": 20579 + }, + { + "epoch": 6.316758747697974, + "grad_norm": 
0.2306061089038849, + "learning_rate": 3.155791493103819e-05, + "loss": 1.6853, + "step": 20580 + }, + { + "epoch": 6.317065684468999, + "grad_norm": 0.20650778710842133, + "learning_rate": 3.1553294925422254e-05, + "loss": 1.7021, + "step": 20581 + }, + { + "epoch": 6.3173726212400245, + "grad_norm": 0.19474658370018005, + "learning_rate": 3.1548675102107494e-05, + "loss": 1.7146, + "step": 20582 + }, + { + "epoch": 6.31767955801105, + "grad_norm": 0.2150747925043106, + "learning_rate": 3.154405546113952e-05, + "loss": 1.7473, + "step": 20583 + }, + { + "epoch": 6.317986494782075, + "grad_norm": 0.19304975867271423, + "learning_rate": 3.153943600256402e-05, + "loss": 1.7209, + "step": 20584 + }, + { + "epoch": 6.3182934315531, + "grad_norm": 0.22610948979854584, + "learning_rate": 3.153481672642662e-05, + "loss": 1.717, + "step": 20585 + }, + { + "epoch": 6.318600368324125, + "grad_norm": 0.18705105781555176, + "learning_rate": 3.1530197632773006e-05, + "loss": 1.7326, + "step": 20586 + }, + { + "epoch": 6.31890730509515, + "grad_norm": 0.25632867217063904, + "learning_rate": 3.152557872164878e-05, + "loss": 1.7391, + "step": 20587 + }, + { + "epoch": 6.319214241866176, + "grad_norm": 0.18723119795322418, + "learning_rate": 3.152095999309964e-05, + "loss": 1.7193, + "step": 20588 + }, + { + "epoch": 6.319521178637201, + "grad_norm": 0.1759091317653656, + "learning_rate": 3.1516341447171184e-05, + "loss": 1.7024, + "step": 20589 + }, + { + "epoch": 6.319828115408226, + "grad_norm": 0.1838626265525818, + "learning_rate": 3.1511723083909084e-05, + "loss": 1.7027, + "step": 20590 + }, + { + "epoch": 6.320135052179251, + "grad_norm": 0.2615656554698944, + "learning_rate": 3.1507104903358964e-05, + "loss": 1.7798, + "step": 20591 + }, + { + "epoch": 6.320441988950276, + "grad_norm": 0.18816477060317993, + "learning_rate": 3.150248690556649e-05, + "loss": 1.6778, + "step": 20592 + }, + { + "epoch": 6.320748925721301, + "grad_norm": 0.20011866092681885, + "learning_rate": 
3.149786909057728e-05, + "loss": 1.6653, + "step": 20593 + }, + { + "epoch": 6.321055862492327, + "grad_norm": 0.26681140065193176, + "learning_rate": 3.149325145843696e-05, + "loss": 1.7523, + "step": 20594 + }, + { + "epoch": 6.321362799263352, + "grad_norm": 0.2062411904335022, + "learning_rate": 3.1488634009191177e-05, + "loss": 1.7584, + "step": 20595 + }, + { + "epoch": 6.3216697360343765, + "grad_norm": 0.22355243563652039, + "learning_rate": 3.148401674288556e-05, + "loss": 1.7106, + "step": 20596 + }, + { + "epoch": 6.321976672805402, + "grad_norm": 0.20189255475997925, + "learning_rate": 3.147939965956576e-05, + "loss": 1.6775, + "step": 20597 + }, + { + "epoch": 6.322283609576427, + "grad_norm": 0.23753875494003296, + "learning_rate": 3.147478275927736e-05, + "loss": 1.7661, + "step": 20598 + }, + { + "epoch": 6.3225905463474525, + "grad_norm": 0.18658648431301117, + "learning_rate": 3.147016604206604e-05, + "loss": 1.7562, + "step": 20599 + }, + { + "epoch": 6.322897483118478, + "grad_norm": 0.2610020637512207, + "learning_rate": 3.146554950797738e-05, + "loss": 1.7217, + "step": 20600 + }, + { + "epoch": 6.323204419889503, + "grad_norm": 0.18329289555549622, + "learning_rate": 3.146093315705704e-05, + "loss": 1.7206, + "step": 20601 + }, + { + "epoch": 6.323511356660528, + "grad_norm": 0.2393725961446762, + "learning_rate": 3.1456316989350606e-05, + "loss": 1.7646, + "step": 20602 + }, + { + "epoch": 6.323818293431553, + "grad_norm": 0.23535947501659393, + "learning_rate": 3.1451701004903736e-05, + "loss": 1.7718, + "step": 20603 + }, + { + "epoch": 6.324125230202578, + "grad_norm": 0.23179253935813904, + "learning_rate": 3.1447085203762014e-05, + "loss": 1.7311, + "step": 20604 + }, + { + "epoch": 6.324432166973604, + "grad_norm": 0.24929681420326233, + "learning_rate": 3.144246958597109e-05, + "loss": 1.7728, + "step": 20605 + }, + { + "epoch": 6.324739103744629, + "grad_norm": 0.22520960867404938, + "learning_rate": 3.1437854151576526e-05, + "loss": 
1.749, + "step": 20606 + }, + { + "epoch": 6.3250460405156534, + "grad_norm": 0.3005391061306, + "learning_rate": 3.1433238900623997e-05, + "loss": 1.7725, + "step": 20607 + }, + { + "epoch": 6.325352977286679, + "grad_norm": 0.22625432908535004, + "learning_rate": 3.142862383315908e-05, + "loss": 1.7083, + "step": 20608 + }, + { + "epoch": 6.325659914057704, + "grad_norm": 0.28015029430389404, + "learning_rate": 3.142400894922737e-05, + "loss": 1.6862, + "step": 20609 + }, + { + "epoch": 6.3259668508287294, + "grad_norm": 0.2520587146282196, + "learning_rate": 3.141939424887451e-05, + "loss": 1.7059, + "step": 20610 + }, + { + "epoch": 6.326273787599755, + "grad_norm": 0.24668551981449127, + "learning_rate": 3.141477973214607e-05, + "loss": 1.6858, + "step": 20611 + }, + { + "epoch": 6.326580724370779, + "grad_norm": 0.2524704337120056, + "learning_rate": 3.1410165399087675e-05, + "loss": 1.6884, + "step": 20612 + }, + { + "epoch": 6.326887661141805, + "grad_norm": 0.18849264085292816, + "learning_rate": 3.1405551249744916e-05, + "loss": 1.6984, + "step": 20613 + }, + { + "epoch": 6.32719459791283, + "grad_norm": 0.2411552518606186, + "learning_rate": 3.140093728416342e-05, + "loss": 1.7455, + "step": 20614 + }, + { + "epoch": 6.327501534683855, + "grad_norm": 0.2268913835287094, + "learning_rate": 3.139632350238874e-05, + "loss": 1.7124, + "step": 20615 + }, + { + "epoch": 6.327808471454881, + "grad_norm": 0.3118770718574524, + "learning_rate": 3.1391709904466515e-05, + "loss": 1.7322, + "step": 20616 + }, + { + "epoch": 6.328115408225905, + "grad_norm": 0.25166428089141846, + "learning_rate": 3.1387096490442294e-05, + "loss": 1.7136, + "step": 20617 + }, + { + "epoch": 6.32842234499693, + "grad_norm": 0.2733297049999237, + "learning_rate": 3.138248326036172e-05, + "loss": 1.7939, + "step": 20618 + }, + { + "epoch": 6.328729281767956, + "grad_norm": 0.24583236873149872, + "learning_rate": 3.1377870214270334e-05, + "loss": 1.7105, + "step": 20619 + }, + { + 
"epoch": 6.329036218538981, + "grad_norm": 0.2533528506755829, + "learning_rate": 3.137325735221377e-05, + "loss": 1.7828, + "step": 20620 + }, + { + "epoch": 6.329343155310006, + "grad_norm": 0.27662715315818787, + "learning_rate": 3.136864467423758e-05, + "loss": 1.6969, + "step": 20621 + }, + { + "epoch": 6.329650092081032, + "grad_norm": 0.20107655227184296, + "learning_rate": 3.136403218038738e-05, + "loss": 1.6659, + "step": 20622 + }, + { + "epoch": 6.329957028852056, + "grad_norm": 0.21126115322113037, + "learning_rate": 3.135941987070872e-05, + "loss": 1.7372, + "step": 20623 + }, + { + "epoch": 6.3302639656230815, + "grad_norm": 0.1840609908103943, + "learning_rate": 3.1354807745247206e-05, + "loss": 1.7219, + "step": 20624 + }, + { + "epoch": 6.330570902394107, + "grad_norm": 0.23623648285865784, + "learning_rate": 3.135019580404842e-05, + "loss": 1.8059, + "step": 20625 + }, + { + "epoch": 6.330877839165132, + "grad_norm": 0.19853124022483826, + "learning_rate": 3.134558404715792e-05, + "loss": 1.7336, + "step": 20626 + }, + { + "epoch": 6.3311847759361575, + "grad_norm": 0.2261304259300232, + "learning_rate": 3.13409724746213e-05, + "loss": 1.7508, + "step": 20627 + }, + { + "epoch": 6.331491712707182, + "grad_norm": 0.1797952800989151, + "learning_rate": 3.1336361086484104e-05, + "loss": 1.6569, + "step": 20628 + }, + { + "epoch": 6.331798649478207, + "grad_norm": 0.21610359847545624, + "learning_rate": 3.133174988279195e-05, + "loss": 1.7093, + "step": 20629 + }, + { + "epoch": 6.332105586249233, + "grad_norm": 0.1818271279335022, + "learning_rate": 3.1327138863590365e-05, + "loss": 1.6951, + "step": 20630 + }, + { + "epoch": 6.332412523020258, + "grad_norm": 0.20425963401794434, + "learning_rate": 3.1322528028924956e-05, + "loss": 1.7399, + "step": 20631 + }, + { + "epoch": 6.332719459791283, + "grad_norm": 0.20357854664325714, + "learning_rate": 3.131791737884126e-05, + "loss": 1.693, + "step": 20632 + }, + { + "epoch": 6.333026396562309, + 
"grad_norm": 0.25307130813598633, + "learning_rate": 3.1313306913384874e-05, + "loss": 1.674, + "step": 20633 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.21596084535121918, + "learning_rate": 3.130869663260132e-05, + "loss": 1.7521, + "step": 20634 + }, + { + "epoch": 6.333640270104358, + "grad_norm": 0.24110902845859528, + "learning_rate": 3.1304086536536194e-05, + "loss": 1.6723, + "step": 20635 + }, + { + "epoch": 6.333947206875384, + "grad_norm": 0.21365956962108612, + "learning_rate": 3.129947662523503e-05, + "loss": 1.7702, + "step": 20636 + }, + { + "epoch": 6.334254143646409, + "grad_norm": 0.21873877942562103, + "learning_rate": 3.129486689874341e-05, + "loss": 1.7176, + "step": 20637 + }, + { + "epoch": 6.334561080417434, + "grad_norm": 0.2543679475784302, + "learning_rate": 3.129025735710687e-05, + "loss": 1.7733, + "step": 20638 + }, + { + "epoch": 6.334868017188459, + "grad_norm": 0.24591630697250366, + "learning_rate": 3.1285648000370996e-05, + "loss": 1.7212, + "step": 20639 + }, + { + "epoch": 6.335174953959484, + "grad_norm": 0.2453039139509201, + "learning_rate": 3.128103882858129e-05, + "loss": 1.7316, + "step": 20640 + }, + { + "epoch": 6.3354818907305095, + "grad_norm": 0.239897683262825, + "learning_rate": 3.127642984178334e-05, + "loss": 1.7495, + "step": 20641 + }, + { + "epoch": 6.335788827501535, + "grad_norm": 0.20719192922115326, + "learning_rate": 3.12718210400227e-05, + "loss": 1.7242, + "step": 20642 + }, + { + "epoch": 6.33609576427256, + "grad_norm": 0.1813955008983612, + "learning_rate": 3.126721242334487e-05, + "loss": 1.672, + "step": 20643 + }, + { + "epoch": 6.336402701043585, + "grad_norm": 0.20045650005340576, + "learning_rate": 3.126260399179546e-05, + "loss": 1.7854, + "step": 20644 + }, + { + "epoch": 6.33670963781461, + "grad_norm": 0.23010976612567902, + "learning_rate": 3.125799574541995e-05, + "loss": 1.7508, + "step": 20645 + }, + { + "epoch": 6.337016574585635, + "grad_norm": 0.1854519248008728, + 
"learning_rate": 3.1253387684263924e-05, + "loss": 1.7049, + "step": 20646 + }, + { + "epoch": 6.337323511356661, + "grad_norm": 0.2062511295080185, + "learning_rate": 3.1248779808372894e-05, + "loss": 1.6894, + "step": 20647 + }, + { + "epoch": 6.337630448127686, + "grad_norm": 0.19851341843605042, + "learning_rate": 3.124417211779244e-05, + "loss": 1.7332, + "step": 20648 + }, + { + "epoch": 6.337937384898711, + "grad_norm": 0.2099175751209259, + "learning_rate": 3.1239564612568054e-05, + "loss": 1.7577, + "step": 20649 + }, + { + "epoch": 6.338244321669736, + "grad_norm": 0.2152891904115677, + "learning_rate": 3.123495729274529e-05, + "loss": 1.7691, + "step": 20650 + }, + { + "epoch": 6.338551258440761, + "grad_norm": 0.19431835412979126, + "learning_rate": 3.123035015836967e-05, + "loss": 1.7035, + "step": 20651 + }, + { + "epoch": 6.338858195211786, + "grad_norm": 0.20863930881023407, + "learning_rate": 3.122574320948674e-05, + "loss": 1.7166, + "step": 20652 + }, + { + "epoch": 6.339165131982812, + "grad_norm": 0.17948369681835175, + "learning_rate": 3.122113644614201e-05, + "loss": 1.732, + "step": 20653 + }, + { + "epoch": 6.339472068753837, + "grad_norm": 0.2329161912202835, + "learning_rate": 3.121652986838103e-05, + "loss": 1.6934, + "step": 20654 + }, + { + "epoch": 6.3397790055248615, + "grad_norm": 0.23563681542873383, + "learning_rate": 3.12119234762493e-05, + "loss": 1.7329, + "step": 20655 + }, + { + "epoch": 6.340085942295887, + "grad_norm": 0.22654885053634644, + "learning_rate": 3.120731726979236e-05, + "loss": 1.767, + "step": 20656 + }, + { + "epoch": 6.340392879066912, + "grad_norm": 0.2507181465625763, + "learning_rate": 3.1202711249055715e-05, + "loss": 1.7071, + "step": 20657 + }, + { + "epoch": 6.3406998158379375, + "grad_norm": 0.20573864877223969, + "learning_rate": 3.1198105414084906e-05, + "loss": 1.7566, + "step": 20658 + }, + { + "epoch": 6.341006752608963, + "grad_norm": 0.23311644792556763, + "learning_rate": 
3.119349976492545e-05, + "loss": 1.6778, + "step": 20659 + }, + { + "epoch": 6.341313689379987, + "grad_norm": 0.18166053295135498, + "learning_rate": 3.118889430162283e-05, + "loss": 1.7109, + "step": 20660 + }, + { + "epoch": 6.341620626151013, + "grad_norm": 0.21054090559482574, + "learning_rate": 3.11842890242226e-05, + "loss": 1.7255, + "step": 20661 + }, + { + "epoch": 6.341927562922038, + "grad_norm": 0.19898973405361176, + "learning_rate": 3.1179683932770235e-05, + "loss": 1.7017, + "step": 20662 + }, + { + "epoch": 6.342234499693063, + "grad_norm": 0.17782434821128845, + "learning_rate": 3.117507902731127e-05, + "loss": 1.6858, + "step": 20663 + }, + { + "epoch": 6.342541436464089, + "grad_norm": 0.19286927580833435, + "learning_rate": 3.117047430789121e-05, + "loss": 1.707, + "step": 20664 + }, + { + "epoch": 6.342848373235114, + "grad_norm": 0.18578651547431946, + "learning_rate": 3.1165869774555565e-05, + "loss": 1.7331, + "step": 20665 + }, + { + "epoch": 6.343155310006138, + "grad_norm": 0.19728249311447144, + "learning_rate": 3.1161265427349826e-05, + "loss": 1.7165, + "step": 20666 + }, + { + "epoch": 6.343462246777164, + "grad_norm": 0.18240176141262054, + "learning_rate": 3.115666126631952e-05, + "loss": 1.7167, + "step": 20667 + }, + { + "epoch": 6.343769183548189, + "grad_norm": 0.1928495317697525, + "learning_rate": 3.115205729151011e-05, + "loss": 1.7431, + "step": 20668 + }, + { + "epoch": 6.344076120319214, + "grad_norm": 0.19459952414035797, + "learning_rate": 3.1147453502967125e-05, + "loss": 1.7294, + "step": 20669 + }, + { + "epoch": 6.34438305709024, + "grad_norm": 0.18829894065856934, + "learning_rate": 3.1142849900736046e-05, + "loss": 1.7512, + "step": 20670 + }, + { + "epoch": 6.344689993861264, + "grad_norm": 0.19678451120853424, + "learning_rate": 3.11382464848624e-05, + "loss": 1.673, + "step": 20671 + }, + { + "epoch": 6.3449969306322895, + "grad_norm": 0.22256550192832947, + "learning_rate": 3.1133643255391635e-05, + "loss": 
1.7044, + "step": 20672 + }, + { + "epoch": 6.345303867403315, + "grad_norm": 0.24741628766059875, + "learning_rate": 3.112904021236929e-05, + "loss": 1.7904, + "step": 20673 + }, + { + "epoch": 6.34561080417434, + "grad_norm": 0.20286159217357635, + "learning_rate": 3.11244373558408e-05, + "loss": 1.6976, + "step": 20674 + }, + { + "epoch": 6.3459177409453655, + "grad_norm": 0.2005387842655182, + "learning_rate": 3.11198346858517e-05, + "loss": 1.7083, + "step": 20675 + }, + { + "epoch": 6.346224677716391, + "grad_norm": 0.22312256693840027, + "learning_rate": 3.111523220244747e-05, + "loss": 1.7575, + "step": 20676 + }, + { + "epoch": 6.346531614487415, + "grad_norm": 0.2968841791152954, + "learning_rate": 3.111062990567356e-05, + "loss": 1.7813, + "step": 20677 + }, + { + "epoch": 6.346838551258441, + "grad_norm": 0.22900697588920593, + "learning_rate": 3.1106027795575496e-05, + "loss": 1.6818, + "step": 20678 + }, + { + "epoch": 6.347145488029466, + "grad_norm": 0.1912240833044052, + "learning_rate": 3.110142587219873e-05, + "loss": 1.7174, + "step": 20679 + }, + { + "epoch": 6.347452424800491, + "grad_norm": 0.20461280643939972, + "learning_rate": 3.1096824135588754e-05, + "loss": 1.6945, + "step": 20680 + }, + { + "epoch": 6.347759361571517, + "grad_norm": 0.19344913959503174, + "learning_rate": 3.109222258579103e-05, + "loss": 1.7064, + "step": 20681 + }, + { + "epoch": 6.348066298342541, + "grad_norm": 0.1833983063697815, + "learning_rate": 3.108762122285106e-05, + "loss": 1.702, + "step": 20682 + }, + { + "epoch": 6.348373235113566, + "grad_norm": 0.20344893634319305, + "learning_rate": 3.108302004681429e-05, + "loss": 1.7323, + "step": 20683 + }, + { + "epoch": 6.348680171884592, + "grad_norm": 0.18629617989063263, + "learning_rate": 3.107841905772622e-05, + "loss": 1.6841, + "step": 20684 + }, + { + "epoch": 6.348987108655617, + "grad_norm": 0.19279471039772034, + "learning_rate": 3.107381825563228e-05, + "loss": 1.7581, + "step": 20685 + }, + { + 
"epoch": 6.349294045426642, + "grad_norm": 0.21727058291435242, + "learning_rate": 3.106921764057798e-05, + "loss": 1.7231, + "step": 20686 + }, + { + "epoch": 6.349600982197667, + "grad_norm": 0.20952723920345306, + "learning_rate": 3.1064617212608747e-05, + "loss": 1.713, + "step": 20687 + }, + { + "epoch": 6.349907918968692, + "grad_norm": 0.2358582466840744, + "learning_rate": 3.10600169717701e-05, + "loss": 1.7291, + "step": 20688 + }, + { + "epoch": 6.350214855739718, + "grad_norm": 0.21846619248390198, + "learning_rate": 3.105541691810743e-05, + "loss": 1.7365, + "step": 20689 + }, + { + "epoch": 6.350521792510743, + "grad_norm": 0.22137843072414398, + "learning_rate": 3.1050817051666256e-05, + "loss": 1.7404, + "step": 20690 + }, + { + "epoch": 6.350828729281768, + "grad_norm": 0.2301674485206604, + "learning_rate": 3.1046217372492e-05, + "loss": 1.7422, + "step": 20691 + }, + { + "epoch": 6.351135666052793, + "grad_norm": 0.18955166637897491, + "learning_rate": 3.104161788063015e-05, + "loss": 1.7063, + "step": 20692 + }, + { + "epoch": 6.351442602823818, + "grad_norm": 0.21172095835208893, + "learning_rate": 3.103701857612614e-05, + "loss": 1.6856, + "step": 20693 + }, + { + "epoch": 6.351749539594843, + "grad_norm": 0.20921260118484497, + "learning_rate": 3.103241945902541e-05, + "loss": 1.7384, + "step": 20694 + }, + { + "epoch": 6.352056476365869, + "grad_norm": 0.21005603671073914, + "learning_rate": 3.102782052937345e-05, + "loss": 1.7118, + "step": 20695 + }, + { + "epoch": 6.352363413136894, + "grad_norm": 0.20888659358024597, + "learning_rate": 3.102322178721567e-05, + "loss": 1.7172, + "step": 20696 + }, + { + "epoch": 6.352670349907919, + "grad_norm": 0.194463849067688, + "learning_rate": 3.101862323259754e-05, + "loss": 1.6909, + "step": 20697 + }, + { + "epoch": 6.352977286678944, + "grad_norm": 0.20848685503005981, + "learning_rate": 3.1014024865564494e-05, + "loss": 1.7846, + "step": 20698 + }, + { + "epoch": 6.353284223449969, + 
"grad_norm": 0.18669761717319489, + "learning_rate": 3.100942668616201e-05, + "loss": 1.7542, + "step": 20699 + }, + { + "epoch": 6.3535911602209945, + "grad_norm": 0.23618464171886444, + "learning_rate": 3.100482869443547e-05, + "loss": 1.7292, + "step": 20700 + }, + { + "epoch": 6.35389809699202, + "grad_norm": 0.19389905035495758, + "learning_rate": 3.100023089043037e-05, + "loss": 1.6847, + "step": 20701 + }, + { + "epoch": 6.354205033763045, + "grad_norm": 0.20346343517303467, + "learning_rate": 3.09956332741921e-05, + "loss": 1.7096, + "step": 20702 + }, + { + "epoch": 6.35451197053407, + "grad_norm": 0.20825842022895813, + "learning_rate": 3.099103584576614e-05, + "loss": 1.6974, + "step": 20703 + }, + { + "epoch": 6.354818907305095, + "grad_norm": 0.2093508094549179, + "learning_rate": 3.0986438605197895e-05, + "loss": 1.6849, + "step": 20704 + }, + { + "epoch": 6.35512584407612, + "grad_norm": 0.2576633393764496, + "learning_rate": 3.098184155253282e-05, + "loss": 1.7974, + "step": 20705 + }, + { + "epoch": 6.355432780847146, + "grad_norm": 0.18197253346443176, + "learning_rate": 3.097724468781632e-05, + "loss": 1.6723, + "step": 20706 + }, + { + "epoch": 6.355739717618171, + "grad_norm": 0.24809512495994568, + "learning_rate": 3.0972648011093855e-05, + "loss": 1.7378, + "step": 20707 + }, + { + "epoch": 6.356046654389196, + "grad_norm": 0.2046923190355301, + "learning_rate": 3.0968051522410814e-05, + "loss": 1.7502, + "step": 20708 + }, + { + "epoch": 6.356353591160221, + "grad_norm": 0.20443019270896912, + "learning_rate": 3.096345522181265e-05, + "loss": 1.7179, + "step": 20709 + }, + { + "epoch": 6.356660527931246, + "grad_norm": 0.1906277984380722, + "learning_rate": 3.09588591093448e-05, + "loss": 1.7167, + "step": 20710 + }, + { + "epoch": 6.356967464702271, + "grad_norm": 0.20729197561740875, + "learning_rate": 3.095426318505263e-05, + "loss": 1.7193, + "step": 20711 + }, + { + "epoch": 6.357274401473297, + "grad_norm": 0.23446644842624664, + 
"learning_rate": 3.094966744898162e-05, + "loss": 1.7341, + "step": 20712 + }, + { + "epoch": 6.357581338244322, + "grad_norm": 0.18882590532302856, + "learning_rate": 3.094507190117715e-05, + "loss": 1.7001, + "step": 20713 + }, + { + "epoch": 6.3578882750153465, + "grad_norm": 0.27240705490112305, + "learning_rate": 3.094047654168465e-05, + "loss": 1.7641, + "step": 20714 + }, + { + "epoch": 6.358195211786372, + "grad_norm": 0.19616954028606415, + "learning_rate": 3.093588137054952e-05, + "loss": 1.751, + "step": 20715 + }, + { + "epoch": 6.358502148557397, + "grad_norm": 0.23402562737464905, + "learning_rate": 3.093128638781721e-05, + "loss": 1.7274, + "step": 20716 + }, + { + "epoch": 6.3588090853284225, + "grad_norm": 0.18189528584480286, + "learning_rate": 3.092669159353309e-05, + "loss": 1.7079, + "step": 20717 + }, + { + "epoch": 6.359116022099448, + "grad_norm": 0.21583771705627441, + "learning_rate": 3.092209698774259e-05, + "loss": 1.6811, + "step": 20718 + }, + { + "epoch": 6.359422958870473, + "grad_norm": 0.2477681040763855, + "learning_rate": 3.091750257049109e-05, + "loss": 1.6963, + "step": 20719 + }, + { + "epoch": 6.359729895641498, + "grad_norm": 0.2883109152317047, + "learning_rate": 3.091290834182403e-05, + "loss": 1.8349, + "step": 20720 + }, + { + "epoch": 6.360036832412523, + "grad_norm": 0.23407170176506042, + "learning_rate": 3.09083143017868e-05, + "loss": 1.7271, + "step": 20721 + }, + { + "epoch": 6.360343769183548, + "grad_norm": 0.2818833589553833, + "learning_rate": 3.090372045042479e-05, + "loss": 1.7852, + "step": 20722 + }, + { + "epoch": 6.360650705954574, + "grad_norm": 0.24415317177772522, + "learning_rate": 3.089912678778341e-05, + "loss": 1.6826, + "step": 20723 + }, + { + "epoch": 6.360957642725599, + "grad_norm": 0.26786303520202637, + "learning_rate": 3.0894533313908056e-05, + "loss": 1.7616, + "step": 20724 + }, + { + "epoch": 6.361264579496623, + "grad_norm": 0.3235633969306946, + "learning_rate": 3.088994002884411e-05, 
+ "loss": 1.7637, + "step": 20725 + }, + { + "epoch": 6.361571516267649, + "grad_norm": 0.18675416707992554, + "learning_rate": 3.0885346932637e-05, + "loss": 1.7037, + "step": 20726 + }, + { + "epoch": 6.361878453038674, + "grad_norm": 0.295802503824234, + "learning_rate": 3.0880754025332084e-05, + "loss": 1.7435, + "step": 20727 + }, + { + "epoch": 6.362185389809699, + "grad_norm": 0.18665561079978943, + "learning_rate": 3.0876161306974756e-05, + "loss": 1.684, + "step": 20728 + }, + { + "epoch": 6.362492326580725, + "grad_norm": 0.2530463635921478, + "learning_rate": 3.087156877761043e-05, + "loss": 1.7934, + "step": 20729 + }, + { + "epoch": 6.362799263351749, + "grad_norm": 0.17860126495361328, + "learning_rate": 3.086697643728445e-05, + "loss": 1.6977, + "step": 20730 + }, + { + "epoch": 6.3631062001227745, + "grad_norm": 0.20118845999240875, + "learning_rate": 3.086238428604223e-05, + "loss": 1.7241, + "step": 20731 + }, + { + "epoch": 6.3634131368938, + "grad_norm": 0.18811924755573273, + "learning_rate": 3.085779232392915e-05, + "loss": 1.6918, + "step": 20732 + }, + { + "epoch": 6.363720073664825, + "grad_norm": 0.1841908097267151, + "learning_rate": 3.085320055099058e-05, + "loss": 1.735, + "step": 20733 + }, + { + "epoch": 6.3640270104358505, + "grad_norm": 0.1956033855676651, + "learning_rate": 3.08486089672719e-05, + "loss": 1.7203, + "step": 20734 + }, + { + "epoch": 6.364333947206875, + "grad_norm": 0.19844500720500946, + "learning_rate": 3.084401757281851e-05, + "loss": 1.6767, + "step": 20735 + }, + { + "epoch": 6.3646408839779, + "grad_norm": 0.2018919438123703, + "learning_rate": 3.083942636767575e-05, + "loss": 1.6912, + "step": 20736 + }, + { + "epoch": 6.364947820748926, + "grad_norm": 0.18929271399974823, + "learning_rate": 3.083483535188901e-05, + "loss": 1.6838, + "step": 20737 + }, + { + "epoch": 6.365254757519951, + "grad_norm": 0.19833499193191528, + "learning_rate": 3.0830244525503674e-05, + "loss": 1.7139, + "step": 20738 + }, + { + 
"epoch": 6.365561694290976, + "grad_norm": 0.17029902338981628, + "learning_rate": 3.082565388856509e-05, + "loss": 1.6665, + "step": 20739 + }, + { + "epoch": 6.365868631062002, + "grad_norm": 0.19526802003383636, + "learning_rate": 3.082106344111861e-05, + "loss": 1.7021, + "step": 20740 + }, + { + "epoch": 6.366175567833026, + "grad_norm": 0.19061279296875, + "learning_rate": 3.081647318320966e-05, + "loss": 1.7134, + "step": 20741 + }, + { + "epoch": 6.366482504604051, + "grad_norm": 0.17782293260097504, + "learning_rate": 3.081188311488354e-05, + "loss": 1.741, + "step": 20742 + }, + { + "epoch": 6.366789441375077, + "grad_norm": 0.20002372562885284, + "learning_rate": 3.080729323618565e-05, + "loss": 1.6943, + "step": 20743 + }, + { + "epoch": 6.367096378146102, + "grad_norm": 0.22873486578464508, + "learning_rate": 3.080270354716134e-05, + "loss": 1.7223, + "step": 20744 + }, + { + "epoch": 6.367403314917127, + "grad_norm": 0.191136434674263, + "learning_rate": 3.079811404785595e-05, + "loss": 1.6774, + "step": 20745 + }, + { + "epoch": 6.367710251688152, + "grad_norm": 0.20446795225143433, + "learning_rate": 3.0793524738314874e-05, + "loss": 1.7443, + "step": 20746 + }, + { + "epoch": 6.368017188459177, + "grad_norm": 0.20668596029281616, + "learning_rate": 3.078893561858341e-05, + "loss": 1.7553, + "step": 20747 + }, + { + "epoch": 6.3683241252302025, + "grad_norm": 0.18445394933223724, + "learning_rate": 3.078434668870698e-05, + "loss": 1.7365, + "step": 20748 + }, + { + "epoch": 6.368631062001228, + "grad_norm": 0.1824318915605545, + "learning_rate": 3.077975794873088e-05, + "loss": 1.7248, + "step": 20749 + }, + { + "epoch": 6.368937998772253, + "grad_norm": 0.18452249467372894, + "learning_rate": 3.077516939870047e-05, + "loss": 1.7095, + "step": 20750 + }, + { + "epoch": 6.3692449355432785, + "grad_norm": 0.17254458367824554, + "learning_rate": 3.077058103866112e-05, + "loss": 1.6937, + "step": 20751 + }, + { + "epoch": 6.369551872314303, + 
"grad_norm": 0.2022976130247116, + "learning_rate": 3.0765992868658154e-05, + "loss": 1.7593, + "step": 20752 + }, + { + "epoch": 6.369858809085328, + "grad_norm": 0.19274397194385529, + "learning_rate": 3.076140488873691e-05, + "loss": 1.7288, + "step": 20753 + }, + { + "epoch": 6.370165745856354, + "grad_norm": 0.18847523629665375, + "learning_rate": 3.075681709894276e-05, + "loss": 1.7293, + "step": 20754 + }, + { + "epoch": 6.370472682627379, + "grad_norm": 0.21054589748382568, + "learning_rate": 3.075222949932101e-05, + "loss": 1.7688, + "step": 20755 + }, + { + "epoch": 6.370779619398404, + "grad_norm": 0.16934558749198914, + "learning_rate": 3.0747642089917005e-05, + "loss": 1.7092, + "step": 20756 + }, + { + "epoch": 6.371086556169429, + "grad_norm": 0.19154684245586395, + "learning_rate": 3.0743054870776075e-05, + "loss": 1.6827, + "step": 20757 + }, + { + "epoch": 6.371393492940454, + "grad_norm": 0.2622900605201721, + "learning_rate": 3.0738467841943594e-05, + "loss": 1.748, + "step": 20758 + }, + { + "epoch": 6.371700429711479, + "grad_norm": 0.1767888218164444, + "learning_rate": 3.073388100346484e-05, + "loss": 1.717, + "step": 20759 + }, + { + "epoch": 6.372007366482505, + "grad_norm": 0.21692602336406708, + "learning_rate": 3.072929435538518e-05, + "loss": 1.7543, + "step": 20760 + }, + { + "epoch": 6.37231430325353, + "grad_norm": 0.19853977859020233, + "learning_rate": 3.0724707897749926e-05, + "loss": 1.7599, + "step": 20761 + }, + { + "epoch": 6.3726212400245545, + "grad_norm": 0.1904703676700592, + "learning_rate": 3.0720121630604396e-05, + "loss": 1.7094, + "step": 20762 + }, + { + "epoch": 6.37292817679558, + "grad_norm": 0.1961483359336853, + "learning_rate": 3.071553555399395e-05, + "loss": 1.7363, + "step": 20763 + }, + { + "epoch": 6.373235113566605, + "grad_norm": 0.16419392824172974, + "learning_rate": 3.071094966796385e-05, + "loss": 1.7073, + "step": 20764 + }, + { + "epoch": 6.3735420503376305, + "grad_norm": 0.1784946471452713, + 
"learning_rate": 3.0706363972559476e-05, + "loss": 1.699, + "step": 20765 + }, + { + "epoch": 6.373848987108656, + "grad_norm": 0.19472888112068176, + "learning_rate": 3.070177846782611e-05, + "loss": 1.7541, + "step": 20766 + }, + { + "epoch": 6.37415592387968, + "grad_norm": 0.2355004847049713, + "learning_rate": 3.0697193153809076e-05, + "loss": 1.7389, + "step": 20767 + }, + { + "epoch": 6.374462860650706, + "grad_norm": 0.1956906020641327, + "learning_rate": 3.069260803055369e-05, + "loss": 1.7197, + "step": 20768 + }, + { + "epoch": 6.374769797421731, + "grad_norm": 0.21212655305862427, + "learning_rate": 3.068802309810529e-05, + "loss": 1.7291, + "step": 20769 + }, + { + "epoch": 6.375076734192756, + "grad_norm": 0.22920182347297668, + "learning_rate": 3.068343835650914e-05, + "loss": 1.7397, + "step": 20770 + }, + { + "epoch": 6.375383670963782, + "grad_norm": 0.2143404483795166, + "learning_rate": 3.0678853805810605e-05, + "loss": 1.76, + "step": 20771 + }, + { + "epoch": 6.375690607734807, + "grad_norm": 0.1848321557044983, + "learning_rate": 3.067426944605492e-05, + "loss": 1.7127, + "step": 20772 + }, + { + "epoch": 6.3759975445058314, + "grad_norm": 0.23339331150054932, + "learning_rate": 3.0669685277287465e-05, + "loss": 1.7828, + "step": 20773 + }, + { + "epoch": 6.376304481276857, + "grad_norm": 0.19590741395950317, + "learning_rate": 3.066510129955349e-05, + "loss": 1.7224, + "step": 20774 + }, + { + "epoch": 6.376611418047882, + "grad_norm": 0.19986604154109955, + "learning_rate": 3.066051751289833e-05, + "loss": 1.7412, + "step": 20775 + }, + { + "epoch": 6.3769183548189075, + "grad_norm": 0.18629087507724762, + "learning_rate": 3.0655933917367266e-05, + "loss": 1.695, + "step": 20776 + }, + { + "epoch": 6.377225291589933, + "grad_norm": 0.2248111218214035, + "learning_rate": 3.0651350513005605e-05, + "loss": 1.7685, + "step": 20777 + }, + { + "epoch": 6.377532228360957, + "grad_norm": 0.1803683638572693, + "learning_rate": 3.064676729985864e-05, 
+ "loss": 1.7206, + "step": 20778 + }, + { + "epoch": 6.377839165131983, + "grad_norm": 0.23836754262447357, + "learning_rate": 3.064218427797165e-05, + "loss": 1.7428, + "step": 20779 + }, + { + "epoch": 6.378146101903008, + "grad_norm": 0.22549279034137726, + "learning_rate": 3.063760144738996e-05, + "loss": 1.7314, + "step": 20780 + }, + { + "epoch": 6.378453038674033, + "grad_norm": 0.20714345574378967, + "learning_rate": 3.063301880815882e-05, + "loss": 1.7179, + "step": 20781 + }, + { + "epoch": 6.378759975445059, + "grad_norm": 0.17024052143096924, + "learning_rate": 3.0628436360323565e-05, + "loss": 1.6602, + "step": 20782 + }, + { + "epoch": 6.379066912216084, + "grad_norm": 0.20378601551055908, + "learning_rate": 3.062385410392943e-05, + "loss": 1.7708, + "step": 20783 + }, + { + "epoch": 6.379373848987108, + "grad_norm": 0.1885673850774765, + "learning_rate": 3.0619272039021734e-05, + "loss": 1.7034, + "step": 20784 + }, + { + "epoch": 6.379680785758134, + "grad_norm": 0.18746556341648102, + "learning_rate": 3.0614690165645746e-05, + "loss": 1.6946, + "step": 20785 + }, + { + "epoch": 6.379987722529159, + "grad_norm": 0.19569392502307892, + "learning_rate": 3.061010848384677e-05, + "loss": 1.7298, + "step": 20786 + }, + { + "epoch": 6.380294659300184, + "grad_norm": 0.21114139258861542, + "learning_rate": 3.0605526993670046e-05, + "loss": 1.795, + "step": 20787 + }, + { + "epoch": 6.38060159607121, + "grad_norm": 0.20940302312374115, + "learning_rate": 3.06009456951609e-05, + "loss": 1.6747, + "step": 20788 + }, + { + "epoch": 6.380908532842234, + "grad_norm": 0.21008993685245514, + "learning_rate": 3.059636458836455e-05, + "loss": 1.7219, + "step": 20789 + }, + { + "epoch": 6.3812154696132595, + "grad_norm": 0.17642457783222198, + "learning_rate": 3.0591783673326304e-05, + "loss": 1.6555, + "step": 20790 + }, + { + "epoch": 6.381522406384285, + "grad_norm": 0.2786177396774292, + "learning_rate": 3.058720295009143e-05, + "loss": 1.8463, + "step": 20791 + 
}, + { + "epoch": 6.38182934315531, + "grad_norm": 0.21209503710269928, + "learning_rate": 3.058262241870521e-05, + "loss": 1.6848, + "step": 20792 + }, + { + "epoch": 6.3821362799263355, + "grad_norm": 0.1880561262369156, + "learning_rate": 3.057804207921287e-05, + "loss": 1.7401, + "step": 20793 + }, + { + "epoch": 6.382443216697361, + "grad_norm": 0.22108516097068787, + "learning_rate": 3.0573461931659726e-05, + "loss": 1.7482, + "step": 20794 + }, + { + "epoch": 6.382750153468385, + "grad_norm": 0.2161533385515213, + "learning_rate": 3.0568881976091006e-05, + "loss": 1.7425, + "step": 20795 + }, + { + "epoch": 6.383057090239411, + "grad_norm": 0.22933612763881683, + "learning_rate": 3.0564302212551975e-05, + "loss": 1.7424, + "step": 20796 + }, + { + "epoch": 6.383364027010436, + "grad_norm": 0.19572989642620087, + "learning_rate": 3.0559722641087916e-05, + "loss": 1.6763, + "step": 20797 + }, + { + "epoch": 6.383670963781461, + "grad_norm": 0.2181084007024765, + "learning_rate": 3.0555143261744056e-05, + "loss": 1.7164, + "step": 20798 + }, + { + "epoch": 6.383977900552487, + "grad_norm": 0.1927991509437561, + "learning_rate": 3.055056407456569e-05, + "loss": 1.6833, + "step": 20799 + }, + { + "epoch": 6.384284837323511, + "grad_norm": 0.20569704473018646, + "learning_rate": 3.0545985079598025e-05, + "loss": 1.7716, + "step": 20800 + }, + { + "epoch": 6.384591774094536, + "grad_norm": 0.1856541931629181, + "learning_rate": 3.054140627688635e-05, + "loss": 1.6939, + "step": 20801 + }, + { + "epoch": 6.384898710865562, + "grad_norm": 0.2450970858335495, + "learning_rate": 3.05368276664759e-05, + "loss": 1.8197, + "step": 20802 + }, + { + "epoch": 6.385205647636587, + "grad_norm": 0.23325784504413605, + "learning_rate": 3.053224924841194e-05, + "loss": 1.7195, + "step": 20803 + }, + { + "epoch": 6.385512584407612, + "grad_norm": 0.19614358246326447, + "learning_rate": 3.052767102273968e-05, + "loss": 1.6966, + "step": 20804 + }, + { + "epoch": 6.385819521178637, 
+ "grad_norm": 0.20615628361701965, + "learning_rate": 3.0523092989504415e-05, + "loss": 1.7429, + "step": 20805 + }, + { + "epoch": 6.386126457949662, + "grad_norm": 0.18418943881988525, + "learning_rate": 3.0518515148751336e-05, + "loss": 1.7612, + "step": 20806 + }, + { + "epoch": 6.3864333947206875, + "grad_norm": 0.17176245152950287, + "learning_rate": 3.0513937500525725e-05, + "loss": 1.6918, + "step": 20807 + }, + { + "epoch": 6.386740331491713, + "grad_norm": 0.22239255905151367, + "learning_rate": 3.0509360044872787e-05, + "loss": 1.8072, + "step": 20808 + }, + { + "epoch": 6.387047268262738, + "grad_norm": 0.20312704145908356, + "learning_rate": 3.0504782781837798e-05, + "loss": 1.7348, + "step": 20809 + }, + { + "epoch": 6.387354205033763, + "grad_norm": 0.23198208212852478, + "learning_rate": 3.0500205711465958e-05, + "loss": 1.7516, + "step": 20810 + }, + { + "epoch": 6.387661141804788, + "grad_norm": 0.2244081050157547, + "learning_rate": 3.0495628833802526e-05, + "loss": 1.731, + "step": 20811 + }, + { + "epoch": 6.387968078575813, + "grad_norm": 0.18282169103622437, + "learning_rate": 3.0491052148892717e-05, + "loss": 1.6743, + "step": 20812 + }, + { + "epoch": 6.388275015346839, + "grad_norm": 0.19108405709266663, + "learning_rate": 3.0486475656781753e-05, + "loss": 1.7485, + "step": 20813 + }, + { + "epoch": 6.388581952117864, + "grad_norm": 0.20574834942817688, + "learning_rate": 3.0481899357514898e-05, + "loss": 1.6979, + "step": 20814 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 0.21263298392295837, + "learning_rate": 3.047732325113733e-05, + "loss": 1.687, + "step": 20815 + }, + { + "epoch": 6.389195825659914, + "grad_norm": 0.22646664083003998, + "learning_rate": 3.047274733769432e-05, + "loss": 1.7593, + "step": 20816 + }, + { + "epoch": 6.389502762430939, + "grad_norm": 0.1846906542778015, + "learning_rate": 3.046817161723104e-05, + "loss": 1.7271, + "step": 20817 + }, + { + "epoch": 6.389809699201964, + "grad_norm": 
0.1965247541666031, + "learning_rate": 3.0463596089792746e-05, + "loss": 1.7121, + "step": 20818 + }, + { + "epoch": 6.39011663597299, + "grad_norm": 0.255577951669693, + "learning_rate": 3.045902075542464e-05, + "loss": 1.7311, + "step": 20819 + }, + { + "epoch": 6.390423572744015, + "grad_norm": 0.1837676465511322, + "learning_rate": 3.0454445614171966e-05, + "loss": 1.7177, + "step": 20820 + }, + { + "epoch": 6.3907305095150395, + "grad_norm": 0.24845893681049347, + "learning_rate": 3.0449870666079895e-05, + "loss": 1.6902, + "step": 20821 + }, + { + "epoch": 6.391037446286065, + "grad_norm": 0.28572577238082886, + "learning_rate": 3.0445295911193678e-05, + "loss": 1.7942, + "step": 20822 + }, + { + "epoch": 6.39134438305709, + "grad_norm": 0.20460839569568634, + "learning_rate": 3.044072134955849e-05, + "loss": 1.6747, + "step": 20823 + }, + { + "epoch": 6.3916513198281155, + "grad_norm": 0.3547010123729706, + "learning_rate": 3.0436146981219565e-05, + "loss": 1.7359, + "step": 20824 + }, + { + "epoch": 6.391958256599141, + "grad_norm": 0.20490451157093048, + "learning_rate": 3.04315728062221e-05, + "loss": 1.6863, + "step": 20825 + }, + { + "epoch": 6.392265193370166, + "grad_norm": 0.25874415040016174, + "learning_rate": 3.0426998824611307e-05, + "loss": 1.6798, + "step": 20826 + }, + { + "epoch": 6.392572130141191, + "grad_norm": 0.27858632802963257, + "learning_rate": 3.0422425036432378e-05, + "loss": 1.6943, + "step": 20827 + }, + { + "epoch": 6.392879066912216, + "grad_norm": 0.20951922237873077, + "learning_rate": 3.041785144173054e-05, + "loss": 1.7025, + "step": 20828 + }, + { + "epoch": 6.393186003683241, + "grad_norm": 0.3158397674560547, + "learning_rate": 3.0413278040550952e-05, + "loss": 1.7193, + "step": 20829 + }, + { + "epoch": 6.393492940454267, + "grad_norm": 0.18556484580039978, + "learning_rate": 3.0408704832938824e-05, + "loss": 1.7017, + "step": 20830 + }, + { + "epoch": 6.393799877225292, + "grad_norm": 0.31651169061660767, + 
"learning_rate": 3.0404131818939376e-05, + "loss": 1.7716, + "step": 20831 + }, + { + "epoch": 6.394106813996316, + "grad_norm": 0.2850388288497925, + "learning_rate": 3.0399558998597765e-05, + "loss": 1.7144, + "step": 20832 + }, + { + "epoch": 6.394413750767342, + "grad_norm": 0.19256308674812317, + "learning_rate": 3.0394986371959223e-05, + "loss": 1.6603, + "step": 20833 + }, + { + "epoch": 6.394720687538367, + "grad_norm": 0.2654922604560852, + "learning_rate": 3.0390413939068896e-05, + "loss": 1.6825, + "step": 20834 + }, + { + "epoch": 6.395027624309392, + "grad_norm": 0.19514231383800507, + "learning_rate": 3.0385841699971997e-05, + "loss": 1.7226, + "step": 20835 + }, + { + "epoch": 6.395334561080418, + "grad_norm": 0.27765151858329773, + "learning_rate": 3.0381269654713702e-05, + "loss": 1.7599, + "step": 20836 + }, + { + "epoch": 6.395641497851442, + "grad_norm": 0.2056504338979721, + "learning_rate": 3.0376697803339215e-05, + "loss": 1.7237, + "step": 20837 + }, + { + "epoch": 6.3959484346224675, + "grad_norm": 0.22516649961471558, + "learning_rate": 3.0372126145893688e-05, + "loss": 1.7566, + "step": 20838 + }, + { + "epoch": 6.396255371393493, + "grad_norm": 0.17632099986076355, + "learning_rate": 3.0367554682422327e-05, + "loss": 1.7014, + "step": 20839 + }, + { + "epoch": 6.396562308164518, + "grad_norm": 0.21872831881046295, + "learning_rate": 3.036298341297028e-05, + "loss": 1.6935, + "step": 20840 + }, + { + "epoch": 6.3968692449355435, + "grad_norm": 0.22132672369480133, + "learning_rate": 3.0358412337582752e-05, + "loss": 1.6735, + "step": 20841 + }, + { + "epoch": 6.397176181706568, + "grad_norm": 0.17865684628486633, + "learning_rate": 3.0353841456304895e-05, + "loss": 1.7097, + "step": 20842 + }, + { + "epoch": 6.397483118477593, + "grad_norm": 0.2069701999425888, + "learning_rate": 3.0349270769181914e-05, + "loss": 1.7592, + "step": 20843 + }, + { + "epoch": 6.397790055248619, + "grad_norm": 0.19800925254821777, + "learning_rate": 
3.034470027625893e-05, + "loss": 1.6943, + "step": 20844 + }, + { + "epoch": 6.398096992019644, + "grad_norm": 0.24116787314414978, + "learning_rate": 3.0340129977581165e-05, + "loss": 1.7126, + "step": 20845 + }, + { + "epoch": 6.398403928790669, + "grad_norm": 0.1995212435722351, + "learning_rate": 3.033555987319375e-05, + "loss": 1.75, + "step": 20846 + }, + { + "epoch": 6.398710865561695, + "grad_norm": 0.23717111349105835, + "learning_rate": 3.0330989963141843e-05, + "loss": 1.7338, + "step": 20847 + }, + { + "epoch": 6.399017802332719, + "grad_norm": 0.18372474610805511, + "learning_rate": 3.0326420247470643e-05, + "loss": 1.7034, + "step": 20848 + }, + { + "epoch": 6.399324739103744, + "grad_norm": 0.25953924655914307, + "learning_rate": 3.0321850726225265e-05, + "loss": 1.731, + "step": 20849 + }, + { + "epoch": 6.39963167587477, + "grad_norm": 0.24846702814102173, + "learning_rate": 3.031728139945092e-05, + "loss": 1.7559, + "step": 20850 + }, + { + "epoch": 6.399938612645795, + "grad_norm": 0.20783887803554535, + "learning_rate": 3.0312712267192713e-05, + "loss": 1.7229, + "step": 20851 + }, + { + "epoch": 6.4002455494168204, + "grad_norm": 0.1904737949371338, + "learning_rate": 3.030814332949583e-05, + "loss": 1.6986, + "step": 20852 + }, + { + "epoch": 6.400552486187845, + "grad_norm": 0.2275397777557373, + "learning_rate": 3.030357458640541e-05, + "loss": 1.708, + "step": 20853 + }, + { + "epoch": 6.40085942295887, + "grad_norm": 0.20119737088680267, + "learning_rate": 3.0299006037966628e-05, + "loss": 1.7727, + "step": 20854 + }, + { + "epoch": 6.401166359729896, + "grad_norm": 0.17214249074459076, + "learning_rate": 3.0294437684224596e-05, + "loss": 1.6674, + "step": 20855 + }, + { + "epoch": 6.401473296500921, + "grad_norm": 0.21268978714942932, + "learning_rate": 3.02898695252245e-05, + "loss": 1.7182, + "step": 20856 + }, + { + "epoch": 6.401780233271946, + "grad_norm": 0.19911682605743408, + "learning_rate": 3.0285301561011448e-05, + "loss": 
1.6861, + "step": 20857 + }, + { + "epoch": 6.402087170042972, + "grad_norm": 0.194064199924469, + "learning_rate": 3.0280733791630613e-05, + "loss": 1.6768, + "step": 20858 + }, + { + "epoch": 6.402394106813996, + "grad_norm": 0.17554323375225067, + "learning_rate": 3.027616621712711e-05, + "loss": 1.6987, + "step": 20859 + }, + { + "epoch": 6.402701043585021, + "grad_norm": 0.205257385969162, + "learning_rate": 3.027159883754611e-05, + "loss": 1.7951, + "step": 20860 + }, + { + "epoch": 6.403007980356047, + "grad_norm": 0.1766849011182785, + "learning_rate": 3.0267031652932743e-05, + "loss": 1.7157, + "step": 20861 + }, + { + "epoch": 6.403314917127072, + "grad_norm": 0.17106789350509644, + "learning_rate": 3.0262464663332106e-05, + "loss": 1.685, + "step": 20862 + }, + { + "epoch": 6.403621853898097, + "grad_norm": 0.17380768060684204, + "learning_rate": 3.0257897868789377e-05, + "loss": 1.708, + "step": 20863 + }, + { + "epoch": 6.403928790669122, + "grad_norm": 0.15817396342754364, + "learning_rate": 3.0253331269349662e-05, + "loss": 1.6629, + "step": 20864 + }, + { + "epoch": 6.404235727440147, + "grad_norm": 0.18253934383392334, + "learning_rate": 3.0248764865058122e-05, + "loss": 1.6877, + "step": 20865 + }, + { + "epoch": 6.4045426642111725, + "grad_norm": 0.20645618438720703, + "learning_rate": 3.0244198655959843e-05, + "loss": 1.7238, + "step": 20866 + }, + { + "epoch": 6.404849600982198, + "grad_norm": 0.2216680645942688, + "learning_rate": 3.0239632642099992e-05, + "loss": 1.7721, + "step": 20867 + }, + { + "epoch": 6.405156537753223, + "grad_norm": 0.21479755640029907, + "learning_rate": 3.023506682352365e-05, + "loss": 1.6686, + "step": 20868 + }, + { + "epoch": 6.4054634745242485, + "grad_norm": 0.21274925768375397, + "learning_rate": 3.0230501200275974e-05, + "loss": 1.7245, + "step": 20869 + }, + { + "epoch": 6.405770411295273, + "grad_norm": 0.19894039630889893, + "learning_rate": 3.0225935772402064e-05, + "loss": 1.6734, + "step": 20870 + }, + { 
+ "epoch": 6.406077348066298, + "grad_norm": 0.24450170993804932, + "learning_rate": 3.022137053994707e-05, + "loss": 1.7103, + "step": 20871 + }, + { + "epoch": 6.406384284837324, + "grad_norm": 0.18289846181869507, + "learning_rate": 3.0216805502956057e-05, + "loss": 1.7866, + "step": 20872 + }, + { + "epoch": 6.406691221608349, + "grad_norm": 0.2884466350078583, + "learning_rate": 3.021224066147419e-05, + "loss": 1.7817, + "step": 20873 + }, + { + "epoch": 6.406998158379374, + "grad_norm": 0.21871373057365417, + "learning_rate": 3.0207676015546537e-05, + "loss": 1.6871, + "step": 20874 + }, + { + "epoch": 6.407305095150399, + "grad_norm": 0.239889994263649, + "learning_rate": 3.0203111565218244e-05, + "loss": 1.6412, + "step": 20875 + }, + { + "epoch": 6.407612031921424, + "grad_norm": 0.26960206031799316, + "learning_rate": 3.019854731053441e-05, + "loss": 1.7537, + "step": 20876 + }, + { + "epoch": 6.407918968692449, + "grad_norm": 0.32872483134269714, + "learning_rate": 3.019398325154013e-05, + "loss": 1.7718, + "step": 20877 + }, + { + "epoch": 6.408225905463475, + "grad_norm": 0.27766308188438416, + "learning_rate": 3.018941938828053e-05, + "loss": 1.7537, + "step": 20878 + }, + { + "epoch": 6.4085328422345, + "grad_norm": 0.1989286094903946, + "learning_rate": 3.0184855720800674e-05, + "loss": 1.7373, + "step": 20879 + }, + { + "epoch": 6.4088397790055245, + "grad_norm": 0.19748768210411072, + "learning_rate": 3.0180292249145703e-05, + "loss": 1.6821, + "step": 20880 + }, + { + "epoch": 6.40914671577655, + "grad_norm": 0.20632879436016083, + "learning_rate": 3.0175728973360694e-05, + "loss": 1.7641, + "step": 20881 + }, + { + "epoch": 6.409453652547575, + "grad_norm": 0.23808124661445618, + "learning_rate": 3.017116589349076e-05, + "loss": 1.7434, + "step": 20882 + }, + { + "epoch": 6.4097605893186005, + "grad_norm": 0.265514612197876, + "learning_rate": 3.0166603009580974e-05, + "loss": 1.7877, + "step": 20883 + }, + { + "epoch": 6.410067526089626, + 
"grad_norm": 0.21031250059604645, + "learning_rate": 3.0162040321676465e-05, + "loss": 1.738, + "step": 20884 + }, + { + "epoch": 6.41037446286065, + "grad_norm": 0.3011578619480133, + "learning_rate": 3.015747782982228e-05, + "loss": 1.7063, + "step": 20885 + }, + { + "epoch": 6.410681399631676, + "grad_norm": 0.28601503372192383, + "learning_rate": 3.015291553406353e-05, + "loss": 1.7021, + "step": 20886 + }, + { + "epoch": 6.410988336402701, + "grad_norm": 0.2433992624282837, + "learning_rate": 3.014835343444531e-05, + "loss": 1.6887, + "step": 20887 + }, + { + "epoch": 6.411295273173726, + "grad_norm": 0.3342660963535309, + "learning_rate": 3.014379153101269e-05, + "loss": 1.7798, + "step": 20888 + }, + { + "epoch": 6.411602209944752, + "grad_norm": 0.2390800267457962, + "learning_rate": 3.0139229823810757e-05, + "loss": 1.774, + "step": 20889 + }, + { + "epoch": 6.411909146715777, + "grad_norm": 0.2659217417240143, + "learning_rate": 3.0134668312884613e-05, + "loss": 1.7396, + "step": 20890 + }, + { + "epoch": 6.412216083486801, + "grad_norm": 0.22885620594024658, + "learning_rate": 3.0130106998279294e-05, + "loss": 1.7303, + "step": 20891 + }, + { + "epoch": 6.412523020257827, + "grad_norm": 0.20651856064796448, + "learning_rate": 3.0125545880039925e-05, + "loss": 1.7796, + "step": 20892 + }, + { + "epoch": 6.412829957028852, + "grad_norm": 0.26611828804016113, + "learning_rate": 3.0120984958211552e-05, + "loss": 1.7019, + "step": 20893 + }, + { + "epoch": 6.413136893799877, + "grad_norm": 0.2526776194572449, + "learning_rate": 3.0116424232839258e-05, + "loss": 1.7062, + "step": 20894 + }, + { + "epoch": 6.413443830570903, + "grad_norm": 0.2087634801864624, + "learning_rate": 3.0111863703968128e-05, + "loss": 1.7011, + "step": 20895 + }, + { + "epoch": 6.413750767341927, + "grad_norm": 0.20656780898571014, + "learning_rate": 3.0107303371643197e-05, + "loss": 1.7637, + "step": 20896 + }, + { + "epoch": 6.4140577041129525, + "grad_norm": 0.2083009034395218, + 
"learning_rate": 3.010274323590956e-05, + "loss": 1.7213, + "step": 20897 + }, + { + "epoch": 6.414364640883978, + "grad_norm": 0.22496090829372406, + "learning_rate": 3.0098183296812277e-05, + "loss": 1.7793, + "step": 20898 + }, + { + "epoch": 6.414671577655003, + "grad_norm": 0.2601132392883301, + "learning_rate": 3.0093623554396416e-05, + "loss": 1.8358, + "step": 20899 + }, + { + "epoch": 6.4149785144260285, + "grad_norm": 0.2364497184753418, + "learning_rate": 3.0089064008707026e-05, + "loss": 1.7299, + "step": 20900 + }, + { + "epoch": 6.415285451197054, + "grad_norm": 0.2011861503124237, + "learning_rate": 3.0084504659789186e-05, + "loss": 1.7521, + "step": 20901 + }, + { + "epoch": 6.415592387968078, + "grad_norm": 0.20605513453483582, + "learning_rate": 3.007994550768793e-05, + "loss": 1.7099, + "step": 20902 + }, + { + "epoch": 6.415899324739104, + "grad_norm": 0.20890796184539795, + "learning_rate": 3.0075386552448337e-05, + "loss": 1.7383, + "step": 20903 + }, + { + "epoch": 6.416206261510129, + "grad_norm": 0.20005083084106445, + "learning_rate": 3.0070827794115452e-05, + "loss": 1.6999, + "step": 20904 + }, + { + "epoch": 6.416513198281154, + "grad_norm": 0.20547670125961304, + "learning_rate": 3.006626923273433e-05, + "loss": 1.7424, + "step": 20905 + }, + { + "epoch": 6.41682013505218, + "grad_norm": 0.20799006521701813, + "learning_rate": 3.0061710868350003e-05, + "loss": 1.7266, + "step": 20906 + }, + { + "epoch": 6.417127071823204, + "grad_norm": 0.22234687209129333, + "learning_rate": 3.0057152701007563e-05, + "loss": 1.7755, + "step": 20907 + }, + { + "epoch": 6.417434008594229, + "grad_norm": 0.21947267651557922, + "learning_rate": 3.0052594730752005e-05, + "loss": 1.826, + "step": 20908 + }, + { + "epoch": 6.417740945365255, + "grad_norm": 0.2183268964290619, + "learning_rate": 3.0048036957628416e-05, + "loss": 1.7772, + "step": 20909 + }, + { + "epoch": 6.41804788213628, + "grad_norm": 0.1967134177684784, + "learning_rate": 
3.0043479381681805e-05, + "loss": 1.6833, + "step": 20910 + }, + { + "epoch": 6.418354818907305, + "grad_norm": 0.2016787827014923, + "learning_rate": 3.003892200295723e-05, + "loss": 1.773, + "step": 20911 + }, + { + "epoch": 6.41866175567833, + "grad_norm": 0.2192344218492508, + "learning_rate": 3.0034364821499745e-05, + "loss": 1.7124, + "step": 20912 + }, + { + "epoch": 6.418968692449355, + "grad_norm": 0.24924327433109283, + "learning_rate": 3.002980783735434e-05, + "loss": 1.6882, + "step": 20913 + }, + { + "epoch": 6.4192756292203805, + "grad_norm": 0.2221844494342804, + "learning_rate": 3.0025251050566106e-05, + "loss": 1.8028, + "step": 20914 + }, + { + "epoch": 6.419582565991406, + "grad_norm": 0.27141162753105164, + "learning_rate": 3.0020694461180033e-05, + "loss": 1.698, + "step": 20915 + }, + { + "epoch": 6.419889502762431, + "grad_norm": 0.18856655061244965, + "learning_rate": 3.001613806924117e-05, + "loss": 1.7112, + "step": 20916 + }, + { + "epoch": 6.420196439533456, + "grad_norm": 0.2226688265800476, + "learning_rate": 3.0011581874794537e-05, + "loss": 1.6967, + "step": 20917 + }, + { + "epoch": 6.420503376304481, + "grad_norm": 0.2070344239473343, + "learning_rate": 3.000702587788518e-05, + "loss": 1.742, + "step": 20918 + }, + { + "epoch": 6.420810313075506, + "grad_norm": 0.22616387903690338, + "learning_rate": 3.00024700785581e-05, + "loss": 1.6865, + "step": 20919 + }, + { + "epoch": 6.421117249846532, + "grad_norm": 0.19745604693889618, + "learning_rate": 2.9997914476858348e-05, + "loss": 1.7328, + "step": 20920 + }, + { + "epoch": 6.421424186617557, + "grad_norm": 0.20654593408107758, + "learning_rate": 2.9993359072830906e-05, + "loss": 1.7811, + "step": 20921 + }, + { + "epoch": 6.421731123388582, + "grad_norm": 0.19188611209392548, + "learning_rate": 2.9988803866520832e-05, + "loss": 1.6808, + "step": 20922 + }, + { + "epoch": 6.422038060159607, + "grad_norm": 0.19907493889331818, + "learning_rate": 2.9984248857973118e-05, + "loss": 
1.7326, + "step": 20923 + }, + { + "epoch": 6.422344996930632, + "grad_norm": 0.17484794557094574, + "learning_rate": 2.9979694047232804e-05, + "loss": 1.7166, + "step": 20924 + }, + { + "epoch": 6.422651933701657, + "grad_norm": 0.21412795782089233, + "learning_rate": 2.997513943434487e-05, + "loss": 1.7926, + "step": 20925 + }, + { + "epoch": 6.422958870472683, + "grad_norm": 0.17554008960723877, + "learning_rate": 2.9970585019354357e-05, + "loss": 1.6931, + "step": 20926 + }, + { + "epoch": 6.423265807243708, + "grad_norm": 0.16687868535518646, + "learning_rate": 2.9966030802306256e-05, + "loss": 1.6911, + "step": 20927 + }, + { + "epoch": 6.4235727440147325, + "grad_norm": 0.1802106350660324, + "learning_rate": 2.9961476783245578e-05, + "loss": 1.6921, + "step": 20928 + }, + { + "epoch": 6.423879680785758, + "grad_norm": 0.1968134343624115, + "learning_rate": 2.9956922962217347e-05, + "loss": 1.7035, + "step": 20929 + }, + { + "epoch": 6.424186617556783, + "grad_norm": 0.17703908681869507, + "learning_rate": 2.9952369339266538e-05, + "loss": 1.7122, + "step": 20930 + }, + { + "epoch": 6.4244935543278086, + "grad_norm": 0.22176744043827057, + "learning_rate": 2.9947815914438175e-05, + "loss": 1.7189, + "step": 20931 + }, + { + "epoch": 6.424800491098834, + "grad_norm": 0.19128306210041046, + "learning_rate": 2.9943262687777236e-05, + "loss": 1.7208, + "step": 20932 + }, + { + "epoch": 6.425107427869859, + "grad_norm": 0.2285725623369217, + "learning_rate": 2.9938709659328735e-05, + "loss": 1.7859, + "step": 20933 + }, + { + "epoch": 6.425414364640884, + "grad_norm": 0.1998651921749115, + "learning_rate": 2.9934156829137653e-05, + "loss": 1.6912, + "step": 20934 + }, + { + "epoch": 6.425721301411909, + "grad_norm": 0.1879023313522339, + "learning_rate": 2.9929604197249016e-05, + "loss": 1.7164, + "step": 20935 + }, + { + "epoch": 6.426028238182934, + "grad_norm": 0.2675700783729553, + "learning_rate": 2.992505176370778e-05, + "loss": 1.7475, + "step": 20936 + }, 
+ { + "epoch": 6.42633517495396, + "grad_norm": 0.22345949709415436, + "learning_rate": 2.992049952855896e-05, + "loss": 1.6867, + "step": 20937 + }, + { + "epoch": 6.426642111724985, + "grad_norm": 0.17801997065544128, + "learning_rate": 2.9915947491847517e-05, + "loss": 1.736, + "step": 20938 + }, + { + "epoch": 6.4269490484960095, + "grad_norm": 0.22132502496242523, + "learning_rate": 2.991139565361846e-05, + "loss": 1.7244, + "step": 20939 + }, + { + "epoch": 6.427255985267035, + "grad_norm": 0.1899508535861969, + "learning_rate": 2.9906844013916758e-05, + "loss": 1.6781, + "step": 20940 + }, + { + "epoch": 6.42756292203806, + "grad_norm": 0.21948131918907166, + "learning_rate": 2.9902292572787414e-05, + "loss": 1.6911, + "step": 20941 + }, + { + "epoch": 6.4278698588090855, + "grad_norm": 0.16277503967285156, + "learning_rate": 2.9897741330275387e-05, + "loss": 1.702, + "step": 20942 + }, + { + "epoch": 6.428176795580111, + "grad_norm": 0.22303056716918945, + "learning_rate": 2.989319028642567e-05, + "loss": 1.7573, + "step": 20943 + }, + { + "epoch": 6.428483732351136, + "grad_norm": 0.21077899634838104, + "learning_rate": 2.9888639441283217e-05, + "loss": 1.7903, + "step": 20944 + }, + { + "epoch": 6.428790669122161, + "grad_norm": 0.23918256163597107, + "learning_rate": 2.988408879489303e-05, + "loss": 1.7112, + "step": 20945 + }, + { + "epoch": 6.429097605893186, + "grad_norm": 0.22226610779762268, + "learning_rate": 2.9879538347300074e-05, + "loss": 1.7039, + "step": 20946 + }, + { + "epoch": 6.429404542664211, + "grad_norm": 0.18605270981788635, + "learning_rate": 2.987498809854929e-05, + "loss": 1.7102, + "step": 20947 + }, + { + "epoch": 6.429711479435237, + "grad_norm": 0.24812746047973633, + "learning_rate": 2.987043804868569e-05, + "loss": 1.7112, + "step": 20948 + }, + { + "epoch": 6.430018416206262, + "grad_norm": 0.1869048923254013, + "learning_rate": 2.9865888197754206e-05, + "loss": 1.6946, + "step": 20949 + }, + { + "epoch": 6.430325352977286, 
+ "grad_norm": 0.30707576870918274, + "learning_rate": 2.986133854579982e-05, + "loss": 1.7596, + "step": 20950 + }, + { + "epoch": 6.430632289748312, + "grad_norm": 0.20475640892982483, + "learning_rate": 2.985678909286748e-05, + "loss": 1.7162, + "step": 20951 + }, + { + "epoch": 6.430939226519337, + "grad_norm": 0.24273128807544708, + "learning_rate": 2.9852239839002182e-05, + "loss": 1.6803, + "step": 20952 + }, + { + "epoch": 6.431246163290362, + "grad_norm": 0.27484890818595886, + "learning_rate": 2.9847690784248834e-05, + "loss": 1.7948, + "step": 20953 + }, + { + "epoch": 6.431553100061388, + "grad_norm": 0.2204331010580063, + "learning_rate": 2.984314192865244e-05, + "loss": 1.769, + "step": 20954 + }, + { + "epoch": 6.431860036832412, + "grad_norm": 0.262463241815567, + "learning_rate": 2.9838593272257907e-05, + "loss": 1.7483, + "step": 20955 + }, + { + "epoch": 6.4321669736034375, + "grad_norm": 0.225942924618721, + "learning_rate": 2.983404481511023e-05, + "loss": 1.7228, + "step": 20956 + }, + { + "epoch": 6.432473910374463, + "grad_norm": 0.22381044924259186, + "learning_rate": 2.982949655725432e-05, + "loss": 1.7579, + "step": 20957 + }, + { + "epoch": 6.432780847145488, + "grad_norm": 0.1937711238861084, + "learning_rate": 2.982494849873518e-05, + "loss": 1.6833, + "step": 20958 + }, + { + "epoch": 6.4330877839165135, + "grad_norm": 0.2609664499759674, + "learning_rate": 2.9820400639597702e-05, + "loss": 1.7524, + "step": 20959 + }, + { + "epoch": 6.433394720687538, + "grad_norm": 0.2891463041305542, + "learning_rate": 2.981585297988686e-05, + "loss": 1.7672, + "step": 20960 + }, + { + "epoch": 6.433701657458563, + "grad_norm": 0.19604064524173737, + "learning_rate": 2.9811305519647582e-05, + "loss": 1.6684, + "step": 20961 + }, + { + "epoch": 6.434008594229589, + "grad_norm": 0.23522239923477173, + "learning_rate": 2.9806758258924822e-05, + "loss": 1.7461, + "step": 20962 + }, + { + "epoch": 6.434315531000614, + "grad_norm": 0.24907514452934265, + 
"learning_rate": 2.9802211197763525e-05, + "loss": 1.7702, + "step": 20963 + }, + { + "epoch": 6.434622467771639, + "grad_norm": 0.21963126957416534, + "learning_rate": 2.9797664336208592e-05, + "loss": 1.7263, + "step": 20964 + }, + { + "epoch": 6.434929404542665, + "grad_norm": 0.23124000430107117, + "learning_rate": 2.9793117674305004e-05, + "loss": 1.7362, + "step": 20965 + }, + { + "epoch": 6.435236341313689, + "grad_norm": 0.1917882263660431, + "learning_rate": 2.978857121209765e-05, + "loss": 1.7505, + "step": 20966 + }, + { + "epoch": 6.435543278084714, + "grad_norm": 0.24407804012298584, + "learning_rate": 2.9784024949631484e-05, + "loss": 1.7898, + "step": 20967 + }, + { + "epoch": 6.43585021485574, + "grad_norm": 0.210384339094162, + "learning_rate": 2.977947888695143e-05, + "loss": 1.7515, + "step": 20968 + }, + { + "epoch": 6.436157151626765, + "grad_norm": 0.20764803886413574, + "learning_rate": 2.9774933024102436e-05, + "loss": 1.7628, + "step": 20969 + }, + { + "epoch": 6.43646408839779, + "grad_norm": 0.21542097628116608, + "learning_rate": 2.9770387361129387e-05, + "loss": 1.7882, + "step": 20970 + }, + { + "epoch": 6.436771025168815, + "grad_norm": 0.1768570989370346, + "learning_rate": 2.976584189807725e-05, + "loss": 1.7471, + "step": 20971 + }, + { + "epoch": 6.43707796193984, + "grad_norm": 0.2398732751607895, + "learning_rate": 2.97612966349909e-05, + "loss": 1.6676, + "step": 20972 + }, + { + "epoch": 6.4373848987108655, + "grad_norm": 0.18291664123535156, + "learning_rate": 2.9756751571915286e-05, + "loss": 1.6791, + "step": 20973 + }, + { + "epoch": 6.437691835481891, + "grad_norm": 0.2769327759742737, + "learning_rate": 2.9752206708895314e-05, + "loss": 1.7675, + "step": 20974 + }, + { + "epoch": 6.437998772252916, + "grad_norm": 0.24859526753425598, + "learning_rate": 2.974766204597592e-05, + "loss": 1.7661, + "step": 20975 + }, + { + "epoch": 6.4383057090239415, + "grad_norm": 0.20495273172855377, + "learning_rate": 
2.9743117583201984e-05, + "loss": 1.6774, + "step": 20976 + }, + { + "epoch": 6.438612645794966, + "grad_norm": 0.24650859832763672, + "learning_rate": 2.9738573320618447e-05, + "loss": 1.759, + "step": 20977 + }, + { + "epoch": 6.438919582565991, + "grad_norm": 0.21430176496505737, + "learning_rate": 2.973402925827019e-05, + "loss": 1.7273, + "step": 20978 + }, + { + "epoch": 6.439226519337017, + "grad_norm": 0.22392596304416656, + "learning_rate": 2.972948539620214e-05, + "loss": 1.7506, + "step": 20979 + }, + { + "epoch": 6.439533456108042, + "grad_norm": 0.24393923580646515, + "learning_rate": 2.9724941734459205e-05, + "loss": 1.7815, + "step": 20980 + }, + { + "epoch": 6.439840392879067, + "grad_norm": 0.2873772084712982, + "learning_rate": 2.9720398273086264e-05, + "loss": 1.7863, + "step": 20981 + }, + { + "epoch": 6.440147329650092, + "grad_norm": 0.218470498919487, + "learning_rate": 2.9715855012128246e-05, + "loss": 1.7347, + "step": 20982 + }, + { + "epoch": 6.440454266421117, + "grad_norm": 0.24520666897296906, + "learning_rate": 2.971131195163003e-05, + "loss": 1.6892, + "step": 20983 + }, + { + "epoch": 6.440761203192142, + "grad_norm": 0.2255270928144455, + "learning_rate": 2.970676909163652e-05, + "loss": 1.7179, + "step": 20984 + }, + { + "epoch": 6.441068139963168, + "grad_norm": 0.25171026587486267, + "learning_rate": 2.9702226432192604e-05, + "loss": 1.7087, + "step": 20985 + }, + { + "epoch": 6.441375076734193, + "grad_norm": 0.27045872807502747, + "learning_rate": 2.9697683973343204e-05, + "loss": 1.732, + "step": 20986 + }, + { + "epoch": 6.4416820135052175, + "grad_norm": 0.25374144315719604, + "learning_rate": 2.9693141715133177e-05, + "loss": 1.7688, + "step": 20987 + }, + { + "epoch": 6.441988950276243, + "grad_norm": 0.22694779932498932, + "learning_rate": 2.9688599657607442e-05, + "loss": 1.7105, + "step": 20988 + }, + { + "epoch": 6.442295887047268, + "grad_norm": 0.23455791175365448, + "learning_rate": 2.9684057800810845e-05, + 
"loss": 1.8007, + "step": 20989 + }, + { + "epoch": 6.4426028238182935, + "grad_norm": 0.23054158687591553, + "learning_rate": 2.9679516144788312e-05, + "loss": 1.6787, + "step": 20990 + }, + { + "epoch": 6.442909760589319, + "grad_norm": 0.22110030055046082, + "learning_rate": 2.9674974689584696e-05, + "loss": 1.8048, + "step": 20991 + }, + { + "epoch": 6.443216697360343, + "grad_norm": 0.22141657769680023, + "learning_rate": 2.9670433435244915e-05, + "loss": 1.7691, + "step": 20992 + }, + { + "epoch": 6.443523634131369, + "grad_norm": 0.18511974811553955, + "learning_rate": 2.9665892381813807e-05, + "loss": 1.6825, + "step": 20993 + }, + { + "epoch": 6.443830570902394, + "grad_norm": 0.21904997527599335, + "learning_rate": 2.966135152933629e-05, + "loss": 1.7711, + "step": 20994 + }, + { + "epoch": 6.444137507673419, + "grad_norm": 0.19334301352500916, + "learning_rate": 2.9656810877857196e-05, + "loss": 1.687, + "step": 20995 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.1766969859600067, + "learning_rate": 2.9652270427421426e-05, + "loss": 1.7211, + "step": 20996 + }, + { + "epoch": 6.44475138121547, + "grad_norm": 0.1821468323469162, + "learning_rate": 2.9647730178073864e-05, + "loss": 1.7086, + "step": 20997 + }, + { + "epoch": 6.445058317986494, + "grad_norm": 0.20812760293483734, + "learning_rate": 2.9643190129859333e-05, + "loss": 1.6844, + "step": 20998 + }, + { + "epoch": 6.44536525475752, + "grad_norm": 0.259042352437973, + "learning_rate": 2.9638650282822754e-05, + "loss": 1.7971, + "step": 20999 + }, + { + "epoch": 6.445672191528545, + "grad_norm": 0.2134076952934265, + "learning_rate": 2.9634110637008948e-05, + "loss": 1.7061, + "step": 21000 + }, + { + "epoch": 6.44597912829957, + "grad_norm": 0.21120613813400269, + "learning_rate": 2.962957119246281e-05, + "loss": 1.6708, + "step": 21001 + }, + { + "epoch": 6.446286065070596, + "grad_norm": 0.18577797710895538, + "learning_rate": 2.9625031949229176e-05, + "loss": 1.719, + "step": 21002 + 
}, + { + "epoch": 6.44659300184162, + "grad_norm": 0.21755708754062653, + "learning_rate": 2.962049290735294e-05, + "loss": 1.7203, + "step": 21003 + }, + { + "epoch": 6.4468999386126455, + "grad_norm": 0.2161538451910019, + "learning_rate": 2.961595406687891e-05, + "loss": 1.7254, + "step": 21004 + }, + { + "epoch": 6.447206875383671, + "grad_norm": 0.19979329407215118, + "learning_rate": 2.9611415427851995e-05, + "loss": 1.7203, + "step": 21005 + }, + { + "epoch": 6.447513812154696, + "grad_norm": 0.2103399932384491, + "learning_rate": 2.9606876990317e-05, + "loss": 1.7291, + "step": 21006 + }, + { + "epoch": 6.4478207489257215, + "grad_norm": 0.19513745605945587, + "learning_rate": 2.9602338754318815e-05, + "loss": 1.7574, + "step": 21007 + }, + { + "epoch": 6.448127685696747, + "grad_norm": 0.19819851219654083, + "learning_rate": 2.9597800719902256e-05, + "loss": 1.6913, + "step": 21008 + }, + { + "epoch": 6.448434622467771, + "grad_norm": 0.1847768872976303, + "learning_rate": 2.9593262887112215e-05, + "loss": 1.6987, + "step": 21009 + }, + { + "epoch": 6.448741559238797, + "grad_norm": 0.22399301826953888, + "learning_rate": 2.9588725255993487e-05, + "loss": 1.8328, + "step": 21010 + }, + { + "epoch": 6.449048496009822, + "grad_norm": 0.20540264248847961, + "learning_rate": 2.958418782659097e-05, + "loss": 1.765, + "step": 21011 + }, + { + "epoch": 6.449355432780847, + "grad_norm": 0.183661550283432, + "learning_rate": 2.9579650598949442e-05, + "loss": 1.7128, + "step": 21012 + }, + { + "epoch": 6.449662369551873, + "grad_norm": 0.1972927302122116, + "learning_rate": 2.9575113573113788e-05, + "loss": 1.717, + "step": 21013 + }, + { + "epoch": 6.449969306322897, + "grad_norm": 0.20188379287719727, + "learning_rate": 2.9570576749128846e-05, + "loss": 1.7603, + "step": 21014 + }, + { + "epoch": 6.4502762430939224, + "grad_norm": 0.20789781212806702, + "learning_rate": 2.9566040127039418e-05, + "loss": 1.7142, + "step": 21015 + }, + { + "epoch": 
6.450583179864948, + "grad_norm": 0.19319608807563782, + "learning_rate": 2.956150370689038e-05, + "loss": 1.7524, + "step": 21016 + }, + { + "epoch": 6.450890116635973, + "grad_norm": 0.2153816968202591, + "learning_rate": 2.9556967488726516e-05, + "loss": 1.7325, + "step": 21017 + }, + { + "epoch": 6.4511970534069984, + "grad_norm": 0.19134823977947235, + "learning_rate": 2.9552431472592702e-05, + "loss": 1.7547, + "step": 21018 + }, + { + "epoch": 6.451503990178024, + "grad_norm": 0.21069955825805664, + "learning_rate": 2.9547895658533725e-05, + "loss": 1.7038, + "step": 21019 + }, + { + "epoch": 6.451810926949048, + "grad_norm": 0.20742546021938324, + "learning_rate": 2.9543360046594455e-05, + "loss": 1.7151, + "step": 21020 + }, + { + "epoch": 6.452117863720074, + "grad_norm": 0.16917672753334045, + "learning_rate": 2.9538824636819666e-05, + "loss": 1.6957, + "step": 21021 + }, + { + "epoch": 6.452424800491099, + "grad_norm": 0.21134577691555023, + "learning_rate": 2.953428942925423e-05, + "loss": 1.711, + "step": 21022 + }, + { + "epoch": 6.452731737262124, + "grad_norm": 0.19403810799121857, + "learning_rate": 2.9529754423942918e-05, + "loss": 1.734, + "step": 21023 + }, + { + "epoch": 6.45303867403315, + "grad_norm": 0.18534770607948303, + "learning_rate": 2.9525219620930582e-05, + "loss": 1.6857, + "step": 21024 + }, + { + "epoch": 6.453345610804174, + "grad_norm": 0.24268858134746552, + "learning_rate": 2.9520685020262016e-05, + "loss": 1.7316, + "step": 21025 + }, + { + "epoch": 6.453652547575199, + "grad_norm": 0.17590615153312683, + "learning_rate": 2.9516150621982063e-05, + "loss": 1.6608, + "step": 21026 + }, + { + "epoch": 6.453959484346225, + "grad_norm": 0.1949763298034668, + "learning_rate": 2.9511616426135504e-05, + "loss": 1.7955, + "step": 21027 + }, + { + "epoch": 6.45426642111725, + "grad_norm": 0.2424435019493103, + "learning_rate": 2.950708243276717e-05, + "loss": 1.7334, + "step": 21028 + }, + { + "epoch": 6.454573357888275, + 
"grad_norm": 0.22753369808197021, + "learning_rate": 2.950254864192184e-05, + "loss": 1.733, + "step": 21029 + }, + { + "epoch": 6.4548802946593, + "grad_norm": 0.1706271469593048, + "learning_rate": 2.949801505364435e-05, + "loss": 1.7424, + "step": 21030 + }, + { + "epoch": 6.455187231430325, + "grad_norm": 0.21614442765712738, + "learning_rate": 2.9493481667979506e-05, + "loss": 1.7813, + "step": 21031 + }, + { + "epoch": 6.4554941682013505, + "grad_norm": 0.1793162226676941, + "learning_rate": 2.9488948484972068e-05, + "loss": 1.7076, + "step": 21032 + }, + { + "epoch": 6.455801104972376, + "grad_norm": 0.19251759350299835, + "learning_rate": 2.9484415504666885e-05, + "loss": 1.7487, + "step": 21033 + }, + { + "epoch": 6.456108041743401, + "grad_norm": 0.1817556619644165, + "learning_rate": 2.947988272710871e-05, + "loss": 1.6958, + "step": 21034 + }, + { + "epoch": 6.456414978514426, + "grad_norm": 0.24368418753147125, + "learning_rate": 2.9475350152342378e-05, + "loss": 1.7867, + "step": 21035 + }, + { + "epoch": 6.456721915285451, + "grad_norm": 0.2362157702445984, + "learning_rate": 2.9470817780412653e-05, + "loss": 1.7241, + "step": 21036 + }, + { + "epoch": 6.457028852056476, + "grad_norm": 0.21049003303050995, + "learning_rate": 2.9466285611364358e-05, + "loss": 1.7146, + "step": 21037 + }, + { + "epoch": 6.457335788827502, + "grad_norm": 0.2516530454158783, + "learning_rate": 2.9461753645242246e-05, + "loss": 1.7349, + "step": 21038 + }, + { + "epoch": 6.457642725598527, + "grad_norm": 0.23165179789066315, + "learning_rate": 2.945722188209114e-05, + "loss": 1.7285, + "step": 21039 + }, + { + "epoch": 6.457949662369552, + "grad_norm": 0.27345010638237, + "learning_rate": 2.945269032195579e-05, + "loss": 1.7266, + "step": 21040 + }, + { + "epoch": 6.458256599140577, + "grad_norm": 0.16312900185585022, + "learning_rate": 2.9448158964881e-05, + "loss": 1.6781, + "step": 21041 + }, + { + "epoch": 6.458563535911602, + "grad_norm": 0.238658607006073, + 
"learning_rate": 2.9443627810911557e-05, + "loss": 1.6819, + "step": 21042 + }, + { + "epoch": 6.458870472682627, + "grad_norm": 0.19861388206481934, + "learning_rate": 2.943909686009223e-05, + "loss": 1.7397, + "step": 21043 + }, + { + "epoch": 6.459177409453653, + "grad_norm": 0.22675637900829315, + "learning_rate": 2.9434566112467793e-05, + "loss": 1.7231, + "step": 21044 + }, + { + "epoch": 6.459484346224678, + "grad_norm": 0.22638066112995148, + "learning_rate": 2.9430035568083043e-05, + "loss": 1.7466, + "step": 21045 + }, + { + "epoch": 6.4597912829957025, + "grad_norm": 0.2237064391374588, + "learning_rate": 2.942550522698272e-05, + "loss": 1.7373, + "step": 21046 + }, + { + "epoch": 6.460098219766728, + "grad_norm": 0.2613731324672699, + "learning_rate": 2.942097508921162e-05, + "loss": 1.7567, + "step": 21047 + }, + { + "epoch": 6.460405156537753, + "grad_norm": 0.21602070331573486, + "learning_rate": 2.941644515481452e-05, + "loss": 1.7512, + "step": 21048 + }, + { + "epoch": 6.4607120933087785, + "grad_norm": 0.30129116773605347, + "learning_rate": 2.941191542383615e-05, + "loss": 1.761, + "step": 21049 + }, + { + "epoch": 6.461019030079804, + "grad_norm": 0.2303919792175293, + "learning_rate": 2.940738589632132e-05, + "loss": 1.742, + "step": 21050 + }, + { + "epoch": 6.461325966850829, + "grad_norm": 0.2195158153772354, + "learning_rate": 2.940285657231475e-05, + "loss": 1.7169, + "step": 21051 + }, + { + "epoch": 6.461632903621854, + "grad_norm": 0.19029918313026428, + "learning_rate": 2.9398327451861242e-05, + "loss": 1.6721, + "step": 21052 + }, + { + "epoch": 6.461939840392879, + "grad_norm": 0.2006317377090454, + "learning_rate": 2.939379853500553e-05, + "loss": 1.7393, + "step": 21053 + }, + { + "epoch": 6.462246777163904, + "grad_norm": 0.222677081823349, + "learning_rate": 2.9389269821792377e-05, + "loss": 1.7858, + "step": 21054 + }, + { + "epoch": 6.46255371393493, + "grad_norm": 0.20772451162338257, + "learning_rate": 2.938474131226654e-05, 
+ "loss": 1.735, + "step": 21055 + }, + { + "epoch": 6.462860650705955, + "grad_norm": 0.21006503701210022, + "learning_rate": 2.9380213006472778e-05, + "loss": 1.7197, + "step": 21056 + }, + { + "epoch": 6.463167587476979, + "grad_norm": 0.23545250296592712, + "learning_rate": 2.9375684904455825e-05, + "loss": 1.8278, + "step": 21057 + }, + { + "epoch": 6.463474524248005, + "grad_norm": 0.24590329825878143, + "learning_rate": 2.937115700626045e-05, + "loss": 1.6411, + "step": 21058 + }, + { + "epoch": 6.46378146101903, + "grad_norm": 0.22359445691108704, + "learning_rate": 2.9366629311931393e-05, + "loss": 1.7901, + "step": 21059 + }, + { + "epoch": 6.464088397790055, + "grad_norm": 0.22807523608207703, + "learning_rate": 2.93621018215134e-05, + "loss": 1.7472, + "step": 21060 + }, + { + "epoch": 6.464395334561081, + "grad_norm": 0.24183115363121033, + "learning_rate": 2.93575745350512e-05, + "loss": 1.7553, + "step": 21061 + }, + { + "epoch": 6.464702271332105, + "grad_norm": 0.23809055984020233, + "learning_rate": 2.935304745258958e-05, + "loss": 1.7451, + "step": 21062 + }, + { + "epoch": 6.4650092081031305, + "grad_norm": 0.28455644845962524, + "learning_rate": 2.934852057417321e-05, + "loss": 1.8112, + "step": 21063 + }, + { + "epoch": 6.465316144874156, + "grad_norm": 0.22193321585655212, + "learning_rate": 2.9343993899846888e-05, + "loss": 1.747, + "step": 21064 + }, + { + "epoch": 6.465623081645181, + "grad_norm": 0.30524322390556335, + "learning_rate": 2.933946742965532e-05, + "loss": 1.7117, + "step": 21065 + }, + { + "epoch": 6.4659300184162065, + "grad_norm": 0.19748717546463013, + "learning_rate": 2.9334941163643233e-05, + "loss": 1.6899, + "step": 21066 + }, + { + "epoch": 6.466236955187231, + "grad_norm": 0.25551193952560425, + "learning_rate": 2.933041510185539e-05, + "loss": 1.7264, + "step": 21067 + }, + { + "epoch": 6.466543891958256, + "grad_norm": 0.20016206800937653, + "learning_rate": 2.932588924433648e-05, + "loss": 1.6613, + "step": 21068 
+ }, + { + "epoch": 6.466850828729282, + "grad_norm": 0.31049394607543945, + "learning_rate": 2.932136359113127e-05, + "loss": 1.6575, + "step": 21069 + }, + { + "epoch": 6.467157765500307, + "grad_norm": 0.29408347606658936, + "learning_rate": 2.9316838142284436e-05, + "loss": 1.72, + "step": 21070 + }, + { + "epoch": 6.467464702271332, + "grad_norm": 0.18981193006038666, + "learning_rate": 2.9312312897840748e-05, + "loss": 1.6799, + "step": 21071 + }, + { + "epoch": 6.467771639042358, + "grad_norm": 0.26828575134277344, + "learning_rate": 2.9307787857844905e-05, + "loss": 1.6983, + "step": 21072 + }, + { + "epoch": 6.468078575813382, + "grad_norm": 0.2605530321598053, + "learning_rate": 2.9303263022341642e-05, + "loss": 1.7973, + "step": 21073 + }, + { + "epoch": 6.468385512584407, + "grad_norm": 0.389957070350647, + "learning_rate": 2.9298738391375648e-05, + "loss": 1.7288, + "step": 21074 + }, + { + "epoch": 6.468692449355433, + "grad_norm": 0.20525416731834412, + "learning_rate": 2.9294213964991667e-05, + "loss": 1.7526, + "step": 21075 + }, + { + "epoch": 6.468999386126458, + "grad_norm": 0.3628186285495758, + "learning_rate": 2.9289689743234387e-05, + "loss": 1.7055, + "step": 21076 + }, + { + "epoch": 6.469306322897483, + "grad_norm": 0.21661829948425293, + "learning_rate": 2.9285165726148545e-05, + "loss": 1.7806, + "step": 21077 + }, + { + "epoch": 6.469613259668508, + "grad_norm": 0.3815501034259796, + "learning_rate": 2.9280641913778816e-05, + "loss": 1.7257, + "step": 21078 + }, + { + "epoch": 6.469920196439533, + "grad_norm": 0.19470983743667603, + "learning_rate": 2.9276118306169957e-05, + "loss": 1.7055, + "step": 21079 + }, + { + "epoch": 6.4702271332105585, + "grad_norm": 0.36236056685447693, + "learning_rate": 2.927159490336662e-05, + "loss": 1.6748, + "step": 21080 + }, + { + "epoch": 6.470534069981584, + "grad_norm": 0.201282799243927, + "learning_rate": 2.9267071705413552e-05, + "loss": 1.6987, + "step": 21081 + }, + { + "epoch": 
6.470841006752609, + "grad_norm": 0.3806697130203247, + "learning_rate": 2.9262548712355425e-05, + "loss": 1.7386, + "step": 21082 + }, + { + "epoch": 6.4711479435236345, + "grad_norm": 0.3023025691509247, + "learning_rate": 2.9258025924236933e-05, + "loss": 1.7183, + "step": 21083 + }, + { + "epoch": 6.471454880294659, + "grad_norm": 0.2648932635784149, + "learning_rate": 2.9253503341102806e-05, + "loss": 1.6755, + "step": 21084 + }, + { + "epoch": 6.471761817065684, + "grad_norm": 0.2647169828414917, + "learning_rate": 2.9248980962997707e-05, + "loss": 1.7326, + "step": 21085 + }, + { + "epoch": 6.47206875383671, + "grad_norm": 0.23535950481891632, + "learning_rate": 2.9244458789966355e-05, + "loss": 1.7541, + "step": 21086 + }, + { + "epoch": 6.472375690607735, + "grad_norm": 0.2551584541797638, + "learning_rate": 2.9239936822053403e-05, + "loss": 1.6907, + "step": 21087 + }, + { + "epoch": 6.47268262737876, + "grad_norm": 0.23313823342323303, + "learning_rate": 2.923541505930357e-05, + "loss": 1.705, + "step": 21088 + }, + { + "epoch": 6.472989564149785, + "grad_norm": 0.2368597686290741, + "learning_rate": 2.9230893501761534e-05, + "loss": 1.6666, + "step": 21089 + }, + { + "epoch": 6.47329650092081, + "grad_norm": 0.17861969769001007, + "learning_rate": 2.9226372149472003e-05, + "loss": 1.6927, + "step": 21090 + }, + { + "epoch": 6.473603437691835, + "grad_norm": 0.2212727665901184, + "learning_rate": 2.9221851002479616e-05, + "loss": 1.6972, + "step": 21091 + }, + { + "epoch": 6.473910374462861, + "grad_norm": 0.19382402300834656, + "learning_rate": 2.9217330060829096e-05, + "loss": 1.7602, + "step": 21092 + }, + { + "epoch": 6.474217311233886, + "grad_norm": 0.2762092053890228, + "learning_rate": 2.9212809324565076e-05, + "loss": 1.7642, + "step": 21093 + }, + { + "epoch": 6.474524248004911, + "grad_norm": 0.22068747878074646, + "learning_rate": 2.9208288793732274e-05, + "loss": 1.7477, + "step": 21094 + }, + { + "epoch": 6.474831184775936, + "grad_norm": 
0.19979839026927948, + "learning_rate": 2.9203768468375337e-05, + "loss": 1.7266, + "step": 21095 + }, + { + "epoch": 6.475138121546961, + "grad_norm": 0.23038682341575623, + "learning_rate": 2.9199248348538965e-05, + "loss": 1.7428, + "step": 21096 + }, + { + "epoch": 6.475445058317987, + "grad_norm": 0.16841283440589905, + "learning_rate": 2.91947284342678e-05, + "loss": 1.6788, + "step": 21097 + }, + { + "epoch": 6.475751995089012, + "grad_norm": 0.22812627255916595, + "learning_rate": 2.9190208725606528e-05, + "loss": 1.7513, + "step": 21098 + }, + { + "epoch": 6.476058931860037, + "grad_norm": 0.18409393727779388, + "learning_rate": 2.9185689222599832e-05, + "loss": 1.6834, + "step": 21099 + }, + { + "epoch": 6.476365868631062, + "grad_norm": 0.26226910948753357, + "learning_rate": 2.9181169925292313e-05, + "loss": 1.7375, + "step": 21100 + }, + { + "epoch": 6.476672805402087, + "grad_norm": 0.1915685385465622, + "learning_rate": 2.9176650833728697e-05, + "loss": 1.7521, + "step": 21101 + }, + { + "epoch": 6.476979742173112, + "grad_norm": 0.22342176735401154, + "learning_rate": 2.917213194795362e-05, + "loss": 1.8018, + "step": 21102 + }, + { + "epoch": 6.477286678944138, + "grad_norm": 0.18338742852210999, + "learning_rate": 2.9167613268011745e-05, + "loss": 1.6817, + "step": 21103 + }, + { + "epoch": 6.477593615715163, + "grad_norm": 0.23008635640144348, + "learning_rate": 2.9163094793947728e-05, + "loss": 1.7037, + "step": 21104 + }, + { + "epoch": 6.4779005524861875, + "grad_norm": 0.20954197645187378, + "learning_rate": 2.9158576525806215e-05, + "loss": 1.7565, + "step": 21105 + }, + { + "epoch": 6.478207489257213, + "grad_norm": 0.21065562963485718, + "learning_rate": 2.9154058463631874e-05, + "loss": 1.6899, + "step": 21106 + }, + { + "epoch": 6.478514426028238, + "grad_norm": 0.20217828452587128, + "learning_rate": 2.9149540607469335e-05, + "loss": 1.7055, + "step": 21107 + }, + { + "epoch": 6.4788213627992635, + "grad_norm": 0.19058823585510254, + 
"learning_rate": 2.9145022957363244e-05, + "loss": 1.6794, + "step": 21108 + }, + { + "epoch": 6.479128299570289, + "grad_norm": 0.2308664619922638, + "learning_rate": 2.9140505513358297e-05, + "loss": 1.7322, + "step": 21109 + }, + { + "epoch": 6.479435236341313, + "grad_norm": 0.18911845982074738, + "learning_rate": 2.9135988275499056e-05, + "loss": 1.7255, + "step": 21110 + }, + { + "epoch": 6.479742173112339, + "grad_norm": 0.21459296345710754, + "learning_rate": 2.9131471243830256e-05, + "loss": 1.6599, + "step": 21111 + }, + { + "epoch": 6.480049109883364, + "grad_norm": 0.20521530508995056, + "learning_rate": 2.912695441839644e-05, + "loss": 1.7564, + "step": 21112 + }, + { + "epoch": 6.480356046654389, + "grad_norm": 0.21924994885921478, + "learning_rate": 2.912243779924232e-05, + "loss": 1.6922, + "step": 21113 + }, + { + "epoch": 6.480662983425415, + "grad_norm": 0.18219491839408875, + "learning_rate": 2.911792138641253e-05, + "loss": 1.6907, + "step": 21114 + }, + { + "epoch": 6.48096992019644, + "grad_norm": 0.23122453689575195, + "learning_rate": 2.9113405179951626e-05, + "loss": 1.7665, + "step": 21115 + }, + { + "epoch": 6.481276856967464, + "grad_norm": 0.18411210179328918, + "learning_rate": 2.9108889179904348e-05, + "loss": 1.7216, + "step": 21116 + }, + { + "epoch": 6.48158379373849, + "grad_norm": 0.2251562923192978, + "learning_rate": 2.9104373386315225e-05, + "loss": 1.7605, + "step": 21117 + }, + { + "epoch": 6.481890730509515, + "grad_norm": 0.2252185344696045, + "learning_rate": 2.9099857799228957e-05, + "loss": 1.7345, + "step": 21118 + }, + { + "epoch": 6.48219766728054, + "grad_norm": 0.20799386501312256, + "learning_rate": 2.909534241869014e-05, + "loss": 1.7497, + "step": 21119 + }, + { + "epoch": 6.482504604051566, + "grad_norm": 0.2059052586555481, + "learning_rate": 2.90908272447434e-05, + "loss": 1.7444, + "step": 21120 + }, + { + "epoch": 6.48281154082259, + "grad_norm": 0.17851221561431885, + "learning_rate": 
2.9086312277433362e-05, + "loss": 1.7208, + "step": 21121 + }, + { + "epoch": 6.4831184775936155, + "grad_norm": 0.20561498403549194, + "learning_rate": 2.908179751680465e-05, + "loss": 1.731, + "step": 21122 + }, + { + "epoch": 6.483425414364641, + "grad_norm": 0.2386128008365631, + "learning_rate": 2.9077282962901868e-05, + "loss": 1.7493, + "step": 21123 + }, + { + "epoch": 6.483732351135666, + "grad_norm": 0.21024827659130096, + "learning_rate": 2.9072768615769642e-05, + "loss": 1.7353, + "step": 21124 + }, + { + "epoch": 6.4840392879066915, + "grad_norm": 0.23443256318569183, + "learning_rate": 2.9068254475452582e-05, + "loss": 1.7419, + "step": 21125 + }, + { + "epoch": 6.484346224677717, + "grad_norm": 0.1849295198917389, + "learning_rate": 2.90637405419953e-05, + "loss": 1.7239, + "step": 21126 + }, + { + "epoch": 6.484653161448741, + "grad_norm": 0.1967659890651703, + "learning_rate": 2.9059226815442385e-05, + "loss": 1.7163, + "step": 21127 + }, + { + "epoch": 6.484960098219767, + "grad_norm": 0.20395416021347046, + "learning_rate": 2.9054713295838505e-05, + "loss": 1.7108, + "step": 21128 + }, + { + "epoch": 6.485267034990792, + "grad_norm": 0.24162746965885162, + "learning_rate": 2.9050199983228184e-05, + "loss": 1.7666, + "step": 21129 + }, + { + "epoch": 6.485573971761817, + "grad_norm": 0.18104900419712067, + "learning_rate": 2.9045686877656086e-05, + "loss": 1.6863, + "step": 21130 + }, + { + "epoch": 6.485880908532843, + "grad_norm": 0.18469318747520447, + "learning_rate": 2.9041173979166813e-05, + "loss": 1.7344, + "step": 21131 + }, + { + "epoch": 6.486187845303867, + "grad_norm": 0.18488821387290955, + "learning_rate": 2.90366612878049e-05, + "loss": 1.694, + "step": 21132 + }, + { + "epoch": 6.486494782074892, + "grad_norm": 0.2030600905418396, + "learning_rate": 2.903214880361503e-05, + "loss": 1.7079, + "step": 21133 + }, + { + "epoch": 6.486801718845918, + "grad_norm": 0.2222873419523239, + "learning_rate": 2.902763652664171e-05, + "loss": 
1.7193, + "step": 21134 + }, + { + "epoch": 6.487108655616943, + "grad_norm": 0.1936846524477005, + "learning_rate": 2.9023124456929608e-05, + "loss": 1.7152, + "step": 21135 + }, + { + "epoch": 6.487415592387968, + "grad_norm": 0.25259360671043396, + "learning_rate": 2.9018612594523274e-05, + "loss": 1.776, + "step": 21136 + }, + { + "epoch": 6.487722529158993, + "grad_norm": 0.22994543612003326, + "learning_rate": 2.9014100939467316e-05, + "loss": 1.7437, + "step": 21137 + }, + { + "epoch": 6.488029465930018, + "grad_norm": 0.2646990716457367, + "learning_rate": 2.900958949180631e-05, + "loss": 1.7535, + "step": 21138 + }, + { + "epoch": 6.4883364027010435, + "grad_norm": 0.22973869740962982, + "learning_rate": 2.9005078251584843e-05, + "loss": 1.6772, + "step": 21139 + }, + { + "epoch": 6.488643339472069, + "grad_norm": 0.21261750161647797, + "learning_rate": 2.9000567218847497e-05, + "loss": 1.6899, + "step": 21140 + }, + { + "epoch": 6.488950276243094, + "grad_norm": 0.24828271567821503, + "learning_rate": 2.8996056393638858e-05, + "loss": 1.7994, + "step": 21141 + }, + { + "epoch": 6.4892572130141195, + "grad_norm": 0.18308857083320618, + "learning_rate": 2.8991545776003497e-05, + "loss": 1.7847, + "step": 21142 + }, + { + "epoch": 6.489564149785144, + "grad_norm": 0.22744092345237732, + "learning_rate": 2.8987035365985994e-05, + "loss": 1.7789, + "step": 21143 + }, + { + "epoch": 6.489871086556169, + "grad_norm": 0.18573936820030212, + "learning_rate": 2.8982525163630903e-05, + "loss": 1.6649, + "step": 21144 + }, + { + "epoch": 6.490178023327195, + "grad_norm": 0.26056674122810364, + "learning_rate": 2.8978015168982863e-05, + "loss": 1.68, + "step": 21145 + }, + { + "epoch": 6.49048496009822, + "grad_norm": 0.1912553906440735, + "learning_rate": 2.897350538208635e-05, + "loss": 1.7011, + "step": 21146 + }, + { + "epoch": 6.490791896869245, + "grad_norm": 0.25937187671661377, + "learning_rate": 2.896899580298603e-05, + "loss": 1.7409, + "step": 21147 + }, + 
{ + "epoch": 6.49109883364027, + "grad_norm": 0.22148750722408295, + "learning_rate": 2.8964486431726397e-05, + "loss": 1.6921, + "step": 21148 + }, + { + "epoch": 6.491405770411295, + "grad_norm": 0.23678559064865112, + "learning_rate": 2.8959977268352012e-05, + "loss": 1.6833, + "step": 21149 + }, + { + "epoch": 6.49171270718232, + "grad_norm": 0.2942093312740326, + "learning_rate": 2.8955468312907506e-05, + "loss": 1.7119, + "step": 21150 + }, + { + "epoch": 6.492019643953346, + "grad_norm": 0.18726128339767456, + "learning_rate": 2.8950959565437365e-05, + "loss": 1.7067, + "step": 21151 + }, + { + "epoch": 6.492326580724371, + "grad_norm": 0.23851951956748962, + "learning_rate": 2.894645102598621e-05, + "loss": 1.73, + "step": 21152 + }, + { + "epoch": 6.4926335174953955, + "grad_norm": 0.18054445087909698, + "learning_rate": 2.8941942694598533e-05, + "loss": 1.7243, + "step": 21153 + }, + { + "epoch": 6.492940454266421, + "grad_norm": 0.21889349818229675, + "learning_rate": 2.8937434571318934e-05, + "loss": 1.7789, + "step": 21154 + }, + { + "epoch": 6.493247391037446, + "grad_norm": 0.18788981437683105, + "learning_rate": 2.893292665619195e-05, + "loss": 1.7496, + "step": 21155 + }, + { + "epoch": 6.4935543278084715, + "grad_norm": 0.1964103877544403, + "learning_rate": 2.8928418949262138e-05, + "loss": 1.6732, + "step": 21156 + }, + { + "epoch": 6.493861264579497, + "grad_norm": 0.21939502656459808, + "learning_rate": 2.8923911450574043e-05, + "loss": 1.7149, + "step": 21157 + }, + { + "epoch": 6.494168201350522, + "grad_norm": 0.16927817463874817, + "learning_rate": 2.8919404160172203e-05, + "loss": 1.7093, + "step": 21158 + }, + { + "epoch": 6.494475138121547, + "grad_norm": 0.19907668232917786, + "learning_rate": 2.8914897078101166e-05, + "loss": 1.718, + "step": 21159 + }, + { + "epoch": 6.494782074892572, + "grad_norm": 0.18071576952934265, + "learning_rate": 2.891039020440548e-05, + "loss": 1.7241, + "step": 21160 + }, + { + "epoch": 6.495089011663597, 
+ "grad_norm": 0.17780692875385284, + "learning_rate": 2.890588353912965e-05, + "loss": 1.7013, + "step": 21161 + }, + { + "epoch": 6.495395948434623, + "grad_norm": 0.20762500166893005, + "learning_rate": 2.8901377082318292e-05, + "loss": 1.8149, + "step": 21162 + }, + { + "epoch": 6.495702885205648, + "grad_norm": 0.21616768836975098, + "learning_rate": 2.889687083401585e-05, + "loss": 1.7467, + "step": 21163 + }, + { + "epoch": 6.496009821976672, + "grad_norm": 0.20075570046901703, + "learning_rate": 2.8892364794266935e-05, + "loss": 1.6643, + "step": 21164 + }, + { + "epoch": 6.496316758747698, + "grad_norm": 0.18893925845623016, + "learning_rate": 2.8887858963116028e-05, + "loss": 1.7362, + "step": 21165 + }, + { + "epoch": 6.496623695518723, + "grad_norm": 0.20031611621379852, + "learning_rate": 2.888335334060765e-05, + "loss": 1.6902, + "step": 21166 + }, + { + "epoch": 6.496930632289748, + "grad_norm": 0.2959407866001129, + "learning_rate": 2.887884792678639e-05, + "loss": 1.7874, + "step": 21167 + }, + { + "epoch": 6.497237569060774, + "grad_norm": 0.17434875667095184, + "learning_rate": 2.8874342721696697e-05, + "loss": 1.7353, + "step": 21168 + }, + { + "epoch": 6.497544505831799, + "grad_norm": 0.19451481103897095, + "learning_rate": 2.8869837725383163e-05, + "loss": 1.6942, + "step": 21169 + }, + { + "epoch": 6.4978514426028235, + "grad_norm": 0.17984920740127563, + "learning_rate": 2.886533293789025e-05, + "loss": 1.7461, + "step": 21170 + }, + { + "epoch": 6.498158379373849, + "grad_norm": 0.18166208267211914, + "learning_rate": 2.8860828359262516e-05, + "loss": 1.7202, + "step": 21171 + }, + { + "epoch": 6.498465316144874, + "grad_norm": 0.1849331557750702, + "learning_rate": 2.8856323989544472e-05, + "loss": 1.6862, + "step": 21172 + }, + { + "epoch": 6.4987722529158995, + "grad_norm": 0.17846204340457916, + "learning_rate": 2.8851819828780623e-05, + "loss": 1.7446, + "step": 21173 + }, + { + "epoch": 6.499079189686925, + "grad_norm": 
0.1963818222284317, + "learning_rate": 2.8847315877015486e-05, + "loss": 1.7366, + "step": 21174 + }, + { + "epoch": 6.499386126457949, + "grad_norm": 0.1917402446269989, + "learning_rate": 2.8842812134293574e-05, + "loss": 1.7362, + "step": 21175 + }, + { + "epoch": 6.499693063228975, + "grad_norm": 0.16559138894081116, + "learning_rate": 2.883830860065939e-05, + "loss": 1.6735, + "step": 21176 + }, + { + "epoch": 6.5, + "grad_norm": 0.1820032149553299, + "learning_rate": 2.8833805276157442e-05, + "loss": 1.7107, + "step": 21177 + }, + { + "epoch": 6.500306936771025, + "grad_norm": 0.23760980367660522, + "learning_rate": 2.882930216083222e-05, + "loss": 1.7024, + "step": 21178 + }, + { + "epoch": 6.500613873542051, + "grad_norm": 0.22314296662807465, + "learning_rate": 2.8824799254728285e-05, + "loss": 1.714, + "step": 21179 + }, + { + "epoch": 6.500920810313076, + "grad_norm": 0.21919335424900055, + "learning_rate": 2.8820296557890046e-05, + "loss": 1.7625, + "step": 21180 + }, + { + "epoch": 6.5012277470841005, + "grad_norm": 0.21632128953933716, + "learning_rate": 2.88157940703621e-05, + "loss": 1.6589, + "step": 21181 + }, + { + "epoch": 6.501534683855126, + "grad_norm": 0.17998506128787994, + "learning_rate": 2.8811291792188867e-05, + "loss": 1.7528, + "step": 21182 + }, + { + "epoch": 6.501841620626151, + "grad_norm": 0.19783075153827667, + "learning_rate": 2.880678972341485e-05, + "loss": 1.6908, + "step": 21183 + }, + { + "epoch": 6.5021485573971765, + "grad_norm": 0.20510388910770416, + "learning_rate": 2.88022878640846e-05, + "loss": 1.7342, + "step": 21184 + }, + { + "epoch": 6.502455494168201, + "grad_norm": 0.24218666553497314, + "learning_rate": 2.879778621424253e-05, + "loss": 1.8, + "step": 21185 + }, + { + "epoch": 6.502762430939226, + "grad_norm": 0.1901179403066635, + "learning_rate": 2.8793284773933195e-05, + "loss": 1.699, + "step": 21186 + }, + { + "epoch": 6.503069367710252, + "grad_norm": 0.2652232348918915, + "learning_rate": 
2.8788783543201007e-05, + "loss": 1.8394, + "step": 21187 + }, + { + "epoch": 6.503376304481277, + "grad_norm": 0.17701558768749237, + "learning_rate": 2.878428252209052e-05, + "loss": 1.6674, + "step": 21188 + }, + { + "epoch": 6.503683241252302, + "grad_norm": 0.17464707791805267, + "learning_rate": 2.8779781710646185e-05, + "loss": 1.6894, + "step": 21189 + }, + { + "epoch": 6.503990178023328, + "grad_norm": 0.19469478726387024, + "learning_rate": 2.877528110891249e-05, + "loss": 1.7487, + "step": 21190 + }, + { + "epoch": 6.504297114794352, + "grad_norm": 0.21656417846679688, + "learning_rate": 2.87707807169339e-05, + "loss": 1.641, + "step": 21191 + }, + { + "epoch": 6.504604051565377, + "grad_norm": 0.20374895632266998, + "learning_rate": 2.8766280534754896e-05, + "loss": 1.6692, + "step": 21192 + }, + { + "epoch": 6.504910988336403, + "grad_norm": 0.26638445258140564, + "learning_rate": 2.876178056241996e-05, + "loss": 1.7415, + "step": 21193 + }, + { + "epoch": 6.505217925107428, + "grad_norm": 0.1852893978357315, + "learning_rate": 2.8757280799973557e-05, + "loss": 1.6981, + "step": 21194 + }, + { + "epoch": 6.505524861878453, + "grad_norm": 0.20518383383750916, + "learning_rate": 2.875278124746013e-05, + "loss": 1.781, + "step": 21195 + }, + { + "epoch": 6.505831798649478, + "grad_norm": 0.19968904554843903, + "learning_rate": 2.874828190492422e-05, + "loss": 1.6813, + "step": 21196 + }, + { + "epoch": 6.506138735420503, + "grad_norm": 0.19164247810840607, + "learning_rate": 2.87437827724102e-05, + "loss": 1.6833, + "step": 21197 + }, + { + "epoch": 6.5064456721915285, + "grad_norm": 0.19305361807346344, + "learning_rate": 2.873928384996262e-05, + "loss": 1.7164, + "step": 21198 + }, + { + "epoch": 6.506752608962554, + "grad_norm": 0.1853758841753006, + "learning_rate": 2.873478513762587e-05, + "loss": 1.7481, + "step": 21199 + }, + { + "epoch": 6.507059545733579, + "grad_norm": 0.20187529921531677, + "learning_rate": 2.8730286635444425e-05, + "loss": 
1.7666, + "step": 21200 + }, + { + "epoch": 6.5073664825046045, + "grad_norm": 0.19769401848316193, + "learning_rate": 2.872578834346279e-05, + "loss": 1.798, + "step": 21201 + }, + { + "epoch": 6.507673419275629, + "grad_norm": 0.1936112940311432, + "learning_rate": 2.8721290261725342e-05, + "loss": 1.6992, + "step": 21202 + }, + { + "epoch": 6.507980356046654, + "grad_norm": 0.17090481519699097, + "learning_rate": 2.871679239027662e-05, + "loss": 1.6802, + "step": 21203 + }, + { + "epoch": 6.50828729281768, + "grad_norm": 0.19443605840206146, + "learning_rate": 2.8712294729160987e-05, + "loss": 1.736, + "step": 21204 + }, + { + "epoch": 6.508594229588705, + "grad_norm": 0.19216817617416382, + "learning_rate": 2.8707797278422954e-05, + "loss": 1.7109, + "step": 21205 + }, + { + "epoch": 6.50890116635973, + "grad_norm": 0.19900040328502655, + "learning_rate": 2.8703300038106952e-05, + "loss": 1.7158, + "step": 21206 + }, + { + "epoch": 6.509208103130755, + "grad_norm": 0.17810803651809692, + "learning_rate": 2.8698803008257425e-05, + "loss": 1.6886, + "step": 21207 + }, + { + "epoch": 6.50951503990178, + "grad_norm": 0.1890508532524109, + "learning_rate": 2.8694306188918807e-05, + "loss": 1.7447, + "step": 21208 + }, + { + "epoch": 6.509821976672805, + "grad_norm": 0.17456012964248657, + "learning_rate": 2.868980958013554e-05, + "loss": 1.7094, + "step": 21209 + }, + { + "epoch": 6.510128913443831, + "grad_norm": 0.17089629173278809, + "learning_rate": 2.8685313181952066e-05, + "loss": 1.6827, + "step": 21210 + }, + { + "epoch": 6.510435850214856, + "grad_norm": 0.22681273519992828, + "learning_rate": 2.8680816994412823e-05, + "loss": 1.7374, + "step": 21211 + }, + { + "epoch": 6.510742786985881, + "grad_norm": 0.20642207562923431, + "learning_rate": 2.8676321017562225e-05, + "loss": 1.7609, + "step": 21212 + }, + { + "epoch": 6.511049723756906, + "grad_norm": 0.2360219657421112, + "learning_rate": 2.867182525144475e-05, + "loss": 1.7577, + "step": 21213 + }, + { + 
"epoch": 6.511356660527931, + "grad_norm": 0.19686923921108246, + "learning_rate": 2.8667329696104766e-05, + "loss": 1.7459, + "step": 21214 + }, + { + "epoch": 6.5116635972989565, + "grad_norm": 0.21280834078788757, + "learning_rate": 2.8662834351586777e-05, + "loss": 1.7837, + "step": 21215 + }, + { + "epoch": 6.511970534069982, + "grad_norm": 0.19297273457050323, + "learning_rate": 2.8658339217935136e-05, + "loss": 1.734, + "step": 21216 + }, + { + "epoch": 6.512277470841006, + "grad_norm": 0.1937931329011917, + "learning_rate": 2.8653844295194283e-05, + "loss": 1.6631, + "step": 21217 + }, + { + "epoch": 6.512584407612032, + "grad_norm": 0.2061077207326889, + "learning_rate": 2.8649349583408692e-05, + "loss": 1.7324, + "step": 21218 + }, + { + "epoch": 6.512891344383057, + "grad_norm": 0.19711358845233917, + "learning_rate": 2.8644855082622695e-05, + "loss": 1.7024, + "step": 21219 + }, + { + "epoch": 6.513198281154082, + "grad_norm": 0.17352496087551117, + "learning_rate": 2.8640360792880804e-05, + "loss": 1.7261, + "step": 21220 + }, + { + "epoch": 6.513505217925108, + "grad_norm": 0.181448295712471, + "learning_rate": 2.8635866714227344e-05, + "loss": 1.7147, + "step": 21221 + }, + { + "epoch": 6.513812154696133, + "grad_norm": 0.1827932894229889, + "learning_rate": 2.8631372846706787e-05, + "loss": 1.7338, + "step": 21222 + }, + { + "epoch": 6.514119091467157, + "grad_norm": 0.20659075677394867, + "learning_rate": 2.862687919036353e-05, + "loss": 1.6611, + "step": 21223 + }, + { + "epoch": 6.514426028238183, + "grad_norm": 0.19185996055603027, + "learning_rate": 2.8622385745241987e-05, + "loss": 1.7834, + "step": 21224 + }, + { + "epoch": 6.514732965009208, + "grad_norm": 0.19825506210327148, + "learning_rate": 2.8617892511386558e-05, + "loss": 1.7608, + "step": 21225 + }, + { + "epoch": 6.515039901780233, + "grad_norm": 0.16927020251750946, + "learning_rate": 2.861339948884164e-05, + "loss": 1.6651, + "step": 21226 + }, + { + "epoch": 6.515346838551259, + 
"grad_norm": 0.19211016595363617, + "learning_rate": 2.8608906677651646e-05, + "loss": 1.6673, + "step": 21227 + }, + { + "epoch": 6.515653775322283, + "grad_norm": 0.20192545652389526, + "learning_rate": 2.8604414077860974e-05, + "loss": 1.7301, + "step": 21228 + }, + { + "epoch": 6.5159607120933085, + "grad_norm": 0.2075425237417221, + "learning_rate": 2.8599921689514002e-05, + "loss": 1.783, + "step": 21229 + }, + { + "epoch": 6.516267648864334, + "grad_norm": 0.21261392533779144, + "learning_rate": 2.8595429512655192e-05, + "loss": 1.7277, + "step": 21230 + }, + { + "epoch": 6.516574585635359, + "grad_norm": 0.21201452612876892, + "learning_rate": 2.8590937547328844e-05, + "loss": 1.6582, + "step": 21231 + }, + { + "epoch": 6.5168815224063845, + "grad_norm": 0.2071799635887146, + "learning_rate": 2.858644579357944e-05, + "loss": 1.7559, + "step": 21232 + }, + { + "epoch": 6.51718845917741, + "grad_norm": 0.20225903391838074, + "learning_rate": 2.858195425145132e-05, + "loss": 1.7507, + "step": 21233 + }, + { + "epoch": 6.517495395948434, + "grad_norm": 0.2738147974014282, + "learning_rate": 2.8577462920988852e-05, + "loss": 1.7073, + "step": 21234 + }, + { + "epoch": 6.51780233271946, + "grad_norm": 0.17878220975399017, + "learning_rate": 2.8572971802236498e-05, + "loss": 1.6598, + "step": 21235 + }, + { + "epoch": 6.518109269490485, + "grad_norm": 0.21365594863891602, + "learning_rate": 2.8568480895238552e-05, + "loss": 1.7404, + "step": 21236 + }, + { + "epoch": 6.51841620626151, + "grad_norm": 0.18392804265022278, + "learning_rate": 2.856399020003948e-05, + "loss": 1.706, + "step": 21237 + }, + { + "epoch": 6.518723143032536, + "grad_norm": 0.16268405318260193, + "learning_rate": 2.855949971668358e-05, + "loss": 1.6725, + "step": 21238 + }, + { + "epoch": 6.51903007980356, + "grad_norm": 0.19590096175670624, + "learning_rate": 2.855500944521529e-05, + "loss": 1.7269, + "step": 21239 + }, + { + "epoch": 6.519337016574585, + "grad_norm": 0.19443263113498688, + 
"learning_rate": 2.8550519385678965e-05, + "loss": 1.686, + "step": 21240 + }, + { + "epoch": 6.519643953345611, + "grad_norm": 0.2112705111503601, + "learning_rate": 2.8546029538118985e-05, + "loss": 1.6904, + "step": 21241 + }, + { + "epoch": 6.519950890116636, + "grad_norm": 0.21015888452529907, + "learning_rate": 2.8541539902579712e-05, + "loss": 1.6972, + "step": 21242 + }, + { + "epoch": 6.520257826887661, + "grad_norm": 0.2853320837020874, + "learning_rate": 2.853705047910552e-05, + "loss": 1.7415, + "step": 21243 + }, + { + "epoch": 6.520564763658687, + "grad_norm": 0.20927128195762634, + "learning_rate": 2.853256126774077e-05, + "loss": 1.6955, + "step": 21244 + }, + { + "epoch": 6.520871700429711, + "grad_norm": 0.27824920415878296, + "learning_rate": 2.8528072268529836e-05, + "loss": 1.7666, + "step": 21245 + }, + { + "epoch": 6.5211786372007365, + "grad_norm": 0.21164646744728088, + "learning_rate": 2.8523583481517057e-05, + "loss": 1.75, + "step": 21246 + }, + { + "epoch": 6.521485573971762, + "grad_norm": 0.249397411942482, + "learning_rate": 2.851909490674686e-05, + "loss": 1.6767, + "step": 21247 + }, + { + "epoch": 6.521792510742787, + "grad_norm": 0.2311551868915558, + "learning_rate": 2.8514606544263507e-05, + "loss": 1.8071, + "step": 21248 + }, + { + "epoch": 6.5220994475138125, + "grad_norm": 0.21878042817115784, + "learning_rate": 2.8510118394111453e-05, + "loss": 1.6881, + "step": 21249 + }, + { + "epoch": 6.522406384284837, + "grad_norm": 0.2095690816640854, + "learning_rate": 2.8505630456334974e-05, + "loss": 1.6526, + "step": 21250 + }, + { + "epoch": 6.522713321055862, + "grad_norm": 0.2303982526063919, + "learning_rate": 2.850114273097844e-05, + "loss": 1.7256, + "step": 21251 + }, + { + "epoch": 6.523020257826888, + "grad_norm": 0.22640225291252136, + "learning_rate": 2.8496655218086255e-05, + "loss": 1.7797, + "step": 21252 + }, + { + "epoch": 6.523327194597913, + "grad_norm": 0.24268805980682373, + "learning_rate": 
2.8492167917702683e-05, + "loss": 1.7673, + "step": 21253 + }, + { + "epoch": 6.523634131368938, + "grad_norm": 0.1988469958305359, + "learning_rate": 2.8487680829872158e-05, + "loss": 1.7126, + "step": 21254 + }, + { + "epoch": 6.523941068139964, + "grad_norm": 0.18385496735572815, + "learning_rate": 2.8483193954638942e-05, + "loss": 1.7113, + "step": 21255 + }, + { + "epoch": 6.524248004910988, + "grad_norm": 0.21865327656269073, + "learning_rate": 2.847870729204743e-05, + "loss": 1.6686, + "step": 21256 + }, + { + "epoch": 6.524554941682013, + "grad_norm": 0.16982951760292053, + "learning_rate": 2.8474220842141946e-05, + "loss": 1.6865, + "step": 21257 + }, + { + "epoch": 6.524861878453039, + "grad_norm": 0.23028478026390076, + "learning_rate": 2.8469734604966834e-05, + "loss": 1.7647, + "step": 21258 + }, + { + "epoch": 6.525168815224064, + "grad_norm": 0.1805485039949417, + "learning_rate": 2.8465248580566415e-05, + "loss": 1.7524, + "step": 21259 + }, + { + "epoch": 6.525475751995089, + "grad_norm": 0.18652063608169556, + "learning_rate": 2.8460762768985037e-05, + "loss": 1.7028, + "step": 21260 + }, + { + "epoch": 6.525782688766114, + "grad_norm": 0.22772997617721558, + "learning_rate": 2.845627717026703e-05, + "loss": 1.7866, + "step": 21261 + }, + { + "epoch": 6.526089625537139, + "grad_norm": 0.19889821112155914, + "learning_rate": 2.8451791784456718e-05, + "loss": 1.7076, + "step": 21262 + }, + { + "epoch": 6.526396562308165, + "grad_norm": 0.24747174978256226, + "learning_rate": 2.8447306611598402e-05, + "loss": 1.7615, + "step": 21263 + }, + { + "epoch": 6.52670349907919, + "grad_norm": 0.1988009363412857, + "learning_rate": 2.8442821651736473e-05, + "loss": 1.7853, + "step": 21264 + }, + { + "epoch": 6.527010435850215, + "grad_norm": 0.250032901763916, + "learning_rate": 2.8438336904915185e-05, + "loss": 1.6906, + "step": 21265 + }, + { + "epoch": 6.52731737262124, + "grad_norm": 0.15398284792900085, + "learning_rate": 2.8433852371178925e-05, + 
"loss": 1.6437, + "step": 21266 + }, + { + "epoch": 6.527624309392265, + "grad_norm": 0.33137503266334534, + "learning_rate": 2.8429368050571958e-05, + "loss": 1.8213, + "step": 21267 + }, + { + "epoch": 6.52793124616329, + "grad_norm": 0.23827852308750153, + "learning_rate": 2.8424883943138593e-05, + "loss": 1.7148, + "step": 21268 + }, + { + "epoch": 6.528238182934316, + "grad_norm": 0.21171489357948303, + "learning_rate": 2.8420400048923217e-05, + "loss": 1.7729, + "step": 21269 + }, + { + "epoch": 6.528545119705341, + "grad_norm": 0.21698513627052307, + "learning_rate": 2.8415916367970053e-05, + "loss": 1.7267, + "step": 21270 + }, + { + "epoch": 6.5288520564763655, + "grad_norm": 0.2217913120985031, + "learning_rate": 2.8411432900323498e-05, + "loss": 1.7259, + "step": 21271 + }, + { + "epoch": 6.529158993247391, + "grad_norm": 0.25518202781677246, + "learning_rate": 2.8406949646027768e-05, + "loss": 1.7754, + "step": 21272 + }, + { + "epoch": 6.529465930018416, + "grad_norm": 0.22206325829029083, + "learning_rate": 2.8402466605127247e-05, + "loss": 1.755, + "step": 21273 + }, + { + "epoch": 6.5297728667894415, + "grad_norm": 0.26918017864227295, + "learning_rate": 2.8397983777666206e-05, + "loss": 1.783, + "step": 21274 + }, + { + "epoch": 6.530079803560467, + "grad_norm": 0.19280646741390228, + "learning_rate": 2.8393501163688952e-05, + "loss": 1.6942, + "step": 21275 + }, + { + "epoch": 6.530386740331492, + "grad_norm": 0.24567140638828278, + "learning_rate": 2.8389018763239784e-05, + "loss": 1.7316, + "step": 21276 + }, + { + "epoch": 6.530693677102517, + "grad_norm": 0.21791695058345795, + "learning_rate": 2.8384536576362997e-05, + "loss": 1.7627, + "step": 21277 + }, + { + "epoch": 6.531000613873542, + "grad_norm": 0.2441660761833191, + "learning_rate": 2.8380054603102885e-05, + "loss": 1.7112, + "step": 21278 + }, + { + "epoch": 6.531307550644567, + "grad_norm": 0.1768653243780136, + "learning_rate": 2.837557284350375e-05, + "loss": 1.6906, + "step": 
21279 + }, + { + "epoch": 6.531614487415593, + "grad_norm": 0.21037769317626953, + "learning_rate": 2.8371091297609877e-05, + "loss": 1.7197, + "step": 21280 + }, + { + "epoch": 6.531921424186618, + "grad_norm": 0.23989829421043396, + "learning_rate": 2.8366609965465563e-05, + "loss": 1.7693, + "step": 21281 + }, + { + "epoch": 6.532228360957642, + "grad_norm": 0.18302181363105774, + "learning_rate": 2.836212884711506e-05, + "loss": 1.6643, + "step": 21282 + }, + { + "epoch": 6.532535297728668, + "grad_norm": 0.2068471908569336, + "learning_rate": 2.835764794260273e-05, + "loss": 1.7431, + "step": 21283 + }, + { + "epoch": 6.532842234499693, + "grad_norm": 0.18803778290748596, + "learning_rate": 2.8353167251972777e-05, + "loss": 1.7506, + "step": 21284 + }, + { + "epoch": 6.533149171270718, + "grad_norm": 0.20789632201194763, + "learning_rate": 2.8348686775269507e-05, + "loss": 1.7174, + "step": 21285 + }, + { + "epoch": 6.533456108041744, + "grad_norm": 0.18927012383937836, + "learning_rate": 2.834420651253723e-05, + "loss": 1.6723, + "step": 21286 + }, + { + "epoch": 6.533763044812769, + "grad_norm": 0.22616887092590332, + "learning_rate": 2.8339726463820172e-05, + "loss": 1.7045, + "step": 21287 + }, + { + "epoch": 6.5340699815837935, + "grad_norm": 0.23880253732204437, + "learning_rate": 2.8335246629162658e-05, + "loss": 1.7255, + "step": 21288 + }, + { + "epoch": 6.534376918354819, + "grad_norm": 0.24279431998729706, + "learning_rate": 2.8330767008608904e-05, + "loss": 1.7548, + "step": 21289 + }, + { + "epoch": 6.534683855125844, + "grad_norm": 0.20542044937610626, + "learning_rate": 2.832628760220323e-05, + "loss": 1.6851, + "step": 21290 + }, + { + "epoch": 6.5349907918968695, + "grad_norm": 0.19426794350147247, + "learning_rate": 2.832180840998988e-05, + "loss": 1.7528, + "step": 21291 + }, + { + "epoch": 6.535297728667894, + "grad_norm": 0.2744491398334503, + "learning_rate": 2.8317329432013136e-05, + "loss": 1.7821, + "step": 21292 + }, + { + "epoch": 
6.535604665438919, + "grad_norm": 0.2692170739173889, + "learning_rate": 2.8312850668317243e-05, + "loss": 1.6626, + "step": 21293 + }, + { + "epoch": 6.535911602209945, + "grad_norm": 0.24998809397220612, + "learning_rate": 2.830837211894647e-05, + "loss": 1.7031, + "step": 21294 + }, + { + "epoch": 6.53621853898097, + "grad_norm": 0.22888946533203125, + "learning_rate": 2.830389378394508e-05, + "loss": 1.7706, + "step": 21295 + }, + { + "epoch": 6.536525475751995, + "grad_norm": 0.21685005724430084, + "learning_rate": 2.8299415663357332e-05, + "loss": 1.681, + "step": 21296 + }, + { + "epoch": 6.536832412523021, + "grad_norm": 0.23309725522994995, + "learning_rate": 2.8294937757227475e-05, + "loss": 1.7781, + "step": 21297 + }, + { + "epoch": 6.537139349294045, + "grad_norm": 0.26712173223495483, + "learning_rate": 2.829046006559976e-05, + "loss": 1.6966, + "step": 21298 + }, + { + "epoch": 6.53744628606507, + "grad_norm": 0.1836499124765396, + "learning_rate": 2.8285982588518428e-05, + "loss": 1.7192, + "step": 21299 + }, + { + "epoch": 6.537753222836096, + "grad_norm": 0.24073021113872528, + "learning_rate": 2.828150532602778e-05, + "loss": 1.6997, + "step": 21300 + }, + { + "epoch": 6.538060159607121, + "grad_norm": 0.16308051347732544, + "learning_rate": 2.8277028278172014e-05, + "loss": 1.6901, + "step": 21301 + }, + { + "epoch": 6.538367096378146, + "grad_norm": 0.2330634444952011, + "learning_rate": 2.8272551444995376e-05, + "loss": 1.7426, + "step": 21302 + }, + { + "epoch": 6.538674033149171, + "grad_norm": 0.18600425124168396, + "learning_rate": 2.8268074826542123e-05, + "loss": 1.6906, + "step": 21303 + }, + { + "epoch": 6.538980969920196, + "grad_norm": 0.24717238545417786, + "learning_rate": 2.8263598422856475e-05, + "loss": 1.6962, + "step": 21304 + }, + { + "epoch": 6.5392879066912215, + "grad_norm": 0.1907368302345276, + "learning_rate": 2.8259122233982727e-05, + "loss": 1.7083, + "step": 21305 + }, + { + "epoch": 6.539594843462247, + "grad_norm": 
0.22698798775672913, + "learning_rate": 2.8254646259965035e-05, + "loss": 1.7377, + "step": 21306 + }, + { + "epoch": 6.539901780233272, + "grad_norm": 0.19169457256793976, + "learning_rate": 2.8250170500847696e-05, + "loss": 1.7416, + "step": 21307 + }, + { + "epoch": 6.5402087170042975, + "grad_norm": 0.18730394542217255, + "learning_rate": 2.8245694956674918e-05, + "loss": 1.7273, + "step": 21308 + }, + { + "epoch": 6.540515653775322, + "grad_norm": 0.19813422858715057, + "learning_rate": 2.8241219627490927e-05, + "loss": 1.7638, + "step": 21309 + }, + { + "epoch": 6.540822590546347, + "grad_norm": 0.20460368692874908, + "learning_rate": 2.8236744513339965e-05, + "loss": 1.7266, + "step": 21310 + }, + { + "epoch": 6.541129527317373, + "grad_norm": 0.20448380708694458, + "learning_rate": 2.823226961426625e-05, + "loss": 1.7335, + "step": 21311 + }, + { + "epoch": 6.541436464088398, + "grad_norm": 0.21458712220191956, + "learning_rate": 2.8227794930314e-05, + "loss": 1.7274, + "step": 21312 + }, + { + "epoch": 6.541743400859423, + "grad_norm": 0.1964675635099411, + "learning_rate": 2.8223320461527442e-05, + "loss": 1.7514, + "step": 21313 + }, + { + "epoch": 6.542050337630448, + "grad_norm": 0.18982458114624023, + "learning_rate": 2.82188462079508e-05, + "loss": 1.6858, + "step": 21314 + }, + { + "epoch": 6.542357274401473, + "grad_norm": 0.21377761662006378, + "learning_rate": 2.8214372169628277e-05, + "loss": 1.727, + "step": 21315 + }, + { + "epoch": 6.542664211172498, + "grad_norm": 0.19484922289848328, + "learning_rate": 2.8209898346604087e-05, + "loss": 1.7646, + "step": 21316 + }, + { + "epoch": 6.542971147943524, + "grad_norm": 0.20614980161190033, + "learning_rate": 2.8205424738922488e-05, + "loss": 1.6705, + "step": 21317 + }, + { + "epoch": 6.543278084714549, + "grad_norm": 0.1888885796070099, + "learning_rate": 2.8200951346627636e-05, + "loss": 1.7854, + "step": 21318 + }, + { + "epoch": 6.543585021485574, + "grad_norm": 0.20957863330841064, + 
"learning_rate": 2.8196478169763763e-05, + "loss": 1.6971, + "step": 21319 + }, + { + "epoch": 6.543891958256599, + "grad_norm": 0.20744509994983673, + "learning_rate": 2.8192005208375073e-05, + "loss": 1.7408, + "step": 21320 + }, + { + "epoch": 6.544198895027624, + "grad_norm": 0.20038767158985138, + "learning_rate": 2.818753246250574e-05, + "loss": 1.7355, + "step": 21321 + }, + { + "epoch": 6.5445058317986495, + "grad_norm": 0.18535862863063812, + "learning_rate": 2.818305993220004e-05, + "loss": 1.7229, + "step": 21322 + }, + { + "epoch": 6.544812768569675, + "grad_norm": 0.2191225290298462, + "learning_rate": 2.8178587617502095e-05, + "loss": 1.7364, + "step": 21323 + }, + { + "epoch": 6.5451197053407, + "grad_norm": 0.2055424451828003, + "learning_rate": 2.8174115518456175e-05, + "loss": 1.7488, + "step": 21324 + }, + { + "epoch": 6.545426642111725, + "grad_norm": 0.22267968952655792, + "learning_rate": 2.8169643635106398e-05, + "loss": 1.6936, + "step": 21325 + }, + { + "epoch": 6.54573357888275, + "grad_norm": 0.20295512676239014, + "learning_rate": 2.8165171967497018e-05, + "loss": 1.7651, + "step": 21326 + }, + { + "epoch": 6.546040515653775, + "grad_norm": 0.25859618186950684, + "learning_rate": 2.81607005156722e-05, + "loss": 1.7264, + "step": 21327 + }, + { + "epoch": 6.546347452424801, + "grad_norm": 0.22232379019260406, + "learning_rate": 2.8156229279676143e-05, + "loss": 1.7282, + "step": 21328 + }, + { + "epoch": 6.546654389195826, + "grad_norm": 0.2548457682132721, + "learning_rate": 2.8151758259553035e-05, + "loss": 1.7137, + "step": 21329 + }, + { + "epoch": 6.546961325966851, + "grad_norm": 0.22040672600269318, + "learning_rate": 2.8147287455347055e-05, + "loss": 1.7553, + "step": 21330 + }, + { + "epoch": 6.547268262737876, + "grad_norm": 0.19622360169887543, + "learning_rate": 2.8142816867102388e-05, + "loss": 1.6502, + "step": 21331 + }, + { + "epoch": 6.547575199508901, + "grad_norm": 0.20849336683750153, + "learning_rate": 
2.813834649486322e-05, + "loss": 1.6824, + "step": 21332 + }, + { + "epoch": 6.547882136279926, + "grad_norm": 0.18474788963794708, + "learning_rate": 2.8133876338673703e-05, + "loss": 1.7136, + "step": 21333 + }, + { + "epoch": 6.548189073050952, + "grad_norm": 0.2421834021806717, + "learning_rate": 2.8129406398578074e-05, + "loss": 1.7841, + "step": 21334 + }, + { + "epoch": 6.548496009821976, + "grad_norm": 0.18089748919010162, + "learning_rate": 2.812493667462045e-05, + "loss": 1.6918, + "step": 21335 + }, + { + "epoch": 6.5488029465930016, + "grad_norm": 0.18575069308280945, + "learning_rate": 2.8120467166845022e-05, + "loss": 1.7098, + "step": 21336 + }, + { + "epoch": 6.549109883364027, + "grad_norm": 0.20840388536453247, + "learning_rate": 2.811599787529596e-05, + "loss": 1.7405, + "step": 21337 + }, + { + "epoch": 6.549416820135052, + "grad_norm": 0.19018858671188354, + "learning_rate": 2.811152880001742e-05, + "loss": 1.7098, + "step": 21338 + }, + { + "epoch": 6.5497237569060776, + "grad_norm": 0.22326117753982544, + "learning_rate": 2.8107059941053627e-05, + "loss": 1.7452, + "step": 21339 + }, + { + "epoch": 6.550030693677103, + "grad_norm": 0.26071304082870483, + "learning_rate": 2.8102591298448643e-05, + "loss": 1.7685, + "step": 21340 + }, + { + "epoch": 6.550337630448127, + "grad_norm": 0.2253575623035431, + "learning_rate": 2.8098122872246734e-05, + "loss": 1.8025, + "step": 21341 + }, + { + "epoch": 6.550644567219153, + "grad_norm": 0.2503850758075714, + "learning_rate": 2.8093654662491975e-05, + "loss": 1.7453, + "step": 21342 + }, + { + "epoch": 6.550951503990178, + "grad_norm": 0.18953700363636017, + "learning_rate": 2.808918666922858e-05, + "loss": 1.7549, + "step": 21343 + }, + { + "epoch": 6.551258440761203, + "grad_norm": 0.21360619366168976, + "learning_rate": 2.8084718892500685e-05, + "loss": 1.7363, + "step": 21344 + }, + { + "epoch": 6.551565377532229, + "grad_norm": 0.24622702598571777, + "learning_rate": 2.8080251332352437e-05, + 
"loss": 1.7325, + "step": 21345 + }, + { + "epoch": 6.551872314303253, + "grad_norm": 0.20079167187213898, + "learning_rate": 2.8075783988827997e-05, + "loss": 1.7478, + "step": 21346 + }, + { + "epoch": 6.5521792510742785, + "grad_norm": 0.2337643951177597, + "learning_rate": 2.807131686197151e-05, + "loss": 1.6683, + "step": 21347 + }, + { + "epoch": 6.552486187845304, + "grad_norm": 0.20815308392047882, + "learning_rate": 2.8066849951827123e-05, + "loss": 1.7436, + "step": 21348 + }, + { + "epoch": 6.552793124616329, + "grad_norm": 0.2450367957353592, + "learning_rate": 2.8062383258438972e-05, + "loss": 1.7464, + "step": 21349 + }, + { + "epoch": 6.5531000613873545, + "grad_norm": 0.232087641954422, + "learning_rate": 2.8057916781851222e-05, + "loss": 1.7378, + "step": 21350 + }, + { + "epoch": 6.55340699815838, + "grad_norm": 0.2254600077867508, + "learning_rate": 2.8053450522107993e-05, + "loss": 1.7299, + "step": 21351 + }, + { + "epoch": 6.553713934929404, + "grad_norm": 0.23282572627067566, + "learning_rate": 2.8048984479253425e-05, + "loss": 1.7512, + "step": 21352 + }, + { + "epoch": 6.55402087170043, + "grad_norm": 0.21826763451099396, + "learning_rate": 2.8044518653331665e-05, + "loss": 1.706, + "step": 21353 + }, + { + "epoch": 6.554327808471455, + "grad_norm": 0.20807425677776337, + "learning_rate": 2.804005304438683e-05, + "loss": 1.7013, + "step": 21354 + }, + { + "epoch": 6.55463474524248, + "grad_norm": 0.21791879832744598, + "learning_rate": 2.8035587652463046e-05, + "loss": 1.7312, + "step": 21355 + }, + { + "epoch": 6.554941682013506, + "grad_norm": 0.23205329477787018, + "learning_rate": 2.8031122477604505e-05, + "loss": 1.7166, + "step": 21356 + }, + { + "epoch": 6.55524861878453, + "grad_norm": 0.1910320371389389, + "learning_rate": 2.802665751985525e-05, + "loss": 1.694, + "step": 21357 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.24150735139846802, + "learning_rate": 2.8022192779259472e-05, + "loss": 1.7934, + "step": 21358 + }, 
+ { + "epoch": 6.555862492326581, + "grad_norm": 0.18308573961257935, + "learning_rate": 2.801772825586123e-05, + "loss": 1.6851, + "step": 21359 + }, + { + "epoch": 6.556169429097606, + "grad_norm": 0.28410083055496216, + "learning_rate": 2.8013263949704705e-05, + "loss": 1.7687, + "step": 21360 + }, + { + "epoch": 6.556476365868631, + "grad_norm": 0.21073146164417267, + "learning_rate": 2.8008799860833996e-05, + "loss": 1.711, + "step": 21361 + }, + { + "epoch": 6.556783302639657, + "grad_norm": 0.22758159041404724, + "learning_rate": 2.8004335989293213e-05, + "loss": 1.7495, + "step": 21362 + }, + { + "epoch": 6.557090239410681, + "grad_norm": 0.2112412452697754, + "learning_rate": 2.799987233512647e-05, + "loss": 1.7125, + "step": 21363 + }, + { + "epoch": 6.5573971761817065, + "grad_norm": 0.1804153323173523, + "learning_rate": 2.7995408898377884e-05, + "loss": 1.689, + "step": 21364 + }, + { + "epoch": 6.557704112952732, + "grad_norm": 0.17632657289505005, + "learning_rate": 2.7990945679091572e-05, + "loss": 1.6868, + "step": 21365 + }, + { + "epoch": 6.558011049723757, + "grad_norm": 0.17942996323108673, + "learning_rate": 2.7986482677311632e-05, + "loss": 1.7082, + "step": 21366 + }, + { + "epoch": 6.558317986494782, + "grad_norm": 0.278486967086792, + "learning_rate": 2.7982019893082167e-05, + "loss": 1.7909, + "step": 21367 + }, + { + "epoch": 6.558624923265807, + "grad_norm": 0.208990678191185, + "learning_rate": 2.797755732644729e-05, + "loss": 1.7643, + "step": 21368 + }, + { + "epoch": 6.558931860036832, + "grad_norm": 0.20375309884548187, + "learning_rate": 2.7973094977451096e-05, + "loss": 1.6957, + "step": 21369 + }, + { + "epoch": 6.559238796807858, + "grad_norm": 0.24685338139533997, + "learning_rate": 2.7968632846137694e-05, + "loss": 1.7574, + "step": 21370 + }, + { + "epoch": 6.559545733578883, + "grad_norm": 0.2237502634525299, + "learning_rate": 2.796417093255117e-05, + "loss": 1.7422, + "step": 21371 + }, + { + "epoch": 6.559852670349908, + 
"grad_norm": 0.22731846570968628, + "learning_rate": 2.795970923673561e-05, + "loss": 1.7594, + "step": 21372 + }, + { + "epoch": 6.560159607120933, + "grad_norm": 0.2518742084503174, + "learning_rate": 2.7955247758735158e-05, + "loss": 1.6817, + "step": 21373 + }, + { + "epoch": 6.560466543891958, + "grad_norm": 0.21982096135616302, + "learning_rate": 2.7950786498593827e-05, + "loss": 1.7289, + "step": 21374 + }, + { + "epoch": 6.560773480662983, + "grad_norm": 0.19061018526554108, + "learning_rate": 2.7946325456355787e-05, + "loss": 1.6809, + "step": 21375 + }, + { + "epoch": 6.561080417434009, + "grad_norm": 0.2023245394229889, + "learning_rate": 2.794186463206505e-05, + "loss": 1.7053, + "step": 21376 + }, + { + "epoch": 6.561387354205034, + "grad_norm": 0.18003186583518982, + "learning_rate": 2.7937404025765752e-05, + "loss": 1.6447, + "step": 21377 + }, + { + "epoch": 6.5616942909760585, + "grad_norm": 0.19133709371089935, + "learning_rate": 2.7932943637501956e-05, + "loss": 1.7677, + "step": 21378 + }, + { + "epoch": 6.562001227747084, + "grad_norm": 0.18476714193820953, + "learning_rate": 2.7928483467317746e-05, + "loss": 1.685, + "step": 21379 + }, + { + "epoch": 6.562308164518109, + "grad_norm": 0.2065780758857727, + "learning_rate": 2.79240235152572e-05, + "loss": 1.6827, + "step": 21380 + }, + { + "epoch": 6.5626151012891345, + "grad_norm": 0.1885409951210022, + "learning_rate": 2.79195637813644e-05, + "loss": 1.6819, + "step": 21381 + }, + { + "epoch": 6.56292203806016, + "grad_norm": 0.18055391311645508, + "learning_rate": 2.79151042656834e-05, + "loss": 1.7007, + "step": 21382 + }, + { + "epoch": 6.563228974831185, + "grad_norm": 0.25148439407348633, + "learning_rate": 2.7910644968258294e-05, + "loss": 1.7723, + "step": 21383 + }, + { + "epoch": 6.56353591160221, + "grad_norm": 0.2308066487312317, + "learning_rate": 2.7906185889133134e-05, + "loss": 1.7525, + "step": 21384 + }, + { + "epoch": 6.563842848373235, + "grad_norm": 0.19580784440040588, + 
"learning_rate": 2.7901727028351997e-05, + "loss": 1.7197, + "step": 21385 + }, + { + "epoch": 6.56414978514426, + "grad_norm": 0.19686979055404663, + "learning_rate": 2.7897268385958952e-05, + "loss": 1.6873, + "step": 21386 + }, + { + "epoch": 6.564456721915286, + "grad_norm": 0.2657351493835449, + "learning_rate": 2.7892809961998045e-05, + "loss": 1.7005, + "step": 21387 + }, + { + "epoch": 6.564763658686311, + "grad_norm": 0.20131130516529083, + "learning_rate": 2.7888351756513353e-05, + "loss": 1.7211, + "step": 21388 + }, + { + "epoch": 6.565070595457335, + "grad_norm": 0.2524282932281494, + "learning_rate": 2.7883893769548908e-05, + "loss": 1.7038, + "step": 21389 + }, + { + "epoch": 6.565377532228361, + "grad_norm": 0.1601654291152954, + "learning_rate": 2.787943600114883e-05, + "loss": 1.691, + "step": 21390 + }, + { + "epoch": 6.565684468999386, + "grad_norm": 0.25074124336242676, + "learning_rate": 2.787497845135709e-05, + "loss": 1.688, + "step": 21391 + }, + { + "epoch": 6.565991405770411, + "grad_norm": 0.19491349160671234, + "learning_rate": 2.787052112021782e-05, + "loss": 1.7108, + "step": 21392 + }, + { + "epoch": 6.566298342541437, + "grad_norm": 0.23931637406349182, + "learning_rate": 2.786606400777499e-05, + "loss": 1.7315, + "step": 21393 + }, + { + "epoch": 6.566605279312462, + "grad_norm": 0.1643616110086441, + "learning_rate": 2.786160711407271e-05, + "loss": 1.6745, + "step": 21394 + }, + { + "epoch": 6.5669122160834865, + "grad_norm": 0.17805394530296326, + "learning_rate": 2.7857150439155e-05, + "loss": 1.6817, + "step": 21395 + }, + { + "epoch": 6.567219152854512, + "grad_norm": 0.20370139181613922, + "learning_rate": 2.7852693983065913e-05, + "loss": 1.7173, + "step": 21396 + }, + { + "epoch": 6.567526089625537, + "grad_norm": 0.1620296984910965, + "learning_rate": 2.784823774584948e-05, + "loss": 1.7135, + "step": 21397 + }, + { + "epoch": 6.5678330263965625, + "grad_norm": 0.19116036593914032, + "learning_rate": 
2.7843781727549752e-05, + "loss": 1.6815, + "step": 21398 + }, + { + "epoch": 6.568139963167588, + "grad_norm": 0.20118895173072815, + "learning_rate": 2.7839325928210757e-05, + "loss": 1.7336, + "step": 21399 + }, + { + "epoch": 6.568446899938612, + "grad_norm": 0.198282390832901, + "learning_rate": 2.7834870347876528e-05, + "loss": 1.7379, + "step": 21400 + }, + { + "epoch": 6.568753836709638, + "grad_norm": 0.19203920662403107, + "learning_rate": 2.7830414986591104e-05, + "loss": 1.6913, + "step": 21401 + }, + { + "epoch": 6.569060773480663, + "grad_norm": 0.24601610004901886, + "learning_rate": 2.7825959844398507e-05, + "loss": 1.7842, + "step": 21402 + }, + { + "epoch": 6.569367710251688, + "grad_norm": 0.19069935381412506, + "learning_rate": 2.7821504921342777e-05, + "loss": 1.706, + "step": 21403 + }, + { + "epoch": 6.569674647022714, + "grad_norm": 0.20221085846424103, + "learning_rate": 2.7817050217467945e-05, + "loss": 1.7223, + "step": 21404 + }, + { + "epoch": 6.569981583793739, + "grad_norm": 0.2129664123058319, + "learning_rate": 2.781259573281801e-05, + "loss": 1.7429, + "step": 21405 + }, + { + "epoch": 6.570288520564763, + "grad_norm": 0.20684000849723816, + "learning_rate": 2.7808141467436993e-05, + "loss": 1.7349, + "step": 21406 + }, + { + "epoch": 6.570595457335789, + "grad_norm": 0.2153804898262024, + "learning_rate": 2.7803687421368968e-05, + "loss": 1.7245, + "step": 21407 + }, + { + "epoch": 6.570902394106814, + "grad_norm": 0.245448499917984, + "learning_rate": 2.7799233594657875e-05, + "loss": 1.7102, + "step": 21408 + }, + { + "epoch": 6.571209330877839, + "grad_norm": 0.18146783113479614, + "learning_rate": 2.7794779987347807e-05, + "loss": 1.6777, + "step": 21409 + }, + { + "epoch": 6.571516267648864, + "grad_norm": 0.21388854086399078, + "learning_rate": 2.7790326599482698e-05, + "loss": 1.7263, + "step": 21410 + }, + { + "epoch": 6.571823204419889, + "grad_norm": 0.2242165058851242, + "learning_rate": 2.7785873431106625e-05, + 
"loss": 1.7624, + "step": 21411 + }, + { + "epoch": 6.5721301411909145, + "grad_norm": 0.23132537305355072, + "learning_rate": 2.7781420482263565e-05, + "loss": 1.7013, + "step": 21412 + }, + { + "epoch": 6.57243707796194, + "grad_norm": 0.21074987947940826, + "learning_rate": 2.777696775299753e-05, + "loss": 1.7111, + "step": 21413 + }, + { + "epoch": 6.572744014732965, + "grad_norm": 0.2933674156665802, + "learning_rate": 2.7772515243352525e-05, + "loss": 1.7515, + "step": 21414 + }, + { + "epoch": 6.5730509515039905, + "grad_norm": 0.2100256085395813, + "learning_rate": 2.7768062953372552e-05, + "loss": 1.7425, + "step": 21415 + }, + { + "epoch": 6.573357888275015, + "grad_norm": 0.21765680611133575, + "learning_rate": 2.776361088310161e-05, + "loss": 1.7064, + "step": 21416 + }, + { + "epoch": 6.57366482504604, + "grad_norm": 0.205422043800354, + "learning_rate": 2.7759159032583702e-05, + "loss": 1.7458, + "step": 21417 + }, + { + "epoch": 6.573971761817066, + "grad_norm": 0.2009960114955902, + "learning_rate": 2.775470740186282e-05, + "loss": 1.7111, + "step": 21418 + }, + { + "epoch": 6.574278698588091, + "grad_norm": 0.18974804878234863, + "learning_rate": 2.7750255990982955e-05, + "loss": 1.7385, + "step": 21419 + }, + { + "epoch": 6.574585635359116, + "grad_norm": 0.1784054934978485, + "learning_rate": 2.7745804799988106e-05, + "loss": 1.7129, + "step": 21420 + }, + { + "epoch": 6.574892572130141, + "grad_norm": 0.2047782689332962, + "learning_rate": 2.7741353828922258e-05, + "loss": 1.6972, + "step": 21421 + }, + { + "epoch": 6.575199508901166, + "grad_norm": 0.18886682391166687, + "learning_rate": 2.773690307782939e-05, + "loss": 1.6564, + "step": 21422 + }, + { + "epoch": 6.5755064456721914, + "grad_norm": 0.2088952213525772, + "learning_rate": 2.7732452546753484e-05, + "loss": 1.7309, + "step": 21423 + }, + { + "epoch": 6.575813382443217, + "grad_norm": 0.20526883006095886, + "learning_rate": 2.7728002235738565e-05, + "loss": 1.6811, + "step": 21424 + 
}, + { + "epoch": 6.576120319214242, + "grad_norm": 0.19648446142673492, + "learning_rate": 2.7723552144828545e-05, + "loss": 1.7237, + "step": 21425 + }, + { + "epoch": 6.5764272559852675, + "grad_norm": 0.22405673563480377, + "learning_rate": 2.7719102274067484e-05, + "loss": 1.7454, + "step": 21426 + }, + { + "epoch": 6.576734192756292, + "grad_norm": 0.24119171500205994, + "learning_rate": 2.7714652623499265e-05, + "loss": 1.7106, + "step": 21427 + }, + { + "epoch": 6.577041129527317, + "grad_norm": 0.2127196192741394, + "learning_rate": 2.771020319316794e-05, + "loss": 1.7895, + "step": 21428 + }, + { + "epoch": 6.577348066298343, + "grad_norm": 0.23805706202983856, + "learning_rate": 2.7705753983117443e-05, + "loss": 1.739, + "step": 21429 + }, + { + "epoch": 6.577655003069368, + "grad_norm": 0.24212954938411713, + "learning_rate": 2.7701304993391753e-05, + "loss": 1.683, + "step": 21430 + }, + { + "epoch": 6.577961939840393, + "grad_norm": 0.1946132481098175, + "learning_rate": 2.769685622403484e-05, + "loss": 1.6953, + "step": 21431 + }, + { + "epoch": 6.578268876611418, + "grad_norm": 0.2465951144695282, + "learning_rate": 2.769240767509067e-05, + "loss": 1.6594, + "step": 21432 + }, + { + "epoch": 6.578575813382443, + "grad_norm": 0.17029622197151184, + "learning_rate": 2.76879593466032e-05, + "loss": 1.6977, + "step": 21433 + }, + { + "epoch": 6.578882750153468, + "grad_norm": 0.23793117702007294, + "learning_rate": 2.7683511238616388e-05, + "loss": 1.6709, + "step": 21434 + }, + { + "epoch": 6.579189686924494, + "grad_norm": 0.20149341225624084, + "learning_rate": 2.76790633511742e-05, + "loss": 1.8074, + "step": 21435 + }, + { + "epoch": 6.579496623695519, + "grad_norm": 0.25029948353767395, + "learning_rate": 2.7674615684320593e-05, + "loss": 1.6649, + "step": 21436 + }, + { + "epoch": 6.579803560466544, + "grad_norm": 0.22212490439414978, + "learning_rate": 2.7670168238099515e-05, + "loss": 1.7322, + "step": 21437 + }, + { + "epoch": 
6.580110497237569, + "grad_norm": 0.26087918877601624, + "learning_rate": 2.7665721012554925e-05, + "loss": 1.7285, + "step": 21438 + }, + { + "epoch": 6.580417434008594, + "grad_norm": 0.19286726415157318, + "learning_rate": 2.7661274007730776e-05, + "loss": 1.6912, + "step": 21439 + }, + { + "epoch": 6.5807243707796195, + "grad_norm": 0.23935118317604065, + "learning_rate": 2.7656827223670982e-05, + "loss": 1.6929, + "step": 21440 + }, + { + "epoch": 6.581031307550645, + "grad_norm": 0.2263423204421997, + "learning_rate": 2.7652380660419563e-05, + "loss": 1.6786, + "step": 21441 + }, + { + "epoch": 6.581338244321669, + "grad_norm": 0.19788038730621338, + "learning_rate": 2.7647934318020373e-05, + "loss": 1.7906, + "step": 21442 + }, + { + "epoch": 6.581645181092695, + "grad_norm": 0.25891759991645813, + "learning_rate": 2.7643488196517435e-05, + "loss": 1.7691, + "step": 21443 + }, + { + "epoch": 6.58195211786372, + "grad_norm": 0.25175485014915466, + "learning_rate": 2.7639042295954615e-05, + "loss": 1.7329, + "step": 21444 + }, + { + "epoch": 6.582259054634745, + "grad_norm": 0.1860336810350418, + "learning_rate": 2.7634596616375908e-05, + "loss": 1.7348, + "step": 21445 + }, + { + "epoch": 6.582565991405771, + "grad_norm": 0.2704271972179413, + "learning_rate": 2.7630151157825218e-05, + "loss": 1.7199, + "step": 21446 + }, + { + "epoch": 6.582872928176796, + "grad_norm": 0.16306720674037933, + "learning_rate": 2.762570592034649e-05, + "loss": 1.7174, + "step": 21447 + }, + { + "epoch": 6.58317986494782, + "grad_norm": 0.2585636079311371, + "learning_rate": 2.7621260903983648e-05, + "loss": 1.7392, + "step": 21448 + }, + { + "epoch": 6.583486801718846, + "grad_norm": 0.2086072564125061, + "learning_rate": 2.7616816108780623e-05, + "loss": 1.7417, + "step": 21449 + }, + { + "epoch": 6.583793738489871, + "grad_norm": 0.1747613251209259, + "learning_rate": 2.7612371534781343e-05, + "loss": 1.6607, + "step": 21450 + }, + { + "epoch": 6.584100675260896, + 
"grad_norm": 0.21026404201984406, + "learning_rate": 2.7607927182029726e-05, + "loss": 1.7725, + "step": 21451 + }, + { + "epoch": 6.584407612031922, + "grad_norm": 0.17881789803504944, + "learning_rate": 2.76034830505697e-05, + "loss": 1.7502, + "step": 21452 + }, + { + "epoch": 6.584714548802946, + "grad_norm": 0.2503713369369507, + "learning_rate": 2.7599039140445182e-05, + "loss": 1.798, + "step": 21453 + }, + { + "epoch": 6.5850214855739715, + "grad_norm": 0.22163939476013184, + "learning_rate": 2.7594595451700083e-05, + "loss": 1.725, + "step": 21454 + }, + { + "epoch": 6.585328422344997, + "grad_norm": 0.2154664546251297, + "learning_rate": 2.759015198437833e-05, + "loss": 1.7917, + "step": 21455 + }, + { + "epoch": 6.585635359116022, + "grad_norm": 0.1814090609550476, + "learning_rate": 2.7585708738523823e-05, + "loss": 1.6562, + "step": 21456 + }, + { + "epoch": 6.5859422958870475, + "grad_norm": 0.18815121054649353, + "learning_rate": 2.758126571418049e-05, + "loss": 1.6833, + "step": 21457 + }, + { + "epoch": 6.586249232658073, + "grad_norm": 0.19383473694324493, + "learning_rate": 2.757682291139222e-05, + "loss": 1.6987, + "step": 21458 + }, + { + "epoch": 6.586556169429097, + "grad_norm": 0.19574831426143646, + "learning_rate": 2.7572380330202912e-05, + "loss": 1.7231, + "step": 21459 + }, + { + "epoch": 6.586863106200123, + "grad_norm": 0.17509032785892487, + "learning_rate": 2.7567937970656527e-05, + "loss": 1.6452, + "step": 21460 + }, + { + "epoch": 6.587170042971148, + "grad_norm": 0.19439785182476044, + "learning_rate": 2.7563495832796886e-05, + "loss": 1.7168, + "step": 21461 + }, + { + "epoch": 6.587476979742173, + "grad_norm": 0.17384520173072815, + "learning_rate": 2.7559053916667953e-05, + "loss": 1.7128, + "step": 21462 + }, + { + "epoch": 6.587783916513199, + "grad_norm": 0.18308506906032562, + "learning_rate": 2.7554612222313597e-05, + "loss": 1.7184, + "step": 21463 + }, + { + "epoch": 6.588090853284223, + "grad_norm": 
0.20052805542945862, + "learning_rate": 2.7550170749777726e-05, + "loss": 1.7239, + "step": 21464 + }, + { + "epoch": 6.588397790055248, + "grad_norm": 0.21892015635967255, + "learning_rate": 2.7545729499104215e-05, + "loss": 1.7297, + "step": 21465 + }, + { + "epoch": 6.588704726826274, + "grad_norm": 0.19819483160972595, + "learning_rate": 2.7541288470336973e-05, + "loss": 1.7303, + "step": 21466 + }, + { + "epoch": 6.589011663597299, + "grad_norm": 0.24296818673610687, + "learning_rate": 2.7536847663519884e-05, + "loss": 1.8525, + "step": 21467 + }, + { + "epoch": 6.589318600368324, + "grad_norm": 0.1971593201160431, + "learning_rate": 2.753240707869683e-05, + "loss": 1.7396, + "step": 21468 + }, + { + "epoch": 6.58962553713935, + "grad_norm": 0.24418935179710388, + "learning_rate": 2.7527966715911696e-05, + "loss": 1.7414, + "step": 21469 + }, + { + "epoch": 6.589932473910374, + "grad_norm": 0.2193990796804428, + "learning_rate": 2.7523526575208368e-05, + "loss": 1.7243, + "step": 21470 + }, + { + "epoch": 6.5902394106813995, + "grad_norm": 0.23612114787101746, + "learning_rate": 2.7519086656630722e-05, + "loss": 1.7072, + "step": 21471 + }, + { + "epoch": 6.590546347452425, + "grad_norm": 0.22282655537128448, + "learning_rate": 2.751464696022264e-05, + "loss": 1.7423, + "step": 21472 + }, + { + "epoch": 6.59085328422345, + "grad_norm": 0.21411976218223572, + "learning_rate": 2.7510207486027995e-05, + "loss": 1.7397, + "step": 21473 + }, + { + "epoch": 6.5911602209944755, + "grad_norm": 0.2244768589735031, + "learning_rate": 2.7505768234090663e-05, + "loss": 1.6964, + "step": 21474 + }, + { + "epoch": 6.5914671577655, + "grad_norm": 0.2250032275915146, + "learning_rate": 2.7501329204454512e-05, + "loss": 1.7307, + "step": 21475 + }, + { + "epoch": 6.591774094536525, + "grad_norm": 0.2643435299396515, + "learning_rate": 2.7496890397163395e-05, + "loss": 1.7298, + "step": 21476 + }, + { + "epoch": 6.592081031307551, + "grad_norm": 0.2204463928937912, + 
"learning_rate": 2.7492451812261232e-05, + "loss": 1.723, + "step": 21477 + }, + { + "epoch": 6.592387968078576, + "grad_norm": 0.2278377115726471, + "learning_rate": 2.7488013449791816e-05, + "loss": 1.7597, + "step": 21478 + }, + { + "epoch": 6.592694904849601, + "grad_norm": 0.18430690467357635, + "learning_rate": 2.7483575309799086e-05, + "loss": 1.6314, + "step": 21479 + }, + { + "epoch": 6.593001841620627, + "grad_norm": 0.26019781827926636, + "learning_rate": 2.7479137392326827e-05, + "loss": 1.7362, + "step": 21480 + }, + { + "epoch": 6.593308778391651, + "grad_norm": 0.2103995382785797, + "learning_rate": 2.7474699697418936e-05, + "loss": 1.7137, + "step": 21481 + }, + { + "epoch": 6.593615715162676, + "grad_norm": 0.220427006483078, + "learning_rate": 2.747026222511928e-05, + "loss": 1.7323, + "step": 21482 + }, + { + "epoch": 6.593922651933702, + "grad_norm": 0.21523109078407288, + "learning_rate": 2.7465824975471693e-05, + "loss": 1.7572, + "step": 21483 + }, + { + "epoch": 6.594229588704727, + "grad_norm": 0.21639512479305267, + "learning_rate": 2.7461387948520033e-05, + "loss": 1.7275, + "step": 21484 + }, + { + "epoch": 6.5945365254757515, + "grad_norm": 0.2043544203042984, + "learning_rate": 2.7456951144308147e-05, + "loss": 1.7454, + "step": 21485 + }, + { + "epoch": 6.594843462246777, + "grad_norm": 0.17847217619419098, + "learning_rate": 2.7452514562879882e-05, + "loss": 1.7356, + "step": 21486 + }, + { + "epoch": 6.595150399017802, + "grad_norm": 0.20756758749485016, + "learning_rate": 2.744807820427908e-05, + "loss": 1.7557, + "step": 21487 + }, + { + "epoch": 6.5954573357888275, + "grad_norm": 0.23579071462154388, + "learning_rate": 2.744364206854959e-05, + "loss": 1.7855, + "step": 21488 + }, + { + "epoch": 6.595764272559853, + "grad_norm": 0.1947307586669922, + "learning_rate": 2.7439206155735254e-05, + "loss": 1.7105, + "step": 21489 + }, + { + "epoch": 6.596071209330878, + "grad_norm": 0.1900642365217209, + "learning_rate": 
2.74347704658799e-05, + "loss": 1.6692, + "step": 21490 + }, + { + "epoch": 6.596378146101903, + "grad_norm": 0.16756244003772736, + "learning_rate": 2.7430334999027375e-05, + "loss": 1.7175, + "step": 21491 + }, + { + "epoch": 6.596685082872928, + "grad_norm": 0.18581146001815796, + "learning_rate": 2.7425899755221506e-05, + "loss": 1.72, + "step": 21492 + }, + { + "epoch": 6.596992019643953, + "grad_norm": 0.2384853959083557, + "learning_rate": 2.7421464734506107e-05, + "loss": 1.718, + "step": 21493 + }, + { + "epoch": 6.597298956414979, + "grad_norm": 0.16853606700897217, + "learning_rate": 2.7417029936925065e-05, + "loss": 1.6819, + "step": 21494 + }, + { + "epoch": 6.597605893186004, + "grad_norm": 0.2273230254650116, + "learning_rate": 2.741259536252213e-05, + "loss": 1.7158, + "step": 21495 + }, + { + "epoch": 6.597912829957028, + "grad_norm": 0.2291530966758728, + "learning_rate": 2.7408161011341205e-05, + "loss": 1.7804, + "step": 21496 + }, + { + "epoch": 6.598219766728054, + "grad_norm": 0.17676831781864166, + "learning_rate": 2.740372688342604e-05, + "loss": 1.6693, + "step": 21497 + }, + { + "epoch": 6.598526703499079, + "grad_norm": 0.2386767417192459, + "learning_rate": 2.7399292978820508e-05, + "loss": 1.6932, + "step": 21498 + }, + { + "epoch": 6.598833640270104, + "grad_norm": 0.21329782903194427, + "learning_rate": 2.739485929756841e-05, + "loss": 1.7811, + "step": 21499 + }, + { + "epoch": 6.59914057704113, + "grad_norm": 0.19382116198539734, + "learning_rate": 2.7390425839713556e-05, + "loss": 1.7152, + "step": 21500 + }, + { + "epoch": 6.599447513812155, + "grad_norm": 0.1819920688867569, + "learning_rate": 2.738599260529977e-05, + "loss": 1.6571, + "step": 21501 + }, + { + "epoch": 6.5997544505831796, + "grad_norm": 0.19947806000709534, + "learning_rate": 2.738155959437086e-05, + "loss": 1.7138, + "step": 21502 + }, + { + "epoch": 6.600061387354205, + "grad_norm": 0.1851014792919159, + "learning_rate": 2.7377126806970634e-05, + "loss": 
1.7109, + "step": 21503 + }, + { + "epoch": 6.60036832412523, + "grad_norm": 0.20365974307060242, + "learning_rate": 2.7372694243142905e-05, + "loss": 1.7145, + "step": 21504 + }, + { + "epoch": 6.600675260896256, + "grad_norm": 0.2070893943309784, + "learning_rate": 2.736826190293147e-05, + "loss": 1.7172, + "step": 21505 + }, + { + "epoch": 6.600982197667281, + "grad_norm": 0.19077777862548828, + "learning_rate": 2.7363829786380136e-05, + "loss": 1.7059, + "step": 21506 + }, + { + "epoch": 6.601289134438305, + "grad_norm": 0.21168744564056396, + "learning_rate": 2.73593978935327e-05, + "loss": 1.7483, + "step": 21507 + }, + { + "epoch": 6.601596071209331, + "grad_norm": 0.20746631920337677, + "learning_rate": 2.7354966224432965e-05, + "loss": 1.7165, + "step": 21508 + }, + { + "epoch": 6.601903007980356, + "grad_norm": 0.19440631568431854, + "learning_rate": 2.7350534779124732e-05, + "loss": 1.694, + "step": 21509 + }, + { + "epoch": 6.602209944751381, + "grad_norm": 0.20699405670166016, + "learning_rate": 2.7346103557651765e-05, + "loss": 1.7077, + "step": 21510 + }, + { + "epoch": 6.602516881522407, + "grad_norm": 0.19856512546539307, + "learning_rate": 2.7341672560057917e-05, + "loss": 1.77, + "step": 21511 + }, + { + "epoch": 6.602823818293432, + "grad_norm": 0.23978421092033386, + "learning_rate": 2.7337241786386915e-05, + "loss": 1.7531, + "step": 21512 + }, + { + "epoch": 6.6031307550644565, + "grad_norm": 0.1834867000579834, + "learning_rate": 2.73328112366826e-05, + "loss": 1.751, + "step": 21513 + }, + { + "epoch": 6.603437691835482, + "grad_norm": 0.2154606282711029, + "learning_rate": 2.7328380910988694e-05, + "loss": 1.737, + "step": 21514 + }, + { + "epoch": 6.603744628606507, + "grad_norm": 0.20554645359516144, + "learning_rate": 2.7323950809349035e-05, + "loss": 1.7629, + "step": 21515 + }, + { + "epoch": 6.6040515653775325, + "grad_norm": 0.20497548580169678, + "learning_rate": 2.7319520931807386e-05, + "loss": 1.7001, + "step": 21516 + }, + { + 
"epoch": 6.604358502148557, + "grad_norm": 0.18628253042697906, + "learning_rate": 2.7315091278407523e-05, + "loss": 1.7477, + "step": 21517 + }, + { + "epoch": 6.604665438919582, + "grad_norm": 0.20788705348968506, + "learning_rate": 2.731066184919323e-05, + "loss": 1.7185, + "step": 21518 + }, + { + "epoch": 6.604972375690608, + "grad_norm": 0.17834967374801636, + "learning_rate": 2.730623264420827e-05, + "loss": 1.67, + "step": 21519 + }, + { + "epoch": 6.605279312461633, + "grad_norm": 0.2183784693479538, + "learning_rate": 2.7301803663496417e-05, + "loss": 1.6983, + "step": 21520 + }, + { + "epoch": 6.605586249232658, + "grad_norm": 0.1735544204711914, + "learning_rate": 2.7297374907101447e-05, + "loss": 1.7352, + "step": 21521 + }, + { + "epoch": 6.605893186003684, + "grad_norm": 0.2504538893699646, + "learning_rate": 2.729294637506713e-05, + "loss": 1.7332, + "step": 21522 + }, + { + "epoch": 6.606200122774708, + "grad_norm": 0.1801074892282486, + "learning_rate": 2.728851806743722e-05, + "loss": 1.7251, + "step": 21523 + }, + { + "epoch": 6.606507059545733, + "grad_norm": 0.25701379776000977, + "learning_rate": 2.728408998425549e-05, + "loss": 1.732, + "step": 21524 + }, + { + "epoch": 6.606813996316759, + "grad_norm": 0.1801779717206955, + "learning_rate": 2.7279662125565697e-05, + "loss": 1.6793, + "step": 21525 + }, + { + "epoch": 6.607120933087784, + "grad_norm": 0.21244947612285614, + "learning_rate": 2.7275234491411595e-05, + "loss": 1.7493, + "step": 21526 + }, + { + "epoch": 6.607427869858809, + "grad_norm": 0.20944559574127197, + "learning_rate": 2.7270807081836924e-05, + "loss": 1.722, + "step": 21527 + }, + { + "epoch": 6.607734806629834, + "grad_norm": 0.2526783049106598, + "learning_rate": 2.7266379896885508e-05, + "loss": 1.7628, + "step": 21528 + }, + { + "epoch": 6.608041743400859, + "grad_norm": 0.19788937270641327, + "learning_rate": 2.7261952936601002e-05, + "loss": 1.6538, + "step": 21529 + }, + { + "epoch": 6.6083486801718845, + 
"grad_norm": 0.2623229920864105, + "learning_rate": 2.725752620102725e-05, + "loss": 1.7694, + "step": 21530 + }, + { + "epoch": 6.60865561694291, + "grad_norm": 0.21503256261348724, + "learning_rate": 2.7253099690207913e-05, + "loss": 1.7553, + "step": 21531 + }, + { + "epoch": 6.608962553713935, + "grad_norm": 0.2114928811788559, + "learning_rate": 2.724867340418679e-05, + "loss": 1.7067, + "step": 21532 + }, + { + "epoch": 6.6092694904849605, + "grad_norm": 0.17945198714733124, + "learning_rate": 2.7244247343007623e-05, + "loss": 1.7419, + "step": 21533 + }, + { + "epoch": 6.609576427255985, + "grad_norm": 0.19239214062690735, + "learning_rate": 2.7239821506714137e-05, + "loss": 1.7644, + "step": 21534 + }, + { + "epoch": 6.60988336402701, + "grad_norm": 0.22906997799873352, + "learning_rate": 2.7235395895350068e-05, + "loss": 1.8063, + "step": 21535 + }, + { + "epoch": 6.610190300798036, + "grad_norm": 0.1965717375278473, + "learning_rate": 2.7230970508959162e-05, + "loss": 1.7841, + "step": 21536 + }, + { + "epoch": 6.610497237569061, + "grad_norm": 0.19944418966770172, + "learning_rate": 2.7226545347585158e-05, + "loss": 1.7382, + "step": 21537 + }, + { + "epoch": 6.610804174340086, + "grad_norm": 0.17155805230140686, + "learning_rate": 2.722212041127178e-05, + "loss": 1.6621, + "step": 21538 + }, + { + "epoch": 6.611111111111111, + "grad_norm": 0.20459938049316406, + "learning_rate": 2.721769570006275e-05, + "loss": 1.7481, + "step": 21539 + }, + { + "epoch": 6.611418047882136, + "grad_norm": 0.1991354376077652, + "learning_rate": 2.7213271214001813e-05, + "loss": 1.7874, + "step": 21540 + }, + { + "epoch": 6.611724984653161, + "grad_norm": 0.25073128938674927, + "learning_rate": 2.7208846953132682e-05, + "loss": 1.7921, + "step": 21541 + }, + { + "epoch": 6.612031921424187, + "grad_norm": 0.24456258118152618, + "learning_rate": 2.7204422917499085e-05, + "loss": 1.7564, + "step": 21542 + }, + { + "epoch": 6.612338858195212, + "grad_norm": 
0.18416531383991241, + "learning_rate": 2.7199999107144736e-05, + "loss": 1.7247, + "step": 21543 + }, + { + "epoch": 6.612645794966237, + "grad_norm": 0.18439221382141113, + "learning_rate": 2.7195575522113347e-05, + "loss": 1.6607, + "step": 21544 + }, + { + "epoch": 6.612952731737262, + "grad_norm": 0.20334671437740326, + "learning_rate": 2.7191152162448685e-05, + "loss": 1.7487, + "step": 21545 + }, + { + "epoch": 6.613259668508287, + "grad_norm": 0.17871633172035217, + "learning_rate": 2.718672902819438e-05, + "loss": 1.7355, + "step": 21546 + }, + { + "epoch": 6.6135666052793125, + "grad_norm": 0.23006688058376312, + "learning_rate": 2.718230611939424e-05, + "loss": 1.6489, + "step": 21547 + }, + { + "epoch": 6.613873542050338, + "grad_norm": 0.19141538441181183, + "learning_rate": 2.7177883436091877e-05, + "loss": 1.6793, + "step": 21548 + }, + { + "epoch": 6.614180478821363, + "grad_norm": 0.20549756288528442, + "learning_rate": 2.7173460978331068e-05, + "loss": 1.8331, + "step": 21549 + }, + { + "epoch": 6.614487415592388, + "grad_norm": 0.19106455147266388, + "learning_rate": 2.7169038746155495e-05, + "loss": 1.7295, + "step": 21550 + }, + { + "epoch": 6.614794352363413, + "grad_norm": 0.20190143585205078, + "learning_rate": 2.7164616739608866e-05, + "loss": 1.7032, + "step": 21551 + }, + { + "epoch": 6.615101289134438, + "grad_norm": 0.1969708949327469, + "learning_rate": 2.716019495873488e-05, + "loss": 1.6935, + "step": 21552 + }, + { + "epoch": 6.615408225905464, + "grad_norm": 0.23748311400413513, + "learning_rate": 2.7155773403577235e-05, + "loss": 1.7942, + "step": 21553 + }, + { + "epoch": 6.615715162676489, + "grad_norm": 0.29168081283569336, + "learning_rate": 2.715135207417962e-05, + "loss": 1.7121, + "step": 21554 + }, + { + "epoch": 6.616022099447514, + "grad_norm": 0.2428344041109085, + "learning_rate": 2.7146930970585738e-05, + "loss": 1.7287, + "step": 21555 + }, + { + "epoch": 6.616329036218539, + "grad_norm": 0.2520657479763031, + 
"learning_rate": 2.714251009283928e-05, + "loss": 1.8462, + "step": 21556 + }, + { + "epoch": 6.616635972989564, + "grad_norm": 0.2426053285598755, + "learning_rate": 2.713808944098394e-05, + "loss": 1.7094, + "step": 21557 + }, + { + "epoch": 6.616942909760589, + "grad_norm": 0.17593255639076233, + "learning_rate": 2.713366901506339e-05, + "loss": 1.6891, + "step": 21558 + }, + { + "epoch": 6.617249846531615, + "grad_norm": 0.20620940625667572, + "learning_rate": 2.7129248815121332e-05, + "loss": 1.7277, + "step": 21559 + }, + { + "epoch": 6.617556783302639, + "grad_norm": 0.21467719972133636, + "learning_rate": 2.7124828841201445e-05, + "loss": 1.7543, + "step": 21560 + }, + { + "epoch": 6.6178637200736645, + "grad_norm": 0.21372607350349426, + "learning_rate": 2.7120409093347378e-05, + "loss": 1.7207, + "step": 21561 + }, + { + "epoch": 6.61817065684469, + "grad_norm": 0.2123684585094452, + "learning_rate": 2.7115989571602884e-05, + "loss": 1.71, + "step": 21562 + }, + { + "epoch": 6.618477593615715, + "grad_norm": 0.19155478477478027, + "learning_rate": 2.711157027601155e-05, + "loss": 1.7182, + "step": 21563 + }, + { + "epoch": 6.6187845303867405, + "grad_norm": 0.23053184151649475, + "learning_rate": 2.7107151206617136e-05, + "loss": 1.7147, + "step": 21564 + }, + { + "epoch": 6.619091467157766, + "grad_norm": 0.1635691374540329, + "learning_rate": 2.7102732363463235e-05, + "loss": 1.6913, + "step": 21565 + }, + { + "epoch": 6.61939840392879, + "grad_norm": 0.19415298104286194, + "learning_rate": 2.709831374659357e-05, + "loss": 1.6813, + "step": 21566 + }, + { + "epoch": 6.619705340699816, + "grad_norm": 0.19547943770885468, + "learning_rate": 2.709389535605179e-05, + "loss": 1.6988, + "step": 21567 + }, + { + "epoch": 6.620012277470841, + "grad_norm": 0.1921805888414383, + "learning_rate": 2.7089477191881564e-05, + "loss": 1.6931, + "step": 21568 + }, + { + "epoch": 6.620319214241866, + "grad_norm": 0.18463274836540222, + "learning_rate": 
2.7085059254126554e-05, + "loss": 1.7168, + "step": 21569 + }, + { + "epoch": 6.620626151012892, + "grad_norm": 0.2078532725572586, + "learning_rate": 2.7080641542830414e-05, + "loss": 1.7248, + "step": 21570 + }, + { + "epoch": 6.620933087783916, + "grad_norm": 0.18778283894062042, + "learning_rate": 2.7076224058036813e-05, + "loss": 1.6745, + "step": 21571 + }, + { + "epoch": 6.621240024554941, + "grad_norm": 0.26190707087516785, + "learning_rate": 2.70718067997894e-05, + "loss": 1.7317, + "step": 21572 + }, + { + "epoch": 6.621546961325967, + "grad_norm": 0.20449557900428772, + "learning_rate": 2.7067389768131836e-05, + "loss": 1.7167, + "step": 21573 + }, + { + "epoch": 6.621853898096992, + "grad_norm": 0.22722119092941284, + "learning_rate": 2.706297296310776e-05, + "loss": 1.7262, + "step": 21574 + }, + { + "epoch": 6.622160834868017, + "grad_norm": 0.24897173047065735, + "learning_rate": 2.7058556384760825e-05, + "loss": 1.7273, + "step": 21575 + }, + { + "epoch": 6.622467771639043, + "grad_norm": 0.19774340093135834, + "learning_rate": 2.705414003313469e-05, + "loss": 1.6765, + "step": 21576 + }, + { + "epoch": 6.622774708410067, + "grad_norm": 0.2661767303943634, + "learning_rate": 2.7049723908272995e-05, + "loss": 1.7046, + "step": 21577 + }, + { + "epoch": 6.6230816451810925, + "grad_norm": 0.2013266384601593, + "learning_rate": 2.7045308010219356e-05, + "loss": 1.7156, + "step": 21578 + }, + { + "epoch": 6.623388581952118, + "grad_norm": 0.22952915728092194, + "learning_rate": 2.7040892339017475e-05, + "loss": 1.7601, + "step": 21579 + }, + { + "epoch": 6.623695518723143, + "grad_norm": 0.18262411653995514, + "learning_rate": 2.7036476894710916e-05, + "loss": 1.7334, + "step": 21580 + }, + { + "epoch": 6.6240024554941686, + "grad_norm": 0.18907666206359863, + "learning_rate": 2.703206167734339e-05, + "loss": 1.7196, + "step": 21581 + }, + { + "epoch": 6.624309392265193, + "grad_norm": 0.2192571759223938, + "learning_rate": 2.7027646686958453e-05, + 
"loss": 1.7046, + "step": 21582 + }, + { + "epoch": 6.624616329036218, + "grad_norm": 0.165769562125206, + "learning_rate": 2.70232319235998e-05, + "loss": 1.7028, + "step": 21583 + }, + { + "epoch": 6.624923265807244, + "grad_norm": 0.19245828688144684, + "learning_rate": 2.701881738731103e-05, + "loss": 1.7153, + "step": 21584 + }, + { + "epoch": 6.625230202578269, + "grad_norm": 0.17638756334781647, + "learning_rate": 2.7014403078135776e-05, + "loss": 1.7071, + "step": 21585 + }, + { + "epoch": 6.625537139349294, + "grad_norm": 0.17205210030078888, + "learning_rate": 2.700998899611767e-05, + "loss": 1.6706, + "step": 21586 + }, + { + "epoch": 6.62584407612032, + "grad_norm": 0.24107681214809418, + "learning_rate": 2.700557514130032e-05, + "loss": 1.8013, + "step": 21587 + }, + { + "epoch": 6.626151012891344, + "grad_norm": 0.1839917004108429, + "learning_rate": 2.7001161513727358e-05, + "loss": 1.7381, + "step": 21588 + }, + { + "epoch": 6.6264579496623695, + "grad_norm": 0.24043352901935577, + "learning_rate": 2.6996748113442394e-05, + "loss": 1.7523, + "step": 21589 + }, + { + "epoch": 6.626764886433395, + "grad_norm": 0.23488068580627441, + "learning_rate": 2.6992334940489056e-05, + "loss": 1.7587, + "step": 21590 + }, + { + "epoch": 6.62707182320442, + "grad_norm": 0.18784530460834503, + "learning_rate": 2.698792199491094e-05, + "loss": 1.7053, + "step": 21591 + }, + { + "epoch": 6.627378759975445, + "grad_norm": 0.2758429944515228, + "learning_rate": 2.6983509276751673e-05, + "loss": 1.6927, + "step": 21592 + }, + { + "epoch": 6.62768569674647, + "grad_norm": 0.2731272280216217, + "learning_rate": 2.697909678605486e-05, + "loss": 1.7351, + "step": 21593 + }, + { + "epoch": 6.627992633517495, + "grad_norm": 0.24450576305389404, + "learning_rate": 2.6974684522864098e-05, + "loss": 1.7126, + "step": 21594 + }, + { + "epoch": 6.628299570288521, + "grad_norm": 0.21820391714572906, + "learning_rate": 2.6970272487222982e-05, + "loss": 1.7075, + "step": 21595 + }, 
+ { + "epoch": 6.628606507059546, + "grad_norm": 0.23647959530353546, + "learning_rate": 2.696586067917517e-05, + "loss": 1.7369, + "step": 21596 + }, + { + "epoch": 6.628913443830571, + "grad_norm": 0.2665121555328369, + "learning_rate": 2.696144909876419e-05, + "loss": 1.7575, + "step": 21597 + }, + { + "epoch": 6.629220380601596, + "grad_norm": 0.19871680438518524, + "learning_rate": 2.695703774603371e-05, + "loss": 1.7334, + "step": 21598 + }, + { + "epoch": 6.629527317372621, + "grad_norm": 0.2363109588623047, + "learning_rate": 2.6952626621027245e-05, + "loss": 1.6878, + "step": 21599 + }, + { + "epoch": 6.629834254143646, + "grad_norm": 0.21958591043949127, + "learning_rate": 2.694821572378845e-05, + "loss": 1.6828, + "step": 21600 + }, + { + "epoch": 6.630141190914672, + "grad_norm": 0.20437858998775482, + "learning_rate": 2.6943805054360906e-05, + "loss": 1.7138, + "step": 21601 + }, + { + "epoch": 6.630448127685697, + "grad_norm": 0.27741923928260803, + "learning_rate": 2.6939394612788193e-05, + "loss": 1.7506, + "step": 21602 + }, + { + "epoch": 6.6307550644567215, + "grad_norm": 0.1885133981704712, + "learning_rate": 2.6934984399113917e-05, + "loss": 1.7669, + "step": 21603 + }, + { + "epoch": 6.631062001227747, + "grad_norm": 0.19453810155391693, + "learning_rate": 2.6930574413381604e-05, + "loss": 1.6837, + "step": 21604 + }, + { + "epoch": 6.631368937998772, + "grad_norm": 0.1685735285282135, + "learning_rate": 2.6926164655634894e-05, + "loss": 1.7045, + "step": 21605 + }, + { + "epoch": 6.6316758747697975, + "grad_norm": 0.2507462203502655, + "learning_rate": 2.6921755125917347e-05, + "loss": 1.7754, + "step": 21606 + }, + { + "epoch": 6.631982811540823, + "grad_norm": 0.1725471317768097, + "learning_rate": 2.691734582427255e-05, + "loss": 1.7219, + "step": 21607 + }, + { + "epoch": 6.632289748311848, + "grad_norm": 0.2633528709411621, + "learning_rate": 2.6912936750744068e-05, + "loss": 1.7362, + "step": 21608 + }, + { + "epoch": 6.632596685082873, 
+ "grad_norm": 0.1808360069990158, + "learning_rate": 2.6908527905375474e-05, + "loss": 1.7338, + "step": 21609 + }, + { + "epoch": 6.632903621853898, + "grad_norm": 0.16186563670635223, + "learning_rate": 2.6904119288210344e-05, + "loss": 1.6752, + "step": 21610 + }, + { + "epoch": 6.633210558624923, + "grad_norm": 0.1954091340303421, + "learning_rate": 2.689971089929224e-05, + "loss": 1.714, + "step": 21611 + }, + { + "epoch": 6.633517495395949, + "grad_norm": 0.18954069912433624, + "learning_rate": 2.689530273866474e-05, + "loss": 1.7869, + "step": 21612 + }, + { + "epoch": 6.633824432166974, + "grad_norm": 0.182058185338974, + "learning_rate": 2.6890894806371392e-05, + "loss": 1.7708, + "step": 21613 + }, + { + "epoch": 6.634131368937998, + "grad_norm": 0.17313501238822937, + "learning_rate": 2.6886487102455755e-05, + "loss": 1.7064, + "step": 21614 + }, + { + "epoch": 6.634438305709024, + "grad_norm": 0.1732148379087448, + "learning_rate": 2.688207962696143e-05, + "loss": 1.7378, + "step": 21615 + }, + { + "epoch": 6.634745242480049, + "grad_norm": 0.17057274281978607, + "learning_rate": 2.687767237993191e-05, + "loss": 1.671, + "step": 21616 + }, + { + "epoch": 6.635052179251074, + "grad_norm": 0.17723220586776733, + "learning_rate": 2.6873265361410805e-05, + "loss": 1.7179, + "step": 21617 + }, + { + "epoch": 6.6353591160221, + "grad_norm": 0.18634437024593353, + "learning_rate": 2.6868858571441645e-05, + "loss": 1.7355, + "step": 21618 + }, + { + "epoch": 6.635666052793125, + "grad_norm": 0.205010786652565, + "learning_rate": 2.6864452010067985e-05, + "loss": 1.7399, + "step": 21619 + }, + { + "epoch": 6.6359729895641495, + "grad_norm": 0.2071879357099533, + "learning_rate": 2.6860045677333383e-05, + "loss": 1.7199, + "step": 21620 + }, + { + "epoch": 6.636279926335175, + "grad_norm": 0.17309685051441193, + "learning_rate": 2.685563957328134e-05, + "loss": 1.6595, + "step": 21621 + }, + { + "epoch": 6.6365868631062, + "grad_norm": 0.3505750000476837, + 
"learning_rate": 2.685123369795545e-05, + "loss": 1.7601, + "step": 21622 + }, + { + "epoch": 6.6368937998772255, + "grad_norm": 0.19184419512748718, + "learning_rate": 2.684682805139923e-05, + "loss": 1.7225, + "step": 21623 + }, + { + "epoch": 6.637200736648251, + "grad_norm": 0.20142409205436707, + "learning_rate": 2.6842422633656233e-05, + "loss": 1.7201, + "step": 21624 + }, + { + "epoch": 6.637507673419275, + "grad_norm": 0.18348537385463715, + "learning_rate": 2.6838017444769993e-05, + "loss": 1.6983, + "step": 21625 + }, + { + "epoch": 6.637814610190301, + "grad_norm": 0.19275228679180145, + "learning_rate": 2.6833612484784033e-05, + "loss": 1.7028, + "step": 21626 + }, + { + "epoch": 6.638121546961326, + "grad_norm": 0.21269574761390686, + "learning_rate": 2.682920775374189e-05, + "loss": 1.7888, + "step": 21627 + }, + { + "epoch": 6.638428483732351, + "grad_norm": 0.17470422387123108, + "learning_rate": 2.68248032516871e-05, + "loss": 1.7147, + "step": 21628 + }, + { + "epoch": 6.638735420503377, + "grad_norm": 0.15697288513183594, + "learning_rate": 2.6820398978663185e-05, + "loss": 1.6544, + "step": 21629 + }, + { + "epoch": 6.639042357274402, + "grad_norm": 0.18636487424373627, + "learning_rate": 2.6815994934713677e-05, + "loss": 1.721, + "step": 21630 + }, + { + "epoch": 6.639349294045426, + "grad_norm": 0.18091215193271637, + "learning_rate": 2.681159111988208e-05, + "loss": 1.6973, + "step": 21631 + }, + { + "epoch": 6.639656230816452, + "grad_norm": 0.21360217034816742, + "learning_rate": 2.6807187534211965e-05, + "loss": 1.7379, + "step": 21632 + }, + { + "epoch": 6.639963167587477, + "grad_norm": 0.20027592778205872, + "learning_rate": 2.6802784177746777e-05, + "loss": 1.7207, + "step": 21633 + }, + { + "epoch": 6.640270104358502, + "grad_norm": 0.21839644014835358, + "learning_rate": 2.679838105053011e-05, + "loss": 1.715, + "step": 21634 + }, + { + "epoch": 6.640577041129527, + "grad_norm": 0.19237302243709564, + "learning_rate": 
2.6793978152605404e-05, + "loss": 1.7415, + "step": 21635 + }, + { + "epoch": 6.640883977900552, + "grad_norm": 0.1979883313179016, + "learning_rate": 2.678957548401623e-05, + "loss": 1.7005, + "step": 21636 + }, + { + "epoch": 6.6411909146715775, + "grad_norm": 0.21867144107818604, + "learning_rate": 2.678517304480609e-05, + "loss": 1.8008, + "step": 21637 + }, + { + "epoch": 6.641497851442603, + "grad_norm": 0.17232954502105713, + "learning_rate": 2.6780770835018433e-05, + "loss": 1.6867, + "step": 21638 + }, + { + "epoch": 6.641804788213628, + "grad_norm": 0.21535196900367737, + "learning_rate": 2.6776368854696853e-05, + "loss": 1.7545, + "step": 21639 + }, + { + "epoch": 6.6421117249846535, + "grad_norm": 0.18891240656375885, + "learning_rate": 2.6771967103884766e-05, + "loss": 1.7164, + "step": 21640 + }, + { + "epoch": 6.642418661755678, + "grad_norm": 0.2558320462703705, + "learning_rate": 2.6767565582625743e-05, + "loss": 1.8125, + "step": 21641 + }, + { + "epoch": 6.642725598526703, + "grad_norm": 0.20400027930736542, + "learning_rate": 2.6763164290963244e-05, + "loss": 1.7335, + "step": 21642 + }, + { + "epoch": 6.643032535297729, + "grad_norm": 0.21388766169548035, + "learning_rate": 2.6758763228940775e-05, + "loss": 1.7788, + "step": 21643 + }, + { + "epoch": 6.643339472068754, + "grad_norm": 0.20607435703277588, + "learning_rate": 2.6754362396601834e-05, + "loss": 1.7481, + "step": 21644 + }, + { + "epoch": 6.643646408839779, + "grad_norm": 0.1608831286430359, + "learning_rate": 2.6749961793989907e-05, + "loss": 1.6577, + "step": 21645 + }, + { + "epoch": 6.643953345610804, + "grad_norm": 0.19074808061122894, + "learning_rate": 2.6745561421148485e-05, + "loss": 1.7335, + "step": 21646 + }, + { + "epoch": 6.644260282381829, + "grad_norm": 0.16517756879329681, + "learning_rate": 2.6741161278121053e-05, + "loss": 1.6663, + "step": 21647 + }, + { + "epoch": 6.644567219152854, + "grad_norm": 0.18976998329162598, + "learning_rate": 2.673676136495108e-05, + 
"loss": 1.7231, + "step": 21648 + }, + { + "epoch": 6.64487415592388, + "grad_norm": 0.20694875717163086, + "learning_rate": 2.6732361681682106e-05, + "loss": 1.7469, + "step": 21649 + }, + { + "epoch": 6.645181092694905, + "grad_norm": 0.1994311809539795, + "learning_rate": 2.6727962228357533e-05, + "loss": 1.6864, + "step": 21650 + }, + { + "epoch": 6.64548802946593, + "grad_norm": 0.18886511027812958, + "learning_rate": 2.672356300502091e-05, + "loss": 1.6874, + "step": 21651 + }, + { + "epoch": 6.645794966236955, + "grad_norm": 0.2152819186449051, + "learning_rate": 2.6719164011715653e-05, + "loss": 1.7327, + "step": 21652 + }, + { + "epoch": 6.64610190300798, + "grad_norm": 0.20525617897510529, + "learning_rate": 2.6714765248485275e-05, + "loss": 1.7409, + "step": 21653 + }, + { + "epoch": 6.6464088397790055, + "grad_norm": 0.21892790496349335, + "learning_rate": 2.6710366715373254e-05, + "loss": 1.7281, + "step": 21654 + }, + { + "epoch": 6.646715776550031, + "grad_norm": 0.20156462490558624, + "learning_rate": 2.6705968412423e-05, + "loss": 1.7211, + "step": 21655 + }, + { + "epoch": 6.647022713321056, + "grad_norm": 0.19993625581264496, + "learning_rate": 2.670157033967806e-05, + "loss": 1.8058, + "step": 21656 + }, + { + "epoch": 6.647329650092081, + "grad_norm": 0.1970909684896469, + "learning_rate": 2.669717249718182e-05, + "loss": 1.7707, + "step": 21657 + }, + { + "epoch": 6.647636586863106, + "grad_norm": 0.19287796318531036, + "learning_rate": 2.6692774884977796e-05, + "loss": 1.688, + "step": 21658 + }, + { + "epoch": 6.647943523634131, + "grad_norm": 0.17658226191997528, + "learning_rate": 2.668837750310943e-05, + "loss": 1.6936, + "step": 21659 + }, + { + "epoch": 6.648250460405157, + "grad_norm": 0.20234479010105133, + "learning_rate": 2.6683980351620184e-05, + "loss": 1.7069, + "step": 21660 + }, + { + "epoch": 6.648557397176182, + "grad_norm": 0.1957871913909912, + "learning_rate": 2.6679583430553513e-05, + "loss": 1.736, + "step": 21661 + }, + 
{ + "epoch": 6.648864333947207, + "grad_norm": 0.20084553956985474, + "learning_rate": 2.667518673995286e-05, + "loss": 1.7262, + "step": 21662 + }, + { + "epoch": 6.649171270718232, + "grad_norm": 0.18749211728572845, + "learning_rate": 2.667079027986169e-05, + "loss": 1.7127, + "step": 21663 + }, + { + "epoch": 6.649478207489257, + "grad_norm": 0.1747027188539505, + "learning_rate": 2.666639405032344e-05, + "loss": 1.6922, + "step": 21664 + }, + { + "epoch": 6.649785144260282, + "grad_norm": 0.3119397759437561, + "learning_rate": 2.666199805138154e-05, + "loss": 1.7373, + "step": 21665 + }, + { + "epoch": 6.650092081031308, + "grad_norm": 0.25986436009407043, + "learning_rate": 2.6657602283079498e-05, + "loss": 1.7521, + "step": 21666 + }, + { + "epoch": 6.650399017802332, + "grad_norm": 0.20535705983638763, + "learning_rate": 2.6653206745460663e-05, + "loss": 1.7144, + "step": 21667 + }, + { + "epoch": 6.650705954573358, + "grad_norm": 0.20804347097873688, + "learning_rate": 2.6648811438568566e-05, + "loss": 1.7186, + "step": 21668 + }, + { + "epoch": 6.651012891344383, + "grad_norm": 0.20753289759159088, + "learning_rate": 2.6644416362446566e-05, + "loss": 1.7098, + "step": 21669 + }, + { + "epoch": 6.651319828115408, + "grad_norm": 0.18725311756134033, + "learning_rate": 2.6640021517138148e-05, + "loss": 1.7331, + "step": 21670 + }, + { + "epoch": 6.651626764886434, + "grad_norm": 0.1907210648059845, + "learning_rate": 2.663562690268675e-05, + "loss": 1.6677, + "step": 21671 + }, + { + "epoch": 6.651933701657459, + "grad_norm": 0.19124922156333923, + "learning_rate": 2.6631232519135747e-05, + "loss": 1.7337, + "step": 21672 + }, + { + "epoch": 6.652240638428484, + "grad_norm": 0.21045447885990143, + "learning_rate": 2.6626838366528633e-05, + "loss": 1.7028, + "step": 21673 + }, + { + "epoch": 6.652547575199509, + "grad_norm": 0.1891855001449585, + "learning_rate": 2.6622444444908767e-05, + "loss": 1.7247, + "step": 21674 + }, + { + "epoch": 6.652854511970534, 
+ "grad_norm": 0.2236541211605072, + "learning_rate": 2.6618050754319623e-05, + "loss": 1.6986, + "step": 21675 + }, + { + "epoch": 6.653161448741559, + "grad_norm": 0.19088539481163025, + "learning_rate": 2.6613657294804604e-05, + "loss": 1.7118, + "step": 21676 + }, + { + "epoch": 6.653468385512585, + "grad_norm": 0.26210764050483704, + "learning_rate": 2.660926406640714e-05, + "loss": 1.7542, + "step": 21677 + }, + { + "epoch": 6.653775322283609, + "grad_norm": 0.2564029097557068, + "learning_rate": 2.6604871069170632e-05, + "loss": 1.7395, + "step": 21678 + }, + { + "epoch": 6.6540822590546345, + "grad_norm": 0.22974301874637604, + "learning_rate": 2.6600478303138503e-05, + "loss": 1.6905, + "step": 21679 + }, + { + "epoch": 6.65438919582566, + "grad_norm": 0.299772173166275, + "learning_rate": 2.659608576835416e-05, + "loss": 1.7875, + "step": 21680 + }, + { + "epoch": 6.654696132596685, + "grad_norm": 0.26459556818008423, + "learning_rate": 2.6591693464861018e-05, + "loss": 1.7185, + "step": 21681 + }, + { + "epoch": 6.6550030693677105, + "grad_norm": 0.24505311250686646, + "learning_rate": 2.6587301392702457e-05, + "loss": 1.7105, + "step": 21682 + }, + { + "epoch": 6.655310006138736, + "grad_norm": 0.1626308262348175, + "learning_rate": 2.6582909551921953e-05, + "loss": 1.6668, + "step": 21683 + }, + { + "epoch": 6.65561694290976, + "grad_norm": 0.20354291796684265, + "learning_rate": 2.6578517942562813e-05, + "loss": 1.7437, + "step": 21684 + }, + { + "epoch": 6.655923879680786, + "grad_norm": 0.18618443608283997, + "learning_rate": 2.6574126564668532e-05, + "loss": 1.6757, + "step": 21685 + }, + { + "epoch": 6.656230816451811, + "grad_norm": 0.1863735467195511, + "learning_rate": 2.656973541828242e-05, + "loss": 1.6549, + "step": 21686 + }, + { + "epoch": 6.656537753222836, + "grad_norm": 0.2118620127439499, + "learning_rate": 2.6565344503447935e-05, + "loss": 1.6927, + "step": 21687 + }, + { + "epoch": 6.656844689993862, + "grad_norm": 
0.24023136496543884, + "learning_rate": 2.6560953820208478e-05, + "loss": 1.6969, + "step": 21688 + }, + { + "epoch": 6.657151626764886, + "grad_norm": 0.21124204993247986, + "learning_rate": 2.6556563368607368e-05, + "loss": 1.6662, + "step": 21689 + }, + { + "epoch": 6.657458563535911, + "grad_norm": 0.16295355558395386, + "learning_rate": 2.6552173148688075e-05, + "loss": 1.7203, + "step": 21690 + }, + { + "epoch": 6.657765500306937, + "grad_norm": 0.18650858104228973, + "learning_rate": 2.6547783160493916e-05, + "loss": 1.7177, + "step": 21691 + }, + { + "epoch": 6.658072437077962, + "grad_norm": 0.20509213209152222, + "learning_rate": 2.6543393404068328e-05, + "loss": 1.723, + "step": 21692 + }, + { + "epoch": 6.658379373848987, + "grad_norm": 0.20985513925552368, + "learning_rate": 2.6539003879454678e-05, + "loss": 1.6679, + "step": 21693 + }, + { + "epoch": 6.658686310620013, + "grad_norm": 0.19907233119010925, + "learning_rate": 2.6534614586696338e-05, + "loss": 1.7028, + "step": 21694 + }, + { + "epoch": 6.658993247391037, + "grad_norm": 0.21793772280216217, + "learning_rate": 2.6530225525836692e-05, + "loss": 1.7706, + "step": 21695 + }, + { + "epoch": 6.6593001841620625, + "grad_norm": 0.24162191152572632, + "learning_rate": 2.6525836696919117e-05, + "loss": 1.806, + "step": 21696 + }, + { + "epoch": 6.659607120933088, + "grad_norm": 0.1735360324382782, + "learning_rate": 2.652144809998698e-05, + "loss": 1.7047, + "step": 21697 + }, + { + "epoch": 6.659914057704113, + "grad_norm": 0.18471799790859222, + "learning_rate": 2.651705973508365e-05, + "loss": 1.7306, + "step": 21698 + }, + { + "epoch": 6.6602209944751385, + "grad_norm": 0.17422814667224884, + "learning_rate": 2.6512671602252482e-05, + "loss": 1.6666, + "step": 21699 + }, + { + "epoch": 6.660527931246163, + "grad_norm": 0.19209833443164825, + "learning_rate": 2.6508283701536897e-05, + "loss": 1.6966, + "step": 21700 + }, + { + "epoch": 6.660834868017188, + "grad_norm": 0.1902640461921692, + 
"learning_rate": 2.650389603298019e-05, + "loss": 1.7887, + "step": 21701 + }, + { + "epoch": 6.661141804788214, + "grad_norm": 0.18551218509674072, + "learning_rate": 2.6499508596625787e-05, + "loss": 1.6851, + "step": 21702 + }, + { + "epoch": 6.661448741559239, + "grad_norm": 0.2165011614561081, + "learning_rate": 2.6495121392516976e-05, + "loss": 1.7465, + "step": 21703 + }, + { + "epoch": 6.661755678330264, + "grad_norm": 0.22871245443820953, + "learning_rate": 2.6490734420697172e-05, + "loss": 1.7487, + "step": 21704 + }, + { + "epoch": 6.66206261510129, + "grad_norm": 0.21275551617145538, + "learning_rate": 2.6486347681209723e-05, + "loss": 1.7782, + "step": 21705 + }, + { + "epoch": 6.662369551872314, + "grad_norm": 0.2926945984363556, + "learning_rate": 2.6481961174097937e-05, + "loss": 1.7413, + "step": 21706 + }, + { + "epoch": 6.662676488643339, + "grad_norm": 0.17143094539642334, + "learning_rate": 2.6477574899405233e-05, + "loss": 1.6639, + "step": 21707 + }, + { + "epoch": 6.662983425414365, + "grad_norm": 0.22194001078605652, + "learning_rate": 2.647318885717488e-05, + "loss": 1.7035, + "step": 21708 + }, + { + "epoch": 6.66329036218539, + "grad_norm": 0.18232671916484833, + "learning_rate": 2.6468803047450286e-05, + "loss": 1.6977, + "step": 21709 + }, + { + "epoch": 6.6635972989564145, + "grad_norm": 0.2626599371433258, + "learning_rate": 2.6464417470274773e-05, + "loss": 1.7422, + "step": 21710 + }, + { + "epoch": 6.66390423572744, + "grad_norm": 0.2034282237291336, + "learning_rate": 2.6460032125691668e-05, + "loss": 1.7531, + "step": 21711 + }, + { + "epoch": 6.664211172498465, + "grad_norm": 0.2308860868215561, + "learning_rate": 2.645564701374434e-05, + "loss": 1.7271, + "step": 21712 + }, + { + "epoch": 6.6645181092694905, + "grad_norm": 0.2163545936346054, + "learning_rate": 2.64512621344761e-05, + "loss": 1.7632, + "step": 21713 + }, + { + "epoch": 6.664825046040516, + "grad_norm": 0.2566233277320862, + "learning_rate": 
2.644687748793029e-05, + "loss": 1.7573, + "step": 21714 + }, + { + "epoch": 6.665131982811541, + "grad_norm": 0.21093623340129852, + "learning_rate": 2.6442493074150244e-05, + "loss": 1.6703, + "step": 21715 + }, + { + "epoch": 6.665438919582566, + "grad_norm": 0.2083086222410202, + "learning_rate": 2.643810889317927e-05, + "loss": 1.6672, + "step": 21716 + }, + { + "epoch": 6.665745856353591, + "grad_norm": 0.20711155235767365, + "learning_rate": 2.643372494506075e-05, + "loss": 1.7276, + "step": 21717 + }, + { + "epoch": 6.666052793124616, + "grad_norm": 0.18977457284927368, + "learning_rate": 2.6429341229837935e-05, + "loss": 1.7207, + "step": 21718 + }, + { + "epoch": 6.666359729895642, + "grad_norm": 0.28336507081985474, + "learning_rate": 2.6424957747554224e-05, + "loss": 1.7473, + "step": 21719 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.1761232167482376, + "learning_rate": 2.642057449825286e-05, + "loss": 1.7172, + "step": 21720 + }, + { + "epoch": 6.666973603437691, + "grad_norm": 0.21672405302524567, + "learning_rate": 2.6416191481977215e-05, + "loss": 1.6561, + "step": 21721 + }, + { + "epoch": 6.667280540208717, + "grad_norm": 0.226834237575531, + "learning_rate": 2.6411808698770613e-05, + "loss": 1.7315, + "step": 21722 + }, + { + "epoch": 6.667587476979742, + "grad_norm": 0.22553586959838867, + "learning_rate": 2.6407426148676307e-05, + "loss": 1.7301, + "step": 21723 + }, + { + "epoch": 6.667894413750767, + "grad_norm": 0.1913517564535141, + "learning_rate": 2.6403043831737672e-05, + "loss": 1.6739, + "step": 21724 + }, + { + "epoch": 6.668201350521793, + "grad_norm": 0.24560052156448364, + "learning_rate": 2.6398661747997955e-05, + "loss": 1.7347, + "step": 21725 + }, + { + "epoch": 6.668508287292818, + "grad_norm": 0.27361172437667847, + "learning_rate": 2.6394279897500517e-05, + "loss": 1.7713, + "step": 21726 + }, + { + "epoch": 6.6688152240638425, + "grad_norm": 0.21486583352088928, + "learning_rate": 2.6389898280288638e-05, + 
"loss": 1.7504, + "step": 21727 + }, + { + "epoch": 6.669122160834868, + "grad_norm": 0.19056405127048492, + "learning_rate": 2.6385516896405627e-05, + "loss": 1.7457, + "step": 21728 + }, + { + "epoch": 6.669429097605893, + "grad_norm": 0.19316376745700836, + "learning_rate": 2.638113574589478e-05, + "loss": 1.6969, + "step": 21729 + }, + { + "epoch": 6.6697360343769185, + "grad_norm": 0.21700869500637054, + "learning_rate": 2.637675482879939e-05, + "loss": 1.7055, + "step": 21730 + }, + { + "epoch": 6.670042971147944, + "grad_norm": 0.19720883667469025, + "learning_rate": 2.637237414516275e-05, + "loss": 1.7029, + "step": 21731 + }, + { + "epoch": 6.670349907918968, + "grad_norm": 0.16528408229351044, + "learning_rate": 2.6367993695028158e-05, + "loss": 1.6915, + "step": 21732 + }, + { + "epoch": 6.670656844689994, + "grad_norm": 0.19576294720172882, + "learning_rate": 2.636361347843889e-05, + "loss": 1.7034, + "step": 21733 + }, + { + "epoch": 6.670963781461019, + "grad_norm": 0.16859273612499237, + "learning_rate": 2.6359233495438285e-05, + "loss": 1.7114, + "step": 21734 + }, + { + "epoch": 6.671270718232044, + "grad_norm": 0.20480163395404816, + "learning_rate": 2.6354853746069553e-05, + "loss": 1.7304, + "step": 21735 + }, + { + "epoch": 6.67157765500307, + "grad_norm": 0.19104263186454773, + "learning_rate": 2.6350474230376048e-05, + "loss": 1.7026, + "step": 21736 + }, + { + "epoch": 6.671884591774095, + "grad_norm": 0.18243174254894257, + "learning_rate": 2.634609494840098e-05, + "loss": 1.6769, + "step": 21737 + }, + { + "epoch": 6.672191528545119, + "grad_norm": 0.20766063034534454, + "learning_rate": 2.634171590018769e-05, + "loss": 1.7436, + "step": 21738 + }, + { + "epoch": 6.672498465316145, + "grad_norm": 0.22035297751426697, + "learning_rate": 2.6337337085779444e-05, + "loss": 1.8211, + "step": 21739 + }, + { + "epoch": 6.67280540208717, + "grad_norm": 0.18965984880924225, + "learning_rate": 2.6332958505219475e-05, + "loss": 1.7067, + "step": 
21740 + }, + { + "epoch": 6.673112338858195, + "grad_norm": 0.21209993958473206, + "learning_rate": 2.632858015855111e-05, + "loss": 1.7743, + "step": 21741 + }, + { + "epoch": 6.67341927562922, + "grad_norm": 0.18409015238285065, + "learning_rate": 2.6324202045817547e-05, + "loss": 1.7494, + "step": 21742 + }, + { + "epoch": 6.673726212400245, + "grad_norm": 0.23252969980239868, + "learning_rate": 2.6319824167062125e-05, + "loss": 1.7459, + "step": 21743 + }, + { + "epoch": 6.6740331491712706, + "grad_norm": 0.16296416521072388, + "learning_rate": 2.631544652232808e-05, + "loss": 1.648, + "step": 21744 + }, + { + "epoch": 6.674340085942296, + "grad_norm": 0.2458602488040924, + "learning_rate": 2.631106911165867e-05, + "loss": 1.6847, + "step": 21745 + }, + { + "epoch": 6.674647022713321, + "grad_norm": 0.21203550696372986, + "learning_rate": 2.6306691935097162e-05, + "loss": 1.713, + "step": 21746 + }, + { + "epoch": 6.6749539594843466, + "grad_norm": 0.19969885051250458, + "learning_rate": 2.6302314992686804e-05, + "loss": 1.7445, + "step": 21747 + }, + { + "epoch": 6.675260896255372, + "grad_norm": 0.21001017093658447, + "learning_rate": 2.629793828447087e-05, + "loss": 1.703, + "step": 21748 + }, + { + "epoch": 6.675567833026396, + "grad_norm": 0.18607214093208313, + "learning_rate": 2.6293561810492595e-05, + "loss": 1.6765, + "step": 21749 + }, + { + "epoch": 6.675874769797422, + "grad_norm": 0.21806176006793976, + "learning_rate": 2.6289185570795223e-05, + "loss": 1.7099, + "step": 21750 + }, + { + "epoch": 6.676181706568447, + "grad_norm": 0.1861930787563324, + "learning_rate": 2.6284809565422052e-05, + "loss": 1.6978, + "step": 21751 + }, + { + "epoch": 6.676488643339472, + "grad_norm": 0.18779867887496948, + "learning_rate": 2.6280433794416254e-05, + "loss": 1.7132, + "step": 21752 + }, + { + "epoch": 6.676795580110497, + "grad_norm": 0.18255293369293213, + "learning_rate": 2.627605825782115e-05, + "loss": 1.7045, + "step": 21753 + }, + { + "epoch": 
6.677102516881522, + "grad_norm": 0.22258871793746948, + "learning_rate": 2.6271682955679904e-05, + "loss": 1.7159, + "step": 21754 + }, + { + "epoch": 6.6774094536525475, + "grad_norm": 0.17425768077373505, + "learning_rate": 2.626730788803582e-05, + "loss": 1.6571, + "step": 21755 + }, + { + "epoch": 6.677716390423573, + "grad_norm": 0.1921091377735138, + "learning_rate": 2.6262933054932122e-05, + "loss": 1.8178, + "step": 21756 + }, + { + "epoch": 6.678023327194598, + "grad_norm": 0.16262951493263245, + "learning_rate": 2.6258558456411996e-05, + "loss": 1.6586, + "step": 21757 + }, + { + "epoch": 6.6783302639656235, + "grad_norm": 0.1853780597448349, + "learning_rate": 2.6254184092518752e-05, + "loss": 1.7116, + "step": 21758 + }, + { + "epoch": 6.678637200736648, + "grad_norm": 0.17973974347114563, + "learning_rate": 2.6249809963295536e-05, + "loss": 1.7317, + "step": 21759 + }, + { + "epoch": 6.678944137507673, + "grad_norm": 0.21258050203323364, + "learning_rate": 2.6245436068785634e-05, + "loss": 1.7852, + "step": 21760 + }, + { + "epoch": 6.679251074278699, + "grad_norm": 0.18741287291049957, + "learning_rate": 2.6241062409032262e-05, + "loss": 1.7071, + "step": 21761 + }, + { + "epoch": 6.679558011049724, + "grad_norm": 0.20436155796051025, + "learning_rate": 2.623668898407864e-05, + "loss": 1.7683, + "step": 21762 + }, + { + "epoch": 6.679864947820749, + "grad_norm": 0.18840116262435913, + "learning_rate": 2.6232315793967977e-05, + "loss": 1.7335, + "step": 21763 + }, + { + "epoch": 6.680171884591774, + "grad_norm": 0.1968357264995575, + "learning_rate": 2.62279428387435e-05, + "loss": 1.6848, + "step": 21764 + }, + { + "epoch": 6.680478821362799, + "grad_norm": 0.1774388998746872, + "learning_rate": 2.622357011844844e-05, + "loss": 1.6943, + "step": 21765 + }, + { + "epoch": 6.680785758133824, + "grad_norm": 0.2424328327178955, + "learning_rate": 2.621919763312598e-05, + "loss": 1.7479, + "step": 21766 + }, + { + "epoch": 6.68109269490485, + "grad_norm": 
0.21220771968364716, + "learning_rate": 2.6214825382819353e-05, + "loss": 1.7384, + "step": 21767 + }, + { + "epoch": 6.681399631675875, + "grad_norm": 0.23322279751300812, + "learning_rate": 2.6210453367571764e-05, + "loss": 1.6625, + "step": 21768 + }, + { + "epoch": 6.6817065684469, + "grad_norm": 0.1726260483264923, + "learning_rate": 2.620608158742639e-05, + "loss": 1.7055, + "step": 21769 + }, + { + "epoch": 6.682013505217925, + "grad_norm": 0.25436410307884216, + "learning_rate": 2.6201710042426512e-05, + "loss": 1.7449, + "step": 21770 + }, + { + "epoch": 6.68232044198895, + "grad_norm": 0.20275171101093292, + "learning_rate": 2.619733873261524e-05, + "loss": 1.7575, + "step": 21771 + }, + { + "epoch": 6.6826273787599755, + "grad_norm": 0.24221903085708618, + "learning_rate": 2.6192967658035846e-05, + "loss": 1.7312, + "step": 21772 + }, + { + "epoch": 6.682934315531001, + "grad_norm": 0.30804362893104553, + "learning_rate": 2.6188596818731507e-05, + "loss": 1.7669, + "step": 21773 + }, + { + "epoch": 6.683241252302026, + "grad_norm": 0.1818273365497589, + "learning_rate": 2.6184226214745377e-05, + "loss": 1.7102, + "step": 21774 + }, + { + "epoch": 6.683548189073051, + "grad_norm": 0.28026455640792847, + "learning_rate": 2.6179855846120727e-05, + "loss": 1.7313, + "step": 21775 + }, + { + "epoch": 6.683855125844076, + "grad_norm": 0.26503586769104004, + "learning_rate": 2.6175485712900655e-05, + "loss": 1.7622, + "step": 21776 + }, + { + "epoch": 6.684162062615101, + "grad_norm": 0.19122248888015747, + "learning_rate": 2.6171115815128423e-05, + "loss": 1.7347, + "step": 21777 + }, + { + "epoch": 6.684468999386127, + "grad_norm": 0.18789063394069672, + "learning_rate": 2.6166746152847187e-05, + "loss": 1.7158, + "step": 21778 + }, + { + "epoch": 6.684775936157152, + "grad_norm": 0.17315362393856049, + "learning_rate": 2.6162376726100135e-05, + "loss": 1.6561, + "step": 21779 + }, + { + "epoch": 6.685082872928177, + "grad_norm": 0.20659680664539337, + 
"learning_rate": 2.615800753493045e-05, + "loss": 1.7063, + "step": 21780 + }, + { + "epoch": 6.685389809699202, + "grad_norm": 0.2051183432340622, + "learning_rate": 2.6153638579381307e-05, + "loss": 1.7213, + "step": 21781 + }, + { + "epoch": 6.685696746470227, + "grad_norm": 0.23349207639694214, + "learning_rate": 2.6149269859495884e-05, + "loss": 1.7453, + "step": 21782 + }, + { + "epoch": 6.686003683241252, + "grad_norm": 0.1979275941848755, + "learning_rate": 2.6144901375317355e-05, + "loss": 1.7482, + "step": 21783 + }, + { + "epoch": 6.686310620012278, + "grad_norm": 0.2742067873477936, + "learning_rate": 2.61405331268889e-05, + "loss": 1.7114, + "step": 21784 + }, + { + "epoch": 6.686617556783302, + "grad_norm": 0.18656300008296967, + "learning_rate": 2.6136165114253675e-05, + "loss": 1.7114, + "step": 21785 + }, + { + "epoch": 6.6869244935543275, + "grad_norm": 0.19345268607139587, + "learning_rate": 2.6131797337454834e-05, + "loss": 1.6818, + "step": 21786 + }, + { + "epoch": 6.687231430325353, + "grad_norm": 0.2194962054491043, + "learning_rate": 2.6127429796535597e-05, + "loss": 1.7519, + "step": 21787 + }, + { + "epoch": 6.687538367096378, + "grad_norm": 0.21714645624160767, + "learning_rate": 2.6123062491539054e-05, + "loss": 1.7334, + "step": 21788 + }, + { + "epoch": 6.6878453038674035, + "grad_norm": 0.1684521585702896, + "learning_rate": 2.6118695422508444e-05, + "loss": 1.6843, + "step": 21789 + }, + { + "epoch": 6.688152240638429, + "grad_norm": 0.16155442595481873, + "learning_rate": 2.6114328589486865e-05, + "loss": 1.6541, + "step": 21790 + }, + { + "epoch": 6.688459177409453, + "grad_norm": 0.18483634293079376, + "learning_rate": 2.6109961992517462e-05, + "loss": 1.688, + "step": 21791 + }, + { + "epoch": 6.688766114180479, + "grad_norm": 0.23146624863147736, + "learning_rate": 2.6105595631643466e-05, + "loss": 1.8006, + "step": 21792 + }, + { + "epoch": 6.689073050951504, + "grad_norm": 0.1852748543024063, + "learning_rate": 
2.6101229506907937e-05, + "loss": 1.6624, + "step": 21793 + }, + { + "epoch": 6.689379987722529, + "grad_norm": 0.23809482157230377, + "learning_rate": 2.6096863618354105e-05, + "loss": 1.7313, + "step": 21794 + }, + { + "epoch": 6.689686924493555, + "grad_norm": 0.17145361006259918, + "learning_rate": 2.609249796602503e-05, + "loss": 1.6966, + "step": 21795 + }, + { + "epoch": 6.689993861264579, + "grad_norm": 0.1842796355485916, + "learning_rate": 2.6088132549963933e-05, + "loss": 1.6871, + "step": 21796 + }, + { + "epoch": 6.690300798035604, + "grad_norm": 0.1810201108455658, + "learning_rate": 2.608376737021392e-05, + "loss": 1.7509, + "step": 21797 + }, + { + "epoch": 6.69060773480663, + "grad_norm": 0.20428195595741272, + "learning_rate": 2.607940242681814e-05, + "loss": 1.7102, + "step": 21798 + }, + { + "epoch": 6.690914671577655, + "grad_norm": 0.1659073680639267, + "learning_rate": 2.6075037719819716e-05, + "loss": 1.7053, + "step": 21799 + }, + { + "epoch": 6.69122160834868, + "grad_norm": 0.19351087510585785, + "learning_rate": 2.60706732492618e-05, + "loss": 1.6847, + "step": 21800 + }, + { + "epoch": 6.691528545119706, + "grad_norm": 0.1734616905450821, + "learning_rate": 2.6066309015187517e-05, + "loss": 1.6989, + "step": 21801 + }, + { + "epoch": 6.69183548189073, + "grad_norm": 0.1863887459039688, + "learning_rate": 2.6061945017639995e-05, + "loss": 1.665, + "step": 21802 + }, + { + "epoch": 6.6921424186617555, + "grad_norm": 0.20225204527378082, + "learning_rate": 2.6057581256662344e-05, + "loss": 1.718, + "step": 21803 + }, + { + "epoch": 6.692449355432781, + "grad_norm": 0.22148309648036957, + "learning_rate": 2.605321773229774e-05, + "loss": 1.7801, + "step": 21804 + }, + { + "epoch": 6.692756292203806, + "grad_norm": 0.1870507448911667, + "learning_rate": 2.6048854444589242e-05, + "loss": 1.6613, + "step": 21805 + }, + { + "epoch": 6.6930632289748315, + "grad_norm": 0.18597224354743958, + "learning_rate": 2.604449139358004e-05, + "loss": 
1.7284, + "step": 21806 + }, + { + "epoch": 6.693370165745856, + "grad_norm": 0.2082163542509079, + "learning_rate": 2.6040128579313193e-05, + "loss": 1.7456, + "step": 21807 + }, + { + "epoch": 6.693677102516881, + "grad_norm": 0.22506757080554962, + "learning_rate": 2.603576600183183e-05, + "loss": 1.7369, + "step": 21808 + }, + { + "epoch": 6.693984039287907, + "grad_norm": 0.20707464218139648, + "learning_rate": 2.60314036611791e-05, + "loss": 1.7176, + "step": 21809 + }, + { + "epoch": 6.694290976058932, + "grad_norm": 0.2306852787733078, + "learning_rate": 2.6027041557398053e-05, + "loss": 1.7582, + "step": 21810 + }, + { + "epoch": 6.694597912829957, + "grad_norm": 0.23120234906673431, + "learning_rate": 2.602267969053187e-05, + "loss": 1.7169, + "step": 21811 + }, + { + "epoch": 6.694904849600983, + "grad_norm": 0.24841509759426117, + "learning_rate": 2.6018318060623582e-05, + "loss": 1.7636, + "step": 21812 + }, + { + "epoch": 6.695211786372007, + "grad_norm": 0.22443681955337524, + "learning_rate": 2.601395666771635e-05, + "loss": 1.7465, + "step": 21813 + }, + { + "epoch": 6.695518723143032, + "grad_norm": 0.2905699908733368, + "learning_rate": 2.6009595511853257e-05, + "loss": 1.779, + "step": 21814 + }, + { + "epoch": 6.695825659914058, + "grad_norm": 0.18677717447280884, + "learning_rate": 2.60052345930774e-05, + "loss": 1.711, + "step": 21815 + }, + { + "epoch": 6.696132596685083, + "grad_norm": 0.2150946855545044, + "learning_rate": 2.6000873911431883e-05, + "loss": 1.7254, + "step": 21816 + }, + { + "epoch": 6.696439533456108, + "grad_norm": 0.20066408812999725, + "learning_rate": 2.5996513466959794e-05, + "loss": 1.7198, + "step": 21817 + }, + { + "epoch": 6.696746470227133, + "grad_norm": 0.23815886676311493, + "learning_rate": 2.5992153259704228e-05, + "loss": 1.749, + "step": 21818 + }, + { + "epoch": 6.697053406998158, + "grad_norm": 0.2067428082227707, + "learning_rate": 2.5987793289708273e-05, + "loss": 1.736, + "step": 21819 + }, + { + 
"epoch": 6.6973603437691835, + "grad_norm": 0.2126816362142563, + "learning_rate": 2.5983433557015e-05, + "loss": 1.6804, + "step": 21820 + }, + { + "epoch": 6.697667280540209, + "grad_norm": 0.2003033310174942, + "learning_rate": 2.597907406166756e-05, + "loss": 1.7303, + "step": 21821 + }, + { + "epoch": 6.697974217311234, + "grad_norm": 0.238821879029274, + "learning_rate": 2.5974714803708946e-05, + "loss": 1.7399, + "step": 21822 + }, + { + "epoch": 6.6982811540822595, + "grad_norm": 0.21327996253967285, + "learning_rate": 2.597035578318231e-05, + "loss": 1.766, + "step": 21823 + }, + { + "epoch": 6.698588090853284, + "grad_norm": 0.19689476490020752, + "learning_rate": 2.5965997000130694e-05, + "loss": 1.7621, + "step": 21824 + }, + { + "epoch": 6.698895027624309, + "grad_norm": 0.18349261581897736, + "learning_rate": 2.5961638454597158e-05, + "loss": 1.6339, + "step": 21825 + }, + { + "epoch": 6.699201964395335, + "grad_norm": 0.21475930511951447, + "learning_rate": 2.595728014662484e-05, + "loss": 1.6973, + "step": 21826 + }, + { + "epoch": 6.69950890116636, + "grad_norm": 0.2711705267429352, + "learning_rate": 2.5952922076256737e-05, + "loss": 1.7801, + "step": 21827 + }, + { + "epoch": 6.699815837937384, + "grad_norm": 0.2601792514324188, + "learning_rate": 2.5948564243535988e-05, + "loss": 1.7508, + "step": 21828 + }, + { + "epoch": 6.70012277470841, + "grad_norm": 0.206949844956398, + "learning_rate": 2.5944206648505586e-05, + "loss": 1.7853, + "step": 21829 + }, + { + "epoch": 6.700429711479435, + "grad_norm": 0.25003641843795776, + "learning_rate": 2.5939849291208653e-05, + "loss": 1.766, + "step": 21830 + }, + { + "epoch": 6.7007366482504604, + "grad_norm": 0.25864318013191223, + "learning_rate": 2.593549217168823e-05, + "loss": 1.7778, + "step": 21831 + }, + { + "epoch": 6.701043585021486, + "grad_norm": 0.20212729275226593, + "learning_rate": 2.593113528998738e-05, + "loss": 1.7249, + "step": 21832 + }, + { + "epoch": 6.701350521792511, + 
"grad_norm": 0.2518431842327118, + "learning_rate": 2.5926778646149154e-05, + "loss": 1.7466, + "step": 21833 + }, + { + "epoch": 6.701657458563536, + "grad_norm": 0.24284590780735016, + "learning_rate": 2.5922422240216614e-05, + "loss": 1.8309, + "step": 21834 + }, + { + "epoch": 6.701964395334561, + "grad_norm": 0.21829955279827118, + "learning_rate": 2.5918066072232817e-05, + "loss": 1.7458, + "step": 21835 + }, + { + "epoch": 6.702271332105586, + "grad_norm": 0.2842165231704712, + "learning_rate": 2.5913710142240792e-05, + "loss": 1.7379, + "step": 21836 + }, + { + "epoch": 6.702578268876612, + "grad_norm": 0.19648514688014984, + "learning_rate": 2.590935445028359e-05, + "loss": 1.7141, + "step": 21837 + }, + { + "epoch": 6.702885205647637, + "grad_norm": 0.24336646497249603, + "learning_rate": 2.5904998996404305e-05, + "loss": 1.6719, + "step": 21838 + }, + { + "epoch": 6.703192142418661, + "grad_norm": 0.17288628220558167, + "learning_rate": 2.5900643780645905e-05, + "loss": 1.6982, + "step": 21839 + }, + { + "epoch": 6.703499079189687, + "grad_norm": 0.24906334280967712, + "learning_rate": 2.5896288803051505e-05, + "loss": 1.6873, + "step": 21840 + }, + { + "epoch": 6.703806015960712, + "grad_norm": 0.2177029550075531, + "learning_rate": 2.5891934063664085e-05, + "loss": 1.6884, + "step": 21841 + }, + { + "epoch": 6.704112952731737, + "grad_norm": 0.20478956401348114, + "learning_rate": 2.5887579562526688e-05, + "loss": 1.7342, + "step": 21842 + }, + { + "epoch": 6.704419889502763, + "grad_norm": 0.26212164759635925, + "learning_rate": 2.58832252996824e-05, + "loss": 1.7304, + "step": 21843 + }, + { + "epoch": 6.704726826273788, + "grad_norm": 0.2049340009689331, + "learning_rate": 2.587887127517418e-05, + "loss": 1.7472, + "step": 21844 + }, + { + "epoch": 6.7050337630448125, + "grad_norm": 0.2453075796365738, + "learning_rate": 2.587451748904512e-05, + "loss": 1.7443, + "step": 21845 + }, + { + "epoch": 6.705340699815838, + "grad_norm": 
0.19545187056064606, + "learning_rate": 2.5870163941338188e-05, + "loss": 1.7328, + "step": 21846 + }, + { + "epoch": 6.705647636586863, + "grad_norm": 0.24424482882022858, + "learning_rate": 2.5865810632096456e-05, + "loss": 1.6876, + "step": 21847 + }, + { + "epoch": 6.7059545733578885, + "grad_norm": 0.2150830626487732, + "learning_rate": 2.5861457561362922e-05, + "loss": 1.7272, + "step": 21848 + }, + { + "epoch": 6.706261510128914, + "grad_norm": 0.2632520794868469, + "learning_rate": 2.5857104729180626e-05, + "loss": 1.7542, + "step": 21849 + }, + { + "epoch": 6.706568446899938, + "grad_norm": 0.21789421141147614, + "learning_rate": 2.5852752135592563e-05, + "loss": 1.6856, + "step": 21850 + }, + { + "epoch": 6.706875383670964, + "grad_norm": 0.2227005511522293, + "learning_rate": 2.5848399780641758e-05, + "loss": 1.7473, + "step": 21851 + }, + { + "epoch": 6.707182320441989, + "grad_norm": 0.23424866795539856, + "learning_rate": 2.5844047664371218e-05, + "loss": 1.7016, + "step": 21852 + }, + { + "epoch": 6.707489257213014, + "grad_norm": 0.2125028669834137, + "learning_rate": 2.5839695786823964e-05, + "loss": 1.8296, + "step": 21853 + }, + { + "epoch": 6.70779619398404, + "grad_norm": 0.2533423900604248, + "learning_rate": 2.5835344148042972e-05, + "loss": 1.7237, + "step": 21854 + }, + { + "epoch": 6.708103130755065, + "grad_norm": 0.1951744705438614, + "learning_rate": 2.583099274807132e-05, + "loss": 1.6685, + "step": 21855 + }, + { + "epoch": 6.708410067526089, + "grad_norm": 0.2564519941806793, + "learning_rate": 2.5826641586951938e-05, + "loss": 1.7542, + "step": 21856 + }, + { + "epoch": 6.708717004297115, + "grad_norm": 0.2586502134799957, + "learning_rate": 2.5822290664727856e-05, + "loss": 1.7477, + "step": 21857 + }, + { + "epoch": 6.70902394106814, + "grad_norm": 0.30357107520103455, + "learning_rate": 2.5817939981442062e-05, + "loss": 1.7454, + "step": 21858 + }, + { + "epoch": 6.709330877839165, + "grad_norm": 0.20547500252723694, + 
"learning_rate": 2.5813589537137544e-05, + "loss": 1.7517, + "step": 21859 + }, + { + "epoch": 6.70963781461019, + "grad_norm": 0.2961783707141876, + "learning_rate": 2.5809239331857348e-05, + "loss": 1.698, + "step": 21860 + }, + { + "epoch": 6.709944751381215, + "grad_norm": 0.2062019556760788, + "learning_rate": 2.580488936564439e-05, + "loss": 1.7358, + "step": 21861 + }, + { + "epoch": 6.7102516881522405, + "grad_norm": 0.22287480533123016, + "learning_rate": 2.580053963854173e-05, + "loss": 1.7099, + "step": 21862 + }, + { + "epoch": 6.710558624923266, + "grad_norm": 0.1853112131357193, + "learning_rate": 2.579619015059229e-05, + "loss": 1.7493, + "step": 21863 + }, + { + "epoch": 6.710865561694291, + "grad_norm": 0.24855247139930725, + "learning_rate": 2.5791840901839105e-05, + "loss": 1.7248, + "step": 21864 + }, + { + "epoch": 6.7111724984653165, + "grad_norm": 0.18156948685646057, + "learning_rate": 2.5787491892325126e-05, + "loss": 1.6744, + "step": 21865 + }, + { + "epoch": 6.711479435236341, + "grad_norm": 0.3272082209587097, + "learning_rate": 2.5783143122093357e-05, + "loss": 1.7546, + "step": 21866 + }, + { + "epoch": 6.711786372007366, + "grad_norm": 0.2875421643257141, + "learning_rate": 2.577879459118675e-05, + "loss": 1.6477, + "step": 21867 + }, + { + "epoch": 6.712093308778392, + "grad_norm": 0.19682031869888306, + "learning_rate": 2.5774446299648297e-05, + "loss": 1.7455, + "step": 21868 + }, + { + "epoch": 6.712400245549417, + "grad_norm": 0.32829195261001587, + "learning_rate": 2.5770098247520968e-05, + "loss": 1.7817, + "step": 21869 + }, + { + "epoch": 6.712707182320442, + "grad_norm": 0.26227760314941406, + "learning_rate": 2.5765750434847724e-05, + "loss": 1.763, + "step": 21870 + }, + { + "epoch": 6.713014119091467, + "grad_norm": 0.2902637720108032, + "learning_rate": 2.576140286167152e-05, + "loss": 1.7432, + "step": 21871 + }, + { + "epoch": 6.713321055862492, + "grad_norm": 0.2290763407945633, + "learning_rate": 
2.5757055528035377e-05, + "loss": 1.7149, + "step": 21872 + }, + { + "epoch": 6.713627992633517, + "grad_norm": 0.3445907533168793, + "learning_rate": 2.575270843398221e-05, + "loss": 1.7874, + "step": 21873 + }, + { + "epoch": 6.713934929404543, + "grad_norm": 0.1841191053390503, + "learning_rate": 2.574836157955498e-05, + "loss": 1.6954, + "step": 21874 + }, + { + "epoch": 6.714241866175568, + "grad_norm": 0.24168385565280914, + "learning_rate": 2.5744014964796657e-05, + "loss": 1.7153, + "step": 21875 + }, + { + "epoch": 6.714548802946593, + "grad_norm": 0.17855188250541687, + "learning_rate": 2.5739668589750175e-05, + "loss": 1.7329, + "step": 21876 + }, + { + "epoch": 6.714855739717618, + "grad_norm": 0.189789280295372, + "learning_rate": 2.5735322454458554e-05, + "loss": 1.6854, + "step": 21877 + }, + { + "epoch": 6.715162676488643, + "grad_norm": 0.1792519986629486, + "learning_rate": 2.5730976558964647e-05, + "loss": 1.7483, + "step": 21878 + }, + { + "epoch": 6.7154696132596685, + "grad_norm": 0.24460360407829285, + "learning_rate": 2.5726630903311504e-05, + "loss": 1.8337, + "step": 21879 + }, + { + "epoch": 6.715776550030694, + "grad_norm": 0.21612058579921722, + "learning_rate": 2.572228548754198e-05, + "loss": 1.7293, + "step": 21880 + }, + { + "epoch": 6.716083486801719, + "grad_norm": 0.22057892382144928, + "learning_rate": 2.5717940311699078e-05, + "loss": 1.7269, + "step": 21881 + }, + { + "epoch": 6.716390423572744, + "grad_norm": 0.19635777175426483, + "learning_rate": 2.571359537582572e-05, + "loss": 1.6744, + "step": 21882 + }, + { + "epoch": 6.716697360343769, + "grad_norm": 0.20406895875930786, + "learning_rate": 2.570925067996485e-05, + "loss": 1.6866, + "step": 21883 + }, + { + "epoch": 6.717004297114794, + "grad_norm": 0.1942419856786728, + "learning_rate": 2.5704906224159407e-05, + "loss": 1.724, + "step": 21884 + }, + { + "epoch": 6.71731123388582, + "grad_norm": 0.20423445105552673, + "learning_rate": 2.570056200845231e-05, + "loss": 
1.6709, + "step": 21885 + }, + { + "epoch": 6.717618170656845, + "grad_norm": 0.27171632647514343, + "learning_rate": 2.569621803288651e-05, + "loss": 1.7532, + "step": 21886 + }, + { + "epoch": 6.71792510742787, + "grad_norm": 0.22753871977329254, + "learning_rate": 2.5691874297504926e-05, + "loss": 1.7534, + "step": 21887 + }, + { + "epoch": 6.718232044198895, + "grad_norm": 0.1907290369272232, + "learning_rate": 2.5687530802350468e-05, + "loss": 1.6696, + "step": 21888 + }, + { + "epoch": 6.71853898096992, + "grad_norm": 0.2226637750864029, + "learning_rate": 2.568318754746612e-05, + "loss": 1.7194, + "step": 21889 + }, + { + "epoch": 6.718845917740945, + "grad_norm": 0.20878726243972778, + "learning_rate": 2.5678844532894742e-05, + "loss": 1.6878, + "step": 21890 + }, + { + "epoch": 6.719152854511971, + "grad_norm": 0.18087267875671387, + "learning_rate": 2.567450175867928e-05, + "loss": 1.7432, + "step": 21891 + }, + { + "epoch": 6.719459791282996, + "grad_norm": 0.19818328320980072, + "learning_rate": 2.567015922486265e-05, + "loss": 1.6959, + "step": 21892 + }, + { + "epoch": 6.7197667280540205, + "grad_norm": 0.19593466818332672, + "learning_rate": 2.566581693148775e-05, + "loss": 1.7357, + "step": 21893 + }, + { + "epoch": 6.720073664825046, + "grad_norm": 0.24518795311450958, + "learning_rate": 2.5661474878597546e-05, + "loss": 1.7948, + "step": 21894 + }, + { + "epoch": 6.720380601596071, + "grad_norm": 0.18471074104309082, + "learning_rate": 2.5657133066234872e-05, + "loss": 1.6983, + "step": 21895 + }, + { + "epoch": 6.7206875383670965, + "grad_norm": 0.20073382556438446, + "learning_rate": 2.5652791494442718e-05, + "loss": 1.7241, + "step": 21896 + }, + { + "epoch": 6.720994475138122, + "grad_norm": 0.21688152849674225, + "learning_rate": 2.5648450163263903e-05, + "loss": 1.7073, + "step": 21897 + }, + { + "epoch": 6.721301411909147, + "grad_norm": 0.17722688615322113, + "learning_rate": 2.5644109072741406e-05, + "loss": 1.7047, + "step": 21898 + }, + 
{ + "epoch": 6.721608348680172, + "grad_norm": 0.2060708999633789, + "learning_rate": 2.5639768222918093e-05, + "loss": 1.7246, + "step": 21899 + }, + { + "epoch": 6.721915285451197, + "grad_norm": 0.26590242981910706, + "learning_rate": 2.563542761383687e-05, + "loss": 1.8141, + "step": 21900 + }, + { + "epoch": 6.722222222222222, + "grad_norm": 0.22498780488967896, + "learning_rate": 2.5631087245540632e-05, + "loss": 1.7211, + "step": 21901 + }, + { + "epoch": 6.722529158993248, + "grad_norm": 0.20546968281269073, + "learning_rate": 2.562674711807227e-05, + "loss": 1.8001, + "step": 21902 + }, + { + "epoch": 6.722836095764272, + "grad_norm": 0.19668535888195038, + "learning_rate": 2.5622407231474683e-05, + "loss": 1.7443, + "step": 21903 + }, + { + "epoch": 6.723143032535297, + "grad_norm": 0.18932129442691803, + "learning_rate": 2.5618067585790752e-05, + "loss": 1.7307, + "step": 21904 + }, + { + "epoch": 6.723449969306323, + "grad_norm": 0.19501622021198273, + "learning_rate": 2.561372818106335e-05, + "loss": 1.7016, + "step": 21905 + }, + { + "epoch": 6.723756906077348, + "grad_norm": 0.21313562989234924, + "learning_rate": 2.5609389017335416e-05, + "loss": 1.8012, + "step": 21906 + }, + { + "epoch": 6.724063842848373, + "grad_norm": 0.174738347530365, + "learning_rate": 2.560505009464978e-05, + "loss": 1.6824, + "step": 21907 + }, + { + "epoch": 6.724370779619399, + "grad_norm": 0.20349650084972382, + "learning_rate": 2.560071141304934e-05, + "loss": 1.7813, + "step": 21908 + }, + { + "epoch": 6.724677716390423, + "grad_norm": 0.21878227591514587, + "learning_rate": 2.5596372972576967e-05, + "loss": 1.8166, + "step": 21909 + }, + { + "epoch": 6.7249846531614486, + "grad_norm": 0.2082633078098297, + "learning_rate": 2.559203477327552e-05, + "loss": 1.7197, + "step": 21910 + }, + { + "epoch": 6.725291589932474, + "grad_norm": 0.17738287150859833, + "learning_rate": 2.558769681518792e-05, + "loss": 1.7093, + "step": 21911 + }, + { + "epoch": 6.725598526703499, + 
"grad_norm": 0.1930074542760849, + "learning_rate": 2.5583359098356986e-05, + "loss": 1.7702, + "step": 21912 + }, + { + "epoch": 6.725905463474525, + "grad_norm": 0.17668531835079193, + "learning_rate": 2.5579021622825638e-05, + "loss": 1.7466, + "step": 21913 + }, + { + "epoch": 6.726212400245549, + "grad_norm": 0.1737186163663864, + "learning_rate": 2.5574684388636677e-05, + "loss": 1.6876, + "step": 21914 + }, + { + "epoch": 6.726519337016574, + "grad_norm": 0.18352502584457397, + "learning_rate": 2.5570347395833018e-05, + "loss": 1.6745, + "step": 21915 + }, + { + "epoch": 6.7268262737876, + "grad_norm": 0.19047673046588898, + "learning_rate": 2.5566010644457506e-05, + "loss": 1.7465, + "step": 21916 + }, + { + "epoch": 6.727133210558625, + "grad_norm": 0.1762397438287735, + "learning_rate": 2.5561674134553005e-05, + "loss": 1.6767, + "step": 21917 + }, + { + "epoch": 6.72744014732965, + "grad_norm": 0.22884784638881683, + "learning_rate": 2.5557337866162358e-05, + "loss": 1.7054, + "step": 21918 + }, + { + "epoch": 6.727747084100676, + "grad_norm": 0.17476098239421844, + "learning_rate": 2.5553001839328417e-05, + "loss": 1.721, + "step": 21919 + }, + { + "epoch": 6.7280540208717, + "grad_norm": 0.1827213317155838, + "learning_rate": 2.554866605409405e-05, + "loss": 1.78, + "step": 21920 + }, + { + "epoch": 6.7283609576427255, + "grad_norm": 0.21709343791007996, + "learning_rate": 2.554433051050209e-05, + "loss": 1.8064, + "step": 21921 + }, + { + "epoch": 6.728667894413751, + "grad_norm": 0.1972692310810089, + "learning_rate": 2.5539995208595398e-05, + "loss": 1.7231, + "step": 21922 + }, + { + "epoch": 6.728974831184776, + "grad_norm": 0.19464808702468872, + "learning_rate": 2.5535660148416802e-05, + "loss": 1.7931, + "step": 21923 + }, + { + "epoch": 6.7292817679558015, + "grad_norm": 0.19610099494457245, + "learning_rate": 2.5531325330009158e-05, + "loss": 1.7467, + "step": 21924 + }, + { + "epoch": 6.729588704726826, + "grad_norm": 0.21104763448238373, + 
"learning_rate": 2.5526990753415292e-05, + "loss": 1.7543, + "step": 21925 + }, + { + "epoch": 6.729895641497851, + "grad_norm": 0.1881588101387024, + "learning_rate": 2.5522656418678047e-05, + "loss": 1.7666, + "step": 21926 + }, + { + "epoch": 6.730202578268877, + "grad_norm": 0.2163291722536087, + "learning_rate": 2.551832232584025e-05, + "loss": 1.7321, + "step": 21927 + }, + { + "epoch": 6.730509515039902, + "grad_norm": 0.19252021610736847, + "learning_rate": 2.551398847494477e-05, + "loss": 1.7287, + "step": 21928 + }, + { + "epoch": 6.730816451810927, + "grad_norm": 0.22602233290672302, + "learning_rate": 2.550965486603437e-05, + "loss": 1.767, + "step": 21929 + }, + { + "epoch": 6.731123388581953, + "grad_norm": 0.21509617567062378, + "learning_rate": 2.5505321499151957e-05, + "loss": 1.7637, + "step": 21930 + }, + { + "epoch": 6.731430325352977, + "grad_norm": 0.24291658401489258, + "learning_rate": 2.5500988374340274e-05, + "loss": 1.7312, + "step": 21931 + }, + { + "epoch": 6.731737262124002, + "grad_norm": 0.26562216877937317, + "learning_rate": 2.5496655491642195e-05, + "loss": 1.7763, + "step": 21932 + }, + { + "epoch": 6.732044198895028, + "grad_norm": 0.19785790145397186, + "learning_rate": 2.5492322851100535e-05, + "loss": 1.6979, + "step": 21933 + }, + { + "epoch": 6.732351135666053, + "grad_norm": 0.20044486224651337, + "learning_rate": 2.5487990452758104e-05, + "loss": 1.7359, + "step": 21934 + }, + { + "epoch": 6.7326580724370775, + "grad_norm": 0.20468659698963165, + "learning_rate": 2.548365829665772e-05, + "loss": 1.6996, + "step": 21935 + }, + { + "epoch": 6.732965009208103, + "grad_norm": 0.16516120731830597, + "learning_rate": 2.5479326382842195e-05, + "loss": 1.717, + "step": 21936 + }, + { + "epoch": 6.733271945979128, + "grad_norm": 0.22404411435127258, + "learning_rate": 2.547499471135433e-05, + "loss": 1.7261, + "step": 21937 + }, + { + "epoch": 6.7335788827501535, + "grad_norm": 0.21485663950443268, + "learning_rate": 
2.547066328223695e-05, + "loss": 1.7463, + "step": 21938 + }, + { + "epoch": 6.733885819521179, + "grad_norm": 0.330018550157547, + "learning_rate": 2.5466332095532853e-05, + "loss": 1.854, + "step": 21939 + }, + { + "epoch": 6.734192756292204, + "grad_norm": 0.25225213170051575, + "learning_rate": 2.5462001151284842e-05, + "loss": 1.722, + "step": 21940 + }, + { + "epoch": 6.734499693063229, + "grad_norm": 0.2422008365392685, + "learning_rate": 2.5457670449535713e-05, + "loss": 1.6996, + "step": 21941 + }, + { + "epoch": 6.734806629834254, + "grad_norm": 0.2421465814113617, + "learning_rate": 2.5453339990328275e-05, + "loss": 1.7014, + "step": 21942 + }, + { + "epoch": 6.735113566605279, + "grad_norm": 0.2520611882209778, + "learning_rate": 2.5449009773705313e-05, + "loss": 1.7149, + "step": 21943 + }, + { + "epoch": 6.735420503376305, + "grad_norm": 0.24940338730812073, + "learning_rate": 2.5444679799709626e-05, + "loss": 1.7423, + "step": 21944 + }, + { + "epoch": 6.73572744014733, + "grad_norm": 0.2328663021326065, + "learning_rate": 2.544035006838401e-05, + "loss": 1.6893, + "step": 21945 + }, + { + "epoch": 6.736034376918354, + "grad_norm": 0.2190757393836975, + "learning_rate": 2.5436020579771226e-05, + "loss": 1.7375, + "step": 21946 + }, + { + "epoch": 6.73634131368938, + "grad_norm": 0.2204900085926056, + "learning_rate": 2.543169133391413e-05, + "loss": 1.6971, + "step": 21947 + }, + { + "epoch": 6.736648250460405, + "grad_norm": 0.29192328453063965, + "learning_rate": 2.5427362330855415e-05, + "loss": 1.7633, + "step": 21948 + }, + { + "epoch": 6.73695518723143, + "grad_norm": 0.19859355688095093, + "learning_rate": 2.542303357063793e-05, + "loss": 1.7515, + "step": 21949 + }, + { + "epoch": 6.737262124002456, + "grad_norm": 0.23010417819023132, + "learning_rate": 2.5418705053304425e-05, + "loss": 1.7282, + "step": 21950 + }, + { + "epoch": 6.737569060773481, + "grad_norm": 0.2168324589729309, + "learning_rate": 2.5414376778897698e-05, + "loss": 1.7347, 
+ "step": 21951 + }, + { + "epoch": 6.7378759975445055, + "grad_norm": 0.2190646231174469, + "learning_rate": 2.54100487474605e-05, + "loss": 1.7893, + "step": 21952 + }, + { + "epoch": 6.738182934315531, + "grad_norm": 0.23925794661045074, + "learning_rate": 2.5405720959035617e-05, + "loss": 1.7825, + "step": 21953 + }, + { + "epoch": 6.738489871086556, + "grad_norm": 0.17987917363643646, + "learning_rate": 2.5401393413665807e-05, + "loss": 1.724, + "step": 21954 + }, + { + "epoch": 6.7387968078575815, + "grad_norm": 0.2300983965396881, + "learning_rate": 2.5397066111393853e-05, + "loss": 1.7023, + "step": 21955 + }, + { + "epoch": 6.739103744628607, + "grad_norm": 0.2128167450428009, + "learning_rate": 2.539273905226251e-05, + "loss": 1.7218, + "step": 21956 + }, + { + "epoch": 6.739410681399631, + "grad_norm": 0.19105537235736847, + "learning_rate": 2.538841223631454e-05, + "loss": 1.7781, + "step": 21957 + }, + { + "epoch": 6.739717618170657, + "grad_norm": 0.22985289990901947, + "learning_rate": 2.5384085663592704e-05, + "loss": 1.7362, + "step": 21958 + }, + { + "epoch": 6.740024554941682, + "grad_norm": 0.18608705699443817, + "learning_rate": 2.5379759334139768e-05, + "loss": 1.7174, + "step": 21959 + }, + { + "epoch": 6.740331491712707, + "grad_norm": 0.2659450173377991, + "learning_rate": 2.5375433247998482e-05, + "loss": 1.8118, + "step": 21960 + }, + { + "epoch": 6.740638428483733, + "grad_norm": 0.1904401034116745, + "learning_rate": 2.537110740521159e-05, + "loss": 1.6789, + "step": 21961 + }, + { + "epoch": 6.740945365254758, + "grad_norm": 0.1826045662164688, + "learning_rate": 2.5366781805821847e-05, + "loss": 1.6906, + "step": 21962 + }, + { + "epoch": 6.741252302025782, + "grad_norm": 0.1919000893831253, + "learning_rate": 2.5362456449871995e-05, + "loss": 1.7412, + "step": 21963 + }, + { + "epoch": 6.741559238796808, + "grad_norm": 0.1921864151954651, + "learning_rate": 2.5358131337404822e-05, + "loss": 1.7023, + "step": 21964 + }, + { + "epoch": 
6.741866175567833, + "grad_norm": 0.1628783494234085, + "learning_rate": 2.5353806468463004e-05, + "loss": 1.6842, + "step": 21965 + }, + { + "epoch": 6.742173112338858, + "grad_norm": 0.19764694571495056, + "learning_rate": 2.534948184308935e-05, + "loss": 1.7238, + "step": 21966 + }, + { + "epoch": 6.742480049109884, + "grad_norm": 0.1845860630273819, + "learning_rate": 2.534515746132653e-05, + "loss": 1.728, + "step": 21967 + }, + { + "epoch": 6.742786985880908, + "grad_norm": 0.20269328355789185, + "learning_rate": 2.5340833323217327e-05, + "loss": 1.7541, + "step": 21968 + }, + { + "epoch": 6.7430939226519335, + "grad_norm": 0.16586242616176605, + "learning_rate": 2.5336509428804468e-05, + "loss": 1.7025, + "step": 21969 + }, + { + "epoch": 6.743400859422959, + "grad_norm": 0.1693086177110672, + "learning_rate": 2.533218577813068e-05, + "loss": 1.6975, + "step": 21970 + }, + { + "epoch": 6.743707796193984, + "grad_norm": 0.2206759750843048, + "learning_rate": 2.5327862371238686e-05, + "loss": 1.764, + "step": 21971 + }, + { + "epoch": 6.7440147329650095, + "grad_norm": 0.1915574073791504, + "learning_rate": 2.532353920817122e-05, + "loss": 1.7576, + "step": 21972 + }, + { + "epoch": 6.744321669736035, + "grad_norm": 0.1741783618927002, + "learning_rate": 2.5319216288971003e-05, + "loss": 1.7394, + "step": 21973 + }, + { + "epoch": 6.744628606507059, + "grad_norm": 0.21624934673309326, + "learning_rate": 2.5314893613680755e-05, + "loss": 1.7358, + "step": 21974 + }, + { + "epoch": 6.744935543278085, + "grad_norm": 0.2350481003522873, + "learning_rate": 2.5310571182343197e-05, + "loss": 1.7801, + "step": 21975 + }, + { + "epoch": 6.74524248004911, + "grad_norm": 0.18618559837341309, + "learning_rate": 2.5306248995001048e-05, + "loss": 1.7012, + "step": 21976 + }, + { + "epoch": 6.745549416820135, + "grad_norm": 0.18479639291763306, + "learning_rate": 2.5301927051697016e-05, + "loss": 1.7238, + "step": 21977 + }, + { + "epoch": 6.74585635359116, + "grad_norm": 
0.19978758692741394, + "learning_rate": 2.5297605352473818e-05, + "loss": 1.6636, + "step": 21978 + }, + { + "epoch": 6.746163290362185, + "grad_norm": 0.23122164607048035, + "learning_rate": 2.529328389737416e-05, + "loss": 1.7455, + "step": 21979 + }, + { + "epoch": 6.74647022713321, + "grad_norm": 0.20423240959644318, + "learning_rate": 2.5288962686440732e-05, + "loss": 1.7516, + "step": 21980 + }, + { + "epoch": 6.746777163904236, + "grad_norm": 0.18271920084953308, + "learning_rate": 2.52846417197163e-05, + "loss": 1.762, + "step": 21981 + }, + { + "epoch": 6.747084100675261, + "grad_norm": 0.19280247390270233, + "learning_rate": 2.528032099724349e-05, + "loss": 1.7298, + "step": 21982 + }, + { + "epoch": 6.747391037446286, + "grad_norm": 0.20908337831497192, + "learning_rate": 2.527600051906507e-05, + "loss": 1.7323, + "step": 21983 + }, + { + "epoch": 6.747697974217311, + "grad_norm": 0.18399856984615326, + "learning_rate": 2.5271680285223663e-05, + "loss": 1.6795, + "step": 21984 + }, + { + "epoch": 6.748004910988336, + "grad_norm": 0.2273191213607788, + "learning_rate": 2.5267360295762033e-05, + "loss": 1.6811, + "step": 21985 + }, + { + "epoch": 6.7483118477593615, + "grad_norm": 0.1844841092824936, + "learning_rate": 2.526304055072284e-05, + "loss": 1.7404, + "step": 21986 + }, + { + "epoch": 6.748618784530387, + "grad_norm": 0.25975871086120605, + "learning_rate": 2.5258721050148775e-05, + "loss": 1.6994, + "step": 21987 + }, + { + "epoch": 6.748925721301412, + "grad_norm": 0.1664818376302719, + "learning_rate": 2.5254401794082532e-05, + "loss": 1.6722, + "step": 21988 + }, + { + "epoch": 6.749232658072437, + "grad_norm": 0.2597639560699463, + "learning_rate": 2.5250082782566796e-05, + "loss": 1.7654, + "step": 21989 + }, + { + "epoch": 6.749539594843462, + "grad_norm": 0.19326356053352356, + "learning_rate": 2.5245764015644248e-05, + "loss": 1.668, + "step": 21990 + }, + { + "epoch": 6.749846531614487, + "grad_norm": 0.22924599051475525, + 
"learning_rate": 2.5241445493357574e-05, + "loss": 1.7522, + "step": 21991 + }, + { + "epoch": 6.750153468385513, + "grad_norm": 0.24588358402252197, + "learning_rate": 2.523712721574944e-05, + "loss": 1.7396, + "step": 21992 + }, + { + "epoch": 6.750460405156538, + "grad_norm": 0.1988971084356308, + "learning_rate": 2.5232809182862526e-05, + "loss": 1.7338, + "step": 21993 + }, + { + "epoch": 6.750767341927563, + "grad_norm": 0.18566425144672394, + "learning_rate": 2.5228491394739518e-05, + "loss": 1.7135, + "step": 21994 + }, + { + "epoch": 6.751074278698588, + "grad_norm": 0.22216622531414032, + "learning_rate": 2.5224173851423073e-05, + "loss": 1.744, + "step": 21995 + }, + { + "epoch": 6.751381215469613, + "grad_norm": 0.18695887923240662, + "learning_rate": 2.5219856552955863e-05, + "loss": 1.7324, + "step": 21996 + }, + { + "epoch": 6.7516881522406385, + "grad_norm": 0.1866987645626068, + "learning_rate": 2.5215539499380535e-05, + "loss": 1.6855, + "step": 21997 + }, + { + "epoch": 6.751995089011664, + "grad_norm": 0.1743573248386383, + "learning_rate": 2.521122269073981e-05, + "loss": 1.6833, + "step": 21998 + }, + { + "epoch": 6.752302025782689, + "grad_norm": 0.2173541784286499, + "learning_rate": 2.5206906127076274e-05, + "loss": 1.7434, + "step": 21999 + }, + { + "epoch": 6.752608962553714, + "grad_norm": 0.17558147013187408, + "learning_rate": 2.5202589808432665e-05, + "loss": 1.6884, + "step": 22000 + }, + { + "epoch": 6.752915899324739, + "grad_norm": 0.16630353033542633, + "learning_rate": 2.5198273734851553e-05, + "loss": 1.7005, + "step": 22001 + }, + { + "epoch": 6.753222836095764, + "grad_norm": 0.1834949105978012, + "learning_rate": 2.519395790637566e-05, + "loss": 1.7123, + "step": 22002 + }, + { + "epoch": 6.75352977286679, + "grad_norm": 0.1806751936674118, + "learning_rate": 2.5189642323047614e-05, + "loss": 1.7305, + "step": 22003 + }, + { + "epoch": 6.753836709637815, + "grad_norm": 0.2350265085697174, + "learning_rate": 
2.5185326984910062e-05, + "loss": 1.772, + "step": 22004 + }, + { + "epoch": 6.75414364640884, + "grad_norm": 0.18105818331241608, + "learning_rate": 2.518101189200566e-05, + "loss": 1.7487, + "step": 22005 + }, + { + "epoch": 6.754450583179865, + "grad_norm": 0.17640845477581024, + "learning_rate": 2.517669704437704e-05, + "loss": 1.7178, + "step": 22006 + }, + { + "epoch": 6.75475751995089, + "grad_norm": 0.21648885309696198, + "learning_rate": 2.5172382442066845e-05, + "loss": 1.7144, + "step": 22007 + }, + { + "epoch": 6.755064456721915, + "grad_norm": 0.2042703926563263, + "learning_rate": 2.5168068085117724e-05, + "loss": 1.7476, + "step": 22008 + }, + { + "epoch": 6.755371393492941, + "grad_norm": 0.24397306144237518, + "learning_rate": 2.5163753973572306e-05, + "loss": 1.7033, + "step": 22009 + }, + { + "epoch": 6.755678330263965, + "grad_norm": 0.2030377835035324, + "learning_rate": 2.5159440107473232e-05, + "loss": 1.7353, + "step": 22010 + }, + { + "epoch": 6.7559852670349905, + "grad_norm": 0.2493598908185959, + "learning_rate": 2.5155126486863127e-05, + "loss": 1.7346, + "step": 22011 + }, + { + "epoch": 6.756292203806016, + "grad_norm": 0.17272062599658966, + "learning_rate": 2.5150813111784627e-05, + "loss": 1.7095, + "step": 22012 + }, + { + "epoch": 6.756599140577041, + "grad_norm": 0.2417706698179245, + "learning_rate": 2.514649998228036e-05, + "loss": 1.7631, + "step": 22013 + }, + { + "epoch": 6.7569060773480665, + "grad_norm": 0.17753612995147705, + "learning_rate": 2.5142187098392915e-05, + "loss": 1.697, + "step": 22014 + }, + { + "epoch": 6.757213014119092, + "grad_norm": 0.2246367186307907, + "learning_rate": 2.5137874460164995e-05, + "loss": 1.7216, + "step": 22015 + }, + { + "epoch": 6.757519950890116, + "grad_norm": 0.24141135811805725, + "learning_rate": 2.5133562067639134e-05, + "loss": 1.7368, + "step": 22016 + }, + { + "epoch": 6.757826887661142, + "grad_norm": 0.21253570914268494, + "learning_rate": 2.5129249920858022e-05, + "loss": 
1.7029, + "step": 22017 + }, + { + "epoch": 6.758133824432167, + "grad_norm": 0.21176676452159882, + "learning_rate": 2.5124938019864198e-05, + "loss": 1.7472, + "step": 22018 + }, + { + "epoch": 6.758440761203192, + "grad_norm": 0.1990927904844284, + "learning_rate": 2.5120626364700338e-05, + "loss": 1.6686, + "step": 22019 + }, + { + "epoch": 6.758747697974218, + "grad_norm": 0.1736145317554474, + "learning_rate": 2.5116314955409038e-05, + "loss": 1.6984, + "step": 22020 + }, + { + "epoch": 6.759054634745242, + "grad_norm": 0.2618037462234497, + "learning_rate": 2.511200379203289e-05, + "loss": 1.7374, + "step": 22021 + }, + { + "epoch": 6.759361571516267, + "grad_norm": 0.25363266468048096, + "learning_rate": 2.5107692874614507e-05, + "loss": 1.7001, + "step": 22022 + }, + { + "epoch": 6.759668508287293, + "grad_norm": 0.20287153124809265, + "learning_rate": 2.51033822031965e-05, + "loss": 1.7704, + "step": 22023 + }, + { + "epoch": 6.759975445058318, + "grad_norm": 0.2401949167251587, + "learning_rate": 2.509907177782146e-05, + "loss": 1.7157, + "step": 22024 + }, + { + "epoch": 6.760282381829343, + "grad_norm": 0.177081897854805, + "learning_rate": 2.5094761598531985e-05, + "loss": 1.7572, + "step": 22025 + }, + { + "epoch": 6.760589318600369, + "grad_norm": 0.2641974687576294, + "learning_rate": 2.5090451665370674e-05, + "loss": 1.725, + "step": 22026 + }, + { + "epoch": 6.760896255371393, + "grad_norm": 0.20262297987937927, + "learning_rate": 2.5086141978380116e-05, + "loss": 1.6591, + "step": 22027 + }, + { + "epoch": 6.7612031921424185, + "grad_norm": 0.19107301533222198, + "learning_rate": 2.5081832537602913e-05, + "loss": 1.6914, + "step": 22028 + }, + { + "epoch": 6.761510128913444, + "grad_norm": 0.28122687339782715, + "learning_rate": 2.5077523343081643e-05, + "loss": 1.7759, + "step": 22029 + }, + { + "epoch": 6.761817065684469, + "grad_norm": 0.16575101017951965, + "learning_rate": 2.5073214394858897e-05, + "loss": 1.6994, + "step": 22030 + }, + { + 
"epoch": 6.7621240024554945, + "grad_norm": 0.26933449506759644, + "learning_rate": 2.506890569297723e-05, + "loss": 1.7565, + "step": 22031 + }, + { + "epoch": 6.762430939226519, + "grad_norm": 0.2452966868877411, + "learning_rate": 2.5064597237479292e-05, + "loss": 1.7442, + "step": 22032 + }, + { + "epoch": 6.762737875997544, + "grad_norm": 0.20781855285167694, + "learning_rate": 2.5060289028407585e-05, + "loss": 1.714, + "step": 22033 + }, + { + "epoch": 6.76304481276857, + "grad_norm": 0.1997823268175125, + "learning_rate": 2.5055981065804756e-05, + "loss": 1.7318, + "step": 22034 + }, + { + "epoch": 6.763351749539595, + "grad_norm": 0.2080194652080536, + "learning_rate": 2.50516733497133e-05, + "loss": 1.7466, + "step": 22035 + }, + { + "epoch": 6.76365868631062, + "grad_norm": 0.17558889091014862, + "learning_rate": 2.504736588017585e-05, + "loss": 1.7049, + "step": 22036 + }, + { + "epoch": 6.763965623081646, + "grad_norm": 0.1999572217464447, + "learning_rate": 2.5043058657234957e-05, + "loss": 1.7121, + "step": 22037 + }, + { + "epoch": 6.76427255985267, + "grad_norm": 0.16219176352024078, + "learning_rate": 2.5038751680933185e-05, + "loss": 1.698, + "step": 22038 + }, + { + "epoch": 6.764579496623695, + "grad_norm": 0.17965151369571686, + "learning_rate": 2.50344449513131e-05, + "loss": 1.7021, + "step": 22039 + }, + { + "epoch": 6.764886433394721, + "grad_norm": 0.18831093609333038, + "learning_rate": 2.5030138468417263e-05, + "loss": 1.7049, + "step": 22040 + }, + { + "epoch": 6.765193370165746, + "grad_norm": 0.20622828602790833, + "learning_rate": 2.5025832232288236e-05, + "loss": 1.7834, + "step": 22041 + }, + { + "epoch": 6.765500306936771, + "grad_norm": 0.22746746242046356, + "learning_rate": 2.5021526242968574e-05, + "loss": 1.7426, + "step": 22042 + }, + { + "epoch": 6.765807243707796, + "grad_norm": 0.2048977166414261, + "learning_rate": 2.5017220500500828e-05, + "loss": 1.7192, + "step": 22043 + }, + { + "epoch": 6.766114180478821, + 
"grad_norm": 0.19647538661956787, + "learning_rate": 2.5012915004927546e-05, + "loss": 1.6738, + "step": 22044 + }, + { + "epoch": 6.7664211172498465, + "grad_norm": 0.2133142054080963, + "learning_rate": 2.5008609756291284e-05, + "loss": 1.7482, + "step": 22045 + }, + { + "epoch": 6.766728054020872, + "grad_norm": 0.23578259348869324, + "learning_rate": 2.500430475463459e-05, + "loss": 1.696, + "step": 22046 + }, + { + "epoch": 6.767034990791897, + "grad_norm": 0.24862529337406158, + "learning_rate": 2.500000000000001e-05, + "loss": 1.7508, + "step": 22047 + }, + { + "epoch": 6.7673419275629225, + "grad_norm": 0.22704963386058807, + "learning_rate": 2.4995695492430066e-05, + "loss": 1.7739, + "step": 22048 + }, + { + "epoch": 6.767648864333947, + "grad_norm": 0.20216481387615204, + "learning_rate": 2.4991391231967347e-05, + "loss": 1.7406, + "step": 22049 + }, + { + "epoch": 6.767955801104972, + "grad_norm": 0.18778519332408905, + "learning_rate": 2.498708721865432e-05, + "loss": 1.683, + "step": 22050 + }, + { + "epoch": 6.768262737875998, + "grad_norm": 0.21680599451065063, + "learning_rate": 2.4982783452533597e-05, + "loss": 1.7652, + "step": 22051 + }, + { + "epoch": 6.768569674647023, + "grad_norm": 0.16952121257781982, + "learning_rate": 2.4978479933647637e-05, + "loss": 1.6551, + "step": 22052 + }, + { + "epoch": 6.768876611418047, + "grad_norm": 0.1979489028453827, + "learning_rate": 2.4974176662039017e-05, + "loss": 1.7399, + "step": 22053 + }, + { + "epoch": 6.769183548189073, + "grad_norm": 0.18934862315654755, + "learning_rate": 2.496987363775025e-05, + "loss": 1.7228, + "step": 22054 + }, + { + "epoch": 6.769490484960098, + "grad_norm": 0.17551462352275848, + "learning_rate": 2.496557086082387e-05, + "loss": 1.6725, + "step": 22055 + }, + { + "epoch": 6.769797421731123, + "grad_norm": 0.23561003804206848, + "learning_rate": 2.496126833130239e-05, + "loss": 1.7606, + "step": 22056 + }, + { + "epoch": 6.770104358502149, + "grad_norm": 
0.19105803966522217, + "learning_rate": 2.4956966049228324e-05, + "loss": 1.6975, + "step": 22057 + }, + { + "epoch": 6.770411295273174, + "grad_norm": 0.28581124544143677, + "learning_rate": 2.4952664014644204e-05, + "loss": 1.7408, + "step": 22058 + }, + { + "epoch": 6.7707182320441985, + "grad_norm": 0.20723536610603333, + "learning_rate": 2.494836222759254e-05, + "loss": 1.752, + "step": 22059 + }, + { + "epoch": 6.771025168815224, + "grad_norm": 0.2089354693889618, + "learning_rate": 2.4944060688115846e-05, + "loss": 1.6662, + "step": 22060 + }, + { + "epoch": 6.771332105586249, + "grad_norm": 0.2299557626247406, + "learning_rate": 2.4939759396256625e-05, + "loss": 1.7978, + "step": 22061 + }, + { + "epoch": 6.7716390423572745, + "grad_norm": 0.17900820076465607, + "learning_rate": 2.493545835205739e-05, + "loss": 1.6876, + "step": 22062 + }, + { + "epoch": 6.7719459791283, + "grad_norm": 0.21412713825702667, + "learning_rate": 2.4931157555560648e-05, + "loss": 1.7347, + "step": 22063 + }, + { + "epoch": 6.772252915899324, + "grad_norm": 0.24448172748088837, + "learning_rate": 2.49268570068089e-05, + "loss": 1.7611, + "step": 22064 + }, + { + "epoch": 6.77255985267035, + "grad_norm": 0.20153972506523132, + "learning_rate": 2.4922556705844624e-05, + "loss": 1.7347, + "step": 22065 + }, + { + "epoch": 6.772866789441375, + "grad_norm": 0.2142268568277359, + "learning_rate": 2.4918256652710387e-05, + "loss": 1.7548, + "step": 22066 + }, + { + "epoch": 6.7731737262124, + "grad_norm": 0.19735601544380188, + "learning_rate": 2.4913956847448595e-05, + "loss": 1.7138, + "step": 22067 + }, + { + "epoch": 6.773480662983426, + "grad_norm": 0.1847008913755417, + "learning_rate": 2.4909657290101824e-05, + "loss": 1.6812, + "step": 22068 + }, + { + "epoch": 6.773787599754451, + "grad_norm": 0.18406464159488678, + "learning_rate": 2.4905357980712486e-05, + "loss": 1.6992, + "step": 22069 + }, + { + "epoch": 6.774094536525475, + "grad_norm": 0.19595865905284882, + 
"learning_rate": 2.490105891932313e-05, + "loss": 1.7118, + "step": 22070 + }, + { + "epoch": 6.774401473296501, + "grad_norm": 0.1929878294467926, + "learning_rate": 2.4896760105976218e-05, + "loss": 1.7187, + "step": 22071 + }, + { + "epoch": 6.774708410067526, + "grad_norm": 0.23972687125205994, + "learning_rate": 2.4892461540714242e-05, + "loss": 1.7293, + "step": 22072 + }, + { + "epoch": 6.7750153468385514, + "grad_norm": 0.18744204938411713, + "learning_rate": 2.4888163223579675e-05, + "loss": 1.7102, + "step": 22073 + }, + { + "epoch": 6.775322283609577, + "grad_norm": 0.20168112218379974, + "learning_rate": 2.4883865154614994e-05, + "loss": 1.7655, + "step": 22074 + }, + { + "epoch": 6.775629220380601, + "grad_norm": 0.22825658321380615, + "learning_rate": 2.487956733386268e-05, + "loss": 1.7251, + "step": 22075 + }, + { + "epoch": 6.775936157151627, + "grad_norm": 0.19441691040992737, + "learning_rate": 2.4875269761365205e-05, + "loss": 1.7657, + "step": 22076 + }, + { + "epoch": 6.776243093922652, + "grad_norm": 0.22861605882644653, + "learning_rate": 2.487097243716504e-05, + "loss": 1.7132, + "step": 22077 + }, + { + "epoch": 6.776550030693677, + "grad_norm": 0.19157674908638, + "learning_rate": 2.486667536130466e-05, + "loss": 1.7448, + "step": 22078 + }, + { + "epoch": 6.776856967464703, + "grad_norm": 0.2203369438648224, + "learning_rate": 2.486237853382652e-05, + "loss": 1.7535, + "step": 22079 + }, + { + "epoch": 6.777163904235728, + "grad_norm": 0.16477027535438538, + "learning_rate": 2.4858081954773088e-05, + "loss": 1.706, + "step": 22080 + }, + { + "epoch": 6.777470841006752, + "grad_norm": 0.16536933183670044, + "learning_rate": 2.4853785624186827e-05, + "loss": 1.6725, + "step": 22081 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.18266050517559052, + "learning_rate": 2.4849489542110176e-05, + "loss": 1.6799, + "step": 22082 + }, + { + "epoch": 6.778084714548803, + "grad_norm": 0.21422190964221954, + "learning_rate": 
2.4845193708585647e-05, + "loss": 1.7275, + "step": 22083 + }, + { + "epoch": 6.778391651319828, + "grad_norm": 0.19356754422187805, + "learning_rate": 2.4840898123655622e-05, + "loss": 1.7172, + "step": 22084 + }, + { + "epoch": 6.778698588090853, + "grad_norm": 0.21090209484100342, + "learning_rate": 2.4836602787362628e-05, + "loss": 1.6581, + "step": 22085 + }, + { + "epoch": 6.779005524861878, + "grad_norm": 0.20072491466999054, + "learning_rate": 2.483230769974903e-05, + "loss": 1.7398, + "step": 22086 + }, + { + "epoch": 6.7793124616329035, + "grad_norm": 0.20642702281475067, + "learning_rate": 2.482801286085734e-05, + "loss": 1.7505, + "step": 22087 + }, + { + "epoch": 6.779619398403929, + "grad_norm": 0.20322991907596588, + "learning_rate": 2.4823718270729985e-05, + "loss": 1.6693, + "step": 22088 + }, + { + "epoch": 6.779926335174954, + "grad_norm": 0.17060843110084534, + "learning_rate": 2.4819423929409396e-05, + "loss": 1.6746, + "step": 22089 + }, + { + "epoch": 6.7802332719459795, + "grad_norm": 0.20697785913944244, + "learning_rate": 2.4815129836938024e-05, + "loss": 1.7413, + "step": 22090 + }, + { + "epoch": 6.780540208717004, + "grad_norm": 0.19845673441886902, + "learning_rate": 2.48108359933583e-05, + "loss": 1.694, + "step": 22091 + }, + { + "epoch": 6.780847145488029, + "grad_norm": 0.24547794461250305, + "learning_rate": 2.4806542398712657e-05, + "loss": 1.7316, + "step": 22092 + }, + { + "epoch": 6.781154082259055, + "grad_norm": 0.15587118268013, + "learning_rate": 2.4802249053043526e-05, + "loss": 1.667, + "step": 22093 + }, + { + "epoch": 6.78146101903008, + "grad_norm": 0.22754593193531036, + "learning_rate": 2.4797955956393336e-05, + "loss": 1.7504, + "step": 22094 + }, + { + "epoch": 6.781767955801105, + "grad_norm": 0.201420396566391, + "learning_rate": 2.4793663108804528e-05, + "loss": 1.749, + "step": 22095 + }, + { + "epoch": 6.78207489257213, + "grad_norm": 0.1952153891324997, + "learning_rate": 2.4789370510319504e-05, + "loss": 
1.7306, + "step": 22096 + }, + { + "epoch": 6.782381829343155, + "grad_norm": 0.16750730574131012, + "learning_rate": 2.4785078160980703e-05, + "loss": 1.6775, + "step": 22097 + }, + { + "epoch": 6.78268876611418, + "grad_norm": 0.19943620264530182, + "learning_rate": 2.4780786060830535e-05, + "loss": 1.7233, + "step": 22098 + }, + { + "epoch": 6.782995702885206, + "grad_norm": 0.21302999556064606, + "learning_rate": 2.4776494209911423e-05, + "loss": 1.798, + "step": 22099 + }, + { + "epoch": 6.783302639656231, + "grad_norm": 0.22949734330177307, + "learning_rate": 2.4772202608265776e-05, + "loss": 1.7678, + "step": 22100 + }, + { + "epoch": 6.783609576427256, + "grad_norm": 0.20945954322814941, + "learning_rate": 2.4767911255935993e-05, + "loss": 1.701, + "step": 22101 + }, + { + "epoch": 6.783916513198281, + "grad_norm": 0.189425989985466, + "learning_rate": 2.476362015296454e-05, + "loss": 1.7152, + "step": 22102 + }, + { + "epoch": 6.784223449969306, + "grad_norm": 0.18826924264431, + "learning_rate": 2.4759329299393747e-05, + "loss": 1.7004, + "step": 22103 + }, + { + "epoch": 6.7845303867403315, + "grad_norm": 0.20359934866428375, + "learning_rate": 2.475503869526607e-05, + "loss": 1.705, + "step": 22104 + }, + { + "epoch": 6.784837323511357, + "grad_norm": 0.22381560504436493, + "learning_rate": 2.4750748340623896e-05, + "loss": 1.7345, + "step": 22105 + }, + { + "epoch": 6.785144260282382, + "grad_norm": 0.1750476062297821, + "learning_rate": 2.474645823550963e-05, + "loss": 1.7084, + "step": 22106 + }, + { + "epoch": 6.785451197053407, + "grad_norm": 0.17943856120109558, + "learning_rate": 2.4742168379965662e-05, + "loss": 1.7417, + "step": 22107 + }, + { + "epoch": 6.785758133824432, + "grad_norm": 0.21809861063957214, + "learning_rate": 2.4737878774034397e-05, + "loss": 1.7197, + "step": 22108 + }, + { + "epoch": 6.786065070595457, + "grad_norm": 0.19761307537555695, + "learning_rate": 2.473358941775821e-05, + "loss": 1.6763, + "step": 22109 + }, + { + 
"epoch": 6.786372007366483, + "grad_norm": 0.19513878226280212, + "learning_rate": 2.472930031117951e-05, + "loss": 1.6859, + "step": 22110 + }, + { + "epoch": 6.786678944137508, + "grad_norm": 0.21796870231628418, + "learning_rate": 2.4725011454340675e-05, + "loss": 1.6957, + "step": 22111 + }, + { + "epoch": 6.786985880908533, + "grad_norm": 0.1885530948638916, + "learning_rate": 2.4720722847284088e-05, + "loss": 1.731, + "step": 22112 + }, + { + "epoch": 6.787292817679558, + "grad_norm": 0.2108110785484314, + "learning_rate": 2.4716434490052137e-05, + "loss": 1.7985, + "step": 22113 + }, + { + "epoch": 6.787599754450583, + "grad_norm": 0.23425176739692688, + "learning_rate": 2.4712146382687194e-05, + "loss": 1.7177, + "step": 22114 + }, + { + "epoch": 6.787906691221608, + "grad_norm": 0.17368707060813904, + "learning_rate": 2.4707858525231652e-05, + "loss": 1.7158, + "step": 22115 + }, + { + "epoch": 6.788213627992634, + "grad_norm": 0.22731448709964752, + "learning_rate": 2.470357091772787e-05, + "loss": 1.7037, + "step": 22116 + }, + { + "epoch": 6.788520564763659, + "grad_norm": 0.19142407178878784, + "learning_rate": 2.469928356021823e-05, + "loss": 1.7283, + "step": 22117 + }, + { + "epoch": 6.7888275015346835, + "grad_norm": 0.17515631020069122, + "learning_rate": 2.4694996452745072e-05, + "loss": 1.6812, + "step": 22118 + }, + { + "epoch": 6.789134438305709, + "grad_norm": 0.17932391166687012, + "learning_rate": 2.4690709595350838e-05, + "loss": 1.6832, + "step": 22119 + }, + { + "epoch": 6.789441375076734, + "grad_norm": 0.21177144348621368, + "learning_rate": 2.4686422988077802e-05, + "loss": 1.7443, + "step": 22120 + }, + { + "epoch": 6.7897483118477595, + "grad_norm": 0.17952793836593628, + "learning_rate": 2.4682136630968412e-05, + "loss": 1.6794, + "step": 22121 + }, + { + "epoch": 6.790055248618785, + "grad_norm": 0.18464395403862, + "learning_rate": 2.467785052406495e-05, + "loss": 1.6316, + "step": 22122 + }, + { + "epoch": 6.79036218538981, + 
"grad_norm": 0.1936565786600113, + "learning_rate": 2.4673564667409828e-05, + "loss": 1.6935, + "step": 22123 + }, + { + "epoch": 6.790669122160835, + "grad_norm": 0.21169735491275787, + "learning_rate": 2.4669279061045387e-05, + "loss": 1.7232, + "step": 22124 + }, + { + "epoch": 6.79097605893186, + "grad_norm": 0.199925035238266, + "learning_rate": 2.466499370501397e-05, + "loss": 1.8242, + "step": 22125 + }, + { + "epoch": 6.791282995702885, + "grad_norm": 0.19049705564975739, + "learning_rate": 2.4660708599357963e-05, + "loss": 1.7342, + "step": 22126 + }, + { + "epoch": 6.791589932473911, + "grad_norm": 0.16483616828918457, + "learning_rate": 2.465642374411964e-05, + "loss": 1.7144, + "step": 22127 + }, + { + "epoch": 6.791896869244935, + "grad_norm": 0.17355477809906006, + "learning_rate": 2.4652139139341413e-05, + "loss": 1.6715, + "step": 22128 + }, + { + "epoch": 6.79220380601596, + "grad_norm": 0.17448700964450836, + "learning_rate": 2.4647854785065605e-05, + "loss": 1.6669, + "step": 22129 + }, + { + "epoch": 6.792510742786986, + "grad_norm": 0.19858810305595398, + "learning_rate": 2.4643570681334553e-05, + "loss": 1.6781, + "step": 22130 + }, + { + "epoch": 6.792817679558011, + "grad_norm": 0.17350561916828156, + "learning_rate": 2.46392868281906e-05, + "loss": 1.7005, + "step": 22131 + }, + { + "epoch": 6.793124616329036, + "grad_norm": 0.17494787275791168, + "learning_rate": 2.4635003225676078e-05, + "loss": 1.7204, + "step": 22132 + }, + { + "epoch": 6.793431553100062, + "grad_norm": 0.1988590806722641, + "learning_rate": 2.463071987383332e-05, + "loss": 1.7314, + "step": 22133 + }, + { + "epoch": 6.793738489871086, + "grad_norm": 0.18046239018440247, + "learning_rate": 2.4626436772704658e-05, + "loss": 1.706, + "step": 22134 + }, + { + "epoch": 6.7940454266421115, + "grad_norm": 0.21060462296009064, + "learning_rate": 2.4622153922332402e-05, + "loss": 1.6967, + "step": 22135 + }, + { + "epoch": 6.794352363413137, + "grad_norm": 0.22328679263591766, 
+ "learning_rate": 2.4617871322758934e-05, + "loss": 1.7502, + "step": 22136 + }, + { + "epoch": 6.794659300184162, + "grad_norm": 0.18324224650859833, + "learning_rate": 2.46135889740265e-05, + "loss": 1.7183, + "step": 22137 + }, + { + "epoch": 6.7949662369551875, + "grad_norm": 0.2381133884191513, + "learning_rate": 2.4609306876177496e-05, + "loss": 1.739, + "step": 22138 + }, + { + "epoch": 6.795273173726212, + "grad_norm": 0.21471738815307617, + "learning_rate": 2.4605025029254164e-05, + "loss": 1.7466, + "step": 22139 + }, + { + "epoch": 6.795580110497237, + "grad_norm": 0.209581658244133, + "learning_rate": 2.4600743433298885e-05, + "loss": 1.7495, + "step": 22140 + }, + { + "epoch": 6.795887047268263, + "grad_norm": 0.1806897670030594, + "learning_rate": 2.459646208835394e-05, + "loss": 1.7137, + "step": 22141 + }, + { + "epoch": 6.796193984039288, + "grad_norm": 0.19036264717578888, + "learning_rate": 2.4592180994461644e-05, + "loss": 1.6993, + "step": 22142 + }, + { + "epoch": 6.796500920810313, + "grad_norm": 0.17937630414962769, + "learning_rate": 2.4587900151664335e-05, + "loss": 1.7102, + "step": 22143 + }, + { + "epoch": 6.796807857581339, + "grad_norm": 0.19278483092784882, + "learning_rate": 2.4583619560004244e-05, + "loss": 1.7058, + "step": 22144 + }, + { + "epoch": 6.797114794352363, + "grad_norm": 0.19507993757724762, + "learning_rate": 2.4579339219523744e-05, + "loss": 1.7137, + "step": 22145 + }, + { + "epoch": 6.797421731123388, + "grad_norm": 0.20417597889900208, + "learning_rate": 2.4575059130265115e-05, + "loss": 1.7156, + "step": 22146 + }, + { + "epoch": 6.797728667894414, + "grad_norm": 0.1898338943719864, + "learning_rate": 2.4570779292270658e-05, + "loss": 1.7501, + "step": 22147 + }, + { + "epoch": 6.798035604665439, + "grad_norm": 0.18777382373809814, + "learning_rate": 2.4566499705582656e-05, + "loss": 1.7192, + "step": 22148 + }, + { + "epoch": 6.798342541436464, + "grad_norm": 0.19526423513889313, + "learning_rate": 
2.4562220370243415e-05, + "loss": 1.6637, + "step": 22149 + }, + { + "epoch": 6.798649478207489, + "grad_norm": 0.23661594092845917, + "learning_rate": 2.455794128629522e-05, + "loss": 1.7557, + "step": 22150 + }, + { + "epoch": 6.798956414978514, + "grad_norm": 0.27043846249580383, + "learning_rate": 2.4553662453780362e-05, + "loss": 1.7712, + "step": 22151 + }, + { + "epoch": 6.7992633517495396, + "grad_norm": 0.17968088388442993, + "learning_rate": 2.454938387274111e-05, + "loss": 1.6721, + "step": 22152 + }, + { + "epoch": 6.799570288520565, + "grad_norm": 0.21456219255924225, + "learning_rate": 2.45451055432198e-05, + "loss": 1.7249, + "step": 22153 + }, + { + "epoch": 6.79987722529159, + "grad_norm": 0.22433941066265106, + "learning_rate": 2.4540827465258638e-05, + "loss": 1.7319, + "step": 22154 + }, + { + "epoch": 6.800184162062616, + "grad_norm": 0.2808871567249298, + "learning_rate": 2.4536549638899976e-05, + "loss": 1.7802, + "step": 22155 + }, + { + "epoch": 6.80049109883364, + "grad_norm": 0.28654494881629944, + "learning_rate": 2.4532272064186018e-05, + "loss": 1.7431, + "step": 22156 + }, + { + "epoch": 6.800798035604665, + "grad_norm": 0.19476976990699768, + "learning_rate": 2.45279947411591e-05, + "loss": 1.6792, + "step": 22157 + }, + { + "epoch": 6.801104972375691, + "grad_norm": 0.25114744901657104, + "learning_rate": 2.452371766986146e-05, + "loss": 1.7458, + "step": 22158 + }, + { + "epoch": 6.801411909146716, + "grad_norm": 0.18099439144134521, + "learning_rate": 2.451944085033538e-05, + "loss": 1.6952, + "step": 22159 + }, + { + "epoch": 6.8017188459177405, + "grad_norm": 0.21425777673721313, + "learning_rate": 2.4515164282623138e-05, + "loss": 1.7593, + "step": 22160 + }, + { + "epoch": 6.802025782688766, + "grad_norm": 0.19833709299564362, + "learning_rate": 2.4510887966766937e-05, + "loss": 1.6643, + "step": 22161 + }, + { + "epoch": 6.802332719459791, + "grad_norm": 0.20073090493679047, + "learning_rate": 2.45066119028091e-05, + "loss": 
1.7112, + "step": 22162 + }, + { + "epoch": 6.8026396562308165, + "grad_norm": 0.18599852919578552, + "learning_rate": 2.4502336090791872e-05, + "loss": 1.7121, + "step": 22163 + }, + { + "epoch": 6.802946593001842, + "grad_norm": 0.22036875784397125, + "learning_rate": 2.4498060530757498e-05, + "loss": 1.7944, + "step": 22164 + }, + { + "epoch": 6.803253529772867, + "grad_norm": 0.19521577656269073, + "learning_rate": 2.4493785222748243e-05, + "loss": 1.7463, + "step": 22165 + }, + { + "epoch": 6.803560466543892, + "grad_norm": 0.22010843455791473, + "learning_rate": 2.448951016680635e-05, + "loss": 1.6951, + "step": 22166 + }, + { + "epoch": 6.803867403314917, + "grad_norm": 0.20490090548992157, + "learning_rate": 2.448523536297407e-05, + "loss": 1.7723, + "step": 22167 + }, + { + "epoch": 6.804174340085942, + "grad_norm": 0.2298613339662552, + "learning_rate": 2.4480960811293648e-05, + "loss": 1.7644, + "step": 22168 + }, + { + "epoch": 6.804481276856968, + "grad_norm": 0.18560375273227692, + "learning_rate": 2.4476686511807306e-05, + "loss": 1.686, + "step": 22169 + }, + { + "epoch": 6.804788213627993, + "grad_norm": 0.24295780062675476, + "learning_rate": 2.4472412464557347e-05, + "loss": 1.7561, + "step": 22170 + }, + { + "epoch": 6.805095150399017, + "grad_norm": 0.1962144672870636, + "learning_rate": 2.4468138669585932e-05, + "loss": 1.7438, + "step": 22171 + }, + { + "epoch": 6.805402087170043, + "grad_norm": 0.21924439072608948, + "learning_rate": 2.4463865126935377e-05, + "loss": 1.7488, + "step": 22172 + }, + { + "epoch": 6.805709023941068, + "grad_norm": 0.1777856945991516, + "learning_rate": 2.4459591836647833e-05, + "loss": 1.6664, + "step": 22173 + }, + { + "epoch": 6.806015960712093, + "grad_norm": 0.24367454648017883, + "learning_rate": 2.4455318798765593e-05, + "loss": 1.7441, + "step": 22174 + }, + { + "epoch": 6.806322897483119, + "grad_norm": 0.2269427478313446, + "learning_rate": 2.4451046013330865e-05, + "loss": 1.7809, + "step": 22175 + }, 
+ { + "epoch": 6.806629834254144, + "grad_norm": 0.21986174583435059, + "learning_rate": 2.444677348038587e-05, + "loss": 1.7453, + "step": 22176 + }, + { + "epoch": 6.8069367710251685, + "grad_norm": 0.1773367077112198, + "learning_rate": 2.4442501199972862e-05, + "loss": 1.6927, + "step": 22177 + }, + { + "epoch": 6.807243707796194, + "grad_norm": 0.20545031130313873, + "learning_rate": 2.4438229172133997e-05, + "loss": 1.7782, + "step": 22178 + }, + { + "epoch": 6.807550644567219, + "grad_norm": 0.1997014880180359, + "learning_rate": 2.443395739691155e-05, + "loss": 1.7295, + "step": 22179 + }, + { + "epoch": 6.8078575813382445, + "grad_norm": 0.19634006917476654, + "learning_rate": 2.4429685874347723e-05, + "loss": 1.7017, + "step": 22180 + }, + { + "epoch": 6.80816451810927, + "grad_norm": 0.2007836550474167, + "learning_rate": 2.442541460448473e-05, + "loss": 1.7252, + "step": 22181 + }, + { + "epoch": 6.808471454880294, + "grad_norm": 0.22204343974590302, + "learning_rate": 2.4421143587364775e-05, + "loss": 1.7526, + "step": 22182 + }, + { + "epoch": 6.80877839165132, + "grad_norm": 0.1906677633523941, + "learning_rate": 2.4416872823030073e-05, + "loss": 1.7121, + "step": 22183 + }, + { + "epoch": 6.809085328422345, + "grad_norm": 0.17165397107601166, + "learning_rate": 2.441260231152283e-05, + "loss": 1.6942, + "step": 22184 + }, + { + "epoch": 6.80939226519337, + "grad_norm": 0.17022575438022614, + "learning_rate": 2.4408332052885246e-05, + "loss": 1.6973, + "step": 22185 + }, + { + "epoch": 6.809699201964396, + "grad_norm": 0.16693587601184845, + "learning_rate": 2.4404062047159503e-05, + "loss": 1.6996, + "step": 22186 + }, + { + "epoch": 6.810006138735421, + "grad_norm": 0.2251187264919281, + "learning_rate": 2.4399792294387864e-05, + "loss": 1.778, + "step": 22187 + }, + { + "epoch": 6.810313075506445, + "grad_norm": 0.20622244477272034, + "learning_rate": 2.439552279461244e-05, + "loss": 1.7273, + "step": 22188 + }, + { + "epoch": 6.810620012277471, + 
"grad_norm": 0.19736994802951813, + "learning_rate": 2.439125354787551e-05, + "loss": 1.7096, + "step": 22189 + }, + { + "epoch": 6.810926949048496, + "grad_norm": 0.22955237329006195, + "learning_rate": 2.4386984554219182e-05, + "loss": 1.7859, + "step": 22190 + }, + { + "epoch": 6.811233885819521, + "grad_norm": 0.2283364087343216, + "learning_rate": 2.43827158136857e-05, + "loss": 1.6999, + "step": 22191 + }, + { + "epoch": 6.811540822590547, + "grad_norm": 0.18393704295158386, + "learning_rate": 2.4378447326317243e-05, + "loss": 1.654, + "step": 22192 + }, + { + "epoch": 6.811847759361571, + "grad_norm": 0.2031537890434265, + "learning_rate": 2.4374179092155986e-05, + "loss": 1.7353, + "step": 22193 + }, + { + "epoch": 6.8121546961325965, + "grad_norm": 0.1849071979522705, + "learning_rate": 2.4369911111244125e-05, + "loss": 1.7157, + "step": 22194 + }, + { + "epoch": 6.812461632903622, + "grad_norm": 0.20584192872047424, + "learning_rate": 2.4365643383623787e-05, + "loss": 1.7529, + "step": 22195 + }, + { + "epoch": 6.812768569674647, + "grad_norm": 0.24152903258800507, + "learning_rate": 2.436137590933721e-05, + "loss": 1.7662, + "step": 22196 + }, + { + "epoch": 6.8130755064456725, + "grad_norm": 0.26625362038612366, + "learning_rate": 2.4357108688426532e-05, + "loss": 1.7624, + "step": 22197 + }, + { + "epoch": 6.813382443216698, + "grad_norm": 0.27122190594673157, + "learning_rate": 2.435284172093395e-05, + "loss": 1.747, + "step": 22198 + }, + { + "epoch": 6.813689379987722, + "grad_norm": 0.18996810913085938, + "learning_rate": 2.434857500690161e-05, + "loss": 1.7377, + "step": 22199 + }, + { + "epoch": 6.813996316758748, + "grad_norm": 0.22355122864246368, + "learning_rate": 2.4344308546371686e-05, + "loss": 1.6865, + "step": 22200 + }, + { + "epoch": 6.814303253529773, + "grad_norm": 0.18468965590000153, + "learning_rate": 2.4340042339386348e-05, + "loss": 1.7091, + "step": 22201 + }, + { + "epoch": 6.814610190300798, + "grad_norm": 
0.25356602668762207, + "learning_rate": 2.4335776385987747e-05, + "loss": 1.7482, + "step": 22202 + }, + { + "epoch": 6.814917127071823, + "grad_norm": 0.22462932765483856, + "learning_rate": 2.433151068621803e-05, + "loss": 1.6985, + "step": 22203 + }, + { + "epoch": 6.815224063842848, + "grad_norm": 0.2540687024593353, + "learning_rate": 2.43272452401194e-05, + "loss": 1.7878, + "step": 22204 + }, + { + "epoch": 6.815531000613873, + "grad_norm": 0.267811119556427, + "learning_rate": 2.432298004773395e-05, + "loss": 1.7862, + "step": 22205 + }, + { + "epoch": 6.815837937384899, + "grad_norm": 0.23089277744293213, + "learning_rate": 2.4318715109103894e-05, + "loss": 1.6892, + "step": 22206 + }, + { + "epoch": 6.816144874155924, + "grad_norm": 0.22740885615348816, + "learning_rate": 2.431445042427131e-05, + "loss": 1.6934, + "step": 22207 + }, + { + "epoch": 6.816451810926949, + "grad_norm": 0.18555034697055817, + "learning_rate": 2.4310185993278405e-05, + "loss": 1.6747, + "step": 22208 + }, + { + "epoch": 6.816758747697974, + "grad_norm": 0.23693101108074188, + "learning_rate": 2.430592181616729e-05, + "loss": 1.7212, + "step": 22209 + }, + { + "epoch": 6.817065684468999, + "grad_norm": 0.20551325380802155, + "learning_rate": 2.4301657892980128e-05, + "loss": 1.711, + "step": 22210 + }, + { + "epoch": 6.8173726212400245, + "grad_norm": 0.20047837495803833, + "learning_rate": 2.4297394223759056e-05, + "loss": 1.729, + "step": 22211 + }, + { + "epoch": 6.81767955801105, + "grad_norm": 0.22111602127552032, + "learning_rate": 2.4293130808546167e-05, + "loss": 1.706, + "step": 22212 + }, + { + "epoch": 6.817986494782075, + "grad_norm": 0.18199655413627625, + "learning_rate": 2.428886764738364e-05, + "loss": 1.7082, + "step": 22213 + }, + { + "epoch": 6.8182934315531, + "grad_norm": 0.18591821193695068, + "learning_rate": 2.4284604740313595e-05, + "loss": 1.6957, + "step": 22214 + }, + { + "epoch": 6.818600368324125, + "grad_norm": 0.19427789747714996, + 
"learning_rate": 2.4280342087378154e-05, + "loss": 1.7396, + "step": 22215 + }, + { + "epoch": 6.81890730509515, + "grad_norm": 0.233908548951149, + "learning_rate": 2.427607968861945e-05, + "loss": 1.741, + "step": 22216 + }, + { + "epoch": 6.819214241866176, + "grad_norm": 0.168926402926445, + "learning_rate": 2.4271817544079606e-05, + "loss": 1.7023, + "step": 22217 + }, + { + "epoch": 6.819521178637201, + "grad_norm": 0.34345322847366333, + "learning_rate": 2.426755565380074e-05, + "loss": 1.7201, + "step": 22218 + }, + { + "epoch": 6.819828115408226, + "grad_norm": 0.21531274914741516, + "learning_rate": 2.4263294017824974e-05, + "loss": 1.725, + "step": 22219 + }, + { + "epoch": 6.820135052179251, + "grad_norm": 0.25251755118370056, + "learning_rate": 2.4259032636194395e-05, + "loss": 1.6764, + "step": 22220 + }, + { + "epoch": 6.820441988950276, + "grad_norm": 0.246616929769516, + "learning_rate": 2.4254771508951186e-05, + "loss": 1.7971, + "step": 22221 + }, + { + "epoch": 6.820748925721301, + "grad_norm": 0.20998120307922363, + "learning_rate": 2.4250510636137375e-05, + "loss": 1.723, + "step": 22222 + }, + { + "epoch": 6.821055862492327, + "grad_norm": 0.28388240933418274, + "learning_rate": 2.4246250017795148e-05, + "loss": 1.7508, + "step": 22223 + }, + { + "epoch": 6.821362799263352, + "grad_norm": 0.18146218359470367, + "learning_rate": 2.4241989653966535e-05, + "loss": 1.7254, + "step": 22224 + }, + { + "epoch": 6.8216697360343765, + "grad_norm": 0.2384043037891388, + "learning_rate": 2.4237729544693694e-05, + "loss": 1.7624, + "step": 22225 + }, + { + "epoch": 6.821976672805402, + "grad_norm": 0.21908332407474518, + "learning_rate": 2.4233469690018714e-05, + "loss": 1.7595, + "step": 22226 + }, + { + "epoch": 6.822283609576427, + "grad_norm": 0.20963989198207855, + "learning_rate": 2.422921008998369e-05, + "loss": 1.6679, + "step": 22227 + }, + { + "epoch": 6.8225905463474525, + "grad_norm": 0.21045777201652527, + "learning_rate": 
2.4224950744630732e-05, + "loss": 1.657, + "step": 22228 + }, + { + "epoch": 6.822897483118478, + "grad_norm": 0.21567417681217194, + "learning_rate": 2.4220691654001883e-05, + "loss": 1.7788, + "step": 22229 + }, + { + "epoch": 6.823204419889503, + "grad_norm": 0.2908889055252075, + "learning_rate": 2.4216432818139283e-05, + "loss": 1.7633, + "step": 22230 + }, + { + "epoch": 6.823511356660528, + "grad_norm": 0.22683843970298767, + "learning_rate": 2.4212174237085007e-05, + "loss": 1.7974, + "step": 22231 + }, + { + "epoch": 6.823818293431553, + "grad_norm": 0.25254085659980774, + "learning_rate": 2.420791591088114e-05, + "loss": 1.6871, + "step": 22232 + }, + { + "epoch": 6.824125230202578, + "grad_norm": 0.1804734766483307, + "learning_rate": 2.420365783956977e-05, + "loss": 1.7331, + "step": 22233 + }, + { + "epoch": 6.824432166973604, + "grad_norm": 0.21634186804294586, + "learning_rate": 2.419940002319297e-05, + "loss": 1.6641, + "step": 22234 + }, + { + "epoch": 6.824739103744628, + "grad_norm": 0.1941644847393036, + "learning_rate": 2.4195142461792818e-05, + "loss": 1.7198, + "step": 22235 + }, + { + "epoch": 6.8250460405156534, + "grad_norm": 0.20209947228431702, + "learning_rate": 2.4190885155411398e-05, + "loss": 1.7137, + "step": 22236 + }, + { + "epoch": 6.825352977286679, + "grad_norm": 0.17161925137043, + "learning_rate": 2.4186628104090757e-05, + "loss": 1.7059, + "step": 22237 + }, + { + "epoch": 6.825659914057704, + "grad_norm": 0.19352135062217712, + "learning_rate": 2.4182371307873025e-05, + "loss": 1.6699, + "step": 22238 + }, + { + "epoch": 6.8259668508287294, + "grad_norm": 0.20384716987609863, + "learning_rate": 2.417811476680019e-05, + "loss": 1.7167, + "step": 22239 + }, + { + "epoch": 6.826273787599755, + "grad_norm": 0.22764970362186432, + "learning_rate": 2.4173858480914402e-05, + "loss": 1.7085, + "step": 22240 + }, + { + "epoch": 6.82658072437078, + "grad_norm": 0.1988842487335205, + "learning_rate": 2.4169602450257645e-05, + "loss": 
1.7458, + "step": 22241 + }, + { + "epoch": 6.826887661141805, + "grad_norm": 0.20511481165885925, + "learning_rate": 2.416534667487203e-05, + "loss": 1.7597, + "step": 22242 + }, + { + "epoch": 6.82719459791283, + "grad_norm": 0.20906902849674225, + "learning_rate": 2.4161091154799608e-05, + "loss": 1.7418, + "step": 22243 + }, + { + "epoch": 6.827501534683855, + "grad_norm": 0.22555884718894958, + "learning_rate": 2.4156835890082426e-05, + "loss": 1.8198, + "step": 22244 + }, + { + "epoch": 6.827808471454881, + "grad_norm": 0.25855058431625366, + "learning_rate": 2.4152580880762553e-05, + "loss": 1.7588, + "step": 22245 + }, + { + "epoch": 6.828115408225905, + "grad_norm": 0.16975226998329163, + "learning_rate": 2.4148326126881993e-05, + "loss": 1.6897, + "step": 22246 + }, + { + "epoch": 6.82842234499693, + "grad_norm": 0.2336781919002533, + "learning_rate": 2.414407162848284e-05, + "loss": 1.7412, + "step": 22247 + }, + { + "epoch": 6.828729281767956, + "grad_norm": 0.1660032868385315, + "learning_rate": 2.4139817385607126e-05, + "loss": 1.6221, + "step": 22248 + }, + { + "epoch": 6.829036218538981, + "grad_norm": 0.22926606237888336, + "learning_rate": 2.41355633982969e-05, + "loss": 1.7201, + "step": 22249 + }, + { + "epoch": 6.829343155310006, + "grad_norm": 0.1759374737739563, + "learning_rate": 2.4131309666594193e-05, + "loss": 1.6842, + "step": 22250 + }, + { + "epoch": 6.829650092081032, + "grad_norm": 0.23005764186382294, + "learning_rate": 2.4127056190541042e-05, + "loss": 1.7327, + "step": 22251 + }, + { + "epoch": 6.829957028852056, + "grad_norm": 0.2216579169034958, + "learning_rate": 2.412280297017949e-05, + "loss": 1.7856, + "step": 22252 + }, + { + "epoch": 6.8302639656230815, + "grad_norm": 0.22133000195026398, + "learning_rate": 2.4118550005551565e-05, + "loss": 1.7711, + "step": 22253 + }, + { + "epoch": 6.830570902394107, + "grad_norm": 0.21860742568969727, + "learning_rate": 2.41142972966993e-05, + "loss": 1.7276, + "step": 22254 + }, + { + 
"epoch": 6.830877839165132, + "grad_norm": 0.2484082579612732, + "learning_rate": 2.4110044843664726e-05, + "loss": 1.7038, + "step": 22255 + }, + { + "epoch": 6.8311847759361575, + "grad_norm": 0.22288921475410461, + "learning_rate": 2.410579264648984e-05, + "loss": 1.7149, + "step": 22256 + }, + { + "epoch": 6.831491712707182, + "grad_norm": 0.23635484278202057, + "learning_rate": 2.4101540705216724e-05, + "loss": 1.7296, + "step": 22257 + }, + { + "epoch": 6.831798649478207, + "grad_norm": 0.24334096908569336, + "learning_rate": 2.4097289019887324e-05, + "loss": 1.7458, + "step": 22258 + }, + { + "epoch": 6.832105586249233, + "grad_norm": 0.23019789159297943, + "learning_rate": 2.4093037590543716e-05, + "loss": 1.7296, + "step": 22259 + }, + { + "epoch": 6.832412523020258, + "grad_norm": 0.23739024996757507, + "learning_rate": 2.4088786417227895e-05, + "loss": 1.7844, + "step": 22260 + }, + { + "epoch": 6.832719459791283, + "grad_norm": 0.1969252973794937, + "learning_rate": 2.4084535499981873e-05, + "loss": 1.6692, + "step": 22261 + }, + { + "epoch": 6.833026396562309, + "grad_norm": 0.20111167430877686, + "learning_rate": 2.4080284838847682e-05, + "loss": 1.7813, + "step": 22262 + }, + { + "epoch": 6.833333333333333, + "grad_norm": 0.26112934947013855, + "learning_rate": 2.4076034433867268e-05, + "loss": 1.6852, + "step": 22263 + }, + { + "epoch": 6.833640270104358, + "grad_norm": 0.24244411289691925, + "learning_rate": 2.40717842850827e-05, + "loss": 1.7054, + "step": 22264 + }, + { + "epoch": 6.833947206875384, + "grad_norm": 0.22703053057193756, + "learning_rate": 2.406753439253595e-05, + "loss": 1.7655, + "step": 22265 + }, + { + "epoch": 6.834254143646409, + "grad_norm": 0.23935651779174805, + "learning_rate": 2.4063284756269027e-05, + "loss": 1.7462, + "step": 22266 + }, + { + "epoch": 6.834561080417434, + "grad_norm": 0.2169155478477478, + "learning_rate": 2.4059035376323928e-05, + "loss": 1.7059, + "step": 22267 + }, + { + "epoch": 6.834868017188459, + 
"grad_norm": 0.2045663446187973, + "learning_rate": 2.4054786252742645e-05, + "loss": 1.7166, + "step": 22268 + }, + { + "epoch": 6.835174953959484, + "grad_norm": 0.22796253859996796, + "learning_rate": 2.4050537385567172e-05, + "loss": 1.7361, + "step": 22269 + }, + { + "epoch": 6.8354818907305095, + "grad_norm": 0.20807915925979614, + "learning_rate": 2.4046288774839497e-05, + "loss": 1.7007, + "step": 22270 + }, + { + "epoch": 6.835788827501535, + "grad_norm": 0.22157903015613556, + "learning_rate": 2.4042040420601607e-05, + "loss": 1.7409, + "step": 22271 + }, + { + "epoch": 6.83609576427256, + "grad_norm": 0.21494148671627045, + "learning_rate": 2.4037792322895492e-05, + "loss": 1.7975, + "step": 22272 + }, + { + "epoch": 6.8364027010435855, + "grad_norm": 0.2275875061750412, + "learning_rate": 2.403354448176311e-05, + "loss": 1.6759, + "step": 22273 + }, + { + "epoch": 6.83670963781461, + "grad_norm": 0.21105073392391205, + "learning_rate": 2.4029296897246496e-05, + "loss": 1.7229, + "step": 22274 + }, + { + "epoch": 6.837016574585635, + "grad_norm": 0.21957579255104065, + "learning_rate": 2.4025049569387553e-05, + "loss": 1.737, + "step": 22275 + }, + { + "epoch": 6.837323511356661, + "grad_norm": 0.2291470617055893, + "learning_rate": 2.4020802498228335e-05, + "loss": 1.6731, + "step": 22276 + }, + { + "epoch": 6.837630448127686, + "grad_norm": 0.18196065723896027, + "learning_rate": 2.401655568381074e-05, + "loss": 1.6823, + "step": 22277 + }, + { + "epoch": 6.83793738489871, + "grad_norm": 0.20915214717388153, + "learning_rate": 2.401230912617678e-05, + "loss": 1.7038, + "step": 22278 + }, + { + "epoch": 6.838244321669736, + "grad_norm": 0.2060854732990265, + "learning_rate": 2.4008062825368437e-05, + "loss": 1.7514, + "step": 22279 + }, + { + "epoch": 6.838551258440761, + "grad_norm": 0.20858527719974518, + "learning_rate": 2.400381678142762e-05, + "loss": 1.7494, + "step": 22280 + }, + { + "epoch": 6.838858195211786, + "grad_norm": 0.19124718010425568, 
+ "learning_rate": 2.3999570994396352e-05, + "loss": 1.7641, + "step": 22281 + }, + { + "epoch": 6.839165131982812, + "grad_norm": 0.28222304582595825, + "learning_rate": 2.3995325464316525e-05, + "loss": 1.7204, + "step": 22282 + }, + { + "epoch": 6.839472068753837, + "grad_norm": 0.20047026872634888, + "learning_rate": 2.399108019123016e-05, + "loss": 1.7261, + "step": 22283 + }, + { + "epoch": 6.8397790055248615, + "grad_norm": 0.2758225202560425, + "learning_rate": 2.3986835175179178e-05, + "loss": 1.6903, + "step": 22284 + }, + { + "epoch": 6.840085942295887, + "grad_norm": 0.2719727158546448, + "learning_rate": 2.3982590416205535e-05, + "loss": 1.8716, + "step": 22285 + }, + { + "epoch": 6.840392879066912, + "grad_norm": 0.3524060845375061, + "learning_rate": 2.3978345914351193e-05, + "loss": 1.7778, + "step": 22286 + }, + { + "epoch": 6.8406998158379375, + "grad_norm": 0.2711596190929413, + "learning_rate": 2.397410166965808e-05, + "loss": 1.7111, + "step": 22287 + }, + { + "epoch": 6.841006752608963, + "grad_norm": 0.2818336486816406, + "learning_rate": 2.396985768216815e-05, + "loss": 1.7292, + "step": 22288 + }, + { + "epoch": 6.841313689379987, + "grad_norm": 0.19677700102329254, + "learning_rate": 2.3965613951923343e-05, + "loss": 1.6975, + "step": 22289 + }, + { + "epoch": 6.841620626151013, + "grad_norm": 0.300997257232666, + "learning_rate": 2.3961370478965583e-05, + "loss": 1.7014, + "step": 22290 + }, + { + "epoch": 6.841927562922038, + "grad_norm": 0.23549453914165497, + "learning_rate": 2.395712726333686e-05, + "loss": 1.7052, + "step": 22291 + }, + { + "epoch": 6.842234499693063, + "grad_norm": 0.29898303747177124, + "learning_rate": 2.3952884305079026e-05, + "loss": 1.7828, + "step": 22292 + }, + { + "epoch": 6.842541436464089, + "grad_norm": 0.26108843088150024, + "learning_rate": 2.3948641604234096e-05, + "loss": 1.7023, + "step": 22293 + }, + { + "epoch": 6.842848373235114, + "grad_norm": 0.18781059980392456, + "learning_rate": 
2.394439916084392e-05, + "loss": 1.6808, + "step": 22294 + }, + { + "epoch": 6.843155310006138, + "grad_norm": 0.22659730911254883, + "learning_rate": 2.3940156974950485e-05, + "loss": 1.7224, + "step": 22295 + }, + { + "epoch": 6.843462246777164, + "grad_norm": 0.17422057688236237, + "learning_rate": 2.3935915046595713e-05, + "loss": 1.668, + "step": 22296 + }, + { + "epoch": 6.843769183548189, + "grad_norm": 0.2008846402168274, + "learning_rate": 2.393167337582146e-05, + "loss": 1.7283, + "step": 22297 + }, + { + "epoch": 6.844076120319214, + "grad_norm": 0.20376072824001312, + "learning_rate": 2.392743196266973e-05, + "loss": 1.74, + "step": 22298 + }, + { + "epoch": 6.84438305709024, + "grad_norm": 0.16353756189346313, + "learning_rate": 2.3923190807182372e-05, + "loss": 1.717, + "step": 22299 + }, + { + "epoch": 6.844689993861264, + "grad_norm": 0.18436652421951294, + "learning_rate": 2.3918949909401335e-05, + "loss": 1.7257, + "step": 22300 + }, + { + "epoch": 6.8449969306322895, + "grad_norm": 0.2038460522890091, + "learning_rate": 2.3914709269368523e-05, + "loss": 1.7254, + "step": 22301 + }, + { + "epoch": 6.845303867403315, + "grad_norm": 0.17111587524414062, + "learning_rate": 2.3910468887125842e-05, + "loss": 1.6993, + "step": 22302 + }, + { + "epoch": 6.84561080417434, + "grad_norm": 0.20049406588077545, + "learning_rate": 2.3906228762715207e-05, + "loss": 1.7099, + "step": 22303 + }, + { + "epoch": 6.8459177409453655, + "grad_norm": 0.2168554663658142, + "learning_rate": 2.39019888961785e-05, + "loss": 1.725, + "step": 22304 + }, + { + "epoch": 6.846224677716391, + "grad_norm": 0.2228514850139618, + "learning_rate": 2.3897749287557647e-05, + "loss": 1.7348, + "step": 22305 + }, + { + "epoch": 6.846531614487415, + "grad_norm": 0.17166151106357574, + "learning_rate": 2.3893509936894532e-05, + "loss": 1.7451, + "step": 22306 + }, + { + "epoch": 6.846838551258441, + "grad_norm": 0.24896936118602753, + "learning_rate": 2.3889270844231026e-05, + "loss": 
1.7397, + "step": 22307 + }, + { + "epoch": 6.847145488029466, + "grad_norm": 0.1984332948923111, + "learning_rate": 2.3885032009609098e-05, + "loss": 1.7167, + "step": 22308 + }, + { + "epoch": 6.847452424800491, + "grad_norm": 0.20763449370861053, + "learning_rate": 2.388079343307055e-05, + "loss": 1.7154, + "step": 22309 + }, + { + "epoch": 6.847759361571516, + "grad_norm": 0.21818630397319794, + "learning_rate": 2.3876555114657346e-05, + "loss": 1.7364, + "step": 22310 + }, + { + "epoch": 6.848066298342541, + "grad_norm": 0.21220166981220245, + "learning_rate": 2.3872317054411298e-05, + "loss": 1.74, + "step": 22311 + }, + { + "epoch": 6.848373235113566, + "grad_norm": 0.17486892640590668, + "learning_rate": 2.3868079252374343e-05, + "loss": 1.68, + "step": 22312 + }, + { + "epoch": 6.848680171884592, + "grad_norm": 0.20809298753738403, + "learning_rate": 2.386384170858837e-05, + "loss": 1.8102, + "step": 22313 + }, + { + "epoch": 6.848987108655617, + "grad_norm": 0.19927671551704407, + "learning_rate": 2.385960442309519e-05, + "loss": 1.7742, + "step": 22314 + }, + { + "epoch": 6.849294045426642, + "grad_norm": 0.18705040216445923, + "learning_rate": 2.3855367395936757e-05, + "loss": 1.689, + "step": 22315 + }, + { + "epoch": 6.849600982197668, + "grad_norm": 0.22023466229438782, + "learning_rate": 2.385113062715487e-05, + "loss": 1.7819, + "step": 22316 + }, + { + "epoch": 6.849907918968692, + "grad_norm": 0.24443435668945312, + "learning_rate": 2.384689411679146e-05, + "loss": 1.6533, + "step": 22317 + }, + { + "epoch": 6.850214855739718, + "grad_norm": 0.20103834569454193, + "learning_rate": 2.3842657864888368e-05, + "loss": 1.7274, + "step": 22318 + }, + { + "epoch": 6.850521792510743, + "grad_norm": 0.2265254408121109, + "learning_rate": 2.3838421871487465e-05, + "loss": 1.7874, + "step": 22319 + }, + { + "epoch": 6.850828729281768, + "grad_norm": 0.2775460183620453, + "learning_rate": 2.383418613663061e-05, + "loss": 1.8038, + "step": 22320 + }, + { + 
"epoch": 6.851135666052793, + "grad_norm": 0.2001011073589325, + "learning_rate": 2.3829950660359663e-05, + "loss": 1.7135, + "step": 22321 + }, + { + "epoch": 6.851442602823818, + "grad_norm": 0.21427330374717712, + "learning_rate": 2.382571544271648e-05, + "loss": 1.7155, + "step": 22322 + }, + { + "epoch": 6.851749539594843, + "grad_norm": 0.18420884013175964, + "learning_rate": 2.382148048374292e-05, + "loss": 1.7178, + "step": 22323 + }, + { + "epoch": 6.852056476365869, + "grad_norm": 0.19436471164226532, + "learning_rate": 2.3817245783480813e-05, + "loss": 1.7396, + "step": 22324 + }, + { + "epoch": 6.852363413136894, + "grad_norm": 0.23191674053668976, + "learning_rate": 2.381301134197207e-05, + "loss": 1.7102, + "step": 22325 + }, + { + "epoch": 6.852670349907919, + "grad_norm": 0.20381706953048706, + "learning_rate": 2.3808777159258462e-05, + "loss": 1.7671, + "step": 22326 + }, + { + "epoch": 6.852977286678944, + "grad_norm": 0.20202197134494781, + "learning_rate": 2.3804543235381897e-05, + "loss": 1.6774, + "step": 22327 + }, + { + "epoch": 6.853284223449969, + "grad_norm": 0.23496322333812714, + "learning_rate": 2.380030957038416e-05, + "loss": 1.7745, + "step": 22328 + }, + { + "epoch": 6.8535911602209945, + "grad_norm": 0.22473813593387604, + "learning_rate": 2.379607616430714e-05, + "loss": 1.7319, + "step": 22329 + }, + { + "epoch": 6.85389809699202, + "grad_norm": 0.2149224430322647, + "learning_rate": 2.3791843017192667e-05, + "loss": 1.77, + "step": 22330 + }, + { + "epoch": 6.854205033763045, + "grad_norm": 0.21146108210086823, + "learning_rate": 2.378761012908253e-05, + "loss": 1.762, + "step": 22331 + }, + { + "epoch": 6.85451197053407, + "grad_norm": 0.2031458169221878, + "learning_rate": 2.3783377500018626e-05, + "loss": 1.7007, + "step": 22332 + }, + { + "epoch": 6.854818907305095, + "grad_norm": 0.19763319194316864, + "learning_rate": 2.377914513004272e-05, + "loss": 1.6899, + "step": 22333 + }, + { + "epoch": 6.85512584407612, + 
"grad_norm": 0.17337046563625336, + "learning_rate": 2.3774913019196688e-05, + "loss": 1.683, + "step": 22334 + }, + { + "epoch": 6.855432780847146, + "grad_norm": 0.1850815862417221, + "learning_rate": 2.3770681167522328e-05, + "loss": 1.7284, + "step": 22335 + }, + { + "epoch": 6.855739717618171, + "grad_norm": 0.19693362712860107, + "learning_rate": 2.3766449575061477e-05, + "loss": 1.7694, + "step": 22336 + }, + { + "epoch": 6.856046654389196, + "grad_norm": 0.1981547325849533, + "learning_rate": 2.376221824185595e-05, + "loss": 1.736, + "step": 22337 + }, + { + "epoch": 6.856353591160221, + "grad_norm": 0.17638558149337769, + "learning_rate": 2.375798716794756e-05, + "loss": 1.6979, + "step": 22338 + }, + { + "epoch": 6.856660527931246, + "grad_norm": 0.20189990103244781, + "learning_rate": 2.3753756353378116e-05, + "loss": 1.7876, + "step": 22339 + }, + { + "epoch": 6.856967464702271, + "grad_norm": 0.1880224347114563, + "learning_rate": 2.3749525798189438e-05, + "loss": 1.7134, + "step": 22340 + }, + { + "epoch": 6.857274401473297, + "grad_norm": 0.2464265078306198, + "learning_rate": 2.3745295502423316e-05, + "loss": 1.7782, + "step": 22341 + }, + { + "epoch": 6.857581338244322, + "grad_norm": 0.19218963384628296, + "learning_rate": 2.3741065466121604e-05, + "loss": 1.7027, + "step": 22342 + }, + { + "epoch": 6.8578882750153465, + "grad_norm": 0.27446448802948, + "learning_rate": 2.3736835689326043e-05, + "loss": 1.772, + "step": 22343 + }, + { + "epoch": 6.858195211786372, + "grad_norm": 0.19315828382968903, + "learning_rate": 2.3732606172078497e-05, + "loss": 1.6855, + "step": 22344 + }, + { + "epoch": 6.858502148557397, + "grad_norm": 0.2668892741203308, + "learning_rate": 2.372837691442072e-05, + "loss": 1.7703, + "step": 22345 + }, + { + "epoch": 6.8588090853284225, + "grad_norm": 0.23552054166793823, + "learning_rate": 2.3724147916394497e-05, + "loss": 1.7184, + "step": 22346 + }, + { + "epoch": 6.859116022099448, + "grad_norm": 0.3194984793663025, + 
"learning_rate": 2.3719919178041682e-05, + "loss": 1.7531, + "step": 22347 + }, + { + "epoch": 6.859422958870473, + "grad_norm": 0.19298717379570007, + "learning_rate": 2.371569069940399e-05, + "loss": 1.7064, + "step": 22348 + }, + { + "epoch": 6.859729895641498, + "grad_norm": 0.2990693151950836, + "learning_rate": 2.3711462480523293e-05, + "loss": 1.7434, + "step": 22349 + }, + { + "epoch": 6.860036832412523, + "grad_norm": 0.1976640820503235, + "learning_rate": 2.370723452144129e-05, + "loss": 1.6881, + "step": 22350 + }, + { + "epoch": 6.860343769183548, + "grad_norm": 0.24306917190551758, + "learning_rate": 2.3703006822199825e-05, + "loss": 1.7791, + "step": 22351 + }, + { + "epoch": 6.860650705954574, + "grad_norm": 0.20065687596797943, + "learning_rate": 2.3698779382840657e-05, + "loss": 1.7162, + "step": 22352 + }, + { + "epoch": 6.860957642725598, + "grad_norm": 0.21599936485290527, + "learning_rate": 2.3694552203405574e-05, + "loss": 1.7702, + "step": 22353 + }, + { + "epoch": 6.861264579496623, + "grad_norm": 0.16836890578269958, + "learning_rate": 2.3690325283936338e-05, + "loss": 1.6676, + "step": 22354 + }, + { + "epoch": 6.861571516267649, + "grad_norm": 0.1756831407546997, + "learning_rate": 2.368609862447473e-05, + "loss": 1.6934, + "step": 22355 + }, + { + "epoch": 6.861878453038674, + "grad_norm": 0.18676789104938507, + "learning_rate": 2.3681872225062517e-05, + "loss": 1.6879, + "step": 22356 + }, + { + "epoch": 6.862185389809699, + "grad_norm": 0.18018634617328644, + "learning_rate": 2.3677646085741473e-05, + "loss": 1.7143, + "step": 22357 + }, + { + "epoch": 6.862492326580725, + "grad_norm": 0.1789008378982544, + "learning_rate": 2.3673420206553332e-05, + "loss": 1.6914, + "step": 22358 + }, + { + "epoch": 6.862799263351749, + "grad_norm": 0.1869693398475647, + "learning_rate": 2.366919458753993e-05, + "loss": 1.7431, + "step": 22359 + }, + { + "epoch": 6.8631062001227745, + "grad_norm": 0.1958019733428955, + "learning_rate": 
2.3664969228742934e-05, + "loss": 1.7132, + "step": 22360 + }, + { + "epoch": 6.8634131368938, + "grad_norm": 0.199384868144989, + "learning_rate": 2.366074413020419e-05, + "loss": 1.7095, + "step": 22361 + }, + { + "epoch": 6.863720073664825, + "grad_norm": 0.2125246673822403, + "learning_rate": 2.365651929196539e-05, + "loss": 1.7125, + "step": 22362 + }, + { + "epoch": 6.8640270104358505, + "grad_norm": 0.1574707180261612, + "learning_rate": 2.3652294714068284e-05, + "loss": 1.6386, + "step": 22363 + }, + { + "epoch": 6.864333947206875, + "grad_norm": 0.30648529529571533, + "learning_rate": 2.364807039655469e-05, + "loss": 1.7665, + "step": 22364 + }, + { + "epoch": 6.8646408839779, + "grad_norm": 0.19746489822864532, + "learning_rate": 2.364384633946627e-05, + "loss": 1.6736, + "step": 22365 + }, + { + "epoch": 6.864947820748926, + "grad_norm": 0.25084391236305237, + "learning_rate": 2.3639622542844842e-05, + "loss": 1.7346, + "step": 22366 + }, + { + "epoch": 6.865254757519951, + "grad_norm": 0.1884133219718933, + "learning_rate": 2.3635399006732077e-05, + "loss": 1.6868, + "step": 22367 + }, + { + "epoch": 6.865561694290976, + "grad_norm": 0.21225856244564056, + "learning_rate": 2.3631175731169774e-05, + "loss": 1.7438, + "step": 22368 + }, + { + "epoch": 6.865868631062002, + "grad_norm": 0.1863771378993988, + "learning_rate": 2.3626952716199647e-05, + "loss": 1.7677, + "step": 22369 + }, + { + "epoch": 6.866175567833026, + "grad_norm": 0.1839088648557663, + "learning_rate": 2.362272996186343e-05, + "loss": 1.6902, + "step": 22370 + }, + { + "epoch": 6.866482504604051, + "grad_norm": 0.18304915726184845, + "learning_rate": 2.3618507468202856e-05, + "loss": 1.7142, + "step": 22371 + }, + { + "epoch": 6.866789441375077, + "grad_norm": 0.21228280663490295, + "learning_rate": 2.3614285235259655e-05, + "loss": 1.8277, + "step": 22372 + }, + { + "epoch": 6.867096378146102, + "grad_norm": 0.19515320658683777, + "learning_rate": 2.361006326307555e-05, + "loss": 
1.7029, + "step": 22373 + }, + { + "epoch": 6.867403314917127, + "grad_norm": 0.16277433931827545, + "learning_rate": 2.360584155169227e-05, + "loss": 1.672, + "step": 22374 + }, + { + "epoch": 6.867710251688152, + "grad_norm": 0.2180202454328537, + "learning_rate": 2.360162010115151e-05, + "loss": 1.7516, + "step": 22375 + }, + { + "epoch": 6.868017188459177, + "grad_norm": 0.17940378189086914, + "learning_rate": 2.3597398911495055e-05, + "loss": 1.6782, + "step": 22376 + }, + { + "epoch": 6.8683241252302025, + "grad_norm": 0.20751933753490448, + "learning_rate": 2.3593177982764543e-05, + "loss": 1.7954, + "step": 22377 + }, + { + "epoch": 6.868631062001228, + "grad_norm": 0.23098444938659668, + "learning_rate": 2.3588957315001758e-05, + "loss": 1.7472, + "step": 22378 + }, + { + "epoch": 6.868937998772253, + "grad_norm": 0.2351236343383789, + "learning_rate": 2.358473690824836e-05, + "loss": 1.7959, + "step": 22379 + }, + { + "epoch": 6.8692449355432785, + "grad_norm": 0.1890626847743988, + "learning_rate": 2.3580516762546055e-05, + "loss": 1.7015, + "step": 22380 + }, + { + "epoch": 6.869551872314303, + "grad_norm": 0.21120475232601166, + "learning_rate": 2.3576296877936604e-05, + "loss": 1.7998, + "step": 22381 + }, + { + "epoch": 6.869858809085328, + "grad_norm": 0.18141280114650726, + "learning_rate": 2.3572077254461638e-05, + "loss": 1.6973, + "step": 22382 + }, + { + "epoch": 6.870165745856354, + "grad_norm": 0.19084444642066956, + "learning_rate": 2.356785789216293e-05, + "loss": 1.6853, + "step": 22383 + }, + { + "epoch": 6.870472682627379, + "grad_norm": 0.18046700954437256, + "learning_rate": 2.356363879108211e-05, + "loss": 1.7476, + "step": 22384 + }, + { + "epoch": 6.870779619398404, + "grad_norm": 0.19875061511993408, + "learning_rate": 2.3559419951260926e-05, + "loss": 1.7223, + "step": 22385 + }, + { + "epoch": 6.871086556169429, + "grad_norm": 0.2377827763557434, + "learning_rate": 2.3555201372741047e-05, + "loss": 1.7976, + "step": 22386 + }, + 
{ + "epoch": 6.871393492940454, + "grad_norm": 0.17645993828773499, + "learning_rate": 2.3550983055564168e-05, + "loss": 1.6726, + "step": 22387 + }, + { + "epoch": 6.871700429711479, + "grad_norm": 0.19499735534191132, + "learning_rate": 2.3546764999771976e-05, + "loss": 1.67, + "step": 22388 + }, + { + "epoch": 6.872007366482505, + "grad_norm": 0.22010546922683716, + "learning_rate": 2.3542547205406163e-05, + "loss": 1.8461, + "step": 22389 + }, + { + "epoch": 6.87231430325353, + "grad_norm": 0.2101692259311676, + "learning_rate": 2.3538329672508396e-05, + "loss": 1.6922, + "step": 22390 + }, + { + "epoch": 6.872621240024555, + "grad_norm": 0.1926269382238388, + "learning_rate": 2.3534112401120372e-05, + "loss": 1.6934, + "step": 22391 + }, + { + "epoch": 6.87292817679558, + "grad_norm": 0.20662687718868256, + "learning_rate": 2.3529895391283742e-05, + "loss": 1.7284, + "step": 22392 + }, + { + "epoch": 6.873235113566605, + "grad_norm": 0.2392960786819458, + "learning_rate": 2.3525678643040235e-05, + "loss": 1.7207, + "step": 22393 + }, + { + "epoch": 6.8735420503376305, + "grad_norm": 0.2067870795726776, + "learning_rate": 2.3521462156431452e-05, + "loss": 1.7269, + "step": 22394 + }, + { + "epoch": 6.873848987108656, + "grad_norm": 0.2544265687465668, + "learning_rate": 2.351724593149914e-05, + "loss": 1.7358, + "step": 22395 + }, + { + "epoch": 6.87415592387968, + "grad_norm": 0.2243366837501526, + "learning_rate": 2.3513029968284907e-05, + "loss": 1.7625, + "step": 22396 + }, + { + "epoch": 6.874462860650706, + "grad_norm": 0.23003467917442322, + "learning_rate": 2.3508814266830414e-05, + "loss": 1.6943, + "step": 22397 + }, + { + "epoch": 6.874769797421731, + "grad_norm": 0.19257886707782745, + "learning_rate": 2.3504598827177383e-05, + "loss": 1.7393, + "step": 22398 + }, + { + "epoch": 6.875076734192756, + "grad_norm": 0.23782171308994293, + "learning_rate": 2.3500383649367404e-05, + "loss": 1.7758, + "step": 22399 + }, + { + "epoch": 6.875383670963782, + 
"grad_norm": 0.18137066066265106, + "learning_rate": 2.3496168733442197e-05, + "loss": 1.7083, + "step": 22400 + }, + { + "epoch": 6.875690607734807, + "grad_norm": 0.21970662474632263, + "learning_rate": 2.3491954079443344e-05, + "loss": 1.7552, + "step": 22401 + }, + { + "epoch": 6.8759975445058314, + "grad_norm": 0.2032134085893631, + "learning_rate": 2.3487739687412562e-05, + "loss": 1.7653, + "step": 22402 + }, + { + "epoch": 6.876304481276857, + "grad_norm": 0.22016118466854095, + "learning_rate": 2.348352555739148e-05, + "loss": 1.7277, + "step": 22403 + }, + { + "epoch": 6.876611418047882, + "grad_norm": 0.2250203788280487, + "learning_rate": 2.3479311689421736e-05, + "loss": 1.7451, + "step": 22404 + }, + { + "epoch": 6.8769183548189075, + "grad_norm": 0.19726359844207764, + "learning_rate": 2.3475098083544977e-05, + "loss": 1.728, + "step": 22405 + }, + { + "epoch": 6.877225291589933, + "grad_norm": 0.21295994520187378, + "learning_rate": 2.3470884739802844e-05, + "loss": 1.7438, + "step": 22406 + }, + { + "epoch": 6.877532228360957, + "grad_norm": 0.19653508067131042, + "learning_rate": 2.346667165823698e-05, + "loss": 1.7189, + "step": 22407 + }, + { + "epoch": 6.877839165131983, + "grad_norm": 0.21406517922878265, + "learning_rate": 2.3462458838889016e-05, + "loss": 1.7475, + "step": 22408 + }, + { + "epoch": 6.878146101903008, + "grad_norm": 0.20569753646850586, + "learning_rate": 2.3458246281800595e-05, + "loss": 1.7262, + "step": 22409 + }, + { + "epoch": 6.878453038674033, + "grad_norm": 0.19365517795085907, + "learning_rate": 2.3454033987013334e-05, + "loss": 1.6938, + "step": 22410 + }, + { + "epoch": 6.878759975445059, + "grad_norm": 0.20935405790805817, + "learning_rate": 2.344982195456885e-05, + "loss": 1.724, + "step": 22411 + }, + { + "epoch": 6.879066912216084, + "grad_norm": 0.2104228436946869, + "learning_rate": 2.3445610184508826e-05, + "loss": 1.7474, + "step": 22412 + }, + { + "epoch": 6.879373848987108, + "grad_norm": 
0.19795742630958557, + "learning_rate": 2.3441398676874826e-05, + "loss": 1.7572, + "step": 22413 + }, + { + "epoch": 6.879680785758134, + "grad_norm": 0.20640577375888824, + "learning_rate": 2.3437187431708472e-05, + "loss": 1.7258, + "step": 22414 + }, + { + "epoch": 6.879987722529159, + "grad_norm": 0.2092565894126892, + "learning_rate": 2.3432976449051442e-05, + "loss": 1.7437, + "step": 22415 + }, + { + "epoch": 6.880294659300184, + "grad_norm": 0.2083825170993805, + "learning_rate": 2.3428765728945275e-05, + "loss": 1.7127, + "step": 22416 + }, + { + "epoch": 6.88060159607121, + "grad_norm": 0.20619866251945496, + "learning_rate": 2.3424555271431647e-05, + "loss": 1.7729, + "step": 22417 + }, + { + "epoch": 6.880908532842234, + "grad_norm": 0.22689959406852722, + "learning_rate": 2.3420345076552107e-05, + "loss": 1.7142, + "step": 22418 + }, + { + "epoch": 6.8812154696132595, + "grad_norm": 0.16664449870586395, + "learning_rate": 2.3416135144348316e-05, + "loss": 1.6857, + "step": 22419 + }, + { + "epoch": 6.881522406384285, + "grad_norm": 0.1895827353000641, + "learning_rate": 2.3411925474861856e-05, + "loss": 1.7075, + "step": 22420 + }, + { + "epoch": 6.88182934315531, + "grad_norm": 0.2058400958776474, + "learning_rate": 2.3407716068134334e-05, + "loss": 1.7623, + "step": 22421 + }, + { + "epoch": 6.8821362799263355, + "grad_norm": 0.18390826880931854, + "learning_rate": 2.3403506924207346e-05, + "loss": 1.6686, + "step": 22422 + }, + { + "epoch": 6.882443216697361, + "grad_norm": 0.1742098331451416, + "learning_rate": 2.3399298043122497e-05, + "loss": 1.6846, + "step": 22423 + }, + { + "epoch": 6.882750153468385, + "grad_norm": 0.18958622217178345, + "learning_rate": 2.3395089424921368e-05, + "loss": 1.7603, + "step": 22424 + }, + { + "epoch": 6.883057090239411, + "grad_norm": 0.21827174723148346, + "learning_rate": 2.3390881069645564e-05, + "loss": 1.6706, + "step": 22425 + }, + { + "epoch": 6.883364027010436, + "grad_norm": 0.17859303951263428, + 
"learning_rate": 2.338667297733667e-05, + "loss": 1.7612, + "step": 22426 + }, + { + "epoch": 6.883670963781461, + "grad_norm": 0.22383756935596466, + "learning_rate": 2.338246514803627e-05, + "loss": 1.7507, + "step": 22427 + }, + { + "epoch": 6.883977900552486, + "grad_norm": 0.20317313075065613, + "learning_rate": 2.3378257581785934e-05, + "loss": 1.6912, + "step": 22428 + }, + { + "epoch": 6.884284837323511, + "grad_norm": 0.20238614082336426, + "learning_rate": 2.3374050278627297e-05, + "loss": 1.7336, + "step": 22429 + }, + { + "epoch": 6.884591774094536, + "grad_norm": 0.2134159654378891, + "learning_rate": 2.336984323860188e-05, + "loss": 1.7252, + "step": 22430 + }, + { + "epoch": 6.884898710865562, + "grad_norm": 0.17153076827526093, + "learning_rate": 2.3365636461751277e-05, + "loss": 1.6769, + "step": 22431 + }, + { + "epoch": 6.885205647636587, + "grad_norm": 0.19001254439353943, + "learning_rate": 2.3361429948117075e-05, + "loss": 1.7812, + "step": 22432 + }, + { + "epoch": 6.885512584407612, + "grad_norm": 0.2074522078037262, + "learning_rate": 2.335722369774081e-05, + "loss": 1.7433, + "step": 22433 + }, + { + "epoch": 6.885819521178637, + "grad_norm": 0.22863705456256866, + "learning_rate": 2.3353017710664117e-05, + "loss": 1.7476, + "step": 22434 + }, + { + "epoch": 6.886126457949662, + "grad_norm": 0.19350804388523102, + "learning_rate": 2.334881198692848e-05, + "loss": 1.7071, + "step": 22435 + }, + { + "epoch": 6.8864333947206875, + "grad_norm": 0.22915633022785187, + "learning_rate": 2.3344606526575524e-05, + "loss": 1.7283, + "step": 22436 + }, + { + "epoch": 6.886740331491713, + "grad_norm": 0.21576058864593506, + "learning_rate": 2.3340401329646795e-05, + "loss": 1.7062, + "step": 22437 + }, + { + "epoch": 6.887047268262738, + "grad_norm": 0.17844067513942719, + "learning_rate": 2.333619639618384e-05, + "loss": 1.6994, + "step": 22438 + }, + { + "epoch": 6.887354205033763, + "grad_norm": 0.21019738912582397, + "learning_rate": 
2.333199172622822e-05, + "loss": 1.6654, + "step": 22439 + }, + { + "epoch": 6.887661141804788, + "grad_norm": 0.1901654452085495, + "learning_rate": 2.3327787319821486e-05, + "loss": 1.7847, + "step": 22440 + }, + { + "epoch": 6.887968078575813, + "grad_norm": 0.21838930249214172, + "learning_rate": 2.3323583177005198e-05, + "loss": 1.6517, + "step": 22441 + }, + { + "epoch": 6.888275015346839, + "grad_norm": 0.16078172624111176, + "learning_rate": 2.3319379297820892e-05, + "loss": 1.7052, + "step": 22442 + }, + { + "epoch": 6.888581952117864, + "grad_norm": 0.19161897897720337, + "learning_rate": 2.331517568231012e-05, + "loss": 1.675, + "step": 22443 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 0.1874416172504425, + "learning_rate": 2.331097233051442e-05, + "loss": 1.7025, + "step": 22444 + }, + { + "epoch": 6.889195825659914, + "grad_norm": 0.1817546933889389, + "learning_rate": 2.3306769242475318e-05, + "loss": 1.7103, + "step": 22445 + }, + { + "epoch": 6.889502762430939, + "grad_norm": 0.18423372507095337, + "learning_rate": 2.3302566418234406e-05, + "loss": 1.6883, + "step": 22446 + }, + { + "epoch": 6.889809699201964, + "grad_norm": 0.1712140440940857, + "learning_rate": 2.3298363857833162e-05, + "loss": 1.7076, + "step": 22447 + }, + { + "epoch": 6.89011663597299, + "grad_norm": 0.15992864966392517, + "learning_rate": 2.3294161561313133e-05, + "loss": 1.6514, + "step": 22448 + }, + { + "epoch": 6.890423572744015, + "grad_norm": 0.24126072227954865, + "learning_rate": 2.3289959528715855e-05, + "loss": 1.7385, + "step": 22449 + }, + { + "epoch": 6.8907305095150395, + "grad_norm": 0.18130798637866974, + "learning_rate": 2.3285757760082832e-05, + "loss": 1.691, + "step": 22450 + }, + { + "epoch": 6.891037446286065, + "grad_norm": 0.20070049166679382, + "learning_rate": 2.3281556255455644e-05, + "loss": 1.7166, + "step": 22451 + }, + { + "epoch": 6.89134438305709, + "grad_norm": 0.20706996321678162, + "learning_rate": 2.327735501487574e-05, + "loss": 
1.6763, + "step": 22452 + }, + { + "epoch": 6.8916513198281155, + "grad_norm": 0.22404810786247253, + "learning_rate": 2.327315403838472e-05, + "loss": 1.761, + "step": 22453 + }, + { + "epoch": 6.891958256599141, + "grad_norm": 0.21240194141864777, + "learning_rate": 2.3268953326024013e-05, + "loss": 1.7038, + "step": 22454 + }, + { + "epoch": 6.892265193370166, + "grad_norm": 0.24251966178417206, + "learning_rate": 2.32647528778352e-05, + "loss": 1.7829, + "step": 22455 + }, + { + "epoch": 6.892572130141191, + "grad_norm": 0.21213467419147491, + "learning_rate": 2.3260552693859765e-05, + "loss": 1.7433, + "step": 22456 + }, + { + "epoch": 6.892879066912216, + "grad_norm": 0.18008530139923096, + "learning_rate": 2.325635277413922e-05, + "loss": 1.7238, + "step": 22457 + }, + { + "epoch": 6.893186003683241, + "grad_norm": 0.18252789974212646, + "learning_rate": 2.325215311871508e-05, + "loss": 1.7143, + "step": 22458 + }, + { + "epoch": 6.893492940454267, + "grad_norm": 0.17830567061901093, + "learning_rate": 2.3247953727628833e-05, + "loss": 1.687, + "step": 22459 + }, + { + "epoch": 6.893799877225292, + "grad_norm": 0.19980686902999878, + "learning_rate": 2.3243754600921992e-05, + "loss": 1.7096, + "step": 22460 + }, + { + "epoch": 6.894106813996316, + "grad_norm": 0.1713438183069229, + "learning_rate": 2.3239555738636044e-05, + "loss": 1.6791, + "step": 22461 + }, + { + "epoch": 6.894413750767342, + "grad_norm": 0.17678281664848328, + "learning_rate": 2.3235357140812475e-05, + "loss": 1.6689, + "step": 22462 + }, + { + "epoch": 6.894720687538367, + "grad_norm": 0.20409992337226868, + "learning_rate": 2.3231158807492837e-05, + "loss": 1.7746, + "step": 22463 + }, + { + "epoch": 6.895027624309392, + "grad_norm": 0.19227825105190277, + "learning_rate": 2.3226960738718552e-05, + "loss": 1.7101, + "step": 22464 + }, + { + "epoch": 6.895334561080418, + "grad_norm": 0.24029433727264404, + "learning_rate": 2.3222762934531132e-05, + "loss": 1.7842, + "step": 22465 + }, + 
{ + "epoch": 6.895641497851443, + "grad_norm": 0.21887856721878052, + "learning_rate": 2.321856539497207e-05, + "loss": 1.7032, + "step": 22466 + }, + { + "epoch": 6.8959484346224675, + "grad_norm": 0.17346082627773285, + "learning_rate": 2.321436812008282e-05, + "loss": 1.683, + "step": 22467 + }, + { + "epoch": 6.896255371393493, + "grad_norm": 0.18920177221298218, + "learning_rate": 2.3210171109904914e-05, + "loss": 1.7057, + "step": 22468 + }, + { + "epoch": 6.896562308164518, + "grad_norm": 0.21199388802051544, + "learning_rate": 2.320597436447977e-05, + "loss": 1.7534, + "step": 22469 + }, + { + "epoch": 6.8968692449355435, + "grad_norm": 0.1867530792951584, + "learning_rate": 2.320177788384893e-05, + "loss": 1.7185, + "step": 22470 + }, + { + "epoch": 6.897176181706568, + "grad_norm": 0.21009495854377747, + "learning_rate": 2.3197581668053785e-05, + "loss": 1.7379, + "step": 22471 + }, + { + "epoch": 6.897483118477593, + "grad_norm": 0.20078743994235992, + "learning_rate": 2.3193385717135874e-05, + "loss": 1.7226, + "step": 22472 + }, + { + "epoch": 6.897790055248619, + "grad_norm": 0.2135045975446701, + "learning_rate": 2.318919003113663e-05, + "loss": 1.7531, + "step": 22473 + }, + { + "epoch": 6.898096992019644, + "grad_norm": 0.18811136484146118, + "learning_rate": 2.3184994610097526e-05, + "loss": 1.6542, + "step": 22474 + }, + { + "epoch": 6.898403928790669, + "grad_norm": 0.2323937565088272, + "learning_rate": 2.3180799454060025e-05, + "loss": 1.7369, + "step": 22475 + }, + { + "epoch": 6.898710865561695, + "grad_norm": 0.19270992279052734, + "learning_rate": 2.317660456306558e-05, + "loss": 1.6818, + "step": 22476 + }, + { + "epoch": 6.899017802332719, + "grad_norm": 0.18951043486595154, + "learning_rate": 2.3172409937155654e-05, + "loss": 1.7183, + "step": 22477 + }, + { + "epoch": 6.899324739103744, + "grad_norm": 0.1758934110403061, + "learning_rate": 2.3168215576371694e-05, + "loss": 1.6826, + "step": 22478 + }, + { + "epoch": 6.89963167587477, + 
"grad_norm": 0.2048143893480301, + "learning_rate": 2.3164021480755133e-05, + "loss": 1.7769, + "step": 22479 + }, + { + "epoch": 6.899938612645795, + "grad_norm": 0.20538486540317535, + "learning_rate": 2.315982765034748e-05, + "loss": 1.7035, + "step": 22480 + }, + { + "epoch": 6.9002455494168204, + "grad_norm": 0.18417708575725555, + "learning_rate": 2.3155634085190124e-05, + "loss": 1.7533, + "step": 22481 + }, + { + "epoch": 6.900552486187845, + "grad_norm": 0.1978628784418106, + "learning_rate": 2.315144078532453e-05, + "loss": 1.691, + "step": 22482 + }, + { + "epoch": 6.90085942295887, + "grad_norm": 0.17665794491767883, + "learning_rate": 2.3147247750792128e-05, + "loss": 1.7018, + "step": 22483 + }, + { + "epoch": 6.901166359729896, + "grad_norm": 0.20218273997306824, + "learning_rate": 2.314305498163435e-05, + "loss": 1.7277, + "step": 22484 + }, + { + "epoch": 6.901473296500921, + "grad_norm": 0.18791642785072327, + "learning_rate": 2.3138862477892674e-05, + "loss": 1.7247, + "step": 22485 + }, + { + "epoch": 6.901780233271946, + "grad_norm": 0.1945842206478119, + "learning_rate": 2.313467023960847e-05, + "loss": 1.6648, + "step": 22486 + }, + { + "epoch": 6.902087170042972, + "grad_norm": 0.1871321201324463, + "learning_rate": 2.3130478266823237e-05, + "loss": 1.6978, + "step": 22487 + }, + { + "epoch": 6.902394106813996, + "grad_norm": 0.20094287395477295, + "learning_rate": 2.312628655957833e-05, + "loss": 1.7763, + "step": 22488 + }, + { + "epoch": 6.902701043585021, + "grad_norm": 0.1804366111755371, + "learning_rate": 2.3122095117915226e-05, + "loss": 1.689, + "step": 22489 + }, + { + "epoch": 6.903007980356047, + "grad_norm": 0.1846652776002884, + "learning_rate": 2.311790394187534e-05, + "loss": 1.7088, + "step": 22490 + }, + { + "epoch": 6.903314917127072, + "grad_norm": 0.18339675664901733, + "learning_rate": 2.311371303150008e-05, + "loss": 1.6974, + "step": 22491 + }, + { + "epoch": 6.903621853898097, + "grad_norm": 0.21333162486553192, + 
"learning_rate": 2.3109522386830863e-05, + "loss": 1.7614, + "step": 22492 + }, + { + "epoch": 6.903928790669122, + "grad_norm": 0.19845318794250488, + "learning_rate": 2.3105332007909104e-05, + "loss": 1.6895, + "step": 22493 + }, + { + "epoch": 6.904235727440147, + "grad_norm": 0.21082347631454468, + "learning_rate": 2.3101141894776224e-05, + "loss": 1.7397, + "step": 22494 + }, + { + "epoch": 6.9045426642111725, + "grad_norm": 0.16360893845558167, + "learning_rate": 2.3096952047473623e-05, + "loss": 1.6716, + "step": 22495 + }, + { + "epoch": 6.904849600982198, + "grad_norm": 0.2287478744983673, + "learning_rate": 2.3092762466042687e-05, + "loss": 1.7673, + "step": 22496 + }, + { + "epoch": 6.905156537753223, + "grad_norm": 0.17231078445911407, + "learning_rate": 2.308857315052489e-05, + "loss": 1.6744, + "step": 22497 + }, + { + "epoch": 6.9054634745242485, + "grad_norm": 0.2887173295021057, + "learning_rate": 2.3084384100961565e-05, + "loss": 1.7358, + "step": 22498 + }, + { + "epoch": 6.905770411295273, + "grad_norm": 0.1977192759513855, + "learning_rate": 2.3080195317394127e-05, + "loss": 1.7514, + "step": 22499 + }, + { + "epoch": 6.906077348066298, + "grad_norm": 0.24933035671710968, + "learning_rate": 2.307600679986398e-05, + "loss": 1.6845, + "step": 22500 + }, + { + "epoch": 6.906384284837324, + "grad_norm": 0.17288708686828613, + "learning_rate": 2.30718185484125e-05, + "loss": 1.7211, + "step": 22501 + }, + { + "epoch": 6.906691221608349, + "grad_norm": 0.22192007303237915, + "learning_rate": 2.306763056308112e-05, + "loss": 1.6924, + "step": 22502 + }, + { + "epoch": 6.906998158379373, + "grad_norm": 0.20500123500823975, + "learning_rate": 2.3063442843911172e-05, + "loss": 1.7412, + "step": 22503 + }, + { + "epoch": 6.907305095150399, + "grad_norm": 0.30658698081970215, + "learning_rate": 2.30592553909441e-05, + "loss": 1.7965, + "step": 22504 + }, + { + "epoch": 6.907612031921424, + "grad_norm": 0.177829772233963, + "learning_rate": 
2.3055068204221224e-05, + "loss": 1.6914, + "step": 22505 + }, + { + "epoch": 6.907918968692449, + "grad_norm": 0.20281876623630524, + "learning_rate": 2.3050881283783977e-05, + "loss": 1.6946, + "step": 22506 + }, + { + "epoch": 6.908225905463475, + "grad_norm": 0.16111700236797333, + "learning_rate": 2.3046694629673716e-05, + "loss": 1.7004, + "step": 22507 + }, + { + "epoch": 6.9085328422345, + "grad_norm": 0.1911575049161911, + "learning_rate": 2.3042508241931814e-05, + "loss": 1.7013, + "step": 22508 + }, + { + "epoch": 6.9088397790055245, + "grad_norm": 0.17862342298030853, + "learning_rate": 2.303832212059965e-05, + "loss": 1.7053, + "step": 22509 + }, + { + "epoch": 6.90914671577655, + "grad_norm": 0.2268948256969452, + "learning_rate": 2.303413626571858e-05, + "loss": 1.7241, + "step": 22510 + }, + { + "epoch": 6.909453652547575, + "grad_norm": 0.1997457593679428, + "learning_rate": 2.3029950677329992e-05, + "loss": 1.6927, + "step": 22511 + }, + { + "epoch": 6.9097605893186005, + "grad_norm": 0.22120819985866547, + "learning_rate": 2.3025765355475232e-05, + "loss": 1.7447, + "step": 22512 + }, + { + "epoch": 6.910067526089626, + "grad_norm": 0.22097964584827423, + "learning_rate": 2.302158030019565e-05, + "loss": 1.7399, + "step": 22513 + }, + { + "epoch": 6.91037446286065, + "grad_norm": 0.2171044498682022, + "learning_rate": 2.3017395511532664e-05, + "loss": 1.7252, + "step": 22514 + }, + { + "epoch": 6.910681399631676, + "grad_norm": 0.1987348347902298, + "learning_rate": 2.301321098952757e-05, + "loss": 1.7071, + "step": 22515 + }, + { + "epoch": 6.910988336402701, + "grad_norm": 0.2131081372499466, + "learning_rate": 2.3009026734221746e-05, + "loss": 1.7314, + "step": 22516 + }, + { + "epoch": 6.911295273173726, + "grad_norm": 0.18867900967597961, + "learning_rate": 2.3004842745656536e-05, + "loss": 1.7431, + "step": 22517 + }, + { + "epoch": 6.911602209944752, + "grad_norm": 0.22853058576583862, + "learning_rate": 2.3000659023873277e-05, + "loss": 
1.7234, + "step": 22518 + }, + { + "epoch": 6.911909146715777, + "grad_norm": 0.23441165685653687, + "learning_rate": 2.2996475568913366e-05, + "loss": 1.7535, + "step": 22519 + }, + { + "epoch": 6.912216083486801, + "grad_norm": 0.2376382052898407, + "learning_rate": 2.299229238081807e-05, + "loss": 1.7582, + "step": 22520 + }, + { + "epoch": 6.912523020257827, + "grad_norm": 0.2571510076522827, + "learning_rate": 2.2988109459628814e-05, + "loss": 1.722, + "step": 22521 + }, + { + "epoch": 6.912829957028852, + "grad_norm": 0.19782103598117828, + "learning_rate": 2.298392680538685e-05, + "loss": 1.7052, + "step": 22522 + }, + { + "epoch": 6.913136893799877, + "grad_norm": 0.24070625007152557, + "learning_rate": 2.297974441813358e-05, + "loss": 1.7306, + "step": 22523 + }, + { + "epoch": 6.913443830570903, + "grad_norm": 0.1783500611782074, + "learning_rate": 2.2975562297910307e-05, + "loss": 1.7077, + "step": 22524 + }, + { + "epoch": 6.913750767341927, + "grad_norm": 0.19469089806079865, + "learning_rate": 2.2971380444758373e-05, + "loss": 1.7275, + "step": 22525 + }, + { + "epoch": 6.9140577041129525, + "grad_norm": 0.21449480950832367, + "learning_rate": 2.2967198858719092e-05, + "loss": 1.7682, + "step": 22526 + }, + { + "epoch": 6.914364640883978, + "grad_norm": 0.21686261892318726, + "learning_rate": 2.2963017539833803e-05, + "loss": 1.6794, + "step": 22527 + }, + { + "epoch": 6.914671577655003, + "grad_norm": 0.2061273604631424, + "learning_rate": 2.2958836488143813e-05, + "loss": 1.7612, + "step": 22528 + }, + { + "epoch": 6.9149785144260285, + "grad_norm": 0.2708517611026764, + "learning_rate": 2.295465570369046e-05, + "loss": 1.7291, + "step": 22529 + }, + { + "epoch": 6.915285451197054, + "grad_norm": 0.17011860013008118, + "learning_rate": 2.295047518651503e-05, + "loss": 1.6541, + "step": 22530 + }, + { + "epoch": 6.915592387968078, + "grad_norm": 0.255305677652359, + "learning_rate": 2.294629493665889e-05, + "loss": 1.7063, + "step": 22531 + }, + { + 
"epoch": 6.915899324739104, + "grad_norm": 0.20172207057476044, + "learning_rate": 2.2942114954163306e-05, + "loss": 1.6678, + "step": 22532 + }, + { + "epoch": 6.916206261510129, + "grad_norm": 0.23726679384708405, + "learning_rate": 2.2937935239069603e-05, + "loss": 1.6762, + "step": 22533 + }, + { + "epoch": 6.916513198281154, + "grad_norm": 0.17716684937477112, + "learning_rate": 2.2933755791419082e-05, + "loss": 1.7302, + "step": 22534 + }, + { + "epoch": 6.91682013505218, + "grad_norm": 0.2513270974159241, + "learning_rate": 2.2929576611253035e-05, + "loss": 1.7371, + "step": 22535 + }, + { + "epoch": 6.917127071823204, + "grad_norm": 0.21994394063949585, + "learning_rate": 2.292539769861281e-05, + "loss": 1.7007, + "step": 22536 + }, + { + "epoch": 6.917434008594229, + "grad_norm": 0.2095540314912796, + "learning_rate": 2.292121905353964e-05, + "loss": 1.71, + "step": 22537 + }, + { + "epoch": 6.917740945365255, + "grad_norm": 0.24400855600833893, + "learning_rate": 2.2917040676074892e-05, + "loss": 1.7859, + "step": 22538 + }, + { + "epoch": 6.91804788213628, + "grad_norm": 0.23217935860157013, + "learning_rate": 2.2912862566259785e-05, + "loss": 1.8218, + "step": 22539 + }, + { + "epoch": 6.918354818907305, + "grad_norm": 0.23555497825145721, + "learning_rate": 2.2908684724135666e-05, + "loss": 1.7145, + "step": 22540 + }, + { + "epoch": 6.918661755678331, + "grad_norm": 0.17844347655773163, + "learning_rate": 2.2904507149743804e-05, + "loss": 1.6767, + "step": 22541 + }, + { + "epoch": 6.918968692449355, + "grad_norm": 0.20810428261756897, + "learning_rate": 2.290032984312548e-05, + "loss": 1.7359, + "step": 22542 + }, + { + "epoch": 6.9192756292203805, + "grad_norm": 0.20082542300224304, + "learning_rate": 2.289615280432198e-05, + "loss": 1.7623, + "step": 22543 + }, + { + "epoch": 6.919582565991406, + "grad_norm": 0.2005007117986679, + "learning_rate": 2.2891976033374584e-05, + "loss": 1.745, + "step": 22544 + }, + { + "epoch": 6.919889502762431, + 
"grad_norm": 0.18054969608783722, + "learning_rate": 2.2887799530324572e-05, + "loss": 1.6959, + "step": 22545 + }, + { + "epoch": 6.920196439533456, + "grad_norm": 0.18410442769527435, + "learning_rate": 2.2883623295213214e-05, + "loss": 1.7052, + "step": 22546 + }, + { + "epoch": 6.920503376304481, + "grad_norm": 0.17380426824092865, + "learning_rate": 2.2879447328081765e-05, + "loss": 1.6735, + "step": 22547 + }, + { + "epoch": 6.920810313075506, + "grad_norm": 0.19082246720790863, + "learning_rate": 2.2875271628971557e-05, + "loss": 1.7192, + "step": 22548 + }, + { + "epoch": 6.921117249846532, + "grad_norm": 0.17682792246341705, + "learning_rate": 2.2871096197923784e-05, + "loss": 1.649, + "step": 22549 + }, + { + "epoch": 6.921424186617557, + "grad_norm": 0.19127340614795685, + "learning_rate": 2.286692103497975e-05, + "loss": 1.7366, + "step": 22550 + }, + { + "epoch": 6.921731123388582, + "grad_norm": 0.1636040210723877, + "learning_rate": 2.2862746140180696e-05, + "loss": 1.6749, + "step": 22551 + }, + { + "epoch": 6.922038060159607, + "grad_norm": 0.2121013104915619, + "learning_rate": 2.285857151356788e-05, + "loss": 1.7342, + "step": 22552 + }, + { + "epoch": 6.922344996930632, + "grad_norm": 0.19183295965194702, + "learning_rate": 2.28543971551826e-05, + "loss": 1.7506, + "step": 22553 + }, + { + "epoch": 6.922651933701657, + "grad_norm": 0.23838891088962555, + "learning_rate": 2.285022306506604e-05, + "loss": 1.6875, + "step": 22554 + }, + { + "epoch": 6.922958870472683, + "grad_norm": 0.17147624492645264, + "learning_rate": 2.2846049243259526e-05, + "loss": 1.7074, + "step": 22555 + }, + { + "epoch": 6.923265807243708, + "grad_norm": 0.2254270762205124, + "learning_rate": 2.2841875689804236e-05, + "loss": 1.7589, + "step": 22556 + }, + { + "epoch": 6.9235727440147325, + "grad_norm": 0.249015673995018, + "learning_rate": 2.2837702404741462e-05, + "loss": 1.7708, + "step": 22557 + }, + { + "epoch": 6.923879680785758, + "grad_norm": 0.19401927292346954, 
+ "learning_rate": 2.283352938811244e-05, + "loss": 1.696, + "step": 22558 + }, + { + "epoch": 6.924186617556783, + "grad_norm": 0.21134993433952332, + "learning_rate": 2.2829356639958398e-05, + "loss": 1.7136, + "step": 22559 + }, + { + "epoch": 6.9244935543278086, + "grad_norm": 0.17600105702877045, + "learning_rate": 2.2825184160320578e-05, + "loss": 1.679, + "step": 22560 + }, + { + "epoch": 6.924800491098834, + "grad_norm": 0.2426912486553192, + "learning_rate": 2.282101194924022e-05, + "loss": 1.7011, + "step": 22561 + }, + { + "epoch": 6.925107427869859, + "grad_norm": 0.20040342211723328, + "learning_rate": 2.281684000675855e-05, + "loss": 1.6844, + "step": 22562 + }, + { + "epoch": 6.925414364640884, + "grad_norm": 0.23790770769119263, + "learning_rate": 2.2812668332916798e-05, + "loss": 1.7318, + "step": 22563 + }, + { + "epoch": 6.925721301411909, + "grad_norm": 0.21387948095798492, + "learning_rate": 2.2808496927756196e-05, + "loss": 1.6903, + "step": 22564 + }, + { + "epoch": 6.926028238182934, + "grad_norm": 0.20471405982971191, + "learning_rate": 2.280432579131796e-05, + "loss": 1.7231, + "step": 22565 + }, + { + "epoch": 6.92633517495396, + "grad_norm": 0.1953156590461731, + "learning_rate": 2.280015492364332e-05, + "loss": 1.7322, + "step": 22566 + }, + { + "epoch": 6.926642111724985, + "grad_norm": 0.3107415437698364, + "learning_rate": 2.279598432477349e-05, + "loss": 1.7833, + "step": 22567 + }, + { + "epoch": 6.9269490484960095, + "grad_norm": 0.2114095836877823, + "learning_rate": 2.279181399474969e-05, + "loss": 1.6923, + "step": 22568 + }, + { + "epoch": 6.927255985267035, + "grad_norm": 0.21373972296714783, + "learning_rate": 2.2787643933613107e-05, + "loss": 1.6897, + "step": 22569 + }, + { + "epoch": 6.92756292203806, + "grad_norm": 0.17955096065998077, + "learning_rate": 2.278347414140502e-05, + "loss": 1.7443, + "step": 22570 + }, + { + "epoch": 6.9278698588090855, + "grad_norm": 0.19275230169296265, + "learning_rate": 
2.2779304618166554e-05, + "loss": 1.7109, + "step": 22571 + }, + { + "epoch": 6.928176795580111, + "grad_norm": 0.16774436831474304, + "learning_rate": 2.277513536393899e-05, + "loss": 1.7059, + "step": 22572 + }, + { + "epoch": 6.928483732351136, + "grad_norm": 0.25093573331832886, + "learning_rate": 2.2770966378763457e-05, + "loss": 1.7501, + "step": 22573 + }, + { + "epoch": 6.928790669122161, + "grad_norm": 0.24859540164470673, + "learning_rate": 2.2766797662681216e-05, + "loss": 1.7315, + "step": 22574 + }, + { + "epoch": 6.929097605893186, + "grad_norm": 0.1736115962266922, + "learning_rate": 2.2762629215733438e-05, + "loss": 1.7422, + "step": 22575 + }, + { + "epoch": 6.929404542664211, + "grad_norm": 0.23705001175403595, + "learning_rate": 2.2758461037961326e-05, + "loss": 1.7818, + "step": 22576 + }, + { + "epoch": 6.929711479435237, + "grad_norm": 0.21123656630516052, + "learning_rate": 2.2754293129406073e-05, + "loss": 1.7652, + "step": 22577 + }, + { + "epoch": 6.930018416206261, + "grad_norm": 0.2195751667022705, + "learning_rate": 2.2750125490108858e-05, + "loss": 1.7103, + "step": 22578 + }, + { + "epoch": 6.930325352977286, + "grad_norm": 0.17324887216091156, + "learning_rate": 2.274595812011088e-05, + "loss": 1.7386, + "step": 22579 + }, + { + "epoch": 6.930632289748312, + "grad_norm": 0.3175726532936096, + "learning_rate": 2.2741791019453313e-05, + "loss": 1.7608, + "step": 22580 + }, + { + "epoch": 6.930939226519337, + "grad_norm": 0.26266980171203613, + "learning_rate": 2.273762418817734e-05, + "loss": 1.691, + "step": 22581 + }, + { + "epoch": 6.931246163290362, + "grad_norm": 0.21905983984470367, + "learning_rate": 2.273345762632415e-05, + "loss": 1.6886, + "step": 22582 + }, + { + "epoch": 6.931553100061388, + "grad_norm": 0.2201247364282608, + "learning_rate": 2.2729291333934914e-05, + "loss": 1.7313, + "step": 22583 + }, + { + "epoch": 6.931860036832412, + "grad_norm": 0.2844204306602478, + "learning_rate": 2.2725125311050805e-05, + "loss": 
1.6918, + "step": 22584 + }, + { + "epoch": 6.9321669736034375, + "grad_norm": 0.22451715171337128, + "learning_rate": 2.272095955771299e-05, + "loss": 1.699, + "step": 22585 + }, + { + "epoch": 6.932473910374463, + "grad_norm": 0.27357545495033264, + "learning_rate": 2.2716794073962645e-05, + "loss": 1.7709, + "step": 22586 + }, + { + "epoch": 6.932780847145488, + "grad_norm": 0.2605188190937042, + "learning_rate": 2.271262885984093e-05, + "loss": 1.7812, + "step": 22587 + }, + { + "epoch": 6.9330877839165135, + "grad_norm": 0.1866278201341629, + "learning_rate": 2.270846391538899e-05, + "loss": 1.7204, + "step": 22588 + }, + { + "epoch": 6.933394720687538, + "grad_norm": 0.24624690413475037, + "learning_rate": 2.2704299240648043e-05, + "loss": 1.7345, + "step": 22589 + }, + { + "epoch": 6.933701657458563, + "grad_norm": 0.18003861606121063, + "learning_rate": 2.2700134835659175e-05, + "loss": 1.73, + "step": 22590 + }, + { + "epoch": 6.934008594229589, + "grad_norm": 0.2330949604511261, + "learning_rate": 2.269597070046359e-05, + "loss": 1.7614, + "step": 22591 + }, + { + "epoch": 6.934315531000614, + "grad_norm": 0.18806515634059906, + "learning_rate": 2.269180683510243e-05, + "loss": 1.7364, + "step": 22592 + }, + { + "epoch": 6.934622467771639, + "grad_norm": 0.23998546600341797, + "learning_rate": 2.268764323961684e-05, + "loss": 1.6858, + "step": 22593 + }, + { + "epoch": 6.934929404542665, + "grad_norm": 0.1707296371459961, + "learning_rate": 2.268347991404797e-05, + "loss": 1.6703, + "step": 22594 + }, + { + "epoch": 6.935236341313689, + "grad_norm": 0.19724871218204498, + "learning_rate": 2.267931685843696e-05, + "loss": 1.7338, + "step": 22595 + }, + { + "epoch": 6.935543278084714, + "grad_norm": 0.20384611189365387, + "learning_rate": 2.2675154072824955e-05, + "loss": 1.7224, + "step": 22596 + }, + { + "epoch": 6.93585021485574, + "grad_norm": 0.18632391095161438, + "learning_rate": 2.2670991557253092e-05, + "loss": 1.7006, + "step": 22597 + }, + { + 
"epoch": 6.936157151626765, + "grad_norm": 0.22928105294704437, + "learning_rate": 2.2666829311762505e-05, + "loss": 1.7462, + "step": 22598 + }, + { + "epoch": 6.93646408839779, + "grad_norm": 0.1905689388513565, + "learning_rate": 2.266266733639434e-05, + "loss": 1.7071, + "step": 22599 + }, + { + "epoch": 6.936771025168815, + "grad_norm": 0.2051437795162201, + "learning_rate": 2.2658505631189708e-05, + "loss": 1.6872, + "step": 22600 + }, + { + "epoch": 6.93707796193984, + "grad_norm": 0.178196981549263, + "learning_rate": 2.265434419618976e-05, + "loss": 1.7044, + "step": 22601 + }, + { + "epoch": 6.9373848987108655, + "grad_norm": 0.21399027109146118, + "learning_rate": 2.26501830314356e-05, + "loss": 1.7529, + "step": 22602 + }, + { + "epoch": 6.937691835481891, + "grad_norm": 0.21747443079948425, + "learning_rate": 2.264602213696837e-05, + "loss": 1.7662, + "step": 22603 + }, + { + "epoch": 6.937998772252916, + "grad_norm": 0.1939898282289505, + "learning_rate": 2.2641861512829177e-05, + "loss": 1.7194, + "step": 22604 + }, + { + "epoch": 6.9383057090239415, + "grad_norm": 0.2183499038219452, + "learning_rate": 2.2637701159059128e-05, + "loss": 1.6659, + "step": 22605 + }, + { + "epoch": 6.938612645794966, + "grad_norm": 0.21971984207630157, + "learning_rate": 2.2633541075699387e-05, + "loss": 1.7729, + "step": 22606 + }, + { + "epoch": 6.938919582565991, + "grad_norm": 0.2611743211746216, + "learning_rate": 2.2629381262790998e-05, + "loss": 1.8, + "step": 22607 + }, + { + "epoch": 6.939226519337017, + "grad_norm": 0.22962158918380737, + "learning_rate": 2.2625221720375144e-05, + "loss": 1.7244, + "step": 22608 + }, + { + "epoch": 6.939533456108042, + "grad_norm": 0.20961032807826996, + "learning_rate": 2.2621062448492858e-05, + "loss": 1.7107, + "step": 22609 + }, + { + "epoch": 6.939840392879067, + "grad_norm": 0.2370155155658722, + "learning_rate": 2.2616903447185293e-05, + "loss": 1.7185, + "step": 22610 + }, + { + "epoch": 6.940147329650092, + 
"grad_norm": 0.19033893942832947, + "learning_rate": 2.2612744716493544e-05, + "loss": 1.7034, + "step": 22611 + }, + { + "epoch": 6.940454266421117, + "grad_norm": 0.22657649219036102, + "learning_rate": 2.2608586256458704e-05, + "loss": 1.6987, + "step": 22612 + }, + { + "epoch": 6.940761203192142, + "grad_norm": 0.17767953872680664, + "learning_rate": 2.2604428067121862e-05, + "loss": 1.6934, + "step": 22613 + }, + { + "epoch": 6.941068139963168, + "grad_norm": 0.209768146276474, + "learning_rate": 2.2600270148524123e-05, + "loss": 1.7148, + "step": 22614 + }, + { + "epoch": 6.941375076734193, + "grad_norm": 0.21234147250652313, + "learning_rate": 2.2596112500706574e-05, + "loss": 1.7147, + "step": 22615 + }, + { + "epoch": 6.941682013505218, + "grad_norm": 0.17608872056007385, + "learning_rate": 2.2591955123710307e-05, + "loss": 1.6873, + "step": 22616 + }, + { + "epoch": 6.941988950276243, + "grad_norm": 0.1743561178445816, + "learning_rate": 2.25877980175764e-05, + "loss": 1.7273, + "step": 22617 + }, + { + "epoch": 6.942295887047268, + "grad_norm": 0.22064091265201569, + "learning_rate": 2.258364118234594e-05, + "loss": 1.7785, + "step": 22618 + }, + { + "epoch": 6.9426028238182935, + "grad_norm": 0.20353585481643677, + "learning_rate": 2.2579484618060005e-05, + "loss": 1.7518, + "step": 22619 + }, + { + "epoch": 6.942909760589319, + "grad_norm": 0.23978710174560547, + "learning_rate": 2.2575328324759676e-05, + "loss": 1.7576, + "step": 22620 + }, + { + "epoch": 6.943216697360343, + "grad_norm": 0.24991966784000397, + "learning_rate": 2.257117230248603e-05, + "loss": 1.7383, + "step": 22621 + }, + { + "epoch": 6.943523634131369, + "grad_norm": 0.20734381675720215, + "learning_rate": 2.256701655128011e-05, + "loss": 1.7063, + "step": 22622 + }, + { + "epoch": 6.943830570902394, + "grad_norm": 0.20097215473651886, + "learning_rate": 2.2562861071183057e-05, + "loss": 1.7647, + "step": 22623 + }, + { + "epoch": 6.944137507673419, + "grad_norm": 
0.20144836604595184, + "learning_rate": 2.2558705862235852e-05, + "loss": 1.7165, + "step": 22624 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.20394138991832733, + "learning_rate": 2.255455092447964e-05, + "loss": 1.7048, + "step": 22625 + }, + { + "epoch": 6.94475138121547, + "grad_norm": 0.21430160105228424, + "learning_rate": 2.2550396257955396e-05, + "loss": 1.7233, + "step": 22626 + }, + { + "epoch": 6.945058317986494, + "grad_norm": 0.19071494042873383, + "learning_rate": 2.254624186270425e-05, + "loss": 1.7407, + "step": 22627 + }, + { + "epoch": 6.94536525475752, + "grad_norm": 0.19658641517162323, + "learning_rate": 2.2542087738767232e-05, + "loss": 1.6371, + "step": 22628 + }, + { + "epoch": 6.945672191528545, + "grad_norm": 0.19009098410606384, + "learning_rate": 2.25379338861854e-05, + "loss": 1.7515, + "step": 22629 + }, + { + "epoch": 6.94597912829957, + "grad_norm": 0.21250933408737183, + "learning_rate": 2.2533780304999796e-05, + "loss": 1.7308, + "step": 22630 + }, + { + "epoch": 6.946286065070596, + "grad_norm": 0.22148491442203522, + "learning_rate": 2.2529626995251475e-05, + "loss": 1.705, + "step": 22631 + }, + { + "epoch": 6.94659300184162, + "grad_norm": 0.190248504281044, + "learning_rate": 2.252547395698148e-05, + "loss": 1.7507, + "step": 22632 + }, + { + "epoch": 6.9468999386126455, + "grad_norm": 0.20005743205547333, + "learning_rate": 2.2521321190230855e-05, + "loss": 1.7622, + "step": 22633 + }, + { + "epoch": 6.947206875383671, + "grad_norm": 0.24233438074588776, + "learning_rate": 2.251716869504064e-05, + "loss": 1.7119, + "step": 22634 + }, + { + "epoch": 6.947513812154696, + "grad_norm": 0.20823299884796143, + "learning_rate": 2.2513016471451874e-05, + "loss": 1.69, + "step": 22635 + }, + { + "epoch": 6.9478207489257215, + "grad_norm": 0.21486341953277588, + "learning_rate": 2.250886451950559e-05, + "loss": 1.6528, + "step": 22636 + }, + { + "epoch": 6.948127685696747, + "grad_norm": 0.22201848030090332, + 
"learning_rate": 2.2504712839242813e-05, + "loss": 1.7454, + "step": 22637 + }, + { + "epoch": 6.948434622467771, + "grad_norm": 0.25179341435432434, + "learning_rate": 2.2500561430704588e-05, + "loss": 1.7226, + "step": 22638 + }, + { + "epoch": 6.948741559238797, + "grad_norm": 0.2510581910610199, + "learning_rate": 2.2496410293931913e-05, + "loss": 1.7048, + "step": 22639 + }, + { + "epoch": 6.949048496009822, + "grad_norm": 0.2406487911939621, + "learning_rate": 2.2492259428965866e-05, + "loss": 1.6751, + "step": 22640 + }, + { + "epoch": 6.949355432780847, + "grad_norm": 0.2555276155471802, + "learning_rate": 2.24881088358474e-05, + "loss": 1.7369, + "step": 22641 + }, + { + "epoch": 6.949662369551873, + "grad_norm": 0.19703364372253418, + "learning_rate": 2.2483958514617597e-05, + "loss": 1.7196, + "step": 22642 + }, + { + "epoch": 6.949969306322897, + "grad_norm": 0.18491938710212708, + "learning_rate": 2.2479808465317414e-05, + "loss": 1.6923, + "step": 22643 + }, + { + "epoch": 6.9502762430939224, + "grad_norm": 0.21588458120822906, + "learning_rate": 2.247565868798791e-05, + "loss": 1.6797, + "step": 22644 + }, + { + "epoch": 6.950583179864948, + "grad_norm": 0.18480601906776428, + "learning_rate": 2.247150918267008e-05, + "loss": 1.6672, + "step": 22645 + }, + { + "epoch": 6.950890116635973, + "grad_norm": 0.261846125125885, + "learning_rate": 2.246735994940493e-05, + "loss": 1.7594, + "step": 22646 + }, + { + "epoch": 6.9511970534069984, + "grad_norm": 0.24510261416435242, + "learning_rate": 2.2463210988233468e-05, + "loss": 1.7712, + "step": 22647 + }, + { + "epoch": 6.951503990178024, + "grad_norm": 0.25896379351615906, + "learning_rate": 2.24590622991967e-05, + "loss": 1.6811, + "step": 22648 + }, + { + "epoch": 6.951810926949048, + "grad_norm": 0.26284709572792053, + "learning_rate": 2.245491388233561e-05, + "loss": 1.7269, + "step": 22649 + }, + { + "epoch": 6.952117863720074, + "grad_norm": 0.1613062471151352, + "learning_rate": 
2.245076573769121e-05, + "loss": 1.6162, + "step": 22650 + }, + { + "epoch": 6.952424800491099, + "grad_norm": 0.203482523560524, + "learning_rate": 2.244661786530449e-05, + "loss": 1.7124, + "step": 22651 + }, + { + "epoch": 6.952731737262124, + "grad_norm": 0.18294258415699005, + "learning_rate": 2.2442470265216446e-05, + "loss": 1.7101, + "step": 22652 + }, + { + "epoch": 6.953038674033149, + "grad_norm": 0.1841319352388382, + "learning_rate": 2.2438322937468058e-05, + "loss": 1.723, + "step": 22653 + }, + { + "epoch": 6.953345610804174, + "grad_norm": 0.1600010097026825, + "learning_rate": 2.2434175882100322e-05, + "loss": 1.6867, + "step": 22654 + }, + { + "epoch": 6.953652547575199, + "grad_norm": 0.16904005408287048, + "learning_rate": 2.243002909915421e-05, + "loss": 1.6993, + "step": 22655 + }, + { + "epoch": 6.953959484346225, + "grad_norm": 0.20069406926631927, + "learning_rate": 2.2425882588670692e-05, + "loss": 1.6995, + "step": 22656 + }, + { + "epoch": 6.95426642111725, + "grad_norm": 0.170061394572258, + "learning_rate": 2.2421736350690808e-05, + "loss": 1.7217, + "step": 22657 + }, + { + "epoch": 6.954573357888275, + "grad_norm": 0.20549608767032623, + "learning_rate": 2.241759038525545e-05, + "loss": 1.7229, + "step": 22658 + }, + { + "epoch": 6.9548802946593, + "grad_norm": 0.20916205644607544, + "learning_rate": 2.241344469240566e-05, + "loss": 1.7499, + "step": 22659 + }, + { + "epoch": 6.955187231430325, + "grad_norm": 0.156641885638237, + "learning_rate": 2.2409299272182348e-05, + "loss": 1.6827, + "step": 22660 + }, + { + "epoch": 6.9554941682013505, + "grad_norm": 0.17876049876213074, + "learning_rate": 2.240515412462653e-05, + "loss": 1.6745, + "step": 22661 + }, + { + "epoch": 6.955801104972376, + "grad_norm": 0.17265759408473969, + "learning_rate": 2.2401009249779153e-05, + "loss": 1.7687, + "step": 22662 + }, + { + "epoch": 6.956108041743401, + "grad_norm": 0.18822525441646576, + "learning_rate": 2.2396864647681175e-05, + "loss": 
1.6974, + "step": 22663 + }, + { + "epoch": 6.956414978514426, + "grad_norm": 0.18686626851558685, + "learning_rate": 2.2392720318373567e-05, + "loss": 1.7522, + "step": 22664 + }, + { + "epoch": 6.956721915285451, + "grad_norm": 0.1668211668729782, + "learning_rate": 2.238857626189727e-05, + "loss": 1.7198, + "step": 22665 + }, + { + "epoch": 6.957028852056476, + "grad_norm": 0.23307017982006073, + "learning_rate": 2.238443247829325e-05, + "loss": 1.7377, + "step": 22666 + }, + { + "epoch": 6.957335788827502, + "grad_norm": 0.1771896481513977, + "learning_rate": 2.2380288967602453e-05, + "loss": 1.7626, + "step": 22667 + }, + { + "epoch": 6.957642725598527, + "grad_norm": 0.185984805226326, + "learning_rate": 2.237614572986583e-05, + "loss": 1.7328, + "step": 22668 + }, + { + "epoch": 6.957949662369552, + "grad_norm": 0.3076271414756775, + "learning_rate": 2.2372002765124327e-05, + "loss": 1.7081, + "step": 22669 + }, + { + "epoch": 6.958256599140577, + "grad_norm": 0.17874667048454285, + "learning_rate": 2.2367860073418885e-05, + "loss": 1.6752, + "step": 22670 + }, + { + "epoch": 6.958563535911602, + "grad_norm": 0.2044304609298706, + "learning_rate": 2.2363717654790445e-05, + "loss": 1.7325, + "step": 22671 + }, + { + "epoch": 6.958870472682627, + "grad_norm": 0.19335824251174927, + "learning_rate": 2.2359575509279945e-05, + "loss": 1.7192, + "step": 22672 + }, + { + "epoch": 6.959177409453653, + "grad_norm": 0.19514116644859314, + "learning_rate": 2.23554336369283e-05, + "loss": 1.7186, + "step": 22673 + }, + { + "epoch": 6.959484346224678, + "grad_norm": 0.2779110372066498, + "learning_rate": 2.23512920377765e-05, + "loss": 1.7391, + "step": 22674 + }, + { + "epoch": 6.9597912829957025, + "grad_norm": 0.17390480637550354, + "learning_rate": 2.2347150711865406e-05, + "loss": 1.6538, + "step": 22675 + }, + { + "epoch": 6.960098219766728, + "grad_norm": 0.1640262007713318, + "learning_rate": 2.234300965923601e-05, + "loss": 1.6534, + "step": 22676 + }, + { + 
"epoch": 6.960405156537753, + "grad_norm": 0.17519034445285797, + "learning_rate": 2.2338868879929165e-05, + "loss": 1.6931, + "step": 22677 + }, + { + "epoch": 6.9607120933087785, + "grad_norm": 0.16885873675346375, + "learning_rate": 2.2334728373985847e-05, + "loss": 1.7204, + "step": 22678 + }, + { + "epoch": 6.961019030079804, + "grad_norm": 0.16997110843658447, + "learning_rate": 2.2330588141446963e-05, + "loss": 1.7063, + "step": 22679 + }, + { + "epoch": 6.961325966850829, + "grad_norm": 0.17793773114681244, + "learning_rate": 2.2326448182353422e-05, + "loss": 1.7382, + "step": 22680 + }, + { + "epoch": 6.961632903621854, + "grad_norm": 0.1809101551771164, + "learning_rate": 2.2322308496746134e-05, + "loss": 1.6874, + "step": 22681 + }, + { + "epoch": 6.961939840392879, + "grad_norm": 0.19095295667648315, + "learning_rate": 2.2318169084666023e-05, + "loss": 1.7122, + "step": 22682 + }, + { + "epoch": 6.962246777163904, + "grad_norm": 0.19206218421459198, + "learning_rate": 2.2314029946153992e-05, + "loss": 1.6733, + "step": 22683 + }, + { + "epoch": 6.96255371393493, + "grad_norm": 0.21243152022361755, + "learning_rate": 2.2309891081250938e-05, + "loss": 1.7026, + "step": 22684 + }, + { + "epoch": 6.962860650705955, + "grad_norm": 0.17602933943271637, + "learning_rate": 2.2305752489997777e-05, + "loss": 1.7073, + "step": 22685 + }, + { + "epoch": 6.963167587476979, + "grad_norm": 0.21810807287693024, + "learning_rate": 2.2301614172435398e-05, + "loss": 1.7323, + "step": 22686 + }, + { + "epoch": 6.963474524248005, + "grad_norm": 0.20711791515350342, + "learning_rate": 2.2297476128604706e-05, + "loss": 1.7228, + "step": 22687 + }, + { + "epoch": 6.96378146101903, + "grad_norm": 0.20376695692539215, + "learning_rate": 2.2293338358546583e-05, + "loss": 1.715, + "step": 22688 + }, + { + "epoch": 6.964088397790055, + "grad_norm": 0.20096196234226227, + "learning_rate": 2.228920086230194e-05, + "loss": 1.7239, + "step": 22689 + }, + { + "epoch": 6.964395334561081, 
+ "grad_norm": 0.24215486645698547, + "learning_rate": 2.228506363991163e-05, + "loss": 1.7879, + "step": 22690 + }, + { + "epoch": 6.964702271332106, + "grad_norm": 0.1917567104101181, + "learning_rate": 2.2280926691416603e-05, + "loss": 1.6903, + "step": 22691 + }, + { + "epoch": 6.9650092081031305, + "grad_norm": 0.19827421009540558, + "learning_rate": 2.2276790016857673e-05, + "loss": 1.7654, + "step": 22692 + }, + { + "epoch": 6.965316144874156, + "grad_norm": 0.20852476358413696, + "learning_rate": 2.2272653616275784e-05, + "loss": 1.7452, + "step": 22693 + }, + { + "epoch": 6.965623081645181, + "grad_norm": 0.21223776042461395, + "learning_rate": 2.2268517489711755e-05, + "loss": 1.6973, + "step": 22694 + }, + { + "epoch": 6.9659300184162065, + "grad_norm": 0.1903543621301651, + "learning_rate": 2.22643816372065e-05, + "loss": 1.7398, + "step": 22695 + }, + { + "epoch": 6.966236955187231, + "grad_norm": 0.21726597845554352, + "learning_rate": 2.2260246058800888e-05, + "loss": 1.7813, + "step": 22696 + }, + { + "epoch": 6.966543891958256, + "grad_norm": 0.1710241734981537, + "learning_rate": 2.225611075453578e-05, + "loss": 1.6647, + "step": 22697 + }, + { + "epoch": 6.966850828729282, + "grad_norm": 0.199532151222229, + "learning_rate": 2.2251975724452045e-05, + "loss": 1.7503, + "step": 22698 + }, + { + "epoch": 6.967157765500307, + "grad_norm": 0.18966728448867798, + "learning_rate": 2.224784096859055e-05, + "loss": 1.8113, + "step": 22699 + }, + { + "epoch": 6.967464702271332, + "grad_norm": 0.1977413445711136, + "learning_rate": 2.2243706486992162e-05, + "loss": 1.7036, + "step": 22700 + }, + { + "epoch": 6.967771639042358, + "grad_norm": 0.1794840395450592, + "learning_rate": 2.223957227969773e-05, + "loss": 1.714, + "step": 22701 + }, + { + "epoch": 6.968078575813382, + "grad_norm": 0.1811632663011551, + "learning_rate": 2.2235438346748117e-05, + "loss": 1.6845, + "step": 22702 + }, + { + "epoch": 6.968385512584407, + "grad_norm": 0.17478540539741516, 
+ "learning_rate": 2.2231304688184172e-05, + "loss": 1.7078, + "step": 22703 + }, + { + "epoch": 6.968692449355433, + "grad_norm": 0.22631226480007172, + "learning_rate": 2.2227171304046756e-05, + "loss": 1.7576, + "step": 22704 + }, + { + "epoch": 6.968999386126458, + "grad_norm": 0.20498304069042206, + "learning_rate": 2.2223038194376712e-05, + "loss": 1.7342, + "step": 22705 + }, + { + "epoch": 6.969306322897483, + "grad_norm": 0.18556833267211914, + "learning_rate": 2.221890535921488e-05, + "loss": 1.6583, + "step": 22706 + }, + { + "epoch": 6.969613259668508, + "grad_norm": 0.19878216087818146, + "learning_rate": 2.221477279860209e-05, + "loss": 1.7536, + "step": 22707 + }, + { + "epoch": 6.969920196439533, + "grad_norm": 0.20304621756076813, + "learning_rate": 2.221064051257924e-05, + "loss": 1.7263, + "step": 22708 + }, + { + "epoch": 6.9702271332105585, + "grad_norm": 0.18725872039794922, + "learning_rate": 2.220650850118709e-05, + "loss": 1.7174, + "step": 22709 + }, + { + "epoch": 6.970534069981584, + "grad_norm": 0.28994759917259216, + "learning_rate": 2.2202376764466554e-05, + "loss": 1.7401, + "step": 22710 + }, + { + "epoch": 6.970841006752609, + "grad_norm": 0.19320951402187347, + "learning_rate": 2.2198245302458383e-05, + "loss": 1.7204, + "step": 22711 + }, + { + "epoch": 6.9711479435236345, + "grad_norm": 0.24737104773521423, + "learning_rate": 2.2194114115203464e-05, + "loss": 1.7418, + "step": 22712 + }, + { + "epoch": 6.971454880294659, + "grad_norm": 0.18811406195163727, + "learning_rate": 2.218998320274261e-05, + "loss": 1.6999, + "step": 22713 + }, + { + "epoch": 6.971761817065684, + "grad_norm": 0.20729362964630127, + "learning_rate": 2.2185852565116638e-05, + "loss": 1.6833, + "step": 22714 + }, + { + "epoch": 6.97206875383671, + "grad_norm": 0.1862284392118454, + "learning_rate": 2.2181722202366378e-05, + "loss": 1.7232, + "step": 22715 + }, + { + "epoch": 6.972375690607735, + "grad_norm": 0.24128347635269165, + "learning_rate": 
2.217759211453264e-05, + "loss": 1.7081, + "step": 22716 + }, + { + "epoch": 6.97268262737876, + "grad_norm": 0.2007059007883072, + "learning_rate": 2.217346230165625e-05, + "loss": 1.7383, + "step": 22717 + }, + { + "epoch": 6.972989564149785, + "grad_norm": 0.2177598625421524, + "learning_rate": 2.216933276377801e-05, + "loss": 1.7494, + "step": 22718 + }, + { + "epoch": 6.97329650092081, + "grad_norm": 0.20965704321861267, + "learning_rate": 2.2165203500938735e-05, + "loss": 1.7326, + "step": 22719 + }, + { + "epoch": 6.973603437691835, + "grad_norm": 0.17255879938602448, + "learning_rate": 2.2161074513179237e-05, + "loss": 1.6713, + "step": 22720 + }, + { + "epoch": 6.973910374462861, + "grad_norm": 0.21480637788772583, + "learning_rate": 2.215694580054032e-05, + "loss": 1.7248, + "step": 22721 + }, + { + "epoch": 6.974217311233886, + "grad_norm": 0.15835267305374146, + "learning_rate": 2.215281736306278e-05, + "loss": 1.7086, + "step": 22722 + }, + { + "epoch": 6.974524248004911, + "grad_norm": 0.20524290204048157, + "learning_rate": 2.2148689200787415e-05, + "loss": 1.7472, + "step": 22723 + }, + { + "epoch": 6.974831184775936, + "grad_norm": 0.16152524948120117, + "learning_rate": 2.214456131375502e-05, + "loss": 1.6373, + "step": 22724 + }, + { + "epoch": 6.975138121546961, + "grad_norm": 0.1995699107646942, + "learning_rate": 2.2140433702006425e-05, + "loss": 1.6949, + "step": 22725 + }, + { + "epoch": 6.975445058317987, + "grad_norm": 0.19927829504013062, + "learning_rate": 2.213630636558236e-05, + "loss": 1.7875, + "step": 22726 + }, + { + "epoch": 6.975751995089012, + "grad_norm": 0.19159351289272308, + "learning_rate": 2.213217930452368e-05, + "loss": 1.7067, + "step": 22727 + }, + { + "epoch": 6.976058931860036, + "grad_norm": 0.21832366287708282, + "learning_rate": 2.2128052518871107e-05, + "loss": 1.6952, + "step": 22728 + }, + { + "epoch": 6.976365868631062, + "grad_norm": 0.2433125376701355, + "learning_rate": 2.212392600866547e-05, + "loss": 
1.7503, + "step": 22729 + }, + { + "epoch": 6.976672805402087, + "grad_norm": 0.25504401326179504, + "learning_rate": 2.2119799773947535e-05, + "loss": 1.7289, + "step": 22730 + }, + { + "epoch": 6.976979742173112, + "grad_norm": 0.20463863015174866, + "learning_rate": 2.211567381475808e-05, + "loss": 1.7442, + "step": 22731 + }, + { + "epoch": 6.977286678944138, + "grad_norm": 0.21862375736236572, + "learning_rate": 2.2111548131137883e-05, + "loss": 1.7266, + "step": 22732 + }, + { + "epoch": 6.977593615715163, + "grad_norm": 0.2124018520116806, + "learning_rate": 2.210742272312771e-05, + "loss": 1.7555, + "step": 22733 + }, + { + "epoch": 6.9779005524861875, + "grad_norm": 0.2911135256290436, + "learning_rate": 2.2103297590768334e-05, + "loss": 1.711, + "step": 22734 + }, + { + "epoch": 6.978207489257213, + "grad_norm": 0.2172393649816513, + "learning_rate": 2.2099172734100525e-05, + "loss": 1.7054, + "step": 22735 + }, + { + "epoch": 6.978514426028238, + "grad_norm": 0.28964513540267944, + "learning_rate": 2.2095048153165043e-05, + "loss": 1.7231, + "step": 22736 + }, + { + "epoch": 6.9788213627992635, + "grad_norm": 0.2557905316352844, + "learning_rate": 2.209092384800265e-05, + "loss": 1.7219, + "step": 22737 + }, + { + "epoch": 6.979128299570289, + "grad_norm": 0.23358628153800964, + "learning_rate": 2.2086799818654102e-05, + "loss": 1.7627, + "step": 22738 + }, + { + "epoch": 6.979435236341313, + "grad_norm": 0.18856312334537506, + "learning_rate": 2.2082676065160163e-05, + "loss": 1.6577, + "step": 22739 + }, + { + "epoch": 6.979742173112339, + "grad_norm": 0.18412479758262634, + "learning_rate": 2.207855258756158e-05, + "loss": 1.6661, + "step": 22740 + }, + { + "epoch": 6.980049109883364, + "grad_norm": 0.20592401921749115, + "learning_rate": 2.207442938589911e-05, + "loss": 1.6737, + "step": 22741 + }, + { + "epoch": 6.980356046654389, + "grad_norm": 0.2015630006790161, + "learning_rate": 2.2070306460213493e-05, + "loss": 1.73, + "step": 22742 + }, + { + 
"epoch": 6.980662983425415, + "grad_norm": 0.23446126282215118, + "learning_rate": 2.2066183810545454e-05, + "loss": 1.7391, + "step": 22743 + }, + { + "epoch": 6.98096992019644, + "grad_norm": 0.1810954511165619, + "learning_rate": 2.2062061436935803e-05, + "loss": 1.689, + "step": 22744 + }, + { + "epoch": 6.981276856967464, + "grad_norm": 0.25031471252441406, + "learning_rate": 2.20579393394252e-05, + "loss": 1.8161, + "step": 22745 + }, + { + "epoch": 6.98158379373849, + "grad_norm": 0.183212012052536, + "learning_rate": 2.2053817518054433e-05, + "loss": 1.6494, + "step": 22746 + }, + { + "epoch": 6.981890730509515, + "grad_norm": 0.2115766555070877, + "learning_rate": 2.204969597286422e-05, + "loss": 1.6912, + "step": 22747 + }, + { + "epoch": 6.98219766728054, + "grad_norm": 0.19966226816177368, + "learning_rate": 2.2045574703895296e-05, + "loss": 1.7002, + "step": 22748 + }, + { + "epoch": 6.982504604051566, + "grad_norm": 0.20601172745227814, + "learning_rate": 2.2041453711188385e-05, + "loss": 1.7839, + "step": 22749 + }, + { + "epoch": 6.98281154082259, + "grad_norm": 0.2174808531999588, + "learning_rate": 2.2037332994784222e-05, + "loss": 1.7169, + "step": 22750 + }, + { + "epoch": 6.9831184775936155, + "grad_norm": 0.1921808421611786, + "learning_rate": 2.2033212554723514e-05, + "loss": 1.6754, + "step": 22751 + }, + { + "epoch": 6.983425414364641, + "grad_norm": 0.1977350264787674, + "learning_rate": 2.2029092391046997e-05, + "loss": 1.7408, + "step": 22752 + }, + { + "epoch": 6.983732351135666, + "grad_norm": 0.18366695940494537, + "learning_rate": 2.2024972503795383e-05, + "loss": 1.6818, + "step": 22753 + }, + { + "epoch": 6.9840392879066915, + "grad_norm": 0.18127809464931488, + "learning_rate": 2.2020852893009387e-05, + "loss": 1.7392, + "step": 22754 + }, + { + "epoch": 6.984346224677717, + "grad_norm": 0.1973503679037094, + "learning_rate": 2.2016733558729718e-05, + "loss": 1.7416, + "step": 22755 + }, + { + "epoch": 6.984653161448741, + 
"grad_norm": 0.1971634328365326, + "learning_rate": 2.2012614500997096e-05, + "loss": 1.7545, + "step": 22756 + }, + { + "epoch": 6.984960098219767, + "grad_norm": 0.17244087159633636, + "learning_rate": 2.2008495719852218e-05, + "loss": 1.7348, + "step": 22757 + }, + { + "epoch": 6.985267034990792, + "grad_norm": 0.19024424254894257, + "learning_rate": 2.200437721533579e-05, + "loss": 1.6647, + "step": 22758 + }, + { + "epoch": 6.985573971761817, + "grad_norm": 0.18455122411251068, + "learning_rate": 2.200025898748852e-05, + "loss": 1.7528, + "step": 22759 + }, + { + "epoch": 6.985880908532843, + "grad_norm": 0.24437187612056732, + "learning_rate": 2.199614103635108e-05, + "loss": 1.7101, + "step": 22760 + }, + { + "epoch": 6.986187845303867, + "grad_norm": 0.18844331800937653, + "learning_rate": 2.1992023361964224e-05, + "loss": 1.6864, + "step": 22761 + }, + { + "epoch": 6.986494782074892, + "grad_norm": 0.18768003582954407, + "learning_rate": 2.1987905964368576e-05, + "loss": 1.6482, + "step": 22762 + }, + { + "epoch": 6.986801718845918, + "grad_norm": 0.19491778314113617, + "learning_rate": 2.1983788843604898e-05, + "loss": 1.7106, + "step": 22763 + }, + { + "epoch": 6.987108655616943, + "grad_norm": 0.23565757274627686, + "learning_rate": 2.1979671999713797e-05, + "loss": 1.7362, + "step": 22764 + }, + { + "epoch": 6.987415592387968, + "grad_norm": 0.2097240835428238, + "learning_rate": 2.1975555432736018e-05, + "loss": 1.7305, + "step": 22765 + }, + { + "epoch": 6.987722529158994, + "grad_norm": 0.2171555608510971, + "learning_rate": 2.197143914271223e-05, + "loss": 1.7213, + "step": 22766 + }, + { + "epoch": 6.988029465930018, + "grad_norm": 0.1993926763534546, + "learning_rate": 2.196732312968311e-05, + "loss": 1.6901, + "step": 22767 + }, + { + "epoch": 6.9883364027010435, + "grad_norm": 0.2345978319644928, + "learning_rate": 2.1963207393689346e-05, + "loss": 1.7456, + "step": 22768 + }, + { + "epoch": 6.988643339472069, + "grad_norm": 
0.20831161737442017, + "learning_rate": 2.1959091934771564e-05, + "loss": 1.764, + "step": 22769 + }, + { + "epoch": 6.988950276243094, + "grad_norm": 0.24944809079170227, + "learning_rate": 2.195497675297049e-05, + "loss": 1.7398, + "step": 22770 + }, + { + "epoch": 6.989257213014119, + "grad_norm": 0.25463199615478516, + "learning_rate": 2.1950861848326777e-05, + "loss": 1.7002, + "step": 22771 + }, + { + "epoch": 6.989564149785144, + "grad_norm": 0.2298898696899414, + "learning_rate": 2.194674722088108e-05, + "loss": 1.755, + "step": 22772 + }, + { + "epoch": 6.989871086556169, + "grad_norm": 0.21839721500873566, + "learning_rate": 2.194263287067408e-05, + "loss": 1.6667, + "step": 22773 + }, + { + "epoch": 6.990178023327195, + "grad_norm": 0.2197437435388565, + "learning_rate": 2.1938518797746417e-05, + "loss": 1.6774, + "step": 22774 + }, + { + "epoch": 6.99048496009822, + "grad_norm": 0.23588024079799652, + "learning_rate": 2.1934405002138763e-05, + "loss": 1.6916, + "step": 22775 + }, + { + "epoch": 6.990791896869245, + "grad_norm": 0.20632316172122955, + "learning_rate": 2.1930291483891767e-05, + "loss": 1.6682, + "step": 22776 + }, + { + "epoch": 6.99109883364027, + "grad_norm": 0.22786293923854828, + "learning_rate": 2.192617824304607e-05, + "loss": 1.7138, + "step": 22777 + }, + { + "epoch": 6.991405770411295, + "grad_norm": 0.3235599994659424, + "learning_rate": 2.1922065279642363e-05, + "loss": 1.7545, + "step": 22778 + }, + { + "epoch": 6.99171270718232, + "grad_norm": 0.1919393390417099, + "learning_rate": 2.191795259372123e-05, + "loss": 1.7422, + "step": 22779 + }, + { + "epoch": 6.992019643953346, + "grad_norm": 0.16472585499286652, + "learning_rate": 2.1913840185323385e-05, + "loss": 1.6824, + "step": 22780 + }, + { + "epoch": 6.992326580724371, + "grad_norm": 0.21422579884529114, + "learning_rate": 2.1909728054489397e-05, + "loss": 1.696, + "step": 22781 + }, + { + "epoch": 6.9926335174953955, + "grad_norm": 0.18965782225131989, + 
"learning_rate": 2.190561620125996e-05, + "loss": 1.7026, + "step": 22782 + }, + { + "epoch": 6.992940454266421, + "grad_norm": 0.184856116771698, + "learning_rate": 2.190150462567569e-05, + "loss": 1.7202, + "step": 22783 + }, + { + "epoch": 6.993247391037446, + "grad_norm": 0.18382076919078827, + "learning_rate": 2.1897393327777223e-05, + "loss": 1.7525, + "step": 22784 + }, + { + "epoch": 6.9935543278084715, + "grad_norm": 0.17239750921726227, + "learning_rate": 2.1893282307605202e-05, + "loss": 1.7297, + "step": 22785 + }, + { + "epoch": 6.993861264579497, + "grad_norm": 0.18522322177886963, + "learning_rate": 2.18891715652002e-05, + "loss": 1.6952, + "step": 22786 + }, + { + "epoch": 6.994168201350522, + "grad_norm": 0.1946135014295578, + "learning_rate": 2.18850611006029e-05, + "loss": 1.6879, + "step": 22787 + }, + { + "epoch": 6.994475138121547, + "grad_norm": 0.2028069645166397, + "learning_rate": 2.188095091385391e-05, + "loss": 1.7412, + "step": 22788 + }, + { + "epoch": 6.994782074892572, + "grad_norm": 0.18794523179531097, + "learning_rate": 2.1876841004993838e-05, + "loss": 1.6936, + "step": 22789 + }, + { + "epoch": 6.995089011663597, + "grad_norm": 0.1912194788455963, + "learning_rate": 2.187273137406331e-05, + "loss": 1.7051, + "step": 22790 + }, + { + "epoch": 6.995395948434623, + "grad_norm": 0.1528688222169876, + "learning_rate": 2.1868622021102934e-05, + "loss": 1.6816, + "step": 22791 + }, + { + "epoch": 6.995702885205648, + "grad_norm": 0.2108357548713684, + "learning_rate": 2.1864512946153325e-05, + "loss": 1.7018, + "step": 22792 + }, + { + "epoch": 6.996009821976672, + "grad_norm": 0.16667310893535614, + "learning_rate": 2.1860404149255092e-05, + "loss": 1.7235, + "step": 22793 + }, + { + "epoch": 6.996316758747698, + "grad_norm": 0.16995872557163239, + "learning_rate": 2.185629563044882e-05, + "loss": 1.7086, + "step": 22794 + }, + { + "epoch": 6.996623695518723, + "grad_norm": 0.1962304711341858, + "learning_rate": 
2.1852187389775165e-05, + "loss": 1.7523, + "step": 22795 + }, + { + "epoch": 6.996930632289748, + "grad_norm": 0.17774102091789246, + "learning_rate": 2.1848079427274655e-05, + "loss": 1.6649, + "step": 22796 + }, + { + "epoch": 6.997237569060774, + "grad_norm": 0.18844567239284515, + "learning_rate": 2.184397174298796e-05, + "loss": 1.7281, + "step": 22797 + }, + { + "epoch": 6.997544505831799, + "grad_norm": 0.15324150025844574, + "learning_rate": 2.1839864336955607e-05, + "loss": 1.6496, + "step": 22798 + }, + { + "epoch": 6.9978514426028235, + "grad_norm": 0.25148099660873413, + "learning_rate": 2.1835757209218233e-05, + "loss": 1.7889, + "step": 22799 + }, + { + "epoch": 6.998158379373849, + "grad_norm": 0.22258763015270233, + "learning_rate": 2.1831650359816414e-05, + "loss": 1.7303, + "step": 22800 + }, + { + "epoch": 6.998465316144874, + "grad_norm": 0.21465472877025604, + "learning_rate": 2.182754378879074e-05, + "loss": 1.733, + "step": 22801 + }, + { + "epoch": 6.9987722529158995, + "grad_norm": 0.1894017904996872, + "learning_rate": 2.182343749618181e-05, + "loss": 1.7104, + "step": 22802 + }, + { + "epoch": 6.999079189686924, + "grad_norm": 0.19616369903087616, + "learning_rate": 2.181933148203014e-05, + "loss": 1.7015, + "step": 22803 + }, + { + "epoch": 6.999386126457949, + "grad_norm": 0.1720295250415802, + "learning_rate": 2.181522574637638e-05, + "loss": 1.6609, + "step": 22804 + }, + { + "epoch": 6.999693063228975, + "grad_norm": 0.2508579194545746, + "learning_rate": 2.1811120289261077e-05, + "loss": 1.7485, + "step": 22805 + }, + { + "epoch": 7.0, + "grad_norm": 0.1701229363679886, + "learning_rate": 2.1807015110724805e-05, + "loss": 1.6822, + "step": 22806 + }, + { + "epoch": 7.000306936771025, + "grad_norm": 0.17413921654224396, + "learning_rate": 2.1802910210808135e-05, + "loss": 1.6944, + "step": 22807 + }, + { + "epoch": 7.000613873542051, + "grad_norm": 0.22573722898960114, + "learning_rate": 2.179880558955163e-05, + "loss": 1.7499, + 
"step": 22808 + }, + { + "epoch": 7.000920810313075, + "grad_norm": 0.2477746456861496, + "learning_rate": 2.1794701246995857e-05, + "loss": 1.7663, + "step": 22809 + }, + { + "epoch": 7.0012277470841005, + "grad_norm": 0.15338411927223206, + "learning_rate": 2.1790597183181384e-05, + "loss": 1.6425, + "step": 22810 + }, + { + "epoch": 7.001534683855126, + "grad_norm": 0.2119540572166443, + "learning_rate": 2.1786493398148738e-05, + "loss": 1.6695, + "step": 22811 + }, + { + "epoch": 7.001841620626151, + "grad_norm": 0.283037930727005, + "learning_rate": 2.178238989193854e-05, + "loss": 1.7479, + "step": 22812 + }, + { + "epoch": 7.0021485573971765, + "grad_norm": 0.2939838767051697, + "learning_rate": 2.1778286664591276e-05, + "loss": 1.733, + "step": 22813 + }, + { + "epoch": 7.002455494168202, + "grad_norm": 0.21681749820709229, + "learning_rate": 2.1774183716147552e-05, + "loss": 1.6804, + "step": 22814 + }, + { + "epoch": 7.002762430939226, + "grad_norm": 0.29066696763038635, + "learning_rate": 2.177008104664785e-05, + "loss": 1.7435, + "step": 22815 + }, + { + "epoch": 7.003069367710252, + "grad_norm": 0.17104873061180115, + "learning_rate": 2.1765978656132773e-05, + "loss": 1.6637, + "step": 22816 + }, + { + "epoch": 7.003376304481277, + "grad_norm": 0.29808685183525085, + "learning_rate": 2.1761876544642846e-05, + "loss": 1.7342, + "step": 22817 + }, + { + "epoch": 7.003683241252302, + "grad_norm": 0.20467214286327362, + "learning_rate": 2.1757774712218603e-05, + "loss": 1.7638, + "step": 22818 + }, + { + "epoch": 7.003990178023328, + "grad_norm": 0.23166583478450775, + "learning_rate": 2.1753673158900607e-05, + "loss": 1.6972, + "step": 22819 + }, + { + "epoch": 7.004297114794352, + "grad_norm": 0.20098255574703217, + "learning_rate": 2.1749571884729332e-05, + "loss": 1.6973, + "step": 22820 + }, + { + "epoch": 7.004604051565377, + "grad_norm": 0.212421715259552, + "learning_rate": 2.1745470889745358e-05, + "loss": 1.7183, + "step": 22821 + }, + { + 
"epoch": 7.004910988336403, + "grad_norm": 0.2496720403432846, + "learning_rate": 2.17413701739892e-05, + "loss": 1.7928, + "step": 22822 + }, + { + "epoch": 7.005217925107428, + "grad_norm": 0.21050602197647095, + "learning_rate": 2.1737269737501394e-05, + "loss": 1.7379, + "step": 22823 + }, + { + "epoch": 7.005524861878453, + "grad_norm": 0.18321558833122253, + "learning_rate": 2.1733169580322448e-05, + "loss": 1.733, + "step": 22824 + }, + { + "epoch": 7.005831798649478, + "grad_norm": 0.19890302419662476, + "learning_rate": 2.1729069702492887e-05, + "loss": 1.6799, + "step": 22825 + }, + { + "epoch": 7.006138735420503, + "grad_norm": 0.19961030781269073, + "learning_rate": 2.172497010405323e-05, + "loss": 1.6754, + "step": 22826 + }, + { + "epoch": 7.0064456721915285, + "grad_norm": 0.19672131538391113, + "learning_rate": 2.1720870785043988e-05, + "loss": 1.7099, + "step": 22827 + }, + { + "epoch": 7.006752608962554, + "grad_norm": 0.16798892617225647, + "learning_rate": 2.1716771745505666e-05, + "loss": 1.7096, + "step": 22828 + }, + { + "epoch": 7.007059545733579, + "grad_norm": 0.2276654690504074, + "learning_rate": 2.1712672985478815e-05, + "loss": 1.7627, + "step": 22829 + }, + { + "epoch": 7.0073664825046045, + "grad_norm": 0.17108316719532013, + "learning_rate": 2.1708574505003872e-05, + "loss": 1.6941, + "step": 22830 + }, + { + "epoch": 7.007673419275629, + "grad_norm": 0.2094760239124298, + "learning_rate": 2.1704476304121413e-05, + "loss": 1.7152, + "step": 22831 + }, + { + "epoch": 7.007980356046654, + "grad_norm": 0.17183393239974976, + "learning_rate": 2.1700378382871872e-05, + "loss": 1.6668, + "step": 22832 + }, + { + "epoch": 7.00828729281768, + "grad_norm": 0.2075900435447693, + "learning_rate": 2.1696280741295795e-05, + "loss": 1.7732, + "step": 22833 + }, + { + "epoch": 7.008594229588705, + "grad_norm": 0.20075511932373047, + "learning_rate": 2.169218337943368e-05, + "loss": 1.7228, + "step": 22834 + }, + { + "epoch": 7.00890116635973, + 
"grad_norm": 0.19461359083652496, + "learning_rate": 2.168808629732596e-05, + "loss": 1.6942, + "step": 22835 + }, + { + "epoch": 7.009208103130755, + "grad_norm": 0.18972480297088623, + "learning_rate": 2.16839894950132e-05, + "loss": 1.7087, + "step": 22836 + }, + { + "epoch": 7.00951503990178, + "grad_norm": 0.19522632658481598, + "learning_rate": 2.167989297253582e-05, + "loss": 1.7427, + "step": 22837 + }, + { + "epoch": 7.009821976672805, + "grad_norm": 0.2088990956544876, + "learning_rate": 2.1675796729934355e-05, + "loss": 1.786, + "step": 22838 + }, + { + "epoch": 7.010128913443831, + "grad_norm": 0.2052021473646164, + "learning_rate": 2.167170076724927e-05, + "loss": 1.765, + "step": 22839 + }, + { + "epoch": 7.010435850214856, + "grad_norm": 0.19566771388053894, + "learning_rate": 2.1667605084521043e-05, + "loss": 1.703, + "step": 22840 + }, + { + "epoch": 7.0107427869858805, + "grad_norm": 0.24589677155017853, + "learning_rate": 2.166350968179014e-05, + "loss": 1.7544, + "step": 22841 + }, + { + "epoch": 7.011049723756906, + "grad_norm": 0.28059569001197815, + "learning_rate": 2.1659414559097053e-05, + "loss": 1.7081, + "step": 22842 + }, + { + "epoch": 7.011356660527931, + "grad_norm": 0.20781446993350983, + "learning_rate": 2.1655319716482237e-05, + "loss": 1.6968, + "step": 22843 + }, + { + "epoch": 7.0116635972989565, + "grad_norm": 0.31703317165374756, + "learning_rate": 2.1651225153986167e-05, + "loss": 1.704, + "step": 22844 + }, + { + "epoch": 7.011970534069982, + "grad_norm": 0.19668029248714447, + "learning_rate": 2.1647130871649283e-05, + "loss": 1.738, + "step": 22845 + }, + { + "epoch": 7.012277470841007, + "grad_norm": 0.3768141567707062, + "learning_rate": 2.1643036869512105e-05, + "loss": 1.7407, + "step": 22846 + }, + { + "epoch": 7.012584407612032, + "grad_norm": 0.22228674590587616, + "learning_rate": 2.1638943147615032e-05, + "loss": 1.7162, + "step": 22847 + }, + { + "epoch": 7.012891344383057, + "grad_norm": 0.26087433099746704, + 
"learning_rate": 2.1634849705998572e-05, + "loss": 1.6916, + "step": 22848 + }, + { + "epoch": 7.013198281154082, + "grad_norm": 0.19660449028015137, + "learning_rate": 2.1630756544703117e-05, + "loss": 1.7024, + "step": 22849 + }, + { + "epoch": 7.013505217925108, + "grad_norm": 0.2287406474351883, + "learning_rate": 2.1626663663769176e-05, + "loss": 1.6761, + "step": 22850 + }, + { + "epoch": 7.013812154696133, + "grad_norm": 0.18974192440509796, + "learning_rate": 2.162257106323719e-05, + "loss": 1.6721, + "step": 22851 + }, + { + "epoch": 7.014119091467157, + "grad_norm": 0.25081944465637207, + "learning_rate": 2.1618478743147558e-05, + "loss": 1.7042, + "step": 22852 + }, + { + "epoch": 7.014426028238183, + "grad_norm": 0.187479630112648, + "learning_rate": 2.1614386703540785e-05, + "loss": 1.7057, + "step": 22853 + }, + { + "epoch": 7.014732965009208, + "grad_norm": 0.24785932898521423, + "learning_rate": 2.1610294944457243e-05, + "loss": 1.8033, + "step": 22854 + }, + { + "epoch": 7.015039901780233, + "grad_norm": 0.21570228040218353, + "learning_rate": 2.160620346593743e-05, + "loss": 1.7129, + "step": 22855 + }, + { + "epoch": 7.015346838551259, + "grad_norm": 0.19304436445236206, + "learning_rate": 2.160211226802175e-05, + "loss": 1.7384, + "step": 22856 + }, + { + "epoch": 7.015653775322283, + "grad_norm": 0.18901783227920532, + "learning_rate": 2.1598021350750648e-05, + "loss": 1.6851, + "step": 22857 + }, + { + "epoch": 7.0159607120933085, + "grad_norm": 0.21754276752471924, + "learning_rate": 2.159393071416454e-05, + "loss": 1.7242, + "step": 22858 + }, + { + "epoch": 7.016267648864334, + "grad_norm": 0.18334844708442688, + "learning_rate": 2.1589840358303858e-05, + "loss": 1.66, + "step": 22859 + }, + { + "epoch": 7.016574585635359, + "grad_norm": 0.17688371241092682, + "learning_rate": 2.1585750283209026e-05, + "loss": 1.6693, + "step": 22860 + }, + { + "epoch": 7.0168815224063845, + "grad_norm": 0.17173215746879578, + "learning_rate": 
2.158166048892047e-05, + "loss": 1.675, + "step": 22861 + }, + { + "epoch": 7.01718845917741, + "grad_norm": 0.2144075632095337, + "learning_rate": 2.157757097547857e-05, + "loss": 1.7843, + "step": 22862 + }, + { + "epoch": 7.017495395948434, + "grad_norm": 0.18811818957328796, + "learning_rate": 2.1573481742923824e-05, + "loss": 1.6932, + "step": 22863 + }, + { + "epoch": 7.01780233271946, + "grad_norm": 0.19978533685207367, + "learning_rate": 2.1569392791296548e-05, + "loss": 1.7426, + "step": 22864 + }, + { + "epoch": 7.018109269490485, + "grad_norm": 0.19639068841934204, + "learning_rate": 2.1565304120637237e-05, + "loss": 1.7479, + "step": 22865 + }, + { + "epoch": 7.01841620626151, + "grad_norm": 0.2269967794418335, + "learning_rate": 2.1561215730986212e-05, + "loss": 1.7507, + "step": 22866 + }, + { + "epoch": 7.018723143032536, + "grad_norm": 0.19511014223098755, + "learning_rate": 2.1557127622383948e-05, + "loss": 1.7317, + "step": 22867 + }, + { + "epoch": 7.01903007980356, + "grad_norm": 0.23975026607513428, + "learning_rate": 2.1553039794870834e-05, + "loss": 1.7901, + "step": 22868 + }, + { + "epoch": 7.019337016574585, + "grad_norm": 0.20757955312728882, + "learning_rate": 2.154895224848722e-05, + "loss": 1.7823, + "step": 22869 + }, + { + "epoch": 7.019643953345611, + "grad_norm": 0.1893112063407898, + "learning_rate": 2.154486498327357e-05, + "loss": 1.6939, + "step": 22870 + }, + { + "epoch": 7.019950890116636, + "grad_norm": 0.23006685078144073, + "learning_rate": 2.1540777999270205e-05, + "loss": 1.8061, + "step": 22871 + }, + { + "epoch": 7.020257826887661, + "grad_norm": 0.25516194105148315, + "learning_rate": 2.1536691296517573e-05, + "loss": 1.6801, + "step": 22872 + }, + { + "epoch": 7.020564763658686, + "grad_norm": 0.2138557732105255, + "learning_rate": 2.153260487505604e-05, + "loss": 1.7689, + "step": 22873 + }, + { + "epoch": 7.020871700429711, + "grad_norm": 0.2618521749973297, + "learning_rate": 2.152851873492599e-05, + "loss": 
1.712, + "step": 22874 + }, + { + "epoch": 7.0211786372007365, + "grad_norm": 0.19639171659946442, + "learning_rate": 2.1524432876167812e-05, + "loss": 1.6883, + "step": 22875 + }, + { + "epoch": 7.021485573971762, + "grad_norm": 0.20283572375774384, + "learning_rate": 2.152034729882187e-05, + "loss": 1.7259, + "step": 22876 + }, + { + "epoch": 7.021792510742787, + "grad_norm": 0.247970849275589, + "learning_rate": 2.151626200292855e-05, + "loss": 1.6714, + "step": 22877 + }, + { + "epoch": 7.0220994475138125, + "grad_norm": 0.20877771079540253, + "learning_rate": 2.1512176988528227e-05, + "loss": 1.7378, + "step": 22878 + }, + { + "epoch": 7.022406384284837, + "grad_norm": 0.2515791356563568, + "learning_rate": 2.1508092255661245e-05, + "loss": 1.743, + "step": 22879 + }, + { + "epoch": 7.022713321055862, + "grad_norm": 0.21451319754123688, + "learning_rate": 2.150400780436804e-05, + "loss": 1.7102, + "step": 22880 + }, + { + "epoch": 7.023020257826888, + "grad_norm": 0.23944756388664246, + "learning_rate": 2.1499923634688886e-05, + "loss": 1.7739, + "step": 22881 + }, + { + "epoch": 7.023327194597913, + "grad_norm": 0.22423309087753296, + "learning_rate": 2.149583974666423e-05, + "loss": 1.7598, + "step": 22882 + }, + { + "epoch": 7.023634131368938, + "grad_norm": 0.31337371468544006, + "learning_rate": 2.1491756140334358e-05, + "loss": 1.7417, + "step": 22883 + }, + { + "epoch": 7.023941068139963, + "grad_norm": 0.22430868446826935, + "learning_rate": 2.148767281573968e-05, + "loss": 1.712, + "step": 22884 + }, + { + "epoch": 7.024248004910988, + "grad_norm": 0.26083487272262573, + "learning_rate": 2.148358977292054e-05, + "loss": 1.6816, + "step": 22885 + }, + { + "epoch": 7.024554941682013, + "grad_norm": 0.2283557504415512, + "learning_rate": 2.1479507011917255e-05, + "loss": 1.7539, + "step": 22886 + }, + { + "epoch": 7.024861878453039, + "grad_norm": 0.22732317447662354, + "learning_rate": 2.1475424532770232e-05, + "loss": 1.697, + "step": 22887 + }, + { + 
"epoch": 7.025168815224064, + "grad_norm": 0.19614318013191223, + "learning_rate": 2.1471342335519746e-05, + "loss": 1.7267, + "step": 22888 + }, + { + "epoch": 7.0254757519950894, + "grad_norm": 0.23076513409614563, + "learning_rate": 2.1467260420206192e-05, + "loss": 1.7749, + "step": 22889 + }, + { + "epoch": 7.025782688766114, + "grad_norm": 0.1969364732503891, + "learning_rate": 2.1463178786869892e-05, + "loss": 1.6975, + "step": 22890 + }, + { + "epoch": 7.026089625537139, + "grad_norm": 0.2126578837633133, + "learning_rate": 2.145909743555119e-05, + "loss": 1.6815, + "step": 22891 + }, + { + "epoch": 7.026396562308165, + "grad_norm": 0.20841559767723083, + "learning_rate": 2.1455016366290414e-05, + "loss": 1.727, + "step": 22892 + }, + { + "epoch": 7.02670349907919, + "grad_norm": 0.2523893713951111, + "learning_rate": 2.1450935579127896e-05, + "loss": 1.7213, + "step": 22893 + }, + { + "epoch": 7.027010435850215, + "grad_norm": 0.16219666600227356, + "learning_rate": 2.1446855074103968e-05, + "loss": 1.6406, + "step": 22894 + }, + { + "epoch": 7.02731737262124, + "grad_norm": 0.28709226846694946, + "learning_rate": 2.144277485125895e-05, + "loss": 1.7021, + "step": 22895 + }, + { + "epoch": 7.027624309392265, + "grad_norm": 0.23238243162631989, + "learning_rate": 2.1438694910633174e-05, + "loss": 1.7347, + "step": 22896 + }, + { + "epoch": 7.02793124616329, + "grad_norm": 0.2692428231239319, + "learning_rate": 2.1434615252266948e-05, + "loss": 1.7192, + "step": 22897 + }, + { + "epoch": 7.028238182934316, + "grad_norm": 0.21163232624530792, + "learning_rate": 2.1430535876200584e-05, + "loss": 1.7437, + "step": 22898 + }, + { + "epoch": 7.028545119705341, + "grad_norm": 0.23896420001983643, + "learning_rate": 2.1426456782474446e-05, + "loss": 1.6773, + "step": 22899 + }, + { + "epoch": 7.0288520564763655, + "grad_norm": 0.19021281599998474, + "learning_rate": 2.142237797112877e-05, + "loss": 1.7084, + "step": 22900 + }, + { + "epoch": 7.029158993247391, + 
"grad_norm": 0.23483091592788696, + "learning_rate": 2.1418299442203926e-05, + "loss": 1.7678, + "step": 22901 + }, + { + "epoch": 7.029465930018416, + "grad_norm": 0.20831161737442017, + "learning_rate": 2.1414221195740213e-05, + "loss": 1.7454, + "step": 22902 + }, + { + "epoch": 7.0297728667894415, + "grad_norm": 0.1961016207933426, + "learning_rate": 2.141014323177789e-05, + "loss": 1.7231, + "step": 22903 + }, + { + "epoch": 7.030079803560467, + "grad_norm": 0.1877545267343521, + "learning_rate": 2.1406065550357322e-05, + "loss": 1.6925, + "step": 22904 + }, + { + "epoch": 7.030386740331492, + "grad_norm": 0.20815789699554443, + "learning_rate": 2.1401988151518738e-05, + "loss": 1.7762, + "step": 22905 + }, + { + "epoch": 7.030693677102517, + "grad_norm": 0.1902543157339096, + "learning_rate": 2.1397911035302487e-05, + "loss": 1.7663, + "step": 22906 + }, + { + "epoch": 7.031000613873542, + "grad_norm": 0.20552431046962738, + "learning_rate": 2.1393834201748846e-05, + "loss": 1.7048, + "step": 22907 + }, + { + "epoch": 7.031307550644567, + "grad_norm": 0.2380477488040924, + "learning_rate": 2.13897576508981e-05, + "loss": 1.7685, + "step": 22908 + }, + { + "epoch": 7.031614487415593, + "grad_norm": 0.18351083993911743, + "learning_rate": 2.1385681382790536e-05, + "loss": 1.7058, + "step": 22909 + }, + { + "epoch": 7.031921424186618, + "grad_norm": 0.21992792189121246, + "learning_rate": 2.1381605397466442e-05, + "loss": 1.7608, + "step": 22910 + }, + { + "epoch": 7.032228360957642, + "grad_norm": 0.24412932991981506, + "learning_rate": 2.1377529694966097e-05, + "loss": 1.7205, + "step": 22911 + }, + { + "epoch": 7.032535297728668, + "grad_norm": 0.20398534834384918, + "learning_rate": 2.137345427532978e-05, + "loss": 1.7318, + "step": 22912 + }, + { + "epoch": 7.032842234499693, + "grad_norm": 0.2346884161233902, + "learning_rate": 2.136937913859776e-05, + "loss": 1.7159, + "step": 22913 + }, + { + "epoch": 7.033149171270718, + "grad_norm": 
0.19422392547130585, + "learning_rate": 2.1365304284810327e-05, + "loss": 1.7229, + "step": 22914 + }, + { + "epoch": 7.033456108041744, + "grad_norm": 0.24088126420974731, + "learning_rate": 2.1361229714007714e-05, + "loss": 1.77, + "step": 22915 + }, + { + "epoch": 7.033763044812768, + "grad_norm": 0.18886598944664001, + "learning_rate": 2.135715542623026e-05, + "loss": 1.7724, + "step": 22916 + }, + { + "epoch": 7.0340699815837935, + "grad_norm": 0.18816733360290527, + "learning_rate": 2.135308142151814e-05, + "loss": 1.7174, + "step": 22917 + }, + { + "epoch": 7.034376918354819, + "grad_norm": 0.184849813580513, + "learning_rate": 2.1349007699911694e-05, + "loss": 1.7026, + "step": 22918 + }, + { + "epoch": 7.034683855125844, + "grad_norm": 0.1638055443763733, + "learning_rate": 2.134493426145113e-05, + "loss": 1.683, + "step": 22919 + }, + { + "epoch": 7.0349907918968695, + "grad_norm": 0.18030275404453278, + "learning_rate": 2.1340861106176713e-05, + "loss": 1.6963, + "step": 22920 + }, + { + "epoch": 7.035297728667895, + "grad_norm": 0.221226304769516, + "learning_rate": 2.133678823412873e-05, + "loss": 1.7851, + "step": 22921 + }, + { + "epoch": 7.035604665438919, + "grad_norm": 0.18877451121807098, + "learning_rate": 2.1332715645347373e-05, + "loss": 1.7111, + "step": 22922 + }, + { + "epoch": 7.035911602209945, + "grad_norm": 0.17179232835769653, + "learning_rate": 2.1328643339872938e-05, + "loss": 1.6737, + "step": 22923 + }, + { + "epoch": 7.03621853898097, + "grad_norm": 0.17912441492080688, + "learning_rate": 2.1324571317745657e-05, + "loss": 1.7798, + "step": 22924 + }, + { + "epoch": 7.036525475751995, + "grad_norm": 0.2120780050754547, + "learning_rate": 2.132049957900577e-05, + "loss": 1.7353, + "step": 22925 + }, + { + "epoch": 7.036832412523021, + "grad_norm": 0.17286419868469238, + "learning_rate": 2.1316428123693517e-05, + "loss": 1.667, + "step": 22926 + }, + { + "epoch": 7.037139349294045, + "grad_norm": 0.1824301779270172, + 
"learning_rate": 2.1312356951849126e-05, + "loss": 1.6925, + "step": 22927 + }, + { + "epoch": 7.03744628606507, + "grad_norm": 0.16392327845096588, + "learning_rate": 2.1308286063512843e-05, + "loss": 1.7145, + "step": 22928 + }, + { + "epoch": 7.037753222836096, + "grad_norm": 0.18268297612667084, + "learning_rate": 2.1304215458724895e-05, + "loss": 1.7251, + "step": 22929 + }, + { + "epoch": 7.038060159607121, + "grad_norm": 0.19878868758678436, + "learning_rate": 2.1300145137525505e-05, + "loss": 1.7192, + "step": 22930 + }, + { + "epoch": 7.038367096378146, + "grad_norm": 0.18570293486118317, + "learning_rate": 2.1296075099954908e-05, + "loss": 1.718, + "step": 22931 + }, + { + "epoch": 7.038674033149171, + "grad_norm": 0.16497015953063965, + "learning_rate": 2.12920053460533e-05, + "loss": 1.6914, + "step": 22932 + }, + { + "epoch": 7.038980969920196, + "grad_norm": 0.20224586129188538, + "learning_rate": 2.128793587586096e-05, + "loss": 1.6941, + "step": 22933 + }, + { + "epoch": 7.0392879066912215, + "grad_norm": 0.22124920785427094, + "learning_rate": 2.1283866689418024e-05, + "loss": 1.7921, + "step": 22934 + }, + { + "epoch": 7.039594843462247, + "grad_norm": 0.20548123121261597, + "learning_rate": 2.127979778676479e-05, + "loss": 1.7488, + "step": 22935 + }, + { + "epoch": 7.039901780233272, + "grad_norm": 0.17604656517505646, + "learning_rate": 2.1275729167941405e-05, + "loss": 1.7145, + "step": 22936 + }, + { + "epoch": 7.0402087170042975, + "grad_norm": 0.17899781465530396, + "learning_rate": 2.127166083298809e-05, + "loss": 1.6703, + "step": 22937 + }, + { + "epoch": 7.040515653775322, + "grad_norm": 0.16101998090744019, + "learning_rate": 2.126759278194509e-05, + "loss": 1.715, + "step": 22938 + }, + { + "epoch": 7.040822590546347, + "grad_norm": 0.22807051241397858, + "learning_rate": 2.1263525014852542e-05, + "loss": 1.7409, + "step": 22939 + }, + { + "epoch": 7.041129527317373, + "grad_norm": 0.19442932307720184, + "learning_rate": 
2.125945753175072e-05, + "loss": 1.6953, + "step": 22940 + }, + { + "epoch": 7.041436464088398, + "grad_norm": 0.24816946685314178, + "learning_rate": 2.1255390332679755e-05, + "loss": 1.7527, + "step": 22941 + }, + { + "epoch": 7.041743400859423, + "grad_norm": 0.26748740673065186, + "learning_rate": 2.1251323417679882e-05, + "loss": 1.7703, + "step": 22942 + }, + { + "epoch": 7.042050337630448, + "grad_norm": 0.19965825974941254, + "learning_rate": 2.124725678679128e-05, + "loss": 1.7303, + "step": 22943 + }, + { + "epoch": 7.042357274401473, + "grad_norm": 0.2442217618227005, + "learning_rate": 2.124319044005414e-05, + "loss": 1.7183, + "step": 22944 + }, + { + "epoch": 7.042664211172498, + "grad_norm": 0.21421664953231812, + "learning_rate": 2.1239124377508646e-05, + "loss": 1.7348, + "step": 22945 + }, + { + "epoch": 7.042971147943524, + "grad_norm": 0.26072144508361816, + "learning_rate": 2.1235058599194984e-05, + "loss": 1.7396, + "step": 22946 + }, + { + "epoch": 7.043278084714549, + "grad_norm": 0.20694412291049957, + "learning_rate": 2.1230993105153335e-05, + "loss": 1.7871, + "step": 22947 + }, + { + "epoch": 7.043585021485574, + "grad_norm": 0.298551082611084, + "learning_rate": 2.122692789542387e-05, + "loss": 1.7051, + "step": 22948 + }, + { + "epoch": 7.043891958256599, + "grad_norm": 0.22547855973243713, + "learning_rate": 2.1222862970046752e-05, + "loss": 1.7392, + "step": 22949 + }, + { + "epoch": 7.044198895027624, + "grad_norm": 0.3150571882724762, + "learning_rate": 2.1218798329062205e-05, + "loss": 1.6705, + "step": 22950 + }, + { + "epoch": 7.0445058317986495, + "grad_norm": 0.2025378942489624, + "learning_rate": 2.1214733972510327e-05, + "loss": 1.7114, + "step": 22951 + }, + { + "epoch": 7.044812768569675, + "grad_norm": 0.29046711325645447, + "learning_rate": 2.1210669900431353e-05, + "loss": 1.7745, + "step": 22952 + }, + { + "epoch": 7.0451197053407, + "grad_norm": 0.23395368456840515, + "learning_rate": 2.1206606112865396e-05, + "loss": 
1.7829, + "step": 22953 + }, + { + "epoch": 7.045426642111725, + "grad_norm": 0.21395133435726166, + "learning_rate": 2.1202542609852616e-05, + "loss": 1.7211, + "step": 22954 + }, + { + "epoch": 7.04573357888275, + "grad_norm": 0.18077452480793, + "learning_rate": 2.1198479391433223e-05, + "loss": 1.7584, + "step": 22955 + }, + { + "epoch": 7.046040515653775, + "grad_norm": 0.17318682372570038, + "learning_rate": 2.1194416457647302e-05, + "loss": 1.7525, + "step": 22956 + }, + { + "epoch": 7.046347452424801, + "grad_norm": 0.18798092007637024, + "learning_rate": 2.119035380853508e-05, + "loss": 1.7525, + "step": 22957 + }, + { + "epoch": 7.046654389195826, + "grad_norm": 0.18679840862751007, + "learning_rate": 2.118629144413663e-05, + "loss": 1.7729, + "step": 22958 + }, + { + "epoch": 7.04696132596685, + "grad_norm": 0.17846907675266266, + "learning_rate": 2.1182229364492156e-05, + "loss": 1.7354, + "step": 22959 + }, + { + "epoch": 7.047268262737876, + "grad_norm": 0.22771520912647247, + "learning_rate": 2.1178167569641783e-05, + "loss": 1.7086, + "step": 22960 + }, + { + "epoch": 7.047575199508901, + "grad_norm": 0.1541738212108612, + "learning_rate": 2.1174106059625642e-05, + "loss": 1.67, + "step": 22961 + }, + { + "epoch": 7.047882136279926, + "grad_norm": 0.17698390781879425, + "learning_rate": 2.117004483448389e-05, + "loss": 1.68, + "step": 22962 + }, + { + "epoch": 7.048189073050952, + "grad_norm": 0.2220597118139267, + "learning_rate": 2.1165983894256647e-05, + "loss": 1.7783, + "step": 22963 + }, + { + "epoch": 7.048496009821977, + "grad_norm": 0.20971544086933136, + "learning_rate": 2.1161923238984055e-05, + "loss": 1.7318, + "step": 22964 + }, + { + "epoch": 7.0488029465930016, + "grad_norm": 0.2032100409269333, + "learning_rate": 2.1157862868706242e-05, + "loss": 1.6736, + "step": 22965 + }, + { + "epoch": 7.049109883364027, + "grad_norm": 0.19177256524562836, + "learning_rate": 2.115380278346331e-05, + "loss": 1.74, + "step": 22966 + }, + { + 
"epoch": 7.049416820135052, + "grad_norm": 0.1956746131181717, + "learning_rate": 2.1149742983295446e-05, + "loss": 1.7251, + "step": 22967 + }, + { + "epoch": 7.0497237569060776, + "grad_norm": 0.16200929880142212, + "learning_rate": 2.114568346824269e-05, + "loss": 1.6735, + "step": 22968 + }, + { + "epoch": 7.050030693677103, + "grad_norm": 0.19551095366477966, + "learning_rate": 2.1141624238345242e-05, + "loss": 1.7185, + "step": 22969 + }, + { + "epoch": 7.050337630448127, + "grad_norm": 0.17967839539051056, + "learning_rate": 2.1137565293643158e-05, + "loss": 1.7262, + "step": 22970 + }, + { + "epoch": 7.050644567219153, + "grad_norm": 0.15093082189559937, + "learning_rate": 2.1133506634176552e-05, + "loss": 1.6695, + "step": 22971 + }, + { + "epoch": 7.050951503990178, + "grad_norm": 0.20207351446151733, + "learning_rate": 2.1129448259985595e-05, + "loss": 1.7448, + "step": 22972 + }, + { + "epoch": 7.051258440761203, + "grad_norm": 0.20243801176548004, + "learning_rate": 2.112539017111031e-05, + "loss": 1.7496, + "step": 22973 + }, + { + "epoch": 7.051565377532229, + "grad_norm": 0.1967451572418213, + "learning_rate": 2.112133236759088e-05, + "loss": 1.718, + "step": 22974 + }, + { + "epoch": 7.051872314303253, + "grad_norm": 0.17668583989143372, + "learning_rate": 2.1117274849467334e-05, + "loss": 1.7295, + "step": 22975 + }, + { + "epoch": 7.0521792510742785, + "grad_norm": 0.17461778223514557, + "learning_rate": 2.1113217616779824e-05, + "loss": 1.7166, + "step": 22976 + }, + { + "epoch": 7.052486187845304, + "grad_norm": 0.18184112012386322, + "learning_rate": 2.110916066956843e-05, + "loss": 1.7092, + "step": 22977 + }, + { + "epoch": 7.052793124616329, + "grad_norm": 0.18001540005207062, + "learning_rate": 2.1105104007873246e-05, + "loss": 1.7129, + "step": 22978 + }, + { + "epoch": 7.0531000613873545, + "grad_norm": 0.15966519713401794, + "learning_rate": 2.1101047631734355e-05, + "loss": 1.7121, + "step": 22979 + }, + { + "epoch": 7.05340699815838, 
+ "grad_norm": 0.20201170444488525, + "learning_rate": 2.109699154119185e-05, + "loss": 1.7266, + "step": 22980 + }, + { + "epoch": 7.053713934929404, + "grad_norm": 0.19559438526630402, + "learning_rate": 2.1092935736285817e-05, + "loss": 1.7492, + "step": 22981 + }, + { + "epoch": 7.05402087170043, + "grad_norm": 0.17783302068710327, + "learning_rate": 2.108888021705634e-05, + "loss": 1.6901, + "step": 22982 + }, + { + "epoch": 7.054327808471455, + "grad_norm": 0.22052957117557526, + "learning_rate": 2.108482498354347e-05, + "loss": 1.6771, + "step": 22983 + }, + { + "epoch": 7.05463474524248, + "grad_norm": 0.1899181455373764, + "learning_rate": 2.1080770035787346e-05, + "loss": 1.7011, + "step": 22984 + }, + { + "epoch": 7.054941682013506, + "grad_norm": 0.19773316383361816, + "learning_rate": 2.1076715373827964e-05, + "loss": 1.7535, + "step": 22985 + }, + { + "epoch": 7.05524861878453, + "grad_norm": 0.2244229018688202, + "learning_rate": 2.1072660997705475e-05, + "loss": 1.7938, + "step": 22986 + }, + { + "epoch": 7.055555555555555, + "grad_norm": 0.18881015479564667, + "learning_rate": 2.106860690745988e-05, + "loss": 1.6753, + "step": 22987 + }, + { + "epoch": 7.055862492326581, + "grad_norm": 0.19642052054405212, + "learning_rate": 2.106455310313126e-05, + "loss": 1.735, + "step": 22988 + }, + { + "epoch": 7.056169429097606, + "grad_norm": 0.23549412190914154, + "learning_rate": 2.106049958475971e-05, + "loss": 1.7705, + "step": 22989 + }, + { + "epoch": 7.056476365868631, + "grad_norm": 0.21001911163330078, + "learning_rate": 2.1056446352385235e-05, + "loss": 1.6802, + "step": 22990 + }, + { + "epoch": 7.056783302639656, + "grad_norm": 0.1821003556251526, + "learning_rate": 2.1052393406047953e-05, + "loss": 1.7144, + "step": 22991 + }, + { + "epoch": 7.057090239410681, + "grad_norm": 0.1979309767484665, + "learning_rate": 2.104834074578786e-05, + "loss": 1.6983, + "step": 22992 + }, + { + "epoch": 7.0573971761817065, + "grad_norm": 0.18264134228229523, + 
"learning_rate": 2.1044288371645045e-05, + "loss": 1.7001, + "step": 22993 + }, + { + "epoch": 7.057704112952732, + "grad_norm": 0.17276059091091156, + "learning_rate": 2.104023628365954e-05, + "loss": 1.6976, + "step": 22994 + }, + { + "epoch": 7.058011049723757, + "grad_norm": 0.18879400193691254, + "learning_rate": 2.1036184481871402e-05, + "loss": 1.6954, + "step": 22995 + }, + { + "epoch": 7.0583179864947825, + "grad_norm": 0.1956210434436798, + "learning_rate": 2.103213296632066e-05, + "loss": 1.7329, + "step": 22996 + }, + { + "epoch": 7.058624923265807, + "grad_norm": 0.21108154952526093, + "learning_rate": 2.1028081737047356e-05, + "loss": 1.7299, + "step": 22997 + }, + { + "epoch": 7.058931860036832, + "grad_norm": 0.17981186509132385, + "learning_rate": 2.1024030794091537e-05, + "loss": 1.7162, + "step": 22998 + }, + { + "epoch": 7.059238796807858, + "grad_norm": 0.1699269711971283, + "learning_rate": 2.101998013749322e-05, + "loss": 1.6842, + "step": 22999 + }, + { + "epoch": 7.059545733578883, + "grad_norm": 0.17033198475837708, + "learning_rate": 2.1015929767292435e-05, + "loss": 1.6735, + "step": 23000 + }, + { + "epoch": 7.059852670349908, + "grad_norm": 0.18620076775550842, + "learning_rate": 2.101187968352925e-05, + "loss": 1.7328, + "step": 23001 + }, + { + "epoch": 7.060159607120933, + "grad_norm": 0.17528964579105377, + "learning_rate": 2.100782988624363e-05, + "loss": 1.6567, + "step": 23002 + }, + { + "epoch": 7.060466543891958, + "grad_norm": 0.1946999728679657, + "learning_rate": 2.100378037547566e-05, + "loss": 1.7349, + "step": 23003 + }, + { + "epoch": 7.060773480662983, + "grad_norm": 0.23345647752285004, + "learning_rate": 2.0999731151265312e-05, + "loss": 1.7185, + "step": 23004 + }, + { + "epoch": 7.061080417434009, + "grad_norm": 0.20169813930988312, + "learning_rate": 2.0995682213652603e-05, + "loss": 1.7223, + "step": 23005 + }, + { + "epoch": 7.061387354205034, + "grad_norm": 0.2397730052471161, + "learning_rate": 
2.0991633562677594e-05, + "loss": 1.7542, + "step": 23006 + }, + { + "epoch": 7.0616942909760585, + "grad_norm": 0.20421954989433289, + "learning_rate": 2.0987585198380227e-05, + "loss": 1.6888, + "step": 23007 + }, + { + "epoch": 7.062001227747084, + "grad_norm": 0.21555101871490479, + "learning_rate": 2.0983537120800584e-05, + "loss": 1.6796, + "step": 23008 + }, + { + "epoch": 7.062308164518109, + "grad_norm": 0.17311134934425354, + "learning_rate": 2.0979489329978603e-05, + "loss": 1.7199, + "step": 23009 + }, + { + "epoch": 7.0626151012891345, + "grad_norm": 0.25064393877983093, + "learning_rate": 2.0975441825954334e-05, + "loss": 1.6947, + "step": 23010 + }, + { + "epoch": 7.06292203806016, + "grad_norm": 0.19135847687721252, + "learning_rate": 2.0971394608767757e-05, + "loss": 1.702, + "step": 23011 + }, + { + "epoch": 7.063228974831185, + "grad_norm": 0.22994364798069, + "learning_rate": 2.0967347678458876e-05, + "loss": 1.6814, + "step": 23012 + }, + { + "epoch": 7.06353591160221, + "grad_norm": 0.21897611021995544, + "learning_rate": 2.0963301035067685e-05, + "loss": 1.7063, + "step": 23013 + }, + { + "epoch": 7.063842848373235, + "grad_norm": 0.23615150153636932, + "learning_rate": 2.0959254678634166e-05, + "loss": 1.7299, + "step": 23014 + }, + { + "epoch": 7.06414978514426, + "grad_norm": 0.1837770640850067, + "learning_rate": 2.0955208609198314e-05, + "loss": 1.7236, + "step": 23015 + }, + { + "epoch": 7.064456721915286, + "grad_norm": 0.16823385655879974, + "learning_rate": 2.0951162826800118e-05, + "loss": 1.6687, + "step": 23016 + }, + { + "epoch": 7.064763658686311, + "grad_norm": 0.17042338848114014, + "learning_rate": 2.094711733147954e-05, + "loss": 1.6907, + "step": 23017 + }, + { + "epoch": 7.065070595457335, + "grad_norm": 0.1753006875514984, + "learning_rate": 2.094307212327661e-05, + "loss": 1.7313, + "step": 23018 + }, + { + "epoch": 7.065377532228361, + "grad_norm": 0.19618375599384308, + "learning_rate": 2.093902720223123e-05, + "loss": 
1.7147, + "step": 23019 + }, + { + "epoch": 7.065684468999386, + "grad_norm": 0.20214296877384186, + "learning_rate": 2.093498256838346e-05, + "loss": 1.7056, + "step": 23020 + }, + { + "epoch": 7.065991405770411, + "grad_norm": 0.20230883359909058, + "learning_rate": 2.093093822177321e-05, + "loss": 1.6628, + "step": 23021 + }, + { + "epoch": 7.066298342541437, + "grad_norm": 0.19913128018379211, + "learning_rate": 2.0926894162440446e-05, + "loss": 1.7286, + "step": 23022 + }, + { + "epoch": 7.066605279312462, + "grad_norm": 0.19535091519355774, + "learning_rate": 2.0922850390425193e-05, + "loss": 1.745, + "step": 23023 + }, + { + "epoch": 7.0669122160834865, + "grad_norm": 0.19679825007915497, + "learning_rate": 2.0918806905767337e-05, + "loss": 1.694, + "step": 23024 + }, + { + "epoch": 7.067219152854512, + "grad_norm": 0.1821403056383133, + "learning_rate": 2.0914763708506913e-05, + "loss": 1.7163, + "step": 23025 + }, + { + "epoch": 7.067526089625537, + "grad_norm": 0.17138415575027466, + "learning_rate": 2.0910720798683803e-05, + "loss": 1.6946, + "step": 23026 + }, + { + "epoch": 7.0678330263965625, + "grad_norm": 0.20219111442565918, + "learning_rate": 2.0906678176338017e-05, + "loss": 1.7437, + "step": 23027 + }, + { + "epoch": 7.068139963167588, + "grad_norm": 0.1985882669687271, + "learning_rate": 2.0902635841509494e-05, + "loss": 1.6762, + "step": 23028 + }, + { + "epoch": 7.068446899938612, + "grad_norm": 0.18586322665214539, + "learning_rate": 2.0898593794238174e-05, + "loss": 1.7296, + "step": 23029 + }, + { + "epoch": 7.068753836709638, + "grad_norm": 0.19222751259803772, + "learning_rate": 2.0894552034564013e-05, + "loss": 1.7186, + "step": 23030 + }, + { + "epoch": 7.069060773480663, + "grad_norm": 0.16107569634914398, + "learning_rate": 2.0890510562526944e-05, + "loss": 1.6898, + "step": 23031 + }, + { + "epoch": 7.069367710251688, + "grad_norm": 0.23859064280986786, + "learning_rate": 2.088646937816691e-05, + "loss": 1.7992, + "step": 23032 + }, 
+ { + "epoch": 7.069674647022714, + "grad_norm": 0.22927051782608032, + "learning_rate": 2.0882428481523853e-05, + "loss": 1.7162, + "step": 23033 + }, + { + "epoch": 7.069981583793738, + "grad_norm": 0.18094350397586823, + "learning_rate": 2.0878387872637684e-05, + "loss": 1.7297, + "step": 23034 + }, + { + "epoch": 7.070288520564763, + "grad_norm": 0.20562811195850372, + "learning_rate": 2.087434755154839e-05, + "loss": 1.7475, + "step": 23035 + }, + { + "epoch": 7.070595457335789, + "grad_norm": 0.18405984342098236, + "learning_rate": 2.087030751829583e-05, + "loss": 1.6954, + "step": 23036 + }, + { + "epoch": 7.070902394106814, + "grad_norm": 0.26286160945892334, + "learning_rate": 2.0866267772919994e-05, + "loss": 1.7406, + "step": 23037 + }, + { + "epoch": 7.071209330877839, + "grad_norm": 0.1688467413187027, + "learning_rate": 2.086222831546077e-05, + "loss": 1.7375, + "step": 23038 + }, + { + "epoch": 7.071516267648865, + "grad_norm": 0.25445011258125305, + "learning_rate": 2.0858189145958057e-05, + "loss": 1.7479, + "step": 23039 + }, + { + "epoch": 7.071823204419889, + "grad_norm": 0.20637978613376617, + "learning_rate": 2.085415026445184e-05, + "loss": 1.7653, + "step": 23040 + }, + { + "epoch": 7.0721301411909145, + "grad_norm": 0.21693937480449677, + "learning_rate": 2.0850111670981952e-05, + "loss": 1.7392, + "step": 23041 + }, + { + "epoch": 7.07243707796194, + "grad_norm": 0.1999017745256424, + "learning_rate": 2.0846073365588388e-05, + "loss": 1.753, + "step": 23042 + }, + { + "epoch": 7.072744014732965, + "grad_norm": 0.2271260917186737, + "learning_rate": 2.0842035348310973e-05, + "loss": 1.7136, + "step": 23043 + }, + { + "epoch": 7.0730509515039905, + "grad_norm": 0.1915169358253479, + "learning_rate": 2.0837997619189675e-05, + "loss": 1.7142, + "step": 23044 + }, + { + "epoch": 7.073357888275015, + "grad_norm": 0.2250204086303711, + "learning_rate": 2.0833960178264377e-05, + "loss": 1.8039, + "step": 23045 + }, + { + "epoch": 7.07366482504604, 
+ "grad_norm": 0.20920081436634064, + "learning_rate": 2.0829923025574976e-05, + "loss": 1.767, + "step": 23046 + }, + { + "epoch": 7.073971761817066, + "grad_norm": 0.16039173305034637, + "learning_rate": 2.082588616116138e-05, + "loss": 1.6895, + "step": 23047 + }, + { + "epoch": 7.074278698588091, + "grad_norm": 0.1849806159734726, + "learning_rate": 2.082184958506347e-05, + "loss": 1.7323, + "step": 23048 + }, + { + "epoch": 7.074585635359116, + "grad_norm": 0.22370420396327972, + "learning_rate": 2.081781329732115e-05, + "loss": 1.7478, + "step": 23049 + }, + { + "epoch": 7.074892572130141, + "grad_norm": 0.1600474864244461, + "learning_rate": 2.0813777297974296e-05, + "loss": 1.6754, + "step": 23050 + }, + { + "epoch": 7.075199508901166, + "grad_norm": 0.18357187509536743, + "learning_rate": 2.080974158706281e-05, + "loss": 1.694, + "step": 23051 + }, + { + "epoch": 7.0755064456721914, + "grad_norm": 0.17667005956172943, + "learning_rate": 2.080570616462656e-05, + "loss": 1.7053, + "step": 23052 + }, + { + "epoch": 7.075813382443217, + "grad_norm": 0.19393591582775116, + "learning_rate": 2.0801671030705417e-05, + "loss": 1.7917, + "step": 23053 + }, + { + "epoch": 7.076120319214242, + "grad_norm": 0.19432564079761505, + "learning_rate": 2.0797636185339307e-05, + "loss": 1.7276, + "step": 23054 + }, + { + "epoch": 7.0764272559852675, + "grad_norm": 0.17960594594478607, + "learning_rate": 2.079360162856806e-05, + "loss": 1.6988, + "step": 23055 + }, + { + "epoch": 7.076734192756292, + "grad_norm": 0.183505579829216, + "learning_rate": 2.0789567360431538e-05, + "loss": 1.7106, + "step": 23056 + }, + { + "epoch": 7.077041129527317, + "grad_norm": 0.27859750390052795, + "learning_rate": 2.0785533380969673e-05, + "loss": 1.779, + "step": 23057 + }, + { + "epoch": 7.077348066298343, + "grad_norm": 0.1903255134820938, + "learning_rate": 2.078149969022225e-05, + "loss": 1.7334, + "step": 23058 + }, + { + "epoch": 7.077655003069368, + "grad_norm": 0.2221076786518097, + 
"learning_rate": 2.0777466288229207e-05, + "loss": 1.6863, + "step": 23059 + }, + { + "epoch": 7.077961939840393, + "grad_norm": 0.15516065061092377, + "learning_rate": 2.0773433175030336e-05, + "loss": 1.6633, + "step": 23060 + }, + { + "epoch": 7.078268876611418, + "grad_norm": 0.20073910057544708, + "learning_rate": 2.0769400350665553e-05, + "loss": 1.7057, + "step": 23061 + }, + { + "epoch": 7.078575813382443, + "grad_norm": 0.1680205762386322, + "learning_rate": 2.076536781517468e-05, + "loss": 1.6659, + "step": 23062 + }, + { + "epoch": 7.078882750153468, + "grad_norm": 0.20825456082820892, + "learning_rate": 2.0761335568597584e-05, + "loss": 1.751, + "step": 23063 + }, + { + "epoch": 7.079189686924494, + "grad_norm": 0.17365674674510956, + "learning_rate": 2.0757303610974098e-05, + "loss": 1.6591, + "step": 23064 + }, + { + "epoch": 7.079496623695519, + "grad_norm": 0.21712929010391235, + "learning_rate": 2.0753271942344087e-05, + "loss": 1.7357, + "step": 23065 + }, + { + "epoch": 7.0798035604665435, + "grad_norm": 0.1841089278459549, + "learning_rate": 2.074924056274738e-05, + "loss": 1.6818, + "step": 23066 + }, + { + "epoch": 7.080110497237569, + "grad_norm": 0.20433486998081207, + "learning_rate": 2.074520947222382e-05, + "loss": 1.76, + "step": 23067 + }, + { + "epoch": 7.080417434008594, + "grad_norm": 0.1712963879108429, + "learning_rate": 2.074117867081325e-05, + "loss": 1.6426, + "step": 23068 + }, + { + "epoch": 7.0807243707796195, + "grad_norm": 0.19894109666347504, + "learning_rate": 2.0737148158555504e-05, + "loss": 1.7529, + "step": 23069 + }, + { + "epoch": 7.081031307550645, + "grad_norm": 0.19338269531726837, + "learning_rate": 2.0733117935490386e-05, + "loss": 1.8274, + "step": 23070 + }, + { + "epoch": 7.08133824432167, + "grad_norm": 0.20883139967918396, + "learning_rate": 2.0729088001657794e-05, + "loss": 1.7275, + "step": 23071 + }, + { + "epoch": 7.081645181092695, + "grad_norm": 0.18498694896697998, + "learning_rate": 
2.0725058357097487e-05, + "loss": 1.6648, + "step": 23072 + }, + { + "epoch": 7.08195211786372, + "grad_norm": 0.1727421134710312, + "learning_rate": 2.0721029001849313e-05, + "loss": 1.7709, + "step": 23073 + }, + { + "epoch": 7.082259054634745, + "grad_norm": 0.16965949535369873, + "learning_rate": 2.0716999935953096e-05, + "loss": 1.6876, + "step": 23074 + }, + { + "epoch": 7.082565991405771, + "grad_norm": 0.16905519366264343, + "learning_rate": 2.0712971159448623e-05, + "loss": 1.6576, + "step": 23075 + }, + { + "epoch": 7.082872928176796, + "grad_norm": 0.2863580882549286, + "learning_rate": 2.0708942672375776e-05, + "loss": 1.7631, + "step": 23076 + }, + { + "epoch": 7.08317986494782, + "grad_norm": 0.26248931884765625, + "learning_rate": 2.070491447477429e-05, + "loss": 1.7692, + "step": 23077 + }, + { + "epoch": 7.083486801718846, + "grad_norm": 0.17670878767967224, + "learning_rate": 2.0700886566684024e-05, + "loss": 1.6725, + "step": 23078 + }, + { + "epoch": 7.083793738489871, + "grad_norm": 0.19245800375938416, + "learning_rate": 2.0696858948144775e-05, + "loss": 1.7249, + "step": 23079 + }, + { + "epoch": 7.084100675260896, + "grad_norm": 0.18651939928531647, + "learning_rate": 2.0692831619196335e-05, + "loss": 1.7616, + "step": 23080 + }, + { + "epoch": 7.084407612031922, + "grad_norm": 0.21432510018348694, + "learning_rate": 2.0688804579878514e-05, + "loss": 1.743, + "step": 23081 + }, + { + "epoch": 7.084714548802946, + "grad_norm": 0.18530069291591644, + "learning_rate": 2.0684777830231106e-05, + "loss": 1.7257, + "step": 23082 + }, + { + "epoch": 7.0850214855739715, + "grad_norm": 0.1974172443151474, + "learning_rate": 2.0680751370293903e-05, + "loss": 1.6918, + "step": 23083 + }, + { + "epoch": 7.085328422344997, + "grad_norm": 0.19517268240451813, + "learning_rate": 2.0676725200106706e-05, + "loss": 1.7421, + "step": 23084 + }, + { + "epoch": 7.085635359116022, + "grad_norm": 0.28572699427604675, + "learning_rate": 2.067269931970929e-05, + 
"loss": 1.7575, + "step": 23085 + }, + { + "epoch": 7.0859422958870475, + "grad_norm": 0.2062397003173828, + "learning_rate": 2.0668673729141452e-05, + "loss": 1.7085, + "step": 23086 + }, + { + "epoch": 7.086249232658073, + "grad_norm": 0.21619725227355957, + "learning_rate": 2.0664648428442973e-05, + "loss": 1.7783, + "step": 23087 + }, + { + "epoch": 7.086556169429097, + "grad_norm": 0.2732481360435486, + "learning_rate": 2.066062341765363e-05, + "loss": 1.7089, + "step": 23088 + }, + { + "epoch": 7.086863106200123, + "grad_norm": 0.19897356629371643, + "learning_rate": 2.06565986968132e-05, + "loss": 1.6487, + "step": 23089 + }, + { + "epoch": 7.087170042971148, + "grad_norm": 0.2578796148300171, + "learning_rate": 2.0652574265961466e-05, + "loss": 1.7385, + "step": 23090 + }, + { + "epoch": 7.087476979742173, + "grad_norm": 0.18980316817760468, + "learning_rate": 2.0648550125138195e-05, + "loss": 1.6651, + "step": 23091 + }, + { + "epoch": 7.087783916513199, + "grad_norm": 0.279580682516098, + "learning_rate": 2.064452627438313e-05, + "loss": 1.7189, + "step": 23092 + }, + { + "epoch": 7.088090853284223, + "grad_norm": 0.18652775883674622, + "learning_rate": 2.0640502713736103e-05, + "loss": 1.7085, + "step": 23093 + }, + { + "epoch": 7.088397790055248, + "grad_norm": 0.2729358673095703, + "learning_rate": 2.06364794432368e-05, + "loss": 1.6812, + "step": 23094 + }, + { + "epoch": 7.088704726826274, + "grad_norm": 0.1756472885608673, + "learning_rate": 2.0632456462925053e-05, + "loss": 1.6835, + "step": 23095 + }, + { + "epoch": 7.089011663597299, + "grad_norm": 0.2352994978427887, + "learning_rate": 2.062843377284055e-05, + "loss": 1.6898, + "step": 23096 + }, + { + "epoch": 7.089318600368324, + "grad_norm": 0.20231495797634125, + "learning_rate": 2.0624411373023093e-05, + "loss": 1.7294, + "step": 23097 + }, + { + "epoch": 7.08962553713935, + "grad_norm": 0.276114821434021, + "learning_rate": 2.0620389263512424e-05, + "loss": 1.6864, + "step": 23098 + }, + { 
+ "epoch": 7.089932473910374, + "grad_norm": 0.2178632766008377, + "learning_rate": 2.0616367444348288e-05, + "loss": 1.7353, + "step": 23099 + }, + { + "epoch": 7.0902394106813995, + "grad_norm": 0.20966552197933197, + "learning_rate": 2.061234591557043e-05, + "loss": 1.6579, + "step": 23100 + }, + { + "epoch": 7.090546347452425, + "grad_norm": 0.16496559977531433, + "learning_rate": 2.0608324677218592e-05, + "loss": 1.7137, + "step": 23101 + }, + { + "epoch": 7.09085328422345, + "grad_norm": 0.19176827371120453, + "learning_rate": 2.0604303729332525e-05, + "loss": 1.6996, + "step": 23102 + }, + { + "epoch": 7.0911602209944755, + "grad_norm": 0.20933480560779572, + "learning_rate": 2.060028307195195e-05, + "loss": 1.7887, + "step": 23103 + }, + { + "epoch": 7.0914671577655, + "grad_norm": 0.1925809681415558, + "learning_rate": 2.0596262705116613e-05, + "loss": 1.6974, + "step": 23104 + }, + { + "epoch": 7.091774094536525, + "grad_norm": 0.1582585573196411, + "learning_rate": 2.0592242628866236e-05, + "loss": 1.6731, + "step": 23105 + }, + { + "epoch": 7.092081031307551, + "grad_norm": 0.20380592346191406, + "learning_rate": 2.058822284324056e-05, + "loss": 1.6911, + "step": 23106 + }, + { + "epoch": 7.092387968078576, + "grad_norm": 0.17984862625598907, + "learning_rate": 2.0584203348279307e-05, + "loss": 1.7218, + "step": 23107 + }, + { + "epoch": 7.092694904849601, + "grad_norm": 0.22097790241241455, + "learning_rate": 2.058018414402219e-05, + "loss": 1.7223, + "step": 23108 + }, + { + "epoch": 7.093001841620626, + "grad_norm": 0.20519912242889404, + "learning_rate": 2.0576165230508926e-05, + "loss": 1.7197, + "step": 23109 + }, + { + "epoch": 7.093308778391651, + "grad_norm": 0.2156807780265808, + "learning_rate": 2.0572146607779274e-05, + "loss": 1.7079, + "step": 23110 + }, + { + "epoch": 7.093615715162676, + "grad_norm": 0.21810726821422577, + "learning_rate": 2.056812827587288e-05, + "loss": 1.7456, + "step": 23111 + }, + { + "epoch": 7.093922651933702, + 
"grad_norm": 0.2288726568222046, + "learning_rate": 2.0564110234829536e-05, + "loss": 1.8113, + "step": 23112 + }, + { + "epoch": 7.094229588704727, + "grad_norm": 0.21279199421405792, + "learning_rate": 2.056009248468887e-05, + "loss": 1.7554, + "step": 23113 + }, + { + "epoch": 7.094536525475752, + "grad_norm": 0.18577606976032257, + "learning_rate": 2.055607502549064e-05, + "loss": 1.661, + "step": 23114 + }, + { + "epoch": 7.094843462246777, + "grad_norm": 0.17938728630542755, + "learning_rate": 2.0552057857274536e-05, + "loss": 1.6998, + "step": 23115 + }, + { + "epoch": 7.095150399017802, + "grad_norm": 0.1946432888507843, + "learning_rate": 2.0548040980080258e-05, + "loss": 1.7146, + "step": 23116 + }, + { + "epoch": 7.0954573357888275, + "grad_norm": 0.21220463514328003, + "learning_rate": 2.0544024393947496e-05, + "loss": 1.7345, + "step": 23117 + }, + { + "epoch": 7.095764272559853, + "grad_norm": 0.2006370723247528, + "learning_rate": 2.0540008098915954e-05, + "loss": 1.7636, + "step": 23118 + }, + { + "epoch": 7.096071209330878, + "grad_norm": 0.17251192033290863, + "learning_rate": 2.0535992095025312e-05, + "loss": 1.7103, + "step": 23119 + }, + { + "epoch": 7.096378146101903, + "grad_norm": 0.2393570840358734, + "learning_rate": 2.0531976382315277e-05, + "loss": 1.7636, + "step": 23120 + }, + { + "epoch": 7.096685082872928, + "grad_norm": 0.16999265551567078, + "learning_rate": 2.0527960960825516e-05, + "loss": 1.6571, + "step": 23121 + }, + { + "epoch": 7.096992019643953, + "grad_norm": 0.17626826465129852, + "learning_rate": 2.052394583059572e-05, + "loss": 1.713, + "step": 23122 + }, + { + "epoch": 7.097298956414979, + "grad_norm": 0.18373346328735352, + "learning_rate": 2.051993099166557e-05, + "loss": 1.7102, + "step": 23123 + }, + { + "epoch": 7.097605893186004, + "grad_norm": 0.1913219541311264, + "learning_rate": 2.0515916444074734e-05, + "loss": 1.7441, + "step": 23124 + }, + { + "epoch": 7.097912829957028, + "grad_norm": 0.19664399325847626, 
+ "learning_rate": 2.0511902187862903e-05, + "loss": 1.6866, + "step": 23125 + }, + { + "epoch": 7.098219766728054, + "grad_norm": 0.16524936258792877, + "learning_rate": 2.050788822306971e-05, + "loss": 1.6709, + "step": 23126 + }, + { + "epoch": 7.098526703499079, + "grad_norm": 0.19291190803050995, + "learning_rate": 2.050387454973489e-05, + "loss": 1.7033, + "step": 23127 + }, + { + "epoch": 7.098833640270104, + "grad_norm": 0.19915525615215302, + "learning_rate": 2.0499861167898037e-05, + "loss": 1.7425, + "step": 23128 + }, + { + "epoch": 7.09914057704113, + "grad_norm": 0.21295227110385895, + "learning_rate": 2.0495848077598883e-05, + "loss": 1.7516, + "step": 23129 + }, + { + "epoch": 7.099447513812155, + "grad_norm": 0.21469831466674805, + "learning_rate": 2.0491835278877014e-05, + "loss": 1.7129, + "step": 23130 + }, + { + "epoch": 7.0997544505831796, + "grad_norm": 0.16860374808311462, + "learning_rate": 2.0487822771772143e-05, + "loss": 1.7172, + "step": 23131 + }, + { + "epoch": 7.100061387354205, + "grad_norm": 0.22386015951633453, + "learning_rate": 2.04838105563239e-05, + "loss": 1.7829, + "step": 23132 + }, + { + "epoch": 7.10036832412523, + "grad_norm": 0.22635474801063538, + "learning_rate": 2.047979863257195e-05, + "loss": 1.6956, + "step": 23133 + }, + { + "epoch": 7.100675260896256, + "grad_norm": 0.20508790016174316, + "learning_rate": 2.0475787000555924e-05, + "loss": 1.7404, + "step": 23134 + }, + { + "epoch": 7.100982197667281, + "grad_norm": 0.2055993378162384, + "learning_rate": 2.047177566031548e-05, + "loss": 1.7064, + "step": 23135 + }, + { + "epoch": 7.101289134438305, + "grad_norm": 0.19258326292037964, + "learning_rate": 2.0467764611890254e-05, + "loss": 1.7078, + "step": 23136 + }, + { + "epoch": 7.101596071209331, + "grad_norm": 0.20766718685626984, + "learning_rate": 2.046375385531989e-05, + "loss": 1.6854, + "step": 23137 + }, + { + "epoch": 7.101903007980356, + "grad_norm": 0.17945602536201477, + "learning_rate": 
2.045974339064402e-05, + "loss": 1.6986, + "step": 23138 + }, + { + "epoch": 7.102209944751381, + "grad_norm": 0.17283397912979126, + "learning_rate": 2.045573321790228e-05, + "loss": 1.7296, + "step": 23139 + }, + { + "epoch": 7.102516881522407, + "grad_norm": 0.19000805914402008, + "learning_rate": 2.0451723337134298e-05, + "loss": 1.7005, + "step": 23140 + }, + { + "epoch": 7.102823818293431, + "grad_norm": 0.1966131180524826, + "learning_rate": 2.044771374837971e-05, + "loss": 1.7574, + "step": 23141 + }, + { + "epoch": 7.1031307550644565, + "grad_norm": 0.2411719709634781, + "learning_rate": 2.0443704451678137e-05, + "loss": 1.7599, + "step": 23142 + }, + { + "epoch": 7.103437691835482, + "grad_norm": 0.23902751505374908, + "learning_rate": 2.0439695447069173e-05, + "loss": 1.6805, + "step": 23143 + }, + { + "epoch": 7.103744628606507, + "grad_norm": 0.19117529690265656, + "learning_rate": 2.0435686734592508e-05, + "loss": 1.7482, + "step": 23144 + }, + { + "epoch": 7.1040515653775325, + "grad_norm": 0.18491674959659576, + "learning_rate": 2.0431678314287678e-05, + "loss": 1.6764, + "step": 23145 + }, + { + "epoch": 7.104358502148558, + "grad_norm": 0.21000699698925018, + "learning_rate": 2.042767018619437e-05, + "loss": 1.7185, + "step": 23146 + }, + { + "epoch": 7.104665438919582, + "grad_norm": 0.17373491823673248, + "learning_rate": 2.0423662350352117e-05, + "loss": 1.6945, + "step": 23147 + }, + { + "epoch": 7.104972375690608, + "grad_norm": 0.18387937545776367, + "learning_rate": 2.041965480680059e-05, + "loss": 1.766, + "step": 23148 + }, + { + "epoch": 7.105279312461633, + "grad_norm": 0.15976013243198395, + "learning_rate": 2.0415647555579376e-05, + "loss": 1.6446, + "step": 23149 + }, + { + "epoch": 7.105586249232658, + "grad_norm": 0.19251346588134766, + "learning_rate": 2.0411640596728066e-05, + "loss": 1.7122, + "step": 23150 + }, + { + "epoch": 7.105893186003684, + "grad_norm": 0.1640147864818573, + "learning_rate": 2.040763393028627e-05, + 
"loss": 1.7057, + "step": 23151 + }, + { + "epoch": 7.106200122774708, + "grad_norm": 0.20366166532039642, + "learning_rate": 2.0403627556293577e-05, + "loss": 1.7173, + "step": 23152 + }, + { + "epoch": 7.106507059545733, + "grad_norm": 0.18549348413944244, + "learning_rate": 2.039962147478958e-05, + "loss": 1.7215, + "step": 23153 + }, + { + "epoch": 7.106813996316759, + "grad_norm": 0.16964925825595856, + "learning_rate": 2.039561568581388e-05, + "loss": 1.6931, + "step": 23154 + }, + { + "epoch": 7.107120933087784, + "grad_norm": 0.16923274099826813, + "learning_rate": 2.0391610189406058e-05, + "loss": 1.6976, + "step": 23155 + }, + { + "epoch": 7.107427869858809, + "grad_norm": 0.17707234621047974, + "learning_rate": 2.038760498560569e-05, + "loss": 1.7102, + "step": 23156 + }, + { + "epoch": 7.107734806629834, + "grad_norm": 0.2048260122537613, + "learning_rate": 2.0383600074452376e-05, + "loss": 1.7116, + "step": 23157 + }, + { + "epoch": 7.108041743400859, + "grad_norm": 0.17328095436096191, + "learning_rate": 2.037959545598568e-05, + "loss": 1.6683, + "step": 23158 + }, + { + "epoch": 7.1083486801718845, + "grad_norm": 0.15829013288021088, + "learning_rate": 2.037559113024518e-05, + "loss": 1.6617, + "step": 23159 + }, + { + "epoch": 7.10865561694291, + "grad_norm": 0.21150968968868256, + "learning_rate": 2.037158709727044e-05, + "loss": 1.7057, + "step": 23160 + }, + { + "epoch": 7.108962553713935, + "grad_norm": 0.20321892201900482, + "learning_rate": 2.0367583357101072e-05, + "loss": 1.6811, + "step": 23161 + }, + { + "epoch": 7.1092694904849605, + "grad_norm": 0.19491781294345856, + "learning_rate": 2.0363579909776583e-05, + "loss": 1.6794, + "step": 23162 + }, + { + "epoch": 7.109576427255985, + "grad_norm": 0.155877947807312, + "learning_rate": 2.0359576755336594e-05, + "loss": 1.7434, + "step": 23163 + }, + { + "epoch": 7.10988336402701, + "grad_norm": 0.17822639644145966, + "learning_rate": 2.0355573893820613e-05, + "loss": 1.7029, + "step": 23164 
+ }, + { + "epoch": 7.110190300798036, + "grad_norm": 0.18152910470962524, + "learning_rate": 2.0351571325268242e-05, + "loss": 1.7277, + "step": 23165 + }, + { + "epoch": 7.110497237569061, + "grad_norm": 0.19928498566150665, + "learning_rate": 2.034756904971902e-05, + "loss": 1.7852, + "step": 23166 + }, + { + "epoch": 7.110804174340086, + "grad_norm": 0.19099318981170654, + "learning_rate": 2.0343567067212504e-05, + "loss": 1.7258, + "step": 23167 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.19800841808319092, + "learning_rate": 2.033956537778824e-05, + "loss": 1.7647, + "step": 23168 + }, + { + "epoch": 7.111418047882136, + "grad_norm": 0.20110327005386353, + "learning_rate": 2.0335563981485768e-05, + "loss": 1.7111, + "step": 23169 + }, + { + "epoch": 7.111724984653161, + "grad_norm": 0.1875200718641281, + "learning_rate": 2.0331562878344645e-05, + "loss": 1.7145, + "step": 23170 + }, + { + "epoch": 7.112031921424187, + "grad_norm": 0.17586658895015717, + "learning_rate": 2.032756206840441e-05, + "loss": 1.663, + "step": 23171 + }, + { + "epoch": 7.112338858195212, + "grad_norm": 0.1783432811498642, + "learning_rate": 2.032356155170459e-05, + "loss": 1.7146, + "step": 23172 + }, + { + "epoch": 7.112645794966237, + "grad_norm": 0.16075368225574493, + "learning_rate": 2.0319561328284737e-05, + "loss": 1.6414, + "step": 23173 + }, + { + "epoch": 7.112952731737262, + "grad_norm": 0.22822627425193787, + "learning_rate": 2.0315561398184367e-05, + "loss": 1.7363, + "step": 23174 + }, + { + "epoch": 7.113259668508287, + "grad_norm": 0.1882331818342209, + "learning_rate": 2.0311561761443026e-05, + "loss": 1.7384, + "step": 23175 + }, + { + "epoch": 7.1135666052793125, + "grad_norm": 0.21478623151779175, + "learning_rate": 2.0307562418100228e-05, + "loss": 1.7314, + "step": 23176 + }, + { + "epoch": 7.113873542050338, + "grad_norm": 0.18545235693454742, + "learning_rate": 2.0303563368195483e-05, + "loss": 1.7046, + "step": 23177 + }, + { + "epoch": 
7.114180478821363, + "grad_norm": 0.1965286284685135, + "learning_rate": 2.0299564611768367e-05, + "loss": 1.7423, + "step": 23178 + }, + { + "epoch": 7.114487415592388, + "grad_norm": 0.1679733693599701, + "learning_rate": 2.0295566148858332e-05, + "loss": 1.6861, + "step": 23179 + }, + { + "epoch": 7.114794352363413, + "grad_norm": 0.18930186331272125, + "learning_rate": 2.029156797950495e-05, + "loss": 1.6609, + "step": 23180 + }, + { + "epoch": 7.115101289134438, + "grad_norm": 0.20774266123771667, + "learning_rate": 2.0287570103747672e-05, + "loss": 1.6919, + "step": 23181 + }, + { + "epoch": 7.115408225905464, + "grad_norm": 0.1866706907749176, + "learning_rate": 2.028357252162606e-05, + "loss": 1.7385, + "step": 23182 + }, + { + "epoch": 7.115715162676489, + "grad_norm": 0.21728016436100006, + "learning_rate": 2.0279575233179605e-05, + "loss": 1.7574, + "step": 23183 + }, + { + "epoch": 7.116022099447513, + "grad_norm": 0.16665934026241302, + "learning_rate": 2.02755782384478e-05, + "loss": 1.7046, + "step": 23184 + }, + { + "epoch": 7.116329036218539, + "grad_norm": 0.17275744676589966, + "learning_rate": 2.027158153747016e-05, + "loss": 1.6914, + "step": 23185 + }, + { + "epoch": 7.116635972989564, + "grad_norm": 0.15803802013397217, + "learning_rate": 2.026758513028617e-05, + "loss": 1.6932, + "step": 23186 + }, + { + "epoch": 7.116942909760589, + "grad_norm": 0.17434535920619965, + "learning_rate": 2.0263589016935336e-05, + "loss": 1.6714, + "step": 23187 + }, + { + "epoch": 7.117249846531615, + "grad_norm": 0.18005578219890594, + "learning_rate": 2.025959319745714e-05, + "loss": 1.6728, + "step": 23188 + }, + { + "epoch": 7.11755678330264, + "grad_norm": 0.19545695185661316, + "learning_rate": 2.025559767189108e-05, + "loss": 1.7475, + "step": 23189 + }, + { + "epoch": 7.1178637200736645, + "grad_norm": 0.19226810336112976, + "learning_rate": 2.025160244027663e-05, + "loss": 1.7447, + "step": 23190 + }, + { + "epoch": 7.11817065684469, + "grad_norm": 
0.1682211458683014, + "learning_rate": 2.0247607502653286e-05, + "loss": 1.687, + "step": 23191 + }, + { + "epoch": 7.118477593615715, + "grad_norm": 0.1883849948644638, + "learning_rate": 2.0243612859060524e-05, + "loss": 1.7556, + "step": 23192 + }, + { + "epoch": 7.1187845303867405, + "grad_norm": 0.16668641567230225, + "learning_rate": 2.0239618509537817e-05, + "loss": 1.6683, + "step": 23193 + }, + { + "epoch": 7.119091467157766, + "grad_norm": 0.21448664367198944, + "learning_rate": 2.023562445412463e-05, + "loss": 1.709, + "step": 23194 + }, + { + "epoch": 7.11939840392879, + "grad_norm": 0.24347564578056335, + "learning_rate": 2.0231630692860476e-05, + "loss": 1.7775, + "step": 23195 + }, + { + "epoch": 7.119705340699816, + "grad_norm": 0.20289309322834015, + "learning_rate": 2.0227637225784767e-05, + "loss": 1.8258, + "step": 23196 + }, + { + "epoch": 7.120012277470841, + "grad_norm": 0.20075447857379913, + "learning_rate": 2.022364405293703e-05, + "loss": 1.686, + "step": 23197 + }, + { + "epoch": 7.120319214241866, + "grad_norm": 0.17129302024841309, + "learning_rate": 2.021965117435666e-05, + "loss": 1.6937, + "step": 23198 + }, + { + "epoch": 7.120626151012892, + "grad_norm": 0.222218856215477, + "learning_rate": 2.0215658590083164e-05, + "loss": 1.6812, + "step": 23199 + }, + { + "epoch": 7.120933087783916, + "grad_norm": 0.1955309957265854, + "learning_rate": 2.0211666300155996e-05, + "loss": 1.7652, + "step": 23200 + }, + { + "epoch": 7.121240024554941, + "grad_norm": 0.20479047298431396, + "learning_rate": 2.0207674304614595e-05, + "loss": 1.7393, + "step": 23201 + }, + { + "epoch": 7.121546961325967, + "grad_norm": 0.14726878702640533, + "learning_rate": 2.020368260349842e-05, + "loss": 1.6766, + "step": 23202 + }, + { + "epoch": 7.121853898096992, + "grad_norm": 0.19149260222911835, + "learning_rate": 2.0199691196846914e-05, + "loss": 1.7176, + "step": 23203 + }, + { + "epoch": 7.122160834868017, + "grad_norm": 0.17182055115699768, + 
"learning_rate": 2.019570008469953e-05, + "loss": 1.6828, + "step": 23204 + }, + { + "epoch": 7.122467771639043, + "grad_norm": 0.16044408082962036, + "learning_rate": 2.019170926709571e-05, + "loss": 1.6595, + "step": 23205 + }, + { + "epoch": 7.122774708410067, + "grad_norm": 0.21787980198860168, + "learning_rate": 2.0187718744074885e-05, + "loss": 1.7114, + "step": 23206 + }, + { + "epoch": 7.1230816451810925, + "grad_norm": 0.16959737241268158, + "learning_rate": 2.01837285156765e-05, + "loss": 1.7128, + "step": 23207 + }, + { + "epoch": 7.123388581952118, + "grad_norm": 0.28120318055152893, + "learning_rate": 2.0179738581939983e-05, + "loss": 1.8386, + "step": 23208 + }, + { + "epoch": 7.123695518723143, + "grad_norm": 0.19752691686153412, + "learning_rate": 2.017574894290477e-05, + "loss": 1.7123, + "step": 23209 + }, + { + "epoch": 7.1240024554941686, + "grad_norm": 0.19860398769378662, + "learning_rate": 2.0171759598610286e-05, + "loss": 1.7041, + "step": 23210 + }, + { + "epoch": 7.124309392265193, + "grad_norm": 0.17429523169994354, + "learning_rate": 2.0167770549095937e-05, + "loss": 1.6963, + "step": 23211 + }, + { + "epoch": 7.124616329036218, + "grad_norm": 0.27635815739631653, + "learning_rate": 2.01637817944012e-05, + "loss": 1.8261, + "step": 23212 + }, + { + "epoch": 7.124923265807244, + "grad_norm": 0.17512556910514832, + "learning_rate": 2.0159793334565424e-05, + "loss": 1.7311, + "step": 23213 + }, + { + "epoch": 7.125230202578269, + "grad_norm": 0.1964988112449646, + "learning_rate": 2.01558051696281e-05, + "loss": 1.6829, + "step": 23214 + }, + { + "epoch": 7.125537139349294, + "grad_norm": 0.20796819031238556, + "learning_rate": 2.0151817299628563e-05, + "loss": 1.7084, + "step": 23215 + }, + { + "epoch": 7.12584407612032, + "grad_norm": 0.19875051081180573, + "learning_rate": 2.0147829724606278e-05, + "loss": 1.7197, + "step": 23216 + }, + { + "epoch": 7.126151012891344, + "grad_norm": 0.22590650618076324, + "learning_rate": 
2.0143842444600635e-05, + "loss": 1.7923, + "step": 23217 + }, + { + "epoch": 7.1264579496623695, + "grad_norm": 0.19106422364711761, + "learning_rate": 2.0139855459651042e-05, + "loss": 1.7096, + "step": 23218 + }, + { + "epoch": 7.126764886433395, + "grad_norm": 0.2105991542339325, + "learning_rate": 2.01358687697969e-05, + "loss": 1.6836, + "step": 23219 + }, + { + "epoch": 7.12707182320442, + "grad_norm": 0.18826960027217865, + "learning_rate": 2.013188237507761e-05, + "loss": 1.7347, + "step": 23220 + }, + { + "epoch": 7.1273787599754455, + "grad_norm": 0.1865578591823578, + "learning_rate": 2.012789627553256e-05, + "loss": 1.7115, + "step": 23221 + }, + { + "epoch": 7.12768569674647, + "grad_norm": 0.18389549851417542, + "learning_rate": 2.0123910471201145e-05, + "loss": 1.6817, + "step": 23222 + }, + { + "epoch": 7.127992633517495, + "grad_norm": 0.18351595103740692, + "learning_rate": 2.0119924962122766e-05, + "loss": 1.6898, + "step": 23223 + }, + { + "epoch": 7.128299570288521, + "grad_norm": 0.1913219839334488, + "learning_rate": 2.01159397483368e-05, + "loss": 1.7536, + "step": 23224 + }, + { + "epoch": 7.128606507059546, + "grad_norm": 0.17707225680351257, + "learning_rate": 2.0111954829882628e-05, + "loss": 1.6894, + "step": 23225 + }, + { + "epoch": 7.128913443830571, + "grad_norm": 0.17774651944637299, + "learning_rate": 2.0107970206799637e-05, + "loss": 1.6599, + "step": 23226 + }, + { + "epoch": 7.129220380601596, + "grad_norm": 0.14530350267887115, + "learning_rate": 2.0103985879127207e-05, + "loss": 1.6264, + "step": 23227 + }, + { + "epoch": 7.129527317372621, + "grad_norm": 0.15673531591892242, + "learning_rate": 2.010000184690471e-05, + "loss": 1.6577, + "step": 23228 + }, + { + "epoch": 7.129834254143646, + "grad_norm": 0.20691752433776855, + "learning_rate": 2.009601811017152e-05, + "loss": 1.7129, + "step": 23229 + }, + { + "epoch": 7.130141190914672, + "grad_norm": 0.16686022281646729, + "learning_rate": 2.0092034668966987e-05, + "loss": 
1.6738, + "step": 23230 + }, + { + "epoch": 7.130448127685697, + "grad_norm": 0.17799030244350433, + "learning_rate": 2.0088051523330536e-05, + "loss": 1.7312, + "step": 23231 + }, + { + "epoch": 7.1307550644567215, + "grad_norm": 0.16749511659145355, + "learning_rate": 2.0084068673301454e-05, + "loss": 1.6616, + "step": 23232 + }, + { + "epoch": 7.131062001227747, + "grad_norm": 0.18347670137882233, + "learning_rate": 2.0080086118919156e-05, + "loss": 1.6622, + "step": 23233 + }, + { + "epoch": 7.131368937998772, + "grad_norm": 0.19747060537338257, + "learning_rate": 2.007610386022299e-05, + "loss": 1.7341, + "step": 23234 + }, + { + "epoch": 7.1316758747697975, + "grad_norm": 0.21067634224891663, + "learning_rate": 2.0072121897252295e-05, + "loss": 1.7252, + "step": 23235 + }, + { + "epoch": 7.131982811540823, + "grad_norm": 0.2095600962638855, + "learning_rate": 2.006814023004644e-05, + "loss": 1.7769, + "step": 23236 + }, + { + "epoch": 7.132289748311848, + "grad_norm": 0.23090791702270508, + "learning_rate": 2.0064158858644765e-05, + "loss": 1.7734, + "step": 23237 + }, + { + "epoch": 7.132596685082873, + "grad_norm": 0.19060610234737396, + "learning_rate": 2.0060177783086614e-05, + "loss": 1.7209, + "step": 23238 + }, + { + "epoch": 7.132903621853898, + "grad_norm": 0.18050087988376617, + "learning_rate": 2.0056197003411342e-05, + "loss": 1.6882, + "step": 23239 + }, + { + "epoch": 7.133210558624923, + "grad_norm": 0.1504158228635788, + "learning_rate": 2.005221651965828e-05, + "loss": 1.687, + "step": 23240 + }, + { + "epoch": 7.133517495395949, + "grad_norm": 0.22980810701847076, + "learning_rate": 2.004823633186676e-05, + "loss": 1.7254, + "step": 23241 + }, + { + "epoch": 7.133824432166974, + "grad_norm": 0.20092199742794037, + "learning_rate": 2.004425644007613e-05, + "loss": 1.7234, + "step": 23242 + }, + { + "epoch": 7.134131368937998, + "grad_norm": 0.21002927422523499, + "learning_rate": 2.0040276844325718e-05, + "loss": 1.7272, + "step": 23243 + }, 
+ { + "epoch": 7.134438305709024, + "grad_norm": 0.18524625897407532, + "learning_rate": 2.003629754465484e-05, + "loss": 1.7189, + "step": 23244 + }, + { + "epoch": 7.134745242480049, + "grad_norm": 0.21095192432403564, + "learning_rate": 2.0032318541102845e-05, + "loss": 1.7177, + "step": 23245 + }, + { + "epoch": 7.135052179251074, + "grad_norm": 0.1700662076473236, + "learning_rate": 2.0028339833709037e-05, + "loss": 1.6925, + "step": 23246 + }, + { + "epoch": 7.1353591160221, + "grad_norm": 0.2123938947916031, + "learning_rate": 2.002436142251272e-05, + "loss": 1.7623, + "step": 23247 + }, + { + "epoch": 7.135666052793125, + "grad_norm": 0.194299578666687, + "learning_rate": 2.0020383307553275e-05, + "loss": 1.6898, + "step": 23248 + }, + { + "epoch": 7.1359729895641495, + "grad_norm": 0.18740688264369965, + "learning_rate": 2.001640548886993e-05, + "loss": 1.6519, + "step": 23249 + }, + { + "epoch": 7.136279926335175, + "grad_norm": 0.18891027569770813, + "learning_rate": 2.0012427966502085e-05, + "loss": 1.6895, + "step": 23250 + }, + { + "epoch": 7.1365868631062, + "grad_norm": 0.21313735842704773, + "learning_rate": 2.000845074048896e-05, + "loss": 1.6829, + "step": 23251 + }, + { + "epoch": 7.1368937998772255, + "grad_norm": 0.2438332885503769, + "learning_rate": 2.0004473810869923e-05, + "loss": 1.7723, + "step": 23252 + }, + { + "epoch": 7.137200736648251, + "grad_norm": 0.24475115537643433, + "learning_rate": 2.0000497177684257e-05, + "loss": 1.7192, + "step": 23253 + }, + { + "epoch": 7.137507673419275, + "grad_norm": 0.1936563402414322, + "learning_rate": 1.9996520840971267e-05, + "loss": 1.7462, + "step": 23254 + }, + { + "epoch": 7.137814610190301, + "grad_norm": 0.22365616261959076, + "learning_rate": 1.9992544800770236e-05, + "loss": 1.7405, + "step": 23255 + }, + { + "epoch": 7.138121546961326, + "grad_norm": 0.191316619515419, + "learning_rate": 1.9988569057120472e-05, + "loss": 1.6466, + "step": 23256 + }, + { + "epoch": 7.138428483732351, + 
"grad_norm": 0.24758055806159973, + "learning_rate": 1.9984593610061253e-05, + "loss": 1.7689, + "step": 23257 + }, + { + "epoch": 7.138735420503377, + "grad_norm": 0.2144414782524109, + "learning_rate": 1.9980618459631874e-05, + "loss": 1.7158, + "step": 23258 + }, + { + "epoch": 7.139042357274401, + "grad_norm": 0.24254034459590912, + "learning_rate": 1.9976643605871614e-05, + "loss": 1.7998, + "step": 23259 + }, + { + "epoch": 7.139349294045426, + "grad_norm": 0.21013480424880981, + "learning_rate": 1.9972669048819765e-05, + "loss": 1.7231, + "step": 23260 + }, + { + "epoch": 7.139656230816452, + "grad_norm": 0.2169421911239624, + "learning_rate": 1.9968694788515603e-05, + "loss": 1.7182, + "step": 23261 + }, + { + "epoch": 7.139963167587477, + "grad_norm": 0.19591476023197174, + "learning_rate": 1.9964720824998395e-05, + "loss": 1.7114, + "step": 23262 + }, + { + "epoch": 7.140270104358502, + "grad_norm": 0.1775221824645996, + "learning_rate": 1.9960747158307417e-05, + "loss": 1.6754, + "step": 23263 + }, + { + "epoch": 7.140577041129528, + "grad_norm": 0.19318300485610962, + "learning_rate": 1.995677378848193e-05, + "loss": 1.6794, + "step": 23264 + }, + { + "epoch": 7.140883977900552, + "grad_norm": 0.19659662246704102, + "learning_rate": 1.995280071556125e-05, + "loss": 1.703, + "step": 23265 + }, + { + "epoch": 7.1411909146715775, + "grad_norm": 0.22100697457790375, + "learning_rate": 1.994882793958457e-05, + "loss": 1.6821, + "step": 23266 + }, + { + "epoch": 7.141497851442603, + "grad_norm": 0.20475365221500397, + "learning_rate": 1.9944855460591217e-05, + "loss": 1.727, + "step": 23267 + }, + { + "epoch": 7.141804788213628, + "grad_norm": 0.2202025055885315, + "learning_rate": 1.9940883278620383e-05, + "loss": 1.7248, + "step": 23268 + }, + { + "epoch": 7.1421117249846535, + "grad_norm": 0.1800462007522583, + "learning_rate": 1.993691139371138e-05, + "loss": 1.7276, + "step": 23269 + }, + { + "epoch": 7.142418661755678, + "grad_norm": 0.2896895110607147, 
+ "learning_rate": 1.9932939805903433e-05, + "loss": 1.7275, + "step": 23270 + }, + { + "epoch": 7.142725598526703, + "grad_norm": 0.21308782696723938, + "learning_rate": 1.99289685152358e-05, + "loss": 1.6645, + "step": 23271 + }, + { + "epoch": 7.143032535297729, + "grad_norm": 0.20210005342960358, + "learning_rate": 1.992499752174773e-05, + "loss": 1.6899, + "step": 23272 + }, + { + "epoch": 7.143339472068754, + "grad_norm": 0.18419797718524933, + "learning_rate": 1.9921026825478455e-05, + "loss": 1.7088, + "step": 23273 + }, + { + "epoch": 7.143646408839779, + "grad_norm": 0.19155149161815643, + "learning_rate": 1.9917056426467227e-05, + "loss": 1.719, + "step": 23274 + }, + { + "epoch": 7.143953345610804, + "grad_norm": 0.17220313847064972, + "learning_rate": 1.9913086324753278e-05, + "loss": 1.7408, + "step": 23275 + }, + { + "epoch": 7.144260282381829, + "grad_norm": 0.18474969267845154, + "learning_rate": 1.990911652037585e-05, + "loss": 1.7189, + "step": 23276 + }, + { + "epoch": 7.144567219152854, + "grad_norm": 0.18529154360294342, + "learning_rate": 1.9905147013374165e-05, + "loss": 1.7075, + "step": 23277 + }, + { + "epoch": 7.14487415592388, + "grad_norm": 0.18569569289684296, + "learning_rate": 1.9901177803787452e-05, + "loss": 1.7116, + "step": 23278 + }, + { + "epoch": 7.145181092694905, + "grad_norm": 0.17149175703525543, + "learning_rate": 1.9897208891654946e-05, + "loss": 1.6873, + "step": 23279 + }, + { + "epoch": 7.14548802946593, + "grad_norm": 0.18012240529060364, + "learning_rate": 1.9893240277015868e-05, + "loss": 1.709, + "step": 23280 + }, + { + "epoch": 7.145794966236955, + "grad_norm": 0.18372172117233276, + "learning_rate": 1.9889271959909412e-05, + "loss": 1.7134, + "step": 23281 + }, + { + "epoch": 7.14610190300798, + "grad_norm": 0.20667128264904022, + "learning_rate": 1.9885303940374856e-05, + "loss": 1.7452, + "step": 23282 + }, + { + "epoch": 7.1464088397790055, + "grad_norm": 0.18145184218883514, + "learning_rate": 
1.9881336218451346e-05, + "loss": 1.7358, + "step": 23283 + }, + { + "epoch": 7.146715776550031, + "grad_norm": 0.179911807179451, + "learning_rate": 1.987736879417816e-05, + "loss": 1.6698, + "step": 23284 + }, + { + "epoch": 7.147022713321056, + "grad_norm": 0.18944865465164185, + "learning_rate": 1.9873401667594426e-05, + "loss": 1.7725, + "step": 23285 + }, + { + "epoch": 7.147329650092081, + "grad_norm": 0.1926117241382599, + "learning_rate": 1.986943483873942e-05, + "loss": 1.7829, + "step": 23286 + }, + { + "epoch": 7.147636586863106, + "grad_norm": 0.330503910779953, + "learning_rate": 1.9865468307652318e-05, + "loss": 1.7408, + "step": 23287 + }, + { + "epoch": 7.147943523634131, + "grad_norm": 0.22677597403526306, + "learning_rate": 1.9861502074372324e-05, + "loss": 1.7013, + "step": 23288 + }, + { + "epoch": 7.148250460405157, + "grad_norm": 0.1859201192855835, + "learning_rate": 1.9857536138938627e-05, + "loss": 1.7215, + "step": 23289 + }, + { + "epoch": 7.148557397176182, + "grad_norm": 0.22151269018650055, + "learning_rate": 1.9853570501390427e-05, + "loss": 1.6781, + "step": 23290 + }, + { + "epoch": 7.148864333947207, + "grad_norm": 0.16455405950546265, + "learning_rate": 1.984960516176691e-05, + "loss": 1.6518, + "step": 23291 + }, + { + "epoch": 7.149171270718232, + "grad_norm": 0.19687162339687347, + "learning_rate": 1.9845640120107267e-05, + "loss": 1.7375, + "step": 23292 + }, + { + "epoch": 7.149478207489257, + "grad_norm": 0.19174890220165253, + "learning_rate": 1.9841675376450686e-05, + "loss": 1.7017, + "step": 23293 + }, + { + "epoch": 7.149785144260282, + "grad_norm": 0.18458877503871918, + "learning_rate": 1.983771093083634e-05, + "loss": 1.7256, + "step": 23294 + }, + { + "epoch": 7.150092081031308, + "grad_norm": 0.212035670876503, + "learning_rate": 1.983374678330342e-05, + "loss": 1.698, + "step": 23295 + }, + { + "epoch": 7.150399017802333, + "grad_norm": 0.1793123185634613, + "learning_rate": 1.982978293389109e-05, + "loss": 
1.7012, + "step": 23296 + }, + { + "epoch": 7.150705954573358, + "grad_norm": 0.2359405905008316, + "learning_rate": 1.9825819382638526e-05, + "loss": 1.7423, + "step": 23297 + }, + { + "epoch": 7.151012891344383, + "grad_norm": 0.17125526070594788, + "learning_rate": 1.9821856129584888e-05, + "loss": 1.6825, + "step": 23298 + }, + { + "epoch": 7.151319828115408, + "grad_norm": 0.2084828019142151, + "learning_rate": 1.9817893174769392e-05, + "loss": 1.6991, + "step": 23299 + }, + { + "epoch": 7.151626764886434, + "grad_norm": 0.27647483348846436, + "learning_rate": 1.9813930518231127e-05, + "loss": 1.7425, + "step": 23300 + }, + { + "epoch": 7.151933701657459, + "grad_norm": 0.23517926037311554, + "learning_rate": 1.980996816000933e-05, + "loss": 1.8411, + "step": 23301 + }, + { + "epoch": 7.152240638428483, + "grad_norm": 0.19960010051727295, + "learning_rate": 1.980600610014309e-05, + "loss": 1.7302, + "step": 23302 + }, + { + "epoch": 7.152547575199509, + "grad_norm": 0.18953165411949158, + "learning_rate": 1.9802044338671604e-05, + "loss": 1.7252, + "step": 23303 + }, + { + "epoch": 7.152854511970534, + "grad_norm": 0.1718905121088028, + "learning_rate": 1.979808287563402e-05, + "loss": 1.656, + "step": 23304 + }, + { + "epoch": 7.153161448741559, + "grad_norm": 0.17233465611934662, + "learning_rate": 1.9794121711069487e-05, + "loss": 1.6732, + "step": 23305 + }, + { + "epoch": 7.153468385512585, + "grad_norm": 0.17677003145217896, + "learning_rate": 1.979016084501714e-05, + "loss": 1.7266, + "step": 23306 + }, + { + "epoch": 7.153775322283609, + "grad_norm": 0.1815326064825058, + "learning_rate": 1.9786200277516136e-05, + "loss": 1.7029, + "step": 23307 + }, + { + "epoch": 7.1540822590546345, + "grad_norm": 0.20937341451644897, + "learning_rate": 1.978224000860561e-05, + "loss": 1.711, + "step": 23308 + }, + { + "epoch": 7.15438919582566, + "grad_norm": 0.2045155018568039, + "learning_rate": 1.97782800383247e-05, + "loss": 1.7557, + "step": 23309 + }, + { + 
"epoch": 7.154696132596685, + "grad_norm": 0.16426041722297668, + "learning_rate": 1.9774320366712533e-05, + "loss": 1.7373, + "step": 23310 + }, + { + "epoch": 7.1550030693677105, + "grad_norm": 0.18058224022388458, + "learning_rate": 1.977036099380825e-05, + "loss": 1.6957, + "step": 23311 + }, + { + "epoch": 7.155310006138736, + "grad_norm": 0.23552078008651733, + "learning_rate": 1.9766401919650983e-05, + "loss": 1.8032, + "step": 23312 + }, + { + "epoch": 7.15561694290976, + "grad_norm": 0.19097596406936646, + "learning_rate": 1.9762443144279852e-05, + "loss": 1.7447, + "step": 23313 + }, + { + "epoch": 7.155923879680786, + "grad_norm": 0.17892403900623322, + "learning_rate": 1.975848466773398e-05, + "loss": 1.7117, + "step": 23314 + }, + { + "epoch": 7.156230816451811, + "grad_norm": 0.18331217765808105, + "learning_rate": 1.9754526490052467e-05, + "loss": 1.6669, + "step": 23315 + }, + { + "epoch": 7.156537753222836, + "grad_norm": 0.19914311170578003, + "learning_rate": 1.975056861127449e-05, + "loss": 1.6731, + "step": 23316 + }, + { + "epoch": 7.156844689993862, + "grad_norm": 0.21710485219955444, + "learning_rate": 1.9746611031439083e-05, + "loss": 1.7214, + "step": 23317 + }, + { + "epoch": 7.157151626764886, + "grad_norm": 0.19703111052513123, + "learning_rate": 1.9742653750585437e-05, + "loss": 1.7185, + "step": 23318 + }, + { + "epoch": 7.157458563535911, + "grad_norm": 0.18581365048885345, + "learning_rate": 1.9738696768752585e-05, + "loss": 1.7113, + "step": 23319 + }, + { + "epoch": 7.157765500306937, + "grad_norm": 0.1703677624464035, + "learning_rate": 1.9734740085979687e-05, + "loss": 1.6755, + "step": 23320 + }, + { + "epoch": 7.158072437077962, + "grad_norm": 0.16760937869548798, + "learning_rate": 1.9730783702305826e-05, + "loss": 1.7082, + "step": 23321 + }, + { + "epoch": 7.158379373848987, + "grad_norm": 0.20183983445167542, + "learning_rate": 1.97268276177701e-05, + "loss": 1.7503, + "step": 23322 + }, + { + "epoch": 7.158686310620013, + 
"grad_norm": 0.18407952785491943, + "learning_rate": 1.972287183241163e-05, + "loss": 1.6807, + "step": 23323 + }, + { + "epoch": 7.158993247391037, + "grad_norm": 0.20135276019573212, + "learning_rate": 1.9718916346269446e-05, + "loss": 1.8001, + "step": 23324 + }, + { + "epoch": 7.1593001841620625, + "grad_norm": 0.1781267672777176, + "learning_rate": 1.9714961159382693e-05, + "loss": 1.683, + "step": 23325 + }, + { + "epoch": 7.159607120933088, + "grad_norm": 0.24990373849868774, + "learning_rate": 1.971100627179045e-05, + "loss": 1.7235, + "step": 23326 + }, + { + "epoch": 7.159914057704113, + "grad_norm": 0.19463174045085907, + "learning_rate": 1.9707051683531796e-05, + "loss": 1.735, + "step": 23327 + }, + { + "epoch": 7.1602209944751385, + "grad_norm": 0.1988895982503891, + "learning_rate": 1.9703097394645813e-05, + "loss": 1.7495, + "step": 23328 + }, + { + "epoch": 7.160527931246163, + "grad_norm": 0.1760931760072708, + "learning_rate": 1.9699143405171576e-05, + "loss": 1.6914, + "step": 23329 + }, + { + "epoch": 7.160834868017188, + "grad_norm": 0.18537557125091553, + "learning_rate": 1.9695189715148166e-05, + "loss": 1.7601, + "step": 23330 + }, + { + "epoch": 7.161141804788214, + "grad_norm": 0.2476375252008438, + "learning_rate": 1.9691236324614654e-05, + "loss": 1.8218, + "step": 23331 + }, + { + "epoch": 7.161448741559239, + "grad_norm": 0.17736093699932098, + "learning_rate": 1.968728323361009e-05, + "loss": 1.6872, + "step": 23332 + }, + { + "epoch": 7.161755678330264, + "grad_norm": 0.1851162612438202, + "learning_rate": 1.9683330442173598e-05, + "loss": 1.712, + "step": 23333 + }, + { + "epoch": 7.162062615101289, + "grad_norm": 0.20326650142669678, + "learning_rate": 1.967937795034417e-05, + "loss": 1.7668, + "step": 23334 + }, + { + "epoch": 7.162369551872314, + "grad_norm": 0.21020451188087463, + "learning_rate": 1.9675425758160925e-05, + "loss": 1.7135, + "step": 23335 + }, + { + "epoch": 7.162676488643339, + "grad_norm": 0.21629111468791962, 
+ "learning_rate": 1.967147386566287e-05, + "loss": 1.7181, + "step": 23336 + }, + { + "epoch": 7.162983425414365, + "grad_norm": 0.18086732923984528, + "learning_rate": 1.9667522272889104e-05, + "loss": 1.7107, + "step": 23337 + }, + { + "epoch": 7.16329036218539, + "grad_norm": 0.16542381048202515, + "learning_rate": 1.9663570979878658e-05, + "loss": 1.7156, + "step": 23338 + }, + { + "epoch": 7.163597298956415, + "grad_norm": 0.18775032460689545, + "learning_rate": 1.9659619986670587e-05, + "loss": 1.6955, + "step": 23339 + }, + { + "epoch": 7.16390423572744, + "grad_norm": 0.19227592647075653, + "learning_rate": 1.9655669293303953e-05, + "loss": 1.7545, + "step": 23340 + }, + { + "epoch": 7.164211172498465, + "grad_norm": 0.1935085654258728, + "learning_rate": 1.9651718899817746e-05, + "loss": 1.7183, + "step": 23341 + }, + { + "epoch": 7.1645181092694905, + "grad_norm": 0.17873792350292206, + "learning_rate": 1.9647768806251056e-05, + "loss": 1.6644, + "step": 23342 + }, + { + "epoch": 7.164825046040516, + "grad_norm": 0.25024256110191345, + "learning_rate": 1.96438190126429e-05, + "loss": 1.7621, + "step": 23343 + }, + { + "epoch": 7.165131982811541, + "grad_norm": 0.15957331657409668, + "learning_rate": 1.9639869519032323e-05, + "loss": 1.6525, + "step": 23344 + }, + { + "epoch": 7.165438919582566, + "grad_norm": 0.19967027008533478, + "learning_rate": 1.9635920325458347e-05, + "loss": 1.7533, + "step": 23345 + }, + { + "epoch": 7.165745856353591, + "grad_norm": 0.17413713037967682, + "learning_rate": 1.9631971431960005e-05, + "loss": 1.6962, + "step": 23346 + }, + { + "epoch": 7.166052793124616, + "grad_norm": 0.19787384569644928, + "learning_rate": 1.9628022838576315e-05, + "loss": 1.7369, + "step": 23347 + }, + { + "epoch": 7.166359729895642, + "grad_norm": 0.1726577877998352, + "learning_rate": 1.962407454534631e-05, + "loss": 1.7004, + "step": 23348 + }, + { + "epoch": 7.166666666666667, + "grad_norm": 0.2136315256357193, + "learning_rate": 
1.962012655230899e-05, + "loss": 1.7411, + "step": 23349 + }, + { + "epoch": 7.166973603437691, + "grad_norm": 0.18257126212120056, + "learning_rate": 1.9616178859503414e-05, + "loss": 1.7155, + "step": 23350 + }, + { + "epoch": 7.167280540208717, + "grad_norm": 0.18696577847003937, + "learning_rate": 1.961223146696854e-05, + "loss": 1.7272, + "step": 23351 + }, + { + "epoch": 7.167587476979742, + "grad_norm": 0.16375793516635895, + "learning_rate": 1.9608284374743435e-05, + "loss": 1.6706, + "step": 23352 + }, + { + "epoch": 7.167894413750767, + "grad_norm": 0.19589200615882874, + "learning_rate": 1.960433758286704e-05, + "loss": 1.7018, + "step": 23353 + }, + { + "epoch": 7.168201350521793, + "grad_norm": 0.18434208631515503, + "learning_rate": 1.9600391091378417e-05, + "loss": 1.6776, + "step": 23354 + }, + { + "epoch": 7.168508287292818, + "grad_norm": 0.23839476704597473, + "learning_rate": 1.9596444900316545e-05, + "loss": 1.7501, + "step": 23355 + }, + { + "epoch": 7.1688152240638425, + "grad_norm": 0.20229686796665192, + "learning_rate": 1.9592499009720428e-05, + "loss": 1.7249, + "step": 23356 + }, + { + "epoch": 7.169122160834868, + "grad_norm": 0.2422642856836319, + "learning_rate": 1.9588553419629076e-05, + "loss": 1.7621, + "step": 23357 + }, + { + "epoch": 7.169429097605893, + "grad_norm": 0.21856555342674255, + "learning_rate": 1.9584608130081422e-05, + "loss": 1.7362, + "step": 23358 + }, + { + "epoch": 7.1697360343769185, + "grad_norm": 0.19434040784835815, + "learning_rate": 1.958066314111652e-05, + "loss": 1.6888, + "step": 23359 + }, + { + "epoch": 7.170042971147944, + "grad_norm": 0.19806630909442902, + "learning_rate": 1.9576718452773335e-05, + "loss": 1.7461, + "step": 23360 + }, + { + "epoch": 7.170349907918968, + "grad_norm": 0.19190531969070435, + "learning_rate": 1.957277406509085e-05, + "loss": 1.6992, + "step": 23361 + }, + { + "epoch": 7.170656844689994, + "grad_norm": 0.20990152657032013, + "learning_rate": 1.9568829978108044e-05, + 
"loss": 1.7095, + "step": 23362 + }, + { + "epoch": 7.170963781461019, + "grad_norm": 0.18638263642787933, + "learning_rate": 1.9564886191863897e-05, + "loss": 1.7024, + "step": 23363 + }, + { + "epoch": 7.171270718232044, + "grad_norm": 0.1974666863679886, + "learning_rate": 1.9560942706397383e-05, + "loss": 1.6901, + "step": 23364 + }, + { + "epoch": 7.17157765500307, + "grad_norm": 0.171469047665596, + "learning_rate": 1.955699952174747e-05, + "loss": 1.717, + "step": 23365 + }, + { + "epoch": 7.171884591774095, + "grad_norm": 0.17386725544929504, + "learning_rate": 1.955305663795312e-05, + "loss": 1.7069, + "step": 23366 + }, + { + "epoch": 7.172191528545119, + "grad_norm": 0.1869814246892929, + "learning_rate": 1.954911405505334e-05, + "loss": 1.7478, + "step": 23367 + }, + { + "epoch": 7.172498465316145, + "grad_norm": 0.19253556430339813, + "learning_rate": 1.9545171773087033e-05, + "loss": 1.7129, + "step": 23368 + }, + { + "epoch": 7.17280540208717, + "grad_norm": 0.1625998616218567, + "learning_rate": 1.954122979209322e-05, + "loss": 1.7055, + "step": 23369 + }, + { + "epoch": 7.173112338858195, + "grad_norm": 0.172325998544693, + "learning_rate": 1.953728811211079e-05, + "loss": 1.71, + "step": 23370 + }, + { + "epoch": 7.173419275629221, + "grad_norm": 0.22542965412139893, + "learning_rate": 1.9533346733178753e-05, + "loss": 1.7548, + "step": 23371 + }, + { + "epoch": 7.173726212400245, + "grad_norm": 0.1547299474477768, + "learning_rate": 1.9529405655336042e-05, + "loss": 1.6509, + "step": 23372 + }, + { + "epoch": 7.1740331491712706, + "grad_norm": 0.21720515191555023, + "learning_rate": 1.95254648786216e-05, + "loss": 1.7427, + "step": 23373 + }, + { + "epoch": 7.174340085942296, + "grad_norm": 0.18855944275856018, + "learning_rate": 1.95215244030744e-05, + "loss": 1.7471, + "step": 23374 + }, + { + "epoch": 7.174647022713321, + "grad_norm": 0.21088628470897675, + "learning_rate": 1.951758422873332e-05, + "loss": 1.7457, + "step": 23375 + }, + { + 
"epoch": 7.1749539594843466, + "grad_norm": 0.20596840977668762, + "learning_rate": 1.951364435563736e-05, + "loss": 1.7098, + "step": 23376 + }, + { + "epoch": 7.175260896255371, + "grad_norm": 0.20098064839839935, + "learning_rate": 1.9509704783825433e-05, + "loss": 1.7225, + "step": 23377 + }, + { + "epoch": 7.175567833026396, + "grad_norm": 0.20860125124454498, + "learning_rate": 1.950576551333647e-05, + "loss": 1.7071, + "step": 23378 + }, + { + "epoch": 7.175874769797422, + "grad_norm": 0.1914912760257721, + "learning_rate": 1.950182654420941e-05, + "loss": 1.7262, + "step": 23379 + }, + { + "epoch": 7.176181706568447, + "grad_norm": 0.21109424531459808, + "learning_rate": 1.9497887876483178e-05, + "loss": 1.6601, + "step": 23380 + }, + { + "epoch": 7.176488643339472, + "grad_norm": 0.20514877140522003, + "learning_rate": 1.949394951019669e-05, + "loss": 1.7612, + "step": 23381 + }, + { + "epoch": 7.176795580110497, + "grad_norm": 0.20280246436595917, + "learning_rate": 1.949001144538888e-05, + "loss": 1.6754, + "step": 23382 + }, + { + "epoch": 7.177102516881522, + "grad_norm": 0.1724841594696045, + "learning_rate": 1.9486073682098654e-05, + "loss": 1.7252, + "step": 23383 + }, + { + "epoch": 7.1774094536525475, + "grad_norm": 0.16961625218391418, + "learning_rate": 1.948213622036493e-05, + "loss": 1.6835, + "step": 23384 + }, + { + "epoch": 7.177716390423573, + "grad_norm": 0.17938925325870514, + "learning_rate": 1.947819906022661e-05, + "loss": 1.6909, + "step": 23385 + }, + { + "epoch": 7.178023327194598, + "grad_norm": 0.19711901247501373, + "learning_rate": 1.9474262201722655e-05, + "loss": 1.7275, + "step": 23386 + }, + { + "epoch": 7.1783302639656235, + "grad_norm": 0.19549165666103363, + "learning_rate": 1.947032564489189e-05, + "loss": 1.7609, + "step": 23387 + }, + { + "epoch": 7.178637200736648, + "grad_norm": 0.20358525216579437, + "learning_rate": 1.9466389389773284e-05, + "loss": 1.7127, + "step": 23388 + }, + { + "epoch": 7.178944137507673, + 
"grad_norm": 0.18345355987548828, + "learning_rate": 1.946245343640571e-05, + "loss": 1.6807, + "step": 23389 + }, + { + "epoch": 7.179251074278699, + "grad_norm": 0.20261847972869873, + "learning_rate": 1.9458517784828074e-05, + "loss": 1.717, + "step": 23390 + }, + { + "epoch": 7.179558011049724, + "grad_norm": 0.18042106926441193, + "learning_rate": 1.9454582435079275e-05, + "loss": 1.7415, + "step": 23391 + }, + { + "epoch": 7.179864947820749, + "grad_norm": 0.1731836199760437, + "learning_rate": 1.945064738719817e-05, + "loss": 1.6661, + "step": 23392 + }, + { + "epoch": 7.180171884591774, + "grad_norm": 0.1971052885055542, + "learning_rate": 1.9446712641223685e-05, + "loss": 1.753, + "step": 23393 + }, + { + "epoch": 7.180478821362799, + "grad_norm": 0.22370313107967377, + "learning_rate": 1.94427781971947e-05, + "loss": 1.7118, + "step": 23394 + }, + { + "epoch": 7.180785758133824, + "grad_norm": 0.23129026591777802, + "learning_rate": 1.9438844055150086e-05, + "loss": 1.8087, + "step": 23395 + }, + { + "epoch": 7.18109269490485, + "grad_norm": 0.26353758573532104, + "learning_rate": 1.9434910215128727e-05, + "loss": 1.7147, + "step": 23396 + }, + { + "epoch": 7.181399631675875, + "grad_norm": 0.22333624958992004, + "learning_rate": 1.9430976677169504e-05, + "loss": 1.7403, + "step": 23397 + }, + { + "epoch": 7.1817065684469, + "grad_norm": 0.22191296517848969, + "learning_rate": 1.9427043441311284e-05, + "loss": 1.7125, + "step": 23398 + }, + { + "epoch": 7.182013505217925, + "grad_norm": 0.19174177944660187, + "learning_rate": 1.942311050759294e-05, + "loss": 1.7026, + "step": 23399 + }, + { + "epoch": 7.18232044198895, + "grad_norm": 0.2175525426864624, + "learning_rate": 1.9419177876053342e-05, + "loss": 1.6947, + "step": 23400 + }, + { + "epoch": 7.1826273787599755, + "grad_norm": 0.19419047236442566, + "learning_rate": 1.9415245546731348e-05, + "loss": 1.7309, + "step": 23401 + }, + { + "epoch": 7.182934315531001, + "grad_norm": 0.22568467259407043, + 
"learning_rate": 1.9411313519665806e-05, + "loss": 1.7177, + "step": 23402 + }, + { + "epoch": 7.183241252302026, + "grad_norm": 0.26983609795570374, + "learning_rate": 1.9407381794895635e-05, + "loss": 1.6779, + "step": 23403 + }, + { + "epoch": 7.183548189073051, + "grad_norm": 0.1651962548494339, + "learning_rate": 1.9403450372459602e-05, + "loss": 1.6718, + "step": 23404 + }, + { + "epoch": 7.183855125844076, + "grad_norm": 0.2337920367717743, + "learning_rate": 1.9399519252396653e-05, + "loss": 1.7271, + "step": 23405 + }, + { + "epoch": 7.184162062615101, + "grad_norm": 0.20093166828155518, + "learning_rate": 1.9395588434745547e-05, + "loss": 1.7274, + "step": 23406 + }, + { + "epoch": 7.184468999386127, + "grad_norm": 0.22497716546058655, + "learning_rate": 1.9391657919545193e-05, + "loss": 1.7419, + "step": 23407 + }, + { + "epoch": 7.184775936157152, + "grad_norm": 0.22474822402000427, + "learning_rate": 1.938772770683443e-05, + "loss": 1.8317, + "step": 23408 + }, + { + "epoch": 7.185082872928176, + "grad_norm": 0.18015392124652863, + "learning_rate": 1.9383797796652052e-05, + "loss": 1.6568, + "step": 23409 + }, + { + "epoch": 7.185389809699202, + "grad_norm": 0.18696026504039764, + "learning_rate": 1.9379868189036947e-05, + "loss": 1.6722, + "step": 23410 + }, + { + "epoch": 7.185696746470227, + "grad_norm": 0.1828698217868805, + "learning_rate": 1.9375938884027934e-05, + "loss": 1.7477, + "step": 23411 + }, + { + "epoch": 7.186003683241252, + "grad_norm": 0.20442047715187073, + "learning_rate": 1.937200988166384e-05, + "loss": 1.7269, + "step": 23412 + }, + { + "epoch": 7.186310620012278, + "grad_norm": 0.17201031744480133, + "learning_rate": 1.9368081181983494e-05, + "loss": 1.6893, + "step": 23413 + }, + { + "epoch": 7.186617556783303, + "grad_norm": 0.21501687169075012, + "learning_rate": 1.9364152785025723e-05, + "loss": 1.771, + "step": 23414 + }, + { + "epoch": 7.1869244935543275, + "grad_norm": 0.18059030175209045, + "learning_rate": 
1.936022469082936e-05, + "loss": 1.7088, + "step": 23415 + }, + { + "epoch": 7.187231430325353, + "grad_norm": 0.18079128861427307, + "learning_rate": 1.9356296899433206e-05, + "loss": 1.764, + "step": 23416 + }, + { + "epoch": 7.187538367096378, + "grad_norm": 0.1960453987121582, + "learning_rate": 1.9352369410876086e-05, + "loss": 1.7302, + "step": 23417 + }, + { + "epoch": 7.1878453038674035, + "grad_norm": 0.19896337389945984, + "learning_rate": 1.9348442225196815e-05, + "loss": 1.7228, + "step": 23418 + }, + { + "epoch": 7.188152240638429, + "grad_norm": 0.19272227585315704, + "learning_rate": 1.9344515342434192e-05, + "loss": 1.7164, + "step": 23419 + }, + { + "epoch": 7.188459177409453, + "grad_norm": 0.16746973991394043, + "learning_rate": 1.9340588762627066e-05, + "loss": 1.696, + "step": 23420 + }, + { + "epoch": 7.188766114180479, + "grad_norm": 0.2421095222234726, + "learning_rate": 1.9336662485814178e-05, + "loss": 1.766, + "step": 23421 + }, + { + "epoch": 7.189073050951504, + "grad_norm": 0.17857256531715393, + "learning_rate": 1.93327365120344e-05, + "loss": 1.7216, + "step": 23422 + }, + { + "epoch": 7.189379987722529, + "grad_norm": 0.19336672127246857, + "learning_rate": 1.932881084132646e-05, + "loss": 1.7124, + "step": 23423 + }, + { + "epoch": 7.189686924493555, + "grad_norm": 0.1555519700050354, + "learning_rate": 1.9324885473729204e-05, + "loss": 1.6491, + "step": 23424 + }, + { + "epoch": 7.189993861264579, + "grad_norm": 0.17879530787467957, + "learning_rate": 1.9320960409281425e-05, + "loss": 1.697, + "step": 23425 + }, + { + "epoch": 7.190300798035604, + "grad_norm": 0.17966939508914948, + "learning_rate": 1.9317035648021862e-05, + "loss": 1.6786, + "step": 23426 + }, + { + "epoch": 7.19060773480663, + "grad_norm": 0.21742603182792664, + "learning_rate": 1.9313111189989375e-05, + "loss": 1.734, + "step": 23427 + }, + { + "epoch": 7.190914671577655, + "grad_norm": 0.22135521471500397, + "learning_rate": 1.9309187035222675e-05, + "loss": 
1.7154, + "step": 23428 + }, + { + "epoch": 7.19122160834868, + "grad_norm": 0.17866137623786926, + "learning_rate": 1.930526318376059e-05, + "loss": 1.6723, + "step": 23429 + }, + { + "epoch": 7.191528545119706, + "grad_norm": 0.26034823060035706, + "learning_rate": 1.9301339635641887e-05, + "loss": 1.6975, + "step": 23430 + }, + { + "epoch": 7.19183548189073, + "grad_norm": 0.21550825238227844, + "learning_rate": 1.929741639090534e-05, + "loss": 1.7401, + "step": 23431 + }, + { + "epoch": 7.1921424186617555, + "grad_norm": 0.19205132126808167, + "learning_rate": 1.9293493449589718e-05, + "loss": 1.6543, + "step": 23432 + }, + { + "epoch": 7.192449355432781, + "grad_norm": 0.18724635243415833, + "learning_rate": 1.928957081173379e-05, + "loss": 1.7752, + "step": 23433 + }, + { + "epoch": 7.192756292203806, + "grad_norm": 0.2392650544643402, + "learning_rate": 1.928564847737633e-05, + "loss": 1.7008, + "step": 23434 + }, + { + "epoch": 7.1930632289748315, + "grad_norm": 0.18950903415679932, + "learning_rate": 1.9281726446556088e-05, + "loss": 1.7193, + "step": 23435 + }, + { + "epoch": 7.193370165745856, + "grad_norm": 0.2542276978492737, + "learning_rate": 1.9277804719311808e-05, + "loss": 1.7192, + "step": 23436 + }, + { + "epoch": 7.193677102516881, + "grad_norm": 0.1987142711877823, + "learning_rate": 1.927388329568231e-05, + "loss": 1.6943, + "step": 23437 + }, + { + "epoch": 7.193984039287907, + "grad_norm": 0.18837273120880127, + "learning_rate": 1.9269962175706275e-05, + "loss": 1.7443, + "step": 23438 + }, + { + "epoch": 7.194290976058932, + "grad_norm": 0.20432044565677643, + "learning_rate": 1.9266041359422514e-05, + "loss": 1.741, + "step": 23439 + }, + { + "epoch": 7.194597912829957, + "grad_norm": 0.17763052880764008, + "learning_rate": 1.9262120846869715e-05, + "loss": 1.6696, + "step": 23440 + }, + { + "epoch": 7.194904849600983, + "grad_norm": 0.1747766137123108, + "learning_rate": 1.9258200638086665e-05, + "loss": 1.6727, + "step": 23441 + }, + { 
+ "epoch": 7.195211786372007, + "grad_norm": 0.22058527171611786, + "learning_rate": 1.9254280733112117e-05, + "loss": 1.7387, + "step": 23442 + }, + { + "epoch": 7.195518723143032, + "grad_norm": 0.2247757911682129, + "learning_rate": 1.925036113198475e-05, + "loss": 1.7828, + "step": 23443 + }, + { + "epoch": 7.195825659914058, + "grad_norm": 0.16923101246356964, + "learning_rate": 1.924644183474337e-05, + "loss": 1.6655, + "step": 23444 + }, + { + "epoch": 7.196132596685083, + "grad_norm": 0.1599757820367813, + "learning_rate": 1.924252284142665e-05, + "loss": 1.7002, + "step": 23445 + }, + { + "epoch": 7.196439533456108, + "grad_norm": 0.1916438341140747, + "learning_rate": 1.9238604152073358e-05, + "loss": 1.71, + "step": 23446 + }, + { + "epoch": 7.196746470227133, + "grad_norm": 0.18037991225719452, + "learning_rate": 1.9234685766722216e-05, + "loss": 1.6786, + "step": 23447 + }, + { + "epoch": 7.197053406998158, + "grad_norm": 0.20671263337135315, + "learning_rate": 1.9230767685411938e-05, + "loss": 1.7228, + "step": 23448 + }, + { + "epoch": 7.1973603437691835, + "grad_norm": 0.18949514627456665, + "learning_rate": 1.9226849908181243e-05, + "loss": 1.7794, + "step": 23449 + }, + { + "epoch": 7.197667280540209, + "grad_norm": 0.19457660615444183, + "learning_rate": 1.9222932435068857e-05, + "loss": 1.7153, + "step": 23450 + }, + { + "epoch": 7.197974217311234, + "grad_norm": 0.16834792494773865, + "learning_rate": 1.9219015266113494e-05, + "loss": 1.646, + "step": 23451 + }, + { + "epoch": 7.198281154082259, + "grad_norm": 0.21668508648872375, + "learning_rate": 1.9215098401353866e-05, + "loss": 1.7232, + "step": 23452 + }, + { + "epoch": 7.198588090853284, + "grad_norm": 0.1675579994916916, + "learning_rate": 1.9211181840828656e-05, + "loss": 1.6963, + "step": 23453 + }, + { + "epoch": 7.198895027624309, + "grad_norm": 0.19915352761745453, + "learning_rate": 1.9207265584576627e-05, + "loss": 1.7043, + "step": 23454 + }, + { + "epoch": 7.199201964395335, + 
"grad_norm": 0.23872216045856476, + "learning_rate": 1.920334963263642e-05, + "loss": 1.7784, + "step": 23455 + }, + { + "epoch": 7.19950890116636, + "grad_norm": 0.261321485042572, + "learning_rate": 1.919943398504679e-05, + "loss": 1.8024, + "step": 23456 + }, + { + "epoch": 7.199815837937384, + "grad_norm": 0.17026741802692413, + "learning_rate": 1.9195518641846377e-05, + "loss": 1.7451, + "step": 23457 + }, + { + "epoch": 7.20012277470841, + "grad_norm": 0.20935678482055664, + "learning_rate": 1.9191603603073915e-05, + "loss": 1.752, + "step": 23458 + }, + { + "epoch": 7.200429711479435, + "grad_norm": 0.1756788194179535, + "learning_rate": 1.9187688868768107e-05, + "loss": 1.7008, + "step": 23459 + }, + { + "epoch": 7.2007366482504604, + "grad_norm": 0.23286345601081848, + "learning_rate": 1.9183774438967577e-05, + "loss": 1.7603, + "step": 23460 + }, + { + "epoch": 7.201043585021486, + "grad_norm": 0.17519986629486084, + "learning_rate": 1.917986031371109e-05, + "loss": 1.7127, + "step": 23461 + }, + { + "epoch": 7.201350521792511, + "grad_norm": 0.2603212893009186, + "learning_rate": 1.917594649303725e-05, + "loss": 1.7169, + "step": 23462 + }, + { + "epoch": 7.201657458563536, + "grad_norm": 0.2664981484413147, + "learning_rate": 1.9172032976984792e-05, + "loss": 1.7349, + "step": 23463 + }, + { + "epoch": 7.201964395334561, + "grad_norm": 0.15484265983104706, + "learning_rate": 1.9168119765592375e-05, + "loss": 1.6753, + "step": 23464 + }, + { + "epoch": 7.202271332105586, + "grad_norm": 0.22310250997543335, + "learning_rate": 1.9164206858898664e-05, + "loss": 1.6994, + "step": 23465 + }, + { + "epoch": 7.202578268876612, + "grad_norm": 0.1998710036277771, + "learning_rate": 1.9160294256942336e-05, + "loss": 1.7556, + "step": 23466 + }, + { + "epoch": 7.202885205647637, + "grad_norm": 0.2092670500278473, + "learning_rate": 1.9156381959762058e-05, + "loss": 1.6883, + "step": 23467 + }, + { + "epoch": 7.203192142418661, + "grad_norm": 0.20657336711883545, + 
"learning_rate": 1.915246996739649e-05, + "loss": 1.8035, + "step": 23468 + }, + { + "epoch": 7.203499079189687, + "grad_norm": 0.2175077497959137, + "learning_rate": 1.9148558279884294e-05, + "loss": 1.7173, + "step": 23469 + }, + { + "epoch": 7.203806015960712, + "grad_norm": 0.16851630806922913, + "learning_rate": 1.9144646897264114e-05, + "loss": 1.6874, + "step": 23470 + }, + { + "epoch": 7.204112952731737, + "grad_norm": 0.23194117844104767, + "learning_rate": 1.9140735819574647e-05, + "loss": 1.7156, + "step": 23471 + }, + { + "epoch": 7.204419889502763, + "grad_norm": 0.17139053344726562, + "learning_rate": 1.9136825046854483e-05, + "loss": 1.6997, + "step": 23472 + }, + { + "epoch": 7.204726826273788, + "grad_norm": 0.18561725318431854, + "learning_rate": 1.913291457914234e-05, + "loss": 1.6575, + "step": 23473 + }, + { + "epoch": 7.2050337630448125, + "grad_norm": 0.2333156019449234, + "learning_rate": 1.9129004416476793e-05, + "loss": 1.7453, + "step": 23474 + }, + { + "epoch": 7.205340699815838, + "grad_norm": 0.2594338655471802, + "learning_rate": 1.9125094558896534e-05, + "loss": 1.7087, + "step": 23475 + }, + { + "epoch": 7.205647636586863, + "grad_norm": 0.16303664445877075, + "learning_rate": 1.91211850064402e-05, + "loss": 1.6985, + "step": 23476 + }, + { + "epoch": 7.2059545733578885, + "grad_norm": 0.2592144012451172, + "learning_rate": 1.9117275759146387e-05, + "loss": 1.7196, + "step": 23477 + }, + { + "epoch": 7.206261510128914, + "grad_norm": 0.1643611341714859, + "learning_rate": 1.9113366817053784e-05, + "loss": 1.686, + "step": 23478 + }, + { + "epoch": 7.206568446899938, + "grad_norm": 0.19730710983276367, + "learning_rate": 1.9109458180200966e-05, + "loss": 1.6883, + "step": 23479 + }, + { + "epoch": 7.206875383670964, + "grad_norm": 0.16942749917507172, + "learning_rate": 1.9105549848626602e-05, + "loss": 1.7272, + "step": 23480 + }, + { + "epoch": 7.207182320441989, + "grad_norm": 0.21967467665672302, + "learning_rate": 
1.91016418223693e-05, + "loss": 1.7501, + "step": 23481 + }, + { + "epoch": 7.207489257213014, + "grad_norm": 0.17037035524845123, + "learning_rate": 1.9097734101467684e-05, + "loss": 1.72, + "step": 23482 + }, + { + "epoch": 7.20779619398404, + "grad_norm": 0.21497979760169983, + "learning_rate": 1.9093826685960374e-05, + "loss": 1.6993, + "step": 23483 + }, + { + "epoch": 7.208103130755064, + "grad_norm": 0.1462371051311493, + "learning_rate": 1.9089919575885985e-05, + "loss": 1.6249, + "step": 23484 + }, + { + "epoch": 7.208410067526089, + "grad_norm": 0.1863165646791458, + "learning_rate": 1.9086012771283122e-05, + "loss": 1.6343, + "step": 23485 + }, + { + "epoch": 7.208717004297115, + "grad_norm": 0.1705196648836136, + "learning_rate": 1.9082106272190403e-05, + "loss": 1.7115, + "step": 23486 + }, + { + "epoch": 7.20902394106814, + "grad_norm": 0.20928895473480225, + "learning_rate": 1.9078200078646413e-05, + "loss": 1.6953, + "step": 23487 + }, + { + "epoch": 7.209330877839165, + "grad_norm": 0.2172931581735611, + "learning_rate": 1.9074294190689812e-05, + "loss": 1.7436, + "step": 23488 + }, + { + "epoch": 7.209637814610191, + "grad_norm": 0.1760822981595993, + "learning_rate": 1.9070388608359124e-05, + "loss": 1.6898, + "step": 23489 + }, + { + "epoch": 7.209944751381215, + "grad_norm": 0.28154727816581726, + "learning_rate": 1.9066483331693018e-05, + "loss": 1.7583, + "step": 23490 + }, + { + "epoch": 7.2102516881522405, + "grad_norm": 0.28375890851020813, + "learning_rate": 1.9062578360730027e-05, + "loss": 1.7428, + "step": 23491 + }, + { + "epoch": 7.210558624923266, + "grad_norm": 0.2173614352941513, + "learning_rate": 1.905867369550878e-05, + "loss": 1.6902, + "step": 23492 + }, + { + "epoch": 7.210865561694291, + "grad_norm": 0.2525392174720764, + "learning_rate": 1.9054769336067875e-05, + "loss": 1.7205, + "step": 23493 + }, + { + "epoch": 7.2111724984653165, + "grad_norm": 0.22913219034671783, + "learning_rate": 1.905086528244584e-05, + "loss": 
1.7269, + "step": 23494 + }, + { + "epoch": 7.211479435236341, + "grad_norm": 0.2174263298511505, + "learning_rate": 1.9046961534681327e-05, + "loss": 1.7058, + "step": 23495 + }, + { + "epoch": 7.211786372007366, + "grad_norm": 0.2277042120695114, + "learning_rate": 1.9043058092812848e-05, + "loss": 1.7048, + "step": 23496 + }, + { + "epoch": 7.212093308778392, + "grad_norm": 0.17835062742233276, + "learning_rate": 1.9039154956879036e-05, + "loss": 1.7258, + "step": 23497 + }, + { + "epoch": 7.212400245549417, + "grad_norm": 0.22751156985759735, + "learning_rate": 1.903525212691844e-05, + "loss": 1.708, + "step": 23498 + }, + { + "epoch": 7.212707182320442, + "grad_norm": 0.21247950196266174, + "learning_rate": 1.903134960296963e-05, + "loss": 1.7142, + "step": 23499 + }, + { + "epoch": 7.213014119091467, + "grad_norm": 0.2256091684103012, + "learning_rate": 1.9027447385071175e-05, + "loss": 1.6826, + "step": 23500 + }, + { + "epoch": 7.213321055862492, + "grad_norm": 0.16704921424388885, + "learning_rate": 1.902354547326164e-05, + "loss": 1.6639, + "step": 23501 + }, + { + "epoch": 7.213627992633517, + "grad_norm": 0.20211774110794067, + "learning_rate": 1.901964386757958e-05, + "loss": 1.7448, + "step": 23502 + }, + { + "epoch": 7.213934929404543, + "grad_norm": 0.2090187519788742, + "learning_rate": 1.901574256806356e-05, + "loss": 1.7425, + "step": 23503 + }, + { + "epoch": 7.214241866175568, + "grad_norm": 0.1942494809627533, + "learning_rate": 1.9011841574752114e-05, + "loss": 1.721, + "step": 23504 + }, + { + "epoch": 7.214548802946593, + "grad_norm": 0.1842714548110962, + "learning_rate": 1.900794088768385e-05, + "loss": 1.7092, + "step": 23505 + }, + { + "epoch": 7.214855739717618, + "grad_norm": 0.16807401180267334, + "learning_rate": 1.900404050689724e-05, + "loss": 1.6788, + "step": 23506 + }, + { + "epoch": 7.215162676488643, + "grad_norm": 0.16467349231243134, + "learning_rate": 1.9000140432430907e-05, + "loss": 1.6544, + "step": 23507 + }, + { + 
"epoch": 7.2154696132596685, + "grad_norm": 0.1806645542383194, + "learning_rate": 1.899624066432332e-05, + "loss": 1.6871, + "step": 23508 + }, + { + "epoch": 7.215776550030694, + "grad_norm": 0.16891708970069885, + "learning_rate": 1.8992341202613073e-05, + "loss": 1.6912, + "step": 23509 + }, + { + "epoch": 7.216083486801719, + "grad_norm": 0.21191391348838806, + "learning_rate": 1.89884420473387e-05, + "loss": 1.7843, + "step": 23510 + }, + { + "epoch": 7.216390423572744, + "grad_norm": 0.18484020233154297, + "learning_rate": 1.8984543198538684e-05, + "loss": 1.699, + "step": 23511 + }, + { + "epoch": 7.216697360343769, + "grad_norm": 0.2106105536222458, + "learning_rate": 1.8980644656251627e-05, + "loss": 1.7239, + "step": 23512 + }, + { + "epoch": 7.217004297114794, + "grad_norm": 0.19923320412635803, + "learning_rate": 1.8976746420515988e-05, + "loss": 1.7989, + "step": 23513 + }, + { + "epoch": 7.21731123388582, + "grad_norm": 0.21371988952159882, + "learning_rate": 1.897284849137034e-05, + "loss": 1.7071, + "step": 23514 + }, + { + "epoch": 7.217618170656845, + "grad_norm": 0.20450851321220398, + "learning_rate": 1.8968950868853184e-05, + "loss": 1.7051, + "step": 23515 + }, + { + "epoch": 7.21792510742787, + "grad_norm": 0.22700995206832886, + "learning_rate": 1.8965053553003055e-05, + "loss": 1.7556, + "step": 23516 + }, + { + "epoch": 7.218232044198895, + "grad_norm": 0.26295945048332214, + "learning_rate": 1.896115654385845e-05, + "loss": 1.7893, + "step": 23517 + }, + { + "epoch": 7.21853898096992, + "grad_norm": 0.17091867327690125, + "learning_rate": 1.8957259841457885e-05, + "loss": 1.7289, + "step": 23518 + }, + { + "epoch": 7.218845917740945, + "grad_norm": 0.24840304255485535, + "learning_rate": 1.8953363445839877e-05, + "loss": 1.6958, + "step": 23519 + }, + { + "epoch": 7.219152854511971, + "grad_norm": 0.20042046904563904, + "learning_rate": 1.8949467357042926e-05, + "loss": 1.743, + "step": 23520 + }, + { + "epoch": 7.219459791282996, + 
"grad_norm": 0.18286047875881195, + "learning_rate": 1.894557157510552e-05, + "loss": 1.7065, + "step": 23521 + }, + { + "epoch": 7.2197667280540205, + "grad_norm": 0.18324656784534454, + "learning_rate": 1.894167610006622e-05, + "loss": 1.7083, + "step": 23522 + }, + { + "epoch": 7.220073664825046, + "grad_norm": 0.17110426723957062, + "learning_rate": 1.8937780931963432e-05, + "loss": 1.7016, + "step": 23523 + }, + { + "epoch": 7.220380601596071, + "grad_norm": 0.19164881110191345, + "learning_rate": 1.8933886070835743e-05, + "loss": 1.7011, + "step": 23524 + }, + { + "epoch": 7.2206875383670965, + "grad_norm": 0.16899923980236053, + "learning_rate": 1.892999151672157e-05, + "loss": 1.7227, + "step": 23525 + }, + { + "epoch": 7.220994475138122, + "grad_norm": 0.18763495981693268, + "learning_rate": 1.8926097269659437e-05, + "loss": 1.6956, + "step": 23526 + }, + { + "epoch": 7.221301411909146, + "grad_norm": 0.1665162295103073, + "learning_rate": 1.8922203329687847e-05, + "loss": 1.7039, + "step": 23527 + }, + { + "epoch": 7.221608348680172, + "grad_norm": 0.20766250789165497, + "learning_rate": 1.8918309696845226e-05, + "loss": 1.7703, + "step": 23528 + }, + { + "epoch": 7.221915285451197, + "grad_norm": 0.1813010275363922, + "learning_rate": 1.891441637117012e-05, + "loss": 1.6709, + "step": 23529 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 0.15327073633670807, + "learning_rate": 1.891052335270094e-05, + "loss": 1.6518, + "step": 23530 + }, + { + "epoch": 7.222529158993248, + "grad_norm": 0.17191094160079956, + "learning_rate": 1.8906630641476203e-05, + "loss": 1.7193, + "step": 23531 + }, + { + "epoch": 7.222836095764273, + "grad_norm": 0.17976176738739014, + "learning_rate": 1.8902738237534363e-05, + "loss": 1.7162, + "step": 23532 + }, + { + "epoch": 7.223143032535297, + "grad_norm": 0.1828993558883667, + "learning_rate": 1.8898846140913894e-05, + "loss": 1.7163, + "step": 23533 + }, + { + "epoch": 7.223449969306323, + "grad_norm": 
0.15828034281730652, + "learning_rate": 1.889495435165326e-05, + "loss": 1.6734, + "step": 23534 + }, + { + "epoch": 7.223756906077348, + "grad_norm": 0.2171369194984436, + "learning_rate": 1.8891062869790915e-05, + "loss": 1.7508, + "step": 23535 + }, + { + "epoch": 7.224063842848373, + "grad_norm": 0.18747110664844513, + "learning_rate": 1.888717169536532e-05, + "loss": 1.7162, + "step": 23536 + }, + { + "epoch": 7.224370779619399, + "grad_norm": 0.19177328050136566, + "learning_rate": 1.8883280828414927e-05, + "loss": 1.7044, + "step": 23537 + }, + { + "epoch": 7.224677716390423, + "grad_norm": 0.175906702876091, + "learning_rate": 1.88793902689782e-05, + "loss": 1.7126, + "step": 23538 + }, + { + "epoch": 7.2249846531614486, + "grad_norm": 0.17842896282672882, + "learning_rate": 1.887550001709357e-05, + "loss": 1.7469, + "step": 23539 + }, + { + "epoch": 7.225291589932474, + "grad_norm": 0.23797607421875, + "learning_rate": 1.8871610072799478e-05, + "loss": 1.7343, + "step": 23540 + }, + { + "epoch": 7.225598526703499, + "grad_norm": 0.2297922819852829, + "learning_rate": 1.8867720436134412e-05, + "loss": 1.7453, + "step": 23541 + }, + { + "epoch": 7.225905463474525, + "grad_norm": 0.19950568675994873, + "learning_rate": 1.8863831107136748e-05, + "loss": 1.6984, + "step": 23542 + }, + { + "epoch": 7.226212400245549, + "grad_norm": 0.2809087038040161, + "learning_rate": 1.8859942085844974e-05, + "loss": 1.7815, + "step": 23543 + }, + { + "epoch": 7.226519337016574, + "grad_norm": 0.20534642040729523, + "learning_rate": 1.8856053372297515e-05, + "loss": 1.7455, + "step": 23544 + }, + { + "epoch": 7.2268262737876, + "grad_norm": 0.20052307844161987, + "learning_rate": 1.885216496653276e-05, + "loss": 1.6655, + "step": 23545 + }, + { + "epoch": 7.227133210558625, + "grad_norm": 0.1948573738336563, + "learning_rate": 1.8848276868589205e-05, + "loss": 1.7036, + "step": 23546 + }, + { + "epoch": 7.22744014732965, + "grad_norm": 0.16764269769191742, + "learning_rate": 
1.8844389078505197e-05, + "loss": 1.6605, + "step": 23547 + }, + { + "epoch": 7.227747084100676, + "grad_norm": 0.17951633036136627, + "learning_rate": 1.8840501596319214e-05, + "loss": 1.6948, + "step": 23548 + }, + { + "epoch": 7.2280540208717, + "grad_norm": 0.1906418353319168, + "learning_rate": 1.883661442206966e-05, + "loss": 1.7122, + "step": 23549 + }, + { + "epoch": 7.2283609576427255, + "grad_norm": 0.19535204768180847, + "learning_rate": 1.8832727555794943e-05, + "loss": 1.7089, + "step": 23550 + }, + { + "epoch": 7.228667894413751, + "grad_norm": 0.20654071867465973, + "learning_rate": 1.8828840997533488e-05, + "loss": 1.7113, + "step": 23551 + }, + { + "epoch": 7.228974831184776, + "grad_norm": 0.18860456347465515, + "learning_rate": 1.8824954747323692e-05, + "loss": 1.7475, + "step": 23552 + }, + { + "epoch": 7.2292817679558015, + "grad_norm": 0.21949729323387146, + "learning_rate": 1.882106880520396e-05, + "loss": 1.7819, + "step": 23553 + }, + { + "epoch": 7.229588704726826, + "grad_norm": 0.2177286595106125, + "learning_rate": 1.881718317121271e-05, + "loss": 1.7554, + "step": 23554 + }, + { + "epoch": 7.229895641497851, + "grad_norm": 0.21143296360969543, + "learning_rate": 1.8813297845388328e-05, + "loss": 1.7811, + "step": 23555 + }, + { + "epoch": 7.230202578268877, + "grad_norm": 0.24787208437919617, + "learning_rate": 1.880941282776922e-05, + "loss": 1.707, + "step": 23556 + }, + { + "epoch": 7.230509515039902, + "grad_norm": 0.18048164248466492, + "learning_rate": 1.880552811839375e-05, + "loss": 1.6841, + "step": 23557 + }, + { + "epoch": 7.230816451810927, + "grad_norm": 0.24056772887706757, + "learning_rate": 1.8801643717300375e-05, + "loss": 1.7868, + "step": 23558 + }, + { + "epoch": 7.231123388581952, + "grad_norm": 0.18564146757125854, + "learning_rate": 1.879775962452741e-05, + "loss": 1.7506, + "step": 23559 + }, + { + "epoch": 7.231430325352977, + "grad_norm": 0.25965458154678345, + "learning_rate": 1.87938758401133e-05, + "loss": 
1.7307, + "step": 23560 + }, + { + "epoch": 7.231737262124002, + "grad_norm": 0.17774315178394318, + "learning_rate": 1.8789992364096394e-05, + "loss": 1.7089, + "step": 23561 + }, + { + "epoch": 7.232044198895028, + "grad_norm": 0.2488560527563095, + "learning_rate": 1.878610919651505e-05, + "loss": 1.6811, + "step": 23562 + }, + { + "epoch": 7.232351135666053, + "grad_norm": 0.1963108628988266, + "learning_rate": 1.8782226337407703e-05, + "loss": 1.6512, + "step": 23563 + }, + { + "epoch": 7.232658072437078, + "grad_norm": 0.25702449679374695, + "learning_rate": 1.8778343786812663e-05, + "loss": 1.7697, + "step": 23564 + }, + { + "epoch": 7.232965009208103, + "grad_norm": 0.18145591020584106, + "learning_rate": 1.8774461544768347e-05, + "loss": 1.6842, + "step": 23565 + }, + { + "epoch": 7.233271945979128, + "grad_norm": 0.2482728213071823, + "learning_rate": 1.87705796113131e-05, + "loss": 1.7028, + "step": 23566 + }, + { + "epoch": 7.2335788827501535, + "grad_norm": 0.16365976631641388, + "learning_rate": 1.8766697986485293e-05, + "loss": 1.7266, + "step": 23567 + }, + { + "epoch": 7.233885819521179, + "grad_norm": 0.1877463459968567, + "learning_rate": 1.876281667032328e-05, + "loss": 1.6909, + "step": 23568 + }, + { + "epoch": 7.234192756292204, + "grad_norm": 0.19121702015399933, + "learning_rate": 1.8758935662865423e-05, + "loss": 1.7303, + "step": 23569 + }, + { + "epoch": 7.234499693063229, + "grad_norm": 0.1783505082130432, + "learning_rate": 1.8755054964150072e-05, + "loss": 1.7209, + "step": 23570 + }, + { + "epoch": 7.234806629834254, + "grad_norm": 0.172771617770195, + "learning_rate": 1.8751174574215585e-05, + "loss": 1.6824, + "step": 23571 + }, + { + "epoch": 7.235113566605279, + "grad_norm": 0.1675102859735489, + "learning_rate": 1.8747294493100304e-05, + "loss": 1.6664, + "step": 23572 + }, + { + "epoch": 7.235420503376305, + "grad_norm": 0.18213391304016113, + "learning_rate": 1.8743414720842578e-05, + "loss": 1.6725, + "step": 23573 + }, + { + 
"epoch": 7.23572744014733, + "grad_norm": 0.2204304337501526, + "learning_rate": 1.8739535257480728e-05, + "loss": 1.7662, + "step": 23574 + }, + { + "epoch": 7.236034376918354, + "grad_norm": 0.22732098400592804, + "learning_rate": 1.873565610305315e-05, + "loss": 1.7808, + "step": 23575 + }, + { + "epoch": 7.23634131368938, + "grad_norm": 0.17859263718128204, + "learning_rate": 1.8731777257598128e-05, + "loss": 1.6767, + "step": 23576 + }, + { + "epoch": 7.236648250460405, + "grad_norm": 0.16690675914287567, + "learning_rate": 1.8727898721154007e-05, + "loss": 1.6523, + "step": 23577 + }, + { + "epoch": 7.23695518723143, + "grad_norm": 0.17576774954795837, + "learning_rate": 1.872402049375912e-05, + "loss": 1.6951, + "step": 23578 + }, + { + "epoch": 7.237262124002456, + "grad_norm": 0.20455172657966614, + "learning_rate": 1.8720142575451777e-05, + "loss": 1.7402, + "step": 23579 + }, + { + "epoch": 7.237569060773481, + "grad_norm": 0.2122879922389984, + "learning_rate": 1.8716264966270352e-05, + "loss": 1.7571, + "step": 23580 + }, + { + "epoch": 7.2378759975445055, + "grad_norm": 0.17752611637115479, + "learning_rate": 1.87123876662531e-05, + "loss": 1.7185, + "step": 23581 + }, + { + "epoch": 7.238182934315531, + "grad_norm": 0.21253602206707, + "learning_rate": 1.87085106754384e-05, + "loss": 1.7281, + "step": 23582 + }, + { + "epoch": 7.238489871086556, + "grad_norm": 0.19470329582691193, + "learning_rate": 1.8704633993864514e-05, + "loss": 1.6772, + "step": 23583 + }, + { + "epoch": 7.2387968078575815, + "grad_norm": 0.19556869566440582, + "learning_rate": 1.8700757621569786e-05, + "loss": 1.6888, + "step": 23584 + }, + { + "epoch": 7.239103744628607, + "grad_norm": 0.20525780320167542, + "learning_rate": 1.869688155859252e-05, + "loss": 1.7517, + "step": 23585 + }, + { + "epoch": 7.239410681399631, + "grad_norm": 0.23367032408714294, + "learning_rate": 1.869300580497102e-05, + "loss": 1.781, + "step": 23586 + }, + { + "epoch": 7.239717618170657, + 
"grad_norm": 0.1893240362405777, + "learning_rate": 1.8689130360743583e-05, + "loss": 1.7265, + "step": 23587 + }, + { + "epoch": 7.240024554941682, + "grad_norm": 0.17136700451374054, + "learning_rate": 1.868525522594851e-05, + "loss": 1.6631, + "step": 23588 + }, + { + "epoch": 7.240331491712707, + "grad_norm": 0.1984632909297943, + "learning_rate": 1.8681380400624103e-05, + "loss": 1.7337, + "step": 23589 + }, + { + "epoch": 7.240638428483733, + "grad_norm": 0.19046886265277863, + "learning_rate": 1.867750588480865e-05, + "loss": 1.7094, + "step": 23590 + }, + { + "epoch": 7.240945365254758, + "grad_norm": 0.18242189288139343, + "learning_rate": 1.8673631678540427e-05, + "loss": 1.692, + "step": 23591 + }, + { + "epoch": 7.241252302025782, + "grad_norm": 0.1741522252559662, + "learning_rate": 1.8669757781857768e-05, + "loss": 1.6975, + "step": 23592 + }, + { + "epoch": 7.241559238796808, + "grad_norm": 0.1778191328048706, + "learning_rate": 1.866588419479891e-05, + "loss": 1.7092, + "step": 23593 + }, + { + "epoch": 7.241866175567833, + "grad_norm": 0.17402158677577972, + "learning_rate": 1.866201091740215e-05, + "loss": 1.7072, + "step": 23594 + }, + { + "epoch": 7.242173112338858, + "grad_norm": 0.22215119004249573, + "learning_rate": 1.8658137949705763e-05, + "loss": 1.7205, + "step": 23595 + }, + { + "epoch": 7.242480049109884, + "grad_norm": 0.15291182696819305, + "learning_rate": 1.8654265291748013e-05, + "loss": 1.7341, + "step": 23596 + }, + { + "epoch": 7.242786985880908, + "grad_norm": 0.18226875364780426, + "learning_rate": 1.8650392943567217e-05, + "loss": 1.6731, + "step": 23597 + }, + { + "epoch": 7.2430939226519335, + "grad_norm": 0.19169047474861145, + "learning_rate": 1.864652090520158e-05, + "loss": 1.777, + "step": 23598 + }, + { + "epoch": 7.243400859422959, + "grad_norm": 0.2063349187374115, + "learning_rate": 1.8642649176689437e-05, + "loss": 1.7258, + "step": 23599 + }, + { + "epoch": 7.243707796193984, + "grad_norm": 0.18550212681293488, 
+ "learning_rate": 1.863877775806898e-05, + "loss": 1.7041, + "step": 23600 + }, + { + "epoch": 7.2440147329650095, + "grad_norm": 0.21196649968624115, + "learning_rate": 1.8634906649378514e-05, + "loss": 1.6672, + "step": 23601 + }, + { + "epoch": 7.244321669736034, + "grad_norm": 0.26801541447639465, + "learning_rate": 1.863103585065629e-05, + "loss": 1.6981, + "step": 23602 + }, + { + "epoch": 7.244628606507059, + "grad_norm": 0.1854090690612793, + "learning_rate": 1.862716536194055e-05, + "loss": 1.7406, + "step": 23603 + }, + { + "epoch": 7.244935543278085, + "grad_norm": 0.15906888246536255, + "learning_rate": 1.8623295183269556e-05, + "loss": 1.6721, + "step": 23604 + }, + { + "epoch": 7.24524248004911, + "grad_norm": 0.2210245132446289, + "learning_rate": 1.8619425314681547e-05, + "loss": 1.7717, + "step": 23605 + }, + { + "epoch": 7.245549416820135, + "grad_norm": 0.17654140293598175, + "learning_rate": 1.861555575621477e-05, + "loss": 1.7428, + "step": 23606 + }, + { + "epoch": 7.245856353591161, + "grad_norm": 0.1582319736480713, + "learning_rate": 1.8611686507907466e-05, + "loss": 1.6814, + "step": 23607 + }, + { + "epoch": 7.246163290362185, + "grad_norm": 0.18817248940467834, + "learning_rate": 1.8607817569797852e-05, + "loss": 1.74, + "step": 23608 + }, + { + "epoch": 7.24647022713321, + "grad_norm": 0.26141074299812317, + "learning_rate": 1.8603948941924227e-05, + "loss": 1.6966, + "step": 23609 + }, + { + "epoch": 7.246777163904236, + "grad_norm": 0.16877111792564392, + "learning_rate": 1.8600080624324757e-05, + "loss": 1.6849, + "step": 23610 + }, + { + "epoch": 7.247084100675261, + "grad_norm": 0.16188141703605652, + "learning_rate": 1.8596212617037694e-05, + "loss": 1.6342, + "step": 23611 + }, + { + "epoch": 7.247391037446286, + "grad_norm": 0.19506491720676422, + "learning_rate": 1.8592344920101267e-05, + "loss": 1.6874, + "step": 23612 + }, + { + "epoch": 7.247697974217311, + "grad_norm": 0.1865006536245346, + "learning_rate": 
1.8588477533553677e-05, + "loss": 1.7365, + "step": 23613 + }, + { + "epoch": 7.248004910988336, + "grad_norm": 0.16737428307533264, + "learning_rate": 1.85846104574332e-05, + "loss": 1.6971, + "step": 23614 + }, + { + "epoch": 7.2483118477593615, + "grad_norm": 0.1754695028066635, + "learning_rate": 1.858074369177798e-05, + "loss": 1.7133, + "step": 23615 + }, + { + "epoch": 7.248618784530387, + "grad_norm": 0.21066173911094666, + "learning_rate": 1.85768772366263e-05, + "loss": 1.7737, + "step": 23616 + }, + { + "epoch": 7.248925721301412, + "grad_norm": 0.2530418932437897, + "learning_rate": 1.8573011092016303e-05, + "loss": 1.7962, + "step": 23617 + }, + { + "epoch": 7.249232658072437, + "grad_norm": 0.17780029773712158, + "learning_rate": 1.8569145257986247e-05, + "loss": 1.6691, + "step": 23618 + }, + { + "epoch": 7.249539594843462, + "grad_norm": 0.2105826437473297, + "learning_rate": 1.856527973457432e-05, + "loss": 1.6943, + "step": 23619 + }, + { + "epoch": 7.249846531614487, + "grad_norm": 0.20929837226867676, + "learning_rate": 1.856141452181872e-05, + "loss": 1.7223, + "step": 23620 + }, + { + "epoch": 7.250153468385513, + "grad_norm": 0.17105531692504883, + "learning_rate": 1.8557549619757653e-05, + "loss": 1.6956, + "step": 23621 + }, + { + "epoch": 7.250460405156538, + "grad_norm": 0.21282736957073212, + "learning_rate": 1.8553685028429306e-05, + "loss": 1.7299, + "step": 23622 + }, + { + "epoch": 7.250767341927563, + "grad_norm": 0.1673511266708374, + "learning_rate": 1.8549820747871882e-05, + "loss": 1.7184, + "step": 23623 + }, + { + "epoch": 7.251074278698588, + "grad_norm": 0.1877487152814865, + "learning_rate": 1.854595677812356e-05, + "loss": 1.6989, + "step": 23624 + }, + { + "epoch": 7.251381215469613, + "grad_norm": 0.1709173619747162, + "learning_rate": 1.8542093119222504e-05, + "loss": 1.6994, + "step": 23625 + }, + { + "epoch": 7.2516881522406385, + "grad_norm": 0.18894633650779724, + "learning_rate": 1.8538229771206962e-05, + "loss": 
1.665, + "step": 23626 + }, + { + "epoch": 7.251995089011664, + "grad_norm": 0.17623448371887207, + "learning_rate": 1.8534366734115056e-05, + "loss": 1.6999, + "step": 23627 + }, + { + "epoch": 7.252302025782689, + "grad_norm": 0.20008981227874756, + "learning_rate": 1.8530504007984982e-05, + "loss": 1.7147, + "step": 23628 + }, + { + "epoch": 7.252608962553714, + "grad_norm": 0.2506260573863983, + "learning_rate": 1.852664159285491e-05, + "loss": 1.7485, + "step": 23629 + }, + { + "epoch": 7.252915899324739, + "grad_norm": 0.17746438086032867, + "learning_rate": 1.8522779488763e-05, + "loss": 1.7534, + "step": 23630 + }, + { + "epoch": 7.253222836095764, + "grad_norm": 0.1910836547613144, + "learning_rate": 1.8518917695747462e-05, + "loss": 1.7167, + "step": 23631 + }, + { + "epoch": 7.25352977286679, + "grad_norm": 0.18009543418884277, + "learning_rate": 1.8515056213846398e-05, + "loss": 1.6849, + "step": 23632 + }, + { + "epoch": 7.253836709637815, + "grad_norm": 0.18150615692138672, + "learning_rate": 1.851119504309804e-05, + "loss": 1.7077, + "step": 23633 + }, + { + "epoch": 7.25414364640884, + "grad_norm": 0.1874052882194519, + "learning_rate": 1.850733418354047e-05, + "loss": 1.7398, + "step": 23634 + }, + { + "epoch": 7.254450583179865, + "grad_norm": 0.18285217881202698, + "learning_rate": 1.8503473635211897e-05, + "loss": 1.7433, + "step": 23635 + }, + { + "epoch": 7.25475751995089, + "grad_norm": 0.19326861202716827, + "learning_rate": 1.8499613398150463e-05, + "loss": 1.7095, + "step": 23636 + }, + { + "epoch": 7.255064456721915, + "grad_norm": 0.21128259599208832, + "learning_rate": 1.849575347239431e-05, + "loss": 1.7352, + "step": 23637 + }, + { + "epoch": 7.255371393492941, + "grad_norm": 0.19309113919734955, + "learning_rate": 1.849189385798159e-05, + "loss": 1.7098, + "step": 23638 + }, + { + "epoch": 7.255678330263966, + "grad_norm": 0.1877751648426056, + "learning_rate": 1.848803455495044e-05, + "loss": 1.7279, + "step": 23639 + }, + { + 
"epoch": 7.2559852670349905, + "grad_norm": 0.18840502202510834, + "learning_rate": 1.8484175563339e-05, + "loss": 1.7174, + "step": 23640 + }, + { + "epoch": 7.256292203806016, + "grad_norm": 0.1912582963705063, + "learning_rate": 1.848031688318541e-05, + "loss": 1.6964, + "step": 23641 + }, + { + "epoch": 7.256599140577041, + "grad_norm": 0.188243106007576, + "learning_rate": 1.847645851452779e-05, + "loss": 1.7296, + "step": 23642 + }, + { + "epoch": 7.2569060773480665, + "grad_norm": 0.15838554501533508, + "learning_rate": 1.8472600457404317e-05, + "loss": 1.6276, + "step": 23643 + }, + { + "epoch": 7.257213014119092, + "grad_norm": 0.1605941653251648, + "learning_rate": 1.8468742711853065e-05, + "loss": 1.7015, + "step": 23644 + }, + { + "epoch": 7.257519950890116, + "grad_norm": 0.23647825419902802, + "learning_rate": 1.846488527791218e-05, + "loss": 1.775, + "step": 23645 + }, + { + "epoch": 7.257826887661142, + "grad_norm": 0.2414257973432541, + "learning_rate": 1.846102815561978e-05, + "loss": 1.7456, + "step": 23646 + }, + { + "epoch": 7.258133824432167, + "grad_norm": 0.221851646900177, + "learning_rate": 1.845717134501397e-05, + "loss": 1.6875, + "step": 23647 + }, + { + "epoch": 7.258440761203192, + "grad_norm": 0.20732705295085907, + "learning_rate": 1.8453314846132914e-05, + "loss": 1.6619, + "step": 23648 + }, + { + "epoch": 7.258747697974218, + "grad_norm": 0.18818728625774384, + "learning_rate": 1.8449458659014657e-05, + "loss": 1.6961, + "step": 23649 + }, + { + "epoch": 7.259054634745242, + "grad_norm": 0.19335074722766876, + "learning_rate": 1.8445602783697374e-05, + "loss": 1.6816, + "step": 23650 + }, + { + "epoch": 7.259361571516267, + "grad_norm": 0.27334100008010864, + "learning_rate": 1.844174722021911e-05, + "loss": 1.7435, + "step": 23651 + }, + { + "epoch": 7.259668508287293, + "grad_norm": 0.18763858079910278, + "learning_rate": 1.843789196861801e-05, + "loss": 1.713, + "step": 23652 + }, + { + "epoch": 7.259975445058318, + 
"grad_norm": 0.2585131525993347, + "learning_rate": 1.843403702893216e-05, + "loss": 1.7151, + "step": 23653 + }, + { + "epoch": 7.260282381829343, + "grad_norm": 0.182148277759552, + "learning_rate": 1.843018240119966e-05, + "loss": 1.7018, + "step": 23654 + }, + { + "epoch": 7.260589318600369, + "grad_norm": 0.31881436705589294, + "learning_rate": 1.84263280854586e-05, + "loss": 1.7428, + "step": 23655 + }, + { + "epoch": 7.260896255371393, + "grad_norm": 0.20997895300388336, + "learning_rate": 1.8422474081747073e-05, + "loss": 1.724, + "step": 23656 + }, + { + "epoch": 7.2612031921424185, + "grad_norm": 0.25038522481918335, + "learning_rate": 1.8418620390103163e-05, + "loss": 1.739, + "step": 23657 + }, + { + "epoch": 7.261510128913444, + "grad_norm": 0.22313323616981506, + "learning_rate": 1.841476701056496e-05, + "loss": 1.7493, + "step": 23658 + }, + { + "epoch": 7.261817065684469, + "grad_norm": 0.22516389191150665, + "learning_rate": 1.8410913943170522e-05, + "loss": 1.79, + "step": 23659 + }, + { + "epoch": 7.2621240024554945, + "grad_norm": 0.1966279298067093, + "learning_rate": 1.8407061187957982e-05, + "loss": 1.7418, + "step": 23660 + }, + { + "epoch": 7.262430939226519, + "grad_norm": 0.18697889149188995, + "learning_rate": 1.840320874496536e-05, + "loss": 1.7347, + "step": 23661 + }, + { + "epoch": 7.262737875997544, + "grad_norm": 0.18226566910743713, + "learning_rate": 1.8399356614230755e-05, + "loss": 1.6979, + "step": 23662 + }, + { + "epoch": 7.26304481276857, + "grad_norm": 0.18880577385425568, + "learning_rate": 1.839550479579223e-05, + "loss": 1.6612, + "step": 23663 + }, + { + "epoch": 7.263351749539595, + "grad_norm": 0.2048085480928421, + "learning_rate": 1.8391653289687826e-05, + "loss": 1.7313, + "step": 23664 + }, + { + "epoch": 7.26365868631062, + "grad_norm": 0.238912895321846, + "learning_rate": 1.838780209595567e-05, + "loss": 1.7522, + "step": 23665 + }, + { + "epoch": 7.263965623081646, + "grad_norm": 0.1656452864408493, + 
"learning_rate": 1.838395121463375e-05, + "loss": 1.6742, + "step": 23666 + }, + { + "epoch": 7.26427255985267, + "grad_norm": 0.2209266573190689, + "learning_rate": 1.8380100645760186e-05, + "loss": 1.6592, + "step": 23667 + }, + { + "epoch": 7.264579496623695, + "grad_norm": 0.19701217114925385, + "learning_rate": 1.8376250389372967e-05, + "loss": 1.7211, + "step": 23668 + }, + { + "epoch": 7.264886433394721, + "grad_norm": 0.229326069355011, + "learning_rate": 1.837240044551019e-05, + "loss": 1.7044, + "step": 23669 + }, + { + "epoch": 7.265193370165746, + "grad_norm": 0.18499960005283356, + "learning_rate": 1.8368550814209894e-05, + "loss": 1.705, + "step": 23670 + }, + { + "epoch": 7.265500306936771, + "grad_norm": 0.25504955649375916, + "learning_rate": 1.8364701495510117e-05, + "loss": 1.7246, + "step": 23671 + }, + { + "epoch": 7.265807243707796, + "grad_norm": 0.25998997688293457, + "learning_rate": 1.8360852489448903e-05, + "loss": 1.8311, + "step": 23672 + }, + { + "epoch": 7.266114180478821, + "grad_norm": 0.2437162697315216, + "learning_rate": 1.8357003796064294e-05, + "loss": 1.6467, + "step": 23673 + }, + { + "epoch": 7.2664211172498465, + "grad_norm": 0.20784614980220795, + "learning_rate": 1.8353155415394315e-05, + "loss": 1.7361, + "step": 23674 + }, + { + "epoch": 7.266728054020872, + "grad_norm": 0.22633932530879974, + "learning_rate": 1.8349307347476998e-05, + "loss": 1.6518, + "step": 23675 + }, + { + "epoch": 7.267034990791897, + "grad_norm": 0.19307547807693481, + "learning_rate": 1.8345459592350367e-05, + "loss": 1.7469, + "step": 23676 + }, + { + "epoch": 7.267341927562922, + "grad_norm": 0.20418168604373932, + "learning_rate": 1.8341612150052483e-05, + "loss": 1.6892, + "step": 23677 + }, + { + "epoch": 7.267648864333947, + "grad_norm": 0.1574825942516327, + "learning_rate": 1.8337765020621332e-05, + "loss": 1.6682, + "step": 23678 + }, + { + "epoch": 7.267955801104972, + "grad_norm": 0.31023111939430237, + "learning_rate": 
1.8333918204094947e-05, + "loss": 1.7382, + "step": 23679 + }, + { + "epoch": 7.268262737875998, + "grad_norm": 0.18148623406887054, + "learning_rate": 1.833007170051134e-05, + "loss": 1.726, + "step": 23680 + }, + { + "epoch": 7.268569674647023, + "grad_norm": 0.19278696179389954, + "learning_rate": 1.832622550990851e-05, + "loss": 1.7176, + "step": 23681 + }, + { + "epoch": 7.268876611418047, + "grad_norm": 0.18298377096652985, + "learning_rate": 1.832237963232452e-05, + "loss": 1.6703, + "step": 23682 + }, + { + "epoch": 7.269183548189073, + "grad_norm": 0.2019357681274414, + "learning_rate": 1.8318534067797304e-05, + "loss": 1.7771, + "step": 23683 + }, + { + "epoch": 7.269490484960098, + "grad_norm": 0.21978864073753357, + "learning_rate": 1.8314688816364944e-05, + "loss": 1.7938, + "step": 23684 + }, + { + "epoch": 7.269797421731123, + "grad_norm": 0.20009377598762512, + "learning_rate": 1.831084387806536e-05, + "loss": 1.7312, + "step": 23685 + }, + { + "epoch": 7.270104358502149, + "grad_norm": 0.16587263345718384, + "learning_rate": 1.8306999252936608e-05, + "loss": 1.7098, + "step": 23686 + }, + { + "epoch": 7.270411295273174, + "grad_norm": 0.20567362010478973, + "learning_rate": 1.8303154941016666e-05, + "loss": 1.6893, + "step": 23687 + }, + { + "epoch": 7.2707182320441985, + "grad_norm": 0.1916830986738205, + "learning_rate": 1.8299310942343527e-05, + "loss": 1.7995, + "step": 23688 + }, + { + "epoch": 7.271025168815224, + "grad_norm": 0.18361486494541168, + "learning_rate": 1.8295467256955174e-05, + "loss": 1.6708, + "step": 23689 + }, + { + "epoch": 7.271332105586249, + "grad_norm": 0.20620734989643097, + "learning_rate": 1.8291623884889597e-05, + "loss": 1.7314, + "step": 23690 + }, + { + "epoch": 7.2716390423572745, + "grad_norm": 0.22560660541057587, + "learning_rate": 1.828778082618478e-05, + "loss": 1.7418, + "step": 23691 + }, + { + "epoch": 7.2719459791283, + "grad_norm": 0.2113492786884308, + "learning_rate": 1.8283938080878697e-05, + 
"loss": 1.724, + "step": 23692 + }, + { + "epoch": 7.272252915899324, + "grad_norm": 0.26234012842178345, + "learning_rate": 1.8280095649009334e-05, + "loss": 1.7723, + "step": 23693 + }, + { + "epoch": 7.27255985267035, + "grad_norm": 0.1675095111131668, + "learning_rate": 1.827625353061465e-05, + "loss": 1.7473, + "step": 23694 + }, + { + "epoch": 7.272866789441375, + "grad_norm": 0.17751236259937286, + "learning_rate": 1.8272411725732623e-05, + "loss": 1.7374, + "step": 23695 + }, + { + "epoch": 7.2731737262124, + "grad_norm": 0.23158904910087585, + "learning_rate": 1.826857023440122e-05, + "loss": 1.8111, + "step": 23696 + }, + { + "epoch": 7.273480662983426, + "grad_norm": 0.17262183129787445, + "learning_rate": 1.8264729056658407e-05, + "loss": 1.7546, + "step": 23697 + }, + { + "epoch": 7.273787599754451, + "grad_norm": 0.20811094343662262, + "learning_rate": 1.8260888192542126e-05, + "loss": 1.8059, + "step": 23698 + }, + { + "epoch": 7.274094536525475, + "grad_norm": 0.17156411707401276, + "learning_rate": 1.825704764209038e-05, + "loss": 1.7261, + "step": 23699 + }, + { + "epoch": 7.274401473296501, + "grad_norm": 0.18523572385311127, + "learning_rate": 1.8253207405341067e-05, + "loss": 1.7139, + "step": 23700 + }, + { + "epoch": 7.274708410067526, + "grad_norm": 0.20626066625118256, + "learning_rate": 1.824936748233219e-05, + "loss": 1.7269, + "step": 23701 + }, + { + "epoch": 7.2750153468385514, + "grad_norm": 0.1717548966407776, + "learning_rate": 1.8245527873101647e-05, + "loss": 1.7168, + "step": 23702 + }, + { + "epoch": 7.275322283609577, + "grad_norm": 0.16322405636310577, + "learning_rate": 1.8241688577687426e-05, + "loss": 1.7392, + "step": 23703 + }, + { + "epoch": 7.275629220380601, + "grad_norm": 0.19775766134262085, + "learning_rate": 1.8237849596127447e-05, + "loss": 1.7055, + "step": 23704 + }, + { + "epoch": 7.275936157151627, + "grad_norm": 0.1969427913427353, + "learning_rate": 1.823401092845966e-05, + "loss": 1.7418, + "step": 23705 + 
}, + { + "epoch": 7.276243093922652, + "grad_norm": 0.1791812628507614, + "learning_rate": 1.8230172574721992e-05, + "loss": 1.6512, + "step": 23706 + }, + { + "epoch": 7.276550030693677, + "grad_norm": 0.18583156168460846, + "learning_rate": 1.8226334534952384e-05, + "loss": 1.7357, + "step": 23707 + }, + { + "epoch": 7.276856967464703, + "grad_norm": 0.20729652047157288, + "learning_rate": 1.822249680918876e-05, + "loss": 1.7323, + "step": 23708 + }, + { + "epoch": 7.277163904235728, + "grad_norm": 0.20089028775691986, + "learning_rate": 1.8218659397469045e-05, + "loss": 1.6835, + "step": 23709 + }, + { + "epoch": 7.277470841006752, + "grad_norm": 0.16569854319095612, + "learning_rate": 1.8214822299831168e-05, + "loss": 1.7486, + "step": 23710 + }, + { + "epoch": 7.277777777777778, + "grad_norm": 0.19979944825172424, + "learning_rate": 1.8210985516313044e-05, + "loss": 1.7338, + "step": 23711 + }, + { + "epoch": 7.278084714548803, + "grad_norm": 0.23528912663459778, + "learning_rate": 1.82071490469526e-05, + "loss": 1.8086, + "step": 23712 + }, + { + "epoch": 7.278391651319828, + "grad_norm": 0.18231599032878876, + "learning_rate": 1.8203312891787737e-05, + "loss": 1.744, + "step": 23713 + }, + { + "epoch": 7.278698588090854, + "grad_norm": 0.2208651602268219, + "learning_rate": 1.8199477050856374e-05, + "loss": 1.7592, + "step": 23714 + }, + { + "epoch": 7.279005524861878, + "grad_norm": 0.22329792380332947, + "learning_rate": 1.8195641524196417e-05, + "loss": 1.7242, + "step": 23715 + }, + { + "epoch": 7.2793124616329035, + "grad_norm": 0.17745757102966309, + "learning_rate": 1.8191806311845778e-05, + "loss": 1.7162, + "step": 23716 + }, + { + "epoch": 7.279619398403929, + "grad_norm": 0.19536735117435455, + "learning_rate": 1.8187971413842324e-05, + "loss": 1.6814, + "step": 23717 + }, + { + "epoch": 7.279926335174954, + "grad_norm": 0.21853455901145935, + "learning_rate": 1.8184136830224025e-05, + "loss": 1.7049, + "step": 23718 + }, + { + "epoch": 
7.2802332719459795, + "grad_norm": 0.1701575070619583, + "learning_rate": 1.8180302561028696e-05, + "loss": 1.6879, + "step": 23719 + }, + { + "epoch": 7.280540208717004, + "grad_norm": 0.18729525804519653, + "learning_rate": 1.8176468606294288e-05, + "loss": 1.6944, + "step": 23720 + }, + { + "epoch": 7.280847145488029, + "grad_norm": 0.20020832121372223, + "learning_rate": 1.8172634966058667e-05, + "loss": 1.7415, + "step": 23721 + }, + { + "epoch": 7.281154082259055, + "grad_norm": 0.1983461081981659, + "learning_rate": 1.8168801640359724e-05, + "loss": 1.7198, + "step": 23722 + }, + { + "epoch": 7.28146101903008, + "grad_norm": 0.17578791081905365, + "learning_rate": 1.8164968629235334e-05, + "loss": 1.7155, + "step": 23723 + }, + { + "epoch": 7.281767955801105, + "grad_norm": 0.1944401115179062, + "learning_rate": 1.8161135932723388e-05, + "loss": 1.7579, + "step": 23724 + }, + { + "epoch": 7.28207489257213, + "grad_norm": 0.20413067936897278, + "learning_rate": 1.8157303550861753e-05, + "loss": 1.7105, + "step": 23725 + }, + { + "epoch": 7.282381829343155, + "grad_norm": 0.17515964806079865, + "learning_rate": 1.8153471483688318e-05, + "loss": 1.7448, + "step": 23726 + }, + { + "epoch": 7.28268876611418, + "grad_norm": 0.2039034515619278, + "learning_rate": 1.8149639731240938e-05, + "loss": 1.691, + "step": 23727 + }, + { + "epoch": 7.282995702885206, + "grad_norm": 0.2136354148387909, + "learning_rate": 1.8145808293557483e-05, + "loss": 1.656, + "step": 23728 + }, + { + "epoch": 7.283302639656231, + "grad_norm": 0.23029537498950958, + "learning_rate": 1.814197717067582e-05, + "loss": 1.7588, + "step": 23729 + }, + { + "epoch": 7.283609576427256, + "grad_norm": 0.371910035610199, + "learning_rate": 1.8138146362633816e-05, + "loss": 1.8138, + "step": 23730 + }, + { + "epoch": 7.283916513198281, + "grad_norm": 0.2273472249507904, + "learning_rate": 1.8134315869469327e-05, + "loss": 1.6985, + "step": 23731 + }, + { + "epoch": 7.284223449969306, + "grad_norm": 
0.33206698298454285, + "learning_rate": 1.81304856912202e-05, + "loss": 1.7015, + "step": 23732 + }, + { + "epoch": 7.2845303867403315, + "grad_norm": 0.20799405872821808, + "learning_rate": 1.8126655827924295e-05, + "loss": 1.6932, + "step": 23733 + }, + { + "epoch": 7.284837323511357, + "grad_norm": 0.28721246123313904, + "learning_rate": 1.8122826279619437e-05, + "loss": 1.7726, + "step": 23734 + }, + { + "epoch": 7.285144260282382, + "grad_norm": 0.2365201711654663, + "learning_rate": 1.8118997046343533e-05, + "loss": 1.7609, + "step": 23735 + }, + { + "epoch": 7.285451197053407, + "grad_norm": 0.24772630631923676, + "learning_rate": 1.811516812813435e-05, + "loss": 1.7057, + "step": 23736 + }, + { + "epoch": 7.285758133824432, + "grad_norm": 0.19344007968902588, + "learning_rate": 1.8111339525029802e-05, + "loss": 1.7526, + "step": 23737 + }, + { + "epoch": 7.286065070595457, + "grad_norm": 0.2454877346754074, + "learning_rate": 1.8107511237067648e-05, + "loss": 1.6474, + "step": 23738 + }, + { + "epoch": 7.286372007366483, + "grad_norm": 0.18084865808486938, + "learning_rate": 1.810368326428578e-05, + "loss": 1.7381, + "step": 23739 + }, + { + "epoch": 7.286678944137508, + "grad_norm": 0.26264744997024536, + "learning_rate": 1.8099855606722012e-05, + "loss": 1.6585, + "step": 23740 + }, + { + "epoch": 7.286985880908533, + "grad_norm": 0.20219333469867706, + "learning_rate": 1.809602826441416e-05, + "loss": 1.7552, + "step": 23741 + }, + { + "epoch": 7.287292817679558, + "grad_norm": 0.23982326686382294, + "learning_rate": 1.8092201237400064e-05, + "loss": 1.6784, + "step": 23742 + }, + { + "epoch": 7.287599754450583, + "grad_norm": 0.22838538885116577, + "learning_rate": 1.8088374525717534e-05, + "loss": 1.6976, + "step": 23743 + }, + { + "epoch": 7.287906691221608, + "grad_norm": 0.22077307105064392, + "learning_rate": 1.8084548129404395e-05, + "loss": 1.721, + "step": 23744 + }, + { + "epoch": 7.288213627992634, + "grad_norm": 0.19811047613620758, + 
"learning_rate": 1.8080722048498448e-05, + "loss": 1.7317, + "step": 23745 + }, + { + "epoch": 7.288520564763659, + "grad_norm": 0.25160667300224304, + "learning_rate": 1.8076896283037525e-05, + "loss": 1.7725, + "step": 23746 + }, + { + "epoch": 7.2888275015346835, + "grad_norm": 0.19819392263889313, + "learning_rate": 1.807307083305942e-05, + "loss": 1.7243, + "step": 23747 + }, + { + "epoch": 7.289134438305709, + "grad_norm": 0.21769097447395325, + "learning_rate": 1.806924569860194e-05, + "loss": 1.74, + "step": 23748 + }, + { + "epoch": 7.289441375076734, + "grad_norm": 0.23126530647277832, + "learning_rate": 1.806542087970289e-05, + "loss": 1.7479, + "step": 23749 + }, + { + "epoch": 7.2897483118477595, + "grad_norm": 0.21002748608589172, + "learning_rate": 1.8061596376400065e-05, + "loss": 1.6547, + "step": 23750 + }, + { + "epoch": 7.290055248618785, + "grad_norm": 0.242569699883461, + "learning_rate": 1.8057772188731255e-05, + "loss": 1.7587, + "step": 23751 + }, + { + "epoch": 7.290362185389809, + "grad_norm": 0.19619157910346985, + "learning_rate": 1.8053948316734287e-05, + "loss": 1.6619, + "step": 23752 + }, + { + "epoch": 7.290669122160835, + "grad_norm": 0.2086232304573059, + "learning_rate": 1.8050124760446896e-05, + "loss": 1.6535, + "step": 23753 + }, + { + "epoch": 7.29097605893186, + "grad_norm": 0.1955464631319046, + "learning_rate": 1.8046301519906932e-05, + "loss": 1.6814, + "step": 23754 + }, + { + "epoch": 7.291282995702885, + "grad_norm": 0.20373155176639557, + "learning_rate": 1.8042478595152117e-05, + "loss": 1.7006, + "step": 23755 + }, + { + "epoch": 7.291589932473911, + "grad_norm": 0.20233015716075897, + "learning_rate": 1.8038655986220272e-05, + "loss": 1.7478, + "step": 23756 + }, + { + "epoch": 7.291896869244935, + "grad_norm": 0.18800894916057587, + "learning_rate": 1.803483369314916e-05, + "loss": 1.747, + "step": 23757 + }, + { + "epoch": 7.29220380601596, + "grad_norm": 0.1838926076889038, + "learning_rate": 
1.8031011715976558e-05, + "loss": 1.7086, + "step": 23758 + }, + { + "epoch": 7.292510742786986, + "grad_norm": 0.1806635707616806, + "learning_rate": 1.8027190054740234e-05, + "loss": 1.6682, + "step": 23759 + }, + { + "epoch": 7.292817679558011, + "grad_norm": 0.19762687385082245, + "learning_rate": 1.802336870947796e-05, + "loss": 1.7514, + "step": 23760 + }, + { + "epoch": 7.293124616329036, + "grad_norm": 0.1739082932472229, + "learning_rate": 1.80195476802275e-05, + "loss": 1.7031, + "step": 23761 + }, + { + "epoch": 7.293431553100062, + "grad_norm": 0.18887469172477722, + "learning_rate": 1.8015726967026615e-05, + "loss": 1.7199, + "step": 23762 + }, + { + "epoch": 7.293738489871086, + "grad_norm": 0.17344269156455994, + "learning_rate": 1.8011906569913056e-05, + "loss": 1.693, + "step": 23763 + }, + { + "epoch": 7.2940454266421115, + "grad_norm": 0.16480129957199097, + "learning_rate": 1.800808648892459e-05, + "loss": 1.722, + "step": 23764 + }, + { + "epoch": 7.294352363413137, + "grad_norm": 0.17336638271808624, + "learning_rate": 1.8004266724098963e-05, + "loss": 1.6635, + "step": 23765 + }, + { + "epoch": 7.294659300184162, + "grad_norm": 0.16539151966571808, + "learning_rate": 1.8000447275473925e-05, + "loss": 1.7709, + "step": 23766 + }, + { + "epoch": 7.2949662369551875, + "grad_norm": 0.20660065114498138, + "learning_rate": 1.7996628143087226e-05, + "loss": 1.7262, + "step": 23767 + }, + { + "epoch": 7.295273173726212, + "grad_norm": 0.2292039543390274, + "learning_rate": 1.7992809326976584e-05, + "loss": 1.7444, + "step": 23768 + }, + { + "epoch": 7.295580110497237, + "grad_norm": 0.20323103666305542, + "learning_rate": 1.7988990827179795e-05, + "loss": 1.7456, + "step": 23769 + }, + { + "epoch": 7.295887047268263, + "grad_norm": 0.16919885575771332, + "learning_rate": 1.7985172643734532e-05, + "loss": 1.7304, + "step": 23770 + }, + { + "epoch": 7.296193984039288, + "grad_norm": 0.19135236740112305, + "learning_rate": 1.798135477667859e-05, + 
"loss": 1.7067, + "step": 23771 + }, + { + "epoch": 7.296500920810313, + "grad_norm": 0.19812993705272675, + "learning_rate": 1.7977537226049627e-05, + "loss": 1.7701, + "step": 23772 + }, + { + "epoch": 7.296807857581339, + "grad_norm": 0.22823916375637054, + "learning_rate": 1.797371999188543e-05, + "loss": 1.737, + "step": 23773 + }, + { + "epoch": 7.297114794352363, + "grad_norm": 0.1862197369337082, + "learning_rate": 1.7969903074223705e-05, + "loss": 1.675, + "step": 23774 + }, + { + "epoch": 7.297421731123388, + "grad_norm": 0.18780425190925598, + "learning_rate": 1.7966086473102168e-05, + "loss": 1.7237, + "step": 23775 + }, + { + "epoch": 7.297728667894414, + "grad_norm": 0.174093559384346, + "learning_rate": 1.7962270188558543e-05, + "loss": 1.7129, + "step": 23776 + }, + { + "epoch": 7.298035604665439, + "grad_norm": 0.22659943997859955, + "learning_rate": 1.7958454220630543e-05, + "loss": 1.7257, + "step": 23777 + }, + { + "epoch": 7.298342541436464, + "grad_norm": 0.18077917397022247, + "learning_rate": 1.7954638569355875e-05, + "loss": 1.6972, + "step": 23778 + }, + { + "epoch": 7.298649478207489, + "grad_norm": 0.18380658328533173, + "learning_rate": 1.795082323477225e-05, + "loss": 1.6577, + "step": 23779 + }, + { + "epoch": 7.298956414978514, + "grad_norm": 0.17016704380512238, + "learning_rate": 1.7947008216917384e-05, + "loss": 1.7222, + "step": 23780 + }, + { + "epoch": 7.2992633517495396, + "grad_norm": 0.2016153484582901, + "learning_rate": 1.794319351582896e-05, + "loss": 1.6833, + "step": 23781 + }, + { + "epoch": 7.299570288520565, + "grad_norm": 0.26723918318748474, + "learning_rate": 1.7939379131544687e-05, + "loss": 1.7417, + "step": 23782 + }, + { + "epoch": 7.29987722529159, + "grad_norm": 0.2555576264858246, + "learning_rate": 1.7935565064102267e-05, + "loss": 1.7373, + "step": 23783 + }, + { + "epoch": 7.300184162062616, + "grad_norm": 0.2036418914794922, + "learning_rate": 1.793175131353938e-05, + "loss": 1.7052, + "step": 23784 + 
}, + { + "epoch": 7.30049109883364, + "grad_norm": 0.1789570152759552, + "learning_rate": 1.792793787989371e-05, + "loss": 1.6327, + "step": 23785 + }, + { + "epoch": 7.300798035604665, + "grad_norm": 0.2353249490261078, + "learning_rate": 1.7924124763202987e-05, + "loss": 1.7771, + "step": 23786 + }, + { + "epoch": 7.301104972375691, + "grad_norm": 0.19072949886322021, + "learning_rate": 1.792031196350483e-05, + "loss": 1.7095, + "step": 23787 + }, + { + "epoch": 7.301411909146716, + "grad_norm": 0.24063248932361603, + "learning_rate": 1.791649948083699e-05, + "loss": 1.7247, + "step": 23788 + }, + { + "epoch": 7.301718845917741, + "grad_norm": 0.1916036456823349, + "learning_rate": 1.791268731523707e-05, + "loss": 1.6844, + "step": 23789 + }, + { + "epoch": 7.302025782688766, + "grad_norm": 0.2606290876865387, + "learning_rate": 1.7908875466742797e-05, + "loss": 1.771, + "step": 23790 + }, + { + "epoch": 7.302332719459791, + "grad_norm": 0.23444804549217224, + "learning_rate": 1.7905063935391824e-05, + "loss": 1.747, + "step": 23791 + }, + { + "epoch": 7.3026396562308165, + "grad_norm": 0.28058725595474243, + "learning_rate": 1.7901252721221822e-05, + "loss": 1.7284, + "step": 23792 + }, + { + "epoch": 7.302946593001842, + "grad_norm": 0.23268578946590424, + "learning_rate": 1.7897441824270456e-05, + "loss": 1.7222, + "step": 23793 + }, + { + "epoch": 7.303253529772867, + "grad_norm": 0.275336354970932, + "learning_rate": 1.789363124457539e-05, + "loss": 1.7495, + "step": 23794 + }, + { + "epoch": 7.303560466543892, + "grad_norm": 0.21838977932929993, + "learning_rate": 1.788982098217427e-05, + "loss": 1.725, + "step": 23795 + }, + { + "epoch": 7.303867403314917, + "grad_norm": 0.24108058214187622, + "learning_rate": 1.7886011037104767e-05, + "loss": 1.7804, + "step": 23796 + }, + { + "epoch": 7.304174340085942, + "grad_norm": 0.23003144562244415, + "learning_rate": 1.788220140940452e-05, + "loss": 1.8189, + "step": 23797 + }, + { + "epoch": 7.304481276856968, + 
"grad_norm": 0.20129653811454773, + "learning_rate": 1.7878392099111186e-05, + "loss": 1.6603, + "step": 23798 + }, + { + "epoch": 7.304788213627993, + "grad_norm": 0.26172930002212524, + "learning_rate": 1.7874583106262404e-05, + "loss": 1.7095, + "step": 23799 + }, + { + "epoch": 7.305095150399017, + "grad_norm": 0.212156742811203, + "learning_rate": 1.7870774430895825e-05, + "loss": 1.7272, + "step": 23800 + }, + { + "epoch": 7.305402087170043, + "grad_norm": 0.2775247097015381, + "learning_rate": 1.7866966073049084e-05, + "loss": 1.773, + "step": 23801 + }, + { + "epoch": 7.305709023941068, + "grad_norm": 0.23456308245658875, + "learning_rate": 1.7863158032759803e-05, + "loss": 1.7173, + "step": 23802 + }, + { + "epoch": 7.306015960712093, + "grad_norm": 0.23986588418483734, + "learning_rate": 1.785935031006566e-05, + "loss": 1.6924, + "step": 23803 + }, + { + "epoch": 7.306322897483119, + "grad_norm": 0.1909915804862976, + "learning_rate": 1.7855542905004225e-05, + "loss": 1.7047, + "step": 23804 + }, + { + "epoch": 7.306629834254144, + "grad_norm": 0.20676325261592865, + "learning_rate": 1.7851735817613192e-05, + "loss": 1.6606, + "step": 23805 + }, + { + "epoch": 7.3069367710251685, + "grad_norm": 0.1910121887922287, + "learning_rate": 1.7847929047930106e-05, + "loss": 1.7555, + "step": 23806 + }, + { + "epoch": 7.307243707796194, + "grad_norm": 0.22737936675548553, + "learning_rate": 1.784412259599265e-05, + "loss": 1.7346, + "step": 23807 + }, + { + "epoch": 7.307550644567219, + "grad_norm": 0.1553424894809723, + "learning_rate": 1.7840316461838426e-05, + "loss": 1.6755, + "step": 23808 + }, + { + "epoch": 7.3078575813382445, + "grad_norm": 0.17937089502811432, + "learning_rate": 1.7836510645505044e-05, + "loss": 1.684, + "step": 23809 + }, + { + "epoch": 7.30816451810927, + "grad_norm": 0.20183639228343964, + "learning_rate": 1.783270514703011e-05, + "loss": 1.7617, + "step": 23810 + }, + { + "epoch": 7.308471454880294, + "grad_norm": 0.21359068155288696, 
+ "learning_rate": 1.782889996645124e-05, + "loss": 1.6897, + "step": 23811 + }, + { + "epoch": 7.30877839165132, + "grad_norm": 0.19640007615089417, + "learning_rate": 1.782509510380604e-05, + "loss": 1.7029, + "step": 23812 + }, + { + "epoch": 7.309085328422345, + "grad_norm": 0.22678261995315552, + "learning_rate": 1.7821290559132104e-05, + "loss": 1.7241, + "step": 23813 + }, + { + "epoch": 7.30939226519337, + "grad_norm": 0.1797642707824707, + "learning_rate": 1.7817486332467037e-05, + "loss": 1.7127, + "step": 23814 + }, + { + "epoch": 7.309699201964396, + "grad_norm": 0.18758134543895721, + "learning_rate": 1.7813682423848432e-05, + "loss": 1.7394, + "step": 23815 + }, + { + "epoch": 7.310006138735421, + "grad_norm": 0.2064354121685028, + "learning_rate": 1.7809878833313887e-05, + "loss": 1.7477, + "step": 23816 + }, + { + "epoch": 7.310313075506445, + "grad_norm": 0.30564701557159424, + "learning_rate": 1.780607556090098e-05, + "loss": 1.7006, + "step": 23817 + }, + { + "epoch": 7.310620012277471, + "grad_norm": 0.23694200813770294, + "learning_rate": 1.7802272606647308e-05, + "loss": 1.7821, + "step": 23818 + }, + { + "epoch": 7.310926949048496, + "grad_norm": 0.20436422526836395, + "learning_rate": 1.779846997059043e-05, + "loss": 1.6681, + "step": 23819 + }, + { + "epoch": 7.311233885819521, + "grad_norm": 0.21899428963661194, + "learning_rate": 1.779466765276798e-05, + "loss": 1.7416, + "step": 23820 + }, + { + "epoch": 7.311540822590547, + "grad_norm": 0.24186378717422485, + "learning_rate": 1.779086565321747e-05, + "loss": 1.7258, + "step": 23821 + }, + { + "epoch": 7.311847759361571, + "grad_norm": 0.22940407693386078, + "learning_rate": 1.778706397197653e-05, + "loss": 1.7211, + "step": 23822 + }, + { + "epoch": 7.3121546961325965, + "grad_norm": 0.18643233180046082, + "learning_rate": 1.778326260908268e-05, + "loss": 1.6778, + "step": 23823 + }, + { + "epoch": 7.312461632903622, + "grad_norm": 0.25372037291526794, + "learning_rate": 
1.7779461564573526e-05, + "loss": 1.7252, + "step": 23824 + }, + { + "epoch": 7.312768569674647, + "grad_norm": 0.21126380562782288, + "learning_rate": 1.7775660838486612e-05, + "loss": 1.6655, + "step": 23825 + }, + { + "epoch": 7.3130755064456725, + "grad_norm": 0.19614748656749725, + "learning_rate": 1.777186043085951e-05, + "loss": 1.7223, + "step": 23826 + }, + { + "epoch": 7.313382443216697, + "grad_norm": 0.2111951857805252, + "learning_rate": 1.7768060341729768e-05, + "loss": 1.708, + "step": 23827 + }, + { + "epoch": 7.313689379987722, + "grad_norm": 0.2675856053829193, + "learning_rate": 1.7764260571134956e-05, + "loss": 1.7387, + "step": 23828 + }, + { + "epoch": 7.313996316758748, + "grad_norm": 0.19827900826931, + "learning_rate": 1.7760461119112603e-05, + "loss": 1.6809, + "step": 23829 + }, + { + "epoch": 7.314303253529773, + "grad_norm": 0.24213160574436188, + "learning_rate": 1.775666198570028e-05, + "loss": 1.7064, + "step": 23830 + }, + { + "epoch": 7.314610190300798, + "grad_norm": 0.20035916566848755, + "learning_rate": 1.7752863170935514e-05, + "loss": 1.6874, + "step": 23831 + }, + { + "epoch": 7.314917127071823, + "grad_norm": 0.23662878572940826, + "learning_rate": 1.774906467485586e-05, + "loss": 1.7651, + "step": 23832 + }, + { + "epoch": 7.315224063842848, + "grad_norm": 0.18523871898651123, + "learning_rate": 1.7745266497498847e-05, + "loss": 1.7003, + "step": 23833 + }, + { + "epoch": 7.315531000613873, + "grad_norm": 0.21452756226062775, + "learning_rate": 1.7741468638902016e-05, + "loss": 1.7012, + "step": 23834 + }, + { + "epoch": 7.315837937384899, + "grad_norm": 0.17513468861579895, + "learning_rate": 1.7737671099102904e-05, + "loss": 1.6965, + "step": 23835 + }, + { + "epoch": 7.316144874155924, + "grad_norm": 0.29025998711586, + "learning_rate": 1.7733873878139012e-05, + "loss": 1.7347, + "step": 23836 + }, + { + "epoch": 7.316451810926949, + "grad_norm": 0.14812500774860382, + "learning_rate": 1.7730076976047926e-05, + "loss": 
1.6469, + "step": 23837 + }, + { + "epoch": 7.316758747697974, + "grad_norm": 0.23575027287006378, + "learning_rate": 1.77262803928671e-05, + "loss": 1.7267, + "step": 23838 + }, + { + "epoch": 7.317065684468999, + "grad_norm": 0.17986448109149933, + "learning_rate": 1.7722484128634125e-05, + "loss": 1.7206, + "step": 23839 + }, + { + "epoch": 7.3173726212400245, + "grad_norm": 0.22515927255153656, + "learning_rate": 1.7718688183386446e-05, + "loss": 1.7216, + "step": 23840 + }, + { + "epoch": 7.31767955801105, + "grad_norm": 0.1903398036956787, + "learning_rate": 1.7714892557161624e-05, + "loss": 1.7108, + "step": 23841 + }, + { + "epoch": 7.317986494782075, + "grad_norm": 0.23623183369636536, + "learning_rate": 1.7711097249997162e-05, + "loss": 1.6866, + "step": 23842 + }, + { + "epoch": 7.3182934315531, + "grad_norm": 0.18501855432987213, + "learning_rate": 1.7707302261930554e-05, + "loss": 1.6643, + "step": 23843 + }, + { + "epoch": 7.318600368324125, + "grad_norm": 0.21865275502204895, + "learning_rate": 1.770350759299932e-05, + "loss": 1.6932, + "step": 23844 + }, + { + "epoch": 7.31890730509515, + "grad_norm": 0.22363261878490448, + "learning_rate": 1.7699713243240945e-05, + "loss": 1.721, + "step": 23845 + }, + { + "epoch": 7.319214241866176, + "grad_norm": 0.25587835907936096, + "learning_rate": 1.769591921269294e-05, + "loss": 1.7375, + "step": 23846 + }, + { + "epoch": 7.319521178637201, + "grad_norm": 0.22086483240127563, + "learning_rate": 1.76921255013928e-05, + "loss": 1.6957, + "step": 23847 + }, + { + "epoch": 7.319828115408226, + "grad_norm": 0.21197499334812164, + "learning_rate": 1.7688332109378007e-05, + "loss": 1.6993, + "step": 23848 + }, + { + "epoch": 7.320135052179251, + "grad_norm": 0.21211451292037964, + "learning_rate": 1.7684539036686054e-05, + "loss": 1.7329, + "step": 23849 + }, + { + "epoch": 7.320441988950276, + "grad_norm": 0.16938872635364532, + "learning_rate": 1.7680746283354433e-05, + "loss": 1.6895, + "step": 23850 + }, + { + 
"epoch": 7.320748925721301, + "grad_norm": 0.21465681493282318, + "learning_rate": 1.7676953849420613e-05, + "loss": 1.7156, + "step": 23851 + }, + { + "epoch": 7.321055862492327, + "grad_norm": 0.16188180446624756, + "learning_rate": 1.7673161734922084e-05, + "loss": 1.6307, + "step": 23852 + }, + { + "epoch": 7.321362799263352, + "grad_norm": 0.2152155190706253, + "learning_rate": 1.7669369939896302e-05, + "loss": 1.7135, + "step": 23853 + }, + { + "epoch": 7.3216697360343765, + "grad_norm": 0.15789814293384552, + "learning_rate": 1.7665578464380788e-05, + "loss": 1.7269, + "step": 23854 + }, + { + "epoch": 7.321976672805402, + "grad_norm": 0.17263127863407135, + "learning_rate": 1.7661787308412948e-05, + "loss": 1.6624, + "step": 23855 + }, + { + "epoch": 7.322283609576427, + "grad_norm": 0.19711650907993317, + "learning_rate": 1.7657996472030308e-05, + "loss": 1.7837, + "step": 23856 + }, + { + "epoch": 7.3225905463474525, + "grad_norm": 0.1847725212574005, + "learning_rate": 1.765420595527027e-05, + "loss": 1.707, + "step": 23857 + }, + { + "epoch": 7.322897483118478, + "grad_norm": 0.21316368877887726, + "learning_rate": 1.7650415758170345e-05, + "loss": 1.715, + "step": 23858 + }, + { + "epoch": 7.323204419889503, + "grad_norm": 0.1912030428647995, + "learning_rate": 1.7646625880767976e-05, + "loss": 1.7465, + "step": 23859 + }, + { + "epoch": 7.323511356660528, + "grad_norm": 0.16245616972446442, + "learning_rate": 1.7642836323100614e-05, + "loss": 1.7365, + "step": 23860 + }, + { + "epoch": 7.323818293431553, + "grad_norm": 0.20665429532527924, + "learning_rate": 1.76390470852057e-05, + "loss": 1.7435, + "step": 23861 + }, + { + "epoch": 7.324125230202578, + "grad_norm": 0.17079970240592957, + "learning_rate": 1.76352581671207e-05, + "loss": 1.7094, + "step": 23862 + }, + { + "epoch": 7.324432166973604, + "grad_norm": 0.17388395965099335, + "learning_rate": 1.7631469568883042e-05, + "loss": 1.7275, + "step": 23863 + }, + { + "epoch": 7.324739103744629, + 
"grad_norm": 0.20209765434265137, + "learning_rate": 1.7627681290530175e-05, + "loss": 1.7755, + "step": 23864 + }, + { + "epoch": 7.3250460405156534, + "grad_norm": 0.16459977626800537, + "learning_rate": 1.7623893332099538e-05, + "loss": 1.6765, + "step": 23865 + }, + { + "epoch": 7.325352977286679, + "grad_norm": 0.18313255906105042, + "learning_rate": 1.7620105693628556e-05, + "loss": 1.6792, + "step": 23866 + }, + { + "epoch": 7.325659914057704, + "grad_norm": 0.1651672124862671, + "learning_rate": 1.761631837515468e-05, + "loss": 1.6999, + "step": 23867 + }, + { + "epoch": 7.3259668508287294, + "grad_norm": 0.17414255440235138, + "learning_rate": 1.7612531376715317e-05, + "loss": 1.69, + "step": 23868 + }, + { + "epoch": 7.326273787599755, + "grad_norm": 0.1824718415737152, + "learning_rate": 1.7608744698347908e-05, + "loss": 1.6822, + "step": 23869 + }, + { + "epoch": 7.326580724370779, + "grad_norm": 0.19557121396064758, + "learning_rate": 1.760495834008986e-05, + "loss": 1.6852, + "step": 23870 + }, + { + "epoch": 7.326887661141805, + "grad_norm": 0.17803436517715454, + "learning_rate": 1.7601172301978606e-05, + "loss": 1.7523, + "step": 23871 + }, + { + "epoch": 7.32719459791283, + "grad_norm": 0.24077050387859344, + "learning_rate": 1.7597386584051545e-05, + "loss": 1.8044, + "step": 23872 + }, + { + "epoch": 7.327501534683855, + "grad_norm": 0.20061948895454407, + "learning_rate": 1.7593601186346127e-05, + "loss": 1.7298, + "step": 23873 + }, + { + "epoch": 7.327808471454881, + "grad_norm": 0.17362944781780243, + "learning_rate": 1.758981610889971e-05, + "loss": 1.7116, + "step": 23874 + }, + { + "epoch": 7.328115408225905, + "grad_norm": 0.20858663320541382, + "learning_rate": 1.758603135174974e-05, + "loss": 1.6765, + "step": 23875 + }, + { + "epoch": 7.32842234499693, + "grad_norm": 0.1805036962032318, + "learning_rate": 1.7582246914933604e-05, + "loss": 1.694, + "step": 23876 + }, + { + "epoch": 7.328729281767956, + "grad_norm": 0.26010429859161377, 
+ "learning_rate": 1.7578462798488704e-05, + "loss": 1.7373, + "step": 23877 + }, + { + "epoch": 7.329036218538981, + "grad_norm": 0.19902443885803223, + "learning_rate": 1.7574679002452444e-05, + "loss": 1.72, + "step": 23878 + }, + { + "epoch": 7.329343155310006, + "grad_norm": 0.21231114864349365, + "learning_rate": 1.7570895526862202e-05, + "loss": 1.7526, + "step": 23879 + }, + { + "epoch": 7.329650092081032, + "grad_norm": 0.2075740098953247, + "learning_rate": 1.7567112371755384e-05, + "loss": 1.773, + "step": 23880 + }, + { + "epoch": 7.329957028852056, + "grad_norm": 0.21381771564483643, + "learning_rate": 1.756332953716937e-05, + "loss": 1.733, + "step": 23881 + }, + { + "epoch": 7.3302639656230815, + "grad_norm": 0.21689461171627045, + "learning_rate": 1.755954702314155e-05, + "loss": 1.7234, + "step": 23882 + }, + { + "epoch": 7.330570902394107, + "grad_norm": 0.21094383299350739, + "learning_rate": 1.755576482970929e-05, + "loss": 1.7074, + "step": 23883 + }, + { + "epoch": 7.330877839165132, + "grad_norm": 0.18460774421691895, + "learning_rate": 1.7551982956909985e-05, + "loss": 1.6706, + "step": 23884 + }, + { + "epoch": 7.3311847759361575, + "grad_norm": 0.18868015706539154, + "learning_rate": 1.7548201404781e-05, + "loss": 1.6371, + "step": 23885 + }, + { + "epoch": 7.331491712707182, + "grad_norm": 0.18036094307899475, + "learning_rate": 1.7544420173359715e-05, + "loss": 1.7115, + "step": 23886 + }, + { + "epoch": 7.331798649478207, + "grad_norm": 0.17143553495407104, + "learning_rate": 1.754063926268349e-05, + "loss": 1.668, + "step": 23887 + }, + { + "epoch": 7.332105586249233, + "grad_norm": 0.1700706034898758, + "learning_rate": 1.7536858672789684e-05, + "loss": 1.7244, + "step": 23888 + }, + { + "epoch": 7.332412523020258, + "grad_norm": 0.1740385890007019, + "learning_rate": 1.7533078403715665e-05, + "loss": 1.7163, + "step": 23889 + }, + { + "epoch": 7.332719459791283, + "grad_norm": 0.206922248005867, + "learning_rate": 
1.752929845549882e-05, + "loss": 1.7572, + "step": 23890 + }, + { + "epoch": 7.333026396562309, + "grad_norm": 0.22770223021507263, + "learning_rate": 1.7525518828176445e-05, + "loss": 1.7391, + "step": 23891 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.203486829996109, + "learning_rate": 1.7521739521785962e-05, + "loss": 1.7664, + "step": 23892 + }, + { + "epoch": 7.333640270104358, + "grad_norm": 0.15539827942848206, + "learning_rate": 1.7517960536364652e-05, + "loss": 1.675, + "step": 23893 + }, + { + "epoch": 7.333947206875384, + "grad_norm": 0.18226636946201324, + "learning_rate": 1.7514181871949913e-05, + "loss": 1.7097, + "step": 23894 + }, + { + "epoch": 7.334254143646409, + "grad_norm": 0.1522573083639145, + "learning_rate": 1.751040352857907e-05, + "loss": 1.6783, + "step": 23895 + }, + { + "epoch": 7.334561080417434, + "grad_norm": 0.18082024157047272, + "learning_rate": 1.750662550628946e-05, + "loss": 1.752, + "step": 23896 + }, + { + "epoch": 7.334868017188459, + "grad_norm": 0.1968161165714264, + "learning_rate": 1.750284780511844e-05, + "loss": 1.7773, + "step": 23897 + }, + { + "epoch": 7.335174953959484, + "grad_norm": 0.17520470917224884, + "learning_rate": 1.7499070425103286e-05, + "loss": 1.7244, + "step": 23898 + }, + { + "epoch": 7.3354818907305095, + "grad_norm": 0.32224342226982117, + "learning_rate": 1.749529336628139e-05, + "loss": 1.8087, + "step": 23899 + }, + { + "epoch": 7.335788827501535, + "grad_norm": 0.25473707914352417, + "learning_rate": 1.7491516628690053e-05, + "loss": 1.7677, + "step": 23900 + }, + { + "epoch": 7.33609576427256, + "grad_norm": 0.20730654895305634, + "learning_rate": 1.7487740212366604e-05, + "loss": 1.7261, + "step": 23901 + }, + { + "epoch": 7.336402701043585, + "grad_norm": 0.22070205211639404, + "learning_rate": 1.748396411734836e-05, + "loss": 1.8024, + "step": 23902 + }, + { + "epoch": 7.33670963781461, + "grad_norm": 0.16921460628509521, + "learning_rate": 1.7480188343672647e-05, + "loss": 
1.6823, + "step": 23903 + }, + { + "epoch": 7.337016574585635, + "grad_norm": 0.16576658189296722, + "learning_rate": 1.747641289137677e-05, + "loss": 1.6563, + "step": 23904 + }, + { + "epoch": 7.337323511356661, + "grad_norm": 0.19541388750076294, + "learning_rate": 1.7472637760498046e-05, + "loss": 1.8023, + "step": 23905 + }, + { + "epoch": 7.337630448127686, + "grad_norm": 0.19848179817199707, + "learning_rate": 1.7468862951073754e-05, + "loss": 1.7395, + "step": 23906 + }, + { + "epoch": 7.337937384898711, + "grad_norm": 0.1627921313047409, + "learning_rate": 1.746508846314127e-05, + "loss": 1.6569, + "step": 23907 + }, + { + "epoch": 7.338244321669736, + "grad_norm": 0.1798046976327896, + "learning_rate": 1.7461314296737813e-05, + "loss": 1.6927, + "step": 23908 + }, + { + "epoch": 7.338551258440761, + "grad_norm": 0.17935742437839508, + "learning_rate": 1.7457540451900757e-05, + "loss": 1.701, + "step": 23909 + }, + { + "epoch": 7.338858195211786, + "grad_norm": 0.16761814057826996, + "learning_rate": 1.745376692866732e-05, + "loss": 1.6701, + "step": 23910 + }, + { + "epoch": 7.339165131982812, + "grad_norm": 0.1733570694923401, + "learning_rate": 1.7449993727074855e-05, + "loss": 1.705, + "step": 23911 + }, + { + "epoch": 7.339472068753837, + "grad_norm": 0.21162372827529907, + "learning_rate": 1.7446220847160626e-05, + "loss": 1.7703, + "step": 23912 + }, + { + "epoch": 7.3397790055248615, + "grad_norm": 0.18743988871574402, + "learning_rate": 1.7442448288961928e-05, + "loss": 1.6899, + "step": 23913 + }, + { + "epoch": 7.340085942295887, + "grad_norm": 0.19185546040534973, + "learning_rate": 1.743867605251605e-05, + "loss": 1.7483, + "step": 23914 + }, + { + "epoch": 7.340392879066912, + "grad_norm": 0.23066233098506927, + "learning_rate": 1.7434904137860232e-05, + "loss": 1.7564, + "step": 23915 + }, + { + "epoch": 7.3406998158379375, + "grad_norm": 0.18159757554531097, + "learning_rate": 1.743113254503179e-05, + "loss": 1.7136, + "step": 23916 + }, + 
{ + "epoch": 7.341006752608963, + "grad_norm": 0.22666020691394806, + "learning_rate": 1.7427361274067995e-05, + "loss": 1.7589, + "step": 23917 + }, + { + "epoch": 7.341313689379987, + "grad_norm": 0.18986108899116516, + "learning_rate": 1.74235903250061e-05, + "loss": 1.7429, + "step": 23918 + }, + { + "epoch": 7.341620626151013, + "grad_norm": 0.17987726628780365, + "learning_rate": 1.741981969788338e-05, + "loss": 1.7457, + "step": 23919 + }, + { + "epoch": 7.341927562922038, + "grad_norm": 0.2370992749929428, + "learning_rate": 1.7416049392737093e-05, + "loss": 1.7594, + "step": 23920 + }, + { + "epoch": 7.342234499693063, + "grad_norm": 0.18698690831661224, + "learning_rate": 1.7412279409604508e-05, + "loss": 1.7555, + "step": 23921 + }, + { + "epoch": 7.342541436464089, + "grad_norm": 0.18401117622852325, + "learning_rate": 1.7408509748522882e-05, + "loss": 1.7355, + "step": 23922 + }, + { + "epoch": 7.342848373235114, + "grad_norm": 0.22045543789863586, + "learning_rate": 1.7404740409529448e-05, + "loss": 1.7227, + "step": 23923 + }, + { + "epoch": 7.343155310006138, + "grad_norm": 0.24414709210395813, + "learning_rate": 1.7400971392661502e-05, + "loss": 1.7551, + "step": 23924 + }, + { + "epoch": 7.343462246777164, + "grad_norm": 0.1906892955303192, + "learning_rate": 1.739720269795623e-05, + "loss": 1.7204, + "step": 23925 + }, + { + "epoch": 7.343769183548189, + "grad_norm": 0.1840149164199829, + "learning_rate": 1.7393434325450948e-05, + "loss": 1.74, + "step": 23926 + }, + { + "epoch": 7.344076120319214, + "grad_norm": 0.21434549987316132, + "learning_rate": 1.7389666275182825e-05, + "loss": 1.6961, + "step": 23927 + }, + { + "epoch": 7.34438305709024, + "grad_norm": 0.19110503792762756, + "learning_rate": 1.7385898547189146e-05, + "loss": 1.7731, + "step": 23928 + }, + { + "epoch": 7.344689993861264, + "grad_norm": 0.18905460834503174, + "learning_rate": 1.7382131141507136e-05, + "loss": 1.6925, + "step": 23929 + }, + { + "epoch": 7.3449969306322895, 
+ "grad_norm": 0.16336308419704437, + "learning_rate": 1.7378364058174024e-05, + "loss": 1.7073, + "step": 23930 + }, + { + "epoch": 7.345303867403315, + "grad_norm": 0.16707782447338104, + "learning_rate": 1.7374597297227056e-05, + "loss": 1.7036, + "step": 23931 + }, + { + "epoch": 7.34561080417434, + "grad_norm": 0.19958938658237457, + "learning_rate": 1.7370830858703406e-05, + "loss": 1.7035, + "step": 23932 + }, + { + "epoch": 7.3459177409453655, + "grad_norm": 0.18446899950504303, + "learning_rate": 1.7367064742640348e-05, + "loss": 1.754, + "step": 23933 + }, + { + "epoch": 7.346224677716391, + "grad_norm": 0.19238999485969543, + "learning_rate": 1.736329894907508e-05, + "loss": 1.6903, + "step": 23934 + }, + { + "epoch": 7.346531614487415, + "grad_norm": 0.1985396146774292, + "learning_rate": 1.7359533478044825e-05, + "loss": 1.7342, + "step": 23935 + }, + { + "epoch": 7.346838551258441, + "grad_norm": 0.19200150668621063, + "learning_rate": 1.7355768329586784e-05, + "loss": 1.6915, + "step": 23936 + }, + { + "epoch": 7.347145488029466, + "grad_norm": 0.19772231578826904, + "learning_rate": 1.7352003503738186e-05, + "loss": 1.7341, + "step": 23937 + }, + { + "epoch": 7.347452424800491, + "grad_norm": 0.1961035579442978, + "learning_rate": 1.7348239000536214e-05, + "loss": 1.7395, + "step": 23938 + }, + { + "epoch": 7.347759361571517, + "grad_norm": 0.15188434720039368, + "learning_rate": 1.7344474820018087e-05, + "loss": 1.635, + "step": 23939 + }, + { + "epoch": 7.348066298342541, + "grad_norm": 0.18748410046100616, + "learning_rate": 1.734071096222098e-05, + "loss": 1.6878, + "step": 23940 + }, + { + "epoch": 7.348373235113566, + "grad_norm": 0.19337952136993408, + "learning_rate": 1.7336947427182143e-05, + "loss": 1.7532, + "step": 23941 + }, + { + "epoch": 7.348680171884592, + "grad_norm": 0.14804427325725555, + "learning_rate": 1.73331842149387e-05, + "loss": 1.683, + "step": 23942 + }, + { + "epoch": 7.348987108655617, + "grad_norm": 
0.18310968577861786, + "learning_rate": 1.7329421325527916e-05, + "loss": 1.718, + "step": 23943 + }, + { + "epoch": 7.349294045426642, + "grad_norm": 0.18589583039283752, + "learning_rate": 1.7325658758986906e-05, + "loss": 1.7115, + "step": 23944 + }, + { + "epoch": 7.349600982197667, + "grad_norm": 0.1618955284357071, + "learning_rate": 1.7321896515352904e-05, + "loss": 1.6757, + "step": 23945 + }, + { + "epoch": 7.349907918968692, + "grad_norm": 0.20092655718326569, + "learning_rate": 1.731813459466307e-05, + "loss": 1.7537, + "step": 23946 + }, + { + "epoch": 7.350214855739718, + "grad_norm": 0.17287038266658783, + "learning_rate": 1.7314372996954592e-05, + "loss": 1.6744, + "step": 23947 + }, + { + "epoch": 7.350521792510743, + "grad_norm": 0.19176220893859863, + "learning_rate": 1.731061172226465e-05, + "loss": 1.7279, + "step": 23948 + }, + { + "epoch": 7.350828729281768, + "grad_norm": 0.2060871571302414, + "learning_rate": 1.7306850770630367e-05, + "loss": 1.7802, + "step": 23949 + }, + { + "epoch": 7.351135666052793, + "grad_norm": 0.27185341715812683, + "learning_rate": 1.7303090142088967e-05, + "loss": 1.7234, + "step": 23950 + }, + { + "epoch": 7.351442602823818, + "grad_norm": 0.19845733046531677, + "learning_rate": 1.729932983667759e-05, + "loss": 1.7503, + "step": 23951 + }, + { + "epoch": 7.351749539594843, + "grad_norm": 0.19455648958683014, + "learning_rate": 1.729556985443341e-05, + "loss": 1.8096, + "step": 23952 + }, + { + "epoch": 7.352056476365869, + "grad_norm": 0.19090545177459717, + "learning_rate": 1.729181019539357e-05, + "loss": 1.6776, + "step": 23953 + }, + { + "epoch": 7.352363413136894, + "grad_norm": 0.16086700558662415, + "learning_rate": 1.728805085959524e-05, + "loss": 1.6829, + "step": 23954 + }, + { + "epoch": 7.352670349907919, + "grad_norm": 0.2156524360179901, + "learning_rate": 1.7284291847075555e-05, + "loss": 1.7147, + "step": 23955 + }, + { + "epoch": 7.352977286678944, + "grad_norm": 0.20258861780166626, + 
"learning_rate": 1.728053315787168e-05, + "loss": 1.7085, + "step": 23956 + }, + { + "epoch": 7.353284223449969, + "grad_norm": 0.1877330094575882, + "learning_rate": 1.7276774792020735e-05, + "loss": 1.7311, + "step": 23957 + }, + { + "epoch": 7.3535911602209945, + "grad_norm": 0.22096484899520874, + "learning_rate": 1.727301674955992e-05, + "loss": 1.6712, + "step": 23958 + }, + { + "epoch": 7.35389809699202, + "grad_norm": 0.21456706523895264, + "learning_rate": 1.726925903052629e-05, + "loss": 1.7773, + "step": 23959 + }, + { + "epoch": 7.354205033763045, + "grad_norm": 0.2114667296409607, + "learning_rate": 1.7265501634957072e-05, + "loss": 1.669, + "step": 23960 + }, + { + "epoch": 7.35451197053407, + "grad_norm": 0.1676410287618637, + "learning_rate": 1.726174456288931e-05, + "loss": 1.6673, + "step": 23961 + }, + { + "epoch": 7.354818907305095, + "grad_norm": 0.19883838295936584, + "learning_rate": 1.72579878143602e-05, + "loss": 1.6821, + "step": 23962 + }, + { + "epoch": 7.35512584407612, + "grad_norm": 0.19240599870681763, + "learning_rate": 1.725423138940684e-05, + "loss": 1.741, + "step": 23963 + }, + { + "epoch": 7.355432780847146, + "grad_norm": 0.230613574385643, + "learning_rate": 1.7250475288066363e-05, + "loss": 1.6937, + "step": 23964 + }, + { + "epoch": 7.355739717618171, + "grad_norm": 0.17126981914043427, + "learning_rate": 1.7246719510375898e-05, + "loss": 1.6791, + "step": 23965 + }, + { + "epoch": 7.356046654389196, + "grad_norm": 0.1852734386920929, + "learning_rate": 1.7242964056372518e-05, + "loss": 1.7196, + "step": 23966 + }, + { + "epoch": 7.356353591160221, + "grad_norm": 0.1922985464334488, + "learning_rate": 1.723920892609338e-05, + "loss": 1.794, + "step": 23967 + }, + { + "epoch": 7.356660527931246, + "grad_norm": 0.1918993592262268, + "learning_rate": 1.7235454119575582e-05, + "loss": 1.7725, + "step": 23968 + }, + { + "epoch": 7.356967464702271, + "grad_norm": 0.21787014603614807, + "learning_rate": 1.723169963685623e-05, + 
"loss": 1.7382, + "step": 23969 + }, + { + "epoch": 7.357274401473297, + "grad_norm": 0.23753544688224792, + "learning_rate": 1.722794547797243e-05, + "loss": 1.7924, + "step": 23970 + }, + { + "epoch": 7.357581338244322, + "grad_norm": 0.2251000851392746, + "learning_rate": 1.722419164296128e-05, + "loss": 1.6794, + "step": 23971 + }, + { + "epoch": 7.3578882750153465, + "grad_norm": 0.21573983132839203, + "learning_rate": 1.7220438131859878e-05, + "loss": 1.796, + "step": 23972 + }, + { + "epoch": 7.358195211786372, + "grad_norm": 0.217384472489357, + "learning_rate": 1.721668494470532e-05, + "loss": 1.7305, + "step": 23973 + }, + { + "epoch": 7.358502148557397, + "grad_norm": 0.21815331280231476, + "learning_rate": 1.7212932081534677e-05, + "loss": 1.7348, + "step": 23974 + }, + { + "epoch": 7.3588090853284225, + "grad_norm": 0.19974499940872192, + "learning_rate": 1.7209179542385097e-05, + "loss": 1.7383, + "step": 23975 + }, + { + "epoch": 7.359116022099448, + "grad_norm": 0.20518191158771515, + "learning_rate": 1.7205427327293582e-05, + "loss": 1.7087, + "step": 23976 + }, + { + "epoch": 7.359422958870473, + "grad_norm": 0.17104744911193848, + "learning_rate": 1.7201675436297293e-05, + "loss": 1.718, + "step": 23977 + }, + { + "epoch": 7.359729895641498, + "grad_norm": 0.2165975421667099, + "learning_rate": 1.7197923869433235e-05, + "loss": 1.7907, + "step": 23978 + }, + { + "epoch": 7.360036832412523, + "grad_norm": 0.1784742921590805, + "learning_rate": 1.719417262673854e-05, + "loss": 1.6354, + "step": 23979 + }, + { + "epoch": 7.360343769183548, + "grad_norm": 0.1867162138223648, + "learning_rate": 1.719042170825026e-05, + "loss": 1.7264, + "step": 23980 + }, + { + "epoch": 7.360650705954574, + "grad_norm": 0.19704937934875488, + "learning_rate": 1.7186671114005458e-05, + "loss": 1.72, + "step": 23981 + }, + { + "epoch": 7.360957642725599, + "grad_norm": 0.20316866040229797, + "learning_rate": 1.718292084404123e-05, + "loss": 1.759, + "step": 23982 + }, + 
{ + "epoch": 7.361264579496623, + "grad_norm": 0.20339833199977875, + "learning_rate": 1.717917089839457e-05, + "loss": 1.7537, + "step": 23983 + }, + { + "epoch": 7.361571516267649, + "grad_norm": 0.18114012479782104, + "learning_rate": 1.71754212771026e-05, + "loss": 1.7207, + "step": 23984 + }, + { + "epoch": 7.361878453038674, + "grad_norm": 0.16071686148643494, + "learning_rate": 1.7171671980202353e-05, + "loss": 1.6534, + "step": 23985 + }, + { + "epoch": 7.362185389809699, + "grad_norm": 0.15212370455265045, + "learning_rate": 1.7167923007730892e-05, + "loss": 1.6638, + "step": 23986 + }, + { + "epoch": 7.362492326580725, + "grad_norm": 0.16284595429897308, + "learning_rate": 1.7164174359725253e-05, + "loss": 1.7442, + "step": 23987 + }, + { + "epoch": 7.362799263351749, + "grad_norm": 0.18302884697914124, + "learning_rate": 1.7160426036222494e-05, + "loss": 1.7087, + "step": 23988 + }, + { + "epoch": 7.3631062001227745, + "grad_norm": 0.18764640390872955, + "learning_rate": 1.715667803725965e-05, + "loss": 1.702, + "step": 23989 + }, + { + "epoch": 7.3634131368938, + "grad_norm": 0.16912522912025452, + "learning_rate": 1.7152930362873758e-05, + "loss": 1.742, + "step": 23990 + }, + { + "epoch": 7.363720073664825, + "grad_norm": 0.21137015521526337, + "learning_rate": 1.714918301310185e-05, + "loss": 1.7074, + "step": 23991 + }, + { + "epoch": 7.3640270104358505, + "grad_norm": 0.17562401294708252, + "learning_rate": 1.7145435987981008e-05, + "loss": 1.69, + "step": 23992 + }, + { + "epoch": 7.364333947206875, + "grad_norm": 0.15575642883777618, + "learning_rate": 1.714168928754818e-05, + "loss": 1.6986, + "step": 23993 + }, + { + "epoch": 7.3646408839779, + "grad_norm": 0.18057680130004883, + "learning_rate": 1.7137942911840477e-05, + "loss": 1.7661, + "step": 23994 + }, + { + "epoch": 7.364947820748926, + "grad_norm": 0.18899883329868317, + "learning_rate": 1.7134196860894853e-05, + "loss": 1.6841, + "step": 23995 + }, + { + "epoch": 7.365254757519951, + 
"grad_norm": 0.15350781381130219, + "learning_rate": 1.7130451134748367e-05, + "loss": 1.7005, + "step": 23996 + }, + { + "epoch": 7.365561694290976, + "grad_norm": 0.20394811034202576, + "learning_rate": 1.7126705733438037e-05, + "loss": 1.7342, + "step": 23997 + }, + { + "epoch": 7.365868631062002, + "grad_norm": 0.1881636083126068, + "learning_rate": 1.7122960657000864e-05, + "loss": 1.6985, + "step": 23998 + }, + { + "epoch": 7.366175567833026, + "grad_norm": 0.1619534194469452, + "learning_rate": 1.711921590547388e-05, + "loss": 1.6579, + "step": 23999 + }, + { + "epoch": 7.366482504604051, + "grad_norm": 0.16795861721038818, + "learning_rate": 1.711547147889404e-05, + "loss": 1.717, + "step": 24000 + }, + { + "epoch": 7.366789441375077, + "grad_norm": 0.1452684998512268, + "learning_rate": 1.711172737729841e-05, + "loss": 1.6792, + "step": 24001 + }, + { + "epoch": 7.367096378146102, + "grad_norm": 0.14940062165260315, + "learning_rate": 1.710798360072396e-05, + "loss": 1.6731, + "step": 24002 + }, + { + "epoch": 7.367403314917127, + "grad_norm": 0.21277321875095367, + "learning_rate": 1.7104240149207694e-05, + "loss": 1.7145, + "step": 24003 + }, + { + "epoch": 7.367710251688152, + "grad_norm": 0.17097726464271545, + "learning_rate": 1.710049702278661e-05, + "loss": 1.7052, + "step": 24004 + }, + { + "epoch": 7.368017188459177, + "grad_norm": 0.15970511734485626, + "learning_rate": 1.7096754221497702e-05, + "loss": 1.6586, + "step": 24005 + }, + { + "epoch": 7.3683241252302025, + "grad_norm": 0.198451429605484, + "learning_rate": 1.7093011745377945e-05, + "loss": 1.7449, + "step": 24006 + }, + { + "epoch": 7.368631062001228, + "grad_norm": 0.19554266333580017, + "learning_rate": 1.7089269594464342e-05, + "loss": 1.7455, + "step": 24007 + }, + { + "epoch": 7.368937998772253, + "grad_norm": 0.1854190230369568, + "learning_rate": 1.7085527768793847e-05, + "loss": 1.7355, + "step": 24008 + }, + { + "epoch": 7.3692449355432785, + "grad_norm": 0.17093004286289215, 
+ "learning_rate": 1.708178626840349e-05, + "loss": 1.6813, + "step": 24009 + }, + { + "epoch": 7.369551872314303, + "grad_norm": 0.15385115146636963, + "learning_rate": 1.707804509333018e-05, + "loss": 1.664, + "step": 24010 + }, + { + "epoch": 7.369858809085328, + "grad_norm": 0.18747489154338837, + "learning_rate": 1.7074304243610963e-05, + "loss": 1.787, + "step": 24011 + }, + { + "epoch": 7.370165745856354, + "grad_norm": 0.21749509871006012, + "learning_rate": 1.7070563719282734e-05, + "loss": 1.723, + "step": 24012 + }, + { + "epoch": 7.370472682627379, + "grad_norm": 0.18973985314369202, + "learning_rate": 1.7066823520382508e-05, + "loss": 1.7415, + "step": 24013 + }, + { + "epoch": 7.370779619398404, + "grad_norm": 0.24844922125339508, + "learning_rate": 1.706308364694724e-05, + "loss": 1.7617, + "step": 24014 + }, + { + "epoch": 7.371086556169429, + "grad_norm": 0.16565518081188202, + "learning_rate": 1.705934409901388e-05, + "loss": 1.6781, + "step": 24015 + }, + { + "epoch": 7.371393492940454, + "grad_norm": 0.22595234215259552, + "learning_rate": 1.705560487661941e-05, + "loss": 1.7706, + "step": 24016 + }, + { + "epoch": 7.371700429711479, + "grad_norm": 0.2452661544084549, + "learning_rate": 1.7051865979800723e-05, + "loss": 1.8227, + "step": 24017 + }, + { + "epoch": 7.372007366482505, + "grad_norm": 0.2285550981760025, + "learning_rate": 1.7048127408594834e-05, + "loss": 1.7554, + "step": 24018 + }, + { + "epoch": 7.37231430325353, + "grad_norm": 0.22723950445652008, + "learning_rate": 1.7044389163038656e-05, + "loss": 1.7152, + "step": 24019 + }, + { + "epoch": 7.3726212400245545, + "grad_norm": 0.20335997641086578, + "learning_rate": 1.7040651243169143e-05, + "loss": 1.6661, + "step": 24020 + }, + { + "epoch": 7.37292817679558, + "grad_norm": 0.27618682384490967, + "learning_rate": 1.703691364902323e-05, + "loss": 1.8375, + "step": 24021 + }, + { + "epoch": 7.373235113566605, + "grad_norm": 0.24076996743679047, + "learning_rate": 
1.7033176380637856e-05, + "loss": 1.7581, + "step": 24022 + }, + { + "epoch": 7.3735420503376305, + "grad_norm": 0.21615716814994812, + "learning_rate": 1.702943943804996e-05, + "loss": 1.7047, + "step": 24023 + }, + { + "epoch": 7.373848987108656, + "grad_norm": 0.23503927886486053, + "learning_rate": 1.7025702821296462e-05, + "loss": 1.7926, + "step": 24024 + }, + { + "epoch": 7.37415592387968, + "grad_norm": 0.2344675064086914, + "learning_rate": 1.7021966530414303e-05, + "loss": 1.747, + "step": 24025 + }, + { + "epoch": 7.374462860650706, + "grad_norm": 0.20946700870990753, + "learning_rate": 1.701823056544039e-05, + "loss": 1.746, + "step": 24026 + }, + { + "epoch": 7.374769797421731, + "grad_norm": 0.26749730110168457, + "learning_rate": 1.7014494926411645e-05, + "loss": 1.7375, + "step": 24027 + }, + { + "epoch": 7.375076734192756, + "grad_norm": 0.19716335833072662, + "learning_rate": 1.701075961336503e-05, + "loss": 1.6677, + "step": 24028 + }, + { + "epoch": 7.375383670963782, + "grad_norm": 0.1999496966600418, + "learning_rate": 1.7007024626337382e-05, + "loss": 1.6665, + "step": 24029 + }, + { + "epoch": 7.375690607734807, + "grad_norm": 0.188812255859375, + "learning_rate": 1.7003289965365676e-05, + "loss": 1.7344, + "step": 24030 + }, + { + "epoch": 7.3759975445058314, + "grad_norm": 0.20171904563903809, + "learning_rate": 1.6999555630486795e-05, + "loss": 1.7452, + "step": 24031 + }, + { + "epoch": 7.376304481276857, + "grad_norm": 0.21260966360569, + "learning_rate": 1.6995821621737655e-05, + "loss": 1.7759, + "step": 24032 + }, + { + "epoch": 7.376611418047882, + "grad_norm": 0.1913561075925827, + "learning_rate": 1.699208793915516e-05, + "loss": 1.7342, + "step": 24033 + }, + { + "epoch": 7.3769183548189075, + "grad_norm": 0.1907757967710495, + "learning_rate": 1.6988354582776166e-05, + "loss": 1.6511, + "step": 24034 + }, + { + "epoch": 7.377225291589933, + "grad_norm": 0.15012076497077942, + "learning_rate": 1.6984621552637625e-05, + "loss": 
1.6638, + "step": 24035 + }, + { + "epoch": 7.377532228360957, + "grad_norm": 0.17761732637882233, + "learning_rate": 1.6980888848776394e-05, + "loss": 1.7035, + "step": 24036 + }, + { + "epoch": 7.377839165131983, + "grad_norm": 0.15940140187740326, + "learning_rate": 1.6977156471229376e-05, + "loss": 1.6532, + "step": 24037 + }, + { + "epoch": 7.378146101903008, + "grad_norm": 0.19022013247013092, + "learning_rate": 1.6973424420033455e-05, + "loss": 1.7545, + "step": 24038 + }, + { + "epoch": 7.378453038674033, + "grad_norm": 0.1900233030319214, + "learning_rate": 1.6969692695225513e-05, + "loss": 1.7051, + "step": 24039 + }, + { + "epoch": 7.378759975445059, + "grad_norm": 0.17687582969665527, + "learning_rate": 1.6965961296842425e-05, + "loss": 1.6819, + "step": 24040 + }, + { + "epoch": 7.379066912216084, + "grad_norm": 0.16323260962963104, + "learning_rate": 1.696223022492107e-05, + "loss": 1.6642, + "step": 24041 + }, + { + "epoch": 7.379373848987108, + "grad_norm": 0.21163886785507202, + "learning_rate": 1.695849947949832e-05, + "loss": 1.6973, + "step": 24042 + }, + { + "epoch": 7.379680785758134, + "grad_norm": 0.1713307648897171, + "learning_rate": 1.6954769060611043e-05, + "loss": 1.677, + "step": 24043 + }, + { + "epoch": 7.379987722529159, + "grad_norm": 0.19575951993465424, + "learning_rate": 1.695103896829609e-05, + "loss": 1.7305, + "step": 24044 + }, + { + "epoch": 7.380294659300184, + "grad_norm": 0.16087177395820618, + "learning_rate": 1.6947309202590377e-05, + "loss": 1.6435, + "step": 24045 + }, + { + "epoch": 7.38060159607121, + "grad_norm": 0.2088652402162552, + "learning_rate": 1.6943579763530692e-05, + "loss": 1.7136, + "step": 24046 + }, + { + "epoch": 7.380908532842234, + "grad_norm": 0.18253973126411438, + "learning_rate": 1.693985065115396e-05, + "loss": 1.7461, + "step": 24047 + }, + { + "epoch": 7.3812154696132595, + "grad_norm": 0.272062212228775, + "learning_rate": 1.6936121865496967e-05, + "loss": 1.7455, + "step": 24048 + }, + { 
+ "epoch": 7.381522406384285, + "grad_norm": 0.1884320080280304, + "learning_rate": 1.6932393406596613e-05, + "loss": 1.7242, + "step": 24049 + }, + { + "epoch": 7.38182934315531, + "grad_norm": 0.22986121475696564, + "learning_rate": 1.6928665274489748e-05, + "loss": 1.7461, + "step": 24050 + }, + { + "epoch": 7.3821362799263355, + "grad_norm": 0.19400665163993835, + "learning_rate": 1.6924937469213158e-05, + "loss": 1.7468, + "step": 24051 + }, + { + "epoch": 7.382443216697361, + "grad_norm": 0.1990167796611786, + "learning_rate": 1.6921209990803744e-05, + "loss": 1.7253, + "step": 24052 + }, + { + "epoch": 7.382750153468385, + "grad_norm": 0.16667480766773224, + "learning_rate": 1.691748283929832e-05, + "loss": 1.6763, + "step": 24053 + }, + { + "epoch": 7.383057090239411, + "grad_norm": 0.20539991557598114, + "learning_rate": 1.691375601473372e-05, + "loss": 1.7408, + "step": 24054 + }, + { + "epoch": 7.383364027010436, + "grad_norm": 0.18021859228610992, + "learning_rate": 1.6910029517146776e-05, + "loss": 1.7075, + "step": 24055 + }, + { + "epoch": 7.383670963781461, + "grad_norm": 0.17450939118862152, + "learning_rate": 1.6906303346574314e-05, + "loss": 1.7074, + "step": 24056 + }, + { + "epoch": 7.383977900552487, + "grad_norm": 0.1690986454486847, + "learning_rate": 1.690257750305316e-05, + "loss": 1.6911, + "step": 24057 + }, + { + "epoch": 7.384284837323511, + "grad_norm": 0.19716380536556244, + "learning_rate": 1.6898851986620136e-05, + "loss": 1.7075, + "step": 24058 + }, + { + "epoch": 7.384591774094536, + "grad_norm": 0.20165397226810455, + "learning_rate": 1.6895126797312054e-05, + "loss": 1.7201, + "step": 24059 + }, + { + "epoch": 7.384898710865562, + "grad_norm": 0.22149543464183807, + "learning_rate": 1.6891401935165734e-05, + "loss": 1.7407, + "step": 24060 + }, + { + "epoch": 7.385205647636587, + "grad_norm": 0.1575438529253006, + "learning_rate": 1.6887677400217966e-05, + "loss": 1.6451, + "step": 24061 + }, + { + "epoch": 7.385512584407612, 
+ "grad_norm": 0.18075503408908844, + "learning_rate": 1.688395319250562e-05, + "loss": 1.7084, + "step": 24062 + }, + { + "epoch": 7.385819521178637, + "grad_norm": 0.16428421437740326, + "learning_rate": 1.6880229312065414e-05, + "loss": 1.7047, + "step": 24063 + }, + { + "epoch": 7.386126457949662, + "grad_norm": 0.18372805416584015, + "learning_rate": 1.6876505758934237e-05, + "loss": 1.6726, + "step": 24064 + }, + { + "epoch": 7.3864333947206875, + "grad_norm": 0.199292853474617, + "learning_rate": 1.687278253314882e-05, + "loss": 1.7472, + "step": 24065 + }, + { + "epoch": 7.386740331491713, + "grad_norm": 0.20381483435630798, + "learning_rate": 1.686905963474597e-05, + "loss": 1.7128, + "step": 24066 + }, + { + "epoch": 7.387047268262738, + "grad_norm": 0.18497546017169952, + "learning_rate": 1.6865337063762527e-05, + "loss": 1.736, + "step": 24067 + }, + { + "epoch": 7.387354205033763, + "grad_norm": 0.21320439875125885, + "learning_rate": 1.6861614820235206e-05, + "loss": 1.7391, + "step": 24068 + }, + { + "epoch": 7.387661141804788, + "grad_norm": 0.22324618697166443, + "learning_rate": 1.6857892904200863e-05, + "loss": 1.7384, + "step": 24069 + }, + { + "epoch": 7.387968078575813, + "grad_norm": 0.18035978078842163, + "learning_rate": 1.6854171315696216e-05, + "loss": 1.7029, + "step": 24070 + }, + { + "epoch": 7.388275015346839, + "grad_norm": 0.1727912276983261, + "learning_rate": 1.6850450054758092e-05, + "loss": 1.6649, + "step": 24071 + }, + { + "epoch": 7.388581952117864, + "grad_norm": 0.19713124632835388, + "learning_rate": 1.6846729121423256e-05, + "loss": 1.7508, + "step": 24072 + }, + { + "epoch": 7.388888888888889, + "grad_norm": 0.19403581321239471, + "learning_rate": 1.6843008515728464e-05, + "loss": 1.7807, + "step": 24073 + }, + { + "epoch": 7.389195825659914, + "grad_norm": 0.20204444229602814, + "learning_rate": 1.6839288237710503e-05, + "loss": 1.778, + "step": 24074 + }, + { + "epoch": 7.389502762430939, + "grad_norm": 
0.20021478831768036, + "learning_rate": 1.6835568287406127e-05, + "loss": 1.7544, + "step": 24075 + }, + { + "epoch": 7.389809699201964, + "grad_norm": 0.2247730791568756, + "learning_rate": 1.6831848664852107e-05, + "loss": 1.7422, + "step": 24076 + }, + { + "epoch": 7.39011663597299, + "grad_norm": 0.21600402891635895, + "learning_rate": 1.68281293700852e-05, + "loss": 1.7491, + "step": 24077 + }, + { + "epoch": 7.390423572744015, + "grad_norm": 0.1854497194290161, + "learning_rate": 1.6824410403142145e-05, + "loss": 1.7292, + "step": 24078 + }, + { + "epoch": 7.3907305095150395, + "grad_norm": 0.21738949418067932, + "learning_rate": 1.6820691764059736e-05, + "loss": 1.6996, + "step": 24079 + }, + { + "epoch": 7.391037446286065, + "grad_norm": 0.20114775002002716, + "learning_rate": 1.6816973452874674e-05, + "loss": 1.7299, + "step": 24080 + }, + { + "epoch": 7.39134438305709, + "grad_norm": 0.17267082631587982, + "learning_rate": 1.681325546962376e-05, + "loss": 1.7181, + "step": 24081 + }, + { + "epoch": 7.3916513198281155, + "grad_norm": 0.1681009829044342, + "learning_rate": 1.680953781434369e-05, + "loss": 1.6826, + "step": 24082 + }, + { + "epoch": 7.391958256599141, + "grad_norm": 0.18807077407836914, + "learning_rate": 1.6805820487071205e-05, + "loss": 1.6934, + "step": 24083 + }, + { + "epoch": 7.392265193370166, + "grad_norm": 0.1859835982322693, + "learning_rate": 1.680210348784309e-05, + "loss": 1.7065, + "step": 24084 + }, + { + "epoch": 7.392572130141191, + "grad_norm": 0.20433956384658813, + "learning_rate": 1.679838681669601e-05, + "loss": 1.7934, + "step": 24085 + }, + { + "epoch": 7.392879066912216, + "grad_norm": 0.2428809553384781, + "learning_rate": 1.679467047366677e-05, + "loss": 1.7619, + "step": 24086 + }, + { + "epoch": 7.393186003683241, + "grad_norm": 0.25117191672325134, + "learning_rate": 1.6790954458792025e-05, + "loss": 1.7254, + "step": 24087 + }, + { + "epoch": 7.393492940454267, + "grad_norm": 0.19429172575473785, + 
"learning_rate": 1.6787238772108544e-05, + "loss": 1.6946, + "step": 24088 + }, + { + "epoch": 7.393799877225292, + "grad_norm": 0.18574993312358856, + "learning_rate": 1.678352341365304e-05, + "loss": 1.6953, + "step": 24089 + }, + { + "epoch": 7.394106813996316, + "grad_norm": 0.21022208034992218, + "learning_rate": 1.6779808383462227e-05, + "loss": 1.7866, + "step": 24090 + }, + { + "epoch": 7.394413750767342, + "grad_norm": 0.16711890697479248, + "learning_rate": 1.6776093681572818e-05, + "loss": 1.6988, + "step": 24091 + }, + { + "epoch": 7.394720687538367, + "grad_norm": 0.23661695420742035, + "learning_rate": 1.6772379308021524e-05, + "loss": 1.7152, + "step": 24092 + }, + { + "epoch": 7.395027624309392, + "grad_norm": 0.18410098552703857, + "learning_rate": 1.6768665262845052e-05, + "loss": 1.6643, + "step": 24093 + }, + { + "epoch": 7.395334561080418, + "grad_norm": 0.19566760957241058, + "learning_rate": 1.676495154608011e-05, + "loss": 1.7371, + "step": 24094 + }, + { + "epoch": 7.395641497851442, + "grad_norm": 0.18130381405353546, + "learning_rate": 1.6761238157763375e-05, + "loss": 1.6934, + "step": 24095 + }, + { + "epoch": 7.3959484346224675, + "grad_norm": 0.16141927242279053, + "learning_rate": 1.6757525097931603e-05, + "loss": 1.6629, + "step": 24096 + }, + { + "epoch": 7.396255371393493, + "grad_norm": 0.18370656669139862, + "learning_rate": 1.6753812366621418e-05, + "loss": 1.6931, + "step": 24097 + }, + { + "epoch": 7.396562308164518, + "grad_norm": 0.17368416488170624, + "learning_rate": 1.675009996386958e-05, + "loss": 1.7028, + "step": 24098 + }, + { + "epoch": 7.3968692449355435, + "grad_norm": 0.1704222410917282, + "learning_rate": 1.6746387889712722e-05, + "loss": 1.7241, + "step": 24099 + }, + { + "epoch": 7.397176181706568, + "grad_norm": 0.19127961993217468, + "learning_rate": 1.674267614418754e-05, + "loss": 1.6606, + "step": 24100 + }, + { + "epoch": 7.397483118477593, + "grad_norm": 0.20173178613185883, + "learning_rate": 
1.673896472733075e-05, + "loss": 1.7293, + "step": 24101 + }, + { + "epoch": 7.397790055248619, + "grad_norm": 0.194651797413826, + "learning_rate": 1.6735253639178977e-05, + "loss": 1.6889, + "step": 24102 + }, + { + "epoch": 7.398096992019644, + "grad_norm": 0.16184480488300323, + "learning_rate": 1.6731542879768957e-05, + "loss": 1.6929, + "step": 24103 + }, + { + "epoch": 7.398403928790669, + "grad_norm": 0.21806742250919342, + "learning_rate": 1.67278324491373e-05, + "loss": 1.6944, + "step": 24104 + }, + { + "epoch": 7.398710865561695, + "grad_norm": 0.1599469929933548, + "learning_rate": 1.6724122347320715e-05, + "loss": 1.7107, + "step": 24105 + }, + { + "epoch": 7.399017802332719, + "grad_norm": 0.18621234595775604, + "learning_rate": 1.672041257435586e-05, + "loss": 1.6856, + "step": 24106 + }, + { + "epoch": 7.399324739103744, + "grad_norm": 0.20682603120803833, + "learning_rate": 1.6716703130279393e-05, + "loss": 1.7699, + "step": 24107 + }, + { + "epoch": 7.39963167587477, + "grad_norm": 0.19649554789066315, + "learning_rate": 1.6712994015127976e-05, + "loss": 1.7049, + "step": 24108 + }, + { + "epoch": 7.399938612645795, + "grad_norm": 0.15894706547260284, + "learning_rate": 1.6709285228938255e-05, + "loss": 1.7352, + "step": 24109 + }, + { + "epoch": 7.4002455494168204, + "grad_norm": 0.22186337411403656, + "learning_rate": 1.6705576771746896e-05, + "loss": 1.7353, + "step": 24110 + }, + { + "epoch": 7.400552486187845, + "grad_norm": 0.14689651131629944, + "learning_rate": 1.670186864359054e-05, + "loss": 1.7155, + "step": 24111 + }, + { + "epoch": 7.40085942295887, + "grad_norm": 0.2055603563785553, + "learning_rate": 1.6698160844505817e-05, + "loss": 1.6897, + "step": 24112 + }, + { + "epoch": 7.401166359729896, + "grad_norm": 0.1641531139612198, + "learning_rate": 1.6694453374529423e-05, + "loss": 1.67, + "step": 24113 + }, + { + "epoch": 7.401473296500921, + "grad_norm": 0.21150687336921692, + "learning_rate": 1.6690746233697923e-05, + "loss": 
1.7507, + "step": 24114 + }, + { + "epoch": 7.401780233271946, + "grad_norm": 0.1844765543937683, + "learning_rate": 1.6687039422048035e-05, + "loss": 1.702, + "step": 24115 + }, + { + "epoch": 7.402087170042972, + "grad_norm": 0.1695966124534607, + "learning_rate": 1.6683332939616326e-05, + "loss": 1.6683, + "step": 24116 + }, + { + "epoch": 7.402394106813996, + "grad_norm": 0.17938567698001862, + "learning_rate": 1.667962678643943e-05, + "loss": 1.6947, + "step": 24117 + }, + { + "epoch": 7.402701043585021, + "grad_norm": 0.16420964896678925, + "learning_rate": 1.6675920962554027e-05, + "loss": 1.755, + "step": 24118 + }, + { + "epoch": 7.403007980356047, + "grad_norm": 0.16095438599586487, + "learning_rate": 1.667221546799667e-05, + "loss": 1.6855, + "step": 24119 + }, + { + "epoch": 7.403314917127072, + "grad_norm": 0.2089291363954544, + "learning_rate": 1.6668510302804052e-05, + "loss": 1.7213, + "step": 24120 + }, + { + "epoch": 7.403621853898097, + "grad_norm": 0.18369436264038086, + "learning_rate": 1.6664805467012717e-05, + "loss": 1.6913, + "step": 24121 + }, + { + "epoch": 7.403928790669122, + "grad_norm": 0.16405323147773743, + "learning_rate": 1.6661100960659326e-05, + "loss": 1.6529, + "step": 24122 + }, + { + "epoch": 7.404235727440147, + "grad_norm": 0.20792648196220398, + "learning_rate": 1.6657396783780477e-05, + "loss": 1.6855, + "step": 24123 + }, + { + "epoch": 7.4045426642111725, + "grad_norm": 0.17733097076416016, + "learning_rate": 1.6653692936412773e-05, + "loss": 1.727, + "step": 24124 + }, + { + "epoch": 7.404849600982198, + "grad_norm": 0.16196851432323456, + "learning_rate": 1.6649989418592825e-05, + "loss": 1.7376, + "step": 24125 + }, + { + "epoch": 7.405156537753223, + "grad_norm": 0.17193716764450073, + "learning_rate": 1.664628623035723e-05, + "loss": 1.6802, + "step": 24126 + }, + { + "epoch": 7.4054634745242485, + "grad_norm": 0.22076182067394257, + "learning_rate": 1.6642583371742576e-05, + "loss": 1.7512, + "step": 24127 + }, + 
{ + "epoch": 7.405770411295273, + "grad_norm": 0.20766951143741608, + "learning_rate": 1.663888084278547e-05, + "loss": 1.7457, + "step": 24128 + }, + { + "epoch": 7.406077348066298, + "grad_norm": 0.16815492510795593, + "learning_rate": 1.663517864352248e-05, + "loss": 1.6867, + "step": 24129 + }, + { + "epoch": 7.406384284837324, + "grad_norm": 0.19644804298877716, + "learning_rate": 1.6631476773990246e-05, + "loss": 1.6996, + "step": 24130 + }, + { + "epoch": 7.406691221608349, + "grad_norm": 0.18717117607593536, + "learning_rate": 1.662777523422528e-05, + "loss": 1.7745, + "step": 24131 + }, + { + "epoch": 7.406998158379374, + "grad_norm": 0.1679331511259079, + "learning_rate": 1.662407402426423e-05, + "loss": 1.7213, + "step": 24132 + }, + { + "epoch": 7.407305095150399, + "grad_norm": 0.1721929907798767, + "learning_rate": 1.662037314414363e-05, + "loss": 1.6759, + "step": 24133 + }, + { + "epoch": 7.407612031921424, + "grad_norm": 0.15507890284061432, + "learning_rate": 1.661667259390005e-05, + "loss": 1.6658, + "step": 24134 + }, + { + "epoch": 7.407918968692449, + "grad_norm": 0.20528049767017365, + "learning_rate": 1.6612972373570114e-05, + "loss": 1.7508, + "step": 24135 + }, + { + "epoch": 7.408225905463475, + "grad_norm": 0.20593658089637756, + "learning_rate": 1.6609272483190315e-05, + "loss": 1.8078, + "step": 24136 + }, + { + "epoch": 7.4085328422345, + "grad_norm": 0.19905441999435425, + "learning_rate": 1.6605572922797292e-05, + "loss": 1.7933, + "step": 24137 + }, + { + "epoch": 7.4088397790055245, + "grad_norm": 0.17571881413459778, + "learning_rate": 1.6601873692427537e-05, + "loss": 1.6908, + "step": 24138 + }, + { + "epoch": 7.40914671577655, + "grad_norm": 0.2244982272386551, + "learning_rate": 1.6598174792117655e-05, + "loss": 1.6998, + "step": 24139 + }, + { + "epoch": 7.409453652547575, + "grad_norm": 0.15267951786518097, + "learning_rate": 1.6594476221904193e-05, + "loss": 1.6399, + "step": 24140 + }, + { + "epoch": 7.4097605893186005, + 
"grad_norm": 0.24161390960216522, + "learning_rate": 1.659077798182369e-05, + "loss": 1.6776, + "step": 24141 + }, + { + "epoch": 7.410067526089626, + "grad_norm": 0.17184343934059143, + "learning_rate": 1.658708007191271e-05, + "loss": 1.7169, + "step": 24142 + }, + { + "epoch": 7.41037446286065, + "grad_norm": 0.1589801162481308, + "learning_rate": 1.6583382492207778e-05, + "loss": 1.6727, + "step": 24143 + }, + { + "epoch": 7.410681399631676, + "grad_norm": 0.18666890263557434, + "learning_rate": 1.6579685242745452e-05, + "loss": 1.7429, + "step": 24144 + }, + { + "epoch": 7.410988336402701, + "grad_norm": 0.22418901324272156, + "learning_rate": 1.6575988323562265e-05, + "loss": 1.7834, + "step": 24145 + }, + { + "epoch": 7.411295273173726, + "grad_norm": 0.1897875964641571, + "learning_rate": 1.6572291734694734e-05, + "loss": 1.7271, + "step": 24146 + }, + { + "epoch": 7.411602209944752, + "grad_norm": 0.18204644322395325, + "learning_rate": 1.6568595476179445e-05, + "loss": 1.7003, + "step": 24147 + }, + { + "epoch": 7.411909146715777, + "grad_norm": 0.19130240380764008, + "learning_rate": 1.6564899548052853e-05, + "loss": 1.6803, + "step": 24148 + }, + { + "epoch": 7.412216083486801, + "grad_norm": 0.19467706978321075, + "learning_rate": 1.6561203950351554e-05, + "loss": 1.7529, + "step": 24149 + }, + { + "epoch": 7.412523020257827, + "grad_norm": 0.20290352404117584, + "learning_rate": 1.655750868311202e-05, + "loss": 1.7742, + "step": 24150 + }, + { + "epoch": 7.412829957028852, + "grad_norm": 0.18538729846477509, + "learning_rate": 1.6553813746370772e-05, + "loss": 1.68, + "step": 24151 + }, + { + "epoch": 7.413136893799877, + "grad_norm": 0.23339742422103882, + "learning_rate": 1.655011914016437e-05, + "loss": 1.7499, + "step": 24152 + }, + { + "epoch": 7.413443830570903, + "grad_norm": 0.21964092552661896, + "learning_rate": 1.654642486452927e-05, + "loss": 1.7394, + "step": 24153 + }, + { + "epoch": 7.413750767341927, + "grad_norm": 0.2131531536579132, 
+ "learning_rate": 1.6542730919502032e-05, + "loss": 1.6928, + "step": 24154 + }, + { + "epoch": 7.4140577041129525, + "grad_norm": 0.20840130746364594, + "learning_rate": 1.653903730511911e-05, + "loss": 1.6785, + "step": 24155 + }, + { + "epoch": 7.414364640883978, + "grad_norm": 0.1519836038351059, + "learning_rate": 1.653534402141705e-05, + "loss": 1.6882, + "step": 24156 + }, + { + "epoch": 7.414671577655003, + "grad_norm": 0.21539351344108582, + "learning_rate": 1.653165106843233e-05, + "loss": 1.7041, + "step": 24157 + }, + { + "epoch": 7.4149785144260285, + "grad_norm": 0.2050703912973404, + "learning_rate": 1.6527958446201453e-05, + "loss": 1.7854, + "step": 24158 + }, + { + "epoch": 7.415285451197054, + "grad_norm": 0.21595771610736847, + "learning_rate": 1.652426615476091e-05, + "loss": 1.7305, + "step": 24159 + }, + { + "epoch": 7.415592387968078, + "grad_norm": 0.19248713552951813, + "learning_rate": 1.6520574194147186e-05, + "loss": 1.6834, + "step": 24160 + }, + { + "epoch": 7.415899324739104, + "grad_norm": 0.178158700466156, + "learning_rate": 1.6516882564396774e-05, + "loss": 1.7312, + "step": 24161 + }, + { + "epoch": 7.416206261510129, + "grad_norm": 0.18686197698116302, + "learning_rate": 1.6513191265546152e-05, + "loss": 1.7025, + "step": 24162 + }, + { + "epoch": 7.416513198281154, + "grad_norm": 0.1544325053691864, + "learning_rate": 1.6509500297631787e-05, + "loss": 1.6773, + "step": 24163 + }, + { + "epoch": 7.41682013505218, + "grad_norm": 0.1787567138671875, + "learning_rate": 1.6505809660690197e-05, + "loss": 1.6941, + "step": 24164 + }, + { + "epoch": 7.417127071823204, + "grad_norm": 0.16545183956623077, + "learning_rate": 1.65021193547578e-05, + "loss": 1.6618, + "step": 24165 + }, + { + "epoch": 7.417434008594229, + "grad_norm": 0.23889821767807007, + "learning_rate": 1.6498429379871126e-05, + "loss": 1.7651, + "step": 24166 + }, + { + "epoch": 7.417740945365255, + "grad_norm": 0.2012832909822464, + "learning_rate": 
1.649473973606659e-05, + "loss": 1.7477, + "step": 24167 + }, + { + "epoch": 7.41804788213628, + "grad_norm": 0.18035975098609924, + "learning_rate": 1.6491050423380662e-05, + "loss": 1.6747, + "step": 24168 + }, + { + "epoch": 7.418354818907305, + "grad_norm": 0.14925292134284973, + "learning_rate": 1.6487361441849842e-05, + "loss": 1.6817, + "step": 24169 + }, + { + "epoch": 7.41866175567833, + "grad_norm": 0.19253355264663696, + "learning_rate": 1.6483672791510523e-05, + "loss": 1.6943, + "step": 24170 + }, + { + "epoch": 7.418968692449355, + "grad_norm": 0.17203082144260406, + "learning_rate": 1.6479984472399234e-05, + "loss": 1.692, + "step": 24171 + }, + { + "epoch": 7.4192756292203805, + "grad_norm": 0.19132022559642792, + "learning_rate": 1.647629648455235e-05, + "loss": 1.7029, + "step": 24172 + }, + { + "epoch": 7.419582565991406, + "grad_norm": 0.17949101328849792, + "learning_rate": 1.647260882800637e-05, + "loss": 1.6944, + "step": 24173 + }, + { + "epoch": 7.419889502762431, + "grad_norm": 0.17752930521965027, + "learning_rate": 1.646892150279772e-05, + "loss": 1.6875, + "step": 24174 + }, + { + "epoch": 7.420196439533456, + "grad_norm": 0.19464492797851562, + "learning_rate": 1.6465234508962836e-05, + "loss": 1.6988, + "step": 24175 + }, + { + "epoch": 7.420503376304481, + "grad_norm": 0.20154574513435364, + "learning_rate": 1.6461547846538168e-05, + "loss": 1.7305, + "step": 24176 + }, + { + "epoch": 7.420810313075506, + "grad_norm": 0.20944970846176147, + "learning_rate": 1.6457861515560136e-05, + "loss": 1.7699, + "step": 24177 + }, + { + "epoch": 7.421117249846532, + "grad_norm": 0.22422203421592712, + "learning_rate": 1.6454175516065175e-05, + "loss": 1.6607, + "step": 24178 + }, + { + "epoch": 7.421424186617557, + "grad_norm": 0.16106431186199188, + "learning_rate": 1.6450489848089717e-05, + "loss": 1.7204, + "step": 24179 + }, + { + "epoch": 7.421731123388582, + "grad_norm": 0.24394269287586212, + "learning_rate": 1.644680451167018e-05, + 
"loss": 1.7161, + "step": 24180 + }, + { + "epoch": 7.422038060159607, + "grad_norm": 0.1999186873435974, + "learning_rate": 1.644311950684299e-05, + "loss": 1.7486, + "step": 24181 + }, + { + "epoch": 7.422344996930632, + "grad_norm": 0.1865876019001007, + "learning_rate": 1.6439434833644545e-05, + "loss": 1.737, + "step": 24182 + }, + { + "epoch": 7.422651933701657, + "grad_norm": 0.18088236451148987, + "learning_rate": 1.643575049211131e-05, + "loss": 1.6821, + "step": 24183 + }, + { + "epoch": 7.422958870472683, + "grad_norm": 0.17456914484500885, + "learning_rate": 1.643206648227964e-05, + "loss": 1.7379, + "step": 24184 + }, + { + "epoch": 7.423265807243708, + "grad_norm": 0.18160004913806915, + "learning_rate": 1.642838280418595e-05, + "loss": 1.7364, + "step": 24185 + }, + { + "epoch": 7.4235727440147325, + "grad_norm": 0.18081973493099213, + "learning_rate": 1.6424699457866688e-05, + "loss": 1.7591, + "step": 24186 + }, + { + "epoch": 7.423879680785758, + "grad_norm": 0.20753513276576996, + "learning_rate": 1.6421016443358195e-05, + "loss": 1.7299, + "step": 24187 + }, + { + "epoch": 7.424186617556783, + "grad_norm": 0.2102874517440796, + "learning_rate": 1.641733376069693e-05, + "loss": 1.7876, + "step": 24188 + }, + { + "epoch": 7.4244935543278086, + "grad_norm": 0.19360920786857605, + "learning_rate": 1.6413651409919224e-05, + "loss": 1.7578, + "step": 24189 + }, + { + "epoch": 7.424800491098834, + "grad_norm": 0.1954938918352127, + "learning_rate": 1.6409969391061514e-05, + "loss": 1.7074, + "step": 24190 + }, + { + "epoch": 7.425107427869859, + "grad_norm": 0.2228705734014511, + "learning_rate": 1.6406287704160177e-05, + "loss": 1.7261, + "step": 24191 + }, + { + "epoch": 7.425414364640884, + "grad_norm": 0.18695802986621857, + "learning_rate": 1.6402606349251597e-05, + "loss": 1.7074, + "step": 24192 + }, + { + "epoch": 7.425721301411909, + "grad_norm": 0.19026046991348267, + "learning_rate": 1.639892532637215e-05, + "loss": 1.7546, + "step": 24193 + 
}, + { + "epoch": 7.426028238182934, + "grad_norm": 0.2086167335510254, + "learning_rate": 1.639524463555822e-05, + "loss": 1.7551, + "step": 24194 + }, + { + "epoch": 7.42633517495396, + "grad_norm": 0.201420396566391, + "learning_rate": 1.639156427684618e-05, + "loss": 1.6961, + "step": 24195 + }, + { + "epoch": 7.426642111724985, + "grad_norm": 0.1735599786043167, + "learning_rate": 1.6387884250272394e-05, + "loss": 1.7461, + "step": 24196 + }, + { + "epoch": 7.4269490484960095, + "grad_norm": 0.23944853246212006, + "learning_rate": 1.6384204555873238e-05, + "loss": 1.7001, + "step": 24197 + }, + { + "epoch": 7.427255985267035, + "grad_norm": 0.15605413913726807, + "learning_rate": 1.638052519368508e-05, + "loss": 1.7105, + "step": 24198 + }, + { + "epoch": 7.42756292203806, + "grad_norm": 0.21450987458229065, + "learning_rate": 1.6376846163744257e-05, + "loss": 1.7309, + "step": 24199 + }, + { + "epoch": 7.4278698588090855, + "grad_norm": 0.20542307198047638, + "learning_rate": 1.637316746608718e-05, + "loss": 1.72, + "step": 24200 + }, + { + "epoch": 7.428176795580111, + "grad_norm": 0.18612053990364075, + "learning_rate": 1.6369489100750157e-05, + "loss": 1.6714, + "step": 24201 + }, + { + "epoch": 7.428483732351136, + "grad_norm": 0.16587957739830017, + "learning_rate": 1.6365811067769553e-05, + "loss": 1.7494, + "step": 24202 + }, + { + "epoch": 7.428790669122161, + "grad_norm": 0.247777059674263, + "learning_rate": 1.636213336718172e-05, + "loss": 1.7048, + "step": 24203 + }, + { + "epoch": 7.429097605893186, + "grad_norm": 0.2000289410352707, + "learning_rate": 1.635845599902298e-05, + "loss": 1.7568, + "step": 24204 + }, + { + "epoch": 7.429404542664211, + "grad_norm": 0.21887128055095673, + "learning_rate": 1.6354778963329732e-05, + "loss": 1.6708, + "step": 24205 + }, + { + "epoch": 7.429711479435237, + "grad_norm": 0.18932145833969116, + "learning_rate": 1.6351102260138247e-05, + "loss": 1.7184, + "step": 24206 + }, + { + "epoch": 7.430018416206262, + 
"grad_norm": 0.20103856921195984, + "learning_rate": 1.63474258894849e-05, + "loss": 1.7031, + "step": 24207 + }, + { + "epoch": 7.430325352977286, + "grad_norm": 0.22598737478256226, + "learning_rate": 1.634374985140602e-05, + "loss": 1.7803, + "step": 24208 + }, + { + "epoch": 7.430632289748312, + "grad_norm": 0.22468316555023193, + "learning_rate": 1.6340074145937934e-05, + "loss": 1.7635, + "step": 24209 + }, + { + "epoch": 7.430939226519337, + "grad_norm": 0.16173744201660156, + "learning_rate": 1.6336398773116962e-05, + "loss": 1.6877, + "step": 24210 + }, + { + "epoch": 7.431246163290362, + "grad_norm": 0.17869406938552856, + "learning_rate": 1.6332723732979426e-05, + "loss": 1.6436, + "step": 24211 + }, + { + "epoch": 7.431553100061388, + "grad_norm": 0.1828129142522812, + "learning_rate": 1.6329049025561648e-05, + "loss": 1.7191, + "step": 24212 + }, + { + "epoch": 7.431860036832412, + "grad_norm": 0.19169248640537262, + "learning_rate": 1.6325374650899944e-05, + "loss": 1.7607, + "step": 24213 + }, + { + "epoch": 7.4321669736034375, + "grad_norm": 0.1680343598127365, + "learning_rate": 1.632170060903062e-05, + "loss": 1.6736, + "step": 24214 + }, + { + "epoch": 7.432473910374463, + "grad_norm": 0.20647180080413818, + "learning_rate": 1.6318026899989996e-05, + "loss": 1.7875, + "step": 24215 + }, + { + "epoch": 7.432780847145488, + "grad_norm": 0.29225587844848633, + "learning_rate": 1.6314353523814352e-05, + "loss": 1.8164, + "step": 24216 + }, + { + "epoch": 7.4330877839165135, + "grad_norm": 0.1633446216583252, + "learning_rate": 1.6310680480540048e-05, + "loss": 1.6529, + "step": 24217 + }, + { + "epoch": 7.433394720687538, + "grad_norm": 0.21215081214904785, + "learning_rate": 1.6307007770203326e-05, + "loss": 1.6323, + "step": 24218 + }, + { + "epoch": 7.433701657458563, + "grad_norm": 0.1934979110956192, + "learning_rate": 1.63033353928405e-05, + "loss": 1.7299, + "step": 24219 + }, + { + "epoch": 7.434008594229589, + "grad_norm": 
0.2581390142440796, + "learning_rate": 1.6299663348487865e-05, + "loss": 1.7308, + "step": 24220 + }, + { + "epoch": 7.434315531000614, + "grad_norm": 0.2711075246334076, + "learning_rate": 1.629599163718169e-05, + "loss": 1.8736, + "step": 24221 + }, + { + "epoch": 7.434622467771639, + "grad_norm": 0.2620790898799896, + "learning_rate": 1.6292320258958316e-05, + "loss": 1.7326, + "step": 24222 + }, + { + "epoch": 7.434929404542665, + "grad_norm": 0.16254334151744843, + "learning_rate": 1.6288649213853958e-05, + "loss": 1.6996, + "step": 24223 + }, + { + "epoch": 7.435236341313689, + "grad_norm": 0.22968515753746033, + "learning_rate": 1.628497850190496e-05, + "loss": 1.694, + "step": 24224 + }, + { + "epoch": 7.435543278084714, + "grad_norm": 0.20458953082561493, + "learning_rate": 1.6281308123147533e-05, + "loss": 1.7558, + "step": 24225 + }, + { + "epoch": 7.43585021485574, + "grad_norm": 0.2327413409948349, + "learning_rate": 1.6277638077617995e-05, + "loss": 1.7581, + "step": 24226 + }, + { + "epoch": 7.436157151626765, + "grad_norm": 0.18312111496925354, + "learning_rate": 1.6273968365352604e-05, + "loss": 1.6713, + "step": 24227 + }, + { + "epoch": 7.43646408839779, + "grad_norm": 0.15935418009757996, + "learning_rate": 1.6270298986387628e-05, + "loss": 1.6996, + "step": 24228 + }, + { + "epoch": 7.436771025168815, + "grad_norm": 0.17424416542053223, + "learning_rate": 1.6266629940759322e-05, + "loss": 1.6826, + "step": 24229 + }, + { + "epoch": 7.43707796193984, + "grad_norm": 0.18982923030853271, + "learning_rate": 1.6262961228503953e-05, + "loss": 1.741, + "step": 24230 + }, + { + "epoch": 7.4373848987108655, + "grad_norm": 0.16608789563179016, + "learning_rate": 1.6259292849657777e-05, + "loss": 1.7205, + "step": 24231 + }, + { + "epoch": 7.437691835481891, + "grad_norm": 0.19830825924873352, + "learning_rate": 1.625562480425704e-05, + "loss": 1.7159, + "step": 24232 + }, + { + "epoch": 7.437998772252916, + "grad_norm": 0.1889072209596634, + 
"learning_rate": 1.6251957092337988e-05, + "loss": 1.7427, + "step": 24233 + }, + { + "epoch": 7.4383057090239415, + "grad_norm": 0.18454046547412872, + "learning_rate": 1.6248289713936903e-05, + "loss": 1.6962, + "step": 24234 + }, + { + "epoch": 7.438612645794966, + "grad_norm": 0.20041033625602722, + "learning_rate": 1.6244622669089987e-05, + "loss": 1.7763, + "step": 24235 + }, + { + "epoch": 7.438919582565991, + "grad_norm": 0.17226676642894745, + "learning_rate": 1.62409559578335e-05, + "loss": 1.6783, + "step": 24236 + }, + { + "epoch": 7.439226519337017, + "grad_norm": 0.1761687994003296, + "learning_rate": 1.6237289580203662e-05, + "loss": 1.6761, + "step": 24237 + }, + { + "epoch": 7.439533456108042, + "grad_norm": 0.24213027954101562, + "learning_rate": 1.6233623536236707e-05, + "loss": 1.724, + "step": 24238 + }, + { + "epoch": 7.439840392879067, + "grad_norm": 0.15541739761829376, + "learning_rate": 1.6229957825968913e-05, + "loss": 1.6594, + "step": 24239 + }, + { + "epoch": 7.440147329650092, + "grad_norm": 0.20755749940872192, + "learning_rate": 1.622629244943643e-05, + "loss": 1.7229, + "step": 24240 + }, + { + "epoch": 7.440454266421117, + "grad_norm": 0.20716612040996552, + "learning_rate": 1.6222627406675555e-05, + "loss": 1.699, + "step": 24241 + }, + { + "epoch": 7.440761203192142, + "grad_norm": 0.17423541843891144, + "learning_rate": 1.621896269772244e-05, + "loss": 1.7175, + "step": 24242 + }, + { + "epoch": 7.441068139963168, + "grad_norm": 0.17913730442523956, + "learning_rate": 1.6215298322613347e-05, + "loss": 1.7287, + "step": 24243 + }, + { + "epoch": 7.441375076734193, + "grad_norm": 0.21801607310771942, + "learning_rate": 1.6211634281384486e-05, + "loss": 1.8157, + "step": 24244 + }, + { + "epoch": 7.4416820135052175, + "grad_norm": 0.23132582008838654, + "learning_rate": 1.6207970574072056e-05, + "loss": 1.7921, + "step": 24245 + }, + { + "epoch": 7.441988950276243, + "grad_norm": 0.18289685249328613, + "learning_rate": 
1.6204307200712266e-05, + "loss": 1.7222, + "step": 24246 + }, + { + "epoch": 7.442295887047268, + "grad_norm": 0.15289388597011566, + "learning_rate": 1.620064416134132e-05, + "loss": 1.6409, + "step": 24247 + }, + { + "epoch": 7.4426028238182935, + "grad_norm": 0.1684839129447937, + "learning_rate": 1.619698145599542e-05, + "loss": 1.7362, + "step": 24248 + }, + { + "epoch": 7.442909760589319, + "grad_norm": 0.16812102496623993, + "learning_rate": 1.619331908471076e-05, + "loss": 1.6849, + "step": 24249 + }, + { + "epoch": 7.443216697360343, + "grad_norm": 0.16095775365829468, + "learning_rate": 1.6189657047523526e-05, + "loss": 1.7032, + "step": 24250 + }, + { + "epoch": 7.443523634131369, + "grad_norm": 0.167144313454628, + "learning_rate": 1.6185995344469946e-05, + "loss": 1.6539, + "step": 24251 + }, + { + "epoch": 7.443830570902394, + "grad_norm": 0.18129989504814148, + "learning_rate": 1.618233397558616e-05, + "loss": 1.7057, + "step": 24252 + }, + { + "epoch": 7.444137507673419, + "grad_norm": 0.17299556732177734, + "learning_rate": 1.6178672940908374e-05, + "loss": 1.6965, + "step": 24253 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 0.14944438636302948, + "learning_rate": 1.6175012240472765e-05, + "loss": 1.6666, + "step": 24254 + }, + { + "epoch": 7.44475138121547, + "grad_norm": 0.20333626866340637, + "learning_rate": 1.6171351874315494e-05, + "loss": 1.748, + "step": 24255 + }, + { + "epoch": 7.445058317986494, + "grad_norm": 0.2233068197965622, + "learning_rate": 1.6167691842472783e-05, + "loss": 1.7662, + "step": 24256 + }, + { + "epoch": 7.44536525475752, + "grad_norm": 0.22628507018089294, + "learning_rate": 1.6164032144980738e-05, + "loss": 1.747, + "step": 24257 + }, + { + "epoch": 7.445672191528545, + "grad_norm": 0.18167820572853088, + "learning_rate": 1.6160372781875594e-05, + "loss": 1.7311, + "step": 24258 + }, + { + "epoch": 7.44597912829957, + "grad_norm": 0.1975218504667282, + "learning_rate": 1.6156713753193446e-05, + "loss": 
1.7334, + "step": 24259 + }, + { + "epoch": 7.446286065070596, + "grad_norm": 0.18606813251972198, + "learning_rate": 1.6153055058970508e-05, + "loss": 1.7118, + "step": 24260 + }, + { + "epoch": 7.44659300184162, + "grad_norm": 0.14817847311496735, + "learning_rate": 1.6149396699242914e-05, + "loss": 1.6385, + "step": 24261 + }, + { + "epoch": 7.4468999386126455, + "grad_norm": 0.19018684327602386, + "learning_rate": 1.6145738674046825e-05, + "loss": 1.7511, + "step": 24262 + }, + { + "epoch": 7.447206875383671, + "grad_norm": 0.17089374363422394, + "learning_rate": 1.6142080983418385e-05, + "loss": 1.7523, + "step": 24263 + }, + { + "epoch": 7.447513812154696, + "grad_norm": 0.16370832920074463, + "learning_rate": 1.613842362739375e-05, + "loss": 1.6636, + "step": 24264 + }, + { + "epoch": 7.4478207489257215, + "grad_norm": 0.16432829201221466, + "learning_rate": 1.6134766606009055e-05, + "loss": 1.7355, + "step": 24265 + }, + { + "epoch": 7.448127685696747, + "grad_norm": 0.15270906686782837, + "learning_rate": 1.6131109919300453e-05, + "loss": 1.7169, + "step": 24266 + }, + { + "epoch": 7.448434622467771, + "grad_norm": 0.14986950159072876, + "learning_rate": 1.6127453567304053e-05, + "loss": 1.7021, + "step": 24267 + }, + { + "epoch": 7.448741559238797, + "grad_norm": 0.17727383971214294, + "learning_rate": 1.6123797550056042e-05, + "loss": 1.7144, + "step": 24268 + }, + { + "epoch": 7.449048496009822, + "grad_norm": 0.1471523940563202, + "learning_rate": 1.6120141867592504e-05, + "loss": 1.694, + "step": 24269 + }, + { + "epoch": 7.449355432780847, + "grad_norm": 0.15561319887638092, + "learning_rate": 1.611648651994958e-05, + "loss": 1.6672, + "step": 24270 + }, + { + "epoch": 7.449662369551873, + "grad_norm": 0.19121745228767395, + "learning_rate": 1.61128315071634e-05, + "loss": 1.7317, + "step": 24271 + }, + { + "epoch": 7.449969306322897, + "grad_norm": 0.27333202958106995, + "learning_rate": 1.6109176829270062e-05, + "loss": 1.7943, + "step": 24272 + }, 
+ { + "epoch": 7.4502762430939224, + "grad_norm": 0.16996058821678162, + "learning_rate": 1.6105522486305736e-05, + "loss": 1.6883, + "step": 24273 + }, + { + "epoch": 7.450583179864948, + "grad_norm": 0.17687207460403442, + "learning_rate": 1.610186847830647e-05, + "loss": 1.6967, + "step": 24274 + }, + { + "epoch": 7.450890116635973, + "grad_norm": 0.2191249281167984, + "learning_rate": 1.6098214805308436e-05, + "loss": 1.7644, + "step": 24275 + }, + { + "epoch": 7.4511970534069984, + "grad_norm": 0.17267808318138123, + "learning_rate": 1.6094561467347684e-05, + "loss": 1.6963, + "step": 24276 + }, + { + "epoch": 7.451503990178024, + "grad_norm": 0.16276031732559204, + "learning_rate": 1.609090846446037e-05, + "loss": 1.6795, + "step": 24277 + }, + { + "epoch": 7.451810926949048, + "grad_norm": 0.16677677631378174, + "learning_rate": 1.6087255796682572e-05, + "loss": 1.699, + "step": 24278 + }, + { + "epoch": 7.452117863720074, + "grad_norm": 0.17163679003715515, + "learning_rate": 1.6083603464050383e-05, + "loss": 1.6906, + "step": 24279 + }, + { + "epoch": 7.452424800491099, + "grad_norm": 0.16087757050991058, + "learning_rate": 1.6079951466599908e-05, + "loss": 1.7173, + "step": 24280 + }, + { + "epoch": 7.452731737262124, + "grad_norm": 0.19389556348323822, + "learning_rate": 1.6076299804367228e-05, + "loss": 1.6985, + "step": 24281 + }, + { + "epoch": 7.45303867403315, + "grad_norm": 0.20400559902191162, + "learning_rate": 1.6072648477388447e-05, + "loss": 1.7336, + "step": 24282 + }, + { + "epoch": 7.453345610804174, + "grad_norm": 0.16443994641304016, + "learning_rate": 1.6068997485699632e-05, + "loss": 1.6909, + "step": 24283 + }, + { + "epoch": 7.453652547575199, + "grad_norm": 0.18333028256893158, + "learning_rate": 1.606534682933686e-05, + "loss": 1.6749, + "step": 24284 + }, + { + "epoch": 7.453959484346225, + "grad_norm": 0.21596840023994446, + "learning_rate": 1.6061696508336244e-05, + "loss": 1.7856, + "step": 24285 + }, + { + "epoch": 
7.45426642111725, + "grad_norm": 0.18656609952449799, + "learning_rate": 1.6058046522733827e-05, + "loss": 1.6892, + "step": 24286 + }, + { + "epoch": 7.454573357888275, + "grad_norm": 0.18110665678977966, + "learning_rate": 1.6054396872565687e-05, + "loss": 1.7063, + "step": 24287 + }, + { + "epoch": 7.4548802946593, + "grad_norm": 0.19452248513698578, + "learning_rate": 1.605074755786789e-05, + "loss": 1.7637, + "step": 24288 + }, + { + "epoch": 7.455187231430325, + "grad_norm": 0.18945640325546265, + "learning_rate": 1.604709857867649e-05, + "loss": 1.7498, + "step": 24289 + }, + { + "epoch": 7.4554941682013505, + "grad_norm": 0.1847696155309677, + "learning_rate": 1.6043449935027592e-05, + "loss": 1.702, + "step": 24290 + }, + { + "epoch": 7.455801104972376, + "grad_norm": 0.18882444500923157, + "learning_rate": 1.6039801626957197e-05, + "loss": 1.728, + "step": 24291 + }, + { + "epoch": 7.456108041743401, + "grad_norm": 0.1981150358915329, + "learning_rate": 1.603615365450142e-05, + "loss": 1.7114, + "step": 24292 + }, + { + "epoch": 7.456414978514426, + "grad_norm": 0.2305375188589096, + "learning_rate": 1.6032506017696242e-05, + "loss": 1.7234, + "step": 24293 + }, + { + "epoch": 7.456721915285451, + "grad_norm": 0.17539730668067932, + "learning_rate": 1.6028858716577764e-05, + "loss": 1.6305, + "step": 24294 + }, + { + "epoch": 7.457028852056476, + "grad_norm": 0.19684432446956635, + "learning_rate": 1.602521175118202e-05, + "loss": 1.6958, + "step": 24295 + }, + { + "epoch": 7.457335788827502, + "grad_norm": 0.20957234501838684, + "learning_rate": 1.602156512154504e-05, + "loss": 1.6984, + "step": 24296 + }, + { + "epoch": 7.457642725598527, + "grad_norm": 0.18523702025413513, + "learning_rate": 1.6017918827702877e-05, + "loss": 1.7817, + "step": 24297 + }, + { + "epoch": 7.457949662369552, + "grad_norm": 0.1964758187532425, + "learning_rate": 1.601427286969155e-05, + "loss": 1.7597, + "step": 24298 + }, + { + "epoch": 7.458256599140577, + "grad_norm": 
0.199961856007576, + "learning_rate": 1.6010627247547106e-05, + "loss": 1.6988, + "step": 24299 + }, + { + "epoch": 7.458563535911602, + "grad_norm": 0.16149461269378662, + "learning_rate": 1.6006981961305555e-05, + "loss": 1.6673, + "step": 24300 + }, + { + "epoch": 7.458870472682627, + "grad_norm": 0.2198258489370346, + "learning_rate": 1.600333701100293e-05, + "loss": 1.7159, + "step": 24301 + }, + { + "epoch": 7.459177409453653, + "grad_norm": 0.157994344830513, + "learning_rate": 1.5999692396675277e-05, + "loss": 1.7118, + "step": 24302 + }, + { + "epoch": 7.459484346224678, + "grad_norm": 0.21911758184432983, + "learning_rate": 1.5996048118358575e-05, + "loss": 1.7209, + "step": 24303 + }, + { + "epoch": 7.4597912829957025, + "grad_norm": 0.20648738741874695, + "learning_rate": 1.599240417608886e-05, + "loss": 1.7844, + "step": 24304 + }, + { + "epoch": 7.460098219766728, + "grad_norm": 0.18746837973594666, + "learning_rate": 1.598876056990214e-05, + "loss": 1.7079, + "step": 24305 + }, + { + "epoch": 7.460405156537753, + "grad_norm": 0.17767341434955597, + "learning_rate": 1.5985117299834407e-05, + "loss": 1.7579, + "step": 24306 + }, + { + "epoch": 7.4607120933087785, + "grad_norm": 0.18997585773468018, + "learning_rate": 1.598147436592171e-05, + "loss": 1.7556, + "step": 24307 + }, + { + "epoch": 7.461019030079804, + "grad_norm": 0.19356711208820343, + "learning_rate": 1.597783176819999e-05, + "loss": 1.7315, + "step": 24308 + }, + { + "epoch": 7.461325966850829, + "grad_norm": 0.23354102671146393, + "learning_rate": 1.597418950670531e-05, + "loss": 1.7622, + "step": 24309 + }, + { + "epoch": 7.461632903621854, + "grad_norm": 0.18773409724235535, + "learning_rate": 1.5970547581473604e-05, + "loss": 1.6582, + "step": 24310 + }, + { + "epoch": 7.461939840392879, + "grad_norm": 0.23704196512699127, + "learning_rate": 1.596690599254091e-05, + "loss": 1.7207, + "step": 24311 + }, + { + "epoch": 7.462246777163904, + "grad_norm": 0.1943788379430771, + 
"learning_rate": 1.596326473994319e-05, + "loss": 1.696, + "step": 24312 + }, + { + "epoch": 7.46255371393493, + "grad_norm": 0.22303985059261322, + "learning_rate": 1.595962382371644e-05, + "loss": 1.6963, + "step": 24313 + }, + { + "epoch": 7.462860650705955, + "grad_norm": 0.20158524811267853, + "learning_rate": 1.5955983243896643e-05, + "loss": 1.7017, + "step": 24314 + }, + { + "epoch": 7.463167587476979, + "grad_norm": 0.18768194317817688, + "learning_rate": 1.595234300051977e-05, + "loss": 1.6743, + "step": 24315 + }, + { + "epoch": 7.463474524248005, + "grad_norm": 0.27407020330429077, + "learning_rate": 1.5948703093621803e-05, + "loss": 1.7522, + "step": 24316 + }, + { + "epoch": 7.46378146101903, + "grad_norm": 0.2027997523546219, + "learning_rate": 1.5945063523238706e-05, + "loss": 1.7515, + "step": 24317 + }, + { + "epoch": 7.464088397790055, + "grad_norm": 0.2728271782398224, + "learning_rate": 1.5941424289406454e-05, + "loss": 1.7611, + "step": 24318 + }, + { + "epoch": 7.464395334561081, + "grad_norm": 0.1704578548669815, + "learning_rate": 1.593778539216101e-05, + "loss": 1.6602, + "step": 24319 + }, + { + "epoch": 7.464702271332105, + "grad_norm": 0.19684311747550964, + "learning_rate": 1.5934146831538332e-05, + "loss": 1.6824, + "step": 24320 + }, + { + "epoch": 7.4650092081031305, + "grad_norm": 0.196905255317688, + "learning_rate": 1.5930508607574386e-05, + "loss": 1.691, + "step": 24321 + }, + { + "epoch": 7.465316144874156, + "grad_norm": 0.18543855845928192, + "learning_rate": 1.5926870720305122e-05, + "loss": 1.6936, + "step": 24322 + }, + { + "epoch": 7.465623081645181, + "grad_norm": 0.24634000658988953, + "learning_rate": 1.592323316976647e-05, + "loss": 1.6857, + "step": 24323 + }, + { + "epoch": 7.4659300184162065, + "grad_norm": 0.1976090669631958, + "learning_rate": 1.5919595955994444e-05, + "loss": 1.7248, + "step": 24324 + }, + { + "epoch": 7.466236955187231, + "grad_norm": 0.21902409195899963, + "learning_rate": 
1.5915959079024907e-05, + "loss": 1.7184, + "step": 24325 + }, + { + "epoch": 7.466543891958256, + "grad_norm": 0.14501455426216125, + "learning_rate": 1.591232253889387e-05, + "loss": 1.6351, + "step": 24326 + }, + { + "epoch": 7.466850828729282, + "grad_norm": 0.20591090619564056, + "learning_rate": 1.5908686335637213e-05, + "loss": 1.7188, + "step": 24327 + }, + { + "epoch": 7.467157765500307, + "grad_norm": 0.17669445276260376, + "learning_rate": 1.590505046929091e-05, + "loss": 1.6735, + "step": 24328 + }, + { + "epoch": 7.467464702271332, + "grad_norm": 0.19642697274684906, + "learning_rate": 1.590141493989089e-05, + "loss": 1.6599, + "step": 24329 + }, + { + "epoch": 7.467771639042358, + "grad_norm": 0.2049490511417389, + "learning_rate": 1.589777974747307e-05, + "loss": 1.77, + "step": 24330 + }, + { + "epoch": 7.468078575813382, + "grad_norm": 0.1877276450395584, + "learning_rate": 1.5894144892073377e-05, + "loss": 1.6774, + "step": 24331 + }, + { + "epoch": 7.468385512584407, + "grad_norm": 0.18437768518924713, + "learning_rate": 1.5890510373727735e-05, + "loss": 1.7054, + "step": 24332 + }, + { + "epoch": 7.468692449355433, + "grad_norm": 0.1850978136062622, + "learning_rate": 1.5886876192472062e-05, + "loss": 1.6664, + "step": 24333 + }, + { + "epoch": 7.468999386126458, + "grad_norm": 0.16257111728191376, + "learning_rate": 1.588324234834227e-05, + "loss": 1.7438, + "step": 24334 + }, + { + "epoch": 7.469306322897483, + "grad_norm": 0.1776656061410904, + "learning_rate": 1.5879608841374277e-05, + "loss": 1.6913, + "step": 24335 + }, + { + "epoch": 7.469613259668508, + "grad_norm": 0.183144673705101, + "learning_rate": 1.587597567160398e-05, + "loss": 1.6737, + "step": 24336 + }, + { + "epoch": 7.469920196439533, + "grad_norm": 0.15030701458454132, + "learning_rate": 1.5872342839067306e-05, + "loss": 1.6776, + "step": 24337 + }, + { + "epoch": 7.4702271332105585, + "grad_norm": 0.1987701952457428, + "learning_rate": 1.586871034380013e-05, + "loss": 
1.7119, + "step": 24338 + }, + { + "epoch": 7.470534069981584, + "grad_norm": 0.20000997185707092, + "learning_rate": 1.5865078185838373e-05, + "loss": 1.6794, + "step": 24339 + }, + { + "epoch": 7.470841006752609, + "grad_norm": 0.1674201786518097, + "learning_rate": 1.5861446365217902e-05, + "loss": 1.6826, + "step": 24340 + }, + { + "epoch": 7.4711479435236345, + "grad_norm": 0.22385969758033752, + "learning_rate": 1.585781488197466e-05, + "loss": 1.7012, + "step": 24341 + }, + { + "epoch": 7.471454880294659, + "grad_norm": 0.18635201454162598, + "learning_rate": 1.585418373614446e-05, + "loss": 1.7086, + "step": 24342 + }, + { + "epoch": 7.471761817065684, + "grad_norm": 0.17345300316810608, + "learning_rate": 1.5850552927763274e-05, + "loss": 1.7068, + "step": 24343 + }, + { + "epoch": 7.47206875383671, + "grad_norm": 0.1777433305978775, + "learning_rate": 1.5846922456866904e-05, + "loss": 1.6618, + "step": 24344 + }, + { + "epoch": 7.472375690607735, + "grad_norm": 0.1821276843547821, + "learning_rate": 1.584329232349128e-05, + "loss": 1.7451, + "step": 24345 + }, + { + "epoch": 7.47268262737876, + "grad_norm": 0.1714404970407486, + "learning_rate": 1.5839662527672262e-05, + "loss": 1.7289, + "step": 24346 + }, + { + "epoch": 7.472989564149785, + "grad_norm": 0.159423828125, + "learning_rate": 1.583603306944572e-05, + "loss": 1.667, + "step": 24347 + }, + { + "epoch": 7.47329650092081, + "grad_norm": 0.22563552856445312, + "learning_rate": 1.5832403948847523e-05, + "loss": 1.7755, + "step": 24348 + }, + { + "epoch": 7.473603437691835, + "grad_norm": 0.17239433526992798, + "learning_rate": 1.582877516591354e-05, + "loss": 1.6577, + "step": 24349 + }, + { + "epoch": 7.473910374462861, + "grad_norm": 0.1671951860189438, + "learning_rate": 1.5825146720679624e-05, + "loss": 1.7438, + "step": 24350 + }, + { + "epoch": 7.474217311233886, + "grad_norm": 0.1802397519350052, + "learning_rate": 1.582151861318164e-05, + "loss": 1.686, + "step": 24351 + }, + { + "epoch": 
7.474524248004911, + "grad_norm": 0.21424922347068787, + "learning_rate": 1.5817890843455442e-05, + "loss": 1.7871, + "step": 24352 + }, + { + "epoch": 7.474831184775936, + "grad_norm": 0.2275305986404419, + "learning_rate": 1.5814263411536884e-05, + "loss": 1.7461, + "step": 24353 + }, + { + "epoch": 7.475138121546961, + "grad_norm": 0.1682458072900772, + "learning_rate": 1.581063631746181e-05, + "loss": 1.6362, + "step": 24354 + }, + { + "epoch": 7.475445058317987, + "grad_norm": 0.165358304977417, + "learning_rate": 1.5807009561266068e-05, + "loss": 1.7057, + "step": 24355 + }, + { + "epoch": 7.475751995089012, + "grad_norm": 0.18032164871692657, + "learning_rate": 1.5803383142985496e-05, + "loss": 1.7645, + "step": 24356 + }, + { + "epoch": 7.476058931860037, + "grad_norm": 0.1694670170545578, + "learning_rate": 1.5799757062655935e-05, + "loss": 1.6848, + "step": 24357 + }, + { + "epoch": 7.476365868631062, + "grad_norm": 0.17879679799079895, + "learning_rate": 1.5796131320313225e-05, + "loss": 1.7425, + "step": 24358 + }, + { + "epoch": 7.476672805402087, + "grad_norm": 0.16042493283748627, + "learning_rate": 1.579250591599317e-05, + "loss": 1.6389, + "step": 24359 + }, + { + "epoch": 7.476979742173112, + "grad_norm": 0.19134685397148132, + "learning_rate": 1.5788880849731658e-05, + "loss": 1.7504, + "step": 24360 + }, + { + "epoch": 7.477286678944138, + "grad_norm": 0.16545429825782776, + "learning_rate": 1.578525612156444e-05, + "loss": 1.7184, + "step": 24361 + }, + { + "epoch": 7.477593615715163, + "grad_norm": 0.18139231204986572, + "learning_rate": 1.5781631731527397e-05, + "loss": 1.6794, + "step": 24362 + }, + { + "epoch": 7.4779005524861875, + "grad_norm": 0.19043901562690735, + "learning_rate": 1.5778007679656326e-05, + "loss": 1.7184, + "step": 24363 + }, + { + "epoch": 7.478207489257213, + "grad_norm": 0.19410157203674316, + "learning_rate": 1.577438396598703e-05, + "loss": 1.7599, + "step": 24364 + }, + { + "epoch": 7.478514426028238, + 
"grad_norm": 0.18464741110801697, + "learning_rate": 1.5770760590555344e-05, + "loss": 1.652, + "step": 24365 + }, + { + "epoch": 7.4788213627992635, + "grad_norm": 0.19959059357643127, + "learning_rate": 1.576713755339706e-05, + "loss": 1.7509, + "step": 24366 + }, + { + "epoch": 7.479128299570289, + "grad_norm": 0.20312312245368958, + "learning_rate": 1.576351485454799e-05, + "loss": 1.758, + "step": 24367 + }, + { + "epoch": 7.479435236341313, + "grad_norm": 0.23994365334510803, + "learning_rate": 1.5759892494043933e-05, + "loss": 1.7124, + "step": 24368 + }, + { + "epoch": 7.479742173112339, + "grad_norm": 0.22661323845386505, + "learning_rate": 1.575627047192068e-05, + "loss": 1.7251, + "step": 24369 + }, + { + "epoch": 7.480049109883364, + "grad_norm": 0.2599529027938843, + "learning_rate": 1.5752648788214038e-05, + "loss": 1.7351, + "step": 24370 + }, + { + "epoch": 7.480356046654389, + "grad_norm": 0.17298145592212677, + "learning_rate": 1.5749027442959795e-05, + "loss": 1.681, + "step": 24371 + }, + { + "epoch": 7.480662983425415, + "grad_norm": 0.18189257383346558, + "learning_rate": 1.574540643619373e-05, + "loss": 1.6938, + "step": 24372 + }, + { + "epoch": 7.48096992019644, + "grad_norm": 0.2658606767654419, + "learning_rate": 1.5741785767951645e-05, + "loss": 1.7043, + "step": 24373 + }, + { + "epoch": 7.481276856967464, + "grad_norm": 0.17898595333099365, + "learning_rate": 1.573816543826931e-05, + "loss": 1.7299, + "step": 24374 + }, + { + "epoch": 7.48158379373849, + "grad_norm": 0.2529693841934204, + "learning_rate": 1.573454544718251e-05, + "loss": 1.6378, + "step": 24375 + }, + { + "epoch": 7.481890730509515, + "grad_norm": 0.1542833298444748, + "learning_rate": 1.5730925794726993e-05, + "loss": 1.6847, + "step": 24376 + }, + { + "epoch": 7.48219766728054, + "grad_norm": 0.24731594324111938, + "learning_rate": 1.5727306480938586e-05, + "loss": 1.7028, + "step": 24377 + }, + { + "epoch": 7.482504604051566, + "grad_norm": 0.21095556020736694, + 
"learning_rate": 1.572368750585299e-05, + "loss": 1.7371, + "step": 24378 + }, + { + "epoch": 7.48281154082259, + "grad_norm": 0.24208855628967285, + "learning_rate": 1.5720068869506037e-05, + "loss": 1.7982, + "step": 24379 + }, + { + "epoch": 7.4831184775936155, + "grad_norm": 0.23290614783763885, + "learning_rate": 1.571645057193343e-05, + "loss": 1.7443, + "step": 24380 + }, + { + "epoch": 7.483425414364641, + "grad_norm": 0.2146376222372055, + "learning_rate": 1.5712832613170963e-05, + "loss": 1.7258, + "step": 24381 + }, + { + "epoch": 7.483732351135666, + "grad_norm": 0.20540264248847961, + "learning_rate": 1.5709214993254385e-05, + "loss": 1.6495, + "step": 24382 + }, + { + "epoch": 7.4840392879066915, + "grad_norm": 0.16472755372524261, + "learning_rate": 1.570559771221944e-05, + "loss": 1.7118, + "step": 24383 + }, + { + "epoch": 7.484346224677717, + "grad_norm": 0.194668248295784, + "learning_rate": 1.5701980770101876e-05, + "loss": 1.6948, + "step": 24384 + }, + { + "epoch": 7.484653161448741, + "grad_norm": 0.19188909232616425, + "learning_rate": 1.569836416693744e-05, + "loss": 1.7376, + "step": 24385 + }, + { + "epoch": 7.484960098219767, + "grad_norm": 0.1935901939868927, + "learning_rate": 1.569474790276188e-05, + "loss": 1.7009, + "step": 24386 + }, + { + "epoch": 7.485267034990792, + "grad_norm": 0.18449221551418304, + "learning_rate": 1.5691131977610924e-05, + "loss": 1.7542, + "step": 24387 + }, + { + "epoch": 7.485573971761817, + "grad_norm": 0.18543820083141327, + "learning_rate": 1.568751639152031e-05, + "loss": 1.7125, + "step": 24388 + }, + { + "epoch": 7.485880908532843, + "grad_norm": 0.17343461513519287, + "learning_rate": 1.5683901144525776e-05, + "loss": 1.7189, + "step": 24389 + }, + { + "epoch": 7.486187845303867, + "grad_norm": 0.16813276708126068, + "learning_rate": 1.568028623666304e-05, + "loss": 1.6416, + "step": 24390 + }, + { + "epoch": 7.486494782074892, + "grad_norm": 0.16296882927417755, + "learning_rate": 
1.567667166796783e-05, + "loss": 1.6971, + "step": 24391 + }, + { + "epoch": 7.486801718845918, + "grad_norm": 0.206793412566185, + "learning_rate": 1.5673057438475875e-05, + "loss": 1.8139, + "step": 24392 + }, + { + "epoch": 7.487108655616943, + "grad_norm": 0.1937340795993805, + "learning_rate": 1.566944354822286e-05, + "loss": 1.7606, + "step": 24393 + }, + { + "epoch": 7.487415592387968, + "grad_norm": 0.19251857697963715, + "learning_rate": 1.566582999724456e-05, + "loss": 1.7225, + "step": 24394 + }, + { + "epoch": 7.487722529158993, + "grad_norm": 0.1551857739686966, + "learning_rate": 1.566221678557663e-05, + "loss": 1.6546, + "step": 24395 + }, + { + "epoch": 7.488029465930018, + "grad_norm": 0.19435563683509827, + "learning_rate": 1.565860391325482e-05, + "loss": 1.7444, + "step": 24396 + }, + { + "epoch": 7.4883364027010435, + "grad_norm": 0.21196971833705902, + "learning_rate": 1.565499138031479e-05, + "loss": 1.7124, + "step": 24397 + }, + { + "epoch": 7.488643339472069, + "grad_norm": 0.2145242542028427, + "learning_rate": 1.5651379186792276e-05, + "loss": 1.7571, + "step": 24398 + }, + { + "epoch": 7.488950276243094, + "grad_norm": 0.17056338489055634, + "learning_rate": 1.5647767332722964e-05, + "loss": 1.6514, + "step": 24399 + }, + { + "epoch": 7.4892572130141195, + "grad_norm": 0.17161786556243896, + "learning_rate": 1.5644155818142553e-05, + "loss": 1.675, + "step": 24400 + }, + { + "epoch": 7.489564149785144, + "grad_norm": 0.18978877365589142, + "learning_rate": 1.564054464308673e-05, + "loss": 1.7123, + "step": 24401 + }, + { + "epoch": 7.489871086556169, + "grad_norm": 0.16004881262779236, + "learning_rate": 1.5636933807591186e-05, + "loss": 1.6555, + "step": 24402 + }, + { + "epoch": 7.490178023327195, + "grad_norm": 0.19739225506782532, + "learning_rate": 1.56333233116916e-05, + "loss": 1.7441, + "step": 24403 + }, + { + "epoch": 7.49048496009822, + "grad_norm": 0.20770032703876495, + "learning_rate": 1.5629713155423657e-05, + "loss": 
1.6704, + "step": 24404 + }, + { + "epoch": 7.490791896869245, + "grad_norm": 0.17897675931453705, + "learning_rate": 1.5626103338823033e-05, + "loss": 1.7281, + "step": 24405 + }, + { + "epoch": 7.49109883364027, + "grad_norm": 0.20801669359207153, + "learning_rate": 1.5622493861925402e-05, + "loss": 1.7008, + "step": 24406 + }, + { + "epoch": 7.491405770411295, + "grad_norm": 0.2027266025543213, + "learning_rate": 1.5618884724766442e-05, + "loss": 1.7619, + "step": 24407 + }, + { + "epoch": 7.49171270718232, + "grad_norm": 0.19207318127155304, + "learning_rate": 1.5615275927381806e-05, + "loss": 1.6985, + "step": 24408 + }, + { + "epoch": 7.492019643953346, + "grad_norm": 0.19694732129573822, + "learning_rate": 1.5611667469807175e-05, + "loss": 1.7455, + "step": 24409 + }, + { + "epoch": 7.492326580724371, + "grad_norm": 0.170238196849823, + "learning_rate": 1.560805935207818e-05, + "loss": 1.7179, + "step": 24410 + }, + { + "epoch": 7.4926335174953955, + "grad_norm": 0.16890759766101837, + "learning_rate": 1.5604451574230532e-05, + "loss": 1.7323, + "step": 24411 + }, + { + "epoch": 7.492940454266421, + "grad_norm": 0.18043142557144165, + "learning_rate": 1.5600844136299824e-05, + "loss": 1.6958, + "step": 24412 + }, + { + "epoch": 7.493247391037446, + "grad_norm": 0.23966364562511444, + "learning_rate": 1.5597237038321764e-05, + "loss": 1.754, + "step": 24413 + }, + { + "epoch": 7.4935543278084715, + "grad_norm": 0.23342584073543549, + "learning_rate": 1.5593630280331945e-05, + "loss": 1.8008, + "step": 24414 + }, + { + "epoch": 7.493861264579497, + "grad_norm": 0.17365418374538422, + "learning_rate": 1.5590023862366054e-05, + "loss": 1.7166, + "step": 24415 + }, + { + "epoch": 7.494168201350522, + "grad_norm": 0.1934911608695984, + "learning_rate": 1.558641778445971e-05, + "loss": 1.7113, + "step": 24416 + }, + { + "epoch": 7.494475138121547, + "grad_norm": 0.1935805231332779, + "learning_rate": 1.558281204664856e-05, + "loss": 1.7549, + "step": 24417 + }, + { 
+ "epoch": 7.494782074892572, + "grad_norm": 0.18467992544174194, + "learning_rate": 1.5579206648968236e-05, + "loss": 1.6889, + "step": 24418 + }, + { + "epoch": 7.495089011663597, + "grad_norm": 0.17173317074775696, + "learning_rate": 1.5575601591454365e-05, + "loss": 1.686, + "step": 24419 + }, + { + "epoch": 7.495395948434623, + "grad_norm": 0.1706855744123459, + "learning_rate": 1.5571996874142574e-05, + "loss": 1.6747, + "step": 24420 + }, + { + "epoch": 7.495702885205648, + "grad_norm": 0.2233184576034546, + "learning_rate": 1.556839249706849e-05, + "loss": 1.7855, + "step": 24421 + }, + { + "epoch": 7.496009821976672, + "grad_norm": 0.22118456661701202, + "learning_rate": 1.5564788460267733e-05, + "loss": 1.7487, + "step": 24422 + }, + { + "epoch": 7.496316758747698, + "grad_norm": 0.21284142136573792, + "learning_rate": 1.5561184763775916e-05, + "loss": 1.7367, + "step": 24423 + }, + { + "epoch": 7.496623695518723, + "grad_norm": 0.17366403341293335, + "learning_rate": 1.5557581407628656e-05, + "loss": 1.655, + "step": 24424 + }, + { + "epoch": 7.496930632289748, + "grad_norm": 0.19864381849765778, + "learning_rate": 1.555397839186157e-05, + "loss": 1.6691, + "step": 24425 + }, + { + "epoch": 7.497237569060774, + "grad_norm": 0.1787605881690979, + "learning_rate": 1.555037571651025e-05, + "loss": 1.7063, + "step": 24426 + }, + { + "epoch": 7.497544505831799, + "grad_norm": 0.19520068168640137, + "learning_rate": 1.5546773381610302e-05, + "loss": 1.7044, + "step": 24427 + }, + { + "epoch": 7.4978514426028235, + "grad_norm": 0.18771123886108398, + "learning_rate": 1.5543171387197362e-05, + "loss": 1.6959, + "step": 24428 + }, + { + "epoch": 7.498158379373849, + "grad_norm": 0.21876849234104156, + "learning_rate": 1.5539569733306964e-05, + "loss": 1.7486, + "step": 24429 + }, + { + "epoch": 7.498465316144874, + "grad_norm": 0.21685563027858734, + "learning_rate": 1.5535968419974772e-05, + "loss": 1.7541, + "step": 24430 + }, + { + "epoch": 7.4987722529158995, 
+ "grad_norm": 0.19595225155353546, + "learning_rate": 1.5532367447236307e-05, + "loss": 1.6882, + "step": 24431 + }, + { + "epoch": 7.499079189686925, + "grad_norm": 0.18359199166297913, + "learning_rate": 1.5528766815127198e-05, + "loss": 1.687, + "step": 24432 + }, + { + "epoch": 7.499386126457949, + "grad_norm": 0.17955231666564941, + "learning_rate": 1.5525166523683028e-05, + "loss": 1.6759, + "step": 24433 + }, + { + "epoch": 7.499693063228975, + "grad_norm": 0.18786758184432983, + "learning_rate": 1.5521566572939368e-05, + "loss": 1.7118, + "step": 24434 + }, + { + "epoch": 7.5, + "grad_norm": 0.16672605276107788, + "learning_rate": 1.551796696293179e-05, + "loss": 1.6618, + "step": 24435 + }, + { + "epoch": 7.500306936771025, + "grad_norm": 0.17066839337348938, + "learning_rate": 1.5514367693695875e-05, + "loss": 1.6974, + "step": 24436 + }, + { + "epoch": 7.500613873542051, + "grad_norm": 0.17299650609493256, + "learning_rate": 1.5510768765267193e-05, + "loss": 1.7074, + "step": 24437 + }, + { + "epoch": 7.500920810313076, + "grad_norm": 0.17507639527320862, + "learning_rate": 1.5507170177681306e-05, + "loss": 1.7295, + "step": 24438 + }, + { + "epoch": 7.5012277470841005, + "grad_norm": 0.1909082531929016, + "learning_rate": 1.5503571930973786e-05, + "loss": 1.7153, + "step": 24439 + }, + { + "epoch": 7.501534683855126, + "grad_norm": 0.2334289401769638, + "learning_rate": 1.5499974025180185e-05, + "loss": 1.713, + "step": 24440 + }, + { + "epoch": 7.501841620626151, + "grad_norm": 0.18382340669631958, + "learning_rate": 1.5496376460336058e-05, + "loss": 1.6706, + "step": 24441 + }, + { + "epoch": 7.5021485573971765, + "grad_norm": 0.1901310533285141, + "learning_rate": 1.5492779236476967e-05, + "loss": 1.7106, + "step": 24442 + }, + { + "epoch": 7.502455494168201, + "grad_norm": 0.17336180806159973, + "learning_rate": 1.5489182353638452e-05, + "loss": 1.7467, + "step": 24443 + }, + { + "epoch": 7.502762430939226, + "grad_norm": 0.18670998513698578, + 
"learning_rate": 1.548558581185605e-05, + "loss": 1.7101, + "step": 24444 + }, + { + "epoch": 7.503069367710252, + "grad_norm": 0.18341238796710968, + "learning_rate": 1.5481989611165353e-05, + "loss": 1.719, + "step": 24445 + }, + { + "epoch": 7.503376304481277, + "grad_norm": 0.21832694113254547, + "learning_rate": 1.5478393751601833e-05, + "loss": 1.7143, + "step": 24446 + }, + { + "epoch": 7.503683241252302, + "grad_norm": 0.1715303659439087, + "learning_rate": 1.5474798233201094e-05, + "loss": 1.6962, + "step": 24447 + }, + { + "epoch": 7.503990178023328, + "grad_norm": 0.26411953568458557, + "learning_rate": 1.5471203055998595e-05, + "loss": 1.7182, + "step": 24448 + }, + { + "epoch": 7.504297114794352, + "grad_norm": 0.1646965742111206, + "learning_rate": 1.5467608220029926e-05, + "loss": 1.6979, + "step": 24449 + }, + { + "epoch": 7.504604051565377, + "grad_norm": 0.1664915233850479, + "learning_rate": 1.5464013725330595e-05, + "loss": 1.6809, + "step": 24450 + }, + { + "epoch": 7.504910988336403, + "grad_norm": 0.1711970716714859, + "learning_rate": 1.5460419571936125e-05, + "loss": 1.6975, + "step": 24451 + }, + { + "epoch": 7.505217925107428, + "grad_norm": 0.19235998392105103, + "learning_rate": 1.5456825759882028e-05, + "loss": 1.7515, + "step": 24452 + }, + { + "epoch": 7.505524861878453, + "grad_norm": 0.2137441486120224, + "learning_rate": 1.5453232289203822e-05, + "loss": 1.7575, + "step": 24453 + }, + { + "epoch": 7.505831798649478, + "grad_norm": 0.19337041676044464, + "learning_rate": 1.544963915993703e-05, + "loss": 1.776, + "step": 24454 + }, + { + "epoch": 7.506138735420503, + "grad_norm": 0.227366104722023, + "learning_rate": 1.5446046372117152e-05, + "loss": 1.7736, + "step": 24455 + }, + { + "epoch": 7.5064456721915285, + "grad_norm": 0.1712712198495865, + "learning_rate": 1.5442453925779694e-05, + "loss": 1.6663, + "step": 24456 + }, + { + "epoch": 7.506752608962554, + "grad_norm": 0.19359993934631348, + "learning_rate": 
1.5438861820960164e-05, + "loss": 1.6826, + "step": 24457 + }, + { + "epoch": 7.507059545733579, + "grad_norm": 0.22883851826190948, + "learning_rate": 1.5435270057694056e-05, + "loss": 1.7782, + "step": 24458 + }, + { + "epoch": 7.5073664825046045, + "grad_norm": 0.17109328508377075, + "learning_rate": 1.543167863601687e-05, + "loss": 1.7435, + "step": 24459 + }, + { + "epoch": 7.507673419275629, + "grad_norm": 0.21545098721981049, + "learning_rate": 1.54280875559641e-05, + "loss": 1.7277, + "step": 24460 + }, + { + "epoch": 7.507980356046654, + "grad_norm": 0.18345774710178375, + "learning_rate": 1.542449681757121e-05, + "loss": 1.7255, + "step": 24461 + }, + { + "epoch": 7.50828729281768, + "grad_norm": 0.15472757816314697, + "learning_rate": 1.5420906420873744e-05, + "loss": 1.6615, + "step": 24462 + }, + { + "epoch": 7.508594229588705, + "grad_norm": 0.2084251195192337, + "learning_rate": 1.5417316365907113e-05, + "loss": 1.6747, + "step": 24463 + }, + { + "epoch": 7.50890116635973, + "grad_norm": 0.19010984897613525, + "learning_rate": 1.5413726652706868e-05, + "loss": 1.7188, + "step": 24464 + }, + { + "epoch": 7.509208103130755, + "grad_norm": 0.22481444478034973, + "learning_rate": 1.5410137281308408e-05, + "loss": 1.8028, + "step": 24465 + }, + { + "epoch": 7.50951503990178, + "grad_norm": 0.22309516370296478, + "learning_rate": 1.5406548251747266e-05, + "loss": 1.7806, + "step": 24466 + }, + { + "epoch": 7.509821976672805, + "grad_norm": 0.19050204753875732, + "learning_rate": 1.540295956405889e-05, + "loss": 1.7188, + "step": 24467 + }, + { + "epoch": 7.510128913443831, + "grad_norm": 0.1956445276737213, + "learning_rate": 1.5399371218278745e-05, + "loss": 1.7468, + "step": 24468 + }, + { + "epoch": 7.510435850214856, + "grad_norm": 0.3492142856121063, + "learning_rate": 1.5395783214442294e-05, + "loss": 1.7502, + "step": 24469 + }, + { + "epoch": 7.510742786985881, + "grad_norm": 0.15318654477596283, + "learning_rate": 1.5392195552584997e-05, + "loss": 
1.6782, + "step": 24470 + }, + { + "epoch": 7.511049723756906, + "grad_norm": 0.18576723337173462, + "learning_rate": 1.5388608232742308e-05, + "loss": 1.7455, + "step": 24471 + }, + { + "epoch": 7.511356660527931, + "grad_norm": 0.14923253655433655, + "learning_rate": 1.5385021254949677e-05, + "loss": 1.687, + "step": 24472 + }, + { + "epoch": 7.5116635972989565, + "grad_norm": 0.17453742027282715, + "learning_rate": 1.5381434619242553e-05, + "loss": 1.7072, + "step": 24473 + }, + { + "epoch": 7.511970534069982, + "grad_norm": 0.18869875371456146, + "learning_rate": 1.5377848325656384e-05, + "loss": 1.7681, + "step": 24474 + }, + { + "epoch": 7.512277470841006, + "grad_norm": 0.22205953299999237, + "learning_rate": 1.5374262374226612e-05, + "loss": 1.7526, + "step": 24475 + }, + { + "epoch": 7.512584407612032, + "grad_norm": 0.1634155809879303, + "learning_rate": 1.537067676498867e-05, + "loss": 1.704, + "step": 24476 + }, + { + "epoch": 7.512891344383057, + "grad_norm": 0.19530873000621796, + "learning_rate": 1.5367091497978004e-05, + "loss": 1.7469, + "step": 24477 + }, + { + "epoch": 7.513198281154082, + "grad_norm": 0.17038139700889587, + "learning_rate": 1.5363506573230017e-05, + "loss": 1.6363, + "step": 24478 + }, + { + "epoch": 7.513505217925108, + "grad_norm": 0.17695361375808716, + "learning_rate": 1.535992199078019e-05, + "loss": 1.7191, + "step": 24479 + }, + { + "epoch": 7.513812154696133, + "grad_norm": 0.2216692715883255, + "learning_rate": 1.535633775066389e-05, + "loss": 1.8042, + "step": 24480 + }, + { + "epoch": 7.514119091467157, + "grad_norm": 0.16862058639526367, + "learning_rate": 1.5352753852916595e-05, + "loss": 1.697, + "step": 24481 + }, + { + "epoch": 7.514426028238183, + "grad_norm": 0.20376496016979218, + "learning_rate": 1.5349170297573662e-05, + "loss": 1.7274, + "step": 24482 + }, + { + "epoch": 7.514732965009208, + "grad_norm": 0.16290763020515442, + "learning_rate": 1.5345587084670554e-05, + "loss": 1.6929, + "step": 24483 + }, + 
{ + "epoch": 7.515039901780233, + "grad_norm": 0.21416328847408295, + "learning_rate": 1.5342004214242667e-05, + "loss": 1.756, + "step": 24484 + }, + { + "epoch": 7.515346838551259, + "grad_norm": 0.14708222448825836, + "learning_rate": 1.533842168632541e-05, + "loss": 1.6816, + "step": 24485 + }, + { + "epoch": 7.515653775322283, + "grad_norm": 0.1860494166612625, + "learning_rate": 1.5334839500954178e-05, + "loss": 1.7114, + "step": 24486 + }, + { + "epoch": 7.5159607120933085, + "grad_norm": 0.16551998257637024, + "learning_rate": 1.533125765816439e-05, + "loss": 1.6564, + "step": 24487 + }, + { + "epoch": 7.516267648864334, + "grad_norm": 0.16971731185913086, + "learning_rate": 1.5327676157991428e-05, + "loss": 1.6722, + "step": 24488 + }, + { + "epoch": 7.516574585635359, + "grad_norm": 0.17433905601501465, + "learning_rate": 1.532409500047069e-05, + "loss": 1.6944, + "step": 24489 + }, + { + "epoch": 7.5168815224063845, + "grad_norm": 0.15625490248203278, + "learning_rate": 1.5320514185637575e-05, + "loss": 1.6997, + "step": 24490 + }, + { + "epoch": 7.51718845917741, + "grad_norm": 0.19038623571395874, + "learning_rate": 1.531693371352746e-05, + "loss": 1.6999, + "step": 24491 + }, + { + "epoch": 7.517495395948434, + "grad_norm": 0.16037517786026, + "learning_rate": 1.5313353584175736e-05, + "loss": 1.6568, + "step": 24492 + }, + { + "epoch": 7.51780233271946, + "grad_norm": 0.1515430361032486, + "learning_rate": 1.5309773797617787e-05, + "loss": 1.693, + "step": 24493 + }, + { + "epoch": 7.518109269490485, + "grad_norm": 0.1792028695344925, + "learning_rate": 1.530619435388898e-05, + "loss": 1.7034, + "step": 24494 + }, + { + "epoch": 7.51841620626151, + "grad_norm": 0.18456964194774628, + "learning_rate": 1.530261525302468e-05, + "loss": 1.7565, + "step": 24495 + }, + { + "epoch": 7.518723143032536, + "grad_norm": 0.17504090070724487, + "learning_rate": 1.529903649506031e-05, + "loss": 1.7121, + "step": 24496 + }, + { + "epoch": 7.51903007980356, + 
"grad_norm": 0.19688715040683746, + "learning_rate": 1.529545808003116e-05, + "loss": 1.7507, + "step": 24497 + }, + { + "epoch": 7.519337016574585, + "grad_norm": 0.21039338409900665, + "learning_rate": 1.529188000797267e-05, + "loss": 1.709, + "step": 24498 + }, + { + "epoch": 7.519643953345611, + "grad_norm": 0.18255522847175598, + "learning_rate": 1.5288302278920136e-05, + "loss": 1.7497, + "step": 24499 + }, + { + "epoch": 7.519950890116636, + "grad_norm": 0.19913412630558014, + "learning_rate": 1.5284724892908958e-05, + "loss": 1.7244, + "step": 24500 + }, + { + "epoch": 7.520257826887661, + "grad_norm": 0.15792223811149597, + "learning_rate": 1.5281147849974476e-05, + "loss": 1.6916, + "step": 24501 + }, + { + "epoch": 7.520564763658687, + "grad_norm": 0.2078406661748886, + "learning_rate": 1.5277571150152038e-05, + "loss": 1.6959, + "step": 24502 + }, + { + "epoch": 7.520871700429711, + "grad_norm": 0.15596020221710205, + "learning_rate": 1.5273994793477e-05, + "loss": 1.7217, + "step": 24503 + }, + { + "epoch": 7.5211786372007365, + "grad_norm": 0.18951189517974854, + "learning_rate": 1.527041877998469e-05, + "loss": 1.7322, + "step": 24504 + }, + { + "epoch": 7.521485573971762, + "grad_norm": 0.16445964574813843, + "learning_rate": 1.526684310971046e-05, + "loss": 1.6668, + "step": 24505 + }, + { + "epoch": 7.521792510742787, + "grad_norm": 0.19513604044914246, + "learning_rate": 1.5263267782689644e-05, + "loss": 1.7464, + "step": 24506 + }, + { + "epoch": 7.5220994475138125, + "grad_norm": 0.20289716124534607, + "learning_rate": 1.525969279895758e-05, + "loss": 1.7472, + "step": 24507 + }, + { + "epoch": 7.522406384284837, + "grad_norm": 0.1716226041316986, + "learning_rate": 1.5256118158549588e-05, + "loss": 1.6872, + "step": 24508 + }, + { + "epoch": 7.522713321055862, + "grad_norm": 0.18939872086048126, + "learning_rate": 1.5252543861501006e-05, + "loss": 1.7365, + "step": 24509 + }, + { + "epoch": 7.523020257826888, + "grad_norm": 
0.21382616460323334, + "learning_rate": 1.524896990784715e-05, + "loss": 1.7129, + "step": 24510 + }, + { + "epoch": 7.523327194597913, + "grad_norm": 0.18226614594459534, + "learning_rate": 1.5245396297623338e-05, + "loss": 1.7426, + "step": 24511 + }, + { + "epoch": 7.523634131368938, + "grad_norm": 0.15880146622657776, + "learning_rate": 1.5241823030864893e-05, + "loss": 1.6848, + "step": 24512 + }, + { + "epoch": 7.523941068139964, + "grad_norm": 0.1782255917787552, + "learning_rate": 1.5238250107607121e-05, + "loss": 1.7263, + "step": 24513 + }, + { + "epoch": 7.524248004910988, + "grad_norm": 0.20365844666957855, + "learning_rate": 1.5234677527885328e-05, + "loss": 1.7035, + "step": 24514 + }, + { + "epoch": 7.524554941682013, + "grad_norm": 0.1776183694601059, + "learning_rate": 1.5231105291734855e-05, + "loss": 1.6837, + "step": 24515 + }, + { + "epoch": 7.524861878453039, + "grad_norm": 0.14594987034797668, + "learning_rate": 1.5227533399190946e-05, + "loss": 1.6428, + "step": 24516 + }, + { + "epoch": 7.525168815224064, + "grad_norm": 0.19371397793293, + "learning_rate": 1.5223961850288947e-05, + "loss": 1.7108, + "step": 24517 + }, + { + "epoch": 7.525475751995089, + "grad_norm": 0.1695355474948883, + "learning_rate": 1.5220390645064148e-05, + "loss": 1.6777, + "step": 24518 + }, + { + "epoch": 7.525782688766114, + "grad_norm": 0.14815635979175568, + "learning_rate": 1.5216819783551828e-05, + "loss": 1.6967, + "step": 24519 + }, + { + "epoch": 7.526089625537139, + "grad_norm": 0.19655495882034302, + "learning_rate": 1.5213249265787283e-05, + "loss": 1.7358, + "step": 24520 + }, + { + "epoch": 7.526396562308165, + "grad_norm": 0.1817864030599594, + "learning_rate": 1.5209679091805795e-05, + "loss": 1.7132, + "step": 24521 + }, + { + "epoch": 7.52670349907919, + "grad_norm": 0.209315687417984, + "learning_rate": 1.5206109261642654e-05, + "loss": 1.7161, + "step": 24522 + }, + { + "epoch": 7.527010435850215, + "grad_norm": 0.18493252992630005, + 
"learning_rate": 1.520253977533313e-05, + "loss": 1.7136, + "step": 24523 + }, + { + "epoch": 7.52731737262124, + "grad_norm": 0.21916678547859192, + "learning_rate": 1.5198970632912508e-05, + "loss": 1.7464, + "step": 24524 + }, + { + "epoch": 7.527624309392265, + "grad_norm": 0.14470849931240082, + "learning_rate": 1.519540183441605e-05, + "loss": 1.6676, + "step": 24525 + }, + { + "epoch": 7.52793124616329, + "grad_norm": 0.20077016949653625, + "learning_rate": 1.5191833379879033e-05, + "loss": 1.7052, + "step": 24526 + }, + { + "epoch": 7.528238182934316, + "grad_norm": 0.17593151330947876, + "learning_rate": 1.5188265269336722e-05, + "loss": 1.7309, + "step": 24527 + }, + { + "epoch": 7.528545119705341, + "grad_norm": 0.20170791447162628, + "learning_rate": 1.518469750282438e-05, + "loss": 1.7335, + "step": 24528 + }, + { + "epoch": 7.5288520564763655, + "grad_norm": 0.1703701615333557, + "learning_rate": 1.518113008037726e-05, + "loss": 1.7141, + "step": 24529 + }, + { + "epoch": 7.529158993247391, + "grad_norm": 0.1897478848695755, + "learning_rate": 1.517756300203062e-05, + "loss": 1.7059, + "step": 24530 + }, + { + "epoch": 7.529465930018416, + "grad_norm": 0.17487141489982605, + "learning_rate": 1.5173996267819695e-05, + "loss": 1.7559, + "step": 24531 + }, + { + "epoch": 7.5297728667894415, + "grad_norm": 0.19167299568653107, + "learning_rate": 1.5170429877779785e-05, + "loss": 1.7287, + "step": 24532 + }, + { + "epoch": 7.530079803560467, + "grad_norm": 0.19433172047138214, + "learning_rate": 1.5166863831946072e-05, + "loss": 1.7182, + "step": 24533 + }, + { + "epoch": 7.530386740331492, + "grad_norm": 0.293734073638916, + "learning_rate": 1.5163298130353853e-05, + "loss": 1.7362, + "step": 24534 + }, + { + "epoch": 7.530693677102517, + "grad_norm": 0.18647685647010803, + "learning_rate": 1.515973277303831e-05, + "loss": 1.7271, + "step": 24535 + }, + { + "epoch": 7.531000613873542, + "grad_norm": 0.20918485522270203, + "learning_rate": 
1.5156167760034729e-05, + "loss": 1.7225, + "step": 24536 + }, + { + "epoch": 7.531307550644567, + "grad_norm": 0.22056303918361664, + "learning_rate": 1.5152603091378315e-05, + "loss": 1.6524, + "step": 24537 + }, + { + "epoch": 7.531614487415593, + "grad_norm": 0.13695760071277618, + "learning_rate": 1.5149038767104307e-05, + "loss": 1.6639, + "step": 24538 + }, + { + "epoch": 7.531921424186618, + "grad_norm": 0.25396111607551575, + "learning_rate": 1.514547478724792e-05, + "loss": 1.7025, + "step": 24539 + }, + { + "epoch": 7.532228360957642, + "grad_norm": 0.18192961812019348, + "learning_rate": 1.5141911151844384e-05, + "loss": 1.7288, + "step": 24540 + }, + { + "epoch": 7.532535297728668, + "grad_norm": 0.24748951196670532, + "learning_rate": 1.5138347860928908e-05, + "loss": 1.7379, + "step": 24541 + }, + { + "epoch": 7.532842234499693, + "grad_norm": 0.1841045767068863, + "learning_rate": 1.5134784914536715e-05, + "loss": 1.7876, + "step": 24542 + }, + { + "epoch": 7.533149171270718, + "grad_norm": 0.21867021918296814, + "learning_rate": 1.5131222312703014e-05, + "loss": 1.7608, + "step": 24543 + }, + { + "epoch": 7.533456108041744, + "grad_norm": 0.1972149908542633, + "learning_rate": 1.512766005546301e-05, + "loss": 1.6927, + "step": 24544 + }, + { + "epoch": 7.533763044812769, + "grad_norm": 0.1728486567735672, + "learning_rate": 1.5124098142851906e-05, + "loss": 1.7656, + "step": 24545 + }, + { + "epoch": 7.5340699815837935, + "grad_norm": 0.2591659724712372, + "learning_rate": 1.512053657490491e-05, + "loss": 1.6844, + "step": 24546 + }, + { + "epoch": 7.534376918354819, + "grad_norm": 0.17187906801700592, + "learning_rate": 1.5116975351657215e-05, + "loss": 1.707, + "step": 24547 + }, + { + "epoch": 7.534683855125844, + "grad_norm": 0.26111504435539246, + "learning_rate": 1.5113414473143993e-05, + "loss": 1.7273, + "step": 24548 + }, + { + "epoch": 7.5349907918968695, + "grad_norm": 0.2153446227312088, + "learning_rate": 1.5109853939400498e-05, + 
"loss": 1.7458, + "step": 24549 + }, + { + "epoch": 7.535297728667894, + "grad_norm": 0.20768530666828156, + "learning_rate": 1.5106293750461835e-05, + "loss": 1.749, + "step": 24550 + }, + { + "epoch": 7.535604665438919, + "grad_norm": 0.2211574763059616, + "learning_rate": 1.5102733906363264e-05, + "loss": 1.7236, + "step": 24551 + }, + { + "epoch": 7.535911602209945, + "grad_norm": 0.15983305871486664, + "learning_rate": 1.5099174407139905e-05, + "loss": 1.6682, + "step": 24552 + }, + { + "epoch": 7.53621853898097, + "grad_norm": 0.23821383714675903, + "learning_rate": 1.5095615252826967e-05, + "loss": 1.7173, + "step": 24553 + }, + { + "epoch": 7.536525475751995, + "grad_norm": 0.1726350039243698, + "learning_rate": 1.5092056443459624e-05, + "loss": 1.7566, + "step": 24554 + }, + { + "epoch": 7.536832412523021, + "grad_norm": 0.19859814643859863, + "learning_rate": 1.5088497979073035e-05, + "loss": 1.7005, + "step": 24555 + }, + { + "epoch": 7.537139349294045, + "grad_norm": 0.14776331186294556, + "learning_rate": 1.508493985970239e-05, + "loss": 1.68, + "step": 24556 + }, + { + "epoch": 7.53744628606507, + "grad_norm": 0.20928993821144104, + "learning_rate": 1.50813820853828e-05, + "loss": 1.7536, + "step": 24557 + }, + { + "epoch": 7.537753222836096, + "grad_norm": 0.18914662301540375, + "learning_rate": 1.5077824656149475e-05, + "loss": 1.7476, + "step": 24558 + }, + { + "epoch": 7.538060159607121, + "grad_norm": 0.24415937066078186, + "learning_rate": 1.5074267572037554e-05, + "loss": 1.7225, + "step": 24559 + }, + { + "epoch": 7.538367096378146, + "grad_norm": 0.18504458665847778, + "learning_rate": 1.5070710833082196e-05, + "loss": 1.7028, + "step": 24560 + }, + { + "epoch": 7.538674033149171, + "grad_norm": 0.1846696138381958, + "learning_rate": 1.5067154439318542e-05, + "loss": 1.7204, + "step": 24561 + }, + { + "epoch": 7.538980969920196, + "grad_norm": 0.20846717059612274, + "learning_rate": 1.5063598390781747e-05, + "loss": 1.73, + "step": 24562 + }, 
+ { + "epoch": 7.5392879066912215, + "grad_norm": 0.1950647234916687, + "learning_rate": 1.5060042687506943e-05, + "loss": 1.7008, + "step": 24563 + }, + { + "epoch": 7.539594843462247, + "grad_norm": 0.1880638748407364, + "learning_rate": 1.5056487329529278e-05, + "loss": 1.6965, + "step": 24564 + }, + { + "epoch": 7.539901780233272, + "grad_norm": 0.24405652284622192, + "learning_rate": 1.5052932316883872e-05, + "loss": 1.7407, + "step": 24565 + }, + { + "epoch": 7.5402087170042975, + "grad_norm": 0.15719062089920044, + "learning_rate": 1.5049377649605906e-05, + "loss": 1.6613, + "step": 24566 + }, + { + "epoch": 7.540515653775322, + "grad_norm": 0.20888090133666992, + "learning_rate": 1.5045823327730441e-05, + "loss": 1.7805, + "step": 24567 + }, + { + "epoch": 7.540822590546347, + "grad_norm": 0.1656443029642105, + "learning_rate": 1.504226935129267e-05, + "loss": 1.7047, + "step": 24568 + }, + { + "epoch": 7.541129527317373, + "grad_norm": 0.28847959637641907, + "learning_rate": 1.503871572032765e-05, + "loss": 1.8711, + "step": 24569 + }, + { + "epoch": 7.541436464088398, + "grad_norm": 0.1724858433008194, + "learning_rate": 1.5035162434870548e-05, + "loss": 1.6734, + "step": 24570 + }, + { + "epoch": 7.541743400859423, + "grad_norm": 0.2064351737499237, + "learning_rate": 1.5031609494956484e-05, + "loss": 1.7032, + "step": 24571 + }, + { + "epoch": 7.542050337630448, + "grad_norm": 0.175388365983963, + "learning_rate": 1.5028056900620513e-05, + "loss": 1.6606, + "step": 24572 + }, + { + "epoch": 7.542357274401473, + "grad_norm": 0.20802471041679382, + "learning_rate": 1.5024504651897814e-05, + "loss": 1.7324, + "step": 24573 + }, + { + "epoch": 7.542664211172498, + "grad_norm": 0.187152698636055, + "learning_rate": 1.502095274882343e-05, + "loss": 1.7222, + "step": 24574 + }, + { + "epoch": 7.542971147943524, + "grad_norm": 0.20112092792987823, + "learning_rate": 1.5017401191432511e-05, + "loss": 1.6959, + "step": 24575 + }, + { + "epoch": 7.543278084714549, 
+ "grad_norm": 0.17968857288360596, + "learning_rate": 1.5013849979760136e-05, + "loss": 1.6957, + "step": 24576 + }, + { + "epoch": 7.543585021485574, + "grad_norm": 0.20532584190368652, + "learning_rate": 1.5010299113841397e-05, + "loss": 1.7471, + "step": 24577 + }, + { + "epoch": 7.543891958256599, + "grad_norm": 0.16475969552993774, + "learning_rate": 1.5006748593711394e-05, + "loss": 1.7665, + "step": 24578 + }, + { + "epoch": 7.544198895027624, + "grad_norm": 0.17632076144218445, + "learning_rate": 1.5003198419405213e-05, + "loss": 1.7317, + "step": 24579 + }, + { + "epoch": 7.5445058317986495, + "grad_norm": 0.18197286128997803, + "learning_rate": 1.4999648590957937e-05, + "loss": 1.7278, + "step": 24580 + }, + { + "epoch": 7.544812768569675, + "grad_norm": 0.18043744564056396, + "learning_rate": 1.4996099108404648e-05, + "loss": 1.7335, + "step": 24581 + }, + { + "epoch": 7.5451197053407, + "grad_norm": 0.17072297632694244, + "learning_rate": 1.4992549971780407e-05, + "loss": 1.7236, + "step": 24582 + }, + { + "epoch": 7.545426642111725, + "grad_norm": 0.17413046956062317, + "learning_rate": 1.4989001181120338e-05, + "loss": 1.6794, + "step": 24583 + }, + { + "epoch": 7.54573357888275, + "grad_norm": 0.1684887856245041, + "learning_rate": 1.4985452736459443e-05, + "loss": 1.718, + "step": 24584 + }, + { + "epoch": 7.546040515653775, + "grad_norm": 0.19497069716453552, + "learning_rate": 1.4981904637832866e-05, + "loss": 1.7323, + "step": 24585 + }, + { + "epoch": 7.546347452424801, + "grad_norm": 0.24838820099830627, + "learning_rate": 1.4978356885275596e-05, + "loss": 1.7584, + "step": 24586 + }, + { + "epoch": 7.546654389195826, + "grad_norm": 0.20870071649551392, + "learning_rate": 1.4974809478822749e-05, + "loss": 1.738, + "step": 24587 + }, + { + "epoch": 7.546961325966851, + "grad_norm": 0.21980242431163788, + "learning_rate": 1.497126241850938e-05, + "loss": 1.763, + "step": 24588 + }, + { + "epoch": 7.547268262737876, + "grad_norm": 
0.2156188189983368, + "learning_rate": 1.4967715704370488e-05, + "loss": 1.7357, + "step": 24589 + }, + { + "epoch": 7.547575199508901, + "grad_norm": 0.1864207684993744, + "learning_rate": 1.4964169336441202e-05, + "loss": 1.676, + "step": 24590 + }, + { + "epoch": 7.547882136279926, + "grad_norm": 0.18940003216266632, + "learning_rate": 1.4960623314756494e-05, + "loss": 1.7614, + "step": 24591 + }, + { + "epoch": 7.548189073050952, + "grad_norm": 0.19220350682735443, + "learning_rate": 1.4957077639351463e-05, + "loss": 1.7266, + "step": 24592 + }, + { + "epoch": 7.548496009821976, + "grad_norm": 0.15492811799049377, + "learning_rate": 1.4953532310261126e-05, + "loss": 1.7359, + "step": 24593 + }, + { + "epoch": 7.5488029465930016, + "grad_norm": 0.25591567158699036, + "learning_rate": 1.4949987327520526e-05, + "loss": 1.7, + "step": 24594 + }, + { + "epoch": 7.549109883364027, + "grad_norm": 0.18157868087291718, + "learning_rate": 1.4946442691164697e-05, + "loss": 1.7204, + "step": 24595 + }, + { + "epoch": 7.549416820135052, + "grad_norm": 0.17679910361766815, + "learning_rate": 1.4942898401228662e-05, + "loss": 1.6871, + "step": 24596 + }, + { + "epoch": 7.5497237569060776, + "grad_norm": 0.2000853717327118, + "learning_rate": 1.4939354457747456e-05, + "loss": 1.7186, + "step": 24597 + }, + { + "epoch": 7.550030693677103, + "grad_norm": 0.19947710633277893, + "learning_rate": 1.49358108607561e-05, + "loss": 1.6853, + "step": 24598 + }, + { + "epoch": 7.550337630448127, + "grad_norm": 0.16325148940086365, + "learning_rate": 1.4932267610289596e-05, + "loss": 1.7027, + "step": 24599 + }, + { + "epoch": 7.550644567219153, + "grad_norm": 0.22839638590812683, + "learning_rate": 1.4928724706383007e-05, + "loss": 1.7887, + "step": 24600 + }, + { + "epoch": 7.550951503990178, + "grad_norm": 0.16242358088493347, + "learning_rate": 1.4925182149071286e-05, + "loss": 1.6617, + "step": 24601 + }, + { + "epoch": 7.551258440761203, + "grad_norm": 0.1674090027809143, + 
"learning_rate": 1.4921639938389504e-05, + "loss": 1.656, + "step": 24602 + }, + { + "epoch": 7.551565377532229, + "grad_norm": 0.1628156453371048, + "learning_rate": 1.4918098074372605e-05, + "loss": 1.683, + "step": 24603 + }, + { + "epoch": 7.551872314303253, + "grad_norm": 0.19156567752361298, + "learning_rate": 1.4914556557055637e-05, + "loss": 1.7174, + "step": 24604 + }, + { + "epoch": 7.5521792510742785, + "grad_norm": 0.19634003937244415, + "learning_rate": 1.4911015386473603e-05, + "loss": 1.6605, + "step": 24605 + }, + { + "epoch": 7.552486187845304, + "grad_norm": 0.19273599982261658, + "learning_rate": 1.490747456266145e-05, + "loss": 1.7092, + "step": 24606 + }, + { + "epoch": 7.552793124616329, + "grad_norm": 0.23641756176948547, + "learning_rate": 1.4903934085654231e-05, + "loss": 1.7524, + "step": 24607 + }, + { + "epoch": 7.5531000613873545, + "grad_norm": 0.19623206555843353, + "learning_rate": 1.490039395548688e-05, + "loss": 1.7281, + "step": 24608 + }, + { + "epoch": 7.55340699815838, + "grad_norm": 0.1978278011083603, + "learning_rate": 1.489685417219442e-05, + "loss": 1.7099, + "step": 24609 + }, + { + "epoch": 7.553713934929404, + "grad_norm": 0.19635866582393646, + "learning_rate": 1.489331473581182e-05, + "loss": 1.7146, + "step": 24610 + }, + { + "epoch": 7.55402087170043, + "grad_norm": 0.2121066302061081, + "learning_rate": 1.4889775646374065e-05, + "loss": 1.7598, + "step": 24611 + }, + { + "epoch": 7.554327808471455, + "grad_norm": 0.17944596707820892, + "learning_rate": 1.4886236903916122e-05, + "loss": 1.6778, + "step": 24612 + }, + { + "epoch": 7.55463474524248, + "grad_norm": 0.15834666788578033, + "learning_rate": 1.488269850847297e-05, + "loss": 1.6498, + "step": 24613 + }, + { + "epoch": 7.554941682013506, + "grad_norm": 0.18597754836082458, + "learning_rate": 1.4879160460079573e-05, + "loss": 1.7145, + "step": 24614 + }, + { + "epoch": 7.55524861878453, + "grad_norm": 0.18300876021385193, + "learning_rate": 
1.4875622758770897e-05, + "loss": 1.7253, + "step": 24615 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 0.17805244028568268, + "learning_rate": 1.4872085404581887e-05, + "loss": 1.7152, + "step": 24616 + }, + { + "epoch": 7.555862492326581, + "grad_norm": 0.1987949162721634, + "learning_rate": 1.486854839754755e-05, + "loss": 1.7501, + "step": 24617 + }, + { + "epoch": 7.556169429097606, + "grad_norm": 0.17301858961582184, + "learning_rate": 1.4865011737702777e-05, + "loss": 1.7122, + "step": 24618 + }, + { + "epoch": 7.556476365868631, + "grad_norm": 0.180507093667984, + "learning_rate": 1.4861475425082583e-05, + "loss": 1.7192, + "step": 24619 + }, + { + "epoch": 7.556783302639657, + "grad_norm": 0.16658489406108856, + "learning_rate": 1.4857939459721854e-05, + "loss": 1.6879, + "step": 24620 + }, + { + "epoch": 7.557090239410681, + "grad_norm": 0.19498902559280396, + "learning_rate": 1.4854403841655578e-05, + "loss": 1.7395, + "step": 24621 + }, + { + "epoch": 7.5573971761817065, + "grad_norm": 0.1737620085477829, + "learning_rate": 1.4850868570918702e-05, + "loss": 1.7029, + "step": 24622 + }, + { + "epoch": 7.557704112952732, + "grad_norm": 0.1600165218114853, + "learning_rate": 1.4847333647546113e-05, + "loss": 1.7194, + "step": 24623 + }, + { + "epoch": 7.558011049723757, + "grad_norm": 0.18392407894134521, + "learning_rate": 1.4843799071572806e-05, + "loss": 1.6838, + "step": 24624 + }, + { + "epoch": 7.558317986494782, + "grad_norm": 0.19074605405330658, + "learning_rate": 1.4840264843033651e-05, + "loss": 1.7069, + "step": 24625 + }, + { + "epoch": 7.558624923265807, + "grad_norm": 0.18156903982162476, + "learning_rate": 1.4836730961963619e-05, + "loss": 1.6494, + "step": 24626 + }, + { + "epoch": 7.558931860036832, + "grad_norm": 0.16716471314430237, + "learning_rate": 1.4833197428397627e-05, + "loss": 1.7516, + "step": 24627 + }, + { + "epoch": 7.559238796807858, + "grad_norm": 0.18882833421230316, + "learning_rate": 1.4829664242370588e-05, + 
"loss": 1.7117, + "step": 24628 + }, + { + "epoch": 7.559545733578883, + "grad_norm": 0.19933676719665527, + "learning_rate": 1.482613140391742e-05, + "loss": 1.6928, + "step": 24629 + }, + { + "epoch": 7.559852670349908, + "grad_norm": 0.15574946999549866, + "learning_rate": 1.4822598913073039e-05, + "loss": 1.702, + "step": 24630 + }, + { + "epoch": 7.560159607120933, + "grad_norm": 0.1953001618385315, + "learning_rate": 1.4819066769872353e-05, + "loss": 1.75, + "step": 24631 + }, + { + "epoch": 7.560466543891958, + "grad_norm": 0.18364208936691284, + "learning_rate": 1.481553497435027e-05, + "loss": 1.6697, + "step": 24632 + }, + { + "epoch": 7.560773480662983, + "grad_norm": 0.16670002043247223, + "learning_rate": 1.4812003526541673e-05, + "loss": 1.6919, + "step": 24633 + }, + { + "epoch": 7.561080417434009, + "grad_norm": 0.19388388097286224, + "learning_rate": 1.4808472426481518e-05, + "loss": 1.7412, + "step": 24634 + }, + { + "epoch": 7.561387354205034, + "grad_norm": 0.19203592836856842, + "learning_rate": 1.4804941674204631e-05, + "loss": 1.7128, + "step": 24635 + }, + { + "epoch": 7.5616942909760585, + "grad_norm": 0.18893340229988098, + "learning_rate": 1.4801411269745974e-05, + "loss": 1.7018, + "step": 24636 + }, + { + "epoch": 7.562001227747084, + "grad_norm": 0.1825447529554367, + "learning_rate": 1.4797881213140363e-05, + "loss": 1.7216, + "step": 24637 + }, + { + "epoch": 7.562308164518109, + "grad_norm": 0.19031697511672974, + "learning_rate": 1.4794351504422743e-05, + "loss": 1.7479, + "step": 24638 + }, + { + "epoch": 7.5626151012891345, + "grad_norm": 0.18328487873077393, + "learning_rate": 1.4790822143627991e-05, + "loss": 1.7222, + "step": 24639 + }, + { + "epoch": 7.56292203806016, + "grad_norm": 0.17531271278858185, + "learning_rate": 1.4787293130790941e-05, + "loss": 1.7197, + "step": 24640 + }, + { + "epoch": 7.563228974831185, + "grad_norm": 0.17078469693660736, + "learning_rate": 1.4783764465946526e-05, + "loss": 1.7715, + "step": 
24641 + }, + { + "epoch": 7.56353591160221, + "grad_norm": 0.1859765648841858, + "learning_rate": 1.4780236149129567e-05, + "loss": 1.698, + "step": 24642 + }, + { + "epoch": 7.563842848373235, + "grad_norm": 0.18488194048404694, + "learning_rate": 1.4776708180374965e-05, + "loss": 1.6943, + "step": 24643 + }, + { + "epoch": 7.56414978514426, + "grad_norm": 0.1741705685853958, + "learning_rate": 1.4773180559717586e-05, + "loss": 1.6966, + "step": 24644 + }, + { + "epoch": 7.564456721915286, + "grad_norm": 0.20310313999652863, + "learning_rate": 1.476965328719228e-05, + "loss": 1.7572, + "step": 24645 + }, + { + "epoch": 7.564763658686311, + "grad_norm": 0.20557743310928345, + "learning_rate": 1.476612636283391e-05, + "loss": 1.7419, + "step": 24646 + }, + { + "epoch": 7.565070595457335, + "grad_norm": 0.20597940683364868, + "learning_rate": 1.4762599786677329e-05, + "loss": 1.7147, + "step": 24647 + }, + { + "epoch": 7.565377532228361, + "grad_norm": 0.21609526872634888, + "learning_rate": 1.4759073558757391e-05, + "loss": 1.7678, + "step": 24648 + }, + { + "epoch": 7.565684468999386, + "grad_norm": 0.2233472615480423, + "learning_rate": 1.4755547679108945e-05, + "loss": 1.7381, + "step": 24649 + }, + { + "epoch": 7.565991405770411, + "grad_norm": 0.19561493396759033, + "learning_rate": 1.4752022147766814e-05, + "loss": 1.7254, + "step": 24650 + }, + { + "epoch": 7.566298342541437, + "grad_norm": 0.16491469740867615, + "learning_rate": 1.4748496964765896e-05, + "loss": 1.6834, + "step": 24651 + }, + { + "epoch": 7.566605279312462, + "grad_norm": 0.16946618258953094, + "learning_rate": 1.4744972130140955e-05, + "loss": 1.7154, + "step": 24652 + }, + { + "epoch": 7.5669122160834865, + "grad_norm": 0.1625654697418213, + "learning_rate": 1.4741447643926904e-05, + "loss": 1.6941, + "step": 24653 + }, + { + "epoch": 7.567219152854512, + "grad_norm": 0.16875535249710083, + "learning_rate": 1.4737923506158491e-05, + "loss": 1.6875, + "step": 24654 + }, + { + "epoch": 
7.567526089625537, + "grad_norm": 0.1625872105360031, + "learning_rate": 1.4734399716870607e-05, + "loss": 1.6558, + "step": 24655 + }, + { + "epoch": 7.5678330263965625, + "grad_norm": 0.17323140799999237, + "learning_rate": 1.4730876276098071e-05, + "loss": 1.7468, + "step": 24656 + }, + { + "epoch": 7.568139963167588, + "grad_norm": 0.18788693845272064, + "learning_rate": 1.472735318387566e-05, + "loss": 1.7345, + "step": 24657 + }, + { + "epoch": 7.568446899938612, + "grad_norm": 0.18096889555454254, + "learning_rate": 1.472383044023824e-05, + "loss": 1.725, + "step": 24658 + }, + { + "epoch": 7.568753836709638, + "grad_norm": 0.2327791154384613, + "learning_rate": 1.4720308045220577e-05, + "loss": 1.7367, + "step": 24659 + }, + { + "epoch": 7.569060773480663, + "grad_norm": 0.187728151679039, + "learning_rate": 1.4716785998857525e-05, + "loss": 1.6967, + "step": 24660 + }, + { + "epoch": 7.569367710251688, + "grad_norm": 0.18520617485046387, + "learning_rate": 1.4713264301183876e-05, + "loss": 1.6576, + "step": 24661 + }, + { + "epoch": 7.569674647022714, + "grad_norm": 0.20537808537483215, + "learning_rate": 1.4709742952234428e-05, + "loss": 1.6911, + "step": 24662 + }, + { + "epoch": 7.569981583793739, + "grad_norm": 0.18872039020061493, + "learning_rate": 1.4706221952043986e-05, + "loss": 1.745, + "step": 24663 + }, + { + "epoch": 7.570288520564763, + "grad_norm": 0.16083933413028717, + "learning_rate": 1.4702701300647343e-05, + "loss": 1.6875, + "step": 24664 + }, + { + "epoch": 7.570595457335789, + "grad_norm": 0.19390366971492767, + "learning_rate": 1.4699180998079293e-05, + "loss": 1.6996, + "step": 24665 + }, + { + "epoch": 7.570902394106814, + "grad_norm": 0.20478816330432892, + "learning_rate": 1.4695661044374632e-05, + "loss": 1.7359, + "step": 24666 + }, + { + "epoch": 7.571209330877839, + "grad_norm": 0.17485570907592773, + "learning_rate": 1.4692141439568136e-05, + "loss": 1.696, + "step": 24667 + }, + { + "epoch": 7.571516267648864, + 
"grad_norm": 0.18266968429088593, + "learning_rate": 1.4688622183694594e-05, + "loss": 1.713, + "step": 24668 + }, + { + "epoch": 7.571823204419889, + "grad_norm": 0.14412200450897217, + "learning_rate": 1.468510327678877e-05, + "loss": 1.6938, + "step": 24669 + }, + { + "epoch": 7.5721301411909145, + "grad_norm": 0.18144819140434265, + "learning_rate": 1.4681584718885488e-05, + "loss": 1.7523, + "step": 24670 + }, + { + "epoch": 7.57243707796194, + "grad_norm": 0.32198768854141235, + "learning_rate": 1.467806651001945e-05, + "loss": 1.71, + "step": 24671 + }, + { + "epoch": 7.572744014732965, + "grad_norm": 0.1535005122423172, + "learning_rate": 1.4674548650225483e-05, + "loss": 1.6912, + "step": 24672 + }, + { + "epoch": 7.5730509515039905, + "grad_norm": 0.17982423305511475, + "learning_rate": 1.4671031139538343e-05, + "loss": 1.6928, + "step": 24673 + }, + { + "epoch": 7.573357888275015, + "grad_norm": 0.16811783611774445, + "learning_rate": 1.4667513977992747e-05, + "loss": 1.6954, + "step": 24674 + }, + { + "epoch": 7.57366482504604, + "grad_norm": 0.18918997049331665, + "learning_rate": 1.4663997165623522e-05, + "loss": 1.6967, + "step": 24675 + }, + { + "epoch": 7.573971761817066, + "grad_norm": 0.16559816896915436, + "learning_rate": 1.4660480702465357e-05, + "loss": 1.7097, + "step": 24676 + }, + { + "epoch": 7.574278698588091, + "grad_norm": 0.20471042394638062, + "learning_rate": 1.4656964588553046e-05, + "loss": 1.7032, + "step": 24677 + }, + { + "epoch": 7.574585635359116, + "grad_norm": 0.16387851536273956, + "learning_rate": 1.4653448823921329e-05, + "loss": 1.7066, + "step": 24678 + }, + { + "epoch": 7.574892572130141, + "grad_norm": 0.19144418835639954, + "learning_rate": 1.4649933408604949e-05, + "loss": 1.7272, + "step": 24679 + }, + { + "epoch": 7.575199508901166, + "grad_norm": 0.17270216345787048, + "learning_rate": 1.4646418342638646e-05, + "loss": 1.7456, + "step": 24680 + }, + { + "epoch": 7.5755064456721914, + "grad_norm": 
0.1937440037727356, + "learning_rate": 1.4642903626057159e-05, + "loss": 1.6973, + "step": 24681 + }, + { + "epoch": 7.575813382443217, + "grad_norm": 0.18958482146263123, + "learning_rate": 1.463938925889522e-05, + "loss": 1.7549, + "step": 24682 + }, + { + "epoch": 7.576120319214242, + "grad_norm": 0.20584101974964142, + "learning_rate": 1.4635875241187558e-05, + "loss": 1.7013, + "step": 24683 + }, + { + "epoch": 7.5764272559852675, + "grad_norm": 0.22839057445526123, + "learning_rate": 1.463236157296891e-05, + "loss": 1.7282, + "step": 24684 + }, + { + "epoch": 7.576734192756292, + "grad_norm": 0.19894570112228394, + "learning_rate": 1.4628848254273996e-05, + "loss": 1.7115, + "step": 24685 + }, + { + "epoch": 7.577041129527317, + "grad_norm": 0.1880837082862854, + "learning_rate": 1.4625335285137515e-05, + "loss": 1.6526, + "step": 24686 + }, + { + "epoch": 7.577348066298343, + "grad_norm": 0.21545001864433289, + "learning_rate": 1.4621822665594238e-05, + "loss": 1.6709, + "step": 24687 + }, + { + "epoch": 7.577655003069368, + "grad_norm": 0.2091502994298935, + "learning_rate": 1.4618310395678813e-05, + "loss": 1.6792, + "step": 24688 + }, + { + "epoch": 7.577961939840393, + "grad_norm": 0.2100556343793869, + "learning_rate": 1.4614798475426018e-05, + "loss": 1.7112, + "step": 24689 + }, + { + "epoch": 7.578268876611418, + "grad_norm": 0.17702727019786835, + "learning_rate": 1.4611286904870502e-05, + "loss": 1.6353, + "step": 24690 + }, + { + "epoch": 7.578575813382443, + "grad_norm": 0.1935967355966568, + "learning_rate": 1.4607775684046975e-05, + "loss": 1.6638, + "step": 24691 + }, + { + "epoch": 7.578882750153468, + "grad_norm": 0.13495506346225739, + "learning_rate": 1.4604264812990193e-05, + "loss": 1.6526, + "step": 24692 + }, + { + "epoch": 7.579189686924494, + "grad_norm": 0.20418134331703186, + "learning_rate": 1.4600754291734774e-05, + "loss": 1.731, + "step": 24693 + }, + { + "epoch": 7.579496623695519, + "grad_norm": 0.1541702151298523, + 
"learning_rate": 1.4597244120315467e-05, + "loss": 1.7047, + "step": 24694 + }, + { + "epoch": 7.579803560466544, + "grad_norm": 0.2106262892484665, + "learning_rate": 1.4593734298766942e-05, + "loss": 1.696, + "step": 24695 + }, + { + "epoch": 7.580110497237569, + "grad_norm": 0.15727077424526215, + "learning_rate": 1.4590224827123889e-05, + "loss": 1.6782, + "step": 24696 + }, + { + "epoch": 7.580417434008594, + "grad_norm": 0.19231721758842468, + "learning_rate": 1.4586715705420983e-05, + "loss": 1.7832, + "step": 24697 + }, + { + "epoch": 7.5807243707796195, + "grad_norm": 0.18290117383003235, + "learning_rate": 1.4583206933692916e-05, + "loss": 1.6715, + "step": 24698 + }, + { + "epoch": 7.581031307550645, + "grad_norm": 0.21551427245140076, + "learning_rate": 1.4579698511974355e-05, + "loss": 1.7326, + "step": 24699 + }, + { + "epoch": 7.581338244321669, + "grad_norm": 0.21561767160892487, + "learning_rate": 1.457619044029997e-05, + "loss": 1.6682, + "step": 24700 + }, + { + "epoch": 7.581645181092695, + "grad_norm": 0.15537963807582855, + "learning_rate": 1.457268271870444e-05, + "loss": 1.719, + "step": 24701 + }, + { + "epoch": 7.58195211786372, + "grad_norm": 0.18738612532615662, + "learning_rate": 1.456917534722242e-05, + "loss": 1.7415, + "step": 24702 + }, + { + "epoch": 7.582259054634745, + "grad_norm": 0.15522584319114685, + "learning_rate": 1.456566832588856e-05, + "loss": 1.6931, + "step": 24703 + }, + { + "epoch": 7.582565991405771, + "grad_norm": 0.192890003323555, + "learning_rate": 1.4562161654737567e-05, + "loss": 1.7726, + "step": 24704 + }, + { + "epoch": 7.582872928176796, + "grad_norm": 0.2163987159729004, + "learning_rate": 1.4558655333804028e-05, + "loss": 1.7459, + "step": 24705 + }, + { + "epoch": 7.58317986494782, + "grad_norm": 0.1635672152042389, + "learning_rate": 1.4555149363122667e-05, + "loss": 1.7407, + "step": 24706 + }, + { + "epoch": 7.583486801718846, + "grad_norm": 0.1858159899711609, + "learning_rate": 
1.4551643742728072e-05, + "loss": 1.7175, + "step": 24707 + }, + { + "epoch": 7.583793738489871, + "grad_norm": 0.23077011108398438, + "learning_rate": 1.4548138472654904e-05, + "loss": 1.7739, + "step": 24708 + }, + { + "epoch": 7.584100675260896, + "grad_norm": 0.22413180768489838, + "learning_rate": 1.4544633552937836e-05, + "loss": 1.7208, + "step": 24709 + }, + { + "epoch": 7.584407612031922, + "grad_norm": 0.16147246956825256, + "learning_rate": 1.4541128983611445e-05, + "loss": 1.7021, + "step": 24710 + }, + { + "epoch": 7.584714548802946, + "grad_norm": 0.17363815009593964, + "learning_rate": 1.4537624764710439e-05, + "loss": 1.6863, + "step": 24711 + }, + { + "epoch": 7.5850214855739715, + "grad_norm": 0.14971798658370972, + "learning_rate": 1.4534120896269377e-05, + "loss": 1.655, + "step": 24712 + }, + { + "epoch": 7.585328422344997, + "grad_norm": 0.15934213995933533, + "learning_rate": 1.4530617378322937e-05, + "loss": 1.6771, + "step": 24713 + }, + { + "epoch": 7.585635359116022, + "grad_norm": 0.17807291448116302, + "learning_rate": 1.4527114210905724e-05, + "loss": 1.7419, + "step": 24714 + }, + { + "epoch": 7.5859422958870475, + "grad_norm": 0.1727002114057541, + "learning_rate": 1.4523611394052356e-05, + "loss": 1.7232, + "step": 24715 + }, + { + "epoch": 7.586249232658073, + "grad_norm": 0.1625738888978958, + "learning_rate": 1.452010892779746e-05, + "loss": 1.6967, + "step": 24716 + }, + { + "epoch": 7.586556169429097, + "grad_norm": 0.2153816670179367, + "learning_rate": 1.4516606812175636e-05, + "loss": 1.7339, + "step": 24717 + }, + { + "epoch": 7.586863106200123, + "grad_norm": 0.19343912601470947, + "learning_rate": 1.451310504722151e-05, + "loss": 1.7059, + "step": 24718 + }, + { + "epoch": 7.587170042971148, + "grad_norm": 0.16220279037952423, + "learning_rate": 1.450960363296967e-05, + "loss": 1.6825, + "step": 24719 + }, + { + "epoch": 7.587476979742173, + "grad_norm": 0.1678459346294403, + "learning_rate": 1.4506102569454716e-05, + 
"loss": 1.728, + "step": 24720 + }, + { + "epoch": 7.587783916513199, + "grad_norm": 0.19833502173423767, + "learning_rate": 1.4502601856711295e-05, + "loss": 1.7733, + "step": 24721 + }, + { + "epoch": 7.588090853284223, + "grad_norm": 0.1593111902475357, + "learning_rate": 1.4499101494773931e-05, + "loss": 1.7017, + "step": 24722 + }, + { + "epoch": 7.588397790055248, + "grad_norm": 0.2083328664302826, + "learning_rate": 1.449560148367729e-05, + "loss": 1.7661, + "step": 24723 + }, + { + "epoch": 7.588704726826274, + "grad_norm": 0.19797182083129883, + "learning_rate": 1.4492101823455906e-05, + "loss": 1.788, + "step": 24724 + }, + { + "epoch": 7.589011663597299, + "grad_norm": 0.15613096952438354, + "learning_rate": 1.4488602514144373e-05, + "loss": 1.7295, + "step": 24725 + }, + { + "epoch": 7.589318600368324, + "grad_norm": 0.18078529834747314, + "learning_rate": 1.4485103555777307e-05, + "loss": 1.7165, + "step": 24726 + }, + { + "epoch": 7.58962553713935, + "grad_norm": 0.14951148629188538, + "learning_rate": 1.4481604948389238e-05, + "loss": 1.6431, + "step": 24727 + }, + { + "epoch": 7.589932473910374, + "grad_norm": 0.19518490135669708, + "learning_rate": 1.4478106692014797e-05, + "loss": 1.7332, + "step": 24728 + }, + { + "epoch": 7.5902394106813995, + "grad_norm": 0.17438004910945892, + "learning_rate": 1.4474608786688493e-05, + "loss": 1.6677, + "step": 24729 + }, + { + "epoch": 7.590546347452425, + "grad_norm": 0.2767544090747833, + "learning_rate": 1.4471111232444944e-05, + "loss": 1.7649, + "step": 24730 + }, + { + "epoch": 7.59085328422345, + "grad_norm": 0.21649987995624542, + "learning_rate": 1.4467614029318699e-05, + "loss": 1.7349, + "step": 24731 + }, + { + "epoch": 7.5911602209944755, + "grad_norm": 0.26566463708877563, + "learning_rate": 1.4464117177344316e-05, + "loss": 1.7474, + "step": 24732 + }, + { + "epoch": 7.5914671577655, + "grad_norm": 0.19050925970077515, + "learning_rate": 1.4460620676556358e-05, + "loss": 1.7066, + "step": 24733 
+ }, + { + "epoch": 7.591774094536525, + "grad_norm": 0.20030665397644043, + "learning_rate": 1.4457124526989375e-05, + "loss": 1.6589, + "step": 24734 + }, + { + "epoch": 7.592081031307551, + "grad_norm": 0.18715742230415344, + "learning_rate": 1.4453628728677921e-05, + "loss": 1.7186, + "step": 24735 + }, + { + "epoch": 7.592387968078576, + "grad_norm": 0.241498664021492, + "learning_rate": 1.4450133281656542e-05, + "loss": 1.6686, + "step": 24736 + }, + { + "epoch": 7.592694904849601, + "grad_norm": 0.20305299758911133, + "learning_rate": 1.4446638185959765e-05, + "loss": 1.7351, + "step": 24737 + }, + { + "epoch": 7.593001841620627, + "grad_norm": 0.177521750330925, + "learning_rate": 1.444314344162218e-05, + "loss": 1.6383, + "step": 24738 + }, + { + "epoch": 7.593308778391651, + "grad_norm": 0.19877439737319946, + "learning_rate": 1.443964904867826e-05, + "loss": 1.7335, + "step": 24739 + }, + { + "epoch": 7.593615715162676, + "grad_norm": 0.16544201970100403, + "learning_rate": 1.4436155007162605e-05, + "loss": 1.6952, + "step": 24740 + }, + { + "epoch": 7.593922651933702, + "grad_norm": 0.20925499498844147, + "learning_rate": 1.443266131710969e-05, + "loss": 1.7042, + "step": 24741 + }, + { + "epoch": 7.594229588704727, + "grad_norm": 0.16688574850559235, + "learning_rate": 1.4429167978554054e-05, + "loss": 1.6797, + "step": 24742 + }, + { + "epoch": 7.5945365254757515, + "grad_norm": 0.2231293022632599, + "learning_rate": 1.4425674991530258e-05, + "loss": 1.8697, + "step": 24743 + }, + { + "epoch": 7.594843462246777, + "grad_norm": 0.2114260196685791, + "learning_rate": 1.442218235607276e-05, + "loss": 1.7404, + "step": 24744 + }, + { + "epoch": 7.595150399017802, + "grad_norm": 0.1842830628156662, + "learning_rate": 1.441869007221614e-05, + "loss": 1.7687, + "step": 24745 + }, + { + "epoch": 7.5954573357888275, + "grad_norm": 0.17780441045761108, + "learning_rate": 1.4415198139994846e-05, + "loss": 1.7492, + "step": 24746 + }, + { + "epoch": 
7.595764272559853, + "grad_norm": 0.18805068731307983, + "learning_rate": 1.4411706559443438e-05, + "loss": 1.757, + "step": 24747 + }, + { + "epoch": 7.596071209330878, + "grad_norm": 0.18918974697589874, + "learning_rate": 1.4408215330596403e-05, + "loss": 1.7006, + "step": 24748 + }, + { + "epoch": 7.596378146101903, + "grad_norm": 0.17850689589977264, + "learning_rate": 1.440472445348825e-05, + "loss": 1.6565, + "step": 24749 + }, + { + "epoch": 7.596685082872928, + "grad_norm": 0.20043544471263885, + "learning_rate": 1.4401233928153468e-05, + "loss": 1.7314, + "step": 24750 + }, + { + "epoch": 7.596992019643953, + "grad_norm": 0.1963229477405548, + "learning_rate": 1.4397743754626564e-05, + "loss": 1.6946, + "step": 24751 + }, + { + "epoch": 7.597298956414979, + "grad_norm": 0.2203695923089981, + "learning_rate": 1.4394253932942014e-05, + "loss": 1.7128, + "step": 24752 + }, + { + "epoch": 7.597605893186004, + "grad_norm": 0.19254128634929657, + "learning_rate": 1.4390764463134322e-05, + "loss": 1.6748, + "step": 24753 + }, + { + "epoch": 7.597912829957028, + "grad_norm": 0.19880495965480804, + "learning_rate": 1.438727534523795e-05, + "loss": 1.7155, + "step": 24754 + }, + { + "epoch": 7.598219766728054, + "grad_norm": 0.17486177384853363, + "learning_rate": 1.4383786579287428e-05, + "loss": 1.7484, + "step": 24755 + }, + { + "epoch": 7.598526703499079, + "grad_norm": 0.17247791588306427, + "learning_rate": 1.4380298165317168e-05, + "loss": 1.7225, + "step": 24756 + }, + { + "epoch": 7.598833640270104, + "grad_norm": 0.1802847534418106, + "learning_rate": 1.4376810103361714e-05, + "loss": 1.7009, + "step": 24757 + }, + { + "epoch": 7.59914057704113, + "grad_norm": 0.1934153437614441, + "learning_rate": 1.4373322393455485e-05, + "loss": 1.6957, + "step": 24758 + }, + { + "epoch": 7.599447513812155, + "grad_norm": 0.1508229374885559, + "learning_rate": 1.436983503563295e-05, + "loss": 1.6677, + "step": 24759 + }, + { + "epoch": 7.5997544505831796, + 
"grad_norm": 0.16684283316135406, + "learning_rate": 1.4366348029928623e-05, + "loss": 1.7394, + "step": 24760 + }, + { + "epoch": 7.600061387354205, + "grad_norm": 0.22492031753063202, + "learning_rate": 1.4362861376376896e-05, + "loss": 1.7302, + "step": 24761 + }, + { + "epoch": 7.60036832412523, + "grad_norm": 0.1654716283082962, + "learning_rate": 1.4359375075012294e-05, + "loss": 1.6487, + "step": 24762 + }, + { + "epoch": 7.600675260896256, + "grad_norm": 0.17514392733573914, + "learning_rate": 1.4355889125869198e-05, + "loss": 1.6952, + "step": 24763 + }, + { + "epoch": 7.600982197667281, + "grad_norm": 0.21000738441944122, + "learning_rate": 1.4352403528982123e-05, + "loss": 1.714, + "step": 24764 + }, + { + "epoch": 7.601289134438305, + "grad_norm": 0.18791960179805756, + "learning_rate": 1.4348918284385481e-05, + "loss": 1.7334, + "step": 24765 + }, + { + "epoch": 7.601596071209331, + "grad_norm": 0.267089307308197, + "learning_rate": 1.4345433392113734e-05, + "loss": 1.7567, + "step": 24766 + }, + { + "epoch": 7.601903007980356, + "grad_norm": 0.1814621239900589, + "learning_rate": 1.4341948852201304e-05, + "loss": 1.7031, + "step": 24767 + }, + { + "epoch": 7.602209944751381, + "grad_norm": 0.16144737601280212, + "learning_rate": 1.4338464664682639e-05, + "loss": 1.6844, + "step": 24768 + }, + { + "epoch": 7.602516881522407, + "grad_norm": 0.14824162423610687, + "learning_rate": 1.433498082959217e-05, + "loss": 1.6854, + "step": 24769 + }, + { + "epoch": 7.602823818293432, + "grad_norm": 0.1837405115365982, + "learning_rate": 1.4331497346964318e-05, + "loss": 1.7087, + "step": 24770 + }, + { + "epoch": 7.6031307550644565, + "grad_norm": 0.20706148445606232, + "learning_rate": 1.4328014216833508e-05, + "loss": 1.7816, + "step": 24771 + }, + { + "epoch": 7.603437691835482, + "grad_norm": 0.16134382784366608, + "learning_rate": 1.4324531439234196e-05, + "loss": 1.7095, + "step": 24772 + }, + { + "epoch": 7.603744628606507, + "grad_norm": 
0.15924426913261414, + "learning_rate": 1.4321049014200737e-05, + "loss": 1.7115, + "step": 24773 + }, + { + "epoch": 7.6040515653775325, + "grad_norm": 0.14942041039466858, + "learning_rate": 1.4317566941767625e-05, + "loss": 1.6872, + "step": 24774 + }, + { + "epoch": 7.604358502148557, + "grad_norm": 0.1646505445241928, + "learning_rate": 1.4314085221969209e-05, + "loss": 1.663, + "step": 24775 + }, + { + "epoch": 7.604665438919582, + "grad_norm": 0.17342600226402283, + "learning_rate": 1.4310603854839904e-05, + "loss": 1.7702, + "step": 24776 + }, + { + "epoch": 7.604972375690608, + "grad_norm": 0.17148490250110626, + "learning_rate": 1.4307122840414167e-05, + "loss": 1.7392, + "step": 24777 + }, + { + "epoch": 7.605279312461633, + "grad_norm": 0.22112305462360382, + "learning_rate": 1.4303642178726328e-05, + "loss": 1.6784, + "step": 24778 + }, + { + "epoch": 7.605586249232658, + "grad_norm": 0.22548529505729675, + "learning_rate": 1.4300161869810846e-05, + "loss": 1.7405, + "step": 24779 + }, + { + "epoch": 7.605893186003684, + "grad_norm": 0.179958313703537, + "learning_rate": 1.4296681913702065e-05, + "loss": 1.6848, + "step": 24780 + }, + { + "epoch": 7.606200122774708, + "grad_norm": 0.16872282326221466, + "learning_rate": 1.4293202310434407e-05, + "loss": 1.6973, + "step": 24781 + }, + { + "epoch": 7.606507059545733, + "grad_norm": 0.20554648339748383, + "learning_rate": 1.428972306004226e-05, + "loss": 1.7111, + "step": 24782 + }, + { + "epoch": 7.606813996316759, + "grad_norm": 0.1803034543991089, + "learning_rate": 1.4286244162559993e-05, + "loss": 1.6895, + "step": 24783 + }, + { + "epoch": 7.607120933087784, + "grad_norm": 0.18902915716171265, + "learning_rate": 1.4282765618021999e-05, + "loss": 1.766, + "step": 24784 + }, + { + "epoch": 7.607427869858809, + "grad_norm": 0.16692081093788147, + "learning_rate": 1.4279287426462646e-05, + "loss": 1.688, + "step": 24785 + }, + { + "epoch": 7.607734806629834, + "grad_norm": 0.1538083851337433, + 
"learning_rate": 1.4275809587916317e-05, + "loss": 1.6611, + "step": 24786 + }, + { + "epoch": 7.608041743400859, + "grad_norm": 0.1921710968017578, + "learning_rate": 1.4272332102417369e-05, + "loss": 1.7338, + "step": 24787 + }, + { + "epoch": 7.6083486801718845, + "grad_norm": 0.1812380999326706, + "learning_rate": 1.4268854970000167e-05, + "loss": 1.7613, + "step": 24788 + }, + { + "epoch": 7.60865561694291, + "grad_norm": 0.1762949675321579, + "learning_rate": 1.4265378190699108e-05, + "loss": 1.6796, + "step": 24789 + }, + { + "epoch": 7.608962553713935, + "grad_norm": 0.17698180675506592, + "learning_rate": 1.4261901764548497e-05, + "loss": 1.7065, + "step": 24790 + }, + { + "epoch": 7.6092694904849605, + "grad_norm": 0.18398644030094147, + "learning_rate": 1.4258425691582756e-05, + "loss": 1.7322, + "step": 24791 + }, + { + "epoch": 7.609576427255985, + "grad_norm": 0.18370044231414795, + "learning_rate": 1.425494997183618e-05, + "loss": 1.7565, + "step": 24792 + }, + { + "epoch": 7.60988336402701, + "grad_norm": 0.19615988433361053, + "learning_rate": 1.4251474605343124e-05, + "loss": 1.7507, + "step": 24793 + }, + { + "epoch": 7.610190300798036, + "grad_norm": 0.17218533158302307, + "learning_rate": 1.4247999592137979e-05, + "loss": 1.6692, + "step": 24794 + }, + { + "epoch": 7.610497237569061, + "grad_norm": 0.19105172157287598, + "learning_rate": 1.4244524932255027e-05, + "loss": 1.7421, + "step": 24795 + }, + { + "epoch": 7.610804174340086, + "grad_norm": 0.21565218269824982, + "learning_rate": 1.424105062572867e-05, + "loss": 1.7143, + "step": 24796 + }, + { + "epoch": 7.611111111111111, + "grad_norm": 0.17394152283668518, + "learning_rate": 1.4237576672593178e-05, + "loss": 1.7202, + "step": 24797 + }, + { + "epoch": 7.611418047882136, + "grad_norm": 0.18680404126644135, + "learning_rate": 1.4234103072882926e-05, + "loss": 1.7155, + "step": 24798 + }, + { + "epoch": 7.611724984653161, + "grad_norm": 0.16173312067985535, + "learning_rate": 
1.4230629826632237e-05, + "loss": 1.6549, + "step": 24799 + }, + { + "epoch": 7.612031921424187, + "grad_norm": 0.2055300772190094, + "learning_rate": 1.4227156933875423e-05, + "loss": 1.7382, + "step": 24800 + }, + { + "epoch": 7.612338858195212, + "grad_norm": 0.17331050336360931, + "learning_rate": 1.4223684394646813e-05, + "loss": 1.719, + "step": 24801 + }, + { + "epoch": 7.612645794966237, + "grad_norm": 0.23106786608695984, + "learning_rate": 1.4220212208980727e-05, + "loss": 1.7083, + "step": 24802 + }, + { + "epoch": 7.612952731737262, + "grad_norm": 0.21011751890182495, + "learning_rate": 1.4216740376911469e-05, + "loss": 1.7629, + "step": 24803 + }, + { + "epoch": 7.613259668508287, + "grad_norm": 0.15120279788970947, + "learning_rate": 1.4213268898473359e-05, + "loss": 1.673, + "step": 24804 + }, + { + "epoch": 7.6135666052793125, + "grad_norm": 0.17431862652301788, + "learning_rate": 1.4209797773700684e-05, + "loss": 1.672, + "step": 24805 + }, + { + "epoch": 7.613873542050338, + "grad_norm": 0.1592133790254593, + "learning_rate": 1.42063270026278e-05, + "loss": 1.7102, + "step": 24806 + }, + { + "epoch": 7.614180478821363, + "grad_norm": 0.22535641491413116, + "learning_rate": 1.4202856585288954e-05, + "loss": 1.7177, + "step": 24807 + }, + { + "epoch": 7.614487415592388, + "grad_norm": 0.2111314982175827, + "learning_rate": 1.4199386521718455e-05, + "loss": 1.7399, + "step": 24808 + }, + { + "epoch": 7.614794352363413, + "grad_norm": 0.18377532064914703, + "learning_rate": 1.419591681195061e-05, + "loss": 1.6713, + "step": 24809 + }, + { + "epoch": 7.615101289134438, + "grad_norm": 0.19743949174880981, + "learning_rate": 1.4192447456019681e-05, + "loss": 1.7761, + "step": 24810 + }, + { + "epoch": 7.615408225905464, + "grad_norm": 0.17827409505844116, + "learning_rate": 1.4188978453960006e-05, + "loss": 1.7091, + "step": 24811 + }, + { + "epoch": 7.615715162676489, + "grad_norm": 0.18304505944252014, + "learning_rate": 1.4185509805805802e-05, + 
"loss": 1.7496, + "step": 24812 + }, + { + "epoch": 7.616022099447514, + "grad_norm": 0.19510503113269806, + "learning_rate": 1.4182041511591415e-05, + "loss": 1.7436, + "step": 24813 + }, + { + "epoch": 7.616329036218539, + "grad_norm": 0.17127136886119843, + "learning_rate": 1.4178573571351056e-05, + "loss": 1.6598, + "step": 24814 + }, + { + "epoch": 7.616635972989564, + "grad_norm": 0.20133370161056519, + "learning_rate": 1.4175105985119041e-05, + "loss": 1.7802, + "step": 24815 + }, + { + "epoch": 7.616942909760589, + "grad_norm": 0.17706145346164703, + "learning_rate": 1.4171638752929634e-05, + "loss": 1.7105, + "step": 24816 + }, + { + "epoch": 7.617249846531615, + "grad_norm": 0.179647758603096, + "learning_rate": 1.4168171874817088e-05, + "loss": 1.732, + "step": 24817 + }, + { + "epoch": 7.617556783302639, + "grad_norm": 0.16380085051059723, + "learning_rate": 1.4164705350815665e-05, + "loss": 1.6671, + "step": 24818 + }, + { + "epoch": 7.6178637200736645, + "grad_norm": 0.19407404959201813, + "learning_rate": 1.4161239180959635e-05, + "loss": 1.7261, + "step": 24819 + }, + { + "epoch": 7.61817065684469, + "grad_norm": 0.1647375524044037, + "learning_rate": 1.415777336528324e-05, + "loss": 1.7438, + "step": 24820 + }, + { + "epoch": 7.618477593615715, + "grad_norm": 0.21532754600048065, + "learning_rate": 1.4154307903820735e-05, + "loss": 1.7674, + "step": 24821 + }, + { + "epoch": 7.6187845303867405, + "grad_norm": 0.1834939867258072, + "learning_rate": 1.4150842796606372e-05, + "loss": 1.7027, + "step": 24822 + }, + { + "epoch": 7.619091467157766, + "grad_norm": 0.15102218091487885, + "learning_rate": 1.4147378043674397e-05, + "loss": 1.6858, + "step": 24823 + }, + { + "epoch": 7.61939840392879, + "grad_norm": 0.161713644862175, + "learning_rate": 1.4143913645059038e-05, + "loss": 1.7149, + "step": 24824 + }, + { + "epoch": 7.619705340699816, + "grad_norm": 0.15568867325782776, + "learning_rate": 1.4140449600794547e-05, + "loss": 1.6642, + "step": 24825 
+ }, + { + "epoch": 7.620012277470841, + "grad_norm": 0.15993504226207733, + "learning_rate": 1.4136985910915147e-05, + "loss": 1.6497, + "step": 24826 + }, + { + "epoch": 7.620319214241866, + "grad_norm": 0.16981028020381927, + "learning_rate": 1.4133522575455055e-05, + "loss": 1.7347, + "step": 24827 + }, + { + "epoch": 7.620626151012892, + "grad_norm": 0.16143053770065308, + "learning_rate": 1.4130059594448547e-05, + "loss": 1.7166, + "step": 24828 + }, + { + "epoch": 7.620933087783916, + "grad_norm": 0.16914571821689606, + "learning_rate": 1.4126596967929789e-05, + "loss": 1.7008, + "step": 24829 + }, + { + "epoch": 7.621240024554941, + "grad_norm": 0.20040032267570496, + "learning_rate": 1.4123134695933049e-05, + "loss": 1.7099, + "step": 24830 + }, + { + "epoch": 7.621546961325967, + "grad_norm": 0.17086143791675568, + "learning_rate": 1.4119672778492493e-05, + "loss": 1.6913, + "step": 24831 + }, + { + "epoch": 7.621853898096992, + "grad_norm": 0.16268399357795715, + "learning_rate": 1.4116211215642378e-05, + "loss": 1.6919, + "step": 24832 + }, + { + "epoch": 7.622160834868017, + "grad_norm": 0.21211197972297668, + "learning_rate": 1.4112750007416891e-05, + "loss": 1.7493, + "step": 24833 + }, + { + "epoch": 7.622467771639043, + "grad_norm": 0.16767694056034088, + "learning_rate": 1.4109289153850247e-05, + "loss": 1.6863, + "step": 24834 + }, + { + "epoch": 7.622774708410067, + "grad_norm": 0.1769869178533554, + "learning_rate": 1.4105828654976639e-05, + "loss": 1.7303, + "step": 24835 + }, + { + "epoch": 7.6230816451810925, + "grad_norm": 0.2202748954296112, + "learning_rate": 1.4102368510830278e-05, + "loss": 1.7648, + "step": 24836 + }, + { + "epoch": 7.623388581952118, + "grad_norm": 0.18347454071044922, + "learning_rate": 1.4098908721445342e-05, + "loss": 1.7615, + "step": 24837 + }, + { + "epoch": 7.623695518723143, + "grad_norm": 0.17966698110103607, + "learning_rate": 1.4095449286856039e-05, + "loss": 1.7031, + "step": 24838 + }, + { + "epoch": 
7.6240024554941686, + "grad_norm": 0.1794397532939911, + "learning_rate": 1.409199020709655e-05, + "loss": 1.7129, + "step": 24839 + }, + { + "epoch": 7.624309392265193, + "grad_norm": 0.1838780641555786, + "learning_rate": 1.4088531482201056e-05, + "loss": 1.6936, + "step": 24840 + }, + { + "epoch": 7.624616329036218, + "grad_norm": 0.1940378099679947, + "learning_rate": 1.4085073112203745e-05, + "loss": 1.71, + "step": 24841 + }, + { + "epoch": 7.624923265807244, + "grad_norm": 0.17340345680713654, + "learning_rate": 1.4081615097138796e-05, + "loss": 1.711, + "step": 24842 + }, + { + "epoch": 7.625230202578269, + "grad_norm": 0.23193266987800598, + "learning_rate": 1.4078157437040374e-05, + "loss": 1.7366, + "step": 24843 + }, + { + "epoch": 7.625537139349294, + "grad_norm": 0.1742531955242157, + "learning_rate": 1.4074700131942653e-05, + "loss": 1.7179, + "step": 24844 + }, + { + "epoch": 7.62584407612032, + "grad_norm": 0.22453147172927856, + "learning_rate": 1.4071243181879806e-05, + "loss": 1.708, + "step": 24845 + }, + { + "epoch": 7.626151012891344, + "grad_norm": 0.16176854074001312, + "learning_rate": 1.4067786586885977e-05, + "loss": 1.7012, + "step": 24846 + }, + { + "epoch": 7.6264579496623695, + "grad_norm": 0.16796015202999115, + "learning_rate": 1.4064330346995369e-05, + "loss": 1.6918, + "step": 24847 + }, + { + "epoch": 7.626764886433395, + "grad_norm": 0.1737142950296402, + "learning_rate": 1.4060874462242085e-05, + "loss": 1.6908, + "step": 24848 + }, + { + "epoch": 7.62707182320442, + "grad_norm": 0.1697089523077011, + "learning_rate": 1.4057418932660315e-05, + "loss": 1.6811, + "step": 24849 + }, + { + "epoch": 7.627378759975445, + "grad_norm": 0.19860011339187622, + "learning_rate": 1.40539637582842e-05, + "loss": 1.7803, + "step": 24850 + }, + { + "epoch": 7.62768569674647, + "grad_norm": 0.16383512318134308, + "learning_rate": 1.4050508939147883e-05, + "loss": 1.7004, + "step": 24851 + }, + { + "epoch": 7.627992633517495, + "grad_norm": 
0.18878768384456635, + "learning_rate": 1.404705447528551e-05, + "loss": 1.6916, + "step": 24852 + }, + { + "epoch": 7.628299570288521, + "grad_norm": 0.1417449563741684, + "learning_rate": 1.4043600366731213e-05, + "loss": 1.6908, + "step": 24853 + }, + { + "epoch": 7.628606507059546, + "grad_norm": 0.19786077737808228, + "learning_rate": 1.4040146613519134e-05, + "loss": 1.7307, + "step": 24854 + }, + { + "epoch": 7.628913443830571, + "grad_norm": 0.17295710742473602, + "learning_rate": 1.40366932156834e-05, + "loss": 1.7111, + "step": 24855 + }, + { + "epoch": 7.629220380601596, + "grad_norm": 0.2160167098045349, + "learning_rate": 1.4033240173258144e-05, + "loss": 1.71, + "step": 24856 + }, + { + "epoch": 7.629527317372621, + "grad_norm": 0.1741226315498352, + "learning_rate": 1.402978748627749e-05, + "loss": 1.7024, + "step": 24857 + }, + { + "epoch": 7.629834254143646, + "grad_norm": 0.18043182790279388, + "learning_rate": 1.4026335154775561e-05, + "loss": 1.7046, + "step": 24858 + }, + { + "epoch": 7.630141190914672, + "grad_norm": 0.1592903584241867, + "learning_rate": 1.4022883178786472e-05, + "loss": 1.6913, + "step": 24859 + }, + { + "epoch": 7.630448127685697, + "grad_norm": 0.25504007935523987, + "learning_rate": 1.4019431558344337e-05, + "loss": 1.7221, + "step": 24860 + }, + { + "epoch": 7.6307550644567215, + "grad_norm": 0.15307627618312836, + "learning_rate": 1.4015980293483272e-05, + "loss": 1.6725, + "step": 24861 + }, + { + "epoch": 7.631062001227747, + "grad_norm": 0.2595232129096985, + "learning_rate": 1.4012529384237372e-05, + "loss": 1.7309, + "step": 24862 + }, + { + "epoch": 7.631368937998772, + "grad_norm": 0.19494156539440155, + "learning_rate": 1.4009078830640743e-05, + "loss": 1.737, + "step": 24863 + }, + { + "epoch": 7.6316758747697975, + "grad_norm": 0.19264118373394012, + "learning_rate": 1.4005628632727518e-05, + "loss": 1.7337, + "step": 24864 + }, + { + "epoch": 7.631982811540823, + "grad_norm": 0.18758688867092133, + 
"learning_rate": 1.400217879053174e-05, + "loss": 1.684, + "step": 24865 + }, + { + "epoch": 7.632289748311848, + "grad_norm": 0.17094476521015167, + "learning_rate": 1.399872930408756e-05, + "loss": 1.6724, + "step": 24866 + }, + { + "epoch": 7.632596685082873, + "grad_norm": 0.18967430293560028, + "learning_rate": 1.3995280173429003e-05, + "loss": 1.6852, + "step": 24867 + }, + { + "epoch": 7.632903621853898, + "grad_norm": 0.1686837375164032, + "learning_rate": 1.399183139859021e-05, + "loss": 1.6673, + "step": 24868 + }, + { + "epoch": 7.633210558624923, + "grad_norm": 0.19091126322746277, + "learning_rate": 1.398838297960524e-05, + "loss": 1.7423, + "step": 24869 + }, + { + "epoch": 7.633517495395949, + "grad_norm": 0.20197629928588867, + "learning_rate": 1.3984934916508186e-05, + "loss": 1.7217, + "step": 24870 + }, + { + "epoch": 7.633824432166974, + "grad_norm": 0.1490679830312729, + "learning_rate": 1.3981487209333105e-05, + "loss": 1.6367, + "step": 24871 + }, + { + "epoch": 7.634131368937998, + "grad_norm": 0.14664824306964874, + "learning_rate": 1.3978039858114084e-05, + "loss": 1.68, + "step": 24872 + }, + { + "epoch": 7.634438305709024, + "grad_norm": 0.19181138277053833, + "learning_rate": 1.3974592862885182e-05, + "loss": 1.766, + "step": 24873 + }, + { + "epoch": 7.634745242480049, + "grad_norm": 0.17716391384601593, + "learning_rate": 1.397114622368047e-05, + "loss": 1.7479, + "step": 24874 + }, + { + "epoch": 7.635052179251074, + "grad_norm": 0.16603589057922363, + "learning_rate": 1.3967699940534006e-05, + "loss": 1.6455, + "step": 24875 + }, + { + "epoch": 7.6353591160221, + "grad_norm": 0.19060885906219482, + "learning_rate": 1.3964254013479855e-05, + "loss": 1.7367, + "step": 24876 + }, + { + "epoch": 7.635666052793125, + "grad_norm": 0.18182092905044556, + "learning_rate": 1.3960808442552064e-05, + "loss": 1.7235, + "step": 24877 + }, + { + "epoch": 7.6359729895641495, + "grad_norm": 0.22578656673431396, + "learning_rate": 
1.3957363227784691e-05, + "loss": 1.7229, + "step": 24878 + }, + { + "epoch": 7.636279926335175, + "grad_norm": 0.25397053360939026, + "learning_rate": 1.3953918369211776e-05, + "loss": 1.7094, + "step": 24879 + }, + { + "epoch": 7.6365868631062, + "grad_norm": 0.164917454123497, + "learning_rate": 1.3950473866867353e-05, + "loss": 1.695, + "step": 24880 + }, + { + "epoch": 7.6368937998772255, + "grad_norm": 0.18737520277500153, + "learning_rate": 1.3947029720785503e-05, + "loss": 1.6719, + "step": 24881 + }, + { + "epoch": 7.637200736648251, + "grad_norm": 0.1839492917060852, + "learning_rate": 1.3943585931000213e-05, + "loss": 1.7136, + "step": 24882 + }, + { + "epoch": 7.637507673419275, + "grad_norm": 0.17182856798171997, + "learning_rate": 1.3940142497545566e-05, + "loss": 1.678, + "step": 24883 + }, + { + "epoch": 7.637814610190301, + "grad_norm": 0.20733827352523804, + "learning_rate": 1.393669942045554e-05, + "loss": 1.6398, + "step": 24884 + }, + { + "epoch": 7.638121546961326, + "grad_norm": 0.19326196610927582, + "learning_rate": 1.3933256699764196e-05, + "loss": 1.7351, + "step": 24885 + }, + { + "epoch": 7.638428483732351, + "grad_norm": 0.2368818074464798, + "learning_rate": 1.3929814335505552e-05, + "loss": 1.7567, + "step": 24886 + }, + { + "epoch": 7.638735420503377, + "grad_norm": 0.16702532768249512, + "learning_rate": 1.3926372327713626e-05, + "loss": 1.6791, + "step": 24887 + }, + { + "epoch": 7.639042357274402, + "grad_norm": 0.18634511530399323, + "learning_rate": 1.3922930676422435e-05, + "loss": 1.691, + "step": 24888 + }, + { + "epoch": 7.639349294045426, + "grad_norm": 0.19349521398544312, + "learning_rate": 1.3919489381665985e-05, + "loss": 1.7037, + "step": 24889 + }, + { + "epoch": 7.639656230816452, + "grad_norm": 0.16760465502738953, + "learning_rate": 1.3916048443478286e-05, + "loss": 1.6871, + "step": 24890 + }, + { + "epoch": 7.639963167587477, + "grad_norm": 0.25489017367362976, + "learning_rate": 1.3912607861893351e-05, + 
"loss": 1.6914, + "step": 24891 + }, + { + "epoch": 7.640270104358502, + "grad_norm": 0.17488406598567963, + "learning_rate": 1.390916763694517e-05, + "loss": 1.6826, + "step": 24892 + }, + { + "epoch": 7.640577041129527, + "grad_norm": 0.2128411829471588, + "learning_rate": 1.3905727768667753e-05, + "loss": 1.711, + "step": 24893 + }, + { + "epoch": 7.640883977900552, + "grad_norm": 0.17478415369987488, + "learning_rate": 1.3902288257095087e-05, + "loss": 1.7174, + "step": 24894 + }, + { + "epoch": 7.6411909146715775, + "grad_norm": 0.20493042469024658, + "learning_rate": 1.3898849102261168e-05, + "loss": 1.7649, + "step": 24895 + }, + { + "epoch": 7.641497851442603, + "grad_norm": 0.16712170839309692, + "learning_rate": 1.3895410304199979e-05, + "loss": 1.6785, + "step": 24896 + }, + { + "epoch": 7.641804788213628, + "grad_norm": 0.18580594658851624, + "learning_rate": 1.3891971862945497e-05, + "loss": 1.7001, + "step": 24897 + }, + { + "epoch": 7.6421117249846535, + "grad_norm": 0.19040817022323608, + "learning_rate": 1.3888533778531737e-05, + "loss": 1.709, + "step": 24898 + }, + { + "epoch": 7.642418661755678, + "grad_norm": 0.17573465406894684, + "learning_rate": 1.3885096050992624e-05, + "loss": 1.7205, + "step": 24899 + }, + { + "epoch": 7.642725598526703, + "grad_norm": 0.19123490154743195, + "learning_rate": 1.3881658680362186e-05, + "loss": 1.6882, + "step": 24900 + }, + { + "epoch": 7.643032535297729, + "grad_norm": 0.18465565145015717, + "learning_rate": 1.387822166667434e-05, + "loss": 1.7294, + "step": 24901 + }, + { + "epoch": 7.643339472068754, + "grad_norm": 0.17927341163158417, + "learning_rate": 1.3874785009963098e-05, + "loss": 1.7625, + "step": 24902 + }, + { + "epoch": 7.643646408839779, + "grad_norm": 0.15983298420906067, + "learning_rate": 1.38713487102624e-05, + "loss": 1.6939, + "step": 24903 + }, + { + "epoch": 7.643953345610804, + "grad_norm": 0.20288127660751343, + "learning_rate": 1.3867912767606211e-05, + "loss": 1.7461, + "step": 
24904 + }, + { + "epoch": 7.644260282381829, + "grad_norm": 0.18587160110473633, + "learning_rate": 1.3864477182028484e-05, + "loss": 1.7389, + "step": 24905 + }, + { + "epoch": 7.644567219152854, + "grad_norm": 0.17089903354644775, + "learning_rate": 1.3861041953563175e-05, + "loss": 1.6697, + "step": 24906 + }, + { + "epoch": 7.64487415592388, + "grad_norm": 0.20302993059158325, + "learning_rate": 1.3857607082244228e-05, + "loss": 1.7199, + "step": 24907 + }, + { + "epoch": 7.645181092694905, + "grad_norm": 0.14781002700328827, + "learning_rate": 1.3854172568105594e-05, + "loss": 1.687, + "step": 24908 + }, + { + "epoch": 7.64548802946593, + "grad_norm": 0.17847368121147156, + "learning_rate": 1.3850738411181214e-05, + "loss": 1.6511, + "step": 24909 + }, + { + "epoch": 7.645794966236955, + "grad_norm": 0.1448936015367508, + "learning_rate": 1.3847304611505019e-05, + "loss": 1.6601, + "step": 24910 + }, + { + "epoch": 7.64610190300798, + "grad_norm": 0.19413447380065918, + "learning_rate": 1.3843871169110955e-05, + "loss": 1.6901, + "step": 24911 + }, + { + "epoch": 7.6464088397790055, + "grad_norm": 0.18118292093276978, + "learning_rate": 1.3840438084032947e-05, + "loss": 1.7574, + "step": 24912 + }, + { + "epoch": 7.646715776550031, + "grad_norm": 0.16136041283607483, + "learning_rate": 1.3837005356304921e-05, + "loss": 1.6826, + "step": 24913 + }, + { + "epoch": 7.647022713321056, + "grad_norm": 0.1773926019668579, + "learning_rate": 1.3833572985960792e-05, + "loss": 1.7136, + "step": 24914 + }, + { + "epoch": 7.647329650092081, + "grad_norm": 0.15100078284740448, + "learning_rate": 1.3830140973034522e-05, + "loss": 1.7331, + "step": 24915 + }, + { + "epoch": 7.647636586863106, + "grad_norm": 0.16588352620601654, + "learning_rate": 1.3826709317559966e-05, + "loss": 1.6883, + "step": 24916 + }, + { + "epoch": 7.647943523634131, + "grad_norm": 0.14271478354930878, + "learning_rate": 1.3823278019571106e-05, + "loss": 1.6566, + "step": 24917 + }, + { + "epoch": 
7.648250460405157, + "grad_norm": 0.18383146822452545, + "learning_rate": 1.3819847079101782e-05, + "loss": 1.7006, + "step": 24918 + }, + { + "epoch": 7.648557397176182, + "grad_norm": 0.20069970190525055, + "learning_rate": 1.3816416496185952e-05, + "loss": 1.696, + "step": 24919 + }, + { + "epoch": 7.648864333947207, + "grad_norm": 0.15686273574829102, + "learning_rate": 1.3812986270857497e-05, + "loss": 1.6998, + "step": 24920 + }, + { + "epoch": 7.649171270718232, + "grad_norm": 0.14733602106571198, + "learning_rate": 1.3809556403150326e-05, + "loss": 1.6692, + "step": 24921 + }, + { + "epoch": 7.649478207489257, + "grad_norm": 0.16720153391361237, + "learning_rate": 1.3806126893098332e-05, + "loss": 1.6841, + "step": 24922 + }, + { + "epoch": 7.649785144260282, + "grad_norm": 0.1548861712217331, + "learning_rate": 1.3802697740735404e-05, + "loss": 1.6914, + "step": 24923 + }, + { + "epoch": 7.650092081031308, + "grad_norm": 0.1591617912054062, + "learning_rate": 1.3799268946095433e-05, + "loss": 1.7121, + "step": 24924 + }, + { + "epoch": 7.650399017802332, + "grad_norm": 0.19735665619373322, + "learning_rate": 1.3795840509212305e-05, + "loss": 1.741, + "step": 24925 + }, + { + "epoch": 7.650705954573358, + "grad_norm": 0.16886921226978302, + "learning_rate": 1.37924124301199e-05, + "loss": 1.7166, + "step": 24926 + }, + { + "epoch": 7.651012891344383, + "grad_norm": 0.2084806114435196, + "learning_rate": 1.3788984708852098e-05, + "loss": 1.7525, + "step": 24927 + }, + { + "epoch": 7.651319828115408, + "grad_norm": 0.15286533534526825, + "learning_rate": 1.3785557345442773e-05, + "loss": 1.6754, + "step": 24928 + }, + { + "epoch": 7.651626764886434, + "grad_norm": 0.19647163152694702, + "learning_rate": 1.3782130339925792e-05, + "loss": 1.7114, + "step": 24929 + }, + { + "epoch": 7.651933701657459, + "grad_norm": 0.18526645004749298, + "learning_rate": 1.3778703692335031e-05, + "loss": 1.7258, + "step": 24930 + }, + { + "epoch": 7.652240638428484, + 
"grad_norm": 0.19880451261997223, + "learning_rate": 1.3775277402704334e-05, + "loss": 1.7065, + "step": 24931 + }, + { + "epoch": 7.652547575199509, + "grad_norm": 0.18702107667922974, + "learning_rate": 1.377185147106761e-05, + "loss": 1.7171, + "step": 24932 + }, + { + "epoch": 7.652854511970534, + "grad_norm": 0.1455291509628296, + "learning_rate": 1.3768425897458654e-05, + "loss": 1.6824, + "step": 24933 + }, + { + "epoch": 7.653161448741559, + "grad_norm": 0.16770213842391968, + "learning_rate": 1.3765000681911377e-05, + "loss": 1.6544, + "step": 24934 + }, + { + "epoch": 7.653468385512585, + "grad_norm": 0.18496285378932953, + "learning_rate": 1.3761575824459572e-05, + "loss": 1.7206, + "step": 24935 + }, + { + "epoch": 7.653775322283609, + "grad_norm": 0.1832813024520874, + "learning_rate": 1.3758151325137131e-05, + "loss": 1.7673, + "step": 24936 + }, + { + "epoch": 7.6540822590546345, + "grad_norm": 0.20916350185871124, + "learning_rate": 1.3754727183977878e-05, + "loss": 1.7224, + "step": 24937 + }, + { + "epoch": 7.65438919582566, + "grad_norm": 0.1878765970468521, + "learning_rate": 1.3751303401015653e-05, + "loss": 1.6966, + "step": 24938 + }, + { + "epoch": 7.654696132596685, + "grad_norm": 0.17944355309009552, + "learning_rate": 1.37478799762843e-05, + "loss": 1.6752, + "step": 24939 + }, + { + "epoch": 7.6550030693677105, + "grad_norm": 0.20930083096027374, + "learning_rate": 1.3744456909817638e-05, + "loss": 1.7632, + "step": 24940 + }, + { + "epoch": 7.655310006138736, + "grad_norm": 0.19838237762451172, + "learning_rate": 1.3741034201649511e-05, + "loss": 1.7039, + "step": 24941 + }, + { + "epoch": 7.65561694290976, + "grad_norm": 0.233023539185524, + "learning_rate": 1.373761185181373e-05, + "loss": 1.7117, + "step": 24942 + }, + { + "epoch": 7.655923879680786, + "grad_norm": 0.16270874440670013, + "learning_rate": 1.3734189860344127e-05, + "loss": 1.6603, + "step": 24943 + }, + { + "epoch": 7.656230816451811, + "grad_norm": 
0.18456563353538513, + "learning_rate": 1.373076822727451e-05, + "loss": 1.6891, + "step": 24944 + }, + { + "epoch": 7.656537753222836, + "grad_norm": 0.17064985632896423, + "learning_rate": 1.3727346952638703e-05, + "loss": 1.6788, + "step": 24945 + }, + { + "epoch": 7.656844689993862, + "grad_norm": 0.17548689246177673, + "learning_rate": 1.3723926036470513e-05, + "loss": 1.6699, + "step": 24946 + }, + { + "epoch": 7.657151626764886, + "grad_norm": 0.1660275012254715, + "learning_rate": 1.3720505478803753e-05, + "loss": 1.6706, + "step": 24947 + }, + { + "epoch": 7.657458563535911, + "grad_norm": 0.2977990508079529, + "learning_rate": 1.3717085279672199e-05, + "loss": 1.7463, + "step": 24948 + }, + { + "epoch": 7.657765500306937, + "grad_norm": 0.24440810084342957, + "learning_rate": 1.3713665439109708e-05, + "loss": 1.7528, + "step": 24949 + }, + { + "epoch": 7.658072437077962, + "grad_norm": 0.1579941064119339, + "learning_rate": 1.3710245957150015e-05, + "loss": 1.6902, + "step": 24950 + }, + { + "epoch": 7.658379373848987, + "grad_norm": 0.197731152176857, + "learning_rate": 1.3706826833826968e-05, + "loss": 1.7377, + "step": 24951 + }, + { + "epoch": 7.658686310620013, + "grad_norm": 0.16704770922660828, + "learning_rate": 1.3703408069174301e-05, + "loss": 1.7057, + "step": 24952 + }, + { + "epoch": 7.658993247391037, + "grad_norm": 0.2167888730764389, + "learning_rate": 1.3699989663225848e-05, + "loss": 1.7668, + "step": 24953 + }, + { + "epoch": 7.6593001841620625, + "grad_norm": 0.16870343685150146, + "learning_rate": 1.369657161601537e-05, + "loss": 1.6781, + "step": 24954 + }, + { + "epoch": 7.659607120933088, + "grad_norm": 0.22422032058238983, + "learning_rate": 1.3693153927576646e-05, + "loss": 1.7034, + "step": 24955 + }, + { + "epoch": 7.659914057704113, + "grad_norm": 0.20777738094329834, + "learning_rate": 1.3689736597943465e-05, + "loss": 1.7401, + "step": 24956 + }, + { + "epoch": 7.6602209944751385, + "grad_norm": 0.17802980542182922, + 
"learning_rate": 1.3686319627149579e-05, + "loss": 1.7067, + "step": 24957 + }, + { + "epoch": 7.660527931246163, + "grad_norm": 0.21444065868854523, + "learning_rate": 1.368290301522877e-05, + "loss": 1.6731, + "step": 24958 + }, + { + "epoch": 7.660834868017188, + "grad_norm": 0.17638131976127625, + "learning_rate": 1.3679486762214805e-05, + "loss": 1.738, + "step": 24959 + }, + { + "epoch": 7.661141804788214, + "grad_norm": 0.1900044083595276, + "learning_rate": 1.3676070868141432e-05, + "loss": 1.7673, + "step": 24960 + }, + { + "epoch": 7.661448741559239, + "grad_norm": 0.20749469101428986, + "learning_rate": 1.3672655333042422e-05, + "loss": 1.7341, + "step": 24961 + }, + { + "epoch": 7.661755678330264, + "grad_norm": 0.21292604506015778, + "learning_rate": 1.3669240156951518e-05, + "loss": 1.7114, + "step": 24962 + }, + { + "epoch": 7.66206261510129, + "grad_norm": 0.21506401896476746, + "learning_rate": 1.3665825339902482e-05, + "loss": 1.7412, + "step": 24963 + }, + { + "epoch": 7.662369551872314, + "grad_norm": 0.21838976442813873, + "learning_rate": 1.3662410881929055e-05, + "loss": 1.7178, + "step": 24964 + }, + { + "epoch": 7.662676488643339, + "grad_norm": 0.18973253667354584, + "learning_rate": 1.365899678306497e-05, + "loss": 1.7161, + "step": 24965 + }, + { + "epoch": 7.662983425414365, + "grad_norm": 0.19278603792190552, + "learning_rate": 1.3655583043344006e-05, + "loss": 1.6952, + "step": 24966 + }, + { + "epoch": 7.66329036218539, + "grad_norm": 0.2025471180677414, + "learning_rate": 1.365216966279984e-05, + "loss": 1.6893, + "step": 24967 + }, + { + "epoch": 7.6635972989564145, + "grad_norm": 0.14461325109004974, + "learning_rate": 1.364875664146627e-05, + "loss": 1.6762, + "step": 24968 + }, + { + "epoch": 7.66390423572744, + "grad_norm": 0.22851425409317017, + "learning_rate": 1.3645343979376962e-05, + "loss": 1.7743, + "step": 24969 + }, + { + "epoch": 7.664211172498465, + "grad_norm": 0.16862350702285767, + "learning_rate": 
1.3641931676565688e-05, + "loss": 1.6385, + "step": 24970 + }, + { + "epoch": 7.6645181092694905, + "grad_norm": 0.20482461154460907, + "learning_rate": 1.3638519733066157e-05, + "loss": 1.7824, + "step": 24971 + }, + { + "epoch": 7.664825046040516, + "grad_norm": 0.18505734205245972, + "learning_rate": 1.3635108148912085e-05, + "loss": 1.6845, + "step": 24972 + }, + { + "epoch": 7.665131982811541, + "grad_norm": 0.18774990737438202, + "learning_rate": 1.3631696924137189e-05, + "loss": 1.7091, + "step": 24973 + }, + { + "epoch": 7.665438919582566, + "grad_norm": 0.1967296153306961, + "learning_rate": 1.362828605877518e-05, + "loss": 1.6953, + "step": 24974 + }, + { + "epoch": 7.665745856353591, + "grad_norm": 0.16951262950897217, + "learning_rate": 1.3624875552859767e-05, + "loss": 1.7302, + "step": 24975 + }, + { + "epoch": 7.666052793124616, + "grad_norm": 0.21003109216690063, + "learning_rate": 1.3621465406424656e-05, + "loss": 1.7567, + "step": 24976 + }, + { + "epoch": 7.666359729895642, + "grad_norm": 0.19087877869606018, + "learning_rate": 1.361805561950354e-05, + "loss": 1.7373, + "step": 24977 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 0.17799946665763855, + "learning_rate": 1.3614646192130126e-05, + "loss": 1.7121, + "step": 24978 + }, + { + "epoch": 7.666973603437691, + "grad_norm": 0.15956062078475952, + "learning_rate": 1.3611237124338105e-05, + "loss": 1.6654, + "step": 24979 + }, + { + "epoch": 7.667280540208717, + "grad_norm": 0.1963697075843811, + "learning_rate": 1.3607828416161167e-05, + "loss": 1.7902, + "step": 24980 + }, + { + "epoch": 7.667587476979742, + "grad_norm": 0.22204460203647614, + "learning_rate": 1.3604420067632995e-05, + "loss": 1.8199, + "step": 24981 + }, + { + "epoch": 7.667894413750767, + "grad_norm": 0.20523740351200104, + "learning_rate": 1.3601012078787268e-05, + "loss": 1.7253, + "step": 24982 + }, + { + "epoch": 7.668201350521793, + "grad_norm": 0.18693773448467255, + "learning_rate": 1.3597604449657697e-05, + 
"loss": 1.7032, + "step": 24983 + }, + { + "epoch": 7.668508287292818, + "grad_norm": 0.17661312222480774, + "learning_rate": 1.3594197180277906e-05, + "loss": 1.6648, + "step": 24984 + }, + { + "epoch": 7.6688152240638425, + "grad_norm": 0.19099490344524384, + "learning_rate": 1.3590790270681631e-05, + "loss": 1.7107, + "step": 24985 + }, + { + "epoch": 7.669122160834868, + "grad_norm": 0.1854488104581833, + "learning_rate": 1.3587383720902469e-05, + "loss": 1.7241, + "step": 24986 + }, + { + "epoch": 7.669429097605893, + "grad_norm": 0.18763068318367004, + "learning_rate": 1.3583977530974146e-05, + "loss": 1.7207, + "step": 24987 + }, + { + "epoch": 7.6697360343769185, + "grad_norm": 0.15608854591846466, + "learning_rate": 1.3580571700930295e-05, + "loss": 1.6835, + "step": 24988 + }, + { + "epoch": 7.670042971147944, + "grad_norm": 0.1587948501110077, + "learning_rate": 1.3577166230804584e-05, + "loss": 1.6801, + "step": 24989 + }, + { + "epoch": 7.670349907918968, + "grad_norm": 0.21106089651584625, + "learning_rate": 1.3573761120630668e-05, + "loss": 1.7411, + "step": 24990 + }, + { + "epoch": 7.670656844689994, + "grad_norm": 0.17361705005168915, + "learning_rate": 1.3570356370442188e-05, + "loss": 1.7123, + "step": 24991 + }, + { + "epoch": 7.670963781461019, + "grad_norm": 0.16272610425949097, + "learning_rate": 1.3566951980272802e-05, + "loss": 1.7002, + "step": 24992 + }, + { + "epoch": 7.671270718232044, + "grad_norm": 0.18787643313407898, + "learning_rate": 1.3563547950156147e-05, + "loss": 1.7364, + "step": 24993 + }, + { + "epoch": 7.67157765500307, + "grad_norm": 0.18257403373718262, + "learning_rate": 1.3560144280125869e-05, + "loss": 1.6783, + "step": 24994 + }, + { + "epoch": 7.671884591774095, + "grad_norm": 0.21298269927501678, + "learning_rate": 1.3556740970215608e-05, + "loss": 1.815, + "step": 24995 + }, + { + "epoch": 7.672191528545119, + "grad_norm": 0.1805877983570099, + "learning_rate": 1.3553338020458988e-05, + "loss": 1.719, + "step": 
24996 + }, + { + "epoch": 7.672498465316145, + "grad_norm": 0.210116446018219, + "learning_rate": 1.3549935430889643e-05, + "loss": 1.7603, + "step": 24997 + }, + { + "epoch": 7.67280540208717, + "grad_norm": 0.18893682956695557, + "learning_rate": 1.35465332015412e-05, + "loss": 1.6681, + "step": 24998 + }, + { + "epoch": 7.673112338858195, + "grad_norm": 0.17718489468097687, + "learning_rate": 1.354313133244729e-05, + "loss": 1.6799, + "step": 24999 + }, + { + "epoch": 7.67341927562922, + "grad_norm": 0.20092631876468658, + "learning_rate": 1.3539729823641517e-05, + "loss": 1.7273, + "step": 25000 + }, + { + "epoch": 7.673726212400245, + "grad_norm": 0.20800542831420898, + "learning_rate": 1.353632867515749e-05, + "loss": 1.7214, + "step": 25001 + }, + { + "epoch": 7.6740331491712706, + "grad_norm": 0.2119656354188919, + "learning_rate": 1.3532927887028861e-05, + "loss": 1.6701, + "step": 25002 + }, + { + "epoch": 7.674340085942296, + "grad_norm": 0.1645115315914154, + "learning_rate": 1.3529527459289188e-05, + "loss": 1.7199, + "step": 25003 + }, + { + "epoch": 7.674647022713321, + "grad_norm": 0.24434153735637665, + "learning_rate": 1.3526127391972116e-05, + "loss": 1.7295, + "step": 25004 + }, + { + "epoch": 7.6749539594843466, + "grad_norm": 0.20978261530399323, + "learning_rate": 1.3522727685111231e-05, + "loss": 1.8069, + "step": 25005 + }, + { + "epoch": 7.675260896255372, + "grad_norm": 0.19354932010173798, + "learning_rate": 1.3519328338740128e-05, + "loss": 1.7601, + "step": 25006 + }, + { + "epoch": 7.675567833026396, + "grad_norm": 0.19636447727680206, + "learning_rate": 1.3515929352892403e-05, + "loss": 1.7871, + "step": 25007 + }, + { + "epoch": 7.675874769797422, + "grad_norm": 0.18915504217147827, + "learning_rate": 1.3512530727601653e-05, + "loss": 1.6926, + "step": 25008 + }, + { + "epoch": 7.676181706568447, + "grad_norm": 0.18168985843658447, + "learning_rate": 1.3509132462901458e-05, + "loss": 1.7272, + "step": 25009 + }, + { + "epoch": 
7.676488643339472, + "grad_norm": 0.17246222496032715, + "learning_rate": 1.3505734558825406e-05, + "loss": 1.7186, + "step": 25010 + }, + { + "epoch": 7.676795580110497, + "grad_norm": 0.2694617211818695, + "learning_rate": 1.3502337015407074e-05, + "loss": 1.8334, + "step": 25011 + }, + { + "epoch": 7.677102516881522, + "grad_norm": 0.1549377590417862, + "learning_rate": 1.3498939832680035e-05, + "loss": 1.7003, + "step": 25012 + }, + { + "epoch": 7.6774094536525475, + "grad_norm": 0.1559179425239563, + "learning_rate": 1.349554301067787e-05, + "loss": 1.7028, + "step": 25013 + }, + { + "epoch": 7.677716390423573, + "grad_norm": 0.17349909245967865, + "learning_rate": 1.3492146549434149e-05, + "loss": 1.6749, + "step": 25014 + }, + { + "epoch": 7.678023327194598, + "grad_norm": 0.19697749614715576, + "learning_rate": 1.348875044898243e-05, + "loss": 1.8291, + "step": 25015 + }, + { + "epoch": 7.6783302639656235, + "grad_norm": 0.17260968685150146, + "learning_rate": 1.3485354709356279e-05, + "loss": 1.6686, + "step": 25016 + }, + { + "epoch": 7.678637200736648, + "grad_norm": 0.16892582178115845, + "learning_rate": 1.3481959330589255e-05, + "loss": 1.755, + "step": 25017 + }, + { + "epoch": 7.678944137507673, + "grad_norm": 0.17961645126342773, + "learning_rate": 1.3478564312714898e-05, + "loss": 1.6937, + "step": 25018 + }, + { + "epoch": 7.679251074278699, + "grad_norm": 0.20795513689517975, + "learning_rate": 1.34751696557668e-05, + "loss": 1.799, + "step": 25019 + }, + { + "epoch": 7.679558011049724, + "grad_norm": 0.16439545154571533, + "learning_rate": 1.3471775359778461e-05, + "loss": 1.6942, + "step": 25020 + }, + { + "epoch": 7.679864947820749, + "grad_norm": 0.19526144862174988, + "learning_rate": 1.3468381424783472e-05, + "loss": 1.7255, + "step": 25021 + }, + { + "epoch": 7.680171884591774, + "grad_norm": 0.18183457851409912, + "learning_rate": 1.3464987850815319e-05, + "loss": 1.7027, + "step": 25022 + }, + { + "epoch": 7.680478821362799, + 
"grad_norm": 0.18443404138088226, + "learning_rate": 1.3461594637907587e-05, + "loss": 1.6973, + "step": 25023 + }, + { + "epoch": 7.680785758133824, + "grad_norm": 0.18545331060886383, + "learning_rate": 1.3458201786093794e-05, + "loss": 1.7479, + "step": 25024 + }, + { + "epoch": 7.68109269490485, + "grad_norm": 0.18329958617687225, + "learning_rate": 1.3454809295407467e-05, + "loss": 1.7301, + "step": 25025 + }, + { + "epoch": 7.681399631675875, + "grad_norm": 0.19131959974765778, + "learning_rate": 1.3451417165882136e-05, + "loss": 1.7402, + "step": 25026 + }, + { + "epoch": 7.6817065684469, + "grad_norm": 0.1782912164926529, + "learning_rate": 1.3448025397551323e-05, + "loss": 1.6771, + "step": 25027 + }, + { + "epoch": 7.682013505217925, + "grad_norm": 0.1757265031337738, + "learning_rate": 1.3444633990448546e-05, + "loss": 1.7336, + "step": 25028 + }, + { + "epoch": 7.68232044198895, + "grad_norm": 0.16550128161907196, + "learning_rate": 1.3441242944607318e-05, + "loss": 1.6335, + "step": 25029 + }, + { + "epoch": 7.6826273787599755, + "grad_norm": 0.18069832026958466, + "learning_rate": 1.3437852260061162e-05, + "loss": 1.7172, + "step": 25030 + }, + { + "epoch": 7.682934315531001, + "grad_norm": 0.21195535361766815, + "learning_rate": 1.3434461936843573e-05, + "loss": 1.7248, + "step": 25031 + }, + { + "epoch": 7.683241252302026, + "grad_norm": 0.17209839820861816, + "learning_rate": 1.3431071974988068e-05, + "loss": 1.666, + "step": 25032 + }, + { + "epoch": 7.683548189073051, + "grad_norm": 0.20565249025821686, + "learning_rate": 1.342768237452814e-05, + "loss": 1.7839, + "step": 25033 + }, + { + "epoch": 7.683855125844076, + "grad_norm": 0.2549617290496826, + "learning_rate": 1.342429313549729e-05, + "loss": 1.714, + "step": 25034 + }, + { + "epoch": 7.684162062615101, + "grad_norm": 0.1980191171169281, + "learning_rate": 1.3420904257929001e-05, + "loss": 1.7267, + "step": 25035 + }, + { + "epoch": 7.684468999386127, + "grad_norm": 0.1763298362493515, + 
"learning_rate": 1.3417515741856806e-05, + "loss": 1.6754, + "step": 25036 + }, + { + "epoch": 7.684775936157152, + "grad_norm": 0.15831413865089417, + "learning_rate": 1.341412758731413e-05, + "loss": 1.6885, + "step": 25037 + }, + { + "epoch": 7.685082872928177, + "grad_norm": 0.15696564316749573, + "learning_rate": 1.341073979433452e-05, + "loss": 1.7032, + "step": 25038 + }, + { + "epoch": 7.685389809699202, + "grad_norm": 0.19193214178085327, + "learning_rate": 1.3407352362951392e-05, + "loss": 1.7708, + "step": 25039 + }, + { + "epoch": 7.685696746470227, + "grad_norm": 0.1886630803346634, + "learning_rate": 1.3403965293198273e-05, + "loss": 1.7323, + "step": 25040 + }, + { + "epoch": 7.686003683241252, + "grad_norm": 0.16137991845607758, + "learning_rate": 1.340057858510862e-05, + "loss": 1.703, + "step": 25041 + }, + { + "epoch": 7.686310620012278, + "grad_norm": 0.21111373603343964, + "learning_rate": 1.33971922387159e-05, + "loss": 1.7428, + "step": 25042 + }, + { + "epoch": 7.686617556783302, + "grad_norm": 0.20256482064723969, + "learning_rate": 1.3393806254053582e-05, + "loss": 1.7651, + "step": 25043 + }, + { + "epoch": 7.6869244935543275, + "grad_norm": 0.19125118851661682, + "learning_rate": 1.3390420631155121e-05, + "loss": 1.7253, + "step": 25044 + }, + { + "epoch": 7.687231430325353, + "grad_norm": 0.22446562349796295, + "learning_rate": 1.3387035370053985e-05, + "loss": 1.7363, + "step": 25045 + }, + { + "epoch": 7.687538367096378, + "grad_norm": 0.17356424033641815, + "learning_rate": 1.3383650470783621e-05, + "loss": 1.7384, + "step": 25046 + }, + { + "epoch": 7.6878453038674035, + "grad_norm": 0.27287909388542175, + "learning_rate": 1.3380265933377489e-05, + "loss": 1.6754, + "step": 25047 + }, + { + "epoch": 7.688152240638429, + "grad_norm": 0.14978452026844025, + "learning_rate": 1.3376881757869032e-05, + "loss": 1.6693, + "step": 25048 + }, + { + "epoch": 7.688459177409453, + "grad_norm": 0.1746874898672104, + "learning_rate": 
1.3373497944291691e-05, + "loss": 1.6878, + "step": 25049 + }, + { + "epoch": 7.688766114180479, + "grad_norm": 0.18032371997833252, + "learning_rate": 1.3370114492678915e-05, + "loss": 1.7153, + "step": 25050 + }, + { + "epoch": 7.689073050951504, + "grad_norm": 0.23111680150032043, + "learning_rate": 1.3366731403064131e-05, + "loss": 1.7132, + "step": 25051 + }, + { + "epoch": 7.689379987722529, + "grad_norm": 0.1587868630886078, + "learning_rate": 1.3363348675480768e-05, + "loss": 1.6692, + "step": 25052 + }, + { + "epoch": 7.689686924493555, + "grad_norm": 0.14336444437503815, + "learning_rate": 1.3359966309962301e-05, + "loss": 1.6648, + "step": 25053 + }, + { + "epoch": 7.689993861264579, + "grad_norm": 0.3048984408378601, + "learning_rate": 1.3356584306542086e-05, + "loss": 1.8109, + "step": 25054 + }, + { + "epoch": 7.690300798035604, + "grad_norm": 0.19389018416404724, + "learning_rate": 1.3353202665253617e-05, + "loss": 1.6725, + "step": 25055 + }, + { + "epoch": 7.69060773480663, + "grad_norm": 0.19246982038021088, + "learning_rate": 1.3349821386130246e-05, + "loss": 1.726, + "step": 25056 + }, + { + "epoch": 7.690914671577655, + "grad_norm": 0.19062727689743042, + "learning_rate": 1.3346440469205435e-05, + "loss": 1.7685, + "step": 25057 + }, + { + "epoch": 7.69122160834868, + "grad_norm": 0.16987577080726624, + "learning_rate": 1.3343059914512585e-05, + "loss": 1.7032, + "step": 25058 + }, + { + "epoch": 7.691528545119706, + "grad_norm": 0.17328599095344543, + "learning_rate": 1.3339679722085103e-05, + "loss": 1.7271, + "step": 25059 + }, + { + "epoch": 7.69183548189073, + "grad_norm": 0.2677443325519562, + "learning_rate": 1.3336299891956405e-05, + "loss": 1.8, + "step": 25060 + }, + { + "epoch": 7.6921424186617555, + "grad_norm": 0.18369975686073303, + "learning_rate": 1.333292042415985e-05, + "loss": 1.7483, + "step": 25061 + }, + { + "epoch": 7.692449355432781, + "grad_norm": 0.17269635200500488, + "learning_rate": 1.3329541318728883e-05, + "loss": 
1.7016, + "step": 25062 + }, + { + "epoch": 7.692756292203806, + "grad_norm": 0.17280563712120056, + "learning_rate": 1.3326162575696889e-05, + "loss": 1.742, + "step": 25063 + }, + { + "epoch": 7.6930632289748315, + "grad_norm": 0.2000025361776352, + "learning_rate": 1.3322784195097243e-05, + "loss": 1.6947, + "step": 25064 + }, + { + "epoch": 7.693370165745856, + "grad_norm": 0.17853626608848572, + "learning_rate": 1.3319406176963344e-05, + "loss": 1.7075, + "step": 25065 + }, + { + "epoch": 7.693677102516881, + "grad_norm": 0.18445543944835663, + "learning_rate": 1.3316028521328571e-05, + "loss": 1.7138, + "step": 25066 + }, + { + "epoch": 7.693984039287907, + "grad_norm": 0.1965894103050232, + "learning_rate": 1.3312651228226302e-05, + "loss": 1.6904, + "step": 25067 + }, + { + "epoch": 7.694290976058932, + "grad_norm": 0.1890837699174881, + "learning_rate": 1.3309274297689923e-05, + "loss": 1.7307, + "step": 25068 + }, + { + "epoch": 7.694597912829957, + "grad_norm": 0.2157326638698578, + "learning_rate": 1.3305897729752787e-05, + "loss": 1.7466, + "step": 25069 + }, + { + "epoch": 7.694904849600983, + "grad_norm": 0.19773493707180023, + "learning_rate": 1.3302521524448302e-05, + "loss": 1.7265, + "step": 25070 + }, + { + "epoch": 7.695211786372007, + "grad_norm": 0.16688357293605804, + "learning_rate": 1.3299145681809776e-05, + "loss": 1.7049, + "step": 25071 + }, + { + "epoch": 7.695518723143032, + "grad_norm": 0.24347764253616333, + "learning_rate": 1.3295770201870639e-05, + "loss": 1.7706, + "step": 25072 + }, + { + "epoch": 7.695825659914058, + "grad_norm": 0.16198144853115082, + "learning_rate": 1.3292395084664183e-05, + "loss": 1.6873, + "step": 25073 + }, + { + "epoch": 7.696132596685083, + "grad_norm": 0.17321841418743134, + "learning_rate": 1.3289020330223806e-05, + "loss": 1.7463, + "step": 25074 + }, + { + "epoch": 7.696439533456108, + "grad_norm": 0.2611647844314575, + "learning_rate": 1.3285645938582847e-05, + "loss": 1.811, + "step": 25075 + }, 
+ { + "epoch": 7.696746470227133, + "grad_norm": 0.18129383027553558, + "learning_rate": 1.3282271909774657e-05, + "loss": 1.7257, + "step": 25076 + }, + { + "epoch": 7.697053406998158, + "grad_norm": 0.19985437393188477, + "learning_rate": 1.3278898243832588e-05, + "loss": 1.7311, + "step": 25077 + }, + { + "epoch": 7.6973603437691835, + "grad_norm": 0.21517722308635712, + "learning_rate": 1.3275524940789941e-05, + "loss": 1.7582, + "step": 25078 + }, + { + "epoch": 7.697667280540209, + "grad_norm": 0.2302769422531128, + "learning_rate": 1.32721520006801e-05, + "loss": 1.7192, + "step": 25079 + }, + { + "epoch": 7.697974217311234, + "grad_norm": 0.18356913328170776, + "learning_rate": 1.3268779423536375e-05, + "loss": 1.6916, + "step": 25080 + }, + { + "epoch": 7.6982811540822595, + "grad_norm": 0.19134142994880676, + "learning_rate": 1.3265407209392105e-05, + "loss": 1.7309, + "step": 25081 + }, + { + "epoch": 7.698588090853284, + "grad_norm": 0.17634150385856628, + "learning_rate": 1.3262035358280605e-05, + "loss": 1.7537, + "step": 25082 + }, + { + "epoch": 7.698895027624309, + "grad_norm": 0.1921558827161789, + "learning_rate": 1.325866387023521e-05, + "loss": 1.7102, + "step": 25083 + }, + { + "epoch": 7.699201964395335, + "grad_norm": 0.15972480177879333, + "learning_rate": 1.3255292745289233e-05, + "loss": 1.6759, + "step": 25084 + }, + { + "epoch": 7.69950890116636, + "grad_norm": 0.15172120928764343, + "learning_rate": 1.325192198347599e-05, + "loss": 1.6766, + "step": 25085 + }, + { + "epoch": 7.699815837937384, + "grad_norm": 0.17827558517456055, + "learning_rate": 1.3248551584828777e-05, + "loss": 1.7421, + "step": 25086 + }, + { + "epoch": 7.70012277470841, + "grad_norm": 0.1675274819135666, + "learning_rate": 1.3245181549380948e-05, + "loss": 1.701, + "step": 25087 + }, + { + "epoch": 7.700429711479435, + "grad_norm": 0.17937950789928436, + "learning_rate": 1.3241811877165744e-05, + "loss": 1.7284, + "step": 25088 + }, + { + "epoch": 
7.7007366482504604, + "grad_norm": 0.16373637318611145, + "learning_rate": 1.3238442568216535e-05, + "loss": 1.6834, + "step": 25089 + }, + { + "epoch": 7.701043585021486, + "grad_norm": 0.16055652499198914, + "learning_rate": 1.3235073622566552e-05, + "loss": 1.7087, + "step": 25090 + }, + { + "epoch": 7.701350521792511, + "grad_norm": 0.15083225071430206, + "learning_rate": 1.3231705040249131e-05, + "loss": 1.7313, + "step": 25091 + }, + { + "epoch": 7.701657458563536, + "grad_norm": 0.21110820770263672, + "learning_rate": 1.322833682129756e-05, + "loss": 1.6758, + "step": 25092 + }, + { + "epoch": 7.701964395334561, + "grad_norm": 0.18439972400665283, + "learning_rate": 1.322496896574511e-05, + "loss": 1.737, + "step": 25093 + }, + { + "epoch": 7.702271332105586, + "grad_norm": 0.18655124306678772, + "learning_rate": 1.322160147362509e-05, + "loss": 1.7268, + "step": 25094 + }, + { + "epoch": 7.702578268876612, + "grad_norm": 0.17620640993118286, + "learning_rate": 1.3218234344970725e-05, + "loss": 1.6829, + "step": 25095 + }, + { + "epoch": 7.702885205647637, + "grad_norm": 0.19085893034934998, + "learning_rate": 1.3214867579815343e-05, + "loss": 1.7382, + "step": 25096 + }, + { + "epoch": 7.703192142418661, + "grad_norm": 0.2206689864397049, + "learning_rate": 1.3211501178192203e-05, + "loss": 1.7666, + "step": 25097 + }, + { + "epoch": 7.703499079189687, + "grad_norm": 0.2047509402036667, + "learning_rate": 1.320813514013457e-05, + "loss": 1.7209, + "step": 25098 + }, + { + "epoch": 7.703806015960712, + "grad_norm": 0.22249147295951843, + "learning_rate": 1.3204769465675709e-05, + "loss": 1.8067, + "step": 25099 + }, + { + "epoch": 7.704112952731737, + "grad_norm": 0.16225707530975342, + "learning_rate": 1.3201404154848885e-05, + "loss": 1.6715, + "step": 25100 + }, + { + "epoch": 7.704419889502763, + "grad_norm": 0.19165070354938507, + "learning_rate": 1.3198039207687352e-05, + "loss": 1.7233, + "step": 25101 + }, + { + "epoch": 7.704726826273788, + 
"grad_norm": 0.18720564246177673, + "learning_rate": 1.3194674624224368e-05, + "loss": 1.7129, + "step": 25102 + }, + { + "epoch": 7.7050337630448125, + "grad_norm": 0.16703814268112183, + "learning_rate": 1.3191310404493163e-05, + "loss": 1.7314, + "step": 25103 + }, + { + "epoch": 7.705340699815838, + "grad_norm": 0.20206168293952942, + "learning_rate": 1.3187946548527036e-05, + "loss": 1.7278, + "step": 25104 + }, + { + "epoch": 7.705647636586863, + "grad_norm": 0.1774030476808548, + "learning_rate": 1.3184583056359163e-05, + "loss": 1.6986, + "step": 25105 + }, + { + "epoch": 7.7059545733578885, + "grad_norm": 0.1729336827993393, + "learning_rate": 1.3181219928022853e-05, + "loss": 1.7251, + "step": 25106 + }, + { + "epoch": 7.706261510128914, + "grad_norm": 0.23351258039474487, + "learning_rate": 1.3177857163551276e-05, + "loss": 1.7311, + "step": 25107 + }, + { + "epoch": 7.706568446899938, + "grad_norm": 0.2041054517030716, + "learning_rate": 1.3174494762977713e-05, + "loss": 1.7122, + "step": 25108 + }, + { + "epoch": 7.706875383670964, + "grad_norm": 0.178013876080513, + "learning_rate": 1.3171132726335373e-05, + "loss": 1.7255, + "step": 25109 + }, + { + "epoch": 7.707182320441989, + "grad_norm": 0.19265221059322357, + "learning_rate": 1.3167771053657491e-05, + "loss": 1.6747, + "step": 25110 + }, + { + "epoch": 7.707489257213014, + "grad_norm": 0.18968601524829865, + "learning_rate": 1.3164409744977297e-05, + "loss": 1.71, + "step": 25111 + }, + { + "epoch": 7.70779619398404, + "grad_norm": 0.17041562497615814, + "learning_rate": 1.3161048800327963e-05, + "loss": 1.7202, + "step": 25112 + }, + { + "epoch": 7.708103130755065, + "grad_norm": 0.20094618201255798, + "learning_rate": 1.3157688219742754e-05, + "loss": 1.7375, + "step": 25113 + }, + { + "epoch": 7.708410067526089, + "grad_norm": 0.14012686908245087, + "learning_rate": 1.3154328003254862e-05, + "loss": 1.6426, + "step": 25114 + }, + { + "epoch": 7.708717004297115, + "grad_norm": 
0.18826791644096375, + "learning_rate": 1.3150968150897497e-05, + "loss": 1.7114, + "step": 25115 + }, + { + "epoch": 7.70902394106814, + "grad_norm": 0.15521864593029022, + "learning_rate": 1.3147608662703864e-05, + "loss": 1.7031, + "step": 25116 + }, + { + "epoch": 7.709330877839165, + "grad_norm": 0.19424815475940704, + "learning_rate": 1.314424953870716e-05, + "loss": 1.6815, + "step": 25117 + }, + { + "epoch": 7.70963781461019, + "grad_norm": 0.30089494585990906, + "learning_rate": 1.3140890778940584e-05, + "loss": 1.7444, + "step": 25118 + }, + { + "epoch": 7.709944751381215, + "grad_norm": 0.1784239560365677, + "learning_rate": 1.3137532383437334e-05, + "loss": 1.6659, + "step": 25119 + }, + { + "epoch": 7.7102516881522405, + "grad_norm": 0.18670935928821564, + "learning_rate": 1.3134174352230571e-05, + "loss": 1.7007, + "step": 25120 + }, + { + "epoch": 7.710558624923266, + "grad_norm": 0.21140475571155548, + "learning_rate": 1.3130816685353541e-05, + "loss": 1.7716, + "step": 25121 + }, + { + "epoch": 7.710865561694291, + "grad_norm": 0.20546187460422516, + "learning_rate": 1.3127459382839363e-05, + "loss": 1.6434, + "step": 25122 + }, + { + "epoch": 7.7111724984653165, + "grad_norm": 0.15188902616500854, + "learning_rate": 1.312410244472127e-05, + "loss": 1.6843, + "step": 25123 + }, + { + "epoch": 7.711479435236341, + "grad_norm": 0.2020019143819809, + "learning_rate": 1.3120745871032375e-05, + "loss": 1.6846, + "step": 25124 + }, + { + "epoch": 7.711786372007366, + "grad_norm": 0.19839881360530853, + "learning_rate": 1.3117389661805907e-05, + "loss": 1.7026, + "step": 25125 + }, + { + "epoch": 7.712093308778392, + "grad_norm": 0.19400818645954132, + "learning_rate": 1.311403381707501e-05, + "loss": 1.705, + "step": 25126 + }, + { + "epoch": 7.712400245549417, + "grad_norm": 0.21366959810256958, + "learning_rate": 1.311067833687285e-05, + "loss": 1.7184, + "step": 25127 + }, + { + "epoch": 7.712707182320442, + "grad_norm": 0.17402227222919464, + 
"learning_rate": 1.3107323221232604e-05, + "loss": 1.6613, + "step": 25128 + }, + { + "epoch": 7.713014119091467, + "grad_norm": 0.24356254935264587, + "learning_rate": 1.3103968470187384e-05, + "loss": 1.7343, + "step": 25129 + }, + { + "epoch": 7.713321055862492, + "grad_norm": 0.18612951040267944, + "learning_rate": 1.3100614083770386e-05, + "loss": 1.7298, + "step": 25130 + }, + { + "epoch": 7.713627992633517, + "grad_norm": 0.27073535323143005, + "learning_rate": 1.3097260062014743e-05, + "loss": 1.7554, + "step": 25131 + }, + { + "epoch": 7.713934929404543, + "grad_norm": 0.1498921662569046, + "learning_rate": 1.309390640495361e-05, + "loss": 1.6506, + "step": 25132 + }, + { + "epoch": 7.714241866175568, + "grad_norm": 0.2159748524427414, + "learning_rate": 1.309055311262013e-05, + "loss": 1.6549, + "step": 25133 + }, + { + "epoch": 7.714548802946593, + "grad_norm": 0.2060365229845047, + "learning_rate": 1.3087200185047433e-05, + "loss": 1.7224, + "step": 25134 + }, + { + "epoch": 7.714855739717618, + "grad_norm": 0.22525639832019806, + "learning_rate": 1.3083847622268659e-05, + "loss": 1.7508, + "step": 25135 + }, + { + "epoch": 7.715162676488643, + "grad_norm": 0.20023567974567413, + "learning_rate": 1.3080495424316936e-05, + "loss": 1.7277, + "step": 25136 + }, + { + "epoch": 7.7154696132596685, + "grad_norm": 0.19702760875225067, + "learning_rate": 1.3077143591225389e-05, + "loss": 1.7291, + "step": 25137 + }, + { + "epoch": 7.715776550030694, + "grad_norm": 0.1713123917579651, + "learning_rate": 1.3073792123027173e-05, + "loss": 1.689, + "step": 25138 + }, + { + "epoch": 7.716083486801719, + "grad_norm": 0.17696695029735565, + "learning_rate": 1.3070441019755358e-05, + "loss": 1.6816, + "step": 25139 + }, + { + "epoch": 7.716390423572744, + "grad_norm": 0.1802004724740982, + "learning_rate": 1.3067090281443122e-05, + "loss": 1.754, + "step": 25140 + }, + { + "epoch": 7.716697360343769, + "grad_norm": 0.1829070895910263, + "learning_rate": 
1.3063739908123518e-05, + "loss": 1.7389, + "step": 25141 + }, + { + "epoch": 7.717004297114794, + "grad_norm": 0.16842049360275269, + "learning_rate": 1.30603898998297e-05, + "loss": 1.7257, + "step": 25142 + }, + { + "epoch": 7.71731123388582, + "grad_norm": 0.18215791881084442, + "learning_rate": 1.305704025659476e-05, + "loss": 1.6765, + "step": 25143 + }, + { + "epoch": 7.717618170656845, + "grad_norm": 0.16992273926734924, + "learning_rate": 1.3053690978451799e-05, + "loss": 1.6729, + "step": 25144 + }, + { + "epoch": 7.71792510742787, + "grad_norm": 0.1847899854183197, + "learning_rate": 1.3050342065433935e-05, + "loss": 1.6972, + "step": 25145 + }, + { + "epoch": 7.718232044198895, + "grad_norm": 0.18730273842811584, + "learning_rate": 1.3046993517574219e-05, + "loss": 1.6996, + "step": 25146 + }, + { + "epoch": 7.71853898096992, + "grad_norm": 0.1695355772972107, + "learning_rate": 1.304364533490578e-05, + "loss": 1.7581, + "step": 25147 + }, + { + "epoch": 7.718845917740945, + "grad_norm": 0.17106328904628754, + "learning_rate": 1.3040297517461709e-05, + "loss": 1.6479, + "step": 25148 + }, + { + "epoch": 7.719152854511971, + "grad_norm": 0.1726374626159668, + "learning_rate": 1.3036950065275072e-05, + "loss": 1.7078, + "step": 25149 + }, + { + "epoch": 7.719459791282996, + "grad_norm": 0.21725010871887207, + "learning_rate": 1.3033602978378962e-05, + "loss": 1.8195, + "step": 25150 + }, + { + "epoch": 7.7197667280540205, + "grad_norm": 0.24786241352558136, + "learning_rate": 1.3030256256806455e-05, + "loss": 1.7439, + "step": 25151 + }, + { + "epoch": 7.720073664825046, + "grad_norm": 0.16550323367118835, + "learning_rate": 1.3026909900590622e-05, + "loss": 1.7267, + "step": 25152 + }, + { + "epoch": 7.720380601596071, + "grad_norm": 0.1833605021238327, + "learning_rate": 1.3023563909764542e-05, + "loss": 1.6675, + "step": 25153 + }, + { + "epoch": 7.7206875383670965, + "grad_norm": 0.16360491514205933, + "learning_rate": 1.3020218284361268e-05, + 
"loss": 1.684, + "step": 25154 + }, + { + "epoch": 7.720994475138122, + "grad_norm": 0.20423299074172974, + "learning_rate": 1.3016873024413878e-05, + "loss": 1.708, + "step": 25155 + }, + { + "epoch": 7.721301411909147, + "grad_norm": 0.1743123084306717, + "learning_rate": 1.301352812995541e-05, + "loss": 1.7497, + "step": 25156 + }, + { + "epoch": 7.721608348680172, + "grad_norm": 0.237883523106575, + "learning_rate": 1.301018360101896e-05, + "loss": 1.6859, + "step": 25157 + }, + { + "epoch": 7.721915285451197, + "grad_norm": 0.17953886091709137, + "learning_rate": 1.300683943763753e-05, + "loss": 1.6948, + "step": 25158 + }, + { + "epoch": 7.722222222222222, + "grad_norm": 0.19036953151226044, + "learning_rate": 1.3003495639844209e-05, + "loss": 1.7207, + "step": 25159 + }, + { + "epoch": 7.722529158993248, + "grad_norm": 0.17385275661945343, + "learning_rate": 1.3000152207672028e-05, + "loss": 1.7088, + "step": 25160 + }, + { + "epoch": 7.722836095764272, + "grad_norm": 0.1848379373550415, + "learning_rate": 1.2996809141154031e-05, + "loss": 1.7351, + "step": 25161 + }, + { + "epoch": 7.723143032535297, + "grad_norm": 0.1964390128850937, + "learning_rate": 1.2993466440323271e-05, + "loss": 1.7243, + "step": 25162 + }, + { + "epoch": 7.723449969306323, + "grad_norm": 0.23729266226291656, + "learning_rate": 1.299012410521273e-05, + "loss": 1.7588, + "step": 25163 + }, + { + "epoch": 7.723756906077348, + "grad_norm": 0.16980098187923431, + "learning_rate": 1.2986782135855496e-05, + "loss": 1.7092, + "step": 25164 + }, + { + "epoch": 7.724063842848373, + "grad_norm": 0.1993054747581482, + "learning_rate": 1.2983440532284568e-05, + "loss": 1.7245, + "step": 25165 + }, + { + "epoch": 7.724370779619399, + "grad_norm": 0.18817138671875, + "learning_rate": 1.2980099294532982e-05, + "loss": 1.7019, + "step": 25166 + }, + { + "epoch": 7.724677716390423, + "grad_norm": 0.20675966143608093, + "learning_rate": 1.297675842263375e-05, + "loss": 1.6949, + "step": 25167 + }, + 
{ + "epoch": 7.7249846531614486, + "grad_norm": 0.21214626729488373, + "learning_rate": 1.2973417916619895e-05, + "loss": 1.7056, + "step": 25168 + }, + { + "epoch": 7.725291589932474, + "grad_norm": 0.1676976978778839, + "learning_rate": 1.2970077776524426e-05, + "loss": 1.7183, + "step": 25169 + }, + { + "epoch": 7.725598526703499, + "grad_norm": 0.2368413507938385, + "learning_rate": 1.2966738002380347e-05, + "loss": 1.7868, + "step": 25170 + }, + { + "epoch": 7.725905463474525, + "grad_norm": 0.22054153680801392, + "learning_rate": 1.2963398594220672e-05, + "loss": 1.7214, + "step": 25171 + }, + { + "epoch": 7.726212400245549, + "grad_norm": 0.20026426017284393, + "learning_rate": 1.2960059552078402e-05, + "loss": 1.7703, + "step": 25172 + }, + { + "epoch": 7.726519337016574, + "grad_norm": 0.1900193840265274, + "learning_rate": 1.2956720875986516e-05, + "loss": 1.7513, + "step": 25173 + }, + { + "epoch": 7.7268262737876, + "grad_norm": 0.17151880264282227, + "learning_rate": 1.2953382565978057e-05, + "loss": 1.7382, + "step": 25174 + }, + { + "epoch": 7.727133210558625, + "grad_norm": 0.2654723525047302, + "learning_rate": 1.2950044622085955e-05, + "loss": 1.7526, + "step": 25175 + }, + { + "epoch": 7.72744014732965, + "grad_norm": 0.19927532970905304, + "learning_rate": 1.2946707044343259e-05, + "loss": 1.7208, + "step": 25176 + }, + { + "epoch": 7.727747084100676, + "grad_norm": 0.3037160038948059, + "learning_rate": 1.2943369832782887e-05, + "loss": 1.8081, + "step": 25177 + }, + { + "epoch": 7.7280540208717, + "grad_norm": 0.20067723095417023, + "learning_rate": 1.2940032987437873e-05, + "loss": 1.685, + "step": 25178 + }, + { + "epoch": 7.7283609576427255, + "grad_norm": 0.16820429265499115, + "learning_rate": 1.2936696508341189e-05, + "loss": 1.7328, + "step": 25179 + }, + { + "epoch": 7.728667894413751, + "grad_norm": 0.15474672615528107, + "learning_rate": 1.2933360395525763e-05, + "loss": 1.708, + "step": 25180 + }, + { + "epoch": 7.728974831184776, + 
"grad_norm": 0.17825615406036377, + "learning_rate": 1.2930024649024609e-05, + "loss": 1.7416, + "step": 25181 + }, + { + "epoch": 7.7292817679558015, + "grad_norm": 0.20498061180114746, + "learning_rate": 1.292668926887068e-05, + "loss": 1.736, + "step": 25182 + }, + { + "epoch": 7.729588704726826, + "grad_norm": 0.22965869307518005, + "learning_rate": 1.2923354255096937e-05, + "loss": 1.7167, + "step": 25183 + }, + { + "epoch": 7.729895641497851, + "grad_norm": 0.1687164008617401, + "learning_rate": 1.2920019607736338e-05, + "loss": 1.6988, + "step": 25184 + }, + { + "epoch": 7.730202578268877, + "grad_norm": 0.18255390226840973, + "learning_rate": 1.2916685326821842e-05, + "loss": 1.6891, + "step": 25185 + }, + { + "epoch": 7.730509515039902, + "grad_norm": 0.1519697606563568, + "learning_rate": 1.2913351412386393e-05, + "loss": 1.6553, + "step": 25186 + }, + { + "epoch": 7.730816451810927, + "grad_norm": 0.19137845933437347, + "learning_rate": 1.2910017864462942e-05, + "loss": 1.7246, + "step": 25187 + }, + { + "epoch": 7.731123388581953, + "grad_norm": 0.19998718798160553, + "learning_rate": 1.2906684683084436e-05, + "loss": 1.7324, + "step": 25188 + }, + { + "epoch": 7.731430325352977, + "grad_norm": 0.18066956102848053, + "learning_rate": 1.2903351868283808e-05, + "loss": 1.7299, + "step": 25189 + }, + { + "epoch": 7.731737262124002, + "grad_norm": 0.18489640951156616, + "learning_rate": 1.290001942009399e-05, + "loss": 1.7249, + "step": 25190 + }, + { + "epoch": 7.732044198895028, + "grad_norm": 0.14994095265865326, + "learning_rate": 1.2896687338547958e-05, + "loss": 1.6466, + "step": 25191 + }, + { + "epoch": 7.732351135666053, + "grad_norm": 0.19937917590141296, + "learning_rate": 1.2893355623678571e-05, + "loss": 1.7298, + "step": 25192 + }, + { + "epoch": 7.7326580724370775, + "grad_norm": 0.1435725837945938, + "learning_rate": 1.2890024275518826e-05, + "loss": 1.7384, + "step": 25193 + }, + { + "epoch": 7.732965009208103, + "grad_norm": 
0.23283594846725464, + "learning_rate": 1.2886693294101582e-05, + "loss": 1.7765, + "step": 25194 + }, + { + "epoch": 7.733271945979128, + "grad_norm": 0.15489891171455383, + "learning_rate": 1.2883362679459803e-05, + "loss": 1.6911, + "step": 25195 + }, + { + "epoch": 7.7335788827501535, + "grad_norm": 0.17880970239639282, + "learning_rate": 1.2880032431626404e-05, + "loss": 1.6557, + "step": 25196 + }, + { + "epoch": 7.733885819521179, + "grad_norm": 0.1717783808708191, + "learning_rate": 1.287670255063425e-05, + "loss": 1.7112, + "step": 25197 + }, + { + "epoch": 7.734192756292204, + "grad_norm": 0.17371709644794464, + "learning_rate": 1.2873373036516313e-05, + "loss": 1.7591, + "step": 25198 + }, + { + "epoch": 7.734499693063229, + "grad_norm": 0.15894445776939392, + "learning_rate": 1.2870043889305432e-05, + "loss": 1.6615, + "step": 25199 + }, + { + "epoch": 7.734806629834254, + "grad_norm": 0.17047199606895447, + "learning_rate": 1.2866715109034554e-05, + "loss": 1.7376, + "step": 25200 + }, + { + "epoch": 7.735113566605279, + "grad_norm": 0.17434459924697876, + "learning_rate": 1.2863386695736562e-05, + "loss": 1.6871, + "step": 25201 + }, + { + "epoch": 7.735420503376305, + "grad_norm": 0.18515460193157196, + "learning_rate": 1.2860058649444351e-05, + "loss": 1.7475, + "step": 25202 + }, + { + "epoch": 7.73572744014733, + "grad_norm": 0.1510036140680313, + "learning_rate": 1.2856730970190806e-05, + "loss": 1.7101, + "step": 25203 + }, + { + "epoch": 7.736034376918354, + "grad_norm": 0.1886061728000641, + "learning_rate": 1.2853403658008817e-05, + "loss": 1.7253, + "step": 25204 + }, + { + "epoch": 7.73634131368938, + "grad_norm": 0.15830372273921967, + "learning_rate": 1.2850076712931269e-05, + "loss": 1.7024, + "step": 25205 + }, + { + "epoch": 7.736648250460405, + "grad_norm": 0.3030432462692261, + "learning_rate": 1.2846750134991031e-05, + "loss": 1.7702, + "step": 25206 + }, + { + "epoch": 7.73695518723143, + "grad_norm": 0.1946970373392105, + 
"learning_rate": 1.2843423924220977e-05, + "loss": 1.7199, + "step": 25207 + }, + { + "epoch": 7.737262124002456, + "grad_norm": 0.19842801988124847, + "learning_rate": 1.2840098080654012e-05, + "loss": 1.7435, + "step": 25208 + }, + { + "epoch": 7.737569060773481, + "grad_norm": 0.17269715666770935, + "learning_rate": 1.2836772604322945e-05, + "loss": 1.6837, + "step": 25209 + }, + { + "epoch": 7.7378759975445055, + "grad_norm": 0.14366893470287323, + "learning_rate": 1.2833447495260703e-05, + "loss": 1.6453, + "step": 25210 + }, + { + "epoch": 7.738182934315531, + "grad_norm": 0.2189856618642807, + "learning_rate": 1.283012275350009e-05, + "loss": 1.7341, + "step": 25211 + }, + { + "epoch": 7.738489871086556, + "grad_norm": 0.14334678649902344, + "learning_rate": 1.2826798379074007e-05, + "loss": 1.6505, + "step": 25212 + }, + { + "epoch": 7.7387968078575815, + "grad_norm": 0.2020469605922699, + "learning_rate": 1.2823474372015304e-05, + "loss": 1.7915, + "step": 25213 + }, + { + "epoch": 7.739103744628607, + "grad_norm": 0.14702250063419342, + "learning_rate": 1.2820150732356783e-05, + "loss": 1.6682, + "step": 25214 + }, + { + "epoch": 7.739410681399631, + "grad_norm": 0.2310563623905182, + "learning_rate": 1.281682746013136e-05, + "loss": 1.7447, + "step": 25215 + }, + { + "epoch": 7.739717618170657, + "grad_norm": 0.16534216701984406, + "learning_rate": 1.2813504555371808e-05, + "loss": 1.6641, + "step": 25216 + }, + { + "epoch": 7.740024554941682, + "grad_norm": 0.1390565037727356, + "learning_rate": 1.2810182018111012e-05, + "loss": 1.6912, + "step": 25217 + }, + { + "epoch": 7.740331491712707, + "grad_norm": 0.16568928956985474, + "learning_rate": 1.2806859848381797e-05, + "loss": 1.7375, + "step": 25218 + }, + { + "epoch": 7.740638428483733, + "grad_norm": 0.18870174884796143, + "learning_rate": 1.2803538046216995e-05, + "loss": 1.7158, + "step": 25219 + }, + { + "epoch": 7.740945365254758, + "grad_norm": 0.18347607553005219, + "learning_rate": 
1.2800216611649429e-05, + "loss": 1.7766, + "step": 25220 + }, + { + "epoch": 7.741252302025782, + "grad_norm": 0.21285377442836761, + "learning_rate": 1.2796895544711929e-05, + "loss": 1.6876, + "step": 25221 + }, + { + "epoch": 7.741559238796808, + "grad_norm": 0.26524603366851807, + "learning_rate": 1.2793574845437311e-05, + "loss": 1.6679, + "step": 25222 + }, + { + "epoch": 7.741866175567833, + "grad_norm": 0.1671147346496582, + "learning_rate": 1.2790254513858397e-05, + "loss": 1.6853, + "step": 25223 + }, + { + "epoch": 7.742173112338858, + "grad_norm": 0.21713866293430328, + "learning_rate": 1.2786934550007979e-05, + "loss": 1.8124, + "step": 25224 + }, + { + "epoch": 7.742480049109884, + "grad_norm": 0.17161360383033752, + "learning_rate": 1.2783614953918916e-05, + "loss": 1.6862, + "step": 25225 + }, + { + "epoch": 7.742786985880908, + "grad_norm": 0.1513087898492813, + "learning_rate": 1.2780295725623947e-05, + "loss": 1.6644, + "step": 25226 + }, + { + "epoch": 7.7430939226519335, + "grad_norm": 0.13013005256652832, + "learning_rate": 1.2776976865155948e-05, + "loss": 1.6612, + "step": 25227 + }, + { + "epoch": 7.743400859422959, + "grad_norm": 0.15204063057899475, + "learning_rate": 1.2773658372547648e-05, + "loss": 1.6391, + "step": 25228 + }, + { + "epoch": 7.743707796193984, + "grad_norm": 0.15421196818351746, + "learning_rate": 1.2770340247831891e-05, + "loss": 1.7005, + "step": 25229 + }, + { + "epoch": 7.7440147329650095, + "grad_norm": 0.14045587182044983, + "learning_rate": 1.276702249104147e-05, + "loss": 1.6448, + "step": 25230 + }, + { + "epoch": 7.744321669736035, + "grad_norm": 0.17244049906730652, + "learning_rate": 1.2763705102209123e-05, + "loss": 1.6737, + "step": 25231 + }, + { + "epoch": 7.744628606507059, + "grad_norm": 0.16891124844551086, + "learning_rate": 1.2760388081367697e-05, + "loss": 1.6625, + "step": 25232 + }, + { + "epoch": 7.744935543278085, + "grad_norm": 0.18271134793758392, + "learning_rate": 1.275707142854991e-05, + 
"loss": 1.6963, + "step": 25233 + }, + { + "epoch": 7.74524248004911, + "grad_norm": 0.18582625687122345, + "learning_rate": 1.2753755143788593e-05, + "loss": 1.6731, + "step": 25234 + }, + { + "epoch": 7.745549416820135, + "grad_norm": 0.17610707879066467, + "learning_rate": 1.2750439227116495e-05, + "loss": 1.6976, + "step": 25235 + }, + { + "epoch": 7.74585635359116, + "grad_norm": 0.20406337082386017, + "learning_rate": 1.2747123678566391e-05, + "loss": 1.7287, + "step": 25236 + }, + { + "epoch": 7.746163290362185, + "grad_norm": 0.16879913210868835, + "learning_rate": 1.2743808498171046e-05, + "loss": 1.6594, + "step": 25237 + }, + { + "epoch": 7.74647022713321, + "grad_norm": 0.1405191272497177, + "learning_rate": 1.2740493685963217e-05, + "loss": 1.6565, + "step": 25238 + }, + { + "epoch": 7.746777163904236, + "grad_norm": 0.1460784375667572, + "learning_rate": 1.2737179241975671e-05, + "loss": 1.6336, + "step": 25239 + }, + { + "epoch": 7.747084100675261, + "grad_norm": 0.16206084191799164, + "learning_rate": 1.273386516624116e-05, + "loss": 1.7501, + "step": 25240 + }, + { + "epoch": 7.747391037446286, + "grad_norm": 0.17040394246578217, + "learning_rate": 1.2730551458792422e-05, + "loss": 1.7532, + "step": 25241 + }, + { + "epoch": 7.747697974217311, + "grad_norm": 0.15487439930438995, + "learning_rate": 1.2727238119662243e-05, + "loss": 1.6757, + "step": 25242 + }, + { + "epoch": 7.748004910988336, + "grad_norm": 0.139495387673378, + "learning_rate": 1.272392514888332e-05, + "loss": 1.6431, + "step": 25243 + }, + { + "epoch": 7.7483118477593615, + "grad_norm": 0.16329489648342133, + "learning_rate": 1.2720612546488447e-05, + "loss": 1.7353, + "step": 25244 + }, + { + "epoch": 7.748618784530387, + "grad_norm": 0.14997398853302002, + "learning_rate": 1.27173003125103e-05, + "loss": 1.6977, + "step": 25245 + }, + { + "epoch": 7.748925721301412, + "grad_norm": 0.2005717009305954, + "learning_rate": 1.2713988446981656e-05, + "loss": 1.757, + "step": 25246 + 
}, + { + "epoch": 7.749232658072437, + "grad_norm": 0.2027040272951126, + "learning_rate": 1.2710676949935246e-05, + "loss": 1.7506, + "step": 25247 + }, + { + "epoch": 7.749539594843462, + "grad_norm": 0.18176981806755066, + "learning_rate": 1.2707365821403755e-05, + "loss": 1.7132, + "step": 25248 + }, + { + "epoch": 7.749846531614487, + "grad_norm": 0.18690772354602814, + "learning_rate": 1.2704055061419961e-05, + "loss": 1.7725, + "step": 25249 + }, + { + "epoch": 7.750153468385513, + "grad_norm": 0.18360945582389832, + "learning_rate": 1.270074467001653e-05, + "loss": 1.6779, + "step": 25250 + }, + { + "epoch": 7.750460405156538, + "grad_norm": 0.18498149514198303, + "learning_rate": 1.269743464722621e-05, + "loss": 1.7105, + "step": 25251 + }, + { + "epoch": 7.750767341927563, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.2694124993081707e-05, + "loss": 1.7273, + "step": 25252 + }, + { + "epoch": 7.751074278698588, + "grad_norm": 0.17312094569206238, + "learning_rate": 1.2690815707615727e-05, + "loss": 1.7532, + "step": 25253 + }, + { + "epoch": 7.751381215469613, + "grad_norm": 0.18758632242679596, + "learning_rate": 1.2687506790860976e-05, + "loss": 1.7394, + "step": 25254 + }, + { + "epoch": 7.7516881522406385, + "grad_norm": 0.1642044633626938, + "learning_rate": 1.2684198242850149e-05, + "loss": 1.6699, + "step": 25255 + }, + { + "epoch": 7.751995089011664, + "grad_norm": 0.34566664695739746, + "learning_rate": 1.2680890063615947e-05, + "loss": 1.7048, + "step": 25256 + }, + { + "epoch": 7.752302025782689, + "grad_norm": 0.15046556293964386, + "learning_rate": 1.2677582253191066e-05, + "loss": 1.659, + "step": 25257 + }, + { + "epoch": 7.752608962553714, + "grad_norm": 0.1504966914653778, + "learning_rate": 1.2674274811608171e-05, + "loss": 1.6841, + "step": 25258 + }, + { + "epoch": 7.752915899324739, + "grad_norm": 0.2226656973361969, + "learning_rate": 1.2670967738900009e-05, + "loss": 1.7139, + "step": 25259 + }, + { + "epoch": 
7.753222836095764, + "grad_norm": 0.18797673285007477, + "learning_rate": 1.2667661035099188e-05, + "loss": 1.7726, + "step": 25260 + }, + { + "epoch": 7.75352977286679, + "grad_norm": 0.15428531169891357, + "learning_rate": 1.266435470023845e-05, + "loss": 1.6831, + "step": 25261 + }, + { + "epoch": 7.753836709637815, + "grad_norm": 0.20027057826519012, + "learning_rate": 1.2661048734350412e-05, + "loss": 1.741, + "step": 25262 + }, + { + "epoch": 7.75414364640884, + "grad_norm": 0.14779487252235413, + "learning_rate": 1.2657743137467793e-05, + "loss": 1.6974, + "step": 25263 + }, + { + "epoch": 7.754450583179865, + "grad_norm": 0.17618241906166077, + "learning_rate": 1.2654437909623258e-05, + "loss": 1.7374, + "step": 25264 + }, + { + "epoch": 7.75475751995089, + "grad_norm": 0.18769881129264832, + "learning_rate": 1.2651133050849423e-05, + "loss": 1.7241, + "step": 25265 + }, + { + "epoch": 7.755064456721915, + "grad_norm": 0.18645870685577393, + "learning_rate": 1.2647828561179015e-05, + "loss": 1.7176, + "step": 25266 + }, + { + "epoch": 7.755371393492941, + "grad_norm": 0.17507290840148926, + "learning_rate": 1.2644524440644628e-05, + "loss": 1.6994, + "step": 25267 + }, + { + "epoch": 7.755678330263965, + "grad_norm": 0.15264524519443512, + "learning_rate": 1.264122068927896e-05, + "loss": 1.6993, + "step": 25268 + }, + { + "epoch": 7.7559852670349905, + "grad_norm": 0.1749732941389084, + "learning_rate": 1.263791730711465e-05, + "loss": 1.7265, + "step": 25269 + }, + { + "epoch": 7.756292203806016, + "grad_norm": 0.15777049958705902, + "learning_rate": 1.2634614294184332e-05, + "loss": 1.6219, + "step": 25270 + }, + { + "epoch": 7.756599140577041, + "grad_norm": 0.17740310728549957, + "learning_rate": 1.263131165052066e-05, + "loss": 1.7373, + "step": 25271 + }, + { + "epoch": 7.7569060773480665, + "grad_norm": 0.22577044367790222, + "learning_rate": 1.262800937615627e-05, + "loss": 1.7492, + "step": 25272 + }, + { + "epoch": 7.757213014119092, + 
"grad_norm": 0.155413419008255, + "learning_rate": 1.2624707471123791e-05, + "loss": 1.7037, + "step": 25273 + }, + { + "epoch": 7.757519950890116, + "grad_norm": 0.1755802482366562, + "learning_rate": 1.2621405935455866e-05, + "loss": 1.7057, + "step": 25274 + }, + { + "epoch": 7.757826887661142, + "grad_norm": 0.15870101749897003, + "learning_rate": 1.2618104769185096e-05, + "loss": 1.6951, + "step": 25275 + }, + { + "epoch": 7.758133824432167, + "grad_norm": 0.18285419046878815, + "learning_rate": 1.2614803972344158e-05, + "loss": 1.7443, + "step": 25276 + }, + { + "epoch": 7.758440761203192, + "grad_norm": 0.1669059544801712, + "learning_rate": 1.2611503544965609e-05, + "loss": 1.6442, + "step": 25277 + }, + { + "epoch": 7.758747697974218, + "grad_norm": 0.17830590903759003, + "learning_rate": 1.2608203487082121e-05, + "loss": 1.7432, + "step": 25278 + }, + { + "epoch": 7.759054634745242, + "grad_norm": 0.18318989872932434, + "learning_rate": 1.2604903798726259e-05, + "loss": 1.7128, + "step": 25279 + }, + { + "epoch": 7.759361571516267, + "grad_norm": 0.17735294997692108, + "learning_rate": 1.2601604479930663e-05, + "loss": 1.6719, + "step": 25280 + }, + { + "epoch": 7.759668508287293, + "grad_norm": 0.14324752986431122, + "learning_rate": 1.2598305530727949e-05, + "loss": 1.688, + "step": 25281 + }, + { + "epoch": 7.759975445058318, + "grad_norm": 0.17677859961986542, + "learning_rate": 1.2595006951150678e-05, + "loss": 1.7016, + "step": 25282 + }, + { + "epoch": 7.760282381829343, + "grad_norm": 0.16832831501960754, + "learning_rate": 1.2591708741231495e-05, + "loss": 1.6669, + "step": 25283 + }, + { + "epoch": 7.760589318600369, + "grad_norm": 0.20717547833919525, + "learning_rate": 1.2588410901002944e-05, + "loss": 1.7275, + "step": 25284 + }, + { + "epoch": 7.760896255371393, + "grad_norm": 0.2471853792667389, + "learning_rate": 1.2585113430497658e-05, + "loss": 1.779, + "step": 25285 + }, + { + "epoch": 7.7612031921424185, + "grad_norm": 
0.2646878957748413, + "learning_rate": 1.2581816329748214e-05, + "loss": 1.8003, + "step": 25286 + }, + { + "epoch": 7.761510128913444, + "grad_norm": 0.2102949321269989, + "learning_rate": 1.2578519598787191e-05, + "loss": 1.764, + "step": 25287 + }, + { + "epoch": 7.761817065684469, + "grad_norm": 0.16151423752307892, + "learning_rate": 1.2575223237647171e-05, + "loss": 1.7233, + "step": 25288 + }, + { + "epoch": 7.7621240024554945, + "grad_norm": 0.22221817076206207, + "learning_rate": 1.2571927246360727e-05, + "loss": 1.7485, + "step": 25289 + }, + { + "epoch": 7.762430939226519, + "grad_norm": 0.16470851004123688, + "learning_rate": 1.2568631624960441e-05, + "loss": 1.6844, + "step": 25290 + }, + { + "epoch": 7.762737875997544, + "grad_norm": 0.17529261112213135, + "learning_rate": 1.256533637347887e-05, + "loss": 1.7409, + "step": 25291 + }, + { + "epoch": 7.76304481276857, + "grad_norm": 0.19055718183517456, + "learning_rate": 1.2562041491948579e-05, + "loss": 1.6861, + "step": 25292 + }, + { + "epoch": 7.763351749539595, + "grad_norm": 0.19183041155338287, + "learning_rate": 1.2558746980402159e-05, + "loss": 1.7493, + "step": 25293 + }, + { + "epoch": 7.76365868631062, + "grad_norm": 0.20031596720218658, + "learning_rate": 1.2555452838872123e-05, + "loss": 1.705, + "step": 25294 + }, + { + "epoch": 7.763965623081646, + "grad_norm": 0.16234149038791656, + "learning_rate": 1.2552159067391072e-05, + "loss": 1.7407, + "step": 25295 + }, + { + "epoch": 7.76427255985267, + "grad_norm": 0.15412569046020508, + "learning_rate": 1.254886566599151e-05, + "loss": 1.6599, + "step": 25296 + }, + { + "epoch": 7.764579496623695, + "grad_norm": 0.17393885552883148, + "learning_rate": 1.2545572634706022e-05, + "loss": 1.7372, + "step": 25297 + }, + { + "epoch": 7.764886433394721, + "grad_norm": 0.18662036955356598, + "learning_rate": 1.254227997356715e-05, + "loss": 1.7681, + "step": 25298 + }, + { + "epoch": 7.765193370165746, + "grad_norm": 0.16661690175533295, + 
"learning_rate": 1.2538987682607395e-05, + "loss": 1.754, + "step": 25299 + }, + { + "epoch": 7.765500306936771, + "grad_norm": 0.21453191339969635, + "learning_rate": 1.253569576185935e-05, + "loss": 1.7802, + "step": 25300 + }, + { + "epoch": 7.765807243707796, + "grad_norm": 0.14639903604984283, + "learning_rate": 1.2532404211355486e-05, + "loss": 1.6478, + "step": 25301 + }, + { + "epoch": 7.766114180478821, + "grad_norm": 0.17430682480335236, + "learning_rate": 1.2529113031128382e-05, + "loss": 1.687, + "step": 25302 + }, + { + "epoch": 7.7664211172498465, + "grad_norm": 0.21582552790641785, + "learning_rate": 1.2525822221210543e-05, + "loss": 1.7723, + "step": 25303 + }, + { + "epoch": 7.766728054020872, + "grad_norm": 0.21142803132534027, + "learning_rate": 1.2522531781634495e-05, + "loss": 1.7986, + "step": 25304 + }, + { + "epoch": 7.767034990791897, + "grad_norm": 0.1637791097164154, + "learning_rate": 1.251924171243275e-05, + "loss": 1.6884, + "step": 25305 + }, + { + "epoch": 7.7673419275629225, + "grad_norm": 0.19218359887599945, + "learning_rate": 1.2515952013637832e-05, + "loss": 1.7972, + "step": 25306 + }, + { + "epoch": 7.767648864333947, + "grad_norm": 0.14534975588321686, + "learning_rate": 1.2512662685282245e-05, + "loss": 1.6602, + "step": 25307 + }, + { + "epoch": 7.767955801104972, + "grad_norm": 0.2955080568790436, + "learning_rate": 1.2509373727398494e-05, + "loss": 1.763, + "step": 25308 + }, + { + "epoch": 7.768262737875998, + "grad_norm": 0.17220059037208557, + "learning_rate": 1.2506085140019086e-05, + "loss": 1.672, + "step": 25309 + }, + { + "epoch": 7.768569674647023, + "grad_norm": 0.17092043161392212, + "learning_rate": 1.2502796923176524e-05, + "loss": 1.7014, + "step": 25310 + }, + { + "epoch": 7.768876611418047, + "grad_norm": 0.2363509237766266, + "learning_rate": 1.2499509076903288e-05, + "loss": 1.7489, + "step": 25311 + }, + { + "epoch": 7.769183548189073, + "grad_norm": 0.19223156571388245, + "learning_rate": 
1.2496221601231906e-05, + "loss": 1.7194, + "step": 25312 + }, + { + "epoch": 7.769490484960098, + "grad_norm": 0.18292652070522308, + "learning_rate": 1.249293449619483e-05, + "loss": 1.7422, + "step": 25313 + }, + { + "epoch": 7.769797421731123, + "grad_norm": 0.17120866477489471, + "learning_rate": 1.2489647761824547e-05, + "loss": 1.7367, + "step": 25314 + }, + { + "epoch": 7.770104358502149, + "grad_norm": 0.22178049385547638, + "learning_rate": 1.248636139815358e-05, + "loss": 1.7451, + "step": 25315 + }, + { + "epoch": 7.770411295273174, + "grad_norm": 0.15707750618457794, + "learning_rate": 1.2483075405214346e-05, + "loss": 1.6748, + "step": 25316 + }, + { + "epoch": 7.7707182320441985, + "grad_norm": 0.1570693850517273, + "learning_rate": 1.2479789783039381e-05, + "loss": 1.6895, + "step": 25317 + }, + { + "epoch": 7.771025168815224, + "grad_norm": 0.1687897890806198, + "learning_rate": 1.2476504531661093e-05, + "loss": 1.7145, + "step": 25318 + }, + { + "epoch": 7.771332105586249, + "grad_norm": 0.16047275066375732, + "learning_rate": 1.2473219651112e-05, + "loss": 1.6675, + "step": 25319 + }, + { + "epoch": 7.7716390423572745, + "grad_norm": 0.16817785799503326, + "learning_rate": 1.2469935141424544e-05, + "loss": 1.6678, + "step": 25320 + }, + { + "epoch": 7.7719459791283, + "grad_norm": 0.1511528342962265, + "learning_rate": 1.246665100263118e-05, + "loss": 1.7054, + "step": 25321 + }, + { + "epoch": 7.772252915899324, + "grad_norm": 0.145367830991745, + "learning_rate": 1.2463367234764373e-05, + "loss": 1.7037, + "step": 25322 + }, + { + "epoch": 7.77255985267035, + "grad_norm": 0.1794048696756363, + "learning_rate": 1.2460083837856573e-05, + "loss": 1.7372, + "step": 25323 + }, + { + "epoch": 7.772866789441375, + "grad_norm": 0.21238376200199127, + "learning_rate": 1.2456800811940227e-05, + "loss": 1.7796, + "step": 25324 + }, + { + "epoch": 7.7731737262124, + "grad_norm": 0.23305723071098328, + "learning_rate": 1.2453518157047784e-05, + "loss": 
1.7124, + "step": 25325 + }, + { + "epoch": 7.773480662983426, + "grad_norm": 0.18229269981384277, + "learning_rate": 1.2450235873211673e-05, + "loss": 1.7202, + "step": 25326 + }, + { + "epoch": 7.773787599754451, + "grad_norm": 0.19145874679088593, + "learning_rate": 1.2446953960464346e-05, + "loss": 1.6701, + "step": 25327 + }, + { + "epoch": 7.774094536525475, + "grad_norm": 0.26310765743255615, + "learning_rate": 1.2443672418838215e-05, + "loss": 1.7674, + "step": 25328 + }, + { + "epoch": 7.774401473296501, + "grad_norm": 0.18370535969734192, + "learning_rate": 1.2440391248365756e-05, + "loss": 1.7027, + "step": 25329 + }, + { + "epoch": 7.774708410067526, + "grad_norm": 0.24704128503799438, + "learning_rate": 1.2437110449079348e-05, + "loss": 1.7238, + "step": 25330 + }, + { + "epoch": 7.7750153468385514, + "grad_norm": 0.194215789437294, + "learning_rate": 1.2433830021011433e-05, + "loss": 1.735, + "step": 25331 + }, + { + "epoch": 7.775322283609577, + "grad_norm": 0.24099037051200867, + "learning_rate": 1.2430549964194427e-05, + "loss": 1.7335, + "step": 25332 + }, + { + "epoch": 7.775629220380601, + "grad_norm": 0.1665026843547821, + "learning_rate": 1.242727027866073e-05, + "loss": 1.7245, + "step": 25333 + }, + { + "epoch": 7.775936157151627, + "grad_norm": 0.18005968630313873, + "learning_rate": 1.24239909644428e-05, + "loss": 1.6227, + "step": 25334 + }, + { + "epoch": 7.776243093922652, + "grad_norm": 0.2306728959083557, + "learning_rate": 1.2420712021572983e-05, + "loss": 1.7136, + "step": 25335 + }, + { + "epoch": 7.776550030693677, + "grad_norm": 0.1916062831878662, + "learning_rate": 1.2417433450083738e-05, + "loss": 1.7912, + "step": 25336 + }, + { + "epoch": 7.776856967464703, + "grad_norm": 0.1999555081129074, + "learning_rate": 1.2414155250007437e-05, + "loss": 1.7685, + "step": 25337 + }, + { + "epoch": 7.777163904235728, + "grad_norm": 0.18222710490226746, + "learning_rate": 1.2410877421376488e-05, + "loss": 1.7024, + "step": 25338 + }, + { 
+ "epoch": 7.777470841006752, + "grad_norm": 0.22534650564193726, + "learning_rate": 1.2407599964223276e-05, + "loss": 1.7263, + "step": 25339 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.3313053250312805, + "learning_rate": 1.2404322878580199e-05, + "loss": 1.6988, + "step": 25340 + }, + { + "epoch": 7.778084714548803, + "grad_norm": 0.23691575229167938, + "learning_rate": 1.2401046164479635e-05, + "loss": 1.7771, + "step": 25341 + }, + { + "epoch": 7.778391651319828, + "grad_norm": 0.2119995355606079, + "learning_rate": 1.2397769821953976e-05, + "loss": 1.709, + "step": 25342 + }, + { + "epoch": 7.778698588090853, + "grad_norm": 0.20468266308307648, + "learning_rate": 1.2394493851035588e-05, + "loss": 1.7914, + "step": 25343 + }, + { + "epoch": 7.779005524861878, + "grad_norm": 0.19825033843517303, + "learning_rate": 1.2391218251756854e-05, + "loss": 1.727, + "step": 25344 + }, + { + "epoch": 7.7793124616329035, + "grad_norm": 0.19072072207927704, + "learning_rate": 1.2387943024150134e-05, + "loss": 1.7498, + "step": 25345 + }, + { + "epoch": 7.779619398403929, + "grad_norm": 0.15986371040344238, + "learning_rate": 1.2384668168247832e-05, + "loss": 1.6807, + "step": 25346 + }, + { + "epoch": 7.779926335174954, + "grad_norm": 0.1731162816286087, + "learning_rate": 1.238139368408227e-05, + "loss": 1.7, + "step": 25347 + }, + { + "epoch": 7.7802332719459795, + "grad_norm": 0.1496593952178955, + "learning_rate": 1.237811957168583e-05, + "loss": 1.6558, + "step": 25348 + }, + { + "epoch": 7.780540208717004, + "grad_norm": 0.1982542872428894, + "learning_rate": 1.2374845831090859e-05, + "loss": 1.7888, + "step": 25349 + }, + { + "epoch": 7.780847145488029, + "grad_norm": 0.1517801433801651, + "learning_rate": 1.2371572462329706e-05, + "loss": 1.6743, + "step": 25350 + }, + { + "epoch": 7.781154082259055, + "grad_norm": 0.23794496059417725, + "learning_rate": 1.2368299465434752e-05, + "loss": 1.7332, + "step": 25351 + }, + { + "epoch": 7.78146101903008, + 
"grad_norm": 0.20220822095870972, + "learning_rate": 1.2365026840438288e-05, + "loss": 1.7444, + "step": 25352 + }, + { + "epoch": 7.781767955801105, + "grad_norm": 0.18997377157211304, + "learning_rate": 1.236175458737272e-05, + "loss": 1.771, + "step": 25353 + }, + { + "epoch": 7.78207489257213, + "grad_norm": 0.15465202927589417, + "learning_rate": 1.2358482706270325e-05, + "loss": 1.7072, + "step": 25354 + }, + { + "epoch": 7.782381829343155, + "grad_norm": 0.1759808510541916, + "learning_rate": 1.235521119716348e-05, + "loss": 1.6761, + "step": 25355 + }, + { + "epoch": 7.78268876611418, + "grad_norm": 0.17520606517791748, + "learning_rate": 1.2351940060084505e-05, + "loss": 1.6702, + "step": 25356 + }, + { + "epoch": 7.782995702885206, + "grad_norm": 0.20305509865283966, + "learning_rate": 1.2348669295065717e-05, + "loss": 1.746, + "step": 25357 + }, + { + "epoch": 7.783302639656231, + "grad_norm": 0.14459536969661713, + "learning_rate": 1.2345398902139454e-05, + "loss": 1.6907, + "step": 25358 + }, + { + "epoch": 7.783609576427256, + "grad_norm": 0.18058347702026367, + "learning_rate": 1.2342128881338027e-05, + "loss": 1.796, + "step": 25359 + }, + { + "epoch": 7.783916513198281, + "grad_norm": 0.1778976023197174, + "learning_rate": 1.2338859232693756e-05, + "loss": 1.715, + "step": 25360 + }, + { + "epoch": 7.784223449969306, + "grad_norm": 0.1644120067358017, + "learning_rate": 1.2335589956238953e-05, + "loss": 1.6786, + "step": 25361 + }, + { + "epoch": 7.7845303867403315, + "grad_norm": 0.15315432846546173, + "learning_rate": 1.2332321052005907e-05, + "loss": 1.6503, + "step": 25362 + }, + { + "epoch": 7.784837323511357, + "grad_norm": 0.19160087406635284, + "learning_rate": 1.2329052520026973e-05, + "loss": 1.7131, + "step": 25363 + }, + { + "epoch": 7.785144260282382, + "grad_norm": 0.1778041124343872, + "learning_rate": 1.2325784360334408e-05, + "loss": 1.754, + "step": 25364 + }, + { + "epoch": 7.785451197053407, + "grad_norm": 0.17478828132152557, + 
"learning_rate": 1.2322516572960519e-05, + "loss": 1.7122, + "step": 25365 + }, + { + "epoch": 7.785758133824432, + "grad_norm": 0.2239549458026886, + "learning_rate": 1.2319249157937612e-05, + "loss": 1.7589, + "step": 25366 + }, + { + "epoch": 7.786065070595457, + "grad_norm": 0.21565821766853333, + "learning_rate": 1.2315982115297953e-05, + "loss": 1.7468, + "step": 25367 + }, + { + "epoch": 7.786372007366483, + "grad_norm": 0.1859208643436432, + "learning_rate": 1.231271544507387e-05, + "loss": 1.7289, + "step": 25368 + }, + { + "epoch": 7.786678944137508, + "grad_norm": 0.14813102781772614, + "learning_rate": 1.2309449147297596e-05, + "loss": 1.6543, + "step": 25369 + }, + { + "epoch": 7.786985880908533, + "grad_norm": 0.14101989567279816, + "learning_rate": 1.2306183222001472e-05, + "loss": 1.6775, + "step": 25370 + }, + { + "epoch": 7.787292817679558, + "grad_norm": 0.2041245847940445, + "learning_rate": 1.2302917669217701e-05, + "loss": 1.6874, + "step": 25371 + }, + { + "epoch": 7.787599754450583, + "grad_norm": 0.17343124747276306, + "learning_rate": 1.2299652488978614e-05, + "loss": 1.7005, + "step": 25372 + }, + { + "epoch": 7.787906691221608, + "grad_norm": 0.20174655318260193, + "learning_rate": 1.2296387681316451e-05, + "loss": 1.8073, + "step": 25373 + }, + { + "epoch": 7.788213627992634, + "grad_norm": 0.21615192294120789, + "learning_rate": 1.2293123246263488e-05, + "loss": 1.7045, + "step": 25374 + }, + { + "epoch": 7.788520564763659, + "grad_norm": 0.18587705492973328, + "learning_rate": 1.2289859183851981e-05, + "loss": 1.7497, + "step": 25375 + }, + { + "epoch": 7.7888275015346835, + "grad_norm": 0.16649113595485687, + "learning_rate": 1.228659549411419e-05, + "loss": 1.6695, + "step": 25376 + }, + { + "epoch": 7.789134438305709, + "grad_norm": 0.16547587513923645, + "learning_rate": 1.2283332177082362e-05, + "loss": 1.7119, + "step": 25377 + }, + { + "epoch": 7.789441375076734, + "grad_norm": 0.17672663927078247, + "learning_rate": 
1.2280069232788755e-05, + "loss": 1.7458, + "step": 25378 + }, + { + "epoch": 7.7897483118477595, + "grad_norm": 0.15436655282974243, + "learning_rate": 1.22768066612656e-05, + "loss": 1.723, + "step": 25379 + }, + { + "epoch": 7.790055248618785, + "grad_norm": 0.1699141561985016, + "learning_rate": 1.2273544462545178e-05, + "loss": 1.7083, + "step": 25380 + }, + { + "epoch": 7.79036218538981, + "grad_norm": 0.18014399707317352, + "learning_rate": 1.2270282636659686e-05, + "loss": 1.7512, + "step": 25381 + }, + { + "epoch": 7.790669122160835, + "grad_norm": 0.1807268261909485, + "learning_rate": 1.2267021183641375e-05, + "loss": 1.7404, + "step": 25382 + }, + { + "epoch": 7.79097605893186, + "grad_norm": 0.16704204678535461, + "learning_rate": 1.2263760103522481e-05, + "loss": 1.6723, + "step": 25383 + }, + { + "epoch": 7.791282995702885, + "grad_norm": 0.1551518738269806, + "learning_rate": 1.2260499396335206e-05, + "loss": 1.7, + "step": 25384 + }, + { + "epoch": 7.791589932473911, + "grad_norm": 0.16270415484905243, + "learning_rate": 1.225723906211183e-05, + "loss": 1.7238, + "step": 25385 + }, + { + "epoch": 7.791896869244935, + "grad_norm": 0.19548700749874115, + "learning_rate": 1.225397910088451e-05, + "loss": 1.7192, + "step": 25386 + }, + { + "epoch": 7.79220380601596, + "grad_norm": 0.19115851819515228, + "learning_rate": 1.225071951268552e-05, + "loss": 1.753, + "step": 25387 + }, + { + "epoch": 7.792510742786986, + "grad_norm": 0.1557070016860962, + "learning_rate": 1.224746029754702e-05, + "loss": 1.6791, + "step": 25388 + }, + { + "epoch": 7.792817679558011, + "grad_norm": 0.16580358147621155, + "learning_rate": 1.2244201455501252e-05, + "loss": 1.6799, + "step": 25389 + }, + { + "epoch": 7.793124616329036, + "grad_norm": 0.18099573254585266, + "learning_rate": 1.2240942986580422e-05, + "loss": 1.7546, + "step": 25390 + }, + { + "epoch": 7.793431553100062, + "grad_norm": 0.2411479502916336, + "learning_rate": 1.223768489081672e-05, + "loss": 1.7315, 
+ "step": 25391 + }, + { + "epoch": 7.793738489871086, + "grad_norm": 0.14678087830543518, + "learning_rate": 1.2234427168242351e-05, + "loss": 1.6733, + "step": 25392 + }, + { + "epoch": 7.7940454266421115, + "grad_norm": 0.17501497268676758, + "learning_rate": 1.223116981888951e-05, + "loss": 1.7416, + "step": 25393 + }, + { + "epoch": 7.794352363413137, + "grad_norm": 0.25460878014564514, + "learning_rate": 1.2227912842790384e-05, + "loss": 1.7873, + "step": 25394 + }, + { + "epoch": 7.794659300184162, + "grad_norm": 0.1701650321483612, + "learning_rate": 1.2224656239977161e-05, + "loss": 1.686, + "step": 25395 + }, + { + "epoch": 7.7949662369551875, + "grad_norm": 0.15684448182582855, + "learning_rate": 1.2221400010482009e-05, + "loss": 1.6768, + "step": 25396 + }, + { + "epoch": 7.795273173726212, + "grad_norm": 0.19048964977264404, + "learning_rate": 1.2218144154337158e-05, + "loss": 1.744, + "step": 25397 + }, + { + "epoch": 7.795580110497237, + "grad_norm": 0.20939184725284576, + "learning_rate": 1.2214888671574737e-05, + "loss": 1.818, + "step": 25398 + }, + { + "epoch": 7.795887047268263, + "grad_norm": 0.18450765311717987, + "learning_rate": 1.2211633562226932e-05, + "loss": 1.6972, + "step": 25399 + }, + { + "epoch": 7.796193984039288, + "grad_norm": 0.20349545776844025, + "learning_rate": 1.2208378826325912e-05, + "loss": 1.7784, + "step": 25400 + }, + { + "epoch": 7.796500920810313, + "grad_norm": 0.17835615575313568, + "learning_rate": 1.2205124463903828e-05, + "loss": 1.7203, + "step": 25401 + }, + { + "epoch": 7.796807857581339, + "grad_norm": 0.1525154411792755, + "learning_rate": 1.2201870474992882e-05, + "loss": 1.7194, + "step": 25402 + }, + { + "epoch": 7.797114794352363, + "grad_norm": 0.15197598934173584, + "learning_rate": 1.2198616859625184e-05, + "loss": 1.6787, + "step": 25403 + }, + { + "epoch": 7.797421731123388, + "grad_norm": 0.1602524071931839, + "learning_rate": 1.2195363617832934e-05, + "loss": 1.6919, + "step": 25404 + }, + { + 
"epoch": 7.797728667894414, + "grad_norm": 0.15638625621795654, + "learning_rate": 1.2192110749648233e-05, + "loss": 1.6945, + "step": 25405 + }, + { + "epoch": 7.798035604665439, + "grad_norm": 0.15247012674808502, + "learning_rate": 1.2188858255103264e-05, + "loss": 1.673, + "step": 25406 + }, + { + "epoch": 7.798342541436464, + "grad_norm": 0.16753807663917542, + "learning_rate": 1.218560613423016e-05, + "loss": 1.7088, + "step": 25407 + }, + { + "epoch": 7.798649478207489, + "grad_norm": 0.17434635758399963, + "learning_rate": 1.2182354387061063e-05, + "loss": 1.7279, + "step": 25408 + }, + { + "epoch": 7.798956414978514, + "grad_norm": 0.21984371542930603, + "learning_rate": 1.2179103013628108e-05, + "loss": 1.7203, + "step": 25409 + }, + { + "epoch": 7.7992633517495396, + "grad_norm": 0.18304525315761566, + "learning_rate": 1.2175852013963418e-05, + "loss": 1.6937, + "step": 25410 + }, + { + "epoch": 7.799570288520565, + "grad_norm": 0.20372866094112396, + "learning_rate": 1.2172601388099131e-05, + "loss": 1.6911, + "step": 25411 + }, + { + "epoch": 7.79987722529159, + "grad_norm": 0.2012174129486084, + "learning_rate": 1.216935113606737e-05, + "loss": 1.7365, + "step": 25412 + }, + { + "epoch": 7.800184162062616, + "grad_norm": 0.2146923542022705, + "learning_rate": 1.2166101257900236e-05, + "loss": 1.711, + "step": 25413 + }, + { + "epoch": 7.80049109883364, + "grad_norm": 0.202762633562088, + "learning_rate": 1.2162851753629895e-05, + "loss": 1.7459, + "step": 25414 + }, + { + "epoch": 7.800798035604665, + "grad_norm": 0.19161204993724823, + "learning_rate": 1.2159602623288418e-05, + "loss": 1.687, + "step": 25415 + }, + { + "epoch": 7.801104972375691, + "grad_norm": 0.2027188539505005, + "learning_rate": 1.2156353866907927e-05, + "loss": 1.7482, + "step": 25416 + }, + { + "epoch": 7.801411909146716, + "grad_norm": 0.17790403962135315, + "learning_rate": 1.2153105484520521e-05, + "loss": 1.7047, + "step": 25417 + }, + { + "epoch": 7.8017188459177405, + 
"grad_norm": 0.18325060606002808, + "learning_rate": 1.21498574761583e-05, + "loss": 1.693, + "step": 25418 + }, + { + "epoch": 7.802025782688766, + "grad_norm": 0.14223991334438324, + "learning_rate": 1.2146609841853401e-05, + "loss": 1.7168, + "step": 25419 + }, + { + "epoch": 7.802332719459791, + "grad_norm": 0.18397340178489685, + "learning_rate": 1.2143362581637863e-05, + "loss": 1.7234, + "step": 25420 + }, + { + "epoch": 7.8026396562308165, + "grad_norm": 0.16903668642044067, + "learning_rate": 1.214011569554383e-05, + "loss": 1.6884, + "step": 25421 + }, + { + "epoch": 7.802946593001842, + "grad_norm": 0.15086103975772858, + "learning_rate": 1.2136869183603339e-05, + "loss": 1.6712, + "step": 25422 + }, + { + "epoch": 7.803253529772867, + "grad_norm": 0.1743185818195343, + "learning_rate": 1.2133623045848507e-05, + "loss": 1.7167, + "step": 25423 + }, + { + "epoch": 7.803560466543892, + "grad_norm": 0.160976842045784, + "learning_rate": 1.2130377282311411e-05, + "loss": 1.7749, + "step": 25424 + }, + { + "epoch": 7.803867403314917, + "grad_norm": 0.2554323971271515, + "learning_rate": 1.2127131893024123e-05, + "loss": 1.7156, + "step": 25425 + }, + { + "epoch": 7.804174340085942, + "grad_norm": 0.1582731157541275, + "learning_rate": 1.2123886878018714e-05, + "loss": 1.7088, + "step": 25426 + }, + { + "epoch": 7.804481276856968, + "grad_norm": 0.18008622527122498, + "learning_rate": 1.2120642237327257e-05, + "loss": 1.6928, + "step": 25427 + }, + { + "epoch": 7.804788213627993, + "grad_norm": 0.29349491000175476, + "learning_rate": 1.2117397970981815e-05, + "loss": 1.7596, + "step": 25428 + }, + { + "epoch": 7.805095150399017, + "grad_norm": 0.20927627384662628, + "learning_rate": 1.211415407901445e-05, + "loss": 1.7113, + "step": 25429 + }, + { + "epoch": 7.805402087170043, + "grad_norm": 0.2126142680644989, + "learning_rate": 1.21109105614572e-05, + "loss": 1.7125, + "step": 25430 + }, + { + "epoch": 7.805709023941068, + "grad_norm": 0.20456665754318237, + 
"learning_rate": 1.2107667418342172e-05, + "loss": 1.7619, + "step": 25431 + }, + { + "epoch": 7.806015960712093, + "grad_norm": 0.17268066108226776, + "learning_rate": 1.2104424649701373e-05, + "loss": 1.6462, + "step": 25432 + }, + { + "epoch": 7.806322897483119, + "grad_norm": 0.16213946044445038, + "learning_rate": 1.2101182255566856e-05, + "loss": 1.6787, + "step": 25433 + }, + { + "epoch": 7.806629834254144, + "grad_norm": 0.17202046513557434, + "learning_rate": 1.2097940235970673e-05, + "loss": 1.7081, + "step": 25434 + }, + { + "epoch": 7.8069367710251685, + "grad_norm": 0.2076229751110077, + "learning_rate": 1.2094698590944842e-05, + "loss": 1.6832, + "step": 25435 + }, + { + "epoch": 7.807243707796194, + "grad_norm": 0.17209482192993164, + "learning_rate": 1.2091457320521448e-05, + "loss": 1.7722, + "step": 25436 + }, + { + "epoch": 7.807550644567219, + "grad_norm": 0.2185208946466446, + "learning_rate": 1.2088216424732463e-05, + "loss": 1.7536, + "step": 25437 + }, + { + "epoch": 7.8078575813382445, + "grad_norm": 0.1812329739332199, + "learning_rate": 1.2084975903609968e-05, + "loss": 1.7275, + "step": 25438 + }, + { + "epoch": 7.80816451810927, + "grad_norm": 0.20143690705299377, + "learning_rate": 1.208173575718594e-05, + "loss": 1.7533, + "step": 25439 + }, + { + "epoch": 7.808471454880294, + "grad_norm": 0.18351776897907257, + "learning_rate": 1.2078495985492433e-05, + "loss": 1.6831, + "step": 25440 + }, + { + "epoch": 7.80877839165132, + "grad_norm": 0.15470999479293823, + "learning_rate": 1.2075256588561462e-05, + "loss": 1.6862, + "step": 25441 + }, + { + "epoch": 7.809085328422345, + "grad_norm": 0.1751607209444046, + "learning_rate": 1.2072017566425032e-05, + "loss": 1.7182, + "step": 25442 + }, + { + "epoch": 7.80939226519337, + "grad_norm": 0.16465237736701965, + "learning_rate": 1.2068778919115153e-05, + "loss": 1.7055, + "step": 25443 + }, + { + "epoch": 7.809699201964396, + "grad_norm": 0.13899528980255127, + "learning_rate": 
1.2065540646663832e-05, + "loss": 1.634, + "step": 25444 + }, + { + "epoch": 7.810006138735421, + "grad_norm": 0.21526047587394714, + "learning_rate": 1.2062302749103072e-05, + "loss": 1.759, + "step": 25445 + }, + { + "epoch": 7.810313075506445, + "grad_norm": 0.1628599315881729, + "learning_rate": 1.2059065226464872e-05, + "loss": 1.6782, + "step": 25446 + }, + { + "epoch": 7.810620012277471, + "grad_norm": 0.16853751242160797, + "learning_rate": 1.2055828078781217e-05, + "loss": 1.7123, + "step": 25447 + }, + { + "epoch": 7.810926949048496, + "grad_norm": 0.17399325966835022, + "learning_rate": 1.2052591306084138e-05, + "loss": 1.7394, + "step": 25448 + }, + { + "epoch": 7.811233885819521, + "grad_norm": 0.16147997975349426, + "learning_rate": 1.2049354908405574e-05, + "loss": 1.66, + "step": 25449 + }, + { + "epoch": 7.811540822590547, + "grad_norm": 0.1806066632270813, + "learning_rate": 1.204611888577753e-05, + "loss": 1.7193, + "step": 25450 + }, + { + "epoch": 7.811847759361571, + "grad_norm": 0.14491340517997742, + "learning_rate": 1.2042883238231984e-05, + "loss": 1.6996, + "step": 25451 + }, + { + "epoch": 7.8121546961325965, + "grad_norm": 0.24257591366767883, + "learning_rate": 1.2039647965800905e-05, + "loss": 1.734, + "step": 25452 + }, + { + "epoch": 7.812461632903622, + "grad_norm": 0.17281031608581543, + "learning_rate": 1.2036413068516295e-05, + "loss": 1.7469, + "step": 25453 + }, + { + "epoch": 7.812768569674647, + "grad_norm": 0.16350387036800385, + "learning_rate": 1.2033178546410073e-05, + "loss": 1.6755, + "step": 25454 + }, + { + "epoch": 7.8130755064456725, + "grad_norm": 0.21092571318149567, + "learning_rate": 1.202994439951427e-05, + "loss": 1.7538, + "step": 25455 + }, + { + "epoch": 7.813382443216698, + "grad_norm": 0.13705989718437195, + "learning_rate": 1.2026710627860777e-05, + "loss": 1.6563, + "step": 25456 + }, + { + "epoch": 7.813689379987722, + "grad_norm": 0.2368711531162262, + "learning_rate": 1.20234772314816e-05, + "loss": 
1.7685, + "step": 25457 + }, + { + "epoch": 7.813996316758748, + "grad_norm": 0.19303718209266663, + "learning_rate": 1.2020244210408682e-05, + "loss": 1.7286, + "step": 25458 + }, + { + "epoch": 7.814303253529773, + "grad_norm": 0.17113862931728363, + "learning_rate": 1.2017011564673974e-05, + "loss": 1.6336, + "step": 25459 + }, + { + "epoch": 7.814610190300798, + "grad_norm": 0.2151467204093933, + "learning_rate": 1.2013779294309418e-05, + "loss": 1.7585, + "step": 25460 + }, + { + "epoch": 7.814917127071823, + "grad_norm": 0.21620413661003113, + "learning_rate": 1.2010547399346961e-05, + "loss": 1.7058, + "step": 25461 + }, + { + "epoch": 7.815224063842848, + "grad_norm": 0.20134735107421875, + "learning_rate": 1.2007315879818537e-05, + "loss": 1.7833, + "step": 25462 + }, + { + "epoch": 7.815531000613873, + "grad_norm": 0.16653650999069214, + "learning_rate": 1.2004084735756088e-05, + "loss": 1.7022, + "step": 25463 + }, + { + "epoch": 7.815837937384899, + "grad_norm": 0.2135760486125946, + "learning_rate": 1.2000853967191527e-05, + "loss": 1.7502, + "step": 25464 + }, + { + "epoch": 7.816144874155924, + "grad_norm": 0.19773945212364197, + "learning_rate": 1.199762357415683e-05, + "loss": 1.7369, + "step": 25465 + }, + { + "epoch": 7.816451810926949, + "grad_norm": 0.1873825341463089, + "learning_rate": 1.1994393556683876e-05, + "loss": 1.6921, + "step": 25466 + }, + { + "epoch": 7.816758747697974, + "grad_norm": 0.19304445385932922, + "learning_rate": 1.1991163914804604e-05, + "loss": 1.6934, + "step": 25467 + }, + { + "epoch": 7.817065684468999, + "grad_norm": 0.16338905692100525, + "learning_rate": 1.1987934648550924e-05, + "loss": 1.6523, + "step": 25468 + }, + { + "epoch": 7.8173726212400245, + "grad_norm": 0.16972069442272186, + "learning_rate": 1.198470575795474e-05, + "loss": 1.6907, + "step": 25469 + }, + { + "epoch": 7.81767955801105, + "grad_norm": 0.17251834273338318, + "learning_rate": 1.1981477243048e-05, + "loss": 1.7336, + "step": 25470 + }, + 
{ + "epoch": 7.817986494782075, + "grad_norm": 0.17767611145973206, + "learning_rate": 1.197824910386256e-05, + "loss": 1.6809, + "step": 25471 + }, + { + "epoch": 7.8182934315531, + "grad_norm": 0.1854296773672104, + "learning_rate": 1.197502134043038e-05, + "loss": 1.6938, + "step": 25472 + }, + { + "epoch": 7.818600368324125, + "grad_norm": 0.15811395645141602, + "learning_rate": 1.1971793952783295e-05, + "loss": 1.6346, + "step": 25473 + }, + { + "epoch": 7.81890730509515, + "grad_norm": 0.1668241322040558, + "learning_rate": 1.196856694095324e-05, + "loss": 1.7014, + "step": 25474 + }, + { + "epoch": 7.819214241866176, + "grad_norm": 0.16705112159252167, + "learning_rate": 1.1965340304972105e-05, + "loss": 1.7509, + "step": 25475 + }, + { + "epoch": 7.819521178637201, + "grad_norm": 0.1737189143896103, + "learning_rate": 1.1962114044871764e-05, + "loss": 1.6934, + "step": 25476 + }, + { + "epoch": 7.819828115408226, + "grad_norm": 0.21887148916721344, + "learning_rate": 1.1958888160684112e-05, + "loss": 1.7163, + "step": 25477 + }, + { + "epoch": 7.820135052179251, + "grad_norm": 0.19267810881137848, + "learning_rate": 1.1955662652441018e-05, + "loss": 1.6941, + "step": 25478 + }, + { + "epoch": 7.820441988950276, + "grad_norm": 0.19797572493553162, + "learning_rate": 1.195243752017437e-05, + "loss": 1.7067, + "step": 25479 + }, + { + "epoch": 7.820748925721301, + "grad_norm": 0.20177066326141357, + "learning_rate": 1.1949212763916035e-05, + "loss": 1.7186, + "step": 25480 + }, + { + "epoch": 7.821055862492327, + "grad_norm": 0.1789240539073944, + "learning_rate": 1.1945988383697876e-05, + "loss": 1.7533, + "step": 25481 + }, + { + "epoch": 7.821362799263352, + "grad_norm": 0.2210909128189087, + "learning_rate": 1.1942764379551769e-05, + "loss": 1.7255, + "step": 25482 + }, + { + "epoch": 7.8216697360343765, + "grad_norm": 0.17705149948596954, + "learning_rate": 1.193954075150957e-05, + "loss": 1.6797, + "step": 25483 + }, + { + "epoch": 7.821976672805402, + 
"grad_norm": 0.17962488532066345, + "learning_rate": 1.1936317499603134e-05, + "loss": 1.7134, + "step": 25484 + }, + { + "epoch": 7.822283609576427, + "grad_norm": 0.2144375741481781, + "learning_rate": 1.193309462386432e-05, + "loss": 1.6837, + "step": 25485 + }, + { + "epoch": 7.8225905463474525, + "grad_norm": 0.19018805027008057, + "learning_rate": 1.1929872124324976e-05, + "loss": 1.7377, + "step": 25486 + }, + { + "epoch": 7.822897483118478, + "grad_norm": 0.2281246781349182, + "learning_rate": 1.1926650001016953e-05, + "loss": 1.755, + "step": 25487 + }, + { + "epoch": 7.823204419889503, + "grad_norm": 0.17724375426769257, + "learning_rate": 1.1923428253972069e-05, + "loss": 1.7018, + "step": 25488 + }, + { + "epoch": 7.823511356660528, + "grad_norm": 0.19313837587833405, + "learning_rate": 1.1920206883222218e-05, + "loss": 1.705, + "step": 25489 + }, + { + "epoch": 7.823818293431553, + "grad_norm": 0.1883455514907837, + "learning_rate": 1.191698588879917e-05, + "loss": 1.66, + "step": 25490 + }, + { + "epoch": 7.824125230202578, + "grad_norm": 0.20110155642032623, + "learning_rate": 1.1913765270734805e-05, + "loss": 1.7456, + "step": 25491 + }, + { + "epoch": 7.824432166973604, + "grad_norm": 0.23234841227531433, + "learning_rate": 1.1910545029060938e-05, + "loss": 1.6987, + "step": 25492 + }, + { + "epoch": 7.824739103744628, + "grad_norm": 0.208989679813385, + "learning_rate": 1.1907325163809386e-05, + "loss": 1.7753, + "step": 25493 + }, + { + "epoch": 7.8250460405156534, + "grad_norm": 0.19063059985637665, + "learning_rate": 1.1904105675011972e-05, + "loss": 1.6664, + "step": 25494 + }, + { + "epoch": 7.825352977286679, + "grad_norm": 0.16878041625022888, + "learning_rate": 1.1900886562700519e-05, + "loss": 1.6886, + "step": 25495 + }, + { + "epoch": 7.825659914057704, + "grad_norm": 0.19139298796653748, + "learning_rate": 1.1897667826906834e-05, + "loss": 1.7195, + "step": 25496 + }, + { + "epoch": 7.8259668508287294, + "grad_norm": 0.255795419216156, 
+ "learning_rate": 1.1894449467662728e-05, + "loss": 1.7835, + "step": 25497 + }, + { + "epoch": 7.826273787599755, + "grad_norm": 0.17967084050178528, + "learning_rate": 1.1891231485000004e-05, + "loss": 1.6959, + "step": 25498 + }, + { + "epoch": 7.82658072437078, + "grad_norm": 0.23582984507083893, + "learning_rate": 1.1888013878950471e-05, + "loss": 1.7252, + "step": 25499 + }, + { + "epoch": 7.826887661141805, + "grad_norm": 0.189914271235466, + "learning_rate": 1.188479664954592e-05, + "loss": 1.7216, + "step": 25500 + }, + { + "epoch": 7.82719459791283, + "grad_norm": 0.19840605556964874, + "learning_rate": 1.1881579796818148e-05, + "loss": 1.714, + "step": 25501 + }, + { + "epoch": 7.827501534683855, + "grad_norm": 0.25255537033081055, + "learning_rate": 1.1878363320798946e-05, + "loss": 1.7008, + "step": 25502 + }, + { + "epoch": 7.827808471454881, + "grad_norm": 0.1863456666469574, + "learning_rate": 1.1875147221520105e-05, + "loss": 1.7804, + "step": 25503 + }, + { + "epoch": 7.828115408225905, + "grad_norm": 0.2700684368610382, + "learning_rate": 1.1871931499013405e-05, + "loss": 1.6756, + "step": 25504 + }, + { + "epoch": 7.82842234499693, + "grad_norm": 0.19838537275791168, + "learning_rate": 1.1868716153310604e-05, + "loss": 1.6828, + "step": 25505 + }, + { + "epoch": 7.828729281767956, + "grad_norm": 0.1896767020225525, + "learning_rate": 1.1865501184443533e-05, + "loss": 1.7014, + "step": 25506 + }, + { + "epoch": 7.829036218538981, + "grad_norm": 0.2330249398946762, + "learning_rate": 1.1862286592443905e-05, + "loss": 1.7509, + "step": 25507 + }, + { + "epoch": 7.829343155310006, + "grad_norm": 0.17078560590744019, + "learning_rate": 1.1859072377343539e-05, + "loss": 1.6742, + "step": 25508 + }, + { + "epoch": 7.829650092081032, + "grad_norm": 0.2834900915622711, + "learning_rate": 1.1855858539174146e-05, + "loss": 1.7676, + "step": 25509 + }, + { + "epoch": 7.829957028852056, + "grad_norm": 0.18936461210250854, + "learning_rate": 
1.1852645077967533e-05, + "loss": 1.7374, + "step": 25510 + }, + { + "epoch": 7.8302639656230815, + "grad_norm": 0.2720448970794678, + "learning_rate": 1.1849431993755439e-05, + "loss": 1.7001, + "step": 25511 + }, + { + "epoch": 7.830570902394107, + "grad_norm": 0.18198262155056, + "learning_rate": 1.184621928656962e-05, + "loss": 1.6679, + "step": 25512 + }, + { + "epoch": 7.830877839165132, + "grad_norm": 0.16957701742649078, + "learning_rate": 1.1843006956441821e-05, + "loss": 1.7064, + "step": 25513 + }, + { + "epoch": 7.8311847759361575, + "grad_norm": 0.18632464110851288, + "learning_rate": 1.1839795003403798e-05, + "loss": 1.6857, + "step": 25514 + }, + { + "epoch": 7.831491712707182, + "grad_norm": 0.15639352798461914, + "learning_rate": 1.183658342748728e-05, + "loss": 1.695, + "step": 25515 + }, + { + "epoch": 7.831798649478207, + "grad_norm": 0.17000986635684967, + "learning_rate": 1.1833372228724016e-05, + "loss": 1.696, + "step": 25516 + }, + { + "epoch": 7.832105586249233, + "grad_norm": 0.23334810137748718, + "learning_rate": 1.1830161407145735e-05, + "loss": 1.7574, + "step": 25517 + }, + { + "epoch": 7.832412523020258, + "grad_norm": 0.16260294616222382, + "learning_rate": 1.1826950962784177e-05, + "loss": 1.667, + "step": 25518 + }, + { + "epoch": 7.832719459791283, + "grad_norm": 0.18244150280952454, + "learning_rate": 1.1823740895671059e-05, + "loss": 1.6836, + "step": 25519 + }, + { + "epoch": 7.833026396562309, + "grad_norm": 0.18404243886470795, + "learning_rate": 1.182053120583811e-05, + "loss": 1.6922, + "step": 25520 + }, + { + "epoch": 7.833333333333333, + "grad_norm": 0.22713635861873627, + "learning_rate": 1.1817321893317052e-05, + "loss": 1.8055, + "step": 25521 + }, + { + "epoch": 7.833640270104358, + "grad_norm": 0.14314736425876617, + "learning_rate": 1.1814112958139577e-05, + "loss": 1.6624, + "step": 25522 + }, + { + "epoch": 7.833947206875384, + "grad_norm": 0.1947709321975708, + "learning_rate": 1.1810904400337458e-05, + 
"loss": 1.8108, + "step": 25523 + }, + { + "epoch": 7.834254143646409, + "grad_norm": 0.1811491698026657, + "learning_rate": 1.1807696219942326e-05, + "loss": 1.7258, + "step": 25524 + }, + { + "epoch": 7.834561080417434, + "grad_norm": 0.16776522994041443, + "learning_rate": 1.1804488416985966e-05, + "loss": 1.6834, + "step": 25525 + }, + { + "epoch": 7.834868017188459, + "grad_norm": 0.1590484231710434, + "learning_rate": 1.1801280991500002e-05, + "loss": 1.6797, + "step": 25526 + }, + { + "epoch": 7.835174953959484, + "grad_norm": 0.1564435064792633, + "learning_rate": 1.179807394351618e-05, + "loss": 1.7035, + "step": 25527 + }, + { + "epoch": 7.8354818907305095, + "grad_norm": 0.17740637063980103, + "learning_rate": 1.1794867273066184e-05, + "loss": 1.6844, + "step": 25528 + }, + { + "epoch": 7.835788827501535, + "grad_norm": 0.17152990400791168, + "learning_rate": 1.1791660980181707e-05, + "loss": 1.6745, + "step": 25529 + }, + { + "epoch": 7.83609576427256, + "grad_norm": 0.17763324081897736, + "learning_rate": 1.1788455064894427e-05, + "loss": 1.6941, + "step": 25530 + }, + { + "epoch": 7.8364027010435855, + "grad_norm": 0.16168560087680817, + "learning_rate": 1.178524952723603e-05, + "loss": 1.6955, + "step": 25531 + }, + { + "epoch": 7.83670963781461, + "grad_norm": 0.1819266527891159, + "learning_rate": 1.1782044367238199e-05, + "loss": 1.6838, + "step": 25532 + }, + { + "epoch": 7.837016574585635, + "grad_norm": 0.16239593923091888, + "learning_rate": 1.1778839584932605e-05, + "loss": 1.7045, + "step": 25533 + }, + { + "epoch": 7.837323511356661, + "grad_norm": 0.18346372246742249, + "learning_rate": 1.177563518035092e-05, + "loss": 1.7418, + "step": 25534 + }, + { + "epoch": 7.837630448127686, + "grad_norm": 0.18437781929969788, + "learning_rate": 1.177243115352481e-05, + "loss": 1.7138, + "step": 25535 + }, + { + "epoch": 7.83793738489871, + "grad_norm": 0.16199420392513275, + "learning_rate": 1.1769227504485942e-05, + "loss": 1.7115, + "step": 25536 
+ }, + { + "epoch": 7.838244321669736, + "grad_norm": 0.174173504114151, + "learning_rate": 1.1766024233265977e-05, + "loss": 1.7115, + "step": 25537 + }, + { + "epoch": 7.838551258440761, + "grad_norm": 0.1924828737974167, + "learning_rate": 1.1762821339896567e-05, + "loss": 1.7343, + "step": 25538 + }, + { + "epoch": 7.838858195211786, + "grad_norm": 0.20509763062000275, + "learning_rate": 1.1759618824409357e-05, + "loss": 1.7296, + "step": 25539 + }, + { + "epoch": 7.839165131982812, + "grad_norm": 0.1762499213218689, + "learning_rate": 1.1756416686836035e-05, + "loss": 1.6721, + "step": 25540 + }, + { + "epoch": 7.839472068753837, + "grad_norm": 0.17260326445102692, + "learning_rate": 1.175321492720819e-05, + "loss": 1.7238, + "step": 25541 + }, + { + "epoch": 7.8397790055248615, + "grad_norm": 0.21378587186336517, + "learning_rate": 1.175001354555752e-05, + "loss": 1.7442, + "step": 25542 + }, + { + "epoch": 7.840085942295887, + "grad_norm": 0.20900048315525055, + "learning_rate": 1.1746812541915608e-05, + "loss": 1.7426, + "step": 25543 + }, + { + "epoch": 7.840392879066912, + "grad_norm": 0.2082734853029251, + "learning_rate": 1.1743611916314129e-05, + "loss": 1.7209, + "step": 25544 + }, + { + "epoch": 7.8406998158379375, + "grad_norm": 0.1696191281080246, + "learning_rate": 1.1740411668784701e-05, + "loss": 1.7039, + "step": 25545 + }, + { + "epoch": 7.841006752608963, + "grad_norm": 0.18812915682792664, + "learning_rate": 1.173721179935895e-05, + "loss": 1.6873, + "step": 25546 + }, + { + "epoch": 7.841313689379987, + "grad_norm": 0.19983457028865814, + "learning_rate": 1.1734012308068493e-05, + "loss": 1.701, + "step": 25547 + }, + { + "epoch": 7.841620626151013, + "grad_norm": 0.18811485171318054, + "learning_rate": 1.1730813194944962e-05, + "loss": 1.7466, + "step": 25548 + }, + { + "epoch": 7.841927562922038, + "grad_norm": 0.16648226976394653, + "learning_rate": 1.172761446001996e-05, + "loss": 1.7449, + "step": 25549 + }, + { + "epoch": 
7.842234499693063, + "grad_norm": 0.17902494966983795, + "learning_rate": 1.1724416103325104e-05, + "loss": 1.7395, + "step": 25550 + }, + { + "epoch": 7.842541436464089, + "grad_norm": 0.2420952469110489, + "learning_rate": 1.1721218124892003e-05, + "loss": 1.728, + "step": 25551 + }, + { + "epoch": 7.842848373235114, + "grad_norm": 0.16240666806697845, + "learning_rate": 1.1718020524752266e-05, + "loss": 1.6368, + "step": 25552 + }, + { + "epoch": 7.843155310006138, + "grad_norm": 0.17968396842479706, + "learning_rate": 1.1714823302937483e-05, + "loss": 1.729, + "step": 25553 + }, + { + "epoch": 7.843462246777164, + "grad_norm": 0.17617417871952057, + "learning_rate": 1.1711626459479252e-05, + "loss": 1.6975, + "step": 25554 + }, + { + "epoch": 7.843769183548189, + "grad_norm": 0.1679859161376953, + "learning_rate": 1.1708429994409176e-05, + "loss": 1.6955, + "step": 25555 + }, + { + "epoch": 7.844076120319214, + "grad_norm": 0.1653962880373001, + "learning_rate": 1.1705233907758823e-05, + "loss": 1.7107, + "step": 25556 + }, + { + "epoch": 7.84438305709024, + "grad_norm": 0.190699502825737, + "learning_rate": 1.1702038199559817e-05, + "loss": 1.75, + "step": 25557 + }, + { + "epoch": 7.844689993861264, + "grad_norm": 0.17185768485069275, + "learning_rate": 1.1698842869843696e-05, + "loss": 1.7087, + "step": 25558 + }, + { + "epoch": 7.8449969306322895, + "grad_norm": 0.17880931496620178, + "learning_rate": 1.1695647918642084e-05, + "loss": 1.7082, + "step": 25559 + }, + { + "epoch": 7.845303867403315, + "grad_norm": 0.15360671281814575, + "learning_rate": 1.1692453345986498e-05, + "loss": 1.7028, + "step": 25560 + }, + { + "epoch": 7.84561080417434, + "grad_norm": 0.16576705873012543, + "learning_rate": 1.168925915190856e-05, + "loss": 1.7147, + "step": 25561 + }, + { + "epoch": 7.8459177409453655, + "grad_norm": 0.14623773097991943, + "learning_rate": 1.1686065336439817e-05, + "loss": 1.682, + "step": 25562 + }, + { + "epoch": 7.846224677716391, + "grad_norm": 
0.16677425801753998, + "learning_rate": 1.168287189961183e-05, + "loss": 1.7089, + "step": 25563 + }, + { + "epoch": 7.846531614487415, + "grad_norm": 0.160381019115448, + "learning_rate": 1.1679678841456164e-05, + "loss": 1.6929, + "step": 25564 + }, + { + "epoch": 7.846838551258441, + "grad_norm": 0.1775302290916443, + "learning_rate": 1.1676486162004374e-05, + "loss": 1.6947, + "step": 25565 + }, + { + "epoch": 7.847145488029466, + "grad_norm": 0.1681419014930725, + "learning_rate": 1.1673293861288003e-05, + "loss": 1.7173, + "step": 25566 + }, + { + "epoch": 7.847452424800491, + "grad_norm": 0.18374401330947876, + "learning_rate": 1.1670101939338613e-05, + "loss": 1.7175, + "step": 25567 + }, + { + "epoch": 7.847759361571516, + "grad_norm": 0.19383086264133453, + "learning_rate": 1.1666910396187736e-05, + "loss": 1.6962, + "step": 25568 + }, + { + "epoch": 7.848066298342541, + "grad_norm": 0.16849574446678162, + "learning_rate": 1.1663719231866921e-05, + "loss": 1.6717, + "step": 25569 + }, + { + "epoch": 7.848373235113566, + "grad_norm": 0.2510664165019989, + "learning_rate": 1.1660528446407703e-05, + "loss": 1.7983, + "step": 25570 + }, + { + "epoch": 7.848680171884592, + "grad_norm": 0.21037714183330536, + "learning_rate": 1.1657338039841614e-05, + "loss": 1.7287, + "step": 25571 + }, + { + "epoch": 7.848987108655617, + "grad_norm": 0.15170596539974213, + "learning_rate": 1.1654148012200184e-05, + "loss": 1.7076, + "step": 25572 + }, + { + "epoch": 7.849294045426642, + "grad_norm": 0.2093864530324936, + "learning_rate": 1.1650958363514919e-05, + "loss": 1.7469, + "step": 25573 + }, + { + "epoch": 7.849600982197668, + "grad_norm": 0.15684813261032104, + "learning_rate": 1.1647769093817395e-05, + "loss": 1.6731, + "step": 25574 + }, + { + "epoch": 7.849907918968692, + "grad_norm": 0.1600468009710312, + "learning_rate": 1.1644580203139066e-05, + "loss": 1.6394, + "step": 25575 + }, + { + "epoch": 7.850214855739718, + "grad_norm": 0.1863955557346344, + 
"learning_rate": 1.1641391691511505e-05, + "loss": 1.7025, + "step": 25576 + }, + { + "epoch": 7.850521792510743, + "grad_norm": 0.189132422208786, + "learning_rate": 1.1638203558966166e-05, + "loss": 1.7095, + "step": 25577 + }, + { + "epoch": 7.850828729281768, + "grad_norm": 0.166460782289505, + "learning_rate": 1.1635015805534593e-05, + "loss": 1.6756, + "step": 25578 + }, + { + "epoch": 7.851135666052793, + "grad_norm": 0.15910424292087555, + "learning_rate": 1.1631828431248288e-05, + "loss": 1.6664, + "step": 25579 + }, + { + "epoch": 7.851442602823818, + "grad_norm": 0.14848501980304718, + "learning_rate": 1.1628641436138738e-05, + "loss": 1.6434, + "step": 25580 + }, + { + "epoch": 7.851749539594843, + "grad_norm": 0.1700928956270218, + "learning_rate": 1.1625454820237446e-05, + "loss": 1.7039, + "step": 25581 + }, + { + "epoch": 7.852056476365869, + "grad_norm": 0.17468976974487305, + "learning_rate": 1.1622268583575902e-05, + "loss": 1.7073, + "step": 25582 + }, + { + "epoch": 7.852363413136894, + "grad_norm": 0.18980912864208221, + "learning_rate": 1.1619082726185587e-05, + "loss": 1.6939, + "step": 25583 + }, + { + "epoch": 7.852670349907919, + "grad_norm": 0.1658385694026947, + "learning_rate": 1.1615897248098e-05, + "loss": 1.6892, + "step": 25584 + }, + { + "epoch": 7.852977286678944, + "grad_norm": 0.18137763440608978, + "learning_rate": 1.1612712149344612e-05, + "loss": 1.6608, + "step": 25585 + }, + { + "epoch": 7.853284223449969, + "grad_norm": 0.1642989218235016, + "learning_rate": 1.16095274299569e-05, + "loss": 1.6527, + "step": 25586 + }, + { + "epoch": 7.8535911602209945, + "grad_norm": 0.17476631700992584, + "learning_rate": 1.1606343089966343e-05, + "loss": 1.6622, + "step": 25587 + }, + { + "epoch": 7.85389809699202, + "grad_norm": 0.14995649456977844, + "learning_rate": 1.16031591294044e-05, + "loss": 1.6382, + "step": 25588 + }, + { + "epoch": 7.854205033763045, + "grad_norm": 0.16073103249073029, + "learning_rate": 
1.1599975548302549e-05, + "loss": 1.6888, + "step": 25589 + }, + { + "epoch": 7.85451197053407, + "grad_norm": 0.1630357801914215, + "learning_rate": 1.159679234669223e-05, + "loss": 1.6717, + "step": 25590 + }, + { + "epoch": 7.854818907305095, + "grad_norm": 0.1537420153617859, + "learning_rate": 1.1593609524604948e-05, + "loss": 1.6836, + "step": 25591 + }, + { + "epoch": 7.85512584407612, + "grad_norm": 0.16389401257038116, + "learning_rate": 1.1590427082072103e-05, + "loss": 1.6941, + "step": 25592 + }, + { + "epoch": 7.855432780847146, + "grad_norm": 0.24554979801177979, + "learning_rate": 1.1587245019125192e-05, + "loss": 1.8018, + "step": 25593 + }, + { + "epoch": 7.855739717618171, + "grad_norm": 0.15020978450775146, + "learning_rate": 1.1584063335795614e-05, + "loss": 1.6815, + "step": 25594 + }, + { + "epoch": 7.856046654389196, + "grad_norm": 0.1830887496471405, + "learning_rate": 1.1580882032114853e-05, + "loss": 1.7134, + "step": 25595 + }, + { + "epoch": 7.856353591160221, + "grad_norm": 0.2381841540336609, + "learning_rate": 1.157770110811433e-05, + "loss": 1.7505, + "step": 25596 + }, + { + "epoch": 7.856660527931246, + "grad_norm": 0.210253044962883, + "learning_rate": 1.1574520563825491e-05, + "loss": 1.8048, + "step": 25597 + }, + { + "epoch": 7.856967464702271, + "grad_norm": 0.15428896248340607, + "learning_rate": 1.1571340399279756e-05, + "loss": 1.6624, + "step": 25598 + }, + { + "epoch": 7.857274401473297, + "grad_norm": 0.2932582199573517, + "learning_rate": 1.1568160614508567e-05, + "loss": 1.7192, + "step": 25599 + }, + { + "epoch": 7.857581338244322, + "grad_norm": 0.19450223445892334, + "learning_rate": 1.156498120954333e-05, + "loss": 1.753, + "step": 25600 + }, + { + "epoch": 7.8578882750153465, + "grad_norm": 0.16950540244579315, + "learning_rate": 1.1561802184415482e-05, + "loss": 1.7107, + "step": 25601 + }, + { + "epoch": 7.858195211786372, + "grad_norm": 0.18616287410259247, + "learning_rate": 1.1558623539156433e-05, + "loss": 
1.6747, + "step": 25602 + }, + { + "epoch": 7.858502148557397, + "grad_norm": 0.20991890132427216, + "learning_rate": 1.1555445273797599e-05, + "loss": 1.6635, + "step": 25603 + }, + { + "epoch": 7.8588090853284225, + "grad_norm": 0.18592311441898346, + "learning_rate": 1.1552267388370386e-05, + "loss": 1.7327, + "step": 25604 + }, + { + "epoch": 7.859116022099448, + "grad_norm": 0.16478584706783295, + "learning_rate": 1.1549089882906206e-05, + "loss": 1.6523, + "step": 25605 + }, + { + "epoch": 7.859422958870473, + "grad_norm": 0.17281852662563324, + "learning_rate": 1.154591275743645e-05, + "loss": 1.7282, + "step": 25606 + }, + { + "epoch": 7.859729895641498, + "grad_norm": 0.17098689079284668, + "learning_rate": 1.1542736011992512e-05, + "loss": 1.7533, + "step": 25607 + }, + { + "epoch": 7.860036832412523, + "grad_norm": 0.1766287386417389, + "learning_rate": 1.1539559646605824e-05, + "loss": 1.6338, + "step": 25608 + }, + { + "epoch": 7.860343769183548, + "grad_norm": 0.15519756078720093, + "learning_rate": 1.1536383661307726e-05, + "loss": 1.6908, + "step": 25609 + }, + { + "epoch": 7.860650705954574, + "grad_norm": 0.18422503769397736, + "learning_rate": 1.1533208056129651e-05, + "loss": 1.6983, + "step": 25610 + }, + { + "epoch": 7.860957642725598, + "grad_norm": 0.1900123953819275, + "learning_rate": 1.1530032831102933e-05, + "loss": 1.7082, + "step": 25611 + }, + { + "epoch": 7.861264579496623, + "grad_norm": 0.15542784333229065, + "learning_rate": 1.1526857986259e-05, + "loss": 1.6979, + "step": 25612 + }, + { + "epoch": 7.861571516267649, + "grad_norm": 0.17173884809017181, + "learning_rate": 1.1523683521629197e-05, + "loss": 1.7329, + "step": 25613 + }, + { + "epoch": 7.861878453038674, + "grad_norm": 0.2399773746728897, + "learning_rate": 1.1520509437244908e-05, + "loss": 1.7224, + "step": 25614 + }, + { + "epoch": 7.862185389809699, + "grad_norm": 0.14101925492286682, + "learning_rate": 1.1517335733137502e-05, + "loss": 1.6676, + "step": 25615 + }, 
+ { + "epoch": 7.862492326580725, + "grad_norm": 0.18625333905220032, + "learning_rate": 1.1514162409338336e-05, + "loss": 1.7269, + "step": 25616 + }, + { + "epoch": 7.862799263351749, + "grad_norm": 0.18385125696659088, + "learning_rate": 1.1510989465878774e-05, + "loss": 1.7197, + "step": 25617 + }, + { + "epoch": 7.8631062001227745, + "grad_norm": 0.16189569234848022, + "learning_rate": 1.1507816902790176e-05, + "loss": 1.662, + "step": 25618 + }, + { + "epoch": 7.8634131368938, + "grad_norm": 0.18526791036128998, + "learning_rate": 1.1504644720103885e-05, + "loss": 1.7521, + "step": 25619 + }, + { + "epoch": 7.863720073664825, + "grad_norm": 0.16588367521762848, + "learning_rate": 1.1501472917851263e-05, + "loss": 1.7238, + "step": 25620 + }, + { + "epoch": 7.8640270104358505, + "grad_norm": 0.15427199006080627, + "learning_rate": 1.1498301496063652e-05, + "loss": 1.6566, + "step": 25621 + }, + { + "epoch": 7.864333947206875, + "grad_norm": 0.1694655865430832, + "learning_rate": 1.149513045477239e-05, + "loss": 1.7446, + "step": 25622 + }, + { + "epoch": 7.8646408839779, + "grad_norm": 0.18305882811546326, + "learning_rate": 1.1491959794008823e-05, + "loss": 1.7093, + "step": 25623 + }, + { + "epoch": 7.864947820748926, + "grad_norm": 0.15975148975849152, + "learning_rate": 1.148878951380426e-05, + "loss": 1.6911, + "step": 25624 + }, + { + "epoch": 7.865254757519951, + "grad_norm": 0.18298782408237457, + "learning_rate": 1.148561961419008e-05, + "loss": 1.7188, + "step": 25625 + }, + { + "epoch": 7.865561694290976, + "grad_norm": 0.16258102655410767, + "learning_rate": 1.148245009519755e-05, + "loss": 1.6901, + "step": 25626 + }, + { + "epoch": 7.865868631062002, + "grad_norm": 0.19591568410396576, + "learning_rate": 1.1479280956858057e-05, + "loss": 1.7521, + "step": 25627 + }, + { + "epoch": 7.866175567833026, + "grad_norm": 0.15821373462677002, + "learning_rate": 1.1476112199202853e-05, + "loss": 1.6503, + "step": 25628 + }, + { + "epoch": 
7.866482504604051, + "grad_norm": 0.1531122773885727, + "learning_rate": 1.147294382226331e-05, + "loss": 1.6802, + "step": 25629 + }, + { + "epoch": 7.866789441375077, + "grad_norm": 0.2105177342891693, + "learning_rate": 1.1469775826070711e-05, + "loss": 1.7705, + "step": 25630 + }, + { + "epoch": 7.867096378146102, + "grad_norm": 0.22782234847545624, + "learning_rate": 1.1466608210656377e-05, + "loss": 1.6813, + "step": 25631 + }, + { + "epoch": 7.867403314917127, + "grad_norm": 0.1824047863483429, + "learning_rate": 1.1463440976051598e-05, + "loss": 1.7149, + "step": 25632 + }, + { + "epoch": 7.867710251688152, + "grad_norm": 0.19195812940597534, + "learning_rate": 1.1460274122287685e-05, + "loss": 1.6912, + "step": 25633 + }, + { + "epoch": 7.868017188459177, + "grad_norm": 0.22274719178676605, + "learning_rate": 1.1457107649395937e-05, + "loss": 1.8499, + "step": 25634 + }, + { + "epoch": 7.8683241252302025, + "grad_norm": 0.21217535436153412, + "learning_rate": 1.1453941557407638e-05, + "loss": 1.7345, + "step": 25635 + }, + { + "epoch": 7.868631062001228, + "grad_norm": 0.20042434334754944, + "learning_rate": 1.1450775846354078e-05, + "loss": 1.6902, + "step": 25636 + }, + { + "epoch": 7.868937998772253, + "grad_norm": 0.17045147716999054, + "learning_rate": 1.1447610516266548e-05, + "loss": 1.6641, + "step": 25637 + }, + { + "epoch": 7.8692449355432785, + "grad_norm": 0.18817269802093506, + "learning_rate": 1.1444445567176326e-05, + "loss": 1.7063, + "step": 25638 + }, + { + "epoch": 7.869551872314303, + "grad_norm": 0.1746743619441986, + "learning_rate": 1.1441280999114694e-05, + "loss": 1.6838, + "step": 25639 + }, + { + "epoch": 7.869858809085328, + "grad_norm": 0.1734321415424347, + "learning_rate": 1.1438116812112925e-05, + "loss": 1.6939, + "step": 25640 + }, + { + "epoch": 7.870165745856354, + "grad_norm": 0.1745334416627884, + "learning_rate": 1.1434953006202281e-05, + "loss": 1.71, + "step": 25641 + }, + { + "epoch": 7.870472682627379, + 
"grad_norm": 0.20883594453334808, + "learning_rate": 1.1431789581414043e-05, + "loss": 1.6941, + "step": 25642 + }, + { + "epoch": 7.870779619398404, + "grad_norm": 0.1664251685142517, + "learning_rate": 1.1428626537779447e-05, + "loss": 1.6995, + "step": 25643 + }, + { + "epoch": 7.871086556169429, + "grad_norm": 0.16561046242713928, + "learning_rate": 1.1425463875329795e-05, + "loss": 1.7093, + "step": 25644 + }, + { + "epoch": 7.871393492940454, + "grad_norm": 0.21409009397029877, + "learning_rate": 1.1422301594096297e-05, + "loss": 1.6919, + "step": 25645 + }, + { + "epoch": 7.871700429711479, + "grad_norm": 0.19574479758739471, + "learning_rate": 1.1419139694110236e-05, + "loss": 1.777, + "step": 25646 + }, + { + "epoch": 7.872007366482505, + "grad_norm": 0.15032227337360382, + "learning_rate": 1.1415978175402853e-05, + "loss": 1.6759, + "step": 25647 + }, + { + "epoch": 7.87231430325353, + "grad_norm": 0.18372420966625214, + "learning_rate": 1.1412817038005386e-05, + "loss": 1.7304, + "step": 25648 + }, + { + "epoch": 7.872621240024555, + "grad_norm": 0.16073383390903473, + "learning_rate": 1.1409656281949077e-05, + "loss": 1.6784, + "step": 25649 + }, + { + "epoch": 7.87292817679558, + "grad_norm": 0.15698374807834625, + "learning_rate": 1.1406495907265163e-05, + "loss": 1.6877, + "step": 25650 + }, + { + "epoch": 7.873235113566605, + "grad_norm": 0.18749327957630157, + "learning_rate": 1.140333591398488e-05, + "loss": 1.708, + "step": 25651 + }, + { + "epoch": 7.8735420503376305, + "grad_norm": 0.15412451326847076, + "learning_rate": 1.1400176302139448e-05, + "loss": 1.6661, + "step": 25652 + }, + { + "epoch": 7.873848987108656, + "grad_norm": 0.22467148303985596, + "learning_rate": 1.1397017071760102e-05, + "loss": 1.8204, + "step": 25653 + }, + { + "epoch": 7.87415592387968, + "grad_norm": 0.14625288546085358, + "learning_rate": 1.1393858222878063e-05, + "loss": 1.7008, + "step": 25654 + }, + { + "epoch": 7.874462860650706, + "grad_norm": 
0.14440159499645233, + "learning_rate": 1.1390699755524537e-05, + "loss": 1.652, + "step": 25655 + }, + { + "epoch": 7.874769797421731, + "grad_norm": 0.14738808572292328, + "learning_rate": 1.138754166973075e-05, + "loss": 1.6305, + "step": 25656 + }, + { + "epoch": 7.875076734192756, + "grad_norm": 0.17714212834835052, + "learning_rate": 1.1384383965527906e-05, + "loss": 1.7011, + "step": 25657 + }, + { + "epoch": 7.875383670963782, + "grad_norm": 0.17601121962070465, + "learning_rate": 1.1381226642947213e-05, + "loss": 1.7425, + "step": 25658 + }, + { + "epoch": 7.875690607734807, + "grad_norm": 0.1893182396888733, + "learning_rate": 1.1378069702019877e-05, + "loss": 1.7215, + "step": 25659 + }, + { + "epoch": 7.8759975445058314, + "grad_norm": 0.20073552429676056, + "learning_rate": 1.1374913142777077e-05, + "loss": 1.7025, + "step": 25660 + }, + { + "epoch": 7.876304481276857, + "grad_norm": 0.17025165259838104, + "learning_rate": 1.1371756965250052e-05, + "loss": 1.7046, + "step": 25661 + }, + { + "epoch": 7.876611418047882, + "grad_norm": 0.17612501978874207, + "learning_rate": 1.1368601169469933e-05, + "loss": 1.7452, + "step": 25662 + }, + { + "epoch": 7.8769183548189075, + "grad_norm": 0.2542072534561157, + "learning_rate": 1.1365445755467974e-05, + "loss": 1.765, + "step": 25663 + }, + { + "epoch": 7.877225291589933, + "grad_norm": 0.25291866064071655, + "learning_rate": 1.1362290723275293e-05, + "loss": 1.7477, + "step": 25664 + }, + { + "epoch": 7.877532228360957, + "grad_norm": 0.1848495602607727, + "learning_rate": 1.1359136072923121e-05, + "loss": 1.7278, + "step": 25665 + }, + { + "epoch": 7.877839165131983, + "grad_norm": 0.18354780972003937, + "learning_rate": 1.1355981804442605e-05, + "loss": 1.7469, + "step": 25666 + }, + { + "epoch": 7.878146101903008, + "grad_norm": 0.1843772530555725, + "learning_rate": 1.1352827917864934e-05, + "loss": 1.7654, + "step": 25667 + }, + { + "epoch": 7.878453038674033, + "grad_norm": 0.144758403301239, + 
"learning_rate": 1.1349674413221267e-05, + "loss": 1.6649, + "step": 25668 + }, + { + "epoch": 7.878759975445059, + "grad_norm": 0.15747511386871338, + "learning_rate": 1.1346521290542772e-05, + "loss": 1.6386, + "step": 25669 + }, + { + "epoch": 7.879066912216084, + "grad_norm": 0.17898736894130707, + "learning_rate": 1.134336854986061e-05, + "loss": 1.7, + "step": 25670 + }, + { + "epoch": 7.879373848987108, + "grad_norm": 0.19453589618206024, + "learning_rate": 1.1340216191205939e-05, + "loss": 1.7108, + "step": 25671 + }, + { + "epoch": 7.879680785758134, + "grad_norm": 0.17470498383045197, + "learning_rate": 1.1337064214609905e-05, + "loss": 1.7705, + "step": 25672 + }, + { + "epoch": 7.879987722529159, + "grad_norm": 0.1897793561220169, + "learning_rate": 1.1333912620103665e-05, + "loss": 1.7358, + "step": 25673 + }, + { + "epoch": 7.880294659300184, + "grad_norm": 0.1659744381904602, + "learning_rate": 1.1330761407718366e-05, + "loss": 1.724, + "step": 25674 + }, + { + "epoch": 7.88060159607121, + "grad_norm": 0.15303891897201538, + "learning_rate": 1.1327610577485148e-05, + "loss": 1.6878, + "step": 25675 + }, + { + "epoch": 7.880908532842234, + "grad_norm": 0.16346490383148193, + "learning_rate": 1.1324460129435144e-05, + "loss": 1.6544, + "step": 25676 + }, + { + "epoch": 7.8812154696132595, + "grad_norm": 0.19887791574001312, + "learning_rate": 1.1321310063599483e-05, + "loss": 1.7169, + "step": 25677 + }, + { + "epoch": 7.881522406384285, + "grad_norm": 0.1658533811569214, + "learning_rate": 1.1318160380009334e-05, + "loss": 1.6902, + "step": 25678 + }, + { + "epoch": 7.88182934315531, + "grad_norm": 0.16859948635101318, + "learning_rate": 1.131501107869577e-05, + "loss": 1.7015, + "step": 25679 + }, + { + "epoch": 7.8821362799263355, + "grad_norm": 0.20775821805000305, + "learning_rate": 1.1311862159689968e-05, + "loss": 1.7519, + "step": 25680 + }, + { + "epoch": 7.882443216697361, + "grad_norm": 0.18174295127391815, + "learning_rate": 
1.1308713623022987e-05, + "loss": 1.7161, + "step": 25681 + }, + { + "epoch": 7.882750153468385, + "grad_norm": 0.1843954473733902, + "learning_rate": 1.1305565468725993e-05, + "loss": 1.6753, + "step": 25682 + }, + { + "epoch": 7.883057090239411, + "grad_norm": 0.1856461614370346, + "learning_rate": 1.130241769683008e-05, + "loss": 1.7139, + "step": 25683 + }, + { + "epoch": 7.883364027010436, + "grad_norm": 0.15803632140159607, + "learning_rate": 1.129927030736636e-05, + "loss": 1.6705, + "step": 25684 + }, + { + "epoch": 7.883670963781461, + "grad_norm": 0.1680101901292801, + "learning_rate": 1.1296123300365947e-05, + "loss": 1.6757, + "step": 25685 + }, + { + "epoch": 7.883977900552486, + "grad_norm": 0.157195046544075, + "learning_rate": 1.1292976675859895e-05, + "loss": 1.6922, + "step": 25686 + }, + { + "epoch": 7.884284837323511, + "grad_norm": 0.17270046472549438, + "learning_rate": 1.1289830433879356e-05, + "loss": 1.6909, + "step": 25687 + }, + { + "epoch": 7.884591774094536, + "grad_norm": 0.1880030781030655, + "learning_rate": 1.1286684574455398e-05, + "loss": 1.7139, + "step": 25688 + }, + { + "epoch": 7.884898710865562, + "grad_norm": 0.1882653832435608, + "learning_rate": 1.1283539097619112e-05, + "loss": 1.7464, + "step": 25689 + }, + { + "epoch": 7.885205647636587, + "grad_norm": 0.2060890644788742, + "learning_rate": 1.128039400340159e-05, + "loss": 1.6749, + "step": 25690 + }, + { + "epoch": 7.885512584407612, + "grad_norm": 0.20780493319034576, + "learning_rate": 1.1277249291833903e-05, + "loss": 1.7581, + "step": 25691 + }, + { + "epoch": 7.885819521178637, + "grad_norm": 0.1929686814546585, + "learning_rate": 1.1274104962947135e-05, + "loss": 1.6962, + "step": 25692 + }, + { + "epoch": 7.886126457949662, + "grad_norm": 0.21474432945251465, + "learning_rate": 1.1270961016772363e-05, + "loss": 1.6984, + "step": 25693 + }, + { + "epoch": 7.8864333947206875, + "grad_norm": 0.17453257739543915, + "learning_rate": 1.126781745334064e-05, + "loss": 
1.679, + "step": 25694 + }, + { + "epoch": 7.886740331491713, + "grad_norm": 0.21506772935390472, + "learning_rate": 1.1264674272683073e-05, + "loss": 1.7209, + "step": 25695 + }, + { + "epoch": 7.887047268262738, + "grad_norm": 0.2470129430294037, + "learning_rate": 1.1261531474830672e-05, + "loss": 1.7183, + "step": 25696 + }, + { + "epoch": 7.887354205033763, + "grad_norm": 0.2026570737361908, + "learning_rate": 1.1258389059814545e-05, + "loss": 1.6579, + "step": 25697 + }, + { + "epoch": 7.887661141804788, + "grad_norm": 0.18859948217868805, + "learning_rate": 1.1255247027665699e-05, + "loss": 1.6831, + "step": 25698 + }, + { + "epoch": 7.887968078575813, + "grad_norm": 0.2106257677078247, + "learning_rate": 1.1252105378415229e-05, + "loss": 1.724, + "step": 25699 + }, + { + "epoch": 7.888275015346839, + "grad_norm": 0.17260697484016418, + "learning_rate": 1.1248964112094162e-05, + "loss": 1.6875, + "step": 25700 + }, + { + "epoch": 7.888581952117864, + "grad_norm": 0.20596550405025482, + "learning_rate": 1.1245823228733542e-05, + "loss": 1.7569, + "step": 25701 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 0.1724967509508133, + "learning_rate": 1.1242682728364428e-05, + "loss": 1.7063, + "step": 25702 + }, + { + "epoch": 7.889195825659914, + "grad_norm": 0.2189379185438156, + "learning_rate": 1.123954261101781e-05, + "loss": 1.789, + "step": 25703 + }, + { + "epoch": 7.889502762430939, + "grad_norm": 0.1539442539215088, + "learning_rate": 1.1236402876724766e-05, + "loss": 1.6573, + "step": 25704 + }, + { + "epoch": 7.889809699201964, + "grad_norm": 0.2854970693588257, + "learning_rate": 1.1233263525516313e-05, + "loss": 1.7683, + "step": 25705 + }, + { + "epoch": 7.89011663597299, + "grad_norm": 0.18263237178325653, + "learning_rate": 1.1230124557423465e-05, + "loss": 1.6911, + "step": 25706 + }, + { + "epoch": 7.890423572744015, + "grad_norm": 0.2098342627286911, + "learning_rate": 1.122698597247725e-05, + "loss": 1.7306, + "step": 25707 + }, + { + 
"epoch": 7.8907305095150395, + "grad_norm": 0.20822781324386597, + "learning_rate": 1.122384777070869e-05, + "loss": 1.7777, + "step": 25708 + }, + { + "epoch": 7.891037446286065, + "grad_norm": 0.24466483294963837, + "learning_rate": 1.122070995214879e-05, + "loss": 1.6966, + "step": 25709 + }, + { + "epoch": 7.89134438305709, + "grad_norm": 0.1500372439622879, + "learning_rate": 1.1217572516828561e-05, + "loss": 1.6787, + "step": 25710 + }, + { + "epoch": 7.8916513198281155, + "grad_norm": 0.2238166481256485, + "learning_rate": 1.1214435464779006e-05, + "loss": 1.7957, + "step": 25711 + }, + { + "epoch": 7.891958256599141, + "grad_norm": 0.22993433475494385, + "learning_rate": 1.1211298796031156e-05, + "loss": 1.7142, + "step": 25712 + }, + { + "epoch": 7.892265193370166, + "grad_norm": 0.15912945568561554, + "learning_rate": 1.1208162510615955e-05, + "loss": 1.7188, + "step": 25713 + }, + { + "epoch": 7.892572130141191, + "grad_norm": 0.2096986174583435, + "learning_rate": 1.1205026608564461e-05, + "loss": 1.7409, + "step": 25714 + }, + { + "epoch": 7.892879066912216, + "grad_norm": 0.18928684294223785, + "learning_rate": 1.1201891089907601e-05, + "loss": 1.6703, + "step": 25715 + }, + { + "epoch": 7.893186003683241, + "grad_norm": 0.19096077978610992, + "learning_rate": 1.119875595467641e-05, + "loss": 1.7393, + "step": 25716 + }, + { + "epoch": 7.893492940454267, + "grad_norm": 0.2286420315504074, + "learning_rate": 1.1195621202901851e-05, + "loss": 1.6995, + "step": 25717 + }, + { + "epoch": 7.893799877225292, + "grad_norm": 0.16288414597511292, + "learning_rate": 1.1192486834614912e-05, + "loss": 1.7334, + "step": 25718 + }, + { + "epoch": 7.894106813996316, + "grad_norm": 0.17358547449111938, + "learning_rate": 1.118935284984658e-05, + "loss": 1.7114, + "step": 25719 + }, + { + "epoch": 7.894413750767342, + "grad_norm": 0.16833151876926422, + "learning_rate": 1.1186219248627777e-05, + "loss": 1.6998, + "step": 25720 + }, + { + "epoch": 7.894720687538367, + 
"grad_norm": 0.14409767091274261, + "learning_rate": 1.118308603098952e-05, + "loss": 1.713, + "step": 25721 + }, + { + "epoch": 7.895027624309392, + "grad_norm": 0.18832024931907654, + "learning_rate": 1.1179953196962761e-05, + "loss": 1.6862, + "step": 25722 + }, + { + "epoch": 7.895334561080418, + "grad_norm": 0.1837761402130127, + "learning_rate": 1.1176820746578454e-05, + "loss": 1.6674, + "step": 25723 + }, + { + "epoch": 7.895641497851443, + "grad_norm": 0.14717474579811096, + "learning_rate": 1.1173688679867561e-05, + "loss": 1.6619, + "step": 25724 + }, + { + "epoch": 7.8959484346224675, + "grad_norm": 0.13512545824050903, + "learning_rate": 1.1170556996861032e-05, + "loss": 1.664, + "step": 25725 + }, + { + "epoch": 7.896255371393493, + "grad_norm": 0.21533837914466858, + "learning_rate": 1.1167425697589817e-05, + "loss": 1.7205, + "step": 25726 + }, + { + "epoch": 7.896562308164518, + "grad_norm": 0.15241803228855133, + "learning_rate": 1.1164294782084866e-05, + "loss": 1.6838, + "step": 25727 + }, + { + "epoch": 7.8968692449355435, + "grad_norm": 0.14889933168888092, + "learning_rate": 1.1161164250377099e-05, + "loss": 1.7197, + "step": 25728 + }, + { + "epoch": 7.897176181706568, + "grad_norm": 0.15948614478111267, + "learning_rate": 1.11580341024975e-05, + "loss": 1.6948, + "step": 25729 + }, + { + "epoch": 7.897483118477593, + "grad_norm": 0.17862235009670258, + "learning_rate": 1.1154904338476946e-05, + "loss": 1.743, + "step": 25730 + }, + { + "epoch": 7.897790055248619, + "grad_norm": 0.18168844282627106, + "learning_rate": 1.1151774958346422e-05, + "loss": 1.7291, + "step": 25731 + }, + { + "epoch": 7.898096992019644, + "grad_norm": 0.17636772990226746, + "learning_rate": 1.11486459621368e-05, + "loss": 1.7428, + "step": 25732 + }, + { + "epoch": 7.898403928790669, + "grad_norm": 0.1677904576063156, + "learning_rate": 1.1145517349879048e-05, + "loss": 1.7026, + "step": 25733 + }, + { + "epoch": 7.898710865561695, + "grad_norm": 
0.1851150244474411, + "learning_rate": 1.1142389121604063e-05, + "loss": 1.7743, + "step": 25734 + }, + { + "epoch": 7.899017802332719, + "grad_norm": 0.19713786244392395, + "learning_rate": 1.1139261277342767e-05, + "loss": 1.7287, + "step": 25735 + }, + { + "epoch": 7.899324739103744, + "grad_norm": 0.2060006707906723, + "learning_rate": 1.1136133817126076e-05, + "loss": 1.7377, + "step": 25736 + }, + { + "epoch": 7.89963167587477, + "grad_norm": 0.18026013672351837, + "learning_rate": 1.1133006740984864e-05, + "loss": 1.7322, + "step": 25737 + }, + { + "epoch": 7.899938612645795, + "grad_norm": 0.1787644922733307, + "learning_rate": 1.1129880048950075e-05, + "loss": 1.7457, + "step": 25738 + }, + { + "epoch": 7.9002455494168204, + "grad_norm": 0.16092467308044434, + "learning_rate": 1.1126753741052593e-05, + "loss": 1.7451, + "step": 25739 + }, + { + "epoch": 7.900552486187845, + "grad_norm": 0.15322941541671753, + "learning_rate": 1.1123627817323318e-05, + "loss": 1.667, + "step": 25740 + }, + { + "epoch": 7.90085942295887, + "grad_norm": 0.1488087922334671, + "learning_rate": 1.1120502277793137e-05, + "loss": 1.684, + "step": 25741 + }, + { + "epoch": 7.901166359729896, + "grad_norm": 0.15332907438278198, + "learning_rate": 1.111737712249294e-05, + "loss": 1.6646, + "step": 25742 + }, + { + "epoch": 7.901473296500921, + "grad_norm": 0.19801980257034302, + "learning_rate": 1.1114252351453614e-05, + "loss": 1.7469, + "step": 25743 + }, + { + "epoch": 7.901780233271946, + "grad_norm": 0.17123407125473022, + "learning_rate": 1.1111127964706035e-05, + "loss": 1.7319, + "step": 25744 + }, + { + "epoch": 7.902087170042972, + "grad_norm": 0.1753319650888443, + "learning_rate": 1.1108003962281066e-05, + "loss": 1.7212, + "step": 25745 + }, + { + "epoch": 7.902394106813996, + "grad_norm": 0.1598043441772461, + "learning_rate": 1.1104880344209634e-05, + "loss": 1.6823, + "step": 25746 + }, + { + "epoch": 7.902701043585021, + "grad_norm": 0.14227038621902466, + 
"learning_rate": 1.1101757110522538e-05, + "loss": 1.6665, + "step": 25747 + }, + { + "epoch": 7.903007980356047, + "grad_norm": 0.1531791388988495, + "learning_rate": 1.1098634261250706e-05, + "loss": 1.717, + "step": 25748 + }, + { + "epoch": 7.903314917127072, + "grad_norm": 0.18077540397644043, + "learning_rate": 1.109551179642494e-05, + "loss": 1.7237, + "step": 25749 + }, + { + "epoch": 7.903621853898097, + "grad_norm": 0.22373250126838684, + "learning_rate": 1.1092389716076145e-05, + "loss": 1.7678, + "step": 25750 + }, + { + "epoch": 7.903928790669122, + "grad_norm": 0.16022193431854248, + "learning_rate": 1.1089268020235166e-05, + "loss": 1.6985, + "step": 25751 + }, + { + "epoch": 7.904235727440147, + "grad_norm": 0.17306078970432281, + "learning_rate": 1.1086146708932837e-05, + "loss": 1.6653, + "step": 25752 + }, + { + "epoch": 7.9045426642111725, + "grad_norm": 0.16284874081611633, + "learning_rate": 1.1083025782200035e-05, + "loss": 1.6762, + "step": 25753 + }, + { + "epoch": 7.904849600982198, + "grad_norm": 0.17309556901454926, + "learning_rate": 1.107990524006755e-05, + "loss": 1.7103, + "step": 25754 + }, + { + "epoch": 7.905156537753223, + "grad_norm": 0.1508374810218811, + "learning_rate": 1.107678508256627e-05, + "loss": 1.6932, + "step": 25755 + }, + { + "epoch": 7.9054634745242485, + "grad_norm": 0.1941400021314621, + "learning_rate": 1.1073665309727016e-05, + "loss": 1.7922, + "step": 25756 + }, + { + "epoch": 7.905770411295273, + "grad_norm": 0.1890190988779068, + "learning_rate": 1.107054592158061e-05, + "loss": 1.6765, + "step": 25757 + }, + { + "epoch": 7.906077348066298, + "grad_norm": 0.19425363838672638, + "learning_rate": 1.1067426918157892e-05, + "loss": 1.7284, + "step": 25758 + }, + { + "epoch": 7.906384284837324, + "grad_norm": 0.18147888779640198, + "learning_rate": 1.1064308299489678e-05, + "loss": 1.7099, + "step": 25759 + }, + { + "epoch": 7.906691221608349, + "grad_norm": 0.19644278287887573, + "learning_rate": 
1.106119006560679e-05, + "loss": 1.7691, + "step": 25760 + }, + { + "epoch": 7.906998158379373, + "grad_norm": 0.14809735119342804, + "learning_rate": 1.1058072216540045e-05, + "loss": 1.6735, + "step": 25761 + }, + { + "epoch": 7.907305095150399, + "grad_norm": 0.17835088074207306, + "learning_rate": 1.105495475232024e-05, + "loss": 1.6928, + "step": 25762 + }, + { + "epoch": 7.907612031921424, + "grad_norm": 0.18341144919395447, + "learning_rate": 1.1051837672978227e-05, + "loss": 1.7393, + "step": 25763 + }, + { + "epoch": 7.907918968692449, + "grad_norm": 0.2026391327381134, + "learning_rate": 1.1048720978544753e-05, + "loss": 1.7037, + "step": 25764 + }, + { + "epoch": 7.908225905463475, + "grad_norm": 0.19855152070522308, + "learning_rate": 1.104560466905068e-05, + "loss": 1.7341, + "step": 25765 + }, + { + "epoch": 7.9085328422345, + "grad_norm": 0.18974080681800842, + "learning_rate": 1.1042488744526741e-05, + "loss": 1.6717, + "step": 25766 + }, + { + "epoch": 7.9088397790055245, + "grad_norm": 0.1727920025587082, + "learning_rate": 1.1039373205003784e-05, + "loss": 1.6994, + "step": 25767 + }, + { + "epoch": 7.90914671577655, + "grad_norm": 0.20549818873405457, + "learning_rate": 1.1036258050512566e-05, + "loss": 1.7055, + "step": 25768 + }, + { + "epoch": 7.909453652547575, + "grad_norm": 0.15696507692337036, + "learning_rate": 1.1033143281083891e-05, + "loss": 1.678, + "step": 25769 + }, + { + "epoch": 7.9097605893186005, + "grad_norm": 0.1568988859653473, + "learning_rate": 1.1030028896748546e-05, + "loss": 1.6855, + "step": 25770 + }, + { + "epoch": 7.910067526089626, + "grad_norm": 0.17795592546463013, + "learning_rate": 1.1026914897537266e-05, + "loss": 1.7306, + "step": 25771 + }, + { + "epoch": 7.91037446286065, + "grad_norm": 0.19906511902809143, + "learning_rate": 1.1023801283480872e-05, + "loss": 1.7125, + "step": 25772 + }, + { + "epoch": 7.910681399631676, + "grad_norm": 0.16972185671329498, + "learning_rate": 1.1020688054610118e-05, + 
"loss": 1.714, + "step": 25773 + }, + { + "epoch": 7.910988336402701, + "grad_norm": 0.20585502684116364, + "learning_rate": 1.1017575210955772e-05, + "loss": 1.7342, + "step": 25774 + }, + { + "epoch": 7.911295273173726, + "grad_norm": 0.1772177368402481, + "learning_rate": 1.1014462752548592e-05, + "loss": 1.7091, + "step": 25775 + }, + { + "epoch": 7.911602209944752, + "grad_norm": 0.1818380057811737, + "learning_rate": 1.1011350679419341e-05, + "loss": 1.7131, + "step": 25776 + }, + { + "epoch": 7.911909146715777, + "grad_norm": 0.17451459169387817, + "learning_rate": 1.1008238991598779e-05, + "loss": 1.6633, + "step": 25777 + }, + { + "epoch": 7.912216083486801, + "grad_norm": 0.18837687373161316, + "learning_rate": 1.100512768911765e-05, + "loss": 1.7132, + "step": 25778 + }, + { + "epoch": 7.912523020257827, + "grad_norm": 0.15283817052841187, + "learning_rate": 1.1002016772006695e-05, + "loss": 1.6833, + "step": 25779 + }, + { + "epoch": 7.912829957028852, + "grad_norm": 0.15264299511909485, + "learning_rate": 1.0998906240296692e-05, + "loss": 1.7098, + "step": 25780 + }, + { + "epoch": 7.913136893799877, + "grad_norm": 0.18866822123527527, + "learning_rate": 1.099579609401833e-05, + "loss": 1.7173, + "step": 25781 + }, + { + "epoch": 7.913443830570903, + "grad_norm": 0.19261083006858826, + "learning_rate": 1.0992686333202401e-05, + "loss": 1.7269, + "step": 25782 + }, + { + "epoch": 7.913750767341927, + "grad_norm": 0.19681799411773682, + "learning_rate": 1.0989576957879577e-05, + "loss": 1.6594, + "step": 25783 + }, + { + "epoch": 7.9140577041129525, + "grad_norm": 0.21298938989639282, + "learning_rate": 1.0986467968080639e-05, + "loss": 1.8509, + "step": 25784 + }, + { + "epoch": 7.914364640883978, + "grad_norm": 0.17769277095794678, + "learning_rate": 1.0983359363836287e-05, + "loss": 1.7177, + "step": 25785 + }, + { + "epoch": 7.914671577655003, + "grad_norm": 0.19831274449825287, + "learning_rate": 1.0980251145177246e-05, + "loss": 1.7107, + "step": 
25786 + }, + { + "epoch": 7.9149785144260285, + "grad_norm": 0.16204139590263367, + "learning_rate": 1.0977143312134248e-05, + "loss": 1.7052, + "step": 25787 + }, + { + "epoch": 7.915285451197054, + "grad_norm": 0.1709459275007248, + "learning_rate": 1.0974035864737958e-05, + "loss": 1.6944, + "step": 25788 + }, + { + "epoch": 7.915592387968078, + "grad_norm": 0.17710284888744354, + "learning_rate": 1.0970928803019142e-05, + "loss": 1.7253, + "step": 25789 + }, + { + "epoch": 7.915899324739104, + "grad_norm": 0.17316623032093048, + "learning_rate": 1.0967822127008481e-05, + "loss": 1.6458, + "step": 25790 + }, + { + "epoch": 7.916206261510129, + "grad_norm": 0.15644441545009613, + "learning_rate": 1.0964715836736677e-05, + "loss": 1.6749, + "step": 25791 + }, + { + "epoch": 7.916513198281154, + "grad_norm": 0.1425870954990387, + "learning_rate": 1.096160993223443e-05, + "loss": 1.7283, + "step": 25792 + }, + { + "epoch": 7.91682013505218, + "grad_norm": 0.1724596619606018, + "learning_rate": 1.0958504413532438e-05, + "loss": 1.7152, + "step": 25793 + }, + { + "epoch": 7.917127071823204, + "grad_norm": 0.20472319424152374, + "learning_rate": 1.0955399280661383e-05, + "loss": 1.7818, + "step": 25794 + }, + { + "epoch": 7.917434008594229, + "grad_norm": 0.18012158572673798, + "learning_rate": 1.0952294533651963e-05, + "loss": 1.6995, + "step": 25795 + }, + { + "epoch": 7.917740945365255, + "grad_norm": 0.1460564136505127, + "learning_rate": 1.0949190172534851e-05, + "loss": 1.6752, + "step": 25796 + }, + { + "epoch": 7.91804788213628, + "grad_norm": 0.16467545926570892, + "learning_rate": 1.0946086197340733e-05, + "loss": 1.7, + "step": 25797 + }, + { + "epoch": 7.918354818907305, + "grad_norm": 0.20123273134231567, + "learning_rate": 1.0942982608100266e-05, + "loss": 1.7423, + "step": 25798 + }, + { + "epoch": 7.918661755678331, + "grad_norm": 0.160671204328537, + "learning_rate": 1.0939879404844167e-05, + "loss": 1.6992, + "step": 25799 + }, + { + "epoch": 
7.918968692449355, + "grad_norm": 0.18679293990135193, + "learning_rate": 1.0936776587603043e-05, + "loss": 1.7789, + "step": 25800 + }, + { + "epoch": 7.9192756292203805, + "grad_norm": 0.1598452925682068, + "learning_rate": 1.0933674156407602e-05, + "loss": 1.6961, + "step": 25801 + }, + { + "epoch": 7.919582565991406, + "grad_norm": 0.13918142020702362, + "learning_rate": 1.0930572111288506e-05, + "loss": 1.6727, + "step": 25802 + }, + { + "epoch": 7.919889502762431, + "grad_norm": 0.16652320325374603, + "learning_rate": 1.0927470452276367e-05, + "loss": 1.7135, + "step": 25803 + }, + { + "epoch": 7.920196439533456, + "grad_norm": 0.1637706309556961, + "learning_rate": 1.0924369179401893e-05, + "loss": 1.7078, + "step": 25804 + }, + { + "epoch": 7.920503376304481, + "grad_norm": 0.19709086418151855, + "learning_rate": 1.092126829269568e-05, + "loss": 1.7425, + "step": 25805 + }, + { + "epoch": 7.920810313075506, + "grad_norm": 0.13402192294597626, + "learning_rate": 1.091816779218841e-05, + "loss": 1.663, + "step": 25806 + }, + { + "epoch": 7.921117249846532, + "grad_norm": 0.18932323157787323, + "learning_rate": 1.0915067677910718e-05, + "loss": 1.7651, + "step": 25807 + }, + { + "epoch": 7.921424186617557, + "grad_norm": 0.1586374193429947, + "learning_rate": 1.0911967949893231e-05, + "loss": 1.6709, + "step": 25808 + }, + { + "epoch": 7.921731123388582, + "grad_norm": 0.1570933312177658, + "learning_rate": 1.0908868608166589e-05, + "loss": 1.7166, + "step": 25809 + }, + { + "epoch": 7.922038060159607, + "grad_norm": 0.19786952435970306, + "learning_rate": 1.0905769652761416e-05, + "loss": 1.7347, + "step": 25810 + }, + { + "epoch": 7.922344996930632, + "grad_norm": 0.14969857037067413, + "learning_rate": 1.0902671083708343e-05, + "loss": 1.6471, + "step": 25811 + }, + { + "epoch": 7.922651933701657, + "grad_norm": 0.17460933327674866, + "learning_rate": 1.089957290103799e-05, + "loss": 1.7594, + "step": 25812 + }, + { + "epoch": 7.922958870472683, + 
"grad_norm": 0.17380566895008087, + "learning_rate": 1.0896475104780974e-05, + "loss": 1.6721, + "step": 25813 + }, + { + "epoch": 7.923265807243708, + "grad_norm": 0.1599249392747879, + "learning_rate": 1.0893377694967916e-05, + "loss": 1.6842, + "step": 25814 + }, + { + "epoch": 7.9235727440147325, + "grad_norm": 0.15319927036762238, + "learning_rate": 1.0890280671629398e-05, + "loss": 1.6529, + "step": 25815 + }, + { + "epoch": 7.923879680785758, + "grad_norm": 0.20122043788433075, + "learning_rate": 1.0887184034796082e-05, + "loss": 1.8009, + "step": 25816 + }, + { + "epoch": 7.924186617556783, + "grad_norm": 0.1726430058479309, + "learning_rate": 1.0884087784498515e-05, + "loss": 1.7595, + "step": 25817 + }, + { + "epoch": 7.9244935543278086, + "grad_norm": 0.1657346487045288, + "learning_rate": 1.0880991920767336e-05, + "loss": 1.7051, + "step": 25818 + }, + { + "epoch": 7.924800491098834, + "grad_norm": 0.19500960409641266, + "learning_rate": 1.0877896443633117e-05, + "loss": 1.6809, + "step": 25819 + }, + { + "epoch": 7.925107427869859, + "grad_norm": 0.18751180171966553, + "learning_rate": 1.087480135312644e-05, + "loss": 1.7613, + "step": 25820 + }, + { + "epoch": 7.925414364640884, + "grad_norm": 0.20735877752304077, + "learning_rate": 1.0871706649277935e-05, + "loss": 1.7515, + "step": 25821 + }, + { + "epoch": 7.925721301411909, + "grad_norm": 0.19349408149719238, + "learning_rate": 1.0868612332118133e-05, + "loss": 1.7053, + "step": 25822 + }, + { + "epoch": 7.926028238182934, + "grad_norm": 0.15639854967594147, + "learning_rate": 1.0865518401677649e-05, + "loss": 1.6907, + "step": 25823 + }, + { + "epoch": 7.92633517495396, + "grad_norm": 0.18366692960262299, + "learning_rate": 1.0862424857987059e-05, + "loss": 1.6791, + "step": 25824 + }, + { + "epoch": 7.926642111724985, + "grad_norm": 0.1648077666759491, + "learning_rate": 1.0859331701076913e-05, + "loss": 1.6671, + "step": 25825 + }, + { + "epoch": 7.9269490484960095, + "grad_norm": 
0.17894984781742096, + "learning_rate": 1.0856238930977802e-05, + "loss": 1.736, + "step": 25826 + }, + { + "epoch": 7.927255985267035, + "grad_norm": 0.13542817533016205, + "learning_rate": 1.0853146547720278e-05, + "loss": 1.6613, + "step": 25827 + }, + { + "epoch": 7.92756292203806, + "grad_norm": 0.1598762571811676, + "learning_rate": 1.0850054551334905e-05, + "loss": 1.6828, + "step": 25828 + }, + { + "epoch": 7.9278698588090855, + "grad_norm": 0.19212616980075836, + "learning_rate": 1.0846962941852235e-05, + "loss": 1.8198, + "step": 25829 + }, + { + "epoch": 7.928176795580111, + "grad_norm": 0.19344113767147064, + "learning_rate": 1.0843871719302829e-05, + "loss": 1.7804, + "step": 25830 + }, + { + "epoch": 7.928483732351136, + "grad_norm": 0.15460920333862305, + "learning_rate": 1.0840780883717233e-05, + "loss": 1.7372, + "step": 25831 + }, + { + "epoch": 7.928790669122161, + "grad_norm": 0.19987867772579193, + "learning_rate": 1.083769043512598e-05, + "loss": 1.6923, + "step": 25832 + }, + { + "epoch": 7.929097605893186, + "grad_norm": 0.15390315651893616, + "learning_rate": 1.083460037355965e-05, + "loss": 1.6864, + "step": 25833 + }, + { + "epoch": 7.929404542664211, + "grad_norm": 0.18596698343753815, + "learning_rate": 1.0831510699048724e-05, + "loss": 1.7135, + "step": 25834 + }, + { + "epoch": 7.929711479435237, + "grad_norm": 0.172935351729393, + "learning_rate": 1.0828421411623796e-05, + "loss": 1.7426, + "step": 25835 + }, + { + "epoch": 7.930018416206261, + "grad_norm": 0.2046828418970108, + "learning_rate": 1.0825332511315356e-05, + "loss": 1.7178, + "step": 25836 + }, + { + "epoch": 7.930325352977286, + "grad_norm": 0.1382901519536972, + "learning_rate": 1.0822243998153925e-05, + "loss": 1.6811, + "step": 25837 + }, + { + "epoch": 7.930632289748312, + "grad_norm": 0.1675405353307724, + "learning_rate": 1.0819155872170068e-05, + "loss": 1.7278, + "step": 25838 + }, + { + "epoch": 7.930939226519337, + "grad_norm": 0.16732639074325562, + 
"learning_rate": 1.0816068133394252e-05, + "loss": 1.6847, + "step": 25839 + }, + { + "epoch": 7.931246163290362, + "grad_norm": 0.17154982686042786, + "learning_rate": 1.0812980781857047e-05, + "loss": 1.7411, + "step": 25840 + }, + { + "epoch": 7.931553100061388, + "grad_norm": 0.16475310921669006, + "learning_rate": 1.08098938175889e-05, + "loss": 1.7222, + "step": 25841 + }, + { + "epoch": 7.931860036832412, + "grad_norm": 0.1613023579120636, + "learning_rate": 1.080680724062037e-05, + "loss": 1.718, + "step": 25842 + }, + { + "epoch": 7.9321669736034375, + "grad_norm": 0.16330939531326294, + "learning_rate": 1.0803721050981941e-05, + "loss": 1.7087, + "step": 25843 + }, + { + "epoch": 7.932473910374463, + "grad_norm": 0.15881259739398956, + "learning_rate": 1.0800635248704117e-05, + "loss": 1.7309, + "step": 25844 + }, + { + "epoch": 7.932780847145488, + "grad_norm": 0.19191724061965942, + "learning_rate": 1.0797549833817389e-05, + "loss": 1.7131, + "step": 25845 + }, + { + "epoch": 7.9330877839165135, + "grad_norm": 0.17083698511123657, + "learning_rate": 1.079446480635225e-05, + "loss": 1.7117, + "step": 25846 + }, + { + "epoch": 7.933394720687538, + "grad_norm": 0.18097929656505585, + "learning_rate": 1.0791380166339193e-05, + "loss": 1.7017, + "step": 25847 + }, + { + "epoch": 7.933701657458563, + "grad_norm": 0.1556827276945114, + "learning_rate": 1.0788295913808694e-05, + "loss": 1.7589, + "step": 25848 + }, + { + "epoch": 7.934008594229589, + "grad_norm": 0.1667819619178772, + "learning_rate": 1.0785212048791226e-05, + "loss": 1.6735, + "step": 25849 + }, + { + "epoch": 7.934315531000614, + "grad_norm": 0.18772241473197937, + "learning_rate": 1.0782128571317302e-05, + "loss": 1.6984, + "step": 25850 + }, + { + "epoch": 7.934622467771639, + "grad_norm": 0.1752445250749588, + "learning_rate": 1.0779045481417343e-05, + "loss": 1.6662, + "step": 25851 + }, + { + "epoch": 7.934929404542665, + "grad_norm": 0.16619165241718292, + "learning_rate": 
1.0775962779121873e-05, + "loss": 1.765, + "step": 25852 + }, + { + "epoch": 7.935236341313689, + "grad_norm": 0.1685585081577301, + "learning_rate": 1.0772880464461316e-05, + "loss": 1.6692, + "step": 25853 + }, + { + "epoch": 7.935543278084714, + "grad_norm": 0.16806848347187042, + "learning_rate": 1.076979853746613e-05, + "loss": 1.7081, + "step": 25854 + }, + { + "epoch": 7.93585021485574, + "grad_norm": 0.14273032546043396, + "learning_rate": 1.076671699816682e-05, + "loss": 1.6668, + "step": 25855 + }, + { + "epoch": 7.936157151626765, + "grad_norm": 0.24727863073349, + "learning_rate": 1.0763635846593778e-05, + "loss": 1.7624, + "step": 25856 + }, + { + "epoch": 7.93646408839779, + "grad_norm": 0.15679748356342316, + "learning_rate": 1.0760555082777506e-05, + "loss": 1.6851, + "step": 25857 + }, + { + "epoch": 7.936771025168815, + "grad_norm": 0.23388828337192535, + "learning_rate": 1.075747470674841e-05, + "loss": 1.7557, + "step": 25858 + }, + { + "epoch": 7.93707796193984, + "grad_norm": 0.15266747772693634, + "learning_rate": 1.0754394718536958e-05, + "loss": 1.6559, + "step": 25859 + }, + { + "epoch": 7.9373848987108655, + "grad_norm": 0.1945476084947586, + "learning_rate": 1.0751315118173577e-05, + "loss": 1.745, + "step": 25860 + }, + { + "epoch": 7.937691835481891, + "grad_norm": 0.18018878996372223, + "learning_rate": 1.0748235905688709e-05, + "loss": 1.7016, + "step": 25861 + }, + { + "epoch": 7.937998772252916, + "grad_norm": 0.1748870611190796, + "learning_rate": 1.0745157081112777e-05, + "loss": 1.6989, + "step": 25862 + }, + { + "epoch": 7.9383057090239415, + "grad_norm": 0.18253664672374725, + "learning_rate": 1.0742078644476217e-05, + "loss": 1.7554, + "step": 25863 + }, + { + "epoch": 7.938612645794966, + "grad_norm": 0.17009632289409637, + "learning_rate": 1.073900059580944e-05, + "loss": 1.7244, + "step": 25864 + }, + { + "epoch": 7.938919582565991, + "grad_norm": 0.17612707614898682, + "learning_rate": 1.0735922935142873e-05, + "loss": 
1.6939, + "step": 25865 + }, + { + "epoch": 7.939226519337017, + "grad_norm": 0.21207575500011444, + "learning_rate": 1.0732845662506913e-05, + "loss": 1.7097, + "step": 25866 + }, + { + "epoch": 7.939533456108042, + "grad_norm": 0.2073012739419937, + "learning_rate": 1.0729768777932014e-05, + "loss": 1.7658, + "step": 25867 + }, + { + "epoch": 7.939840392879067, + "grad_norm": 0.18888477981090546, + "learning_rate": 1.072669228144853e-05, + "loss": 1.7496, + "step": 25868 + }, + { + "epoch": 7.940147329650092, + "grad_norm": 0.1822361946105957, + "learning_rate": 1.0723616173086926e-05, + "loss": 1.7344, + "step": 25869 + }, + { + "epoch": 7.940454266421117, + "grad_norm": 0.18642890453338623, + "learning_rate": 1.0720540452877547e-05, + "loss": 1.7135, + "step": 25870 + }, + { + "epoch": 7.940761203192142, + "grad_norm": 0.19198815524578094, + "learning_rate": 1.0717465120850795e-05, + "loss": 1.7128, + "step": 25871 + }, + { + "epoch": 7.941068139963168, + "grad_norm": 0.1886969953775406, + "learning_rate": 1.0714390177037109e-05, + "loss": 1.7161, + "step": 25872 + }, + { + "epoch": 7.941375076734193, + "grad_norm": 0.19693820178508759, + "learning_rate": 1.0711315621466816e-05, + "loss": 1.7086, + "step": 25873 + }, + { + "epoch": 7.941682013505218, + "grad_norm": 0.19052870571613312, + "learning_rate": 1.0708241454170353e-05, + "loss": 1.7274, + "step": 25874 + }, + { + "epoch": 7.941988950276243, + "grad_norm": 0.23586300015449524, + "learning_rate": 1.0705167675178057e-05, + "loss": 1.7169, + "step": 25875 + }, + { + "epoch": 7.942295887047268, + "grad_norm": 0.2077670842409134, + "learning_rate": 1.0702094284520336e-05, + "loss": 1.7573, + "step": 25876 + }, + { + "epoch": 7.9426028238182935, + "grad_norm": 0.20345431566238403, + "learning_rate": 1.069902128222755e-05, + "loss": 1.6821, + "step": 25877 + }, + { + "epoch": 7.942909760589319, + "grad_norm": 0.1869240552186966, + "learning_rate": 1.0695948668330075e-05, + "loss": 1.6978, + "step": 25878 + }, 
+ { + "epoch": 7.943216697360343, + "grad_norm": 0.17814506590366364, + "learning_rate": 1.0692876442858274e-05, + "loss": 1.7027, + "step": 25879 + }, + { + "epoch": 7.943523634131369, + "grad_norm": 0.19093535840511322, + "learning_rate": 1.0689804605842502e-05, + "loss": 1.7863, + "step": 25880 + }, + { + "epoch": 7.943830570902394, + "grad_norm": 0.17859873175621033, + "learning_rate": 1.0686733157313123e-05, + "loss": 1.7431, + "step": 25881 + }, + { + "epoch": 7.944137507673419, + "grad_norm": 0.16613568365573883, + "learning_rate": 1.0683662097300484e-05, + "loss": 1.7517, + "step": 25882 + }, + { + "epoch": 7.944444444444445, + "grad_norm": 0.1588357836008072, + "learning_rate": 1.0680591425834934e-05, + "loss": 1.7017, + "step": 25883 + }, + { + "epoch": 7.94475138121547, + "grad_norm": 0.1667826622724533, + "learning_rate": 1.067752114294685e-05, + "loss": 1.6965, + "step": 25884 + }, + { + "epoch": 7.945058317986494, + "grad_norm": 0.2015296071767807, + "learning_rate": 1.0674451248666522e-05, + "loss": 1.7625, + "step": 25885 + }, + { + "epoch": 7.94536525475752, + "grad_norm": 0.17073483765125275, + "learning_rate": 1.0671381743024344e-05, + "loss": 1.7194, + "step": 25886 + }, + { + "epoch": 7.945672191528545, + "grad_norm": 0.16649815440177917, + "learning_rate": 1.0668312626050608e-05, + "loss": 1.7233, + "step": 25887 + }, + { + "epoch": 7.94597912829957, + "grad_norm": 0.14395855367183685, + "learning_rate": 1.0665243897775645e-05, + "loss": 1.6859, + "step": 25888 + }, + { + "epoch": 7.946286065070596, + "grad_norm": 0.18934515118598938, + "learning_rate": 1.0662175558229826e-05, + "loss": 1.6832, + "step": 25889 + }, + { + "epoch": 7.94659300184162, + "grad_norm": 0.16819562017917633, + "learning_rate": 1.0659107607443419e-05, + "loss": 1.7592, + "step": 25890 + }, + { + "epoch": 7.9468999386126455, + "grad_norm": 0.1701207458972931, + "learning_rate": 1.0656040045446798e-05, + "loss": 1.6909, + "step": 25891 + }, + { + "epoch": 
7.947206875383671, + "grad_norm": 0.18011561036109924, + "learning_rate": 1.0652972872270217e-05, + "loss": 1.7687, + "step": 25892 + }, + { + "epoch": 7.947513812154696, + "grad_norm": 0.15422853827476501, + "learning_rate": 1.0649906087944034e-05, + "loss": 1.6957, + "step": 25893 + }, + { + "epoch": 7.9478207489257215, + "grad_norm": 0.17223568260669708, + "learning_rate": 1.0646839692498545e-05, + "loss": 1.7368, + "step": 25894 + }, + { + "epoch": 7.948127685696747, + "grad_norm": 0.16706988215446472, + "learning_rate": 1.0643773685964053e-05, + "loss": 1.6981, + "step": 25895 + }, + { + "epoch": 7.948434622467771, + "grad_norm": 0.15490150451660156, + "learning_rate": 1.0640708068370853e-05, + "loss": 1.705, + "step": 25896 + }, + { + "epoch": 7.948741559238797, + "grad_norm": 0.16119123995304108, + "learning_rate": 1.0637642839749246e-05, + "loss": 1.7519, + "step": 25897 + }, + { + "epoch": 7.949048496009822, + "grad_norm": 0.1669061779975891, + "learning_rate": 1.0634578000129524e-05, + "loss": 1.7228, + "step": 25898 + }, + { + "epoch": 7.949355432780847, + "grad_norm": 0.1974606215953827, + "learning_rate": 1.0631513549541976e-05, + "loss": 1.7188, + "step": 25899 + }, + { + "epoch": 7.949662369551873, + "grad_norm": 0.204077810049057, + "learning_rate": 1.0628449488016873e-05, + "loss": 1.7397, + "step": 25900 + }, + { + "epoch": 7.949969306322897, + "grad_norm": 0.13561539351940155, + "learning_rate": 1.0625385815584537e-05, + "loss": 1.6457, + "step": 25901 + }, + { + "epoch": 7.9502762430939224, + "grad_norm": 0.1736447811126709, + "learning_rate": 1.0622322532275186e-05, + "loss": 1.7278, + "step": 25902 + }, + { + "epoch": 7.950583179864948, + "grad_norm": 0.1712762862443924, + "learning_rate": 1.061925963811915e-05, + "loss": 1.7208, + "step": 25903 + }, + { + "epoch": 7.950890116635973, + "grad_norm": 0.15313011407852173, + "learning_rate": 1.0616197133146661e-05, + "loss": 1.671, + "step": 25904 + }, + { + "epoch": 7.9511970534069984, + 
"grad_norm": 0.15110735595226288, + "learning_rate": 1.0613135017387981e-05, + "loss": 1.6568, + "step": 25905 + }, + { + "epoch": 7.951503990178024, + "grad_norm": 0.22678901255130768, + "learning_rate": 1.0610073290873413e-05, + "loss": 1.7415, + "step": 25906 + }, + { + "epoch": 7.951810926949048, + "grad_norm": 0.16936101019382477, + "learning_rate": 1.0607011953633162e-05, + "loss": 1.6983, + "step": 25907 + }, + { + "epoch": 7.952117863720074, + "grad_norm": 0.18443427979946136, + "learning_rate": 1.0603951005697533e-05, + "loss": 1.7334, + "step": 25908 + }, + { + "epoch": 7.952424800491099, + "grad_norm": 0.2290949672460556, + "learning_rate": 1.0600890447096729e-05, + "loss": 1.7219, + "step": 25909 + }, + { + "epoch": 7.952731737262124, + "grad_norm": 0.19244399666786194, + "learning_rate": 1.0597830277861026e-05, + "loss": 1.7047, + "step": 25910 + }, + { + "epoch": 7.953038674033149, + "grad_norm": 0.15806549787521362, + "learning_rate": 1.0594770498020657e-05, + "loss": 1.667, + "step": 25911 + }, + { + "epoch": 7.953345610804174, + "grad_norm": 0.23782655596733093, + "learning_rate": 1.0591711107605867e-05, + "loss": 1.7271, + "step": 25912 + }, + { + "epoch": 7.953652547575199, + "grad_norm": 0.18427079916000366, + "learning_rate": 1.0588652106646885e-05, + "loss": 1.7644, + "step": 25913 + }, + { + "epoch": 7.953959484346225, + "grad_norm": 0.18687991797924042, + "learning_rate": 1.058559349517394e-05, + "loss": 1.7045, + "step": 25914 + }, + { + "epoch": 7.95426642111725, + "grad_norm": 0.17435906827449799, + "learning_rate": 1.0582535273217265e-05, + "loss": 1.6681, + "step": 25915 + }, + { + "epoch": 7.954573357888275, + "grad_norm": 0.17601260542869568, + "learning_rate": 1.0579477440807079e-05, + "loss": 1.7141, + "step": 25916 + }, + { + "epoch": 7.9548802946593, + "grad_norm": 0.19225506484508514, + "learning_rate": 1.0576419997973586e-05, + "loss": 1.7224, + "step": 25917 + }, + { + "epoch": 7.955187231430325, + "grad_norm": 
0.18801991641521454, + "learning_rate": 1.0573362944747045e-05, + "loss": 1.715, + "step": 25918 + }, + { + "epoch": 7.9554941682013505, + "grad_norm": 0.21490465104579926, + "learning_rate": 1.0570306281157616e-05, + "loss": 1.7931, + "step": 25919 + }, + { + "epoch": 7.955801104972376, + "grad_norm": 0.1877163052558899, + "learning_rate": 1.0567250007235557e-05, + "loss": 1.7365, + "step": 25920 + }, + { + "epoch": 7.956108041743401, + "grad_norm": 0.18460121750831604, + "learning_rate": 1.0564194123011029e-05, + "loss": 1.7092, + "step": 25921 + }, + { + "epoch": 7.956414978514426, + "grad_norm": 0.1663859337568283, + "learning_rate": 1.0561138628514239e-05, + "loss": 1.6847, + "step": 25922 + }, + { + "epoch": 7.956721915285451, + "grad_norm": 0.1676093488931656, + "learning_rate": 1.0558083523775413e-05, + "loss": 1.6788, + "step": 25923 + }, + { + "epoch": 7.957028852056476, + "grad_norm": 0.17470842599868774, + "learning_rate": 1.0555028808824702e-05, + "loss": 1.7658, + "step": 25924 + }, + { + "epoch": 7.957335788827502, + "grad_norm": 0.17770788073539734, + "learning_rate": 1.0551974483692346e-05, + "loss": 1.6875, + "step": 25925 + }, + { + "epoch": 7.957642725598527, + "grad_norm": 0.17924711108207703, + "learning_rate": 1.054892054840847e-05, + "loss": 1.7024, + "step": 25926 + }, + { + "epoch": 7.957949662369552, + "grad_norm": 0.19387175142765045, + "learning_rate": 1.0545867003003296e-05, + "loss": 1.7806, + "step": 25927 + }, + { + "epoch": 7.958256599140577, + "grad_norm": 0.176667258143425, + "learning_rate": 1.0542813847506988e-05, + "loss": 1.7187, + "step": 25928 + }, + { + "epoch": 7.958563535911602, + "grad_norm": 0.1730370670557022, + "learning_rate": 1.0539761081949723e-05, + "loss": 1.6912, + "step": 25929 + }, + { + "epoch": 7.958870472682627, + "grad_norm": 0.1836516112089157, + "learning_rate": 1.0536708706361665e-05, + "loss": 1.684, + "step": 25930 + }, + { + "epoch": 7.959177409453653, + "grad_norm": 0.17236517369747162, + 
"learning_rate": 1.0533656720772983e-05, + "loss": 1.6799, + "step": 25931 + }, + { + "epoch": 7.959484346224678, + "grad_norm": 0.1655581295490265, + "learning_rate": 1.0530605125213832e-05, + "loss": 1.755, + "step": 25932 + }, + { + "epoch": 7.9597912829957025, + "grad_norm": 0.1801871806383133, + "learning_rate": 1.0527553919714383e-05, + "loss": 1.6998, + "step": 25933 + }, + { + "epoch": 7.960098219766728, + "grad_norm": 0.20504651963710785, + "learning_rate": 1.052450310430476e-05, + "loss": 1.7793, + "step": 25934 + }, + { + "epoch": 7.960405156537753, + "grad_norm": 0.2522159516811371, + "learning_rate": 1.052145267901517e-05, + "loss": 1.754, + "step": 25935 + }, + { + "epoch": 7.9607120933087785, + "grad_norm": 0.18074269592761993, + "learning_rate": 1.0518402643875691e-05, + "loss": 1.717, + "step": 25936 + }, + { + "epoch": 7.961019030079804, + "grad_norm": 0.16463595628738403, + "learning_rate": 1.0515352998916527e-05, + "loss": 1.6994, + "step": 25937 + }, + { + "epoch": 7.961325966850829, + "grad_norm": 0.17102178931236267, + "learning_rate": 1.0512303744167778e-05, + "loss": 1.6571, + "step": 25938 + }, + { + "epoch": 7.961632903621854, + "grad_norm": 0.14453014731407166, + "learning_rate": 1.0509254879659569e-05, + "loss": 1.6725, + "step": 25939 + }, + { + "epoch": 7.961939840392879, + "grad_norm": 0.1980808526277542, + "learning_rate": 1.050620640542208e-05, + "loss": 1.6847, + "step": 25940 + }, + { + "epoch": 7.962246777163904, + "grad_norm": 0.15021857619285583, + "learning_rate": 1.0503158321485378e-05, + "loss": 1.6896, + "step": 25941 + }, + { + "epoch": 7.96255371393493, + "grad_norm": 0.2223394513130188, + "learning_rate": 1.0500110627879639e-05, + "loss": 1.7167, + "step": 25942 + }, + { + "epoch": 7.962860650705955, + "grad_norm": 0.17636358737945557, + "learning_rate": 1.0497063324634937e-05, + "loss": 1.6625, + "step": 25943 + }, + { + "epoch": 7.963167587476979, + "grad_norm": 0.1823662370443344, + "learning_rate": 
1.049401641178142e-05, + "loss": 1.7139, + "step": 25944 + }, + { + "epoch": 7.963474524248005, + "grad_norm": 0.1740594059228897, + "learning_rate": 1.0490969889349189e-05, + "loss": 1.7447, + "step": 25945 + }, + { + "epoch": 7.96378146101903, + "grad_norm": 0.15838129818439484, + "learning_rate": 1.0487923757368351e-05, + "loss": 1.7051, + "step": 25946 + }, + { + "epoch": 7.964088397790055, + "grad_norm": 0.4309011399745941, + "learning_rate": 1.0484878015869005e-05, + "loss": 1.7442, + "step": 25947 + }, + { + "epoch": 7.964395334561081, + "grad_norm": 0.17090202867984772, + "learning_rate": 1.0481832664881257e-05, + "loss": 1.652, + "step": 25948 + }, + { + "epoch": 7.964702271332106, + "grad_norm": 0.16977159678936005, + "learning_rate": 1.0478787704435206e-05, + "loss": 1.6894, + "step": 25949 + }, + { + "epoch": 7.9650092081031305, + "grad_norm": 0.20473513007164001, + "learning_rate": 1.0475743134560934e-05, + "loss": 1.8141, + "step": 25950 + }, + { + "epoch": 7.965316144874156, + "grad_norm": 0.1775660663843155, + "learning_rate": 1.0472698955288535e-05, + "loss": 1.7204, + "step": 25951 + }, + { + "epoch": 7.965623081645181, + "grad_norm": 0.21351923048496246, + "learning_rate": 1.046965516664809e-05, + "loss": 1.7364, + "step": 25952 + }, + { + "epoch": 7.9659300184162065, + "grad_norm": 0.2034255862236023, + "learning_rate": 1.0466611768669671e-05, + "loss": 1.7096, + "step": 25953 + }, + { + "epoch": 7.966236955187231, + "grad_norm": 0.17075900733470917, + "learning_rate": 1.0463568761383396e-05, + "loss": 1.6928, + "step": 25954 + }, + { + "epoch": 7.966543891958256, + "grad_norm": 0.18142712116241455, + "learning_rate": 1.0460526144819288e-05, + "loss": 1.7146, + "step": 25955 + }, + { + "epoch": 7.966850828729282, + "grad_norm": 0.14901846647262573, + "learning_rate": 1.0457483919007427e-05, + "loss": 1.6841, + "step": 25956 + }, + { + "epoch": 7.967157765500307, + "grad_norm": 0.17380031943321228, + "learning_rate": 1.0454442083977912e-05, + 
"loss": 1.6911, + "step": 25957 + }, + { + "epoch": 7.967464702271332, + "grad_norm": 0.15983760356903076, + "learning_rate": 1.045140063976075e-05, + "loss": 1.6866, + "step": 25958 + }, + { + "epoch": 7.967771639042358, + "grad_norm": 0.1559101641178131, + "learning_rate": 1.0448359586386058e-05, + "loss": 1.6793, + "step": 25959 + }, + { + "epoch": 7.968078575813382, + "grad_norm": 0.14843949675559998, + "learning_rate": 1.0445318923883829e-05, + "loss": 1.6835, + "step": 25960 + }, + { + "epoch": 7.968385512584407, + "grad_norm": 0.16452330350875854, + "learning_rate": 1.0442278652284155e-05, + "loss": 1.7304, + "step": 25961 + }, + { + "epoch": 7.968692449355433, + "grad_norm": 0.18997763097286224, + "learning_rate": 1.0439238771617066e-05, + "loss": 1.7425, + "step": 25962 + }, + { + "epoch": 7.968999386126458, + "grad_norm": 0.1654025912284851, + "learning_rate": 1.0436199281912611e-05, + "loss": 1.6909, + "step": 25963 + }, + { + "epoch": 7.969306322897483, + "grad_norm": 0.1313011646270752, + "learning_rate": 1.0433160183200823e-05, + "loss": 1.6572, + "step": 25964 + }, + { + "epoch": 7.969613259668508, + "grad_norm": 0.1584165096282959, + "learning_rate": 1.043012147551174e-05, + "loss": 1.7257, + "step": 25965 + }, + { + "epoch": 7.969920196439533, + "grad_norm": 0.17830775678157806, + "learning_rate": 1.0427083158875384e-05, + "loss": 1.7382, + "step": 25966 + }, + { + "epoch": 7.9702271332105585, + "grad_norm": 0.19006042182445526, + "learning_rate": 1.0424045233321788e-05, + "loss": 1.7366, + "step": 25967 + }, + { + "epoch": 7.970534069981584, + "grad_norm": 0.15366297960281372, + "learning_rate": 1.0421007698880974e-05, + "loss": 1.7235, + "step": 25968 + }, + { + "epoch": 7.970841006752609, + "grad_norm": 0.14415831863880157, + "learning_rate": 1.0417970555582963e-05, + "loss": 1.6945, + "step": 25969 + }, + { + "epoch": 7.9711479435236345, + "grad_norm": 0.16916446387767792, + "learning_rate": 1.041493380345775e-05, + "loss": 1.7099, + "step": 
25970 + }, + { + "epoch": 7.971454880294659, + "grad_norm": 0.1456119269132614, + "learning_rate": 1.041189744253539e-05, + "loss": 1.6544, + "step": 25971 + }, + { + "epoch": 7.971761817065684, + "grad_norm": 0.20085962116718292, + "learning_rate": 1.040886147284585e-05, + "loss": 1.699, + "step": 25972 + }, + { + "epoch": 7.97206875383671, + "grad_norm": 0.1815454363822937, + "learning_rate": 1.0405825894419141e-05, + "loss": 1.7503, + "step": 25973 + }, + { + "epoch": 7.972375690607735, + "grad_norm": 0.2010805308818817, + "learning_rate": 1.040279070728527e-05, + "loss": 1.7061, + "step": 25974 + }, + { + "epoch": 7.97268262737876, + "grad_norm": 0.22105813026428223, + "learning_rate": 1.0399755911474218e-05, + "loss": 1.7262, + "step": 25975 + }, + { + "epoch": 7.972989564149785, + "grad_norm": 0.16186046600341797, + "learning_rate": 1.0396721507016017e-05, + "loss": 1.7229, + "step": 25976 + }, + { + "epoch": 7.97329650092081, + "grad_norm": 0.19990484416484833, + "learning_rate": 1.0393687493940597e-05, + "loss": 1.7006, + "step": 25977 + }, + { + "epoch": 7.973603437691835, + "grad_norm": 0.2377716600894928, + "learning_rate": 1.0390653872277983e-05, + "loss": 1.7302, + "step": 25978 + }, + { + "epoch": 7.973910374462861, + "grad_norm": 0.14087189733982086, + "learning_rate": 1.0387620642058148e-05, + "loss": 1.6563, + "step": 25979 + }, + { + "epoch": 7.974217311233886, + "grad_norm": 0.246252179145813, + "learning_rate": 1.0384587803311063e-05, + "loss": 1.6661, + "step": 25980 + }, + { + "epoch": 7.974524248004911, + "grad_norm": 0.18734396994113922, + "learning_rate": 1.0381555356066697e-05, + "loss": 1.7566, + "step": 25981 + }, + { + "epoch": 7.974831184775936, + "grad_norm": 0.1621570736169815, + "learning_rate": 1.0378523300355025e-05, + "loss": 1.6863, + "step": 25982 + }, + { + "epoch": 7.975138121546961, + "grad_norm": 0.2571845054626465, + "learning_rate": 1.0375491636206002e-05, + "loss": 1.7589, + "step": 25983 + }, + { + "epoch": 
7.975445058317987, + "grad_norm": 0.1880367249250412, + "learning_rate": 1.0372460363649606e-05, + "loss": 1.6999, + "step": 25984 + }, + { + "epoch": 7.975751995089012, + "grad_norm": 0.20473778247833252, + "learning_rate": 1.0369429482715776e-05, + "loss": 1.749, + "step": 25985 + }, + { + "epoch": 7.976058931860036, + "grad_norm": 0.19917427003383636, + "learning_rate": 1.0366398993434473e-05, + "loss": 1.701, + "step": 25986 + }, + { + "epoch": 7.976365868631062, + "grad_norm": 0.1758740097284317, + "learning_rate": 1.0363368895835635e-05, + "loss": 1.6774, + "step": 25987 + }, + { + "epoch": 7.976672805402087, + "grad_norm": 0.26412737369537354, + "learning_rate": 1.0360339189949242e-05, + "loss": 1.6778, + "step": 25988 + }, + { + "epoch": 7.976979742173112, + "grad_norm": 0.19599425792694092, + "learning_rate": 1.0357309875805194e-05, + "loss": 1.777, + "step": 25989 + }, + { + "epoch": 7.977286678944138, + "grad_norm": 0.2095821648836136, + "learning_rate": 1.0354280953433449e-05, + "loss": 1.7106, + "step": 25990 + }, + { + "epoch": 7.977593615715163, + "grad_norm": 0.1743748039007187, + "learning_rate": 1.0351252422863934e-05, + "loss": 1.6891, + "step": 25991 + }, + { + "epoch": 7.9779005524861875, + "grad_norm": 0.17273737490177155, + "learning_rate": 1.0348224284126573e-05, + "loss": 1.7254, + "step": 25992 + }, + { + "epoch": 7.978207489257213, + "grad_norm": 0.2032385915517807, + "learning_rate": 1.0345196537251322e-05, + "loss": 1.707, + "step": 25993 + }, + { + "epoch": 7.978514426028238, + "grad_norm": 0.17978399991989136, + "learning_rate": 1.0342169182268057e-05, + "loss": 1.695, + "step": 25994 + }, + { + "epoch": 7.9788213627992635, + "grad_norm": 0.20567134022712708, + "learning_rate": 1.0339142219206744e-05, + "loss": 1.6726, + "step": 25995 + }, + { + "epoch": 7.979128299570289, + "grad_norm": 0.19649706780910492, + "learning_rate": 1.033611564809725e-05, + "loss": 1.737, + "step": 25996 + }, + { + "epoch": 7.979435236341313, + "grad_norm": 
0.1640859991312027, + "learning_rate": 1.033308946896952e-05, + "loss": 1.6993, + "step": 25997 + }, + { + "epoch": 7.979742173112339, + "grad_norm": 0.21497343480587006, + "learning_rate": 1.0330063681853452e-05, + "loss": 1.7387, + "step": 25998 + }, + { + "epoch": 7.980049109883364, + "grad_norm": 0.14995479583740234, + "learning_rate": 1.0327038286778946e-05, + "loss": 1.6671, + "step": 25999 + }, + { + "epoch": 7.980356046654389, + "grad_norm": 0.1836833655834198, + "learning_rate": 1.0324013283775895e-05, + "loss": 1.7279, + "step": 26000 + }, + { + "epoch": 7.980662983425415, + "grad_norm": 0.14769285917282104, + "learning_rate": 1.032098867287421e-05, + "loss": 1.707, + "step": 26001 + }, + { + "epoch": 7.98096992019644, + "grad_norm": 0.24206426739692688, + "learning_rate": 1.0317964454103762e-05, + "loss": 1.8122, + "step": 26002 + }, + { + "epoch": 7.981276856967464, + "grad_norm": 0.16573204100131989, + "learning_rate": 1.0314940627494451e-05, + "loss": 1.7079, + "step": 26003 + }, + { + "epoch": 7.98158379373849, + "grad_norm": 0.1825968325138092, + "learning_rate": 1.0311917193076143e-05, + "loss": 1.6795, + "step": 26004 + }, + { + "epoch": 7.981890730509515, + "grad_norm": 0.14462140202522278, + "learning_rate": 1.0308894150878761e-05, + "loss": 1.7152, + "step": 26005 + }, + { + "epoch": 7.98219766728054, + "grad_norm": 0.15220513939857483, + "learning_rate": 1.0305871500932135e-05, + "loss": 1.6657, + "step": 26006 + }, + { + "epoch": 7.982504604051566, + "grad_norm": 0.17780731618404388, + "learning_rate": 1.030284924326615e-05, + "loss": 1.6852, + "step": 26007 + }, + { + "epoch": 7.98281154082259, + "grad_norm": 0.13492488861083984, + "learning_rate": 1.0299827377910681e-05, + "loss": 1.6331, + "step": 26008 + }, + { + "epoch": 7.9831184775936155, + "grad_norm": 0.1566525399684906, + "learning_rate": 1.0296805904895568e-05, + "loss": 1.6918, + "step": 26009 + }, + { + "epoch": 7.983425414364641, + "grad_norm": 0.17075398564338684, + 
"learning_rate": 1.0293784824250725e-05, + "loss": 1.7107, + "step": 26010 + }, + { + "epoch": 7.983732351135666, + "grad_norm": 0.16693715751171112, + "learning_rate": 1.0290764136005937e-05, + "loss": 1.6773, + "step": 26011 + }, + { + "epoch": 7.9840392879066915, + "grad_norm": 0.23020583391189575, + "learning_rate": 1.0287743840191122e-05, + "loss": 1.7389, + "step": 26012 + }, + { + "epoch": 7.984346224677717, + "grad_norm": 0.2185986489057541, + "learning_rate": 1.0284723936836071e-05, + "loss": 1.7039, + "step": 26013 + }, + { + "epoch": 7.984653161448741, + "grad_norm": 0.1527925282716751, + "learning_rate": 1.0281704425970673e-05, + "loss": 1.6981, + "step": 26014 + }, + { + "epoch": 7.984960098219767, + "grad_norm": 0.23389141261577606, + "learning_rate": 1.0278685307624747e-05, + "loss": 1.7511, + "step": 26015 + }, + { + "epoch": 7.985267034990792, + "grad_norm": 0.1481025218963623, + "learning_rate": 1.0275666581828137e-05, + "loss": 1.6551, + "step": 26016 + }, + { + "epoch": 7.985573971761817, + "grad_norm": 0.18131811916828156, + "learning_rate": 1.0272648248610672e-05, + "loss": 1.7024, + "step": 26017 + }, + { + "epoch": 7.985880908532843, + "grad_norm": 0.15969321131706238, + "learning_rate": 1.0269630308002182e-05, + "loss": 1.7269, + "step": 26018 + }, + { + "epoch": 7.986187845303867, + "grad_norm": 0.16655376553535461, + "learning_rate": 1.026661276003249e-05, + "loss": 1.6649, + "step": 26019 + }, + { + "epoch": 7.986494782074892, + "grad_norm": 0.16438528895378113, + "learning_rate": 1.0263595604731425e-05, + "loss": 1.6901, + "step": 26020 + }, + { + "epoch": 7.986801718845918, + "grad_norm": 0.23586809635162354, + "learning_rate": 1.0260578842128782e-05, + "loss": 1.7983, + "step": 26021 + }, + { + "epoch": 7.987108655616943, + "grad_norm": 0.15142324566841125, + "learning_rate": 1.0257562472254417e-05, + "loss": 1.6327, + "step": 26022 + }, + { + "epoch": 7.987415592387968, + "grad_norm": 0.17198510468006134, + "learning_rate": 
1.0254546495138096e-05, + "loss": 1.7119, + "step": 26023 + }, + { + "epoch": 7.987722529158994, + "grad_norm": 0.1675531417131424, + "learning_rate": 1.0251530910809648e-05, + "loss": 1.695, + "step": 26024 + }, + { + "epoch": 7.988029465930018, + "grad_norm": 0.17403315007686615, + "learning_rate": 1.0248515719298867e-05, + "loss": 1.7216, + "step": 26025 + }, + { + "epoch": 7.9883364027010435, + "grad_norm": 0.16039720177650452, + "learning_rate": 1.0245500920635537e-05, + "loss": 1.7315, + "step": 26026 + }, + { + "epoch": 7.988643339472069, + "grad_norm": 0.19715416431427002, + "learning_rate": 1.0242486514849498e-05, + "loss": 1.7308, + "step": 26027 + }, + { + "epoch": 7.988950276243094, + "grad_norm": 0.14576783776283264, + "learning_rate": 1.0239472501970482e-05, + "loss": 1.6589, + "step": 26028 + }, + { + "epoch": 7.989257213014119, + "grad_norm": 0.1631615310907364, + "learning_rate": 1.0236458882028333e-05, + "loss": 1.7494, + "step": 26029 + }, + { + "epoch": 7.989564149785144, + "grad_norm": 0.19368192553520203, + "learning_rate": 1.023344565505277e-05, + "loss": 1.735, + "step": 26030 + }, + { + "epoch": 7.989871086556169, + "grad_norm": 0.1902317851781845, + "learning_rate": 1.023043282107362e-05, + "loss": 1.7573, + "step": 26031 + }, + { + "epoch": 7.990178023327195, + "grad_norm": 0.18496233224868774, + "learning_rate": 1.0227420380120651e-05, + "loss": 1.7368, + "step": 26032 + }, + { + "epoch": 7.99048496009822, + "grad_norm": 0.172613263130188, + "learning_rate": 1.0224408332223617e-05, + "loss": 1.6943, + "step": 26033 + }, + { + "epoch": 7.990791896869245, + "grad_norm": 0.19840112328529358, + "learning_rate": 1.0221396677412293e-05, + "loss": 1.7562, + "step": 26034 + }, + { + "epoch": 7.99109883364027, + "grad_norm": 0.18129339814186096, + "learning_rate": 1.0218385415716441e-05, + "loss": 1.6746, + "step": 26035 + }, + { + "epoch": 7.991405770411295, + "grad_norm": 0.17933470010757446, + "learning_rate": 1.021537454716583e-05, + "loss": 
1.7324, + "step": 26036 + }, + { + "epoch": 7.99171270718232, + "grad_norm": 0.14947326481342316, + "learning_rate": 1.0212364071790198e-05, + "loss": 1.632, + "step": 26037 + }, + { + "epoch": 7.992019643953346, + "grad_norm": 0.18452878296375275, + "learning_rate": 1.0209353989619291e-05, + "loss": 1.6737, + "step": 26038 + }, + { + "epoch": 7.992326580724371, + "grad_norm": 0.18882198631763458, + "learning_rate": 1.0206344300682901e-05, + "loss": 1.7529, + "step": 26039 + }, + { + "epoch": 7.9926335174953955, + "grad_norm": 0.1855655312538147, + "learning_rate": 1.0203335005010722e-05, + "loss": 1.7347, + "step": 26040 + }, + { + "epoch": 7.992940454266421, + "grad_norm": 0.16447728872299194, + "learning_rate": 1.0200326102632518e-05, + "loss": 1.6659, + "step": 26041 + }, + { + "epoch": 7.993247391037446, + "grad_norm": 0.17379891872406006, + "learning_rate": 1.0197317593578016e-05, + "loss": 1.6962, + "step": 26042 + }, + { + "epoch": 7.9935543278084715, + "grad_norm": 0.16298875212669373, + "learning_rate": 1.0194309477876934e-05, + "loss": 1.6815, + "step": 26043 + }, + { + "epoch": 7.993861264579497, + "grad_norm": 0.1883227378129959, + "learning_rate": 1.0191301755559047e-05, + "loss": 1.7053, + "step": 26044 + }, + { + "epoch": 7.994168201350522, + "grad_norm": 0.20746919512748718, + "learning_rate": 1.0188294426654021e-05, + "loss": 1.7476, + "step": 26045 + }, + { + "epoch": 7.994475138121547, + "grad_norm": 0.1882137805223465, + "learning_rate": 1.0185287491191631e-05, + "loss": 1.7078, + "step": 26046 + }, + { + "epoch": 7.994782074892572, + "grad_norm": 0.21140792965888977, + "learning_rate": 1.0182280949201539e-05, + "loss": 1.7729, + "step": 26047 + }, + { + "epoch": 7.995089011663597, + "grad_norm": 0.18779736757278442, + "learning_rate": 1.0179274800713501e-05, + "loss": 1.7413, + "step": 26048 + }, + { + "epoch": 7.995395948434623, + "grad_norm": 0.1841782033443451, + "learning_rate": 1.0176269045757202e-05, + "loss": 1.7058, + "step": 26049 + 
}, + { + "epoch": 7.995702885205648, + "grad_norm": 0.19872064888477325, + "learning_rate": 1.017326368436236e-05, + "loss": 1.7522, + "step": 26050 + }, + { + "epoch": 7.996009821976672, + "grad_norm": 0.1763429492712021, + "learning_rate": 1.0170258716558667e-05, + "loss": 1.7178, + "step": 26051 + }, + { + "epoch": 7.996316758747698, + "grad_norm": 0.20209169387817383, + "learning_rate": 1.0167254142375826e-05, + "loss": 1.723, + "step": 26052 + }, + { + "epoch": 7.996623695518723, + "grad_norm": 0.15985172986984253, + "learning_rate": 1.0164249961843519e-05, + "loss": 1.6985, + "step": 26053 + }, + { + "epoch": 7.996930632289748, + "grad_norm": 0.1985132247209549, + "learning_rate": 1.0161246174991451e-05, + "loss": 1.7982, + "step": 26054 + }, + { + "epoch": 7.997237569060774, + "grad_norm": 0.17600803077220917, + "learning_rate": 1.0158242781849292e-05, + "loss": 1.7009, + "step": 26055 + }, + { + "epoch": 7.997544505831799, + "grad_norm": 0.15485480427742004, + "learning_rate": 1.015523978244673e-05, + "loss": 1.675, + "step": 26056 + }, + { + "epoch": 7.9978514426028235, + "grad_norm": 0.18465322256088257, + "learning_rate": 1.0152237176813446e-05, + "loss": 1.7156, + "step": 26057 + }, + { + "epoch": 7.998158379373849, + "grad_norm": 0.2183876633644104, + "learning_rate": 1.014923496497911e-05, + "loss": 1.7805, + "step": 26058 + }, + { + "epoch": 7.998465316144874, + "grad_norm": 0.18724960088729858, + "learning_rate": 1.014623314697339e-05, + "loss": 1.7047, + "step": 26059 + }, + { + "epoch": 7.9987722529158995, + "grad_norm": 0.15459159016609192, + "learning_rate": 1.0143231722825936e-05, + "loss": 1.6595, + "step": 26060 + }, + { + "epoch": 7.999079189686924, + "grad_norm": 0.16338171064853668, + "learning_rate": 1.0140230692566454e-05, + "loss": 1.6907, + "step": 26061 + }, + { + "epoch": 7.999386126457949, + "grad_norm": 0.16223935782909393, + "learning_rate": 1.013723005622455e-05, + "loss": 1.6866, + "step": 26062 + }, + { + "epoch": 
7.999693063228975, + "grad_norm": 0.18934771418571472, + "learning_rate": 1.0134229813829931e-05, + "loss": 1.706, + "step": 26063 + }, + { + "epoch": 8.0, + "grad_norm": 0.19117574393749237, + "learning_rate": 1.0131229965412191e-05, + "loss": 1.7392, + "step": 26064 + }, + { + "epoch": 8.000306936771025, + "grad_norm": 0.20491363108158112, + "learning_rate": 1.0128230511001019e-05, + "loss": 1.7488, + "step": 26065 + }, + { + "epoch": 8.00061387354205, + "grad_norm": 0.16383573412895203, + "learning_rate": 1.0125231450626043e-05, + "loss": 1.6958, + "step": 26066 + }, + { + "epoch": 8.000920810313076, + "grad_norm": 0.17405575513839722, + "learning_rate": 1.0122232784316898e-05, + "loss": 1.701, + "step": 26067 + }, + { + "epoch": 8.001227747084101, + "grad_norm": 0.1504749059677124, + "learning_rate": 1.0119234512103226e-05, + "loss": 1.6588, + "step": 26068 + }, + { + "epoch": 8.001534683855127, + "grad_norm": 0.15705156326293945, + "learning_rate": 1.0116236634014647e-05, + "loss": 1.6746, + "step": 26069 + }, + { + "epoch": 8.00184162062615, + "grad_norm": 0.18729639053344727, + "learning_rate": 1.01132391500808e-05, + "loss": 1.7634, + "step": 26070 + }, + { + "epoch": 8.002148557397176, + "grad_norm": 0.1855447143316269, + "learning_rate": 1.0110242060331304e-05, + "loss": 1.7588, + "step": 26071 + }, + { + "epoch": 8.002455494168201, + "grad_norm": 0.16488726437091827, + "learning_rate": 1.010724536479577e-05, + "loss": 1.7406, + "step": 26072 + }, + { + "epoch": 8.002762430939226, + "grad_norm": 0.17228275537490845, + "learning_rate": 1.0104249063503823e-05, + "loss": 1.7323, + "step": 26073 + }, + { + "epoch": 8.003069367710252, + "grad_norm": 0.1483743041753769, + "learning_rate": 1.0101253156485069e-05, + "loss": 1.7033, + "step": 26074 + }, + { + "epoch": 8.003376304481277, + "grad_norm": 0.2499883621931076, + "learning_rate": 1.0098257643769116e-05, + "loss": 1.7127, + "step": 26075 + }, + { + "epoch": 8.003683241252302, + "grad_norm": 
0.22971376776695251, + "learning_rate": 1.0095262525385568e-05, + "loss": 1.7582, + "step": 26076 + }, + { + "epoch": 8.003990178023328, + "grad_norm": 0.18424302339553833, + "learning_rate": 1.0092267801364014e-05, + "loss": 1.6948, + "step": 26077 + }, + { + "epoch": 8.004297114794353, + "grad_norm": 0.20067891478538513, + "learning_rate": 1.0089273471734085e-05, + "loss": 1.7259, + "step": 26078 + }, + { + "epoch": 8.004604051565378, + "grad_norm": 0.2022552639245987, + "learning_rate": 1.0086279536525322e-05, + "loss": 1.7332, + "step": 26079 + }, + { + "epoch": 8.004910988336404, + "grad_norm": 0.1658320426940918, + "learning_rate": 1.0083285995767362e-05, + "loss": 1.7424, + "step": 26080 + }, + { + "epoch": 8.005217925107427, + "grad_norm": 0.16180957853794098, + "learning_rate": 1.0080292849489741e-05, + "loss": 1.6797, + "step": 26081 + }, + { + "epoch": 8.005524861878452, + "grad_norm": 0.18383777141571045, + "learning_rate": 1.007730009772208e-05, + "loss": 1.7597, + "step": 26082 + }, + { + "epoch": 8.005831798649478, + "grad_norm": 0.17468489706516266, + "learning_rate": 1.0074307740493938e-05, + "loss": 1.7266, + "step": 26083 + }, + { + "epoch": 8.006138735420503, + "grad_norm": 0.1647786945104599, + "learning_rate": 1.0071315777834883e-05, + "loss": 1.6742, + "step": 26084 + }, + { + "epoch": 8.006445672191528, + "grad_norm": 0.23006537556648254, + "learning_rate": 1.0068324209774493e-05, + "loss": 1.6649, + "step": 26085 + }, + { + "epoch": 8.006752608962554, + "grad_norm": 0.19266989827156067, + "learning_rate": 1.0065333036342328e-05, + "loss": 1.7484, + "step": 26086 + }, + { + "epoch": 8.00705954573358, + "grad_norm": 0.1709250807762146, + "learning_rate": 1.0062342257567947e-05, + "loss": 1.6569, + "step": 26087 + }, + { + "epoch": 8.007366482504604, + "grad_norm": 0.15847361087799072, + "learning_rate": 1.005935187348091e-05, + "loss": 1.6907, + "step": 26088 + }, + { + "epoch": 8.00767341927563, + "grad_norm": 0.14707811176776886, + 
"learning_rate": 1.0056361884110765e-05, + "loss": 1.7121, + "step": 26089 + }, + { + "epoch": 8.007980356046655, + "grad_norm": 0.1740313321352005, + "learning_rate": 1.0053372289487067e-05, + "loss": 1.6978, + "step": 26090 + }, + { + "epoch": 8.008287292817679, + "grad_norm": 0.17271417379379272, + "learning_rate": 1.0050383089639354e-05, + "loss": 1.7673, + "step": 26091 + }, + { + "epoch": 8.008594229588704, + "grad_norm": 0.179611936211586, + "learning_rate": 1.0047394284597173e-05, + "loss": 1.7291, + "step": 26092 + }, + { + "epoch": 8.00890116635973, + "grad_norm": 0.1823183298110962, + "learning_rate": 1.0044405874390057e-05, + "loss": 1.7215, + "step": 26093 + }, + { + "epoch": 8.009208103130755, + "grad_norm": 0.2914387881755829, + "learning_rate": 1.004141785904753e-05, + "loss": 1.8169, + "step": 26094 + }, + { + "epoch": 8.00951503990178, + "grad_norm": 0.21860483288764954, + "learning_rate": 1.0038430238599156e-05, + "loss": 1.8372, + "step": 26095 + }, + { + "epoch": 8.009821976672805, + "grad_norm": 0.2060404270887375, + "learning_rate": 1.0035443013074407e-05, + "loss": 1.7224, + "step": 26096 + }, + { + "epoch": 8.01012891344383, + "grad_norm": 0.21953152120113373, + "learning_rate": 1.003245618250287e-05, + "loss": 1.7571, + "step": 26097 + }, + { + "epoch": 8.010435850214856, + "grad_norm": 0.16731835901737213, + "learning_rate": 1.0029469746913995e-05, + "loss": 1.7222, + "step": 26098 + }, + { + "epoch": 8.010742786985881, + "grad_norm": 0.19284974038600922, + "learning_rate": 1.0026483706337336e-05, + "loss": 1.6582, + "step": 26099 + }, + { + "epoch": 8.011049723756907, + "grad_norm": 0.14466765522956848, + "learning_rate": 1.00234980608024e-05, + "loss": 1.6772, + "step": 26100 + }, + { + "epoch": 8.011356660527932, + "grad_norm": 0.19553600251674652, + "learning_rate": 1.0020512810338688e-05, + "loss": 1.6841, + "step": 26101 + }, + { + "epoch": 8.011663597298956, + "grad_norm": 0.19986452162265778, + "learning_rate": 
1.0017527954975698e-05, + "loss": 1.7025, + "step": 26102 + }, + { + "epoch": 8.011970534069981, + "grad_norm": 0.17204077541828156, + "learning_rate": 1.0014543494742933e-05, + "loss": 1.7508, + "step": 26103 + }, + { + "epoch": 8.012277470841006, + "grad_norm": 0.19889704883098602, + "learning_rate": 1.0011559429669887e-05, + "loss": 1.6973, + "step": 26104 + }, + { + "epoch": 8.012584407612032, + "grad_norm": 0.16140232980251312, + "learning_rate": 1.0008575759786042e-05, + "loss": 1.7932, + "step": 26105 + }, + { + "epoch": 8.012891344383057, + "grad_norm": 0.21359173953533173, + "learning_rate": 1.0005592485120896e-05, + "loss": 1.6986, + "step": 26106 + }, + { + "epoch": 8.013198281154082, + "grad_norm": 0.1766652911901474, + "learning_rate": 1.0002609605703927e-05, + "loss": 1.7275, + "step": 26107 + }, + { + "epoch": 8.013505217925108, + "grad_norm": 0.176233172416687, + "learning_rate": 9.999627121564614e-06, + "loss": 1.6787, + "step": 26108 + }, + { + "epoch": 8.013812154696133, + "grad_norm": 0.15688678622245789, + "learning_rate": 9.996645032732426e-06, + "loss": 1.6917, + "step": 26109 + }, + { + "epoch": 8.014119091467158, + "grad_norm": 0.1363043189048767, + "learning_rate": 9.993663339236842e-06, + "loss": 1.6621, + "step": 26110 + }, + { + "epoch": 8.014426028238184, + "grad_norm": 0.1586332768201828, + "learning_rate": 9.990682041107313e-06, + "loss": 1.7161, + "step": 26111 + }, + { + "epoch": 8.014732965009209, + "grad_norm": 0.19763816893100739, + "learning_rate": 9.987701138373334e-06, + "loss": 1.736, + "step": 26112 + }, + { + "epoch": 8.015039901780233, + "grad_norm": 0.15302304923534393, + "learning_rate": 9.984720631064326e-06, + "loss": 1.6814, + "step": 26113 + }, + { + "epoch": 8.015346838551258, + "grad_norm": 0.1768827736377716, + "learning_rate": 9.981740519209786e-06, + "loss": 1.7006, + "step": 26114 + }, + { + "epoch": 8.015653775322283, + "grad_norm": 0.14857567846775055, + "learning_rate": 9.978760802839116e-06, + "loss": 
1.6891, + "step": 26115 + }, + { + "epoch": 8.015960712093309, + "grad_norm": 0.20578980445861816, + "learning_rate": 9.9757814819818e-06, + "loss": 1.7798, + "step": 26116 + }, + { + "epoch": 8.016267648864334, + "grad_norm": 0.16164197027683258, + "learning_rate": 9.97280255666727e-06, + "loss": 1.6855, + "step": 26117 + }, + { + "epoch": 8.01657458563536, + "grad_norm": 0.2176574170589447, + "learning_rate": 9.969824026924968e-06, + "loss": 1.8144, + "step": 26118 + }, + { + "epoch": 8.016881522406385, + "grad_norm": 0.16946040093898773, + "learning_rate": 9.966845892784326e-06, + "loss": 1.7029, + "step": 26119 + }, + { + "epoch": 8.01718845917741, + "grad_norm": 0.17593413591384888, + "learning_rate": 9.96386815427478e-06, + "loss": 1.6993, + "step": 26120 + }, + { + "epoch": 8.017495395948435, + "grad_norm": 0.16679200530052185, + "learning_rate": 9.96089081142575e-06, + "loss": 1.6993, + "step": 26121 + }, + { + "epoch": 8.01780233271946, + "grad_norm": 0.19294987618923187, + "learning_rate": 9.957913864266667e-06, + "loss": 1.7417, + "step": 26122 + }, + { + "epoch": 8.018109269490484, + "grad_norm": 0.17427025735378265, + "learning_rate": 9.954937312826951e-06, + "loss": 1.6957, + "step": 26123 + }, + { + "epoch": 8.01841620626151, + "grad_norm": 0.1996718794107437, + "learning_rate": 9.951961157136013e-06, + "loss": 1.7348, + "step": 26124 + }, + { + "epoch": 8.018723143032535, + "grad_norm": 0.19701123237609863, + "learning_rate": 9.948985397223271e-06, + "loss": 1.7336, + "step": 26125 + }, + { + "epoch": 8.01903007980356, + "grad_norm": 0.15205782651901245, + "learning_rate": 9.946010033118124e-06, + "loss": 1.6971, + "step": 26126 + }, + { + "epoch": 8.019337016574585, + "grad_norm": 0.16516798734664917, + "learning_rate": 9.943035064849986e-06, + "loss": 1.7176, + "step": 26127 + }, + { + "epoch": 8.01964395334561, + "grad_norm": 0.18073998391628265, + "learning_rate": 9.94006049244825e-06, + "loss": 1.7344, + "step": 26128 + }, + { + "epoch": 
8.019950890116636, + "grad_norm": 0.15453651547431946, + "learning_rate": 9.937086315942324e-06, + "loss": 1.7268, + "step": 26129 + }, + { + "epoch": 8.020257826887661, + "grad_norm": 0.17114359140396118, + "learning_rate": 9.934112535361574e-06, + "loss": 1.6708, + "step": 26130 + }, + { + "epoch": 8.020564763658687, + "grad_norm": 0.15452778339385986, + "learning_rate": 9.931139150735431e-06, + "loss": 1.697, + "step": 26131 + }, + { + "epoch": 8.020871700429712, + "grad_norm": 0.18605299293994904, + "learning_rate": 9.928166162093234e-06, + "loss": 1.7463, + "step": 26132 + }, + { + "epoch": 8.021178637200737, + "grad_norm": 0.14081695675849915, + "learning_rate": 9.925193569464398e-06, + "loss": 1.678, + "step": 26133 + }, + { + "epoch": 8.021485573971761, + "grad_norm": 0.15573516488075256, + "learning_rate": 9.922221372878288e-06, + "loss": 1.7125, + "step": 26134 + }, + { + "epoch": 8.021792510742786, + "grad_norm": 0.1690043956041336, + "learning_rate": 9.919249572364275e-06, + "loss": 1.7067, + "step": 26135 + }, + { + "epoch": 8.022099447513812, + "grad_norm": 0.1895153820514679, + "learning_rate": 9.91627816795173e-06, + "loss": 1.7098, + "step": 26136 + }, + { + "epoch": 8.022406384284837, + "grad_norm": 0.1467704176902771, + "learning_rate": 9.913307159670022e-06, + "loss": 1.666, + "step": 26137 + }, + { + "epoch": 8.022713321055862, + "grad_norm": 0.17272399365901947, + "learning_rate": 9.910336547548505e-06, + "loss": 1.7017, + "step": 26138 + }, + { + "epoch": 8.023020257826888, + "grad_norm": 0.16714219748973846, + "learning_rate": 9.907366331616541e-06, + "loss": 1.7096, + "step": 26139 + }, + { + "epoch": 8.023327194597913, + "grad_norm": 0.1545754224061966, + "learning_rate": 9.90439651190348e-06, + "loss": 1.6768, + "step": 26140 + }, + { + "epoch": 8.023634131368938, + "grad_norm": 0.17502975463867188, + "learning_rate": 9.901427088438675e-06, + "loss": 1.6879, + "step": 26141 + }, + { + "epoch": 8.023941068139964, + "grad_norm": 
0.15835684537887573, + "learning_rate": 9.898458061251465e-06, + "loss": 1.6908, + "step": 26142 + }, + { + "epoch": 8.024248004910989, + "grad_norm": 0.19534549117088318, + "learning_rate": 9.895489430371202e-06, + "loss": 1.7235, + "step": 26143 + }, + { + "epoch": 8.024554941682014, + "grad_norm": 0.18291355669498444, + "learning_rate": 9.89252119582722e-06, + "loss": 1.7618, + "step": 26144 + }, + { + "epoch": 8.024861878453038, + "grad_norm": 0.1474599689245224, + "learning_rate": 9.889553357648844e-06, + "loss": 1.7011, + "step": 26145 + }, + { + "epoch": 8.025168815224063, + "grad_norm": 0.1801324188709259, + "learning_rate": 9.886585915865421e-06, + "loss": 1.7386, + "step": 26146 + }, + { + "epoch": 8.025475751995089, + "grad_norm": 0.16178105771541595, + "learning_rate": 9.883618870506245e-06, + "loss": 1.6903, + "step": 26147 + }, + { + "epoch": 8.025782688766114, + "grad_norm": 0.15138550102710724, + "learning_rate": 9.880652221600694e-06, + "loss": 1.7064, + "step": 26148 + }, + { + "epoch": 8.02608962553714, + "grad_norm": 0.22056828439235687, + "learning_rate": 9.877685969178018e-06, + "loss": 1.7879, + "step": 26149 + }, + { + "epoch": 8.026396562308165, + "grad_norm": 0.15810613334178925, + "learning_rate": 9.874720113267599e-06, + "loss": 1.6895, + "step": 26150 + }, + { + "epoch": 8.02670349907919, + "grad_norm": 0.15241321921348572, + "learning_rate": 9.871754653898685e-06, + "loss": 1.7103, + "step": 26151 + }, + { + "epoch": 8.027010435850215, + "grad_norm": 0.1609175056219101, + "learning_rate": 9.868789591100625e-06, + "loss": 1.6845, + "step": 26152 + }, + { + "epoch": 8.02731737262124, + "grad_norm": 0.16068117320537567, + "learning_rate": 9.865824924902706e-06, + "loss": 1.6688, + "step": 26153 + }, + { + "epoch": 8.027624309392266, + "grad_norm": 0.14036257565021515, + "learning_rate": 9.862860655334233e-06, + "loss": 1.6881, + "step": 26154 + }, + { + "epoch": 8.027931246163291, + "grad_norm": 0.16418461501598358, + "learning_rate": 
9.859896782424494e-06, + "loss": 1.7265, + "step": 26155 + }, + { + "epoch": 8.028238182934315, + "grad_norm": 0.19456401467323303, + "learning_rate": 9.856933306202782e-06, + "loss": 1.7152, + "step": 26156 + }, + { + "epoch": 8.02854511970534, + "grad_norm": 0.14537569880485535, + "learning_rate": 9.853970226698384e-06, + "loss": 1.6918, + "step": 26157 + }, + { + "epoch": 8.028852056476365, + "grad_norm": 0.18725928664207458, + "learning_rate": 9.851007543940578e-06, + "loss": 1.6815, + "step": 26158 + }, + { + "epoch": 8.02915899324739, + "grad_norm": 0.17676733434200287, + "learning_rate": 9.848045257958649e-06, + "loss": 1.7741, + "step": 26159 + }, + { + "epoch": 8.029465930018416, + "grad_norm": 0.1890053004026413, + "learning_rate": 9.845083368781877e-06, + "loss": 1.7433, + "step": 26160 + }, + { + "epoch": 8.029772866789441, + "grad_norm": 0.16931703686714172, + "learning_rate": 9.84212187643952e-06, + "loss": 1.7474, + "step": 26161 + }, + { + "epoch": 8.030079803560467, + "grad_norm": 0.17416565120220184, + "learning_rate": 9.839160780960855e-06, + "loss": 1.7259, + "step": 26162 + }, + { + "epoch": 8.030386740331492, + "grad_norm": 0.17702054977416992, + "learning_rate": 9.83620008237514e-06, + "loss": 1.7166, + "step": 26163 + }, + { + "epoch": 8.030693677102517, + "grad_norm": 0.1579936146736145, + "learning_rate": 9.833239780711622e-06, + "loss": 1.6593, + "step": 26164 + }, + { + "epoch": 8.031000613873543, + "grad_norm": 0.2263452112674713, + "learning_rate": 9.830279875999604e-06, + "loss": 1.7735, + "step": 26165 + }, + { + "epoch": 8.031307550644566, + "grad_norm": 0.160926952958107, + "learning_rate": 9.827320368268273e-06, + "loss": 1.7, + "step": 26166 + }, + { + "epoch": 8.031614487415592, + "grad_norm": 0.21756359934806824, + "learning_rate": 9.824361257546938e-06, + "loss": 1.736, + "step": 26167 + }, + { + "epoch": 8.031921424186617, + "grad_norm": 0.20553551614284515, + "learning_rate": 9.821402543864783e-06, + "loss": 1.7254, + 
"step": 26168 + }, + { + "epoch": 8.032228360957642, + "grad_norm": 0.14283208549022675, + "learning_rate": 9.818444227251089e-06, + "loss": 1.6532, + "step": 26169 + }, + { + "epoch": 8.032535297728668, + "grad_norm": 0.22624479234218597, + "learning_rate": 9.815486307735084e-06, + "loss": 1.7933, + "step": 26170 + }, + { + "epoch": 8.032842234499693, + "grad_norm": 0.15582896769046783, + "learning_rate": 9.812528785345999e-06, + "loss": 1.6959, + "step": 26171 + }, + { + "epoch": 8.033149171270718, + "grad_norm": 0.19829398393630981, + "learning_rate": 9.809571660113055e-06, + "loss": 1.7431, + "step": 26172 + }, + { + "epoch": 8.033456108041744, + "grad_norm": 0.1469334214925766, + "learning_rate": 9.806614932065477e-06, + "loss": 1.7441, + "step": 26173 + }, + { + "epoch": 8.033763044812769, + "grad_norm": 0.17737391591072083, + "learning_rate": 9.803658601232491e-06, + "loss": 1.719, + "step": 26174 + }, + { + "epoch": 8.034069981583794, + "grad_norm": 0.16895830631256104, + "learning_rate": 9.800702667643314e-06, + "loss": 1.7169, + "step": 26175 + }, + { + "epoch": 8.03437691835482, + "grad_norm": 0.17256470024585724, + "learning_rate": 9.79774713132715e-06, + "loss": 1.712, + "step": 26176 + }, + { + "epoch": 8.034683855125843, + "grad_norm": 0.1516820341348648, + "learning_rate": 9.794791992313213e-06, + "loss": 1.6345, + "step": 26177 + }, + { + "epoch": 8.034990791896869, + "grad_norm": 0.20021840929985046, + "learning_rate": 9.79183725063071e-06, + "loss": 1.6962, + "step": 26178 + }, + { + "epoch": 8.035297728667894, + "grad_norm": 0.19088859856128693, + "learning_rate": 9.788882906308832e-06, + "loss": 1.7719, + "step": 26179 + }, + { + "epoch": 8.03560466543892, + "grad_norm": 0.16831208765506744, + "learning_rate": 9.78592895937679e-06, + "loss": 1.7101, + "step": 26180 + }, + { + "epoch": 8.035911602209945, + "grad_norm": 0.15665093064308167, + "learning_rate": 9.782975409863749e-06, + "loss": 1.7328, + "step": 26181 + }, + { + "epoch": 
8.03621853898097, + "grad_norm": 0.20523908734321594, + "learning_rate": 9.780022257798943e-06, + "loss": 1.7338, + "step": 26182 + }, + { + "epoch": 8.036525475751995, + "grad_norm": 0.15819329023361206, + "learning_rate": 9.777069503211505e-06, + "loss": 1.7116, + "step": 26183 + }, + { + "epoch": 8.03683241252302, + "grad_norm": 0.14828373491764069, + "learning_rate": 9.774117146130673e-06, + "loss": 1.6671, + "step": 26184 + }, + { + "epoch": 8.037139349294046, + "grad_norm": 0.17743347585201263, + "learning_rate": 9.771165186585563e-06, + "loss": 1.7474, + "step": 26185 + }, + { + "epoch": 8.037446286065071, + "grad_norm": 0.14112113416194916, + "learning_rate": 9.768213624605388e-06, + "loss": 1.6324, + "step": 26186 + }, + { + "epoch": 8.037753222836097, + "grad_norm": 0.14532047510147095, + "learning_rate": 9.76526246021931e-06, + "loss": 1.6814, + "step": 26187 + }, + { + "epoch": 8.03806015960712, + "grad_norm": 0.16272012889385223, + "learning_rate": 9.762311693456489e-06, + "loss": 1.6556, + "step": 26188 + }, + { + "epoch": 8.038367096378146, + "grad_norm": 0.17599201202392578, + "learning_rate": 9.759361324346088e-06, + "loss": 1.7186, + "step": 26189 + }, + { + "epoch": 8.03867403314917, + "grad_norm": 0.20449498295783997, + "learning_rate": 9.75641135291726e-06, + "loss": 1.7324, + "step": 26190 + }, + { + "epoch": 8.038980969920196, + "grad_norm": 0.1787404716014862, + "learning_rate": 9.753461779199168e-06, + "loss": 1.7038, + "step": 26191 + }, + { + "epoch": 8.039287906691222, + "grad_norm": 0.15954211354255676, + "learning_rate": 9.750512603220956e-06, + "loss": 1.6926, + "step": 26192 + }, + { + "epoch": 8.039594843462247, + "grad_norm": 0.21806633472442627, + "learning_rate": 9.747563825011768e-06, + "loss": 1.7317, + "step": 26193 + }, + { + "epoch": 8.039901780233272, + "grad_norm": 0.14846986532211304, + "learning_rate": 9.744615444600746e-06, + "loss": 1.655, + "step": 26194 + }, + { + "epoch": 8.040208717004298, + "grad_norm": 
0.17799098789691925, + "learning_rate": 9.74166746201703e-06, + "loss": 1.6899, + "step": 26195 + }, + { + "epoch": 8.040515653775323, + "grad_norm": 0.1648644655942917, + "learning_rate": 9.738719877289754e-06, + "loss": 1.7181, + "step": 26196 + }, + { + "epoch": 8.040822590546348, + "grad_norm": 0.17811881005764008, + "learning_rate": 9.735772690448042e-06, + "loss": 1.7257, + "step": 26197 + }, + { + "epoch": 8.041129527317372, + "grad_norm": 0.19059741497039795, + "learning_rate": 9.732825901521014e-06, + "loss": 1.7306, + "step": 26198 + }, + { + "epoch": 8.041436464088397, + "grad_norm": 0.17326456308364868, + "learning_rate": 9.729879510537825e-06, + "loss": 1.6922, + "step": 26199 + }, + { + "epoch": 8.041743400859422, + "grad_norm": 0.1428811252117157, + "learning_rate": 9.726933517527548e-06, + "loss": 1.6495, + "step": 26200 + }, + { + "epoch": 8.042050337630448, + "grad_norm": 0.1494823843240738, + "learning_rate": 9.72398792251934e-06, + "loss": 1.6779, + "step": 26201 + }, + { + "epoch": 8.042357274401473, + "grad_norm": 0.19112205505371094, + "learning_rate": 9.721042725542267e-06, + "loss": 1.7794, + "step": 26202 + }, + { + "epoch": 8.042664211172498, + "grad_norm": 0.15820644795894623, + "learning_rate": 9.718097926625468e-06, + "loss": 1.6834, + "step": 26203 + }, + { + "epoch": 8.042971147943524, + "grad_norm": 0.17020943760871887, + "learning_rate": 9.715153525798043e-06, + "loss": 1.6852, + "step": 26204 + }, + { + "epoch": 8.043278084714549, + "grad_norm": 0.18933680653572083, + "learning_rate": 9.712209523089072e-06, + "loss": 1.7412, + "step": 26205 + }, + { + "epoch": 8.043585021485574, + "grad_norm": 0.16407641768455505, + "learning_rate": 9.709265918527666e-06, + "loss": 1.7209, + "step": 26206 + }, + { + "epoch": 8.0438919582566, + "grad_norm": 0.19043506681919098, + "learning_rate": 9.706322712142912e-06, + "loss": 1.7351, + "step": 26207 + }, + { + "epoch": 8.044198895027625, + "grad_norm": 0.14904475212097168, + "learning_rate": 
9.703379903963889e-06, + "loss": 1.7484, + "step": 26208 + }, + { + "epoch": 8.044505831798649, + "grad_norm": 0.14778849482536316, + "learning_rate": 9.700437494019682e-06, + "loss": 1.7231, + "step": 26209 + }, + { + "epoch": 8.044812768569674, + "grad_norm": 0.186212420463562, + "learning_rate": 9.697495482339374e-06, + "loss": 1.7153, + "step": 26210 + }, + { + "epoch": 8.0451197053407, + "grad_norm": 0.13795694708824158, + "learning_rate": 9.694553868952044e-06, + "loss": 1.693, + "step": 26211 + }, + { + "epoch": 8.045426642111725, + "grad_norm": 0.16083405911922455, + "learning_rate": 9.69161265388675e-06, + "loss": 1.669, + "step": 26212 + }, + { + "epoch": 8.04573357888275, + "grad_norm": 0.15548262000083923, + "learning_rate": 9.688671837172569e-06, + "loss": 1.7265, + "step": 26213 + }, + { + "epoch": 8.046040515653775, + "grad_norm": 0.14771351218223572, + "learning_rate": 9.685731418838556e-06, + "loss": 1.6978, + "step": 26214 + }, + { + "epoch": 8.0463474524248, + "grad_norm": 0.1525130569934845, + "learning_rate": 9.682791398913765e-06, + "loss": 1.731, + "step": 26215 + }, + { + "epoch": 8.046654389195826, + "grad_norm": 0.16103293001651764, + "learning_rate": 9.679851777427284e-06, + "loss": 1.7015, + "step": 26216 + }, + { + "epoch": 8.046961325966851, + "grad_norm": 0.16990229487419128, + "learning_rate": 9.676912554408112e-06, + "loss": 1.6995, + "step": 26217 + }, + { + "epoch": 8.047268262737877, + "grad_norm": 0.14605717360973358, + "learning_rate": 9.673973729885355e-06, + "loss": 1.7085, + "step": 26218 + }, + { + "epoch": 8.047575199508902, + "grad_norm": 0.19646432995796204, + "learning_rate": 9.671035303887993e-06, + "loss": 1.8441, + "step": 26219 + }, + { + "epoch": 8.047882136279926, + "grad_norm": 0.2000361531972885, + "learning_rate": 9.668097276445115e-06, + "loss": 1.7126, + "step": 26220 + }, + { + "epoch": 8.04818907305095, + "grad_norm": 0.2262575775384903, + "learning_rate": 9.665159647585736e-06, + "loss": 1.7721, + "step": 
26221 + }, + { + "epoch": 8.048496009821976, + "grad_norm": 0.1880655288696289, + "learning_rate": 9.662222417338895e-06, + "loss": 1.7151, + "step": 26222 + }, + { + "epoch": 8.048802946593002, + "grad_norm": 0.1746743619441986, + "learning_rate": 9.659285585733613e-06, + "loss": 1.6745, + "step": 26223 + }, + { + "epoch": 8.049109883364027, + "grad_norm": 0.14917364716529846, + "learning_rate": 9.656349152798916e-06, + "loss": 1.6541, + "step": 26224 + }, + { + "epoch": 8.049416820135052, + "grad_norm": 0.18189994990825653, + "learning_rate": 9.65341311856382e-06, + "loss": 1.7361, + "step": 26225 + }, + { + "epoch": 8.049723756906078, + "grad_norm": 0.16237786412239075, + "learning_rate": 9.650477483057346e-06, + "loss": 1.7446, + "step": 26226 + }, + { + "epoch": 8.050030693677103, + "grad_norm": 0.1651264876127243, + "learning_rate": 9.647542246308506e-06, + "loss": 1.7604, + "step": 26227 + }, + { + "epoch": 8.050337630448128, + "grad_norm": 0.1673632264137268, + "learning_rate": 9.644607408346296e-06, + "loss": 1.678, + "step": 26228 + }, + { + "epoch": 8.050644567219154, + "grad_norm": 0.20457343757152557, + "learning_rate": 9.641672969199738e-06, + "loss": 1.6963, + "step": 26229 + }, + { + "epoch": 8.050951503990179, + "grad_norm": 0.15247805416584015, + "learning_rate": 9.638738928897816e-06, + "loss": 1.7036, + "step": 26230 + }, + { + "epoch": 8.051258440761202, + "grad_norm": 0.21655996143817902, + "learning_rate": 9.635805287469535e-06, + "loss": 1.7422, + "step": 26231 + }, + { + "epoch": 8.051565377532228, + "grad_norm": 0.1631101369857788, + "learning_rate": 9.632872044943869e-06, + "loss": 1.6681, + "step": 26232 + }, + { + "epoch": 8.051872314303253, + "grad_norm": 0.18587349355220795, + "learning_rate": 9.629939201349853e-06, + "loss": 1.7036, + "step": 26233 + }, + { + "epoch": 8.052179251074278, + "grad_norm": 0.272533655166626, + "learning_rate": 9.627006756716405e-06, + "loss": 1.818, + "step": 26234 + }, + { + "epoch": 8.052486187845304, + 
"grad_norm": 0.1740235984325409, + "learning_rate": 9.624074711072572e-06, + "loss": 1.7074, + "step": 26235 + }, + { + "epoch": 8.05279312461633, + "grad_norm": 0.21405693888664246, + "learning_rate": 9.621143064447274e-06, + "loss": 1.7473, + "step": 26236 + }, + { + "epoch": 8.053100061387354, + "grad_norm": 0.172579824924469, + "learning_rate": 9.618211816869515e-06, + "loss": 1.7154, + "step": 26237 + }, + { + "epoch": 8.05340699815838, + "grad_norm": 0.19767756760120392, + "learning_rate": 9.615280968368257e-06, + "loss": 1.7011, + "step": 26238 + }, + { + "epoch": 8.053713934929405, + "grad_norm": 0.18467654287815094, + "learning_rate": 9.612350518972463e-06, + "loss": 1.6922, + "step": 26239 + }, + { + "epoch": 8.05402087170043, + "grad_norm": 0.1530679613351822, + "learning_rate": 9.609420468711088e-06, + "loss": 1.6633, + "step": 26240 + }, + { + "epoch": 8.054327808471454, + "grad_norm": 0.3850557804107666, + "learning_rate": 9.6064908176131e-06, + "loss": 1.7637, + "step": 26241 + }, + { + "epoch": 8.05463474524248, + "grad_norm": 0.1556573212146759, + "learning_rate": 9.603561565707441e-06, + "loss": 1.6853, + "step": 26242 + }, + { + "epoch": 8.054941682013505, + "grad_norm": 0.2009180188179016, + "learning_rate": 9.600632713023067e-06, + "loss": 1.7172, + "step": 26243 + }, + { + "epoch": 8.05524861878453, + "grad_norm": 0.18538115918636322, + "learning_rate": 9.597704259588919e-06, + "loss": 1.7517, + "step": 26244 + }, + { + "epoch": 8.055555555555555, + "grad_norm": 0.1626463681459427, + "learning_rate": 9.594776205433936e-06, + "loss": 1.697, + "step": 26245 + }, + { + "epoch": 8.05586249232658, + "grad_norm": 0.15908029675483704, + "learning_rate": 9.591848550587062e-06, + "loss": 1.7355, + "step": 26246 + }, + { + "epoch": 8.056169429097606, + "grad_norm": 0.1679108589887619, + "learning_rate": 9.588921295077219e-06, + "loss": 1.6732, + "step": 26247 + }, + { + "epoch": 8.056476365868631, + "grad_norm": 0.17123237252235413, + "learning_rate": 
9.585994438933344e-06, + "loss": 1.7627, + "step": 26248 + }, + { + "epoch": 8.056783302639657, + "grad_norm": 0.2438436597585678, + "learning_rate": 9.583067982184346e-06, + "loss": 1.7475, + "step": 26249 + }, + { + "epoch": 8.057090239410682, + "grad_norm": 0.18769577145576477, + "learning_rate": 9.580141924859182e-06, + "loss": 1.7165, + "step": 26250 + }, + { + "epoch": 8.057397176181707, + "grad_norm": 0.18146662414073944, + "learning_rate": 9.577216266986727e-06, + "loss": 1.7601, + "step": 26251 + }, + { + "epoch": 8.057704112952731, + "grad_norm": 0.20209676027297974, + "learning_rate": 9.574291008595932e-06, + "loss": 1.7635, + "step": 26252 + }, + { + "epoch": 8.058011049723756, + "grad_norm": 0.16949260234832764, + "learning_rate": 9.571366149715665e-06, + "loss": 1.7437, + "step": 26253 + }, + { + "epoch": 8.058317986494782, + "grad_norm": 0.14449356496334076, + "learning_rate": 9.568441690374868e-06, + "loss": 1.6906, + "step": 26254 + }, + { + "epoch": 8.058624923265807, + "grad_norm": 0.21796976029872894, + "learning_rate": 9.565517630602428e-06, + "loss": 1.7986, + "step": 26255 + }, + { + "epoch": 8.058931860036832, + "grad_norm": 0.15194009244441986, + "learning_rate": 9.562593970427241e-06, + "loss": 1.6838, + "step": 26256 + }, + { + "epoch": 8.059238796807858, + "grad_norm": 0.19820080697536469, + "learning_rate": 9.559670709878198e-06, + "loss": 1.7327, + "step": 26257 + }, + { + "epoch": 8.059545733578883, + "grad_norm": 0.1478637307882309, + "learning_rate": 9.5567478489842e-06, + "loss": 1.6814, + "step": 26258 + }, + { + "epoch": 8.059852670349908, + "grad_norm": 0.147980734705925, + "learning_rate": 9.553825387774118e-06, + "loss": 1.693, + "step": 26259 + }, + { + "epoch": 8.060159607120934, + "grad_norm": 0.16274768114089966, + "learning_rate": 9.550903326276839e-06, + "loss": 1.7275, + "step": 26260 + }, + { + "epoch": 8.060466543891959, + "grad_norm": 0.16221144795417786, + "learning_rate": 9.547981664521244e-06, + "loss": 1.7071, + 
"step": 26261 + }, + { + "epoch": 8.060773480662984, + "grad_norm": 0.18921487033367157, + "learning_rate": 9.545060402536204e-06, + "loss": 1.6771, + "step": 26262 + }, + { + "epoch": 8.061080417434008, + "grad_norm": 0.19136327505111694, + "learning_rate": 9.542139540350586e-06, + "loss": 1.7235, + "step": 26263 + }, + { + "epoch": 8.061387354205033, + "grad_norm": 0.18764656782150269, + "learning_rate": 9.539219077993261e-06, + "loss": 1.7374, + "step": 26264 + }, + { + "epoch": 8.061694290976058, + "grad_norm": 0.16516967117786407, + "learning_rate": 9.53629901549309e-06, + "loss": 1.7124, + "step": 26265 + }, + { + "epoch": 8.062001227747084, + "grad_norm": 0.1457880437374115, + "learning_rate": 9.533379352878907e-06, + "loss": 1.6471, + "step": 26266 + }, + { + "epoch": 8.06230816451811, + "grad_norm": 0.1898411363363266, + "learning_rate": 9.530460090179622e-06, + "loss": 1.7745, + "step": 26267 + }, + { + "epoch": 8.062615101289135, + "grad_norm": 0.18252579867839813, + "learning_rate": 9.52754122742402e-06, + "loss": 1.7165, + "step": 26268 + }, + { + "epoch": 8.06292203806016, + "grad_norm": 0.1838676929473877, + "learning_rate": 9.524622764641006e-06, + "loss": 1.7169, + "step": 26269 + }, + { + "epoch": 8.063228974831185, + "grad_norm": 0.1684531718492508, + "learning_rate": 9.521704701859362e-06, + "loss": 1.6831, + "step": 26270 + }, + { + "epoch": 8.06353591160221, + "grad_norm": 0.18296435475349426, + "learning_rate": 9.51878703910798e-06, + "loss": 1.6952, + "step": 26271 + }, + { + "epoch": 8.063842848373236, + "grad_norm": 0.20634715259075165, + "learning_rate": 9.515869776415665e-06, + "loss": 1.6899, + "step": 26272 + }, + { + "epoch": 8.06414978514426, + "grad_norm": 0.18681001663208008, + "learning_rate": 9.512952913811252e-06, + "loss": 1.6648, + "step": 26273 + }, + { + "epoch": 8.064456721915285, + "grad_norm": 0.19397646188735962, + "learning_rate": 9.510036451323568e-06, + "loss": 1.7309, + "step": 26274 + }, + { + "epoch": 
8.06476365868631, + "grad_norm": 0.17254865169525146, + "learning_rate": 9.507120388981438e-06, + "loss": 1.6671, + "step": 26275 + }, + { + "epoch": 8.065070595457335, + "grad_norm": 0.16224531829357147, + "learning_rate": 9.504204726813682e-06, + "loss": 1.6881, + "step": 26276 + }, + { + "epoch": 8.06537753222836, + "grad_norm": 0.16534289717674255, + "learning_rate": 9.501289464849106e-06, + "loss": 1.7372, + "step": 26277 + }, + { + "epoch": 8.065684468999386, + "grad_norm": 0.20247776806354523, + "learning_rate": 9.498374603116523e-06, + "loss": 1.7108, + "step": 26278 + }, + { + "epoch": 8.065991405770411, + "grad_norm": 0.1420232504606247, + "learning_rate": 9.49546014164474e-06, + "loss": 1.6403, + "step": 26279 + }, + { + "epoch": 8.066298342541437, + "grad_norm": 0.139396533370018, + "learning_rate": 9.492546080462567e-06, + "loss": 1.6578, + "step": 26280 + }, + { + "epoch": 8.066605279312462, + "grad_norm": 0.17437872290611267, + "learning_rate": 9.489632419598788e-06, + "loss": 1.7094, + "step": 26281 + }, + { + "epoch": 8.066912216083487, + "grad_norm": 0.29614368081092834, + "learning_rate": 9.486719159082209e-06, + "loss": 1.773, + "step": 26282 + }, + { + "epoch": 8.067219152854513, + "grad_norm": 0.20771834254264832, + "learning_rate": 9.483806298941617e-06, + "loss": 1.7421, + "step": 26283 + }, + { + "epoch": 8.067526089625536, + "grad_norm": 0.20772570371627808, + "learning_rate": 9.4808938392058e-06, + "loss": 1.7437, + "step": 26284 + }, + { + "epoch": 8.067833026396562, + "grad_norm": 0.1837359070777893, + "learning_rate": 9.477981779903522e-06, + "loss": 1.7142, + "step": 26285 + }, + { + "epoch": 8.068139963167587, + "grad_norm": 0.18425285816192627, + "learning_rate": 9.475070121063607e-06, + "loss": 1.6804, + "step": 26286 + }, + { + "epoch": 8.068446899938612, + "grad_norm": 0.16501453518867493, + "learning_rate": 9.472158862714775e-06, + "loss": 1.7466, + "step": 26287 + }, + { + "epoch": 8.068753836709638, + "grad_norm": 
0.17685455083847046, + "learning_rate": 9.469248004885839e-06, + "loss": 1.6839, + "step": 26288 + }, + { + "epoch": 8.069060773480663, + "grad_norm": 0.18923965096473694, + "learning_rate": 9.466337547605547e-06, + "loss": 1.6774, + "step": 26289 + }, + { + "epoch": 8.069367710251688, + "grad_norm": 0.17584268748760223, + "learning_rate": 9.463427490902665e-06, + "loss": 1.6904, + "step": 26290 + }, + { + "epoch": 8.069674647022714, + "grad_norm": 0.25477278232574463, + "learning_rate": 9.460517834805966e-06, + "loss": 1.7898, + "step": 26291 + }, + { + "epoch": 8.069981583793739, + "grad_norm": 0.23453976213932037, + "learning_rate": 9.457608579344169e-06, + "loss": 1.7456, + "step": 26292 + }, + { + "epoch": 8.070288520564764, + "grad_norm": 0.20332537591457367, + "learning_rate": 9.45469972454605e-06, + "loss": 1.76, + "step": 26293 + }, + { + "epoch": 8.07059545733579, + "grad_norm": 0.1937316656112671, + "learning_rate": 9.451791270440358e-06, + "loss": 1.698, + "step": 26294 + }, + { + "epoch": 8.070902394106813, + "grad_norm": 0.19909465312957764, + "learning_rate": 9.448883217055832e-06, + "loss": 1.7373, + "step": 26295 + }, + { + "epoch": 8.071209330877839, + "grad_norm": 0.16824916005134583, + "learning_rate": 9.445975564421206e-06, + "loss": 1.6619, + "step": 26296 + }, + { + "epoch": 8.071516267648864, + "grad_norm": 0.17873473465442657, + "learning_rate": 9.443068312565222e-06, + "loss": 1.7438, + "step": 26297 + }, + { + "epoch": 8.07182320441989, + "grad_norm": 0.152094304561615, + "learning_rate": 9.440161461516606e-06, + "loss": 1.6513, + "step": 26298 + }, + { + "epoch": 8.072130141190915, + "grad_norm": 0.14592084288597107, + "learning_rate": 9.43725501130409e-06, + "loss": 1.6503, + "step": 26299 + }, + { + "epoch": 8.07243707796194, + "grad_norm": 0.16904598474502563, + "learning_rate": 9.434348961956396e-06, + "loss": 1.6929, + "step": 26300 + }, + { + "epoch": 8.072744014732965, + "grad_norm": 0.15297052264213562, + "learning_rate": 
9.431443313502235e-06, + "loss": 1.6871, + "step": 26301 + }, + { + "epoch": 8.07305095150399, + "grad_norm": 0.20306609570980072, + "learning_rate": 9.428538065970321e-06, + "loss": 1.7779, + "step": 26302 + }, + { + "epoch": 8.073357888275016, + "grad_norm": 0.177826926112175, + "learning_rate": 9.425633219389401e-06, + "loss": 1.7021, + "step": 26303 + }, + { + "epoch": 8.073664825046041, + "grad_norm": 0.22192324697971344, + "learning_rate": 9.422728773788125e-06, + "loss": 1.7713, + "step": 26304 + }, + { + "epoch": 8.073971761817067, + "grad_norm": 0.16998204588890076, + "learning_rate": 9.419824729195253e-06, + "loss": 1.6994, + "step": 26305 + }, + { + "epoch": 8.07427869858809, + "grad_norm": 0.1606592983007431, + "learning_rate": 9.416921085639436e-06, + "loss": 1.7274, + "step": 26306 + }, + { + "epoch": 8.074585635359115, + "grad_norm": 0.17434780299663544, + "learning_rate": 9.414017843149398e-06, + "loss": 1.714, + "step": 26307 + }, + { + "epoch": 8.07489257213014, + "grad_norm": 0.16548825800418854, + "learning_rate": 9.411115001753839e-06, + "loss": 1.7361, + "step": 26308 + }, + { + "epoch": 8.075199508901166, + "grad_norm": 0.23958922922611237, + "learning_rate": 9.408212561481405e-06, + "loss": 1.7286, + "step": 26309 + }, + { + "epoch": 8.075506445672191, + "grad_norm": 0.1900513619184494, + "learning_rate": 9.405310522360821e-06, + "loss": 1.7309, + "step": 26310 + }, + { + "epoch": 8.075813382443217, + "grad_norm": 0.1576761156320572, + "learning_rate": 9.402408884420755e-06, + "loss": 1.7039, + "step": 26311 + }, + { + "epoch": 8.076120319214242, + "grad_norm": 0.17078427970409393, + "learning_rate": 9.399507647689875e-06, + "loss": 1.737, + "step": 26312 + }, + { + "epoch": 8.076427255985267, + "grad_norm": 0.138477623462677, + "learning_rate": 9.396606812196856e-06, + "loss": 1.6673, + "step": 26313 + }, + { + "epoch": 8.076734192756293, + "grad_norm": 0.1546505093574524, + "learning_rate": 9.393706377970368e-06, + "loss": 1.7146, + 
"step": 26314 + }, + { + "epoch": 8.077041129527318, + "grad_norm": 0.14440344274044037, + "learning_rate": 9.390806345039077e-06, + "loss": 1.7044, + "step": 26315 + }, + { + "epoch": 8.077348066298342, + "grad_norm": 0.1944594532251358, + "learning_rate": 9.387906713431632e-06, + "loss": 1.7685, + "step": 26316 + }, + { + "epoch": 8.077655003069367, + "grad_norm": 0.17758207023143768, + "learning_rate": 9.385007483176706e-06, + "loss": 1.7068, + "step": 26317 + }, + { + "epoch": 8.077961939840392, + "grad_norm": 0.20713698863983154, + "learning_rate": 9.382108654302934e-06, + "loss": 1.6488, + "step": 26318 + }, + { + "epoch": 8.078268876611418, + "grad_norm": 0.14699894189834595, + "learning_rate": 9.379210226838958e-06, + "loss": 1.6746, + "step": 26319 + }, + { + "epoch": 8.078575813382443, + "grad_norm": 0.15119978785514832, + "learning_rate": 9.376312200813465e-06, + "loss": 1.6919, + "step": 26320 + }, + { + "epoch": 8.078882750153468, + "grad_norm": 0.14071249961853027, + "learning_rate": 9.373414576255041e-06, + "loss": 1.6755, + "step": 26321 + }, + { + "epoch": 8.079189686924494, + "grad_norm": 0.22004422545433044, + "learning_rate": 9.370517353192365e-06, + "loss": 1.7808, + "step": 26322 + }, + { + "epoch": 8.079496623695519, + "grad_norm": 0.15764497220516205, + "learning_rate": 9.36762053165403e-06, + "loss": 1.7108, + "step": 26323 + }, + { + "epoch": 8.079803560466544, + "grad_norm": 0.17802847921848297, + "learning_rate": 9.364724111668693e-06, + "loss": 1.7274, + "step": 26324 + }, + { + "epoch": 8.08011049723757, + "grad_norm": 0.16950444877147675, + "learning_rate": 9.361828093264984e-06, + "loss": 1.7196, + "step": 26325 + }, + { + "epoch": 8.080417434008595, + "grad_norm": 0.16647809743881226, + "learning_rate": 9.358932476471488e-06, + "loss": 1.7027, + "step": 26326 + }, + { + "epoch": 8.080724370779619, + "grad_norm": 0.20012708008289337, + "learning_rate": 9.356037261316863e-06, + "loss": 1.7101, + "step": 26327 + }, + { + "epoch": 
8.081031307550644, + "grad_norm": 0.19795066118240356, + "learning_rate": 9.353142447829672e-06, + "loss": 1.7142, + "step": 26328 + }, + { + "epoch": 8.08133824432167, + "grad_norm": 0.1786295473575592, + "learning_rate": 9.350248036038567e-06, + "loss": 1.6646, + "step": 26329 + }, + { + "epoch": 8.081645181092695, + "grad_norm": 0.17646436393260956, + "learning_rate": 9.347354025972138e-06, + "loss": 1.7044, + "step": 26330 + }, + { + "epoch": 8.08195211786372, + "grad_norm": 0.24095231294631958, + "learning_rate": 9.344460417658979e-06, + "loss": 1.823, + "step": 26331 + }, + { + "epoch": 8.082259054634745, + "grad_norm": 0.16094247996807098, + "learning_rate": 9.341567211127694e-06, + "loss": 1.6933, + "step": 26332 + }, + { + "epoch": 8.08256599140577, + "grad_norm": 0.22386589646339417, + "learning_rate": 9.338674406406872e-06, + "loss": 1.7219, + "step": 26333 + }, + { + "epoch": 8.082872928176796, + "grad_norm": 0.2110683023929596, + "learning_rate": 9.3357820035251e-06, + "loss": 1.6951, + "step": 26334 + }, + { + "epoch": 8.083179864947821, + "grad_norm": 0.2240242063999176, + "learning_rate": 9.33289000251097e-06, + "loss": 1.756, + "step": 26335 + }, + { + "epoch": 8.083486801718847, + "grad_norm": 0.19035838544368744, + "learning_rate": 9.329998403393036e-06, + "loss": 1.7657, + "step": 26336 + }, + { + "epoch": 8.083793738489872, + "grad_norm": 0.20213502645492554, + "learning_rate": 9.327107206199925e-06, + "loss": 1.6938, + "step": 26337 + }, + { + "epoch": 8.084100675260895, + "grad_norm": 0.20297139883041382, + "learning_rate": 9.324216410960157e-06, + "loss": 1.7476, + "step": 26338 + }, + { + "epoch": 8.08440761203192, + "grad_norm": 0.23968154191970825, + "learning_rate": 9.321326017702348e-06, + "loss": 1.7418, + "step": 26339 + }, + { + "epoch": 8.084714548802946, + "grad_norm": 0.19853347539901733, + "learning_rate": 9.318436026455008e-06, + "loss": 1.6943, + "step": 26340 + }, + { + "epoch": 8.085021485573971, + "grad_norm": 
0.1835598647594452, + "learning_rate": 9.315546437246742e-06, + "loss": 1.7071, + "step": 26341 + }, + { + "epoch": 8.085328422344997, + "grad_norm": 0.22876964509487152, + "learning_rate": 9.312657250106106e-06, + "loss": 1.7717, + "step": 26342 + }, + { + "epoch": 8.085635359116022, + "grad_norm": 0.1632407158613205, + "learning_rate": 9.309768465061613e-06, + "loss": 1.6506, + "step": 26343 + }, + { + "epoch": 8.085942295887047, + "grad_norm": 0.1812858134508133, + "learning_rate": 9.306880082141861e-06, + "loss": 1.6826, + "step": 26344 + }, + { + "epoch": 8.086249232658073, + "grad_norm": 0.24607063829898834, + "learning_rate": 9.303992101375347e-06, + "loss": 1.7109, + "step": 26345 + }, + { + "epoch": 8.086556169429098, + "grad_norm": 0.1401972472667694, + "learning_rate": 9.301104522790648e-06, + "loss": 1.6612, + "step": 26346 + }, + { + "epoch": 8.086863106200123, + "grad_norm": 0.22876517474651337, + "learning_rate": 9.298217346416287e-06, + "loss": 1.6857, + "step": 26347 + }, + { + "epoch": 8.087170042971149, + "grad_norm": 0.22353915870189667, + "learning_rate": 9.295330572280803e-06, + "loss": 1.7071, + "step": 26348 + }, + { + "epoch": 8.087476979742172, + "grad_norm": 0.22349561750888824, + "learning_rate": 9.292444200412715e-06, + "loss": 1.7098, + "step": 26349 + }, + { + "epoch": 8.087783916513198, + "grad_norm": 0.17078392207622528, + "learning_rate": 9.289558230840556e-06, + "loss": 1.6732, + "step": 26350 + }, + { + "epoch": 8.088090853284223, + "grad_norm": 0.19569413363933563, + "learning_rate": 9.286672663592843e-06, + "loss": 1.7489, + "step": 26351 + }, + { + "epoch": 8.088397790055248, + "grad_norm": 0.1565880924463272, + "learning_rate": 9.283787498698093e-06, + "loss": 1.6984, + "step": 26352 + }, + { + "epoch": 8.088704726826274, + "grad_norm": 0.21362969279289246, + "learning_rate": 9.28090273618481e-06, + "loss": 1.7157, + "step": 26353 + }, + { + "epoch": 8.089011663597299, + "grad_norm": 0.15077799558639526, + "learning_rate": 
9.278018376081532e-06, + "loss": 1.707, + "step": 26354 + }, + { + "epoch": 8.089318600368324, + "grad_norm": 0.19006888568401337, + "learning_rate": 9.27513441841672e-06, + "loss": 1.7379, + "step": 26355 + }, + { + "epoch": 8.08962553713935, + "grad_norm": 0.17935799062252045, + "learning_rate": 9.272250863218928e-06, + "loss": 1.7529, + "step": 26356 + }, + { + "epoch": 8.089932473910375, + "grad_norm": 0.1539749801158905, + "learning_rate": 9.269367710516596e-06, + "loss": 1.6717, + "step": 26357 + }, + { + "epoch": 8.0902394106814, + "grad_norm": 0.20954270660877228, + "learning_rate": 9.266484960338262e-06, + "loss": 1.7511, + "step": 26358 + }, + { + "epoch": 8.090546347452424, + "grad_norm": 0.1744573712348938, + "learning_rate": 9.263602612712408e-06, + "loss": 1.747, + "step": 26359 + }, + { + "epoch": 8.09085328422345, + "grad_norm": 0.198909193277359, + "learning_rate": 9.260720667667482e-06, + "loss": 1.6854, + "step": 26360 + }, + { + "epoch": 8.091160220994475, + "grad_norm": 0.16504423320293427, + "learning_rate": 9.25783912523202e-06, + "loss": 1.7346, + "step": 26361 + }, + { + "epoch": 8.0914671577655, + "grad_norm": 0.16309323906898499, + "learning_rate": 9.254957985434449e-06, + "loss": 1.695, + "step": 26362 + }, + { + "epoch": 8.091774094536525, + "grad_norm": 0.178558811545372, + "learning_rate": 9.25207724830327e-06, + "loss": 1.7091, + "step": 26363 + }, + { + "epoch": 8.09208103130755, + "grad_norm": 0.1758749783039093, + "learning_rate": 9.249196913866954e-06, + "loss": 1.732, + "step": 26364 + }, + { + "epoch": 8.092387968078576, + "grad_norm": 0.16251471638679504, + "learning_rate": 9.246316982153957e-06, + "loss": 1.6783, + "step": 26365 + }, + { + "epoch": 8.092694904849601, + "grad_norm": 0.1818319857120514, + "learning_rate": 9.243437453192739e-06, + "loss": 1.7208, + "step": 26366 + }, + { + "epoch": 8.093001841620627, + "grad_norm": 0.2009693682193756, + "learning_rate": 9.240558327011761e-06, + "loss": 1.7345, + "step": 26367 + 
}, + { + "epoch": 8.093308778391652, + "grad_norm": 0.19003108143806458, + "learning_rate": 9.237679603639477e-06, + "loss": 1.7141, + "step": 26368 + }, + { + "epoch": 8.093615715162677, + "grad_norm": 0.19530169665813446, + "learning_rate": 9.234801283104338e-06, + "loss": 1.6945, + "step": 26369 + }, + { + "epoch": 8.0939226519337, + "grad_norm": 0.14184506237506866, + "learning_rate": 9.231923365434769e-06, + "loss": 1.6484, + "step": 26370 + }, + { + "epoch": 8.094229588704726, + "grad_norm": 0.14682452380657196, + "learning_rate": 9.229045850659252e-06, + "loss": 1.6534, + "step": 26371 + }, + { + "epoch": 8.094536525475752, + "grad_norm": 0.21143727004528046, + "learning_rate": 9.22616873880618e-06, + "loss": 1.7439, + "step": 26372 + }, + { + "epoch": 8.094843462246777, + "grad_norm": 0.1664114147424698, + "learning_rate": 9.223292029904029e-06, + "loss": 1.7568, + "step": 26373 + }, + { + "epoch": 8.095150399017802, + "grad_norm": 0.17671625316143036, + "learning_rate": 9.22041572398118e-06, + "loss": 1.6594, + "step": 26374 + }, + { + "epoch": 8.095457335788828, + "grad_norm": 0.1968437135219574, + "learning_rate": 9.217539821066101e-06, + "loss": 1.734, + "step": 26375 + }, + { + "epoch": 8.095764272559853, + "grad_norm": 0.18740740418434143, + "learning_rate": 9.214664321187206e-06, + "loss": 1.7223, + "step": 26376 + }, + { + "epoch": 8.096071209330878, + "grad_norm": 0.16954728960990906, + "learning_rate": 9.21178922437288e-06, + "loss": 1.7282, + "step": 26377 + }, + { + "epoch": 8.096378146101904, + "grad_norm": 0.1979333609342575, + "learning_rate": 9.20891453065158e-06, + "loss": 1.7254, + "step": 26378 + }, + { + "epoch": 8.096685082872929, + "grad_norm": 0.1495361626148224, + "learning_rate": 9.206040240051677e-06, + "loss": 1.6936, + "step": 26379 + }, + { + "epoch": 8.096992019643954, + "grad_norm": 0.159287691116333, + "learning_rate": 9.203166352601605e-06, + "loss": 1.6658, + "step": 26380 + }, + { + "epoch": 8.097298956414978, + 
"grad_norm": 0.175196573138237, + "learning_rate": 9.200292868329751e-06, + "loss": 1.7779, + "step": 26381 + }, + { + "epoch": 8.097605893186003, + "grad_norm": 0.17131435871124268, + "learning_rate": 9.197419787264522e-06, + "loss": 1.7435, + "step": 26382 + }, + { + "epoch": 8.097912829957028, + "grad_norm": 0.14529173076152802, + "learning_rate": 9.194547109434299e-06, + "loss": 1.7083, + "step": 26383 + }, + { + "epoch": 8.098219766728054, + "grad_norm": 0.1824452430009842, + "learning_rate": 9.191674834867482e-06, + "loss": 1.7134, + "step": 26384 + }, + { + "epoch": 8.098526703499079, + "grad_norm": 0.18507611751556396, + "learning_rate": 9.188802963592453e-06, + "loss": 1.673, + "step": 26385 + }, + { + "epoch": 8.098833640270104, + "grad_norm": 0.19102542102336884, + "learning_rate": 9.185931495637595e-06, + "loss": 1.7058, + "step": 26386 + }, + { + "epoch": 8.09914057704113, + "grad_norm": 0.17001433670520782, + "learning_rate": 9.183060431031271e-06, + "loss": 1.6827, + "step": 26387 + }, + { + "epoch": 8.099447513812155, + "grad_norm": 0.1718425452709198, + "learning_rate": 9.18018976980189e-06, + "loss": 1.7375, + "step": 26388 + }, + { + "epoch": 8.09975445058318, + "grad_norm": 0.15681782364845276, + "learning_rate": 9.177319511977772e-06, + "loss": 1.6989, + "step": 26389 + }, + { + "epoch": 8.100061387354206, + "grad_norm": 0.156332865357399, + "learning_rate": 9.174449657587341e-06, + "loss": 1.7229, + "step": 26390 + }, + { + "epoch": 8.10036832412523, + "grad_norm": 0.2014407366514206, + "learning_rate": 9.171580206658898e-06, + "loss": 1.7589, + "step": 26391 + }, + { + "epoch": 8.100675260896255, + "grad_norm": 0.16946980357170105, + "learning_rate": 9.168711159220845e-06, + "loss": 1.7053, + "step": 26392 + }, + { + "epoch": 8.10098219766728, + "grad_norm": 0.1604216992855072, + "learning_rate": 9.165842515301526e-06, + "loss": 1.7338, + "step": 26393 + }, + { + "epoch": 8.101289134438305, + "grad_norm": 0.19191038608551025, + 
"learning_rate": 9.162974274929265e-06, + "loss": 1.721, + "step": 26394 + }, + { + "epoch": 8.10159607120933, + "grad_norm": 0.17082683742046356, + "learning_rate": 9.160106438132454e-06, + "loss": 1.707, + "step": 26395 + }, + { + "epoch": 8.101903007980356, + "grad_norm": 0.15988127887248993, + "learning_rate": 9.157239004939377e-06, + "loss": 1.6787, + "step": 26396 + }, + { + "epoch": 8.102209944751381, + "grad_norm": 0.21586796641349792, + "learning_rate": 9.154371975378423e-06, + "loss": 1.7105, + "step": 26397 + }, + { + "epoch": 8.102516881522407, + "grad_norm": 0.17289277911186218, + "learning_rate": 9.151505349477902e-06, + "loss": 1.7165, + "step": 26398 + }, + { + "epoch": 8.102823818293432, + "grad_norm": 0.16819556057453156, + "learning_rate": 9.148639127266145e-06, + "loss": 1.6965, + "step": 26399 + }, + { + "epoch": 8.103130755064457, + "grad_norm": 0.2234455943107605, + "learning_rate": 9.145773308771483e-06, + "loss": 1.8059, + "step": 26400 + }, + { + "epoch": 8.103437691835483, + "grad_norm": 0.15835164487361908, + "learning_rate": 9.142907894022235e-06, + "loss": 1.6851, + "step": 26401 + }, + { + "epoch": 8.103744628606506, + "grad_norm": 0.18604053556919098, + "learning_rate": 9.140042883046718e-06, + "loss": 1.7105, + "step": 26402 + }, + { + "epoch": 8.104051565377532, + "grad_norm": 0.1927308589220047, + "learning_rate": 9.137178275873243e-06, + "loss": 1.7236, + "step": 26403 + }, + { + "epoch": 8.104358502148557, + "grad_norm": 0.16214077174663544, + "learning_rate": 9.134314072530115e-06, + "loss": 1.7394, + "step": 26404 + }, + { + "epoch": 8.104665438919582, + "grad_norm": 0.2051863819360733, + "learning_rate": 9.131450273045667e-06, + "loss": 1.701, + "step": 26405 + }, + { + "epoch": 8.104972375690608, + "grad_norm": 0.1917528212070465, + "learning_rate": 9.128586877448158e-06, + "loss": 1.6984, + "step": 26406 + }, + { + "epoch": 8.105279312461633, + "grad_norm": 0.19591490924358368, + "learning_rate": 9.125723885765935e-06, + 
"loss": 1.7678, + "step": 26407 + }, + { + "epoch": 8.105586249232658, + "grad_norm": 0.22388321161270142, + "learning_rate": 9.122861298027242e-06, + "loss": 1.7398, + "step": 26408 + }, + { + "epoch": 8.105893186003684, + "grad_norm": 0.13983963429927826, + "learning_rate": 9.119999114260402e-06, + "loss": 1.6868, + "step": 26409 + }, + { + "epoch": 8.106200122774709, + "grad_norm": 0.16611455380916595, + "learning_rate": 9.117137334493708e-06, + "loss": 1.7029, + "step": 26410 + }, + { + "epoch": 8.106507059545734, + "grad_norm": 0.22045908868312836, + "learning_rate": 9.114275958755397e-06, + "loss": 1.7598, + "step": 26411 + }, + { + "epoch": 8.10681399631676, + "grad_norm": 0.1717766672372818, + "learning_rate": 9.111414987073801e-06, + "loss": 1.7197, + "step": 26412 + }, + { + "epoch": 8.107120933087783, + "grad_norm": 0.1627349704504013, + "learning_rate": 9.108554419477138e-06, + "loss": 1.6514, + "step": 26413 + }, + { + "epoch": 8.107427869858808, + "grad_norm": 0.16213741898536682, + "learning_rate": 9.105694255993725e-06, + "loss": 1.6873, + "step": 26414 + }, + { + "epoch": 8.107734806629834, + "grad_norm": 0.15004312992095947, + "learning_rate": 9.102834496651812e-06, + "loss": 1.7057, + "step": 26415 + }, + { + "epoch": 8.10804174340086, + "grad_norm": 0.16030706465244293, + "learning_rate": 9.099975141479655e-06, + "loss": 1.7006, + "step": 26416 + }, + { + "epoch": 8.108348680171884, + "grad_norm": 0.18823765218257904, + "learning_rate": 9.097116190505516e-06, + "loss": 1.6734, + "step": 26417 + }, + { + "epoch": 8.10865561694291, + "grad_norm": 0.19617006182670593, + "learning_rate": 9.094257643757653e-06, + "loss": 1.7135, + "step": 26418 + }, + { + "epoch": 8.108962553713935, + "grad_norm": 0.2009502351284027, + "learning_rate": 9.091399501264308e-06, + "loss": 1.7573, + "step": 26419 + }, + { + "epoch": 8.10926949048496, + "grad_norm": 0.1545785665512085, + "learning_rate": 9.088541763053732e-06, + "loss": 1.7154, + "step": 26420 + }, + { + 
"epoch": 8.109576427255986, + "grad_norm": 0.19506138563156128, + "learning_rate": 9.085684429154152e-06, + "loss": 1.7116, + "step": 26421 + }, + { + "epoch": 8.109883364027011, + "grad_norm": 0.15998101234436035, + "learning_rate": 9.082827499593843e-06, + "loss": 1.7107, + "step": 26422 + }, + { + "epoch": 8.110190300798035, + "grad_norm": 0.16210505366325378, + "learning_rate": 9.079970974400992e-06, + "loss": 1.6625, + "step": 26423 + }, + { + "epoch": 8.11049723756906, + "grad_norm": 0.14739912748336792, + "learning_rate": 9.077114853603875e-06, + "loss": 1.6993, + "step": 26424 + }, + { + "epoch": 8.110804174340085, + "grad_norm": 0.16882890462875366, + "learning_rate": 9.074259137230667e-06, + "loss": 1.7666, + "step": 26425 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 0.1667594611644745, + "learning_rate": 9.071403825309633e-06, + "loss": 1.6876, + "step": 26426 + }, + { + "epoch": 8.111418047882136, + "grad_norm": 0.14678725600242615, + "learning_rate": 9.06854891786899e-06, + "loss": 1.6458, + "step": 26427 + }, + { + "epoch": 8.111724984653161, + "grad_norm": 0.15207096934318542, + "learning_rate": 9.06569441493691e-06, + "loss": 1.6551, + "step": 26428 + }, + { + "epoch": 8.112031921424187, + "grad_norm": 0.2019769251346588, + "learning_rate": 9.062840316541654e-06, + "loss": 1.7812, + "step": 26429 + }, + { + "epoch": 8.112338858195212, + "grad_norm": 0.12371024489402771, + "learning_rate": 9.05998662271138e-06, + "loss": 1.6389, + "step": 26430 + }, + { + "epoch": 8.112645794966237, + "grad_norm": 0.21813201904296875, + "learning_rate": 9.057133333474332e-06, + "loss": 1.6922, + "step": 26431 + }, + { + "epoch": 8.112952731737263, + "grad_norm": 0.15330322086811066, + "learning_rate": 9.054280448858682e-06, + "loss": 1.6975, + "step": 26432 + }, + { + "epoch": 8.113259668508288, + "grad_norm": 0.17849069833755493, + "learning_rate": 9.051427968892635e-06, + "loss": 1.7239, + "step": 26433 + }, + { + "epoch": 8.113566605279312, + "grad_norm": 
0.13501322269439697, + "learning_rate": 9.048575893604377e-06, + "loss": 1.66, + "step": 26434 + }, + { + "epoch": 8.113873542050337, + "grad_norm": 0.1584496796131134, + "learning_rate": 9.045724223022096e-06, + "loss": 1.6864, + "step": 26435 + }, + { + "epoch": 8.114180478821362, + "grad_norm": 0.1788417398929596, + "learning_rate": 9.04287295717397e-06, + "loss": 1.7785, + "step": 26436 + }, + { + "epoch": 8.114487415592388, + "grad_norm": 0.16028213500976562, + "learning_rate": 9.04002209608818e-06, + "loss": 1.6908, + "step": 26437 + }, + { + "epoch": 8.114794352363413, + "grad_norm": 0.19472184777259827, + "learning_rate": 9.037171639792895e-06, + "loss": 1.7963, + "step": 26438 + }, + { + "epoch": 8.115101289134438, + "grad_norm": 0.155779629945755, + "learning_rate": 9.034321588316297e-06, + "loss": 1.6975, + "step": 26439 + }, + { + "epoch": 8.115408225905464, + "grad_norm": 0.191580668091774, + "learning_rate": 9.031471941686525e-06, + "loss": 1.6926, + "step": 26440 + }, + { + "epoch": 8.115715162676489, + "grad_norm": 0.13917100429534912, + "learning_rate": 9.028622699931788e-06, + "loss": 1.6735, + "step": 26441 + }, + { + "epoch": 8.116022099447514, + "grad_norm": 0.13983212411403656, + "learning_rate": 9.025773863080188e-06, + "loss": 1.6995, + "step": 26442 + }, + { + "epoch": 8.11632903621854, + "grad_norm": 0.1471131443977356, + "learning_rate": 9.022925431159922e-06, + "loss": 1.7002, + "step": 26443 + }, + { + "epoch": 8.116635972989565, + "grad_norm": 0.16679814457893372, + "learning_rate": 9.020077404199134e-06, + "loss": 1.7124, + "step": 26444 + }, + { + "epoch": 8.116942909760589, + "grad_norm": 0.1366356909275055, + "learning_rate": 9.017229782225938e-06, + "loss": 1.663, + "step": 26445 + }, + { + "epoch": 8.117249846531614, + "grad_norm": 0.1389543116092682, + "learning_rate": 9.01438256526852e-06, + "loss": 1.6991, + "step": 26446 + }, + { + "epoch": 8.11755678330264, + "grad_norm": 0.1784060299396515, + "learning_rate": 
9.011535753354972e-06, + "loss": 1.769, + "step": 26447 + }, + { + "epoch": 8.117863720073665, + "grad_norm": 0.17633236944675446, + "learning_rate": 9.008689346513466e-06, + "loss": 1.7466, + "step": 26448 + }, + { + "epoch": 8.11817065684469, + "grad_norm": 0.15887171030044556, + "learning_rate": 9.005843344772119e-06, + "loss": 1.7395, + "step": 26449 + }, + { + "epoch": 8.118477593615715, + "grad_norm": 0.20275244116783142, + "learning_rate": 9.002997748159054e-06, + "loss": 1.6971, + "step": 26450 + }, + { + "epoch": 8.11878453038674, + "grad_norm": 0.18063177168369293, + "learning_rate": 9.00015255670239e-06, + "loss": 1.7438, + "step": 26451 + }, + { + "epoch": 8.119091467157766, + "grad_norm": 0.14861668646335602, + "learning_rate": 8.997307770430252e-06, + "loss": 1.645, + "step": 26452 + }, + { + "epoch": 8.119398403928791, + "grad_norm": 0.20455077290534973, + "learning_rate": 8.99446338937075e-06, + "loss": 1.6791, + "step": 26453 + }, + { + "epoch": 8.119705340699817, + "grad_norm": 0.15492217242717743, + "learning_rate": 8.991619413551999e-06, + "loss": 1.6897, + "step": 26454 + }, + { + "epoch": 8.120012277470842, + "grad_norm": 0.1854604184627533, + "learning_rate": 8.988775843002095e-06, + "loss": 1.7379, + "step": 26455 + }, + { + "epoch": 8.120319214241865, + "grad_norm": 0.16705256700515747, + "learning_rate": 8.985932677749155e-06, + "loss": 1.7181, + "step": 26456 + }, + { + "epoch": 8.12062615101289, + "grad_norm": 0.1571042388677597, + "learning_rate": 8.983089917821246e-06, + "loss": 1.6962, + "step": 26457 + }, + { + "epoch": 8.120933087783916, + "grad_norm": 0.1818968802690506, + "learning_rate": 8.980247563246508e-06, + "loss": 1.6954, + "step": 26458 + }, + { + "epoch": 8.121240024554941, + "grad_norm": 0.1823234111070633, + "learning_rate": 8.977405614052986e-06, + "loss": 1.6936, + "step": 26459 + }, + { + "epoch": 8.121546961325967, + "grad_norm": 0.1767190843820572, + "learning_rate": 8.97456407026881e-06, + "loss": 1.7147, + 
"step": 26460 + }, + { + "epoch": 8.121853898096992, + "grad_norm": 0.17461732029914856, + "learning_rate": 8.971722931922023e-06, + "loss": 1.7039, + "step": 26461 + }, + { + "epoch": 8.122160834868017, + "grad_norm": 0.13968271017074585, + "learning_rate": 8.968882199040702e-06, + "loss": 1.655, + "step": 26462 + }, + { + "epoch": 8.122467771639043, + "grad_norm": 0.16950756311416626, + "learning_rate": 8.966041871652969e-06, + "loss": 1.689, + "step": 26463 + }, + { + "epoch": 8.122774708410068, + "grad_norm": 0.148970365524292, + "learning_rate": 8.963201949786831e-06, + "loss": 1.6998, + "step": 26464 + }, + { + "epoch": 8.123081645181093, + "grad_norm": 0.2081855684518814, + "learning_rate": 8.960362433470392e-06, + "loss": 1.7287, + "step": 26465 + }, + { + "epoch": 8.123388581952117, + "grad_norm": 0.14865393936634064, + "learning_rate": 8.957523322731714e-06, + "loss": 1.6789, + "step": 26466 + }, + { + "epoch": 8.123695518723142, + "grad_norm": 0.19252106547355652, + "learning_rate": 8.954684617598841e-06, + "loss": 1.7475, + "step": 26467 + }, + { + "epoch": 8.124002455494168, + "grad_norm": 0.1915684938430786, + "learning_rate": 8.951846318099837e-06, + "loss": 1.6937, + "step": 26468 + }, + { + "epoch": 8.124309392265193, + "grad_norm": 0.15057072043418884, + "learning_rate": 8.949008424262744e-06, + "loss": 1.6748, + "step": 26469 + }, + { + "epoch": 8.124616329036218, + "grad_norm": 0.1801072657108307, + "learning_rate": 8.946170936115611e-06, + "loss": 1.7411, + "step": 26470 + }, + { + "epoch": 8.124923265807244, + "grad_norm": 0.1449461281299591, + "learning_rate": 8.943333853686476e-06, + "loss": 1.6751, + "step": 26471 + }, + { + "epoch": 8.125230202578269, + "grad_norm": 0.19249948859214783, + "learning_rate": 8.940497177003383e-06, + "loss": 1.6876, + "step": 26472 + }, + { + "epoch": 8.125537139349294, + "grad_norm": 0.19512195885181427, + "learning_rate": 8.937660906094359e-06, + "loss": 1.7275, + "step": 26473 + }, + { + "epoch": 
8.12584407612032, + "grad_norm": 0.15998144447803497, + "learning_rate": 8.934825040987433e-06, + "loss": 1.7151, + "step": 26474 + }, + { + "epoch": 8.126151012891345, + "grad_norm": 0.17573381960391998, + "learning_rate": 8.931989581710654e-06, + "loss": 1.713, + "step": 26475 + }, + { + "epoch": 8.12645794966237, + "grad_norm": 0.16745707392692566, + "learning_rate": 8.929154528292e-06, + "loss": 1.7758, + "step": 26476 + }, + { + "epoch": 8.126764886433394, + "grad_norm": 0.14445005357265472, + "learning_rate": 8.926319880759538e-06, + "loss": 1.6821, + "step": 26477 + }, + { + "epoch": 8.12707182320442, + "grad_norm": 0.20462681353092194, + "learning_rate": 8.923485639141244e-06, + "loss": 1.7083, + "step": 26478 + }, + { + "epoch": 8.127378759975445, + "grad_norm": 0.16262570023536682, + "learning_rate": 8.92065180346513e-06, + "loss": 1.7031, + "step": 26479 + }, + { + "epoch": 8.12768569674647, + "grad_norm": 0.14214366674423218, + "learning_rate": 8.917818373759235e-06, + "loss": 1.6752, + "step": 26480 + }, + { + "epoch": 8.127992633517495, + "grad_norm": 0.18373169004917145, + "learning_rate": 8.914985350051513e-06, + "loss": 1.7211, + "step": 26481 + }, + { + "epoch": 8.12829957028852, + "grad_norm": 0.1702071875333786, + "learning_rate": 8.912152732370015e-06, + "loss": 1.7513, + "step": 26482 + }, + { + "epoch": 8.128606507059546, + "grad_norm": 0.16515198349952698, + "learning_rate": 8.90932052074268e-06, + "loss": 1.7379, + "step": 26483 + }, + { + "epoch": 8.128913443830571, + "grad_norm": 0.17008109390735626, + "learning_rate": 8.906488715197537e-06, + "loss": 1.7243, + "step": 26484 + }, + { + "epoch": 8.129220380601597, + "grad_norm": 0.15695080161094666, + "learning_rate": 8.903657315762554e-06, + "loss": 1.6951, + "step": 26485 + }, + { + "epoch": 8.129527317372622, + "grad_norm": 0.16403819620609283, + "learning_rate": 8.900826322465716e-06, + "loss": 1.7755, + "step": 26486 + }, + { + "epoch": 8.129834254143647, + "grad_norm": 
0.21355034410953522, + "learning_rate": 8.897995735335007e-06, + "loss": 1.7505, + "step": 26487 + }, + { + "epoch": 8.13014119091467, + "grad_norm": 0.15604349970817566, + "learning_rate": 8.895165554398394e-06, + "loss": 1.7452, + "step": 26488 + }, + { + "epoch": 8.130448127685696, + "grad_norm": 0.18299458920955658, + "learning_rate": 8.892335779683842e-06, + "loss": 1.6737, + "step": 26489 + }, + { + "epoch": 8.130755064456721, + "grad_norm": 0.1939994990825653, + "learning_rate": 8.889506411219329e-06, + "loss": 1.7219, + "step": 26490 + }, + { + "epoch": 8.131062001227747, + "grad_norm": 0.17785221338272095, + "learning_rate": 8.886677449032794e-06, + "loss": 1.7007, + "step": 26491 + }, + { + "epoch": 8.131368937998772, + "grad_norm": 0.2067573517560959, + "learning_rate": 8.88384889315223e-06, + "loss": 1.7918, + "step": 26492 + }, + { + "epoch": 8.131675874769797, + "grad_norm": 0.18033906817436218, + "learning_rate": 8.88102074360555e-06, + "loss": 1.7, + "step": 26493 + }, + { + "epoch": 8.131982811540823, + "grad_norm": 0.17076243460178375, + "learning_rate": 8.878193000420748e-06, + "loss": 1.6883, + "step": 26494 + }, + { + "epoch": 8.132289748311848, + "grad_norm": 0.19102394580841064, + "learning_rate": 8.875365663625729e-06, + "loss": 1.7387, + "step": 26495 + }, + { + "epoch": 8.132596685082873, + "grad_norm": 0.22587478160858154, + "learning_rate": 8.872538733248442e-06, + "loss": 1.7852, + "step": 26496 + }, + { + "epoch": 8.132903621853899, + "grad_norm": 0.17067384719848633, + "learning_rate": 8.869712209316861e-06, + "loss": 1.6813, + "step": 26497 + }, + { + "epoch": 8.133210558624924, + "grad_norm": 0.19232873618602753, + "learning_rate": 8.866886091858856e-06, + "loss": 1.6644, + "step": 26498 + }, + { + "epoch": 8.133517495395948, + "grad_norm": 0.18685118854045868, + "learning_rate": 8.864060380902423e-06, + "loss": 1.6766, + "step": 26499 + }, + { + "epoch": 8.133824432166973, + "grad_norm": 0.18342606723308563, + "learning_rate": 
8.861235076475433e-06, + "loss": 1.6694, + "step": 26500 + }, + { + "epoch": 8.134131368937998, + "grad_norm": 0.15469637513160706, + "learning_rate": 8.858410178605842e-06, + "loss": 1.6882, + "step": 26501 + }, + { + "epoch": 8.134438305709024, + "grad_norm": 0.19094935059547424, + "learning_rate": 8.855585687321549e-06, + "loss": 1.6662, + "step": 26502 + }, + { + "epoch": 8.134745242480049, + "grad_norm": 0.19613660871982574, + "learning_rate": 8.852761602650479e-06, + "loss": 1.6518, + "step": 26503 + }, + { + "epoch": 8.135052179251074, + "grad_norm": 0.1342541128396988, + "learning_rate": 8.849937924620538e-06, + "loss": 1.6728, + "step": 26504 + }, + { + "epoch": 8.1353591160221, + "grad_norm": 0.19099827110767365, + "learning_rate": 8.847114653259624e-06, + "loss": 1.714, + "step": 26505 + }, + { + "epoch": 8.135666052793125, + "grad_norm": 0.18886728584766388, + "learning_rate": 8.84429178859565e-06, + "loss": 1.7222, + "step": 26506 + }, + { + "epoch": 8.13597298956415, + "grad_norm": 0.16177545487880707, + "learning_rate": 8.841469330656499e-06, + "loss": 1.754, + "step": 26507 + }, + { + "epoch": 8.136279926335176, + "grad_norm": 0.1589137762784958, + "learning_rate": 8.838647279470063e-06, + "loss": 1.6889, + "step": 26508 + }, + { + "epoch": 8.1365868631062, + "grad_norm": 0.16074521839618683, + "learning_rate": 8.835825635064266e-06, + "loss": 1.6882, + "step": 26509 + }, + { + "epoch": 8.136893799877225, + "grad_norm": 0.15532740950584412, + "learning_rate": 8.833004397466937e-06, + "loss": 1.6786, + "step": 26510 + }, + { + "epoch": 8.13720073664825, + "grad_norm": 0.18151862919330597, + "learning_rate": 8.830183566706019e-06, + "loss": 1.7075, + "step": 26511 + }, + { + "epoch": 8.137507673419275, + "grad_norm": 0.15345066785812378, + "learning_rate": 8.827363142809342e-06, + "loss": 1.6895, + "step": 26512 + }, + { + "epoch": 8.1378146101903, + "grad_norm": 0.16954976320266724, + "learning_rate": 8.824543125804785e-06, + "loss": 1.727, + "step": 
26513 + }, + { + "epoch": 8.138121546961326, + "grad_norm": 0.1679479032754898, + "learning_rate": 8.821723515720249e-06, + "loss": 1.7391, + "step": 26514 + }, + { + "epoch": 8.138428483732351, + "grad_norm": 0.15377631783485413, + "learning_rate": 8.818904312583547e-06, + "loss": 1.6954, + "step": 26515 + }, + { + "epoch": 8.138735420503377, + "grad_norm": 0.20345479249954224, + "learning_rate": 8.8160855164226e-06, + "loss": 1.7424, + "step": 26516 + }, + { + "epoch": 8.139042357274402, + "grad_norm": 0.18770255148410797, + "learning_rate": 8.813267127265207e-06, + "loss": 1.67, + "step": 26517 + }, + { + "epoch": 8.139349294045427, + "grad_norm": 0.16253206133842468, + "learning_rate": 8.810449145139265e-06, + "loss": 1.7004, + "step": 26518 + }, + { + "epoch": 8.139656230816453, + "grad_norm": 0.18429701030254364, + "learning_rate": 8.807631570072606e-06, + "loss": 1.7289, + "step": 26519 + }, + { + "epoch": 8.139963167587476, + "grad_norm": 0.18926598131656647, + "learning_rate": 8.80481440209307e-06, + "loss": 1.7907, + "step": 26520 + }, + { + "epoch": 8.140270104358502, + "grad_norm": 0.17855983972549438, + "learning_rate": 8.80199764122851e-06, + "loss": 1.7008, + "step": 26521 + }, + { + "epoch": 8.140577041129527, + "grad_norm": 0.20559640228748322, + "learning_rate": 8.799181287506752e-06, + "loss": 1.724, + "step": 26522 + }, + { + "epoch": 8.140883977900552, + "grad_norm": 0.1707194298505783, + "learning_rate": 8.79636534095563e-06, + "loss": 1.7274, + "step": 26523 + }, + { + "epoch": 8.141190914671578, + "grad_norm": 0.1882070004940033, + "learning_rate": 8.793549801602984e-06, + "loss": 1.7503, + "step": 26524 + }, + { + "epoch": 8.141497851442603, + "grad_norm": 0.24269217252731323, + "learning_rate": 8.790734669476613e-06, + "loss": 1.7459, + "step": 26525 + }, + { + "epoch": 8.141804788213628, + "grad_norm": 0.20310194790363312, + "learning_rate": 8.787919944604383e-06, + "loss": 1.7158, + "step": 26526 + }, + { + "epoch": 8.142111724984654, + 
"grad_norm": 0.18653319776058197, + "learning_rate": 8.785105627014056e-06, + "loss": 1.7135, + "step": 26527 + }, + { + "epoch": 8.142418661755679, + "grad_norm": 0.1896388828754425, + "learning_rate": 8.782291716733499e-06, + "loss": 1.7407, + "step": 26528 + }, + { + "epoch": 8.142725598526704, + "grad_norm": 0.17392487823963165, + "learning_rate": 8.779478213790482e-06, + "loss": 1.6863, + "step": 26529 + }, + { + "epoch": 8.14303253529773, + "grad_norm": 0.2389729917049408, + "learning_rate": 8.776665118212807e-06, + "loss": 1.7565, + "step": 26530 + }, + { + "epoch": 8.143339472068753, + "grad_norm": 0.1907578408718109, + "learning_rate": 8.773852430028312e-06, + "loss": 1.7135, + "step": 26531 + }, + { + "epoch": 8.143646408839778, + "grad_norm": 0.1867230087518692, + "learning_rate": 8.771040149264748e-06, + "loss": 1.657, + "step": 26532 + }, + { + "epoch": 8.143953345610804, + "grad_norm": 0.16111065447330475, + "learning_rate": 8.768228275949953e-06, + "loss": 1.6849, + "step": 26533 + }, + { + "epoch": 8.144260282381829, + "grad_norm": 0.24071912467479706, + "learning_rate": 8.76541681011167e-06, + "loss": 1.7563, + "step": 26534 + }, + { + "epoch": 8.144567219152854, + "grad_norm": 0.18996769189834595, + "learning_rate": 8.76260575177772e-06, + "loss": 1.7099, + "step": 26535 + }, + { + "epoch": 8.14487415592388, + "grad_norm": 0.17230607569217682, + "learning_rate": 8.75979510097587e-06, + "loss": 1.6848, + "step": 26536 + }, + { + "epoch": 8.145181092694905, + "grad_norm": 0.19319802522659302, + "learning_rate": 8.756984857733896e-06, + "loss": 1.7806, + "step": 26537 + }, + { + "epoch": 8.14548802946593, + "grad_norm": 0.16848497092723846, + "learning_rate": 8.754175022079569e-06, + "loss": 1.7099, + "step": 26538 + }, + { + "epoch": 8.145794966236956, + "grad_norm": 0.16230639815330505, + "learning_rate": 8.751365594040662e-06, + "loss": 1.6618, + "step": 26539 + }, + { + "epoch": 8.146101903007981, + "grad_norm": 0.15458232164382935, + 
"learning_rate": 8.748556573644935e-06, + "loss": 1.6975, + "step": 26540 + }, + { + "epoch": 8.146408839779005, + "grad_norm": 0.15948891639709473, + "learning_rate": 8.745747960920153e-06, + "loss": 1.6977, + "step": 26541 + }, + { + "epoch": 8.14671577655003, + "grad_norm": 0.17533692717552185, + "learning_rate": 8.742939755894053e-06, + "loss": 1.7314, + "step": 26542 + }, + { + "epoch": 8.147022713321055, + "grad_norm": 0.13606345653533936, + "learning_rate": 8.740131958594433e-06, + "loss": 1.6245, + "step": 26543 + }, + { + "epoch": 8.14732965009208, + "grad_norm": 0.1749604493379593, + "learning_rate": 8.737324569048993e-06, + "loss": 1.6881, + "step": 26544 + }, + { + "epoch": 8.147636586863106, + "grad_norm": 0.15416191518306732, + "learning_rate": 8.7345175872855e-06, + "loss": 1.6755, + "step": 26545 + }, + { + "epoch": 8.147943523634131, + "grad_norm": 0.19732356071472168, + "learning_rate": 8.731711013331695e-06, + "loss": 1.7068, + "step": 26546 + }, + { + "epoch": 8.148250460405157, + "grad_norm": 0.19295896589756012, + "learning_rate": 8.728904847215291e-06, + "loss": 1.7282, + "step": 26547 + }, + { + "epoch": 8.148557397176182, + "grad_norm": 0.18414302170276642, + "learning_rate": 8.726099088964069e-06, + "loss": 1.7059, + "step": 26548 + }, + { + "epoch": 8.148864333947207, + "grad_norm": 0.17527544498443604, + "learning_rate": 8.723293738605697e-06, + "loss": 1.6947, + "step": 26549 + }, + { + "epoch": 8.149171270718233, + "grad_norm": 0.1913319230079651, + "learning_rate": 8.720488796167958e-06, + "loss": 1.6988, + "step": 26550 + }, + { + "epoch": 8.149478207489258, + "grad_norm": 0.1604306846857071, + "learning_rate": 8.71768426167852e-06, + "loss": 1.6937, + "step": 26551 + }, + { + "epoch": 8.149785144260282, + "grad_norm": 0.1562403291463852, + "learning_rate": 8.714880135165132e-06, + "loss": 1.6633, + "step": 26552 + }, + { + "epoch": 8.150092081031307, + "grad_norm": 0.16940948367118835, + "learning_rate": 8.712076416655495e-06, + 
"loss": 1.6774, + "step": 26553 + }, + { + "epoch": 8.150399017802332, + "grad_norm": 0.14607203006744385, + "learning_rate": 8.709273106177324e-06, + "loss": 1.6912, + "step": 26554 + }, + { + "epoch": 8.150705954573358, + "grad_norm": 0.1811707615852356, + "learning_rate": 8.706470203758316e-06, + "loss": 1.7291, + "step": 26555 + }, + { + "epoch": 8.151012891344383, + "grad_norm": 0.18188659846782684, + "learning_rate": 8.703667709426166e-06, + "loss": 1.6994, + "step": 26556 + }, + { + "epoch": 8.151319828115408, + "grad_norm": 0.16499698162078857, + "learning_rate": 8.700865623208581e-06, + "loss": 1.7065, + "step": 26557 + }, + { + "epoch": 8.151626764886434, + "grad_norm": 0.17506305873394012, + "learning_rate": 8.69806394513325e-06, + "loss": 1.75, + "step": 26558 + }, + { + "epoch": 8.151933701657459, + "grad_norm": 0.14843741059303284, + "learning_rate": 8.695262675227844e-06, + "loss": 1.6645, + "step": 26559 + }, + { + "epoch": 8.152240638428484, + "grad_norm": 0.15281017124652863, + "learning_rate": 8.692461813520087e-06, + "loss": 1.7166, + "step": 26560 + }, + { + "epoch": 8.15254757519951, + "grad_norm": 0.17245371639728546, + "learning_rate": 8.689661360037621e-06, + "loss": 1.7418, + "step": 26561 + }, + { + "epoch": 8.152854511970535, + "grad_norm": 0.17387856543064117, + "learning_rate": 8.686861314808131e-06, + "loss": 1.6865, + "step": 26562 + }, + { + "epoch": 8.153161448741558, + "grad_norm": 0.1463180035352707, + "learning_rate": 8.684061677859296e-06, + "loss": 1.6867, + "step": 26563 + }, + { + "epoch": 8.153468385512584, + "grad_norm": 0.16704687476158142, + "learning_rate": 8.681262449218769e-06, + "loss": 1.6985, + "step": 26564 + }, + { + "epoch": 8.15377532228361, + "grad_norm": 0.17754648625850677, + "learning_rate": 8.678463628914246e-06, + "loss": 1.7067, + "step": 26565 + }, + { + "epoch": 8.154082259054634, + "grad_norm": 0.12470053881406784, + "learning_rate": 8.675665216973339e-06, + "loss": 1.6468, + "step": 26566 + }, + { + 
"epoch": 8.15438919582566, + "grad_norm": 0.17551906406879425, + "learning_rate": 8.672867213423757e-06, + "loss": 1.76, + "step": 26567 + }, + { + "epoch": 8.154696132596685, + "grad_norm": 0.13165321946144104, + "learning_rate": 8.670069618293098e-06, + "loss": 1.6672, + "step": 26568 + }, + { + "epoch": 8.15500306936771, + "grad_norm": 0.1410796046257019, + "learning_rate": 8.667272431609041e-06, + "loss": 1.649, + "step": 26569 + }, + { + "epoch": 8.155310006138736, + "grad_norm": 0.17227822542190552, + "learning_rate": 8.664475653399235e-06, + "loss": 1.7028, + "step": 26570 + }, + { + "epoch": 8.155616942909761, + "grad_norm": 0.15770387649536133, + "learning_rate": 8.661679283691298e-06, + "loss": 1.7608, + "step": 26571 + }, + { + "epoch": 8.155923879680786, + "grad_norm": 0.1425134390592575, + "learning_rate": 8.658883322512885e-06, + "loss": 1.6821, + "step": 26572 + }, + { + "epoch": 8.15623081645181, + "grad_norm": 0.19647212326526642, + "learning_rate": 8.656087769891608e-06, + "loss": 1.7787, + "step": 26573 + }, + { + "epoch": 8.156537753222835, + "grad_norm": 0.15315282344818115, + "learning_rate": 8.653292625855108e-06, + "loss": 1.6464, + "step": 26574 + }, + { + "epoch": 8.15684468999386, + "grad_norm": 0.1664622575044632, + "learning_rate": 8.650497890431009e-06, + "loss": 1.7189, + "step": 26575 + }, + { + "epoch": 8.157151626764886, + "grad_norm": 0.19525103271007538, + "learning_rate": 8.647703563646908e-06, + "loss": 1.71, + "step": 26576 + }, + { + "epoch": 8.157458563535911, + "grad_norm": 0.2435453087091446, + "learning_rate": 8.644909645530464e-06, + "loss": 1.7312, + "step": 26577 + }, + { + "epoch": 8.157765500306937, + "grad_norm": 0.20554441213607788, + "learning_rate": 8.642116136109252e-06, + "loss": 1.7102, + "step": 26578 + }, + { + "epoch": 8.158072437077962, + "grad_norm": 0.21100008487701416, + "learning_rate": 8.639323035410885e-06, + "loss": 1.6513, + "step": 26579 + }, + { + "epoch": 8.158379373848987, + "grad_norm": 
0.20069560408592224, + "learning_rate": 8.636530343462973e-06, + "loss": 1.7457, + "step": 26580 + }, + { + "epoch": 8.158686310620013, + "grad_norm": 0.19240780174732208, + "learning_rate": 8.633738060293095e-06, + "loss": 1.6761, + "step": 26581 + }, + { + "epoch": 8.158993247391038, + "grad_norm": 0.17970497906208038, + "learning_rate": 8.63094618592889e-06, + "loss": 1.7571, + "step": 26582 + }, + { + "epoch": 8.159300184162063, + "grad_norm": 0.19709791243076324, + "learning_rate": 8.628154720397902e-06, + "loss": 1.7826, + "step": 26583 + }, + { + "epoch": 8.159607120933087, + "grad_norm": 0.2084866315126419, + "learning_rate": 8.62536366372776e-06, + "loss": 1.7113, + "step": 26584 + }, + { + "epoch": 8.159914057704112, + "grad_norm": 0.18584266304969788, + "learning_rate": 8.622573015945995e-06, + "loss": 1.675, + "step": 26585 + }, + { + "epoch": 8.160220994475138, + "grad_norm": 0.21233049035072327, + "learning_rate": 8.619782777080232e-06, + "loss": 1.7438, + "step": 26586 + }, + { + "epoch": 8.160527931246163, + "grad_norm": 0.180323526263237, + "learning_rate": 8.61699294715803e-06, + "loss": 1.6923, + "step": 26587 + }, + { + "epoch": 8.160834868017188, + "grad_norm": 0.182667076587677, + "learning_rate": 8.614203526206955e-06, + "loss": 1.7302, + "step": 26588 + }, + { + "epoch": 8.161141804788214, + "grad_norm": 0.19673213362693787, + "learning_rate": 8.611414514254584e-06, + "loss": 1.7282, + "step": 26589 + }, + { + "epoch": 8.161448741559239, + "grad_norm": 0.14357072114944458, + "learning_rate": 8.608625911328466e-06, + "loss": 1.6964, + "step": 26590 + }, + { + "epoch": 8.161755678330264, + "grad_norm": 0.25598716735839844, + "learning_rate": 8.605837717456172e-06, + "loss": 1.788, + "step": 26591 + }, + { + "epoch": 8.16206261510129, + "grad_norm": 0.16914238035678864, + "learning_rate": 8.603049932665252e-06, + "loss": 1.6069, + "step": 26592 + }, + { + "epoch": 8.162369551872315, + "grad_norm": 0.1468336582183838, + "learning_rate": 
8.60026255698324e-06, + "loss": 1.7009, + "step": 26593 + }, + { + "epoch": 8.16267648864334, + "grad_norm": 0.20125585794448853, + "learning_rate": 8.597475590437726e-06, + "loss": 1.7166, + "step": 26594 + }, + { + "epoch": 8.162983425414364, + "grad_norm": 0.12715741991996765, + "learning_rate": 8.594689033056214e-06, + "loss": 1.6488, + "step": 26595 + }, + { + "epoch": 8.16329036218539, + "grad_norm": 0.2659800350666046, + "learning_rate": 8.591902884866254e-06, + "loss": 1.7325, + "step": 26596 + }, + { + "epoch": 8.163597298956415, + "grad_norm": 0.1939239799976349, + "learning_rate": 8.589117145895376e-06, + "loss": 1.6882, + "step": 26597 + }, + { + "epoch": 8.16390423572744, + "grad_norm": 0.18982990086078644, + "learning_rate": 8.586331816171101e-06, + "loss": 1.7222, + "step": 26598 + }, + { + "epoch": 8.164211172498465, + "grad_norm": 0.16025054454803467, + "learning_rate": 8.583546895720995e-06, + "loss": 1.6672, + "step": 26599 + }, + { + "epoch": 8.16451810926949, + "grad_norm": 0.1923390030860901, + "learning_rate": 8.580762384572533e-06, + "loss": 1.7261, + "step": 26600 + }, + { + "epoch": 8.164825046040516, + "grad_norm": 0.1467374712228775, + "learning_rate": 8.577978282753274e-06, + "loss": 1.6969, + "step": 26601 + }, + { + "epoch": 8.165131982811541, + "grad_norm": 0.2210266888141632, + "learning_rate": 8.575194590290685e-06, + "loss": 1.74, + "step": 26602 + }, + { + "epoch": 8.165438919582567, + "grad_norm": 0.1852598935365677, + "learning_rate": 8.572411307212319e-06, + "loss": 1.7522, + "step": 26603 + }, + { + "epoch": 8.165745856353592, + "grad_norm": 0.19316701591014862, + "learning_rate": 8.569628433545662e-06, + "loss": 1.7389, + "step": 26604 + }, + { + "epoch": 8.166052793124617, + "grad_norm": 0.2102174311876297, + "learning_rate": 8.566845969318227e-06, + "loss": 1.7134, + "step": 26605 + }, + { + "epoch": 8.16635972989564, + "grad_norm": 0.1948329359292984, + "learning_rate": 8.564063914557496e-06, + "loss": 1.7368, + "step": 
26606 + }, + { + "epoch": 8.166666666666666, + "grad_norm": 0.14721956849098206, + "learning_rate": 8.561282269290977e-06, + "loss": 1.6526, + "step": 26607 + }, + { + "epoch": 8.166973603437691, + "grad_norm": 0.17424573004245758, + "learning_rate": 8.558501033546158e-06, + "loss": 1.6954, + "step": 26608 + }, + { + "epoch": 8.167280540208717, + "grad_norm": 0.14784085750579834, + "learning_rate": 8.555720207350514e-06, + "loss": 1.7166, + "step": 26609 + }, + { + "epoch": 8.167587476979742, + "grad_norm": 0.1619582176208496, + "learning_rate": 8.55293979073154e-06, + "loss": 1.716, + "step": 26610 + }, + { + "epoch": 8.167894413750767, + "grad_norm": 0.2342625856399536, + "learning_rate": 8.550159783716705e-06, + "loss": 1.7399, + "step": 26611 + }, + { + "epoch": 8.168201350521793, + "grad_norm": 0.16116589307785034, + "learning_rate": 8.547380186333482e-06, + "loss": 1.6727, + "step": 26612 + }, + { + "epoch": 8.168508287292818, + "grad_norm": 0.20995540916919708, + "learning_rate": 8.544600998609349e-06, + "loss": 1.703, + "step": 26613 + }, + { + "epoch": 8.168815224063843, + "grad_norm": 0.18031500279903412, + "learning_rate": 8.541822220571766e-06, + "loss": 1.6953, + "step": 26614 + }, + { + "epoch": 8.169122160834869, + "grad_norm": 0.1851302981376648, + "learning_rate": 8.539043852248197e-06, + "loss": 1.6931, + "step": 26615 + }, + { + "epoch": 8.169429097605892, + "grad_norm": 0.2262948453426361, + "learning_rate": 8.536265893666096e-06, + "loss": 1.7167, + "step": 26616 + }, + { + "epoch": 8.169736034376918, + "grad_norm": 0.1456020325422287, + "learning_rate": 8.533488344852903e-06, + "loss": 1.6686, + "step": 26617 + }, + { + "epoch": 8.170042971147943, + "grad_norm": 0.17165613174438477, + "learning_rate": 8.530711205836112e-06, + "loss": 1.6641, + "step": 26618 + }, + { + "epoch": 8.170349907918968, + "grad_norm": 0.18926110863685608, + "learning_rate": 8.527934476643112e-06, + "loss": 1.7155, + "step": 26619 + }, + { + "epoch": 8.170656844689994, 
+ "grad_norm": 0.1722220927476883, + "learning_rate": 8.525158157301383e-06, + "loss": 1.7188, + "step": 26620 + }, + { + "epoch": 8.170963781461019, + "grad_norm": 0.1791582554578781, + "learning_rate": 8.522382247838351e-06, + "loss": 1.7195, + "step": 26621 + }, + { + "epoch": 8.171270718232044, + "grad_norm": 0.18020455539226532, + "learning_rate": 8.519606748281445e-06, + "loss": 1.7068, + "step": 26622 + }, + { + "epoch": 8.17157765500307, + "grad_norm": 0.17394676804542542, + "learning_rate": 8.516831658658098e-06, + "loss": 1.6977, + "step": 26623 + }, + { + "epoch": 8.171884591774095, + "grad_norm": 0.24079330265522003, + "learning_rate": 8.514056978995739e-06, + "loss": 1.7152, + "step": 26624 + }, + { + "epoch": 8.17219152854512, + "grad_norm": 0.16567498445510864, + "learning_rate": 8.511282709321784e-06, + "loss": 1.7048, + "step": 26625 + }, + { + "epoch": 8.172498465316146, + "grad_norm": 0.21935853362083435, + "learning_rate": 8.508508849663649e-06, + "loss": 1.7445, + "step": 26626 + }, + { + "epoch": 8.17280540208717, + "grad_norm": 0.18325531482696533, + "learning_rate": 8.505735400048748e-06, + "loss": 1.7343, + "step": 26627 + }, + { + "epoch": 8.173112338858195, + "grad_norm": 0.16334550082683563, + "learning_rate": 8.50296236050449e-06, + "loss": 1.727, + "step": 26628 + }, + { + "epoch": 8.17341927562922, + "grad_norm": 0.23685503005981445, + "learning_rate": 8.500189731058284e-06, + "loss": 1.6718, + "step": 26629 + }, + { + "epoch": 8.173726212400245, + "grad_norm": 0.17057496309280396, + "learning_rate": 8.49741751173752e-06, + "loss": 1.7083, + "step": 26630 + }, + { + "epoch": 8.17403314917127, + "grad_norm": 0.19941039383411407, + "learning_rate": 8.49464570256961e-06, + "loss": 1.6496, + "step": 26631 + }, + { + "epoch": 8.174340085942296, + "grad_norm": 0.1887839138507843, + "learning_rate": 8.49187430358193e-06, + "loss": 1.7896, + "step": 26632 + }, + { + "epoch": 8.174647022713321, + "grad_norm": 0.16285917162895203, + 
"learning_rate": 8.489103314801883e-06, + "loss": 1.6923, + "step": 26633 + }, + { + "epoch": 8.174953959484347, + "grad_norm": 0.1405196487903595, + "learning_rate": 8.48633273625683e-06, + "loss": 1.6907, + "step": 26634 + }, + { + "epoch": 8.175260896255372, + "grad_norm": 0.17885157465934753, + "learning_rate": 8.483562567974196e-06, + "loss": 1.7036, + "step": 26635 + }, + { + "epoch": 8.175567833026397, + "grad_norm": 0.1427285224199295, + "learning_rate": 8.480792809981309e-06, + "loss": 1.6997, + "step": 26636 + }, + { + "epoch": 8.175874769797423, + "grad_norm": 0.15711882710456848, + "learning_rate": 8.478023462305579e-06, + "loss": 1.6874, + "step": 26637 + }, + { + "epoch": 8.176181706568446, + "grad_norm": 0.19080850481987, + "learning_rate": 8.47525452497434e-06, + "loss": 1.7078, + "step": 26638 + }, + { + "epoch": 8.176488643339471, + "grad_norm": 0.17063139379024506, + "learning_rate": 8.472485998014984e-06, + "loss": 1.7147, + "step": 26639 + }, + { + "epoch": 8.176795580110497, + "grad_norm": 0.151056706905365, + "learning_rate": 8.469717881454865e-06, + "loss": 1.685, + "step": 26640 + }, + { + "epoch": 8.177102516881522, + "grad_norm": 0.16712957620620728, + "learning_rate": 8.466950175321331e-06, + "loss": 1.7142, + "step": 26641 + }, + { + "epoch": 8.177409453652547, + "grad_norm": 0.13982228934764862, + "learning_rate": 8.46418287964174e-06, + "loss": 1.6707, + "step": 26642 + }, + { + "epoch": 8.177716390423573, + "grad_norm": 0.14738497138023376, + "learning_rate": 8.461415994443439e-06, + "loss": 1.7381, + "step": 26643 + }, + { + "epoch": 8.178023327194598, + "grad_norm": 0.1691005975008011, + "learning_rate": 8.45864951975377e-06, + "loss": 1.6956, + "step": 26644 + }, + { + "epoch": 8.178330263965623, + "grad_norm": 0.1477413773536682, + "learning_rate": 8.455883455600078e-06, + "loss": 1.6646, + "step": 26645 + }, + { + "epoch": 8.178637200736649, + "grad_norm": 0.15620499849319458, + "learning_rate": 8.453117802009697e-06, + "loss": 
1.7031, + "step": 26646 + }, + { + "epoch": 8.178944137507674, + "grad_norm": 0.1572941690683365, + "learning_rate": 8.45035255900995e-06, + "loss": 1.6509, + "step": 26647 + }, + { + "epoch": 8.1792510742787, + "grad_norm": 0.20386455953121185, + "learning_rate": 8.447587726628176e-06, + "loss": 1.7166, + "step": 26648 + }, + { + "epoch": 8.179558011049723, + "grad_norm": 0.2131095975637436, + "learning_rate": 8.444823304891697e-06, + "loss": 1.6934, + "step": 26649 + }, + { + "epoch": 8.179864947820748, + "grad_norm": 0.15402472019195557, + "learning_rate": 8.442059293827826e-06, + "loss": 1.7538, + "step": 26650 + }, + { + "epoch": 8.180171884591774, + "grad_norm": 0.17687393724918365, + "learning_rate": 8.439295693463872e-06, + "loss": 1.7374, + "step": 26651 + }, + { + "epoch": 8.180478821362799, + "grad_norm": 0.16971834003925323, + "learning_rate": 8.436532503827188e-06, + "loss": 1.7142, + "step": 26652 + }, + { + "epoch": 8.180785758133824, + "grad_norm": 0.17651747167110443, + "learning_rate": 8.433769724945017e-06, + "loss": 1.7109, + "step": 26653 + }, + { + "epoch": 8.18109269490485, + "grad_norm": 0.18742668628692627, + "learning_rate": 8.431007356844728e-06, + "loss": 1.7024, + "step": 26654 + }, + { + "epoch": 8.181399631675875, + "grad_norm": 0.1686297208070755, + "learning_rate": 8.428245399553559e-06, + "loss": 1.7669, + "step": 26655 + }, + { + "epoch": 8.1817065684469, + "grad_norm": 0.1667923480272293, + "learning_rate": 8.425483853098848e-06, + "loss": 1.6928, + "step": 26656 + }, + { + "epoch": 8.182013505217926, + "grad_norm": 0.16002421081066132, + "learning_rate": 8.422722717507874e-06, + "loss": 1.7058, + "step": 26657 + }, + { + "epoch": 8.182320441988951, + "grad_norm": 0.1531311571598053, + "learning_rate": 8.419961992807928e-06, + "loss": 1.7096, + "step": 26658 + }, + { + "epoch": 8.182627378759975, + "grad_norm": 0.16212326288223267, + "learning_rate": 8.417201679026282e-06, + "loss": 1.6849, + "step": 26659 + }, + { + "epoch": 
8.182934315531, + "grad_norm": 0.17276698350906372, + "learning_rate": 8.414441776190224e-06, + "loss": 1.6697, + "step": 26660 + }, + { + "epoch": 8.183241252302025, + "grad_norm": 0.15050961077213287, + "learning_rate": 8.411682284327028e-06, + "loss": 1.6972, + "step": 26661 + }, + { + "epoch": 8.18354818907305, + "grad_norm": 0.14593006670475006, + "learning_rate": 8.40892320346396e-06, + "loss": 1.7005, + "step": 26662 + }, + { + "epoch": 8.183855125844076, + "grad_norm": 0.18584349751472473, + "learning_rate": 8.406164533628291e-06, + "loss": 1.7366, + "step": 26663 + }, + { + "epoch": 8.184162062615101, + "grad_norm": 0.18662385642528534, + "learning_rate": 8.403406274847287e-06, + "loss": 1.77, + "step": 26664 + }, + { + "epoch": 8.184468999386127, + "grad_norm": 0.1735418438911438, + "learning_rate": 8.4006484271482e-06, + "loss": 1.692, + "step": 26665 + }, + { + "epoch": 8.184775936157152, + "grad_norm": 0.22115837037563324, + "learning_rate": 8.397890990558283e-06, + "loss": 1.7321, + "step": 26666 + }, + { + "epoch": 8.185082872928177, + "grad_norm": 0.1662493795156479, + "learning_rate": 8.395133965104796e-06, + "loss": 1.7016, + "step": 26667 + }, + { + "epoch": 8.185389809699203, + "grad_norm": 0.20966672897338867, + "learning_rate": 8.392377350814967e-06, + "loss": 1.6703, + "step": 26668 + }, + { + "epoch": 8.185696746470228, + "grad_norm": 0.16722753643989563, + "learning_rate": 8.389621147716076e-06, + "loss": 1.7429, + "step": 26669 + }, + { + "epoch": 8.186003683241251, + "grad_norm": 0.20280788838863373, + "learning_rate": 8.386865355835316e-06, + "loss": 1.7155, + "step": 26670 + }, + { + "epoch": 8.186310620012277, + "grad_norm": 0.20596744120121002, + "learning_rate": 8.384109975199967e-06, + "loss": 1.7266, + "step": 26671 + }, + { + "epoch": 8.186617556783302, + "grad_norm": 0.1525292545557022, + "learning_rate": 8.381355005837205e-06, + "loss": 1.6692, + "step": 26672 + }, + { + "epoch": 8.186924493554327, + "grad_norm": 
0.21745061874389648, + "learning_rate": 8.378600447774304e-06, + "loss": 1.7048, + "step": 26673 + }, + { + "epoch": 8.187231430325353, + "grad_norm": 0.2355356216430664, + "learning_rate": 8.375846301038465e-06, + "loss": 1.7842, + "step": 26674 + }, + { + "epoch": 8.187538367096378, + "grad_norm": 0.18660607933998108, + "learning_rate": 8.37309256565691e-06, + "loss": 1.698, + "step": 26675 + }, + { + "epoch": 8.187845303867404, + "grad_norm": 0.1690683364868164, + "learning_rate": 8.370339241656855e-06, + "loss": 1.6967, + "step": 26676 + }, + { + "epoch": 8.188152240638429, + "grad_norm": 0.16226762533187866, + "learning_rate": 8.367586329065508e-06, + "loss": 1.6849, + "step": 26677 + }, + { + "epoch": 8.188459177409454, + "grad_norm": 0.192795068025589, + "learning_rate": 8.364833827910074e-06, + "loss": 1.7037, + "step": 26678 + }, + { + "epoch": 8.18876611418048, + "grad_norm": 0.13591274619102478, + "learning_rate": 8.362081738217752e-06, + "loss": 1.6517, + "step": 26679 + }, + { + "epoch": 8.189073050951505, + "grad_norm": 0.16879263520240784, + "learning_rate": 8.359330060015747e-06, + "loss": 1.6751, + "step": 26680 + }, + { + "epoch": 8.189379987722528, + "grad_norm": 0.16385328769683838, + "learning_rate": 8.356578793331243e-06, + "loss": 1.7151, + "step": 26681 + }, + { + "epoch": 8.189686924493554, + "grad_norm": 0.14804807305335999, + "learning_rate": 8.353827938191438e-06, + "loss": 1.6601, + "step": 26682 + }, + { + "epoch": 8.189993861264579, + "grad_norm": 0.1534065157175064, + "learning_rate": 8.351077494623516e-06, + "loss": 1.7664, + "step": 26683 + }, + { + "epoch": 8.190300798035604, + "grad_norm": 0.16167859733104706, + "learning_rate": 8.348327462654659e-06, + "loss": 1.6573, + "step": 26684 + }, + { + "epoch": 8.19060773480663, + "grad_norm": 0.1433487832546234, + "learning_rate": 8.34557784231203e-06, + "loss": 1.6768, + "step": 26685 + }, + { + "epoch": 8.190914671577655, + "grad_norm": 0.1636372059583664, + "learning_rate": 
8.342828633622834e-06, + "loss": 1.6648, + "step": 26686 + }, + { + "epoch": 8.19122160834868, + "grad_norm": 0.13938350975513458, + "learning_rate": 8.340079836614206e-06, + "loss": 1.6511, + "step": 26687 + }, + { + "epoch": 8.191528545119706, + "grad_norm": 0.19098511338233948, + "learning_rate": 8.337331451313346e-06, + "loss": 1.7305, + "step": 26688 + }, + { + "epoch": 8.191835481890731, + "grad_norm": 0.15734615921974182, + "learning_rate": 8.33458347774737e-06, + "loss": 1.6777, + "step": 26689 + }, + { + "epoch": 8.192142418661756, + "grad_norm": 0.1523539125919342, + "learning_rate": 8.331835915943475e-06, + "loss": 1.7173, + "step": 26690 + }, + { + "epoch": 8.192449355432782, + "grad_norm": 0.17726896703243256, + "learning_rate": 8.329088765928799e-06, + "loss": 1.6904, + "step": 26691 + }, + { + "epoch": 8.192756292203805, + "grad_norm": 0.18954375386238098, + "learning_rate": 8.326342027730493e-06, + "loss": 1.7062, + "step": 26692 + }, + { + "epoch": 8.19306322897483, + "grad_norm": 0.21199224889278412, + "learning_rate": 8.323595701375702e-06, + "loss": 1.7747, + "step": 26693 + }, + { + "epoch": 8.193370165745856, + "grad_norm": 0.15305975079536438, + "learning_rate": 8.320849786891566e-06, + "loss": 1.6829, + "step": 26694 + }, + { + "epoch": 8.193677102516881, + "grad_norm": 0.1407271921634674, + "learning_rate": 8.318104284305216e-06, + "loss": 1.6774, + "step": 26695 + }, + { + "epoch": 8.193984039287907, + "grad_norm": 0.15379782021045685, + "learning_rate": 8.315359193643796e-06, + "loss": 1.7037, + "step": 26696 + }, + { + "epoch": 8.194290976058932, + "grad_norm": 0.21377405524253845, + "learning_rate": 8.31261451493443e-06, + "loss": 1.7258, + "step": 26697 + }, + { + "epoch": 8.194597912829957, + "grad_norm": 0.1975884586572647, + "learning_rate": 8.309870248204238e-06, + "loss": 1.718, + "step": 26698 + }, + { + "epoch": 8.194904849600983, + "grad_norm": 0.1985187530517578, + "learning_rate": 8.307126393480341e-06, + "loss": 1.7199, + 
"step": 26699 + }, + { + "epoch": 8.195211786372008, + "grad_norm": 0.17664451897144318, + "learning_rate": 8.304382950789857e-06, + "loss": 1.744, + "step": 26700 + }, + { + "epoch": 8.195518723143033, + "grad_norm": 0.16517753899097443, + "learning_rate": 8.301639920159904e-06, + "loss": 1.7289, + "step": 26701 + }, + { + "epoch": 8.195825659914057, + "grad_norm": 0.15431733429431915, + "learning_rate": 8.29889730161757e-06, + "loss": 1.6854, + "step": 26702 + }, + { + "epoch": 8.196132596685082, + "grad_norm": 0.14390075206756592, + "learning_rate": 8.296155095190005e-06, + "loss": 1.6806, + "step": 26703 + }, + { + "epoch": 8.196439533456108, + "grad_norm": 0.1450011432170868, + "learning_rate": 8.293413300904246e-06, + "loss": 1.6579, + "step": 26704 + }, + { + "epoch": 8.196746470227133, + "grad_norm": 0.20312175154685974, + "learning_rate": 8.290671918787452e-06, + "loss": 1.7053, + "step": 26705 + }, + { + "epoch": 8.197053406998158, + "grad_norm": 0.13979235291481018, + "learning_rate": 8.287930948866656e-06, + "loss": 1.6751, + "step": 26706 + }, + { + "epoch": 8.197360343769184, + "grad_norm": 0.1665562391281128, + "learning_rate": 8.28519039116899e-06, + "loss": 1.7523, + "step": 26707 + }, + { + "epoch": 8.197667280540209, + "grad_norm": 0.15326659381389618, + "learning_rate": 8.282450245721524e-06, + "loss": 1.6788, + "step": 26708 + }, + { + "epoch": 8.197974217311234, + "grad_norm": 0.14121493697166443, + "learning_rate": 8.279710512551331e-06, + "loss": 1.6351, + "step": 26709 + }, + { + "epoch": 8.19828115408226, + "grad_norm": 0.16965799033641815, + "learning_rate": 8.276971191685495e-06, + "loss": 1.7694, + "step": 26710 + }, + { + "epoch": 8.198588090853285, + "grad_norm": 0.21316587924957275, + "learning_rate": 8.274232283151085e-06, + "loss": 1.6922, + "step": 26711 + }, + { + "epoch": 8.19889502762431, + "grad_norm": 0.1613110601902008, + "learning_rate": 8.271493786975165e-06, + "loss": 1.7221, + "step": 26712 + }, + { + "epoch": 
8.199201964395334, + "grad_norm": 0.19140063226222992, + "learning_rate": 8.268755703184804e-06, + "loss": 1.7457, + "step": 26713 + }, + { + "epoch": 8.199508901166359, + "grad_norm": 0.1680840253829956, + "learning_rate": 8.26601803180706e-06, + "loss": 1.6948, + "step": 26714 + }, + { + "epoch": 8.199815837937384, + "grad_norm": 0.17642726004123688, + "learning_rate": 8.263280772868982e-06, + "loss": 1.6996, + "step": 26715 + }, + { + "epoch": 8.20012277470841, + "grad_norm": 0.21370023488998413, + "learning_rate": 8.26054392639763e-06, + "loss": 1.7585, + "step": 26716 + }, + { + "epoch": 8.200429711479435, + "grad_norm": 0.20721369981765747, + "learning_rate": 8.257807492420044e-06, + "loss": 1.7127, + "step": 26717 + }, + { + "epoch": 8.20073664825046, + "grad_norm": 0.14441120624542236, + "learning_rate": 8.255071470963272e-06, + "loss": 1.6627, + "step": 26718 + }, + { + "epoch": 8.201043585021486, + "grad_norm": 0.17547503113746643, + "learning_rate": 8.25233586205434e-06, + "loss": 1.7764, + "step": 26719 + }, + { + "epoch": 8.201350521792511, + "grad_norm": 0.1724909394979477, + "learning_rate": 8.24960066572032e-06, + "loss": 1.6978, + "step": 26720 + }, + { + "epoch": 8.201657458563536, + "grad_norm": 0.16465766727924347, + "learning_rate": 8.246865881988186e-06, + "loss": 1.7302, + "step": 26721 + }, + { + "epoch": 8.201964395334562, + "grad_norm": 0.18594282865524292, + "learning_rate": 8.244131510885023e-06, + "loss": 1.7354, + "step": 26722 + }, + { + "epoch": 8.202271332105587, + "grad_norm": 0.163459911942482, + "learning_rate": 8.241397552437803e-06, + "loss": 1.7069, + "step": 26723 + }, + { + "epoch": 8.20257826887661, + "grad_norm": 0.1712186485528946, + "learning_rate": 8.23866400667358e-06, + "loss": 1.7029, + "step": 26724 + }, + { + "epoch": 8.202885205647636, + "grad_norm": 0.155457004904747, + "learning_rate": 8.235930873619357e-06, + "loss": 1.6806, + "step": 26725 + }, + { + "epoch": 8.203192142418661, + "grad_norm": 
0.19597770273685455, + "learning_rate": 8.233198153302146e-06, + "loss": 1.7271, + "step": 26726 + }, + { + "epoch": 8.203499079189687, + "grad_norm": 0.17909370362758636, + "learning_rate": 8.230465845748946e-06, + "loss": 1.7334, + "step": 26727 + }, + { + "epoch": 8.203806015960712, + "grad_norm": 0.1566748470067978, + "learning_rate": 8.227733950986766e-06, + "loss": 1.7965, + "step": 26728 + }, + { + "epoch": 8.204112952731737, + "grad_norm": 0.23624123632907867, + "learning_rate": 8.225002469042603e-06, + "loss": 1.7154, + "step": 26729 + }, + { + "epoch": 8.204419889502763, + "grad_norm": 0.17100931704044342, + "learning_rate": 8.222271399943448e-06, + "loss": 1.6745, + "step": 26730 + }, + { + "epoch": 8.204726826273788, + "grad_norm": 0.1762385219335556, + "learning_rate": 8.219540743716298e-06, + "loss": 1.7199, + "step": 26731 + }, + { + "epoch": 8.205033763044813, + "grad_norm": 0.19741147756576538, + "learning_rate": 8.216810500388134e-06, + "loss": 1.7582, + "step": 26732 + }, + { + "epoch": 8.205340699815839, + "grad_norm": 0.14669859409332275, + "learning_rate": 8.214080669985941e-06, + "loss": 1.6859, + "step": 26733 + }, + { + "epoch": 8.205647636586862, + "grad_norm": 0.16434574127197266, + "learning_rate": 8.211351252536692e-06, + "loss": 1.7129, + "step": 26734 + }, + { + "epoch": 8.205954573357888, + "grad_norm": 0.17041419446468353, + "learning_rate": 8.208622248067361e-06, + "loss": 1.7145, + "step": 26735 + }, + { + "epoch": 8.206261510128913, + "grad_norm": 0.16507895290851593, + "learning_rate": 8.205893656604907e-06, + "loss": 1.7486, + "step": 26736 + }, + { + "epoch": 8.206568446899938, + "grad_norm": 0.19548171758651733, + "learning_rate": 8.203165478176334e-06, + "loss": 1.7135, + "step": 26737 + }, + { + "epoch": 8.206875383670964, + "grad_norm": 0.16964592039585114, + "learning_rate": 8.200437712808556e-06, + "loss": 1.703, + "step": 26738 + }, + { + "epoch": 8.207182320441989, + "grad_norm": 0.1599748432636261, + "learning_rate": 
8.197710360528571e-06, + "loss": 1.7065, + "step": 26739 + }, + { + "epoch": 8.207489257213014, + "grad_norm": 0.1665380746126175, + "learning_rate": 8.194983421363294e-06, + "loss": 1.6927, + "step": 26740 + }, + { + "epoch": 8.20779619398404, + "grad_norm": 0.13410761952400208, + "learning_rate": 8.192256895339701e-06, + "loss": 1.6373, + "step": 26741 + }, + { + "epoch": 8.208103130755065, + "grad_norm": 0.17461349070072174, + "learning_rate": 8.189530782484733e-06, + "loss": 1.7058, + "step": 26742 + }, + { + "epoch": 8.20841006752609, + "grad_norm": 0.15213793516159058, + "learning_rate": 8.186805082825327e-06, + "loss": 1.6664, + "step": 26743 + }, + { + "epoch": 8.208717004297116, + "grad_norm": 0.17611466348171234, + "learning_rate": 8.184079796388421e-06, + "loss": 1.7029, + "step": 26744 + }, + { + "epoch": 8.20902394106814, + "grad_norm": 0.16301874816417694, + "learning_rate": 8.181354923200945e-06, + "loss": 1.7024, + "step": 26745 + }, + { + "epoch": 8.209330877839164, + "grad_norm": 0.12992535531520844, + "learning_rate": 8.178630463289833e-06, + "loss": 1.6471, + "step": 26746 + }, + { + "epoch": 8.20963781461019, + "grad_norm": 0.1948312669992447, + "learning_rate": 8.175906416682006e-06, + "loss": 1.7359, + "step": 26747 + }, + { + "epoch": 8.209944751381215, + "grad_norm": 0.16086861491203308, + "learning_rate": 8.173182783404387e-06, + "loss": 1.7312, + "step": 26748 + }, + { + "epoch": 8.21025168815224, + "grad_norm": 0.20091786980628967, + "learning_rate": 8.17045956348389e-06, + "loss": 1.7038, + "step": 26749 + }, + { + "epoch": 8.210558624923266, + "grad_norm": 0.18929384648799896, + "learning_rate": 8.16773675694743e-06, + "loss": 1.7129, + "step": 26750 + }, + { + "epoch": 8.210865561694291, + "grad_norm": 0.1536511927843094, + "learning_rate": 8.16501436382191e-06, + "loss": 1.7031, + "step": 26751 + }, + { + "epoch": 8.211172498465316, + "grad_norm": 0.15490883588790894, + "learning_rate": 8.162292384134245e-06, + "loss": 1.6625, + 
"step": 26752 + }, + { + "epoch": 8.211479435236342, + "grad_norm": 0.18852801620960236, + "learning_rate": 8.159570817911311e-06, + "loss": 1.7691, + "step": 26753 + }, + { + "epoch": 8.211786372007367, + "grad_norm": 0.21555860340595245, + "learning_rate": 8.15684966518005e-06, + "loss": 1.7919, + "step": 26754 + }, + { + "epoch": 8.212093308778392, + "grad_norm": 0.19634628295898438, + "learning_rate": 8.154128925967297e-06, + "loss": 1.7174, + "step": 26755 + }, + { + "epoch": 8.212400245549416, + "grad_norm": 0.15788821876049042, + "learning_rate": 8.151408600299998e-06, + "loss": 1.6956, + "step": 26756 + }, + { + "epoch": 8.212707182320441, + "grad_norm": 0.17314517498016357, + "learning_rate": 8.148688688204975e-06, + "loss": 1.75, + "step": 26757 + }, + { + "epoch": 8.213014119091467, + "grad_norm": 0.15606027841567993, + "learning_rate": 8.145969189709158e-06, + "loss": 1.6696, + "step": 26758 + }, + { + "epoch": 8.213321055862492, + "grad_norm": 0.17407195270061493, + "learning_rate": 8.143250104839406e-06, + "loss": 1.7279, + "step": 26759 + }, + { + "epoch": 8.213627992633517, + "grad_norm": 0.1557784378528595, + "learning_rate": 8.140531433622589e-06, + "loss": 1.7221, + "step": 26760 + }, + { + "epoch": 8.213934929404543, + "grad_norm": 0.1544533222913742, + "learning_rate": 8.137813176085574e-06, + "loss": 1.6805, + "step": 26761 + }, + { + "epoch": 8.214241866175568, + "grad_norm": 0.1605178564786911, + "learning_rate": 8.135095332255222e-06, + "loss": 1.7783, + "step": 26762 + }, + { + "epoch": 8.214548802946593, + "grad_norm": 0.14513778686523438, + "learning_rate": 8.1323779021584e-06, + "loss": 1.6933, + "step": 26763 + }, + { + "epoch": 8.214855739717619, + "grad_norm": 0.1282239407300949, + "learning_rate": 8.12966088582196e-06, + "loss": 1.6598, + "step": 26764 + }, + { + "epoch": 8.215162676488644, + "grad_norm": 0.1373436003923416, + "learning_rate": 8.126944283272748e-06, + "loss": 1.6227, + "step": 26765 + }, + { + "epoch": 
8.215469613259668, + "grad_norm": 0.1634049266576767, + "learning_rate": 8.124228094537617e-06, + "loss": 1.7346, + "step": 26766 + }, + { + "epoch": 8.215776550030693, + "grad_norm": 0.16928012669086456, + "learning_rate": 8.12151231964341e-06, + "loss": 1.6958, + "step": 26767 + }, + { + "epoch": 8.216083486801718, + "grad_norm": 0.15764811635017395, + "learning_rate": 8.11879695861696e-06, + "loss": 1.6965, + "step": 26768 + }, + { + "epoch": 8.216390423572744, + "grad_norm": 0.1514546275138855, + "learning_rate": 8.11608201148511e-06, + "loss": 1.6804, + "step": 26769 + }, + { + "epoch": 8.216697360343769, + "grad_norm": 0.17304199934005737, + "learning_rate": 8.113367478274686e-06, + "loss": 1.7869, + "step": 26770 + }, + { + "epoch": 8.217004297114794, + "grad_norm": 0.19664239883422852, + "learning_rate": 8.11065335901251e-06, + "loss": 1.7082, + "step": 26771 + }, + { + "epoch": 8.21731123388582, + "grad_norm": 0.13926036655902863, + "learning_rate": 8.107939653725405e-06, + "loss": 1.6758, + "step": 26772 + }, + { + "epoch": 8.217618170656845, + "grad_norm": 0.14624418318271637, + "learning_rate": 8.10522636244021e-06, + "loss": 1.6716, + "step": 26773 + }, + { + "epoch": 8.21792510742787, + "grad_norm": 0.15462076663970947, + "learning_rate": 8.102513485183704e-06, + "loss": 1.6953, + "step": 26774 + }, + { + "epoch": 8.218232044198896, + "grad_norm": 0.21293844282627106, + "learning_rate": 8.099801021982729e-06, + "loss": 1.69, + "step": 26775 + }, + { + "epoch": 8.218538980969921, + "grad_norm": 0.16696035861968994, + "learning_rate": 8.09708897286408e-06, + "loss": 1.721, + "step": 26776 + }, + { + "epoch": 8.218845917740945, + "grad_norm": 0.1741570085287094, + "learning_rate": 8.094377337854553e-06, + "loss": 1.69, + "step": 26777 + }, + { + "epoch": 8.21915285451197, + "grad_norm": 0.17061090469360352, + "learning_rate": 8.091666116980957e-06, + "loss": 1.6886, + "step": 26778 + }, + { + "epoch": 8.219459791282995, + "grad_norm": 
0.16761218011379242, + "learning_rate": 8.088955310270075e-06, + "loss": 1.6951, + "step": 26779 + }, + { + "epoch": 8.21976672805402, + "grad_norm": 0.21173669397830963, + "learning_rate": 8.086244917748703e-06, + "loss": 1.7714, + "step": 26780 + }, + { + "epoch": 8.220073664825046, + "grad_norm": 0.1629040539264679, + "learning_rate": 8.083534939443626e-06, + "loss": 1.6712, + "step": 26781 + }, + { + "epoch": 8.220380601596071, + "grad_norm": 0.14620709419250488, + "learning_rate": 8.080825375381623e-06, + "loss": 1.6638, + "step": 26782 + }, + { + "epoch": 8.220687538367097, + "grad_norm": 0.16511180996894836, + "learning_rate": 8.078116225589477e-06, + "loss": 1.6739, + "step": 26783 + }, + { + "epoch": 8.220994475138122, + "grad_norm": 0.155776247382164, + "learning_rate": 8.075407490093951e-06, + "loss": 1.7098, + "step": 26784 + }, + { + "epoch": 8.221301411909147, + "grad_norm": 0.18273292481899261, + "learning_rate": 8.072699168921826e-06, + "loss": 1.7595, + "step": 26785 + }, + { + "epoch": 8.221608348680173, + "grad_norm": 0.20691648125648499, + "learning_rate": 8.069991262099862e-06, + "loss": 1.7044, + "step": 26786 + }, + { + "epoch": 8.221915285451198, + "grad_norm": 0.13940884172916412, + "learning_rate": 8.06728376965482e-06, + "loss": 1.6651, + "step": 26787 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.1676037758588791, + "learning_rate": 8.064576691613457e-06, + "loss": 1.7215, + "step": 26788 + }, + { + "epoch": 8.222529158993247, + "grad_norm": 0.18815284967422485, + "learning_rate": 8.06187002800251e-06, + "loss": 1.771, + "step": 26789 + }, + { + "epoch": 8.222836095764272, + "grad_norm": 0.16505572199821472, + "learning_rate": 8.059163778848771e-06, + "loss": 1.7072, + "step": 26790 + }, + { + "epoch": 8.223143032535297, + "grad_norm": 0.15086548030376434, + "learning_rate": 8.056457944178936e-06, + "loss": 1.6874, + "step": 26791 + }, + { + "epoch": 8.223449969306323, + "grad_norm": 0.13147135078907013, + "learning_rate": 
8.053752524019792e-06, + "loss": 1.6604, + "step": 26792 + }, + { + "epoch": 8.223756906077348, + "grad_norm": 0.13695500791072845, + "learning_rate": 8.051047518398024e-06, + "loss": 1.6498, + "step": 26793 + }, + { + "epoch": 8.224063842848373, + "grad_norm": 0.16654162108898163, + "learning_rate": 8.048342927340407e-06, + "loss": 1.6993, + "step": 26794 + }, + { + "epoch": 8.224370779619399, + "grad_norm": 0.15318933129310608, + "learning_rate": 8.045638750873652e-06, + "loss": 1.716, + "step": 26795 + }, + { + "epoch": 8.224677716390424, + "grad_norm": 0.17502783238887787, + "learning_rate": 8.04293498902448e-06, + "loss": 1.6953, + "step": 26796 + }, + { + "epoch": 8.22498465316145, + "grad_norm": 0.17295950651168823, + "learning_rate": 8.040231641819623e-06, + "loss": 1.6794, + "step": 26797 + }, + { + "epoch": 8.225291589932475, + "grad_norm": 0.14702807366847992, + "learning_rate": 8.03752870928579e-06, + "loss": 1.6389, + "step": 26798 + }, + { + "epoch": 8.225598526703498, + "grad_norm": 0.21157263219356537, + "learning_rate": 8.034826191449691e-06, + "loss": 1.6817, + "step": 26799 + }, + { + "epoch": 8.225905463474524, + "grad_norm": 0.1675570011138916, + "learning_rate": 8.03212408833804e-06, + "loss": 1.7636, + "step": 26800 + }, + { + "epoch": 8.226212400245549, + "grad_norm": 0.24485285580158234, + "learning_rate": 8.029422399977531e-06, + "loss": 1.7017, + "step": 26801 + }, + { + "epoch": 8.226519337016574, + "grad_norm": 0.15588007867336273, + "learning_rate": 8.026721126394871e-06, + "loss": 1.6781, + "step": 26802 + }, + { + "epoch": 8.2268262737876, + "grad_norm": 0.16810667514801025, + "learning_rate": 8.024020267616756e-06, + "loss": 1.7046, + "step": 26803 + }, + { + "epoch": 8.227133210558625, + "grad_norm": 0.2029539942741394, + "learning_rate": 8.021319823669875e-06, + "loss": 1.6735, + "step": 26804 + }, + { + "epoch": 8.22744014732965, + "grad_norm": 0.18706166744232178, + "learning_rate": 8.018619794580917e-06, + "loss": 1.6818, + 
"step": 26805 + }, + { + "epoch": 8.227747084100676, + "grad_norm": 0.18221300840377808, + "learning_rate": 8.01592018037655e-06, + "loss": 1.7349, + "step": 26806 + }, + { + "epoch": 8.228054020871701, + "grad_norm": 0.20281676948070526, + "learning_rate": 8.013220981083492e-06, + "loss": 1.6942, + "step": 26807 + }, + { + "epoch": 8.228360957642726, + "grad_norm": 0.16217820346355438, + "learning_rate": 8.01052219672837e-06, + "loss": 1.6693, + "step": 26808 + }, + { + "epoch": 8.22866789441375, + "grad_norm": 0.19438619911670685, + "learning_rate": 8.007823827337901e-06, + "loss": 1.7195, + "step": 26809 + }, + { + "epoch": 8.228974831184775, + "grad_norm": 0.229817733168602, + "learning_rate": 8.005125872938707e-06, + "loss": 1.7621, + "step": 26810 + }, + { + "epoch": 8.2292817679558, + "grad_norm": 0.20305906236171722, + "learning_rate": 8.002428333557488e-06, + "loss": 1.7132, + "step": 26811 + }, + { + "epoch": 8.229588704726826, + "grad_norm": 0.16244050860404968, + "learning_rate": 7.999731209220884e-06, + "loss": 1.729, + "step": 26812 + }, + { + "epoch": 8.229895641497851, + "grad_norm": 0.18119513988494873, + "learning_rate": 7.997034499955552e-06, + "loss": 1.7431, + "step": 26813 + }, + { + "epoch": 8.230202578268877, + "grad_norm": 0.1475009173154831, + "learning_rate": 7.99433820578816e-06, + "loss": 1.7229, + "step": 26814 + }, + { + "epoch": 8.230509515039902, + "grad_norm": 0.16200442612171173, + "learning_rate": 7.991642326745314e-06, + "loss": 1.7491, + "step": 26815 + }, + { + "epoch": 8.230816451810927, + "grad_norm": 0.17432551085948944, + "learning_rate": 7.988946862853686e-06, + "loss": 1.6997, + "step": 26816 + }, + { + "epoch": 8.231123388581953, + "grad_norm": 0.2010595202445984, + "learning_rate": 7.986251814139916e-06, + "loss": 1.795, + "step": 26817 + }, + { + "epoch": 8.231430325352978, + "grad_norm": 0.15220746397972107, + "learning_rate": 7.983557180630625e-06, + "loss": 1.6912, + "step": 26818 + }, + { + "epoch": 
8.231737262124003, + "grad_norm": 0.1524961143732071, + "learning_rate": 7.980862962352454e-06, + "loss": 1.6924, + "step": 26819 + }, + { + "epoch": 8.232044198895027, + "grad_norm": 0.16850624978542328, + "learning_rate": 7.978169159332016e-06, + "loss": 1.7111, + "step": 26820 + }, + { + "epoch": 8.232351135666052, + "grad_norm": 0.19621838629245758, + "learning_rate": 7.975475771595947e-06, + "loss": 1.7237, + "step": 26821 + }, + { + "epoch": 8.232658072437077, + "grad_norm": 0.23287613689899445, + "learning_rate": 7.972782799170858e-06, + "loss": 1.7222, + "step": 26822 + }, + { + "epoch": 8.232965009208103, + "grad_norm": 0.15631796419620514, + "learning_rate": 7.970090242083344e-06, + "loss": 1.7252, + "step": 26823 + }, + { + "epoch": 8.233271945979128, + "grad_norm": 0.17921209335327148, + "learning_rate": 7.967398100360062e-06, + "loss": 1.7018, + "step": 26824 + }, + { + "epoch": 8.233578882750153, + "grad_norm": 0.16767734289169312, + "learning_rate": 7.964706374027564e-06, + "loss": 1.7457, + "step": 26825 + }, + { + "epoch": 8.233885819521179, + "grad_norm": 0.15360240638256073, + "learning_rate": 7.9620150631125e-06, + "loss": 1.6886, + "step": 26826 + }, + { + "epoch": 8.234192756292204, + "grad_norm": 0.17534345388412476, + "learning_rate": 7.959324167641413e-06, + "loss": 1.7167, + "step": 26827 + }, + { + "epoch": 8.23449969306323, + "grad_norm": 0.17453409731388092, + "learning_rate": 7.956633687640941e-06, + "loss": 1.7468, + "step": 26828 + }, + { + "epoch": 8.234806629834255, + "grad_norm": 0.1416994333267212, + "learning_rate": 7.953943623137654e-06, + "loss": 1.6991, + "step": 26829 + }, + { + "epoch": 8.23511356660528, + "grad_norm": 0.14629559218883514, + "learning_rate": 7.951253974158147e-06, + "loss": 1.6891, + "step": 26830 + }, + { + "epoch": 8.235420503376304, + "grad_norm": 0.15972918272018433, + "learning_rate": 7.948564740728998e-06, + "loss": 1.711, + "step": 26831 + }, + { + "epoch": 8.235727440147329, + "grad_norm": 
0.184038445353508, + "learning_rate": 7.945875922876761e-06, + "loss": 1.7481, + "step": 26832 + }, + { + "epoch": 8.236034376918354, + "grad_norm": 0.1788245588541031, + "learning_rate": 7.943187520628037e-06, + "loss": 1.7744, + "step": 26833 + }, + { + "epoch": 8.23634131368938, + "grad_norm": 0.18042324483394623, + "learning_rate": 7.940499534009382e-06, + "loss": 1.6905, + "step": 26834 + }, + { + "epoch": 8.236648250460405, + "grad_norm": 0.16115914285182953, + "learning_rate": 7.937811963047364e-06, + "loss": 1.6923, + "step": 26835 + }, + { + "epoch": 8.23695518723143, + "grad_norm": 0.18805812299251556, + "learning_rate": 7.935124807768546e-06, + "loss": 1.7636, + "step": 26836 + }, + { + "epoch": 8.237262124002456, + "grad_norm": 0.14013023674488068, + "learning_rate": 7.932438068199477e-06, + "loss": 1.657, + "step": 26837 + }, + { + "epoch": 8.237569060773481, + "grad_norm": 0.17245794832706451, + "learning_rate": 7.929751744366709e-06, + "loss": 1.7162, + "step": 26838 + }, + { + "epoch": 8.237875997544506, + "grad_norm": 0.20234355330467224, + "learning_rate": 7.927065836296793e-06, + "loss": 1.741, + "step": 26839 + }, + { + "epoch": 8.238182934315532, + "grad_norm": 0.1728539764881134, + "learning_rate": 7.924380344016264e-06, + "loss": 1.7037, + "step": 26840 + }, + { + "epoch": 8.238489871086557, + "grad_norm": 0.20881959795951843, + "learning_rate": 7.921695267551688e-06, + "loss": 1.7446, + "step": 26841 + }, + { + "epoch": 8.23879680785758, + "grad_norm": 0.15921615064144135, + "learning_rate": 7.919010606929562e-06, + "loss": 1.6777, + "step": 26842 + }, + { + "epoch": 8.239103744628606, + "grad_norm": 0.15142741799354553, + "learning_rate": 7.916326362176462e-06, + "loss": 1.6647, + "step": 26843 + }, + { + "epoch": 8.239410681399631, + "grad_norm": 0.14777293801307678, + "learning_rate": 7.913642533318865e-06, + "loss": 1.7008, + "step": 26844 + }, + { + "epoch": 8.239717618170657, + "grad_norm": 0.14506451785564423, + "learning_rate": 
7.910959120383332e-06, + "loss": 1.7156, + "step": 26845 + }, + { + "epoch": 8.240024554941682, + "grad_norm": 0.17617642879486084, + "learning_rate": 7.908276123396369e-06, + "loss": 1.707, + "step": 26846 + }, + { + "epoch": 8.240331491712707, + "grad_norm": 0.1640050709247589, + "learning_rate": 7.905593542384493e-06, + "loss": 1.6965, + "step": 26847 + }, + { + "epoch": 8.240638428483733, + "grad_norm": 0.2035178244113922, + "learning_rate": 7.902911377374229e-06, + "loss": 1.7679, + "step": 26848 + }, + { + "epoch": 8.240945365254758, + "grad_norm": 0.16591937839984894, + "learning_rate": 7.900229628392041e-06, + "loss": 1.705, + "step": 26849 + }, + { + "epoch": 8.241252302025783, + "grad_norm": 0.1770060807466507, + "learning_rate": 7.897548295464474e-06, + "loss": 1.6812, + "step": 26850 + }, + { + "epoch": 8.241559238796809, + "grad_norm": 0.1637604683637619, + "learning_rate": 7.89486737861801e-06, + "loss": 1.718, + "step": 26851 + }, + { + "epoch": 8.241866175567832, + "grad_norm": 0.1458534151315689, + "learning_rate": 7.892186877879148e-06, + "loss": 1.6834, + "step": 26852 + }, + { + "epoch": 8.242173112338858, + "grad_norm": 0.14899462461471558, + "learning_rate": 7.889506793274371e-06, + "loss": 1.6815, + "step": 26853 + }, + { + "epoch": 8.242480049109883, + "grad_norm": 0.16069386899471283, + "learning_rate": 7.88682712483017e-06, + "loss": 1.7522, + "step": 26854 + }, + { + "epoch": 8.242786985880908, + "grad_norm": 0.17499712109565735, + "learning_rate": 7.884147872573034e-06, + "loss": 1.6805, + "step": 26855 + }, + { + "epoch": 8.243093922651934, + "grad_norm": 0.1455364227294922, + "learning_rate": 7.881469036529427e-06, + "loss": 1.6797, + "step": 26856 + }, + { + "epoch": 8.243400859422959, + "grad_norm": 0.2292124629020691, + "learning_rate": 7.878790616725818e-06, + "loss": 1.6923, + "step": 26857 + }, + { + "epoch": 8.243707796193984, + "grad_norm": 0.17365983128547668, + "learning_rate": 7.876112613188713e-06, + "loss": 1.713, + 
"step": 26858 + }, + { + "epoch": 8.24401473296501, + "grad_norm": 0.17498542368412018, + "learning_rate": 7.873435025944525e-06, + "loss": 1.6834, + "step": 26859 + }, + { + "epoch": 8.244321669736035, + "grad_norm": 0.19340896606445312, + "learning_rate": 7.870757855019772e-06, + "loss": 1.7246, + "step": 26860 + }, + { + "epoch": 8.24462860650706, + "grad_norm": 0.16443613171577454, + "learning_rate": 7.868081100440855e-06, + "loss": 1.7217, + "step": 26861 + }, + { + "epoch": 8.244935543278086, + "grad_norm": 0.1470339596271515, + "learning_rate": 7.865404762234268e-06, + "loss": 1.6504, + "step": 26862 + }, + { + "epoch": 8.245242480049109, + "grad_norm": 0.14689552783966064, + "learning_rate": 7.862728840426453e-06, + "loss": 1.7231, + "step": 26863 + }, + { + "epoch": 8.245549416820134, + "grad_norm": 0.25354984402656555, + "learning_rate": 7.860053335043843e-06, + "loss": 1.7951, + "step": 26864 + }, + { + "epoch": 8.24585635359116, + "grad_norm": 0.1774766445159912, + "learning_rate": 7.857378246112896e-06, + "loss": 1.6702, + "step": 26865 + }, + { + "epoch": 8.246163290362185, + "grad_norm": 0.16365554928779602, + "learning_rate": 7.854703573660015e-06, + "loss": 1.6945, + "step": 26866 + }, + { + "epoch": 8.24647022713321, + "grad_norm": 0.15043000876903534, + "learning_rate": 7.852029317711669e-06, + "loss": 1.6341, + "step": 26867 + }, + { + "epoch": 8.246777163904236, + "grad_norm": 0.18268270790576935, + "learning_rate": 7.849355478294274e-06, + "loss": 1.7246, + "step": 26868 + }, + { + "epoch": 8.247084100675261, + "grad_norm": 0.2022860199213028, + "learning_rate": 7.84668205543425e-06, + "loss": 1.7527, + "step": 26869 + }, + { + "epoch": 8.247391037446286, + "grad_norm": 0.15406467020511627, + "learning_rate": 7.844009049158024e-06, + "loss": 1.6678, + "step": 26870 + }, + { + "epoch": 8.247697974217312, + "grad_norm": 0.168084055185318, + "learning_rate": 7.841336459492005e-06, + "loss": 1.7018, + "step": 26871 + }, + { + "epoch": 
8.248004910988337, + "grad_norm": 0.15184715390205383, + "learning_rate": 7.83866428646261e-06, + "loss": 1.6636, + "step": 26872 + }, + { + "epoch": 8.248311847759362, + "grad_norm": 0.18516378104686737, + "learning_rate": 7.835992530096248e-06, + "loss": 1.7746, + "step": 26873 + }, + { + "epoch": 8.248618784530386, + "grad_norm": 0.22552374005317688, + "learning_rate": 7.833321190419313e-06, + "loss": 1.7307, + "step": 26874 + }, + { + "epoch": 8.248925721301411, + "grad_norm": 0.14845159649848938, + "learning_rate": 7.830650267458228e-06, + "loss": 1.6831, + "step": 26875 + }, + { + "epoch": 8.249232658072437, + "grad_norm": 0.17764155566692352, + "learning_rate": 7.827979761239356e-06, + "loss": 1.7569, + "step": 26876 + }, + { + "epoch": 8.249539594843462, + "grad_norm": 0.13525958359241486, + "learning_rate": 7.825309671789128e-06, + "loss": 1.6447, + "step": 26877 + }, + { + "epoch": 8.249846531614487, + "grad_norm": 0.1541098952293396, + "learning_rate": 7.822639999133885e-06, + "loss": 1.7054, + "step": 26878 + }, + { + "epoch": 8.250153468385513, + "grad_norm": 0.1462734043598175, + "learning_rate": 7.819970743300042e-06, + "loss": 1.6801, + "step": 26879 + }, + { + "epoch": 8.250460405156538, + "grad_norm": 0.16271938383579254, + "learning_rate": 7.817301904313979e-06, + "loss": 1.7342, + "step": 26880 + }, + { + "epoch": 8.250767341927563, + "grad_norm": 0.18730363249778748, + "learning_rate": 7.814633482202055e-06, + "loss": 1.7656, + "step": 26881 + }, + { + "epoch": 8.251074278698589, + "grad_norm": 0.1343161165714264, + "learning_rate": 7.811965476990663e-06, + "loss": 1.6738, + "step": 26882 + }, + { + "epoch": 8.251381215469614, + "grad_norm": 0.18782657384872437, + "learning_rate": 7.809297888706135e-06, + "loss": 1.6946, + "step": 26883 + }, + { + "epoch": 8.25168815224064, + "grad_norm": 0.16619306802749634, + "learning_rate": 7.806630717374862e-06, + "loss": 1.7024, + "step": 26884 + }, + { + "epoch": 8.251995089011663, + "grad_norm": 
0.18570290505886078, + "learning_rate": 7.803963963023192e-06, + "loss": 1.7602, + "step": 26885 + }, + { + "epoch": 8.252302025782688, + "grad_norm": 0.19790740311145782, + "learning_rate": 7.80129762567749e-06, + "loss": 1.6965, + "step": 26886 + }, + { + "epoch": 8.252608962553714, + "grad_norm": 0.17269279062747955, + "learning_rate": 7.79863170536409e-06, + "loss": 1.7585, + "step": 26887 + }, + { + "epoch": 8.252915899324739, + "grad_norm": 0.17961835861206055, + "learning_rate": 7.79596620210935e-06, + "loss": 1.6992, + "step": 26888 + }, + { + "epoch": 8.253222836095764, + "grad_norm": 0.15848924219608307, + "learning_rate": 7.793301115939611e-06, + "loss": 1.6849, + "step": 26889 + }, + { + "epoch": 8.25352977286679, + "grad_norm": 0.16328901052474976, + "learning_rate": 7.790636446881205e-06, + "loss": 1.7049, + "step": 26890 + }, + { + "epoch": 8.253836709637815, + "grad_norm": 0.15410196781158447, + "learning_rate": 7.787972194960463e-06, + "loss": 1.6764, + "step": 26891 + }, + { + "epoch": 8.25414364640884, + "grad_norm": 0.15541456639766693, + "learning_rate": 7.78530836020374e-06, + "loss": 1.6692, + "step": 26892 + }, + { + "epoch": 8.254450583179866, + "grad_norm": 0.1663745492696762, + "learning_rate": 7.782644942637318e-06, + "loss": 1.708, + "step": 26893 + }, + { + "epoch": 8.254757519950891, + "grad_norm": 0.2212733030319214, + "learning_rate": 7.779981942287567e-06, + "loss": 1.7978, + "step": 26894 + }, + { + "epoch": 8.255064456721914, + "grad_norm": 0.15269914269447327, + "learning_rate": 7.777319359180756e-06, + "loss": 1.6688, + "step": 26895 + }, + { + "epoch": 8.25537139349294, + "grad_norm": 0.18167565762996674, + "learning_rate": 7.774657193343238e-06, + "loss": 1.7394, + "step": 26896 + }, + { + "epoch": 8.255678330263965, + "grad_norm": 0.18649235367774963, + "learning_rate": 7.771995444801306e-06, + "loss": 1.7438, + "step": 26897 + }, + { + "epoch": 8.25598526703499, + "grad_norm": 0.14753280580043793, + "learning_rate": 
7.769334113581267e-06, + "loss": 1.6624, + "step": 26898 + }, + { + "epoch": 8.256292203806016, + "grad_norm": 0.1815260797739029, + "learning_rate": 7.76667319970943e-06, + "loss": 1.7091, + "step": 26899 + }, + { + "epoch": 8.256599140577041, + "grad_norm": 0.18099220097064972, + "learning_rate": 7.764012703212059e-06, + "loss": 1.7285, + "step": 26900 + }, + { + "epoch": 8.256906077348066, + "grad_norm": 0.15976406633853912, + "learning_rate": 7.76135262411548e-06, + "loss": 1.7038, + "step": 26901 + }, + { + "epoch": 8.257213014119092, + "grad_norm": 0.20424988865852356, + "learning_rate": 7.758692962445974e-06, + "loss": 1.7398, + "step": 26902 + }, + { + "epoch": 8.257519950890117, + "grad_norm": 0.17021317780017853, + "learning_rate": 7.756033718229816e-06, + "loss": 1.7422, + "step": 26903 + }, + { + "epoch": 8.257826887661142, + "grad_norm": 0.2599583566188812, + "learning_rate": 7.753374891493298e-06, + "loss": 1.6943, + "step": 26904 + }, + { + "epoch": 8.258133824432168, + "grad_norm": 0.16305646300315857, + "learning_rate": 7.750716482262693e-06, + "loss": 1.7129, + "step": 26905 + }, + { + "epoch": 8.258440761203191, + "grad_norm": 0.136509507894516, + "learning_rate": 7.74805849056427e-06, + "loss": 1.666, + "step": 26906 + }, + { + "epoch": 8.258747697974217, + "grad_norm": 0.14928071200847626, + "learning_rate": 7.745400916424294e-06, + "loss": 1.6842, + "step": 26907 + }, + { + "epoch": 8.259054634745242, + "grad_norm": 0.20410865545272827, + "learning_rate": 7.74274375986902e-06, + "loss": 1.7376, + "step": 26908 + }, + { + "epoch": 8.259361571516267, + "grad_norm": 0.16844697296619415, + "learning_rate": 7.740087020924746e-06, + "loss": 1.7125, + "step": 26909 + }, + { + "epoch": 8.259668508287293, + "grad_norm": 0.1874905675649643, + "learning_rate": 7.737430699617681e-06, + "loss": 1.7534, + "step": 26910 + }, + { + "epoch": 8.259975445058318, + "grad_norm": 0.15867100656032562, + "learning_rate": 7.734774795974114e-06, + "loss": 1.7329, + 
"step": 26911 + }, + { + "epoch": 8.260282381829343, + "grad_norm": 0.14987660944461823, + "learning_rate": 7.732119310020258e-06, + "loss": 1.7038, + "step": 26912 + }, + { + "epoch": 8.260589318600369, + "grad_norm": 0.259883314371109, + "learning_rate": 7.729464241782381e-06, + "loss": 1.7677, + "step": 26913 + }, + { + "epoch": 8.260896255371394, + "grad_norm": 0.2080366462469101, + "learning_rate": 7.726809591286716e-06, + "loss": 1.7662, + "step": 26914 + }, + { + "epoch": 8.26120319214242, + "grad_norm": 0.1707276701927185, + "learning_rate": 7.724155358559492e-06, + "loss": 1.671, + "step": 26915 + }, + { + "epoch": 8.261510128913443, + "grad_norm": 0.17241668701171875, + "learning_rate": 7.721501543626958e-06, + "loss": 1.7227, + "step": 26916 + }, + { + "epoch": 8.261817065684468, + "grad_norm": 0.18578803539276123, + "learning_rate": 7.718848146515301e-06, + "loss": 1.6962, + "step": 26917 + }, + { + "epoch": 8.262124002455494, + "grad_norm": 0.16692428290843964, + "learning_rate": 7.716195167250778e-06, + "loss": 1.6918, + "step": 26918 + }, + { + "epoch": 8.262430939226519, + "grad_norm": 0.18908677995204926, + "learning_rate": 7.713542605859602e-06, + "loss": 1.7271, + "step": 26919 + }, + { + "epoch": 8.262737875997544, + "grad_norm": 0.2003175914287567, + "learning_rate": 7.710890462367981e-06, + "loss": 1.729, + "step": 26920 + }, + { + "epoch": 8.26304481276857, + "grad_norm": 0.16058455407619476, + "learning_rate": 7.708238736802125e-06, + "loss": 1.671, + "step": 26921 + }, + { + "epoch": 8.263351749539595, + "grad_norm": 0.1803000271320343, + "learning_rate": 7.705587429188244e-06, + "loss": 1.7582, + "step": 26922 + }, + { + "epoch": 8.26365868631062, + "grad_norm": 0.218659445643425, + "learning_rate": 7.70293653955254e-06, + "loss": 1.7431, + "step": 26923 + }, + { + "epoch": 8.263965623081646, + "grad_norm": 0.13701553642749786, + "learning_rate": 7.700286067921204e-06, + "loss": 1.6806, + "step": 26924 + }, + { + "epoch": 
8.264272559852671, + "grad_norm": 0.15342164039611816, + "learning_rate": 7.697636014320436e-06, + "loss": 1.6501, + "step": 26925 + }, + { + "epoch": 8.264579496623696, + "grad_norm": 0.18738442659378052, + "learning_rate": 7.69498637877642e-06, + "loss": 1.7032, + "step": 26926 + }, + { + "epoch": 8.26488643339472, + "grad_norm": 0.14805950224399567, + "learning_rate": 7.692337161315338e-06, + "loss": 1.6641, + "step": 26927 + }, + { + "epoch": 8.265193370165745, + "grad_norm": 0.18155299127101898, + "learning_rate": 7.689688361963398e-06, + "loss": 1.6967, + "step": 26928 + }, + { + "epoch": 8.26550030693677, + "grad_norm": 0.13954955339431763, + "learning_rate": 7.68703998074673e-06, + "loss": 1.6865, + "step": 26929 + }, + { + "epoch": 8.265807243707796, + "grad_norm": 0.1464248150587082, + "learning_rate": 7.684392017691549e-06, + "loss": 1.6702, + "step": 26930 + }, + { + "epoch": 8.266114180478821, + "grad_norm": 0.16407039761543274, + "learning_rate": 7.68174447282401e-06, + "loss": 1.7265, + "step": 26931 + }, + { + "epoch": 8.266421117249847, + "grad_norm": 0.13243085145950317, + "learning_rate": 7.679097346170272e-06, + "loss": 1.67, + "step": 26932 + }, + { + "epoch": 8.266728054020872, + "grad_norm": 0.18284925818443298, + "learning_rate": 7.67645063775651e-06, + "loss": 1.7524, + "step": 26933 + }, + { + "epoch": 8.267034990791897, + "grad_norm": 0.16042175889015198, + "learning_rate": 7.673804347608849e-06, + "loss": 1.7244, + "step": 26934 + }, + { + "epoch": 8.267341927562923, + "grad_norm": 0.18213023245334625, + "learning_rate": 7.67115847575347e-06, + "loss": 1.7241, + "step": 26935 + }, + { + "epoch": 8.267648864333948, + "grad_norm": 0.1590288132429123, + "learning_rate": 7.668513022216517e-06, + "loss": 1.7056, + "step": 26936 + }, + { + "epoch": 8.267955801104973, + "grad_norm": 0.17236095666885376, + "learning_rate": 7.665867987024122e-06, + "loss": 1.7251, + "step": 26937 + }, + { + "epoch": 8.268262737875997, + "grad_norm": 
0.14264018833637238, + "learning_rate": 7.663223370202439e-06, + "loss": 1.6672, + "step": 26938 + }, + { + "epoch": 8.268569674647022, + "grad_norm": 0.15768232941627502, + "learning_rate": 7.660579171777599e-06, + "loss": 1.6846, + "step": 26939 + }, + { + "epoch": 8.268876611418047, + "grad_norm": 0.12978656589984894, + "learning_rate": 7.657935391775727e-06, + "loss": 1.6615, + "step": 26940 + }, + { + "epoch": 8.269183548189073, + "grad_norm": 0.18869580328464508, + "learning_rate": 7.655292030222955e-06, + "loss": 1.7056, + "step": 26941 + }, + { + "epoch": 8.269490484960098, + "grad_norm": 0.16662544012069702, + "learning_rate": 7.652649087145409e-06, + "loss": 1.7559, + "step": 26942 + }, + { + "epoch": 8.269797421731123, + "grad_norm": 0.20138496160507202, + "learning_rate": 7.650006562569201e-06, + "loss": 1.7428, + "step": 26943 + }, + { + "epoch": 8.270104358502149, + "grad_norm": 0.16201090812683105, + "learning_rate": 7.647364456520439e-06, + "loss": 1.7456, + "step": 26944 + }, + { + "epoch": 8.270411295273174, + "grad_norm": 0.16562269628047943, + "learning_rate": 7.644722769025275e-06, + "loss": 1.7282, + "step": 26945 + }, + { + "epoch": 8.2707182320442, + "grad_norm": 0.1434047371149063, + "learning_rate": 7.642081500109754e-06, + "loss": 1.6959, + "step": 26946 + }, + { + "epoch": 8.271025168815225, + "grad_norm": 0.1424918919801712, + "learning_rate": 7.63944064980004e-06, + "loss": 1.7133, + "step": 26947 + }, + { + "epoch": 8.27133210558625, + "grad_norm": 0.23540155589580536, + "learning_rate": 7.636800218122176e-06, + "loss": 1.7156, + "step": 26948 + }, + { + "epoch": 8.271639042357274, + "grad_norm": 0.1890154927968979, + "learning_rate": 7.634160205102292e-06, + "loss": 1.7452, + "step": 26949 + }, + { + "epoch": 8.271945979128299, + "grad_norm": 0.1555023491382599, + "learning_rate": 7.631520610766486e-06, + "loss": 1.7096, + "step": 26950 + }, + { + "epoch": 8.272252915899324, + "grad_norm": 0.16713875532150269, + "learning_rate": 
7.628881435140794e-06, + "loss": 1.6832, + "step": 26951 + }, + { + "epoch": 8.27255985267035, + "grad_norm": 0.18925394117832184, + "learning_rate": 7.626242678251349e-06, + "loss": 1.7755, + "step": 26952 + }, + { + "epoch": 8.272866789441375, + "grad_norm": 0.19905491173267365, + "learning_rate": 7.6236043401242074e-06, + "loss": 1.6915, + "step": 26953 + }, + { + "epoch": 8.2731737262124, + "grad_norm": 0.13694030046463013, + "learning_rate": 7.620966420785447e-06, + "loss": 1.6935, + "step": 26954 + }, + { + "epoch": 8.273480662983426, + "grad_norm": 0.1292782723903656, + "learning_rate": 7.61832892026113e-06, + "loss": 1.6823, + "step": 26955 + }, + { + "epoch": 8.273787599754451, + "grad_norm": 0.15123988687992096, + "learning_rate": 7.615691838577333e-06, + "loss": 1.6807, + "step": 26956 + }, + { + "epoch": 8.274094536525476, + "grad_norm": 0.14225423336029053, + "learning_rate": 7.6130551757601084e-06, + "loss": 1.6616, + "step": 26957 + }, + { + "epoch": 8.274401473296502, + "grad_norm": 0.15328221023082733, + "learning_rate": 7.610418931835517e-06, + "loss": 1.7211, + "step": 26958 + }, + { + "epoch": 8.274708410067525, + "grad_norm": 0.168446883559227, + "learning_rate": 7.6077831068296134e-06, + "loss": 1.7211, + "step": 26959 + }, + { + "epoch": 8.27501534683855, + "grad_norm": 0.1877220869064331, + "learning_rate": 7.6051477007684444e-06, + "loss": 1.7139, + "step": 26960 + }, + { + "epoch": 8.275322283609576, + "grad_norm": 0.14273744821548462, + "learning_rate": 7.602512713678039e-06, + "loss": 1.6996, + "step": 26961 + }, + { + "epoch": 8.275629220380601, + "grad_norm": 0.1611991822719574, + "learning_rate": 7.599878145584477e-06, + "loss": 1.6837, + "step": 26962 + }, + { + "epoch": 8.275936157151627, + "grad_norm": 0.13847516477108002, + "learning_rate": 7.597243996513747e-06, + "loss": 1.6449, + "step": 26963 + }, + { + "epoch": 8.276243093922652, + "grad_norm": 0.16816900670528412, + "learning_rate": 7.59461026649193e-06, + "loss": 1.747, + 
"step": 26964 + }, + { + "epoch": 8.276550030693677, + "grad_norm": 0.15942460298538208, + "learning_rate": 7.5919769555450046e-06, + "loss": 1.7461, + "step": 26965 + }, + { + "epoch": 8.276856967464703, + "grad_norm": 0.16706149280071259, + "learning_rate": 7.589344063699033e-06, + "loss": 1.7136, + "step": 26966 + }, + { + "epoch": 8.277163904235728, + "grad_norm": 0.16727334260940552, + "learning_rate": 7.586711590980028e-06, + "loss": 1.7186, + "step": 26967 + }, + { + "epoch": 8.277470841006753, + "grad_norm": 0.1510261744260788, + "learning_rate": 7.5840795374139795e-06, + "loss": 1.6795, + "step": 26968 + }, + { + "epoch": 8.277777777777779, + "grad_norm": 0.1705521196126938, + "learning_rate": 7.581447903026939e-06, + "loss": 1.6903, + "step": 26969 + }, + { + "epoch": 8.278084714548802, + "grad_norm": 0.15767472982406616, + "learning_rate": 7.57881668784487e-06, + "loss": 1.7264, + "step": 26970 + }, + { + "epoch": 8.278391651319827, + "grad_norm": 0.15771441161632538, + "learning_rate": 7.576185891893805e-06, + "loss": 1.7091, + "step": 26971 + }, + { + "epoch": 8.278698588090853, + "grad_norm": 0.22973434627056122, + "learning_rate": 7.5735555151997425e-06, + "loss": 1.7357, + "step": 26972 + }, + { + "epoch": 8.279005524861878, + "grad_norm": 0.15931910276412964, + "learning_rate": 7.570925557788672e-06, + "loss": 1.7026, + "step": 26973 + }, + { + "epoch": 8.279312461632903, + "grad_norm": 0.1451634019613266, + "learning_rate": 7.568296019686583e-06, + "loss": 1.6824, + "step": 26974 + }, + { + "epoch": 8.279619398403929, + "grad_norm": 0.14617015421390533, + "learning_rate": 7.56566690091946e-06, + "loss": 1.677, + "step": 26975 + }, + { + "epoch": 8.279926335174954, + "grad_norm": 0.14465895295143127, + "learning_rate": 7.5630382015132895e-06, + "loss": 1.7193, + "step": 26976 + }, + { + "epoch": 8.28023327194598, + "grad_norm": 0.1751926839351654, + "learning_rate": 7.560409921494044e-06, + "loss": 1.7366, + "step": 26977 + }, + { + "epoch": 
8.280540208717005, + "grad_norm": 0.1478777974843979, + "learning_rate": 7.557782060887697e-06, + "loss": 1.6948, + "step": 26978 + }, + { + "epoch": 8.28084714548803, + "grad_norm": 0.25690537691116333, + "learning_rate": 7.555154619720245e-06, + "loss": 1.7284, + "step": 26979 + }, + { + "epoch": 8.281154082259055, + "grad_norm": 0.1380864977836609, + "learning_rate": 7.552527598017611e-06, + "loss": 1.6753, + "step": 26980 + }, + { + "epoch": 8.281461019030079, + "grad_norm": 0.21658651530742645, + "learning_rate": 7.5499009958057975e-06, + "loss": 1.8076, + "step": 26981 + }, + { + "epoch": 8.281767955801104, + "grad_norm": 0.16225802898406982, + "learning_rate": 7.547274813110727e-06, + "loss": 1.6716, + "step": 26982 + }, + { + "epoch": 8.28207489257213, + "grad_norm": 0.18264736235141754, + "learning_rate": 7.544649049958375e-06, + "loss": 1.7241, + "step": 26983 + }, + { + "epoch": 8.282381829343155, + "grad_norm": 0.17512252926826477, + "learning_rate": 7.542023706374695e-06, + "loss": 1.6709, + "step": 26984 + }, + { + "epoch": 8.28268876611418, + "grad_norm": 0.16799452900886536, + "learning_rate": 7.5393987823856035e-06, + "loss": 1.7333, + "step": 26985 + }, + { + "epoch": 8.282995702885206, + "grad_norm": 0.1569952517747879, + "learning_rate": 7.5367742780170835e-06, + "loss": 1.6701, + "step": 26986 + }, + { + "epoch": 8.283302639656231, + "grad_norm": 0.17452387511730194, + "learning_rate": 7.534150193295026e-06, + "loss": 1.6843, + "step": 26987 + }, + { + "epoch": 8.283609576427256, + "grad_norm": 0.1564214676618576, + "learning_rate": 7.531526528245392e-06, + "loss": 1.7154, + "step": 26988 + }, + { + "epoch": 8.283916513198282, + "grad_norm": 0.14093104004859924, + "learning_rate": 7.528903282894107e-06, + "loss": 1.6448, + "step": 26989 + }, + { + "epoch": 8.284223449969307, + "grad_norm": 0.2950015664100647, + "learning_rate": 7.526280457267093e-06, + "loss": 1.7657, + "step": 26990 + }, + { + "epoch": 8.284530386740332, + "grad_norm": 
0.1342417150735855, + "learning_rate": 7.5236580513902756e-06, + "loss": 1.6761, + "step": 26991 + }, + { + "epoch": 8.284837323511356, + "grad_norm": 0.16559085249900818, + "learning_rate": 7.52103606528956e-06, + "loss": 1.7029, + "step": 26992 + }, + { + "epoch": 8.285144260282381, + "grad_norm": 0.14937730133533478, + "learning_rate": 7.5184144989908665e-06, + "loss": 1.6848, + "step": 26993 + }, + { + "epoch": 8.285451197053407, + "grad_norm": 0.14847339689731598, + "learning_rate": 7.515793352520095e-06, + "loss": 1.6735, + "step": 26994 + }, + { + "epoch": 8.285758133824432, + "grad_norm": 0.1866399198770523, + "learning_rate": 7.513172625903148e-06, + "loss": 1.6553, + "step": 26995 + }, + { + "epoch": 8.286065070595457, + "grad_norm": 0.15781863033771515, + "learning_rate": 7.510552319165953e-06, + "loss": 1.699, + "step": 26996 + }, + { + "epoch": 8.286372007366483, + "grad_norm": 0.1402381956577301, + "learning_rate": 7.507932432334358e-06, + "loss": 1.6778, + "step": 26997 + }, + { + "epoch": 8.286678944137508, + "grad_norm": 0.16515657305717468, + "learning_rate": 7.505312965434308e-06, + "loss": 1.6834, + "step": 26998 + }, + { + "epoch": 8.286985880908533, + "grad_norm": 0.16752316057682037, + "learning_rate": 7.502693918491638e-06, + "loss": 1.7714, + "step": 26999 + }, + { + "epoch": 8.287292817679559, + "grad_norm": 0.17935164272785187, + "learning_rate": 7.500075291532266e-06, + "loss": 1.6858, + "step": 27000 + }, + { + "epoch": 8.287599754450584, + "grad_norm": 0.1805913746356964, + "learning_rate": 7.497457084582065e-06, + "loss": 1.7451, + "step": 27001 + }, + { + "epoch": 8.287906691221608, + "grad_norm": 0.15834343433380127, + "learning_rate": 7.494839297666889e-06, + "loss": 1.6675, + "step": 27002 + }, + { + "epoch": 8.288213627992633, + "grad_norm": 0.18627049028873444, + "learning_rate": 7.492221930812648e-06, + "loss": 1.7207, + "step": 27003 + }, + { + "epoch": 8.288520564763658, + "grad_norm": 0.15027324855327606, + "learning_rate": 
7.489604984045157e-06, + "loss": 1.686, + "step": 27004 + }, + { + "epoch": 8.288827501534684, + "grad_norm": 0.14771342277526855, + "learning_rate": 7.48698845739032e-06, + "loss": 1.6647, + "step": 27005 + }, + { + "epoch": 8.289134438305709, + "grad_norm": 0.14141151309013367, + "learning_rate": 7.48437235087398e-06, + "loss": 1.7005, + "step": 27006 + }, + { + "epoch": 8.289441375076734, + "grad_norm": 0.14843317866325378, + "learning_rate": 7.481756664521994e-06, + "loss": 1.6768, + "step": 27007 + }, + { + "epoch": 8.28974831184776, + "grad_norm": 0.21505968272686005, + "learning_rate": 7.479141398360206e-06, + "loss": 1.764, + "step": 27008 + }, + { + "epoch": 8.290055248618785, + "grad_norm": 0.1906919926404953, + "learning_rate": 7.476526552414464e-06, + "loss": 1.7079, + "step": 27009 + }, + { + "epoch": 8.29036218538981, + "grad_norm": 0.15975503623485565, + "learning_rate": 7.473912126710614e-06, + "loss": 1.7035, + "step": 27010 + }, + { + "epoch": 8.290669122160836, + "grad_norm": 0.16221746802330017, + "learning_rate": 7.471298121274489e-06, + "loss": 1.6707, + "step": 27011 + }, + { + "epoch": 8.29097605893186, + "grad_norm": 0.17168673872947693, + "learning_rate": 7.468684536131909e-06, + "loss": 1.7119, + "step": 27012 + }, + { + "epoch": 8.291282995702884, + "grad_norm": 0.15114913880825043, + "learning_rate": 7.466071371308742e-06, + "loss": 1.6867, + "step": 27013 + }, + { + "epoch": 8.29158993247391, + "grad_norm": 0.20300740003585815, + "learning_rate": 7.463458626830766e-06, + "loss": 1.7578, + "step": 27014 + }, + { + "epoch": 8.291896869244935, + "grad_norm": 0.1570715457201004, + "learning_rate": 7.460846302723845e-06, + "loss": 1.6588, + "step": 27015 + }, + { + "epoch": 8.29220380601596, + "grad_norm": 0.21273213624954224, + "learning_rate": 7.458234399013747e-06, + "loss": 1.7467, + "step": 27016 + }, + { + "epoch": 8.292510742786986, + "grad_norm": 0.16550743579864502, + "learning_rate": 7.455622915726324e-06, + "loss": 1.699, + 
"step": 27017 + }, + { + "epoch": 8.292817679558011, + "grad_norm": 0.20360049605369568, + "learning_rate": 7.453011852887387e-06, + "loss": 1.7572, + "step": 27018 + }, + { + "epoch": 8.293124616329036, + "grad_norm": 0.2043008953332901, + "learning_rate": 7.4504012105227004e-06, + "loss": 1.7181, + "step": 27019 + }, + { + "epoch": 8.293431553100062, + "grad_norm": 0.18581026792526245, + "learning_rate": 7.44779098865811e-06, + "loss": 1.742, + "step": 27020 + }, + { + "epoch": 8.293738489871087, + "grad_norm": 0.18011118471622467, + "learning_rate": 7.445181187319367e-06, + "loss": 1.7329, + "step": 27021 + }, + { + "epoch": 8.294045426642112, + "grad_norm": 0.18868795037269592, + "learning_rate": 7.442571806532295e-06, + "loss": 1.7289, + "step": 27022 + }, + { + "epoch": 8.294352363413138, + "grad_norm": 0.15835118293762207, + "learning_rate": 7.439962846322673e-06, + "loss": 1.6878, + "step": 27023 + }, + { + "epoch": 8.294659300184161, + "grad_norm": 0.23331916332244873, + "learning_rate": 7.437354306716282e-06, + "loss": 1.7144, + "step": 27024 + }, + { + "epoch": 8.294966236955187, + "grad_norm": 0.18101559579372406, + "learning_rate": 7.434746187738906e-06, + "loss": 1.7452, + "step": 27025 + }, + { + "epoch": 8.295273173726212, + "grad_norm": 0.16906292736530304, + "learning_rate": 7.432138489416318e-06, + "loss": 1.6772, + "step": 27026 + }, + { + "epoch": 8.295580110497237, + "grad_norm": 0.20603033900260925, + "learning_rate": 7.429531211774282e-06, + "loss": 1.7622, + "step": 27027 + }, + { + "epoch": 8.295887047268263, + "grad_norm": 0.19412389397621155, + "learning_rate": 7.426924354838571e-06, + "loss": 1.6973, + "step": 27028 + }, + { + "epoch": 8.296193984039288, + "grad_norm": 0.1702510118484497, + "learning_rate": 7.424317918634938e-06, + "loss": 1.7119, + "step": 27029 + }, + { + "epoch": 8.296500920810313, + "grad_norm": 0.1476033478975296, + "learning_rate": 7.421711903189171e-06, + "loss": 1.6961, + "step": 27030 + }, + { + "epoch": 
8.296807857581339, + "grad_norm": 0.16404536366462708, + "learning_rate": 7.419106308526979e-06, + "loss": 1.6928, + "step": 27031 + }, + { + "epoch": 8.297114794352364, + "grad_norm": 0.15021127462387085, + "learning_rate": 7.416501134674159e-06, + "loss": 1.642, + "step": 27032 + }, + { + "epoch": 8.29742173112339, + "grad_norm": 0.20728830993175507, + "learning_rate": 7.4138963816564266e-06, + "loss": 1.7142, + "step": 27033 + }, + { + "epoch": 8.297728667894415, + "grad_norm": 0.16802074015140533, + "learning_rate": 7.411292049499513e-06, + "loss": 1.6983, + "step": 27034 + }, + { + "epoch": 8.298035604665438, + "grad_norm": 0.15957842767238617, + "learning_rate": 7.408688138229198e-06, + "loss": 1.6535, + "step": 27035 + }, + { + "epoch": 8.298342541436464, + "grad_norm": 0.17618007957935333, + "learning_rate": 7.40608464787117e-06, + "loss": 1.7024, + "step": 27036 + }, + { + "epoch": 8.298649478207489, + "grad_norm": 0.14615842700004578, + "learning_rate": 7.4034815784511994e-06, + "loss": 1.7188, + "step": 27037 + }, + { + "epoch": 8.298956414978514, + "grad_norm": 0.16748850047588348, + "learning_rate": 7.40087892999497e-06, + "loss": 1.6763, + "step": 27038 + }, + { + "epoch": 8.29926335174954, + "grad_norm": 0.15271888673305511, + "learning_rate": 7.398276702528229e-06, + "loss": 1.6766, + "step": 27039 + }, + { + "epoch": 8.299570288520565, + "grad_norm": 0.21336700022220612, + "learning_rate": 7.395674896076693e-06, + "loss": 1.7113, + "step": 27040 + }, + { + "epoch": 8.29987722529159, + "grad_norm": 0.15377891063690186, + "learning_rate": 7.3930735106660655e-06, + "loss": 1.7083, + "step": 27041 + }, + { + "epoch": 8.300184162062616, + "grad_norm": 0.1341678500175476, + "learning_rate": 7.390472546322058e-06, + "loss": 1.6411, + "step": 27042 + }, + { + "epoch": 8.300491098833641, + "grad_norm": 0.1506323516368866, + "learning_rate": 7.3878720030703785e-06, + "loss": 1.6784, + "step": 27043 + }, + { + "epoch": 8.300798035604666, + "grad_norm": 
0.20630323886871338, + "learning_rate": 7.385271880936723e-06, + "loss": 1.7296, + "step": 27044 + }, + { + "epoch": 8.30110497237569, + "grad_norm": 0.1514928787946701, + "learning_rate": 7.382672179946787e-06, + "loss": 1.631, + "step": 27045 + }, + { + "epoch": 8.301411909146715, + "grad_norm": 0.21939171850681305, + "learning_rate": 7.3800729001262505e-06, + "loss": 1.7484, + "step": 27046 + }, + { + "epoch": 8.30171884591774, + "grad_norm": 0.13756778836250305, + "learning_rate": 7.377474041500837e-06, + "loss": 1.71, + "step": 27047 + }, + { + "epoch": 8.302025782688766, + "grad_norm": 0.23617541790008545, + "learning_rate": 7.374875604096188e-06, + "loss": 1.7366, + "step": 27048 + }, + { + "epoch": 8.302332719459791, + "grad_norm": 0.236005499958992, + "learning_rate": 7.37227758793802e-06, + "loss": 1.7263, + "step": 27049 + }, + { + "epoch": 8.302639656230816, + "grad_norm": 0.28162217140197754, + "learning_rate": 7.369679993051981e-06, + "loss": 1.7159, + "step": 27050 + }, + { + "epoch": 8.302946593001842, + "grad_norm": 0.18274159729480743, + "learning_rate": 7.3670828194637385e-06, + "loss": 1.695, + "step": 27051 + }, + { + "epoch": 8.303253529772867, + "grad_norm": 0.14628291130065918, + "learning_rate": 7.364486067198994e-06, + "loss": 1.712, + "step": 27052 + }, + { + "epoch": 8.303560466543892, + "grad_norm": 0.16443926095962524, + "learning_rate": 7.361889736283362e-06, + "loss": 1.7003, + "step": 27053 + }, + { + "epoch": 8.303867403314918, + "grad_norm": 0.24396912753582, + "learning_rate": 7.3592938267425525e-06, + "loss": 1.7882, + "step": 27054 + }, + { + "epoch": 8.304174340085943, + "grad_norm": 0.16564849019050598, + "learning_rate": 7.356698338602169e-06, + "loss": 1.7095, + "step": 27055 + }, + { + "epoch": 8.304481276856967, + "grad_norm": 0.17034487426280975, + "learning_rate": 7.3541032718879024e-06, + "loss": 1.7198, + "step": 27056 + }, + { + "epoch": 8.304788213627992, + "grad_norm": 0.15630117058753967, + "learning_rate": 
7.351508626625381e-06, + "loss": 1.6642, + "step": 27057 + }, + { + "epoch": 8.305095150399017, + "grad_norm": 0.17507393658161163, + "learning_rate": 7.348914402840246e-06, + "loss": 1.7295, + "step": 27058 + }, + { + "epoch": 8.305402087170043, + "grad_norm": 0.13145345449447632, + "learning_rate": 7.346320600558138e-06, + "loss": 1.6654, + "step": 27059 + }, + { + "epoch": 8.305709023941068, + "grad_norm": 0.17676126956939697, + "learning_rate": 7.343727219804692e-06, + "loss": 1.7347, + "step": 27060 + }, + { + "epoch": 8.306015960712093, + "grad_norm": 0.16341568529605865, + "learning_rate": 7.341134260605536e-06, + "loss": 1.6905, + "step": 27061 + }, + { + "epoch": 8.306322897483119, + "grad_norm": 0.18549038469791412, + "learning_rate": 7.338541722986292e-06, + "loss": 1.7508, + "step": 27062 + }, + { + "epoch": 8.306629834254144, + "grad_norm": 0.15528292953968048, + "learning_rate": 7.335949606972575e-06, + "loss": 1.7261, + "step": 27063 + }, + { + "epoch": 8.30693677102517, + "grad_norm": 0.14363928139209747, + "learning_rate": 7.333357912590028e-06, + "loss": 1.6494, + "step": 27064 + }, + { + "epoch": 8.307243707796195, + "grad_norm": 0.33007505536079407, + "learning_rate": 7.3307666398642285e-06, + "loss": 1.7844, + "step": 27065 + }, + { + "epoch": 8.307550644567218, + "grad_norm": 0.18550951778888702, + "learning_rate": 7.328175788820818e-06, + "loss": 1.7699, + "step": 27066 + }, + { + "epoch": 8.307857581338244, + "grad_norm": 0.1789010763168335, + "learning_rate": 7.325585359485382e-06, + "loss": 1.6903, + "step": 27067 + }, + { + "epoch": 8.308164518109269, + "grad_norm": 0.17079691588878632, + "learning_rate": 7.322995351883505e-06, + "loss": 1.6704, + "step": 27068 + }, + { + "epoch": 8.308471454880294, + "grad_norm": 0.17510086297988892, + "learning_rate": 7.320405766040828e-06, + "loss": 1.7222, + "step": 27069 + }, + { + "epoch": 8.30877839165132, + "grad_norm": 0.1619461178779602, + "learning_rate": 7.317816601982896e-06, + "loss": 
1.6573, + "step": 27070 + }, + { + "epoch": 8.309085328422345, + "grad_norm": 0.15886032581329346, + "learning_rate": 7.315227859735335e-06, + "loss": 1.7281, + "step": 27071 + }, + { + "epoch": 8.30939226519337, + "grad_norm": 0.1636921614408493, + "learning_rate": 7.31263953932369e-06, + "loss": 1.7061, + "step": 27072 + }, + { + "epoch": 8.309699201964396, + "grad_norm": 0.16119423508644104, + "learning_rate": 7.3100516407735745e-06, + "loss": 1.7102, + "step": 27073 + }, + { + "epoch": 8.310006138735421, + "grad_norm": 0.2373964637517929, + "learning_rate": 7.3074641641105445e-06, + "loss": 1.7585, + "step": 27074 + }, + { + "epoch": 8.310313075506446, + "grad_norm": 0.17123030126094818, + "learning_rate": 7.304877109360181e-06, + "loss": 1.737, + "step": 27075 + }, + { + "epoch": 8.310620012277472, + "grad_norm": 0.14955085515975952, + "learning_rate": 7.302290476548046e-06, + "loss": 1.6676, + "step": 27076 + }, + { + "epoch": 8.310926949048495, + "grad_norm": 0.19933636486530304, + "learning_rate": 7.299704265699703e-06, + "loss": 1.6926, + "step": 27077 + }, + { + "epoch": 8.31123388581952, + "grad_norm": 0.15449854731559753, + "learning_rate": 7.297118476840709e-06, + "loss": 1.6826, + "step": 27078 + }, + { + "epoch": 8.311540822590546, + "grad_norm": 0.16641317307949066, + "learning_rate": 7.294533109996621e-06, + "loss": 1.7117, + "step": 27079 + }, + { + "epoch": 8.311847759361571, + "grad_norm": 0.18311664462089539, + "learning_rate": 7.291948165192974e-06, + "loss": 1.7376, + "step": 27080 + }, + { + "epoch": 8.312154696132596, + "grad_norm": 0.17437715828418732, + "learning_rate": 7.289363642455349e-06, + "loss": 1.7373, + "step": 27081 + }, + { + "epoch": 8.312461632903622, + "grad_norm": 0.16356121003627777, + "learning_rate": 7.286779541809241e-06, + "loss": 1.6847, + "step": 27082 + }, + { + "epoch": 8.312768569674647, + "grad_norm": 0.182320237159729, + "learning_rate": 7.284195863280241e-06, + "loss": 1.6853, + "step": 27083 + }, + { + 
"epoch": 8.313075506445673, + "grad_norm": 0.1541421264410019, + "learning_rate": 7.281612606893839e-06, + "loss": 1.7121, + "step": 27084 + }, + { + "epoch": 8.313382443216698, + "grad_norm": 0.16640879213809967, + "learning_rate": 7.2790297726755716e-06, + "loss": 1.6914, + "step": 27085 + }, + { + "epoch": 8.313689379987723, + "grad_norm": 0.18245746195316315, + "learning_rate": 7.27644736065099e-06, + "loss": 1.7544, + "step": 27086 + }, + { + "epoch": 8.313996316758749, + "grad_norm": 0.13833735883235931, + "learning_rate": 7.273865370845573e-06, + "loss": 1.6519, + "step": 27087 + }, + { + "epoch": 8.314303253529772, + "grad_norm": 0.19455993175506592, + "learning_rate": 7.271283803284889e-06, + "loss": 1.7017, + "step": 27088 + }, + { + "epoch": 8.314610190300797, + "grad_norm": 0.16859467327594757, + "learning_rate": 7.268702657994397e-06, + "loss": 1.7173, + "step": 27089 + }, + { + "epoch": 8.314917127071823, + "grad_norm": 0.1667163074016571, + "learning_rate": 7.266121934999642e-06, + "loss": 1.731, + "step": 27090 + }, + { + "epoch": 8.315224063842848, + "grad_norm": 0.161153182387352, + "learning_rate": 7.263541634326115e-06, + "loss": 1.7223, + "step": 27091 + }, + { + "epoch": 8.315531000613873, + "grad_norm": 0.17027638852596283, + "learning_rate": 7.2609617559993234e-06, + "loss": 1.6741, + "step": 27092 + }, + { + "epoch": 8.315837937384899, + "grad_norm": 0.1516280472278595, + "learning_rate": 7.2583823000447526e-06, + "loss": 1.6974, + "step": 27093 + }, + { + "epoch": 8.316144874155924, + "grad_norm": 0.18429140746593475, + "learning_rate": 7.2558032664879035e-06, + "loss": 1.7003, + "step": 27094 + }, + { + "epoch": 8.31645181092695, + "grad_norm": 0.13946834206581116, + "learning_rate": 7.253224655354257e-06, + "loss": 1.7349, + "step": 27095 + }, + { + "epoch": 8.316758747697975, + "grad_norm": 0.17642852663993835, + "learning_rate": 7.250646466669303e-06, + "loss": 1.7131, + "step": 27096 + }, + { + "epoch": 8.317065684469, + "grad_norm": 
0.1700926125049591, + "learning_rate": 7.2480687004585155e-06, + "loss": 1.7496, + "step": 27097 + }, + { + "epoch": 8.317372621240025, + "grad_norm": 0.19472727179527283, + "learning_rate": 7.245491356747369e-06, + "loss": 1.73, + "step": 27098 + }, + { + "epoch": 8.317679558011049, + "grad_norm": 0.16857488453388214, + "learning_rate": 7.242914435561327e-06, + "loss": 1.7275, + "step": 27099 + }, + { + "epoch": 8.317986494782074, + "grad_norm": 0.18735560774803162, + "learning_rate": 7.240337936925884e-06, + "loss": 1.7236, + "step": 27100 + }, + { + "epoch": 8.3182934315531, + "grad_norm": 0.2252741903066635, + "learning_rate": 7.237761860866476e-06, + "loss": 1.7347, + "step": 27101 + }, + { + "epoch": 8.318600368324125, + "grad_norm": 0.16848546266555786, + "learning_rate": 7.2351862074085674e-06, + "loss": 1.6956, + "step": 27102 + }, + { + "epoch": 8.31890730509515, + "grad_norm": 0.13781076669692993, + "learning_rate": 7.232610976577614e-06, + "loss": 1.7018, + "step": 27103 + }, + { + "epoch": 8.319214241866176, + "grad_norm": 0.13122199475765228, + "learning_rate": 7.230036168399052e-06, + "loss": 1.652, + "step": 27104 + }, + { + "epoch": 8.319521178637201, + "grad_norm": 0.16110749542713165, + "learning_rate": 7.22746178289837e-06, + "loss": 1.6778, + "step": 27105 + }, + { + "epoch": 8.319828115408226, + "grad_norm": 0.19378480315208435, + "learning_rate": 7.224887820100951e-06, + "loss": 1.7753, + "step": 27106 + }, + { + "epoch": 8.320135052179252, + "grad_norm": 0.18464957177639008, + "learning_rate": 7.2223142800322775e-06, + "loss": 1.7455, + "step": 27107 + }, + { + "epoch": 8.320441988950277, + "grad_norm": 0.16992080211639404, + "learning_rate": 7.2197411627177636e-06, + "loss": 1.731, + "step": 27108 + }, + { + "epoch": 8.3207489257213, + "grad_norm": 0.16602276265621185, + "learning_rate": 7.2171684681828444e-06, + "loss": 1.7236, + "step": 27109 + }, + { + "epoch": 8.321055862492326, + "grad_norm": 0.16713769733905792, + "learning_rate": 
7.214596196452944e-06, + "loss": 1.6636, + "step": 27110 + }, + { + "epoch": 8.321362799263351, + "grad_norm": 0.14015473425388336, + "learning_rate": 7.212024347553475e-06, + "loss": 1.6785, + "step": 27111 + }, + { + "epoch": 8.321669736034377, + "grad_norm": 0.25452539324760437, + "learning_rate": 7.209452921509868e-06, + "loss": 1.7434, + "step": 27112 + }, + { + "epoch": 8.321976672805402, + "grad_norm": 0.14998821914196014, + "learning_rate": 7.206881918347524e-06, + "loss": 1.6973, + "step": 27113 + }, + { + "epoch": 8.322283609576427, + "grad_norm": 0.16751673817634583, + "learning_rate": 7.2043113380918515e-06, + "loss": 1.7364, + "step": 27114 + }, + { + "epoch": 8.322590546347453, + "grad_norm": 0.14287763833999634, + "learning_rate": 7.201741180768262e-06, + "loss": 1.6576, + "step": 27115 + }, + { + "epoch": 8.322897483118478, + "grad_norm": 0.14396314322948456, + "learning_rate": 7.199171446402136e-06, + "loss": 1.6541, + "step": 27116 + }, + { + "epoch": 8.323204419889503, + "grad_norm": 0.1835038661956787, + "learning_rate": 7.196602135018915e-06, + "loss": 1.6925, + "step": 27117 + }, + { + "epoch": 8.323511356660529, + "grad_norm": 0.15047648549079895, + "learning_rate": 7.194033246643939e-06, + "loss": 1.7234, + "step": 27118 + }, + { + "epoch": 8.323818293431554, + "grad_norm": 0.1479605883359909, + "learning_rate": 7.19146478130262e-06, + "loss": 1.6702, + "step": 27119 + }, + { + "epoch": 8.324125230202577, + "grad_norm": 0.15971851348876953, + "learning_rate": 7.188896739020335e-06, + "loss": 1.7189, + "step": 27120 + }, + { + "epoch": 8.324432166973603, + "grad_norm": 0.1598353087902069, + "learning_rate": 7.186329119822455e-06, + "loss": 1.7015, + "step": 27121 + }, + { + "epoch": 8.324739103744628, + "grad_norm": 0.18845009803771973, + "learning_rate": 7.183761923734389e-06, + "loss": 1.6771, + "step": 27122 + }, + { + "epoch": 8.325046040515653, + "grad_norm": 0.15288181602954865, + "learning_rate": 7.181195150781456e-06, + "loss": 1.69, 
+ "step": 27123 + }, + { + "epoch": 8.325352977286679, + "grad_norm": 0.16455978155136108, + "learning_rate": 7.178628800989073e-06, + "loss": 1.74, + "step": 27124 + }, + { + "epoch": 8.325659914057704, + "grad_norm": 0.23335149884223938, + "learning_rate": 7.176062874382561e-06, + "loss": 1.7591, + "step": 27125 + }, + { + "epoch": 8.32596685082873, + "grad_norm": 0.16988953948020935, + "learning_rate": 7.173497370987303e-06, + "loss": 1.744, + "step": 27126 + }, + { + "epoch": 8.326273787599755, + "grad_norm": 0.16113093495368958, + "learning_rate": 7.170932290828647e-06, + "loss": 1.6717, + "step": 27127 + }, + { + "epoch": 8.32658072437078, + "grad_norm": 0.16654139757156372, + "learning_rate": 7.168367633931938e-06, + "loss": 1.6797, + "step": 27128 + }, + { + "epoch": 8.326887661141805, + "grad_norm": 0.16671477258205414, + "learning_rate": 7.165803400322524e-06, + "loss": 1.7299, + "step": 27129 + }, + { + "epoch": 8.32719459791283, + "grad_norm": 0.18269041180610657, + "learning_rate": 7.16323959002575e-06, + "loss": 1.7371, + "step": 27130 + }, + { + "epoch": 8.327501534683854, + "grad_norm": 0.17919829487800598, + "learning_rate": 7.160676203066946e-06, + "loss": 1.7158, + "step": 27131 + }, + { + "epoch": 8.32780847145488, + "grad_norm": 0.17928342521190643, + "learning_rate": 7.158113239471453e-06, + "loss": 1.6964, + "step": 27132 + }, + { + "epoch": 8.328115408225905, + "grad_norm": 0.19797661900520325, + "learning_rate": 7.155550699264585e-06, + "loss": 1.7244, + "step": 27133 + }, + { + "epoch": 8.32842234499693, + "grad_norm": 0.15853050351142883, + "learning_rate": 7.1529885824716926e-06, + "loss": 1.6674, + "step": 27134 + }, + { + "epoch": 8.328729281767956, + "grad_norm": 0.20006918907165527, + "learning_rate": 7.150426889118078e-06, + "loss": 1.7601, + "step": 27135 + }, + { + "epoch": 8.329036218538981, + "grad_norm": 0.18851491808891296, + "learning_rate": 7.147865619229055e-06, + "loss": 1.7139, + "step": 27136 + }, + { + "epoch": 
8.329343155310006, + "grad_norm": 0.2384614497423172, + "learning_rate": 7.145304772829936e-06, + "loss": 1.7343, + "step": 27137 + }, + { + "epoch": 8.329650092081032, + "grad_norm": 0.15243887901306152, + "learning_rate": 7.142744349946029e-06, + "loss": 1.7071, + "step": 27138 + }, + { + "epoch": 8.329957028852057, + "grad_norm": 0.20257025957107544, + "learning_rate": 7.140184350602663e-06, + "loss": 1.7255, + "step": 27139 + }, + { + "epoch": 8.330263965623082, + "grad_norm": 0.18863585591316223, + "learning_rate": 7.137624774825091e-06, + "loss": 1.6798, + "step": 27140 + }, + { + "epoch": 8.330570902394108, + "grad_norm": 0.19403952360153198, + "learning_rate": 7.135065622638659e-06, + "loss": 1.7354, + "step": 27141 + }, + { + "epoch": 8.330877839165131, + "grad_norm": 0.17294439673423767, + "learning_rate": 7.132506894068608e-06, + "loss": 1.6935, + "step": 27142 + }, + { + "epoch": 8.331184775936157, + "grad_norm": 0.20410899817943573, + "learning_rate": 7.129948589140262e-06, + "loss": 1.7625, + "step": 27143 + }, + { + "epoch": 8.331491712707182, + "grad_norm": 0.1795405000448227, + "learning_rate": 7.127390707878889e-06, + "loss": 1.6756, + "step": 27144 + }, + { + "epoch": 8.331798649478207, + "grad_norm": 0.1823110431432724, + "learning_rate": 7.12483325030977e-06, + "loss": 1.6844, + "step": 27145 + }, + { + "epoch": 8.332105586249233, + "grad_norm": 0.18655838072299957, + "learning_rate": 7.122276216458179e-06, + "loss": 1.7289, + "step": 27146 + }, + { + "epoch": 8.332412523020258, + "grad_norm": 0.16892722249031067, + "learning_rate": 7.119719606349384e-06, + "loss": 1.7003, + "step": 27147 + }, + { + "epoch": 8.332719459791283, + "grad_norm": 0.17768113315105438, + "learning_rate": 7.117163420008654e-06, + "loss": 1.6859, + "step": 27148 + }, + { + "epoch": 8.333026396562309, + "grad_norm": 0.14221824705600739, + "learning_rate": 7.114607657461253e-06, + "loss": 1.6752, + "step": 27149 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 
0.17095401883125305, + "learning_rate": 7.112052318732421e-06, + "loss": 1.7354, + "step": 27150 + }, + { + "epoch": 8.33364027010436, + "grad_norm": 0.1910656839609146, + "learning_rate": 7.109497403847448e-06, + "loss": 1.7124, + "step": 27151 + }, + { + "epoch": 8.333947206875383, + "grad_norm": 0.1857171505689621, + "learning_rate": 7.106942912831549e-06, + "loss": 1.7716, + "step": 27152 + }, + { + "epoch": 8.334254143646408, + "grad_norm": 0.16951163113117218, + "learning_rate": 7.104388845709981e-06, + "loss": 1.7508, + "step": 27153 + }, + { + "epoch": 8.334561080417433, + "grad_norm": 0.18096883594989777, + "learning_rate": 7.101835202507983e-06, + "loss": 1.7064, + "step": 27154 + }, + { + "epoch": 8.334868017188459, + "grad_norm": 0.19499589502811432, + "learning_rate": 7.099281983250783e-06, + "loss": 1.712, + "step": 27155 + }, + { + "epoch": 8.335174953959484, + "grad_norm": 0.23200182616710663, + "learning_rate": 7.096729187963647e-06, + "loss": 1.8253, + "step": 27156 + }, + { + "epoch": 8.33548189073051, + "grad_norm": 0.3447387218475342, + "learning_rate": 7.094176816671755e-06, + "loss": 1.7531, + "step": 27157 + }, + { + "epoch": 8.335788827501535, + "grad_norm": 0.14633947610855103, + "learning_rate": 7.091624869400376e-06, + "loss": 1.6866, + "step": 27158 + }, + { + "epoch": 8.33609576427256, + "grad_norm": 0.19512905180454254, + "learning_rate": 7.0890733461746905e-06, + "loss": 1.6853, + "step": 27159 + }, + { + "epoch": 8.336402701043585, + "grad_norm": 0.20525458455085754, + "learning_rate": 7.086522247019944e-06, + "loss": 1.69, + "step": 27160 + }, + { + "epoch": 8.33670963781461, + "grad_norm": 0.15972889959812164, + "learning_rate": 7.08397157196134e-06, + "loss": 1.6949, + "step": 27161 + }, + { + "epoch": 8.337016574585636, + "grad_norm": 0.18894724547863007, + "learning_rate": 7.081421321024079e-06, + "loss": 1.7254, + "step": 27162 + }, + { + "epoch": 8.33732351135666, + "grad_norm": 0.17392434179782867, + "learning_rate": 
7.078871494233364e-06, + "loss": 1.7449, + "step": 27163 + }, + { + "epoch": 8.337630448127685, + "grad_norm": 0.16262824833393097, + "learning_rate": 7.076322091614401e-06, + "loss": 1.734, + "step": 27164 + }, + { + "epoch": 8.33793738489871, + "grad_norm": 0.1960107982158661, + "learning_rate": 7.073773113192383e-06, + "loss": 1.6464, + "step": 27165 + }, + { + "epoch": 8.338244321669736, + "grad_norm": 0.1750497817993164, + "learning_rate": 7.071224558992501e-06, + "loss": 1.7187, + "step": 27166 + }, + { + "epoch": 8.338551258440761, + "grad_norm": 0.2179764360189438, + "learning_rate": 7.068676429039928e-06, + "loss": 1.7207, + "step": 27167 + }, + { + "epoch": 8.338858195211786, + "grad_norm": 0.17758040130138397, + "learning_rate": 7.066128723359877e-06, + "loss": 1.7248, + "step": 27168 + }, + { + "epoch": 8.339165131982812, + "grad_norm": 0.16506128013134003, + "learning_rate": 7.063581441977496e-06, + "loss": 1.7788, + "step": 27169 + }, + { + "epoch": 8.339472068753837, + "grad_norm": 0.18444709479808807, + "learning_rate": 7.061034584917963e-06, + "loss": 1.6958, + "step": 27170 + }, + { + "epoch": 8.339779005524862, + "grad_norm": 0.19419504702091217, + "learning_rate": 7.0584881522064605e-06, + "loss": 1.7459, + "step": 27171 + }, + { + "epoch": 8.340085942295888, + "grad_norm": 0.19482584297657013, + "learning_rate": 7.055942143868133e-06, + "loss": 1.7043, + "step": 27172 + }, + { + "epoch": 8.340392879066913, + "grad_norm": 0.20925387740135193, + "learning_rate": 7.053396559928183e-06, + "loss": 1.7817, + "step": 27173 + }, + { + "epoch": 8.340699815837937, + "grad_norm": 0.2067698836326599, + "learning_rate": 7.050851400411712e-06, + "loss": 1.729, + "step": 27174 + }, + { + "epoch": 8.341006752608962, + "grad_norm": 0.1617327481508255, + "learning_rate": 7.048306665343923e-06, + "loss": 1.6888, + "step": 27175 + }, + { + "epoch": 8.341313689379987, + "grad_norm": 0.16514994204044342, + "learning_rate": 7.045762354749924e-06, + "loss": 1.7152, + 
"step": 27176 + }, + { + "epoch": 8.341620626151013, + "grad_norm": 0.17930150032043457, + "learning_rate": 7.043218468654889e-06, + "loss": 1.8112, + "step": 27177 + }, + { + "epoch": 8.341927562922038, + "grad_norm": 0.17400570213794708, + "learning_rate": 7.040675007083941e-06, + "loss": 1.7071, + "step": 27178 + }, + { + "epoch": 8.342234499693063, + "grad_norm": 0.18226927518844604, + "learning_rate": 7.038131970062228e-06, + "loss": 1.7786, + "step": 27179 + }, + { + "epoch": 8.342541436464089, + "grad_norm": 0.15586300194263458, + "learning_rate": 7.035589357614869e-06, + "loss": 1.7414, + "step": 27180 + }, + { + "epoch": 8.342848373235114, + "grad_norm": 0.18447721004486084, + "learning_rate": 7.033047169767004e-06, + "loss": 1.7123, + "step": 27181 + }, + { + "epoch": 8.34315531000614, + "grad_norm": 0.16714699566364288, + "learning_rate": 7.030505406543747e-06, + "loss": 1.728, + "step": 27182 + }, + { + "epoch": 8.343462246777165, + "grad_norm": 0.15295952558517456, + "learning_rate": 7.027964067970228e-06, + "loss": 1.6926, + "step": 27183 + }, + { + "epoch": 8.34376918354819, + "grad_norm": 0.14499974250793457, + "learning_rate": 7.025423154071537e-06, + "loss": 1.6841, + "step": 27184 + }, + { + "epoch": 8.344076120319214, + "grad_norm": 0.15066829323768616, + "learning_rate": 7.022882664872827e-06, + "loss": 1.6593, + "step": 27185 + }, + { + "epoch": 8.344383057090239, + "grad_norm": 0.17318779230117798, + "learning_rate": 7.020342600399166e-06, + "loss": 1.698, + "step": 27186 + }, + { + "epoch": 8.344689993861264, + "grad_norm": 0.19946762919425964, + "learning_rate": 7.017802960675674e-06, + "loss": 1.7257, + "step": 27187 + }, + { + "epoch": 8.34499693063229, + "grad_norm": 0.17052631080150604, + "learning_rate": 7.015263745727441e-06, + "loss": 1.7299, + "step": 27188 + }, + { + "epoch": 8.345303867403315, + "grad_norm": 0.16269686818122864, + "learning_rate": 7.012724955579558e-06, + "loss": 1.7385, + "step": 27189 + }, + { + "epoch": 
8.34561080417434, + "grad_norm": 0.19195757806301117, + "learning_rate": 7.010186590257145e-06, + "loss": 1.7264, + "step": 27190 + }, + { + "epoch": 8.345917740945366, + "grad_norm": 0.14985592663288116, + "learning_rate": 7.007648649785248e-06, + "loss": 1.7135, + "step": 27191 + }, + { + "epoch": 8.34622467771639, + "grad_norm": 0.16438701748847961, + "learning_rate": 7.00511113418898e-06, + "loss": 1.6876, + "step": 27192 + }, + { + "epoch": 8.346531614487416, + "grad_norm": 0.241184800863266, + "learning_rate": 7.002574043493387e-06, + "loss": 1.8587, + "step": 27193 + }, + { + "epoch": 8.346838551258442, + "grad_norm": 0.17353931069374084, + "learning_rate": 7.000037377723567e-06, + "loss": 1.7465, + "step": 27194 + }, + { + "epoch": 8.347145488029465, + "grad_norm": 0.1923576444387436, + "learning_rate": 6.997501136904583e-06, + "loss": 1.7859, + "step": 27195 + }, + { + "epoch": 8.34745242480049, + "grad_norm": 0.1997295618057251, + "learning_rate": 6.994965321061492e-06, + "loss": 1.7612, + "step": 27196 + }, + { + "epoch": 8.347759361571516, + "grad_norm": 0.184821218252182, + "learning_rate": 6.992429930219363e-06, + "loss": 1.6761, + "step": 27197 + }, + { + "epoch": 8.348066298342541, + "grad_norm": 0.14091727137565613, + "learning_rate": 6.989894964403248e-06, + "loss": 1.6541, + "step": 27198 + }, + { + "epoch": 8.348373235113566, + "grad_norm": 0.13829854130744934, + "learning_rate": 6.987360423638206e-06, + "loss": 1.6814, + "step": 27199 + }, + { + "epoch": 8.348680171884592, + "grad_norm": 0.12685348093509674, + "learning_rate": 6.984826307949272e-06, + "loss": 1.6498, + "step": 27200 + }, + { + "epoch": 8.348987108655617, + "grad_norm": 0.17062726616859436, + "learning_rate": 6.9822926173614856e-06, + "loss": 1.7138, + "step": 27201 + }, + { + "epoch": 8.349294045426642, + "grad_norm": 0.15178726613521576, + "learning_rate": 6.979759351899923e-06, + "loss": 1.756, + "step": 27202 + }, + { + "epoch": 8.349600982197668, + "grad_norm": 
0.1897916942834854, + "learning_rate": 6.97722651158958e-06, + "loss": 1.7317, + "step": 27203 + }, + { + "epoch": 8.349907918968693, + "grad_norm": 0.13750115036964417, + "learning_rate": 6.974694096455503e-06, + "loss": 1.6853, + "step": 27204 + }, + { + "epoch": 8.350214855739718, + "grad_norm": 0.17380347847938538, + "learning_rate": 6.972162106522717e-06, + "loss": 1.728, + "step": 27205 + }, + { + "epoch": 8.350521792510742, + "grad_norm": 0.1593543291091919, + "learning_rate": 6.96963054181623e-06, + "loss": 1.6904, + "step": 27206 + }, + { + "epoch": 8.350828729281767, + "grad_norm": 0.1569581925868988, + "learning_rate": 6.967099402361099e-06, + "loss": 1.6995, + "step": 27207 + }, + { + "epoch": 8.351135666052793, + "grad_norm": 0.180283784866333, + "learning_rate": 6.9645686881822935e-06, + "loss": 1.6755, + "step": 27208 + }, + { + "epoch": 8.351442602823818, + "grad_norm": 0.2145276516675949, + "learning_rate": 6.9620383993048654e-06, + "loss": 1.7705, + "step": 27209 + }, + { + "epoch": 8.351749539594843, + "grad_norm": 0.15903061628341675, + "learning_rate": 6.959508535753772e-06, + "loss": 1.702, + "step": 27210 + }, + { + "epoch": 8.352056476365869, + "grad_norm": 0.16429775953292847, + "learning_rate": 6.9569790975540565e-06, + "loss": 1.6656, + "step": 27211 + }, + { + "epoch": 8.352363413136894, + "grad_norm": 0.1546638011932373, + "learning_rate": 6.954450084730707e-06, + "loss": 1.681, + "step": 27212 + }, + { + "epoch": 8.35267034990792, + "grad_norm": 0.17022907733917236, + "learning_rate": 6.951921497308705e-06, + "loss": 1.7094, + "step": 27213 + }, + { + "epoch": 8.352977286678945, + "grad_norm": 0.18317057192325592, + "learning_rate": 6.949393335313048e-06, + "loss": 1.7395, + "step": 27214 + }, + { + "epoch": 8.35328422344997, + "grad_norm": 0.1707061231136322, + "learning_rate": 6.94686559876872e-06, + "loss": 1.6918, + "step": 27215 + }, + { + "epoch": 8.353591160220994, + "grad_norm": 0.171799436211586, + "learning_rate": 
6.944338287700697e-06, + "loss": 1.7173, + "step": 27216 + }, + { + "epoch": 8.353898096992019, + "grad_norm": 0.14982536435127258, + "learning_rate": 6.941811402133963e-06, + "loss": 1.7244, + "step": 27217 + }, + { + "epoch": 8.354205033763044, + "grad_norm": 0.1584668904542923, + "learning_rate": 6.939284942093471e-06, + "loss": 1.7023, + "step": 27218 + }, + { + "epoch": 8.35451197053407, + "grad_norm": 0.18367518484592438, + "learning_rate": 6.93675890760423e-06, + "loss": 1.6977, + "step": 27219 + }, + { + "epoch": 8.354818907305095, + "grad_norm": 0.2665458619594574, + "learning_rate": 6.934233298691167e-06, + "loss": 1.7711, + "step": 27220 + }, + { + "epoch": 8.35512584407612, + "grad_norm": 0.1657658815383911, + "learning_rate": 6.931708115379249e-06, + "loss": 1.6957, + "step": 27221 + }, + { + "epoch": 8.355432780847146, + "grad_norm": 0.17687681317329407, + "learning_rate": 6.929183357693436e-06, + "loss": 1.7163, + "step": 27222 + }, + { + "epoch": 8.355739717618171, + "grad_norm": 0.1775265783071518, + "learning_rate": 6.926659025658666e-06, + "loss": 1.7595, + "step": 27223 + }, + { + "epoch": 8.356046654389196, + "grad_norm": 0.1962285041809082, + "learning_rate": 6.924135119299919e-06, + "loss": 1.7852, + "step": 27224 + }, + { + "epoch": 8.356353591160222, + "grad_norm": 0.17352642118930817, + "learning_rate": 6.921611638642095e-06, + "loss": 1.748, + "step": 27225 + }, + { + "epoch": 8.356660527931247, + "grad_norm": 0.19602125883102417, + "learning_rate": 6.919088583710176e-06, + "loss": 1.685, + "step": 27226 + }, + { + "epoch": 8.35696746470227, + "grad_norm": 0.15199948847293854, + "learning_rate": 6.9165659545290525e-06, + "loss": 1.6641, + "step": 27227 + }, + { + "epoch": 8.357274401473296, + "grad_norm": 0.15671736001968384, + "learning_rate": 6.914043751123683e-06, + "loss": 1.6915, + "step": 27228 + }, + { + "epoch": 8.357581338244321, + "grad_norm": 0.19513672590255737, + "learning_rate": 6.911521973518992e-06, + "loss": 1.7526, + 
"step": 27229 + }, + { + "epoch": 8.357888275015346, + "grad_norm": 0.15108506381511688, + "learning_rate": 6.9090006217398975e-06, + "loss": 1.7167, + "step": 27230 + }, + { + "epoch": 8.358195211786372, + "grad_norm": 0.19638952612876892, + "learning_rate": 6.906479695811307e-06, + "loss": 1.6937, + "step": 27231 + }, + { + "epoch": 8.358502148557397, + "grad_norm": 0.14345301687717438, + "learning_rate": 6.903959195758148e-06, + "loss": 1.7295, + "step": 27232 + }, + { + "epoch": 8.358809085328422, + "grad_norm": 0.1557627171278, + "learning_rate": 6.901439121605324e-06, + "loss": 1.7146, + "step": 27233 + }, + { + "epoch": 8.359116022099448, + "grad_norm": 0.15030202269554138, + "learning_rate": 6.898919473377741e-06, + "loss": 1.6974, + "step": 27234 + }, + { + "epoch": 8.359422958870473, + "grad_norm": 0.24213968217372894, + "learning_rate": 6.896400251100283e-06, + "loss": 1.8179, + "step": 27235 + }, + { + "epoch": 8.359729895641498, + "grad_norm": 0.1646348387002945, + "learning_rate": 6.893881454797885e-06, + "loss": 1.7001, + "step": 27236 + }, + { + "epoch": 8.360036832412524, + "grad_norm": 0.18399927020072937, + "learning_rate": 6.891363084495406e-06, + "loss": 1.746, + "step": 27237 + }, + { + "epoch": 8.360343769183547, + "grad_norm": 0.19470340013504028, + "learning_rate": 6.8888451402177365e-06, + "loss": 1.7442, + "step": 27238 + }, + { + "epoch": 8.360650705954573, + "grad_norm": 0.1420234590768814, + "learning_rate": 6.886327621989775e-06, + "loss": 1.6481, + "step": 27239 + }, + { + "epoch": 8.360957642725598, + "grad_norm": 0.1827881634235382, + "learning_rate": 6.883810529836382e-06, + "loss": 1.6842, + "step": 27240 + }, + { + "epoch": 8.361264579496623, + "grad_norm": 0.19096913933753967, + "learning_rate": 6.881293863782468e-06, + "loss": 1.7061, + "step": 27241 + }, + { + "epoch": 8.361571516267649, + "grad_norm": 0.1871458888053894, + "learning_rate": 6.878777623852855e-06, + "loss": 1.7607, + "step": 27242 + }, + { + "epoch": 
8.361878453038674, + "grad_norm": 0.13643455505371094, + "learning_rate": 6.876261810072459e-06, + "loss": 1.6747, + "step": 27243 + }, + { + "epoch": 8.3621853898097, + "grad_norm": 0.16990543901920319, + "learning_rate": 6.8737464224660985e-06, + "loss": 1.7318, + "step": 27244 + }, + { + "epoch": 8.362492326580725, + "grad_norm": 0.16357167065143585, + "learning_rate": 6.871231461058658e-06, + "loss": 1.6609, + "step": 27245 + }, + { + "epoch": 8.36279926335175, + "grad_norm": 0.20114652812480927, + "learning_rate": 6.868716925874996e-06, + "loss": 1.7647, + "step": 27246 + }, + { + "epoch": 8.363106200122775, + "grad_norm": 0.18387655913829803, + "learning_rate": 6.866202816939949e-06, + "loss": 1.7213, + "step": 27247 + }, + { + "epoch": 8.3634131368938, + "grad_norm": 0.18712659180164337, + "learning_rate": 6.863689134278367e-06, + "loss": 1.7144, + "step": 27248 + }, + { + "epoch": 8.363720073664824, + "grad_norm": 0.19831795990467072, + "learning_rate": 6.861175877915088e-06, + "loss": 1.7396, + "step": 27249 + }, + { + "epoch": 8.36402701043585, + "grad_norm": 0.2181798815727234, + "learning_rate": 6.858663047874958e-06, + "loss": 1.7523, + "step": 27250 + }, + { + "epoch": 8.364333947206875, + "grad_norm": 0.17912371456623077, + "learning_rate": 6.856150644182807e-06, + "loss": 1.7617, + "step": 27251 + }, + { + "epoch": 8.3646408839779, + "grad_norm": 0.16200366616249084, + "learning_rate": 6.85363866686346e-06, + "loss": 1.6886, + "step": 27252 + }, + { + "epoch": 8.364947820748926, + "grad_norm": 0.18456755578517914, + "learning_rate": 6.851127115941747e-06, + "loss": 1.6873, + "step": 27253 + }, + { + "epoch": 8.365254757519951, + "grad_norm": 0.1649440973997116, + "learning_rate": 6.848615991442487e-06, + "loss": 1.7024, + "step": 27254 + }, + { + "epoch": 8.365561694290976, + "grad_norm": 0.17722025513648987, + "learning_rate": 6.846105293390492e-06, + "loss": 1.7401, + "step": 27255 + }, + { + "epoch": 8.365868631062002, + "grad_norm": 
0.18342679738998413, + "learning_rate": 6.843595021810578e-06, + "loss": 1.7285, + "step": 27256 + }, + { + "epoch": 8.366175567833027, + "grad_norm": 0.13590754568576813, + "learning_rate": 6.841085176727557e-06, + "loss": 1.6704, + "step": 27257 + }, + { + "epoch": 8.366482504604052, + "grad_norm": 0.16721662878990173, + "learning_rate": 6.838575758166221e-06, + "loss": 1.7371, + "step": 27258 + }, + { + "epoch": 8.366789441375076, + "grad_norm": 0.15011465549468994, + "learning_rate": 6.836066766151372e-06, + "loss": 1.668, + "step": 27259 + }, + { + "epoch": 8.367096378146101, + "grad_norm": 0.15394380688667297, + "learning_rate": 6.833558200707835e-06, + "loss": 1.7402, + "step": 27260 + }, + { + "epoch": 8.367403314917127, + "grad_norm": 0.2134244441986084, + "learning_rate": 6.83105006186035e-06, + "loss": 1.6979, + "step": 27261 + }, + { + "epoch": 8.367710251688152, + "grad_norm": 0.2169496864080429, + "learning_rate": 6.8285423496337375e-06, + "loss": 1.7821, + "step": 27262 + }, + { + "epoch": 8.368017188459177, + "grad_norm": 0.16033586859703064, + "learning_rate": 6.8260350640527774e-06, + "loss": 1.6976, + "step": 27263 + }, + { + "epoch": 8.368324125230203, + "grad_norm": 0.2089877724647522, + "learning_rate": 6.823528205142244e-06, + "loss": 1.7532, + "step": 27264 + }, + { + "epoch": 8.368631062001228, + "grad_norm": 0.12897463142871857, + "learning_rate": 6.821021772926911e-06, + "loss": 1.6445, + "step": 27265 + }, + { + "epoch": 8.368937998772253, + "grad_norm": 0.18726956844329834, + "learning_rate": 6.818515767431549e-06, + "loss": 1.7296, + "step": 27266 + }, + { + "epoch": 8.369244935543279, + "grad_norm": 0.1857292354106903, + "learning_rate": 6.816010188680927e-06, + "loss": 1.7747, + "step": 27267 + }, + { + "epoch": 8.369551872314304, + "grad_norm": 0.24680334329605103, + "learning_rate": 6.813505036699802e-06, + "loss": 1.7877, + "step": 27268 + }, + { + "epoch": 8.36985880908533, + "grad_norm": 0.1404808908700943, + "learning_rate": 
6.811000311512927e-06, + "loss": 1.6769, + "step": 27269 + }, + { + "epoch": 8.370165745856353, + "grad_norm": 0.18543009459972382, + "learning_rate": 6.808496013145066e-06, + "loss": 1.7325, + "step": 27270 + }, + { + "epoch": 8.370472682627378, + "grad_norm": 0.13881617784500122, + "learning_rate": 6.805992141620959e-06, + "loss": 1.7022, + "step": 27271 + }, + { + "epoch": 8.370779619398403, + "grad_norm": 0.18534715473651886, + "learning_rate": 6.80348869696536e-06, + "loss": 1.7609, + "step": 27272 + }, + { + "epoch": 8.371086556169429, + "grad_norm": 0.20225360989570618, + "learning_rate": 6.800985679202998e-06, + "loss": 1.7159, + "step": 27273 + }, + { + "epoch": 8.371393492940454, + "grad_norm": 0.1462840884923935, + "learning_rate": 6.79848308835862e-06, + "loss": 1.6607, + "step": 27274 + }, + { + "epoch": 8.37170042971148, + "grad_norm": 0.17453989386558533, + "learning_rate": 6.795980924456952e-06, + "loss": 1.7705, + "step": 27275 + }, + { + "epoch": 8.372007366482505, + "grad_norm": 0.15709565579891205, + "learning_rate": 6.793479187522711e-06, + "loss": 1.6961, + "step": 27276 + }, + { + "epoch": 8.37231430325353, + "grad_norm": 0.14979243278503418, + "learning_rate": 6.790977877580656e-06, + "loss": 1.6817, + "step": 27277 + }, + { + "epoch": 8.372621240024555, + "grad_norm": 0.16452275216579437, + "learning_rate": 6.7884769946554575e-06, + "loss": 1.693, + "step": 27278 + }, + { + "epoch": 8.37292817679558, + "grad_norm": 0.18353265523910522, + "learning_rate": 6.785976538771882e-06, + "loss": 1.7003, + "step": 27279 + }, + { + "epoch": 8.373235113566606, + "grad_norm": 0.15123683214187622, + "learning_rate": 6.783476509954595e-06, + "loss": 1.6611, + "step": 27280 + }, + { + "epoch": 8.37354205033763, + "grad_norm": 0.19939517974853516, + "learning_rate": 6.780976908228332e-06, + "loss": 1.7969, + "step": 27281 + }, + { + "epoch": 8.373848987108655, + "grad_norm": 0.2997080981731415, + "learning_rate": 6.778477733617783e-06, + "loss": 1.7822, + 
"step": 27282 + }, + { + "epoch": 8.37415592387968, + "grad_norm": 0.13474299013614655, + "learning_rate": 6.775978986147657e-06, + "loss": 1.7155, + "step": 27283 + }, + { + "epoch": 8.374462860650706, + "grad_norm": 0.15992368757724762, + "learning_rate": 6.773480665842635e-06, + "loss": 1.6985, + "step": 27284 + }, + { + "epoch": 8.374769797421731, + "grad_norm": 0.15250587463378906, + "learning_rate": 6.770982772727413e-06, + "loss": 1.7007, + "step": 27285 + }, + { + "epoch": 8.375076734192756, + "grad_norm": 0.1373993456363678, + "learning_rate": 6.768485306826683e-06, + "loss": 1.6852, + "step": 27286 + }, + { + "epoch": 8.375383670963782, + "grad_norm": 0.15772612392902374, + "learning_rate": 6.765988268165113e-06, + "loss": 1.6881, + "step": 27287 + }, + { + "epoch": 8.375690607734807, + "grad_norm": 0.13689690828323364, + "learning_rate": 6.76349165676739e-06, + "loss": 1.6747, + "step": 27288 + }, + { + "epoch": 8.375997544505832, + "grad_norm": 0.18657375872135162, + "learning_rate": 6.7609954726581825e-06, + "loss": 1.7324, + "step": 27289 + }, + { + "epoch": 8.376304481276858, + "grad_norm": 0.16617898643016815, + "learning_rate": 6.758499715862166e-06, + "loss": 1.6832, + "step": 27290 + }, + { + "epoch": 8.376611418047883, + "grad_norm": 0.16960306465625763, + "learning_rate": 6.756004386403996e-06, + "loss": 1.7353, + "step": 27291 + }, + { + "epoch": 8.376918354818907, + "grad_norm": 0.17030803859233856, + "learning_rate": 6.753509484308334e-06, + "loss": 1.7079, + "step": 27292 + }, + { + "epoch": 8.377225291589932, + "grad_norm": 0.16151085495948792, + "learning_rate": 6.751015009599831e-06, + "loss": 1.6706, + "step": 27293 + }, + { + "epoch": 8.377532228360957, + "grad_norm": 0.1715710461139679, + "learning_rate": 6.748520962303173e-06, + "loss": 1.7116, + "step": 27294 + }, + { + "epoch": 8.377839165131983, + "grad_norm": 0.20747625827789307, + "learning_rate": 6.746027342442951e-06, + "loss": 1.731, + "step": 27295 + }, + { + "epoch": 
8.378146101903008, + "grad_norm": 0.1645912081003189, + "learning_rate": 6.743534150043867e-06, + "loss": 1.7076, + "step": 27296 + }, + { + "epoch": 8.378453038674033, + "grad_norm": 0.16044393181800842, + "learning_rate": 6.741041385130509e-06, + "loss": 1.7105, + "step": 27297 + }, + { + "epoch": 8.378759975445059, + "grad_norm": 0.18224483728408813, + "learning_rate": 6.738549047727543e-06, + "loss": 1.7258, + "step": 27298 + }, + { + "epoch": 8.379066912216084, + "grad_norm": 0.17351657152175903, + "learning_rate": 6.7360571378595915e-06, + "loss": 1.7369, + "step": 27299 + }, + { + "epoch": 8.37937384898711, + "grad_norm": 0.18293599784374237, + "learning_rate": 6.733565655551283e-06, + "loss": 1.7151, + "step": 27300 + }, + { + "epoch": 8.379680785758135, + "grad_norm": 0.1593983918428421, + "learning_rate": 6.731074600827242e-06, + "loss": 1.6544, + "step": 27301 + }, + { + "epoch": 8.379987722529158, + "grad_norm": 0.16315947473049164, + "learning_rate": 6.728583973712077e-06, + "loss": 1.7442, + "step": 27302 + }, + { + "epoch": 8.380294659300183, + "grad_norm": 0.13841219246387482, + "learning_rate": 6.726093774230408e-06, + "loss": 1.6639, + "step": 27303 + }, + { + "epoch": 8.380601596071209, + "grad_norm": 0.14162768423557281, + "learning_rate": 6.723604002406847e-06, + "loss": 1.6713, + "step": 27304 + }, + { + "epoch": 8.380908532842234, + "grad_norm": 0.1737380474805832, + "learning_rate": 6.721114658265992e-06, + "loss": 1.7197, + "step": 27305 + }, + { + "epoch": 8.38121546961326, + "grad_norm": 0.15531061589717865, + "learning_rate": 6.718625741832452e-06, + "loss": 1.7337, + "step": 27306 + }, + { + "epoch": 8.381522406384285, + "grad_norm": 0.1833781898021698, + "learning_rate": 6.716137253130816e-06, + "loss": 1.7838, + "step": 27307 + }, + { + "epoch": 8.38182934315531, + "grad_norm": 0.23010820150375366, + "learning_rate": 6.713649192185683e-06, + "loss": 1.7023, + "step": 27308 + }, + { + "epoch": 8.382136279926335, + "grad_norm": 
0.14409376680850983, + "learning_rate": 6.7111615590216445e-06, + "loss": 1.6968, + "step": 27309 + }, + { + "epoch": 8.38244321669736, + "grad_norm": 0.19448643922805786, + "learning_rate": 6.7086743536632635e-06, + "loss": 1.7117, + "step": 27310 + }, + { + "epoch": 8.382750153468386, + "grad_norm": 0.18580564856529236, + "learning_rate": 6.706187576135159e-06, + "loss": 1.8183, + "step": 27311 + }, + { + "epoch": 8.383057090239411, + "grad_norm": 0.20270103216171265, + "learning_rate": 6.7037012264618675e-06, + "loss": 1.7666, + "step": 27312 + }, + { + "epoch": 8.383364027010435, + "grad_norm": 0.16575069725513458, + "learning_rate": 6.7012153046679904e-06, + "loss": 1.7542, + "step": 27313 + }, + { + "epoch": 8.38367096378146, + "grad_norm": 0.16375242173671722, + "learning_rate": 6.698729810778065e-06, + "loss": 1.7117, + "step": 27314 + }, + { + "epoch": 8.383977900552486, + "grad_norm": 0.2082248479127884, + "learning_rate": 6.696244744816682e-06, + "loss": 1.7687, + "step": 27315 + }, + { + "epoch": 8.384284837323511, + "grad_norm": 0.1562620848417282, + "learning_rate": 6.693760106808389e-06, + "loss": 1.6782, + "step": 27316 + }, + { + "epoch": 8.384591774094536, + "grad_norm": 0.1883714199066162, + "learning_rate": 6.6912758967777435e-06, + "loss": 1.7023, + "step": 27317 + }, + { + "epoch": 8.384898710865562, + "grad_norm": 0.17445886135101318, + "learning_rate": 6.688792114749292e-06, + "loss": 1.7019, + "step": 27318 + }, + { + "epoch": 8.385205647636587, + "grad_norm": 0.20479950308799744, + "learning_rate": 6.686308760747584e-06, + "loss": 1.7514, + "step": 27319 + }, + { + "epoch": 8.385512584407612, + "grad_norm": 0.21790143847465515, + "learning_rate": 6.683825834797153e-06, + "loss": 1.7243, + "step": 27320 + }, + { + "epoch": 8.385819521178638, + "grad_norm": 0.1784016340970993, + "learning_rate": 6.681343336922552e-06, + "loss": 1.7301, + "step": 27321 + }, + { + "epoch": 8.386126457949663, + "grad_norm": 0.22286179661750793, + 
"learning_rate": 6.678861267148301e-06, + "loss": 1.7231, + "step": 27322 + }, + { + "epoch": 8.386433394720688, + "grad_norm": 0.17854957282543182, + "learning_rate": 6.676379625498935e-06, + "loss": 1.7216, + "step": 27323 + }, + { + "epoch": 8.386740331491712, + "grad_norm": 0.1750447154045105, + "learning_rate": 6.67389841199898e-06, + "loss": 1.7603, + "step": 27324 + }, + { + "epoch": 8.387047268262737, + "grad_norm": 0.17893844842910767, + "learning_rate": 6.6714176266729545e-06, + "loss": 1.7229, + "step": 27325 + }, + { + "epoch": 8.387354205033763, + "grad_norm": 0.18705782294273376, + "learning_rate": 6.6689372695453725e-06, + "loss": 1.7021, + "step": 27326 + }, + { + "epoch": 8.387661141804788, + "grad_norm": 0.18719066679477692, + "learning_rate": 6.666457340640742e-06, + "loss": 1.7216, + "step": 27327 + }, + { + "epoch": 8.387968078575813, + "grad_norm": 0.16408847272396088, + "learning_rate": 6.663977839983604e-06, + "loss": 1.6937, + "step": 27328 + }, + { + "epoch": 8.388275015346839, + "grad_norm": 0.1739223599433899, + "learning_rate": 6.661498767598407e-06, + "loss": 1.6533, + "step": 27329 + }, + { + "epoch": 8.388581952117864, + "grad_norm": 0.19943352043628693, + "learning_rate": 6.6590201235097075e-06, + "loss": 1.753, + "step": 27330 + }, + { + "epoch": 8.38888888888889, + "grad_norm": 0.1412268429994583, + "learning_rate": 6.656541907741954e-06, + "loss": 1.6669, + "step": 27331 + }, + { + "epoch": 8.389195825659915, + "grad_norm": 0.17952445149421692, + "learning_rate": 6.654064120319664e-06, + "loss": 1.6921, + "step": 27332 + }, + { + "epoch": 8.38950276243094, + "grad_norm": 0.22117477655410767, + "learning_rate": 6.65158676126732e-06, + "loss": 1.7677, + "step": 27333 + }, + { + "epoch": 8.389809699201965, + "grad_norm": 0.1926339566707611, + "learning_rate": 6.649109830609401e-06, + "loss": 1.7237, + "step": 27334 + }, + { + "epoch": 8.390116635972989, + "grad_norm": 0.3306657671928406, + "learning_rate": 6.646633328370394e-06, + 
"loss": 1.7735, + "step": 27335 + }, + { + "epoch": 8.390423572744014, + "grad_norm": 0.14908578991889954, + "learning_rate": 6.644157254574762e-06, + "loss": 1.7109, + "step": 27336 + }, + { + "epoch": 8.39073050951504, + "grad_norm": 0.20824603736400604, + "learning_rate": 6.64168160924698e-06, + "loss": 1.7177, + "step": 27337 + }, + { + "epoch": 8.391037446286065, + "grad_norm": 0.22669748961925507, + "learning_rate": 6.6392063924115125e-06, + "loss": 1.7842, + "step": 27338 + }, + { + "epoch": 8.39134438305709, + "grad_norm": 0.16690780222415924, + "learning_rate": 6.6367316040928215e-06, + "loss": 1.739, + "step": 27339 + }, + { + "epoch": 8.391651319828116, + "grad_norm": 0.17900501191616058, + "learning_rate": 6.634257244315367e-06, + "loss": 1.705, + "step": 27340 + }, + { + "epoch": 8.39195825659914, + "grad_norm": 0.18606948852539062, + "learning_rate": 6.631783313103595e-06, + "loss": 1.7324, + "step": 27341 + }, + { + "epoch": 8.392265193370166, + "grad_norm": 0.15370480716228485, + "learning_rate": 6.629309810481965e-06, + "loss": 1.6834, + "step": 27342 + }, + { + "epoch": 8.392572130141192, + "grad_norm": 0.13654825091362, + "learning_rate": 6.626836736474917e-06, + "loss": 1.6729, + "step": 27343 + }, + { + "epoch": 8.392879066912217, + "grad_norm": 0.21128645539283752, + "learning_rate": 6.624364091106877e-06, + "loss": 1.7494, + "step": 27344 + }, + { + "epoch": 8.39318600368324, + "grad_norm": 0.1608622819185257, + "learning_rate": 6.621891874402314e-06, + "loss": 1.6951, + "step": 27345 + }, + { + "epoch": 8.393492940454266, + "grad_norm": 0.20148086547851562, + "learning_rate": 6.619420086385619e-06, + "loss": 1.7616, + "step": 27346 + }, + { + "epoch": 8.393799877225291, + "grad_norm": 0.1927247792482376, + "learning_rate": 6.616948727081262e-06, + "loss": 1.7088, + "step": 27347 + }, + { + "epoch": 8.394106813996316, + "grad_norm": 0.18318399786949158, + "learning_rate": 6.614477796513629e-06, + "loss": 1.7176, + "step": 27348 + }, + { + 
"epoch": 8.394413750767342, + "grad_norm": 0.20923443138599396, + "learning_rate": 6.612007294707162e-06, + "loss": 1.758, + "step": 27349 + }, + { + "epoch": 8.394720687538367, + "grad_norm": 0.20041905343532562, + "learning_rate": 6.609537221686268e-06, + "loss": 1.6843, + "step": 27350 + }, + { + "epoch": 8.395027624309392, + "grad_norm": 0.13480354845523834, + "learning_rate": 6.607067577475362e-06, + "loss": 1.6766, + "step": 27351 + }, + { + "epoch": 8.395334561080418, + "grad_norm": 0.2022085338830948, + "learning_rate": 6.604598362098846e-06, + "loss": 1.7448, + "step": 27352 + }, + { + "epoch": 8.395641497851443, + "grad_norm": 0.21842770278453827, + "learning_rate": 6.602129575581123e-06, + "loss": 1.7202, + "step": 27353 + }, + { + "epoch": 8.395948434622468, + "grad_norm": 0.16519947350025177, + "learning_rate": 6.599661217946596e-06, + "loss": 1.7036, + "step": 27354 + }, + { + "epoch": 8.396255371393494, + "grad_norm": 0.14931483566761017, + "learning_rate": 6.59719328921965e-06, + "loss": 1.7244, + "step": 27355 + }, + { + "epoch": 8.396562308164517, + "grad_norm": 0.22807423770427704, + "learning_rate": 6.594725789424683e-06, + "loss": 1.7758, + "step": 27356 + }, + { + "epoch": 8.396869244935543, + "grad_norm": 0.15723249316215515, + "learning_rate": 6.592258718586075e-06, + "loss": 1.7033, + "step": 27357 + }, + { + "epoch": 8.397176181706568, + "grad_norm": 0.1934487521648407, + "learning_rate": 6.589792076728207e-06, + "loss": 1.7767, + "step": 27358 + }, + { + "epoch": 8.397483118477593, + "grad_norm": 0.16923396289348602, + "learning_rate": 6.587325863875454e-06, + "loss": 1.7125, + "step": 27359 + }, + { + "epoch": 8.397790055248619, + "grad_norm": 0.1533476561307907, + "learning_rate": 6.584860080052196e-06, + "loss": 1.7245, + "step": 27360 + }, + { + "epoch": 8.398096992019644, + "grad_norm": 0.1610613465309143, + "learning_rate": 6.582394725282786e-06, + "loss": 1.6974, + "step": 27361 + }, + { + "epoch": 8.39840392879067, + "grad_norm": 
0.19170965254306793, + "learning_rate": 6.579929799591622e-06, + "loss": 1.6956, + "step": 27362 + }, + { + "epoch": 8.398710865561695, + "grad_norm": 0.17479272186756134, + "learning_rate": 6.5774653030030164e-06, + "loss": 1.699, + "step": 27363 + }, + { + "epoch": 8.39901780233272, + "grad_norm": 0.15651267766952515, + "learning_rate": 6.575001235541378e-06, + "loss": 1.655, + "step": 27364 + }, + { + "epoch": 8.399324739103745, + "grad_norm": 0.13939335942268372, + "learning_rate": 6.572537597230999e-06, + "loss": 1.6963, + "step": 27365 + }, + { + "epoch": 8.399631675874769, + "grad_norm": 0.16157624125480652, + "learning_rate": 6.570074388096275e-06, + "loss": 1.6811, + "step": 27366 + }, + { + "epoch": 8.399938612645794, + "grad_norm": 0.16065873205661774, + "learning_rate": 6.567611608161528e-06, + "loss": 1.7104, + "step": 27367 + }, + { + "epoch": 8.40024554941682, + "grad_norm": 0.1657525599002838, + "learning_rate": 6.565149257451098e-06, + "loss": 1.6884, + "step": 27368 + }, + { + "epoch": 8.400552486187845, + "grad_norm": 0.1757468432188034, + "learning_rate": 6.56268733598932e-06, + "loss": 1.7112, + "step": 27369 + }, + { + "epoch": 8.40085942295887, + "grad_norm": 0.16591452062129974, + "learning_rate": 6.560225843800527e-06, + "loss": 1.7227, + "step": 27370 + }, + { + "epoch": 8.401166359729896, + "grad_norm": 0.12153175473213196, + "learning_rate": 6.557764780909048e-06, + "loss": 1.6843, + "step": 27371 + }, + { + "epoch": 8.401473296500921, + "grad_norm": 0.13953842222690582, + "learning_rate": 6.5553041473391914e-06, + "loss": 1.6518, + "step": 27372 + }, + { + "epoch": 8.401780233271946, + "grad_norm": 0.22707831859588623, + "learning_rate": 6.552843943115289e-06, + "loss": 1.7594, + "step": 27373 + }, + { + "epoch": 8.402087170042972, + "grad_norm": 0.18743011355400085, + "learning_rate": 6.550384168261647e-06, + "loss": 1.705, + "step": 27374 + }, + { + "epoch": 8.402394106813997, + "grad_norm": 0.1784582883119583, + "learning_rate": 
6.547924822802576e-06, + "loss": 1.7861, + "step": 27375 + }, + { + "epoch": 8.402701043585022, + "grad_norm": 0.18942677974700928, + "learning_rate": 6.545465906762377e-06, + "loss": 1.7489, + "step": 27376 + }, + { + "epoch": 8.403007980356048, + "grad_norm": 0.1783999502658844, + "learning_rate": 6.543007420165354e-06, + "loss": 1.7533, + "step": 27377 + }, + { + "epoch": 8.403314917127071, + "grad_norm": 0.1497674137353897, + "learning_rate": 6.540549363035791e-06, + "loss": 1.6768, + "step": 27378 + }, + { + "epoch": 8.403621853898096, + "grad_norm": 0.15912608802318573, + "learning_rate": 6.538091735398016e-06, + "loss": 1.7656, + "step": 27379 + }, + { + "epoch": 8.403928790669122, + "grad_norm": 0.1886531114578247, + "learning_rate": 6.535634537276269e-06, + "loss": 1.7368, + "step": 27380 + }, + { + "epoch": 8.404235727440147, + "grad_norm": 0.1976786106824875, + "learning_rate": 6.5331777686948756e-06, + "loss": 1.7627, + "step": 27381 + }, + { + "epoch": 8.404542664211172, + "grad_norm": 0.1442447006702423, + "learning_rate": 6.5307214296780775e-06, + "loss": 1.6787, + "step": 27382 + }, + { + "epoch": 8.404849600982198, + "grad_norm": 0.21066388487815857, + "learning_rate": 6.528265520250182e-06, + "loss": 1.741, + "step": 27383 + }, + { + "epoch": 8.405156537753223, + "grad_norm": 0.19657589495182037, + "learning_rate": 6.525810040435443e-06, + "loss": 1.74, + "step": 27384 + }, + { + "epoch": 8.405463474524248, + "grad_norm": 0.20377841591835022, + "learning_rate": 6.5233549902581296e-06, + "loss": 1.7086, + "step": 27385 + }, + { + "epoch": 8.405770411295274, + "grad_norm": 0.16641706228256226, + "learning_rate": 6.520900369742505e-06, + "loss": 1.6897, + "step": 27386 + }, + { + "epoch": 8.4060773480663, + "grad_norm": 0.177897647023201, + "learning_rate": 6.518446178912829e-06, + "loss": 1.7781, + "step": 27387 + }, + { + "epoch": 8.406384284837323, + "grad_norm": 0.2529480755329132, + "learning_rate": 6.515992417793354e-06, + "loss": 1.7227, + 
"step": 27388 + }, + { + "epoch": 8.406691221608348, + "grad_norm": 0.17020392417907715, + "learning_rate": 6.513539086408327e-06, + "loss": 1.6836, + "step": 27389 + }, + { + "epoch": 8.406998158379373, + "grad_norm": 0.1621706336736679, + "learning_rate": 6.5110861847819944e-06, + "loss": 1.7263, + "step": 27390 + }, + { + "epoch": 8.407305095150399, + "grad_norm": 0.15788327157497406, + "learning_rate": 6.508633712938594e-06, + "loss": 1.7155, + "step": 27391 + }, + { + "epoch": 8.407612031921424, + "grad_norm": 0.1595151722431183, + "learning_rate": 6.5061816709023724e-06, + "loss": 1.7051, + "step": 27392 + }, + { + "epoch": 8.40791896869245, + "grad_norm": 0.2065821886062622, + "learning_rate": 6.503730058697555e-06, + "loss": 1.7435, + "step": 27393 + }, + { + "epoch": 8.408225905463475, + "grad_norm": 0.18513742089271545, + "learning_rate": 6.501278876348371e-06, + "loss": 1.7976, + "step": 27394 + }, + { + "epoch": 8.4085328422345, + "grad_norm": 0.1819298416376114, + "learning_rate": 6.4988281238790305e-06, + "loss": 1.7656, + "step": 27395 + }, + { + "epoch": 8.408839779005525, + "grad_norm": 0.17593856155872345, + "learning_rate": 6.496377801313791e-06, + "loss": 1.7436, + "step": 27396 + }, + { + "epoch": 8.40914671577655, + "grad_norm": 0.1425786167383194, + "learning_rate": 6.493927908676822e-06, + "loss": 1.7365, + "step": 27397 + }, + { + "epoch": 8.409453652547576, + "grad_norm": 0.1689717322587967, + "learning_rate": 6.491478445992383e-06, + "loss": 1.7116, + "step": 27398 + }, + { + "epoch": 8.4097605893186, + "grad_norm": 0.1530478596687317, + "learning_rate": 6.489029413284631e-06, + "loss": 1.7232, + "step": 27399 + }, + { + "epoch": 8.410067526089625, + "grad_norm": 0.16928789019584656, + "learning_rate": 6.486580810577802e-06, + "loss": 1.713, + "step": 27400 + }, + { + "epoch": 8.41037446286065, + "grad_norm": 0.19086188077926636, + "learning_rate": 6.484132637896085e-06, + "loss": 1.7495, + "step": 27401 + }, + { + "epoch": 
8.410681399631676, + "grad_norm": 0.18510590493679047, + "learning_rate": 6.481684895263679e-06, + "loss": 1.7445, + "step": 27402 + }, + { + "epoch": 8.410988336402701, + "grad_norm": 0.144667387008667, + "learning_rate": 6.479237582704767e-06, + "loss": 1.6994, + "step": 27403 + }, + { + "epoch": 8.411295273173726, + "grad_norm": 0.15467962622642517, + "learning_rate": 6.476790700243535e-06, + "loss": 1.6807, + "step": 27404 + }, + { + "epoch": 8.411602209944752, + "grad_norm": 0.13533028960227966, + "learning_rate": 6.474344247904168e-06, + "loss": 1.6746, + "step": 27405 + }, + { + "epoch": 8.411909146715777, + "grad_norm": 0.13948698341846466, + "learning_rate": 6.471898225710843e-06, + "loss": 1.7072, + "step": 27406 + }, + { + "epoch": 8.412216083486802, + "grad_norm": 0.1758929044008255, + "learning_rate": 6.469452633687734e-06, + "loss": 1.6993, + "step": 27407 + }, + { + "epoch": 8.412523020257828, + "grad_norm": 0.20594100654125214, + "learning_rate": 6.46700747185901e-06, + "loss": 1.7468, + "step": 27408 + }, + { + "epoch": 8.412829957028851, + "grad_norm": 0.18665185570716858, + "learning_rate": 6.464562740248831e-06, + "loss": 1.6829, + "step": 27409 + }, + { + "epoch": 8.413136893799877, + "grad_norm": 0.1637166142463684, + "learning_rate": 6.4621184388813595e-06, + "loss": 1.7118, + "step": 27410 + }, + { + "epoch": 8.413443830570902, + "grad_norm": 0.1653725504875183, + "learning_rate": 6.459674567780749e-06, + "loss": 1.6986, + "step": 27411 + }, + { + "epoch": 8.413750767341927, + "grad_norm": 0.16381777822971344, + "learning_rate": 6.457231126971158e-06, + "loss": 1.7389, + "step": 27412 + }, + { + "epoch": 8.414057704112953, + "grad_norm": 0.14706309139728546, + "learning_rate": 6.454788116476734e-06, + "loss": 1.6629, + "step": 27413 + }, + { + "epoch": 8.414364640883978, + "grad_norm": 0.17818714678287506, + "learning_rate": 6.4523455363215964e-06, + "loss": 1.761, + "step": 27414 + }, + { + "epoch": 8.414671577655003, + "grad_norm": 
0.18425707519054413, + "learning_rate": 6.449903386529932e-06, + "loss": 1.7169, + "step": 27415 + }, + { + "epoch": 8.414978514426029, + "grad_norm": 0.182805597782135, + "learning_rate": 6.4474616671258255e-06, + "loss": 1.6916, + "step": 27416 + }, + { + "epoch": 8.415285451197054, + "grad_norm": 0.1802895963191986, + "learning_rate": 6.4450203781334426e-06, + "loss": 1.7786, + "step": 27417 + }, + { + "epoch": 8.41559238796808, + "grad_norm": 0.18067243695259094, + "learning_rate": 6.442579519576891e-06, + "loss": 1.7489, + "step": 27418 + }, + { + "epoch": 8.415899324739105, + "grad_norm": 0.20373223721981049, + "learning_rate": 6.4401390914803075e-06, + "loss": 1.7519, + "step": 27419 + }, + { + "epoch": 8.416206261510128, + "grad_norm": 0.1414610594511032, + "learning_rate": 6.437699093867794e-06, + "loss": 1.6656, + "step": 27420 + }, + { + "epoch": 8.416513198281153, + "grad_norm": 0.14516517519950867, + "learning_rate": 6.4352595267634706e-06, + "loss": 1.6599, + "step": 27421 + }, + { + "epoch": 8.416820135052179, + "grad_norm": 0.16276796162128448, + "learning_rate": 6.4328203901914465e-06, + "loss": 1.7026, + "step": 27422 + }, + { + "epoch": 8.417127071823204, + "grad_norm": 0.15957671403884888, + "learning_rate": 6.430381684175829e-06, + "loss": 1.7185, + "step": 27423 + }, + { + "epoch": 8.41743400859423, + "grad_norm": 0.1594170182943344, + "learning_rate": 6.4279434087407166e-06, + "loss": 1.7144, + "step": 27424 + }, + { + "epoch": 8.417740945365255, + "grad_norm": 0.14235691726207733, + "learning_rate": 6.425505563910206e-06, + "loss": 1.6487, + "step": 27425 + }, + { + "epoch": 8.41804788213628, + "grad_norm": 0.17203880846500397, + "learning_rate": 6.423068149708389e-06, + "loss": 1.7252, + "step": 27426 + }, + { + "epoch": 8.418354818907305, + "grad_norm": 0.15193019807338715, + "learning_rate": 6.420631166159352e-06, + "loss": 1.7346, + "step": 27427 + }, + { + "epoch": 8.41866175567833, + "grad_norm": 0.17005006968975067, + "learning_rate": 
6.418194613287182e-06, + "loss": 1.7679, + "step": 27428 + }, + { + "epoch": 8.418968692449356, + "grad_norm": 0.15492422878742218, + "learning_rate": 6.415758491115953e-06, + "loss": 1.6962, + "step": 27429 + }, + { + "epoch": 8.419275629220381, + "grad_norm": 0.13465845584869385, + "learning_rate": 6.413322799669752e-06, + "loss": 1.676, + "step": 27430 + }, + { + "epoch": 8.419582565991405, + "grad_norm": 0.20086030662059784, + "learning_rate": 6.410887538972626e-06, + "loss": 1.7341, + "step": 27431 + }, + { + "epoch": 8.41988950276243, + "grad_norm": 0.12862804532051086, + "learning_rate": 6.408452709048679e-06, + "loss": 1.6456, + "step": 27432 + }, + { + "epoch": 8.420196439533456, + "grad_norm": 0.1520070731639862, + "learning_rate": 6.40601830992193e-06, + "loss": 1.7169, + "step": 27433 + }, + { + "epoch": 8.420503376304481, + "grad_norm": 0.15394441783428192, + "learning_rate": 6.4035843416164865e-06, + "loss": 1.6876, + "step": 27434 + }, + { + "epoch": 8.420810313075506, + "grad_norm": 0.15149196982383728, + "learning_rate": 6.4011508041563475e-06, + "loss": 1.7126, + "step": 27435 + }, + { + "epoch": 8.421117249846532, + "grad_norm": 0.14014703035354614, + "learning_rate": 6.398717697565604e-06, + "loss": 1.6554, + "step": 27436 + }, + { + "epoch": 8.421424186617557, + "grad_norm": 0.1493537575006485, + "learning_rate": 6.3962850218682865e-06, + "loss": 1.6915, + "step": 27437 + }, + { + "epoch": 8.421731123388582, + "grad_norm": 0.16197362542152405, + "learning_rate": 6.393852777088438e-06, + "loss": 1.7108, + "step": 27438 + }, + { + "epoch": 8.422038060159608, + "grad_norm": 0.2058446705341339, + "learning_rate": 6.391420963250094e-06, + "loss": 1.806, + "step": 27439 + }, + { + "epoch": 8.422344996930633, + "grad_norm": 0.16983431577682495, + "learning_rate": 6.388989580377291e-06, + "loss": 1.7265, + "step": 27440 + }, + { + "epoch": 8.422651933701658, + "grad_norm": 0.15896758437156677, + "learning_rate": 6.386558628494049e-06, + "loss": 1.7081, 
+ "step": 27441 + }, + { + "epoch": 8.422958870472682, + "grad_norm": 0.15534810721874237, + "learning_rate": 6.384128107624399e-06, + "loss": 1.7218, + "step": 27442 + }, + { + "epoch": 8.423265807243707, + "grad_norm": 0.20577791333198547, + "learning_rate": 6.381698017792365e-06, + "loss": 1.7799, + "step": 27443 + }, + { + "epoch": 8.423572744014733, + "grad_norm": 0.183476984500885, + "learning_rate": 6.37926835902195e-06, + "loss": 1.7432, + "step": 27444 + }, + { + "epoch": 8.423879680785758, + "grad_norm": 0.1834617555141449, + "learning_rate": 6.376839131337175e-06, + "loss": 1.7333, + "step": 27445 + }, + { + "epoch": 8.424186617556783, + "grad_norm": 0.15556102991104126, + "learning_rate": 6.374410334762043e-06, + "loss": 1.7119, + "step": 27446 + }, + { + "epoch": 8.424493554327809, + "grad_norm": 0.14469701051712036, + "learning_rate": 6.3719819693205565e-06, + "loss": 1.6883, + "step": 27447 + }, + { + "epoch": 8.424800491098834, + "grad_norm": 0.1339770257472992, + "learning_rate": 6.369554035036706e-06, + "loss": 1.692, + "step": 27448 + }, + { + "epoch": 8.42510742786986, + "grad_norm": 0.18144701421260834, + "learning_rate": 6.367126531934514e-06, + "loss": 1.7192, + "step": 27449 + }, + { + "epoch": 8.425414364640885, + "grad_norm": 0.20075814425945282, + "learning_rate": 6.364699460037931e-06, + "loss": 1.6681, + "step": 27450 + }, + { + "epoch": 8.42572130141191, + "grad_norm": 0.14828181266784668, + "learning_rate": 6.36227281937099e-06, + "loss": 1.6955, + "step": 27451 + }, + { + "epoch": 8.426028238182933, + "grad_norm": 0.1502649486064911, + "learning_rate": 6.35984660995762e-06, + "loss": 1.6695, + "step": 27452 + }, + { + "epoch": 8.426335174953959, + "grad_norm": 0.16594241559505463, + "learning_rate": 6.3574208318218364e-06, + "loss": 1.7092, + "step": 27453 + }, + { + "epoch": 8.426642111724984, + "grad_norm": 0.2585645020008087, + "learning_rate": 6.354995484987597e-06, + "loss": 1.7358, + "step": 27454 + }, + { + "epoch": 
8.42694904849601, + "grad_norm": 0.1694081574678421, + "learning_rate": 6.352570569478877e-06, + "loss": 1.7421, + "step": 27455 + }, + { + "epoch": 8.427255985267035, + "grad_norm": 0.178135946393013, + "learning_rate": 6.350146085319647e-06, + "loss": 1.7157, + "step": 27456 + }, + { + "epoch": 8.42756292203806, + "grad_norm": 0.19647614657878876, + "learning_rate": 6.347722032533837e-06, + "loss": 1.7843, + "step": 27457 + }, + { + "epoch": 8.427869858809085, + "grad_norm": 0.1510474979877472, + "learning_rate": 6.345298411145434e-06, + "loss": 1.688, + "step": 27458 + }, + { + "epoch": 8.42817679558011, + "grad_norm": 0.2130916565656662, + "learning_rate": 6.342875221178374e-06, + "loss": 1.7817, + "step": 27459 + }, + { + "epoch": 8.428483732351136, + "grad_norm": 0.1456206738948822, + "learning_rate": 6.340452462656615e-06, + "loss": 1.6839, + "step": 27460 + }, + { + "epoch": 8.428790669122161, + "grad_norm": 0.16592659056186676, + "learning_rate": 6.338030135604089e-06, + "loss": 1.7395, + "step": 27461 + }, + { + "epoch": 8.429097605893187, + "grad_norm": 0.15017202496528625, + "learning_rate": 6.335608240044744e-06, + "loss": 1.6815, + "step": 27462 + }, + { + "epoch": 8.42940454266421, + "grad_norm": 0.14279332756996155, + "learning_rate": 6.333186776002514e-06, + "loss": 1.6845, + "step": 27463 + }, + { + "epoch": 8.429711479435236, + "grad_norm": 0.15117228031158447, + "learning_rate": 6.330765743501321e-06, + "loss": 1.7421, + "step": 27464 + }, + { + "epoch": 8.430018416206261, + "grad_norm": 0.19822575151920319, + "learning_rate": 6.328345142565084e-06, + "loss": 1.7297, + "step": 27465 + }, + { + "epoch": 8.430325352977286, + "grad_norm": 0.1589222550392151, + "learning_rate": 6.325924973217762e-06, + "loss": 1.7151, + "step": 27466 + }, + { + "epoch": 8.430632289748312, + "grad_norm": 0.19120970368385315, + "learning_rate": 6.323505235483229e-06, + "loss": 1.7373, + "step": 27467 + }, + { + "epoch": 8.430939226519337, + "grad_norm": 
0.1859981119632721, + "learning_rate": 6.321085929385434e-06, + "loss": 1.6912, + "step": 27468 + }, + { + "epoch": 8.431246163290362, + "grad_norm": 0.1745872050523758, + "learning_rate": 6.318667054948246e-06, + "loss": 1.6773, + "step": 27469 + }, + { + "epoch": 8.431553100061388, + "grad_norm": 0.13402412831783295, + "learning_rate": 6.316248612195607e-06, + "loss": 1.6905, + "step": 27470 + }, + { + "epoch": 8.431860036832413, + "grad_norm": 0.22629496455192566, + "learning_rate": 6.3138306011514045e-06, + "loss": 1.7012, + "step": 27471 + }, + { + "epoch": 8.432166973603438, + "grad_norm": 0.18746718764305115, + "learning_rate": 6.31141302183953e-06, + "loss": 1.7573, + "step": 27472 + }, + { + "epoch": 8.432473910374464, + "grad_norm": 0.18313723802566528, + "learning_rate": 6.308995874283891e-06, + "loss": 1.7358, + "step": 27473 + }, + { + "epoch": 8.432780847145487, + "grad_norm": 0.19075456261634827, + "learning_rate": 6.306579158508341e-06, + "loss": 1.7091, + "step": 27474 + }, + { + "epoch": 8.433087783916513, + "grad_norm": 0.18092980980873108, + "learning_rate": 6.304162874536796e-06, + "loss": 1.6739, + "step": 27475 + }, + { + "epoch": 8.433394720687538, + "grad_norm": 0.15624219179153442, + "learning_rate": 6.301747022393123e-06, + "loss": 1.6637, + "step": 27476 + }, + { + "epoch": 8.433701657458563, + "grad_norm": 0.14825348556041718, + "learning_rate": 6.299331602101199e-06, + "loss": 1.6865, + "step": 27477 + }, + { + "epoch": 8.434008594229589, + "grad_norm": 0.2204820215702057, + "learning_rate": 6.2969166136848946e-06, + "loss": 1.7842, + "step": 27478 + }, + { + "epoch": 8.434315531000614, + "grad_norm": 0.15570053458213806, + "learning_rate": 6.294502057168072e-06, + "loss": 1.69, + "step": 27479 + }, + { + "epoch": 8.43462246777164, + "grad_norm": 0.1686720848083496, + "learning_rate": 6.292087932574603e-06, + "loss": 1.6787, + "step": 27480 + }, + { + "epoch": 8.434929404542665, + "grad_norm": 0.2100359946489334, + "learning_rate": 
6.289674239928334e-06, + "loss": 1.7374, + "step": 27481 + }, + { + "epoch": 8.43523634131369, + "grad_norm": 0.1607038378715515, + "learning_rate": 6.287260979253112e-06, + "loss": 1.7067, + "step": 27482 + }, + { + "epoch": 8.435543278084715, + "grad_norm": 0.153702512383461, + "learning_rate": 6.2848481505728254e-06, + "loss": 1.6762, + "step": 27483 + }, + { + "epoch": 8.43585021485574, + "grad_norm": 0.15967734158039093, + "learning_rate": 6.282435753911264e-06, + "loss": 1.6543, + "step": 27484 + }, + { + "epoch": 8.436157151626764, + "grad_norm": 0.18866287171840668, + "learning_rate": 6.280023789292322e-06, + "loss": 1.7481, + "step": 27485 + }, + { + "epoch": 8.43646408839779, + "grad_norm": 0.13347187638282776, + "learning_rate": 6.277612256739784e-06, + "loss": 1.6398, + "step": 27486 + }, + { + "epoch": 8.436771025168815, + "grad_norm": 0.1626890003681183, + "learning_rate": 6.275201156277521e-06, + "loss": 1.7258, + "step": 27487 + }, + { + "epoch": 8.43707796193984, + "grad_norm": 0.21519014239311218, + "learning_rate": 6.272790487929353e-06, + "loss": 1.7762, + "step": 27488 + }, + { + "epoch": 8.437384898710865, + "grad_norm": 0.1610138863325119, + "learning_rate": 6.2703802517190935e-06, + "loss": 1.6999, + "step": 27489 + }, + { + "epoch": 8.43769183548189, + "grad_norm": 0.20251847803592682, + "learning_rate": 6.267970447670579e-06, + "loss": 1.6953, + "step": 27490 + }, + { + "epoch": 8.437998772252916, + "grad_norm": 0.15717832744121552, + "learning_rate": 6.265561075807591e-06, + "loss": 1.623, + "step": 27491 + }, + { + "epoch": 8.438305709023942, + "grad_norm": 0.1399519294500351, + "learning_rate": 6.2631521361539716e-06, + "loss": 1.693, + "step": 27492 + }, + { + "epoch": 8.438612645794967, + "grad_norm": 0.17747904360294342, + "learning_rate": 6.260743628733517e-06, + "loss": 1.7019, + "step": 27493 + }, + { + "epoch": 8.438919582565992, + "grad_norm": 0.1724942922592163, + "learning_rate": 6.258335553570032e-06, + "loss": 1.6647, + 
"step": 27494 + }, + { + "epoch": 8.439226519337016, + "grad_norm": 0.15294337272644043, + "learning_rate": 6.255927910687315e-06, + "loss": 1.7492, + "step": 27495 + }, + { + "epoch": 8.439533456108041, + "grad_norm": 0.16880661249160767, + "learning_rate": 6.253520700109156e-06, + "loss": 1.731, + "step": 27496 + }, + { + "epoch": 8.439840392879066, + "grad_norm": 0.16098125278949738, + "learning_rate": 6.251113921859347e-06, + "loss": 1.6668, + "step": 27497 + }, + { + "epoch": 8.440147329650092, + "grad_norm": 0.17218537628650665, + "learning_rate": 6.248707575961671e-06, + "loss": 1.6943, + "step": 27498 + }, + { + "epoch": 8.440454266421117, + "grad_norm": 0.19593006372451782, + "learning_rate": 6.2463016624398965e-06, + "loss": 1.7213, + "step": 27499 + }, + { + "epoch": 8.440761203192142, + "grad_norm": 0.15833450853824615, + "learning_rate": 6.243896181317837e-06, + "loss": 1.6787, + "step": 27500 + }, + { + "epoch": 8.441068139963168, + "grad_norm": 0.1378611922264099, + "learning_rate": 6.241491132619226e-06, + "loss": 1.6777, + "step": 27501 + }, + { + "epoch": 8.441375076734193, + "grad_norm": 0.25010615587234497, + "learning_rate": 6.239086516367865e-06, + "loss": 1.7474, + "step": 27502 + }, + { + "epoch": 8.441682013505218, + "grad_norm": 0.1281466782093048, + "learning_rate": 6.236682332587474e-06, + "loss": 1.6946, + "step": 27503 + }, + { + "epoch": 8.441988950276244, + "grad_norm": 0.19045543670654297, + "learning_rate": 6.234278581301855e-06, + "loss": 1.7198, + "step": 27504 + }, + { + "epoch": 8.442295887047269, + "grad_norm": 0.17753495275974274, + "learning_rate": 6.231875262534748e-06, + "loss": 1.7324, + "step": 27505 + }, + { + "epoch": 8.442602823818293, + "grad_norm": 0.14088352024555206, + "learning_rate": 6.229472376309897e-06, + "loss": 1.6683, + "step": 27506 + }, + { + "epoch": 8.442909760589318, + "grad_norm": 0.16781100630760193, + "learning_rate": 6.2270699226510685e-06, + "loss": 1.7271, + "step": 27507 + }, + { + "epoch": 
8.443216697360343, + "grad_norm": 0.1857508271932602, + "learning_rate": 6.224667901581971e-06, + "loss": 1.7596, + "step": 27508 + }, + { + "epoch": 8.443523634131369, + "grad_norm": 0.18411888182163239, + "learning_rate": 6.222266313126374e-06, + "loss": 1.8193, + "step": 27509 + }, + { + "epoch": 8.443830570902394, + "grad_norm": 0.1530957967042923, + "learning_rate": 6.2198651573079965e-06, + "loss": 1.6958, + "step": 27510 + }, + { + "epoch": 8.44413750767342, + "grad_norm": 0.19102713465690613, + "learning_rate": 6.217464434150572e-06, + "loss": 1.7172, + "step": 27511 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 0.16886062920093536, + "learning_rate": 6.215064143677829e-06, + "loss": 1.6811, + "step": 27512 + }, + { + "epoch": 8.44475138121547, + "grad_norm": 0.15974819660186768, + "learning_rate": 6.212664285913483e-06, + "loss": 1.694, + "step": 27513 + }, + { + "epoch": 8.445058317986495, + "grad_norm": 0.19709718227386475, + "learning_rate": 6.2102648608812544e-06, + "loss": 1.7647, + "step": 27514 + }, + { + "epoch": 8.44536525475752, + "grad_norm": 0.15339697897434235, + "learning_rate": 6.207865868604857e-06, + "loss": 1.7169, + "step": 27515 + }, + { + "epoch": 8.445672191528546, + "grad_norm": 0.14088544249534607, + "learning_rate": 6.2054673091079815e-06, + "loss": 1.6902, + "step": 27516 + }, + { + "epoch": 8.44597912829957, + "grad_norm": 0.17412640154361725, + "learning_rate": 6.203069182414367e-06, + "loss": 1.7205, + "step": 27517 + }, + { + "epoch": 8.446286065070595, + "grad_norm": 0.18837641179561615, + "learning_rate": 6.200671488547677e-06, + "loss": 1.7756, + "step": 27518 + }, + { + "epoch": 8.44659300184162, + "grad_norm": 0.18904593586921692, + "learning_rate": 6.198274227531642e-06, + "loss": 1.732, + "step": 27519 + }, + { + "epoch": 8.446899938612646, + "grad_norm": 0.13136132061481476, + "learning_rate": 6.19587739938991e-06, + "loss": 1.6844, + "step": 27520 + }, + { + "epoch": 8.44720687538367, + "grad_norm": 
0.15678717195987701, + "learning_rate": 6.1934810041462066e-06, + "loss": 1.7029, + "step": 27521 + }, + { + "epoch": 8.447513812154696, + "grad_norm": 0.1661362200975418, + "learning_rate": 6.191085041824207e-06, + "loss": 1.6656, + "step": 27522 + }, + { + "epoch": 8.447820748925722, + "grad_norm": 0.1749318689107895, + "learning_rate": 6.188689512447565e-06, + "loss": 1.7412, + "step": 27523 + }, + { + "epoch": 8.448127685696747, + "grad_norm": 0.17242331802845, + "learning_rate": 6.18629441603999e-06, + "loss": 1.7037, + "step": 27524 + }, + { + "epoch": 8.448434622467772, + "grad_norm": 0.16092433035373688, + "learning_rate": 6.183899752625116e-06, + "loss": 1.6817, + "step": 27525 + }, + { + "epoch": 8.448741559238798, + "grad_norm": 0.16177381575107574, + "learning_rate": 6.1815055222266325e-06, + "loss": 1.6678, + "step": 27526 + }, + { + "epoch": 8.449048496009823, + "grad_norm": 0.1489405483007431, + "learning_rate": 6.179111724868197e-06, + "loss": 1.6839, + "step": 27527 + }, + { + "epoch": 8.449355432780846, + "grad_norm": 0.15873265266418457, + "learning_rate": 6.176718360573458e-06, + "loss": 1.6749, + "step": 27528 + }, + { + "epoch": 8.449662369551872, + "grad_norm": 0.17511235177516937, + "learning_rate": 6.174325429366079e-06, + "loss": 1.6962, + "step": 27529 + }, + { + "epoch": 8.449969306322897, + "grad_norm": 0.1452886015176773, + "learning_rate": 6.171932931269702e-06, + "loss": 1.7141, + "step": 27530 + }, + { + "epoch": 8.450276243093922, + "grad_norm": 0.20559509098529816, + "learning_rate": 6.169540866307977e-06, + "loss": 1.7116, + "step": 27531 + }, + { + "epoch": 8.450583179864948, + "grad_norm": 0.17642420530319214, + "learning_rate": 6.167149234504532e-06, + "loss": 1.7209, + "step": 27532 + }, + { + "epoch": 8.450890116635973, + "grad_norm": 0.13833492994308472, + "learning_rate": 6.164758035883001e-06, + "loss": 1.6522, + "step": 27533 + }, + { + "epoch": 8.451197053406998, + "grad_norm": 0.18079428374767303, + "learning_rate": 
6.162367270467045e-06, + "loss": 1.7348, + "step": 27534 + }, + { + "epoch": 8.451503990178024, + "grad_norm": 0.19325628876686096, + "learning_rate": 6.159976938280249e-06, + "loss": 1.6947, + "step": 27535 + }, + { + "epoch": 8.45181092694905, + "grad_norm": 0.17844507098197937, + "learning_rate": 6.15758703934628e-06, + "loss": 1.7206, + "step": 27536 + }, + { + "epoch": 8.452117863720074, + "grad_norm": 0.186324343085289, + "learning_rate": 6.155197573688703e-06, + "loss": 1.743, + "step": 27537 + }, + { + "epoch": 8.452424800491098, + "grad_norm": 0.15700562298297882, + "learning_rate": 6.152808541331184e-06, + "loss": 1.7109, + "step": 27538 + }, + { + "epoch": 8.452731737262123, + "grad_norm": 0.13879023492336273, + "learning_rate": 6.150419942297314e-06, + "loss": 1.6737, + "step": 27539 + }, + { + "epoch": 8.453038674033149, + "grad_norm": 0.14589501917362213, + "learning_rate": 6.148031776610675e-06, + "loss": 1.6884, + "step": 27540 + }, + { + "epoch": 8.453345610804174, + "grad_norm": 0.14402590692043304, + "learning_rate": 6.1456440442949125e-06, + "loss": 1.6949, + "step": 27541 + }, + { + "epoch": 8.4536525475752, + "grad_norm": 0.16506166756153107, + "learning_rate": 6.143256745373571e-06, + "loss": 1.725, + "step": 27542 + }, + { + "epoch": 8.453959484346225, + "grad_norm": 0.15663643181324005, + "learning_rate": 6.140869879870287e-06, + "loss": 1.7069, + "step": 27543 + }, + { + "epoch": 8.45426642111725, + "grad_norm": 0.16058720648288727, + "learning_rate": 6.138483447808635e-06, + "loss": 1.7264, + "step": 27544 + }, + { + "epoch": 8.454573357888275, + "grad_norm": 0.23160551488399506, + "learning_rate": 6.136097449212197e-06, + "loss": 1.7573, + "step": 27545 + }, + { + "epoch": 8.4548802946593, + "grad_norm": 0.15130533277988434, + "learning_rate": 6.133711884104554e-06, + "loss": 1.705, + "step": 27546 + }, + { + "epoch": 8.455187231430326, + "grad_norm": 0.16825515031814575, + "learning_rate": 6.131326752509281e-06, + "loss": 1.7405, + 
"step": 27547 + }, + { + "epoch": 8.455494168201351, + "grad_norm": 0.19265486299991608, + "learning_rate": 6.128942054449943e-06, + "loss": 1.7026, + "step": 27548 + }, + { + "epoch": 8.455801104972375, + "grad_norm": 0.18873640894889832, + "learning_rate": 6.126557789950121e-06, + "loss": 1.6825, + "step": 27549 + }, + { + "epoch": 8.4561080417434, + "grad_norm": 0.13833044469356537, + "learning_rate": 6.124173959033358e-06, + "loss": 1.6589, + "step": 27550 + }, + { + "epoch": 8.456414978514426, + "grad_norm": 0.16894219815731049, + "learning_rate": 6.1217905617232394e-06, + "loss": 1.7781, + "step": 27551 + }, + { + "epoch": 8.456721915285451, + "grad_norm": 0.18338344991207123, + "learning_rate": 6.119407598043292e-06, + "loss": 1.7348, + "step": 27552 + }, + { + "epoch": 8.457028852056476, + "grad_norm": 0.17766039073467255, + "learning_rate": 6.117025068017096e-06, + "loss": 1.7126, + "step": 27553 + }, + { + "epoch": 8.457335788827502, + "grad_norm": 0.18717309832572937, + "learning_rate": 6.114642971668155e-06, + "loss": 1.7193, + "step": 27554 + }, + { + "epoch": 8.457642725598527, + "grad_norm": 0.15229196846485138, + "learning_rate": 6.112261309020045e-06, + "loss": 1.665, + "step": 27555 + }, + { + "epoch": 8.457949662369552, + "grad_norm": 0.15391093492507935, + "learning_rate": 6.109880080096303e-06, + "loss": 1.6813, + "step": 27556 + }, + { + "epoch": 8.458256599140578, + "grad_norm": 0.1363036334514618, + "learning_rate": 6.107499284920432e-06, + "loss": 1.6912, + "step": 27557 + }, + { + "epoch": 8.458563535911603, + "grad_norm": 0.15193909406661987, + "learning_rate": 6.105118923516001e-06, + "loss": 1.7219, + "step": 27558 + }, + { + "epoch": 8.458870472682626, + "grad_norm": 0.1312003880739212, + "learning_rate": 6.102738995906487e-06, + "loss": 1.7317, + "step": 27559 + }, + { + "epoch": 8.459177409453652, + "grad_norm": 0.12835659086704254, + "learning_rate": 6.100359502115449e-06, + "loss": 1.6556, + "step": 27560 + }, + { + "epoch": 
8.459484346224677, + "grad_norm": 0.17296236753463745, + "learning_rate": 6.09798044216639e-06, + "loss": 1.7331, + "step": 27561 + }, + { + "epoch": 8.459791282995702, + "grad_norm": 0.1607210338115692, + "learning_rate": 6.095601816082819e-06, + "loss": 1.7297, + "step": 27562 + }, + { + "epoch": 8.460098219766728, + "grad_norm": 0.1841181367635727, + "learning_rate": 6.093223623888245e-06, + "loss": 1.7382, + "step": 27563 + }, + { + "epoch": 8.460405156537753, + "grad_norm": 0.15751226246356964, + "learning_rate": 6.090845865606165e-06, + "loss": 1.6952, + "step": 27564 + }, + { + "epoch": 8.460712093308778, + "grad_norm": 0.15703023970127106, + "learning_rate": 6.0884685412600835e-06, + "loss": 1.7476, + "step": 27565 + }, + { + "epoch": 8.461019030079804, + "grad_norm": 0.17819096148014069, + "learning_rate": 6.0860916508734985e-06, + "loss": 1.7761, + "step": 27566 + }, + { + "epoch": 8.46132596685083, + "grad_norm": 0.168768510222435, + "learning_rate": 6.08371519446988e-06, + "loss": 1.7534, + "step": 27567 + }, + { + "epoch": 8.461632903621854, + "grad_norm": 0.1577196717262268, + "learning_rate": 6.081339172072747e-06, + "loss": 1.6533, + "step": 27568 + }, + { + "epoch": 8.46193984039288, + "grad_norm": 0.19285355508327484, + "learning_rate": 6.078963583705544e-06, + "loss": 1.7127, + "step": 27569 + }, + { + "epoch": 8.462246777163903, + "grad_norm": 0.15905390679836273, + "learning_rate": 6.076588429391788e-06, + "loss": 1.6851, + "step": 27570 + }, + { + "epoch": 8.462553713934929, + "grad_norm": 0.14860354363918304, + "learning_rate": 6.074213709154908e-06, + "loss": 1.7016, + "step": 27571 + }, + { + "epoch": 8.462860650705954, + "grad_norm": 0.2003553956747055, + "learning_rate": 6.0718394230184e-06, + "loss": 1.819, + "step": 27572 + }, + { + "epoch": 8.46316758747698, + "grad_norm": 0.1739475131034851, + "learning_rate": 6.069465571005733e-06, + "loss": 1.7539, + "step": 27573 + }, + { + "epoch": 8.463474524248005, + "grad_norm": 
0.20145776867866516, + "learning_rate": 6.067092153140341e-06, + "loss": 1.7472, + "step": 27574 + }, + { + "epoch": 8.46378146101903, + "grad_norm": 0.2065812349319458, + "learning_rate": 6.06471916944571e-06, + "loss": 1.7871, + "step": 27575 + }, + { + "epoch": 8.464088397790055, + "grad_norm": 0.16987882554531097, + "learning_rate": 6.0623466199452585e-06, + "loss": 1.7299, + "step": 27576 + }, + { + "epoch": 8.46439533456108, + "grad_norm": 0.1477213054895401, + "learning_rate": 6.059974504662458e-06, + "loss": 1.6829, + "step": 27577 + }, + { + "epoch": 8.464702271332106, + "grad_norm": 0.16443482041358948, + "learning_rate": 6.05760282362074e-06, + "loss": 1.7352, + "step": 27578 + }, + { + "epoch": 8.465009208103131, + "grad_norm": 0.15927115082740784, + "learning_rate": 6.055231576843551e-06, + "loss": 1.7175, + "step": 27579 + }, + { + "epoch": 8.465316144874157, + "grad_norm": 0.17477387189865112, + "learning_rate": 6.052860764354318e-06, + "loss": 1.6609, + "step": 27580 + }, + { + "epoch": 8.46562308164518, + "grad_norm": 0.22039631009101868, + "learning_rate": 6.050490386176477e-06, + "loss": 1.7664, + "step": 27581 + }, + { + "epoch": 8.465930018416206, + "grad_norm": 0.1699618101119995, + "learning_rate": 6.048120442333449e-06, + "loss": 1.7231, + "step": 27582 + }, + { + "epoch": 8.466236955187231, + "grad_norm": 0.1548585742712021, + "learning_rate": 6.045750932848654e-06, + "loss": 1.7503, + "step": 27583 + }, + { + "epoch": 8.466543891958256, + "grad_norm": 0.17046836018562317, + "learning_rate": 6.043381857745506e-06, + "loss": 1.6993, + "step": 27584 + }, + { + "epoch": 8.466850828729282, + "grad_norm": 0.1857844740152359, + "learning_rate": 6.041013217047431e-06, + "loss": 1.7132, + "step": 27585 + }, + { + "epoch": 8.467157765500307, + "grad_norm": 0.15656128525733948, + "learning_rate": 6.0386450107778105e-06, + "loss": 1.6713, + "step": 27586 + }, + { + "epoch": 8.467464702271332, + "grad_norm": 0.20369650423526764, + "learning_rate": 
6.036277238960092e-06, + "loss": 1.7296, + "step": 27587 + }, + { + "epoch": 8.467771639042358, + "grad_norm": 0.15926989912986755, + "learning_rate": 6.0339099016176295e-06, + "loss": 1.6766, + "step": 27588 + }, + { + "epoch": 8.468078575813383, + "grad_norm": 0.16353332996368408, + "learning_rate": 6.0315429987738596e-06, + "loss": 1.7084, + "step": 27589 + }, + { + "epoch": 8.468385512584408, + "grad_norm": 0.16328907012939453, + "learning_rate": 6.029176530452141e-06, + "loss": 1.715, + "step": 27590 + }, + { + "epoch": 8.468692449355434, + "grad_norm": 0.20153367519378662, + "learning_rate": 6.026810496675861e-06, + "loss": 1.7363, + "step": 27591 + }, + { + "epoch": 8.468999386126457, + "grad_norm": 0.1374381184577942, + "learning_rate": 6.024444897468435e-06, + "loss": 1.6633, + "step": 27592 + }, + { + "epoch": 8.469306322897483, + "grad_norm": 0.20331406593322754, + "learning_rate": 6.022079732853198e-06, + "loss": 1.7544, + "step": 27593 + }, + { + "epoch": 8.469613259668508, + "grad_norm": 0.18052712082862854, + "learning_rate": 6.019715002853554e-06, + "loss": 1.7032, + "step": 27594 + }, + { + "epoch": 8.469920196439533, + "grad_norm": 0.18305034935474396, + "learning_rate": 6.017350707492863e-06, + "loss": 1.7249, + "step": 27595 + }, + { + "epoch": 8.470227133210559, + "grad_norm": 0.1608239710330963, + "learning_rate": 6.014986846794496e-06, + "loss": 1.7049, + "step": 27596 + }, + { + "epoch": 8.470534069981584, + "grad_norm": 0.16582928597927094, + "learning_rate": 6.012623420781804e-06, + "loss": 1.6777, + "step": 27597 + }, + { + "epoch": 8.47084100675261, + "grad_norm": 0.18023556470870972, + "learning_rate": 6.010260429478154e-06, + "loss": 1.6996, + "step": 27598 + }, + { + "epoch": 8.471147943523635, + "grad_norm": 0.1994815319776535, + "learning_rate": 6.007897872906892e-06, + "loss": 1.7455, + "step": 27599 + }, + { + "epoch": 8.47145488029466, + "grad_norm": 0.17772625386714935, + "learning_rate": 6.005535751091368e-06, + "loss": 1.7431, 
+ "step": 27600 + }, + { + "epoch": 8.471761817065685, + "grad_norm": 0.17297807335853577, + "learning_rate": 6.003174064054929e-06, + "loss": 1.7087, + "step": 27601 + }, + { + "epoch": 8.472068753836709, + "grad_norm": 0.14986321330070496, + "learning_rate": 6.000812811820905e-06, + "loss": 1.681, + "step": 27602 + }, + { + "epoch": 8.472375690607734, + "grad_norm": 0.17512932419776917, + "learning_rate": 5.998451994412629e-06, + "loss": 1.7669, + "step": 27603 + }, + { + "epoch": 8.47268262737876, + "grad_norm": 0.18424493074417114, + "learning_rate": 5.996091611853466e-06, + "loss": 1.7296, + "step": 27604 + }, + { + "epoch": 8.472989564149785, + "grad_norm": 0.1246834322810173, + "learning_rate": 5.9937316641666906e-06, + "loss": 1.6747, + "step": 27605 + }, + { + "epoch": 8.47329650092081, + "grad_norm": 0.14435335993766785, + "learning_rate": 5.991372151375674e-06, + "loss": 1.6225, + "step": 27606 + }, + { + "epoch": 8.473603437691835, + "grad_norm": 0.16726957261562347, + "learning_rate": 5.989013073503702e-06, + "loss": 1.7052, + "step": 27607 + }, + { + "epoch": 8.47391037446286, + "grad_norm": 0.15307356417179108, + "learning_rate": 5.98665443057409e-06, + "loss": 1.7199, + "step": 27608 + }, + { + "epoch": 8.474217311233886, + "grad_norm": 0.14373189210891724, + "learning_rate": 5.984296222610175e-06, + "loss": 1.6808, + "step": 27609 + }, + { + "epoch": 8.474524248004911, + "grad_norm": 0.13142740726470947, + "learning_rate": 5.981938449635222e-06, + "loss": 1.6868, + "step": 27610 + }, + { + "epoch": 8.474831184775937, + "grad_norm": 0.13838545978069305, + "learning_rate": 5.979581111672572e-06, + "loss": 1.6723, + "step": 27611 + }, + { + "epoch": 8.475138121546962, + "grad_norm": 0.15346096456050873, + "learning_rate": 5.977224208745485e-06, + "loss": 1.7066, + "step": 27612 + }, + { + "epoch": 8.475445058317986, + "grad_norm": 0.127261221408844, + "learning_rate": 5.974867740877283e-06, + "loss": 1.6285, + "step": 27613 + }, + { + "epoch": 
8.475751995089011, + "grad_norm": 0.12636838853359222, + "learning_rate": 5.972511708091239e-06, + "loss": 1.6707, + "step": 27614 + }, + { + "epoch": 8.476058931860036, + "grad_norm": 0.22297553718090057, + "learning_rate": 5.970156110410641e-06, + "loss": 1.693, + "step": 27615 + }, + { + "epoch": 8.476365868631062, + "grad_norm": 0.21933813393115997, + "learning_rate": 5.967800947858765e-06, + "loss": 1.7622, + "step": 27616 + }, + { + "epoch": 8.476672805402087, + "grad_norm": 0.19202767312526703, + "learning_rate": 5.965446220458887e-06, + "loss": 1.723, + "step": 27617 + }, + { + "epoch": 8.476979742173112, + "grad_norm": 0.13845433294773102, + "learning_rate": 5.963091928234283e-06, + "loss": 1.6824, + "step": 27618 + }, + { + "epoch": 8.477286678944138, + "grad_norm": 0.1829427033662796, + "learning_rate": 5.960738071208211e-06, + "loss": 1.7441, + "step": 27619 + }, + { + "epoch": 8.477593615715163, + "grad_norm": 0.17720428109169006, + "learning_rate": 5.958384649403931e-06, + "loss": 1.7108, + "step": 27620 + }, + { + "epoch": 8.477900552486188, + "grad_norm": 0.12632785737514496, + "learning_rate": 5.95603166284473e-06, + "loss": 1.6762, + "step": 27621 + }, + { + "epoch": 8.478207489257214, + "grad_norm": 0.15774594247341156, + "learning_rate": 5.953679111553812e-06, + "loss": 1.7076, + "step": 27622 + }, + { + "epoch": 8.478514426028239, + "grad_norm": 0.16115643084049225, + "learning_rate": 5.9513269955544795e-06, + "loss": 1.757, + "step": 27623 + }, + { + "epoch": 8.478821362799263, + "grad_norm": 0.13887029886245728, + "learning_rate": 5.948975314869937e-06, + "loss": 1.7462, + "step": 27624 + }, + { + "epoch": 8.479128299570288, + "grad_norm": 0.1517426073551178, + "learning_rate": 5.946624069523432e-06, + "loss": 1.6912, + "step": 27625 + }, + { + "epoch": 8.479435236341313, + "grad_norm": 0.15509237349033356, + "learning_rate": 5.94427325953823e-06, + "loss": 1.7022, + "step": 27626 + }, + { + "epoch": 8.479742173112339, + "grad_norm": 
0.1656811237335205, + "learning_rate": 5.9419228849375175e-06, + "loss": 1.713, + "step": 27627 + }, + { + "epoch": 8.480049109883364, + "grad_norm": 0.2257215678691864, + "learning_rate": 5.93957294574457e-06, + "loss": 1.7452, + "step": 27628 + }, + { + "epoch": 8.48035604665439, + "grad_norm": 0.15382499992847443, + "learning_rate": 5.9372234419825645e-06, + "loss": 1.7056, + "step": 27629 + }, + { + "epoch": 8.480662983425415, + "grad_norm": 0.1773097813129425, + "learning_rate": 5.934874373674754e-06, + "loss": 1.7161, + "step": 27630 + }, + { + "epoch": 8.48096992019644, + "grad_norm": 0.16455380618572235, + "learning_rate": 5.932525740844341e-06, + "loss": 1.7454, + "step": 27631 + }, + { + "epoch": 8.481276856967465, + "grad_norm": 0.15213815867900848, + "learning_rate": 5.930177543514542e-06, + "loss": 1.7049, + "step": 27632 + }, + { + "epoch": 8.48158379373849, + "grad_norm": 0.17395392060279846, + "learning_rate": 5.927829781708555e-06, + "loss": 1.7026, + "step": 27633 + }, + { + "epoch": 8.481890730509516, + "grad_norm": 0.18553678691387177, + "learning_rate": 5.925482455449588e-06, + "loss": 1.7437, + "step": 27634 + }, + { + "epoch": 8.48219766728054, + "grad_norm": 0.15735404193401337, + "learning_rate": 5.9231355647608346e-06, + "loss": 1.7171, + "step": 27635 + }, + { + "epoch": 8.482504604051565, + "grad_norm": 0.14466318488121033, + "learning_rate": 5.920789109665487e-06, + "loss": 1.6698, + "step": 27636 + }, + { + "epoch": 8.48281154082259, + "grad_norm": 0.159750834107399, + "learning_rate": 5.918443090186732e-06, + "loss": 1.7045, + "step": 27637 + }, + { + "epoch": 8.483118477593615, + "grad_norm": 0.14026959240436554, + "learning_rate": 5.916097506347773e-06, + "loss": 1.6751, + "step": 27638 + }, + { + "epoch": 8.48342541436464, + "grad_norm": 0.18119752407073975, + "learning_rate": 5.913752358171765e-06, + "loss": 1.7768, + "step": 27639 + }, + { + "epoch": 8.483732351135666, + "grad_norm": 0.20957626402378082, + "learning_rate": 
5.91140764568191e-06, + "loss": 1.72, + "step": 27640 + }, + { + "epoch": 8.484039287906691, + "grad_norm": 0.1649177372455597, + "learning_rate": 5.909063368901357e-06, + "loss": 1.6938, + "step": 27641 + }, + { + "epoch": 8.484346224677717, + "grad_norm": 0.17464084923267365, + "learning_rate": 5.906719527853271e-06, + "loss": 1.7369, + "step": 27642 + }, + { + "epoch": 8.484653161448742, + "grad_norm": 0.14213840663433075, + "learning_rate": 5.90437612256085e-06, + "loss": 1.6985, + "step": 27643 + }, + { + "epoch": 8.484960098219767, + "grad_norm": 0.2008642852306366, + "learning_rate": 5.902033153047209e-06, + "loss": 1.7394, + "step": 27644 + }, + { + "epoch": 8.485267034990791, + "grad_norm": 0.15051651000976562, + "learning_rate": 5.899690619335541e-06, + "loss": 1.6729, + "step": 27645 + }, + { + "epoch": 8.485573971761816, + "grad_norm": 0.17977653443813324, + "learning_rate": 5.897348521448958e-06, + "loss": 1.7501, + "step": 27646 + }, + { + "epoch": 8.485880908532842, + "grad_norm": 0.2593468427658081, + "learning_rate": 5.89500685941064e-06, + "loss": 1.7174, + "step": 27647 + }, + { + "epoch": 8.486187845303867, + "grad_norm": 0.23924550414085388, + "learning_rate": 5.8926656332437105e-06, + "loss": 1.7383, + "step": 27648 + }, + { + "epoch": 8.486494782074892, + "grad_norm": 0.1751977503299713, + "learning_rate": 5.8903248429713124e-06, + "loss": 1.7024, + "step": 27649 + }, + { + "epoch": 8.486801718845918, + "grad_norm": 0.21737132966518402, + "learning_rate": 5.887984488616582e-06, + "loss": 1.7214, + "step": 27650 + }, + { + "epoch": 8.487108655616943, + "grad_norm": 0.2042747437953949, + "learning_rate": 5.885644570202636e-06, + "loss": 1.7126, + "step": 27651 + }, + { + "epoch": 8.487415592387968, + "grad_norm": 0.14556188881397247, + "learning_rate": 5.883305087752611e-06, + "loss": 1.6919, + "step": 27652 + }, + { + "epoch": 8.487722529158994, + "grad_norm": 0.210098534822464, + "learning_rate": 5.880966041289626e-06, + "loss": 1.6728, + 
"step": 27653 + }, + { + "epoch": 8.488029465930019, + "grad_norm": 0.26891016960144043, + "learning_rate": 5.878627430836781e-06, + "loss": 1.7356, + "step": 27654 + }, + { + "epoch": 8.488336402701044, + "grad_norm": 0.13008984923362732, + "learning_rate": 5.876289256417217e-06, + "loss": 1.6685, + "step": 27655 + }, + { + "epoch": 8.488643339472068, + "grad_norm": 0.2077993005514145, + "learning_rate": 5.873951518054005e-06, + "loss": 1.6983, + "step": 27656 + }, + { + "epoch": 8.488950276243093, + "grad_norm": 0.19198927283287048, + "learning_rate": 5.871614215770294e-06, + "loss": 1.6703, + "step": 27657 + }, + { + "epoch": 8.489257213014119, + "grad_norm": 0.18122628331184387, + "learning_rate": 5.869277349589137e-06, + "loss": 1.8012, + "step": 27658 + }, + { + "epoch": 8.489564149785144, + "grad_norm": 0.2359529435634613, + "learning_rate": 5.866940919533642e-06, + "loss": 1.7194, + "step": 27659 + }, + { + "epoch": 8.48987108655617, + "grad_norm": 0.15916365385055542, + "learning_rate": 5.864604925626921e-06, + "loss": 1.6929, + "step": 27660 + }, + { + "epoch": 8.490178023327195, + "grad_norm": 0.16607709228992462, + "learning_rate": 5.862269367892026e-06, + "loss": 1.7001, + "step": 27661 + }, + { + "epoch": 8.49048496009822, + "grad_norm": 0.17609505355358124, + "learning_rate": 5.859934246352072e-06, + "loss": 1.736, + "step": 27662 + }, + { + "epoch": 8.490791896869245, + "grad_norm": 0.17898498475551605, + "learning_rate": 5.857599561030103e-06, + "loss": 1.7397, + "step": 27663 + }, + { + "epoch": 8.49109883364027, + "grad_norm": 0.17502975463867188, + "learning_rate": 5.855265311949215e-06, + "loss": 1.6874, + "step": 27664 + }, + { + "epoch": 8.491405770411296, + "grad_norm": 0.16041016578674316, + "learning_rate": 5.852931499132469e-06, + "loss": 1.7494, + "step": 27665 + }, + { + "epoch": 8.491712707182321, + "grad_norm": 0.12939618527889252, + "learning_rate": 5.850598122602929e-06, + "loss": 1.6397, + "step": 27666 + }, + { + "epoch": 
8.492019643953345, + "grad_norm": 0.1685323715209961, + "learning_rate": 5.848265182383656e-06, + "loss": 1.7465, + "step": 27667 + }, + { + "epoch": 8.49232658072437, + "grad_norm": 0.14007940888404846, + "learning_rate": 5.845932678497707e-06, + "loss": 1.6718, + "step": 27668 + }, + { + "epoch": 8.492633517495396, + "grad_norm": 0.14807704091072083, + "learning_rate": 5.843600610968125e-06, + "loss": 1.6858, + "step": 27669 + }, + { + "epoch": 8.49294045426642, + "grad_norm": 0.14770758152008057, + "learning_rate": 5.841268979817965e-06, + "loss": 1.6655, + "step": 27670 + }, + { + "epoch": 8.493247391037446, + "grad_norm": 0.13218273222446442, + "learning_rate": 5.838937785070258e-06, + "loss": 1.7132, + "step": 27671 + }, + { + "epoch": 8.493554327808472, + "grad_norm": 0.1349583864212036, + "learning_rate": 5.836607026748076e-06, + "loss": 1.6704, + "step": 27672 + }, + { + "epoch": 8.493861264579497, + "grad_norm": 0.22880202531814575, + "learning_rate": 5.834276704874403e-06, + "loss": 1.7297, + "step": 27673 + }, + { + "epoch": 8.494168201350522, + "grad_norm": 0.17375829815864563, + "learning_rate": 5.831946819472317e-06, + "loss": 1.6857, + "step": 27674 + }, + { + "epoch": 8.494475138121548, + "grad_norm": 0.15201902389526367, + "learning_rate": 5.829617370564805e-06, + "loss": 1.7148, + "step": 27675 + }, + { + "epoch": 8.494782074892573, + "grad_norm": 0.1489444226026535, + "learning_rate": 5.827288358174898e-06, + "loss": 1.7477, + "step": 27676 + }, + { + "epoch": 8.495089011663598, + "grad_norm": 0.1331137716770172, + "learning_rate": 5.824959782325634e-06, + "loss": 1.7282, + "step": 27677 + }, + { + "epoch": 8.495395948434622, + "grad_norm": 0.1779918074607849, + "learning_rate": 5.822631643039994e-06, + "loss": 1.6677, + "step": 27678 + }, + { + "epoch": 8.495702885205647, + "grad_norm": 0.17707432806491852, + "learning_rate": 5.820303940341021e-06, + "loss": 1.7627, + "step": 27679 + }, + { + "epoch": 8.496009821976672, + "grad_norm": 
0.19686660170555115, + "learning_rate": 5.817976674251674e-06, + "loss": 1.8057, + "step": 27680 + }, + { + "epoch": 8.496316758747698, + "grad_norm": 0.17378473281860352, + "learning_rate": 5.81564984479499e-06, + "loss": 1.763, + "step": 27681 + }, + { + "epoch": 8.496623695518723, + "grad_norm": 0.13753214478492737, + "learning_rate": 5.813323451993952e-06, + "loss": 1.6567, + "step": 27682 + }, + { + "epoch": 8.496930632289748, + "grad_norm": 0.19319739937782288, + "learning_rate": 5.810997495871551e-06, + "loss": 1.7447, + "step": 27683 + }, + { + "epoch": 8.497237569060774, + "grad_norm": 0.1459372490644455, + "learning_rate": 5.808671976450775e-06, + "loss": 1.6978, + "step": 27684 + }, + { + "epoch": 8.497544505831799, + "grad_norm": 0.1829099804162979, + "learning_rate": 5.806346893754599e-06, + "loss": 1.7399, + "step": 27685 + }, + { + "epoch": 8.497851442602824, + "grad_norm": 0.14952246844768524, + "learning_rate": 5.804022247806007e-06, + "loss": 1.683, + "step": 27686 + }, + { + "epoch": 8.49815837937385, + "grad_norm": 0.14325882494449615, + "learning_rate": 5.801698038627973e-06, + "loss": 1.689, + "step": 27687 + }, + { + "epoch": 8.498465316144873, + "grad_norm": 0.17999286949634552, + "learning_rate": 5.799374266243451e-06, + "loss": 1.7358, + "step": 27688 + }, + { + "epoch": 8.498772252915899, + "grad_norm": 0.17262579500675201, + "learning_rate": 5.797050930675441e-06, + "loss": 1.7249, + "step": 27689 + }, + { + "epoch": 8.499079189686924, + "grad_norm": 0.17032817006111145, + "learning_rate": 5.794728031946861e-06, + "loss": 1.7124, + "step": 27690 + }, + { + "epoch": 8.49938612645795, + "grad_norm": 0.16629208624362946, + "learning_rate": 5.7924055700807115e-06, + "loss": 1.6981, + "step": 27691 + }, + { + "epoch": 8.499693063228975, + "grad_norm": 0.19601507484912872, + "learning_rate": 5.7900835450999115e-06, + "loss": 1.6582, + "step": 27692 + }, + { + "epoch": 8.5, + "grad_norm": 0.2122369408607483, + "learning_rate": 
5.787761957027405e-06, + "loss": 1.7509, + "step": 27693 + }, + { + "epoch": 8.500306936771025, + "grad_norm": 0.16086016595363617, + "learning_rate": 5.785440805886166e-06, + "loss": 1.7011, + "step": 27694 + }, + { + "epoch": 8.50061387354205, + "grad_norm": 0.15793873369693756, + "learning_rate": 5.783120091699101e-06, + "loss": 1.6879, + "step": 27695 + }, + { + "epoch": 8.500920810313076, + "grad_norm": 0.15392783284187317, + "learning_rate": 5.7807998144891735e-06, + "loss": 1.6973, + "step": 27696 + }, + { + "epoch": 8.501227747084101, + "grad_norm": 0.17782802879810333, + "learning_rate": 5.778479974279288e-06, + "loss": 1.7319, + "step": 27697 + }, + { + "epoch": 8.501534683855127, + "grad_norm": 0.139020636677742, + "learning_rate": 5.776160571092387e-06, + "loss": 1.6655, + "step": 27698 + }, + { + "epoch": 8.50184162062615, + "grad_norm": 0.1582586020231247, + "learning_rate": 5.773841604951391e-06, + "loss": 1.7134, + "step": 27699 + }, + { + "epoch": 8.502148557397176, + "grad_norm": 0.1685703545808792, + "learning_rate": 5.77152307587921e-06, + "loss": 1.7504, + "step": 27700 + }, + { + "epoch": 8.502455494168201, + "grad_norm": 0.15043340623378754, + "learning_rate": 5.769204983898763e-06, + "loss": 1.6837, + "step": 27701 + }, + { + "epoch": 8.502762430939226, + "grad_norm": 0.18134978413581848, + "learning_rate": 5.7668873290329605e-06, + "loss": 1.7698, + "step": 27702 + }, + { + "epoch": 8.503069367710252, + "grad_norm": 0.18589314818382263, + "learning_rate": 5.764570111304696e-06, + "loss": 1.7565, + "step": 27703 + }, + { + "epoch": 8.503376304481277, + "grad_norm": 0.17075087130069733, + "learning_rate": 5.762253330736883e-06, + "loss": 1.6888, + "step": 27704 + }, + { + "epoch": 8.503683241252302, + "grad_norm": 0.13238663971424103, + "learning_rate": 5.759936987352399e-06, + "loss": 1.6708, + "step": 27705 + }, + { + "epoch": 8.503990178023328, + "grad_norm": 0.1714777648448944, + "learning_rate": 5.75762108117417e-06, + "loss": 1.6934, + 
"step": 27706 + }, + { + "epoch": 8.504297114794353, + "grad_norm": 0.13476133346557617, + "learning_rate": 5.755305612225037e-06, + "loss": 1.707, + "step": 27707 + }, + { + "epoch": 8.504604051565378, + "grad_norm": 0.1355150043964386, + "learning_rate": 5.7529905805279285e-06, + "loss": 1.695, + "step": 27708 + }, + { + "epoch": 8.504910988336402, + "grad_norm": 0.15239351987838745, + "learning_rate": 5.750675986105686e-06, + "loss": 1.7146, + "step": 27709 + }, + { + "epoch": 8.505217925107427, + "grad_norm": 0.1348891258239746, + "learning_rate": 5.748361828981197e-06, + "loss": 1.7087, + "step": 27710 + }, + { + "epoch": 8.505524861878452, + "grad_norm": 0.1657278686761856, + "learning_rate": 5.746048109177349e-06, + "loss": 1.7222, + "step": 27711 + }, + { + "epoch": 8.505831798649478, + "grad_norm": 0.17044055461883545, + "learning_rate": 5.743734826716967e-06, + "loss": 1.7917, + "step": 27712 + }, + { + "epoch": 8.506138735420503, + "grad_norm": 0.13258327543735504, + "learning_rate": 5.741421981622963e-06, + "loss": 1.6859, + "step": 27713 + }, + { + "epoch": 8.506445672191528, + "grad_norm": 0.13243085145950317, + "learning_rate": 5.7391095739181495e-06, + "loss": 1.6832, + "step": 27714 + }, + { + "epoch": 8.506752608962554, + "grad_norm": 0.14863869547843933, + "learning_rate": 5.736797603625405e-06, + "loss": 1.6961, + "step": 27715 + }, + { + "epoch": 8.50705954573358, + "grad_norm": 0.13942895829677582, + "learning_rate": 5.73448607076757e-06, + "loss": 1.6847, + "step": 27716 + }, + { + "epoch": 8.507366482504604, + "grad_norm": 0.13684460520744324, + "learning_rate": 5.732174975367482e-06, + "loss": 1.6888, + "step": 27717 + }, + { + "epoch": 8.50767341927563, + "grad_norm": 0.1887209117412567, + "learning_rate": 5.7298643174479974e-06, + "loss": 1.7091, + "step": 27718 + }, + { + "epoch": 8.507980356046655, + "grad_norm": 0.17502547800540924, + "learning_rate": 5.727554097031934e-06, + "loss": 1.7103, + "step": 27719 + }, + { + "epoch": 
8.50828729281768, + "grad_norm": 0.17275308072566986, + "learning_rate": 5.725244314142137e-06, + "loss": 1.7392, + "step": 27720 + }, + { + "epoch": 8.508594229588704, + "grad_norm": 0.13890086114406586, + "learning_rate": 5.722934968801419e-06, + "loss": 1.6711, + "step": 27721 + }, + { + "epoch": 8.50890116635973, + "grad_norm": 0.16987508535385132, + "learning_rate": 5.720626061032603e-06, + "loss": 1.6784, + "step": 27722 + }, + { + "epoch": 8.509208103130755, + "grad_norm": 0.12734577059745789, + "learning_rate": 5.718317590858529e-06, + "loss": 1.668, + "step": 27723 + }, + { + "epoch": 8.50951503990178, + "grad_norm": 0.17097610235214233, + "learning_rate": 5.716009558301977e-06, + "loss": 1.7419, + "step": 27724 + }, + { + "epoch": 8.509821976672805, + "grad_norm": 0.15415556728839874, + "learning_rate": 5.713701963385798e-06, + "loss": 1.6794, + "step": 27725 + }, + { + "epoch": 8.51012891344383, + "grad_norm": 0.115156389772892, + "learning_rate": 5.711394806132758e-06, + "loss": 1.6364, + "step": 27726 + }, + { + "epoch": 8.510435850214856, + "grad_norm": 0.1583303064107895, + "learning_rate": 5.709088086565667e-06, + "loss": 1.7185, + "step": 27727 + }, + { + "epoch": 8.510742786985881, + "grad_norm": 0.17150144279003143, + "learning_rate": 5.706781804707345e-06, + "loss": 1.7122, + "step": 27728 + }, + { + "epoch": 8.511049723756907, + "grad_norm": 0.14469772577285767, + "learning_rate": 5.7044759605805464e-06, + "loss": 1.6806, + "step": 27729 + }, + { + "epoch": 8.511356660527932, + "grad_norm": 0.1671745926141739, + "learning_rate": 5.702170554208102e-06, + "loss": 1.7051, + "step": 27730 + }, + { + "epoch": 8.511663597298956, + "grad_norm": 0.14769956469535828, + "learning_rate": 5.699865585612746e-06, + "loss": 1.7052, + "step": 27731 + }, + { + "epoch": 8.511970534069981, + "grad_norm": 0.17527055740356445, + "learning_rate": 5.697561054817296e-06, + "loss": 1.7397, + "step": 27732 + }, + { + "epoch": 8.512277470841006, + "grad_norm": 
0.16712914407253265, + "learning_rate": 5.695256961844519e-06, + "loss": 1.7025, + "step": 27733 + }, + { + "epoch": 8.512584407612032, + "grad_norm": 0.14546720683574677, + "learning_rate": 5.6929533067171745e-06, + "loss": 1.667, + "step": 27734 + }, + { + "epoch": 8.512891344383057, + "grad_norm": 0.1326368749141693, + "learning_rate": 5.690650089458038e-06, + "loss": 1.7109, + "step": 27735 + }, + { + "epoch": 8.513198281154082, + "grad_norm": 0.14168506860733032, + "learning_rate": 5.688347310089864e-06, + "loss": 1.6497, + "step": 27736 + }, + { + "epoch": 8.513505217925108, + "grad_norm": 0.18198592960834503, + "learning_rate": 5.686044968635418e-06, + "loss": 1.7167, + "step": 27737 + }, + { + "epoch": 8.513812154696133, + "grad_norm": 0.14291147887706757, + "learning_rate": 5.683743065117447e-06, + "loss": 1.6855, + "step": 27738 + }, + { + "epoch": 8.514119091467158, + "grad_norm": 0.17336830496788025, + "learning_rate": 5.681441599558701e-06, + "loss": 1.738, + "step": 27739 + }, + { + "epoch": 8.514426028238184, + "grad_norm": 0.1447203904390335, + "learning_rate": 5.679140571981922e-06, + "loss": 1.7217, + "step": 27740 + }, + { + "epoch": 8.514732965009209, + "grad_norm": 0.19665221869945526, + "learning_rate": 5.676839982409849e-06, + "loss": 1.7395, + "step": 27741 + }, + { + "epoch": 8.515039901780233, + "grad_norm": 0.1405279040336609, + "learning_rate": 5.6745398308652386e-06, + "loss": 1.6559, + "step": 27742 + }, + { + "epoch": 8.515346838551258, + "grad_norm": 0.15195727348327637, + "learning_rate": 5.672240117370797e-06, + "loss": 1.6977, + "step": 27743 + }, + { + "epoch": 8.515653775322283, + "grad_norm": 0.11381472647190094, + "learning_rate": 5.669940841949261e-06, + "loss": 1.6594, + "step": 27744 + }, + { + "epoch": 8.515960712093309, + "grad_norm": 0.17271532118320465, + "learning_rate": 5.667642004623347e-06, + "loss": 1.7323, + "step": 27745 + }, + { + "epoch": 8.516267648864334, + "grad_norm": 0.15365839004516602, + "learning_rate": 
5.665343605415774e-06, + "loss": 1.7257, + "step": 27746 + }, + { + "epoch": 8.51657458563536, + "grad_norm": 0.22701260447502136, + "learning_rate": 5.66304564434928e-06, + "loss": 1.6939, + "step": 27747 + }, + { + "epoch": 8.516881522406385, + "grad_norm": 0.14642612636089325, + "learning_rate": 5.660748121446535e-06, + "loss": 1.6985, + "step": 27748 + }, + { + "epoch": 8.51718845917741, + "grad_norm": 0.1659226268529892, + "learning_rate": 5.658451036730272e-06, + "loss": 1.7439, + "step": 27749 + }, + { + "epoch": 8.517495395948435, + "grad_norm": 0.14763525128364563, + "learning_rate": 5.65615439022319e-06, + "loss": 1.6714, + "step": 27750 + }, + { + "epoch": 8.51780233271946, + "grad_norm": 0.17457270622253418, + "learning_rate": 5.65385818194798e-06, + "loss": 1.7214, + "step": 27751 + }, + { + "epoch": 8.518109269490484, + "grad_norm": 0.15170279145240784, + "learning_rate": 5.651562411927335e-06, + "loss": 1.7121, + "step": 27752 + }, + { + "epoch": 8.51841620626151, + "grad_norm": 0.16129034757614136, + "learning_rate": 5.649267080183945e-06, + "loss": 1.6916, + "step": 27753 + }, + { + "epoch": 8.518723143032535, + "grad_norm": 0.20800361037254333, + "learning_rate": 5.64697218674049e-06, + "loss": 1.7482, + "step": 27754 + }, + { + "epoch": 8.51903007980356, + "grad_norm": 0.16350114345550537, + "learning_rate": 5.644677731619652e-06, + "loss": 1.6705, + "step": 27755 + }, + { + "epoch": 8.519337016574585, + "grad_norm": 0.15720658004283905, + "learning_rate": 5.642383714844107e-06, + "loss": 1.6871, + "step": 27756 + }, + { + "epoch": 8.51964395334561, + "grad_norm": 0.21885983645915985, + "learning_rate": 5.640090136436526e-06, + "loss": 1.7057, + "step": 27757 + }, + { + "epoch": 8.519950890116636, + "grad_norm": 0.1411464810371399, + "learning_rate": 5.637796996419564e-06, + "loss": 1.7103, + "step": 27758 + }, + { + "epoch": 8.520257826887661, + "grad_norm": 0.14518170058727264, + "learning_rate": 5.635504294815913e-06, + "loss": 1.7184, + 
"step": 27759 + }, + { + "epoch": 8.520564763658687, + "grad_norm": 0.17998449504375458, + "learning_rate": 5.633212031648199e-06, + "loss": 1.6822, + "step": 27760 + }, + { + "epoch": 8.520871700429712, + "grad_norm": 0.1301501840353012, + "learning_rate": 5.630920206939094e-06, + "loss": 1.6878, + "step": 27761 + }, + { + "epoch": 8.521178637200737, + "grad_norm": 0.16201011836528778, + "learning_rate": 5.628628820711235e-06, + "loss": 1.7581, + "step": 27762 + }, + { + "epoch": 8.521485573971761, + "grad_norm": 0.20399747788906097, + "learning_rate": 5.626337872987269e-06, + "loss": 1.7281, + "step": 27763 + }, + { + "epoch": 8.521792510742786, + "grad_norm": 0.18675439059734344, + "learning_rate": 5.624047363789858e-06, + "loss": 1.7445, + "step": 27764 + }, + { + "epoch": 8.522099447513812, + "grad_norm": 0.1858585625886917, + "learning_rate": 5.621757293141594e-06, + "loss": 1.729, + "step": 27765 + }, + { + "epoch": 8.522406384284837, + "grad_norm": 0.1731054186820984, + "learning_rate": 5.619467661065164e-06, + "loss": 1.6709, + "step": 27766 + }, + { + "epoch": 8.522713321055862, + "grad_norm": 0.2048177868127823, + "learning_rate": 5.617178467583145e-06, + "loss": 1.8187, + "step": 27767 + }, + { + "epoch": 8.523020257826888, + "grad_norm": 0.1944245547056198, + "learning_rate": 5.614889712718191e-06, + "loss": 1.7238, + "step": 27768 + }, + { + "epoch": 8.523327194597913, + "grad_norm": 0.16106872260570526, + "learning_rate": 5.612601396492906e-06, + "loss": 1.7089, + "step": 27769 + }, + { + "epoch": 8.523634131368938, + "grad_norm": 0.1933506578207016, + "learning_rate": 5.610313518929916e-06, + "loss": 1.6702, + "step": 27770 + }, + { + "epoch": 8.523941068139964, + "grad_norm": 0.14211905002593994, + "learning_rate": 5.608026080051826e-06, + "loss": 1.686, + "step": 27771 + }, + { + "epoch": 8.524248004910989, + "grad_norm": 0.1588355004787445, + "learning_rate": 5.605739079881239e-06, + "loss": 1.691, + "step": 27772 + }, + { + "epoch": 
8.524554941682014, + "grad_norm": 0.2026119977235794, + "learning_rate": 5.60345251844076e-06, + "loss": 1.7024, + "step": 27773 + }, + { + "epoch": 8.524861878453038, + "grad_norm": 0.19816550612449646, + "learning_rate": 5.601166395752988e-06, + "loss": 1.7793, + "step": 27774 + }, + { + "epoch": 8.525168815224063, + "grad_norm": 0.1687595695257187, + "learning_rate": 5.59888071184051e-06, + "loss": 1.7066, + "step": 27775 + }, + { + "epoch": 8.525475751995089, + "grad_norm": 0.1844881922006607, + "learning_rate": 5.5965954667259125e-06, + "loss": 1.7091, + "step": 27776 + }, + { + "epoch": 8.525782688766114, + "grad_norm": 0.13911494612693787, + "learning_rate": 5.5943106604317895e-06, + "loss": 1.6611, + "step": 27777 + }, + { + "epoch": 8.52608962553714, + "grad_norm": 0.215097114443779, + "learning_rate": 5.592026292980718e-06, + "loss": 1.7436, + "step": 27778 + }, + { + "epoch": 8.526396562308165, + "grad_norm": 0.19177651405334473, + "learning_rate": 5.589742364395267e-06, + "loss": 1.7198, + "step": 27779 + }, + { + "epoch": 8.52670349907919, + "grad_norm": 0.16470259428024292, + "learning_rate": 5.587458874697998e-06, + "loss": 1.7405, + "step": 27780 + }, + { + "epoch": 8.527010435850215, + "grad_norm": 0.13213464617729187, + "learning_rate": 5.585175823911515e-06, + "loss": 1.6651, + "step": 27781 + }, + { + "epoch": 8.52731737262124, + "grad_norm": 0.18105588853359222, + "learning_rate": 5.582893212058338e-06, + "loss": 1.7169, + "step": 27782 + }, + { + "epoch": 8.527624309392266, + "grad_norm": 0.19358783960342407, + "learning_rate": 5.580611039161065e-06, + "loss": 1.7165, + "step": 27783 + }, + { + "epoch": 8.527931246163291, + "grad_norm": 0.13674969971179962, + "learning_rate": 5.578329305242208e-06, + "loss": 1.7086, + "step": 27784 + }, + { + "epoch": 8.528238182934315, + "grad_norm": 0.1365654170513153, + "learning_rate": 5.5760480103243475e-06, + "loss": 1.7031, + "step": 27785 + }, + { + "epoch": 8.52854511970534, + "grad_norm": 
0.17749033868312836, + "learning_rate": 5.573767154430015e-06, + "loss": 1.7717, + "step": 27786 + }, + { + "epoch": 8.528852056476365, + "grad_norm": 0.16521626710891724, + "learning_rate": 5.5714867375817545e-06, + "loss": 1.6859, + "step": 27787 + }, + { + "epoch": 8.52915899324739, + "grad_norm": 0.14327271282672882, + "learning_rate": 5.569206759802103e-06, + "loss": 1.6996, + "step": 27788 + }, + { + "epoch": 8.529465930018416, + "grad_norm": 0.1895138919353485, + "learning_rate": 5.5669272211135934e-06, + "loss": 1.7127, + "step": 27789 + }, + { + "epoch": 8.529772866789441, + "grad_norm": 0.16256090998649597, + "learning_rate": 5.564648121538757e-06, + "loss": 1.7083, + "step": 27790 + }, + { + "epoch": 8.530079803560467, + "grad_norm": 0.18591371178627014, + "learning_rate": 5.562369461100103e-06, + "loss": 1.7852, + "step": 27791 + }, + { + "epoch": 8.530386740331492, + "grad_norm": 0.15933659672737122, + "learning_rate": 5.560091239820165e-06, + "loss": 1.69, + "step": 27792 + }, + { + "epoch": 8.530693677102517, + "grad_norm": 0.15374226868152618, + "learning_rate": 5.5578134577214505e-06, + "loss": 1.7397, + "step": 27793 + }, + { + "epoch": 8.531000613873543, + "grad_norm": 0.1786707490682602, + "learning_rate": 5.555536114826476e-06, + "loss": 1.7456, + "step": 27794 + }, + { + "epoch": 8.531307550644566, + "grad_norm": 0.16859668493270874, + "learning_rate": 5.553259211157741e-06, + "loss": 1.724, + "step": 27795 + }, + { + "epoch": 8.531614487415592, + "grad_norm": 0.21200759708881378, + "learning_rate": 5.5509827467377485e-06, + "loss": 1.7326, + "step": 27796 + }, + { + "epoch": 8.531921424186617, + "grad_norm": 0.16948217153549194, + "learning_rate": 5.548706721588986e-06, + "loss": 1.7082, + "step": 27797 + }, + { + "epoch": 8.532228360957642, + "grad_norm": 0.17014150321483612, + "learning_rate": 5.546431135733976e-06, + "loss": 1.7344, + "step": 27798 + }, + { + "epoch": 8.532535297728668, + "grad_norm": 0.20479294657707214, + 
"learning_rate": 5.544155989195171e-06, + "loss": 1.8121, + "step": 27799 + }, + { + "epoch": 8.532842234499693, + "grad_norm": 0.16958604753017426, + "learning_rate": 5.541881281995093e-06, + "loss": 1.773, + "step": 27800 + }, + { + "epoch": 8.533149171270718, + "grad_norm": 0.17606206238269806, + "learning_rate": 5.539607014156184e-06, + "loss": 1.6937, + "step": 27801 + }, + { + "epoch": 8.533456108041744, + "grad_norm": 0.1357482373714447, + "learning_rate": 5.537333185700943e-06, + "loss": 1.7234, + "step": 27802 + }, + { + "epoch": 8.533763044812769, + "grad_norm": 0.17217469215393066, + "learning_rate": 5.535059796651837e-06, + "loss": 1.722, + "step": 27803 + }, + { + "epoch": 8.534069981583794, + "grad_norm": 0.14100955426692963, + "learning_rate": 5.532786847031335e-06, + "loss": 1.6574, + "step": 27804 + }, + { + "epoch": 8.53437691835482, + "grad_norm": 0.1515544354915619, + "learning_rate": 5.530514336861897e-06, + "loss": 1.7489, + "step": 27805 + }, + { + "epoch": 8.534683855125843, + "grad_norm": 0.15518932044506073, + "learning_rate": 5.528242266165978e-06, + "loss": 1.7338, + "step": 27806 + }, + { + "epoch": 8.534990791896869, + "grad_norm": 0.15764978528022766, + "learning_rate": 5.525970634966033e-06, + "loss": 1.6971, + "step": 27807 + }, + { + "epoch": 8.535297728667894, + "grad_norm": 0.13838590681552887, + "learning_rate": 5.523699443284513e-06, + "loss": 1.723, + "step": 27808 + }, + { + "epoch": 8.53560466543892, + "grad_norm": 0.17713284492492676, + "learning_rate": 5.521428691143865e-06, + "loss": 1.7227, + "step": 27809 + }, + { + "epoch": 8.535911602209945, + "grad_norm": 0.19389420747756958, + "learning_rate": 5.51915837856653e-06, + "loss": 1.703, + "step": 27810 + }, + { + "epoch": 8.53621853898097, + "grad_norm": 0.13955099880695343, + "learning_rate": 5.516888505574941e-06, + "loss": 1.7093, + "step": 27811 + }, + { + "epoch": 8.536525475751995, + "grad_norm": 0.1319018006324768, + "learning_rate": 5.514619072191535e-06, + 
"loss": 1.7093, + "step": 27812 + }, + { + "epoch": 8.53683241252302, + "grad_norm": 0.14604489505290985, + "learning_rate": 5.512350078438733e-06, + "loss": 1.7113, + "step": 27813 + }, + { + "epoch": 8.537139349294046, + "grad_norm": 0.14439311623573303, + "learning_rate": 5.510081524338956e-06, + "loss": 1.7164, + "step": 27814 + }, + { + "epoch": 8.537446286065071, + "grad_norm": 0.17546533048152924, + "learning_rate": 5.507813409914647e-06, + "loss": 1.7432, + "step": 27815 + }, + { + "epoch": 8.537753222836095, + "grad_norm": 0.15710201859474182, + "learning_rate": 5.505545735188189e-06, + "loss": 1.7353, + "step": 27816 + }, + { + "epoch": 8.53806015960712, + "grad_norm": 0.19635994732379913, + "learning_rate": 5.503278500182019e-06, + "loss": 1.7042, + "step": 27817 + }, + { + "epoch": 8.538367096378146, + "grad_norm": 0.17653462290763855, + "learning_rate": 5.501011704918519e-06, + "loss": 1.7007, + "step": 27818 + }, + { + "epoch": 8.53867403314917, + "grad_norm": 0.1532578021287918, + "learning_rate": 5.498745349420109e-06, + "loss": 1.7111, + "step": 27819 + }, + { + "epoch": 8.538980969920196, + "grad_norm": 0.15368299186229706, + "learning_rate": 5.496479433709178e-06, + "loss": 1.7073, + "step": 27820 + }, + { + "epoch": 8.539287906691222, + "grad_norm": 0.19518911838531494, + "learning_rate": 5.494213957808126e-06, + "loss": 1.756, + "step": 27821 + }, + { + "epoch": 8.539594843462247, + "grad_norm": 0.13748668134212494, + "learning_rate": 5.4919489217393376e-06, + "loss": 1.6636, + "step": 27822 + }, + { + "epoch": 8.539901780233272, + "grad_norm": 0.2104724794626236, + "learning_rate": 5.489684325525191e-06, + "loss": 1.7734, + "step": 27823 + }, + { + "epoch": 8.540208717004298, + "grad_norm": 0.15495489537715912, + "learning_rate": 5.4874201691880786e-06, + "loss": 1.6858, + "step": 27824 + }, + { + "epoch": 8.540515653775323, + "grad_norm": 0.16447420418262482, + "learning_rate": 5.4851564527503674e-06, + "loss": 1.7053, + "step": 27825 + }, + 
{ + "epoch": 8.540822590546348, + "grad_norm": 0.1427844911813736, + "learning_rate": 5.482893176234433e-06, + "loss": 1.6885, + "step": 27826 + }, + { + "epoch": 8.541129527317374, + "grad_norm": 0.14386583864688873, + "learning_rate": 5.4806303396626344e-06, + "loss": 1.6762, + "step": 27827 + }, + { + "epoch": 8.541436464088397, + "grad_norm": 0.15933938324451447, + "learning_rate": 5.478367943057344e-06, + "loss": 1.6945, + "step": 27828 + }, + { + "epoch": 8.541743400859422, + "grad_norm": 0.3127610385417938, + "learning_rate": 5.476105986440922e-06, + "loss": 1.772, + "step": 27829 + }, + { + "epoch": 8.542050337630448, + "grad_norm": 0.168161079287529, + "learning_rate": 5.473844469835709e-06, + "loss": 1.7398, + "step": 27830 + }, + { + "epoch": 8.542357274401473, + "grad_norm": 0.17208287119865417, + "learning_rate": 5.471583393264057e-06, + "loss": 1.7345, + "step": 27831 + }, + { + "epoch": 8.542664211172498, + "grad_norm": 0.18009017407894135, + "learning_rate": 5.469322756748335e-06, + "loss": 1.7785, + "step": 27832 + }, + { + "epoch": 8.542971147943524, + "grad_norm": 0.17091695964336395, + "learning_rate": 5.467062560310843e-06, + "loss": 1.689, + "step": 27833 + }, + { + "epoch": 8.543278084714549, + "grad_norm": 0.1495637446641922, + "learning_rate": 5.4648028039739675e-06, + "loss": 1.7409, + "step": 27834 + }, + { + "epoch": 8.543585021485574, + "grad_norm": 0.19924791157245636, + "learning_rate": 5.462543487759986e-06, + "loss": 1.7136, + "step": 27835 + }, + { + "epoch": 8.5438919582566, + "grad_norm": 0.19490383565425873, + "learning_rate": 5.460284611691269e-06, + "loss": 1.7371, + "step": 27836 + }, + { + "epoch": 8.544198895027625, + "grad_norm": 0.20383320748806, + "learning_rate": 5.458026175790127e-06, + "loss": 1.7268, + "step": 27837 + }, + { + "epoch": 8.544505831798649, + "grad_norm": 0.20110821723937988, + "learning_rate": 5.455768180078869e-06, + "loss": 1.7069, + "step": 27838 + }, + { + "epoch": 8.544812768569674, + "grad_norm": 
0.16181184351444244, + "learning_rate": 5.453510624579827e-06, + "loss": 1.7158, + "step": 27839 + }, + { + "epoch": 8.5451197053407, + "grad_norm": 0.17110773921012878, + "learning_rate": 5.451253509315296e-06, + "loss": 1.6925, + "step": 27840 + }, + { + "epoch": 8.545426642111725, + "grad_norm": 0.16039033234119415, + "learning_rate": 5.448996834307591e-06, + "loss": 1.7281, + "step": 27841 + }, + { + "epoch": 8.54573357888275, + "grad_norm": 0.12631241977214813, + "learning_rate": 5.446740599579014e-06, + "loss": 1.6816, + "step": 27842 + }, + { + "epoch": 8.546040515653775, + "grad_norm": 0.20419110357761383, + "learning_rate": 5.444484805151856e-06, + "loss": 1.7594, + "step": 27843 + }, + { + "epoch": 8.5463474524248, + "grad_norm": 0.25453490018844604, + "learning_rate": 5.442229451048414e-06, + "loss": 1.7423, + "step": 27844 + }, + { + "epoch": 8.546654389195826, + "grad_norm": 0.15445558726787567, + "learning_rate": 5.439974537290982e-06, + "loss": 1.729, + "step": 27845 + }, + { + "epoch": 8.546961325966851, + "grad_norm": 0.16175805032253265, + "learning_rate": 5.43772006390183e-06, + "loss": 1.7515, + "step": 27846 + }, + { + "epoch": 8.547268262737877, + "grad_norm": 0.1958928406238556, + "learning_rate": 5.435466030903253e-06, + "loss": 1.7203, + "step": 27847 + }, + { + "epoch": 8.547575199508902, + "grad_norm": 0.17533376812934875, + "learning_rate": 5.433212438317514e-06, + "loss": 1.7393, + "step": 27848 + }, + { + "epoch": 8.547882136279926, + "grad_norm": 0.16437608003616333, + "learning_rate": 5.430959286166904e-06, + "loss": 1.7284, + "step": 27849 + }, + { + "epoch": 8.54818907305095, + "grad_norm": 0.16348768770694733, + "learning_rate": 5.428706574473663e-06, + "loss": 1.7284, + "step": 27850 + }, + { + "epoch": 8.548496009821976, + "grad_norm": 0.136602982878685, + "learning_rate": 5.426454303260081e-06, + "loss": 1.6606, + "step": 27851 + }, + { + "epoch": 8.548802946593002, + "grad_norm": 0.1359151154756546, + "learning_rate": 
5.42420247254839e-06, + "loss": 1.6989, + "step": 27852 + }, + { + "epoch": 8.549109883364027, + "grad_norm": 0.17593000829219818, + "learning_rate": 5.421951082360866e-06, + "loss": 1.7483, + "step": 27853 + }, + { + "epoch": 8.549416820135052, + "grad_norm": 0.1791890412569046, + "learning_rate": 5.419700132719746e-06, + "loss": 1.7032, + "step": 27854 + }, + { + "epoch": 8.549723756906078, + "grad_norm": 0.15925002098083496, + "learning_rate": 5.417449623647281e-06, + "loss": 1.7055, + "step": 27855 + }, + { + "epoch": 8.550030693677103, + "grad_norm": 0.16391295194625854, + "learning_rate": 5.415199555165706e-06, + "loss": 1.6555, + "step": 27856 + }, + { + "epoch": 8.550337630448128, + "grad_norm": 0.18588928878307343, + "learning_rate": 5.412949927297262e-06, + "loss": 1.6723, + "step": 27857 + }, + { + "epoch": 8.550644567219154, + "grad_norm": 0.15956605970859528, + "learning_rate": 5.410700740064184e-06, + "loss": 1.7148, + "step": 27858 + }, + { + "epoch": 8.550951503990177, + "grad_norm": 0.14419449865818024, + "learning_rate": 5.408451993488689e-06, + "loss": 1.6997, + "step": 27859 + }, + { + "epoch": 8.551258440761202, + "grad_norm": 0.18104690313339233, + "learning_rate": 5.406203687593014e-06, + "loss": 1.7121, + "step": 27860 + }, + { + "epoch": 8.551565377532228, + "grad_norm": 0.15283553302288055, + "learning_rate": 5.40395582239937e-06, + "loss": 1.6536, + "step": 27861 + }, + { + "epoch": 8.551872314303253, + "grad_norm": 0.14498579502105713, + "learning_rate": 5.401708397929972e-06, + "loss": 1.6649, + "step": 27862 + }, + { + "epoch": 8.552179251074278, + "grad_norm": 0.1828843504190445, + "learning_rate": 5.39946141420703e-06, + "loss": 1.718, + "step": 27863 + }, + { + "epoch": 8.552486187845304, + "grad_norm": 0.20626986026763916, + "learning_rate": 5.397214871252754e-06, + "loss": 1.7561, + "step": 27864 + }, + { + "epoch": 8.55279312461633, + "grad_norm": 0.16986799240112305, + "learning_rate": 5.394968769089331e-06, + "loss": 1.7386, + 
"step": 27865 + }, + { + "epoch": 8.553100061387354, + "grad_norm": 0.16921544075012207, + "learning_rate": 5.392723107738995e-06, + "loss": 1.6939, + "step": 27866 + }, + { + "epoch": 8.55340699815838, + "grad_norm": 0.19882866740226746, + "learning_rate": 5.390477887223888e-06, + "loss": 1.7376, + "step": 27867 + }, + { + "epoch": 8.553713934929405, + "grad_norm": 0.17440463602542877, + "learning_rate": 5.3882331075662486e-06, + "loss": 1.7142, + "step": 27868 + }, + { + "epoch": 8.55402087170043, + "grad_norm": 0.1494864523410797, + "learning_rate": 5.38598876878822e-06, + "loss": 1.6953, + "step": 27869 + }, + { + "epoch": 8.554327808471456, + "grad_norm": 0.18791508674621582, + "learning_rate": 5.383744870912006e-06, + "loss": 1.7863, + "step": 27870 + }, + { + "epoch": 8.55463474524248, + "grad_norm": 0.19124576449394226, + "learning_rate": 5.381501413959777e-06, + "loss": 1.6668, + "step": 27871 + }, + { + "epoch": 8.554941682013505, + "grad_norm": 0.17011114954948425, + "learning_rate": 5.3792583979537016e-06, + "loss": 1.7356, + "step": 27872 + }, + { + "epoch": 8.55524861878453, + "grad_norm": 0.1780267208814621, + "learning_rate": 5.377015822915949e-06, + "loss": 1.7428, + "step": 27873 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 0.18539096415042877, + "learning_rate": 5.374773688868678e-06, + "loss": 1.7534, + "step": 27874 + }, + { + "epoch": 8.55586249232658, + "grad_norm": 0.1668393909931183, + "learning_rate": 5.372531995834051e-06, + "loss": 1.6884, + "step": 27875 + }, + { + "epoch": 8.556169429097606, + "grad_norm": 0.15957699716091156, + "learning_rate": 5.3702907438342165e-06, + "loss": 1.6739, + "step": 27876 + }, + { + "epoch": 8.556476365868631, + "grad_norm": 0.17210347950458527, + "learning_rate": 5.368049932891334e-06, + "loss": 1.7062, + "step": 27877 + }, + { + "epoch": 8.556783302639657, + "grad_norm": 0.1614166796207428, + "learning_rate": 5.365809563027535e-06, + "loss": 1.675, + "step": 27878 + }, + { + "epoch": 
8.557090239410682, + "grad_norm": 0.17495310306549072, + "learning_rate": 5.36356963426497e-06, + "loss": 1.7694, + "step": 27879 + }, + { + "epoch": 8.557397176181707, + "grad_norm": 0.1660371571779251, + "learning_rate": 5.361330146625771e-06, + "loss": 1.6573, + "step": 27880 + }, + { + "epoch": 8.557704112952731, + "grad_norm": 0.1997743546962738, + "learning_rate": 5.359091100132074e-06, + "loss": 1.7006, + "step": 27881 + }, + { + "epoch": 8.558011049723756, + "grad_norm": 0.21383358538150787, + "learning_rate": 5.356852494805992e-06, + "loss": 1.7677, + "step": 27882 + }, + { + "epoch": 8.558317986494782, + "grad_norm": 0.15339766442775726, + "learning_rate": 5.354614330669677e-06, + "loss": 1.6852, + "step": 27883 + }, + { + "epoch": 8.558624923265807, + "grad_norm": 0.16808396577835083, + "learning_rate": 5.352376607745213e-06, + "loss": 1.7046, + "step": 27884 + }, + { + "epoch": 8.558931860036832, + "grad_norm": 0.19627085328102112, + "learning_rate": 5.350139326054748e-06, + "loss": 1.7255, + "step": 27885 + }, + { + "epoch": 8.559238796807858, + "grad_norm": 0.16882671415805817, + "learning_rate": 5.347902485620365e-06, + "loss": 1.6823, + "step": 27886 + }, + { + "epoch": 8.559545733578883, + "grad_norm": 0.19045037031173706, + "learning_rate": 5.3456660864641846e-06, + "loss": 1.7901, + "step": 27887 + }, + { + "epoch": 8.559852670349908, + "grad_norm": 0.16998142004013062, + "learning_rate": 5.3434301286083064e-06, + "loss": 1.7226, + "step": 27888 + }, + { + "epoch": 8.560159607120934, + "grad_norm": 0.16370677947998047, + "learning_rate": 5.341194612074824e-06, + "loss": 1.7151, + "step": 27889 + }, + { + "epoch": 8.560466543891959, + "grad_norm": 0.16379667818546295, + "learning_rate": 5.3389595368858345e-06, + "loss": 1.6742, + "step": 27890 + }, + { + "epoch": 8.560773480662984, + "grad_norm": 0.1741562932729721, + "learning_rate": 5.336724903063423e-06, + "loss": 1.7162, + "step": 27891 + }, + { + "epoch": 8.561080417434008, + "grad_norm": 
0.17712807655334473, + "learning_rate": 5.334490710629675e-06, + "loss": 1.71, + "step": 27892 + }, + { + "epoch": 8.561387354205033, + "grad_norm": 0.16719931364059448, + "learning_rate": 5.332256959606669e-06, + "loss": 1.7299, + "step": 27893 + }, + { + "epoch": 8.561694290976058, + "grad_norm": 0.3024488389492035, + "learning_rate": 5.330023650016475e-06, + "loss": 1.7435, + "step": 27894 + }, + { + "epoch": 8.562001227747084, + "grad_norm": 0.13923676311969757, + "learning_rate": 5.3277907818811755e-06, + "loss": 1.6856, + "step": 27895 + }, + { + "epoch": 8.56230816451811, + "grad_norm": 0.1582731008529663, + "learning_rate": 5.325558355222826e-06, + "loss": 1.7057, + "step": 27896 + }, + { + "epoch": 8.562615101289135, + "grad_norm": 0.17576326429843903, + "learning_rate": 5.323326370063497e-06, + "loss": 1.7439, + "step": 27897 + }, + { + "epoch": 8.56292203806016, + "grad_norm": 0.16990134119987488, + "learning_rate": 5.321094826425238e-06, + "loss": 1.7366, + "step": 27898 + }, + { + "epoch": 8.563228974831185, + "grad_norm": 0.14154621958732605, + "learning_rate": 5.318863724330114e-06, + "loss": 1.6824, + "step": 27899 + }, + { + "epoch": 8.56353591160221, + "grad_norm": 0.1460665911436081, + "learning_rate": 5.3166330638001635e-06, + "loss": 1.729, + "step": 27900 + }, + { + "epoch": 8.563842848373236, + "grad_norm": 0.14366431534290314, + "learning_rate": 5.314402844857424e-06, + "loss": 1.704, + "step": 27901 + }, + { + "epoch": 8.56414978514426, + "grad_norm": 0.15405386686325073, + "learning_rate": 5.312173067523968e-06, + "loss": 1.7357, + "step": 27902 + }, + { + "epoch": 8.564456721915285, + "grad_norm": 0.12789638340473175, + "learning_rate": 5.309943731821787e-06, + "loss": 1.634, + "step": 27903 + }, + { + "epoch": 8.56476365868631, + "grad_norm": 0.17007184028625488, + "learning_rate": 5.307714837772948e-06, + "loss": 1.7065, + "step": 27904 + }, + { + "epoch": 8.565070595457335, + "grad_norm": 0.1982787400484085, + "learning_rate": 
5.305486385399466e-06, + "loss": 1.7459, + "step": 27905 + }, + { + "epoch": 8.56537753222836, + "grad_norm": 0.18433566391468048, + "learning_rate": 5.303258374723363e-06, + "loss": 1.7414, + "step": 27906 + }, + { + "epoch": 8.565684468999386, + "grad_norm": 0.13842104375362396, + "learning_rate": 5.30103080576666e-06, + "loss": 1.6988, + "step": 27907 + }, + { + "epoch": 8.565991405770411, + "grad_norm": 0.14736461639404297, + "learning_rate": 5.298803678551373e-06, + "loss": 1.6828, + "step": 27908 + }, + { + "epoch": 8.566298342541437, + "grad_norm": 0.14953723549842834, + "learning_rate": 5.2965769930995e-06, + "loss": 1.6896, + "step": 27909 + }, + { + "epoch": 8.566605279312462, + "grad_norm": 0.15445443987846375, + "learning_rate": 5.294350749433058e-06, + "loss": 1.7096, + "step": 27910 + }, + { + "epoch": 8.566912216083487, + "grad_norm": 0.180703803896904, + "learning_rate": 5.292124947574045e-06, + "loss": 1.7191, + "step": 27911 + }, + { + "epoch": 8.567219152854513, + "grad_norm": 0.13825593888759613, + "learning_rate": 5.289899587544461e-06, + "loss": 1.6928, + "step": 27912 + }, + { + "epoch": 8.567526089625538, + "grad_norm": 0.15663209557533264, + "learning_rate": 5.287674669366294e-06, + "loss": 1.7004, + "step": 27913 + }, + { + "epoch": 8.567833026396562, + "grad_norm": 0.14148147404193878, + "learning_rate": 5.285450193061526e-06, + "loss": 1.6961, + "step": 27914 + }, + { + "epoch": 8.568139963167587, + "grad_norm": 0.12393147498369217, + "learning_rate": 5.283226158652155e-06, + "loss": 1.6515, + "step": 27915 + }, + { + "epoch": 8.568446899938612, + "grad_norm": 0.1855689138174057, + "learning_rate": 5.281002566160148e-06, + "loss": 1.8017, + "step": 27916 + }, + { + "epoch": 8.568753836709638, + "grad_norm": 0.1665579080581665, + "learning_rate": 5.2787794156074824e-06, + "loss": 1.6935, + "step": 27917 + }, + { + "epoch": 8.569060773480663, + "grad_norm": 0.1853685826063156, + "learning_rate": 5.276556707016123e-06, + "loss": 1.7504, + 
"step": 27918 + }, + { + "epoch": 8.569367710251688, + "grad_norm": 0.16065651178359985, + "learning_rate": 5.274334440408063e-06, + "loss": 1.7549, + "step": 27919 + }, + { + "epoch": 8.569674647022714, + "grad_norm": 0.1630239635705948, + "learning_rate": 5.272112615805225e-06, + "loss": 1.7404, + "step": 27920 + }, + { + "epoch": 8.569981583793739, + "grad_norm": 0.1681451052427292, + "learning_rate": 5.269891233229607e-06, + "loss": 1.704, + "step": 27921 + }, + { + "epoch": 8.570288520564764, + "grad_norm": 0.14546994864940643, + "learning_rate": 5.267670292703119e-06, + "loss": 1.6656, + "step": 27922 + }, + { + "epoch": 8.57059545733579, + "grad_norm": 0.1499837189912796, + "learning_rate": 5.265449794247746e-06, + "loss": 1.6908, + "step": 27923 + }, + { + "epoch": 8.570902394106813, + "grad_norm": 0.14691168069839478, + "learning_rate": 5.263229737885417e-06, + "loss": 1.6887, + "step": 27924 + }, + { + "epoch": 8.571209330877839, + "grad_norm": 0.16261856257915497, + "learning_rate": 5.261010123638066e-06, + "loss": 1.6981, + "step": 27925 + }, + { + "epoch": 8.571516267648864, + "grad_norm": 0.1549815535545349, + "learning_rate": 5.2587909515276425e-06, + "loss": 1.6971, + "step": 27926 + }, + { + "epoch": 8.57182320441989, + "grad_norm": 0.15067234635353088, + "learning_rate": 5.256572221576067e-06, + "loss": 1.7101, + "step": 27927 + }, + { + "epoch": 8.572130141190915, + "grad_norm": 0.13761483132839203, + "learning_rate": 5.254353933805273e-06, + "loss": 1.6657, + "step": 27928 + }, + { + "epoch": 8.57243707796194, + "grad_norm": 0.1590275913476944, + "learning_rate": 5.252136088237175e-06, + "loss": 1.6776, + "step": 27929 + }, + { + "epoch": 8.572744014732965, + "grad_norm": 0.1633618026971817, + "learning_rate": 5.249918684893695e-06, + "loss": 1.724, + "step": 27930 + }, + { + "epoch": 8.57305095150399, + "grad_norm": 0.2603756785392761, + "learning_rate": 5.247701723796755e-06, + "loss": 1.7071, + "step": 27931 + }, + { + "epoch": 
8.573357888275016, + "grad_norm": 0.21079567074775696, + "learning_rate": 5.245485204968248e-06, + "loss": 1.7983, + "step": 27932 + }, + { + "epoch": 8.573664825046041, + "grad_norm": 0.15369223058223724, + "learning_rate": 5.243269128430095e-06, + "loss": 1.7566, + "step": 27933 + }, + { + "epoch": 8.573971761817067, + "grad_norm": 0.19392070174217224, + "learning_rate": 5.241053494204185e-06, + "loss": 1.7287, + "step": 27934 + }, + { + "epoch": 8.57427869858809, + "grad_norm": 0.16017836332321167, + "learning_rate": 5.23883830231241e-06, + "loss": 1.6909, + "step": 27935 + }, + { + "epoch": 8.574585635359115, + "grad_norm": 0.1943294107913971, + "learning_rate": 5.2366235527766876e-06, + "loss": 1.7844, + "step": 27936 + }, + { + "epoch": 8.57489257213014, + "grad_norm": 0.17875424027442932, + "learning_rate": 5.234409245618871e-06, + "loss": 1.7385, + "step": 27937 + }, + { + "epoch": 8.575199508901166, + "grad_norm": 0.1900254637002945, + "learning_rate": 5.232195380860877e-06, + "loss": 1.7303, + "step": 27938 + }, + { + "epoch": 8.575506445672191, + "grad_norm": 0.13633303344249725, + "learning_rate": 5.229981958524549e-06, + "loss": 1.6949, + "step": 27939 + }, + { + "epoch": 8.575813382443217, + "grad_norm": 0.18683885037899017, + "learning_rate": 5.227768978631792e-06, + "loss": 1.7366, + "step": 27940 + }, + { + "epoch": 8.576120319214242, + "grad_norm": 0.15012286603450775, + "learning_rate": 5.2255564412044656e-06, + "loss": 1.71, + "step": 27941 + }, + { + "epoch": 8.576427255985267, + "grad_norm": 0.14521601796150208, + "learning_rate": 5.22334434626443e-06, + "loss": 1.724, + "step": 27942 + }, + { + "epoch": 8.576734192756293, + "grad_norm": 0.1809433549642563, + "learning_rate": 5.221132693833547e-06, + "loss": 1.7851, + "step": 27943 + }, + { + "epoch": 8.577041129527318, + "grad_norm": 0.1676371693611145, + "learning_rate": 5.218921483933681e-06, + "loss": 1.7542, + "step": 27944 + }, + { + "epoch": 8.577348066298342, + "grad_norm": 
0.16963952779769897, + "learning_rate": 5.216710716586676e-06, + "loss": 1.767, + "step": 27945 + }, + { + "epoch": 8.577655003069367, + "grad_norm": 0.18276773393154144, + "learning_rate": 5.214500391814387e-06, + "loss": 1.662, + "step": 27946 + }, + { + "epoch": 8.577961939840392, + "grad_norm": 0.16285058856010437, + "learning_rate": 5.212290509638656e-06, + "loss": 1.6853, + "step": 27947 + }, + { + "epoch": 8.578268876611418, + "grad_norm": 0.18186792731285095, + "learning_rate": 5.210081070081318e-06, + "loss": 1.7408, + "step": 27948 + }, + { + "epoch": 8.578575813382443, + "grad_norm": 0.15637101233005524, + "learning_rate": 5.207872073164216e-06, + "loss": 1.7026, + "step": 27949 + }, + { + "epoch": 8.578882750153468, + "grad_norm": 0.16442300379276276, + "learning_rate": 5.2056635189091704e-06, + "loss": 1.7136, + "step": 27950 + }, + { + "epoch": 8.579189686924494, + "grad_norm": 0.18907669186592102, + "learning_rate": 5.203455407338015e-06, + "loss": 1.7706, + "step": 27951 + }, + { + "epoch": 8.579496623695519, + "grad_norm": 0.17700283229351044, + "learning_rate": 5.201247738472559e-06, + "loss": 1.7104, + "step": 27952 + }, + { + "epoch": 8.579803560466544, + "grad_norm": 0.19882333278656006, + "learning_rate": 5.199040512334647e-06, + "loss": 1.7692, + "step": 27953 + }, + { + "epoch": 8.58011049723757, + "grad_norm": 0.14343376457691193, + "learning_rate": 5.19683372894606e-06, + "loss": 1.6775, + "step": 27954 + }, + { + "epoch": 8.580417434008595, + "grad_norm": 0.13688595592975616, + "learning_rate": 5.194627388328638e-06, + "loss": 1.6787, + "step": 27955 + }, + { + "epoch": 8.580724370779619, + "grad_norm": 0.15786845982074738, + "learning_rate": 5.192421490504157e-06, + "loss": 1.7218, + "step": 27956 + }, + { + "epoch": 8.581031307550644, + "grad_norm": 0.3297908902168274, + "learning_rate": 5.190216035494433e-06, + "loss": 1.7533, + "step": 27957 + }, + { + "epoch": 8.58133824432167, + "grad_norm": 0.16763067245483398, + "learning_rate": 
5.18801102332126e-06, + "loss": 1.7278, + "step": 27958 + }, + { + "epoch": 8.581645181092695, + "grad_norm": 0.18505536019802094, + "learning_rate": 5.185806454006426e-06, + "loss": 1.7291, + "step": 27959 + }, + { + "epoch": 8.58195211786372, + "grad_norm": 0.1536751091480255, + "learning_rate": 5.183602327571718e-06, + "loss": 1.7014, + "step": 27960 + }, + { + "epoch": 8.582259054634745, + "grad_norm": 0.2561737596988678, + "learning_rate": 5.181398644038921e-06, + "loss": 1.8127, + "step": 27961 + }, + { + "epoch": 8.58256599140577, + "grad_norm": 0.15304888784885406, + "learning_rate": 5.17919540342981e-06, + "loss": 1.7001, + "step": 27962 + }, + { + "epoch": 8.582872928176796, + "grad_norm": 0.16688644886016846, + "learning_rate": 5.176992605766162e-06, + "loss": 1.7398, + "step": 27963 + }, + { + "epoch": 8.583179864947821, + "grad_norm": 0.1351930946111679, + "learning_rate": 5.174790251069744e-06, + "loss": 1.6947, + "step": 27964 + }, + { + "epoch": 8.583486801718847, + "grad_norm": 0.23985813558101654, + "learning_rate": 5.172588339362322e-06, + "loss": 1.7495, + "step": 27965 + }, + { + "epoch": 8.58379373848987, + "grad_norm": 0.17094407975673676, + "learning_rate": 5.170386870665656e-06, + "loss": 1.74, + "step": 27966 + }, + { + "epoch": 8.584100675260895, + "grad_norm": 0.17786560952663422, + "learning_rate": 5.168185845001505e-06, + "loss": 1.7438, + "step": 27967 + }, + { + "epoch": 8.58440761203192, + "grad_norm": 0.16682226955890656, + "learning_rate": 5.165985262391615e-06, + "loss": 1.7193, + "step": 27968 + }, + { + "epoch": 8.584714548802946, + "grad_norm": 0.17371125519275665, + "learning_rate": 5.163785122857728e-06, + "loss": 1.677, + "step": 27969 + }, + { + "epoch": 8.585021485573971, + "grad_norm": 0.16753411293029785, + "learning_rate": 5.161585426421617e-06, + "loss": 1.6558, + "step": 27970 + }, + { + "epoch": 8.585328422344997, + "grad_norm": 0.14469672739505768, + "learning_rate": 5.159386173104979e-06, + "loss": 1.7, + "step": 
27971 + }, + { + "epoch": 8.585635359116022, + "grad_norm": 0.14450986683368683, + "learning_rate": 5.157187362929583e-06, + "loss": 1.6843, + "step": 27972 + }, + { + "epoch": 8.585942295887047, + "grad_norm": 0.15462568402290344, + "learning_rate": 5.1549889959171315e-06, + "loss": 1.7028, + "step": 27973 + }, + { + "epoch": 8.586249232658073, + "grad_norm": 0.19757840037345886, + "learning_rate": 5.1527910720893694e-06, + "loss": 1.7578, + "step": 27974 + }, + { + "epoch": 8.586556169429098, + "grad_norm": 0.16309098899364471, + "learning_rate": 5.150593591468017e-06, + "loss": 1.6736, + "step": 27975 + }, + { + "epoch": 8.586863106200123, + "grad_norm": 0.20989231765270233, + "learning_rate": 5.14839655407478e-06, + "loss": 1.7418, + "step": 27976 + }, + { + "epoch": 8.587170042971149, + "grad_norm": 0.14988306164741516, + "learning_rate": 5.14619995993138e-06, + "loss": 1.6834, + "step": 27977 + }, + { + "epoch": 8.587476979742172, + "grad_norm": 0.1826607882976532, + "learning_rate": 5.144003809059522e-06, + "loss": 1.7598, + "step": 27978 + }, + { + "epoch": 8.587783916513198, + "grad_norm": 0.16675019264221191, + "learning_rate": 5.141808101480905e-06, + "loss": 1.7388, + "step": 27979 + }, + { + "epoch": 8.588090853284223, + "grad_norm": 0.17474086582660675, + "learning_rate": 5.139612837217233e-06, + "loss": 1.6897, + "step": 27980 + }, + { + "epoch": 8.588397790055248, + "grad_norm": 0.15096940100193024, + "learning_rate": 5.137418016290207e-06, + "loss": 1.6959, + "step": 27981 + }, + { + "epoch": 8.588704726826274, + "grad_norm": 0.13225309550762177, + "learning_rate": 5.1352236387215035e-06, + "loss": 1.6946, + "step": 27982 + }, + { + "epoch": 8.589011663597299, + "grad_norm": 0.13731913268566132, + "learning_rate": 5.133029704532821e-06, + "loss": 1.7076, + "step": 27983 + }, + { + "epoch": 8.589318600368324, + "grad_norm": 0.1227266862988472, + "learning_rate": 5.130836213745832e-06, + "loss": 1.6966, + "step": 27984 + }, + { + "epoch": 
8.58962553713935, + "grad_norm": 0.16979724168777466, + "learning_rate": 5.128643166382224e-06, + "loss": 1.7365, + "step": 27985 + }, + { + "epoch": 8.589932473910375, + "grad_norm": 0.13253070414066315, + "learning_rate": 5.126450562463653e-06, + "loss": 1.6748, + "step": 27986 + }, + { + "epoch": 8.5902394106814, + "grad_norm": 0.13287228345870972, + "learning_rate": 5.124258402011817e-06, + "loss": 1.666, + "step": 27987 + }, + { + "epoch": 8.590546347452424, + "grad_norm": 0.1884436458349228, + "learning_rate": 5.122066685048338e-06, + "loss": 1.6974, + "step": 27988 + }, + { + "epoch": 8.59085328422345, + "grad_norm": 0.17336542904376984, + "learning_rate": 5.119875411594927e-06, + "loss": 1.6884, + "step": 27989 + }, + { + "epoch": 8.591160220994475, + "grad_norm": 0.19136151671409607, + "learning_rate": 5.117684581673188e-06, + "loss": 1.6976, + "step": 27990 + }, + { + "epoch": 8.5914671577655, + "grad_norm": 0.18627271056175232, + "learning_rate": 5.115494195304804e-06, + "loss": 1.7255, + "step": 27991 + }, + { + "epoch": 8.591774094536525, + "grad_norm": 0.1341535747051239, + "learning_rate": 5.1133042525114194e-06, + "loss": 1.661, + "step": 27992 + }, + { + "epoch": 8.59208103130755, + "grad_norm": 0.172500878572464, + "learning_rate": 5.1111147533146665e-06, + "loss": 1.7408, + "step": 27993 + }, + { + "epoch": 8.592387968078576, + "grad_norm": 0.14429397881031036, + "learning_rate": 5.108925697736188e-06, + "loss": 1.7025, + "step": 27994 + }, + { + "epoch": 8.592694904849601, + "grad_norm": 0.16930191218852997, + "learning_rate": 5.106737085797625e-06, + "loss": 1.7451, + "step": 27995 + }, + { + "epoch": 8.593001841620627, + "grad_norm": 0.17311960458755493, + "learning_rate": 5.104548917520591e-06, + "loss": 1.7077, + "step": 27996 + }, + { + "epoch": 8.593308778391652, + "grad_norm": 0.17147377133369446, + "learning_rate": 5.102361192926719e-06, + "loss": 1.701, + "step": 27997 + }, + { + "epoch": 8.593615715162677, + "grad_norm": 
0.16215240955352783, + "learning_rate": 5.100173912037631e-06, + "loss": 1.6896, + "step": 27998 + }, + { + "epoch": 8.5939226519337, + "grad_norm": 0.1764577031135559, + "learning_rate": 5.097987074874944e-06, + "loss": 1.6895, + "step": 27999 + }, + { + "epoch": 8.594229588704726, + "grad_norm": 0.1574433147907257, + "learning_rate": 5.095800681460261e-06, + "loss": 1.7219, + "step": 28000 + }, + { + "epoch": 8.594536525475752, + "grad_norm": 0.1465912163257599, + "learning_rate": 5.0936147318152e-06, + "loss": 1.7077, + "step": 28001 + }, + { + "epoch": 8.594843462246777, + "grad_norm": 0.2024395614862442, + "learning_rate": 5.0914292259613524e-06, + "loss": 1.7956, + "step": 28002 + }, + { + "epoch": 8.595150399017802, + "grad_norm": 0.16168762743473053, + "learning_rate": 5.0892441639203205e-06, + "loss": 1.7311, + "step": 28003 + }, + { + "epoch": 8.595457335788828, + "grad_norm": 0.1713251769542694, + "learning_rate": 5.0870595457137185e-06, + "loss": 1.7123, + "step": 28004 + }, + { + "epoch": 8.595764272559853, + "grad_norm": 0.22206412255764008, + "learning_rate": 5.084875371363096e-06, + "loss": 1.7057, + "step": 28005 + }, + { + "epoch": 8.596071209330878, + "grad_norm": 0.14937512576580048, + "learning_rate": 5.082691640890081e-06, + "loss": 1.7231, + "step": 28006 + }, + { + "epoch": 8.596378146101904, + "grad_norm": 0.22501800954341888, + "learning_rate": 5.0805083543162155e-06, + "loss": 1.7729, + "step": 28007 + }, + { + "epoch": 8.596685082872929, + "grad_norm": 0.150779128074646, + "learning_rate": 5.0783255116631015e-06, + "loss": 1.6887, + "step": 28008 + }, + { + "epoch": 8.596992019643952, + "grad_norm": 0.1489362120628357, + "learning_rate": 5.076143112952308e-06, + "loss": 1.6774, + "step": 28009 + }, + { + "epoch": 8.597298956414978, + "grad_norm": 0.17022615671157837, + "learning_rate": 5.073961158205398e-06, + "loss": 1.6974, + "step": 28010 + }, + { + "epoch": 8.597605893186003, + "grad_norm": 0.16300532221794128, + "learning_rate": 
5.071779647443931e-06, + "loss": 1.7194, + "step": 28011 + }, + { + "epoch": 8.597912829957028, + "grad_norm": 0.14973211288452148, + "learning_rate": 5.069598580689477e-06, + "loss": 1.7238, + "step": 28012 + }, + { + "epoch": 8.598219766728054, + "grad_norm": 0.1345965713262558, + "learning_rate": 5.067417957963583e-06, + "loss": 1.6372, + "step": 28013 + }, + { + "epoch": 8.598526703499079, + "grad_norm": 0.18125082552433014, + "learning_rate": 5.065237779287802e-06, + "loss": 1.7174, + "step": 28014 + }, + { + "epoch": 8.598833640270104, + "grad_norm": 0.1619734913110733, + "learning_rate": 5.063058044683671e-06, + "loss": 1.6951, + "step": 28015 + }, + { + "epoch": 8.59914057704113, + "grad_norm": 0.14732249081134796, + "learning_rate": 5.060878754172749e-06, + "loss": 1.7291, + "step": 28016 + }, + { + "epoch": 8.599447513812155, + "grad_norm": 0.14982318878173828, + "learning_rate": 5.058699907776554e-06, + "loss": 1.6962, + "step": 28017 + }, + { + "epoch": 8.59975445058318, + "grad_norm": 0.15376806259155273, + "learning_rate": 5.056521505516632e-06, + "loss": 1.6867, + "step": 28018 + }, + { + "epoch": 8.600061387354206, + "grad_norm": 0.1546332985162735, + "learning_rate": 5.054343547414509e-06, + "loss": 1.7219, + "step": 28019 + }, + { + "epoch": 8.600368324125231, + "grad_norm": 0.17485050857067108, + "learning_rate": 5.0521660334916895e-06, + "loss": 1.7266, + "step": 28020 + }, + { + "epoch": 8.600675260896255, + "grad_norm": 0.15625739097595215, + "learning_rate": 5.049988963769736e-06, + "loss": 1.7328, + "step": 28021 + }, + { + "epoch": 8.60098219766728, + "grad_norm": 0.26432421803474426, + "learning_rate": 5.0478123382701136e-06, + "loss": 1.7452, + "step": 28022 + }, + { + "epoch": 8.601289134438305, + "grad_norm": 0.16437242925167084, + "learning_rate": 5.045636157014377e-06, + "loss": 1.6945, + "step": 28023 + }, + { + "epoch": 8.60159607120933, + "grad_norm": 0.17274139821529388, + "learning_rate": 5.043460420023999e-06, + "loss": 1.6952, 
+ "step": 28024 + }, + { + "epoch": 8.601903007980356, + "grad_norm": 0.2380651980638504, + "learning_rate": 5.0412851273205e-06, + "loss": 1.7412, + "step": 28025 + }, + { + "epoch": 8.602209944751381, + "grad_norm": 0.1543026566505432, + "learning_rate": 5.039110278925374e-06, + "loss": 1.7063, + "step": 28026 + }, + { + "epoch": 8.602516881522407, + "grad_norm": 0.15819939970970154, + "learning_rate": 5.036935874860111e-06, + "loss": 1.703, + "step": 28027 + }, + { + "epoch": 8.602823818293432, + "grad_norm": 0.20054341852664948, + "learning_rate": 5.034761915146208e-06, + "loss": 1.741, + "step": 28028 + }, + { + "epoch": 8.603130755064457, + "grad_norm": 0.1404278427362442, + "learning_rate": 5.032588399805127e-06, + "loss": 1.6822, + "step": 28029 + }, + { + "epoch": 8.603437691835483, + "grad_norm": 0.1339765340089798, + "learning_rate": 5.030415328858374e-06, + "loss": 1.6741, + "step": 28030 + }, + { + "epoch": 8.603744628606506, + "grad_norm": 0.17520250380039215, + "learning_rate": 5.028242702327413e-06, + "loss": 1.7655, + "step": 28031 + }, + { + "epoch": 8.604051565377532, + "grad_norm": 0.1701551079750061, + "learning_rate": 5.0260705202337165e-06, + "loss": 1.7219, + "step": 28032 + }, + { + "epoch": 8.604358502148557, + "grad_norm": 0.1882735937833786, + "learning_rate": 5.023898782598752e-06, + "loss": 1.7482, + "step": 28033 + }, + { + "epoch": 8.604665438919582, + "grad_norm": 0.1356845200061798, + "learning_rate": 5.021727489443984e-06, + "loss": 1.6647, + "step": 28034 + }, + { + "epoch": 8.604972375690608, + "grad_norm": 0.1686328649520874, + "learning_rate": 5.019556640790862e-06, + "loss": 1.7454, + "step": 28035 + }, + { + "epoch": 8.605279312461633, + "grad_norm": 0.16747170686721802, + "learning_rate": 5.017386236660848e-06, + "loss": 1.6747, + "step": 28036 + }, + { + "epoch": 8.605586249232658, + "grad_norm": 0.18954692780971527, + "learning_rate": 5.0152162770753795e-06, + "loss": 1.7351, + "step": 28037 + }, + { + "epoch": 
8.605893186003684, + "grad_norm": 0.19075840711593628, + "learning_rate": 5.013046762055929e-06, + "loss": 1.8257, + "step": 28038 + }, + { + "epoch": 8.606200122774709, + "grad_norm": 0.22513258457183838, + "learning_rate": 5.010877691623894e-06, + "loss": 1.7548, + "step": 28039 + }, + { + "epoch": 8.606507059545734, + "grad_norm": 0.15815886855125427, + "learning_rate": 5.00870906580076e-06, + "loss": 1.6793, + "step": 28040 + }, + { + "epoch": 8.60681399631676, + "grad_norm": 0.15267199277877808, + "learning_rate": 5.006540884607913e-06, + "loss": 1.6703, + "step": 28041 + }, + { + "epoch": 8.607120933087783, + "grad_norm": 0.14877180755138397, + "learning_rate": 5.00437314806681e-06, + "loss": 1.6859, + "step": 28042 + }, + { + "epoch": 8.607427869858808, + "grad_norm": 0.18780232965946198, + "learning_rate": 5.002205856198861e-06, + "loss": 1.7205, + "step": 28043 + }, + { + "epoch": 8.607734806629834, + "grad_norm": 0.1645117998123169, + "learning_rate": 5.000039009025492e-06, + "loss": 1.7726, + "step": 28044 + }, + { + "epoch": 8.60804174340086, + "grad_norm": 0.1449744552373886, + "learning_rate": 4.997872606568116e-06, + "loss": 1.6704, + "step": 28045 + }, + { + "epoch": 8.608348680171884, + "grad_norm": 0.15839919447898865, + "learning_rate": 4.9957066488481255e-06, + "loss": 1.6844, + "step": 28046 + }, + { + "epoch": 8.60865561694291, + "grad_norm": 0.16456182301044464, + "learning_rate": 4.993541135886948e-06, + "loss": 1.7141, + "step": 28047 + }, + { + "epoch": 8.608962553713935, + "grad_norm": 0.154433935880661, + "learning_rate": 4.991376067705977e-06, + "loss": 1.7077, + "step": 28048 + }, + { + "epoch": 8.60926949048496, + "grad_norm": 0.13631665706634521, + "learning_rate": 4.989211444326608e-06, + "loss": 1.6819, + "step": 28049 + }, + { + "epoch": 8.609576427255986, + "grad_norm": 0.13026617467403412, + "learning_rate": 4.987047265770234e-06, + "loss": 1.6929, + "step": 28050 + }, + { + "epoch": 8.609883364027011, + "grad_norm": 
0.1359538435935974, + "learning_rate": 4.984883532058243e-06, + "loss": 1.6534, + "step": 28051 + }, + { + "epoch": 8.610190300798035, + "grad_norm": 0.13192327320575714, + "learning_rate": 4.982720243212014e-06, + "loss": 1.694, + "step": 28052 + }, + { + "epoch": 8.61049723756906, + "grad_norm": 0.17191945016384125, + "learning_rate": 4.980557399252928e-06, + "loss": 1.7402, + "step": 28053 + }, + { + "epoch": 8.610804174340085, + "grad_norm": 0.12728241086006165, + "learning_rate": 4.978395000202363e-06, + "loss": 1.7231, + "step": 28054 + }, + { + "epoch": 8.61111111111111, + "grad_norm": 0.15232713520526886, + "learning_rate": 4.976233046081685e-06, + "loss": 1.6805, + "step": 28055 + }, + { + "epoch": 8.611418047882136, + "grad_norm": 0.13869190216064453, + "learning_rate": 4.974071536912256e-06, + "loss": 1.6771, + "step": 28056 + }, + { + "epoch": 8.611724984653161, + "grad_norm": 0.16099198162555695, + "learning_rate": 4.971910472715458e-06, + "loss": 1.6853, + "step": 28057 + }, + { + "epoch": 8.612031921424187, + "grad_norm": 0.147923544049263, + "learning_rate": 4.969749853512612e-06, + "loss": 1.7173, + "step": 28058 + }, + { + "epoch": 8.612338858195212, + "grad_norm": 0.16606341302394867, + "learning_rate": 4.967589679325102e-06, + "loss": 1.7262, + "step": 28059 + }, + { + "epoch": 8.612645794966237, + "grad_norm": 0.12743404507637024, + "learning_rate": 4.965429950174266e-06, + "loss": 1.6612, + "step": 28060 + }, + { + "epoch": 8.612952731737263, + "grad_norm": 0.12468522787094116, + "learning_rate": 4.9632706660814436e-06, + "loss": 1.6835, + "step": 28061 + }, + { + "epoch": 8.613259668508288, + "grad_norm": 0.16881446540355682, + "learning_rate": 4.9611118270679935e-06, + "loss": 1.7433, + "step": 28062 + }, + { + "epoch": 8.613566605279313, + "grad_norm": 0.2030627429485321, + "learning_rate": 4.958953433155211e-06, + "loss": 1.7739, + "step": 28063 + }, + { + "epoch": 8.613873542050337, + "grad_norm": 0.18076404929161072, + "learning_rate": 
4.956795484364457e-06, + "loss": 1.7316, + "step": 28064 + }, + { + "epoch": 8.614180478821362, + "grad_norm": 0.12519899010658264, + "learning_rate": 4.954637980717058e-06, + "loss": 1.6686, + "step": 28065 + }, + { + "epoch": 8.614487415592388, + "grad_norm": 0.16320455074310303, + "learning_rate": 4.95248092223432e-06, + "loss": 1.744, + "step": 28066 + }, + { + "epoch": 8.614794352363413, + "grad_norm": 0.18789352476596832, + "learning_rate": 4.950324308937576e-06, + "loss": 1.7619, + "step": 28067 + }, + { + "epoch": 8.615101289134438, + "grad_norm": 0.13703711330890656, + "learning_rate": 4.948168140848125e-06, + "loss": 1.6652, + "step": 28068 + }, + { + "epoch": 8.615408225905464, + "grad_norm": 0.16874989867210388, + "learning_rate": 4.946012417987289e-06, + "loss": 1.6783, + "step": 28069 + }, + { + "epoch": 8.615715162676489, + "grad_norm": 0.1780901849269867, + "learning_rate": 4.943857140376362e-06, + "loss": 1.7224, + "step": 28070 + }, + { + "epoch": 8.616022099447514, + "grad_norm": 0.19460240006446838, + "learning_rate": 4.941702308036644e-06, + "loss": 1.7314, + "step": 28071 + }, + { + "epoch": 8.61632903621854, + "grad_norm": 0.14954718947410583, + "learning_rate": 4.9395479209894404e-06, + "loss": 1.708, + "step": 28072 + }, + { + "epoch": 8.616635972989565, + "grad_norm": 0.17461352050304413, + "learning_rate": 4.937393979256016e-06, + "loss": 1.7458, + "step": 28073 + }, + { + "epoch": 8.616942909760589, + "grad_norm": 0.17088642716407776, + "learning_rate": 4.935240482857706e-06, + "loss": 1.7315, + "step": 28074 + }, + { + "epoch": 8.617249846531614, + "grad_norm": 0.1478833556175232, + "learning_rate": 4.933087431815736e-06, + "loss": 1.6646, + "step": 28075 + }, + { + "epoch": 8.61755678330264, + "grad_norm": 0.1860690414905548, + "learning_rate": 4.930934826151435e-06, + "loss": 1.6472, + "step": 28076 + }, + { + "epoch": 8.617863720073665, + "grad_norm": 0.23674537241458893, + "learning_rate": 4.928782665886028e-06, + "loss": 1.7677, + 
"step": 28077 + }, + { + "epoch": 8.61817065684469, + "grad_norm": 0.1638643592596054, + "learning_rate": 4.926630951040817e-06, + "loss": 1.7438, + "step": 28078 + }, + { + "epoch": 8.618477593615715, + "grad_norm": 0.1631689965724945, + "learning_rate": 4.924479681637067e-06, + "loss": 1.7167, + "step": 28079 + }, + { + "epoch": 8.61878453038674, + "grad_norm": 0.1493348926305771, + "learning_rate": 4.922328857696012e-06, + "loss": 1.6929, + "step": 28080 + }, + { + "epoch": 8.619091467157766, + "grad_norm": 0.1545657068490982, + "learning_rate": 4.920178479238935e-06, + "loss": 1.7048, + "step": 28081 + }, + { + "epoch": 8.619398403928791, + "grad_norm": 0.20011793076992035, + "learning_rate": 4.918028546287073e-06, + "loss": 1.726, + "step": 28082 + }, + { + "epoch": 8.619705340699817, + "grad_norm": 0.1705177128314972, + "learning_rate": 4.915879058861678e-06, + "loss": 1.7774, + "step": 28083 + }, + { + "epoch": 8.620012277470842, + "grad_norm": 0.15467505156993866, + "learning_rate": 4.913730016983992e-06, + "loss": 1.6933, + "step": 28084 + }, + { + "epoch": 8.620319214241865, + "grad_norm": 0.1319204419851303, + "learning_rate": 4.911581420675248e-06, + "loss": 1.7309, + "step": 28085 + }, + { + "epoch": 8.62062615101289, + "grad_norm": 0.163784459233284, + "learning_rate": 4.909433269956687e-06, + "loss": 1.7221, + "step": 28086 + }, + { + "epoch": 8.620933087783916, + "grad_norm": 0.15852972865104675, + "learning_rate": 4.907285564849534e-06, + "loss": 1.7018, + "step": 28087 + }, + { + "epoch": 8.621240024554941, + "grad_norm": 0.14603203535079956, + "learning_rate": 4.905138305375018e-06, + "loss": 1.6786, + "step": 28088 + }, + { + "epoch": 8.621546961325967, + "grad_norm": 0.14899590611457825, + "learning_rate": 4.902991491554348e-06, + "loss": 1.7039, + "step": 28089 + }, + { + "epoch": 8.621853898096992, + "grad_norm": 0.13559244573116302, + "learning_rate": 4.9008451234087426e-06, + "loss": 1.6831, + "step": 28090 + }, + { + "epoch": 
8.622160834868017, + "grad_norm": 0.1433703601360321, + "learning_rate": 4.898699200959439e-06, + "loss": 1.6567, + "step": 28091 + }, + { + "epoch": 8.622467771639043, + "grad_norm": 0.12275373190641403, + "learning_rate": 4.89655372422761e-06, + "loss": 1.6897, + "step": 28092 + }, + { + "epoch": 8.622774708410068, + "grad_norm": 0.12706153094768524, + "learning_rate": 4.894408693234487e-06, + "loss": 1.6287, + "step": 28093 + }, + { + "epoch": 8.623081645181093, + "grad_norm": 0.18988971412181854, + "learning_rate": 4.892264108001232e-06, + "loss": 1.7021, + "step": 28094 + }, + { + "epoch": 8.623388581952117, + "grad_norm": 0.17477858066558838, + "learning_rate": 4.8901199685490785e-06, + "loss": 1.7289, + "step": 28095 + }, + { + "epoch": 8.623695518723142, + "grad_norm": 0.16172516345977783, + "learning_rate": 4.887976274899203e-06, + "loss": 1.7265, + "step": 28096 + }, + { + "epoch": 8.624002455494168, + "grad_norm": 0.14414304494857788, + "learning_rate": 4.885833027072772e-06, + "loss": 1.6795, + "step": 28097 + }, + { + "epoch": 8.624309392265193, + "grad_norm": 0.17894591391086578, + "learning_rate": 4.8836902250909975e-06, + "loss": 1.7564, + "step": 28098 + }, + { + "epoch": 8.624616329036218, + "grad_norm": 0.141717329621315, + "learning_rate": 4.881547868975022e-06, + "loss": 1.7047, + "step": 28099 + }, + { + "epoch": 8.624923265807244, + "grad_norm": 0.2184356302022934, + "learning_rate": 4.879405958746047e-06, + "loss": 1.7447, + "step": 28100 + }, + { + "epoch": 8.625230202578269, + "grad_norm": 0.1739104986190796, + "learning_rate": 4.877264494425227e-06, + "loss": 1.7003, + "step": 28101 + }, + { + "epoch": 8.625537139349294, + "grad_norm": 0.17033645510673523, + "learning_rate": 4.875123476033721e-06, + "loss": 1.7019, + "step": 28102 + }, + { + "epoch": 8.62584407612032, + "grad_norm": 0.1620563268661499, + "learning_rate": 4.872982903592699e-06, + "loss": 1.6955, + "step": 28103 + }, + { + "epoch": 8.626151012891345, + "grad_norm": 
0.16582414507865906, + "learning_rate": 4.870842777123308e-06, + "loss": 1.6687, + "step": 28104 + }, + { + "epoch": 8.62645794966237, + "grad_norm": 0.1620030403137207, + "learning_rate": 4.8687030966466985e-06, + "loss": 1.6762, + "step": 28105 + }, + { + "epoch": 8.626764886433394, + "grad_norm": 0.16777098178863525, + "learning_rate": 4.86656386218402e-06, + "loss": 1.7117, + "step": 28106 + }, + { + "epoch": 8.62707182320442, + "grad_norm": 0.16074253618717194, + "learning_rate": 4.8644250737564014e-06, + "loss": 1.7205, + "step": 28107 + }, + { + "epoch": 8.627378759975445, + "grad_norm": 0.1414494514465332, + "learning_rate": 4.862286731385007e-06, + "loss": 1.6936, + "step": 28108 + }, + { + "epoch": 8.62768569674647, + "grad_norm": 0.206336110830307, + "learning_rate": 4.860148835090933e-06, + "loss": 1.7443, + "step": 28109 + }, + { + "epoch": 8.627992633517495, + "grad_norm": 0.16304929554462433, + "learning_rate": 4.858011384895345e-06, + "loss": 1.7525, + "step": 28110 + }, + { + "epoch": 8.62829957028852, + "grad_norm": 0.16839462518692017, + "learning_rate": 4.855874380819325e-06, + "loss": 1.7462, + "step": 28111 + }, + { + "epoch": 8.628606507059546, + "grad_norm": 0.16088010370731354, + "learning_rate": 4.8537378228840246e-06, + "loss": 1.7662, + "step": 28112 + }, + { + "epoch": 8.628913443830571, + "grad_norm": 0.1818089783191681, + "learning_rate": 4.851601711110559e-06, + "loss": 1.752, + "step": 28113 + }, + { + "epoch": 8.629220380601597, + "grad_norm": 0.19034543633460999, + "learning_rate": 4.8494660455200065e-06, + "loss": 1.8474, + "step": 28114 + }, + { + "epoch": 8.629527317372622, + "grad_norm": 0.15762893855571747, + "learning_rate": 4.847330826133517e-06, + "loss": 1.7615, + "step": 28115 + }, + { + "epoch": 8.629834254143645, + "grad_norm": 0.14152835309505463, + "learning_rate": 4.845196052972145e-06, + "loss": 1.702, + "step": 28116 + }, + { + "epoch": 8.63014119091467, + "grad_norm": 0.14755114912986755, + "learning_rate": 
4.8430617260570245e-06, + "loss": 1.7044, + "step": 28117 + }, + { + "epoch": 8.630448127685696, + "grad_norm": 0.1483534872531891, + "learning_rate": 4.840927845409238e-06, + "loss": 1.6798, + "step": 28118 + }, + { + "epoch": 8.630755064456721, + "grad_norm": 0.15526263415813446, + "learning_rate": 4.8387944110498685e-06, + "loss": 1.7316, + "step": 28119 + }, + { + "epoch": 8.631062001227747, + "grad_norm": 0.21519999206066132, + "learning_rate": 4.836661422999999e-06, + "loss": 1.763, + "step": 28120 + }, + { + "epoch": 8.631368937998772, + "grad_norm": 0.14445212483406067, + "learning_rate": 4.8345288812807144e-06, + "loss": 1.6894, + "step": 28121 + }, + { + "epoch": 8.631675874769797, + "grad_norm": 0.1482388973236084, + "learning_rate": 4.832396785913091e-06, + "loss": 1.6629, + "step": 28122 + }, + { + "epoch": 8.631982811540823, + "grad_norm": 0.17132261395454407, + "learning_rate": 4.830265136918194e-06, + "loss": 1.7254, + "step": 28123 + }, + { + "epoch": 8.632289748311848, + "grad_norm": 0.1567879170179367, + "learning_rate": 4.828133934317081e-06, + "loss": 1.711, + "step": 28124 + }, + { + "epoch": 8.632596685082873, + "grad_norm": 0.18352550268173218, + "learning_rate": 4.826003178130845e-06, + "loss": 1.6853, + "step": 28125 + }, + { + "epoch": 8.632903621853899, + "grad_norm": 0.17370788753032684, + "learning_rate": 4.823872868380502e-06, + "loss": 1.7716, + "step": 28126 + }, + { + "epoch": 8.633210558624924, + "grad_norm": 0.14186492562294006, + "learning_rate": 4.821743005087148e-06, + "loss": 1.7003, + "step": 28127 + }, + { + "epoch": 8.633517495395948, + "grad_norm": 0.1501329094171524, + "learning_rate": 4.819613588271788e-06, + "loss": 1.7249, + "step": 28128 + }, + { + "epoch": 8.633824432166973, + "grad_norm": 0.13921687006950378, + "learning_rate": 4.817484617955498e-06, + "loss": 1.6646, + "step": 28129 + }, + { + "epoch": 8.634131368937998, + "grad_norm": 0.14346352219581604, + "learning_rate": 4.815356094159318e-06, + "loss": 
1.6784, + "step": 28130 + }, + { + "epoch": 8.634438305709024, + "grad_norm": 0.1550782024860382, + "learning_rate": 4.813228016904247e-06, + "loss": 1.7052, + "step": 28131 + }, + { + "epoch": 8.634745242480049, + "grad_norm": 0.13514211773872375, + "learning_rate": 4.81110038621137e-06, + "loss": 1.7095, + "step": 28132 + }, + { + "epoch": 8.635052179251074, + "grad_norm": 0.14162956178188324, + "learning_rate": 4.8089732021016575e-06, + "loss": 1.7001, + "step": 28133 + }, + { + "epoch": 8.6353591160221, + "grad_norm": 0.14066293835639954, + "learning_rate": 4.806846464596177e-06, + "loss": 1.7037, + "step": 28134 + }, + { + "epoch": 8.635666052793125, + "grad_norm": 0.1918545961380005, + "learning_rate": 4.804720173715921e-06, + "loss": 1.7334, + "step": 28135 + }, + { + "epoch": 8.63597298956415, + "grad_norm": 0.13358080387115479, + "learning_rate": 4.802594329481913e-06, + "loss": 1.7063, + "step": 28136 + }, + { + "epoch": 8.636279926335176, + "grad_norm": 0.14988988637924194, + "learning_rate": 4.800468931915158e-06, + "loss": 1.6871, + "step": 28137 + }, + { + "epoch": 8.6365868631062, + "grad_norm": 0.1423332244157791, + "learning_rate": 4.798343981036663e-06, + "loss": 1.7133, + "step": 28138 + }, + { + "epoch": 8.636893799877225, + "grad_norm": 0.1372760534286499, + "learning_rate": 4.796219476867425e-06, + "loss": 1.6522, + "step": 28139 + }, + { + "epoch": 8.63720073664825, + "grad_norm": 0.14779186248779297, + "learning_rate": 4.794095419428446e-06, + "loss": 1.669, + "step": 28140 + }, + { + "epoch": 8.637507673419275, + "grad_norm": 0.1412673145532608, + "learning_rate": 4.7919718087406975e-06, + "loss": 1.6767, + "step": 28141 + }, + { + "epoch": 8.6378146101903, + "grad_norm": 0.13006745278835297, + "learning_rate": 4.789848644825201e-06, + "loss": 1.6804, + "step": 28142 + }, + { + "epoch": 8.638121546961326, + "grad_norm": 0.15673677623271942, + "learning_rate": 4.787725927702896e-06, + "loss": 1.7053, + "step": 28143 + }, + { + "epoch": 
8.638428483732351, + "grad_norm": 0.17693878710269928, + "learning_rate": 4.785603657394805e-06, + "loss": 1.7207, + "step": 28144 + }, + { + "epoch": 8.638735420503377, + "grad_norm": 0.15449829399585724, + "learning_rate": 4.7834818339218654e-06, + "loss": 1.7433, + "step": 28145 + }, + { + "epoch": 8.639042357274402, + "grad_norm": 0.14260755479335785, + "learning_rate": 4.781360457305062e-06, + "loss": 1.6707, + "step": 28146 + }, + { + "epoch": 8.639349294045427, + "grad_norm": 0.13936764001846313, + "learning_rate": 4.7792395275653715e-06, + "loss": 1.6749, + "step": 28147 + }, + { + "epoch": 8.639656230816453, + "grad_norm": 0.14369705319404602, + "learning_rate": 4.7771190447237215e-06, + "loss": 1.6943, + "step": 28148 + }, + { + "epoch": 8.639963167587476, + "grad_norm": 0.18439368903636932, + "learning_rate": 4.774999008801107e-06, + "loss": 1.7714, + "step": 28149 + }, + { + "epoch": 8.640270104358502, + "grad_norm": 0.15348297357559204, + "learning_rate": 4.772879419818438e-06, + "loss": 1.7315, + "step": 28150 + }, + { + "epoch": 8.640577041129527, + "grad_norm": 0.16643862426280975, + "learning_rate": 4.770760277796693e-06, + "loss": 1.7196, + "step": 28151 + }, + { + "epoch": 8.640883977900552, + "grad_norm": 0.16105540096759796, + "learning_rate": 4.768641582756811e-06, + "loss": 1.7504, + "step": 28152 + }, + { + "epoch": 8.641190914671578, + "grad_norm": 0.135291188955307, + "learning_rate": 4.766523334719714e-06, + "loss": 1.663, + "step": 28153 + }, + { + "epoch": 8.641497851442603, + "grad_norm": 0.15021322667598724, + "learning_rate": 4.764405533706351e-06, + "loss": 1.7318, + "step": 28154 + }, + { + "epoch": 8.641804788213628, + "grad_norm": 0.13949114084243774, + "learning_rate": 4.762288179737645e-06, + "loss": 1.6909, + "step": 28155 + }, + { + "epoch": 8.642111724984654, + "grad_norm": 0.17211735248565674, + "learning_rate": 4.760171272834524e-06, + "loss": 1.7539, + "step": 28156 + }, + { + "epoch": 8.642418661755679, + "grad_norm": 
0.12576675415039062, + "learning_rate": 4.7580548130179034e-06, + "loss": 1.6816, + "step": 28157 + }, + { + "epoch": 8.642725598526704, + "grad_norm": 0.18624669313430786, + "learning_rate": 4.755938800308696e-06, + "loss": 1.7976, + "step": 28158 + }, + { + "epoch": 8.643032535297728, + "grad_norm": 0.20610935986042023, + "learning_rate": 4.753823234727834e-06, + "loss": 1.7192, + "step": 28159 + }, + { + "epoch": 8.643339472068753, + "grad_norm": 0.15127690136432648, + "learning_rate": 4.751708116296194e-06, + "loss": 1.6918, + "step": 28160 + }, + { + "epoch": 8.643646408839778, + "grad_norm": 0.14993508160114288, + "learning_rate": 4.7495934450347115e-06, + "loss": 1.7075, + "step": 28161 + }, + { + "epoch": 8.643953345610804, + "grad_norm": 0.16896332800388336, + "learning_rate": 4.747479220964252e-06, + "loss": 1.6971, + "step": 28162 + }, + { + "epoch": 8.644260282381829, + "grad_norm": 0.20022685825824738, + "learning_rate": 4.745365444105737e-06, + "loss": 1.7479, + "step": 28163 + }, + { + "epoch": 8.644567219152854, + "grad_norm": 0.1731337308883667, + "learning_rate": 4.7432521144800565e-06, + "loss": 1.7384, + "step": 28164 + }, + { + "epoch": 8.64487415592388, + "grad_norm": 0.13517920672893524, + "learning_rate": 4.7411392321080605e-06, + "loss": 1.6611, + "step": 28165 + }, + { + "epoch": 8.645181092694905, + "grad_norm": 0.177021324634552, + "learning_rate": 4.739026797010676e-06, + "loss": 1.7779, + "step": 28166 + }, + { + "epoch": 8.64548802946593, + "grad_norm": 0.14956676959991455, + "learning_rate": 4.736914809208737e-06, + "loss": 1.6933, + "step": 28167 + }, + { + "epoch": 8.645794966236956, + "grad_norm": 0.15683145821094513, + "learning_rate": 4.734803268723143e-06, + "loss": 1.7067, + "step": 28168 + }, + { + "epoch": 8.646101903007981, + "grad_norm": 0.198720321059227, + "learning_rate": 4.732692175574755e-06, + "loss": 1.6567, + "step": 28169 + }, + { + "epoch": 8.646408839779006, + "grad_norm": 0.18899580836296082, + "learning_rate": 
4.730581529784439e-06, + "loss": 1.7069, + "step": 28170 + }, + { + "epoch": 8.64671577655003, + "grad_norm": 0.17795316874980927, + "learning_rate": 4.728471331373041e-06, + "loss": 1.6803, + "step": 28171 + }, + { + "epoch": 8.647022713321055, + "grad_norm": 0.18296107649803162, + "learning_rate": 4.7263615803614325e-06, + "loss": 1.7774, + "step": 28172 + }, + { + "epoch": 8.64732965009208, + "grad_norm": 0.13994812965393066, + "learning_rate": 4.724252276770453e-06, + "loss": 1.6826, + "step": 28173 + }, + { + "epoch": 8.647636586863106, + "grad_norm": 0.14969824254512787, + "learning_rate": 4.722143420620945e-06, + "loss": 1.6529, + "step": 28174 + }, + { + "epoch": 8.647943523634131, + "grad_norm": 0.14949028193950653, + "learning_rate": 4.7200350119337485e-06, + "loss": 1.7007, + "step": 28175 + }, + { + "epoch": 8.648250460405157, + "grad_norm": 0.14786000549793243, + "learning_rate": 4.71792705072972e-06, + "loss": 1.6999, + "step": 28176 + }, + { + "epoch": 8.648557397176182, + "grad_norm": 0.12665456533432007, + "learning_rate": 4.715819537029659e-06, + "loss": 1.6414, + "step": 28177 + }, + { + "epoch": 8.648864333947207, + "grad_norm": 0.19015786051750183, + "learning_rate": 4.713712470854437e-06, + "loss": 1.7328, + "step": 28178 + }, + { + "epoch": 8.649171270718233, + "grad_norm": 0.20775510370731354, + "learning_rate": 4.711605852224827e-06, + "loss": 1.7735, + "step": 28179 + }, + { + "epoch": 8.649478207489258, + "grad_norm": 0.13774684071540833, + "learning_rate": 4.709499681161678e-06, + "loss": 1.7139, + "step": 28180 + }, + { + "epoch": 8.649785144260282, + "grad_norm": 0.17355668544769287, + "learning_rate": 4.707393957685813e-06, + "loss": 1.7046, + "step": 28181 + }, + { + "epoch": 8.650092081031307, + "grad_norm": 0.21687985956668854, + "learning_rate": 4.70528868181801e-06, + "loss": 1.6736, + "step": 28182 + }, + { + "epoch": 8.650399017802332, + "grad_norm": 0.13978178799152374, + "learning_rate": 4.703183853579107e-06, + "loss": 
1.6841, + "step": 28183 + }, + { + "epoch": 8.650705954573358, + "grad_norm": 0.1476740539073944, + "learning_rate": 4.701079472989878e-06, + "loss": 1.6633, + "step": 28184 + }, + { + "epoch": 8.651012891344383, + "grad_norm": 0.17175909876823425, + "learning_rate": 4.698975540071138e-06, + "loss": 1.7059, + "step": 28185 + }, + { + "epoch": 8.651319828115408, + "grad_norm": 0.16164059937000275, + "learning_rate": 4.696872054843671e-06, + "loss": 1.7038, + "step": 28186 + }, + { + "epoch": 8.651626764886434, + "grad_norm": 0.1541287899017334, + "learning_rate": 4.694769017328271e-06, + "loss": 1.6583, + "step": 28187 + }, + { + "epoch": 8.651933701657459, + "grad_norm": 0.19379135966300964, + "learning_rate": 4.6926664275457165e-06, + "loss": 1.7375, + "step": 28188 + }, + { + "epoch": 8.652240638428484, + "grad_norm": 0.12427667528390884, + "learning_rate": 4.690564285516785e-06, + "loss": 1.6434, + "step": 28189 + }, + { + "epoch": 8.65254757519951, + "grad_norm": 0.15416522324085236, + "learning_rate": 4.6884625912622605e-06, + "loss": 1.7551, + "step": 28190 + }, + { + "epoch": 8.652854511970535, + "grad_norm": 0.1467018723487854, + "learning_rate": 4.6863613448029035e-06, + "loss": 1.704, + "step": 28191 + }, + { + "epoch": 8.653161448741558, + "grad_norm": 0.15078933537006378, + "learning_rate": 4.684260546159469e-06, + "loss": 1.7382, + "step": 28192 + }, + { + "epoch": 8.653468385512584, + "grad_norm": 0.13681283593177795, + "learning_rate": 4.682160195352758e-06, + "loss": 1.6732, + "step": 28193 + }, + { + "epoch": 8.65377532228361, + "grad_norm": 0.16412119567394257, + "learning_rate": 4.680060292403476e-06, + "loss": 1.7394, + "step": 28194 + }, + { + "epoch": 8.654082259054634, + "grad_norm": 0.14504186809062958, + "learning_rate": 4.677960837332423e-06, + "loss": 1.6602, + "step": 28195 + }, + { + "epoch": 8.65438919582566, + "grad_norm": 0.15267091989517212, + "learning_rate": 4.6758618301603105e-06, + "loss": 1.7041, + "step": 28196 + }, + { + 
"epoch": 8.654696132596685, + "grad_norm": 0.1807365119457245, + "learning_rate": 4.673763270907899e-06, + "loss": 1.7556, + "step": 28197 + }, + { + "epoch": 8.65500306936771, + "grad_norm": 0.16227813065052032, + "learning_rate": 4.671665159595939e-06, + "loss": 1.6976, + "step": 28198 + }, + { + "epoch": 8.655310006138736, + "grad_norm": 0.16095015406608582, + "learning_rate": 4.6695674962451305e-06, + "loss": 1.7078, + "step": 28199 + }, + { + "epoch": 8.655616942909761, + "grad_norm": 0.1518808901309967, + "learning_rate": 4.667470280876246e-06, + "loss": 1.6999, + "step": 28200 + }, + { + "epoch": 8.655923879680786, + "grad_norm": 0.13343939185142517, + "learning_rate": 4.665373513509974e-06, + "loss": 1.7186, + "step": 28201 + }, + { + "epoch": 8.65623081645181, + "grad_norm": 0.1545572429895401, + "learning_rate": 4.6632771941670535e-06, + "loss": 1.7281, + "step": 28202 + }, + { + "epoch": 8.656537753222835, + "grad_norm": 0.13296550512313843, + "learning_rate": 4.661181322868208e-06, + "loss": 1.6632, + "step": 28203 + }, + { + "epoch": 8.65684468999386, + "grad_norm": 0.15362371504306793, + "learning_rate": 4.659085899634141e-06, + "loss": 1.7415, + "step": 28204 + }, + { + "epoch": 8.657151626764886, + "grad_norm": 0.14498870074748993, + "learning_rate": 4.65699092448556e-06, + "loss": 1.7342, + "step": 28205 + }, + { + "epoch": 8.657458563535911, + "grad_norm": 0.19409331679344177, + "learning_rate": 4.654896397443176e-06, + "loss": 1.7562, + "step": 28206 + }, + { + "epoch": 8.657765500306937, + "grad_norm": 0.15481562912464142, + "learning_rate": 4.652802318527677e-06, + "loss": 1.6905, + "step": 28207 + }, + { + "epoch": 8.658072437077962, + "grad_norm": 0.17566657066345215, + "learning_rate": 4.650708687759769e-06, + "loss": 1.6902, + "step": 28208 + }, + { + "epoch": 8.658379373848987, + "grad_norm": 0.13994581997394562, + "learning_rate": 4.648615505160125e-06, + "loss": 1.672, + "step": 28209 + }, + { + "epoch": 8.658686310620013, + "grad_norm": 
0.34969639778137207, + "learning_rate": 4.646522770749467e-06, + "loss": 1.6959, + "step": 28210 + }, + { + "epoch": 8.658993247391038, + "grad_norm": 0.16637352108955383, + "learning_rate": 4.644430484548428e-06, + "loss": 1.7119, + "step": 28211 + }, + { + "epoch": 8.659300184162063, + "grad_norm": 0.16540484130382538, + "learning_rate": 4.642338646577738e-06, + "loss": 1.7541, + "step": 28212 + }, + { + "epoch": 8.659607120933089, + "grad_norm": 0.13890287280082703, + "learning_rate": 4.640247256858016e-06, + "loss": 1.7117, + "step": 28213 + }, + { + "epoch": 8.659914057704112, + "grad_norm": 0.1403251439332962, + "learning_rate": 4.63815631540997e-06, + "loss": 1.697, + "step": 28214 + }, + { + "epoch": 8.660220994475138, + "grad_norm": 0.13313040137290955, + "learning_rate": 4.63606582225426e-06, + "loss": 1.6587, + "step": 28215 + }, + { + "epoch": 8.660527931246163, + "grad_norm": 0.12887243926525116, + "learning_rate": 4.63397577741152e-06, + "loss": 1.6441, + "step": 28216 + }, + { + "epoch": 8.660834868017188, + "grad_norm": 0.15074272453784943, + "learning_rate": 4.631886180902434e-06, + "loss": 1.7176, + "step": 28217 + }, + { + "epoch": 8.661141804788214, + "grad_norm": 0.12572859227657318, + "learning_rate": 4.629797032747624e-06, + "loss": 1.6779, + "step": 28218 + }, + { + "epoch": 8.661448741559239, + "grad_norm": 0.1607646495103836, + "learning_rate": 4.627708332967762e-06, + "loss": 1.747, + "step": 28219 + }, + { + "epoch": 8.661755678330264, + "grad_norm": 0.14080339670181274, + "learning_rate": 4.625620081583482e-06, + "loss": 1.7063, + "step": 28220 + }, + { + "epoch": 8.66206261510129, + "grad_norm": 0.17140309512615204, + "learning_rate": 4.623532278615411e-06, + "loss": 1.7265, + "step": 28221 + }, + { + "epoch": 8.662369551872315, + "grad_norm": 0.1564357578754425, + "learning_rate": 4.621444924084195e-06, + "loss": 1.7265, + "step": 28222 + }, + { + "epoch": 8.66267648864334, + "grad_norm": 0.20058012008666992, + "learning_rate": 
4.619358018010461e-06, + "loss": 1.7824, + "step": 28223 + }, + { + "epoch": 8.662983425414364, + "grad_norm": 0.16060246527194977, + "learning_rate": 4.617271560414827e-06, + "loss": 1.7329, + "step": 28224 + }, + { + "epoch": 8.66329036218539, + "grad_norm": 0.1967579573392868, + "learning_rate": 4.6151855513179136e-06, + "loss": 1.7386, + "step": 28225 + }, + { + "epoch": 8.663597298956415, + "grad_norm": 0.14853200316429138, + "learning_rate": 4.613099990740338e-06, + "loss": 1.6727, + "step": 28226 + }, + { + "epoch": 8.66390423572744, + "grad_norm": 0.1625850945711136, + "learning_rate": 4.611014878702713e-06, + "loss": 1.7074, + "step": 28227 + }, + { + "epoch": 8.664211172498465, + "grad_norm": 0.15605251491069794, + "learning_rate": 4.608930215225627e-06, + "loss": 1.7092, + "step": 28228 + }, + { + "epoch": 8.66451810926949, + "grad_norm": 0.14355498552322388, + "learning_rate": 4.606846000329723e-06, + "loss": 1.6819, + "step": 28229 + }, + { + "epoch": 8.664825046040516, + "grad_norm": 0.16151221096515656, + "learning_rate": 4.604762234035548e-06, + "loss": 1.7251, + "step": 28230 + }, + { + "epoch": 8.665131982811541, + "grad_norm": 0.1165589988231659, + "learning_rate": 4.60267891636374e-06, + "loss": 1.644, + "step": 28231 + }, + { + "epoch": 8.665438919582567, + "grad_norm": 0.13766367733478546, + "learning_rate": 4.6005960473348594e-06, + "loss": 1.6526, + "step": 28232 + }, + { + "epoch": 8.665745856353592, + "grad_norm": 0.15400783717632294, + "learning_rate": 4.598513626969486e-06, + "loss": 1.7356, + "step": 28233 + }, + { + "epoch": 8.666052793124617, + "grad_norm": 0.1635274887084961, + "learning_rate": 4.596431655288236e-06, + "loss": 1.6846, + "step": 28234 + }, + { + "epoch": 8.66635972989564, + "grad_norm": 0.17310741543769836, + "learning_rate": 4.5943501323116365e-06, + "loss": 1.7321, + "step": 28235 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.14390932023525238, + "learning_rate": 4.592269058060295e-06, + "loss": 1.6606, + 
"step": 28236 + }, + { + "epoch": 8.666973603437691, + "grad_norm": 0.15254996716976166, + "learning_rate": 4.590188432554759e-06, + "loss": 1.6796, + "step": 28237 + }, + { + "epoch": 8.667280540208717, + "grad_norm": 0.16224564611911774, + "learning_rate": 4.588108255815599e-06, + "loss": 1.7139, + "step": 28238 + }, + { + "epoch": 8.667587476979742, + "grad_norm": 0.14472807943820953, + "learning_rate": 4.586028527863373e-06, + "loss": 1.681, + "step": 28239 + }, + { + "epoch": 8.667894413750767, + "grad_norm": 0.17748364806175232, + "learning_rate": 4.583949248718627e-06, + "loss": 1.7205, + "step": 28240 + }, + { + "epoch": 8.668201350521793, + "grad_norm": 0.16917170584201813, + "learning_rate": 4.581870418401918e-06, + "loss": 1.7475, + "step": 28241 + }, + { + "epoch": 8.668508287292818, + "grad_norm": 0.15715333819389343, + "learning_rate": 4.579792036933784e-06, + "loss": 1.6988, + "step": 28242 + }, + { + "epoch": 8.668815224063843, + "grad_norm": 0.18384969234466553, + "learning_rate": 4.577714104334768e-06, + "loss": 1.715, + "step": 28243 + }, + { + "epoch": 8.669122160834869, + "grad_norm": 0.20845188200473785, + "learning_rate": 4.575636620625401e-06, + "loss": 1.784, + "step": 28244 + }, + { + "epoch": 8.669429097605892, + "grad_norm": 0.16388222575187683, + "learning_rate": 4.5735595858262095e-06, + "loss": 1.7091, + "step": 28245 + }, + { + "epoch": 8.669736034376918, + "grad_norm": 0.27372440695762634, + "learning_rate": 4.571482999957744e-06, + "loss": 1.6903, + "step": 28246 + }, + { + "epoch": 8.670042971147943, + "grad_norm": 0.14129513502120972, + "learning_rate": 4.569406863040493e-06, + "loss": 1.692, + "step": 28247 + }, + { + "epoch": 8.670349907918968, + "grad_norm": 0.1707242876291275, + "learning_rate": 4.567331175095013e-06, + "loss": 1.7542, + "step": 28248 + }, + { + "epoch": 8.670656844689994, + "grad_norm": 0.16061219573020935, + "learning_rate": 4.565255936141783e-06, + "loss": 1.7086, + "step": 28249 + }, + { + "epoch": 
8.670963781461019, + "grad_norm": 0.186256542801857, + "learning_rate": 4.5631811462013116e-06, + "loss": 1.7298, + "step": 28250 + }, + { + "epoch": 8.671270718232044, + "grad_norm": 0.19365312159061432, + "learning_rate": 4.561106805294141e-06, + "loss": 1.7714, + "step": 28251 + }, + { + "epoch": 8.67157765500307, + "grad_norm": 0.12306032329797745, + "learning_rate": 4.55903291344072e-06, + "loss": 1.7148, + "step": 28252 + }, + { + "epoch": 8.671884591774095, + "grad_norm": 0.14681962132453918, + "learning_rate": 4.556959470661592e-06, + "loss": 1.6909, + "step": 28253 + }, + { + "epoch": 8.67219152854512, + "grad_norm": 0.22181211411952972, + "learning_rate": 4.554886476977205e-06, + "loss": 1.7747, + "step": 28254 + }, + { + "epoch": 8.672498465316146, + "grad_norm": 0.15168124437332153, + "learning_rate": 4.5528139324080784e-06, + "loss": 1.7206, + "step": 28255 + }, + { + "epoch": 8.67280540208717, + "grad_norm": 0.15613441169261932, + "learning_rate": 4.550741836974676e-06, + "loss": 1.7062, + "step": 28256 + }, + { + "epoch": 8.673112338858195, + "grad_norm": 0.1939506232738495, + "learning_rate": 4.548670190697485e-06, + "loss": 1.747, + "step": 28257 + }, + { + "epoch": 8.67341927562922, + "grad_norm": 0.15883082151412964, + "learning_rate": 4.5465989935969785e-06, + "loss": 1.7169, + "step": 28258 + }, + { + "epoch": 8.673726212400245, + "grad_norm": 0.14583253860473633, + "learning_rate": 4.5445282456936185e-06, + "loss": 1.6918, + "step": 28259 + }, + { + "epoch": 8.67403314917127, + "grad_norm": 0.12797339260578156, + "learning_rate": 4.5424579470078725e-06, + "loss": 1.6791, + "step": 28260 + }, + { + "epoch": 8.674340085942296, + "grad_norm": 0.18248072266578674, + "learning_rate": 4.5403880975602e-06, + "loss": 1.7338, + "step": 28261 + }, + { + "epoch": 8.674647022713321, + "grad_norm": 0.1547573208808899, + "learning_rate": 4.538318697371047e-06, + "loss": 1.6259, + "step": 28262 + }, + { + "epoch": 8.674953959484347, + "grad_norm": 
0.18609635531902313, + "learning_rate": 4.536249746460897e-06, + "loss": 1.6943, + "step": 28263 + }, + { + "epoch": 8.675260896255372, + "grad_norm": 0.15615214407444, + "learning_rate": 4.534181244850161e-06, + "loss": 1.6851, + "step": 28264 + }, + { + "epoch": 8.675567833026397, + "grad_norm": 0.17061203718185425, + "learning_rate": 4.532113192559296e-06, + "loss": 1.7612, + "step": 28265 + }, + { + "epoch": 8.675874769797423, + "grad_norm": 0.17611360549926758, + "learning_rate": 4.530045589608739e-06, + "loss": 1.7109, + "step": 28266 + }, + { + "epoch": 8.676181706568446, + "grad_norm": 0.14381951093673706, + "learning_rate": 4.527978436018915e-06, + "loss": 1.6914, + "step": 28267 + }, + { + "epoch": 8.676488643339471, + "grad_norm": 0.18309952318668365, + "learning_rate": 4.525911731810273e-06, + "loss": 1.8044, + "step": 28268 + }, + { + "epoch": 8.676795580110497, + "grad_norm": 0.16398122906684875, + "learning_rate": 4.523845477003208e-06, + "loss": 1.7002, + "step": 28269 + }, + { + "epoch": 8.677102516881522, + "grad_norm": 0.12263865768909454, + "learning_rate": 4.521779671618176e-06, + "loss": 1.6777, + "step": 28270 + }, + { + "epoch": 8.677409453652547, + "grad_norm": 0.17702268064022064, + "learning_rate": 4.519714315675555e-06, + "loss": 1.697, + "step": 28271 + }, + { + "epoch": 8.677716390423573, + "grad_norm": 0.1558506339788437, + "learning_rate": 4.517649409195779e-06, + "loss": 1.7151, + "step": 28272 + }, + { + "epoch": 8.678023327194598, + "grad_norm": 0.19969215989112854, + "learning_rate": 4.5155849521992536e-06, + "loss": 1.7952, + "step": 28273 + }, + { + "epoch": 8.678330263965623, + "grad_norm": 0.14770828187465668, + "learning_rate": 4.513520944706379e-06, + "loss": 1.6846, + "step": 28274 + }, + { + "epoch": 8.678637200736649, + "grad_norm": 0.22692953050136566, + "learning_rate": 4.511457386737544e-06, + "loss": 1.7599, + "step": 28275 + }, + { + "epoch": 8.678944137507674, + "grad_norm": 0.1689091920852661, + "learning_rate": 
4.509394278313156e-06, + "loss": 1.67, + "step": 28276 + }, + { + "epoch": 8.6792510742787, + "grad_norm": 0.12909743189811707, + "learning_rate": 4.507331619453592e-06, + "loss": 1.7062, + "step": 28277 + }, + { + "epoch": 8.679558011049723, + "grad_norm": 0.15877538919448853, + "learning_rate": 4.505269410179241e-06, + "loss": 1.688, + "step": 28278 + }, + { + "epoch": 8.679864947820748, + "grad_norm": 0.13565565645694733, + "learning_rate": 4.503207650510477e-06, + "loss": 1.6742, + "step": 28279 + }, + { + "epoch": 8.680171884591774, + "grad_norm": 0.1718231737613678, + "learning_rate": 4.501146340467699e-06, + "loss": 1.71, + "step": 28280 + }, + { + "epoch": 8.680478821362799, + "grad_norm": 0.14713016152381897, + "learning_rate": 4.499085480071252e-06, + "loss": 1.698, + "step": 28281 + }, + { + "epoch": 8.680785758133824, + "grad_norm": 0.15546689927577972, + "learning_rate": 4.49702506934151e-06, + "loss": 1.6863, + "step": 28282 + }, + { + "epoch": 8.68109269490485, + "grad_norm": 0.1528242826461792, + "learning_rate": 4.494965108298832e-06, + "loss": 1.7236, + "step": 28283 + }, + { + "epoch": 8.681399631675875, + "grad_norm": 0.14601372182369232, + "learning_rate": 4.4929055969635755e-06, + "loss": 1.7008, + "step": 28284 + }, + { + "epoch": 8.6817065684469, + "grad_norm": 0.18398553133010864, + "learning_rate": 4.490846535356119e-06, + "loss": 1.7117, + "step": 28285 + }, + { + "epoch": 8.682013505217926, + "grad_norm": 0.16242702305316925, + "learning_rate": 4.4887879234967675e-06, + "loss": 1.7204, + "step": 28286 + }, + { + "epoch": 8.682320441988951, + "grad_norm": 0.11883296817541122, + "learning_rate": 4.486729761405911e-06, + "loss": 1.665, + "step": 28287 + }, + { + "epoch": 8.682627378759975, + "grad_norm": 0.157135009765625, + "learning_rate": 4.484672049103844e-06, + "loss": 1.7438, + "step": 28288 + }, + { + "epoch": 8.682934315531, + "grad_norm": 0.17938226461410522, + "learning_rate": 4.482614786610939e-06, + "loss": 1.7022, + "step": 
28289 + }, + { + "epoch": 8.683241252302025, + "grad_norm": 0.20547567307949066, + "learning_rate": 4.480557973947514e-06, + "loss": 1.7818, + "step": 28290 + }, + { + "epoch": 8.68354818907305, + "grad_norm": 0.2329530566930771, + "learning_rate": 4.478501611133889e-06, + "loss": 1.7702, + "step": 28291 + }, + { + "epoch": 8.683855125844076, + "grad_norm": 0.1893717646598816, + "learning_rate": 4.476445698190396e-06, + "loss": 1.7614, + "step": 28292 + }, + { + "epoch": 8.684162062615101, + "grad_norm": 0.17520616948604584, + "learning_rate": 4.474390235137349e-06, + "loss": 1.7585, + "step": 28293 + }, + { + "epoch": 8.684468999386127, + "grad_norm": 0.14743252098560333, + "learning_rate": 4.4723352219950605e-06, + "loss": 1.7008, + "step": 28294 + }, + { + "epoch": 8.684775936157152, + "grad_norm": 0.1734410971403122, + "learning_rate": 4.470280658783843e-06, + "loss": 1.6979, + "step": 28295 + }, + { + "epoch": 8.685082872928177, + "grad_norm": 0.1811109185218811, + "learning_rate": 4.468226545523985e-06, + "loss": 1.7124, + "step": 28296 + }, + { + "epoch": 8.685389809699203, + "grad_norm": 0.12056677043437958, + "learning_rate": 4.466172882235819e-06, + "loss": 1.6642, + "step": 28297 + }, + { + "epoch": 8.685696746470228, + "grad_norm": 0.159573495388031, + "learning_rate": 4.464119668939609e-06, + "loss": 1.7055, + "step": 28298 + }, + { + "epoch": 8.686003683241251, + "grad_norm": 0.17341920733451843, + "learning_rate": 4.46206690565566e-06, + "loss": 1.7036, + "step": 28299 + }, + { + "epoch": 8.686310620012277, + "grad_norm": 0.1660631000995636, + "learning_rate": 4.46001459240426e-06, + "loss": 1.7303, + "step": 28300 + }, + { + "epoch": 8.686617556783302, + "grad_norm": 0.18377192318439484, + "learning_rate": 4.4579627292056724e-06, + "loss": 1.7301, + "step": 28301 + }, + { + "epoch": 8.686924493554327, + "grad_norm": 0.13730384409427643, + "learning_rate": 4.455911316080213e-06, + "loss": 1.6399, + "step": 28302 + }, + { + "epoch": 8.687231430325353, 
+ "grad_norm": 0.25353705883026123, + "learning_rate": 4.453860353048112e-06, + "loss": 1.7682, + "step": 28303 + }, + { + "epoch": 8.687538367096378, + "grad_norm": 0.15051604807376862, + "learning_rate": 4.451809840129673e-06, + "loss": 1.7268, + "step": 28304 + }, + { + "epoch": 8.687845303867404, + "grad_norm": 0.2090475857257843, + "learning_rate": 4.449759777345131e-06, + "loss": 1.7697, + "step": 28305 + }, + { + "epoch": 8.688152240638429, + "grad_norm": 0.13042283058166504, + "learning_rate": 4.4477101647147745e-06, + "loss": 1.667, + "step": 28306 + }, + { + "epoch": 8.688459177409454, + "grad_norm": 0.1518186628818512, + "learning_rate": 4.445661002258838e-06, + "loss": 1.7095, + "step": 28307 + }, + { + "epoch": 8.68876611418048, + "grad_norm": 0.13992765545845032, + "learning_rate": 4.443612289997584e-06, + "loss": 1.6761, + "step": 28308 + }, + { + "epoch": 8.689073050951503, + "grad_norm": 0.17726075649261475, + "learning_rate": 4.44156402795125e-06, + "loss": 1.7444, + "step": 28309 + }, + { + "epoch": 8.689379987722528, + "grad_norm": 0.15143834054470062, + "learning_rate": 4.439516216140088e-06, + "loss": 1.7078, + "step": 28310 + }, + { + "epoch": 8.689686924493554, + "grad_norm": 0.17791767418384552, + "learning_rate": 4.437468854584326e-06, + "loss": 1.7402, + "step": 28311 + }, + { + "epoch": 8.689993861264579, + "grad_norm": 0.19582994282245636, + "learning_rate": 4.435421943304208e-06, + "loss": 1.757, + "step": 28312 + }, + { + "epoch": 8.690300798035604, + "grad_norm": 0.19730351865291595, + "learning_rate": 4.43337548231994e-06, + "loss": 1.6982, + "step": 28313 + }, + { + "epoch": 8.69060773480663, + "grad_norm": 0.16093717515468597, + "learning_rate": 4.43132947165179e-06, + "loss": 1.7116, + "step": 28314 + }, + { + "epoch": 8.690914671577655, + "grad_norm": 0.16639035940170288, + "learning_rate": 4.429283911319937e-06, + "loss": 1.7166, + "step": 28315 + }, + { + "epoch": 8.69122160834868, + "grad_norm": 0.13834281265735626, + 
"learning_rate": 4.427238801344608e-06, + "loss": 1.7058, + "step": 28316 + }, + { + "epoch": 8.691528545119706, + "grad_norm": 0.1761016994714737, + "learning_rate": 4.4251941417460194e-06, + "loss": 1.7155, + "step": 28317 + }, + { + "epoch": 8.691835481890731, + "grad_norm": 0.17754366993904114, + "learning_rate": 4.423149932544363e-06, + "loss": 1.768, + "step": 28318 + }, + { + "epoch": 8.692142418661756, + "grad_norm": 0.1563618779182434, + "learning_rate": 4.42110617375987e-06, + "loss": 1.706, + "step": 28319 + }, + { + "epoch": 8.692449355432782, + "grad_norm": 0.16851158440113068, + "learning_rate": 4.419062865412704e-06, + "loss": 1.7084, + "step": 28320 + }, + { + "epoch": 8.692756292203805, + "grad_norm": 0.16056731343269348, + "learning_rate": 4.4170200075230925e-06, + "loss": 1.6771, + "step": 28321 + }, + { + "epoch": 8.69306322897483, + "grad_norm": 0.17098097503185272, + "learning_rate": 4.414977600111192e-06, + "loss": 1.712, + "step": 28322 + }, + { + "epoch": 8.693370165745856, + "grad_norm": 0.17442475259304047, + "learning_rate": 4.412935643197208e-06, + "loss": 1.7725, + "step": 28323 + }, + { + "epoch": 8.693677102516881, + "grad_norm": 0.16090531647205353, + "learning_rate": 4.410894136801308e-06, + "loss": 1.6996, + "step": 28324 + }, + { + "epoch": 8.693984039287907, + "grad_norm": 0.17448033392429352, + "learning_rate": 4.408853080943681e-06, + "loss": 1.6934, + "step": 28325 + }, + { + "epoch": 8.694290976058932, + "grad_norm": 0.15201367437839508, + "learning_rate": 4.406812475644484e-06, + "loss": 1.6671, + "step": 28326 + }, + { + "epoch": 8.694597912829957, + "grad_norm": 0.15211759507656097, + "learning_rate": 4.404772320923889e-06, + "loss": 1.7281, + "step": 28327 + }, + { + "epoch": 8.694904849600983, + "grad_norm": 0.1757364720106125, + "learning_rate": 4.402732616802063e-06, + "loss": 1.7085, + "step": 28328 + }, + { + "epoch": 8.695211786372008, + "grad_norm": 0.17995139956474304, + "learning_rate": 4.400693363299152e-06, + 
"loss": 1.7335, + "step": 28329 + }, + { + "epoch": 8.695518723143033, + "grad_norm": 0.1404990553855896, + "learning_rate": 4.398654560435312e-06, + "loss": 1.7102, + "step": 28330 + }, + { + "epoch": 8.695825659914057, + "grad_norm": 0.17141692340373993, + "learning_rate": 4.396616208230708e-06, + "loss": 1.7195, + "step": 28331 + }, + { + "epoch": 8.696132596685082, + "grad_norm": 0.17162097990512848, + "learning_rate": 4.394578306705471e-06, + "loss": 1.7075, + "step": 28332 + }, + { + "epoch": 8.696439533456108, + "grad_norm": 0.18884550034999847, + "learning_rate": 4.392540855879734e-06, + "loss": 1.72, + "step": 28333 + }, + { + "epoch": 8.696746470227133, + "grad_norm": 0.21365602314472198, + "learning_rate": 4.3905038557736425e-06, + "loss": 1.8024, + "step": 28334 + }, + { + "epoch": 8.697053406998158, + "grad_norm": 0.1939813494682312, + "learning_rate": 4.388467306407318e-06, + "loss": 1.6694, + "step": 28335 + }, + { + "epoch": 8.697360343769184, + "grad_norm": 0.20518864691257477, + "learning_rate": 4.386431207800906e-06, + "loss": 1.7708, + "step": 28336 + }, + { + "epoch": 8.697667280540209, + "grad_norm": 0.16070924699306488, + "learning_rate": 4.3843955599745025e-06, + "loss": 1.7496, + "step": 28337 + }, + { + "epoch": 8.697974217311234, + "grad_norm": 0.17010091245174408, + "learning_rate": 4.3823603629482514e-06, + "loss": 1.6996, + "step": 28338 + }, + { + "epoch": 8.69828115408226, + "grad_norm": 0.14453141391277313, + "learning_rate": 4.380325616742237e-06, + "loss": 1.7032, + "step": 28339 + }, + { + "epoch": 8.698588090853285, + "grad_norm": 0.1959836632013321, + "learning_rate": 4.378291321376593e-06, + "loss": 1.7861, + "step": 28340 + }, + { + "epoch": 8.69889502762431, + "grad_norm": 0.12473960220813751, + "learning_rate": 4.376257476871415e-06, + "loss": 1.6465, + "step": 28341 + }, + { + "epoch": 8.699201964395334, + "grad_norm": 0.17088855803012848, + "learning_rate": 4.374224083246797e-06, + "loss": 1.7701, + "step": 28342 + }, + { 
+ "epoch": 8.699508901166359, + "grad_norm": 0.17513783276081085, + "learning_rate": 4.372191140522846e-06, + "loss": 1.7107, + "step": 28343 + }, + { + "epoch": 8.699815837937384, + "grad_norm": 0.15522748231887817, + "learning_rate": 4.370158648719641e-06, + "loss": 1.6961, + "step": 28344 + }, + { + "epoch": 8.70012277470841, + "grad_norm": 0.1434583216905594, + "learning_rate": 4.36812660785727e-06, + "loss": 1.6927, + "step": 28345 + }, + { + "epoch": 8.700429711479435, + "grad_norm": 0.1571590155363083, + "learning_rate": 4.366095017955824e-06, + "loss": 1.6747, + "step": 28346 + }, + { + "epoch": 8.70073664825046, + "grad_norm": 0.15448859333992004, + "learning_rate": 4.364063879035357e-06, + "loss": 1.7052, + "step": 28347 + }, + { + "epoch": 8.701043585021486, + "grad_norm": 0.18512596189975739, + "learning_rate": 4.362033191115983e-06, + "loss": 1.7516, + "step": 28348 + }, + { + "epoch": 8.701350521792511, + "grad_norm": 0.14646342396736145, + "learning_rate": 4.360002954217734e-06, + "loss": 1.7152, + "step": 28349 + }, + { + "epoch": 8.701657458563536, + "grad_norm": 0.15107101202011108, + "learning_rate": 4.357973168360691e-06, + "loss": 1.6659, + "step": 28350 + }, + { + "epoch": 8.701964395334562, + "grad_norm": 0.1887415051460266, + "learning_rate": 4.355943833564908e-06, + "loss": 1.7506, + "step": 28351 + }, + { + "epoch": 8.702271332105585, + "grad_norm": 0.17195916175842285, + "learning_rate": 4.353914949850424e-06, + "loss": 1.7571, + "step": 28352 + }, + { + "epoch": 8.70257826887661, + "grad_norm": 0.1679403930902481, + "learning_rate": 4.35188651723733e-06, + "loss": 1.7321, + "step": 28353 + }, + { + "epoch": 8.702885205647636, + "grad_norm": 0.1917678713798523, + "learning_rate": 4.349858535745633e-06, + "loss": 1.7387, + "step": 28354 + }, + { + "epoch": 8.703192142418661, + "grad_norm": 0.1321115791797638, + "learning_rate": 4.347831005395408e-06, + "loss": 1.7221, + "step": 28355 + }, + { + "epoch": 8.703499079189687, + "grad_norm": 
0.14510731399059296, + "learning_rate": 4.345803926206654e-06, + "loss": 1.6905, + "step": 28356 + }, + { + "epoch": 8.703806015960712, + "grad_norm": 0.158061221241951, + "learning_rate": 4.343777298199431e-06, + "loss": 1.6605, + "step": 28357 + }, + { + "epoch": 8.704112952731737, + "grad_norm": 0.15366631746292114, + "learning_rate": 4.341751121393767e-06, + "loss": 1.7069, + "step": 28358 + }, + { + "epoch": 8.704419889502763, + "grad_norm": 0.20126941800117493, + "learning_rate": 4.339725395809674e-06, + "loss": 1.7704, + "step": 28359 + }, + { + "epoch": 8.704726826273788, + "grad_norm": 0.14276063442230225, + "learning_rate": 4.337700121467181e-06, + "loss": 1.6704, + "step": 28360 + }, + { + "epoch": 8.705033763044813, + "grad_norm": 0.15362146496772766, + "learning_rate": 4.335675298386293e-06, + "loss": 1.6486, + "step": 28361 + }, + { + "epoch": 8.705340699815839, + "grad_norm": 0.16178739070892334, + "learning_rate": 4.333650926587035e-06, + "loss": 1.703, + "step": 28362 + }, + { + "epoch": 8.705647636586864, + "grad_norm": 0.16188332438468933, + "learning_rate": 4.331627006089395e-06, + "loss": 1.6912, + "step": 28363 + }, + { + "epoch": 8.705954573357888, + "grad_norm": 0.1567341834306717, + "learning_rate": 4.3296035369133846e-06, + "loss": 1.6767, + "step": 28364 + }, + { + "epoch": 8.706261510128913, + "grad_norm": 0.16202545166015625, + "learning_rate": 4.327580519079011e-06, + "loss": 1.6836, + "step": 28365 + }, + { + "epoch": 8.706568446899938, + "grad_norm": 0.17161825299263, + "learning_rate": 4.325557952606252e-06, + "loss": 1.7271, + "step": 28366 + }, + { + "epoch": 8.706875383670964, + "grad_norm": 0.14774417877197266, + "learning_rate": 4.323535837515097e-06, + "loss": 1.6815, + "step": 28367 + }, + { + "epoch": 8.707182320441989, + "grad_norm": 0.19654276967048645, + "learning_rate": 4.321514173825531e-06, + "loss": 1.6633, + "step": 28368 + }, + { + "epoch": 8.707489257213014, + "grad_norm": 0.18064813315868378, + "learning_rate": 
4.319492961557531e-06, + "loss": 1.7222, + "step": 28369 + }, + { + "epoch": 8.70779619398404, + "grad_norm": 0.14830774068832397, + "learning_rate": 4.317472200731087e-06, + "loss": 1.6921, + "step": 28370 + }, + { + "epoch": 8.708103130755065, + "grad_norm": 0.17077864706516266, + "learning_rate": 4.315451891366146e-06, + "loss": 1.6785, + "step": 28371 + }, + { + "epoch": 8.70841006752609, + "grad_norm": 0.1815696656703949, + "learning_rate": 4.313432033482701e-06, + "loss": 1.6865, + "step": 28372 + }, + { + "epoch": 8.708717004297116, + "grad_norm": 0.17936676740646362, + "learning_rate": 4.311412627100686e-06, + "loss": 1.7477, + "step": 28373 + }, + { + "epoch": 8.70902394106814, + "grad_norm": 0.16955824196338654, + "learning_rate": 4.30939367224007e-06, + "loss": 1.6906, + "step": 28374 + }, + { + "epoch": 8.709330877839164, + "grad_norm": 0.14489254355430603, + "learning_rate": 4.307375168920813e-06, + "loss": 1.6777, + "step": 28375 + }, + { + "epoch": 8.70963781461019, + "grad_norm": 0.18070191144943237, + "learning_rate": 4.305357117162856e-06, + "loss": 1.6955, + "step": 28376 + }, + { + "epoch": 8.709944751381215, + "grad_norm": 0.18469898402690887, + "learning_rate": 4.3033395169861375e-06, + "loss": 1.7364, + "step": 28377 + }, + { + "epoch": 8.71025168815224, + "grad_norm": 0.13740944862365723, + "learning_rate": 4.301322368410604e-06, + "loss": 1.6781, + "step": 28378 + }, + { + "epoch": 8.710558624923266, + "grad_norm": 0.16305440664291382, + "learning_rate": 4.299305671456189e-06, + "loss": 1.7277, + "step": 28379 + }, + { + "epoch": 8.710865561694291, + "grad_norm": 0.15460261702537537, + "learning_rate": 4.29728942614282e-06, + "loss": 1.7536, + "step": 28380 + }, + { + "epoch": 8.711172498465316, + "grad_norm": 0.13714177906513214, + "learning_rate": 4.2952736324904205e-06, + "loss": 1.7417, + "step": 28381 + }, + { + "epoch": 8.711479435236342, + "grad_norm": 0.22590506076812744, + "learning_rate": 4.29325829051892e-06, + "loss": 1.6888, + 
"step": 28382 + }, + { + "epoch": 8.711786372007367, + "grad_norm": 0.17581406235694885, + "learning_rate": 4.291243400248229e-06, + "loss": 1.7781, + "step": 28383 + }, + { + "epoch": 8.712093308778392, + "grad_norm": 0.15321393311023712, + "learning_rate": 4.289228961698266e-06, + "loss": 1.6613, + "step": 28384 + }, + { + "epoch": 8.712400245549416, + "grad_norm": 0.1657101809978485, + "learning_rate": 4.287214974888931e-06, + "loss": 1.7152, + "step": 28385 + }, + { + "epoch": 8.712707182320441, + "grad_norm": 0.18134190142154694, + "learning_rate": 4.28520143984013e-06, + "loss": 1.7265, + "step": 28386 + }, + { + "epoch": 8.713014119091467, + "grad_norm": 0.1232382282614708, + "learning_rate": 4.28318835657176e-06, + "loss": 1.6457, + "step": 28387 + }, + { + "epoch": 8.713321055862492, + "grad_norm": 0.1339728981256485, + "learning_rate": 4.281175725103715e-06, + "loss": 1.6516, + "step": 28388 + }, + { + "epoch": 8.713627992633517, + "grad_norm": 0.15603719651699066, + "learning_rate": 4.2791635454559e-06, + "loss": 1.717, + "step": 28389 + }, + { + "epoch": 8.713934929404543, + "grad_norm": 0.17226538062095642, + "learning_rate": 4.277151817648179e-06, + "loss": 1.7088, + "step": 28390 + }, + { + "epoch": 8.714241866175568, + "grad_norm": 0.17237617075443268, + "learning_rate": 4.275140541700445e-06, + "loss": 1.7467, + "step": 28391 + }, + { + "epoch": 8.714548802946593, + "grad_norm": 0.1798042505979538, + "learning_rate": 4.2731297176325734e-06, + "loss": 1.7157, + "step": 28392 + }, + { + "epoch": 8.714855739717619, + "grad_norm": 0.1701999455690384, + "learning_rate": 4.271119345464436e-06, + "loss": 1.7575, + "step": 28393 + }, + { + "epoch": 8.715162676488644, + "grad_norm": 0.13981005549430847, + "learning_rate": 4.2691094252159e-06, + "loss": 1.7315, + "step": 28394 + }, + { + "epoch": 8.715469613259668, + "grad_norm": 0.19189679622650146, + "learning_rate": 4.267099956906828e-06, + "loss": 1.7338, + "step": 28395 + }, + { + "epoch": 
8.715776550030693, + "grad_norm": 0.14194947481155396, + "learning_rate": 4.265090940557076e-06, + "loss": 1.6999, + "step": 28396 + }, + { + "epoch": 8.716083486801718, + "grad_norm": 0.15809695422649384, + "learning_rate": 4.263082376186506e-06, + "loss": 1.6643, + "step": 28397 + }, + { + "epoch": 8.716390423572744, + "grad_norm": 0.12897074222564697, + "learning_rate": 4.261074263814963e-06, + "loss": 1.7096, + "step": 28398 + }, + { + "epoch": 8.716697360343769, + "grad_norm": 0.1517125964164734, + "learning_rate": 4.259066603462292e-06, + "loss": 1.7101, + "step": 28399 + }, + { + "epoch": 8.717004297114794, + "grad_norm": 0.1489602029323578, + "learning_rate": 4.257059395148333e-06, + "loss": 1.7097, + "step": 28400 + }, + { + "epoch": 8.71731123388582, + "grad_norm": 0.15182913839817047, + "learning_rate": 4.255052638892926e-06, + "loss": 1.7161, + "step": 28401 + }, + { + "epoch": 8.717618170656845, + "grad_norm": 0.1973588615655899, + "learning_rate": 4.253046334715899e-06, + "loss": 1.7452, + "step": 28402 + }, + { + "epoch": 8.71792510742787, + "grad_norm": 0.17291557788848877, + "learning_rate": 4.251040482637081e-06, + "loss": 1.7671, + "step": 28403 + }, + { + "epoch": 8.718232044198896, + "grad_norm": 0.1525208055973053, + "learning_rate": 4.249035082676295e-06, + "loss": 1.6891, + "step": 28404 + }, + { + "epoch": 8.718538980969921, + "grad_norm": 0.1681409627199173, + "learning_rate": 4.247030134853352e-06, + "loss": 1.728, + "step": 28405 + }, + { + "epoch": 8.718845917740946, + "grad_norm": 0.18142938613891602, + "learning_rate": 4.245025639188094e-06, + "loss": 1.6952, + "step": 28406 + }, + { + "epoch": 8.71915285451197, + "grad_norm": 0.17891576886177063, + "learning_rate": 4.243021595700286e-06, + "loss": 1.7304, + "step": 28407 + }, + { + "epoch": 8.719459791282995, + "grad_norm": 0.1676199585199356, + "learning_rate": 4.24101800440978e-06, + "loss": 1.6756, + "step": 28408 + }, + { + "epoch": 8.71976672805402, + "grad_norm": 
0.16762350499629974, + "learning_rate": 4.239014865336339e-06, + "loss": 1.6899, + "step": 28409 + }, + { + "epoch": 8.720073664825046, + "grad_norm": 0.14751142263412476, + "learning_rate": 4.2370121784997776e-06, + "loss": 1.677, + "step": 28410 + }, + { + "epoch": 8.720380601596071, + "grad_norm": 0.16818544268608093, + "learning_rate": 4.235009943919887e-06, + "loss": 1.7132, + "step": 28411 + }, + { + "epoch": 8.720687538367097, + "grad_norm": 0.14754259586334229, + "learning_rate": 4.233008161616453e-06, + "loss": 1.6744, + "step": 28412 + }, + { + "epoch": 8.720994475138122, + "grad_norm": 0.1303185522556305, + "learning_rate": 4.231006831609258e-06, + "loss": 1.6783, + "step": 28413 + }, + { + "epoch": 8.721301411909147, + "grad_norm": 0.14147131145000458, + "learning_rate": 4.229005953918075e-06, + "loss": 1.6911, + "step": 28414 + }, + { + "epoch": 8.721608348680173, + "grad_norm": 0.19011028110980988, + "learning_rate": 4.227005528562688e-06, + "loss": 1.7245, + "step": 28415 + }, + { + "epoch": 8.721915285451198, + "grad_norm": 0.1327231526374817, + "learning_rate": 4.225005555562855e-06, + "loss": 1.6676, + "step": 28416 + }, + { + "epoch": 8.722222222222221, + "grad_norm": 0.13436436653137207, + "learning_rate": 4.223006034938354e-06, + "loss": 1.6926, + "step": 28417 + }, + { + "epoch": 8.722529158993247, + "grad_norm": 0.18722930550575256, + "learning_rate": 4.221006966708929e-06, + "loss": 1.7759, + "step": 28418 + }, + { + "epoch": 8.722836095764272, + "grad_norm": 0.18999920785427094, + "learning_rate": 4.219008350894355e-06, + "loss": 1.7385, + "step": 28419 + }, + { + "epoch": 8.723143032535297, + "grad_norm": 0.14250624179840088, + "learning_rate": 4.217010187514364e-06, + "loss": 1.7263, + "step": 28420 + }, + { + "epoch": 8.723449969306323, + "grad_norm": 0.1577407717704773, + "learning_rate": 4.21501247658872e-06, + "loss": 1.8055, + "step": 28421 + }, + { + "epoch": 8.723756906077348, + "grad_norm": 0.120110422372818, + "learning_rate": 
4.213015218137145e-06, + "loss": 1.6519, + "step": 28422 + }, + { + "epoch": 8.724063842848373, + "grad_norm": 0.17998605966567993, + "learning_rate": 4.211018412179407e-06, + "loss": 1.6827, + "step": 28423 + }, + { + "epoch": 8.724370779619399, + "grad_norm": 0.14941653609275818, + "learning_rate": 4.209022058735213e-06, + "loss": 1.7089, + "step": 28424 + }, + { + "epoch": 8.724677716390424, + "grad_norm": 0.13641475141048431, + "learning_rate": 4.207026157824312e-06, + "loss": 1.6825, + "step": 28425 + }, + { + "epoch": 8.72498465316145, + "grad_norm": 0.1666809320449829, + "learning_rate": 4.205030709466401e-06, + "loss": 1.6958, + "step": 28426 + }, + { + "epoch": 8.725291589932475, + "grad_norm": 0.1236952468752861, + "learning_rate": 4.20303571368123e-06, + "loss": 1.6417, + "step": 28427 + }, + { + "epoch": 8.725598526703498, + "grad_norm": 0.1483321338891983, + "learning_rate": 4.201041170488501e-06, + "loss": 1.7082, + "step": 28428 + }, + { + "epoch": 8.725905463474524, + "grad_norm": 0.17827022075653076, + "learning_rate": 4.1990470799079255e-06, + "loss": 1.6506, + "step": 28429 + }, + { + "epoch": 8.726212400245549, + "grad_norm": 0.17171478271484375, + "learning_rate": 4.197053441959215e-06, + "loss": 1.7403, + "step": 28430 + }, + { + "epoch": 8.726519337016574, + "grad_norm": 0.18554572761058807, + "learning_rate": 4.195060256662064e-06, + "loss": 1.6899, + "step": 28431 + }, + { + "epoch": 8.7268262737876, + "grad_norm": 0.30604809522628784, + "learning_rate": 4.193067524036176e-06, + "loss": 1.7656, + "step": 28432 + }, + { + "epoch": 8.727133210558625, + "grad_norm": 0.1759488433599472, + "learning_rate": 4.191075244101245e-06, + "loss": 1.7167, + "step": 28433 + }, + { + "epoch": 8.72744014732965, + "grad_norm": 0.15285685658454895, + "learning_rate": 4.18908341687696e-06, + "loss": 1.6576, + "step": 28434 + }, + { + "epoch": 8.727747084100676, + "grad_norm": 0.17283809185028076, + "learning_rate": 4.187092042382995e-06, + "loss": 1.719, + 
"step": 28435 + }, + { + "epoch": 8.728054020871701, + "grad_norm": 0.1511228382587433, + "learning_rate": 4.1851011206390455e-06, + "loss": 1.6499, + "step": 28436 + }, + { + "epoch": 8.728360957642726, + "grad_norm": 0.13646523654460907, + "learning_rate": 4.183110651664779e-06, + "loss": 1.703, + "step": 28437 + }, + { + "epoch": 8.72866789441375, + "grad_norm": 0.16112352907657623, + "learning_rate": 4.181120635479863e-06, + "loss": 1.6963, + "step": 28438 + }, + { + "epoch": 8.728974831184775, + "grad_norm": 0.23064331710338593, + "learning_rate": 4.179131072103964e-06, + "loss": 1.7347, + "step": 28439 + }, + { + "epoch": 8.7292817679558, + "grad_norm": 0.17859068512916565, + "learning_rate": 4.177141961556763e-06, + "loss": 1.7963, + "step": 28440 + }, + { + "epoch": 8.729588704726826, + "grad_norm": 0.16455049812793732, + "learning_rate": 4.175153303857887e-06, + "loss": 1.6893, + "step": 28441 + }, + { + "epoch": 8.729895641497851, + "grad_norm": 0.1353607475757599, + "learning_rate": 4.173165099027021e-06, + "loss": 1.7165, + "step": 28442 + }, + { + "epoch": 8.730202578268877, + "grad_norm": 0.20421212911605835, + "learning_rate": 4.171177347083783e-06, + "loss": 1.7256, + "step": 28443 + }, + { + "epoch": 8.730509515039902, + "grad_norm": 0.17925186455249786, + "learning_rate": 4.169190048047833e-06, + "loss": 1.6819, + "step": 28444 + }, + { + "epoch": 8.730816451810927, + "grad_norm": 0.17959848046302795, + "learning_rate": 4.167203201938819e-06, + "loss": 1.7275, + "step": 28445 + }, + { + "epoch": 8.731123388581953, + "grad_norm": 0.13794639706611633, + "learning_rate": 4.165216808776357e-06, + "loss": 1.6694, + "step": 28446 + }, + { + "epoch": 8.731430325352978, + "grad_norm": 0.15895675122737885, + "learning_rate": 4.163230868580092e-06, + "loss": 1.7159, + "step": 28447 + }, + { + "epoch": 8.731737262124003, + "grad_norm": 0.16645625233650208, + "learning_rate": 4.161245381369644e-06, + "loss": 1.7068, + "step": 28448 + }, + { + "epoch": 
8.732044198895027, + "grad_norm": 0.17593564093112946, + "learning_rate": 4.15926034716464e-06, + "loss": 1.7013, + "step": 28449 + }, + { + "epoch": 8.732351135666052, + "grad_norm": 0.1613699495792389, + "learning_rate": 4.157275765984692e-06, + "loss": 1.6925, + "step": 28450 + }, + { + "epoch": 8.732658072437077, + "grad_norm": 0.21205542981624603, + "learning_rate": 4.155291637849412e-06, + "loss": 1.8401, + "step": 28451 + }, + { + "epoch": 8.732965009208103, + "grad_norm": 0.16209860146045685, + "learning_rate": 4.153307962778408e-06, + "loss": 1.7068, + "step": 28452 + }, + { + "epoch": 8.733271945979128, + "grad_norm": 0.17571625113487244, + "learning_rate": 4.1513247407912905e-06, + "loss": 1.7245, + "step": 28453 + }, + { + "epoch": 8.733578882750153, + "grad_norm": 0.12565423548221588, + "learning_rate": 4.149341971907655e-06, + "loss": 1.6714, + "step": 28454 + }, + { + "epoch": 8.733885819521179, + "grad_norm": 0.14843232929706573, + "learning_rate": 4.147359656147093e-06, + "loss": 1.6685, + "step": 28455 + }, + { + "epoch": 8.734192756292204, + "grad_norm": 0.1699068695306778, + "learning_rate": 4.145377793529193e-06, + "loss": 1.6808, + "step": 28456 + }, + { + "epoch": 8.73449969306323, + "grad_norm": 0.18543531000614166, + "learning_rate": 4.143396384073556e-06, + "loss": 1.7721, + "step": 28457 + }, + { + "epoch": 8.734806629834255, + "grad_norm": 0.15792638063430786, + "learning_rate": 4.141415427799744e-06, + "loss": 1.6804, + "step": 28458 + }, + { + "epoch": 8.735113566605278, + "grad_norm": 0.19353818893432617, + "learning_rate": 4.139434924727359e-06, + "loss": 1.7062, + "step": 28459 + }, + { + "epoch": 8.735420503376304, + "grad_norm": 0.14087705314159393, + "learning_rate": 4.137454874875935e-06, + "loss": 1.6287, + "step": 28460 + }, + { + "epoch": 8.735727440147329, + "grad_norm": 0.14594002068042755, + "learning_rate": 4.135475278265077e-06, + "loss": 1.6741, + "step": 28461 + }, + { + "epoch": 8.736034376918354, + "grad_norm": 
0.13943135738372803, + "learning_rate": 4.133496134914333e-06, + "loss": 1.7261, + "step": 28462 + }, + { + "epoch": 8.73634131368938, + "grad_norm": 0.20119191706180573, + "learning_rate": 4.131517444843264e-06, + "loss": 1.7719, + "step": 28463 + }, + { + "epoch": 8.736648250460405, + "grad_norm": 0.15612776577472687, + "learning_rate": 4.12953920807142e-06, + "loss": 1.6694, + "step": 28464 + }, + { + "epoch": 8.73695518723143, + "grad_norm": 0.15517298877239227, + "learning_rate": 4.127561424618359e-06, + "loss": 1.7225, + "step": 28465 + }, + { + "epoch": 8.737262124002456, + "grad_norm": 0.18650169670581818, + "learning_rate": 4.125584094503626e-06, + "loss": 1.7589, + "step": 28466 + }, + { + "epoch": 8.737569060773481, + "grad_norm": 0.19337934255599976, + "learning_rate": 4.123607217746755e-06, + "loss": 1.6754, + "step": 28467 + }, + { + "epoch": 8.737875997544506, + "grad_norm": 0.15818046033382416, + "learning_rate": 4.121630794367287e-06, + "loss": 1.7176, + "step": 28468 + }, + { + "epoch": 8.738182934315532, + "grad_norm": 0.14257800579071045, + "learning_rate": 4.11965482438475e-06, + "loss": 1.6961, + "step": 28469 + }, + { + "epoch": 8.738489871086557, + "grad_norm": 0.15100477635860443, + "learning_rate": 4.1176793078186785e-06, + "loss": 1.7161, + "step": 28470 + }, + { + "epoch": 8.73879680785758, + "grad_norm": 0.14171260595321655, + "learning_rate": 4.115704244688595e-06, + "loss": 1.6812, + "step": 28471 + }, + { + "epoch": 8.739103744628606, + "grad_norm": 0.13742563128471375, + "learning_rate": 4.1137296350140134e-06, + "loss": 1.6968, + "step": 28472 + }, + { + "epoch": 8.739410681399631, + "grad_norm": 0.131202831864357, + "learning_rate": 4.111755478814439e-06, + "loss": 1.6859, + "step": 28473 + }, + { + "epoch": 8.739717618170657, + "grad_norm": 0.14671406149864197, + "learning_rate": 4.109781776109411e-06, + "loss": 1.7227, + "step": 28474 + }, + { + "epoch": 8.740024554941682, + "grad_norm": 0.17391672730445862, + "learning_rate": 
4.107808526918405e-06, + "loss": 1.6926, + "step": 28475 + }, + { + "epoch": 8.740331491712707, + "grad_norm": 0.16088297963142395, + "learning_rate": 4.105835731260943e-06, + "loss": 1.7296, + "step": 28476 + }, + { + "epoch": 8.740638428483733, + "grad_norm": 0.15273302793502808, + "learning_rate": 4.1038633891564985e-06, + "loss": 1.6888, + "step": 28477 + }, + { + "epoch": 8.740945365254758, + "grad_norm": 0.16602970659732819, + "learning_rate": 4.101891500624588e-06, + "loss": 1.6924, + "step": 28478 + }, + { + "epoch": 8.741252302025783, + "grad_norm": 0.13952100276947021, + "learning_rate": 4.099920065684681e-06, + "loss": 1.6972, + "step": 28479 + }, + { + "epoch": 8.741559238796809, + "grad_norm": 0.18140468001365662, + "learning_rate": 4.097949084356273e-06, + "loss": 1.7417, + "step": 28480 + }, + { + "epoch": 8.741866175567832, + "grad_norm": 0.19571609795093536, + "learning_rate": 4.095978556658831e-06, + "loss": 1.7261, + "step": 28481 + }, + { + "epoch": 8.742173112338858, + "grad_norm": 0.1748526245355606, + "learning_rate": 4.094008482611838e-06, + "loss": 1.7975, + "step": 28482 + }, + { + "epoch": 8.742480049109883, + "grad_norm": 0.1984734982252121, + "learning_rate": 4.092038862234759e-06, + "loss": 1.7941, + "step": 28483 + }, + { + "epoch": 8.742786985880908, + "grad_norm": 0.1336900144815445, + "learning_rate": 4.090069695547055e-06, + "loss": 1.6612, + "step": 28484 + }, + { + "epoch": 8.743093922651934, + "grad_norm": 0.1755249798297882, + "learning_rate": 4.088100982568193e-06, + "loss": 1.679, + "step": 28485 + }, + { + "epoch": 8.743400859422959, + "grad_norm": 0.17111645638942719, + "learning_rate": 4.086132723317631e-06, + "loss": 1.739, + "step": 28486 + }, + { + "epoch": 8.743707796193984, + "grad_norm": 0.18933364748954773, + "learning_rate": 4.084164917814815e-06, + "loss": 1.7469, + "step": 28487 + }, + { + "epoch": 8.74401473296501, + "grad_norm": 0.15212221443653107, + "learning_rate": 4.082197566079188e-06, + "loss": 1.7137, + 
"step": 28488 + }, + { + "epoch": 8.744321669736035, + "grad_norm": 0.1428573727607727, + "learning_rate": 4.080230668130203e-06, + "loss": 1.67, + "step": 28489 + }, + { + "epoch": 8.74462860650706, + "grad_norm": 0.1688205450773239, + "learning_rate": 4.078264223987283e-06, + "loss": 1.7149, + "step": 28490 + }, + { + "epoch": 8.744935543278086, + "grad_norm": 0.23390214145183563, + "learning_rate": 4.07629823366989e-06, + "loss": 1.7647, + "step": 28491 + }, + { + "epoch": 8.745242480049109, + "grad_norm": 0.163333460688591, + "learning_rate": 4.074332697197419e-06, + "loss": 1.7047, + "step": 28492 + }, + { + "epoch": 8.745549416820134, + "grad_norm": 0.14970998466014862, + "learning_rate": 4.072367614589323e-06, + "loss": 1.6921, + "step": 28493 + }, + { + "epoch": 8.74585635359116, + "grad_norm": 0.18369705975055695, + "learning_rate": 4.070402985864996e-06, + "loss": 1.7266, + "step": 28494 + }, + { + "epoch": 8.746163290362185, + "grad_norm": 0.17579036951065063, + "learning_rate": 4.068438811043873e-06, + "loss": 1.742, + "step": 28495 + }, + { + "epoch": 8.74647022713321, + "grad_norm": 0.1286322921514511, + "learning_rate": 4.066475090145355e-06, + "loss": 1.6656, + "step": 28496 + }, + { + "epoch": 8.746777163904236, + "grad_norm": 0.1595929116010666, + "learning_rate": 4.06451182318886e-06, + "loss": 1.7079, + "step": 28497 + }, + { + "epoch": 8.747084100675261, + "grad_norm": 0.14556388556957245, + "learning_rate": 4.062549010193778e-06, + "loss": 1.6948, + "step": 28498 + }, + { + "epoch": 8.747391037446286, + "grad_norm": 0.19447384774684906, + "learning_rate": 4.060586651179516e-06, + "loss": 1.7648, + "step": 28499 + }, + { + "epoch": 8.747697974217312, + "grad_norm": 0.147284135222435, + "learning_rate": 4.058624746165457e-06, + "loss": 1.713, + "step": 28500 + }, + { + "epoch": 8.748004910988337, + "grad_norm": 0.17068512737751007, + "learning_rate": 4.056663295170998e-06, + "loss": 1.708, + "step": 28501 + }, + { + "epoch": 8.74831184775936, + 
"grad_norm": 0.15625207126140594, + "learning_rate": 4.054702298215523e-06, + "loss": 1.7152, + "step": 28502 + }, + { + "epoch": 8.748618784530386, + "grad_norm": 0.14633874595165253, + "learning_rate": 4.052741755318407e-06, + "loss": 1.7221, + "step": 28503 + }, + { + "epoch": 8.748925721301411, + "grad_norm": 0.15166686475276947, + "learning_rate": 4.0507816664990265e-06, + "loss": 1.7179, + "step": 28504 + }, + { + "epoch": 8.749232658072437, + "grad_norm": 0.12509481608867645, + "learning_rate": 4.0488220317767555e-06, + "loss": 1.6743, + "step": 28505 + }, + { + "epoch": 8.749539594843462, + "grad_norm": 0.20686158537864685, + "learning_rate": 4.046862851170957e-06, + "loss": 1.6925, + "step": 28506 + }, + { + "epoch": 8.749846531614487, + "grad_norm": 0.12619495391845703, + "learning_rate": 4.044904124700983e-06, + "loss": 1.6932, + "step": 28507 + }, + { + "epoch": 8.750153468385513, + "grad_norm": 0.1770995706319809, + "learning_rate": 4.0429458523862205e-06, + "loss": 1.7948, + "step": 28508 + }, + { + "epoch": 8.750460405156538, + "grad_norm": 0.22418050467967987, + "learning_rate": 4.040988034245991e-06, + "loss": 1.7008, + "step": 28509 + }, + { + "epoch": 8.750767341927563, + "grad_norm": 0.14798377454280853, + "learning_rate": 4.039030670299665e-06, + "loss": 1.6673, + "step": 28510 + }, + { + "epoch": 8.751074278698589, + "grad_norm": 0.182883620262146, + "learning_rate": 4.037073760566562e-06, + "loss": 1.7223, + "step": 28511 + }, + { + "epoch": 8.751381215469614, + "grad_norm": 0.14968620240688324, + "learning_rate": 4.035117305066044e-06, + "loss": 1.6656, + "step": 28512 + }, + { + "epoch": 8.75168815224064, + "grad_norm": 0.19700272381305695, + "learning_rate": 4.03316130381744e-06, + "loss": 1.7207, + "step": 28513 + }, + { + "epoch": 8.751995089011663, + "grad_norm": 0.17926210165023804, + "learning_rate": 4.031205756840073e-06, + "loss": 1.7131, + "step": 28514 + }, + { + "epoch": 8.752302025782688, + "grad_norm": 0.1471911519765854, + 
"learning_rate": 4.029250664153278e-06, + "loss": 1.6731, + "step": 28515 + }, + { + "epoch": 8.752608962553714, + "grad_norm": 0.18923047184944153, + "learning_rate": 4.0272960257763725e-06, + "loss": 1.7795, + "step": 28516 + }, + { + "epoch": 8.752915899324739, + "grad_norm": 0.14930424094200134, + "learning_rate": 4.025341841728675e-06, + "loss": 1.7201, + "step": 28517 + }, + { + "epoch": 8.753222836095764, + "grad_norm": 0.17335213720798492, + "learning_rate": 4.0233881120294915e-06, + "loss": 1.7297, + "step": 28518 + }, + { + "epoch": 8.75352977286679, + "grad_norm": 0.14489638805389404, + "learning_rate": 4.021434836698135e-06, + "loss": 1.7314, + "step": 28519 + }, + { + "epoch": 8.753836709637815, + "grad_norm": 0.16861389577388763, + "learning_rate": 4.019482015753912e-06, + "loss": 1.7362, + "step": 28520 + }, + { + "epoch": 8.75414364640884, + "grad_norm": 0.1467277705669403, + "learning_rate": 4.0175296492161115e-06, + "loss": 1.6607, + "step": 28521 + }, + { + "epoch": 8.754450583179866, + "grad_norm": 0.1556902825832367, + "learning_rate": 4.015577737104037e-06, + "loss": 1.747, + "step": 28522 + }, + { + "epoch": 8.754757519950891, + "grad_norm": 0.13337039947509766, + "learning_rate": 4.013626279436977e-06, + "loss": 1.7271, + "step": 28523 + }, + { + "epoch": 8.755064456721914, + "grad_norm": 0.1599043607711792, + "learning_rate": 4.011675276234206e-06, + "loss": 1.6859, + "step": 28524 + }, + { + "epoch": 8.75537139349294, + "grad_norm": 0.11567290872335434, + "learning_rate": 4.009724727515035e-06, + "loss": 1.6577, + "step": 28525 + }, + { + "epoch": 8.755678330263965, + "grad_norm": 0.16317762434482574, + "learning_rate": 4.0077746332987e-06, + "loss": 1.7041, + "step": 28526 + }, + { + "epoch": 8.75598526703499, + "grad_norm": 0.13116325438022614, + "learning_rate": 4.005824993604506e-06, + "loss": 1.6847, + "step": 28527 + }, + { + "epoch": 8.756292203806016, + "grad_norm": 0.14927831292152405, + "learning_rate": 4.003875808451696e-06, + 
"loss": 1.6312, + "step": 28528 + }, + { + "epoch": 8.756599140577041, + "grad_norm": 0.15273495018482208, + "learning_rate": 4.001927077859552e-06, + "loss": 1.7027, + "step": 28529 + }, + { + "epoch": 8.756906077348066, + "grad_norm": 0.17557594180107117, + "learning_rate": 3.999978801847326e-06, + "loss": 1.7294, + "step": 28530 + }, + { + "epoch": 8.757213014119092, + "grad_norm": 0.16061940789222717, + "learning_rate": 3.998030980434269e-06, + "loss": 1.7179, + "step": 28531 + }, + { + "epoch": 8.757519950890117, + "grad_norm": 0.1431310772895813, + "learning_rate": 3.996083613639634e-06, + "loss": 1.6811, + "step": 28532 + }, + { + "epoch": 8.757826887661142, + "grad_norm": 0.16931994259357452, + "learning_rate": 3.994136701482659e-06, + "loss": 1.7246, + "step": 28533 + }, + { + "epoch": 8.758133824432168, + "grad_norm": 0.13671527802944183, + "learning_rate": 3.992190243982596e-06, + "loss": 1.6877, + "step": 28534 + }, + { + "epoch": 8.758440761203191, + "grad_norm": 0.11943815648555756, + "learning_rate": 3.990244241158675e-06, + "loss": 1.6476, + "step": 28535 + }, + { + "epoch": 8.758747697974217, + "grad_norm": 0.17011673748493195, + "learning_rate": 3.988298693030124e-06, + "loss": 1.7105, + "step": 28536 + }, + { + "epoch": 8.759054634745242, + "grad_norm": 0.1379362791776657, + "learning_rate": 3.986353599616177e-06, + "loss": 1.6691, + "step": 28537 + }, + { + "epoch": 8.759361571516267, + "grad_norm": 0.13264621794223785, + "learning_rate": 3.984408960936048e-06, + "loss": 1.6766, + "step": 28538 + }, + { + "epoch": 8.759668508287293, + "grad_norm": 0.16023825109004974, + "learning_rate": 3.982464777008965e-06, + "loss": 1.6906, + "step": 28539 + }, + { + "epoch": 8.759975445058318, + "grad_norm": 0.1602984219789505, + "learning_rate": 3.980521047854135e-06, + "loss": 1.7094, + "step": 28540 + }, + { + "epoch": 8.760282381829343, + "grad_norm": 0.15421636402606964, + "learning_rate": 3.978577773490772e-06, + "loss": 1.7467, + "step": 28541 + }, + 
{ + "epoch": 8.760589318600369, + "grad_norm": 0.1427018642425537, + "learning_rate": 3.976634953938074e-06, + "loss": 1.7093, + "step": 28542 + }, + { + "epoch": 8.760896255371394, + "grad_norm": 0.143124058842659, + "learning_rate": 3.97469258921524e-06, + "loss": 1.6795, + "step": 28543 + }, + { + "epoch": 8.76120319214242, + "grad_norm": 0.14654754102230072, + "learning_rate": 3.97275067934148e-06, + "loss": 1.7246, + "step": 28544 + }, + { + "epoch": 8.761510128913443, + "grad_norm": 0.17374441027641296, + "learning_rate": 3.970809224335964e-06, + "loss": 1.6828, + "step": 28545 + }, + { + "epoch": 8.761817065684468, + "grad_norm": 0.1596260517835617, + "learning_rate": 3.968868224217898e-06, + "loss": 1.7816, + "step": 28546 + }, + { + "epoch": 8.762124002455494, + "grad_norm": 0.1467326581478119, + "learning_rate": 3.966927679006455e-06, + "loss": 1.6933, + "step": 28547 + }, + { + "epoch": 8.762430939226519, + "grad_norm": 0.12959735095500946, + "learning_rate": 3.9649875887208085e-06, + "loss": 1.6839, + "step": 28548 + }, + { + "epoch": 8.762737875997544, + "grad_norm": 0.13395267724990845, + "learning_rate": 3.963047953380145e-06, + "loss": 1.6968, + "step": 28549 + }, + { + "epoch": 8.76304481276857, + "grad_norm": 0.1369883418083191, + "learning_rate": 3.961108773003619e-06, + "loss": 1.6849, + "step": 28550 + }, + { + "epoch": 8.763351749539595, + "grad_norm": 0.19795149564743042, + "learning_rate": 3.959170047610405e-06, + "loss": 1.7593, + "step": 28551 + }, + { + "epoch": 8.76365868631062, + "grad_norm": 0.14946505427360535, + "learning_rate": 3.9572317772196555e-06, + "loss": 1.7309, + "step": 28552 + }, + { + "epoch": 8.763965623081646, + "grad_norm": 0.14034941792488098, + "learning_rate": 3.955293961850526e-06, + "loss": 1.6906, + "step": 28553 + }, + { + "epoch": 8.764272559852671, + "grad_norm": 0.1528625339269638, + "learning_rate": 3.9533566015221735e-06, + "loss": 1.7318, + "step": 28554 + }, + { + "epoch": 8.764579496623696, + 
"grad_norm": 0.15130504965782166, + "learning_rate": 3.951419696253733e-06, + "loss": 1.7147, + "step": 28555 + }, + { + "epoch": 8.764886433394722, + "grad_norm": 0.12917234003543854, + "learning_rate": 3.949483246064361e-06, + "loss": 1.687, + "step": 28556 + }, + { + "epoch": 8.765193370165745, + "grad_norm": 0.1918531060218811, + "learning_rate": 3.947547250973182e-06, + "loss": 1.7411, + "step": 28557 + }, + { + "epoch": 8.76550030693677, + "grad_norm": 0.16794945299625397, + "learning_rate": 3.9456117109993366e-06, + "loss": 1.762, + "step": 28558 + }, + { + "epoch": 8.765807243707796, + "grad_norm": 0.18833400309085846, + "learning_rate": 3.9436766261619465e-06, + "loss": 1.7641, + "step": 28559 + }, + { + "epoch": 8.766114180478821, + "grad_norm": 0.1939263939857483, + "learning_rate": 3.941741996480131e-06, + "loss": 1.7633, + "step": 28560 + }, + { + "epoch": 8.766421117249847, + "grad_norm": 0.15766844153404236, + "learning_rate": 3.939807821973029e-06, + "loss": 1.6989, + "step": 28561 + }, + { + "epoch": 8.766728054020872, + "grad_norm": 0.14704185724258423, + "learning_rate": 3.937874102659733e-06, + "loss": 1.7006, + "step": 28562 + }, + { + "epoch": 8.767034990791897, + "grad_norm": 0.1752765029668808, + "learning_rate": 3.935940838559376e-06, + "loss": 1.6738, + "step": 28563 + }, + { + "epoch": 8.767341927562923, + "grad_norm": 0.1801508069038391, + "learning_rate": 3.934008029691033e-06, + "loss": 1.7578, + "step": 28564 + }, + { + "epoch": 8.767648864333948, + "grad_norm": 0.17966793477535248, + "learning_rate": 3.932075676073838e-06, + "loss": 1.7347, + "step": 28565 + }, + { + "epoch": 8.767955801104973, + "grad_norm": 0.1435980200767517, + "learning_rate": 3.930143777726863e-06, + "loss": 1.6907, + "step": 28566 + }, + { + "epoch": 8.768262737875997, + "grad_norm": 0.1439833641052246, + "learning_rate": 3.928212334669218e-06, + "loss": 1.6804, + "step": 28567 + }, + { + "epoch": 8.768569674647022, + "grad_norm": 0.18037080764770508, + 
"learning_rate": 3.92628134691998e-06, + "loss": 1.7287, + "step": 28568 + }, + { + "epoch": 8.768876611418047, + "grad_norm": 0.1484454721212387, + "learning_rate": 3.924350814498229e-06, + "loss": 1.7128, + "step": 28569 + }, + { + "epoch": 8.769183548189073, + "grad_norm": 0.1302090734243393, + "learning_rate": 3.922420737423055e-06, + "loss": 1.647, + "step": 28570 + }, + { + "epoch": 8.769490484960098, + "grad_norm": 0.16756890714168549, + "learning_rate": 3.920491115713526e-06, + "loss": 1.7613, + "step": 28571 + }, + { + "epoch": 8.769797421731123, + "grad_norm": 0.17668041586875916, + "learning_rate": 3.918561949388705e-06, + "loss": 1.6957, + "step": 28572 + }, + { + "epoch": 8.770104358502149, + "grad_norm": 0.14288358390331268, + "learning_rate": 3.916633238467671e-06, + "loss": 1.6879, + "step": 28573 + }, + { + "epoch": 8.770411295273174, + "grad_norm": 0.16978147625923157, + "learning_rate": 3.9147049829694746e-06, + "loss": 1.7456, + "step": 28574 + }, + { + "epoch": 8.7707182320442, + "grad_norm": 0.13802385330200195, + "learning_rate": 3.91277718291318e-06, + "loss": 1.6799, + "step": 28575 + }, + { + "epoch": 8.771025168815225, + "grad_norm": 0.16819354891777039, + "learning_rate": 3.910849838317826e-06, + "loss": 1.7277, + "step": 28576 + }, + { + "epoch": 8.77133210558625, + "grad_norm": 0.16395528614521027, + "learning_rate": 3.908922949202465e-06, + "loss": 1.6976, + "step": 28577 + }, + { + "epoch": 8.771639042357274, + "grad_norm": 0.14518797397613525, + "learning_rate": 3.906996515586159e-06, + "loss": 1.6962, + "step": 28578 + }, + { + "epoch": 8.771945979128299, + "grad_norm": 0.17786560952663422, + "learning_rate": 3.905070537487909e-06, + "loss": 1.6593, + "step": 28579 + }, + { + "epoch": 8.772252915899324, + "grad_norm": 0.1793101727962494, + "learning_rate": 3.9031450149267845e-06, + "loss": 1.7699, + "step": 28580 + }, + { + "epoch": 8.77255985267035, + "grad_norm": 0.2498319298028946, + "learning_rate": 3.901219947921786e-06, + 
"loss": 1.745, + "step": 28581 + }, + { + "epoch": 8.772866789441375, + "grad_norm": 0.14886927604675293, + "learning_rate": 3.899295336491959e-06, + "loss": 1.6886, + "step": 28582 + }, + { + "epoch": 8.7731737262124, + "grad_norm": 0.1918812394142151, + "learning_rate": 3.897371180656317e-06, + "loss": 1.7717, + "step": 28583 + }, + { + "epoch": 8.773480662983426, + "grad_norm": 0.15470977127552032, + "learning_rate": 3.895447480433873e-06, + "loss": 1.6747, + "step": 28584 + }, + { + "epoch": 8.773787599754451, + "grad_norm": 0.15075071156024933, + "learning_rate": 3.893524235843648e-06, + "loss": 1.6753, + "step": 28585 + }, + { + "epoch": 8.774094536525476, + "grad_norm": 0.14186562597751617, + "learning_rate": 3.891601446904625e-06, + "loss": 1.6535, + "step": 28586 + }, + { + "epoch": 8.774401473296502, + "grad_norm": 0.16147254407405853, + "learning_rate": 3.8896791136358305e-06, + "loss": 1.6939, + "step": 28587 + }, + { + "epoch": 8.774708410067525, + "grad_norm": 0.1621028035879135, + "learning_rate": 3.8877572360562554e-06, + "loss": 1.7311, + "step": 28588 + }, + { + "epoch": 8.77501534683855, + "grad_norm": 0.1451268047094345, + "learning_rate": 3.885835814184885e-06, + "loss": 1.7029, + "step": 28589 + }, + { + "epoch": 8.775322283609576, + "grad_norm": 0.1404246985912323, + "learning_rate": 3.883914848040715e-06, + "loss": 1.7338, + "step": 28590 + }, + { + "epoch": 8.775629220380601, + "grad_norm": 0.15817701816558838, + "learning_rate": 3.881994337642731e-06, + "loss": 1.6944, + "step": 28591 + }, + { + "epoch": 8.775936157151627, + "grad_norm": 0.15462549030780792, + "learning_rate": 3.880074283009905e-06, + "loss": 1.7406, + "step": 28592 + }, + { + "epoch": 8.776243093922652, + "grad_norm": 0.1545121818780899, + "learning_rate": 3.878154684161217e-06, + "loss": 1.7009, + "step": 28593 + }, + { + "epoch": 8.776550030693677, + "grad_norm": 0.13072805106639862, + "learning_rate": 3.8762355411156305e-06, + "loss": 1.6798, + "step": 28594 + }, + { + 
"epoch": 8.776856967464703, + "grad_norm": 0.16369932889938354, + "learning_rate": 3.8743168538921344e-06, + "loss": 1.7046, + "step": 28595 + }, + { + "epoch": 8.777163904235728, + "grad_norm": 0.151187926530838, + "learning_rate": 3.8723986225096596e-06, + "loss": 1.7383, + "step": 28596 + }, + { + "epoch": 8.777470841006753, + "grad_norm": 0.16651193797588348, + "learning_rate": 3.8704808469871955e-06, + "loss": 1.7178, + "step": 28597 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 0.1387864351272583, + "learning_rate": 3.868563527343655e-06, + "loss": 1.6644, + "step": 28598 + }, + { + "epoch": 8.778084714548802, + "grad_norm": 0.14454610645771027, + "learning_rate": 3.866646663598022e-06, + "loss": 1.6699, + "step": 28599 + }, + { + "epoch": 8.778391651319827, + "grad_norm": 0.1706279069185257, + "learning_rate": 3.864730255769223e-06, + "loss": 1.7251, + "step": 28600 + }, + { + "epoch": 8.778698588090853, + "grad_norm": 0.14636628329753876, + "learning_rate": 3.8628143038762e-06, + "loss": 1.6774, + "step": 28601 + }, + { + "epoch": 8.779005524861878, + "grad_norm": 0.17533506453037262, + "learning_rate": 3.860898807937902e-06, + "loss": 1.7587, + "step": 28602 + }, + { + "epoch": 8.779312461632903, + "grad_norm": 0.2628023326396942, + "learning_rate": 3.858983767973223e-06, + "loss": 1.7571, + "step": 28603 + }, + { + "epoch": 8.779619398403929, + "grad_norm": 0.1412924826145172, + "learning_rate": 3.857069184001116e-06, + "loss": 1.699, + "step": 28604 + }, + { + "epoch": 8.779926335174954, + "grad_norm": 0.16076254844665527, + "learning_rate": 3.855155056040505e-06, + "loss": 1.7327, + "step": 28605 + }, + { + "epoch": 8.78023327194598, + "grad_norm": 0.1440654993057251, + "learning_rate": 3.85324138411029e-06, + "loss": 1.6941, + "step": 28606 + }, + { + "epoch": 8.780540208717005, + "grad_norm": 0.1956651359796524, + "learning_rate": 3.8513281682293956e-06, + "loss": 1.728, + "step": 28607 + }, + { + "epoch": 8.78084714548803, + "grad_norm": 
0.14176496863365173, + "learning_rate": 3.849415408416723e-06, + "loss": 1.7139, + "step": 28608 + }, + { + "epoch": 8.781154082259054, + "grad_norm": 0.18848197162151337, + "learning_rate": 3.84750310469118e-06, + "loss": 1.7092, + "step": 28609 + }, + { + "epoch": 8.781461019030079, + "grad_norm": 0.1622554361820221, + "learning_rate": 3.8455912570716565e-06, + "loss": 1.7137, + "step": 28610 + }, + { + "epoch": 8.781767955801104, + "grad_norm": 0.14255301654338837, + "learning_rate": 3.843679865577049e-06, + "loss": 1.6759, + "step": 28611 + }, + { + "epoch": 8.78207489257213, + "grad_norm": 0.15052112936973572, + "learning_rate": 3.841768930226264e-06, + "loss": 1.6749, + "step": 28612 + }, + { + "epoch": 8.782381829343155, + "grad_norm": 0.19591687619686127, + "learning_rate": 3.8398584510381584e-06, + "loss": 1.7263, + "step": 28613 + }, + { + "epoch": 8.78268876611418, + "grad_norm": 0.1651594340801239, + "learning_rate": 3.83794842803164e-06, + "loss": 1.763, + "step": 28614 + }, + { + "epoch": 8.782995702885206, + "grad_norm": 0.15854987502098083, + "learning_rate": 3.83603886122556e-06, + "loss": 1.7128, + "step": 28615 + }, + { + "epoch": 8.783302639656231, + "grad_norm": 0.14012815058231354, + "learning_rate": 3.834129750638804e-06, + "loss": 1.6711, + "step": 28616 + }, + { + "epoch": 8.783609576427256, + "grad_norm": 0.19335302710533142, + "learning_rate": 3.832221096290245e-06, + "loss": 1.7082, + "step": 28617 + }, + { + "epoch": 8.783916513198282, + "grad_norm": 0.13030263781547546, + "learning_rate": 3.830312898198729e-06, + "loss": 1.6831, + "step": 28618 + }, + { + "epoch": 8.784223449969307, + "grad_norm": 0.14048850536346436, + "learning_rate": 3.82840515638313e-06, + "loss": 1.7419, + "step": 28619 + }, + { + "epoch": 8.784530386740332, + "grad_norm": 0.1761157363653183, + "learning_rate": 3.826497870862284e-06, + "loss": 1.7285, + "step": 28620 + }, + { + "epoch": 8.784837323511356, + "grad_norm": 0.16928929090499878, + "learning_rate": 
3.824591041655051e-06, + "loss": 1.7597, + "step": 28621 + }, + { + "epoch": 8.785144260282381, + "grad_norm": 0.12604424357414246, + "learning_rate": 3.822684668780275e-06, + "loss": 1.6895, + "step": 28622 + }, + { + "epoch": 8.785451197053407, + "grad_norm": 0.1835777759552002, + "learning_rate": 3.820778752256793e-06, + "loss": 1.7131, + "step": 28623 + }, + { + "epoch": 8.785758133824432, + "grad_norm": 0.1577402502298355, + "learning_rate": 3.818873292103447e-06, + "loss": 1.7159, + "step": 28624 + }, + { + "epoch": 8.786065070595457, + "grad_norm": 0.14781227707862854, + "learning_rate": 3.8169682883390565e-06, + "loss": 1.7179, + "step": 28625 + }, + { + "epoch": 8.786372007366483, + "grad_norm": 0.19881610572338104, + "learning_rate": 3.815063740982461e-06, + "loss": 1.7586, + "step": 28626 + }, + { + "epoch": 8.786678944137508, + "grad_norm": 0.16822806000709534, + "learning_rate": 3.813159650052467e-06, + "loss": 1.7628, + "step": 28627 + }, + { + "epoch": 8.786985880908533, + "grad_norm": 0.14510734379291534, + "learning_rate": 3.811256015567899e-06, + "loss": 1.654, + "step": 28628 + }, + { + "epoch": 8.787292817679559, + "grad_norm": 0.1547134667634964, + "learning_rate": 3.8093528375475863e-06, + "loss": 1.7204, + "step": 28629 + }, + { + "epoch": 8.787599754450584, + "grad_norm": 0.19592107832431793, + "learning_rate": 3.8074501160103027e-06, + "loss": 1.7084, + "step": 28630 + }, + { + "epoch": 8.787906691221608, + "grad_norm": 0.1543792486190796, + "learning_rate": 3.8055478509748887e-06, + "loss": 1.7322, + "step": 28631 + }, + { + "epoch": 8.788213627992633, + "grad_norm": 0.17076534032821655, + "learning_rate": 3.8036460424601128e-06, + "loss": 1.7004, + "step": 28632 + }, + { + "epoch": 8.788520564763658, + "grad_norm": 0.13622300326824188, + "learning_rate": 3.8017446904847875e-06, + "loss": 1.6867, + "step": 28633 + }, + { + "epoch": 8.788827501534684, + "grad_norm": 0.3221909999847412, + "learning_rate": 3.7998437950677035e-06, + "loss": 
1.7559, + "step": 28634 + }, + { + "epoch": 8.789134438305709, + "grad_norm": 0.1811852902173996, + "learning_rate": 3.79794335622764e-06, + "loss": 1.7439, + "step": 28635 + }, + { + "epoch": 8.789441375076734, + "grad_norm": 0.1573752760887146, + "learning_rate": 3.7960433739833877e-06, + "loss": 1.7129, + "step": 28636 + }, + { + "epoch": 8.78974831184776, + "grad_norm": 0.13165032863616943, + "learning_rate": 3.7941438483536986e-06, + "loss": 1.6926, + "step": 28637 + }, + { + "epoch": 8.790055248618785, + "grad_norm": 0.14245405793190002, + "learning_rate": 3.792244779357368e-06, + "loss": 1.7072, + "step": 28638 + }, + { + "epoch": 8.79036218538981, + "grad_norm": 0.16790303587913513, + "learning_rate": 3.790346167013159e-06, + "loss": 1.6979, + "step": 28639 + }, + { + "epoch": 8.790669122160836, + "grad_norm": 0.15134595334529877, + "learning_rate": 3.7884480113398345e-06, + "loss": 1.7035, + "step": 28640 + }, + { + "epoch": 8.79097605893186, + "grad_norm": 0.1418851763010025, + "learning_rate": 3.7865503123561575e-06, + "loss": 1.6462, + "step": 28641 + }, + { + "epoch": 8.791282995702884, + "grad_norm": 0.13052044808864594, + "learning_rate": 3.784653070080868e-06, + "loss": 1.6559, + "step": 28642 + }, + { + "epoch": 8.79158993247391, + "grad_norm": 0.14758886396884918, + "learning_rate": 3.782756284532729e-06, + "loss": 1.6948, + "step": 28643 + }, + { + "epoch": 8.791896869244935, + "grad_norm": 0.1561112254858017, + "learning_rate": 3.7808599557304814e-06, + "loss": 1.6465, + "step": 28644 + }, + { + "epoch": 8.79220380601596, + "grad_norm": 0.17403864860534668, + "learning_rate": 3.77896408369286e-06, + "loss": 1.7397, + "step": 28645 + }, + { + "epoch": 8.792510742786986, + "grad_norm": 0.147226944565773, + "learning_rate": 3.7770686684386158e-06, + "loss": 1.6707, + "step": 28646 + }, + { + "epoch": 8.792817679558011, + "grad_norm": 0.1681959182024002, + "learning_rate": 3.7751737099864627e-06, + "loss": 1.6786, + "step": 28647 + }, + { + "epoch": 
8.793124616329036, + "grad_norm": 0.15970535576343536, + "learning_rate": 3.773279208355146e-06, + "loss": 1.6652, + "step": 28648 + }, + { + "epoch": 8.793431553100062, + "grad_norm": 0.18252034485340118, + "learning_rate": 3.771385163563368e-06, + "loss": 1.7478, + "step": 28649 + }, + { + "epoch": 8.793738489871087, + "grad_norm": 0.22270283102989197, + "learning_rate": 3.7694915756298576e-06, + "loss": 1.7683, + "step": 28650 + }, + { + "epoch": 8.794045426642112, + "grad_norm": 0.13913489878177643, + "learning_rate": 3.7675984445733337e-06, + "loss": 1.7275, + "step": 28651 + }, + { + "epoch": 8.794352363413136, + "grad_norm": 0.16266898810863495, + "learning_rate": 3.7657057704124976e-06, + "loss": 1.7145, + "step": 28652 + }, + { + "epoch": 8.794659300184161, + "grad_norm": 0.18106494843959808, + "learning_rate": 3.763813553166068e-06, + "loss": 1.6936, + "step": 28653 + }, + { + "epoch": 8.794966236955187, + "grad_norm": 0.17213653028011322, + "learning_rate": 3.761921792852713e-06, + "loss": 1.7223, + "step": 28654 + }, + { + "epoch": 8.795273173726212, + "grad_norm": 0.14013275504112244, + "learning_rate": 3.7600304894911562e-06, + "loss": 1.7082, + "step": 28655 + }, + { + "epoch": 8.795580110497237, + "grad_norm": 0.1625421643257141, + "learning_rate": 3.758139643100078e-06, + "loss": 1.719, + "step": 28656 + }, + { + "epoch": 8.795887047268263, + "grad_norm": 0.15947094559669495, + "learning_rate": 3.756249253698174e-06, + "loss": 1.7448, + "step": 28657 + }, + { + "epoch": 8.796193984039288, + "grad_norm": 0.16739755868911743, + "learning_rate": 3.754359321304113e-06, + "loss": 1.7048, + "step": 28658 + }, + { + "epoch": 8.796500920810313, + "grad_norm": 0.17619092762470245, + "learning_rate": 3.7524698459365794e-06, + "loss": 1.7247, + "step": 28659 + }, + { + "epoch": 8.796807857581339, + "grad_norm": 0.19410766661167145, + "learning_rate": 3.7505808276142473e-06, + "loss": 1.6918, + "step": 28660 + }, + { + "epoch": 8.797114794352364, + 
"grad_norm": 0.13881324231624603, + "learning_rate": 3.74869226635578e-06, + "loss": 1.6997, + "step": 28661 + }, + { + "epoch": 8.79742173112339, + "grad_norm": 0.16185659170150757, + "learning_rate": 3.74680416217984e-06, + "loss": 1.6951, + "step": 28662 + }, + { + "epoch": 8.797728667894415, + "grad_norm": 0.4652320444583893, + "learning_rate": 3.744916515105107e-06, + "loss": 1.7521, + "step": 28663 + }, + { + "epoch": 8.798035604665438, + "grad_norm": 0.1286199539899826, + "learning_rate": 3.7430293251501992e-06, + "loss": 1.7106, + "step": 28664 + }, + { + "epoch": 8.798342541436464, + "grad_norm": 0.18184927105903625, + "learning_rate": 3.741142592333807e-06, + "loss": 1.7297, + "step": 28665 + }, + { + "epoch": 8.798649478207489, + "grad_norm": 0.1292438805103302, + "learning_rate": 3.7392563166745443e-06, + "loss": 1.6701, + "step": 28666 + }, + { + "epoch": 8.798956414978514, + "grad_norm": 0.16631865501403809, + "learning_rate": 3.7373704981910673e-06, + "loss": 1.7572, + "step": 28667 + }, + { + "epoch": 8.79926335174954, + "grad_norm": 0.13093185424804688, + "learning_rate": 3.7354851369020117e-06, + "loss": 1.6912, + "step": 28668 + }, + { + "epoch": 8.799570288520565, + "grad_norm": 0.16165922582149506, + "learning_rate": 3.7336002328260123e-06, + "loss": 1.668, + "step": 28669 + }, + { + "epoch": 8.79987722529159, + "grad_norm": 0.1431419402360916, + "learning_rate": 3.7317157859816987e-06, + "loss": 1.6499, + "step": 28670 + }, + { + "epoch": 8.800184162062616, + "grad_norm": 0.16933713853359222, + "learning_rate": 3.729831796387667e-06, + "loss": 1.7081, + "step": 28671 + }, + { + "epoch": 8.800491098833641, + "grad_norm": 0.15956951677799225, + "learning_rate": 3.727948264062575e-06, + "loss": 1.6981, + "step": 28672 + }, + { + "epoch": 8.800798035604666, + "grad_norm": 0.17684711515903473, + "learning_rate": 3.726065189025013e-06, + "loss": 1.7254, + "step": 28673 + }, + { + "epoch": 8.80110497237569, + "grad_norm": 0.20180673897266388, + 
"learning_rate": 3.7241825712935997e-06, + "loss": 1.764, + "step": 28674 + }, + { + "epoch": 8.801411909146715, + "grad_norm": 0.165853351354599, + "learning_rate": 3.7223004108869307e-06, + "loss": 1.7275, + "step": 28675 + }, + { + "epoch": 8.80171884591774, + "grad_norm": 0.25295981764793396, + "learning_rate": 3.72041870782362e-06, + "loss": 1.8427, + "step": 28676 + }, + { + "epoch": 8.802025782688766, + "grad_norm": 0.14879196882247925, + "learning_rate": 3.7185374621222567e-06, + "loss": 1.6921, + "step": 28677 + }, + { + "epoch": 8.802332719459791, + "grad_norm": 0.159479022026062, + "learning_rate": 3.716656673801433e-06, + "loss": 1.699, + "step": 28678 + }, + { + "epoch": 8.802639656230816, + "grad_norm": 0.1288701742887497, + "learning_rate": 3.714776342879722e-06, + "loss": 1.6872, + "step": 28679 + }, + { + "epoch": 8.802946593001842, + "grad_norm": 0.15079650282859802, + "learning_rate": 3.712896469375743e-06, + "loss": 1.6873, + "step": 28680 + }, + { + "epoch": 8.803253529772867, + "grad_norm": 0.1662154346704483, + "learning_rate": 3.7110170533080304e-06, + "loss": 1.7451, + "step": 28681 + }, + { + "epoch": 8.803560466543892, + "grad_norm": 0.1374291628599167, + "learning_rate": 3.709138094695197e-06, + "loss": 1.6698, + "step": 28682 + }, + { + "epoch": 8.803867403314918, + "grad_norm": 0.13723774254322052, + "learning_rate": 3.707259593555773e-06, + "loss": 1.734, + "step": 28683 + }, + { + "epoch": 8.804174340085943, + "grad_norm": 0.15156403183937073, + "learning_rate": 3.7053815499083543e-06, + "loss": 1.7228, + "step": 28684 + }, + { + "epoch": 8.804481276856967, + "grad_norm": 0.15390744805335999, + "learning_rate": 3.7035039637714876e-06, + "loss": 1.7659, + "step": 28685 + }, + { + "epoch": 8.804788213627992, + "grad_norm": 0.13234136998653412, + "learning_rate": 3.7016268351637297e-06, + "loss": 1.684, + "step": 28686 + }, + { + "epoch": 8.805095150399017, + "grad_norm": 0.20412379503250122, + "learning_rate": 3.699750164103638e-06, + 
"loss": 1.7228, + "step": 28687 + }, + { + "epoch": 8.805402087170043, + "grad_norm": 0.15076974034309387, + "learning_rate": 3.697873950609737e-06, + "loss": 1.7029, + "step": 28688 + }, + { + "epoch": 8.805709023941068, + "grad_norm": 0.13920028507709503, + "learning_rate": 3.6959981947005952e-06, + "loss": 1.6905, + "step": 28689 + }, + { + "epoch": 8.806015960712093, + "grad_norm": 0.13444112241268158, + "learning_rate": 3.694122896394736e-06, + "loss": 1.6483, + "step": 28690 + }, + { + "epoch": 8.806322897483119, + "grad_norm": 0.18719401955604553, + "learning_rate": 3.692248055710701e-06, + "loss": 1.7326, + "step": 28691 + }, + { + "epoch": 8.806629834254144, + "grad_norm": 0.2103775292634964, + "learning_rate": 3.690373672667008e-06, + "loss": 1.8134, + "step": 28692 + }, + { + "epoch": 8.80693677102517, + "grad_norm": 0.14053337275981903, + "learning_rate": 3.6884997472821814e-06, + "loss": 1.713, + "step": 28693 + }, + { + "epoch": 8.807243707796195, + "grad_norm": 0.21146062016487122, + "learning_rate": 3.686626279574751e-06, + "loss": 1.767, + "step": 28694 + }, + { + "epoch": 8.807550644567218, + "grad_norm": 0.1462959349155426, + "learning_rate": 3.6847532695632236e-06, + "loss": 1.7002, + "step": 28695 + }, + { + "epoch": 8.807857581338244, + "grad_norm": 0.13064992427825928, + "learning_rate": 3.682880717266102e-06, + "loss": 1.6927, + "step": 28696 + }, + { + "epoch": 8.808164518109269, + "grad_norm": 0.11652515083551407, + "learning_rate": 3.6810086227019147e-06, + "loss": 1.6717, + "step": 28697 + }, + { + "epoch": 8.808471454880294, + "grad_norm": 0.14266341924667358, + "learning_rate": 3.679136985889131e-06, + "loss": 1.6843, + "step": 28698 + }, + { + "epoch": 8.80877839165132, + "grad_norm": 0.15322953462600708, + "learning_rate": 3.677265806846286e-06, + "loss": 1.6947, + "step": 28699 + }, + { + "epoch": 8.809085328422345, + "grad_norm": 0.1330055147409439, + "learning_rate": 3.675395085591832e-06, + "loss": 1.7386, + "step": 28700 + }, + 
{ + "epoch": 8.80939226519337, + "grad_norm": 0.14793124794960022, + "learning_rate": 3.6735248221442807e-06, + "loss": 1.6841, + "step": 28701 + }, + { + "epoch": 8.809699201964396, + "grad_norm": 0.13912439346313477, + "learning_rate": 3.6716550165221185e-06, + "loss": 1.697, + "step": 28702 + }, + { + "epoch": 8.810006138735421, + "grad_norm": 0.17170770466327667, + "learning_rate": 3.669785668743808e-06, + "loss": 1.7158, + "step": 28703 + }, + { + "epoch": 8.810313075506446, + "grad_norm": 0.14432193338871002, + "learning_rate": 3.66791677882784e-06, + "loss": 1.6617, + "step": 28704 + }, + { + "epoch": 8.810620012277472, + "grad_norm": 0.14610548317432404, + "learning_rate": 3.666048346792661e-06, + "loss": 1.6677, + "step": 28705 + }, + { + "epoch": 8.810926949048497, + "grad_norm": 0.15598154067993164, + "learning_rate": 3.664180372656756e-06, + "loss": 1.6847, + "step": 28706 + }, + { + "epoch": 8.81123388581952, + "grad_norm": 0.11805412918329239, + "learning_rate": 3.662312856438577e-06, + "loss": 1.668, + "step": 28707 + }, + { + "epoch": 8.811540822590546, + "grad_norm": 0.16846078634262085, + "learning_rate": 3.660445798156581e-06, + "loss": 1.7295, + "step": 28708 + }, + { + "epoch": 8.811847759361571, + "grad_norm": 0.11984262615442276, + "learning_rate": 3.658579197829226e-06, + "loss": 1.6711, + "step": 28709 + }, + { + "epoch": 8.812154696132596, + "grad_norm": 0.13624878227710724, + "learning_rate": 3.6567130554749476e-06, + "loss": 1.665, + "step": 28710 + }, + { + "epoch": 8.812461632903622, + "grad_norm": 0.19053621590137482, + "learning_rate": 3.654847371112197e-06, + "loss": 1.7301, + "step": 28711 + }, + { + "epoch": 8.812768569674647, + "grad_norm": 0.12689290940761566, + "learning_rate": 3.6529821447594036e-06, + "loss": 1.6683, + "step": 28712 + }, + { + "epoch": 8.813075506445673, + "grad_norm": 0.20414969325065613, + "learning_rate": 3.6511173764350094e-06, + "loss": 1.7787, + "step": 28713 + }, + { + "epoch": 8.813382443216698, + 
"grad_norm": 0.1935388743877411, + "learning_rate": 3.6492530661574377e-06, + "loss": 1.7021, + "step": 28714 + }, + { + "epoch": 8.813689379987723, + "grad_norm": 0.15490898489952087, + "learning_rate": 3.6473892139451072e-06, + "loss": 1.7155, + "step": 28715 + }, + { + "epoch": 8.813996316758749, + "grad_norm": 0.2282942682504654, + "learning_rate": 3.6455258198164587e-06, + "loss": 1.6895, + "step": 28716 + }, + { + "epoch": 8.814303253529772, + "grad_norm": 0.12892891466617584, + "learning_rate": 3.643662883789878e-06, + "loss": 1.6478, + "step": 28717 + }, + { + "epoch": 8.814610190300797, + "grad_norm": 0.12005404382944107, + "learning_rate": 3.641800405883811e-06, + "loss": 1.6955, + "step": 28718 + }, + { + "epoch": 8.814917127071823, + "grad_norm": 0.15036113560199738, + "learning_rate": 3.639938386116626e-06, + "loss": 1.7104, + "step": 28719 + }, + { + "epoch": 8.815224063842848, + "grad_norm": 0.13082142174243927, + "learning_rate": 3.6380768245067478e-06, + "loss": 1.6797, + "step": 28720 + }, + { + "epoch": 8.815531000613873, + "grad_norm": 0.12086073309183121, + "learning_rate": 3.6362157210725778e-06, + "loss": 1.6478, + "step": 28721 + }, + { + "epoch": 8.815837937384899, + "grad_norm": 0.15807145833969116, + "learning_rate": 3.6343550758324797e-06, + "loss": 1.6987, + "step": 28722 + }, + { + "epoch": 8.816144874155924, + "grad_norm": 0.1517954170703888, + "learning_rate": 3.6324948888048715e-06, + "loss": 1.7048, + "step": 28723 + }, + { + "epoch": 8.81645181092695, + "grad_norm": 0.12381365150213242, + "learning_rate": 3.6306351600081223e-06, + "loss": 1.6788, + "step": 28724 + }, + { + "epoch": 8.816758747697975, + "grad_norm": 0.14769119024276733, + "learning_rate": 3.6287758894606173e-06, + "loss": 1.6961, + "step": 28725 + }, + { + "epoch": 8.817065684469, + "grad_norm": 0.13606438040733337, + "learning_rate": 3.6269170771807305e-06, + "loss": 1.6603, + "step": 28726 + }, + { + "epoch": 8.817372621240025, + "grad_norm": 0.1724759191274643, 
+ "learning_rate": 3.625058723186825e-06, + "loss": 1.7054, + "step": 28727 + }, + { + "epoch": 8.817679558011049, + "grad_norm": 0.1703757792711258, + "learning_rate": 3.6232008274972753e-06, + "loss": 1.7539, + "step": 28728 + }, + { + "epoch": 8.817986494782074, + "grad_norm": 0.17725473642349243, + "learning_rate": 3.621343390130433e-06, + "loss": 1.7774, + "step": 28729 + }, + { + "epoch": 8.8182934315531, + "grad_norm": 0.12104978412389755, + "learning_rate": 3.6194864111046558e-06, + "loss": 1.6966, + "step": 28730 + }, + { + "epoch": 8.818600368324125, + "grad_norm": 0.15737809240818024, + "learning_rate": 3.6176298904383066e-06, + "loss": 1.7527, + "step": 28731 + }, + { + "epoch": 8.81890730509515, + "grad_norm": 0.2053712159395218, + "learning_rate": 3.61577382814971e-06, + "loss": 1.695, + "step": 28732 + }, + { + "epoch": 8.819214241866176, + "grad_norm": 0.17244333028793335, + "learning_rate": 3.61391822425724e-06, + "loss": 1.7748, + "step": 28733 + }, + { + "epoch": 8.819521178637201, + "grad_norm": 0.10550814867019653, + "learning_rate": 3.612063078779204e-06, + "loss": 1.6216, + "step": 28734 + }, + { + "epoch": 8.819828115408226, + "grad_norm": 0.12428541481494904, + "learning_rate": 3.6102083917339657e-06, + "loss": 1.6863, + "step": 28735 + }, + { + "epoch": 8.820135052179252, + "grad_norm": 0.1403985470533371, + "learning_rate": 3.608354163139821e-06, + "loss": 1.7582, + "step": 28736 + }, + { + "epoch": 8.820441988950277, + "grad_norm": 0.14146897196769714, + "learning_rate": 3.6065003930151163e-06, + "loss": 1.6711, + "step": 28737 + }, + { + "epoch": 8.8207489257213, + "grad_norm": 0.1309487670660019, + "learning_rate": 3.6046470813781763e-06, + "loss": 1.6553, + "step": 28738 + }, + { + "epoch": 8.821055862492326, + "grad_norm": 0.16398943960666656, + "learning_rate": 3.602794228247297e-06, + "loss": 1.7097, + "step": 28739 + }, + { + "epoch": 8.821362799263351, + "grad_norm": 0.13138768076896667, + "learning_rate": 3.6009418336408085e-06, 
+ "loss": 1.6641, + "step": 28740 + }, + { + "epoch": 8.821669736034377, + "grad_norm": 0.14470353722572327, + "learning_rate": 3.599089897576996e-06, + "loss": 1.6626, + "step": 28741 + }, + { + "epoch": 8.821976672805402, + "grad_norm": 0.17124676704406738, + "learning_rate": 3.597238420074178e-06, + "loss": 1.7347, + "step": 28742 + }, + { + "epoch": 8.822283609576427, + "grad_norm": 0.19663479924201965, + "learning_rate": 3.595387401150652e-06, + "loss": 1.7267, + "step": 28743 + }, + { + "epoch": 8.822590546347453, + "grad_norm": 0.14935022592544556, + "learning_rate": 3.5935368408247016e-06, + "loss": 1.7001, + "step": 28744 + }, + { + "epoch": 8.822897483118478, + "grad_norm": 0.13796019554138184, + "learning_rate": 3.591686739114625e-06, + "loss": 1.6774, + "step": 28745 + }, + { + "epoch": 8.823204419889503, + "grad_norm": 0.19741731882095337, + "learning_rate": 3.5898370960386952e-06, + "loss": 1.6887, + "step": 28746 + }, + { + "epoch": 8.823511356660529, + "grad_norm": 0.17089900374412537, + "learning_rate": 3.5879879116151984e-06, + "loss": 1.6869, + "step": 28747 + }, + { + "epoch": 8.823818293431554, + "grad_norm": 0.13532526791095734, + "learning_rate": 3.5861391858624083e-06, + "loss": 1.6525, + "step": 28748 + }, + { + "epoch": 8.824125230202577, + "grad_norm": 0.15727277100086212, + "learning_rate": 3.5842909187985886e-06, + "loss": 1.725, + "step": 28749 + }, + { + "epoch": 8.824432166973603, + "grad_norm": 0.14250576496124268, + "learning_rate": 3.5824431104420298e-06, + "loss": 1.6728, + "step": 28750 + }, + { + "epoch": 8.824739103744628, + "grad_norm": 0.1596658080816269, + "learning_rate": 3.580595760810951e-06, + "loss": 1.6933, + "step": 28751 + }, + { + "epoch": 8.825046040515653, + "grad_norm": 0.2319880872964859, + "learning_rate": 3.5787488699236537e-06, + "loss": 1.744, + "step": 28752 + }, + { + "epoch": 8.825352977286679, + "grad_norm": 0.12813101708889008, + "learning_rate": 3.5769024377983517e-06, + "loss": 1.7022, + "step": 
28753 + }, + { + "epoch": 8.825659914057704, + "grad_norm": 0.1346128284931183, + "learning_rate": 3.5750564644533137e-06, + "loss": 1.6755, + "step": 28754 + }, + { + "epoch": 8.82596685082873, + "grad_norm": 0.1405024230480194, + "learning_rate": 3.5732109499067913e-06, + "loss": 1.6662, + "step": 28755 + }, + { + "epoch": 8.826273787599755, + "grad_norm": 0.16663044691085815, + "learning_rate": 3.571365894176992e-06, + "loss": 1.7237, + "step": 28756 + }, + { + "epoch": 8.82658072437078, + "grad_norm": 0.19339314103126526, + "learning_rate": 3.56952129728218e-06, + "loss": 1.729, + "step": 28757 + }, + { + "epoch": 8.826887661141805, + "grad_norm": 0.18851202726364136, + "learning_rate": 3.5676771592405624e-06, + "loss": 1.6923, + "step": 28758 + }, + { + "epoch": 8.82719459791283, + "grad_norm": 0.15386530756950378, + "learning_rate": 3.5658334800703797e-06, + "loss": 1.695, + "step": 28759 + }, + { + "epoch": 8.827501534683854, + "grad_norm": 0.17883063852787018, + "learning_rate": 3.5639902597898455e-06, + "loss": 1.746, + "step": 28760 + }, + { + "epoch": 8.82780847145488, + "grad_norm": 0.15690109133720398, + "learning_rate": 3.5621474984171733e-06, + "loss": 1.6937, + "step": 28761 + }, + { + "epoch": 8.828115408225905, + "grad_norm": 0.19555453956127167, + "learning_rate": 3.5603051959705815e-06, + "loss": 1.7524, + "step": 28762 + }, + { + "epoch": 8.82842234499693, + "grad_norm": 0.13835586607456207, + "learning_rate": 3.558463352468272e-06, + "loss": 1.6975, + "step": 28763 + }, + { + "epoch": 8.828729281767956, + "grad_norm": 0.13608703017234802, + "learning_rate": 3.556621967928453e-06, + "loss": 1.6588, + "step": 28764 + }, + { + "epoch": 8.829036218538981, + "grad_norm": 0.1849900633096695, + "learning_rate": 3.5547810423693096e-06, + "loss": 1.7236, + "step": 28765 + }, + { + "epoch": 8.829343155310006, + "grad_norm": 0.13603585958480835, + "learning_rate": 3.5529405758090382e-06, + "loss": 1.69, + "step": 28766 + }, + { + "epoch": 
8.829650092081032, + "grad_norm": 0.12596213817596436, + "learning_rate": 3.5511005682658473e-06, + "loss": 1.7069, + "step": 28767 + }, + { + "epoch": 8.829957028852057, + "grad_norm": 0.17949149012565613, + "learning_rate": 3.549261019757888e-06, + "loss": 1.7836, + "step": 28768 + }, + { + "epoch": 8.830263965623082, + "grad_norm": 0.17237712442874908, + "learning_rate": 3.547421930303374e-06, + "loss": 1.6978, + "step": 28769 + }, + { + "epoch": 8.830570902394108, + "grad_norm": 0.16467876732349396, + "learning_rate": 3.5455832999204517e-06, + "loss": 1.7526, + "step": 28770 + }, + { + "epoch": 8.830877839165131, + "grad_norm": 0.1549120396375656, + "learning_rate": 3.5437451286273014e-06, + "loss": 1.6955, + "step": 28771 + }, + { + "epoch": 8.831184775936157, + "grad_norm": 0.24028703570365906, + "learning_rate": 3.541907416442103e-06, + "loss": 1.7547, + "step": 28772 + }, + { + "epoch": 8.831491712707182, + "grad_norm": 0.17325441539287567, + "learning_rate": 3.5400701633829856e-06, + "loss": 1.7041, + "step": 28773 + }, + { + "epoch": 8.831798649478207, + "grad_norm": 0.15597397089004517, + "learning_rate": 3.5382333694681467e-06, + "loss": 1.6997, + "step": 28774 + }, + { + "epoch": 8.832105586249233, + "grad_norm": 0.14938347041606903, + "learning_rate": 3.5363970347156994e-06, + "loss": 1.7271, + "step": 28775 + }, + { + "epoch": 8.832412523020258, + "grad_norm": 0.17745234072208405, + "learning_rate": 3.534561159143823e-06, + "loss": 1.714, + "step": 28776 + }, + { + "epoch": 8.832719459791283, + "grad_norm": 0.15323567390441895, + "learning_rate": 3.532725742770643e-06, + "loss": 1.7079, + "step": 28777 + }, + { + "epoch": 8.833026396562309, + "grad_norm": 0.15351314842700958, + "learning_rate": 3.5308907856143046e-06, + "loss": 1.733, + "step": 28778 + }, + { + "epoch": 8.833333333333334, + "grad_norm": 0.19209100306034088, + "learning_rate": 3.5290562876929388e-06, + "loss": 1.7362, + "step": 28779 + }, + { + "epoch": 8.83364027010436, + 
"grad_norm": 0.2092818021774292, + "learning_rate": 3.5272222490246753e-06, + "loss": 1.7682, + "step": 28780 + }, + { + "epoch": 8.833947206875383, + "grad_norm": 0.21600767970085144, + "learning_rate": 3.5253886696276383e-06, + "loss": 1.8015, + "step": 28781 + }, + { + "epoch": 8.834254143646408, + "grad_norm": 0.11457479000091553, + "learning_rate": 3.5235555495199525e-06, + "loss": 1.6582, + "step": 28782 + }, + { + "epoch": 8.834561080417433, + "grad_norm": 0.1698341816663742, + "learning_rate": 3.5217228887197253e-06, + "loss": 1.7348, + "step": 28783 + }, + { + "epoch": 8.834868017188459, + "grad_norm": 0.1234394982457161, + "learning_rate": 3.5198906872450866e-06, + "loss": 1.6819, + "step": 28784 + }, + { + "epoch": 8.835174953959484, + "grad_norm": 0.15412946045398712, + "learning_rate": 3.518058945114117e-06, + "loss": 1.6972, + "step": 28785 + }, + { + "epoch": 8.83548189073051, + "grad_norm": 0.16202808916568756, + "learning_rate": 3.516227662344951e-06, + "loss": 1.7439, + "step": 28786 + }, + { + "epoch": 8.835788827501535, + "grad_norm": 0.1599927842617035, + "learning_rate": 3.514396838955658e-06, + "loss": 1.7012, + "step": 28787 + }, + { + "epoch": 8.83609576427256, + "grad_norm": 0.1487586498260498, + "learning_rate": 3.512566474964335e-06, + "loss": 1.6844, + "step": 28788 + }, + { + "epoch": 8.836402701043585, + "grad_norm": 0.18033012747764587, + "learning_rate": 3.5107365703890892e-06, + "loss": 1.7855, + "step": 28789 + }, + { + "epoch": 8.83670963781461, + "grad_norm": 0.18171031773090363, + "learning_rate": 3.508907125247979e-06, + "loss": 1.703, + "step": 28790 + }, + { + "epoch": 8.837016574585636, + "grad_norm": 0.14102062582969666, + "learning_rate": 3.507078139559117e-06, + "loss": 1.6627, + "step": 28791 + }, + { + "epoch": 8.83732351135666, + "grad_norm": 0.16365323960781097, + "learning_rate": 3.505249613340539e-06, + "loss": 1.7317, + "step": 28792 + }, + { + "epoch": 8.837630448127685, + "grad_norm": 0.1492282748222351, + 
"learning_rate": 3.5034215466103417e-06, + "loss": 1.6633, + "step": 28793 + }, + { + "epoch": 8.83793738489871, + "grad_norm": 0.18670693039894104, + "learning_rate": 3.5015939393865937e-06, + "loss": 1.7233, + "step": 28794 + }, + { + "epoch": 8.838244321669736, + "grad_norm": 0.16062071919441223, + "learning_rate": 3.499766791687342e-06, + "loss": 1.7238, + "step": 28795 + }, + { + "epoch": 8.838551258440761, + "grad_norm": 0.158021941781044, + "learning_rate": 3.4979401035306504e-06, + "loss": 1.705, + "step": 28796 + }, + { + "epoch": 8.838858195211786, + "grad_norm": 0.14865651726722717, + "learning_rate": 3.49611387493457e-06, + "loss": 1.6777, + "step": 28797 + }, + { + "epoch": 8.839165131982812, + "grad_norm": 0.12111876904964447, + "learning_rate": 3.4942881059171483e-06, + "loss": 1.6273, + "step": 28798 + }, + { + "epoch": 8.839472068753837, + "grad_norm": 0.12468799948692322, + "learning_rate": 3.4924627964964318e-06, + "loss": 1.6626, + "step": 28799 + }, + { + "epoch": 8.839779005524862, + "grad_norm": 0.12292506545782089, + "learning_rate": 3.490637946690445e-06, + "loss": 1.6448, + "step": 28800 + }, + { + "epoch": 8.840085942295888, + "grad_norm": 0.16731779277324677, + "learning_rate": 3.4888135565172563e-06, + "loss": 1.7541, + "step": 28801 + }, + { + "epoch": 8.840392879066911, + "grad_norm": 0.16351507604122162, + "learning_rate": 3.486989625994852e-06, + "loss": 1.699, + "step": 28802 + }, + { + "epoch": 8.840699815837937, + "grad_norm": 0.12385114282369614, + "learning_rate": 3.485166155141295e-06, + "loss": 1.6852, + "step": 28803 + }, + { + "epoch": 8.841006752608962, + "grad_norm": 0.20780152082443237, + "learning_rate": 3.4833431439745822e-06, + "loss": 1.7179, + "step": 28804 + }, + { + "epoch": 8.841313689379987, + "grad_norm": 0.16182561218738556, + "learning_rate": 3.481520592512727e-06, + "loss": 1.7457, + "step": 28805 + }, + { + "epoch": 8.841620626151013, + "grad_norm": 0.1332414746284485, + "learning_rate": 
3.4796985007737705e-06, + "loss": 1.7272, + "step": 28806 + }, + { + "epoch": 8.841927562922038, + "grad_norm": 0.14266319572925568, + "learning_rate": 3.477876868775681e-06, + "loss": 1.7207, + "step": 28807 + }, + { + "epoch": 8.842234499693063, + "grad_norm": 0.162164106965065, + "learning_rate": 3.4760556965364953e-06, + "loss": 1.6948, + "step": 28808 + }, + { + "epoch": 8.842541436464089, + "grad_norm": 0.14134974777698517, + "learning_rate": 3.474234984074182e-06, + "loss": 1.676, + "step": 28809 + }, + { + "epoch": 8.842848373235114, + "grad_norm": 0.16302376985549927, + "learning_rate": 3.4724147314067534e-06, + "loss": 1.7279, + "step": 28810 + }, + { + "epoch": 8.84315531000614, + "grad_norm": 0.1352432370185852, + "learning_rate": 3.4705949385521964e-06, + "loss": 1.7065, + "step": 28811 + }, + { + "epoch": 8.843462246777165, + "grad_norm": 0.13483819365501404, + "learning_rate": 3.46877560552849e-06, + "loss": 1.7275, + "step": 28812 + }, + { + "epoch": 8.84376918354819, + "grad_norm": 0.12226319313049316, + "learning_rate": 3.4669567323536157e-06, + "loss": 1.6965, + "step": 28813 + }, + { + "epoch": 8.844076120319214, + "grad_norm": 0.1687331646680832, + "learning_rate": 3.465138319045552e-06, + "loss": 1.6949, + "step": 28814 + }, + { + "epoch": 8.844383057090239, + "grad_norm": 0.17721997201442719, + "learning_rate": 3.4633203656222635e-06, + "loss": 1.6981, + "step": 28815 + }, + { + "epoch": 8.844689993861264, + "grad_norm": 0.14818120002746582, + "learning_rate": 3.4615028721017186e-06, + "loss": 1.687, + "step": 28816 + }, + { + "epoch": 8.84499693063229, + "grad_norm": 0.15871183574199677, + "learning_rate": 3.459685838501875e-06, + "loss": 1.7403, + "step": 28817 + }, + { + "epoch": 8.845303867403315, + "grad_norm": 0.16533036530017853, + "learning_rate": 3.4578692648407076e-06, + "loss": 1.7879, + "step": 28818 + }, + { + "epoch": 8.84561080417434, + "grad_norm": 0.18678778409957886, + "learning_rate": 3.456053151136135e-06, + "loss": 
1.7474, + "step": 28819 + }, + { + "epoch": 8.845917740945366, + "grad_norm": 0.12712402641773224, + "learning_rate": 3.4542374974061488e-06, + "loss": 1.6635, + "step": 28820 + }, + { + "epoch": 8.84622467771639, + "grad_norm": 0.15502063930034637, + "learning_rate": 3.4524223036686566e-06, + "loss": 1.7133, + "step": 28821 + }, + { + "epoch": 8.846531614487416, + "grad_norm": 0.17015717923641205, + "learning_rate": 3.4506075699416e-06, + "loss": 1.7514, + "step": 28822 + }, + { + "epoch": 8.846838551258442, + "grad_norm": 0.15805409848690033, + "learning_rate": 3.4487932962429415e-06, + "loss": 1.7253, + "step": 28823 + }, + { + "epoch": 8.847145488029465, + "grad_norm": 0.14090047776699066, + "learning_rate": 3.446979482590579e-06, + "loss": 1.6763, + "step": 28824 + }, + { + "epoch": 8.84745242480049, + "grad_norm": 0.18115323781967163, + "learning_rate": 3.445166129002464e-06, + "loss": 1.7575, + "step": 28825 + }, + { + "epoch": 8.847759361571516, + "grad_norm": 0.18050703406333923, + "learning_rate": 3.443353235496488e-06, + "loss": 1.7688, + "step": 28826 + }, + { + "epoch": 8.848066298342541, + "grad_norm": 0.13750851154327393, + "learning_rate": 3.441540802090587e-06, + "loss": 1.7416, + "step": 28827 + }, + { + "epoch": 8.848373235113566, + "grad_norm": 0.14183515310287476, + "learning_rate": 3.439728828802674e-06, + "loss": 1.6924, + "step": 28828 + }, + { + "epoch": 8.848680171884592, + "grad_norm": 0.16401416063308716, + "learning_rate": 3.4379173156506517e-06, + "loss": 1.7041, + "step": 28829 + }, + { + "epoch": 8.848987108655617, + "grad_norm": 0.1347450613975525, + "learning_rate": 3.4361062626524166e-06, + "loss": 1.7331, + "step": 28830 + }, + { + "epoch": 8.849294045426642, + "grad_norm": 0.16579827666282654, + "learning_rate": 3.4342956698258768e-06, + "loss": 1.7628, + "step": 28831 + }, + { + "epoch": 8.849600982197668, + "grad_norm": 0.18201382458209991, + "learning_rate": 3.4324855371889177e-06, + "loss": 1.7054, + "step": 28832 + }, + { + 
"epoch": 8.849907918968693, + "grad_norm": 0.1637437641620636, + "learning_rate": 3.430675864759425e-06, + "loss": 1.7393, + "step": 28833 + }, + { + "epoch": 8.850214855739718, + "grad_norm": 0.1596134454011917, + "learning_rate": 3.4288666525552848e-06, + "loss": 1.7102, + "step": 28834 + }, + { + "epoch": 8.850521792510742, + "grad_norm": 0.1999501883983612, + "learning_rate": 3.4270579005943994e-06, + "loss": 1.7547, + "step": 28835 + }, + { + "epoch": 8.850828729281767, + "grad_norm": 0.15011270344257355, + "learning_rate": 3.4252496088946097e-06, + "loss": 1.6387, + "step": 28836 + }, + { + "epoch": 8.851135666052793, + "grad_norm": 0.12606796622276306, + "learning_rate": 3.4234417774738124e-06, + "loss": 1.6633, + "step": 28837 + }, + { + "epoch": 8.851442602823818, + "grad_norm": 0.19459915161132812, + "learning_rate": 3.421634406349855e-06, + "loss": 1.7424, + "step": 28838 + }, + { + "epoch": 8.851749539594843, + "grad_norm": 0.1512998342514038, + "learning_rate": 3.4198274955406062e-06, + "loss": 1.7007, + "step": 28839 + }, + { + "epoch": 8.852056476365869, + "grad_norm": 0.19419771432876587, + "learning_rate": 3.4180210450639295e-06, + "loss": 1.7223, + "step": 28840 + }, + { + "epoch": 8.852363413136894, + "grad_norm": 0.17737379670143127, + "learning_rate": 3.41621505493766e-06, + "loss": 1.7309, + "step": 28841 + }, + { + "epoch": 8.85267034990792, + "grad_norm": 0.14393949508666992, + "learning_rate": 3.414409525179674e-06, + "loss": 1.7213, + "step": 28842 + }, + { + "epoch": 8.852977286678945, + "grad_norm": 0.11586382240056992, + "learning_rate": 3.412604455807783e-06, + "loss": 1.6675, + "step": 28843 + }, + { + "epoch": 8.85328422344997, + "grad_norm": 0.18049278855323792, + "learning_rate": 3.410799846839846e-06, + "loss": 1.7558, + "step": 28844 + }, + { + "epoch": 8.853591160220994, + "grad_norm": 0.20962421596050262, + "learning_rate": 3.408995698293693e-06, + "loss": 1.7222, + "step": 28845 + }, + { + "epoch": 8.853898096992019, + 
"grad_norm": 0.12382032722234726, + "learning_rate": 3.4071920101871547e-06, + "loss": 1.7149, + "step": 28846 + }, + { + "epoch": 8.854205033763044, + "grad_norm": 0.15395772457122803, + "learning_rate": 3.405388782538049e-06, + "loss": 1.6986, + "step": 28847 + }, + { + "epoch": 8.85451197053407, + "grad_norm": 0.1579637974500656, + "learning_rate": 3.403586015364202e-06, + "loss": 1.7208, + "step": 28848 + }, + { + "epoch": 8.854818907305095, + "grad_norm": 0.18486931920051575, + "learning_rate": 3.4017837086834315e-06, + "loss": 1.7554, + "step": 28849 + }, + { + "epoch": 8.85512584407612, + "grad_norm": 0.1619080752134323, + "learning_rate": 3.399981862513546e-06, + "loss": 1.7581, + "step": 28850 + }, + { + "epoch": 8.855432780847146, + "grad_norm": 0.14540675282478333, + "learning_rate": 3.3981804768723425e-06, + "loss": 1.7391, + "step": 28851 + }, + { + "epoch": 8.855739717618171, + "grad_norm": 0.17640653252601624, + "learning_rate": 3.396379551777651e-06, + "loss": 1.807, + "step": 28852 + }, + { + "epoch": 8.856046654389196, + "grad_norm": 0.18279080092906952, + "learning_rate": 3.394579087247235e-06, + "loss": 1.7195, + "step": 28853 + }, + { + "epoch": 8.856353591160222, + "grad_norm": 0.17531390488147736, + "learning_rate": 3.3927790832989247e-06, + "loss": 1.7253, + "step": 28854 + }, + { + "epoch": 8.856660527931247, + "grad_norm": 0.14441180229187012, + "learning_rate": 3.3909795399504783e-06, + "loss": 1.7078, + "step": 28855 + }, + { + "epoch": 8.856967464702272, + "grad_norm": 0.16991926729679108, + "learning_rate": 3.3891804572196816e-06, + "loss": 1.6953, + "step": 28856 + }, + { + "epoch": 8.857274401473296, + "grad_norm": 0.17067831754684448, + "learning_rate": 3.3873818351243426e-06, + "loss": 1.7294, + "step": 28857 + }, + { + "epoch": 8.857581338244321, + "grad_norm": 0.14316415786743164, + "learning_rate": 3.3855836736821967e-06, + "loss": 1.7152, + "step": 28858 + }, + { + "epoch": 8.857888275015346, + "grad_norm": 0.13260309398174286, 
+ "learning_rate": 3.383785972911052e-06, + "loss": 1.6761, + "step": 28859 + }, + { + "epoch": 8.858195211786372, + "grad_norm": 0.12228702753782272, + "learning_rate": 3.3819887328286394e-06, + "loss": 1.6802, + "step": 28860 + }, + { + "epoch": 8.858502148557397, + "grad_norm": 0.18033485114574432, + "learning_rate": 3.3801919534527495e-06, + "loss": 1.7828, + "step": 28861 + }, + { + "epoch": 8.858809085328422, + "grad_norm": 0.1613384336233139, + "learning_rate": 3.3783956348011235e-06, + "loss": 1.7068, + "step": 28862 + }, + { + "epoch": 8.859116022099448, + "grad_norm": 0.19849342107772827, + "learning_rate": 3.3765997768915204e-06, + "loss": 1.7139, + "step": 28863 + }, + { + "epoch": 8.859422958870473, + "grad_norm": 0.1470731794834137, + "learning_rate": 3.3748043797416804e-06, + "loss": 1.7104, + "step": 28864 + }, + { + "epoch": 8.859729895641498, + "grad_norm": 0.15868861973285675, + "learning_rate": 3.373009443369357e-06, + "loss": 1.7662, + "step": 28865 + }, + { + "epoch": 8.860036832412524, + "grad_norm": 0.17230434715747833, + "learning_rate": 3.37121496779228e-06, + "loss": 1.6877, + "step": 28866 + }, + { + "epoch": 8.860343769183547, + "grad_norm": 0.1297665536403656, + "learning_rate": 3.3694209530281905e-06, + "loss": 1.6687, + "step": 28867 + }, + { + "epoch": 8.860650705954573, + "grad_norm": 0.13699746131896973, + "learning_rate": 3.3676273990948136e-06, + "loss": 1.6773, + "step": 28868 + }, + { + "epoch": 8.860957642725598, + "grad_norm": 0.12981395423412323, + "learning_rate": 3.3658343060098685e-06, + "loss": 1.6752, + "step": 28869 + }, + { + "epoch": 8.861264579496623, + "grad_norm": 0.15934717655181885, + "learning_rate": 3.3640416737910794e-06, + "loss": 1.7449, + "step": 28870 + }, + { + "epoch": 8.861571516267649, + "grad_norm": 0.13023978471755981, + "learning_rate": 3.3622495024561827e-06, + "loss": 1.698, + "step": 28871 + }, + { + "epoch": 8.861878453038674, + "grad_norm": 0.14700792729854584, + "learning_rate": 
3.3604577920228585e-06, + "loss": 1.732, + "step": 28872 + }, + { + "epoch": 8.8621853898097, + "grad_norm": 0.1421707421541214, + "learning_rate": 3.3586665425088314e-06, + "loss": 1.7032, + "step": 28873 + }, + { + "epoch": 8.862492326580725, + "grad_norm": 0.1941523402929306, + "learning_rate": 3.356875753931793e-06, + "loss": 1.7407, + "step": 28874 + }, + { + "epoch": 8.86279926335175, + "grad_norm": 0.15837855637073517, + "learning_rate": 3.3550854263094454e-06, + "loss": 1.755, + "step": 28875 + }, + { + "epoch": 8.863106200122775, + "grad_norm": 0.1624121218919754, + "learning_rate": 3.3532955596594916e-06, + "loss": 1.738, + "step": 28876 + }, + { + "epoch": 8.8634131368938, + "grad_norm": 0.15944771468639374, + "learning_rate": 3.3515061539996007e-06, + "loss": 1.6955, + "step": 28877 + }, + { + "epoch": 8.863720073664824, + "grad_norm": 0.17303216457366943, + "learning_rate": 3.349717209347475e-06, + "loss": 1.7012, + "step": 28878 + }, + { + "epoch": 8.86402701043585, + "grad_norm": 0.14601273834705353, + "learning_rate": 3.347928725720789e-06, + "loss": 1.696, + "step": 28879 + }, + { + "epoch": 8.864333947206875, + "grad_norm": 0.1746055781841278, + "learning_rate": 3.3461407031372125e-06, + "loss": 1.6991, + "step": 28880 + }, + { + "epoch": 8.8646408839779, + "grad_norm": 0.12818776071071625, + "learning_rate": 3.3443531416144147e-06, + "loss": 1.6828, + "step": 28881 + }, + { + "epoch": 8.864947820748926, + "grad_norm": 0.12297061085700989, + "learning_rate": 3.3425660411700697e-06, + "loss": 1.6483, + "step": 28882 + }, + { + "epoch": 8.865254757519951, + "grad_norm": 0.1359318494796753, + "learning_rate": 3.3407794018218307e-06, + "loss": 1.7182, + "step": 28883 + }, + { + "epoch": 8.865561694290976, + "grad_norm": 0.11981796473264694, + "learning_rate": 3.3389932235873612e-06, + "loss": 1.6935, + "step": 28884 + }, + { + "epoch": 8.865868631062002, + "grad_norm": 0.1271422654390335, + "learning_rate": 3.337207506484308e-06, + "loss": 1.6776, + 
"step": 28885 + }, + { + "epoch": 8.866175567833027, + "grad_norm": 0.1494673788547516, + "learning_rate": 3.335422250530318e-06, + "loss": 1.7041, + "step": 28886 + }, + { + "epoch": 8.866482504604052, + "grad_norm": 0.15046460926532745, + "learning_rate": 3.3336374557430272e-06, + "loss": 1.6714, + "step": 28887 + }, + { + "epoch": 8.866789441375076, + "grad_norm": 0.17862144112586975, + "learning_rate": 3.331853122140105e-06, + "loss": 1.7805, + "step": 28888 + }, + { + "epoch": 8.867096378146101, + "grad_norm": 0.13172993063926697, + "learning_rate": 3.3300692497391483e-06, + "loss": 1.6841, + "step": 28889 + }, + { + "epoch": 8.867403314917127, + "grad_norm": 0.20627157390117645, + "learning_rate": 3.3282858385578098e-06, + "loss": 1.8127, + "step": 28890 + }, + { + "epoch": 8.867710251688152, + "grad_norm": 0.22035779058933258, + "learning_rate": 3.326502888613697e-06, + "loss": 1.7813, + "step": 28891 + }, + { + "epoch": 8.868017188459177, + "grad_norm": 0.15250372886657715, + "learning_rate": 3.3247203999244358e-06, + "loss": 1.7192, + "step": 28892 + }, + { + "epoch": 8.868324125230203, + "grad_norm": 0.1745261251926422, + "learning_rate": 3.3229383725076614e-06, + "loss": 1.72, + "step": 28893 + }, + { + "epoch": 8.868631062001228, + "grad_norm": 0.1768372803926468, + "learning_rate": 3.3211568063809483e-06, + "loss": 1.7582, + "step": 28894 + }, + { + "epoch": 8.868937998772253, + "grad_norm": 0.14829827845096588, + "learning_rate": 3.3193757015619443e-06, + "loss": 1.6749, + "step": 28895 + }, + { + "epoch": 8.869244935543279, + "grad_norm": 0.13321566581726074, + "learning_rate": 3.3175950580682123e-06, + "loss": 1.6854, + "step": 28896 + }, + { + "epoch": 8.869551872314304, + "grad_norm": 0.12003330886363983, + "learning_rate": 3.315814875917372e-06, + "loss": 1.6611, + "step": 28897 + }, + { + "epoch": 8.86985880908533, + "grad_norm": 0.1468251645565033, + "learning_rate": 3.3140351551270157e-06, + "loss": 1.6674, + "step": 28898 + }, + { + "epoch": 
8.870165745856355, + "grad_norm": 0.2222270667552948, + "learning_rate": 3.312255895714722e-06, + "loss": 1.6472, + "step": 28899 + }, + { + "epoch": 8.870472682627378, + "grad_norm": 0.14377200603485107, + "learning_rate": 3.3104770976980836e-06, + "loss": 1.6835, + "step": 28900 + }, + { + "epoch": 8.870779619398403, + "grad_norm": 0.19064709544181824, + "learning_rate": 3.3086987610946807e-06, + "loss": 1.7172, + "step": 28901 + }, + { + "epoch": 8.871086556169429, + "grad_norm": 0.21035094559192657, + "learning_rate": 3.306920885922077e-06, + "loss": 1.7199, + "step": 28902 + }, + { + "epoch": 8.871393492940454, + "grad_norm": 0.1529282182455063, + "learning_rate": 3.3051434721978526e-06, + "loss": 1.672, + "step": 28903 + }, + { + "epoch": 8.87170042971148, + "grad_norm": 0.13990004360675812, + "learning_rate": 3.3033665199395546e-06, + "loss": 1.7204, + "step": 28904 + }, + { + "epoch": 8.872007366482505, + "grad_norm": 0.20450010895729065, + "learning_rate": 3.3015900291647805e-06, + "loss": 1.7619, + "step": 28905 + }, + { + "epoch": 8.87231430325353, + "grad_norm": 0.13215813040733337, + "learning_rate": 3.2998139998910547e-06, + "loss": 1.6999, + "step": 28906 + }, + { + "epoch": 8.872621240024555, + "grad_norm": 0.12693628668785095, + "learning_rate": 3.2980384321359413e-06, + "loss": 1.7075, + "step": 28907 + }, + { + "epoch": 8.87292817679558, + "grad_norm": 0.1447865515947342, + "learning_rate": 3.2962633259169817e-06, + "loss": 1.697, + "step": 28908 + }, + { + "epoch": 8.873235113566606, + "grad_norm": 0.16820397973060608, + "learning_rate": 3.2944886812517173e-06, + "loss": 1.7087, + "step": 28909 + }, + { + "epoch": 8.87354205033763, + "grad_norm": 0.12102416902780533, + "learning_rate": 3.2927144981577007e-06, + "loss": 1.6655, + "step": 28910 + }, + { + "epoch": 8.873848987108655, + "grad_norm": 0.17087550461292267, + "learning_rate": 3.290940776652446e-06, + "loss": 1.7518, + "step": 28911 + }, + { + "epoch": 8.87415592387968, + "grad_norm": 
0.15695004165172577, + "learning_rate": 3.2891675167535054e-06, + "loss": 1.6848, + "step": 28912 + }, + { + "epoch": 8.874462860650706, + "grad_norm": 0.16303250193595886, + "learning_rate": 3.2873947184783705e-06, + "loss": 1.7705, + "step": 28913 + }, + { + "epoch": 8.874769797421731, + "grad_norm": 0.1679360568523407, + "learning_rate": 3.2856223818445885e-06, + "loss": 1.6923, + "step": 28914 + }, + { + "epoch": 8.875076734192756, + "grad_norm": 0.1721598356962204, + "learning_rate": 3.283850506869668e-06, + "loss": 1.7164, + "step": 28915 + }, + { + "epoch": 8.875383670963782, + "grad_norm": 0.14126230776309967, + "learning_rate": 3.2820790935711223e-06, + "loss": 1.6794, + "step": 28916 + }, + { + "epoch": 8.875690607734807, + "grad_norm": 0.14232057332992554, + "learning_rate": 3.2803081419664484e-06, + "loss": 1.6844, + "step": 28917 + }, + { + "epoch": 8.875997544505832, + "grad_norm": 0.15812624990940094, + "learning_rate": 3.278537652073149e-06, + "loss": 1.6951, + "step": 28918 + }, + { + "epoch": 8.876304481276858, + "grad_norm": 0.15904119610786438, + "learning_rate": 3.276767623908733e-06, + "loss": 1.6761, + "step": 28919 + }, + { + "epoch": 8.876611418047883, + "grad_norm": 0.18227824568748474, + "learning_rate": 3.2749980574906803e-06, + "loss": 1.7714, + "step": 28920 + }, + { + "epoch": 8.876918354818907, + "grad_norm": 0.1715840995311737, + "learning_rate": 3.2732289528364766e-06, + "loss": 1.7491, + "step": 28921 + }, + { + "epoch": 8.877225291589932, + "grad_norm": 0.15899239480495453, + "learning_rate": 3.2714603099636256e-06, + "loss": 1.7188, + "step": 28922 + }, + { + "epoch": 8.877532228360957, + "grad_norm": 0.14183032512664795, + "learning_rate": 3.269692128889584e-06, + "loss": 1.71, + "step": 28923 + }, + { + "epoch": 8.877839165131983, + "grad_norm": 0.145817831158638, + "learning_rate": 3.2679244096318396e-06, + "loss": 1.7475, + "step": 28924 + }, + { + "epoch": 8.878146101903008, + "grad_norm": 0.20818611979484558, + 
"learning_rate": 3.2661571522078493e-06, + "loss": 1.7292, + "step": 28925 + }, + { + "epoch": 8.878453038674033, + "grad_norm": 0.18658684194087982, + "learning_rate": 3.264390356635083e-06, + "loss": 1.7588, + "step": 28926 + }, + { + "epoch": 8.878759975445059, + "grad_norm": 0.14851678907871246, + "learning_rate": 3.2626240229310214e-06, + "loss": 1.7177, + "step": 28927 + }, + { + "epoch": 8.879066912216084, + "grad_norm": 0.14433394372463226, + "learning_rate": 3.260858151113083e-06, + "loss": 1.7033, + "step": 28928 + }, + { + "epoch": 8.87937384898711, + "grad_norm": 0.18791940808296204, + "learning_rate": 3.2590927411987547e-06, + "loss": 1.7142, + "step": 28929 + }, + { + "epoch": 8.879680785758135, + "grad_norm": 0.15765266120433807, + "learning_rate": 3.2573277932054504e-06, + "loss": 1.7294, + "step": 28930 + }, + { + "epoch": 8.879987722529158, + "grad_norm": 0.17016790807247162, + "learning_rate": 3.255563307150644e-06, + "loss": 1.7263, + "step": 28931 + }, + { + "epoch": 8.880294659300183, + "grad_norm": 0.18677684664726257, + "learning_rate": 3.2537992830517505e-06, + "loss": 1.708, + "step": 28932 + }, + { + "epoch": 8.880601596071209, + "grad_norm": 0.13736851513385773, + "learning_rate": 3.2520357209262165e-06, + "loss": 1.6971, + "step": 28933 + }, + { + "epoch": 8.880908532842234, + "grad_norm": 0.15366335213184357, + "learning_rate": 3.250272620791467e-06, + "loss": 1.7093, + "step": 28934 + }, + { + "epoch": 8.88121546961326, + "grad_norm": 0.15538384020328522, + "learning_rate": 3.248509982664921e-06, + "loss": 1.7036, + "step": 28935 + }, + { + "epoch": 8.881522406384285, + "grad_norm": 0.137898787856102, + "learning_rate": 3.2467478065639988e-06, + "loss": 1.6654, + "step": 28936 + }, + { + "epoch": 8.88182934315531, + "grad_norm": 0.15095695853233337, + "learning_rate": 3.244986092506125e-06, + "loss": 1.736, + "step": 28937 + }, + { + "epoch": 8.882136279926335, + "grad_norm": 0.15554696321487427, + "learning_rate": 
3.2432248405086908e-06, + "loss": 1.7172, + "step": 28938 + }, + { + "epoch": 8.88244321669736, + "grad_norm": 0.18302778899669647, + "learning_rate": 3.241464050589127e-06, + "loss": 1.7441, + "step": 28939 + }, + { + "epoch": 8.882750153468386, + "grad_norm": 0.18259480595588684, + "learning_rate": 3.2397037227648142e-06, + "loss": 1.6983, + "step": 28940 + }, + { + "epoch": 8.883057090239411, + "grad_norm": 0.14723163843154907, + "learning_rate": 3.2379438570531608e-06, + "loss": 1.7007, + "step": 28941 + }, + { + "epoch": 8.883364027010435, + "grad_norm": 0.1403069794178009, + "learning_rate": 3.2361844534715524e-06, + "loss": 1.6545, + "step": 28942 + }, + { + "epoch": 8.88367096378146, + "grad_norm": 0.1433728039264679, + "learning_rate": 3.2344255120373644e-06, + "loss": 1.6977, + "step": 28943 + }, + { + "epoch": 8.883977900552486, + "grad_norm": 0.18680740892887115, + "learning_rate": 3.2326670327680165e-06, + "loss": 1.756, + "step": 28944 + }, + { + "epoch": 8.884284837323511, + "grad_norm": 0.13080160319805145, + "learning_rate": 3.2309090156808498e-06, + "loss": 1.703, + "step": 28945 + }, + { + "epoch": 8.884591774094536, + "grad_norm": 0.126779243350029, + "learning_rate": 3.2291514607932616e-06, + "loss": 1.6717, + "step": 28946 + }, + { + "epoch": 8.884898710865562, + "grad_norm": 0.15787595510482788, + "learning_rate": 3.2273943681225992e-06, + "loss": 1.7005, + "step": 28947 + }, + { + "epoch": 8.885205647636587, + "grad_norm": 0.13189679384231567, + "learning_rate": 3.225637737686249e-06, + "loss": 1.6599, + "step": 28948 + }, + { + "epoch": 8.885512584407612, + "grad_norm": 0.13954944908618927, + "learning_rate": 3.2238815695015635e-06, + "loss": 1.7261, + "step": 28949 + }, + { + "epoch": 8.885819521178638, + "grad_norm": 0.2115267813205719, + "learning_rate": 3.2221258635858897e-06, + "loss": 1.7459, + "step": 28950 + }, + { + "epoch": 8.886126457949663, + "grad_norm": 0.15017318725585938, + "learning_rate": 3.220370619956592e-06, + "loss": 
1.6929, + "step": 28951 + }, + { + "epoch": 8.886433394720687, + "grad_norm": 0.16980741918087006, + "learning_rate": 3.218615838631006e-06, + "loss": 1.802, + "step": 28952 + }, + { + "epoch": 8.886740331491712, + "grad_norm": 0.1366024613380432, + "learning_rate": 3.216861519626485e-06, + "loss": 1.6886, + "step": 28953 + }, + { + "epoch": 8.887047268262737, + "grad_norm": 0.16248583793640137, + "learning_rate": 3.2151076629603537e-06, + "loss": 1.6992, + "step": 28954 + }, + { + "epoch": 8.887354205033763, + "grad_norm": 0.1727447360754013, + "learning_rate": 3.213354268649943e-06, + "loss": 1.7412, + "step": 28955 + }, + { + "epoch": 8.887661141804788, + "grad_norm": 0.12872622907161713, + "learning_rate": 3.2116013367125996e-06, + "loss": 1.641, + "step": 28956 + }, + { + "epoch": 8.887968078575813, + "grad_norm": 0.12361441552639008, + "learning_rate": 3.2098488671656323e-06, + "loss": 1.6764, + "step": 28957 + }, + { + "epoch": 8.888275015346839, + "grad_norm": 0.1612539142370224, + "learning_rate": 3.2080968600263604e-06, + "loss": 1.6646, + "step": 28958 + }, + { + "epoch": 8.888581952117864, + "grad_norm": 0.15859587490558624, + "learning_rate": 3.2063453153121035e-06, + "loss": 1.6981, + "step": 28959 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.12860243022441864, + "learning_rate": 3.204594233040159e-06, + "loss": 1.6645, + "step": 28960 + }, + { + "epoch": 8.889195825659915, + "grad_norm": 0.232563316822052, + "learning_rate": 3.202843613227857e-06, + "loss": 1.6965, + "step": 28961 + }, + { + "epoch": 8.88950276243094, + "grad_norm": 0.15783043205738068, + "learning_rate": 3.2010934558924676e-06, + "loss": 1.7294, + "step": 28962 + }, + { + "epoch": 8.889809699201965, + "grad_norm": 0.13369722664356232, + "learning_rate": 3.199343761051321e-06, + "loss": 1.6778, + "step": 28963 + }, + { + "epoch": 8.890116635972989, + "grad_norm": 0.14463269710540771, + "learning_rate": 3.1975945287216756e-06, + "loss": 1.7211, + "step": 28964 + }, + { + 
"epoch": 8.890423572744014, + "grad_norm": 0.22744107246398926, + "learning_rate": 3.1958457589208346e-06, + "loss": 1.7234, + "step": 28965 + }, + { + "epoch": 8.89073050951504, + "grad_norm": 0.17402450740337372, + "learning_rate": 3.1940974516660836e-06, + "loss": 1.7355, + "step": 28966 + }, + { + "epoch": 8.891037446286065, + "grad_norm": 0.14022772014141083, + "learning_rate": 3.1923496069746927e-06, + "loss": 1.7029, + "step": 28967 + }, + { + "epoch": 8.89134438305709, + "grad_norm": 0.18977795541286469, + "learning_rate": 3.1906022248639368e-06, + "loss": 1.7213, + "step": 28968 + }, + { + "epoch": 8.891651319828116, + "grad_norm": 0.11371618509292603, + "learning_rate": 3.1888553053510905e-06, + "loss": 1.6521, + "step": 28969 + }, + { + "epoch": 8.89195825659914, + "grad_norm": 0.16720212996006012, + "learning_rate": 3.1871088484534073e-06, + "loss": 1.7186, + "step": 28970 + }, + { + "epoch": 8.892265193370166, + "grad_norm": 0.1317000538110733, + "learning_rate": 3.1853628541881563e-06, + "loss": 1.6905, + "step": 28971 + }, + { + "epoch": 8.892572130141192, + "grad_norm": 0.15759915113449097, + "learning_rate": 3.1836173225725797e-06, + "loss": 1.7293, + "step": 28972 + }, + { + "epoch": 8.892879066912217, + "grad_norm": 0.1597949117422104, + "learning_rate": 3.181872253623952e-06, + "loss": 1.6696, + "step": 28973 + }, + { + "epoch": 8.89318600368324, + "grad_norm": 0.12234945595264435, + "learning_rate": 3.1801276473594934e-06, + "loss": 1.7154, + "step": 28974 + }, + { + "epoch": 8.893492940454266, + "grad_norm": 0.12929682433605194, + "learning_rate": 3.1783835037964616e-06, + "loss": 1.7071, + "step": 28975 + }, + { + "epoch": 8.893799877225291, + "grad_norm": 0.1875714361667633, + "learning_rate": 3.176639822952082e-06, + "loss": 1.7708, + "step": 28976 + }, + { + "epoch": 8.894106813996316, + "grad_norm": 0.13817653059959412, + "learning_rate": 3.1748966048435858e-06, + "loss": 1.6894, + "step": 28977 + }, + { + "epoch": 8.894413750767342, + 
"grad_norm": 0.16731882095336914, + "learning_rate": 3.1731538494882198e-06, + "loss": 1.7706, + "step": 28978 + }, + { + "epoch": 8.894720687538367, + "grad_norm": 0.16811375319957733, + "learning_rate": 3.171411556903181e-06, + "loss": 1.7372, + "step": 28979 + }, + { + "epoch": 8.895027624309392, + "grad_norm": 0.11702638864517212, + "learning_rate": 3.1696697271057117e-06, + "loss": 1.6523, + "step": 28980 + }, + { + "epoch": 8.895334561080418, + "grad_norm": 0.12287343293428421, + "learning_rate": 3.1679283601130037e-06, + "loss": 1.6938, + "step": 28981 + }, + { + "epoch": 8.895641497851443, + "grad_norm": 0.10473133623600006, + "learning_rate": 3.166187455942282e-06, + "loss": 1.6731, + "step": 28982 + }, + { + "epoch": 8.895948434622468, + "grad_norm": 0.13022342324256897, + "learning_rate": 3.164447014610744e-06, + "loss": 1.679, + "step": 28983 + }, + { + "epoch": 8.896255371393494, + "grad_norm": 0.16077135503292084, + "learning_rate": 3.1627070361355925e-06, + "loss": 1.7466, + "step": 28984 + }, + { + "epoch": 8.896562308164517, + "grad_norm": 0.14103242754936218, + "learning_rate": 3.160967520534025e-06, + "loss": 1.6936, + "step": 28985 + }, + { + "epoch": 8.896869244935543, + "grad_norm": 0.12953349947929382, + "learning_rate": 3.1592284678232277e-06, + "loss": 1.7125, + "step": 28986 + }, + { + "epoch": 8.897176181706568, + "grad_norm": 0.11083797365427017, + "learning_rate": 3.157489878020392e-06, + "loss": 1.6455, + "step": 28987 + }, + { + "epoch": 8.897483118477593, + "grad_norm": 0.12037435173988342, + "learning_rate": 3.1557517511426936e-06, + "loss": 1.6569, + "step": 28988 + }, + { + "epoch": 8.897790055248619, + "grad_norm": 0.17309941351413727, + "learning_rate": 3.154014087207302e-06, + "loss": 1.7142, + "step": 28989 + }, + { + "epoch": 8.898096992019644, + "grad_norm": 0.15349642932415009, + "learning_rate": 3.15227688623142e-06, + "loss": 1.7375, + "step": 28990 + }, + { + "epoch": 8.89840392879067, + "grad_norm": 0.175978422164917, + 
"learning_rate": 3.1505401482321896e-06, + "loss": 1.7023, + "step": 28991 + }, + { + "epoch": 8.898710865561695, + "grad_norm": 0.13710327446460724, + "learning_rate": 3.14880387322678e-06, + "loss": 1.6462, + "step": 28992 + }, + { + "epoch": 8.89901780233272, + "grad_norm": 0.11777636408805847, + "learning_rate": 3.14706806123235e-06, + "loss": 1.6187, + "step": 28993 + }, + { + "epoch": 8.899324739103745, + "grad_norm": 0.1707836240530014, + "learning_rate": 3.145332712266047e-06, + "loss": 1.7314, + "step": 28994 + }, + { + "epoch": 8.899631675874769, + "grad_norm": 0.15286721289157867, + "learning_rate": 3.143597826345046e-06, + "loss": 1.6874, + "step": 28995 + }, + { + "epoch": 8.899938612645794, + "grad_norm": 0.1401689052581787, + "learning_rate": 3.141863403486456e-06, + "loss": 1.6795, + "step": 28996 + }, + { + "epoch": 8.90024554941682, + "grad_norm": 0.13194917142391205, + "learning_rate": 3.1401294437074512e-06, + "loss": 1.6967, + "step": 28997 + }, + { + "epoch": 8.900552486187845, + "grad_norm": 0.1518833339214325, + "learning_rate": 3.1383959470251413e-06, + "loss": 1.6914, + "step": 28998 + }, + { + "epoch": 8.90085942295887, + "grad_norm": 0.12354082614183426, + "learning_rate": 3.1366629134566727e-06, + "loss": 1.6809, + "step": 28999 + }, + { + "epoch": 8.901166359729896, + "grad_norm": 0.2156827449798584, + "learning_rate": 3.1349303430191712e-06, + "loss": 1.7617, + "step": 29000 + }, + { + "epoch": 8.901473296500921, + "grad_norm": 0.15934047102928162, + "learning_rate": 3.133198235729756e-06, + "loss": 1.7443, + "step": 29001 + }, + { + "epoch": 8.901780233271946, + "grad_norm": 0.13422276079654694, + "learning_rate": 3.1314665916055473e-06, + "loss": 1.7238, + "step": 29002 + }, + { + "epoch": 8.902087170042972, + "grad_norm": 0.1727958619594574, + "learning_rate": 3.1297354106636535e-06, + "loss": 1.7208, + "step": 29003 + }, + { + "epoch": 8.902394106813997, + "grad_norm": 0.14110971987247467, + "learning_rate": 
3.1280046929211827e-06, + "loss": 1.6586, + "step": 29004 + }, + { + "epoch": 8.902701043585022, + "grad_norm": 0.1527067869901657, + "learning_rate": 3.126274438395249e-06, + "loss": 1.6908, + "step": 29005 + }, + { + "epoch": 8.903007980356048, + "grad_norm": 0.1663844734430313, + "learning_rate": 3.1245446471029392e-06, + "loss": 1.7263, + "step": 29006 + }, + { + "epoch": 8.903314917127071, + "grad_norm": 0.23200902342796326, + "learning_rate": 3.1228153190613563e-06, + "loss": 1.7564, + "step": 29007 + }, + { + "epoch": 8.903621853898096, + "grad_norm": 0.1557004153728485, + "learning_rate": 3.1210864542875917e-06, + "loss": 1.721, + "step": 29008 + }, + { + "epoch": 8.903928790669122, + "grad_norm": 0.1682535856962204, + "learning_rate": 3.1193580527987208e-06, + "loss": 1.7244, + "step": 29009 + }, + { + "epoch": 8.904235727440147, + "grad_norm": 0.17813025414943695, + "learning_rate": 3.117630114611836e-06, + "loss": 1.6873, + "step": 29010 + }, + { + "epoch": 8.904542664211172, + "grad_norm": 0.16720467805862427, + "learning_rate": 3.1159026397440007e-06, + "loss": 1.7588, + "step": 29011 + }, + { + "epoch": 8.904849600982198, + "grad_norm": 0.12350224703550339, + "learning_rate": 3.114175628212307e-06, + "loss": 1.6641, + "step": 29012 + }, + { + "epoch": 8.905156537753223, + "grad_norm": 0.16594655811786652, + "learning_rate": 3.112449080033797e-06, + "loss": 1.6896, + "step": 29013 + }, + { + "epoch": 8.905463474524248, + "grad_norm": 0.11925587058067322, + "learning_rate": 3.110722995225562e-06, + "loss": 1.6751, + "step": 29014 + }, + { + "epoch": 8.905770411295274, + "grad_norm": 0.15165284276008606, + "learning_rate": 3.108997373804634e-06, + "loss": 1.6983, + "step": 29015 + }, + { + "epoch": 8.9060773480663, + "grad_norm": 0.1934432089328766, + "learning_rate": 3.107272215788082e-06, + "loss": 1.6972, + "step": 29016 + }, + { + "epoch": 8.906384284837323, + "grad_norm": 0.1574355512857437, + "learning_rate": 3.1055475211929474e-06, + "loss": 
1.751, + "step": 29017 + }, + { + "epoch": 8.906691221608348, + "grad_norm": 0.17686793208122253, + "learning_rate": 3.1038232900362787e-06, + "loss": 1.7705, + "step": 29018 + }, + { + "epoch": 8.906998158379373, + "grad_norm": 0.20089837908744812, + "learning_rate": 3.102099522335117e-06, + "loss": 1.8083, + "step": 29019 + }, + { + "epoch": 8.907305095150399, + "grad_norm": 0.1398555189371109, + "learning_rate": 3.1003762181064986e-06, + "loss": 1.7181, + "step": 29020 + }, + { + "epoch": 8.907612031921424, + "grad_norm": 0.14177222549915314, + "learning_rate": 3.09865337736745e-06, + "loss": 1.671, + "step": 29021 + }, + { + "epoch": 8.90791896869245, + "grad_norm": 0.17582249641418457, + "learning_rate": 3.0969310001349948e-06, + "loss": 1.7112, + "step": 29022 + }, + { + "epoch": 8.908225905463475, + "grad_norm": 0.16887766122817993, + "learning_rate": 3.0952090864261594e-06, + "loss": 1.7281, + "step": 29023 + }, + { + "epoch": 8.9085328422345, + "grad_norm": 0.1768682301044464, + "learning_rate": 3.093487636257958e-06, + "loss": 1.6584, + "step": 29024 + }, + { + "epoch": 8.908839779005525, + "grad_norm": 0.15997330844402313, + "learning_rate": 3.0917666496474095e-06, + "loss": 1.7051, + "step": 29025 + }, + { + "epoch": 8.90914671577655, + "grad_norm": 0.16596661508083344, + "learning_rate": 3.0900461266115124e-06, + "loss": 1.6899, + "step": 29026 + }, + { + "epoch": 8.909453652547576, + "grad_norm": 0.1477203071117401, + "learning_rate": 3.088326067167274e-06, + "loss": 1.6982, + "step": 29027 + }, + { + "epoch": 8.9097605893186, + "grad_norm": 0.170956552028656, + "learning_rate": 3.086606471331699e-06, + "loss": 1.6561, + "step": 29028 + }, + { + "epoch": 8.910067526089625, + "grad_norm": 0.1777859330177307, + "learning_rate": 3.0848873391217727e-06, + "loss": 1.7638, + "step": 29029 + }, + { + "epoch": 8.91037446286065, + "grad_norm": 0.20077209174633026, + "learning_rate": 3.083168670554476e-06, + "loss": 1.7588, + "step": 29030 + }, + { + "epoch": 
8.910681399631676, + "grad_norm": 0.15471714735031128, + "learning_rate": 3.0814504656468234e-06, + "loss": 1.682, + "step": 29031 + }, + { + "epoch": 8.910988336402701, + "grad_norm": 0.1711329072713852, + "learning_rate": 3.0797327244157624e-06, + "loss": 1.6883, + "step": 29032 + }, + { + "epoch": 8.911295273173726, + "grad_norm": 0.11440590023994446, + "learning_rate": 3.0780154468782905e-06, + "loss": 1.6861, + "step": 29033 + }, + { + "epoch": 8.911602209944752, + "grad_norm": 0.15305832028388977, + "learning_rate": 3.0762986330513722e-06, + "loss": 1.7208, + "step": 29034 + }, + { + "epoch": 8.911909146715777, + "grad_norm": 0.13767275214195251, + "learning_rate": 3.0745822829519766e-06, + "loss": 1.7319, + "step": 29035 + }, + { + "epoch": 8.912216083486802, + "grad_norm": 0.15172621607780457, + "learning_rate": 3.0728663965970573e-06, + "loss": 1.7003, + "step": 29036 + }, + { + "epoch": 8.912523020257828, + "grad_norm": 0.16932672262191772, + "learning_rate": 3.071150974003578e-06, + "loss": 1.709, + "step": 29037 + }, + { + "epoch": 8.912829957028851, + "grad_norm": 0.13176152110099792, + "learning_rate": 3.069436015188493e-06, + "loss": 1.6714, + "step": 29038 + }, + { + "epoch": 8.913136893799877, + "grad_norm": 0.17337891459465027, + "learning_rate": 3.067721520168748e-06, + "loss": 1.7786, + "step": 29039 + }, + { + "epoch": 8.913443830570902, + "grad_norm": 0.12546442449092865, + "learning_rate": 3.0660074889612867e-06, + "loss": 1.7219, + "step": 29040 + }, + { + "epoch": 8.913750767341927, + "grad_norm": 0.21087953448295593, + "learning_rate": 3.0642939215830444e-06, + "loss": 1.7541, + "step": 29041 + }, + { + "epoch": 8.914057704112953, + "grad_norm": 0.16880549490451813, + "learning_rate": 3.062580818050964e-06, + "loss": 1.7299, + "step": 29042 + }, + { + "epoch": 8.914364640883978, + "grad_norm": 0.15600517392158508, + "learning_rate": 3.0608681783819705e-06, + "loss": 1.6801, + "step": 29043 + }, + { + "epoch": 8.914671577655003, + 
"grad_norm": 0.11458457261323929, + "learning_rate": 3.059156002592989e-06, + "loss": 1.6393, + "step": 29044 + }, + { + "epoch": 8.914978514426029, + "grad_norm": 0.15529881417751312, + "learning_rate": 3.0574442907009393e-06, + "loss": 1.7288, + "step": 29045 + }, + { + "epoch": 8.915285451197054, + "grad_norm": 0.15211673080921173, + "learning_rate": 3.0557330427227415e-06, + "loss": 1.6784, + "step": 29046 + }, + { + "epoch": 8.91559238796808, + "grad_norm": 0.13714905083179474, + "learning_rate": 3.054022258675293e-06, + "loss": 1.7047, + "step": 29047 + }, + { + "epoch": 8.915899324739105, + "grad_norm": 0.1595524698495865, + "learning_rate": 3.0523119385755304e-06, + "loss": 1.722, + "step": 29048 + }, + { + "epoch": 8.91620626151013, + "grad_norm": 0.16744185984134674, + "learning_rate": 3.0506020824403235e-06, + "loss": 1.6754, + "step": 29049 + }, + { + "epoch": 8.916513198281153, + "grad_norm": 0.13333237171173096, + "learning_rate": 3.048892690286598e-06, + "loss": 1.7332, + "step": 29050 + }, + { + "epoch": 8.916820135052179, + "grad_norm": 0.19067470729351044, + "learning_rate": 3.0471837621312228e-06, + "loss": 1.7034, + "step": 29051 + }, + { + "epoch": 8.917127071823204, + "grad_norm": 0.1292569637298584, + "learning_rate": 3.0454752979911018e-06, + "loss": 1.652, + "step": 29052 + }, + { + "epoch": 8.91743400859423, + "grad_norm": 0.15452222526073456, + "learning_rate": 3.0437672978831155e-06, + "loss": 1.7183, + "step": 29053 + }, + { + "epoch": 8.917740945365255, + "grad_norm": 0.16528162360191345, + "learning_rate": 3.04205976182414e-06, + "loss": 1.7099, + "step": 29054 + }, + { + "epoch": 8.91804788213628, + "grad_norm": 0.22729776799678802, + "learning_rate": 3.0403526898310553e-06, + "loss": 1.7353, + "step": 29055 + }, + { + "epoch": 8.918354818907305, + "grad_norm": 0.134805828332901, + "learning_rate": 3.038646081920732e-06, + "loss": 1.6975, + "step": 29056 + }, + { + "epoch": 8.91866175567833, + "grad_norm": 0.15781652927398682, + 
"learning_rate": 3.0369399381100282e-06, + "loss": 1.7197, + "step": 29057 + }, + { + "epoch": 8.918968692449356, + "grad_norm": 0.19794493913650513, + "learning_rate": 3.0352342584158146e-06, + "loss": 1.6894, + "step": 29058 + }, + { + "epoch": 8.919275629220381, + "grad_norm": 0.14306722581386566, + "learning_rate": 3.033529042854938e-06, + "loss": 1.6885, + "step": 29059 + }, + { + "epoch": 8.919582565991405, + "grad_norm": 0.1341150999069214, + "learning_rate": 3.0318242914442574e-06, + "loss": 1.7154, + "step": 29060 + }, + { + "epoch": 8.91988950276243, + "grad_norm": 0.2001344859600067, + "learning_rate": 3.0301200042006208e-06, + "loss": 1.7537, + "step": 29061 + }, + { + "epoch": 8.920196439533456, + "grad_norm": 0.22544899582862854, + "learning_rate": 3.028416181140864e-06, + "loss": 1.7656, + "step": 29062 + }, + { + "epoch": 8.920503376304481, + "grad_norm": 0.13061828911304474, + "learning_rate": 3.0267128222818298e-06, + "loss": 1.6929, + "step": 29063 + }, + { + "epoch": 8.920810313075506, + "grad_norm": 0.19021448493003845, + "learning_rate": 3.025009927640349e-06, + "loss": 1.7858, + "step": 29064 + }, + { + "epoch": 8.921117249846532, + "grad_norm": 0.15748682618141174, + "learning_rate": 3.023307497233263e-06, + "loss": 1.6983, + "step": 29065 + }, + { + "epoch": 8.921424186617557, + "grad_norm": 0.20138932764530182, + "learning_rate": 3.0216055310773704e-06, + "loss": 1.7891, + "step": 29066 + }, + { + "epoch": 8.921731123388582, + "grad_norm": 0.11930065602064133, + "learning_rate": 3.0199040291895242e-06, + "loss": 1.6733, + "step": 29067 + }, + { + "epoch": 8.922038060159608, + "grad_norm": 0.17451462149620056, + "learning_rate": 3.0182029915865107e-06, + "loss": 1.717, + "step": 29068 + }, + { + "epoch": 8.922344996930633, + "grad_norm": 0.13890404999256134, + "learning_rate": 3.0165024182851553e-06, + "loss": 1.6821, + "step": 29069 + }, + { + "epoch": 8.922651933701658, + "grad_norm": 0.15502439439296722, + "learning_rate": 
3.0148023093022613e-06, + "loss": 1.6746, + "step": 29070 + }, + { + "epoch": 8.922958870472682, + "grad_norm": 0.14066965878009796, + "learning_rate": 3.013102664654627e-06, + "loss": 1.6979, + "step": 29071 + }, + { + "epoch": 8.923265807243707, + "grad_norm": 0.15466643869876862, + "learning_rate": 3.01140348435906e-06, + "loss": 1.7306, + "step": 29072 + }, + { + "epoch": 8.923572744014733, + "grad_norm": 0.15576320886611938, + "learning_rate": 3.0097047684323363e-06, + "loss": 1.7241, + "step": 29073 + }, + { + "epoch": 8.923879680785758, + "grad_norm": 0.15748077630996704, + "learning_rate": 3.008006516891254e-06, + "loss": 1.7053, + "step": 29074 + }, + { + "epoch": 8.924186617556783, + "grad_norm": 0.19139769673347473, + "learning_rate": 3.0063087297525995e-06, + "loss": 1.7361, + "step": 29075 + }, + { + "epoch": 8.924493554327809, + "grad_norm": 0.12561291456222534, + "learning_rate": 3.0046114070331423e-06, + "loss": 1.6982, + "step": 29076 + }, + { + "epoch": 8.924800491098834, + "grad_norm": 0.140936940908432, + "learning_rate": 3.002914548749658e-06, + "loss": 1.66, + "step": 29077 + }, + { + "epoch": 8.92510742786986, + "grad_norm": 0.19634532928466797, + "learning_rate": 3.001218154918922e-06, + "loss": 1.6947, + "step": 29078 + }, + { + "epoch": 8.925414364640885, + "grad_norm": 0.1971811205148697, + "learning_rate": 2.999522225557694e-06, + "loss": 1.7133, + "step": 29079 + }, + { + "epoch": 8.92572130141191, + "grad_norm": 0.15782490372657776, + "learning_rate": 2.9978267606827314e-06, + "loss": 1.6724, + "step": 29080 + }, + { + "epoch": 8.926028238182933, + "grad_norm": 0.1563064008951187, + "learning_rate": 2.9961317603107887e-06, + "loss": 1.7942, + "step": 29081 + }, + { + "epoch": 8.926335174953959, + "grad_norm": 0.1192200556397438, + "learning_rate": 2.994437224458635e-06, + "loss": 1.6736, + "step": 29082 + }, + { + "epoch": 8.926642111724984, + "grad_norm": 0.14355097711086273, + "learning_rate": 2.9927431531429905e-06, + "loss": 
1.6968, + "step": 29083 + }, + { + "epoch": 8.92694904849601, + "grad_norm": 0.17257769405841827, + "learning_rate": 2.9910495463806255e-06, + "loss": 1.7353, + "step": 29084 + }, + { + "epoch": 8.927255985267035, + "grad_norm": 0.16805051267147064, + "learning_rate": 2.9893564041882484e-06, + "loss": 1.7711, + "step": 29085 + }, + { + "epoch": 8.92756292203806, + "grad_norm": 0.123812235891819, + "learning_rate": 2.9876637265826123e-06, + "loss": 1.6197, + "step": 29086 + }, + { + "epoch": 8.927869858809085, + "grad_norm": 0.38423335552215576, + "learning_rate": 2.985971513580432e-06, + "loss": 1.726, + "step": 29087 + }, + { + "epoch": 8.92817679558011, + "grad_norm": 0.14887484908103943, + "learning_rate": 2.9842797651984443e-06, + "loss": 1.7067, + "step": 29088 + }, + { + "epoch": 8.928483732351136, + "grad_norm": 0.17092695832252502, + "learning_rate": 2.982588481453358e-06, + "loss": 1.6883, + "step": 29089 + }, + { + "epoch": 8.928790669122161, + "grad_norm": 0.1591298133134842, + "learning_rate": 2.9808976623618867e-06, + "loss": 1.7219, + "step": 29090 + }, + { + "epoch": 8.929097605893187, + "grad_norm": 0.17864398658275604, + "learning_rate": 2.979207307940746e-06, + "loss": 1.7378, + "step": 29091 + }, + { + "epoch": 8.92940454266421, + "grad_norm": 0.15053904056549072, + "learning_rate": 2.977517418206638e-06, + "loss": 1.679, + "step": 29092 + }, + { + "epoch": 8.929711479435236, + "grad_norm": 0.15586422383785248, + "learning_rate": 2.975827993176267e-06, + "loss": 1.7276, + "step": 29093 + }, + { + "epoch": 8.930018416206261, + "grad_norm": 0.13955895602703094, + "learning_rate": 2.9741390328663243e-06, + "loss": 1.6727, + "step": 29094 + }, + { + "epoch": 8.930325352977286, + "grad_norm": 0.15469470620155334, + "learning_rate": 2.9724505372934973e-06, + "loss": 1.6993, + "step": 29095 + }, + { + "epoch": 8.930632289748312, + "grad_norm": 0.13510502874851227, + "learning_rate": 2.970762506474484e-06, + "loss": 1.6991, + "step": 29096 + }, + { + 
"epoch": 8.930939226519337, + "grad_norm": 0.13071557879447937, + "learning_rate": 2.9690749404259587e-06, + "loss": 1.6787, + "step": 29097 + }, + { + "epoch": 8.931246163290362, + "grad_norm": 0.13370119035243988, + "learning_rate": 2.9673878391645927e-06, + "loss": 1.6966, + "step": 29098 + }, + { + "epoch": 8.931553100061388, + "grad_norm": 0.21600082516670227, + "learning_rate": 2.9657012027070774e-06, + "loss": 1.7137, + "step": 29099 + }, + { + "epoch": 8.931860036832413, + "grad_norm": 0.17746025323867798, + "learning_rate": 2.964015031070061e-06, + "loss": 1.7406, + "step": 29100 + }, + { + "epoch": 8.932166973603438, + "grad_norm": 0.1861608922481537, + "learning_rate": 2.96232932427023e-06, + "loss": 1.7615, + "step": 29101 + }, + { + "epoch": 8.932473910374462, + "grad_norm": 0.128297820687294, + "learning_rate": 2.9606440823242155e-06, + "loss": 1.6525, + "step": 29102 + }, + { + "epoch": 8.932780847145487, + "grad_norm": 0.1617307960987091, + "learning_rate": 2.958959305248693e-06, + "loss": 1.6735, + "step": 29103 + }, + { + "epoch": 8.933087783916513, + "grad_norm": 0.1898767054080963, + "learning_rate": 2.9572749930603107e-06, + "loss": 1.7426, + "step": 29104 + }, + { + "epoch": 8.933394720687538, + "grad_norm": 0.14279016852378845, + "learning_rate": 2.955591145775705e-06, + "loss": 1.6855, + "step": 29105 + }, + { + "epoch": 8.933701657458563, + "grad_norm": 0.15879136323928833, + "learning_rate": 2.953907763411523e-06, + "loss": 1.6833, + "step": 29106 + }, + { + "epoch": 8.934008594229589, + "grad_norm": 0.14285622537136078, + "learning_rate": 2.9522248459843972e-06, + "loss": 1.6821, + "step": 29107 + }, + { + "epoch": 8.934315531000614, + "grad_norm": 0.1237918958067894, + "learning_rate": 2.950542393510963e-06, + "loss": 1.6676, + "step": 29108 + }, + { + "epoch": 8.93462246777164, + "grad_norm": 0.16011624038219452, + "learning_rate": 2.9488604060078473e-06, + "loss": 1.6881, + "step": 29109 + }, + { + "epoch": 8.934929404542665, + 
"grad_norm": 0.19365482032299042, + "learning_rate": 2.9471788834916692e-06, + "loss": 1.6895, + "step": 29110 + }, + { + "epoch": 8.93523634131369, + "grad_norm": 0.1855025440454483, + "learning_rate": 2.9454978259790435e-06, + "loss": 1.7745, + "step": 29111 + }, + { + "epoch": 8.935543278084715, + "grad_norm": 0.1319892704486847, + "learning_rate": 2.9438172334865898e-06, + "loss": 1.6836, + "step": 29112 + }, + { + "epoch": 8.93585021485574, + "grad_norm": 0.19831378757953644, + "learning_rate": 2.942137106030918e-06, + "loss": 1.7398, + "step": 29113 + }, + { + "epoch": 8.936157151626764, + "grad_norm": 0.16073055565357208, + "learning_rate": 2.9404574436286246e-06, + "loss": 1.6617, + "step": 29114 + }, + { + "epoch": 8.93646408839779, + "grad_norm": 0.19067524373531342, + "learning_rate": 2.938778246296309e-06, + "loss": 1.7244, + "step": 29115 + }, + { + "epoch": 8.936771025168815, + "grad_norm": 0.13316050171852112, + "learning_rate": 2.9370995140505843e-06, + "loss": 1.6371, + "step": 29116 + }, + { + "epoch": 8.93707796193984, + "grad_norm": 0.19948840141296387, + "learning_rate": 2.9354212469080156e-06, + "loss": 1.7279, + "step": 29117 + }, + { + "epoch": 8.937384898710865, + "grad_norm": 0.15221990644931793, + "learning_rate": 2.933743444885206e-06, + "loss": 1.7516, + "step": 29118 + }, + { + "epoch": 8.93769183548189, + "grad_norm": 0.15257437527179718, + "learning_rate": 2.932066107998721e-06, + "loss": 1.7471, + "step": 29119 + }, + { + "epoch": 8.937998772252916, + "grad_norm": 0.1491934210062027, + "learning_rate": 2.930389236265152e-06, + "loss": 1.6896, + "step": 29120 + }, + { + "epoch": 8.938305709023942, + "grad_norm": 0.12303795665502548, + "learning_rate": 2.928712829701069e-06, + "loss": 1.6793, + "step": 29121 + }, + { + "epoch": 8.938612645794967, + "grad_norm": 0.09865713864564896, + "learning_rate": 2.9270368883230313e-06, + "loss": 1.6063, + "step": 29122 + }, + { + "epoch": 8.938919582565992, + "grad_norm": 0.1656254678964615, + 
"learning_rate": 2.9253614121476037e-06, + "loss": 1.7507, + "step": 29123 + }, + { + "epoch": 8.939226519337016, + "grad_norm": 0.11997068673372269, + "learning_rate": 2.9236864011913445e-06, + "loss": 1.6393, + "step": 29124 + }, + { + "epoch": 8.939533456108041, + "grad_norm": 0.16391901671886444, + "learning_rate": 2.922011855470813e-06, + "loss": 1.6926, + "step": 29125 + }, + { + "epoch": 8.939840392879066, + "grad_norm": 0.1461794674396515, + "learning_rate": 2.920337775002552e-06, + "loss": 1.7243, + "step": 29126 + }, + { + "epoch": 8.940147329650092, + "grad_norm": 0.12928323447704315, + "learning_rate": 2.918664159803108e-06, + "loss": 1.6457, + "step": 29127 + }, + { + "epoch": 8.940454266421117, + "grad_norm": 0.16596664488315582, + "learning_rate": 2.9169910098890196e-06, + "loss": 1.6878, + "step": 29128 + }, + { + "epoch": 8.940761203192142, + "grad_norm": 0.1567634493112564, + "learning_rate": 2.9153183252768224e-06, + "loss": 1.6947, + "step": 29129 + }, + { + "epoch": 8.941068139963168, + "grad_norm": 0.1472834199666977, + "learning_rate": 2.9136461059830476e-06, + "loss": 1.6707, + "step": 29130 + }, + { + "epoch": 8.941375076734193, + "grad_norm": 0.1658584028482437, + "learning_rate": 2.9119743520242217e-06, + "loss": 1.7321, + "step": 29131 + }, + { + "epoch": 8.941682013505218, + "grad_norm": 0.20524124801158905, + "learning_rate": 2.9103030634168525e-06, + "loss": 1.7065, + "step": 29132 + }, + { + "epoch": 8.941988950276244, + "grad_norm": 0.16881074011325836, + "learning_rate": 2.908632240177489e-06, + "loss": 1.7052, + "step": 29133 + }, + { + "epoch": 8.942295887047269, + "grad_norm": 0.15819382667541504, + "learning_rate": 2.906961882322601e-06, + "loss": 1.7388, + "step": 29134 + }, + { + "epoch": 8.942602823818293, + "grad_norm": 0.13994456827640533, + "learning_rate": 2.905291989868736e-06, + "loss": 1.6932, + "step": 29135 + }, + { + "epoch": 8.942909760589318, + "grad_norm": 0.18177597224712372, + "learning_rate": 
2.9036225628323644e-06, + "loss": 1.707, + "step": 29136 + }, + { + "epoch": 8.943216697360343, + "grad_norm": 0.14273816347122192, + "learning_rate": 2.9019536012300063e-06, + "loss": 1.6902, + "step": 29137 + }, + { + "epoch": 8.943523634131369, + "grad_norm": 0.2221340835094452, + "learning_rate": 2.9002851050781486e-06, + "loss": 1.7369, + "step": 29138 + }, + { + "epoch": 8.943830570902394, + "grad_norm": 0.14513340592384338, + "learning_rate": 2.8986170743932782e-06, + "loss": 1.7307, + "step": 29139 + }, + { + "epoch": 8.94413750767342, + "grad_norm": 0.16813357174396515, + "learning_rate": 2.8969495091918763e-06, + "loss": 1.769, + "step": 29140 + }, + { + "epoch": 8.944444444444445, + "grad_norm": 0.15906141698360443, + "learning_rate": 2.895282409490435e-06, + "loss": 1.6929, + "step": 29141 + }, + { + "epoch": 8.94475138121547, + "grad_norm": 0.16236159205436707, + "learning_rate": 2.893615775305419e-06, + "loss": 1.7309, + "step": 29142 + }, + { + "epoch": 8.945058317986495, + "grad_norm": 0.12328501045703888, + "learning_rate": 2.891949606653299e-06, + "loss": 1.7063, + "step": 29143 + }, + { + "epoch": 8.94536525475752, + "grad_norm": 0.15831345319747925, + "learning_rate": 2.89028390355055e-06, + "loss": 1.6602, + "step": 29144 + }, + { + "epoch": 8.945672191528544, + "grad_norm": 0.12445748597383499, + "learning_rate": 2.8886186660136206e-06, + "loss": 1.6565, + "step": 29145 + }, + { + "epoch": 8.94597912829957, + "grad_norm": 0.12890103459358215, + "learning_rate": 2.88695389405898e-06, + "loss": 1.7209, + "step": 29146 + }, + { + "epoch": 8.946286065070595, + "grad_norm": 0.14477044343948364, + "learning_rate": 2.885289587703072e-06, + "loss": 1.6782, + "step": 29147 + }, + { + "epoch": 8.94659300184162, + "grad_norm": 0.12625789642333984, + "learning_rate": 2.8836257469623482e-06, + "loss": 1.6538, + "step": 29148 + }, + { + "epoch": 8.946899938612646, + "grad_norm": 0.16041505336761475, + "learning_rate": 2.8819623718532418e-06, + "loss": 
1.7327, + "step": 29149 + }, + { + "epoch": 8.94720687538367, + "grad_norm": 0.16730013489723206, + "learning_rate": 2.880299462392216e-06, + "loss": 1.7036, + "step": 29150 + }, + { + "epoch": 8.947513812154696, + "grad_norm": 0.1525142341852188, + "learning_rate": 2.87863701859567e-06, + "loss": 1.7013, + "step": 29151 + }, + { + "epoch": 8.947820748925722, + "grad_norm": 0.10877451300621033, + "learning_rate": 2.876975040480073e-06, + "loss": 1.6294, + "step": 29152 + }, + { + "epoch": 8.948127685696747, + "grad_norm": 0.11804116517305374, + "learning_rate": 2.875313528061807e-06, + "loss": 1.6885, + "step": 29153 + }, + { + "epoch": 8.948434622467772, + "grad_norm": 0.1718084067106247, + "learning_rate": 2.873652481357325e-06, + "loss": 1.682, + "step": 29154 + }, + { + "epoch": 8.948741559238798, + "grad_norm": 0.1881963163614273, + "learning_rate": 2.871991900383031e-06, + "loss": 1.7851, + "step": 29155 + }, + { + "epoch": 8.949048496009823, + "grad_norm": 0.14475038647651672, + "learning_rate": 2.8703317851553334e-06, + "loss": 1.6933, + "step": 29156 + }, + { + "epoch": 8.949355432780846, + "grad_norm": 0.15759755671024323, + "learning_rate": 2.8686721356906423e-06, + "loss": 1.7322, + "step": 29157 + }, + { + "epoch": 8.949662369551872, + "grad_norm": 0.13722626864910126, + "learning_rate": 2.8670129520053547e-06, + "loss": 1.7027, + "step": 29158 + }, + { + "epoch": 8.949969306322897, + "grad_norm": 0.14574597775936127, + "learning_rate": 2.8653542341158744e-06, + "loss": 1.6934, + "step": 29159 + }, + { + "epoch": 8.950276243093922, + "grad_norm": 0.1554742455482483, + "learning_rate": 2.863695982038589e-06, + "loss": 1.7272, + "step": 29160 + }, + { + "epoch": 8.950583179864948, + "grad_norm": 0.17200839519500732, + "learning_rate": 2.8620381957898845e-06, + "loss": 1.7501, + "step": 29161 + }, + { + "epoch": 8.950890116635973, + "grad_norm": 0.18733108043670654, + "learning_rate": 2.860380875386154e-06, + "loss": 1.8017, + "step": 29162 + }, + { + 
"epoch": 8.951197053406998, + "grad_norm": 0.13730700314044952, + "learning_rate": 2.8587240208437614e-06, + "loss": 1.6831, + "step": 29163 + }, + { + "epoch": 8.951503990178024, + "grad_norm": 0.1442563533782959, + "learning_rate": 2.8570676321790946e-06, + "loss": 1.7231, + "step": 29164 + }, + { + "epoch": 8.95181092694905, + "grad_norm": 0.14817926287651062, + "learning_rate": 2.855411709408512e-06, + "loss": 1.7043, + "step": 29165 + }, + { + "epoch": 8.952117863720074, + "grad_norm": 0.14757658541202545, + "learning_rate": 2.8537562525483787e-06, + "loss": 1.6519, + "step": 29166 + }, + { + "epoch": 8.952424800491098, + "grad_norm": 0.17929381132125854, + "learning_rate": 2.85210126161507e-06, + "loss": 1.7523, + "step": 29167 + }, + { + "epoch": 8.952731737262123, + "grad_norm": 0.13454876840114594, + "learning_rate": 2.850446736624923e-06, + "loss": 1.6921, + "step": 29168 + }, + { + "epoch": 8.953038674033149, + "grad_norm": 0.17734326422214508, + "learning_rate": 2.8487926775943085e-06, + "loss": 1.7082, + "step": 29169 + }, + { + "epoch": 8.953345610804174, + "grad_norm": 0.15544986724853516, + "learning_rate": 2.8471390845395406e-06, + "loss": 1.7067, + "step": 29170 + }, + { + "epoch": 8.9536525475752, + "grad_norm": 0.1256217509508133, + "learning_rate": 2.8454859574769955e-06, + "loss": 1.6546, + "step": 29171 + }, + { + "epoch": 8.953959484346225, + "grad_norm": 0.17201638221740723, + "learning_rate": 2.843833296422993e-06, + "loss": 1.7554, + "step": 29172 + }, + { + "epoch": 8.95426642111725, + "grad_norm": 0.1437663435935974, + "learning_rate": 2.8421811013938703e-06, + "loss": 1.6985, + "step": 29173 + }, + { + "epoch": 8.954573357888275, + "grad_norm": 0.11889111250638962, + "learning_rate": 2.8405293724059532e-06, + "loss": 1.7046, + "step": 29174 + }, + { + "epoch": 8.9548802946593, + "grad_norm": 0.21805889904499054, + "learning_rate": 2.838878109475568e-06, + "loss": 1.7835, + "step": 29175 + }, + { + "epoch": 8.955187231430326, + 
"grad_norm": 0.17459547519683838, + "learning_rate": 2.8372273126190342e-06, + "loss": 1.6986, + "step": 29176 + }, + { + "epoch": 8.955494168201351, + "grad_norm": 0.16686071455478668, + "learning_rate": 2.835576981852656e-06, + "loss": 1.6858, + "step": 29177 + }, + { + "epoch": 8.955801104972375, + "grad_norm": 0.19014745950698853, + "learning_rate": 2.833927117192753e-06, + "loss": 1.742, + "step": 29178 + }, + { + "epoch": 8.9561080417434, + "grad_norm": 0.10640473663806915, + "learning_rate": 2.832277718655629e-06, + "loss": 1.6363, + "step": 29179 + }, + { + "epoch": 8.956414978514426, + "grad_norm": 0.12378805875778198, + "learning_rate": 2.8306287862575777e-06, + "loss": 1.6359, + "step": 29180 + }, + { + "epoch": 8.956721915285451, + "grad_norm": 0.1519845575094223, + "learning_rate": 2.828980320014901e-06, + "loss": 1.7112, + "step": 29181 + }, + { + "epoch": 8.957028852056476, + "grad_norm": 0.1550975888967514, + "learning_rate": 2.827332319943893e-06, + "loss": 1.7417, + "step": 29182 + }, + { + "epoch": 8.957335788827502, + "grad_norm": 0.1387033611536026, + "learning_rate": 2.8256847860608224e-06, + "loss": 1.6567, + "step": 29183 + }, + { + "epoch": 8.957642725598527, + "grad_norm": 0.14006295800209045, + "learning_rate": 2.8240377183820053e-06, + "loss": 1.7156, + "step": 29184 + }, + { + "epoch": 8.957949662369552, + "grad_norm": 0.13202004134655, + "learning_rate": 2.8223911169236782e-06, + "loss": 1.6567, + "step": 29185 + }, + { + "epoch": 8.958256599140578, + "grad_norm": 0.12789477407932281, + "learning_rate": 2.8207449817021505e-06, + "loss": 1.7102, + "step": 29186 + }, + { + "epoch": 8.958563535911603, + "grad_norm": 0.1773017793893814, + "learning_rate": 2.8190993127336583e-06, + "loss": 1.7004, + "step": 29187 + }, + { + "epoch": 8.958870472682626, + "grad_norm": 0.17584890127182007, + "learning_rate": 2.81745411003449e-06, + "loss": 1.7513, + "step": 29188 + }, + { + "epoch": 8.959177409453652, + "grad_norm": 0.1679183840751648, + 
"learning_rate": 2.8158093736208923e-06, + "loss": 1.7319, + "step": 29189 + }, + { + "epoch": 8.959484346224677, + "grad_norm": 0.14683100581169128, + "learning_rate": 2.8141651035091255e-06, + "loss": 1.6594, + "step": 29190 + }, + { + "epoch": 8.959791282995702, + "grad_norm": 0.17727963626384735, + "learning_rate": 2.8125212997154316e-06, + "loss": 1.7577, + "step": 29191 + }, + { + "epoch": 8.960098219766728, + "grad_norm": 0.12865738570690155, + "learning_rate": 2.810877962256059e-06, + "loss": 1.656, + "step": 29192 + }, + { + "epoch": 8.960405156537753, + "grad_norm": 0.15322017669677734, + "learning_rate": 2.80923509114725e-06, + "loss": 1.6994, + "step": 29193 + }, + { + "epoch": 8.960712093308778, + "grad_norm": 0.11874222010374069, + "learning_rate": 2.8075926864052417e-06, + "loss": 1.6514, + "step": 29194 + }, + { + "epoch": 8.961019030079804, + "grad_norm": 0.13674114644527435, + "learning_rate": 2.80595074804626e-06, + "loss": 1.6781, + "step": 29195 + }, + { + "epoch": 8.96132596685083, + "grad_norm": 0.13738766312599182, + "learning_rate": 2.8043092760865364e-06, + "loss": 1.7214, + "step": 29196 + }, + { + "epoch": 8.961632903621854, + "grad_norm": 0.15917620062828064, + "learning_rate": 2.8026682705422914e-06, + "loss": 1.7561, + "step": 29197 + }, + { + "epoch": 8.96193984039288, + "grad_norm": 0.18082000315189362, + "learning_rate": 2.8010277314297395e-06, + "loss": 1.7021, + "step": 29198 + }, + { + "epoch": 8.962246777163905, + "grad_norm": 0.1440226435661316, + "learning_rate": 2.799387658765096e-06, + "loss": 1.6829, + "step": 29199 + }, + { + "epoch": 8.962553713934929, + "grad_norm": 0.18358100950717926, + "learning_rate": 2.7977480525645692e-06, + "loss": 1.7207, + "step": 29200 + }, + { + "epoch": 8.962860650705954, + "grad_norm": 0.12614849209785461, + "learning_rate": 2.796108912844364e-06, + "loss": 1.705, + "step": 29201 + }, + { + "epoch": 8.96316758747698, + "grad_norm": 0.11331766098737717, + "learning_rate": 
2.7944702396206666e-06, + "loss": 1.6343, + "step": 29202 + }, + { + "epoch": 8.963474524248005, + "grad_norm": 0.17110171914100647, + "learning_rate": 2.792832032909698e-06, + "loss": 1.8129, + "step": 29203 + }, + { + "epoch": 8.96378146101903, + "grad_norm": 0.19446058571338654, + "learning_rate": 2.791194292727617e-06, + "loss": 1.7015, + "step": 29204 + }, + { + "epoch": 8.964088397790055, + "grad_norm": 0.17975226044654846, + "learning_rate": 2.789557019090644e-06, + "loss": 1.7408, + "step": 29205 + }, + { + "epoch": 8.96439533456108, + "grad_norm": 0.15492287278175354, + "learning_rate": 2.787920212014922e-06, + "loss": 1.7307, + "step": 29206 + }, + { + "epoch": 8.964702271332106, + "grad_norm": 0.14430275559425354, + "learning_rate": 2.7862838715166485e-06, + "loss": 1.7112, + "step": 29207 + }, + { + "epoch": 8.965009208103131, + "grad_norm": 0.13850049674510956, + "learning_rate": 2.7846479976119944e-06, + "loss": 1.7177, + "step": 29208 + }, + { + "epoch": 8.965316144874157, + "grad_norm": 0.17376014590263367, + "learning_rate": 2.783012590317119e-06, + "loss": 1.7612, + "step": 29209 + }, + { + "epoch": 8.96562308164518, + "grad_norm": 0.13757693767547607, + "learning_rate": 2.7813776496481868e-06, + "loss": 1.7246, + "step": 29210 + }, + { + "epoch": 8.965930018416206, + "grad_norm": 0.17782050371170044, + "learning_rate": 2.7797431756213633e-06, + "loss": 1.7196, + "step": 29211 + }, + { + "epoch": 8.966236955187231, + "grad_norm": 0.14082394540309906, + "learning_rate": 2.7781091682527906e-06, + "loss": 1.7074, + "step": 29212 + }, + { + "epoch": 8.966543891958256, + "grad_norm": 0.2748696506023407, + "learning_rate": 2.7764756275586168e-06, + "loss": 1.819, + "step": 29213 + }, + { + "epoch": 8.966850828729282, + "grad_norm": 0.134973406791687, + "learning_rate": 2.774842553554996e-06, + "loss": 1.6725, + "step": 29214 + }, + { + "epoch": 8.967157765500307, + "grad_norm": 0.15217997133731842, + "learning_rate": 2.7732099462580594e-06, + "loss": 
1.6953, + "step": 29215 + }, + { + "epoch": 8.967464702271332, + "grad_norm": 0.15674369037151337, + "learning_rate": 2.771577805683939e-06, + "loss": 1.7108, + "step": 29216 + }, + { + "epoch": 8.967771639042358, + "grad_norm": 0.13885504007339478, + "learning_rate": 2.769946131848772e-06, + "loss": 1.7106, + "step": 29217 + }, + { + "epoch": 8.968078575813383, + "grad_norm": 0.13795867562294006, + "learning_rate": 2.768314924768678e-06, + "loss": 1.6831, + "step": 29218 + }, + { + "epoch": 8.968385512584408, + "grad_norm": 0.15533487498760223, + "learning_rate": 2.7666841844597724e-06, + "loss": 1.7278, + "step": 29219 + }, + { + "epoch": 8.968692449355434, + "grad_norm": 0.13686540722846985, + "learning_rate": 2.7650539109381867e-06, + "loss": 1.6854, + "step": 29220 + }, + { + "epoch": 8.968999386126457, + "grad_norm": 0.1479746252298355, + "learning_rate": 2.763424104220019e-06, + "loss": 1.7119, + "step": 29221 + }, + { + "epoch": 8.969306322897483, + "grad_norm": 0.12035561352968216, + "learning_rate": 2.7617947643213906e-06, + "loss": 1.6295, + "step": 29222 + }, + { + "epoch": 8.969613259668508, + "grad_norm": 0.12784910202026367, + "learning_rate": 2.7601658912583763e-06, + "loss": 1.6952, + "step": 29223 + }, + { + "epoch": 8.969920196439533, + "grad_norm": 0.14596527814865112, + "learning_rate": 2.7585374850471025e-06, + "loss": 1.7003, + "step": 29224 + }, + { + "epoch": 8.970227133210559, + "grad_norm": 0.17561540007591248, + "learning_rate": 2.7569095457036455e-06, + "loss": 1.7687, + "step": 29225 + }, + { + "epoch": 8.970534069981584, + "grad_norm": 0.17456963658332825, + "learning_rate": 2.7552820732441032e-06, + "loss": 1.6927, + "step": 29226 + }, + { + "epoch": 8.97084100675261, + "grad_norm": 0.15346206724643707, + "learning_rate": 2.7536550676845574e-06, + "loss": 1.7057, + "step": 29227 + }, + { + "epoch": 8.971147943523635, + "grad_norm": 0.113531194627285, + "learning_rate": 2.752028529041073e-06, + "loss": 1.6844, + "step": 29228 + }, + { 
+ "epoch": 8.97145488029466, + "grad_norm": 0.18523596227169037, + "learning_rate": 2.7504024573297426e-06, + "loss": 1.7468, + "step": 29229 + }, + { + "epoch": 8.971761817065685, + "grad_norm": 0.14123110473155975, + "learning_rate": 2.7487768525666313e-06, + "loss": 1.699, + "step": 29230 + }, + { + "epoch": 8.972068753836709, + "grad_norm": 0.17675861716270447, + "learning_rate": 2.747151714767798e-06, + "loss": 1.745, + "step": 29231 + }, + { + "epoch": 8.972375690607734, + "grad_norm": 0.1529264897108078, + "learning_rate": 2.7455270439493085e-06, + "loss": 1.686, + "step": 29232 + }, + { + "epoch": 8.97268262737876, + "grad_norm": 0.14173699915409088, + "learning_rate": 2.743902840127216e-06, + "loss": 1.6717, + "step": 29233 + }, + { + "epoch": 8.972989564149785, + "grad_norm": 0.15535210072994232, + "learning_rate": 2.7422791033175743e-06, + "loss": 1.7433, + "step": 29234 + }, + { + "epoch": 8.97329650092081, + "grad_norm": 0.12831814587116241, + "learning_rate": 2.740655833536432e-06, + "loss": 1.7548, + "step": 29235 + }, + { + "epoch": 8.973603437691835, + "grad_norm": 0.19681085646152496, + "learning_rate": 2.739033030799815e-06, + "loss": 1.7841, + "step": 29236 + }, + { + "epoch": 8.97391037446286, + "grad_norm": 0.1496504247188568, + "learning_rate": 2.737410695123793e-06, + "loss": 1.6646, + "step": 29237 + }, + { + "epoch": 8.974217311233886, + "grad_norm": 0.15000486373901367, + "learning_rate": 2.735788826524366e-06, + "loss": 1.6938, + "step": 29238 + }, + { + "epoch": 8.974524248004911, + "grad_norm": 0.11816641688346863, + "learning_rate": 2.734167425017592e-06, + "loss": 1.6738, + "step": 29239 + }, + { + "epoch": 8.974831184775937, + "grad_norm": 0.12041781097650528, + "learning_rate": 2.7325464906194585e-06, + "loss": 1.6798, + "step": 29240 + }, + { + "epoch": 8.975138121546962, + "grad_norm": 0.1780797690153122, + "learning_rate": 2.7309260233460143e-06, + "loss": 1.7608, + "step": 29241 + }, + { + "epoch": 8.975445058317986, + 
"grad_norm": 0.19122804701328278, + "learning_rate": 2.7293060232132683e-06, + "loss": 1.7706, + "step": 29242 + }, + { + "epoch": 8.975751995089011, + "grad_norm": 0.16770713031291962, + "learning_rate": 2.7276864902372244e-06, + "loss": 1.736, + "step": 29243 + }, + { + "epoch": 8.976058931860036, + "grad_norm": 0.17613980174064636, + "learning_rate": 2.7260674244338922e-06, + "loss": 1.7674, + "step": 29244 + }, + { + "epoch": 8.976365868631062, + "grad_norm": 0.17744678258895874, + "learning_rate": 2.7244488258192648e-06, + "loss": 1.7564, + "step": 29245 + }, + { + "epoch": 8.976672805402087, + "grad_norm": 0.15087327361106873, + "learning_rate": 2.7228306944093394e-06, + "loss": 1.7245, + "step": 29246 + }, + { + "epoch": 8.976979742173112, + "grad_norm": 0.16417519748210907, + "learning_rate": 2.721213030220121e-06, + "loss": 1.7329, + "step": 29247 + }, + { + "epoch": 8.977286678944138, + "grad_norm": 0.15511249005794525, + "learning_rate": 2.7195958332675796e-06, + "loss": 1.6803, + "step": 29248 + }, + { + "epoch": 8.977593615715163, + "grad_norm": 0.18222862482070923, + "learning_rate": 2.7179791035677083e-06, + "loss": 1.7186, + "step": 29249 + }, + { + "epoch": 8.977900552486188, + "grad_norm": 0.16677385568618774, + "learning_rate": 2.716362841136477e-06, + "loss": 1.688, + "step": 29250 + }, + { + "epoch": 8.978207489257214, + "grad_norm": 0.1820213794708252, + "learning_rate": 2.714747045989863e-06, + "loss": 1.7801, + "step": 29251 + }, + { + "epoch": 8.978514426028239, + "grad_norm": 0.1464485377073288, + "learning_rate": 2.7131317181438355e-06, + "loss": 1.6667, + "step": 29252 + }, + { + "epoch": 8.978821362799263, + "grad_norm": 0.13353987038135529, + "learning_rate": 2.711516857614349e-06, + "loss": 1.6492, + "step": 29253 + }, + { + "epoch": 8.979128299570288, + "grad_norm": 0.14857034385204315, + "learning_rate": 2.70990246441738e-06, + "loss": 1.6902, + "step": 29254 + }, + { + "epoch": 8.979435236341313, + "grad_norm": 0.1581316888332367, 
+ "learning_rate": 2.708288538568865e-06, + "loss": 1.7188, + "step": 29255 + }, + { + "epoch": 8.979742173112339, + "grad_norm": 0.1437988132238388, + "learning_rate": 2.7066750800847695e-06, + "loss": 1.6982, + "step": 29256 + }, + { + "epoch": 8.980049109883364, + "grad_norm": 0.15172283351421356, + "learning_rate": 2.705062088981014e-06, + "loss": 1.6898, + "step": 29257 + }, + { + "epoch": 8.98035604665439, + "grad_norm": 0.2507859170436859, + "learning_rate": 2.703449565273569e-06, + "loss": 1.7433, + "step": 29258 + }, + { + "epoch": 8.980662983425415, + "grad_norm": 0.19917117059230804, + "learning_rate": 2.701837508978361e-06, + "loss": 1.7411, + "step": 29259 + }, + { + "epoch": 8.98096992019644, + "grad_norm": 0.17466393113136292, + "learning_rate": 2.7002259201113044e-06, + "loss": 1.712, + "step": 29260 + }, + { + "epoch": 8.981276856967465, + "grad_norm": 0.1595284342765808, + "learning_rate": 2.698614798688348e-06, + "loss": 1.768, + "step": 29261 + }, + { + "epoch": 8.98158379373849, + "grad_norm": 0.1435062289237976, + "learning_rate": 2.6970041447253956e-06, + "loss": 1.6715, + "step": 29262 + }, + { + "epoch": 8.981890730509516, + "grad_norm": 0.16341650485992432, + "learning_rate": 2.695393958238379e-06, + "loss": 1.7563, + "step": 29263 + }, + { + "epoch": 8.98219766728054, + "grad_norm": 0.1981598138809204, + "learning_rate": 2.6937842392432023e-06, + "loss": 1.744, + "step": 29264 + }, + { + "epoch": 8.982504604051565, + "grad_norm": 0.1611155867576599, + "learning_rate": 2.6921749877557802e-06, + "loss": 1.6874, + "step": 29265 + }, + { + "epoch": 8.98281154082259, + "grad_norm": 0.17430151998996735, + "learning_rate": 2.690566203792011e-06, + "loss": 1.7338, + "step": 29266 + }, + { + "epoch": 8.983118477593615, + "grad_norm": 0.13210003077983856, + "learning_rate": 2.688957887367799e-06, + "loss": 1.7221, + "step": 29267 + }, + { + "epoch": 8.98342541436464, + "grad_norm": 0.167892724275589, + "learning_rate": 2.6873500384990313e-06, + 
"loss": 1.6985, + "step": 29268 + }, + { + "epoch": 8.983732351135666, + "grad_norm": 0.1600649207830429, + "learning_rate": 2.685742657201601e-06, + "loss": 1.7309, + "step": 29269 + }, + { + "epoch": 8.984039287906691, + "grad_norm": 0.1755276322364807, + "learning_rate": 2.6841357434913892e-06, + "loss": 1.7173, + "step": 29270 + }, + { + "epoch": 8.984346224677717, + "grad_norm": 0.14754937589168549, + "learning_rate": 2.682529297384295e-06, + "loss": 1.6948, + "step": 29271 + }, + { + "epoch": 8.984653161448742, + "grad_norm": 0.1670856773853302, + "learning_rate": 2.6809233188961614e-06, + "loss": 1.7302, + "step": 29272 + }, + { + "epoch": 8.984960098219767, + "grad_norm": 0.18906234204769135, + "learning_rate": 2.6793178080428973e-06, + "loss": 1.7336, + "step": 29273 + }, + { + "epoch": 8.985267034990791, + "grad_norm": 0.17759168148040771, + "learning_rate": 2.6777127648403345e-06, + "loss": 1.762, + "step": 29274 + }, + { + "epoch": 8.985573971761816, + "grad_norm": 0.12218867987394333, + "learning_rate": 2.676108189304355e-06, + "loss": 1.6987, + "step": 29275 + }, + { + "epoch": 8.985880908532842, + "grad_norm": 0.1504579335451126, + "learning_rate": 2.674504081450824e-06, + "loss": 1.6683, + "step": 29276 + }, + { + "epoch": 8.986187845303867, + "grad_norm": 0.15826797485351562, + "learning_rate": 2.6729004412955616e-06, + "loss": 1.7131, + "step": 29277 + }, + { + "epoch": 8.986494782074892, + "grad_norm": 0.12599892914295197, + "learning_rate": 2.671297268854456e-06, + "loss": 1.6603, + "step": 29278 + }, + { + "epoch": 8.986801718845918, + "grad_norm": 0.17663413286209106, + "learning_rate": 2.6696945641433157e-06, + "loss": 1.7231, + "step": 29279 + }, + { + "epoch": 8.987108655616943, + "grad_norm": 0.16194280982017517, + "learning_rate": 2.668092327178001e-06, + "loss": 1.695, + "step": 29280 + }, + { + "epoch": 8.987415592387968, + "grad_norm": 0.1310044527053833, + "learning_rate": 2.6664905579743384e-06, + "loss": 1.6997, + "step": 29281 + }, 
+ { + "epoch": 8.987722529158994, + "grad_norm": 0.18553194403648376, + "learning_rate": 2.6648892565481587e-06, + "loss": 1.7594, + "step": 29282 + }, + { + "epoch": 8.988029465930019, + "grad_norm": 0.17653048038482666, + "learning_rate": 2.6632884229152887e-06, + "loss": 1.7687, + "step": 29283 + }, + { + "epoch": 8.988336402701044, + "grad_norm": 0.14085285365581512, + "learning_rate": 2.661688057091549e-06, + "loss": 1.6875, + "step": 29284 + }, + { + "epoch": 8.988643339472068, + "grad_norm": 0.14821402728557587, + "learning_rate": 2.6600881590927553e-06, + "loss": 1.7579, + "step": 29285 + }, + { + "epoch": 8.988950276243093, + "grad_norm": 0.16718199849128723, + "learning_rate": 2.658488728934716e-06, + "loss": 1.7093, + "step": 29286 + }, + { + "epoch": 8.989257213014119, + "grad_norm": 0.16012485325336456, + "learning_rate": 2.6568897666332303e-06, + "loss": 1.6937, + "step": 29287 + }, + { + "epoch": 8.989564149785144, + "grad_norm": 0.186227485537529, + "learning_rate": 2.655291272204119e-06, + "loss": 1.6682, + "step": 29288 + }, + { + "epoch": 8.98987108655617, + "grad_norm": 0.15328755974769592, + "learning_rate": 2.653693245663158e-06, + "loss": 1.7221, + "step": 29289 + }, + { + "epoch": 8.990178023327195, + "grad_norm": 0.11358486860990524, + "learning_rate": 2.6520956870261684e-06, + "loss": 1.6721, + "step": 29290 + }, + { + "epoch": 8.99048496009822, + "grad_norm": 0.16672687232494354, + "learning_rate": 2.6504985963089035e-06, + "loss": 1.7192, + "step": 29291 + }, + { + "epoch": 8.990791896869245, + "grad_norm": 0.13929708302021027, + "learning_rate": 2.6489019735271734e-06, + "loss": 1.69, + "step": 29292 + }, + { + "epoch": 8.99109883364027, + "grad_norm": 0.1592891961336136, + "learning_rate": 2.647305818696749e-06, + "loss": 1.6943, + "step": 29293 + }, + { + "epoch": 8.991405770411296, + "grad_norm": 0.1534394770860672, + "learning_rate": 2.6457101318333957e-06, + "loss": 1.6993, + "step": 29294 + }, + { + "epoch": 8.99171270718232, + 
"grad_norm": 0.17096973955631256, + "learning_rate": 2.6441149129529e-06, + "loss": 1.7627, + "step": 29295 + }, + { + "epoch": 8.992019643953345, + "grad_norm": 0.13695703446865082, + "learning_rate": 2.642520162071005e-06, + "loss": 1.7047, + "step": 29296 + }, + { + "epoch": 8.99232658072437, + "grad_norm": 0.13649116456508636, + "learning_rate": 2.6409258792034873e-06, + "loss": 1.6666, + "step": 29297 + }, + { + "epoch": 8.992633517495396, + "grad_norm": 0.13003148138523102, + "learning_rate": 2.639332064366096e-06, + "loss": 1.6862, + "step": 29298 + }, + { + "epoch": 8.99294045426642, + "grad_norm": 0.1290612667798996, + "learning_rate": 2.6377387175745894e-06, + "loss": 1.703, + "step": 29299 + }, + { + "epoch": 8.993247391037446, + "grad_norm": 0.14106552302837372, + "learning_rate": 2.636145838844706e-06, + "loss": 1.6771, + "step": 29300 + }, + { + "epoch": 8.993554327808472, + "grad_norm": 0.13510754704475403, + "learning_rate": 2.6345534281921937e-06, + "loss": 1.6569, + "step": 29301 + }, + { + "epoch": 8.993861264579497, + "grad_norm": 0.11940879374742508, + "learning_rate": 2.632961485632779e-06, + "loss": 1.6719, + "step": 29302 + }, + { + "epoch": 8.994168201350522, + "grad_norm": 0.22212430834770203, + "learning_rate": 2.6313700111822104e-06, + "loss": 1.7285, + "step": 29303 + }, + { + "epoch": 8.994475138121548, + "grad_norm": 0.144329234957695, + "learning_rate": 2.629779004856192e-06, + "loss": 1.6928, + "step": 29304 + }, + { + "epoch": 8.994782074892573, + "grad_norm": 0.14428433775901794, + "learning_rate": 2.6281884666704837e-06, + "loss": 1.7371, + "step": 29305 + }, + { + "epoch": 8.995089011663598, + "grad_norm": 0.12600816786289215, + "learning_rate": 2.6265983966407615e-06, + "loss": 1.6803, + "step": 29306 + }, + { + "epoch": 8.995395948434622, + "grad_norm": 0.14739328622817993, + "learning_rate": 2.6250087947827793e-06, + "loss": 1.7135, + "step": 29307 + }, + { + "epoch": 8.995702885205647, + "grad_norm": 0.14694075286388397, + 
"learning_rate": 2.623419661112209e-06, + "loss": 1.7161, + "step": 29308 + }, + { + "epoch": 8.996009821976672, + "grad_norm": 0.1703605204820633, + "learning_rate": 2.6218309956447864e-06, + "loss": 1.7415, + "step": 29309 + }, + { + "epoch": 8.996316758747698, + "grad_norm": 0.1334623247385025, + "learning_rate": 2.6202427983961996e-06, + "loss": 1.7227, + "step": 29310 + }, + { + "epoch": 8.996623695518723, + "grad_norm": 0.16613437235355377, + "learning_rate": 2.6186550693821364e-06, + "loss": 1.6925, + "step": 29311 + }, + { + "epoch": 8.996930632289748, + "grad_norm": 0.12817926704883575, + "learning_rate": 2.617067808618301e-06, + "loss": 1.6296, + "step": 29312 + }, + { + "epoch": 8.997237569060774, + "grad_norm": 0.13783088326454163, + "learning_rate": 2.6154810161203693e-06, + "loss": 1.6801, + "step": 29313 + }, + { + "epoch": 8.997544505831799, + "grad_norm": 0.19866502285003662, + "learning_rate": 2.6138946919040285e-06, + "loss": 1.7817, + "step": 29314 + }, + { + "epoch": 8.997851442602824, + "grad_norm": 0.12466265261173248, + "learning_rate": 2.61230883598495e-06, + "loss": 1.7001, + "step": 29315 + }, + { + "epoch": 8.99815837937385, + "grad_norm": 0.13250842690467834, + "learning_rate": 2.6107234483788158e-06, + "loss": 1.6932, + "step": 29316 + }, + { + "epoch": 8.998465316144873, + "grad_norm": 0.13475441932678223, + "learning_rate": 2.6091385291012904e-06, + "loss": 1.6906, + "step": 29317 + }, + { + "epoch": 8.998772252915899, + "grad_norm": 0.14250501990318298, + "learning_rate": 2.6075540781680284e-06, + "loss": 1.7032, + "step": 29318 + }, + { + "epoch": 8.999079189686924, + "grad_norm": 0.11724159866571426, + "learning_rate": 2.6059700955947007e-06, + "loss": 1.6319, + "step": 29319 + }, + { + "epoch": 8.99938612645795, + "grad_norm": 0.15192265808582306, + "learning_rate": 2.6043865813969505e-06, + "loss": 1.699, + "step": 29320 + }, + { + "epoch": 8.999693063228975, + "grad_norm": 0.14814937114715576, + "learning_rate": 
2.6028035355904257e-06, + "loss": 1.7313, + "step": 29321 + }, + { + "epoch": 9.0, + "grad_norm": 0.20881028473377228, + "learning_rate": 2.6012209581907922e-06, + "loss": 1.8009, + "step": 29322 + }, + { + "epoch": 9.000306936771025, + "grad_norm": 0.15227021276950836, + "learning_rate": 2.5996388492136593e-06, + "loss": 1.7501, + "step": 29323 + }, + { + "epoch": 9.00061387354205, + "grad_norm": 0.1541164219379425, + "learning_rate": 2.598057208674692e-06, + "loss": 1.727, + "step": 29324 + }, + { + "epoch": 9.000920810313076, + "grad_norm": 0.15358538925647736, + "learning_rate": 2.596476036589496e-06, + "loss": 1.7363, + "step": 29325 + }, + { + "epoch": 9.001227747084101, + "grad_norm": 0.13264121115207672, + "learning_rate": 2.5948953329737126e-06, + "loss": 1.6988, + "step": 29326 + }, + { + "epoch": 9.001534683855127, + "grad_norm": 0.13748973608016968, + "learning_rate": 2.593315097842963e-06, + "loss": 1.7003, + "step": 29327 + }, + { + "epoch": 9.00184162062615, + "grad_norm": 0.1346716433763504, + "learning_rate": 2.5917353312128467e-06, + "loss": 1.6819, + "step": 29328 + }, + { + "epoch": 9.002148557397176, + "grad_norm": 0.13923269510269165, + "learning_rate": 2.5901560330990006e-06, + "loss": 1.7161, + "step": 29329 + }, + { + "epoch": 9.002455494168201, + "grad_norm": 0.17402863502502441, + "learning_rate": 2.588577203517012e-06, + "loss": 1.7039, + "step": 29330 + }, + { + "epoch": 9.002762430939226, + "grad_norm": 0.14584888517856598, + "learning_rate": 2.5869988424824964e-06, + "loss": 1.7306, + "step": 29331 + }, + { + "epoch": 9.003069367710252, + "grad_norm": 0.12232481688261032, + "learning_rate": 2.5854209500110472e-06, + "loss": 1.6689, + "step": 29332 + }, + { + "epoch": 9.003376304481277, + "grad_norm": 0.15231020748615265, + "learning_rate": 2.583843526118257e-06, + "loss": 1.7308, + "step": 29333 + }, + { + "epoch": 9.003683241252302, + "grad_norm": 0.1362350732088089, + "learning_rate": 2.582266570819719e-06, + "loss": 1.7089, + 
"step": 29334 + }, + { + "epoch": 9.003990178023328, + "grad_norm": 0.16162967681884766, + "learning_rate": 2.5806900841310154e-06, + "loss": 1.7254, + "step": 29335 + }, + { + "epoch": 9.004297114794353, + "grad_norm": 0.19027012586593628, + "learning_rate": 2.579114066067723e-06, + "loss": 1.7523, + "step": 29336 + }, + { + "epoch": 9.004604051565378, + "grad_norm": 0.15073107182979584, + "learning_rate": 2.5775385166454224e-06, + "loss": 1.7219, + "step": 29337 + }, + { + "epoch": 9.004910988336404, + "grad_norm": 0.18943648040294647, + "learning_rate": 2.5759634358796746e-06, + "loss": 1.7052, + "step": 29338 + }, + { + "epoch": 9.005217925107427, + "grad_norm": 0.17359869182109833, + "learning_rate": 2.5743888237860615e-06, + "loss": 1.7475, + "step": 29339 + }, + { + "epoch": 9.005524861878452, + "grad_norm": 0.1170465275645256, + "learning_rate": 2.5728146803801256e-06, + "loss": 1.6514, + "step": 29340 + }, + { + "epoch": 9.005831798649478, + "grad_norm": 0.19763801991939545, + "learning_rate": 2.5712410056774494e-06, + "loss": 1.7476, + "step": 29341 + }, + { + "epoch": 9.006138735420503, + "grad_norm": 0.11056608706712723, + "learning_rate": 2.569667799693548e-06, + "loss": 1.6644, + "step": 29342 + }, + { + "epoch": 9.006445672191528, + "grad_norm": 0.11823355406522751, + "learning_rate": 2.5680950624440038e-06, + "loss": 1.6476, + "step": 29343 + }, + { + "epoch": 9.006752608962554, + "grad_norm": 0.12750595808029175, + "learning_rate": 2.5665227939443425e-06, + "loss": 1.6715, + "step": 29344 + }, + { + "epoch": 9.00705954573358, + "grad_norm": 0.14100933074951172, + "learning_rate": 2.5649509942100967e-06, + "loss": 1.6917, + "step": 29345 + }, + { + "epoch": 9.007366482504604, + "grad_norm": 0.15324008464813232, + "learning_rate": 2.5633796632568207e-06, + "loss": 1.7051, + "step": 29346 + }, + { + "epoch": 9.00767341927563, + "grad_norm": 0.112611785531044, + "learning_rate": 2.5618088011000183e-06, + "loss": 1.6693, + "step": 29347 + }, + { + 
"epoch": 9.007980356046655, + "grad_norm": 0.1416759490966797, + "learning_rate": 2.560238407755228e-06, + "loss": 1.725, + "step": 29348 + }, + { + "epoch": 9.008287292817679, + "grad_norm": 0.10026350617408752, + "learning_rate": 2.558668483237969e-06, + "loss": 1.6199, + "step": 29349 + }, + { + "epoch": 9.008594229588704, + "grad_norm": 0.17179331183433533, + "learning_rate": 2.5570990275637585e-06, + "loss": 1.708, + "step": 29350 + }, + { + "epoch": 9.00890116635973, + "grad_norm": 0.17252036929130554, + "learning_rate": 2.5555300407480996e-06, + "loss": 1.6981, + "step": 29351 + }, + { + "epoch": 9.009208103130755, + "grad_norm": 0.1174364760518074, + "learning_rate": 2.5539615228064973e-06, + "loss": 1.6498, + "step": 29352 + }, + { + "epoch": 9.00951503990178, + "grad_norm": 0.16481025516986847, + "learning_rate": 2.552393473754461e-06, + "loss": 1.6855, + "step": 29353 + }, + { + "epoch": 9.009821976672805, + "grad_norm": 0.15297551453113556, + "learning_rate": 2.5508258936074836e-06, + "loss": 1.7085, + "step": 29354 + }, + { + "epoch": 9.01012891344383, + "grad_norm": 0.182330921292305, + "learning_rate": 2.5492587823810476e-06, + "loss": 1.6955, + "step": 29355 + }, + { + "epoch": 9.010435850214856, + "grad_norm": 0.13587582111358643, + "learning_rate": 2.547692140090657e-06, + "loss": 1.6933, + "step": 29356 + }, + { + "epoch": 9.010742786985881, + "grad_norm": 0.16220442950725555, + "learning_rate": 2.5461259667517723e-06, + "loss": 1.7386, + "step": 29357 + }, + { + "epoch": 9.011049723756907, + "grad_norm": 0.14470438659191132, + "learning_rate": 2.5445602623799025e-06, + "loss": 1.6613, + "step": 29358 + }, + { + "epoch": 9.011356660527932, + "grad_norm": 0.17066054046154022, + "learning_rate": 2.5429950269904856e-06, + "loss": 1.6925, + "step": 29359 + }, + { + "epoch": 9.011663597298956, + "grad_norm": 0.178157240152359, + "learning_rate": 2.541430260599026e-06, + "loss": 1.7966, + "step": 29360 + }, + { + "epoch": 9.011970534069981, + 
"grad_norm": 0.19744886457920074, + "learning_rate": 2.5398659632209552e-06, + "loss": 1.7737, + "step": 29361 + }, + { + "epoch": 9.012277470841006, + "grad_norm": 0.1326957792043686, + "learning_rate": 2.538302134871745e-06, + "loss": 1.6749, + "step": 29362 + }, + { + "epoch": 9.012584407612032, + "grad_norm": 0.1415095329284668, + "learning_rate": 2.5367387755668602e-06, + "loss": 1.68, + "step": 29363 + }, + { + "epoch": 9.012891344383057, + "grad_norm": 0.13428185880184174, + "learning_rate": 2.535175885321733e-06, + "loss": 1.6674, + "step": 29364 + }, + { + "epoch": 9.013198281154082, + "grad_norm": 0.1266496479511261, + "learning_rate": 2.5336134641518183e-06, + "loss": 1.6538, + "step": 29365 + }, + { + "epoch": 9.013505217925108, + "grad_norm": 0.1252683401107788, + "learning_rate": 2.532051512072564e-06, + "loss": 1.6552, + "step": 29366 + }, + { + "epoch": 9.013812154696133, + "grad_norm": 0.13982512056827545, + "learning_rate": 2.5304900290993916e-06, + "loss": 1.6614, + "step": 29367 + }, + { + "epoch": 9.014119091467158, + "grad_norm": 0.15743471682071686, + "learning_rate": 2.528929015247744e-06, + "loss": 1.759, + "step": 29368 + }, + { + "epoch": 9.014426028238184, + "grad_norm": 0.12230109423398972, + "learning_rate": 2.5273684705330424e-06, + "loss": 1.6955, + "step": 29369 + }, + { + "epoch": 9.014732965009209, + "grad_norm": 0.13204556703567505, + "learning_rate": 2.525808394970708e-06, + "loss": 1.7259, + "step": 29370 + }, + { + "epoch": 9.015039901780233, + "grad_norm": 0.15656854212284088, + "learning_rate": 2.5242487885761614e-06, + "loss": 1.7027, + "step": 29371 + }, + { + "epoch": 9.015346838551258, + "grad_norm": 0.1528550535440445, + "learning_rate": 2.5226896513648178e-06, + "loss": 1.6665, + "step": 29372 + }, + { + "epoch": 9.015653775322283, + "grad_norm": 0.26738816499710083, + "learning_rate": 2.5211309833520825e-06, + "loss": 1.7587, + "step": 29373 + }, + { + "epoch": 9.015960712093309, + "grad_norm": 0.19041690230369568, + 
"learning_rate": 2.519572784553348e-06, + "loss": 1.7452, + "step": 29374 + }, + { + "epoch": 9.016267648864334, + "grad_norm": 0.1666717827320099, + "learning_rate": 2.518015054984041e-06, + "loss": 1.7117, + "step": 29375 + }, + { + "epoch": 9.01657458563536, + "grad_norm": 0.18895253539085388, + "learning_rate": 2.5164577946595214e-06, + "loss": 1.7671, + "step": 29376 + }, + { + "epoch": 9.016881522406385, + "grad_norm": 0.1346922218799591, + "learning_rate": 2.5149010035952158e-06, + "loss": 1.6986, + "step": 29377 + }, + { + "epoch": 9.01718845917741, + "grad_norm": 0.15223844349384308, + "learning_rate": 2.5133446818064786e-06, + "loss": 1.725, + "step": 29378 + }, + { + "epoch": 9.017495395948435, + "grad_norm": 0.19043175876140594, + "learning_rate": 2.511788829308703e-06, + "loss": 1.733, + "step": 29379 + }, + { + "epoch": 9.01780233271946, + "grad_norm": 0.17035910487174988, + "learning_rate": 2.510233446117272e-06, + "loss": 1.6885, + "step": 29380 + }, + { + "epoch": 9.018109269490484, + "grad_norm": 0.18320874869823456, + "learning_rate": 2.5086785322475325e-06, + "loss": 1.7783, + "step": 29381 + }, + { + "epoch": 9.01841620626151, + "grad_norm": 0.13961733877658844, + "learning_rate": 2.507124087714885e-06, + "loss": 1.6768, + "step": 29382 + }, + { + "epoch": 9.018723143032535, + "grad_norm": 0.12573479115962982, + "learning_rate": 2.505570112534661e-06, + "loss": 1.6902, + "step": 29383 + }, + { + "epoch": 9.01903007980356, + "grad_norm": 0.15192000567913055, + "learning_rate": 2.504016606722237e-06, + "loss": 1.7115, + "step": 29384 + }, + { + "epoch": 9.019337016574585, + "grad_norm": 0.16358907520771027, + "learning_rate": 2.5024635702929565e-06, + "loss": 1.7075, + "step": 29385 + }, + { + "epoch": 9.01964395334561, + "grad_norm": 0.13516998291015625, + "learning_rate": 2.500911003262174e-06, + "loss": 1.6584, + "step": 29386 + }, + { + "epoch": 9.019950890116636, + "grad_norm": 0.13729408383369446, + "learning_rate": 2.4993589056452215e-06, 
+ "loss": 1.7039, + "step": 29387 + }, + { + "epoch": 9.020257826887661, + "grad_norm": 0.1284191608428955, + "learning_rate": 2.4978072774574533e-06, + "loss": 1.6652, + "step": 29388 + }, + { + "epoch": 9.020564763658687, + "grad_norm": 0.1911778301000595, + "learning_rate": 2.4962561187141906e-06, + "loss": 1.747, + "step": 29389 + }, + { + "epoch": 9.020871700429712, + "grad_norm": 0.1233893632888794, + "learning_rate": 2.4947054294307714e-06, + "loss": 1.6631, + "step": 29390 + }, + { + "epoch": 9.021178637200737, + "grad_norm": 0.1692901849746704, + "learning_rate": 2.493155209622511e-06, + "loss": 1.7598, + "step": 29391 + }, + { + "epoch": 9.021485573971761, + "grad_norm": 0.1659780591726303, + "learning_rate": 2.4916054593047468e-06, + "loss": 1.7915, + "step": 29392 + }, + { + "epoch": 9.021792510742786, + "grad_norm": 0.1684655100107193, + "learning_rate": 2.4900561784927667e-06, + "loss": 1.7324, + "step": 29393 + }, + { + "epoch": 9.022099447513812, + "grad_norm": 0.21327704191207886, + "learning_rate": 2.488507367201914e-06, + "loss": 1.729, + "step": 29394 + }, + { + "epoch": 9.022406384284837, + "grad_norm": 0.16245315968990326, + "learning_rate": 2.486959025447472e-06, + "loss": 1.7569, + "step": 29395 + }, + { + "epoch": 9.022713321055862, + "grad_norm": 0.15231920778751373, + "learning_rate": 2.4854111532447435e-06, + "loss": 1.7024, + "step": 29396 + }, + { + "epoch": 9.023020257826888, + "grad_norm": 0.20816101133823395, + "learning_rate": 2.4838637506090447e-06, + "loss": 1.7198, + "step": 29397 + }, + { + "epoch": 9.023327194597913, + "grad_norm": 0.1711280196905136, + "learning_rate": 2.4823168175556357e-06, + "loss": 1.7073, + "step": 29398 + }, + { + "epoch": 9.023634131368938, + "grad_norm": 0.14723099768161774, + "learning_rate": 2.480770354099843e-06, + "loss": 1.7472, + "step": 29399 + }, + { + "epoch": 9.023941068139964, + "grad_norm": 0.23221471905708313, + "learning_rate": 2.47922436025691e-06, + "loss": 1.7639, + "step": 29400 + }, 
+ { + "epoch": 9.024248004910989, + "grad_norm": 0.13510727882385254, + "learning_rate": 2.4776788360421466e-06, + "loss": 1.7258, + "step": 29401 + }, + { + "epoch": 9.024554941682014, + "grad_norm": 0.2099999636411667, + "learning_rate": 2.476133781470813e-06, + "loss": 1.8019, + "step": 29402 + }, + { + "epoch": 9.024861878453038, + "grad_norm": 0.13297688961029053, + "learning_rate": 2.47458919655818e-06, + "loss": 1.7031, + "step": 29403 + }, + { + "epoch": 9.025168815224063, + "grad_norm": 0.14716757833957672, + "learning_rate": 2.4730450813195138e-06, + "loss": 1.677, + "step": 29404 + }, + { + "epoch": 9.025475751995089, + "grad_norm": 0.14763082563877106, + "learning_rate": 2.4715014357700683e-06, + "loss": 1.6863, + "step": 29405 + }, + { + "epoch": 9.025782688766114, + "grad_norm": 0.15744271874427795, + "learning_rate": 2.469958259925109e-06, + "loss": 1.7236, + "step": 29406 + }, + { + "epoch": 9.02608962553714, + "grad_norm": 0.19316953420639038, + "learning_rate": 2.4684155537998743e-06, + "loss": 1.7547, + "step": 29407 + }, + { + "epoch": 9.026396562308165, + "grad_norm": 0.14727036654949188, + "learning_rate": 2.4668733174096126e-06, + "loss": 1.7346, + "step": 29408 + }, + { + "epoch": 9.02670349907919, + "grad_norm": 0.14740467071533203, + "learning_rate": 2.465331550769584e-06, + "loss": 1.7109, + "step": 29409 + }, + { + "epoch": 9.027010435850215, + "grad_norm": 0.1295071691274643, + "learning_rate": 2.463790253894993e-06, + "loss": 1.711, + "step": 29410 + }, + { + "epoch": 9.02731737262124, + "grad_norm": 0.20718778669834137, + "learning_rate": 2.4622494268011054e-06, + "loss": 1.7146, + "step": 29411 + }, + { + "epoch": 9.027624309392266, + "grad_norm": 0.13038063049316406, + "learning_rate": 2.46070906950312e-06, + "loss": 1.6533, + "step": 29412 + }, + { + "epoch": 9.027931246163291, + "grad_norm": 0.18726535141468048, + "learning_rate": 2.459169182016269e-06, + "loss": 1.7144, + "step": 29413 + }, + { + "epoch": 9.028238182934315, + 
"grad_norm": 0.1343640834093094, + "learning_rate": 2.4576297643557843e-06, + "loss": 1.7014, + "step": 29414 + }, + { + "epoch": 9.02854511970534, + "grad_norm": 0.1509372591972351, + "learning_rate": 2.4560908165368544e-06, + "loss": 1.6999, + "step": 29415 + }, + { + "epoch": 9.028852056476365, + "grad_norm": 0.1541101038455963, + "learning_rate": 2.4545523385747172e-06, + "loss": 1.6962, + "step": 29416 + }, + { + "epoch": 9.02915899324739, + "grad_norm": 0.16334660351276398, + "learning_rate": 2.4530143304845432e-06, + "loss": 1.7293, + "step": 29417 + }, + { + "epoch": 9.029465930018416, + "grad_norm": 0.14802905917167664, + "learning_rate": 2.4514767922815595e-06, + "loss": 1.6747, + "step": 29418 + }, + { + "epoch": 9.029772866789441, + "grad_norm": 0.19622576236724854, + "learning_rate": 2.4499397239809487e-06, + "loss": 1.764, + "step": 29419 + }, + { + "epoch": 9.030079803560467, + "grad_norm": 0.14734432101249695, + "learning_rate": 2.4484031255979036e-06, + "loss": 1.7371, + "step": 29420 + }, + { + "epoch": 9.030386740331492, + "grad_norm": 0.16914428770542145, + "learning_rate": 2.4468669971476123e-06, + "loss": 1.7148, + "step": 29421 + }, + { + "epoch": 9.030693677102517, + "grad_norm": 0.13942086696624756, + "learning_rate": 2.4453313386452516e-06, + "loss": 1.704, + "step": 29422 + }, + { + "epoch": 9.031000613873543, + "grad_norm": 0.12403316050767899, + "learning_rate": 2.4437961501060036e-06, + "loss": 1.6567, + "step": 29423 + }, + { + "epoch": 9.031307550644566, + "grad_norm": 0.14684323966503143, + "learning_rate": 2.4422614315450287e-06, + "loss": 1.7452, + "step": 29424 + }, + { + "epoch": 9.031614487415592, + "grad_norm": 0.1687471866607666, + "learning_rate": 2.440727182977498e-06, + "loss": 1.7265, + "step": 29425 + }, + { + "epoch": 9.031921424186617, + "grad_norm": 0.14509400725364685, + "learning_rate": 2.439193404418588e-06, + "loss": 1.6855, + "step": 29426 + }, + { + "epoch": 9.032228360957642, + "grad_norm": 0.13958261907100677, 
+ "learning_rate": 2.4376600958834373e-06, + "loss": 1.7458, + "step": 29427 + }, + { + "epoch": 9.032535297728668, + "grad_norm": 0.18749283254146576, + "learning_rate": 2.436127257387211e-06, + "loss": 1.725, + "step": 29428 + }, + { + "epoch": 9.032842234499693, + "grad_norm": 0.1423102170228958, + "learning_rate": 2.434594888945052e-06, + "loss": 1.6655, + "step": 29429 + }, + { + "epoch": 9.033149171270718, + "grad_norm": 0.17062890529632568, + "learning_rate": 2.433062990572099e-06, + "loss": 1.7059, + "step": 29430 + }, + { + "epoch": 9.033456108041744, + "grad_norm": 0.15203866362571716, + "learning_rate": 2.4315315622835124e-06, + "loss": 1.709, + "step": 29431 + }, + { + "epoch": 9.033763044812769, + "grad_norm": 0.21039290726184845, + "learning_rate": 2.4300006040943956e-06, + "loss": 1.7815, + "step": 29432 + }, + { + "epoch": 9.034069981583794, + "grad_norm": 0.17041321098804474, + "learning_rate": 2.428470116019904e-06, + "loss": 1.7417, + "step": 29433 + }, + { + "epoch": 9.03437691835482, + "grad_norm": 0.19286702573299408, + "learning_rate": 2.426940098075148e-06, + "loss": 1.7186, + "step": 29434 + }, + { + "epoch": 9.034683855125843, + "grad_norm": 0.20875763893127441, + "learning_rate": 2.425410550275253e-06, + "loss": 1.7379, + "step": 29435 + }, + { + "epoch": 9.034990791896869, + "grad_norm": 0.16214729845523834, + "learning_rate": 2.4238814726353365e-06, + "loss": 1.7419, + "step": 29436 + }, + { + "epoch": 9.035297728667894, + "grad_norm": 0.16366153955459595, + "learning_rate": 2.422352865170513e-06, + "loss": 1.7399, + "step": 29437 + }, + { + "epoch": 9.03560466543892, + "grad_norm": 0.15280435979366302, + "learning_rate": 2.420824727895882e-06, + "loss": 1.6898, + "step": 29438 + }, + { + "epoch": 9.035911602209945, + "grad_norm": 0.1929275393486023, + "learning_rate": 2.4192970608265477e-06, + "loss": 1.7452, + "step": 29439 + }, + { + "epoch": 9.03621853898097, + "grad_norm": 0.15144196152687073, + "learning_rate": 
2.417769863977609e-06, + "loss": 1.6924, + "step": 29440 + }, + { + "epoch": 9.036525475751995, + "grad_norm": 0.11187378317117691, + "learning_rate": 2.4162431373641546e-06, + "loss": 1.6537, + "step": 29441 + }, + { + "epoch": 9.03683241252302, + "grad_norm": 0.14815855026245117, + "learning_rate": 2.4147168810012664e-06, + "loss": 1.7089, + "step": 29442 + }, + { + "epoch": 9.037139349294046, + "grad_norm": 0.18288609385490417, + "learning_rate": 2.413191094904055e-06, + "loss": 1.685, + "step": 29443 + }, + { + "epoch": 9.037446286065071, + "grad_norm": 0.13843944668769836, + "learning_rate": 2.4116657790875686e-06, + "loss": 1.6736, + "step": 29444 + }, + { + "epoch": 9.037753222836097, + "grad_norm": 0.11480217427015305, + "learning_rate": 2.410140933566901e-06, + "loss": 1.6416, + "step": 29445 + }, + { + "epoch": 9.03806015960712, + "grad_norm": 0.16542355716228485, + "learning_rate": 2.408616558357113e-06, + "loss": 1.7019, + "step": 29446 + }, + { + "epoch": 9.038367096378146, + "grad_norm": 0.1372150480747223, + "learning_rate": 2.4070926534732586e-06, + "loss": 1.6731, + "step": 29447 + }, + { + "epoch": 9.03867403314917, + "grad_norm": 0.16052548587322235, + "learning_rate": 2.4055692189304257e-06, + "loss": 1.7016, + "step": 29448 + }, + { + "epoch": 9.038980969920196, + "grad_norm": 0.14994394779205322, + "learning_rate": 2.4040462547436416e-06, + "loss": 1.7227, + "step": 29449 + }, + { + "epoch": 9.039287906691222, + "grad_norm": 0.1549554169178009, + "learning_rate": 2.4025237609279827e-06, + "loss": 1.7085, + "step": 29450 + }, + { + "epoch": 9.039594843462247, + "grad_norm": 0.13443107903003693, + "learning_rate": 2.401001737498465e-06, + "loss": 1.6809, + "step": 29451 + }, + { + "epoch": 9.039901780233272, + "grad_norm": 0.18695014715194702, + "learning_rate": 2.39948018447016e-06, + "loss": 1.7208, + "step": 29452 + }, + { + "epoch": 9.040208717004298, + "grad_norm": 0.1901451200246811, + "learning_rate": 2.397959101858083e-06, + "loss": 
1.7576, + "step": 29453 + }, + { + "epoch": 9.040515653775323, + "grad_norm": 0.13147258758544922, + "learning_rate": 2.396438489677283e-06, + "loss": 1.6713, + "step": 29454 + }, + { + "epoch": 9.040822590546348, + "grad_norm": 0.1695723831653595, + "learning_rate": 2.3949183479427704e-06, + "loss": 1.7205, + "step": 29455 + }, + { + "epoch": 9.041129527317372, + "grad_norm": 0.1526571363210678, + "learning_rate": 2.393398676669584e-06, + "loss": 1.6849, + "step": 29456 + }, + { + "epoch": 9.041436464088397, + "grad_norm": 0.13576491177082062, + "learning_rate": 2.3918794758727325e-06, + "loss": 1.6911, + "step": 29457 + }, + { + "epoch": 9.041743400859422, + "grad_norm": 0.15050055086612701, + "learning_rate": 2.390360745567233e-06, + "loss": 1.7058, + "step": 29458 + }, + { + "epoch": 9.042050337630448, + "grad_norm": 0.16959871351718903, + "learning_rate": 2.3888424857680837e-06, + "loss": 1.741, + "step": 29459 + }, + { + "epoch": 9.042357274401473, + "grad_norm": 0.1468123197555542, + "learning_rate": 2.3873246964903116e-06, + "loss": 1.6996, + "step": 29460 + }, + { + "epoch": 9.042664211172498, + "grad_norm": 0.14826932549476624, + "learning_rate": 2.385807377748894e-06, + "loss": 1.7072, + "step": 29461 + }, + { + "epoch": 9.042971147943524, + "grad_norm": 0.13068771362304688, + "learning_rate": 2.384290529558847e-06, + "loss": 1.6627, + "step": 29462 + }, + { + "epoch": 9.043278084714549, + "grad_norm": 0.18755924701690674, + "learning_rate": 2.382774151935141e-06, + "loss": 1.7313, + "step": 29463 + }, + { + "epoch": 9.043585021485574, + "grad_norm": 0.1287360042333603, + "learning_rate": 2.38125824489277e-06, + "loss": 1.6894, + "step": 29464 + }, + { + "epoch": 9.0438919582566, + "grad_norm": 0.1582459807395935, + "learning_rate": 2.3797428084467223e-06, + "loss": 1.7206, + "step": 29465 + }, + { + "epoch": 9.044198895027625, + "grad_norm": 0.20703738927841187, + "learning_rate": 2.3782278426119575e-06, + "loss": 1.7241, + "step": 29466 + }, + { + 
"epoch": 9.044505831798649, + "grad_norm": 0.14492042362689972, + "learning_rate": 2.3767133474034696e-06, + "loss": 1.7058, + "step": 29467 + }, + { + "epoch": 9.044812768569674, + "grad_norm": 0.16977067291736603, + "learning_rate": 2.375199322836197e-06, + "loss": 1.7059, + "step": 29468 + }, + { + "epoch": 9.0451197053407, + "grad_norm": 0.1448739618062973, + "learning_rate": 2.3736857689251267e-06, + "loss": 1.7367, + "step": 29469 + }, + { + "epoch": 9.045426642111725, + "grad_norm": 0.13738159835338593, + "learning_rate": 2.372172685685209e-06, + "loss": 1.7058, + "step": 29470 + }, + { + "epoch": 9.04573357888275, + "grad_norm": 0.1473991870880127, + "learning_rate": 2.3706600731313976e-06, + "loss": 1.6706, + "step": 29471 + }, + { + "epoch": 9.046040515653775, + "grad_norm": 0.18705418705940247, + "learning_rate": 2.369147931278637e-06, + "loss": 1.7088, + "step": 29472 + }, + { + "epoch": 9.0463474524248, + "grad_norm": 0.14573143422603607, + "learning_rate": 2.3676362601418757e-06, + "loss": 1.6886, + "step": 29473 + }, + { + "epoch": 9.046654389195826, + "grad_norm": 0.1586790531873703, + "learning_rate": 2.3661250597360518e-06, + "loss": 1.6573, + "step": 29474 + }, + { + "epoch": 9.046961325966851, + "grad_norm": 0.14579340815544128, + "learning_rate": 2.364614330076098e-06, + "loss": 1.7235, + "step": 29475 + }, + { + "epoch": 9.047268262737877, + "grad_norm": 0.11558994650840759, + "learning_rate": 2.3631040711769358e-06, + "loss": 1.6597, + "step": 29476 + }, + { + "epoch": 9.047575199508902, + "grad_norm": 0.1311790943145752, + "learning_rate": 2.36159428305352e-06, + "loss": 1.6846, + "step": 29477 + }, + { + "epoch": 9.047882136279926, + "grad_norm": 0.17676955461502075, + "learning_rate": 2.3600849657207323e-06, + "loss": 1.7511, + "step": 29478 + }, + { + "epoch": 9.04818907305095, + "grad_norm": 0.1472693681716919, + "learning_rate": 2.358576119193523e-06, + "loss": 1.6836, + "step": 29479 + }, + { + "epoch": 9.048496009821976, + 
"grad_norm": 0.15737339854240417, + "learning_rate": 2.3570677434867795e-06, + "loss": 1.7285, + "step": 29480 + }, + { + "epoch": 9.048802946593002, + "grad_norm": 0.17748746275901794, + "learning_rate": 2.355559838615412e-06, + "loss": 1.7267, + "step": 29481 + }, + { + "epoch": 9.049109883364027, + "grad_norm": 0.12016935646533966, + "learning_rate": 2.3540524045943425e-06, + "loss": 1.6677, + "step": 29482 + }, + { + "epoch": 9.049416820135052, + "grad_norm": 0.1696930080652237, + "learning_rate": 2.352545441438442e-06, + "loss": 1.6765, + "step": 29483 + }, + { + "epoch": 9.049723756906078, + "grad_norm": 0.17330607771873474, + "learning_rate": 2.3510389491626208e-06, + "loss": 1.727, + "step": 29484 + }, + { + "epoch": 9.050030693677103, + "grad_norm": 0.14688768982887268, + "learning_rate": 2.3495329277817502e-06, + "loss": 1.7149, + "step": 29485 + }, + { + "epoch": 9.050337630448128, + "grad_norm": 0.14381086826324463, + "learning_rate": 2.3480273773107297e-06, + "loss": 1.6448, + "step": 29486 + }, + { + "epoch": 9.050644567219154, + "grad_norm": 0.14638835191726685, + "learning_rate": 2.3465222977644364e-06, + "loss": 1.6979, + "step": 29487 + }, + { + "epoch": 9.050951503990179, + "grad_norm": 0.13770419359207153, + "learning_rate": 2.345017689157736e-06, + "loss": 1.6991, + "step": 29488 + }, + { + "epoch": 9.051258440761202, + "grad_norm": 0.16549327969551086, + "learning_rate": 2.3435135515055053e-06, + "loss": 1.681, + "step": 29489 + }, + { + "epoch": 9.051565377532228, + "grad_norm": 0.19915145635604858, + "learning_rate": 2.3420098848226046e-06, + "loss": 1.7726, + "step": 29490 + }, + { + "epoch": 9.051872314303253, + "grad_norm": 0.15350405871868134, + "learning_rate": 2.3405066891238945e-06, + "loss": 1.7159, + "step": 29491 + }, + { + "epoch": 9.052179251074278, + "grad_norm": 0.1314122974872589, + "learning_rate": 2.3390039644242356e-06, + "loss": 1.7224, + "step": 29492 + }, + { + "epoch": 9.052486187845304, + "grad_norm": 
0.18343986570835114, + "learning_rate": 2.3375017107384655e-06, + "loss": 1.7572, + "step": 29493 + }, + { + "epoch": 9.05279312461633, + "grad_norm": 0.1556810587644577, + "learning_rate": 2.3359999280814506e-06, + "loss": 1.6476, + "step": 29494 + }, + { + "epoch": 9.053100061387354, + "grad_norm": 0.11017484217882156, + "learning_rate": 2.334498616468017e-06, + "loss": 1.6544, + "step": 29495 + }, + { + "epoch": 9.05340699815838, + "grad_norm": 0.1391851007938385, + "learning_rate": 2.332997775913004e-06, + "loss": 1.6594, + "step": 29496 + }, + { + "epoch": 9.053713934929405, + "grad_norm": 0.1584119200706482, + "learning_rate": 2.3314974064312433e-06, + "loss": 1.7588, + "step": 29497 + }, + { + "epoch": 9.05402087170043, + "grad_norm": 0.10139171034097672, + "learning_rate": 2.3299975080375625e-06, + "loss": 1.6621, + "step": 29498 + }, + { + "epoch": 9.054327808471454, + "grad_norm": 0.14895425736904144, + "learning_rate": 2.3284980807467994e-06, + "loss": 1.7145, + "step": 29499 + }, + { + "epoch": 9.05463474524248, + "grad_norm": 0.11982736736536026, + "learning_rate": 2.326999124573742e-06, + "loss": 1.6654, + "step": 29500 + }, + { + "epoch": 9.054941682013505, + "grad_norm": 0.15541890263557434, + "learning_rate": 2.32550063953324e-06, + "loss": 1.6889, + "step": 29501 + }, + { + "epoch": 9.05524861878453, + "grad_norm": 0.13237549364566803, + "learning_rate": 2.324002625640065e-06, + "loss": 1.7191, + "step": 29502 + }, + { + "epoch": 9.055555555555555, + "grad_norm": 0.16847456991672516, + "learning_rate": 2.3225050829090546e-06, + "loss": 1.7064, + "step": 29503 + }, + { + "epoch": 9.05586249232658, + "grad_norm": 0.16782483458518982, + "learning_rate": 2.321008011354986e-06, + "loss": 1.7303, + "step": 29504 + }, + { + "epoch": 9.056169429097606, + "grad_norm": 0.1684166043996811, + "learning_rate": 2.3195114109926643e-06, + "loss": 1.7071, + "step": 29505 + }, + { + "epoch": 9.056476365868631, + "grad_norm": 0.11413996666669846, + "learning_rate": 
2.3180152818368774e-06, + "loss": 1.6664, + "step": 29506 + }, + { + "epoch": 9.056783302639657, + "grad_norm": 0.14353851974010468, + "learning_rate": 2.316519623902408e-06, + "loss": 1.7375, + "step": 29507 + }, + { + "epoch": 9.057090239410682, + "grad_norm": 0.20431090891361237, + "learning_rate": 2.315024437204044e-06, + "loss": 1.7307, + "step": 29508 + }, + { + "epoch": 9.057397176181707, + "grad_norm": 0.1507789045572281, + "learning_rate": 2.3135297217565576e-06, + "loss": 1.6582, + "step": 29509 + }, + { + "epoch": 9.057704112952731, + "grad_norm": 0.1449059545993805, + "learning_rate": 2.3120354775747143e-06, + "loss": 1.6808, + "step": 29510 + }, + { + "epoch": 9.058011049723756, + "grad_norm": 0.11667517572641373, + "learning_rate": 2.3105417046732915e-06, + "loss": 1.7207, + "step": 29511 + }, + { + "epoch": 9.058317986494782, + "grad_norm": 0.13248896598815918, + "learning_rate": 2.3090484030670488e-06, + "loss": 1.6908, + "step": 29512 + }, + { + "epoch": 9.058624923265807, + "grad_norm": 0.1595017910003662, + "learning_rate": 2.307555572770742e-06, + "loss": 1.7026, + "step": 29513 + }, + { + "epoch": 9.058931860036832, + "grad_norm": 0.22244125604629517, + "learning_rate": 2.3060632137991257e-06, + "loss": 1.7571, + "step": 29514 + }, + { + "epoch": 9.059238796807858, + "grad_norm": 0.1424504965543747, + "learning_rate": 2.3045713261669433e-06, + "loss": 1.701, + "step": 29515 + }, + { + "epoch": 9.059545733578883, + "grad_norm": 0.12159547954797745, + "learning_rate": 2.3030799098889444e-06, + "loss": 1.7167, + "step": 29516 + }, + { + "epoch": 9.059852670349908, + "grad_norm": 0.1438741683959961, + "learning_rate": 2.301588964979856e-06, + "loss": 1.6736, + "step": 29517 + }, + { + "epoch": 9.060159607120934, + "grad_norm": 0.19870363175868988, + "learning_rate": 2.3000984914544386e-06, + "loss": 1.7801, + "step": 29518 + }, + { + "epoch": 9.060466543891959, + "grad_norm": 0.14005307853221893, + "learning_rate": 2.298608489327392e-06, + "loss": 
1.6933, + "step": 29519 + }, + { + "epoch": 9.060773480662984, + "grad_norm": 0.15449295938014984, + "learning_rate": 2.297118958613459e-06, + "loss": 1.6894, + "step": 29520 + }, + { + "epoch": 9.061080417434008, + "grad_norm": 0.15363426506519318, + "learning_rate": 2.2956298993273615e-06, + "loss": 1.6945, + "step": 29521 + }, + { + "epoch": 9.061387354205033, + "grad_norm": 0.20762746036052704, + "learning_rate": 2.294141311483805e-06, + "loss": 1.6774, + "step": 29522 + }, + { + "epoch": 9.061694290976058, + "grad_norm": 0.1773165762424469, + "learning_rate": 2.2926531950975107e-06, + "loss": 1.7868, + "step": 29523 + }, + { + "epoch": 9.062001227747084, + "grad_norm": 0.13610224425792694, + "learning_rate": 2.291165550183172e-06, + "loss": 1.6945, + "step": 29524 + }, + { + "epoch": 9.06230816451811, + "grad_norm": 0.13063403964042664, + "learning_rate": 2.2896783767555053e-06, + "loss": 1.684, + "step": 29525 + }, + { + "epoch": 9.062615101289135, + "grad_norm": 0.1523241400718689, + "learning_rate": 2.2881916748291987e-06, + "loss": 1.7392, + "step": 29526 + }, + { + "epoch": 9.06292203806016, + "grad_norm": 0.17883025109767914, + "learning_rate": 2.286705444418946e-06, + "loss": 1.7474, + "step": 29527 + }, + { + "epoch": 9.063228974831185, + "grad_norm": 0.14900827407836914, + "learning_rate": 2.2852196855394358e-06, + "loss": 1.7096, + "step": 29528 + }, + { + "epoch": 9.06353591160221, + "grad_norm": 0.1691586673259735, + "learning_rate": 2.2837343982053503e-06, + "loss": 1.7373, + "step": 29529 + }, + { + "epoch": 9.063842848373236, + "grad_norm": 0.1183643490076065, + "learning_rate": 2.282249582431367e-06, + "loss": 1.6689, + "step": 29530 + }, + { + "epoch": 9.06414978514426, + "grad_norm": 0.16844353079795837, + "learning_rate": 2.280765238232163e-06, + "loss": 1.7345, + "step": 29531 + }, + { + "epoch": 9.064456721915285, + "grad_norm": 0.13235628604888916, + "learning_rate": 2.27928136562241e-06, + "loss": 1.69, + "step": 29532 + }, + { + 
"epoch": 9.06476365868631, + "grad_norm": 0.13285794854164124, + "learning_rate": 2.277797964616768e-06, + "loss": 1.6842, + "step": 29533 + }, + { + "epoch": 9.065070595457335, + "grad_norm": 0.13197976350784302, + "learning_rate": 2.2763150352298866e-06, + "loss": 1.7036, + "step": 29534 + }, + { + "epoch": 9.06537753222836, + "grad_norm": 0.13822008669376373, + "learning_rate": 2.274832577476449e-06, + "loss": 1.7079, + "step": 29535 + }, + { + "epoch": 9.065684468999386, + "grad_norm": 0.14020980894565582, + "learning_rate": 2.2733505913710705e-06, + "loss": 1.716, + "step": 29536 + }, + { + "epoch": 9.065991405770411, + "grad_norm": 0.13733944296836853, + "learning_rate": 2.271869076928429e-06, + "loss": 1.7167, + "step": 29537 + }, + { + "epoch": 9.066298342541437, + "grad_norm": 0.13786739110946655, + "learning_rate": 2.27038803416314e-06, + "loss": 1.7111, + "step": 29538 + }, + { + "epoch": 9.066605279312462, + "grad_norm": 0.17205199599266052, + "learning_rate": 2.268907463089859e-06, + "loss": 1.7095, + "step": 29539 + }, + { + "epoch": 9.066912216083487, + "grad_norm": 0.16810791194438934, + "learning_rate": 2.2674273637232123e-06, + "loss": 1.7002, + "step": 29540 + }, + { + "epoch": 9.067219152854513, + "grad_norm": 0.15370075404644012, + "learning_rate": 2.2659477360778226e-06, + "loss": 1.7309, + "step": 29541 + }, + { + "epoch": 9.067526089625536, + "grad_norm": 0.18854720890522003, + "learning_rate": 2.2644685801683165e-06, + "loss": 1.7731, + "step": 29542 + }, + { + "epoch": 9.067833026396562, + "grad_norm": 0.14275872707366943, + "learning_rate": 2.2629898960093097e-06, + "loss": 1.7042, + "step": 29543 + }, + { + "epoch": 9.068139963167587, + "grad_norm": 0.13044105470180511, + "learning_rate": 2.261511683615414e-06, + "loss": 1.6886, + "step": 29544 + }, + { + "epoch": 9.068446899938612, + "grad_norm": 0.1964588612318039, + "learning_rate": 2.2600339430012442e-06, + "loss": 1.756, + "step": 29545 + }, + { + "epoch": 9.068753836709638, + 
"grad_norm": 0.15589101612567902, + "learning_rate": 2.2585566741814e-06, + "loss": 1.6922, + "step": 29546 + }, + { + "epoch": 9.069060773480663, + "grad_norm": 0.1840185523033142, + "learning_rate": 2.257079877170476e-06, + "loss": 1.7668, + "step": 29547 + }, + { + "epoch": 9.069367710251688, + "grad_norm": 0.11688835173845291, + "learning_rate": 2.2556035519830765e-06, + "loss": 1.6866, + "step": 29548 + }, + { + "epoch": 9.069674647022714, + "grad_norm": 0.16568957269191742, + "learning_rate": 2.2541276986337844e-06, + "loss": 1.7353, + "step": 29549 + }, + { + "epoch": 9.069981583793739, + "grad_norm": 0.1312425136566162, + "learning_rate": 2.252652317137188e-06, + "loss": 1.7476, + "step": 29550 + }, + { + "epoch": 9.070288520564764, + "grad_norm": 0.12554149329662323, + "learning_rate": 2.251177407507865e-06, + "loss": 1.6452, + "step": 29551 + }, + { + "epoch": 9.07059545733579, + "grad_norm": 0.14966057240962982, + "learning_rate": 2.249702969760398e-06, + "loss": 1.7362, + "step": 29552 + }, + { + "epoch": 9.070902394106813, + "grad_norm": 0.1935591846704483, + "learning_rate": 2.248229003909347e-06, + "loss": 1.7284, + "step": 29553 + }, + { + "epoch": 9.071209330877839, + "grad_norm": 0.1565311849117279, + "learning_rate": 2.246755509969295e-06, + "loss": 1.7068, + "step": 29554 + }, + { + "epoch": 9.071516267648864, + "grad_norm": 0.14980174601078033, + "learning_rate": 2.2452824879547806e-06, + "loss": 1.7455, + "step": 29555 + }, + { + "epoch": 9.07182320441989, + "grad_norm": 0.16639313101768494, + "learning_rate": 2.243809937880381e-06, + "loss": 1.7515, + "step": 29556 + }, + { + "epoch": 9.072130141190915, + "grad_norm": 0.12895835936069489, + "learning_rate": 2.242337859760646e-06, + "loss": 1.6401, + "step": 29557 + }, + { + "epoch": 9.07243707796194, + "grad_norm": 0.175545796751976, + "learning_rate": 2.240866253610119e-06, + "loss": 1.6815, + "step": 29558 + }, + { + "epoch": 9.072744014732965, + "grad_norm": 0.16137781739234924, + 
"learning_rate": 2.2393951194433437e-06, + "loss": 1.7161, + "step": 29559 + }, + { + "epoch": 9.07305095150399, + "grad_norm": 0.15323428809642792, + "learning_rate": 2.2379244572748536e-06, + "loss": 1.6917, + "step": 29560 + }, + { + "epoch": 9.073357888275016, + "grad_norm": 0.13572439551353455, + "learning_rate": 2.2364542671191978e-06, + "loss": 1.732, + "step": 29561 + }, + { + "epoch": 9.073664825046041, + "grad_norm": 0.1529226154088974, + "learning_rate": 2.234984548990887e-06, + "loss": 1.7082, + "step": 29562 + }, + { + "epoch": 9.073971761817067, + "grad_norm": 0.16901282966136932, + "learning_rate": 2.2335153029044598e-06, + "loss": 1.6796, + "step": 29563 + }, + { + "epoch": 9.07427869858809, + "grad_norm": 0.120974101126194, + "learning_rate": 2.2320465288744317e-06, + "loss": 1.6871, + "step": 29564 + }, + { + "epoch": 9.074585635359115, + "grad_norm": 0.1303488165140152, + "learning_rate": 2.2305782269153143e-06, + "loss": 1.7193, + "step": 29565 + }, + { + "epoch": 9.07489257213014, + "grad_norm": 0.13454987108707428, + "learning_rate": 2.2291103970416227e-06, + "loss": 1.6841, + "step": 29566 + }, + { + "epoch": 9.075199508901166, + "grad_norm": 0.14908376336097717, + "learning_rate": 2.2276430392678628e-06, + "loss": 1.7352, + "step": 29567 + }, + { + "epoch": 9.075506445672191, + "grad_norm": 0.16618986427783966, + "learning_rate": 2.226176153608528e-06, + "loss": 1.711, + "step": 29568 + }, + { + "epoch": 9.075813382443217, + "grad_norm": 0.1883801370859146, + "learning_rate": 2.224709740078135e-06, + "loss": 1.7297, + "step": 29569 + }, + { + "epoch": 9.076120319214242, + "grad_norm": 0.16342709958553314, + "learning_rate": 2.2232437986911492e-06, + "loss": 1.7207, + "step": 29570 + }, + { + "epoch": 9.076427255985267, + "grad_norm": 0.16771680116653442, + "learning_rate": 2.221778329462082e-06, + "loss": 1.6875, + "step": 29571 + }, + { + "epoch": 9.076734192756293, + "grad_norm": 0.1790522187948227, + "learning_rate": 
2.2203133324053936e-06, + "loss": 1.675, + "step": 29572 + }, + { + "epoch": 9.077041129527318, + "grad_norm": 0.1973496973514557, + "learning_rate": 2.2188488075355785e-06, + "loss": 1.7385, + "step": 29573 + }, + { + "epoch": 9.077348066298342, + "grad_norm": 0.1493360847234726, + "learning_rate": 2.2173847548671077e-06, + "loss": 1.6645, + "step": 29574 + }, + { + "epoch": 9.077655003069367, + "grad_norm": 0.18652872741222382, + "learning_rate": 2.2159211744144424e-06, + "loss": 1.739, + "step": 29575 + }, + { + "epoch": 9.077961939840392, + "grad_norm": 0.1569397747516632, + "learning_rate": 2.2144580661920544e-06, + "loss": 1.6857, + "step": 29576 + }, + { + "epoch": 9.078268876611418, + "grad_norm": 0.14565426111221313, + "learning_rate": 2.212995430214404e-06, + "loss": 1.6808, + "step": 29577 + }, + { + "epoch": 9.078575813382443, + "grad_norm": 0.186843141913414, + "learning_rate": 2.2115332664959353e-06, + "loss": 1.6877, + "step": 29578 + }, + { + "epoch": 9.078882750153468, + "grad_norm": 0.11076909303665161, + "learning_rate": 2.2100715750511038e-06, + "loss": 1.6692, + "step": 29579 + }, + { + "epoch": 9.079189686924494, + "grad_norm": 0.15020497143268585, + "learning_rate": 2.2086103558943583e-06, + "loss": 1.71, + "step": 29580 + }, + { + "epoch": 9.079496623695519, + "grad_norm": 0.17952266335487366, + "learning_rate": 2.207149609040138e-06, + "loss": 1.6728, + "step": 29581 + }, + { + "epoch": 9.079803560466544, + "grad_norm": 0.14447824656963348, + "learning_rate": 2.205689334502875e-06, + "loss": 1.679, + "step": 29582 + }, + { + "epoch": 9.08011049723757, + "grad_norm": 0.12692491710186005, + "learning_rate": 2.204229532297003e-06, + "loss": 1.6881, + "step": 29583 + }, + { + "epoch": 9.080417434008595, + "grad_norm": 0.15565918385982513, + "learning_rate": 2.2027702024369547e-06, + "loss": 1.6702, + "step": 29584 + }, + { + "epoch": 9.080724370779619, + "grad_norm": 0.14430411159992218, + "learning_rate": 2.201311344937135e-06, + "loss": 
1.6992, + "step": 29585 + }, + { + "epoch": 9.081031307550644, + "grad_norm": 0.2374502569437027, + "learning_rate": 2.1998529598119823e-06, + "loss": 1.834, + "step": 29586 + }, + { + "epoch": 9.08133824432167, + "grad_norm": 0.13957563042640686, + "learning_rate": 2.1983950470758907e-06, + "loss": 1.6617, + "step": 29587 + }, + { + "epoch": 9.081645181092695, + "grad_norm": 0.16792097687721252, + "learning_rate": 2.196937606743288e-06, + "loss": 1.7357, + "step": 29588 + }, + { + "epoch": 9.08195211786372, + "grad_norm": 0.18628741800785065, + "learning_rate": 2.195480638828551e-06, + "loss": 1.728, + "step": 29589 + }, + { + "epoch": 9.082259054634745, + "grad_norm": 0.1528443992137909, + "learning_rate": 2.1940241433461016e-06, + "loss": 1.7374, + "step": 29590 + }, + { + "epoch": 9.08256599140577, + "grad_norm": 0.1556825041770935, + "learning_rate": 2.1925681203103287e-06, + "loss": 1.744, + "step": 29591 + }, + { + "epoch": 9.082872928176796, + "grad_norm": 0.14697785675525665, + "learning_rate": 2.191112569735615e-06, + "loss": 1.6721, + "step": 29592 + }, + { + "epoch": 9.083179864947821, + "grad_norm": 0.18705244362354279, + "learning_rate": 2.1896574916363488e-06, + "loss": 1.7284, + "step": 29593 + }, + { + "epoch": 9.083486801718847, + "grad_norm": 0.2209276407957077, + "learning_rate": 2.188202886026908e-06, + "loss": 1.7261, + "step": 29594 + }, + { + "epoch": 9.083793738489872, + "grad_norm": 0.13894188404083252, + "learning_rate": 2.1867487529216748e-06, + "loss": 1.7112, + "step": 29595 + }, + { + "epoch": 9.084100675260895, + "grad_norm": 0.13467033207416534, + "learning_rate": 2.18529509233501e-06, + "loss": 1.6566, + "step": 29596 + }, + { + "epoch": 9.08440761203192, + "grad_norm": 0.11996985971927643, + "learning_rate": 2.1838419042812862e-06, + "loss": 1.652, + "step": 29597 + }, + { + "epoch": 9.084714548802946, + "grad_norm": 0.12615782022476196, + "learning_rate": 2.182389188774864e-06, + "loss": 1.7007, + "step": 29598 + }, + { + 
"epoch": 9.085021485573971, + "grad_norm": 0.15084239840507507, + "learning_rate": 2.1809369458300976e-06, + "loss": 1.6605, + "step": 29599 + }, + { + "epoch": 9.085328422344997, + "grad_norm": 0.15964055061340332, + "learning_rate": 2.1794851754613375e-06, + "loss": 1.7452, + "step": 29600 + }, + { + "epoch": 9.085635359116022, + "grad_norm": 0.15842875838279724, + "learning_rate": 2.178033877682939e-06, + "loss": 1.7223, + "step": 29601 + }, + { + "epoch": 9.085942295887047, + "grad_norm": 0.1889149248600006, + "learning_rate": 2.176583052509229e-06, + "loss": 1.7568, + "step": 29602 + }, + { + "epoch": 9.086249232658073, + "grad_norm": 0.14189517498016357, + "learning_rate": 2.1751326999545683e-06, + "loss": 1.7275, + "step": 29603 + }, + { + "epoch": 9.086556169429098, + "grad_norm": 0.13787707686424255, + "learning_rate": 2.1736828200332625e-06, + "loss": 1.7101, + "step": 29604 + }, + { + "epoch": 9.086863106200123, + "grad_norm": 0.1241447925567627, + "learning_rate": 2.1722334127596723e-06, + "loss": 1.6529, + "step": 29605 + }, + { + "epoch": 9.087170042971149, + "grad_norm": 0.14460496604442596, + "learning_rate": 2.1707844781480858e-06, + "loss": 1.6891, + "step": 29606 + }, + { + "epoch": 9.087476979742172, + "grad_norm": 0.1723712533712387, + "learning_rate": 2.169336016212853e-06, + "loss": 1.6892, + "step": 29607 + }, + { + "epoch": 9.087783916513198, + "grad_norm": 0.20372143387794495, + "learning_rate": 2.1678880269682734e-06, + "loss": 1.7786, + "step": 29608 + }, + { + "epoch": 9.088090853284223, + "grad_norm": 0.1281127631664276, + "learning_rate": 2.166440510428658e-06, + "loss": 1.6971, + "step": 29609 + }, + { + "epoch": 9.088397790055248, + "grad_norm": 0.17418532073497772, + "learning_rate": 2.1649934666083118e-06, + "loss": 1.6813, + "step": 29610 + }, + { + "epoch": 9.088704726826274, + "grad_norm": 0.13917995989322662, + "learning_rate": 2.16354689552154e-06, + "loss": 1.6787, + "step": 29611 + }, + { + "epoch": 9.089011663597299, + 
"grad_norm": 0.12206067144870758, + "learning_rate": 2.1621007971826367e-06, + "loss": 1.6792, + "step": 29612 + }, + { + "epoch": 9.089318600368324, + "grad_norm": 0.14317838847637177, + "learning_rate": 2.1606551716058907e-06, + "loss": 1.732, + "step": 29613 + }, + { + "epoch": 9.08962553713935, + "grad_norm": 0.1607116013765335, + "learning_rate": 2.1592100188055907e-06, + "loss": 1.6927, + "step": 29614 + }, + { + "epoch": 9.089932473910375, + "grad_norm": 0.14611779153347015, + "learning_rate": 2.1577653387960197e-06, + "loss": 1.7165, + "step": 29615 + }, + { + "epoch": 9.0902394106814, + "grad_norm": 0.15042389929294586, + "learning_rate": 2.156321131591449e-06, + "loss": 1.7221, + "step": 29616 + }, + { + "epoch": 9.090546347452424, + "grad_norm": 0.1669636368751526, + "learning_rate": 2.1548773972061563e-06, + "loss": 1.714, + "step": 29617 + }, + { + "epoch": 9.09085328422345, + "grad_norm": 0.22214718163013458, + "learning_rate": 2.1534341356544086e-06, + "loss": 1.6956, + "step": 29618 + }, + { + "epoch": 9.091160220994475, + "grad_norm": 0.16929394006729126, + "learning_rate": 2.151991346950466e-06, + "loss": 1.7095, + "step": 29619 + }, + { + "epoch": 9.0914671577655, + "grad_norm": 0.15387636423110962, + "learning_rate": 2.150549031108595e-06, + "loss": 1.7566, + "step": 29620 + }, + { + "epoch": 9.091774094536525, + "grad_norm": 0.19231966137886047, + "learning_rate": 2.1491071881430348e-06, + "loss": 1.7671, + "step": 29621 + }, + { + "epoch": 9.09208103130755, + "grad_norm": 0.15853071212768555, + "learning_rate": 2.1476658180680566e-06, + "loss": 1.7098, + "step": 29622 + }, + { + "epoch": 9.092387968078576, + "grad_norm": 0.11180046200752258, + "learning_rate": 2.146224920897877e-06, + "loss": 1.6501, + "step": 29623 + }, + { + "epoch": 9.092694904849601, + "grad_norm": 0.13134215772151947, + "learning_rate": 2.1447844966467625e-06, + "loss": 1.7099, + "step": 29624 + }, + { + "epoch": 9.093001841620627, + "grad_norm": 0.1667555719614029, + 
"learning_rate": 2.143344545328929e-06, + "loss": 1.7205, + "step": 29625 + }, + { + "epoch": 9.093308778391652, + "grad_norm": 0.18456818163394928, + "learning_rate": 2.1419050669586216e-06, + "loss": 1.737, + "step": 29626 + }, + { + "epoch": 9.093615715162677, + "grad_norm": 0.1580527424812317, + "learning_rate": 2.1404660615500506e-06, + "loss": 1.7092, + "step": 29627 + }, + { + "epoch": 9.0939226519337, + "grad_norm": 0.1242590993642807, + "learning_rate": 2.1390275291174542e-06, + "loss": 1.6641, + "step": 29628 + }, + { + "epoch": 9.094229588704726, + "grad_norm": 0.10987458378076553, + "learning_rate": 2.137589469675033e-06, + "loss": 1.6275, + "step": 29629 + }, + { + "epoch": 9.094536525475752, + "grad_norm": 0.1660260111093521, + "learning_rate": 2.1361518832370087e-06, + "loss": 1.7781, + "step": 29630 + }, + { + "epoch": 9.094843462246777, + "grad_norm": 0.11643832921981812, + "learning_rate": 2.134714769817586e-06, + "loss": 1.6596, + "step": 29631 + }, + { + "epoch": 9.095150399017802, + "grad_norm": 0.13046015799045563, + "learning_rate": 2.1332781294309654e-06, + "loss": 1.7046, + "step": 29632 + }, + { + "epoch": 9.095457335788828, + "grad_norm": 0.12697182595729828, + "learning_rate": 2.1318419620913466e-06, + "loss": 1.6422, + "step": 29633 + }, + { + "epoch": 9.095764272559853, + "grad_norm": 0.15039731562137604, + "learning_rate": 2.1304062678129233e-06, + "loss": 1.7088, + "step": 29634 + }, + { + "epoch": 9.096071209330878, + "grad_norm": 0.12595078349113464, + "learning_rate": 2.128971046609879e-06, + "loss": 1.7156, + "step": 29635 + }, + { + "epoch": 9.096378146101904, + "grad_norm": 0.13836300373077393, + "learning_rate": 2.1275362984963966e-06, + "loss": 1.7024, + "step": 29636 + }, + { + "epoch": 9.096685082872929, + "grad_norm": 0.15840092301368713, + "learning_rate": 2.12610202348667e-06, + "loss": 1.7581, + "step": 29637 + }, + { + "epoch": 9.096992019643954, + "grad_norm": 0.11084351688623428, + "learning_rate": 
2.1246682215948556e-06, + "loss": 1.6444, + "step": 29638 + }, + { + "epoch": 9.097298956414978, + "grad_norm": 0.148295059800148, + "learning_rate": 2.1232348928351353e-06, + "loss": 1.6947, + "step": 29639 + }, + { + "epoch": 9.097605893186003, + "grad_norm": 0.1266920119524002, + "learning_rate": 2.121802037221665e-06, + "loss": 1.6838, + "step": 29640 + }, + { + "epoch": 9.097912829957028, + "grad_norm": 0.15683111548423767, + "learning_rate": 2.1203696547686116e-06, + "loss": 1.7522, + "step": 29641 + }, + { + "epoch": 9.098219766728054, + "grad_norm": 0.15225628018379211, + "learning_rate": 2.118937745490124e-06, + "loss": 1.7464, + "step": 29642 + }, + { + "epoch": 9.098526703499079, + "grad_norm": 0.12527376413345337, + "learning_rate": 2.1175063094003632e-06, + "loss": 1.6595, + "step": 29643 + }, + { + "epoch": 9.098833640270104, + "grad_norm": 0.13361993432044983, + "learning_rate": 2.1160753465134685e-06, + "loss": 1.6694, + "step": 29644 + }, + { + "epoch": 9.09914057704113, + "grad_norm": 0.22824819386005402, + "learning_rate": 2.114644856843584e-06, + "loss": 1.7497, + "step": 29645 + }, + { + "epoch": 9.099447513812155, + "grad_norm": 0.14481870830059052, + "learning_rate": 2.1132148404048424e-06, + "loss": 1.7107, + "step": 29646 + }, + { + "epoch": 9.09975445058318, + "grad_norm": 0.12271635234355927, + "learning_rate": 2.1117852972113828e-06, + "loss": 1.7115, + "step": 29647 + }, + { + "epoch": 9.100061387354206, + "grad_norm": 0.15120261907577515, + "learning_rate": 2.110356227277327e-06, + "loss": 1.7039, + "step": 29648 + }, + { + "epoch": 9.10036832412523, + "grad_norm": 0.15357789397239685, + "learning_rate": 2.1089276306168025e-06, + "loss": 1.7183, + "step": 29649 + }, + { + "epoch": 9.100675260896255, + "grad_norm": 0.1110130101442337, + "learning_rate": 2.1074995072439207e-06, + "loss": 1.6682, + "step": 29650 + }, + { + "epoch": 9.10098219766728, + "grad_norm": 0.14395900070667267, + "learning_rate": 2.106071857172803e-06, + "loss": 
1.6806, + "step": 29651 + }, + { + "epoch": 9.101289134438305, + "grad_norm": 0.14335933327674866, + "learning_rate": 2.1046446804175555e-06, + "loss": 1.714, + "step": 29652 + }, + { + "epoch": 9.10159607120933, + "grad_norm": 0.12558147311210632, + "learning_rate": 2.103217976992272e-06, + "loss": 1.6841, + "step": 29653 + }, + { + "epoch": 9.101903007980356, + "grad_norm": 0.122133269906044, + "learning_rate": 2.101791746911075e-06, + "loss": 1.6745, + "step": 29654 + }, + { + "epoch": 9.102209944751381, + "grad_norm": 0.13860413432121277, + "learning_rate": 2.1003659901880357e-06, + "loss": 1.724, + "step": 29655 + }, + { + "epoch": 9.102516881522407, + "grad_norm": 0.16283336281776428, + "learning_rate": 2.098940706837266e-06, + "loss": 1.7328, + "step": 29656 + }, + { + "epoch": 9.102823818293432, + "grad_norm": 0.16371138393878937, + "learning_rate": 2.097515896872826e-06, + "loss": 1.7541, + "step": 29657 + }, + { + "epoch": 9.103130755064457, + "grad_norm": 0.1482359617948532, + "learning_rate": 2.096091560308816e-06, + "loss": 1.721, + "step": 29658 + }, + { + "epoch": 9.103437691835483, + "grad_norm": 0.15985986590385437, + "learning_rate": 2.0946676971593083e-06, + "loss": 1.7878, + "step": 29659 + }, + { + "epoch": 9.103744628606506, + "grad_norm": 0.1820739209651947, + "learning_rate": 2.0932443074383747e-06, + "loss": 1.717, + "step": 29660 + }, + { + "epoch": 9.104051565377532, + "grad_norm": 0.14114773273468018, + "learning_rate": 2.091821391160076e-06, + "loss": 1.6881, + "step": 29661 + }, + { + "epoch": 9.104358502148557, + "grad_norm": 0.14509153366088867, + "learning_rate": 2.090398948338479e-06, + "loss": 1.7082, + "step": 29662 + }, + { + "epoch": 9.104665438919582, + "grad_norm": 0.1653892993927002, + "learning_rate": 2.088976978987639e-06, + "loss": 1.747, + "step": 29663 + }, + { + "epoch": 9.104972375690608, + "grad_norm": 0.1548600047826767, + "learning_rate": 2.0875554831216116e-06, + "loss": 1.6506, + "step": 29664 + }, + { + "epoch": 
9.105279312461633, + "grad_norm": 0.15069860219955444, + "learning_rate": 2.086134460754446e-06, + "loss": 1.681, + "step": 29665 + }, + { + "epoch": 9.105586249232658, + "grad_norm": 0.17018845677375793, + "learning_rate": 2.0847139119001824e-06, + "loss": 1.7066, + "step": 29666 + }, + { + "epoch": 9.105893186003684, + "grad_norm": 0.137167289853096, + "learning_rate": 2.0832938365728582e-06, + "loss": 1.6858, + "step": 29667 + }, + { + "epoch": 9.106200122774709, + "grad_norm": 0.13983163237571716, + "learning_rate": 2.081874234786507e-06, + "loss": 1.7306, + "step": 29668 + }, + { + "epoch": 9.106507059545734, + "grad_norm": 0.20317591726779938, + "learning_rate": 2.0804551065551626e-06, + "loss": 1.7506, + "step": 29669 + }, + { + "epoch": 9.10681399631676, + "grad_norm": 0.16218522191047668, + "learning_rate": 2.0790364518928406e-06, + "loss": 1.7475, + "step": 29670 + }, + { + "epoch": 9.107120933087783, + "grad_norm": 0.11892718076705933, + "learning_rate": 2.0776182708135805e-06, + "loss": 1.6876, + "step": 29671 + }, + { + "epoch": 9.107427869858808, + "grad_norm": 0.13815937936306, + "learning_rate": 2.076200563331371e-06, + "loss": 1.7187, + "step": 29672 + }, + { + "epoch": 9.107734806629834, + "grad_norm": 0.12870736420154572, + "learning_rate": 2.07478332946025e-06, + "loss": 1.6836, + "step": 29673 + }, + { + "epoch": 9.10804174340086, + "grad_norm": 0.13736192882061005, + "learning_rate": 2.0733665692142024e-06, + "loss": 1.6865, + "step": 29674 + }, + { + "epoch": 9.108348680171884, + "grad_norm": 0.12006348371505737, + "learning_rate": 2.071950282607238e-06, + "loss": 1.6911, + "step": 29675 + }, + { + "epoch": 9.10865561694291, + "grad_norm": 0.16973024606704712, + "learning_rate": 2.070534469653351e-06, + "loss": 1.7203, + "step": 29676 + }, + { + "epoch": 9.108962553713935, + "grad_norm": 0.12767069041728973, + "learning_rate": 2.069119130366537e-06, + "loss": 1.6747, + "step": 29677 + }, + { + "epoch": 9.10926949048496, + "grad_norm": 
0.14068815112113953, + "learning_rate": 2.0677042647607837e-06, + "loss": 1.6851, + "step": 29678 + }, + { + "epoch": 9.109576427255986, + "grad_norm": 0.13680805265903473, + "learning_rate": 2.066289872850069e-06, + "loss": 1.6913, + "step": 29679 + }, + { + "epoch": 9.109883364027011, + "grad_norm": 0.126765176653862, + "learning_rate": 2.064875954648371e-06, + "loss": 1.6624, + "step": 29680 + }, + { + "epoch": 9.110190300798035, + "grad_norm": 0.15233641862869263, + "learning_rate": 2.0634625101696615e-06, + "loss": 1.673, + "step": 29681 + }, + { + "epoch": 9.11049723756906, + "grad_norm": 0.165065735578537, + "learning_rate": 2.0620495394279182e-06, + "loss": 1.7752, + "step": 29682 + }, + { + "epoch": 9.110804174340085, + "grad_norm": 0.16008982062339783, + "learning_rate": 2.060637042437097e-06, + "loss": 1.7509, + "step": 29683 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 0.14805258810520172, + "learning_rate": 2.0592250192111585e-06, + "loss": 1.7481, + "step": 29684 + }, + { + "epoch": 9.111418047882136, + "grad_norm": 0.15095308423042297, + "learning_rate": 2.0578134697640585e-06, + "loss": 1.7003, + "step": 29685 + }, + { + "epoch": 9.111724984653161, + "grad_norm": 0.15223002433776855, + "learning_rate": 2.056402394109741e-06, + "loss": 1.6619, + "step": 29686 + }, + { + "epoch": 9.112031921424187, + "grad_norm": 0.21639372408390045, + "learning_rate": 2.0549917922621564e-06, + "loss": 1.6929, + "step": 29687 + }, + { + "epoch": 9.112338858195212, + "grad_norm": 0.17735840380191803, + "learning_rate": 2.053581664235249e-06, + "loss": 1.732, + "step": 29688 + }, + { + "epoch": 9.112645794966237, + "grad_norm": 0.14142274856567383, + "learning_rate": 2.052172010042941e-06, + "loss": 1.6888, + "step": 29689 + }, + { + "epoch": 9.112952731737263, + "grad_norm": 0.15996430814266205, + "learning_rate": 2.050762829699182e-06, + "loss": 1.724, + "step": 29690 + }, + { + "epoch": 9.113259668508288, + "grad_norm": 0.15198329091072083, + "learning_rate": 
2.0493541232178835e-06, + "loss": 1.7437, + "step": 29691 + }, + { + "epoch": 9.113566605279312, + "grad_norm": 0.15780989825725555, + "learning_rate": 2.047945890612979e-06, + "loss": 1.7422, + "step": 29692 + }, + { + "epoch": 9.113873542050337, + "grad_norm": 0.11914275586605072, + "learning_rate": 2.046538131898368e-06, + "loss": 1.6689, + "step": 29693 + }, + { + "epoch": 9.114180478821362, + "grad_norm": 0.1314009428024292, + "learning_rate": 2.0451308470879782e-06, + "loss": 1.6496, + "step": 29694 + }, + { + "epoch": 9.114487415592388, + "grad_norm": 0.12608365714550018, + "learning_rate": 2.0437240361957154e-06, + "loss": 1.6798, + "step": 29695 + }, + { + "epoch": 9.114794352363413, + "grad_norm": 0.1259870082139969, + "learning_rate": 2.0423176992354797e-06, + "loss": 1.6673, + "step": 29696 + }, + { + "epoch": 9.115101289134438, + "grad_norm": 0.13520261645317078, + "learning_rate": 2.0409118362211654e-06, + "loss": 1.682, + "step": 29697 + }, + { + "epoch": 9.115408225905464, + "grad_norm": 0.12047206610441208, + "learning_rate": 2.0395064471666727e-06, + "loss": 1.6831, + "step": 29698 + }, + { + "epoch": 9.115715162676489, + "grad_norm": 0.17553697526454926, + "learning_rate": 2.0381015320858896e-06, + "loss": 1.7069, + "step": 29699 + }, + { + "epoch": 9.116022099447514, + "grad_norm": 0.15869703888893127, + "learning_rate": 2.0366970909926952e-06, + "loss": 1.7271, + "step": 29700 + }, + { + "epoch": 9.11632903621854, + "grad_norm": 0.13005055487155914, + "learning_rate": 2.035293123900972e-06, + "loss": 1.7204, + "step": 29701 + }, + { + "epoch": 9.116635972989565, + "grad_norm": 0.16763219237327576, + "learning_rate": 2.033889630824598e-06, + "loss": 1.7708, + "step": 29702 + }, + { + "epoch": 9.116942909760589, + "grad_norm": 0.1338036209344864, + "learning_rate": 2.03248661177744e-06, + "loss": 1.6735, + "step": 29703 + }, + { + "epoch": 9.117249846531614, + "grad_norm": 0.1424967646598816, + "learning_rate": 2.0310840667733643e-06, + "loss": 
1.7344, + "step": 29704 + }, + { + "epoch": 9.11755678330264, + "grad_norm": 0.20512424409389496, + "learning_rate": 2.029681995826227e-06, + "loss": 1.7609, + "step": 29705 + }, + { + "epoch": 9.117863720073665, + "grad_norm": 0.12276902049779892, + "learning_rate": 2.028280398949889e-06, + "loss": 1.6959, + "step": 29706 + }, + { + "epoch": 9.11817065684469, + "grad_norm": 0.17198625206947327, + "learning_rate": 2.026879276158211e-06, + "loss": 1.7131, + "step": 29707 + }, + { + "epoch": 9.118477593615715, + "grad_norm": 0.1957482397556305, + "learning_rate": 2.025478627465016e-06, + "loss": 1.7182, + "step": 29708 + }, + { + "epoch": 9.11878453038674, + "grad_norm": 0.1316002756357193, + "learning_rate": 2.0240784528841707e-06, + "loss": 1.6694, + "step": 29709 + }, + { + "epoch": 9.119091467157766, + "grad_norm": 0.1468227654695511, + "learning_rate": 2.022678752429491e-06, + "loss": 1.7023, + "step": 29710 + }, + { + "epoch": 9.119398403928791, + "grad_norm": 0.14300107955932617, + "learning_rate": 2.0212795261148277e-06, + "loss": 1.693, + "step": 29711 + }, + { + "epoch": 9.119705340699817, + "grad_norm": 0.13721270859241486, + "learning_rate": 2.0198807739540026e-06, + "loss": 1.7132, + "step": 29712 + }, + { + "epoch": 9.120012277470842, + "grad_norm": 0.16772721707820892, + "learning_rate": 2.0184824959608386e-06, + "loss": 1.7564, + "step": 29713 + }, + { + "epoch": 9.120319214241865, + "grad_norm": 0.20897400379180908, + "learning_rate": 2.0170846921491516e-06, + "loss": 1.75, + "step": 29714 + }, + { + "epoch": 9.12062615101289, + "grad_norm": 0.11614206433296204, + "learning_rate": 2.0156873625327534e-06, + "loss": 1.7058, + "step": 29715 + }, + { + "epoch": 9.120933087783916, + "grad_norm": 0.11942286789417267, + "learning_rate": 2.0142905071254603e-06, + "loss": 1.6813, + "step": 29716 + }, + { + "epoch": 9.121240024554941, + "grad_norm": 0.13220490515232086, + "learning_rate": 2.0128941259410727e-06, + "loss": 1.6625, + "step": 29717 + }, + { + 
"epoch": 9.121546961325967, + "grad_norm": 0.11992443352937698, + "learning_rate": 2.0114982189933962e-06, + "loss": 1.7028, + "step": 29718 + }, + { + "epoch": 9.121853898096992, + "grad_norm": 0.1317398101091385, + "learning_rate": 2.010102786296214e-06, + "loss": 1.6832, + "step": 29719 + }, + { + "epoch": 9.122160834868017, + "grad_norm": 0.1504088193178177, + "learning_rate": 2.008707827863332e-06, + "loss": 1.6874, + "step": 29720 + }, + { + "epoch": 9.122467771639043, + "grad_norm": 0.138368159532547, + "learning_rate": 2.0073133437085224e-06, + "loss": 1.7256, + "step": 29721 + }, + { + "epoch": 9.122774708410068, + "grad_norm": 0.11481283605098724, + "learning_rate": 2.0059193338455683e-06, + "loss": 1.6599, + "step": 29722 + }, + { + "epoch": 9.123081645181093, + "grad_norm": 0.179039865732193, + "learning_rate": 2.004525798288248e-06, + "loss": 1.7319, + "step": 29723 + }, + { + "epoch": 9.123388581952117, + "grad_norm": 0.14884190261363983, + "learning_rate": 2.0031327370503506e-06, + "loss": 1.7074, + "step": 29724 + }, + { + "epoch": 9.123695518723142, + "grad_norm": 0.14200903475284576, + "learning_rate": 2.001740150145609e-06, + "loss": 1.6827, + "step": 29725 + }, + { + "epoch": 9.124002455494168, + "grad_norm": 0.12509983777999878, + "learning_rate": 2.0003480375878182e-06, + "loss": 1.6687, + "step": 29726 + }, + { + "epoch": 9.124309392265193, + "grad_norm": 0.1458035707473755, + "learning_rate": 1.998956399390711e-06, + "loss": 1.7075, + "step": 29727 + }, + { + "epoch": 9.124616329036218, + "grad_norm": 0.15756329894065857, + "learning_rate": 1.9975652355680554e-06, + "loss": 1.7024, + "step": 29728 + }, + { + "epoch": 9.124923265807244, + "grad_norm": 0.15760551393032074, + "learning_rate": 1.9961745461335947e-06, + "loss": 1.7479, + "step": 29729 + }, + { + "epoch": 9.125230202578269, + "grad_norm": 0.1852855086326599, + "learning_rate": 1.994784331101074e-06, + "loss": 1.7166, + "step": 29730 + }, + { + "epoch": 9.125537139349294, + 
"grad_norm": 0.14625653624534607, + "learning_rate": 1.993394590484232e-06, + "loss": 1.6654, + "step": 29731 + }, + { + "epoch": 9.12584407612032, + "grad_norm": 0.15028734505176544, + "learning_rate": 1.992005324296803e-06, + "loss": 1.6855, + "step": 29732 + }, + { + "epoch": 9.126151012891345, + "grad_norm": 0.15989474952220917, + "learning_rate": 1.990616532552514e-06, + "loss": 1.7315, + "step": 29733 + }, + { + "epoch": 9.12645794966237, + "grad_norm": 0.168121799826622, + "learning_rate": 1.9892282152650933e-06, + "loss": 1.7168, + "step": 29734 + }, + { + "epoch": 9.126764886433394, + "grad_norm": 0.15154367685317993, + "learning_rate": 1.9878403724482576e-06, + "loss": 1.7084, + "step": 29735 + }, + { + "epoch": 9.12707182320442, + "grad_norm": 0.18086697161197662, + "learning_rate": 1.9864530041157235e-06, + "loss": 1.78, + "step": 29736 + }, + { + "epoch": 9.127378759975445, + "grad_norm": 0.10857624560594559, + "learning_rate": 1.985066110281203e-06, + "loss": 1.635, + "step": 29737 + }, + { + "epoch": 9.12768569674647, + "grad_norm": 0.14876055717468262, + "learning_rate": 1.983679690958401e-06, + "loss": 1.6972, + "step": 29738 + }, + { + "epoch": 9.127992633517495, + "grad_norm": 0.144441157579422, + "learning_rate": 1.9822937461610235e-06, + "loss": 1.6871, + "step": 29739 + }, + { + "epoch": 9.12829957028852, + "grad_norm": 0.12115978449583054, + "learning_rate": 1.980908275902754e-06, + "loss": 1.6496, + "step": 29740 + }, + { + "epoch": 9.128606507059546, + "grad_norm": 0.11610052734613419, + "learning_rate": 1.979523280197304e-06, + "loss": 1.6429, + "step": 29741 + }, + { + "epoch": 9.128913443830571, + "grad_norm": 0.10996486991643906, + "learning_rate": 1.97813875905834e-06, + "loss": 1.6551, + "step": 29742 + }, + { + "epoch": 9.129220380601597, + "grad_norm": 0.1537560224533081, + "learning_rate": 1.9767547124995677e-06, + "loss": 1.6836, + "step": 29743 + }, + { + "epoch": 9.129527317372622, + "grad_norm": 0.11715234071016312, + 
"learning_rate": 1.975371140534643e-06, + "loss": 1.6968, + "step": 29744 + }, + { + "epoch": 9.129834254143647, + "grad_norm": 0.11332523077726364, + "learning_rate": 1.973988043177255e-06, + "loss": 1.6753, + "step": 29745 + }, + { + "epoch": 9.13014119091467, + "grad_norm": 0.1348869651556015, + "learning_rate": 1.9726054204410595e-06, + "loss": 1.7083, + "step": 29746 + }, + { + "epoch": 9.130448127685696, + "grad_norm": 0.15482230484485626, + "learning_rate": 1.971223272339734e-06, + "loss": 1.6969, + "step": 29747 + }, + { + "epoch": 9.130755064456721, + "grad_norm": 0.1418905109167099, + "learning_rate": 1.9698415988869346e-06, + "loss": 1.7373, + "step": 29748 + }, + { + "epoch": 9.131062001227747, + "grad_norm": 0.13672807812690735, + "learning_rate": 1.9684604000963002e-06, + "loss": 1.7197, + "step": 29749 + }, + { + "epoch": 9.131368937998772, + "grad_norm": 0.1771068125963211, + "learning_rate": 1.967079675981498e-06, + "loss": 1.7223, + "step": 29750 + }, + { + "epoch": 9.131675874769797, + "grad_norm": 0.1820068508386612, + "learning_rate": 1.965699426556167e-06, + "loss": 1.7198, + "step": 29751 + }, + { + "epoch": 9.131982811540823, + "grad_norm": 0.1300581693649292, + "learning_rate": 1.9643196518339457e-06, + "loss": 1.6856, + "step": 29752 + }, + { + "epoch": 9.132289748311848, + "grad_norm": 0.13473594188690186, + "learning_rate": 1.962940351828474e-06, + "loss": 1.6889, + "step": 29753 + }, + { + "epoch": 9.132596685082873, + "grad_norm": 0.170193150639534, + "learning_rate": 1.961561526553385e-06, + "loss": 1.7336, + "step": 29754 + }, + { + "epoch": 9.132903621853899, + "grad_norm": 0.14752201735973358, + "learning_rate": 1.9601831760222954e-06, + "loss": 1.7528, + "step": 29755 + }, + { + "epoch": 9.133210558624924, + "grad_norm": 0.18119682371616364, + "learning_rate": 1.9588053002488337e-06, + "loss": 1.7296, + "step": 29756 + }, + { + "epoch": 9.133517495395948, + "grad_norm": 0.1837453842163086, + "learning_rate": 1.957427899246611e-06, 
+ "loss": 1.6643, + "step": 29757 + }, + { + "epoch": 9.133824432166973, + "grad_norm": 0.18625833094120026, + "learning_rate": 1.956050973029261e-06, + "loss": 1.78, + "step": 29758 + }, + { + "epoch": 9.134131368937998, + "grad_norm": 0.15884144604206085, + "learning_rate": 1.9546745216103558e-06, + "loss": 1.7168, + "step": 29759 + }, + { + "epoch": 9.134438305709024, + "grad_norm": 0.13788993656635284, + "learning_rate": 1.953298545003535e-06, + "loss": 1.7016, + "step": 29760 + }, + { + "epoch": 9.134745242480049, + "grad_norm": 0.14895956218242645, + "learning_rate": 1.951923043222359e-06, + "loss": 1.6961, + "step": 29761 + }, + { + "epoch": 9.135052179251074, + "grad_norm": 0.1548876017332077, + "learning_rate": 1.9505480162804567e-06, + "loss": 1.7352, + "step": 29762 + }, + { + "epoch": 9.1353591160221, + "grad_norm": 0.14169646799564362, + "learning_rate": 1.949173464191395e-06, + "loss": 1.7123, + "step": 29763 + }, + { + "epoch": 9.135666052793125, + "grad_norm": 0.14068526029586792, + "learning_rate": 1.9477993869687684e-06, + "loss": 1.7013, + "step": 29764 + }, + { + "epoch": 9.13597298956415, + "grad_norm": 0.15116369724273682, + "learning_rate": 1.9464257846261548e-06, + "loss": 1.6831, + "step": 29765 + }, + { + "epoch": 9.136279926335176, + "grad_norm": 0.17049194872379303, + "learning_rate": 1.9450526571771154e-06, + "loss": 1.72, + "step": 29766 + }, + { + "epoch": 9.1365868631062, + "grad_norm": 0.1429831087589264, + "learning_rate": 1.94368000463524e-06, + "loss": 1.6903, + "step": 29767 + }, + { + "epoch": 9.136893799877225, + "grad_norm": 0.2263873964548111, + "learning_rate": 1.9423078270140838e-06, + "loss": 1.7919, + "step": 29768 + }, + { + "epoch": 9.13720073664825, + "grad_norm": 0.14157186448574066, + "learning_rate": 1.940936124327214e-06, + "loss": 1.7151, + "step": 29769 + }, + { + "epoch": 9.137507673419275, + "grad_norm": 0.19576019048690796, + "learning_rate": 1.939564896588175e-06, + "loss": 1.7046, + "step": 29770 + }, + { + 
"epoch": 9.1378146101903, + "grad_norm": 0.15183357894420624, + "learning_rate": 1.9381941438105288e-06, + "loss": 1.6889, + "step": 29771 + }, + { + "epoch": 9.138121546961326, + "grad_norm": 0.11827339977025986, + "learning_rate": 1.936823866007814e-06, + "loss": 1.6737, + "step": 29772 + }, + { + "epoch": 9.138428483732351, + "grad_norm": 0.14976255595684052, + "learning_rate": 1.935454063193581e-06, + "loss": 1.725, + "step": 29773 + }, + { + "epoch": 9.138735420503377, + "grad_norm": 0.18152090907096863, + "learning_rate": 1.934084735381353e-06, + "loss": 1.7727, + "step": 29774 + }, + { + "epoch": 9.139042357274402, + "grad_norm": 0.19068580865859985, + "learning_rate": 1.9327158825846848e-06, + "loss": 1.7291, + "step": 29775 + }, + { + "epoch": 9.139349294045427, + "grad_norm": 0.1304289698600769, + "learning_rate": 1.9313475048170827e-06, + "loss": 1.6755, + "step": 29776 + }, + { + "epoch": 9.139656230816453, + "grad_norm": 0.14543502032756805, + "learning_rate": 1.9299796020920857e-06, + "loss": 1.7131, + "step": 29777 + }, + { + "epoch": 9.139963167587476, + "grad_norm": 0.16456526517868042, + "learning_rate": 1.9286121744231946e-06, + "loss": 1.734, + "step": 29778 + }, + { + "epoch": 9.140270104358502, + "grad_norm": 0.20563676953315735, + "learning_rate": 1.9272452218239424e-06, + "loss": 1.711, + "step": 29779 + }, + { + "epoch": 9.140577041129527, + "grad_norm": 0.12036823481321335, + "learning_rate": 1.925878744307824e-06, + "loss": 1.648, + "step": 29780 + }, + { + "epoch": 9.140883977900552, + "grad_norm": 0.13387446105480194, + "learning_rate": 1.924512741888351e-06, + "loss": 1.6611, + "step": 29781 + }, + { + "epoch": 9.141190914671578, + "grad_norm": 0.14060257375240326, + "learning_rate": 1.92314721457903e-06, + "loss": 1.699, + "step": 29782 + }, + { + "epoch": 9.141497851442603, + "grad_norm": 0.1897846907377243, + "learning_rate": 1.921782162393332e-06, + "loss": 1.6847, + "step": 29783 + }, + { + "epoch": 9.141804788213628, + 
"grad_norm": 0.1610451191663742, + "learning_rate": 1.920417585344769e-06, + "loss": 1.7417, + "step": 29784 + }, + { + "epoch": 9.142111724984654, + "grad_norm": 0.16606128215789795, + "learning_rate": 1.91905348344682e-06, + "loss": 1.7026, + "step": 29785 + }, + { + "epoch": 9.142418661755679, + "grad_norm": 0.13305748999118805, + "learning_rate": 1.9176898567129675e-06, + "loss": 1.6664, + "step": 29786 + }, + { + "epoch": 9.142725598526704, + "grad_norm": 0.1632613241672516, + "learning_rate": 1.9163267051566845e-06, + "loss": 1.7245, + "step": 29787 + }, + { + "epoch": 9.14303253529773, + "grad_norm": 0.14564020931720734, + "learning_rate": 1.9149640287914437e-06, + "loss": 1.6775, + "step": 29788 + }, + { + "epoch": 9.143339472068753, + "grad_norm": 0.14515992999076843, + "learning_rate": 1.9136018276307123e-06, + "loss": 1.6254, + "step": 29789 + }, + { + "epoch": 9.143646408839778, + "grad_norm": 0.14360931515693665, + "learning_rate": 1.9122401016879455e-06, + "loss": 1.7002, + "step": 29790 + }, + { + "epoch": 9.143953345610804, + "grad_norm": 0.14381720125675201, + "learning_rate": 1.9108788509766107e-06, + "loss": 1.7165, + "step": 29791 + }, + { + "epoch": 9.144260282381829, + "grad_norm": 0.14533990621566772, + "learning_rate": 1.909518075510164e-06, + "loss": 1.7003, + "step": 29792 + }, + { + "epoch": 9.144567219152854, + "grad_norm": 0.17832323908805847, + "learning_rate": 1.908157775302033e-06, + "loss": 1.7542, + "step": 29793 + }, + { + "epoch": 9.14487415592388, + "grad_norm": 0.15718503296375275, + "learning_rate": 1.9067979503656907e-06, + "loss": 1.681, + "step": 29794 + }, + { + "epoch": 9.145181092694905, + "grad_norm": 0.14168475568294525, + "learning_rate": 1.905438600714543e-06, + "loss": 1.694, + "step": 29795 + }, + { + "epoch": 9.14548802946593, + "grad_norm": 0.16925476491451263, + "learning_rate": 1.9040797263620514e-06, + "loss": 1.7163, + "step": 29796 + }, + { + "epoch": 9.145794966236956, + "grad_norm": 0.14622162282466888, + 
"learning_rate": 1.902721327321627e-06, + "loss": 1.7218, + "step": 29797 + }, + { + "epoch": 9.146101903007981, + "grad_norm": 0.14731308817863464, + "learning_rate": 1.9013634036067096e-06, + "loss": 1.6846, + "step": 29798 + }, + { + "epoch": 9.146408839779005, + "grad_norm": 0.16700461506843567, + "learning_rate": 1.9000059552307103e-06, + "loss": 1.7606, + "step": 29799 + }, + { + "epoch": 9.14671577655003, + "grad_norm": 0.12352433800697327, + "learning_rate": 1.8986489822070353e-06, + "loss": 1.6729, + "step": 29800 + }, + { + "epoch": 9.147022713321055, + "grad_norm": 0.1929595023393631, + "learning_rate": 1.897292484549107e-06, + "loss": 1.7356, + "step": 29801 + }, + { + "epoch": 9.14732965009208, + "grad_norm": 0.14892888069152832, + "learning_rate": 1.8959364622703313e-06, + "loss": 1.7074, + "step": 29802 + }, + { + "epoch": 9.147636586863106, + "grad_norm": 0.13839972019195557, + "learning_rate": 1.8945809153841031e-06, + "loss": 1.7107, + "step": 29803 + }, + { + "epoch": 9.147943523634131, + "grad_norm": 0.14454330503940582, + "learning_rate": 1.893225843903823e-06, + "loss": 1.7082, + "step": 29804 + }, + { + "epoch": 9.148250460405157, + "grad_norm": 0.16385792195796967, + "learning_rate": 1.8918712478428857e-06, + "loss": 1.7244, + "step": 29805 + }, + { + "epoch": 9.148557397176182, + "grad_norm": 0.17219752073287964, + "learning_rate": 1.8905171272146694e-06, + "loss": 1.7198, + "step": 29806 + }, + { + "epoch": 9.148864333947207, + "grad_norm": 0.1170208603143692, + "learning_rate": 1.8891634820325633e-06, + "loss": 1.6718, + "step": 29807 + }, + { + "epoch": 9.149171270718233, + "grad_norm": 0.13478094339370728, + "learning_rate": 1.8878103123099345e-06, + "loss": 1.6929, + "step": 29808 + }, + { + "epoch": 9.149478207489258, + "grad_norm": 0.15352758765220642, + "learning_rate": 1.8864576180601722e-06, + "loss": 1.6868, + "step": 29809 + }, + { + "epoch": 9.149785144260282, + "grad_norm": 0.253282368183136, + "learning_rate": 
1.8851053992966273e-06, + "loss": 1.7221, + "step": 29810 + }, + { + "epoch": 9.150092081031307, + "grad_norm": 0.22674274444580078, + "learning_rate": 1.8837536560326829e-06, + "loss": 1.7743, + "step": 29811 + }, + { + "epoch": 9.150399017802332, + "grad_norm": 0.15920962393283844, + "learning_rate": 1.8824023882816788e-06, + "loss": 1.6974, + "step": 29812 + }, + { + "epoch": 9.150705954573358, + "grad_norm": 0.19905294477939606, + "learning_rate": 1.8810515960569819e-06, + "loss": 1.7343, + "step": 29813 + }, + { + "epoch": 9.151012891344383, + "grad_norm": 0.16976027190685272, + "learning_rate": 1.8797012793719316e-06, + "loss": 1.7161, + "step": 29814 + }, + { + "epoch": 9.151319828115408, + "grad_norm": 0.1899489164352417, + "learning_rate": 1.8783514382398837e-06, + "loss": 1.7435, + "step": 29815 + }, + { + "epoch": 9.151626764886434, + "grad_norm": 0.1622486710548401, + "learning_rate": 1.8770020726741722e-06, + "loss": 1.7077, + "step": 29816 + }, + { + "epoch": 9.151933701657459, + "grad_norm": 0.16020219027996063, + "learning_rate": 1.8756531826881197e-06, + "loss": 1.7145, + "step": 29817 + }, + { + "epoch": 9.152240638428484, + "grad_norm": 0.14600935578346252, + "learning_rate": 1.874304768295082e-06, + "loss": 1.6632, + "step": 29818 + }, + { + "epoch": 9.15254757519951, + "grad_norm": 0.15835213661193848, + "learning_rate": 1.8729568295083656e-06, + "loss": 1.7302, + "step": 29819 + }, + { + "epoch": 9.152854511970535, + "grad_norm": 0.1220908835530281, + "learning_rate": 1.871609366341298e-06, + "loss": 1.6872, + "step": 29820 + }, + { + "epoch": 9.153161448741558, + "grad_norm": 0.1540200263261795, + "learning_rate": 1.8702623788072027e-06, + "loss": 1.7784, + "step": 29821 + }, + { + "epoch": 9.153468385512584, + "grad_norm": 0.1889277845621109, + "learning_rate": 1.8689158669193795e-06, + "loss": 1.7541, + "step": 29822 + }, + { + "epoch": 9.15377532228361, + "grad_norm": 0.14910344779491425, + "learning_rate": 1.8675698306911348e-06, + 
"loss": 1.6919, + "step": 29823 + }, + { + "epoch": 9.154082259054634, + "grad_norm": 0.15983067452907562, + "learning_rate": 1.8662242701357857e-06, + "loss": 1.6909, + "step": 29824 + }, + { + "epoch": 9.15438919582566, + "grad_norm": 0.13017424941062927, + "learning_rate": 1.8648791852666103e-06, + "loss": 1.7112, + "step": 29825 + }, + { + "epoch": 9.154696132596685, + "grad_norm": 0.15777593851089478, + "learning_rate": 1.8635345760969204e-06, + "loss": 1.7162, + "step": 29826 + }, + { + "epoch": 9.15500306936771, + "grad_norm": 0.18125833570957184, + "learning_rate": 1.8621904426399882e-06, + "loss": 1.7825, + "step": 29827 + }, + { + "epoch": 9.155310006138736, + "grad_norm": 0.182005375623703, + "learning_rate": 1.8608467849091149e-06, + "loss": 1.7526, + "step": 29828 + }, + { + "epoch": 9.155616942909761, + "grad_norm": 0.11626452207565308, + "learning_rate": 1.8595036029175562e-06, + "loss": 1.6564, + "step": 29829 + }, + { + "epoch": 9.155923879680786, + "grad_norm": 0.15099090337753296, + "learning_rate": 1.8581608966786069e-06, + "loss": 1.6712, + "step": 29830 + }, + { + "epoch": 9.15623081645181, + "grad_norm": 0.16302652657032013, + "learning_rate": 1.8568186662055286e-06, + "loss": 1.699, + "step": 29831 + }, + { + "epoch": 9.156537753222835, + "grad_norm": 0.14438454806804657, + "learning_rate": 1.8554769115115834e-06, + "loss": 1.6718, + "step": 29832 + }, + { + "epoch": 9.15684468999386, + "grad_norm": 0.13968545198440552, + "learning_rate": 1.8541356326100433e-06, + "loss": 1.6846, + "step": 29833 + }, + { + "epoch": 9.157151626764886, + "grad_norm": 0.13068513572216034, + "learning_rate": 1.8527948295141372e-06, + "loss": 1.6761, + "step": 29834 + }, + { + "epoch": 9.157458563535911, + "grad_norm": 0.14249193668365479, + "learning_rate": 1.8514545022371433e-06, + "loss": 1.6798, + "step": 29835 + }, + { + "epoch": 9.157765500306937, + "grad_norm": 0.1251843273639679, + "learning_rate": 1.850114650792295e-06, + "loss": 1.6956, + "step": 29836 
+ }, + { + "epoch": 9.158072437077962, + "grad_norm": 0.1275821328163147, + "learning_rate": 1.8487752751928323e-06, + "loss": 1.6795, + "step": 29837 + }, + { + "epoch": 9.158379373848987, + "grad_norm": 0.21461836993694305, + "learning_rate": 1.8474363754519997e-06, + "loss": 1.7701, + "step": 29838 + }, + { + "epoch": 9.158686310620013, + "grad_norm": 0.11155051738023758, + "learning_rate": 1.84609795158302e-06, + "loss": 1.6207, + "step": 29839 + }, + { + "epoch": 9.158993247391038, + "grad_norm": 0.10695862770080566, + "learning_rate": 1.8447600035991275e-06, + "loss": 1.6756, + "step": 29840 + }, + { + "epoch": 9.159300184162063, + "grad_norm": 0.13841044902801514, + "learning_rate": 1.843422531513539e-06, + "loss": 1.7111, + "step": 29841 + }, + { + "epoch": 9.159607120933087, + "grad_norm": 0.14116619527339935, + "learning_rate": 1.8420855353394718e-06, + "loss": 1.653, + "step": 29842 + }, + { + "epoch": 9.159914057704112, + "grad_norm": 0.16781140863895416, + "learning_rate": 1.8407490150901485e-06, + "loss": 1.7357, + "step": 29843 + }, + { + "epoch": 9.160220994475138, + "grad_norm": 0.21595926582813263, + "learning_rate": 1.8394129707787589e-06, + "loss": 1.7183, + "step": 29844 + }, + { + "epoch": 9.160527931246163, + "grad_norm": 0.14073456823825836, + "learning_rate": 1.838077402418531e-06, + "loss": 1.6756, + "step": 29845 + }, + { + "epoch": 9.160834868017188, + "grad_norm": 0.15962691605091095, + "learning_rate": 1.8367423100226377e-06, + "loss": 1.7247, + "step": 29846 + }, + { + "epoch": 9.161141804788214, + "grad_norm": 0.17450691759586334, + "learning_rate": 1.8354076936043018e-06, + "loss": 1.7286, + "step": 29847 + }, + { + "epoch": 9.161448741559239, + "grad_norm": 0.13126425445079803, + "learning_rate": 1.834073553176685e-06, + "loss": 1.6559, + "step": 29848 + }, + { + "epoch": 9.161755678330264, + "grad_norm": 0.14806927740573883, + "learning_rate": 1.8327398887529878e-06, + "loss": 1.7202, + "step": 29849 + }, + { + "epoch": 
9.16206261510129, + "grad_norm": 0.17844204604625702, + "learning_rate": 1.8314067003463942e-06, + "loss": 1.6669, + "step": 29850 + }, + { + "epoch": 9.162369551872315, + "grad_norm": 0.14012929797172546, + "learning_rate": 1.83007398797006e-06, + "loss": 1.6912, + "step": 29851 + }, + { + "epoch": 9.16267648864334, + "grad_norm": 0.1496121734380722, + "learning_rate": 1.8287417516371751e-06, + "loss": 1.7157, + "step": 29852 + }, + { + "epoch": 9.162983425414364, + "grad_norm": 0.1662236452102661, + "learning_rate": 1.8274099913608955e-06, + "loss": 1.6586, + "step": 29853 + }, + { + "epoch": 9.16329036218539, + "grad_norm": 0.14133767783641815, + "learning_rate": 1.8260787071543884e-06, + "loss": 1.7107, + "step": 29854 + }, + { + "epoch": 9.163597298956415, + "grad_norm": 0.2213003784418106, + "learning_rate": 1.8247478990308041e-06, + "loss": 1.6686, + "step": 29855 + }, + { + "epoch": 9.16390423572744, + "grad_norm": 0.14967088401317596, + "learning_rate": 1.8234175670032993e-06, + "loss": 1.7537, + "step": 29856 + }, + { + "epoch": 9.164211172498465, + "grad_norm": 0.1617511510848999, + "learning_rate": 1.8220877110850187e-06, + "loss": 1.7581, + "step": 29857 + }, + { + "epoch": 9.16451810926949, + "grad_norm": 0.15769065916538239, + "learning_rate": 1.8207583312891075e-06, + "loss": 1.7908, + "step": 29858 + }, + { + "epoch": 9.164825046040516, + "grad_norm": 0.14482183754444122, + "learning_rate": 1.8194294276286994e-06, + "loss": 1.6987, + "step": 29859 + }, + { + "epoch": 9.165131982811541, + "grad_norm": 0.19343525171279907, + "learning_rate": 1.8181010001169285e-06, + "loss": 1.7083, + "step": 29860 + }, + { + "epoch": 9.165438919582567, + "grad_norm": 0.16402462124824524, + "learning_rate": 1.8167730487669176e-06, + "loss": 1.7532, + "step": 29861 + }, + { + "epoch": 9.165745856353592, + "grad_norm": 0.13917924463748932, + "learning_rate": 1.8154455735918118e-06, + "loss": 1.7184, + "step": 29862 + }, + { + "epoch": 9.166052793124617, + "grad_norm": 
0.12260928750038147, + "learning_rate": 1.8141185746047006e-06, + "loss": 1.6783, + "step": 29863 + }, + { + "epoch": 9.16635972989564, + "grad_norm": 0.1644967645406723, + "learning_rate": 1.8127920518187235e-06, + "loss": 1.7101, + "step": 29864 + }, + { + "epoch": 9.166666666666666, + "grad_norm": 0.14527414739131927, + "learning_rate": 1.8114660052469645e-06, + "loss": 1.7104, + "step": 29865 + }, + { + "epoch": 9.166973603437691, + "grad_norm": 0.10901240259408951, + "learning_rate": 1.810140434902552e-06, + "loss": 1.6169, + "step": 29866 + }, + { + "epoch": 9.167280540208717, + "grad_norm": 0.135842964053154, + "learning_rate": 1.8088153407985809e-06, + "loss": 1.7481, + "step": 29867 + }, + { + "epoch": 9.167587476979742, + "grad_norm": 0.15822531282901764, + "learning_rate": 1.8074907229481298e-06, + "loss": 1.744, + "step": 29868 + }, + { + "epoch": 9.167894413750767, + "grad_norm": 0.1288236677646637, + "learning_rate": 1.8061665813643158e-06, + "loss": 1.7064, + "step": 29869 + }, + { + "epoch": 9.168201350521793, + "grad_norm": 0.15224573016166687, + "learning_rate": 1.8048429160602009e-06, + "loss": 1.7302, + "step": 29870 + }, + { + "epoch": 9.168508287292818, + "grad_norm": 0.16641436517238617, + "learning_rate": 1.8035197270488802e-06, + "loss": 1.7236, + "step": 29871 + }, + { + "epoch": 9.168815224063843, + "grad_norm": 0.15526805818080902, + "learning_rate": 1.8021970143434264e-06, + "loss": 1.7224, + "step": 29872 + }, + { + "epoch": 9.169122160834869, + "grad_norm": 0.15454156696796417, + "learning_rate": 1.8008747779569125e-06, + "loss": 1.721, + "step": 29873 + }, + { + "epoch": 9.169429097605892, + "grad_norm": 0.20796442031860352, + "learning_rate": 1.7995530179024001e-06, + "loss": 1.775, + "step": 29874 + }, + { + "epoch": 9.169736034376918, + "grad_norm": 0.186804860830307, + "learning_rate": 1.7982317341929623e-06, + "loss": 1.7164, + "step": 29875 + }, + { + "epoch": 9.170042971147943, + "grad_norm": 0.16180850565433502, + 
"learning_rate": 1.796910926841644e-06, + "loss": 1.7413, + "step": 29876 + }, + { + "epoch": 9.170349907918968, + "grad_norm": 0.15675058960914612, + "learning_rate": 1.7955905958615071e-06, + "loss": 1.7242, + "step": 29877 + }, + { + "epoch": 9.170656844689994, + "grad_norm": 0.13031265139579773, + "learning_rate": 1.794270741265597e-06, + "loss": 1.7362, + "step": 29878 + }, + { + "epoch": 9.170963781461019, + "grad_norm": 0.16068242490291595, + "learning_rate": 1.7929513630669636e-06, + "loss": 1.7262, + "step": 29879 + }, + { + "epoch": 9.171270718232044, + "grad_norm": 0.11941052973270416, + "learning_rate": 1.791632461278625e-06, + "loss": 1.6661, + "step": 29880 + }, + { + "epoch": 9.17157765500307, + "grad_norm": 0.1486428678035736, + "learning_rate": 1.7903140359136483e-06, + "loss": 1.7032, + "step": 29881 + }, + { + "epoch": 9.171884591774095, + "grad_norm": 0.1541515737771988, + "learning_rate": 1.7889960869850342e-06, + "loss": 1.7176, + "step": 29882 + }, + { + "epoch": 9.17219152854512, + "grad_norm": 0.17397575080394745, + "learning_rate": 1.7876786145058167e-06, + "loss": 1.7475, + "step": 29883 + }, + { + "epoch": 9.172498465316146, + "grad_norm": 0.15537402033805847, + "learning_rate": 1.7863616184890297e-06, + "loss": 1.6786, + "step": 29884 + }, + { + "epoch": 9.17280540208717, + "grad_norm": 0.20951804518699646, + "learning_rate": 1.785045098947663e-06, + "loss": 1.8126, + "step": 29885 + }, + { + "epoch": 9.173112338858195, + "grad_norm": 0.1401960551738739, + "learning_rate": 1.7837290558947506e-06, + "loss": 1.7059, + "step": 29886 + }, + { + "epoch": 9.17341927562922, + "grad_norm": 0.13450580835342407, + "learning_rate": 1.7824134893432764e-06, + "loss": 1.6921, + "step": 29887 + }, + { + "epoch": 9.173726212400245, + "grad_norm": 0.12671135365962982, + "learning_rate": 1.7810983993062579e-06, + "loss": 1.7248, + "step": 29888 + }, + { + "epoch": 9.17403314917127, + "grad_norm": 0.13940559327602386, + "learning_rate": 
1.7797837857966904e-06, + "loss": 1.6749, + "step": 29889 + }, + { + "epoch": 9.174340085942296, + "grad_norm": 0.13822492957115173, + "learning_rate": 1.7784696488275576e-06, + "loss": 1.6912, + "step": 29890 + }, + { + "epoch": 9.174647022713321, + "grad_norm": 0.1438322365283966, + "learning_rate": 1.7771559884118549e-06, + "loss": 1.6909, + "step": 29891 + }, + { + "epoch": 9.174953959484347, + "grad_norm": 0.13645079731941223, + "learning_rate": 1.7758428045625608e-06, + "loss": 1.6932, + "step": 29892 + }, + { + "epoch": 9.175260896255372, + "grad_norm": 0.16978910565376282, + "learning_rate": 1.7745300972926538e-06, + "loss": 1.7474, + "step": 29893 + }, + { + "epoch": 9.175567833026397, + "grad_norm": 0.1612422913312912, + "learning_rate": 1.7732178666151067e-06, + "loss": 1.7152, + "step": 29894 + }, + { + "epoch": 9.175874769797423, + "grad_norm": 0.20364105701446533, + "learning_rate": 1.7719061125428815e-06, + "loss": 1.8032, + "step": 29895 + }, + { + "epoch": 9.176181706568446, + "grad_norm": 0.1400647908449173, + "learning_rate": 1.7705948350889567e-06, + "loss": 1.6878, + "step": 29896 + }, + { + "epoch": 9.176488643339471, + "grad_norm": 0.17033728957176208, + "learning_rate": 1.769284034266272e-06, + "loss": 1.7088, + "step": 29897 + }, + { + "epoch": 9.176795580110497, + "grad_norm": 0.1421220600605011, + "learning_rate": 1.7679737100878002e-06, + "loss": 1.6616, + "step": 29898 + }, + { + "epoch": 9.177102516881522, + "grad_norm": 0.16700543463230133, + "learning_rate": 1.76666386256647e-06, + "loss": 1.6915, + "step": 29899 + }, + { + "epoch": 9.177409453652547, + "grad_norm": 0.11176354438066483, + "learning_rate": 1.7653544917152487e-06, + "loss": 1.6733, + "step": 29900 + }, + { + "epoch": 9.177716390423573, + "grad_norm": 0.1324780434370041, + "learning_rate": 1.7640455975470648e-06, + "loss": 1.7397, + "step": 29901 + }, + { + "epoch": 9.178023327194598, + "grad_norm": 0.19537372887134552, + "learning_rate": 1.762737180074847e-06, + 
"loss": 1.7479, + "step": 29902 + }, + { + "epoch": 9.178330263965623, + "grad_norm": 0.1455310732126236, + "learning_rate": 1.7614292393115462e-06, + "loss": 1.6603, + "step": 29903 + }, + { + "epoch": 9.178637200736649, + "grad_norm": 0.15979693830013275, + "learning_rate": 1.7601217752700627e-06, + "loss": 1.7247, + "step": 29904 + }, + { + "epoch": 9.178944137507674, + "grad_norm": 0.1877484917640686, + "learning_rate": 1.7588147879633365e-06, + "loss": 1.6787, + "step": 29905 + }, + { + "epoch": 9.1792510742787, + "grad_norm": 0.1619114726781845, + "learning_rate": 1.757508277404274e-06, + "loss": 1.7523, + "step": 29906 + }, + { + "epoch": 9.179558011049723, + "grad_norm": 0.19995933771133423, + "learning_rate": 1.7562022436057922e-06, + "loss": 1.7835, + "step": 29907 + }, + { + "epoch": 9.179864947820748, + "grad_norm": 0.17540034651756287, + "learning_rate": 1.7548966865807982e-06, + "loss": 1.7116, + "step": 29908 + }, + { + "epoch": 9.180171884591774, + "grad_norm": 0.1773085743188858, + "learning_rate": 1.753591606342192e-06, + "loss": 1.7527, + "step": 29909 + }, + { + "epoch": 9.180478821362799, + "grad_norm": 0.18704703450202942, + "learning_rate": 1.7522870029028694e-06, + "loss": 1.7245, + "step": 29910 + }, + { + "epoch": 9.180785758133824, + "grad_norm": 0.12332191318273544, + "learning_rate": 1.7509828762757253e-06, + "loss": 1.6869, + "step": 29911 + }, + { + "epoch": 9.18109269490485, + "grad_norm": 0.16095921397209167, + "learning_rate": 1.7496792264736439e-06, + "loss": 1.7862, + "step": 29912 + }, + { + "epoch": 9.181399631675875, + "grad_norm": 0.1321704238653183, + "learning_rate": 1.7483760535095262e-06, + "loss": 1.6952, + "step": 29913 + }, + { + "epoch": 9.1817065684469, + "grad_norm": 0.14660334587097168, + "learning_rate": 1.7470733573962227e-06, + "loss": 1.7295, + "step": 29914 + }, + { + "epoch": 9.182013505217926, + "grad_norm": 0.18334107100963593, + "learning_rate": 1.7457711381466345e-06, + "loss": 1.7875, + "step": 29915 + 
}, + { + "epoch": 9.182320441988951, + "grad_norm": 0.13693606853485107, + "learning_rate": 1.7444693957736069e-06, + "loss": 1.6882, + "step": 29916 + }, + { + "epoch": 9.182627378759975, + "grad_norm": 0.1939692199230194, + "learning_rate": 1.7431681302900238e-06, + "loss": 1.7296, + "step": 29917 + }, + { + "epoch": 9.182934315531, + "grad_norm": 0.219837948679924, + "learning_rate": 1.7418673417087417e-06, + "loss": 1.7595, + "step": 29918 + }, + { + "epoch": 9.183241252302025, + "grad_norm": 0.1344659924507141, + "learning_rate": 1.7405670300426002e-06, + "loss": 1.707, + "step": 29919 + }, + { + "epoch": 9.18354818907305, + "grad_norm": 0.1565396636724472, + "learning_rate": 1.7392671953044725e-06, + "loss": 1.7312, + "step": 29920 + }, + { + "epoch": 9.183855125844076, + "grad_norm": 0.1617916077375412, + "learning_rate": 1.7379678375071818e-06, + "loss": 1.6859, + "step": 29921 + }, + { + "epoch": 9.184162062615101, + "grad_norm": 0.26025474071502686, + "learning_rate": 1.7366689566635841e-06, + "loss": 1.6916, + "step": 29922 + }, + { + "epoch": 9.184468999386127, + "grad_norm": 0.10923932492733002, + "learning_rate": 1.7353705527865138e-06, + "loss": 1.6574, + "step": 29923 + }, + { + "epoch": 9.184775936157152, + "grad_norm": 0.13846524059772491, + "learning_rate": 1.7340726258887997e-06, + "loss": 1.7057, + "step": 29924 + }, + { + "epoch": 9.185082872928177, + "grad_norm": 0.16603818535804749, + "learning_rate": 1.73277517598327e-06, + "loss": 1.6955, + "step": 29925 + }, + { + "epoch": 9.185389809699203, + "grad_norm": 0.14902694523334503, + "learning_rate": 1.731478203082748e-06, + "loss": 1.6999, + "step": 29926 + }, + { + "epoch": 9.185696746470228, + "grad_norm": 0.12260756641626358, + "learning_rate": 1.7301817072000459e-06, + "loss": 1.7097, + "step": 29927 + }, + { + "epoch": 9.186003683241251, + "grad_norm": 0.1545649915933609, + "learning_rate": 1.7288856883479809e-06, + "loss": 1.6913, + "step": 29928 + }, + { + "epoch": 9.186310620012277, + 
"grad_norm": 0.1564372181892395, + "learning_rate": 1.7275901465393595e-06, + "loss": 1.7428, + "step": 29929 + }, + { + "epoch": 9.186617556783302, + "grad_norm": 0.14948883652687073, + "learning_rate": 1.726295081786994e-06, + "loss": 1.6928, + "step": 29930 + }, + { + "epoch": 9.186924493554327, + "grad_norm": 0.19552940130233765, + "learning_rate": 1.7250004941036568e-06, + "loss": 1.7277, + "step": 29931 + }, + { + "epoch": 9.187231430325353, + "grad_norm": 0.13902166485786438, + "learning_rate": 1.7237063835021771e-06, + "loss": 1.7208, + "step": 29932 + }, + { + "epoch": 9.187538367096378, + "grad_norm": 0.13597513735294342, + "learning_rate": 1.7224127499953169e-06, + "loss": 1.712, + "step": 29933 + }, + { + "epoch": 9.187845303867404, + "grad_norm": 0.14096584916114807, + "learning_rate": 1.7211195935958713e-06, + "loss": 1.6927, + "step": 29934 + }, + { + "epoch": 9.188152240638429, + "grad_norm": 0.1446818709373474, + "learning_rate": 1.71982691431663e-06, + "loss": 1.6898, + "step": 29935 + }, + { + "epoch": 9.188459177409454, + "grad_norm": 0.12654201686382294, + "learning_rate": 1.7185347121703388e-06, + "loss": 1.675, + "step": 29936 + }, + { + "epoch": 9.18876611418048, + "grad_norm": 0.18681016564369202, + "learning_rate": 1.7172429871698037e-06, + "loss": 1.7206, + "step": 29937 + }, + { + "epoch": 9.189073050951505, + "grad_norm": 0.10353434830904007, + "learning_rate": 1.715951739327759e-06, + "loss": 1.6492, + "step": 29938 + }, + { + "epoch": 9.189379987722528, + "grad_norm": 0.16447822749614716, + "learning_rate": 1.7146609686569837e-06, + "loss": 1.7189, + "step": 29939 + }, + { + "epoch": 9.189686924493554, + "grad_norm": 0.159690260887146, + "learning_rate": 1.713370675170234e-06, + "loss": 1.7335, + "step": 29940 + }, + { + "epoch": 9.189993861264579, + "grad_norm": 0.17329075932502747, + "learning_rate": 1.7120808588802495e-06, + "loss": 1.7113, + "step": 29941 + }, + { + "epoch": 9.190300798035604, + "grad_norm": 0.12317316979169846, + 
"learning_rate": 1.7107915197997925e-06, + "loss": 1.7149, + "step": 29942 + }, + { + "epoch": 9.19060773480663, + "grad_norm": 0.2204972505569458, + "learning_rate": 1.7095026579415918e-06, + "loss": 1.7845, + "step": 29943 + }, + { + "epoch": 9.190914671577655, + "grad_norm": 0.13796095550060272, + "learning_rate": 1.7082142733183925e-06, + "loss": 1.7121, + "step": 29944 + }, + { + "epoch": 9.19122160834868, + "grad_norm": 0.14287333190441132, + "learning_rate": 1.7069263659429236e-06, + "loss": 1.7026, + "step": 29945 + }, + { + "epoch": 9.191528545119706, + "grad_norm": 0.19072957336902618, + "learning_rate": 1.705638935827908e-06, + "loss": 1.7604, + "step": 29946 + }, + { + "epoch": 9.191835481890731, + "grad_norm": 0.19318242371082306, + "learning_rate": 1.7043519829860855e-06, + "loss": 1.7107, + "step": 29947 + }, + { + "epoch": 9.192142418661756, + "grad_norm": 0.1858752965927124, + "learning_rate": 1.7030655074301517e-06, + "loss": 1.7408, + "step": 29948 + }, + { + "epoch": 9.192449355432782, + "grad_norm": 0.17308852076530457, + "learning_rate": 1.701779509172846e-06, + "loss": 1.6848, + "step": 29949 + }, + { + "epoch": 9.192756292203805, + "grad_norm": 0.12158332020044327, + "learning_rate": 1.7004939882268478e-06, + "loss": 1.6964, + "step": 29950 + }, + { + "epoch": 9.19306322897483, + "grad_norm": 0.12801475822925568, + "learning_rate": 1.6992089446048908e-06, + "loss": 1.6643, + "step": 29951 + }, + { + "epoch": 9.193370165745856, + "grad_norm": 0.13018257915973663, + "learning_rate": 1.6979243783196596e-06, + "loss": 1.6741, + "step": 29952 + }, + { + "epoch": 9.193677102516881, + "grad_norm": 0.1402437686920166, + "learning_rate": 1.696640289383844e-06, + "loss": 1.737, + "step": 29953 + }, + { + "epoch": 9.193984039287907, + "grad_norm": 0.15448710322380066, + "learning_rate": 1.6953566778101448e-06, + "loss": 1.7147, + "step": 29954 + }, + { + "epoch": 9.194290976058932, + "grad_norm": 0.19089701771736145, + "learning_rate": 
1.6940735436112409e-06, + "loss": 1.7047, + "step": 29955 + }, + { + "epoch": 9.194597912829957, + "grad_norm": 0.13311919569969177, + "learning_rate": 1.692790886799811e-06, + "loss": 1.6698, + "step": 29956 + }, + { + "epoch": 9.194904849600983, + "grad_norm": 0.14337676763534546, + "learning_rate": 1.691508707388545e-06, + "loss": 1.7124, + "step": 29957 + }, + { + "epoch": 9.195211786372008, + "grad_norm": 0.15666979551315308, + "learning_rate": 1.6902270053900993e-06, + "loss": 1.6884, + "step": 29958 + }, + { + "epoch": 9.195518723143033, + "grad_norm": 0.15445134043693542, + "learning_rate": 1.6889457808171472e-06, + "loss": 1.7395, + "step": 29959 + }, + { + "epoch": 9.195825659914057, + "grad_norm": 0.1683775633573532, + "learning_rate": 1.6876650336823452e-06, + "loss": 1.7808, + "step": 29960 + }, + { + "epoch": 9.196132596685082, + "grad_norm": 0.2521384060382843, + "learning_rate": 1.686384763998361e-06, + "loss": 1.7684, + "step": 29961 + }, + { + "epoch": 9.196439533456108, + "grad_norm": 0.15807218849658966, + "learning_rate": 1.6851049717778345e-06, + "loss": 1.7253, + "step": 29962 + }, + { + "epoch": 9.196746470227133, + "grad_norm": 0.18106147646903992, + "learning_rate": 1.683825657033411e-06, + "loss": 1.773, + "step": 29963 + }, + { + "epoch": 9.197053406998158, + "grad_norm": 0.14914186298847198, + "learning_rate": 1.6825468197777582e-06, + "loss": 1.6628, + "step": 29964 + }, + { + "epoch": 9.197360343769184, + "grad_norm": 0.12124781310558319, + "learning_rate": 1.681268460023483e-06, + "loss": 1.6634, + "step": 29965 + }, + { + "epoch": 9.197667280540209, + "grad_norm": 0.15450555086135864, + "learning_rate": 1.679990577783247e-06, + "loss": 1.741, + "step": 29966 + }, + { + "epoch": 9.197974217311234, + "grad_norm": 0.21389459073543549, + "learning_rate": 1.678713173069657e-06, + "loss": 1.7145, + "step": 29967 + }, + { + "epoch": 9.19828115408226, + "grad_norm": 0.1850728541612625, + "learning_rate": 1.6774362458953474e-06, + "loss": 
1.7674, + "step": 29968 + }, + { + "epoch": 9.198588090853285, + "grad_norm": 0.160726860165596, + "learning_rate": 1.6761597962729413e-06, + "loss": 1.7598, + "step": 29969 + }, + { + "epoch": 9.19889502762431, + "grad_norm": 0.15501825511455536, + "learning_rate": 1.6748838242150344e-06, + "loss": 1.7443, + "step": 29970 + }, + { + "epoch": 9.199201964395334, + "grad_norm": 0.17127695679664612, + "learning_rate": 1.6736083297342609e-06, + "loss": 1.7289, + "step": 29971 + }, + { + "epoch": 9.199508901166359, + "grad_norm": 0.13027416169643402, + "learning_rate": 1.672333312843205e-06, + "loss": 1.6673, + "step": 29972 + }, + { + "epoch": 9.199815837937384, + "grad_norm": 0.16939190030097961, + "learning_rate": 1.6710587735544847e-06, + "loss": 1.7582, + "step": 29973 + }, + { + "epoch": 9.20012277470841, + "grad_norm": 0.19931311905384064, + "learning_rate": 1.6697847118806898e-06, + "loss": 1.7894, + "step": 29974 + }, + { + "epoch": 9.200429711479435, + "grad_norm": 0.16785076260566711, + "learning_rate": 1.6685111278344045e-06, + "loss": 1.7051, + "step": 29975 + }, + { + "epoch": 9.20073664825046, + "grad_norm": 0.18373487889766693, + "learning_rate": 1.667238021428219e-06, + "loss": 1.7078, + "step": 29976 + }, + { + "epoch": 9.201043585021486, + "grad_norm": 0.1502874493598938, + "learning_rate": 1.6659653926747232e-06, + "loss": 1.702, + "step": 29977 + }, + { + "epoch": 9.201350521792511, + "grad_norm": 0.17113728821277618, + "learning_rate": 1.6646932415864791e-06, + "loss": 1.6796, + "step": 29978 + }, + { + "epoch": 9.201657458563536, + "grad_norm": 0.14872509241104126, + "learning_rate": 1.6634215681760712e-06, + "loss": 1.6883, + "step": 29979 + }, + { + "epoch": 9.201964395334562, + "grad_norm": 0.14375372231006622, + "learning_rate": 1.662150372456056e-06, + "loss": 1.6833, + "step": 29980 + }, + { + "epoch": 9.202271332105587, + "grad_norm": 0.20072759687900543, + "learning_rate": 1.6608796544390127e-06, + "loss": 1.7408, + "step": 29981 + }, + { 
+ "epoch": 9.20257826887661, + "grad_norm": 0.14475533366203308, + "learning_rate": 1.6596094141374807e-06, + "loss": 1.7138, + "step": 29982 + }, + { + "epoch": 9.202885205647636, + "grad_norm": 0.16516630351543427, + "learning_rate": 1.6583396515640338e-06, + "loss": 1.7765, + "step": 29983 + }, + { + "epoch": 9.203192142418661, + "grad_norm": 0.1530120074748993, + "learning_rate": 1.6570703667311894e-06, + "loss": 1.7047, + "step": 29984 + }, + { + "epoch": 9.203499079189687, + "grad_norm": 0.14001020789146423, + "learning_rate": 1.655801559651521e-06, + "loss": 1.691, + "step": 29985 + }, + { + "epoch": 9.203806015960712, + "grad_norm": 0.15876981616020203, + "learning_rate": 1.6545332303375626e-06, + "loss": 1.7238, + "step": 29986 + }, + { + "epoch": 9.204112952731737, + "grad_norm": 0.1669185608625412, + "learning_rate": 1.6532653788018326e-06, + "loss": 1.7345, + "step": 29987 + }, + { + "epoch": 9.204419889502763, + "grad_norm": 0.12812626361846924, + "learning_rate": 1.6519980050568817e-06, + "loss": 1.6792, + "step": 29988 + }, + { + "epoch": 9.204726826273788, + "grad_norm": 0.1336258500814438, + "learning_rate": 1.6507311091152166e-06, + "loss": 1.688, + "step": 29989 + }, + { + "epoch": 9.205033763044813, + "grad_norm": 0.18334448337554932, + "learning_rate": 1.6494646909893663e-06, + "loss": 1.745, + "step": 29990 + }, + { + "epoch": 9.205340699815839, + "grad_norm": 0.1458664983510971, + "learning_rate": 1.6481987506918428e-06, + "loss": 1.6967, + "step": 29991 + }, + { + "epoch": 9.205647636586862, + "grad_norm": 0.13565613329410553, + "learning_rate": 1.646933288235164e-06, + "loss": 1.6649, + "step": 29992 + }, + { + "epoch": 9.205954573357888, + "grad_norm": 0.1161680594086647, + "learning_rate": 1.6456683036318255e-06, + "loss": 1.6838, + "step": 29993 + }, + { + "epoch": 9.206261510128913, + "grad_norm": 0.1749819964170456, + "learning_rate": 1.6444037968943394e-06, + "loss": 1.7567, + "step": 29994 + }, + { + "epoch": 9.206568446899938, + 
"grad_norm": 0.1397893726825714, + "learning_rate": 1.6431397680351957e-06, + "loss": 1.7191, + "step": 29995 + }, + { + "epoch": 9.206875383670964, + "grad_norm": 0.13551786541938782, + "learning_rate": 1.64187621706689e-06, + "loss": 1.6938, + "step": 29996 + }, + { + "epoch": 9.207182320441989, + "grad_norm": 0.13458238542079926, + "learning_rate": 1.6406131440019012e-06, + "loss": 1.6701, + "step": 29997 + }, + { + "epoch": 9.207489257213014, + "grad_norm": 0.14004193246364594, + "learning_rate": 1.6393505488527194e-06, + "loss": 1.6758, + "step": 29998 + }, + { + "epoch": 9.20779619398404, + "grad_norm": 0.1691395789384842, + "learning_rate": 1.6380884316318179e-06, + "loss": 1.704, + "step": 29999 + }, + { + "epoch": 9.208103130755065, + "grad_norm": 0.13417977094650269, + "learning_rate": 1.636826792351681e-06, + "loss": 1.7207, + "step": 30000 + }, + { + "epoch": 9.20841006752609, + "grad_norm": 0.12645697593688965, + "learning_rate": 1.6355656310247658e-06, + "loss": 1.6594, + "step": 30001 + }, + { + "epoch": 9.208717004297116, + "grad_norm": 0.17769555747509003, + "learning_rate": 1.634304947663534e-06, + "loss": 1.7316, + "step": 30002 + }, + { + "epoch": 9.20902394106814, + "grad_norm": 0.12273482233285904, + "learning_rate": 1.633044742280454e-06, + "loss": 1.6724, + "step": 30003 + }, + { + "epoch": 9.209330877839164, + "grad_norm": 0.15213249623775482, + "learning_rate": 1.6317850148879654e-06, + "loss": 1.7555, + "step": 30004 + }, + { + "epoch": 9.20963781461019, + "grad_norm": 0.22034598886966705, + "learning_rate": 1.6305257654985361e-06, + "loss": 1.7395, + "step": 30005 + }, + { + "epoch": 9.209944751381215, + "grad_norm": 0.1581713706254959, + "learning_rate": 1.6292669941245953e-06, + "loss": 1.7504, + "step": 30006 + }, + { + "epoch": 9.21025168815224, + "grad_norm": 0.1384512335062027, + "learning_rate": 1.6280087007785939e-06, + "loss": 1.6991, + "step": 30007 + }, + { + "epoch": 9.210558624923266, + "grad_norm": 0.15608127415180206, + 
"learning_rate": 1.6267508854729608e-06, + "loss": 1.7229, + "step": 30008 + }, + { + "epoch": 9.210865561694291, + "grad_norm": 0.22049592435359955, + "learning_rate": 1.625493548220125e-06, + "loss": 1.7395, + "step": 30009 + }, + { + "epoch": 9.211172498465316, + "grad_norm": 0.13226120173931122, + "learning_rate": 1.6242366890325155e-06, + "loss": 1.6797, + "step": 30010 + }, + { + "epoch": 9.211479435236342, + "grad_norm": 0.17857056856155396, + "learning_rate": 1.6229803079225559e-06, + "loss": 1.7725, + "step": 30011 + }, + { + "epoch": 9.211786372007367, + "grad_norm": 0.14409810304641724, + "learning_rate": 1.6217244049026581e-06, + "loss": 1.6777, + "step": 30012 + }, + { + "epoch": 9.212093308778392, + "grad_norm": 0.15496647357940674, + "learning_rate": 1.6204689799852401e-06, + "loss": 1.7171, + "step": 30013 + }, + { + "epoch": 9.212400245549416, + "grad_norm": 0.1262955516576767, + "learning_rate": 1.6192140331826977e-06, + "loss": 1.7066, + "step": 30014 + }, + { + "epoch": 9.212707182320441, + "grad_norm": 0.14165538549423218, + "learning_rate": 1.6179595645074431e-06, + "loss": 1.7425, + "step": 30015 + }, + { + "epoch": 9.213014119091467, + "grad_norm": 0.1557457596063614, + "learning_rate": 1.6167055739718605e-06, + "loss": 1.7181, + "step": 30016 + }, + { + "epoch": 9.213321055862492, + "grad_norm": 0.13509629666805267, + "learning_rate": 1.6154520615883627e-06, + "loss": 1.6942, + "step": 30017 + }, + { + "epoch": 9.213627992633517, + "grad_norm": 0.14409126341342926, + "learning_rate": 1.614199027369323e-06, + "loss": 1.6949, + "step": 30018 + }, + { + "epoch": 9.213934929404543, + "grad_norm": 0.14323770999908447, + "learning_rate": 1.6129464713271315e-06, + "loss": 1.7007, + "step": 30019 + }, + { + "epoch": 9.214241866175568, + "grad_norm": 0.12424668669700623, + "learning_rate": 1.6116943934741558e-06, + "loss": 1.7118, + "step": 30020 + }, + { + "epoch": 9.214548802946593, + "grad_norm": 0.16182856261730194, + "learning_rate": 
1.6104427938227807e-06, + "loss": 1.7683, + "step": 30021 + }, + { + "epoch": 9.214855739717619, + "grad_norm": 0.136052668094635, + "learning_rate": 1.609191672385374e-06, + "loss": 1.6438, + "step": 30022 + }, + { + "epoch": 9.215162676488644, + "grad_norm": 0.14279018342494965, + "learning_rate": 1.6079410291742924e-06, + "loss": 1.7062, + "step": 30023 + }, + { + "epoch": 9.215469613259668, + "grad_norm": 0.11300359666347504, + "learning_rate": 1.6066908642019097e-06, + "loss": 1.6509, + "step": 30024 + }, + { + "epoch": 9.215776550030693, + "grad_norm": 0.14017970860004425, + "learning_rate": 1.6054411774805655e-06, + "loss": 1.68, + "step": 30025 + }, + { + "epoch": 9.216083486801718, + "grad_norm": 0.12801769375801086, + "learning_rate": 1.604191969022617e-06, + "loss": 1.7377, + "step": 30026 + }, + { + "epoch": 9.216390423572744, + "grad_norm": 0.16302450001239777, + "learning_rate": 1.6029432388404097e-06, + "loss": 1.6966, + "step": 30027 + }, + { + "epoch": 9.216697360343769, + "grad_norm": 0.12138327211141586, + "learning_rate": 1.6016949869462894e-06, + "loss": 1.6836, + "step": 30028 + }, + { + "epoch": 9.217004297114794, + "grad_norm": 0.14843621850013733, + "learning_rate": 1.6004472133525794e-06, + "loss": 1.6891, + "step": 30029 + }, + { + "epoch": 9.21731123388582, + "grad_norm": 0.1426590085029602, + "learning_rate": 1.59919991807162e-06, + "loss": 1.6759, + "step": 30030 + }, + { + "epoch": 9.217618170656845, + "grad_norm": 0.1690209060907364, + "learning_rate": 1.59795310111574e-06, + "loss": 1.7315, + "step": 30031 + }, + { + "epoch": 9.21792510742787, + "grad_norm": 0.1929413378238678, + "learning_rate": 1.596706762497252e-06, + "loss": 1.7137, + "step": 30032 + }, + { + "epoch": 9.218232044198896, + "grad_norm": 0.16534923017024994, + "learning_rate": 1.5954609022284739e-06, + "loss": 1.7599, + "step": 30033 + }, + { + "epoch": 9.218538980969921, + "grad_norm": 0.16535919904708862, + "learning_rate": 1.594215520321729e-06, + "loss": 
1.7358, + "step": 30034 + }, + { + "epoch": 9.218845917740945, + "grad_norm": 0.1476306915283203, + "learning_rate": 1.5929706167893188e-06, + "loss": 1.6952, + "step": 30035 + }, + { + "epoch": 9.21915285451197, + "grad_norm": 0.12421105802059174, + "learning_rate": 1.5917261916435388e-06, + "loss": 1.6731, + "step": 30036 + }, + { + "epoch": 9.219459791282995, + "grad_norm": 0.18759414553642273, + "learning_rate": 1.5904822448967017e-06, + "loss": 1.6516, + "step": 30037 + }, + { + "epoch": 9.21976672805402, + "grad_norm": 0.16421522200107574, + "learning_rate": 1.5892387765610806e-06, + "loss": 1.6702, + "step": 30038 + }, + { + "epoch": 9.220073664825046, + "grad_norm": 0.15226107835769653, + "learning_rate": 1.587995786648988e-06, + "loss": 1.6868, + "step": 30039 + }, + { + "epoch": 9.220380601596071, + "grad_norm": 0.18976561725139618, + "learning_rate": 1.5867532751726865e-06, + "loss": 1.7359, + "step": 30040 + }, + { + "epoch": 9.220687538367097, + "grad_norm": 0.1367981731891632, + "learning_rate": 1.5855112421444774e-06, + "loss": 1.6977, + "step": 30041 + }, + { + "epoch": 9.220994475138122, + "grad_norm": 0.13698583841323853, + "learning_rate": 1.5842696875766116e-06, + "loss": 1.7305, + "step": 30042 + }, + { + "epoch": 9.221301411909147, + "grad_norm": 0.14987944066524506, + "learning_rate": 1.5830286114813742e-06, + "loss": 1.706, + "step": 30043 + }, + { + "epoch": 9.221608348680173, + "grad_norm": 0.1334082931280136, + "learning_rate": 1.5817880138710273e-06, + "loss": 1.6489, + "step": 30044 + }, + { + "epoch": 9.221915285451198, + "grad_norm": 0.27590668201446533, + "learning_rate": 1.580547894757828e-06, + "loss": 1.8041, + "step": 30045 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 0.13377591967582703, + "learning_rate": 1.5793082541540327e-06, + "loss": 1.7251, + "step": 30046 + }, + { + "epoch": 9.222529158993247, + "grad_norm": 0.15182198584079742, + "learning_rate": 1.5780690920718988e-06, + "loss": 1.6932, + "step": 30047 + }, + 
{ + "epoch": 9.222836095764272, + "grad_norm": 0.12374742329120636, + "learning_rate": 1.5768304085236663e-06, + "loss": 1.6808, + "step": 30048 + }, + { + "epoch": 9.223143032535297, + "grad_norm": 0.14800786972045898, + "learning_rate": 1.5755922035215753e-06, + "loss": 1.7341, + "step": 30049 + }, + { + "epoch": 9.223449969306323, + "grad_norm": 0.18947643041610718, + "learning_rate": 1.574354477077855e-06, + "loss": 1.7121, + "step": 30050 + }, + { + "epoch": 9.223756906077348, + "grad_norm": 0.13209564983844757, + "learning_rate": 1.5731172292047625e-06, + "loss": 1.6491, + "step": 30051 + }, + { + "epoch": 9.224063842848373, + "grad_norm": 0.1743779480457306, + "learning_rate": 1.5718804599145043e-06, + "loss": 1.7364, + "step": 30052 + }, + { + "epoch": 9.224370779619399, + "grad_norm": 0.1696232557296753, + "learning_rate": 1.5706441692193096e-06, + "loss": 1.7148, + "step": 30053 + }, + { + "epoch": 9.224677716390424, + "grad_norm": 0.38987866044044495, + "learning_rate": 1.5694083571313912e-06, + "loss": 1.7351, + "step": 30054 + }, + { + "epoch": 9.22498465316145, + "grad_norm": 0.18110236525535583, + "learning_rate": 1.568173023662961e-06, + "loss": 1.7497, + "step": 30055 + }, + { + "epoch": 9.225291589932475, + "grad_norm": 0.11834049224853516, + "learning_rate": 1.566938168826243e-06, + "loss": 1.6824, + "step": 30056 + }, + { + "epoch": 9.225598526703498, + "grad_norm": 0.1685422658920288, + "learning_rate": 1.5657037926334162e-06, + "loss": 1.6938, + "step": 30057 + }, + { + "epoch": 9.225905463474524, + "grad_norm": 0.17743349075317383, + "learning_rate": 1.5644698950967095e-06, + "loss": 1.7747, + "step": 30058 + }, + { + "epoch": 9.226212400245549, + "grad_norm": 0.13532224297523499, + "learning_rate": 1.5632364762282859e-06, + "loss": 1.6867, + "step": 30059 + }, + { + "epoch": 9.226519337016574, + "grad_norm": 0.1925237476825714, + "learning_rate": 1.5620035360403517e-06, + "loss": 1.7638, + "step": 30060 + }, + { + "epoch": 9.2268262737876, + 
"grad_norm": 0.15505637228488922, + "learning_rate": 1.560771074545092e-06, + "loss": 1.6801, + "step": 30061 + }, + { + "epoch": 9.227133210558625, + "grad_norm": 0.15233661234378815, + "learning_rate": 1.559539091754686e-06, + "loss": 1.6854, + "step": 30062 + }, + { + "epoch": 9.22744014732965, + "grad_norm": 0.1538659930229187, + "learning_rate": 1.5583075876813013e-06, + "loss": 1.7107, + "step": 30063 + }, + { + "epoch": 9.227747084100676, + "grad_norm": 0.1162392795085907, + "learning_rate": 1.5570765623371176e-06, + "loss": 1.6593, + "step": 30064 + }, + { + "epoch": 9.228054020871701, + "grad_norm": 0.1888103187084198, + "learning_rate": 1.5558460157342913e-06, + "loss": 1.7406, + "step": 30065 + }, + { + "epoch": 9.228360957642726, + "grad_norm": 0.13712546229362488, + "learning_rate": 1.5546159478849964e-06, + "loss": 1.6892, + "step": 30066 + }, + { + "epoch": 9.22866789441375, + "grad_norm": 0.20172229409217834, + "learning_rate": 1.553386358801373e-06, + "loss": 1.7729, + "step": 30067 + }, + { + "epoch": 9.228974831184775, + "grad_norm": 0.16218116879463196, + "learning_rate": 1.5521572484955893e-06, + "loss": 1.7331, + "step": 30068 + }, + { + "epoch": 9.2292817679558, + "grad_norm": 0.15987847745418549, + "learning_rate": 1.5509286169797798e-06, + "loss": 1.7177, + "step": 30069 + }, + { + "epoch": 9.229588704726826, + "grad_norm": 0.14362195134162903, + "learning_rate": 1.5497004642660907e-06, + "loss": 1.6616, + "step": 30070 + }, + { + "epoch": 9.229895641497851, + "grad_norm": 0.15351802110671997, + "learning_rate": 1.5484727903666618e-06, + "loss": 1.7356, + "step": 30071 + }, + { + "epoch": 9.230202578268877, + "grad_norm": 0.1514216959476471, + "learning_rate": 1.5472455952936116e-06, + "loss": 1.7386, + "step": 30072 + }, + { + "epoch": 9.230509515039902, + "grad_norm": 0.1280907839536667, + "learning_rate": 1.5460188790590967e-06, + "loss": 1.6783, + "step": 30073 + }, + { + "epoch": 9.230816451810927, + "grad_norm": 0.20153765380382538, + 
"learning_rate": 1.544792641675208e-06, + "loss": 1.6954, + "step": 30074 + }, + { + "epoch": 9.231123388581953, + "grad_norm": 0.1277652084827423, + "learning_rate": 1.5435668831540905e-06, + "loss": 1.692, + "step": 30075 + }, + { + "epoch": 9.231430325352978, + "grad_norm": 0.1274770349264145, + "learning_rate": 1.5423416035078408e-06, + "loss": 1.7265, + "step": 30076 + }, + { + "epoch": 9.231737262124003, + "grad_norm": 0.11994244903326035, + "learning_rate": 1.5411168027485712e-06, + "loss": 1.6476, + "step": 30077 + }, + { + "epoch": 9.232044198895027, + "grad_norm": 0.1459321826696396, + "learning_rate": 1.539892480888394e-06, + "loss": 1.6627, + "step": 30078 + }, + { + "epoch": 9.232351135666052, + "grad_norm": 0.15515929460525513, + "learning_rate": 1.5386686379394e-06, + "loss": 1.6974, + "step": 30079 + }, + { + "epoch": 9.232658072437077, + "grad_norm": 0.1805061250925064, + "learning_rate": 1.5374452739136846e-06, + "loss": 1.7336, + "step": 30080 + }, + { + "epoch": 9.232965009208103, + "grad_norm": 0.10603496432304382, + "learning_rate": 1.5362223888233384e-06, + "loss": 1.6526, + "step": 30081 + }, + { + "epoch": 9.233271945979128, + "grad_norm": 0.15579989552497864, + "learning_rate": 1.5349999826804517e-06, + "loss": 1.7093, + "step": 30082 + }, + { + "epoch": 9.233578882750153, + "grad_norm": 0.15068648755550385, + "learning_rate": 1.5337780554971037e-06, + "loss": 1.6486, + "step": 30083 + }, + { + "epoch": 9.233885819521179, + "grad_norm": 0.13521051406860352, + "learning_rate": 1.532556607285357e-06, + "loss": 1.6972, + "step": 30084 + }, + { + "epoch": 9.234192756292204, + "grad_norm": 0.15651237964630127, + "learning_rate": 1.5313356380573074e-06, + "loss": 1.7232, + "step": 30085 + }, + { + "epoch": 9.23449969306323, + "grad_norm": 0.18412761390209198, + "learning_rate": 1.530115147825001e-06, + "loss": 1.7127, + "step": 30086 + }, + { + "epoch": 9.234806629834255, + "grad_norm": 0.13278020918369293, + "learning_rate": 
1.528895136600511e-06, + "loss": 1.6963, + "step": 30087 + }, + { + "epoch": 9.23511356660528, + "grad_norm": 0.126597598195076, + "learning_rate": 1.527675604395884e-06, + "loss": 1.6899, + "step": 30088 + }, + { + "epoch": 9.235420503376304, + "grad_norm": 0.1658754050731659, + "learning_rate": 1.526456551223171e-06, + "loss": 1.6655, + "step": 30089 + }, + { + "epoch": 9.235727440147329, + "grad_norm": 0.2280663400888443, + "learning_rate": 1.5252379770944402e-06, + "loss": 1.7804, + "step": 30090 + }, + { + "epoch": 9.236034376918354, + "grad_norm": 0.15943841636180878, + "learning_rate": 1.5240198820217044e-06, + "loss": 1.7023, + "step": 30091 + }, + { + "epoch": 9.23634131368938, + "grad_norm": 0.12864334881305695, + "learning_rate": 1.5228022660170315e-06, + "loss": 1.6951, + "step": 30092 + }, + { + "epoch": 9.236648250460405, + "grad_norm": 0.11842049658298492, + "learning_rate": 1.5215851290924233e-06, + "loss": 1.6864, + "step": 30093 + }, + { + "epoch": 9.23695518723143, + "grad_norm": 0.11744343489408493, + "learning_rate": 1.5203684712599364e-06, + "loss": 1.6594, + "step": 30094 + }, + { + "epoch": 9.237262124002456, + "grad_norm": 0.15188898146152496, + "learning_rate": 1.5191522925315838e-06, + "loss": 1.7024, + "step": 30095 + }, + { + "epoch": 9.237569060773481, + "grad_norm": 0.16257372498512268, + "learning_rate": 1.517936592919378e-06, + "loss": 1.7568, + "step": 30096 + }, + { + "epoch": 9.237875997544506, + "grad_norm": 0.2373557835817337, + "learning_rate": 1.5167213724353424e-06, + "loss": 1.7712, + "step": 30097 + }, + { + "epoch": 9.238182934315532, + "grad_norm": 0.13525256514549255, + "learning_rate": 1.5155066310914846e-06, + "loss": 1.7128, + "step": 30098 + }, + { + "epoch": 9.238489871086557, + "grad_norm": 0.1386425495147705, + "learning_rate": 1.5142923688998055e-06, + "loss": 1.6655, + "step": 30099 + }, + { + "epoch": 9.23879680785758, + "grad_norm": 0.16497959196567535, + "learning_rate": 1.5130785858723072e-06, + "loss": 
1.7426, + "step": 30100 + }, + { + "epoch": 9.239103744628606, + "grad_norm": 0.13364866375923157, + "learning_rate": 1.51186528202098e-06, + "loss": 1.6917, + "step": 30101 + }, + { + "epoch": 9.239410681399631, + "grad_norm": 0.15585513412952423, + "learning_rate": 1.5106524573578308e-06, + "loss": 1.7263, + "step": 30102 + }, + { + "epoch": 9.239717618170657, + "grad_norm": 0.17002388834953308, + "learning_rate": 1.5094401118948332e-06, + "loss": 1.7525, + "step": 30103 + }, + { + "epoch": 9.240024554941682, + "grad_norm": 0.147446408867836, + "learning_rate": 1.5082282456439666e-06, + "loss": 1.6941, + "step": 30104 + }, + { + "epoch": 9.240331491712707, + "grad_norm": 0.2109186351299286, + "learning_rate": 1.5070168586172106e-06, + "loss": 1.7316, + "step": 30105 + }, + { + "epoch": 9.240638428483733, + "grad_norm": 0.16860739886760712, + "learning_rate": 1.5058059508265276e-06, + "loss": 1.673, + "step": 30106 + }, + { + "epoch": 9.240945365254758, + "grad_norm": 0.16476429998874664, + "learning_rate": 1.504595522283908e-06, + "loss": 1.7225, + "step": 30107 + }, + { + "epoch": 9.241252302025783, + "grad_norm": 0.1818271279335022, + "learning_rate": 1.5033855730012925e-06, + "loss": 1.7372, + "step": 30108 + }, + { + "epoch": 9.241559238796809, + "grad_norm": 0.15022529661655426, + "learning_rate": 1.502176102990649e-06, + "loss": 1.7114, + "step": 30109 + }, + { + "epoch": 9.241866175567832, + "grad_norm": 0.11522844433784485, + "learning_rate": 1.500967112263918e-06, + "loss": 1.6815, + "step": 30110 + }, + { + "epoch": 9.242173112338858, + "grad_norm": 0.16297772526741028, + "learning_rate": 1.4997586008330622e-06, + "loss": 1.6865, + "step": 30111 + }, + { + "epoch": 9.242480049109883, + "grad_norm": 0.14999376237392426, + "learning_rate": 1.4985505687100222e-06, + "loss": 1.741, + "step": 30112 + }, + { + "epoch": 9.242786985880908, + "grad_norm": 0.1419779509305954, + "learning_rate": 1.4973430159067326e-06, + "loss": 1.6859, + "step": 30113 + }, + { + 
"epoch": 9.243093922651934, + "grad_norm": 0.10719183832406998, + "learning_rate": 1.4961359424351228e-06, + "loss": 1.6533, + "step": 30114 + }, + { + "epoch": 9.243400859422959, + "grad_norm": 0.16076189279556274, + "learning_rate": 1.494929348307128e-06, + "loss": 1.6874, + "step": 30115 + }, + { + "epoch": 9.243707796193984, + "grad_norm": 0.18850088119506836, + "learning_rate": 1.4937232335346719e-06, + "loss": 1.7701, + "step": 30116 + }, + { + "epoch": 9.24401473296501, + "grad_norm": 0.12545527517795563, + "learning_rate": 1.4925175981296725e-06, + "loss": 1.6876, + "step": 30117 + }, + { + "epoch": 9.244321669736035, + "grad_norm": 0.13523197174072266, + "learning_rate": 1.4913124421040426e-06, + "loss": 1.6807, + "step": 30118 + }, + { + "epoch": 9.24462860650706, + "grad_norm": 0.1360730528831482, + "learning_rate": 1.490107765469706e-06, + "loss": 1.6776, + "step": 30119 + }, + { + "epoch": 9.244935543278086, + "grad_norm": 0.11223732680082321, + "learning_rate": 1.4889035682385476e-06, + "loss": 1.644, + "step": 30120 + }, + { + "epoch": 9.245242480049109, + "grad_norm": 0.13906998932361603, + "learning_rate": 1.4876998504224804e-06, + "loss": 1.6728, + "step": 30121 + }, + { + "epoch": 9.245549416820134, + "grad_norm": 0.1429383009672165, + "learning_rate": 1.4864966120333946e-06, + "loss": 1.7227, + "step": 30122 + }, + { + "epoch": 9.24585635359116, + "grad_norm": 0.1267431229352951, + "learning_rate": 1.4852938530831806e-06, + "loss": 1.6871, + "step": 30123 + }, + { + "epoch": 9.246163290362185, + "grad_norm": 0.20933954417705536, + "learning_rate": 1.48409157358374e-06, + "loss": 1.7602, + "step": 30124 + }, + { + "epoch": 9.24647022713321, + "grad_norm": 0.15941432118415833, + "learning_rate": 1.4828897735469305e-06, + "loss": 1.7505, + "step": 30125 + }, + { + "epoch": 9.246777163904236, + "grad_norm": 0.16897402703762054, + "learning_rate": 1.4816884529846531e-06, + "loss": 1.831, + "step": 30126 + }, + { + "epoch": 9.247084100675261, + 
"grad_norm": 0.16803954541683197, + "learning_rate": 1.480487611908754e-06, + "loss": 1.7239, + "step": 30127 + }, + { + "epoch": 9.247391037446286, + "grad_norm": 0.11253303289413452, + "learning_rate": 1.479287250331124e-06, + "loss": 1.6937, + "step": 30128 + }, + { + "epoch": 9.247697974217312, + "grad_norm": 0.1583312302827835, + "learning_rate": 1.4780873682636142e-06, + "loss": 1.7121, + "step": 30129 + }, + { + "epoch": 9.248004910988337, + "grad_norm": 0.16783545911312103, + "learning_rate": 1.4768879657180822e-06, + "loss": 1.7107, + "step": 30130 + }, + { + "epoch": 9.248311847759362, + "grad_norm": 0.1669779270887375, + "learning_rate": 1.4756890427063852e-06, + "loss": 1.7255, + "step": 30131 + }, + { + "epoch": 9.248618784530386, + "grad_norm": 0.13612589240074158, + "learning_rate": 1.474490599240369e-06, + "loss": 1.7388, + "step": 30132 + }, + { + "epoch": 9.248925721301411, + "grad_norm": 0.13164813816547394, + "learning_rate": 1.4732926353318798e-06, + "loss": 1.6854, + "step": 30133 + }, + { + "epoch": 9.249232658072437, + "grad_norm": 0.1553371399641037, + "learning_rate": 1.4720951509927582e-06, + "loss": 1.6898, + "step": 30134 + }, + { + "epoch": 9.249539594843462, + "grad_norm": 0.1533356010913849, + "learning_rate": 1.470898146234828e-06, + "loss": 1.6911, + "step": 30135 + }, + { + "epoch": 9.249846531614487, + "grad_norm": 0.14966778457164764, + "learning_rate": 1.4697016210699354e-06, + "loss": 1.7064, + "step": 30136 + }, + { + "epoch": 9.250153468385513, + "grad_norm": 0.10848648101091385, + "learning_rate": 1.4685055755098876e-06, + "loss": 1.646, + "step": 30137 + }, + { + "epoch": 9.250460405156538, + "grad_norm": 0.13840216398239136, + "learning_rate": 1.4673100095665193e-06, + "loss": 1.7318, + "step": 30138 + }, + { + "epoch": 9.250767341927563, + "grad_norm": 0.12550130486488342, + "learning_rate": 1.466114923251638e-06, + "loss": 1.6526, + "step": 30139 + }, + { + "epoch": 9.251074278698589, + "grad_norm": 0.1806049644947052, 
+ "learning_rate": 1.4649203165770454e-06, + "loss": 1.7221, + "step": 30140 + }, + { + "epoch": 9.251381215469614, + "grad_norm": 0.14293114840984344, + "learning_rate": 1.4637261895545763e-06, + "loss": 1.7138, + "step": 30141 + }, + { + "epoch": 9.25168815224064, + "grad_norm": 0.1573718935251236, + "learning_rate": 1.4625325421959935e-06, + "loss": 1.7337, + "step": 30142 + }, + { + "epoch": 9.251995089011663, + "grad_norm": 0.1105809286236763, + "learning_rate": 1.4613393745131321e-06, + "loss": 1.6805, + "step": 30143 + }, + { + "epoch": 9.252302025782688, + "grad_norm": 0.16229867935180664, + "learning_rate": 1.4601466865177493e-06, + "loss": 1.7644, + "step": 30144 + }, + { + "epoch": 9.252608962553714, + "grad_norm": 0.17748580873012543, + "learning_rate": 1.4589544782216524e-06, + "loss": 1.7499, + "step": 30145 + }, + { + "epoch": 9.252915899324739, + "grad_norm": 0.1022428423166275, + "learning_rate": 1.4577627496366153e-06, + "loss": 1.6395, + "step": 30146 + }, + { + "epoch": 9.253222836095764, + "grad_norm": 0.11667326092720032, + "learning_rate": 1.4565715007744229e-06, + "loss": 1.6806, + "step": 30147 + }, + { + "epoch": 9.25352977286679, + "grad_norm": 0.1549718827009201, + "learning_rate": 1.4553807316468381e-06, + "loss": 1.7174, + "step": 30148 + }, + { + "epoch": 9.253836709637815, + "grad_norm": 0.13834457099437714, + "learning_rate": 1.4541904422656406e-06, + "loss": 1.6922, + "step": 30149 + }, + { + "epoch": 9.25414364640884, + "grad_norm": 0.15639884769916534, + "learning_rate": 1.4530006326425815e-06, + "loss": 1.7715, + "step": 30150 + }, + { + "epoch": 9.254450583179866, + "grad_norm": 0.15321171283721924, + "learning_rate": 1.4518113027894243e-06, + "loss": 1.7332, + "step": 30151 + }, + { + "epoch": 9.254757519950891, + "grad_norm": 0.12793070077896118, + "learning_rate": 1.4506224527179257e-06, + "loss": 1.6968, + "step": 30152 + }, + { + "epoch": 9.255064456721914, + "grad_norm": 0.11214233934879303, + "learning_rate": 
1.4494340824398322e-06, + "loss": 1.6751, + "step": 30153 + }, + { + "epoch": 9.25537139349294, + "grad_norm": 0.16913802921772003, + "learning_rate": 1.4482461919668844e-06, + "loss": 1.7314, + "step": 30154 + }, + { + "epoch": 9.255678330263965, + "grad_norm": 0.14455991983413696, + "learning_rate": 1.4470587813108282e-06, + "loss": 1.6877, + "step": 30155 + }, + { + "epoch": 9.25598526703499, + "grad_norm": 0.15350060164928436, + "learning_rate": 1.4458718504833934e-06, + "loss": 1.7414, + "step": 30156 + }, + { + "epoch": 9.256292203806016, + "grad_norm": 0.1487266719341278, + "learning_rate": 1.4446853994963094e-06, + "loss": 1.7093, + "step": 30157 + }, + { + "epoch": 9.256599140577041, + "grad_norm": 0.13964293897151947, + "learning_rate": 1.4434994283613058e-06, + "loss": 1.6863, + "step": 30158 + }, + { + "epoch": 9.256906077348066, + "grad_norm": 0.13903170824050903, + "learning_rate": 1.442313937090095e-06, + "loss": 1.7268, + "step": 30159 + }, + { + "epoch": 9.257213014119092, + "grad_norm": 0.12514057755470276, + "learning_rate": 1.4411289256944072e-06, + "loss": 1.6836, + "step": 30160 + }, + { + "epoch": 9.257519950890117, + "grad_norm": 0.16892002522945404, + "learning_rate": 1.439944394185938e-06, + "loss": 1.6942, + "step": 30161 + }, + { + "epoch": 9.257826887661142, + "grad_norm": 0.22416932880878448, + "learning_rate": 1.4387603425764007e-06, + "loss": 1.8449, + "step": 30162 + }, + { + "epoch": 9.258133824432168, + "grad_norm": 0.13895165920257568, + "learning_rate": 1.4375767708775022e-06, + "loss": 1.7309, + "step": 30163 + }, + { + "epoch": 9.258440761203191, + "grad_norm": 0.13725127279758453, + "learning_rate": 1.436393679100928e-06, + "loss": 1.7508, + "step": 30164 + }, + { + "epoch": 9.258747697974217, + "grad_norm": 0.1684611737728119, + "learning_rate": 1.4352110672583796e-06, + "loss": 1.7656, + "step": 30165 + }, + { + "epoch": 9.259054634745242, + "grad_norm": 0.166968435049057, + "learning_rate": 1.4340289353615365e-06, + 
"loss": 1.7277, + "step": 30166 + }, + { + "epoch": 9.259361571516267, + "grad_norm": 0.2129509150981903, + "learning_rate": 1.4328472834220896e-06, + "loss": 1.8107, + "step": 30167 + }, + { + "epoch": 9.259668508287293, + "grad_norm": 0.15415063500404358, + "learning_rate": 1.4316661114517072e-06, + "loss": 1.7248, + "step": 30168 + }, + { + "epoch": 9.259975445058318, + "grad_norm": 0.10856158286333084, + "learning_rate": 1.4304854194620688e-06, + "loss": 1.6306, + "step": 30169 + }, + { + "epoch": 9.260282381829343, + "grad_norm": 0.16899555921554565, + "learning_rate": 1.4293052074648427e-06, + "loss": 1.7068, + "step": 30170 + }, + { + "epoch": 9.260589318600369, + "grad_norm": 0.1331903636455536, + "learning_rate": 1.4281254754716867e-06, + "loss": 1.682, + "step": 30171 + }, + { + "epoch": 9.260896255371394, + "grad_norm": 0.10237281024456024, + "learning_rate": 1.4269462234942631e-06, + "loss": 1.6859, + "step": 30172 + }, + { + "epoch": 9.26120319214242, + "grad_norm": 0.13941270112991333, + "learning_rate": 1.4257674515442298e-06, + "loss": 1.6922, + "step": 30173 + }, + { + "epoch": 9.261510128913443, + "grad_norm": 0.16863791644573212, + "learning_rate": 1.4245891596332328e-06, + "loss": 1.7276, + "step": 30174 + }, + { + "epoch": 9.261817065684468, + "grad_norm": 0.1314782202243805, + "learning_rate": 1.4234113477729184e-06, + "loss": 1.6829, + "step": 30175 + }, + { + "epoch": 9.262124002455494, + "grad_norm": 0.19281591475009918, + "learning_rate": 1.4222340159749158e-06, + "loss": 1.7281, + "step": 30176 + }, + { + "epoch": 9.262430939226519, + "grad_norm": 0.14531417191028595, + "learning_rate": 1.421057164250883e-06, + "loss": 1.7226, + "step": 30177 + }, + { + "epoch": 9.262737875997544, + "grad_norm": 0.15508733689785004, + "learning_rate": 1.4198807926124213e-06, + "loss": 1.7588, + "step": 30178 + }, + { + "epoch": 9.26304481276857, + "grad_norm": 0.09654982388019562, + "learning_rate": 1.418704901071183e-06, + "loss": 1.6742, + "step": 30179 
+ }, + { + "epoch": 9.263351749539595, + "grad_norm": 0.18973948061466217, + "learning_rate": 1.4175294896387693e-06, + "loss": 1.71, + "step": 30180 + }, + { + "epoch": 9.26365868631062, + "grad_norm": 0.15489214658737183, + "learning_rate": 1.41635455832681e-06, + "loss": 1.7074, + "step": 30181 + }, + { + "epoch": 9.263965623081646, + "grad_norm": 0.15990005433559418, + "learning_rate": 1.4151801071469072e-06, + "loss": 1.6822, + "step": 30182 + }, + { + "epoch": 9.264272559852671, + "grad_norm": 0.17423443496227264, + "learning_rate": 1.4140061361106737e-06, + "loss": 1.7677, + "step": 30183 + }, + { + "epoch": 9.264579496623696, + "grad_norm": 0.15427646040916443, + "learning_rate": 1.4128326452297058e-06, + "loss": 1.7021, + "step": 30184 + }, + { + "epoch": 9.26488643339472, + "grad_norm": 0.13731053471565247, + "learning_rate": 1.4116596345156053e-06, + "loss": 1.7235, + "step": 30185 + }, + { + "epoch": 9.265193370165745, + "grad_norm": 0.13132283091545105, + "learning_rate": 1.4104871039799627e-06, + "loss": 1.7159, + "step": 30186 + }, + { + "epoch": 9.26550030693677, + "grad_norm": 0.12384344637393951, + "learning_rate": 1.409315053634369e-06, + "loss": 1.6785, + "step": 30187 + }, + { + "epoch": 9.265807243707796, + "grad_norm": 0.16857418417930603, + "learning_rate": 1.4081434834903984e-06, + "loss": 1.7453, + "step": 30188 + }, + { + "epoch": 9.266114180478821, + "grad_norm": 0.13803976774215698, + "learning_rate": 1.4069723935596412e-06, + "loss": 1.6826, + "step": 30189 + }, + { + "epoch": 9.266421117249847, + "grad_norm": 0.16141049563884735, + "learning_rate": 1.4058017838536552e-06, + "loss": 1.7113, + "step": 30190 + }, + { + "epoch": 9.266728054020872, + "grad_norm": 0.13290546834468842, + "learning_rate": 1.4046316543840254e-06, + "loss": 1.7262, + "step": 30191 + }, + { + "epoch": 9.267034990791897, + "grad_norm": 0.163112610578537, + "learning_rate": 1.4034620051623037e-06, + "loss": 1.7002, + "step": 30192 + }, + { + "epoch": 
9.267341927562923, + "grad_norm": 0.11482264846563339, + "learning_rate": 1.402292836200053e-06, + "loss": 1.6714, + "step": 30193 + }, + { + "epoch": 9.267648864333948, + "grad_norm": 0.15263767540454865, + "learning_rate": 1.4011241475088367e-06, + "loss": 1.7323, + "step": 30194 + }, + { + "epoch": 9.267955801104973, + "grad_norm": 0.16607387363910675, + "learning_rate": 1.3999559391001838e-06, + "loss": 1.7116, + "step": 30195 + }, + { + "epoch": 9.268262737875997, + "grad_norm": 0.15621553361415863, + "learning_rate": 1.398788210985663e-06, + "loss": 1.7069, + "step": 30196 + }, + { + "epoch": 9.268569674647022, + "grad_norm": 0.13450275361537933, + "learning_rate": 1.3976209631767934e-06, + "loss": 1.6924, + "step": 30197 + }, + { + "epoch": 9.268876611418047, + "grad_norm": 0.18217138946056366, + "learning_rate": 1.3964541956851263e-06, + "loss": 1.7349, + "step": 30198 + }, + { + "epoch": 9.269183548189073, + "grad_norm": 0.18020178377628326, + "learning_rate": 1.3952879085221858e-06, + "loss": 1.7358, + "step": 30199 + }, + { + "epoch": 9.269490484960098, + "grad_norm": 0.1362251341342926, + "learning_rate": 1.3941221016994965e-06, + "loss": 1.724, + "step": 30200 + }, + { + "epoch": 9.269797421731123, + "grad_norm": 0.15907861292362213, + "learning_rate": 1.392956775228582e-06, + "loss": 1.712, + "step": 30201 + }, + { + "epoch": 9.270104358502149, + "grad_norm": 0.12772800028324127, + "learning_rate": 1.3917919291209614e-06, + "loss": 1.6744, + "step": 30202 + }, + { + "epoch": 9.270411295273174, + "grad_norm": 0.12429596483707428, + "learning_rate": 1.3906275633881416e-06, + "loss": 1.721, + "step": 30203 + }, + { + "epoch": 9.2707182320442, + "grad_norm": 0.20072144269943237, + "learning_rate": 1.3894636780416303e-06, + "loss": 1.7024, + "step": 30204 + }, + { + "epoch": 9.271025168815225, + "grad_norm": 0.13898633420467377, + "learning_rate": 1.3883002730929296e-06, + "loss": 1.6943, + "step": 30205 + }, + { + "epoch": 9.27133210558625, + "grad_norm": 
0.11137440800666809, + "learning_rate": 1.387137348553541e-06, + "loss": 1.6666, + "step": 30206 + }, + { + "epoch": 9.271639042357274, + "grad_norm": 0.13952526450157166, + "learning_rate": 1.3859749044349501e-06, + "loss": 1.6988, + "step": 30207 + }, + { + "epoch": 9.271945979128299, + "grad_norm": 0.1566372960805893, + "learning_rate": 1.3848129407486477e-06, + "loss": 1.6942, + "step": 30208 + }, + { + "epoch": 9.272252915899324, + "grad_norm": 0.1273697465658188, + "learning_rate": 1.3836514575061244e-06, + "loss": 1.6926, + "step": 30209 + }, + { + "epoch": 9.27255985267035, + "grad_norm": 0.15591974556446075, + "learning_rate": 1.3824904547188434e-06, + "loss": 1.734, + "step": 30210 + }, + { + "epoch": 9.272866789441375, + "grad_norm": 0.14875155687332153, + "learning_rate": 1.3813299323982954e-06, + "loss": 1.7229, + "step": 30211 + }, + { + "epoch": 9.2731737262124, + "grad_norm": 0.15695714950561523, + "learning_rate": 1.3801698905559325e-06, + "loss": 1.727, + "step": 30212 + }, + { + "epoch": 9.273480662983426, + "grad_norm": 0.16134092211723328, + "learning_rate": 1.3790103292032398e-06, + "loss": 1.7321, + "step": 30213 + }, + { + "epoch": 9.273787599754451, + "grad_norm": 0.16619402170181274, + "learning_rate": 1.3778512483516527e-06, + "loss": 1.6804, + "step": 30214 + }, + { + "epoch": 9.274094536525476, + "grad_norm": 0.12403136491775513, + "learning_rate": 1.3766926480126452e-06, + "loss": 1.7067, + "step": 30215 + }, + { + "epoch": 9.274401473296502, + "grad_norm": 0.13903765380382538, + "learning_rate": 1.3755345281976584e-06, + "loss": 1.7138, + "step": 30216 + }, + { + "epoch": 9.274708410067525, + "grad_norm": 0.10627007484436035, + "learning_rate": 1.3743768889181385e-06, + "loss": 1.6693, + "step": 30217 + }, + { + "epoch": 9.27501534683855, + "grad_norm": 0.12304051220417023, + "learning_rate": 1.3732197301855265e-06, + "loss": 1.6838, + "step": 30218 + }, + { + "epoch": 9.275322283609576, + "grad_norm": 0.12596885859966278, + 
"learning_rate": 1.3720630520112632e-06, + "loss": 1.6924, + "step": 30219 + }, + { + "epoch": 9.275629220380601, + "grad_norm": 0.16624486446380615, + "learning_rate": 1.3709068544067672e-06, + "loss": 1.7316, + "step": 30220 + }, + { + "epoch": 9.275936157151627, + "grad_norm": 0.11655814945697784, + "learning_rate": 1.3697511373834737e-06, + "loss": 1.6877, + "step": 30221 + }, + { + "epoch": 9.276243093922652, + "grad_norm": 0.1264163851737976, + "learning_rate": 1.368595900952807e-06, + "loss": 1.6995, + "step": 30222 + }, + { + "epoch": 9.276550030693677, + "grad_norm": 0.10144982486963272, + "learning_rate": 1.3674411451261748e-06, + "loss": 1.6426, + "step": 30223 + }, + { + "epoch": 9.276856967464703, + "grad_norm": 0.13389989733695984, + "learning_rate": 1.3662868699149955e-06, + "loss": 1.7072, + "step": 30224 + }, + { + "epoch": 9.277163904235728, + "grad_norm": 0.18326976895332336, + "learning_rate": 1.3651330753306769e-06, + "loss": 1.7426, + "step": 30225 + }, + { + "epoch": 9.277470841006753, + "grad_norm": 0.1679212898015976, + "learning_rate": 1.363979761384615e-06, + "loss": 1.7158, + "step": 30226 + }, + { + "epoch": 9.277777777777779, + "grad_norm": 0.26792997121810913, + "learning_rate": 1.3628269280882066e-06, + "loss": 1.7862, + "step": 30227 + }, + { + "epoch": 9.278084714548802, + "grad_norm": 0.1797039955854416, + "learning_rate": 1.361674575452865e-06, + "loss": 1.8311, + "step": 30228 + }, + { + "epoch": 9.278391651319827, + "grad_norm": 0.14270684123039246, + "learning_rate": 1.360522703489947e-06, + "loss": 1.6823, + "step": 30229 + }, + { + "epoch": 9.278698588090853, + "grad_norm": 0.12262453138828278, + "learning_rate": 1.3593713122108665e-06, + "loss": 1.6576, + "step": 30230 + }, + { + "epoch": 9.279005524861878, + "grad_norm": 0.20434293150901794, + "learning_rate": 1.358220401626975e-06, + "loss": 1.7508, + "step": 30231 + }, + { + "epoch": 9.279312461632903, + "grad_norm": 0.12360373884439468, + "learning_rate": 
1.3570699717496637e-06, + "loss": 1.6636, + "step": 30232 + }, + { + "epoch": 9.279619398403929, + "grad_norm": 0.1771468222141266, + "learning_rate": 1.3559200225903013e-06, + "loss": 1.6926, + "step": 30233 + }, + { + "epoch": 9.279926335174954, + "grad_norm": 0.13039356470108032, + "learning_rate": 1.3547705541602451e-06, + "loss": 1.6671, + "step": 30234 + }, + { + "epoch": 9.28023327194598, + "grad_norm": 0.12824147939682007, + "learning_rate": 1.3536215664708586e-06, + "loss": 1.6835, + "step": 30235 + }, + { + "epoch": 9.280540208717005, + "grad_norm": 0.15304934978485107, + "learning_rate": 1.3524730595334933e-06, + "loss": 1.7216, + "step": 30236 + }, + { + "epoch": 9.28084714548803, + "grad_norm": 0.13606427609920502, + "learning_rate": 1.3513250333595074e-06, + "loss": 1.7062, + "step": 30237 + }, + { + "epoch": 9.281154082259055, + "grad_norm": 0.1449199616909027, + "learning_rate": 1.3501774879602414e-06, + "loss": 1.6988, + "step": 30238 + }, + { + "epoch": 9.281461019030079, + "grad_norm": 0.11309704929590225, + "learning_rate": 1.3490304233470307e-06, + "loss": 1.6721, + "step": 30239 + }, + { + "epoch": 9.281767955801104, + "grad_norm": 0.17013555765151978, + "learning_rate": 1.3478838395312222e-06, + "loss": 1.7045, + "step": 30240 + }, + { + "epoch": 9.28207489257213, + "grad_norm": 0.11972448974847794, + "learning_rate": 1.3467377365241396e-06, + "loss": 1.7015, + "step": 30241 + }, + { + "epoch": 9.282381829343155, + "grad_norm": 0.17848798632621765, + "learning_rate": 1.345592114337113e-06, + "loss": 1.7063, + "step": 30242 + }, + { + "epoch": 9.28268876611418, + "grad_norm": 0.1346857249736786, + "learning_rate": 1.3444469729814612e-06, + "loss": 1.7126, + "step": 30243 + }, + { + "epoch": 9.282995702885206, + "grad_norm": 0.17026859521865845, + "learning_rate": 1.3433023124684974e-06, + "loss": 1.7094, + "step": 30244 + }, + { + "epoch": 9.283302639656231, + "grad_norm": 0.12969297170639038, + "learning_rate": 1.3421581328095456e-06, + 
"loss": 1.717, + "step": 30245 + }, + { + "epoch": 9.283609576427256, + "grad_norm": 0.19405554234981537, + "learning_rate": 1.3410144340159026e-06, + "loss": 1.7221, + "step": 30246 + }, + { + "epoch": 9.283916513198282, + "grad_norm": 0.16258898377418518, + "learning_rate": 1.3398712160988814e-06, + "loss": 1.7174, + "step": 30247 + }, + { + "epoch": 9.284223449969307, + "grad_norm": 0.18568632006645203, + "learning_rate": 1.338728479069762e-06, + "loss": 1.7093, + "step": 30248 + }, + { + "epoch": 9.284530386740332, + "grad_norm": 0.11301061511039734, + "learning_rate": 1.3375862229398518e-06, + "loss": 1.7053, + "step": 30249 + }, + { + "epoch": 9.284837323511356, + "grad_norm": 0.15475797653198242, + "learning_rate": 1.3364444477204418e-06, + "loss": 1.7773, + "step": 30250 + }, + { + "epoch": 9.285144260282381, + "grad_norm": 0.153490349650383, + "learning_rate": 1.3353031534228067e-06, + "loss": 1.69, + "step": 30251 + }, + { + "epoch": 9.285451197053407, + "grad_norm": 0.14238356053829193, + "learning_rate": 1.3341623400582314e-06, + "loss": 1.6917, + "step": 30252 + }, + { + "epoch": 9.285758133824432, + "grad_norm": 0.24802085757255554, + "learning_rate": 1.3330220076379906e-06, + "loss": 1.7581, + "step": 30253 + }, + { + "epoch": 9.286065070595457, + "grad_norm": 0.1755116581916809, + "learning_rate": 1.3318821561733474e-06, + "loss": 1.7433, + "step": 30254 + }, + { + "epoch": 9.286372007366483, + "grad_norm": 0.142706498503685, + "learning_rate": 1.3307427856755705e-06, + "loss": 1.8094, + "step": 30255 + }, + { + "epoch": 9.286678944137508, + "grad_norm": 0.10654154419898987, + "learning_rate": 1.3296038961559177e-06, + "loss": 1.6768, + "step": 30256 + }, + { + "epoch": 9.286985880908533, + "grad_norm": 0.1446719765663147, + "learning_rate": 1.3284654876256464e-06, + "loss": 1.763, + "step": 30257 + }, + { + "epoch": 9.287292817679559, + "grad_norm": 0.128647580742836, + "learning_rate": 1.3273275600960089e-06, + "loss": 1.7217, + "step": 30258 + }, 
+ { + "epoch": 9.287599754450584, + "grad_norm": 0.16537147760391235, + "learning_rate": 1.3261901135782462e-06, + "loss": 1.7158, + "step": 30259 + }, + { + "epoch": 9.287906691221608, + "grad_norm": 0.12634962797164917, + "learning_rate": 1.3250531480836048e-06, + "loss": 1.7062, + "step": 30260 + }, + { + "epoch": 9.288213627992633, + "grad_norm": 0.14017465710639954, + "learning_rate": 1.323916663623309e-06, + "loss": 1.7008, + "step": 30261 + }, + { + "epoch": 9.288520564763658, + "grad_norm": 0.14252761006355286, + "learning_rate": 1.3227806602086113e-06, + "loss": 1.7241, + "step": 30262 + }, + { + "epoch": 9.288827501534684, + "grad_norm": 0.16626526415348053, + "learning_rate": 1.3216451378507132e-06, + "loss": 1.7422, + "step": 30263 + }, + { + "epoch": 9.289134438305709, + "grad_norm": 0.17778219282627106, + "learning_rate": 1.3205100965608564e-06, + "loss": 1.7595, + "step": 30264 + }, + { + "epoch": 9.289441375076734, + "grad_norm": 0.1335630863904953, + "learning_rate": 1.319375536350248e-06, + "loss": 1.7238, + "step": 30265 + }, + { + "epoch": 9.28974831184776, + "grad_norm": 0.18150761723518372, + "learning_rate": 1.3182414572301017e-06, + "loss": 1.7575, + "step": 30266 + }, + { + "epoch": 9.290055248618785, + "grad_norm": 0.10502864420413971, + "learning_rate": 1.3171078592116304e-06, + "loss": 1.6641, + "step": 30267 + }, + { + "epoch": 9.29036218538981, + "grad_norm": 0.18388547003269196, + "learning_rate": 1.315974742306031e-06, + "loss": 1.7128, + "step": 30268 + }, + { + "epoch": 9.290669122160836, + "grad_norm": 0.16178761422634125, + "learning_rate": 1.3148421065245054e-06, + "loss": 1.8073, + "step": 30269 + }, + { + "epoch": 9.29097605893186, + "grad_norm": 0.28871726989746094, + "learning_rate": 1.3137099518782449e-06, + "loss": 1.7344, + "step": 30270 + }, + { + "epoch": 9.291282995702884, + "grad_norm": 0.12639513611793518, + "learning_rate": 1.3125782783784403e-06, + "loss": 1.7105, + "step": 30271 + }, + { + "epoch": 
9.29158993247391, + "grad_norm": 0.12210296839475632, + "learning_rate": 1.3114470860362716e-06, + "loss": 1.6964, + "step": 30272 + }, + { + "epoch": 9.291896869244935, + "grad_norm": 0.1808413416147232, + "learning_rate": 1.3103163748629187e-06, + "loss": 1.6897, + "step": 30273 + }, + { + "epoch": 9.29220380601596, + "grad_norm": 0.12490539252758026, + "learning_rate": 1.309186144869562e-06, + "loss": 1.6775, + "step": 30274 + }, + { + "epoch": 9.292510742786986, + "grad_norm": 0.14661727845668793, + "learning_rate": 1.3080563960673641e-06, + "loss": 1.705, + "step": 30275 + }, + { + "epoch": 9.292817679558011, + "grad_norm": 0.14526040852069855, + "learning_rate": 1.3069271284674888e-06, + "loss": 1.7507, + "step": 30276 + }, + { + "epoch": 9.293124616329036, + "grad_norm": 0.1486021727323532, + "learning_rate": 1.3057983420811049e-06, + "loss": 1.7162, + "step": 30277 + }, + { + "epoch": 9.293431553100062, + "grad_norm": 0.11850638687610626, + "learning_rate": 1.3046700369193532e-06, + "loss": 1.6996, + "step": 30278 + }, + { + "epoch": 9.293738489871087, + "grad_norm": 0.12612518668174744, + "learning_rate": 1.3035422129934027e-06, + "loss": 1.6846, + "step": 30279 + }, + { + "epoch": 9.294045426642112, + "grad_norm": 0.2112930864095688, + "learning_rate": 1.3024148703143834e-06, + "loss": 1.7302, + "step": 30280 + }, + { + "epoch": 9.294352363413138, + "grad_norm": 0.142434224486351, + "learning_rate": 1.3012880088934532e-06, + "loss": 1.7411, + "step": 30281 + }, + { + "epoch": 9.294659300184161, + "grad_norm": 0.20386098325252533, + "learning_rate": 1.3001616287417251e-06, + "loss": 1.748, + "step": 30282 + }, + { + "epoch": 9.294966236955187, + "grad_norm": 0.22800381481647491, + "learning_rate": 1.2990357298703514e-06, + "loss": 1.7431, + "step": 30283 + }, + { + "epoch": 9.295273173726212, + "grad_norm": 0.1692253053188324, + "learning_rate": 1.2979103122904512e-06, + "loss": 1.6908, + "step": 30284 + }, + { + "epoch": 9.295580110497237, + "grad_norm": 
0.17138120532035828, + "learning_rate": 1.2967853760131431e-06, + "loss": 1.7099, + "step": 30285 + }, + { + "epoch": 9.295887047268263, + "grad_norm": 0.16712112724781036, + "learning_rate": 1.2956609210495518e-06, + "loss": 1.7331, + "step": 30286 + }, + { + "epoch": 9.296193984039288, + "grad_norm": 0.14170047640800476, + "learning_rate": 1.2945369474107849e-06, + "loss": 1.7089, + "step": 30287 + }, + { + "epoch": 9.296500920810313, + "grad_norm": 0.1860484778881073, + "learning_rate": 1.2934134551079503e-06, + "loss": 1.7737, + "step": 30288 + }, + { + "epoch": 9.296807857581339, + "grad_norm": 0.16710804402828217, + "learning_rate": 1.29229044415215e-06, + "loss": 1.7499, + "step": 30289 + }, + { + "epoch": 9.297114794352364, + "grad_norm": 0.11533838510513306, + "learning_rate": 1.2911679145544863e-06, + "loss": 1.6506, + "step": 30290 + }, + { + "epoch": 9.29742173112339, + "grad_norm": 0.1814284324645996, + "learning_rate": 1.2900458663260506e-06, + "loss": 1.7121, + "step": 30291 + }, + { + "epoch": 9.297728667894415, + "grad_norm": 0.11727334558963776, + "learning_rate": 1.2889242994779282e-06, + "loss": 1.6581, + "step": 30292 + }, + { + "epoch": 9.298035604665438, + "grad_norm": 0.2274969071149826, + "learning_rate": 1.2878032140212103e-06, + "loss": 1.7406, + "step": 30293 + }, + { + "epoch": 9.298342541436464, + "grad_norm": 0.12290076911449432, + "learning_rate": 1.2866826099669716e-06, + "loss": 1.6568, + "step": 30294 + }, + { + "epoch": 9.298649478207489, + "grad_norm": 0.2026246190071106, + "learning_rate": 1.2855624873262807e-06, + "loss": 1.7296, + "step": 30295 + }, + { + "epoch": 9.298956414978514, + "grad_norm": 0.13751426339149475, + "learning_rate": 1.284442846110223e-06, + "loss": 1.6897, + "step": 30296 + }, + { + "epoch": 9.29926335174954, + "grad_norm": 0.13357232511043549, + "learning_rate": 1.2833236863298459e-06, + "loss": 1.6609, + "step": 30297 + }, + { + "epoch": 9.299570288520565, + "grad_norm": 0.1956695318222046, + 
"learning_rate": 1.282205007996229e-06, + "loss": 1.7066, + "step": 30298 + }, + { + "epoch": 9.29987722529159, + "grad_norm": 0.11530495434999466, + "learning_rate": 1.2810868111204022e-06, + "loss": 1.6769, + "step": 30299 + }, + { + "epoch": 9.300184162062616, + "grad_norm": 0.1230783686041832, + "learning_rate": 1.2799690957134402e-06, + "loss": 1.665, + "step": 30300 + }, + { + "epoch": 9.300491098833641, + "grad_norm": 0.14144892990589142, + "learning_rate": 1.2788518617863787e-06, + "loss": 1.7247, + "step": 30301 + }, + { + "epoch": 9.300798035604666, + "grad_norm": 0.13692058622837067, + "learning_rate": 1.2777351093502588e-06, + "loss": 1.7165, + "step": 30302 + }, + { + "epoch": 9.30110497237569, + "grad_norm": 0.14115191996097565, + "learning_rate": 1.2766188384161159e-06, + "loss": 1.6991, + "step": 30303 + }, + { + "epoch": 9.301411909146715, + "grad_norm": 0.1208532303571701, + "learning_rate": 1.2755030489949803e-06, + "loss": 1.6621, + "step": 30304 + }, + { + "epoch": 9.30171884591774, + "grad_norm": 0.15770223736763, + "learning_rate": 1.2743877410978877e-06, + "loss": 1.7525, + "step": 30305 + }, + { + "epoch": 9.302025782688766, + "grad_norm": 0.1563788652420044, + "learning_rate": 1.2732729147358514e-06, + "loss": 1.7488, + "step": 30306 + }, + { + "epoch": 9.302332719459791, + "grad_norm": 0.13665367662906647, + "learning_rate": 1.2721585699198956e-06, + "loss": 1.7311, + "step": 30307 + }, + { + "epoch": 9.302639656230816, + "grad_norm": 0.23698623478412628, + "learning_rate": 1.2710447066610287e-06, + "loss": 1.7541, + "step": 30308 + }, + { + "epoch": 9.302946593001842, + "grad_norm": 0.17781539261341095, + "learning_rate": 1.2699313249702528e-06, + "loss": 1.7821, + "step": 30309 + }, + { + "epoch": 9.303253529772867, + "grad_norm": 0.14912116527557373, + "learning_rate": 1.2688184248585811e-06, + "loss": 1.7339, + "step": 30310 + }, + { + "epoch": 9.303560466543892, + "grad_norm": 0.18003590404987335, + "learning_rate": 
1.2677060063370106e-06, + "loss": 1.7297, + "step": 30311 + }, + { + "epoch": 9.303867403314918, + "grad_norm": 0.11753804981708527, + "learning_rate": 1.2665940694165268e-06, + "loss": 1.6855, + "step": 30312 + }, + { + "epoch": 9.304174340085943, + "grad_norm": 0.1824817657470703, + "learning_rate": 1.2654826141081323e-06, + "loss": 1.7109, + "step": 30313 + }, + { + "epoch": 9.304481276856967, + "grad_norm": 0.13189560174942017, + "learning_rate": 1.26437164042279e-06, + "loss": 1.6894, + "step": 30314 + }, + { + "epoch": 9.304788213627992, + "grad_norm": 0.13488316535949707, + "learning_rate": 1.2632611483715029e-06, + "loss": 1.7074, + "step": 30315 + }, + { + "epoch": 9.305095150399017, + "grad_norm": 0.1344909518957138, + "learning_rate": 1.2621511379652284e-06, + "loss": 1.7152, + "step": 30316 + }, + { + "epoch": 9.305402087170043, + "grad_norm": 0.1880505383014679, + "learning_rate": 1.2610416092149468e-06, + "loss": 1.7673, + "step": 30317 + }, + { + "epoch": 9.305709023941068, + "grad_norm": 0.14804401993751526, + "learning_rate": 1.259932562131616e-06, + "loss": 1.686, + "step": 30318 + }, + { + "epoch": 9.306015960712093, + "grad_norm": 0.17230413854122162, + "learning_rate": 1.2588239967261994e-06, + "loss": 1.7075, + "step": 30319 + }, + { + "epoch": 9.306322897483119, + "grad_norm": 0.14153720438480377, + "learning_rate": 1.257715913009655e-06, + "loss": 1.6613, + "step": 30320 + }, + { + "epoch": 9.306629834254144, + "grad_norm": 0.20363643765449524, + "learning_rate": 1.2566083109929293e-06, + "loss": 1.7175, + "step": 30321 + }, + { + "epoch": 9.30693677102517, + "grad_norm": 0.1647050678730011, + "learning_rate": 1.2555011906869695e-06, + "loss": 1.719, + "step": 30322 + }, + { + "epoch": 9.307243707796195, + "grad_norm": 0.12517094612121582, + "learning_rate": 1.2543945521027167e-06, + "loss": 1.6589, + "step": 30323 + }, + { + "epoch": 9.307550644567218, + "grad_norm": 0.12023728340864182, + "learning_rate": 1.2532883952511066e-06, + "loss": 
1.6757, + "step": 30324 + }, + { + "epoch": 9.307857581338244, + "grad_norm": 0.1275765597820282, + "learning_rate": 1.2521827201430692e-06, + "loss": 1.6908, + "step": 30325 + }, + { + "epoch": 9.308164518109269, + "grad_norm": 0.11408694088459015, + "learning_rate": 1.2510775267895403e-06, + "loss": 1.6488, + "step": 30326 + }, + { + "epoch": 9.308471454880294, + "grad_norm": 0.13963791728019714, + "learning_rate": 1.2499728152014334e-06, + "loss": 1.7251, + "step": 30327 + }, + { + "epoch": 9.30877839165132, + "grad_norm": 0.1533326357603073, + "learning_rate": 1.2488685853896676e-06, + "loss": 1.7334, + "step": 30328 + }, + { + "epoch": 9.309085328422345, + "grad_norm": 0.1821897327899933, + "learning_rate": 1.2477648373651563e-06, + "loss": 1.7077, + "step": 30329 + }, + { + "epoch": 9.30939226519337, + "grad_norm": 0.1111680120229721, + "learning_rate": 1.246661571138813e-06, + "loss": 1.6781, + "step": 30330 + }, + { + "epoch": 9.309699201964396, + "grad_norm": 0.13651998341083527, + "learning_rate": 1.2455587867215234e-06, + "loss": 1.718, + "step": 30331 + }, + { + "epoch": 9.310006138735421, + "grad_norm": 0.14687657356262207, + "learning_rate": 1.2444564841242123e-06, + "loss": 1.7526, + "step": 30332 + }, + { + "epoch": 9.310313075506446, + "grad_norm": 0.09129049628973007, + "learning_rate": 1.243354663357743e-06, + "loss": 1.6513, + "step": 30333 + }, + { + "epoch": 9.310620012277472, + "grad_norm": 0.11914718151092529, + "learning_rate": 1.2422533244330348e-06, + "loss": 1.6698, + "step": 30334 + }, + { + "epoch": 9.310926949048495, + "grad_norm": 0.1276206523180008, + "learning_rate": 1.2411524673609454e-06, + "loss": 1.6659, + "step": 30335 + }, + { + "epoch": 9.31123388581952, + "grad_norm": 0.12232425808906555, + "learning_rate": 1.2400520921523718e-06, + "loss": 1.6637, + "step": 30336 + }, + { + "epoch": 9.311540822590546, + "grad_norm": 0.1205383911728859, + "learning_rate": 1.238952198818183e-06, + "loss": 1.6681, + "step": 30337 + }, + { + 
"epoch": 9.311847759361571, + "grad_norm": 0.15745756030082703, + "learning_rate": 1.2378527873692481e-06, + "loss": 1.6975, + "step": 30338 + }, + { + "epoch": 9.312154696132596, + "grad_norm": 0.11485351622104645, + "learning_rate": 1.2367538578164307e-06, + "loss": 1.6621, + "step": 30339 + }, + { + "epoch": 9.312461632903622, + "grad_norm": 0.1697990894317627, + "learning_rate": 1.2356554101705942e-06, + "loss": 1.7309, + "step": 30340 + }, + { + "epoch": 9.312768569674647, + "grad_norm": 0.1388407200574875, + "learning_rate": 1.2345574444425912e-06, + "loss": 1.7437, + "step": 30341 + }, + { + "epoch": 9.313075506445673, + "grad_norm": 0.16522379219532013, + "learning_rate": 1.233459960643274e-06, + "loss": 1.7683, + "step": 30342 + }, + { + "epoch": 9.313382443216698, + "grad_norm": 0.13259927928447723, + "learning_rate": 1.2323629587834895e-06, + "loss": 1.7042, + "step": 30343 + }, + { + "epoch": 9.313689379987723, + "grad_norm": 0.1397528201341629, + "learning_rate": 1.2312664388740791e-06, + "loss": 1.7123, + "step": 30344 + }, + { + "epoch": 9.313996316758749, + "grad_norm": 0.1758471429347992, + "learning_rate": 1.2301704009258785e-06, + "loss": 1.6722, + "step": 30345 + }, + { + "epoch": 9.314303253529772, + "grad_norm": 0.18485552072525024, + "learning_rate": 1.229074844949718e-06, + "loss": 1.6827, + "step": 30346 + }, + { + "epoch": 9.314610190300797, + "grad_norm": 0.14430436491966248, + "learning_rate": 1.2279797709564222e-06, + "loss": 1.767, + "step": 30347 + }, + { + "epoch": 9.314917127071823, + "grad_norm": 0.16392521560192108, + "learning_rate": 1.226885178956816e-06, + "loss": 1.7177, + "step": 30348 + }, + { + "epoch": 9.315224063842848, + "grad_norm": 0.16354848444461823, + "learning_rate": 1.2257910689617235e-06, + "loss": 1.7661, + "step": 30349 + }, + { + "epoch": 9.315531000613873, + "grad_norm": 0.1507464200258255, + "learning_rate": 1.2246974409819424e-06, + "loss": 1.7143, + "step": 30350 + }, + { + "epoch": 9.315837937384899, + 
"grad_norm": 0.136259064078331, + "learning_rate": 1.2236042950282967e-06, + "loss": 1.6617, + "step": 30351 + }, + { + "epoch": 9.316144874155924, + "grad_norm": 0.1246718019247055, + "learning_rate": 1.222511631111578e-06, + "loss": 1.6955, + "step": 30352 + }, + { + "epoch": 9.31645181092695, + "grad_norm": 0.14265364408493042, + "learning_rate": 1.221419449242589e-06, + "loss": 1.6978, + "step": 30353 + }, + { + "epoch": 9.316758747697975, + "grad_norm": 0.1196669489145279, + "learning_rate": 1.2203277494321263e-06, + "loss": 1.6989, + "step": 30354 + }, + { + "epoch": 9.317065684469, + "grad_norm": 0.11023372411727905, + "learning_rate": 1.2192365316909705e-06, + "loss": 1.6807, + "step": 30355 + }, + { + "epoch": 9.317372621240025, + "grad_norm": 0.12376198917627335, + "learning_rate": 1.2181457960299237e-06, + "loss": 1.679, + "step": 30356 + }, + { + "epoch": 9.317679558011049, + "grad_norm": 0.1426834762096405, + "learning_rate": 1.217055542459733e-06, + "loss": 1.7351, + "step": 30357 + }, + { + "epoch": 9.317986494782074, + "grad_norm": 0.14629580080509186, + "learning_rate": 1.215965770991201e-06, + "loss": 1.7454, + "step": 30358 + }, + { + "epoch": 9.3182934315531, + "grad_norm": 0.13081271946430206, + "learning_rate": 1.214876481635091e-06, + "loss": 1.713, + "step": 30359 + }, + { + "epoch": 9.318600368324125, + "grad_norm": 0.2170068770647049, + "learning_rate": 1.2137876744021614e-06, + "loss": 1.7831, + "step": 30360 + }, + { + "epoch": 9.31890730509515, + "grad_norm": 0.13917239010334015, + "learning_rate": 1.2126993493031814e-06, + "loss": 1.702, + "step": 30361 + }, + { + "epoch": 9.319214241866176, + "grad_norm": 0.14852571487426758, + "learning_rate": 1.2116115063488975e-06, + "loss": 1.699, + "step": 30362 + }, + { + "epoch": 9.319521178637201, + "grad_norm": 0.1458083689212799, + "learning_rate": 1.2105241455500682e-06, + "loss": 1.6817, + "step": 30363 + }, + { + "epoch": 9.319828115408226, + "grad_norm": 0.1341833621263504, + 
"learning_rate": 1.209437266917429e-06, + "loss": 1.7099, + "step": 30364 + }, + { + "epoch": 9.320135052179252, + "grad_norm": 0.1942918747663498, + "learning_rate": 1.2083508704617274e-06, + "loss": 1.7301, + "step": 30365 + }, + { + "epoch": 9.320441988950277, + "grad_norm": 0.11925941705703735, + "learning_rate": 1.2072649561937099e-06, + "loss": 1.6722, + "step": 30366 + }, + { + "epoch": 9.3207489257213, + "grad_norm": 0.11189054697751999, + "learning_rate": 1.2061795241240904e-06, + "loss": 1.6457, + "step": 30367 + }, + { + "epoch": 9.321055862492326, + "grad_norm": 0.1742805689573288, + "learning_rate": 1.20509457426361e-06, + "loss": 1.7477, + "step": 30368 + }, + { + "epoch": 9.321362799263351, + "grad_norm": 0.2269359976053238, + "learning_rate": 1.204010106622977e-06, + "loss": 1.6954, + "step": 30369 + }, + { + "epoch": 9.321669736034377, + "grad_norm": 0.1383572667837143, + "learning_rate": 1.2029261212129218e-06, + "loss": 1.6834, + "step": 30370 + }, + { + "epoch": 9.321976672805402, + "grad_norm": 0.17733120918273926, + "learning_rate": 1.2018426180441466e-06, + "loss": 1.7426, + "step": 30371 + }, + { + "epoch": 9.322283609576427, + "grad_norm": 0.1365019828081131, + "learning_rate": 1.200759597127371e-06, + "loss": 1.7037, + "step": 30372 + }, + { + "epoch": 9.322590546347453, + "grad_norm": 0.1320653259754181, + "learning_rate": 1.1996770584732919e-06, + "loss": 1.7051, + "step": 30373 + }, + { + "epoch": 9.322897483118478, + "grad_norm": 0.16690899431705475, + "learning_rate": 1.1985950020926007e-06, + "loss": 1.7237, + "step": 30374 + }, + { + "epoch": 9.323204419889503, + "grad_norm": 0.10169432312250137, + "learning_rate": 1.1975134279959944e-06, + "loss": 1.6557, + "step": 30375 + }, + { + "epoch": 9.323511356660529, + "grad_norm": 0.10515127331018448, + "learning_rate": 1.1964323361941699e-06, + "loss": 1.6733, + "step": 30376 + }, + { + "epoch": 9.323818293431554, + "grad_norm": 0.13177691400051117, + "learning_rate": 
1.1953517266978076e-06, + "loss": 1.7029, + "step": 30377 + }, + { + "epoch": 9.324125230202577, + "grad_norm": 0.12130782753229141, + "learning_rate": 1.1942715995175824e-06, + "loss": 1.6479, + "step": 30378 + }, + { + "epoch": 9.324432166973603, + "grad_norm": 0.1792365312576294, + "learning_rate": 1.193191954664169e-06, + "loss": 1.7026, + "step": 30379 + }, + { + "epoch": 9.324739103744628, + "grad_norm": 0.1391845941543579, + "learning_rate": 1.1921127921482422e-06, + "loss": 1.7122, + "step": 30380 + }, + { + "epoch": 9.325046040515653, + "grad_norm": 0.1593550443649292, + "learning_rate": 1.1910341119804657e-06, + "loss": 1.7014, + "step": 30381 + }, + { + "epoch": 9.325352977286679, + "grad_norm": 0.12819503247737885, + "learning_rate": 1.1899559141714922e-06, + "loss": 1.6717, + "step": 30382 + }, + { + "epoch": 9.325659914057704, + "grad_norm": 0.1585071086883545, + "learning_rate": 1.1888781987319907e-06, + "loss": 1.7021, + "step": 30383 + }, + { + "epoch": 9.32596685082873, + "grad_norm": 0.11215679347515106, + "learning_rate": 1.187800965672592e-06, + "loss": 1.6531, + "step": 30384 + }, + { + "epoch": 9.326273787599755, + "grad_norm": 0.10981804877519608, + "learning_rate": 1.1867242150039648e-06, + "loss": 1.6521, + "step": 30385 + }, + { + "epoch": 9.32658072437078, + "grad_norm": 0.1629389524459839, + "learning_rate": 1.1856479467367342e-06, + "loss": 1.7423, + "step": 30386 + }, + { + "epoch": 9.326887661141805, + "grad_norm": 0.1501983404159546, + "learning_rate": 1.1845721608815418e-06, + "loss": 1.7384, + "step": 30387 + }, + { + "epoch": 9.32719459791283, + "grad_norm": 0.13212816417217255, + "learning_rate": 1.1834968574490235e-06, + "loss": 1.6723, + "step": 30388 + }, + { + "epoch": 9.327501534683854, + "grad_norm": 0.140591561794281, + "learning_rate": 1.1824220364497984e-06, + "loss": 1.6677, + "step": 30389 + }, + { + "epoch": 9.32780847145488, + "grad_norm": 0.1365015208721161, + "learning_rate": 1.181347697894497e-06, + "loss": 
1.6791, + "step": 30390 + }, + { + "epoch": 9.328115408225905, + "grad_norm": 0.16453112661838531, + "learning_rate": 1.1802738417937165e-06, + "loss": 1.7321, + "step": 30391 + }, + { + "epoch": 9.32842234499693, + "grad_norm": 0.18619593977928162, + "learning_rate": 1.1792004681580981e-06, + "loss": 1.7275, + "step": 30392 + }, + { + "epoch": 9.328729281767956, + "grad_norm": 0.2532525956630707, + "learning_rate": 1.178127576998228e-06, + "loss": 1.7376, + "step": 30393 + }, + { + "epoch": 9.329036218538981, + "grad_norm": 0.17427068948745728, + "learning_rate": 1.17705516832472e-06, + "loss": 1.7054, + "step": 30394 + }, + { + "epoch": 9.329343155310006, + "grad_norm": 0.13894926011562347, + "learning_rate": 1.1759832421481654e-06, + "loss": 1.6931, + "step": 30395 + }, + { + "epoch": 9.329650092081032, + "grad_norm": 0.12709759175777435, + "learning_rate": 1.174911798479167e-06, + "loss": 1.6846, + "step": 30396 + }, + { + "epoch": 9.329957028852057, + "grad_norm": 0.10510111600160599, + "learning_rate": 1.173840837328305e-06, + "loss": 1.666, + "step": 30397 + }, + { + "epoch": 9.330263965623082, + "grad_norm": 0.15923313796520233, + "learning_rate": 1.1727703587061655e-06, + "loss": 1.7103, + "step": 30398 + }, + { + "epoch": 9.330570902394108, + "grad_norm": 0.16868524253368378, + "learning_rate": 1.171700362623318e-06, + "loss": 1.7608, + "step": 30399 + }, + { + "epoch": 9.330877839165131, + "grad_norm": 0.2206472009420395, + "learning_rate": 1.170630849090365e-06, + "loss": 1.7612, + "step": 30400 + }, + { + "epoch": 9.331184775936157, + "grad_norm": 0.1557077318429947, + "learning_rate": 1.1695618181178426e-06, + "loss": 1.7387, + "step": 30401 + }, + { + "epoch": 9.331491712707182, + "grad_norm": 0.1106661707162857, + "learning_rate": 1.168493269716342e-06, + "loss": 1.7141, + "step": 30402 + }, + { + "epoch": 9.331798649478207, + "grad_norm": 0.13843196630477905, + "learning_rate": 1.1674252038963996e-06, + "loss": 1.7252, + "step": 30403 + }, + { + 
"epoch": 9.332105586249233, + "grad_norm": 0.1141132041811943, + "learning_rate": 1.1663576206685955e-06, + "loss": 1.6685, + "step": 30404 + }, + { + "epoch": 9.332412523020258, + "grad_norm": 0.15236155688762665, + "learning_rate": 1.1652905200434604e-06, + "loss": 1.7137, + "step": 30405 + }, + { + "epoch": 9.332719459791283, + "grad_norm": 0.15942497551441193, + "learning_rate": 1.164223902031547e-06, + "loss": 1.7117, + "step": 30406 + }, + { + "epoch": 9.333026396562309, + "grad_norm": 0.11390705406665802, + "learning_rate": 1.163157766643408e-06, + "loss": 1.6491, + "step": 30407 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 0.21758639812469482, + "learning_rate": 1.1620921138895514e-06, + "loss": 1.7287, + "step": 30408 + }, + { + "epoch": 9.33364027010436, + "grad_norm": 0.13287439942359924, + "learning_rate": 1.1610269437805353e-06, + "loss": 1.6963, + "step": 30409 + }, + { + "epoch": 9.333947206875383, + "grad_norm": 0.15917138755321503, + "learning_rate": 1.1599622563268742e-06, + "loss": 1.7565, + "step": 30410 + }, + { + "epoch": 9.334254143646408, + "grad_norm": 0.13716933131217957, + "learning_rate": 1.1588980515390923e-06, + "loss": 1.6761, + "step": 30411 + }, + { + "epoch": 9.334561080417433, + "grad_norm": 0.19529521465301514, + "learning_rate": 1.1578343294277039e-06, + "loss": 1.7228, + "step": 30412 + }, + { + "epoch": 9.334868017188459, + "grad_norm": 0.2123236358165741, + "learning_rate": 1.156771090003228e-06, + "loss": 1.6959, + "step": 30413 + }, + { + "epoch": 9.335174953959484, + "grad_norm": 0.11489806324243546, + "learning_rate": 1.1557083332761675e-06, + "loss": 1.7124, + "step": 30414 + }, + { + "epoch": 9.33548189073051, + "grad_norm": 0.15767377614974976, + "learning_rate": 1.1546460592570252e-06, + "loss": 1.7181, + "step": 30415 + }, + { + "epoch": 9.335788827501535, + "grad_norm": 0.12808682024478912, + "learning_rate": 1.1535842679562924e-06, + "loss": 1.6773, + "step": 30416 + }, + { + "epoch": 9.33609576427256, + 
"grad_norm": 0.13981541991233826, + "learning_rate": 1.1525229593844832e-06, + "loss": 1.7389, + "step": 30417 + }, + { + "epoch": 9.336402701043585, + "grad_norm": 0.17036983370780945, + "learning_rate": 1.1514621335520614e-06, + "loss": 1.7441, + "step": 30418 + }, + { + "epoch": 9.33670963781461, + "grad_norm": 0.16650976240634918, + "learning_rate": 1.1504017904695296e-06, + "loss": 1.7574, + "step": 30419 + }, + { + "epoch": 9.337016574585636, + "grad_norm": 0.12821979820728302, + "learning_rate": 1.1493419301473518e-06, + "loss": 1.6709, + "step": 30420 + }, + { + "epoch": 9.33732351135666, + "grad_norm": 0.15850982069969177, + "learning_rate": 1.148282552596014e-06, + "loss": 1.6789, + "step": 30421 + }, + { + "epoch": 9.337630448127685, + "grad_norm": 0.17631210386753082, + "learning_rate": 1.1472236578259799e-06, + "loss": 1.692, + "step": 30422 + }, + { + "epoch": 9.33793738489871, + "grad_norm": 0.11740653961896896, + "learning_rate": 1.1461652458477135e-06, + "loss": 1.6718, + "step": 30423 + }, + { + "epoch": 9.338244321669736, + "grad_norm": 0.1481964886188507, + "learning_rate": 1.1451073166716841e-06, + "loss": 1.6934, + "step": 30424 + }, + { + "epoch": 9.338551258440761, + "grad_norm": 0.12850868701934814, + "learning_rate": 1.144049870308328e-06, + "loss": 1.7084, + "step": 30425 + }, + { + "epoch": 9.338858195211786, + "grad_norm": 0.12728431820869446, + "learning_rate": 1.142992906768109e-06, + "loss": 1.6646, + "step": 30426 + }, + { + "epoch": 9.339165131982812, + "grad_norm": 0.1583695262670517, + "learning_rate": 1.141936426061474e-06, + "loss": 1.6967, + "step": 30427 + }, + { + "epoch": 9.339472068753837, + "grad_norm": 0.1379023641347885, + "learning_rate": 1.1408804281988595e-06, + "loss": 1.7037, + "step": 30428 + }, + { + "epoch": 9.339779005524862, + "grad_norm": 0.12713885307312012, + "learning_rate": 1.1398249131907013e-06, + "loss": 1.675, + "step": 30429 + }, + { + "epoch": 9.340085942295888, + "grad_norm": 0.14857660233974457, + 
"learning_rate": 1.1387698810474302e-06, + "loss": 1.7329, + "step": 30430 + }, + { + "epoch": 9.340392879066913, + "grad_norm": 0.15104521811008453, + "learning_rate": 1.1377153317794765e-06, + "loss": 1.7059, + "step": 30431 + }, + { + "epoch": 9.340699815837937, + "grad_norm": 0.12101757526397705, + "learning_rate": 1.136661265397254e-06, + "loss": 1.6618, + "step": 30432 + }, + { + "epoch": 9.341006752608962, + "grad_norm": 0.13291479647159576, + "learning_rate": 1.1356076819111828e-06, + "loss": 1.6648, + "step": 30433 + }, + { + "epoch": 9.341313689379987, + "grad_norm": 0.13364644348621368, + "learning_rate": 1.134554581331687e-06, + "loss": 1.6737, + "step": 30434 + }, + { + "epoch": 9.341620626151013, + "grad_norm": 0.1292208731174469, + "learning_rate": 1.1335019636691535e-06, + "loss": 1.684, + "step": 30435 + }, + { + "epoch": 9.341927562922038, + "grad_norm": 0.11852065473794937, + "learning_rate": 1.1324498289340013e-06, + "loss": 1.7319, + "step": 30436 + }, + { + "epoch": 9.342234499693063, + "grad_norm": 0.1357669234275818, + "learning_rate": 1.1313981771366166e-06, + "loss": 1.6737, + "step": 30437 + }, + { + "epoch": 9.342541436464089, + "grad_norm": 0.10864339023828506, + "learning_rate": 1.1303470082874024e-06, + "loss": 1.6515, + "step": 30438 + }, + { + "epoch": 9.342848373235114, + "grad_norm": 0.1678614318370819, + "learning_rate": 1.129296322396739e-06, + "loss": 1.6871, + "step": 30439 + }, + { + "epoch": 9.34315531000614, + "grad_norm": 0.13384899497032166, + "learning_rate": 1.1282461194750182e-06, + "loss": 1.6888, + "step": 30440 + }, + { + "epoch": 9.343462246777165, + "grad_norm": 0.12848152220249176, + "learning_rate": 1.1271963995326151e-06, + "loss": 1.6952, + "step": 30441 + }, + { + "epoch": 9.34376918354819, + "grad_norm": 0.12591496109962463, + "learning_rate": 1.1261471625798937e-06, + "loss": 1.7404, + "step": 30442 + }, + { + "epoch": 9.344076120319214, + "grad_norm": 0.12495042383670807, + "learning_rate": 
1.1250984086272397e-06, + "loss": 1.7052, + "step": 30443 + }, + { + "epoch": 9.344383057090239, + "grad_norm": 0.1944572478532791, + "learning_rate": 1.1240501376850066e-06, + "loss": 1.7768, + "step": 30444 + }, + { + "epoch": 9.344689993861264, + "grad_norm": 0.15033382177352905, + "learning_rate": 1.1230023497635579e-06, + "loss": 1.7285, + "step": 30445 + }, + { + "epoch": 9.34499693063229, + "grad_norm": 0.15685971081256866, + "learning_rate": 1.1219550448732463e-06, + "loss": 1.7317, + "step": 30446 + }, + { + "epoch": 9.345303867403315, + "grad_norm": 0.13611333072185516, + "learning_rate": 1.120908223024425e-06, + "loss": 1.6924, + "step": 30447 + }, + { + "epoch": 9.34561080417434, + "grad_norm": 0.16727523505687714, + "learning_rate": 1.1198618842274411e-06, + "loss": 1.7314, + "step": 30448 + }, + { + "epoch": 9.345917740945366, + "grad_norm": 0.11468715965747833, + "learning_rate": 1.1188160284926252e-06, + "loss": 1.6648, + "step": 30449 + }, + { + "epoch": 9.34622467771639, + "grad_norm": 0.1359895020723343, + "learning_rate": 1.1177706558303192e-06, + "loss": 1.7235, + "step": 30450 + }, + { + "epoch": 9.346531614487416, + "grad_norm": 0.12796089053153992, + "learning_rate": 1.116725766250859e-06, + "loss": 1.7034, + "step": 30451 + }, + { + "epoch": 9.346838551258442, + "grad_norm": 0.14425326883792877, + "learning_rate": 1.1156813597645588e-06, + "loss": 1.6618, + "step": 30452 + }, + { + "epoch": 9.347145488029465, + "grad_norm": 0.12873579561710358, + "learning_rate": 1.1146374363817602e-06, + "loss": 1.7002, + "step": 30453 + }, + { + "epoch": 9.34745242480049, + "grad_norm": 0.1240401417016983, + "learning_rate": 1.113593996112755e-06, + "loss": 1.7085, + "step": 30454 + }, + { + "epoch": 9.347759361571516, + "grad_norm": 0.16717098653316498, + "learning_rate": 1.1125510389678738e-06, + "loss": 1.7644, + "step": 30455 + }, + { + "epoch": 9.348066298342541, + "grad_norm": 0.14225825667381287, + "learning_rate": 1.1115085649574143e-06, + "loss": 
1.754, + "step": 30456 + }, + { + "epoch": 9.348373235113566, + "grad_norm": 0.16719453036785126, + "learning_rate": 1.1104665740916787e-06, + "loss": 1.7757, + "step": 30457 + }, + { + "epoch": 9.348680171884592, + "grad_norm": 0.14928758144378662, + "learning_rate": 1.1094250663809812e-06, + "loss": 1.6919, + "step": 30458 + }, + { + "epoch": 9.348987108655617, + "grad_norm": 0.16433440148830414, + "learning_rate": 1.1083840418355862e-06, + "loss": 1.7464, + "step": 30459 + }, + { + "epoch": 9.349294045426642, + "grad_norm": 0.16641557216644287, + "learning_rate": 1.1073435004657961e-06, + "loss": 1.6872, + "step": 30460 + }, + { + "epoch": 9.349600982197668, + "grad_norm": 0.1351664960384369, + "learning_rate": 1.106303442281903e-06, + "loss": 1.6792, + "step": 30461 + }, + { + "epoch": 9.349907918968693, + "grad_norm": 0.13160523772239685, + "learning_rate": 1.1052638672941707e-06, + "loss": 1.7087, + "step": 30462 + }, + { + "epoch": 9.350214855739718, + "grad_norm": 0.13107560575008392, + "learning_rate": 1.1042247755128854e-06, + "loss": 1.655, + "step": 30463 + }, + { + "epoch": 9.350521792510742, + "grad_norm": 0.1115984246134758, + "learning_rate": 1.1031861669483058e-06, + "loss": 1.6939, + "step": 30464 + }, + { + "epoch": 9.350828729281767, + "grad_norm": 0.2041286677122116, + "learning_rate": 1.1021480416106956e-06, + "loss": 1.7502, + "step": 30465 + }, + { + "epoch": 9.351135666052793, + "grad_norm": 0.1607433408498764, + "learning_rate": 1.1011103995103245e-06, + "loss": 1.7618, + "step": 30466 + }, + { + "epoch": 9.351442602823818, + "grad_norm": 0.15420445799827576, + "learning_rate": 1.1000732406574343e-06, + "loss": 1.7348, + "step": 30467 + }, + { + "epoch": 9.351749539594843, + "grad_norm": 0.1475592702627182, + "learning_rate": 1.099036565062289e-06, + "loss": 1.6618, + "step": 30468 + }, + { + "epoch": 9.352056476365869, + "grad_norm": 0.12382391095161438, + "learning_rate": 1.0980003727351196e-06, + "loss": 1.668, + "step": 30469 + }, + { 
+ "epoch": 9.352363413136894, + "grad_norm": 0.14605712890625, + "learning_rate": 1.096964663686184e-06, + "loss": 1.7274, + "step": 30470 + }, + { + "epoch": 9.35267034990792, + "grad_norm": 0.1413935273885727, + "learning_rate": 1.0959294379256913e-06, + "loss": 1.7173, + "step": 30471 + }, + { + "epoch": 9.352977286678945, + "grad_norm": 0.1893736571073532, + "learning_rate": 1.0948946954638994e-06, + "loss": 1.7243, + "step": 30472 + }, + { + "epoch": 9.35328422344997, + "grad_norm": 0.13228827714920044, + "learning_rate": 1.0938604363110172e-06, + "loss": 1.6907, + "step": 30473 + }, + { + "epoch": 9.353591160220994, + "grad_norm": 0.13724558055400848, + "learning_rate": 1.0928266604772697e-06, + "loss": 1.6925, + "step": 30474 + }, + { + "epoch": 9.353898096992019, + "grad_norm": 0.1286490261554718, + "learning_rate": 1.091793367972882e-06, + "loss": 1.6977, + "step": 30475 + }, + { + "epoch": 9.354205033763044, + "grad_norm": 0.17098230123519897, + "learning_rate": 1.0907605588080517e-06, + "loss": 1.7392, + "step": 30476 + }, + { + "epoch": 9.35451197053407, + "grad_norm": 0.14103081822395325, + "learning_rate": 1.0897282329929924e-06, + "loss": 1.6872, + "step": 30477 + }, + { + "epoch": 9.354818907305095, + "grad_norm": 0.14384165406227112, + "learning_rate": 1.0886963905379077e-06, + "loss": 1.6996, + "step": 30478 + }, + { + "epoch": 9.35512584407612, + "grad_norm": 0.12110382318496704, + "learning_rate": 1.087665031452989e-06, + "loss": 1.6688, + "step": 30479 + }, + { + "epoch": 9.355432780847146, + "grad_norm": 0.1337585598230362, + "learning_rate": 1.0866341557484394e-06, + "loss": 1.6703, + "step": 30480 + }, + { + "epoch": 9.355739717618171, + "grad_norm": 0.16640827059745789, + "learning_rate": 1.0856037634344341e-06, + "loss": 1.7675, + "step": 30481 + }, + { + "epoch": 9.356046654389196, + "grad_norm": 0.1333245038986206, + "learning_rate": 1.0845738545211702e-06, + "loss": 1.7147, + "step": 30482 + }, + { + "epoch": 9.356353591160222, + 
"grad_norm": 0.13712866604328156, + "learning_rate": 1.0835444290188124e-06, + "loss": 1.7219, + "step": 30483 + }, + { + "epoch": 9.356660527931247, + "grad_norm": 0.14520063996315002, + "learning_rate": 1.0825154869375353e-06, + "loss": 1.6548, + "step": 30484 + }, + { + "epoch": 9.35696746470227, + "grad_norm": 0.10503572225570679, + "learning_rate": 1.08148702828752e-06, + "loss": 1.6853, + "step": 30485 + }, + { + "epoch": 9.357274401473296, + "grad_norm": 0.12749113142490387, + "learning_rate": 1.080459053078914e-06, + "loss": 1.6837, + "step": 30486 + }, + { + "epoch": 9.357581338244321, + "grad_norm": 0.13570766150951385, + "learning_rate": 1.079431561321892e-06, + "loss": 1.7209, + "step": 30487 + }, + { + "epoch": 9.357888275015346, + "grad_norm": 0.10935094952583313, + "learning_rate": 1.0784045530265907e-06, + "loss": 1.6559, + "step": 30488 + }, + { + "epoch": 9.358195211786372, + "grad_norm": 0.2123469114303589, + "learning_rate": 1.0773780282031799e-06, + "loss": 1.7223, + "step": 30489 + }, + { + "epoch": 9.358502148557397, + "grad_norm": 0.12153031677007675, + "learning_rate": 1.07635198686179e-06, + "loss": 1.6842, + "step": 30490 + }, + { + "epoch": 9.358809085328422, + "grad_norm": 0.1416035294532776, + "learning_rate": 1.0753264290125576e-06, + "loss": 1.7064, + "step": 30491 + }, + { + "epoch": 9.359116022099448, + "grad_norm": 0.12089719623327255, + "learning_rate": 1.0743013546656356e-06, + "loss": 1.6848, + "step": 30492 + }, + { + "epoch": 9.359422958870473, + "grad_norm": 0.13979336619377136, + "learning_rate": 1.073276763831138e-06, + "loss": 1.6967, + "step": 30493 + }, + { + "epoch": 9.359729895641498, + "grad_norm": 0.14014959335327148, + "learning_rate": 1.072252656519196e-06, + "loss": 1.686, + "step": 30494 + }, + { + "epoch": 9.360036832412524, + "grad_norm": 0.17366288602352142, + "learning_rate": 1.0712290327399344e-06, + "loss": 1.6709, + "step": 30495 + }, + { + "epoch": 9.360343769183547, + "grad_norm": 0.1098582074046135, + 
"learning_rate": 1.070205892503462e-06, + "loss": 1.6552, + "step": 30496 + }, + { + "epoch": 9.360650705954573, + "grad_norm": 0.17184807360172272, + "learning_rate": 1.0691832358198984e-06, + "loss": 1.7052, + "step": 30497 + }, + { + "epoch": 9.360957642725598, + "grad_norm": 0.1819550096988678, + "learning_rate": 1.068161062699341e-06, + "loss": 1.718, + "step": 30498 + }, + { + "epoch": 9.361264579496623, + "grad_norm": 0.11239949613809586, + "learning_rate": 1.0671393731518985e-06, + "loss": 1.6928, + "step": 30499 + }, + { + "epoch": 9.361571516267649, + "grad_norm": 0.13595740497112274, + "learning_rate": 1.066118167187663e-06, + "loss": 1.6987, + "step": 30500 + }, + { + "epoch": 9.361878453038674, + "grad_norm": 0.1424037665128708, + "learning_rate": 1.0650974448167316e-06, + "loss": 1.7194, + "step": 30501 + }, + { + "epoch": 9.3621853898097, + "grad_norm": 0.17475293576717377, + "learning_rate": 1.0640772060491855e-06, + "loss": 1.7191, + "step": 30502 + }, + { + "epoch": 9.362492326580725, + "grad_norm": 0.22121796011924744, + "learning_rate": 1.0630574508951108e-06, + "loss": 1.7752, + "step": 30503 + }, + { + "epoch": 9.36279926335175, + "grad_norm": 0.19642120599746704, + "learning_rate": 1.0620381793645885e-06, + "loss": 1.6985, + "step": 30504 + }, + { + "epoch": 9.363106200122775, + "grad_norm": 0.16090667247772217, + "learning_rate": 1.0610193914676825e-06, + "loss": 1.7547, + "step": 30505 + }, + { + "epoch": 9.3634131368938, + "grad_norm": 0.15036262571811676, + "learning_rate": 1.0600010872144794e-06, + "loss": 1.7545, + "step": 30506 + }, + { + "epoch": 9.363720073664824, + "grad_norm": 0.13965867459774017, + "learning_rate": 1.0589832666150213e-06, + "loss": 1.7093, + "step": 30507 + }, + { + "epoch": 9.36402701043585, + "grad_norm": 0.14103607833385468, + "learning_rate": 1.057965929679372e-06, + "loss": 1.7519, + "step": 30508 + }, + { + "epoch": 9.364333947206875, + "grad_norm": 0.11406313627958298, + "learning_rate": 
1.056949076417596e-06, + "loss": 1.6431, + "step": 30509 + }, + { + "epoch": 9.3646408839779, + "grad_norm": 0.14929352700710297, + "learning_rate": 1.0559327068397296e-06, + "loss": 1.6863, + "step": 30510 + }, + { + "epoch": 9.364947820748926, + "grad_norm": 0.12195751070976257, + "learning_rate": 1.0549168209558312e-06, + "loss": 1.6888, + "step": 30511 + }, + { + "epoch": 9.365254757519951, + "grad_norm": 0.14742396771907806, + "learning_rate": 1.0539014187759267e-06, + "loss": 1.6808, + "step": 30512 + }, + { + "epoch": 9.365561694290976, + "grad_norm": 0.20945298671722412, + "learning_rate": 1.0528865003100573e-06, + "loss": 1.7754, + "step": 30513 + }, + { + "epoch": 9.365868631062002, + "grad_norm": 0.13752134144306183, + "learning_rate": 1.0518720655682545e-06, + "loss": 1.723, + "step": 30514 + }, + { + "epoch": 9.366175567833027, + "grad_norm": 0.20715954899787903, + "learning_rate": 1.0508581145605379e-06, + "loss": 1.7787, + "step": 30515 + }, + { + "epoch": 9.366482504604052, + "grad_norm": 0.11915310472249985, + "learning_rate": 1.0498446472969326e-06, + "loss": 1.668, + "step": 30516 + }, + { + "epoch": 9.366789441375076, + "grad_norm": 0.15565282106399536, + "learning_rate": 1.0488316637874529e-06, + "loss": 1.7415, + "step": 30517 + }, + { + "epoch": 9.367096378146101, + "grad_norm": 0.17260490357875824, + "learning_rate": 1.0478191640421132e-06, + "loss": 1.7511, + "step": 30518 + }, + { + "epoch": 9.367403314917127, + "grad_norm": 0.15730834007263184, + "learning_rate": 1.0468071480709163e-06, + "loss": 1.7225, + "step": 30519 + }, + { + "epoch": 9.367710251688152, + "grad_norm": 0.11092279106378555, + "learning_rate": 1.0457956158838544e-06, + "loss": 1.6866, + "step": 30520 + }, + { + "epoch": 9.368017188459177, + "grad_norm": 0.1350366175174713, + "learning_rate": 1.0447845674909417e-06, + "loss": 1.7203, + "step": 30521 + }, + { + "epoch": 9.368324125230203, + "grad_norm": 0.13730715215206146, + "learning_rate": 1.0437740029021591e-06, + 
"loss": 1.7076, + "step": 30522 + }, + { + "epoch": 9.368631062001228, + "grad_norm": 0.13333722949028015, + "learning_rate": 1.0427639221274988e-06, + "loss": 1.7061, + "step": 30523 + }, + { + "epoch": 9.368937998772253, + "grad_norm": 0.18173889815807343, + "learning_rate": 1.0417543251769413e-06, + "loss": 1.7102, + "step": 30524 + }, + { + "epoch": 9.369244935543279, + "grad_norm": 0.09129618853330612, + "learning_rate": 1.040745212060451e-06, + "loss": 1.62, + "step": 30525 + }, + { + "epoch": 9.369551872314304, + "grad_norm": 0.1274579018354416, + "learning_rate": 1.0397365827880256e-06, + "loss": 1.7183, + "step": 30526 + }, + { + "epoch": 9.36985880908533, + "grad_norm": 0.14064618945121765, + "learning_rate": 1.0387284373696126e-06, + "loss": 1.679, + "step": 30527 + }, + { + "epoch": 9.370165745856353, + "grad_norm": 0.1963305026292801, + "learning_rate": 1.037720775815193e-06, + "loss": 1.6764, + "step": 30528 + }, + { + "epoch": 9.370472682627378, + "grad_norm": 0.14961928129196167, + "learning_rate": 1.0367135981346977e-06, + "loss": 1.7171, + "step": 30529 + }, + { + "epoch": 9.370779619398403, + "grad_norm": 0.16405031085014343, + "learning_rate": 1.0357069043381073e-06, + "loss": 1.7747, + "step": 30530 + }, + { + "epoch": 9.371086556169429, + "grad_norm": 0.1538914144039154, + "learning_rate": 1.0347006944353588e-06, + "loss": 1.6935, + "step": 30531 + }, + { + "epoch": 9.371393492940454, + "grad_norm": 0.13590097427368164, + "learning_rate": 1.0336949684363995e-06, + "loss": 1.734, + "step": 30532 + }, + { + "epoch": 9.37170042971148, + "grad_norm": 0.09966246783733368, + "learning_rate": 1.0326897263511602e-06, + "loss": 1.6684, + "step": 30533 + }, + { + "epoch": 9.372007366482505, + "grad_norm": 0.15132254362106323, + "learning_rate": 1.031684968189589e-06, + "loss": 1.6975, + "step": 30534 + }, + { + "epoch": 9.37231430325353, + "grad_norm": 0.18380047380924225, + "learning_rate": 1.0306806939616055e-06, + "loss": 1.7704, + "step": 30535 + }, 
+ { + "epoch": 9.372621240024555, + "grad_norm": 0.2081698179244995, + "learning_rate": 1.0296769036771347e-06, + "loss": 1.7853, + "step": 30536 + }, + { + "epoch": 9.37292817679558, + "grad_norm": 0.1174221932888031, + "learning_rate": 1.028673597346097e-06, + "loss": 1.6579, + "step": 30537 + }, + { + "epoch": 9.373235113566606, + "grad_norm": 0.17011094093322754, + "learning_rate": 1.0276707749784175e-06, + "loss": 1.7402, + "step": 30538 + }, + { + "epoch": 9.37354205033763, + "grad_norm": 0.13609996438026428, + "learning_rate": 1.026668436583994e-06, + "loss": 1.7082, + "step": 30539 + }, + { + "epoch": 9.373848987108655, + "grad_norm": 0.1455853134393692, + "learning_rate": 1.0256665821727406e-06, + "loss": 1.751, + "step": 30540 + }, + { + "epoch": 9.37415592387968, + "grad_norm": 0.1282152682542801, + "learning_rate": 1.0246652117545552e-06, + "loss": 1.6659, + "step": 30541 + }, + { + "epoch": 9.374462860650706, + "grad_norm": 0.17218823730945587, + "learning_rate": 1.023664325339324e-06, + "loss": 1.6571, + "step": 30542 + }, + { + "epoch": 9.374769797421731, + "grad_norm": 0.1530035436153412, + "learning_rate": 1.0226639229369618e-06, + "loss": 1.6975, + "step": 30543 + }, + { + "epoch": 9.375076734192756, + "grad_norm": 0.15473347902297974, + "learning_rate": 1.0216640045573267e-06, + "loss": 1.7283, + "step": 30544 + }, + { + "epoch": 9.375383670963782, + "grad_norm": 0.17946626245975494, + "learning_rate": 1.0206645702103279e-06, + "loss": 1.7391, + "step": 30545 + }, + { + "epoch": 9.375690607734807, + "grad_norm": 0.12358242273330688, + "learning_rate": 1.0196656199058186e-06, + "loss": 1.6633, + "step": 30546 + }, + { + "epoch": 9.375997544505832, + "grad_norm": 0.1423409879207611, + "learning_rate": 1.0186671536536907e-06, + "loss": 1.6969, + "step": 30547 + }, + { + "epoch": 9.376304481276858, + "grad_norm": 0.15845637023448944, + "learning_rate": 1.0176691714637976e-06, + "loss": 1.743, + "step": 30548 + }, + { + "epoch": 9.376611418047883, + 
"grad_norm": 0.13585655391216278, + "learning_rate": 1.0166716733460091e-06, + "loss": 1.6668, + "step": 30549 + }, + { + "epoch": 9.376918354818907, + "grad_norm": 0.13910886645317078, + "learning_rate": 1.015674659310184e-06, + "loss": 1.7184, + "step": 30550 + }, + { + "epoch": 9.377225291589932, + "grad_norm": 0.15852247178554535, + "learning_rate": 1.01467812936617e-06, + "loss": 1.7708, + "step": 30551 + }, + { + "epoch": 9.377532228360957, + "grad_norm": 0.15506471693515778, + "learning_rate": 1.0136820835238148e-06, + "loss": 1.7377, + "step": 30552 + }, + { + "epoch": 9.377839165131983, + "grad_norm": 0.13877533376216888, + "learning_rate": 1.0126865217929715e-06, + "loss": 1.7335, + "step": 30553 + }, + { + "epoch": 9.378146101903008, + "grad_norm": 0.12510280311107635, + "learning_rate": 1.0116914441834657e-06, + "loss": 1.6979, + "step": 30554 + }, + { + "epoch": 9.378453038674033, + "grad_norm": 0.17626170814037323, + "learning_rate": 1.0106968507051451e-06, + "loss": 1.7316, + "step": 30555 + }, + { + "epoch": 9.378759975445059, + "grad_norm": 0.17140509188175201, + "learning_rate": 1.009702741367824e-06, + "loss": 1.7576, + "step": 30556 + }, + { + "epoch": 9.379066912216084, + "grad_norm": 0.17579251527786255, + "learning_rate": 1.008709116181339e-06, + "loss": 1.6942, + "step": 30557 + }, + { + "epoch": 9.37937384898711, + "grad_norm": 0.1375150978565216, + "learning_rate": 1.0077159751555099e-06, + "loss": 1.7043, + "step": 30558 + }, + { + "epoch": 9.379680785758135, + "grad_norm": 0.11679084599018097, + "learning_rate": 1.0067233183001346e-06, + "loss": 1.6868, + "step": 30559 + }, + { + "epoch": 9.379987722529158, + "grad_norm": 0.15186625719070435, + "learning_rate": 1.0057311456250495e-06, + "loss": 1.702, + "step": 30560 + }, + { + "epoch": 9.380294659300183, + "grad_norm": 0.18598486483097076, + "learning_rate": 1.0047394571400304e-06, + "loss": 1.8104, + "step": 30561 + }, + { + "epoch": 9.380601596071209, + "grad_norm": 
0.12907341122627258, + "learning_rate": 1.003748252854908e-06, + "loss": 1.6878, + "step": 30562 + }, + { + "epoch": 9.380908532842234, + "grad_norm": 0.15694235265254974, + "learning_rate": 1.0027575327794525e-06, + "loss": 1.7079, + "step": 30563 + }, + { + "epoch": 9.38121546961326, + "grad_norm": 0.12046566605567932, + "learning_rate": 1.0017672969234671e-06, + "loss": 1.6602, + "step": 30564 + }, + { + "epoch": 9.381522406384285, + "grad_norm": 0.12011182308197021, + "learning_rate": 1.0007775452967383e-06, + "loss": 1.6756, + "step": 30565 + }, + { + "epoch": 9.38182934315531, + "grad_norm": 0.13124582171440125, + "learning_rate": 9.997882779090473e-07, + "loss": 1.6726, + "step": 30566 + }, + { + "epoch": 9.382136279926335, + "grad_norm": 0.1443175971508026, + "learning_rate": 9.98799494770164e-07, + "loss": 1.7491, + "step": 30567 + }, + { + "epoch": 9.38244321669736, + "grad_norm": 0.09302258491516113, + "learning_rate": 9.978111958898639e-07, + "loss": 1.6237, + "step": 30568 + }, + { + "epoch": 9.382750153468386, + "grad_norm": 0.13836117088794708, + "learning_rate": 9.968233812779172e-07, + "loss": 1.735, + "step": 30569 + }, + { + "epoch": 9.383057090239411, + "grad_norm": 0.1278647780418396, + "learning_rate": 9.958360509440879e-07, + "loss": 1.6629, + "step": 30570 + }, + { + "epoch": 9.383364027010435, + "grad_norm": 0.1527305543422699, + "learning_rate": 9.948492048981183e-07, + "loss": 1.7194, + "step": 30571 + }, + { + "epoch": 9.38367096378146, + "grad_norm": 0.1453726887702942, + "learning_rate": 9.938628431497844e-07, + "loss": 1.723, + "step": 30572 + }, + { + "epoch": 9.383977900552486, + "grad_norm": 0.1687985509634018, + "learning_rate": 9.92876965708811e-07, + "loss": 1.7092, + "step": 30573 + }, + { + "epoch": 9.384284837323511, + "grad_norm": 0.1347656548023224, + "learning_rate": 9.918915725849577e-07, + "loss": 1.6471, + "step": 30574 + }, + { + "epoch": 9.384591774094536, + "grad_norm": 0.15019412338733673, + "learning_rate": 
9.90906663787955e-07, + "loss": 1.6936, + "step": 30575 + }, + { + "epoch": 9.384898710865562, + "grad_norm": 0.10301146656274796, + "learning_rate": 9.899222393275342e-07, + "loss": 1.6845, + "step": 30576 + }, + { + "epoch": 9.385205647636587, + "grad_norm": 0.15683500468730927, + "learning_rate": 9.889382992134323e-07, + "loss": 1.6929, + "step": 30577 + }, + { + "epoch": 9.385512584407612, + "grad_norm": 0.13546130061149597, + "learning_rate": 9.879548434553631e-07, + "loss": 1.6783, + "step": 30578 + }, + { + "epoch": 9.385819521178638, + "grad_norm": 0.1424054205417633, + "learning_rate": 9.869718720630583e-07, + "loss": 1.7276, + "step": 30579 + }, + { + "epoch": 9.386126457949663, + "grad_norm": 0.19902123510837555, + "learning_rate": 9.859893850462155e-07, + "loss": 1.7244, + "step": 30580 + }, + { + "epoch": 9.386433394720688, + "grad_norm": 0.13931868970394135, + "learning_rate": 9.8500738241456e-07, + "loss": 1.7272, + "step": 30581 + }, + { + "epoch": 9.386740331491712, + "grad_norm": 0.12193772196769714, + "learning_rate": 9.8402586417779e-07, + "loss": 1.6856, + "step": 30582 + }, + { + "epoch": 9.387047268262737, + "grad_norm": 0.13566039502620697, + "learning_rate": 9.83044830345603e-07, + "loss": 1.7083, + "step": 30583 + }, + { + "epoch": 9.387354205033763, + "grad_norm": 0.15738597512245178, + "learning_rate": 9.82064280927697e-07, + "loss": 1.6692, + "step": 30584 + }, + { + "epoch": 9.387661141804788, + "grad_norm": 0.13515286147594452, + "learning_rate": 9.81084215933764e-07, + "loss": 1.6891, + "step": 30585 + }, + { + "epoch": 9.387968078575813, + "grad_norm": 0.15392665565013885, + "learning_rate": 9.80104635373491e-07, + "loss": 1.7277, + "step": 30586 + }, + { + "epoch": 9.388275015346839, + "grad_norm": 0.11712920665740967, + "learning_rate": 9.79125539256548e-07, + "loss": 1.6878, + "step": 30587 + }, + { + "epoch": 9.388581952117864, + "grad_norm": 0.17001082003116608, + "learning_rate": 9.781469275926214e-07, + "loss": 1.7397, + 
"step": 30588 + }, + { + "epoch": 9.38888888888889, + "grad_norm": 0.154278963804245, + "learning_rate": 9.771688003913816e-07, + "loss": 1.71, + "step": 30589 + }, + { + "epoch": 9.389195825659915, + "grad_norm": 0.12404046952724457, + "learning_rate": 9.761911576624872e-07, + "loss": 1.6946, + "step": 30590 + }, + { + "epoch": 9.38950276243094, + "grad_norm": 0.152077317237854, + "learning_rate": 9.75213999415614e-07, + "loss": 1.6986, + "step": 30591 + }, + { + "epoch": 9.389809699201965, + "grad_norm": 0.11967775225639343, + "learning_rate": 9.742373256604099e-07, + "loss": 1.6376, + "step": 30592 + }, + { + "epoch": 9.390116635972989, + "grad_norm": 0.12324173748493195, + "learning_rate": 9.732611364065169e-07, + "loss": 1.704, + "step": 30593 + }, + { + "epoch": 9.390423572744014, + "grad_norm": 0.19685424864292145, + "learning_rate": 9.722854316636054e-07, + "loss": 1.7799, + "step": 30594 + }, + { + "epoch": 9.39073050951504, + "grad_norm": 0.18277420103549957, + "learning_rate": 9.713102114412953e-07, + "loss": 1.8062, + "step": 30595 + }, + { + "epoch": 9.391037446286065, + "grad_norm": 0.12882667779922485, + "learning_rate": 9.7033547574924e-07, + "loss": 1.7086, + "step": 30596 + }, + { + "epoch": 9.39134438305709, + "grad_norm": 0.11336109042167664, + "learning_rate": 9.693612245970652e-07, + "loss": 1.691, + "step": 30597 + }, + { + "epoch": 9.391651319828116, + "grad_norm": 0.1724751889705658, + "learning_rate": 9.683874579943964e-07, + "loss": 1.7329, + "step": 30598 + }, + { + "epoch": 9.39195825659914, + "grad_norm": 0.12712900340557098, + "learning_rate": 9.674141759508704e-07, + "loss": 1.6889, + "step": 30599 + }, + { + "epoch": 9.392265193370166, + "grad_norm": 0.18404419720172882, + "learning_rate": 9.664413784760907e-07, + "loss": 1.6947, + "step": 30600 + }, + { + "epoch": 9.392572130141192, + "grad_norm": 0.12651921808719635, + "learning_rate": 9.654690655796772e-07, + "loss": 1.7164, + "step": 30601 + }, + { + "epoch": 9.392879066912217, 
+ "grad_norm": 0.1299905627965927, + "learning_rate": 9.64497237271239e-07, + "loss": 1.7192, + "step": 30602 + }, + { + "epoch": 9.39318600368324, + "grad_norm": 0.14098776876926422, + "learning_rate": 9.635258935603796e-07, + "loss": 1.6535, + "step": 30603 + }, + { + "epoch": 9.393492940454266, + "grad_norm": 0.13803884387016296, + "learning_rate": 9.62555034456697e-07, + "loss": 1.6703, + "step": 30604 + }, + { + "epoch": 9.393799877225291, + "grad_norm": 0.1579771488904953, + "learning_rate": 9.61584659969783e-07, + "loss": 1.7882, + "step": 30605 + }, + { + "epoch": 9.394106813996316, + "grad_norm": 0.11700218915939331, + "learning_rate": 9.606147701092416e-07, + "loss": 1.6802, + "step": 30606 + }, + { + "epoch": 9.394413750767342, + "grad_norm": 0.16874761879444122, + "learning_rate": 9.596453648846426e-07, + "loss": 1.6999, + "step": 30607 + }, + { + "epoch": 9.394720687538367, + "grad_norm": 0.14294692873954773, + "learning_rate": 9.586764443055785e-07, + "loss": 1.6757, + "step": 30608 + }, + { + "epoch": 9.395027624309392, + "grad_norm": 0.13398779928684235, + "learning_rate": 9.57708008381608e-07, + "loss": 1.7234, + "step": 30609 + }, + { + "epoch": 9.395334561080418, + "grad_norm": 0.15532025694847107, + "learning_rate": 9.567400571223129e-07, + "loss": 1.7504, + "step": 30610 + }, + { + "epoch": 9.395641497851443, + "grad_norm": 0.12451089173555374, + "learning_rate": 9.557725905372627e-07, + "loss": 1.6537, + "step": 30611 + }, + { + "epoch": 9.395948434622468, + "grad_norm": 0.18524393439292908, + "learning_rate": 9.548056086360114e-07, + "loss": 1.6706, + "step": 30612 + }, + { + "epoch": 9.396255371393494, + "grad_norm": 0.12702727317810059, + "learning_rate": 9.538391114281175e-07, + "loss": 1.6772, + "step": 30613 + }, + { + "epoch": 9.396562308164517, + "grad_norm": 0.1752685308456421, + "learning_rate": 9.528730989231294e-07, + "loss": 1.755, + "step": 30614 + }, + { + "epoch": 9.396869244935543, + "grad_norm": 0.13985255360603333, + 
"learning_rate": 9.519075711306003e-07, + "loss": 1.672, + "step": 30615 + }, + { + "epoch": 9.397176181706568, + "grad_norm": 0.14705638587474823, + "learning_rate": 9.50942528060067e-07, + "loss": 1.6884, + "step": 30616 + }, + { + "epoch": 9.397483118477593, + "grad_norm": 0.14204713702201843, + "learning_rate": 9.499779697210665e-07, + "loss": 1.6903, + "step": 30617 + }, + { + "epoch": 9.397790055248619, + "grad_norm": 0.16127781569957733, + "learning_rate": 9.490138961231355e-07, + "loss": 1.6854, + "step": 30618 + }, + { + "epoch": 9.398096992019644, + "grad_norm": 0.11951326578855515, + "learning_rate": 9.480503072757996e-07, + "loss": 1.6556, + "step": 30619 + }, + { + "epoch": 9.39840392879067, + "grad_norm": 0.11818456649780273, + "learning_rate": 9.470872031885791e-07, + "loss": 1.6873, + "step": 30620 + }, + { + "epoch": 9.398710865561695, + "grad_norm": 0.14344888925552368, + "learning_rate": 9.461245838709942e-07, + "loss": 1.6942, + "step": 30621 + }, + { + "epoch": 9.39901780233272, + "grad_norm": 0.141475647687912, + "learning_rate": 9.451624493325539e-07, + "loss": 1.7613, + "step": 30622 + }, + { + "epoch": 9.399324739103745, + "grad_norm": 0.13234710693359375, + "learning_rate": 9.442007995827784e-07, + "loss": 1.6922, + "step": 30623 + }, + { + "epoch": 9.399631675874769, + "grad_norm": 0.12975256145000458, + "learning_rate": 9.432396346311545e-07, + "loss": 1.7071, + "step": 30624 + }, + { + "epoch": 9.399938612645794, + "grad_norm": 0.12574951350688934, + "learning_rate": 9.422789544872024e-07, + "loss": 1.6663, + "step": 30625 + }, + { + "epoch": 9.40024554941682, + "grad_norm": 0.13539808988571167, + "learning_rate": 9.413187591603922e-07, + "loss": 1.7087, + "step": 30626 + }, + { + "epoch": 9.400552486187845, + "grad_norm": 0.18200458586215973, + "learning_rate": 9.403590486602221e-07, + "loss": 1.7497, + "step": 30627 + }, + { + "epoch": 9.40085942295887, + "grad_norm": 0.1656341254711151, + "learning_rate": 9.393998229961898e-07, + 
"loss": 1.7424, + "step": 30628 + }, + { + "epoch": 9.401166359729896, + "grad_norm": 0.13709864020347595, + "learning_rate": 9.384410821777545e-07, + "loss": 1.7228, + "step": 30629 + }, + { + "epoch": 9.401473296500921, + "grad_norm": 0.1603628247976303, + "learning_rate": 9.374828262144031e-07, + "loss": 1.7145, + "step": 30630 + }, + { + "epoch": 9.401780233271946, + "grad_norm": 0.14841997623443604, + "learning_rate": 9.365250551156002e-07, + "loss": 1.7022, + "step": 30631 + }, + { + "epoch": 9.402087170042972, + "grad_norm": 0.12113026529550552, + "learning_rate": 9.35567768890816e-07, + "loss": 1.6537, + "step": 30632 + }, + { + "epoch": 9.402394106813997, + "grad_norm": 0.1314094364643097, + "learning_rate": 9.346109675495096e-07, + "loss": 1.69, + "step": 30633 + }, + { + "epoch": 9.402701043585022, + "grad_norm": 0.1479753851890564, + "learning_rate": 9.336546511011346e-07, + "loss": 1.7403, + "step": 30634 + }, + { + "epoch": 9.403007980356048, + "grad_norm": 0.15644671022891998, + "learning_rate": 9.326988195551445e-07, + "loss": 1.7251, + "step": 30635 + }, + { + "epoch": 9.403314917127071, + "grad_norm": 0.14952129125595093, + "learning_rate": 9.317434729209817e-07, + "loss": 1.6915, + "step": 30636 + }, + { + "epoch": 9.403621853898096, + "grad_norm": 0.11758700013160706, + "learning_rate": 9.307886112080943e-07, + "loss": 1.6961, + "step": 30637 + }, + { + "epoch": 9.403928790669122, + "grad_norm": 0.10613285005092621, + "learning_rate": 9.298342344259081e-07, + "loss": 1.6668, + "step": 30638 + }, + { + "epoch": 9.404235727440147, + "grad_norm": 0.11807837337255478, + "learning_rate": 9.288803425838655e-07, + "loss": 1.6642, + "step": 30639 + }, + { + "epoch": 9.404542664211172, + "grad_norm": 0.17462679743766785, + "learning_rate": 9.279269356913866e-07, + "loss": 1.6935, + "step": 30640 + }, + { + "epoch": 9.404849600982198, + "grad_norm": 0.12297552078962326, + "learning_rate": 9.26974013757892e-07, + "loss": 1.6782, + "step": 30641 + }, + { + 
"epoch": 9.405156537753223, + "grad_norm": 0.11738404631614685, + "learning_rate": 9.260215767928127e-07, + "loss": 1.6913, + "step": 30642 + }, + { + "epoch": 9.405463474524248, + "grad_norm": 0.20638801157474518, + "learning_rate": 9.250696248055468e-07, + "loss": 1.7434, + "step": 30643 + }, + { + "epoch": 9.405770411295274, + "grad_norm": 0.23646225035190582, + "learning_rate": 9.241181578055036e-07, + "loss": 1.754, + "step": 30644 + }, + { + "epoch": 9.4060773480663, + "grad_norm": 0.1305943727493286, + "learning_rate": 9.231671758020921e-07, + "loss": 1.7006, + "step": 30645 + }, + { + "epoch": 9.406384284837323, + "grad_norm": 0.1624198704957962, + "learning_rate": 9.222166788047049e-07, + "loss": 1.7205, + "step": 30646 + }, + { + "epoch": 9.406691221608348, + "grad_norm": 0.17408986389636993, + "learning_rate": 9.212666668227399e-07, + "loss": 1.7302, + "step": 30647 + }, + { + "epoch": 9.406998158379373, + "grad_norm": 0.19994081556797028, + "learning_rate": 9.203171398655785e-07, + "loss": 1.7616, + "step": 30648 + }, + { + "epoch": 9.407305095150399, + "grad_norm": 0.12456551194190979, + "learning_rate": 9.19368097942619e-07, + "loss": 1.6915, + "step": 30649 + }, + { + "epoch": 9.407612031921424, + "grad_norm": 0.11373740434646606, + "learning_rate": 9.184195410632257e-07, + "loss": 1.6679, + "step": 30650 + }, + { + "epoch": 9.40791896869245, + "grad_norm": 0.1356983780860901, + "learning_rate": 9.174714692367748e-07, + "loss": 1.7142, + "step": 30651 + }, + { + "epoch": 9.408225905463475, + "grad_norm": 0.17130546271800995, + "learning_rate": 9.165238824726474e-07, + "loss": 1.7554, + "step": 30652 + }, + { + "epoch": 9.4085328422345, + "grad_norm": 0.12105514854192734, + "learning_rate": 9.155767807801918e-07, + "loss": 1.6938, + "step": 30653 + }, + { + "epoch": 9.408839779005525, + "grad_norm": 0.1510905921459198, + "learning_rate": 9.146301641687837e-07, + "loss": 1.6835, + "step": 30654 + }, + { + "epoch": 9.40914671577655, + "grad_norm": 
0.1589810699224472, + "learning_rate": 9.136840326477658e-07, + "loss": 1.7135, + "step": 30655 + }, + { + "epoch": 9.409453652547576, + "grad_norm": 0.14998911321163177, + "learning_rate": 9.127383862264915e-07, + "loss": 1.7078, + "step": 30656 + }, + { + "epoch": 9.4097605893186, + "grad_norm": 0.1262497901916504, + "learning_rate": 9.11793224914309e-07, + "loss": 1.7015, + "step": 30657 + }, + { + "epoch": 9.410067526089625, + "grad_norm": 0.17526039481163025, + "learning_rate": 9.108485487205498e-07, + "loss": 1.725, + "step": 30658 + }, + { + "epoch": 9.41037446286065, + "grad_norm": 0.18700073659420013, + "learning_rate": 9.099043576545674e-07, + "loss": 1.711, + "step": 30659 + }, + { + "epoch": 9.410681399631676, + "grad_norm": 0.12407290935516357, + "learning_rate": 9.089606517256821e-07, + "loss": 1.6912, + "step": 30660 + }, + { + "epoch": 9.410988336402701, + "grad_norm": 0.14186540246009827, + "learning_rate": 9.080174309432199e-07, + "loss": 1.7082, + "step": 30661 + }, + { + "epoch": 9.411295273173726, + "grad_norm": 0.16852159798145294, + "learning_rate": 9.07074695316501e-07, + "loss": 1.7107, + "step": 30662 + }, + { + "epoch": 9.411602209944752, + "grad_norm": 0.18337292969226837, + "learning_rate": 9.061324448548403e-07, + "loss": 1.7359, + "step": 30663 + }, + { + "epoch": 9.411909146715777, + "grad_norm": 0.1463366150856018, + "learning_rate": 9.051906795675635e-07, + "loss": 1.6903, + "step": 30664 + }, + { + "epoch": 9.412216083486802, + "grad_norm": 0.1920327991247177, + "learning_rate": 9.042493994639579e-07, + "loss": 1.7113, + "step": 30665 + }, + { + "epoch": 9.412523020257828, + "grad_norm": 0.2031734585762024, + "learning_rate": 9.033086045533434e-07, + "loss": 1.7663, + "step": 30666 + }, + { + "epoch": 9.412829957028851, + "grad_norm": 0.15997421741485596, + "learning_rate": 9.023682948450019e-07, + "loss": 1.7607, + "step": 30667 + }, + { + "epoch": 9.413136893799877, + "grad_norm": 0.1264960914850235, + "learning_rate": 
9.014284703482422e-07, + "loss": 1.6638, + "step": 30668 + }, + { + "epoch": 9.413443830570902, + "grad_norm": 0.13021783530712128, + "learning_rate": 9.004891310723407e-07, + "loss": 1.6783, + "step": 30669 + }, + { + "epoch": 9.413750767341927, + "grad_norm": 0.13910266757011414, + "learning_rate": 8.995502770265785e-07, + "loss": 1.7262, + "step": 30670 + }, + { + "epoch": 9.414057704112953, + "grad_norm": 0.1369626671075821, + "learning_rate": 8.986119082202482e-07, + "loss": 1.6998, + "step": 30671 + }, + { + "epoch": 9.414364640883978, + "grad_norm": 0.1432434767484665, + "learning_rate": 8.976740246626092e-07, + "loss": 1.7156, + "step": 30672 + }, + { + "epoch": 9.414671577655003, + "grad_norm": 0.2088400423526764, + "learning_rate": 8.967366263629373e-07, + "loss": 1.7551, + "step": 30673 + }, + { + "epoch": 9.414978514426029, + "grad_norm": 0.1348891705274582, + "learning_rate": 8.957997133304918e-07, + "loss": 1.6757, + "step": 30674 + }, + { + "epoch": 9.415285451197054, + "grad_norm": 0.15271534025669098, + "learning_rate": 8.94863285574532e-07, + "loss": 1.7382, + "step": 30675 + }, + { + "epoch": 9.41559238796808, + "grad_norm": 0.14035186171531677, + "learning_rate": 8.939273431043227e-07, + "loss": 1.7186, + "step": 30676 + }, + { + "epoch": 9.415899324739105, + "grad_norm": 0.11167564988136292, + "learning_rate": 8.929918859291009e-07, + "loss": 1.6706, + "step": 30677 + }, + { + "epoch": 9.416206261510128, + "grad_norm": 0.12790827453136444, + "learning_rate": 8.920569140581148e-07, + "loss": 1.6824, + "step": 30678 + }, + { + "epoch": 9.416513198281153, + "grad_norm": 0.11640806496143341, + "learning_rate": 8.911224275006069e-07, + "loss": 1.7236, + "step": 30679 + }, + { + "epoch": 9.416820135052179, + "grad_norm": 0.19866923987865448, + "learning_rate": 8.901884262658089e-07, + "loss": 1.7744, + "step": 30680 + }, + { + "epoch": 9.417127071823204, + "grad_norm": 0.12702670693397522, + "learning_rate": 8.892549103629577e-07, + "loss": 1.6858, + 
"step": 30681 + }, + { + "epoch": 9.41743400859423, + "grad_norm": 0.10487339645624161, + "learning_rate": 8.883218798012683e-07, + "loss": 1.6862, + "step": 30682 + }, + { + "epoch": 9.417740945365255, + "grad_norm": 0.1706196665763855, + "learning_rate": 8.87389334589972e-07, + "loss": 1.7688, + "step": 30683 + }, + { + "epoch": 9.41804788213628, + "grad_norm": 0.18874917924404144, + "learning_rate": 8.864572747382782e-07, + "loss": 1.7335, + "step": 30684 + }, + { + "epoch": 9.418354818907305, + "grad_norm": 0.12817202508449554, + "learning_rate": 8.855257002553963e-07, + "loss": 1.7276, + "step": 30685 + }, + { + "epoch": 9.41866175567833, + "grad_norm": 0.16661255061626434, + "learning_rate": 8.84594611150541e-07, + "loss": 1.7635, + "step": 30686 + }, + { + "epoch": 9.418968692449356, + "grad_norm": 0.1407301425933838, + "learning_rate": 8.836640074329106e-07, + "loss": 1.6675, + "step": 30687 + }, + { + "epoch": 9.419275629220381, + "grad_norm": 0.1266261488199234, + "learning_rate": 8.827338891116976e-07, + "loss": 1.685, + "step": 30688 + }, + { + "epoch": 9.419582565991405, + "grad_norm": 0.1475544422864914, + "learning_rate": 8.818042561961004e-07, + "loss": 1.7009, + "step": 30689 + }, + { + "epoch": 9.41988950276243, + "grad_norm": 0.15595827996730804, + "learning_rate": 8.808751086953005e-07, + "loss": 1.7571, + "step": 30690 + }, + { + "epoch": 9.420196439533456, + "grad_norm": 0.1931566298007965, + "learning_rate": 8.799464466184793e-07, + "loss": 1.7627, + "step": 30691 + }, + { + "epoch": 9.420503376304481, + "grad_norm": 0.12590163946151733, + "learning_rate": 8.790182699748128e-07, + "loss": 1.6673, + "step": 30692 + }, + { + "epoch": 9.420810313075506, + "grad_norm": 0.135042205452919, + "learning_rate": 8.780905787734939e-07, + "loss": 1.7102, + "step": 30693 + }, + { + "epoch": 9.421117249846532, + "grad_norm": 0.15336740016937256, + "learning_rate": 8.77163373023665e-07, + "loss": 1.7079, + "step": 30694 + }, + { + "epoch": 
9.421424186617557, + "grad_norm": 0.1408243626356125, + "learning_rate": 8.762366527345022e-07, + "loss": 1.7039, + "step": 30695 + }, + { + "epoch": 9.421731123388582, + "grad_norm": 0.16094304621219635, + "learning_rate": 8.753104179151595e-07, + "loss": 1.7415, + "step": 30696 + }, + { + "epoch": 9.422038060159608, + "grad_norm": 0.1549450308084488, + "learning_rate": 8.743846685747903e-07, + "loss": 1.709, + "step": 30697 + }, + { + "epoch": 9.422344996930633, + "grad_norm": 0.1558622568845749, + "learning_rate": 8.734594047225486e-07, + "loss": 1.7329, + "step": 30698 + }, + { + "epoch": 9.422651933701658, + "grad_norm": 0.10400709509849548, + "learning_rate": 8.725346263675716e-07, + "loss": 1.6449, + "step": 30699 + }, + { + "epoch": 9.422958870472682, + "grad_norm": 0.12266384065151215, + "learning_rate": 8.716103335190073e-07, + "loss": 1.6983, + "step": 30700 + }, + { + "epoch": 9.423265807243707, + "grad_norm": 0.14934386312961578, + "learning_rate": 8.70686526185982e-07, + "loss": 1.7348, + "step": 30701 + }, + { + "epoch": 9.423572744014733, + "grad_norm": 0.18102359771728516, + "learning_rate": 8.697632043776271e-07, + "loss": 1.7326, + "step": 30702 + }, + { + "epoch": 9.423879680785758, + "grad_norm": 0.16218020021915436, + "learning_rate": 8.688403681030688e-07, + "loss": 1.7721, + "step": 30703 + }, + { + "epoch": 9.424186617556783, + "grad_norm": 0.15908999741077423, + "learning_rate": 8.679180173714275e-07, + "loss": 1.7115, + "step": 30704 + }, + { + "epoch": 9.424493554327809, + "grad_norm": 0.13521069288253784, + "learning_rate": 8.66996152191818e-07, + "loss": 1.683, + "step": 30705 + }, + { + "epoch": 9.424800491098834, + "grad_norm": 0.1464395523071289, + "learning_rate": 8.660747725733497e-07, + "loss": 1.7209, + "step": 30706 + }, + { + "epoch": 9.42510742786986, + "grad_norm": 0.10634544491767883, + "learning_rate": 8.651538785251267e-07, + "loss": 1.6366, + "step": 30707 + }, + { + "epoch": 9.425414364640885, + "grad_norm": 
0.15586215257644653, + "learning_rate": 8.642334700562526e-07, + "loss": 1.7363, + "step": 30708 + }, + { + "epoch": 9.42572130141191, + "grad_norm": 0.14794576168060303, + "learning_rate": 8.633135471758203e-07, + "loss": 1.7143, + "step": 30709 + }, + { + "epoch": 9.426028238182933, + "grad_norm": 0.14911554753780365, + "learning_rate": 8.623941098929334e-07, + "loss": 1.6894, + "step": 30710 + }, + { + "epoch": 9.426335174953959, + "grad_norm": 0.16456535458564758, + "learning_rate": 8.614751582166625e-07, + "loss": 1.7059, + "step": 30711 + }, + { + "epoch": 9.426642111724984, + "grad_norm": 0.2710132300853729, + "learning_rate": 8.605566921560948e-07, + "loss": 1.738, + "step": 30712 + }, + { + "epoch": 9.42694904849601, + "grad_norm": 0.12046913802623749, + "learning_rate": 8.596387117203064e-07, + "loss": 1.7124, + "step": 30713 + }, + { + "epoch": 9.427255985267035, + "grad_norm": 0.1438749134540558, + "learning_rate": 8.587212169183679e-07, + "loss": 1.7014, + "step": 30714 + }, + { + "epoch": 9.42756292203806, + "grad_norm": 0.13070370256900787, + "learning_rate": 8.578042077593551e-07, + "loss": 1.6561, + "step": 30715 + }, + { + "epoch": 9.427869858809085, + "grad_norm": 0.13055887818336487, + "learning_rate": 8.568876842523166e-07, + "loss": 1.6791, + "step": 30716 + }, + { + "epoch": 9.42817679558011, + "grad_norm": 0.13492754101753235, + "learning_rate": 8.559716464063284e-07, + "loss": 1.6803, + "step": 30717 + }, + { + "epoch": 9.428483732351136, + "grad_norm": 0.17521773278713226, + "learning_rate": 8.55056094230422e-07, + "loss": 1.7152, + "step": 30718 + }, + { + "epoch": 9.428790669122161, + "grad_norm": 0.18804030120372772, + "learning_rate": 8.541410277336625e-07, + "loss": 1.7344, + "step": 30719 + }, + { + "epoch": 9.429097605893187, + "grad_norm": 0.14698217809200287, + "learning_rate": 8.532264469250873e-07, + "loss": 1.6978, + "step": 30720 + }, + { + "epoch": 9.42940454266421, + "grad_norm": 0.10534154623746872, + "learning_rate": 
8.523123518137277e-07, + "loss": 1.6536, + "step": 30721 + }, + { + "epoch": 9.429711479435236, + "grad_norm": 0.13445980846881866, + "learning_rate": 8.513987424086323e-07, + "loss": 1.6999, + "step": 30722 + }, + { + "epoch": 9.430018416206261, + "grad_norm": 0.19551974534988403, + "learning_rate": 8.50485618718816e-07, + "loss": 1.7358, + "step": 30723 + }, + { + "epoch": 9.430325352977286, + "grad_norm": 0.13450706005096436, + "learning_rate": 8.495729807533104e-07, + "loss": 1.7157, + "step": 30724 + }, + { + "epoch": 9.430632289748312, + "grad_norm": 0.17215101420879364, + "learning_rate": 8.486608285211306e-07, + "loss": 1.7139, + "step": 30725 + }, + { + "epoch": 9.430939226519337, + "grad_norm": 0.15021352469921112, + "learning_rate": 8.477491620312916e-07, + "loss": 1.7179, + "step": 30726 + }, + { + "epoch": 9.431246163290362, + "grad_norm": 0.13625288009643555, + "learning_rate": 8.468379812928084e-07, + "loss": 1.6991, + "step": 30727 + }, + { + "epoch": 9.431553100061388, + "grad_norm": 0.09747711569070816, + "learning_rate": 8.459272863146794e-07, + "loss": 1.6463, + "step": 30728 + }, + { + "epoch": 9.431860036832413, + "grad_norm": 0.13644148409366608, + "learning_rate": 8.450170771059085e-07, + "loss": 1.7026, + "step": 30729 + }, + { + "epoch": 9.432166973603438, + "grad_norm": 0.12617720663547516, + "learning_rate": 8.441073536754884e-07, + "loss": 1.6705, + "step": 30730 + }, + { + "epoch": 9.432473910374464, + "grad_norm": 0.12123163044452667, + "learning_rate": 8.431981160324065e-07, + "loss": 1.681, + "step": 30731 + }, + { + "epoch": 9.432780847145487, + "grad_norm": 0.18256647884845734, + "learning_rate": 8.422893641856611e-07, + "loss": 1.7564, + "step": 30732 + }, + { + "epoch": 9.433087783916513, + "grad_norm": 0.14204107224941254, + "learning_rate": 8.413810981442171e-07, + "loss": 1.7232, + "step": 30733 + }, + { + "epoch": 9.433394720687538, + "grad_norm": 0.1158083900809288, + "learning_rate": 8.404733179170677e-07, + "loss": 1.665, 
+ "step": 30734 + }, + { + "epoch": 9.433701657458563, + "grad_norm": 0.18204176425933838, + "learning_rate": 8.395660235131608e-07, + "loss": 1.7477, + "step": 30735 + }, + { + "epoch": 9.434008594229589, + "grad_norm": 0.15896224975585938, + "learning_rate": 8.38659214941484e-07, + "loss": 1.8096, + "step": 30736 + }, + { + "epoch": 9.434315531000614, + "grad_norm": 0.10890607535839081, + "learning_rate": 8.377528922109912e-07, + "loss": 1.6713, + "step": 30737 + }, + { + "epoch": 9.43462246777164, + "grad_norm": 0.117277592420578, + "learning_rate": 8.368470553306417e-07, + "loss": 1.7043, + "step": 30738 + }, + { + "epoch": 9.434929404542665, + "grad_norm": 0.12226385623216629, + "learning_rate": 8.359417043093787e-07, + "loss": 1.6709, + "step": 30739 + }, + { + "epoch": 9.43523634131369, + "grad_norm": 0.16085174679756165, + "learning_rate": 8.350368391561614e-07, + "loss": 1.7212, + "step": 30740 + }, + { + "epoch": 9.435543278084715, + "grad_norm": 0.15585030615329742, + "learning_rate": 8.341324598799216e-07, + "loss": 1.743, + "step": 30741 + }, + { + "epoch": 9.43585021485574, + "grad_norm": 0.11419086158275604, + "learning_rate": 8.332285664896078e-07, + "loss": 1.6775, + "step": 30742 + }, + { + "epoch": 9.436157151626764, + "grad_norm": 0.1748945116996765, + "learning_rate": 8.323251589941405e-07, + "loss": 1.7916, + "step": 30743 + }, + { + "epoch": 9.43646408839779, + "grad_norm": 0.14767593145370483, + "learning_rate": 8.314222374024572e-07, + "loss": 1.7165, + "step": 30744 + }, + { + "epoch": 9.436771025168815, + "grad_norm": 0.1396973431110382, + "learning_rate": 8.305198017234783e-07, + "loss": 1.68, + "step": 30745 + }, + { + "epoch": 9.43707796193984, + "grad_norm": 0.14533667266368866, + "learning_rate": 8.296178519661246e-07, + "loss": 1.6742, + "step": 30746 + }, + { + "epoch": 9.437384898710865, + "grad_norm": 0.14526952803134918, + "learning_rate": 8.287163881393001e-07, + "loss": 1.7084, + "step": 30747 + }, + { + "epoch": 
9.43769183548189, + "grad_norm": 0.10500401258468628, + "learning_rate": 8.278154102519198e-07, + "loss": 1.6581, + "step": 30748 + }, + { + "epoch": 9.437998772252916, + "grad_norm": 0.12266987562179565, + "learning_rate": 8.269149183128988e-07, + "loss": 1.7261, + "step": 30749 + }, + { + "epoch": 9.438305709023942, + "grad_norm": 0.11223867535591125, + "learning_rate": 8.260149123311134e-07, + "loss": 1.6873, + "step": 30750 + }, + { + "epoch": 9.438612645794967, + "grad_norm": 0.1576405167579651, + "learning_rate": 8.251153923154842e-07, + "loss": 1.6975, + "step": 30751 + }, + { + "epoch": 9.438919582565992, + "grad_norm": 0.14165537059307098, + "learning_rate": 8.242163582748763e-07, + "loss": 1.6916, + "step": 30752 + }, + { + "epoch": 9.439226519337016, + "grad_norm": 0.11340904235839844, + "learning_rate": 8.233178102181882e-07, + "loss": 1.6931, + "step": 30753 + }, + { + "epoch": 9.439533456108041, + "grad_norm": 0.13339579105377197, + "learning_rate": 8.224197481542962e-07, + "loss": 1.717, + "step": 30754 + }, + { + "epoch": 9.439840392879066, + "grad_norm": 0.19762879610061646, + "learning_rate": 8.215221720920762e-07, + "loss": 1.7263, + "step": 30755 + }, + { + "epoch": 9.440147329650092, + "grad_norm": 0.13339634239673615, + "learning_rate": 8.206250820403993e-07, + "loss": 1.7135, + "step": 30756 + }, + { + "epoch": 9.440454266421117, + "grad_norm": 0.17574037611484528, + "learning_rate": 8.197284780081305e-07, + "loss": 1.704, + "step": 30757 + }, + { + "epoch": 9.440761203192142, + "grad_norm": 0.15657347440719604, + "learning_rate": 8.188323600041293e-07, + "loss": 1.7323, + "step": 30758 + }, + { + "epoch": 9.441068139963168, + "grad_norm": 0.14541512727737427, + "learning_rate": 8.179367280372552e-07, + "loss": 1.6631, + "step": 30759 + }, + { + "epoch": 9.441375076734193, + "grad_norm": 0.13230635225772858, + "learning_rate": 8.170415821163568e-07, + "loss": 1.7423, + "step": 30760 + }, + { + "epoch": 9.441682013505218, + "grad_norm": 
0.11934958398342133, + "learning_rate": 8.161469222502771e-07, + "loss": 1.6604, + "step": 30761 + }, + { + "epoch": 9.441988950276244, + "grad_norm": 0.27107498049736023, + "learning_rate": 8.152527484478645e-07, + "loss": 1.7381, + "step": 30762 + }, + { + "epoch": 9.442295887047269, + "grad_norm": 0.13428844511508942, + "learning_rate": 8.143590607179508e-07, + "loss": 1.7329, + "step": 30763 + }, + { + "epoch": 9.442602823818293, + "grad_norm": 0.1932329535484314, + "learning_rate": 8.134658590693678e-07, + "loss": 1.7865, + "step": 30764 + }, + { + "epoch": 9.442909760589318, + "grad_norm": 0.14267316460609436, + "learning_rate": 8.125731435109419e-07, + "loss": 1.6631, + "step": 30765 + }, + { + "epoch": 9.443216697360343, + "grad_norm": 0.14356668293476105, + "learning_rate": 8.116809140515047e-07, + "loss": 1.7113, + "step": 30766 + }, + { + "epoch": 9.443523634131369, + "grad_norm": 0.13832272589206696, + "learning_rate": 8.107891706998605e-07, + "loss": 1.6945, + "step": 30767 + }, + { + "epoch": 9.443830570902394, + "grad_norm": 0.12506070733070374, + "learning_rate": 8.098979134648355e-07, + "loss": 1.6563, + "step": 30768 + }, + { + "epoch": 9.44413750767342, + "grad_norm": 0.12485837191343307, + "learning_rate": 8.090071423552226e-07, + "loss": 1.6704, + "step": 30769 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.14637434482574463, + "learning_rate": 8.08116857379837e-07, + "loss": 1.696, + "step": 30770 + }, + { + "epoch": 9.44475138121547, + "grad_norm": 0.16358232498168945, + "learning_rate": 8.072270585474773e-07, + "loss": 1.6779, + "step": 30771 + }, + { + "epoch": 9.445058317986495, + "grad_norm": 0.1596413105726242, + "learning_rate": 8.063377458669252e-07, + "loss": 1.7176, + "step": 30772 + }, + { + "epoch": 9.44536525475752, + "grad_norm": 0.20140093564987183, + "learning_rate": 8.054489193469794e-07, + "loss": 1.7194, + "step": 30773 + }, + { + "epoch": 9.445672191528546, + "grad_norm": 0.15662498772144318, + "learning_rate": 
8.045605789964216e-07, + "loss": 1.7523, + "step": 30774 + }, + { + "epoch": 9.44597912829957, + "grad_norm": 0.13946685194969177, + "learning_rate": 8.036727248240339e-07, + "loss": 1.6938, + "step": 30775 + }, + { + "epoch": 9.446286065070595, + "grad_norm": 0.1664799004793167, + "learning_rate": 8.027853568385812e-07, + "loss": 1.7221, + "step": 30776 + }, + { + "epoch": 9.44659300184162, + "grad_norm": 0.11461447924375534, + "learning_rate": 8.018984750488456e-07, + "loss": 1.6839, + "step": 30777 + }, + { + "epoch": 9.446899938612646, + "grad_norm": 0.12960290908813477, + "learning_rate": 8.010120794635812e-07, + "loss": 1.6936, + "step": 30778 + }, + { + "epoch": 9.44720687538367, + "grad_norm": 0.16561156511306763, + "learning_rate": 8.00126170091553e-07, + "loss": 1.693, + "step": 30779 + }, + { + "epoch": 9.447513812154696, + "grad_norm": 0.1481138914823532, + "learning_rate": 7.992407469415152e-07, + "loss": 1.721, + "step": 30780 + }, + { + "epoch": 9.447820748925722, + "grad_norm": 0.15538595616817474, + "learning_rate": 7.983558100222166e-07, + "loss": 1.6943, + "step": 30781 + }, + { + "epoch": 9.448127685696747, + "grad_norm": 0.1411004513502121, + "learning_rate": 7.974713593423999e-07, + "loss": 1.7118, + "step": 30782 + }, + { + "epoch": 9.448434622467772, + "grad_norm": 0.14798758924007416, + "learning_rate": 7.965873949108193e-07, + "loss": 1.7267, + "step": 30783 + }, + { + "epoch": 9.448741559238798, + "grad_norm": 0.12365894019603729, + "learning_rate": 7.957039167361902e-07, + "loss": 1.6921, + "step": 30784 + }, + { + "epoch": 9.449048496009823, + "grad_norm": 0.12739478051662445, + "learning_rate": 7.948209248272609e-07, + "loss": 1.6827, + "step": 30785 + }, + { + "epoch": 9.449355432780846, + "grad_norm": 0.11038246005773544, + "learning_rate": 7.939384191927469e-07, + "loss": 1.6778, + "step": 30786 + }, + { + "epoch": 9.449662369551872, + "grad_norm": 0.14928553998470306, + "learning_rate": 7.930563998413798e-07, + "loss": 1.7197, + 
"step": 30787 + }, + { + "epoch": 9.449969306322897, + "grad_norm": 0.2465045005083084, + "learning_rate": 7.921748667818695e-07, + "loss": 1.751, + "step": 30788 + }, + { + "epoch": 9.450276243093922, + "grad_norm": 0.1846035271883011, + "learning_rate": 7.912938200229259e-07, + "loss": 1.753, + "step": 30789 + }, + { + "epoch": 9.450583179864948, + "grad_norm": 0.13683682680130005, + "learning_rate": 7.904132595732639e-07, + "loss": 1.7112, + "step": 30790 + }, + { + "epoch": 9.450890116635973, + "grad_norm": 0.1144467145204544, + "learning_rate": 7.895331854415766e-07, + "loss": 1.678, + "step": 30791 + }, + { + "epoch": 9.451197053406998, + "grad_norm": 0.11407051235437393, + "learning_rate": 7.886535976365628e-07, + "loss": 1.6573, + "step": 30792 + }, + { + "epoch": 9.451503990178024, + "grad_norm": 0.14853791892528534, + "learning_rate": 7.877744961669209e-07, + "loss": 1.7284, + "step": 30793 + }, + { + "epoch": 9.45181092694905, + "grad_norm": 0.15787862241268158, + "learning_rate": 7.868958810413385e-07, + "loss": 1.7638, + "step": 30794 + }, + { + "epoch": 9.452117863720074, + "grad_norm": 0.1264905035495758, + "learning_rate": 7.86017752268492e-07, + "loss": 1.6968, + "step": 30795 + }, + { + "epoch": 9.452424800491098, + "grad_norm": 0.15339265763759613, + "learning_rate": 7.851401098570632e-07, + "loss": 1.6885, + "step": 30796 + }, + { + "epoch": 9.452731737262123, + "grad_norm": 0.14742697775363922, + "learning_rate": 7.842629538157286e-07, + "loss": 1.7038, + "step": 30797 + }, + { + "epoch": 9.453038674033149, + "grad_norm": 0.16144371032714844, + "learning_rate": 7.833862841531536e-07, + "loss": 1.7374, + "step": 30798 + }, + { + "epoch": 9.453345610804174, + "grad_norm": 0.15689444541931152, + "learning_rate": 7.825101008779979e-07, + "loss": 1.7509, + "step": 30799 + }, + { + "epoch": 9.4536525475752, + "grad_norm": 0.16697221994400024, + "learning_rate": 7.81634403998932e-07, + "loss": 1.7841, + "step": 30800 + }, + { + "epoch": 
9.453959484346225, + "grad_norm": 0.11735955625772476, + "learning_rate": 7.80759193524594e-07, + "loss": 1.6864, + "step": 30801 + }, + { + "epoch": 9.45426642111725, + "grad_norm": 0.13182209432125092, + "learning_rate": 7.798844694636487e-07, + "loss": 1.6834, + "step": 30802 + }, + { + "epoch": 9.454573357888275, + "grad_norm": 0.12708893418312073, + "learning_rate": 7.790102318247283e-07, + "loss": 1.6529, + "step": 30803 + }, + { + "epoch": 9.4548802946593, + "grad_norm": 0.11800631135702133, + "learning_rate": 7.781364806164815e-07, + "loss": 1.7123, + "step": 30804 + }, + { + "epoch": 9.455187231430326, + "grad_norm": 0.2169203758239746, + "learning_rate": 7.772632158475401e-07, + "loss": 1.7522, + "step": 30805 + }, + { + "epoch": 9.455494168201351, + "grad_norm": 0.1831941157579422, + "learning_rate": 7.763904375265307e-07, + "loss": 1.8109, + "step": 30806 + }, + { + "epoch": 9.455801104972375, + "grad_norm": 0.1484314352273941, + "learning_rate": 7.755181456620852e-07, + "loss": 1.7101, + "step": 30807 + }, + { + "epoch": 9.4561080417434, + "grad_norm": 0.10662242770195007, + "learning_rate": 7.74646340262819e-07, + "loss": 1.679, + "step": 30808 + }, + { + "epoch": 9.456414978514426, + "grad_norm": 0.13147766888141632, + "learning_rate": 7.737750213373529e-07, + "loss": 1.6738, + "step": 30809 + }, + { + "epoch": 9.456721915285451, + "grad_norm": 0.14727403223514557, + "learning_rate": 7.729041888942911e-07, + "loss": 1.7048, + "step": 30810 + }, + { + "epoch": 9.457028852056476, + "grad_norm": 0.1278834491968155, + "learning_rate": 7.720338429422436e-07, + "loss": 1.7102, + "step": 30811 + }, + { + "epoch": 9.457335788827502, + "grad_norm": 0.13472500443458557, + "learning_rate": 7.711639834898143e-07, + "loss": 1.7265, + "step": 30812 + }, + { + "epoch": 9.457642725598527, + "grad_norm": 0.1379247009754181, + "learning_rate": 7.702946105455911e-07, + "loss": 1.7381, + "step": 30813 + }, + { + "epoch": 9.457949662369552, + "grad_norm": 
0.13163436949253082, + "learning_rate": 7.694257241181723e-07, + "loss": 1.7077, + "step": 30814 + }, + { + "epoch": 9.458256599140578, + "grad_norm": 0.18956807255744934, + "learning_rate": 7.685573242161459e-07, + "loss": 1.7148, + "step": 30815 + }, + { + "epoch": 9.458563535911603, + "grad_norm": 0.0954909548163414, + "learning_rate": 7.676894108480881e-07, + "loss": 1.625, + "step": 30816 + }, + { + "epoch": 9.458870472682626, + "grad_norm": 0.16598805785179138, + "learning_rate": 7.668219840225866e-07, + "loss": 1.7074, + "step": 30817 + }, + { + "epoch": 9.459177409453652, + "grad_norm": 0.13503910601139069, + "learning_rate": 7.659550437481955e-07, + "loss": 1.6738, + "step": 30818 + }, + { + "epoch": 9.459484346224677, + "grad_norm": 0.15524166822433472, + "learning_rate": 7.650885900335025e-07, + "loss": 1.7235, + "step": 30819 + }, + { + "epoch": 9.459791282995702, + "grad_norm": 0.1390114575624466, + "learning_rate": 7.642226228870563e-07, + "loss": 1.6718, + "step": 30820 + }, + { + "epoch": 9.460098219766728, + "grad_norm": 0.10782946646213531, + "learning_rate": 7.63357142317428e-07, + "loss": 1.6511, + "step": 30821 + }, + { + "epoch": 9.460405156537753, + "grad_norm": 0.29216310381889343, + "learning_rate": 7.624921483331549e-07, + "loss": 1.7298, + "step": 30822 + }, + { + "epoch": 9.460712093308778, + "grad_norm": 0.14348210394382477, + "learning_rate": 7.616276409427969e-07, + "loss": 1.7039, + "step": 30823 + }, + { + "epoch": 9.461019030079804, + "grad_norm": 0.15576337277889252, + "learning_rate": 7.607636201548918e-07, + "loss": 1.7253, + "step": 30824 + }, + { + "epoch": 9.46132596685083, + "grad_norm": 0.12783481180667877, + "learning_rate": 7.599000859779826e-07, + "loss": 1.6927, + "step": 30825 + }, + { + "epoch": 9.461632903621854, + "grad_norm": 0.1323290467262268, + "learning_rate": 7.590370384206014e-07, + "loss": 1.6943, + "step": 30826 + }, + { + "epoch": 9.46193984039288, + "grad_norm": 0.10137525945901871, + "learning_rate": 
7.581744774912747e-07, + "loss": 1.6775, + "step": 30827 + }, + { + "epoch": 9.462246777163903, + "grad_norm": 0.10773646086454391, + "learning_rate": 7.573124031985346e-07, + "loss": 1.6618, + "step": 30828 + }, + { + "epoch": 9.462553713934929, + "grad_norm": 0.12834392488002777, + "learning_rate": 7.564508155508909e-07, + "loss": 1.6934, + "step": 30829 + }, + { + "epoch": 9.462860650705954, + "grad_norm": 0.17545762658119202, + "learning_rate": 7.555897145568646e-07, + "loss": 1.7689, + "step": 30830 + }, + { + "epoch": 9.46316758747698, + "grad_norm": 0.13099749386310577, + "learning_rate": 7.547291002249657e-07, + "loss": 1.6901, + "step": 30831 + }, + { + "epoch": 9.463474524248005, + "grad_norm": 0.14668162167072296, + "learning_rate": 7.538689725636927e-07, + "loss": 1.7553, + "step": 30832 + }, + { + "epoch": 9.46378146101903, + "grad_norm": 0.14195361733436584, + "learning_rate": 7.530093315815557e-07, + "loss": 1.7015, + "step": 30833 + }, + { + "epoch": 9.464088397790055, + "grad_norm": 0.11229286342859268, + "learning_rate": 7.521501772870421e-07, + "loss": 1.6858, + "step": 30834 + }, + { + "epoch": 9.46439533456108, + "grad_norm": 0.15487706661224365, + "learning_rate": 7.512915096886397e-07, + "loss": 1.7377, + "step": 30835 + }, + { + "epoch": 9.464702271332106, + "grad_norm": 0.10888294875621796, + "learning_rate": 7.504333287948529e-07, + "loss": 1.651, + "step": 30836 + }, + { + "epoch": 9.465009208103131, + "grad_norm": 0.11357124894857407, + "learning_rate": 7.495756346141358e-07, + "loss": 1.6881, + "step": 30837 + }, + { + "epoch": 9.465316144874157, + "grad_norm": 0.11690666526556015, + "learning_rate": 7.48718427154993e-07, + "loss": 1.6567, + "step": 30838 + }, + { + "epoch": 9.46562308164518, + "grad_norm": 0.189022496342659, + "learning_rate": 7.478617064258675e-07, + "loss": 1.7489, + "step": 30839 + }, + { + "epoch": 9.465930018416206, + "grad_norm": 0.15130119025707245, + "learning_rate": 7.470054724352527e-07, + "loss": 1.7174, + 
"step": 30840 + }, + { + "epoch": 9.466236955187231, + "grad_norm": 0.13578876852989197, + "learning_rate": 7.461497251915917e-07, + "loss": 1.7101, + "step": 30841 + }, + { + "epoch": 9.466543891958256, + "grad_norm": 0.10819463431835175, + "learning_rate": 7.452944647033499e-07, + "loss": 1.6773, + "step": 30842 + }, + { + "epoch": 9.466850828729282, + "grad_norm": 0.23427242040634155, + "learning_rate": 7.444396909789763e-07, + "loss": 1.7527, + "step": 30843 + }, + { + "epoch": 9.467157765500307, + "grad_norm": 0.16425447165966034, + "learning_rate": 7.435854040269197e-07, + "loss": 1.6726, + "step": 30844 + }, + { + "epoch": 9.467464702271332, + "grad_norm": 0.14800399541854858, + "learning_rate": 7.427316038556231e-07, + "loss": 1.7177, + "step": 30845 + }, + { + "epoch": 9.467771639042358, + "grad_norm": 0.16622939705848694, + "learning_rate": 7.418782904735189e-07, + "loss": 1.7018, + "step": 30846 + }, + { + "epoch": 9.468078575813383, + "grad_norm": 0.1474144607782364, + "learning_rate": 7.410254638890501e-07, + "loss": 1.7278, + "step": 30847 + }, + { + "epoch": 9.468385512584408, + "grad_norm": 0.13024532794952393, + "learning_rate": 7.40173124110638e-07, + "loss": 1.7262, + "step": 30848 + }, + { + "epoch": 9.468692449355434, + "grad_norm": 0.12134112417697906, + "learning_rate": 7.393212711467035e-07, + "loss": 1.722, + "step": 30849 + }, + { + "epoch": 9.468999386126457, + "grad_norm": 0.1322898268699646, + "learning_rate": 7.384699050056731e-07, + "loss": 1.6728, + "step": 30850 + }, + { + "epoch": 9.469306322897483, + "grad_norm": 0.14417654275894165, + "learning_rate": 7.37619025695957e-07, + "loss": 1.667, + "step": 30851 + }, + { + "epoch": 9.469613259668508, + "grad_norm": 0.15618880093097687, + "learning_rate": 7.367686332259538e-07, + "loss": 1.7028, + "step": 30852 + }, + { + "epoch": 9.469920196439533, + "grad_norm": 0.14198319613933563, + "learning_rate": 7.359187276040902e-07, + "loss": 1.6945, + "step": 30853 + }, + { + "epoch": 
9.470227133210559, + "grad_norm": 0.19188794493675232, + "learning_rate": 7.350693088387428e-07, + "loss": 1.7103, + "step": 30854 + }, + { + "epoch": 9.470534069981584, + "grad_norm": 0.11949232220649719, + "learning_rate": 7.342203769383216e-07, + "loss": 1.6757, + "step": 30855 + }, + { + "epoch": 9.47084100675261, + "grad_norm": 0.1419954150915146, + "learning_rate": 7.333719319112031e-07, + "loss": 1.751, + "step": 30856 + }, + { + "epoch": 9.471147943523635, + "grad_norm": 0.13624246418476105, + "learning_rate": 7.325239737657863e-07, + "loss": 1.7212, + "step": 30857 + }, + { + "epoch": 9.47145488029466, + "grad_norm": 0.1910800039768219, + "learning_rate": 7.316765025104422e-07, + "loss": 1.7812, + "step": 30858 + }, + { + "epoch": 9.471761817065685, + "grad_norm": 0.1337525099515915, + "learning_rate": 7.308295181535474e-07, + "loss": 1.7106, + "step": 30859 + }, + { + "epoch": 9.472068753836709, + "grad_norm": 0.1155819520354271, + "learning_rate": 7.299830207034731e-07, + "loss": 1.6483, + "step": 30860 + }, + { + "epoch": 9.472375690607734, + "grad_norm": 0.12981106340885162, + "learning_rate": 7.291370101685846e-07, + "loss": 1.6897, + "step": 30861 + }, + { + "epoch": 9.47268262737876, + "grad_norm": 0.1460549235343933, + "learning_rate": 7.282914865572421e-07, + "loss": 1.7478, + "step": 30862 + }, + { + "epoch": 9.472989564149785, + "grad_norm": 0.14573179185390472, + "learning_rate": 7.274464498778055e-07, + "loss": 1.7013, + "step": 30863 + }, + { + "epoch": 9.47329650092081, + "grad_norm": 0.2089526355266571, + "learning_rate": 7.266019001386182e-07, + "loss": 1.7517, + "step": 30864 + }, + { + "epoch": 9.473603437691835, + "grad_norm": 0.14519059658050537, + "learning_rate": 7.257578373480345e-07, + "loss": 1.7181, + "step": 30865 + }, + { + "epoch": 9.47391037446286, + "grad_norm": 0.17337870597839355, + "learning_rate": 7.249142615143922e-07, + "loss": 1.7488, + "step": 30866 + }, + { + "epoch": 9.474217311233886, + "grad_norm": 
0.1789073348045349, + "learning_rate": 7.240711726460237e-07, + "loss": 1.7748, + "step": 30867 + }, + { + "epoch": 9.474524248004911, + "grad_norm": 0.12607963383197784, + "learning_rate": 7.232285707512664e-07, + "loss": 1.7183, + "step": 30868 + }, + { + "epoch": 9.474831184775937, + "grad_norm": 0.15094679594039917, + "learning_rate": 7.223864558384475e-07, + "loss": 1.7163, + "step": 30869 + }, + { + "epoch": 9.475138121546962, + "grad_norm": 0.12432575970888138, + "learning_rate": 7.215448279158932e-07, + "loss": 1.7252, + "step": 30870 + }, + { + "epoch": 9.475445058317986, + "grad_norm": 0.16342738270759583, + "learning_rate": 7.207036869919082e-07, + "loss": 1.7132, + "step": 30871 + }, + { + "epoch": 9.475751995089011, + "grad_norm": 0.11726677417755127, + "learning_rate": 7.198630330748191e-07, + "loss": 1.6582, + "step": 30872 + }, + { + "epoch": 9.476058931860036, + "grad_norm": 0.13808207213878632, + "learning_rate": 7.190228661729193e-07, + "loss": 1.6936, + "step": 30873 + }, + { + "epoch": 9.476365868631062, + "grad_norm": 0.13612079620361328, + "learning_rate": 7.181831862945298e-07, + "loss": 1.6744, + "step": 30874 + }, + { + "epoch": 9.476672805402087, + "grad_norm": 0.13610509037971497, + "learning_rate": 7.173439934479332e-07, + "loss": 1.7474, + "step": 30875 + }, + { + "epoch": 9.476979742173112, + "grad_norm": 0.17372582852840424, + "learning_rate": 7.165052876414335e-07, + "loss": 1.7328, + "step": 30876 + }, + { + "epoch": 9.477286678944138, + "grad_norm": 0.1300712525844574, + "learning_rate": 7.156670688833078e-07, + "loss": 1.6887, + "step": 30877 + }, + { + "epoch": 9.477593615715163, + "grad_norm": 0.17369040846824646, + "learning_rate": 7.148293371818493e-07, + "loss": 1.7768, + "step": 30878 + }, + { + "epoch": 9.477900552486188, + "grad_norm": 0.15355315804481506, + "learning_rate": 7.139920925453347e-07, + "loss": 1.7148, + "step": 30879 + }, + { + "epoch": 9.478207489257214, + "grad_norm": 0.1690572053194046, + "learning_rate": 
7.131553349820408e-07, + "loss": 1.711, + "step": 30880 + }, + { + "epoch": 9.478514426028239, + "grad_norm": 0.12726818025112152, + "learning_rate": 7.123190645002332e-07, + "loss": 1.6829, + "step": 30881 + }, + { + "epoch": 9.478821362799263, + "grad_norm": 0.12314258515834808, + "learning_rate": 7.114832811081717e-07, + "loss": 1.6579, + "step": 30882 + }, + { + "epoch": 9.479128299570288, + "grad_norm": 0.093282051384449, + "learning_rate": 7.106479848141279e-07, + "loss": 1.6268, + "step": 30883 + }, + { + "epoch": 9.479435236341313, + "grad_norm": 0.14540770649909973, + "learning_rate": 7.098131756263449e-07, + "loss": 1.7037, + "step": 30884 + }, + { + "epoch": 9.479742173112339, + "grad_norm": 0.12486393749713898, + "learning_rate": 7.089788535530828e-07, + "loss": 1.6861, + "step": 30885 + }, + { + "epoch": 9.480049109883364, + "grad_norm": 0.1135348379611969, + "learning_rate": 7.08145018602574e-07, + "loss": 1.6523, + "step": 30886 + }, + { + "epoch": 9.48035604665439, + "grad_norm": 0.18895356357097626, + "learning_rate": 7.073116707830729e-07, + "loss": 1.6879, + "step": 30887 + }, + { + "epoch": 9.480662983425415, + "grad_norm": 0.14413176476955414, + "learning_rate": 7.064788101028063e-07, + "loss": 1.6939, + "step": 30888 + }, + { + "epoch": 9.48096992019644, + "grad_norm": 0.16126643121242523, + "learning_rate": 7.056464365700122e-07, + "loss": 1.7301, + "step": 30889 + }, + { + "epoch": 9.481276856967465, + "grad_norm": 0.1249922662973404, + "learning_rate": 7.048145501929115e-07, + "loss": 1.6933, + "step": 30890 + }, + { + "epoch": 9.48158379373849, + "grad_norm": 0.1359063982963562, + "learning_rate": 7.039831509797202e-07, + "loss": 1.6888, + "step": 30891 + }, + { + "epoch": 9.481890730509516, + "grad_norm": 0.19966992735862732, + "learning_rate": 7.031522389386702e-07, + "loss": 1.7428, + "step": 30892 + }, + { + "epoch": 9.48219766728054, + "grad_norm": 0.10133275389671326, + "learning_rate": 7.023218140779553e-07, + "loss": 1.6776, + 
"step": 30893 + }, + { + "epoch": 9.482504604051565, + "grad_norm": 0.12074444442987442, + "learning_rate": 7.014918764057965e-07, + "loss": 1.732, + "step": 30894 + }, + { + "epoch": 9.48281154082259, + "grad_norm": 0.12305136024951935, + "learning_rate": 7.006624259303873e-07, + "loss": 1.6904, + "step": 30895 + }, + { + "epoch": 9.483118477593615, + "grad_norm": 0.12707793712615967, + "learning_rate": 6.998334626599268e-07, + "loss": 1.6395, + "step": 30896 + }, + { + "epoch": 9.48342541436464, + "grad_norm": 0.22196513414382935, + "learning_rate": 6.990049866026082e-07, + "loss": 1.7651, + "step": 30897 + }, + { + "epoch": 9.483732351135666, + "grad_norm": 0.1324261873960495, + "learning_rate": 6.981769977666197e-07, + "loss": 1.7045, + "step": 30898 + }, + { + "epoch": 9.484039287906691, + "grad_norm": 0.14185984432697296, + "learning_rate": 6.973494961601435e-07, + "loss": 1.713, + "step": 30899 + }, + { + "epoch": 9.484346224677717, + "grad_norm": 0.18500623106956482, + "learning_rate": 6.965224817913507e-07, + "loss": 1.7883, + "step": 30900 + }, + { + "epoch": 9.484653161448742, + "grad_norm": 0.21934804320335388, + "learning_rate": 6.956959546684294e-07, + "loss": 1.7406, + "step": 30901 + }, + { + "epoch": 9.484960098219767, + "grad_norm": 0.10997944325208664, + "learning_rate": 6.948699147995341e-07, + "loss": 1.6881, + "step": 30902 + }, + { + "epoch": 9.485267034990791, + "grad_norm": 0.14439432322978973, + "learning_rate": 6.94044362192825e-07, + "loss": 1.7053, + "step": 30903 + }, + { + "epoch": 9.485573971761816, + "grad_norm": 0.20071901381015778, + "learning_rate": 6.932192968564843e-07, + "loss": 1.7561, + "step": 30904 + }, + { + "epoch": 9.485880908532842, + "grad_norm": 0.1546691358089447, + "learning_rate": 6.92394718798639e-07, + "loss": 1.7001, + "step": 30905 + }, + { + "epoch": 9.486187845303867, + "grad_norm": 0.18300898373126984, + "learning_rate": 6.915706280274547e-07, + "loss": 1.7027, + "step": 30906 + }, + { + "epoch": 
9.486494782074892, + "grad_norm": 0.17844128608703613, + "learning_rate": 6.907470245510639e-07, + "loss": 1.6942, + "step": 30907 + }, + { + "epoch": 9.486801718845918, + "grad_norm": 0.12263448536396027, + "learning_rate": 6.899239083776154e-07, + "loss": 1.6811, + "step": 30908 + }, + { + "epoch": 9.487108655616943, + "grad_norm": 0.10036440938711166, + "learning_rate": 6.891012795152419e-07, + "loss": 1.7091, + "step": 30909 + }, + { + "epoch": 9.487415592387968, + "grad_norm": 0.15316228568553925, + "learning_rate": 6.882791379720699e-07, + "loss": 1.7291, + "step": 30910 + }, + { + "epoch": 9.487722529158994, + "grad_norm": 0.10985010862350464, + "learning_rate": 6.874574837562265e-07, + "loss": 1.6553, + "step": 30911 + }, + { + "epoch": 9.488029465930019, + "grad_norm": 0.1258542537689209, + "learning_rate": 6.866363168758327e-07, + "loss": 1.6887, + "step": 30912 + }, + { + "epoch": 9.488336402701044, + "grad_norm": 0.1341710239648819, + "learning_rate": 6.858156373390045e-07, + "loss": 1.68, + "step": 30913 + }, + { + "epoch": 9.488643339472068, + "grad_norm": 0.12450239062309265, + "learning_rate": 6.849954451538465e-07, + "loss": 1.6523, + "step": 30914 + }, + { + "epoch": 9.488950276243093, + "grad_norm": 0.1216820552945137, + "learning_rate": 6.841757403284687e-07, + "loss": 1.7078, + "step": 30915 + }, + { + "epoch": 9.489257213014119, + "grad_norm": 0.1473001092672348, + "learning_rate": 6.833565228709705e-07, + "loss": 1.6773, + "step": 30916 + }, + { + "epoch": 9.489564149785144, + "grad_norm": 0.14543893933296204, + "learning_rate": 6.825377927894505e-07, + "loss": 1.7538, + "step": 30917 + }, + { + "epoch": 9.48987108655617, + "grad_norm": 0.10436581820249557, + "learning_rate": 6.817195500919915e-07, + "loss": 1.6591, + "step": 30918 + }, + { + "epoch": 9.490178023327195, + "grad_norm": 0.1426854431629181, + "learning_rate": 6.809017947866925e-07, + "loss": 1.7502, + "step": 30919 + }, + { + "epoch": 9.49048496009822, + "grad_norm": 
0.1720554381608963, + "learning_rate": 6.800845268816248e-07, + "loss": 1.7449, + "step": 30920 + }, + { + "epoch": 9.490791896869245, + "grad_norm": 0.16149570047855377, + "learning_rate": 6.792677463848762e-07, + "loss": 1.732, + "step": 30921 + }, + { + "epoch": 9.49109883364027, + "grad_norm": 0.1278751790523529, + "learning_rate": 6.784514533045017e-07, + "loss": 1.6638, + "step": 30922 + }, + { + "epoch": 9.491405770411296, + "grad_norm": 0.16824519634246826, + "learning_rate": 6.77635647648589e-07, + "loss": 1.7073, + "step": 30923 + }, + { + "epoch": 9.491712707182321, + "grad_norm": 0.1375180333852768, + "learning_rate": 6.768203294251818e-07, + "loss": 1.6884, + "step": 30924 + }, + { + "epoch": 9.492019643953345, + "grad_norm": 0.1789846420288086, + "learning_rate": 6.760054986423459e-07, + "loss": 1.7331, + "step": 30925 + }, + { + "epoch": 9.49232658072437, + "grad_norm": 0.17068323493003845, + "learning_rate": 6.751911553081358e-07, + "loss": 1.6998, + "step": 30926 + }, + { + "epoch": 9.492633517495396, + "grad_norm": 0.1423347294330597, + "learning_rate": 6.743772994305952e-07, + "loss": 1.7104, + "step": 30927 + }, + { + "epoch": 9.49294045426642, + "grad_norm": 0.16446225345134735, + "learning_rate": 6.735639310177733e-07, + "loss": 1.7329, + "step": 30928 + }, + { + "epoch": 9.493247391037446, + "grad_norm": 0.12990720570087433, + "learning_rate": 6.727510500776968e-07, + "loss": 1.6933, + "step": 30929 + }, + { + "epoch": 9.493554327808472, + "grad_norm": 0.09939338266849518, + "learning_rate": 6.719386566184093e-07, + "loss": 1.6667, + "step": 30930 + }, + { + "epoch": 9.493861264579497, + "grad_norm": 0.14358317852020264, + "learning_rate": 6.711267506479379e-07, + "loss": 1.7067, + "step": 30931 + }, + { + "epoch": 9.494168201350522, + "grad_norm": 0.15358752012252808, + "learning_rate": 6.703153321743039e-07, + "loss": 1.7861, + "step": 30932 + }, + { + "epoch": 9.494475138121548, + "grad_norm": 0.14822594821453094, + "learning_rate": 
6.695044012055229e-07, + "loss": 1.6869, + "step": 30933 + }, + { + "epoch": 9.494782074892573, + "grad_norm": 0.18011552095413208, + "learning_rate": 6.686939577496165e-07, + "loss": 1.7522, + "step": 30934 + }, + { + "epoch": 9.495089011663598, + "grad_norm": 0.1966308206319809, + "learning_rate": 6.678840018145893e-07, + "loss": 1.7308, + "step": 30935 + }, + { + "epoch": 9.495395948434622, + "grad_norm": 0.19889011979103088, + "learning_rate": 6.670745334084517e-07, + "loss": 1.7796, + "step": 30936 + }, + { + "epoch": 9.495702885205647, + "grad_norm": 0.10640931874513626, + "learning_rate": 6.662655525391859e-07, + "loss": 1.6536, + "step": 30937 + }, + { + "epoch": 9.496009821976672, + "grad_norm": 0.1334729939699173, + "learning_rate": 6.654570592148135e-07, + "loss": 1.7313, + "step": 30938 + }, + { + "epoch": 9.496316758747698, + "grad_norm": 0.1538962870836258, + "learning_rate": 6.646490534433003e-07, + "loss": 1.7212, + "step": 30939 + }, + { + "epoch": 9.496623695518723, + "grad_norm": 0.13990063965320587, + "learning_rate": 6.63841535232651e-07, + "loss": 1.7655, + "step": 30940 + }, + { + "epoch": 9.496930632289748, + "grad_norm": 0.14489619433879852, + "learning_rate": 6.63034504590826e-07, + "loss": 1.7151, + "step": 30941 + }, + { + "epoch": 9.497237569060774, + "grad_norm": 0.14994287490844727, + "learning_rate": 6.622279615258187e-07, + "loss": 1.7778, + "step": 30942 + }, + { + "epoch": 9.497544505831799, + "grad_norm": 0.15099942684173584, + "learning_rate": 6.614219060455895e-07, + "loss": 1.6923, + "step": 30943 + }, + { + "epoch": 9.497851442602824, + "grad_norm": 0.16680224239826202, + "learning_rate": 6.606163381581099e-07, + "loss": 1.6958, + "step": 30944 + }, + { + "epoch": 9.49815837937385, + "grad_norm": 0.1341257095336914, + "learning_rate": 6.598112578713344e-07, + "loss": 1.7436, + "step": 30945 + }, + { + "epoch": 9.498465316144873, + "grad_norm": 0.14608977735042572, + "learning_rate": 6.590066651932237e-07, + "loss": 1.7464, + 
"step": 30946 + }, + { + "epoch": 9.498772252915899, + "grad_norm": 0.22711209952831268, + "learning_rate": 6.582025601317321e-07, + "loss": 1.6592, + "step": 30947 + }, + { + "epoch": 9.499079189686924, + "grad_norm": 0.11007440835237503, + "learning_rate": 6.573989426948035e-07, + "loss": 1.6821, + "step": 30948 + }, + { + "epoch": 9.49938612645795, + "grad_norm": 0.135493203997612, + "learning_rate": 6.56595812890376e-07, + "loss": 1.6916, + "step": 30949 + }, + { + "epoch": 9.499693063228975, + "grad_norm": 0.11300768703222275, + "learning_rate": 6.557931707263875e-07, + "loss": 1.6867, + "step": 30950 + }, + { + "epoch": 9.5, + "grad_norm": 0.14597927033901215, + "learning_rate": 6.549910162107764e-07, + "loss": 1.682, + "step": 30951 + }, + { + "epoch": 9.500306936771025, + "grad_norm": 0.17950420081615448, + "learning_rate": 6.54189349351464e-07, + "loss": 1.7943, + "step": 30952 + }, + { + "epoch": 9.50061387354205, + "grad_norm": 0.1679387390613556, + "learning_rate": 6.533881701563771e-07, + "loss": 1.7104, + "step": 30953 + }, + { + "epoch": 9.500920810313076, + "grad_norm": 0.12778639793395996, + "learning_rate": 6.525874786334263e-07, + "loss": 1.6447, + "step": 30954 + }, + { + "epoch": 9.501227747084101, + "grad_norm": 0.17508088052272797, + "learning_rate": 6.517872747905384e-07, + "loss": 1.7618, + "step": 30955 + }, + { + "epoch": 9.501534683855127, + "grad_norm": 0.1603916436433792, + "learning_rate": 6.509875586356073e-07, + "loss": 1.7083, + "step": 30956 + }, + { + "epoch": 9.50184162062615, + "grad_norm": 0.15757711231708527, + "learning_rate": 6.501883301765432e-07, + "loss": 1.7301, + "step": 30957 + }, + { + "epoch": 9.502148557397176, + "grad_norm": 0.12382685393095016, + "learning_rate": 6.493895894212399e-07, + "loss": 1.6719, + "step": 30958 + }, + { + "epoch": 9.502455494168201, + "grad_norm": 0.16945087909698486, + "learning_rate": 6.485913363775964e-07, + "loss": 1.6606, + "step": 30959 + }, + { + "epoch": 9.502762430939226, + 
"grad_norm": 0.157539501786232, + "learning_rate": 6.477935710534955e-07, + "loss": 1.7092, + "step": 30960 + }, + { + "epoch": 9.503069367710252, + "grad_norm": 0.11866376549005508, + "learning_rate": 6.469962934568308e-07, + "loss": 1.6525, + "step": 30961 + }, + { + "epoch": 9.503376304481277, + "grad_norm": 0.15672917664051056, + "learning_rate": 6.461995035954737e-07, + "loss": 1.7218, + "step": 30962 + }, + { + "epoch": 9.503683241252302, + "grad_norm": 0.10983888059854507, + "learning_rate": 6.454032014772959e-07, + "loss": 1.6658, + "step": 30963 + }, + { + "epoch": 9.503990178023328, + "grad_norm": 0.14017660915851593, + "learning_rate": 6.446073871101744e-07, + "loss": 1.7096, + "step": 30964 + }, + { + "epoch": 9.504297114794353, + "grad_norm": 0.14705055952072144, + "learning_rate": 6.438120605019693e-07, + "loss": 1.7113, + "step": 30965 + }, + { + "epoch": 9.504604051565378, + "grad_norm": 0.13271331787109375, + "learning_rate": 6.430172216605468e-07, + "loss": 1.6483, + "step": 30966 + }, + { + "epoch": 9.504910988336402, + "grad_norm": 0.13414405286312103, + "learning_rate": 6.422228705937505e-07, + "loss": 1.7011, + "step": 30967 + }, + { + "epoch": 9.505217925107427, + "grad_norm": 0.12676768004894257, + "learning_rate": 6.414290073094409e-07, + "loss": 1.6963, + "step": 30968 + }, + { + "epoch": 9.505524861878452, + "grad_norm": 0.1459144800901413, + "learning_rate": 6.406356318154616e-07, + "loss": 1.7426, + "step": 30969 + }, + { + "epoch": 9.505831798649478, + "grad_norm": 0.13834135234355927, + "learning_rate": 6.398427441196509e-07, + "loss": 1.7045, + "step": 30970 + }, + { + "epoch": 9.506138735420503, + "grad_norm": 0.1961667686700821, + "learning_rate": 6.390503442298413e-07, + "loss": 1.7121, + "step": 30971 + }, + { + "epoch": 9.506445672191528, + "grad_norm": 0.10918349772691727, + "learning_rate": 6.382584321538709e-07, + "loss": 1.6482, + "step": 30972 + }, + { + "epoch": 9.506752608962554, + "grad_norm": 0.16137553751468658, + 
"learning_rate": 6.37467007899556e-07, + "loss": 1.7211, + "step": 30973 + }, + { + "epoch": 9.50705954573358, + "grad_norm": 0.14611978828907013, + "learning_rate": 6.366760714747344e-07, + "loss": 1.7762, + "step": 30974 + }, + { + "epoch": 9.507366482504604, + "grad_norm": 0.1840377300977707, + "learning_rate": 6.358856228872057e-07, + "loss": 1.7359, + "step": 30975 + }, + { + "epoch": 9.50767341927563, + "grad_norm": 0.15308772027492523, + "learning_rate": 6.350956621447968e-07, + "loss": 1.6772, + "step": 30976 + }, + { + "epoch": 9.507980356046655, + "grad_norm": 0.09826724231243134, + "learning_rate": 6.34306189255296e-07, + "loss": 1.6529, + "step": 30977 + }, + { + "epoch": 9.50828729281768, + "grad_norm": 0.135554239153862, + "learning_rate": 6.335172042265192e-07, + "loss": 1.6707, + "step": 30978 + }, + { + "epoch": 9.508594229588704, + "grad_norm": 0.13289806246757507, + "learning_rate": 6.327287070662658e-07, + "loss": 1.7411, + "step": 30979 + }, + { + "epoch": 9.50890116635973, + "grad_norm": 0.11493640393018723, + "learning_rate": 6.319406977823128e-07, + "loss": 1.6771, + "step": 30980 + }, + { + "epoch": 9.509208103130755, + "grad_norm": 0.17868508398532867, + "learning_rate": 6.311531763824596e-07, + "loss": 1.7614, + "step": 30981 + }, + { + "epoch": 9.50951503990178, + "grad_norm": 0.1414751559495926, + "learning_rate": 6.303661428744889e-07, + "loss": 1.7023, + "step": 30982 + }, + { + "epoch": 9.509821976672805, + "grad_norm": 0.11903268843889236, + "learning_rate": 6.295795972661777e-07, + "loss": 1.7241, + "step": 30983 + }, + { + "epoch": 9.51012891344383, + "grad_norm": 0.12880147993564606, + "learning_rate": 6.287935395652977e-07, + "loss": 1.6842, + "step": 30984 + }, + { + "epoch": 9.510435850214856, + "grad_norm": 0.11090810596942902, + "learning_rate": 6.280079697796148e-07, + "loss": 1.6561, + "step": 30985 + }, + { + "epoch": 9.510742786985881, + "grad_norm": 0.12199088931083679, + "learning_rate": 6.272228879168951e-07, + 
"loss": 1.6541, + "step": 30986 + }, + { + "epoch": 9.511049723756907, + "grad_norm": 0.19049455225467682, + "learning_rate": 6.264382939848989e-07, + "loss": 1.7782, + "step": 30987 + }, + { + "epoch": 9.511356660527932, + "grad_norm": 0.14614251255989075, + "learning_rate": 6.256541879913813e-07, + "loss": 1.733, + "step": 30988 + }, + { + "epoch": 9.511663597298956, + "grad_norm": 0.13675597310066223, + "learning_rate": 6.24870569944086e-07, + "loss": 1.6957, + "step": 30989 + }, + { + "epoch": 9.511970534069981, + "grad_norm": 0.11168385297060013, + "learning_rate": 6.240874398507513e-07, + "loss": 1.6745, + "step": 30990 + }, + { + "epoch": 9.512277470841006, + "grad_norm": 0.13322143256664276, + "learning_rate": 6.233047977191375e-07, + "loss": 1.6831, + "step": 30991 + }, + { + "epoch": 9.512584407612032, + "grad_norm": 0.16648098826408386, + "learning_rate": 6.225226435569553e-07, + "loss": 1.7878, + "step": 30992 + }, + { + "epoch": 9.512891344383057, + "grad_norm": 0.16310833394527435, + "learning_rate": 6.21740977371954e-07, + "loss": 1.6797, + "step": 30993 + }, + { + "epoch": 9.513198281154082, + "grad_norm": 0.13475677371025085, + "learning_rate": 6.209597991718441e-07, + "loss": 1.7132, + "step": 30994 + }, + { + "epoch": 9.513505217925108, + "grad_norm": 0.1621815413236618, + "learning_rate": 6.201791089643528e-07, + "loss": 1.7452, + "step": 30995 + }, + { + "epoch": 9.513812154696133, + "grad_norm": 0.11439715325832367, + "learning_rate": 6.193989067571959e-07, + "loss": 1.707, + "step": 30996 + }, + { + "epoch": 9.514119091467158, + "grad_norm": 0.182517409324646, + "learning_rate": 6.186191925580786e-07, + "loss": 1.6871, + "step": 30997 + }, + { + "epoch": 9.514426028238184, + "grad_norm": 0.1009940356016159, + "learning_rate": 6.17839966374717e-07, + "loss": 1.6409, + "step": 30998 + }, + { + "epoch": 9.514732965009209, + "grad_norm": 0.22212521731853485, + "learning_rate": 6.170612282147936e-07, + "loss": 1.7206, + "step": 30999 + }, + { + 
"epoch": 9.515039901780233, + "grad_norm": 0.1333693414926529, + "learning_rate": 6.162829780860247e-07, + "loss": 1.6929, + "step": 31000 + }, + { + "epoch": 9.515346838551258, + "grad_norm": 0.12046591937541962, + "learning_rate": 6.155052159960873e-07, + "loss": 1.6484, + "step": 31001 + }, + { + "epoch": 9.515653775322283, + "grad_norm": 0.13430583477020264, + "learning_rate": 6.147279419526753e-07, + "loss": 1.6677, + "step": 31002 + }, + { + "epoch": 9.515960712093309, + "grad_norm": 0.12045972794294357, + "learning_rate": 6.139511559634659e-07, + "loss": 1.7014, + "step": 31003 + }, + { + "epoch": 9.516267648864334, + "grad_norm": 0.1649526059627533, + "learning_rate": 6.131748580361363e-07, + "loss": 1.7326, + "step": 31004 + }, + { + "epoch": 9.51657458563536, + "grad_norm": 0.1313924789428711, + "learning_rate": 6.123990481783636e-07, + "loss": 1.7441, + "step": 31005 + }, + { + "epoch": 9.516881522406385, + "grad_norm": 0.145765021443367, + "learning_rate": 6.116237263978031e-07, + "loss": 1.7383, + "step": 31006 + }, + { + "epoch": 9.51718845917741, + "grad_norm": 0.14247392117977142, + "learning_rate": 6.108488927021261e-07, + "loss": 1.6956, + "step": 31007 + }, + { + "epoch": 9.517495395948435, + "grad_norm": 0.12804681062698364, + "learning_rate": 6.100745470989933e-07, + "loss": 1.6864, + "step": 31008 + }, + { + "epoch": 9.51780233271946, + "grad_norm": 0.15574663877487183, + "learning_rate": 6.093006895960485e-07, + "loss": 1.6709, + "step": 31009 + }, + { + "epoch": 9.518109269490484, + "grad_norm": 0.14249230921268463, + "learning_rate": 6.085273202009467e-07, + "loss": 1.7125, + "step": 31010 + }, + { + "epoch": 9.51841620626151, + "grad_norm": 0.13120415806770325, + "learning_rate": 6.077544389213207e-07, + "loss": 1.6875, + "step": 31011 + }, + { + "epoch": 9.518723143032535, + "grad_norm": 0.11910203844308853, + "learning_rate": 6.069820457648201e-07, + "loss": 1.7113, + "step": 31012 + }, + { + "epoch": 9.51903007980356, + "grad_norm": 
0.13545389473438263, + "learning_rate": 6.062101407390775e-07, + "loss": 1.7356, + "step": 31013 + }, + { + "epoch": 9.519337016574585, + "grad_norm": 0.1885189414024353, + "learning_rate": 6.05438723851709e-07, + "loss": 1.7558, + "step": 31014 + }, + { + "epoch": 9.51964395334561, + "grad_norm": 0.1113700196146965, + "learning_rate": 6.04667795110353e-07, + "loss": 1.6546, + "step": 31015 + }, + { + "epoch": 9.519950890116636, + "grad_norm": 0.18005676567554474, + "learning_rate": 6.038973545226089e-07, + "loss": 1.7657, + "step": 31016 + }, + { + "epoch": 9.520257826887661, + "grad_norm": 0.12435733526945114, + "learning_rate": 6.031274020961152e-07, + "loss": 1.7219, + "step": 31017 + }, + { + "epoch": 9.520564763658687, + "grad_norm": 0.20083987712860107, + "learning_rate": 6.023579378384659e-07, + "loss": 1.7779, + "step": 31018 + }, + { + "epoch": 9.520871700429712, + "grad_norm": 0.15939640998840332, + "learning_rate": 6.015889617572656e-07, + "loss": 1.6895, + "step": 31019 + }, + { + "epoch": 9.521178637200737, + "grad_norm": 0.20790094137191772, + "learning_rate": 6.008204738601198e-07, + "loss": 1.7553, + "step": 31020 + }, + { + "epoch": 9.521485573971761, + "grad_norm": 0.10034120082855225, + "learning_rate": 6.000524741546165e-07, + "loss": 1.6232, + "step": 31021 + }, + { + "epoch": 9.521792510742786, + "grad_norm": 0.11239612102508545, + "learning_rate": 5.992849626483498e-07, + "loss": 1.6838, + "step": 31022 + }, + { + "epoch": 9.522099447513812, + "grad_norm": 0.13167715072631836, + "learning_rate": 5.985179393489083e-07, + "loss": 1.6638, + "step": 31023 + }, + { + "epoch": 9.522406384284837, + "grad_norm": 0.1241912767291069, + "learning_rate": 5.977514042638577e-07, + "loss": 1.6935, + "step": 31024 + }, + { + "epoch": 9.522713321055862, + "grad_norm": 0.125594824552536, + "learning_rate": 5.969853574007922e-07, + "loss": 1.6935, + "step": 31025 + }, + { + "epoch": 9.523020257826888, + "grad_norm": 0.1614350974559784, + "learning_rate": 
5.962197987672668e-07, + "loss": 1.7097, + "step": 31026 + }, + { + "epoch": 9.523327194597913, + "grad_norm": 0.15176361799240112, + "learning_rate": 5.954547283708644e-07, + "loss": 1.6911, + "step": 31027 + }, + { + "epoch": 9.523634131368938, + "grad_norm": 0.09742459654808044, + "learning_rate": 5.946901462191234e-07, + "loss": 1.6243, + "step": 31028 + }, + { + "epoch": 9.523941068139964, + "grad_norm": 0.15997633337974548, + "learning_rate": 5.939260523196155e-07, + "loss": 1.7174, + "step": 31029 + }, + { + "epoch": 9.524248004910989, + "grad_norm": 0.21839283406734467, + "learning_rate": 5.931624466798957e-07, + "loss": 1.8457, + "step": 31030 + }, + { + "epoch": 9.524554941682014, + "grad_norm": 0.16808728873729706, + "learning_rate": 5.923993293074914e-07, + "loss": 1.7274, + "step": 31031 + }, + { + "epoch": 9.524861878453038, + "grad_norm": 0.11654167622327805, + "learning_rate": 5.916367002099688e-07, + "loss": 1.6902, + "step": 31032 + }, + { + "epoch": 9.525168815224063, + "grad_norm": 0.12978383898735046, + "learning_rate": 5.908745593948383e-07, + "loss": 1.7095, + "step": 31033 + }, + { + "epoch": 9.525475751995089, + "grad_norm": 0.13306757807731628, + "learning_rate": 5.901129068696498e-07, + "loss": 1.7218, + "step": 31034 + }, + { + "epoch": 9.525782688766114, + "grad_norm": 0.20930147171020508, + "learning_rate": 5.893517426419304e-07, + "loss": 1.8132, + "step": 31035 + }, + { + "epoch": 9.52608962553714, + "grad_norm": 0.14664147794246674, + "learning_rate": 5.885910667191907e-07, + "loss": 1.6764, + "step": 31036 + }, + { + "epoch": 9.526396562308165, + "grad_norm": 0.20831573009490967, + "learning_rate": 5.878308791089582e-07, + "loss": 1.7434, + "step": 31037 + }, + { + "epoch": 9.52670349907919, + "grad_norm": 0.13942310214042664, + "learning_rate": 5.870711798187433e-07, + "loss": 1.7272, + "step": 31038 + }, + { + "epoch": 9.527010435850215, + "grad_norm": 0.15469035506248474, + "learning_rate": 5.863119688560514e-07, + "loss": 
1.6838, + "step": 31039 + }, + { + "epoch": 9.52731737262124, + "grad_norm": 0.13903473317623138, + "learning_rate": 5.855532462283875e-07, + "loss": 1.7166, + "step": 31040 + }, + { + "epoch": 9.527624309392266, + "grad_norm": 0.12209124863147736, + "learning_rate": 5.847950119432455e-07, + "loss": 1.6259, + "step": 31041 + }, + { + "epoch": 9.527931246163291, + "grad_norm": 0.09797443449497223, + "learning_rate": 5.840372660081251e-07, + "loss": 1.676, + "step": 31042 + }, + { + "epoch": 9.528238182934315, + "grad_norm": 0.14228491485118866, + "learning_rate": 5.83280008430509e-07, + "loss": 1.7104, + "step": 31043 + }, + { + "epoch": 9.52854511970534, + "grad_norm": 0.1535727083683014, + "learning_rate": 5.825232392178914e-07, + "loss": 1.7169, + "step": 31044 + }, + { + "epoch": 9.528852056476365, + "grad_norm": 0.14102879166603088, + "learning_rate": 5.817669583777386e-07, + "loss": 1.7182, + "step": 31045 + }, + { + "epoch": 9.52915899324739, + "grad_norm": 0.17063194513320923, + "learning_rate": 5.810111659175333e-07, + "loss": 1.7164, + "step": 31046 + }, + { + "epoch": 9.529465930018416, + "grad_norm": 0.15687642991542816, + "learning_rate": 5.802558618447418e-07, + "loss": 1.7198, + "step": 31047 + }, + { + "epoch": 9.529772866789441, + "grad_norm": 0.18693117797374725, + "learning_rate": 5.795010461668193e-07, + "loss": 1.7213, + "step": 31048 + }, + { + "epoch": 9.530079803560467, + "grad_norm": 0.14518466591835022, + "learning_rate": 5.787467188912432e-07, + "loss": 1.7147, + "step": 31049 + }, + { + "epoch": 9.530386740331492, + "grad_norm": 0.14564110338687897, + "learning_rate": 5.77992880025452e-07, + "loss": 1.7007, + "step": 31050 + }, + { + "epoch": 9.530693677102517, + "grad_norm": 0.14775414764881134, + "learning_rate": 5.772395295769007e-07, + "loss": 1.6947, + "step": 31051 + }, + { + "epoch": 9.531000613873543, + "grad_norm": 0.18668405711650848, + "learning_rate": 5.76486667553039e-07, + "loss": 1.7567, + "step": 31052 + }, + { + "epoch": 
9.531307550644566, + "grad_norm": 0.12053389847278595, + "learning_rate": 5.757342939613053e-07, + "loss": 1.6763, + "step": 31053 + }, + { + "epoch": 9.531614487415592, + "grad_norm": 0.13077262043952942, + "learning_rate": 5.749824088091382e-07, + "loss": 1.6783, + "step": 31054 + }, + { + "epoch": 9.531921424186617, + "grad_norm": 0.1928776055574417, + "learning_rate": 5.742310121039596e-07, + "loss": 1.7303, + "step": 31055 + }, + { + "epoch": 9.532228360957642, + "grad_norm": 0.13202275335788727, + "learning_rate": 5.734801038531967e-07, + "loss": 1.7008, + "step": 31056 + }, + { + "epoch": 9.532535297728668, + "grad_norm": 0.1478370577096939, + "learning_rate": 5.72729684064277e-07, + "loss": 1.7352, + "step": 31057 + }, + { + "epoch": 9.532842234499693, + "grad_norm": 0.1766318529844284, + "learning_rate": 5.719797527446058e-07, + "loss": 1.7635, + "step": 31058 + }, + { + "epoch": 9.533149171270718, + "grad_norm": 0.13437522947788239, + "learning_rate": 5.712303099016103e-07, + "loss": 1.6607, + "step": 31059 + }, + { + "epoch": 9.533456108041744, + "grad_norm": 0.1521230787038803, + "learning_rate": 5.704813555426847e-07, + "loss": 1.6922, + "step": 31060 + }, + { + "epoch": 9.533763044812769, + "grad_norm": 0.14926433563232422, + "learning_rate": 5.697328896752341e-07, + "loss": 1.7459, + "step": 31061 + }, + { + "epoch": 9.534069981583794, + "grad_norm": 0.14931491017341614, + "learning_rate": 5.689849123066526e-07, + "loss": 1.7042, + "step": 31062 + }, + { + "epoch": 9.53437691835482, + "grad_norm": 0.1489458531141281, + "learning_rate": 5.682374234443344e-07, + "loss": 1.7353, + "step": 31063 + }, + { + "epoch": 9.534683855125843, + "grad_norm": 0.1196800023317337, + "learning_rate": 5.674904230956735e-07, + "loss": 1.6928, + "step": 31064 + }, + { + "epoch": 9.534990791896869, + "grad_norm": 0.11683658510446548, + "learning_rate": 5.667439112680417e-07, + "loss": 1.6829, + "step": 31065 + }, + { + "epoch": 9.535297728667894, + "grad_norm": 
0.14123310148715973, + "learning_rate": 5.659978879688221e-07, + "loss": 1.6971, + "step": 31066 + }, + { + "epoch": 9.53560466543892, + "grad_norm": 0.13458828628063202, + "learning_rate": 5.652523532053811e-07, + "loss": 1.7138, + "step": 31067 + }, + { + "epoch": 9.535911602209945, + "grad_norm": 0.1536986231803894, + "learning_rate": 5.645073069850903e-07, + "loss": 1.7484, + "step": 31068 + }, + { + "epoch": 9.53621853898097, + "grad_norm": 0.12006396800279617, + "learning_rate": 5.637627493153164e-07, + "loss": 1.669, + "step": 31069 + }, + { + "epoch": 9.536525475751995, + "grad_norm": 0.1644553393125534, + "learning_rate": 5.630186802034143e-07, + "loss": 1.6823, + "step": 31070 + }, + { + "epoch": 9.53683241252302, + "grad_norm": 0.12596864998340607, + "learning_rate": 5.622750996567395e-07, + "loss": 1.7111, + "step": 31071 + }, + { + "epoch": 9.537139349294046, + "grad_norm": 0.14411930739879608, + "learning_rate": 5.615320076826358e-07, + "loss": 1.7183, + "step": 31072 + }, + { + "epoch": 9.537446286065071, + "grad_norm": 0.14459045231342316, + "learning_rate": 5.607894042884531e-07, + "loss": 1.7366, + "step": 31073 + }, + { + "epoch": 9.537753222836095, + "grad_norm": 0.12643924355506897, + "learning_rate": 5.600472894815245e-07, + "loss": 1.6678, + "step": 31074 + }, + { + "epoch": 9.53806015960712, + "grad_norm": 0.13994373381137848, + "learning_rate": 5.593056632691829e-07, + "loss": 1.7825, + "step": 31075 + }, + { + "epoch": 9.538367096378146, + "grad_norm": 0.11746983230113983, + "learning_rate": 5.585645256587668e-07, + "loss": 1.688, + "step": 31076 + }, + { + "epoch": 9.53867403314917, + "grad_norm": 0.13083167374134064, + "learning_rate": 5.578238766575871e-07, + "loss": 1.7332, + "step": 31077 + }, + { + "epoch": 9.538980969920196, + "grad_norm": 0.12449757009744644, + "learning_rate": 5.57083716272977e-07, + "loss": 1.7419, + "step": 31078 + }, + { + "epoch": 9.539287906691222, + "grad_norm": 0.11567985266447067, + "learning_rate": 
5.563440445122415e-07, + "loss": 1.6608, + "step": 31079 + }, + { + "epoch": 9.539594843462247, + "grad_norm": 0.10740742087364197, + "learning_rate": 5.55604861382697e-07, + "loss": 1.6866, + "step": 31080 + }, + { + "epoch": 9.539901780233272, + "grad_norm": 0.1555785983800888, + "learning_rate": 5.548661668916489e-07, + "loss": 1.6391, + "step": 31081 + }, + { + "epoch": 9.540208717004298, + "grad_norm": 0.12961047887802124, + "learning_rate": 5.541279610463857e-07, + "loss": 1.6912, + "step": 31082 + }, + { + "epoch": 9.540515653775323, + "grad_norm": 0.17427892982959747, + "learning_rate": 5.533902438542183e-07, + "loss": 1.7527, + "step": 31083 + }, + { + "epoch": 9.540822590546348, + "grad_norm": 0.137424036860466, + "learning_rate": 5.526530153224241e-07, + "loss": 1.7119, + "step": 31084 + }, + { + "epoch": 9.541129527317374, + "grad_norm": 0.15986669063568115, + "learning_rate": 5.519162754582974e-07, + "loss": 1.7379, + "step": 31085 + }, + { + "epoch": 9.541436464088397, + "grad_norm": 0.34904229640960693, + "learning_rate": 5.511800242691157e-07, + "loss": 1.776, + "step": 31086 + }, + { + "epoch": 9.541743400859422, + "grad_norm": 0.10629575699567795, + "learning_rate": 5.504442617621563e-07, + "loss": 1.6572, + "step": 31087 + }, + { + "epoch": 9.542050337630448, + "grad_norm": 0.14238065481185913, + "learning_rate": 5.497089879446915e-07, + "loss": 1.6707, + "step": 31088 + }, + { + "epoch": 9.542357274401473, + "grad_norm": 0.14475369453430176, + "learning_rate": 5.48974202823982e-07, + "loss": 1.718, + "step": 31089 + }, + { + "epoch": 9.542664211172498, + "grad_norm": 0.17306506633758545, + "learning_rate": 5.482399064072996e-07, + "loss": 1.7558, + "step": 31090 + }, + { + "epoch": 9.542971147943524, + "grad_norm": 0.10227597504854202, + "learning_rate": 5.475060987018943e-07, + "loss": 1.6278, + "step": 31091 + }, + { + "epoch": 9.543278084714549, + "grad_norm": 0.15417295694351196, + "learning_rate": 5.467727797150102e-07, + "loss": 1.7579, + 
"step": 31092 + }, + { + "epoch": 9.543585021485574, + "grad_norm": 0.1255696415901184, + "learning_rate": 5.460399494539136e-07, + "loss": 1.7061, + "step": 31093 + }, + { + "epoch": 9.5438919582566, + "grad_norm": 0.14167217910289764, + "learning_rate": 5.453076079258268e-07, + "loss": 1.7164, + "step": 31094 + }, + { + "epoch": 9.544198895027625, + "grad_norm": 0.16300976276397705, + "learning_rate": 5.445757551380048e-07, + "loss": 1.7092, + "step": 31095 + }, + { + "epoch": 9.544505831798649, + "grad_norm": 0.12125522643327713, + "learning_rate": 5.438443910976699e-07, + "loss": 1.697, + "step": 31096 + }, + { + "epoch": 9.544812768569674, + "grad_norm": 0.15089687705039978, + "learning_rate": 5.431135158120493e-07, + "loss": 1.6578, + "step": 31097 + }, + { + "epoch": 9.5451197053407, + "grad_norm": 0.17200914025306702, + "learning_rate": 5.423831292883708e-07, + "loss": 1.689, + "step": 31098 + }, + { + "epoch": 9.545426642111725, + "grad_norm": 0.14511042833328247, + "learning_rate": 5.416532315338508e-07, + "loss": 1.7301, + "step": 31099 + }, + { + "epoch": 9.54573357888275, + "grad_norm": 0.12074702233076096, + "learning_rate": 5.409238225557001e-07, + "loss": 1.6845, + "step": 31100 + }, + { + "epoch": 9.546040515653775, + "grad_norm": 0.12752333283424377, + "learning_rate": 5.401949023611297e-07, + "loss": 1.7021, + "step": 31101 + }, + { + "epoch": 9.5463474524248, + "grad_norm": 0.1448252946138382, + "learning_rate": 5.394664709573394e-07, + "loss": 1.7037, + "step": 31102 + }, + { + "epoch": 9.546654389195826, + "grad_norm": 0.17957226932048798, + "learning_rate": 5.387385283515345e-07, + "loss": 1.7527, + "step": 31103 + }, + { + "epoch": 9.546961325966851, + "grad_norm": 0.13432875275611877, + "learning_rate": 5.380110745509093e-07, + "loss": 1.7244, + "step": 31104 + }, + { + "epoch": 9.547268262737877, + "grad_norm": 0.13721013069152832, + "learning_rate": 5.372841095626413e-07, + "loss": 1.6907, + "step": 31105 + }, + { + "epoch": 
9.547575199508902, + "grad_norm": 0.14336919784545898, + "learning_rate": 5.365576333939304e-07, + "loss": 1.7179, + "step": 31106 + }, + { + "epoch": 9.547882136279926, + "grad_norm": 0.13788890838623047, + "learning_rate": 5.358316460519431e-07, + "loss": 1.7157, + "step": 31107 + }, + { + "epoch": 9.54818907305095, + "grad_norm": 0.15330001711845398, + "learning_rate": 5.351061475438623e-07, + "loss": 1.7515, + "step": 31108 + }, + { + "epoch": 9.548496009821976, + "grad_norm": 0.11875810474157333, + "learning_rate": 5.343811378768492e-07, + "loss": 1.6855, + "step": 31109 + }, + { + "epoch": 9.548802946593002, + "grad_norm": 0.1445886343717575, + "learning_rate": 5.336566170580814e-07, + "loss": 1.7519, + "step": 31110 + }, + { + "epoch": 9.549109883364027, + "grad_norm": 0.2866973578929901, + "learning_rate": 5.329325850947087e-07, + "loss": 1.6697, + "step": 31111 + }, + { + "epoch": 9.549416820135052, + "grad_norm": 0.15357863903045654, + "learning_rate": 5.322090419938919e-07, + "loss": 1.7397, + "step": 31112 + }, + { + "epoch": 9.549723756906078, + "grad_norm": 0.12374851852655411, + "learning_rate": 5.314859877627754e-07, + "loss": 1.7267, + "step": 31113 + }, + { + "epoch": 9.550030693677103, + "grad_norm": 0.12979474663734436, + "learning_rate": 5.307634224085145e-07, + "loss": 1.7158, + "step": 31114 + }, + { + "epoch": 9.550337630448128, + "grad_norm": 0.10462703555822372, + "learning_rate": 5.300413459382425e-07, + "loss": 1.6312, + "step": 31115 + }, + { + "epoch": 9.550644567219154, + "grad_norm": 0.11557597666978836, + "learning_rate": 5.293197583590926e-07, + "loss": 1.6961, + "step": 31116 + }, + { + "epoch": 9.550951503990177, + "grad_norm": 0.13233163952827454, + "learning_rate": 5.285986596782089e-07, + "loss": 1.6665, + "step": 31117 + }, + { + "epoch": 9.551258440761202, + "grad_norm": 0.13464027643203735, + "learning_rate": 5.278780499027025e-07, + "loss": 1.709, + "step": 31118 + }, + { + "epoch": 9.551565377532228, + "grad_norm": 
0.15500648319721222, + "learning_rate": 5.27157929039701e-07, + "loss": 1.7837, + "step": 31119 + }, + { + "epoch": 9.551872314303253, + "grad_norm": 0.10849796235561371, + "learning_rate": 5.264382970963267e-07, + "loss": 1.6777, + "step": 31120 + }, + { + "epoch": 9.552179251074278, + "grad_norm": 0.12520049512386322, + "learning_rate": 5.25719154079679e-07, + "loss": 1.6912, + "step": 31121 + }, + { + "epoch": 9.552486187845304, + "grad_norm": 0.171976700425148, + "learning_rate": 5.250004999968806e-07, + "loss": 1.7431, + "step": 31122 + }, + { + "epoch": 9.55279312461633, + "grad_norm": 0.15759800374507904, + "learning_rate": 5.242823348550197e-07, + "loss": 1.7266, + "step": 31123 + }, + { + "epoch": 9.553100061387354, + "grad_norm": 0.14026059210300446, + "learning_rate": 5.235646586612075e-07, + "loss": 1.6999, + "step": 31124 + }, + { + "epoch": 9.55340699815838, + "grad_norm": 0.16142502427101135, + "learning_rate": 5.228474714225218e-07, + "loss": 1.7189, + "step": 31125 + }, + { + "epoch": 9.553713934929405, + "grad_norm": 0.19895243644714355, + "learning_rate": 5.221307731460567e-07, + "loss": 1.7703, + "step": 31126 + }, + { + "epoch": 9.55402087170043, + "grad_norm": 0.12162072211503983, + "learning_rate": 5.214145638388956e-07, + "loss": 1.6722, + "step": 31127 + }, + { + "epoch": 9.554327808471456, + "grad_norm": 0.15602703392505646, + "learning_rate": 5.206988435081162e-07, + "loss": 1.7385, + "step": 31128 + }, + { + "epoch": 9.55463474524248, + "grad_norm": 0.14179575443267822, + "learning_rate": 5.199836121607959e-07, + "loss": 1.7018, + "step": 31129 + }, + { + "epoch": 9.554941682013505, + "grad_norm": 0.1313495635986328, + "learning_rate": 5.192688698039904e-07, + "loss": 1.6959, + "step": 31130 + }, + { + "epoch": 9.55524861878453, + "grad_norm": 0.10791079699993134, + "learning_rate": 5.185546164447774e-07, + "loss": 1.6555, + "step": 31131 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 0.14998406171798706, + "learning_rate": 
5.178408520902123e-07, + "loss": 1.7, + "step": 31132 + }, + { + "epoch": 9.55586249232658, + "grad_norm": 0.1362425684928894, + "learning_rate": 5.171275767473394e-07, + "loss": 1.6853, + "step": 31133 + }, + { + "epoch": 9.556169429097606, + "grad_norm": 0.1443333774805069, + "learning_rate": 5.164147904232197e-07, + "loss": 1.7404, + "step": 31134 + }, + { + "epoch": 9.556476365868631, + "grad_norm": 0.14398255944252014, + "learning_rate": 5.157024931248866e-07, + "loss": 1.6841, + "step": 31135 + }, + { + "epoch": 9.556783302639657, + "grad_norm": 0.1562454253435135, + "learning_rate": 5.149906848593899e-07, + "loss": 1.7195, + "step": 31136 + }, + { + "epoch": 9.557090239410682, + "grad_norm": 0.10564878582954407, + "learning_rate": 5.142793656337575e-07, + "loss": 1.6851, + "step": 31137 + }, + { + "epoch": 9.557397176181707, + "grad_norm": 0.15394751727581024, + "learning_rate": 5.135685354550223e-07, + "loss": 1.7149, + "step": 31138 + }, + { + "epoch": 9.557704112952731, + "grad_norm": 0.17012141644954681, + "learning_rate": 5.128581943302069e-07, + "loss": 1.7559, + "step": 31139 + }, + { + "epoch": 9.558011049723756, + "grad_norm": 0.14832472801208496, + "learning_rate": 5.121483422663332e-07, + "loss": 1.7165, + "step": 31140 + }, + { + "epoch": 9.558317986494782, + "grad_norm": 0.16663455963134766, + "learning_rate": 5.114389792704177e-07, + "loss": 1.7719, + "step": 31141 + }, + { + "epoch": 9.558624923265807, + "grad_norm": 0.15087881684303284, + "learning_rate": 5.107301053494607e-07, + "loss": 1.673, + "step": 31142 + }, + { + "epoch": 9.558931860036832, + "grad_norm": 0.1716073453426361, + "learning_rate": 5.10021720510484e-07, + "loss": 1.7244, + "step": 31143 + }, + { + "epoch": 9.559238796807858, + "grad_norm": 0.1661565750837326, + "learning_rate": 5.093138247604768e-07, + "loss": 1.6895, + "step": 31144 + }, + { + "epoch": 9.559545733578883, + "grad_norm": 0.14260123670101166, + "learning_rate": 5.086064181064332e-07, + "loss": 1.7166, + 
"step": 31145 + }, + { + "epoch": 9.559852670349908, + "grad_norm": 0.12638737261295319, + "learning_rate": 5.078995005553533e-07, + "loss": 1.6924, + "step": 31146 + }, + { + "epoch": 9.560159607120934, + "grad_norm": 0.1578296571969986, + "learning_rate": 5.071930721142148e-07, + "loss": 1.7215, + "step": 31147 + }, + { + "epoch": 9.560466543891959, + "grad_norm": 0.12237422913312912, + "learning_rate": 5.064871327900067e-07, + "loss": 1.6672, + "step": 31148 + }, + { + "epoch": 9.560773480662984, + "grad_norm": 0.11540009081363678, + "learning_rate": 5.057816825897011e-07, + "loss": 1.6942, + "step": 31149 + }, + { + "epoch": 9.561080417434008, + "grad_norm": 0.11710464954376221, + "learning_rate": 5.050767215202701e-07, + "loss": 1.6721, + "step": 31150 + }, + { + "epoch": 9.561387354205033, + "grad_norm": 0.1241387203335762, + "learning_rate": 5.04372249588686e-07, + "loss": 1.6574, + "step": 31151 + }, + { + "epoch": 9.561694290976058, + "grad_norm": 0.15445421636104584, + "learning_rate": 5.036682668018933e-07, + "loss": 1.6976, + "step": 31152 + }, + { + "epoch": 9.562001227747084, + "grad_norm": 0.15151409804821014, + "learning_rate": 5.029647731668752e-07, + "loss": 1.7067, + "step": 31153 + }, + { + "epoch": 9.56230816451811, + "grad_norm": 0.18623974919319153, + "learning_rate": 5.022617686905596e-07, + "loss": 1.7709, + "step": 31154 + }, + { + "epoch": 9.562615101289135, + "grad_norm": 0.14912709593772888, + "learning_rate": 5.015592533799074e-07, + "loss": 1.6918, + "step": 31155 + }, + { + "epoch": 9.56292203806016, + "grad_norm": 0.13887201249599457, + "learning_rate": 5.008572272418633e-07, + "loss": 1.6851, + "step": 31156 + }, + { + "epoch": 9.563228974831185, + "grad_norm": 0.1401492953300476, + "learning_rate": 5.001556902833548e-07, + "loss": 1.6886, + "step": 31157 + }, + { + "epoch": 9.56353591160221, + "grad_norm": 0.13679155707359314, + "learning_rate": 4.994546425113266e-07, + "loss": 1.7129, + "step": 31158 + }, + { + "epoch": 
9.563842848373236, + "grad_norm": 0.12003178894519806, + "learning_rate": 4.987540839326954e-07, + "loss": 1.7186, + "step": 31159 + }, + { + "epoch": 9.56414978514426, + "grad_norm": 0.12413342297077179, + "learning_rate": 4.980540145543944e-07, + "loss": 1.6818, + "step": 31160 + }, + { + "epoch": 9.564456721915285, + "grad_norm": 0.16514070332050323, + "learning_rate": 4.973544343833347e-07, + "loss": 1.7551, + "step": 31161 + }, + { + "epoch": 9.56476365868631, + "grad_norm": 0.1000957265496254, + "learning_rate": 4.966553434264276e-07, + "loss": 1.6372, + "step": 31162 + }, + { + "epoch": 9.565070595457335, + "grad_norm": 0.16715119779109955, + "learning_rate": 4.959567416906008e-07, + "loss": 1.783, + "step": 31163 + }, + { + "epoch": 9.56537753222836, + "grad_norm": 0.1515718400478363, + "learning_rate": 4.952586291827321e-07, + "loss": 1.7858, + "step": 31164 + }, + { + "epoch": 9.565684468999386, + "grad_norm": 0.14952874183654785, + "learning_rate": 4.945610059097439e-07, + "loss": 1.7515, + "step": 31165 + }, + { + "epoch": 9.565991405770411, + "grad_norm": 0.11136786639690399, + "learning_rate": 4.938638718785138e-07, + "loss": 1.671, + "step": 31166 + }, + { + "epoch": 9.566298342541437, + "grad_norm": 0.10691037774085999, + "learning_rate": 4.931672270959308e-07, + "loss": 1.6479, + "step": 31167 + }, + { + "epoch": 9.566605279312462, + "grad_norm": 0.1559297740459442, + "learning_rate": 4.924710715689007e-07, + "loss": 1.704, + "step": 31168 + }, + { + "epoch": 9.566912216083487, + "grad_norm": 0.13859638571739197, + "learning_rate": 4.917754053042733e-07, + "loss": 1.7035, + "step": 31169 + }, + { + "epoch": 9.567219152854513, + "grad_norm": 0.13970541954040527, + "learning_rate": 4.910802283089544e-07, + "loss": 1.6903, + "step": 31170 + }, + { + "epoch": 9.567526089625538, + "grad_norm": 0.10885283350944519, + "learning_rate": 4.903855405897884e-07, + "loss": 1.669, + "step": 31171 + }, + { + "epoch": 9.567833026396562, + "grad_norm": 
0.13587352633476257, + "learning_rate": 4.896913421536531e-07, + "loss": 1.7033, + "step": 31172 + }, + { + "epoch": 9.568139963167587, + "grad_norm": 0.1579197496175766, + "learning_rate": 4.889976330074042e-07, + "loss": 1.7772, + "step": 31173 + }, + { + "epoch": 9.568446899938612, + "grad_norm": 0.172073096036911, + "learning_rate": 4.883044131579029e-07, + "loss": 1.7545, + "step": 31174 + }, + { + "epoch": 9.568753836709638, + "grad_norm": 0.15477560460567474, + "learning_rate": 4.876116826119992e-07, + "loss": 1.6961, + "step": 31175 + }, + { + "epoch": 9.569060773480663, + "grad_norm": 0.12151028960943222, + "learning_rate": 4.869194413765376e-07, + "loss": 1.6918, + "step": 31176 + }, + { + "epoch": 9.569367710251688, + "grad_norm": 0.11448194086551666, + "learning_rate": 4.862276894583573e-07, + "loss": 1.695, + "step": 31177 + }, + { + "epoch": 9.569674647022714, + "grad_norm": 0.13363254070281982, + "learning_rate": 4.855364268642915e-07, + "loss": 1.6802, + "step": 31178 + }, + { + "epoch": 9.569981583793739, + "grad_norm": 0.13119351863861084, + "learning_rate": 4.848456536011792e-07, + "loss": 1.7026, + "step": 31179 + }, + { + "epoch": 9.570288520564764, + "grad_norm": 0.1255909502506256, + "learning_rate": 4.841553696758483e-07, + "loss": 1.6627, + "step": 31180 + }, + { + "epoch": 9.57059545733579, + "grad_norm": 0.13161277770996094, + "learning_rate": 4.8346557509511e-07, + "loss": 1.6906, + "step": 31181 + }, + { + "epoch": 9.570902394106813, + "grad_norm": 0.15130144357681274, + "learning_rate": 4.827762698657922e-07, + "loss": 1.7056, + "step": 31182 + }, + { + "epoch": 9.571209330877839, + "grad_norm": 0.11054715514183044, + "learning_rate": 4.820874539947007e-07, + "loss": 1.6326, + "step": 31183 + }, + { + "epoch": 9.571516267648864, + "grad_norm": 0.22645193338394165, + "learning_rate": 4.813991274886354e-07, + "loss": 1.7981, + "step": 31184 + }, + { + "epoch": 9.57182320441989, + "grad_norm": 0.09784482419490814, + "learning_rate": 
4.807112903544242e-07, + "loss": 1.6359, + "step": 31185 + }, + { + "epoch": 9.572130141190915, + "grad_norm": 0.1499309092760086, + "learning_rate": 4.80023942598834e-07, + "loss": 1.7026, + "step": 31186 + }, + { + "epoch": 9.57243707796194, + "grad_norm": 0.1763381063938141, + "learning_rate": 4.793370842286815e-07, + "loss": 1.7516, + "step": 31187 + }, + { + "epoch": 9.572744014732965, + "grad_norm": 0.16786764562129974, + "learning_rate": 4.786507152507391e-07, + "loss": 1.7487, + "step": 31188 + }, + { + "epoch": 9.57305095150399, + "grad_norm": 0.1416286677122116, + "learning_rate": 4.779648356717958e-07, + "loss": 1.7016, + "step": 31189 + }, + { + "epoch": 9.573357888275016, + "grad_norm": 0.10985523462295532, + "learning_rate": 4.772794454986296e-07, + "loss": 1.6818, + "step": 31190 + }, + { + "epoch": 9.573664825046041, + "grad_norm": 0.16240783035755157, + "learning_rate": 4.7659454473801825e-07, + "loss": 1.7029, + "step": 31191 + }, + { + "epoch": 9.573971761817067, + "grad_norm": 0.16602420806884766, + "learning_rate": 4.7591013339672306e-07, + "loss": 1.7176, + "step": 31192 + }, + { + "epoch": 9.57427869858809, + "grad_norm": 0.11623486876487732, + "learning_rate": 4.7522621148151093e-07, + "loss": 1.6796, + "step": 31193 + }, + { + "epoch": 9.574585635359115, + "grad_norm": 0.1628381758928299, + "learning_rate": 4.74542778999143e-07, + "loss": 1.7524, + "step": 31194 + }, + { + "epoch": 9.57489257213014, + "grad_norm": 0.2524288296699524, + "learning_rate": 4.738598359563695e-07, + "loss": 1.7632, + "step": 31195 + }, + { + "epoch": 9.575199508901166, + "grad_norm": 0.13695289194583893, + "learning_rate": 4.731773823599406e-07, + "loss": 1.7155, + "step": 31196 + }, + { + "epoch": 9.575506445672191, + "grad_norm": 0.16224917769432068, + "learning_rate": 4.7249541821660637e-07, + "loss": 1.7063, + "step": 31197 + }, + { + "epoch": 9.575813382443217, + "grad_norm": 0.13433055579662323, + "learning_rate": 4.718139435330893e-07, + "loss": 1.708, + 
"step": 31198 + }, + { + "epoch": 9.576120319214242, + "grad_norm": 0.1861371546983719, + "learning_rate": 4.7113295831615054e-07, + "loss": 1.6628, + "step": 31199 + }, + { + "epoch": 9.576427255985267, + "grad_norm": 0.19167616963386536, + "learning_rate": 4.70452462572496e-07, + "loss": 1.7231, + "step": 31200 + }, + { + "epoch": 9.576734192756293, + "grad_norm": 0.13869838416576385, + "learning_rate": 4.6977245630886455e-07, + "loss": 1.6509, + "step": 31201 + }, + { + "epoch": 9.577041129527318, + "grad_norm": 0.14124059677124023, + "learning_rate": 4.690929395319732e-07, + "loss": 1.7077, + "step": 31202 + }, + { + "epoch": 9.577348066298342, + "grad_norm": 0.13248983025550842, + "learning_rate": 4.684139122485331e-07, + "loss": 1.6877, + "step": 31203 + }, + { + "epoch": 9.577655003069367, + "grad_norm": 0.08696278929710388, + "learning_rate": 4.6773537446526125e-07, + "loss": 1.6193, + "step": 31204 + }, + { + "epoch": 9.577961939840392, + "grad_norm": 0.1554766148328781, + "learning_rate": 4.670573261888578e-07, + "loss": 1.7169, + "step": 31205 + }, + { + "epoch": 9.578268876611418, + "grad_norm": 0.13041824102401733, + "learning_rate": 4.663797674260284e-07, + "loss": 1.7088, + "step": 31206 + }, + { + "epoch": 9.578575813382443, + "grad_norm": 0.10228250920772552, + "learning_rate": 4.6570269818346224e-07, + "loss": 1.6636, + "step": 31207 + }, + { + "epoch": 9.578882750153468, + "grad_norm": 0.11005907505750656, + "learning_rate": 4.6502611846785947e-07, + "loss": 1.6668, + "step": 31208 + }, + { + "epoch": 9.579189686924494, + "grad_norm": 0.10490129142999649, + "learning_rate": 4.643500282858981e-07, + "loss": 1.6666, + "step": 31209 + }, + { + "epoch": 9.579496623695519, + "grad_norm": 0.1278064250946045, + "learning_rate": 4.636744276442673e-07, + "loss": 1.6794, + "step": 31210 + }, + { + "epoch": 9.579803560466544, + "grad_norm": 0.1835307478904724, + "learning_rate": 4.6299931654963937e-07, + "loss": 1.7349, + "step": 31211 + }, + { + "epoch": 
9.58011049723757, + "grad_norm": 0.14156827330589294, + "learning_rate": 4.623246950086868e-07, + "loss": 1.667, + "step": 31212 + }, + { + "epoch": 9.580417434008595, + "grad_norm": 0.1438005119562149, + "learning_rate": 4.61650563028071e-07, + "loss": 1.7248, + "step": 31213 + }, + { + "epoch": 9.580724370779619, + "grad_norm": 0.18247459828853607, + "learning_rate": 4.609769206144698e-07, + "loss": 1.8198, + "step": 31214 + }, + { + "epoch": 9.581031307550644, + "grad_norm": 0.12175338715314865, + "learning_rate": 4.6030376777452255e-07, + "loss": 1.6584, + "step": 31215 + }, + { + "epoch": 9.58133824432167, + "grad_norm": 0.1831531524658203, + "learning_rate": 4.5963110451489045e-07, + "loss": 1.709, + "step": 31216 + }, + { + "epoch": 9.581645181092695, + "grad_norm": 0.137215718626976, + "learning_rate": 4.5895893084222377e-07, + "loss": 1.767, + "step": 31217 + }, + { + "epoch": 9.58195211786372, + "grad_norm": 0.15977118909358978, + "learning_rate": 4.5828724676315606e-07, + "loss": 1.7384, + "step": 31218 + }, + { + "epoch": 9.582259054634745, + "grad_norm": 0.12897618114948273, + "learning_rate": 4.576160522843376e-07, + "loss": 1.6501, + "step": 31219 + }, + { + "epoch": 9.58256599140577, + "grad_norm": 0.13793621957302094, + "learning_rate": 4.5694534741239084e-07, + "loss": 1.7039, + "step": 31220 + }, + { + "epoch": 9.582872928176796, + "grad_norm": 0.11358989775180817, + "learning_rate": 4.562751321539549e-07, + "loss": 1.6699, + "step": 31221 + }, + { + "epoch": 9.583179864947821, + "grad_norm": 0.16851121187210083, + "learning_rate": 4.5560540651563565e-07, + "loss": 1.7399, + "step": 31222 + }, + { + "epoch": 9.583486801718847, + "grad_norm": 0.14942096173763275, + "learning_rate": 4.549361705040722e-07, + "loss": 1.6769, + "step": 31223 + }, + { + "epoch": 9.58379373848987, + "grad_norm": 0.13010743260383606, + "learning_rate": 4.542674241258649e-07, + "loss": 1.6713, + "step": 31224 + }, + { + "epoch": 9.584100675260895, + "grad_norm": 
0.10744872689247131, + "learning_rate": 4.5359916738762497e-07, + "loss": 1.6919, + "step": 31225 + }, + { + "epoch": 9.58440761203192, + "grad_norm": 0.14843374490737915, + "learning_rate": 4.5293140029595836e-07, + "loss": 1.6961, + "step": 31226 + }, + { + "epoch": 9.584714548802946, + "grad_norm": 0.12312567979097366, + "learning_rate": 4.522641228574709e-07, + "loss": 1.6818, + "step": 31227 + }, + { + "epoch": 9.585021485573971, + "grad_norm": 0.15777400135993958, + "learning_rate": 4.5159733507874057e-07, + "loss": 1.6914, + "step": 31228 + }, + { + "epoch": 9.585328422344997, + "grad_norm": 0.12530489265918732, + "learning_rate": 4.509310369663733e-07, + "loss": 1.7149, + "step": 31229 + }, + { + "epoch": 9.585635359116022, + "grad_norm": 0.1540595442056656, + "learning_rate": 4.5026522852694155e-07, + "loss": 1.7282, + "step": 31230 + }, + { + "epoch": 9.585942295887047, + "grad_norm": 0.1336304396390915, + "learning_rate": 4.4959990976704005e-07, + "loss": 1.702, + "step": 31231 + }, + { + "epoch": 9.586249232658073, + "grad_norm": 0.23668836057186127, + "learning_rate": 4.4893508069322467e-07, + "loss": 1.7604, + "step": 31232 + }, + { + "epoch": 9.586556169429098, + "grad_norm": 0.14577987790107727, + "learning_rate": 4.482707413120846e-07, + "loss": 1.6841, + "step": 31233 + }, + { + "epoch": 9.586863106200123, + "grad_norm": 0.12077435851097107, + "learning_rate": 4.476068916301701e-07, + "loss": 1.7066, + "step": 31234 + }, + { + "epoch": 9.587170042971149, + "grad_norm": 0.10890510678291321, + "learning_rate": 4.469435316540427e-07, + "loss": 1.6594, + "step": 31235 + }, + { + "epoch": 9.587476979742172, + "grad_norm": 0.1251889169216156, + "learning_rate": 4.462806613902748e-07, + "loss": 1.7127, + "step": 31236 + }, + { + "epoch": 9.587783916513198, + "grad_norm": 0.2560112774372101, + "learning_rate": 4.4561828084540013e-07, + "loss": 1.8146, + "step": 31237 + }, + { + "epoch": 9.588090853284223, + "grad_norm": 0.1295570433139801, + 
"learning_rate": 4.4495639002597455e-07, + "loss": 1.6671, + "step": 31238 + }, + { + "epoch": 9.588397790055248, + "grad_norm": 0.1236012801527977, + "learning_rate": 4.4429498893852617e-07, + "loss": 1.6916, + "step": 31239 + }, + { + "epoch": 9.588704726826274, + "grad_norm": 0.16924844682216644, + "learning_rate": 4.436340775896053e-07, + "loss": 1.6838, + "step": 31240 + }, + { + "epoch": 9.589011663597299, + "grad_norm": 0.1686296910047531, + "learning_rate": 4.4297365598574e-07, + "loss": 1.7578, + "step": 31241 + }, + { + "epoch": 9.589318600368324, + "grad_norm": 0.13647985458374023, + "learning_rate": 4.4231372413345296e-07, + "loss": 1.6968, + "step": 31242 + }, + { + "epoch": 9.58962553713935, + "grad_norm": 0.13135603070259094, + "learning_rate": 4.4165428203927216e-07, + "loss": 1.6594, + "step": 31243 + }, + { + "epoch": 9.589932473910375, + "grad_norm": 0.13832809031009674, + "learning_rate": 4.409953297097036e-07, + "loss": 1.7405, + "step": 31244 + }, + { + "epoch": 9.5902394106814, + "grad_norm": 0.1193947121500969, + "learning_rate": 4.403368671512753e-07, + "loss": 1.6682, + "step": 31245 + }, + { + "epoch": 9.590546347452424, + "grad_norm": 0.11434894800186157, + "learning_rate": 4.3967889437048214e-07, + "loss": 1.6781, + "step": 31246 + }, + { + "epoch": 9.59085328422345, + "grad_norm": 0.14688155055046082, + "learning_rate": 4.3902141137382444e-07, + "loss": 1.7705, + "step": 31247 + }, + { + "epoch": 9.591160220994475, + "grad_norm": 0.13387629389762878, + "learning_rate": 4.383644181678137e-07, + "loss": 1.6999, + "step": 31248 + }, + { + "epoch": 9.5914671577655, + "grad_norm": 0.21924255788326263, + "learning_rate": 4.377079147589336e-07, + "loss": 1.7334, + "step": 31249 + }, + { + "epoch": 9.591774094536525, + "grad_norm": 0.14692620933055878, + "learning_rate": 4.3705190115367335e-07, + "loss": 1.6817, + "step": 31250 + }, + { + "epoch": 9.59208103130755, + "grad_norm": 0.11326060444116592, + "learning_rate": 4.3639637735851115e-07, 
+ "loss": 1.6599, + "step": 31251 + }, + { + "epoch": 9.592387968078576, + "grad_norm": 0.12073694914579391, + "learning_rate": 4.3574134337993066e-07, + "loss": 1.6924, + "step": 31252 + }, + { + "epoch": 9.592694904849601, + "grad_norm": 0.14962032437324524, + "learning_rate": 4.3508679922441566e-07, + "loss": 1.7259, + "step": 31253 + }, + { + "epoch": 9.593001841620627, + "grad_norm": 0.1624862551689148, + "learning_rate": 4.344327448984109e-07, + "loss": 1.7194, + "step": 31254 + }, + { + "epoch": 9.593308778391652, + "grad_norm": 0.12331227213144302, + "learning_rate": 4.3377918040840017e-07, + "loss": 1.6871, + "step": 31255 + }, + { + "epoch": 9.593615715162677, + "grad_norm": 0.17856283485889435, + "learning_rate": 4.3312610576082825e-07, + "loss": 1.7479, + "step": 31256 + }, + { + "epoch": 9.5939226519337, + "grad_norm": 0.097813680768013, + "learning_rate": 4.324735209621622e-07, + "loss": 1.6385, + "step": 31257 + }, + { + "epoch": 9.594229588704726, + "grad_norm": 0.1290784329175949, + "learning_rate": 4.318214260188469e-07, + "loss": 1.6802, + "step": 31258 + }, + { + "epoch": 9.594536525475752, + "grad_norm": 0.1114344522356987, + "learning_rate": 4.3116982093732163e-07, + "loss": 1.6691, + "step": 31259 + }, + { + "epoch": 9.594843462246777, + "grad_norm": 0.12479976564645767, + "learning_rate": 4.305187057240312e-07, + "loss": 1.6891, + "step": 31260 + }, + { + "epoch": 9.595150399017802, + "grad_norm": 0.1734507828950882, + "learning_rate": 4.2986808038540385e-07, + "loss": 1.7337, + "step": 31261 + }, + { + "epoch": 9.595457335788828, + "grad_norm": 0.14148491621017456, + "learning_rate": 4.2921794492787884e-07, + "loss": 1.7019, + "step": 31262 + }, + { + "epoch": 9.595764272559853, + "grad_norm": 0.11479593068361282, + "learning_rate": 4.285682993578788e-07, + "loss": 1.6509, + "step": 31263 + }, + { + "epoch": 9.596071209330878, + "grad_norm": 0.13279953598976135, + "learning_rate": 4.279191436818153e-07, + "loss": 1.692, + "step": 31264 + }, 
+ { + "epoch": 9.596378146101904, + "grad_norm": 0.13242286443710327, + "learning_rate": 4.27270477906111e-07, + "loss": 1.6787, + "step": 31265 + }, + { + "epoch": 9.596685082872929, + "grad_norm": 0.1530013382434845, + "learning_rate": 4.2662230203717737e-07, + "loss": 1.7245, + "step": 31266 + }, + { + "epoch": 9.596992019643952, + "grad_norm": 0.10855519771575928, + "learning_rate": 4.259746160814204e-07, + "loss": 1.6938, + "step": 31267 + }, + { + "epoch": 9.597298956414978, + "grad_norm": 0.16191129386425018, + "learning_rate": 4.253274200452351e-07, + "loss": 1.7375, + "step": 31268 + }, + { + "epoch": 9.597605893186003, + "grad_norm": 0.13151034712791443, + "learning_rate": 4.2468071393501617e-07, + "loss": 1.713, + "step": 31269 + }, + { + "epoch": 9.597912829957028, + "grad_norm": 0.11667583882808685, + "learning_rate": 4.2403449775716977e-07, + "loss": 1.6956, + "step": 31270 + }, + { + "epoch": 9.598219766728054, + "grad_norm": 0.13867171108722687, + "learning_rate": 4.233887715180629e-07, + "loss": 1.7364, + "step": 31271 + }, + { + "epoch": 9.598526703499079, + "grad_norm": 0.09936422109603882, + "learning_rate": 4.2274353522409606e-07, + "loss": 1.6493, + "step": 31272 + }, + { + "epoch": 9.598833640270104, + "grad_norm": 0.1310657113790512, + "learning_rate": 4.2209878888162524e-07, + "loss": 1.6937, + "step": 31273 + }, + { + "epoch": 9.59914057704113, + "grad_norm": 0.1411616951227188, + "learning_rate": 4.214545324970398e-07, + "loss": 1.7071, + "step": 31274 + }, + { + "epoch": 9.599447513812155, + "grad_norm": 0.16063901782035828, + "learning_rate": 4.208107660766958e-07, + "loss": 1.7328, + "step": 31275 + }, + { + "epoch": 9.59975445058318, + "grad_norm": 0.19482840597629547, + "learning_rate": 4.2016748962696027e-07, + "loss": 1.7442, + "step": 31276 + }, + { + "epoch": 9.600061387354206, + "grad_norm": 0.1624516397714615, + "learning_rate": 4.195247031541893e-07, + "loss": 1.7261, + "step": 31277 + }, + { + "epoch": 9.600368324125231, + 
"grad_norm": 0.1904727965593338, + "learning_rate": 4.1888240666473345e-07, + "loss": 1.6999, + "step": 31278 + }, + { + "epoch": 9.600675260896255, + "grad_norm": 0.0954340398311615, + "learning_rate": 4.1824060016494307e-07, + "loss": 1.6738, + "step": 31279 + }, + { + "epoch": 9.60098219766728, + "grad_norm": 0.185276597738266, + "learning_rate": 4.175992836611631e-07, + "loss": 1.7959, + "step": 31280 + }, + { + "epoch": 9.601289134438305, + "grad_norm": 0.13276509940624237, + "learning_rate": 4.1695845715972184e-07, + "loss": 1.6988, + "step": 31281 + }, + { + "epoch": 9.60159607120933, + "grad_norm": 0.145119309425354, + "learning_rate": 4.163181206669642e-07, + "loss": 1.6603, + "step": 31282 + }, + { + "epoch": 9.601903007980356, + "grad_norm": 0.2778591513633728, + "learning_rate": 4.156782741892129e-07, + "loss": 1.8033, + "step": 31283 + }, + { + "epoch": 9.602209944751381, + "grad_norm": 0.12991562485694885, + "learning_rate": 4.150389177327907e-07, + "loss": 1.6833, + "step": 31284 + }, + { + "epoch": 9.602516881522407, + "grad_norm": 0.19052881002426147, + "learning_rate": 4.144000513040147e-07, + "loss": 1.727, + "step": 31285 + }, + { + "epoch": 9.602823818293432, + "grad_norm": 0.180231973528862, + "learning_rate": 4.137616749091966e-07, + "loss": 1.7223, + "step": 31286 + }, + { + "epoch": 9.603130755064457, + "grad_norm": 0.11801919341087341, + "learning_rate": 4.131237885546535e-07, + "loss": 1.6674, + "step": 31287 + }, + { + "epoch": 9.603437691835483, + "grad_norm": 0.1323625147342682, + "learning_rate": 4.1248639224668596e-07, + "loss": 1.6999, + "step": 31288 + }, + { + "epoch": 9.603744628606506, + "grad_norm": 0.16466714441776276, + "learning_rate": 4.1184948599159443e-07, + "loss": 1.6886, + "step": 31289 + }, + { + "epoch": 9.604051565377532, + "grad_norm": 0.16557957231998444, + "learning_rate": 4.112130697956629e-07, + "loss": 1.7503, + "step": 31290 + }, + { + "epoch": 9.604358502148557, + "grad_norm": 0.12221503257751465, + 
"learning_rate": 4.1057714366519173e-07, + "loss": 1.6695, + "step": 31291 + }, + { + "epoch": 9.604665438919582, + "grad_norm": 0.12496510148048401, + "learning_rate": 4.0994170760646487e-07, + "loss": 1.6728, + "step": 31292 + }, + { + "epoch": 9.604972375690608, + "grad_norm": 0.12658068537712097, + "learning_rate": 4.0930676162576063e-07, + "loss": 1.6813, + "step": 31293 + }, + { + "epoch": 9.605279312461633, + "grad_norm": 0.1092144325375557, + "learning_rate": 4.0867230572935176e-07, + "loss": 1.6728, + "step": 31294 + }, + { + "epoch": 9.605586249232658, + "grad_norm": 0.13999344408512115, + "learning_rate": 4.0803833992350547e-07, + "loss": 1.6931, + "step": 31295 + }, + { + "epoch": 9.605893186003684, + "grad_norm": 0.1349373310804367, + "learning_rate": 4.0740486421449455e-07, + "loss": 1.7247, + "step": 31296 + }, + { + "epoch": 9.606200122774709, + "grad_norm": 0.17605085670948029, + "learning_rate": 4.0677187860857503e-07, + "loss": 1.7334, + "step": 31297 + }, + { + "epoch": 9.606507059545734, + "grad_norm": 0.1366586685180664, + "learning_rate": 4.061393831120086e-07, + "loss": 1.7097, + "step": 31298 + }, + { + "epoch": 9.60681399631676, + "grad_norm": 0.11512716114521027, + "learning_rate": 4.0550737773103475e-07, + "loss": 1.6844, + "step": 31299 + }, + { + "epoch": 9.607120933087783, + "grad_norm": 0.1779230386018753, + "learning_rate": 4.04875862471904e-07, + "loss": 1.706, + "step": 31300 + }, + { + "epoch": 9.607427869858808, + "grad_norm": 0.11504211276769638, + "learning_rate": 4.042448373408614e-07, + "loss": 1.6816, + "step": 31301 + }, + { + "epoch": 9.607734806629834, + "grad_norm": 0.17073078453540802, + "learning_rate": 4.036143023441408e-07, + "loss": 1.6883, + "step": 31302 + }, + { + "epoch": 9.60804174340086, + "grad_norm": 0.15582023561000824, + "learning_rate": 4.0298425748797606e-07, + "loss": 1.7218, + "step": 31303 + }, + { + "epoch": 9.608348680171884, + "grad_norm": 0.1295994520187378, + "learning_rate": 
4.0235470277858454e-07, + "loss": 1.6722, + "step": 31304 + }, + { + "epoch": 9.60865561694291, + "grad_norm": 0.11748214811086655, + "learning_rate": 4.0172563822219457e-07, + "loss": 1.7255, + "step": 31305 + }, + { + "epoch": 9.608962553713935, + "grad_norm": 0.15344174206256866, + "learning_rate": 4.010970638250289e-07, + "loss": 1.7227, + "step": 31306 + }, + { + "epoch": 9.60926949048496, + "grad_norm": 0.12453699111938477, + "learning_rate": 4.0046897959328256e-07, + "loss": 1.7004, + "step": 31307 + }, + { + "epoch": 9.609576427255986, + "grad_norm": 0.11904565244913101, + "learning_rate": 3.9984138553318395e-07, + "loss": 1.6949, + "step": 31308 + }, + { + "epoch": 9.609883364027011, + "grad_norm": 0.1816912293434143, + "learning_rate": 3.9921428165091703e-07, + "loss": 1.7344, + "step": 31309 + }, + { + "epoch": 9.610190300798035, + "grad_norm": 0.17511364817619324, + "learning_rate": 3.9858766795268785e-07, + "loss": 1.715, + "step": 31310 + }, + { + "epoch": 9.61049723756906, + "grad_norm": 0.14724890887737274, + "learning_rate": 3.9796154444468604e-07, + "loss": 1.7115, + "step": 31311 + }, + { + "epoch": 9.610804174340085, + "grad_norm": 0.13168582320213318, + "learning_rate": 3.97335911133101e-07, + "loss": 1.6975, + "step": 31312 + }, + { + "epoch": 9.61111111111111, + "grad_norm": 0.10625627636909485, + "learning_rate": 3.967107680241222e-07, + "loss": 1.6475, + "step": 31313 + }, + { + "epoch": 9.611418047882136, + "grad_norm": 0.16010381281375885, + "learning_rate": 3.9608611512391704e-07, + "loss": 1.7232, + "step": 31314 + }, + { + "epoch": 9.611724984653161, + "grad_norm": 0.1410607546567917, + "learning_rate": 3.9546195243865826e-07, + "loss": 1.7167, + "step": 31315 + }, + { + "epoch": 9.612031921424187, + "grad_norm": 0.1656857579946518, + "learning_rate": 3.948382799745243e-07, + "loss": 1.7294, + "step": 31316 + }, + { + "epoch": 9.612338858195212, + "grad_norm": 0.12383712828159332, + "learning_rate": 3.942150977376713e-07, + "loss": 
1.6826, + "step": 31317 + }, + { + "epoch": 9.612645794966237, + "grad_norm": 0.12091368436813354, + "learning_rate": 3.9359240573426105e-07, + "loss": 1.7067, + "step": 31318 + }, + { + "epoch": 9.612952731737263, + "grad_norm": 0.11942148953676224, + "learning_rate": 3.9297020397044416e-07, + "loss": 1.6991, + "step": 31319 + }, + { + "epoch": 9.613259668508288, + "grad_norm": 0.19631130993366241, + "learning_rate": 3.9234849245237126e-07, + "loss": 1.7333, + "step": 31320 + }, + { + "epoch": 9.613566605279313, + "grad_norm": 0.11581625044345856, + "learning_rate": 3.917272711861819e-07, + "loss": 1.6685, + "step": 31321 + }, + { + "epoch": 9.613873542050337, + "grad_norm": 0.1485711932182312, + "learning_rate": 3.9110654017802675e-07, + "loss": 1.7233, + "step": 31322 + }, + { + "epoch": 9.614180478821362, + "grad_norm": 0.16040800511837006, + "learning_rate": 3.9048629943403415e-07, + "loss": 1.7581, + "step": 31323 + }, + { + "epoch": 9.614487415592388, + "grad_norm": 0.18484649062156677, + "learning_rate": 3.8986654896032705e-07, + "loss": 1.7376, + "step": 31324 + }, + { + "epoch": 9.614794352363413, + "grad_norm": 0.11399713158607483, + "learning_rate": 3.892472887630394e-07, + "loss": 1.6831, + "step": 31325 + }, + { + "epoch": 9.615101289134438, + "grad_norm": 0.14001138508319855, + "learning_rate": 3.8862851884828855e-07, + "loss": 1.7201, + "step": 31326 + }, + { + "epoch": 9.615408225905464, + "grad_norm": 0.12577788531780243, + "learning_rate": 3.880102392221863e-07, + "loss": 1.6992, + "step": 31327 + }, + { + "epoch": 9.615715162676489, + "grad_norm": 0.20776085555553436, + "learning_rate": 3.873924498908443e-07, + "loss": 1.6801, + "step": 31328 + }, + { + "epoch": 9.616022099447514, + "grad_norm": 0.1547452211380005, + "learning_rate": 3.867751508603745e-07, + "loss": 1.7288, + "step": 31329 + }, + { + "epoch": 9.61632903621854, + "grad_norm": 0.16533677279949188, + "learning_rate": 3.861583421368664e-07, + "loss": 1.6842, + "step": 31330 + }, + { 
+ "epoch": 9.616635972989565, + "grad_norm": 0.1557091921567917, + "learning_rate": 3.8554202372642623e-07, + "loss": 1.7522, + "step": 31331 + }, + { + "epoch": 9.616942909760589, + "grad_norm": 0.1304699331521988, + "learning_rate": 3.84926195635138e-07, + "loss": 1.6621, + "step": 31332 + }, + { + "epoch": 9.617249846531614, + "grad_norm": 0.2067500501871109, + "learning_rate": 3.8431085786908573e-07, + "loss": 1.7404, + "step": 31333 + }, + { + "epoch": 9.61755678330264, + "grad_norm": 0.15577533841133118, + "learning_rate": 3.83696010434359e-07, + "loss": 1.7357, + "step": 31334 + }, + { + "epoch": 9.617863720073665, + "grad_norm": 0.13889038562774658, + "learning_rate": 3.8308165333703073e-07, + "loss": 1.6994, + "step": 31335 + }, + { + "epoch": 9.61817065684469, + "grad_norm": 0.10292867571115494, + "learning_rate": 3.824677865831683e-07, + "loss": 1.6838, + "step": 31336 + }, + { + "epoch": 9.618477593615715, + "grad_norm": 0.19257314503192902, + "learning_rate": 3.8185441017883905e-07, + "loss": 1.7868, + "step": 31337 + }, + { + "epoch": 9.61878453038674, + "grad_norm": 0.13351574540138245, + "learning_rate": 3.8124152413010486e-07, + "loss": 1.7221, + "step": 31338 + }, + { + "epoch": 9.619091467157766, + "grad_norm": 0.14897382259368896, + "learning_rate": 3.8062912844302746e-07, + "loss": 1.7551, + "step": 31339 + }, + { + "epoch": 9.619398403928791, + "grad_norm": 0.16135838627815247, + "learning_rate": 3.800172231236576e-07, + "loss": 1.7806, + "step": 31340 + }, + { + "epoch": 9.619705340699817, + "grad_norm": 0.11817923933267593, + "learning_rate": 3.794058081780405e-07, + "loss": 1.7063, + "step": 31341 + }, + { + "epoch": 9.620012277470842, + "grad_norm": 0.11679195612668991, + "learning_rate": 3.787948836122157e-07, + "loss": 1.6841, + "step": 31342 + }, + { + "epoch": 9.620319214241865, + "grad_norm": 0.1286752074956894, + "learning_rate": 3.7818444943222287e-07, + "loss": 1.6883, + "step": 31343 + }, + { + "epoch": 9.62062615101289, + 
"grad_norm": 0.28080862760543823, + "learning_rate": 3.775745056441016e-07, + "loss": 1.7409, + "step": 31344 + }, + { + "epoch": 9.620933087783916, + "grad_norm": 0.11734452843666077, + "learning_rate": 3.7696505225386924e-07, + "loss": 1.6682, + "step": 31345 + }, + { + "epoch": 9.621240024554941, + "grad_norm": 0.10224849730730057, + "learning_rate": 3.763560892675544e-07, + "loss": 1.6771, + "step": 31346 + }, + { + "epoch": 9.621546961325967, + "grad_norm": 0.15901216864585876, + "learning_rate": 3.7574761669117443e-07, + "loss": 1.6718, + "step": 31347 + }, + { + "epoch": 9.621853898096992, + "grad_norm": 0.1088409572839737, + "learning_rate": 3.751396345307412e-07, + "loss": 1.6783, + "step": 31348 + }, + { + "epoch": 9.622160834868017, + "grad_norm": 0.1764845997095108, + "learning_rate": 3.7453214279226654e-07, + "loss": 1.7556, + "step": 31349 + }, + { + "epoch": 9.622467771639043, + "grad_norm": 0.11249416321516037, + "learning_rate": 3.739251414817457e-07, + "loss": 1.686, + "step": 31350 + }, + { + "epoch": 9.622774708410068, + "grad_norm": 0.1254713088274002, + "learning_rate": 3.7331863060519055e-07, + "loss": 1.6517, + "step": 31351 + }, + { + "epoch": 9.623081645181093, + "grad_norm": 0.16272024810314178, + "learning_rate": 3.727126101685852e-07, + "loss": 1.7195, + "step": 31352 + }, + { + "epoch": 9.623388581952117, + "grad_norm": 0.1234750747680664, + "learning_rate": 3.721070801779192e-07, + "loss": 1.7122, + "step": 31353 + }, + { + "epoch": 9.623695518723142, + "grad_norm": 0.17801089584827423, + "learning_rate": 3.7150204063918223e-07, + "loss": 1.703, + "step": 31354 + }, + { + "epoch": 9.624002455494168, + "grad_norm": 0.16611720621585846, + "learning_rate": 3.708974915583474e-07, + "loss": 1.7806, + "step": 31355 + }, + { + "epoch": 9.624309392265193, + "grad_norm": 0.18672671914100647, + "learning_rate": 3.702934329413932e-07, + "loss": 1.7299, + "step": 31356 + }, + { + "epoch": 9.624616329036218, + "grad_norm": 0.14166928827762604, + 
"learning_rate": 3.6968986479428705e-07, + "loss": 1.7213, + "step": 31357 + }, + { + "epoch": 9.624923265807244, + "grad_norm": 0.1553429216146469, + "learning_rate": 3.690867871229964e-07, + "loss": 1.7198, + "step": 31358 + }, + { + "epoch": 9.625230202578269, + "grad_norm": 0.12247302383184433, + "learning_rate": 3.6848419993348315e-07, + "loss": 1.7261, + "step": 31359 + }, + { + "epoch": 9.625537139349294, + "grad_norm": 0.11835172772407532, + "learning_rate": 3.6788210323169256e-07, + "loss": 1.6798, + "step": 31360 + }, + { + "epoch": 9.62584407612032, + "grad_norm": 0.13140064477920532, + "learning_rate": 3.67280497023581e-07, + "loss": 1.693, + "step": 31361 + }, + { + "epoch": 9.626151012891345, + "grad_norm": 0.15596047043800354, + "learning_rate": 3.6667938131509925e-07, + "loss": 1.7312, + "step": 31362 + }, + { + "epoch": 9.62645794966237, + "grad_norm": 0.1632358282804489, + "learning_rate": 3.6607875611218146e-07, + "loss": 1.7315, + "step": 31363 + }, + { + "epoch": 9.626764886433394, + "grad_norm": 0.1374986320734024, + "learning_rate": 3.654786214207617e-07, + "loss": 1.6954, + "step": 31364 + }, + { + "epoch": 9.62707182320442, + "grad_norm": 0.154662624001503, + "learning_rate": 3.648789772467742e-07, + "loss": 1.7133, + "step": 31365 + }, + { + "epoch": 9.627378759975445, + "grad_norm": 0.1405872106552124, + "learning_rate": 3.6427982359614753e-07, + "loss": 1.7156, + "step": 31366 + }, + { + "epoch": 9.62768569674647, + "grad_norm": 0.11641019582748413, + "learning_rate": 3.6368116047479914e-07, + "loss": 1.6961, + "step": 31367 + }, + { + "epoch": 9.627992633517495, + "grad_norm": 0.15025056898593903, + "learning_rate": 3.630829878886466e-07, + "loss": 1.6984, + "step": 31368 + }, + { + "epoch": 9.62829957028852, + "grad_norm": 0.1593703031539917, + "learning_rate": 3.6248530584360175e-07, + "loss": 1.7281, + "step": 31369 + }, + { + "epoch": 9.628606507059546, + "grad_norm": 0.16070005297660828, + "learning_rate": 3.6188811434557103e-07, + 
"loss": 1.6851, + "step": 31370 + }, + { + "epoch": 9.628913443830571, + "grad_norm": 0.1515837013721466, + "learning_rate": 3.612914134004552e-07, + "loss": 1.705, + "step": 31371 + }, + { + "epoch": 9.629220380601597, + "grad_norm": 0.21579277515411377, + "learning_rate": 3.606952030141497e-07, + "loss": 1.765, + "step": 31372 + }, + { + "epoch": 9.629527317372622, + "grad_norm": 0.11283712834119797, + "learning_rate": 3.6009948319254973e-07, + "loss": 1.67, + "step": 31373 + }, + { + "epoch": 9.629834254143645, + "grad_norm": 0.10959877073764801, + "learning_rate": 3.5950425394154497e-07, + "loss": 1.6336, + "step": 31374 + }, + { + "epoch": 9.63014119091467, + "grad_norm": 0.15441931784152985, + "learning_rate": 3.5890951526700857e-07, + "loss": 1.7151, + "step": 31375 + }, + { + "epoch": 9.630448127685696, + "grad_norm": 0.10803858935832977, + "learning_rate": 3.583152671748302e-07, + "loss": 1.7114, + "step": 31376 + }, + { + "epoch": 9.630755064456721, + "grad_norm": 0.10860857367515564, + "learning_rate": 3.5772150967086637e-07, + "loss": 1.6905, + "step": 31377 + }, + { + "epoch": 9.631062001227747, + "grad_norm": 0.1574680209159851, + "learning_rate": 3.5712824276100674e-07, + "loss": 1.7139, + "step": 31378 + }, + { + "epoch": 9.631368937998772, + "grad_norm": 0.14044490456581116, + "learning_rate": 3.565354664510967e-07, + "loss": 1.7163, + "step": 31379 + }, + { + "epoch": 9.631675874769797, + "grad_norm": 0.11367516964673996, + "learning_rate": 3.559431807469982e-07, + "loss": 1.714, + "step": 31380 + }, + { + "epoch": 9.631982811540823, + "grad_norm": 0.15081267058849335, + "learning_rate": 3.553513856545676e-07, + "loss": 1.7021, + "step": 31381 + }, + { + "epoch": 9.632289748311848, + "grad_norm": 0.11578520387411118, + "learning_rate": 3.5476008117965586e-07, + "loss": 1.6566, + "step": 31382 + }, + { + "epoch": 9.632596685082873, + "grad_norm": 0.10944022983312607, + "learning_rate": 3.541692673280972e-07, + "loss": 1.6499, + "step": 31383 + }, + 
{ + "epoch": 9.632903621853899, + "grad_norm": 0.18682554364204407, + "learning_rate": 3.5357894410574243e-07, + "loss": 1.8172, + "step": 31384 + }, + { + "epoch": 9.633210558624924, + "grad_norm": 0.14995524287223816, + "learning_rate": 3.5298911151841475e-07, + "loss": 1.7285, + "step": 31385 + }, + { + "epoch": 9.633517495395948, + "grad_norm": 0.13728348910808563, + "learning_rate": 3.523997695719483e-07, + "loss": 1.6732, + "step": 31386 + }, + { + "epoch": 9.633824432166973, + "grad_norm": 0.14575724303722382, + "learning_rate": 3.518109182721718e-07, + "loss": 1.7013, + "step": 31387 + }, + { + "epoch": 9.634131368937998, + "grad_norm": 0.140236034989357, + "learning_rate": 3.512225576248918e-07, + "loss": 1.7609, + "step": 31388 + }, + { + "epoch": 9.634438305709024, + "grad_norm": 0.14315754175186157, + "learning_rate": 3.506346876359368e-07, + "loss": 1.713, + "step": 31389 + }, + { + "epoch": 9.634745242480049, + "grad_norm": 0.17747996747493744, + "learning_rate": 3.500473083111022e-07, + "loss": 1.6988, + "step": 31390 + }, + { + "epoch": 9.635052179251074, + "grad_norm": 0.1338483840227127, + "learning_rate": 3.4946041965621124e-07, + "loss": 1.6602, + "step": 31391 + }, + { + "epoch": 9.6353591160221, + "grad_norm": 0.14221277832984924, + "learning_rate": 3.488740216770481e-07, + "loss": 1.712, + "step": 31392 + }, + { + "epoch": 9.635666052793125, + "grad_norm": 0.18484778702259064, + "learning_rate": 3.482881143794137e-07, + "loss": 1.7008, + "step": 31393 + }, + { + "epoch": 9.63597298956415, + "grad_norm": 0.11398128420114517, + "learning_rate": 3.4770269776909783e-07, + "loss": 1.6884, + "step": 31394 + }, + { + "epoch": 9.636279926335176, + "grad_norm": 0.20213046669960022, + "learning_rate": 3.4711777185188477e-07, + "loss": 1.7539, + "step": 31395 + }, + { + "epoch": 9.6365868631062, + "grad_norm": 0.15737096965312958, + "learning_rate": 3.465333366335588e-07, + "loss": 1.7451, + "step": 31396 + }, + { + "epoch": 9.636893799877225, + 
"grad_norm": 0.18838335573673248, + "learning_rate": 3.459493921198931e-07, + "loss": 1.6942, + "step": 31397 + }, + { + "epoch": 9.63720073664825, + "grad_norm": 0.1837395280599594, + "learning_rate": 3.453659383166552e-07, + "loss": 1.7428, + "step": 31398 + }, + { + "epoch": 9.637507673419275, + "grad_norm": 0.153046116232872, + "learning_rate": 3.4478297522961834e-07, + "loss": 1.6806, + "step": 31399 + }, + { + "epoch": 9.6378146101903, + "grad_norm": 0.16290830075740814, + "learning_rate": 3.44200502864539e-07, + "loss": 1.7669, + "step": 31400 + }, + { + "epoch": 9.638121546961326, + "grad_norm": 0.17401064932346344, + "learning_rate": 3.4361852122717364e-07, + "loss": 1.7221, + "step": 31401 + }, + { + "epoch": 9.638428483732351, + "grad_norm": 0.176009401679039, + "learning_rate": 3.4303703032327325e-07, + "loss": 1.7514, + "step": 31402 + }, + { + "epoch": 9.638735420503377, + "grad_norm": 0.1500163972377777, + "learning_rate": 3.424560301585888e-07, + "loss": 1.7216, + "step": 31403 + }, + { + "epoch": 9.639042357274402, + "grad_norm": 0.10302964597940445, + "learning_rate": 3.418755207388602e-07, + "loss": 1.6702, + "step": 31404 + }, + { + "epoch": 9.639349294045427, + "grad_norm": 0.13488547503948212, + "learning_rate": 3.412955020698216e-07, + "loss": 1.7303, + "step": 31405 + }, + { + "epoch": 9.639656230816453, + "grad_norm": 0.11274787783622742, + "learning_rate": 3.407159741572019e-07, + "loss": 1.6658, + "step": 31406 + }, + { + "epoch": 9.639963167587476, + "grad_norm": 0.17834068834781647, + "learning_rate": 3.401369370067353e-07, + "loss": 1.75, + "step": 31407 + }, + { + "epoch": 9.640270104358502, + "grad_norm": 0.1692495495080948, + "learning_rate": 3.395583906241506e-07, + "loss": 1.7364, + "step": 31408 + }, + { + "epoch": 9.640577041129527, + "grad_norm": 0.1486683338880539, + "learning_rate": 3.3898033501514323e-07, + "loss": 1.7056, + "step": 31409 + }, + { + "epoch": 9.640883977900552, + "grad_norm": 0.1396656632423401, + 
"learning_rate": 3.384027701854531e-07, + "loss": 1.7026, + "step": 31410 + }, + { + "epoch": 9.641190914671578, + "grad_norm": 0.09748127311468124, + "learning_rate": 3.3782569614076444e-07, + "loss": 1.6905, + "step": 31411 + }, + { + "epoch": 9.641497851442603, + "grad_norm": 0.24635939300060272, + "learning_rate": 3.3724911288679494e-07, + "loss": 1.7301, + "step": 31412 + }, + { + "epoch": 9.641804788213628, + "grad_norm": 0.1656247079372406, + "learning_rate": 3.3667302042923453e-07, + "loss": 1.7279, + "step": 31413 + }, + { + "epoch": 9.642111724984654, + "grad_norm": 0.1069309264421463, + "learning_rate": 3.360974187737842e-07, + "loss": 1.666, + "step": 31414 + }, + { + "epoch": 9.642418661755679, + "grad_norm": 0.16244177520275116, + "learning_rate": 3.355223079261227e-07, + "loss": 1.8195, + "step": 31415 + }, + { + "epoch": 9.642725598526704, + "grad_norm": 0.11351195722818375, + "learning_rate": 3.3494768789194554e-07, + "loss": 1.669, + "step": 31416 + }, + { + "epoch": 9.643032535297728, + "grad_norm": 0.20543862879276276, + "learning_rate": 3.3437355867692034e-07, + "loss": 1.7004, + "step": 31417 + }, + { + "epoch": 9.643339472068753, + "grad_norm": 0.12174477428197861, + "learning_rate": 3.337999202867259e-07, + "loss": 1.6945, + "step": 31418 + }, + { + "epoch": 9.643646408839778, + "grad_norm": 0.14274805784225464, + "learning_rate": 3.332267727270355e-07, + "loss": 1.7221, + "step": 31419 + }, + { + "epoch": 9.643953345610804, + "grad_norm": 0.13756579160690308, + "learning_rate": 3.326541160035057e-07, + "loss": 1.6995, + "step": 31420 + }, + { + "epoch": 9.644260282381829, + "grad_norm": 0.1515035182237625, + "learning_rate": 3.320819501217931e-07, + "loss": 1.7469, + "step": 31421 + }, + { + "epoch": 9.644567219152854, + "grad_norm": 0.13177438080310822, + "learning_rate": 3.315102750875654e-07, + "loss": 1.665, + "step": 31422 + }, + { + "epoch": 9.64487415592388, + "grad_norm": 0.13083817064762115, + "learning_rate": 
3.309390909064625e-07, + "loss": 1.7156, + "step": 31423 + }, + { + "epoch": 9.645181092694905, + "grad_norm": 0.16704332828521729, + "learning_rate": 3.303683975841299e-07, + "loss": 1.7149, + "step": 31424 + }, + { + "epoch": 9.64548802946593, + "grad_norm": 0.11540384590625763, + "learning_rate": 3.29798195126213e-07, + "loss": 1.6848, + "step": 31425 + }, + { + "epoch": 9.645794966236956, + "grad_norm": 0.13248707354068756, + "learning_rate": 3.292284835383408e-07, + "loss": 1.6887, + "step": 31426 + }, + { + "epoch": 9.646101903007981, + "grad_norm": 0.14763472974300385, + "learning_rate": 3.2865926282614755e-07, + "loss": 1.7406, + "step": 31427 + }, + { + "epoch": 9.646408839779006, + "grad_norm": 0.17477329075336456, + "learning_rate": 3.2809053299525105e-07, + "loss": 1.7448, + "step": 31428 + }, + { + "epoch": 9.64671577655003, + "grad_norm": 0.2105390578508377, + "learning_rate": 3.275222940512801e-07, + "loss": 1.7695, + "step": 31429 + }, + { + "epoch": 9.647022713321055, + "grad_norm": 0.14712996780872345, + "learning_rate": 3.2695454599985243e-07, + "loss": 1.7161, + "step": 31430 + }, + { + "epoch": 9.64732965009208, + "grad_norm": 0.15937888622283936, + "learning_rate": 3.263872888465691e-07, + "loss": 1.7598, + "step": 31431 + }, + { + "epoch": 9.647636586863106, + "grad_norm": 0.10824455320835114, + "learning_rate": 3.258205225970423e-07, + "loss": 1.6662, + "step": 31432 + }, + { + "epoch": 9.647943523634131, + "grad_norm": 0.12431895732879639, + "learning_rate": 3.2525424725687315e-07, + "loss": 1.6932, + "step": 31433 + }, + { + "epoch": 9.648250460405157, + "grad_norm": 0.14159630239009857, + "learning_rate": 3.246884628316571e-07, + "loss": 1.7091, + "step": 31434 + }, + { + "epoch": 9.648557397176182, + "grad_norm": 0.17578476667404175, + "learning_rate": 3.241231693269842e-07, + "loss": 1.6818, + "step": 31435 + }, + { + "epoch": 9.648864333947207, + "grad_norm": 0.17417892813682556, + "learning_rate": 3.235583667484443e-07, + "loss": 
1.7712, + "step": 31436 + }, + { + "epoch": 9.649171270718233, + "grad_norm": 0.12163690477609634, + "learning_rate": 3.2299405510161087e-07, + "loss": 1.7261, + "step": 31437 + }, + { + "epoch": 9.649478207489258, + "grad_norm": 0.1171955019235611, + "learning_rate": 3.224302343920738e-07, + "loss": 1.6785, + "step": 31438 + }, + { + "epoch": 9.649785144260282, + "grad_norm": 0.11423932015895844, + "learning_rate": 3.2186690462539524e-07, + "loss": 1.7166, + "step": 31439 + }, + { + "epoch": 9.650092081031307, + "grad_norm": 0.16560381650924683, + "learning_rate": 3.213040658071431e-07, + "loss": 1.7179, + "step": 31440 + }, + { + "epoch": 9.650399017802332, + "grad_norm": 0.1309049129486084, + "learning_rate": 3.207417179428851e-07, + "loss": 1.6982, + "step": 31441 + }, + { + "epoch": 9.650705954573358, + "grad_norm": 0.13441912829875946, + "learning_rate": 3.201798610381723e-07, + "loss": 1.7376, + "step": 31442 + }, + { + "epoch": 9.651012891344383, + "grad_norm": 0.10977588593959808, + "learning_rate": 3.1961849509856143e-07, + "loss": 1.7127, + "step": 31443 + }, + { + "epoch": 9.651319828115408, + "grad_norm": 0.11772170662879944, + "learning_rate": 3.190576201296036e-07, + "loss": 1.6816, + "step": 31444 + }, + { + "epoch": 9.651626764886434, + "grad_norm": 0.17650476098060608, + "learning_rate": 3.1849723613683323e-07, + "loss": 1.713, + "step": 31445 + }, + { + "epoch": 9.651933701657459, + "grad_norm": 0.12182165682315826, + "learning_rate": 3.1793734312579037e-07, + "loss": 1.6243, + "step": 31446 + }, + { + "epoch": 9.652240638428484, + "grad_norm": 0.1657133251428604, + "learning_rate": 3.17377941102015e-07, + "loss": 1.7207, + "step": 31447 + }, + { + "epoch": 9.65254757519951, + "grad_norm": 0.15303701162338257, + "learning_rate": 3.1681903007102496e-07, + "loss": 1.7345, + "step": 31448 + }, + { + "epoch": 9.652854511970535, + "grad_norm": 0.17544081807136536, + "learning_rate": 3.162606100383547e-07, + "loss": 1.7176, + "step": 31449 + }, + { + 
"epoch": 9.653161448741558, + "grad_norm": 0.12232106178998947, + "learning_rate": 3.157026810095165e-07, + "loss": 1.7007, + "step": 31450 + }, + { + "epoch": 9.653468385512584, + "grad_norm": 0.12764953076839447, + "learning_rate": 3.1514524299002255e-07, + "loss": 1.7214, + "step": 31451 + }, + { + "epoch": 9.65377532228361, + "grad_norm": 0.19449979066848755, + "learning_rate": 3.1458829598539077e-07, + "loss": 1.721, + "step": 31452 + }, + { + "epoch": 9.654082259054634, + "grad_norm": 0.15264229476451874, + "learning_rate": 3.1403184000111106e-07, + "loss": 1.7486, + "step": 31453 + }, + { + "epoch": 9.65438919582566, + "grad_norm": 0.12420966476202011, + "learning_rate": 3.134758750426958e-07, + "loss": 1.6993, + "step": 31454 + }, + { + "epoch": 9.654696132596685, + "grad_norm": 0.16511085629463196, + "learning_rate": 3.1292040111563503e-07, + "loss": 1.7311, + "step": 31455 + }, + { + "epoch": 9.65500306936771, + "grad_norm": 0.16847728192806244, + "learning_rate": 3.123654182254132e-07, + "loss": 1.7988, + "step": 31456 + }, + { + "epoch": 9.655310006138736, + "grad_norm": 0.1573457270860672, + "learning_rate": 3.118109263775204e-07, + "loss": 1.752, + "step": 31457 + }, + { + "epoch": 9.655616942909761, + "grad_norm": 0.11476359516382217, + "learning_rate": 3.1125692557743555e-07, + "loss": 1.7258, + "step": 31458 + }, + { + "epoch": 9.655923879680786, + "grad_norm": 0.14234037697315216, + "learning_rate": 3.1070341583063767e-07, + "loss": 1.6909, + "step": 31459 + }, + { + "epoch": 9.65623081645181, + "grad_norm": 0.11332587152719498, + "learning_rate": 3.101503971425834e-07, + "loss": 1.641, + "step": 31460 + }, + { + "epoch": 9.656537753222835, + "grad_norm": 0.10850653052330017, + "learning_rate": 3.0959786951875735e-07, + "loss": 1.678, + "step": 31461 + }, + { + "epoch": 9.65684468999386, + "grad_norm": 0.14826613664627075, + "learning_rate": 3.0904583296459953e-07, + "loss": 1.7056, + "step": 31462 + }, + { + "epoch": 9.657151626764886, + 
"grad_norm": 0.10023099184036255, + "learning_rate": 3.084942874855834e-07, + "loss": 1.6457, + "step": 31463 + }, + { + "epoch": 9.657458563535911, + "grad_norm": 0.12071017175912857, + "learning_rate": 3.07943233087149e-07, + "loss": 1.667, + "step": 31464 + }, + { + "epoch": 9.657765500306937, + "grad_norm": 0.13804757595062256, + "learning_rate": 3.07392669774742e-07, + "loss": 1.7054, + "step": 31465 + }, + { + "epoch": 9.658072437077962, + "grad_norm": 0.1364121288061142, + "learning_rate": 3.0684259755380805e-07, + "loss": 1.7159, + "step": 31466 + }, + { + "epoch": 9.658379373848987, + "grad_norm": 0.11550064384937286, + "learning_rate": 3.062930164297817e-07, + "loss": 1.7289, + "step": 31467 + }, + { + "epoch": 9.658686310620013, + "grad_norm": 0.13400794565677643, + "learning_rate": 3.0574392640809744e-07, + "loss": 1.7216, + "step": 31468 + }, + { + "epoch": 9.658993247391038, + "grad_norm": 0.12369029968976974, + "learning_rate": 3.0519532749417876e-07, + "loss": 1.6923, + "step": 31469 + }, + { + "epoch": 9.659300184162063, + "grad_norm": 0.1034984290599823, + "learning_rate": 3.046472196934436e-07, + "loss": 1.6398, + "step": 31470 + }, + { + "epoch": 9.659607120933089, + "grad_norm": 0.14667385816574097, + "learning_rate": 3.040996030113097e-07, + "loss": 1.7016, + "step": 31471 + }, + { + "epoch": 9.659914057704112, + "grad_norm": 0.14836667478084564, + "learning_rate": 3.0355247745319505e-07, + "loss": 1.7003, + "step": 31472 + }, + { + "epoch": 9.660220994475138, + "grad_norm": 0.1664000302553177, + "learning_rate": 3.0300584302450643e-07, + "loss": 1.7092, + "step": 31473 + }, + { + "epoch": 9.660527931246163, + "grad_norm": 0.16207198798656464, + "learning_rate": 3.0245969973063393e-07, + "loss": 1.7219, + "step": 31474 + }, + { + "epoch": 9.660834868017188, + "grad_norm": 0.1146533191204071, + "learning_rate": 3.0191404757698995e-07, + "loss": 1.6975, + "step": 31475 + }, + { + "epoch": 9.661141804788214, + "grad_norm": 0.12706562876701355, + 
"learning_rate": 3.01368886568959e-07, + "loss": 1.7127, + "step": 31476 + }, + { + "epoch": 9.661448741559239, + "grad_norm": 0.16857001185417175, + "learning_rate": 3.0082421671192575e-07, + "loss": 1.7362, + "step": 31477 + }, + { + "epoch": 9.661755678330264, + "grad_norm": 0.11784416437149048, + "learning_rate": 3.002800380112802e-07, + "loss": 1.6776, + "step": 31478 + }, + { + "epoch": 9.66206261510129, + "grad_norm": 0.12407553941011429, + "learning_rate": 2.99736350472396e-07, + "loss": 1.6977, + "step": 31479 + }, + { + "epoch": 9.662369551872315, + "grad_norm": 0.09917214512825012, + "learning_rate": 2.9919315410065205e-07, + "loss": 1.6435, + "step": 31480 + }, + { + "epoch": 9.66267648864334, + "grad_norm": 0.1242169663310051, + "learning_rate": 2.9865044890140524e-07, + "loss": 1.7232, + "step": 31481 + }, + { + "epoch": 9.662983425414364, + "grad_norm": 0.15999211370944977, + "learning_rate": 2.98108234880029e-07, + "loss": 1.7018, + "step": 31482 + }, + { + "epoch": 9.66329036218539, + "grad_norm": 0.14428645372390747, + "learning_rate": 2.9756651204188026e-07, + "loss": 1.7605, + "step": 31483 + }, + { + "epoch": 9.663597298956415, + "grad_norm": 0.11703366786241531, + "learning_rate": 2.970252803923046e-07, + "loss": 1.6428, + "step": 31484 + }, + { + "epoch": 9.66390423572744, + "grad_norm": 0.11491703242063522, + "learning_rate": 2.9648453993666446e-07, + "loss": 1.6709, + "step": 31485 + }, + { + "epoch": 9.664211172498465, + "grad_norm": 0.13316640257835388, + "learning_rate": 2.959442906802945e-07, + "loss": 1.6886, + "step": 31486 + }, + { + "epoch": 9.66451810926949, + "grad_norm": 0.11723330616950989, + "learning_rate": 2.9540453262853486e-07, + "loss": 1.694, + "step": 31487 + }, + { + "epoch": 9.664825046040516, + "grad_norm": 0.20565512776374817, + "learning_rate": 2.948652657867146e-07, + "loss": 1.7406, + "step": 31488 + }, + { + "epoch": 9.665131982811541, + "grad_norm": 0.147149458527565, + "learning_rate": 2.9432649016017387e-07, + 
"loss": 1.7548, + "step": 31489 + }, + { + "epoch": 9.665438919582567, + "grad_norm": 0.15741130709648132, + "learning_rate": 2.937882057542363e-07, + "loss": 1.7005, + "step": 31490 + }, + { + "epoch": 9.665745856353592, + "grad_norm": 0.1358392834663391, + "learning_rate": 2.9325041257421414e-07, + "loss": 1.6991, + "step": 31491 + }, + { + "epoch": 9.666052793124617, + "grad_norm": 0.12195859849452972, + "learning_rate": 2.9271311062541994e-07, + "loss": 1.6764, + "step": 31492 + }, + { + "epoch": 9.66635972989564, + "grad_norm": 0.12507489323616028, + "learning_rate": 2.921762999131772e-07, + "loss": 1.7158, + "step": 31493 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 0.12813931703567505, + "learning_rate": 2.9163998044277606e-07, + "loss": 1.7066, + "step": 31494 + }, + { + "epoch": 9.666973603437691, + "grad_norm": 0.2424009144306183, + "learning_rate": 2.91104152219529e-07, + "loss": 1.7572, + "step": 31495 + }, + { + "epoch": 9.667280540208717, + "grad_norm": 0.21357449889183044, + "learning_rate": 2.905688152487207e-07, + "loss": 1.7907, + "step": 31496 + }, + { + "epoch": 9.667587476979742, + "grad_norm": 0.15474599599838257, + "learning_rate": 2.900339695356524e-07, + "loss": 1.7448, + "step": 31497 + }, + { + "epoch": 9.667894413750767, + "grad_norm": 0.16011640429496765, + "learning_rate": 2.894996150856033e-07, + "loss": 1.7227, + "step": 31498 + }, + { + "epoch": 9.668201350521793, + "grad_norm": 0.1319362372159958, + "learning_rate": 2.8896575190385246e-07, + "loss": 1.6923, + "step": 31499 + }, + { + "epoch": 9.668508287292818, + "grad_norm": 0.11635458469390869, + "learning_rate": 2.8843237999567897e-07, + "loss": 1.6809, + "step": 31500 + }, + { + "epoch": 9.668815224063843, + "grad_norm": 0.13584496080875397, + "learning_rate": 2.8789949936635643e-07, + "loss": 1.6735, + "step": 31501 + }, + { + "epoch": 9.669122160834869, + "grad_norm": 0.16113825142383575, + "learning_rate": 2.873671100211528e-07, + "loss": 1.7549, + "step": 31502 + 
}, + { + "epoch": 9.669429097605892, + "grad_norm": 0.20962439477443695, + "learning_rate": 2.8683521196531394e-07, + "loss": 1.7772, + "step": 31503 + }, + { + "epoch": 9.669736034376918, + "grad_norm": 0.10806034505367279, + "learning_rate": 2.863038052041134e-07, + "loss": 1.6577, + "step": 31504 + }, + { + "epoch": 9.670042971147943, + "grad_norm": 0.12040059268474579, + "learning_rate": 2.857728897427969e-07, + "loss": 1.6706, + "step": 31505 + }, + { + "epoch": 9.670349907918968, + "grad_norm": 0.1953112930059433, + "learning_rate": 2.8524246558661036e-07, + "loss": 1.7453, + "step": 31506 + }, + { + "epoch": 9.670656844689994, + "grad_norm": 0.17382025718688965, + "learning_rate": 2.8471253274079404e-07, + "loss": 1.7831, + "step": 31507 + }, + { + "epoch": 9.670963781461019, + "grad_norm": 0.18416909873485565, + "learning_rate": 2.8418309121058804e-07, + "loss": 1.7017, + "step": 31508 + }, + { + "epoch": 9.671270718232044, + "grad_norm": 0.21286524832248688, + "learning_rate": 2.836541410012272e-07, + "loss": 1.7681, + "step": 31509 + }, + { + "epoch": 9.67157765500307, + "grad_norm": 0.11352343112230301, + "learning_rate": 2.831256821179351e-07, + "loss": 1.7172, + "step": 31510 + }, + { + "epoch": 9.671884591774095, + "grad_norm": 0.14935974776744843, + "learning_rate": 2.825977145659298e-07, + "loss": 1.7043, + "step": 31511 + }, + { + "epoch": 9.67219152854512, + "grad_norm": 0.13719774782657623, + "learning_rate": 2.8207023835044035e-07, + "loss": 1.6705, + "step": 31512 + }, + { + "epoch": 9.672498465316146, + "grad_norm": 0.1506626158952713, + "learning_rate": 2.815432534766738e-07, + "loss": 1.7178, + "step": 31513 + }, + { + "epoch": 9.67280540208717, + "grad_norm": 0.11403869092464447, + "learning_rate": 2.81016759949837e-07, + "loss": 1.6708, + "step": 31514 + }, + { + "epoch": 9.673112338858195, + "grad_norm": 0.13970594108104706, + "learning_rate": 2.804907577751259e-07, + "loss": 1.7231, + "step": 31515 + }, + { + "epoch": 9.67341927562922, + 
"grad_norm": 0.21075570583343506, + "learning_rate": 2.7996524695775296e-07, + "loss": 1.7665, + "step": 31516 + }, + { + "epoch": 9.673726212400245, + "grad_norm": 0.2385234236717224, + "learning_rate": 2.7944022750290844e-07, + "loss": 1.7734, + "step": 31517 + }, + { + "epoch": 9.67403314917127, + "grad_norm": 0.1346839815378189, + "learning_rate": 2.7891569941577155e-07, + "loss": 1.7094, + "step": 31518 + }, + { + "epoch": 9.674340085942296, + "grad_norm": 0.2111053764820099, + "learning_rate": 2.7839166270153814e-07, + "loss": 1.7638, + "step": 31519 + }, + { + "epoch": 9.674647022713321, + "grad_norm": 0.14439715445041656, + "learning_rate": 2.7786811736537633e-07, + "loss": 1.7203, + "step": 31520 + }, + { + "epoch": 9.674953959484347, + "grad_norm": 0.14776118099689484, + "learning_rate": 2.773450634124708e-07, + "loss": 1.7155, + "step": 31521 + }, + { + "epoch": 9.675260896255372, + "grad_norm": 0.1370704621076584, + "learning_rate": 2.768225008479786e-07, + "loss": 1.7356, + "step": 31522 + }, + { + "epoch": 9.675567833026397, + "grad_norm": 0.10558994114398956, + "learning_rate": 2.7630042967707327e-07, + "loss": 1.6784, + "step": 31523 + }, + { + "epoch": 9.675874769797423, + "grad_norm": 0.13506318628787994, + "learning_rate": 2.757788499049063e-07, + "loss": 1.7027, + "step": 31524 + }, + { + "epoch": 9.676181706568446, + "grad_norm": 0.15606056153774261, + "learning_rate": 2.7525776153664585e-07, + "loss": 1.7167, + "step": 31525 + }, + { + "epoch": 9.676488643339471, + "grad_norm": 0.13950656354427338, + "learning_rate": 2.747371645774266e-07, + "loss": 1.7167, + "step": 31526 + }, + { + "epoch": 9.676795580110497, + "grad_norm": 0.11195974797010422, + "learning_rate": 2.742170590324e-07, + "loss": 1.6788, + "step": 31527 + }, + { + "epoch": 9.677102516881522, + "grad_norm": 0.13597041368484497, + "learning_rate": 2.7369744490670093e-07, + "loss": 1.7004, + "step": 31528 + }, + { + "epoch": 9.677409453652547, + "grad_norm": 0.1279800981283188, + 
"learning_rate": 2.731783222054807e-07, + "loss": 1.6619, + "step": 31529 + }, + { + "epoch": 9.677716390423573, + "grad_norm": 0.11803285032510757, + "learning_rate": 2.7265969093384635e-07, + "loss": 1.6938, + "step": 31530 + }, + { + "epoch": 9.678023327194598, + "grad_norm": 0.09654967486858368, + "learning_rate": 2.7214155109694384e-07, + "loss": 1.6319, + "step": 31531 + }, + { + "epoch": 9.678330263965623, + "grad_norm": 0.14024733006954193, + "learning_rate": 2.7162390269988015e-07, + "loss": 1.7084, + "step": 31532 + }, + { + "epoch": 9.678637200736649, + "grad_norm": 0.19366827607154846, + "learning_rate": 2.7110674574777895e-07, + "loss": 1.7398, + "step": 31533 + }, + { + "epoch": 9.678944137507674, + "grad_norm": 0.15786738693714142, + "learning_rate": 2.705900802457473e-07, + "loss": 1.797, + "step": 31534 + }, + { + "epoch": 9.6792510742787, + "grad_norm": 0.1426011621952057, + "learning_rate": 2.7007390619888663e-07, + "loss": 1.7103, + "step": 31535 + }, + { + "epoch": 9.679558011049723, + "grad_norm": 0.1344282180070877, + "learning_rate": 2.695582236123151e-07, + "loss": 1.7135, + "step": 31536 + }, + { + "epoch": 9.679864947820748, + "grad_norm": 0.10107547789812088, + "learning_rate": 2.690430324911064e-07, + "loss": 1.6692, + "step": 31537 + }, + { + "epoch": 9.680171884591774, + "grad_norm": 0.19397902488708496, + "learning_rate": 2.6852833284036205e-07, + "loss": 1.7143, + "step": 31538 + }, + { + "epoch": 9.680478821362799, + "grad_norm": 0.15761269629001617, + "learning_rate": 2.6801412466517794e-07, + "loss": 1.6885, + "step": 31539 + }, + { + "epoch": 9.680785758133824, + "grad_norm": 0.12093541026115417, + "learning_rate": 2.675004079706223e-07, + "loss": 1.6949, + "step": 31540 + }, + { + "epoch": 9.68109269490485, + "grad_norm": 0.2050214260816574, + "learning_rate": 2.6698718276177424e-07, + "loss": 1.7948, + "step": 31541 + }, + { + "epoch": 9.681399631675875, + "grad_norm": 0.1070958599448204, + "learning_rate": 
2.6647444904370766e-07, + "loss": 1.6969, + "step": 31542 + }, + { + "epoch": 9.6817065684469, + "grad_norm": 0.1629544496536255, + "learning_rate": 2.659622068214962e-07, + "loss": 1.6925, + "step": 31543 + }, + { + "epoch": 9.682013505217926, + "grad_norm": 0.15261006355285645, + "learning_rate": 2.6545045610019134e-07, + "loss": 1.7208, + "step": 31544 + }, + { + "epoch": 9.682320441988951, + "grad_norm": 0.2154887616634369, + "learning_rate": 2.649391968848558e-07, + "loss": 1.7614, + "step": 31545 + }, + { + "epoch": 9.682627378759975, + "grad_norm": 0.13233666121959686, + "learning_rate": 2.6442842918054657e-07, + "loss": 1.6904, + "step": 31546 + }, + { + "epoch": 9.682934315531, + "grad_norm": 0.10197919607162476, + "learning_rate": 2.6391815299230404e-07, + "loss": 1.6598, + "step": 31547 + }, + { + "epoch": 9.683241252302025, + "grad_norm": 0.14219482243061066, + "learning_rate": 2.634083683251742e-07, + "loss": 1.7124, + "step": 31548 + }, + { + "epoch": 9.68354818907305, + "grad_norm": 0.1305442601442337, + "learning_rate": 2.62899075184192e-07, + "loss": 1.6668, + "step": 31549 + }, + { + "epoch": 9.683855125844076, + "grad_norm": 0.13593846559524536, + "learning_rate": 2.6239027357439215e-07, + "loss": 1.7191, + "step": 31550 + }, + { + "epoch": 9.684162062615101, + "grad_norm": 0.16108329594135284, + "learning_rate": 2.618819635008041e-07, + "loss": 1.7046, + "step": 31551 + }, + { + "epoch": 9.684468999386127, + "grad_norm": 0.11882323026657104, + "learning_rate": 2.613741449684515e-07, + "loss": 1.6884, + "step": 31552 + }, + { + "epoch": 9.684775936157152, + "grad_norm": 0.15399985015392303, + "learning_rate": 2.6086681798235813e-07, + "loss": 1.7531, + "step": 31553 + }, + { + "epoch": 9.685082872928177, + "grad_norm": 0.14765115082263947, + "learning_rate": 2.6035998254752556e-07, + "loss": 1.7497, + "step": 31554 + }, + { + "epoch": 9.685389809699203, + "grad_norm": 0.13859078288078308, + "learning_rate": 2.5985363866897207e-07, + "loss": 
1.7389, + "step": 31555 + }, + { + "epoch": 9.685696746470228, + "grad_norm": 0.11727506667375565, + "learning_rate": 2.5934778635169355e-07, + "loss": 1.7132, + "step": 31556 + }, + { + "epoch": 9.686003683241251, + "grad_norm": 0.13762840628623962, + "learning_rate": 2.5884242560069715e-07, + "loss": 1.657, + "step": 31557 + }, + { + "epoch": 9.686310620012277, + "grad_norm": 0.107251837849617, + "learning_rate": 2.583375564209789e-07, + "loss": 1.6836, + "step": 31558 + }, + { + "epoch": 9.686617556783302, + "grad_norm": 0.11991941183805466, + "learning_rate": 2.578331788175181e-07, + "loss": 1.6818, + "step": 31559 + }, + { + "epoch": 9.686924493554327, + "grad_norm": 0.17173689603805542, + "learning_rate": 2.5732929279530524e-07, + "loss": 1.7343, + "step": 31560 + }, + { + "epoch": 9.687231430325353, + "grad_norm": 0.1334245204925537, + "learning_rate": 2.568258983593197e-07, + "loss": 1.7081, + "step": 31561 + }, + { + "epoch": 9.687538367096378, + "grad_norm": 0.1360604166984558, + "learning_rate": 2.563229955145352e-07, + "loss": 1.7082, + "step": 31562 + }, + { + "epoch": 9.687845303867404, + "grad_norm": 0.3039763569831848, + "learning_rate": 2.558205842659256e-07, + "loss": 1.7443, + "step": 31563 + }, + { + "epoch": 9.688152240638429, + "grad_norm": 0.17424632608890533, + "learning_rate": 2.55318664618448e-07, + "loss": 1.7217, + "step": 31564 + }, + { + "epoch": 9.688459177409454, + "grad_norm": 0.131890669465065, + "learning_rate": 2.5481723657707066e-07, + "loss": 1.6749, + "step": 31565 + }, + { + "epoch": 9.68876611418048, + "grad_norm": 0.12297450006008148, + "learning_rate": 2.543163001467452e-07, + "loss": 1.6953, + "step": 31566 + }, + { + "epoch": 9.689073050951503, + "grad_norm": 0.1080961599946022, + "learning_rate": 2.5381585533242325e-07, + "loss": 1.6325, + "step": 31567 + }, + { + "epoch": 9.689379987722528, + "grad_norm": 0.13262009620666504, + "learning_rate": 2.533159021390508e-07, + "loss": 1.6726, + "step": 31568 + }, + { + 
"epoch": 9.689686924493554, + "grad_norm": 0.1416144073009491, + "learning_rate": 2.5281644057156826e-07, + "loss": 1.6882, + "step": 31569 + }, + { + "epoch": 9.689993861264579, + "grad_norm": 0.16881975531578064, + "learning_rate": 2.5231747063491076e-07, + "loss": 1.7309, + "step": 31570 + }, + { + "epoch": 9.690300798035604, + "grad_norm": 0.15745007991790771, + "learning_rate": 2.518189923340075e-07, + "loss": 1.7729, + "step": 31571 + }, + { + "epoch": 9.69060773480663, + "grad_norm": 0.10348693281412125, + "learning_rate": 2.513210056737936e-07, + "loss": 1.6627, + "step": 31572 + }, + { + "epoch": 9.690914671577655, + "grad_norm": 0.11885415762662888, + "learning_rate": 2.5082351065917607e-07, + "loss": 1.6765, + "step": 31573 + }, + { + "epoch": 9.69122160834868, + "grad_norm": 0.1581162065267563, + "learning_rate": 2.5032650729508444e-07, + "loss": 1.7229, + "step": 31574 + }, + { + "epoch": 9.691528545119706, + "grad_norm": 0.18366876244544983, + "learning_rate": 2.4982999558642583e-07, + "loss": 1.7569, + "step": 31575 + }, + { + "epoch": 9.691835481890731, + "grad_norm": 0.1248086616396904, + "learning_rate": 2.493339755381074e-07, + "loss": 1.693, + "step": 31576 + }, + { + "epoch": 9.692142418661756, + "grad_norm": 0.10602928698062897, + "learning_rate": 2.4883844715503093e-07, + "loss": 1.6759, + "step": 31577 + }, + { + "epoch": 9.692449355432782, + "grad_norm": 0.12804557383060455, + "learning_rate": 2.4834341044208677e-07, + "loss": 1.6957, + "step": 31578 + }, + { + "epoch": 9.692756292203805, + "grad_norm": 0.14855320751667023, + "learning_rate": 2.4784886540417664e-07, + "loss": 1.7114, + "step": 31579 + }, + { + "epoch": 9.69306322897483, + "grad_norm": 0.10958930104970932, + "learning_rate": 2.47354812046191e-07, + "loss": 1.6565, + "step": 31580 + }, + { + "epoch": 9.693370165745856, + "grad_norm": 0.09138862043619156, + "learning_rate": 2.4686125037299833e-07, + "loss": 1.6219, + "step": 31581 + }, + { + "epoch": 9.693677102516881, + 
"grad_norm": 0.1569548100233078, + "learning_rate": 2.4636818038948906e-07, + "loss": 1.7323, + "step": 31582 + }, + { + "epoch": 9.693984039287907, + "grad_norm": 0.11487089842557907, + "learning_rate": 2.4587560210052593e-07, + "loss": 1.6862, + "step": 31583 + }, + { + "epoch": 9.694290976058932, + "grad_norm": 0.1095786914229393, + "learning_rate": 2.4538351551098293e-07, + "loss": 1.6837, + "step": 31584 + }, + { + "epoch": 9.694597912829957, + "grad_norm": 0.13925141096115112, + "learning_rate": 2.4489192062572277e-07, + "loss": 1.7273, + "step": 31585 + }, + { + "epoch": 9.694904849600983, + "grad_norm": 0.1378251016139984, + "learning_rate": 2.4440081744960264e-07, + "loss": 1.7314, + "step": 31586 + }, + { + "epoch": 9.695211786372008, + "grad_norm": 0.11287980526685715, + "learning_rate": 2.439102059874798e-07, + "loss": 1.6696, + "step": 31587 + }, + { + "epoch": 9.695518723143033, + "grad_norm": 0.13116686046123505, + "learning_rate": 2.4342008624419487e-07, + "loss": 1.7259, + "step": 31588 + }, + { + "epoch": 9.695825659914057, + "grad_norm": 0.1313004493713379, + "learning_rate": 2.4293045822459945e-07, + "loss": 1.6822, + "step": 31589 + }, + { + "epoch": 9.696132596685082, + "grad_norm": 0.10652224719524384, + "learning_rate": 2.4244132193352864e-07, + "loss": 1.6898, + "step": 31590 + }, + { + "epoch": 9.696439533456108, + "grad_norm": 0.16992691159248352, + "learning_rate": 2.4195267737581183e-07, + "loss": 1.6827, + "step": 31591 + }, + { + "epoch": 9.696746470227133, + "grad_norm": 0.13106754422187805, + "learning_rate": 2.414645245562841e-07, + "loss": 1.6914, + "step": 31592 + }, + { + "epoch": 9.697053406998158, + "grad_norm": 0.11900182068347931, + "learning_rate": 2.409768634797749e-07, + "loss": 1.6585, + "step": 31593 + }, + { + "epoch": 9.697360343769184, + "grad_norm": 0.16071198880672455, + "learning_rate": 2.4048969415109147e-07, + "loss": 1.6643, + "step": 31594 + }, + { + "epoch": 9.697667280540209, + "grad_norm": 
0.15770921111106873, + "learning_rate": 2.400030165750522e-07, + "loss": 1.6851, + "step": 31595 + }, + { + "epoch": 9.697974217311234, + "grad_norm": 0.14619939029216766, + "learning_rate": 2.3951683075646994e-07, + "loss": 1.6998, + "step": 31596 + }, + { + "epoch": 9.69828115408226, + "grad_norm": 0.13628968596458435, + "learning_rate": 2.3903113670015186e-07, + "loss": 1.7756, + "step": 31597 + }, + { + "epoch": 9.698588090853285, + "grad_norm": 0.12398962676525116, + "learning_rate": 2.385459344108887e-07, + "loss": 1.6911, + "step": 31598 + }, + { + "epoch": 9.69889502762431, + "grad_norm": 0.10792331397533417, + "learning_rate": 2.3806122389348761e-07, + "loss": 1.6899, + "step": 31599 + }, + { + "epoch": 9.699201964395334, + "grad_norm": 0.18480929732322693, + "learning_rate": 2.3757700515272264e-07, + "loss": 1.7635, + "step": 31600 + }, + { + "epoch": 9.699508901166359, + "grad_norm": 0.15164418518543243, + "learning_rate": 2.3709327819339543e-07, + "loss": 1.7156, + "step": 31601 + }, + { + "epoch": 9.699815837937384, + "grad_norm": 0.11357399821281433, + "learning_rate": 2.3661004302027444e-07, + "loss": 1.6584, + "step": 31602 + }, + { + "epoch": 9.70012277470841, + "grad_norm": 0.12062408030033112, + "learning_rate": 2.3612729963814473e-07, + "loss": 1.6768, + "step": 31603 + }, + { + "epoch": 9.700429711479435, + "grad_norm": 0.12755636870861053, + "learning_rate": 2.3564504805176912e-07, + "loss": 1.7017, + "step": 31604 + }, + { + "epoch": 9.70073664825046, + "grad_norm": 0.09880411624908447, + "learning_rate": 2.3516328826591605e-07, + "loss": 1.679, + "step": 31605 + }, + { + "epoch": 9.701043585021486, + "grad_norm": 0.18026186525821686, + "learning_rate": 2.3468202028535392e-07, + "loss": 1.7464, + "step": 31606 + }, + { + "epoch": 9.701350521792511, + "grad_norm": 0.10250361263751984, + "learning_rate": 2.342012441148289e-07, + "loss": 1.6973, + "step": 31607 + }, + { + "epoch": 9.701657458563536, + "grad_norm": 0.12328560650348663, + 
"learning_rate": 2.337209597590928e-07, + "loss": 1.6532, + "step": 31608 + }, + { + "epoch": 9.701964395334562, + "grad_norm": 0.10625593364238739, + "learning_rate": 2.3324116722289734e-07, + "loss": 1.714, + "step": 31609 + }, + { + "epoch": 9.702271332105585, + "grad_norm": 0.13381624221801758, + "learning_rate": 2.327618665109832e-07, + "loss": 1.7343, + "step": 31610 + }, + { + "epoch": 9.70257826887661, + "grad_norm": 0.14365731179714203, + "learning_rate": 2.3228305762808545e-07, + "loss": 1.6985, + "step": 31611 + }, + { + "epoch": 9.702885205647636, + "grad_norm": 0.12659169733524323, + "learning_rate": 2.3180474057893364e-07, + "loss": 1.682, + "step": 31612 + }, + { + "epoch": 9.703192142418661, + "grad_norm": 0.17809925973415375, + "learning_rate": 2.313269153682629e-07, + "loss": 1.7569, + "step": 31613 + }, + { + "epoch": 9.703499079189687, + "grad_norm": 0.15143713355064392, + "learning_rate": 2.308495820007861e-07, + "loss": 1.6935, + "step": 31614 + }, + { + "epoch": 9.703806015960712, + "grad_norm": 0.1365015059709549, + "learning_rate": 2.303727404812217e-07, + "loss": 1.7068, + "step": 31615 + }, + { + "epoch": 9.704112952731737, + "grad_norm": 0.13594263792037964, + "learning_rate": 2.2989639081428816e-07, + "loss": 1.7117, + "step": 31616 + }, + { + "epoch": 9.704419889502763, + "grad_norm": 0.10336802899837494, + "learning_rate": 2.2942053300468724e-07, + "loss": 1.6819, + "step": 31617 + }, + { + "epoch": 9.704726826273788, + "grad_norm": 0.19912748038768768, + "learning_rate": 2.2894516705713188e-07, + "loss": 1.7343, + "step": 31618 + }, + { + "epoch": 9.705033763044813, + "grad_norm": 0.15230657160282135, + "learning_rate": 2.2847029297630162e-07, + "loss": 1.7344, + "step": 31619 + }, + { + "epoch": 9.705340699815839, + "grad_norm": 0.18917563557624817, + "learning_rate": 2.2799591076690386e-07, + "loss": 1.7307, + "step": 31620 + }, + { + "epoch": 9.705647636586864, + "grad_norm": 0.1437673717737198, + "learning_rate": 
2.2752202043362924e-07, + "loss": 1.7074, + "step": 31621 + }, + { + "epoch": 9.705954573357888, + "grad_norm": 0.14478498697280884, + "learning_rate": 2.2704862198114628e-07, + "loss": 1.7035, + "step": 31622 + }, + { + "epoch": 9.706261510128913, + "grad_norm": 0.1284007877111435, + "learning_rate": 2.265757154141457e-07, + "loss": 1.7298, + "step": 31623 + }, + { + "epoch": 9.706568446899938, + "grad_norm": 0.1506684273481369, + "learning_rate": 2.261033007372959e-07, + "loss": 1.7109, + "step": 31624 + }, + { + "epoch": 9.706875383670964, + "grad_norm": 0.13655513525009155, + "learning_rate": 2.2563137795526545e-07, + "loss": 1.7158, + "step": 31625 + }, + { + "epoch": 9.707182320441989, + "grad_norm": 0.1190224140882492, + "learning_rate": 2.2515994707271725e-07, + "loss": 1.7004, + "step": 31626 + }, + { + "epoch": 9.707489257213014, + "grad_norm": 0.12282036989927292, + "learning_rate": 2.246890080943198e-07, + "loss": 1.6922, + "step": 31627 + }, + { + "epoch": 9.70779619398404, + "grad_norm": 0.12748375535011292, + "learning_rate": 2.2421856102471383e-07, + "loss": 1.7268, + "step": 31628 + }, + { + "epoch": 9.708103130755065, + "grad_norm": 0.12438757717609406, + "learning_rate": 2.2374860586855671e-07, + "loss": 1.6938, + "step": 31629 + }, + { + "epoch": 9.70841006752609, + "grad_norm": 0.11385367810726166, + "learning_rate": 2.2327914263048922e-07, + "loss": 1.6872, + "step": 31630 + }, + { + "epoch": 9.708717004297116, + "grad_norm": 0.13927948474884033, + "learning_rate": 2.2281017131515757e-07, + "loss": 1.7017, + "step": 31631 + }, + { + "epoch": 9.70902394106814, + "grad_norm": 0.15019075572490692, + "learning_rate": 2.2234169192718035e-07, + "loss": 1.7064, + "step": 31632 + }, + { + "epoch": 9.709330877839164, + "grad_norm": 0.12574142217636108, + "learning_rate": 2.2187370447120936e-07, + "loss": 1.7117, + "step": 31633 + }, + { + "epoch": 9.70963781461019, + "grad_norm": 0.13135144114494324, + "learning_rate": 2.2140620895185203e-07, + "loss": 
1.6889, + "step": 31634 + }, + { + "epoch": 9.709944751381215, + "grad_norm": 0.10573926568031311, + "learning_rate": 2.2093920537373803e-07, + "loss": 1.6459, + "step": 31635 + }, + { + "epoch": 9.71025168815224, + "grad_norm": 0.1492786854505539, + "learning_rate": 2.204726937414747e-07, + "loss": 1.725, + "step": 31636 + }, + { + "epoch": 9.710558624923266, + "grad_norm": 0.17757928371429443, + "learning_rate": 2.2000667405968067e-07, + "loss": 1.7061, + "step": 31637 + }, + { + "epoch": 9.710865561694291, + "grad_norm": 0.15048767626285553, + "learning_rate": 2.1954114633295774e-07, + "loss": 1.7005, + "step": 31638 + }, + { + "epoch": 9.711172498465316, + "grad_norm": 0.12468421459197998, + "learning_rate": 2.1907611056590226e-07, + "loss": 1.7114, + "step": 31639 + }, + { + "epoch": 9.711479435236342, + "grad_norm": 0.13750670850276947, + "learning_rate": 2.1861156676312167e-07, + "loss": 1.6938, + "step": 31640 + }, + { + "epoch": 9.711786372007367, + "grad_norm": 0.13854531943798065, + "learning_rate": 2.181475149291956e-07, + "loss": 1.7466, + "step": 31641 + }, + { + "epoch": 9.712093308778392, + "grad_norm": 0.18358713388442993, + "learning_rate": 2.176839550687093e-07, + "loss": 1.7657, + "step": 31642 + }, + { + "epoch": 9.712400245549416, + "grad_norm": 0.0844811424612999, + "learning_rate": 2.1722088718625354e-07, + "loss": 1.6235, + "step": 31643 + }, + { + "epoch": 9.712707182320441, + "grad_norm": 0.12278879433870316, + "learning_rate": 2.167583112863969e-07, + "loss": 1.6819, + "step": 31644 + }, + { + "epoch": 9.713014119091467, + "grad_norm": 0.13768786191940308, + "learning_rate": 2.16296227373719e-07, + "loss": 1.7045, + "step": 31645 + }, + { + "epoch": 9.713321055862492, + "grad_norm": 0.15438923239707947, + "learning_rate": 2.1583463545277739e-07, + "loss": 1.7322, + "step": 31646 + }, + { + "epoch": 9.713627992633517, + "grad_norm": 0.19160570204257965, + "learning_rate": 2.1537353552813498e-07, + "loss": 1.8026, + "step": 31647 + }, + { 
+ "epoch": 9.713934929404543, + "grad_norm": 0.11172829568386078, + "learning_rate": 2.149129276043549e-07, + "loss": 1.6935, + "step": 31648 + }, + { + "epoch": 9.714241866175568, + "grad_norm": 0.16613627970218658, + "learning_rate": 2.1445281168598342e-07, + "loss": 1.6866, + "step": 31649 + }, + { + "epoch": 9.714548802946593, + "grad_norm": 0.12793566286563873, + "learning_rate": 2.1399318777756695e-07, + "loss": 1.6836, + "step": 31650 + }, + { + "epoch": 9.714855739717619, + "grad_norm": 0.13563989102840424, + "learning_rate": 2.1353405588365182e-07, + "loss": 1.6679, + "step": 31651 + }, + { + "epoch": 9.715162676488644, + "grad_norm": 0.15428829193115234, + "learning_rate": 2.1307541600877888e-07, + "loss": 1.6762, + "step": 31652 + }, + { + "epoch": 9.715469613259668, + "grad_norm": 0.14353898167610168, + "learning_rate": 2.1261726815746673e-07, + "loss": 1.6908, + "step": 31653 + }, + { + "epoch": 9.715776550030693, + "grad_norm": 0.12383358925580978, + "learning_rate": 2.1215961233426174e-07, + "loss": 1.6732, + "step": 31654 + }, + { + "epoch": 9.716083486801718, + "grad_norm": 0.14675362408161163, + "learning_rate": 2.117024485436714e-07, + "loss": 1.7162, + "step": 31655 + }, + { + "epoch": 9.716390423572744, + "grad_norm": 0.11572350561618805, + "learning_rate": 2.1124577679021985e-07, + "loss": 1.631, + "step": 31656 + }, + { + "epoch": 9.716697360343769, + "grad_norm": 0.1518344134092331, + "learning_rate": 2.1078959707842015e-07, + "loss": 1.7398, + "step": 31657 + }, + { + "epoch": 9.717004297114794, + "grad_norm": 0.11649021506309509, + "learning_rate": 2.1033390941277985e-07, + "loss": 1.6581, + "step": 31658 + }, + { + "epoch": 9.71731123388582, + "grad_norm": 0.12223310023546219, + "learning_rate": 2.098787137978009e-07, + "loss": 1.7291, + "step": 31659 + }, + { + "epoch": 9.717618170656845, + "grad_norm": 0.15575721859931946, + "learning_rate": 2.094240102379852e-07, + "loss": 1.686, + "step": 31660 + }, + { + "epoch": 9.71792510742787, + 
"grad_norm": 0.10441846400499344, + "learning_rate": 2.0896979873782918e-07, + "loss": 1.6717, + "step": 31661 + }, + { + "epoch": 9.718232044198896, + "grad_norm": 0.13644640147686005, + "learning_rate": 2.0851607930180706e-07, + "loss": 1.715, + "step": 31662 + }, + { + "epoch": 9.718538980969921, + "grad_norm": 0.1860501617193222, + "learning_rate": 2.0806285193442077e-07, + "loss": 1.707, + "step": 31663 + }, + { + "epoch": 9.718845917740946, + "grad_norm": 0.12100571393966675, + "learning_rate": 2.0761011664013897e-07, + "loss": 1.6997, + "step": 31664 + }, + { + "epoch": 9.71915285451197, + "grad_norm": 0.09347312152385712, + "learning_rate": 2.0715787342343584e-07, + "loss": 1.6688, + "step": 31665 + }, + { + "epoch": 9.719459791282995, + "grad_norm": 0.19816496968269348, + "learning_rate": 2.067061222887856e-07, + "loss": 1.713, + "step": 31666 + }, + { + "epoch": 9.71976672805402, + "grad_norm": 0.16399987041950226, + "learning_rate": 2.0625486324065135e-07, + "loss": 1.7973, + "step": 31667 + }, + { + "epoch": 9.720073664825046, + "grad_norm": 0.12751246988773346, + "learning_rate": 2.058040962834906e-07, + "loss": 1.7204, + "step": 31668 + }, + { + "epoch": 9.720380601596071, + "grad_norm": 0.16934554278850555, + "learning_rate": 2.0535382142176096e-07, + "loss": 1.7078, + "step": 31669 + }, + { + "epoch": 9.720687538367097, + "grad_norm": 0.18634845316410065, + "learning_rate": 2.0490403865990325e-07, + "loss": 1.7486, + "step": 31670 + }, + { + "epoch": 9.720994475138122, + "grad_norm": 0.1632041186094284, + "learning_rate": 2.0445474800237508e-07, + "loss": 1.7102, + "step": 31671 + }, + { + "epoch": 9.721301411909147, + "grad_norm": 0.13699625432491302, + "learning_rate": 2.0400594945361172e-07, + "loss": 1.7066, + "step": 31672 + }, + { + "epoch": 9.721608348680173, + "grad_norm": 0.11776915192604065, + "learning_rate": 2.0355764301804858e-07, + "loss": 1.7083, + "step": 31673 + }, + { + "epoch": 9.721915285451198, + "grad_norm": 
0.10446945577859879, + "learning_rate": 2.031098287001154e-07, + "loss": 1.6481, + "step": 31674 + }, + { + "epoch": 9.722222222222221, + "grad_norm": 0.09323536604642868, + "learning_rate": 2.026625065042309e-07, + "loss": 1.6474, + "step": 31675 + }, + { + "epoch": 9.722529158993247, + "grad_norm": 0.11908341199159622, + "learning_rate": 2.022156764348304e-07, + "loss": 1.6824, + "step": 31676 + }, + { + "epoch": 9.722836095764272, + "grad_norm": 0.14512252807617188, + "learning_rate": 2.0176933849631596e-07, + "loss": 1.6755, + "step": 31677 + }, + { + "epoch": 9.723143032535297, + "grad_norm": 0.11505481600761414, + "learning_rate": 2.0132349269311178e-07, + "loss": 1.6893, + "step": 31678 + }, + { + "epoch": 9.723449969306323, + "grad_norm": 0.12112774699926376, + "learning_rate": 2.0087813902960884e-07, + "loss": 1.6801, + "step": 31679 + }, + { + "epoch": 9.723756906077348, + "grad_norm": 0.14546430110931396, + "learning_rate": 2.0043327751022579e-07, + "loss": 1.7217, + "step": 31680 + }, + { + "epoch": 9.724063842848373, + "grad_norm": 0.1829695701599121, + "learning_rate": 1.9998890813934247e-07, + "loss": 1.7101, + "step": 31681 + }, + { + "epoch": 9.724370779619399, + "grad_norm": 0.1490027755498886, + "learning_rate": 1.995450309213609e-07, + "loss": 1.7312, + "step": 31682 + }, + { + "epoch": 9.724677716390424, + "grad_norm": 0.14473678171634674, + "learning_rate": 1.9910164586066093e-07, + "loss": 1.6541, + "step": 31683 + }, + { + "epoch": 9.72498465316145, + "grad_norm": 0.15499809384346008, + "learning_rate": 1.9865875296162793e-07, + "loss": 1.7028, + "step": 31684 + }, + { + "epoch": 9.725291589932475, + "grad_norm": 0.17470933496952057, + "learning_rate": 1.9821635222864176e-07, + "loss": 1.7429, + "step": 31685 + }, + { + "epoch": 9.725598526703498, + "grad_norm": 0.16423273086547852, + "learning_rate": 1.977744436660711e-07, + "loss": 1.6993, + "step": 31686 + }, + { + "epoch": 9.725905463474524, + "grad_norm": 0.1388999968767166, + 
"learning_rate": 1.9733302727827918e-07, + "loss": 1.7257, + "step": 31687 + }, + { + "epoch": 9.726212400245549, + "grad_norm": 0.12693311274051666, + "learning_rate": 1.9689210306963467e-07, + "loss": 1.6897, + "step": 31688 + }, + { + "epoch": 9.726519337016574, + "grad_norm": 0.15273548662662506, + "learning_rate": 1.9645167104449524e-07, + "loss": 1.7357, + "step": 31689 + }, + { + "epoch": 9.7268262737876, + "grad_norm": 0.15459993481636047, + "learning_rate": 1.960117312072074e-07, + "loss": 1.7333, + "step": 31690 + }, + { + "epoch": 9.727133210558625, + "grad_norm": 0.13200953602790833, + "learning_rate": 1.9557228356212875e-07, + "loss": 1.6882, + "step": 31691 + }, + { + "epoch": 9.72744014732965, + "grad_norm": 0.10575802624225616, + "learning_rate": 1.9513332811358919e-07, + "loss": 1.6204, + "step": 31692 + }, + { + "epoch": 9.727747084100676, + "grad_norm": 0.20376244187355042, + "learning_rate": 1.9469486486593525e-07, + "loss": 1.7743, + "step": 31693 + }, + { + "epoch": 9.728054020871701, + "grad_norm": 0.10688602179288864, + "learning_rate": 1.9425689382350232e-07, + "loss": 1.6624, + "step": 31694 + }, + { + "epoch": 9.728360957642726, + "grad_norm": 0.15587441623210907, + "learning_rate": 1.9381941499060918e-07, + "loss": 1.6399, + "step": 31695 + }, + { + "epoch": 9.72866789441375, + "grad_norm": 0.1695834845304489, + "learning_rate": 1.9338242837159126e-07, + "loss": 1.7186, + "step": 31696 + }, + { + "epoch": 9.728974831184775, + "grad_norm": 0.14353398978710175, + "learning_rate": 1.9294593397075623e-07, + "loss": 1.6899, + "step": 31697 + }, + { + "epoch": 9.7292817679558, + "grad_norm": 0.12760649621486664, + "learning_rate": 1.9250993179242284e-07, + "loss": 1.7436, + "step": 31698 + }, + { + "epoch": 9.729588704726826, + "grad_norm": 0.16516657173633575, + "learning_rate": 1.920744218409043e-07, + "loss": 1.7082, + "step": 31699 + }, + { + "epoch": 9.729895641497851, + "grad_norm": 0.10934159904718399, + "learning_rate": 
1.9163940412049165e-07, + "loss": 1.6901, + "step": 31700 + }, + { + "epoch": 9.730202578268877, + "grad_norm": 0.16668133437633514, + "learning_rate": 1.9120487863549807e-07, + "loss": 1.7259, + "step": 31701 + }, + { + "epoch": 9.730509515039902, + "grad_norm": 0.12656927108764648, + "learning_rate": 1.9077084539020908e-07, + "loss": 1.6655, + "step": 31702 + }, + { + "epoch": 9.730816451810927, + "grad_norm": 0.13380050659179688, + "learning_rate": 1.903373043889156e-07, + "loss": 1.7056, + "step": 31703 + }, + { + "epoch": 9.731123388581953, + "grad_norm": 0.19093738496303558, + "learning_rate": 1.8990425563590319e-07, + "loss": 1.7915, + "step": 31704 + }, + { + "epoch": 9.731430325352978, + "grad_norm": 0.14314888417720795, + "learning_rate": 1.8947169913545725e-07, + "loss": 1.6864, + "step": 31705 + }, + { + "epoch": 9.731737262124003, + "grad_norm": 0.12564614415168762, + "learning_rate": 1.8903963489184107e-07, + "loss": 1.6877, + "step": 31706 + }, + { + "epoch": 9.732044198895027, + "grad_norm": 0.15374313294887543, + "learning_rate": 1.8860806290932897e-07, + "loss": 1.6809, + "step": 31707 + }, + { + "epoch": 9.732351135666052, + "grad_norm": 0.16379213333129883, + "learning_rate": 1.8817698319219535e-07, + "loss": 1.7481, + "step": 31708 + }, + { + "epoch": 9.732658072437077, + "grad_norm": 0.24672576785087585, + "learning_rate": 1.8774639574468677e-07, + "loss": 1.7644, + "step": 31709 + }, + { + "epoch": 9.732965009208103, + "grad_norm": 0.13296177983283997, + "learning_rate": 1.8731630057106653e-07, + "loss": 1.7087, + "step": 31710 + }, + { + "epoch": 9.733271945979128, + "grad_norm": 0.12447187304496765, + "learning_rate": 1.868866976755812e-07, + "loss": 1.7066, + "step": 31711 + }, + { + "epoch": 9.733578882750153, + "grad_norm": 0.15150503814220428, + "learning_rate": 1.8645758706247741e-07, + "loss": 1.7114, + "step": 31712 + }, + { + "epoch": 9.733885819521179, + "grad_norm": 0.1416541039943695, + "learning_rate": 1.8602896873599619e-07, + 
"loss": 1.695, + "step": 31713 + }, + { + "epoch": 9.734192756292204, + "grad_norm": 0.18281929194927216, + "learning_rate": 1.8560084270037304e-07, + "loss": 1.6934, + "step": 31714 + }, + { + "epoch": 9.73449969306323, + "grad_norm": 0.16674144566059113, + "learning_rate": 1.8517320895984347e-07, + "loss": 1.7168, + "step": 31715 + }, + { + "epoch": 9.734806629834255, + "grad_norm": 0.12002552300691605, + "learning_rate": 1.8474606751862632e-07, + "loss": 1.6761, + "step": 31716 + }, + { + "epoch": 9.735113566605278, + "grad_norm": 0.12910617887973785, + "learning_rate": 1.84319418380946e-07, + "loss": 1.6985, + "step": 31717 + }, + { + "epoch": 9.735420503376304, + "grad_norm": 0.14131152629852295, + "learning_rate": 1.838932615510214e-07, + "loss": 1.6874, + "step": 31718 + }, + { + "epoch": 9.735727440147329, + "grad_norm": 0.11726522445678711, + "learning_rate": 1.834675970330546e-07, + "loss": 1.6776, + "step": 31719 + }, + { + "epoch": 9.736034376918354, + "grad_norm": 0.13109390437602997, + "learning_rate": 1.830424248312701e-07, + "loss": 1.7053, + "step": 31720 + }, + { + "epoch": 9.73634131368938, + "grad_norm": 0.17507077753543854, + "learning_rate": 1.826177449498534e-07, + "loss": 1.7277, + "step": 31721 + }, + { + "epoch": 9.736648250460405, + "grad_norm": 0.17200970649719238, + "learning_rate": 1.821935573930067e-07, + "loss": 1.6439, + "step": 31722 + }, + { + "epoch": 9.73695518723143, + "grad_norm": 0.11237013339996338, + "learning_rate": 1.8176986216492665e-07, + "loss": 1.6619, + "step": 31723 + }, + { + "epoch": 9.737262124002456, + "grad_norm": 0.1799420714378357, + "learning_rate": 1.8134665926978767e-07, + "loss": 1.7776, + "step": 31724 + }, + { + "epoch": 9.737569060773481, + "grad_norm": 0.13868120312690735, + "learning_rate": 1.809239487117864e-07, + "loss": 1.7625, + "step": 31725 + }, + { + "epoch": 9.737875997544506, + "grad_norm": 0.2101743519306183, + "learning_rate": 1.8050173049509177e-07, + "loss": 1.7365, + "step": 31726 + }, 
+ { + "epoch": 9.738182934315532, + "grad_norm": 0.14918360114097595, + "learning_rate": 1.8008000462388375e-07, + "loss": 1.724, + "step": 31727 + }, + { + "epoch": 9.738489871086557, + "grad_norm": 0.14308972656726837, + "learning_rate": 1.7965877110232565e-07, + "loss": 1.7002, + "step": 31728 + }, + { + "epoch": 9.73879680785758, + "grad_norm": 0.09658967703580856, + "learning_rate": 1.792380299345753e-07, + "loss": 1.659, + "step": 31729 + }, + { + "epoch": 9.739103744628606, + "grad_norm": 0.14846646785736084, + "learning_rate": 1.7881778112479596e-07, + "loss": 1.7108, + "step": 31730 + }, + { + "epoch": 9.739410681399631, + "grad_norm": 0.13030952215194702, + "learning_rate": 1.783980246771455e-07, + "loss": 1.6758, + "step": 31731 + }, + { + "epoch": 9.739717618170657, + "grad_norm": 0.15918081998825073, + "learning_rate": 1.77978760595765e-07, + "loss": 1.7569, + "step": 31732 + }, + { + "epoch": 9.740024554941682, + "grad_norm": 0.15745976567268372, + "learning_rate": 1.7755998888479563e-07, + "loss": 1.7562, + "step": 31733 + }, + { + "epoch": 9.740331491712707, + "grad_norm": 0.21078935265541077, + "learning_rate": 1.7714170954838405e-07, + "loss": 1.8209, + "step": 31734 + }, + { + "epoch": 9.740638428483733, + "grad_norm": 0.14896774291992188, + "learning_rate": 1.767239225906603e-07, + "loss": 1.6878, + "step": 31735 + }, + { + "epoch": 9.740945365254758, + "grad_norm": 0.146200031042099, + "learning_rate": 1.7630662801575438e-07, + "loss": 1.7155, + "step": 31736 + }, + { + "epoch": 9.741252302025783, + "grad_norm": 0.09577162563800812, + "learning_rate": 1.7588982582778523e-07, + "loss": 1.6537, + "step": 31737 + }, + { + "epoch": 9.741559238796809, + "grad_norm": 0.09476766735315323, + "learning_rate": 1.7547351603088292e-07, + "loss": 1.6383, + "step": 31738 + }, + { + "epoch": 9.741866175567832, + "grad_norm": 0.166427344083786, + "learning_rate": 1.7505769862914412e-07, + "loss": 1.7393, + "step": 31739 + }, + { + "epoch": 9.742173112338858, + 
"grad_norm": 0.12341952323913574, + "learning_rate": 1.7464237362669333e-07, + "loss": 1.6721, + "step": 31740 + }, + { + "epoch": 9.742480049109883, + "grad_norm": 0.16770128905773163, + "learning_rate": 1.7422754102763283e-07, + "loss": 1.7883, + "step": 31741 + }, + { + "epoch": 9.742786985880908, + "grad_norm": 0.099067822098732, + "learning_rate": 1.7381320083605935e-07, + "loss": 1.6347, + "step": 31742 + }, + { + "epoch": 9.743093922651934, + "grad_norm": 0.12981869280338287, + "learning_rate": 1.7339935305606404e-07, + "loss": 1.7064, + "step": 31743 + }, + { + "epoch": 9.743400859422959, + "grad_norm": 0.12666809558868408, + "learning_rate": 1.7298599769173806e-07, + "loss": 1.6429, + "step": 31744 + }, + { + "epoch": 9.743707796193984, + "grad_norm": 0.14717376232147217, + "learning_rate": 1.7257313474717817e-07, + "loss": 1.7033, + "step": 31745 + }, + { + "epoch": 9.74401473296501, + "grad_norm": 0.12324973195791245, + "learning_rate": 1.7216076422644777e-07, + "loss": 1.7018, + "step": 31746 + }, + { + "epoch": 9.744321669736035, + "grad_norm": 0.15551744401454926, + "learning_rate": 1.717488861336325e-07, + "loss": 1.7483, + "step": 31747 + }, + { + "epoch": 9.74462860650706, + "grad_norm": 0.1447838693857193, + "learning_rate": 1.7133750047280128e-07, + "loss": 1.709, + "step": 31748 + }, + { + "epoch": 9.744935543278086, + "grad_norm": 0.10742588341236115, + "learning_rate": 1.7092660724801756e-07, + "loss": 1.6927, + "step": 31749 + }, + { + "epoch": 9.745242480049109, + "grad_norm": 0.16011138260364532, + "learning_rate": 1.7051620646333922e-07, + "loss": 1.7005, + "step": 31750 + }, + { + "epoch": 9.745549416820134, + "grad_norm": 0.11633095890283585, + "learning_rate": 1.7010629812282962e-07, + "loss": 1.6779, + "step": 31751 + }, + { + "epoch": 9.74585635359116, + "grad_norm": 0.14908172190189362, + "learning_rate": 1.696968822305356e-07, + "loss": 1.7064, + "step": 31752 + }, + { + "epoch": 9.746163290362185, + "grad_norm": 
0.11630599200725555, + "learning_rate": 1.692879587904983e-07, + "loss": 1.6689, + "step": 31753 + }, + { + "epoch": 9.74647022713321, + "grad_norm": 0.15253309905529022, + "learning_rate": 1.6887952780677008e-07, + "loss": 1.6946, + "step": 31754 + }, + { + "epoch": 9.746777163904236, + "grad_norm": 0.11310866475105286, + "learning_rate": 1.6847158928338103e-07, + "loss": 1.6834, + "step": 31755 + }, + { + "epoch": 9.747084100675261, + "grad_norm": 0.13006237149238586, + "learning_rate": 1.6806414322436127e-07, + "loss": 1.6968, + "step": 31756 + }, + { + "epoch": 9.747391037446286, + "grad_norm": 0.16225489974021912, + "learning_rate": 1.676571896337409e-07, + "loss": 1.7624, + "step": 31757 + }, + { + "epoch": 9.747697974217312, + "grad_norm": 0.16863548755645752, + "learning_rate": 1.672507285155389e-07, + "loss": 1.8187, + "step": 31758 + }, + { + "epoch": 9.748004910988337, + "grad_norm": 0.17859725654125214, + "learning_rate": 1.6684475987377434e-07, + "loss": 1.7323, + "step": 31759 + }, + { + "epoch": 9.74831184775936, + "grad_norm": 0.14921754598617554, + "learning_rate": 1.664392837124551e-07, + "loss": 1.6843, + "step": 31760 + }, + { + "epoch": 9.748618784530386, + "grad_norm": 0.12703189253807068, + "learning_rate": 1.6603430003558906e-07, + "loss": 1.6854, + "step": 31761 + }, + { + "epoch": 9.748925721301411, + "grad_norm": 0.09212498366832733, + "learning_rate": 1.6562980884718414e-07, + "loss": 1.6299, + "step": 31762 + }, + { + "epoch": 9.749232658072437, + "grad_norm": 0.14094288647174835, + "learning_rate": 1.6522581015123718e-07, + "loss": 1.7041, + "step": 31763 + }, + { + "epoch": 9.749539594843462, + "grad_norm": 0.16065463423728943, + "learning_rate": 1.6482230395173382e-07, + "loss": 1.7474, + "step": 31764 + }, + { + "epoch": 9.749846531614487, + "grad_norm": 0.13509607315063477, + "learning_rate": 1.6441929025266533e-07, + "loss": 1.7262, + "step": 31765 + }, + { + "epoch": 9.750153468385513, + "grad_norm": 0.20273075997829437, + 
"learning_rate": 1.6401676905801743e-07, + "loss": 1.7401, + "step": 31766 + }, + { + "epoch": 9.750460405156538, + "grad_norm": 0.14658035337924957, + "learning_rate": 1.6361474037176473e-07, + "loss": 1.766, + "step": 31767 + }, + { + "epoch": 9.750767341927563, + "grad_norm": 0.13443495333194733, + "learning_rate": 1.6321320419788177e-07, + "loss": 1.6687, + "step": 31768 + }, + { + "epoch": 9.751074278698589, + "grad_norm": 0.15590953826904297, + "learning_rate": 1.628121605403321e-07, + "loss": 1.6915, + "step": 31769 + }, + { + "epoch": 9.751381215469614, + "grad_norm": 0.11670281738042831, + "learning_rate": 1.6241160940308476e-07, + "loss": 1.7105, + "step": 31770 + }, + { + "epoch": 9.75168815224064, + "grad_norm": 0.12388762086629868, + "learning_rate": 1.6201155079010322e-07, + "loss": 1.7009, + "step": 31771 + }, + { + "epoch": 9.751995089011663, + "grad_norm": 0.15842701494693756, + "learning_rate": 1.6161198470532878e-07, + "loss": 1.7133, + "step": 31772 + }, + { + "epoch": 9.752302025782688, + "grad_norm": 0.1379842609167099, + "learning_rate": 1.612129111527194e-07, + "loss": 1.7162, + "step": 31773 + }, + { + "epoch": 9.752608962553714, + "grad_norm": 0.1491837501525879, + "learning_rate": 1.6081433013621084e-07, + "loss": 1.7535, + "step": 31774 + }, + { + "epoch": 9.752915899324739, + "grad_norm": 0.17003466188907623, + "learning_rate": 1.6041624165974989e-07, + "loss": 1.746, + "step": 31775 + }, + { + "epoch": 9.753222836095764, + "grad_norm": 0.15136978030204773, + "learning_rate": 1.6001864572726676e-07, + "loss": 1.7365, + "step": 31776 + }, + { + "epoch": 9.75352977286679, + "grad_norm": 0.12367337197065353, + "learning_rate": 1.596215423426861e-07, + "loss": 1.706, + "step": 31777 + }, + { + "epoch": 9.753836709637815, + "grad_norm": 0.16162081062793732, + "learning_rate": 1.5922493150994365e-07, + "loss": 1.7657, + "step": 31778 + }, + { + "epoch": 9.75414364640884, + "grad_norm": 0.11211063712835312, + "learning_rate": 
1.5882881323295294e-07, + "loss": 1.6785, + "step": 31779 + }, + { + "epoch": 9.754450583179866, + "grad_norm": 0.12570419907569885, + "learning_rate": 1.584331875156275e-07, + "loss": 1.6994, + "step": 31780 + }, + { + "epoch": 9.754757519950891, + "grad_norm": 0.11583932489156723, + "learning_rate": 1.5803805436188092e-07, + "loss": 1.6528, + "step": 31781 + }, + { + "epoch": 9.755064456721914, + "grad_norm": 0.1248580664396286, + "learning_rate": 1.5764341377561554e-07, + "loss": 1.7237, + "step": 31782 + }, + { + "epoch": 9.75537139349294, + "grad_norm": 0.1245606392621994, + "learning_rate": 1.572492657607283e-07, + "loss": 1.6775, + "step": 31783 + }, + { + "epoch": 9.755678330263965, + "grad_norm": 0.13548308610916138, + "learning_rate": 1.5685561032111607e-07, + "loss": 1.7073, + "step": 31784 + }, + { + "epoch": 9.75598526703499, + "grad_norm": 0.12355189025402069, + "learning_rate": 1.5646244746067572e-07, + "loss": 1.6977, + "step": 31785 + }, + { + "epoch": 9.756292203806016, + "grad_norm": 0.14441610872745514, + "learning_rate": 1.5606977718328197e-07, + "loss": 1.7283, + "step": 31786 + }, + { + "epoch": 9.756599140577041, + "grad_norm": 0.12972392141819, + "learning_rate": 1.556775994928261e-07, + "loss": 1.7062, + "step": 31787 + }, + { + "epoch": 9.756906077348066, + "grad_norm": 0.1203489899635315, + "learning_rate": 1.552859143931773e-07, + "loss": 1.6927, + "step": 31788 + }, + { + "epoch": 9.757213014119092, + "grad_norm": 0.14732889831066132, + "learning_rate": 1.548947218882102e-07, + "loss": 1.7197, + "step": 31789 + }, + { + "epoch": 9.757519950890117, + "grad_norm": 0.20930984616279602, + "learning_rate": 1.5450402198178283e-07, + "loss": 1.7926, + "step": 31790 + }, + { + "epoch": 9.757826887661142, + "grad_norm": 0.15674839913845062, + "learning_rate": 1.5411381467776986e-07, + "loss": 1.7204, + "step": 31791 + }, + { + "epoch": 9.758133824432168, + "grad_norm": 0.15836498141288757, + "learning_rate": 1.537240999800127e-07, + "loss": 
1.7032, + "step": 31792 + }, + { + "epoch": 9.758440761203191, + "grad_norm": 0.11274401843547821, + "learning_rate": 1.5333487789237488e-07, + "loss": 1.6818, + "step": 31793 + }, + { + "epoch": 9.758747697974217, + "grad_norm": 0.10347522795200348, + "learning_rate": 1.5294614841869226e-07, + "loss": 1.6521, + "step": 31794 + }, + { + "epoch": 9.759054634745242, + "grad_norm": 0.17067036032676697, + "learning_rate": 1.525579115628173e-07, + "loss": 1.716, + "step": 31795 + }, + { + "epoch": 9.759361571516267, + "grad_norm": 0.11318463832139969, + "learning_rate": 1.5217016732858024e-07, + "loss": 1.6751, + "step": 31796 + }, + { + "epoch": 9.759668508287293, + "grad_norm": 0.15316587686538696, + "learning_rate": 1.517829157198114e-07, + "loss": 1.7025, + "step": 31797 + }, + { + "epoch": 9.759975445058318, + "grad_norm": 0.13108935952186584, + "learning_rate": 1.5139615674034658e-07, + "loss": 1.671, + "step": 31798 + }, + { + "epoch": 9.760282381829343, + "grad_norm": 0.09524109214544296, + "learning_rate": 1.5100989039399939e-07, + "loss": 1.6556, + "step": 31799 + }, + { + "epoch": 9.760589318600369, + "grad_norm": 0.13735005259513855, + "learning_rate": 1.5062411668458898e-07, + "loss": 1.6774, + "step": 31800 + }, + { + "epoch": 9.760896255371394, + "grad_norm": 0.09719503670930862, + "learning_rate": 1.50238835615929e-07, + "loss": 1.6379, + "step": 31801 + }, + { + "epoch": 9.76120319214242, + "grad_norm": 0.10058867186307907, + "learning_rate": 1.4985404719182194e-07, + "loss": 1.6724, + "step": 31802 + }, + { + "epoch": 9.761510128913443, + "grad_norm": 0.15335088968276978, + "learning_rate": 1.4946975141608143e-07, + "loss": 1.7056, + "step": 31803 + }, + { + "epoch": 9.761817065684468, + "grad_norm": 0.10160773992538452, + "learning_rate": 1.4908594829249889e-07, + "loss": 1.679, + "step": 31804 + }, + { + "epoch": 9.762124002455494, + "grad_norm": 0.14888420701026917, + "learning_rate": 1.487026378248657e-07, + "loss": 1.7528, + "step": 31805 + }, + { 
+ "epoch": 9.762430939226519, + "grad_norm": 0.15325312316417694, + "learning_rate": 1.483198200169733e-07, + "loss": 1.7099, + "step": 31806 + }, + { + "epoch": 9.762737875997544, + "grad_norm": 0.15677091479301453, + "learning_rate": 1.47937494872602e-07, + "loss": 1.7254, + "step": 31807 + }, + { + "epoch": 9.76304481276857, + "grad_norm": 0.1600019633769989, + "learning_rate": 1.4755566239553209e-07, + "loss": 1.7397, + "step": 31808 + }, + { + "epoch": 9.763351749539595, + "grad_norm": 0.12324594706296921, + "learning_rate": 1.4717432258953834e-07, + "loss": 1.6872, + "step": 31809 + }, + { + "epoch": 9.76365868631062, + "grad_norm": 0.14790895581245422, + "learning_rate": 1.467934754583844e-07, + "loss": 1.7191, + "step": 31810 + }, + { + "epoch": 9.763965623081646, + "grad_norm": 0.17064516246318817, + "learning_rate": 1.464131210058395e-07, + "loss": 1.7568, + "step": 31811 + }, + { + "epoch": 9.764272559852671, + "grad_norm": 0.10418350994586945, + "learning_rate": 1.460332592356617e-07, + "loss": 1.6212, + "step": 31812 + }, + { + "epoch": 9.764579496623696, + "grad_norm": 0.13646866381168365, + "learning_rate": 1.4565389015159803e-07, + "loss": 1.7569, + "step": 31813 + }, + { + "epoch": 9.764886433394722, + "grad_norm": 0.1761285811662674, + "learning_rate": 1.452750137574066e-07, + "loss": 1.7455, + "step": 31814 + }, + { + "epoch": 9.765193370165745, + "grad_norm": 0.13357558846473694, + "learning_rate": 1.4489663005682885e-07, + "loss": 1.7087, + "step": 31815 + }, + { + "epoch": 9.76550030693677, + "grad_norm": 0.13213472068309784, + "learning_rate": 1.445187390536007e-07, + "loss": 1.6868, + "step": 31816 + }, + { + "epoch": 9.765807243707796, + "grad_norm": 0.1197780966758728, + "learning_rate": 1.4414134075146358e-07, + "loss": 1.6629, + "step": 31817 + }, + { + "epoch": 9.766114180478821, + "grad_norm": 0.15487425029277802, + "learning_rate": 1.437644351541423e-07, + "loss": 1.7581, + "step": 31818 + }, + { + "epoch": 9.766421117249847, + 
"grad_norm": 0.1623966544866562, + "learning_rate": 1.4338802226536165e-07, + "loss": 1.7389, + "step": 31819 + }, + { + "epoch": 9.766728054020872, + "grad_norm": 0.14654842019081116, + "learning_rate": 1.4301210208884085e-07, + "loss": 1.7342, + "step": 31820 + }, + { + "epoch": 9.767034990791897, + "grad_norm": 0.10672096908092499, + "learning_rate": 1.4263667462829923e-07, + "loss": 1.6767, + "step": 31821 + }, + { + "epoch": 9.767341927562923, + "grad_norm": 0.15439334511756897, + "learning_rate": 1.4226173988744485e-07, + "loss": 1.7443, + "step": 31822 + }, + { + "epoch": 9.767648864333948, + "grad_norm": 0.15827670693397522, + "learning_rate": 1.4188729786998034e-07, + "loss": 1.7237, + "step": 31823 + }, + { + "epoch": 9.767955801104973, + "grad_norm": 0.19204497337341309, + "learning_rate": 1.4151334857960828e-07, + "loss": 1.7565, + "step": 31824 + }, + { + "epoch": 9.768262737875997, + "grad_norm": 0.1090131625533104, + "learning_rate": 1.4113989202002575e-07, + "loss": 1.6489, + "step": 31825 + }, + { + "epoch": 9.768569674647022, + "grad_norm": 0.09001188725233078, + "learning_rate": 1.4076692819491865e-07, + "loss": 1.6905, + "step": 31826 + }, + { + "epoch": 9.768876611418047, + "grad_norm": 0.16483090817928314, + "learning_rate": 1.4039445710797849e-07, + "loss": 1.7372, + "step": 31827 + }, + { + "epoch": 9.769183548189073, + "grad_norm": 0.1322876214981079, + "learning_rate": 1.4002247876288565e-07, + "loss": 1.698, + "step": 31828 + }, + { + "epoch": 9.769490484960098, + "grad_norm": 0.13790275156497955, + "learning_rate": 1.3965099316331498e-07, + "loss": 1.7089, + "step": 31829 + }, + { + "epoch": 9.769797421731123, + "grad_norm": 0.1355939358472824, + "learning_rate": 1.392800003129302e-07, + "loss": 1.7114, + "step": 31830 + }, + { + "epoch": 9.770104358502149, + "grad_norm": 0.11927379667758942, + "learning_rate": 1.389095002154117e-07, + "loss": 1.6954, + "step": 31831 + }, + { + "epoch": 9.770411295273174, + "grad_norm": 
0.14327041804790497, + "learning_rate": 1.3853949287441215e-07, + "loss": 1.6788, + "step": 31832 + }, + { + "epoch": 9.7707182320442, + "grad_norm": 0.11641334742307663, + "learning_rate": 1.381699782935897e-07, + "loss": 1.6544, + "step": 31833 + }, + { + "epoch": 9.771025168815225, + "grad_norm": 0.1395263820886612, + "learning_rate": 1.3780095647659696e-07, + "loss": 1.7292, + "step": 31834 + }, + { + "epoch": 9.77133210558625, + "grad_norm": 0.09742346405982971, + "learning_rate": 1.3743242742708108e-07, + "loss": 1.6545, + "step": 31835 + }, + { + "epoch": 9.771639042357274, + "grad_norm": 0.1271921843290329, + "learning_rate": 1.3706439114868354e-07, + "loss": 1.6955, + "step": 31836 + }, + { + "epoch": 9.771945979128299, + "grad_norm": 0.1599912941455841, + "learning_rate": 1.366968476450403e-07, + "loss": 1.7292, + "step": 31837 + }, + { + "epoch": 9.772252915899324, + "grad_norm": 0.12538601458072662, + "learning_rate": 1.3632979691978186e-07, + "loss": 1.6533, + "step": 31838 + }, + { + "epoch": 9.77255985267035, + "grad_norm": 0.14297179877758026, + "learning_rate": 1.3596323897654418e-07, + "loss": 1.6559, + "step": 31839 + }, + { + "epoch": 9.772866789441375, + "grad_norm": 0.2182641178369522, + "learning_rate": 1.3559717381894098e-07, + "loss": 1.6973, + "step": 31840 + }, + { + "epoch": 9.7731737262124, + "grad_norm": 0.12279269844293594, + "learning_rate": 1.3523160145059165e-07, + "loss": 1.6714, + "step": 31841 + }, + { + "epoch": 9.773480662983426, + "grad_norm": 0.107692651450634, + "learning_rate": 1.3486652187510994e-07, + "loss": 1.6728, + "step": 31842 + }, + { + "epoch": 9.773787599754451, + "grad_norm": 0.09669972956180573, + "learning_rate": 1.345019350961041e-07, + "loss": 1.6375, + "step": 31843 + }, + { + "epoch": 9.774094536525476, + "grad_norm": 0.2592116594314575, + "learning_rate": 1.341378411171823e-07, + "loss": 1.8469, + "step": 31844 + }, + { + "epoch": 9.774401473296502, + "grad_norm": 0.1268083155155182, + "learning_rate": 
1.337742399419306e-07, + "loss": 1.6961, + "step": 31845 + }, + { + "epoch": 9.774708410067525, + "grad_norm": 0.11143521219491959, + "learning_rate": 1.3341113157395723e-07, + "loss": 1.6951, + "step": 31846 + }, + { + "epoch": 9.77501534683855, + "grad_norm": 0.15122225880622864, + "learning_rate": 1.330485160168371e-07, + "loss": 1.7612, + "step": 31847 + }, + { + "epoch": 9.775322283609576, + "grad_norm": 0.09748775511980057, + "learning_rate": 1.3268639327416177e-07, + "loss": 1.6303, + "step": 31848 + }, + { + "epoch": 9.775629220380601, + "grad_norm": 0.132316455245018, + "learning_rate": 1.3232476334950615e-07, + "loss": 1.6842, + "step": 31849 + }, + { + "epoch": 9.775936157151627, + "grad_norm": 0.13874708116054535, + "learning_rate": 1.319636262464452e-07, + "loss": 1.7458, + "step": 31850 + }, + { + "epoch": 9.776243093922652, + "grad_norm": 0.13404351472854614, + "learning_rate": 1.3160298196854827e-07, + "loss": 1.7135, + "step": 31851 + }, + { + "epoch": 9.776550030693677, + "grad_norm": 0.13872766494750977, + "learning_rate": 1.312428305193847e-07, + "loss": 1.7276, + "step": 31852 + }, + { + "epoch": 9.776856967464703, + "grad_norm": 0.11643758416175842, + "learning_rate": 1.3088317190250165e-07, + "loss": 1.6704, + "step": 31853 + }, + { + "epoch": 9.777163904235728, + "grad_norm": 0.10052239894866943, + "learning_rate": 1.30524006121463e-07, + "loss": 1.6525, + "step": 31854 + }, + { + "epoch": 9.777470841006753, + "grad_norm": 0.1288158893585205, + "learning_rate": 1.3016533317981582e-07, + "loss": 1.7037, + "step": 31855 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 0.17798054218292236, + "learning_rate": 1.2980715308110737e-07, + "loss": 1.7172, + "step": 31856 + }, + { + "epoch": 9.778084714548802, + "grad_norm": 0.19317014515399933, + "learning_rate": 1.294494658288681e-07, + "loss": 1.8052, + "step": 31857 + }, + { + "epoch": 9.778391651319827, + "grad_norm": 0.16400828957557678, + "learning_rate": 1.2909227142664515e-07, + "loss": 
1.7391, + "step": 31858 + }, + { + "epoch": 9.778698588090853, + "grad_norm": 0.17417314648628235, + "learning_rate": 1.2873556987795798e-07, + "loss": 1.7649, + "step": 31859 + }, + { + "epoch": 9.779005524861878, + "grad_norm": 0.1729496717453003, + "learning_rate": 1.2837936118633708e-07, + "loss": 1.7499, + "step": 31860 + }, + { + "epoch": 9.779312461632903, + "grad_norm": 0.14423789083957672, + "learning_rate": 1.2802364535530742e-07, + "loss": 1.6673, + "step": 31861 + }, + { + "epoch": 9.779619398403929, + "grad_norm": 0.15968292951583862, + "learning_rate": 1.2766842238837172e-07, + "loss": 1.662, + "step": 31862 + }, + { + "epoch": 9.779926335174954, + "grad_norm": 0.21190059185028076, + "learning_rate": 1.27313692289055e-07, + "loss": 1.7089, + "step": 31863 + }, + { + "epoch": 9.78023327194598, + "grad_norm": 0.16070419549942017, + "learning_rate": 1.2695945506084884e-07, + "loss": 1.7201, + "step": 31864 + }, + { + "epoch": 9.780540208717005, + "grad_norm": 0.15129558742046356, + "learning_rate": 1.2660571070726157e-07, + "loss": 1.6998, + "step": 31865 + }, + { + "epoch": 9.78084714548803, + "grad_norm": 0.11870043724775314, + "learning_rate": 1.2625245923179042e-07, + "loss": 1.6716, + "step": 31866 + }, + { + "epoch": 9.781154082259054, + "grad_norm": 0.1265040785074234, + "learning_rate": 1.25899700637927e-07, + "loss": 1.7115, + "step": 31867 + }, + { + "epoch": 9.781461019030079, + "grad_norm": 0.15591993927955627, + "learning_rate": 1.2554743492915188e-07, + "loss": 1.741, + "step": 31868 + }, + { + "epoch": 9.781767955801104, + "grad_norm": 0.1468917280435562, + "learning_rate": 1.2519566210895117e-07, + "loss": 1.7176, + "step": 31869 + }, + { + "epoch": 9.78207489257213, + "grad_norm": 0.1019337847828865, + "learning_rate": 1.248443821807943e-07, + "loss": 1.6766, + "step": 31870 + }, + { + "epoch": 9.782381829343155, + "grad_norm": 0.1504385769367218, + "learning_rate": 1.2449359514816183e-07, + "loss": 1.6557, + "step": 31871 + }, + { + 
"epoch": 9.78268876611418, + "grad_norm": 0.11650592088699341, + "learning_rate": 1.2414330101451765e-07, + "loss": 1.6646, + "step": 31872 + }, + { + "epoch": 9.782995702885206, + "grad_norm": 0.13004426658153534, + "learning_rate": 1.2379349978332012e-07, + "loss": 1.6843, + "step": 31873 + }, + { + "epoch": 9.783302639656231, + "grad_norm": 0.1746869832277298, + "learning_rate": 1.234441914580331e-07, + "loss": 1.7148, + "step": 31874 + }, + { + "epoch": 9.783609576427256, + "grad_norm": 0.18265002965927124, + "learning_rate": 1.230953760420983e-07, + "loss": 1.7844, + "step": 31875 + }, + { + "epoch": 9.783916513198282, + "grad_norm": 0.14182110130786896, + "learning_rate": 1.227470535389741e-07, + "loss": 1.7137, + "step": 31876 + }, + { + "epoch": 9.784223449969307, + "grad_norm": 0.12887395918369293, + "learning_rate": 1.2239922395209102e-07, + "loss": 1.7023, + "step": 31877 + }, + { + "epoch": 9.784530386740332, + "grad_norm": 0.15748070180416107, + "learning_rate": 1.2205188728489636e-07, + "loss": 1.7341, + "step": 31878 + }, + { + "epoch": 9.784837323511356, + "grad_norm": 0.13010992109775543, + "learning_rate": 1.2170504354082068e-07, + "loss": 1.6693, + "step": 31879 + }, + { + "epoch": 9.785144260282381, + "grad_norm": 0.15437988936901093, + "learning_rate": 1.2135869272328905e-07, + "loss": 1.6842, + "step": 31880 + }, + { + "epoch": 9.785451197053407, + "grad_norm": 0.12763908505439758, + "learning_rate": 1.2101283483572644e-07, + "loss": 1.6812, + "step": 31881 + }, + { + "epoch": 9.785758133824432, + "grad_norm": 0.1640697419643402, + "learning_rate": 1.206674698815524e-07, + "loss": 1.7035, + "step": 31882 + }, + { + "epoch": 9.786065070595457, + "grad_norm": 0.17316879332065582, + "learning_rate": 1.203225978641753e-07, + "loss": 1.7718, + "step": 31883 + }, + { + "epoch": 9.786372007366483, + "grad_norm": 0.13569143414497375, + "learning_rate": 1.1997821878700355e-07, + "loss": 1.6925, + "step": 31884 + }, + { + "epoch": 9.786678944137508, + 
"grad_norm": 0.12150706350803375, + "learning_rate": 1.1963433265344548e-07, + "loss": 1.6742, + "step": 31885 + }, + { + "epoch": 9.786985880908533, + "grad_norm": 0.120942622423172, + "learning_rate": 1.1929093946689284e-07, + "loss": 1.6946, + "step": 31886 + }, + { + "epoch": 9.787292817679559, + "grad_norm": 0.15821385383605957, + "learning_rate": 1.1894803923074849e-07, + "loss": 1.7163, + "step": 31887 + }, + { + "epoch": 9.787599754450584, + "grad_norm": 0.14717862010002136, + "learning_rate": 1.1860563194839302e-07, + "loss": 1.6732, + "step": 31888 + }, + { + "epoch": 9.787906691221608, + "grad_norm": 0.17104555666446686, + "learning_rate": 1.1826371762321264e-07, + "loss": 1.7943, + "step": 31889 + }, + { + "epoch": 9.788213627992633, + "grad_norm": 0.10379209369421005, + "learning_rate": 1.1792229625858797e-07, + "loss": 1.6595, + "step": 31890 + }, + { + "epoch": 9.788520564763658, + "grad_norm": 0.1118491068482399, + "learning_rate": 1.1758136785788854e-07, + "loss": 1.6865, + "step": 31891 + }, + { + "epoch": 9.788827501534684, + "grad_norm": 0.14659619331359863, + "learning_rate": 1.1724093242448941e-07, + "loss": 1.6714, + "step": 31892 + }, + { + "epoch": 9.789134438305709, + "grad_norm": 0.17299702763557434, + "learning_rate": 1.1690098996175458e-07, + "loss": 1.6933, + "step": 31893 + }, + { + "epoch": 9.789441375076734, + "grad_norm": 0.1982281357049942, + "learning_rate": 1.1656154047303691e-07, + "loss": 1.7074, + "step": 31894 + }, + { + "epoch": 9.78974831184776, + "grad_norm": 0.17668111622333527, + "learning_rate": 1.1622258396170594e-07, + "loss": 1.7077, + "step": 31895 + }, + { + "epoch": 9.790055248618785, + "grad_norm": 0.1569826602935791, + "learning_rate": 1.1588412043109232e-07, + "loss": 1.6985, + "step": 31896 + }, + { + "epoch": 9.79036218538981, + "grad_norm": 0.12177947908639908, + "learning_rate": 1.1554614988454893e-07, + "loss": 1.6546, + "step": 31897 + }, + { + "epoch": 9.790669122160836, + "grad_norm": 
0.1377127766609192, + "learning_rate": 1.1520867232541755e-07, + "loss": 1.6724, + "step": 31898 + }, + { + "epoch": 9.79097605893186, + "grad_norm": 0.13367579877376556, + "learning_rate": 1.1487168775703439e-07, + "loss": 1.715, + "step": 31899 + }, + { + "epoch": 9.791282995702884, + "grad_norm": 0.14254575967788696, + "learning_rate": 1.1453519618273012e-07, + "loss": 1.6968, + "step": 31900 + }, + { + "epoch": 9.79158993247391, + "grad_norm": 0.15228238701820374, + "learning_rate": 1.1419919760582432e-07, + "loss": 1.7052, + "step": 31901 + }, + { + "epoch": 9.791896869244935, + "grad_norm": 0.14899186789989471, + "learning_rate": 1.1386369202964209e-07, + "loss": 1.7289, + "step": 31902 + }, + { + "epoch": 9.79220380601596, + "grad_norm": 0.10609392821788788, + "learning_rate": 1.13528679457503e-07, + "loss": 1.6753, + "step": 31903 + }, + { + "epoch": 9.792510742786986, + "grad_norm": 0.1678643375635147, + "learning_rate": 1.1319415989270443e-07, + "loss": 1.7145, + "step": 31904 + }, + { + "epoch": 9.792817679558011, + "grad_norm": 0.1617528349161148, + "learning_rate": 1.1286013333856594e-07, + "loss": 1.7059, + "step": 31905 + }, + { + "epoch": 9.793124616329036, + "grad_norm": 0.13943657279014587, + "learning_rate": 1.1252659979837932e-07, + "loss": 1.6964, + "step": 31906 + }, + { + "epoch": 9.793431553100062, + "grad_norm": 0.18889422714710236, + "learning_rate": 1.121935592754475e-07, + "loss": 1.7841, + "step": 31907 + }, + { + "epoch": 9.793738489871087, + "grad_norm": 0.1229872852563858, + "learning_rate": 1.118610117730623e-07, + "loss": 1.7406, + "step": 31908 + }, + { + "epoch": 9.794045426642112, + "grad_norm": 0.1400493085384369, + "learning_rate": 1.115289572945044e-07, + "loss": 1.7183, + "step": 31909 + }, + { + "epoch": 9.794352363413136, + "grad_norm": 0.24427293241024017, + "learning_rate": 1.1119739584305456e-07, + "loss": 1.6977, + "step": 31910 + }, + { + "epoch": 9.794659300184161, + "grad_norm": 0.19268591701984406, + 
"learning_rate": 1.1086632742199343e-07, + "loss": 1.7625, + "step": 31911 + }, + { + "epoch": 9.794966236955187, + "grad_norm": 0.10926581919193268, + "learning_rate": 1.105357520345962e-07, + "loss": 1.68, + "step": 31912 + }, + { + "epoch": 9.795273173726212, + "grad_norm": 0.16322609782218933, + "learning_rate": 1.1020566968412138e-07, + "loss": 1.7282, + "step": 31913 + }, + { + "epoch": 9.795580110497237, + "grad_norm": 0.1540069282054901, + "learning_rate": 1.0987608037383857e-07, + "loss": 1.7384, + "step": 31914 + }, + { + "epoch": 9.795887047268263, + "grad_norm": 0.20092691481113434, + "learning_rate": 1.095469841070007e-07, + "loss": 1.7189, + "step": 31915 + }, + { + "epoch": 9.796193984039288, + "grad_norm": 0.1929512470960617, + "learning_rate": 1.0921838088686076e-07, + "loss": 1.7626, + "step": 31916 + }, + { + "epoch": 9.796500920810313, + "grad_norm": 0.17819680273532867, + "learning_rate": 1.0889027071667168e-07, + "loss": 1.6963, + "step": 31917 + }, + { + "epoch": 9.796807857581339, + "grad_norm": 0.10324428975582123, + "learning_rate": 1.0856265359966422e-07, + "loss": 1.6863, + "step": 31918 + }, + { + "epoch": 9.797114794352364, + "grad_norm": 0.17684327065944672, + "learning_rate": 1.0823552953908578e-07, + "loss": 1.7065, + "step": 31919 + }, + { + "epoch": 9.79742173112339, + "grad_norm": 0.11119870841503143, + "learning_rate": 1.079088985381671e-07, + "loss": 1.6706, + "step": 31920 + }, + { + "epoch": 9.797728667894415, + "grad_norm": 0.16475334763526917, + "learning_rate": 1.0758276060013339e-07, + "loss": 1.7312, + "step": 31921 + }, + { + "epoch": 9.798035604665438, + "grad_norm": 0.08758127689361572, + "learning_rate": 1.0725711572820984e-07, + "loss": 1.6422, + "step": 31922 + }, + { + "epoch": 9.798342541436464, + "grad_norm": 0.17832210659980774, + "learning_rate": 1.069319639256161e-07, + "loss": 1.7204, + "step": 31923 + }, + { + "epoch": 9.798649478207489, + "grad_norm": 0.13107214868068695, + "learning_rate": 
1.0660730519556628e-07, + "loss": 1.6636, + "step": 31924 + }, + { + "epoch": 9.798956414978514, + "grad_norm": 0.10268855839967728, + "learning_rate": 1.0628313954126335e-07, + "loss": 1.6844, + "step": 31925 + }, + { + "epoch": 9.79926335174954, + "grad_norm": 0.16402679681777954, + "learning_rate": 1.0595946696591586e-07, + "loss": 1.7028, + "step": 31926 + }, + { + "epoch": 9.799570288520565, + "grad_norm": 0.1430855542421341, + "learning_rate": 1.056362874727157e-07, + "loss": 1.6792, + "step": 31927 + }, + { + "epoch": 9.79987722529159, + "grad_norm": 0.16672997176647186, + "learning_rate": 1.0531360106486587e-07, + "loss": 1.7266, + "step": 31928 + }, + { + "epoch": 9.800184162062616, + "grad_norm": 0.18226337432861328, + "learning_rate": 1.0499140774555272e-07, + "loss": 1.7236, + "step": 31929 + }, + { + "epoch": 9.800491098833641, + "grad_norm": 0.1977282166481018, + "learning_rate": 1.0466970751795701e-07, + "loss": 1.7739, + "step": 31930 + }, + { + "epoch": 9.800798035604666, + "grad_norm": 0.17272333800792694, + "learning_rate": 1.0434850038525956e-07, + "loss": 1.7177, + "step": 31931 + }, + { + "epoch": 9.80110497237569, + "grad_norm": 0.1538221389055252, + "learning_rate": 1.0402778635063004e-07, + "loss": 1.7038, + "step": 31932 + }, + { + "epoch": 9.801411909146715, + "grad_norm": 0.1327001303434372, + "learning_rate": 1.0370756541724924e-07, + "loss": 1.6885, + "step": 31933 + }, + { + "epoch": 9.80171884591774, + "grad_norm": 0.15500570833683014, + "learning_rate": 1.0338783758827575e-07, + "loss": 1.7787, + "step": 31934 + }, + { + "epoch": 9.802025782688766, + "grad_norm": 0.14874790608882904, + "learning_rate": 1.0306860286686815e-07, + "loss": 1.731, + "step": 31935 + }, + { + "epoch": 9.802332719459791, + "grad_norm": 0.13585814833641052, + "learning_rate": 1.0274986125617947e-07, + "loss": 1.6973, + "step": 31936 + }, + { + "epoch": 9.802639656230816, + "grad_norm": 0.15876494348049164, + "learning_rate": 1.0243161275936274e-07, + "loss": 
1.7255, + "step": 31937 + }, + { + "epoch": 9.802946593001842, + "grad_norm": 0.18510127067565918, + "learning_rate": 1.0211385737956546e-07, + "loss": 1.7395, + "step": 31938 + }, + { + "epoch": 9.803253529772867, + "grad_norm": 0.13107381761074066, + "learning_rate": 1.01796595119924e-07, + "loss": 1.6876, + "step": 31939 + }, + { + "epoch": 9.803560466543892, + "grad_norm": 0.10170239210128784, + "learning_rate": 1.0147982598357474e-07, + "loss": 1.7082, + "step": 31940 + }, + { + "epoch": 9.803867403314918, + "grad_norm": 0.15952639281749725, + "learning_rate": 1.0116354997364851e-07, + "loss": 1.718, + "step": 31941 + }, + { + "epoch": 9.804174340085943, + "grad_norm": 0.11146245896816254, + "learning_rate": 1.0084776709327059e-07, + "loss": 1.6555, + "step": 31942 + }, + { + "epoch": 9.804481276856967, + "grad_norm": 0.13348564505577087, + "learning_rate": 1.0053247734556071e-07, + "loss": 1.7032, + "step": 31943 + }, + { + "epoch": 9.804788213627992, + "grad_norm": 0.10820803791284561, + "learning_rate": 1.0021768073363858e-07, + "loss": 1.7091, + "step": 31944 + }, + { + "epoch": 9.805095150399017, + "grad_norm": 0.11644341796636581, + "learning_rate": 9.990337726061283e-08, + "loss": 1.678, + "step": 31945 + }, + { + "epoch": 9.805402087170043, + "grad_norm": 0.1656201332807541, + "learning_rate": 9.958956692958655e-08, + "loss": 1.7609, + "step": 31946 + }, + { + "epoch": 9.805709023941068, + "grad_norm": 0.12365484982728958, + "learning_rate": 9.927624974366279e-08, + "loss": 1.6953, + "step": 31947 + }, + { + "epoch": 9.806015960712093, + "grad_norm": 0.14887237548828125, + "learning_rate": 9.896342570593909e-08, + "loss": 1.6941, + "step": 31948 + }, + { + "epoch": 9.806322897483119, + "grad_norm": 0.14070530235767365, + "learning_rate": 9.86510948195074e-08, + "loss": 1.7096, + "step": 31949 + }, + { + "epoch": 9.806629834254144, + "grad_norm": 0.14970767498016357, + "learning_rate": 9.833925708745418e-08, + "loss": 1.7519, + "step": 31950 + }, + { + 
"epoch": 9.80693677102517, + "grad_norm": 0.1032944917678833, + "learning_rate": 9.802791251286026e-08, + "loss": 1.6524, + "step": 31951 + }, + { + "epoch": 9.807243707796195, + "grad_norm": 0.13888783752918243, + "learning_rate": 9.771706109880652e-08, + "loss": 1.706, + "step": 31952 + }, + { + "epoch": 9.807550644567218, + "grad_norm": 0.16892662644386292, + "learning_rate": 9.740670284835718e-08, + "loss": 1.7865, + "step": 31953 + }, + { + "epoch": 9.807857581338244, + "grad_norm": 0.15382327139377594, + "learning_rate": 9.709683776458755e-08, + "loss": 1.7028, + "step": 31954 + }, + { + "epoch": 9.808164518109269, + "grad_norm": 0.1603674590587616, + "learning_rate": 9.678746585055077e-08, + "loss": 1.7402, + "step": 31955 + }, + { + "epoch": 9.808471454880294, + "grad_norm": 0.13476061820983887, + "learning_rate": 9.647858710931102e-08, + "loss": 1.7446, + "step": 31956 + }, + { + "epoch": 9.80877839165132, + "grad_norm": 0.14579132199287415, + "learning_rate": 9.617020154392142e-08, + "loss": 1.732, + "step": 31957 + }, + { + "epoch": 9.809085328422345, + "grad_norm": 0.13551945984363556, + "learning_rate": 9.586230915742955e-08, + "loss": 1.7255, + "step": 31958 + }, + { + "epoch": 9.80939226519337, + "grad_norm": 0.120823934674263, + "learning_rate": 9.555490995287186e-08, + "loss": 1.6622, + "step": 31959 + }, + { + "epoch": 9.809699201964396, + "grad_norm": 0.15336096286773682, + "learning_rate": 9.524800393329037e-08, + "loss": 1.7586, + "step": 31960 + }, + { + "epoch": 9.810006138735421, + "grad_norm": 0.14215800166130066, + "learning_rate": 9.494159110172151e-08, + "loss": 1.7418, + "step": 31961 + }, + { + "epoch": 9.810313075506446, + "grad_norm": 0.12923559546470642, + "learning_rate": 9.463567146118513e-08, + "loss": 1.697, + "step": 31962 + }, + { + "epoch": 9.810620012277472, + "grad_norm": 0.11306928843259811, + "learning_rate": 9.43302450147121e-08, + "loss": 1.7063, + "step": 31963 + }, + { + "epoch": 9.810926949048497, + "grad_norm": 
0.1500793844461441, + "learning_rate": 9.40253117653167e-08, + "loss": 1.7492, + "step": 31964 + }, + { + "epoch": 9.81123388581952, + "grad_norm": 0.13170574605464935, + "learning_rate": 9.372087171601873e-08, + "loss": 1.73, + "step": 31965 + }, + { + "epoch": 9.811540822590546, + "grad_norm": 0.15149074792861938, + "learning_rate": 9.341692486981579e-08, + "loss": 1.716, + "step": 31966 + }, + { + "epoch": 9.811847759361571, + "grad_norm": 0.10818208009004593, + "learning_rate": 9.311347122972769e-08, + "loss": 1.6936, + "step": 31967 + }, + { + "epoch": 9.812154696132596, + "grad_norm": 0.1262877881526947, + "learning_rate": 9.281051079873537e-08, + "loss": 1.7042, + "step": 31968 + }, + { + "epoch": 9.812461632903622, + "grad_norm": 0.18285219371318817, + "learning_rate": 9.2508043579842e-08, + "loss": 1.6812, + "step": 31969 + }, + { + "epoch": 9.812768569674647, + "grad_norm": 0.30483585596084595, + "learning_rate": 9.220606957603406e-08, + "loss": 1.8422, + "step": 31970 + }, + { + "epoch": 9.813075506445673, + "grad_norm": 0.14469990134239197, + "learning_rate": 9.190458879030362e-08, + "loss": 1.7624, + "step": 31971 + }, + { + "epoch": 9.813382443216698, + "grad_norm": 0.20810872316360474, + "learning_rate": 9.16036012256205e-08, + "loss": 1.7293, + "step": 31972 + }, + { + "epoch": 9.813689379987723, + "grad_norm": 0.16292090713977814, + "learning_rate": 9.130310688496013e-08, + "loss": 1.681, + "step": 31973 + }, + { + "epoch": 9.813996316758749, + "grad_norm": 0.17718647420406342, + "learning_rate": 9.100310577130345e-08, + "loss": 1.728, + "step": 31974 + }, + { + "epoch": 9.814303253529772, + "grad_norm": 0.13324975967407227, + "learning_rate": 9.070359788759808e-08, + "loss": 1.6776, + "step": 31975 + }, + { + "epoch": 9.814610190300797, + "grad_norm": 0.15602141618728638, + "learning_rate": 9.040458323681389e-08, + "loss": 1.6902, + "step": 31976 + }, + { + "epoch": 9.814917127071823, + "grad_norm": 0.12438742071390152, + "learning_rate": 
9.010606182190962e-08, + "loss": 1.7132, + "step": 31977 + }, + { + "epoch": 9.815224063842848, + "grad_norm": 0.15935616195201874, + "learning_rate": 8.980803364582734e-08, + "loss": 1.7432, + "step": 31978 + }, + { + "epoch": 9.815531000613873, + "grad_norm": 0.18075346946716309, + "learning_rate": 8.951049871151474e-08, + "loss": 1.7268, + "step": 31979 + }, + { + "epoch": 9.815837937384899, + "grad_norm": 0.11405523866415024, + "learning_rate": 8.921345702191386e-08, + "loss": 1.6414, + "step": 31980 + }, + { + "epoch": 9.816144874155924, + "grad_norm": 0.12962454557418823, + "learning_rate": 8.891690857995572e-08, + "loss": 1.6943, + "step": 31981 + }, + { + "epoch": 9.81645181092695, + "grad_norm": 0.14757606387138367, + "learning_rate": 8.862085338857685e-08, + "loss": 1.7313, + "step": 31982 + }, + { + "epoch": 9.816758747697975, + "grad_norm": 0.16997574269771576, + "learning_rate": 8.832529145070267e-08, + "loss": 1.7795, + "step": 31983 + }, + { + "epoch": 9.817065684469, + "grad_norm": 0.13103361427783966, + "learning_rate": 8.80302227692531e-08, + "loss": 1.7153, + "step": 31984 + }, + { + "epoch": 9.817372621240025, + "grad_norm": 0.13774408400058746, + "learning_rate": 8.773564734713691e-08, + "loss": 1.7125, + "step": 31985 + }, + { + "epoch": 9.817679558011049, + "grad_norm": 0.10313444584608078, + "learning_rate": 8.744156518727398e-08, + "loss": 1.7061, + "step": 31986 + }, + { + "epoch": 9.817986494782074, + "grad_norm": 0.14256370067596436, + "learning_rate": 8.71479762925731e-08, + "loss": 1.7078, + "step": 31987 + }, + { + "epoch": 9.8182934315531, + "grad_norm": 0.13552837073802948, + "learning_rate": 8.685488066592639e-08, + "loss": 1.7447, + "step": 31988 + }, + { + "epoch": 9.818600368324125, + "grad_norm": 0.1388518065214157, + "learning_rate": 8.656227831023711e-08, + "loss": 1.7332, + "step": 31989 + }, + { + "epoch": 9.81890730509515, + "grad_norm": 0.09268537908792496, + "learning_rate": 8.627016922839182e-08, + "loss": 1.6371, + 
"step": 31990 + }, + { + "epoch": 9.819214241866176, + "grad_norm": 0.10252194851636887, + "learning_rate": 8.597855342328265e-08, + "loss": 1.6794, + "step": 31991 + }, + { + "epoch": 9.819521178637201, + "grad_norm": 0.08967567980289459, + "learning_rate": 8.568743089778509e-08, + "loss": 1.6455, + "step": 31992 + }, + { + "epoch": 9.819828115408226, + "grad_norm": 0.15265701711177826, + "learning_rate": 8.539680165478569e-08, + "loss": 1.7244, + "step": 31993 + }, + { + "epoch": 9.820135052179252, + "grad_norm": 0.16557417809963226, + "learning_rate": 8.510666569714332e-08, + "loss": 1.7676, + "step": 31994 + }, + { + "epoch": 9.820441988950277, + "grad_norm": 0.09994948655366898, + "learning_rate": 8.481702302773897e-08, + "loss": 1.6743, + "step": 31995 + }, + { + "epoch": 9.8207489257213, + "grad_norm": 0.13728035986423492, + "learning_rate": 8.452787364943149e-08, + "loss": 1.6864, + "step": 31996 + }, + { + "epoch": 9.821055862492326, + "grad_norm": 0.21103262901306152, + "learning_rate": 8.423921756506858e-08, + "loss": 1.7673, + "step": 31997 + }, + { + "epoch": 9.821362799263351, + "grad_norm": 0.146772101521492, + "learning_rate": 8.395105477751464e-08, + "loss": 1.7245, + "step": 31998 + }, + { + "epoch": 9.821669736034377, + "grad_norm": 0.1592164784669876, + "learning_rate": 8.366338528961182e-08, + "loss": 1.7612, + "step": 31999 + }, + { + "epoch": 9.821976672805402, + "grad_norm": 0.15586064755916595, + "learning_rate": 8.337620910420229e-08, + "loss": 1.7142, + "step": 32000 + }, + { + "epoch": 9.822283609576427, + "grad_norm": 0.14506274461746216, + "learning_rate": 8.30895262241338e-08, + "loss": 1.7085, + "step": 32001 + }, + { + "epoch": 9.822590546347453, + "grad_norm": 0.11904678493738174, + "learning_rate": 8.280333665222073e-08, + "loss": 1.7024, + "step": 32002 + }, + { + "epoch": 9.822897483118478, + "grad_norm": 0.14538206160068512, + "learning_rate": 8.251764039131083e-08, + "loss": 1.7207, + "step": 32003 + }, + { + "epoch": 
9.823204419889503, + "grad_norm": 0.17649157345294952, + "learning_rate": 8.223243744421849e-08, + "loss": 1.684, + "step": 32004 + }, + { + "epoch": 9.823511356660529, + "grad_norm": 0.13790307939052582, + "learning_rate": 8.194772781375815e-08, + "loss": 1.7083, + "step": 32005 + }, + { + "epoch": 9.823818293431554, + "grad_norm": 0.12401477247476578, + "learning_rate": 8.166351150274976e-08, + "loss": 1.6712, + "step": 32006 + }, + { + "epoch": 9.824125230202577, + "grad_norm": 0.13443689048290253, + "learning_rate": 8.137978851400219e-08, + "loss": 1.7134, + "step": 32007 + }, + { + "epoch": 9.824432166973603, + "grad_norm": 0.11961400508880615, + "learning_rate": 8.109655885031875e-08, + "loss": 1.6478, + "step": 32008 + }, + { + "epoch": 9.824739103744628, + "grad_norm": 0.14795053005218506, + "learning_rate": 8.081382251449721e-08, + "loss": 1.7182, + "step": 32009 + }, + { + "epoch": 9.825046040515653, + "grad_norm": 0.10425613820552826, + "learning_rate": 8.053157950932977e-08, + "loss": 1.6385, + "step": 32010 + }, + { + "epoch": 9.825352977286679, + "grad_norm": 0.11885244399309158, + "learning_rate": 8.024982983760864e-08, + "loss": 1.6764, + "step": 32011 + }, + { + "epoch": 9.825659914057704, + "grad_norm": 0.11422543227672577, + "learning_rate": 7.99685735021205e-08, + "loss": 1.6778, + "step": 32012 + }, + { + "epoch": 9.82596685082873, + "grad_norm": 0.12039226293563843, + "learning_rate": 7.968781050564089e-08, + "loss": 1.6843, + "step": 32013 + }, + { + "epoch": 9.826273787599755, + "grad_norm": 0.13094797730445862, + "learning_rate": 7.940754085094537e-08, + "loss": 1.6757, + "step": 32014 + }, + { + "epoch": 9.82658072437078, + "grad_norm": 0.14221440255641937, + "learning_rate": 7.91277645407984e-08, + "loss": 1.7307, + "step": 32015 + }, + { + "epoch": 9.826887661141805, + "grad_norm": 0.11989296972751617, + "learning_rate": 7.884848157798109e-08, + "loss": 1.6651, + "step": 32016 + }, + { + "epoch": 9.82719459791283, + "grad_norm": 
0.1768631786108017, + "learning_rate": 7.856969196523567e-08, + "loss": 1.7294, + "step": 32017 + }, + { + "epoch": 9.827501534683854, + "grad_norm": 0.1401507407426834, + "learning_rate": 7.829139570532662e-08, + "loss": 1.6958, + "step": 32018 + }, + { + "epoch": 9.82780847145488, + "grad_norm": 0.1531054675579071, + "learning_rate": 7.801359280099618e-08, + "loss": 1.7176, + "step": 32019 + }, + { + "epoch": 9.828115408225905, + "grad_norm": 0.17227032780647278, + "learning_rate": 7.773628325500326e-08, + "loss": 1.6941, + "step": 32020 + }, + { + "epoch": 9.82842234499693, + "grad_norm": 0.15229587256908417, + "learning_rate": 7.745946707007345e-08, + "loss": 1.6899, + "step": 32021 + }, + { + "epoch": 9.828729281767956, + "grad_norm": 0.1732887476682663, + "learning_rate": 7.718314424895457e-08, + "loss": 1.7557, + "step": 32022 + }, + { + "epoch": 9.829036218538981, + "grad_norm": 0.11568398028612137, + "learning_rate": 7.690731479437218e-08, + "loss": 1.7077, + "step": 32023 + }, + { + "epoch": 9.829343155310006, + "grad_norm": 0.12425289303064346, + "learning_rate": 7.663197870905747e-08, + "loss": 1.6748, + "step": 32024 + }, + { + "epoch": 9.829650092081032, + "grad_norm": 0.13480359315872192, + "learning_rate": 7.635713599571936e-08, + "loss": 1.6874, + "step": 32025 + }, + { + "epoch": 9.829957028852057, + "grad_norm": 0.1616349071264267, + "learning_rate": 7.608278665708346e-08, + "loss": 1.7273, + "step": 32026 + }, + { + "epoch": 9.830263965623082, + "grad_norm": 0.15407976508140564, + "learning_rate": 7.58089306958587e-08, + "loss": 1.6929, + "step": 32027 + }, + { + "epoch": 9.830570902394108, + "grad_norm": 0.14456650614738464, + "learning_rate": 7.553556811475404e-08, + "loss": 1.6634, + "step": 32028 + }, + { + "epoch": 9.830877839165131, + "grad_norm": 0.11235690861940384, + "learning_rate": 7.526269891646176e-08, + "loss": 1.6862, + "step": 32029 + }, + { + "epoch": 9.831184775936157, + "grad_norm": 0.11624839901924133, + "learning_rate": 
7.49903231036908e-08, + "loss": 1.6836, + "step": 32030 + }, + { + "epoch": 9.831491712707182, + "grad_norm": 0.1717003732919693, + "learning_rate": 7.471844067912792e-08, + "loss": 1.7182, + "step": 32031 + }, + { + "epoch": 9.831798649478207, + "grad_norm": 0.1300148069858551, + "learning_rate": 7.444705164545429e-08, + "loss": 1.7267, + "step": 32032 + }, + { + "epoch": 9.832105586249233, + "grad_norm": 0.18420568108558655, + "learning_rate": 7.417615600536221e-08, + "loss": 1.7433, + "step": 32033 + }, + { + "epoch": 9.832412523020258, + "grad_norm": 0.16578641533851624, + "learning_rate": 7.390575376152176e-08, + "loss": 1.7013, + "step": 32034 + }, + { + "epoch": 9.832719459791283, + "grad_norm": 0.19031740725040436, + "learning_rate": 7.363584491660858e-08, + "loss": 1.783, + "step": 32035 + }, + { + "epoch": 9.833026396562309, + "grad_norm": 0.14676955342292786, + "learning_rate": 7.336642947328721e-08, + "loss": 1.6847, + "step": 32036 + }, + { + "epoch": 9.833333333333334, + "grad_norm": 0.10915904492139816, + "learning_rate": 7.309750743422217e-08, + "loss": 1.6494, + "step": 32037 + }, + { + "epoch": 9.83364027010436, + "grad_norm": 0.20945672690868378, + "learning_rate": 7.282907880207245e-08, + "loss": 1.6974, + "step": 32038 + }, + { + "epoch": 9.833947206875383, + "grad_norm": 0.12456732988357544, + "learning_rate": 7.25611435794915e-08, + "loss": 1.6804, + "step": 32039 + }, + { + "epoch": 9.834254143646408, + "grad_norm": 0.1883053332567215, + "learning_rate": 7.229370176911609e-08, + "loss": 1.7588, + "step": 32040 + }, + { + "epoch": 9.834561080417433, + "grad_norm": 0.15548336505889893, + "learning_rate": 7.202675337360521e-08, + "loss": 1.7235, + "step": 32041 + }, + { + "epoch": 9.834868017188459, + "grad_norm": 0.12813648581504822, + "learning_rate": 7.176029839558451e-08, + "loss": 1.6852, + "step": 32042 + }, + { + "epoch": 9.835174953959484, + "grad_norm": 0.1417354941368103, + "learning_rate": 7.149433683769635e-08, + "loss": 1.6887, + 
"step": 32043 + }, + { + "epoch": 9.83548189073051, + "grad_norm": 0.16405703127384186, + "learning_rate": 7.12288687025664e-08, + "loss": 1.793, + "step": 32044 + }, + { + "epoch": 9.835788827501535, + "grad_norm": 0.10414276272058487, + "learning_rate": 7.096389399281478e-08, + "loss": 1.6463, + "step": 32045 + }, + { + "epoch": 9.83609576427256, + "grad_norm": 0.1333547830581665, + "learning_rate": 7.069941271106162e-08, + "loss": 1.6769, + "step": 32046 + }, + { + "epoch": 9.836402701043585, + "grad_norm": 0.13679614663124084, + "learning_rate": 7.043542485992149e-08, + "loss": 1.6981, + "step": 32047 + }, + { + "epoch": 9.83670963781461, + "grad_norm": 0.19633722305297852, + "learning_rate": 7.017193044200343e-08, + "loss": 1.7121, + "step": 32048 + }, + { + "epoch": 9.837016574585636, + "grad_norm": 0.1266251504421234, + "learning_rate": 6.99089294599109e-08, + "loss": 1.6858, + "step": 32049 + }, + { + "epoch": 9.83732351135666, + "grad_norm": 0.12430547177791595, + "learning_rate": 6.964642191624182e-08, + "loss": 1.7402, + "step": 32050 + }, + { + "epoch": 9.837630448127685, + "grad_norm": 0.11596968024969101, + "learning_rate": 6.938440781359413e-08, + "loss": 1.6893, + "step": 32051 + }, + { + "epoch": 9.83793738489871, + "grad_norm": 0.1783151626586914, + "learning_rate": 6.912288715455461e-08, + "loss": 1.7444, + "step": 32052 + }, + { + "epoch": 9.838244321669736, + "grad_norm": 0.15675026178359985, + "learning_rate": 6.886185994170458e-08, + "loss": 1.7031, + "step": 32053 + }, + { + "epoch": 9.838551258440761, + "grad_norm": 0.12373685091733932, + "learning_rate": 6.860132617763081e-08, + "loss": 1.6879, + "step": 32054 + }, + { + "epoch": 9.838858195211786, + "grad_norm": 0.11986403167247772, + "learning_rate": 6.834128586490352e-08, + "loss": 1.7276, + "step": 32055 + }, + { + "epoch": 9.839165131982812, + "grad_norm": 0.12817466259002686, + "learning_rate": 6.808173900609838e-08, + "loss": 1.7128, + "step": 32056 + }, + { + "epoch": 
9.839472068753837, + "grad_norm": 0.15844331681728363, + "learning_rate": 6.782268560376892e-08, + "loss": 1.7278, + "step": 32057 + }, + { + "epoch": 9.839779005524862, + "grad_norm": 0.1530577689409256, + "learning_rate": 6.756412566048531e-08, + "loss": 1.7616, + "step": 32058 + }, + { + "epoch": 9.840085942295888, + "grad_norm": 0.12964992225170135, + "learning_rate": 6.730605917879551e-08, + "loss": 1.7037, + "step": 32059 + }, + { + "epoch": 9.840392879066911, + "grad_norm": 0.1531256139278412, + "learning_rate": 6.704848616125858e-08, + "loss": 1.7433, + "step": 32060 + }, + { + "epoch": 9.840699815837937, + "grad_norm": 0.15467914938926697, + "learning_rate": 6.679140661041139e-08, + "loss": 1.737, + "step": 32061 + }, + { + "epoch": 9.841006752608962, + "grad_norm": 0.13379620015621185, + "learning_rate": 6.653482052880189e-08, + "loss": 1.7009, + "step": 32062 + }, + { + "epoch": 9.841313689379987, + "grad_norm": 0.1608572006225586, + "learning_rate": 6.62787279189614e-08, + "loss": 1.7222, + "step": 32063 + }, + { + "epoch": 9.841620626151013, + "grad_norm": 0.10191282629966736, + "learning_rate": 6.60231287834212e-08, + "loss": 1.6493, + "step": 32064 + }, + { + "epoch": 9.841927562922038, + "grad_norm": 0.1067260280251503, + "learning_rate": 6.576802312470709e-08, + "loss": 1.6836, + "step": 32065 + }, + { + "epoch": 9.842234499693063, + "grad_norm": 0.09046047180891037, + "learning_rate": 6.551341094533925e-08, + "loss": 1.6442, + "step": 32066 + }, + { + "epoch": 9.842541436464089, + "grad_norm": 0.16846902668476105, + "learning_rate": 6.525929224783789e-08, + "loss": 1.7371, + "step": 32067 + }, + { + "epoch": 9.842848373235114, + "grad_norm": 0.13322049379348755, + "learning_rate": 6.500566703470657e-08, + "loss": 1.744, + "step": 32068 + }, + { + "epoch": 9.84315531000614, + "grad_norm": 0.11230573058128357, + "learning_rate": 6.475253530846548e-08, + "loss": 1.6508, + "step": 32069 + }, + { + "epoch": 9.843462246777165, + "grad_norm": 
0.14198845624923706, + "learning_rate": 6.449989707160153e-08, + "loss": 1.7364, + "step": 32070 + }, + { + "epoch": 9.84376918354819, + "grad_norm": 0.2092641144990921, + "learning_rate": 6.424775232661828e-08, + "loss": 1.7172, + "step": 32071 + }, + { + "epoch": 9.844076120319214, + "grad_norm": 0.1266733705997467, + "learning_rate": 6.399610107600818e-08, + "loss": 1.6914, + "step": 32072 + }, + { + "epoch": 9.844383057090239, + "grad_norm": 0.2110438197851181, + "learning_rate": 6.374494332225812e-08, + "loss": 1.7657, + "step": 32073 + }, + { + "epoch": 9.844689993861264, + "grad_norm": 0.13018962740898132, + "learning_rate": 6.349427906784944e-08, + "loss": 1.6803, + "step": 32074 + }, + { + "epoch": 9.84499693063229, + "grad_norm": 0.14762617647647858, + "learning_rate": 6.324410831525795e-08, + "loss": 1.73, + "step": 32075 + }, + { + "epoch": 9.845303867403315, + "grad_norm": 0.15824922919273376, + "learning_rate": 6.299443106695945e-08, + "loss": 1.763, + "step": 32076 + }, + { + "epoch": 9.84561080417434, + "grad_norm": 0.11844678223133087, + "learning_rate": 6.27452473254131e-08, + "loss": 1.6935, + "step": 32077 + }, + { + "epoch": 9.845917740945366, + "grad_norm": 0.11517791450023651, + "learning_rate": 6.249655709309465e-08, + "loss": 1.6934, + "step": 32078 + }, + { + "epoch": 9.84622467771639, + "grad_norm": 0.162859708070755, + "learning_rate": 6.224836037244663e-08, + "loss": 1.7281, + "step": 32079 + }, + { + "epoch": 9.846531614487416, + "grad_norm": 0.09734068065881729, + "learning_rate": 6.200065716593373e-08, + "loss": 1.6473, + "step": 32080 + }, + { + "epoch": 9.846838551258442, + "grad_norm": 0.115218386054039, + "learning_rate": 6.175344747600397e-08, + "loss": 1.6662, + "step": 32081 + }, + { + "epoch": 9.847145488029465, + "grad_norm": 0.11634491384029388, + "learning_rate": 6.150673130508877e-08, + "loss": 1.6455, + "step": 32082 + }, + { + "epoch": 9.84745242480049, + "grad_norm": 0.10781900584697723, + "learning_rate": 
6.126050865563615e-08, + "loss": 1.654, + "step": 32083 + }, + { + "epoch": 9.847759361571516, + "grad_norm": 0.14688703417778015, + "learning_rate": 6.101477953008305e-08, + "loss": 1.7057, + "step": 32084 + }, + { + "epoch": 9.848066298342541, + "grad_norm": 0.14795304834842682, + "learning_rate": 6.076954393084421e-08, + "loss": 1.7032, + "step": 32085 + }, + { + "epoch": 9.848373235113566, + "grad_norm": 0.12772250175476074, + "learning_rate": 6.052480186035658e-08, + "loss": 1.7101, + "step": 32086 + }, + { + "epoch": 9.848680171884592, + "grad_norm": 0.14158354699611664, + "learning_rate": 6.028055332102933e-08, + "loss": 1.6748, + "step": 32087 + }, + { + "epoch": 9.848987108655617, + "grad_norm": 0.13286559283733368, + "learning_rate": 6.003679831528275e-08, + "loss": 1.6909, + "step": 32088 + }, + { + "epoch": 9.849294045426642, + "grad_norm": 0.10677133500576019, + "learning_rate": 5.979353684552047e-08, + "loss": 1.6806, + "step": 32089 + }, + { + "epoch": 9.849600982197668, + "grad_norm": 0.09260063618421555, + "learning_rate": 5.955076891415168e-08, + "loss": 1.6604, + "step": 32090 + }, + { + "epoch": 9.849907918968693, + "grad_norm": 0.17723138630390167, + "learning_rate": 5.9308494523574453e-08, + "loss": 1.7538, + "step": 32091 + }, + { + "epoch": 9.850214855739718, + "grad_norm": 0.14554916322231293, + "learning_rate": 5.9066713676181326e-08, + "loss": 1.7595, + "step": 32092 + }, + { + "epoch": 9.850521792510742, + "grad_norm": 0.14164261519908905, + "learning_rate": 5.882542637435928e-08, + "loss": 1.7205, + "step": 32093 + }, + { + "epoch": 9.850828729281767, + "grad_norm": 0.1607130765914917, + "learning_rate": 5.858463262050085e-08, + "loss": 1.7433, + "step": 32094 + }, + { + "epoch": 9.851135666052793, + "grad_norm": 0.10517904162406921, + "learning_rate": 5.834433241697634e-08, + "loss": 1.6795, + "step": 32095 + }, + { + "epoch": 9.851442602823818, + "grad_norm": 0.11845014989376068, + "learning_rate": 5.810452576616721e-08, + "loss": 
1.7099, + "step": 32096 + }, + { + "epoch": 9.851749539594843, + "grad_norm": 0.17924906313419342, + "learning_rate": 5.786521267043821e-08, + "loss": 1.7513, + "step": 32097 + }, + { + "epoch": 9.852056476365869, + "grad_norm": 0.20598645508289337, + "learning_rate": 5.762639313215967e-08, + "loss": 1.7387, + "step": 32098 + }, + { + "epoch": 9.852363413136894, + "grad_norm": 0.18959027528762817, + "learning_rate": 5.738806715369083e-08, + "loss": 1.7327, + "step": 32099 + }, + { + "epoch": 9.85267034990792, + "grad_norm": 0.13945116102695465, + "learning_rate": 5.7150234737379795e-08, + "loss": 1.6933, + "step": 32100 + }, + { + "epoch": 9.852977286678945, + "grad_norm": 0.12638016045093536, + "learning_rate": 5.6912895885585795e-08, + "loss": 1.6602, + "step": 32101 + }, + { + "epoch": 9.85328422344997, + "grad_norm": 0.1453823745250702, + "learning_rate": 5.66760506006514e-08, + "loss": 1.6894, + "step": 32102 + }, + { + "epoch": 9.853591160220994, + "grad_norm": 0.1257086992263794, + "learning_rate": 5.643969888491918e-08, + "loss": 1.6999, + "step": 32103 + }, + { + "epoch": 9.853898096992019, + "grad_norm": 0.1332065314054489, + "learning_rate": 5.6203840740720605e-08, + "loss": 1.7494, + "step": 32104 + }, + { + "epoch": 9.854205033763044, + "grad_norm": 0.10547174513339996, + "learning_rate": 5.596847617038714e-08, + "loss": 1.7052, + "step": 32105 + }, + { + "epoch": 9.85451197053407, + "grad_norm": 0.12532146275043488, + "learning_rate": 5.5733605176250256e-08, + "loss": 1.6831, + "step": 32106 + }, + { + "epoch": 9.854818907305095, + "grad_norm": 0.1575230360031128, + "learning_rate": 5.549922776062477e-08, + "loss": 1.7032, + "step": 32107 + }, + { + "epoch": 9.85512584407612, + "grad_norm": 0.13303294777870178, + "learning_rate": 5.52653439258255e-08, + "loss": 1.6917, + "step": 32108 + }, + { + "epoch": 9.855432780847146, + "grad_norm": 0.10225910693407059, + "learning_rate": 5.50319536741728e-08, + "loss": 1.7108, + "step": 32109 + }, + { + "epoch": 
9.855739717618171, + "grad_norm": 0.11767458915710449, + "learning_rate": 5.479905700796484e-08, + "loss": 1.6651, + "step": 32110 + }, + { + "epoch": 9.856046654389196, + "grad_norm": 0.099602110683918, + "learning_rate": 5.456665392951088e-08, + "loss": 1.6674, + "step": 32111 + }, + { + "epoch": 9.856353591160222, + "grad_norm": 0.11690317094326019, + "learning_rate": 5.433474444109799e-08, + "loss": 1.6533, + "step": 32112 + }, + { + "epoch": 9.856660527931247, + "grad_norm": 0.14385253190994263, + "learning_rate": 5.410332854502431e-08, + "loss": 1.6759, + "step": 32113 + }, + { + "epoch": 9.856967464702272, + "grad_norm": 0.16568076610565186, + "learning_rate": 5.387240624357692e-08, + "loss": 1.7523, + "step": 32114 + }, + { + "epoch": 9.857274401473296, + "grad_norm": 0.1166546419262886, + "learning_rate": 5.364197753903732e-08, + "loss": 1.6936, + "step": 32115 + }, + { + "epoch": 9.857581338244321, + "grad_norm": 0.1372339427471161, + "learning_rate": 5.3412042433681473e-08, + "loss": 1.6941, + "step": 32116 + }, + { + "epoch": 9.857888275015346, + "grad_norm": 0.14886748790740967, + "learning_rate": 5.318260092978533e-08, + "loss": 1.7423, + "step": 32117 + }, + { + "epoch": 9.858195211786372, + "grad_norm": 0.10235906392335892, + "learning_rate": 5.29536530296082e-08, + "loss": 1.6717, + "step": 32118 + }, + { + "epoch": 9.858502148557397, + "grad_norm": 0.13623642921447754, + "learning_rate": 5.2725198735420475e-08, + "loss": 1.6712, + "step": 32119 + }, + { + "epoch": 9.858809085328422, + "grad_norm": 0.14319658279418945, + "learning_rate": 5.249723804948148e-08, + "loss": 1.701, + "step": 32120 + }, + { + "epoch": 9.859116022099448, + "grad_norm": 0.14662912487983704, + "learning_rate": 5.226977097403385e-08, + "loss": 1.6885, + "step": 32121 + }, + { + "epoch": 9.859422958870473, + "grad_norm": 0.1491306722164154, + "learning_rate": 5.204279751133134e-08, + "loss": 1.7074, + "step": 32122 + }, + { + "epoch": 9.859729895641498, + "grad_norm": 
0.1779826581478119, + "learning_rate": 5.181631766362216e-08, + "loss": 1.6887, + "step": 32123 + }, + { + "epoch": 9.860036832412524, + "grad_norm": 0.14555446803569794, + "learning_rate": 5.159033143313785e-08, + "loss": 1.7373, + "step": 32124 + }, + { + "epoch": 9.860343769183547, + "grad_norm": 0.10940683633089066, + "learning_rate": 5.136483882210996e-08, + "loss": 1.6554, + "step": 32125 + }, + { + "epoch": 9.860650705954573, + "grad_norm": 0.14117297530174255, + "learning_rate": 5.1139839832775594e-08, + "loss": 1.7088, + "step": 32126 + }, + { + "epoch": 9.860957642725598, + "grad_norm": 0.11220337450504303, + "learning_rate": 5.091533446734964e-08, + "loss": 1.6698, + "step": 32127 + }, + { + "epoch": 9.861264579496623, + "grad_norm": 0.19136083126068115, + "learning_rate": 5.06913227280581e-08, + "loss": 1.7741, + "step": 32128 + }, + { + "epoch": 9.861571516267649, + "grad_norm": 0.16426582634449005, + "learning_rate": 5.0467804617110317e-08, + "loss": 1.7405, + "step": 32129 + }, + { + "epoch": 9.861878453038674, + "grad_norm": 0.16608738899230957, + "learning_rate": 5.024478013671563e-08, + "loss": 1.7209, + "step": 32130 + }, + { + "epoch": 9.8621853898097, + "grad_norm": 0.224944606423378, + "learning_rate": 5.002224928907229e-08, + "loss": 1.7206, + "step": 32131 + }, + { + "epoch": 9.862492326580725, + "grad_norm": 0.09932999312877655, + "learning_rate": 4.980021207639518e-08, + "loss": 1.6681, + "step": 32132 + }, + { + "epoch": 9.86279926335175, + "grad_norm": 0.11509741097688675, + "learning_rate": 4.95786685008659e-08, + "loss": 1.7207, + "step": 32133 + }, + { + "epoch": 9.863106200122775, + "grad_norm": 0.1009160503745079, + "learning_rate": 4.9357618564671584e-08, + "loss": 1.6293, + "step": 32134 + }, + { + "epoch": 9.8634131368938, + "grad_norm": 0.11737551540136337, + "learning_rate": 4.913706227001047e-08, + "loss": 1.658, + "step": 32135 + }, + { + "epoch": 9.863720073664824, + "grad_norm": 0.1895657181739807, + "learning_rate": 
4.89169996190475e-08, + "loss": 1.7315, + "step": 32136 + }, + { + "epoch": 9.86402701043585, + "grad_norm": 0.12624821066856384, + "learning_rate": 4.869743061396981e-08, + "loss": 1.678, + "step": 32137 + }, + { + "epoch": 9.864333947206875, + "grad_norm": 0.12830981612205505, + "learning_rate": 4.847835525693678e-08, + "loss": 1.7305, + "step": 32138 + }, + { + "epoch": 9.8646408839779, + "grad_norm": 0.1154761090874672, + "learning_rate": 4.82597735501189e-08, + "loss": 1.7024, + "step": 32139 + }, + { + "epoch": 9.864947820748926, + "grad_norm": 0.18320058286190033, + "learning_rate": 4.804168549567556e-08, + "loss": 1.7165, + "step": 32140 + }, + { + "epoch": 9.865254757519951, + "grad_norm": 0.1479901671409607, + "learning_rate": 4.782409109576613e-08, + "loss": 1.7079, + "step": 32141 + }, + { + "epoch": 9.865561694290976, + "grad_norm": 0.11338557302951813, + "learning_rate": 4.760699035253335e-08, + "loss": 1.6924, + "step": 32142 + }, + { + "epoch": 9.865868631062002, + "grad_norm": 0.1415034830570221, + "learning_rate": 4.73903832681255e-08, + "loss": 1.6969, + "step": 32143 + }, + { + "epoch": 9.866175567833027, + "grad_norm": 0.139898419380188, + "learning_rate": 4.7174269844685315e-08, + "loss": 1.6787, + "step": 32144 + }, + { + "epoch": 9.866482504604052, + "grad_norm": 0.16872167587280273, + "learning_rate": 4.695865008434997e-08, + "loss": 1.7279, + "step": 32145 + }, + { + "epoch": 9.866789441375076, + "grad_norm": 0.1443173587322235, + "learning_rate": 4.674352398924553e-08, + "loss": 1.7436, + "step": 32146 + }, + { + "epoch": 9.867096378146101, + "grad_norm": 0.2038755714893341, + "learning_rate": 4.652889156149809e-08, + "loss": 1.7499, + "step": 32147 + }, + { + "epoch": 9.867403314917127, + "grad_norm": 0.11941488832235336, + "learning_rate": 4.6314752803233716e-08, + "loss": 1.6899, + "step": 32148 + }, + { + "epoch": 9.867710251688152, + "grad_norm": 0.1467728614807129, + "learning_rate": 4.610110771656184e-08, + "loss": 1.7133, + 
"step": 32149 + }, + { + "epoch": 9.868017188459177, + "grad_norm": 0.18277500569820404, + "learning_rate": 4.5887956303602985e-08, + "loss": 1.7038, + "step": 32150 + }, + { + "epoch": 9.868324125230203, + "grad_norm": 0.09188520163297653, + "learning_rate": 4.567529856645547e-08, + "loss": 1.6514, + "step": 32151 + }, + { + "epoch": 9.868631062001228, + "grad_norm": 0.1508881002664566, + "learning_rate": 4.546313450722317e-08, + "loss": 1.7354, + "step": 32152 + }, + { + "epoch": 9.868937998772253, + "grad_norm": 0.19286566972732544, + "learning_rate": 4.525146412800441e-08, + "loss": 1.7437, + "step": 32153 + }, + { + "epoch": 9.869244935543279, + "grad_norm": 0.13278965651988983, + "learning_rate": 4.504028743089195e-08, + "loss": 1.7508, + "step": 32154 + }, + { + "epoch": 9.869551872314304, + "grad_norm": 0.17647281289100647, + "learning_rate": 4.4829604417967466e-08, + "loss": 1.7637, + "step": 32155 + }, + { + "epoch": 9.86985880908533, + "grad_norm": 0.12501446902751923, + "learning_rate": 4.461941509131817e-08, + "loss": 1.6799, + "step": 32156 + }, + { + "epoch": 9.870165745856355, + "grad_norm": 0.15084847807884216, + "learning_rate": 4.440971945302019e-08, + "loss": 1.7139, + "step": 32157 + }, + { + "epoch": 9.870472682627378, + "grad_norm": 0.1984490007162094, + "learning_rate": 4.420051750514409e-08, + "loss": 1.7522, + "step": 32158 + }, + { + "epoch": 9.870779619398403, + "grad_norm": 0.15516258776187897, + "learning_rate": 4.399180924975488e-08, + "loss": 1.7365, + "step": 32159 + }, + { + "epoch": 9.871086556169429, + "grad_norm": 0.1323643922805786, + "learning_rate": 4.3783594688923124e-08, + "loss": 1.6581, + "step": 32160 + }, + { + "epoch": 9.871393492940454, + "grad_norm": 0.13200242817401886, + "learning_rate": 4.357587382470274e-08, + "loss": 1.6713, + "step": 32161 + }, + { + "epoch": 9.87170042971148, + "grad_norm": 0.0954132005572319, + "learning_rate": 4.3368646659147635e-08, + "loss": 1.6607, + "step": 32162 + }, + { + "epoch": 
9.872007366482505, + "grad_norm": 0.15339840948581696, + "learning_rate": 4.316191319430063e-08, + "loss": 1.6888, + "step": 32163 + }, + { + "epoch": 9.87231430325353, + "grad_norm": 0.27716484665870667, + "learning_rate": 4.295567343221008e-08, + "loss": 1.7807, + "step": 32164 + }, + { + "epoch": 9.872621240024555, + "grad_norm": 0.1060333251953125, + "learning_rate": 4.2749927374907684e-08, + "loss": 1.7056, + "step": 32165 + }, + { + "epoch": 9.87292817679558, + "grad_norm": 0.16034503281116486, + "learning_rate": 4.2544675024436266e-08, + "loss": 1.717, + "step": 32166 + }, + { + "epoch": 9.873235113566606, + "grad_norm": 0.12173280119895935, + "learning_rate": 4.233991638281642e-08, + "loss": 1.7021, + "step": 32167 + }, + { + "epoch": 9.87354205033763, + "grad_norm": 0.1884598582983017, + "learning_rate": 4.213565145207987e-08, + "loss": 1.6926, + "step": 32168 + }, + { + "epoch": 9.873848987108655, + "grad_norm": 0.12239779531955719, + "learning_rate": 4.193188023423611e-08, + "loss": 1.6969, + "step": 32169 + }, + { + "epoch": 9.87415592387968, + "grad_norm": 0.15470372140407562, + "learning_rate": 4.172860273130019e-08, + "loss": 1.7963, + "step": 32170 + }, + { + "epoch": 9.874462860650706, + "grad_norm": 0.11103082448244095, + "learning_rate": 4.152581894528717e-08, + "loss": 1.6866, + "step": 32171 + }, + { + "epoch": 9.874769797421731, + "grad_norm": 0.14944078028202057, + "learning_rate": 4.132352887819546e-08, + "loss": 1.7383, + "step": 32172 + }, + { + "epoch": 9.875076734192756, + "grad_norm": 0.11603175848722458, + "learning_rate": 4.1121732532029e-08, + "loss": 1.6626, + "step": 32173 + }, + { + "epoch": 9.875383670963782, + "grad_norm": 0.16313737630844116, + "learning_rate": 4.092042990878064e-08, + "loss": 1.7547, + "step": 32174 + }, + { + "epoch": 9.875690607734807, + "grad_norm": 0.10700001567602158, + "learning_rate": 4.0719621010437694e-08, + "loss": 1.6582, + "step": 32175 + }, + { + "epoch": 9.875997544505832, + "grad_norm": 
0.09969279915094376, + "learning_rate": 4.0519305838981894e-08, + "loss": 1.6598, + "step": 32176 + }, + { + "epoch": 9.876304481276858, + "grad_norm": 0.18154063820838928, + "learning_rate": 4.031948439640054e-08, + "loss": 1.6844, + "step": 32177 + }, + { + "epoch": 9.876611418047883, + "grad_norm": 0.10725349187850952, + "learning_rate": 4.012015668466429e-08, + "loss": 1.6638, + "step": 32178 + }, + { + "epoch": 9.876918354818907, + "grad_norm": 0.15481308102607727, + "learning_rate": 3.992132270573823e-08, + "loss": 1.7646, + "step": 32179 + }, + { + "epoch": 9.877225291589932, + "grad_norm": 0.2573716640472412, + "learning_rate": 3.9722982461593005e-08, + "loss": 1.741, + "step": 32180 + }, + { + "epoch": 9.877532228360957, + "grad_norm": 0.14982570707798004, + "learning_rate": 3.952513595419372e-08, + "loss": 1.7354, + "step": 32181 + }, + { + "epoch": 9.877839165131983, + "grad_norm": 0.15668633580207825, + "learning_rate": 3.932778318548325e-08, + "loss": 1.7226, + "step": 32182 + }, + { + "epoch": 9.878146101903008, + "grad_norm": 0.12578873336315155, + "learning_rate": 3.913092415742114e-08, + "loss": 1.7206, + "step": 32183 + }, + { + "epoch": 9.878453038674033, + "grad_norm": 0.12647871673107147, + "learning_rate": 3.8934558871950296e-08, + "loss": 1.7102, + "step": 32184 + }, + { + "epoch": 9.878759975445059, + "grad_norm": 0.14217160642147064, + "learning_rate": 3.8738687331013603e-08, + "loss": 1.6851, + "step": 32185 + }, + { + "epoch": 9.879066912216084, + "grad_norm": 0.12461835891008377, + "learning_rate": 3.8543309536542835e-08, + "loss": 1.6956, + "step": 32186 + }, + { + "epoch": 9.87937384898711, + "grad_norm": 0.11051438748836517, + "learning_rate": 3.8348425490469796e-08, + "loss": 1.6719, + "step": 32187 + }, + { + "epoch": 9.879680785758135, + "grad_norm": 0.11611293256282806, + "learning_rate": 3.815403519472072e-08, + "loss": 1.6766, + "step": 32188 + }, + { + "epoch": 9.879987722529158, + "grad_norm": 0.17132268846035004, + 
"learning_rate": 3.796013865121628e-08, + "loss": 1.7112, + "step": 32189 + }, + { + "epoch": 9.880294659300183, + "grad_norm": 0.13943015038967133, + "learning_rate": 3.776673586187718e-08, + "loss": 1.7382, + "step": 32190 + }, + { + "epoch": 9.880601596071209, + "grad_norm": 0.11459454149007797, + "learning_rate": 3.757382682860744e-08, + "loss": 1.674, + "step": 32191 + }, + { + "epoch": 9.880908532842234, + "grad_norm": 0.1549069583415985, + "learning_rate": 3.738141155331665e-08, + "loss": 1.7275, + "step": 32192 + }, + { + "epoch": 9.88121546961326, + "grad_norm": 0.09938697516918182, + "learning_rate": 3.7189490037908834e-08, + "loss": 1.6483, + "step": 32193 + }, + { + "epoch": 9.881522406384285, + "grad_norm": 0.10582483559846878, + "learning_rate": 3.6998062284276934e-08, + "loss": 1.6667, + "step": 32194 + }, + { + "epoch": 9.88182934315531, + "grad_norm": 0.1391625851392746, + "learning_rate": 3.6807128294319426e-08, + "loss": 1.6919, + "step": 32195 + }, + { + "epoch": 9.882136279926335, + "grad_norm": 0.10145086795091629, + "learning_rate": 3.661668806991259e-08, + "loss": 1.6726, + "step": 32196 + }, + { + "epoch": 9.88244321669736, + "grad_norm": 0.12674877047538757, + "learning_rate": 3.642674161294379e-08, + "loss": 1.693, + "step": 32197 + }, + { + "epoch": 9.882750153468386, + "grad_norm": 0.16183172166347504, + "learning_rate": 3.6237288925294875e-08, + "loss": 1.66, + "step": 32198 + }, + { + "epoch": 9.883057090239411, + "grad_norm": 0.11870484054088593, + "learning_rate": 3.604833000883101e-08, + "loss": 1.6869, + "step": 32199 + }, + { + "epoch": 9.883364027010435, + "grad_norm": 0.149629145860672, + "learning_rate": 3.585986486542292e-08, + "loss": 1.6774, + "step": 32200 + }, + { + "epoch": 9.88367096378146, + "grad_norm": 0.13439494371414185, + "learning_rate": 3.567189349693023e-08, + "loss": 1.7167, + "step": 32201 + }, + { + "epoch": 9.883977900552486, + "grad_norm": 0.10757558792829514, + "learning_rate": 3.5484415905218114e-08, + 
"loss": 1.6832, + "step": 32202 + }, + { + "epoch": 9.884284837323511, + "grad_norm": 0.1354834884405136, + "learning_rate": 3.5297432092129544e-08, + "loss": 1.7285, + "step": 32203 + }, + { + "epoch": 9.884591774094536, + "grad_norm": 0.13512718677520752, + "learning_rate": 3.5110942059518594e-08, + "loss": 1.6989, + "step": 32204 + }, + { + "epoch": 9.884898710865562, + "grad_norm": 0.14214816689491272, + "learning_rate": 3.492494580922823e-08, + "loss": 1.7081, + "step": 32205 + }, + { + "epoch": 9.885205647636587, + "grad_norm": 0.12680695950984955, + "learning_rate": 3.4739443343090315e-08, + "loss": 1.7195, + "step": 32206 + }, + { + "epoch": 9.885512584407612, + "grad_norm": 0.11334585398435593, + "learning_rate": 3.455443466294783e-08, + "loss": 1.6965, + "step": 32207 + }, + { + "epoch": 9.885819521178638, + "grad_norm": 0.15353024005889893, + "learning_rate": 3.4369919770621536e-08, + "loss": 1.7505, + "step": 32208 + }, + { + "epoch": 9.886126457949663, + "grad_norm": 0.14484186470508575, + "learning_rate": 3.4185898667937756e-08, + "loss": 1.7272, + "step": 32209 + }, + { + "epoch": 9.886433394720687, + "grad_norm": 0.1442519873380661, + "learning_rate": 3.400237135671169e-08, + "loss": 1.7159, + "step": 32210 + }, + { + "epoch": 9.886740331491712, + "grad_norm": 0.15484102070331573, + "learning_rate": 3.381933783876412e-08, + "loss": 1.7064, + "step": 32211 + }, + { + "epoch": 9.887047268262737, + "grad_norm": 0.09997449070215225, + "learning_rate": 3.36367981159047e-08, + "loss": 1.6768, + "step": 32212 + }, + { + "epoch": 9.887354205033763, + "grad_norm": 0.1351270228624344, + "learning_rate": 3.3454752189926444e-08, + "loss": 1.7302, + "step": 32213 + }, + { + "epoch": 9.887661141804788, + "grad_norm": 0.12122789025306702, + "learning_rate": 3.327320006263346e-08, + "loss": 1.6963, + "step": 32214 + }, + { + "epoch": 9.887968078575813, + "grad_norm": 0.12483847141265869, + "learning_rate": 3.309214173582431e-08, + "loss": 1.6803, + "step": 32215 + 
}, + { + "epoch": 9.888275015346839, + "grad_norm": 0.13801445066928864, + "learning_rate": 3.2911577211280905e-08, + "loss": 1.7139, + "step": 32216 + }, + { + "epoch": 9.888581952117864, + "grad_norm": 0.19149911403656006, + "learning_rate": 3.273150649079626e-08, + "loss": 1.7526, + "step": 32217 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 0.15660177171230316, + "learning_rate": 3.255192957614672e-08, + "loss": 1.7487, + "step": 32218 + }, + { + "epoch": 9.889195825659915, + "grad_norm": 0.13127101957798004, + "learning_rate": 3.2372846469103104e-08, + "loss": 1.7105, + "step": 32219 + }, + { + "epoch": 9.88950276243094, + "grad_norm": 0.09861146658658981, + "learning_rate": 3.21942571714362e-08, + "loss": 1.6766, + "step": 32220 + }, + { + "epoch": 9.889809699201965, + "grad_norm": 0.14133897423744202, + "learning_rate": 3.201616168491683e-08, + "loss": 1.7335, + "step": 32221 + }, + { + "epoch": 9.890116635972989, + "grad_norm": 0.12263017147779465, + "learning_rate": 3.1838560011299145e-08, + "loss": 1.6699, + "step": 32222 + }, + { + "epoch": 9.890423572744014, + "grad_norm": 0.12454384565353394, + "learning_rate": 3.166145215233729e-08, + "loss": 1.7091, + "step": 32223 + }, + { + "epoch": 9.89073050951504, + "grad_norm": 0.11563286185264587, + "learning_rate": 3.148483810979097e-08, + "loss": 1.6885, + "step": 32224 + }, + { + "epoch": 9.891037446286065, + "grad_norm": 0.18573540449142456, + "learning_rate": 3.1308717885392136e-08, + "loss": 1.7238, + "step": 32225 + }, + { + "epoch": 9.89134438305709, + "grad_norm": 0.16926386952400208, + "learning_rate": 3.113309148088939e-08, + "loss": 1.6886, + "step": 32226 + }, + { + "epoch": 9.891651319828116, + "grad_norm": 0.11649619042873383, + "learning_rate": 3.0957958898020226e-08, + "loss": 1.6767, + "step": 32227 + }, + { + "epoch": 9.89195825659914, + "grad_norm": 0.10409758239984512, + "learning_rate": 3.078332013851104e-08, + "loss": 1.6549, + "step": 32228 + }, + { + "epoch": 9.892265193370166, 
+ "grad_norm": 0.20817892253398895, + "learning_rate": 3.0609175204088234e-08, + "loss": 1.7321, + "step": 32229 + }, + { + "epoch": 9.892572130141192, + "grad_norm": 0.15646634995937347, + "learning_rate": 3.0435524096478207e-08, + "loss": 1.7278, + "step": 32230 + }, + { + "epoch": 9.892879066912217, + "grad_norm": 0.10567045956850052, + "learning_rate": 3.026236681738515e-08, + "loss": 1.6612, + "step": 32231 + }, + { + "epoch": 9.89318600368324, + "grad_norm": 0.13315534591674805, + "learning_rate": 3.0089703368529895e-08, + "loss": 1.7413, + "step": 32232 + }, + { + "epoch": 9.893492940454266, + "grad_norm": 0.11580394208431244, + "learning_rate": 2.9917533751616655e-08, + "loss": 1.6394, + "step": 32233 + }, + { + "epoch": 9.893799877225291, + "grad_norm": 0.12873095273971558, + "learning_rate": 2.974585796834961e-08, + "loss": 1.6906, + "step": 32234 + }, + { + "epoch": 9.894106813996316, + "grad_norm": 0.2108859121799469, + "learning_rate": 2.9574676020421853e-08, + "loss": 1.7822, + "step": 32235 + }, + { + "epoch": 9.894413750767342, + "grad_norm": 0.12524257600307465, + "learning_rate": 2.9403987909520926e-08, + "loss": 1.6733, + "step": 32236 + }, + { + "epoch": 9.894720687538367, + "grad_norm": 0.1606513410806656, + "learning_rate": 2.9233793637345464e-08, + "loss": 1.7441, + "step": 32237 + }, + { + "epoch": 9.895027624309392, + "grad_norm": 0.11567132920026779, + "learning_rate": 2.9064093205566358e-08, + "loss": 1.6924, + "step": 32238 + }, + { + "epoch": 9.895334561080418, + "grad_norm": 0.18729543685913086, + "learning_rate": 2.889488661586559e-08, + "loss": 1.7346, + "step": 32239 + }, + { + "epoch": 9.895641497851443, + "grad_norm": 0.11518693715333939, + "learning_rate": 2.8726173869908502e-08, + "loss": 1.6796, + "step": 32240 + }, + { + "epoch": 9.895948434622468, + "grad_norm": 0.12286285310983658, + "learning_rate": 2.8557954969377078e-08, + "loss": 1.6804, + "step": 32241 + }, + { + "epoch": 9.896255371393494, + "grad_norm": 
0.11524228751659393, + "learning_rate": 2.8390229915919998e-08, + "loss": 1.6649, + "step": 32242 + }, + { + "epoch": 9.896562308164517, + "grad_norm": 0.1211162805557251, + "learning_rate": 2.8222998711202598e-08, + "loss": 1.686, + "step": 32243 + }, + { + "epoch": 9.896869244935543, + "grad_norm": 0.11552423238754272, + "learning_rate": 2.805626135687356e-08, + "loss": 1.6709, + "step": 32244 + }, + { + "epoch": 9.897176181706568, + "grad_norm": 0.15391238033771515, + "learning_rate": 2.7890017854587114e-08, + "loss": 1.7138, + "step": 32245 + }, + { + "epoch": 9.897483118477593, + "grad_norm": 0.15275777876377106, + "learning_rate": 2.772426820597529e-08, + "loss": 1.7328, + "step": 32246 + }, + { + "epoch": 9.897790055248619, + "grad_norm": 0.12468124181032181, + "learning_rate": 2.7559012412681218e-08, + "loss": 1.6861, + "step": 32247 + }, + { + "epoch": 9.898096992019644, + "grad_norm": 0.1204581931233406, + "learning_rate": 2.7394250476342475e-08, + "loss": 1.7059, + "step": 32248 + }, + { + "epoch": 9.89840392879067, + "grad_norm": 0.15671482682228088, + "learning_rate": 2.722998239857999e-08, + "loss": 1.7289, + "step": 32249 + }, + { + "epoch": 9.898710865561695, + "grad_norm": 0.13706350326538086, + "learning_rate": 2.7066208181025786e-08, + "loss": 1.7048, + "step": 32250 + }, + { + "epoch": 9.89901780233272, + "grad_norm": 0.15076833963394165, + "learning_rate": 2.6902927825289694e-08, + "loss": 1.7355, + "step": 32251 + }, + { + "epoch": 9.899324739103745, + "grad_norm": 0.14177745580673218, + "learning_rate": 2.674014133298708e-08, + "loss": 1.6877, + "step": 32252 + }, + { + "epoch": 9.899631675874769, + "grad_norm": 0.1680639237165451, + "learning_rate": 2.6577848705733322e-08, + "loss": 1.6612, + "step": 32253 + }, + { + "epoch": 9.899938612645794, + "grad_norm": 0.13200677931308746, + "learning_rate": 2.641604994512714e-08, + "loss": 1.7169, + "step": 32254 + }, + { + "epoch": 9.90024554941682, + "grad_norm": 0.1324564814567566, + 
"learning_rate": 2.625474505276726e-08, + "loss": 1.6906, + "step": 32255 + }, + { + "epoch": 9.900552486187845, + "grad_norm": 0.11010903865098953, + "learning_rate": 2.6093934030246846e-08, + "loss": 1.6665, + "step": 32256 + }, + { + "epoch": 9.90085942295887, + "grad_norm": 0.1629243791103363, + "learning_rate": 2.5933616879159073e-08, + "loss": 1.7339, + "step": 32257 + }, + { + "epoch": 9.901166359729896, + "grad_norm": 0.10520602762699127, + "learning_rate": 2.5773793601080453e-08, + "loss": 1.649, + "step": 32258 + }, + { + "epoch": 9.901473296500921, + "grad_norm": 0.13441254198551178, + "learning_rate": 2.561446419760416e-08, + "loss": 1.7146, + "step": 32259 + }, + { + "epoch": 9.901780233271946, + "grad_norm": 0.15586842596530914, + "learning_rate": 2.5455628670290054e-08, + "loss": 1.6921, + "step": 32260 + }, + { + "epoch": 9.902087170042972, + "grad_norm": 0.1360539346933365, + "learning_rate": 2.5297287020720206e-08, + "loss": 1.6973, + "step": 32261 + }, + { + "epoch": 9.902394106813997, + "grad_norm": 0.1683451533317566, + "learning_rate": 2.5139439250448927e-08, + "loss": 1.7456, + "step": 32262 + }, + { + "epoch": 9.902701043585022, + "grad_norm": 0.12836389243602753, + "learning_rate": 2.498208536104163e-08, + "loss": 1.6723, + "step": 32263 + }, + { + "epoch": 9.903007980356048, + "grad_norm": 0.14135409891605377, + "learning_rate": 2.482522535405263e-08, + "loss": 1.7411, + "step": 32264 + }, + { + "epoch": 9.903314917127071, + "grad_norm": 0.13020414113998413, + "learning_rate": 2.4668859231036236e-08, + "loss": 1.7077, + "step": 32265 + }, + { + "epoch": 9.903621853898096, + "grad_norm": 0.14027753472328186, + "learning_rate": 2.4512986993530106e-08, + "loss": 1.7259, + "step": 32266 + }, + { + "epoch": 9.903928790669122, + "grad_norm": 0.19937944412231445, + "learning_rate": 2.4357608643077455e-08, + "loss": 1.7673, + "step": 32267 + }, + { + "epoch": 9.904235727440147, + "grad_norm": 0.12452827394008636, + "learning_rate": 
2.4202724181215942e-08, + "loss": 1.6824, + "step": 32268 + }, + { + "epoch": 9.904542664211172, + "grad_norm": 0.15908023715019226, + "learning_rate": 2.4048333609472116e-08, + "loss": 1.7399, + "step": 32269 + }, + { + "epoch": 9.904849600982198, + "grad_norm": 0.1503656804561615, + "learning_rate": 2.3894436929378094e-08, + "loss": 1.7216, + "step": 32270 + }, + { + "epoch": 9.905156537753223, + "grad_norm": 0.12779399752616882, + "learning_rate": 2.3741034142449324e-08, + "loss": 1.7447, + "step": 32271 + }, + { + "epoch": 9.905463474524248, + "grad_norm": 0.15011703968048096, + "learning_rate": 2.3588125250206815e-08, + "loss": 1.709, + "step": 32272 + }, + { + "epoch": 9.905770411295274, + "grad_norm": 0.13510404527187347, + "learning_rate": 2.3435710254154918e-08, + "loss": 1.6968, + "step": 32273 + }, + { + "epoch": 9.9060773480663, + "grad_norm": 0.1107151061296463, + "learning_rate": 2.3283789155803536e-08, + "loss": 1.6723, + "step": 32274 + }, + { + "epoch": 9.906384284837323, + "grad_norm": 0.15149912238121033, + "learning_rate": 2.3132361956657024e-08, + "loss": 1.7406, + "step": 32275 + }, + { + "epoch": 9.906691221608348, + "grad_norm": 0.14119799435138702, + "learning_rate": 2.2981428658208627e-08, + "loss": 1.757, + "step": 32276 + }, + { + "epoch": 9.906998158379373, + "grad_norm": 0.1312095820903778, + "learning_rate": 2.2830989261946045e-08, + "loss": 1.7049, + "step": 32277 + }, + { + "epoch": 9.907305095150399, + "grad_norm": 0.10459209233522415, + "learning_rate": 2.268104376936253e-08, + "loss": 1.6757, + "step": 32278 + }, + { + "epoch": 9.907612031921424, + "grad_norm": 0.16587966680526733, + "learning_rate": 2.253159218194023e-08, + "loss": 1.7207, + "step": 32279 + }, + { + "epoch": 9.90791896869245, + "grad_norm": 0.18351085484027863, + "learning_rate": 2.238263450115019e-08, + "loss": 1.7385, + "step": 32280 + }, + { + "epoch": 9.908225905463475, + "grad_norm": 0.1720595806837082, + "learning_rate": 2.2234170728469005e-08, + "loss": 
1.7432, + "step": 32281 + }, + { + "epoch": 9.9085328422345, + "grad_norm": 0.1220058798789978, + "learning_rate": 2.2086200865362172e-08, + "loss": 1.7197, + "step": 32282 + }, + { + "epoch": 9.908839779005525, + "grad_norm": 0.18978485465049744, + "learning_rate": 2.1938724913295183e-08, + "loss": 1.776, + "step": 32283 + }, + { + "epoch": 9.90914671577655, + "grad_norm": 0.2161943018436432, + "learning_rate": 2.1791742873716882e-08, + "loss": 1.7852, + "step": 32284 + }, + { + "epoch": 9.909453652547576, + "grad_norm": 0.12366054207086563, + "learning_rate": 2.1645254748092757e-08, + "loss": 1.6733, + "step": 32285 + }, + { + "epoch": 9.9097605893186, + "grad_norm": 0.15332402288913727, + "learning_rate": 2.1499260537855002e-08, + "loss": 1.7465, + "step": 32286 + }, + { + "epoch": 9.910067526089625, + "grad_norm": 0.13514944911003113, + "learning_rate": 2.1353760244463562e-08, + "loss": 1.7004, + "step": 32287 + }, + { + "epoch": 9.91037446286065, + "grad_norm": 0.1976264864206314, + "learning_rate": 2.1208753869339516e-08, + "loss": 1.8062, + "step": 32288 + }, + { + "epoch": 9.910681399631676, + "grad_norm": 0.12862536311149597, + "learning_rate": 2.1064241413931707e-08, + "loss": 1.6782, + "step": 32289 + }, + { + "epoch": 9.910988336402701, + "grad_norm": 0.19731375575065613, + "learning_rate": 2.092022287965567e-08, + "loss": 1.7668, + "step": 32290 + }, + { + "epoch": 9.911295273173726, + "grad_norm": 0.11489395052194595, + "learning_rate": 2.0776698267943594e-08, + "loss": 1.6704, + "step": 32291 + }, + { + "epoch": 9.911602209944752, + "grad_norm": 0.15996041893959045, + "learning_rate": 2.0633667580205463e-08, + "loss": 1.7058, + "step": 32292 + }, + { + "epoch": 9.911909146715777, + "grad_norm": 0.12133777141571045, + "learning_rate": 2.049113081786236e-08, + "loss": 1.6644, + "step": 32293 + }, + { + "epoch": 9.912216083486802, + "grad_norm": 0.15602417290210724, + "learning_rate": 2.0349087982318714e-08, + "loss": 1.7097, + "step": 32294 + }, + { + 
"epoch": 9.912523020257828, + "grad_norm": 0.16324558854103088, + "learning_rate": 2.0207539074978966e-08, + "loss": 1.6846, + "step": 32295 + }, + { + "epoch": 9.912829957028851, + "grad_norm": 0.15360431373119354, + "learning_rate": 2.0066484097241988e-08, + "loss": 1.7227, + "step": 32296 + }, + { + "epoch": 9.913136893799877, + "grad_norm": 0.17100133001804352, + "learning_rate": 1.9925923050506667e-08, + "loss": 1.6693, + "step": 32297 + }, + { + "epoch": 9.913443830570902, + "grad_norm": 0.11901558190584183, + "learning_rate": 1.9785855936149677e-08, + "loss": 1.7466, + "step": 32298 + }, + { + "epoch": 9.913750767341927, + "grad_norm": 0.10561197996139526, + "learning_rate": 1.96462827555699e-08, + "loss": 1.6526, + "step": 32299 + }, + { + "epoch": 9.914057704112953, + "grad_norm": 0.10759133845567703, + "learning_rate": 1.9507203510138463e-08, + "loss": 1.6842, + "step": 32300 + }, + { + "epoch": 9.914364640883978, + "grad_norm": 0.13747403025627136, + "learning_rate": 1.9368618201232036e-08, + "loss": 1.7276, + "step": 32301 + }, + { + "epoch": 9.914671577655003, + "grad_norm": 0.18556538224220276, + "learning_rate": 1.92305268302162e-08, + "loss": 1.7007, + "step": 32302 + }, + { + "epoch": 9.914978514426029, + "grad_norm": 0.22288267314434052, + "learning_rate": 1.9092929398462078e-08, + "loss": 1.7438, + "step": 32303 + }, + { + "epoch": 9.915285451197054, + "grad_norm": 0.11585120111703873, + "learning_rate": 1.8955825907324142e-08, + "loss": 1.6834, + "step": 32304 + }, + { + "epoch": 9.91559238796808, + "grad_norm": 0.14063316583633423, + "learning_rate": 1.8819216358156864e-08, + "loss": 1.698, + "step": 32305 + }, + { + "epoch": 9.915899324739105, + "grad_norm": 0.10423889756202698, + "learning_rate": 1.8683100752320272e-08, + "loss": 1.6796, + "step": 32306 + }, + { + "epoch": 9.91620626151013, + "grad_norm": 0.10526315122842789, + "learning_rate": 1.8547479091146626e-08, + "loss": 1.6681, + "step": 32307 + }, + { + "epoch": 9.916513198281153, + 
"grad_norm": 0.12726645171642303, + "learning_rate": 1.8412351375984848e-08, + "loss": 1.6959, + "step": 32308 + }, + { + "epoch": 9.916820135052179, + "grad_norm": 0.13809795677661896, + "learning_rate": 1.827771760816721e-08, + "loss": 1.7263, + "step": 32309 + }, + { + "epoch": 9.917127071823204, + "grad_norm": 0.15422095358371735, + "learning_rate": 1.8143577789020426e-08, + "loss": 1.7651, + "step": 32310 + }, + { + "epoch": 9.91743400859423, + "grad_norm": 0.1087057963013649, + "learning_rate": 1.8009931919876767e-08, + "loss": 1.6589, + "step": 32311 + }, + { + "epoch": 9.917740945365255, + "grad_norm": 0.1274532526731491, + "learning_rate": 1.787678000205739e-08, + "loss": 1.698, + "step": 32312 + }, + { + "epoch": 9.91804788213628, + "grad_norm": 0.14955148100852966, + "learning_rate": 1.774412203687237e-08, + "loss": 1.7187, + "step": 32313 + }, + { + "epoch": 9.918354818907305, + "grad_norm": 0.12892384827136993, + "learning_rate": 1.761195802563731e-08, + "loss": 1.6668, + "step": 32314 + }, + { + "epoch": 9.91866175567833, + "grad_norm": 0.12298917770385742, + "learning_rate": 1.7480287969651178e-08, + "loss": 1.6624, + "step": 32315 + }, + { + "epoch": 9.918968692449356, + "grad_norm": 0.10288118571043015, + "learning_rate": 1.7349111870224032e-08, + "loss": 1.6632, + "step": 32316 + }, + { + "epoch": 9.919275629220381, + "grad_norm": 0.15588083863258362, + "learning_rate": 1.7218429728649287e-08, + "loss": 1.7164, + "step": 32317 + }, + { + "epoch": 9.919582565991405, + "grad_norm": 0.13187600672245026, + "learning_rate": 1.708824154622035e-08, + "loss": 1.6939, + "step": 32318 + }, + { + "epoch": 9.91988950276243, + "grad_norm": 0.1224738210439682, + "learning_rate": 1.695854732421398e-08, + "loss": 1.6847, + "step": 32319 + }, + { + "epoch": 9.920196439533456, + "grad_norm": 0.12615568935871124, + "learning_rate": 1.6829347063923584e-08, + "loss": 1.7332, + "step": 32320 + }, + { + "epoch": 9.920503376304481, + "grad_norm": 0.10515398532152176, + 
"learning_rate": 1.670064076662592e-08, + "loss": 1.6249, + "step": 32321 + }, + { + "epoch": 9.920810313075506, + "grad_norm": 0.11620636284351349, + "learning_rate": 1.657242843358109e-08, + "loss": 1.6856, + "step": 32322 + }, + { + "epoch": 9.921117249846532, + "grad_norm": 0.14267602562904358, + "learning_rate": 1.644471006606585e-08, + "loss": 1.6957, + "step": 32323 + }, + { + "epoch": 9.921424186617557, + "grad_norm": 0.14195942878723145, + "learning_rate": 1.6317485665345855e-08, + "loss": 1.7102, + "step": 32324 + }, + { + "epoch": 9.921731123388582, + "grad_norm": 0.13764344155788422, + "learning_rate": 1.6190755232664556e-08, + "loss": 1.7028, + "step": 32325 + }, + { + "epoch": 9.922038060159608, + "grad_norm": 0.13899104297161102, + "learning_rate": 1.6064518769287605e-08, + "loss": 1.6844, + "step": 32326 + }, + { + "epoch": 9.922344996930633, + "grad_norm": 0.11225128173828125, + "learning_rate": 1.5938776276458457e-08, + "loss": 1.6489, + "step": 32327 + }, + { + "epoch": 9.922651933701658, + "grad_norm": 0.10915616899728775, + "learning_rate": 1.5813527755415e-08, + "loss": 1.6916, + "step": 32328 + }, + { + "epoch": 9.922958870472682, + "grad_norm": 0.15568388998508453, + "learning_rate": 1.56887732074007e-08, + "loss": 1.7295, + "step": 32329 + }, + { + "epoch": 9.923265807243707, + "grad_norm": 0.12068216502666473, + "learning_rate": 1.556451263364789e-08, + "loss": 1.7293, + "step": 32330 + }, + { + "epoch": 9.923572744014733, + "grad_norm": 0.1622546762228012, + "learning_rate": 1.544074603538337e-08, + "loss": 1.7351, + "step": 32331 + }, + { + "epoch": 9.923879680785758, + "grad_norm": 0.10042760521173477, + "learning_rate": 1.5317473413828388e-08, + "loss": 1.6729, + "step": 32332 + }, + { + "epoch": 9.924186617556783, + "grad_norm": 0.15807488560676575, + "learning_rate": 1.5194694770204187e-08, + "loss": 1.7114, + "step": 32333 + }, + { + "epoch": 9.924493554327809, + "grad_norm": 0.1204007938504219, + "learning_rate": 
1.5072410105720914e-08, + "loss": 1.7242, + "step": 32334 + }, + { + "epoch": 9.924800491098834, + "grad_norm": 0.1176806390285492, + "learning_rate": 1.495061942159426e-08, + "loss": 1.6778, + "step": 32335 + }, + { + "epoch": 9.92510742786986, + "grad_norm": 0.2244664430618286, + "learning_rate": 1.4829322719017713e-08, + "loss": 1.7206, + "step": 32336 + }, + { + "epoch": 9.925414364640885, + "grad_norm": 0.11579646915197372, + "learning_rate": 1.4708519999195868e-08, + "loss": 1.6595, + "step": 32337 + }, + { + "epoch": 9.92572130141191, + "grad_norm": 0.08797867596149445, + "learning_rate": 1.4588211263322215e-08, + "loss": 1.6587, + "step": 32338 + }, + { + "epoch": 9.926028238182933, + "grad_norm": 0.118585966527462, + "learning_rate": 1.4468396512584693e-08, + "loss": 1.6704, + "step": 32339 + }, + { + "epoch": 9.926335174953959, + "grad_norm": 0.16289199888706207, + "learning_rate": 1.4349075748171236e-08, + "loss": 1.7494, + "step": 32340 + }, + { + "epoch": 9.926642111724984, + "grad_norm": 0.09592059999704361, + "learning_rate": 1.4230248971253135e-08, + "loss": 1.6613, + "step": 32341 + }, + { + "epoch": 9.92694904849601, + "grad_norm": 0.17101891338825226, + "learning_rate": 1.4111916183012775e-08, + "loss": 1.746, + "step": 32342 + }, + { + "epoch": 9.927255985267035, + "grad_norm": 0.12958920001983643, + "learning_rate": 1.3994077384615889e-08, + "loss": 1.6704, + "step": 32343 + }, + { + "epoch": 9.92756292203806, + "grad_norm": 0.1180882677435875, + "learning_rate": 1.3876732577228212e-08, + "loss": 1.6892, + "step": 32344 + }, + { + "epoch": 9.927869858809085, + "grad_norm": 0.13923440873622894, + "learning_rate": 1.375988176200438e-08, + "loss": 1.7205, + "step": 32345 + }, + { + "epoch": 9.92817679558011, + "grad_norm": 0.11700796335935593, + "learning_rate": 1.3643524940104569e-08, + "loss": 1.6689, + "step": 32346 + }, + { + "epoch": 9.928483732351136, + "grad_norm": 0.14296385645866394, + "learning_rate": 1.3527662112677863e-08, + "loss": 
1.7583, + "step": 32347 + }, + { + "epoch": 9.928790669122161, + "grad_norm": 0.14136703312397003, + "learning_rate": 1.3412293280867794e-08, + "loss": 1.7217, + "step": 32348 + }, + { + "epoch": 9.929097605893187, + "grad_norm": 0.19926518201828003, + "learning_rate": 1.3297418445817889e-08, + "loss": 1.7181, + "step": 32349 + }, + { + "epoch": 9.92940454266421, + "grad_norm": 0.12384761869907379, + "learning_rate": 1.3183037608660576e-08, + "loss": 1.7106, + "step": 32350 + }, + { + "epoch": 9.929711479435236, + "grad_norm": 0.1384219080209732, + "learning_rate": 1.3069150770528282e-08, + "loss": 1.6729, + "step": 32351 + }, + { + "epoch": 9.930018416206261, + "grad_norm": 0.11504645645618439, + "learning_rate": 1.2955757932542333e-08, + "loss": 1.7014, + "step": 32352 + }, + { + "epoch": 9.930325352977286, + "grad_norm": 0.172870472073555, + "learning_rate": 1.2842859095824056e-08, + "loss": 1.778, + "step": 32353 + }, + { + "epoch": 9.930632289748312, + "grad_norm": 0.13782678544521332, + "learning_rate": 1.2730454261494774e-08, + "loss": 1.7095, + "step": 32354 + }, + { + "epoch": 9.930939226519337, + "grad_norm": 0.12346980720758438, + "learning_rate": 1.2618543430659157e-08, + "loss": 1.6769, + "step": 32355 + }, + { + "epoch": 9.931246163290362, + "grad_norm": 0.10613575577735901, + "learning_rate": 1.2507126604427433e-08, + "loss": 1.6659, + "step": 32356 + }, + { + "epoch": 9.931553100061388, + "grad_norm": 0.16232433915138245, + "learning_rate": 1.2396203783898719e-08, + "loss": 1.741, + "step": 32357 + }, + { + "epoch": 9.931860036832413, + "grad_norm": 0.11868718266487122, + "learning_rate": 1.2285774970166586e-08, + "loss": 1.7018, + "step": 32358 + }, + { + "epoch": 9.932166973603438, + "grad_norm": 0.17840202152729034, + "learning_rate": 1.2175840164330155e-08, + "loss": 1.7294, + "step": 32359 + }, + { + "epoch": 9.932473910374462, + "grad_norm": 0.12258690595626831, + "learning_rate": 1.2066399367466342e-08, + "loss": 1.6724, + "step": 32360 + }, 
+ { + "epoch": 9.932780847145487, + "grad_norm": 0.1263471096754074, + "learning_rate": 1.1957452580663164e-08, + "loss": 1.7157, + "step": 32361 + }, + { + "epoch": 9.933087783916513, + "grad_norm": 0.1078755185008049, + "learning_rate": 1.184899980499754e-08, + "loss": 1.671, + "step": 32362 + }, + { + "epoch": 9.933394720687538, + "grad_norm": 0.15112191438674927, + "learning_rate": 1.1741041041535284e-08, + "loss": 1.7683, + "step": 32363 + }, + { + "epoch": 9.933701657458563, + "grad_norm": 0.08500932902097702, + "learning_rate": 1.163357629134776e-08, + "loss": 1.63, + "step": 32364 + }, + { + "epoch": 9.934008594229589, + "grad_norm": 0.14534896612167358, + "learning_rate": 1.152660555549523e-08, + "loss": 1.7348, + "step": 32365 + }, + { + "epoch": 9.934315531000614, + "grad_norm": 0.107171930372715, + "learning_rate": 1.1420128835037958e-08, + "loss": 1.6783, + "step": 32366 + }, + { + "epoch": 9.93462246777164, + "grad_norm": 0.14311735332012177, + "learning_rate": 1.1314146131030656e-08, + "loss": 1.7189, + "step": 32367 + }, + { + "epoch": 9.934929404542665, + "grad_norm": 0.1567717045545578, + "learning_rate": 1.1208657444511384e-08, + "loss": 1.7471, + "step": 32368 + }, + { + "epoch": 9.93523634131369, + "grad_norm": 0.17283129692077637, + "learning_rate": 1.1103662776523749e-08, + "loss": 1.7218, + "step": 32369 + }, + { + "epoch": 9.935543278084715, + "grad_norm": 0.10981162637472153, + "learning_rate": 1.0999162128116913e-08, + "loss": 1.6921, + "step": 32370 + }, + { + "epoch": 9.93585021485574, + "grad_norm": 0.1108628660440445, + "learning_rate": 1.0895155500312281e-08, + "loss": 1.6755, + "step": 32371 + }, + { + "epoch": 9.936157151626764, + "grad_norm": 0.15141257643699646, + "learning_rate": 1.079164289413681e-08, + "loss": 1.7308, + "step": 32372 + }, + { + "epoch": 9.93646408839779, + "grad_norm": 0.2009151577949524, + "learning_rate": 1.0688624310623007e-08, + "loss": 1.7415, + "step": 32373 + }, + { + "epoch": 9.936771025168815, + 
"grad_norm": 0.12966850399971008, + "learning_rate": 1.0586099750786727e-08, + "loss": 1.7394, + "step": 32374 + }, + { + "epoch": 9.93707796193984, + "grad_norm": 0.13342911005020142, + "learning_rate": 1.048406921563272e-08, + "loss": 1.6998, + "step": 32375 + }, + { + "epoch": 9.937384898710865, + "grad_norm": 0.13602954149246216, + "learning_rate": 1.038253270617684e-08, + "loss": 1.6902, + "step": 32376 + }, + { + "epoch": 9.93769183548189, + "grad_norm": 0.09679561108350754, + "learning_rate": 1.0281490223418289e-08, + "loss": 1.6706, + "step": 32377 + }, + { + "epoch": 9.937998772252916, + "grad_norm": 0.1325666606426239, + "learning_rate": 1.0180941768361817e-08, + "loss": 1.677, + "step": 32378 + }, + { + "epoch": 9.938305709023942, + "grad_norm": 0.18245433270931244, + "learning_rate": 1.0080887341995526e-08, + "loss": 1.7808, + "step": 32379 + }, + { + "epoch": 9.938612645794967, + "grad_norm": 0.22659125924110413, + "learning_rate": 9.981326945313063e-09, + "loss": 1.7197, + "step": 32380 + }, + { + "epoch": 9.938919582565992, + "grad_norm": 0.13232065737247467, + "learning_rate": 9.882260579291425e-09, + "loss": 1.7133, + "step": 32381 + }, + { + "epoch": 9.939226519337016, + "grad_norm": 0.3453350365161896, + "learning_rate": 9.783688244913158e-09, + "loss": 1.8062, + "step": 32382 + }, + { + "epoch": 9.939533456108041, + "grad_norm": 0.15529876947402954, + "learning_rate": 9.685609943155261e-09, + "loss": 1.7438, + "step": 32383 + }, + { + "epoch": 9.939840392879066, + "grad_norm": 0.2087012678384781, + "learning_rate": 9.588025674983626e-09, + "loss": 1.7239, + "step": 32384 + }, + { + "epoch": 9.940147329650092, + "grad_norm": 0.14322242140769958, + "learning_rate": 9.490935441358595e-09, + "loss": 1.7341, + "step": 32385 + }, + { + "epoch": 9.940454266421117, + "grad_norm": 0.11070089042186737, + "learning_rate": 9.394339243251615e-09, + "loss": 1.6735, + "step": 32386 + }, + { + "epoch": 9.940761203192142, + "grad_norm": 0.1307358294725418, + 
"learning_rate": 9.298237081606376e-09, + "loss": 1.6458, + "step": 32387 + }, + { + "epoch": 9.941068139963168, + "grad_norm": 0.21708574891090393, + "learning_rate": 9.202628957377668e-09, + "loss": 1.7589, + "step": 32388 + }, + { + "epoch": 9.941375076734193, + "grad_norm": 0.12621861696243286, + "learning_rate": 9.107514871509182e-09, + "loss": 1.7402, + "step": 32389 + }, + { + "epoch": 9.941682013505218, + "grad_norm": 0.13067953288555145, + "learning_rate": 9.012894824939056e-09, + "loss": 1.6896, + "step": 32390 + }, + { + "epoch": 9.941988950276244, + "grad_norm": 0.18594002723693848, + "learning_rate": 8.918768818605427e-09, + "loss": 1.7461, + "step": 32391 + }, + { + "epoch": 9.942295887047269, + "grad_norm": 0.17440444231033325, + "learning_rate": 8.825136853435333e-09, + "loss": 1.6969, + "step": 32392 + }, + { + "epoch": 9.942602823818293, + "grad_norm": 0.12859460711479187, + "learning_rate": 8.731998930361363e-09, + "loss": 1.724, + "step": 32393 + }, + { + "epoch": 9.942909760589318, + "grad_norm": 0.14894992113113403, + "learning_rate": 8.639355050293896e-09, + "loss": 1.6802, + "step": 32394 + }, + { + "epoch": 9.943216697360343, + "grad_norm": 0.16252176463603973, + "learning_rate": 8.54720521415442e-09, + "loss": 1.7391, + "step": 32395 + }, + { + "epoch": 9.943523634131369, + "grad_norm": 0.18194718658924103, + "learning_rate": 8.455549422853315e-09, + "loss": 1.7539, + "step": 32396 + }, + { + "epoch": 9.943830570902394, + "grad_norm": 0.1416047215461731, + "learning_rate": 8.364387677295415e-09, + "loss": 1.7356, + "step": 32397 + }, + { + "epoch": 9.94413750767342, + "grad_norm": 0.1490311175584793, + "learning_rate": 8.27371997838e-09, + "loss": 1.7323, + "step": 32398 + }, + { + "epoch": 9.944444444444445, + "grad_norm": 0.1581144481897354, + "learning_rate": 8.183546327006353e-09, + "loss": 1.6542, + "step": 32399 + }, + { + "epoch": 9.94475138121547, + "grad_norm": 0.16656135022640228, + "learning_rate": 8.09386672406265e-09, + 
"loss": 1.7551, + "step": 32400 + }, + { + "epoch": 9.945058317986495, + "grad_norm": 0.1854424625635147, + "learning_rate": 8.004681170437067e-09, + "loss": 1.7196, + "step": 32401 + }, + { + "epoch": 9.94536525475752, + "grad_norm": 0.15835405886173248, + "learning_rate": 7.915989667006685e-09, + "loss": 1.6704, + "step": 32402 + }, + { + "epoch": 9.945672191528544, + "grad_norm": 0.12599913775920868, + "learning_rate": 7.82779221465968e-09, + "loss": 1.7035, + "step": 32403 + }, + { + "epoch": 9.94597912829957, + "grad_norm": 0.16099520027637482, + "learning_rate": 7.740088814256475e-09, + "loss": 1.6739, + "step": 32404 + }, + { + "epoch": 9.946286065070595, + "grad_norm": 0.12222954630851746, + "learning_rate": 7.652879466663043e-09, + "loss": 1.6991, + "step": 32405 + }, + { + "epoch": 9.94659300184162, + "grad_norm": 0.12297282367944717, + "learning_rate": 7.566164172750911e-09, + "loss": 1.6848, + "step": 32406 + }, + { + "epoch": 9.946899938612646, + "grad_norm": 0.12966325879096985, + "learning_rate": 7.479942933369399e-09, + "loss": 1.6989, + "step": 32407 + }, + { + "epoch": 9.94720687538367, + "grad_norm": 0.12732411921024323, + "learning_rate": 7.394215749367828e-09, + "loss": 1.6812, + "step": 32408 + }, + { + "epoch": 9.947513812154696, + "grad_norm": 0.11722669005393982, + "learning_rate": 7.308982621606619e-09, + "loss": 1.6813, + "step": 32409 + }, + { + "epoch": 9.947820748925722, + "grad_norm": 0.13928887248039246, + "learning_rate": 7.22424355091289e-09, + "loss": 1.6754, + "step": 32410 + }, + { + "epoch": 9.948127685696747, + "grad_norm": 0.1875402331352234, + "learning_rate": 7.139998538135961e-09, + "loss": 1.7694, + "step": 32411 + }, + { + "epoch": 9.948434622467772, + "grad_norm": 0.10383447259664536, + "learning_rate": 7.0562475841029485e-09, + "loss": 1.6683, + "step": 32412 + }, + { + "epoch": 9.948741559238798, + "grad_norm": 0.15648451447486877, + "learning_rate": 6.972990689635417e-09, + "loss": 1.7515, + "step": 32413 + }, + { + 
"epoch": 9.949048496009823, + "grad_norm": 0.15558132529258728, + "learning_rate": 6.890227855571585e-09, + "loss": 1.7311, + "step": 32414 + }, + { + "epoch": 9.949355432780846, + "grad_norm": 0.1698763221502304, + "learning_rate": 6.8079590827163645e-09, + "loss": 1.7688, + "step": 32415 + }, + { + "epoch": 9.949662369551872, + "grad_norm": 0.11423872411251068, + "learning_rate": 6.726184371885769e-09, + "loss": 1.6848, + "step": 32416 + }, + { + "epoch": 9.949969306322897, + "grad_norm": 0.11946321278810501, + "learning_rate": 6.6449037238902615e-09, + "loss": 1.6749, + "step": 32417 + }, + { + "epoch": 9.950276243093922, + "grad_norm": 0.16556024551391602, + "learning_rate": 6.564117139529202e-09, + "loss": 1.7386, + "step": 32418 + }, + { + "epoch": 9.950583179864948, + "grad_norm": 0.13177451491355896, + "learning_rate": 6.483824619607504e-09, + "loss": 1.691, + "step": 32419 + }, + { + "epoch": 9.950890116635973, + "grad_norm": 0.10789786279201508, + "learning_rate": 6.404026164913424e-09, + "loss": 1.6424, + "step": 32420 + }, + { + "epoch": 9.951197053406998, + "grad_norm": 0.1662123203277588, + "learning_rate": 6.32472177623522e-09, + "loss": 1.7511, + "step": 32421 + }, + { + "epoch": 9.951503990178024, + "grad_norm": 0.12785036861896515, + "learning_rate": 6.245911454361153e-09, + "loss": 1.6522, + "step": 32422 + }, + { + "epoch": 9.95181092694905, + "grad_norm": 0.12330711632966995, + "learning_rate": 6.167595200062825e-09, + "loss": 1.6776, + "step": 32423 + }, + { + "epoch": 9.952117863720074, + "grad_norm": 0.09788266569375992, + "learning_rate": 6.089773014122946e-09, + "loss": 1.6613, + "step": 32424 + }, + { + "epoch": 9.952424800491098, + "grad_norm": 0.18258565664291382, + "learning_rate": 6.0124448973075675e-09, + "loss": 1.6977, + "step": 32425 + }, + { + "epoch": 9.952731737262123, + "grad_norm": 0.13971279561519623, + "learning_rate": 5.935610850377194e-09, + "loss": 1.7138, + "step": 32426 + }, + { + "epoch": 9.953038674033149, + 
"grad_norm": 0.1644059717655182, + "learning_rate": 5.859270874092326e-09, + "loss": 1.6821, + "step": 32427 + }, + { + "epoch": 9.953345610804174, + "grad_norm": 0.20486295223236084, + "learning_rate": 5.783424969207918e-09, + "loss": 1.715, + "step": 32428 + }, + { + "epoch": 9.9536525475752, + "grad_norm": 0.16751137375831604, + "learning_rate": 5.7080731364733684e-09, + "loss": 1.75, + "step": 32429 + }, + { + "epoch": 9.953959484346225, + "grad_norm": 0.10631072521209717, + "learning_rate": 5.633215376638079e-09, + "loss": 1.6589, + "step": 32430 + }, + { + "epoch": 9.95426642111725, + "grad_norm": 0.12222696095705032, + "learning_rate": 5.5588516904403475e-09, + "loss": 1.6957, + "step": 32431 + }, + { + "epoch": 9.954573357888275, + "grad_norm": 0.1868433952331543, + "learning_rate": 5.48498207860737e-09, + "loss": 1.7432, + "step": 32432 + }, + { + "epoch": 9.9548802946593, + "grad_norm": 0.12732042372226715, + "learning_rate": 5.411606541877446e-09, + "loss": 1.6819, + "step": 32433 + }, + { + "epoch": 9.955187231430326, + "grad_norm": 0.1370798945426941, + "learning_rate": 5.338725080972218e-09, + "loss": 1.711, + "step": 32434 + }, + { + "epoch": 9.955494168201351, + "grad_norm": 0.13998152315616608, + "learning_rate": 5.266337696607782e-09, + "loss": 1.687, + "step": 32435 + }, + { + "epoch": 9.955801104972375, + "grad_norm": 0.08052362501621246, + "learning_rate": 5.194444389511333e-09, + "loss": 1.629, + "step": 32436 + }, + { + "epoch": 9.9561080417434, + "grad_norm": 0.1393222063779831, + "learning_rate": 5.123045160382311e-09, + "loss": 1.7062, + "step": 32437 + }, + { + "epoch": 9.956414978514426, + "grad_norm": 0.1532362848520279, + "learning_rate": 5.0521400099312614e-09, + "loss": 1.7417, + "step": 32438 + }, + { + "epoch": 9.956721915285451, + "grad_norm": 0.12600642442703247, + "learning_rate": 4.9817289388576215e-09, + "loss": 1.6931, + "step": 32439 + }, + { + "epoch": 9.957028852056476, + "grad_norm": 0.10447245836257935, + 
"learning_rate": 4.911811947860834e-09, + "loss": 1.6598, + "step": 32440 + }, + { + "epoch": 9.957335788827502, + "grad_norm": 0.13217110931873322, + "learning_rate": 4.842389037623684e-09, + "loss": 1.704, + "step": 32441 + }, + { + "epoch": 9.957642725598527, + "grad_norm": 0.13356858491897583, + "learning_rate": 4.773460208840064e-09, + "loss": 1.7246, + "step": 32442 + }, + { + "epoch": 9.957949662369552, + "grad_norm": 0.09947375953197479, + "learning_rate": 4.705025462187207e-09, + "loss": 1.649, + "step": 32443 + }, + { + "epoch": 9.958256599140578, + "grad_norm": 0.12298106402158737, + "learning_rate": 4.63708479834235e-09, + "loss": 1.7012, + "step": 32444 + }, + { + "epoch": 9.958563535911603, + "grad_norm": 0.11899507790803909, + "learning_rate": 4.569638217977179e-09, + "loss": 1.662, + "step": 32445 + }, + { + "epoch": 9.958870472682626, + "grad_norm": 0.10871480405330658, + "learning_rate": 4.502685721757827e-09, + "loss": 1.6718, + "step": 32446 + }, + { + "epoch": 9.959177409453652, + "grad_norm": 0.17709551751613617, + "learning_rate": 4.436227310344876e-09, + "loss": 1.755, + "step": 32447 + }, + { + "epoch": 9.959484346224677, + "grad_norm": 0.1668638288974762, + "learning_rate": 4.3702629843989095e-09, + "loss": 1.7693, + "step": 32448 + }, + { + "epoch": 9.959791282995702, + "grad_norm": 0.1600068211555481, + "learning_rate": 4.304792744569408e-09, + "loss": 1.6912, + "step": 32449 + }, + { + "epoch": 9.960098219766728, + "grad_norm": 0.10293091088533401, + "learning_rate": 4.2398165915003e-09, + "loss": 1.6801, + "step": 32450 + }, + { + "epoch": 9.960405156537753, + "grad_norm": 0.16815342009067535, + "learning_rate": 4.175334525841068e-09, + "loss": 1.7739, + "step": 32451 + }, + { + "epoch": 9.960712093308778, + "grad_norm": 0.08967147767543793, + "learning_rate": 4.111346548218986e-09, + "loss": 1.6412, + "step": 32452 + }, + { + "epoch": 9.961019030079804, + "grad_norm": 0.13091377913951874, + "learning_rate": 4.047852659277984e-09, + 
"loss": 1.7352, + "step": 32453 + }, + { + "epoch": 9.96132596685083, + "grad_norm": 0.1545487344264984, + "learning_rate": 3.9848528596397875e-09, + "loss": 1.6858, + "step": 32454 + }, + { + "epoch": 9.961632903621854, + "grad_norm": 0.1344275325536728, + "learning_rate": 3.9223471499205685e-09, + "loss": 1.7114, + "step": 32455 + }, + { + "epoch": 9.96193984039288, + "grad_norm": 0.13304302096366882, + "learning_rate": 3.860335530747605e-09, + "loss": 1.7032, + "step": 32456 + }, + { + "epoch": 9.962246777163905, + "grad_norm": 0.1125492975115776, + "learning_rate": 3.798818002731519e-09, + "loss": 1.7028, + "step": 32457 + }, + { + "epoch": 9.962553713934929, + "grad_norm": 0.12480182945728302, + "learning_rate": 3.7377945664773815e-09, + "loss": 1.7062, + "step": 32458 + }, + { + "epoch": 9.962860650705954, + "grad_norm": 0.15090548992156982, + "learning_rate": 3.677265222595816e-09, + "loss": 1.7534, + "step": 32459 + }, + { + "epoch": 9.96316758747698, + "grad_norm": 0.15135593712329865, + "learning_rate": 3.61722997167524e-09, + "loss": 1.7628, + "step": 32460 + }, + { + "epoch": 9.963474524248005, + "grad_norm": 0.14519482851028442, + "learning_rate": 3.5576888143096232e-09, + "loss": 1.6912, + "step": 32461 + }, + { + "epoch": 9.96378146101903, + "grad_norm": 0.13425637781620026, + "learning_rate": 3.4986417510929347e-09, + "loss": 1.6583, + "step": 32462 + }, + { + "epoch": 9.964088397790055, + "grad_norm": 0.15377762913703918, + "learning_rate": 3.440088782608042e-09, + "loss": 1.7223, + "step": 32463 + }, + { + "epoch": 9.96439533456108, + "grad_norm": 0.1342972368001938, + "learning_rate": 3.3820299094322604e-09, + "loss": 1.6861, + "step": 32464 + }, + { + "epoch": 9.964702271332106, + "grad_norm": 0.5013613700866699, + "learning_rate": 3.3244651321373555e-09, + "loss": 1.7253, + "step": 32465 + }, + { + "epoch": 9.965009208103131, + "grad_norm": 0.09804642200469971, + "learning_rate": 3.2673944512950915e-09, + "loss": 1.657, + "step": 32466 + }, + { 
+ "epoch": 9.965316144874157, + "grad_norm": 0.12614911794662476, + "learning_rate": 3.210817867471683e-09, + "loss": 1.6729, + "step": 32467 + }, + { + "epoch": 9.96562308164518, + "grad_norm": 0.09813954681158066, + "learning_rate": 3.154735381216689e-09, + "loss": 1.6665, + "step": 32468 + }, + { + "epoch": 9.965930018416206, + "grad_norm": 0.11637084931135178, + "learning_rate": 3.0991469930963244e-09, + "loss": 1.6882, + "step": 32469 + }, + { + "epoch": 9.966236955187231, + "grad_norm": 0.11884592473506927, + "learning_rate": 3.0440527036490474e-09, + "loss": 1.6898, + "step": 32470 + }, + { + "epoch": 9.966543891958256, + "grad_norm": 0.11903903633356094, + "learning_rate": 2.989452513429969e-09, + "loss": 1.7055, + "step": 32471 + }, + { + "epoch": 9.966850828729282, + "grad_norm": 0.11886774003505707, + "learning_rate": 2.9353464229719962e-09, + "loss": 1.681, + "step": 32472 + }, + { + "epoch": 9.967157765500307, + "grad_norm": 0.1473800390958786, + "learning_rate": 2.8817344328080365e-09, + "loss": 1.7348, + "step": 32473 + }, + { + "epoch": 9.967464702271332, + "grad_norm": 0.16137374937534332, + "learning_rate": 2.8286165434709967e-09, + "loss": 1.7044, + "step": 32474 + }, + { + "epoch": 9.967771639042358, + "grad_norm": 0.20311129093170166, + "learning_rate": 2.7759927554882326e-09, + "loss": 1.791, + "step": 32475 + }, + { + "epoch": 9.968078575813383, + "grad_norm": 0.20193049311637878, + "learning_rate": 2.723863069375998e-09, + "loss": 1.7007, + "step": 32476 + }, + { + "epoch": 9.968385512584408, + "grad_norm": 0.208475723862648, + "learning_rate": 2.672227485656098e-09, + "loss": 1.739, + "step": 32477 + }, + { + "epoch": 9.968692449355434, + "grad_norm": 0.1389041393995285, + "learning_rate": 2.6210860048281325e-09, + "loss": 1.6907, + "step": 32478 + }, + { + "epoch": 9.968999386126457, + "grad_norm": 0.13423459231853485, + "learning_rate": 2.5704386274028046e-09, + "loss": 1.6886, + "step": 32479 + }, + { + "epoch": 9.969306322897483, + 
"grad_norm": 0.10988432168960571, + "learning_rate": 2.5202853538797144e-09, + "loss": 1.6567, + "step": 32480 + }, + { + "epoch": 9.969613259668508, + "grad_norm": 0.1565593034029007, + "learning_rate": 2.470626184758462e-09, + "loss": 1.7901, + "step": 32481 + }, + { + "epoch": 9.969920196439533, + "grad_norm": 0.14571799337863922, + "learning_rate": 2.421461120527546e-09, + "loss": 1.7062, + "step": 32482 + }, + { + "epoch": 9.970227133210559, + "grad_norm": 0.11386661976575851, + "learning_rate": 2.3727901616699134e-09, + "loss": 1.6825, + "step": 32483 + }, + { + "epoch": 9.970534069981584, + "grad_norm": 0.11852385848760605, + "learning_rate": 2.3246133086740617e-09, + "loss": 1.6912, + "step": 32484 + }, + { + "epoch": 9.97084100675261, + "grad_norm": 0.10368720442056656, + "learning_rate": 2.276930562006285e-09, + "loss": 1.6892, + "step": 32485 + }, + { + "epoch": 9.971147943523635, + "grad_norm": 0.13299435377120972, + "learning_rate": 2.22974192214398e-09, + "loss": 1.7124, + "step": 32486 + }, + { + "epoch": 9.97145488029466, + "grad_norm": 0.132483571767807, + "learning_rate": 2.1830473895478875e-09, + "loss": 1.6835, + "step": 32487 + }, + { + "epoch": 9.971761817065685, + "grad_norm": 0.11850076913833618, + "learning_rate": 2.1368469646898537e-09, + "loss": 1.6911, + "step": 32488 + }, + { + "epoch": 9.972068753836709, + "grad_norm": 0.15215659141540527, + "learning_rate": 2.091140648013967e-09, + "loss": 1.7441, + "step": 32489 + }, + { + "epoch": 9.972375690607734, + "grad_norm": 0.17135567963123322, + "learning_rate": 2.045928439980971e-09, + "loss": 1.7527, + "step": 32490 + }, + { + "epoch": 9.97268262737876, + "grad_norm": 0.185814768075943, + "learning_rate": 2.0012103410349537e-09, + "loss": 1.748, + "step": 32491 + }, + { + "epoch": 9.972989564149785, + "grad_norm": 0.15464171767234802, + "learning_rate": 1.956986351620005e-09, + "loss": 1.7463, + "step": 32492 + }, + { + "epoch": 9.97329650092081, + "grad_norm": 0.16765901446342468, + 
"learning_rate": 1.9132564721691114e-09, + "loss": 1.7198, + "step": 32493 + }, + { + "epoch": 9.973603437691835, + "grad_norm": 0.14002041518688202, + "learning_rate": 1.8700207031152606e-09, + "loss": 1.7073, + "step": 32494 + }, + { + "epoch": 9.97391037446286, + "grad_norm": 0.1588892936706543, + "learning_rate": 1.8272790448858879e-09, + "loss": 1.7064, + "step": 32495 + }, + { + "epoch": 9.974217311233886, + "grad_norm": 0.12804476916790009, + "learning_rate": 1.7850314979084292e-09, + "loss": 1.7077, + "step": 32496 + }, + { + "epoch": 9.974524248004911, + "grad_norm": 0.1506362110376358, + "learning_rate": 1.7432780625936672e-09, + "loss": 1.7021, + "step": 32497 + }, + { + "epoch": 9.974831184775937, + "grad_norm": 0.11829007416963577, + "learning_rate": 1.7020187393579356e-09, + "loss": 1.6897, + "step": 32498 + }, + { + "epoch": 9.975138121546962, + "grad_norm": 0.11921420693397522, + "learning_rate": 1.6612535286064656e-09, + "loss": 1.6818, + "step": 32499 + }, + { + "epoch": 9.975445058317986, + "grad_norm": 0.14553767442703247, + "learning_rate": 1.6209824307444888e-09, + "loss": 1.6768, + "step": 32500 + }, + { + "epoch": 9.975751995089011, + "grad_norm": 0.13848000764846802, + "learning_rate": 1.581205446166134e-09, + "loss": 1.7069, + "step": 32501 + }, + { + "epoch": 9.976058931860036, + "grad_norm": 0.1786017268896103, + "learning_rate": 1.541922575265531e-09, + "loss": 1.784, + "step": 32502 + }, + { + "epoch": 9.976365868631062, + "grad_norm": 0.1475924253463745, + "learning_rate": 1.5031338184368082e-09, + "loss": 1.716, + "step": 32503 + }, + { + "epoch": 9.976672805402087, + "grad_norm": 0.13834452629089355, + "learning_rate": 1.464839176062993e-09, + "loss": 1.6978, + "step": 32504 + }, + { + "epoch": 9.976979742173112, + "grad_norm": 0.20726680755615234, + "learning_rate": 1.4270386485104591e-09, + "loss": 1.8085, + "step": 32505 + }, + { + "epoch": 9.977286678944138, + "grad_norm": 0.13717865943908691, + "learning_rate": 
1.3897322361677845e-09, + "loss": 1.7291, + "step": 32506 + }, + { + "epoch": 9.977593615715163, + "grad_norm": 0.14107772707939148, + "learning_rate": 1.3529199393902404e-09, + "loss": 1.745, + "step": 32507 + }, + { + "epoch": 9.977900552486188, + "grad_norm": 0.1317019909620285, + "learning_rate": 1.3166017585553026e-09, + "loss": 1.727, + "step": 32508 + }, + { + "epoch": 9.978207489257214, + "grad_norm": 0.1404808610677719, + "learning_rate": 1.2807776940126915e-09, + "loss": 1.758, + "step": 32509 + }, + { + "epoch": 9.978514426028239, + "grad_norm": 0.1774541139602661, + "learning_rate": 1.2454477461176784e-09, + "loss": 1.7361, + "step": 32510 + }, + { + "epoch": 9.978821362799263, + "grad_norm": 0.153715580701828, + "learning_rate": 1.2106119152199835e-09, + "loss": 1.7816, + "step": 32511 + }, + { + "epoch": 9.979128299570288, + "grad_norm": 0.15706269443035126, + "learning_rate": 1.1762702016637762e-09, + "loss": 1.7328, + "step": 32512 + }, + { + "epoch": 9.979435236341313, + "grad_norm": 0.13986578583717346, + "learning_rate": 1.142422605787674e-09, + "loss": 1.7263, + "step": 32513 + }, + { + "epoch": 9.979742173112339, + "grad_norm": 0.12627844512462616, + "learning_rate": 1.1090691279302957e-09, + "loss": 1.6769, + "step": 32514 + }, + { + "epoch": 9.980049109883364, + "grad_norm": 0.1719161719083786, + "learning_rate": 1.0762097684191563e-09, + "loss": 1.7306, + "step": 32515 + }, + { + "epoch": 9.98035604665439, + "grad_norm": 0.11803223192691803, + "learning_rate": 1.043844527576221e-09, + "loss": 1.6714, + "step": 32516 + }, + { + "epoch": 9.980662983425415, + "grad_norm": 0.18038241565227509, + "learning_rate": 1.0119734057234542e-09, + "loss": 1.7246, + "step": 32517 + }, + { + "epoch": 9.98096992019644, + "grad_norm": 0.11659211665391922, + "learning_rate": 9.805964031717186e-10, + "loss": 1.7338, + "step": 32518 + }, + { + "epoch": 9.981276856967465, + "grad_norm": 0.13037735223770142, + "learning_rate": 9.497135202429785e-10, + "loss": 
1.6738, + "step": 32519 + }, + { + "epoch": 9.98158379373849, + "grad_norm": 0.1371074765920639, + "learning_rate": 9.193247572314434e-10, + "loss": 1.7392, + "step": 32520 + }, + { + "epoch": 9.981890730509516, + "grad_norm": 0.11290184408426285, + "learning_rate": 8.894301144368733e-10, + "loss": 1.6831, + "step": 32521 + }, + { + "epoch": 9.98219766728054, + "grad_norm": 0.12521374225616455, + "learning_rate": 8.600295921590285e-10, + "loss": 1.6805, + "step": 32522 + }, + { + "epoch": 9.982504604051565, + "grad_norm": 0.15203195810317993, + "learning_rate": 8.31123190692118e-10, + "loss": 1.6945, + "step": 32523 + }, + { + "epoch": 9.98281154082259, + "grad_norm": 0.153851717710495, + "learning_rate": 8.027109103136976e-10, + "loss": 1.7082, + "step": 32524 + }, + { + "epoch": 9.983118477593615, + "grad_norm": 0.16608181595802307, + "learning_rate": 7.747927513124254e-10, + "loss": 1.6922, + "step": 32525 + }, + { + "epoch": 9.98342541436464, + "grad_norm": 0.12815186381340027, + "learning_rate": 7.473687139547548e-10, + "loss": 1.6925, + "step": 32526 + }, + { + "epoch": 9.983732351135666, + "grad_norm": 0.16700038313865662, + "learning_rate": 7.204387985237926e-10, + "loss": 1.7956, + "step": 32527 + }, + { + "epoch": 9.984039287906691, + "grad_norm": 0.1810149997472763, + "learning_rate": 6.940030052748902e-10, + "loss": 1.7493, + "step": 32528 + }, + { + "epoch": 9.984346224677717, + "grad_norm": 0.14241847395896912, + "learning_rate": 6.680613344745013e-10, + "loss": 1.693, + "step": 32529 + }, + { + "epoch": 9.984653161448742, + "grad_norm": 0.1244758740067482, + "learning_rate": 6.426137863779768e-10, + "loss": 1.7357, + "step": 32530 + }, + { + "epoch": 9.984960098219767, + "grad_norm": 0.12879830598831177, + "learning_rate": 6.176603612351173e-10, + "loss": 1.7195, + "step": 32531 + }, + { + "epoch": 9.985267034990791, + "grad_norm": 0.16025955975055695, + "learning_rate": 5.932010592901715e-10, + "loss": 1.7286, + "step": 32532 + }, + { + "epoch": 
9.985573971761816, + "grad_norm": 0.13949505984783173, + "learning_rate": 5.69235880798491e-10, + "loss": 1.7682, + "step": 32533 + }, + { + "epoch": 9.985880908532842, + "grad_norm": 0.10753419995307922, + "learning_rate": 5.457648259821202e-10, + "loss": 1.6935, + "step": 32534 + }, + { + "epoch": 9.986187845303867, + "grad_norm": 0.1572565734386444, + "learning_rate": 5.22787895074206e-10, + "loss": 1.6922, + "step": 32535 + }, + { + "epoch": 9.986494782074892, + "grad_norm": 0.1194307953119278, + "learning_rate": 5.003050883134464e-10, + "loss": 1.7086, + "step": 32536 + }, + { + "epoch": 9.986801718845918, + "grad_norm": 0.13817691802978516, + "learning_rate": 4.783164059107836e-10, + "loss": 1.7197, + "step": 32537 + }, + { + "epoch": 9.987108655616943, + "grad_norm": 0.12894465029239655, + "learning_rate": 4.5682184808271135e-10, + "loss": 1.699, + "step": 32538 + }, + { + "epoch": 9.987415592387968, + "grad_norm": 0.16827619075775146, + "learning_rate": 4.3582141505127403e-10, + "loss": 1.7492, + "step": 32539 + }, + { + "epoch": 9.987722529158994, + "grad_norm": 0.1456209421157837, + "learning_rate": 4.153151070163119e-10, + "loss": 1.7054, + "step": 32540 + }, + { + "epoch": 9.988029465930019, + "grad_norm": 0.11338894069194794, + "learning_rate": 3.9530292418321624e-10, + "loss": 1.6704, + "step": 32541 + }, + { + "epoch": 9.988336402701044, + "grad_norm": 0.19609282910823822, + "learning_rate": 3.757848667518271e-10, + "loss": 1.7457, + "step": 32542 + }, + { + "epoch": 9.988643339472068, + "grad_norm": 0.10050071775913239, + "learning_rate": 3.5676093491088245e-10, + "loss": 1.6396, + "step": 32543 + }, + { + "epoch": 9.988950276243093, + "grad_norm": 0.13579551875591278, + "learning_rate": 3.3823112884912024e-10, + "loss": 1.7123, + "step": 32544 + }, + { + "epoch": 9.989257213014119, + "grad_norm": 0.10279065370559692, + "learning_rate": 3.201954487552783e-10, + "loss": 1.6712, + "step": 32545 + }, + { + "epoch": 9.989564149785144, + "grad_norm": 
0.1435621976852417, + "learning_rate": 3.0265389479589015e-10, + "loss": 1.6891, + "step": 32546 + }, + { + "epoch": 9.98987108655617, + "grad_norm": 0.14991097152233124, + "learning_rate": 2.8560646715969365e-10, + "loss": 1.7021, + "step": 32547 + }, + { + "epoch": 9.990178023327195, + "grad_norm": 0.08986492455005646, + "learning_rate": 2.6905316600212006e-10, + "loss": 1.636, + "step": 32548 + }, + { + "epoch": 9.99048496009822, + "grad_norm": 0.1458725482225418, + "learning_rate": 2.529939914897028e-10, + "loss": 1.7235, + "step": 32549 + }, + { + "epoch": 9.990791896869245, + "grad_norm": 0.12907643616199493, + "learning_rate": 2.374289437834243e-10, + "loss": 1.6899, + "step": 32550 + }, + { + "epoch": 9.99109883364027, + "grad_norm": 0.12231683731079102, + "learning_rate": 2.2235802303871567e-10, + "loss": 1.775, + "step": 32551 + }, + { + "epoch": 9.991405770411296, + "grad_norm": 0.11932116001844406, + "learning_rate": 2.077812294054571e-10, + "loss": 1.7045, + "step": 32552 + }, + { + "epoch": 9.99171270718232, + "grad_norm": 0.10503232479095459, + "learning_rate": 1.9369856301687528e-10, + "loss": 1.6758, + "step": 32553 + }, + { + "epoch": 9.992019643953345, + "grad_norm": 0.12878039479255676, + "learning_rate": 1.8011002402840148e-10, + "loss": 1.7219, + "step": 32554 + }, + { + "epoch": 9.99232658072437, + "grad_norm": 0.13469719886779785, + "learning_rate": 1.6701561256216025e-10, + "loss": 1.7188, + "step": 32555 + }, + { + "epoch": 9.992633517495396, + "grad_norm": 0.14357881247997284, + "learning_rate": 1.544153287513783e-10, + "loss": 1.7209, + "step": 32556 + }, + { + "epoch": 9.99294045426642, + "grad_norm": 0.11817539483308792, + "learning_rate": 1.4230917271818022e-10, + "loss": 1.6481, + "step": 32557 + }, + { + "epoch": 9.993247391037446, + "grad_norm": 0.13635072112083435, + "learning_rate": 1.306971445846905e-10, + "loss": 1.7076, + "step": 32558 + }, + { + "epoch": 9.993554327808472, + "grad_norm": 0.11874140799045563, + 
"learning_rate": 1.195792444674826e-10, + "loss": 1.6623, + "step": 32559 + }, + { + "epoch": 9.993861264579497, + "grad_norm": 0.15637235343456268, + "learning_rate": 1.0895547247757875e-10, + "loss": 1.7316, + "step": 32560 + }, + { + "epoch": 9.994168201350522, + "grad_norm": 0.11796044558286667, + "learning_rate": 9.882582870934798e-11, + "loss": 1.6645, + "step": 32561 + }, + { + "epoch": 9.994475138121548, + "grad_norm": 0.12259721755981445, + "learning_rate": 8.919031327936367e-11, + "loss": 1.6972, + "step": 32562 + }, + { + "epoch": 9.994782074892573, + "grad_norm": 0.12288567423820496, + "learning_rate": 8.004892626534144e-11, + "loss": 1.6727, + "step": 32563 + }, + { + "epoch": 9.995089011663598, + "grad_norm": 0.13289032876491547, + "learning_rate": 7.140166777275248e-11, + "loss": 1.7406, + "step": 32564 + }, + { + "epoch": 9.995395948434622, + "grad_norm": 0.15300492942333221, + "learning_rate": 6.324853787376128e-11, + "loss": 1.6974, + "step": 32565 + }, + { + "epoch": 9.995702885205647, + "grad_norm": 0.1378776878118515, + "learning_rate": 5.5589536662736805e-11, + "loss": 1.7007, + "step": 32566 + }, + { + "epoch": 9.996009821976672, + "grad_norm": 0.12087785452604294, + "learning_rate": 4.842466420629244e-11, + "loss": 1.681, + "step": 32567 + }, + { + "epoch": 9.996316758747698, + "grad_norm": 0.14573785662651062, + "learning_rate": 4.175392057659266e-11, + "loss": 1.7968, + "step": 32568 + }, + { + "epoch": 9.996623695518723, + "grad_norm": 0.1530621200799942, + "learning_rate": 3.557730583469976e-11, + "loss": 1.6813, + "step": 32569 + }, + { + "epoch": 9.996930632289748, + "grad_norm": 0.17444150149822235, + "learning_rate": 2.989482005277822e-11, + "loss": 1.6974, + "step": 32570 + }, + { + "epoch": 9.997237569060774, + "grad_norm": 0.09050992876291275, + "learning_rate": 2.4706463280788073e-11, + "loss": 1.6643, + "step": 32571 + }, + { + "epoch": 9.997544505831799, + "grad_norm": 0.11656873673200607, + "learning_rate": 
2.001223557424048e-11, + "loss": 1.6748, + "step": 32572 + }, + { + "epoch": 9.997851442602824, + "grad_norm": 0.14353762567043304, + "learning_rate": 1.5812136971993242e-11, + "loss": 1.7003, + "step": 32573 + }, + { + "epoch": 9.99815837937385, + "grad_norm": 0.10622245073318481, + "learning_rate": 1.210616751845528e-11, + "loss": 1.6748, + "step": 32574 + }, + { + "epoch": 9.998465316144873, + "grad_norm": 0.1387772560119629, + "learning_rate": 8.894327252484403e-12, + "loss": 1.7133, + "step": 32575 + }, + { + "epoch": 9.998772252915899, + "grad_norm": 0.11832106858491898, + "learning_rate": 6.1766162018361824e-12, + "loss": 1.6548, + "step": 32576 + }, + { + "epoch": 9.999079189686924, + "grad_norm": 0.14122611284255981, + "learning_rate": 3.953034399817312e-12, + "loss": 1.737, + "step": 32577 + }, + { + "epoch": 9.99938612645795, + "grad_norm": 0.22168445587158203, + "learning_rate": 2.2235818630811368e-12, + "loss": 1.7414, + "step": 32578 + }, + { + "epoch": 9.999693063228975, + "grad_norm": 0.1424793303012848, + "learning_rate": 9.88258608281001e-13, + "loss": 1.6799, + "step": 32579 + }, + { + "epoch": 10.0, + "grad_norm": 0.1208924725651741, + "learning_rate": 2.4706465207025023e-13, + "loss": 1.6843, + "step": 32580 + }, + { + "epoch": 10.0, + "step": 32580, + "total_flos": 1.213259355550921e+21, + "train_loss": 0.34183392285788405, + "train_runtime": 59878.4696, + "train_samples_per_second": 4457.039, + "train_steps_per_second": 0.544 + } + ], + "logging_steps": 1.0, + "max_steps": 32580, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.213259355550921e+21, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin 
new file mode 100644 index 0000000000000000000000000000000000000000..b6af1535a0e2165d1952d5e6c3ab94ece29b3d2e --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c7985017961b4019da284385f3d0f4b95f69682383f9cf12749177ae67aa87 +size 7288